From d9878379c849833a0f58918a2c6b8357ba54d2c1 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Fri, 21 Oct 2022 14:22:20 +0000
Subject: [PATCH 0001/1922] Fix typo under torch directory (#87274)

This PR fixes typo in .md files under torch directory

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87274
Approved by: https://github.com/albanD
---
 .../activation_sparsifier/README.md            |  2 +-
 .../_experimental/data_sparsifier/README.md    |  2 +-
 torch/ao/quantization/fx/README.md             |  2 +-
 .../ao/quantization/fx/_model_report/README.md | 18 +++++++++---------
 torch/csrc/jit/OVERVIEW.md                     |  2 +-
 torch/csrc/jit/codegen/cuda/README.md          |  4 ++--
 .../jit/codegen/cuda/python_frontend/README.md |  4 ++--
 torch/csrc/jit/docs/serialization.md           |  2 +-
 torch/csrc/jit/operator_upgraders/README.md    |  2 +-
 torch/csrc/jit/runtime/static/README.md        |  4 ++--
 torch/distributed/benchmarks/README.md         |  2 +-
 torch/fx/passes/README.md                      |  2 +-
 12 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/torch/ao/pruning/_experimental/activation_sparsifier/README.md b/torch/ao/pruning/_experimental/activation_sparsifier/README.md
index 3c2514c2f116b..810b053d92221 100644
--- a/torch/ao/pruning/_experimental/activation_sparsifier/README.md
+++ b/torch/ao/pruning/_experimental/activation_sparsifier/README.md
@@ -60,7 +60,7 @@ def mask_fn(tensor, threshold):  # threshold is the sparse config here
 ```
 
 ## API Design
-`ActivationSparsifier`: Attaches itself to a model layer and sparsifies the activation flowing through that layer. The user can pass in the default `aggregate_fn`, `reduce_fn` and `mask_fn`. Additionaly, `features` and `feature_dim` are also accepted.
+`ActivationSparsifier`: Attaches itself to a model layer and sparsifies the activation flowing through that layer. The user can pass in the default `aggregate_fn`, `reduce_fn` and `mask_fn`. Additionally, `features` and `feature_dim` are also accepted.
 
 `register_layer`: Registers a layer for sparsification. Specifically, registers `forward_pre_hook()` that performs aggregation.
 
diff --git a/torch/ao/pruning/_experimental/data_sparsifier/README.md b/torch/ao/pruning/_experimental/data_sparsifier/README.md
index c6fc99b36c8c4..faea74355360a 100644
--- a/torch/ao/pruning/_experimental/data_sparsifier/README.md
+++ b/torch/ao/pruning/_experimental/data_sparsifier/README.md
@@ -3,7 +3,7 @@
 The data sparsifier inherits from the `BaseSparsifier` class. It attempts to sparsify data tensors in general (trainable and non-trainable).
 
 ## Implementation Details
-The data sparsifier does not receive a model or a layer to sparsify. Hence, the mask needs to be owned by the data sparsifier. This is acheived by introducing a private container model that registers the data as a parametrized buffer.
+The data sparsifier does not receive a model or a layer to sparsify. Hence, the mask needs to be owned by the data sparsifier. This is achieved by introducing a private container model that registers the data as a parametrized buffer.
 
 The BaseDataSparsifier handles all the housekeeping while allowing the user to just implement the `update_mask` logic in their implementation.
 
diff --git a/torch/ao/quantization/fx/README.md b/torch/ao/quantization/fx/README.md
index 389a5e428627d..0ee5c5ec7e3f5 100644
--- a/torch/ao/quantization/fx/README.md
+++ b/torch/ao/quantization/fx/README.md
@@ -248,7 +248,7 @@ Note: weight + FakeQuantize is a part of qat_linear_relu
 `backend_config` configurations used in this step:
 ```
 BackendConfig(nniqat.LinearReLU)
-    .set_observation_type(ObservationType.OUTPUT_USE_DIFFFERENT_OBSERVER_AS_INPUT)
+    .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)
     .set_dtype_configs([
         DTypeConfig(input_dtype=torch.quint8, output_dtype = torch.quint8, weight_dtype = torch.qint8, bias_dtype = torch.float32)]
     )
diff --git a/torch/ao/quantization/fx/_model_report/README.md b/torch/ao/quantization/fx/_model_report/README.md
index 0c4943ad6a755..dc11510f6c9ed 100644
--- a/torch/ao/quantization/fx/_model_report/README.md
+++ b/torch/ao/quantization/fx/_model_report/README.md
@@ -32,7 +32,7 @@ model_report = ModelReport(model, detector_set)
 ready_for_callibrate = model_report.prepare_detailed_callibration()
 
 # callibrate model and generate report
-ready_for_callibrate(example_input) # TODO run callibration of model with relavent data
+ready_for_callibrate(example_input) # TODO run callibration of model with relevant data
 reports = model_report.generate_model_report(remove_inserted_observers=True)
 for report_name in report.keys():
     text_report, report_dict = reports[report_name]
@@ -61,8 +61,8 @@ This is so that we can keep track of where we want to insert observers on a dete
 - `prepare_detailed_calibration(self)` &rarr; `GraphModule` inserts observers into the locations specified by each detector in the model.
 It then returns the GraphModule with the detectors inserted into both the regular module structure as well as the node structure.
 - `generate_model_report(self, remove_inserted_observers: bool)` &rarr; `Dict[str, Tuple[str, Dict]]` uses callibrated GraphModule to optionally removes inserted observers, and generate, for each detector the ModelReport instance was initialized with:
-  - A string-based report that is easily digestable and actionable explaining the data collected by relavent observers for that detector
-  - A dictionary containing statistics collected by the relavent observers and values calculated by the detector for futher analysis or plotting
+  - A string-based report that is easily digestable and actionable explaining the data collected by relevant observers for that detector
+  - A dictionary containing statistics collected by the relevant observers and values calculated by the detector for futher analysis or plotting
 
 ## ModelReportVisualizer Overview
 
@@ -127,21 +127,21 @@ return_dict = {
     "[unique_observer_fqn_of_insert_location]" :
     {
         "target_node" -> the node we are trying to observe with this observer (torch.fx.node.Node),
-        "insert_observer" -> the intialized observer we wish to insert (ObserverBase),
+        "insert_observer" -> the initialized observer we wish to insert (ObserverBase),
         "insert_post" -> True if this is meant to be a post-observer for target_node, False if pre-observer,
         "observer_args" -> The arguments that are meant to be passed into the observer,
     }
 }
 ```
 - `get_detector_name(self)` -> `str`: returns the name of the detector.
-You should give your detector a unique name different from exisiting detectors.
+You should give your detector a unique name different from existing detectors.
 - `generate_detector_report(self, model)` -> `Tuple[str, Dict[str, Any]]`: generates a report based on the information the detector is trying to collect.
 This report consists of both a text-based report as well as a dictionary of collected and calculated statistics.
 This report is returned to the `ModelReport` instance, which will then compile all the reports of all the Detectors requested by the user.
 
 ## ModelReportObserver Overview
 
-As seen in the [requirments to implement a detector section](#requirements-to-implement-a-detector), one of the key parts of implementing a detector is to specify what `Observer` we are trying to insert.
+As seen in the [requirements to implement a detector section](#requirements-to-implement-a-detector), one of the key parts of implementing a detector is to specify what `Observer` we are trying to insert.
 All the detectors in the ModelReport API use the [`ModelReportObserver`](https://github.com/pytorch/pytorch/blob/master/torch/ao/quantization/fx/_model_report/model_report_observer.py).
 While the core purpose of many observers in PyTorch's Quantization API is to collect min / max information to help determine quantization parameters, the `ModelReportObserver` collects additional statistics.
 
@@ -152,7 +152,7 @@ The statistics collected by the `ModelReportObserver` include:
 - Ratio of 100th percentile to some *n*th percentile
 - Number of constant value batches to pass through each channel
 
-After the `ModelReportObserver` collects the statistics above during the callibration process, the detectors then extract the information they need to generate their reports from the relavent observers.
+After the `ModelReportObserver` collects the statistics above during the callibration process, the detectors then extract the information they need to generate their reports from the relevant observers.
 
 ### Using Your Own Observer
 
@@ -187,7 +187,7 @@ Since you are also implementing your own detector in this case, it is up to you
     - A line plot (for both per-tensor and per-channel statistics)
     - A histogram (for both per-tensor and per-channel statistics)
 - `model_report.py`: File containing the `ModelReport` class
-  - Main class users are interacting with to go through the ModelReport worflow
+  - Main class users are interacting with to go through the ModelReport workflow
   - API described in detail in [Overview section](#modelreport-overview)
 
 # Tests
@@ -200,7 +200,7 @@ These tests include:
 - Test class for the `ModelReportVisualizer` class
 - Test class for **each** of the implemented Detectors
 
-If you wish to add a Detector, make sure to create a test class modeled after one of the exisiting classes and test your detector.
+If you wish to add a Detector, make sure to create a test class modeled after one of the existing classes and test your detector.
 Because users will be interacting with the Detectors through the `ModelReport` class and not directly, ensure that the tests follow this as well.
 
 # Future Tasks and Improvements
diff --git a/torch/csrc/jit/OVERVIEW.md b/torch/csrc/jit/OVERVIEW.md
index c1bcd57c73a5f..638cbf883bf71 100644
--- a/torch/csrc/jit/OVERVIEW.md
+++ b/torch/csrc/jit/OVERVIEW.md
@@ -1408,7 +1408,7 @@ TODO: differentiation, symbolic autograd, fusion, operators
 We attempt to reduce the number of `prim::Guard` nodes as these nodes may interfere with optimizations.
 * First, `GuardElimination::moveGuardsToDefs` tries to move `prim::Guards` to their definitions, so the guards guarding the same `Tensor` follow the definition directly or another guard on the same `Tensor`.
 * This ordering allows us to **coalesce** (done in `GuardElimination::coalesceGuards`) multiple guards into a single one.
-* After guards are  **coaslesced** , `GuardElimination::eliminateGuards` attempts to eliminate more guards as follows: it inspects each operation and its inputs. It checks if inputs to the operation are guarded and also if the operation produces the consistent shapes given the guarded inputs. For example, if two inputs to `add` are guaranteed to be of shape `(2, 3)`, the output shape will also always be `(2, 3)`. If this property holds, we are allowed to remove the guard guarding operation's output.
+* After guards are  **coalesced** , `GuardElimination::eliminateGuards` attempts to eliminate more guards as follows: it inspects each operation and its inputs. It checks if inputs to the operation are guarded and also if the operation produces the consistent shapes given the guarded inputs. For example, if two inputs to `add` are guaranteed to be of shape `(2, 3)`, the output shape will also always be `(2, 3)`. If this property holds, we are allowed to remove the guard guarding operation's output.
 
 Lastly, we need to be handle cases when the assumptions about `Tensor` shapes fail at runtime. To handle guard failures, we need to be able to run the original code i.e. the code  that doesn't rely on assumptions about shapes. As guards can be inserted and moved (by Optimizer) at/to arbitrary points in a computational graph, we need to be able to resume execution starting from those arbitrary points onward.
 
diff --git a/torch/csrc/jit/codegen/cuda/README.md b/torch/csrc/jit/codegen/cuda/README.md
index be8aed6c5ce44..284fd14111962 100644
--- a/torch/csrc/jit/codegen/cuda/README.md
+++ b/torch/csrc/jit/codegen/cuda/README.md
@@ -197,8 +197,8 @@ First thing is to check that you have fusion kernel running properly. Try to run
 
 If turning on NVFuser produces unexpected outputs, set the `PYTORCH_NVFUSER_DISABLE` environment variable to disable some of the optional features, e.g.:
 - `fma`: disable using FMA instructions
-- `index_hoist`: disble optimization to hoist comon index expressions
-- `predicate_elimination`: disble optimization to eliminate redundant predicates
+- `index_hoist`: disable optimization to hoist common index expressions
+- `predicate_elimination`: disable optimization to eliminate redundant predicates
 - `unroll_with_rng`: disable unrolling when RNG is used
 
 For example, `export PYTORCH_NVFUSER_DISABLE=fma,index_hoist` would disable FMA and index hoisting.
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/README.md b/torch/csrc/jit/codegen/cuda/python_frontend/README.md
index 7f3364e05c69b..d519e69bcda3c 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/README.md
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/README.md
@@ -51,7 +51,7 @@ nvf_out = fs.execute([input1, input2])[0]
 * `id()`: Returns the fusion id for a given `Fusion`.
 * `print()`: Prints the low level IR for the currently defined fusion.
 
-### `FusionDefiniton` Context Manager - Interface for Defining Fusions
+### `FusionDefinition` Context Manager - Interface for Defining Fusions
 
 #### Defining Input Tensors
 _All intermediate tensors are created by operations.  Constant tensors do not exist._
@@ -108,7 +108,7 @@ python -c "from torch._C._nvfuser import FusionDefinition; help(FusionDefinition
 ```
 #### Notating Outputs
 
-The `FusionDefintion` `add_output` method is used to indicate an intermediate is an output to the fusion.
+The `FusionDefinition` `add_output` method is used to indicate an intermediate is an output to the fusion.
 
 ```python
 add_output(output: Tensor)
diff --git a/torch/csrc/jit/docs/serialization.md b/torch/csrc/jit/docs/serialization.md
index 8c3461a9abe83..a374f5bed40ba 100644
--- a/torch/csrc/jit/docs/serialization.md
+++ b/torch/csrc/jit/docs/serialization.md
@@ -127,7 +127,7 @@ its methods or attributes.
 
 **Uses of tensor constants**. Most constants are inlined as literals, like
 strings or ints. But since tensors are potentially very large, when
-`PythonPrint` encouters a constant tensor it will emit a reference to a
+`PythonPrint` encounters a constant tensor it will emit a reference to a
 global `CONSTANTS` table (like `foo = CONSTANTS.c0`).
 
 When importing, the importer will know how to resolve this reference into an
diff --git a/torch/csrc/jit/operator_upgraders/README.md b/torch/csrc/jit/operator_upgraders/README.md
index 084e6688f148e..a4061bf17921a 100644
--- a/torch/csrc/jit/operator_upgraders/README.md
+++ b/torch/csrc/jit/operator_upgraders/README.md
@@ -1,6 +1,6 @@
 # Guidance for Operator Developer
 
-PyTorch’s operators sometimes require changes for different reasons (e.g. from improving their usability to fixing bugs). These changes can be backward compatibility (BC) breaking, where older programs will no longer run as expected (or at all) on the latest version of PyTorch (an old program / new runtime problem), or forward compatibility (FC) breaking, where new programs will not run on older versions of PyTorch (a new program / old runtime problem). This guidance focuses on the requirements for maintaining backwards comatibility when making changes to an operator.
+PyTorch’s operators sometimes require changes for different reasons (e.g. from improving their usability to fixing bugs). These changes can be backward compatibility (BC) breaking, where older programs will no longer run as expected (or at all) on the latest version of PyTorch (an old program / new runtime problem), or forward compatibility (FC) breaking, where new programs will not run on older versions of PyTorch (a new program / old runtime problem). This guidance focuses on the requirements for maintaining backwards compatibility when making changes to an operator.
 In order to do this we introduce the concept of the *upgrader*: a method to adapt the new operator to mimic the old operator behavior.
 When a new runtime reads an old program containing the old operator definition, the upgrader will adapt the old operator definition to comply with the new operator implementation. As you would expect, an upgrader is only applied when an old operation definition is encountered (i.e. if there are no "old" operators in the program, no upgrader would be used).
 For more details on the reasoning behind this new requirement please refer to the [PyTorch Operator Versioning RFC](https://github.com/pytorch/rfcs/blob/master/RFC-0017-PyTorch-Operator-Versioning.md).
diff --git a/torch/csrc/jit/runtime/static/README.md b/torch/csrc/jit/runtime/static/README.md
index 82d42d4b9f4c7..03e5ee6d75dc4 100644
--- a/torch/csrc/jit/runtime/static/README.md
+++ b/torch/csrc/jit/runtime/static/README.md
@@ -141,9 +141,9 @@ is selected instead.
 
 When loading a model, ops are selected for each `torch::jit::Node` in the graph as follows:
 
-1) If an out variant is registered, pass the node to the function that prodcues the `SROperator`. If
+1) If an out variant is registered, pass the node to the function that produces the `SROperator`. If
 the result is not `nulltpr`, use that op.
-2) If a native function is registered, pass the node to the function that prodcues the `SROperator`. If
+2) If a native function is registered, pass the node to the function that produces the `SROperator`. If
 the result is not `nulltpr`, use that op.
 3) Use the JIT implementation. Static runtime will throw an exception if it does not exist.
 
diff --git a/torch/distributed/benchmarks/README.md b/torch/distributed/benchmarks/README.md
index 082ab87af623c..f5b1ec6bff2de 100644
--- a/torch/distributed/benchmarks/README.md
+++ b/torch/distributed/benchmarks/README.md
@@ -11,7 +11,7 @@ There are different training paradigms where combining these two techniques migh
 2) Enable hybrid parallelism as described in the [PipeDream](https://arxiv.org/abs/1806.03377) paper. We can use the [Distributed RPC framework](https://pytorch.org/docs/master/rpc.html) to pipeline stages of the model across multiple workers and replicate each stage (if needed) using [DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel).
 
 ## Training Process
-This benchmark focuses on the first paradime above. The training process is executed as follows:
+This benchmark focuses on the first paradigm above. The training process is executed as follows:
 
 1) The master creates embedding tables on each of the 8 Parameter Servers and holds an [RRef](https://pytorch.org/docs/master/rpc.html#rref) to it.
 2) The master, then kicks off the training loop on the 8 trainers and passes the embedding table RRef to the trainers.
diff --git a/torch/fx/passes/README.md b/torch/fx/passes/README.md
index a2996848713e5..e972234f20824 100644
--- a/torch/fx/passes/README.md
+++ b/torch/fx/passes/README.md
@@ -1,5 +1,5 @@
 ## FX Pass Infrastructure
-This folder contains the pass infarstructure and passes for transforming fx.Graph.
+This folder contains the pass infrastructure and passes for transforming fx.Graph.
 
 
 ## Code Structure

From 1287a8968754ac63db965827bdc357ac274ce1ef Mon Sep 17 00:00:00 2001
From: Antonio Kim <antonio.kim@cerebras.net>
Date: Fri, 21 Oct 2022 14:28:14 +0000
Subject: [PATCH 0002/1922] Make LazyGraphExecutor extensible (#87218)

Add `LazyGraphExecutor` to backend interface so that its is extensible by a vendor backend.

I've made some preliminary methods virtual. Not sure if we want to make all methods in `LazyGraphExecutor` virtual.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87218
Approved by: https://github.com/wconstab, https://github.com/alanwaketan
---
 .github/ci_commit_pins/xla.txt                 |  2 +-
 torch/csrc/lazy/backend/backend_interface.cpp  |  5 -----
 torch/csrc/lazy/backend/backend_interface.h    |  3 +--
 torch/csrc/lazy/core/lazy_graph_executor.cpp   | 13 +++++++++++--
 torch/csrc/lazy/core/lazy_graph_executor.h     | 14 ++++++++++++--
 torch/csrc/lazy/ts_backend/ts_backend_impl.cpp |  4 ++++
 6 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 2ca663bacdea0..e7375040708bd 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-e1f5a49664b904e3ec1ddb9095ca75b6bbb5c10d
+eff277e81fcfdeccba71e75ff40b6e2f3e29e27b
diff --git a/torch/csrc/lazy/backend/backend_interface.cpp b/torch/csrc/lazy/backend/backend_interface.cpp
index cbcd92b6a9924..250a8847351c6 100644
--- a/torch/csrc/lazy/backend/backend_interface.cpp
+++ b/torch/csrc/lazy/backend/backend_interface.cpp
@@ -18,11 +18,6 @@ const BackendImplInterface* getBackend() {
   return interface;
 }
 
-// default implementation
-bool BackendImplInterface::ShouldSyncTensor(const LazyTensorPtr tensor) const {
-  return tensor->GetIrValue()->op() != ltc_not_supported;
-}
-
 BackendRegistrar::BackendRegistrar(
     const BackendImplInterface* backend_impl_interface) {
   backend_impl_registry.store(backend_impl_interface);
diff --git a/torch/csrc/lazy/backend/backend_interface.h b/torch/csrc/lazy/backend/backend_interface.h
index 2936105dc6a3d..a70591c2a19c8 100644
--- a/torch/csrc/lazy/backend/backend_interface.h
+++ b/torch/csrc/lazy/backend/backend_interface.h
@@ -4,6 +4,7 @@
 #include <torch/csrc/lazy/backend/backend_data.h>
 #include <torch/csrc/lazy/backend/backend_device.h>
 #include <torch/csrc/lazy/backend/lowering_context.h>
+#include <torch/csrc/lazy/core/lazy_graph_executor.h>
 #include <torch/csrc/lazy/core/shape.h>
 #include <torch/csrc/lazy/core/tensor.h>
 #include <atomic>
@@ -41,8 +42,6 @@ class TORCH_API BackendImplInterface {
 
   virtual const IrBuilder* GetIrBuilder() const = 0;
 
-  virtual bool ShouldSyncTensor(const LazyTensorPtr tensor) const;
-
   /**
    * Data Transfer
    * */
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp
index 96476e4a9663b..06b37797d3fa6 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.cpp
+++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp
@@ -390,10 +390,15 @@ bool TensorsHaveIR(const std::vector<LazyTensorPtr>& tensors) {
   return false;
 }
 
+std::atomic<LazyGraphExecutor*> lazy_graph_executor_registry;
 } // namespace
 
+void LazyGraphExecutor::Register(LazyGraphExecutor* executor) {
+  lazy_graph_executor_registry.store(executor);
+}
 LazyGraphExecutor* LazyGraphExecutor::Get() {
-  static LazyGraphExecutor* executor = new LazyGraphExecutor();
+  auto* executor = lazy_graph_executor_registry.load();
+  TORCH_CHECK(executor, "Lazy graph executor not registered.");
   return executor;
 }
 
@@ -604,6 +609,10 @@ void LazyGraphExecutor::Async::Wait() {
   }
 }
 
+bool LazyGraphExecutor::ShouldSyncTensor(const LazyTensorPtr tensor) const {
+  return tensor->GetIrValue()->op() != ltc_not_supported;
+}
+
 LazyGraphExecutor::SyncTensorCollection LazyGraphExecutor::CollectSyncTensors(
     const std::vector<LazyTensorPtr>& tensors,
     const SyncTensorsConfig& config) {
@@ -635,7 +644,7 @@ LazyGraphExecutor::SyncTensorCollection LazyGraphExecutor::CollectSyncTensors(
         tensors[i]->CurrentDataHandle() == nullptr) {
       Value ir_value = tensors[i]->CurrentIrValue();
       if (ir_value) {
-        if (getBackend()->ShouldSyncTensor(tensors[i])) {
+        if (ShouldSyncTensor(tensors[i])) {
           // Add only tensors which need to be synced.
           coll.hash = HashCombine(coll.hash, ir_value.hash());
           coll.indices.push_back(i);
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.h b/torch/csrc/lazy/core/lazy_graph_executor.h
index 8116ad23ff068..7a4498d85fc0f 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.h
+++ b/torch/csrc/lazy/core/lazy_graph_executor.h
@@ -21,10 +21,18 @@ class TORCH_API LazyGraphExecutor {
     bool read_only = false;
   };
 
+  // Register a lazy graph executor instance that can be retrieved using Get()
+  static void Register(LazyGraphExecutor*);
   static LazyGraphExecutor* Get();
 
-  void RegisterTensor(std::shared_ptr<LazyTensor::Data> data);
-  void UnregisterTensor(LazyTensor::Data* data);
+  virtual ~LazyGraphExecutor() = default;
+
+  // Override these methods to perform custom tensor registration and
+  // unregistration Note: It is vital that the parent implementations are also
+  // called
+  //       in order for the tensors to show up in the live tensor list
+  virtual void RegisterTensor(std::shared_ptr<LazyTensor::Data> data);
+  virtual void UnregisterTensor(LazyTensor::Data* data);
 
   // Seed for random generator
   Value GetRngSeed(const BackendDevice& device);
@@ -181,6 +189,8 @@ class TORCH_API LazyGraphExecutor {
     std::vector<BackendDataPtr> tensors_data;
   };
 
+  virtual bool ShouldSyncTensor(const LazyTensorPtr tensor) const;
+
   SyncTensorCollection CollectSyncTensors(
       const std::vector<LazyTensorPtr>& tensors,
       const SyncTensorsConfig& config);
diff --git a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp
index a390ac76c1260..4003a005fbfab 100644
--- a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp
@@ -2,6 +2,7 @@
 
 #include <ATen/Functions.h>
 #include <torch/csrc/lazy/backend/backend_device.h>
+#include <torch/csrc/lazy/core/lazy_graph_executor.h>
 #include <torch/csrc/lazy/generated/LazyNativeFunctions.h>
 #include <torch/csrc/lazy/ts_backend/config.h>
 #include <torch/csrc/lazy/ts_backend/ir_builder.h>
@@ -273,6 +274,9 @@ void InitTorchScriptBackend() {
   register_ts_ltc_eager_fallback();
   static std::unique_ptr<BackendRegistrar> s_registrar;
   s_registrar = std::make_unique<BackendRegistrar>(GetTSBackendImpl());
+
+  static LazyGraphExecutor* executor = new LazyGraphExecutor();
+  LazyGraphExecutor::Register(executor);
 }
 
 } // namespace lazy

From 98f127692ee7893266dc82f91592165652cd24a3 Mon Sep 17 00:00:00 2001
From: Masaki Kozuki <mkozuki@nvidia.com>
Date: Fri, 21 Oct 2022 15:05:36 +0000
Subject: [PATCH 0003/1922] Reenable `isinstance` with
 `torch.distributed.ReduceOp` (#87303)

tentatively marking as draft as I haven't gotten a comprehensive list of side effects...

Ref: https://stackoverflow.com/questions/40244413/python-static-class-attribute-of-the-class-itself
Rel: https://github.com/pytorch/pytorch/issues/87191

cc @kwen2501
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87303
Approved by: https://github.com/wanchaol
---
 test/distributed/test_c10d_common.py  | 12 +++++++++++
 torch/_C/_distributed_c10d.pyi        |  3 +--
 torch/csrc/distributed/c10d/Types.hpp |  5 ++++-
 torch/csrc/distributed/c10d/init.cpp  | 29 +++++++++++++++++++--------
 torch/distributed/distributed_c10d.py | 11 ++++++++++
 5 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 68c760beacbbf..454595f85735c 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1622,6 +1622,18 @@ def comm_fn(tensor, group=None):
         self._test_work_wait(tensor, comm_fn=comm_fn)
 
 
+class ReduceOpTest(TestCase):
+
+    def test_op_isinstance_of_reduceop(self):
+        for reduce_op in (
+            c10d.ReduceOp.SUM, c10d.ReduceOp.AVG, c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX,
+            c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR,
+        ):
+            self.assertTrue(isinstance(reduce_op, c10d.ReduceOp))
+        for scale in ([torch.tensor(1.0)], 2.0):
+            self.assertTrue(isinstance(dist._make_nccl_premul_sum(scale), c10d.ReduceOp))
+
+
 if __name__ == "__main__":
     assert (
         not torch.cuda._initialized
diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi
index aad37d6a8c5ae..bdf0166b8daa9 100644
--- a/torch/_C/_distributed_c10d.pyi
+++ b/torch/_C/_distributed_c10d.pyi
@@ -63,7 +63,6 @@ class DebugLevel(Enum):
 
 class ReduceOp:
 
-    # note(crcrpar): These values are populated from Kind
     SUM = ...
     PRODUCT = ...
     MIN = ...
@@ -74,7 +73,7 @@ class ReduceOp:
     PREMUL_SUM = ...
     UNUSED = ...
 
-    class Kind(Enum): ...
+    class RedOpType(Enum): ...
 
 class BroadcastOptions:
     rootRank: int
diff --git a/torch/csrc/distributed/c10d/Types.hpp b/torch/csrc/distributed/c10d/Types.hpp
index 4d928976d87ee..64fbc45c6588c 100644
--- a/torch/csrc/distributed/c10d/Types.hpp
+++ b/torch/csrc/distributed/c10d/Types.hpp
@@ -29,6 +29,7 @@ struct NCCLPreMulSumSupplement : _SupplementBase {
 // Other ReduceOps that need different supplementary data can also
 // derive from _SupplementBase.
 struct TORCH_API ReduceOp : torch::CustomClassHolder {
+  // note(crcrpar): RedOpType could be defined outside of `ReduceOp`
   enum RedOpType : uint8_t {
     SUM = 0,
     AVG = 1,
@@ -46,7 +47,9 @@ struct TORCH_API ReduceOp : torch::CustomClassHolder {
 
   ReduceOp(RedOpType op) : op_(op) {
     TORCH_INTERNAL_ASSERT(
-      op_ != PREMUL_SUM, "PREMUL_SUM requires a scale factor tensor or scalar argument");
+      op_ != PREMUL_SUM,
+      "Use `torch.distributed._make_nccl_premul_sum` to create an instance of ReduceOp with PREMUL_SUM"
+    );
   }
 
   ReduceOp(RedOpType op, c10::intrusive_ptr<_SupplementBase> optional_supplement) {
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 327c041357266..6515a3d9a87d4 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -515,10 +515,14 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
           R"(Sets the debug level of the torch.distributed package from the
           ``TORCH_DISTRIBUTED_DEBUG`` environment variable.)");
 
+  // TODO(crcrpar): Hardening `ReduceOp`.
+  //    While keeping most op types as enum value,
+  //    making `PREMUL_SUM` callable, i.e., allowing for
+  //    `ReduceOp.PREMUL_SUM(scale)` might be better as per @wanchaol.
   // https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types
   py::class_<::c10d::ReduceOp> reduce_op(module, "ReduceOp", R"(
 An enum-like class for available reduction operations: ``SUM``, ``PRODUCT``,
-``MIN``, ``MAX``, ``BAND``, ``BOR``, and ``BXOR``.
+``MIN``, ``MAX``, ``BAND``, ``BOR``, ``BXOR``, and ``PREMUL_SUM``.
 
 ``BAND``, ``BOR``, and ``BXOR`` reductions are not available when
 using the ``NCCL`` backend.
@@ -529,13 +533,16 @@ and only for NCCL versions 2.10 or later.
 
 ``PREMUL_SUM`` multiplies inputs by a given scalar locally before reduction.
 ``PREMUL_SUM`` is only available with the ``NCCL`` backend,
-and only available for NCCL versions 2.11 or later.
+and only available for NCCL versions 2.11 or later. Users are supposed to
+use ``torch.distributed._make_nccl_premul_sum``.
 
 Additionally, ``MAX``, ``MIN`` and ``PRODUCT`` are not supported for complex tensors.
 
 The values of this class can be accessed as attributes, e.g., ``ReduceOp.SUM``.
 They are used in specifying strategies for reduction collectives, e.g.,
-:func:`reduce`, :func:`all_reduce_multigpu`, etc.)");
+:func:`reduce`, :func:`all_reduce_multigpu`, etc.
+
+This class does not support ``__members__`` property.)");
 
   reduce_op.def(py::init<::c10d::ReduceOp::RedOpType>())
       .def_readwrite("op", &::c10d::ReduceOp::op_);
@@ -555,8 +562,14 @@ They are used in specifying strategies for reduction collectives, e.g.,
           [](const ::c10d::ReduceOp& self, const ::c10d::ReduceOp& other) {
             return self == other.op_;
           })
-      .def("__hash__", [](const ::c10d::ReduceOp& self) { return self.op_; });
-
+      .def("__hash__", [](const ::c10d::ReduceOp& self) {
+        return static_cast<uint8_t>(self.op_);
+      });
+
+  // note(crcrpar): Deliberately skip
+  // [`export_values`](https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types)
+  // here and manually set values in Python side. See note "ReduceOp static
+  // class attributes to support `isinstance`"
   py::enum_<::c10d::ReduceOp::RedOpType>(reduce_op, "RedOpType")
       .value("SUM", ::c10d::ReduceOp::RedOpType::SUM)
       .value("AVG", ::c10d::ReduceOp::RedOpType::AVG)
@@ -566,10 +579,10 @@ They are used in specifying strategies for reduction collectives, e.g.,
       .value("BAND", ::c10d::ReduceOp::RedOpType::BAND)
       .value("BOR", ::c10d::ReduceOp::RedOpType::BOR)
       .value("BXOR", ::c10d::ReduceOp::RedOpType::BXOR)
-      .value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM)
-      .export_values();
+      .value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM);
 
-  // Ref: [Implicit
+  // note(crcrpar): This could be removed because users will not pass
+  // `RedOpType` to reduce collective ops Ref: [Implicit
   // conversions](https://pybind11.readthedocs.io/en/stable/advanced/classes.html#implicit-conversions)
   // Let us skip the explicit construction of `c10d::ReduceOp` from
   // `c10d::ReduceOp::RedOpType` in Python.
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 5c49c6b821687..7de47876b5664 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -233,6 +233,17 @@ def register_backend(cls, name, func, extended_api=False):
 dist_backend = Backend
 
 
+# NOTE(crcrpar): [ReduceOp static class attributes to support `isinstance`]
+#   A ReduceOp instance of `PREMUL_SUM` is supposed to be created via `_make_nccl_premul_sum`
+#   while the other `op`s (meaning RedOpType members) can be directly passed to c10d reduce collectives.
+#   I changed `ReduceOp` to struct from enum class and introduced RedOpType enum class for PREMUL_SUM,
+#   which broke an implicit contract of ReduceOp being enum-like with which users apply isinstance to
+#   `op`, for example, `isinstance(ReduceOp.SUM, ReduceOp)`: https://github.com/pytorch/pytorch/issues/87191
+DENY_LIST = ("PREMUL_SUM", )
+for _red_op_name, _red_op_value in ReduceOp.RedOpType.__members__.items():
+    setattr(ReduceOp, _red_op_name, _red_op_value if _red_op_name in DENY_LIST else ReduceOp(_red_op_value))
+
+
 class _reduce_op(object):
     r"""
     Deprecated enum-like class for reduction operations: ``SUM``, ``PRODUCT``,

From 5bfe34ab4e33d3239f2781030eeb85806d118542 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Fri, 21 Oct 2022 13:29:31 +0100
Subject: [PATCH 0004/1922] OpInfo: Add test that sample_inputs_func returns a
 generator (#84567)

This also includes a small list exception for single element lists since none of the memory usage or performance implications of lists apply there.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84567
Approved by: https://github.com/lezcano, https://github.com/mruberry
---
 test/test_testing.py                          | 38 ++++++++++++++++++-
 .../_internal/common_methods_invocations.py   | 12 +-----
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/test/test_testing.py b/test/test_testing.py
index e31872f7da6fd..fad72ab91de0a 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -12,7 +12,7 @@
 import subprocess
 import sys
 import unittest.mock
-from typing import Any, Callable, Iterator, List, Tuple
+from typing import Any, Callable, Iterator, List, Tuple, Generator, Sequence
 
 import torch
 
@@ -23,7 +23,7 @@
 from torch.testing._internal.common_device_type import \
     (PYTORCH_TESTING_DEVICE_EXCEPT_FOR_KEY, PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY, dtypes,
      get_device_type_test_bases, instantiate_device_type_tests, onlyCUDA, onlyNativeDeviceTypes,
-     deviceCountAtLeast, ops, expectedFailureMeta)
+     deviceCountAtLeast, ops, expectedFailureMeta, OpDTypes)
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal import opinfo
 from torch.testing._internal.common_dtype import all_types_and_complex_and
@@ -1881,5 +1881,39 @@ def test_sample_input_metadata(self) -> None:
         self.assertEqual(s2.name, "foo")
 
 
+# Tests that validate the various sample generating functions on each OpInfo.
+class TestOpInfoSampleFunctions(TestCase):
+
+    def _assert_is_generator_or_singleton(self, item, property_name):
+        if isinstance(item, Sequence):
+            msg = (
+                "{property_name} may only return lists for single items"
+                ", please use a coroutine which yields items instead")
+            self.assertTrue(len(item) <= 1, msg=msg)
+        else:
+            self.assertIsInstance(item, Generator)
+
+    @ops(op_db, dtypes=OpDTypes.any_one)
+    def test_opinfo_sample_generators(self, device, dtype, op):
+        # Test op.sample_inputs doesn't generate multiple samples when called
+        samples = op.sample_inputs(device, dtype)
+        self._assert_is_generator_or_singleton(samples, "sample_inputs_func")
+
+    @ops([op for op in op_db if op.reference_inputs_func is not None], dtypes=OpDTypes.any_one)
+    def test_opinfo_reference_generators(self, device, dtype, op):
+        # Test op.reference_inputs doesn't generate multiple samples when called
+        samples = op.reference_inputs(device, dtype)
+        self._assert_is_generator_or_singleton(samples, "reference_inputs_func")
+
+    @ops([op for op in op_db if op.error_inputs_func is not None], dtypes=OpDTypes.none)
+    def test_opinfo_error_generators(self, device, op):
+        # Test op.error_inputs doesn't generate multiple inputs when called
+        samples = op.error_inputs(device)
+        self._assert_is_generator_or_singleton(samples, "error_inputs_func")
+
+
+instantiate_device_type_tests(TestOpInfoSampleFunctions, globals())
+
+
 if __name__ == '__main__':
     run_tests()
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 47c01caaecab6..e5d6e6efe18a9 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -1234,22 +1234,14 @@ def get_independent_tensor(tensor):
     return tensor.clone().requires_grad_(tensor.requires_grad)
 
 def sample_inputs_randint(self, device, dtype, requires_grad, **kwargs):
-    samples = []
     low = 2
     high = 10
 
     for sample in sample_inputs_like_fns(self, device, dtype, requires_grad, **kwargs):
         # With high
-        samples.append(SampleInput(
-            high,
-            args=(sample.input.shape,) + sample.args,
-            kwargs=sample.kwargs))
+        yield SampleInput(high, sample.input.shape, *sample.args, **sample.kwargs)
         # With low and high
-        samples.append(SampleInput(
-            low,
-            args=(high, sample.input.shape) + sample.args,
-            kwargs=sample.kwargs))
-    return tuple(samples)
+        yield SampleInput(low, high, sample.input.shape, *sample.args, **sample.kwargs)
 
 def sample_inputs_randint_like(self, device, dtype, requires_grad, **kwargs):
     low = 2

From 3095b59fdb12e183f1d21dec4e951cb8697b4b6e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 21 Oct 2022 05:54:15 -0700
Subject: [PATCH 0005/1922] Audit for error prone isinstance int/float and add
 lint (#87345)

We recently fixed a bug on symbolic-shapes branch where
an isinstance(x, int) test failed when passed a SymIntNode.
To prevent this, I've added a lint for all the codepaths
where we may pass SymInt/SymFloat directly to reject
direct isinstance int/float tests, and instead use one of
the aliases.  The lint rule explains the options.  I then
go and fix all of them.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87345
Approved by: https://github.com/bdhirsh, https://github.com/albanD
---
 .lintrunner.toml                | 29 +++++++++++++++++
 torch/_C/__init__.pyi.in        |  2 ++
 torch/_decomp/decompositions.py |  4 +--
 torch/_meta_registrations.py    | 16 +++++-----
 torch/_prims/__init__.py        |  8 +++--
 torch/_prims_common/__init__.py | 21 +++++++++----
 torch/_refs/__init__.py         | 56 ++++++++++++++++++---------------
 torch/_refs/linalg/__init__.py  |  7 +++--
 8 files changed, 96 insertions(+), 47 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 56ecfc7295f4c..70e2a423edcc1 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -420,6 +420,35 @@ command = [
     '@{{PATHSFILE}}'
 ]
 
+[[linter]]
+code = 'ERROR_PRONE_ISINSTANCE'
+include_patterns = [
+    'torch/_refs/**/*.py',
+    'torch/_prims/**/*.py',
+    'torch/_prims_common/**/*.py',
+    'torch/_decomp/**/*.py',
+    'torch/_meta_registrations.py',
+]
+command = [
+    'python3',
+    'tools/linter/adapters/grep_linter.py',
+    '--pattern=isinstance\([^)]+(int|float)\)',
+    '--linter-name=ERROR_PRONE_ISINSTANCE',
+    '--error-name=error prone isinstance',
+    """--error-description=\
+        This line has an isinstance call that directly refers to \
+        int or float.  This is error-prone because you may also \
+        have wanted to allow SymIntNode or SymFloatNode in your test.  \
+        To suppress this lint, use an appropriate type alias defined \
+        in torch._prims_common; use IntLike/FloatLike when you would accept \
+        both regular and symbolic numbers, Dim for ints representing \
+        dimensions, or IntWithoutSymInt/FloatWithoutSymFloat if you really \
+        meant to exclude symbolic numbers.
+    """,
+    '--',
+    '@{{PATHSFILE}}'
+]
+
 [[linter]]
 code = 'PYBIND11_SPECIALIZATION'
 include_patterns = [
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 70248d1325274..3c81b63721ccd 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -181,6 +181,8 @@ class SymFloatNode(object):
     @staticmethod
     def new_symfloat(obj) -> SymFloatNode: ...
 
+    def __ceil__(self) -> SymIntNode: ...
+
 # Defined in torch/csrc/jit/passes/xnnpack_rewrite.h
 class MobileOptimizerType:
     ...
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 4f61dc9b26f8a..9e9c36104ddc5 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -11,7 +11,7 @@
 import torch.nn.functional as F
 from torch import Tensor
 from torch._decomp import register_decomposition
-from torch._prims_common import NumberType, TensorLike, TensorSequenceType
+from torch._prims_common import IntLike, NumberType, TensorLike, TensorSequenceType
 from torch._prims_common.wrappers import _maybe_resize_out, _safe_copy_out, out_wrapper
 from torch.utils._pytree import tree_flatten, tree_map
 
@@ -1740,7 +1740,7 @@ def compute_idx(in_size, out_size):
         return torch.mean(vals, dim=(-3, -1))
 
     def maybe_mask(vals, length, range_max, adaptive, dim):
-        if isinstance(length, int):
+        if isinstance(length, IntLike):
             return vals, length
         else:
             # zero-out the things we didn't really want to select
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 2e1c728c582dc..c17aa091120cc 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -11,6 +11,8 @@
     corresponding_real_dtype,
     elementwise_dtypes,
     ELEMENTWISE_TYPE_PROMOTION_KIND,
+    FloatLike,
+    IntLike,
 )
 
 from torch._prims_common.wrappers import out_wrapper
@@ -361,24 +363,24 @@ def calc_conv_nd_return_shape(
         output_padding: Optional[Union[List[int], int]] = None,
     ):
         ret_shape = []
-        if isinstance(stride, int):
+        if isinstance(stride, IntLike):
             stride = [stride] * len(dims)
         elif len(stride) == 1:
             stride = [stride[0]] * len(dims)
 
-        if isinstance(padding, int):
+        if isinstance(padding, IntLike):
             padding = [padding] * len(dims)
         elif len(padding) == 1:
             padding = [padding[0]] * len(dims)
 
-        if isinstance(dilation, int):
+        if isinstance(dilation, IntLike):
             dilation = [dilation] * len(dims)
         elif len(dilation) == 1:
             dilation = [dilation[0]] * len(dims)
 
         output_padding_list: Optional[List[int]] = None
         if output_padding:
-            if isinstance(output_padding, int):
+            if isinstance(output_padding, IntLike):
                 output_padding_list = [output_padding] * len(dims)
             elif len(output_padding) == 1:
                 output_padding_list = [output_padding[0]] * len(dims)
@@ -1393,11 +1395,11 @@ def meta_like(self, *args, **kwargs):
 # hacky: Please remove after math.ceil works with arange
 @register_meta(aten.arange.default)
 def arange(end, **kwargs):
-    if isinstance(end, float):
-        end = math.ceil(end)
+    if isinstance(end, FloatLike):
+        end = math.ceil(end)  # type: ignore[arg-type]
 
     def is_integral(x):
-        return isinstance(x, int) or isinstance(x, bool)
+        return isinstance(x, IntLike) or isinstance(x, bool)
 
     set_to_integral_dtype = kwargs.get("dtype", None) is None and is_integral(end)
     if set_to_integral_dtype:
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index 8ea992894cf5e..d724ac50e2839 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -16,8 +16,10 @@
 from torch._prims.nvfuser_prims import register_nvprims
 from torch._prims_common import (
     check,
+    Dim,
     DimsSequenceType,
     DimsType,
+    IntLike,
     Number,
     NumberType,
     RETURN_TYPE,
@@ -929,7 +931,7 @@ def _fill_aten(a: Tensor, value: NumberType) -> Tensor:
 # div prim performs truncation division on integer inputs
 #   and true division for floating and complex inputs
 def _div_aten(a, b):
-    is_integral = isinstance(a, (bool, int)) or (
+    is_integral = isinstance(a, (bool, int, torch.SymIntNode)) or (
         isinstance(a, torch.Tensor) and utils.is_integer_dtype(a.dtype)
     )
 
@@ -1198,7 +1200,7 @@ def _broadcast_in_dim_meta(
     # (no relative reordering of dims) of integers and
     # each dimension must be within the new shape
     def _greater_than_reduce(acc, x):
-        assert isinstance(x, int)
+        assert isinstance(x, Dim)
         assert x > acc
         assert x < len(shape)
 
@@ -2319,7 +2321,7 @@ def _arange_meta(
     )
     if dtype is not None:
         pass
-    elif all(isinstance(arg, int) for arg in (start, end, step)):
+    elif all(isinstance(arg, IntLike) for arg in (start, end, step)):
         dtype = torch.int64
     else:
         dtype = torch.get_default_dtype()
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 72a01a85359c8..d8321ac9a47c7 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -47,7 +47,15 @@ def getnvFuserDtype(dtype: Union[torch.dtype, NumberTypeType]):
 # TODO: This needs a lot more type annotations
 # NumberType = Union[bool, int, float, complex, torch.SymIntNode, torch.SymFloatNode]
 NumberType = Union[bool, int, float, complex]
+
 Number = (bool, int, float, complex, torch.SymIntNode, torch.SymFloatNode)
+# I don't call it Integral because numbers.Integral includes bool, but IntLike
+# does not
+Dim = int
+IntLike = (int, torch.SymIntNode)
+FloatLike = (float, torch.SymFloatNode)
+IntWithoutSymInt = int
+FloatWithoutSymFloat = float
 DeviceLikeType = Union[str, torch.device]
 Tensor = torch.Tensor
 
@@ -433,8 +441,8 @@ def validate_idx(rank: int, idx: int):
     Assumes the index is already canonicalized.
     """
 
-    assert isinstance(idx, int)
-    assert isinstance(rank, int)
+    assert isinstance(idx, Dim)
+    assert isinstance(rank, Dim)
 
     assert idx >= 0 and idx < rank or idx == 0
 
@@ -450,8 +458,8 @@ def validate_exclusive_idx(rank: int, ex_idx: int):
     for the given shape.
     """
 
-    assert isinstance(ex_idx, int)
-    assert isinstance(rank, int)
+    assert isinstance(ex_idx, Dim)
+    assert isinstance(rank, Dim)
     assert ex_idx > 0 and ex_idx <= rank
 
 
@@ -500,7 +508,7 @@ def canonicalize_dims(rank: int, indices: int) -> int:
 
 
 def canonicalize_dims(rank, indices):
-    if isinstance(indices, int):
+    if isinstance(indices, Dim):
         return canonicalize_dim(rank, indices)
 
     return tuple(canonicalize_dim(rank, x) for x in indices)
@@ -1439,7 +1447,8 @@ def set_correction(
         correction = 1
     elif correction is None and unbiased is not None:
         correction = 0 if unbiased is False else 1
-    if not isinstance(correction, int):
+    # NB: we don't actually support symint here, but it's harmless to accept
+    if not isinstance(correction, IntLike):
         raise ValueError("correction argument should be integer")
     if correction < 0:
         raise ValueError("correction argument should be non-negative")
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index a37673afb72af..08e1361c76220 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -16,10 +16,13 @@
 from torch._prims_common import (
     check,
     DeviceLikeType,
+    Dim,
     DimsSequenceType,
     DimsType,
     dtype_to_type,
     ELEMENTWISE_TYPE_PROMOTION_KIND,
+    FloatLike,
+    IntLike,
     is_weakly_lesser_type,
     Number,
     NumberType,
@@ -39,6 +42,7 @@
     elementwise_unary_scalar_wrapper,
     out_wrapper,
 )
+from torch.fx.experimental.symbolic_shapes import sym_float, sym_int
 
 # Experimental module containing prototype Python references for existing
 #   PyTorch operations.
@@ -298,7 +302,7 @@
 
 def _broadcast_shapes(*_shapes):
     shapes = tuple(
-        (x,) if isinstance(x, int) else x
+        (x,) if isinstance(x, IntLike) else x
         for x in filter(lambda x: x is not None, _shapes)
     )
 
@@ -1939,8 +1943,8 @@ def _reduction(
                     "dtype argument and out dtype must match in reduction"
                 )
     if not accepts_dim_tuple:
-        assert dims is None or isinstance(dims, int)
-    if isinstance(dims, int):
+        assert dims is None or isinstance(dims, Dim)
+    if isinstance(dims, Dim):
         dims = (dims,)  # type: ignore[assignment]
     dims = utils.reduction_dims(a.shape, dims)
     if not has_identity:
@@ -1986,7 +1990,7 @@ def all(
     keepdim: bool = False,
 ) -> TensorLikeType:
     # Computes nelem
-    if isinstance(dim, int):
+    if isinstance(dim, Dim):
         dim = (dim,)  # type: ignore[assignment]
 
     a_ = _maybe_convert_to_dtype(a, torch.bool)
@@ -2246,7 +2250,7 @@ def mean(
     )
     if utils.is_integer_dtype(dtype):
         raise RuntimeError("result type should be floating point or complex")
-    if isinstance(dim, int):
+    if isinstance(dim, Dim):
         dim = (dim,)  # type: ignore[assignment]
     dims = utils.reduction_dims(a.shape, dim)  # type: ignore[arg-type]
     nelem = 1 if a.ndim == 0 else reduce(operator.mul, (a.shape[i] for i in dims), 1)
@@ -3299,7 +3303,7 @@ def tensor_split(
             raise ValueError(msg)
 
     # Case 0 -- indices_or_sections is an integer or a scalar tensor n and a is split along dim into n parts of equal-ish length
-    if isinstance(indices_or_sections, int) or (
+    if isinstance(indices_or_sections, IntLike) or (
         isinstance(indices_or_sections, TensorLike) and indices_or_sections.ndim == 0
     ):
         sections: int = (
@@ -3365,7 +3369,7 @@ def hsplit(
         ),
     )
     dim = 0 if a.ndim == 1 else 1
-    if isinstance(indices_or_sections, int):
+    if isinstance(indices_or_sections, IntLike):
         split_size = indices_or_sections
         check(
             (split_size != 0 and a.shape[dim] % split_size == 0),
@@ -3407,7 +3411,7 @@ def vsplit(
             + " dimensions!"
         ),
     )
-    if isinstance(indices_or_sections, int):
+    if isinstance(indices_or_sections, IntLike):
         split_size = indices_or_sections
         check(
             (split_size != 0 and a.shape[0] % split_size == 0),
@@ -3538,7 +3542,7 @@ def dsplit(a: TensorLikeType, sections: DimsType) -> TensorSequenceType:
         raise RuntimeError(
             f"torch.dsplit requires a tensor with at least 3 dimension, but got a tensor with {a.ndim} dimensions!"
         )
-    if isinstance(sections, int) and (sections == 0 or a.shape[2] % sections != 0):
+    if isinstance(sections, IntLike) and (sections == 0 or a.shape[2] % sections != 0):
         raise RuntimeError(
             "torch._refs.dsplit attempted to split along dimension 2, "
             + f"but the size of the dimension {a.shape[2]} is not divisible by the split_size {sections}!"
@@ -3983,21 +3987,21 @@ def linspace(
     #     cast than not, because it allows us to always go into the precise path
     #     if dtype is integral and not worry about whether start/end are float
     if prims.utils.is_integer_dtype(dtype):
-        if isinstance(start, float):
-            start = int(start)
-        if isinstance(end, float):
-            end = int(end)
+        if isinstance(start, FloatLike):
+            start = sym_int(start)
+        if isinstance(end, FloatLike):
+            end = sym_int(end)
 
     if py_any(isinstance(arg, complex) for arg in (start, end, steps)):
         raise NotImplementedError
     assert not isinstance(start, complex) and not isinstance(end, complex)  # for mypy
 
     check(
-        isinstance(steps, int),
+        isinstance(steps, IntLike),
         lambda: "steps must be int, not float",
         exc_type=TypeError,
     )
-    assert isinstance(steps, int)  # for mypy
+    assert isinstance(steps, IntLike)  # for mypy
     check(steps >= 0, lambda: "number of steps must be non-negative")
 
     factory_kwargs = {
@@ -4016,7 +4020,7 @@ def linspace(
         if prims.utils.is_integer_dtype(dtype):
             # We need to cast to int, so to avoid off-by-one issues
             # do the entire computation with ints when we can
-            assert isinstance(start, int) and isinstance(end, int)
+            assert isinstance(start, IntLike) and isinstance(end, IntLike)
             step_size_x_denom = end - start
             eps = 1 if end > start else -1
             denom = steps - 1
@@ -4063,10 +4067,10 @@ def logspace(
 
     # NB: NumPy doesn't have this cast
     if prims.utils.is_integer_dtype(dtype):
-        if isinstance(start, float):
-            start = int(start)
-        if isinstance(end, float):
-            end = int(end)
+        if isinstance(start, FloatLike):
+            start = sym_int(start)
+        if isinstance(end, FloatLike):
+            end = sym_int(end)
 
     assert not isinstance(base, complex)  # for mypy
     if base < 0:
@@ -4402,10 +4406,10 @@ def uniform(
 ) -> TensorLikeType:
     utils.validate_shape(shape)
 
-    assert isinstance(low, (bool, int, float))
-    assert isinstance(high, (bool, int, float))
-    low = float(low)
-    high = float(high)
+    assert isinstance(low, Number)
+    assert isinstance(high, Number)
+    low = sym_float(low)
+    high = sym_float(high)
 
     assert isinstance(dtype, torch.dtype)
     device = utils.canonicalize_device(device)
@@ -4505,10 +4509,10 @@ def norm(
 ) -> TensorLikeType:
     # In these cases we compute the "Frobenius norm"
     if (
-        p == "fro" and (dim is None or isinstance(dim, int) or len(dim) <= 2)
+        p == "fro" and (dim is None or isinstance(dim, Dim) or len(dim) <= 2)
     ) or p is None:
         p = 2
-    if isinstance(dim, int):
+    if isinstance(dim, Dim):
         dim = [dim]
     if isinstance(p, str):
         # Here we either call the nuclear norm, or we call matrix_norm with some arguments
diff --git a/torch/_refs/linalg/__init__.py b/torch/_refs/linalg/__init__.py
index c3b8a3c603524..c8c8f84570d8e 100644
--- a/torch/_refs/linalg/__init__.py
+++ b/torch/_refs/linalg/__init__.py
@@ -14,6 +14,7 @@
     check,
     check_fp_or_complex,
     check_is_matrix,
+    Dim,
     DimsType,
     NumberType,
     TensorLikeType,
@@ -69,7 +70,7 @@ def vector_norm(
     # Checks
     check_fp_or_complex(x.dtype, "linalg.vector_norm")
 
-    if isinstance(dim, int):
+    if isinstance(dim, Dim):
         dim = [dim]  # type: ignore[assignment]
     elif not isinstance(dim, List) and dim is not None:
         # refs.amin just accepts List rather than DimType (Tuple)
@@ -142,7 +143,7 @@ def matrix_norm(
     check_is_matrix(A, "linalg.matrix_norm")
     # dim
     dim = utils.canonicalize_dims(A.ndim, dim)
-    if isinstance(dim, int):
+    if isinstance(dim, Dim):
         dim = (dim,)  # type: ignore[assignment]
     check(len(dim) == 2, lambda: "linalg.matrix_norm: dim must be a 2-tuple. Got {dim}")
     check(
@@ -219,7 +220,7 @@ def norm(
     dtype: Optional[torch.dtype] = None,
 ) -> TensorLikeType:
     if dim is not None:
-        if isinstance(dim, int):
+        if isinstance(dim, Dim):
             dim = (dim,)  # type: ignore[assignment]
         check(
             len(dim) in (1, 2),

From 0cad1bbd9ae735eac5dfc064648fc3437915b423 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 21 Oct 2022 16:03:00 +0000
Subject: [PATCH 0006/1922] Revert "Back out "Revert D40198461: [pytorch][PR]
 Backport currently dont work with some models if:" (#87124)"

This reverts commit a42fbfa0cb467b582799a5132561c82a3d33b1b7.

Reverted https://github.com/pytorch/pytorch/pull/87124 on behalf of https://github.com/ZainRizvi due to This is causing periodic jobs to fail
---
 buckbuild.bzl                                        |  9 ++++-----
 test/cpp/jit/test_flatbuffer.cpp                     | 12 ++++--------
 .../jit/mobile/compatibility/backport_manager.cpp    |  2 --
 torch/csrc/jit/mobile/flatbuffer_loader.cpp          |  6 ++----
 .../csrc/jit/serialization/flatbuffer_serializer.cpp |  6 ++----
 5 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/buckbuild.bzl b/buckbuild.bzl
index d0185aa313a47..24302e64c92f1 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -1697,7 +1697,7 @@ def define_buck_targets(
             "torch/csrc/jit/serialization/mobile_bytecode.fbs",
         ],
         outs = {
-            "mobile_bytecode_generated_fbsource.h": ["mobile_bytecode_generated.h"],
+            "mobile_bytecode_generated.h": ["mobile_bytecode_generated.h"],
         },
         cmd = "$(exe {})".format(third_party("flatc")) +
               " --cpp --gen-mutable --scoped-enums -o ${OUT} ${SRCS}",
@@ -1713,7 +1713,7 @@ def define_buck_targets(
         name = "mobile_bytecode",
         header_namespace = "",
         exported_headers = {
-            "torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h": ":mobile_bytecode_header[mobile_bytecode_generated_fbsource.h]",
+            "torch/csrc/jit/serialization/mobile_bytecode_generated.h": ":mobile_bytecode_header[mobile_bytecode_generated.h]",
         },
         # Avoid leaking implementation details by only exposing this header to
         # the internals of the loader/serializer layer.
@@ -1721,9 +1721,6 @@ def define_buck_targets(
             "{}:flatbuffer_loader".format(ROOT),
             "{}:flatbuffer_serializer_mobile".format(ROOT),
         ],
-        exported_deps = [
-            third_party("flatbuffers-api"),
-        ],
     )
 
     fb_xplat_cxx_library(
@@ -1744,6 +1741,7 @@ def define_buck_targets(
             ":mobile_bytecode",
             ":torch_mobile_module",
             C10,
+            third_party("flatbuffers-api"),
         ],
         exported_deps = [
             ":torch_mobile_train",
@@ -1781,6 +1779,7 @@ def define_buck_targets(
         visibility = ["PUBLIC"],
         deps = [
             ":mobile_bytecode",
+            third_party("flatbuffers-api"),
         ],
         exported_deps = [
             ":torch_mobile_deserialize",
diff --git a/test/cpp/jit/test_flatbuffer.cpp b/test/cpp/jit/test_flatbuffer.cpp
index 89efcf7390179..de49838fc9ab6 100644
--- a/test/cpp/jit/test_flatbuffer.cpp
+++ b/test/cpp/jit/test_flatbuffer.cpp
@@ -27,14 +27,6 @@
 #include <caffe2/serialize/versions.h>
 #include <torch/csrc/jit/serialization/import_export_functions.h>
 #include <unordered_set>
-
-#if defined(FB_XPLAT_BUILD) || defined(FBCODE_CAFFE2)
-#include <torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h> // NOLINT
-namespace flatbuffers = flatbuffers_fbsource;
-#define FLATBUFFERS_MAX_ALIGNMENT FLATBUFFERS_FBSOURCE_MAX_ALIGNMENT
-#else
-#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h> // NOLINT
-#endif
 // Tests go in torch::jit
 namespace torch {
 namespace jit {
@@ -1804,9 +1796,13 @@ TEST(FlatbufferUpgraderTest, DivScalarInplaceIntV2) {
 
 } // namespace jit
 } // namespace torch
+#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h>
 namespace torch {
 namespace jit {
 
+#if defined(FBCODE_CAFFE2) or defined(FB_XPLAT_BUILD)
+namespace flatbuffers = flatbuffers_fbsource;
+#endif
 /**
  * An Allocator that can only deallocate (using delete []), counting
  * the number of times that it has been asked to deallocate.
diff --git a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
index 489084912445f..2bad08c0765a2 100644
--- a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
+++ b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
@@ -7,7 +7,6 @@
 #include <torch/csrc/jit/mobile/import.h>
 #include <torch/csrc/jit/mobile/module.h>
 #include <torch/csrc/jit/serialization/export.h>
-#include <torch/csrc/jit/serialization/flatbuffer_serializer_jit.h>
 #include <torch/csrc/jit/serialization/import.h>
 #include <torch/csrc/jit/serialization/pickler.h>
 #include <cstddef>
@@ -504,7 +503,6 @@ std::stringstream backport_v7_to_v6(std::stringstream& input_model_stream) {
 
 std::stringstream backport_v9_to_v8(std::stringstream& input_model_stream) {
   ExtraFilesMap extra_files;
-  register_flatbuffer_all();
   Module torch_script =
       torch::jit::load(input_model_stream, c10::nullopt, extra_files);
   std::stringstream intermediate_model_stream;
diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.cpp b/torch/csrc/jit/mobile/flatbuffer_loader.cpp
index 45e31fb5e1747..fb23e7ee97753 100644
--- a/torch/csrc/jit/mobile/flatbuffer_loader.cpp
+++ b/torch/csrc/jit/mobile/flatbuffer_loader.cpp
@@ -36,6 +36,7 @@
 #include <torch/csrc/jit/serialization/export_bytecode.h>
 #include <torch/csrc/jit/serialization/import_export_constants.h>
 #include <torch/csrc/jit/serialization/import_read.h>
+#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h>
 #include <torch/custom_class.h>
 
 #ifndef DISABLE_UPGRADER
@@ -49,12 +50,9 @@
 #include <cstdlib>
 #endif
 
-#if defined(FB_XPLAT_BUILD) || defined(FBCODE_CAFFE2)
-#include <torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h> // NOLINT
+#if defined(FBCODE_CAFFE2) or defined(FB_XPLAT_BUILD)
 namespace flatbuffers = flatbuffers_fbsource;
 #define FLATBUFFERS_MAX_ALIGNMENT FLATBUFFERS_FBSOURCE_MAX_ALIGNMENT
-#else
-#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h> // NOLINT
 #endif
 
 namespace torch {
diff --git a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp
index 54ec7c7b6ed3e..690541450a441 100644
--- a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp
+++ b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp
@@ -20,13 +20,11 @@
 #include <torch/csrc/jit/mobile/train/export_data.h>
 #include <torch/csrc/jit/passes/inliner.h>
 #include <torch/csrc/jit/runtime/instruction.h>
+#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h> // NOLINT
 
-#if defined(FB_XPLAT_BUILD) || defined(FBCODE_CAFFE2)
-#include <torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h> // NOLINT
+#if defined(FBCODE_CAFFE2) or defined(FB_XPLAT_BUILD)
 namespace flatbuffers = flatbuffers_fbsource;
 #define FLATBUFFERS_MAX_ALIGNMENT FLATBUFFERS_FBSOURCE_MAX_ALIGNMENT
-#else
-#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h> // NOLINT
 #endif
 
 namespace torch {

From 786500c480d58bbfdee178babf19902329953bc7 Mon Sep 17 00:00:00 2001
From: jyx-su <108294040+jyx-su@users.noreply.github.com>
Date: Fri, 21 Oct 2022 16:28:29 +0000
Subject: [PATCH 0007/1922] Fix input dimension issue in RNN, LSTM, GRU error
 message (#87442)

Fixes #86576

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87442
Approved by: https://github.com/albanD
---
 torch/nn/modules/rnn.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/nn/modules/rnn.py b/torch/nn/modules/rnn.py
index 4d6fd9c959ebc..f94728653b0f6 100644
--- a/torch/nn/modules/rnn.py
+++ b/torch/nn/modules/rnn.py
@@ -441,6 +441,7 @@ def forward(self, input, hx=None):  # noqa: F811
             max_batch_size = int(batch_sizes[0])
         else:
             batch_sizes = None
+            assert (input.dim() in (2, 3)), f"RNN: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor"
             is_batched = input.dim() == 3
             batch_dim = 0 if self.batch_first else 1
             if not is_batched:
@@ -733,6 +734,7 @@ def forward(self, input, hx=None):  # noqa: F811
             max_batch_size = int(max_batch_size)
         else:
             batch_sizes = None
+            assert (input.dim() in (2, 3)), f"LSTM: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor"
             is_batched = input.dim() == 3
             batch_dim = 0 if self.batch_first else 1
             if not is_batched:
@@ -923,6 +925,7 @@ def forward(self, input, hx=None):  # noqa: F811
             max_batch_size = int(max_batch_size)
         else:
             batch_sizes = None
+            assert (input.dim() in (2, 3)), f"GRU: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor"
             is_batched = input.dim() == 3
             batch_dim = 0 if self.batch_first else 1
             if not is_batched:

From edc78a47985fad6cdf3e5c45dadbcc8606557747 Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Fri, 21 Oct 2022 06:21:41 -0700
Subject: [PATCH 0008/1922] Reland "add an API for external backends to
 register custom device names (#86992)" (#87453)

Re-land of https://github.com/pytorch/pytorch/pull/86992

This reverts commit a895af92506f206889610251624590798d0deabd.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87453
Approved by: https://github.com/ezyang, https://github.com/albanD
---
 aten/src/ATen/core/dispatch/OperatorEntry.cpp | 18 ++++++-
 c10/core/Device.cpp                           |  3 ++
 c10/core/DeviceType.cpp                       | 48 ++++++++++++++++++-
 c10/core/DeviceType.h                         |  3 ++
 torch/_C/__init__.pyi.in                      |  3 ++
 torch/csrc/Module.cpp                         | 18 +++++++
 torch/utils/__init__.py                       |  1 +
 torch/utils/backend_registration.py           | 30 ++++++++++++
 8 files changed, 122 insertions(+), 2 deletions(-)
 create mode 100644 torch/utils/backend_registration.py

diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
index 5d53500e7dfe0..822924a602533 100644
--- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
@@ -495,6 +495,22 @@ void OperatorEntry::reportSignatureError(const CppSignature& call_signature, con
   );
 };
 
+std::string post_process_dispatch_key_str(std::string dispatch_key) {
+  const std::string substr = "PrivateUse1";
+  if (substr.size() <= dispatch_key.size() && std::equal(substr.rbegin(), substr.rend(), dispatch_key.rbegin())) {
+    auto privateuse1_backend = get_privateuse1_backend();
+    if (privateuse1_backend != "privateuseone") {
+      // remove trailing "*PrivateUse1"
+      dispatch_key.erase(dispatch_key.length() - substr.length());
+      // append the registered backend's name.
+      // AutogradPrivateUse1 -> AutogradFoo
+      auto backend_name = c10::get_privateuse1_backend();
+      dispatch_key = dispatch_key + backend_name;
+    }
+  }
+  return dispatch_key;
+}
+
 void OperatorEntry::reportError(DispatchKey dispatchKey) const {
   // If there is an invariant problem, report it now.
   checkInvariants();
@@ -509,7 +525,7 @@ void OperatorEntry::reportError(DispatchKey dispatchKey) const {
   }
 
   TORCH_CHECK_NOT_IMPLEMENTED(false, "Could not run '", name_, "' with arguments",
-          " from the '", toString(dispatchKey), "' backend. This could be because "
+          " from the '", post_process_dispatch_key_str(toString(dispatchKey)), "' backend. This could be because "
           "the operator doesn't exist for this backend, or was omitted during ",
           "the selective/custom build process (if using custom build). If you are a ",
           "Facebook employee using PyTorch on mobile, please visit ",
diff --git a/c10/core/Device.cpp b/c10/core/Device.cpp
index 7b55d2dbe283b..96d2504ec7de5 100644
--- a/c10/core/Device.cpp
+++ b/c10/core/Device.cpp
@@ -47,6 +47,9 @@ DeviceType parse_type(const std::string& device_string) {
   if (device != types.end()) {
     return device->second;
   }
+  if (device_string == get_privateuse1_backend()) {
+    return DeviceType::PrivateUse1;
+  }
   std::vector<const char*> device_names;
   for (const auto& it : types) {
     if (it.first) {
diff --git a/c10/core/DeviceType.cpp b/c10/core/DeviceType.cpp
index ac4c1f653efbf..22f0029d747d4 100644
--- a/c10/core/DeviceType.cpp
+++ b/c10/core/DeviceType.cpp
@@ -1,5 +1,9 @@
 #include <c10/core/DeviceType.h>
 #include <c10/util/Exception.h>
+#include <c10/util/Optional.h>
+#include <atomic>
+#include <memory>
+#include <mutex>
 
 namespace c10 {
 
@@ -46,7 +50,7 @@ std::string DeviceTypeName(DeviceType d, bool lower_case) {
     case DeviceType::IPU:
       return lower_case ? "ipu" : "IPU";
     case DeviceType::PrivateUse1:
-      return lower_case ? "privateuseone" : "PRIVATEUSEONE";
+      return get_privateuse1_backend(/*lowercase=*/lower_case);
     default:
       TORCH_CHECK(
           false,
@@ -101,4 +105,46 @@ std::ostream& operator<<(std::ostream& stream, DeviceType type) {
   return stream;
 }
 
+// We use both a mutex and an atomic here because:
+// (1) Mutex is needed during writing:
+//     We need to first check the value and potentially error,
+//     before setting the value (without any one else racing in the middle).
+//     It's also totally fine for this to be slow, since it happens exactly once
+//     at import time.
+// (2) Atomic is needed during reading:
+//     Whenever a user prints a privatuse1 device name, they need to read this
+//     variable. Although unlikely, we'll data race if someone else is trying to
+//     set this variable at the same time that another thread is print the
+//     device name. We could re-use the same mutex, but reading the atomic will
+//     be much faster.
+static std::atomic<bool> privateuse1_backend_name_set;
+static std::string privateuse1_backend_name;
+static std::mutex privateuse1_lock;
+
+std::string get_privateuse1_backend(bool lower_case) {
+  // Applying the same atomic read memory ordering logic as in Note [Memory
+  // ordering on Python interpreter tag].
+  auto name_registered =
+      privateuse1_backend_name_set.load(std::memory_order_acquire);
+  // Guaranteed that if the flag is set, then privateuse1_backend_name has been
+  // set, and will never be written to.
+  auto backend_name =
+      name_registered ? privateuse1_backend_name : "privateuseone";
+  return backend_name;
+}
+
+void register_privateuse1_backend(std::string backend_name) {
+  std::lock_guard<std::mutex> guard(privateuse1_lock);
+  TORCH_CHECK(
+      !privateuse1_backend_name_set.load() ||
+          privateuse1_backend_name == backend_name,
+      "torch.register_privateuse1_backend() has already been set! Current backend: ",
+      privateuse1_backend_name);
+
+  privateuse1_backend_name = backend_name;
+  // Invariant: once this flag is set, privateuse1_backend_name is NEVER written
+  // to.
+  privateuse1_backend_name_set.store(true, std::memory_order_relaxed);
+}
+
 } // namespace c10
diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h
index 000ad331828b0..065444827833d 100644
--- a/c10/core/DeviceType.h
+++ b/c10/core/DeviceType.h
@@ -95,6 +95,9 @@ C10_API bool isValidDeviceType(DeviceType d);
 
 C10_API std::ostream& operator<<(std::ostream& stream, DeviceType type);
 
+C10_API void register_privateuse1_backend(std::string backend_name);
+C10_API std::string get_privateuse1_backend(bool lower_case = true);
+
 } // namespace c10
 
 namespace std {
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 3c81b63721ccd..5b9049e4bdc7d 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1017,6 +1017,9 @@ def _jit_pass_lint(Graph) -> None: ...
 # Defined in torch/csrc/jit/python/python_custome_class.cpp
 def _get_custom_class_python_wrapper(name: str, attr: str) -> Any: ...
 
+# Defined in torch/csrc/Module.cpp
+def _rename_privateuse1_backend(backend: str) -> None: ...
+
 # Defined in torch/csrc/Generator.cpp
 class Generator(object):
     device: _device
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 08b9b81217e93..e41f0305a2e11 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -441,6 +441,20 @@ PyObject* THModule_getCppBacktrace(PyObject* _unused, PyObject* args) {
       c10::get_backtrace(frames_to_skip, maximum_number_of_frames, true));
   END_HANDLE_TH_ERRORS
 }
+static PyObject* THModule_rename_privateuse1_backend(
+    PyObject* _unused,
+    PyObject* arg) {
+  HANDLE_TH_ERRORS
+  THPUtils_assert(
+      THPUtils_checkString(arg),
+      "_rename_privateuse1_backend expects a str, "
+      "but got %s",
+      THPUtils_typename(arg));
+  const std::string backend_name = THPUtils_unpackString(arg);
+  c10::register_privateuse1_backend(backend_name);
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
 
 PyObject* THPModule_setAllowTF32CuDNN(PyObject* _unused, PyObject* arg) {
   THPUtils_assert(
@@ -990,6 +1004,10 @@ static PyMethodDef TorchMethods[] = {
     {"_to_dlpack", THPModule_toDLPack, METH_O, nullptr},
     {"_from_dlpack", THPModule_fromDLPack, METH_O, nullptr},
     {"_get_cpp_backtrace", THModule_getCppBacktrace, METH_VARARGS, nullptr},
+    {"_rename_privateuse1_backend",
+     THModule_rename_privateuse1_backend,
+     METH_O,
+     nullptr},
     {"set_flush_denormal", THPModule_setFlushDenormal, METH_O, nullptr},
     {"get_default_dtype", THPModule_getDefaultDtype, METH_NOARGS, nullptr},
     {"_get_default_device", THPModule_getDefaultDevice, METH_NOARGS, nullptr},
diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py
index f05ffc3fc96b8..c2054a9b5c653 100644
--- a/torch/utils/__init__.py
+++ b/torch/utils/__init__.py
@@ -4,6 +4,7 @@
 from .throughput_benchmark import ThroughputBenchmark
 from ._crash_handler import enable_minidumps, disable_minidumps, enable_minidumps_on_exceptions
 from .cpp_backtrace import get_cpp_backtrace
+from .backend_registration import rename_privateuse1_backend
 
 # Set the module for a given object for nicer printing
 def set_module(obj, mod):
diff --git a/torch/utils/backend_registration.py b/torch/utils/backend_registration.py
new file mode 100644
index 0000000000000..539d5c65d237e
--- /dev/null
+++ b/torch/utils/backend_registration.py
@@ -0,0 +1,30 @@
+from torch._C import _rename_privateuse1_backend
+
+def rename_privateuse1_backend(backend_name: str) -> None:
+    r"""
+    rename_privateuse1_backend(backend_name) -> None
+
+    This is a registration API for external backends that would like to register their
+    own device and C++ kernels out of tree.
+
+    The steps are:
+    (1) (In C++) implement kernels for various torch operations, and register them
+        to the PrivateUse1 dispatch key.
+    (2) (In python) call torch.register_privateuse1_backend("foo")
+
+    You can now use "foo" as an ordinary device string in python.
+
+    Note: this API can only be called once per process. Attempting to change
+    the external backend after it's already been set will result in an error.
+
+    For more details, see https://pytorch.org/tutorials/advanced/extend_dispatcher.html#get-a-dispatch-key-for-your-backend
+    For an existing example, see https://github.com/bdhirsh/pytorch_open_registration_example
+
+    Example::
+
+        >>> torch.register_privateuse1_backend("foo")
+        # This will work, assuming that you've implemented the right C++ kernels
+        # to implement torch.ones.
+        >>> a = torch.ones(2, device="foo")
+        """
+    return _rename_privateuse1_backend(backend_name)

From b9fb94821f059036b59c14df2349fb5b344ef429 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 21 Oct 2022 11:30:56 +0000
Subject: [PATCH 0009/1922] [FSDP][2/N] Remove
 `_fsdp_wrapped_module.flat_param` (#86122)

This removes **direct** usages of `_fsdp_wrapped_module.flat_param` with `_handles[0].flat_param`. The preferred way to access the `flat_param` will be through the handle. We may converge to only storing `self._handles` and no longer `self.params` in the future. Right now, `self.params` is always exactly `[handle.flat_param for handle in self._handles]`.

cc @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @H-Huang @kwen2501
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86122
Approved by: https://github.com/zhaojuanmao
---
 test/distributed/fsdp/test_fsdp_misc.py       |  4 +--
 .../fsdp/test_fsdp_summon_full_params.py      | 28 +++++++------------
 .../fsdp/fully_sharded_data_parallel.py       | 12 ++++----
 3 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index f2ae0dcfcaeaf..ca566b984002a 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -206,8 +206,8 @@ def forward(self, x, y):
             loss.backward()
 
             # self.a receives grad, self.b does not
-            a_grad = fsdp.module.a._fsdp_wrapped_module.flat_param.grad
-            b_grad = fsdp.module.b._fsdp_wrapped_module.flat_param.grad
+            a_grad = fsdp.module.a._handles[0].flat_param.grad
+            b_grad = fsdp.module.b._handles[0].flat_param.grad
             self.assertIsNotNone(a_grad)
             self.assertIsNone(b_grad)
 
diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py
index 29bf252b796fd..d78aa81a19d7a 100644
--- a/test/distributed/fsdp/test_fsdp_summon_full_params.py
+++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py
@@ -52,10 +52,8 @@ def _run_test_summon_full_param_writeback(
         model = wrap(nn.Sequential(lin1, lin2))
 
     # set the value
-    outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param")
-    inner_param = model.get_parameter(
-        "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param"
-    )
+    outer_param = model._handles[0].flat_param
+    inner_param = model.module[0]._handles[0].flat_param
     p = outer_param if modify_outer else inner_param
 
     with torch.no_grad():
@@ -176,10 +174,8 @@ def test_summon_full_param_recursive(self, recurse, summon_outer, mixed_precisio
         shard_inner_numel = int(math.ceil(global_inner_numel / self.world_size))
         shard_outer_numel = int(math.ceil(global_outer_numel / self.world_size))
 
-        outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param")
-        inner_param = model.get_parameter(
-            "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param"
-        )
+        outer_param = model._handles[0].flat_param
+        inner_param = model.module[0]._handles[0].flat_param
         self.assertEqual(shard_outer_numel, outer_param.numel())
         self.assertEqual(shard_inner_numel, inner_param.numel())
 
@@ -259,10 +255,8 @@ def _test_summon_full_params_respects_reshard_after_forward(
             **fsdp_kwargs,
         ).cuda(self.rank)
 
-        outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param")
-        inner_param = model.get_parameter(
-            "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param"
-        )
+        outer_param = model._handles[0].flat_param
+        inner_param = model.module[0]._handles[0].flat_param
         outer_full_param_size = outer_param.numel() * self.world_size
 
         # trigger lazy init
@@ -285,7 +279,7 @@ def _test_summon_full_params_respects_reshard_after_forward(
     def test_summon_single_param(self):
         model = FSDP(nn.Linear(1, 1, bias=False)).cuda(self.rank)
 
-        p = model.get_parameter("_fsdp_wrapped_module.flat_param")
+        p = model._handles[0].flat_param
         self.assertEqual(1, p.numel())
 
         with torch.no_grad():
@@ -388,10 +382,8 @@ def test_reshard_outside_forward_backward_iteration(
             mixed_precision=mixed_precision,
         ).cuda(self.rank)
 
-        outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param")
-        inner_param = model.get_parameter(
-            "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param"
-        )
+        outer_param = model._handles[0].flat_param
+        inner_param = model.module[0]._handles[0].flat_param
         outer_full_param_size = outer_param.numel() * self.world_size
 
         # First lets validate our assumption about resharding
@@ -451,7 +443,7 @@ def test_params_are_unflattenned(self, rank0_only, offload_to_cpu, mixed_precisi
         )
 
         def _get_flat_param():
-            return fsdp_model.get_parameter("_fsdp_wrapped_module.flat_param")
+            return fsdp_model._handles[0].flat_param
 
         flattened_param = _get_flat_param()
         self.assertEqual(layer_shape[0] * layer_shape[1] / 2, flattened_param.numel())
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 31bb2d5000b2b..338a232a4271a 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1631,7 +1631,7 @@ def module(self) -> nn.Module:
         return self._fsdp_wrapped_module.module
 
     def __getattr__(self, name: str) -> Any:
-        """Forward missing attributes to wrapped module."""
+        """Forward missing attributes to the wrapped module."""
         try:
             return super().__getattr__(name)  # defer to nn.Module's logic
         except AttributeError:
@@ -2538,7 +2538,7 @@ def state_dict(self, *args, **kwargs):
             self._state_dict_type == StateDictType.SHARDED_STATE_DICT
         ):
             if (
-                self._fsdp_wrapped_module.flat_param is not None and
+                self._fsdp_wrapped_module.has_params and
                 not self._fsdp_wrapped_module.handle.uses_sharded_strategy
             ):
                 raise RuntimeError(
@@ -2606,8 +2606,8 @@ def _local_pre_load_state_dict_hook(
         _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_WRAPPED_MODULE}.")
         fqn = f"{prefix}{FSDP_WRAPPED_MODULE}.{FLAT_PARAM}"
         if fqn not in state_dict:
-            assert getattr(self._fsdp_wrapped_module, FLAT_PARAM, None) is None, (
-                "No flat parameter in state_dict but self._fsdp_wrapped_module.flat_param is not None"
+            assert not self._fsdp_wrapped_module.has_params, (
+                "No `FlatParameter` in `state_dict` for this FSDP instance but it has parameters"
             )
             return
         load_tensor = state_dict[fqn]
@@ -2622,7 +2622,7 @@ def _local_pre_load_state_dict_hook(
 
         # Get the metada of the flat_param to decide whether to pad the loaded
         # tensor.
-        flat_param = self._fsdp_wrapped_module.flat_param
+        flat_param = self._handles[0].flat_param
         assert flat_param is not None
         if flat_param._shard_numel_padded not in (0, flat_param.numel()):
             assert load_tensor.numel() < flat_param.numel(), (
@@ -2694,7 +2694,7 @@ def _sharded_pre_load_state_dict_hook(
             nonsharded_tensors.append(tensor)
 
         # Create a new flat_param from the loaded, non-sharded tensors.
-        flat_param = self._fsdp_wrapped_module.flat_param
+        flat_param = self._handles[0].flat_param
         loaded_flat_param = FlatParamHandle.flatten_params(nonsharded_tensors, requires_grad=False)
 
         # Get the chunk from the loaded flat_param for the local rank.

From b142154711e9186446364a4d1a1682ab3d57ee0b Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 21 Oct 2022 11:30:57 +0000
Subject: [PATCH 0010/1922] [FSDP][3/N] Register `flat_param` to wrapped module
 (#87086)

This PR registers each `FlatParameter` to the wrapped module, eliminating `FlattenParamsWrapper` usage completely from FSDP.

Registering each `FlatParameter` to the wrapped module is preferred over registering to the `FullyShardedDataParallel` instance for both functional-like and non-recursive wrapping. It simplifies the `FlatParameter` naming to be a function of the number of `FlatParameter`s per wrapped module instead of the number of `FlatParameter`s per FSDP instance. For now, we assume 1 `FlatParameter` per wrapped module, so we can simply use a single name `FLAT_PARAM = _flat_param`.

From an implementation perspective, we raise some methods from `FlattenParamsWrapper` directly up to `FullyShardedDataParallel`. There will need to be further refactoring for functional-like and non-recursive wrapping. For example, the property `self._has_params -> bool` may need to change to a method `self._has_params(wrapped_module) -> bool`. Such changes are out of scope for this PR and will be done in follow-ups.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87086
Approved by: https://github.com/zhaojuanmao
---
 test/distributed/fsdp/test_fsdp_state_dict.py |  20 +-
 torch/distributed/fsdp/flat_param.py          |   2 +-
 .../fsdp/fully_sharded_data_parallel.py       | 202 +++++++++++-------
 3 files changed, 133 insertions(+), 91 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py
index af56ee956743f..6592ec108f074 100644
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@@ -23,9 +23,7 @@
     StateDictType,
 )
 from torch.distributed.fsdp._shard_utils import _gather_state_dict
-from torch.distributed.fsdp.fully_sharded_data_parallel import (
-    FullyShardedDataParallel,
-)
+from torch.distributed.fsdp.fully_sharded_data_parallel import FLAT_PARAM
 from torch.distributed.fsdp.wrap import (
     enable_wrap,
     transformer_auto_wrap_policy,
@@ -124,10 +122,14 @@ def _broadcast_state_dict(self, state_dict):
         return olist[0]
 
     def _compare_models(self, model, model_new, assert_fn, check_fp16=False):
-        with FullyShardedDataParallel.summon_full_params(model):
-            with FullyShardedDataParallel.summon_full_params(model_new):
+        assert assert_fn in (self.assertEqual, self.assertNotEqual)
+        with FSDP.summon_full_params(model):
+            with FSDP.summon_full_params(model_new):
                 params = list(model.parameters())
                 params_new = list(model_new.parameters())
+                # Regardless of `assert_fn`, the number of parameters should be
+                # the same
+                self.assertEqual(len(params), len(params_new))
                 assert_fn(params, params_new)
                 if check_fp16:
                     for tensor in model_new.parameters():
@@ -327,8 +329,8 @@ def test_state_dict_rank0_offload_save_load_flow(self, use_orig_params: bool):
                 assert_fn=self.assertEqual,
             )
         # Check FSDP models correctly loaded the checkpoint
-        with FullyShardedDataParallel.summon_full_params(fsdp_model):
-            with FullyShardedDataParallel.summon_full_params(new_fsdp_model):
+        with FSDP.summon_full_params(fsdp_model):
+            with FSDP.summon_full_params(new_fsdp_model):
                 params = list(fsdp_model.parameters())
                 params_new = list(new_fsdp_model.parameters())
                 self.assertEqual(params, params_new)
@@ -570,7 +572,7 @@ def test_state_dict_save_load_flow(self, state_dict_type):
     def test_fsdp_state_dict_keys(self, state_dict_type):
         state_dict = self._state_dict(self._initialize_model(True), state_dict_type)
         if state_dict_type == "local_state_dict":
-            self.assertEqual(set(["flat_param", "inner.flat_param"]), state_dict.keys())
+            self.assertEqual(set([FLAT_PARAM, f"inner.{FLAT_PARAM}"]), state_dict.keys())
         elif state_dict_type in ("state_dict", "sharded_state_dict"):
             # Keys should match local model.
             local_model = self._initialize_model(wrap_fsdp=False, wrap_ddp=False)
@@ -606,7 +608,7 @@ def test_state_dict_load_into_local_module(
             optim.step()
             optim.zero_grad()
 
-        with FullyShardedDataParallel.summon_full_params(model):
+        with FSDP.summon_full_params(model):
             fsdp_params = deepcopy(list(model.parameters()))
 
         # get FSDP state_dict. Note that by default we return full_state_dict.
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 6e30a031a16c7..2c65dd80ea3c3 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -297,7 +297,7 @@ def __init__(
         device: torch.device,
         config: HandleConfig,
         use_orig_params: bool,
-    ) -> None:
+    ):
         super().__init__()
         self.device = device
         self._config = config
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 338a232a4271a..2cf3af6d540c0 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -83,11 +83,6 @@
     HandleShardingStrategy,
     HandleTrainingState,
 )
-from .flatten_params_wrapper import (
-    FLAT_PARAM,
-    FPW_MODULE,
-    FlattenParamsWrapper,
-)
 from .wrap import (
     ParamExecOrderWrapPolicy,
     _or_policy,
@@ -120,8 +115,12 @@
 ]
 
 
+# NOTE: `FSDP_WRAPPED_MODULE` cannot be a substring of any other module wrapper
+# name (e.g. for activation checkpointing) since then `replace()`-based FQN
+# cleaning breaks.
 FSDP_WRAPPED_MODULE = "_fsdp_wrapped_module"
-FSDP_PREFIX = FSDP_WRAPPED_MODULE + "." + FPW_MODULE + "."
+FSDP_PREFIX = FSDP_WRAPPED_MODULE + "."
+FLAT_PARAM = "_flat_param"
 
 _PARAM_BROADCAST_BUCKET_SIZE = int(250 * 1024 * 1024)
 
@@ -1087,27 +1086,22 @@ def __init__(
             self.mixed_precision.reduce_dtype,
             self.mixed_precision.keep_low_precision_grads,
         )
-        self._fsdp_wrapped_module = FlattenParamsWrapper(
-            module,
-            params_to_flatten,
-            self.compute_device,
-            config,
-            use_orig_params,
-        )
-        if not use_orig_params:
-            self._check_orig_params_flattened(ignored_params)
         # Invariant: `self.params` contains exactly the `FlatParameter`s of the
         # handles in `self._handles`
         self._handles: List[FlatParamHandle] = []
         self.params: List[FlatParameter] = []
-        if self._fsdp_wrapped_module.has_params:
-            handle = self._fsdp_wrapped_module.handle
+        self._fsdp_wrapped_module = module
+        if params_to_flatten:
+            handle = FlatParamHandle(params_to_flatten, module, self.compute_device, config, use_orig_params)
+            self._handles.append(handle)
             self.params.append(handle.flat_param)
             self._register_param_handle(handle)
             handle.shard(self.process_group)
             if self.cpu_offload.offload_params and handle.flat_param.device != torch.device("cpu"):
-                with torch.no_grad():
-                    handle.flat_param_to(torch.device("cpu"))
+                handle.flat_param_to(torch.device("cpu"))
+        if not use_orig_params:
+            self._check_orig_params_flattened(ignored_params)
+            self._register_flat_param()
 
         self._sync_gradients = True
         self._communication_hook = self._get_default_comm_hook()
@@ -1190,7 +1184,7 @@ def _get_ignored_modules(
             child
             for module in ignored_root_modules
             for child in module.modules()
-            if not isinstance(child, (FullyShardedDataParallel, FlattenParamsWrapper))
+            if not isinstance(child, FullyShardedDataParallel)
         )
         if root_module in ignored_modules:
             warnings.warn(
@@ -1243,13 +1237,10 @@ def _get_buffer_names(self, root_module: nn.Module) -> Set[str]:
         """
 
         def module_fn(module: nn.Module, prefix: str, buffer_names: Set[str]):
-            # For FSDP modules, only add the entry when considering the
-            # contained `FlattenParamsWrapper` to avoid duplication
-            if not isinstance(module, FullyShardedDataParallel):
-                for buffer_name, _ in module.named_buffers(recurse=False):
-                    # Clean module wrapper prefixes in case of nested wrapping
-                    prefixed_buffer_name = clean_tensor_name(prefix + buffer_name)
-                    buffer_names.add(prefixed_buffer_name)
+            for buffer_name, _ in module.named_buffers(recurse=False):
+                # Clean module wrapper prefixes in case of nested wrapping
+                prefixed_buffer_name = clean_tensor_name(prefix + buffer_name)
+                buffer_names.add(prefixed_buffer_name)
 
         def return_fn(buffer_names: Set[str], *args):
             return buffer_names
@@ -1627,8 +1618,16 @@ def module(self) -> nn.Module:
         """
         Returns the wrapped module (like :class:`DistributedDataParallel`).
         """
-        assert isinstance(self._fsdp_wrapped_module, FlattenParamsWrapper)
-        return self._fsdp_wrapped_module.module
+        return self._fsdp_wrapped_module
+
+    @property
+    def _has_params(self) -> bool:
+        """Returns whether this FSDP instance manages any parameters."""
+        return hasattr(self, "_handles") and len(self._handles) > 0
+
+    @property
+    def _flat_param(self) -> Optional[FlatParameter]:
+        return self._handles[0].flat_param if self._handles else None
 
     def __getattr__(self, name: str) -> Any:
         """Forward missing attributes to the wrapped module."""
@@ -1638,7 +1637,7 @@ def __getattr__(self, name: str) -> Any:
             return getattr(self._fsdp_wrapped_module, name)
 
     def __getitem__(self, key: int) -> Any:
-        """Forward indexing calls in case the module is a nn.Sequential."""
+        """Forward indexing calls in case the module is an ``nn.Sequential``."""
         return self._fsdp_wrapped_module.__getitem__(key)  # type: ignore[operator]
 
     def check_is_root(self) -> bool:
@@ -2228,8 +2227,8 @@ def state_dict_type(
                 )
 
     def _convert_to_wrapped_module_name(self, module_name: str) -> str:
-        module_name = module_name.replace(f"{FPW_MODULE}.", "")
-        module_name = module_name.replace(f"{FPW_MODULE}", "")
+        module_name = module_name.replace(f"{FSDP_PREFIX}", "")
+        module_name = module_name.replace(f"{FSDP_WRAPPED_MODULE}", "")
         if module_name:
             module_name = f"{module_name}."
         # Activation checkpoint adds a prefix that has to be
@@ -2241,6 +2240,8 @@ def _convert_to_wrapped_module_name(self, module_name: str) -> str:
 
     @property
     def _param_fqns(self) -> Iterator[Tuple[str, str, str]]:
+        if not self._has_params:
+            return
         for param_name, module_name in (
             self._handles[0].parameter_module_names()
         ):
@@ -2266,12 +2267,12 @@ def _full_post_state_dict_hook(
         Hook that runs after model.state_dict() is called before returning result to
         user. For FSDP, we may have to clone the tensors in state_dict as params go
         back to sharded version after _summon_full_params ends, and also remove
-        "_fsdp_wrapped_module" prefix.
+        the ``FSDP_WRAPPED_MODULE`` prefix.
         """
-        _replace_by_prefix(state_dict, prefix + f"{FSDP_WRAPPED_MODULE}.", prefix)
+        _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix)
         self._assert_state([TrainingState_.SUMMON_FULL_PARAMS])
         # Return early for trivial cases
-        if not state_dict or not self._fsdp_wrapped_module.has_params:
+        if not state_dict or not self._has_params:
             return state_dict
 
         # If a rank has already exited the `summon_full_params()` context here
@@ -2285,7 +2286,7 @@ def _full_post_state_dict_hook(
         if (
             (
                 not self._use_orig_params
-                and "flat_param" in self._fsdp_wrapped_module._parameters
+                and FLAT_PARAM in self.module._parameters
             )
             or (
                 self._use_orig_params
@@ -2299,8 +2300,8 @@ def _full_post_state_dict_hook(
         offload_to_cpu = self._state_dict_config.offload_to_cpu
         cpu_device = torch.device("cpu")
 
-        # Loop only the parameters saved in self._fsdp_wrapped_module to avoid
-        # processing buffers.
+        # Loop only the parameters saved in this instance's wrapped module to
+        # avoid processing buffers.
         for fqn, param_name, module_name in self._param_fqns:
             fqn = f"{prefix}{fqn}"
             clean_key = fqn
@@ -2361,16 +2362,16 @@ def _local_post_state_dict_hook(
         the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy
         will happen. The underlying storage is the same.
         """
-        _replace_by_prefix(state_dict, f"{prefix}{FSDP_WRAPPED_MODULE}.", prefix)
-        if not self._fsdp_wrapped_module.has_params:
+        _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix)
+        if not self._has_params:
             return state_dict
 
         # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor
         # value as the flat_param but it is a pure Tensor because
         # nn.Module.state_dict() will detach the parameter. Therefore, we need
-        # to get flat_param from the FlattenParamsWrapper to get the metadata.
-        flat_param = getattr(self._fsdp_wrapped_module, FLAT_PARAM, None)
-        assert flat_param is not None
+        # to get flat_param to get the metadata.
+        assert self._handles, "Should have returned early"
+        flat_param = self._handles[0].flat_param
         # Construct a ShardedTensor from the flat_param.
         full_numel = flat_param._unpadded_unsharded_size.numel()  # type: ignore[attr-defined]
         shard_offset = flat_param.numel() * self.rank
@@ -2398,8 +2399,8 @@ def _sharded_post_state_dict_hook(
         The hook replaces the unflattened, unsharded parameter in the state_dict
         with a unflattened, sharded parameter (a ShardedTensor).
         """
-        _replace_by_prefix(state_dict, f"{prefix}{FSDP_WRAPPED_MODULE}.", prefix)
-        if not self._fsdp_wrapped_module.has_params:
+        _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix)
+        if not self._has_params:
             return state_dict
 
         assert self.training_state != TrainingState_.SUMMON_FULL_PARAMS, (
@@ -2538,8 +2539,8 @@ def state_dict(self, *args, **kwargs):
             self._state_dict_type == StateDictType.SHARDED_STATE_DICT
         ):
             if (
-                self._fsdp_wrapped_module.has_params and
-                not self._fsdp_wrapped_module.handle.uses_sharded_strategy
+                self._has_params and
+                not self._handles[0].uses_sharded_strategy
             ):
                 raise RuntimeError(
                     "sharded_state_dict/local_state_dict can only be called "
@@ -2588,7 +2589,7 @@ def _full_pre_load_state_dict_hook(
             recurse=False, writeback=True
         )
         self._full_param_ctx.__enter__()
-        _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_WRAPPED_MODULE}.")
+        _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
 
     def _local_post_load_state_dict_hook(self, *args, **kwargs) -> None:
         pass
@@ -2603,10 +2604,10 @@ def _local_pre_load_state_dict_hook(
         state_dict. The flat_param should be a ShardedTensor. This hook converts
         the ShardedTensor to a tensor. No copy happen unless padding is required.
         """
-        _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_WRAPPED_MODULE}.")
-        fqn = f"{prefix}{FSDP_WRAPPED_MODULE}.{FLAT_PARAM}"
+        _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_PREFIX}")
+        fqn = f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}"
         if fqn not in state_dict:
-            assert not self._fsdp_wrapped_module.has_params, (
+            assert not self._has_params, (
                 "No `FlatParameter` in `state_dict` for this FSDP instance but it has parameters"
             )
             return
@@ -2645,11 +2646,11 @@ def _sharded_pre_load_state_dict_hook(
         The hook combines the unflattened, sharded parameters (ShardedTensor) to
         a new FlatParameter and shards the new FlatParameter to the local chunk.
         """
-        _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_WRAPPED_MODULE}.")
-        if not self._fsdp_wrapped_module.has_params:
+        _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
+        if not self._has_params:
             return
 
-        if not self._fsdp_wrapped_module.handle.uses_sharded_strategy:
+        if not self._handles[0].uses_sharded_strategy:
             raise RuntimeError(
                 "load_sharded_state_dict can only be called when parameters "
                 "are flatten and sharded."
@@ -2663,7 +2664,7 @@ def _sharded_pre_load_state_dict_hook(
         # https://github.com/pytorch/pytorch/issues/77461
         shared_fqns = [fqn for fqn, _, _ in self._shared_param_fqns]
         for fqn, _, _ in self._param_fqns:
-            full_fqn = f"{prefix}{FSDP_WRAPPED_MODULE}.{fqn}"
+            full_fqn = f"{prefix}{FSDP_PREFIX}{fqn}"
             param = state_dict.pop(full_fqn)
             if fqn in shared_fqns:
                 continue
@@ -2710,7 +2711,7 @@ def _sharded_pre_load_state_dict_hook(
             f"The loaded local chunk has different padding({num_to_pad}) "
             f"from the local chunk {flat_param._shard_numel_padded}."
         )
-        state_dict[f"{prefix}_fsdp_wrapped_module.flat_param"] = loaded_flat_param
+        state_dict[f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}"] = loaded_flat_param
         if self._use_orig_params:
             self._deregister_orig_params()
 
@@ -3157,7 +3158,7 @@ def _summon_full_params(
                         # move parameters.
                 # TODO (awgu): This FPW call assumes 1 `FlatParameter`
                 if not self._use_orig_params:
-                    stack.enter_context(self._fsdp_wrapped_module.unflatten_as_params())
+                    stack.enter_context(self._unflatten_as_params())
                 try:
                     yield
                 finally:
@@ -3215,6 +3216,50 @@ def _writeback_to_local_shard(
                     )
                     existing_grad[:grad_shard.numel()].copy_(grad_shard)
 
+    @contextlib.contextmanager
+    def _unflatten_as_params(self) -> Generator:
+        """
+        Assumes that the flattened parameter is unsharded. When in the context,
+        de-registers the flattened parameter and unflattens the original
+        parameters as ``nn.Parameter`` views into the flattened parameter.
+        After the context, re-registers the flattened parameter and restores
+        the original parameters as ``Tensor`` views into the flattened
+        parameter.
+        """
+        if not self._handles:
+            yield
+        else:
+            self._deregister_flat_param()
+            try:
+                with self._handles[0].unflatten_as_params():
+                    yield
+            finally:
+                if not self._handles[0]._use_orig_params:
+                    self._register_flat_param()
+
+    def _register_flat_param(self):
+        """
+        Registers the flattened parameter to the wrapped module, making it
+        visible to ``nn.Module`` methods.
+
+        We do not use :meth:`nn.Module.register_parameter` because we want
+        ``FLAT_PARAM`` to always be an attribute but dynamically change whether
+        it is visible to ``nn.Module`` methods.
+        """
+        if self._has_params:
+            self.module._parameters[FLAT_PARAM] = self._handles[0].flat_param
+
+    def _deregister_flat_param(self):
+        """
+        De-registers the flattened parameter from the wrapped module, hiding it
+        from ``nn.Module`` methods.
+
+        We do not use ``del`` because we want ``FLAT_PARAM`` to always be an
+        attribute but dynamically change whether it is visible to ``nn.Module``
+        methods.
+        """
+        self.module._parameters.pop(FLAT_PARAM, None)
+
     @contextlib.contextmanager
     def _deregister_orig_params_ctx(self):
         """
@@ -3254,7 +3299,7 @@ def _deregister_orig_params(self):
             f"handle: {handle._use_orig_params}"
         )
         handle._deregister_orig_params()
-        self._fsdp_wrapped_module._register_flat_param()
+        self._register_flat_param()
 
     def _register_orig_params(self):
         """
@@ -3263,7 +3308,7 @@ def _register_orig_params(self):
         if not self._handles:
             return
         handle = self._handles[0]
-        self._fsdp_wrapped_module._deregister_flat_param()
+        self._deregister_flat_param()
         if handle.is_sharded(handle.flat_param):
             handle._use_sharded_views()
             handle._use_sharded_grad_views()
@@ -4613,25 +4658,22 @@ def _get_param_to_unflat_param_names(
             unflattened parameter names.
     """
     def module_fn(module, prefix, param_to_unflat_param_names):
-        # For FSDP modules, only add the entry when considering the contained
-        # `FlattenParamsWrapper` to avoid duplication
-        if not isinstance(module, FullyShardedDataParallel):
-            for param_name, param in module.named_parameters(recurse=False):
-                module_prefixed_param_names = (
-                    param._fqns if type(param) is FlatParameter
-                    else [param_name]
-                )  # prefixed from `module`
-                fully_prefixed_param_names = [
-                    clean_tensor_name(prefix + name)
-                    for name in module_prefixed_param_names
-                ]  # fully prefixed from the top level including `prefix`
-                # If this parameter has already been visited, then it is a
-                # shared parameter; then, only take the first parameter name
-                is_shared_param = param in param_to_unflat_param_names
-                if not is_shared_param:
-                    param_to_unflat_param_names[param] = fully_prefixed_param_names
-                elif not dedup_shared_params:
-                    param_to_unflat_param_names[param].extend(fully_prefixed_param_names)
+        for param_name, param in module.named_parameters(recurse=False):
+            module_prefixed_param_names = (
+                param._fqns if type(param) is FlatParameter
+                else [param_name]
+            )  # prefixed from `module`
+            fully_prefixed_param_names = [
+                clean_tensor_name(prefix + name)
+                for name in module_prefixed_param_names
+            ]  # fully prefixed from the top level including `prefix`
+            # If this parameter has already been visited, then it is a
+            # shared parameter; then, only take the first parameter name
+            is_shared_param = param in param_to_unflat_param_names
+            if not is_shared_param:
+                param_to_unflat_param_names[param] = fully_prefixed_param_names
+            elif not dedup_shared_params:
+                param_to_unflat_param_names[param].extend(fully_prefixed_param_names)
 
     def return_fn(param_to_unflat_param_names):
         return param_to_unflat_param_names
@@ -4684,9 +4726,7 @@ def _get_param_name_to_param(
 def clean_tensor_name(tensor_name: str) -> str:
     """Cleans the parameter or buffer name by removing any module wrapper
     prefixes."""
-    # Call `replace()` twice separately since the name may not have both
-    tensor_name = tensor_name.replace(FSDP_WRAPPED_MODULE + ".", "")
-    tensor_name = tensor_name.replace(FPW_MODULE + ".", "")
+    tensor_name = tensor_name.replace(FSDP_PREFIX, "")
     # TODO: Explicitly replacing checkpoint_wrapper prefix is not ideal,
     # as it increases coupling between CheckpointWrapper and FSDP. This is also not
     # scalable for additional wrapped modules, we should come up with a general solution

From 1b165436c7087f898756d6f9224bc08614ed1038 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 21 Oct 2022 11:30:57 +0000
Subject: [PATCH 0011/1922] [FSDP][4/N] Rework FPW test to not use FPW (#87112)

Testing coverage is pretty much preserved except that we do not test on CPU, which is not a tangible loss for FSDP anyway.

I renamed a few tests slightly, and I moved some helpers to be immediately below the corresponding test method. This makes it a bit easier to read.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87112
Approved by: https://github.com/zhaojuanmao
---
 ...wrapper.py => test_fsdp_flatten_params.py} | 375 +++++++++++-------
 1 file changed, 227 insertions(+), 148 deletions(-)
 rename test/distributed/fsdp/{test_flatten_params_wrapper.py => test_fsdp_flatten_params.py} (51%)

diff --git a/test/distributed/fsdp/test_flatten_params_wrapper.py b/test/distributed/fsdp/test_fsdp_flatten_params.py
similarity index 51%
rename from test/distributed/fsdp/test_flatten_params_wrapper.py
rename to test/distributed/fsdp/test_fsdp_flatten_params.py
index 016398c88deba..cfc2a494d4406 100644
--- a/test/distributed/fsdp/test_flatten_params_wrapper.py
+++ b/test/distributed/fsdp/test_fsdp_flatten_params.py
@@ -1,44 +1,45 @@
 # Owner(s): ["oncall: distributed"]
 
 import sys
-import unittest
 
 import torch
+import torch.nn as nn
 from torch import distributed as dist
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.flat_param import (
+    FlatParamHandle,
     FlatParamShardMetadata,
     HandleConfig,
     HandleShardingStrategy,
 )
-from torch.distributed.fsdp.flatten_params_wrapper import FlattenParamsWrapper
-from torch.testing._internal.common_utils import TestCase, run_tests
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_fsdp import FSDPTest
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
 
+if TEST_WITH_DEV_DBG_ASAN:
+    print(
+        "Skip dev-asan as torch + multiprocessing spawn have known issues",
+        file=sys.stderr,
+    )
+    sys.exit(0)
 
-class TestFlattenParams(TestCase):
-    """Base test class and used for CPU case."""
-
-    def _get_default_config(self):
-        return HandleConfig(HandleShardingStrategy.FULL_SHARD, False, None, None)
-
-    def _get_empty_module(self, seed=0):
-        torch.manual_seed(seed)  # keep everything deterministic
 
-        class Test(torch.nn.Module):
-            def forward(self, x):
-                return x + 1
+class TestFlattenParams(FSDPTest):
+    """Tests parameter flattening and shard metadata logic."""
 
-        module = Test()
+    @property
+    def world_size(self) -> int:
+        # Clamp the world size to 1 since these unit tests either exercise only
+        # the flattening logic or check sharding subroutines directly without
+        # requiring multiple ranks
+        return 1
 
-        def get_input(device, dtype):
-            torch.manual_seed(1)  # keep everything deterministic
-            return torch.rand(1).to(device=device, dtype=dtype)
-
-        module.get_input = get_input
-        return module
+    def _get_default_config(self):
+        return HandleConfig(HandleShardingStrategy.FULL_SHARD, False, None, None)
 
     def _get_transformer(self, seed=0):
         torch.manual_seed(seed)  # keep everything deterministic
@@ -68,152 +69,243 @@ def _get_shared_params_transformer(self, seed=0):
             dec_layer.linear2.weight = enc_layer.linear2.weight
         return module
 
-    def _get_output(self, module):
-        device = next(module.parameters()).device
-        dtype = next(module.parameters()).dtype
-        input = module.get_input(device, dtype)
-        return module(*input)
-
-    def _get_pnorm_after_step(self, module):
-        optim = torch.optim.SGD(module.parameters(), lr=0.01)
-        loss = self._get_output(module).sum()
-        loss.backward()
-        optim.step()
-        return torch.norm(torch.stack([p.detach().norm() for p in module.parameters()]))
-
-    def _test_num_params(self, module):
-        ref_num_params = sum(p.numel() for p in module.parameters())
-
-        params_to_flatten = list(module.parameters())
-        flat_module = FlattenParamsWrapper(
-            module,
-            params_to_flatten,
-            torch.device("cuda"),
-            self._get_default_config(),
-            False,
-        )
-        flat_num_params = sum(p.numel() for p in flat_module.parameters())
-
-        self.assertEqual(ref_num_params, flat_num_params)
-        self.assertEqual(flat_num_params, flat_module.flat_param.numel())
-
-    def _test_output(self, module):
-        ref_output = self._get_output(module)
-
-        params_to_flatten = list(module.parameters())
-        flat_module = FlattenParamsWrapper(
-            module,
-            params_to_flatten,
-            torch.device("cuda"),
-            self._get_default_config(),
-            False,
+    @skip_if_lt_x_gpu(1)
+    def test_partial_flattening(self):
+        """Tests flattening some submodules but not others."""
+        self.run_subtests(
+            {"half": [False, True]},
+            self._test_partial_flattening,
         )
-        flat_output = self._get_output(flat_module)
-        self.assertEqual(ref_output, flat_output)
 
-    def test_partial_flattening(self):
+    def _test_partial_flattening(self, half: bool):
         module = self._get_transformer()
-        num_params = sum(p.numel() for p in module.parameters())
-
-        params_to_flatten = list(module.encoder.layers[1].parameters()) + list(
-            module.decoder.layers[0].parameters()
+        if half:
+            module = module.half()
+        numel = sum(p.numel() for p in module.parameters())
+
+        encoder_1_params = list(module.encoder.layers[1].parameters())
+        decoder_0_params = list(module.decoder.layers[0].parameters())
+        params_to_flatten = encoder_1_params + decoder_0_params
+        num_params = [len(encoder_1_params), len(decoder_0_params)]
+        numel_to_flatten = sum(p.numel() for p in params_to_flatten)
+        module.encoder.layers[1] = FSDP(module.encoder.layers[1])
+        module.decoder.layers[0] = FSDP(module.decoder.layers[0])
+        flat_params = [
+            module.encoder.layers[1]._flat_param,
+            module.decoder.layers[0]._flat_param,
+        ]
+
+        self.assertEqual(sum(fp.numel() for fp in flat_params), numel_to_flatten)
+        self.assertEqual(sum(p.numel() for p in module.parameters()), numel)
+
+        # Check that flattened parameters have been replaced with a single
+        # `FlatParameter`
+        self.assertEqual(len(list(module.encoder.layers[1].parameters())), 1)
+        self.assertEqual(len(list(module.decoder.layers[0].parameters())), 1)
+
+        # Check that non-flattened parameters remain
+        self.assertEqual(
+            len(list(module.encoder.layers[0].parameters())), num_params[0]
         )
-        num_params_to_flatten = sum(p.numel() for p in params_to_flatten)
-
-        module = FlattenParamsWrapper(
-            module,
-            params_to_flatten,
-            torch.device("cuda"),
-            self._get_default_config(),
-            False,
+        self.assertEqual(
+            len(list(module.decoder.layers[1].parameters())), num_params[1]
         )
-        self.assertEqual(module.flat_param.numel(), num_params_to_flatten)
-        self.assertEqual(sum(p.numel() for p in module.parameters()), num_params)
-
-        # flattened parameters are removed
-        self.assertEqual(len(list(module.encoder.layers[1].parameters())), 0)
-        self.assertEqual(len(list(module.decoder.layers[0].parameters())), 0)
-
-        # non-flattened parameters remain
-        self.assertGreater(len(list(module.encoder.layers[0].parameters())), 0)
-        self.assertGreater(len(list(module.decoder.layers[1].parameters())), 0)
 
-        # test that changing the module dtype works properly
+        # Check that calling `module.to()` affects the `FlatParameter`s
         orig_dtype = params_to_flatten[0].dtype
         new_dtype = torch.float32 if orig_dtype == torch.float16 else torch.float16
-        self.assertEqual(module.flat_param.dtype, orig_dtype)
+        for flat_param in flat_params:
+            self.assertEqual(flat_param.dtype, orig_dtype)
         self.assertTrue(
             all(p.dtype == orig_dtype for p in module.encoder.layers[0].parameters())
         )
         module = module.to(dtype=new_dtype)
-        self.assertEqual(module.flat_param.dtype, new_dtype)
+        for flat_param in flat_params:
+            self.assertEqual(flat_param.dtype, new_dtype)
         self.assertTrue(
             all(p.dtype == new_dtype for p in module.encoder.layers[0].parameters())
         )
 
     def test_flatten_nothing(self):
-        module = self._get_transformer()
-        module = FlattenParamsWrapper(
-            module,
-            [],
-            torch.device("cuda"),
-            self._get_default_config(),
-            False,
+        """
+        Tests that constructing a ``FlatParamHandle`` with no parameters
+        raises an error.
+        """
+        self.run_subtests(
+            {"half": [False, True]},
+            self._test_flatten_nothing,
         )
-        self.assertIsNone(module.flat_param)
 
+    def _test_flatten_nothing(self, half: bool):
+        module = self._get_transformer()
+        if half:
+            module = module.half()
+        with self.assertRaisesRegex(
+            ValueError,
+            "Cannot initialize a `FlatParameter` from an empty parameter list",
+        ):
+            FlatParamHandle(
+                [],
+                module,
+                torch.device("cuda"),
+                self._get_default_config(),
+                False,
+            )
+
+    @skip_if_lt_x_gpu(1)
     def test_empty_module(self):
+        """
+        Tests flattening an empty module (i.e. one without any parameters).
+        """
         module = self._get_empty_module()
         in_data = torch.rand(1)
         ref_out = module(in_data)
-        module = FlattenParamsWrapper(
-            module,
-            [],
-            torch.device("cuda"),
-            self._get_default_config(),
-            False,
+        fsdp_module = FSDP(module)
+        self.assertEqual(len(list(fsdp_module.parameters())), 0)
+        self.assertIsNone(fsdp_module._flat_param)
+        fsdp_out = fsdp_module(in_data)
+        self.assertEqual(ref_out, fsdp_out)
+
+    def _get_empty_module(self):
+        """Returns a module with no parameters."""
+        torch.manual_seed(0)  # keep everything deterministic
+
+        class EmptyModule(torch.nn.Module):
+            def forward(self, x):
+                return x + 1
+
+            def get_input(self, device, dtype):
+                torch.manual_seed(1)  # keep everything deterministic
+                return torch.rand(1).to(device=device, dtype=dtype)
+
+        return EmptyModule()
+
+    def test_numel_without_shared_params(self):
+        """
+        Tests that numel is preserved after flattening when there are no shared
+        parameters in the module.
+        """
+        self.run_subtests(
+            {"half": [False, True]},
+            self._test_numel_without_shared_params,
         )
-        self.assertEqual(len(list(module.parameters())), 0)
-        self.assertIsNone(module.flat_param)
-        fpw_out = module(in_data)
-        self.assertEqual(ref_out, fpw_out)
 
-    def test_num_params(self):
+    def _test_numel_without_shared_params(self, half: bool):
         module = self._get_transformer()
-        self._test_num_params(module)
+        if half:
+            module = module.half()
+        self._test_numel(module)
+
+    def test_numel_with_shared_params(self):
+        """
+        Tests that numel is preserved after flattening when there are shared
+        parameters in the module.
+        """
+        self.run_subtests(
+            {"half": [False, True]},
+            self._test_numel_with_shared_params,
+        )
 
-    def test_shared_params_num_params(self):
+    def _test_numel_with_shared_params(self, half: bool):
         module = self._get_shared_params_transformer()
-        self._test_num_params(module)
+        if half:
+            module = module.half()
+        self._test_numel(module)
 
-    def test_output(self):
+    def _test_numel(self, module):
+        ref_numel = sum(p.numel() for p in module.parameters())
+        params_to_flatten = list(module.parameters())
+        flat_param_handle = FlatParamHandle(
+            params_to_flatten,
+            module,
+            torch.device("cuda"),
+            self._get_default_config(),
+            False,
+        )
+        self.assertEqual(ref_numel, flat_param_handle.flat_param.numel())
+
+    @skip_if_lt_x_gpu(1)
+    def test_output_without_shared_params(self):
+        """
+        Tests a forward pass after flattening when there are no shared
+        parameters in the module.
+        """
+        self.run_subtests(
+            {"half": [False, True]},
+            self._test_output_without_shared_params,
+        )
+
+    def _test_output_without_shared_params(self, half: bool):
         module = self._get_transformer()
+        if half:
+            module = module.half()
         self._test_output(module)
 
-    def test_shared_params_output(self):
+    @skip_if_lt_x_gpu(1)
+    def test_output_with_shared_params(self):
+        """
+        Tests a forward pass after flattening when there are shared parameters
+        in the module.
+        """
+        self.run_subtests(
+            {"half": [False, True]},
+            self._test_output_with_shared_params,
+        )
+
+    def _test_output_with_shared_params(self, half: bool):
         module = self._get_shared_params_transformer()
+        if half:
+            module = module.half()
         self._test_output(module)
 
-    def test_shared_params_pnorm_after_step(self):
-        # incorrect parameter sharing is likely to cause problems after an
-        # optimization step
-        module = self._get_shared_params_transformer()
-        ref_pnorm_after_step = self._get_pnorm_after_step(module)
+    def _test_output(self, module: nn.Module):
+        module = module.to(self.rank)
+        ref_output = self._get_output(module)
+        fsdp_module = FSDP(module)
+        fsdp_output = self._get_output(fsdp_module)
+        self.assertEqual(ref_output, fsdp_output)
 
-        module = self._get_shared_params_transformer()  # recreate
-        params_to_flatten = list(module.parameters())
-        flat_module = FlattenParamsWrapper(
-            module,
-            params_to_flatten,
-            torch.device("cuda"),
-            self._get_default_config(),
-            False,
+    def _get_output(self, module):
+        device = next(module.parameters()).device
+        dtype = next(module.parameters()).dtype
+        input = module.get_input(device, dtype)
+        return module(*input)
+
+    @skip_if_lt_x_gpu(1)
+    def test_pnorm_after_step_with_shared_params(self):
+        """
+        Tests for parameter Frobenius norm parity after an optimizer step when
+        there are shared parameters in the module. If the parameter sharing is
+        handled incorrectly, then an optimizer step should reveal that.
+        """
+        self.run_subtests(
+            {"half": [False, True]},
+            self._test_pnorm_after_step_with_shared_params,
         )
-        flat_pnorm_after_step = self._get_pnorm_after_step(flat_module)
 
-        self.assertEqual(ref_pnorm_after_step, flat_pnorm_after_step)
+    def _test_pnorm_after_step_with_shared_params(self, half: bool):
+        module = self._get_shared_params_transformer().to(self.rank)
+        if half:
+            module = module.half()
+        ref_pnorm_after_step = self._get_pnorm_after_step(module)
+        module = self._get_shared_params_transformer().to(self.rank)  # recreate
+        if half:
+            module = module.half()
+        fsdp_module = FSDP(module)
+        fsdp_pnorm_after_step = self._get_pnorm_after_step(fsdp_module)
+        self.assertEqual(ref_pnorm_after_step, fsdp_pnorm_after_step)
 
-    def test_sharded_flat_param(self):
+    def _get_pnorm_after_step(self, module):
+        optim = torch.optim.SGD(module.parameters(), lr=0.01)
+        loss = self._get_output(module).sum()
+        loss.backward()
+        optim.step()
+        return torch.norm(torch.stack([p.detach().norm() for p in module.parameters()]))
+
+    def test_flat_param_shard_metadata(self):
+        """
+        Tests that ``FlatParameter`` shard metadata are computed as expected.
+        """
         module = torch.nn.Sequential(
             torch.nn.Linear(10, 10, bias=False),
             torch.nn.ReLU(),
@@ -223,14 +315,13 @@ def test_sharded_flat_param(self):
             torch.nn.ReLU(),
         )
         params_to_flatten = list(module.parameters())
-        flat_module = FlattenParamsWrapper(
-            module,
+        flat_param_handle = FlatParamHandle(
             params_to_flatten,
+            module,
             torch.device("cuda"),
             self._get_default_config(),
             False,
         )
-        flat_param_handle = flat_module.handle
 
         def _test(kwargs, expected):
             """
@@ -244,9 +335,11 @@ def _test(kwargs, expected):
             ``init_shard_info()`` with the start and end indices fixed based on
             rank and world size.
             """
-            flat_param = flat_module.flat_param
-            flat_param._shard_param_offsets, flat_param._shard_indices = \
-                flat_param_handle._get_shard_metadata(kwargs["start"], kwargs["end"])
+            flat_param = flat_param_handle.flat_param
+            (
+                flat_param._shard_param_offsets,
+                flat_param._shard_indices,
+            ) = flat_param_handle._get_shard_metadata(kwargs["start"], kwargs["end"])
             self.assertEqual(
                 flat_param_handle.shard_metadata(),
                 expected,
@@ -345,19 +438,5 @@ def _test(kwargs, expected):
         )
 
 
-@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
-class TestFlattenParamsCUDA(TestFlattenParams):
-    def _get_transformer(self, seed=0):
-        module = super()._get_transformer(seed=seed)
-        return module.cuda()
-
-
-@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
-class TestFlattenParamsCUDAHalf(TestFlattenParams):
-    def _get_transformer(self, seed=0):
-        module = super()._get_transformer(seed=seed)
-        return module.cuda().half()
-
-
 if __name__ == "__main__":
     run_tests()

From 1ff953af97b015c78c822f2bbc09a641ae0fa2ab Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 21 Oct 2022 11:30:58 +0000
Subject: [PATCH 0012/1922] [FSDP][5/N] Update `FlatParamHandle` after FPW
 deprecation (#87113)

This PR resolves a TODO left in `FlatParamHandle` that was conditional on deprecating `FlattenParamsWrapper`. We simply pass in the process group into the `FlatParamHandle` constructor instead of later in `shard()`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87113
Approved by: https://github.com/zhaojuanmao
---
 .../fsdp/test_fsdp_flatten_params.py          |  3 ++
 torch/distributed/fsdp/flat_param.py          | 30 +++++++++++--------
 .../fsdp/fully_sharded_data_parallel.py       | 11 +++++--
 3 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_flatten_params.py b/test/distributed/fsdp/test_fsdp_flatten_params.py
index cfc2a494d4406..4f7178df4a109 100644
--- a/test/distributed/fsdp/test_fsdp_flatten_params.py
+++ b/test/distributed/fsdp/test_fsdp_flatten_params.py
@@ -149,6 +149,7 @@ def _test_flatten_nothing(self, half: bool):
                 module,
                 torch.device("cuda"),
                 self._get_default_config(),
+                self.process_group,
                 False,
             )
 
@@ -220,6 +221,7 @@ def _test_numel(self, module):
             module,
             torch.device("cuda"),
             self._get_default_config(),
+            self.process_group,
             False,
         )
         self.assertEqual(ref_numel, flat_param_handle.flat_param.numel())
@@ -320,6 +322,7 @@ def test_flat_param_shard_metadata(self):
             module,
             torch.device("cuda"),
             self._get_default_config(),
+            self.process_group,
             False,
         )
 
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 2c65dd80ea3c3..c96cd4a3f267a 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -296,11 +296,15 @@ def __init__(
         module: nn.Module,
         device: torch.device,
         config: HandleConfig,
+        process_group: dist.ProcessGroup,
         use_orig_params: bool,
     ):
         super().__init__()
         self.device = device
         self._config = config
+        self.process_group = process_group
+        self.rank = process_group.rank()
+        self.world_size = process_group.size()
         self._use_orig_params = use_orig_params
         self._training_state = HandleTrainingState.IDLE
         self._debug_level = dist.get_debug_level()
@@ -436,7 +440,7 @@ def flatten_params(
     # SHARD INITIALIZATION & METADATA #
     ###################################
     @torch.no_grad()
-    def shard(self, process_group: dist.ProcessGroup):
+    def shard(self):
         """
         Shards the handle's ``FlatParameter``. In terms of memory, this
         allocates new memory for the sharded flattened parameter and frees the
@@ -446,16 +450,8 @@ def shard(self, process_group: dist.ProcessGroup):
         Shard metadata attributes are set for all sharding strategies.
         ``process_group``, ``rank``, and ``world_size`` attributes are set if
         using a sharded strategy.
-
-        TODO (awgu): Once we retire ``FlattenParamsWrapper``, we should pass
-        the process group directly to the ``FlatParamHandle`` constructor. For
-        now, we decouple ``FlattenParamsWrapper` from a process group, but this
-        makes the process-group-related attributes not necessarily defined.
         """
         flat_param = self.flat_param
-        self.process_group = process_group
-        self.rank = process_group.rank()
-        self.world_size = process_group.size()
         if not self.uses_sharded_strategy:
             self._init_shard_metadata(0, 0, flat_param.numel() - 1)
         else:
@@ -863,7 +859,9 @@ def unshard_grad(self):
             sharded_grad = flat_param._saved_grad_shard  # type: ignore[attr-defined]
         dist.all_gather_into_tensor(padded_unsharded_grad, sharded_grad, self.process_group)
         unsharded_size = self.flat_param._unpadded_unsharded_size
-        flat_param.grad = padded_unsharded_grad[:unsharded_size.numel()].view(unsharded_size)
+        flat_param.grad = padded_unsharded_grad[: unsharded_size.numel()].view(
+            unsharded_size
+        )
         self._use_unsharded_grad_views()
 
     def reshard_grad(self):
@@ -913,7 +911,7 @@ def prepare_gradient_for_backward(self):
                 else:
                     p_assert(
                         hasattr(flat_param, "_cpu_grad"),
-                        "`_cpu_grad` should be defined if the gradient is on CPU"
+                        "`_cpu_grad` should be defined if the gradient is on CPU",
                     )
                     sharded_grad = flat_param._cpu_grad  # type: ignore[attr-defined]
                 # If user specified to keep the gradient in low precision, then
@@ -944,12 +942,15 @@ def prepare_gradient_for_optim(self):
         Prepares the gradient for optimizer computation by moving the sharded
         gradient to the ``.grad`` attribute.
         """
+
         def cast_grad_to_param_dtype_if_needed(flat_param):
             if self._config.keep_low_precision_grads:
                 assert flat_param.grad is not None  # mypy
                 # This cast is meaningful when `param_dtype` is a low precision
                 # dtype.
-                flat_param.grad.data = flat_param.grad.to(self._config.low_prec_param_dtype)
+                flat_param.grad.data = flat_param.grad.to(
+                    self._config.low_prec_param_dtype
+                )
 
         flat_param = self.flat_param
         # TODO (awgu): We should replace these conditional checks to encode
@@ -1517,7 +1518,10 @@ def is_sharded(self, tensor: Tensor) -> bool:
         Returns if ``tensor`` is *currently* sharded. For ``NO_SHARD``, we
         choose to have this always return ``False`` for clarity.
         """
-        if not hasattr(self.flat_param, "_sharded_size") or not self.uses_sharded_strategy:
+        if (
+            not hasattr(self.flat_param, "_sharded_size")
+            or not self.uses_sharded_strategy
+        ):
             # `_sharded_size` is defined iff `handle.shard()` has been called
             return False
         sharded_size = self.flat_param._sharded_size  # type: ignore[attr-defined]
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 2cf3af6d540c0..6648c606861c1 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1092,11 +1092,18 @@ def __init__(
         self.params: List[FlatParameter] = []
         self._fsdp_wrapped_module = module
         if params_to_flatten:
-            handle = FlatParamHandle(params_to_flatten, module, self.compute_device, config, use_orig_params)
+            handle = FlatParamHandle(
+                params_to_flatten,
+                module,
+                self.compute_device,
+                config,
+                self.process_group,
+                use_orig_params,
+            )
             self._handles.append(handle)
             self.params.append(handle.flat_param)
             self._register_param_handle(handle)
-            handle.shard(self.process_group)
+            handle.shard()
             if self.cpu_offload.offload_params and handle.flat_param.device != torch.device("cpu"):
                 handle.flat_param_to(torch.device("cpu"))
         if not use_orig_params:

From 66027fdf958794efefac5f93e7f781b11c06a593 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 21 Oct 2022 11:30:58 +0000
Subject: [PATCH 0013/1922] [FSDP][6/N] Remove FPW! (#87114)

This PR simply deletes `flatten_params_wrapper.py`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87114
Approved by: https://github.com/zhaojuanmao
---
 .../fsdp/flatten_params_wrapper.py            | 190 ------------------
 1 file changed, 190 deletions(-)
 delete mode 100644 torch/distributed/fsdp/flatten_params_wrapper.py

diff --git a/torch/distributed/fsdp/flatten_params_wrapper.py b/torch/distributed/fsdp/flatten_params_wrapper.py
deleted file mode 100644
index 2c72cdcf158c2..0000000000000
--- a/torch/distributed/fsdp/flatten_params_wrapper.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-# Copyright (c) Tongzhou Wang
-# Licensed under the MIT License.
-
-import contextlib
-from typing import Any, Dict, Generator, List, Optional
-
-import torch
-import torch.nn as nn
-from torch.distributed.utils import _replace_by_prefix
-
-from .flat_param import FlatParameter, FlatParamHandle, HandleConfig
-
-FLAT_PARAM = "flat_param"
-FPW_MODULE = "_fpw_module"
-
-__all__ = ["FlattenParamsWrapper"]
-
-
-def _post_state_dict_hook(
-    module: nn.Module, state_dict: Dict[str, Any], prefix: str, *args: Any
-) -> Dict[str, Any]:
-    """
-    _post_state_dict_hook() is called after the state_dict() is executed
-    and before returning the state_dict to the users.
-    This API post-processes the keys of the state_dict to remove the
-    FlattenParamsWrapper internal prefix.
-    """
-    # Move everything from FPW_MODULE up one level.
-    _replace_by_prefix(state_dict, prefix + f"{FPW_MODULE}.", prefix)
-    return state_dict
-
-
-def _pre_load_state_dict_hook(
-    state_dict: Dict[str, Any],
-    prefix: str,
-    *args: Any,
-) -> None:
-    """
-    _pre_load_state_dict_hook() is called before the _load_from_state_dict() is
-    executed. This API pre-processes the keys of the state_dict to add the
-    FlattenParamsWrapper internal prefix.
-    """
-    # Push everything down to FPW_MODULE level.
-    _replace_by_prefix(state_dict, prefix, prefix + f"{FPW_MODULE}.")
-    # The flat_param_* keys actually needs to move one level up.
-    flat_param_key = prefix + f"{FPW_MODULE}.{FLAT_PARAM}"
-    for k in list(state_dict.keys()):
-        if k.startswith(flat_param_key):
-            last_part = k.split(".")[-1]
-            assert last_part.startswith(
-                FLAT_PARAM
-            ), f"Expected key to contain flat_param, but key name is {k}"
-            _replace_by_prefix(state_dict, k, prefix + last_part)
-
-
-class FlattenParamsWrapper(nn.Module):
-    """
-    This is a wrapper for flattening parameters in a ``nn.Module`` 's subtree
-    into a single flattened parameter and is based on [1]. This is used for
-    :class:`FullyShardedDataParallel` 's recursive wrapping.
-    [1] https://github.com/SsnL/PyTorch-Reparam-Module
-
-    Args:
-        module (nn.Module): Module to wrap.
-        params (List[nn.Parameter]): Parameters in ``module`` 's subtree to
-            flatten into a single flattened parameter.
-        device (torch.device): The compute and communication device for this
-            wrapper's handle.
-        config (HandleConfig): A config customizing this wrapper's handle based
-            on FSDP's available features.
-
-    Attributes:
-        flat_param (Optional[FlatParameter]): The flattened parameter.
-            ``flat_param`` is ``None`` either when (1) this wrapper manages no
-            parameters or (2) the wrapped module's parameters are unflattened.
-        _fpw_module (nn.Module): The wrapped module.
-        _flat_param_handle (FlatParamHandle): A handle for the flattened
-            parameter; only present if this wrapper manages parameters.
-    """
-
-    def __init__(
-        self,
-        module: nn.Module,
-        params: List[nn.Parameter],
-        device: torch.device,
-        config: HandleConfig,
-        use_orig_params: bool,
-    ) -> None:
-        super().__init__()
-        self._fpw_module = module
-        # Register hooks to clean parameter names for state dict (even if this
-        # wrapper itself manages no parameters since it must clean names from
-        # submodules)
-        self._register_state_dict_hook(_post_state_dict_hook)
-        self._register_load_state_dict_pre_hook(_pre_load_state_dict_hook)
-        if len(params) == 0:
-            return
-        self._flat_param_handle = FlatParamHandle(
-            params, module, device, config, use_orig_params
-        )
-        if not use_orig_params:
-            self._register_flat_param()
-        self._use_orig_params = use_orig_params
-        assert getattr(self, FPW_MODULE) is self._fpw_module
-        assert getattr(self, FLAT_PARAM) is self.flat_param
-
-    @property
-    def has_params(self) -> bool:
-        """Returns whether this wrapper manages any parameters."""
-        return hasattr(self, "_flat_param_handle")
-
-    @property
-    def flat_param(self) -> Optional[FlatParameter]:
-        return self.handle.flat_param if self.has_params else None
-
-    @property
-    def handle(self) -> FlatParamHandle:
-        assert hasattr(self, "_flat_param_handle"), (
-            "Accessing the handle of a `FlattenParamsWrapper` that does not "
-            "manage any parameters"
-        )
-        return self._flat_param_handle
-
-    @property
-    def module(self) -> Any:
-        """Returns the wrapped module (like DDP)."""
-        return self._fpw_module
-
-    @contextlib.contextmanager
-    def unflatten_as_params(self) -> Generator:
-        """
-        Assumes that the flattened parameter is unsharded. When in the context,
-        de-registers the flattened parameter and unflattens the original
-        parameters as ``nn.Parameter`` views into the flattened parameter.
-        After the context, re-registers the flattened parameter and restores
-        the original parameters as ``Tensor`` views into the flattened
-        parameter.
-        """
-        if self.flat_param is None:
-            yield
-        else:
-            self._deregister_flat_param()
-            try:
-                with self._flat_param_handle.unflatten_as_params():
-                    yield
-            finally:
-                if not self.handle._use_orig_params:
-                    self._register_flat_param()
-
-    def _register_flat_param(self):
-        """
-        Registers the flattened parameter, making it visible to ``nn.Module``
-        methods.
-
-        We do not use :meth:`nn.Module.register_parameter` because we want
-        ``flat_param`` to always be an attribute but dynamically change whether
-        it is visible to ``nn.Module`` methods.
-        """
-        self._parameters["flat_param"] = self.flat_param
-
-    def _deregister_flat_param(self):
-        """
-        De-registers the flattened parameter, hiding it from ``nn.Module``
-        methods.
-
-        We do not use ``del self.flat_param`` because we want ``flat_param`` to
-        always be an attribute but dynamically change whether it is visible to
-        ``nn.Module`` methods.
-        """
-        self._parameters.pop("flat_param", None)
-
-    def __getattr__(self, name: str) -> Any:
-        """Forward missing attributes of this wrapper to the wrapped module."""
-        try:
-            return super().__getattr__(name)  # defer to `nn.Module`'s logic
-        except AttributeError:
-            return getattr(self.module, name)  # fall back to the wrapped module
-
-    def __getitem__(self, key: int) -> Any:
-        """Forward indexing calls to the wrapped module in case the wrapped
-        module is an ``nn.Sequential``."""
-        return self.module.__getitem__(key)
-
-    def forward(self, *inputs: Any, **kwinputs: Any) -> Any:
-        return self.module(*inputs, **kwinputs)

From 471f6c6d62d092d24b90b6551a7e04e8c134908f Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@meta.com>
Date: Fri, 21 Oct 2022 16:57:33 +0000
Subject: [PATCH 0014/1922] [quant][api] Add assert for backend in
 get_default_qconfig related apis (#86259) (#87331)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86259

Add assertion to make sure backend is one of "fbgemm", "x86", "qnnpack" and "onednn"
for get_default_qconfig, get_default_qat_qconfig, get_default_qconfig_mapping and get_default_qat_qconfig_mapping

Test Plan:
python test/test_quantization.py -k test_get_default_qconfig_mapping

Imported from OSS

Reviewed By: jcaip

Differential Revision: D40236474

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87331
Approved by: https://github.com/andrewor14
---
 .../eager/test_quantize_eager_qat.py          |  1 +
 test/quantization/fx/test_quantize_fx.py      | 14 ++++++++++++++
 test/quantization/jit/test_quantize_jit.py    |  5 +++++
 torch/ao/quantization/qconfig.py              | 19 +++++++++++++++++--
 torch/ao/quantization/qconfig_mapping.py      |  8 ++++----
 5 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/test/quantization/eager/test_quantize_eager_qat.py b/test/quantization/eager/test_quantize_eager_qat.py
index bc118a82062d9..44911b6d9e11a 100644
--- a/test/quantization/eager/test_quantize_eager_qat.py
+++ b/test/quantization/eager/test_quantize_eager_qat.py
@@ -594,6 +594,7 @@ def forward(self, x):
         eps = 1e-5
         self.assertTrue(torch.abs(mq.quant.scale * 2 - res.q_scale()) < eps)
 
+    @override_qengines
     def test_qat_embedding_bag_errors(self):
         default_qat_qconfig = get_default_qat_qconfig(torch.backends.quantized.engine)
 
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 6935081a5c923..2746b1c9a0173 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -5223,6 +5223,20 @@ def forward(self, x):
         # make sure this runs
         m = prepare_fx(m, qconfig_mapping, example_inputs, backend_config=backend_config)
 
+    def test_get_default_qconfig_valid_backend(self):
+        """ Checks that AssertionError is raised when non expected backend input is specified
+        """
+        invalid_backends = ["imaginary_backend", 3]
+        for invalid_backend in invalid_backends:
+            with self.assertRaisesRegex(AssertionError, "not supported"):
+                qconfig = get_default_qconfig(invalid_backend)
+            with self.assertRaisesRegex(AssertionError, "not supported"):
+                qconfig = get_default_qat_qconfig(invalid_backend)
+            with self.assertRaisesRegex(AssertionError, "not supported"):
+                qconfig_mapping = get_default_qconfig_mapping(invalid_backend)
+            with self.assertRaisesRegex(AssertionError, "not supported"):
+                qconfig_mapping = get_default_qat_qconfig_mapping(invalid_backend)
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     def setUp(self):
diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py
index 84ab3a723b70f..49152a1097ac2 100644
--- a/test/quantization/jit/test_quantize_jit.py
+++ b/test/quantization/jit/test_quantize_jit.py
@@ -2674,6 +2674,7 @@ def forward(self, x):
                     m.graph
                 )
 
+    @override_qengines
     def test_hardswish(self):
         class FunctionalHardswish(torch.nn.Module):
             def __init__(self, inplace):
@@ -2698,6 +2699,7 @@ def forward(self, input):
                 m.graph
             )
 
+    @override_qengines
     def test_elu(self):
         class FunctionalELU(torch.nn.Module):
             def __init__(self, inplace=False):
@@ -2714,6 +2716,7 @@ def forward(self, input):
             m = self.checkGraphModeOp(m, self.img_data_2d, "quantized::elu", tracing)
             FileCheck().check_not("aten::elu").check_not("aten::elu_").run(m.graph)
 
+    @override_qengines
     def test_layer_norm(self):
         data = [[torch.rand((1, 2, 5, 5), dtype=torch.float)] for _ in range(2)]
         layer_norm = torch.nn.LayerNorm([2, 5, 5])
@@ -2723,6 +2726,7 @@ def test_layer_norm(self):
             )
             FileCheck().check_not("aten::layer_norm").run(m.graph)
 
+    @override_qengines
     def test_group_norm(self):
         data = [[torch.rand((1, 4, 5, 5), dtype=torch.float)] for _ in range(2)]
         group_norm = torch.nn.GroupNorm(2, 4)
@@ -2732,6 +2736,7 @@ def test_group_norm(self):
             )
             FileCheck().check_not("aten::group_norm").run(m.graph)
 
+    @override_qengines
     def test_instance_norm(self):
         data_1d = [[torch.rand((1, 4, 5), dtype=torch.float)] for _ in range(2)]
         data_2d = [[torch.rand((1, 4, 5, 1), dtype=torch.float)] for _ in range(2)]
diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py
index 8e662e5745ce6..d1eb0a64a125d 100644
--- a/torch/ao/quantization/qconfig.py
+++ b/torch/ao/quantization/qconfig.py
@@ -228,12 +228,19 @@ def get_default_qconfig(backend='fbgemm', version=0):
     Returns the default PTQ qconfig for the specified backend.
 
     Args:
-      * `backend`: a string representing the target backend. Currently supports
+      * `backend` (str): a string representing the target backend. Currently supports
         `x86`, `fbgemm` (default), `qnnpack` and `onednn`.
 
     Return:
         qconfig
     """
+    supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"]
+    if backend not in supported_backends:
+        raise AssertionError(
+            "backend: " + str(backend) +
+            " not supported. backend must be one of {}".format(supported_backends)
+        )
+
     if version == 0:
         if backend == 'fbgemm':
             qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True),
@@ -249,6 +256,7 @@ def get_default_qconfig(backend='fbgemm', version=0):
             qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True),
                               weight=default_per_channel_weight_observer)
         else:
+            # won't reach
             qconfig = default_qconfig
     else:
         raise AssertionError("Version number: " + str(version) +
@@ -303,13 +311,20 @@ def get_default_qat_qconfig(backend='fbgemm', version=1):
     Returns the default QAT qconfig for the specified backend.
 
     Args:
-      * `backend`: a string representing the target backend. Currently supports
+      * `backend` (str): a string representing the target backend. Currently supports
         `x86`, `fbgemm` (default), `qnnpack` and `onednn`.
       * `version`: version, for backwards compatibility. Can be `None` or `1`.
 
     Return:
         qconfig
     """
+    supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"]
+    if backend not in supported_backends:
+        raise AssertionError(
+            "backend: " + str(backend) +
+            " not supported. backend must be one of {}".format(supported_backends)
+        )
+
     # Histogram observer is too slow for quantization aware training
     if version == 0:
         if backend == 'fbgemm':
diff --git a/torch/ao/quantization/qconfig_mapping.py b/torch/ao/quantization/qconfig_mapping.py
index 4dc4431aa99d1..418cbb334814c 100644
--- a/torch/ao/quantization/qconfig_mapping.py
+++ b/torch/ao/quantization/qconfig_mapping.py
@@ -126,9 +126,9 @@ def get_default_qconfig_mapping(backend="fbgemm", version=0) -> QConfigMapping:
     Return the default QConfigMapping for post training quantization.
 
     Args:
-      * ``backend`` : the quantization backend for the default qconfig mapping, should be
+      * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be
          one of ["x86", "fbgemm" (default), "qnnpack", "onednn"]
-      * ``version`` : the version for the default qconfig mapping
+      * ``version`` (int) : the version for the default qconfig mapping
     """
     # TODO: add assert for backend choices
     return _get_default_qconfig_mapping(False, backend, version)
@@ -138,9 +138,9 @@ def get_default_qat_qconfig_mapping(backend="fbgemm", version=1) -> QConfigMappi
     Return the default QConfigMapping for quantization aware training.
 
     Args:
-      * ``backend`` : the quantization backend for the default qconfig mapping, should be
+      * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be
          one of ["x86", "fbgemm" (default), "qnnpack", "onednn"]
-      * ``version`` : the version for the default qconfig mapping
+      * ``version`` (int) : the version for the default qconfig mapping
     """
     return _get_default_qconfig_mapping(True, backend, version)
 

From 84ae92c52db4b859de531c70e9e65dab071c6501 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 21 Oct 2022 11:35:30 +0000
Subject: [PATCH 0015/1922] [FSDP][1/N] Update `summon_full_params(with_grads)`
 `None` gradient (#87314)

This PR changes `summon_full_params(with_grads=True)`'s behavior to be such that if all ranks have `flat_param.grad = None`, then the original parameters will correctly have `orig_param.grad = None`. This is achieved with a preliminary all-reduce. Note that if a particular original parameter's gradient is `None` on all of the containing ranks, but not all ranks' `flat_param.grad = None`, then that particular gradient is still going to be set to zeros. This can be handled if desired in follow-up work.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87314
Approved by: https://github.com/zhaojuanmao
---
 .../fsdp/test_fsdp_summon_full_params.py      | 42 +++++++++++++++++--
 torch/distributed/fsdp/flat_param.py          | 30 ++++++++++++-
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py
index d78aa81a19d7a..82fd8e1c0737b 100644
--- a/test/distributed/fsdp/test_fsdp_summon_full_params.py
+++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py
@@ -574,7 +574,8 @@ def test_named_parameters_buffers(self, prefix: str, recurse: bool):
                     self.assertEqual(p1, p2)
 
     @skip_if_lt_x_gpu(2)
-    def test_with_grads(self):
+    def test_with_grads_core(self):
+        """Tests the core usage of ``summon_full_params(with_grads=True)``."""
         self.run_subtests(
             {
                 "writeback": [False, True],
@@ -586,10 +587,10 @@ def test_with_grads(self):
                 ],
                 "use_orig_params": [True],
             },
-            self._test_with_grads,
+            self._test_with_grads_core,
         )
 
-    def _test_with_grads(
+    def _test_with_grads_core(
         self,
         writeback: bool,
         offload_to_cpu: bool,
@@ -691,6 +692,41 @@ def _get_fsdp_grads(fsdp_model: FSDP, is_supported: bool):
         with _get_error_context(is_supported):
             _check_grads(ddp_model, fsdp_model, old_fsdp_grads)
 
+    @skip_if_lt_x_gpu(2)
+    def test_with_grads_none_grads(self):
+        """
+        Tests that if all ranks' ``FlatParameter`` has ``None`` gradient, then
+        each original parameter sees ``None`` gradient as well.
+        """
+        self.run_subtests(
+            {
+                "sharding_strategy": [
+                    ShardingStrategy.FULL_SHARD,
+                    ShardingStrategy.SHARD_GRAD_OP,
+                    ShardingStrategy.NO_SHARD,
+                ]
+            },
+            self._test_with_grads_none_grads
+        )
+
+    def _test_with_grads_none_grads(self, sharding_strategy: ShardingStrategy):
+        fsdp_model = TransformerWithSharedParams.init(
+            self.process_group,
+            FSDPInitMode.RECURSIVE,
+            CUDAInitMode.CUDA_BEFORE,
+            deterministic=True,
+            fsdp_kwargs={
+                "use_orig_params": True,
+                "sharding_strategy": sharding_strategy,
+            },
+        )
+        for fsdp_module in FSDP.fsdp_modules(fsdp_model):
+            for handle in fsdp_module._handles:
+                assert handle.flat_param.grad is None
+        with FSDP.summon_full_params(fsdp_model, with_grads=True):
+            for param in fsdp_model.parameters():
+                self.assertTrue(param.grad is None)
+
 
 instantiate_parametrized_tests(TestSummonFullParams)
 instantiate_parametrized_tests(TestSummonFullParamsNoShard)
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index c96cd4a3f267a..bb54e7c0e9613 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -841,18 +841,46 @@ def _free_low_precision_sharded_param(self):
 
     @torch.no_grad()
     def unshard_grad(self):
+        """
+        Unshards the handle's ``FlatParameter`` 's gradient. If all ranks have
+        ``None`` gradient, then all original parameters will as well. This
+        method performs an all-reduce and an all-gather. The additional
+        all-reduce is tolerable since this method is not meant to be used on
+        the computation critical path.
+
+        Postcondition: ``_saved_grad_shard`` is defined and contains the value
+        to set ``flat_param.grad`` after gradients are resharded.
+        """
         if not self.uses_sharded_strategy:
             self._use_unsharded_grad_views()
             return
         flat_param = self.flat_param
         self._check_unsharded(flat_param)
+
+        # Check if all ranks have a `None` gradient
+        num_grad_none = torch.zeros(1, dtype=torch.int32, device=self.device)
+        num_grad_none[0] = flat_param.grad is None
+        dist.all_reduce(num_grad_none, group=self.process_group)
+        if num_grad_none[0] == self.world_size:
+            flat_param._saved_grad_shard = None  # type: ignore[attr-defined]
+            self._use_unsharded_grad_views()
+            return
+
         padded_unsharded_grad = torch.empty(
             flat_param._padded_unsharded_size,  # type: ignore[attr-defined]
             device=self.device,
         )
         if flat_param.grad is None:
+            # In the case that only some ranks have `None` gradient, we use
+            # zeros to approximate as a best effort attempt
+            if self._debug_level == dist.DebugLevel.DETAIL:
+                warnings.warn(
+                    f"[Rank {self.rank}] Only some but not all ranks have a "
+                    "`None` `FlatParameter` gradient, so FSDP is using zeros to "
+                    "approximate those ranks' sharded gradients being `None`"
+                )
             flat_param._saved_grad_shard = None  # type: ignore[attr-defined]
-            sharded_grad = torch.zeros_like(flat_param)  # type: ignore[attr-defined]
+            sharded_grad = torch.zeros(flat_param._sharded_size, device=self.device)  # type: ignore[attr-defined]
         else:
             self._check_sharded(flat_param.grad)
             flat_param._saved_grad_shard = flat_param.grad  # type: ignore[attr-defined]

From 01449d66e927a13c4914873c3f161f0d386f8d9d Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 21 Oct 2022 11:35:30 +0000
Subject: [PATCH 0016/1922] [FSDP][2/N] Fix grad zero vs. `None` edge case
 (#87308)

Some original parameters corresponding to one `FlatParameter` may have `None` gradient while others do not. In that case, the `flat_param.grad` must be non-`None`. However, FSDP should take care to expose the original parameters' gradients regardless. To achieve this, we track a `_is_grad_none` mask over the parameters' gradients.
- `_is_grad_none` is initialized to `False` for all.
- `_is_grad_none[i]` is set to `True` when writing zeros in place of `None` when writing back the `i`th gradient.
- `_is_grad_none[i]` is set to `False` via `_reset_is_grad_none()`, which should be called in the post-backward. See the docstring for details.
- `_is_grad_none[i]` must be `False` in order to set `param.grad` to be a view into `flat_param.grad`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87308
Approved by: https://github.com/zhaojuanmao
---
 .../fsdp/test_fsdp_use_orig_params.py         | 211 ++++++++++++++----
 torch/distributed/fsdp/flat_param.py          | 124 ++++++++--
 .../fsdp/fully_sharded_data_parallel.py       |   5 +
 3 files changed, 283 insertions(+), 57 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py
index 69b0645a3fa34..1091200206135 100644
--- a/test/distributed/fsdp/test_fsdp_use_orig_params.py
+++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py
@@ -1,20 +1,23 @@
 # Owner(s): ["oncall: distributed"]
 
 import functools
+import itertools
 import sys
-from typing import Callable, Optional, Tuple, Type
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 
 import torch
 import torch.nn as nn
 from torch import distributed as dist
-from torch.distributed.fsdp import (
-    BackwardPrefetch,
-    CPUOffload,
-    FullyShardedDataParallel as FSDP,
-    ShardingStrategy,
+from torch.distributed.fsdp import BackwardPrefetch, CPUOffload
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import ShardingStrategy
+from torch.distributed.fsdp.fully_sharded_data_parallel import (
+    clean_tensor_name,
+)
+from torch.distributed.fsdp.wrap import (
+    always_wrap_policy,
+    transformer_auto_wrap_policy,
 )
-from torch.distributed.fsdp.fully_sharded_data_parallel import clean_tensor_name
-from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
@@ -25,10 +28,10 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
+    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
-    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -46,16 +49,14 @@
 class TestFSDPUseOrigParamsMultipleParamGroups(FSDPTest):
     """Tests multiple parameter groups."""
 
-    def _get_optim(
-        self,
-        model: nn.Module,
-        optim_class: Type[torch.optim.Optimizer],
-        multi_tensor: bool,
-    ) -> torch.optim.Optimizer:
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    def _get_param_groups(self, model: nn.Module) -> List[Dict[str, Any]]:
         """
-        Constructs an Adam optimizer with three parameter groups, one for
-        weights, one for biases, and one for everything else, each with
-        different weight decay and learning rates.
+        Constructs separate parameter groups for weights, biases, and other
+        parameters.
         """
         param_groups = [
             {"params": [], "weight_decay": 0.1, "lr": 1e-2},
@@ -69,18 +70,24 @@ def _get_optim(
                 param_groups[1]["params"].append(param)
             else:
                 param_groups[2]["params"].append(param)
-        return optim_class(param_groups, lr=5e-3, foreach=multi_tensor)
+        return param_groups
 
-    def _get_ddp_transformer_and_optim(
+    def _get_optim(
         self,
+        model: nn.Module,
         optim_class: Type[torch.optim.Optimizer],
         multi_tensor: bool,
-        find_unused_params: bool,
-    ) -> Tuple[DDP, torch.optim.Optimizer]:
+    ) -> torch.optim.Optimizer:
         """
-        Returns a transformer with shared parameters wrapped with DDP and a
-        corresponding optimizer.
+        Constructs an Adam optimizer with three parameter groups, one for
+        weights, one for biases, and one for everything else, each with
+        different weight decay and learning rates.
         """
+        param_groups = self._get_param_groups(model)
+        return optim_class(param_groups, lr=5e-3, foreach=multi_tensor)
+
+    def _get_ddp_transformer(self, find_unused_params: bool) -> DDP:
+        """Returns a transformer with shared parameters wrapped with DDP."""
         model = TransformerWithSharedParams.init(
             self.process_group,
             FSDPInitMode.NO_FSDP,
@@ -92,8 +99,7 @@ def _get_ddp_transformer_and_optim(
             device_ids=[self.rank],
             find_unused_parameters=find_unused_params,
         )
-        ddp_optim = self._get_optim(ddp_model, optim_class, multi_tensor)
-        return ddp_model, ddp_optim
+        return ddp_model
 
     def _get_fsdp_transformer_and_optim(
         self,
@@ -174,11 +180,17 @@ def _check_train_parity(
                     model.to(torch.device("cpu"))
                 optim.step()
                 if model is ddp_model and fsdp_model.cpu_offload.offload_params:
-                    model.to(torch.device("cuda"))
+                    model.to(device)
             torch.testing.assert_close(iter_losses[0], iter_losses[1])
             iter_losses.clear()
+        self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model)
+
+    def _check_ddp_fsdp_param_parity(self, ddp_model: DDP, fsdp_model: FSDP):
         with FSDP.summon_full_params(fsdp_model):
-            for p1, p2 in zip(ddp_model.parameters(), fsdp_model.parameters()):
+            for (n1, p1), (n2, p2) in zip(
+                ddp_model.module.named_parameters(), fsdp_model.named_parameters()
+            ):
+                self.assertEqual(n1, n2)
                 torch.testing.assert_close(p1, p2)
 
     def _get_sharding_strategy_from_str(
@@ -271,11 +283,8 @@ def _test_diff_hyperparams(
         """
         if cuda_init_mode == CUDAInitMode.CUDA_AFTER and cpu_offload.offload_params:
             return  # not supported
-        ddp_model, ddp_optim = self._get_ddp_transformer_and_optim(
-            optim_class=optim_class,
-            multi_tensor=multi_tensor,
-            find_unused_params=False,
-        )
+        ddp_model = self._get_ddp_transformer(find_unused_params=False)
+        ddp_optim = self._get_optim(ddp_model, optim_class, multi_tensor)
         fsdp_model, fsdp_optim = self._get_fsdp_transformer_and_optim(
             cuda_init_mode=cuda_init_mode,
             init_optim_before_wrap=init_optim_before_wrap,
@@ -313,11 +322,8 @@ def _test_diff_trainability(
         sharding_strategy: ShardingStrategy,
     ):
         optim_class = torch.optim.Adam
-        ddp_model, ddp_optim = self._get_ddp_transformer_and_optim(
-            optim_class=optim_class,
-            multi_tensor=multi_tensor,
-            find_unused_params=True,
-        )
+        ddp_model = self._get_ddp_transformer(find_unused_params=True)
+        ddp_optim = self._get_optim(ddp_model, optim_class, multi_tensor)
         fsdp_model, fsdp_optim = self._get_fsdp_transformer_and_optim(
             cuda_init_mode=CUDAInitMode.CUDA_BEFORE,
             init_optim_before_wrap=False,
@@ -336,10 +342,139 @@ def _test_diff_trainability(
                 param.requires_grad_(False)
         self._check_train_parity(ddp_model, ddp_optim, fsdp_model, fsdp_optim, False)
 
+    @skip_if_lt_x_gpu(2)
+    def test_multiple_optimizers(self):
+        """
+        Tests using two optimizers where only one sets gradients to ``None``.
+        """
+        self.run_subtests(
+            {
+                "sharding_strategy": [
+                    ShardingStrategy.FULL_SHARD,
+                    # ShardingStrategy.SHARD_GRAD_OP,
+                ]
+            },
+            self._test_multiple_optimizers,
+        )
+
+    def _test_multiple_optimizers(self, sharding_strategy: ShardingStrategy):
+        ddp_model = self._get_ddp_transformer(find_unused_params=True)
+        ddp_param_groups = self._get_param_groups(ddp_model)
+        assert len(ddp_param_groups) == 3, f"{len(ddp_param_groups)}"
+        fsdp_model, _ = self._get_fsdp_transformer_and_optim(  # ignore returned optimizer
+            cuda_init_mode=CUDAInitMode.CUDA_BEFORE,
+            init_optim_before_wrap=False,
+            optim_class=torch.optim.Adam,  # ignored
+            multi_tensor=False,            # ignored
+            sharding_strategy=sharding_strategy,
+            backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
+            cpu_offload=None,
+        )
+        fsdp_param_groups = self._get_param_groups(fsdp_model)
+        assert len(fsdp_param_groups) == 3, f"{len(fsdp_param_groups)}"
+        ddp_optims = []
+        fsdp_optims = []
+        # For the transformer model, every parameter is either a weight or a
+        # bias, so we only use the first two parameter groups. Moreover, we use
+        # Adam and AdamW in particular since they both use bias correction
+        # dependent on the step, which is incremented even if a parameter has a
+        # zero gradient but not if the gradient is `None`. This is to test that
+        # we are differentiating between a zero and `None` gradient correctly.
+        optim_ctors = [
+            functools.partial(torch.optim.Adam, lr=5e-3),
+            functools.partial(torch.optim.AdamW, lr=1e-2),
+        ]
+
+        for optim_ctor, ddp_param_group, fsdp_param_group in zip(
+            optim_ctors, ddp_param_groups[:2], fsdp_param_groups[:2],
+        ):
+            ddp_optims.append(optim_ctor(ddp_param_group["params"]))
+            fsdp_optims.append(optim_ctor(fsdp_param_group["params"]))
+        device = torch.device("cuda")
+
+        # Check that there exists a `FlatParameter` that has both a weight and
+        # a bias in this rank's shard
+        has_both = False
+        for fsdp_module in FSDP.fsdp_modules(fsdp_model):
+            for handle in fsdp_module._handles:
+                flat_param = handle.flat_param
+                assert flat_param._params is not None
+                has_weight = False
+                has_bias = False
+                for param, fqn in zip(flat_param._params, flat_param._fqns):
+                    if "weight" in fqn and param.numel() > 0:
+                        has_weight = True
+                    elif "bias" in fqn and param.numel() > 0:
+                        has_bias = True
+                has_both |= (has_weight and has_bias)
+        assert has_both, (
+            f"Rank {self.rank} does not have a `FlatParameter` with both a "
+            "weight and a bias in its shard, meaning that this test is vacuous"
+        )
+
+        # Run one iteration to generate gradients
+        def run_iter():
+            iter_losses = []
+            for model, optims in ((ddp_model, ddp_optims), (fsdp_model, fsdp_optims)):
+                module = model.module
+                inp = module.get_input(device)
+                output = model(*inp)
+                loss = module.get_loss(inp, output).to(device)
+                iter_losses.append(loss)
+                module.run_backward(loss)
+                for optim in optims:
+                    optim.step()
+            torch.testing.assert_close(iter_losses[0], iter_losses[1])
+            iter_losses.clear()
+            self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model)
+
+        run_iter()
+
+        # Only set the weights' gradients to None
+        ddp_optims[0].zero_grad(set_to_none=True)
+        fsdp_optims[0].zero_grad(set_to_none=True)
+        inp = ddp_model.module.get_input(device)
+        ddp_output = ddp_model(*inp)
+        fsdp_output = fsdp_model(*inp)
+
+        # Check that FSDP correctly exposes gradients even after forward
+        # (namely, `None` for weights and non-`None` for biases)
+        for (ddp_n, ddp_p), (fsdp_n, fsdp_p) in zip(
+            ddp_model.module.named_parameters(), fsdp_model.named_parameters(),
+        ):
+            self.assertEqual(ddp_n, fsdp_n)
+            if fsdp_p.numel() == 0:
+                # Not in this rank's shard
+                self.assertTrue(fsdp_p.grad is None)
+                continue
+            if ddp_p.grad is None:
+                self.assertTrue(fsdp_p.grad is None)
+            else:
+                self.assertEqual(ddp_p.flatten(), fsdp_p.flatten())
+                self.assertEqual(ddp_p.grad.flatten(), fsdp_p.grad.flatten())
+        self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model)
+
+        # Finish the iteration (backward pass and optimizer step)
+        ddp_loss = ddp_model.module.get_loss(inp, ddp_output).to(device)
+        fsdp_loss = fsdp_model.module.get_loss(inp, fsdp_output).to(device)
+        ddp_model.module.run_backward(ddp_loss)
+        fsdp_model.module.run_backward(fsdp_loss)
+        for optim in itertools.chain(ddp_optims, fsdp_optims):
+            optim.step()
+        self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model)
+
+        # Run one more iteration to confirm bias corrections are correct
+        run_iter()
+        self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model)
+
 
 class TestFSDPUseOrigParamsUnshardReshard(FSDPTest):
     """Tests the unshard/reshard flow."""
 
+    @property
+    def world_size(self) -> int:
+        return 2
+
     def _get_fsdp_models_and_optims(
         self,
         sharding_strategy: ShardingStrategy,
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index bb54e7c0e9613..1e34510bd0225 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -206,6 +206,20 @@ class FlatParameter(nn.Parameter):
             This is only defined when offloading parameters is enabled.
         _saved_grad_shard (Tensor): Sharded gradient with padding from previous
             iterations for gradient accumulation without :meth:`no_sync`.
+
+        _params (Optional[List[nn.Parameter]]): The original parameter
+            variables if ``use_orig_params=True`` and ``None`` otherwise.
+        _shared_params (Optional[List[nn.Parameter]]): The original shared
+            parameter variables if ``use_orig_params=True`` and ``None``
+            otherwise.
+        _is_grad_none (Optional[List[bool]]): A mask over the original
+            parameters' gradients indicating if it is logically ``None`` or not
+            if ``use_orig_params=True`` and ``None`` otherwise. This is needed
+            because only some of the parameters may have ``None`` gradient, in
+            which case the ``FlatParameter`` gradient must be non-``None`` and
+            must use zeros to approximate those original ``None`` gradients.
+            This mask informs FSDP to set the original parameter gradients to
+            ``None`` (instead of zeros) as needed.
     """
 
     def _init_metadata(
@@ -256,9 +270,13 @@ def _init_metadata(
             # another `FlatParameter` during recursive construction
             for param in chain(self._params, self._shared_params):
                 _set_fsdp_flattened(param)
+            self._is_grad_none: Optional[List[bool]] = [
+                False for _ in range(len(params))
+            ]
         else:
             self._params = None
             self._shared_params = None
+            self._is_grad_none = None
         self._unpadded_unsharded_size = self.size()
         _set_fsdp_flattened(self)
 
@@ -1115,6 +1133,11 @@ def _use_sharded_flat_param(self) -> None:
         flat_param.data = flat_param._local_shard  # type: ignore[attr-defined]
         if self._use_orig_params:
             self._use_sharded_views()
+            # For the post-forward reshard, we may try to use sharded gradient
+            # views, but for the post-backward reshard, we delay the call to
+            # after the reduce-scatter
+            if self._training_state == HandleTrainingState.FORWARD:
+                self._use_sharded_grad_views()
 
     #########
     # VIEWS #
@@ -1215,6 +1238,13 @@ def _use_unsharded_grad_views(self) -> None:
         """
         # Expects the gradient to be in `flat_param.grad`
         if self.flat_param.grad is None:
+            assert self.flat_param._params is not None  # mypy
+            assert self.flat_param._shared_params is not None  # mypy
+            for param in chain(
+                self.flat_param._params,  # type: ignore[attr-defined]
+                self.flat_param._shared_params,  # type: ignore[attr-defined]
+            ):
+                param.grad = None
             return
         self._check_unsharded(self.flat_param.grad)
         views = self._get_unflat_views(self.flat_param, self.flat_param.grad)
@@ -1329,21 +1359,26 @@ def _use_sharded_grad_views(self) -> None:
         self._check_sharded(flat_param)
         grad = self.sharded_grad
         if grad is None:
-            return  # no-op
+            assert flat_param._params is not None  # mypy
+            assert flat_param._shared_params is not None  # mypy
+            for param in chain(flat_param._params, flat_param._shared_params):  # type: ignore[attr-defined]
+                param.grad = None
+            return
         self._check_sharded(grad)
-        start, end = self.flat_param._shard_indices  # type: ignore[attr-defined]
+        start, end = flat_param._shard_indices  # type: ignore[attr-defined]
         offset = 0
-        assert self.flat_param._params is not None
-        for i, param in enumerate(self.flat_param._params):
+        assert flat_param._params is not None
+        for i, param in enumerate(flat_param._params):
             in_sharded_flat_param = (
                 i >= start
                 and i <= end
-                and self.flat_param._shard_param_offsets  # type: ignore[attr-defined]
+                and flat_param._shard_param_offsets  # type: ignore[attr-defined]
             )
             if in_sharded_flat_param:
-                param_start, param_end = self.flat_param._shard_param_offsets[i - start]  # type: ignore[attr-defined]
+                param_start, param_end = flat_param._shard_param_offsets[i - start]  # type: ignore[attr-defined]
                 numel_in_shard = param_end - param_start + 1
-                if param.requires_grad:
+                assert flat_param._is_grad_none is not None  # mypy
+                if param.requires_grad and not flat_param._is_grad_none[i]:
                     param.grad = grad[offset : offset + numel_in_shard].reshape(
                         param.shape
                     )
@@ -1352,9 +1387,9 @@ def _use_sharded_grad_views(self) -> None:
                 offset += numel_in_shard
             else:
                 param.grad = None
-        assert self.flat_param._shared_params is not None
+        assert flat_param._shared_params is not None
         for i, (param, (_, _, _, prim_param_name, prim_module, _)) in enumerate(
-            zip(self.flat_param._shared_params, self.flat_param._shared_param_infos)
+            zip(flat_param._shared_params, flat_param._shared_param_infos)
         ):
             in_sharded_flat_param = hasattr(prim_module, prim_param_name)
             if in_sharded_flat_param and param.requires_grad:
@@ -1412,7 +1447,9 @@ def _writeback_orig_params(self) -> bool:
                 flat_param._params[i] = param
             if needs_param_writeback:
                 expected_shape = torch.Size([numel_in_shard])
-                self._writeback_tensor(param, flat_param, expected_shape, offset, True)
+                self._writeback_tensor(
+                    param, flat_param, i, expected_shape, offset, True
+                )
                 wroteback = True
             # Check for gradient writeback
             # NOTE: Since this method is called in the pre-unshard, which is
@@ -1422,19 +1459,27 @@ def _writeback_orig_params(self) -> bool:
             if param.grad is None and flat_param.grad is not None:
                 expected_shape = torch.Size([numel_in_shard])
                 self._writeback_tensor(
-                    None, flat_param.grad, expected_shape, offset, False
+                    None, flat_param.grad, i, expected_shape, offset, False
                 )
             elif param.grad is not None:
-                needs_grad_writeback = flat_param.grad is None or not _same_storage(
-                    param.grad, flat_param.grad
+                # For `NO_SHARD` + CPU offloading, `_cpu_grad` is always in
+                # memory and owns the gradient storage, so it will never
+                # require gradient writeback.
+                flat_param_grad = (
+                    flat_param.grad if self.uses_sharded_strategy or not self._config.offload_params
+                    else flat_param._cpu_grad  # type: ignore[attr-defined]
+                )
+                needs_grad_writeback = flat_param_grad is None or not _same_storage(
+                    param.grad, flat_param_grad
                 )
                 if needs_grad_writeback:
-                    if flat_param.grad is None:
-                        flat_param.grad = torch.zeros_like(flat_param)
+                    if flat_param_grad is None:
+                        flat_param_grad = torch.zeros_like(flat_param)
                     expected_shape = torch.Size([numel_in_shard])
                     self._writeback_tensor(
-                        param.grad, flat_param.grad, expected_shape, offset, False
+                        param.grad, flat_param_grad, i, expected_shape, offset, False
                     )
+                    flat_param.grad = flat_param_grad
             offset += numel_in_shard
         # TODO (awgu): Handle shared parameters. We need to re-generate the
         # shared parameter data structures in case sharedness changed.
@@ -1456,6 +1501,7 @@ def _writeback_tensor(
         self,
         src_tensor: Optional[Tensor],
         dst_tensor: Tensor,
+        tensor_index: int,
         expected_shape: torch.Size,
         offset: int,
         is_param: bool,  # else gradient
@@ -1465,7 +1511,8 @@ def _writeback_tensor(
         where ``src_tensor`` should have shape ``expected_shape``. ``is_param``
         indicates if the tensor is the parameter (if ``True``) or gradient (if
         ``False``). If ``src_tensor`` is ``None``, then the effect is zeroing
-        instead of copying.
+        instead of copying. ``tensor_index`` gives the index of ``src_tensor``
+        in the metadata structures.
 
         Raises:
             RuntimeError: If the ``src_tensor`` does not have the expected
@@ -1497,6 +1544,8 @@ def _writeback_tensor(
             dst_tensor[offset : offset + expected_shape.numel()].copy_(src_tensor)
         else:
             dst_tensor[offset : offset + expected_shape.numel()].zero_()
+            assert self.flat_param._is_grad_none is not None
+            self.flat_param._is_grad_none[tensor_index] = True
 
     def _clear_grads_if_needed(self):
         """
@@ -1586,6 +1635,16 @@ def shared_parameter_module_names(self) -> Iterator[Tuple[str, str]]:
         ]:
             yield (param_name, module_name)
 
+    @property
+    def _fqns_in_shard(self) -> List[str]:
+        """Returns the FQNs of the parameters present in this rank's shard."""
+        fqns_in_shard: List[str] = []
+        start, end = self.flat_param._shard_indices  # type: ignore[attr-defined]
+        for i in range(len(self.flat_param._fqns)):
+            if i >= start and i <= end and self.flat_param._shard_param_offsets:  # type: ignore[attr-defined]
+                fqns_in_shard.append(self.flat_param._fqns[i])
+        return fqns_in_shard
+
     @property
     def sharded_grad(self) -> Optional[Tensor]:
         """Returns the handle's sharded gradient."""
@@ -1599,13 +1658,40 @@ def sharded_grad(self) -> Optional[Tensor]:
         elif hasattr(flat_param, "_saved_grad_shard"):
             grad = flat_param._saved_grad_shard  # type: ignore[attr-defined]
         else:
+            # If in the forward, then there may be an accumulated gradient,
+            # which will be in `.grad`
             p_assert(
-                flat_param.grad is None or not self.uses_sharded_strategy,
-                "Sharded strategies should use `_cpu_grad` or `_saved_grad_shard`",
+                flat_param.grad is None
+                or not self.uses_sharded_strategy
+                or self._training_state == HandleTrainingState.FORWARD,
+                "Sharded strategies should use `_cpu_grad` or `_saved_grad_shard` "
+                "unless in FORWARD (for the post-forward reshard)",
             )
             grad = flat_param.grad
         return grad
 
+    def _reset_is_grad_none(self) -> None:
+        """
+        Resets the ``_is_grad_none`` mask as needed. This method should only be
+        called in the post-backward after gradient computation, in which case
+        if a parameter requires gradient, then it will surely receive a
+        gradient and we may reset its mask entry to ``False``.
+        """
+        if not self._use_orig_params:
+            return
+        p_assert(
+            self._training_state == HandleTrainingState.BACKWARD_POST,
+            "Expects to only be called in the post-backward after gradient computation",
+        )
+        flat_param = self.flat_param
+        assert flat_param._params is not None  # mypy
+        for i, param in enumerate(flat_param._params):
+            # As long as the parameter requires gradient, it should receive a
+            # meaningful gradient (even if the gradient happens to be zeros)
+            if param.requires_grad:
+                assert flat_param._is_grad_none is not None  # mypy
+                flat_param._is_grad_none[i] = False
+
     #######################
     # CHECKS & INVARIANTS #
     #######################
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 6648c606861c1..a51df5195f0fc 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -3668,6 +3668,11 @@ def _post_backward_hook(
                 orig_grad_data.record_stream(self._streams["post_backward"])
 
                 if handle._use_orig_params:
+                    # Since the handle's `FlatParameter` completed its gradient
+                    # computation, we should reset the gradient noneness mask
+                    handle._reset_is_grad_none()
+                    # Delay using sharded gradient views until after the
+                    # reduce-scatter instead of immediately after resharding
                     handle._use_sharded_grad_views()
 
     def _cast_grad_to_param_dtype(

From 581c137fc9c8a1a5f4469a36e9b6f28b5ab7cfa8 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 21 Oct 2022 07:29:38 -0700
Subject: [PATCH 0017/1922] [dynamo] Unify raise_on_* config to suppress_errors
 and raise by default (#87440)

I noticed that a lot of bugs are being suppressed by torchdynamo's default
error suppression, and worse yet, there's no way to unsuppress them.  After
discussion with voz and soumith, we decided that we will unify error suppression
into a single option (suppress_errors) and default suppression to False.

If your model used to work and no longer works, try TORCHDYNAMO_SUPPRESS_ERRORS=1
to bring back the old suppression behavior.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87440
Approved by: https://github.com/voznesenskym, https://github.com/albanD
---
 benchmarks/dynamo/common.py                | 24 ++++------------------
 test/dynamo/test_aot_cudagraphs.py         |  3 +++
 test/dynamo/test_misc.py                   |  3 ++-
 test/dynamo/test_no_fake_tensors.py        | 12 +++++++++++
 test/dynamo/test_optimizers.py             |  5 -----
 test/dynamo/test_replay_record.py          |  5 +++++
 test/dynamo/test_repros.py                 |  8 ++++++++
 test/dynamo/test_unspec.py                 |  5 +++++
 test/inductor/test_torchinductor.py        |  1 -
 test/inductor/test_torchinductor_opinfo.py |  1 -
 torch/_dynamo/config.py                    | 10 ++++-----
 torch/_dynamo/convert_frame.py             | 14 ++++++-------
 torch/_dynamo/debug_utils.py               |  2 --
 torch/_dynamo/exc.py                       |  2 +-
 torch/_dynamo/guards.py                    | 11 ++++------
 torch/_dynamo/output_graph.py              |  3 +--
 torch/_dynamo/test_case.py                 |  3 ---
 torch/testing/_internal/common_utils.py    |  3 +++
 18 files changed, 59 insertions(+), 56 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 8ff1fb5c3ae93..a2f8af2bc825a 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1466,20 +1466,9 @@ def parse_args():
         help="Use same settings as --inductor for baseline comparisons",
     )
     parser.add_argument(
-        "--raise-on-assertion-error",
+        "--suppress-errors",
         action="store_true",
-        help="Fail a benchmark if torch._dynamo triggers an internal assertion",
-    )
-    parser.add_argument(
-        "--raise-on-backend-error",
-        action="store_true",
-        help="Fail a benchmark if backend throws an exception",
-    )
-    parser.add_argument(
-        "--raise-on-any",
-        "--raise",
-        action="store_true",
-        help="Raise on assertion or backend errors",
+        help="Suppress errors instead of raising them",
     )
     parser.add_argument(
         "--output",
@@ -1672,7 +1661,7 @@ def main(runner, original_dir=None):
             os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
         # Stricter check to disable fallbacks
-        args.raise_on_any = True
+        args.suppress_errors = False
 
     elif args.performance:
         # Ensure that we test on real scenarios
@@ -1736,12 +1725,7 @@ def main(runner, original_dir=None):
     if args.quiet:
         torch._dynamo.config.log_level = logging.ERROR
 
-    torch._dynamo.config.raise_on_assertion_error = (
-        args.raise_on_assertion_error or args.raise_on_any
-    )
-    torch._dynamo.config.raise_on_backend_error = (
-        args.raise_on_backend_error or args.raise_on_any
-    )
+    torch._dynamo.config.suppress_errors = args.suppress_errors
 
     if args.training:
         runner.model_iter_fn = runner.forward_and_backward_pass
diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_aot_cudagraphs.py
index fdb7c88762b8b..cb1d2a0e601ff 100644
--- a/test/dynamo/test_aot_cudagraphs.py
+++ b/test/dynamo/test_aot_cudagraphs.py
@@ -71,6 +71,7 @@ def fn(x, y):
         y = torch.randn(3, device="cuda")
         fn(x, y)
 
+    @patch("torch._dynamo.config.suppress_errors", True)
     @patch_all()
     def test_dtoh(self):
         def model(x, y):
@@ -104,6 +105,7 @@ def fn(x, y):
         y = torch.randn((), device="cpu")
         fn(x, y)
 
+    @patch("torch._dynamo.config.suppress_errors", True)
     @patch("functorch._src.config.use_functionalize", True)
     @patch_all(ok=False)  # input mutation not supported yet
     def test_mutate_input(self):
@@ -143,6 +145,7 @@ def fn(x, y):
         y = torch.randn(1, device="cuda")
         fn(x, y)
 
+    @patch("torch._dynamo.config.suppress_errors", True)
     @patch_all()
     def test_factory(self):
         def model(y):
diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 542a0319a48d3..a0f592212f4e1 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -578,6 +578,8 @@ def fn(count):
         self.assertEqual(cnts.frame_count, 0)
         self.assertEqual(cnts.op_count, 0)
 
+    # KeyError: '__name__'
+    @patch.object(torch._dynamo.config, "suppress_errors", True)
     def test_user_getattr1(self):
         class MyConfig(dict):
             def __getattr__(self, name):
@@ -1959,7 +1961,6 @@ def check_sum_all(tensor: torch.Tensor) -> None:
 
         check_sum_all(torch.randn(200000, dtype=dtype, device=device))
 
-    @patch.object(torch._dynamo.config, "raise_on_backend_error", True)
     def test_raise_on_backend_error(self):
         def my_compiler(gm, _):
             raise RuntimeError("duck!")
diff --git a/test/dynamo/test_no_fake_tensors.py b/test/dynamo/test_no_fake_tensors.py
index d65166f5762c5..df511f1affd55 100644
--- a/test/dynamo/test_no_fake_tensors.py
+++ b/test/dynamo/test_no_fake_tensors.py
@@ -1,4 +1,6 @@
 # Owner(s): ["module: dynamo"]
+import unittest
+
 from torch._dynamo.testing import make_test_cls_with_patches
 
 try:
@@ -23,6 +25,16 @@ def make_no_fake_cls(cls):
 NoFakeTensorsNNModuleTests = make_no_fake_cls(test_modules.NNModuleTests)
 NoFakeTensorsUnspecTests = make_no_fake_cls(test_unspec.UnspecTests)
 
+unittest.expectedFailure(
+    NoFakeTensorsReproTests.test_guard_fail_tensor_bool_no_fake_tensors
+)
+NoFakeTensorsReproTests.test_numpy_list_no_fake_tensors.__unittest_expecting_failure__ = (
+    False
+)
+NoFakeTensorsUnspecTests.test_builtin_getitem_no_fake_tensors.__unittest_expecting_failure__ = (
+    False
+)
+
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py
index ebb2cde24f6ad..92b163b76d6dc 100644
--- a/test/dynamo/test_optimizers.py
+++ b/test/dynamo/test_optimizers.py
@@ -55,11 +55,6 @@ def setUpClass(cls):
                 torch._dynamo.config, "fake_tensor_propagation", False
             )
         )
-        cls._exit_stack.enter_context(
-            unittest.mock.patch.object(
-                torch._dynamo.config, "raise_on_assertion_error", True
-            )
-        )
 
     test_sgd = make_test(torch.optim.SGD, lr=0.01)
     # lgbfs has data-dependent control and internally iterates
diff --git a/test/dynamo/test_replay_record.py b/test/dynamo/test_replay_record.py
index c158590a9d7f4..378fd2b78a9bc 100644
--- a/test/dynamo/test_replay_record.py
+++ b/test/dynamo/test_replay_record.py
@@ -29,6 +29,11 @@ def setUpClass(cls):
         cls._exit_stack.enter_context(
             unittest.mock.patch.object(torch._dynamo.config, "print_graph_breaks", True)
         )
+        # Most of the tests are checking to see if errors got logged, so we
+        # ask for errors to be suppressed
+        cls._exit_stack.enter_context(
+            unittest.mock.patch.object(torch._dynamo.config, "suppress_errors", True)
+        )
         cls._exit_stack.enter_context(
             unittest.mock.patch.object(
                 torch._dynamo.config,
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 2bd3130958eb2..ffc71741d72c2 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1347,6 +1347,8 @@ def fn(args):
 
         self.assertTrue(same(ref, res))
 
+    # AssertionError: ABCMeta
+    @unittest.expectedFailure
     def test_numpy_list(self):
         @torch._dynamo.disable
         def rand_gen():
@@ -1426,6 +1428,8 @@ def fn(x):
 
         fn(torch.randn(3))
 
+    # AssertionError: ABCMeta
+    @unittest.expectedFailure
     def test_isinstance_storage(self):
         @torch._dynamo.optimize("eager")
         def fn(x):
@@ -1464,6 +1468,8 @@ def forward(self, x):
 
         self.assertEqual(y, 10)
 
+    # AssertionError: ABCMeta
+    @unittest.expectedFailure
     def test_sort_out(self):
 
         dtype = torch.float32
@@ -1481,6 +1487,8 @@ def fn():
         opt_fn = torch._dynamo.optimize("eager")(fn)
         opt_fn()
 
+    # AssertionError: ABCMeta
+    @unittest.expectedFailure
     def test_sigmoid_out(self):
 
         dtype = torch.float32
diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py
index fbf3983661935..22f975d0f9d68 100644
--- a/test/dynamo/test_unspec.py
+++ b/test/dynamo/test_unspec.py
@@ -50,6 +50,9 @@ class UnspecTest(cls):
 UnspecReproTests = make_unspec_cls(test_repros.ReproTests)
 UnspecNNModuleTests = make_unspec_cls(test_modules.NNModuleTests)
 
+# RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
+unittest.expectedFailure(UnspecReproTests.test_batch_norm_act_unspec)
+
 
 @patch.object(torch._dynamo.config, "specialize_int_float", False)
 class UnspecTests(torch._dynamo.test_case.TestCase):
@@ -171,6 +174,8 @@ def fn(x):
         res2 = opt_fn(x)
         self.assertTrue(same(res1, res2))
 
+    # TypeError: zeros(): argument 'size' (position 1) must be tuple of SymInts, not FakeTensor
+    @unittest.expectedFailure
     def test_builtin_getitem(self):
         # builtin getitem args[0] is python list and args[1] is unspec
         def fn(x, idx):
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index df5a7fb0a21de..c4e82a8092437 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -167,7 +167,6 @@ def gather_leaf_tensors(args, kwargs):
 
 
 @patch.object(torch._inductor.config.triton, "cudagraphs", False)
-@patch("torch._dynamo.config.raise_on_backend_error", True)
 def check_model(
     self: TestCase,
     model,
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index e0638341eaa2c..220b711efcb51 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -597,6 +597,5 @@ def fn(*args, **kwargs):
 instantiate_device_type_tests(TestInductorOpInfo, globals())
 
 if __name__ == "__main__":
-    torch._dynamo.config.raise_on_assertion_error = True
     if has_triton() and not TEST_WITH_ROCM:
         run_tests()
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 18d1af0a743b4..7a2c79972ddaa 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -74,11 +74,11 @@
 # __torch_function__ logic of the subclass.
 traceable_tensor_subclasses = set()
 
-# Raise torchdynamo internal assertions
-raise_on_assertion_error = False
-
-# Propagate backend exceptions up to torchdynamo.optimize
-raise_on_backend_error = True
+# Suppress errors in torchdynamo.optimize, instead forcing a fallback to eager.
+# This is a good way to get your model to work one way or another, but you may
+# lose optimization opportunities this way.  Devs, if your benchmark model is failing
+# this way, you should figure out why instead of suppressing it.
+suppress_errors = bool(os.environ.get("TORCHDYNAMO_SUPPRESS_ERRORS", False))
 
 # Record and write an execution record of the current frame to a file
 # if an exception is encountered
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index d4afed9f63e37..46a23b330a0a4 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -231,7 +231,7 @@ def replay_record_msg():
         msg = f"WON'T CONVERT {code.co_name} {code.co_filename}\
  line {code.co_firstlineno} \ndue to: \n{traceback.format_exc(limit=-1)}"
 
-        if hasattr(exc, "real_stack"):
+        if hasattr(exc, "real_stack") and len(exc.real_stack) > 0:
             msg += f"\nfrom user code:\n {''.join(traceback.format_list([exc.real_stack[-1]]))}"
 
         msg += replay_record_msg()
@@ -439,7 +439,7 @@ def transform(instructions, code_options):
         raise
     except Exception as e:
         exception_handler(e, code, frame)
-        raise InternalTorchDynamoError()
+        raise InternalTorchDynamoError() from e
 
 
 def convert_frame(compiler_fn: typing.Callable, guard_export_fn=None):
@@ -452,13 +452,11 @@ def _convert_frame(frame: types.FrameType, cache_size: int):
             result = inner_convert(frame, cache_size)
             counters["frames"]["ok"] += 1
             return result
-        except AssertionError:
-            if config.raise_on_assertion_error:
-                raise
-        except BackendCompilerFailed:
-            raise
-        except Exception:
+        except (NotImplementedError, Unsupported):
             pass
+        except Exception:
+            if not config.suppress_errors:
+                raise
         return None
 
     _convert_frame._torchdynamo_orig_callable = compiler_fn
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 7a2466637b767..845c518a4f85d 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -790,8 +790,6 @@ def wrap_backend_debug(compiler_fn, compiler_name: str):
     def debug_wrapper(gm, example_inputs, **kwargs):
         assert config.repro_after in ("dynamo", "aot", None)
         if config.repro_after == "dynamo":
-            # Ensure that we fail when backend fails
-            config.raise_on_backend_error = True
             if config.repro_level == 3:
                 dump_to_minify_after_dynamo(gm, example_inputs, compiler_name)
 
diff --git a/torch/_dynamo/exc.py b/torch/_dynamo/exc.py
index 3001c8c823924..80a2a75712ab4 100644
--- a/torch/_dynamo/exc.py
+++ b/torch/_dynamo/exc.py
@@ -44,7 +44,7 @@ def __init__(self, backend_fn, inner_exception):
             f"{self.backend_name} raised {type(inner_exception).__name__}: {inner_exception}"
             "\n\n"
             "You can suppress this exception and fall back to eager by setting:\n"
-            "    torchdynamo.config.raise_on_backend_error = False"
+            "    torchdynamo.config.suppress_errors = True"
         )
 
 
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index 8f94714784d73..1f43ac667e579 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -5,7 +5,6 @@
 import math
 import os
 import re
-import textwrap
 import types
 import weakref
 from inspect import currentframe, getframeinfo
@@ -560,12 +559,10 @@ def compile_check_fn(self, local_builder, global_builder):
             ]
         )
         closure_vars.update(CLOSURE_VARS)
-        py_code = textwrap.dedent(
-            f"""
-            def ___make_guard_fn({','.join(closure_vars.keys())}):
-                return lambda {args}: {code}
-            """
-        )
+        py_code = f"""\
+def ___make_guard_fn({','.join(closure_vars.keys())}):
+    return lambda {args}: {code}
+"""
         if os.environ.get("TORCHDYNAMO_PRINT_GUARDS", None) == "1":
             print("GUARDS", code)
         set_guard_fail_hook(guard_fail_hook)
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index f9b75b782aa00..f87b07996d73b 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -434,8 +434,7 @@ def call_user_compiler(self, gm):
             log.warning(e, exc_info=True)
             log.warning("-" * 40 + "\n")
             compiled_fn = gm.forward
-            if config.raise_on_backend_error:
-                raise BackendCompilerFailed(self.compiler_fn, e) from e
+            raise BackendCompilerFailed(self.compiler_fn, e) from e
         return compiled_fn
 
     def example_inputs(self):
diff --git a/torch/_dynamo/test_case.py b/torch/_dynamo/test_case.py
index 089e5053d0625..39eda31646d2a 100644
--- a/torch/_dynamo/test_case.py
+++ b/torch/_dynamo/test_case.py
@@ -51,9 +51,6 @@ def tearDownClass(cls):
     def setUpClass(cls):
         super().setUpClass()
         cls._exit_stack = contextlib.ExitStack()
-        cls._exit_stack.enter_context(
-            patch.object(config, "raise_on_backend_error", True)
-        )
         cls._exit_stack.enter_context(
             patch.object(config, "raise_on_ctx_manager_usage", True)
         )
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index edd7c31e98ac9..cb9b52c338118 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -940,6 +940,9 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
     torch._dynamo.config.log_level = logging.ERROR
     # Do not spend time on helper functions that are called with different inputs
     torch._dynamo.config.cache_size_limit = 8
+    # TODO: Remove this; this is grandfathered in because we suppressed errors
+    # on test suite previously
+    torch._dynamo.config.suppress_errors = True
 
 
 def skipIfTorchDynamo(msg="test doesn't currently work with dynamo"):

From 7955596db64ecc0e67604650f9c5a8fd5b10e6ba Mon Sep 17 00:00:00 2001
From: Zachary DeVito <zdevito@gmail.com>
Date: Fri, 21 Oct 2022 03:51:25 +0000
Subject: [PATCH 0018/1922] [inductor] force 'fork' method for processes,
 cleanup (#87411)

To cooperate with other multithreading methods, this
forces the process pool to use 'fork' even if others have set it
diferently. We require fork because otherwise `if __name__ == __main__`
needs to be set which we do not control as a library.

Furthermore this adds code to cleanup worker processes if
the parent exits abnormally (e.g. segfault). Previously we would leave
live but inactive workers around.

cc @jansel @lezcano @fdrocha
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87411
Approved by: https://github.com/soumith, https://github.com/anijain2305
---
 torch/_inductor/codecache.py | 31 +++++++++++++++++++++++++++++--
 torch/_inductor/config.py    |  3 ++-
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index c4400a35cce85..1d83633019cb8 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -3,16 +3,19 @@
 import getpass
 import hashlib
 import logging
+import multiprocessing
 import os
 import re
 import shutil
+import signal
 import subprocess
 import sysconfig
 import tempfile
 import types
 from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor
 from ctypes import cdll
-from time import time
+from threading import Thread
+from time import sleep, time
 from typing import Any, Dict
 
 import torch
@@ -279,7 +282,31 @@ def process_pool():
         # are forked
         cuda_properties._properties()
         assert config.compile_threads > 1
-        return ProcessPoolExecutor(config.compile_threads)
+        orig_ppid = os.getpid()
+
+        # if this process dies abnormally (e.g. segfault)
+        # it will not shut down the workers. Instead
+        # the workers will have their parent reassigned to the
+        # init process. This launches a separate thread to
+        # watch for the worker getting reassigned,
+        # and cleans it up in this case.
+        def init():
+            def run():
+                while True:
+                    sleep(1)
+                    if orig_ppid != os.getppid():
+                        os.kill(os.getpid(), signal.SIGKILL)
+
+            global _watchdog_thread
+            _watchdog_thread = Thread(target=run, daemon=True)
+            _watchdog_thread.start()
+
+        # we rely on 'fork' because we cannot control whether users
+        # have an `if __name__ == '__main__'` in their main process.
+        fork_context = multiprocessing.get_context("fork")
+        return ProcessPoolExecutor(
+            config.compile_threads, mp_context=fork_context, initializer=init
+        )
 
     @classmethod
     def warm_pool(cls):
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index cabaa3e7ce0ba..f4b847e50c820 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -1,4 +1,5 @@
 import os
+import sys
 
 # add some debug printouts
 debug = False
@@ -53,7 +54,7 @@
 
 comment_origin = False
 
-compile_threads = min(32, os.cpu_count())
+compile_threads = min(32, os.cpu_count()) if sys.platform != "win32" else 1
 
 # How to import torchinductor, either torchinductor or torch.inductor
 inductor_import = __name__.replace(".config", "")

From 885ecbf0c5238d60d662567859b139af5fcdc0c9 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Fri, 21 Oct 2022 17:30:14 +0000
Subject: [PATCH 0019/1922] Add dynamo smoke test (#87400)

https://github.com/pytorch/torchdynamo/issues/1733

Move the old smoke test over from the old dynamo repo.

cc @jansel @lezcano @fdrocha
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87400
Approved by: https://github.com/msaroufim
---
 .lintrunner.toml              |   1 +
 tools/dynamo/verify_dynamo.py | 156 ++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 tools/dynamo/verify_dynamo.py

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 70e2a423edcc1..a48d411ea9a83 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -156,6 +156,7 @@ include_patterns = [
 exclude_patterns = [
     # (linbinyu) copied from internal repo
     'tools/code_analyzer/gen_operators_yaml.py',
+    'tools/dynamo/verify_dynamo.py',
     'tools/gen_vulkan_spv.py',
     'tools/test/gen_operators_yaml_test.py',
     'tools/test/gen_oplist_test.py',
diff --git a/tools/dynamo/verify_dynamo.py b/tools/dynamo/verify_dynamo.py
new file mode 100644
index 0000000000000..cbc582a561573
--- /dev/null
+++ b/tools/dynamo/verify_dynamo.py
@@ -0,0 +1,156 @@
+import os
+import re
+import subprocess
+import sys
+import traceback
+import warnings
+
+from pkg_resources import packaging
+
+MIN_CUDA_VERSION = packaging.version.parse("11.6")
+MIN_PYTHON_VERSION = (3, 7)
+
+
+class VerifyDynamoError(BaseException):
+    pass
+
+
+def check_python():
+    if sys.version_info < MIN_PYTHON_VERSION:
+        raise VerifyDynamoError(
+            f"Python version not supported: {sys.version_info} "
+            f"- minimum requirement: {MIN_PYTHON_VERSION}"
+        )
+    return sys.version_info
+
+
+def check_torch():
+    import torch
+
+    return packaging.version.parse(torch.__version__)
+
+
+# based on torch/utils/cpp_extension.py
+def get_cuda_version():
+    from torch.utils import cpp_extension
+
+    CUDA_HOME = cpp_extension._find_cuda_home()
+    if not CUDA_HOME:
+        raise VerifyDynamoError(cpp_extension.CUDA_NOT_FOUND_MESSAGE)
+
+    nvcc = os.path.join(CUDA_HOME, "bin", "nvcc")
+    cuda_version_str = (
+        subprocess.check_output([nvcc, "--version"])
+        .strip()
+        .decode(*cpp_extension.SUBPROCESS_DECODE_ARGS)
+    )
+    cuda_version = re.search(r"release (\d+[.]\d+)", cuda_version_str)
+    if cuda_version is None:
+        raise VerifyDynamoError("CUDA version not found in `nvcc --version` output")
+
+    cuda_str_version = cuda_version.group(1)
+    return packaging.version.parse(cuda_str_version)
+
+
+def check_cuda():
+    import torch
+
+    if not torch.cuda.is_available():
+        return None
+
+    torch_cuda_ver = packaging.version.parse(torch.version.cuda)
+
+    # check if torch cuda version matches system cuda version
+    cuda_ver = get_cuda_version()
+    if cuda_ver != torch_cuda_ver:
+        # raise VerifyDynamoError(
+        warnings.warn(
+            f"CUDA version mismatch, `torch` version: {torch_cuda_ver}, env version: {cuda_ver}"
+        )
+
+    if torch_cuda_ver < MIN_CUDA_VERSION:
+        # raise VerifyDynamoError(
+        warnings.warn(
+            f"(`torch`) CUDA version not supported: {torch_cuda_ver} "
+            f"- minimum requirement: {MIN_CUDA_VERSION}"
+        )
+    if cuda_ver < MIN_CUDA_VERSION:
+        # raise VerifyDynamoError(
+        warnings.warn(
+            f"(env) CUDA version not supported: {cuda_ver} "
+            f"- minimum requirement: {MIN_CUDA_VERSION}"
+        )
+
+    return cuda_ver
+
+
+def check_dynamo(backend, device, err_msg):
+    import torch
+
+    if device == "cuda" and not torch.cuda.is_available():
+        print(f"CUDA not available -- skipping CUDA check on {backend} backend\n")
+        return
+
+    try:
+        import torch._dynamo as dynamo
+
+        dynamo.reset()
+
+        @dynamo.optimize(backend, nopython=True)
+        def fn(x):
+            return x + x
+
+        class Module(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                return x + x
+
+        mod = Module()
+        opt_mod = dynamo.optimize(backend, nopython=True)(mod)
+
+        for f in (fn, opt_mod):
+            x = torch.randn(10, 10).to(device)
+            x.requires_grad = True
+            y = f(x)
+            torch.testing.assert_close(y, x + x)
+            z = y.sum()
+            z.backward()
+            torch.testing.assert_close(x.grad, 2 * torch.ones_like(x))
+    except Exception:
+        sys.stderr.write(traceback.format_exc() + "\n" + err_msg + "\n\n")
+        sys.exit(1)
+
+
+_SANITY_CHECK_ARGS = (
+    ("eager", "cpu", "CPU eager sanity check failed"),
+    ("eager", "cuda", "CUDA eager sanity check failed"),
+    ("aot_eager", "cpu", "CPU aot_eager sanity check failed"),
+    ("aot_eager", "cuda", "CUDA aot_eager sanity check failed"),
+    ("inductor", "cpu", "CPU inductor sanity check failed"),
+    (
+        "inductor",
+        "cuda",
+        "CUDA inductor sanity check failed\n"
+        + "NOTE: Please check that you installed the correct hash/version of `triton`",
+    ),
+)
+
+
+def main():
+    python_ver = check_python()
+    torch_ver = check_torch()
+    cuda_ver = check_cuda()
+    print(
+        f"Python version: {python_ver.major}.{python_ver.minor}.{python_ver.micro}\n"
+        f"`torch` version: {torch_ver}\n"
+        f"CUDA version: {cuda_ver}\n"
+    )
+    for args in _SANITY_CHECK_ARGS:
+        check_dynamo(*args)
+    print("All required checks passed")
+
+
+if __name__ == "__main__":
+    main()

From 61ebcf1dc5eb5b7999f0450c37fd932ca95617b9 Mon Sep 17 00:00:00 2001
From: chuksmbaka <mbakaforever@yahoo.com>
Date: Fri, 21 Oct 2022 17:30:18 +0000
Subject: [PATCH 0020/1922] Grammatical update of the tech docs. (#87357)

Fixes #ISSUE_NUMBER
A more appropriate and correct word.
![grammatical correction](https://user-images.githubusercontent.com/25278471/196927273-7e4c0c9b-96a6-43d1-9b10-17b40665feed.png)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87357
Approved by: https://github.com/albanD
---
 torch/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/__init__.py b/torch/__init__.py
index 8a824642ab57d..63995d6ec7f69 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -2,7 +2,7 @@
 r"""
 The torch package contains data structures for multi-dimensional
 tensors and defines mathematical operations over these tensors.
-Additionally, it provides many utilities for efficient serializing of
+Additionally, it provides many utilities for efficient serialization of
 Tensors and arbitrary types, and other useful utilities.
 
 It has a CUDA counterpart, that enables you to run your tensor computations

From 026cc96295b5f34f6fd8715a7912fb475e31938c Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Thu, 20 Oct 2022 18:06:25 +0100
Subject: [PATCH 0021/1922] Remove redundant zeroing in col2im/im2col (#87375)

All of the kernels already either start by zeroing the output, or are
careful in their implementation to write values to every output
location. So, these `zero_` calls should be redundant.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87375
Approved by: https://github.com/albanD
---
 aten/src/ATen/native/Col2Im.cpp     | 1 -
 aten/src/ATen/native/Im2Col.cpp     | 1 -
 aten/src/ATen/native/cuda/Col2Im.cu | 1 -
 aten/src/ATen/native/cuda/Im2Col.cu | 1 -
 4 files changed, 4 deletions(-)

diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp
index 090a3a8a71db2..5ce747e9c7a7e 100644
--- a/aten/src/ATen/native/Col2Im.cpp
+++ b/aten/src/ATen/native/Col2Im.cpp
@@ -144,7 +144,6 @@ static void col2im_out_cpu_template(
   int64_t n_output_plane = n_input_plane / (kernel_width * kernel_height);
 
   output.resize_({batch_size, n_output_plane, output_height, output_width});
-  output.zero_();
 
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
       input.scalar_type(), "col2im_out_cpu", [&] {
diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp
index dd6c8b303a5fe..7cb5133eef9ad 100644
--- a/aten/src/ATen/native/Im2Col.cpp
+++ b/aten/src/ATen/native/Im2Col.cpp
@@ -85,7 +85,6 @@ static void im2col_out_cpu_template(
   int64_t output_length = output_height * output_width;
 
   output.resize_({batch_size, n_output_plane, output_length});
-  output.zero_();
 
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kBFloat16, kHalf,
       input.scalar_type(), "im2col_out_cpu", [&] {
diff --git a/aten/src/ATen/native/cuda/Col2Im.cu b/aten/src/ATen/native/cuda/Col2Im.cu
index 98d1950004ef2..53eb2df3013eb 100644
--- a/aten/src/ATen/native/cuda/Col2Im.cu
+++ b/aten/src/ATen/native/cuda/Col2Im.cu
@@ -101,7 +101,6 @@ void col2im_out_cuda_template(
   int64_t input_batch_stride = input.stride(0);
 
   output.resize_({batch_size, n_output_plane, output_height, output_width});
-  output.zero_();
   int64_t output_batch_stride = output.stride(0);
 
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,
diff --git a/aten/src/ATen/native/cuda/Im2Col.cu b/aten/src/ATen/native/cuda/Im2Col.cu
index a18d4d822c659..a209aa2764639 100644
--- a/aten/src/ATen/native/cuda/Im2Col.cu
+++ b/aten/src/ATen/native/cuda/Im2Col.cu
@@ -102,7 +102,6 @@ static void im2col_out_cuda_template(
   int64_t output_length = output_height * output_width;
 
   output.resize_({batch_size, n_output_plane, output_length});
-  output.zero_();
 
   // Launch kernel
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16,

From afbee9d887b334e8d74348bc56e272f6575ddf1e Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 21 Oct 2022 17:39:01 +0000
Subject: [PATCH 0022/1922] Skip auto request review on forked PR (#87482)

Addresses the comment in https://github.com/pytorch/pytorch/pull/87409

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87482
Approved by: https://github.com/albanD
---
 .github/workflows/auto_request_review.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml
index 01df7a054005f..352b1cf773b71 100644
--- a/.github/workflows/auto_request_review.yml
+++ b/.github/workflows/auto_request_review.yml
@@ -6,6 +6,8 @@ on:
 
 jobs:
   auto-request-review:
+    # Don't run on forked repos
+    if: github.repository_owner == 'pytorch'
     name: Auto Request Review
     runs-on: ubuntu-latest
     steps:

From 4e382ab15a7e7897a3d8c7bc45c96eed9f21ba6b Mon Sep 17 00:00:00 2001
From: Neel Patel <neelpatel@meta.com>
Date: Fri, 21 Oct 2022 17:39:27 +0000
Subject: [PATCH 0023/1922] Create workflow to make sure PRs have valid labels
 (#86829)

### Context
When a dev submits a PR against the repo, we want to validate that they applied two labels to the PR corresponding the module they edited and the kind of change they're making.

### Change
Extended the open source workflow CI to add a validation to ensure that the PR being checked has the required labels on it.  If it doesn't, the check fails and a bot will post a message on the PR with instructions on what labels the developer needs to add (https://github.com/pytorch/pytorch/wiki/PyTorch-AutoLabel-Bot#why-categorize-for-release-notes-and-how-does-it-work).

### Impact
Every time a new version of PyTorch is released, we want to compile all the changes made to each module. However, when devs forget to tag their PR, compiling the changes to write the release notes becomes a burdensome process (only ~20% of PRs are currently labeled appropriately, which means it can take up to 40 hours to compile release notes). With this new validation, the hope is that most PRs are labeled accordingly for more timely release notes compilation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86829
Approved by: https://github.com/ZainRizvi
---
 .github/requirements-gha-cache.txt   |     2 +
 .github/scripts/check_labels.py      |    84 +
 .github/scripts/gql_mocks.json       | 15011 +++++++++++++++++++++++++
 .github/scripts/test_check_labels.py |    77 +
 .github/scripts/trymerge.py          |     4 +
 .github/workflows/lint.yml           |    32 +
 6 files changed, 15210 insertions(+)
 create mode 100755 .github/scripts/check_labels.py
 create mode 100644 .github/scripts/test_check_labels.py

diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt
index f331d98351ae8..6badbe2cc65c8 100644
--- a/.github/requirements-gha-cache.txt
+++ b/.github/requirements-gha-cache.txt
@@ -5,12 +5,14 @@
 #   docs/cpp/requirements.txt
 #   functorch/docs/requirements.txt
 #   .circleci/docker/requirements-ci.txt
+boto3==1.19.12
 cffi==1.15.0
 dataclasses==0.6
 jinja2==3.0.1
 lintrunner==0.9.2
 ninja==1.10.0.post1
 pynvml==11.4.1
+pyyaml==6.0
 requests==2.26
 rich==10.9.0
 rockset==0.8.10
diff --git a/.github/scripts/check_labels.py b/.github/scripts/check_labels.py
new file mode 100755
index 0000000000000..ff40a94ee6fec
--- /dev/null
+++ b/.github/scripts/check_labels.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""check_labels.py"""
+
+from typing import Any, List
+from datetime import datetime, timedelta
+
+from export_pytorch_labels import get_pytorch_labels
+from gitutils import (
+    get_git_remote_name,
+    get_git_repo_dir,
+    GitRepo,
+)
+from trymerge import (
+    _fetch_url,
+    gh_post_pr_comment,
+    GitHubPR,
+)
+
+
+BOT_AUTHORS = ["github-actions", "pytorchmergebot", "pytorch-bot"]
+
+ERR_MSG_TITLE = "This PR needs a label"
+ERR_MSG = (
+    f"# {ERR_MSG_TITLE}\n"
+    "If your changes are user facing and intended to be a part of release notes, please use a label starting with `release notes:`.\n\n"  # noqa: E501  pylint: disable=line-too-long
+    "If not, please add the `topic: not user facing` label.\n\n"
+    "For more information, see https://github.com/pytorch/pytorch/wiki/PyTorch-AutoLabel-Bot#why-categorize-for-release-notes-and-how-does-it-work."  # noqa: E501  pylint: disable=line-too-long
+)
+
+
+def get_release_notes_labels() -> List[str]:
+    return [label for label in get_pytorch_labels() if label.lstrip().startswith("release notes:")]
+
+
+def delete_comment(comment_id: int) -> None:
+    url = f"https://api.github.com/repos/pytorch/pytorch/issues/comments/{comment_id}"
+    _fetch_url(url, method="DELETE")
+
+
+def has_required_labels(pr: GitHubPR) -> bool:
+    pr_labels = pr.get_labels()
+
+    # Check if PR is not user facing
+    is_not_user_facing_pr = any(label.strip() == "topic: not user facing" for label in pr_labels)
+    if is_not_user_facing_pr:
+        return True
+
+    # Check if bot has already posted a message within the past hour to include a release notes label
+    for comment in pr.get_comments():
+        if comment.body_text.lstrip(" #").startswith(ERR_MSG_TITLE) and comment.author_login in BOT_AUTHORS:
+            ts = datetime.strptime(comment.created_at, "%Y-%m-%dT%H:%M:%SZ")
+            if (datetime.utcnow() - ts) < timedelta(hours=1):
+                return True
+            delete_comment(comment.database_id)
+            break
+
+    return any(label.strip() in get_release_notes_labels() for label in pr_labels)
+
+
+def parse_args() -> Any:
+    from argparse import ArgumentParser
+    parser = ArgumentParser("Check PR labels")
+    parser.add_argument("pr_num", type=int)
+
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+    repo = GitRepo(get_git_repo_dir(), get_git_remote_name())
+    org, project = repo.gh_owner_and_name()
+    pr = GitHubPR(org, project, args.pr_num)
+
+    try:
+        if not has_required_labels(pr):
+            print(ERR_MSG)
+            gh_post_pr_comment(pr.org, pr.project, pr.pr_num, ERR_MSG)
+            exit(1)
+    except Exception as e:
+        pass
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/gql_mocks.json b/.github/scripts/gql_mocks.json
index 4a6ea6a6402c7..164c1ac147e5b 100644
--- a/.github/scripts/gql_mocks.json
+++ b/.github/scripts/gql_mocks.json
@@ -20855,5 +20855,15016 @@
         "team": null
       }
     }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=82169 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "ezyang"
+          },
+          "title": "Move test_dtypes so it runs later",
+          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):\n* __->__ #82169\n\nThe error messages it gives are very unhelpful (because a failure\ngets translated into \"dtype was not supported\" rather than the\nactual backtrace), so I'd rather get error messages about this after\nI've tested basic functionality.\n\nSigned-off-by: Edward Z. Yang <ezyang@fb.com>",
+          "headRefName": "gh/ezyang/1279/head",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "gh/ezyang/1279/base",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "cef34da55a59da5a32494bff218ccd4978b659d3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "83ad7e73a07111ac1d85e931d14360cc22c01edd"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            },
+            "totalCount": 3
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310707890"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708140"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708223"
+                              },
+                              {
+                                "name": "Test collect_env (older_python_version)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708332"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708496"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708710"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310708937"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823981/jobs/4310709169"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGj1lc=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696649"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc8k="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696651"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc8s="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747823982/jobs/4310707884"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjz0w=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696656"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9A="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696660"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRc9Q="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696715"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdAs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708487"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708713"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310708942"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709174"
+                              },
+                              {
+                                "name": "linux-bionic-py3_7-clang8-xla / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709340"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709579"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310709844"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710003"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710175"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.6-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710516"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710716"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310710890"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711097"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711234"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711429"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.2-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711603"
+                              },
+                              {
+                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711765"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310711946"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11_3-py3_7-gcc7-deploy / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712129"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4310712276"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194495"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194591"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194659"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194749"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194858"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311194934"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311195003"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220458"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311220540"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222725"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311222869"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223128"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223225"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223324"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223396"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223496"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223569"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311223690"
+                              },
+                              {
+                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311224360"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311230050"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311301930"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302152"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302303"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302433"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311302531"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491082"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491172"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491232"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491289"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2747824048/jobs/4311491348"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcG0YME=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696836"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdIQ="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAcGjyQg=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546696896"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdMA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546697185"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdeE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546697205"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdfU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/28140e4008289251b695385acfb48ac7a47cd49c/checks?check_suite_id=7546697224"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAcHRdgg="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-07-27T15:34:17Z",
+                  "oid": "28140e4008289251b695385acfb48ac7a47cd49c"
+                }
+              }
+            ]
+          },
+          "changedFiles": 1,
+          "files": {
+            "nodes": [
+              {
+                "path": "test/test_ops.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "zou3519"
+                },
+                "state": "APPROVED"
+              },
+              {
+                "author": {
+                  "login": "Chillee"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0yNVQxNDo0NTozNS0wNzowMLkyMDIyLTA3LTI1VDE0OjQ1OjM1LTA3OjAwzj6XYmg=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "@pytorchbot merge -f FORCE",
+                "createdAt": "2022-07-27T17:56:43Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1197107402
+              },
+              {
+                "bodyText": "You need to provide a reason for using force merge, in the format @pytorchbot merge -f '[CATEGORY] Explanation'. With [CATEGORY] being one the following:\nEMERGENCY - an emergency fix to quickly address an issue\nMINOR - a minor fix such as cleaning locally unused variables, which shouldn't break anything\nPRE_TESTED - a previous CI run tested everything and you've only added minor changes like fixing lint\nOTHER - something not covered above",
+                "createdAt": "2022-07-27T17:56:45Z",
+                "author": {
+                  "login": "pytorch-bot"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1197107439
+              },
+              {
+                "bodyText": "@pytorchbot merge -f \"[OTHER] normal land failed twice already\"",
+                "createdAt": "2022-07-27T17:57:28Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1197108130
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
+                "createdAt": "2022-07-27T18:08:13Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1197119348
+              },
+              {
+                "bodyText": "Hey @ezyang.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-07-27T18:08:58Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1197120095
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOR1poyg==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "Merged"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=73811 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "seemethere"
+          },
+          "title": "ci: Migrate metrics credentials to managed IAM",
+          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* __->__ #73811\n\r\nMigrates our credentials to upload metrics statistics to managed IAM\r\ncredentials in order to make it easier to know where the credentials are\r\ncoming from and to make it easier to add more permissions / less\r\npermissions later on.\r\n\r\nRelates to work done in [D34535827](https://www.internalfb.com/diff/D34535827)\r\n\r\nSigned-off-by: Eli Uriegas <eliuriegas@fb.com>",
+          "headRefName": "gh/seemethere/215/head",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "gh/seemethere/215/base",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "seemethere"
+                    },
+                    "email": "eliuriegas@fb.com",
+                    "name": "Eli Uriegas"
+                  },
+                  "oid": "13c44d16a876a56bca479b4cf30715d21fa16e99"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "seemethere"
+                    },
+                    "email": "eliuriegas@fb.com",
+                    "name": "Eli Uriegas"
+                  },
+                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mg",
+              "hasNextPage": false
+            },
+            "totalCount": 2
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqOaHA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658275867"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcBs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276090"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPo="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cpu-py3"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276092"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcPw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-build"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276094"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276095"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcP8="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276097"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276098"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7-no-ops"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1983602966/jobs/2839950629"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUqObRM=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276099"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Test tools"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276100"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQQ="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-asan"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "CANCELLED",
+                          "url": "https://github.com/pytorch/pytorch/commit/9d26f4e6d8c8df275ea546180fef42548257d2d7/checks?check_suite_id=5658276101"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVFCcQU="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044969?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17045014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17044975?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2022-03-14T23:01:55Z",
+                  "oid": "9d26f4e6d8c8df275ea546180fef42548257d2d7"
+                }
+              }
+            ]
+          },
+          "changedFiles": 3,
+          "files": {
+            "nodes": [
+              {
+                "path": ".github/templates/common.yml.j2"
+              },
+              {
+                "path": ".github/workflows/generated-macos-11-py3-x86-64.yml"
+              },
+              {
+                "path": ".github/workflows/update_pytorch_labels.yml"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "kit1980"
+                },
+                "state": "APPROVED"
+              },
+              {
+                "author": {
+                  "login": "janeyx99"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0wNFQxNDoyNDo0OC0wODowMLkyMDIyLTAzLTA0VDE0OjI0OjQ4LTA4OjAwzjWwwqA=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1988337976",
+                "createdAt": "2022-03-15T17:43:28Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1068270969
+              },
+              {
+                "bodyText": "@pytorchbot force merge this",
+                "createdAt": "2022-03-15T20:26:36Z",
+                "author": {
+                  "login": "seemethere"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1068436128
+              },
+              {
+                "bodyText": "Merge failed due to Too many checksuites for commit\nRaised by https://github.com/pytorch/pytorch/actions/runs/1989076952",
+                "createdAt": "2022-03-15T20:27:47Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1068437098
+              },
+              {
+                "bodyText": "@pytorchbot merge this",
+                "createdAt": "2022-03-15T21:18:55Z",
+                "author": {
+                  "login": "seemethere"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1068482921
+              },
+              {
+                "bodyText": "Hey @seemethere.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-03-15T21:20:40Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1068484404
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOP6yFeQ==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=31093 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "mingxiaoh"
+          },
+          "title": "improve mkldnn convolution test coverage",
+          "body": "This pr will improve the test coverage of mkldnn convolution.\r\n1.test input: specific sensitive numbers\r\n2.pass criteria: output of mkldnn convolution matches output of thnn convolution\r\n3.coverage: by using coverage tool, we found out the following sensitive parameters. Overall the case will test 4352 patterns, takes 8.8s on my machine.\r\n\r\nto run the test case:\r\n\r\npython test_mkldnn_conv2d_ext.py\r\nor\r\npython run_test.py -i mkldnn_conv2d_ext\r\n\r\nIn case of failure, the pattern will be printed in the log for further debugging.\r\n\r\nactually, this PR is created to replace and improve that PR we created before(https://github.com/pytorch/pytorch/pull/25085) ",
+          "headRefName": "master",
+          "headRepository": {
+            "nameWithOwner": "mingxiaoh/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "11pikachu"
+                    },
+                    "email": "junx.du@intel.com",
+                    "name": "dujun"
+                  },
+                  "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            },
+            "totalCount": 1
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "clang-format"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "clang-format",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676797?check_suite_focus=true"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHOQYu8fQ==",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1175281097"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHORg1dyQ=="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "flake8-py3",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676800?check_suite_focus=true"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676817?check_suite_focus=true"
+                              },
+                              {
+                                "name": "clang-tidy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676829?check_suite_focus=true"
+                              },
+                              {
+                                "name": "cmakelint",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/runs/1099676840?check_suite_focus=true"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHOQYu8qA==",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1175281099"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHORg1dyw=="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "codecov/project",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://codecov.io"
+                              },
+                              {
+                                "name": "codecov/patch",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://codecov.io"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHOQZhcFQ==",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1176100822"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHORhnf1g=="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "codecov/patch",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://codecov.io"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHOQZZsEQ==",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1176100824"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHORhnf2A=="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHOUquzJg==",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9/checks?check_suite_id=1487517306"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHOWKm2eg=="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406538?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_devtoolset7_shared-with-deps_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406947?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406544?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406931?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_debug_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406550?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_debug_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406887?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406526?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: binary_windows_libtorch_3_7_cpu_release_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406707?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406533?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: caffe2_onnx_main_py3_6_clang7_ubuntu16_04_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407256?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: caffe2_onnx_ort1_py3_6_clang7_ubuntu16_04_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407254?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: caffe2_onnx_ort2_py3_6_clang7_ubuntu16_04_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407255?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406556?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda10.2-cudnn7-py3.8-gcc9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406532?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.6-gcc9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406527?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-cuda11.0-cudnn8-py3.8-gcc9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406553?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-py3.6-clang9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406537?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-py3.8-gcc9",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406529?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.5.1-py3.6",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406554?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-bionic-rocm3.7-py3.6",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406545?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406543?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406536?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406552?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406535?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406540?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406528?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406541?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-asan",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406549?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-clang7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406555?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc4.8",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406546?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc5.4",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406531?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406534?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.6-gcc7.2",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406523?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3.8",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406539?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.3-py3.6",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406547?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-rocm3.5.1-py3.6",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406551?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407209?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406611?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_bazel_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406607?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_bazel_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406984?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_cpp_doc_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407013?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_doc_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407011?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_ios_11_2_1_x86_64_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406548?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406563?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408680?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_backward_compatibility_check_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407014?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406567?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_py3_6_clang9_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406945?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406561?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_py3_8_gcc9_coverage_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407422?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_bionic_rocm3_7_py3_6_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406562?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406612?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408107?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408111?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_profiling_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7408101?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc5_4_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406613?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406565?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407017?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407019?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407012?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_6_gcc5_4_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407016?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_vulkan_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406608?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406609?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406606?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test1",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407435?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_asan_test2",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407436?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_mobile_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406605?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_mobile_custom_build_dynamic",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406610?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_macos_10_13_py3_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406525?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_macos_10_13_py3_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407415?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_python_doc_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407018?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406566?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_vulkan_linux_bionic_py3_6_clang9_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406946?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cpu_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406542?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406530?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test1",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407028?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda10.1_test2",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407027?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_windows_vs2019_py36_cuda11.0_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406524?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7406572?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_xla_linux_bionic_py3_6_clang9_test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/7407253?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "codecov/patch",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                      },
+                      {
+                        "context": "codecov/project",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://codecov.io/gh/pytorch/pytorch/compare/69f6d94caa3559d4f50745c26af5df041b83fee8...29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                      },
+                      {
+                        "context": "pr/caffe2-pytorch-linux-bionic-rocm3.7-py3.6-test",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://ci.pytorch.org/jenkins/job/caffe2-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger-test/2319/"
+                      },
+                      {
+                        "context": "pr/pytorch-linux-bionic-rocm3.7-py3.6",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-linux-bionic-rocm3.7-py3.6-trigger/2325/"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2020-09-11T01:58:24Z",
+                  "oid": "29f6aa6ecc2ece3fa58170ff4561f9d8d5c129f9"
+                }
+              }
+            ]
+          },
+          "changedFiles": 5,
+          "files": {
+            "nodes": [
+              {
+                "path": "test/math_libraries/convolutions.py"
+              },
+              {
+                "path": "test/math_libraries/convolutions_cases/shapes_googlenet_v3.json"
+              },
+              {
+                "path": "test/math_libraries/convolutions_cases/shapes_maskrcnn_p1.json"
+              },
+              {
+                "path": "test/math_libraries/convolutions_cases/shapes_mobilenet.json"
+              },
+              {
+                "path": "test/math_libraries/convolutions_cases/shapes_resnet_50.json"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "NQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "CHANGES_REQUESTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "CHANGES_REQUESTED"
+              },
+              {
+                "author": {
+                  "login": "ailzhang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "VitalyFedyunin"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "VitalyFedyunin"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "VitalyFedyunin"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAxOS0xMi0zMFQxMDoxOToxMS0wODowMLkyMDE5LTEyLTMwVDEwOjE5OjExLTA4OjAwzhQZLuY=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.\n\n@mruberry  It is suggested by @VitalyFedyunin that, we need to display fail test to avoid invalid inputs, I guess we should set it as expected failures under the pytest test framework, right? we will change it as expected failure cases under pytest test framework. The result will looks like be low, is it ok?\n2500 passed, 136 skipped, 0 failed, 0 errors, 2 expected failures, 0 unexpected passes",
+                "createdAt": "2020-08-14T01:36:20Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "mingxiaoh"
+                },
+                "databaseId": 673816925
+              },
+              {
+                "bodyText": "Displaying tests that fail is fine, but I don't think @VitalyFedyunin meant that it was OK if the tests didn't pass. If these are expected failures then yes, you can use with self.assertRaises(RuntimeError):... when testing them. If you also want to report that the test has test cases with these properties you can print or warn, which will appear in the test output.",
+                "createdAt": "2020-08-14T03:09:37Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 673858224
+              },
+              {
+                "bodyText": "Codecov Report\n\nMerging #31093 into master will not change coverage.\nThe diff coverage is n/a.\n\n\n@@           Coverage Diff           @@\n##           master   #31093   +/-   ##\n=======================================\n  Coverage   68.00%   68.00%           \n=======================================\n  Files         382      382           \n  Lines       49527    49527           \n=======================================\n  Hits        33679    33679           \n  Misses      15848    15848           \n\nContinue to review full report at Codecov.\n\nLegend - Click here to learn more\n\u0394 = absolute <relative> (impact), \u00f8 = not affected, ? = missing data\nPowered by Codecov. Last update 69f6d94...29f6aa6. Read the comment docs.",
+                "createdAt": "2020-09-04T05:41:01Z",
+                "author": {
+                  "login": "codecov"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "codecov"
+                },
+                "databaseId": 686921371
+              },
+              {
+                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale.  Feel free to remove the Stale label if you feel this was a mistake.  If you are unable to remove the Stale label please contact a maintainer in order to do so.  Stale pull requests will automatically be closed 30 days after being marked Stale",
+                "createdAt": "2022-04-12T02:35:37Z",
+                "author": {
+                  "login": "pytorchbot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1095860944
+              },
+              {
+                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.",
+                "createdAt": "2022-06-11T04:40:16Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1152854802
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOKCmhXQ==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "triaged"
+                }
+              },
+              {
+                "node": {
+                  "name": "open source"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "Stale"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOKCmhXQ== name=pytorch number=31093 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Hi, @mingfeima  @soumith  @Jianhui-Li\nthis will improve the test coverage of mkldnn convolution, would you please review it?\nThe current code is forward only, do we need to cover backward, if yes, we can add backward.",
+                "createdAt": "2019-12-12T01:19:02Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 564806270
+              },
+              {
+                "bodyText": "@mingxiaoh, what is the value in testing DNNL as part of Pytorch validation for the Pytorch developers? Shouldn't having these tests run in DNNL validation be enough?",
+                "createdAt": "2019-12-12T01:28:32Z",
+                "author": {
+                  "login": "vpirogov"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 564808528
+              },
+              {
+                "bodyText": "@vpirogov  The main value is to serve as a blind test to DNNL. If DNNL adds these test to DNNL test sets, it lost the value as a blind test.  The spirit of validation is to cross check.\n@gottbrath @gchanan  The test was developed per the request of Pytorch team. Mingxiao made an effort to reduce the execution time to a few second but still with good coverage.  Although the test today is focused on DNNL, it could be easily extended to be blind test for any conv implementation used in Pytorch.",
+                "createdAt": "2019-12-20T07:44:30Z",
+                "author": {
+                  "login": "Jianhui-Li"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 567826907
+              },
+              {
+                "bodyText": "@mruberry thanks for the comment. As for the chainer dependency, we import it is because we would like to use its testing function for pytest test cases combinations, other wise we need to write much more code to achieve same effect. So, can we use it?",
+                "createdAt": "2020-01-15T09:04:34Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 574563012
+              },
+              {
+                "bodyText": "@mingxiaoh You cannot import chainer. Looking at the code you should be able to achieve the same effect without it.",
+                "createdAt": "2020-01-16T17:59:46Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 575272358
+              },
+              {
+                "bodyText": "@mruberry ok, we will change it according to your requirement. Thanks",
+                "createdAt": "2020-02-10T00:59:34Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 583917522
+              },
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/31093\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 29f6aa6 (more details on the Dr. CI page):\n\nCommit 29f6aa6 was recently pushed. Waiting for builds...\n\nThis comment was automatically generated by Dr. CI (expand for details).Follow this link to opt-out of these comments for your Pull Requests.\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2020-05-14T08:04:30Z",
+                "author": {
+                  "login": "dr-ci"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 628466876
+              },
+              {
+                "bodyText": "@mruberry how about those cudnn UT error? we add check for it but it should be NV to fix cudnn bugs.",
+                "createdAt": "2020-05-18T05:34:11Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 629955767
+              },
+              {
+                "bodyText": "Hey @mingxiaoh! You're right, of course, that you shouldn't have to fix cuDNN bugs. Would you please:\n\nAssert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update.\nFile a new issue explaining the behavior and providing a short PyTorch program to reproduce the issue.\n\nThen we can ping NVIDIA on that issue.",
+                "createdAt": "2020-05-18T07:27:08Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 629997129
+              },
+              {
+                "bodyText": "about the suggestion 'Assert that the test case fails, so we know it's failing and if someone fixes it they'll know what test to update. ',  if we only assert it and continue the following test, I guess users might always ignore them in later test. Anyway, any similar example case for reference?",
+                "createdAt": "2020-05-18T07:55:08Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 630010734
+              },
+              {
+                "bodyText": "In this recent PR https://github.com/pytorch/pytorch/pull/38505/files, for example, you can see that the construction of bool tensors wasn't working properly, so the test author cited the relevant issue and asserted that the incorrect behavior happened, as expected. You can also see how these lines are being removed by https://github.com/pytorch/pytorch/pull/38392/files, which fixes the issue.\nAnother common pattern is to use with self.assertRaises(RuntimeError/AssertionError/etc.):.",
+                "createdAt": "2020-05-18T08:02:13Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 630014823
+              },
+              {
+                "bodyText": "@mruberry the failed UT case is not introduced by our modification, how to handle this issue?",
+                "createdAt": "2020-05-20T01:59:13Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 631187735
+              },
+              {
+                "bodyText": "@mingxiaoh You mean the failures on ROCm? You may ignore them. Be sure to re-request review when you're ready.",
+                "createdAt": "2020-05-20T02:12:58Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 631191425
+              },
+              {
+                "bodyText": "@mruberry  we already skipped those ROCm errors, but there are stil somel error caused by the original code, they are not introduced by our modification.",
+                "createdAt": "2020-05-21T05:18:07Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 631886529
+              },
+              {
+                "bodyText": "I understand. Let me know when you're ready for me to review.",
+                "createdAt": "2020-05-21T06:24:15Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 631908011
+              },
+              {
+                "bodyText": "@mruberry thanks, we are ready for review now.",
+                "createdAt": "2020-05-21T06:28:11Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 631909442
+              },
+              {
+                "bodyText": "@mingxiaoh Great! I'll take a look ASAP.",
+                "createdAt": "2020-05-21T06:31:10Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 631910556
+              },
+              {
+                "bodyText": "@mruberry we just pull the latest code and updated the patch according to your comment, may you please help double check it? BTW, the new failed case in preci is not introduced by our modification.",
+                "createdAt": "2020-05-25T07:44:58Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 633430458
+              },
+              {
+                "bodyText": "@ailzhang would you please check the comment below? Thanks.\nIs there a reason why this TestConv2dExt is a new class instead a test inside TestNN?\n//comment: it is actually suggested by Tongzhou Wang in another thread before.\nAlthough this test sits in generic testing framework, it's actually comparing thnn/mkldnn/cudnn results specially. I feel it's better to make it truly generic so that it compares any device result with CPU result. Alternatively you can mark this test only run when torch.backends.mkldnn.is_available()=True\n//comment: but our goal is to compare the result with that of thnn. Anyway, if you insist, we can start to compare it with cpu.",
+                "createdAt": "2020-05-27T05:11:08Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "mingxiaoh"
+                },
+                "databaseId": 634432326
+              },
+              {
+                "bodyText": "Pruning reviewers. @ngimel, @VitalyFedyunin, this PR is looking pretty good from a test framework perspective. Would one of you like to review?",
+                "createdAt": "2020-05-27T09:58:42Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 634557563
+              },
+              {
+                "bodyText": "@mruberry  Thanks, would you please help review it again. BTW: failed case is not introduced by our modification.",
+                "createdAt": "2020-05-28T10:26:32Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 635256214
+              },
+              {
+                "bodyText": "@mruberry  we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code",
+                "createdAt": "2020-06-02T08:00:01Z",
+                "author": {
+                  "login": "1pikachu"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 637364148
+              },
+              {
+                "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.",
+                "createdAt": "2020-06-02T10:23:47Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 637444457
+              },
+              {
+                "bodyText": "@mruberry we moved our case to TestNNDeviceType class, would you please help review it again? BTW, those failed cases are not introduced by our code\n\n@ngimel will follow-up on the test itself sometime this week or early next week.\n\n@mruberry  thank you",
+                "createdAt": "2020-06-02T11:32:06Z",
+                "author": {
+                  "login": "1pikachu"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 637479226
+              },
+              {
+                "bodyText": "Improving test coverage of math libraries is certainly a good goal and this PR is moving towards it. I have some doubts about implementation decisions made, and about running this PR as part of regular pytorch CI.\nIf the primary goal of this PR is to test correctness of the convolution implementations in the vendor library, then it does not serve this purpose. The absolute majority of the 4000+ test cases come from group 1, where different kernel sizes/strides/dilations are used to produce the output of size 1x1. This can test whether pytorch correctly passes convolution parameters to the backends (although there are cheaper ways to do that), but as actual library correctness check it is almost useless - libraries use very different kernels depending in the input/output sizes, and tests with toy sizes like this don't invoke the real bread-and-butter kernels.\nAlso, if this test suite is meant as primary a means of testing vendor libraries (which is a good goal!) it does not have a place as a part of pytorch regular CI, and should be run when the corresponding vendor libraries are updated. I'd suggest moving this test out into a separate file (maybe even outside of torch/test directory) and have it as a part of library update/qualification process rather than regular CI.\nAlso, if the primary goal is to enable easier testing of vendor libraries correctness, perhaps we should rethink the mechanism of the generation of test cases. It should be easy to add a test case with a particular set of parameters that was found to be buggy. Also, running a cross-product of cases in a multi-dimensional space (as this PR does) is rarely an efficient way of getting a signal, some forms of random sampling usually provide a way to get better correctness signal why using less resources.\nAlso, when testing libraries it is important to test both forward and backward functions, whereas this PR does forward only. I'm openminded on whether convTransposed should be tested or not - if we are testing vendor libraries, then it's not necessary, convTransposed calls the same underlying functions, if we are testing pytorch, then it makes sense to test it separately because it takes different codepaths.",
+                "createdAt": "2020-06-02T21:56:33Z",
+                "author": {
+                  "login": "ngimel"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 637827507
+              },
+              {
+                "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.",
+                "createdAt": "2020-06-03T02:16:07Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 637912105
+              },
+              {
+                "bodyText": "@mruberry ngimel is quite responsible, but it seems that she is not familiar with the background of this pull-request, since this pull-request is pending for so such a long time, each time we are almost done, then reviewer changes, each reviewer has different idea, it is good, but, would it be better if you help review it or ask the same reviewer to review it considering that you are more familiar with the background/change history? Thanks in advance.\n\nWe know this PR has been open for awhile and we respect that your time is valuable, but we want to make sure we're making the right change here, and I think @ngimel's comments reflect that and should not be too difficult to address. As I understand, her points are:\n\nThis is a good PR with an exciting idea. To let it run longer and test more cases maybe it should run outside the regular PyTorch CI.\nTo remedy this, let's create a test/math_libraries folder and put this test there: test/math_libaries/convolutions.py. Yes, this is different from our requests in the past, which is our mistake, but it should be an easy change.\nTo make the test more interesting it'd be good for the test cases to resemble convolutions used in practice. The current test cases seem like similar \"toy\" examples. Without time pressure we should be able to run larger, more computationally intensive convolutions.\nLet's change the test cases to include some practical convolutions, make it easy to add test cases, and think about how we might generate other interesting cases. (We should also test backwards once we have more time!)\n\nAnd I think these are good points. Maybe the PR doesn't create a new way to generate interesting convolutions to start and instead only runs a few representative convolutions, but @ngimel is positioning the work for success so that it's useful and we can continue to improve on it in the future.\nDoes that make sense?",
+                "createdAt": "2020-06-03T03:04:55Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 637924703
+              },
+              {
+                "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap.  Given this, it would be be better if you raise all the requirement at a time,  considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.",
+                "createdAt": "2020-06-03T05:22:43Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "mingxiaoh"
+                },
+                "databaseId": 637960626
+              },
+              {
+                "bodyText": "@mruberry we were required to finish the test in limited time long long before, at that time, jianhui discussed this issue with you, and you are all agreed with the current test scope and test case number and test time, so you meant you change your mind now? you are not care about the test time currently? Sorry, this issue is pending so long, we are struggling with it now and would like to finish it asap. Given this, it would be be better if you raise all the requirement at a time, considering that we have many tasks at hand, we are hoping so eagerly that we can finish this PR and use it for further test for bugs finding.\n\nI'm sorry, I don't think I've talked to @Jianhui-Li before. It's true that the team we expressed a concern about timing if the test was to be run in the CI initially, but I think now that we understand what the test is trying to do better we're not sure the CI is the best place for it. The PR was also closed after a lengthy period of inactivity, and we assumed it had simply been abandoned.\nDo you know who @Jianhui-Li spoke with about this issue originally? Maybe I can follow-up with them for more context.",
+                "createdAt": "2020-06-03T05:42:28Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 637967153
+              },
+              {
+                "bodyText": "@mruberry  it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?",
+                "createdAt": "2020-06-03T06:13:14Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 637978356
+              },
+              {
+                "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.",
+                "createdAt": "2020-06-03T20:34:05Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 638446723
+              },
+              {
+                "bodyText": "@mruberry it is reviewed and discussed with @soumith before. Anyway, since current reviewer is you, so, it should be decided by you. So, what we should do next?\n\nI think this will be easier to discuss at the regular Intel-FB meeting.\n\nLet me sync with Mingxiao and follow up with this. Thanks.",
+                "createdAt": "2020-06-03T20:44:44Z",
+                "author": {
+                  "login": "Jianhui-Li"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 638451670
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?",
+                "createdAt": "2020-07-02T14:09:23Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 653028208
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?",
+                "createdAt": "2020-07-06T20:15:04Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 654443242
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks",
+                "createdAt": "2020-07-09T11:04:06Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 656062287
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry  the code is ready for review now, would you please take time for it? Thanks.",
+                "createdAt": "2020-07-14T09:16:48Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 658071151
+              },
+              {
+                "bodyText": "super nit: renaming files to .json will make it more IDE friendly.",
+                "createdAt": "2020-07-14T23:38:37Z",
+                "author": {
+                  "login": "VitalyFedyunin"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 658464685
+              },
+              {
+                "bodyText": "@mruberry would you please help review it again?\n\nHappy to help out, but as last discussed this needs some follow-up at the Intel-FB meeting. Did you get a chance to discuss it there, yet? If so, what did you decide?\n\nyes, we talked it with jianhui, and we decided to follow your ideas. Anyway, we would like to do so modification later, will contact you for review tomorrow. Thanks\n\n@mruberry the code is ready for review now, would you please take time for it? Thanks.\n\nCool! I took a look with @ngimel, once these issues are addressed I think we're good to go!",
+                "createdAt": "2020-07-16T05:17:29Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 659164401
+              },
+              {
+                "bodyText": "@ngimel  & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.",
+                "createdAt": "2020-07-20T08:30:01Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 660884305
+              },
+              {
+                "bodyText": "@ngimel & @VitalyFedyunin We have changed the code according to your suggestions, would you please review it again? Thanks.\n\nUpdated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.",
+                "createdAt": "2020-07-22T20:26:42Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 662678464
+              },
+              {
+                "bodyText": "Updated: one more question about tolerances, one code cleanup recommendation, and one task leftover from the last review.\n@mruberry we have finished the modification according to your comment, would you please review it again? Thanks.",
+                "createdAt": "2020-07-23T10:24:26Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 662930687
+              },
+              {
+                "bodyText": "The code looks good, but I tried running the test suite and hit the following failures:\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n    result = test(self, device_arg, dtype)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 102, in test_conv2d_ext\n    msg=msg\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n    self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float16, group:1, batchsize:22input channel:448, output channel:384, bias:False, padding:[1, 1], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n    result = test(self, device_arg, dtype)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 102, in test_conv2d_ext\n    msg=msg\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 1085, in assertEqual\n    self.assertTrue(result, msg=msg)\nAssertionError: False is not true : device:cuda:0, dtype:torch.float32, group:1, batchsize:22input channel:80, output channel:192, bias:False, padding:[0, 0], dilation:[1, 1], stride:[1, 1], kernel:[3, 3]\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 777, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 241, in instantiated_test\n    result = test(self, device_arg, dtype)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 542, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 411, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 106, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\nLooking at the first invalid convolution, for example, it's:\n    {\n        \"case_name\":\"masknet_p1:conv33\",\n        \"mb\":1,\n        \"g\":1,\n        \"ic\":512,\n        \"ih\":64,\n        \"iw\":64,\n        \"oc\":12,\n        \"kh\":1,\n        \"kw\":1,\n        \"sh\":1,\n        \"sw\":1,\n        \"ph\":0,\n        \"pw\":0,\n        \"dh\":0,\n        \"dw\":0,\n        \"bias\":\"False\"\n    },\n\nwhich has a dh and dw of zero, causing it to be added to invalid cases here:\ndh, dw = case['dh'], case['dw']\n            has_bias = case['bias']\n            if dh == 0 or dw == 0:\n                invalid_cases.append(case_name)",
+                "createdAt": "2020-07-23T21:25:19Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "mruberry"
+                },
+                "databaseId": 663240268
+              },
+              {
+                "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.",
+                "createdAt": "2020-07-27T12:43:44Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 664373079
+              },
+              {
+                "bodyText": "@mruberry the failure was not detected is because we did not export the cudnn path. Yes, you are right, we need to a large atol of 1e-2 . Would you please help review it again? Thanks.\n\nBefore I run these tests again, is an atol of 1e-2 needed for all types or just half? Also, how does 1e-2 compare to the values that are being compared?",
+                "createdAt": "2020-07-27T18:39:27Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 664569507
+              },
+              {
+                "bodyText": "@mruberry 1e-2 is experimental result, details see below, random means it might be failed sometimes.\n\n\n\natol,rtol\n1e-2,1e-2\n1e-2,1e-3\n1e-3,1e-2\n1e-3,1e-3\n1e-4,1e-3\n1e-3,1e-4\n1e-4,1e-4\n1e-4,1e-5\n1e-5,1e-4\n\n\n\n\nCuda float16\npass\npass\npass\npass\npass\nfail\nFail\nFail\nfail\n\n\nCuda float32\npass\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nrandom\nfail",
+                "createdAt": "2020-07-31T03:33:27Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 666894774
+              },
+              {
+                "bodyText": "@mruberry  would you please find time to review it again? Thanks.",
+                "createdAt": "2020-08-04T05:01:20Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 668380451
+              },
+              {
+                "bodyText": "@mruberry would you please find time to review it again? Thanks.\n\nI was just about to try and run this again locally but it looks like the files describing the convolutions are missing?",
+                "createdAt": "2020-08-07T03:49:44Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 670306210
+              },
+              {
+                "bodyText": "@mruberry sorry but what is missing actually?",
+                "createdAt": "2020-08-07T05:00:20Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 670322557
+              },
+              {
+                "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.",
+                "createdAt": "2020-08-07T16:06:41Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 670591170
+              },
+              {
+                "bodyText": "@mruberry sorry but what is missing actually?\n\nThe JSON files.\n\n@mruberry sorry, we add them now, would you please check it again? Thanks.",
+                "createdAt": "2020-08-13T10:40:11Z",
+                "author": {
+                  "login": "mingxiaoh"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 673402901
+              },
+              {
+                "bodyText": "I cloned your repo and ran the tests:\n~/pytorch/test/math_libraries$ python convolutions.py\nFFFF\n======================================================================\nFAIL: test_conv2d_ext_cpu_float32 (__main__.TestConvExtCPU)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float16 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float32 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n======================================================================\nFAIL: test_conv2d_ext_cuda_float64 (__main__.TestConvExtCUDA)\n----------------------------------------------------------------------\nTraceback (most recent call last):\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_utils.py\", line 815, in wrapper\n    method(*args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 244, in instantiated_test\n    result = test(self, *args)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 615, in only_fn\n    return fn(self, device, *args, **kwargs)\n  File \"/private/home/mruberry/git/pytorch/torch/testing/_internal/common_device_type.py\", line 472, in dep_fn\n    return fn(slf, device, *args, **kwargs)\n  File \"convolutions.py\", line 114, in test_conv2d_ext\n    \"invalid cases:\" + \",\".join(invalid_cases)\nAssertionError: invalid cases:masknet_p1:conv33,masknet_p1:conv8,masknet_p1:conv2*4,masknet_p1:conv12,masknet_p1:conv4*3,masknet_p1:conv19,masknet_p1:conv4,masknet_p1:conv4,masknet_p1:conv27,masknet_p1:conv39,masknet_p1:conv23,masknet_p1:conv20,masknet_p1:conv25,masknet_p1:conv17,masknet_p1:conv9*4,masknet_p1:conv36,masknet_p1:conv18,masknet_p1:conv5,masknet_p1:conv38,masknet_p1:conv31,masknet_p1:conv14,masknet_p1:conv26,masknet_p1:conv2,masknet_p1:conv5*2,masknet_p1:conv28,masknet_p1:conv16,masknet_p1:conv20*3,masknet_p1:conv9,masknet_p1:conv14*23,masknet_p1:conv32,masknet_p1:conv30,masknet_p1:conv35,masknet_p1:conv37,masknet_p1:conv3,masknet_p1:conv24,masknet_p1:conv13,masknet_p1:conv21*3,masknet_p1:conv10,masknet_p1:conv7,masknet_p1:conv34,masknet_p1:conv13*24,masknet_p1:conv10*4,masknet_p1:conv22*2,masknet_p1:conv6,masknet_p1:conv22,masknet_p1:conv11,masknet_p1:conv40,masknet_p1:conv15,masknet_p1:conv17*23,masknet_p1:conv29,masknet_p1:conv21,masknet_p1:conv1,masknet_p1:conv11*3,mobilenet:conv3,mobilenet:conv2*4,mobilenet:conv6,mobilenet:conv7,mobilenet:conv5*4,mobilenet:conv4*4,mobilenet:conv7*4,mobilenet:conv1*3,mobilenet:conv10,mobilenet:conv2,mobilenet:conv5,mobilenet:conv4,mobilenet:conv9*4,mobilenet:conv8,mobilenet:conv9,mobilenet:conv6*4,mobilenet:conv10*4,mobilenet:conv11,mobilenet:conv8*20,mobilenet:conv1,mobilenet:conv11*4,mobilenet:conv3*4\n\n----------------------------------------------------------------------\nRan 4 tests in 33.838s\n\nFAILED (failures=4)\n\nStill fails.",
+                "createdAt": "2020-08-13T23:35:00Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 673760580
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOIapCfg==",
+              "hasPreviousPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=76118 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "malfet"
+          },
+          "title": "Dummy change with lots of commits",
+          "body": "Draft PR with 100+ commits, to test mergebot ",
+          "headRefName": "malfet/pr-with-lots-of-commits",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "3067f2240afc7a29dc348000aa19eccbd9772303"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "andrewor14"
+                    },
+                    "email": "andrewor@fb.com",
+                    "name": "Andrew Or"
+                  },
+                  "oid": "2f655b71f70c496c4e645f6cdb27d7bb7e825701"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "0c6dcaa7f58a19c42a530f4ee14bb6f0f03ca9fb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "cad11c563d41ebcffb1683fe1f1288b8157413b3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "jwtan@fb.com",
+                    "name": "Jiewen Tan"
+                  },
+                  "oid": "4dfd0875a68d87fccb5ad0d81692db480043b86e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "2d37e74690582a4a26890e4c8b98f1f80e589c82"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "jwtan@fb.com",
+                    "name": "Jiewen Tan"
+                  },
+                  "oid": "d4aee60947e1a3ef23c7c42990621e0746fdd0a8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "peterbell10"
+                    },
+                    "email": "peterbell10@live.co.uk",
+                    "name": "Peter Bell"
+                  },
+                  "oid": "aac6204bf710beb5e50a383d426ae6222396335a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "4b0362cab884584c24f5834b3874f5f357f56b5d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "7536df613cbc645a9e68e6a3b0a8450753260fd1"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "20a50cb966d28d7bf82924adf781cf72a01ef90e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "486387e8644afb46edff5aa5925b55c8119f67f0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "acb9d78b9b732d3667b881727e6ed9f92a8c549f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "683bb7959a5b973f8470c081ad02e8fc508e784a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "qihqi"
+                    },
+                    "email": "qihan@fb.com",
+                    "name": "Han Qi"
+                  },
+                  "oid": "a870cb40af65adf0b77d55f6b554d7093d284d7a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "Krovatkin"
+                    },
+                    "email": "korovaikon@gmail.com",
+                    "name": "Nikolay Korovaiko"
+                  },
+                  "oid": "70793b9f328ddf52cc86336104c3a064c8582ef4"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "suo"
+                    },
+                    "email": "suo@fb.com",
+                    "name": "Michael Suo"
+                  },
+                  "oid": "f70b31f62b1c5159eef2725484b175983517c88c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dagitses"
+                    },
+                    "email": "mikeyd@fb.com",
+                    "name": "Michael Andreas Dagitses"
+                  },
+                  "oid": "04d3ec1db60defe1c6904bf77e9f8dfa87dc0b63"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "46b754a55b63e3168ad5854ad412c124934b675d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "robieta"
+                    },
+                    "email": "taylorrobie@fb.com",
+                    "name": "Taylor Robie"
+                  },
+                  "oid": "13df69e13ee571fdd716139419a00aec47ade7d6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "70642e911ec80a47cdbf4a50aac475c11aa129b6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "59bb7c39384bf3e0b284a037adef8b3caa53c1c4"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "007cfb97b55d70ff63e1ed71d1a674638f847376"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "0a7b858a5af1393fa3cf2853f92eca0e1d408dde"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "qihqi"
+                    },
+                    "email": "qihan@fb.com",
+                    "name": "Han Qi"
+                  },
+                  "oid": "7917d789f0a523715041ade5177d271082628236"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kit1980"
+                    },
+                    "email": "sdym@fb.com",
+                    "name": "Sergii Dymchenko (Meta Employee)"
+                  },
+                  "oid": "91eb6017f0fb8a1b29e8cb48fac93bc9709f73b3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dagitses"
+                    },
+                    "email": "mikeyd@fb.com",
+                    "name": "Michael Andreas Dagitses"
+                  },
+                  "oid": "bd04dca5fabb0c2a51ac87063a515f256ef274fa"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dagitses"
+                    },
+                    "email": "mikeyd@fb.com",
+                    "name": "Michael Andreas Dagitses"
+                  },
+                  "oid": "1f805a5defda7dabc49d0059edb9ccb06bc29352"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@fb.com",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "4982c0a8db8f23d15ec4bfcbca4ce939afc04954"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pearu"
+                    },
+                    "email": "pearu.peterson@gmail.com",
+                    "name": "Pearu Peterson"
+                  },
+                  "oid": "28502265cb5925cb7db8dcb2dd2334963092714a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "e03fcaedb1342e6d65c7f7f20243000938ba60b2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pritamdamania"
+                    },
+                    "email": "pritam.damania@fb.com",
+                    "name": "pritam"
+                  },
+                  "oid": "efb28f5a1a5d18aa96bd668ab2ab5c651be359f3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "MagiaSN"
+                    },
+                    "email": "magialiao@tencent.com",
+                    "name": "magialiao"
+                  },
+                  "oid": "52cc1b9994f861ebdd3908759ed1ab11cba1f8de"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "3cd99f23d1acd6a5bedf6f3b02be79d64350a5b6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "awgu"
+                    },
+                    "email": "andgu@fb.com",
+                    "name": "Andrew Gu"
+                  },
+                  "oid": "b00502c634a5146f4d996bd90e84d317f049e7b0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "davidberard98"
+                    },
+                    "email": "dberard@fb.com",
+                    "name": "David Berard"
+                  },
+                  "oid": "976eb7cee799dddfbe6a4122b249aaee1b6c8854"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ngimel"
+                    },
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
+                  },
+                  "oid": "9608ab28744d5cae32f371490557b248c9549c66"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "4e119f0c39eb5ff0777f0e71561e6b633d85fb34"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "rohan-varma"
+                    },
+                    "email": "rvarm1@fb.com",
+                    "name": "Rohan Varma"
+                  },
+                  "oid": "447580dc565f3660eddb2c996c6ed25b88338684"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "2bc8f43e9233008ea23053fab87b83ab36fca5e3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "c13a8e891c3e3e714f60649ca1e3b082e090e9fe"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "fddc861b7ee473f57d3c2161e4618a2663a237e8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "jiyuanzFB"
+                    },
+                    "email": "jiyuanz@fb.com",
+                    "name": "Jiyuan Zhang"
+                  },
+                  "oid": "e2336dbc539d6c021720cbe43c92c9e4c8463299"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "26e2759d1ad59aac12168b74d1ca55e42ba9455c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "ad7aa914ee3b3d1252e31514f010ba96c40aae87"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "f113c5d78065aafbe7b1c0e611945bfe9f67b3c0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "a366fd01136292544b7862968ae92feba4b6d8fe"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "seemethere"
+                    },
+                    "email": "eliuriegas@fb.com",
+                    "name": "Eli Uriegas"
+                  },
+                  "oid": "afeba0773749da5883c378a2e6ac066e1ce62ca0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bdhirsh"
+                    },
+                    "email": "hirsheybar@fb.com",
+                    "name": "Brian Hirsh"
+                  },
+                  "oid": "d306c99addc543908f64666baeecacbd0749f4a7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "awgu"
+                    },
+                    "email": "andgu@fb.com",
+                    "name": "Andrew Gu"
+                  },
+                  "oid": "c2456ea658f41f64ea054a422edf22a9c977399f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "awgu"
+                    },
+                    "email": "andgu@fb.com",
+                    "name": "Andrew Gu"
+                  },
+                  "oid": "a8b0a1b681c9fe41e0d553c962a5c93e81d92503"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "anjali411"
+                    },
+                    "email": "chourdiaanjali123@gmail.com",
+                    "name": "anjali411"
+                  },
+                  "oid": "af761d9a5d058c9188f16589bae4f307d35185be"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "clee2000"
+                    },
+                    "email": "csl@fb.com",
+                    "name": "Catherine Lee"
+                  },
+                  "oid": "beceb417baef35b15c2716e23178fb49f7fd6f9d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "1516554e22136db89d0aeba43a1a1a987e995d68"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "qihqi"
+                    },
+                    "email": "qihan@fb.com",
+                    "name": "Han Qi"
+                  },
+                  "oid": "68eb1fa8374eff6cbdcf0be5e37ed6775d22e722"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "janeyx99"
+                    },
+                    "email": "janeyx@fb.com",
+                    "name": "Jane Xu"
+                  },
+                  "oid": "3c7bcb99b5c0c879c2610f427880b03881f82f38"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "janeyx99"
+                    },
+                    "email": "janeyx@fb.com",
+                    "name": "Jane Xu"
+                  },
+                  "oid": "38c1a2028090353e40a019c673c9ab16b39e4825"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "8091cbea2c95ed2c4c406b3c61547a27c6319bae"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "d81f59121969a47c8b2213a88e02cf9be0219be9"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "20d798b319cd107a767fe220f7a3027c18a1c844"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "eb35381a770b58c1cd41e935910cb4df2f3d8f14"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "e6498a657b9aa47546dcd92d1b4ffb2e1a50ebdb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "7f821382db5ad08efe5b09a145c606852b8a9272"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "995c0e11a97d854ff969962bd81d7341e46ecb07"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "davidberard98"
+                    },
+                    "email": "dberard@fb.com",
+                    "name": "David Berard"
+                  },
+                  "oid": "28d6258e62c9fc361a18689877c962c69889dc23"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "HarborYuan"
+                    },
+                    "email": "yuanhaobo@whu.edu.cn",
+                    "name": "Haobo Yuan"
+                  },
+                  "oid": "2350fad8391367ebf81c7236a2c883644b4ff622"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "zou3519"
+                    },
+                    "email": "zou3519@gmail.com",
+                    "name": "Richard Zou"
+                  },
+                  "oid": "3f789c9ccecdd7e2e52269453646e992a68c6b92"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "jeffdaily"
+                    },
+                    "email": "jeff.daily@amd.com",
+                    "name": "Jeff Daily"
+                  },
+                  "oid": "20f79f610c1a3314da96d49515bbfbee9442e4f8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "peterbell10"
+                    },
+                    "email": "peterbell10@live.co.uk",
+                    "name": "Peter Bell"
+                  },
+                  "oid": "5823958f047f3b71a5dc8c52a20eb8ae3291bd3e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "peterbell10"
+                    },
+                    "email": "peterbell10@live.co.uk",
+                    "name": "Peter Bell"
+                  },
+                  "oid": "a0b15c49ecf3844daf2c0dcaef44f0214259db20"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "4afc38c25ca2ca126ba4987a419a58a5c572223b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "b606f58d4a36683fbe0a7d02adfdde7d5cc694c2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "2d61b4d630f6482a6c3cc7437091fad6d27c347e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "george-qi"
+                    },
+                    "email": "georgeqi94@gmail.com",
+                    "name": "George Qi"
+                  },
+                  "oid": "bc5384c47036a6cda94129f3e2f9e43c43393698"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "60fc3277634365b64465712b13db2acb76d6c890"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "1b8762e95bc38d1847fe99ed3230546c8b800bfd"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "jerryzh168"
+                    },
+                    "email": "jerryzh168@gmail.com",
+                    "name": "Jerry Zhang"
+                  },
+                  "oid": "6acf60f95f59ecbc6e8ce830dea0abba7d3ec763"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ysiraichi"
+                    },
+                    "email": "yukio.siraichi@gmail.com",
+                    "name": "Yukio Siraichi"
+                  },
+                  "oid": "8fb0276561fdd530c5a06ea195e930e0584f8705"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "1da7aed95a8700406671425eac1e4bbc2c7a24b5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "thiagocrepaldi"
+                    },
+                    "email": "thiago.crepaldi@microsoft.com",
+                    "name": "Thiago Crepaldi"
+                  },
+                  "oid": "83208e7dee4503c1bee1df9f6632794694dffa01"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "1a46cf08dcd3d3564604c17b2c02d7e4eb45a7ff"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "b7f9b6689445f826c83694652fea5f7cfc7070d7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "fatcat-z"
+                    },
+                    "email": "jiz@microsoft.com",
+                    "name": "Jay Zhang"
+                  },
+                  "oid": "f273961c1696b156e35f8c76f7ad37934031050d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pavithranrao"
+                    },
+                    "email": "pavithran@fb.com",
+                    "name": "Pavithran Ramachandran"
+                  },
+                  "oid": "eb410a51fcbc716873fd80a970eb932d4aaaea61"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ngimel"
+                    },
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
+                  },
+                  "oid": "7dbb12cdc02332fa64264ed0df576511a5070d7e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "43675665fa6b5154de8b25125dd03d7be35c884f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "6c4d23c402c413667463770d9a2fa801f493d3c5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "cf3778a35129a40dee14366515201b7ed2c0f346"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "dzdang"
+                    },
+                    "email": "dzdang@umich.edu",
+                    "name": "dzdang"
+                  },
+                  "oid": "9d00a051373cb81f79cb6375942cf3ec9fff2fe6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "pytorchmergebot"
+                    },
+                    "email": "pytorchmergebot@users.noreply.github.com",
+                    "name": "PyTorch MergeBot"
+                  },
+                  "oid": "1eae67cf404aa8dffb80b8e85180f943878d52a6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "janeyx99"
+                    },
+                    "email": "janeyx@fb.com",
+                    "name": "Jane Xu"
+                  },
+                  "oid": "ce0e69dcda0fe41a6e964d6ac70ce8016979c71a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "swolchok"
+                    },
+                    "email": "swolchok@fb.com",
+                    "name": "Scott Wolchok"
+                  },
+                  "oid": "6faba554f6e49777f24911928edb3061b6ed0e3d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "IvanYashchuk"
+                    },
+                    "email": "ivan.yashchuk@aalto.fi",
+                    "name": "Ivan Yashchuk"
+                  },
+                  "oid": "d1d0e03f57a359f8f95331f9a34b8bed3e7cc845"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "Chillee"
+                    },
+                    "email": "chilli@fb.com",
+                    "name": "Horace He"
+                  },
+                  "oid": "bb46bd9233a9fc631802a902cb48a4c13c2722ca"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "mehtanirav"
+                    },
+                    "email": "niravmehta@fb.com",
+                    "name": "Nirav Mehta"
+                  },
+                  "oid": "3b1007fe4be12e483f2620fbac67cae42e703efc"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "mehtanirav"
+                    },
+                    "email": "niravmehta@fb.com",
+                    "name": "Nirav Mehta"
+                  },
+                  "oid": "b4b65228dd0c109f5fdf17c7d9e56f60a98e398b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "albanD"
+                    },
+                    "email": "albandes@fb.com",
+                    "name": "Alban Desmaison"
+                  },
+                  "oid": "d629e300705196d3ae0bac5ed983b197101fa2ee"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "bigfootjon"
+                    },
+                    "email": "jonjanzen@fb.com",
+                    "name": "Jon Janzen"
+                  },
+                  "oid": "52754b9e515f378f8476ad44d75b0a692bad8cde"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "samdow"
+                    },
+                    "email": "samdow@fb.com",
+                    "name": "samdow"
+                  },
+                  "oid": "128c3ad747093f4970329a82c7c4720420faeff2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "arindamroy-eng"
+                    },
+                    "email": "61168652+arindamroy-eng@users.noreply.github.com",
+                    "name": "arindamroy-eng"
+                  },
+                  "oid": "2a0bda7d32a5bcc9827f7254a7b77cceb16ba973"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTAw",
+              "hasNextPage": true
+            },
+            "totalCount": 131
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNRg4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693698"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRAI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693712"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRBA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693725"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRB0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693741"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRC0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693761"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsREE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193693774"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRE4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192463/jobs/3232430975"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuNR-Y=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694412"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRsw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461134"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461211"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461301"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461386"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461521"
+                              },
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461634"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192461/jobs/3232461717"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuN84s=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694417"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRtE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460797"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232460951"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461088"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461294"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461410"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461543"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461628"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.0-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461719"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461789"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461869"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232461946"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462044"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462112"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462244"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462360"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462432"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462521"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462621"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462683"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232462738"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545510"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232545571"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547522"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547612"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547714"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547764"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547824"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547869"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547909"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232547973"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553452"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553558"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553605"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232553650"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563716"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232563763"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582650"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582703"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232582741"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232590204"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608872"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232608976"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637097"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637199"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232637259"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232639932"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687012"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232687074"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785088"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2197192471/jobs/3232785153"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAWuVD9M=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/5696e8357cf38f852ef3d680381513e26f202371/checks?check_suite_id=6193694439"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXEsRuc="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-04-20T17:10:41Z",
+                  "oid": "5696e8357cf38f852ef3d680381513e26f202371"
+                }
+              }
+            ]
+          },
+          "changedFiles": 348,
+          "files": {
+            "nodes": [
+              {
+                "path": ".circleci/cimodel/data/pytorch_build_data.py"
+              },
+              {
+                "path": ".circleci/cimodel/data/pytorch_build_definitions.py"
+              },
+              {
+                "path": ".circleci/scripts/cpp_doc_push_script.sh"
+              },
+              {
+                "path": ".circleci/scripts/python_doc_push_script.sh"
+              },
+              {
+                "path": ".github/actions/checkout-pytorch/action.yml"
+              },
+              {
+                "path": ".github/merge_rules.json"
+              },
+              {
+                "path": ".github/scripts/gitutils.py"
+              },
+              {
+                "path": ".github/scripts/gql_mocks.json"
+              },
+              {
+                "path": ".github/scripts/trymerge.py"
+              },
+              {
+                "path": ".github/workflows/_bazel-build-test.yml"
+              },
+              {
+                "path": ".github/workflows/_linux-build.yml"
+              },
+              {
+                "path": ".github/workflows/_linux-test.yml"
+              },
+              {
+                "path": ".github/workflows/_mac-test.yml"
+              },
+              {
+                "path": ".github/workflows/_rocm-test.yml"
+              },
+              {
+                "path": ".github/workflows/_win-test.yml"
+              },
+              {
+                "path": ".github/workflows/buck_build_test.yml"
+              },
+              {
+                "path": ".github/workflows/lint.yml"
+              },
+              {
+                "path": ".github/workflows/periodic.yml"
+              },
+              {
+                "path": ".github/workflows/pull.yml"
+              },
+              {
+                "path": ".github/workflows/trunk.yml"
+              },
+              {
+                "path": ".jenkins/pytorch/macos-test.sh"
+              },
+              {
+                "path": ".jenkins/pytorch/test.sh"
+              },
+              {
+                "path": ".jenkins/pytorch/win-test.sh"
+              },
+              {
+                "path": ".lintrunner.toml"
+              },
+              {
+                "path": "BUILD.bazel"
+              },
+              {
+                "path": "CODEOWNERS"
+              },
+              {
+                "path": "README.md"
+              },
+              {
+                "path": "aten/src/ATen/BatchingRegistrations.cpp"
+              },
+              {
+                "path": "aten/src/ATen/Dispatch.h"
+              },
+              {
+                "path": "aten/src/ATen/ExpandUtils.h"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalInverses.cpp"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalStorageImpl.cpp"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalStorageImpl.h"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalTensorWrapper.cpp"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalTensorWrapper.h"
+              },
+              {
+                "path": "aten/src/ATen/FunctionalizeFallbackKernel.cpp"
+              },
+              {
+                "path": "aten/src/ATen/NestedTensorImpl.cpp"
+              },
+              {
+                "path": "aten/src/ATen/OpMathType.h"
+              },
+              {
+                "path": "aten/src/ATen/SparseCsrTensorUtils.h"
+              },
+              {
+                "path": "aten/src/ATen/ThreadLocalState.cpp"
+              },
+              {
+                "path": "aten/src/ATen/ThreadLocalState.h"
+              },
+              {
+                "path": "aten/src/ATen/autocast_mode.cpp"
+              },
+              {
+                "path": "aten/src/ATen/autocast_mode.h"
+              },
+              {
+                "path": "aten/src/ATen/core/SymIntArrayRef.cpp"
+              },
+              {
+                "path": "aten/src/ATen/core/SymIntArrayRef.h"
+              },
+              {
+                "path": "aten/src/ATen/core/TensorBase.h"
+              },
+              {
+                "path": "aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h"
+              },
+              {
+                "path": "aten/src/ATen/core/dispatch/Dispatcher.h"
+              },
+              {
+                "path": "aten/src/ATen/core/interned_strings.h"
+              },
+              {
+                "path": "aten/src/ATen/core/ivalue.cpp"
+              },
+              {
+                "path": "aten/src/ATen/core/ivalue.h"
+              },
+              {
+                "path": "aten/src/ATen/core/ivalue_inl.h"
+              },
+              {
+                "path": "aten/src/ATen/core/jit_type.h"
+              },
+              {
+                "path": "aten/src/ATen/core/jit_type_base.h"
+              },
+              {
+                "path": "aten/src/ATen/core/type.cpp"
+              },
+              {
+                "path": "aten/src/ATen/cuda/CUDASparse.h"
+              },
+              {
+                "path": "aten/src/ATen/cuda/llvm_complex.cpp"
+              },
+              {
+                "path": "aten/src/ATen/cuda/llvm_jit_strings.h"
+              },
+              {
+                "path": "aten/src/ATen/native/Blas.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/Itertools.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/LinearAlgebra.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/SoftMax.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/TensorConversions.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/TensorShape.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/TensorShape.h"
+              },
+              {
+                "path": "aten/src/ATen/native/Unique.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/CUDAJitLoops.cuh"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/JitLoops.cuh"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/Lerp.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/PersistentSoftmax.cuh"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/SoftMax.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/UnarySpecialOpsKernel.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/Unique.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/jit_utils.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/jit_utils.h"
+              },
+              {
+                "path": "aten/src/ATen/native/native_functions.yaml"
+              },
+              {
+                "path": "aten/src/ATen/native/nested/NestedTensorMath.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cpu/qsoftmax.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cudnn/Linear.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/quantized/cudnn/utils.h"
+              },
+              {
+                "path": "aten/src/ATen/native/sparse/SparseCsrTensor.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/ts_native_functions.yaml"
+              },
+              {
+                "path": "aten/src/ATen/record_function.cpp"
+              },
+              {
+                "path": "aten/src/ATen/record_function.h"
+              },
+              {
+                "path": "aten/src/ATen/templates/Operators.h"
+              },
+              {
+                "path": "aten/src/ATen/templates/RegisterFunctionalization.cpp"
+              },
+              {
+                "path": "aten/src/ATen/test/basic.cpp"
+              },
+              {
+                "path": "aten/src/ATen/test/vmap_test.cpp"
+              },
+              {
+                "path": "binaries/record_function_benchmark.cc"
+              },
+              {
+                "path": "c10/core/DispatchKey.cpp"
+              },
+              {
+                "path": "c10/core/DispatchKey.h"
+              },
+              {
+                "path": "c10/core/DispatchKeySet.h"
+              },
+              {
+                "path": "c10/test/core/DispatchKeySet_test.cpp"
+              },
+              {
+                "path": "c10/util/ArrayRef.h"
+              },
+              {
+                "path": "caffe2/core/tensor.h"
+              },
+              {
+                "path": "docs/source/conf.py"
+              },
+              {
+                "path": "docs/source/fx.rst"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTAw",
+              "hasNextPage": true
+            }
+          },
+          "reviews": {
+            "nodes": [],
+            "pageInfo": {
+              "startCursor": null,
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Merge failed due to Matched rule superuser, but it was not reviewed yet by any of:zou3519,abhikrish,mehtanirav,wconstab,lc0, ...",
+                "createdAt": "2022-04-20T17:26:18Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1104215370
+              },
+              {
+                "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet",
+                "createdAt": "2022-04-20T17:31:26Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1104220908
+              },
+              {
+                "bodyText": "@pytorchbot merge this",
+                "createdAt": "2022-04-20T19:30:50Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1104378397
+              },
+              {
+                "bodyText": "Merge failed due to Matched rule superuser, but PR has not been reviewed yet\nRaised by https://github.com/pytorch/pytorch/actions/runs/2197877090",
+                "createdAt": "2022-04-20T19:32:10Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1104379712
+              },
+              {
+                "bodyText": "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as Stale. Feel free to remove the Stale label if you feel this was a mistake. If you are unable to remove the Stale label please contact a maintainer in order to do so. If you want the bot to never mark this PR stale again, add the no-stale label.Stale pull requests will automatically be closed after 30 days of inactivity.",
+                "createdAt": "2022-06-20T16:44:05Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1160658699
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQdD9Sg==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "Stale"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=76123 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "kumpera"
+          },
+          "title": "Introduce distributed checkpoint with ShardedTensor.",
+          "body": "Co-authored-by: Wen Zhang <zhangwen@fb.com>\r\nCo-authored-by: Yifu Wang <yifu@fb.com>\r\n\r\n",
+          "headRefName": "st_checkpoint",
+          "headRepository": {
+            "nameWithOwner": "kumpera/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kumpera"
+                    },
+                    "email": "kumpera@fb.com",
+                    "name": "Rodrigo Kumpera"
+                  },
+                  "oid": "6bf248bc20a71f248064b795f38276326fe43aae"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kumpera"
+                    },
+                    "email": "kumpera@fb.com",
+                    "name": "Rodrigo Kumpera"
+                  },
+                  "oid": "10f84fb90bf02d7062e565ebf2c1da6352b64db7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kumpera"
+                    },
+                    "email": "kumpera@fb.com",
+                    "name": "Rodrigo Kumpera"
+                  },
+                  "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            },
+            "totalCount": 3
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS2l4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755666"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSmtI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063614/jobs/3379894109"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2r3Q=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755785"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0k="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894107"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894332"
+                              },
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894444"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894520"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894567"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894616"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063615/jobs/3379894672"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd2shU=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755786"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm0o="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902301"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902363"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902507"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902560"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902579"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902603"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902637"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902685"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902740"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902761"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902794"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379902874"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903006"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903111"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903193"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903284"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903357"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903446"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903512"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379903546"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944655"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379944695"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946308"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946337"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946359"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946391"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946423"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946453"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946496"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379946529"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950041"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950137"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950165"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950192"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379950646"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951202"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379951230"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963877"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963928"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379963976"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379964018"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379966372"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996173"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379996218"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379997861"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998374"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998397"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998422"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3379998441"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2273063632/jobs/3380042106"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXd5yuY=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "FAILURE",
+                          "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6380755806"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXxSm14="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419477"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419699"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419923"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387419992"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420129"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420208"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796859/jobs/3387420309"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS3SE=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363240"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNGg="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796862/jobs/3387419465"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgS1-o=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363271"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNIc="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387419999"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420164"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420316"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420477"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420675"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387420934"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421278"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421672"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421888"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387421982"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422191"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422303"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422476"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422715"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387422963"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423092"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423234"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423421"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423622"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387423739"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387545789"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546032"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387546119"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553028"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553144"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553251"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553438"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553556"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387553668"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554002"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387554098"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387558927"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559016"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559071"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387559139"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563803"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387563894"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580868"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580936"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387580993"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 4, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387581053"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387592286"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387631950"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387632035"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649916"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387649974"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650084"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650151"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387650373"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2276796865/jobs/3387753429"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAXgaCXo=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "FAILURE",
+                          "url": "https://github.com/pytorch/pytorch/commit/96c5299740ec791f3cf0975c03a40a7b219b6747/checks?check_suite_id=6390363300"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXzlNKQ="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-05-05T00:34:26Z",
+                  "oid": "96c5299740ec791f3cf0975c03a40a7b219b6747"
+                }
+              }
+            ]
+          },
+          "changedFiles": 11,
+          "files": {
+            "nodes": [
+              {
+                "path": "test/distributed/_shard/checkpoint/test_checkpoint.py"
+              },
+              {
+                "path": "test/distributed/_shard/checkpoint/test_file_system_checkpoint.py"
+              },
+              {
+                "path": "test/distributed/_shard/sharded_tensor/test_sharded_tensor.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/__init__.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/filesystem.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/metadata.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/resharding.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/state_dict_loader.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/state_dict_saver.py"
+              },
+              {
+                "path": "torch/distributed/_shard/checkpoint/storage.py"
+              },
+              {
+                "path": "torch/testing/_internal/distributed/_shard/sharded_tensor/_test_st_common.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTE",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "wanchaol"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "simpkins"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zzzwen"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "simpkins"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "simpkins"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "wilson100hong"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "wilson100hong"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "wilson100hong"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "xunnanxu"
+                },
+                "state": "DISMISSED"
+              },
+              {
+                "author": {
+                  "login": "xunnanxu"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "xunnanxu"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "xunnanxu"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "xunnanxu"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "xunnanxu"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pritamdamania87"
+                },
+                "state": "APPROVED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kumpera"
+                },
+                "state": "COMMENTED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0yNVQxMTozNTowMS0wNzowMLkyMDIyLTA0LTI1VDExOjM1OjAwLTA3OjAwzjjC2d0=",
+              "hasPreviousPage": true
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
+                "createdAt": "2022-05-05T12:35:49Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118495479
+              },
+              {
+                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
+                "createdAt": "2022-05-05T12:53:15Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118511287
+              },
+              {
+                "bodyText": "Merge failed due to Can't fetch all PR reviews\nRaised by https://github.com/pytorch/pytorch/actions/runs/2275691136",
+                "createdAt": "2022-05-05T15:00:08Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118662274
+              },
+              {
+                "bodyText": "Merge failed due to Can't fetch all PR reviews Raised by https://github.com/pytorch/pytorch/actions/runs/2275691136\n\n@osalpekar @malfet This is failing because there are 109 review comments on this PR but we only fetch the first 100. This could be solved with a similar concept as how we fetch more comments/check_runs.",
+                "createdAt": "2022-05-05T15:20:46Z",
+                "author": {
+                  "login": "janeyx99"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118689010
+              },
+              {
+                "bodyText": "On a side note, has the test_fsdp_clip_grad_norm_norm_type_2_0_nested_fsdp_False_cpu_offload_CPUOffload failure on the distributed test first shard of this PR been addressed?",
+                "createdAt": "2022-05-05T15:24:08Z",
+                "author": {
+                  "login": "janeyx99"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1118693497
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQqri9w==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "oncall: distributed"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=71759 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "coolteemf"
+          },
+          "title": "Optimize grid sample 3d",
+          "body": "Fixes #71415\r\nI have implemented the changes that replicate what @to-mi did in this [PR](https://github.com/pytorch/pytorch/pull/65986#issue-1012959443) for the 3D case :\r\n\r\n> Fixes #64977\r\n> \r\n> Avoids creating a tensor for and calculating `input` gradient if it's not needed in the backward pass of `grid_sample` (2d case, native CPU & CUDA kernels). Especially the tensor creation seemed time consuming (see #64977).\r\n> \r\n> Brief description of the changes:\r\n> \r\n>     * I have tried to go with rather minimal changes. It would probably be possible to make a more elegant version with a bit larger refactoring (or possibly with better understanding of PyTorch internals and C++ functionalities).\r\n> \r\n>     * Changed the `native_functions.yaml` and `derivatives.yaml` so that the gradient input mask is passed to the functions.\r\n> \r\n>     * Changed the CPU kernels:\r\n>       (1) added `bool input_requires_grad` template parameter to the `backward` function,\r\n>       (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n>       (3) feed in `TensorAccessor<scalar_t, 3>* gInp_slice_ptr` instead of `TensorAccessor<scalar_t, 3>& gInp_slice` so that I can pass a `nullptr` in case gradient for `input` is not requested. (A bit inelegant perhaps, but allows to keep one signature for `backward` function and not require breaking it to smaller pieces. Perhaps there's a more elegant way to achieve this?)\r\n> \r\n>     * Changed CUDA kernel:\r\n>       (1) added ~`bool input_requires_grad` template parameter~ `const bool input_requires_grad` argument to the `backward` function,\r\n>       (2) added if branches based on it to remove `input` gradient computations if it's not requested,\r\n>       (3) feed in `TensorInfo<scalar_t, index_t>()` instead of `getTensorInfo<scalar_t, index_t>(grad_input)` in case gradient for `input` is not requested.\r\n> \r\n>     * Modified tests in `test/test_nn.py` so that they run also cases with no `input` gradient needed.\r\n> \r\n>     * Have not touched the CPU fallback kernel.\r\n\r\nNote: the changes number (3) are N/A in this case.\r\n\r\n",
+          "headRefName": "optimize_grid_sample_3d",
+          "headRepository": {
+            "nameWithOwner": "coolteemf/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "e0b0d1e695aeddceaf265da602c4704592053e9e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "563ec73747ad53b63b36736c47c4342f962c2a09"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "51abe41a132d9dd5b1c0551bdca902aacc028ff8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "be9898205992034a00e8ace8a55c2ecdcee2c2f8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "2929c60b64384c2deae0f7dea8bab94ad4bc9ec8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "9241b737e7e2b257905cc74ad9c50b737d7f9d0a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "64d6b795d0636928a8aa2fd3da01302fb5f5f7af"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "4503577e53760a0006f1e80ca6bfe04d2be90470"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "b16f4b11ffbbbf2ca2098f9702af4ef6b6fc5e1f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "7ffc23368a604afdc92d2818747f730ce31a2bb5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "b85292604b9ad6c31706b76b5a5498c4f6d94309"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "9d81d7bae8ad91aaa24b3ceab83e3138894dbc69"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "e79f6a2202512b294c55bf4bfb2e0524fafd4c48"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "f683e8aec7aea76097a264eec01511e704c31154"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "coolteemf"
+                    },
+                    "email": "67541941+coolteemf@users.noreply.github.com",
+                    "name": "Fran\u00e7ois Lecomte"
+                  },
+                  "oid": "b932e9e286c22aaf352375186df851ef060b295a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "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",
+                    "name": "coolteemf"
+                  },
+                  "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTY",
+              "hasNextPage": false
+            },
+            "totalCount": 16
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGYqY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801320"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_T6g="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-onnx"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663109808"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214802"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754066/jobs/2663214856"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIob0=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801849"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubk="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-build"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754064/jobs/2663109676"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1E=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801852"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ubw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-bionic-rocm4.5-py3.7"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663109684"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401083"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401143"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.rocm.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754065/jobs/2663401186"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwMsZY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801853"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub0="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cuda11.3-py3"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663109680"
+                              },
+                              {
+                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995756"
+                              },
+                              {
+                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995819"
+                              },
+                              {
+                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754068/jobs/2663995900"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwZbzg=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801855"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Ub8="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "mypy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109683"
+                              },
+                              {
+                                "name": "shellcheck",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109827"
+                              },
+                              {
+                                "name": "py2-setup-validate-errormsg",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663109962"
+                              },
+                              {
+                                "name": "clang-format",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110044"
+                              },
+                              {
+                                "name": "cmakelint",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110132"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110233"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110320"
+                              },
+                              {
+                                "name": "clang-tidy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110461"
+                              },
+                              {
+                                "name": "flake8-py3",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754069/jobs/2663110575"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGbAQ=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801856"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-clang7-asan"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663109804"
+                              },
+                              {
+                                "name": "test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233675"
+                              },
+                              {
+                                "name": "test (default, 1, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233731"
+                              },
+                              {
+                                "name": "test (default, 2, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754070/jobs/2663233805"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwJC4U=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801857"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcE="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754076/jobs/2663109810"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ_w=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801862"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_UcY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc5.4"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663109777"
+                              },
+                              {
+                                "name": "test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201383"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201458"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201512"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201580"
+                              },
+                              {
+                                "name": "test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201672"
+                              },
+                              {
+                                "name": "test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754078/jobs/2663201839"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwIWu4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801866"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uco="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1886754079/jobs/2663109681"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATwGZ1k=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/346e0c547953d98eb84d23c1391a95badb9c4a22/checks?check_suite_id=5414801869"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAUK_Uc0="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017798?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017799?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017816?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17017800?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2022-02-23T10:39:30Z",
+                  "oid": "346e0c547953d98eb84d23c1391a95badb9c4a22"
+                }
+              }
+            ]
+          },
+          "changedFiles": 9,
+          "files": {
+            "nodes": [
+              {
+                "path": "aten/src/ATen/native/GridSampler.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/cpu/GridSamplerKernel.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/GridSampler.cpp"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/GridSampler.cu"
+              },
+              {
+                "path": "aten/src/ATen/native/cuda/GridSampler.h"
+              },
+              {
+                "path": "aten/src/ATen/native/native_functions.yaml"
+              },
+              {
+                "path": "test/forward_backward_compatibility/check_forward_backward_compatibility.py"
+              },
+              {
+                "path": "test/test_nn.py"
+              },
+              {
+                "path": "tools/autograd/derivatives.yaml"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "OQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "coolteemf"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "APPROVED"
+              },
+              {
+                "author": {
+                  "login": "albanD"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMS0yNVQwODoyODoxMC0wODowMLkyMDIyLTAxLTI1VDA3OjU0OjA1LTA4OjAwzjNooqI=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Merge failed due to 'NoneType' object is not subscriptable\nRaised by https://github.com/pytorch/pytorch/actions/runs/1887945630",
+                "createdAt": "2022-02-23T14:55:36Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1048868910
+              },
+              {
+                "bodyText": "Thanks for the update! The windows failure is not your fault, you can ignore it!\n\nThank you very much for all of your feedback and sorry for the delay !",
+                "createdAt": "2022-02-23T16:44:36Z",
+                "author": {
+                  "login": "coolteemf"
+                },
+                "authorAssociation": "CONTRIBUTOR",
+                "editor": null,
+                "databaseId": 1048983572
+              },
+              {
+                "bodyText": "@coolteemf can you please send either me or @albanD an email? (or I can send you and invite to collab on private repo)",
+                "createdAt": "2022-02-23T17:49:55Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1049048119
+              },
+              {
+                "bodyText": "@pytorchbot merge this please",
+                "createdAt": "2022-02-23T19:23:55Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1049131992
+              },
+              {
+                "bodyText": "Hey @coolteemf.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-02-23T19:26:51Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1049134520
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOPoR4Lg==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "triaged"
+                }
+              },
+              {
+                "node": {
+                  "name": "open source"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "release notes: nn"
+                }
+              },
+              {
+                "node": {
+                  "name": "topic: performance"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=75095 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "mruberry"
+          },
+          "title": "Initial prims, references, and test architecture for them",
+          "body": "This PR adds an initial set of experimental primitive operations and Python references that reimplement existing PyTorch operations using them. See https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-0/577 for additional context.\r\n\r\nThe following experimental primitives are added:\r\n\r\n- Elementwise unary prims -- abs, acos, acosh, asin, atan, cos, cosh, bessel_i0e, bessel_i1e, cbrt, ceil, digamma, erf, erf_inv, erfc, exp, expm1, floor, igamma, igammac, is_finite, lgamma, log, log1p, neg, reciprocal, round, sign, sinh, sqrt, square, tan. \r\n- Elementwise binary prims -- add, atan2, bitwise_and, bitwise_not, bitwise_or, bitwise_xor, div, eq, ge, gt, le, lt, max, min, mul, ne, nextafter, pow, rsqrt, shift_left, shift_right_arithmetic\r\n- View prims -- brodcast_in_dim, collapse_view, split_dim, squeeze\r\n- Shape prims -- collapse, concatenate, reshape\r\n- Conditional prims -- select\r\n- Data conversion & movement prims -- convert_element_type, device_put\r\n- Inplace prims -- copy_to, resize\r\n\r\nThese primitives do not add any new functionality to PyTorch, but are intended to be the semantic building blocks for reference operators. We have tried to make them consistent with the operations in [jax.lax](https://jax.readthedocs.io/en/latest/jax.lax.html) where possible (because PyTorch prefers being consistent with other frameworks), although there are key differences between these prims and operations in jax.lax. Most notably is that these prims model view semantics and inplace operations.\r\n\r\nIn addition to these primitives the following elementwise binary Python references are added:\r\n\r\n- Elementwise binary Python references -- add, atan2, bitwise_and, bitwise_left_shift, bitwise_or, bitwise_right_shift, bitwise_xor, eq, float_power, ge, gt, le, lt, maximum, minimum, mul, ne, nextafter, pow, sub, true_divide\r\n- Conditional Python references - where\r\n- Data conversion & movement references - copy_to\r\n\r\nA Python reference implements the same behavior as its corresponding PyTorch operator (excepting slight numerical differences, bug fixes, and in some cases additional features). \r\n\r\nThe start of an OpInfo-based test architecture for these references is also included in this PR. A new list, `python_ref_db`, is added to `common_methods_invocations.py`. This list introduces the new `ElementwiseBinaryPythonRefInfo`, which inherits input arguments from the original operators' OpInfo, allows them to be overridden, and then constructs the OpInfo for the Python reference using the (potentially modified) arguments. OpInfo-based tests can opt-into testing references by including this new list in the Sequence passed to the `@ops` decorator. \r\n\r\ncc @ngimel @csarofeen @kevinstephano @Lezcano ",
+          "headRefName": "prims_and_references",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "a790467c650be92775103cde5e866c90b56f5376"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "bd6fcf50692e208ebecdc2eaa517a2bfcdcd35cf"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "4a119c8f21529fe1375e7e8789b91f41a3df80c5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "ea6750dc34d66be759fdfe84b09fb0e23ee59c79"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "2eef8a55fe0227e1921b51bf1f56f9d0a29b49ac"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "b886ed6c20dd1785fd31ed6fa6a8c5b6d0d0b16c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "9ad9b63d09aa4f7a8549bcf1d88ea4ff0674299c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "63fdd580118477416ae160e0670ae722ea248090"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "0ccf7dc292af1d40d0a094eb2b2fb0c7ab4ccc70"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "e8a8a4d1fbe35f20eb88e1a43cf5a653883638e5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "186634dfdd25645c05b58a212f9e8d77c4125fc0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "f5b4741312b5c42a79f6c8a1d3930b79db38ed8f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "23d50391bb0fd12111fd3171591c4235ffb2fc1a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "bac9d45422d58f513b60b4b854441cfdc253d4c5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "13240ae0b4a0332c3167b65ac026a3172da90cb7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "1ee34468cb1db3dc6cbae204669f4fec20e2a466"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ezyang"
+                    },
+                    "email": "ezyang@fb.com",
+                    "name": "Edward Z. Yang"
+                  },
+                  "oid": "561d132bc686d00e8911f7feb3da5901b2bdc574"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ngimel"
+                    },
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
+                  },
+                  "oid": "ac42bedc84b7c96256376ad09917263bb020b2c3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ngimel"
+                    },
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
+                  },
+                  "oid": "7f7d5ba40a0b5e10526d90b018b30b54673d12d8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "37a6b4a8b1adb712d5777c7c3479866c27fb3c4e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ngimel"
+                    },
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
+                  },
+                  "oid": "65b613868c44e519c1777af79b9fd3498c5a7e58"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "ngimel"
+                    },
+                    "email": "ngimel@fb.com",
+                    "name": "Natalia Gimelshein"
+                  },
+                  "oid": "442c405e9da0d66744ef03e379224c41eedf5b57"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "031ac49ae9c192989385986b6707fa781e3229e0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "9a6c3b00039c0c985c1c9cb59490012d1c0b38ba"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "d5c30e408af1889b90012d2e09f6ec3cda333bcb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": null,
+                    "email": "mruberry@devfair044.h1.fair",
+                    "name": "Mike Ruberry"
+                  },
+                  "oid": "db355d55655bb252a699cd532441bb98e52b98d5"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MjY",
+              "hasNextPage": false
+            },
+            "totalCount": 26
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              },
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://opensource.facebook.com/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6ux14=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454954"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2o="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454956"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC2w="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454965"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3U="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454970"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC3o="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454974"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC34="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241454977"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFC4E="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622865/jobs/3270915028"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-c8=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455322"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDNo="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915027"
+                              },
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915071"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915141"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915194"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915229"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915283"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622869/jobs/3270915321"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6e-zM=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455334"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDOY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927344"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.0-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927442"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927507"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927567"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927674"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927727"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927802"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927853"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927948"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270927996"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928061"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928116"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928198"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928256"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928291"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928317"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928338"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928367"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928410"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270928445"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991071"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991125"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991162"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991195"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991233"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991261"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991305"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270991349"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996024"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996068"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996092"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270996505"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270998987"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3270999027"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006886"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271006941"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018097"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018135"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271018162"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271021143"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034041"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.0-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271034072"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271048218"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049553"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049587"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271049616"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068293"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271068336"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149276"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2217622878/jobs/3271149321"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAW6jVK8=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/db355d55655bb252a699cd532441bb98e52b98d5/checks?check_suite_id=6241455360"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAXQFDQA="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-04-25T02:30:31Z",
+                  "oid": "db355d55655bb252a699cd532441bb98e52b98d5"
+                }
+              }
+            ]
+          },
+          "changedFiles": 5,
+          "files": {
+            "nodes": [
+              {
+                "path": "test/test_ops.py"
+              },
+              {
+                "path": "torch/_prims/__init__.py"
+              },
+              {
+                "path": "torch/_prims/utils.py"
+              },
+              {
+                "path": "torch/_refs/__init__.py"
+              },
+              {
+                "path": "torch/testing/_internal/common_methods_invocations.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "NQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "zou3519"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "peterbell10"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "lezcano"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "APPROVED"
+              },
+              {
+                "author": {
+                  "login": "ezyang"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "mruberry"
+                },
+                "state": "COMMENTED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNC0wNlQxMjo1NjoyNC0wNzowMLkyMDIyLTA0LTA2VDA4OjQwOjM4LTA3OjAwzjenO6Y=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Ref implementations by themselves can handle any shapes (and broadcast ops by themselves don't bake in any shapes). The question is can we decide if a particular trace is applicable for a different input, but that depends on the tracing technology and what we are caching on, so out of scope for initial PR.",
+                "createdAt": "2022-04-21T19:00:28Z",
+                "author": {
+                  "login": "ngimel"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1105643418
+              },
+              {
+                "bodyText": "@pytorchbot merge this please",
+                "createdAt": "2022-04-25T04:42:29Z",
+                "author": {
+                  "login": "mruberry"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1108072887
+              },
+              {
+                "bodyText": "Merge failed due to 'mruberry'\nRaised by https://github.com/pytorch/pytorch/actions/runs/2218044244",
+                "createdAt": "2022-04-25T04:43:54Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1108073536
+              },
+              {
+                "bodyText": "@mruberry has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-04-25T04:51:11Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1108075965
+              },
+              {
+                "bodyText": "Hey @mruberry.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-04-25T09:57:56Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1108351107
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQebHmg==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "topic: not user facing"
+                }
+              },
+              {
+                "node": {
+                  "name": "module: primTorch"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=77700 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "kit1980"
+          },
+          "title": "Move pull linux-docs job to Ubuntu 20.04",
+          "body": "",
+          "headRefName": "sdym/pull-xenial-focal-linux-docs",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kit1980"
+                    },
+                    "email": "sdym@fb.com",
+                    "name": "Sergii Dymchenko"
+                  },
+                  "oid": "81261599614423baa17df72300b8e109677b6799"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            },
+            "totalCount": 1
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNmNqE=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147714"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuMI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147726"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuM4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147733"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuNU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147746"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuOI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147762"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuPI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567147780"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuQQ="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528127876"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128023"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128196"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128519"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128575"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128663"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867841/jobs/3528128857"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYVY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567148336"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867843/jobs/3528127882"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdXEg=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567148344"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduuzg="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "docker-builds"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "docker-build (pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127883"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528127945"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128001"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-bionic-py3.7-clang9)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128067"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-bionic-rocm5.0-py3.7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128124"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-bionic-rocm5.1-py3.7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128191"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128259"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128321"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3-clang5-android-ndk-r19c)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128365"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3-clang5-asan)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128446"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3-clang7-asan)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128507"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3-clang7-onnx)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128563"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3.7-gcc5.4)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128639"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-xenial-py3.7-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128687"
+                              },
+                              {
+                                "name": "docker-build (pytorch-linux-focal-py3.7-gcc7)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867844/jobs/3528128741"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNdYLI=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567148352"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu0A="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150762"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528150903"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151086"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151258"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151511"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151776"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528151896"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152014"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152139"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152216"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152378"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152516"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152599"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152723"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152802"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152913"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528152969"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153005"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153062"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153125"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528153207"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242483"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528242528"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245875"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245914"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528245964"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528246008"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528248520"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255086"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528255128"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274064"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274097"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274133"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274173"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528274209"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8 / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528277014"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528308958"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309747"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309810"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309837"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 4, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309864"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309895"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 2, 2, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528309925"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310044"
+                              },
+                              {
+                                "name": "linux-bionic-rocm5.1-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528310101"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384337"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384379"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384408"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384441"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2348867849/jobs/3528384471"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAYNi1Nc=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/81261599614423baa17df72300b8e109677b6799/checks?check_suite_id=6567148369"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAYduu1E="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-05-19T00:02:11Z",
+                  "oid": "81261599614423baa17df72300b8e109677b6799"
+                }
+              }
+            ]
+          },
+          "changedFiles": 3,
+          "files": {
+            "nodes": [
+              {
+                "path": ".circleci/docker/build.sh"
+              },
+              {
+                "path": ".circleci/docker/common/install_katex.sh"
+              },
+              {
+                "path": ".github/workflows/pull.yml"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "suo"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "kit1980"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "janeyx99"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNS0xOFQxMjo0MTowNS0wNzowMLkyMDIyLTA1LTE4VDEyOjQxOjA0LTA3OjAwzjpD7es=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/77700\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 8126159 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-05-17T23:01:48Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1129400934
+              },
+              {
+                "bodyText": "@pytorchbot merge",
+                "createdAt": "2022-05-19T15:39:05Z",
+                "author": {
+                  "login": "kit1980"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1131884232
+              },
+              {
+                "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) linux-docs / build-docs (cpp), linux-docs / build-docs (python) are pending/not yet run for rule OSS CI\nRaised by https://github.com/pytorch/pytorch/actions/runs/2353067846",
+                "createdAt": "2022-05-19T15:40:59Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1131886153
+              },
+              {
+                "bodyText": "@pytorchbot merge -f",
+                "createdAt": "2022-05-19T16:41:29Z",
+                "author": {
+                  "login": "kit1980"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1131945610
+              },
+              {
+                "bodyText": "Hey @kit1980.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-05-19T16:43:37Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1131947473
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQ1FKZg==",
+              "hasPreviousPage": false
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "Merged"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=68111 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "chunyuan-w"
+          },
+          "title": "Add JIT graph fuser for oneDNN Graph API (Preview4)",
+          "body": "## Description\r\nPreview4 PR of this [RFC](https://github.com/pytorch/pytorch/issues/49444).\r\n\r\nOn the basis of https://github.com/pytorch/pytorch/pull/50256, the below improvements are included:\r\n\r\n- The [preview4 release branch](https://github.com/oneapi-src/oneDNN/releases/tag/graph-v0.4.1) of the oneDNN Graph API is used\r\n- The fuser now works with the profiling graph executor. We have inserted type check nodes to guard the profiled tensor properties.\r\n\r\n### User API:\r\nThe optimization pass is disabled by default. Users could enable it by:\r\n```\r\ntorch.jit.enable_onednn_fusion(True)\r\n```\r\n\r\n### Performance:\r\n[pytorch/benchmark](https://github.com/pytorch/benchmark) tool is used to compare the performance:\r\n- SkyLake 8180 (1 socket of 28 cores):\r\n\r\n  ![image](https://user-images.githubusercontent.com/65992142/151162305-05e44425-a24e-4d5e-94e1-743b40b87a8c.png)\r\n\r\n- SkyLake 8180 (single thread):\r\n\r\n  ![image](https://user-images.githubusercontent.com/65992142/151162528-69f90b79-d08d-46b8-8775-d80a6ccbce8a.png)\r\n \\* By mapping hardswish to oneDNN Graph, it\u2019s 8% faster than PyTorch JIT (NNC + OFI)\r\n  \\** We expect performance gain after mapping transpose, contiguous & view to oneDNN graph ops\r\n\r\n\r\n### Directory structure of the integration code\r\nFuser-related code are placed under:\r\n```\r\ntorch/csrc/jit/codegen/onednn/\r\n```\r\n\r\nOptimization pass registration is done in:\r\n```\r\ntorch/csrc/jit/passes/onednn_graph_fuser.h\r\n```\r\n\r\nCMake for the integration code is:\r\n```\r\ncaffe2/CMakeLists.txt\r\n```\r\n\r\n## Limitations\r\n\r\n- In this PR, we have only supported the optimization on Linux platform. The support on Windows and MacOS will be enabled as the next step.\r\n- We have only optimized the inference use case.",
+          "headRefName": "chunyuan/llga_preview2",
+          "headRepository": {
+            "nameWithOwner": "chunyuan-w/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "0096fcc49f277fd8e006fcb42e0cb28a1422ec98"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "7bcc4de26a5472f1d252735dd425b46794b0844f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "3a2a588bfe6bbf9bf74d88d441cd22affda207da"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "ca7df12fbfaa3ddbabeca39b76300d17f4a33f2f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "81d44f35b8bc043c38837d0694e5bc072203b832"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "14fd5d1bfc2c58a71379f778871e3fca0a8e79b2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "954dc23663125897f4b199eb2a8607dc5fca3274"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "9f77a0b476accc678b6f0569e4ff33fa6bbe97fc"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "fbf3b23bc1288697e1aec539a7c4ee3dc0bcb84c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "f8b8e78f786586c3cdf3966fd83ffa124d3eda70"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "6fffa2f7453ee7e0f8d8e2f73ea8a65230539589"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "849385404e6f3cd1cf7cef19f931ecf4fa28afdb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "adbae7b77f8c0dbc59fccf15207d97ba86cfade2"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "6dcf2a4981aff24fa16fc7461ae4ec29690f956f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "54f3e05ad524cffd0911ee93be3c50f589b51f58"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "edbfc640ea79a0af85757d9e73796dcc90231519"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "chunyuan-w"
+                    },
+                    "email": "chunyuan.wu@intel.com",
+                    "name": "chunyuan"
+                  },
+                  "oid": "67654db7cba562809d1b4a44cdda58af5cc9daaf"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "9c9d99b930b11af9ff03f52d45bf49c652df758d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "ffb25119cd9ce815cc4d9d14a2317fcbbfa9ea86"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "ab9eee84512ca1bdfbc81e25c6eb67b29d0f302a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "62a4642cf3330524990a69ac29e002c97812320a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "ca9b1223be4af2c8b4929303d498eafd71793128"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "6f4a23d24514a02954d2ec792830085f612223c9"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "b2a9a9c0926b02d0b2e87722ed61450f224a61d0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "e88b492be733f24b6aa395829c76add67d0901e7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "c44336d7a914952bfb78e012e08d9a6d6dde5937"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "5157930f7b3921d41a586260582b574c915f6ca1"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "04cb8353813f6bbd0d913a994923cc7e1e291406"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "62991eaad0e638bb0bced327e03f932f66f68732"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "7496bf1588050191595d833d23b8972b2f22655e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "d9d35f23cca0cd29c78a845731b24826152dcf1c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "f74ec134f18a65a7c72455bdf44f72e3ebb27105"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "eb32cc65a975361160948bfc3d6a577991ea262e"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "c7665f8d695b680c54db0bad2b7b7df46d886b50"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "e6321ad8f59ea01130568c202d186448bb9cb9d0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "a72cd0d02693f45e5354a70654581ad514581ec7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "b3cd3028b4ed31805e82f7eaf02217ab74ca59b9"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "49a592d9788d08e6cd0593882f867e129057c1cc"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "0575766b2144b13f6a38227c4e2b8d22ec8db80f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "b5c9b10ff87d622350e8ca64fae3a476eb70d5aa"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "66bc652a30ccc329adb929870a4ac726bb98b38c"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "72b9ca9c8e2dac98cbb7199b3dfac7c7305b80c5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "a7892ed7373207d96406c8b5734a089643c5cdbd"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "d54cb084e1daad8a08c3f8de0ad3f7afb5b05ac1"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "aef71d692a8a159e0ca56be363e2cc1225ce7647"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "bf618e205ec31cff962dcc8ab478e0a699a9572d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "e4a331f1088448f7d7d86256ce71e0e71da006b0"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "0b743523d1430fec759d5fefbb687f17c89335a5"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "e80a351a62d98b810ec8985c4b25257af1d6c5bb"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "c189eca154b6691919d0e21489d1c322c7435c0b"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "e080a067c75d7b888a8a362682a2d5ba70e0c3a8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "028561fbf8f3ed90e074e6e0e3a4ca4dd7ffa2a8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "d550cf14037badd4caa2f52202e2f20bc4db8432"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "574159ebadd1dec24daaf883879ffeca8d9e71b7"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "9eb3ee98ea756067ed1c8f52f309f6d3e211a904"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "29929f48be03dcdd1bbfade572de7feafa825547"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "8a7358ca8da547b40ea1a99ddc57ebed19959684"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "6606637d2c5525b43e294a8b366a85052e1be0c6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "5ecfd1f28b87045deb8bc8ffe33b3d8b906f3264"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchit.jain"
+                  },
+                  "oid": "be2d4345c65442c4cfbe8afdfb2ae0893945da42"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "sanchitintel"
+                    },
+                    "email": "sanchit.jain@intel.com",
+                    "name": "sanchitintel"
+                  },
+                  "oid": "b5b89d3644a43e2dbda841cafb71b32edbe07c8a"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nikita.shulga@gmail.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "NjI",
+              "hasNextPage": false
+            },
+            "totalCount": 62
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              },
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://opensource.facebook.com/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NXnc=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625010"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYwzI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "clang-format",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895825"
+                              },
+                              {
+                                "name": "py2-setup-validate-errormsg",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895911"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903895963"
+                              },
+                              {
+                                "name": "shellcheck",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896134"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896253"
+                              },
+                              {
+                                "name": "clang-tidy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896371"
+                              },
+                              {
+                                "name": "cmakelint",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896525"
+                              },
+                              {
+                                "name": "flake8-py3",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896658"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896771"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896795"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896838"
+                              },
+                              {
+                                "name": "mypy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440028/jobs/2903896897"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NZqw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625458"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPI="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440031/jobs/2903895828"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_NYIw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625463"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxPc="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896014"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896165"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896394"
+                              },
+                              {
+                                "name": "linux-bionic-rocm4.5-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896572"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896666"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896778"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896837"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896896"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903896936"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897025"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897161"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897213"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897280"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897368"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897431"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897476"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897578"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897630"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897699"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2903897733"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327787"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327838"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327956"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904327997"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328035"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328093"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328131"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904328177"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904333962"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904334006"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430419"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430459"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430508"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904430573"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443663"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443723"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904443787"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454239"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904454303"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554602"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904554698"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588855"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588886"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904588924"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904655702"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656104"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656150"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904656192"
+                              },
+                              {
+                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706520"
+                              },
+                              {
+                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2018440039/jobs/2904706565"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAU_fN1g=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE",
+                          "url": "https://github.com/pytorch/pytorch/commit/73881411e2bfb3aaa2e89926a82390b4c587ad75/checks?check_suite_id=5743625483"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVZYxQs="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048428?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048429?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048431?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17048430?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2022-03-21T19:58:52Z",
+                  "oid": "73881411e2bfb3aaa2e89926a82390b4c587ad75"
+                }
+              }
+            ]
+          },
+          "changedFiles": 37,
+          "files": {
+            "nodes": [
+              {
+                "path": "aten/src/ATen/core/interned_strings.h"
+              },
+              {
+                "path": "caffe2/CMakeLists.txt"
+              },
+              {
+                "path": "cmake/Dependencies.cmake"
+              },
+              {
+                "path": "cmake/Modules/FindMKLDNN.cmake"
+              },
+              {
+                "path": "cmake/public/mkldnn.cmake"
+              },
+              {
+                "path": "docs/source/jit.rst"
+              },
+              {
+                "path": "test/test_jit_llga_fuser.py"
+              },
+              {
+                "path": "torch/_C/__init__.pyi.in"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/README.md"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/defer_size_check.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/defer_size_check.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_fuser.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_fuser.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_helper.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_helper.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/graph_rewriter.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/guard_shape.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/guard_shape.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/interface.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/interface.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/kernel.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/kernel.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/layout_propagation.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/layout_propagation.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/operator.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/prepare_binary.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/prepare_binary.h"
+              },
+              {
+                "path": "torch/csrc/jit/codegen/onednn/register_interface.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/ir/alias_analysis.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/ir/ir.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/passes/inline_autodiff_subgraphs.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/passes/onednn_graph_fuser.h"
+              },
+              {
+                "path": "torch/csrc/jit/python/init.cpp"
+              },
+              {
+                "path": "torch/csrc/jit/runtime/operator.cpp"
+              },
+              {
+                "path": "torch/jit/__init__.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mzc",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "pinzhenx"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pinzhenx"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "pinzhenx"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "chunyuan-w"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "eellison"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "wukong1992"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "eellison"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "eellison"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "eellison"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "eellison"
+                },
+                "state": "APPROVED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "eellison"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "malfet"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "malfet"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "malfet"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              },
+              {
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "state": "COMMENTED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMS0xMi0xMFQwOToyNDoxOS0wODowMLkyMDIxLTEyLTEwVDA5OjI0OjE5LTA4OjAwzjFryLE=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.",
+                "createdAt": "2022-03-21T22:51:38Z",
+                "author": {
+                  "login": "suo"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074498483
+              },
+              {
+                "bodyText": "@pytorchbot revert this",
+                "createdAt": "2022-03-21T22:51:44Z",
+                "author": {
+                  "login": "suo"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074498550
+              },
+              {
+                "bodyText": "Looks like this broke master https://hud.pytorch.org/pytorch/pytorch/commit/7dd08230117f4fa8bb82b3524e90fb00340198c7. I am reverting.\n\nOops! Will fix it ASAP.",
+                "createdAt": "2022-03-21T22:53:34Z",
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1074499668
+              },
+              {
+                "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).",
+                "createdAt": "2022-03-21T23:07:23Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074508608
+              },
+              {
+                "bodyText": "This pull request has been reverted by e5bf879. To re-land this change, please open another pull request, assignthe same reviewers, fix the CI failures that caused the revert and make sure that the failing CI runs on the PR by applying the proper ciflow label (e.g., ciflow/trunk).",
+                "createdAt": "2022-03-30T00:53:50Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1082508130
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQAuLsw==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "oncall: jit"
+                }
+              },
+              {
+                "node": {
+                  "name": "triaged"
+                }
+              },
+              {
+                "node": {
+                  "name": "open source"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "Reverted"
+                }
+              },
+              {
+                "node": {
+                  "name": "intel priority"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOQAuLsw== name=pytorch number=68111 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/chunyuan-w/pytorch/blob/7496bf1588050191595d833d23b8972b2f22655e/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-full-jit\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries/conda\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries/libtorch\n\ud83d\udeab skipped\n\n\nlinux-binary-manywheel\nciflow/binaries, ciflow/binaries/wheel\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.1-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.1-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\n\n\nYou can add a comment to the PR and tag @pytorchbot with the following commands:\n\n# ciflow rerun, \"ciflow/default\" will always be added automatically\n@pytorchbot ciflow rerun\n\n# ciflow rerun with additional labels \"-l <ciflow/label_name>\", which is equivalent to adding these labels manually and trigger the rerun\n@pytorchbot ciflow rerun -l ciflow/scheduled -l ciflow/slow\n\nFor more information, please take a look at the CI Flow Wiki.",
+                "createdAt": "2021-11-10T08:42:49Z",
+                "author": {
+                  "login": "pytorch-probot"
+                },
+                "authorAssociation": "NONE",
+                "editor": {
+                  "login": "pytorch-probot"
+                },
+                "databaseId": 964902865
+              },
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/68111\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 7388141 (more details on the Dr. CI page):\n\n\n29/29 failures introduced in this PR\n\n\n\ud83d\udd75\ufe0f 29 new failures recognized by patterns\nThe following CI failures do not appear to be due to upstream breakages:\n pull / linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge) (1/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:31:38.6978776Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:31:38.3001628Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:31:38.5169168Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:31:38.5362923Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:31:38.5413452Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:31:38.5458747Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:31:38.5484014Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:31:38.5497924Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:31:38.5656491Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:31:38.5678893Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:31:38.6888479Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f6488c20adb4dca4\n2022-03-21T21:31:38.6978776Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:31:38.6992648Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:31:38.7003010Z ##[error]Process completed with exit code 2.\n2022-03-21T21:31:38.7044027Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:31:38.7044261Z with:\n2022-03-21T21:31:38.7044413Z env:\n2022-03-21T21:31:38.7044565Z   IN_CI: 1\n2022-03-21T21:31:38.7044709Z   IS_GHA: 1\n2022-03-21T21:31:38.7044885Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:31:38.7045067Z ##[endgroup]\n2022-03-21T21:31:38.7060958Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge) (2/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:35:19.2635222Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:35:18.9028722Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:35:19.1132721Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:35:19.1310590Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:35:19.1360251Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:35:19.1386865Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:35:19.1429182Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:35:19.1441925Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:35:19.1468280Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:35:19.1617667Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:35:19.2545368Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-098be2985e0392130\n2022-03-21T21:35:19.2635222Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:35:19.2648463Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:35:19.2658727Z ##[error]Process completed with exit code 2.\n2022-03-21T21:35:19.2706355Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:35:19.2706591Z with:\n2022-03-21T21:35:19.2706748Z env:\n2022-03-21T21:35:19.2706908Z   IN_CI: 1\n2022-03-21T21:35:19.2707061Z   IS_GHA: 1\n2022-03-21T21:35:19.2707246Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:35:19.2707438Z ##[endgroup]\n2022-03-21T21:35:19.2724554Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge) (3/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:11:52.7662022Z   Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T23:11:53.1213298Z      ---------------------------------------- 8.1/8.1 MB 23.6 MB/s eta 0:00:00\n2022-03-21T23:11:53.1644665Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:11:53.2218699Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T23:11:53.2389674Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T23:11:53.2787295Z      -------------------------------------- 247.7/247.7 KB 7.4 MB/s eta 0:00:00\n2022-03-21T23:11:53.3761842Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:11:53.5457622Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T23:11:57.4175080Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T23:11:57.5296815Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0105d4db093574f40\n2022-03-21T23:11:57.5531419Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:11:57.5564814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:11:57.5587712Z ##[error]Process completed with exit code 2.\n2022-03-21T23:11:57.5790311Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T23:11:57.5790832Z with:\n2022-03-21T23:11:57.5791104Z env:\n2022-03-21T23:11:57.5791358Z   IN_CI: 1\n2022-03-21T23:11:57.5791620Z   IS_GHA: 1\n2022-03-21T23:11:57.5791939Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T23:11:57.5792425Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T23:11:57.5792884Z ##[endgroup]\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu) (4/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T02:17:12.6257577Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T02:17:11.9280556Z   Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T02:17:11.9335199Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:11.9682045Z   Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T02:17:11.9850357Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0403171Z   Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T02:17:12.0468875Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T02:17:12.0590000Z   Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T02:17:12.0607093Z Installing collected packages: jmespath, urllib3, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T02:17:12.5273459Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T02:17:12.6032812Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-114\n2022-03-22T02:17:12.6257577Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T02:17:12.6259543Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T02:17:12.6291924Z ##[error]Process completed with exit code 2.\n2022-03-22T02:17:12.6387977Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T02:17:12.6388298Z with:\n2022-03-22T02:17:12.6388521Z   wait-ssh: false\n2022-03-22T02:17:12.6388727Z env:\n2022-03-22T02:17:12.6388932Z   IN_CI: 1\n2022-03-22T02:17:12.6389143Z   IS_GHA: 1\n2022-03-22T02:17:12.6389368Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T02:17:12.6389669Z   DOCKER_HOST: unix:///run/user/1121/docker.sock\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge) (5/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:19:24.4890693Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:19:24.0962005Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:19:24.3152253Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:19:24.3341183Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:19:24.3391374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:19:24.3436392Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:19:24.3448982Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:19:24.3474092Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:19:24.3502003Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:19:24.3655072Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:19:24.4799309Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0bc9250521f338cae\n2022-03-21T22:19:24.4890693Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:19:24.4903625Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:19:24.4913841Z ##[error]Process completed with exit code 2.\n2022-03-21T22:19:24.4957338Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:19:24.4957575Z with:\n2022-03-21T22:19:24.4957735Z env:\n2022-03-21T22:19:24.4957900Z   IN_CI: 1\n2022-03-21T22:19:24.4958055Z   IS_GHA: 1\n2022-03-21T22:19:24.4958246Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:19:24.4958437Z ##[endgroup]\n2022-03-21T22:19:24.4989649Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu) (6/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T01:05:07.6983899Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T01:05:06.8364546Z   Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl\n2022-03-22T01:05:06.8431763Z Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.8949391Z   Using cached https://files.pythonhosted.org/packages/ec/03/062e6444ce4baf1eac17a6a0ebfe36bb1ad05e1df0e20b110de59c278498/urllib3-1.26.9-py2.py3-none-any.whl\n2022-03-22T01:05:06.9180079Z Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:06.9803351Z   Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl\n2022-03-22T01:05:06.9882133Z Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12)\n2022-03-22T01:05:07.0067062Z   Using cached https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl\n2022-03-22T01:05:07.0088676Z Installing collected packages: urllib3, jmespath, six, python-dateutil, botocore, s3transfer, boto3\n2022-03-22T01:05:07.5819667Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.9\n2022-03-22T01:05:07.6774717Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 worker-rocm-amd-60\n2022-03-22T01:05:07.6983899Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T01:05:07.6988652Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T01:05:07.7023073Z ##[error]Process completed with exit code 2.\n2022-03-22T01:05:07.7102087Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T01:05:07.7102389Z with:\n2022-03-22T01:05:07.7102603Z   wait-ssh: false\n2022-03-22T01:05:07.7102820Z env:\n2022-03-22T01:05:07.7103015Z   IN_CI: 1\n2022-03-22T01:05:07.7103224Z   IS_GHA: 1\n2022-03-22T01:05:07.7103458Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T01:05:07.7103737Z   DOCKER_HOST: unix:///run/user/1502/docker.sock\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge) (7/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:51:39.3637996Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:51:39.2041249Z   Attempting uninstall: s3transfer\n2022-03-21T20:51:39.2043010Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T20:51:39.2083799Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T20:51:39.2089675Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:51:39.2480546Z   Attempting uninstall: boto3\n2022-03-21T20:51:39.2482953Z     Found existing installation: boto3 1.16.34\n2022-03-21T20:51:39.2584292Z     Uninstalling boto3-1.16.34:\n2022-03-21T20:51:39.2599474Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T20:51:39.3130921Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:51:39.3550598Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03ef7efc3078e3da5\n2022-03-21T20:51:39.3637996Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:51:39.3650651Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:51:39.3660484Z ##[error]Process completed with exit code 2.\n2022-03-21T20:51:39.3696465Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:51:39.3696693Z with:\n2022-03-21T20:51:39.3696850Z env:\n2022-03-21T20:51:39.3697012Z   IN_CI: 1\n2022-03-21T20:51:39.3697161Z   IS_GHA: 1\n2022-03-21T20:51:39.3697342Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:51:39.3697528Z ##[endgroup]\n2022-03-21T20:51:39.3730420Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge) (8/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:36.3916860Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:03:36.0096309Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:03:36.2278560Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:03:36.2461618Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:03:36.2513260Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:03:36.2541524Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:03:36.2554899Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:03:36.2598277Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:03:36.2758299Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:03:36.2780690Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:03:36.3825021Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0a4a552890e6ef7d3\n2022-03-21T21:03:36.3916860Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:03:36.3930343Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:03:36.3941263Z ##[error]Process completed with exit code 2.\n2022-03-21T21:03:36.3979258Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:03:36.3979496Z with:\n2022-03-21T21:03:36.3979654Z env:\n2022-03-21T21:03:36.3979814Z   IN_CI: 1\n2022-03-21T21:03:36.3979968Z   IS_GHA: 1\n2022-03-21T21:03:36.3980157Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:03:36.3980360Z ##[endgroup]\n2022-03-21T21:03:36.3996257Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu) (9/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:41:10.3015614Z   Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)\n2022-03-22T00:41:10.3625659Z      ---------------------------------------- 79.5/79.5 KB 1.1 MB/s eta 0:00:00\n2022-03-22T00:41:10.4120236Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-22T00:41:10.4170155Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-22T00:41:10.4722115Z      -------------------------------------- 247.7/247.7 KB 5.2 MB/s eta 0:00:00\n2022-03-22T00:41:10.4843512Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:41:10.6596108Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:41:10.8733354Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-22T00:41:15.3745408Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-22T00:41:15.4987162Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-09cacc848abc3dd32\n2022-03-22T00:41:15.5325784Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:41:15.5373630Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:41:15.5404353Z ##[error]Process completed with exit code 2.\n2022-03-22T00:41:15.5790508Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-22T00:41:15.5791192Z with:\n2022-03-22T00:41:15.5791530Z env:\n2022-03-22T00:41:15.5791849Z   IN_CI: 1\n2022-03-22T00:41:15.5792186Z   IS_GHA: 1\n2022-03-22T00:41:15.5792599Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T00:41:15.5793237Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-22T00:41:15.5793831Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge) (10/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:32.9799307Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:32.8167560Z   Attempting uninstall: s3transfer\n2022-03-21T20:50:32.8169351Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T20:50:32.8213295Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T20:50:32.8219209Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T20:50:32.8602320Z   Attempting uninstall: boto3\n2022-03-21T20:50:32.8603289Z     Found existing installation: boto3 1.16.34\n2022-03-21T20:50:32.8704535Z     Uninstalling boto3-1.16.34:\n2022-03-21T20:50:32.8719403Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T20:50:32.9244278Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T20:50:32.9710449Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0c568461a276d4a71\n2022-03-21T20:50:32.9799307Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:32.9812238Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:32.9823052Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:32.9859290Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:32.9859527Z with:\n2022-03-21T20:50:32.9859664Z env:\n2022-03-21T20:50:32.9859817Z   IN_CI: 1\n2022-03-21T20:50:32.9859977Z   IS_GHA: 1\n2022-03-21T20:50:32.9860144Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:32.9860327Z ##[endgroup]\n2022-03-21T20:50:32.9893642Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge) (11/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7163042Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.6660824Z     #10 0x55fc8a3ea801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.6661768Z     #11 0x55fc8a3f57a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.6662455Z     #12 0x55fc8a3f580b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.6663570Z     #13 0x55fc8a3f5908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.6663952Z     #14 0x55fc8a3f5908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.6664431Z     #15 0x55fc8a3f5908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.6665304Z     #16 0x55fc8a3f5ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7162113Z     #17 0x7f940d00f83f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7162534Z     #18 0x55fc8a39a554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7162711Z \n2022-03-21T21:05:00.7163042Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.7334595Z + retcode=1\n2022-03-21T21:05:00.7334954Z + set -e\n2022-03-21T21:05:00.7335215Z + return 1\n2022-03-21T21:05:00.7338688Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.7339232Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.7340113Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.7340612Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.7341187Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.7341668Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.7344466Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge) (12/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:06:03.4437430Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:06:03.0752199Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:06:03.2853252Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:06:03.3032326Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:06:03.3081589Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:06:03.3093911Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:06:03.3120244Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:06:03.3162406Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:06:03.3188431Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:06:03.3337181Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:06:03.4348072Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ee48c8811fafc444\n2022-03-21T22:06:03.4437430Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:06:03.4450920Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:06:03.4461263Z ##[error]Process completed with exit code 2.\n2022-03-21T22:06:03.4502346Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:06:03.4502576Z with:\n2022-03-21T22:06:03.4502730Z env:\n2022-03-21T22:06:03.4502888Z   IN_CI: 1\n2022-03-21T22:06:03.4503038Z   IS_GHA: 1\n2022-03-21T22:06:03.4503302Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:06:03.4503492Z ##[endgroup]\n2022-03-21T22:06:03.4519156Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge) (13/29)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:50:13.2205634Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T20:50:12.8679322Z + python3 -m pip install boto3==1.19.12\n2022-03-21T20:50:13.0744228Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T20:50:13.0916284Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T20:50:13.0964264Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T20:50:13.1005656Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T20:50:13.1017299Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T20:50:13.1041042Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T20:50:13.1189450Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T20:50:13.1208751Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T20:50:13.2119445Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d02da60fd18c22f5\n2022-03-21T20:50:13.2205634Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T20:50:13.2217939Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T20:50:13.2220259Z ##[error]Process completed with exit code 2.\n2022-03-21T20:50:13.2248664Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T20:50:13.2249012Z with:\n2022-03-21T20:50:13.2249260Z env:\n2022-03-21T20:50:13.2249500Z   IN_CI: 1\n2022-03-21T20:50:13.2249738Z   IS_GHA: 1\n2022-03-21T20:50:13.2250025Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:50:13.2250329Z ##[endgroup]\n2022-03-21T20:50:13.2272735Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) (14/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:47:38.0451999Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:47:37.5554508Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:47:37.8411473Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:47:37.8631484Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:47:37.8699561Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:47:37.8737037Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:47:37.8754443Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:47:37.8814393Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:47:37.8849540Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:47:37.9059579Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:47:38.0336298Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0b44f47f4292089a2\n2022-03-21T23:47:38.0451999Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:47:38.0469471Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:47:38.0484106Z ##[error]Process completed with exit code 2.\n2022-03-21T23:47:38.0532678Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:47:38.0533007Z with:\n2022-03-21T23:47:38.0533223Z env:\n2022-03-21T23:47:38.0533440Z   IN_CI: 1\n2022-03-21T23:47:38.0533649Z   IS_GHA: 1\n2022-03-21T23:47:38.0533902Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T23:47:38.0534170Z   GPU_FLAG: --gpus all\n2022-03-21T23:47:38.0534401Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge) (15/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:04:59.3115800Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:04:59.2595213Z     #10 0x55a7f39a4801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:04:59.2595707Z     #11 0x55a7f39af7a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:04:59.2597203Z     #12 0x55a7f39af80b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:04:59.2598205Z     #13 0x55a7f39af908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:04:59.2598697Z     #14 0x55a7f39af908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:04:59.2599178Z     #15 0x55a7f39af908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:04:59.2599747Z     #16 0x55a7f39afccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:04:59.3114751Z     #17 0x7f3b3822383f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:04:59.3115277Z     #18 0x55a7f3954554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:04:59.3115468Z \n2022-03-21T21:04:59.3115800Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:04:59.3292385Z + retcode=1\n2022-03-21T21:04:59.3292781Z + set -e\n2022-03-21T21:04:59.3293062Z + return 1\n2022-03-21T21:04:59.3295462Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:04:59.3295802Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:04:59.3296394Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:04:59.3296700Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:04:59.3297055Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:04:59.3297416Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:04:59.3299623Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge) (16/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:14:25.5525714Z Collecting jmespath<1.0.0,>=0.7.1\n2022-03-21T22:14:25.5568155Z   Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)\n2022-03-21T22:14:25.5952617Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:14:25.6169392Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:14:25.6629996Z      -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:14:25.6710247Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:14:25.8284354Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:14:25.9816751Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:14:31.6672236Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:14:31.7630473Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0ed0915ecee5d2424\n2022-03-21T22:14:31.7846086Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:14:31.7876742Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:14:31.7897140Z ##[error]Process completed with exit code 2.\n2022-03-21T22:14:31.8195621Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:14:31.8196110Z with:\n2022-03-21T22:14:31.8196356Z env:\n2022-03-21T22:14:31.8196614Z   IN_CI: 1\n2022-03-21T22:14:31.8196876Z   IS_GHA: 1\n2022-03-21T22:14:31.8197169Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:14:31.8197652Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:14:31.8198093Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge) (17/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:19:15.8845728Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:19:15.5116060Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:19:15.7231476Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:19:15.7409711Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:19:15.7458478Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:19:15.7470508Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:19:15.7496799Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:19:15.7538362Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:19:15.7566161Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:19:15.7711630Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:19:15.8753543Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0e2b3b4ddb246ff2a\n2022-03-21T21:19:15.8845728Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:19:15.8859814Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:19:15.8870165Z ##[error]Process completed with exit code 2.\n2022-03-21T21:19:15.8917039Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:19:15.8917279Z with:\n2022-03-21T21:19:15.8917433Z env:\n2022-03-21T21:19:15.8917586Z   IN_CI: 1\n2022-03-21T21:19:15.8917734Z   IS_GHA: 1\n2022-03-21T21:19:15.8917917Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:19:15.8918102Z ##[endgroup]\n2022-03-21T21:19:15.8934572Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu) (18/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T23:19:48.5900162Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T23:19:48.0742254Z + python3 -m pip install boto3==1.19.12\n2022-03-21T23:19:48.3742563Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T23:19:48.3976536Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T23:19:48.4048700Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T23:19:48.4065374Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T23:19:48.4128076Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T23:19:48.4164273Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T23:19:48.4202610Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T23:19:48.4416723Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T23:19:48.5773033Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-07ab7a3c4a5402af2\n2022-03-21T23:19:48.5900162Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T23:19:48.5919822Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T23:19:48.5936087Z ##[error]Process completed with exit code 2.\n2022-03-21T23:19:48.6007930Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T23:19:48.6008268Z with:\n2022-03-21T23:19:48.6008483Z env:\n2022-03-21T23:19:48.6008701Z   IN_CI: 1\n2022-03-21T23:19:48.6008920Z   IS_GHA: 1\n2022-03-21T23:19:48.6009170Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T23:19:48.6009440Z   GPU_FLAG: --gpus all\n2022-03-21T23:19:48.6009671Z ##[endgroup]\n\n\n pull / win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu) (19/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:53:59.0889659Z   Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T22:53:59.6881416Z      ---------------------------------------- 8.1/8.1 MB 14.0 MB/s eta 0:00:00\n2022-03-21T22:53:59.7427779Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:53:59.7691882Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T22:53:59.7779847Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T22:53:59.8281663Z      -------------------------------------- 247.7/247.7 KB 5.1 MB/s eta 0:00:00\n2022-03-21T22:54:00.0185115Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:54:00.2359770Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T22:54:04.1208891Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T22:54:04.2505862Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-03b4fbe63be8ef4b0\n2022-03-21T22:54:04.2844259Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:54:04.2891082Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:54:04.2919900Z ##[error]Process completed with exit code 2.\n2022-03-21T22:54:04.3377901Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T22:54:04.3378575Z with:\n2022-03-21T22:54:04.3378930Z env:\n2022-03-21T22:54:04.3379275Z   IN_CI: 1\n2022-03-21T22:54:04.3379600Z   IS_GHA: 1\n2022-03-21T22:54:04.3380023Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:54:04.3380691Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T22:54:04.3381278Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge) (20/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:09:34.0074610Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:09:33.6365531Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:09:33.8475619Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:09:33.8655152Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:09:33.8704395Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:09:33.8716774Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:09:33.8760145Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:09:33.8785000Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:09:33.8811316Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:09:33.8960134Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:09:33.9984866Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d325eb9fd156146f\n2022-03-21T22:09:34.0074610Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:09:34.0087465Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:09:34.0101743Z ##[error]Process completed with exit code 2.\n2022-03-21T22:09:34.0154014Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:09:34.0154246Z with:\n2022-03-21T22:09:34.0154412Z env:\n2022-03-21T22:09:34.0154574Z   IN_CI: 1\n2022-03-21T22:09:34.0154728Z   IS_GHA: 1\n2022-03-21T22:09:34.0154917Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:09:34.0155112Z ##[endgroup]\n2022-03-21T22:09:34.0191047Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge) (21/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:03:17.8502655Z [E request_callbac...yUniqueId(created_on=0, local_id=0) to be created.\n\n2022-03-21T21:03:14.4669960Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpxgdsmeer\n2022-03-21T21:03:14.4671407Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpxgdsmeer/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.4973023Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp1i2hfmpc\n2022-03-21T21:03:14.4973800Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp1i2hfmpc/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.5532339Z INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpgx4da7b0\n2022-03-21T21:03:14.5533064Z INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpgx4da7b0/_remote_module_non_sriptable.py\n2022-03-21T21:03:14.7050673Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 0\n2022-03-21T21:03:14.7097127Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 3\n2022-03-21T21:03:14.7398339Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 2\n2022-03-21T21:03:14.7922283Z INFO:torch.testing._internal.common_distributed:Starting event listener thread for rank 1\n2022-03-21T21:03:17.8502655Z [E request_callback_no_python.cpp:559] Received error while processing request type 261: false INTERNAL ASSERT FAILED at \"/var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp\":387, please report a bug to PyTorch. Expected OwnerRRef with id GloballyUniqueId(created_on=0, local_id=0) to be created.\n2022-03-21T21:03:17.8503603Z Exception raised from getOwnerRRef at /var/lib/jenkins/workspace/torch/csrc/distributed/rpc/rref_context.cpp:387 (most recent call first):\n2022-03-21T21:03:17.8504385Z frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x69 (0x7f180df19e19 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505131Z frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xd2 (0x7f180df160e2 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8505927Z frame #2: c10::detail::torchInternalAssertFail(char const*, char const*, unsigned int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0x4e (0x7f180df17a7e in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)\n2022-03-21T21:03:17.8506674Z frame #3: torch::distributed::rpc::RRefContext::getOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, bool) + 0x4b4 (0x7f18118b7b64 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8507642Z frame #4: torch::distributed::rpc::RequestCallbackNoPython::assignOwnerRRef(torch::distributed::rpc::GloballyUniqueId const&, torch::distributed::rpc::GloballyUniqueId const&, c10::intrusive_ptr<c10::ivalue::Future, c10::detail::intrusive_target_default_null_type<c10::ivalue::Future> >) const + 0x70 (0x7f18118a7bf0 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8508613Z frame #5: torch::distributed::rpc::RequestCallbackImpl::processPythonRemoteCall(torch::distributed::rpc::RpcCommandBase&, std::vector<c10::Stream, std::allocator<c10::Stream> >) const + 0xc8 (0x7f1819736208 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8509749Z frame #6: torch::distributed::rpc::RequestCallbackNoPython::processRpc(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector<c10::Stream, std::allocator<c10::Stream> >) const + 0x194 (0x7f18118ac914 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n2022-03-21T21:03:17.8510708Z frame #7: torch::distributed::rpc::RequestCallbackImpl::processRpcWithErrors(torch::distributed::rpc::RpcCommandBase&, torch::distributed::rpc::MessageType const&, std::vector<c10::Stream, std::allocator<c10::Stream> >) const + 0x65 (0x7f1819735865 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)\n2022-03-21T21:03:17.8511369Z frame #8: <unknown function> + 0x375249a (0x7f18118a949a in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test (22/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m  echo \"ERR...t available for the merge-base of your branch\"\ufffd[0m\n\n2022-03-21T20:01:07.7012399Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7012634Z \ufffd[36;1m# Covers the case where a previous tag doesn't exist for the tree\ufffd[0m\n2022-03-21T20:01:07.7012992Z \ufffd[36;1m# this is only really applicable on trees that don't have `.circleci/docker` at its merge base, i.e. nightly\ufffd[0m\n2022-03-21T20:01:07.7013373Z \ufffd[36;1mif ! git rev-parse \"$MERGE_BASE:.circleci/docker\"; then\ufffd[0m\n2022-03-21T20:01:07.7013784Z \ufffd[36;1m  echo \"Directory '.circleci/docker' not found in commit $MERGE_BASE, you should probably rebase onto a more recent commit\"\ufffd[0m\n2022-03-21T20:01:07.7014149Z \ufffd[36;1m  exit 1\ufffd[0m\n2022-03-21T20:01:07.7014325Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7014573Z \ufffd[36;1mPREVIOUS_DOCKER_TAG=$(git rev-parse \"$MERGE_BASE:.circleci/docker\")\ufffd[0m\n2022-03-21T20:01:07.7014907Z \ufffd[36;1m# If no image exists but the hash is the same as the previous hash then we should error out here\ufffd[0m\n2022-03-21T20:01:07.7015231Z \ufffd[36;1mif [[ \"${PREVIOUS_DOCKER_TAG}\" = \"${DOCKER_TAG}\" ]]; then\ufffd[0m\n2022-03-21T20:01:07.7015580Z \ufffd[36;1m  echo \"ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch\"\ufffd[0m\n2022-03-21T20:01:07.7015931Z \ufffd[36;1m  echo \"       contact the PyTorch team to restore the original images\"\ufffd[0m\n2022-03-21T20:01:07.7016225Z \ufffd[36;1m  exit 1\ufffd[0m\n2022-03-21T20:01:07.7016400Z \ufffd[36;1mfi\ufffd[0m\n2022-03-21T20:01:07.7016608Z \ufffd[36;1mecho ::set-output name=rebuild::yes\ufffd[0m\n2022-03-21T20:01:07.7027605Z shell: /usr/bin/bash --noprofile --norc -e -o pipefail {0}\n2022-03-21T20:01:07.7027837Z env:\n2022-03-21T20:01:07.7028006Z   IN_CI: 1\n2022-03-21T20:01:07.7028159Z   IS_GHA: 1\n2022-03-21T20:01:07.7028346Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T20:01:07.7028589Z   BASE_REVISION: 6643522db9ff595f564b8081de58b3a33c546178\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu) (23/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-22T00:49:54.2949572Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-22T00:49:53.8049151Z + python3 -m pip install boto3==1.19.12\n2022-03-22T00:49:54.0981629Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-22T00:49:54.1207562Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-22T00:49:54.1277146Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-22T00:49:54.1315027Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-22T00:49:54.1331813Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-22T00:49:54.1391622Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-22T00:49:54.1609217Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-22T00:49:54.1637417Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-22T00:49:54.2830197Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0f7c32fe13be12fea\n2022-03-22T00:49:54.2949572Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-22T00:49:54.2966933Z + GHA_WORKFLOW_JOB_ID=\n2022-03-22T00:49:54.2982588Z ##[error]Process completed with exit code 2.\n2022-03-22T00:49:54.3031464Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-22T00:49:54.3031794Z with:\n2022-03-22T00:49:54.3032012Z env:\n2022-03-22T00:49:54.3032227Z   IN_CI: 1\n2022-03-22T00:49:54.3032434Z   IS_GHA: 1\n2022-03-22T00:49:54.3032681Z   GIT_DEFAULT_BRANCH: master\n2022-03-22T00:49:54.3033084Z   GPU_FLAG: --gpus all\n2022-03-22T00:49:54.3033312Z ##[endgroup]\n\n\n pull / win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge) (24/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:56:07.3365589Z   Downloading botocore-1.22.12-py3-none-any.whl (8.1 MB)\n2022-03-21T21:56:07.7926584Z      ---------------------------------------- 8.1/8.1 MB 17.3 MB/s eta 0:00:00\n2022-03-21T21:56:07.9319362Z Collecting python-dateutil<3.0.0,>=2.1\n2022-03-21T21:56:07.9366132Z   Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)\n2022-03-21T21:56:08.0077590Z      -------------------------------------- 247.7/247.7 KB 3.0 MB/s eta 0:00:00\n2022-03-21T21:56:08.0164070Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:56:08.1775537Z Requirement already satisfied: six>=1.5 in c:\\actions-runner\\_work\\_tool\\python\\3.10.3\\x64\\lib\\site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:56:08.3393469Z Installing collected packages: python-dateutil, jmespath, botocore, s3transfer, boto3\n2022-03-21T21:56:12.4576766Z Successfully installed boto3-1.19.12 botocore-1.22.12 jmespath-0.10.0 python-dateutil-2.8.2 s3transfer-0.5.2\n2022-03-21T21:56:12.5641959Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0afad69838118af0e\n2022-03-21T21:56:12.5872636Z C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\\python3.exe: can't open file 'C:\\\\actions-runner\\\\_work\\\\pytorch\\\\pytorch\\\\.github\\\\scripts\\\\get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:56:12.5905611Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:56:12.5927729Z ##[error]Process completed with exit code 2.\n2022-03-21T21:56:12.6239531Z ##[group]Run pytorch/pytorch/.github/actions/teardown-win@master\n2022-03-21T21:56:12.6240039Z with:\n2022-03-21T21:56:12.6240299Z env:\n2022-03-21T21:56:12.6240557Z   IN_CI: 1\n2022-03-21T21:56:12.6240805Z   IS_GHA: 1\n2022-03-21T21:56:12.6241118Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:56:12.6241613Z   pythonLocation: C:\\actions-runner\\_work\\_tool\\Python\\3.10.3\\x64\n2022-03-21T21:56:12.6242052Z ##[endgroup]\n\n\n pull / linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge) (25/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:46:39.5474616Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:46:39.1884210Z + python3 -m pip install boto3==1.19.12\n2022-03-21T21:46:39.3928976Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T21:46:39.4105069Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T21:46:39.4152571Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T21:46:39.4194931Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T21:46:39.4218947Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T21:46:39.4230812Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T21:46:39.4380089Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T21:46:39.4399461Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T21:46:39.5387703Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0888bed1149cca415\n2022-03-21T21:46:39.5474616Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:46:39.5487145Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:46:39.5497480Z ##[error]Process completed with exit code 2.\n2022-03-21T21:46:39.5541319Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:46:39.5541544Z with:\n2022-03-21T21:46:39.5541698Z env:\n2022-03-21T21:46:39.5541851Z   IN_CI: 1\n2022-03-21T21:46:39.5541997Z   IS_GHA: 1\n2022-03-21T21:46:39.5542176Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:46:39.5542361Z ##[endgroup]\n2022-03-21T21:46:39.5557878Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge) (26/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:34:57.0623859Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:34:56.9039884Z   Attempting uninstall: s3transfer\n2022-03-21T21:34:56.9041446Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T21:34:56.9090783Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T21:34:56.9095968Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:34:56.9453014Z   Attempting uninstall: boto3\n2022-03-21T21:34:56.9454356Z     Found existing installation: boto3 1.16.34\n2022-03-21T21:34:56.9564320Z     Uninstalling boto3-1.16.34:\n2022-03-21T21:34:56.9578035Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T21:34:57.0091363Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:34:57.0536230Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-034a3afd5d80b91fd\n2022-03-21T21:34:57.0623859Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:34:57.0637167Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:34:57.0647396Z ##[error]Process completed with exit code 2.\n2022-03-21T21:34:57.0688237Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:34:57.0688481Z with:\n2022-03-21T21:34:57.0688631Z env:\n2022-03-21T21:34:57.0688769Z   IN_CI: 1\n2022-03-21T21:34:57.0688930Z   IS_GHA: 1\n2022-03-21T21:34:57.0689109Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:34:57.0689462Z ##[endgroup]\n2022-03-21T21:34:57.0704768Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n pull / linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge) (27/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:05:00.7896545Z SUMMARY: Undefined.../jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in\n\n2022-03-21T21:05:00.7395504Z     #10 0x5597fd5a9801 in run_mod /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:1037\n2022-03-21T21:05:00.7396330Z     #11 0x5597fd5b47a9 in PyRun_StringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:961\n2022-03-21T21:05:00.7396688Z     #12 0x5597fd5b480b in PyRun_SimpleStringFlags /tmp/build/80754af9/python_1627392990942/work/Python/pythonrun.c:455\n2022-03-21T21:05:00.7398664Z     #13 0x5597fd5b4908 in pymain_run_command /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:420\n2022-03-21T21:05:00.7399177Z     #14 0x5597fd5b4908 in pymain_run_python /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:2907\n2022-03-21T21:05:00.7399663Z     #15 0x5597fd5b4908 in pymain_main /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3460\n2022-03-21T21:05:00.7399986Z     #16 0x5597fd5b4ccb in _Py_UnixMain /tmp/build/80754af9/python_1627392990942/work/Modules/main.c:3495\n2022-03-21T21:05:00.7895241Z     #17 0x7f0a5905983f in __libc_start_main /build/glibc-S7Ft5T/glibc-2.23/csu/../csu/libc-start.c:291\n2022-03-21T21:05:00.7895772Z     #18 0x5597fd559554 in _start (/opt/conda/bin/python3.7+0x1d7554)\n2022-03-21T21:05:00.7896033Z \n2022-03-21T21:05:00.7896545Z SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior /var/lib/jenkins/workspace/aten/src/ATen/Utils.cpp:20:3 in \n2022-03-21T21:05:00.8063448Z + retcode=1\n2022-03-21T21:05:00.8063787Z + set -e\n2022-03-21T21:05:00.8064058Z + return 1\n2022-03-21T21:05:00.8067638Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX-* ]]\n2022-03-21T21:05:00.8068127Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X ]]\n2022-03-21T21:05:00.8069018Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX2-* ]]\n2022-03-21T21:05:00.8069500Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\2 ]]\n2022-03-21T21:05:00.8070105Z + [[ linux-xenial-py3.7-clang7-asan-default == *-NO_AVX512-* ]]\n2022-03-21T21:05:00.8070580Z + [[ default == \\n\\o\\g\\p\\u\\_\\N\\O\\_\\A\\V\\X\\5\\1\\2 ]]\n2022-03-21T21:05:00.8072640Z + [[ linux-xenial-py3.7-clang7-asan-default == *tbb* ]]\n\n\n pull / linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu) (28/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T22:48:17.3384813Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T22:48:16.8599645Z + python3 -m pip install boto3==1.19.12\n2022-03-21T22:48:17.1464241Z Defaulting to user installation because normal site-packages is not writeable\n2022-03-21T22:48:17.1685222Z Requirement already satisfied: boto3==1.19.12 in /home/ec2-user/.local/lib/python3.7/site-packages (1.19.12)\n2022-03-21T22:48:17.1754164Z Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.10.0)\n2022-03-21T22:48:17.1771662Z Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (0.5.2)\n2022-03-21T22:48:17.1808722Z Requirement already satisfied: botocore<1.23.0,>=1.22.12 in /home/ec2-user/.local/lib/python3.7/site-packages (from boto3==1.19.12) (1.22.12)\n2022-03-21T22:48:17.1868636Z Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (2.8.2)\n2022-03-21T22:48:17.1903889Z Requirement already satisfied: urllib3<1.27,>=1.25.4 in /home/ec2-user/.local/lib/python3.7/site-packages (from botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.26.9)\n2022-03-21T22:48:17.2113746Z Requirement already satisfied: six>=1.5 in /home/ec2-user/.local/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.23.0,>=1.22.12->boto3==1.19.12) (1.16.0)\n2022-03-21T22:48:17.3267404Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-01fe178c405417375\n2022-03-21T22:48:17.3384813Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T22:48:17.3402286Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T22:48:17.3418376Z ##[error]Process completed with exit code 2.\n2022-03-21T22:48:17.3470528Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T22:48:17.3470874Z with:\n2022-03-21T22:48:17.3471096Z env:\n2022-03-21T22:48:17.3471327Z   IN_CI: 1\n2022-03-21T22:48:17.3471538Z   IS_GHA: 1\n2022-03-21T22:48:17.3471802Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T22:48:17.3472083Z   GPU_FLAG: --gpus all\n2022-03-21T22:48:17.3472322Z ##[endgroup]\n\n\n pull / linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge) (29/29)\nStep: \"Upload test statistics\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-03-21T21:16:38.9646300Z python3: can't ope...ow_job_id.py': [Errno 2] No such file or directory\n\n2022-03-21T21:16:38.7995969Z   Attempting uninstall: s3transfer\n2022-03-21T21:16:38.7998039Z     Found existing installation: s3transfer 0.3.7\n2022-03-21T21:16:38.8066994Z     Uninstalling s3transfer-0.3.7:\n2022-03-21T21:16:38.8072844Z       Successfully uninstalled s3transfer-0.3.7\n2022-03-21T21:16:38.8449275Z   Attempting uninstall: boto3\n2022-03-21T21:16:38.8451430Z     Found existing installation: boto3 1.16.34\n2022-03-21T21:16:38.8559828Z     Uninstalling boto3-1.16.34:\n2022-03-21T21:16:38.8574290Z       Successfully uninstalled boto3-1.16.34\n2022-03-21T21:16:38.9100438Z Successfully installed boto3-1.19.12 botocore-1.22.12 s3transfer-0.5.2\n2022-03-21T21:16:38.9558098Z ++ python3 .github/scripts/get_workflow_job_id.py 2018440039 i-0d779c59d277d32ee\n2022-03-21T21:16:38.9646300Z python3: can't open file '.github/scripts/get_workflow_job_id.py': [Errno 2] No such file or directory\n2022-03-21T21:16:38.9658894Z + GHA_WORKFLOW_JOB_ID=\n2022-03-21T21:16:38.9673240Z ##[error]Process completed with exit code 2.\n2022-03-21T21:16:38.9720106Z ##[group]Run pytorch/pytorch/.github/actions/teardown-linux@master\n2022-03-21T21:16:38.9720333Z with:\n2022-03-21T21:16:38.9720485Z env:\n2022-03-21T21:16:38.9720645Z   IN_CI: 1\n2022-03-21T21:16:38.9720793Z   IS_GHA: 1\n2022-03-21T21:16:38.9720970Z   GIT_DEFAULT_BRANCH: master\n2022-03-21T21:16:38.9721151Z ##[endgroup]\n2022-03-21T21:16:38.9736762Z ##[group]Run # ignore expansion of \"docker ps -q\" since it could be empty\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2021-11-10T08:42:52Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 964902894
+              },
+              {
+                "bodyText": "@vitaly-fedyunin @gottbrath  FYI that this is the oneDNN Graph API integration. It depends on the #63748.",
+                "createdAt": "2021-11-16T16:36:52Z",
+                "author": {
+                  "login": "Jianhui-Li"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 970451860
+              },
+              {
+                "bodyText": "CI failures are currently being caused by some issues in the CI infra, and are also occurring with other PRs.",
+                "createdAt": "2021-12-10T05:59:17Z",
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 990641309
+              },
+              {
+                "bodyText": "CI failures are unrelated.",
+                "createdAt": "2021-12-10T20:44:09Z",
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 991281407
+              },
+              {
+                "bodyText": "The CI failure is unrelated.",
+                "createdAt": "2021-12-16T02:45:59Z",
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 995389295
+              },
+              {
+                "bodyText": "Hi, thank you for the PR!\nDo you mind running a larger amount of torchbench and reporting numbers ? You can look at Jason's post here for what models are supported in script. Initially just the vision models would be useful. @Krovatkin also did some benchmarking of a traced Bert model and found on average a ~16% speedup with this PR.",
+                "createdAt": "2022-01-18T18:22:34Z",
+                "author": {
+                  "login": "eellison"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1015689390
+              },
+              {
+                "bodyText": "Thanks a lot for reviewing, @eellison & @Krovatkin!\nWe just wanted to let you know that we're working on the benchmarking & will get back to you in a day, or two.\nUPDATE (Jan 21): While running some TorchBench models, we discovered some composability issues, and are working to ensure that oneDNN Graph would complement PyTorch's existing fusion capabilities, not hinder them.\nUPDATE (Jan 24): We've resolved the issues & will update this PR later today. Thanks!",
+                "createdAt": "2022-01-20T00:31:01Z",
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "sanchitintel"
+                },
+                "databaseId": 1016996190
+              },
+              {
+                "bodyText": "Hello @eellison,\nWe used this TorchBench branch for comparison. compare_llga.sh can be run for comparison.\nFor benchmarking mobilenet_v3_large with hardswish support in oneDNN Graph, this oneDNN Graph branch can be used in third_party/ideep/mkl-dnn. It delivers a speedup over PyTorch JIT (NNC + OFI) because 21 additional reorders are prevented (the major factor here), and fusion with conv also helps further.\nThe next release of oneDNN Graph would have hardswish support.\nWe're also exploring adding a hardsigmoid op in oneDNN Graph.\nThank you!",
+                "createdAt": "2022-01-26T23:51:38Z",
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "sanchitintel"
+                },
+                "databaseId": 1022709513
+              },
+              {
+                "bodyText": "Please note that this PR should be merged after #71546, as #71546 changes the  third_party/ideep commit (this PR also uses that ideep commit, but it'd probably be better to merge #71546 first, so that oneDNN v2.5.2 upgrade would be in a separate PR). Thank you!",
+                "createdAt": "2022-01-31T23:57:21Z",
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1026330085
+              },
+              {
+                "bodyText": "@sanchitintel mind rebasing and i'll land ?",
+                "createdAt": "2022-03-01T20:07:57Z",
+                "author": {
+                  "login": "eellison"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1055813984
+              },
+              {
+                "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-03-02T17:44:47Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1057203495
+              },
+              {
+                "bodyText": "Thanks a lot for taking a look, @eellison! To fix this error, we would enable Bazel build for oneDNN Graph.",
+                "createdAt": "2022-03-07T23:03:45Z",
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "sanchitintel"
+                },
+                "databaseId": 1061230087
+              },
+              {
+                "bodyText": "@eellison has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-03-09T19:24:13Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1063276600
+              },
+              {
+                "bodyText": "@malfet has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-03-21T19:59:41Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074355779
+              },
+              {
+                "bodyText": "And graph_rewriter.cpp is full of DOS newlines...",
+                "createdAt": "2022-03-21T20:53:40Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1074407452
+              },
+              {
+                "bodyText": "Hey @chunyuan-w.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-03-21T22:12:51Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1074471758
+              },
+              {
+                "bodyText": "Thanks a ton for your help, @malfet & @eellison! :)\nWe'll incorporate your suggestions in subsequent PR(s).",
+                "createdAt": "2022-03-21T22:41:25Z",
+                "author": {
+                  "login": "sanchitintel"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "sanchitintel"
+                },
+                "databaseId": 1074492365
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOOYM_0Q==",
+              "hasPreviousPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=73969 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "malfet"
+          },
+          "title": "Dummy change",
+          "body": "Test Plan: None at all\n\nDifferential Revision: D34753911\n\n",
+          "headRefName": "export-D34753911",
+          "headRepository": {
+            "nameWithOwner": "malfet/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "4746da707a9912356f5179625da89616b228dc21"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            },
+            "totalCount": 1
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-vulkan-bionic-py3.7-clang9"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794078044"
+                              },
+                              {
+                                "name": "test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280134/jobs/2794189060"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRQMQ=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592963"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280135/jobs/2794078023"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aM=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592965"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-bionic-rocm4.5-py3.7"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794078060"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292071"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292205"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280132/jobs/2794292306"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbTiXw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592966"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-QY="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cuda11.3-py3"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794078053"
+                              },
+                              {
+                                "name": "test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536907"
+                              },
+                              {
+                                "name": "test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794536998"
+                              },
+                              {
+                                "name": "test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280139/jobs/2794537089"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbY_vU=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592967"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qc="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280136/jobs/2794078031"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2ao=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592969"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qk="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-docs"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794078055"
+                              },
+                              {
+                                "name": "build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183768"
+                              },
+                              {
+                                "name": "build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280138/jobs/2794183828"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRIt0=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592970"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qo="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794078017"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181109"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181305"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280140/jobs/2794181488"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbRFm4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592971"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Qs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-build"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280143/jobs/2794078025"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2aw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592974"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q4="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "shellcheck",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078028"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078196"
+                              },
+                              {
+                                "name": "clang-tidy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078407"
+                              },
+                              {
+                                "name": "clang-format",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078610"
+                              },
+                              {
+                                "name": "cmakelint",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078760"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078898"
+                              },
+                              {
+                                "name": "py2-setup-validate-errormsg",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794078999"
+                              },
+                              {
+                                "name": "flake8-py3",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079087"
+                              },
+                              {
+                                "name": "mypy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280145/jobs/2794079199"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO4Es=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592975"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-Q8="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1958280146/jobs/2794078040"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAUbO2b0=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/4746da707a9912356f5179625da89616b228dc21/checks?check_suite_id=5595592976"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAU2F-RA="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040614?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040643?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17040615?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2022-03-09T15:57:16Z",
+                  "oid": "4746da707a9912356f5179625da89616b228dc21"
+                }
+              }
+            ]
+          },
+          "changedFiles": 1,
+          "files": {
+            "nodes": [
+              {
+                "path": "tools/build_variables.bzl"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [],
+            "pageInfo": {
+              "startCursor": null,
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/malfet/pytorch/blob/4746da707a9912356f5179625da89616b228dc21/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\nAdd ciflow labels to this PR to trigger more builds:\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nlinux-binary-libtorch-cxx11-abi\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-libtorch-pre-cxx11\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-binary-manywheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/trunk\n\u2705 triggered\n\n\nlinux-bionic-rocm4.5-py3.7\nciflow/all, ciflow/default, ciflow/linux, ciflow/rocm, ciflow/trunk\n\u2705 triggered\n\n\nlinux-docs\nciflow/all, ciflow/cpu, ciflow/default, ciflow/docs, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-vulkan-bionic-py3.7-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk, ciflow/vulkan\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-build\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3-clang5-mobile-custom-build-static\nciflow/all, ciflow/default, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-clang7-onnx\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/onnx, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build\nciflow/all, ciflow/cpu, ciflow/default, ciflow/libtorch, ciflow/linux, ciflow/mobile, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nlinux-xenial-py3.7-gcc7-no-ops\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nmacos-arm64-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-arm64-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-cxx11-abi\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-libtorch-pre-cxx11\nciflow/binaries, ciflow/binaries_libtorch, ciflow/default\n\u2705 triggered\n\n\nmacos-binary-wheel\nciflow/binaries, ciflow/binaries_wheel, ciflow/default\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit\nciflow/all, ciflow/android, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/trunk\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/trunk, ciflow/win\n\u2705 triggered\n\n\nwindows-binary-conda\nciflow/binaries, ciflow/binaries_conda, ciflow/default\n\u2705 triggered\n\n\nwindows-binary-libtorch-debug\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-libtorch-release\nciflow/all, ciflow/binaries, ciflow/binaries_libtorch, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nwindows-binary-wheel\nciflow/all, ciflow/binaries, ciflow/binaries_wheel, ciflow/default, ciflow/trunk\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\ncaffe2-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\ndocker-builds\nciflow/all, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-custom-ops\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-arm64-metal\nciflow/all, ciflow/ios, ciflow/macos, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nios-12-5-1-x86-64-coreml\nciflow/all, ciflow/ios, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda10.2-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow, ciflow/trunk\n\ud83d\udeab skipped\n\n\nlinux-docs-push\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda11.3-py3.7-gcc7-no-ops\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-arm64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-10-15-py3-lite-interpreter-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nmacos-11-py3-x86-64\nciflow/all, ciflow/macos, ciflow/trunk\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.7-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-bionic-cuda11.5-py3.7-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled, ciflow/slow, ciflow/slow-gradcheck\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.3-py3.7-gcc7-debug\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.5-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npytorch-linux-xenial-py3-clang5-android-ndk-r19c-build\nciflow/all, ciflow/android, ciflow/cpu, ciflow/linux, ciflow/trunk\n\ud83d\udeab skipped\n\n\npytorch-xla-linux-bionic-py3.7-clang8\nciflow/all, ciflow/cpu, ciflow/linux, ciflow/trunk, ciflow/xla\n\ud83d\udeab skipped",
+                "createdAt": "2022-03-09T15:57:11Z",
+                "author": {
+                  "login": "pytorch-bot"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1063079053
+              },
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/73969\n\ud83d\udcc4 \u00a0Preview docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 4746da7 (more details on the Dr. CI page):\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-03-09T15:57:12Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1063079113
+              },
+              {
+                "bodyText": "This pull request was exported from Phabricator. Differential Revision: D34753911",
+                "createdAt": "2022-03-09T15:57:34Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1063079731
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOP11MjQ==",
+              "hasPreviousPage": false
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "fb-exported"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=73099 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "BowenBao"
+          },
+          "title": "[ONNX] Make graph name spec-compliant (#71961)",
+          "body": "Stack from [ghstack](https://github.com/ezyang/ghstack):\n* #73104\n* #73103\n* #73102\n* #73101\n* #73100\n* __->__ #73099\n\n[According to the ONNX spec](https://github.com/onnx/onnx/blob/main/docs/IR.md#names-within-a-graph),\nall names must adhere to C90 identifier syntax rules, which means no\ndashes.\n\nFixes: #30952",
+          "headRefName": "gh/BowenBao/138/head",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "gh/BowenBao/138/base",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "BowenBao"
+                    },
+                    "email": "bowbao@microsoft.com",
+                    "name": "BowenBao"
+                  },
+                  "oid": "3038b939eb2069653305c419326a0f47d2598e39"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            },
+            "totalCount": 1
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041786/jobs/2626264278"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNn9o=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189561"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7k="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626264385"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417658"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417743"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041785/jobs/2626417885"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkRE_E=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189562"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7o="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7-no-ops"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041789/jobs/2626264416"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJE=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189563"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7s="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-build"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041787/jobs/2626264407"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoIY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189564"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS7w="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041788/jobs/2626264422"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoJs=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189566"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS74="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-bionic-py3.7-clang9"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626264414"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349405"
+                              },
+                              {
+                                "name": "test (noarch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349522"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041790/jobs/2626349618"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiwA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189567"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS78="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-vulkan-bionic-py3.7-clang9"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626264431"
+                              },
+                              {
+                                "name": "test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041793/jobs/2626359364"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPxgQ=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189568"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8A="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3-clang5-mobile-custom-build-static"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041792/jobs/2626264427"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkNoKA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189570"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8I="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "win-vs2019-cpu-py3"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626264386"
+                              },
+                              {
+                                "name": "test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722677"
+                              },
+                              {
+                                "name": "test (default, 2, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041791/jobs/2626722710"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkX070=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189571"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8M="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-xenial-py3.7-gcc7"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626264401"
+                              },
+                              {
+                                "name": "test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349045"
+                              },
+                              {
+                                "name": "test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349141"
+                              },
+                              {
+                                "name": "test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/1866041803/jobs/2626349272"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAATkPiQA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/3038b939eb2069653305c419326a0f47d2598e39/checks?check_suite_id=5365189572"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAT_KS8Q="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": {
+                    "contexts": [
+                      {
+                        "context": "ci/circleci: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010288?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010289?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010488?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      },
+                      {
+                        "context": "ci/circleci: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build",
+                        "state": "SUCCESS",
+                        "targetUrl": "https://circleci.com/gh/pytorch/pytorch/17010326?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link"
+                      }
+                    ]
+                  },
+                  "pushedDate": "2022-02-18T18:46:28Z",
+                  "oid": "3038b939eb2069653305c419326a0f47d2598e39"
+                }
+              }
+            ]
+          },
+          "changedFiles": 162,
+          "files": {
+            "nodes": [
+              {
+                "path": "test/onnx/expect/TestOperators.test_acos.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_left_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_size1_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_size1_right_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_add_size1_singleton_broadcast.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_addconstant.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_addmm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_arange_dynamic.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_argmax.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_asin.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_at_op.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_atan.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_aten_embedding_1.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_aten_embedding_2.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_avg_pool2d.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_baddbmm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_basic.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_batchnorm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_batchnorm_1d.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_batchnorm_noaffine.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_batchnorm_onnx_irv4.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_batchnorm_training.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_bitshift.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_c2_op.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_chunk.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_clip.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_clip_max.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_clip_min.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_concat2.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_conv.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_conv_onnx_irv4_opset8.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_convtranspose.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_cos.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_cumsum.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_det.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dict.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dict_str.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dim.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout_default.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout_opset12.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout_training.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dropout_training_opset12.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_add_inputs_same_symbolic_shape.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_matmul.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_reduce_mean.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_dynamic_axes_unchange.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_elu.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_embedding_bags.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_empty_like.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_empty_like_opset7.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_equal.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_erf.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_exp.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_expand.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_flatten.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_flatten2D.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_fmod.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_frobenius_norm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_full.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_full_like.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_gather.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_gather_opset11.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_ge.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_gelu.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_gt.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_hardtanh.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_implicit_expand.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_index.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_isnan.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_layer_norm_aten.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_le.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_linear.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_log_sigmoid.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_logsoftmax.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_lstm_none_sequence_lens.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_lt.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_master_opset.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_max.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_maxpool.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_maxpool_dilations.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_maxpool_indices.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_mean.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_mean_dtype.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_meshgrid.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_min.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_mm.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_narrow.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_ne.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_nonzero.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_norm_p1.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_norm_p2.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_ones_like.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_pad.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_params.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_params_onnx_irv4.expect"
+              },
+              {
+                "path": "test/onnx/expect/TestOperators.test_permute2.expect"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTAw",
+              "hasNextPage": true
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "garymm"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMi0xOFQxNzoxODo0NC0wODowMLkyMDIyLTAyLTE4VDE3OjE4OjQ0LTA4OjAwzjTr0H0=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet \n  \n    \n      pytorch/.github/scripts/trymerge.py\n    \n    \n         Line 63\n      in\n      932adf2\n    \n  \n  \n    \n\n        \n          \n                 files(last: 100) { \n        \n    \n  \n\n Can this be relaxed? If not please import.",
+                "createdAt": "2022-02-22T18:22:40Z",
+                "author": {
+                  "login": "BowenBao"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1048084569
+              },
+              {
+                "bodyText": "This PR cannot be merged by bot due to changing > 100 files. @malfet\nCan this be relaxed? If not please import.\n\nWow, you've hit a really interesting problem. 100 is a limitation enforced by GitHub, see https://docs.github.com/en/graphql/overview/resource-limitations, but I can implement a pagination. Do you mind keeping it like that for a bit, want to land a fix soonish.",
+                "createdAt": "2022-02-22T18:27:29Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1048088691
+              },
+              {
+                "bodyText": "@malfet Thank you for info. Sure, I have separated the rest of stack from this one, we'll wait for the fix to try again.",
+                "createdAt": "2022-02-22T18:29:48Z",
+                "author": {
+                  "login": "BowenBao"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1048090640
+              },
+              {
+                "bodyText": "@pytorchbot merge this",
+                "createdAt": "2022-02-24T21:42:36Z",
+                "author": {
+                  "login": "BowenBao"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1050293881
+              },
+              {
+                "bodyText": "Hey @BowenBao.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-02-24T21:44:39Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1050295451
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOPniAWQ==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "oncall: jit"
+                }
+              },
+              {
+                "node": {
+                  "name": "open source"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "release notes: onnx"
+                }
+              },
+              {
+                "node": {
+                  "name": "topic: bug fixes"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=74649 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": false,
+          "author": {
+            "login": "malfet"
+          },
+          "title": "This should fail flake8",
+          "body": "Test issue for GHF mandatory checks",
+          "headRefName": "malfet-patch-8",
+          "headRepository": {
+            "nameWithOwner": "pytorch/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "57c86ff1c5ab948888fd329986c9d55796680e33"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "malfet"
+                    },
+                    "email": "nshulga@fb.com",
+                    "name": "Nikita Shulga"
+                  },
+                  "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mg",
+              "hasNextPage": false
+            },
+            "totalCount": 2
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.intern.facebook.com/cla/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsK3w=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018129"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1E="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Netlify",
+                            "databaseId": 13473
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018131"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1M="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Azure Pipelines",
+                            "databaseId": 9426
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018132"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1Q="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Dependabot",
+                            "databaseId": 29110
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018134"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1Y="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Codecov",
+                            "databaseId": 254
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018139"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj1s="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "PyTorch Bot",
+                            "databaseId": 40112
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [],
+                            "pageInfo": {
+                              "endCursor": null,
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": null,
+                          "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018142"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlj14="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "clang-format",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925132"
+                              },
+                              {
+                                "name": "clang-tidy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925189"
+                              },
+                              {
+                                "name": "cmakelint",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925230"
+                              },
+                              {
+                                "name": "flake8-py3",
+                                "conclusion": "FAILURE",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925307"
+                              },
+                              {
+                                "name": "mypy",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925365"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925427"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925449"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925537"
+                              },
+                              {
+                                "name": "py2-setup-validate-errormsg",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925644"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925688"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925809"
+                              },
+                              {
+                                "name": "shellcheck",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576283/jobs/2928925945"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsMiY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "FAILURE",
+                          "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018384"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkFA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576288/jobs/2928925134"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHsLW0=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018395"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkFs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935743"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935775"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935850"
+                              },
+                              {
+                                "name": "linux-bionic-rocm4.5-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928935994"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936064"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936179"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936265"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936309"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936353"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936395"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936426"
+                              },
+                              {
+                                "name": "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936483"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936516"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936558"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936633"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936705"
+                              },
+                              {
+                                "name": "deploy-linux-xenial-cuda11.3-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936736"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936756"
+                              },
+                              {
+                                "name": "pytorch-xla-linux-bionic-py3.7-clang8",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936796"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928936823"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928990551"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928990588"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992832"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992868"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992932"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928992965"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993011"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993042"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993086"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-gcc5.4 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928993128"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995802"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995853"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (noarch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928995889"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928997626"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928999058"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2928999075"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 1, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012407"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 2, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012438"
+                              },
+                              {
+                                "name": "linux-xenial-py3.7-clang7-asan / test (default, 3, 3, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929012469"
+                              },
+                              {
+                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 1, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929034328"
+                              },
+                              {
+                                "name": "linux-bionic-rocm4.5-py3.7 / test (default, 2, 2, linux.rocm.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929034340"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929040801"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 1, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929045939"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (default, 2, 2, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929046016"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7 / test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929046063"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 1, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929082254"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929082275"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 1, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157614"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157635"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.3-py3 / test (force_on_cpu, 1, 1, windows.4xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2031576300/jobs/2929157656"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAVHxIT4=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4/checks?check_suite_id=5778018405"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAVhlkGU="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": false
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-03-24T00:42:33Z",
+                  "oid": "6c3c3de6a5c1183d9a08f3c54148bc0b5de11bb4"
+                }
+              }
+            ]
+          },
+          "changedFiles": 1,
+          "files": {
+            "nodes": [
+              {
+                "path": "torch/nn/cpp.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MQ",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "seemethere"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wMy0yM1QxNTo1MDo0NS0wNzowMLkyMDIyLTAzLTIzVDE1OjUwOjQ1LTA3OjAwzjbPEDg=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/74649\n\u21a9\ufe0f \u00a0[fb-only] Re-run with SSH instructions\nNeed help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 6c3c3de (more details on the Dr. CI page):\n\n\n1/1 failures introduced in this PR\n\n\n1 failure not recognized by patterns:\n\n\n\nJob\nStep\nAction\n\n\n\n\n Lint / flake8-py3\nFail if there were any warnings\n\ud83d\udd01 rerun\n\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-03-23T22:40:51Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1076891218
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQDAOUg==",
+              "hasPreviousPage": false
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=81fd873151c3cded18314e9e53bf54a93ffb0afa9c52fa2cbafb2ceab7df5e45 name=pytorch number=79694 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "closed": true,
+          "isCrossRepository": true,
+          "author": {
+            "login": "kshitij12345"
+          },
+          "title": "[complex] conv_transpose1d",
+          "body": "Reference: https://github.com/pytorch/pytorch/issues/71108",
+          "headRefName": "develop/complex/conv_transpose1d",
+          "headRepository": {
+            "nameWithOwner": "kshitij12345/pytorch"
+          },
+          "baseRefName": "master",
+          "baseRepository": {
+            "nameWithOwner": "pytorch/pytorch",
+            "isPrivate": false,
+            "defaultBranchRef": {
+              "name": "master"
+            }
+          },
+          "mergeCommit": null,
+          "commits_with_authors": {
+            "nodes": [
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "d1ea948e65ac6d31ad056287ab65d38ecc68b30d"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "b4ba1db9a3a71bd8c03158dcd1b68711360633d8"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "655a4220beae163bfe578f0318a130df01ec05d6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "Kshiteej K"
+                  },
+                  "oid": "8181716be7a8005eb13ad5c3f2e1279ed1c60aff"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "9e5ca3663e7471786eeebebfdf84aea5d761712f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "9c110f39bcdc4e56386b6f9c4e2c082c8940ade6"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "49315e79d0eee8008e2a74575c6fc0f6a9531ee4"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "728752480760226270c374a0acc08e28b9b133f3"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "ffe43399d6f60ef7844523a5f465c11d9a67062f"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "9672a2198472567bae4ac6f55d004f7e1fa8a9fa"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "48a0ebf32b895286f036b36c871f671dc867e400"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "52fbe80d5c8a94e03d816c0bd21fd82019dcd5ac"
+                }
+              },
+              {
+                "commit": {
+                  "author": {
+                    "user": {
+                      "login": "kshitij12345"
+                    },
+                    "email": "kshitijkalambarkar@gmail.com",
+                    "name": "kshitij12345"
+                  },
+                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce"
+                }
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "MTM",
+              "hasNextPage": false
+            },
+            "totalCount": 13
+          },
+          "commits": {
+            "nodes": [
+              {
+                "commit": {
+                  "checkSuites": {
+                    "edges": [
+                      {
+                        "node": {
+                          "app": {
+                            "name": "Facebook GitHub Tools",
+                            "databaseId": 12274
+                          },
+                          "workflowRun": null,
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "Facebook CLA Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://code.facebook.com/cla/"
+                              },
+                              {
+                                "name": "Meta Internal-Only Changes Check",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://opensource.facebook.com/"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdtq8Hc=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7929899098"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqFo="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "TorchBench CI (pytorch-linux-py3.7-cu102)"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "run-torchbench",
+                                "conclusion": "NEUTRAL",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393316/jobs/4628529923"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTEwk=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SKIPPED",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7929899387"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "Lint"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "lintrunner",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628529910"
+                              },
+                              {
+                                "name": "quick-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530162"
+                              },
+                              {
+                                "name": "Test collect_env (with_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530698"
+                              },
+                              {
+                                "name": "Test collect_env (without_torch)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530867"
+                              },
+                              {
+                                "name": "Test collect_env (older_python_version)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628530989"
+                              },
+                              {
+                                "name": "pr-sanity-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531151"
+                              },
+                              {
+                                "name": "workflow-checks",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531475"
+                              },
+                              {
+                                "name": "Test tools",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531753"
+                              },
+                              {
+                                "name": "toc",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393315/jobs/4628531853"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqTHFY=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7929899388"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqXw="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "pull"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531149"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531473"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531754"
+                              },
+                              {
+                                "name": "linux-jammy-cuda11.6-cudnn8-py3.8-clang12 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628531857"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-pch / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532179"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532543"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.3-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532694"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628532918"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533033"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-no-ops / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533181"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533420"
+                              },
+                              {
+                                "name": "linux-xenial-cuda11.3-py3.7-gcc7-bazel-test / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533630"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit / build-and-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533825"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-custom-build-static / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628533959"
+                              },
+                              {
+                                "name": "linux-xenial-py3-clang5-mobile-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534129"
+                              },
+                              {
+                                "name": "linux-bionic-py3_7-clang8-xla / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534256"
+                              },
+                              {
+                                "name": "linux-focal-rocm5.2-py3.7 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534388"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534571"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534714"
+                              },
+                              {
+                                "name": "win-vs2019-cuda11.6-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628534989"
+                              },
+                              {
+                                "name": "win-vs2019-cpu-py3 / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628535311"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639115"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639198"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (distributed, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639265"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639339"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (docs_test, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639395"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (jit_legacy, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639450"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-gcc7 / test (backwards_compat, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639509"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (cpp)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639572"
+                              },
+                              {
+                                "name": "linux-docs / build-docs (python)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628639635"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647047"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang10-onnx / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647119"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647215"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (default, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647277"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647348"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (crossref, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647432"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 1, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647522"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647641"
+                              },
+                              {
+                                "name": "linux-bionic-py3.7-clang9 / test (functorch, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628647762"
+                              },
+                              {
+                                "name": "linux-vulkan-bionic-py3.7-clang9 / test (default, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628653797"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 1, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679376"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 2, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679431"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 3, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679469"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 4, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679519"
+                              },
+                              {
+                                "name": "linux-focal-py3.7-clang7-asan / test (default, 5, 5, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628679594"
+                              },
+                              {
+                                "name": "linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.2xlarge)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628681226"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11_6-py3_10-gcc7-deploy / test (deploy, 1, 1, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628854932"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 1, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856434"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 2, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856501"
+                              },
+                              {
+                                "name": "linux-bionic-cuda11.6-py3.10-gcc7 / test (default, 3, 4, linux.4xlarge.nvidia.gpu)",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2907393329/jobs/4628856575"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdqZ2fA=",
+                              "hasNextPage": true
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7929899419"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdioqZs="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "windows-binary-libtorch-debug"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-debug-build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4634503587"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-debug-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351637/jobs/4635312938"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsbsmM=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953056"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuA="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "windows-binary-wheel"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "wheel-py3_7-cuda11_3-build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4634503571"
+                              },
+                              {
+                                "name": "wheel-py3_7-cuda11_3-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351640/jobs/4636146265"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsskcw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953059"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuM="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "windows-binary-libtorch-release"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-release-build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4634503570"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-release-test",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351643/jobs/4635003925"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsVbD8=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953061"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUSuU="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-binary-libtorch-cxx11-abi"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4634504079"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351698/jobs/4635072931"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW5Aw=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953185"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2E="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-binary-libtorch-pre-cxx11"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4634503897"
+                              },
+                              {
+                                "name": "libtorch-cpu-shared-with-deps-cxx11-abi-test / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351700/jobs/4635077148"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsW-jo=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953186"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2I="
+                      },
+                      {
+                        "node": {
+                          "app": {
+                            "name": "GitHub Actions",
+                            "databaseId": 15368
+                          },
+                          "workflowRun": {
+                            "workflow": {
+                              "name": "linux-binary-manywheel"
+                            }
+                          },
+                          "checkRuns": {
+                            "nodes": [
+                              {
+                                "name": "manywheel-py3_7-cuda10_2-build / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4634503896"
+                              },
+                              {
+                                "name": "manywheel-py3_7-cuda10_2-test / build",
+                                "conclusion": "SUCCESS",
+                                "detailsUrl": "https://github.com/pytorch/pytorch/actions/runs/2910351699/jobs/4635934290"
+                              }
+                            ],
+                            "pageInfo": {
+                              "endCursor": "Y3Vyc29yOnYyOpHPAAAAAdsoMEA=",
+                              "hasNextPage": false
+                            }
+                          },
+                          "conclusion": "SUCCESS",
+                          "url": "https://github.com/pytorch/pytorch/commit/2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce/checks?check_suite_id=7936953187"
+                        },
+                        "cursor": "Y3Vyc29yOnYyOpHPAAAAAdkUS2M="
+                      }
+                    ],
+                    "pageInfo": {
+                      "hasNextPage": true
+                    }
+                  },
+                  "status": null,
+                  "pushedDate": "2022-08-22T22:04:19Z",
+                  "oid": "2fd08f1c669bbb0f2e14ae40e76f9e0d3195f4ce"
+                }
+              }
+            ]
+          },
+          "changedFiles": 3,
+          "files": {
+            "nodes": [
+              {
+                "path": "aten/src/ATen/native/Convolution.cpp"
+              },
+              {
+                "path": "torch/testing/_internal/common_methods_invocations.py"
+              },
+              {
+                "path": "torch/testing/_internal/common_modules.py"
+              }
+            ],
+            "pageInfo": {
+              "endCursor": "Mw",
+              "hasNextPage": false
+            }
+          },
+          "reviews": {
+            "nodes": [
+              {
+                "author": {
+                  "login": "ngimel"
+                },
+                "state": "APPROVED"
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpO5MjAyMi0wNy0xOVQxMDowNzo1NC0wNzowMLkyMDIyLTA3LTE5VDEwOjA3OjU0LTA3OjAwzj43QcY=",
+              "hasPreviousPage": false
+            }
+          },
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "@pytorchbot merge -g\nAll is green internally!",
+                "createdAt": "2022-08-23T19:29:55Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224702749
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here.\nThe merge job was triggered with the green (-g) flag. This means that your change will be merged once all checks on your PR have passed (ETA: 0-4 Hours). If this is not the intended behavior, feel free to use some of the other merge options in the wiki.\nPlease reach out to the PyTorch DevX Team with feedback or questions!",
+                "createdAt": "2022-08-23T19:31:18Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224705564
+              },
+              {
+                "bodyText": "Thanks for looking into it \ud83d\ude42 @albanD @jeanschmidt",
+                "createdAt": "2022-08-23T19:34:36Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1224712351
+              },
+              {
+                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-08-23T22:31:58Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1224956051
+              },
+              {
+                "bodyText": "Yeah, discussed with my manager and I got the required permissions to do so. Sorry for not responding promptly yesterday. But I am available from now on to provide assistance :)",
+                "createdAt": "2022-08-24T09:24:04Z",
+                "author": {
+                  "login": "jeanschmidt"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1225462612
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOSP97HQ==",
+              "hasPreviousPage": true
+            }
+          },
+          "labels": {
+            "edges": [
+              {
+                "node": {
+                  "name": "open source"
+                }
+              },
+              {
+                "node": {
+                  "name": "Merged"
+                }
+              },
+              {
+                "node": {
+                  "name": "cla signed"
+                }
+              },
+              {
+                "node": {
+                  "name": "Reverted"
+                }
+              },
+              {
+                "node": {
+                  "name": "ciflow/trunk"
+                }
+              },
+              {
+                "node": {
+                  "name": "ciflow/periodic"
+                }
+              }
+            ]
+          }
+        }
+      }
+    }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOSP97HQ== name=pytorch number=79694 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/79694\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 2fd08f1 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-06-16T09:43:16Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1157454523
+              },
+              {
+                "bodyText": "Unable to reproduce jit failure locally (will skip the test)\nCI Failure : https://github.com/pytorch/pytorch/runs/6926187074?check_suite_focus=true#step:9:20230\npytest test/test_ops_jit.py -k test_variant_consistency_jit_nn_functional_conv_transpose1d_cpu_complex64 -v\n=============================================================== test session starts ===============================================================\nplatform linux -- Python 3.10.0, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 -- /home/kshiteej/.conda/envs/pytorch-cuda-dev/bin/python\ncachedir: .pytest_cache\nhypothesis profile 'default' -> database=DirectoryBasedExampleDatabase('/home/kshiteej/Pytorch/pytorch_complex_convolution.py/.hypothesis/examples')\nrootdir: /home/kshiteej/Pytorch/pytorch_complex_convolution.py, configfile: pytest.ini\nplugins: hypothesis-6.23.2, repeat-0.9.1\ncollected 1976 items / 1975 deselected / 1 selected                                                                                               \n\ntest/test_ops_jit.py::TestJitCPU::test_variant_consistency_jit_nn_functional_conv_transpose1d_cpu_complex64 PASSED                          [100%]\n\n================================================================ warnings summary =================================================================\n../../.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/testing/_internal/common_cuda.py:9\n  /home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/testing/_internal/common_cuda.py:9: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives\n    from distutils.version import LooseVersion\n\n../../.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/backends/cudnn/__init__.py:91\n  /home/kshiteej/.conda/envs/pytorch-cuda-dev/lib/python3.10/site-packages/torch/backends/cudnn/__init__.py:91: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.\n    warnings.warn(\n\n-- Docs: https://docs.pytest.org/en/stable/warnings.html\n================================================= 1 passed, 1975 deselected, 2 warnings in 4.90s =================================================",
+                "createdAt": "2022-07-18T09:05:35Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": {
+                  "login": "kshitij12345"
+                },
+                "databaseId": 1186949486
+              },
+              {
+                "bodyText": "@pytorchbot merge",
+                "createdAt": "2022-07-19T17:12:23Z",
+                "author": {
+                  "login": "ngimel"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189347786
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
+                "createdAt": "2022-07-19T17:13:42Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189350009
+              },
+              {
+                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-07-19T17:14:25Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1189350932
+              },
+              {
+                "bodyText": "@pytorchbot revert -m \"broke slow test https://github.com/pytorch/pytorch/runs/7414560957?check_suite_focus=true#step:9:31516\" -c \"nosignal\"",
+                "createdAt": "2022-07-19T19:15:41Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1189459845
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here",
+                "createdAt": "2022-07-19T19:16:59Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189460926
+              },
+              {
+                "bodyText": "Will not revert as @kshitij12345 is not a MEMBER, but COLLABORATOR",
+                "createdAt": "2022-07-19T19:17:00Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189460942
+              },
+              {
+                "bodyText": "@pytorchbot revert -m \"broke slow test https://github.com/pytorch/pytorch/runs/7414560957?check_suite_focus=true#step:9:31516\" -c \"nosignal\"",
+                "createdAt": "2022-07-19T20:40:04Z",
+                "author": {
+                  "login": "anjali411"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189529734
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here",
+                "createdAt": "2022-07-19T20:41:20Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189530756
+              },
+              {
+                "bodyText": "@kshitij12345 your PR has been successfully reverted.",
+                "createdAt": "2022-07-19T20:41:25Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1189530831
+              },
+              {
+                "bodyText": "@pytorchbot merge -g",
+                "createdAt": "2022-07-20T09:53:08Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1190070141
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
+                "createdAt": "2022-07-20T09:54:24Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1190071424
+              },
+              {
+                "bodyText": "Hey @kshitij12345.\nYou've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. Please add one of each to the PR. The 'release notes: ...' label should represent the part of PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). The list of valid labels can be found here for the 'release notes: ...' and here for the 'topics: ...'.\nFor changes that are 'topic: not user facing' there is no need for a release notes label.",
+                "createdAt": "2022-07-20T13:00:51Z",
+                "author": {
+                  "login": "github-actions"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1190258272
+              },
+              {
+                "bodyText": "commit is breaking internal builds/tests https://pastebin.com/HX4RUusH (pytorch/functorch/test:test_eager_transforms)",
+                "createdAt": "2022-07-21T10:39:01Z",
+                "author": {
+                  "login": "jeanschmidt"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1191327616
+              },
+              {
+                "bodyText": "@pytorchbot revert -m \"breaking internal builds\" -c \"ghfirst\"",
+                "createdAt": "2022-07-21T10:39:27Z",
+                "author": {
+                  "login": "jeanschmidt"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1191328013
+              },
+              {
+                "bodyText": "@pytorchbot revert -m \"breaking internal builds\" -c \"ghfirst\"",
+                "createdAt": "2022-07-21T10:41:23Z",
+                "author": {
+                  "login": "jeanschmidt"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1191329792
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a revert job. Check the current status here",
+                "createdAt": "2022-07-21T10:42:16Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1191330586
+              },
+              {
+                "bodyText": "@kshitij12345 your PR has been successfully reverted.",
+                "createdAt": "2022-07-21T10:42:23Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1191330690
+              },
+              {
+                "bodyText": "@jeanschmidt which test is it failing on? I tried running the test_eager_transforms in functorch but couldn't reproduce it.",
+                "createdAt": "2022-07-25T07:11:19Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1193667568
+              },
+              {
+                "bodyText": "@jbschlosser have added a ref as discussed offline. Can you please take a look? And if it looks good, can you import the PR to check if it is breaking anything internally.\nThanks",
+                "createdAt": "2022-08-03T18:30:17Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1204329491
+              },
+              {
+                "bodyText": "@jbschlosser @jeanschmidt @albanD anything we can do to unblock this on our side?",
+                "createdAt": "2022-08-20T09:27:17Z",
+                "author": {
+                  "login": "lezcano"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1221266218
+              },
+              {
+                "bodyText": "Functorch tests should be running here now so can you rebase on top of master please?",
+                "createdAt": "2022-08-22T21:42:37Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1223129944
+              },
+              {
+                "bodyText": "@albanD have rebased on latest master.",
+                "createdAt": "2022-08-23T08:49:10Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1223758571
+              },
+              {
+                "bodyText": "I triggered all the tests not to have any issues with slow tests again",
+                "createdAt": "2022-08-23T09:20:18Z",
+                "author": {
+                  "login": "lezcano"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1223796413
+              },
+              {
+                "bodyText": "Thanks @lezcano! However, last time it was reverted for internal failures. So it would be great if someone can import and verify that.\ncc: @albanD @jeanschmidt",
+                "createdAt": "2022-08-23T10:17:50Z",
+                "author": {
+                  "login": "kshitij12345"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1223863075
+              },
+              {
+                "bodyText": "@albanD has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-08-23T14:43:02Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224175731
+              },
+              {
+                "bodyText": "I am not the right person to provide assistence, as currently I am not based in a Tier 1 location, so my permissions to access are so restricted that I am not able to import this commit, run the tests and provide meaningful responses.",
+                "createdAt": "2022-08-23T15:57:48Z",
+                "author": {
+                  "login": "jeanschmidt"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224272324
+              },
+              {
+                "bodyText": "@jeanschmidt has imported this pull request. If you are a Meta employee, you can view this diff on Phabricator.",
+                "createdAt": "2022-08-23T17:00:53Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1224351135
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHORP1auw==",
+              "hasPreviousPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOR1poyg== name=pytorch number=82169 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/82169\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\u2705 No Failures (0 Pending)\nAs of commit 28140e4 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-07-25T21:41:41Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1194667199
+              },
+              {
+                "bodyText": "@pytorchbot merge -g",
+                "createdAt": "2022-07-25T21:46:04Z",
+                "author": {
+                  "login": "ezyang"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1194671445
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
+                "createdAt": "2022-07-25T21:47:25Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1194672744
+              },
+              {
+                "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) pull failed for rule superuser\nRaised by https://github.com/pytorch/pytorch/actions/runs/2735501647",
+                "createdAt": "2022-07-25T23:22:45Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1194761219
+              },
+              {
+                "bodyText": "@pytorchbot rebase",
+                "createdAt": "2022-07-26T00:54:17Z",
+                "author": {
+                  "login": "ezyang"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1194839920
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a rebase job. Check the current status here",
+                "createdAt": "2022-07-26T01:01:32Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1194846575
+              },
+              {
+                "bodyText": "Successfully rebased gh/ezyang/1279/orig onto refs/remotes/origin/master, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/82169)",
+                "createdAt": "2022-07-26T01:01:53Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1194846838
+              },
+              {
+                "bodyText": "@pytorchbot rebase",
+                "createdAt": "2022-07-27T15:32:13Z",
+                "author": {
+                  "login": "ezyang"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1196915484
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a rebase job. Check the current status here",
+                "createdAt": "2022-07-27T15:33:49Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1196917359
+              },
+              {
+                "bodyText": "Successfully rebased gh/ezyang/1279/orig onto refs/remotes/origin/master, please pull locally before adding more changes (for example, via ghstack checkout https://github.com/pytorch/pytorch/pull/82169)",
+                "createdAt": "2022-07-27T15:34:03Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1196917609
+              },
+              {
+                "bodyText": "@pytorchbot merge -g",
+                "createdAt": "2022-07-27T15:41:52Z",
+                "author": {
+                  "login": "ezyang"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1196927174
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
+                "createdAt": "2022-07-27T15:43:11Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1196928771
+              },
+              {
+                "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) Lint failed for rule superuser\nRaised by https://github.com/pytorch/pytorch/actions/runs/2747872935",
+                "createdAt": "2022-07-27T15:43:14Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1196928849
+              },
+              {
+                "bodyText": "@pytorchbot merge -g",
+                "createdAt": "2022-07-27T16:59:37Z",
+                "author": {
+                  "login": "ezyang"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1197046487
+              },
+              {
+                "bodyText": "@pytorchbot successfully started a merge job. Check the current status here",
+                "createdAt": "2022-07-27T17:07:32Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1197055101
+              },
+              {
+                "bodyText": "Merge failed due to Refusing to merge as mandatory check(s) Lint failed for rule superuser\nRaised by https://github.com/pytorch/pytorch/actions/runs/2748317347",
+                "createdAt": "2022-07-27T17:07:36Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1197055259
+              },
+              {
+                "bodyText": "@pytorchbot merge -f",
+                "createdAt": "2022-07-27T17:56:26Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1197107106
+              },
+              {
+                "bodyText": "\u274c \ud83e\udd16 pytorchbot command failed:\n@pytorchbot merge: error: argument -f/--force: expected one argument\n\nusage: @pytorchbot merge [-g | -f FORCE | -l]\n\nTry @pytorchbot --help for more info.",
+                "createdAt": "2022-07-27T17:56:27Z",
+                "author": {
+                  "login": "pytorch-bot"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1197107129
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHORzUsvw==",
+              "hasPreviousPage": false
+            }
+          }
+        }
+      }
+    }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOPoR4Lg== name=pytorch number=71759 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "CI Flow Status\n\u269b\ufe0f CI Flow\nRuleset - Version: v1\nRuleset - File: https://github.com/coolteemf/pytorch/blob/7647f7953a68e4f1c3feaa19c77d925abfe8e377/.github/generated-ciflow-ruleset.json\nPR ciflow labels: ciflow/default\nAdd ciflow labels to this PR to trigger more builds:\n\n\n\nWorkflows\nLabels (bold enabled)\nStatus\n\n\n\n\nTriggered Workflows\n\n\n\n\nlinux-bionic-py3.6-clang9\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/noarch, ciflow/xla\n\u2705 triggered\n\n\nlinux-xenial-cuda11.3-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/default, ciflow/linux\n\u2705 triggered\n\n\nlinux-xenial-py3.6-clang7-asan\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux, ciflow/sanitizers\n\u2705 triggered\n\n\nlinux-xenial-py3.6-gcc5.4\nciflow/all, ciflow/cpu, ciflow/default, ciflow/linux\n\u2705 triggered\n\n\nlinux-xenial-py3.6-gcc7-bazel-test\nciflow/all, ciflow/bazel, ciflow/cpu, ciflow/default, ciflow/linux\n\u2705 triggered\n\n\nwin-vs2019-cpu-py3\nciflow/all, ciflow/cpu, ciflow/default, ciflow/win\n\u2705 triggered\n\n\nwin-vs2019-cuda11.3-py3\nciflow/all, ciflow/cuda, ciflow/default, ciflow/win\n\u2705 triggered\n\n\nSkipped Workflows\n\n\n\n\nlibtorch-linux-xenial-cuda10.2-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux\n\ud83d\udeab skipped\n\n\nlibtorch-linux-xenial-cuda11.3-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux\n\ud83d\udeab skipped\n\n\nlinux-bionic-cuda10.2-py3.9-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow\n\ud83d\udeab skipped\n\n\nlinux-xenial-cuda10.2-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/slow\n\ud83d\udeab skipped\n\n\nparallelnative-linux-xenial-py3.6-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux\n\ud83d\udeab skipped\n\n\nperiodic-libtorch-linux-xenial-cuda11.1-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/libtorch, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-linux-xenial-cuda11.1-py3.6-gcc7\nciflow/all, ciflow/cuda, ciflow/linux, ciflow/scheduled\n\ud83d\udeab skipped\n\n\nperiodic-win-vs2019-cuda11.1-py3\nciflow/all, ciflow/cuda, ciflow/scheduled, ciflow/win\n\ud83d\udeab skipped\n\n\npuretorch-linux-xenial-py3.6-gcc5.4\nciflow/all, ciflow/cpu, ciflow/linux\n\ud83d\udeab skipped",
+                "createdAt": "2022-01-25T09:31:05Z",
+                "author": {
+                  "login": "pytorch-bot"
+                },
+                "authorAssociation": "NONE",
+                "editor": null,
+                "databaseId": 1020983378
+              },
+              {
+                "bodyText": "Hi @coolteemf!\nThank you for your pull request and welcome to our community.\nAction Required\nIn order to merge any pull request (code, docs, etc.), we require contributors to sign our Contributor License Agreement, and we don't seem to have one on file for you.\nProcess\nIn order for us to review and merge your suggested changes, please sign at https://code.facebook.com/cla. If you are contributing on behalf of someone else (eg your employer), the individual CLA may not be sufficient and your employer may need to sign the corporate CLA.\nOnce the CLA is signed, our tooling will perform checks and validations. Afterwards, the pull request will be tagged with CLA signed. The tagging process may take up to 1 hour after signing. Please give it that time before contacting us about it.\nIf you have received this in error or have any questions, please contact us at cla@fb.com. Thanks!",
+                "createdAt": "2022-01-25T09:31:06Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1020983383
+              },
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/71759\n\ud83d\udcc4 \u00a0Preview docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\ud83d\udd27 \u00a0Opt-in to CIFlow to control what jobs run on your PRs\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit 346e0c5 (more details on the Dr. CI page):\n\n\n2/3 failures introduced in this PR\n1/3 tentatively recognized as flaky \u2744\ufe0f\n\nClick here to rerun these jobs\n\n\n\n\n\ud83d\udd75\ufe0f 2 new failures recognized by patterns\nThe following CI failures do not appear to be due to upstream breakages:\n win-vs2019-cpu-py3 / test (default, 2, 2, windows.4xlarge) (1/2)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-02-23T14:12:58.9371445Z FAIL [0.010s]: test_sparse_addmm_cpu_bfloat16 (__main__.TestSparseCPU)\n\n2022-02-23T14:12:58.9258506Z   test_sparse_zeros_tanh_cpu_float64 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.002s)\n2022-02-23T14:12:58.9274771Z   test_sparse_zeros_tanh_cpu_int16 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.001s)\n2022-02-23T14:12:58.9290805Z   test_sparse_zeros_tanh_cpu_int32 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.001s)\n2022-02-23T14:12:58.9306695Z   test_sparse_zeros_tanh_cpu_int64 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.000s)\n2022-02-23T14:12:58.9322595Z   test_sparse_zeros_tanh_cpu_int8 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.000s)\n2022-02-23T14:12:58.9338535Z   test_sparse_zeros_tanh_cpu_uint8 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.000s)\n2022-02-23T14:12:58.9354468Z   test_sparse_zeros_trunc_cpu_float32 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.000s)\n2022-02-23T14:12:58.9370208Z   test_sparse_zeros_trunc_cpu_float64 (__main__.TestSparseUnaryUfuncsCPU) ... ok (0.000s)\n2022-02-23T14:12:58.9370712Z \n2022-02-23T14:12:58.9370976Z ======================================================================\n2022-02-23T14:12:58.9371445Z FAIL [0.010s]: test_sparse_addmm_cpu_bfloat16 (__main__.TestSparseCPU)\n2022-02-23T14:12:58.9372134Z ----------------------------------------------------------------------\n2022-02-23T14:12:58.9372597Z Traceback (most recent call last):\n2022-02-23T14:12:58.9374021Z   File \"C:\\actions-runner\\_work\\pytorch\\pytorch\\build\\win_tmp\\build\\torch\\testing\\_internal\\common_device_type.py\", line 376, in instantiated_test\n2022-02-23T14:12:58.9374740Z     result = test(self, **param_kwargs)\n2022-02-23T14:12:58.9375570Z   File \"C:\\actions-runner\\_work\\pytorch\\pytorch\\build\\win_tmp\\build\\torch\\testing\\_internal\\common_utils.py\", line 2951, in wrapped\n2022-02-23T14:12:58.9376266Z     f(self, *args, **kwargs, coalesced=False)\n2022-02-23T14:12:58.9376972Z   File \"test_sparse.py\", line 1272, in test_sparse_addmm\n2022-02-23T14:12:58.9377402Z     test_shape(7, 8, 9, 20, True, None)\n2022-02-23T14:12:58.9377939Z   File \"test_sparse.py\", line 1264, in test_shape\n2022-02-23T14:12:58.9378373Z     self.assertEqual(Y, Y_dense)\n\n\n win-vs2019-cuda11.3-py3 / test (default, 2, 2, windows.8xlarge.nvidia.gpu) (2/2)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun)\n\n\n2022-02-23T15:20:20.5710678Z FAIL [0.031s]: test_sparse_addmm_cpu_bfloat16 (__main__.TestSparseCPU)\n\n2022-02-23T15:20:20.5569146Z   test_sparse_zeros_tanh_cuda_float64 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5589083Z   test_sparse_zeros_tanh_cuda_int16 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5609025Z   test_sparse_zeros_tanh_cuda_int32 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5629080Z   test_sparse_zeros_tanh_cuda_int64 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.016s)\n2022-02-23T15:20:20.5649102Z   test_sparse_zeros_tanh_cuda_int8 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5668867Z   test_sparse_zeros_tanh_cuda_uint8 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5688700Z   test_sparse_zeros_trunc_cuda_float32 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5708285Z   test_sparse_zeros_trunc_cuda_float64 (__main__.TestSparseUnaryUfuncsCUDA) ... ok (0.000s)\n2022-02-23T15:20:20.5709405Z \n2022-02-23T15:20:20.5709879Z ======================================================================\n2022-02-23T15:20:20.5710678Z FAIL [0.031s]: test_sparse_addmm_cpu_bfloat16 (__main__.TestSparseCPU)\n2022-02-23T15:20:20.5711399Z ----------------------------------------------------------------------\n2022-02-23T15:20:20.5712013Z Traceback (most recent call last):\n2022-02-23T15:20:20.5713280Z   File \"C:\\actions-runner\\_work\\pytorch\\pytorch\\build\\win_tmp\\build\\torch\\testing\\_internal\\common_device_type.py\", line 376, in instantiated_test\n2022-02-23T15:20:20.5714267Z     result = test(self, **param_kwargs)\n2022-02-23T15:20:20.5715299Z   File \"C:\\actions-runner\\_work\\pytorch\\pytorch\\build\\win_tmp\\build\\torch\\testing\\_internal\\common_utils.py\", line 2951, in wrapped\n2022-02-23T15:20:20.5716240Z     f(self, *args, **kwargs, coalesced=False)\n2022-02-23T15:20:20.5716943Z   File \"test_sparse.py\", line 1275, in test_sparse_addmm\n2022-02-23T15:20:20.5717516Z     test_shape(7, 8, 9, 20, False, (1, 1))\n2022-02-23T15:20:20.5718323Z   File \"test_sparse.py\", line 1264, in test_shape\n2022-02-23T15:20:20.5718915Z     self.assertEqual(Y, Y_dense)\n\n\n\n\u2744\ufe0f 1 failure tentatively classified as flaky\nbut reruns have not yet been triggered to confirm:\n linux-bionic-rocm4.5-py3.7 / test (distributed, 1, 1, linux.rocm.gpu) (1/1)\nStep: \"Test\" (full log | diagnosis details | \ud83d\udd01 rerun) \u2744\ufe0f\n\n\n2022-02-23T16:16:26.7221984Z RuntimeError: Proc...ated or timed out after 100.06913685798645 seconds\n\n2022-02-23T16:16:26.7207909Z ERROR [100.093s]: test_collect_shards (__main__.TestZeroRedundancyOptimizerDistributed)\n2022-02-23T16:16:26.7209206Z Check the state consolidation mechanism, and the state dict exposed by ZeroRedundancyOptimizer\n2022-02-23T16:16:26.7213073Z ----------------------------------------------------------------------\n2022-02-23T16:16:26.7213996Z Traceback (most recent call last):\n2022-02-23T16:16:26.7215434Z   File \"/opt/conda/lib/python3.7/site-packages/torch/testing/_internal/common_distributed.py\", line 483, in wrapper\n2022-02-23T16:16:26.7216409Z     self._join_processes(fn)\n2022-02-23T16:16:26.7217801Z   File \"/opt/conda/lib/python3.7/site-packages/torch/testing/_internal/common_distributed.py\", line 702, in _join_processes\n2022-02-23T16:16:26.7218822Z     self._check_return_codes(elapsed_time)\n2022-02-23T16:16:26.7220266Z   File \"/opt/conda/lib/python3.7/site-packages/torch/testing/_internal/common_distributed.py\", line 754, in _check_return_codes\n2022-02-23T16:16:26.7221201Z     i, elapsed_time\n2022-02-23T16:16:26.7221984Z RuntimeError: Process 0 terminated or timed out after 100.06913685798645 seconds\n2022-02-23T16:16:26.7222551Z \n2022-02-23T16:16:26.7223245Z ----------------------------------------------------------------------\n2022-02-23T16:16:26.7224032Z Ran 26 tests in 303.663s\n2022-02-23T16:16:26.7224400Z \n2022-02-23T16:16:26.7224780Z FAILED (errors=1, skipped=8, unexpected successes=3)\n2022-02-23T16:16:26.7225718Z \n2022-02-23T16:16:26.7225992Z Generating XML reports...\n2022-02-23T16:16:26.7336797Z Generated XML report: test-reports/python-unittest/distributed.optim.test_zero_redundancy_optimizer/TEST-TestZeroRedundancyOptimizerDistributed-20220223161123.xml\n2022-02-23T16:16:26.7349296Z Generated XML report: test-reports/python-unittest/distributed.optim.test_zero_redundancy_optimizer/TEST-TestZeroRedundancyOptimizerSingleRank-20220223161123.xml\n2022-02-23T16:16:27.6823633Z Traceback (most recent call last):\n\n\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-01-25T09:31:08Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1020983433
+              },
+              {
+                "bodyText": "Thank you for signing our Contributor License Agreement. We can now accept your code for this (and any) Meta Open Source project. Thanks!",
+                "createdAt": "2022-01-25T18:07:45Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1021467314
+              },
+              {
+                "bodyText": "@albanD Is there something that needs to be done to correct the failed check ?",
+                "createdAt": "2022-02-04T13:18:05Z",
+                "author": {
+                  "login": "coolteemf"
+                },
+                "authorAssociation": "CONTRIBUTOR",
+                "editor": null,
+                "databaseId": 1029978104
+              },
+              {
+                "bodyText": "Hi,\nI think you didn't do the merge properly as there are now a lot more commits than it should be in this PR.\nYou can either clean up the branch locally and force push here or open a new clean PR.\nNote that in general, it is better to rebase on top of master than merge master into your branch!",
+                "createdAt": "2022-02-04T14:28:28Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1030038719
+              },
+              {
+                "bodyText": "Okay thank you for the heads up",
+                "createdAt": "2022-02-04T16:44:46Z",
+                "author": {
+                  "login": "coolteemf"
+                },
+                "authorAssociation": "CONTRIBUTOR",
+                "editor": null,
+                "databaseId": 1030159616
+              },
+              {
+                "bodyText": "@albanD I just rebased and updated the branch to take into account changes from 28388b4. Is it all clear for merging ?",
+                "createdAt": "2022-02-16T15:34:59Z",
+                "author": {
+                  "login": "coolteemf"
+                },
+                "authorAssociation": "CONTRIBUTOR",
+                "editor": null,
+                "databaseId": 1041720345
+              },
+              {
+                "bodyText": "Thanks! The CI needs fixing for bc-compat and lint though\n\nThe lint should be fixed, however I didn't find clear instructions on how to fix the bc compat.\nI guess output_mask could be made optional, however in the case of native_group_norm_backward the same argument is not optional.",
+                "createdAt": "2022-02-17T08:04:30Z",
+                "author": {
+                  "login": "coolteemf"
+                },
+                "authorAssociation": "CONTRIBUTOR",
+                "editor": null,
+                "databaseId": 1042672732
+              },
+              {
+                "bodyText": "Since we are changing the signature on purpose here, you can add it to the list at https://github.com/pytorch/pytorch/blob/master/test/forward_backward_compatibility/check_forward_backward_compatibility.py#L29 to silence the test.",
+                "createdAt": "2022-02-17T14:41:16Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1043020903
+              },
+              {
+                "bodyText": "@pytorchbot merge this please",
+                "createdAt": "2022-02-23T14:48:05Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1048861185
+              },
+              {
+                "bodyText": "Merge failed due to 'NoneType' object is not subscriptable\nRaised by https://github.com/pytorch/pytorch/actions/runs/1887914411",
+                "createdAt": "2022-02-23T14:49:16Z",
+                "author": {
+                  "login": "pytorchmergebot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1048862374
+              },
+              {
+                "bodyText": "@coolteemf you can ignore me playing with the bot. Nothing is needed on your end anymore, I'll take it from here.",
+                "createdAt": "2022-02-23T14:52:10Z",
+                "author": {
+                  "login": "albanD"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1048865236
+              },
+              {
+                "bodyText": "@pytorchbot merge this",
+                "createdAt": "2022-02-23T14:54:23Z",
+                "author": {
+                  "login": "malfet"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": null,
+                "databaseId": 1048867615
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOPNr4Ug==",
+              "hasPreviousPage": false
+            }
+          }
+        }
+      }
+    }
   }
 }
diff --git a/.github/scripts/test_check_labels.py b/.github/scripts/test_check_labels.py
new file mode 100644
index 0000000000000..64e91dcd8ecbe
--- /dev/null
+++ b/.github/scripts/test_check_labels.py
@@ -0,0 +1,77 @@
+"""test_check_labels.py"""
+
+from typing import Any
+from unittest import TestCase, mock, main
+
+from trymerge import GitHubPR
+from test_trymerge import mocked_gh_graphql
+from check_labels import has_required_labels
+
+release_notes_labels = [
+    "release notes: AO frontend",
+    "release notes: autograd",
+    "release notes: benchmark",
+    "release notes: build",
+    "release notes: complex",
+    "release notes: composability",
+    "release notes: cpp",
+    "release notes: cuda",
+    "release notes: cudnn",
+    "release notes: dataloader",
+    "release notes: distributed (c10d)",
+    "release notes: distributed (ddp)",
+    "release notes: distributed (fsdp)",
+    "release notes: distributed (pipeline)",
+    "release notes: distributed (rpc)",
+    "release notes: distributed (sharded)",
+    "release notes: foreach_frontend",
+    "release notes: functorch",
+    "release notes: fx",
+    "release notes: hub",
+    "release notes: jit",
+    "release notes: lazy",
+    "release notes: linalg_frontend",
+    "release notes: memory format",
+    "release notes: Meta API",
+    "release notes: mobile",
+    "release notes: mps",
+    "release notes: nested tensor",
+    "release notes: nn",
+    "release notes: onnx",
+    "release notes: package/deploy",
+    "release notes: performance_as_product",
+    "release notes: profiler",
+    "release notes: python_frontend",
+    "release notes: quantization",
+    "release notes: releng",
+    "release notes: rocm",
+    "release notes: sparse",
+    "release notes: visualization",
+    "release notes: vulkan",
+]
+
+
+class TestCheckLabels(TestCase):
+    @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
+    @mock.patch('check_labels.get_release_notes_labels', return_value=release_notes_labels)
+    def test_pr_with_missing_labels(self, mocked_rn_labels: Any, mocked_gql: Any) -> None:
+        "Test PR with no 'release notes:' label or 'topic: not user facing' label"
+        pr = GitHubPR("pytorch", "pytorch", 82169)
+        self.assertFalse(has_required_labels(pr))
+
+    @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
+    @mock.patch('check_labels.get_release_notes_labels', return_value=release_notes_labels)
+    def test_pr_with_release_notes_label(self, mocked_rn_labels: Any, mocked_gql: Any) -> None:
+        "Test PR with 'release notes: nn' label"
+        pr = GitHubPR("pytorch", "pytorch", 71759)
+        self.assertTrue(has_required_labels(pr))
+
+    @mock.patch('trymerge.gh_graphql', side_effect=mocked_gh_graphql)
+    @mock.patch('check_labels.get_release_notes_labels', return_value=release_notes_labels)
+    def test_pr_with_not_user_facing_label(self, mocked_rn_labels: Any, mocked_gql: Any) -> None:
+        "Test PR with 'topic: not user facing' label"
+        pr = GitHubPR("pytorch", "pytorch", 75095)
+        self.assertTrue(has_required_labels(pr))
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py
index 87ca3ac06579c..502b22d847d23 100755
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@@ -179,6 +179,7 @@ class WorkflowCheckState(NamedTuple):
       comments(last: 5) {
         nodes {
           bodyText
+          createdAt
           author {
             login
           }
@@ -336,6 +337,7 @@ class WorkflowCheckState(NamedTuple):
       comments(last: 100, before: $cursor) {
         nodes {
           bodyText
+          createdAt
           author {
             login
           }
@@ -583,6 +585,7 @@ def can_skip_internal_checks(pr: "GitHubPR", comment_id: Optional[int] = None) -
 @dataclass
 class GitHubComment:
     body_text: str
+    created_at: str
     author_login: str
     author_association: str
     editor_login: Optional[str]
@@ -807,6 +810,7 @@ def get_pr_url(self) -> str:
     def _comment_from_node(node: Any) -> GitHubComment:
         editor = node["editor"]
         return GitHubComment(body_text=node["bodyText"],
+                             created_at=node["createdAt"] if "createdAt" in node else "",
                              author_login=node["author"]["login"],
                              author_association=node["authorAssociation"],
                              editor_login=editor["login"] if editor else None,
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 1803395f81d97..bcee8ab86c83e 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -226,6 +226,38 @@ jobs:
             false
           fi
 
+  check-labels:
+    name: Check labels
+    runs-on: linux.20_04.16x
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        with:
+          submodules: false
+          fetch-depth: 1
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+          architecture: x64
+          cache: 'pip'
+          cache-dependency-path: |
+            **/.github/requirements-gha-cache.txt
+
+      - name: Install requirements
+        id: requirements
+        run: |
+          pip install -r .github/requirements-gha-cache.txt --user
+
+      - name: Check labels
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUM: ${{ github.event.number }}
+        run: |
+          set -ex
+          python3 .github/scripts/check_labels.py "${PR_NUM}"
+
   test-tools:
     name: Test tools
     if: ${{ github.repository == 'pytorch/pytorch' }}

From b6b67977dd1da2954a95d7f3e7f8dae78724c51d Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Fri, 21 Oct 2022 08:29:10 -0700
Subject: [PATCH 0024/1922] functionalization: make view_copy outputs always
 contiguous (#85747)

This fixes an issue with mobile: The output of view_copy ops should always be contiguous.

Later, we can consider adding optional arguments to the `view_copy()` functions to let you explicitly say what the contiguity of the output can be (e.g. channels_last)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85747
Approved by: https://github.com/ezyang
---
 test/test_functionalization.py         | 28 +++++++++++++++++---------
 test/test_view_ops.py                  |  6 ++++++
 torchgen/gen_functionalization_type.py |  6 +++---
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index 2eb79c73cc0bd..041e5b84f6945 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -659,23 +659,31 @@ def forward(self, a_1):
     getitem_1 = split_copy[1];  split_copy = None
     add_1 = torch.ops.aten.add.Tensor(getitem, ones);  getitem = ones = None
     select_copy = torch.ops.aten.select_copy.int(_reshape_alias_copy, 0, 0);  _reshape_alias_copy = None
-    clone = torch.ops.aten.clone.default(add_1, memory_format = torch.contiguous_format)
-    _unsafe_view = torch.ops.aten._unsafe_view.default(clone, [4]);  clone = None
+    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(add_1, [4], [1])
     view_copy_1 = torch.ops.aten.view_copy.default(add, [8]);  add = None
-    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(view_copy_1, [2, 4], [4, 1]);  view_copy_1 = None
-    transpose_copy_1 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_1, 1, 0);  _reshape_alias_copy_1 = None
+    _reshape_alias_copy_2 = torch.ops.aten._reshape_alias_copy.default(view_copy_1, [2, 4], [4, 1]);  view_copy_1 = None
+    transpose_copy_1 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_2, 1, 0);  _reshape_alias_copy_2 = None
     unsqueeze_copy_1 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_1, 0);  transpose_copy_1 = None
     squeeze_copy_1 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_1);  unsqueeze_copy_1 = None
     slice_scatter = torch.ops.aten.slice_scatter.default(squeeze_copy_1, add_1, 0, 0, 2);  squeeze_copy_1 = None
     unsqueeze_copy_2 = torch.ops.aten.unsqueeze_copy.default(slice_scatter, 0);  slice_scatter = None
     squeeze_copy_2 = torch.ops.aten.squeeze_copy.dim(unsqueeze_copy_2, 0);  unsqueeze_copy_2 = None
     transpose_copy_2 = torch.ops.aten.transpose_copy.int(squeeze_copy_2, 1, 0);  squeeze_copy_2 = None
-    _reshape_alias_copy_2 = torch.ops.aten._reshape_alias_copy.default(transpose_copy_2, [8], [1]);  transpose_copy_2 = None
-    view_copy_2 = torch.ops.aten.view_copy.default(_reshape_alias_copy_2, [4, 2]);  _reshape_alias_copy_2 = None
-    view_copy_3 = torch.ops.aten.view_copy.default(view_copy_2, [8]);  view_copy_2 = None
-    _reshape_alias_copy_3 = torch.ops.aten._reshape_alias_copy.default(view_copy_3, [2, 4], [4, 1]);  view_copy_3 = None
-    select_copy_1 = torch.ops.aten.select_copy.int(_reshape_alias_copy_3, 0, 0);  _reshape_alias_copy_3 = None
-    add_2 = torch.ops.aten.add.Tensor(select_copy_1, _unsafe_view);  select_copy_1 = _unsafe_view = None
+    _reshape_alias_copy_3 = torch.ops.aten._reshape_alias_copy.default(transpose_copy_2, [8], [1]);  transpose_copy_2 = None
+    view_copy_2 = torch.ops.aten.view_copy.default(_reshape_alias_copy_3, [4, 2]);  _reshape_alias_copy_3 = None
+    view_copy_3 = torch.ops.aten.view_copy.default(view_copy_2, [8])
+    _reshape_alias_copy_4 = torch.ops.aten._reshape_alias_copy.default(view_copy_3, [2, 4], [4, 1]);  view_copy_3 = None
+    select_copy_1 = torch.ops.aten.select_copy.int(_reshape_alias_copy_4, 0, 0);  _reshape_alias_copy_4 = None
+    view_copy_4 = torch.ops.aten.view_copy.default(view_copy_2, [8]);  view_copy_2 = None
+    _reshape_alias_copy_5 = torch.ops.aten._reshape_alias_copy.default(view_copy_4, [2, 4], [4, 1]);  view_copy_4 = None
+    transpose_copy_3 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_5, 1, 0);  _reshape_alias_copy_5 = None
+    unsqueeze_copy_3 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_3, 0);  transpose_copy_3 = None
+    squeeze_copy_3 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_3);  unsqueeze_copy_3 = None
+    split_copy_1 = torch.ops.aten.split_copy.Tensor(squeeze_copy_3, 2);  squeeze_copy_3 = None
+    getitem_2 = split_copy_1[0]
+    getitem_3 = split_copy_1[1];  split_copy_1 = None
+    _reshape_alias_copy_6 = torch.ops.aten._reshape_alias_copy.default(getitem_2, [4], [1]);  getitem_2 = None
+    add_2 = torch.ops.aten.add.Tensor(select_copy_1, _reshape_alias_copy_6);  select_copy_1 = _reshape_alias_copy_6 = None
     return add_1
     """)  # noqa: B950
 
diff --git a/test/test_view_ops.py b/test/test_view_ops.py
index 6c65457ae24f1..3c5987e65ae75 100644
--- a/test/test_view_ops.py
+++ b/test/test_view_ops.py
@@ -926,6 +926,12 @@ def test_view_copy(self, device):
         self.assertEqual(a_view_copy, a_view)
         self.assertEqual(a.grad, a_ref.grad)
 
+    # Testing that the output of a view_copy kernel (by default) is contiguous.
+    def test_view_copy_output_contiguous(self, device):
+        a = torch.randn(4, 4, 4, 4, device=device).to(memory_format=torch.channels_last)
+        b = torch.ops.aten.slice_copy(a, 0, 0, 2)
+        self.assertTrue(b.is_contiguous())
+
     def test_view_copy_out(self, device):
         a = torch.randn(2, 2, device=device)
         out = torch.empty(2, device=device)
diff --git a/torchgen/gen_functionalization_type.py b/torchgen/gen_functionalization_type.py
index a3f9b0b0ff2cb..a27b4f327b2ac 100644
--- a/torchgen/gen_functionalization_type.py
+++ b/torchgen/gen_functionalization_type.py
@@ -91,7 +91,7 @@ def gen_composite_view_copy_kernel(g: NativeFunctionsViewGroup) -> Optional[str]
     return self.reshape_symint(size);
   } else {
     auto output = at::_ops::view::call(self, size);
-    return output.clone();
+    return output.clone(/*memory_format=*/at::MemoryFormat::Contiguous);
   }
 }
 """
@@ -117,13 +117,13 @@ def gen_composite_view_copy_kernel(g: NativeFunctionsViewGroup) -> Optional[str]
 
     if g.view.func.returns[0].type == BaseType(BaseTy.Tensor):
         return_cloned_output = """\
-  return output.clone();"""
+  return output.clone(/*memory_format=*/at::MemoryFormat::Contiguous);"""
     else:
         # If the return type is a list, we need to clone each tensor in the list.
         return_cloned_output = f"""\
   {view_copy_sig.returns_type().cpp_type()} out_clone;
   for (const auto i : c10::irange(output.size())) {{
-    out_clone.push_back(output[i].clone());
+    out_clone.push_back(output[i].clone(/*memory_format=*/at::MemoryFormat::Contiguous));
   }}
   return out_clone;"""
 

From aa2decaa9cfd6177fe860fb36d037849607a391e Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@meta.com>
Date: Fri, 21 Oct 2022 11:17:39 -0400
Subject: [PATCH 0025/1922] ci: Allow nvidia-smi to continue with non-0 exit
 (#87464)

Allows nvidia-smi to return a non-0 exit status like status 14 since
status 14 is a warning and doesn't affect actual execution

see https://github.com/NVIDIA/gpu-operator/issues/285

Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87464
Approved by: https://github.com/atalman, https://github.com/malfet, https://github.com/ZainRizvi
---
 .github/scripts/install_nvidia_utils_linux.sh | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index 855d15dde83b4..79f588633794e 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -51,7 +51,18 @@ install_nvidia_driver_amzn2() {
             sudo rm -fv /tmp/nvidia_driver
         fi
 
-        nvidia-smi
+        (
+            set +e
+            nvidia-smi
+            status=$?
+            # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
+            if [ $status -eq 0 ] || [ $status -eq 14 ]; then
+                echo "INFO: Ignoring allowed status ${status}"
+            else
+                echo "ERROR: nvidia-smi exited with unresolved status ${status}"
+                exit ${status}
+            fi
+        )
     )
 }
 

From cb300de040239a690b94ca187a7ca8dbe522bc84 Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Fri, 21 Oct 2022 18:13:56 +0000
Subject: [PATCH 0026/1922] fix for dynamo xml reporting (#87378)

dynamo tests call a helper function in torch/_dynamo/test_case.py which then calls run_tests in common_utils.py so the test report path looked something like /opt/conda/lib/python3/10/site-packages/torch/_dynamo/test_case

* instead of using frame, use argv[0] which should be the invoking file
* got rid of sanitize functorch test name because theyve been moved into the test folder
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87378
Approved by: https://github.com/huydhn
---
 torch/testing/_internal/common_utils.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index cb9b52c338118..77887574e1888 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -612,15 +612,6 @@ def sanitize_test_filename(filename):
     strip_py = re.sub(r'.py$', '', filename)
     return re.sub('/', r'.', strip_py)
 
-# hack until https://github.com/pytorch/pytorch/issues/82109 is resolved
-def sanitize_if_functorch_test_filename(filename):
-    # absolute filenames must be converted to relative paths, otherwise,
-    # we cannot prepend test-reports/ to it
-    # (e.g. test-reports\\C:\\... on windows is nonsense)
-    if filename.startswith(CI_FUNCTORCH_ROOT):
-        filename = filename[len(CI_PT_ROOT) + 1:]
-    return filename
-
 def lint_test_case_extension(suite):
     succeed = True
     for test_case_or_suite in suite:
@@ -640,10 +631,8 @@ def lint_test_case_extension(suite):
     return succeed
 
 
-def get_report_path(pytest=False):
-    test_filename = inspect.getfile(sys._getframe(2))
-    test_filename = sanitize_if_functorch_test_filename(test_filename)
-    test_filename = sanitize_test_filename(test_filename)
+def get_report_path(argv=UNITTEST_ARGS, pytest=False):
+    test_filename = sanitize_test_filename(argv[0])
     test_report_path = TEST_SAVE_XML + LOG_SUFFIX
     test_report_path = os.path.join(test_report_path, test_filename)
     if pytest:

From bd7e75696f6d5d907db413d412a16ea4ac29c8de Mon Sep 17 00:00:00 2001
From: Zain Rizvi <zainr@fb.com>
Date: Fri, 21 Oct 2022 18:15:38 +0000
Subject: [PATCH 0027/1922] Only label checks against pull requests (#87488)

When a commit is triggered via any mechanism other than a pull request, there will not be a PR to check labels for.

The job will fail with the error:
```
2022-10-21T17:50:53.2938592Z + python3 .github/scripts/check_labels.py ''
2022-10-21T17:50:53.4758863Z usage: Check PR labels [-h] pr_num
2022-10-21T17:50:53.4759337Z Check PR labels: error: argument pr_num: invalid int value: ''
```

Instead, we should limit the workflow to only run on pull requests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87488
Approved by: https://github.com/huydhn
---
 .github/workflows/lint.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index bcee8ab86c83e..669977b143a5e 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -229,6 +229,7 @@ jobs:
   check-labels:
     name: Check labels
     runs-on: linux.20_04.16x
+    if: github.event_name == 'pull_request'
     steps:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master

From 549d6e0828fd96a407c4a81fb53e975801397a2c Mon Sep 17 00:00:00 2001
From: Iris Zhang <irisz@meta.com>
Date: Fri, 21 Oct 2022 18:45:38 +0000
Subject: [PATCH 0028/1922] [1/N][C10D] Add a customized ScubaLogHandler
 implementation for internal FB use (#86699) (#87123)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86699

This diff does the following:
1. **c10d_error_logger.py**: Add an API  to create a logger with a specific logging handler based on the destination.
2. The API from above would get a logging handler based on the destination provided.
-  **caffe2/torch/distributed/logging_handlers.py**: For OSS, we simply use a NullHandler() for now.
3. Add associated test files for 1 and 2.

Test Plan:
## Unit Test
```
buck test @//mode/dev-nosan //caffe2/test/distributed:test_c10d_error_logger -- --print-passing-details
```
```
File changed: fbcode//caffe2/test/distributed/test_c10d_error_logger.py
File changed: fbsource//xplat/caffe2/test/distributed/TARGETS
9 additional file changes
waiting for all tests to finish...
✓ Listing success: caffe2/test/distributed:test_c10d_error_logger (0.2s)
Found 1 tests
✓ Pass: caffe2/test/distributed:test_c10d_error_logger - test_get_or_create_logger (caffe2.test.distributed.test_c10d_error_logger.C10dErrorLoggerTest) (0.2s)

stdout:

stderr:

Buck UI:      https://www.internalfb.com/buck2/b975f6b0-77e9-4287-8722-f95b48036181
Test Session: https://www.internalfb.com/intern/testinfra/testrun/1407375150206593
RE: reSessionID-4d7ab8ca-1051-48e9-a5a8-6edbe15d1fe4  Up: 124 B  Down: 0 B
Jobs completed: 5. Time elapsed: 3.5s.
Tests finished: Pass 1. Fail 0. Fatal 0. Skip 0. 0 builds failed
```

Differential Revision: D39920391

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87123
Approved by: https://github.com/fduwjj, https://github.com/H-Huang
---
 test/distributed/test_c10d_error_logger.py | 17 ++++++++++
 test/run_test.py                           |  1 +
 torch/distributed/c10d_error_logger.py     | 36 ++++++++++++++++++++++
 torch/distributed/logging_handlers.py      | 14 +++++++++
 4 files changed, 68 insertions(+)
 create mode 100644 test/distributed/test_c10d_error_logger.py
 create mode 100644 torch/distributed/c10d_error_logger.py
 create mode 100644 torch/distributed/logging_handlers.py

diff --git a/test/distributed/test_c10d_error_logger.py b/test/distributed/test_c10d_error_logger.py
new file mode 100644
index 0000000000000..8001f2b869d83
--- /dev/null
+++ b/test/distributed/test_c10d_error_logger.py
@@ -0,0 +1,17 @@
+# Owner(s): ["oncall: distributed"]
+
+import logging
+import unittest
+from unittest.mock import patch
+
+from torch.distributed.c10d_error_logger import _get_or_create_logger
+
+class C10dErrorLoggerTest(unittest.TestCase):
+
+    @patch("torch.distributed.c10d_error_logger._get_logging_handler")
+    def test_get_or_create_logger(self, logging_handler_mock):
+        logging_handler_mock.return_value = logging.NullHandler(), "NullHandler"
+        logger = _get_or_create_logger()
+        self.assertIsNotNone(logger)
+        self.assertEqual(1, len(logger.handlers))
+        self.assertIsInstance(logger.handlers[0], logging.NullHandler)
diff --git a/test/run_test.py b/test/run_test.py
index 35004406d0115..620a8b712aeeb 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -170,6 +170,7 @@ def skip_test_p(name: str) -> bool:
     "distributed/elastic/events/lib_test",
     "distributed/elastic/agent/server/test/api_test",
     "test_deploy",
+    "distributed/test_c10d_error_logger.py"
 ]
 
 WINDOWS_BLOCKLIST = [
diff --git a/torch/distributed/c10d_error_logger.py b/torch/distributed/c10d_error_logger.py
new file mode 100644
index 0000000000000..10605c69be476
--- /dev/null
+++ b/torch/distributed/c10d_error_logger.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Tuple
+
+from torch.distributed.logging_handlers import _log_handlers
+
+_c10d_error_logger = None
+
+
+def _get_or_create_logger() -> logging.Logger:
+    global _c10d_error_logger
+    if _c10d_error_logger:
+        return _c10d_error_logger
+    logging_handler, log_handler_name = _get_logging_handler()
+    _c10d_error_logger = logging.getLogger(f"c10d-collectives-{log_handler_name}")
+    _c10d_error_logger.setLevel(logging.DEBUG)
+    formatter = logging.Formatter(
+        "%(asctime)s %(filename)s:%(lineno)s %(levelname)s p:%(processName)s t:%(threadName)s: %(message)s"
+    )
+    logging_handler.setFormatter(formatter)
+    _c10d_error_logger.propagate = False
+    _c10d_error_logger.addHandler(logging_handler)
+    return _c10d_error_logger
+
+
+def _get_logging_handler(destination: str = "default") -> Tuple[logging.Handler, str]:
+    log_handler = _log_handlers[destination]
+    log_handler_name = type(log_handler).__name__
+    return (log_handler, log_handler_name)
diff --git a/torch/distributed/logging_handlers.py b/torch/distributed/logging_handlers.py
new file mode 100644
index 0000000000000..7c3b3249f6c79
--- /dev/null
+++ b/torch/distributed/logging_handlers.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Dict
+
+_log_handlers: Dict[str, logging.Handler] = {
+    "default": logging.NullHandler(),
+}

From ec6e885aa0fc4e2e457b21aebc03fccbdfde955f Mon Sep 17 00:00:00 2001
From: Alex <aslvrstn@gmail.com>
Date: Fri, 21 Oct 2022 19:03:00 +0000
Subject: [PATCH 0029/1922] Slowly introduce ops to be tested by test_numpy_ref
 on MPS backend (#87342)

Enable a test that would have caught https://github.com/pytorch/pytorch/issues/86239

Prior to the fix for that bug, this test fails with

```
_____________________________ TestCommonMPS.test_numpy_ref_mps_where_mps_float32 _____________________________
Traceback (most recent call last):
  File "/Users/alex/git/pytorch/test/test_ops.py", line 197, in test_numpy_ref_mps
    self.compare_with_reference(
  File "/Users/alex/git/pytorch/torch/testing/_internal/common_utils.py", line 2366, in compare_with_reference
    actual = torch_fn(t_inp, *t_args, **t_kwargs)
  File "/Users/alex/git/pytorch/torch/testing/_internal/opinfo/core.py", line 1068, in __call__
    return self.op(*args, **kwargs)
  File "/Users/alex/git/pytorch/torch/testing/_internal/common_methods_invocations.py", line 15167, in <lambda>
    op=lambda self, condition, other: torch.where(condition, self, other),
RuntimeError: 0'th index 3 of x tensor does not match the other tensors
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87342
Approved by: https://github.com/albanD
---
 test/test_mps.py                              | 75 +++++++++++++++++--
 torch/testing/_internal/common_device_type.py | 17 ++++-
 .../_internal/common_methods_invocations.py   | 40 ++++++++--
 .../_internal/opinfo/definitions/linalg.py    | 33 ++++++++
 .../_internal/opinfo/definitions/signal.py    | 21 ++++++
 5 files changed, 173 insertions(+), 13 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 9702239df95df..8eeae7dbcaf7b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -18,9 +18,10 @@
 from collections import defaultdict
 from torch._six import inf
 from torch.nn import Parameter
+from torch.testing._internal import opinfo
 from torch.testing._internal.common_utils import \
-    (gradcheck, gradgradcheck, run_tests, TestCase, download_file,
-     TEST_WITH_UBSAN, dtype_abbrs)
+    (gradcheck, gradgradcheck, run_tests, TestCase, download_file, IS_CI,
+     TEST_WITH_UBSAN, dtype_abbrs, skipIfSlowGradcheckEnv, TEST_WITH_ASAN, suppress_warnings)
 from torch.testing import make_tensor
 from torch.testing._comparison import TensorLikePair
 from torch.testing._internal.common_dtype import get_all_dtypes, integral_types
@@ -28,13 +29,31 @@
 from torch.distributions import Uniform, Exponential
 from functools import partial
 
-from torch.testing._internal.common_methods_invocations import op_db
-from torch.testing._internal.common_device_type import ops, instantiate_device_type_tests
+from torch.testing._internal.common_methods_invocations import (
+    op_db,
+    UnaryUfuncInfo,
+    ReductionOpInfo,
+    SpectralFuncInfo,
+    BinaryUfuncInfo,
+)
+from torch.testing._internal.common_device_type import ops, instantiate_device_type_tests, onlyMPS
 from torch.testing._internal.common_nn import NNTestCase
 import numpy as np
 import torch
 import torch.utils._pytree as pytree
 
+
+# Copied from `test_ops.py` for the purposes of duplicating `test_numpy_ref`
+_ref_test_ops = tuple(
+    filter(
+        lambda op: not isinstance(
+            op, (UnaryUfuncInfo, ReductionOpInfo, SpectralFuncInfo, BinaryUfuncInfo)
+        )
+        and op.ref is not None,
+        op_db,
+    )
+)
+
 # Same logic as test_cuda.py
 if not torch.backends.mps.is_available():
     print('MPS not available, skipping tests', file=sys.stderr)
@@ -7790,10 +7809,56 @@ def req_grad(t):
             # So each test append to the dict and write it.
             with open("new_mps_allowlist_grad.txt", "w") as f:
                 pprint.pprint(self.NEW_ALLOW_LIST_GRAD, stream=f)
+
+
+# Copied from `TestCommon` in `test_ops.py`, just enough to duplicate the `test_numpy_ref` for MPS
+@skipIfSlowGradcheckEnv
+class TestCommon(TestCase):
+    exact_dtype = True
+
+    # Verifies, on teardown, that no OpInfo is still using dynamic dtypes in CI
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+
+        if IS_CI:
+            err_msg = (
+                "The operator(s) below is(are) using dynamic_dtypes in the OpInfo entries."
+                "This is OK for testing, but be sure to set the dtypes manually before landing your PR!"
+            )
+            # Assure no opinfo entry has dynamic_dtypes
+            filtered_ops = list(filter(opinfo.utils.is_dynamic_dtype_set, op_db))
+            for op in filtered_ops:
+                fmt_str = opinfo.utils.str_format_dynamic_dtype(op)
+                err_msg += "\n" + fmt_str
+
+            assert len(filtered_ops) == 0, err_msg
+
+    # This is the MPS equivalent of `test_numpy_ref` from `test_ops.py`. It lives over here while
+    # MPS still requires some fairly heavy special casing in the test framework.
+    # When MPS becomes more consistent, this can probably be merged with that test using
+    # `@dtypesIfMPS(torch.float32)`, but for now, the assertions themselves need to be loosened
+    @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
+    @onlyMPS
+    @suppress_warnings
+    # MPS only supports float32
+    @ops(_ref_test_ops, allowed_dtypes=(torch.float32,))
+    def test_numpy_ref_mps(self, device, dtype, op):
+        # Unlike `test_numpy_ref`, this test compares in `float32` since at the time of this test's creation MPS
+        # does not support float64 Tensors.
+        # A few ops are currently broken on their reference inputs, but not their sample inputs. These should
+        # get patched up and this workaround removed.
+        broken_on_ref_inputs = op.name in ['cat', 'clamp', 'where']
+        inputs = op.reference_inputs(device, dtype) if not broken_on_ref_inputs else op.sample_inputs(device, dtype)
+        for sample_input in inputs:
+            self.compare_with_reference(op, op.ref, sample_input)
+
 # TODO: Actually instantiate that test for the "mps" device to better reflect what it is doing.
 # This requires mps to be properly registered in the device generic test framework which is not the
-# case right now.
+# case right now. We can probably use `allow_mps` introduced in https://github.com/pytorch/pytorch/pull/87342
+# to achieve this.
 instantiate_device_type_tests(TestConsistency, globals(), only_for="cpu")
+instantiate_device_type_tests(TestCommon, globals(), allow_mps=True)
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index aec7191c3c6eb..7d9f31330ef6d 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -10,6 +10,7 @@
 import unittest
 import os
 import torch
+import torch.backends.mps
 from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \
     skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \
     IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, \
@@ -198,6 +199,8 @@
 #         Skips the test if the device is not a CPU device
 #     - @onlyCUDA
 #         Skips the test if the device is not a CUDA device
+#     - @onlyMPS
+#         Skips the test if the device is not a MPS device
 #     - @skipCPUIfNoLapack
 #         Skips the test if the device is a CPU device and LAPACK is not installed
 #     - @skipCPUIfNoMkl
@@ -590,7 +593,7 @@ def filter_desired_device_types(device_type_test_bases, except_for=None, only_fo
 # The tests in these test cases are derived from the generic tests in
 # generic_test_class.
 # See note "Generic Device Type Testing."
-def instantiate_device_type_tests(generic_test_class, scope, except_for=None, only_for=None, include_lazy=False):
+def instantiate_device_type_tests(generic_test_class, scope, except_for=None, only_for=None, include_lazy=False, allow_mps=False):
     # Removes the generic test class from its enclosing scope so its tests
     # are not discoverable.
     del scope[generic_test_class.__name__]
@@ -609,9 +612,13 @@ def instantiate_device_type_tests(generic_test_class, scope, except_for=None, on
     generic_members = set(generic_test_class.__dict__.keys()) - set(empty_class.__dict__.keys())
     generic_tests = [x for x in generic_members if x.startswith('test')]
 
+    # MPS backend support is disabled in `get_device_type_test_bases` while support is being ramped
+    # up, so allow callers to specifically opt tests into being tested on MPS, similar to `include_lazy`
+    test_bases = device_type_test_bases.copy()
+    if allow_mps and torch.backends.mps.is_available() and MPSTestBase not in test_bases:
+        test_bases.append(MPSTestBase)
     # Filter out the device types based on user inputs
-    desired_device_type_test_bases = filter_desired_device_types(device_type_test_bases,
-                                                                 except_for, only_for)
+    desired_device_type_test_bases = filter_desired_device_types(test_bases, except_for, only_for)
     if include_lazy:
         # Note [Lazy Tensor tests in device agnostic testing]
         # Right now, test_view_ops.py runs with LazyTensor.
@@ -1143,6 +1150,10 @@ def onlyCUDA(fn):
     return onlyOn('cuda')(fn)
 
 
+def onlyMPS(fn):
+    return onlyOn('mps')(fn)
+
+
 def disablecuDNN(fn):
 
     @wraps(fn)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index e5d6e6efe18a9..34f54f2fb5ae1 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -6257,7 +6257,7 @@ def make_bool_mask(shape):
 
         if mask_t.sum() == 0:
             def random_index(shape):
-                return tuple(map(lambda max_idx: random.randint(0, max_idx), shape))
+                return tuple(map(lambda max_idx: random.randrange(0, max_idx), shape))
 
             mask_t[random_index(mask_t.shape)] = True
             return mask_t
@@ -6268,7 +6268,9 @@ def random_index(shape):
              ((M, 1, M), (M, M), (M, M, 1), True),
              ((), (), (), False),
              ((M, 1, M), (), (M, M, 1), True),
-             ((), (M, M), (), True),)
+             ((), (M, M), (), True),
+             ((), (2), (1, 1), True),
+             )
 
     for shape, mask_shape, other_shape, broadcasts_input in cases:
         yield SampleInput(make_arg(shape),
@@ -8206,6 +8208,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    toleranceOverride({torch.float32: tol(atol=1.3e-05, rtol=1.3e-05),
                                       torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}),
                    'TestCommon', 'test_numpy_refs'),
+               # MPS has slightly worse precision. Is this acceptable?
+               DecorateInfo(
+                   toleranceOverride({torch.float32: tol(atol=1.3e-04, rtol=1.3e-04),
+                                      torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}),
+                   'TestCommon', 'test_numpy_ref_mps'),
                DecorateInfo(
                    toleranceOverride({torch.float32: tol(atol=1e-5, rtol=1e-5)}),
                    'TestConsistency',
@@ -8701,6 +8708,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # TypeError: _copy_dispatcher() got an unexpected keyword argument 'memory_format'
                # (NumPy reference needs to be extended with memory_format)
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref'),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref_mps'),
            ),),
     OpInfo('contiguous',
            op=lambda x, *args, **kwargs: x.contiguous(*args, **kwargs),
@@ -10398,6 +10406,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # Extremal value issue on aten::native_layer_norm, which returns 'nan' for mean on 'inf' inputs
                # possibly because of the welford implementation.
                DecorateInfo(unittest.skip("Skipped!"), 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'),
+               DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'),
            )),
     OpInfo('native_batch_norm',
            aten_name='native_batch_norm',
@@ -10664,6 +10673,19 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # RuntimeError: !lhs.isAliasOf(rhs)INTERNAL ASSERT FAILED at
                # "../torch/csrc/jit/passes/utils/check_alias_annotation.cpp":104, please report a bug to PyTorch.
                DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
+               # RuntimeError: UNSUPPORTED DTYPE: complex
+               DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness',
+                            dtypes=(torch.complex64, torch.complex128)),
+               # RuntimeError: "slow_conv2d_cpu_grad_input" not implemented for 'Long'
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref',
+                            dtypes=(torch.int64,)),
+               # Reference: https://github.com/pytorch/pytorch/issues/86356
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref',
+                            dtypes=(torch.double, torch.cdouble)),
+               DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'),
+               # AssertionError: None mismatch: torch.complex64 is not None
+               DecorateInfo(unittest.expectedFailure, 'TestDtypeCustomRules', 'test_custom_rules',
+                            dtypes=(torch.complex64, torch.complex128)),
            ),
            supports_out=False,),
     OpInfo('nn.functional.conv_transpose3d',
@@ -10826,7 +10848,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(
                    toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1e-03)}),
                    'TestCommon', 'test_numpy_refs'
-               )
+               ),
+               DecorateInfo(unittest.skip("Bug in MPS backend!"), 'TestCommon', 'test_numpy_ref_mps'),
            ],
            sample_inputs_func=sample_inputs_layer_norm,
            supports_expanded_weight=True,),
@@ -12116,7 +12139,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            skips=(
                # AssertionError: Tensor-likes are not close!
                # May not replicate in CI
-               DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out'),)),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out'),
+               DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'),
+           )),
     UnaryUfuncInfo('nn.functional.relu6',
                    aten_name="relu6",
                    dtypes=all_types_and(torch.bfloat16),
@@ -14507,6 +14532,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # JIT tests don't work with Tensor keyword arguments
                # https://github.com/pytorch/pytorch/issues/58507
                DecorateInfo(unittest.skip("Expected failure!"), 'TestJit', 'test_variant_consistency_jit'),
+               DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'),
            )),
     OpInfo('cat',
            ref=_cat_np,
@@ -16123,7 +16149,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_pdist,
         dtypes=floating_types(),
         supports_out=False,
-        supports_gradgrad=False),
+        supports_gradgrad=False,
+        skips=(
+            DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'),
+        )
+    ),
     OpInfo(
         "nn.functional.poisson_nll_loss",
         dtypes=all_types_and(torch.bfloat16),
diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py
index 2d899dcd0ca24..193f1f2db85cc 100644
--- a/torch/testing/_internal/opinfo/definitions/linalg.py
+++ b/torch/testing/_internal/opinfo/definitions/linalg.py
@@ -1093,6 +1093,13 @@ def make_input():
         supports_out=True,
         supports_fwgrad_bwgrad=True,
         supports_forward_ad=True,
+        skips=(
+            DecorateInfo(
+                unittest.skip("Unsupported on MPS for now"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
+        ),
     ),
     OpInfo(
         "linalg.det",
@@ -1211,6 +1218,11 @@ def make_input():
                 "test_schema_correctness",
                 dtypes=(torch.complex64, torch.complex128),
             ),
+            DecorateInfo(
+                unittest.skip("Unsupported on MPS for now"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
         ),
     ),
     OpInfo(
@@ -1647,6 +1659,13 @@ def make_input():
         supports_fwgrad_bwgrad=True,
         supports_out=False,
         sample_inputs_func=sample_inputs_linalg_vander,
+        skips=(
+            DecorateInfo(
+                unittest.skip("Unsupported on MPS for now"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
+        ),
     ),
     ReductionOpInfo(
         "linalg.vector_norm",
@@ -2123,6 +2142,13 @@ def make_input():
         # See https://github.com/pytorch/pytorch/pull/78358
         check_batched_forward_grad=False,
         decorators=[skipCPUIfNoLapack, skipCUDAIfNoMagmaAndNoCusolver],
+        skips=(
+            DecorateInfo(
+                unittest.skip("Unsupported on MPS for now"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
+        ),
     ),
     OpInfo(
         "linalg.tensorsolve",
@@ -2141,6 +2167,13 @@ def make_input():
                 device_type="cuda",
             ),
         ],
+        skips=(
+            DecorateInfo(
+                unittest.skip("Unsupported on MPS for now"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
+        ),
     ),
 ]
 
diff --git a/torch/testing/_internal/opinfo/definitions/signal.py b/torch/testing/_internal/opinfo/definitions/signal.py
index 1f1c8d7e6a6d1..3b7f3e4de4001 100644
--- a/torch/testing/_internal/opinfo/definitions/signal.py
+++ b/torch/testing/_internal/opinfo/definitions/signal.py
@@ -291,6 +291,13 @@ def make_signal_windows_opinfo(
         sample_inputs_func=sample_inputs_window,
         reference_inputs_func=reference_inputs_window,
         error_inputs_func=error_inputs_window,
+        skips=(
+            DecorateInfo(
+                unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
+        ),
     ),
     make_signal_windows_opinfo(
         name="signal.windows.exponential",
@@ -300,6 +307,13 @@ def make_signal_windows_opinfo(
         sample_inputs_func=partial(sample_inputs_window, tau=2.78),
         reference_inputs_func=partial(reference_inputs_exponential_window, tau=2.78),
         error_inputs_func=error_inputs_exponential_window,
+        skips=(
+            DecorateInfo(
+                unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
+        ),
     ),
     make_signal_windows_opinfo(
         name="signal.windows.gaussian",
@@ -309,5 +323,12 @@ def make_signal_windows_opinfo(
         sample_inputs_func=partial(sample_inputs_window, std=1.92),
         reference_inputs_func=partial(reference_inputs_gaussian_window, std=1.92),
         error_inputs_func=error_inputs_gaussian_window,
+        skips=(
+            DecorateInfo(
+                unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
+        ),
     ),
 ]

From bd60463f279b8aad2308562392d76f0d5ba6f08a Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 21 Oct 2022 19:14:28 +0000
Subject: [PATCH 0030/1922] [BE] Remove pip and conda installation in Linux
 build workflow (#87256)

All the dependencies should come from the Docker container already. This only updates Linux build workflow, Linux test workflow comes later in a separate PR.

The `opt-einsum` package that was installed as part of PyTorch wheel has already been installed in the Docker container [requirements-ci.txt](https://github.com/pytorch/pytorch/blob/master/.circleci/docker/requirements-ci.txt#L127)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87256
Approved by: https://github.com/malfet
---
 .jenkins/pytorch/build-asan.sh   | 2 +-
 .jenkins/pytorch/build-tsan.sh   | 2 +-
 .jenkins/pytorch/build.sh        | 7 +------
 .jenkins/pytorch/common_utils.sh | 6 ++++++
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.jenkins/pytorch/build-asan.sh b/.jenkins/pytorch/build-asan.sh
index d2cafa323fc56..91953c322f223 100755
--- a/.jenkins/pytorch/build-asan.sh
+++ b/.jenkins/pytorch/build-asan.sh
@@ -26,7 +26,7 @@ CC="clang" CXX="clang++" LDSHARED="clang --shared" \
   CFLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fsanitize-address-use-after-scope -shared-libasan" \
   USE_ASAN=1 USE_CUDA=0 USE_MKLDNN=0 \
   python setup.py bdist_wheel
-  python -mpip install "$(echo dist/*.whl)[opt-einsum]"
+  pip_install_whl "$(echo dist/*.whl)"
 
 # Test building via the sdist source tarball
 python setup.py sdist
diff --git a/.jenkins/pytorch/build-tsan.sh b/.jenkins/pytorch/build-tsan.sh
index 41ebdd5cb1eed..e10edb310d813 100755
--- a/.jenkins/pytorch/build-tsan.sh
+++ b/.jenkins/pytorch/build-tsan.sh
@@ -22,7 +22,7 @@ CC="clang" CXX="clang++" LDSHARED="clang --shared" \
   CFLAGS="-fsanitize=thread" \
   USE_TSAN=1 USE_CUDA=0 USE_MKLDNN=0 \
   python setup.py bdist_wheel
-  python -mpip install dist/*.whl
+  pip_install_whl "$(echo dist/*.whl)"
 
 print_sccache_stats
 
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index 24567449424a6..58cdc1227ac2d 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -62,9 +62,6 @@ elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
   export ATEN_THREADING=NATIVE
 fi
 
-# TODO: Don't run this...
-pip_install -r requirements.txt || true
-
 # Enable LLVM dependency for TensorExpr testing
 if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
   export USE_LLVM=/opt/rocm/llvm
@@ -74,13 +71,11 @@ else
   export LLVM_DIR=/opt/llvm/lib/cmake/llvm
 fi
 
-# TODO: Don't install this here
 if ! which conda; then
   # In ROCm CIs, we are doing cross compilation on build machines with
   # intel cpu and later run tests on machines with amd cpu.
   # Also leave out two builds to make sure non-mkldnn builds still work.
   if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
-    pip_install mkl mkl-devel
     export USE_MKLDNN=1
   else
     export USE_MKLDNN=0
@@ -230,7 +225,7 @@ else
     else
       python setup.py bdist_wheel
     fi
-    python -mpip install "$(echo dist/*.whl)[opt-einsum]"
+    pip_install_whl "$(echo dist/*.whl)"
 
     # TODO: I'm not sure why, but somehow we lose verbose commands
     set -x
diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh
index c0e51bc80aa8c..d8c853f97ab23 100644
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.jenkins/pytorch/common_utils.sh
@@ -49,6 +49,12 @@ function assert_git_not_dirty() {
     fi
 }
 
+function pip_install_whl() {
+  # This is used to install PyTorch and other build artifacts wheel locally
+  # without using any network connection
+  python3 -mpip install --no-index --no-deps "$@"
+}
+
 function pip_install() {
   # retry 3 times
   # old versions of pip don't have the "--progress-bar" flag

From 71a05b857d14033e5192cbc584ea414aa05f7fd1 Mon Sep 17 00:00:00 2001
From: samdow <samdow@fb.com>
Date: Wed, 19 Oct 2022 10:36:40 -0400
Subject: [PATCH 0031/1922] [Modes] refactor modes to only use a stack in cpp
 (#86458)

Refactors the mode code to only have the C++ mode stack and not the "C++ mode" like we originally had. This also simplifies the mode logic in a number of places
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86458
Approved by: https://github.com/zou3519
---
 aten/src/ATen/PythonTorchFunctionTLS.cpp      | 16 +-----
 aten/src/ATen/PythonTorchFunctionTLS.h        |  9 +--
 aten/src/ATen/core/PythonFallbackKernel.cpp   |  7 ++-
 c10/core/TensorImpl.cpp                       |  9 +--
 c10/core/impl/TorchDispatchModeTLS.cpp        | 55 ++++++-------------
 c10/core/impl/TorchDispatchModeTLS.h          | 11 ----
 test/test_overrides.py                        |  4 +-
 test/test_python_dispatch.py                  |  2 +-
 torch/csrc/autograd/init.cpp                  | 31 +----------
 torch/csrc/autograd/python_variable.cpp       |  4 +-
 torch/csrc/jit/python/pybind_utils.cpp        |  3 +-
 torch/csrc/utils/disable_torch_function.cpp   |  2 +-
 torch/csrc/utils/python_arg_parser.cpp        | 42 +++++++-------
 torch/csrc/utils/python_arg_parser.h          |  2 +-
 torch/csrc/utils/python_torch_function_mode.h | 15 +++--
 torch/csrc/utils/torch_dispatch_mode.h        | 24 +++++++-
 torch/overrides.py                            | 38 ++-----------
 torch/utils/_python_dispatch.py               | 22 +-------
 18 files changed, 98 insertions(+), 198 deletions(-)

diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp
index c4e1241805a88..c9487c6958cbf 100644
--- a/aten/src/ATen/PythonTorchFunctionTLS.cpp
+++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp
@@ -6,18 +6,6 @@ namespace impl {
 
 static thread_local PythonTorchFunctionTLS pythonTorchFunctionState;
 
-void PythonTorchFunctionTLS::set_mode(std::shared_ptr<c10::SafePyObject> mode) {
-  pythonTorchFunctionState.mode_ = std::move(mode);
-}
-
-const std::shared_ptr<c10::SafePyObject>& PythonTorchFunctionTLS::get_mode() {
-  return pythonTorchFunctionState.mode_;
-}
-
-void PythonTorchFunctionTLS::swap_mode(std::shared_ptr<c10::SafePyObject>& mode) {
-  pythonTorchFunctionState.mode_.swap(mode);
-}
-
 void PythonTorchFunctionTLS::push_onto_stack(std::shared_ptr<SafePyObject> mode) {
   pythonTorchFunctionState.stack_.push_back(std::move(mode));
 }
@@ -54,8 +42,8 @@ const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() {
   return pythonTorchFunctionState;
 }
 
-bool function_mode_enabled() {
-  return static_cast<bool>(PythonTorchFunctionTLS::get_mode());
+bool torch_function_mode_enabled() {
+  return PythonTorchFunctionTLS::stack_len() > 0;
 }
 
 } // namespace impl
diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h
index ef283164246d3..5940fb6f2dee2 100644
--- a/aten/src/ATen/PythonTorchFunctionTLS.h
+++ b/aten/src/ATen/PythonTorchFunctionTLS.h
@@ -10,10 +10,6 @@ struct TORCH_API PythonTorchFunctionTLS {
   static void set_disabled(bool);
   static bool is_disabled();
 
-  static void set_mode(std::shared_ptr<c10::SafePyObject>);
-  static const std::shared_ptr<c10::SafePyObject>& get_mode();
-  static void swap_mode(std::shared_ptr<c10::SafePyObject>&);
-
   static void push_onto_stack(std::shared_ptr<SafePyObject> mode);
   static const std::shared_ptr<SafePyObject> pop_stack();
   static const std::shared_ptr<SafePyObject>& get_stack_at(int64_t idx);
@@ -26,16 +22,13 @@ struct TORCH_API PythonTorchFunctionTLS {
   // The mode TLS is split into
   //   - disabled_, which says whether or not to disable all torch function
   //   modes
-  //   - mode_, which is the C++ mode, that can only be the mode handling mode
-  //   or null
   //   - stack_, which is a vector of modes representing the stack of user
   //   defined modes
   bool disabled_;
-  std::shared_ptr<c10::SafePyObject> mode_ = nullptr;
   std::vector<std::shared_ptr<c10::SafePyObject>> stack_;
 };
 
-TORCH_API bool function_mode_enabled();
+TORCH_API bool torch_function_mode_enabled();
 
 } // namespace impl
 } // namespace at
diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp
index fcdb018b6ff7b..e16874a83f966 100644
--- a/aten/src/ATen/core/PythonFallbackKernel.cpp
+++ b/aten/src/ATen/core/PythonFallbackKernel.cpp
@@ -52,9 +52,10 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
 
 
   // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch
-  const auto& maybe_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_mode();
-  if (maybe_torch_dispatch_mode_state) {
-    maybe_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack);
+  const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len();
+  if (mode_stack_len > 0) {
+    const auto& cur_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1);
+    cur_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack);
     return;
   }
 
diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp
index 3951578a848cc..976382cf2ee7f 100644
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@@ -611,12 +611,13 @@ c10::intrusive_ptr<TensorImpl> TensorImpl::shallow_copy_and_detach_core(
     VariableVersion&& version_counter,
     bool allow_tensor_metadata_change) const {
   c10::intrusive_ptr<TensorImpl> r;
-  const auto& maybe_torch_dispatch_mode_state =
-      c10::impl::TorchDispatchModeTLS::get_mode();
+  const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len();
   // TODO: do we have to exclude after Python dispatch key set?
-  if (maybe_torch_dispatch_mode_state &&
+  if (mode_stack_len > 0 &&
       !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
-    r = maybe_torch_dispatch_mode_state->pyinterpreter()->detach(this);
+    const auto& cur_torch_dispatch_mode_state =
+        c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1);
+    r = cur_torch_dispatch_mode_state->pyinterpreter()->detach(this);
   } else if (
       key_set_.has(DispatchKey::Python) &&
       !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) {
diff --git a/c10/core/impl/TorchDispatchModeTLS.cpp b/c10/core/impl/TorchDispatchModeTLS.cpp
index 5f02686584255..6755657b73687 100644
--- a/c10/core/impl/TorchDispatchModeTLS.cpp
+++ b/c10/core/impl/TorchDispatchModeTLS.cpp
@@ -8,44 +8,12 @@ namespace impl {
 
 thread_local TorchDispatchModeTLS torchDispatchModeState;
 
-// MODE
-void TorchDispatchModeTLS::set_mode(std::shared_ptr<SafePyObject> mode) {
-  if (mode) {
-    c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true);
-    c10::impl::tls_set_dispatch_key_included(
-        DispatchKey::PythonTLSSnapshot, true);
-  } else {
-    TorchDispatchModeTLS::reset_mode();
-  }
-  torchDispatchModeState.mode_ = std::move(mode);
-}
-
-const std::shared_ptr<SafePyObject>& TorchDispatchModeTLS::get_mode() {
-  return torchDispatchModeState.mode_;
-}
-
-void TorchDispatchModeTLS::reset_mode() {
-  torchDispatchModeState.mode_.reset();
-  c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
-  c10::impl::tls_set_dispatch_key_included(
-      DispatchKey::PythonTLSSnapshot, false);
-}
-
-void TorchDispatchModeTLS::swap_mode(std::shared_ptr<SafePyObject>& mode) {
-  if (mode) {
+void TorchDispatchModeTLS::push_onto_stack(std::shared_ptr<SafePyObject> mode) {
+  if (torchDispatchModeState.stack_.size() == 0) {
     c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true);
     c10::impl::tls_set_dispatch_key_included(
         DispatchKey::PythonTLSSnapshot, true);
-  } else {
-    c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
-    c10::impl::tls_set_dispatch_key_included(
-        DispatchKey::PythonTLSSnapshot, false);
   }
-  torchDispatchModeState.mode_.swap(mode);
-}
-
-// STACK
-void TorchDispatchModeTLS::push_onto_stack(std::shared_ptr<SafePyObject> mode) {
   torchDispatchModeState.stack_.push_back(std::move(mode));
 }
 
@@ -56,6 +24,12 @@ const std::shared_ptr<SafePyObject> TorchDispatchModeTLS::pop_stack() {
   const std::shared_ptr<SafePyObject> out =
       torchDispatchModeState.stack_.back();
   torchDispatchModeState.stack_.pop_back();
+
+  if (torchDispatchModeState.stack_.size() == 0) {
+    c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
+    c10::impl::tls_set_dispatch_key_included(
+        DispatchKey::PythonTLSSnapshot, false);
+  }
   return out;
 }
 
@@ -71,20 +45,27 @@ int64_t TorchDispatchModeTLS::stack_len() {
   return torchDispatchModeState.stack_.size();
 }
 
-// STATE
-
 const TorchDispatchModeTLS& TorchDispatchModeTLS::get_state() {
   return torchDispatchModeState;
 }
 
 void TorchDispatchModeTLS::set_state(const TorchDispatchModeTLS& state) {
   torchDispatchModeState = state;
+  if (torchDispatchModeState.stack_.size() == 0) {
+    c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false);
+    c10::impl::tls_set_dispatch_key_included(
+        DispatchKey::PythonTLSSnapshot, false);
+  } else {
+    c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true);
+    c10::impl::tls_set_dispatch_key_included(
+        DispatchKey::PythonTLSSnapshot, true);
+  }
 }
 
 // UTIL
 
 bool dispatch_mode_enabled() {
-  return static_cast<bool>(c10::impl::TorchDispatchModeTLS::get_mode());
+  return TorchDispatchModeTLS::stack_len() > 0;
 }
 
 } // namespace impl
diff --git a/c10/core/impl/TorchDispatchModeTLS.h b/c10/core/impl/TorchDispatchModeTLS.h
index 708c22e014ad4..da30d0460427c 100644
--- a/c10/core/impl/TorchDispatchModeTLS.h
+++ b/c10/core/impl/TorchDispatchModeTLS.h
@@ -9,11 +9,6 @@ namespace c10 {
 namespace impl {
 
 struct C10_API TorchDispatchModeTLS {
-  static void set_mode(std::shared_ptr<SafePyObject> mode);
-  static const std::shared_ptr<SafePyObject>& get_mode();
-  static void reset_mode();
-  static void swap_mode(std::shared_ptr<SafePyObject>& mode);
-
   static void push_onto_stack(std::shared_ptr<SafePyObject> mode);
   static const std::shared_ptr<SafePyObject> pop_stack();
   static const std::shared_ptr<SafePyObject>& get_stack_at(int64_t idx);
@@ -23,12 +18,6 @@ struct C10_API TorchDispatchModeTLS {
   static void set_state(const TorchDispatchModeTLS& state);
 
  private:
-  // The mode TLS is split into
-  //   - mode_, which is the C++ mode, that can only be the mode handling mode
-  //   or null
-  //   - stack_, which is a vector of modes representing the stack of user
-  //   defined modes
-  std::shared_ptr<c10::SafePyObject> mode_;
   std::vector<std::shared_ptr<c10::SafePyObject>> stack_;
 };
 
diff --git a/test/test_overrides.py b/test/test_overrides.py
index e9e01684bda53..879b27277f0d8 100644
--- a/test/test_overrides.py
+++ b/test/test_overrides.py
@@ -1175,7 +1175,7 @@ def __torch_function__(self, *args, **kwargs):
             self.assertEqual(torch.mm(x, x), -1)
             self.assertEqual(bar(x), 1)
             self.assertRaisesRegex(
-                TypeError, r'SubTensor.+TorchFunctionStackMode',
+                TypeError, r'SubTensor',
                 lambda: self.assertEqual(torch.max(x, x)))
 
     def test_with_mode(self):
@@ -1248,7 +1248,7 @@ def __torch_function__(cls, func, _, args=(), kwargs=None):
                 return func(args, kwargs)
 
         x = torch.tensor(5.)
-        with self.assertRaisesRegex(RuntimeError, "should be a normal method not a class method"):
+        with self.assertRaisesRegex(RuntimeError, "classmethod is not supported, please make it a plain method"):
             with A():
                 x + x
 
diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py
index dea96d19b74c4..380f85f568f72 100644
--- a/test/test_python_dispatch.py
+++ b/test/test_python_dispatch.py
@@ -1050,7 +1050,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
                 return func(args, kwargs)
 
         x = torch.tensor(5.)
-        with self.assertRaisesRegex(RuntimeError, "should be a normal method not a class method"):
+        with self.assertRaisesRegex(RuntimeError, "classmethod is not supported, please make it a plain method"):
             with A():
                 x + x
 
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 007150002dbb6..ee963232d3166 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -28,6 +28,7 @@
 #include <torch/csrc/utils/disable_torch_function.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/pycfunction_helpers.h>
+#include <torch/csrc/utils/python_torch_function_mode.h>
 
 #include <set>
 #include <unordered_set>
@@ -606,24 +607,11 @@ static PyObject* python_exit_dual_level(
   END_HANDLE_TH_ERRORS
 }
 
-static PyObject* set_torch_function_mode(PyObject* _unused, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  if (arg == Py_None) {
-    at::impl::PythonTorchFunctionTLS::set_mode(nullptr);
-  } else {
-    Py_INCREF(arg);
-    at::impl::PythonTorchFunctionTLS::set_mode(
-        std::make_shared<c10::SafePyObject>(arg, getPyInterpreter()));
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS;
-}
-
 static PyObject* is_torch_function_mode_enabled(
     PyObject* _unused,
     PyObject* _unused2) {
   HANDLE_TH_ERRORS
-  if (at::impl::function_mode_enabled()) {
+  if (at::impl::torch_function_mode_enabled()) {
     Py_RETURN_TRUE;
   } else {
     Py_RETURN_FALSE;
@@ -682,19 +670,6 @@ static PyObject* len_torch_function_stack(
   END_HANDLE_TH_ERRORS
 }
 
-static PyObject* set_torch_dispatch_mode(PyObject* _unused, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  if (arg == Py_None) {
-    c10::impl::TorchDispatchModeTLS::set_mode(nullptr);
-  } else {
-    Py_INCREF(arg);
-    c10::impl::TorchDispatchModeTLS::set_mode(
-        std::make_shared<c10::SafePyObject>(arg, getPyInterpreter()));
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS;
-}
-
 static PyObject* push_on_torch_dispatch_stack(
     PyObject* _unused,
     PyObject* arg) {
@@ -795,7 +770,6 @@ static PyMethodDef methods[] = { // NOLINT
      is_torch_function_mode_enabled,
      METH_NOARGS,
      nullptr},
-    {"_set_torch_function_mode", set_torch_function_mode, METH_O, nullptr},
     {"_push_on_torch_function_stack",
      push_on_torch_function_stack,
      METH_O,
@@ -812,7 +786,6 @@ static PyMethodDef methods[] = { // NOLINT
      len_torch_function_stack,
      METH_NOARGS,
      nullptr},
-    {"_set_torch_dispatch_mode", set_torch_dispatch_mode, METH_O, nullptr},
     {"_push_on_torch_dispatch_stack",
      push_on_torch_dispatch_stack,
      METH_O,
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 9b52f7b50943a..66b8ad2d8351b 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -686,7 +686,9 @@ static PyObject* THPVariable_make_subclass(
     throw torch::TypeError(
         "cls must be a type (got %s)", Py_TYPE(cls)->tp_name);
   }
-  torch_dispatch_mode::StashTorchDispatchModeGuard td_g;
+  // guard completely turns off torch dispatch modes, doesn't just pop off the
+  // stack
+  torch_dispatch_mode::StashTorchDispatchStackGuard td_g;
   c10::impl::DisablePythonDispatcher dpd_g;
   auto data =
       r.tensor(1).detach(); // creates a fresh Tensor (DEFINITELY_UNINITIALIZED)
diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp
index 60c7247ada62a..68317f76524b2 100644
--- a/torch/csrc/jit/python/pybind_utils.cpp
+++ b/torch/csrc/jit/python/pybind_utils.cpp
@@ -755,8 +755,7 @@ py::object _get_operation_for_overload_or_packet(
         total_arg_num,
         false /* throw_error */);
   }
-  if (overloaded_args.size() > 0 ||
-      at::impl::PythonTorchFunctionTLS::get_mode()) {
+  if (overloaded_args.size() > 0 || at::impl::torch_function_mode_enabled()) {
     py::object ret;
     std::string ns = symbol.ns().toUnqualString();
     std::string method_name = symbol.toUnqualString();
diff --git a/torch/csrc/utils/disable_torch_function.cpp b/torch/csrc/utils/disable_torch_function.cpp
index ac29a9157a9c1..3031493a704f6 100644
--- a/torch/csrc/utils/disable_torch_function.cpp
+++ b/torch/csrc/utils/disable_torch_function.cpp
@@ -221,7 +221,7 @@ inline bool has_torch_function_attr(PyObject* obj) {
 
 namespace torch {
 auto check_has_torch_function(PyObject* obj, bool ignore_mode) -> bool {
-  if (!ignore_mode && at::impl::PythonTorchFunctionTLS::get_mode())
+  if (!ignore_mode && at::impl::torch_function_mode_enabled())
     return true;
   PyTypeObject* tp = Py_TYPE(obj);
   return (
diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp
index b1b0d2769df46..177346614704f 100644
--- a/torch/csrc/utils/python_arg_parser.cpp
+++ b/torch/csrc/utils/python_arg_parser.cpp
@@ -289,27 +289,41 @@ auto handle_torch_function_no_python_arg_parser(
   py::tuple py_types = py::cast(overloaded_types);
   py::object ret;
   PyObject* mode_obj = nullptr;
+
   const bool is_torch_function =
       torch_function_name == TorchFunctionName::TorchFunction;
-  auto get_mode = [&]() {
-    return is_torch_function ? at::impl::PythonTorchFunctionTLS::get_mode()
-                             : c10::impl::TorchDispatchModeTLS::get_mode();
+  auto get_stack_len = [&]() {
+    return is_torch_function ? at::impl::PythonTorchFunctionTLS::stack_len()
+                             : c10::impl::TorchDispatchModeTLS::stack_len();
   };
 
-  const auto& maybe_mode = get_mode();
-  if (maybe_mode) {
-    mode_obj = maybe_mode->ptr(getPyInterpreter());
-    TORCH_INTERNAL_ASSERT(py_types.ptr() != nullptr);
-    TORCH_INTERNAL_ASSERT(args != nullptr);
+  if (get_stack_len() > 0) {
     // Disable mode on the inside; this makes for a more user-friendly
     // experience if you try to, e.g., print your tensors.
     at::optional<torch::overrides::StashTorchFunctionModeGuard> tf_g;
     at::optional<torch_dispatch_mode::StashTorchDispatchModeGuard> td_g;
     if (is_torch_function) {
       tf_g.emplace();
+      mode_obj = tf_g->get_cur_mode()->ptr(getPyInterpreter());
     } else {
       td_g.emplace();
+      mode_obj = td_g->get_cur_mode()->ptr(getPyInterpreter());
     }
+    py::object torch_function =
+        PyObject_FastGetAttrString(mode_obj, torch_function_name_str);
+    if (!torch_function) {
+      TORCH_INTERNAL_ASSERT(0);
+    }
+    TORCH_INTERNAL_ASSERT(py_types.ptr() != nullptr);
+    TORCH_INTERNAL_ASSERT(args != nullptr);
+
+    TORCH_CHECK(
+        PyObject_FastGetAttrString(torch_function.ptr(), "__self__")
+            .is(py::reinterpret_borrow<py::object>(mode_obj)),
+        "Defining your mode's `",
+        torch_function_name_str,
+        "` as a classmethod is not supported, please make it a plain method");
+
     // Blegh.  This accidentally works in PyObject_CallFunctionObjArgs below
     // because the nullptr terminates the argument list ick ick ick.
     if (kwargs == nullptr) {
@@ -393,18 +407,6 @@ auto handle_torch_function_no_python_arg_parser(
       }
     }
     ss << "]";
-    if (mode_obj) {
-      // Note [Paranoid check mode is same]
-      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-      // If a user forcibly changes the mode in a non-lexical way
-      // in the inner context, the mode could be invalid here.  So just be
-      // a bit safe, it doesn't cost us anything since this is error reporting
-      const auto& maybe_mode = get_mode();
-      TORCH_INTERNAL_ASSERT(
-          maybe_mode && mode_obj == maybe_mode->ptr(getPyInterpreter()));
-      ss << " nor was it found on the currently active mode "
-         << py::repr(mode_obj);
-    }
     const std::string& tmp = ss.str();
     PyErr_SetString(PyExc_TypeError, tmp.c_str());
     throw python_error();
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index 9b23af5829786..a08441369db82 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -443,7 +443,7 @@ inline PythonArgs PythonArgParser::parse(PyObject* self, ParsedArgs<0>& dst) {
 
 inline bool PythonArgs::has_torch_function() {
   return !this->signature.overloaded_args.empty() ||
-      at::impl::PythonTorchFunctionTLS::get_mode();
+      at::impl::torch_function_mode_enabled();
 }
 
 inline std::string PythonArgs::get_func_name() {
diff --git a/torch/csrc/utils/python_torch_function_mode.h b/torch/csrc/utils/python_torch_function_mode.h
index 5faf75778469d..f6652dfd93084 100644
--- a/torch/csrc/utils/python_torch_function_mode.h
+++ b/torch/csrc/utils/python_torch_function_mode.h
@@ -5,21 +5,20 @@
 namespace torch {
 namespace overrides {
 
-// Corresponds to torch.overrides._no_torch_function_mode.  We discourage use
-// of this in userland because it's non-compositional; there might be another
-// mode waiting to go after you, and you shouldn't just blindly disable it.
-// From C++ side, there is no such thing as compositional modes, there is one
-// mode and of course you should be able to clear it.
 struct StashTorchFunctionModeGuard {
   StashTorchFunctionModeGuard() {
-    at::impl::PythonTorchFunctionTLS::swap_mode(old_mode_);
+    cur_mode_ = at::impl::PythonTorchFunctionTLS::pop_stack();
   }
   ~StashTorchFunctionModeGuard() {
-    at::impl::PythonTorchFunctionTLS::set_mode(std::move(old_mode_));
+    at::impl::PythonTorchFunctionTLS::push_onto_stack(cur_mode_);
+  }
+
+  const std::shared_ptr<c10::SafePyObject>& get_cur_mode() {
+    return cur_mode_;
   }
 
  private:
-  std::shared_ptr<c10::SafePyObject> old_mode_;
+  std::shared_ptr<c10::SafePyObject> cur_mode_;
 };
 
 } // namespace overrides
diff --git a/torch/csrc/utils/torch_dispatch_mode.h b/torch/csrc/utils/torch_dispatch_mode.h
index 81b219f71c095..2c97a7d96c320 100644
--- a/torch/csrc/utils/torch_dispatch_mode.h
+++ b/torch/csrc/utils/torch_dispatch_mode.h
@@ -8,16 +8,36 @@ namespace torch_dispatch_mode {
 struct StashTorchDispatchModeGuard {
  public:
   StashTorchDispatchModeGuard() {
-    c10::impl::TorchDispatchModeTLS::swap_mode(saved_mode_);
+    saved_mode_ = c10::impl::TorchDispatchModeTLS::pop_stack();
   }
 
   ~StashTorchDispatchModeGuard() {
-    c10::impl::TorchDispatchModeTLS::set_mode(std::move(saved_mode_));
+    c10::impl::TorchDispatchModeTLS::push_onto_stack(std::move(saved_mode_));
+  }
+
+  const std::shared_ptr<c10::SafePyObject>& get_cur_mode() {
+    return saved_mode_;
   }
 
  private:
   std::shared_ptr<at::SafePyObject> saved_mode_;
 };
 
+struct StashTorchDispatchStackGuard {
+ public:
+  StashTorchDispatchStackGuard() {
+    const auto old = c10::impl::TorchDispatchModeTLS::get_state();
+    c10::impl::TorchDispatchModeTLS::set_state(saved_state_);
+    saved_state_ = std::move(old);
+  }
+
+  ~StashTorchDispatchStackGuard() {
+    c10::impl::TorchDispatchModeTLS::set_state(std::move(saved_state_));
+  }
+
+ private:
+  c10::impl::TorchDispatchModeTLS saved_state_;
+};
+
 } // namespace torch_dispatch_mode
 } // namespace torch
diff --git a/torch/overrides.py b/torch/overrides.py
index 0d252f1114aa8..c463cf3ca94d4 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -26,7 +26,7 @@
 import functools
 import types
 import warnings
-from typing import Dict, Set, List, Any, Callable, Iterable, Type, Iterator, Tuple
+from typing import Dict, Set, List, Any, Callable, Iterable, Type, Tuple
 import contextlib
 
 import torch
@@ -34,7 +34,7 @@
     _has_torch_function, _has_torch_function_unary,
     _has_torch_function_variadic, _add_docstr,
     _push_on_torch_function_stack, _pop_torch_function_stack, _get_function_stack_at, _len_torch_function_stack,
-    _set_torch_function_mode, _is_torch_function_mode_enabled)
+    _is_torch_function_mode_enabled)
 
 __all__ = [
     "get_ignored_functions",
@@ -1512,8 +1512,8 @@ def handle_torch_function(
     if _is_torch_function_mode_enabled():
         # if we're here, the mode must be set to a TorchFunctionStackMode
         # this unsets it and calls directly into TorchFunctionStackMode's torch function
-        with _no_torch_function_mode():
-            result = _TorchFunctionStackMode().__torch_function__(public_api, types, args, kwargs)
+        with _pop_mode_temporarily() as mode:
+            result = mode.__torch_function__(public_api, types, args, kwargs)
         if result is not NotImplemented:
             return result
 
@@ -1828,15 +1828,11 @@ def _get_current_function_mode_stack():
     return [_get_function_stack_at(i) for i in range(stack_len)]
 
 def _push_mode(mode):
-    if _len_torch_function_stack() == 0:
-        _set_torch_function_mode(_TorchFunctionStackMode())
     _push_on_torch_function_stack(mode)
 
 
 def _pop_mode():
     old = _pop_torch_function_stack()
-    if _len_torch_function_stack() == 0:
-        _set_torch_function_mode(None)
     return old
 
 
@@ -1848,19 +1844,6 @@ def _pop_mode_temporarily():
     finally:
         _push_mode(old)
 
-# a helper "mode" used by the torch_function push helper method. This is the only mode that will ever
-# be active at the C++ level and it will run the current mode
-class _TorchFunctionStackMode:
-    def __torch_function__(self, func, types, args=(), kwargs=None):
-        with _pop_mode_temporarily() as old:
-            if _len_torch_function_stack() > 0:
-                _set_torch_function_mode(self)
-            # we can't check the type of __torch_function__ here but this is sufficient for checking it's a classmethod
-            if old.__torch_function__.__self__ is type(old):
-                raise RuntimeError("TorchFunctionMode's torch_function function " +
-                                   "should be a normal method not a class method")
-            return old.__torch_function__(func, types, args, kwargs)
-
 class BaseTorchFunctionMode(TorchFunctionMode):
     def __torch_function__(self, func, types, args=(), kwargs=None):
         if kwargs is None:
@@ -1868,19 +1851,6 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
         return func(*args, **kwargs)
 
 
-# This is private API as I'm not sure it's possible for users to use this
-# compositionally (easy to discard too many modes).  It is useful for
-# library code though, e.g., in handle_torch_function
-@contextlib.contextmanager
-def _no_torch_function_mode() -> Iterator[None]:
-    _set_torch_function_mode(None)
-    try:
-        yield
-    finally:
-        if _len_torch_function_stack() > 0:
-            _set_torch_function_mode(_TorchFunctionStackMode())
-
-
 class enable_reentrant_dispatch():
     def __enter__(self):
         self._raii_guard = torch._C._RestorePythonTLSSnapshot()
diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py
index 08ef67d7333fa..5d22ae69a185f 100644
--- a/torch/utils/_python_dispatch.py
+++ b/torch/utils/_python_dispatch.py
@@ -2,7 +2,7 @@
 
 import warnings
 from torch._C import _len_torch_dispatch_stack, _get_dispatch_stack_at,\
-    _pop_torch_dispatch_stack, _push_on_torch_dispatch_stack, _set_torch_dispatch_mode
+    _pop_torch_dispatch_stack, _push_on_torch_dispatch_stack
 
 
 # TODO: Limitations and things about enable_torch_dispatch_mode we should fix before exposing it:
@@ -67,16 +67,11 @@ def _get_current_dispatch_mode_stack():
     return [_get_dispatch_stack_at(i) for i in range(stack_len)]
 
 def _push_mode(mode):
-    if _len_torch_dispatch_stack() == 0:
-        _set_torch_dispatch_mode(_TorchDispatchStackMode())
     _push_on_torch_dispatch_stack(mode)
 
 
 def _pop_mode():
-    old = _pop_torch_dispatch_stack()
-    if _len_torch_dispatch_stack() == 0:
-        _set_torch_dispatch_mode(None)
-    return old
+    return _pop_torch_dispatch_stack()
 
 
 @contextlib.contextmanager
@@ -87,19 +82,6 @@ def _pop_mode_temporarily():
     finally:
         _push_mode(old)
 
-# a helper "mode" used by the torch dispatch push helper method. This is the only mode that will ever
-# be active at the C++ level and it will run the current mode
-class _TorchDispatchStackMode:
-    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
-        with _pop_mode_temporarily() as old:
-            if _len_torch_dispatch_stack() > 0:
-                _set_torch_dispatch_mode(self)
-            # we can't check the type of __torch_dispatch__ here but this is sufficient for checking it's a classmethod
-            if old.__torch_dispatch__.__self__ is type(old):
-                raise RuntimeError(f"{type(old)}'s torch_dispatch function " +
-                                   "should be a normal method not a class method")
-            return old.__torch_dispatch__(func, types, args, kwargs)
-
 class BaseTorchDispatchMode(TorchDispatchMode):
     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         if kwargs is None:

From 39be343c25c6fb5e11d660a1c06d3c8429aad284 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 21 Oct 2022 16:21:42 +0000
Subject: [PATCH 0032/1922] Make torchbench setup a function (#87469)

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87469
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/torchbench.py | 40 +++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py
index c37422a19bfd9..b6577745ab154 100755
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@@ -20,25 +20,30 @@
 
 # We are primarily interested in tf32 datatype
 torch.backends.cuda.matmul.allow_tf32 = True
-original_dir = abspath(os.getcwd())
-
-os.environ["KALDI_ROOT"] = "/tmp"  # avoids some spam
-for torchbench_dir in (
-    "./torchbenchmark",
-    "../torchbenchmark",
-    "../torchbench",
-    "../benchmark",
-    "../../torchbenchmark",
-    "../../torchbench",
-    "../../benchmark",
-):
+
+
+def setup_torchbench_cwd():
+    original_dir = abspath(os.getcwd())
+
+    os.environ["KALDI_ROOT"] = "/tmp"  # avoids some spam
+    for torchbench_dir in (
+        "./torchbenchmark",
+        "../torchbenchmark",
+        "../torchbench",
+        "../benchmark",
+        "../../torchbenchmark",
+        "../../torchbench",
+        "../../benchmark",
+    ):
+        if exists(torchbench_dir):
+            break
+
     if exists(torchbench_dir):
-        break
+        torchbench_dir = abspath(torchbench_dir)
+        os.chdir(torchbench_dir)
+        sys.path.append(torchbench_dir)
 
-if exists(torchbench_dir):
-    torchbench_dir = abspath(torchbench_dir)
-    os.chdir(torchbench_dir)
-    sys.path.append(torchbench_dir)
+    return original_dir
 
 
 # Some models have large dataset that doesn't fit in memory. Lower the batch
@@ -338,6 +343,7 @@ def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
 
 if __name__ == "__main__":
 
+    original_dir = setup_torchbench_cwd()
     logging.basicConfig(level=logging.WARNING)
     warnings.filterwarnings("ignore")
     main(TorchBenchmarkRunner(), original_dir)

From 83a6448f8b1b6fabd347b39944c52753592d5270 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 21 Oct 2022 16:21:43 +0000
Subject: [PATCH 0033/1922] Remove unused cold_start experiment (#87470)

- this `--cold_start` experiment didn't end up being used
- there is a new `--cold_start_latency` flag that is used
- this experiment was only hooked up for nvfuser anyway

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87470
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/common.py | 89 -------------------------------------
 1 file changed, 89 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index a2f8af2bc825a..507f9db2d5b11 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -348,84 +348,6 @@ def randomize_input(inputs):
         )
 
 
-def cold_start_experiment(args, model_iter_fn, model, example_inputs, optimize_ctx):
-    compile_iters = 2
-    total_iters = compile_iters + 2
-    timings = np.zeros((total_iters, 2), np.float64)
-    # if we randomize the input, we should also check the result is correct
-    should_check_result = should_randomize_input = args.randomize_input
-    is_correct = True
-
-    optimized_model_iter_fn = optimize_ctx(model_iter_fn)
-    for rep in range(total_iters):
-        inputs = (
-            randomize_input(copy.deepcopy(example_inputs))
-            if should_randomize_input
-            else example_inputs
-        )
-
-        # interleave the runs to handle frequency scaling and load changes
-        timings[rep, 0], expected_output = timed(
-            model, model_iter_fn, inputs, return_result=True
-        )
-        timings[rep, 1], actual_output = timed(
-            model, optimized_model_iter_fn, inputs, return_result=True
-        )
-        if should_check_result:
-            is_correct = is_correct and same(expected_output, actual_output)
-    pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue
-    worst = np.max(timings, axis=0)
-
-    def breakeven(dynamo_times, eager_times):
-        """
-        Solve for the number of iterations it takes dynamo to 'catch up' with eager,
-        taking into account the time it spent compiling.  Assumes all compilation
-        happens up front and the model is static thereafter, which is definitely not
-        true in general but might be across torchbench.
-
-            dc1, dc2 = dynamo compilation iterations (with Prof Exec)
-            d, e = dynamo, eager warmed up iteration
-            B = num iters to break even
-            dc1 + dc2 + (B-2)d = B*e
-            B = (dc1 + dc2 - 2d) / (e - d)
-        """
-        dc1, dc2, d = dynamo_times[0], dynamo_times[1], np.median(dynamo_times[2:])
-        e = np.median(eager_times)
-        if d < e:
-            return (dc1 + dc2 + 2 * d) / (e - d)
-        else:
-            # if optimized dynamo is not faster than eager we'll compute
-            # a nonsense negative number
-            return 0
-
-    speedup = worst[0] / worst[1]
-    eager_times, dynamo_times = timings[:, 0], timings[:, 1]
-    output_csv(
-        output_filename,
-        ("dev", "name", "batch_size", "cold-start speedup", "breakeven iters"),
-        [
-            current_device,
-            current_name,
-            current_batch_size,
-            float(speedup),
-            breakeven(dynamo_times, eager_times),
-        ],
-    )
-
-    def format_speedup(
-        speedup, pvalue, breakeven_iters, is_correct=True, pvalue_threshold=0.1
-    ):
-        if not is_correct:
-            return "ERROR"
-        if pvalue > pvalue_threshold:
-            return f"{speedup:.3f}x breakeven={breakeven_iters:.2f} iters SAME"
-        return f"{speedup:.3f}x breakeven={breakeven_iters:.2f} iters p={pvalue:.2f}"
-
-    return format_speedup(
-        speedup, pvalue, breakeven(dynamo_times, eager_times), is_correct=is_correct
-    )
-
-
 def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
     """
     Measure speedups over eager.
@@ -1527,9 +1449,6 @@ def parse_args():
         action="store_true",
         help="speedup using the ltc backend without reusing compiled graph",
     )
-    group.add_argument(
-        "--cold-start", action="store_true", help=help(cold_start_experiment)
-    )
     group.add_argument(
         "--overhead", action="store_true", help=help(overhead_experiment)
     )
@@ -1769,14 +1688,6 @@ def main(runner, original_dir=None):
         optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython)
         experiment = speedup_experiment
         output_filename = "overheads.csv"
-    elif args.cold_start:
-        optimize_ctx = torch._dynamo.optimize("aot_nvfuser", nopython=args.nopython)
-        experiment = cold_start_experiment
-        assert args.nvfuser, "TODO - Add another aot string for mem fusion with NNC"
-        backend_str = "nvfuser" if args.nvfuser else "nnc"
-        output_filename = f"cold_start_{backend_str}.csv"
-        # TODO(whc) should we move this to a more general part of the script?
-        torch.backends.cuda.matmul.allow_tf32 = True
     elif args.inductor or args.inductor_dynamic:
         from torch._inductor import config as inductor_config
 

From e83cc19610bc3610db7063b6876f78e7954fcb72 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 21 Oct 2022 16:21:43 +0000
Subject: [PATCH 0034/1922] Delete unused ltc experiments (#87471)

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87471
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/common.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 507f9db2d5b11..b31cf1a0642ab 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1439,16 +1439,6 @@ def parse_args():
     group.add_argument(
         "--coverage", action="store_true", help="(default) " + help(coverage_experiment)
     )
-    group.add_argument(
-        "--speedup-ltc",
-        action="store_true",
-        help="speedup using the ltc backend",
-    )
-    group.add_argument(
-        "--speedup-ltc-trivial",
-        action="store_true",
-        help="speedup using the ltc backend without reusing compiled graph",
-    )
     group.add_argument(
         "--overhead", action="store_true", help=help(overhead_experiment)
     )
@@ -1707,18 +1697,6 @@ def main(runner, original_dir=None):
         optimize_ctx = torch._dynamo.optimize("inductor", nopython=args.nopython)
         experiment = speedup_experiment
         output_filename = "inductor.csv"
-    elif args.speedup_ltc:
-        optimize_ctx = torch._dynamo.optimize(
-            backends.ltc_reuse_graph, nopython=args.nopython
-        )
-        experiment = speedup_experiment
-        output_filename = "speedups_ltc.csv"
-    elif args.speedup_ltc_trivial:
-        optimize_ctx = torch._dynamo.optimize(
-            backends.ltc_trivial, nopython=args.nopython
-        )
-        experiment = speedup_experiment
-        output_filename = "speedups_ltc_trivial.csv"
     elif args.speedup_ts:
         experiment = speedup_experiment_ts
         output_filename = "baseline_ts.csv"

From b1e71d0db2b56ac75fbafca16a83b7976a97ca5f Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 21 Oct 2022 16:21:43 +0000
Subject: [PATCH 0035/1922] Delete unused ts experiment (#87472)

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87472
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/common.py | 40 -------------------------------------
 1 file changed, 40 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index b31cf1a0642ab..0597cc513781d 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -587,40 +587,6 @@ def try_script(model, example_inputs):
         return None
 
 
-def speedup_experiment_ts(args, model_iter_fn, model, example_inputs):
-    """
-    Measure baseline performance (without using TorchDynamo) of TorchScript and optimize_for_inference.
-
-    Writes to ./baseline_ts.csv
-    """
-    if args.training:
-        return baselines(
-            [
-                ("eager", model),
-                ("ts", try_script(model, example_inputs)),
-            ],
-            model_iter_fn,
-            example_inputs,
-            args,
-        )
-
-    return baselines(
-        [
-            ("eager", model),
-            ("ts", try_script(model, example_inputs)),
-            (
-                "ofi",
-                backends.ofi(try_script(model, example_inputs), example_inputs),
-            ),
-            # ("nnc", backends.nnc(try_script(model, example_inputs), example_inputs)),
-            # ("nvfuser", backends.nvfuser(try_script(model, example_inputs), example_inputs)),
-        ],
-        model_iter_fn,
-        example_inputs,
-        args,
-    )
-
-
 def speedup_experiment_sr(args, model_iter_fn, model, example_inputs):
     """
     Measure baseline performance (without using TorchDynamo) of static runtime.
@@ -1442,9 +1408,6 @@ def parse_args():
     group.add_argument(
         "--overhead", action="store_true", help=help(overhead_experiment)
     )
-    group.add_argument(
-        "--speedup-ts", action="store_true", help=help(speedup_experiment_ts)
-    )
     group.add_argument(
         "--speedup-sr", action="store_true", help=help(speedup_experiment_sr)
     )
@@ -1697,9 +1660,6 @@ def main(runner, original_dir=None):
         optimize_ctx = torch._dynamo.optimize("inductor", nopython=args.nopython)
         experiment = speedup_experiment
         output_filename = "inductor.csv"
-    elif args.speedup_ts:
-        experiment = speedup_experiment_ts
-        output_filename = "baseline_ts.csv"
     elif args.speedup_sr:
         experiment = speedup_experiment_sr
         output_filename = "baseline_sr.csv"

From 4e4cd9d07b2a95bead2c577333434a25b1bea556 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 21 Oct 2022 16:21:43 +0000
Subject: [PATCH 0036/1922] Delete unused static runtime experiment (#87473)

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87473
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/common.py | 32 --------------------------------
 1 file changed, 32 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 0597cc513781d..01793b01e0e03 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -587,32 +587,6 @@ def try_script(model, example_inputs):
         return None
 
 
-def speedup_experiment_sr(args, model_iter_fn, model, example_inputs):
-    """
-    Measure baseline performance (without using TorchDynamo) of static runtime.
-
-    Writes to ./baseline_sr.csv
-    """
-
-    if current_name not in ("opacus_cifar10", "timm_nfnet", "hf_T5"):
-        sr = backends.static_runtime(try_script(model, example_inputs), example_inputs)
-    else:
-        # segfaults on these models
-        sr = None
-    return baselines(
-        [
-            ("eager", model),
-            (
-                "sr",
-                sr,
-            ),
-        ],
-        model_iter_fn,
-        example_inputs,
-        args,
-    )
-
-
 def speedup_experiment_onnx(args, model_iter_fn, model, example_inputs):
     """
     Measure baseline performance (without using TorchDynamo) of ONNXRT and TensorFlow.
@@ -1408,9 +1382,6 @@ def parse_args():
     group.add_argument(
         "--overhead", action="store_true", help=help(overhead_experiment)
     )
-    group.add_argument(
-        "--speedup-sr", action="store_true", help=help(speedup_experiment_sr)
-    )
     group.add_argument(
         "--speedup-onnx", action="store_true", help=help(speedup_experiment_onnx)
     )
@@ -1660,9 +1631,6 @@ def main(runner, original_dir=None):
         optimize_ctx = torch._dynamo.optimize("inductor", nopython=args.nopython)
         experiment = speedup_experiment
         output_filename = "inductor.csv"
-    elif args.speedup_sr:
-        experiment = speedup_experiment_sr
-        output_filename = "baseline_sr.csv"
     elif args.speedup_onnx:
         experiment = speedup_experiment_onnx
         output_filename = "baseline_onnx.csv"

From a0001256b681844324a50070bd3e3df0f9e441ff Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 21 Oct 2022 12:57:55 -0400
Subject: [PATCH 0037/1922] as_strided_scatter storage offset defaults to None
 not 0 (#87481)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87481
Approved by: https://github.com/bdhirsh
---
 torch/_tensor_docs.py | 2 +-
 torch/_torch_docs.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index 9f8ce0c0f8520..b564351acf590 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -1584,7 +1584,7 @@ def add_docstr_all(method, docstr):
 add_docstr_all(
     "as_strided_scatter",
     r"""
-as_strided_scatter(src, size, stride, storage_offset=0) -> Tensor
+as_strided_scatter(src, size, stride, storage_offset=None) -> Tensor
 
 See :func:`torch.as_strided_scatter`
 """,
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 00e7129cfb10e..d84ed259b6d38 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -3740,7 +3740,7 @@ def merge_dicts(*dicts):
 add_docstr(
     torch.as_strided_scatter,
     r"""
-as_strided_scatter(input, src, size, stride, storage_offset=0) -> Tensor
+as_strided_scatter(input, src, size, stride, storage_offset=None) -> Tensor
 
 Embeds the values of the :attr:`src` tensor into :attr:`input` along
 the elements corresponding to the result of calling

From ba51abbc4016d6564c79acd6d9b132285090c9cf Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Fri, 21 Oct 2022 22:53:35 +0000
Subject: [PATCH 0038/1922] fix docs push (#87498)

push docs to temp branch first then push to actual branch to satisfy CLA check in branch protections
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87498
Approved by: https://github.com/malfet
---
 .circleci/scripts/cpp_doc_push_script.sh    | 3 +++
 .circleci/scripts/python_doc_push_script.sh | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh
index 4c22677e94bd3..6e66514ae93b9 100755
--- a/.circleci/scripts/cpp_doc_push_script.sh
+++ b/.circleci/scripts/cpp_doc_push_script.sh
@@ -98,6 +98,9 @@ git commit -m "Generate C++ docs from pytorch/pytorch@${GITHUB_SHA}" || true
 git status
 
 if [[ "${WITH_PUSH:-}" == true ]]; then
+  # push to a temp branch first to trigger CLA check and satisfy branch protections
+  git push -u origin HEAD:pytorchbot/temp-branch-cpp -f
+  sleep 30
   git push -u origin
 fi
 
diff --git a/.circleci/scripts/python_doc_push_script.sh b/.circleci/scripts/python_doc_push_script.sh
index f9b019ec069b3..d255f77c82e8e 100755
--- a/.circleci/scripts/python_doc_push_script.sh
+++ b/.circleci/scripts/python_doc_push_script.sh
@@ -135,6 +135,9 @@ git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
 git status
 
 if [[ "${WITH_PUSH:-}" == true ]]; then
+  # push to a temp branch first to trigger CLA check and satisfy branch protections
+  git push -u origin HEAD:pytorchbot/temp-branch-py -f
+  sleep 30
   git push -u origin "${branch}"
 fi
 

From 8c2df0853a5cf7761031d1b5bfcca39a7fb15b05 Mon Sep 17 00:00:00 2001
From: Jason Ansel <jansel@fb.com>
Date: Fri, 21 Oct 2022 15:14:15 -0700
Subject: [PATCH 0039/1922] Reland #87025 and fix periodic tests (#87084)

- Relands #87025
- disables failing tests related to https://github.com/pytorch/torchdynamo/issues/1697
- Reverts https://github.com/pytorch/pytorch/commit/d01eea6027c26bf100fc99a705669f60648964ae

cc @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87084
Approved by: https://github.com/malfet, https://github.com/voznesenskym
---
 .jenkins/pytorch/common_utils.sh           |  1 +
 .jenkins/pytorch/test.sh                   | 16 ++++------------
 test/inductor/test_torchinductor.py        | 13 +++++++++++++
 test/inductor/test_torchinductor_opinfo.py | 19 +++++++++++++++++--
 torch/_inductor/decomposition.py           | 12 +++++++-----
 torch/_inductor/lowering.py                |  2 --
 6 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh
index d8c853f97ab23..d673a37f17b8f 100644
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.jenkins/pytorch/common_utils.sh
@@ -140,6 +140,7 @@ function install_triton() {
   else
     commit=$(get_pinned_commit triton)
     pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
+    pip_install --user jinja2
   fi
 }
 
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 7e9d4f37edec1..ec77478769b4f 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -741,16 +741,12 @@ elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SH
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_without_numpy
   install_torchvision
-  if ! [[ "${BUILD_ENVIRONMENT}" == *sm86 ]]; then
-    install_triton
-  fi
+  install_triton
   test_python_shard 1
   test_aten
 elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
-  if ! [[ "${BUILD_ENVIRONMENT}" == *sm86 ]]; then
-    install_triton
-  fi
+  install_triton
   test_python_shard 2
   test_libtorch
   test_aot_compilation
@@ -759,9 +755,7 @@ elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_torch_function_benchmark
 elif [[ "${SHARD_NUMBER}" -gt 2 ]]; then
   # Handle arbitrary number of shards
-  if ! [[ "${BUILD_ENVIRONMENT}" == *sm86 ]]; then
-    install_triton
-  fi
+  install_triton
   test_python_shard "$SHARD_NUMBER"
 elif [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
   test_vulkan
@@ -779,9 +773,7 @@ elif [[ "${TEST_CONFIG}" == *functorch* ]]; then
   test_functorch
 else
   install_torchvision
-  if ! [[ "${BUILD_ENVIRONMENT}" == *sm86 ]]; then
-    install_triton
-  fi
+  install_triton
   install_monkeytype
   test_python
   test_aten
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index c4e82a8092437..52f36500b5025 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -758,6 +758,11 @@ def fn(a):
         self.common(fn, ((torch.rand((10, 3, 352, 352), dtype=torch.float16),)))
 
     def test_expanded_reduction(self):
+        if self.device == "cpu":
+            raise unittest.SkipTest(
+                "https://github.com/pytorch/torchdynamo/issues/1697"
+            )
+
         def fn(x, y):
             z = x * y
             return z.sum((0, 1))
@@ -3145,6 +3150,9 @@ def fn(a, dim, index, b):
         )
 
     def test_scatter2(self):
+        if self.device == "cuda":
+            raise unittest.SkipTest("unstable on sm86")
+
         def fn(a, dim, index, b):
             return aten.scatter.reduce(a, dim, index, b, reduce="add")
 
@@ -3259,6 +3267,11 @@ def fn(a, dim, index, b):
 
     # issue #1150
     def test_dense_mask_index(self):
+        if self.device == "cpu":
+            raise unittest.SkipTest(
+                "https://github.com/pytorch/torchdynamo/issues/1697"
+            )
+
         def fn(x, y):
             y = torch.ops.aten.select.int(y, 0, 2)
             z = x * y
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 220b711efcb51..371a825b28a30 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -140,6 +140,23 @@ def process(device_type):
     # Disabled on migration to core
     "linalg.pinv.singular": {f32, f64},
     "linalg.householder_product": {f32},
+    # These might be passing now?
+    "T": {b8, f16, f32, f64, i32, i64},
+    "H": {b8, f16, f32, f64, i32, i64},
+    "__getitem__": {b8, f16, f32, f64, i32, i64},
+    "acos": {b8, f16, f32, f64, i32, i64},
+    "acosh": {b8, f16, f32, f64, i32, i64},
+    "nn.functional.conv_transpose3d": {f16},
+    "max.reduction_with_dim": {i32, i64},
+    "min.reduction_with_dim": {i32, i64},
+    "linalg.lu": {f32, f64},
+    "lu_unpack": {f32, f64},
+    "native_batch_norm": {f16, f32, f64},
+    "native_layer_norm": {f16, f32, f64},
+    # Issues on sm86 periodic job (complex numbers)
+    "cdouble": {b8, f16, f32, f64, i32, i64},
+    "cfloat": {b8, f16, f32, f64, i32, i64},
+    "randint": {b8, f16, f32, f64, i32, i64},
 }
 
 inductor_expected_failures_single_sample = defaultdict(dict)
@@ -354,7 +371,6 @@ def process(device_type):
     # AssertionError: Tensor-likes are not close!
     "erf": {b8, f64},
     "nn.functional.gelu": {f64},
-    "nn.functional.conv_transpose3d": {f16},
     "nn.functional.triplet_margin_loss": {f16},
 }
 
@@ -365,7 +381,6 @@ def process(device_type):
     "cumprod": {f16},
     "linalg.vector_norm": {f64, f64},
     "linalg.householder_product": {f32},
-    "linalg.lu": {f32, f64},
     "kron": {f16},
     "nanquantile": {f32, f64},
     "native_batch_norm": {f16, f32, f64},
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 5e67bfe6ef29e..6fed9ca691240 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -81,24 +81,26 @@
         aten._reshape_alias,
         aten.select_backward,
         aten.select_scatter,
+        aten.sgn,
         aten.sigmoid_backward,
+        aten.silu,
         aten.silu_backward,
         aten.slice_backward,
-        aten.sgn,
-        aten.std_mean.correction,
         aten._softmax,
         aten._softmax_backward_data,
+        aten.softplus,
+        aten.softplus_backward,
         aten.stack,
+        aten.std_mean.correction,
         aten.t,
         aten.tanh_backward,
         aten.threshold_backward,
         aten.transpose.int,
         aten.tril.default,
+        aten.unfold,
+        aten.unfold_backward,
         aten.upsample_bilinear2d.vec,
         aten.upsample_nearest2d_backward,
-        aten.softplus,
-        aten.softplus_backward,
-        aten.silu,
     ]
 )
 
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 49a136b440ed2..fd94aa9bc5d5a 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -1084,8 +1084,6 @@ def inner_fn(index):
 make_fallback(aten._sparse_coo_tensor_with_dims_and_tensors)
 make_fallback(aten._thnn_fused_lstm_cell)
 make_fallback(aten.topk)
-make_fallback(aten.unfold)
-make_fallback(aten.unfold_backward)
 make_fallback(aten.upsample_bicubic2d_backward)
 make_fallback(aten.upsample_bilinear2d_backward)
 

From 95a7c042137260cdc0bf99038b4575f0aed9a315 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 21 Oct 2022 23:13:39 +0000
Subject: [PATCH 0040/1922] Re-enable dynamo ddp tests (#87524)

- Move dynamo dist tests to another shard
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87524
Approved by: https://github.com/davidberard98
---
 .jenkins/pytorch/test.sh                      |  2 +
 .../test_dynamo_distributed.py}               | 47 +++++--------------
 2 files changed, 14 insertions(+), 35 deletions(-)
 rename test/{dynamo/test_distributed.py => distributed/test_dynamo_distributed.py} (88%)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index ec77478769b4f..adcaf82ffdd38 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -708,6 +708,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
   # TODO: run some C++ tests
   echo "no-op at the moment"
 elif [[ "$TEST_CONFIG" == distributed ]]; then
+  install_filelock
+  install_triton
   test_distributed
   # Only run RPC C++ tests on the first shard
   if [[ "${SHARD_NUMBER}" == 1 ]]; then
diff --git a/test/dynamo/test_distributed.py b/test/distributed/test_dynamo_distributed.py
similarity index 88%
rename from test/dynamo/test_distributed.py
rename to test/distributed/test_dynamo_distributed.py
index 695e34817f37b..0fefd4ec507a7 100644
--- a/test/dynamo/test_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -3,15 +3,14 @@
 import unittest
 from unittest.mock import patch
 
-import pytest
 import torch
-
 import torch._dynamo
 import torch._dynamo.test_case
 import torch.distributed as dist
 from torch import nn
 from torch._dynamo import config
-from torch._dynamo.testing import same
+from torch._dynamo.utils import same
+from torch.nn.parallel import DistributedDataParallel as DDP
 
 
 class ToyModel(nn.Module):
@@ -36,14 +35,6 @@ def compile_fn(self, gm, example_inputs):
         return gm
 
 
-def skip_if_no_active_ddp():
-    from torch.nn.parallel import DistributedDataParallel as DDP
-
-    if not hasattr(DDP, "_get_active_ddp_module"):
-        raise unittest.SkipTest("requires pytorch landing in parallel")
-
-
-@pytest.mark.skip("Module hangs in PyTorch CI")
 class TestDistributed(torch._dynamo.test_case.TestCase):
     """
     Test harness initializes dist process group
@@ -98,8 +89,10 @@ def test_ddp_baseline_inductor(self):
         outputs = ddp_m(inputs)
         self.assertTrue(same(correct_outputs, outputs))
 
-    # can't run with gloo (no support for _allgather_base) and nccl not available in CI
-    @pytest.mark.xfail
+    # TODO(whc) move these tests to 'distributed' shard to get nccl, or see if it's available already in pytorch CI?
+    @unittest.skip(
+        "can't run with gloo (no support for _allgather_base) and nccl not available in CI"
+    )
     @patch.object(config, "optimize_ddp", False)
     def test_fsdp_baseline_aot_eager(self):
         from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
@@ -110,8 +103,7 @@ def test_fsdp_baseline_aot_eager(self):
         outputs = fsdp_m(inputs)
         self.assertTrue(same(correct_outputs, outputs))
 
-    # hangs/crashes with inductor currently
-    @pytest.mark.skip
+    @unittest.skip("hangs/crashes with inductor currently")
     @patch.object(config, "optimize_ddp", False)
     def test_fsdp_baseline_inductor(self):
         from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
@@ -130,9 +122,6 @@ def test_graph_split(self):
         the user-provided compiler is called by the DDPOptimizer which is
         doing the graph splitting
         """
-        from torch.nn.parallel import DistributedDataParallel as DDP
-
-        skip_if_no_active_ddp()
 
         m, inputs, correct_outputs = self.get_model()
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=25)
@@ -148,16 +137,13 @@ def opt_fn(inputs):
         self.assertEqual(check_splits_compiler.compiler_called, 3)
 
     # hangs/crashes with inductor currently
-    @pytest.mark.skip
+    @unittest.skip("hangs/crashes with inductor currently")
     @patch.object(config, "optimize_ddp", True)
     def test_graph_split_inductor(self):
         """
         Same as above, but using inductor backend.
         We observed issues with inductor/fx interface in the past.
         """
-        from torch.nn.parallel import DistributedDataParallel as DDP
-
-        skip_if_no_active_ddp()
         m, inputs, correct_outputs = self.get_model()
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=25)
 
@@ -174,9 +160,6 @@ def test_no_split(self):
         Ensures the DDPOptimizer returns a correct, compiled module without
         introducing graph splits. (Based on model parmeters fitting in the bucket)
         """
-        from torch.nn.parallel import DistributedDataParallel as DDP
-
-        skip_if_no_active_ddp()
         m, inputs, correct_outputs = self.get_model()
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250)
 
@@ -196,9 +179,6 @@ def test_aot_autograd(self):
         Explicitly check AotAutograd family of compilers work,
         since they require example inputs propagated between graph splits.
         """
-        from torch.nn.parallel import DistributedDataParallel as DDP
-
-        skip_if_no_active_ddp()
         m, inputs, correct_outputs = self.get_model()
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=25)
 
@@ -218,9 +198,6 @@ def test_custom_layer(self):
         the user-provided compiler is called by the DDPOptimizer which is
         doing the graph splitting
         """
-        from torch.nn.parallel import DistributedDataParallel as DDP
-
-        skip_if_no_active_ddp()
 
         class MyCustomLinear(torch.nn.Module):
             def __init__(self):
@@ -281,7 +258,7 @@ def fn():
         self.assertEqual(res, 1)
 
 
-# TODO(jansel): debug issues running this in CI
-# if __name__ == "__main__":
-#     from torch._dynamo.testing import run_tests
-#     run_tests()
+if __name__ == "__main__":
+    from torch._dynamo.test_case import run_tests
+
+    run_tests()

From 4dd3a33e784f2419b588e786c4d39ee88db7e381 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Sat, 22 Oct 2022 03:43:08 +0000
Subject: [PATCH 0041/1922] Unified debug directory for dynamo/inductor tools
 (#87438)

Fixes https://github.com/pytorch/torchdynamo/issues/1705
Fixes https://github.com/pytorch/torchdynamo/issues/1383

Adds a debug directory by default called `torchdynamo_debug` in the current working directory.
In the debug directory for each run of dynamo (an enter and exit of optimize) folder run_\<timestamp\> is created which contains any minifier/inductor/torchdynamo artifacts under respective folders.

Updated the minifier, record replay, and inductor tracing to use this directory

cc @jansel @lezcano @fdrocha @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87438
Approved by: https://github.com/soumith
---
 test/dynamo/test_debug_dir.py     | 96 +++++++++++++++++++++++++++++++
 test/dynamo/test_minifier.py      | 36 +++++++++---
 test/dynamo/test_replay_record.py |  7 +--
 torch/_dynamo/config.py           |  6 +-
 torch/_dynamo/debug_utils.py      | 24 ++++----
 torch/_dynamo/eval_frame.py       |  4 ++
 torch/_dynamo/utils.py            | 37 +++++++++++-
 torch/_inductor/debug.py          |  7 ++-
 8 files changed, 183 insertions(+), 34 deletions(-)
 create mode 100644 test/dynamo/test_debug_dir.py

diff --git a/test/dynamo/test_debug_dir.py b/test/dynamo/test_debug_dir.py
new file mode 100644
index 0000000000000..5827ff40ea781
--- /dev/null
+++ b/test/dynamo/test_debug_dir.py
@@ -0,0 +1,96 @@
+# Owner(s): ["module: dynamo"]
+import shutil
+import unittest
+
+import torch
+import torch._dynamo.test_case
+import torch._dynamo.testing
+from torch._dynamo.utils import DebugDir, get_debug_dir
+
+
+class DebugDirTests(torch._dynamo.test_case.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls._exit_stack.enter_context(
+            unittest.mock.patch.object(
+                torch._dynamo.config,
+                "debug_dir_root",
+                "/tmp/torch._dynamo_debug_dirs/",
+            )
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(torch._dynamo.config.debug_dir_root, ignore_errors=True)
+        cls._exit_stack.close()
+
+    def setUp(self):
+        super().setUp()
+        torch._dynamo.utils.debug_dir = DebugDir()
+
+    def tearDown(self):
+        torch._dynamo.utils.debug_dir = DebugDir()
+        super().tearDown()
+
+    def _setup(self):
+        debug_dir = torch._dynamo.utils.debug_dir
+        debug_dir.setup()
+        self.assertIsNotNone(debug_dir.debug_path)
+        self.assertEqual(debug_dir.num_setup_calls, 1)
+        return debug_dir
+
+    def test_setup(self):
+        self._setup()
+
+    def test_clear(self):
+        debug_dir = self._setup()
+        debug_dir.clear()
+        self.assertIsNone(debug_dir.debug_path)
+        self.assertEqual(debug_dir.num_setup_calls, 0)
+
+    def test_multi_setup_single_clear(self):
+        debug_dir = self._setup()
+        prev = get_debug_dir()
+
+        debug_dir.setup()
+        self.assertEqual(prev, get_debug_dir())
+        self.assertEqual(debug_dir.num_setup_calls, 2)
+
+        debug_dir.clear()
+        self.assertEqual(prev, get_debug_dir())
+        self.assertEqual(debug_dir.num_setup_calls, 1)
+
+    def test_multi_setup_multi_clear(self):
+        debug_dir = self._setup()
+        prev = get_debug_dir()
+
+        debug_dir.setup()
+        self.assertEqual(prev, get_debug_dir())
+        self.assertEqual(debug_dir.num_setup_calls, 2)
+
+        debug_dir.clear()
+        self.assertEqual(prev, get_debug_dir())
+        self.assertEqual(debug_dir.num_setup_calls, 1)
+
+        debug_dir.clear()
+        self.assertIsNone(debug_dir.debug_path)
+        self.assertEqual(debug_dir.num_setup_calls, 0)
+
+    def test_single_setup_single_clear(self):
+        debug_dir = self._setup()
+        debug_dir.clear()
+        self.assertIsNone(debug_dir.debug_path)
+        self.assertEqual(debug_dir.num_setup_calls, 0)
+
+    def test_multi_get(self):
+        self._setup()
+        prev = get_debug_dir()
+        next = get_debug_dir()
+        self.assertEqual(prev, next)
+
+
+if __name__ == "__main__":
+    from torch._dynamo.test_case import run_tests
+
+    run_tests()
diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index 4570d15b2d148..a282485285797 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -1,10 +1,10 @@
 # Owner(s): ["module: dynamo"]
 import os
 import shutil
+import unittest
 from unittest.mock import patch
 
 import torch
-
 import torch._dynamo
 import torch._dynamo.test_case
 import torch._dynamo.testing
@@ -25,6 +25,30 @@ def forward(self, x):
 
 
 class MinfierTests(torch._dynamo.test_case.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls._exit_stack.enter_context(
+            unittest.mock.patch.object(
+                torch._dynamo.config,
+                "debug_dir_root",
+                "/tmp/_torchdynamo_debug_/",
+            )
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(torch._dynamo.config.debug_dir_root, ignore_errors=True)
+        cls._exit_stack.close()
+
+    def setUp(self):
+        super().setUp()
+        torch._dynamo.utils.debug_dir.setup()
+
+    def tearDown(self):
+        torch._dynamo.utils.debug_dir.clear()
+        super().tearDown()
+
     def test_after_dynamo(self):
         @create_backend
         def bad_dynamo_backend(subgraph):
@@ -43,12 +67,9 @@ def f(*args):
 
         mod = MockModule()
         opt_mod = torch._dynamo.optimize("bad_dynamo_backend")(mod)
-        repro_dir = "/tmp/test_minifier"
-        repro_file = os.path.join(repro_dir, "minifier_launcher.py")
-        shutil.rmtree(repro_dir, ignore_errors=True)
+        repro_file = torch._dynamo.debug_utils.get_minifier_repro_path()
 
         @patch.object(torch._dynamo.config, "repro_after", "dynamo")
-        @patch.object(torch._dynamo.config, "repro_dir", repro_dir)
         def inner():
             x = torch.randn(4)
             try:
@@ -65,14 +86,11 @@ def inner():
     def _test_around_aot(self, error_at_aot):
         mod = MockModule()
         opt_mod = torch._dynamo.optimize("inductor")(mod)
-        repro_dir = "/tmp/test_minifier"
-        repro_file = os.path.join(repro_dir, "minifier_launcher.py")
-        shutil.rmtree(repro_dir, ignore_errors=True)
 
+        repro_file = torch._dynamo.debug_utils.get_minifier_repro_path()
         repro_after = "dynamo" if error_at_aot else "aot"
 
         @patch.object(torch._dynamo.config, "repro_after", repro_after)
-        @patch.object(torch._dynamo.config, "repro_dir", repro_dir)
         def inner():
             x = torch.randn(4)
             x.requires_grad = error_at_aot
diff --git a/test/dynamo/test_replay_record.py b/test/dynamo/test_replay_record.py
index 378fd2b78a9bc..5235e355e0d1c 100644
--- a/test/dynamo/test_replay_record.py
+++ b/test/dynamo/test_replay_record.py
@@ -5,7 +5,6 @@
 import unittest
 
 import torch
-
 import torch._dynamo.test_case
 import torch._dynamo.testing
 
@@ -37,14 +36,14 @@ def setUpClass(cls):
         cls._exit_stack.enter_context(
             unittest.mock.patch.object(
                 torch._dynamo.config,
-                "replay_record_dir_name",
-                "/tmp/torch._dynamo_error_records/",
+                "debug_dir_root",
+                "/tmp/_torchdynamo_debug_/",
             )
         )
 
     @classmethod
     def tearDownClass(cls):
-        shutil.rmtree(torch._dynamo.config.replay_record_dir_name, ignore_errors=True)
+        shutil.rmtree(torch._dynamo.config.debug_dir_root, ignore_errors=True)
         cls._exit_stack.close()
 
     def check_replay(self, fn, *args, exp_exc_name=None):
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 7a2c79972ddaa..701036789ffcb 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -83,7 +83,6 @@
 # Record and write an execution record of the current frame to a file
 # if an exception is encountered
 replay_record_enabled = False
-replay_record_dir_name = "./torchdynamo_error_records"
 
 # Show a warning on every graph break
 print_graph_breaks = False
@@ -126,9 +125,6 @@
 # 4: Dumps a minifier_launcher.py if the accuracy fails.
 repro_level = int(os.environ.get("TORCHDYNAMO_REPRO_LEVEL", 2))
 
-# Specify the directory where to save the repro artifacts
-repro_dir = os.environ.get("TORCHDYNAMO_REPRO_DIR", None)
-
 # Not all backends support scalars. Some calls on torch.Tensor (like .item()) return a scalar type.
 # When this flag is set to False, we introduce a graph break instead of capturing.
 capture_scalar_outputs = False
@@ -159,6 +155,8 @@
 else:
     base_dir = dirname(dirname(abspath(__file__)))
 
+debug_dir_root = os.path.join(os.getcwd(), "torchdynamo_debug")
+
 
 class _AccessLimitingConfig(ModuleType):
     def __setattr__(self, name, value):
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 845c518a4f85d..1134267c5f60d 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -16,13 +16,13 @@
 
 from . import config
 from .optimizations.backends import register_backend
-from .utils import clone_inputs
+from .utils import clone_inputs, get_debug_dir
 
 log = logging.getLogger(__name__)
 
 
 def minifier_dir():
-    path = config.repro_dir
+    path = os.path.join(get_debug_dir(), "minifier")
     if path is None:
         path = f"/tmp/minifier_{getpass.getuser()}"
     if not os.path.exists(path):
@@ -331,8 +331,12 @@ def inductor_accuracy_fails(fx_g, args, check_str=None):
     return backend_aot_accuracy_fails(fx_g, args, compile_fx_inner)
 
 
+def get_minifier_repro_path():
+    return os.path.join(minifier_dir(), "minifier_launcher.py")
+
+
 def helper_for_dump_minify(contents):
-    minified_repro_path = os.path.join(minifier_dir(), "minifier_launcher.py")
+    minified_repro_path = get_minifier_repro_path()
     log.warning(f"Writing minified repro to {minified_repro_path}")
     try:
         with open(minified_repro_path, "w") as fd:
@@ -341,15 +345,6 @@ def helper_for_dump_minify(contents):
         log.exception(e)
         raise NotImplementedError("Could not write to {minified_repro_path}")
 
-    local_path = os.path.join(config.base_dir, "minifier_launcher.py")
-    try:
-        shutil.copyfile(minified_repro_path, local_path)
-        log.warning(
-            f"Copying minified repro from {minified_repro_path} to {local_path} for convenience"
-        )
-    except OSError:
-        log.warning(f"Don't have write permissions for {local_path}")
-
 
 def dump_to_minify(gm, args, compiler_name: str):
     favored_device = 1 if torch.cuda.device_count() >= 2 else 0
@@ -827,7 +822,9 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                             example_inputs,
                             compiler_name,
                         )
-                    raise ValueError("Issue deteced. Repro at minifier_launcher.py.")
+                    raise ValueError(
+                        f"Issue detected. Repro at {get_minifier_repro_path()}."
+                    )
         else:
             compiled_gm = compiler_fn(gm, example_inputs, **kwargs)
 
@@ -877,7 +874,6 @@ def dynamo_minifier_backend(gm, example_inputs, compiler_name):
 @register_backend
 def dynamo_accuracy_minifier_backend(gm, example_inputs, compiler_name):
     from functorch.compile import minifier
-
     from torchdynamo.optimizations.backends import BACKENDS
 
     if compiler_name == "inductor":
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index bf9a230a420b8..40beba357b1cf 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -103,12 +103,14 @@ def __enter__(self):
                 "Please refer to https://github.com/pytorch/torchdynamo#usage-example "
                 "to use torchdynamo.optimize(...) as an annotation/decorator. "
             )
+        utils.debug_dir.setup()
         self.on_enter()
         self.prior = set_eval_frame(self.callback)
         self.backend_ctx = self.extra_ctx_ctor()
         self.backend_ctx.__enter__()
 
     def __exit__(self, exc_type, exc_val, exc_tb):
+        utils.debug_dir.clear()
         set_eval_frame(self.prior)
         self.prior = unset
         self.backend_ctx.__exit__(exc_type, exc_val, exc_tb)
@@ -150,12 +152,14 @@ def __call__(self, *args, **kwargs):
         @functools.wraps(fn)
         def _fn(*args, **kwargs):
             on_enter()
+            utils.debug_dir.setup()
             prior = set_eval_frame(callback)
             backend_ctx = backend_ctx_ctor()
             backend_ctx.__enter__()
             try:
                 return fn(*args, **kwargs)
             finally:
+                utils.debug_dir.clear()
                 set_eval_frame(prior)
                 backend_ctx.__exit__(None, None, None)
 
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index b66c240e0f04d..aa64de0eeef3b 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -3,6 +3,7 @@
 import copy
 import cProfile
 import dataclasses
+import datetime
 import dis
 import functools
 import gc
@@ -197,7 +198,7 @@ def format_bytecode(prefix, name, filename, line_no, code):
 
 
 def gen_record_file_name(exc, code):
-    return f"{config.replay_record_dir_name}/\
+    return f"{get_debug_dir()}/error_recordings/\
 {code.co_name}_{type(exc).__name__}_{code.co_firstlineno}.rec"
 
 
@@ -928,3 +929,37 @@ def recompile_reasons(code):
             rpt += "No cache-limited recompilations detected.\n"
 
         return rpt
+
+
+class DebugDir:
+    def __init__(self):
+        self.num_setup_calls = 0
+        self.debug_path = None
+
+    def setup(self):
+        assert self.num_setup_calls >= 0
+        if self.num_setup_calls == 0:
+            debug_root = config.debug_dir_root
+            dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
+            self.debug_path = os.path.join(debug_root, dir_name)
+
+        self.num_setup_calls += 1
+
+    def clear(self):
+        assert self.num_setup_calls >= 0
+        if self.num_setup_calls == 1:
+            self.debug_path = None
+
+        self.num_setup_calls -= 1
+        assert self.num_setup_calls >= 0
+
+    def get(self):
+        assert self.debug_path is not None
+        return self.debug_path
+
+
+debug_dir = DebugDir()
+
+
+def get_debug_dir():
+    return debug_dir.get()
diff --git a/torch/_inductor/debug.py b/torch/_inductor/debug.py
index d2bc9bcd73344..f7fbfe218be39 100644
--- a/torch/_inductor/debug.py
+++ b/torch/_inductor/debug.py
@@ -19,7 +19,6 @@
 from torch.fx.passes.tools_common import legalize_graph
 
 from . import config, ir
-from .codecache import cache_dir
 from .scheduler import (
     BaseSchedulerNode,
     ExternKernelSchedulerNode,
@@ -182,7 +181,11 @@ def inner(*args, **kwargs):
     @staticmethod
     def create_debug_dir():
         for n in DebugContext._counter:
-            dirname = os.path.join(cache_dir(), f"debug.{os.getpid()}.{n}")
+            dirname = os.path.join(
+                dynamo_utils.get_debug_dir(),
+                "torchinductor",
+                f"debug.{os.getpid()}.{n}",
+            )
             if not os.path.exists(dirname):
                 os.makedirs(dirname)
                 return dirname

From 409a9a385383719690fb5e9c6bffc0a0abee1c92 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Sat, 22 Oct 2022 01:03:41 +0000
Subject: [PATCH 0042/1922] Improvements for DDP Optimizer (#87525)

- adds support for 'first_bucket_cap' arg, to align bucketing more precisely
  with DDP, which may start a smaller first bucket
- refactors the bucket splitting logic to be cleaner
- adds pretty-print for bucket info, and a way to access bucket info
  from the DDPOptimizer class from a test case or benchmark
- dumps debug logs to stdout

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87525
Approved by: https://github.com/davidberard98
---
 test/distributed/test_dynamo_distributed.py |   9 +-
 torch/_dynamo/optimizations/distributed.py  | 120 ++++++++++++--------
 2 files changed, 76 insertions(+), 53 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 0fefd4ec507a7..43a4a23039175 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -18,8 +18,8 @@ def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5):
         super().__init__()
         self.net = nn.Sequential(
             *[nn.Linear(in_feat, hidden_feat), nn.ReLU()]
-            + [nn.Linear(5000, 5000), nn.ReLU()] * num_hidden
-            + [nn.Linear(5000, 5), nn.ReLU()]
+            + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()] * num_hidden
+            + [nn.Linear(hidden_feat, 5), nn.ReLU()]
         )
 
     def forward(self, inputs):
@@ -160,7 +160,10 @@ def test_no_split(self):
         Ensures the DDPOptimizer returns a correct, compiled module without
         introducing graph splits. (Based on model parmeters fitting in the bucket)
         """
-        m, inputs, correct_outputs = self.get_model()
+        # DDP will always do a 'first bucket' with a really small size;  so only a tiny model will escape this
+        m = ToyModel(hidden_feat=5).to(self.device)
+        inputs = torch.randn(20, 10).to(self.device)
+        correct_outputs = m(inputs)
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250)
 
         check_splits_compiler = CheckSplitsCompiler()
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index f65c16483aec6..e674820032d97 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass, field
 from typing import Any, List
 
 import torch
@@ -18,6 +19,28 @@ def args_str(args):
         return str(args)
 
 
+@dataclass
+class Bucket:
+    size: int = 0
+    params: List[str] = field(default_factory=list)
+    nodes: List[fx.Node] = field(default_factory=list)
+
+
+def pretty_print_buckets(buckets: List[Bucket]):
+    headers = ("Index", "Size (b)", "Param Names")
+    rows = []
+    for idx, bucket in enumerate(reversed(buckets)):
+        rows.append((idx, bucket.size, bucket.params[0]))
+        for param in bucket.params[1:]:
+            rows.append((None, None, param))
+    try:
+        from tabulate import tabulate
+
+        print(tabulate(rows, headers=headers, tablefmt="simple_grid"))
+    except ImportError:
+        print("Please `pip install tabulate` in order to pretty-print ddp bucket sizes")
+
+
 class DDPOptimizer:
     def __init__(
         self,
@@ -25,8 +48,13 @@ def __init__(
         parameters_to_ignore: List[str],
         backend_compile_fn,
         debug=False,
+        first_bucket_cap: int = torch.distributed._DEFAULT_FIRST_BUCKET_BYTES,
     ):
         self.bucket_bytes_cap = bucket_bytes_cap
+        assert (
+            first_bucket_cap <= bucket_bytes_cap
+        ), "First bucket should be smaller/equal to other buckets to get comms warmed up ASAP"
+        self.first_bucket_cap = first_bucket_cap
         self.parameters_to_ignore = parameters_to_ignore
         self.backend_compile_fn = backend_compile_fn
         self.debug = debug
@@ -35,76 +63,69 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
         """
         TODO:
         - handle params_and_buffers_to_ignore
-        - handle kwargs
         """
 
         # 1: compute the partition map according to DDP bucket logic
-        bucket_bytes = 0
-        bucket_actual_sizes = []
-        node_splits = [[]]
+        buckets = [Bucket()]  # (size, param_names)
         for node in reversed(gm.graph.nodes):
-            if node.op == "output" or node.op == "placeholder":
+            if node.op in ("output", "placeholder"):
                 continue
 
-            if bucket_bytes >= self.bucket_bytes_cap:
-                bucket_actual_sizes.insert(0, bucket_bytes)
-                bucket_bytes = 0
-                node_splits.insert(0, [])
+            if (
+                buckets[0].size >= self.bucket_bytes_cap
+                or len(buckets) == 1
+                and buckets[0].size >= self.first_bucket_cap
+            ):
+                buckets.insert(0, Bucket())
 
-            elif node.op == "call_module":
+            if node.op == "call_module":
                 target = gm.get_submodule(node.target)
-                params_size_b = sum(
-                    [
-                        p.storage().nbytes()
-                        for p in target.parameters()
-                        if p.requires_grad
-                    ]
-                )
-                bucket_bytes += params_size_b
-                # print(f"accumulated {params_size_b} b from {node}")
+                for name, p in target.named_parameters():
+                    if p.requires_grad:
+                        buckets[0].size += p.storage().nbytes()
+                        # TODO correct FQ name?
+                        buckets[0].params.append(f"{node}_{name}")
             elif node.op == "get_attr":
                 maybe_param = getattr(gm, node.target)
                 if maybe_param.requires_grad:
-                    bucket_bytes += maybe_param.storage().nbytes()
-            else:
-                # TODO(whc) confirm this:
-                # (e.g. call_method, call_function aren't expected to 'have' parameters)
-                pass
-
-            node_splits[0].append(node)
-
-        if len(node_splits) == 1:
-            if self.debug:
-                print(
-                    "DDPOptimizer did not split graphs."
-                    f" Accumulated {bucket_bytes} bytes, and bucket cap is {self.bucket_bytes_cap}"
-                )
-            return self.backend_compile_fn(gm, example_inputs)
+                    buckets[0].size += maybe_param.storage().nbytes()
+                    buckets[0].params.append(node.target)
 
-        if len(bucket_actual_sizes) < len(node_splits):
-            bucket_actual_sizes.insert(0, bucket_bytes)
+            # All nodes have to be mapped to a bucket, even if they don't have their own params
+            buckets[0].nodes.append(node)
 
+        # stash buckets for testing/debugging purposes
+        self.buckets = buckets
         if self.debug:
             print(
-                f"DDPOptimizer used bucket cap {self.bucket_bytes_cap}"
-                f" and split graphs into parameter sizes {', '.join([str(b) for b in bucket_actual_sizes])}"
+                f"DDPOptimizer used bucket cap {self.bucket_bytes_cap} and produced the following buckets:"
             )
+            pretty_print_buckets(buckets)
+
+        if len(buckets) == 1:
+            # bypass split/fuse logic if there is only one bucket
+            return self.backend_compile_fn(gm, example_inputs)
 
         # 2: partition the graphmodule according to bucket capacity
         partition_map = {}
-        for p, nodes in enumerate(node_splits):
-            for node in nodes:
-                partition_map[node] = p
+        for idx, b in enumerate(buckets):
+            for node in b.nodes:
+                partition_map[node] = idx
 
         split_gm = fx.passes.split_module.split_module(
             gm, None, lambda node: partition_map[node]
         )
         if self.debug:
-            with open("debug_ddp_optimizer.log", "w") as dump_file:
-                dump_file.write("---orig graph---")
-                dump_file.write(str(gm.graph))
-                dump_file.write("\n---split graph---")
-                dump_file.write(str(split_gm.graph))
+            print("---orig graph---")
+            print(str(gm.graph))
+            print("\n---split graph---")
+            print(str(split_gm.graph))
+            for name, module in split_gm.named_modules():
+                if "." not in name:
+                    # only print the submod graphs, not their children
+                    print(f"\n---{name} graph---")
+                    print(str(module.graph))
+            print("---------------")
 
         # 3: compile each of the partitioned submodules using the user-provided compiler
         class SubmodCompiler(torch.fx.interpreter.Interpreter):
@@ -171,7 +192,6 @@ def run_node(self, n: Node) -> Any:
                         self.module.delete_submodule(n.target)
                         n.target = "compiled_" + n.target
                         self.module.add_submodule(n.target, compiled_submod)
-
                     # then we execute the modified node using the usual logic
                     return getattr(self, n.op)(n.target, args, kwargs)
 
@@ -180,8 +200,8 @@ def run_node(self, n: Node) -> Any:
         split_gm.recompile()
 
         if self.debug:
-            with open("debug_ddp_optimizer.log", "a") as dump_file:
-                dump_file.write("\n---final graph---")
-                dump_file.write(str(split_gm.graph))
+            print("\n---final graph---")
+            print(str(split_gm.graph))
+            print("---------------")
 
         return split_gm

From 1a0cc9ca29bf9d5336330def1acaf896194759ef Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sat, 22 Oct 2022 04:51:33 +0000
Subject: [PATCH 0043/1922] Revert "Improvements for DDP Optimizer (#87525)"

This reverts commit cf693a02e0f6a022d10fd882af20efacfe7ecb76.

Reverted https://github.com/pytorch/pytorch/pull/87525 on behalf of https://github.com/ZainRizvi due to The macos error messages look like they were indeed caused by this PR
---
 test/distributed/test_dynamo_distributed.py |   9 +-
 torch/_dynamo/optimizations/distributed.py  | 120 ++++++++------------
 2 files changed, 53 insertions(+), 76 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 43a4a23039175..0fefd4ec507a7 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -18,8 +18,8 @@ def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5):
         super().__init__()
         self.net = nn.Sequential(
             *[nn.Linear(in_feat, hidden_feat), nn.ReLU()]
-            + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()] * num_hidden
-            + [nn.Linear(hidden_feat, 5), nn.ReLU()]
+            + [nn.Linear(5000, 5000), nn.ReLU()] * num_hidden
+            + [nn.Linear(5000, 5), nn.ReLU()]
         )
 
     def forward(self, inputs):
@@ -160,10 +160,7 @@ def test_no_split(self):
         Ensures the DDPOptimizer returns a correct, compiled module without
         introducing graph splits. (Based on model parmeters fitting in the bucket)
         """
-        # DDP will always do a 'first bucket' with a really small size;  so only a tiny model will escape this
-        m = ToyModel(hidden_feat=5).to(self.device)
-        inputs = torch.randn(20, 10).to(self.device)
-        correct_outputs = m(inputs)
+        m, inputs, correct_outputs = self.get_model()
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250)
 
         check_splits_compiler = CheckSplitsCompiler()
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index e674820032d97..f65c16483aec6 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -1,4 +1,3 @@
-from dataclasses import dataclass, field
 from typing import Any, List
 
 import torch
@@ -19,28 +18,6 @@ def args_str(args):
         return str(args)
 
 
-@dataclass
-class Bucket:
-    size: int = 0
-    params: List[str] = field(default_factory=list)
-    nodes: List[fx.Node] = field(default_factory=list)
-
-
-def pretty_print_buckets(buckets: List[Bucket]):
-    headers = ("Index", "Size (b)", "Param Names")
-    rows = []
-    for idx, bucket in enumerate(reversed(buckets)):
-        rows.append((idx, bucket.size, bucket.params[0]))
-        for param in bucket.params[1:]:
-            rows.append((None, None, param))
-    try:
-        from tabulate import tabulate
-
-        print(tabulate(rows, headers=headers, tablefmt="simple_grid"))
-    except ImportError:
-        print("Please `pip install tabulate` in order to pretty-print ddp bucket sizes")
-
-
 class DDPOptimizer:
     def __init__(
         self,
@@ -48,13 +25,8 @@ def __init__(
         parameters_to_ignore: List[str],
         backend_compile_fn,
         debug=False,
-        first_bucket_cap: int = torch.distributed._DEFAULT_FIRST_BUCKET_BYTES,
     ):
         self.bucket_bytes_cap = bucket_bytes_cap
-        assert (
-            first_bucket_cap <= bucket_bytes_cap
-        ), "First bucket should be smaller/equal to other buckets to get comms warmed up ASAP"
-        self.first_bucket_cap = first_bucket_cap
         self.parameters_to_ignore = parameters_to_ignore
         self.backend_compile_fn = backend_compile_fn
         self.debug = debug
@@ -63,69 +35,76 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
         """
         TODO:
         - handle params_and_buffers_to_ignore
+        - handle kwargs
         """
 
         # 1: compute the partition map according to DDP bucket logic
-        buckets = [Bucket()]  # (size, param_names)
+        bucket_bytes = 0
+        bucket_actual_sizes = []
+        node_splits = [[]]
         for node in reversed(gm.graph.nodes):
-            if node.op in ("output", "placeholder"):
+            if node.op == "output" or node.op == "placeholder":
                 continue
 
-            if (
-                buckets[0].size >= self.bucket_bytes_cap
-                or len(buckets) == 1
-                and buckets[0].size >= self.first_bucket_cap
-            ):
-                buckets.insert(0, Bucket())
+            if bucket_bytes >= self.bucket_bytes_cap:
+                bucket_actual_sizes.insert(0, bucket_bytes)
+                bucket_bytes = 0
+                node_splits.insert(0, [])
 
-            if node.op == "call_module":
+            elif node.op == "call_module":
                 target = gm.get_submodule(node.target)
-                for name, p in target.named_parameters():
-                    if p.requires_grad:
-                        buckets[0].size += p.storage().nbytes()
-                        # TODO correct FQ name?
-                        buckets[0].params.append(f"{node}_{name}")
+                params_size_b = sum(
+                    [
+                        p.storage().nbytes()
+                        for p in target.parameters()
+                        if p.requires_grad
+                    ]
+                )
+                bucket_bytes += params_size_b
+                # print(f"accumulated {params_size_b} b from {node}")
             elif node.op == "get_attr":
                 maybe_param = getattr(gm, node.target)
                 if maybe_param.requires_grad:
-                    buckets[0].size += maybe_param.storage().nbytes()
-                    buckets[0].params.append(node.target)
+                    bucket_bytes += maybe_param.storage().nbytes()
+            else:
+                # TODO(whc) confirm this:
+                # (e.g. call_method, call_function aren't expected to 'have' parameters)
+                pass
+
+            node_splits[0].append(node)
+
+        if len(node_splits) == 1:
+            if self.debug:
+                print(
+                    "DDPOptimizer did not split graphs."
+                    f" Accumulated {bucket_bytes} bytes, and bucket cap is {self.bucket_bytes_cap}"
+                )
+            return self.backend_compile_fn(gm, example_inputs)
 
-            # All nodes have to be mapped to a bucket, even if they don't have their own params
-            buckets[0].nodes.append(node)
+        if len(bucket_actual_sizes) < len(node_splits):
+            bucket_actual_sizes.insert(0, bucket_bytes)
 
-        # stash buckets for testing/debugging purposes
-        self.buckets = buckets
         if self.debug:
             print(
-                f"DDPOptimizer used bucket cap {self.bucket_bytes_cap} and produced the following buckets:"
+                f"DDPOptimizer used bucket cap {self.bucket_bytes_cap}"
+                f" and split graphs into parameter sizes {', '.join([str(b) for b in bucket_actual_sizes])}"
             )
-            pretty_print_buckets(buckets)
-
-        if len(buckets) == 1:
-            # bypass split/fuse logic if there is only one bucket
-            return self.backend_compile_fn(gm, example_inputs)
 
         # 2: partition the graphmodule according to bucket capacity
         partition_map = {}
-        for idx, b in enumerate(buckets):
-            for node in b.nodes:
-                partition_map[node] = idx
+        for p, nodes in enumerate(node_splits):
+            for node in nodes:
+                partition_map[node] = p
 
         split_gm = fx.passes.split_module.split_module(
             gm, None, lambda node: partition_map[node]
         )
         if self.debug:
-            print("---orig graph---")
-            print(str(gm.graph))
-            print("\n---split graph---")
-            print(str(split_gm.graph))
-            for name, module in split_gm.named_modules():
-                if "." not in name:
-                    # only print the submod graphs, not their children
-                    print(f"\n---{name} graph---")
-                    print(str(module.graph))
-            print("---------------")
+            with open("debug_ddp_optimizer.log", "w") as dump_file:
+                dump_file.write("---orig graph---")
+                dump_file.write(str(gm.graph))
+                dump_file.write("\n---split graph---")
+                dump_file.write(str(split_gm.graph))
 
         # 3: compile each of the partitioned submodules using the user-provided compiler
         class SubmodCompiler(torch.fx.interpreter.Interpreter):
@@ -192,6 +171,7 @@ def run_node(self, n: Node) -> Any:
                         self.module.delete_submodule(n.target)
                         n.target = "compiled_" + n.target
                         self.module.add_submodule(n.target, compiled_submod)
+
                     # then we execute the modified node using the usual logic
                     return getattr(self, n.op)(n.target, args, kwargs)
 
@@ -200,8 +180,8 @@ def run_node(self, n: Node) -> Any:
         split_gm.recompile()
 
         if self.debug:
-            print("\n---final graph---")
-            print(str(split_gm.graph))
-            print("---------------")
+            with open("debug_ddp_optimizer.log", "a") as dump_file:
+                dump_file.write("\n---final graph---")
+                dump_file.write(str(split_gm.graph))
 
         return split_gm

From f43dc9e5f9a47d53f3e679918f2afcd69cebfa36 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Fri, 21 Oct 2022 23:01:17 +0000
Subject: [PATCH 0044/1922] Enable inductor CI for TIMM (#87462)

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87462
Approved by: https://github.com/anijain2305
---
 .github/ci_commit_pins/timm.txt  |  2 +-
 .github/workflows/inductor.yml   | 12 ++++++++++--
 .jenkins/pytorch/test.sh         | 26 +++++++++++++++++++++++---
 benchmarks/dynamo/common.py      |  1 +
 benchmarks/dynamo/timm_models.py |  3 ++-
 5 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/.github/ci_commit_pins/timm.txt b/.github/ci_commit_pins/timm.txt
index 4b199567e9a7b..cdda1d14775c6 100644
--- a/.github/ci_commit_pins/timm.txt
+++ b/.github/ci_commit_pins/timm.txt
@@ -1 +1 @@
-ebee0a27940adfbb30444d83387b9ea0f1173f40
+6635bc3f7d06c6a0d0481803b24d6ad0004b61ac
diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index a5aa7acaec0b9..da27466b60e90 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -22,8 +22,16 @@ jobs:
       cuda-arch-list: 8.6
       test-matrix: |
         { include: [
-          { config: "inductor", shard: 1, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 2, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 3, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 4, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 5, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 6, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 7, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 8, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 9, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 10, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
         ]}
 
   linux-bionic-cuda11_6-py3_10-gcc7-inductor-test:
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index adcaf82ffdd38..2b0de2ec35a6f 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -277,6 +277,19 @@ test_inductor_huggingface_shard() {
   python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_huggingface_"$1".csv
 }
 
+test_inductor_timm_shard() {
+  if [[ -z "$NUM_TEST_SHARDS" ]]; then
+    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
+    exit 1
+  fi
+  TEST_REPORTS_DIR=/tmp/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+  python benchmarks/dynamo/timm_models.py --ci --training --accuracy \
+    --device cuda --inductor --float32 --total-partitions 8 --partition-id "$1" \
+    --output "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv
+}
+
 test_python_gloo_with_tls() {
   source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
   assert_git_not_dirty
@@ -729,17 +742,24 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHAR
   install_filelock
   install_triton
   test_dynamo_shard 2
-elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+elif [[ "${TEST_CONFIG}" == *inductor* && $SHARD_NUMBER -lt 9 && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
   install_filelock
   install_triton
-  test_inductor
-elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  install_timm
+  id=$((SHARD_NUMBER-1))
+  test_inductor_timm_shard $id
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 9 && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
   install_filelock
   install_triton
   install_huggingface
   test_inductor_huggingface_shard 0
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 10 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  install_torchvision
+  install_filelock
+  install_triton
+  test_inductor
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_without_numpy
   install_torchvision
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 01793b01e0e03..b1f8bbd993f3b 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -149,6 +149,7 @@ def set_model_name(name):
     "convit_base",  # fp64_OOM
     "gernet_l",  # accuracy
     "gluon_xception65",
+    "hrnet_w18",  # accuracy
     "lcnet_0500",  # accuracy
     "levit_128",  # levit_128
     "rexnet_100",  # accuracy
diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py
index f7ff2559cbb8a..34b2078d23e36 100755
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@@ -205,7 +205,8 @@ def load_model(
                     drop_rate=0.0,
                     drop_path_rate=None,
                     drop_block_rate=None,
-                    pretrained=True,
+                    # Skip downloading pretrained models for speedy CI
+                    pretrained=not self.args.ci,
                     # global_pool=kwargs.pop('gp', 'fast'),
                     # num_classes=kwargs.pop('num_classes', None),
                     # drop_rate=kwargs.pop('drop', 0.),

From 2b0af59a61ca4330388161725ff2966759ed7158 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Sat, 22 Oct 2022 06:00:59 +0000
Subject: [PATCH 0045/1922] [BE] Delete BUILD_SPLIT_CUDA option (#87502)

As we are linking with cuDNN and cuBLAS dynamically for all configs anyway, as statically linked cuDNN is different library than dynamically linked one, increases default memory footprint, etc, and libtorch_cuda even if compiled for all GPU architectures is no longer approaching 2Gb binary size limit, so BUILD_SPLIT_CUDA can go away.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87502
Approved by: https://github.com/atalman
---
 .../actions/test-pytorch-binary/action.yml    |   1 -
 .github/workflows/_binary-build-linux.yml     |   6 -
 .jenkins/pytorch/build.sh                     |   2 -
 .jenkins/pytorch/test.sh                      |   4 -
 .../win-test-helpers/build_pytorch.bat        |   4 -
 .jenkins/pytorch/win-test.sh                  |   4 -
 CMakeLists.txt                                |   7 -
 aten/src/ATen/native/cuda/Bucketization.cu    |   6 -
 aten/src/ATen/native/native_functions.yaml    |  11 --
 caffe2/CMakeLists.txt                         | 128 ++----------------
 .../check_forward_backward_compatibility.py   |   1 +
 torch/csrc/jit/codegen/cuda/nvfuser.cmake     |   4 +-
 torch/utils/cpp_extension.py                  |  23 +---
 13 files changed, 20 insertions(+), 181 deletions(-)

diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml
index bc2c546f57b28..be2090db533db 100644
--- a/.github/actions/test-pytorch-binary/action.yml
+++ b/.github/actions/test-pytorch-binary/action.yml
@@ -15,7 +15,6 @@ runs:
           -e BINARY_ENV_FILE \
           -e BUILDER_ROOT \
           -e BUILD_ENVIRONMENT \
-          -e BUILD_SPLIT_CUDA \
           -e DESIRED_CUDA \
           -e DESIRED_DEVTOOLSET \
           -e DESIRED_PYTHON \
diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml
index a665f53bab5e0..6bd2ccd691918 100644
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@@ -167,11 +167,6 @@ jobs:
           git clean -fxd
         working-directory: builder
 
-      - name: Set BUILD_SPLIT_CUDA
-        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && startsWith(inputs.GPU_ARCH_VERSION, '11') }}
-        shell: bash
-        run: |
-          echo "BUILD_SPLIT_CUDA='ON'" >> "$GITHUB_ENV"
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
@@ -184,7 +179,6 @@ jobs:
             -e BINARY_ENV_FILE \
             -e BUILDER_ROOT \
             -e BUILD_ENVIRONMENT \
-            -e BUILD_SPLIT_CUDA \
             -e DESIRED_CUDA \
             -e DESIRED_DEVTOOLSET \
             -e DESIRED_PYTHON \
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index 58cdc1227ac2d..13ee6309c0655 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -41,8 +41,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi
 
 if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
-  # enable split torch_cuda build option in CMake
-  export BUILD_SPLIT_CUDA=ON
   if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then
     # TODO: there is a linking issue when building with UCC using clang,
     # disable it for now and to be fix later.
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 2b0de2ec35a6f..b263c1949c10f 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -97,10 +97,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
   export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
 fi
 
-if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
-  export BUILD_SPLIT_CUDA=ON
-fi
-
 if [[ "$TEST_CONFIG" == *crossref* ]]; then
   export PYTORCH_TEST_WITH_CROSSREF=1
 fi
diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
index b85dad0616cd7..9c9c9cd64290b 100644
--- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
@@ -135,10 +135,6 @@ if "%REBUILD%" == "" (
     if not errorlevel 0 exit /b
   )
 )
-:: tests if BUILD_ENVIRONMENT contains cuda11 as a substring
-if not x%BUILD_ENVIRONMENT:cuda11=%==x%BUILD_ENVIRONMENT% (
-   set BUILD_SPLIT_CUDA=ON
-)
 
 python setup.py bdist_wheel && sccache --show-stats && python -c "import os, glob; os.system('python -mpip install ' + glob.glob('dist/*.whl')[0] + '[opt-einsum]')" (
   if "%BUILD_ENVIRONMENT%"=="" (
diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh
index dc28521204878..560b039dbf679 100755
--- a/.jenkins/pytorch/win-test.sh
+++ b/.jenkins/pytorch/win-test.sh
@@ -39,10 +39,6 @@ fi
 
 export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers
 
-if [[ "${BUILD_ENVIRONMENT}" == *cuda11* ]]; then
-  export BUILD_SPLIT_CUDA=ON
-fi
-
 if [[ "$TEST_CONFIG" = "force_on_cpu" ]]; then
   # run the full test suite for force_on_cpu test
   export USE_CUDA=0
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dae1dd4bc14fb..e2e3bf0e3f8d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,13 +187,6 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
 option(USE_ASAN "Use Address Sanitizer" OFF)
 option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
-# BUILD_SPLIT_CUDA must also be exported as an environment variable before building, with
-# `export BUILD_SPLIT_CUDA=1` because cpp_extension.py can only work properly if this variable
-# also exists in the environment.
-# This option is incompatible with CUDA_SEPARABLE_COMPILATION.
-cmake_dependent_option(
-    BUILD_SPLIT_CUDA "Split torch_cuda library into torch_cuda_cu and torch_cuda_cpp" OFF
-    "USE_CUDA AND NOT CUDA_SEPARABLE_COMPILATION" OFF)
 cmake_dependent_option(
      BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
 option(USE_FAST_NVCC "Use parallel NVCC build" OFF)
diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu
index 2a3d5730d7860..21c582216628e 100644
--- a/aten/src/ATen/native/cuda/Bucketization.cu
+++ b/aten/src/ATen/native/cuda/Bucketization.cu
@@ -10,7 +10,6 @@
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
-#include <ATen/ops/_torch_cuda_cu_linker_symbol_op_native.h>
 #include <ATen/ops/bucketize_native.h>
 #include <ATen/ops/empty.h>
 #include <ATen/ops/searchsorted_native.h>
@@ -191,11 +190,6 @@ Tensor searchsorted_cuda(
   return result;
 }
 
-// See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml
-Tensor _torch_cuda_cu_linker_symbol_op_cuda(const Tensor& self) {
-  return self;
-}
-
 Tensor searchsorted_cuda(
     const Tensor& sorted_sequence,
     const Scalar& self,
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index ba1d38aa350b5..b827999cf54e9 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -9930,17 +9930,6 @@
     CPU: searchsorted_cpu
     CUDA: searchsorted_cuda
 
-# [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu]
-# This is a DUMMY function to force the linking against torch_cuda_cu on Windows.
-# Otherwise, the Windows linker will optimize and not include torch_cuda_cu even when we
-# want it to be included. This is similar to what we do with warp_size for torch_cuda_cpp,
-# described as the solution to this issue: https://github.com/pytorch/pytorch/issues/31611
-# This op should NOT be used or exposed or edited or else Windows builds (with BUILD_SPLIT_CUDA) will break.
-- func: _torch_cuda_cu_linker_symbol_op(Tensor self) -> Tensor
-  dispatch:
-    CUDA: _torch_cuda_cu_linker_symbol_op_cuda
-  autogen: _torch_cuda_cu_linker_symbol_op.out
-
 - func: searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU: searchsorted_out_cpu
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 64d53de5a64bb..aa6dfd2841bac 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -883,10 +883,6 @@ file(WRITE ${DUMMY_EMPTY_FILE} ${DUMMY_FILE_CONTENT})
 # Wrapper library for people who link against torch and expect both CPU and CUDA support
 # Contains "torch_cpu" and "torch_cuda"
 add_library(torch ${DUMMY_EMPTY_FILE})
-if(BUILD_SPLIT_CUDA)
-  # When we split torch_cuda, we want a dummy torch_cuda library that contains both parts
-  add_library(torch_cuda ${DUMMY_EMPTY_FILE})
-endif()
 if(HAVE_SOVERSION)
   set_target_properties(torch PROPERTIES
       VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
@@ -926,37 +922,19 @@ elseif(USE_CUDA)
         ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
     set_property(TARGET torch_cuda_w_sort_by_key PROPERTY CUDA_SEPARABLE_COMPILATION OFF)
     target_link_libraries(torch_cuda PRIVATE torch_cuda_w_sort_by_key)
-  elseif(BUILD_SPLIT_CUDA)
-    add_library(torch_cuda_cpp ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY})
-    add_library(torch_cuda_cu ${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
   else()
     add_library(torch_cuda
         ${Caffe2_GPU_SRCS} ${Caffe2_GPU_SRCS_W_SORT_BY_KEY}
         ${Caffe2_GPU_CU_SRCS} ${Caffe2_GPU_CU_SRCS_W_SORT_BY_KEY})
   endif()
   set(CUDA_LINK_LIBRARIES_KEYWORD)
-  if(BUILD_SPLIT_CUDA)
-    torch_compile_options(torch_cuda_cpp)  # see cmake/public/utils.cmake
-    torch_compile_options(torch_cuda_cu)  # see cmake/public/utils.cmake
-    target_compile_definitions(torch_cuda_cpp PRIVATE BUILD_SPLIT_CUDA)
-    target_compile_definitions(torch_cuda_cpp PRIVATE USE_CUDA)
-    target_compile_definitions(torch_cuda_cu PRIVATE BUILD_SPLIT_CUDA)
-    target_compile_definitions(torch_cuda_cu PRIVATE USE_CUDA)
-  else()
-    torch_compile_options(torch_cuda)  # see cmake/public/utils.cmake
-    target_compile_definitions(torch_cuda PRIVATE USE_CUDA)
-  endif()
-  if(USE_NCCL AND BUILD_SPLIT_CUDA)
-    target_link_libraries(torch_cuda_cpp PRIVATE __caffe2_nccl)
-    target_compile_definitions(torch_cuda_cpp PRIVATE USE_NCCL)
-  elseif(USE_NCCL)
+  torch_compile_options(torch_cuda)  # see cmake/public/utils.cmake
+  target_compile_definitions(torch_cuda PRIVATE USE_CUDA)
+  if(USE_NCCL)
     target_link_libraries(torch_cuda PRIVATE __caffe2_nccl)
     target_compile_definitions(torch_cuda PRIVATE USE_NCCL)
   endif()
-  if(USE_UCC AND BUILD_SPLIT_CUDA)
-    target_link_libraries(torch_cuda_cpp PRIVATE __caffe2_ucc)
-    target_compile_definitions(torch_cuda_cpp PRIVATE USE_UCC)
-  elseif(USE_UCC)
+  if(USE_UCC)
     target_link_libraries(torch_cuda PRIVATE __caffe2_ucc)
     target_compile_definitions(torch_cuda PRIVATE USE_UCC)
   endif()
@@ -998,13 +976,8 @@ elseif(USE_CUDA)
   endif()
 
   if(USE_PRECOMPILED_HEADERS)
-    if(BUILD_SPLIT_CUDA)
-      target_precompile_headers(torch_cuda_cpp PRIVATE
-          "$<$<COMPILE_LANGUAGE:CXX>:ATen/core/ATen_pch.h>")
-    else()
-      target_precompile_headers(torch_cuda PRIVATE
-          "$<$<COMPILE_LANGUAGE:CXX>:ATen/core/ATen_pch.h>")
-    endif()
+    target_precompile_headers(torch_cuda PRIVATE
+        "$<$<COMPILE_LANGUAGE:CXX>:ATen/core/ATen_pch.h>")
   endif()
 endif()
 
@@ -1085,12 +1058,7 @@ if(NOT NO_API)
     ${TORCH_SRC_DIR}/csrc/api/include)
 endif()
 
-if(BUILD_SPLIT_CUDA AND MSVC)
-  # -INCLUDE is used to ensure torch_cuda_cpp/cu are linked against in a project that relies on them.
-  target_link_libraries(torch_cuda_cpp INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ")
-  # See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml
-  target_link_libraries(torch_cuda_cu INTERFACE "-INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z")
-elseif(USE_CUDA AND MSVC)
+if(USE_CUDA AND MSVC)
   # -INCLUDE is used to ensure torch_cuda is linked against in a project that relies on them.
   # Related issue: https://github.com/pytorch/pytorch/issues/31611
   target_link_libraries(torch_cuda INTERFACE "-INCLUDE:?warp_size@cuda@at@@YAHXZ")
@@ -1320,27 +1288,16 @@ if(USE_DISTRIBUTED)
   if(USE_UCC AND USE_C10D_UCC)
     target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
     if(USE_CUDA)
-      if(BUILD_SPLIT_CUDA)
-        target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_UCC)
-      else()
-        target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
-      endif()
+      target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
     endif()
   endif()
   if(USE_NCCL AND USE_C10D_NCCL)
     if(USE_ROCM)
       target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
     else()
-      if(BUILD_SPLIT_CUDA)
-        target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_NCCL)
-        if(USE_NCCL_WITH_UCC)
-          target_compile_definitions(torch_cuda_cpp PUBLIC USE_NCCL_WITH_UCC)
-        endif()
-      else()
-        target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
-        if(USE_NCCL_WITH_UCC)
-          target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC)
-        endif()
+      target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
+      if(USE_NCCL_WITH_UCC)
+        target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC)
       endif()
     endif()
   endif()
@@ -1423,14 +1380,7 @@ torch_set_target_props(torch_cpu)
 
 
 target_compile_options(torch_cpu PRIVATE "-DCAFFE2_BUILD_MAIN_LIB")
-if(BUILD_SPLIT_CUDA)
-  target_compile_options(torch_cuda_cu PRIVATE "-DTORCH_CUDA_CU_BUILD_MAIN_LIB")
-  target_compile_options(torch_cuda_cpp PRIVATE "-DTORCH_CUDA_CPP_BUILD_MAIN_LIB")
-  # NB: This must be target_compile_definitions, not target_compile_options,
-  # as the latter is not respected by nvcc
-  target_compile_definitions(torch_cuda_cu PRIVATE "-DTORCH_CUDA_CU_BUILD_MAIN_LIB")
-  target_compile_definitions(torch_cuda_cpp PRIVATE "-DTORCH_CUDA_CPP_BUILD_MAIN_LIB")
-elseif(USE_CUDA)
+if(USE_CUDA)
   target_compile_options(torch_cuda PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
   # NB: This must be target_compile_definitions, not target_compile_options,
   # as the latter is not respected by nvcc
@@ -1441,10 +1391,7 @@ elseif(USE_ROCM)
 endif()
 
 if(USE_EXPERIMENTAL_CUDNN_V8_API)
-  if(BUILD_SPLIT_CUDA)
-    target_compile_definitions(torch_cuda_cu PRIVATE "-DUSE_EXPERIMENTAL_CUDNN_V8_API")
-    target_compile_definitions(torch_cuda_cpp PRIVATE "-DUSE_EXPERIMENTAL_CUDNN_V8_API")
-  elseif(USE_CUDA)
+  if(USE_CUDA)
     target_compile_definitions(torch_cuda PRIVATE "-DUSE_EXPERIMENTAL_CUDNN_V8_API")
   endif()
 endif()
@@ -1534,10 +1481,6 @@ caffe2_interface_library(torch_cpu torch_cpu_library)
 
 if(USE_CUDA)
   caffe2_interface_library(torch_cuda torch_cuda_library)
-  if(BUILD_SPLIT_CUDA)
-    caffe2_interface_library(torch_cuda_cu torch_cuda_cu_library)
-    caffe2_interface_library(torch_cuda_cpp torch_cuda_cpp_library)
-  endif()
 elseif(USE_ROCM)
   caffe2_interface_library(torch_hip torch_hip_library)
 endif()
@@ -1548,10 +1491,6 @@ install(TARGETS torch_cpu torch_cpu_library EXPORT Caffe2Targets DESTINATION "${
 
 if(USE_CUDA)
   install(TARGETS torch_cuda torch_cuda_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-  if(BUILD_SPLIT_CUDA)
-    install(TARGETS torch_cuda_cu torch_cuda_cu_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-    install(TARGETS torch_cuda_cpp torch_cuda_cpp_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-  endif()
 elseif(USE_ROCM)
   install(TARGETS torch_hip torch_hip_library EXPORT Caffe2Targets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 endif()
@@ -1561,11 +1500,6 @@ target_link_libraries(torch PUBLIC torch_cpu_library)
 
 if(USE_CUDA)
   target_link_libraries(torch PUBLIC torch_cuda_library)
-  if(BUILD_SPLIT_CUDA)
-    # NS: Library order is important here to prevent cudnn double linking
-    target_link_libraries(torch_cuda PUBLIC torch_cuda_cpp_library)
-    target_link_libraries(torch_cuda PUBLIC torch_cuda_cu_library)
-  endif()
 elseif(USE_ROCM)
   target_link_libraries(torch PUBLIC torch_hip_library)
 endif()
@@ -1578,10 +1512,7 @@ endif()
 # Install PDB files for MSVC builds
 if(MSVC AND BUILD_SHARED_LIBS)
   install(FILES $<TARGET_PDB_FILE:torch_cpu> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
-  if(BUILD_SPLIT_CUDA)
-    install(FILES $<TARGET_PDB_FILE:torch_cuda_cu> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
-    install(FILES $<TARGET_PDB_FILE:torch_cuda_cpp> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
-  elseif(USE_CUDA)
+  if(USE_CUDA)
     install(FILES $<TARGET_PDB_FILE:torch_cuda> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
   elseif(USE_ROCM)
     install(FILES $<TARGET_PDB_FILE:torch_hip> DESTINATION "${TORCH_INSTALL_LIB_DIR}" OPTIONAL)
@@ -1589,36 +1520,7 @@ if(MSVC AND BUILD_SHARED_LIBS)
 endif()
 
 # ---[ CUDA library.
-if(BUILD_SPLIT_CUDA)
-  target_link_libraries(torch_cuda_cu INTERFACE torch::cudart)
-  target_link_libraries(torch_cuda_cpp INTERFACE torch::cudart)
-  target_link_libraries(torch_cuda_cu PUBLIC c10_cuda torch::nvtoolsext)
-  target_link_libraries(torch_cuda_cpp PUBLIC c10_cuda torch::nvtoolsext)
-
-  target_include_directories(
-      torch_cuda_cu INTERFACE $<INSTALL_INTERFACE:include>)
-  target_include_directories(
-      torch_cuda_cpp INTERFACE $<INSTALL_INTERFACE:include>)
-  target_include_directories(
-      torch_cuda_cu PRIVATE ${Caffe2_GPU_INCLUDE})
-  target_include_directories(
-      torch_cuda_cpp PRIVATE ${Caffe2_GPU_INCLUDE})
-  target_link_libraries(
-      torch_cuda_cu PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})
-  target_link_libraries(
-      torch_cuda_cpp PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})
-  target_link_libraries(torch_cuda_cu PRIVATE torch_cuda_cpp)
-  if(USE_CUDNN)
-    target_link_libraries(
-        torch_cuda_cpp PRIVATE  caffe2::cudnn-private)
-  endif()
-
-  # These public dependencies must go after the previous dependencies, as the
-  # order of the libraries in the linker call matters here when statically
-  # linking; libculibos and cublas must be last.
-  target_link_libraries(torch_cuda_cpp PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
-  target_link_libraries(torch_cuda_cu PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
-elseif(USE_CUDA)
+if(USE_CUDA)
   target_link_libraries(torch_cuda INTERFACE torch::cudart)
   target_link_libraries(torch_cuda PUBLIC c10_cuda torch::nvtoolsext)
 
diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
index f10fd14393580..5f13834ee77e0 100644
--- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py
+++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
@@ -290,6 +290,7 @@
     ("aten::nested_to_padded_tensor", datetime.date(2022, 10, 1)),
     ("aten::nested_tensor", datetime.date(2022, 10, 15)),
     ("aten::_nested_tensor_layer_norm", datetime.date(2022, 10, 15)),
+    ("aten::_torch_cuda_cu_linker_symbol_op", datetime.date(2022, 11, 1)),
 
 ]
 
diff --git a/torch/csrc/jit/codegen/cuda/nvfuser.cmake b/torch/csrc/jit/codegen/cuda/nvfuser.cmake
index 526a674e4fb4c..147003054766b 100644
--- a/torch/csrc/jit/codegen/cuda/nvfuser.cmake
+++ b/torch/csrc/jit/codegen/cuda/nvfuser.cmake
@@ -1,6 +1,4 @@
-if(BUILD_SPLIT_CUDA)
-  set(TORCHLIB_FLAVOR torch_cuda_cu) # chose torch_cuda_cu here since JIT is in torch_cuda_cpp
-elseif(USE_CUDA)
+if(USE_CUDA)
   set(TORCHLIB_FLAVOR torch_cuda)
 elseif(USE_ROCM)
   set(TORCHLIB_FLAVOR torch_hip)
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 9ab43e5ccdd67..36811bf22dedc 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -38,9 +38,6 @@
 TORCH_LIB_PATH = os.path.join(_TORCH_PATH, 'lib')
 
 
-BUILD_SPLIT_CUDA = os.getenv('BUILD_SPLIT_CUDA') or (os.path.exists(os.path.join(
-    TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cu{CLIB_EXT}')) and os.path.exists(os.path.join(TORCH_LIB_PATH, f'{CLIB_PREFIX}torch_cuda_cpp{CLIB_EXT}')))
-
 SUBPROCESS_DECODE_ARGS = ('oem',) if IS_WINDOWS else ()
 MINIMUM_GCC_VERSION = (5, 0, 0)
 MINIMUM_MSVC_VERSION = (19, 0, 24215)
@@ -1060,11 +1057,7 @@ def CUDAExtension(name, sources, *args, **kwargs):
     else:
         libraries.append('cudart')
         libraries.append('c10_cuda')
-        if BUILD_SPLIT_CUDA:
-            libraries.append('torch_cuda_cu')
-            libraries.append('torch_cuda_cpp')
-        else:
-            libraries.append('torch_cuda')
+        libraries.append('torch_cuda')
     kwargs['libraries'] = libraries
 
     include_dirs = kwargs.get('include_dirs', [])
@@ -1657,15 +1650,7 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone):
         if with_cuda:
             extra_ldflags.append('c10_cuda.lib')
         extra_ldflags.append('torch_cpu.lib')
-        if BUILD_SPLIT_CUDA and with_cuda:
-            extra_ldflags.append('torch_cuda_cu.lib')
-            # See [Note about _torch_cuda_cu_linker_symbol_op and torch_cuda_cu] in native_functions.yaml
-            extra_ldflags.append('-INCLUDE:?_torch_cuda_cu_linker_symbol_op_cuda@native@at@@YA?AVTensor@2@AEBV32@@Z')
-            extra_ldflags.append('torch_cuda_cpp.lib')
-            # /INCLUDE is used to ensure torch_cuda_cpp is linked against in a project that relies on it.
-            # Related issue: https://github.com/pytorch/pytorch/issues/31611
-            extra_ldflags.append('-INCLUDE:?warp_size@cuda@at@@YAHXZ')
-        elif with_cuda:
+        if with_cuda:
             extra_ldflags.append('torch_cuda.lib')
             # /INCLUDE is used to ensure torch_cuda is linked against in a project that relies on it.
             # Related issue: https://github.com/pytorch/pytorch/issues/31611
@@ -1682,9 +1667,7 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone):
         if with_cuda:
             extra_ldflags.append('-lc10_hip' if IS_HIP_EXTENSION else '-lc10_cuda')
         extra_ldflags.append('-ltorch_cpu')
-        if BUILD_SPLIT_CUDA and with_cuda:
-            extra_ldflags.append('-ltorch_hip' if IS_HIP_EXTENSION else '-ltorch_cuda_cu -ltorch_cuda_cpp')
-        elif with_cuda:
+        if with_cuda:
             extra_ldflags.append('-ltorch_hip' if IS_HIP_EXTENSION else '-ltorch_cuda')
         extra_ldflags.append('-ltorch')
         if not is_standalone:

From be7d04682b8f987f37091eaa049e1639f6e355a7 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Sat, 22 Oct 2022 06:06:15 +0000
Subject: [PATCH 0046/1922] [CI] Run all MacOS builds on MacOS-12 (#87496)

Not sure why we needed macos-10.15 for libtorch

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87496
Approved by: https://github.com/atalman, https://github.com/seemethere
---
 .../macos_binary_build_workflow.yml.j2        | 10 +-------
 ...rated-macos-arm64-binary-conda-nightly.yml |  3 +++
 ...rated-macos-arm64-binary-wheel-nightly.yml |  4 ++++
 .../generated-macos-binary-conda-nightly.yml  |  4 ++++
 ...acos-binary-libtorch-cxx11-abi-nightly.yml | 24 +++++++++----------
 ...acos-binary-libtorch-pre-cxx11-nightly.yml | 24 +++++++++----------
 .../generated-macos-binary-wheel-nightly.yml  |  4 ++++
 7 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2
index 149c007daef9e..5e6b505664e60 100644
--- a/.github/templates/macos_binary_build_workflow.yml.j2
+++ b/.github/templates/macos_binary_build_workflow.yml.j2
@@ -58,17 +58,8 @@ jobs:
 {%- for config in build_configs %}
   !{{ config["build_name"] }}-build:
     if: ${{ github.repository_owner == 'pytorch' }}
-    {%- if config["package_type"] == "libtorch" %}
-    runs-on: macos-10.15
-    {%- else %}
     runs-on: macos-12-xl
-    {%- endif %}
-{%- if config["package_type"] == "libtorch" %}
-    # libtorch builds take a long time on github hosted runners
-    timeout-minutes: 720
-{%- else %}
     timeout-minutes: !{{ common.timeout_minutes }}
-{%- endif %}
     !{{ upload.binary_env(config, true) }}
       # For sccache access (only on non-forked PRs)
       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
@@ -82,6 +73,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       !{{ common.checkout(deep_clone=False, directory="pytorch") }}
       !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }}
       - name: Install sccache (only for non-forked PRs, and pushes to trunk)
diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
index 52fe582aa59ee..5d47cc77cf3a7 100644
--- a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
@@ -71,6 +71,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -180,6 +181,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -289,6 +291,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
index fc5f84d9484ea..e58d153269b38 100644
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@@ -71,6 +71,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -180,6 +181,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -289,6 +291,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -398,6 +401,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml
index 8fab29ddaed9f..079687e6ff951 100644
--- a/.github/workflows/generated-macos-binary-conda-nightly.yml
+++ b/.github/workflows/generated-macos-binary-conda-nightly.yml
@@ -69,6 +69,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -178,6 +179,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -287,6 +289,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -396,6 +399,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
index ae63f95bc3189..dcb480b0a07ce 100644
--- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
@@ -34,9 +34,8 @@ concurrency:
 jobs:
   libtorch-cpu-shared-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-10.15
-    # libtorch builds take a long time on github hosted runners
-    timeout-minutes: 720
+    runs-on: macos-12-xl
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -74,6 +73,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -149,9 +149,8 @@ jobs:
     uses: ./.github/workflows/_binary-upload.yml
   libtorch-cpu-shared-without-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-10.15
-    # libtorch builds take a long time on github hosted runners
-    timeout-minutes: 720
+    runs-on: macos-12-xl
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -189,6 +188,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -264,9 +264,8 @@ jobs:
     uses: ./.github/workflows/_binary-upload.yml
   libtorch-cpu-static-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-10.15
-    # libtorch builds take a long time on github hosted runners
-    timeout-minutes: 720
+    runs-on: macos-12-xl
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -304,6 +303,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -379,9 +379,8 @@ jobs:
     uses: ./.github/workflows/_binary-upload.yml
   libtorch-cpu-static-without-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-10.15
-    # libtorch builds take a long time on github hosted runners
-    timeout-minutes: 720
+    runs-on: macos-12-xl
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -419,6 +418,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
index 39ad514a56702..5f02ea874b4e4 100644
--- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
+++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
@@ -34,9 +34,8 @@ concurrency:
 jobs:
   libtorch-cpu-shared-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-10.15
-    # libtorch builds take a long time on github hosted runners
-    timeout-minutes: 720
+    runs-on: macos-12-xl
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -74,6 +73,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -149,9 +149,8 @@ jobs:
     uses: ./.github/workflows/_binary-upload.yml
   libtorch-cpu-shared-without-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-10.15
-    # libtorch builds take a long time on github hosted runners
-    timeout-minutes: 720
+    runs-on: macos-12-xl
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -189,6 +188,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -264,9 +264,8 @@ jobs:
     uses: ./.github/workflows/_binary-upload.yml
   libtorch-cpu-static-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-10.15
-    # libtorch builds take a long time on github hosted runners
-    timeout-minutes: 720
+    runs-on: macos-12-xl
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -304,6 +303,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -379,9 +379,8 @@ jobs:
     uses: ./.github/workflows/_binary-upload.yml
   libtorch-cpu-static-without-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-10.15
-    # libtorch builds take a long time on github hosted runners
-    timeout-minutes: 720
+    runs-on: macos-12-xl
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       BUILDER_ROOT: ${{ github.workspace }}/builder
@@ -419,6 +418,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml
index 70d6783dbe881..081f470d6109f 100644
--- a/.github/workflows/generated-macos-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-binary-wheel-nightly.yml
@@ -69,6 +69,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -178,6 +179,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -287,6 +289,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:
@@ -396,6 +399,7 @@ jobs:
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
         with:

From b0ae042db3c33f38b91b7dcbd01d224f8a15af6d Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Sat, 22 Oct 2022 02:21:07 +0000
Subject: [PATCH 0047/1922] Fix meta for meta_fill_ (#87493)

Existing meta_fill_ doesn't correctly reflect the aliasing relationship for aten.fill. A new MetaTensor should be return instead.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87493
Approved by: https://github.com/eellison, https://github.com/bdhirsh
---
 test/test_meta.py            | 10 ++++++++++
 torch/_meta_registrations.py | 10 ++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index 6b283da39cbe0..23e7025140138 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -1013,6 +1013,16 @@ def test_huber_loss_backward(self):
         self.assertEqual(r.device.type, 'meta')
         self.assertEqual(r.shape, inps[0].shape)
 
+    def test_fill_alias_relationship(self):
+        inps = torch.rand(2**52, device='meta')
+        r = torch.ops.aten.fill_(inps, 1.0)
+        # aten.fill_ returns an aliase
+        self.assertEqual(id(inps), id(r))
+
+        # aten.fill returns a new tensor
+        r2 = torch.ops.aten.fill(inps, 1.0)
+        self.assertNotEqual(id(inps), id(r2))
+
     def test_map_location_deserialize(self):
         import io
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index c17aa091120cc..7be63af9e051a 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1111,14 +1111,16 @@ def meta_zero_(self):
     return self
 
 
-@register_meta(
-    [aten.fill.Tensor, aten.fill.Scalar, aten.fill_.Tensor, aten.fill_.Scalar],
-    register_dispatcher=False,
-)
+@register_meta([aten.fill_.Tensor, aten.fill_.Scalar], register_dispatcher=False)
 def meta_fill_(self, val):
     return self
 
 
+@register_meta([aten.fill.Tensor, aten.fill.Scalar], register_dispatcher=False)
+def meta_fill(self, val):
+    return self.new_empty(self.shape)
+
+
 @register_meta(aten.relu_.default, register_dispatcher=False)
 def meta_relu_(self):
     return self

From 80f29948806963fc1e95048c14951473b83dea41 Mon Sep 17 00:00:00 2001
From: Ryan Spring <rdspring1@gmail.com>
Date: Sat, 22 Oct 2022 17:59:25 +0000
Subject: [PATCH 0048/1922] Add xlogy and xlog1py references (#77712)

 * Add reference implementations for `xlogy` and `xlog1py`
 * Replace `_wrap_scalar` helper function with `scalar_tensor` prim
Pull Request resolved: https://github.com/pytorch/pytorch/pull/77712
Approved by: https://github.com/mruberry
---
 test/functorch/test_aotdispatch.py            |  1 -
 test/test_proxy_tensor.py                     |  1 -
 torch/_decomp/decompositions.py               |  1 -
 torch/_refs/__init__.py                       | 29 +++++++++++++--
 torch/_refs/special/__init__.py               | 36 +++++++++++++++++--
 .../_internal/common_methods_invocations.py   | 15 ++++++++
 6 files changed, 75 insertions(+), 8 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index e9a46b0882e2e..c058b3618ecb1 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1212,7 +1212,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('sort', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('special.i1', ''),  # aten.i0.default - couldn't find symbolic meta function/decomposition
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic ...
-    xfail('special.xlog1py', ''),  # aten.special_xlog1py.default - couldn't find symbolic meta function/deco...
     xfail('split', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('std', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('std_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 914261ae1c6ab..0092daa77ab49 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1328,7 +1328,6 @@ def f(a, b, c, d, e):
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic meta function/...
     xfail('special.scaled_modified_bessel_k0', ''),  # aten.special_scaled_modified_bessel_k0.default - couldn't find symbo...
     xfail('special.scaled_modified_bessel_k1', ''),  # aten.special_scaled_modified_bessel_k1.default - couldn't find symbo...
-    xfail('special.xlog1py', ''),  # aten.special_xlog1py.default - couldn't find symbolic meta function/decomposition
     xfail('split', ''),  # 'torch._C.SymIntNode' and 'int'
     xfail('stft', ''),  # argument 'size' must be tuple of ints, but found element of type torch._C.SymIntNode at...
     xfail('sum_to_size', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 9e9c36104ddc5..2b4d2914fe858 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1433,7 +1433,6 @@ def _to_copy(
     return x
 
 
-@register_decomposition(aten.xlogy.Tensor)
 @pw_cast_for_int_to_real
 def xlogy(self: Tensor, other: Tensor) -> Tensor:
     return aten.where(
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 08e1361c76220..d6a8f476b3176 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -162,12 +162,10 @@
     "rsub",
     "rtruediv",
     "rfloordiv",
-    # # special.xlog1py
-    # # special.zeta
     "sub",
     "true_divide",
     "trunc_divide",
-    # 'xlogy', # where?, log, mul
+    "xlogy",
     #
     # Elementwise Ternary References
     #
@@ -1546,6 +1544,31 @@ def sub(
 )
 
 
+@register_decomposition(torch.ops.aten.xlogy)
+@out_wrapper()
+@elementwise_type_promotion_wrapper(
+    type_promoting_args=("a", "b"),
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+)
+def xlogy(a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]):
+    utils.check(
+        isinstance(a, TensorLike) or isinstance(b, TensorLike),
+        lambda: 'Expected either argument a or b to be a Tensor"',
+    )
+
+    # Operations like eq and log do not handle scalar values, so we convert them to scalar_tensors.
+    if isinstance(b, TensorLike) and isinstance(a, Number):
+        a = scalar_tensor(a, dtype=b.dtype, device=b.device)
+    elif isinstance(a, TensorLike) and isinstance(b, Number):
+        b = scalar_tensor(b, dtype=a.dtype, device=a.device)
+
+    # mypy: expected "Tensor"
+    assert isinstance(a, TensorLike)
+    assert isinstance(b, TensorLike)
+    rhs = torch.where(torch.eq(a, 0), 0, torch.mul(a, torch.log(b)))
+    return torch.where(torch.isnan(b), float("nan"), rhs)
+
+
 def _trunc_divide(
     a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]
 ):
diff --git a/torch/_refs/special/__init__.py b/torch/_refs/special/__init__.py
index fae9f9d12dbe6..1227a2631475b 100644
--- a/torch/_refs/special/__init__.py
+++ b/torch/_refs/special/__init__.py
@@ -1,5 +1,5 @@
 import math
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 import torch._prims as prims
@@ -8,7 +8,13 @@
 
 from torch import Tensor
 from torch._decomp import register_decomposition
-from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND, TensorLikeType
+from torch._prims_common import (
+    ELEMENTWISE_TYPE_PROMOTION_KIND,
+    Number,
+    NumberType,
+    TensorLike,
+    TensorLikeType,
+)
 from torch._prims_common.wrappers import elementwise_type_promotion_wrapper, out_wrapper
 from torch._refs import (
     _make_elementwise_binary_reference,
@@ -33,6 +39,7 @@
     "ndtri",
     "softmax",
     "spherical_bessel_j0",
+    "xlog1py",
     "zeta",
 ]
 
@@ -134,6 +141,31 @@ def logit(self: TensorLikeType, eps: Optional[float] = None) -> TensorLikeType:
     return torch.log(torch.true_divide(self, torch.sub(1, self)))
 
 
+@register_decomposition(torch.ops.aten.special_xlog1py)
+@out_wrapper()
+@elementwise_type_promotion_wrapper(
+    type_promoting_args=("a", "b"),
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+)
+def xlog1py(a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]):
+    utils.check(
+        isinstance(a, TensorLike) or isinstance(b, TensorLike),
+        lambda: 'Expected either argument a or b to be a Tensor"',
+    )
+
+    # Operations like eq and log do not handle scalar values, so we convert them to scalar_tensors.
+    if isinstance(a, TensorLike) and isinstance(b, Number):
+        b = refs.scalar_tensor(b, dtype=a.dtype, device=a.device)
+    elif isinstance(b, TensorLike) and isinstance(a, Number):
+        a = refs.scalar_tensor(a, dtype=b.dtype, device=b.device)
+
+    # mypy: expected "Tensor"
+    assert isinstance(a, TensorLike)
+    assert isinstance(b, TensorLike)
+    rhs = torch.where(torch.eq(a, 0), 0, torch.mul(a, refs.log1p(b)))
+    return torch.where(torch.isnan(b), float("nan"), rhs)
+
+
 @register_decomposition(torch.ops.aten.mvlgamma)
 @out_wrapper()
 @elementwise_type_promotion_wrapper(
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 34f54f2fb5ae1..f637339f16d24 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -17461,6 +17461,21 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                          dtypes=(torch.uint8,), device_type="cpu"),
         )
     ),
+    ElementwiseBinaryPythonRefInfo(
+        "_refs.xlogy",
+        torch_opinfo_name="xlogy",
+        supports_one_python_scalar=True,
+        supports_nvfuser=False,
+    ),
+    #
+    # Elementwise Binary Special OpInfos
+    #
+    ElementwiseBinaryPythonRefInfo(
+        "_refs.special.xlog1py",
+        torch_opinfo_name="special.xlog1py",
+        supports_one_python_scalar=True,
+        supports_nvfuser=False,
+    ),
     #
     # Data Conversion & Data Movement Opinfos
     #

From f7b42a07cda47d3c2b1478004638b3bcb102fd55 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sun, 23 Oct 2022 03:18:57 +0000
Subject: [PATCH 0049/1922] [vision hash update] update the pinned vision hash
 (#87528)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87528
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index db0aa4e7d73c4..02a12c728a3a5 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-7a62a545ce76f43ccc5cfe0009131f7db14ae7b5
+9c112935abe400222cca8f9fbc2d8386e0f25e80

From 3c6063a281b024666ca4c1f2fdea843d83233d7f Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Sun, 23 Oct 2022 02:53:37 +0000
Subject: [PATCH 0050/1922] Fix stupid N^2 naming behavior in FX and removed
 assert that slows things a lot sometimes (#87533)

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87533
Approved by: https://github.com/ezyang, https://github.com/voznesenskym
---
 torch/_dynamo/variables/lists.py |  5 ++++-
 torch/fx/graph.py                | 10 +++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py
index e1c0d584073e4..f63283819f350 100644
--- a/torch/_dynamo/variables/lists.py
+++ b/torch/_dynamo/variables/lists.py
@@ -395,7 +395,10 @@ class ListIteratorVariable(VariableTracker):
     def __init__(self, items, index: int = 0, **kwargs):
         super(ListIteratorVariable, self).__init__(**kwargs)
         assert isinstance(items, list)
-        assert all(isinstance(x, VariableTracker) for x in items)
+        # Removing this check as it slows things down too much
+        # https://github.com/pytorch/pytorch/pull/87533#issuecomment-1287574492
+
+        # assert all(isinstance(x, VariableTracker) for x in items)
         self.items = items
         self.index = index
 
diff --git a/torch/fx/graph.py b/torch/fx/graph.py
index 9397050bc29a5..3b8c96b6a43bf 100644
--- a/torch/fx/graph.py
+++ b/torch/fx/graph.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from .node import Node, Argument, Target, map_arg, _type_repr, _get_qualified_name
 import torch.utils._pytree as pytree
 from . import _pytree as fx_pytree
@@ -120,7 +121,8 @@ class _Namespace:
     def __init__(self):
         self._obj_to_name: Dict[Any, str] = {}
         self._unassociated_names = set()
-        self._used_names: Dict[str, int] = {}
+        self._used_names: Set[str] = set()
+        self._base_count: Dict[str, int] = defaultdict(int)
 
         self._illegal_char_regex = re.compile('[^0-9a-zA-Z_]+')
         self._name_suffix_regex = re.compile(r"(.*)_(\d+)$")
@@ -150,13 +152,15 @@ def create_name(self, candidate: str, obj: Optional[Any]) -> str:
             num = int(num_str)
 
         candidate = base if num is None else f'{base}_{num}'
-        num = num if num else 0
+        if not num:
+            num = self._base_count[base]
 
         while candidate in self._used_names or self._is_illegal_name(candidate, obj):
             num += 1
             candidate = f'{base}_{num}'
 
-        self._used_names.setdefault(candidate, 0)
+        self._used_names.add(candidate)
+        self._base_count[base] = num
         if obj is None:
             self._unassociated_names.add(candidate)
         else:

From aeb5f0f0e7efc61fac5a2820ddc41b71eb840cfa Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 22 Oct 2022 17:37:54 -0700
Subject: [PATCH 0051/1922] [Profiler] Use parameter as key for optimizer state
 recording. (#86753)

While optimizer can store state however it likes, in practice most optimizer state corresponds to a particular parameter. (This is the case for all `torch.optim` optimizers.) Thus, it turns out to be ergonomic to collect using that structure. Note that this doesn't lock us into anything; we can always collect state with non Tensor keys if the use case arises.

One simplification that arises is that Module and Optimizer collection has very similar structure. So similar, in fact, that it is possible to use a common template for config. I also found that a lot of the `check_and_store` logic could be simplified and inlined by this joining of collected optimizer state.

Differential Revision: [D40210703](https://our.internmc.facebook.com/intern/diff/D40210703/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86753
Approved by: https://github.com/slgong-fb, https://github.com/aaronenyeshi
---
 test/profiler/test_profiler.py          |  27 ++--
 torch/csrc/autograd/profiler_python.cpp | 207 +++++++++++-------------
 torch/csrc/profiler/collection.h        |  27 ++--
 torch/csrc/profiler/python/init.cpp     |  46 ++----
 torch/csrc/profiler/python/init.h       |   5 +
 5 files changed, 144 insertions(+), 168 deletions(-)

diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py
index 5f3d7621dcfb3..09379bb02a531 100644
--- a/test/profiler/test_profiler.py
+++ b/test/profiler/test_profiler.py
@@ -1574,7 +1574,7 @@ def flat_out_extrafields(nodes, out=None):
                 out = []
             for node in nodes:
                 if isinstance(node.extra_fields, _ExtraFields_PyCall) and node.extra_fields.module:
-                    if node.extra_fields.module.params:
+                    if node.extra_fields.module.parameters:
                         out.append(node.extra_fields.module)
                 flat_out_extrafields(node.children, out)
             return out
@@ -1589,7 +1589,7 @@ def flat_out_extrafields(nodes, out=None):
         modules = flat_out_extrafields(p.profiler.kineto_results.experimental_event_tree())
         self.assertEqual(len(modules), 2, f"Expected two parameter list, but got {len(modules)}")
 
-        params = [(n, p.storage_data_ptr, g.storage_data_ptr) for module in modules for (n, p, g) in module.params]
+        params = [(n, p.storage_data_ptr, g.storage_data_ptr) for module in modules for (n, p, g) in module.parameters]
         expected = [(name, val.storage().data_ptr(), val.grad.storage().data_ptr()) for name, val in net.fc1._parameters.items()]
         expected += [(name, val.storage().data_ptr(), val.grad.storage().data_ptr()) for name, val in net.fc2._parameters.items()]
         self.assertEqual(expected, params, f"{expected} vs. {params}")
@@ -1599,29 +1599,34 @@ def _flat_out_extrafields(self, nodes, out=None):
             out = []
         for node in nodes:
             if (isinstance(node.extra_fields, _ExtraFields_PyCall) and
-                    node.extra_fields.optimizer and node.extra_fields.optimizer.param_addrs):
+                    node.extra_fields.optimizer and node.extra_fields.optimizer.parameters):
                 # avoiding OptInfo duplicates from iterations
-                addr = node.extra_fields.optimizer.param_addrs[0].storage_data_ptr
-                if not [o for o in out if addr == o.param_addrs[0].storage_data_ptr]:
+                addr = node.extra_fields.optimizer.parameters[0][0].storage_data_ptr
+                if not [o for o in out if addr == o.parameters[0][0].storage_data_ptr]:
                     out.append(node.extra_fields.optimizer)
             self._flat_out_extrafields(node.children, out)
         return out
 
     def _check_results(self, opt, opts, check_items=False):
         self.assertEqual(len(opts), 1, f"Expected 1 optimizer: len(opts): {len(opts)}")
-        self.assertEqual(id(opt), opts[0].self, f"Optimizer addr ({id(opt)}) vs. profiled addr ({opts[0].self})")
+        self.assertEqual(id(opt), opts[0].self_ptr, f"Optimizer addr ({id(opt)}) vs. profiled addr ({opts[0].self_ptr})")
         if check_items:
             self.assertEqual(len(opt.param_groups), len(opts))
             for group, opt_ in zip(opt.param_groups, opts):
                 self.assertEqual(
                     [(v.storage().data_ptr()) for v in group.get("params", [])],
-                    [(o.storage_data_ptr) for o in opt_.param_addrs]
+                    [(o.storage_data_ptr) for (o, _, _) in opt_.parameters]
                 )
             for opt_ in opts:
-                self.assertEqual(
-                    [(name, val.storage().data_ptr()) for dic in opt.state.values() for name, val in dic.items()],
-                    [(n, p.storage_data_ptr) for (n, p) in opt_.opt_state]
-                )
+                observed_state = {
+                    p.storage_data_ptr: {name: s.storage_data_ptr for name, s in state}
+                    for (p, _, state) in opt_.parameters
+                }
+                for parameter, parameter_state in opt.state.items():
+                    self.assertEqual(
+                        {name: value.storage().data_ptr() for name, value in parameter_state.items()},
+                        observed_state.get(parameter.storage().data_ptr(), [])
+                    )
 
     def test_optimizer(self):
         inputs = torch.rand(10)
diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp
index aee3702b8b105..d971336ae0c4d 100644
--- a/torch/csrc/autograd/profiler_python.cpp
+++ b/torch/csrc/autograd/profiler_python.cpp
@@ -19,6 +19,7 @@
 #include <c10/util/C++17.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Logging.h>
+#include <c10/util/Optional.h>
 #include <c10/util/StringUtil.h>
 #include <c10/util/flat_hash_map.h>
 #include <c10/util/irange.h>
@@ -204,20 +205,36 @@ struct Config<CallType::PyCall> {
   static constexpr EventType event_type = EventType::PyCall;
 };
 
-template <>
-struct Config<CallType::PyModuleCall> {
-  using key_t = PyModuleSelf;
-  using cls_t = PyModuleCls;
+template <typename Key, typename Cls, typename Info>
+struct ExtendedPyCallConfig {
+  using key_t = Key;
+  using cls_t = Cls;
   using ephemeral_t = PyFrameObject*;
-  using info_t = std::pair<cls_t, std::vector<ParameterInfo>>;
-  struct cache_t {
-    c10::optional<CodeLocation> location_; // nn.Module.forward;
-    ska::flat_hash_map<key_t, info_t> modules_and_params_;
+
+  struct ClsAndParameters {
+    cls_t cls_;
+    std::vector<typename Info::ParameterInfo> parameters_;
+  };
+
+  struct Cache {
+    // `nn.Module.forward` or `optim.Optimizer._optimizer_step_code`
+    c10::optional<CodeLocation> location_;
+    ska::flat_hash_map<key_t, ClsAndParameters> cls_and_parameters_;
     ska::flat_hash_map<cls_t, at::StringView> cls_names_;
   };
+  using cache_t = Cache;
+
   static constexpr EventType event_type = EventType::PyCall;
 };
 
+template <>
+struct Config<CallType::PyModuleCall>
+    : ExtendedPyCallConfig<PyModuleSelf, PyModuleCls, NNModuleInfo> {};
+
+template <>
+struct Config<CallType::PyOptimizerCall>
+    : ExtendedPyCallConfig<PyOptimizerSelf, PyOptimizerCls, OptimizerInfo> {};
+
 template <>
 struct Config<CallType::PyCCall> {
   using key_t = PyMethod;
@@ -226,25 +243,6 @@ struct Config<CallType::PyCCall> {
   static constexpr EventType event_type = EventType::PyCCall;
 };
 
-template <>
-struct Config<CallType::PyOptimizerCall> {
-  using key_t = PyOptimizerSelf;
-  using cls_t = PyOptimizerCls;
-  using ephemeral_t = PyFrameObject*;
-  struct info_t {
-    cls_t cls_;
-    std::vector<TensorMetadata> params_;
-    std::vector<std::pair<std::string, TensorMetadata>> states_;
-  };
-  struct cache_t {
-    c10::optional<CodeLocation>
-        location_; // optim.Optimizer._optimizer_step_code;
-    ska::flat_hash_map<key_t, info_t> optimizer_data_;
-    ska::flat_hash_map<cls_t, at::StringView> cls_names_;
-  };
-  static constexpr EventType event_type = EventType::PyCall;
-};
-
 // ============================================================================
 // == Callsite & ValueCache: Storage during profiling =========================
 // ============================================================================
@@ -269,52 +267,6 @@ class Callsite {
   Config<CallType::PyCall>::key_t caller_;
 };
 
-void check_and_store(
-    const pybind11::handle& name,
-    const pybind11::handle& param_handle,
-    std::vector<ParameterInfo>& storeroom) {
-  auto param_ptr = param_handle.ptr();
-  if (py::isinstance<py::str>(name) && THPVariable_CheckExact(param_ptr)) {
-    const auto& param = THPVariable_Unpack(param_ptr);
-    auto grad_ptr = py::getattr(param_handle, "grad", py::none()).ptr();
-    c10::optional<TensorMetadata> grad_metadata;
-
-    if (THPVariable_CheckExact(grad_ptr)) {
-      grad_metadata = c10::optional<TensorMetadata>(
-          TensorMetadata(THPVariable_Unpack(grad_ptr)));
-    } else {
-      grad_metadata = c10::nullopt;
-    }
-
-    storeroom.push_back(
-        {name.cast<std::string>(), TensorMetadata(param), grad_metadata});
-  }
-}
-
-void check_and_store(
-    const pybind11::handle& name,
-    const pybind11::handle& param_handle,
-    std::vector<std::pair<std::basic_string<char>, TensorMetadata>>&
-        storeroom) {
-  auto param_ptr = param_handle.ptr();
-  if (py::isinstance<py::str>(name) && THPVariable_CheckExact(param_ptr)) {
-    const auto& param = THPVariable_Unpack(param_ptr);
-
-    storeroom.emplace_back(name.cast<std::string>(), param);
-  }
-}
-
-void check_and_store(
-    const pybind11::handle& param_handle,
-    std::vector<TensorMetadata>& storeroom) {
-  auto param_ptr = param_handle.ptr();
-  if (THPVariable_CheckExact(param_ptr)) {
-    const auto& param = THPVariable_Unpack(param_ptr);
-
-    storeroom.emplace_back(param);
-  }
-}
-
 // ============================================================================
 // == Type specific store and load implementations. ===========================
 // ============================================================================
@@ -374,6 +326,28 @@ typename Config<C>::cls_t set_class(
   return cls;
 }
 
+auto toTensorMetadata(PyObject* self) {
+  TORCH_INTERNAL_ASSERT(THPVariable_CheckExact(self));
+  return TensorMetadata{THPVariable_Unpack(self)};
+}
+
+auto recordIfTensor(py::handle p) {
+  return THPVariable_CheckExact(p.ptr())
+      ? c10::optional<TensorMetadata>{toTensorMetadata(p.ptr())}
+      : c10::nullopt;
+}
+
+auto unpackTensorMap(py::dict tensor_map) {
+  std::vector<std::pair<std::string, TensorMetadata>> out;
+  for (auto& it : tensor_map) {
+    auto* value = it.second.ptr();
+    if (py::isinstance<py::str>(it.first) && THPVariable_CheckExact(value)) {
+      out.push_back({py::cast<std::string>(it.first), toTensorMetadata(value)});
+    }
+  }
+  return out;
+}
+
 template <>
 void ValueCache::store<CallType::PyCall>(const PyCallKey& key, no_ephemeral_t) {
   auto& locations = std::get<CallType::PyCall>(state_);
@@ -397,16 +371,22 @@ void ValueCache::store<CallType::PyModuleCall>(
     Config<CallType::PyModuleCall>::ephemeral_t frame) {
   auto& cache = std::get<CallType::PyModuleCall>(state_);
   if (C10_UNLIKELY(
-          cache.modules_and_params_.find(key) ==
-          cache.modules_and_params_.end())) {
+          cache.cls_and_parameters_.find(key) ==
+          cache.cls_and_parameters_.end())) {
     auto cls = set_class<CallType::PyModuleCall>(this, cache, key, frame);
 
     py::dict params = py::handle((PyObject*)key).attr("_parameters");
-    std::vector<ParameterInfo> params_;
+    std::vector<NNModuleInfo::ParameterInfo> params_;
     for (auto& it : params) {
-      check_and_store(it.first, it.second, params_);
+      auto* p = it.second.ptr();
+      if (py::isinstance<py::str>(it.first) && THPVariable_CheckExact(p)) {
+        params_.push_back(
+            {it.first.cast<std::string>(),
+             toTensorMetadata(p),
+             recordIfTensor(py::getattr(it.second, "grad", py::none()))});
+      }
     }
-    cache.modules_and_params_[key] = make_pair(cls, params_);
+    cache.cls_and_parameters_[key] = {cls, params_};
   }
 }
 
@@ -415,45 +395,45 @@ ExtraFields<EventType::PyCall>::args_t ValueCache::load<CallType::PyModuleCall>(
     const PyModuleCallKey& key) const {
   auto& cache = std::get<CallType::PyModuleCall>(state_);
   TORCH_INTERNAL_ASSERT(cache.location_.has_value());
-  auto cls = cache.modules_and_params_.at(key).first;
-  auto fwd = std::get<CallType::PyCall>(state_).at(*cache.location_);
+  const auto& cls_and_parameters = cache.cls_and_parameters_.at(key);
+  const auto& cls = cls_and_parameters.cls_;
+  NNModuleInfo info{
+      key, cls, cache.cls_names_.at(cls), cls_and_parameters.parameters_};
   return {
-      fwd,
-      NNModuleInfo{
-          key,
-          cls,
-          cache.cls_names_.at(cls),
-          cache.modules_and_params_.at(key).second}};
+      /*frame_state_=*/std::get<CallType::PyCall>(state_).at(*cache.location_),
+      /*module_info_=*/std::move(info),
+      /*optimizer_info_=*/c10::nullopt};
 }
+
 template <>
 void ValueCache::store<CallType::PyOptimizerCall>(
     const PyOptimizerCallKey& key,
     Config<CallType::PyOptimizerCall>::ephemeral_t frame) {
   auto& cache = std::get<CallType::PyOptimizerCall>(state_);
   if (C10_UNLIKELY(
-          cache.optimizer_data_.find(key) == cache.optimizer_data_.end())) {
+          cache.cls_and_parameters_.find(key) ==
+          cache.cls_and_parameters_.end())) {
     auto cls = set_class<CallType::PyOptimizerCall>(this, cache, key, frame);
-    py::list param_groups_handle =
-        py::handle((PyObject*)key).attr("param_groups");
-    std::vector<TensorMetadata> params_;
-    // param_groups is a list of dict
-    for (auto& param_group : param_groups_handle) {
-      for (auto& param :
-           py::cast<py::dict>(param_group).attr("get")("params")) {
-        check_and_store(param, params_);
-      }
-    }
-    std::vector<std::pair<std::string, TensorMetadata>> states_;
-    py::dict state_handle = py::handle((PyObject*)key).attr("state");
-    for (auto& it : state_handle) {
-      TORCH_INTERNAL_ASSERT(
-          py::isinstance<py::dict>(it.second), "Expects a dict type element");
-      for (auto& state_elem : py::cast<py::dict>(it.second)) {
-        check_and_store(state_elem.first, state_elem.second, states_);
+    const py::handle self{(PyObject*)key};
+    std::vector<OptimizerInfo::ParameterInfo> params;
+
+    for (const auto& i : (py::list)self.attr("param_groups")) {
+      for (auto& param : py::cast<py::dict>(i).attr("get")("params")) {
+        if (THPVariable_CheckExact(param.ptr())) {
+          // While `self.state` is permitted to store data in an arbitrary way,
+          // all generic optimizers (SGD, Adam, etc) use param as the key since
+          // the state in question is tied to particular parameters. We can
+          // relax this assumption if the need arises.
+          params.push_back(
+              {toTensorMetadata(param.ptr()),
+               recordIfTensor(py::getattr(param, "grad", py::none())),
+               unpackTensorMap(py::cast<py::dict>(self.attr("state"))
+                                   .attr("get")(param, py::dict()))});
+        }
       }
     }
 
-    cache.optimizer_data_[key] = {cls, params_, states_};
+    cache.cls_and_parameters_[key] = {cls, params};
   }
 }
 
@@ -461,17 +441,14 @@ template <>
 ExtraFields<EventType::PyCall>::args_t ValueCache::load<
     CallType::PyOptimizerCall>(const PyOptimizerCallKey& key) const {
   auto& cache = std::get<CallType::PyOptimizerCall>(state_);
-  auto cls = cache.optimizer_data_.at(key).cls_;
-  auto frame_state = std::get<CallType::PyCall>(state_).at(*cache.location_);
+  const auto& cls_and_parameters = cache.cls_and_parameters_.at(key);
+  auto cls = cls_and_parameters.cls_;
+  OptimizerInfo info{
+      key, cls, cache.cls_names_.at(cls), cls_and_parameters.parameters_};
   return {
-      frame_state,
-      c10::nullopt,
-      OptimizerInfo{
-          key,
-          cls,
-          cache.cls_names_.at(cls),
-          cache.optimizer_data_.at(key).params_,
-          cache.optimizer_data_.at(key).states_}};
+      /*frame_state_=*/std::get<CallType::PyCall>(state_).at(*cache.location_),
+      /*module_info_=*/c10::nullopt,
+      /*optimizer_info_=*/std::move(info)};
 }
 
 template <>
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index 9c979df8ff61b..5402e613eb858 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -238,29 +238,34 @@ using PyMethod = strong_t</*PyMethodDef*/ void*, struct PyMethod_>;
 using PyOptimizerSelf = strong_t<PyObject*, struct PyOptSelf_>;
 using PyOptimizerCls = strong_t<PyObject*, struct PyOptimizer_>;
 
-struct ParameterInfo {
-  std::string param_name_;
-  TensorMetadata param_;
-  c10::optional<TensorMetadata> grad_;
-};
-
 struct NNModuleInfo {
+  struct ParameterInfo {
+    std::string name_;
+    TensorMetadata metadata_;
+    c10::optional<TensorMetadata> grad_metadata_;
+  };
+
   PyModuleSelf self_;
   PyModuleCls cls_;
   at::StringView cls_name_;
 
-  std::vector<ParameterInfo> params_;
+  std::vector<ParameterInfo> parameters_;
   // Indicates that `self_` is the kth instance of `cls_` observed.
   size_t id_{std::numeric_limits<size_t>::max()};
 };
 
 struct OptimizerInfo {
+  struct ParameterInfo {
+    TensorMetadata metadata_;
+    c10::optional<TensorMetadata> grad_metadata_;
+    std::vector<std::pair<std::string, TensorMetadata>> state_;
+  };
+
   PyOptimizerSelf self_;
-  PyOptimizerCls opt_;
-  at::StringView opt_name_;
+  PyOptimizerCls cls_;
+  at::StringView cls_name_;
 
-  std::vector<TensorMetadata> params_addr_;
-  std::vector<std::pair<std::string, TensorMetadata>> opt_state_;
+  std::vector<ParameterInfo> parameters_;
 };
 
 struct PyExtraFieldsBase {
diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index be25e5fc1b95c..63d893d4ef8d0 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -181,49 +181,33 @@ void initPythonBindings(PyObject* module) {
 
   py::class_<NNModuleInfo>(m, "_NNModuleInfo")
       .def_property_readonly(
-          "params",
+          "parameters",
           [](const NNModuleInfo& s) {
-            py::list list;
-            for (auto& p : s.params_) {
-              list.append(std::tuple<
-                          std::string,
-                          TensorMetadata,
-                          c10::optional<TensorMetadata>>(
-                  p.param_name_, p.param_, p.grad_));
+            py::list out;
+            for (const auto& p : s.parameters_) {
+              out.append(
+                  py::make_tuple(p.name_, p.metadata_, p.grad_metadata_));
             }
-            return list;
+            return out;
           })
       .def_property_readonly(
           "cls_name", [](const NNModuleInfo& s) { return s.cls_name_.str(); });
 
-  py::class_<OptimizerInfo>(m, "_OptInfo")
-      .def_property_readonly(
-          "self",
-          [](const OptimizerInfo& a) {
-            return reinterpret_cast<intptr_t>(a.self_.value_of());
-          })
-      .def_property_readonly(
-          "param_addrs",
-          [](const OptimizerInfo& s) {
-            py::list params_addrs;
-            for (auto& addr : s.params_addr_) {
-              params_addrs.append(addr);
-            }
-            return params_addrs;
-          })
-      .def_property_readonly("opt_state", [](const OptimizerInfo& s) {
-        py::list states;
-        for (auto& a : s.opt_state_) {
-          states.append(std::make_pair(a.first, a.second));
+  py::class_<OptimizerInfo>(m, "_OptimizerInfo")
+      .def_readonly("self_ptr", &OptimizerInfo::self_)
+      .def_property_readonly("parameters", [](const OptimizerInfo& s) {
+        py::list out;
+        for (const auto& p : s.parameters_) {
+          out.append(py::make_tuple(p.metadata_, p.grad_metadata_, p.state_));
         }
-        return states;
+        return out;
       });
 
   py::class_<ExtraFields<EventType::PyCall>>(m, "_ExtraFields_PyCall")
-      .def_readonly("optimizer", &ExtraFields<EventType::PyCall>::optimizer_)
       .def_readonly("callsite", &ExtraFields<EventType::PyCall>::callsite_)
       .def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_)
-      .def_readonly("module", &ExtraFields<EventType::PyCall>::module_);
+      .def_readonly("module", &ExtraFields<EventType::PyCall>::module_)
+      .def_readonly("optimizer", &ExtraFields<EventType::PyCall>::optimizer_);
 
   py::class_<ExtraFields<EventType::PyCCall>>(m, "_ExtraFields_PyCCall")
       .def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_);
diff --git a/torch/csrc/profiler/python/init.h b/torch/csrc/profiler/python/init.h
index d04b398d4554b..fac39fc62574a 100644
--- a/torch/csrc/profiler/python/init.h
+++ b/torch/csrc/profiler/python/init.h
@@ -7,6 +7,7 @@
 
 namespace pybind11 {
 namespace detail {
+using torch::profiler::impl::PyOptimizerSelf;
 using torch::profiler::impl::StorageImplData;
 using torch::profiler::impl::TensorID;
 using torch::profiler::impl::TensorImplAddress;
@@ -19,6 +20,10 @@ template <>
 struct type_caster<TensorImplAddress>
     : public strong_pointer_type_caster<TensorImplAddress> {};
 
+template <>
+struct type_caster<PyOptimizerSelf>
+    : public strong_pointer_type_caster<PyOptimizerSelf> {};
+
 template <>
 struct type_caster<TensorID> : public strong_uint_type_caster<TensorID> {};
 } // namespace detail

From eb6185041ab6de7d201fc1941e8460bdccaf9a2b Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 22 Oct 2022 17:37:55 -0700
Subject: [PATCH 0052/1922] [Profiler] Tensor IDs for Module and Optimizer
 variables (#86754)

More sophisticated profiling will increasingly rely on python tracer to contextualize observed results. This PR adds Tensors which are observed by the python tracer to the identity assignment loop.

Differential Revision: [D39852885](https://our.internmc.facebook.com/intern/diff/D39852885/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86754
Approved by: https://github.com/slgong-fb, https://github.com/aaronenyeshi
---
 test/profiler/test_profiler.py          | 80 +++++++++++++++++++++++--
 torch/csrc/autograd/profiler_python.cpp | 16 +++--
 torch/csrc/profiler/collection.cpp      | 38 ++++++++++--
 3 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py
index 09379bb02a531..3831b6bd1247d 100644
--- a/test/profiler/test_profiler.py
+++ b/test/profiler/test_profiler.py
@@ -1278,12 +1278,14 @@ def test_nested_tensor_with_shapes(self):
 
 
 def find_node_with_name(nodes, name):
-    for node in nodes:
+    for node in _utils.traverse_dfs(nodes):
         if node.name == name:
             return node
-        result = find_node_with_name(node.children, name)
-        if result is not None:
-            return result
+
+def find_node_with_regex(nodes, pattern):
+    for node in _utils.traverse_dfs(nodes):
+        if re.search(pattern, node.name):
+            return node
 
 
 class SimpleNet(nn.Module):
@@ -1368,6 +1370,73 @@ def get_fields(op_name, index):
         self.assertEqual(c_id, c_id_new)
         self.assertEqual(d_id, c_id_new)
 
+    def test_module_and_optimizer_ids(self) -> None:
+        model = torch.nn.Linear(2, 1, bias=True)
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
+
+        def check(cold_start: bool) -> None:
+            with profile(with_stack=True, profile_memory=True, record_shapes=True) as p:
+                x = torch.ones((1, 2))
+                _ = x.sin()  # Mark `x`
+                model(x).backward()
+                optimizer.step()
+                _ = optimizer.state[model.weight]["momentum_buffer"].cos()  # Mark weight momentum
+                _ = model.weight.grad.tan()  # Mark weight gradient
+
+            nodes = p.profiler.kineto_results.experimental_event_tree()
+
+            def get_fields(op_name, index):
+                return self._get_tensor_fields(
+                    find_node_with_name(nodes, op_name),
+                    index)
+
+            # Marked Tensors act as ground truth for python tracer IDs.
+            _, _, x_id = get_fields("aten::sin", 0)
+            _, _, weight_momenumtum_id = get_fields("aten::cos", 0)
+            _, _, weight_grad_id = get_fields("aten::tan", 0)
+            self.assertNotEqual(x_id, weight_momenumtum_id)
+            self.assertNotEqual(x_id, weight_grad_id)
+            self.assertNotEqual(weight_momenumtum_id, weight_grad_id)
+
+            # Use linear op to identify weight ground truth.
+            linear_op_node = find_node_with_name(nodes, "aten::linear")
+            self.assertIsNotNone(linear_op_node)
+            x_metadata, weight_metadata, _ = linear_op_node.extra_fields.inputs.tensor_metadata
+            self.assertEqual(x_id, x_metadata.id)
+
+            # Module
+            linear_module_node = find_node_with_name(nodes, "nn.Module: Linear_0")
+            self.assertIsNotNone(linear_module_node)
+            self.assertIsNotNone(linear_module_node.extra_fields.module)
+            self.assertIsNone(linear_module_node.extra_fields.optimizer)
+
+            linear_parameters = linear_module_node.extra_fields.module.parameters
+            name, weight, weight_grad = linear_parameters[0]
+            self.assertEqual(name, "weight")
+            self.assertEqual(weight.id, weight_metadata.id)
+
+            self.assertEqual(weight_grad is None, cold_start)
+            if not cold_start:
+                self.assertEqual(weight_grad.id, weight_grad_id)
+
+            # Optimizer
+            step_node = find_node_with_regex(nodes, "_optimizer_step_code")
+            self.assertIsNotNone(step_node)
+            self.assertIsNone(step_node.extra_fields.module)
+            self.assertIsNotNone(step_node.extra_fields.optimizer)
+            optimizer_parameters = step_node.extra_fields.optimizer.parameters
+            self.assertEqual(len(optimizer_parameters), 2)  # Weight and bias
+            weight, weight_grad, state = optimizer_parameters[0]
+            self.assertEqual(weight.id, weight_metadata.id)
+            self.assertEqual(weight_grad.id, weight_grad_id)
+            self.assertEqual(len(state), 1)
+            self.assertEqual(state[0][0], "momentum_buffer")
+            self.assertEqual(state[0][1].id, weight_momenumtum_id)
+
+        # Check that we handle first step (lazy initalization) and steady state.
+        check(cold_start=True)
+        check(cold_start=False)
+
     def _test_allocation_ids(self, before_fn, after_fn) -> None:
         with profile(profile_memory=True, record_shapes=True) as p:
             # Introduce other operations and allocations to check robustness
@@ -1622,6 +1691,9 @@ def _check_results(self, opt, opts, check_items=False):
                     p.storage_data_ptr: {name: s.storage_data_ptr for name, s in state}
                     for (p, _, state) in opt_.parameters
                 }
+
+                # Make sure the profiler collected all optimizer state and check
+                # that the address recorded by the profiler is correct.
                 for parameter, parameter_state in opt.state.items():
                     self.assertEqual(
                         {name: value.storage().data_ptr() for name, value in parameter_state.items()},
diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp
index d971336ae0c4d..308dcdcde49c8 100644
--- a/torch/csrc/autograd/profiler_python.cpp
+++ b/torch/csrc/autograd/profiler_python.cpp
@@ -205,7 +205,7 @@ struct Config<CallType::PyCall> {
   static constexpr EventType event_type = EventType::PyCall;
 };
 
-template <typename Key, typename Cls, typename Info>
+template <typename Key, typename Cls, typename ParameterInfo>
 struct ExtendedPyCallConfig {
   using key_t = Key;
   using cls_t = Cls;
@@ -213,7 +213,7 @@ struct ExtendedPyCallConfig {
 
   struct ClsAndParameters {
     cls_t cls_;
-    std::vector<typename Info::ParameterInfo> parameters_;
+    std::vector<ParameterInfo> parameters_;
   };
 
   struct Cache {
@@ -228,12 +228,16 @@ struct ExtendedPyCallConfig {
 };
 
 template <>
-struct Config<CallType::PyModuleCall>
-    : ExtendedPyCallConfig<PyModuleSelf, PyModuleCls, NNModuleInfo> {};
+struct Config<CallType::PyModuleCall> : ExtendedPyCallConfig<
+                                            PyModuleSelf,
+                                            PyModuleCls,
+                                            NNModuleInfo::ParameterInfo> {};
 
 template <>
-struct Config<CallType::PyOptimizerCall>
-    : ExtendedPyCallConfig<PyOptimizerSelf, PyOptimizerCls, OptimizerInfo> {};
+struct Config<CallType::PyOptimizerCall> : ExtendedPyCallConfig<
+                                               PyOptimizerSelf,
+                                               PyOptimizerCls,
+                                               OptimizerInfo::ParameterInfo> {};
 
 template <>
 struct Config<CallType::PyCCall> {
diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index 51fd2f9fd3df0..305cef5ffdf5d 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -850,14 +850,20 @@ void calculate_unique_tensor_ids(std::vector<result_ptr_t>& sorted_results) {
     };
 
     ska::flat_hash_set<storage_id_t> tensor_set;
+    auto insert_tensor = [&lookup, &tensors, &tensor_set](TensorMetadata& m) {
+      if (m.impl_ && m.data_) {
+        const auto id = lookup(m.data_);
+        tensor_set.insert(id);
+        tensors.emplace_back(TensorStoragePair{m.impl_, id, m.id_});
+      }
+    };
+
     for (auto& result : sorted_results) {
       result->visit(c10::overloaded(
           [&](ExtraFields<EventType::TorchOp>& torch_op) {
             for (auto& m : torch_op.inputs_.tensor_metadata_) {
-              if (m.has_value() && m->impl_ && m->data_) {
-                auto id = lookup(m->data_);
-                tensor_set.insert(id);
-                tensors.emplace_back(TensorStoragePair{m->impl_, id, m->id_});
+              if (m.has_value()) {
+                insert_tensor(*m);
               }
             }
           },
@@ -874,6 +880,30 @@ void calculate_unique_tensor_ids(std::vector<result_ptr_t>& sorted_results) {
               live_storage.erase(StorageImplData(alloc_op.ptr_));
             }
           },
+          [&](ExtraFields<EventType::PyCall>& py_call) {
+            // torch.nn.Module
+            if (py_call.module_.has_value()) {
+              for (auto& p : py_call.module_->parameters_) {
+                insert_tensor(p.metadata_);
+                if (p.grad_metadata_.has_value()) {
+                  insert_tensor(*p.grad_metadata_);
+                }
+              }
+            }
+
+            // torch.optim.Optimizer
+            if (py_call.optimizer_.has_value()) {
+              for (auto& p : py_call.optimizer_->parameters_) {
+                insert_tensor(p.metadata_);
+                if (p.grad_metadata_.has_value()) {
+                  insert_tensor(*p.grad_metadata_);
+                }
+                for (auto& state_i : p.state_) {
+                  insert_tensor(state_i.second);
+                }
+              }
+            }
+          },
           [](const auto&) {}));
     }
 

From 310683fb3fc428170b9ebfbb8f303104f11bce55 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 22 Oct 2022 17:37:57 -0700
Subject: [PATCH 0053/1922] [Profiler][Trivial] Add Module cls and self
 bindings and type_caster macro (#86755)

Just a bit of clean up. We will need `self` and `cls` for memory profiling, and the type_caster specializations were getting quite verbose.

Differential Revision: [D39920728](https://our.internmc.facebook.com/intern/diff/D39920728/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86755
Approved by: https://github.com/slgong-fb, https://github.com/aaronenyeshi
---
 torch/_C/_profiler.pyi              |  4 ++++
 torch/csrc/profiler/python/init.cpp |  4 +++-
 torch/csrc/profiler/python/init.h   | 22 +++++++++-------------
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/torch/_C/_profiler.pyi b/torch/_C/_profiler.pyi
index 6d6c2893f4554..da6cfb165fb36 100644
--- a/torch/_C/_profiler.pyi
+++ b/torch/_C/_profiler.pyi
@@ -148,6 +148,10 @@ class _NNModuleInfo:
     @property
     def params(self) -> List[Tuple[str, int]]: ...
     @property
+    def self_ptr(self) -> int: ...
+    @property
+    def cls_ptr(self) -> int: ...
+    @property
     def cls_name(self) -> str: ...
 
 class _ExtraFields_PyCCall:
diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index 63d893d4ef8d0..8a800a3d5f82b 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -191,7 +191,9 @@ void initPythonBindings(PyObject* module) {
             return out;
           })
       .def_property_readonly(
-          "cls_name", [](const NNModuleInfo& s) { return s.cls_name_.str(); });
+          "cls_name", [](const NNModuleInfo& s) { return s.cls_name_.str(); })
+      .def_readonly("self_ptr", &NNModuleInfo::self_)
+      .def_readonly("cls_ptr", &NNModuleInfo::cls_);
 
   py::class_<OptimizerInfo>(m, "_OptimizerInfo")
       .def_readonly("self_ptr", &OptimizerInfo::self_)
diff --git a/torch/csrc/profiler/python/init.h b/torch/csrc/profiler/python/init.h
index fac39fc62574a..226bf1a3f3bb3 100644
--- a/torch/csrc/profiler/python/init.h
+++ b/torch/csrc/profiler/python/init.h
@@ -7,22 +7,18 @@
 
 namespace pybind11 {
 namespace detail {
-using torch::profiler::impl::PyOptimizerSelf;
-using torch::profiler::impl::StorageImplData;
 using torch::profiler::impl::TensorID;
-using torch::profiler::impl::TensorImplAddress;
 
-template <>
-struct type_caster<StorageImplData>
-    : public strong_pointer_type_caster<StorageImplData> {};
-
-template <>
-struct type_caster<TensorImplAddress>
-    : public strong_pointer_type_caster<TensorImplAddress> {};
+#define STRONG_POINTER_TYPE_CASTER(T) \
+  template <>                         \
+  struct type_caster<T> : public strong_pointer_type_caster<T> {};
 
-template <>
-struct type_caster<PyOptimizerSelf>
-    : public strong_pointer_type_caster<PyOptimizerSelf> {};
+STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::StorageImplData);
+STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::TensorImplAddress);
+STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::PyModuleSelf);
+STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::PyModuleCls);
+STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::PyOptimizerSelf);
+#undef STRONG_POINTER_TYPE_CASTER
 
 template <>
 struct type_caster<TensorID> : public strong_uint_type_caster<TensorID> {};

From dd23736655c5d83c7f18db6e78b5f8fe141dcdd9 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 22 Oct 2022 17:37:58 -0700
Subject: [PATCH 0054/1922] [Profiler] Regularize `AccumulateGrad` name
 (#86909)

Memory profiler will use AccumulateGrad when detecting gradients. The name difference between Windows and other platforms has already cropped up with profiler trees so it makes sense to address it at the source.

Differential Revision: [D40347550](https://our.internmc.facebook.com/intern/diff/D40347550/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86909
Approved by: https://github.com/slgong-fb, https://github.com/aaronenyeshi
---
 test/profiler/test_profiler_tree.py |  5 -----
 torch/csrc/profiler/collection.cpp  | 13 +++++++++++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/test/profiler/test_profiler_tree.py b/test/profiler/test_profiler_tree.py
index f0097985f2940..21c3826c4a9cd 100644
--- a/test/profiler/test_profiler_tree.py
+++ b/test/profiler/test_profiler_tree.py
@@ -138,11 +138,6 @@ def flatten(nodes, depth=0, out=None):
 
     @staticmethod
     def fmt_name(name: str) -> str:
-        # torch::autograd::Node relies on c10::demangle to generate names, and
-        # Windows demangles to include `struct` in the name.
-        if IS_WINDOWS:
-            name = name.replace('struct torch::autograd::AccumulateGrad', 'torch::autograd::AccumulateGrad')
-
         match = re.match(r"^(.*)\.py\(([0-9]+)\): (.*)$", name)
         if match:
             filename, _, fn = match.groups()
diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index 305cef5ffdf5d..e76cfd5946db9 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -296,6 +296,19 @@ void ThreadLocalSubqueue::TorchOpStorage::materialize(
     }
   }
 
+  // `AccumulateGrad` is an important marker for profile analysis; however the
+  // annotation relies on `c10::demangle` which is platform dependent. In
+  // particular, Windows will add a "struct " prefix.
+  const std::string accumulate_grad = "torch::autograd::AccumulateGrad";
+  const std::string windows_pattern = std::string("struct ") + accumulate_grad;
+  for (auto& event : op_events_) {
+    auto& name = event.basic_fields_.name_;
+    auto position = name.find(windows_pattern);
+    if (position != std::string::npos) {
+      name.replace(position, windows_pattern.size(), accumulate_grad);
+    }
+  }
+
   auto input_getter = inputs_outputs_.getNextShapesAndDtypes();
 
   // TODO: CTAD will take care of template args when we move to C++17

From f93a861cc445175160ab96415586dd95e0d0eb73 Mon Sep 17 00:00:00 2001
From: efiks <5167930+efiks@users.noreply.github.com>
Date: Sun, 23 Oct 2022 19:29:25 +0000
Subject: [PATCH 0055/1922] [torch] Unify batch_box_cox implementations into
 perfkernels folder (#86569)

Summary:
1) Adding MKL/AVX2 based implementation into perfkernels. This implementation is similar to caffe2/operators/batch_box_cox_op.cc
2) Migrating batch_box_cox_op of caffe2 use this implementation

Test Plan: CI

Differential Revision: D40208074

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86569
Approved by: https://github.com/hyuen
---
 caffe2/operators/batch_box_cox_op.cc     | 300 ++---------------------
 caffe2/operators/batch_box_cox_op.h      |  60 +----
 caffe2/perfkernels/batch_box_cox.cc      | 113 +++++++++
 caffe2/perfkernels/batch_box_cox.h       |  35 +++
 caffe2/perfkernels/batch_box_cox_avx2.cc | 299 ++++++++++++++++++++++
 caffe2/perfkernels/common.h              |   3 +
 6 files changed, 478 insertions(+), 332 deletions(-)
 create mode 100644 caffe2/perfkernels/batch_box_cox.cc
 create mode 100644 caffe2/perfkernels/batch_box_cox.h
 create mode 100644 caffe2/perfkernels/batch_box_cox_avx2.cc

diff --git a/caffe2/operators/batch_box_cox_op.cc b/caffe2/operators/batch_box_cox_op.cc
index aa444330969b5..6e2bb4d9a8d9d 100644
--- a/caffe2/operators/batch_box_cox_op.cc
+++ b/caffe2/operators/batch_box_cox_op.cc
@@ -2,72 +2,34 @@
 
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
-
-#ifdef CAFFE2_USE_MKL
-#include <mkl.h>
-#endif // CAFFE2_USE_MKL
+#include "caffe2/perfkernels/batch_box_cox.h"
 
 namespace caffe2 {
 
-#ifdef CAFFE2_USE_MKL
 namespace {
-
-// Helpers for copying parameters.
 template <typename T>
-void TileArrayIntoVector(const T* a, int D, int K, vector<T>* b) {
-  b->resize(K * D);
-  for (int k = 0; k < K; k++) {
-    std::copy(a, a + D, b->begin() + k * D);
-  }
-}
-
-void TileIndicesInPlace(vector<int>* v, int D, int K) {
-  int n = v->size();
-  v->resize(K * n);
-  for (int k = 1; k < K; k++) {
-    for (int j = 0; j < n; j++) {
-      (*v)[k * n + j] = (*v)[j] + k * D;
+void BoxCoxNaive(
+    int64_t N,
+    int64_t D,
+    const T* data_ptr,
+    const T* lambda1_ptr,
+    const T* lambda2_ptr,
+    T* output_ptr) {
+  constexpr T k_eps = static_cast<T>(1e-6);
+  for (int64_t i = 0; i < N; i++) {
+    for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
+      T lambda1_v = lambda1_ptr[j];
+      T lambda2_v = lambda2_ptr[j];
+      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
+      if (lambda1_v == 0) {
+        *output_ptr = std::log(tmp);
+      } else {
+        *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v;
+      }
     }
   }
 }
-
-// MKL VML function templates.
-template <typename T>
-void PackV(const int N, const T* a, const int* ia, T* y);
-template <typename T>
-void UnpackV(const int N, const T* a, T* y, const int* iy);
-template <typename T>
-void Pow(const int N, const T* a, const T* b, T* y);
-
-#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc)                \
-  template <>                                                   \
-  void PackV<T>(const int N, const T* a, const int* ia, T* y) { \
-    OriginalFunc(N, a, ia, y);                                  \
-  }
-DELEGATE_PACKV_FUNCTION(float, vsPackV)
-DELEGATE_PACKV_FUNCTION(double, vdPackV)
-#undef DELEGATE_PACKV_FUNCTION
-
-#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc)                \
-  template <>                                                     \
-  void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \
-    OriginalFunc(N, a, y, iy);                                    \
-  }
-DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV)
-DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV)
-#undef DELEGATE_UNPACKV_FUNCTION
-
-#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \
-  template <>                                                      \
-  void Funcname<T>(const int N, const T* a, const T* b, T* y) {    \
-    OriginalFunc(N, a, b, y);                                      \
-  }
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Pow, vsPow)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Pow, vdPow)
-#undef DELEGATE_SIMPLE_BINARY_FUNCTION
-
-} // namespace
-#endif // CAFFE2_USE_MKL
+}
 
 template <>
 template <typename T>
@@ -93,227 +55,19 @@ bool BatchBoxCoxOp<CPUContext>::DoRunWithType() {
   const auto* lambda1_ptr = lambda1.template data<T>();
   const auto* lambda2_ptr = lambda2.template data<T>();
 
-  const T k_eps = static_cast<T>(1e-6);
-
 #ifdef CAFFE2_USE_MKL
   if (min_block_size_ < 1) {
-    BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
-  } else {
-    // Find zero-valued columns, since they get special treatment.
-    nonzeros_.clear();
-    zeros_.clear();
-    nonzeros_.reserve(D);
-    zeros_.reserve(D);
-    for (int64_t j = 0; j < D; j++) {
-      if (lambda1_ptr[j] == 0) {
-        zeros_.push_back(j);
-      } else {
-        nonzeros_.push_back(j);
-      }
-    }
-
-    // Process K rows at a time for effective vectorization with small rows.
-    const int K = std::min(N, (min_block_size_ + D - 1) / D);
-
-    // Avoid copying data if all lambda1 values are zero, or if all are nonzero.
-    // In each of the three cases here, when K > 1, first process batches of K
-    // rows by replicating the input parameters K times. Then finish row-by-row.
-    TypedCachedBuffers<T>& b = GetBuffers<T>();
-    if (nonzeros_.size() == D) {
-      int64_t i = 0;
-      if (K > 1) {
-        TileArrayIntoVector(lambda1_ptr, D, K, &b.lambda1_);
-        TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_);
-        TORCH_DCHECK_EQ(K * D, b.lambda1_.size());
-        TORCH_DCHECK_EQ(K * D, b.lambda2_.size());
-        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
-          BoxCoxNonzeroLambda(
-              K * D,
-              data_ptr,
-              b.lambda1_.data(),
-              b.lambda2_.data(),
-              k_eps,
-              output_ptr);
-        }
-      }
-      for (; i < N; i++, data_ptr += D, output_ptr += D) {
-        BoxCoxNonzeroLambda(
-            D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
-      }
-    } else if (zeros_.size() == D) {
-      int64_t i = 0;
-      if (K > 1) {
-        TileArrayIntoVector(lambda2_ptr, D, K, &b.lambda2_z_);
-        TORCH_DCHECK_EQ(K * D, b.lambda2_z_.size());
-        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
-          BoxCoxZeroLambda(
-              K * D, data_ptr, b.lambda2_z_.data(), k_eps, output_ptr);
-        }
-      }
-      for (; i < N; i++, data_ptr += D, output_ptr += D) {
-        BoxCoxZeroLambda(D, data_ptr, lambda2_ptr, k_eps, output_ptr);
-      }
-    } else { // General case of mixed zero and non-zero lambda1 values.
-      int n = nonzeros_.size();
-      if (K > 1) {
-        TileIndicesInPlace(&nonzeros_, 0, K);
-        TileIndicesInPlace(&zeros_, 0, K);
-      }
-
-      // Gather parameter values into contiguous memory.
-      b.lambda1_.resize(nonzeros_.size());
-      b.lambda2_.resize(nonzeros_.size());
-      b.lambda2_z_.resize(zeros_.size());
-      PackV(nonzeros_.size(), lambda1_ptr, nonzeros_.data(), b.lambda1_.data());
-      PackV(nonzeros_.size(), lambda2_ptr, nonzeros_.data(), b.lambda2_.data());
-      PackV(zeros_.size(), lambda2_ptr, zeros_.data(), b.lambda2_z_.data());
-
-      int64_t i = 0;
-      b.accumulator_.resize(std::max(nonzeros_.size(), zeros_.size()));
-      if (K > 1) {
-        // Truncate to original size, and re-tile with offsets this time.
-        nonzeros_.resize(n);
-        zeros_.resize(D - n);
-        TileIndicesInPlace(&nonzeros_, D, K);
-        TileIndicesInPlace(&zeros_, D, K);
-        TORCH_DCHECK_EQ(nonzeros_.size(), b.lambda1_.size());
-        TORCH_DCHECK_EQ(nonzeros_.size(), b.lambda2_.size());
-        TORCH_DCHECK_EQ(zeros_.size(), b.lambda2_z_.size());
-        for (; i < N - K + 1; i += K, data_ptr += K * D, output_ptr += K * D) {
-          BoxCoxMixedLambda(
-              data_ptr,
-              nonzeros_,
-              zeros_,
-              b.lambda1_.data(),
-              b.lambda2_.data(),
-              b.lambda2_z_.data(),
-              k_eps,
-              b.accumulator_.data(),
-              output_ptr);
-        }
-        // Truncate to original size.
-        nonzeros_.resize(n);
-        zeros_.resize(D - n);
-      }
-      for (; i < N; i++, data_ptr += D, output_ptr += D) {
-        BoxCoxMixedLambda(
-            data_ptr,
-            nonzeros_,
-            zeros_,
-            b.lambda1_.data(),
-            b.lambda2_.data(),
-            b.lambda2_z_.data(),
-            k_eps,
-            b.accumulator_.data(),
-            output_ptr);
-      }
-    }
+    BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
+    return true;
   }
-#else // CAFFE2_USE_MKL
-  BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, k_eps, output_ptr);
-#endif // CAFFE2_USE_MKL
+  caffe2::compute_batch_box_cox(
+    N, D, min_block_size_, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
+#else
+  BoxCoxNaive(N, D, data_ptr, lambda1_ptr, lambda2_ptr, output_ptr);
+#endif
   return true;
 }
 
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxNaive(
-    int64_t N,
-    int64_t D,
-    const T* data_ptr,
-    const T* lambda1_ptr,
-    const T* lambda2_ptr,
-    T k_eps,
-    T* output_ptr) {
-  for (int64_t i = 0; i < N; i++) {
-    for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
-      T lambda1_v = lambda1_ptr[j];
-      T lambda2_v = lambda2_ptr[j];
-      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
-      if (lambda1_v == 0) {
-        *output_ptr = std::log(tmp);
-      } else {
-        *output_ptr = (std::pow(tmp, lambda1_v) - 1) / lambda1_v;
-      }
-    }
-  }
-}
-
-#ifdef CAFFE2_USE_MKL
-
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxNonzeroLambda(
-    int64_t D,
-    const T* data_ptr,
-    const T* lambda1,
-    const T* lambda2,
-    T k_eps,
-    T* out) {
-  caffe2::math::Add(D, data_ptr, lambda2, out, &context_);
-  for (int64_t j = 0; j < D; j++) {
-    out[j] = std::max(out[j], k_eps);
-  }
-  Pow(D, out, lambda1, out);
-  for (int64_t j = 0; j < D; j++) {
-    out[j] -= 1.0;
-  }
-  caffe2::math::Div(D, out, lambda1, out, &context_);
-}
-
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxZeroLambda(
-    int64_t D,
-    const T* data_ptr,
-    const T* lambda2,
-    T k_eps,
-    T* output_ptr) {
-  caffe2::math::Add(D, data_ptr, lambda2, output_ptr, &context_);
-  for (int64_t j = 0; j < D; j++) {
-    output_ptr[j] = std::max(output_ptr[j], k_eps);
-  }
-  caffe2::math::Log(D, output_ptr, output_ptr, &context_);
-}
-
-template <>
-template <typename T>
-void BatchBoxCoxOp<CPUContext>::BoxCoxMixedLambda(
-    const T* data_ptr,
-    const vector<int>& nonzeros,
-    const vector<int>& zeros,
-    const T* lambda1,
-    const T* lambda2,
-    const T* lambda2_z,
-    T k_eps,
-    T* buffer,
-    T* output_ptr) {
-  PackV(nonzeros.size(), data_ptr, nonzeros.data(), buffer);
-  BoxCoxNonzeroLambda(nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer);
-  UnpackV(nonzeros.size(), buffer, output_ptr, nonzeros.data());
-
-  PackV(zeros.size(), data_ptr, zeros.data(), buffer);
-  BoxCoxZeroLambda(zeros.size(), buffer, lambda2_z, k_eps, buffer);
-  UnpackV(zeros.size(), buffer, output_ptr, zeros.data());
-}
-
-// Helpers to access cached buffers.
-#define DEFINE_CACHED_BUFFERS(T, tag)                                         \
-  template <>                                                                 \
-  template <>                                                                 \
-  BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>&                           \
-  BatchBoxCoxOp<CPUContext>::GetBuffers<T>() {                                \
-    if (!buffers_ || buffers_->type_ != tag) {                                \
-      buffers_.reset(new BatchBoxCoxOp<CPUContext>::TypedCachedBuffers<T>()); \
-      buffers_->type_ = tag;                                                  \
-    }                                                                         \
-    return *static_cast<TypedCachedBuffers<T>*>(buffers_.get());              \
-  }
-DEFINE_CACHED_BUFFERS(float, 1);
-DEFINE_CACHED_BUFFERS(double, 2);
-#undef DEFINE_CACHED_BUFFERS
-
-#endif // CAFFE2_USE_MKL
 
 namespace {
 
diff --git a/caffe2/operators/batch_box_cox_op.h b/caffe2/operators/batch_box_cox_op.h
index baa9c955b6cac..a177131e9adee 100644
--- a/caffe2/operators/batch_box_cox_op.h
+++ b/caffe2/operators/batch_box_cox_op.h
@@ -29,65 +29,7 @@ class BatchBoxCoxOp final : public Operator<Context> {
   bool DoRunWithType();
 
  protected:
-  template <typename T>
-  void BoxCoxNaive(
-      int64_t N,
-      int64_t D,
-      const T* data_ptr,
-      const T* lambda1_ptr,
-      const T* lambda2_ptr,
-      T k_eps,
-      T* output_ptr);
-
-#ifdef CAFFE2_USE_MKL
-  template <typename T>
-  void BoxCoxNonzeroLambda(
-      int64_t D,
-      const T* data_ptr,
-      const T* lambda1,
-      const T* lambda2,
-      T k_eps,
-      T* output_ptr);
-
-  template <typename T>
-  void BoxCoxZeroLambda(
-      int64_t D,
-      const T* data_ptr,
-      const T* lambda2,
-      T k_eps,
-      T* output_ptr);
-
-  template <typename T>
-  void BoxCoxMixedLambda(
-      const T* data_ptr,
-      const vector<int>& nonzeros,
-      const vector<int>& zeros,
-      const T* lambda1,
-      const T* lambda2,
-      const T* lambda2_z,
-      T k_eps,
-      T* buffer,
-      T* output_ptr);
-
-  vector<int> nonzeros_, zeros_;
-
-  // Buffers used by the MKL version are cached across calls.
-  struct CachedBuffers {
-    virtual ~CachedBuffers() {}
-    int type_;
-  };
-  template <typename T>
-  struct TypedCachedBuffers : public CachedBuffers {
-    vector<T> lambda1_, lambda2_, lambda2_z_;
-    vector<T> accumulator_;
-  };
-  template <typename T>
-  TypedCachedBuffers<T>& GetBuffers();
-  unique_ptr<CachedBuffers> buffers_;
-
-#endif // CAFFE2_USE_MKL
-
-  int min_block_size_;
+  std::size_t min_block_size_;
 
   INPUT_TAGS(DATA, LAMBDA1, LAMBDA2);
 };
diff --git a/caffe2/perfkernels/batch_box_cox.cc b/caffe2/perfkernels/batch_box_cox.cc
new file mode 100644
index 0000000000000..3e840d8fa04d3
--- /dev/null
+++ b/caffe2/perfkernels/batch_box_cox.cc
@@ -0,0 +1,113 @@
+#include "caffe2/perfkernels/common.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cmath>
+
+namespace caffe2 {
+
+namespace {
+template <typename T>
+void BoxCoxNaive(
+    std::size_t N,
+    std::size_t D,
+    const T* data_ptr,
+    const T* __restrict lambda1_ptr,
+    const T* __restrict lambda2_ptr,
+    T* output_ptr) {
+  constexpr T k_eps = static_cast<T>(1e-6);
+
+  for (int64_t i = 0; i < N; i++) {
+    for (int64_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
+      T lambda1_v = lambda1_ptr[j];
+      T lambda2_v = lambda2_ptr[j];
+      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
+      if (lambda1_v == 0) {
+        *output_ptr = std::log(tmp);
+      } else {
+        T lambda_1 = 1 / lambda1_v;
+        T pow = std::pow(tmp, lambda1_v);
+        *output_ptr = lambda_1 * pow - lambda_1;
+      }
+    }
+  }
+
+}
+}
+
+#if defined(CAFFE2_PERF_WITH_AVX2) && defined(CAFFE2_PERF_USE_MKL)
+namespace details {
+template <typename T>
+void compute_batch_box_cox__avx2_fma(
+  std::size_t N,
+  std::size_t D,
+  std::size_t block_size,
+  const T* data_ptr,
+  const T* __restrict lambda1_ptr,
+  const T* __restrict lambda2_ptr,
+  T* output_ptr);
+
+extern template
+void compute_batch_box_cox__avx2_fma<float>(
+  std::size_t N,
+  std::size_t D,
+  std::size_t block_size,
+  const float* self_data,
+  const float* __restrict lambda1_data,
+  const float* __restrict lambda2_data,
+  float* output_data);
+
+extern template
+void compute_batch_box_cox__avx2_fma<double>(
+  std::size_t N,
+  std::size_t D,
+  std::size_t block_size,
+  const double* self_data,
+  const double* __restrict lambda1_data,
+  const double* __restrict lambda2_data,
+  double* output_data);
+} // namespace detail
+#endif
+
+template <typename T>
+void compute_batch_box_cox(
+    std::size_t N,
+    std::size_t D,
+    std::size_t block_size,
+    const T* data,
+    const T* lambda1_data,
+    const T* lambda2_data,
+    T* output_data) {
+#ifdef CAFFE2_PERF_WITH_AVX2
+  AVX2_FMA_DO(
+    details::compute_batch_box_cox,
+    N,
+    D,
+    block_size,
+    data,
+    lambda1_data,
+    lambda2_data,
+    output_data);
+#endif
+  BoxCoxNaive<T>(N, D, data, lambda1_data, lambda2_data, output_data);
+}
+
+template void compute_batch_box_cox<float>(
+  std::size_t N,
+  std::size_t D,
+  std::size_t block_size,
+  const float* data,
+  const float* lambda1_data,
+  const float* lambda2_data,
+  float* output_data);
+
+template void compute_batch_box_cox<double>(
+  std::size_t N,
+  std::size_t D,
+  std::size_t block_size,
+  const double* data,
+  const double* lambda1_data,
+  const double* lambda2_data,
+  double* output_data);
+
+} // namespace caffe2
diff --git a/caffe2/perfkernels/batch_box_cox.h b/caffe2/perfkernels/batch_box_cox.h
new file mode 100644
index 0000000000000..60c973bbf8ea1
--- /dev/null
+++ b/caffe2/perfkernels/batch_box_cox.h
@@ -0,0 +1,35 @@
+// Impmenets BoxCox operator for CPU
+#pragma once
+#include <cstdint>
+
+namespace caffe2 {
+
+template <typename T>
+void compute_batch_box_cox(
+    std::size_t N,
+    std::size_t D,
+    std::size_t block_size,
+    const T* self_data,
+    const T* lambda1_data,
+    const T* lambda2_data,
+    T* output_data);
+
+extern template void compute_batch_box_cox<float>(
+    std::size_t N,
+    std::size_t D,
+    std::size_t block_size,
+    const float* data,
+    const float* lambda1_data,
+    const float* lambda2_data,
+    float* output_data);
+
+extern template void compute_batch_box_cox<double>(
+    std::size_t N,
+    std::size_t D,
+    std::size_t block_size,
+    const double* data,
+    const double* lambda1_data,
+    const double* lambda2_data,
+    double* output_data);
+
+} // namespace caffe2
diff --git a/caffe2/perfkernels/batch_box_cox_avx2.cc b/caffe2/perfkernels/batch_box_cox_avx2.cc
new file mode 100644
index 0000000000000..cf0801b4733ef
--- /dev/null
+++ b/caffe2/perfkernels/batch_box_cox_avx2.cc
@@ -0,0 +1,299 @@
+#ifdef CAFFE2_PERF_USE_MKL
+#include <c10/util/irange.h>
+#include <caffe2/perfkernels/common.h>
+#include <folly/SingletonThreadLocal.h>
+
+#include <cstdint>
+#include <cmath>
+#include <vector>
+
+#include <mkl.h>
+
+namespace caffe2::details {
+
+// MKL VML function templates.
+template <typename T>
+void PackV(const int N, const T* a, const int* ia, T* y);
+template <typename T>
+void UnpackV(const int N, const T* a, T* y, const int* iy);
+template <typename T>
+void Pow(const int N, const T* a, const T* b, T* y);
+template <typename T>
+void Add(const int N, const T* a, const T* b, T* y);
+template <typename T>
+void Div(const int N, const T* a, const T* b, T* y);
+template <typename T>
+void Ln(const int N, const T* a, T* y);
+
+#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc)                \
+  template <>                                                   \
+  void PackV<T>(const int N, const T* a, const int* ia, T* y) { \
+    OriginalFunc(N, a, ia, y);                                  \
+  }
+DELEGATE_PACKV_FUNCTION(float, vsPackV)
+DELEGATE_PACKV_FUNCTION(double, vdPackV)
+#undef DELEGATE_PACKV_FUNCTION
+
+#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc)                \
+  template <>                                                     \
+  void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \
+    OriginalFunc(N, a, y, iy);                                    \
+  }
+DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV)
+DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV)
+#undef DELEGATE_UNPACKV_FUNCTION
+
+#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \
+  template <>                                                      \
+  void Funcname<T>(const int N, const T* a, const T* b, T* y) {    \
+    OriginalFunc(N, a, b, y);                                      \
+  }
+DELEGATE_SIMPLE_BINARY_FUNCTION(float, Pow, vsPow)
+DELEGATE_SIMPLE_BINARY_FUNCTION(double, Pow, vdPow)
+DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd)
+DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd)
+DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv)
+DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv)
+#undef DELEGATE_SIMPLE_BINARY_FUNCTION
+
+#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc) \
+  template <>                                                     \
+  void Funcname<T>(const int N, const T* a, T* y) {               \
+    OriginalFunc(N, a, y);                                        \
+  }
+DELEGATE_SIMPLE_UNARY_FUNCTION(float, Ln, vsLn)
+DELEGATE_SIMPLE_UNARY_FUNCTION(double, Ln, vdLn)
+#undef DELEGATE_SIMPLE_UNARY_FUNCTION
+
+template <typename T>
+void box_cox_zero_lambda(
+    size_t D,
+    const T* const self_data,
+    const T* const lambda2_data,
+    T k_eps,
+    T* const output_data) {
+  Add(D, self_data, lambda2_data, output_data);
+  for (const auto j : c10::irange(D)) {
+    output_data[j] = std::max(output_data[j], k_eps);
+  }
+
+  Ln(D, output_data, output_data);
+}
+
+template <typename T>
+void box_cox_nonzero_lambda(
+    size_t D,
+    const T* const self_data,
+    const T* const lambda1_data,
+    const T* const lambda2_data,
+    T k_eps,
+    T* const output_data) {
+  Add(D, self_data, lambda2_data, output_data);
+  for (const auto j : c10::irange(D)) {
+    output_data[j] = std::max(output_data[j], k_eps);
+  }
+
+  // output = output ^ lambda1
+  Pow(D, output_data, lambda1_data, output_data);
+  // output = (output  - 1)/ lambda1
+  for (const auto j : c10::irange(D)) {
+    output_data[j] -= 1.0;
+  }
+  Div(D, output_data, lambda1_data, output_data);
+}
+
+template <typename T>
+void box_cox_mixed_lambda(
+    const T* const self_data,
+    const std::vector<int>& nonzeros,
+    const std::vector<int>& zeros,
+    const T* const lambda1,
+    const T* const lambda2,
+    const T* const lambda2_z_,
+    T k_eps,
+    T* const buffer,
+    T* const output_data) {
+  PackV(nonzeros.size(), self_data, nonzeros.data(), buffer);
+  box_cox_nonzero_lambda<T>(
+      nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer);
+  UnpackV(nonzeros.size(), buffer, output_data, nonzeros.data());
+
+  PackV(zeros.size(), self_data, zeros.data(), buffer);
+  box_cox_zero_lambda<T>(
+      zeros.size(), buffer, lambda2_z_, k_eps, buffer);
+  UnpackV(zeros.size(), buffer, output_data, zeros.data());
+}
+
+template <typename T>
+void TileArrayIntoVector(
+    const T* const a,
+    const size_t D,
+    const int K,
+    std::vector<T>& b) {
+  b.resize(K * D);
+  for (const auto k : c10::irange(K)) {
+    std::copy(a, a + D, b.begin() + k * D);
+  }
+}
+
+void TileIndicesInPlace(std::vector<int>& v, const std::size_t D, const std::size_t K) {
+  auto n = v.size();
+  v.resize(K * n);
+  for (const auto k : c10::irange(1, K)) {
+    for (const auto j : c10::irange(n)) {
+      v[k * n + j] = v[j] + k * D;
+    }
+  }
+}
+
+template <typename T>
+void compute_batch_box_cox__avx2_fma(
+    std::size_t N,
+    std::size_t D,
+    std::size_t block_size,
+    const T* self_data,
+    const T* __restrict lambda1_data,
+    const T* __restrict lambda2_data,
+    T* output_data) {
+  constexpr T k_eps = static_cast<T>(1e-6);
+
+  FOLLY_DECLARE_REUSED(zeros, std::vector<int>);
+  FOLLY_DECLARE_REUSED(nonzeros, std::vector<int>);
+  // Don't bother calling reserve; calls after the first will get a
+  // correctly-sized allocation anyway.
+  for (const auto j : c10::irange(D)) {
+    if (lambda1_data[j] == 0) {
+      zeros.push_back(j);
+    } else {
+      nonzeros.push_back(j);
+    }
+  }
+
+  // Process K rows at a time for effective vectorization with small rows.
+  const auto K = std::min(N, (block_size + D - 1) / D);
+
+  FOLLY_DECLARE_REUSED(lambda1_, std::vector<T>);
+  FOLLY_DECLARE_REUSED(lambda2_, std::vector<T>);
+  FOLLY_DECLARE_REUSED(lambda2_z_, std::vector<T>);
+
+  if (nonzeros.size() == D) {
+    // ((x + lambda2)^lambda1 - 1)/lambda1, if lambda1 != 0
+    size_t i = 0;
+    if (K > 1) {
+      TileArrayIntoVector(lambda1_data, D, K, lambda1_);
+      TileArrayIntoVector(lambda2_data, D, K, lambda2_);
+      DCHECK_EQ(K * D, lambda1_.size());
+      DCHECK_EQ(K * D, lambda2_.size());
+      for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
+        box_cox_nonzero_lambda<T>(
+            K * D,
+            self_data,
+            lambda1_.data(),
+            lambda2_.data(),
+            k_eps,
+            output_data);
+      }
+    }
+    for (; i < N; i++, self_data += D, output_data += D) {
+      box_cox_nonzero_lambda<T>(
+          D, self_data, lambda1_data, lambda2_data, k_eps, output_data);
+    }
+  } else if (zeros.size() == D) {
+    // ln(x + lambda2), if lambda1 == 0
+    size_t i = 0;
+    if (K > 1) {
+      TileArrayIntoVector(lambda2_data, D, K, lambda2_z_);
+      DCHECK_EQ(K * D, lambda2_z_.size());
+      for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
+        box_cox_zero_lambda<T>(
+            K * D, self_data, lambda2_z_.data(), k_eps, output_data);
+      }
+    }
+    for (; i < N; i++, self_data += D, output_data += D) {
+      box_cox_zero_lambda<T>(
+          D, self_data, lambda2_data, k_eps, output_data);
+    }
+  } else {
+    // mix zeros and nonzeros
+    const size_t n = nonzeros.size();
+    if (K > 1) {
+      TileIndicesInPlace(nonzeros, 0, K);
+      TileIndicesInPlace(zeros, 0, K);
+    }
+
+    FOLLY_DECLARE_REUSED(buffer, std::vector<T>);
+
+    buffer.resize(std::max(nonzeros.size(), zeros.size()));
+    lambda1_.resize(nonzeros.size());
+    lambda2_.resize(nonzeros.size());
+    lambda2_z_.resize(zeros.size());
+    PackV(nonzeros.size(), lambda1_data, nonzeros.data(), lambda1_.data());
+    PackV(nonzeros.size(), lambda2_data, nonzeros.data(), lambda2_.data());
+    PackV(zeros.size(), lambda2_data, zeros.data(), lambda2_z_.data());
+
+    size_t i = 0;
+    if (K > 1) {
+      // Truncate to original size, and re-tile with offsets this time.
+      nonzeros.resize(n);
+      DCHECK_GT(D, n);
+      zeros.resize(D - n);
+      TileIndicesInPlace(nonzeros, D, K);
+      TileIndicesInPlace(zeros, D, K);
+      DCHECK_EQ(nonzeros.size(), lambda1_.size());
+      DCHECK_EQ(nonzeros.size(), lambda2_.size());
+      DCHECK_EQ(zeros.size(), lambda2_z_.size());
+
+      for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
+        box_cox_mixed_lambda<T>(
+            self_data,
+            nonzeros,
+            zeros,
+            lambda1_.data(),
+            lambda2_.data(),
+            lambda2_z_.data(),
+            k_eps,
+            buffer.data(),
+            output_data);
+      }
+      // Truncate to original size.
+      nonzeros.resize(n);
+      zeros.resize(D - n);
+    }
+    for (; i < N; i++, self_data += D, output_data += D) {
+      box_cox_mixed_lambda<T>(
+          self_data,
+          nonzeros,
+          zeros,
+          lambda1_.data(),
+          lambda2_.data(),
+          lambda2_z_.data(),
+          k_eps,
+          buffer.data(),
+          output_data);
+    }
+  }
+};
+
+
+template
+void compute_batch_box_cox__avx2_fma<float>(
+  std::size_t N,
+  std::size_t D,
+  std::size_t block_size,
+  const float* self_data,
+  const float* __restrict lambda1_data,
+  const float* __restrict lambda2_data,
+  float* output_data);
+
+template
+void compute_batch_box_cox__avx2_fma<double>(
+  std::size_t N,
+  std::size_t D,
+  std::size_t block_size,
+  const double* self_data,
+  const double* __restrict lambda1_data,
+  const double* __restrict lambda2_data,
+  double* output_data);
+
+} // namespace caffe2::detail
+#endif
diff --git a/caffe2/perfkernels/common.h b/caffe2/perfkernels/common.h
index fb960dbe5dc3c..6fed9e1d6d06c 100644
--- a/caffe2/perfkernels/common.h
+++ b/caffe2/perfkernels/common.h
@@ -62,7 +62,10 @@ In foo.cc, do:
 
 #pragma once
 
+#if defined(CAFFE2_PERF_WITH_AVX512) || defined(CAFFE2_PERF_WITH_AVX2) \
+     || defined(CAFFE2_PERF_WITH_AVX)
 #include <cpuinfo.h>
+#endif
 
 // DO macros: these should be used in your entry function, similar to foo()
 // above, that routes implementations based on CPU capability.

From c23583033f753c645a9143b5cc6e8497e1efc84b Mon Sep 17 00:00:00 2001
From: eqy <eddiey@nvidia.com>
Date: Sun, 23 Oct 2022 21:17:12 +0000
Subject: [PATCH 0056/1922] Fix 64bit indexing in `vol2col` (#87527)

Surfaced from #87354

CC @ngimel @ptrblck @maybeLee
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87527
Approved by: https://github.com/ngimel
---
 aten/src/ATen/native/cuda/vol2col.cuh | 58 +++++++++++++--------------
 test/test_nn.py                       | 10 +++++
 2 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/aten/src/ATen/native/cuda/vol2col.cuh b/aten/src/ATen/native/cuda/vol2col.cuh
index 7ab719bc819eb..51dbe1c744053 100644
--- a/aten/src/ATen/native/cuda/vol2col.cuh
+++ b/aten/src/ATen/native/cuda/vol2col.cuh
@@ -15,7 +15,7 @@ using namespace at::cuda::detail;
 // Kernel for fast unfold+copy on volumes
 template <typename T>
 __global__ void vol2col_kernel(
-    const int n,
+    const int64_t n,
     const T* data_vol,
     const int depth,
     const int height,
@@ -37,16 +37,16 @@ __global__ void vol2col_kernel(
     const int width_col,
     T* data_col) {
   CUDA_KERNEL_LOOP(index, n) {
-    int w_out = index % width_col;
+    auto w_out = index % width_col;
     index /= width_col;
-    int h_out = index % height_col;
+    auto h_out = index % height_col;
     index /= height_col;
-    int t_out = index % depth_col;
-    int channel_in = index / depth_col;
-    int channel_out = channel_in * ksize_t * ksize_h * ksize_w;
-    int t_in = t_out * stride_t - pad_t;
-    int h_in = h_out * stride_h - pad_h;
-    int w_in = w_out * stride_w - pad_w;
+    auto t_out = index % depth_col;
+    auto channel_in = index / depth_col;
+    auto channel_out = channel_in * ksize_t * ksize_h * ksize_w;
+    auto t_in = t_out * stride_t - pad_t;
+    auto h_in = h_out * stride_h - pad_h;
+    auto w_in = w_out * stride_w - pad_w;
     data_col +=
         ((channel_out * depth_col + t_out) * height_col + h_out) * width_col +
         w_out;
@@ -54,9 +54,9 @@ __global__ void vol2col_kernel(
     for (int i = 0; i < ksize_t; ++i) {
       for (int j = 0; j < ksize_h; ++j) {
         for (int k = 0; k < ksize_w; ++k) {
-          int t = t_in + i * dilation_t;
-          int h = h_in + j * dilation_h;
-          int w = w_in + k * dilation_w;
+          auto t = t_in + i * dilation_t;
+          auto h = h_in + j * dilation_h;
+          auto w = w_in + k * dilation_w;
           *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height &&
                        w < width)
               ? data_vol
@@ -126,7 +126,7 @@ void vol2col(
 
 template <typename T, typename accT>
 __global__ void vol2im_kernel(
-    const unsigned n,
+    const int64_t n,
     const T* data_col,
     const unsigned depth,
     const unsigned height,
@@ -150,30 +150,30 @@ __global__ void vol2im_kernel(
     T* data_vol) {
   CUDA_KERNEL_LOOP(index, n) {
     accT val = static_cast<accT>(0);
-    const unsigned w_im = index % width + pad_w;
-    const unsigned h_im = (index / width) % height + pad_h;
-    const unsigned t_im = (index / width / height) % depth + pad_t;
-    const unsigned c_im = index / (width * height * depth);
-    unsigned kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
-    unsigned kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
-    unsigned kernel_extent_t = (kernel_t - 1) * dilation_t + 1;
+    const auto w_im = index % width + pad_w;
+    const auto h_im = (index / width) % height + pad_h;
+    const auto t_im = (index / width / height) % depth + pad_t;
+    const auto c_im = index / (width * height * depth);
+    auto kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
+    auto kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
+    auto kernel_extent_t = (kernel_t - 1) * dilation_t + 1;
     // compute the start and end of the output
-    const unsigned w_col_start =
+    const auto w_col_start =
         (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
-    const unsigned w_col_end = std::min(w_im / stride_w + 1, width_col);
-    const unsigned h_col_start =
+    const auto w_col_end = std::min(w_im / stride_w + 1, width_col);
+    const auto h_col_start =
         (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
-    const unsigned h_col_end = std::min(h_im / stride_h + 1, height_col);
-    const unsigned t_col_start =
+    const auto h_col_end = std::min(h_im / stride_h + 1, height_col);
+    const auto t_col_start =
         (t_im < kernel_extent_t) ? 0 : (t_im - kernel_extent_t) / stride_t + 1;
-    const unsigned t_col_end = std::min(t_im / stride_t + 1, depth_col);
+    const auto t_col_end = std::min(t_im / stride_t + 1, depth_col);
     // TODO: use LCM of stride and dilation to avoid unnecessary loops
     for (unsigned t_col = t_col_start; t_col < t_col_end; t_col += 1) {
       for (unsigned h_col = h_col_start; h_col < h_col_end; h_col += 1) {
         for (unsigned w_col = w_col_start; w_col < w_col_end; w_col += 1) {
-          unsigned t_k = (t_im - t_col * stride_t);
-          unsigned h_k = (h_im - h_col * stride_h);
-          unsigned w_k = (w_im - w_col * stride_w);
+          uint64_t t_k = (t_im - t_col * stride_t);
+          uint64_t h_k = (h_im - h_col * stride_h);
+          uint64_t w_k = (w_im - w_col * stride_w);
           if (t_k % dilation_t == 0 && h_k % dilation_h == 0 &&
               w_k % dilation_w == 0) {
             t_k /= dilation_t;
diff --git a/test/test_nn.py b/test/test_nn.py
index 6c7f1e82ccd63..5fbdcacd641d8 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -15159,6 +15159,16 @@ def test_conv_large_nosplit(self, device):
         input_large = torch.randn(1, 1, 2048, 1024 , dtype=dtype, device=device)
         conv2(input_large)
 
+    @onlyCUDA
+    @largeTensorTest('40GB')
+    @largeTensorTest('24GB', 'cpu')
+    def test_conv3d_64bit_indexing(self, device):
+        x = torch.rand(1, 32, 512, 512, 256)
+        m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)
+        yref = m(x)
+        y = m.to(device=device)(x.to(device=device))
+        self.assertEqual(yref, y)
+
     def test_conv_noncontig_weights(self, device):
         for dim in (1, 2, 3):
             for grouped in (False, True):

From 315967c5bc944293ee47b3ae3169095f98443dc0 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Sat, 22 Oct 2022 14:50:45 +0000
Subject: [PATCH 0057/1922] Improvements for DDP Optimizer (#87549)

- adds support for 'first_bucket_cap' arg, to align bucketing more precisely
  with DDP, which may start a smaller first bucket
- refactors the bucket splitting logic to be cleaner
- adds pretty-print for bucket info, and a way to access bucket info
  from the DDPOptimizer class from a test case or benchmark
- dumps debug logs to stdout

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87549
Approved by: https://github.com/soumith
---
 test/distributed/test_dynamo_distributed.py |   9 +-
 torch/_dynamo/optimizations/distributed.py  | 129 ++++++++++++--------
 2 files changed, 84 insertions(+), 54 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 0fefd4ec507a7..43a4a23039175 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -18,8 +18,8 @@ def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5):
         super().__init__()
         self.net = nn.Sequential(
             *[nn.Linear(in_feat, hidden_feat), nn.ReLU()]
-            + [nn.Linear(5000, 5000), nn.ReLU()] * num_hidden
-            + [nn.Linear(5000, 5), nn.ReLU()]
+            + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()] * num_hidden
+            + [nn.Linear(hidden_feat, 5), nn.ReLU()]
         )
 
     def forward(self, inputs):
@@ -160,7 +160,10 @@ def test_no_split(self):
         Ensures the DDPOptimizer returns a correct, compiled module without
         introducing graph splits. (Based on model parmeters fitting in the bucket)
         """
-        m, inputs, correct_outputs = self.get_model()
+        # DDP will always do a 'first bucket' with a really small size;  so only a tiny model will escape this
+        m = ToyModel(hidden_feat=5).to(self.device)
+        inputs = torch.randn(20, 10).to(self.device)
+        correct_outputs = m(inputs)
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250)
 
         check_splits_compiler = CheckSplitsCompiler()
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index f65c16483aec6..bd3f07b635f3f 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -1,4 +1,5 @@
-from typing import Any, List
+from dataclasses import dataclass, field
+from typing import Any, List, Optional
 
 import torch
 import torch.fx.traceback as fx_traceback
@@ -18,6 +19,28 @@ def args_str(args):
         return str(args)
 
 
+@dataclass
+class Bucket:
+    size: int = 0
+    params: List[str] = field(default_factory=list)
+    nodes: List[fx.Node] = field(default_factory=list)
+
+
+def pretty_print_buckets(buckets: List[Bucket]):
+    headers = ("Index", "Size (b)", "Param Names")
+    rows = []
+    for idx, bucket in enumerate(reversed(buckets)):
+        rows.append((idx, bucket.size, bucket.params[0]))
+        for param in bucket.params[1:]:
+            rows.append((None, None, param))
+    try:
+        from tabulate import tabulate
+
+        print(tabulate(rows, headers=headers, tablefmt="simple_grid"))
+    except ImportError:
+        print("Please `pip install tabulate` in order to pretty-print ddp bucket sizes")
+
+
 class DDPOptimizer:
     def __init__(
         self,
@@ -25,8 +48,20 @@ def __init__(
         parameters_to_ignore: List[str],
         backend_compile_fn,
         debug=False,
+        first_bucket_cap: Optional[int] = None,
     ):
+        if first_bucket_cap is not None:
+            self.first_bucket_cap = first_bucket_cap
+        elif torch.distributed.is_available():
+            # this constant comes from C10D lib which is not always built
+            self.first_bucket_cap = torch.distributed._DEFAULT_FIRST_BUCKET_BYTES
+        else:
+            self.first_bucket_cap = bucket_bytes_cap
+
         self.bucket_bytes_cap = bucket_bytes_cap
+        assert (
+            self.first_bucket_cap <= self.bucket_bytes_cap
+        ), "First bucket should be smaller/equal to other buckets to get comms warmed up ASAP"
         self.parameters_to_ignore = parameters_to_ignore
         self.backend_compile_fn = backend_compile_fn
         self.debug = debug
@@ -35,76 +70,69 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
         """
         TODO:
         - handle params_and_buffers_to_ignore
-        - handle kwargs
         """
 
         # 1: compute the partition map according to DDP bucket logic
-        bucket_bytes = 0
-        bucket_actual_sizes = []
-        node_splits = [[]]
+        buckets = [Bucket()]  # (size, param_names)
         for node in reversed(gm.graph.nodes):
-            if node.op == "output" or node.op == "placeholder":
+            if node.op in ("output", "placeholder"):
                 continue
 
-            if bucket_bytes >= self.bucket_bytes_cap:
-                bucket_actual_sizes.insert(0, bucket_bytes)
-                bucket_bytes = 0
-                node_splits.insert(0, [])
+            if (
+                buckets[0].size >= self.bucket_bytes_cap
+                or len(buckets) == 1
+                and buckets[0].size >= self.first_bucket_cap
+            ):
+                buckets.insert(0, Bucket())
 
-            elif node.op == "call_module":
+            if node.op == "call_module":
                 target = gm.get_submodule(node.target)
-                params_size_b = sum(
-                    [
-                        p.storage().nbytes()
-                        for p in target.parameters()
-                        if p.requires_grad
-                    ]
-                )
-                bucket_bytes += params_size_b
-                # print(f"accumulated {params_size_b} b from {node}")
+                for name, p in target.named_parameters():
+                    if p.requires_grad:
+                        buckets[0].size += p.storage().nbytes()
+                        # TODO correct FQ name?
+                        buckets[0].params.append(f"{node}_{name}")
             elif node.op == "get_attr":
                 maybe_param = getattr(gm, node.target)
                 if maybe_param.requires_grad:
-                    bucket_bytes += maybe_param.storage().nbytes()
-            else:
-                # TODO(whc) confirm this:
-                # (e.g. call_method, call_function aren't expected to 'have' parameters)
-                pass
-
-            node_splits[0].append(node)
-
-        if len(node_splits) == 1:
-            if self.debug:
-                print(
-                    "DDPOptimizer did not split graphs."
-                    f" Accumulated {bucket_bytes} bytes, and bucket cap is {self.bucket_bytes_cap}"
-                )
-            return self.backend_compile_fn(gm, example_inputs)
+                    buckets[0].size += maybe_param.storage().nbytes()
+                    buckets[0].params.append(node.target)
 
-        if len(bucket_actual_sizes) < len(node_splits):
-            bucket_actual_sizes.insert(0, bucket_bytes)
+            # All nodes have to be mapped to a bucket, even if they don't have their own params
+            buckets[0].nodes.append(node)
 
+        # stash buckets for testing/debugging purposes
+        self.buckets = buckets
         if self.debug:
             print(
-                f"DDPOptimizer used bucket cap {self.bucket_bytes_cap}"
-                f" and split graphs into parameter sizes {', '.join([str(b) for b in bucket_actual_sizes])}"
+                f"DDPOptimizer used bucket cap {self.bucket_bytes_cap} and produced the following buckets:"
             )
+            pretty_print_buckets(buckets)
+
+        if len(buckets) == 1:
+            # bypass split/fuse logic if there is only one bucket
+            return self.backend_compile_fn(gm, example_inputs)
 
         # 2: partition the graphmodule according to bucket capacity
         partition_map = {}
-        for p, nodes in enumerate(node_splits):
-            for node in nodes:
-                partition_map[node] = p
+        for idx, b in enumerate(buckets):
+            for node in b.nodes:
+                partition_map[node] = idx
 
         split_gm = fx.passes.split_module.split_module(
             gm, None, lambda node: partition_map[node]
         )
         if self.debug:
-            with open("debug_ddp_optimizer.log", "w") as dump_file:
-                dump_file.write("---orig graph---")
-                dump_file.write(str(gm.graph))
-                dump_file.write("\n---split graph---")
-                dump_file.write(str(split_gm.graph))
+            print("---orig graph---")
+            print(str(gm.graph))
+            print("\n---split graph---")
+            print(str(split_gm.graph))
+            for name, module in split_gm.named_modules():
+                if "." not in name:
+                    # only print the submod graphs, not their children
+                    print(f"\n---{name} graph---")
+                    print(str(module.graph))
+            print("---------------")
 
         # 3: compile each of the partitioned submodules using the user-provided compiler
         class SubmodCompiler(torch.fx.interpreter.Interpreter):
@@ -171,7 +199,6 @@ def run_node(self, n: Node) -> Any:
                         self.module.delete_submodule(n.target)
                         n.target = "compiled_" + n.target
                         self.module.add_submodule(n.target, compiled_submod)
-
                     # then we execute the modified node using the usual logic
                     return getattr(self, n.op)(n.target, args, kwargs)
 
@@ -180,8 +207,8 @@ def run_node(self, n: Node) -> Any:
         split_gm.recompile()
 
         if self.debug:
-            with open("debug_ddp_optimizer.log", "a") as dump_file:
-                dump_file.write("\n---final graph---")
-                dump_file.write(str(split_gm.graph))
+            print("\n---final graph---")
+            print(str(split_gm.graph))
+            print("---------------")
 
         return split_gm

From 58fba3d0bca7f74f8c76645ccea598a240f955cb Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Sun, 23 Oct 2022 20:38:41 +0000
Subject: [PATCH 0058/1922] Improve readability of the extra message errors in
 assertEqual (#87202)

Goes from (note the `linspace.default` is very difficult to find)
```
Mismatched elements: 15 / 50 (30.0%)
Greatest absolute difference: 1 at index (17,)
Greatest relative difference: 1.0 at index (17,) : linspace.default
args = (0, -3, 50)
kwargs = {'dtype': torch.int16, 'device': device(type='cpu'),
'pin_memory': False}
```
to
```
Mismatched elements: 15 / 50 (30.0%)
Greatest absolute difference: 1 at index (17,)
Greatest relative difference: 1.0 at index (17,)
linspace.default
args = (0, -3, 50)
kwargs = {'dtype': torch.int16, 'device': device(type='cpu'),
'pin_memory': False}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87202
Approved by: https://github.com/ezyang
---
 test/test_testing.py                    | 2 +-
 torch/testing/_internal/common_utils.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_testing.py b/test/test_testing.py
index fad72ab91de0a..3ad6ff06c771e 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -68,7 +68,7 @@ def test_assertEqual_longMessage(self):
 
             self.longMessage = True
             extra_msg = "sentinel"
-            with self.assertRaisesRegex(AssertionError, re.escape(f"{default_msg} : {extra_msg}")):
+            with self.assertRaisesRegex(AssertionError, re.escape(f"{default_msg}\n{extra_msg}")):
                 self.assertEqual(actual, expected, msg=extra_msg)
         finally:
             self.longMessage = long_message
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 77887574e1888..5da1ffefaba91 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -2497,7 +2497,7 @@ def to_list(input):
             # This emulates unittest.TestCase's behavior if a custom message passed and
             # TestCase.longMessage (https://docs.python.org/3/library/unittest.html#unittest.TestCase.longMessage)
             # is True (default)
-            msg=(lambda generated_msg: f"{generated_msg} : {msg}") if isinstance(msg, str) and self.longMessage else msg,
+            msg=(lambda generated_msg: f"{generated_msg}\n{msg}") if isinstance(msg, str) and self.longMessage else msg,
         )
 
     def assertNotEqual(self, x, y, msg: Optional[str] = None, *,                                       # type: ignore[override]

From 4f6bdefd4e166ae19ef1fdf4ea9e0ce0e438e56c Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Sun, 23 Oct 2022 20:38:41 +0000
Subject: [PATCH 0059/1922] [PrimTorch] Add maker for *_copy variants of view
 functions (#87278)

Implements `diagonal_copy` as an example. This PR also fixes a number of
correcness issues with `diagonal_copy`.

cc @ezyang @mruberry @ngimel @Lezcano @fdrocha
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87278
Approved by: https://github.com/mruberry
---
 .../functorch/BatchRulesDecompositions.cpp    |  1 +
 aten/src/ATen/native/TensorShape.cpp          | 12 ++++++++--
 test/functorch/test_aotdispatch.py            |  1 +
 tools/autograd/gen_variable_type.py           |  1 +
 torch/_refs/__init__.py                       | 23 +++++++++++++++++++
 .../_internal/common_methods_invocations.py   | 16 +++++++++++--
 6 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index 157fbf23bf6fd..2ea8b7fac4546 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -257,6 +257,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE(frobenius_norm);
   OP_DECOMPOSE(type_as);
   OP_DECOMPOSE(linalg_diagonal);
+  OP_DECOMPOSE(diagonal_copy);
   m.impl("pad", native::pad_symint);
   m.impl("_pad_circular", native::_pad_circular_symint);
   OP_DECOMPOSE(t_);
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index a72fba7ac12e0..05d1f53515c14 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -3709,8 +3709,16 @@ at::Tensor& _sparse_broadcast_to_copy_out(const at::Tensor & self, at::IntArrayR
 
 
 at::Tensor& diagonal_copy_out(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) {
-  auto tmp = self.diagonal(offset, dim1, dim2);
-  out.copy_(tmp);
+  TORCH_CHECK(
+    out.device() == self.device(),
+    "diagonal_copy: Expected out and self tensors to be on the same device, but got ",
+    "out on ", out.device(), " and self on ", self.device());
+  auto result = self.diagonal(offset, dim1, dim2);
+  at::native::resize_output(out, result.sizes());
+  TORCH_CHECK(
+      canCast(result.scalar_type(), out.scalar_type()),
+      "diagonal_copy: result type ", result.scalar_type(), " can't be cast to the desired out= type ", out.scalar_type());
+  out.copy_(result);
   return out;
 }
 
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index c058b3618ecb1..57013636eeabf 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1003,6 +1003,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diag', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('diagonal', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('diagonal_copy', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('diagonal_scatter', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('diff', ''),  # aten.zeros_like.default - couldn't find symbolic meta function/decomposition
     xfail('digamma', ''),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index 53bd60b76e6bd..2feb84bbd088d 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -295,6 +295,7 @@
     "reflection_pad3d",
     "linalg_cholesky_ex",
     "linalg_eig",
+    "diagonal_copy",
     "select_backward",
     "diagonal_backward",
     "slice_backward",
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index d6a8f476b3176..6169e5af06d9a 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -220,6 +220,7 @@
     "contiguous",
     "diag_embed",
     "diagonal",
+    "diagonal_copy",
     "dsplit",
     "dstack",
     "expand",
@@ -2001,6 +2002,25 @@ def _reduction(
     return result
 
 
+def _make_copy_from_view(fn):
+    """
+    Given a view function (e.g. torch.diagonal) generates its copy variant (e.g. torch.diagonal_copy)
+    """
+    name = fn.__name__
+    fn = out_wrapper()(fn)
+
+    def _fn(*args, out=None, **kwargs):
+        result = fn(*args, out=out, **kwargs)
+        if out is None:
+            return result.clone(memory_format=torch.contiguous_format)
+        return result
+
+    copy_name = f"{name}_copy"
+    _fn.__name__ = copy_name
+    _fn = register_decomposition(getattr(torch.ops.aten, copy_name))(_fn)
+    return _fn
+
+
 # Saves Python all
 py_all = all
 
@@ -3505,6 +3525,9 @@ def diagonal(
     return result
 
 
+diagonal_copy = _make_copy_from_view(diagonal)
+
+
 @register_decomposition(torch.ops.aten.diag_embed)
 def diag_embed(
     t: TensorLikeType,
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f637339f16d24..95b1df24a9512 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -5138,7 +5138,7 @@ def reference_inputs_diagonal_diag_embed(op_info, device, dtype, requires_grad,
     samples3d = product(shapes3d, kwargs3d)
 
     for shape, kwargs in chain(samples1d, samples2d, samples3d):
-        if op_info.name in ('diagonal', '_refs.diagonal'):
+        if 'diagonal' in op_info.name:
             # these are error inputs for diagonal
             if shape in ((0,), (1,)):
                 continue
@@ -5174,7 +5174,7 @@ def error_inputs_diagonal_diag_embed(op_info, device, **kwargs):
         dim1 = kwargs.get('dim1')
         dim2 = kwargs.get('dim2')
 
-        if op_info.name in ('diagonal', '_refs.diagonal'):
+        if 'diagonal' in op_info.name:
             num_dim = arg.dim()
         elif op_info.name in ('diag_embed', '_refs.diag_embed'):
             # these are valid inputs for diag_embed
@@ -9185,6 +9185,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_diagonal_diag_embed,
            reference_inputs_func=reference_inputs_diagonal_diag_embed,
            error_inputs_func=error_inputs_diagonal_diag_embed),
+    OpInfo('diagonal_copy',
+           dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16, torch.chalf),
+           supports_forward_ad=True,
+           supports_fwgrad_bwgrad=True,
+           sample_inputs_func=sample_inputs_diagonal_diag_embed,
+           reference_inputs_func=reference_inputs_diagonal_diag_embed,
+           error_inputs_func=error_inputs_diagonal_diag_embed),
     OpInfo('diagonal_scatter',
            dtypes=all_types_and(torch.bool, torch.bfloat16, torch.float16),
            supports_out=False,
@@ -17694,6 +17701,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         torch_opinfo_name="diagonal",
         supports_nvfuser=False,
     ),
+    PythonRefInfo(
+        "_refs.diagonal_copy",
+        torch_opinfo_name="diagonal_copy",
+        supports_nvfuser=False,
+    ),
     PythonRefInfo(
         "_refs.diag_embed",
         torch_opinfo_name="diag_embed",

From 6d90ea6627714417066203f8840d5892cf190097 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Sun, 23 Oct 2022 20:38:41 +0000
Subject: [PATCH 0060/1922] Simplify a few diagonal-related functions (#87180)

`diag` was unnecessarily implemented as a kernel rather than as a composite
function, which made it unnecessarily difficult (explicit backward + all it entails).

We also change a few uses of `diag` on 2D tensors for `diagonal()`. The
latter returns a view rather than creating a new tensor.

We also upgrade its meta implementation to a fully-fledged
decomposition

I tried implementing the backwards of `diagonal()` via `diag_scatter` (or better `diag_scatter_` to keep the perf) but functionalisation was failing and I was not sure how to fix this, so I moved on. It may be possible to simplify that one as well if @soulitzer or someone knows how to do this.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87180
Approved by: https://github.com/ngimel, https://github.com/albanD, https://github.com/mruberry
---
 .../functorch/BatchRulesDecompositions.cpp    |   1 -
 aten/src/ATen/functorch/BatchRulesViews.cpp   |   2 +-
 aten/src/ATen/native/Correlation.cpp          |   2 +-
 aten/src/ATen/native/TensorShape.cpp          |  79 ++-----
 aten/src/ATen/native/cuda/TriangularOps.cu    | 130 +-----------
 .../native/mps/operations/TriangularOps.mm    | 192 ------------------
 aten/src/ATen/native/native_functions.yaml    |  15 +-
 aten/src/ATen/native/ts_native_functions.yaml |   1 +
 .../check_forward_backward_compatibility.py   |   1 +
 test/functorch/test_vmap.py                   |   3 +
 test/lazy/test_ts_opinfo.py                   |   1 +
 test/test_autograd.py                         |  13 --
 test/test_decomp.py                           |   2 +
 tools/autograd/derivatives.yaml               |   4 -
 torch/_meta_registrations.py                  |  16 --
 torch/_refs/__init__.py                       |  17 ++
 .../csrc/jit/runtime/static/generated_ops.cpp |  19 --
 .../lazy/ts_backend/ts_native_functions.cpp   |   9 +
 .../_internal/common_methods_invocations.py   |  22 +-
 19 files changed, 69 insertions(+), 460 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index 2ea8b7fac4546..f1108bac25a0a 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -81,7 +81,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE2(dsplit, int);
   OP_DECOMPOSE2(dsplit, array);
   OP_DECOMPOSE(det);
-  m.impl("diag_backward", native::diag_backward_symint);
   OP_DECOMPOSE(diff);
   OP_DECOMPOSE(dstack);
   OP_DECOMPOSE(einsum);
diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp
index 9dd014a4307f9..b8c3727d15dcc 100644
--- a/aten/src/ATen/functorch/BatchRulesViews.cpp
+++ b/aten/src/ATen/functorch/BatchRulesViews.cpp
@@ -506,7 +506,7 @@ std::tuple<Tensor, optional<int64_t>> diag_embed_batch_rule(const Tensor& self,
 }
 
 Tensor trace_decomp(const Tensor& tensor) {
-  return tensor.diag().sum();
+  return tensor.diagonal().sum();
 }
 
 TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
diff --git a/aten/src/ATen/native/Correlation.cpp b/aten/src/ATen/native/Correlation.cpp
index 204e4f2cb5688..9aca753c78ca5 100644
--- a/aten/src/ATen/native/Correlation.cpp
+++ b/aten/src/ATen/native/Correlation.cpp
@@ -139,7 +139,7 @@ Tensor corrcoef(const Tensor& self) {
   }
 
   // normalize covariance
-  const auto d = c.diag();
+  const auto d = c.diagonal();
   const auto stddev = at::sqrt(d.is_complex() ? at::real(d) : d);
   c = c / stddev.view({-1, 1});
   c = c / stddev.view({1, -1});
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 05d1f53515c14..6543509d3dcb8 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -3390,72 +3390,29 @@ Tensor unfold(const Tensor& self, int64_t d, int64_t size, int64_t step) {
   return self.as_strided(sizes, strides);
 }
 
-template <typename scalar_t>
-void apply_diag(Tensor& result, const Tensor& self, int64_t dimension) {
-  TORCH_CHECK(self.dim() == 1 || self.dim() == 2, "matrix or a vector expected");
-
-  auto self_data = self.data_ptr<scalar_t>();
-  if (self.dim() == 1) {
-    auto self_size = self.size(0);
-    auto self_stride = self.stride(0);
-    int64_t sz = self_size + std::abs(dimension);
-
-    at::native::resize_output(result, {sz, sz});
-    result.zero_();
-    auto r_data = result.data_ptr<scalar_t>();
-    auto r_stride_0 = result.stride(0);
-    auto r_stride_1 = result.stride(1);
-    r_data += (dimension >= 0 ? dimension*r_stride_1 : -dimension*r_stride_0);
-
-    for (const auto i : c10::irange(self_size)) {
-      r_data[i * (r_stride_0 + r_stride_1)] = self_data[i * self_stride];
-    }
+Tensor diag(const Tensor& self, int64_t offset) {
+  auto ndim = self.dim();
+  TORCH_CHECK(ndim == 1 || ndim == 2, "diag(): Supports 1D or 2D tensors. Got ", self.dim(), "D");
+  if (ndim == 1) {
+    return at::diag_embed(self, offset);
   } else {
-    auto self_stride_0 = self.stride(0);
-    auto self_stride_1 = self.stride(1);
-
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    int64_t sz;
-    if (dimension >= 0) {
-      sz = std::min(self.size(0), self.size(1) - dimension);
-    } else {
-      sz = std::min(self.size(0) + dimension, self.size(1));
-    }
-
-    at::native::resize_output(result, {sz});
-    result.zero_();
-    auto r_data = result.data_ptr<scalar_t>();
-    auto r_stride_0 = result.stride(0);
-    self_data += (dimension >= 0 ? dimension * self_stride_1 : -dimension * self_stride_0);
-    for (const auto i : c10::irange(sz)) {
-      r_data[i * r_stride_0] = self_data[i * (self_stride_0 + self_stride_1)];
-    }
+    // We return a copy of the diagonal
+    return at::diagonal_copy(self, offset);
   }
 }
 
-Tensor diag(const Tensor& self, int64_t dimension) {
-  Tensor result = at::empty({0}, self.options());
-  at::diag_out(result, self, dimension);
-  return result;
-}
-
-Tensor& diag_cpu_out(const Tensor& self, int64_t dimension, Tensor &result) {
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kBool, self.scalar_type(), "diag", [&] {
-    apply_diag<scalar_t>(result, self, dimension);
-  });
-  return result;
-}
-
-Tensor diag_backward_symint(const Tensor& grad, SymIntArrayRef input_sizes, int64_t diagonal) {
-  auto ndimension = input_sizes.size();
-  AT_ASSERT(ndimension == 1 || ndimension == 2);
-
-  if (ndimension == 1 || input_sizes[0] == input_sizes[1]) {
-    return grad.diag(diagonal);
+Tensor& diag_out(const Tensor& self, int64_t offset, Tensor& out) {
+  auto ndim = self.dim();
+  TORCH_CHECK(ndim == 1 || ndim == 2, "Supports 1D or 2D tensors. Got ", self.dim(), "D");
+  if (ndim == 1) {
+    TORCH_CHECK(
+        canCast(self.scalar_type(), out.scalar_type()),
+        "diag: result type ", self.scalar_type(), " can't be cast to the desired out= type ",
+        out.scalar_type());
+    return at::diag_embed_out(out, self, offset);
+  } else {
+    return at::diagonal_copy_out(out, self, offset);
   }
-
-  // Input was a matrix but was not square
-  return at::diagonal_backward_symint(grad, input_sizes, diagonal, 0, 1);
 }
 
 Tensor diagonal_backward_symint(const Tensor & grad, SymIntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2) {
diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu
index f87d821f396ce..a079ec6849888 100644
--- a/aten/src/ATen/native/cuda/TriangularOps.cu
+++ b/aten/src/ATen/native/cuda/TriangularOps.cu
@@ -102,137 +102,9 @@ TORCH_IMPL_FUNC(triu_cuda)(const Tensor& self, int64_t k, const Tensor &result)
   }
 }
 
-// Copy the kth diagonal of a matrix B to a vector A.
-template <typename scalar_t>
-C10_LAUNCH_BOUNDS_1(1024)
-__global__ void copy_from_diagonal_kernel(
-    scalar_t* a,
-    scalar_t* b,
-    std::ptrdiff_t start,
-    std::ptrdiff_t size,
-    std::ptrdiff_t strideSum,
-    std::ptrdiff_t strideA) {
-  for (std::ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
-       linearIndex < size;
-       linearIndex += gridDim.x * blockDim.x) {
-    const std::ptrdiff_t bOffset = start + strideSum * linearIndex;
-    a[strideA * linearIndex] = b[bOffset];
-  }
-}
-
-// Copy vector B to the kth diagonal of a matrix A
-template <typename scalar_t>
-C10_LAUNCH_BOUNDS_1(1024)
-__global__ void copy_to_diagonal_kernel(
-    scalar_t* a,
-    scalar_t* b,
-    std::ptrdiff_t start,
-    std::ptrdiff_t size,
-    std::ptrdiff_t strideSum,
-    std::ptrdiff_t strideB) {
-  for (std::ptrdiff_t linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
-       linearIndex < size;
-       linearIndex += gridDim.x * blockDim.x) {
-    const std::ptrdiff_t aOffset = start + strideSum * linearIndex;
-    a[aOffset] = b[strideB * linearIndex];
-  }
-}
-
-template <typename scalar_t>
-Tensor& apply_diag(Tensor& result, const Tensor& self, int64_t dimension) {
-  TORCH_CHECK(
-      self.dim() == 1 || self.dim() == 2, "matrix or a vector expected");
-
-  TensorArg result_arg{result, "result", 1};
-  TensorArg self_arg{self, "self", 2};
-  checkAllSameGPU(__func__, {result_arg, self_arg});
-  checkSameType(__func__, result_arg, self_arg);
-
-  int nDimension = self.dim();
-  if (nDimension == 2) {
-    auto self_stride_0 = self.stride(0);
-    auto self_stride_1 = self.stride(1);
-
-    int sz;
-    if (dimension > 0) {
-      sz = std::min(self.size(0), self.size(1) - dimension);
-    } else {
-      sz = std::min(self.size(0) + dimension, self.size(1));
-    }
-
-    at::native::resize_output(result, {sz});
-    if (sz > 0) {
-      at::assert_no_internal_overlap(result);
-      auto result_stride = result.stride(0);
-      const dim3 threads(std::min(
-          int(sz),
-          int(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock)));
-      const dim3 grid(
-          std::min(int(1024), ceil_div(int(sz), int(threads.x))));
-      auto start =
-          (dimension >= 0 ? dimension * self_stride_1
-                          : -dimension * self_stride_0);
-
-      // Kernel Launch
-      copy_from_diagonal_kernel<scalar_t>
-          <<<grid, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
-              result.data_ptr<scalar_t>(),
-              self.data_ptr<scalar_t>(),
-              start,
-              sz,
-              self_stride_0 + self_stride_1,
-              result_stride);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-    }
-  } else {
-    auto n_elems = self.numel();
-    auto sz = (dimension > 0) ? n_elems + dimension : n_elems - dimension;
-    auto self_stride = self.stride(0);
-    at::native::resize_output(result, {sz, sz});
-    result.zero_();
-    if (sz > 0) {
-      at::assert_no_internal_overlap(result);
-      auto result_stride_0 = result.stride(0);
-      auto result_stride_1 = result.stride(1);
-      const dim3 threads(std::min(
-          int(sz), at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock));
-      const dim3 grid(
-          std::min(int(1024), ceil_div(int(sz), int(threads.x))));
-      auto start =
-          (dimension >= 0 ? dimension * result_stride_1
-                          : -dimension * result_stride_0);
-
-      // Kernel Launch
-      copy_to_diagonal_kernel<scalar_t>
-          <<<grid, threads, 0, c10::cuda::getCurrentCUDAStream()>>>(
-              result.data_ptr<scalar_t>(),
-              self.data_ptr<scalar_t>(),
-              start,
-              n_elems,
-              result_stride_0 + result_stride_1,
-              self_stride);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-    }
-  }
-
-  return result;
-}
-
-Tensor& diag_cuda_out(const Tensor& self, int64_t dimension, Tensor& result) {
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
-      kComplexHalf, ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool,
-      self.scalar_type(), "diag_cuda",
-      [&] {
-        apply_diag<scalar_t>(result, self, dimension);
-      });
-  return result;
-}
-
 Tensor trace_cuda(const Tensor& self) {
   TORCH_CHECK(self.dim() == 2, "expected a matrix");
-  int dimension = 0;
-  auto result = at::diag(self, dimension);
-  return result.sum();
+  return self.diagonal().sum();
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/mps/operations/TriangularOps.mm b/aten/src/ATen/native/mps/operations/TriangularOps.mm
index fb6e1c52ba49e..c276707964997 100644
--- a/aten/src/ATen/native/mps/operations/TriangularOps.mm
+++ b/aten/src/ATen/native/mps/operations/TriangularOps.mm
@@ -172,197 +172,5 @@
 
 }
 
-Tensor& diag_mps_out(const Tensor& self,
-                     int64_t diagonal,
-                     Tensor &output) {
-
-  // Do checks, resize output
-  IntArrayRef input_size = self.sizes();
-  auto num_input_dims = input_size.size();
-  // Input can only be 1D or 2D
-  TORCH_CHECK(num_input_dims == 1 || num_input_dims == 2,
-    "diag_mps_out: Input tensor must be 1D or 2D")
-
-  if(num_input_dims == 1) {
-    auto n = input_size[0];
-    if(diagonal > 0)
-      n += diagonal;
-    else if(diagonal < 0)
-      n -= diagonal;
-
-    output.resize_({n, n});
-  }
-  else if(num_input_dims == 2) {
-    auto num_diag_elements = std::min(input_size[0], input_size[1]);
-    if(diagonal > 0) {
-      TORCH_CHECK(input_size[1] - diagonal > 0, "Matrix not big enough for requested diagonal")
-      num_diag_elements = std::min(input_size[0], input_size[1] - diagonal);
-    }
-    else if(diagonal < 0) {
-      TORCH_CHECK(input_size[0] + diagonal > 0, "Matrix not big enough for requested diagonal")
-      num_diag_elements = std::min(input_size[0] + diagonal, input_size[1]);
-    }
-
-    output.resize_({num_diag_elements});
-  }
-
-  using namespace mps;
-  MPSStream* stream = getCurrentMPSStream();
-
-  // Derive from MPSCachedGraph
-  struct CachedGraph : public MPSCachedGraph
-  {
-    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-    MPSGraphTensor *inputTensor_ = nil;
-    MPSGraphTensor *outputTensor_ = nil;
-  };
-
-  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
-  @autoreleasepool {
-
-    MPSShape* input_shape = getMPSShape(self);
-    MPSShape* output_shape = getMPSShape(output);
-    NSNumber* num_input_cols = nil;
-    NSNumber* num_output_cols = nil;
-    NSMutableArray<NSNumber*>* flat_input_shape = nil;
-    NSMutableArray<NSNumber*>* flat_output_shape = nil;
-    if(num_input_dims == 1) {
-      num_output_cols = output_shape[1];
-      flat_output_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:1];
-      flat_output_shape[0] = [NSNumber numberWithInt:[output_shape[0] intValue] * [output_shape[1] intValue]];
-    }
-    else if(num_input_dims == 2) {
-      num_input_cols = input_shape[1];
-      flat_input_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:1];
-      flat_input_shape[0] = [NSNumber numberWithInt:[input_shape[0] intValue] * [input_shape[1] intValue]];
-    }
-    NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
-    string key = "diag_mps_out:" + getMPSTypeString(self.scalar_type()) + ":" + std::to_string(diagonal)
-                                 + ":" + string([ns_shape_key UTF8String]);
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-
-    if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
-        CachedGraph *newCachedGraph = nil;
-
-        @autoreleasepool {
-          MPSGraph* mpsGraph = make_mps_graph();
-          newCachedGraph = new CachedGraph(mpsGraph);
-
-          // TODO: Accept this as the flat version in 2D case
-          MPSGraphTensor* inputTensor = nil;
-          if(num_input_dims == 1)
-           inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()));
-         else
-           inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), flat_input_shape);
-
-          MPSGraphTensor* outputTensor = nil;
-
-          MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0
-                                                           dataType:MPSDataTypeInt32];
-          MPSGraphTensor* numDiagElementsRange = nil;
-          MPSGraphTensor* diagOffset = nil;
-          MPSGraphTensor* rowMultiplier = nil;
-          MPSGraphTensor* rowIndices = nil;
-          MPSGraphTensor* colIndices = nil;
-          MPSGraphTensor* indicesTensor = nil;
-
-          if(num_input_dims == 1) {
-            int shape_data[1] = {[input_shape[0] intValue]};
-            MPSGraphTensor* inputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:sizeof(int)]
-                                                                    shape:@[@1]
-                                                                 dataType:MPSDataTypeInt32];
-            numDiagElementsRange = [mpsGraph coordinateAlongAxisTensor: zeroTensor
-                                                       withShapeTensor: inputShapeTensor
-                                                                  name: nil];
-            diagOffset = [mpsGraph constantWithScalar:diagonal
-                                             dataType:MPSDataTypeInt32];
-            rowMultiplier = [mpsGraph constantWithScalar:[num_output_cols intValue]
-                                                dataType:MPSDataTypeInt32];
-          }
-          else {
-            int shape_data[1] = {[output_shape[0] intValue]};
-            MPSGraphTensor* outputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data length:sizeof(int)]
-                                                                     shape:@[@1]
-                                                                  dataType:MPSDataTypeInt32];
-            numDiagElementsRange = [mpsGraph coordinateAlongAxisTensor: zeroTensor
-                                                       withShapeTensor: outputShapeTensor
-                                                                  name: nil];
-            diagOffset = [mpsGraph constantWithScalar:diagonal
-                                             dataType:MPSDataTypeInt32];
-            rowMultiplier = [mpsGraph constantWithScalar:[num_input_cols intValue]
-                                                dataType:MPSDataTypeInt32];
-          }
-
-          if(diagonal >= 0) {
-            rowIndices = numDiagElementsRange;
-            colIndices = [mpsGraph additionWithPrimaryTensor:numDiagElementsRange
-                                             secondaryTensor:diagOffset
-                                                        name:nil];
-          }
-          else {
-            rowIndices = [mpsGraph subtractionWithPrimaryTensor:numDiagElementsRange
-                                                secondaryTensor:diagOffset
-                                                           name:nil];;
-            colIndices = numDiagElementsRange;
-          }
-
-          indicesTensor = [mpsGraph multiplicationWithPrimaryTensor:rowIndices
-                                                    secondaryTensor:rowMultiplier
-                                                               name:nil];
-          indicesTensor = [mpsGraph additionWithPrimaryTensor:indicesTensor
-                                              secondaryTensor:colIndices
-                                                         name:nil];
-
-          if(num_input_dims == 1) {
-            // TODO: Scatter mode doesn't matter, so what should I set it to be?
-            outputTensor = [mpsGraph scatterWithUpdatesTensor:inputTensor
-                                                indicesTensor:indicesTensor
-                                                        shape:flat_output_shape
-                                                         axis:0
-                                                         mode:MPSGraphScatterModeAdd
-                                                         name:nil];
-            outputTensor = [mpsGraph reshapeTensor:outputTensor
-                                         withShape:output_shape
-                                              name:nil];
-          }
-          else if(num_input_dims == 2) {
-            outputTensor = [mpsGraph gatherWithUpdatesTensor:inputTensor
-                                               indicesTensor:indicesTensor
-                                                        axis:0
-                                             batchDimensions:0
-                                                        name:nil];
-          }
-
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->outputTensor_ = outputTensor;
-        }
-        return newCachedGraph;
-      });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
-    }
-
-    Placeholder selfPlaceholder = Placeholder();
-    if(num_input_dims == 1)
-      selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    else
-      selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, flat_input_shape);
-
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
-
-    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
-    };
-    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-    };
-
-    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-  }
-
-  return output;
-}
-
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index b827999cf54e9..faab6371c8af1 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1815,7 +1815,7 @@
 - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor
   variants: function, method
   dispatch:
-    CompositeExplicitAutograd: diag_embed
+    CompositeExplicitAutogradNonFunctional: diag_embed
   autogen: diag_embed.out
 
 - func: diagflat(Tensor self, int offset=0) -> Tensor
@@ -7698,21 +7698,10 @@
 
 - func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
-    CPU: diag_cpu_out
-    CUDA: diag_cuda_out
-    MPS: diag_mps_out
+    CPU, CUDA: diag_out
 
 - func: diag(Tensor self, int diagonal=0) -> Tensor
   variants: method, function
-  dispatch:
-    CompositeExplicitAutograd: diag
-
-- func: diag_backward(Tensor grad, SymInt[] input_sizes, int diagonal) -> Tensor
-  variants: function
-  device_check: NoCheck
-  device_guard: False
-  dispatch:
-    CompositeImplicitAutograd: diag_backward_symint
 
 - func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
 
diff --git a/aten/src/ATen/native/ts_native_functions.yaml b/aten/src/ATen/native/ts_native_functions.yaml
index fc287045dc9dd..3cb290f1004bf 100644
--- a/aten/src/ATen/native/ts_native_functions.yaml
+++ b/aten/src/ATen/native/ts_native_functions.yaml
@@ -189,6 +189,7 @@ supported:
   # after functionalization,
   # but their implementations call view operators (which we need to functionalize away).
   - block_diag
+  - diag_embed
   - diagonal_backward
   - slice_backward
   - new_empty_strided
diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
index 5f13834ee77e0..30e398dbf1e0d 100644
--- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py
+++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
@@ -95,6 +95,7 @@
     ("aten::_linalg_inv_out_helper", datetime.date(2022, 10, 1)),
     ("aten::col2im_backward", datetime.date(2022, 12, 1)),
     ("aten::im2col_backward", datetime.date(2022, 12, 1)),
+    ("aten::diag_backward", datetime.date(2022, 12, 1)),
     ("aten::solve", datetime.date(9999, 1, 1)),
     ("aten::solve.solution", datetime.date(9999, 1, 1)),
     ("aten::_solve_helper", datetime.date(9999, 1, 1)),
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 2ee0bc8537604..be457cfe25fcd 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3293,6 +3293,8 @@ def test():
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps('TestVmapOperatorsOpInfo', 'test_vmap_exhaustive', vmap_fail.union({
         xfail('native_batch_norm'),
+        # The error inputs are vectors, that pass when batched as they are treated as a matrix
+        xfail('trace'),
     }))
     def test_vmap_exhaustive(self, device, dtype, op):
         # needs to be fixed
@@ -3349,6 +3351,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('resize_'),
         xfail('view_as_complex'),
         xfail('matrix_exp'),
+        xfail('trace'),  # Does not support batched tensors
         xfail('bucketize'),
         xfail('fft.ihfft2'),
         xfail('fft.ihfftn'),
diff --git a/test/lazy/test_ts_opinfo.py b/test/lazy/test_ts_opinfo.py
index f5974ec9f6c2c..2e67035581477 100644
--- a/test/lazy/test_ts_opinfo.py
+++ b/test/lazy/test_ts_opinfo.py
@@ -59,6 +59,7 @@ def init_lists():
     # but run functionalized versions of the composite kernels in core.
     # This means that we don't expect the ops to show directly in the LTC metrics.
     FUNCTIONAL_DECOMPOSE_LIST = set([
+        'diag_embed',
         'block_diag',
         'new_empty_strided',
         'narrow_copy',
diff --git a/test/test_autograd.py b/test/test_autograd.py
index bcb42449c349f..03cc78dc242fb 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -3510,19 +3510,6 @@ def test_out_variant_raises_when_inputs_require_grad(self):
         # we should throw an exception if the output requires grad
         self.assertRaisesRegex(RuntimeError, 'out=', lambda: torch.mul(a, b, out=x))
 
-    # TODO: see if this test can be OpInfo'd or moved to diagonal's test suite
-    def test_diagonal_derivative_requires_grad(self):
-        # test that the backward requires grad
-        # we do this is because diagonal_backward uses inplace
-        # operations and gradgradcheck does not catch whether
-        # they works as expected (it will succeed even if
-        # the gradient has requires_grad == False
-        a = torch.randn(5, 6, requires_grad=True)
-        b = torch.diagonal(a)**2
-        c = b.sum()
-        d, = torch.autograd.grad(c, a, retain_graph=True, create_graph=True)
-        self.assertTrue(d.requires_grad)
-
     def test_anomaly_detect_nan(self):
         size = 10
 
diff --git a/test/test_decomp.py b/test/test_decomp.py
index dbc754147858f..27ad870a2adb1 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -292,6 +292,8 @@ def normalize_op_input_output(f, sample, requires_grad=True):
     # See https://github.com/pytorch/pytorch/issues/81669
     (None, None, "nn.functional.relu6"),
     (None, None, "meshgrid"),
+    # diag was not decomposed (it just registers a decomp for diag_out, torch.diag is CompImplicit)
+    (None, None, "diag"),
 }
 
 CROSS_REF_BACKWARD_EXCLUDE_SET = {
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 7ddbe8dd6cf70..853faeb1b2033 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -526,10 +526,6 @@
   self: grad.diagonal(offset, dim1, dim2)
   result: auto_linear
 
-- name: diag(Tensor self, int diagonal=0) -> Tensor
-  self: diag_backward_symint(grad, self.sym_sizes(), diagonal)
-  result: auto_linear
-
 - name: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a)
   self: diagonal_backward_symint(grad, self.sym_sizes(), offset, dim1, dim2)
   result: auto_linear
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 7be63af9e051a..22ceaaf0a18b0 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1019,22 +1019,6 @@ def is_fast_path(src, scale, output, padding_idx):
     return output, offset2bag, bag_size, max_indices
 
 
-@register_meta([aten.diag.default, aten.diag.out])
-@out_wrapper()
-def meta_diag(self, dim=0):
-    check(self.dim() in (1, 2), lambda: "matrix or a vector expected")
-    if self.dim() == 1:
-        sz = self.size(0) + abs(dim)
-        return self.new_empty((sz, sz))
-
-    # case: dim is 2
-    if dim >= 0:
-        sz = min(self.size(0), self.size(1) - dim)
-    else:
-        sz = min(self.size(0) + dim, self.size(1))
-    return self.new_empty((sz,))
-
-
 @register_meta(aten._embedding_bag_forward_only.default)
 def meta_embedding_bag_forward_only(weight, indices, offsets, *args):
     output, offset2bag, bag_size, max_indices = meta_embedding_bag(
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 6169e5af06d9a..3e2f6c45768a6 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -219,6 +219,7 @@
     "constant_pad_nd",
     "contiguous",
     "diag_embed",
+    "diag",
     "diagonal",
     "diagonal_copy",
     "dsplit",
@@ -3483,6 +3484,22 @@ def vsplit(
     return tensor_split(a, split_sizes, 0)
 
 
+@register_decomposition(torch.ops.aten.diag.out)
+@out_wrapper()
+def diag(
+    self: TensorLikeType,
+    offset: int = 0,
+) -> TensorLikeType:
+    ndim = self.dim()
+    utils.check(
+        ndim in (1, 2), lambda: f"diag(): Supports 1D or 2D tensors. Got {ndim}D"
+    )
+    if ndim == 1:
+        return torch.diag_embed(self, offset)
+    else:
+        return torch.diagonal_copy(self, offset)
+
+
 @register_decomposition(torch.ops.aten.diagonal, disable_meta=True)
 def diagonal(
     self: TensorLikeType,
diff --git a/torch/csrc/jit/runtime/static/generated_ops.cpp b/torch/csrc/jit/runtime/static/generated_ops.cpp
index 69cc98bf14ec6..bd9c8d553ab70 100644
--- a/torch/csrc/jit/runtime/static/generated_ops.cpp
+++ b/torch/csrc/jit/runtime/static/generated_ops.cpp
@@ -2431,25 +2431,6 @@ REGISTER_OPERATOR_FUNCTOR(aten::addbmm, aten_addbmm, [](Node* n) -> SROperator {
   return nullptr;
 });
 
-REGISTER_OPERATOR_FUNCTOR(aten::diag, aten_diag, [](Node* n) -> SROperator {
-  if (n->matches(
-          torch::schema("aten::diag(Tensor self, int diagonal=0) -> Tensor"))) {
-    return [](ProcessedNode* p_node) {
-      const auto& self = p_node->Input(0).toTensor();
-      const auto diagonal = p_node->Input(1).toInt();
-      if (p_node->Output(0).isNone()) {
-        p_node->Output(0) = at::native::diag(self, diagonal);
-        return;
-      }
-      auto& out = p_node->Output(0).toTensor();
-      fastResizeToZero(out);
-      at::native::diag_cpu_out(self, diagonal, out);
-    };
-  }
-  LogAndDumpSchema(n);
-  return nullptr;
-});
-
 REGISTER_OPERATOR_FUNCTOR(aten::cross, aten_cross, [](Node* n) -> SROperator {
   if (n->matches(torch::schema(
           "aten::cross(Tensor self, Tensor other, int? dim=None) -> Tensor"))) {
diff --git a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp
index c718fd517b7b0..1bdc0aca8d9af 100644
--- a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp
@@ -522,6 +522,15 @@ at::Tensor& LazyNativeFunctions::logsumexp_out(
   return out;
 }
 
+at::Tensor LazyNativeFunctions::diag_embed(
+    const at::Tensor& self,
+    int64_t offset,
+    int64_t dim1,
+    int64_t dim2) {
+  return at::functionalization::functionalize_aten_op<ATEN_OP(
+      diag_embed)>::call(self, offset, dim1, dim2);
+}
+
 at::Tensor LazyNativeFunctions::diagonal_backward_symint(
     const at::Tensor& grad_output,
     at::SymIntArrayRef input_sizes,
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 95b1df24a9512..00f454bdf454a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -2170,10 +2170,10 @@ def error_inputs_ormqr(op_info, device, **kwargs):
 def error_inputs_diag(op_info, device, **kwargs):
     zero_d = torch.randn((), device=device)
     yield ErrorInput(SampleInput(zero_d, args=(0,)), error_type=RuntimeError,
-                     error_regex="matrix or a vector expected")
+                     error_regex="1D or 2D")
     zero_d = torch.randn(1, 1, 1, device=device)
     yield ErrorInput(SampleInput(zero_d, args=(0,)), error_type=RuntimeError,
-                     error_regex="matrix or a vector expected")
+                     error_regex="1D or 2D")
 
 def error_inputs_embedding(op_info, device, **kwargs):
     indices = torch.rand(2, 2, device=device).long()
@@ -9157,10 +9157,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),),
            ),
     OpInfo('diag',
-           dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
+           ref=np.diag,
+           dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
            dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
+           check_batched_forward_grad=False,
            sample_inputs_func=sample_inputs_diag,
            error_inputs_func=error_inputs_diag),
     OpInfo('diag_embed',
@@ -16270,7 +16272,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "diagflat",
         ref=lambda input, offset=0: np.diagflat(input, k=offset),
         sample_inputs_func=sample_inputs_diagflat,
-        dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
+        dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16),
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
         supports_out=False,
         supports_forward_ad=True,
@@ -17696,6 +17698,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         torch_opinfo_name="dsplit",
         supports_nvfuser=False,
     ),
+    PythonRefInfo(
+        "_refs.diag",
+        torch_opinfo_name="diag",
+        supports_nvfuser=False,
+    ),
     PythonRefInfo(
         "_refs.diagonal",
         torch_opinfo_name="diagonal",
@@ -18023,12 +18030,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     PythonRefInfo(
         "_refs.trace",
         torch_opinfo_name="trace",
-        decorators=(
-            # TODO: torch.diag is currently not supported by either refs, meta funcs, or NVFuser
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref'),
-            DecorateInfo(unittest.skip("diag is not supported by meta"), 'TestCommon', 'test_python_ref_meta'),
-            DecorateInfo(unittest.skip("diag is not supported by nvfuser"), 'TestCommon', 'test_python_ref_executor'),
-        ),
+        supports_nvfuser=False,
     ),
     PythonRefInfo(
         "_refs.norm",

From f6c911fec58f9e423d8a3007f9bc11baf302d31b Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 24 Oct 2022 10:43:23 +0000
Subject: [PATCH 0061/1922] [xla hash update] update the pinned xla hash
 (#87590)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87590
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/xla.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index e7375040708bd..3ab9c4394d70b 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-eff277e81fcfdeccba71e75ff40b6e2f3e29e27b
+0cb29daa04097c868d23ed666563a3439d67065c

From bd1adb47a71eb844167863c7ff9694e9c2fdffd0 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 24 Oct 2022 03:39:38 +0000
Subject: [PATCH 0062/1922] [FSDP] Use `reduce_scatter_tensor()` (#87240)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let us silence some more warnings 👍🏼
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87240
Approved by: https://github.com/rohan-varma
---
 test/distributed/fsdp/test_fsdp_comm.py       |  2 +-
 .../fsdp/test_fsdp_mixed_precision.py         | 26 +++++++++----------
 .../algorithms/_comm_hooks/default_hooks.py   |  2 +-
 torch/testing/_internal/common_fsdp.py        |  4 +--
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_comm.py b/test/distributed/fsdp/test_fsdp_comm.py
index c9946a9dd5665..d19617e31acd3 100644
--- a/test/distributed/fsdp/test_fsdp_comm.py
+++ b/test/distributed/fsdp/test_fsdp_comm.py
@@ -221,7 +221,7 @@ def test_communication(
         # outside `no_sync()`
         num_iters = 3
         with patch("torch.distributed.all_gather_into_tensor") as mock_all_gather, \
-                patch("torch.distributed._reduce_scatter_base") as mock_reduce_scatter:
+                patch("torch.distributed.reduce_scatter_tensor") as mock_reduce_scatter:
             def reset_mocks():
                 mock_all_gather.reset_mock()
                 mock_reduce_scatter.reset_mock()
diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py
index c803164bff4e5..4440e394179ab 100644
--- a/test/distributed/fsdp/test_fsdp_mixed_precision.py
+++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py
@@ -125,17 +125,17 @@
 @contextlib.contextmanager
 def patch_reduce_scatter(new_reduce_scatter, full_precision_param_dtype):
     """
-    Patches dist._reduce_scatter_base with a new reduce_scatter_base and
-    restores upon exiting. Used for validation of mixed precision
+    Patches ``dist.reduce_scatter_tensor`` with ``new_reduce_scatter`` and
+    restores upon exiting. Used for validation of mixed precision.
     """
-    orig_reduce_scatter = dist._reduce_scatter_base
-    dist._reduce_scatter_base = new_reduce_scatter
+    orig_reduce_scatter = dist.reduce_scatter_tensor
+    dist.reduce_scatter_tensor = new_reduce_scatter
     global _CURRENT_FULL_PRECISION_PARAM_DTYPE
     _CURRENT_FULL_PRECISION_PARAM_DTYPE = full_precision_param_dtype
     try:
         yield
     finally:
-        dist._reduce_scatter_base = orig_reduce_scatter
+        dist.reduce_scatter_tensor = orig_reduce_scatter
         _CURRENT_FULL_PRECISION_PARAM_DTYPE = None
 
 class LinearMixedPrecision(nn.Module):
@@ -250,7 +250,7 @@ def _validate_mp_shard_freed(self, fsdp_model):
             for param in fsdp.params:
                 self.assertEqual(0, param._mp_shard.storage().size())
 
-    def _reduce_scatter_base_validate_mp(
+    def _reduce_scatter_validate_mp(
         self,
         orig_reduce_scatter,
         mp_config,
@@ -258,9 +258,9 @@ def _reduce_scatter_base_validate_mp(
         **kwargs
     ):
         """
-        Performs dist._reduce_scatter_base but verifies mixed precision settings
-        before. This is to test mixed precision is working as expected during
-        backward pass. In particular it ensures that the gradients were cast to the right type
+        Runs reduce-scatter but verifies mixed precision settings before. This
+        is to test mixed precision is working as expected during backward pass.
+        In particular it ensures that the gradients were cast to the right type
         and comm. is going to happen in the right type.
         """
         tensors = []
@@ -355,9 +355,9 @@ def _run_test_mixed_precision_e2e(
                 model.cuda()
 
             # Patch reduce_scatter to add validation for mixed precision types.
-            orig_reduce_scatter = dist._reduce_scatter_base
+            orig_reduce_scatter = dist.reduce_scatter_tensor
             test_reduce_scatter = partial(
-                self._reduce_scatter_base_validate_mp, orig_reduce_scatter, mp_config,
+                self._reduce_scatter_validate_mp, orig_reduce_scatter, mp_config,
             )
             with patch_reduce_scatter(test_reduce_scatter, full_precision_param_dtype):
                 scaler = ShardedGradScaler(enabled=enable_sharded_grad_scaler)
@@ -516,9 +516,9 @@ def _test_mixed_precision_embedding_table(self, mp_config):
         # Basic test to ensure int inputs are not casted which would break
         # modules such as embedding tables.
         param_dtype = mp_config.param_dtype or torch.float32
-        orig_reduce_scatter = dist._reduce_scatter_base
+        orig_reduce_scatter = dist.reduce_scatter_tensor
         test_reduce_scatter = partial(
-            self._reduce_scatter_base_validate_mp, orig_reduce_scatter, mp_config,
+            self._reduce_scatter_validate_mp, orig_reduce_scatter, mp_config,
         )
         with patch_reduce_scatter(test_reduce_scatter, param_dtype):
             # TODO: `test_mp_embedding_reduce()` fails if we do not wrap the
diff --git a/torch/distributed/algorithms/_comm_hooks/default_hooks.py b/torch/distributed/algorithms/_comm_hooks/default_hooks.py
index 10dcce3197c72..7d2c845f4e63b 100644
--- a/torch/distributed/algorithms/_comm_hooks/default_hooks.py
+++ b/torch/distributed/algorithms/_comm_hooks/default_hooks.py
@@ -108,7 +108,7 @@ def reduce_scatter_hook(state: DefaultState, grad: torch.Tensor, output: torch.T
     # Average grad by pre-division factor.
     if state.gradient_predivide_factor > 1:
         grad.div_(state.gradient_predivide_factor)
-    dist._reduce_scatter_base(
+    dist.reduce_scatter_tensor(
         output, grad, group=state.process_group
     )
     # Average grad's shard by post-division factor.
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index ee5d580c29d27..f97cacb2a9a41 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -479,7 +479,7 @@ def get_loss(self, input, output):
         return loss
 
     def run_backward(self, loss):
-        orig_reduce_scatter = torch.distributed._reduce_scatter_base
+        orig_reduce_scatter = torch.distributed.reduce_scatter_tensor
 
         def _delayed_reduce_scatter(*args, **kwargs):
             if self.delay_before_reduction_ms > 0:
@@ -489,7 +489,7 @@ def _delayed_reduce_scatter(*args, **kwargs):
             return orig_reduce_scatter(*args, **kwargs)
 
         with mock.patch(
-            "torch.distributed._reduce_scatter_base", _delayed_reduce_scatter
+            "torch.distributed.reduce_scatter_tensor", _delayed_reduce_scatter
         ):
             self.module.run_backward(loss)
 

From 9eaf5878ab07c0d9a2b92a0b54071b27047929ca Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 24 Oct 2022 03:36:52 +0000
Subject: [PATCH 0063/1922] [FSDP] Rename streams (#86833)

This time around, I decided to rename the "all_gather" stream to the "unshard" stream to emphasize that it includes both the actual all-gather op but also the corresponding memory allocations (and also now the unflattening as well). (A similar reasoning applies for the "pre-all-gather" stream becoming the "pre-unshard" stream.)

This PR is definitely safe.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86833
Approved by: https://github.com/rohan-varma
---
 .../fsdp/fully_sharded_data_parallel.py       | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index a51df5195f0fc..823cba20d42c4 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1567,13 +1567,13 @@ def _unshard(
             if event:
                 event.synchronize()
         any_ran_pre_unshard = False
-        with torch.cuda.stream(self._streams["pre_all_gather"]):
+        with torch.cuda.stream(self._streams["pre_unshard"]):
             for handle in handles:
                 ran_pre_unshard = handle.pre_unshard()
                 any_ran_pre_unshard = any_ran_pre_unshard or ran_pre_unshard
         if any_ran_pre_unshard:
-            self._streams["all_gather"].wait_stream(self._streams["pre_all_gather"])
-        with torch.cuda.stream(self._streams["all_gather"]):
+            self._streams["unshard"].wait_stream(self._streams["pre_unshard"])
+        with torch.cuda.stream(self._streams["unshard"]):
             for handle in handles:
                 handle.unshard()
                 handle.post_unshard()
@@ -2006,12 +2006,15 @@ def _init_streams(self) -> None:
         computation. This should only be called on the root FSDP instance."""
         assert self._is_root
         assert torch.cuda.is_available()
-        # Stream for all-gathering parameters.
-        self._streams["all_gather"] = torch.cuda.Stream()
-        # Stream for overlapping grad reduction with the backward pass.
+        # Stream for unshard logic, including allocating the all-gather
+        # destination tensors and the all-gathers themselves.
+        self._streams["unshard"] = torch.cuda.Stream()
+        # Stream for overlapping gradient reduction with the backward pass
+        # gradient computation.
         self._streams["post_backward"] = torch.cuda.Stream()
-        # Stream for pre-all-gather copies (e.g. H2D or precision cast).
-        self._streams["pre_all_gather"] = torch.cuda.Stream()
+        # Stream for pre-unshard logic, namely allocations and writes for
+        # CPU offloading (H2D copy) and mixed precision (low precision cast).
+        self._streams["pre_unshard"] = torch.cuda.Stream()
 
     def _wait_for_previous_optim_step(self) -> None:
         """
@@ -2022,11 +2025,11 @@ def _wait_for_previous_optim_step(self) -> None:
         if not self._is_root:
             return
         current_stream = torch.cuda.current_stream()
-        self._streams["all_gather"].wait_stream(current_stream)
+        self._streams["unshard"].wait_stream(current_stream)
         # Having the pre-all-gather stream wait for the current stream even if
         # we do not leverage the pre-all-gather stream is tolerable since this
         # only runs once per iteration
-        self._streams["pre_all_gather"].wait_stream(current_stream)
+        self._streams["pre_unshard"].wait_stream(current_stream)
 
     def _prefetch_handles(
         self,
@@ -2893,7 +2896,7 @@ def _pre_forward_unshard(
             self._unshard(handles)
             handles_key = tuple(handles)
             self._needs_pre_forward_unshard[handles_key] = False
-            torch.cuda.current_stream().wait_stream(self._streams["all_gather"])
+            torch.cuda.current_stream().wait_stream(self._streams["unshard"])
             self._prefetch_handles(handles_key)
 
     def _post_forward(
@@ -3137,7 +3140,7 @@ def _summon_full_params(
         self._clear_grads_if_needed()
         free_unsharded_flat_params = [handle.needs_unshard() for handle in self._handles]
         self._unshard(self._handles)
-        torch.cuda.current_stream().wait_stream(self._streams["all_gather"])
+        torch.cuda.current_stream().wait_stream(self._streams["unshard"])
         if with_grads:
             self._unshard_grads(self._handles)
 
@@ -3444,7 +3447,7 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None:
                 # If the handles have been prefetched, this `_unshard()` simply
                 # switches to using the unsharded parameter
                 self._unshard(_handles)
-                torch.cuda.current_stream().wait_stream(self._streams["all_gather"])
+                torch.cuda.current_stream().wait_stream(self._streams["unshard"])
 
                 # Set this to `False` to ensure that a mistargeted prefetch
                 # does not actually unshard these handles

From 7644e19f0c7ae9ba6290a6580f28e6b65921af12 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 24 Oct 2022 03:31:34 +0000
Subject: [PATCH 0064/1922] [FSDP][Docs] Clarify warnings to mention
 collectives (#87478)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87478
Approved by: https://github.com/rohan-varma
---
 .../fsdp/fully_sharded_data_parallel.py       | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 823cba20d42c4..3c8fa6817571b 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -2498,8 +2498,8 @@ def state_dict(self, *args, **kwargs):
             >>> local_dict.keys()
             >>> odict_keys(['flat_param', 'inner.flat_param'])
 
-        .. warning:: This needs to be called on all ranks, since synchronization
-            primitives may be used.
+        .. warning:: This needs to be called on all ranks since it uses
+            collective communications.
         """
         # TODO (rohan-varma): separate these out once a state_dict pre-hook
         # is available.
@@ -2795,8 +2795,8 @@ def load_state_dict(
             >>> local_dict.keys()
             >>> odict_keys(['flat_param', 'inner.flat_param'])
 
-        .. warning:: This needs to be called on all ranks, since synchronization
-            primitives may be used.
+        .. warning:: This needs to be called on all ranks since it uses
+            collective communications.
         """
         return super().load_state_dict(state_dict, *args)
 
@@ -3944,8 +3944,8 @@ def clip_grad_norm_(
             calling it for FSDP models would lead to different scaling being
             applied per subset of model parameters.
 
-        .. warning:: This needs to be called on all ranks, since synchronization
-            primitives will be used.
+        .. warning:: This needs to be called on all ranks since it uses
+            collective communications.
         """
         self._lazy_init()
         self._wait_for_previous_optim_step()
@@ -4020,10 +4020,10 @@ def full_optim_state_dict(
         and ``"param_groups"``. The flattened parameters in ``FSDP`` modules
         contained in ``model`` are mapped back to their unflattened parameters.
 
-        .. warning:: This needs to be called on all ranks since synchronization
-            primitives are used. However, if ``rank0_only=True``, then the
-            state dict is only populated on rank 0, and all other ranks return
-            an empty :class:`dict`.
+        .. warning:: This needs to be called on all ranks since it uses
+            collective communications. However, if ``rank0_only=True``, then
+            the state dict is only populated on rank 0, and all other ranks
+            return an empty :class:`dict`.
 
         .. warning:: Unlike ``torch.optim.Optimizer.state_dict()``, this method
             uses full parameter names as keys instead of parameter IDs.

From d84cd1f217e2add112d72c3aad429469c50b2e5e Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 24 Oct 2022 03:31:34 +0000
Subject: [PATCH 0065/1922] [FSDP][1/N] Rework `clip_grad_norm_()` and tests
 (#87479)

This PR reworks FSDP's `clip_grad_norm_()` and its unit tests. The unit tests in `test_fsdp_core.py` still need to be revisited and will be done in follow-up work.

Some details in arbitrary order:
- This renames `_calc_grad_norm()` to `_get_grad_norm()`. This is to simplify our verb usage in method names. Otherwise, we may diverge to different verbs like "compute", "calculate", "get", "find" etc. I am open to discussion here.
- Because we call `torch.linalg.vector_norm()` as the underlying norm calculation subroutine, which can take infinity as input for the norm type, there is no reason to have a separate conditional branch for the infinity norm.
- This removes a host-device synchronization point from `clip_grad_norm_()` by using the same trick from `torch.nn.utils.clip_grad_norm_()`. This may improve throughput for workloads like metaseq, which computes gradient norms regularly.
- This returns the total norm from `clip_grad_norm_()` as mentioned in the docstring. Before nothing was returned.
- This rewrites the unit tests, which were slightly problematic. Much of the logic to verify gradient norms were computed correctly were exactly the same as the logic used to compute them in FSDP (i.e. `^p`, sum via all-reduce, `^(1/p)`). This defeats the purpose of unit testing. There were some other oddities like `input = torch.rand(14, 2, device=self.rank); in_data = torch.tensor(input[self.rank], device=self.rank)`, where we materialize a full `(14, 2)` shape but only ever use the first two rows (assuming world size 2).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87479
Approved by: https://github.com/rohan-varma
---
 .../fsdp/test_fsdp_clip_grad_norm.py          | 192 ++++++++++++------
 .../fsdp/fully_sharded_data_parallel.py       |  87 ++++----
 torch/testing/_internal/common_fsdp.py        |  23 ---
 3 files changed, 181 insertions(+), 121 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
index 9e39254ec423a..3af5a83cdde42 100644
--- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
+++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -1,31 +1,33 @@
 # Owner(s): ["oncall: distributed"]
 
+import functools
+import itertools
 import sys
-from math import inf
+from typing import Union
 
 import torch
+import torch.nn as nn
 from torch import distributed as dist
+from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     FullyShardedDataParallel as FSDP,
-    CPUOffload,
-    _calc_grad_norm,
 )
-from torch.nn import utils as nn_utils
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
+from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
-    DeterministicModel,
+    CUDAInitMode,
+    FSDPInitMode,
     FSDPTest,
-    _collect_total_grad_norm_fsdp,
-    _collect_total_grad_norm_local,
+    TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
-    run_tests,
-    parametrize,
     instantiate_parametrized_tests,
+    run_tests,
 )
 
-
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
@@ -39,67 +41,133 @@
 
 
 class TestClipGradNorm(FSDPTest):
-    def _run_fsdp_one_iteration(self, norm_type, nested_fsdp, cpu_offload):
-        """Test FSDP with clip grad norm."""
-        fsdp_model = DeterministicModel(nested_fsdp, cpu_offload=cpu_offload)
-        local_model = DeterministicModel(False)
-        input = torch.rand(14, 2, device=self.rank)
-        fsdp_model = FSDP(fsdp_model, cpu_offload=cpu_offload)
-        self.assertTrue(len(input) >= self.world_size)
-        out = local_model(input[: self.world_size])
-        out.sum().backward()
-        in_data = torch.tensor(input[self.rank], device=self.rank)
-        out_fsdp = fsdp_model(in_data)
-        out_fsdp.sum().backward()
-        total_norms_fsdp = _collect_total_grad_norm_fsdp(
-            fsdp_model, norm_type, self.rank
-        )
-        total_norms_local = _collect_total_grad_norm_local(local_model, norm_type)
-        total_norms_local /= self.world_size
-        norm_cap = total_norms_fsdp / 2.0
-        self.assertEqual(total_norms_local, total_norms_fsdp)
-        fsdp_model.clip_grad_norm_(norm_cap, norm_type=norm_type)
-        nn_utils.clip_grad_norm_(
-            local_model.parameters(), norm_cap, norm_type=norm_type
+    """Tests :meth:`FullyShardedDataParallel.clip_grad_norm_`."""
+
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    @skip_if_lt_x_gpu(2)
+    def test_non_root(self):
+        """
+        Tests that calling ``clip_grad_norm_()`` on a non-root FSDP instance
+        raises an error.
+        """
+        class Model(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.lin1 = nn.Linear(5, 5)
+                self.lin2 = nn.Linear(5, 5)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.lin2(self.lin1(x))
+
+        model = Model().cuda()
+        model.lin2 = FSDP(model.lin2)
+        fsdp_model = FSDP(model)
+        fsdp_model(torch.randn((2, 5), device=torch.device("cuda"))).sum().backward()
+        error_regex = "should only be called on the root FSDP instance"
+        with self.assertRaisesRegex(RuntimeError, error_regex):
+            fsdp_model.lin2.clip_grad_norm_(max_norm=2)
+
+    @skip_if_lt_x_gpu(2)
+    def test_ddp_parity(self):
+        """
+        Tests FSDP with ``FullyShardedDataParallel.clip_grad_norm_()` against
+        DDP with ``torch.nn.utils.clip_grad_norm_()`.
+        """
+        self.run_subtests(
+            {
+                "max_norm": [1, 2.5],
+                "norm_type": [1, 2, float("inf")],
+                "use_orig_params": [False, True],
+                "offload_params": [False, True],
+            },
+            self._test_ddp_parity,
         )
-        total_norms_after_clip_fsdp = _collect_total_grad_norm_fsdp(
-            fsdp_model, norm_type, self.rank
+
+    def _test_ddp_parity(
+        self,
+        max_norm: Union[float, int],
+        norm_type: Union[float, int],
+        offload_params: bool,
+        use_orig_params: bool,
+    ):
+        local_model = TransformerWithSharedParams.init(
+            self.process_group,
+            FSDPInitMode.NO_FSDP,
+            CUDAInitMode.CUDA_BEFORE,
+            deterministic=True,
         )
-        total_norms_after_clip_local = _collect_total_grad_norm_local(
-            local_model, norm_type
+        ddp_model = DDP(local_model, device_ids=[self.rank])
+        fsdp_kwargs = {
+            "auto_wrap_policy": functools.partial(
+                transformer_auto_wrap_policy,
+                transformer_layer_cls={
+                    TransformerEncoderLayer,
+                    TransformerDecoderLayer,
+                },
+            ),
+            "cpu_offload": CPUOffload(offload_params=offload_params),
+            "use_orig_params": use_orig_params,
+        }
+        fsdp_model = TransformerWithSharedParams.init(
+            self.process_group,
+            FSDPInitMode.RECURSIVE,
+            CUDAInitMode.CUDA_BEFORE,
+            deterministic=True,
+            fsdp_kwargs=fsdp_kwargs,
         )
-        self.assertTrue(total_norms_after_clip_fsdp <= norm_cap)
-        self.assertEqual(total_norms_after_clip_local, total_norms_after_clip_fsdp)
+        LR = 1e-2
+        ddp_optim = torch.optim.Adam(ddp_model.parameters(), lr=LR)
+        fsdp_optim = torch.optim.Adam(fsdp_model.parameters(), lr=LR)
+        device = torch.device("cuda")
+        LARGE_FACTOR = 100
+        inp = ddp_model.module.get_input(device)
+        for model in (ddp_model, fsdp_model):
+            out = model(*inp)
+            loss = model.module.get_loss(inp, out)
+            loss.backward()
 
-    @skip_if_lt_x_gpu(2)
-    @parametrize("norm_type", [2.0, inf])
-    @parametrize("nested_fsdp", [True, False])
-    @parametrize(
-        "cpu_offload",
-        [CPUOffload(offload_params=True), CPUOffload(offload_params=False)],
-    )
-    def test_fsdp_clip_grad_norm(self, norm_type, nested_fsdp, cpu_offload):
-        """Test FSDP with clip grad norm."""
-        self._run_fsdp_one_iteration(norm_type, nested_fsdp, cpu_offload)
+        # Multiply gradients by a large factor to ensure that gradients will
+        # actually be clipped
+        for param in itertools.chain(ddp_model.parameters(), fsdp_model.parameters()):
+            if param.grad is not None:  # gradients may be `None` for `use_orig_params=True`
+                param.grad *= LARGE_FACTOR
+        orig_ddp_grads = [param.grad.detach().clone() for param in ddp_model.parameters()]
+        orig_fsdp_grads = [
+            param.grad.detach().clone() if param.grad is not None else None
+            for param in fsdp_model.parameters()
+        ]
+
+        ddp_total_norm = torch.nn.utils.clip_grad_norm_(
+            ddp_model.parameters(), max_norm=max_norm, norm_type=norm_type,
+        )
+        fsdp_total_norm = fsdp_model.clip_grad_norm_(max_norm=max_norm, norm_type=norm_type)
+        self.assertEqual(ddp_total_norm, fsdp_total_norm)
 
+        # Check that the gradients were modified by `clip_grad_norm_()`
+        for param, orig_grad in zip(ddp_model.parameters(), orig_ddp_grads):
+            assert not torch.equal(param.grad, orig_grad)
+        for param, orig_grad in zip(fsdp_model.parameters(), orig_fsdp_grads):
+            if param.grad is None:
+                self.assertEqual(param.grad, orig_grad)  # `None`
+            else:
+                assert not torch.equal(param.grad, orig_grad)
 
-class TestCalcuGradNorm(FSDPTest):
-    @skip_if_lt_x_gpu(2)
-    @parametrize("norm_type", [2.0, inf, 1.3, 2.5])
-    @parametrize("nested_fsdp", [True, False])
-    def test_fsdp_calc_grad_norm(self, norm_type, nested_fsdp):
-        """Test grad norm cal API."""
-        model = FSDP(DeterministicModel(nested_fsdp))
-        input = torch.rand(15, 2, device=self.rank)
-        out = model(input)
-        out.sum().backward()
-        total_norm = _calc_grad_norm(model.params_with_grad, norm_type)
-        total_norm_expected = _collect_total_grad_norm_local(model, norm_type)
-        self.assertEqual(total_norm, total_norm_expected)
+        # Run an optimizer step to ensure gradients matched after clipping
+        ddp_optim.step()
+        fsdp_optim.step()
+        with FSDP.summon_full_params(fsdp_model):
+            for (n1, p1), (n2, p2) in zip(
+                ddp_model.module.named_parameters(),
+                fsdp_model.named_parameters(),
+            ):
+                self.assertEqual(n1, n2)
+                self.assertEqual(p1, p2)
 
 
 instantiate_parametrized_tests(TestClipGradNorm)
-instantiate_parametrized_tests(TestCalcuGradNorm)
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 3c8fa6817571b..e80844ea232df 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -3923,11 +3923,11 @@ def params_with_grad(self) -> List[Parameter]:
     @torch.no_grad()
     def clip_grad_norm_(
         self, max_norm: Union[float, int], norm_type: Union[float, int] = 2.0
-    ) -> None:
+    ) -> torch.Tensor:
         """
-        Clip all gradients at this point in time. The norm is computed over all
-        gradients together, as if they were concatenated into a single vector.
-        Gradients are modified in-place.
+        Clips the gradient norm of all parameters. The norm is computed over
+        all parameters' gradients as viewed as a single vector, and the
+        gradients are modified in-place.
 
         Args:
             max_norm (float or int): max norm of the gradients
@@ -3949,13 +3949,18 @@ def clip_grad_norm_(
         """
         self._lazy_init()
         self._wait_for_previous_optim_step()
-        assert self._is_root, "clip_grad_norm should only be called on the root (parent) instance"
+        if not self._is_root:
+            raise RuntimeError(
+                "`clip_grad_norm_()` should only be called on the root FSDP instance"
+            )
         self._assert_state(TrainingState_.IDLE)
 
         max_norm = float(max_norm)
         norm_type = float(norm_type)
-        # Computes the max norm for this shard's gradients and sync's across workers
-        local_norm = _calc_grad_norm(self.params_with_grad, norm_type).cuda()  # type: ignore[arg-type]
+        # Compute the local gradient norm (only including this rank's shard
+        # of the gradients)
+        local_norm = _get_grad_norm(self.parameters(), norm_type).to(self.compute_device)
+        # Reconstruct the total gradient norm depending on the norm type
         if norm_type == math.inf:
             total_norm = local_norm
             dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group)
@@ -3963,16 +3968,21 @@ def clip_grad_norm_(
             total_norm = local_norm ** norm_type
             dist.all_reduce(total_norm, group=self.process_group)
             total_norm = total_norm ** (1.0 / norm_type)
-
         if self.cpu_offload.offload_params:
             total_norm = total_norm.cpu()
 
-        clip_coef = torch.tensor(max_norm, dtype=total_norm.dtype, device=total_norm.device) / (total_norm + 1e-6)
-        if clip_coef < 1:
-            # multiply by clip_coef, aka, (max_norm/total_norm).
-            for p in self.params_with_grad:
-                assert p.grad is not None
-                p.grad.detach().mul_(clip_coef.to(p.grad.device))
+        clip_coef = (
+            torch.tensor(max_norm, dtype=total_norm.dtype, device=total_norm.device)
+            / (total_norm + 1e-6)
+        )
+        # Multiplying by the clamped coefficient is meaningless when it is
+        # equal to 1, but it avoids the host-device sync that would result from
+        # `if clip_coef < 1`
+        clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
+        grads = [param.grad for param in self.parameters() if param.grad is not None]
+        for grad in grads:
+            grad.detach().mul_(clip_coef_clamped.to(grad.device))
+        return total_norm
 
     @staticmethod
     def _warn_optim_input(optim_input):
@@ -4625,30 +4635,35 @@ def _is_param_exec_order_prep_stage(self) -> bool:
         return is_prep_stage
 
 
-def _calc_grad_norm(parameters: List[torch.nn.Parameter], p: float) -> torch.Tensor:
-    r"""Calculate gradient norm of an iterable of parameters.
-    Returns:
-        Total norm of the parameters (viewed as a single vector).
+def _get_grad_norm(
+    params: List[nn.Parameter],
+    norm_type: float,
+) -> torch.Tensor:
     """
-    parameters = [p for p in parameters if p.grad is not None]
-
-    if len(parameters) == 0:
+    Returns the gradient norm of parameters ``param`` s, where the gradients
+    are viewed as a single vector.
+    """
+    params_with_grad = [param for param in params if param.grad is not None]
+    if len(params_with_grad) == 0:
         return torch.tensor(0.0)
-    if p == math.inf:
-        local_norm = torch.tensor(max(par.grad.detach().abs().max() for par in parameters))
-    else:
-        # Compute the norm in full precision no matter what
-        local_norm = torch.linalg.vector_norm(
-            torch.stack(
-                [
-                    torch.linalg.vector_norm(par.grad.detach(), p, dtype=torch.float32)
-                    for par in parameters
-                ]
-            ),
-            p,
-        )
-    local_norm.to(dtype=parameters[0].dtype)
-    return local_norm
+    grads = [param.grad for param in params_with_grad]
+    grad_dtypes = set(grad.dtype for grad in grads)
+    if len(grad_dtypes) != 1:
+        raise ValueError(f"Requires uniform dtype across all gradients but got {grad_dtypes}")
+    # Compute the gradient norm in FP32, where we treat the gradients as a
+    # single vector
+    grad_norm = torch.linalg.vector_norm(
+        torch.stack(
+            [
+                torch.linalg.vector_norm(grad.detach(), norm_type, dtype=torch.float32)
+                for grad in grads
+            ],
+        ),
+        norm_type,
+        dtype=torch.float32,
+    )
+    grad_norm = grad_norm.to(grads[0].dtype)
+    return grad_norm
 
 
 def _get_param_to_unflat_param_names(
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index f97cacb2a9a41..7fdbe573ed217 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -7,7 +7,6 @@
 from contextlib import suppress
 from copy import deepcopy
 from enum import Enum, auto
-from math import inf
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 from unittest import mock
 
@@ -1058,25 +1057,3 @@ def forward(self, x):
         x = self.linear_skip(x)
         x = self.nested_linear(x)
         return x
-
-
-def _collect_total_grad_norm_fsdp(model, norm_type, rank):
-    total_norm = _collect_total_grad_norm_local(model, norm_type)
-    op = torch.distributed.ReduceOp.SUM
-    if norm_type == inf:
-        op = torch.distributed.ReduceOp.MAX
-        norm_type = 1.0
-    return_norm = torch.tensor(total_norm ** norm_type, device=rank)
-    dist.all_reduce(return_norm, op=op)
-    return return_norm ** (1.0 / norm_type)
-
-
-def _collect_total_grad_norm_local(model, norm_type):
-    if norm_type == inf:
-        return max(p.grad.abs().max() for p in model.parameters())
-    else:
-        total_norm = 0.0
-        for p in model.parameters():
-            local_norm = torch.linalg.vector_norm(p.grad, norm_type, dtype=torch.float32)
-            total_norm += local_norm ** norm_type
-        return total_norm ** (1.0 / norm_type)

From 27dc00b83e64dc8dbce72067e800bf3652dbc8bb Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 24 Oct 2022 03:31:34 +0000
Subject: [PATCH 0066/1922] [FSDP][2/N] Remove `params_with_grad` (#87480)

This PR removes the property `params_with_grad` from `FullyShardedDataParallel`. It was introduced when implementing `clip_grad_norm_()` but was not consistently used. Personally, I do not think it makes sense for `FullyShardedDataParallel` to expose this helper because it is not a common paradigm.

This PR is technically BC-breaking. However, I checked that no one internally is using this API.

cc @ezyang @gchanan
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87480
Approved by: https://github.com/rohan-varma
---
 torch/distributed/fsdp/fully_sharded_data_parallel.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index e80844ea232df..f6eead9406a1c 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -52,8 +52,6 @@
     _sync_params_and_buffers,
     _to_kwargs,
 )
-from torch.nn.parameter import Parameter
-
 from ._optim_utils import (
     _broadcast_pos_dim_tensor_states,
     _broadcast_processed_optim_state_dict,
@@ -3913,13 +3911,6 @@ def no_sync(self) -> Generator:
                 )
                 m._sync_gradients = old_flag
 
-    @property
-    def params_with_grad(self) -> List[Parameter]:
-        """
-        Recursively returns a list of all module parameters that have a gradient.
-        """
-        return [p for p in self.parameters() if p.grad is not None]
-
     @torch.no_grad()
     def clip_grad_norm_(
         self, max_norm: Union[float, int], norm_type: Union[float, int] = 2.0

From bd8ab66232207cc48d7065d27c00321e097ca393 Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Fri, 21 Oct 2022 14:09:52 -0700
Subject: [PATCH 0067/1922] [Quant][docs] Add README for BackendConfig (#86523)

Summary: This adds a README for `torch.ao.quantization.backend_config`
that describes both the high level motivation and the specifications
of the BackendConfig API.

Reviewers: jerryzh168, vkuzo

Subscribers: jerryzh168, vkuzo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86523
Approved by: https://github.com/jerryzh168
---
 .../ao/quantization/backend_config/README.md  | 142 +++++++++++++++++-
 .../backend_config/backend_config.py          |  17 ++-
 torch/ao/quantization/fx/README.md            |  25 +--
 3 files changed, 155 insertions(+), 29 deletions(-)

diff --git a/torch/ao/quantization/backend_config/README.md b/torch/ao/quantization/backend_config/README.md
index a170581d5638b..b8d8ceb3e38de 100644
--- a/torch/ao/quantization/backend_config/README.md
+++ b/torch/ao/quantization/backend_config/README.md
@@ -1,10 +1,34 @@
-The patterns are we matching against is float modules types, functional operators and pytorch operators in reverse order:
+## BackendConfig Overview
+
+BackendConfig allows PyTorch quantization to work with different backend or kernel libraries. These backends may have different sets of supported quantized operator patterns, and the same operator patterns may require different handling across different backends. To make quantization work with different backends and allow maximum flexibility, we strived to make all the parts of the quantization flow configurable with BackendConfig. Currently, it is only used by FX graph mode quantization. For more details on how it integrates with the FX graph mode quantization flow, refer to this [README](/torch/ao/quantization/fx/README.md).
+
+BackendConfig configures quantization behavior in terms of operator patterns. For each operator pattern, we need to specify what the supported data types are for the input and output activations, weights, and biases, and also specify the QAT modules, the reference quantized modules etc., which will be used in module swapping during the quantization passes.
+
+Quantized backends can have different support in terms of the following aspects:
+* Quantization scheme (symmetric vs asymmetric, per-channel vs per-tensor)
+* Data type (float32, float16, int8, uint8, bfloat16, etc.) for input/output/weight/bias
+* Quantized (and fused) mapping: Some quantized operators may have different numerics compared to a naive (dequant - float_op - quant) reference implementation. For weighted operators, such as conv and linear, we need to be able to specify custom reference modules and a mapping from the float modules
+* QAT mapping: For weighted operators, we need to swap them with the Quantization Aware Training (QAT) versions that add fake quantization to the weights
+
+As an example, here is what fbgemm looks like:
+|                                           | fbgemm                                                                |
+|-------------------------------------------|-----------------------------------------------------------------------|
+| Quantization Scheme                       | activation: per tensor, weight: per tensor or per channel             |
+| Data Type                                 | activation: quint8 (with qmin/qmax range restrictions), weight: qint8 |
+| Quantized and Fused Operators and Mapping | e.g. torch.nn.Conv2d -> torch.ao.nn.quantized.reference.Conv2d        |
+| QAT Module Mapping                        | e.g. torch.nn.Conv2d -> torch.ao.nn.qat.Conv2d                        |
+
+Instead of hardcoding the fusion mappings, float to reference quantized module mappings, fusion patterns etc., we will derive everything from the BackendConfig throughout the code base. This allows PyTorch Quantization to work with all first-party (fbgemm and qnnpack) and third-party backends (TensorRT, executorch etc.) that may differ from native backends in different aspects. With the recent addition of xnnpack, integrated as part of the qnnpack backend in PyTorch, the BackendConfig is needed to define the new constraints required for xnnpack quantized operators.
+
+## Pattern Specification
+
+The operator patterns used in BackendConfig are float modules, functional operators and pytorch operators specified in reverse order:
 ```
 operator = module_type | functional | torch op | native op | MatchAllNode
 Pattern = (operator, Pattern, Pattern, ...) | operator
 ```
-where the first item for Pattern is the operator we want to match, and the rest are the patterns for the arguments of the operator.
-For example, pattern (nn.ReLU, (operator.add, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))) would match the following graph:
+where the first item for each Pattern is the operator, and the rest are the patterns for the arguments of the operator.
+For example, the pattern (nn.ReLU, (operator.add, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))) would match the following graph:
 ```
 tensor_1            tensor_2
  |                    |
@@ -17,4 +41,114 @@ tensor_1            tensor_2
       nn.ReLU
 ```
 
-we’ll match the last node as the anchor point of the match, and we can retrieve the whole graph by tracing back from the node, e.g. in the example above, we matched nn.ReLU node, then node.args[0] is the operator.add node.
+During prepare and convert, we’ll match the last node, which will be the anchor point of the match, and we can retrieve the whole graph by tracing back from the node. E.g. in the example above, we matched the `nn.ReLU` node, and `node.args[0]` is the `operator.add` node.
+
+## BackendConfig Implementation
+
+The BackendConfig is comprised of a list of BackendPatternConfigs, each of which define the specifications and the requirements for an operator pattern. Here is an example usage:
+
+```
+import torch
+from torch.ao.quantization.backend_config import BackendConfig, BackendPatternConfig, DTypeConfig, ObservationType
+from torch.ao.quantization.fuser_method_mappings import reverse_sequential_wrapper2
+
+weighted_int8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.qint8,
+    bias_type=torch.float)
+
+linear_config = BackendPatternConfig(torch.nn.Linear) \
+    .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+    .add_dtype_config(weighted_int8_dtype_config) \
+    .set_root_module(torch.nn.Linear) \
+    .set_qat_module(torch.ao.nn.qat.Linear) \
+    .set_reference_quantized_module(torch.ao.nn.quantized.reference.Linear)
+
+conv_relu_config = BackendPatternConfig((torch.nn.ReLU, torch.nn.Conv2d)) \
+    .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+    .add_dtype_config(weighted_int8_dtype_config) \
+    .set_fused_module(torch.ao.nn.intrinsic.ConvReLU2d) \
+    .set_fuser_method(reverse_sequential_wrapper2(torch.ao.nn.intrinsic.ConvReLU2d))
+
+backend_config = BackendConfig("my_backend") \
+    .set_backend_pattern_config(linear_config) \
+    .set_backend_pattern_config(conv_relu_config)
+```
+
+### Observer Insertion
+
+Relevant APIs:
+* `set_observation_type`
+
+During the prepare phase, we insert observers (or QuantDeQuantStubs in the future) into the graph for this operator pattern based on the observation type, which specifies whether to use different observers for the inputs and the outputs of the pattern. For more detail, see `torch.ao.quantization.backend_config.ObservationType`.
+
+### Reference Quantized Patterns
+
+Relevant APIs:
+* `set_root_module`
+* `set_reference_quantized_module`
+
+During the convert phase, when we construct the reference quantized model, the root modules (e.g. `torch.nn.Linear` for `nni.LinearReLU` or `nniqat.LinearReLU`) will be swapped to the corresponding reference quantized modules (e.g. `torch.ao.nn.reference.Linear`). This allows custom backends to specify custom reference quantized module implementations to match the numerics of their lowered operators. Since this is a one-to-one mapping, both the root module and the reference quantized module must be specified in the same BackendPatternConfig in order for the conversion to take place.
+
+### Fusion
+
+Relevant APIs:
+* `set_fuser_method`
+* `set_fused_module`
+* `_set_root_node_getter`
+* `_set_extra_inputs_getter`
+
+As an optimization, operator patterns such as (`torch.nn.ReLU`, `torch.nn.Linear`) may be fused into `nni.LinearReLU`. This is performed during the prepare phase according to the function specified in `set_fuser_method`, which replaces the pattern with the fused module. During the convert phase, these fused modules (identified by `set_fused_module`) will then be converted to the reference quantized versions of the modules.
+
+In FX graph mode quantization, we replace the corresponding nodes in the graph using two helper functions set by the user: `root_node_getter`, which returns the root node (typically the weighted module in the pattern like `torch.nn.Linear`) to replace the matched pattern in the graph, and `extra_inputs_getter`, which returns a list of extra input arguments that will be appended to the existing arguments of the fused module (copied over from the root node). See [this snippet](https://gist.github.com/jerryzh168/8bea7180a8ba3c279f2c9b050f2a69a6) for an example usage.
+
+### Data Type Restrictions
+
+Relevant APIs:
+* `add_dtype_config`
+* `set_dtype_configs`
+
+DTypeConfig specifies a set of supported data types for input/output/weight/bias along with the associated constraints, if any. There are two ways of specifying `input_dtype`, `output_dtype`, and `weight_dtype`, as simple `torch.dtype`s or as `DTypeWithConstraints`, e.g.:
+
+```
+import torch
+from torch.ao.quantization.backend import DTypeConfig, DTypeWithConstraints
+
+dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float)
+
+dtype_config_with_constraints = DTypeConfig(
+    input_dtype=DTypeWithConstraints(
+        dtype=torch.quint8,
+        quant_min_lower_bound=0,
+        quant_max_upper_bound=255,
+        scale_min_lower_bound=2 ** -12,
+    ),
+    output_dtype=DTypeWithConstraints(
+        dtype=torch.quint8,
+        quant_min_lower_bound=0,
+        quant_max_upper_bound=255,
+        scale_min_lower_bound=2 ** -12,
+    ),
+    weight_dtype=DTypeWithConstraints(
+        dtype=torch.qint8,
+        quant_min_lower_bound=-128,
+        quant_max_upper_bound=127,
+        scale_min_lower_bound=2 ** -12,
+    ),
+    bias_dtype=torch.float)
+```
+
+During the prepare phase of quantization, we will compare the data types specified in these DTypeConfigs to the ones specified in the matching QConfig for a given operator pattern. If the data types do not match (or the constraints are not satisfied) for all the DTypeConfigs specified for the operator pattern, then we will simply ignore the QConfig and skip quantizing this pattern.
+
+#### Quantization range
+
+The user's QConfig may specify `quant_min` and `quant_max`, which are min and max restrictions on the quantization values. Here we set the lower bound for the `quant_min` and then upper bound for the `quant_max` to represent the limits of the backend. If a QConfig exceeds these limits in either direction, it will be treated as violating this constraint.
+
+#### Scale range
+
+Similarly, the user's QConfig may specify a minimum value for the quantization scale (currently exposed as `eps` but will change in the future to better reflect the semantics). Here we set the lower bound for the `scale_min` to represent the limits of the backend. If a QConfig's min scale value falls below this limit, the QConfig will be treated as violating this constraint. Note that `scale_max_upper_bound` is currently not used, because there is no corresponding mechanism to enforce this on the observer yet.
diff --git a/torch/ao/quantization/backend_config/backend_config.py b/torch/ao/quantization/backend_config/backend_config.py
index e0d7e0b9d7428..2f491b1624048 100644
--- a/torch/ao/quantization/backend_config/backend_config.py
+++ b/torch/ao/quantization/backend_config/backend_config.py
@@ -341,9 +341,17 @@ def __init__(self, pattern: Pattern):
 
     def set_observation_type(self, observation_type: ObservationType) -> BackendPatternConfig:
         """
-        Set how observers should be inserted for this pattern.
-        See :class:`~torch.ao.quantization.backend_config.ObservationType` for details
+        Set how observers should be inserted in the graph for this pattern.
+        There are two observation types:
 
+            `OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT` (default): the output observer instance will be
+            different from the input. This is the most common observation type.
+
+            `OUTPUT_SHARE_OBSERVER_WITH_INPUT`: the output observer instance will be the same as the input.
+            This is useful for operators like `cat`.
+
+        Note: This will be renamed in the near future, since we will soon insert QuantDeQuantStubs with
+        observers (and fake quantizes) attached instead of observers themselves.
         """
         self.observation_type = observation_type
         return self
@@ -395,6 +403,11 @@ def set_fused_module(self, fused_module: Type[torch.nn.Module]) -> BackendPatter
     def set_fuser_method(self, fuser_method: Callable) -> BackendPatternConfig:
         """
         Set the function that specifies how to fuse the pattern for this pattern.
+
+        The first argument of this function should be `is_qat`, and the rest of the arguments
+        should be the items in the tuple pattern, e.g. (`torch.nn.ReLU`, `torch.nn.Linear`)
+        will have a function with three arguments, `is_qat`, `relu`, and `linear`.
+        The return value of this function should be the resulting fused module.
         """
         self.fuser_method = fuser_method
         return self
diff --git a/torch/ao/quantization/fx/README.md b/torch/ao/quantization/fx/README.md
index 0ee5c5ec7e3f5..cba11e9d36413 100644
--- a/torch/ao/quantization/fx/README.md
+++ b/torch/ao/quantization/fx/README.md
@@ -329,7 +329,7 @@ backend_config configurations used in this step:
 BackendConfig(nniqat.LinearReLU)
     .set_root_module(nn.Linear)
     .set_reference_quantized_module_for_root(nnqr.Linear)
-    .set_fused_module(nni.Linear)
+    .set_fused_module(nni.LinearReLU)
 ```
 
 Pattern in this case is the same as before, it defines the pattern for the subgraph we are dealing with
@@ -376,26 +376,5 @@ There are no configurations related to lowering in `backend_config` since it is
 However, for some operator based backends, like the current pytorch native backends including fbgemm and qnnpack. We could interpret `backend_config` in terms of configurations for operators as well. e.g. configuring `input_dtype`=quint8, `weight_dtype`=qint8, `output_dtype`=torch.quint8 for nn.Linear is saying that the quantized linear will take a quint8 activation and qint8 weight as input and outputs a quint8 activation. But there is no guarantee that this interpretation will always work in the future, especially when we add new flavors of quantized operators.
 
 ## Extensibility
-Different backend or kernel libraries may have different support for quantization. They may have different quantized operators, and the quantized operators might work for Tensors with different dtypes, the observers may need to be placed in different places. To make quantization work for different backends, and allow maximum flexibility, we also strived to make all the parts of the flow configurable with backend_config.
 
-backend_config configures quantization behavior in terms of operator patterns. We need to define a operator pattern and specify what are the supported dtypes for input/output/weight/bias for the pattern, and also specify the qat modules, reference modules etc. for the pattern, which will be used in module swapping during the quantization passes.
-
-Quantized Backends can have different support in the following aspects:
-* Quantization Scheme (symmetric vs asymmetric, per-channel vs per-tensor)
-* Data Type (float32, float16, int8, uint8, bfloat16, etc) for input/output/weight/bias
-* Quantized (and Fused) Operators and Mapping The quantized operators supported by the backend. For example: quantized conv2d, quantized linear etc. Some quantized operators may have different numerics compared to a naive (dequant - float_op - quant) implementation For weighted operators (conv and linear) we need to define a reference module and a mapping
-* QAT Module Mapping For modules with weights, e.g. Conv2d and Linear, we need to swap them with qat (quantization aware training) module that adds fake quantization to the weights
-
-As an example, here is what fbgemm looks like:
-+-------------------------------------------+-----------------------------------------------------------------------+
-|                                           | fbgemm                                                                |
-|-------------------------------------------|-----------------------------------------------------------------------|
-| Quantization Scheme                       | activation: per tensor, weight: per tensor or per channel             |
-| Data Type                                 | activation: quint8 (with qmin/qmax range restrictions), weight: qint8 |
-| Quantized and Fused Operators and Mapping | e.g. nn.Conv2d -> torch.ao.nn.quantized.reference.Conv2d              |
-| QAT Module Mapping                        | nn.Conv -> torch.ao.nn.qat.Conv2d                                     |
-+-------------------------------------------+-----------------------------------------------------------------------+
-
-So instead of hardcoding the fusion mappings, float to quantized module mappings, fusion patterns etc. we will derive everything through `backend_config` throughout the code base. This allows PyTorch Quantization to work for all first-party or third-party backends that may differ from native backends in different aspects.
-
-For use cases, we will use TensorRT as an example use case and have a tutorial talking about `backend_config`, pytorch native backends fbgemm and qnnpack will be using this to define their behaviors as well, especially with the recent addition of xnnpack (integrated as a part of qnnpack backend in pytorch), the `backend_config` api is needed to define the new constraints from xnnpack.
+FX graph mode quantization can be extended to work with different backends, which may have different sets of supported quantized operator patterns and different requirements for each pattern. For more detail, please refer to the [BackendConfig README](/torch/ao/quantization/backend_config/README.md).

From ee7f2abdd8ef41ae55d1a6d4cab0b6cd3db08e15 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 24 Oct 2022 11:37:26 +0000
Subject: [PATCH 0068/1922] [FSDP] `summon_full_params()` in computation stream
 (#86836)

This should help with memory usage. In particular, this allows FSDP to use caching allocator blocks from the computation stream for the `summon_full_params()` all-gathers, which should help avoid over-allocating blocks to the unshard stream.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86836
Approved by: https://github.com/rohan-varma
---
 .../fsdp/fully_sharded_data_parallel.py       | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index f6eead9406a1c..e241c26d1e1f1 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1549,6 +1549,8 @@ def _register_param_handle(self, handle: FlatParamHandle) -> None:
     def _unshard(
         self,
         handles: List[FlatParamHandle],
+        unshard_stream: torch.cuda.Stream,
+        pre_unshard_stream: torch.cuda.Stream,
     ) -> None:
         """
         Unshards the handles in ``handles``. If the handles are in
@@ -1565,13 +1567,13 @@ def _unshard(
             if event:
                 event.synchronize()
         any_ran_pre_unshard = False
-        with torch.cuda.stream(self._streams["pre_unshard"]):
+        with torch.cuda.stream(pre_unshard_stream):
             for handle in handles:
                 ran_pre_unshard = handle.pre_unshard()
                 any_ran_pre_unshard = any_ran_pre_unshard or ran_pre_unshard
         if any_ran_pre_unshard:
-            self._streams["unshard"].wait_stream(self._streams["pre_unshard"])
-        with torch.cuda.stream(self._streams["unshard"]):
+            unshard_stream.wait_stream(pre_unshard_stream)
+        with torch.cuda.stream(unshard_stream):
             for handle in handles:
                 handle.unshard()
                 handle.post_unshard()
@@ -2043,7 +2045,7 @@ def _prefetch_handles(
         for handles_key in handles_to_prefetch:
             # Prefetch the next set of handles without synchronizing to allow
             # the sync to happen as late as possible to maximize overlap
-            self._unshard(handles_key)
+            self._unshard(handles_key, self._streams["unshard"], self._streams["pre_unshard"])
             self._handles_prefetched[handles_key] = True
 
     def _get_handles_to_prefetch(
@@ -2891,7 +2893,7 @@ def _pre_forward_unshard(
     ) -> None:
         """Unshards parameters in the pre-forward."""
         if handles:
-            self._unshard(handles)
+            self._unshard(handles, self._streams["unshard"], self._streams["pre_unshard"])
             handles_key = tuple(handles)
             self._needs_pre_forward_unshard[handles_key] = False
             torch.cuda.current_stream().wait_stream(self._streams["unshard"])
@@ -3137,8 +3139,10 @@ def _summon_full_params(
 
         self._clear_grads_if_needed()
         free_unsharded_flat_params = [handle.needs_unshard() for handle in self._handles]
-        self._unshard(self._handles)
-        torch.cuda.current_stream().wait_stream(self._streams["unshard"])
+        # No need to call `wait_stream()` since we unshard in the computation
+        # stream directly
+        computation_stream = torch.cuda.current_stream()
+        self._unshard(self._handles, computation_stream, computation_stream)
         if with_grads:
             self._unshard_grads(self._handles)
 
@@ -3444,7 +3448,7 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None:
 
                 # If the handles have been prefetched, this `_unshard()` simply
                 # switches to using the unsharded parameter
-                self._unshard(_handles)
+                self._unshard(_handles, self._streams["unshard"], self._streams["pre_unshard"])
                 torch.cuda.current_stream().wait_stream(self._streams["unshard"])
 
                 # Set this to `False` to ensure that a mistargeted prefetch

From c2cd0d89c244d1d7bf1baa5520612979ba1ca528 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@meta.com>
Date: Mon, 24 Oct 2022 15:09:40 +0000
Subject: [PATCH 0069/1922] [BE][einsum] add small comment explaining an
 invariant (#87264)

Tiny followup from https://github.com/pytorch/pytorch/pull/87135#discussion_r998488064

and another typo i noticed while doing the autograd lab
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87264
Approved by: https://github.com/soulitzer
---
 aten/src/ATen/native/Linear.cpp | 3 +++
 tools/autograd/derivatives.yaml | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp
index 7192cc6e1138c..b9b3abe3c7cae 100644
--- a/aten/src/ATen/native/Linear.cpp
+++ b/aten/src/ATen/native/Linear.cpp
@@ -545,6 +545,9 @@ Tensor einsum(c10::string_view equation, TensorList operands, at::OptionalIntArr
 
   // Sum out contraction dims
   if (perm_index - out_num_dim > 0) {
+    // if there were ops to contract, we would have already done so
+    // in the previous loop and all the dims to sum are now 1
+    // NB: use view instead of squeeze (or sum) for faster (mps) performance
     if (num_ops > 1) {
       auto sizes = ops[0].sym_sizes().vec();
       for (auto dim = perm_index - 1; dim >= out_num_dim; --dim) {
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 853faeb1b2033..c77f63e8c8e73 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -200,7 +200,7 @@
 #     preferable since it would be less efficient.
 #
 # NB: The parameter names here MUST be consistent with the parameter names
-# in Decalarations.yaml
+# in native_functions.yaml
 - name: abs(Tensor self) -> Tensor
   self: grad * self.sgn()
   result: handle_r_to_c(result.scalar_type(), self_t.conj() * self_p.sgn())

From 03328460d184f071531e5b436fe6d2104d5696e7 Mon Sep 17 00:00:00 2001
From: RangiLyu <lyuchqi@gmail.com>
Date: Mon, 24 Oct 2022 16:03:11 +0000
Subject: [PATCH 0070/1922] sync AveragedModel buffers when use_buffers=False
 (#84054)

Fixes #84053

As described in the issue, the AveragedModel will deep copy the model during initialization, which means that the buffers in the averaged model cannot be updated together with the model.

One solution is to make the buffers equal to the source model every time when calling `update_parameters`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84054
Approved by: https://github.com/samdow
---
 test/test_optim.py       | 7 +++++++
 torch/optim/swa_utils.py | 5 +++++
 2 files changed, 12 insertions(+)

diff --git a/test/test_optim.py b/test/test_optim.py
index 104bdb046d345..a55a74d5d8667 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -2833,6 +2833,7 @@ def test_averaged_model_exponential(self):
         # Test AveragedModel with EMA as avg_fn
         dnn = torch.nn.Sequential(
             torch.nn.Conv2d(1, 5, kernel_size=3),
+            torch.nn.BatchNorm2d(5, momentum=0.3),
             torch.nn.Linear(5, 10)
         )
         alpha = 0.9
@@ -2851,11 +2852,17 @@ def avg_fn(p_avg, p, n_avg):
                 else:
                     updated_averaged_params.append((p_avg * alpha +
                                                    p * (1 - alpha)).clone())
+            for b in dnn.buffers():
+                if b.size() != torch.Size([]):
+                    b.detach_().add_(torch.randn_like(b))
+
             averaged_dnn.update_parameters(dnn)
             averaged_params = updated_averaged_params
 
         for p_avg, p_swa in zip(averaged_params, averaged_dnn.parameters()):
             self.assertEqual(p_avg, p_swa)
+        for b_avg, b_swa in zip(dnn.buffers(), averaged_dnn.module.buffers()):
+            self.assertEqual(b_avg, b_swa)
 
     def test_averaged_model_exponential_buffers(self):
         # Test AveragedModel with EMA as avg_fn and use_buffers as True.
diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py
index 4d2743a278c2e..f7a530f5ad0f1 100644
--- a/torch/optim/swa_utils.py
+++ b/torch/optim/swa_utils.py
@@ -132,6 +132,11 @@ def update_parameters(self, model):
             else:
                 p_swa.detach().copy_(self.avg_fn(p_swa.detach(), p_model_,
                                                  self.n_averaged.to(device)))
+        if not self.use_buffers:
+            # If not apply running averages to the buffers,
+            # keep the buffers in sync with the source model.
+            for b_swa, b_model in zip(self.module.buffers(), model.buffers()):
+                b_swa.detach().copy_(b_model.detach().to(device))
         self.n_averaged += 1
 
 
From fce53be82fc24abf564a1144ee272a1405e1076c Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 24 Oct 2022 11:47:40 -0400
Subject: [PATCH 0071/1922] Fix accuracy minifier (#87606)

Signed-off-by: Edward Z. Yang <ezyangfb.com>

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87606
Approved by: https://github.com/anjali411, https://github.com/anijain2305, https://github.com/albanD, https://github.com/soumith, https://github.com/malfet
---
 torch/_dynamo/debug_utils.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 1134267c5f60d..ea5671a81d02f 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -326,7 +326,7 @@ def nvfuser_fails(fx_g, args, check_str=None):
 
 
 def inductor_accuracy_fails(fx_g, args, check_str=None):
-    from torchinductor.compile_fx import compile_fx_inner
+    from torch._inductor.compile_fx import compile_fx_inner
 
     return backend_aot_accuracy_fails(fx_g, args, compile_fx_inner)
 
@@ -874,10 +874,11 @@ def dynamo_minifier_backend(gm, example_inputs, compiler_name):
 @register_backend
 def dynamo_accuracy_minifier_backend(gm, example_inputs, compiler_name):
     from functorch.compile import minifier
-    from torchdynamo.optimizations.backends import BACKENDS
+
+    from torch._dynamo.optimizations.backends import BACKENDS
 
     if compiler_name == "inductor":
-        from torchinductor.compile_fx import compile_fx
+        from torch._inductor.compile_fx import compile_fx
 
         compiler_fn = compile_fx
     else:

From d7d37e38b39a6b5bfb15270fc938342954889c9a Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Sun, 23 Oct 2022 14:18:48 +0000
Subject: [PATCH 0072/1922] Add distributed dynamo benchmarking utils (#87419)

Util for convenient local benchmarking/debugging of distributed models.  Not to be confused with the 'real' distributed benchmark script we use for torchbench experiments on slurm.  Tries to be simple/hackable and let you use different combinations of DDP/FSDP with models and dynamo backends.

Example usage
`python benchmarks/dynamo/distributed.py --toy_model --dynamo inductor --ddp`

`--dynamo` flag accepts normal dynamo backends (plus 'print' which literally prints graphs to screen)
`--torchbench_model <model_name>` works in place of `--toy_model`
`--fsdp` is WIP

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87419
Approved by: https://github.com/jansel
---
 benchmarks/dynamo/dist_util.py   | 147 +++++++++++++++++++++++++++
 benchmarks/dynamo/distributed.py | 164 +++++++++++++++++++++++++++++++
 2 files changed, 311 insertions(+)
 create mode 100644 benchmarks/dynamo/dist_util.py
 create mode 100644 benchmarks/dynamo/distributed.py

diff --git a/benchmarks/dynamo/dist_util.py b/benchmarks/dynamo/dist_util.py
new file mode 100644
index 0000000000000..9e2f086ca8b70
--- /dev/null
+++ b/benchmarks/dynamo/dist_util.py
@@ -0,0 +1,147 @@
+import argparse
+import functools
+import importlib
+import os
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch._dynamo.testing import reduce_to_scalar_loss
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    apply_activation_checkpointing,
+    checkpoint_wrapper,
+    CheckpointImpl,
+)
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+
+try:
+    from .torchbench import setup_torchbench_cwd
+except ImportError:
+    from torchbench import setup_torchbench_cwd
+
+
+def setup(rank, world_size):
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+
+def cleanup():
+    dist.destroy_process_group()
+
+
+class CustomLinear(torch.nn.Module):
+    def __init__(self, a, b):
+        super(CustomLinear, self).__init__()
+        self.weight = nn.Parameter(torch.randn(a, b))
+
+    def forward(self, x):
+        return torch.mm(x, self.weight)
+
+
+class MyModule(torch.nn.Module):
+    def __init__(self, a, b):
+        super(MyModule, self).__init__()
+        self.net = nn.Sequential(
+            nn.Linear(a, b),
+            nn.ReLU(),
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class ToyModel(nn.Module):
+    def __init__(self):
+        super(ToyModel, self).__init__()
+        self.net = nn.Sequential(
+            *[nn.Linear(10, 10000), nn.ReLU()]
+            + [nn.Linear(10000, 10000), nn.ReLU()]
+            + [MyModule(10000, 10000)]
+            + [MyModule(10000, 1000)]
+            + [MyModule(1000, 1000)]
+            + [MyModule(1000, 1000)]
+            + [MyModule(1000, 1000)]
+            + [MyModule(1000, 1000)]
+            + [MyModule(1000, 1000)]
+            + [MyModule(1000, 1000)]
+            + [MyModule(1000, 1000)]
+            + [nn.Linear(1000, 5)]
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+def model_iter_fn(model, example_inputs, collect_outputs=False):
+    outputs = model(*example_inputs)
+    loss = reduce_to_scalar_loss(outputs)
+    loss.backward()
+    if collect_outputs:
+        return outputs
+
+
+def get_model(args):
+    if args.torchbench_model:
+        old_cwd = setup_torchbench_cwd()
+        module = importlib.import_module(
+            f"torchbenchmark.models.{args.torchbench_model}"
+        )
+        benchmark_cls = getattr(module, "Model", None)
+        bm = benchmark_cls(
+            test="train", device=args.device, jit=False, batch_size=args.batch_size
+        )
+        model, inputs = bm.get_module()
+    elif args.toy_model:
+        model = ToyModel()
+        inputs = (torch.randn(20, 10),)
+    else:
+        raise argparse.ArgumentError(
+            args.torchbench_model, message="Must specify a model"
+        )
+
+    return model, inputs
+
+
+def fsdp_checkpointing_base(model, blocks):
+    """apply activation checkpointing to model
+    returns None as model is updated directly
+    """
+    non_reentrant_wrapper = functools.partial(
+        checkpoint_wrapper,
+        offload_to_cpu=False,
+        checkpoint_impl=CheckpointImpl.NO_REENTRANT,
+    )
+
+    def check_fn(submodule):
+        return isinstance(submodule, blocks)
+
+    apply_activation_checkpointing(
+        model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn
+    )
+
+
+# from transformers.models.t5.modeling_t5 import T5Block
+
+MODEL_FSDP_WRAP = {
+    ToyModel: (MyModule,)
+    # TODO T5: (T5Block,)
+}
+
+
+def apply_fsdp(model, use_checkpointing=False, use_wrap_policy=True):
+    blocks = MODEL_FSDP_WRAP[model.__class__]
+
+    wrap_policy = None
+    if use_wrap_policy:
+        # transformer policy is really a generic policy that wraps modules of specified classes
+        wrap_policy = functools.partial(
+            transformer_auto_wrap_policy, transformer_layer_cls=blocks
+        )
+
+    model = FSDP(model, auto_wrap_policy=wrap_policy)
+    if use_checkpointing:
+        fsdp_checkpointing_base(model, blocks)
+
+    return model
diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py
new file mode 100644
index 0000000000000..b4332556c7bb3
--- /dev/null
+++ b/benchmarks/dynamo/distributed.py
@@ -0,0 +1,164 @@
+import argparse
+from functools import partial
+
+import numpy as np
+import tabulate
+import torch
+
+import torch._dynamo as dynamo
+import torch.multiprocessing as mp
+import torch.utils._pytree as pytree
+from torch._dynamo.testing import reduce_to_scalar_loss
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.profiler import profile, ProfilerActivity, record_function
+
+try:
+    from .common import timed
+    from .dist_util import apply_fsdp, cleanup, get_model, model_iter_fn, setup
+except ImportError:
+    from common import timed
+    from dist_util import apply_fsdp, cleanup, get_model, model_iter_fn, setup
+
+
+def profile_model(args, model, inputs, rank):
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        for i in range(args.repeat):
+            with record_function("Forward"):
+                outputs = model(*inputs)
+                loss = reduce_to_scalar_loss(outputs)
+            with record_function("Backward"):
+                loss.backward()
+    if rank == 0:
+        prof.export_chrome_trace(args.trace_file)
+
+
+def run_model(args, model, inputs, rank, world_size, key, result_q):
+    setup(rank, world_size)
+    if args.device == "cuda":
+        # needed for FSDP
+        torch.cuda.set_device(rank)
+
+    dev_rank = f"{args.device}:{rank}"
+    model = model.to(dev_rank)
+
+    def move_tensor(maybe_tensor):
+        if torch.is_tensor(maybe_tensor):
+            return maybe_tensor.to(dev_rank)
+        return maybe_tensor
+
+    inputs = pytree.tree_map(move_tensor, inputs)
+
+    if args.fsdp:
+        model = apply_fsdp(
+            model,
+            use_checkpointing=args.fsdp_checkpoint,
+            use_wrap_policy=args.fsdp_wrap,
+        )
+    elif args.ddp:
+        model = DDP(model)
+
+    if args.verbose:
+        print(model)
+
+    if args.dynamo:
+        if args.verbose:
+            dynamo.config.verbose = True
+
+        def print_compile(gm, ex):
+            print(
+                f"print_compile:\n{str(gm.graph)}\n-----------------------------------------"
+            )
+            return gm
+
+        dynamo_ctx = dynamo.optimize(
+            print_compile if args.dynamo == "print" else args.dynamo
+        )
+        model = dynamo_ctx(model)
+
+    # warmup
+    _ = timed(model, model_iter_fn, inputs, times=3, return_result=False)
+    times = []
+    t_total = timed(
+        model, model_iter_fn, inputs, times=args.repeat, return_result=False
+    )
+    times.append(t_total / args.repeat)
+
+    if rank == 0:
+        result_q.put(times)
+
+    if args.profile:
+        profile_model(args, model, inputs, rank)
+
+    cleanup()
+
+
+def experiment(fn, key, world_size, results):
+    key = f"{key}_{world_size}"
+    dynamo.reset()
+    ctx = mp.get_context("spawn")
+    result_q = ctx.SimpleQueue()
+    f_args = (world_size, key, result_q)
+    if world_size > 1:
+        mp.spawn(
+            fn,
+            args=f_args,
+            nprocs=world_size,
+            join=True,
+        )
+    else:
+        # rank 0
+        fn(0, *f_args)
+    times = result_q.get()
+
+    results.append((key, np.median(times)))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device", default="cuda")
+    parser.add_argument(
+        "--dynamo",
+        default=None,
+        help="if set to a str, uses dynamo[str] backend. else, eager",
+    )
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("--batch_size", default=None)
+    parser.add_argument("--profile", action="store_true", help="Run the profiler")
+    parser.add_argument("--trace_file", default="profile.json", help="Run the profiler")
+    parser.add_argument("--repeat", default=10, help="Repeats for timing run")
+    parser.add_argument(
+        "--world_size", type=int, default=2, help="Number of ranks/gpus for experiments"
+    )
+    parser.add_argument(
+        "--fsdp_checkpoint",
+        action="store_true",
+        help="whether to use gradient checkpointing via model-specific policy",
+    )
+    parser.add_argument(
+        "--fsdp_wrap",
+        action="store_true",
+        help="whether to apply fsdp to submodules via model-specific policy",
+    )
+
+    dist_arg = parser.add_mutually_exclusive_group()
+    dist_arg.add_argument("--ddp", action="store_true")
+    dist_arg.add_argument("--fsdp", action="store_true")
+
+    model_arg = parser.add_mutually_exclusive_group(required=True)
+    model_arg.add_argument(
+        "--torchbench_model", help="name of torchbench model, e.g. hf_Bert"
+    )
+    model_arg.add_argument(
+        "--toy_model", action="store_true", help="use toy model instead"
+    )
+    args = parser.parse_args()
+
+    model_name = "ToyModel" if args.toy_model else args.torchbench_model
+    model, inputs = get_model(args)
+
+    fn = partial(run_model, args, model, inputs)
+
+    times = []
+    experiment(fn, model_name, args.world_size, times)
+    print("\nExperiment Results:")
+    print(tabulate.tabulate(times, headers=("key", "time")))

From dc8e654390da54ecd377a68a2ff88840d041bddf Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 24 Oct 2022 14:48:33 +0000
Subject: [PATCH 0073/1922] [FSDP] Fix `use_orig_params=True` + AC (#87413)

Without this change, the post-backward hooks do not run when using reentrant activation checkpointing.

**Explanation**
FSDP registers the original parameters as plain `Tensor`s in the forward pass so that their ops are tracked by autograd to ensure proper gradient propagation into the `FlatParameter`s. FSDP registers the post-backward hooks in its pre-forward.

For `use_orig_params=True`, FSDP replaces the plain `Tensor`s with the sharded `nn.Parameter`s in the post-forward when resharding. This differs from `use_orig_params=False`, which keeps the plain `Tensor`s registered as attributes, except their data are freed, meaning that accessing them between forward and backward errors. Before this PR, for `use_orig_params=True`, FSDP simply restores the unsharded original parameter data in the pre-backward to enable correct gradient computation. However, this does not suffice for reentrant activation checkpointing (AC), where the recomputed forward happens after FSDP's pre-backward and the ops in the recomputed forward must be tracked by autograd.

My initial solution was to simply have FSDP restore the original parameters as plain `Tensor`s again in the pre-backward so that they would be tracked by autograd exactly like the normal forward. However, this seems to not suffice in general. The `FlatParameter`'s `AccumulateGrad` object may change after the original pre-forward when performing a recomputed forward.

The new approach in this PR is to follow the `use_orig_params=False` way -- namely, to preserve the plain `Tensor` variables across forward and backward. I achieved this by saving the variables explicitly in the forward and restoring them in the pre-backward. I clear them in the post-backward to avoid the dangling references (though, I do not think this is strictly necessary).

An alternative approach I considered is using forward hooks. However, this does not change the order of operations across FSDP, checkpoint, and the wrapped module, so it does not work. (As long as the order is FSDP(checkpoint(module)), then registered hooks still happen either before or after the checkpoint recomputation -- we cannot insert logic to run inside the checkpoint recomputation.)

**Test Plan**
I augmented the existing reentrant checkpointing unit tests to also test `use_orig_params=True`. I also verified that the pycls model does not error (even with the new approach).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87413
Approved by: https://github.com/rohan-varma
---
 test/distributed/fsdp/test_fsdp_checkpoint.py | 32 ++++++++---
 torch/distributed/fsdp/flat_param.py          | 55 ++++++++++++++++---
 2 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py
index 14456df92f84f..b75fa17f86bf5 100644
--- a/test/distributed/fsdp/test_fsdp_checkpoint.py
+++ b/test/distributed/fsdp/test_fsdp_checkpoint.py
@@ -111,16 +111,23 @@ def _verify_parity(self, losses, outputs, models):
         [CPUOffload(offload_params=True), CPUOffload(offload_params=False)],
     )
     @parametrize("offload_activations", [True, False])
-    def test_checkpoint_fsdp_wrapping(self, cpu_offload, offload_activations):
+    @parametrize("use_orig_params", [False, True])
+    def test_checkpoint_fsdp_wrapping(
+        self,
+        cpu_offload: CPUOffload,
+        offload_activations: bool,
+        use_orig_params: bool,
+    ):
         # Test checkpoint(FSDP(layer1), FSDP(layer2), ....)
         if offload_activations:
             wrapper_to_use = offload_wrapper
         else:
             wrapper_to_use = checkpoint_wrapper
 
+        fsdp_kwargs = {"cpu_offload": cpu_offload, "use_orig_params": use_orig_params}
         ckpt_sequential_wrapped_fsdp = wrapper_to_use(
             TestFSDPCheckpoint.SequentialModule(
-                wrap_fsdp=True, cpu_offload=cpu_offload
+                wrap_fsdp=True, **fsdp_kwargs,
             ),
         )
         # Test FSDP(checkpoint(layer1)), FSDP(checkpoint(layer2)), ....
@@ -128,11 +135,11 @@ def test_checkpoint_fsdp_wrapping(self, cpu_offload, offload_activations):
             checkpoint_layer=True,
             offload_activations=offload_activations,
             wrap_fsdp=True,
-            cpu_offload=cpu_offload,
+            **fsdp_kwargs,
         )
 
         baseline = TestFSDPCheckpoint.SequentialModule(
-            wrap_fsdp=True, cpu_offload=cpu_offload
+            wrap_fsdp=True, **fsdp_kwargs,
         )
 
         # note that reentrant-based checkpointing requires inputs to have grad
@@ -168,12 +175,19 @@ def test_checkpoint_fsdp_wrapping(self, cpu_offload, offload_activations):
         [CPUOffload(offload_params=True), CPUOffload(offload_params=False)],
     )
     @parametrize("offload_activations", [True, False])
-    def test_basic_checkpoint_end_to_end(self, cpu_offload, offload_activations):
+    @parametrize("use_orig_params", [False, True])
+    def test_basic_checkpoint_end_to_end(
+        self,
+        cpu_offload: CPUOffload,
+        offload_activations: bool,
+        use_orig_params: bool,
+    ):
+        fsdp_kwargs = {"cpu_offload": cpu_offload, "use_orig_params": use_orig_params}
         global _save_on_cpu_called
         with patch_save_on_cpu(get_patched_save_on_cpu()):
             seq = TestFSDPCheckpoint.SequentialModule().to(torch.cuda.current_device())
             # Runs FSDP with no checkpointing
-            fsdp_only_seq = FSDP(deepcopy(seq), cpu_offload=cpu_offload)
+            fsdp_only_seq = FSDP(deepcopy(seq), **fsdp_kwargs)
             # Runs checkpoint-wrapped FSDP
             if offload_activations:
                 wrapper_to_use = offload_wrapper
@@ -181,15 +195,15 @@ def test_basic_checkpoint_end_to_end(self, cpu_offload, offload_activations):
                 wrapper_to_use = checkpoint_wrapper
 
             checkpointed_fsdp = wrapper_to_use(
-                FSDP(deepcopy(seq), cpu_offload=cpu_offload),
+                FSDP(deepcopy(seq), **fsdp_kwargs),
             )
             # Runs FSDP-wrapped checkpointed module
             fsdp_wrapped_checkpoint = FSDP(
                 wrapper_to_use(deepcopy(seq)),
-                cpu_offload=cpu_offload,
+                **fsdp_kwargs,
             )
             # Runs FSDP with manual calls to checkpoint.
-            fsdp_call_checkpoint = FSDP(deepcopy(seq), cpu_offload=cpu_offload)
+            fsdp_call_checkpoint = FSDP(deepcopy(seq), **fsdp_kwargs)
             # note that reentrant-based checkpointing requires inputs to have grad
             # flag set.
 
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 1e34510bd0225..266dc80b4ed42 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -212,6 +212,13 @@ class FlatParameter(nn.Parameter):
         _shared_params (Optional[List[nn.Parameter]]): The original shared
             parameter variables if ``use_orig_params=True`` and ``None``
             otherwise.
+        _tensors (Optional[List[Optional[Tensor]]]): This saves the ``Tensor``
+            views created in the forward and tracked by autograd when
+            ``use_orig_params=True`` and is ``None`` otherwise. This is to
+            preserve those ``Tensor`` variables for the backward to ensure that
+            the ``FlatParameter`` 's ``AccumulateGrad`` object does not change
+            in which case the post-backward hook does not run. This is relevant
+            for cases like reentrant activation checkpointing.
         _is_grad_none (Optional[List[bool]]): A mask over the original
             parameters' gradients indicating if it is logically ``None`` or not
             if ``use_orig_params=True`` and ``None`` otherwise. This is needed
@@ -273,10 +280,14 @@ def _init_metadata(
             self._is_grad_none: Optional[List[bool]] = [
                 False for _ in range(len(params))
             ]
+            self._tensors: Optional[List[Optional[Tensor]]] = [
+                None for _ in range(len(self._params))
+            ]
         else:
             self._params = None
             self._shared_params = None
             self._is_grad_none = None
+            self._tensors = None
         self._unpadded_unsharded_size = self.size()
         _set_fsdp_flattened(self)
 
@@ -835,11 +846,15 @@ def _use_unsharded_flat_param(
             unsharded_size
         )  # this `.view()` is not autograd visible
         in_forward = self._training_state == HandleTrainingState.FORWARD
+        in_pre_backward = self._training_state == HandleTrainingState.BACKWARD_PRE
         if self._use_orig_params:
-            # NOTE: When not in the forward, `as_params=True` suffices since we
-            # only need to restore the tensor *values* for backward computation
-            # and do not fresh `Tensor` views.
-            self._use_unsharded_views(as_params=(not in_forward))
+            # We use `Tensor` views in the forward so that they are tracked by
+            # autograd. We use them in the pre-backward as well to support
+            # reentrant activation checkpointing, which needs the views to be
+            # tracked by autograd in the backward pass's recomputed forward.
+            self._use_unsharded_views(
+                as_params=(not in_forward and not in_pre_backward)
+            )
         elif in_forward:
             self._use_unsharded_views(as_params=False)
 
@@ -903,7 +918,9 @@ def unshard_grad(self):
             self._check_sharded(flat_param.grad)
             flat_param._saved_grad_shard = flat_param.grad  # type: ignore[attr-defined]
             sharded_grad = flat_param._saved_grad_shard  # type: ignore[attr-defined]
-        dist.all_gather_into_tensor(padded_unsharded_grad, sharded_grad, self.process_group)
+        dist.all_gather_into_tensor(
+            padded_unsharded_grad, sharded_grad, self.process_group
+        )
         unsharded_size = self.flat_param._unpadded_unsharded_size
         flat_param.grad = padded_unsharded_grad[: unsharded_size.numel()].view(
             unsharded_size
@@ -1198,8 +1215,27 @@ def _use_unsharded_views(self, as_params: bool) -> None:
                 param.data = view
             elif as_params:
                 module.register_parameter(param_name, nn.Parameter(view))
-            else:
-                setattr(module, param_name, view)
+            else:  # `as_params=False`
+                param_var: Tensor = view
+                if self._use_orig_params:
+                    if self._training_state == HandleTrainingState.FORWARD:
+                        assert self.flat_param._tensors is not None
+                        # Save the `Tensor` for the pre-backward
+                        self.flat_param._tensors[i] = view  # save for pre-backward
+                    elif self._training_state == HandleTrainingState.BACKWARD_PRE:
+                        # Use the saved `Tensor` variable from the forward to
+                        # preserve the autograd graph so that the post-backward
+                        # hook fires (e.g. for reentrant AC)
+                        assert self.flat_param._tensors is not None  # mypy
+                        tensor = self.flat_param._tensors[i]
+                        p_assert(
+                            tensor is not None,
+                            "Expects `Tensor` to have been saved in forward",
+                        )
+                        tensor.data = view  # type: ignore[union-attr]
+                        assert tensor is not None  # mypy
+                        param_var = tensor
+                setattr(module, param_name, param_var)
         for i, (
             param_name,
             module,
@@ -1341,6 +1377,11 @@ def _use_sharded_views(self) -> None:
             setattr(module, param_name, param)
             prim_param = getattr(prim_module, prim_param_name)
             param.data = prim_param  # could be both empty and non-empty
+        if self._training_state == HandleTrainingState.BACKWARD_POST:
+            assert self.flat_param._tensors is not None  # mypy
+            # Clear the saved `Tensor`s since they are unneeded now
+            for i in range(len(self.flat_param._tensors)):
+                self.flat_param._tensors[i] = None  # type: ignore[index]
 
     @torch.no_grad()
     def _use_sharded_grad_views(self) -> None:

From dfcbd4d98f243ae51c795130507ff3d39b774776 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Mon, 24 Oct 2022 18:41:38 +0000
Subject: [PATCH 0074/1922] attempted fix for nvrtc with lovelace (#87611)

Fixes #87595 (maybe?)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87611
Approved by: https://github.com/malfet, https://github.com/atalman
---
 aten/src/ATen/native/cuda/jit_utils.cpp            | 2 ++
 torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp
index b292d488708bf..a1266fb1b5044 100644
--- a/aten/src/ATen/native/cuda/jit_utils.cpp
+++ b/aten/src/ATen/native/cuda/jit_utils.cpp
@@ -893,6 +893,8 @@ void codegenOutputQuery(
     max_dev_version = CUDAVersion(7, 5);
   } else if (nvrtc_version == CUDAVersion(11, 0)) { // 11.0 supports 3-8.0
     max_dev_version = CUDAVersion(8, 0);
+  } else if (nvrtc_major == 11 && nvrtc_minor < 8) {
+    max_dev_version = CUDAVersion(8, 6);
   } else {
     // If the driver version is unknown (i.e. newer than this code)
     // assume the driver supports this device
diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
index 85bd74bfdbae4..85de541f4ba78 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
+++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
@@ -64,6 +64,8 @@ void codegenOutputQuery(
     max_dev_version = CudaVersion(7, 5);
   } else if (nvrtc_version == CudaVersion(11, 0)) { // 11.0 supports 3-8.0
     max_dev_version = CudaVersion(8, 0);
+  } else if (nvrtc_version.first == 11 && nvrtc_version.second < 8) {
+    max_dev_version = CudaVersion(8, 6);
   } else {
     // If the driver version is unknown (i.e. newer than this code)
     // assume the driver supports this device

From 8fc7e2cfa059fa78aae0b5c1f48ba36461cd4023 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Mon, 24 Oct 2022 18:48:46 +0000
Subject: [PATCH 0075/1922] [dynamo] Support class members in nn modules
 (#87531)

Fixes https://github.com/pytorch/torchdynamo/issues/1740

@voznesenskym

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87531
Approved by: https://github.com/jansel
---
 test/dynamo/test_repros.py           | 17 +++++++++++++++++
 torch/_dynamo/variables/nn_module.py | 11 ++++++++++-
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index ffc71741d72c2..52802f32ad1e8 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1719,6 +1719,23 @@ def forward(self, getitem_1, getitem_2, add):
         ]
         self.assertTrue(same_two_models(mod, opt_mod, args))
 
+    def test_class_member(self):
+        class Foo(torch.nn.Module):
+            a = 4
+            b = torch.ones(3, 4)
+
+            def __init__(self):
+                super().__init__()
+                self.c = 4
+
+            def forward(self, x):
+                return x.cos() + self.a + self.b + self.c
+
+        mod = Foo()
+        opt_mod = torch._dynamo.optimize("eager", nopython=True)(mod)
+        args = (torch.randn(3, 4),)
+        self.assertTrue(same(mod(*args), opt_mod(*args)))
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 4da389bbd8c47..87a94565e180a 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -14,7 +14,13 @@
 from ..guards import GuardBuilder
 from ..mutation_guard import GenerationTracker
 from ..source import AttrSource, GetItemSource, NNModuleSource, NotNNModuleSource
-from ..utils import is_lazy_module, istype, proxy_args_kwargs
+from ..utils import (
+    is_lazy_module,
+    is_safe_constant,
+    istensor,
+    istype,
+    proxy_args_kwargs,
+)
 from .base import MutableLocal, typestr, VariableTracker
 from .functions import invoke_and_store_as_constant
 from .lists import SliceVariable
@@ -139,6 +145,9 @@ def var_getattr(self, tx, name):
                 return variables.UserFunctionVariable(subobj.__get__(base), **options)
             elif istype(subobj, types.FunctionType):
                 return variables.UserMethodVariable(subobj, self, **options)
+            elif is_safe_constant(subobj) or istensor(subobj):
+                # Support possibly common cases of class members
+                return VariableBuilder(tx, NNModuleSource(source))(subobj)
             else:
                 unimplemented(f"class property {typestr(base)} {typestr(subobj)}")
 

From 66522e6fd9e2474cb84ed7da44c101050cafcaf0 Mon Sep 17 00:00:00 2001
From: atalman <atalman@fb.com>
Date: Mon, 24 Oct 2022 19:38:07 +0000
Subject: [PATCH 0076/1922] Fix distributed issue by including distributed
 files (#87615)

This fixes regression in distributed headers installation.
Caused by following PR: https://github.com/pytorch/pytorch/pull/85953
which removed the inclusions

Fixes #87173

Test plan from wheel build by this CI: https://github.com/pytorch/pytorch/actions/runs/3314742519

```
[ec2-user@ip-10-0-9-132 c10d]$ pwd
/home/ec2-user/actions-runner/_work/_temp/artifacts/torch/include/torch/csrc/distributed/c10d
[ec2-user@ip-10-0-9-132 c10d]$ ls -las
total 300
 4 drwxr-xr-x 2 ec2-user ec2-user  4096 Oct 24 19:12 .
 0 drwxr-xr-x 4 ec2-user ec2-user    29 Oct 24 19:12 ..
12 -rw-r--r-- 1 ec2-user ec2-user  9051 Oct 24 17:28 Backend.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user   216 Oct 24 17:28 c10d.h
 4 -rw-r--r-- 1 ec2-user ec2-user  3880 Oct 24 17:28 comm.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user   604 Oct 24 17:28 debug.h
 4 -rw-r--r-- 1 ec2-user ec2-user  1717 Oct 24 17:28 default_comm_hooks.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  1316 Oct 24 17:28 error.h
 4 -rw-r--r-- 1 ec2-user ec2-user   962 Oct 24 17:28 exception.h
 4 -rw-r--r-- 1 ec2-user ec2-user  1461 Oct 24 17:28 FileStore.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user   771 Oct 24 17:28 GlooDeviceFactory.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  1154 Oct 24 17:28 HashStore.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  4058 Oct 24 17:28 logger.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  2059 Oct 24 17:28 logging.h
 8 -rw-r--r-- 1 ec2-user ec2-user  7979 Oct 24 17:28 NCCLUtils.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  2756 Oct 24 17:28 Ops.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  1814 Oct 24 17:28 ParamCommsUtils.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  1478 Oct 24 17:28 PrefixStore.hpp
16 -rw-r--r-- 1 ec2-user ec2-user 13235 Oct 24 17:28 ProcessGroupGloo.hpp
12 -rw-r--r-- 1 ec2-user ec2-user 11298 Oct 24 17:28 ProcessGroup.hpp
12 -rw-r--r-- 1 ec2-user ec2-user  8645 Oct 24 17:28 ProcessGroupMPI.hpp
28 -rw-r--r-- 1 ec2-user ec2-user 26526 Oct 24 17:28 ProcessGroupNCCL.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  3805 Oct 24 17:28 ProcessGroupRoundRobin.hpp
12 -rw-r--r-- 1 ec2-user ec2-user 10361 Oct 24 17:28 ProcessGroupUCC.hpp
 8 -rw-r--r-- 1 ec2-user ec2-user  5062 Oct 24 17:28 ProcessGroupWrapper.hpp
 8 -rw-r--r-- 1 ec2-user ec2-user  4201 Oct 24 17:28 PyProcessGroup.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  1072 Oct 24 17:28 python_comm_hook.h
24 -rw-r--r-- 1 ec2-user ec2-user 23859 Oct 24 17:28 reducer.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  2330 Oct 24 17:28 reducer_timer.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  1683 Oct 24 17:28 sequence_num.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  2108 Oct 24 17:28 socket.h
 4 -rw-r--r-- 1 ec2-user ec2-user  2589 Oct 24 17:28 Store.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  3264 Oct 24 17:28 TCPStore.hpp
 8 -rw-r--r-- 1 ec2-user ec2-user  6944 Oct 24 17:28 TraceUtils.h
 8 -rw-r--r-- 1 ec2-user ec2-user  4539 Oct 24 17:28 Types.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user   580 Oct 24 17:28 UCCForNCCL.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user  2301 Oct 24 17:28 UCCTracing.hpp
 8 -rw-r--r-- 1 ec2-user ec2-user  4933 Oct 24 17:28 UCCUtils.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user   584 Oct 24 17:28 UnixSockUtils.hpp
24 -rw-r--r-- 1 ec2-user ec2-user 20796 Oct 24 17:28 Utils.hpp
 4 -rw-r--r-- 1 ec2-user ec2-user   575 Oct 24 17:28 WinSockUtils.hpp
 8 -rw-r--r-- 1 ec2-user ec2-user  4259 Oct 24 17:28 Work.hpp
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87615
Approved by: https://github.com/malfet
---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f844c690b74fb..e3eb3ced6005b 100644
--- a/setup.py
+++ b/setup.py
@@ -1097,7 +1097,8 @@ def main():
         'include/torch/csrc/autograd/generated/*.h',
         'include/torch/csrc/autograd/utils/*.h',
         'include/torch/csrc/cuda/*.h',
-        'include/torch/csrc/distributed/c10d/exception.h',
+        'include/torch/csrc/distributed/c10d/*.h',
+        'include/torch/csrc/distributed/c10d/*.hpp',
         'include/torch/csrc/distributed/rpc/*.h',
         'include/torch/csrc/jit/*.h',
         'include/torch/csrc/jit/backends/*.h',

From 06db644a8c3a2d20f549a1af5c68083f57859bbd Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Mon, 24 Oct 2022 19:41:53 +0000
Subject: [PATCH 0077/1922] [Vulkan][TCC] Implement tests for hardtanh,
 hardtanh_, relu and relu_ (#87506)

Summary:
Implement Vulkan tests for these untested functions in Clamp.cpp:
 - hardtanh
 - hardtanh_
 - relu
 - relu_

Test Plan:
```cd ~/fbsource
buck run //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64```

Reviewed By: kirklandsign

Differential Revision: D40603655

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87506
Approved by: https://github.com/salilsdesai
---
 aten/src/ATen/test/vulkan_api_test.cpp | 62 ++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index a0f00daed5742..d122438f67586 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -1445,6 +1445,36 @@ TEST_F(VulkanAPITest, hardshrink_) {
   }
 }
 
+TEST_F(VulkanAPITest, hardtanh) {
+  const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 10;
+  const auto in_vulkan = in_cpu.vulkan();
+
+  const auto out_cpu = at::hardtanh(in_cpu, 3, 7);
+  const auto out_vulkan = at::hardtanh(in_vulkan, 3, 7);
+
+  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
+  if (!check) {
+    showRtol(out_cpu, out_vulkan.cpu());
+  }
+
+  ASSERT_TRUE(check);
+}
+
+TEST_F(VulkanAPITest, hardtanh_) {
+  auto a_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 10;
+  auto a_vulkan = a_cpu.vulkan();
+
+  at::hardtanh_(a_cpu, 3, 7);
+  at::hardtanh_(a_vulkan, 3, 7);
+
+  const auto check = almostEqual(a_cpu, a_vulkan.cpu());
+  if (!check) {
+    showRtol(a_cpu, a_vulkan.cpu());
+  }
+
+  ASSERT_TRUE(check);
+}
+
 TEST_F(VulkanAPITest, layer_norm_invalid_inputs) {
   c10::InferenceMode mode;
 
@@ -2229,6 +2259,38 @@ TEST_F(VulkanAPITest, mul_to_scalar_wrapped) {
   ASSERT_TRUE(check);
 }
 
+TEST_F(VulkanAPITest, relu) {
+  const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
+  const auto in_vulkan = in_cpu.vulkan();
+
+  const auto out_cpu = at::relu(in_cpu);
+  const auto out_vulkan = at::relu(in_vulkan);
+
+  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
+
+  if (!check) {
+    showRtol(out_cpu, out_vulkan.cpu());
+  }
+
+  ASSERT_TRUE(check);
+}
+
+TEST_F(VulkanAPITest, relu_) {
+  auto a_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat));
+  auto a_vulkan = a_cpu.vulkan();
+
+  at::relu_(a_cpu);
+  at::relu_(a_vulkan);
+
+  const auto check = almostEqual(a_cpu, a_vulkan.cpu());
+
+  if (!check) {
+    showRtol(a_cpu, a_vulkan.cpu());
+  }
+
+  ASSERT_TRUE(check);
+}
+
 TEST_F(VulkanAPITest, reflection_pad2d) {
   const auto a_cpu = at::rand({2, 3, 47, 63}, at::device(at::kCPU).dtype(at::kFloat));
   const auto a_vulkan = a_cpu.vulkan();

From d23007739747767ccdeceba58e2226ad23bd2b4c Mon Sep 17 00:00:00 2001
From: alexmsettle <37422826+alexmsettle@users.noreply.github.com>
Date: Mon, 24 Oct 2022 20:02:56 +0000
Subject: [PATCH 0078/1922] New feature for issue #85575. (#86514)

Introduced RECORD_OUTPUTS() macro that goes with RECORD_FUNCTION(). It is used to capture the output tensors from a kernel launch.  The tensors automatically get passed to the profiler using record_function methods.  This allows the profiler to track the tensors that flow into and out of each op.

Fixes #85575

cc @robieta @chaekit @aaronenyeshi @ngimel @nbcsm @guotuofeng @guyang3532 @gaoteng-git @tiffzhaofb
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86514
Approved by: https://github.com/robieta
---
 aten/src/ATen/record_function.h              | 10 ++++++++++
 torch/csrc/jit/codegen/cuda/kernel_cache.cpp |  8 ++++++++
 2 files changed, 18 insertions(+)

diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h
index d4c143211a21a..323dc5f888b87 100644
--- a/aten/src/ATen/record_function.h
+++ b/aten/src/ATen/record_function.h
@@ -611,6 +611,16 @@ void record_function_with_scope_and_debug_handle(
   RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS(            \
       at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs)
 
+// Bookend to the RECORD_FUNCTION macros.  Use this after the kernel
+// launch to let the profiler bind the outputs to the op that produced
+// them.  Note that guard is declared by RECORD_FUNCTION so this macro
+// needs to be called from the same scope as RECORD_FUNCTION
+#define RECORD_OUTPUTS(outputs)                                    \
+  if (guard.needsOutputs()) {                                      \
+    guard.setOutputs(                                              \
+        std::vector<c10::IValue>(outputs.begin(), outputs.end())); \
+  }
+
 /**
  * addThreadLocalCallback adds a thread local callback to run with
  * RecordFunction, returns handle to use with removeThreadLocalCallback
diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
index 85448dc8ac418..d4e4343e64d58 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
+++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -196,7 +196,15 @@ std::vector<at::Tensor> FusionExecutorCache::runFusionWithInputs(
 
   auto kernel_runtime = getKernelRuntimeFor(args);
   most_recent_runtime_ = kernel_runtime;
+  int seq_id = 0;
+  // Record kernel input and output tensors so profiler can construct
+  // the data flow graph
+  RECORD_FUNCTION(
+      "run_fused_kernel",
+      std::vector<c10::IValue>(inputs.begin(), inputs.end()),
+      seq_id);
   auto outputs = kernel_runtime->runWithInput(args);
+  RECORD_OUTPUTS(outputs);
 
   // permute output tensor returned by kernel execution. See Part_3 in Note [
   // Permutation support in nvfuser ]

From 683f68c2a30846fe4c5b5899ef91ce01a71ece6a Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Mon, 24 Oct 2022 12:46:27 -0700
Subject: [PATCH 0079/1922] Add codeowners for functorch (#86213)

The list is for people who want to be notified on changes to the files
in there. Review is not required from the list of names; I just want to be
notified to keep track of what is going on.

Let me know if you want your names added too in this PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86213
Approved by: https://github.com/Chillee
---
 CODEOWNERS | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CODEOWNERS b/CODEOWNERS
index 3bddc2f0373e4..8fdc5fc776632 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -96,6 +96,12 @@ test/test_binary_ufuncs.py @mruberry @ngimel
 test/test_reductions.py @mruberry @ngimel
 test/test_type_promotion.py @mruberry @ngimel
 
+# functorch-related things
+# This list is for people wanting to be notified every time there's a change
+# Useful for e.g. auditing xfails that other folks add to tests
+test/functorch/test_ops.py @zou3519
+test/functorch/test_vmap.py @zou3519
+
 # torch MPS
 test/test_mps.py @kulinseth
 aten/src/ATen/mps/ @kulinseth

From 407353e7c29184c8c6d68c631d0671b023ed33a7 Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Mon, 24 Oct 2022 20:21:16 +0000
Subject: [PATCH 0080/1922] cpp docs push fix (#87614)

currently failing with
```
To https://github.com/pytorch/cppdocs
 + 2825b2745bb...80ec4daa657 HEAD -> pytorchbot/temp-branch-cpp (forced update)
Branch 'master' set up to track remote branch 'pytorchbot/temp-branch-cpp' from 'origin'.
++ sleep 30
++ git push -u origin
fatal: The upstream branch of your current branch does not match
the name of your current branch.  To push to the upstream branch
on the remote, use

    git push origin HEAD:pytorchbot/temp-branch-cpp

To push to the branch of the same name on the remote, use

    git push origin HEAD

```

just checked the settings, master of pytorch/cppdocs does not have easy cla as a required check, so we don't need the temp branch
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87614
Approved by: https://github.com/huydhn
---
 .circleci/scripts/cpp_doc_push_script.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.circleci/scripts/cpp_doc_push_script.sh b/.circleci/scripts/cpp_doc_push_script.sh
index 6e66514ae93b9..4c22677e94bd3 100755
--- a/.circleci/scripts/cpp_doc_push_script.sh
+++ b/.circleci/scripts/cpp_doc_push_script.sh
@@ -98,9 +98,6 @@ git commit -m "Generate C++ docs from pytorch/pytorch@${GITHUB_SHA}" || true
 git status
 
 if [[ "${WITH_PUSH:-}" == true ]]; then
-  # push to a temp branch first to trigger CLA check and satisfy branch protections
-  git push -u origin HEAD:pytorchbot/temp-branch-cpp -f
-  sleep 30
   git push -u origin
 fi
 

From 4d19d5a6c416f7a7fbc04538cc81ceda59259571 Mon Sep 17 00:00:00 2001
From: shubhambhokare1 <shubhambhokare@gmail.com>
Date: Mon, 24 Oct 2022 20:48:29 +0000
Subject: [PATCH 0081/1922] [ONNX] Enable test_fill script test (#79555)

For scripting mode, aten::clone requires input to be a TensorType. Hence if we encounter an IntType, FloatType or BoolType input, we set the input to the appropriate TensorType
Pull Request resolved: https://github.com/pytorch/pytorch/pull/79555
Approved by: https://github.com/justinchuby, https://github.com/BowenBao, https://github.com/abock
---
 test/onnx/test_pytorch_onnx_onnxruntime.py        |  9 ++++++++-
 .../passes/onnx/remove_inplace_ops_for_onnx.cpp   | 15 +++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 43e8d3579c192..e917e44ce21bd 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -11393,7 +11393,6 @@ def forward(self, x, y):
         self.run_test(M_ToDeviceDtype(), (x, y))
 
     @skipIfUnsupportedMinOpsetVersion(9)
-    @skipScriptTest()
     def test_fill(self):
         class FillModule(torch.nn.Module):
             def forward(self, x, filled_value: int):
@@ -11403,6 +11402,14 @@ def forward(self, x, filled_value: int):
         filled_value = 7
         self.run_test(FillModule(), (x, filled_value))
 
+        class FillFloatModule(torch.nn.Module):
+            def forward(self, x, filled_value: float):
+                return x.fill_(filled_value)
+
+        x = torch.randn((4, 5, 6))
+        filled_value = 7.5
+        self.run_test(FillFloatModule(), (x, filled_value))
+
         class FillScalarModule(torch.nn.Module):
             def forward(self, x):
                 res = x + 2
diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp
index db74dca360e3f..efb7686fae3fe 100644
--- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp
+++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp
@@ -136,6 +136,21 @@ Node* addDummyClone(
       orig_data->type()->kind() == TypeKind::BoolType) {
     auto* noneNode = graph->create(prim::Constant);
     noneNode->output()->setType(NoneType::get());
+    // For scripting mode, aten::clone requires input to be a TensorType
+    // Hence if we encounter an IntType, FloatType, or BoolType,
+    // we set the input to the appropriate TensorType
+    if (orig_data->type()->kind() == TypeKind::IntType &&
+        insertBefore == false) {
+      orig_data->setType(TensorType::fromNumberType(*IntType::get()));
+    } else if (
+        orig_data->type()->kind() == TypeKind::FloatType &&
+        insertBefore == false) {
+      orig_data->setType(TensorType::fromNumberType(*FloatType::get()));
+    } else if (
+        orig_data->type()->kind() == TypeKind::BoolType &&
+        insertBefore == false) {
+      orig_data->setType(TensorType::fromBoolType());
+    }
     newNode = graph->create(aten::clone, /*num_outputs =*/1);
     newNode->addInput(orig_data);
     newNode->addInput(noneNode->output());

From 660d8ec815bf60e1c42824a333b58163a9ec3e4d Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Mon, 24 Oct 2022 21:03:58 +0000
Subject: [PATCH 0082/1922] small improvement to error message in fx
 interpreter (#87599)

From https://github.com/pytorch/pytorch/pull/84246/files#r972537173
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87599
Approved by: https://github.com/ezyang
---
 torch/fx/interpreter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index aac20b6e649d0..95218bf271657 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -134,7 +134,7 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None, enable_io_p
                 msg += f"\nOriginal traceback:\n{node.stack_trace}"
                 e.args = (msg,) + e.args[1:]
                 if isinstance(e, KeyError):
-                    raise RuntimeError(*e.args)
+                    raise RuntimeError(*e.args) from e
                 raise
 
             if self.garbage_collect_values:

From dd7c5d2b5d539e00df418414fa04b7014396778d Mon Sep 17 00:00:00 2001
From: Greg Hogan <gregjhogan@gmail.com>
Date: Mon, 24 Oct 2022 21:25:36 +0000
Subject: [PATCH 0083/1922] ada lovelace (arch 8.9) support (#87436)

changes required to be able to compile https://github.com/pytorch/vision and https://github.com/nvidia/apex for `sm_89` architecture
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87436
Approved by: https://github.com/ngimel
---
 .../upstream/FindCUDA/select_compute_arch.cmake       | 11 +++++++++++
 torch/utils/cpp_extension.py                          |  3 ++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
index 7f22d476d2fbe..822c041ee5268 100644
--- a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
+++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
@@ -98,8 +98,19 @@ if(NOT CUDA_VERSION VERSION_LESS "11.1")
   list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.6")
   set(CUDA_LIMIT_GPU_ARCHITECUTRE "8.6")
 
+  if(CUDA_VERSION VERSION_LESS "11.8")
+    set(CUDA_LIMIT_GPU_ARCHITECTURE "8.9")
+  endif()
+endif()
+
+if(NOT CUDA_VERSION VERSION_LESS "11.8")
+  list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ada")
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9")
+  list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.9")
+
   if(CUDA_VERSION VERSION_LESS "12.0")
     set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
+    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9+PTX")
   endif()
 endif()
 
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 36811bf22dedc..612ae9fdf0785 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -1729,10 +1729,11 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
         ('Volta', '7.0+PTX'),
         ('Turing', '7.5+PTX'),
         ('Ampere', '8.0;8.6+PTX'),
+        ('Ada', '8.9+PTX'),
     ])
 
     supported_arches = ['3.5', '3.7', '5.0', '5.2', '5.3', '6.0', '6.1', '6.2',
-                        '7.0', '7.2', '7.5', '8.0', '8.6']
+                        '7.0', '7.2', '7.5', '8.0', '8.6', '8.9']
     valid_arch_strings = supported_arches + [s + "+PTX" for s in supported_arches]
 
     # The default is sm_30 for CUDA 9.x and 10.x

From 3cc34e9b0023a6a31f2ce544f7d1785cd9520836 Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@fb.com>
Date: Mon, 24 Oct 2022 14:29:00 -0700
Subject: [PATCH 0084/1922] [dynamo] fix `explain` (#87640)

Another casualty of the core move
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87640
Approved by: https://github.com/voznesenskym
---
 torch/_dynamo/eval_frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 40beba357b1cf..9895da4ad9bba 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -379,7 +379,7 @@ def toy_example(a, b):
     )
 
 
-@patch("torchdynamo.symbolic_convert.explain", True)
+@patch("torch._dynamo.symbolic_convert.explain", True)
 def explain(f, *args, **kwargs):
     # TODO(voz): Do we want a decorator for this?
     from . import reset

From 8efc2b557ae9325b5e17aedcc71f1fde548eb5de Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Mon, 24 Oct 2022 21:53:14 +0000
Subject: [PATCH 0085/1922] [inductor] Prevent aggressive fusion during
 inductor lowering (#87447)

Fixes https://github.com/pytorch/torchdynamo/issues/1599

Inductor performs aggressive fusion of ops during the lowering of Fx graph into IR nodes. Note that this fusion is different from the fusion that we typically discuss in the context of Inductor, which refers to the fusion of SchedulerNodes (way after lowering). This PR, instead, ensures that we don't accumulate too many ops in the IR node to begin with.

In the case of hf_t5_large backward graph, earlier we would generate a kernel with 100s of operators, causing that kernel to take ~350 seconds of compilation time. With this PR, we get it down from 350 seconds to 50 seconds.

Note that this could affect performance. I doubt that it will lead to really large dip though. In my toy examples, even if the lowering creates multiple IR nodes, if its a simple fusion, later fusion still creates one node.

I would like (1) test_torchinductor.py, (2) test_torchinductor_info.py, and (3) atleast HF models to be enabled in CI before merging this one.

@ngimel @jansel @Chillee

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87447
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 86 +++++++++++++++++++++++++++++
 torch/_inductor/config.py           |  5 ++
 torch/_inductor/graph.py            | 10 ++++
 torch/_inductor/ir.py               | 21 ++++++-
 4 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 52f36500b5025..e0501e0e8adef 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1930,6 +1930,92 @@ def test_layer_norm(self):
         if self.device != "cpu":
             self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
 
+    def test_transpose_add(self):
+        def fn(a, b):
+            return a.t() + b
+
+        self.common(
+            fn, (torch.randn([16, 32]), torch.randn([32, 16])), check_lowp=False
+        )
+        if self.device != "cpu":
+            self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
+
+    def test_softmax_one_kernel(self):
+        def fn(x):
+            dim = 1
+            x_max = torch.amax(x, dim, keepdim=True)
+            unnormalized = torch.exp(x * x_max)
+            result = unnormalized / torch.sum(unnormalized, dim, keepdim=True)
+            return result
+
+        self.common(fn, (torch.randn([16, 32]),), check_lowp=False)
+        if self.device != "cpu":
+            self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
+
+    def test_cauchy(self):
+        def fn(x, y):
+            return torch.sum(1 / (torch.unsqueeze(x, -1) - y))
+
+        self.common(
+            fn,
+            (
+                torch.randn(32),
+                torch.randn(32),
+            ),
+            # Absolute difference: 0.0003662109375 (up to 0.0001 allowed)
+            # Relative difference: 1.8804297408767818e-05 (up to 1e-05 allowed)
+            atol=5 * 1e-4,
+            rtol=5 * 1e-5,
+            check_lowp=False,
+        )
+        if self.device != "cpu":
+            self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
+
+    def test_gather_scatter(self):
+        def fn(node_feat, edge_index):
+            src_node_feat = node_feat[edge_index[0]]
+            dst_node_feat = node_feat[edge_index[1]]
+            edge_feat = src_node_feat - dst_node_feat + 1
+            new_node_feat = torch.zeros_like(node_feat)
+            new_node_feat.scatter_add_(
+                0, edge_index[1].unsqueeze(-1).expand_as(edge_feat), edge_feat
+            )
+            return new_node_feat
+
+        num_nodes = 16
+        num_features = 32
+        node_feat = torch.randn(num_nodes, num_features)
+        edge_index = torch.randint(0, num_nodes, size=(2, num_nodes * 5))
+        self.common(
+            fn,
+            (
+                node_feat,
+                edge_index,
+            ),
+            check_lowp=False,
+        )
+        if self.device != "cpu":
+            self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
+
+    @patch.object(torch._inductor.config, "max_fusion_size", 1)
+    def test_no_mega_fusion_during_lowering(self):
+        n = 50
+
+        def fn(*args):
+            x = args[0]
+            for i in range(n):
+                x = torch.add(x, args[i])
+            return x
+
+        self.common(
+            fn,
+            [torch.randn(64) for _ in range(n)],
+            check_lowp=False,
+        )
+        print("-->", torch._inductor.metrics.generated_kernel_count)
+        if self.device != "cpu":
+            self.assertTrue(torch._inductor.metrics.generated_kernel_count > 1)
+
     def test_move_arange(self):
         def fn(x):
             return torch.arange(len(x), device="cpu").to(x.device) + x
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index f4b847e50c820..910e6d20b4d6f 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -28,9 +28,14 @@
 benchmark_harness = True
 
 # control store vs recompute heuristic
+# For fanouts, rematearialization can lead to exponential blowup. So, have
+# smaller threashold
 realize_reads_threshold = 4
 realize_bytes_threshold = 2000
 
+# Threshold to prevent excessive accumulation of ops in one buffer during lowering
+realize_acc_reads_threshold = 8
+
 # fallback to eager for random/dropout, this is slow but useful for debugging
 fallback_random = False
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 2a1619a822451..8a971020ac047 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -299,6 +299,9 @@ def finalize(self):
     def run_node(self, n: torch.fx.Node):
         with ir.IRNode.current_origins({n}):
             result = super().run_node(n)
+
+            # Realize if (1) any user need inputs realized, or (2) there is
+            # already too many reads and rematerializing can be bad.
             num_users = len(set(n.users))
             if num_users > 1 and isinstance(result, TensorBox):
                 for user in n.users:
@@ -307,6 +310,13 @@ def run_node(self, n: torch.fx.Node):
 
                 # TODO(jansel): introduce a store vs inline choice
                 result.mark_reuse(len(n.users))
+
+            # Realize if the IRNode already has accumulated lots of reads
+            if isinstance(result, TensorBox) and result.has_exceeded_max_reads():
+                # Prevent excessive accumulation in a computed buffer, when
+                # there are multiple branches meach with small number of memory
+                # reads, but they converge to a user.
+                result.realize_hint()
         return result
 
     def codegen(self):
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 13cf5d771a0c8..889e30bb54449 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -302,7 +302,7 @@ def inner_fn_str(self):
             with V.set_ops_handler(V.MockHandler()), patch.object(
                 FlexibleLayout, "allow_indexing", True
             ):
-                return self.inner_fn(self._index(self.ranges))
+                return str(self.inner_fn(self._index(self.ranges)))
         except Exception as e:
             return f"inner_fn(): {e}"
 
@@ -419,8 +419,11 @@ def inner_fn_str(self):
             with V.set_ops_handler(V.MockHandler()), patch.object(
                 FlexibleLayout, "allow_indexing", True
             ):
-                return self.inner_fn(
-                    self._index(self.ranges), self._index(self.reduction_ranges, "r")
+                return str(
+                    self.inner_fn(
+                        self._index(self.ranges),
+                        self._index(self.reduction_ranges, "r"),
+                    )
                 )
         except Exception as e:
             return f"inner_fn(): {e}"
@@ -948,6 +951,9 @@ def get_name(self):
     def mark_reuse(self, users):
         return self.data.mark_reuse(users)
 
+    def has_exceeded_max_reads(self):
+        return self.data.has_exceeded_max_reads()
+
     def realize(self):
         return self.data.realize()
 
@@ -1422,6 +1428,9 @@ def get_device(self):
     def mark_reuse(self, users):
         pass
 
+    def has_exceeded_max_reads(self):
+        return False
+
     def get_reads(self):
         return ()
 
@@ -3350,6 +3359,12 @@ def realize_hint(self):
         if isinstance(self.data, (Pointwise, Reduction)) and self.num_reads() > 1:
             self.realize()
 
+    def has_exceeded_max_reads(self):
+        return isinstance(self.data, Pointwise) and (
+            self.num_reads() > config.realize_acc_reads_threshold
+            or len(self.inner_fn_str()) > config.realize_bytes_threshold
+        )
+
     def mark_reuse(self, users):
         """
         A heuristic to decide if we should realize a tensor

From 00928cade3daaa2bb7f8c93ba8610dacce5d9d1f Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 24 Oct 2022 22:24:44 +0000
Subject: [PATCH 0086/1922] Upgrade actions/upload-artifact to v3 (#87553)

Upgrade a bunch of actions to get rid of the deprecation warnings, i.e. https://github.com/pytorch/pytorch/actions/runs/3304031186

* Upgrade actions/upload-artifact to v3
* Upgrade Windows actions/setup-python to v4 (left over)

Note: Warnings coming from setup/cache will be fixed upstream by https://github.com/pytorch/test-infra/pull/941
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87553
Approved by: https://github.com/clee2000
---
 .github/actions/setup-win/action.yml                      | 2 +-
 .github/templates/macos_binary_build_workflow.yml.j2      | 2 +-
 .github/workflows/_mac-build.yml                          | 4 ++--
 .../generated-macos-arm64-binary-conda-nightly.yml        | 6 +++---
 .../generated-macos-arm64-binary-wheel-nightly.yml        | 8 ++++----
 .../workflows/generated-macos-binary-conda-nightly.yml    | 8 ++++----
 .../generated-macos-binary-libtorch-cxx11-abi-nightly.yml | 8 ++++----
 .../generated-macos-binary-libtorch-pre-cxx11-nightly.yml | 8 ++++----
 .../workflows/generated-macos-binary-wheel-nightly.yml    | 8 ++++----
 .github/workflows/run_torchbench.yml                      | 2 +-
 .github/workflows/scorecards.yml                          | 4 ++--
 11 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml
index c5f1cac550f68..d442343430c7d 100644
--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@@ -55,7 +55,7 @@ runs:
         .circleci/scripts/windows_cudnn_install.sh
 
     - name: Setup Python3
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
         python-version: "3.x"
         cache: pip
diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2
index 5e6b505664e60..95802252a4f98 100644
--- a/.github/templates/macos_binary_build_workflow.yml.j2
+++ b/.github/templates/macos_binary_build_workflow.yml.j2
@@ -97,7 +97,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: !{{ config["build_name"] }}
diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index 895b07164213e..557d3c7b292c1 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -131,7 +131,7 @@ jobs:
           zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json
 
       - name: Store PyTorch Build Artifacts on GHA
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
         with:
           name: ${{ env.BUILD_ENVIRONMENT }}
@@ -140,7 +140,7 @@ jobs:
           path: artifacts.zip
 
       - name: Upload sccache stats to GHA
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         # Only if sccache is installed, see above
         if: ${{ (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && steps.build.outcome != 'skipped' }}
         with:
diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
index 5d47cc77cf3a7..ce32755e32098 100644
--- a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
@@ -116,7 +116,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: conda-py3_8-cpu
@@ -226,7 +226,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: conda-py3_9-cpu
@@ -336,7 +336,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: conda-py3_10-cpu
diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
index e58d153269b38..6bc3894a00be5 100644
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@@ -116,7 +116,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: wheel-py3_7-cpu
@@ -226,7 +226,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: wheel-py3_8-cpu
@@ -336,7 +336,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: wheel-py3_9-cpu
@@ -446,7 +446,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: wheel-py3_10-cpu
diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml
index 079687e6ff951..ba3697e3fef91 100644
--- a/.github/workflows/generated-macos-binary-conda-nightly.yml
+++ b/.github/workflows/generated-macos-binary-conda-nightly.yml
@@ -114,7 +114,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: conda-py3_7-cpu
@@ -224,7 +224,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: conda-py3_8-cpu
@@ -334,7 +334,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: conda-py3_9-cpu
@@ -444,7 +444,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: conda-py3_10-cpu
diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
index dcb480b0a07ce..381e0a4c73ad7 100644
--- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
@@ -118,7 +118,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: libtorch-cpu-shared-with-deps-cxx11-abi
@@ -233,7 +233,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: libtorch-cpu-shared-without-deps-cxx11-abi
@@ -348,7 +348,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: libtorch-cpu-static-with-deps-cxx11-abi
@@ -463,7 +463,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: libtorch-cpu-static-without-deps-cxx11-abi
diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
index 5f02ea874b4e4..55b28480a7545 100644
--- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
+++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
@@ -118,7 +118,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: libtorch-cpu-shared-with-deps-pre-cxx11
@@ -233,7 +233,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: libtorch-cpu-shared-without-deps-pre-cxx11
@@ -348,7 +348,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: libtorch-cpu-static-with-deps-pre-cxx11
@@ -463,7 +463,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: libtorch-cpu-static-without-deps-pre-cxx11
diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml
index 081f470d6109f..f4baf9129b690 100644
--- a/.github/workflows/generated-macos-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-binary-wheel-nightly.yml
@@ -114,7 +114,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: wheel-py3_7-cpu
@@ -224,7 +224,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: wheel-py3_8-cpu
@@ -334,7 +334,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: wheel-py3_9-cpu
@@ -444,7 +444,7 @@ jobs:
           # shellcheck disable=SC1091
           source "${RUNNER_TEMP}/anaconda/bin/activate"
           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v3
         if: always()
         with:
           name: wheel-py3_10-cpu
diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml
index 9a46a23af5bfc..2d1013abafc02 100644
--- a/.github/workflows/run_torchbench.yml
+++ b/.github/workflows/run_torchbench.yml
@@ -92,7 +92,7 @@ jobs:
           conda env remove --name pr-ci
           rm /tmp/pr-body.txt
       - name: Upload artifact
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: TorchBench result
           path: ~/.torchbench/bisection/pr${{ github.event.number }}
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index 516998bfa95be..d896864349fe4 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -25,7 +25,7 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@2541b1294d2704b0964813337f33b291d3f8596b # tag=v3.0.2
+        uses: actions/checkout@v3
         with:
           persist-credentials: false
 
@@ -42,7 +42,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # tag=v3.1.0
+        uses: actions/upload-artifact@v3
         with:
           name: SARIF file
           path: results.sarif

From 48443f8275f66a51e32bc87809854226c03b2390 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Mon, 24 Oct 2022 22:43:11 +0000
Subject: [PATCH 0087/1922] [functorch] dont compute expected output multiple
 times (#86202)

Fixes https://github.com/pytorch/functorch/issues/1028

Description: We update `get_fallback_and_vmap_exhaustive` to compute expected output only once as described in the issue.

NOTE: This doesn't take care of the repeated computation in `test_vmap_exhaustive` and will be followed up later.

TODO:
* [x] Benchmark and see how much difference does this make. (Comparison Table Below: [Link](https://github.com/pytorch/pytorch/pull/86202#issuecomment-1285477653))
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86202
Approved by: https://github.com/zou3519
---
 test/functorch/common_utils.py | 50 +++++++++++++++++++++++++++-------
 test/functorch/test_ops.py     |  4 ++-
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/test/functorch/common_utils.py b/test/functorch/common_utils.py
index 1d7356b6ca7e5..c082340d7882e 100644
--- a/test/functorch/common_utils.py
+++ b/test/functorch/common_utils.py
@@ -222,10 +222,11 @@ def clone_if_tensor(x):
         return x.clone()
     return x
 
-
-def compute_quantities_for_vmap_test(
+# Helper function to compare output of `vmap` against the
+# `for-loop` version.
+def _compute_quantities_for_vmap_test(
         op, orig_batched_args, orig_kwarg_values, in_dims,
-        out_dim=0, batch_size=2, compute_loop_out=True,
+        out_dim, batch_size, compute_loop_out=True,
         clone_inputs=False):
 
     def maybe_clone_inputs():
@@ -236,10 +237,12 @@ def maybe_clone_inputs():
         return orig_batched_args, orig_kwarg_values
 
     batched_args, kwarg_values = maybe_clone_inputs()
+
     if compute_loop_out:
         loop_out = loop(op, in_dims, out_dim, batch_size, *batched_args, **kwarg_values)
     else:
         loop_out = None
+
     # Used for debugging the resulting operations
     # from functorch import make_fx
     # def f(a):
@@ -248,7 +251,6 @@ def maybe_clone_inputs():
     # print(in_dims, [arg.shape for arg in batched_args], kwarg_values)
     batched_args, kwarg_values = maybe_clone_inputs()
     batched_out = vmap(op, in_dims=in_dims, out_dims=out_dim)(*batched_args, **kwarg_values)
-    yield (loop_out, batched_out)
 
     # Tests case where we dispatch to a batching rule with no bdims
     # This should be handled by autogenerated plumbing. For vmap support
@@ -262,24 +264,52 @@ def f(dummy, *args, **kwargs):
         return op(*args, **kwargs)
 
     dummy = torch.ones(batch_size, 1)
-    expected = pytree.tree_map(add_bdim_if_tensor, batched_out)
+    vmapvmap_expected = pytree.tree_map(add_bdim_if_tensor, batched_out)
 
     inner_in_dims = (0,) + pytree.tree_map(lambda x: None, in_dims)
     outer_in_dims = (0,) + in_dims
     batched_args, kwarg_values = maybe_clone_inputs()
-    output = vmap(vmap(f, inner_in_dims), outer_in_dims)(dummy, *batched_args, **kwarg_values)
-    yield (expected, output)
+    vmapvmap_output = vmap(vmap(f, inner_in_dims), outer_in_dims)(dummy, *batched_args, **kwarg_values)
+
+    yield (batched_out, loop_out, vmapvmap_output, vmapvmap_expected)
+
+
+# Function with more friendly return types
+# compared to `_compute_quantities_for_vmap_test`
+def compute_quantities_for_vmap_test(
+        op, orig_batched_args, orig_kwarg_values, in_dims,
+        out_dim=0, batch_size=2, compute_loop_out=True,
+        clone_inputs=False):
+    for quantities in _compute_quantities_for_vmap_test(op, orig_batched_args, orig_kwarg_values, in_dims,
+                                                        out_dim, batch_size, compute_loop_out, clone_inputs):
+        yield (quantities[0], quantities[1])
+        yield (quantities[2], quantities[3])
 
 
 def get_fallback_and_vmap_exhaustive(op, arg_values, kwarg_values, is_batch_norm_and_training=False, compute_loop_out=True):
     out_dim = 0
     batch_size = 2
 
+    def make_batched(t):
+        if isinstance(t, torch.Tensor):
+            shape = list(t.shape)
+            shape.insert(out_dim, batch_size)
+            return t.expand(*shape)
+        return t
+
+    # Inputs generated by `generate_vmap_inputs` just copy/expand the unbatched inputs
+    # over the batched dimension. Thus we can compute the expected value once and just
+    # expand it based on the `out_dim` and `batch_size`.
+    expected_unbatched = op(*arg_values, **kwarg_values)
+    expected_batched = pytree.tree_map(make_batched, expected_unbatched)
     generator = generate_vmap_inputs(arg_values, kwarg_values, is_batch_norm_and_training)
     for batched_args, in_dims, kwarg_values in generator:
-        for quantities in compute_quantities_for_vmap_test(
-                op, batched_args, kwarg_values, in_dims, out_dim, batch_size, compute_loop_out):
-            yield quantities
+        for quantities in _compute_quantities_for_vmap_test(
+                op, batched_args, kwarg_values, in_dims, out_dim, batch_size,
+                compute_loop_out=False):
+            assert quantities[1] is None
+            yield (quantities[0], expected_batched)
+            yield (quantities[2], quantities[3])
 
 
 def opinfo_in_dict(opinfo, d):
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 5dfe76b3e2877..bda05d970a5e9 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -9,7 +9,8 @@
 import itertools
 import unittest
 
-from torch.testing._internal.common_utils import TestCase, run_tests, is_iterable_of_tensors, IS_ARM64, parametrize, TEST_WITH_ASAN
+from torch.testing._internal.common_utils import TestCase, run_tests, is_iterable_of_tensors, IS_MACOS, \
+    IS_ARM64, parametrize, TEST_WITH_ASAN
 import torch
 from torch import Tensor
 import functools
@@ -823,6 +824,7 @@ def test_vmapvjp(self, device, dtype, op):
         # ---------------------------- BUGS ------------------------------------
         # The following are bugs that we should fix
         decorate('nn.functional.conv2d', decorator=unittest.skipIf(IS_ARM64, "Fails on M1")),
+        decorate('linalg.det', 'singular', decorator=unittest.skipIf(IS_MACOS, "Fails on x86 MacOS CI")),
         skip('nn.functional.max_pool1d'),  # fails on cpu, runs on cuda
         xfail('masked.mean'),  # silent incorrectness (nan difference)
 

From 8d0f13b649b35c5429cea831fb6f1cfdacabc79d Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 24 Oct 2022 22:44:42 +0000
Subject: [PATCH 0088/1922] Add some common tools to docker base (#86993)

I always need to install these 2 tools whenever I use Docker manually to debug build and test issues:

* unzip is to extracted the zipped artifacts from PyTorch CI
* gdb is to do you know what :)

IMO, it makes sense to have them as part of the container image

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86993
Approved by: https://github.com/ZainRizvi
---
 .circleci/docker/common/install_base.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh
index 6724031c0a447..5bca9f6dd3335 100755
--- a/.circleci/docker/common/install_base.sh
+++ b/.circleci/docker/common/install_base.sh
@@ -68,7 +68,9 @@ install_ubuntu() {
     sudo \
     vim \
     jq \
-    libtool
+    libtool \
+    unzip \
+    gdb
 
   # Should resolve issues related to various apt package repository cert issues
   # see: https://github.com/pytorch/pytorch/issues/65931
@@ -126,7 +128,9 @@ install_centos() {
     opencv-devel \
     sudo \
     wget \
-    vim
+    vim \
+    unzip \
+    gdb
 
   # Cleanup
   yum clean all

From dbb53bf3fb008ba698e8eb85cf121b07d93ff167 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Mon, 24 Oct 2022 23:05:14 +0000
Subject: [PATCH 0089/1922] [CI] Fix triton wheel build (#87461)

If one to use auto-install llvm mechanism, somehow one ends us with
few unresovled symbols if build on manylinux image.

Workaround by installing llvm from OS repos.

Also, add an upload job, which is executed only on trunk

Fixes https://github.com/pytorch/torchdynamo/issues/1733

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87461
Approved by: https://github.com/msaroufim
---
 .github/workflows/build-triton-wheel.yml | 53 +++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml
index 074d53498faa6..f602eaa30af4d 100644
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@@ -3,7 +3,8 @@ name: Build Triton wheels
 on:
   push:
     branches:
-      main
+      - main
+      - master
     paths:
       - .github/workflows/build-triton-wheel.yml
       - .github/scripts/build_triton_wheel.py
@@ -84,7 +85,7 @@ jobs:
             ;;
           esac
 
-          docker exec -t "${container_name}" yum install -y zlib-devel
+          docker exec -t "${container_name}" yum install -y llvm11 llvm11-devel llvm11-static llvm11-libs zlib-devel
           docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" /pytorch/.github/scripts/build_triton_wheel.py
           docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
 
@@ -98,3 +99,51 @@ jobs:
       - name: Teardown Linux
         uses: pytorch/test-infra/.github/actions/teardown-linux@main
         if: always()
+  upload-wheel:
+    runs-on: linux.20_04.4x
+    needs: build-wheel
+    container:
+      image: continuumio/miniconda3:4.12.0
+    env:
+      GITHUB_TOKEN: ${{ secrets.github-token }}
+    steps:
+      - name: Download Build Artifacts (3.7)
+        uses: actions/download-artifact@v3
+        with:
+          name: "pytorch-triton-3.7"
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Download Build Artifacts (3.8)
+        uses: actions/download-artifact@v3
+        with:
+          name: "pytorch-triton-3.8"
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Download Build Artifacts (3.9)
+        uses: actions/download-artifact@v3
+        with:
+          name: "pytorch-triton-3.9"
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Download Build Artifacts (3.10)
+        uses: actions/download-artifact@v3
+        with:
+          name: "pytorch-triton-3.10"
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Download Build Artifacts (3.11)
+        uses: actions/download-artifact@v3
+        with:
+          name: "pytorch-triton-3.11"
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Upload binaries
+        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/master' || github.event.ref == 'refs/heads/main') }}
+        env:
+          PKG_DIR: "${{ runner.temp }}/artifacts"
+          # When running these on pull_request events these should be blank
+          AWS_ACCESS_KEY_ID: ${{ secrets.aws-access-key-id }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.aws-pytorch-uploader-secret-access-key }}
+          UPLOAD_BUCKET: "s3://pytorch"
+        run: |
+            set -ex
+            pip install -q awscli
+            s3_dir="${UPLOAD_BUCKET}/whl/nightly/"
+            for pkg in "${PKG_DIR}/"*.whl; do
+              aws s3 cp --no-progress --acl public-read "${pkg}" "${s3_dir}"
+             done

From 69a4a386cf1e796c438b9ffdff7f0f4aed8c3468 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 17 Oct 2022 18:57:06 +0100
Subject: [PATCH 0090/1922] ATen/native (3/6): Use per-operator headers
 (#75573)

Differential Revision: [D40126701](https://our.internmc.facebook.com/intern/diff/D40126701)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/75573
Approved by: https://github.com/malfet
---
 aten/src/ATen/native/Histogram.cpp            |  22 +++-
 aten/src/ATen/native/Histogram.h              |   2 -
 aten/src/ATen/native/Im2Col.cpp               |  15 ++-
 aten/src/ATen/native/IndexingUtils.cpp        |   1 +
 aten/src/ATen/native/Integration.cpp          |  17 ++-
 aten/src/ATen/native/Itertools.cpp            |  19 ++-
 aten/src/ATen/native/Lerp.cpp                 |   9 ++
 aten/src/ATen/native/Linear.cpp               |  31 ++++-
 aten/src/ATen/native/LinearAlgebra.cpp        | 119 ++++++++++++++++--
 aten/src/ATen/native/Loss.cpp                 |  57 ++++++++-
 aten/src/ATen/native/LossCTC.cpp              |  26 +++-
 aten/src/ATen/native/LossMulti.h              |   8 +-
 aten/src/ATen/native/LossMultiLabelMargin.cpp |  15 ++-
 aten/src/ATen/native/LossMultiMargin.cpp      |  14 ++-
 aten/src/ATen/native/LossNLL.cpp              |  23 +++-
 aten/src/ATen/native/LossNLL2d.cpp            |  17 ++-
 aten/src/ATen/native/MathBitsFallback.h       |   9 +-
 aten/src/ATen/native/MaxPooling.cpp           |  13 +-
 aten/src/ATen/native/MaxUnpooling.cpp         |  13 +-
 aten/src/ATen/native/Memory.cpp               |  13 +-
 aten/src/ATen/native/NNPACK.cpp               |  13 +-
 .../native/NaiveConvolutionTranspose2d.cpp    |  15 ++-
 .../native/NaiveConvolutionTranspose3d.cpp    |  16 ++-
 .../ATen/native/NaiveDilatedConvolution.cpp   |  15 ++-
 aten/src/ATen/native/NamedTensor.cpp          |  28 ++++-
 aten/src/ATen/native/vol2col.h                |   4 +-
 26 files changed, 470 insertions(+), 64 deletions(-)

diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp
index c3a007f2c2dcb..89ede6bea35c1 100644
--- a/aten/src/ATen/native/Histogram.cpp
+++ b/aten/src/ATen/native/Histogram.cpp
@@ -1,10 +1,28 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
-#include <ATen/NativeFunctions.h>
 
 #include <ATen/native/Histogram.h>
 #include <ATen/native/Resize.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_histogramdd_bin_edges.h>
+#include <ATen/ops/_histogramdd_bin_edges_native.h>
+#include <ATen/ops/_histogramdd_from_bin_cts.h>
+#include <ATen/ops/_histogramdd_from_bin_cts_native.h>
+#include <ATen/ops/_histogramdd_from_bin_tensors.h>
+#include <ATen/ops/_histogramdd_from_bin_tensors_native.h>
+#include <ATen/ops/aminmax.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/histc_native.h>
+#include <ATen/ops/histogram_native.h>
+#include <ATen/ops/histogramdd_native.h>
+#include <ATen/ops/linspace_native.h>
+#endif
+
 #include <numeric>
 #include <tuple>
 #include <vector>
diff --git a/aten/src/ATen/native/Histogram.h b/aten/src/ATen/native/Histogram.h
index 9df0aafafc18d..3305cc5e315fb 100644
--- a/aten/src/ATen/native/Histogram.h
+++ b/aten/src/ATen/native/Histogram.h
@@ -3,8 +3,6 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/native/DispatchStub.h>
 
-#include <tuple>
-
 namespace at { namespace native {
 
 using histogramdd_fn = void(*)(const Tensor&, const c10::optional<Tensor>&, bool, Tensor&, const TensorList&);
diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp
index 7cb5133eef9ad..416e77e9ff199 100644
--- a/aten/src/ATen/native/Im2Col.cpp
+++ b/aten/src/ATen/native/Im2Col.cpp
@@ -1,12 +1,21 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
 #include <ATen/TensorUtils.h>
-#include <ATen/Utils.h>
-#include <ATen/div_rtn.h>
 
 #include <ATen/native/im2col.h>
 #include <ATen/native/im2col_shape_check.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/col2im_native.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/im2col_native.h>
+#endif
+
 namespace at {
 namespace native {
 namespace {
diff --git a/aten/src/ATen/native/IndexingUtils.cpp b/aten/src/ATen/native/IndexingUtils.cpp
index e91eff03ab856..c5f5ff6fbcc07 100644
--- a/aten/src/ATen/native/IndexingUtils.cpp
+++ b/aten/src/ATen/native/IndexingUtils.cpp
@@ -1,3 +1,4 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/IndexingUtils.h>
 
 namespace at { namespace native {
diff --git a/aten/src/ATen/native/Integration.cpp b/aten/src/ATen/native/Integration.cpp
index 7ca01bae18a57..09e444476d1fd 100644
--- a/aten/src/ATen/native/Integration.cpp
+++ b/aten/src/ATen/native/Integration.cpp
@@ -1,12 +1,23 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
-#include <ATen/WrapDimUtils.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/core/DimVector.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/WrapDimUtils.h>
 #include <c10/util/Exception.h>
 #include <c10/util/irange.h>
 #include <c10/core/ScalarType.h>
 #include <c10/core/Scalar.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/cumulative_trapezoid_native.h>
+#include <ATen/ops/trapezoid_native.h>
+#include <ATen/ops/trapz_native.h>
+#include <ATen/ops/zeros.h>
+#endif
+
 namespace at {
 namespace native {
 namespace {
diff --git a/aten/src/ATen/native/Itertools.cpp b/aten/src/ATen/native/Itertools.cpp
index 265b05054b0a3..8d6ff506a43f8 100644
--- a/aten/src/ATen/native/Itertools.cpp
+++ b/aten/src/ATen/native/Itertools.cpp
@@ -1,5 +1,20 @@
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorOperators.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/arange.h>
+#include <ATen/ops/cartesian_prod_native.h>
+#include <ATen/ops/combinations_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/full.h>
+#include <ATen/ops/meshgrid.h>
+#include <ATen/ops/stack.h>
+#include <ATen/ops/zeros_like_ops.h>
+#endif
 
 #include <vector>
 
diff --git a/aten/src/ATen/native/Lerp.cpp b/aten/src/ATen/native/Lerp.cpp
index bfac91a881ae0..2e67dec35033f 100644
--- a/aten/src/ATen/native/Lerp.cpp
+++ b/aten/src/ATen/native/Lerp.cpp
@@ -1,5 +1,14 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/Lerp.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/lerp_native.h>
+#endif
 
 namespace at {
 namespace meta {
diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp
index b9b3abe3c7cae..591289a726ac8 100644
--- a/aten/src/ATen/native/Linear.cpp
+++ b/aten/src/ATen/native/Linear.cpp
@@ -1,17 +1,36 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/native/Resize.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/native/xnnpack/Engine.h>
-#include <ATen/SmallVector.h>
 #include <ATen/WrapDimUtilsMulti.h>
-#include <c10/macros/Macros.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/native/xnnpack/Engine.h>
 #include <c10/util/irange.h>
 #include <c10/util/MaybeOwned.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 
-#include <array>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_trilinear.h>
+#include <ATen/ops/_trilinear_native.h>
+#include <ATen/ops/add.h>
+#include <ATen/ops/addmm.h>
+#include <ATen/ops/bilinear_native.h>
+#include <ATen/ops/bmm.h>
+#include <ATen/ops/einsum_native.h>
+#include <ATen/ops/linear_native.h>
+#include <ATen/ops/matmul.h>
+#include <ATen/ops/mkldnn_linear.h>
+#include <ATen/ops/mm.h>
+#include <ATen/ops/mul.h>
+#include <ATen/ops/tensordot_native.h>
+#include <ATen/ops/zeros.h>
+#include <ATen/ops/zeros_like_ops.h>
+#endif
+
 #include <cctype>
-#include <cstddef>
 #include <sstream>
 #include <string>
 #include <vector>
diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
index c658d4427c97d..8c5a6fc8f1955 100644
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -1,27 +1,132 @@
-#include <ATen/ATen.h>
-#include <ATen/core/grad_mode.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Context.h>
 #include <ATen/Dispatch.h>
 #include <ATen/ExpandUtils.h>
 #include <ATen/NamedTensorUtils.h>
 #include <ATen/OpMathType.h>
 #include <ATen/native/mkldnn/Matmul.h>
 #include <ATen/native/CPUBlas.h>
-#include <ATen/native/IndexingUtils.h>
 #include <ATen/native/LinearAlgebra.h>
 #include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/ReduceOps.h>
 #include <ATen/native/ReduceOpsUtils.h>
 #include <ATen/native/Resize.h>
-#include <ATen/native/TensorIterator.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/Parallel.h>
+#include <ATen/TensorIndexing.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorOperators.h>
 #include <ATen/TensorUtils.h>
-#include <ATen/Utils.h>
 #include <c10/util/accumulate.h>
 #include <c10/util/irange.h>
 #include <c10/util/variant.h>
 
-#include <functional>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_addmm_activation_native.h>
+#include <ATen/ops/_compute_linear_combination_native.h>
+#include <ATen/ops/_linalg_check_errors.h>
+#include <ATen/ops/_linalg_det.h>
+#include <ATen/ops/_linalg_det_native.h>
+#include <ATen/ops/_linalg_slogdet.h>
+#include <ATen/ops/_linalg_slogdet_native.h>
+#include <ATen/ops/_unsafe_view.h>
+#include <ATen/ops/addbmm_native.h>
+#include <ATen/ops/addmm_native.h>
+#include <ATen/ops/addr.h>
+#include <ATen/ops/addr_native.h>
+#include <ATen/ops/arange.h>
+#include <ATen/ops/baddbmm_native.h>
+#include <ATen/ops/bmm.h>
+#include <ATen/ops/bmm_native.h>
+#include <ATen/ops/ceil.h>
+#include <ATen/ops/chain_matmul_native.h>
+#include <ATen/ops/det_native.h>
+#include <ATen/ops/diag_embed.h>
+#include <ATen/ops/dot.h>
+#include <ATen/ops/dot_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/eye.h>
+#include <ATen/ops/frobenius_norm_native.h>
+#include <ATen/ops/from_blob.h>
+#include <ATen/ops/full.h>
+#include <ATen/ops/gelu.h>
+#include <ATen/ops/ger_native.h>
+#include <ATen/ops/index_select.h>
+#include <ATen/ops/inner_native.h>
+#include <ATen/ops/is_complex_native.h>
+#include <ATen/ops/is_floating_point_native.h>
+#include <ATen/ops/kron_native.h>
+#include <ATen/ops/linalg_cond.h>
+#include <ATen/ops/linalg_cond_native.h>
+#include <ATen/ops/linalg_det.h>
+#include <ATen/ops/linalg_det_native.h>
+#include <ATen/ops/linalg_diagonal_native.h>
+#include <ATen/ops/linalg_eigh.h>
+#include <ATen/ops/linalg_eigvalsh.h>
+#include <ATen/ops/linalg_inv.h>
+#include <ATen/ops/linalg_inv_ex.h>
+#include <ATen/ops/linalg_lu_factor_ex.h>
+#include <ATen/ops/linalg_matmul_native.h>
+#include <ATen/ops/linalg_matrix_exp.h>
+#include <ATen/ops/linalg_matrix_exp_native.h>
+#include <ATen/ops/linalg_matrix_norm.h>
+#include <ATen/ops/linalg_matrix_norm_native.h>
+#include <ATen/ops/linalg_matrix_power_native.h>
+#include <ATen/ops/linalg_matrix_rank.h>
+#include <ATen/ops/linalg_matrix_rank_native.h>
+#include <ATen/ops/linalg_multi_dot_native.h>
+#include <ATen/ops/linalg_norm.h>
+#include <ATen/ops/linalg_norm_native.h>
+#include <ATen/ops/linalg_pinv.h>
+#include <ATen/ops/linalg_pinv_native.h>
+#include <ATen/ops/linalg_slogdet.h>
+#include <ATen/ops/linalg_slogdet_native.h>
+#include <ATen/ops/linalg_solve.h>
+#include <ATen/ops/linalg_svdvals.h>
+#include <ATen/ops/linalg_tensorinv.h>
+#include <ATen/ops/linalg_tensorinv_native.h>
+#include <ATen/ops/linalg_tensorsolve.h>
+#include <ATen/ops/linalg_tensorsolve_native.h>
+#include <ATen/ops/linalg_vector_norm.h>
+#include <ATen/ops/linalg_vector_norm_native.h>
+#include <ATen/ops/log2.h>
+#include <ATen/ops/logdet_native.h>
+#include <ATen/ops/matmul.h>
+#include <ATen/ops/matmul_native.h>
+#include <ATen/ops/matrix_exp_backward_native.h>
+#include <ATen/ops/matrix_exp_native.h>
+#include <ATen/ops/matrix_power_native.h>
+#include <ATen/ops/max.h>
+#include <ATen/ops/mm.h>
+#include <ATen/ops/mm_native.h>
+#include <ATen/ops/movedim.h>
+#include <ATen/ops/mul.h>
+#include <ATen/ops/mv.h>
+#include <ATen/ops/narrow.h>
+#include <ATen/ops/norm.h>
+#include <ATen/ops/nuclear_norm_native.h>
+#include <ATen/ops/ones.h>
+#include <ATen/ops/outer.h>
+#include <ATen/ops/outer_native.h>
+#include <ATen/ops/pinverse_native.h>
+#include <ATen/ops/pow.h>
+#include <ATen/ops/prod.h>
+#include <ATen/ops/real.h>
+#include <ATen/ops/relu.h>
+#include <ATen/ops/slogdet_native.h>
+#include <ATen/ops/sqrt.h>
+#include <ATen/ops/sum.h>
+#include <ATen/ops/tensordot.h>
+#include <ATen/ops/vdot_native.h>
+#include <ATen/ops/where.h>
+#include <ATen/ops/zeros.h>
+#include <ATen/ops/zeros_like.h>
+#endif
+
 #include <limits>
 #include <numeric>
 #include <string>
diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp
index 52569ba6b4995..78b7d70236207 100644
--- a/aten/src/ATen/native/Loss.cpp
+++ b/aten/src/ATen/native/Loss.cpp
@@ -1,15 +1,62 @@
-#include <ATen/ATen.h>
-#include <ATen/CPUApplyUtils.h>
-#include <ATen/Dispatch.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/core/Reduction.h>
+#include <ATen/Dispatch.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <ATen/TensorOperators.h>
 #include <ATen/native/BinaryOps.h>
 #include <ATen/native/PointwiseOps.h>
-#include <ATen/native/TensorIterator.h>
 #include <ATen/native/cpu/Loops.h>
 #include <c10/util/Exception.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/binary_cross_entropy_backward_native.h>
+#include <ATen/ops/binary_cross_entropy_native.h>
+#include <ATen/ops/binary_cross_entropy_with_logits_native.h>
+#include <ATen/ops/clamp_min.h>
+#include <ATen/ops/cosine_embedding_loss_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/exp.h>
+#include <ATen/ops/hinge_embedding_loss_native.h>
+#include <ATen/ops/huber_loss_backward.h>
+#include <ATen/ops/huber_loss_backward_native.h>
+#include <ATen/ops/huber_loss_native.h>
+#include <ATen/ops/kl_div_native.h>
+#include <ATen/ops/l1_loss_native.h>
+#include <ATen/ops/log.h>
+#include <ATen/ops/margin_ranking_loss_native.h>
+#include <ATen/ops/mean.h>
+#include <ATen/ops/min.h>
+#include <ATen/ops/mse_loss_backward.h>
+#include <ATen/ops/mse_loss_backward_native.h>
+#include <ATen/ops/mse_loss_meta.h>
+#include <ATen/ops/mse_loss_native.h>
+#include <ATen/ops/mul.h>
+#include <ATen/ops/neg.h>
+#include <ATen/ops/pairwise_distance.h>
+#include <ATen/ops/poisson_nll_loss_native.h>
+#include <ATen/ops/smooth_l1_loss_backward.h>
+#include <ATen/ops/smooth_l1_loss_backward_native.h>
+#include <ATen/ops/smooth_l1_loss_meta.h>
+#include <ATen/ops/smooth_l1_loss_native.h>
+#include <ATen/ops/soft_margin_loss.h>
+#include <ATen/ops/soft_margin_loss_backward.h>
+#include <ATen/ops/soft_margin_loss_backward_native.h>
+#include <ATen/ops/soft_margin_loss_native.h>
+#include <ATen/ops/squeeze.h>
+#include <ATen/ops/sum.h>
+#include <ATen/ops/triplet_margin_loss_native.h>
+#include <ATen/ops/where.h>
+#include <ATen/ops/xlogy.h>
+#include <ATen/ops/zeros_like.h>
+#endif
+
 constexpr float EPSILON = 1e-12;
 
 namespace {
diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp
index 1ddb8f2285640..dcfad968cad79 100644
--- a/aten/src/ATen/native/LossCTC.cpp
+++ b/aten/src/ATen/native/LossCTC.cpp
@@ -5,16 +5,36 @@
 // 1. Graves et al: http://www.cs.toronto.edu/~graves/icml_2006.pdf
 // We use the equations from above link, but note that [1] has 1-based indexing and we (of course) use 0-based.
 // Graves et al call the probabilities y, we use log_probs (also calling them inputs)
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
-#include <ATen/TensorUtils.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorOperators.h>
 #include <ATen/native/Fill.h>
 #include <c10/util/irange.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 
-#include <numeric>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_ctc_loss.h>
+#include <ATen/ops/_ctc_loss_backward.h>
+#include <ATen/ops/_ctc_loss_backward_native.h>
+#include <ATen/ops/_ctc_loss_native.h>
+#include <ATen/ops/_cudnn_ctc_loss.h>
+#include <ATen/ops/_use_cudnn_ctc_loss.h>
+#include <ATen/ops/ctc_loss_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/full_like.h>
+#include <ATen/ops/tensor.h>
+#include <ATen/ops/where.h>
+#include <ATen/ops/zeros.h>
+#endif
+
 #include <type_traits>
 
 namespace at {
diff --git a/aten/src/ATen/native/LossMulti.h b/aten/src/ATen/native/LossMulti.h
index 54736bcc123b2..148615e7e14f1 100644
--- a/aten/src/ATen/native/LossMulti.h
+++ b/aten/src/ATen/native/LossMulti.h
@@ -1,8 +1,8 @@
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/AccumulateType.h>
-
 #pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/Dispatch.h>
+#include <ATen/TensorUtils.h>
 
 namespace at { namespace native {
 namespace {
diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp
index f59de5c8817a4..26d7a748df8d4 100644
--- a/aten/src/ATen/native/LossMultiLabelMargin.cpp
+++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp
@@ -1,10 +1,23 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/Dispatch.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/native/LossMulti.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/multilabel_margin_loss_backward_native.h>
+#include <ATen/ops/multilabel_margin_loss_forward.h>
+#include <ATen/ops/multilabel_margin_loss_forward_native.h>
+#include <ATen/ops/multilabel_margin_loss_native.h>
+#include <ATen/ops/zeros_like.h>
+#endif
+
 namespace at {
 namespace native {
 
diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp
index c7ab53f1d211b..110520cf8f950 100644
--- a/aten/src/ATen/native/LossMultiMargin.cpp
+++ b/aten/src/ATen/native/LossMultiMargin.cpp
@@ -1,9 +1,19 @@
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/AccumulateType.h>
+#include <ATen/Dispatch.h>
 #include <ATen/native/LossMulti.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/multi_margin_loss_backward_native.h>
+#include <ATen/ops/multi_margin_loss_native.h>
+#endif
+
 namespace at {
 namespace native {
 
diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp
index 79e98c877548a..8e5864b68728d 100644
--- a/aten/src/ATen/native/LossNLL.cpp
+++ b/aten/src/ATen/native/LossNLL.cpp
@@ -1,13 +1,32 @@
-#include <ATen/ATen.h>
-#include <ATen/AccumulateType.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
+#include <ATen/TensorIndexing.h>
 #include <ATen/TensorMeta.h>
+#include <ATen/TensorOperators.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/native/cpu/utils.h>
 #include <ATen/native/Resize.h>
 #include <c10/util/SmallBuffer.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/cross_entropy_loss_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/log_softmax.h>
+#include <ATen/ops/nll_loss.h>
+#include <ATen/ops/nll_loss2d.h>
+#include <ATen/ops/nll_loss_backward_native.h>
+#include <ATen/ops/nll_loss_forward.h>
+#include <ATen/ops/nll_loss_forward_native.h>
+#include <ATen/ops/nll_loss_native.h>
+#include <ATen/ops/nll_loss_nd.h>
+#include <ATen/ops/nll_loss_nd_native.h>
+#endif
+
 #include <c10/core/TensorOptions.h>
 #include <c10/util/irange.h>
 
diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp
index 6950cb2805e9e..ab7c084eb80df 100644
--- a/aten/src/ATen/native/LossNLL2d.cpp
+++ b/aten/src/ATen/native/LossNLL2d.cpp
@@ -1,12 +1,23 @@
-#include <ATen/ATen.h>
-#include <ATen/AccumulateType.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
-#include <ATen/TensorUtils.h>
 #include <ATen/native/cpu/utils.h>
 #include <ATen/native/Resize.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/nll_loss2d_backward_native.h>
+#include <ATen/ops/nll_loss2d_forward.h>
+#include <ATen/ops/nll_loss2d_forward_native.h>
+#include <ATen/ops/nll_loss2d_native.h>
+#include <ATen/ops/zeros_like.h>
+#endif
+
 namespace at {
 namespace native {
 
diff --git a/aten/src/ATen/native/MathBitsFallback.h b/aten/src/ATen/native/MathBitsFallback.h
index 4e9c2d9e98b18..84e72aa724d0e 100644
--- a/aten/src/ATen/native/MathBitsFallback.h
+++ b/aten/src/ATen/native/MathBitsFallback.h
@@ -1,12 +1,17 @@
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/core/op_registration/op_registration.h>
 #include <ATen/native/UnaryOps.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/native/Resize.h>
 #include <c10/util/irange.h>
 #include <torch/library.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/clone.h>
+#endif
+
 namespace at {
 namespace native {
 // This fallback should only be used for operations that are self inverse and have a corresponding tensor
diff --git a/aten/src/ATen/native/MaxPooling.cpp b/aten/src/ATen/native/MaxPooling.cpp
index 0f05eeac7d3e9..e809c75ba21d6 100644
--- a/aten/src/ATen/native/MaxPooling.cpp
+++ b/aten/src/ATen/native/MaxPooling.cpp
@@ -1,4 +1,5 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/NamedTensorUtils.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 #include <ATen/core/grad_mode.h>
@@ -6,6 +7,16 @@
 #include <ATen/native/MaxPooling.h>
 #include <ATen/native/Pool.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/max_pool1d_native.h>
+#include <ATen/ops/max_pool1d_with_indices.h>
+#include <ATen/ops/quantized_max_pool1d.h>
+#endif
+
 namespace at {
 namespace native {
 
diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp
index 33cc4dc7a61ce..adab802d65cd5 100644
--- a/aten/src/ATen/native/MaxUnpooling.cpp
+++ b/aten/src/ATen/native/MaxUnpooling.cpp
@@ -1,8 +1,17 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/native/cpu/MaxUnpoolKernel.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/max_unpool2d_native.h>
+#include <ATen/ops/max_unpool3d_native.h>
+#endif
+
 namespace at {
 namespace native {
 
diff --git a/aten/src/ATen/native/Memory.cpp b/aten/src/ATen/native/Memory.cpp
index df6949b2d7d95..2b66f08933934 100644
--- a/aten/src/ATen/native/Memory.cpp
+++ b/aten/src/ATen/native/Memory.cpp
@@ -1,6 +1,17 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/MemoryOverlap.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_debug_has_internal_overlap_native.h>
+#include <ATen/ops/_pin_memory.h>
+#include <ATen/ops/is_pinned_native.h>
+#include <ATen/ops/pin_memory_native.h>
+#endif
+
 namespace at {
 namespace native {
 
diff --git a/aten/src/ATen/native/NNPACK.cpp b/aten/src/ATen/native/NNPACK.cpp
index 3df0a0623e437..544641f091a35 100644
--- a/aten/src/ATen/native/NNPACK.cpp
+++ b/aten/src/ATen/native/NNPACK.cpp
@@ -1,10 +1,21 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Config.h>
 
 #include <c10/util/CallOnce.h>
 
 #include <thread>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_nnpack_available_native.h>
+#include <ATen/ops/_nnpack_spatial_convolution_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/zeros.h>
+#endif
+
 #if !AT_NNPACK_ENABLED()
 
 namespace at {
diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp
index ea604c426c3b4..a9cf36a004f4c 100644
--- a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp
+++ b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp
@@ -1,5 +1,5 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/Dispatch.h>
 #include <ATen/TensorMeta.h>
 #include <ATen/TensorUtils.h>
 
@@ -8,6 +8,17 @@
 #include <ATen/native/CPUBlas.h>
 #include <ATen/native/im2col.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/ones.h>
+#include <ATen/ops/slow_conv_transpose2d_native.h>
+#include <ATen/ops/sum.h>
+#include <ATen/ops/zeros.h>
+#endif
+
 #include <c10/core/TensorOptions.h>
 #include <c10/util/irange.h>
 
diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp
index 3d34091fd036a..cf60f56f9df44 100644
--- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp
+++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp
@@ -1,11 +1,23 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
 #include <ATen/TensorUtils.h>
 
 #include <ATen/native/ConvUtils.h>
 #include <ATen/native/CPUBlas.h>
 #include <ATen/native/vol2col.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/ones.h>
+#include <ATen/ops/slow_conv_transpose3d_native.h>
+#include <ATen/ops/sum.h>
+#endif
+
 namespace at {
 namespace native {
 
diff --git a/aten/src/ATen/native/NaiveDilatedConvolution.cpp b/aten/src/ATen/native/NaiveDilatedConvolution.cpp
index fa7b30f5977ef..827bf204b093f 100644
--- a/aten/src/ATen/native/NaiveDilatedConvolution.cpp
+++ b/aten/src/ATen/native/NaiveDilatedConvolution.cpp
@@ -1,14 +1,25 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
+#include <ATen/TensorUtils.h>
 #include <ATen/native/ConvUtils.h>
 #include <ATen/native/CPUBlas.h>
 #include <ATen/native/DilatedConvolutionUtils.h>
 #include <ATen/native/im2col.h>
 #include <ATen/native/vol2col.h>
-#include <ATen/Utils.h>
 #include <c10/util/accumulate.h>
 #include <c10/util/irange.h>
 #include <tuple>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/slow_conv_dilated2d_native.h>
+#include <ATen/ops/slow_conv_dilated3d_native.h>
+#endif
+
 namespace at {
 namespace native {
 namespace {
diff --git a/aten/src/ATen/native/NamedTensor.cpp b/aten/src/ATen/native/NamedTensor.cpp
index d725c26a14631..6ee2f095b6d09 100644
--- a/aten/src/ATen/native/NamedTensor.cpp
+++ b/aten/src/ATen/native/NamedTensor.cpp
@@ -1,8 +1,30 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
-
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/NamedTensorUtils.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/align_as_native.h>
+#include <ATen/ops/align_tensors_native.h>
+#include <ATen/ops/align_to_native.h>
+#include <ATen/ops/gather_native.h>
+#include <ATen/ops/index_add_native.h>
+#include <ATen/ops/index_copy_native.h>
+#include <ATen/ops/index_fill.h>
+#include <ATen/ops/index_fill_native.h>
+#include <ATen/ops/index_select_native.h>
+#include <ATen/ops/refine_names_native.h>
+#include <ATen/ops/rename_native.h>
+#include <ATen/ops/scatter_add_native.h>
+#include <ATen/ops/scatter_native.h>
+#include <ATen/ops/sort_native.h>
+#include <ATen/ops/squeeze.h>
+#include <ATen/ops/squeeze_native.h>
+#include <ATen/ops/zeros_like_ops.h>
+#endif
+
 #include <c10/util/irange.h>
 
 #include <bitset>
diff --git a/aten/src/ATen/native/vol2col.h b/aten/src/ATen/native/vol2col.h
index 12718a8f00afc..2b2ee3b57b0c4 100644
--- a/aten/src/ATen/native/vol2col.h
+++ b/aten/src/ATen/native/vol2col.h
@@ -1,8 +1,6 @@
 #pragma once
 
-#include <ATen/ATen.h>
-#include <ATen/TensorUtils.h>
-#include <ATen/Utils.h>
+#include <cstring>
 
 namespace at {
 namespace native {

From f7344e69c037cd76ec238e0796140d79ec8c57c6 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 17 Oct 2022 18:57:07 +0100
Subject: [PATCH 0091/1922] ATen/native (4/6): Use per-operator headers
 (#75574)

Differential Revision: [D40126697](https://our.internmc.facebook.com/intern/diff/D40126697)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/75574
Approved by: https://github.com/malfet
---
 aten/src/ATen/PadNd.h                        |   2 +
 aten/src/ATen/native/NegateFallback.cpp      |   1 +
 aten/src/ATen/native/Normalization.cpp       |  44 +++++++-
 aten/src/ATen/native/Onehot.cpp              |  12 ++-
 aten/src/ATen/native/PackedSequence.cpp      |  17 ++-
 aten/src/ATen/native/PadNd.cpp               |  23 +++-
 aten/src/ATen/native/PixelShuffle.cpp        |  17 ++-
 aten/src/ATen/native/PointwiseOps.cpp        |  15 ++-
 aten/src/ATen/native/Pooling.cpp             |  27 ++++-
 aten/src/ATen/native/Pow.cpp                 |  15 ++-
 aten/src/ATen/native/QuantizedLinear.cpp     |  26 +++--
 aten/src/ATen/native/RNN.cpp                 |  48 ++++++++-
 aten/src/ATen/native/RangeFactories.cpp      |  16 ++-
 aten/src/ATen/native/ReduceAllOps.cpp        |  15 ++-
 aten/src/ATen/native/ReduceOps.cpp           | 107 +++++++++++++++++--
 aten/src/ATen/native/ReflectionPad.cpp       |  21 +++-
 aten/src/ATen/native/Repeat.cpp              |  13 ++-
 aten/src/ATen/native/ReplicationPadding.cpp  |  19 +++-
 aten/src/ATen/native/Resize.cpp              |  11 +-
 aten/src/ATen/native/RowwisePrune.cpp        |  11 +-
 aten/src/ATen/native/Scalar.cpp              |  12 ++-
 aten/src/ATen/native/SegmentReduce.cpp       |  15 ++-
 aten/src/ATen/native/SobolEngineOps.cpp      |  16 ++-
 aten/src/ATen/native/SobolEngineOpsUtils.cpp |   1 +
 aten/src/ATen/native/SobolEngineOpsUtils.h   |  10 +-
 aten/src/ATen/native/SoftMax.cpp             |  27 ++++-
 aten/src/ATen/native/Sorting.cpp             |  36 ++++++-
 aten/src/ATen/native/SpectralOps.cpp         |  69 ++++++++++--
 aten/src/ATen/native/SummaryOps.cpp          |  11 +-
 aten/src/ATen/native/TensorDimApply.h        |   3 +-
 30 files changed, 584 insertions(+), 76 deletions(-)

diff --git a/aten/src/ATen/PadNd.h b/aten/src/ATen/PadNd.h
index 2c0d67e9d5d3f..573d1a7b88ab7 100644
--- a/aten/src/ATen/PadNd.h
+++ b/aten/src/ATen/PadNd.h
@@ -1,4 +1,6 @@
 #pragma once
+#include <c10/util/Exception.h>
+#include <c10/util/string_view.h>
 
 namespace at {
 
diff --git a/aten/src/ATen/native/NegateFallback.cpp b/aten/src/ATen/native/NegateFallback.cpp
index a2b134a91e40e..0a34b4f4331d6 100644
--- a/aten/src/ATen/native/NegateFallback.cpp
+++ b/aten/src/ATen/native/NegateFallback.cpp
@@ -1,3 +1,4 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/MathBitsFallback.h>
 #include <ATen/native/MathBitFallThroughLists.h>
 
diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp
index 6911d780c1d0e..5169c5e58e9ad 100644
--- a/aten/src/ATen/native/Normalization.cpp
+++ b/aten/src/ATen/native/Normalization.cpp
@@ -1,18 +1,52 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/AccumulateType.h>
-#include <ATen/CPUApplyUtils.h>
-#include <ATen/Parallel.h>
 #include <ATen/Config.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Parallel.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/TensorUtils.h>
 
 #include <ATen/detail/CUDAHooksInterface.h>
-#include <ATen/native/TensorIterator.h>
 #include <ATen/native/cpu/Loops.h>
 #include <ATen/native/batch_norm.h>
 #include <ATen/native/Normalization.h>
 #include <ATen/native/cpu/mixed_data_type.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_batch_norm_impl_index.h>
+#include <ATen/ops/_batch_norm_impl_index_backward_native.h>
+#include <ATen/ops/_batch_norm_impl_index_native.h>
+#include <ATen/ops/alias.h>
+#include <ATen/ops/batch_norm.h>
+#include <ATen/ops/batch_norm_native.h>
+#include <ATen/ops/batch_norm_update_stats_native.h>
+#include <ATen/ops/cudnn_batch_norm.h>
+#include <ATen/ops/cudnn_batch_norm_backward.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/instance_norm_native.h>
+#include <ATen/ops/linalg_vector_norm.h>
+#include <ATen/ops/mean.h>
+#include <ATen/ops/miopen_batch_norm.h>
+#include <ATen/ops/miopen_batch_norm_backward.h>
+#include <ATen/ops/mul.h>
+#include <ATen/ops/native_batch_norm.h>
+#include <ATen/ops/native_batch_norm_backward.h>
+#include <ATen/ops/native_batch_norm_backward_native.h>
+#include <ATen/ops/native_batch_norm_native.h>
+#include <ATen/ops/renorm_native.h>
+#include <ATen/ops/sum.h>
+#include <ATen/ops/sqrt.h>
+#endif
+
 #include <vector>
 #include <c10/core/SymIntArrayRef.h>
 
diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp
index a0c061062174b..41b7a69618636 100644
--- a/aten/src/ATen/native/Onehot.cpp
+++ b/aten/src/ATen/native/Onehot.cpp
@@ -1,4 +1,14 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/one_hot_native.h>
+#include <ATen/ops/zeros.h>
+#endif
 
 namespace at { namespace native {
 
diff --git a/aten/src/ATen/native/PackedSequence.cpp b/aten/src/ATen/native/PackedSequence.cpp
index 736829eb6d118..19b12b0819607 100644
--- a/aten/src/ATen/native/PackedSequence.cpp
+++ b/aten/src/ATen/native/PackedSequence.cpp
@@ -1,5 +1,20 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_pack_padded_sequence_backward_native.h>
+#include <ATen/ops/_pack_padded_sequence_native.h>
+#include <ATen/ops/_pad_packed_sequence_native.h>
+#include <ATen/ops/cat.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/full.h>
+#include <ATen/ops/pad_sequence_native.h>
+#include <ATen/ops/zeros.h>
+#include <ATen/ops/zeros_like_ops.h>
+#endif
 
 #include <c10/util/irange.h>
 
diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp
index c6b18c1257b51..9421d537717c8 100644
--- a/aten/src/ATen/native/PadNd.cpp
+++ b/aten/src/ATen/native/PadNd.cpp
@@ -1,8 +1,29 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/PadNd.h>
+#include <ATen/core/Tensor.h>
 
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_empty_affine_quantized.h>
+#include <ATen/ops/_pad_circular.h>
+#include <ATen/ops/_pad_circular_native.h>
+#include <ATen/ops/_pad_enum_native.h>
+#include <ATen/ops/constant_pad_nd.h>
+#include <ATen/ops/constant_pad_nd_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/pad_native.h>
+#include <ATen/ops/reflection_pad1d.h>
+#include <ATen/ops/reflection_pad2d.h>
+#include <ATen/ops/reflection_pad3d.h>
+#include <ATen/ops/replication_pad1d.h>
+#include <ATen/ops/replication_pad2d.h>
+#include <ATen/ops/replication_pad3d.h>
+#endif
+
 namespace at { namespace native {
 
 Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) {
diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp
index 2a100321a6400..e535909a73429 100644
--- a/aten/src/ATen/native/PixelShuffle.cpp
+++ b/aten/src/ATen/native/PixelShuffle.cpp
@@ -1,10 +1,21 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/TensorTransformations.h>
+#include <ATen/native/cpu/PixelShuffleKernel.h>
 
-#include <ATen/Functions.h>
-#include <ATen/NativeFunctions.h>
 #include <c10/util/Exception.h>
 
-#include <ATen/native/cpu/PixelShuffleKernel.h>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/pixel_shuffle_native.h>
+#include <ATen/ops/pixel_unshuffle_native.h>
+#endif
+
+#include <algorithm>
+#include <numeric>
+#include <vector>
 
 namespace at {
 namespace native {
diff --git a/aten/src/ATen/native/PointwiseOps.cpp b/aten/src/ATen/native/PointwiseOps.cpp
index a99bc959eb958..8259135ce14a3 100644
--- a/aten/src/ATen/native/PointwiseOps.cpp
+++ b/aten/src/ATen/native/PointwiseOps.cpp
@@ -1,12 +1,17 @@
 // Ternary and higher-order pointwise operations
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/PointwiseOps.h>
 
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
-#include <ATen/MemoryOverlap.h>
-#include <ATen/native/TensorIterator.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
 
-#include <ATen/NamedTensorUtils.h>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/addcdiv_native.h>
+#include <ATen/ops/addcmul_native.h>
+#endif
 
 namespace at {
 namespace meta {
diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp
index 724c53fdd0c00..fcbe741ab0ea0 100644
--- a/aten/src/ATen/native/Pooling.cpp
+++ b/aten/src/ATen/native/Pooling.cpp
@@ -1,12 +1,31 @@
-#include <ATen/ATen.h>
-
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/NamedTensorUtils.h>
 #include <ATen/native/xnnpack/Engine.h>
-#include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_mps_max_pool2d.h>
+#include <ATen/ops/adaptive_avg_pool1d_native.h>
+#include <ATen/ops/adaptive_avg_pool2d.h>
+#include <ATen/ops/adaptive_max_pool1d_native.h>
+#include <ATen/ops/adaptive_max_pool2d.h>
+#include <ATen/ops/avg_pool1d_native.h>
+#include <ATen/ops/avg_pool2d.h>
+#include <ATen/ops/max_pool1d_with_indices_native.h>
+#include <ATen/ops/max_pool2d_native.h>
+#include <ATen/ops/max_pool2d_with_indices.h>
+#include <ATen/ops/max_pool3d_native.h>
+#include <ATen/ops/max_pool3d_with_indices.h>
+#include <ATen/ops/mkldnn_max_pool2d.h>
+#include <ATen/ops/mkldnn_max_pool3d.h>
+#include <ATen/ops/quantized_max_pool2d.h>
+#endif
+
 #include <tuple>
 
 namespace at { namespace native {
diff --git a/aten/src/ATen/native/Pow.cpp b/aten/src/ATen/native/Pow.cpp
index 4326853a8165a..7050524acebf2 100644
--- a/aten/src/ATen/native/Pow.cpp
+++ b/aten/src/ATen/native/Pow.cpp
@@ -1,11 +1,20 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/Pow.h>
 
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/native/TensorIterator.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/ScalarOps.h>
 #include <ATen/native/Resize.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/float_power_native.h>
+#include <ATen/ops/pow.h>
+#include <ATen/ops/pow_native.h>
+#include <ATen/ops/result_type.h>
+#endif
+
 namespace at {
 namespace meta {
 
diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp
index af7643ec18b6c..002bb1adc4386 100644
--- a/aten/src/ATen/native/QuantizedLinear.cpp
+++ b/aten/src/ATen/native/QuantizedLinear.cpp
@@ -1,20 +1,28 @@
-#include <array>
-#include <cctype>
-#include <chrono>
-#include <cmath>
-#include <cstddef>
-#include <sstream>
-#include <string>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <vector>
 
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/Parallel.h>
 #include <ATen/WrapDimUtilsMulti.h>
 #include <ATen/cpp_custom_type_hack.h>
 #include <ATen/native/quantized/cpu/fbgemm_utils.h>
 #include <ATen/native/quantized/PackedParams.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like_native.h>
+#include <ATen/ops/fbgemm_linear_fp16_weight_fp32_activation_native.h>
+#include <ATen/ops/fbgemm_linear_fp16_weight_native.h>
+#include <ATen/ops/fbgemm_linear_int8_weight_fp32_activation_native.h>
+#include <ATen/ops/fbgemm_linear_int8_weight_native.h>
+#include <ATen/ops/fbgemm_linear_quantize_weight_native.h>
+#include <ATen/ops/fbgemm_pack_gemm_matrix_fp16_native.h>
+#include <ATen/ops/fbgemm_pack_quantized_matrix_native.h>
+#endif
+
 #include <c10/util/irange.h>
 
 #ifdef USE_FBGEMM
diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp
index 670395893d8ef..52efc6929f54e 100644
--- a/aten/src/ATen/native/RNN.cpp
+++ b/aten/src/ATen/native/RNN.cpp
@@ -1,8 +1,10 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/RNN.h>
 
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
-#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/core/List.h>
+#include <ATen/Context.h>
+#include <ATen/TensorOperators.h>
 #include <ATen/native/quantized/PackedParams.h>
 #include <ATen/native/quantized/cpu/fbgemm_utils.h>
 #include <ATen/native/quantized/cpu/QnnpackUtils.h>
@@ -10,6 +12,46 @@
 #include <torch/custom_class.h>
 #include <torch/library.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_lstm_mps.h>
+#include <ATen/ops/_thnn_differentiable_gru_cell_backward_native.h>
+#include <ATen/ops/_thnn_differentiable_lstm_cell_backward_native.h>
+#include <ATen/ops/_thnn_fused_gru_cell.h>
+#include <ATen/ops/_thnn_fused_lstm_cell.h>
+#include <ATen/ops/_thnn_fused_lstm_cell_backward_impl.h>
+#include <ATen/ops/_use_cudnn_rnn_flatten_weight_native.h>
+#include <ATen/ops/cat.h>
+#include <ATen/ops/cudnn_is_acceptable.h>
+#include <ATen/ops/dropout.h>
+#include <ATen/ops/fbgemm_linear_int8_weight_fp32_activation.h>
+#include <ATen/ops/fbgemm_linear_quantize_weight_native.h>
+#include <ATen/ops/fbgemm_pack_quantized_matrix_native.h>
+#include <ATen/ops/gru_cell_native.h>
+#include <ATen/ops/gru_native.h>
+#include <ATen/ops/linear.h>
+#include <ATen/ops/lstm_cell_native.h>
+#include <ATen/ops/lstm_native.h>
+#include <ATen/ops/matmul.h>
+#include <ATen/ops/quantized_gru_cell_native.h>
+#include <ATen/ops/quantized_lstm_cell_native.h>
+#include <ATen/ops/quantized_rnn_relu_cell_native.h>
+#include <ATen/ops/quantized_rnn_tanh_cell_native.h>
+#include <ATen/ops/relu.h>
+#include <ATen/ops/rnn_relu_cell_native.h>
+#include <ATen/ops/rnn_relu_native.h>
+#include <ATen/ops/rnn_tanh_cell_native.h>
+#include <ATen/ops/rnn_tanh_native.h>
+#include <ATen/ops/sigmoid_backward.h>
+#include <ATen/ops/stack.h>
+#include <ATen/ops/tanh.h>
+#include <ATen/ops/tanh_backward.h>
+#include <ATen/ops/zeros_like.h>
+#include <ATen/ops/zeros_like_ops.h>
+#endif
+
 int register_linear_params();
 
 namespace at { namespace native {
diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp
index 038da93456edb..408bf0a27e6fe 100644
--- a/aten/src/ATen/native/RangeFactories.cpp
+++ b/aten/src/ATen/native/RangeFactories.cpp
@@ -1,13 +1,23 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/RangeFactories.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/AccumulateType.h>
-#include <ATen/Parallel.h>
 #include <ATen/Dispatch.h>
-#include <ATen/native/TensorIterator.h>
+#include <ATen/Parallel.h>
+#include <ATen/TensorIterator.h>
 #include <c10/util/irange.h>
 #include <cmath>
 #include <limits>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/arange_native.h>
+#include <ATen/ops/linspace_native.h>
+#include <ATen/ops/logspace_native.h>
+#include <ATen/ops/range_native.h>
+#endif
+
 namespace at { namespace native {
 
 
diff --git a/aten/src/ATen/native/ReduceAllOps.cpp b/aten/src/ATen/native/ReduceAllOps.cpp
index 1ef5e9b93733c..e1d51a1666af2 100644
--- a/aten/src/ATen/native/ReduceAllOps.cpp
+++ b/aten/src/ATen/native/ReduceAllOps.cpp
@@ -1,8 +1,21 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/ReduceAllOps.h>
 #include <ATen/native/Resize.h>
 
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_aminmax_native.h>
+#include <ATen/ops/aminmax.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/max.h>
+#include <ATen/ops/max_native.h>
+#include <ATen/ops/min.h>
+#include <ATen/ops/min_native.h>
+#endif
 
 namespace at {
 namespace native {
diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp
index 2bb01abd51b5f..2fe5eee4a286d 100644
--- a/aten/src/ATen/native/ReduceOps.cpp
+++ b/aten/src/ATen/native/ReduceOps.cpp
@@ -1,21 +1,114 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/ReduceOps.h>
 
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/AccumulateType.h>
-#include <ATen/ExpandUtils.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
 #include <ATen/WrapDimUtils.h>
 #include <ATen/WrapDimUtilsMulti.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/NamedTensorUtils.h>
 #include <ATen/native/ReduceOpsUtils.h>
 #include <ATen/native/Resize.h>
-#include <ATen/native/TensorIterator.h>
-#include <ATen/NamedTensorUtils.h>
 #include <ATen/native/TensorDimApply.h>
-#include <ATen/native/SharedReduceOps.h>
 #include <ATen/core/grad_mode.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_cummax_helper.h>
+#include <ATen/ops/_cummax_helper_native.h>
+#include <ATen/ops/_cummin_helper.h>
+#include <ATen/ops/_cummin_helper_native.h>
+#include <ATen/ops/_logcumsumexp.h>
+#include <ATen/ops/_logcumsumexp_native.h>
+#include <ATen/ops/add.h>
+#include <ATen/ops/all_meta.h>
+#include <ATen/ops/all_native.h>
+#include <ATen/ops/amax.h>
+#include <ATen/ops/amax_meta.h>
+#include <ATen/ops/amax_native.h>
+#include <ATen/ops/amin_meta.h>
+#include <ATen/ops/amin_native.h>
+#include <ATen/ops/aminmax_meta.h>
+#include <ATen/ops/aminmax_native.h>
+#include <ATen/ops/any_meta.h>
+#include <ATen/ops/any_native.h>
+#include <ATen/ops/argmax_meta.h>
+#include <ATen/ops/argmax_native.h>
+#include <ATen/ops/argmin_meta.h>
+#include <ATen/ops/argmin_native.h>
+#include <ATen/ops/cat.h>
+#include <ATen/ops/complex.h>
+#include <ATen/ops/cummax.h>
+#include <ATen/ops/cummax_native.h>
+#include <ATen/ops/cummaxmin_backward_native.h>
+#include <ATen/ops/cummin.h>
+#include <ATen/ops/cummin_native.h>
+#include <ATen/ops/cumprod.h>
+#include <ATen/ops/cumprod_backward_native.h>
+#include <ATen/ops/cumprod_meta.h>
+#include <ATen/ops/cumprod_native.h>
+#include <ATen/ops/cumsum.h>
+#include <ATen/ops/cumsum_meta.h>
+#include <ATen/ops/cumsum_native.h>
+#include <ATen/ops/diff_native.h>
+#include <ATen/ops/dist_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/equal_native.h>
+#include <ATen/ops/exp.h>
+#include <ATen/ops/gather.h>
+#include <ATen/ops/gradient_native.h>
+#include <ATen/ops/imag.h>
+#include <ATen/ops/isnan_native.h>
+#include <ATen/ops/logcumsumexp.h>
+#include <ATen/ops/logcumsumexp_native.h>
+#include <ATen/ops/logical_xor.h>
+#include <ATen/ops/logsumexp.h>
+#include <ATen/ops/logsumexp_native.h>
+#include <ATen/ops/mean.h>
+#include <ATen/ops/mean_meta.h>
+#include <ATen/ops/mean_native.h>
+#include <ATen/ops/nanmean_native.h>
+#include <ATen/ops/nansum.h>
+#include <ATen/ops/nansum_native.h>
+#include <ATen/ops/narrow.h>
+#include <ATen/ops/native_norm.h>
+#include <ATen/ops/norm.h>
+#include <ATen/ops/norm_meta.h>
+#include <ATen/ops/norm_native.h>
+#include <ATen/ops/ones.h>
+#include <ATen/ops/prod.h>
+#include <ATen/ops/prod_meta.h>
+#include <ATen/ops/prod_native.h>
+#include <ATen/ops/real.h>
+#include <ATen/ops/slice.h>
+#include <ATen/ops/special_logsumexp_native.h>
+#include <ATen/ops/sqrt.h>
+#include <ATen/ops/stack.h>
+#include <ATen/ops/std.h>
+#include <ATen/ops/std_mean.h>
+#include <ATen/ops/std_mean_native.h>
+#include <ATen/ops/std_native.h>
+#include <ATen/ops/sub.h>
+#include <ATen/ops/sum.h>
+#include <ATen/ops/sum_meta.h>
+#include <ATen/ops/sum_native.h>
+#include <ATen/ops/trace_native.h>
+#include <ATen/ops/value_selecting_reduction_backward_native.h>
+#include <ATen/ops/var.h>
+#include <ATen/ops/var_mean.h>
+#include <ATen/ops/var_mean_native.h>
+#include <ATen/ops/var_native.h>
+#include <ATen/ops/zeros.h>
+#include <ATen/ops/zeros_like.h>
+#endif
+
 #include <c10/util/irange.h>
 #include <c10/util/SmallBuffer.h>
 
@@ -24,9 +117,7 @@
 #include <limits>
 #include <numeric>
 #include <vector>
-#include <map>
 #include <cmath>
-#include <cfloat>
 #include <type_traits>
 
 namespace at {
diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp
index 7824de63805f3..3a6ad683d0457 100644
--- a/aten/src/ATen/native/ReflectionPad.cpp
+++ b/aten/src/ATen/native/ReflectionPad.cpp
@@ -1,9 +1,26 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
+#include <ATen/TensorMeta.h>
 #include <ATen/quantized/Quantizer.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_empty_affine_quantized.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/reflection_pad1d_backward_native.h>
+#include <ATen/ops/reflection_pad1d_native.h>
+#include <ATen/ops/reflection_pad2d_backward_native.h>
+#include <ATen/ops/reflection_pad2d_native.h>
+#include <ATen/ops/reflection_pad3d_backward_native.h>
+#include <ATen/ops/reflection_pad3d_native.h>
+#include <ATen/ops/zeros_like.h>
+#endif
+
 namespace at {
 
 namespace meta {
diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp
index b6e5c04f77026..b671a2232044b 100644
--- a/aten/src/ATen/native/Repeat.cpp
+++ b/aten/src/ATen/native/Repeat.cpp
@@ -1,8 +1,19 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
 #include <ATen/native/Repeat.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/repeat_interleave.h>
+#include <ATen/ops/repeat_interleave_native.h>
+#endif
+
 template <typename index_t>
 static void compute_cpu(
     index_t* repeat_ptr,
diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp
index 40fdb788a4ffa..d0a4ea919acbf 100644
--- a/aten/src/ATen/native/ReplicationPadding.cpp
+++ b/aten/src/ATen/native/ReplicationPadding.cpp
@@ -1,9 +1,24 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
+#include <ATen/TensorMeta.h>
 #include <c10/util/irange.h>
 #include <algorithm>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/replication_pad1d_backward_native.h>
+#include <ATen/ops/replication_pad1d_native.h>
+#include <ATen/ops/replication_pad2d_backward_native.h>
+#include <ATen/ops/replication_pad2d_native.h>
+#include <ATen/ops/replication_pad3d_backward_native.h>
+#include <ATen/ops/replication_pad3d_native.h>
+#include <ATen/ops/zeros_like.h>
+#endif
+
 namespace at {
 
 namespace meta {
diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp
index 08286f3983cc9..bd47a25e69601 100644
--- a/aten/src/ATen/native/Resize.cpp
+++ b/aten/src/ATen/native/Resize.cpp
@@ -1,9 +1,16 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/ResizeCommon.h>
+#include <ATen/NamedTensorUtils.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 
-#include <c10/core/TensorOptions.h>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/resize_as_native.h>
+#include <ATen/ops/resize_native.h>
+#endif
 
 namespace at { namespace native {
 
diff --git a/aten/src/ATen/native/RowwisePrune.cpp b/aten/src/ATen/native/RowwisePrune.cpp
index 40ae2215cbccc..c27707c4d3075 100644
--- a/aten/src/ATen/native/RowwisePrune.cpp
+++ b/aten/src/ATen/native/RowwisePrune.cpp
@@ -1,8 +1,17 @@
 // Copyright 2004-present Facebook. All Rights Reserved.
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_rowwise_prune_native.h>
+#include <ATen/ops/empty.h>
+#endif
 
 namespace at {
 namespace native {
diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp
index 7342c4806d44c..f8932ea03bb2e 100644
--- a/aten/src/ATen/native/Scalar.cpp
+++ b/aten/src/ATen/native/Scalar.cpp
@@ -1,5 +1,15 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_local_scalar_dense.h>
+#include <ATen/ops/_local_scalar_dense_native.h>
+#include <ATen/ops/item_native.h>
+#endif
 
 namespace at {
 namespace native {
diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp
index 3e562b7cf859f..1e5e28dab86b2 100644
--- a/aten/src/ATen/native/SegmentReduce.cpp
+++ b/aten/src/ATen/native/SegmentReduce.cpp
@@ -1,10 +1,23 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/SegmentReduce.h>
 
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
 #include <ATen/NumericUtils.h>
+#include <ATen/TensorOperators.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_segment_reduce_backward_native.h>
+#include <ATen/ops/all.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/segment_reduce_native.h>
+#include <ATen/ops/zeros.h>
+#endif
+
 namespace at {
 namespace native {
 
diff --git a/aten/src/ATen/native/SobolEngineOps.cpp b/aten/src/ATen/native/SobolEngineOps.cpp
index 48366976a2e70..187faeba16a7b 100644
--- a/aten/src/ATen/native/SobolEngineOps.cpp
+++ b/aten/src/ATen/native/SobolEngineOps.cpp
@@ -1,11 +1,21 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
-#include <ATen/NativeFunctions.h>
 
 #include <ATen/native/SobolEngineOpsUtils.h>
 #include <c10/util/irange.h>
 
-#include <vector>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_sobol_engine_draw_native.h>
+#include <ATen/ops/_sobol_engine_ff_native.h>
+#include <ATen/ops/_sobol_engine_initialize_state_native.h>
+#include <ATen/ops/_sobol_engine_scramble_native.h>
+#include <ATen/ops/arange_native.h>
+#include <ATen/ops/empty.h>
+#endif
 
 namespace at {
 namespace native {
diff --git a/aten/src/ATen/native/SobolEngineOpsUtils.cpp b/aten/src/ATen/native/SobolEngineOpsUtils.cpp
index ef7cbb1faae92..709d5c06d3c97 100644
--- a/aten/src/ATen/native/SobolEngineOpsUtils.cpp
+++ b/aten/src/ATen/native/SobolEngineOpsUtils.cpp
@@ -1,4 +1,5 @@
 /// This file contains tensor-agnostic SoboleEngine constants
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/SobolEngineOpsUtils.h>
 
 /*
diff --git a/aten/src/ATen/native/SobolEngineOpsUtils.h b/aten/src/ATen/native/SobolEngineOpsUtils.h
index d3d7a362f2e87..495a43ed8a7cf 100644
--- a/aten/src/ATen/native/SobolEngineOpsUtils.h
+++ b/aten/src/ATen/native/SobolEngineOpsUtils.h
@@ -1,6 +1,14 @@
 /// This file contains some tensor-agnostic operations to be used in the
 /// core functions of the `SobolEngine`
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/arange.h>
+#include <ATen/ops/mul.h>
+#include <ATen/ops/pow.h>
+#endif
 
 namespace at {
 namespace native {
diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp
index d9d1b90534d73..0e3dafb24e9e8 100644
--- a/aten/src/ATen/native/SoftMax.cpp
+++ b/aten/src/ATen/native/SoftMax.cpp
@@ -1,13 +1,36 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/AccumulateType.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
 #include <ATen/TensorMeta.h>
 #include <ATen/TensorUtils.h>
+#include <ATen/TensorIterator.h>
 #include <ATen/WrapDimUtils.h>
 #include <ATen/native/cpu/SoftmaxKernel.h>
 #include <ATen/NamedTensorUtils.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_log_softmax.h>
+#include <ATen/ops/_log_softmax_backward_data_native.h>
+#include <ATen/ops/_log_softmax_native.h>
+#include <ATen/ops/_masked_softmax_native.h>
+#include <ATen/ops/_softmax.h>
+#include <ATen/ops/_softmax_backward_data_native.h>
+#include <ATen/ops/_softmax_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/log_softmax.h>
+#include <ATen/ops/log_softmax_native.h>
+#include <ATen/ops/softmax.h>
+#include <ATen/ops/softmax_native.h>
+#include <ATen/ops/special_log_softmax_native.h>
+#include <ATen/ops/special_softmax_native.h>
+#endif
+
 #include <c10/core/TensorOptions.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/irange.h>
diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp
index 66b9daf7fad8c..3b50d7744aa28 100644
--- a/aten/src/ATen/native/Sorting.cpp
+++ b/aten/src/ATen/native/Sorting.cpp
@@ -1,8 +1,16 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
+#include <ATen/ExpandUtils.h>
 #include <ATen/MemoryOverlap.h>
 #include <ATen/NamedTensorUtils.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/Parallel.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/TensorUtils.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 #include <ATen/WrapDimUtils.h>
 #include <ATen/native/Resize.h>
@@ -11,6 +19,32 @@
 #include <ATen/native/ReduceOpsUtils.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/arange.h>
+#include <ATen/ops/argsort_native.h>
+#include <ATen/ops/broadcast_tensors.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/full.h>
+#include <ATen/ops/full_like.h>
+#include <ATen/ops/kthvalue.h>
+#include <ATen/ops/kthvalue_native.h>
+#include <ATen/ops/masked_fill.h>
+#include <ATen/ops/median.h>
+#include <ATen/ops/median_native.h>
+#include <ATen/ops/msort_native.h>
+#include <ATen/ops/nanmedian.h>
+#include <ATen/ops/nanmedian_native.h>
+#include <ATen/ops/nanquantile_native.h>
+#include <ATen/ops/quantile_native.h>
+#include <ATen/ops/scalar_tensor.h>
+#include <ATen/ops/sort.h>
+#include <ATen/ops/sort_native.h>
+#include <ATen/ops/topk_native.h>
+#endif
+
 #include <utility>
 
 namespace at {
diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp
index f39eeaccf9d4f..2840b1651dba5 100644
--- a/aten/src/ATen/native/SpectralOps.cpp
+++ b/aten/src/ATen/native/SpectralOps.cpp
@@ -1,16 +1,67 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Config.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 #include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/native/SpectralOpsUtils.h>
-#include <ATen/native/TensorIterator.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorOperators.h>
 #include <ATen/WrapDimUtils.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_cufft_clear_plan_cache_native.h>
+#include <ATen/ops/_cufft_get_plan_cache_max_size_native.h>
+#include <ATen/ops/_cufft_get_plan_cache_size_native.h>
+#include <ATen/ops/_cufft_set_plan_cache_max_size_native.h>
+#include <ATen/ops/_fft_c2c.h>
+#include <ATen/ops/_fft_c2r.h>
+#include <ATen/ops/_fft_r2c.h>
+#include <ATen/ops/arange.h>
+#include <ATen/ops/arange_native.h>
+#include <ATen/ops/col2im.h>
+#include <ATen/ops/conj.h>
+#include <ATen/ops/conj_physical.h>
+#include <ATen/ops/constant_pad_nd.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/fft_fft2_native.h>
+#include <ATen/ops/fft_fft_native.h>
+#include <ATen/ops/fft_fftfreq_native.h>
+#include <ATen/ops/fft_fftn_native.h>
+#include <ATen/ops/fft_fftshift_native.h>
+#include <ATen/ops/fft_hfft2_native.h>
+#include <ATen/ops/fft_hfft_native.h>
+#include <ATen/ops/fft_hfftn_native.h>
+#include <ATen/ops/fft_ifft2_native.h>
+#include <ATen/ops/fft_ifft_native.h>
+#include <ATen/ops/fft_ifftn_native.h>
+#include <ATen/ops/fft_ifftshift_native.h>
+#include <ATen/ops/fft_ihfft2_native.h>
+#include <ATen/ops/fft_ihfft_native.h>
+#include <ATen/ops/fft_ihfftn_native.h>
+#include <ATen/ops/fft_irfft2_native.h>
+#include <ATen/ops/fft_irfft_native.h>
+#include <ATen/ops/fft_irfftn_native.h>
+#include <ATen/ops/fft_rfft2_native.h>
+#include <ATen/ops/fft_rfft_native.h>
+#include <ATen/ops/fft_rfftfreq_native.h>
+#include <ATen/ops/fft_rfftn_native.h>
+#include <ATen/ops/istft_native.h>
+#include <ATen/ops/ones.h>
+#include <ATen/ops/pad.h>
+#include <ATen/ops/roll.h>
+#include <ATen/ops/stft.h>
+#include <ATen/ops/stft_native.h>
+#include <ATen/ops/view_as_complex.h>
+#include <ATen/ops/view_as_real.h>
+#include <ATen/ops/zeros.h>
+#include <ATen/ops/zeros_like_ops.h>
+#endif
+
 #include <algorithm>
-#include <vector>
-#include <cmath>
 
 namespace at { namespace native {
 
@@ -157,7 +208,7 @@ Tensor fft_c2r(c10::string_view function_name,
   const auto norm = norm_from_string(norm_str, forward);
   if (forward) {
     // FIXME: _fft does not support complex_output=false with inverse=false
-    input = at::conj(input);
+    input = input.conj();
   }
   return fft_c2r_maybe_out(
       function_name, out, input, dim, static_cast<int64_t>(norm), n);
@@ -192,7 +243,7 @@ Tensor fft_r2c(c10::string_view function_name,
 
   if (!forward) {
     // FIXME: _fft_r2c doesn't support native r2c IFFT
-    return out.defined() ? at::conj_physical_out(out, ret) : at::conj(ret);
+    return out.defined() ? at::conj_physical_out(out, ret) : ret.conj();
   } else {
     return ret;
   }
@@ -521,7 +572,7 @@ static Tensor fft_hfftn_impl(
   }
 
   const auto last_dim = desc.dim.back();
-  tmp = at::conj(tmp);
+  tmp = tmp.conj();
   return fft_c2r_maybe_out(fname, out, tmp, last_dim, norm, last_dim_size);
 }
 
@@ -559,7 +610,7 @@ static Tensor fft_ihfftn_impl(
   const auto last_dim = desc.dim.back();
   auto tmp = at::_fft_r2c(x, last_dim, norm, /*onesided=*/true);
   if (desc.dim.size() == 1) {
-    return out.defined() ? at::conj_physical_out(tmp, out) : at::conj(tmp);
+    return out.defined() ? at::conj_physical_out(tmp, out) : tmp.conj();
   }
 
   tmp = at::conj_physical(tmp);
diff --git a/aten/src/ATen/native/SummaryOps.cpp b/aten/src/ATen/native/SummaryOps.cpp
index e7dbe72576721..ae0b38c96efa7 100644
--- a/aten/src/ATen/native/SummaryOps.cpp
+++ b/aten/src/ATen/native/SummaryOps.cpp
@@ -1,10 +1,17 @@
 // Returns the frequency of elements of input non-negative integer tensor.
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
 #include <c10/util/irange.h>
 
-#include <tuple>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/bincount_native.h>
+#include <ATen/ops/zeros.h>
+#endif
 
 namespace at { namespace native {
 
diff --git a/aten/src/ATen/native/TensorDimApply.h b/aten/src/ATen/native/TensorDimApply.h
index ad9ca857eeab8..e75cd40caf48b 100644
--- a/aten/src/ATen/native/TensorDimApply.h
+++ b/aten/src/ATen/native/TensorDimApply.h
@@ -1,4 +1,5 @@
-#include <ATen/ATen.h>
+#pragma once
+#include <ATen/core/Tensor.h>
 #include <c10/util/irange.h>
 
 namespace at {

From 90123495d826964d165d01bd70a3b6fec8e86374 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 17 Oct 2022 18:57:07 +0100
Subject: [PATCH 0092/1922] ATen/native (5/6): Use per-operator headers
 (#75575)

Differential Revision: [D40126696](https://our.internmc.facebook.com/intern/diff/D40126696)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/75575
Approved by: https://github.com/malfet
---
 .../ATen/native/TensorAdvancedIndexing.cpp    |  74 ++++++-
 .../ATen/native/TensorAdvancedIndexingUtils.h |   2 +-
 aten/src/ATen/native/TensorCompare.cpp        |  74 ++++++-
 aten/src/ATen/native/TensorConversions.cpp    |  44 ++++-
 aten/src/ATen/native/TensorFactories.cpp      |  90 +++++++--
 aten/src/ATen/native/TensorFactories.h        |   5 +-
 aten/src/ATen/native/TensorIteratorReduce.cpp |  11 +-
 aten/src/ATen/native/TensorProperties.cpp     |  27 ++-
 aten/src/ATen/native/TensorShape.cpp          | 181 +++++++++++++++++-
 aten/src/ATen/native/TensorShape.h            |   7 -
 .../src/ATen/native/TensorTransformations.cpp |  21 +-
 aten/src/ATen/native/TestOps.cpp              |  19 +-
 aten/src/ATen/native/TriangularOps.cpp        |  18 +-
 aten/src/ATen/native/TriangularOpsUtils.h     |   2 +-
 aten/src/ATen/native/TypeProperties.cpp       |  26 ++-
 15 files changed, 537 insertions(+), 64 deletions(-)

diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
index 2f7dbaf45252f..3004dc1b31c79 100644
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@@ -47,31 +47,93 @@
 //                   ...)
 //
 // where & and * represent the C-style address-of and indirection operations.
+// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/ATen.h>
 
 #include <ATen/native/TensorAdvancedIndexing.h>
 #include <ATen/native/IndexKernel.h>
 #include <ATen/native/IndexingUtils.h>
 
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/core/IListRef.h>
+#include <ATen/Context.h>
+#include <ATen/Dispatch.h>
 #include <ATen/ExpandUtils.h>
 #include <ATen/MemoryOverlap.h>
-#include <ATen/native/TensorAdvancedIndexingUtils.h>
-#include <ATen/core/IListRef.h>
-#include <ATen/native/TensorIterator.h>
+#include <ATen/NamedTensorUtils.h>
+#include <ATen/Parallel.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/WrapDimUtils.h>
 #include <ATen/native/BinaryOps.h>
 #include <ATen/native/Copy.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/ScatterGatherChecks.h>
+#include <ATen/native/TensorAdvancedIndexingUtils.h>
 #include <ATen/Parallel.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_gather_sparse_backward.h>
+#include <ATen/ops/_gather_sparse_backward_native.h>
+#include <ATen/ops/_index_put_impl.h>
+#include <ATen/ops/_index_put_impl_native.h>
+#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
+#include <ATen/ops/arange.h>
+#include <ATen/ops/argwhere_native.h>
+#include <ATen/ops/as_strided.h>
+#include <ATen/ops/broadcast_to.h>
+#include <ATen/ops/count_nonzero.h>
+#include <ATen/ops/count_nonzero_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_quantized.h>
+#include <ATen/ops/gather.h>
+#include <ATen/ops/gather_backward_native.h>
+#include <ATen/ops/gather_meta.h>
+#include <ATen/ops/gather_native.h>
+#include <ATen/ops/index.h>
+#include <ATen/ops/index_add_meta.h>
+#include <ATen/ops/index_add_native.h>
+#include <ATen/ops/index_copy_meta.h>
+#include <ATen/ops/index_copy_native.h>
+#include <ATen/ops/index_fill_native.h>
+#include <ATen/ops/index_meta.h>
+#include <ATen/ops/index_native.h>
+#include <ATen/ops/index_put_native.h>
+#include <ATen/ops/index_reduce_meta.h>
+#include <ATen/ops/index_reduce_native.h>
+#include <ATen/ops/index_select_backward_native.h>
+#include <ATen/ops/index_select_native.h>
+#include <ATen/ops/masked_fill_native.h>
+#include <ATen/ops/masked_scatter_native.h>
+#include <ATen/ops/masked_select_backward_native.h>
+#include <ATen/ops/masked_select_native.h>
+#include <ATen/ops/nonzero_native.h>
+#include <ATen/ops/nonzero_numpy_native.h>
+#include <ATen/ops/ones_like.h>
+#include <ATen/ops/put_native.h>
+#include <ATen/ops/quantize_per_tensor.h>
+#include <ATen/ops/scatter_add_meta.h>
+#include <ATen/ops/scatter_add_native.h>
+#include <ATen/ops/scatter_meta.h>
+#include <ATen/ops/scatter_native.h>
+#include <ATen/ops/scatter_reduce_meta.h>
+#include <ATen/ops/scatter_reduce_native.h>
+#include <ATen/ops/take_along_dim_native.h>
+#include <ATen/ops/take_native.h>
+#include <ATen/ops/zeros_like.h>
+#endif
+
 #include <c10/util/irange.h>
 #include <c10/util/Unroll.h>
 
 #include <algorithm>
-#include <functional>
 #include <numeric>
 #include <vector>
 
diff --git a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h
index 3e786bf7db4fc..0c0db4b83f351 100644
--- a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h
+++ b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h
@@ -1,5 +1,5 @@
 #pragma once
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/native/IndexingUtils.h>
 #include <ATen/native/TensorIterator.h>
 
diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp
index 856d684c52e85..5d3ee7d98d803 100644
--- a/aten/src/ATen/native/TensorCompare.cpp
+++ b/aten/src/ATen/native/TensorCompare.cpp
@@ -1,19 +1,73 @@
-#include <ATen/ATen.h>
-#include <ATen/CPUApplyUtils.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
-#include <ATen/ExpandUtils.h>
-#include <ATen/NativeFunctions.h>
-#include <ATen/native/ReduceOpsUtils.h>
-#include <c10/util/Exception.h>
+#include <ATen/NamedTensorUtils.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/TensorIndexing.h>
+#include <ATen/TensorMeta.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/WrapDimUtils.h>
 #include <ATen/native/BinaryOps.h>
+#include <ATen/native/ReduceOpsUtils.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/TensorCompare.h>
-#include <ATen/native/Fill.h>
-#include <ATen/NamedTensorUtils.h>
-#include <ATen/TensorIndexing.h>
 #include <ATen/native/TypeProperties.h>
-#include <c10/core/QScheme.h>
 #include <ATen/TensorSubclassLikeUtils.h>
+#include <c10/util/Exception.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_aminmax_native.h>
+#include <ATen/ops/_assert_async_native.h>
+#include <ATen/ops/_make_per_tensor_quantized_tensor.h>
+#include <ATen/ops/_unique.h>
+#include <ATen/ops/allclose_native.h>
+#include <ATen/ops/aminmax.h>
+#include <ATen/ops/argsort_native.h>
+#include <ATen/ops/cat.h>
+#include <ATen/ops/clamp.h>
+#include <ATen/ops/clamp_max.h>
+#include <ATen/ops/clamp_max_native.h>
+#include <ATen/ops/clamp_min.h>
+#include <ATen/ops/clamp_min_native.h>
+#include <ATen/ops/clamp_native.h>
+#include <ATen/ops/clip_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/eq.h>
+#include <ATen/ops/fill.h>
+#include <ATen/ops/imag.h>
+#include <ATen/ops/index.h>
+#include <ATen/ops/is_nonzero_native.h>
+#include <ATen/ops/isclose.h>
+#include <ATen/ops/isclose_native.h>
+#include <ATen/ops/isfinite.h>
+#include <ATen/ops/isfinite_native.h>
+#include <ATen/ops/isin.h>
+#include <ATen/ops/isin_native.h>
+#include <ATen/ops/isinf.h>
+#include <ATen/ops/isinf_native.h>
+#include <ATen/ops/isnan_native.h>
+#include <ATen/ops/isneginf_native.h>
+#include <ATen/ops/isposinf_native.h>
+#include <ATen/ops/isreal_native.h>
+#include <ATen/ops/max.h>
+#include <ATen/ops/max_native.h>
+#include <ATen/ops/min.h>
+#include <ATen/ops/min_native.h>
+#include <ATen/ops/mode.h>
+#include <ATen/ops/mode_native.h>
+#include <ATen/ops/ne.h>
+#include <ATen/ops/ones_like.h>
+#include <ATen/ops/real.h>
+#include <ATen/ops/result_type_native.h>
+#include <ATen/ops/scalar_tensor.h>
+#include <ATen/ops/where.h>
+#include <ATen/ops/where_native.h>
+#include <ATen/ops/zeros_like.h>
+#endif
 
 namespace at {
 namespace meta {
diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp
index 2af35c66a0b9e..ec699bf1bf7fa 100644
--- a/aten/src/ATen/native/TensorConversions.cpp
+++ b/aten/src/ATen/native/TensorConversions.cpp
@@ -1,8 +1,50 @@
+// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/core/Tensor.h>
 #include <c10/util/Optional.h>
 #include <ATen/quantized/Quantizer.h>
+#include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
+#include <ATen/TensorOperators.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_autocast_to_full_precision_native.h>
+#include <ATen/ops/_autocast_to_reduced_precision_native.h>
+#include <ATen/ops/_convert_indices_from_coo_to_csr.h>
+#include <ATen/ops/_convert_indices_from_coo_to_csr_native.h>
+#include <ATen/ops/_convert_indices_from_csr_to_coo.h>
+#include <ATen/ops/_convert_indices_from_csr_to_coo_native.h>
+#include <ATen/ops/_sparse_bsc_tensor_unsafe_native.h>
+#include <ATen/ops/_sparse_bsr_tensor_unsafe_native.h>
+#include <ATen/ops/_sparse_compressed_tensor_unsafe_native.h>
+#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
+#include <ATen/ops/_sparse_csc_tensor_unsafe_native.h>
+#include <ATen/ops/_sparse_csr_tensor_unsafe_native.h>
+#include <ATen/ops/_to_copy.h>
+#include <ATen/ops/_to_copy_native.h>
+#include <ATen/ops/_to_cpu_native.h>
+#include <ATen/ops/_to_dense_native.h>
+#include <ATen/ops/arange_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/empty_quantized.h>
+#include <ATen/ops/empty_strided.h>
+#include <ATen/ops/empty_strided_native.h>
+#include <ATen/ops/to_dense_backward_native.h>
+#include <ATen/ops/to_dense_native.h>
+#include <ATen/ops/to_mkldnn_backward_native.h>
+#include <ATen/ops/to_native.h>
+#include <ATen/ops/to_sparse_bsc_native.h>
+#include <ATen/ops/to_sparse_bsr_native.h>
+#include <ATen/ops/to_sparse_csc_native.h>
+#include <ATen/ops/to_sparse_csr_native.h>
+#include <ATen/ops/to_sparse_native.h>
+#include <ATen/ops/view_native.h>
+#include <ATen/ops/zeros.h>
+#endif
 
 #include <ATen/SparseCsrTensorUtils.h>
 #include <ATen/SparseTensorUtils.h>
diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 2e01f7e8699ad..9d1c6d8a36333 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -1,31 +1,99 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/native/TensorFactories.h>
+
+#include <ATen/core/Tensor.h>
 #include <ATen/CPUGeneratorImpl.h>
 #include <ATen/Dispatch.h>
 #include <ATen/EmptyTensor.h>
+#include <ATen/ExpandUtils.h>
 #include <ATen/Parallel.h>
 #include <ATen/MapAllocator.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/TracerMode.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/NamedTensorUtils.h>
+#include <ATen/native/UnaryOps.h>
 #include <c10/core/ScalarType.h>
-#include <c10/util/Deprecated.h>
-#include <ATen/native/Math.h>
-#include <ATen/native/Resize.h>
-#include <ATen/native/TensorFactories.h>
 #include <c10/core/TensorOptions.h>
-#include <ATen/detail/CUDAHooksInterface.h>
 #include <c10/util/Exception.h>
 #include <c10/util/irange.h>
-#include <ATen/NamedTensorUtils.h>
-#include <ATen/native/UnaryOps.h>
+#include <c10/util/MathConstants.h>
+
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
 #else
+#include <ATen/ops/_cast_Byte_native.h>
+#include <ATen/ops/_cast_Char_native.h>
+#include <ATen/ops/_cast_Double_native.h>
+#include <ATen/ops/_cast_Float_native.h>
+#include <ATen/ops/_cast_Half_native.h>
+#include <ATen/ops/_cast_Int_native.h>
+#include <ATen/ops/_cast_Long_native.h>
+#include <ATen/ops/_cast_Short_native.h>
+#include <ATen/ops/_dim_arange_native.h>
+#include <ATen/ops/_efficientzerotensor_native.h>
+#include <ATen/ops/_empty_affine_quantized.h>
+#include <ATen/ops/_empty_per_channel_affine_quantized.h>
+#include <ATen/ops/arange.h>
+#include <ATen/ops/arange_native.h>
+#include <ATen/ops/bartlett_window_native.h>
+#include <ATen/ops/blackman_window_native.h>
+#include <ATen/ops/clone_native.h>
+#include <ATen/ops/complex.h>
+#include <ATen/ops/complex_native.h>
+#include <ATen/ops/cumprod.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/empty_like_native.h>
+#include <ATen/ops/empty_native.h>
+#include <ATen/ops/empty_strided.h>
+#include <ATen/ops/empty_strided_native.h>
 #include <ATen/ops/eye.h>
+#include <ATen/ops/eye_native.h>
+#include <ATen/ops/fill_native.h>
+#include <ATen/ops/flip.h>
+#include <ATen/ops/from_file_native.h>
+#include <ATen/ops/full_like_native.h>
+#include <ATen/ops/full_native.h>
+#include <ATen/ops/hamming_window_native.h>
+#include <ATen/ops/hann_window_native.h>
+#include <ATen/ops/kaiser_window_native.h>
+#include <ATen/ops/linspace.h>
+#include <ATen/ops/linspace_native.h>
+#include <ATen/ops/logspace.h>
+#include <ATen/ops/logspace_native.h>
+#include <ATen/ops/new_empty_native.h>
+#include <ATen/ops/new_empty_strided_native.h>
+#include <ATen/ops/new_full_native.h>
+#include <ATen/ops/new_ones_native.h>
+#include <ATen/ops/new_zeros_native.h>
+#include <ATen/ops/normal_native.h>
+#include <ATen/ops/ones.h>
+#include <ATen/ops/ones_like_native.h>
+#include <ATen/ops/ones_native.h>
+#include <ATen/ops/polar.h>
+#include <ATen/ops/polar_native.h>
+#include <ATen/ops/promote_types.h>
+#include <ATen/ops/rand_like_native.h>
+#include <ATen/ops/rand_native.h>
+#include <ATen/ops/randint_like_native.h>
+#include <ATen/ops/randint_native.h>
+#include <ATen/ops/randn_like_native.h>
+#include <ATen/ops/randn_native.h>
+#include <ATen/ops/randperm.h>
+#include <ATen/ops/randperm_native.h>
+#include <ATen/ops/range.h>
+#include <ATen/ops/range_native.h>
+#include <ATen/ops/scalar_tensor_native.h>
+#include <ATen/ops/tril_indices_native.h>
+#include <ATen/ops/triu_indices_native.h>
+#include <ATen/ops/vander_native.h>
+#include <ATen/ops/zeros_like_native.h>
+#include <ATen/ops/zeros_like_ops.h>
+#include <ATen/ops/zeros_native.h>
 #endif
 
 #include <algorithm>
-#include <cctype>
-#include <cmath>
 #include <cstddef>
 #include <string>
 #include <c10/core/SymIntArrayRef.h>
diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h
index 35e058df4b3ab..2c0665518a9e3 100644
--- a/aten/src/ATen/native/TensorFactories.h
+++ b/aten/src/ATen/native/TensorFactories.h
@@ -1,10 +1,9 @@
 #pragma once
 
 #include <ATen/core/Tensor.h>
-#include <ATen/Utils.h>
+#include <ATen/EmptyTensor.h>
+#include <ATen/TensorIterator.h>
 #include <ATen/native/DispatchStub.h>
-#include <ATen/native/TensorIterator.h>
-#include <c10/core/TensorOptions.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp
index ea772bfe7e641..606a442226876 100644
--- a/aten/src/ATen/native/TensorIteratorReduce.cpp
+++ b/aten/src/ATen/native/TensorIteratorReduce.cpp
@@ -1,11 +1,14 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/TensorIterator.h>
 #include <ATen/Parallel.h>
-#include <algorithm>
-#include <memory>
-#include <ATen/Functions.h>
-#include <ATen/TensorOperators.h>
 #include <ATen/TensorIteratorInternal.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#endif
+
 #include <c10/util/irange.h>
 
 /// Contains the implementation of parallel reductions in TensorIterator.
diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp
index 7941f2e3b758c..6a703cbe07f90 100644
--- a/aten/src/ATen/native/TensorProperties.cpp
+++ b/aten/src/ATen/native/TensorProperties.cpp
@@ -1,12 +1,27 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
-#include <ATen/native/TensorProperties.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Context.h>
 #include <ATen/NamedTensorUtils.h>
-#include <torch/library.h>
-#include <ATen/native/nested/NestedTensorMath.h>
+#include <ATen/detail/CUDAHooksInterface.h>
+#include <ATen/native/TensorProperties.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_nested_tensor_size_native.h>
+#include <ATen/ops/contiguous_native.h>
+#include <ATen/ops/cudnn_is_acceptable_native.h>
+#include <ATen/ops/detach_native.h>
+#include <ATen/ops/equal.h>
+#include <ATen/ops/is_same_size_native.h>
+#include <ATen/ops/is_set_to_native.h>
+#include <ATen/ops/size_native.h>
+#include <ATen/ops/stride_native.h>
+#endif
 
-#include <ATen/Config.h>
 #include <c10/util/irange.h>
+
 namespace at {
 namespace native {
 
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 6543509d3dcb8..e1f9835184cbd 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -1,12 +1,17 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/core/DimVector.h>
+#include <ATen/core/functional.h>
+#include <ATen/core/IListRef.h>
 #include <ATen/AccumulateType.h>
+#include <ATen/Dispatch.h>
 #include <ATen/ExpandUtils.h>
 #include <ATen/InferSize.h>
 #include <ATen/MemoryOverlap.h>
 #include <ATen/NamedTensorUtils.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/SparseCsrTensorUtils.h>
 #include <ATen/SparseTensorUtils.h>
+#include <ATen/TensorOperators.h>
 #include <ATen/WrapDimUtils.h>
 #include <ATen/core/DimVector.h>
 #include <ATen/core/IListRef.h>
@@ -26,6 +31,178 @@
 #include <c10/util/accumulate.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_conj_copy_native.h>
+#include <ATen/ops/_convert_indices_from_coo_to_csr.h>
+#include <ATen/ops/_convert_indices_from_csr_to_coo.h>
+#include <ATen/ops/_fw_primal_copy_native.h>
+#include <ATen/ops/_indices_copy_native.h>
+#include <ATen/ops/_make_dual.h>
+#include <ATen/ops/_make_dual_copy_native.h>
+#include <ATen/ops/_mkldnn_reshape.h>
+#include <ATen/ops/_mkldnn_transpose.h>
+#include <ATen/ops/_neg_view_copy_native.h>
+#include <ATen/ops/_reshape_alias_copy_native.h>
+#include <ATen/ops/_reshape_alias_native.h>
+#include <ATen/ops/_reshape_from_tensor_native.h>
+#include <ATen/ops/_shape_as_tensor_native.h>
+#include <ATen/ops/_sparse_broadcast_to.h>
+#include <ATen/ops/_sparse_broadcast_to_copy_native.h>
+#include <ATen/ops/_sparse_broadcast_to_native.h>
+#include <ATen/ops/_sparse_compressed_tensor_unsafe_native.h>
+#include <ATen/ops/_sparse_coo_tensor_with_dims_and_tensors.h>
+#include <ATen/ops/_sparse_csc_tensor_unsafe_native.h>
+#include <ATen/ops/_sparse_csr_tensor_unsafe.h>
+#include <ATen/ops/_sparse_csr_tensor_unsafe_native.h>
+#include <ATen/ops/_stack_native.h>
+#include <ATen/ops/_unsafe_view.h>
+#include <ATen/ops/_unsafe_view_native.h>
+#include <ATen/ops/_values_copy_native.h>
+#include <ATen/ops/adjoint_native.h>
+#include <ATen/ops/alias.h>
+#include <ATen/ops/alias_copy_native.h>
+#include <ATen/ops/alias_native.h>
+#include <ATen/ops/arange.h>
+#include <ATen/ops/arange_native.h>
+#include <ATen/ops/as_strided_copy_native.h>
+#include <ATen/ops/as_strided_native.h>
+#include <ATen/ops/as_strided_scatter_native.h>
+#include <ATen/ops/atleast_1d.h>
+#include <ATen/ops/atleast_2d.h>
+#include <ATen/ops/atleast_3d.h>
+#include <ATen/ops/block_diag_native.h>
+#include <ATen/ops/broadcast_tensors_native.h>
+#include <ATen/ops/broadcast_to_native.h>
+#include <ATen/ops/cat.h>
+#include <ATen/ops/cat_meta.h>
+#include <ATen/ops/cat_native.h>
+#include <ATen/ops/chunk_native.h>
+#include <ATen/ops/col_indices_copy_native.h>
+#include <ATen/ops/column_stack_native.h>
+#include <ATen/ops/concat_native.h>
+#include <ATen/ops/concatenate_native.h>
+#include <ATen/ops/crow_indices_copy_native.h>
+#include <ATen/ops/dense_dim_native.h>
+#include <ATen/ops/detach_copy_native.h>
+#include <ATen/ops/detach_native.h>
+#include <ATen/ops/diag.h>
+#include <ATen/ops/diag_backward_native.h>
+#include <ATen/ops/diag_embed_native.h>
+#include <ATen/ops/diag_native.h>
+#include <ATen/ops/diagflat_native.h>
+#include <ATen/ops/diagonal.h>
+#include <ATen/ops/diagonal_backward.h>
+#include <ATen/ops/diagonal_backward_native.h>
+#include <ATen/ops/diagonal_copy_native.h>
+#include <ATen/ops/diagonal_native.h>
+#include <ATen/ops/diagonal_scatter_native.h>
+#include <ATen/ops/dsplit_native.h>
+#include <ATen/ops/dstack_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/empty_quantized.h>
+#include <ATen/ops/expand_as_native.h>
+#include <ATen/ops/expand_copy_native.h>
+#include <ATen/ops/expand_native.h>
+#include <ATen/ops/flatten_dense_tensors_native.h>
+#include <ATen/ops/flatten_native.h>
+#include <ATen/ops/from_blob.h>
+#include <ATen/ops/hsplit_native.h>
+#include <ATen/ops/hstack.h>
+#include <ATen/ops/hstack_native.h>
+#include <ATen/ops/index_select_native.h>
+#include <ATen/ops/indices_copy_native.h>
+#include <ATen/ops/lift_fresh_native.h>
+#include <ATen/ops/lift_native.h>
+#include <ATen/ops/mH_native.h>
+#include <ATen/ops/mT_native.h>
+#include <ATen/ops/matrix_H_native.h>
+#include <ATen/ops/meshgrid_native.h>
+#include <ATen/ops/moveaxis_native.h>
+#include <ATen/ops/movedim.h>
+#include <ATen/ops/movedim_native.h>
+#include <ATen/ops/narrow.h>
+#include <ATen/ops/narrow_copy.h>
+#include <ATen/ops/narrow_copy_native.h>
+#include <ATen/ops/narrow_native.h>
+#include <ATen/ops/new_empty_native.h>
+#include <ATen/ops/new_ones_native.h>
+#include <ATen/ops/numpy_T_native.h>
+#include <ATen/ops/permute_copy_native.h>
+#include <ATen/ops/permute_native.h>
+#include <ATen/ops/ravel_native.h>
+#include <ATen/ops/repeat_native.h>
+#include <ATen/ops/reshape_as_native.h>
+#include <ATen/ops/reshape_native.h>
+#include <ATen/ops/resize_native.h>
+#include <ATen/ops/row_stack_native.h>
+#include <ATen/ops/select.h>
+#include <ATen/ops/select_backward_native.h>
+#include <ATen/ops/select_copy_native.h>
+#include <ATen/ops/select_native.h>
+#include <ATen/ops/select_scatter_native.h>
+#include <ATen/ops/set_native.h>
+#include <ATen/ops/slice.h>
+#include <ATen/ops/slice_backward_native.h>
+#include <ATen/ops/slice_copy_native.h>
+#include <ATen/ops/slice_native.h>
+#include <ATen/ops/slice_scatter_native.h>
+#include <ATen/ops/sparse_coo_tensor.h>
+#include <ATen/ops/sparse_coo_tensor_native.h>
+#include <ATen/ops/sparse_dim_native.h>
+#include <ATen/ops/split_copy_native.h>
+#include <ATen/ops/split_native.h>
+#include <ATen/ops/split_with_sizes.h>
+#include <ATen/ops/split_with_sizes_copy_native.h>
+#include <ATen/ops/split_with_sizes_native.h>
+#include <ATen/ops/squeeze_copy_native.h>
+#include <ATen/ops/squeeze_native.h>
+#include <ATen/ops/stack_native.h>
+#include <ATen/ops/sub.h>
+#include <ATen/ops/sum.h>
+#include <ATen/ops/sum_to_size_native.h>
+#include <ATen/ops/swapaxes_native.h>
+#include <ATen/ops/swapdims_native.h>
+#include <ATen/ops/t_copy_native.h>
+#include <ATen/ops/t_native.h>
+#include <ATen/ops/tensor.h>
+#include <ATen/ops/tensor_split.h>
+#include <ATen/ops/tensor_split_native.h>
+#include <ATen/ops/tile_native.h>
+#include <ATen/ops/transpose.h>
+#include <ATen/ops/transpose_copy_native.h>
+#include <ATen/ops/transpose_native.h>
+#include <ATen/ops/unbind.h>
+#include <ATen/ops/unbind_copy_native.h>
+#include <ATen/ops/unbind_native.h>
+#include <ATen/ops/unflatten_dense_tensors_native.h>
+#include <ATen/ops/unflatten_native.h>
+#include <ATen/ops/unfold_copy_native.h>
+#include <ATen/ops/unfold_native.h>
+#include <ATen/ops/unsafe_chunk_native.h>
+#include <ATen/ops/unsafe_split_native.h>
+#include <ATen/ops/unsafe_split_with_sizes_native.h>
+#include <ATen/ops/unsqueeze_copy_native.h>
+#include <ATen/ops/unsqueeze_native.h>
+#include <ATen/ops/values_copy_native.h>
+#include <ATen/ops/view_as_complex.h>
+#include <ATen/ops/view_as_complex_copy_native.h>
+#include <ATen/ops/view_as_native.h>
+#include <ATen/ops/view_as_real.h>
+#include <ATen/ops/view_as_real_copy_native.h>
+#include <ATen/ops/view_copy_native.h>
+#include <ATen/ops/view_native.h>
+#include <ATen/ops/vsplit_native.h>
+#include <ATen/ops/vstack.h>
+#include <ATen/ops/vstack_native.h>
+#include <ATen/ops/zeros.h>
+#include <ATen/ops/zeros_like.h>
+#include <ATen/ops/zeros_native.h>
+#endif
+
 #include <algorithm>
 #include <cstdint>
 #include <vector>
diff --git a/aten/src/ATen/native/TensorShape.h b/aten/src/ATen/native/TensorShape.h
index 21d0ba78261ec..60e2533e9b538 100644
--- a/aten/src/ATen/native/TensorShape.h
+++ b/aten/src/ATen/native/TensorShape.h
@@ -53,11 +53,4 @@ inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t di
   return num_splits;
 }
 
-///
-/// For more information, see
-/// https://pytorch.org/docs/master/generated/torch.Tensor.unfold.html#torch.Tensor.unfold
-///
-
-Tensor unfold(const Tensor& self, int64_t dimension, int64_t size, int64_t step);
-
 }} // namespace at::native
diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp
index f0e2c0f02caa7..028b05e66930e 100644
--- a/aten/src/ATen/native/TensorTransformations.cpp
+++ b/aten/src/ATen/native/TensorTransformations.cpp
@@ -1,14 +1,31 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/TensorTransformations.h>
 #include <ATen/native/IndexKernel.h>  // for flip_stub
 
-#include <ATen/Functions.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/Parallel.h>
+#include <ATen/TensorIterator.h>
 #include <ATen/WrapDimUtilsMulti.h>
 #include <ATen/core/DimVector.h>
 #include <c10/util/Exception.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/atleast_1d_native.h>
+#include <ATen/ops/atleast_2d_native.h>
+#include <ATen/ops/atleast_3d_native.h>
+#include <ATen/ops/cat.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/flip_native.h>
+#include <ATen/ops/fliplr_native.h>
+#include <ATen/ops/flipud_native.h>
+#include <ATen/ops/roll_native.h>
+#include <ATen/ops/rot90_native.h>
+#include <ATen/ops/zeros_like_ops.h>
+#endif
+
 #include <algorithm>
 #include <vector>
 
diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp
index a8c30f5c3ba61..f36765436991e 100644
--- a/aten/src/ATen/native/TestOps.cpp
+++ b/aten/src/ATen/native/TestOps.cpp
@@ -1,10 +1,25 @@
 // Copyright 2004-present Facebook. All Rights Reserved.
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/FunctionalInverses.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/ScalarOps.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_test_ambiguous_defaults_native.h>
+#include <ATen/ops/_test_autograd_multiple_dispatch_native.h>
+#include <ATen/ops/_test_autograd_multiple_dispatch_view_native.h>
+#include <ATen/ops/_test_optional_filled_intlist_native.h>
+#include <ATen/ops/_test_optional_floatlist_native.h>
+#include <ATen/ops/_test_optional_intlist_native.h>
+#include <ATen/ops/_test_string_default_native.h>
+#include <ATen/ops/_test_warn_in_autograd_native.h>
+#include <ATen/ops/empty_like.h>
+#endif
+
 #include <c10/util/irange.h>
 
 namespace at {
diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp
index f98018d7fe5a5..fbdd204f64307 100644
--- a/aten/src/ATen/native/TriangularOps.cpp
+++ b/aten/src/ATen/native/TriangularOps.cpp
@@ -1,14 +1,24 @@
-#include <ATen/ATen.h>
-#include <ATen/CPUApplyUtils.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/Parallel.h>
 #include <ATen/TensorMeta.h>
-#include <ATen/native/Resize.h>
 #include <ATen/native/TriangularOpsUtils.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/arange.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/trace_backward_native.h>
+#include <ATen/ops/tril_native.h>
+#include <ATen/ops/triu_native.h>
+#include <ATen/ops/zeros.h>
+#endif
+
 namespace at {
 namespace meta {
 
diff --git a/aten/src/ATen/native/TriangularOpsUtils.h b/aten/src/ATen/native/TriangularOpsUtils.h
index c5bce42ed3fd7..e380a510bddeb 100644
--- a/aten/src/ATen/native/TriangularOpsUtils.h
+++ b/aten/src/ATen/native/TriangularOpsUtils.h
@@ -1,4 +1,4 @@
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/native/LinearAlgebraUtils.h>
 
 namespace at {
diff --git a/aten/src/ATen/native/TypeProperties.cpp b/aten/src/ATen/native/TypeProperties.cpp
index feceb75631cec..36354c133a98e 100644
--- a/aten/src/ATen/native/TypeProperties.cpp
+++ b/aten/src/ATen/native/TypeProperties.cpp
@@ -1,8 +1,26 @@
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/native/TypeProperties.h>
-#include <type_traits>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_has_compatible_shallow_copy_type_native.h>
+#include <ATen/ops/_is_zerotensor_native.h>
+#include <ATen/ops/can_cast_native.h>
+#include <ATen/ops/is_complex_native.h>
+#include <ATen/ops/is_conj_native.h>
+#include <ATen/ops/is_distributed_native.h>
+#include <ATen/ops/is_floating_point_native.h>
+#include <ATen/ops/is_inference_native.h>
+#include <ATen/ops/is_neg_native.h>
+#include <ATen/ops/is_signed_native.h>
+#include <ATen/ops/promote_types_native.h>
+#include <ATen/ops/result_type.h>
+#include <ATen/ops/result_type_native.h>
+#include <ATen/ops/type_as_native.h>
+#endif
 
 namespace at { namespace native {
 

From 6c80bbdd6f603731292e3b96c07cb06269111d0f Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 17 Oct 2022 18:57:07 +0100
Subject: [PATCH 0093/1922] ATen/native (6/6): Use per-operator headers
 (#75576)

Differential Revision: [D40126699](https://our.internmc.facebook.com/intern/diff/D40126699)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/75576
Approved by: https://github.com/malfet
---
 aten/src/ATen/native/ComplexHelper.h          |   9 +-
 aten/src/ATen/native/UnaryOps.cpp             | 176 ++++++++++++++++--
 aten/src/ATen/native/Unfold2d.cpp             |   1 +
 aten/src/ATen/native/Unfold3d.cpp             |   4 +-
 aten/src/ATen/native/UnfoldBackward.h         |   5 +-
 aten/src/ATen/native/Unique.cpp               |  21 ++-
 aten/src/ATen/native/UpSample.cpp             |   1 +
 aten/src/ATen/native/UpSampleBicubic2d.cpp    |  20 +-
 aten/src/ATen/native/UpSampleBilinear2d.cpp   |  19 +-
 aten/src/ATen/native/UpSampleLinear1d.cpp     |  16 +-
 aten/src/ATen/native/UpSampleNearest1d.cpp    |  20 +-
 aten/src/ATen/native/UpSampleNearest2d.cpp    |  19 +-
 aten/src/ATen/native/UpSampleNearest3d.cpp    |  19 +-
 aten/src/ATen/native/UpSampleTrilinear3d.cpp  |  15 +-
 aten/src/ATen/native/VariableMethodStubs.cpp  |  20 +-
 aten/src/ATen/native/WeightNorm.cpp           |  20 +-
 .../ATen/native/cpu/UnfoldBackwardKernel.cpp  |   1 +
 .../ATen/native/cuda/UnfoldBackwardKernel.cu  |   1 +
 aten/src/ATen/native/group_norm.cpp           |  23 ++-
 aten/src/ATen/native/layer_norm.cpp           |  25 ++-
 .../src/ATen/native/prim_native_functions.cpp |   9 +-
 21 files changed, 388 insertions(+), 56 deletions(-)

diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h
index 88668d13145c5..8d69f6292772c 100644
--- a/aten/src/ATen/native/ComplexHelper.h
+++ b/aten/src/ATen/native/ComplexHelper.h
@@ -1,8 +1,15 @@
 #pragma once
 
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/view_as_real_native.h>
+#include <ATen/ops/view_as_complex_native.h>
+#endif
+
 // WARNING: this header contains non-inline functions and should be only
 // included from ONE cpp file
 
diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp
index c301d8ecc26a2..845610ce373e7 100644
--- a/aten/src/ATen/native/UnaryOps.cpp
+++ b/aten/src/ATen/native/UnaryOps.cpp
@@ -1,26 +1,174 @@
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/ExpandUtils.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/MemoryOverlap.h>
+#include <ATen/NamedTensorUtils.h>
+#include <ATen/Parallel.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorOperators.h>
 #include <ATen/WrapDimUtils.h>
 
-#include <ATen/CPUApplyUtils.h>
-#include <ATen/Parallel.h>
-#include <ATen/native/Math.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/UnaryOps.h>
-#include <ATen/native/TensorIterator.h>
-#include <ATen/NamedTensorUtils.h>
 #include <ATen/native/ComplexHelper.h>
 
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <numeric>
-#include <vector>
+#include <c10/util/MathConstants.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_conj_native.h>
+#include <ATen/ops/_conj_physical.h>
+#include <ATen/ops/_conj_physical_native.h>
+#include <ATen/ops/_neg_view_native.h>
+#include <ATen/ops/abs.h>
+#include <ATen/ops/abs_native.h>
+#include <ATen/ops/absolute_native.h>
+#include <ATen/ops/acos.h>
+#include <ATen/ops/acos_native.h>
+#include <ATen/ops/acosh.h>
+#include <ATen/ops/acosh_native.h>
+#include <ATen/ops/angle.h>
+#include <ATen/ops/angle_native.h>
+#include <ATen/ops/arange_native.h>
+#include <ATen/ops/arccos_native.h>
+#include <ATen/ops/arccosh_native.h>
+#include <ATen/ops/arcsin_native.h>
+#include <ATen/ops/arcsinh_native.h>
+#include <ATen/ops/arctan_native.h>
+#include <ATen/ops/arctanh_native.h>
+#include <ATen/ops/asin.h>
+#include <ATen/ops/asin_native.h>
+#include <ATen/ops/asinh.h>
+#include <ATen/ops/asinh_native.h>
+#include <ATen/ops/atan.h>
+#include <ATen/ops/atan_native.h>
+#include <ATen/ops/atanh.h>
+#include <ATen/ops/atanh_native.h>
+#include <ATen/ops/bitwise_not_native.h>
+#include <ATen/ops/can_cast.h>
+#include <ATen/ops/ceil_native.h>
+#include <ATen/ops/conj_native.h>
+#include <ATen/ops/conj_physical.h>
+#include <ATen/ops/conj_physical_native.h>
+#include <ATen/ops/cos_native.h>
+#include <ATen/ops/cosh_native.h>
+#include <ATen/ops/deg2rad.h>
+#include <ATen/ops/deg2rad_native.h>
+#include <ATen/ops/digamma.h>
+#include <ATen/ops/digamma_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/erf.h>
+#include <ATen/ops/erf_native.h>
+#include <ATen/ops/erfc.h>
+#include <ATen/ops/erfc_native.h>
+#include <ATen/ops/erfinv.h>
+#include <ATen/ops/erfinv_native.h>
+#include <ATen/ops/exp2.h>
+#include <ATen/ops/exp2_native.h>
+#include <ATen/ops/exp_native.h>
+#include <ATen/ops/expm1.h>
+#include <ATen/ops/expm1_native.h>
+#include <ATen/ops/fix_native.h>
+#include <ATen/ops/floor_native.h>
+#include <ATen/ops/frac_native.h>
+#include <ATen/ops/frexp.h>
+#include <ATen/ops/frexp_native.h>
+#include <ATen/ops/i0.h>
+#include <ATen/ops/i0_native.h>
+#include <ATen/ops/imag_native.h>
+#include <ATen/ops/lgamma.h>
+#include <ATen/ops/lgamma_native.h>
+#include <ATen/ops/log10_native.h>
+#include <ATen/ops/log1p.h>
+#include <ATen/ops/log1p_native.h>
+#include <ATen/ops/log2_native.h>
+#include <ATen/ops/log_native.h>
+#include <ATen/ops/logical_not.h>
+#include <ATen/ops/logical_not_native.h>
+#include <ATen/ops/logit.h>
+#include <ATen/ops/logit_native.h>
+#include <ATen/ops/mul.h>
+#include <ATen/ops/mvlgamma.h>
+#include <ATen/ops/mvlgamma_native.h>
+#include <ATen/ops/nan_to_num.h>
+#include <ATen/ops/nan_to_num_native.h>
+#include <ATen/ops/neg.h>
+#include <ATen/ops/neg_native.h>
+#include <ATen/ops/negative_native.h>
+#include <ATen/ops/polygamma.h>
+#include <ATen/ops/polygamma_native.h>
+#include <ATen/ops/positive_native.h>
+#include <ATen/ops/pow.h>
+#include <ATen/ops/rad2deg.h>
+#include <ATen/ops/rad2deg_native.h>
+#include <ATen/ops/real.h>
+#include <ATen/ops/real_native.h>
+#include <ATen/ops/reciprocal_native.h>
+#include <ATen/ops/resolve_conj_native.h>
+#include <ATen/ops/resolve_neg_native.h>
+#include <ATen/ops/round.h>
+#include <ATen/ops/round_native.h>
+#include <ATen/ops/rsqrt_native.h>
+#include <ATen/ops/select.h>
+#include <ATen/ops/sgn_native.h>
+#include <ATen/ops/sigmoid.h>
+#include <ATen/ops/sigmoid_native.h>
+#include <ATen/ops/sign_native.h>
+#include <ATen/ops/signbit_native.h>
+#include <ATen/ops/sin_native.h>
+#include <ATen/ops/sinc.h>
+#include <ATen/ops/sinc_native.h>
+#include <ATen/ops/sinh_native.h>
+#include <ATen/ops/special_airy_ai_native.h>
+#include <ATen/ops/special_bessel_j0_native.h>
+#include <ATen/ops/special_bessel_j1_native.h>
+#include <ATen/ops/special_bessel_y0_native.h>
+#include <ATen/ops/special_bessel_y1_native.h>
+#include <ATen/ops/special_digamma_native.h>
+#include <ATen/ops/special_entr_native.h>
+#include <ATen/ops/special_erf_native.h>
+#include <ATen/ops/special_erfc_native.h>
+#include <ATen/ops/special_erfcx_native.h>
+#include <ATen/ops/special_erfinv_native.h>
+#include <ATen/ops/special_exp2_native.h>
+#include <ATen/ops/special_expit_native.h>
+#include <ATen/ops/special_expm1_native.h>
+#include <ATen/ops/special_gammaln_native.h>
+#include <ATen/ops/special_i0_native.h>
+#include <ATen/ops/special_i0e_native.h>
+#include <ATen/ops/special_i1_native.h>
+#include <ATen/ops/special_i1e_native.h>
+#include <ATen/ops/special_log1p_native.h>
+#include <ATen/ops/special_log_ndtr_native.h>
+#include <ATen/ops/special_logit_native.h>
+#include <ATen/ops/special_modified_bessel_i0_native.h>
+#include <ATen/ops/special_modified_bessel_i1_native.h>
+#include <ATen/ops/special_modified_bessel_k0_native.h>
+#include <ATen/ops/special_modified_bessel_k1_native.h>
+#include <ATen/ops/special_multigammaln_native.h>
+#include <ATen/ops/special_ndtr_native.h>
+#include <ATen/ops/special_ndtri_native.h>
+#include <ATen/ops/special_polygamma_native.h>
+#include <ATen/ops/special_psi_native.h>
+#include <ATen/ops/special_round_native.h>
+#include <ATen/ops/special_scaled_modified_bessel_k0_native.h>
+#include <ATen/ops/special_scaled_modified_bessel_k1_native.h>
+#include <ATen/ops/special_sinc_native.h>
+#include <ATen/ops/special_spherical_bessel_j0_native.h>
+#include <ATen/ops/sqrt_native.h>
+#include <ATen/ops/square_native.h>
+#include <ATen/ops/tan_native.h>
+#include <ATen/ops/tanh_native.h>
+#include <ATen/ops/trunc.h>
+#include <ATen/ops/trunc_native.h>
+#include <ATen/ops/view_as_real.h>
+#endif
 
-#include <map>
+#include <cmath>
 
 namespace at {
 
diff --git a/aten/src/ATen/native/Unfold2d.cpp b/aten/src/ATen/native/Unfold2d.cpp
index 0a3b760a33fda..60bbc8a777121 100644
--- a/aten/src/ATen/native/Unfold2d.cpp
+++ b/aten/src/ATen/native/Unfold2d.cpp
@@ -1,3 +1,4 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/Unfold2d.h>
 
 namespace at { namespace native {
diff --git a/aten/src/ATen/native/Unfold3d.cpp b/aten/src/ATen/native/Unfold3d.cpp
index 3495f92dc3ce6..1a2d0ea2ae1f9 100644
--- a/aten/src/ATen/native/Unfold3d.cpp
+++ b/aten/src/ATen/native/Unfold3d.cpp
@@ -1,5 +1,7 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Config.h>
+#include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
 #include <c10/util/irange.h>
 
diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h
index 1f6c8fa1b289c..cb4856ec2718e 100644
--- a/aten/src/ATen/native/UnfoldBackward.h
+++ b/aten/src/ATen/native/UnfoldBackward.h
@@ -1,10 +1,9 @@
 #pragma once
 
 #include <ATen/core/Tensor.h>
-#include <ATen/Dispatch.h>
+#include <ATen/TensorIterator.h>
 #include <ATen/native/DispatchStub.h>
-#include <ATen/native/TensorIterator.h>
-#include <ATen/native/ReduceOpsUtils.h>
+#include <ATen/native/NonEmptyUtils.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp
index f418611e08644..92b48c9f388ca 100644
--- a/aten/src/ATen/native/Unique.cpp
+++ b/aten/src/ATen/native/Unique.cpp
@@ -1,8 +1,27 @@
 // Returns unique elements of input tensor.
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 
-#include <ATen/ATen.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
 #include <c10/util/irange.h>
+#include <c10/util/Load.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_unique2_native.h>
+#include <ATen/ops/_unique_native.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/equal.h>
+#include <ATen/ops/narrow.h>
+#include <ATen/ops/stack.h>
+#include <ATen/ops/unbind.h>
+#include <ATen/ops/unique_consecutive_native.h>
+#include <ATen/ops/unique_dim_consecutive_native.h>
+#include <ATen/ops/unique_dim_native.h>
+#include <ATen/ops/zeros.h>
+#endif
 
 #include <tuple>
 #include <unordered_map>
diff --git a/aten/src/ATen/native/UpSample.cpp b/aten/src/ATen/native/UpSample.cpp
index db75b7e99fdb1..1a6af75260300 100644
--- a/aten/src/ATen/native/UpSample.cpp
+++ b/aten/src/ATen/native/UpSample.cpp
@@ -1,4 +1,5 @@
 // Copyright 2004-present Facebook. All Rights Reserved.
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 
 #include <ATen/native/UpSample.h>
 #include <c10/util/irange.h>
diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp
index 5bf7ba6a53666..3a0fa941a4d4a 100644
--- a/aten/src/ATen/native/UpSampleBicubic2d.cpp
+++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp
@@ -1,8 +1,24 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
+#include <ATen/TensorMeta.h>
 #include <ATen/native/UpSample.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_upsample_bicubic2d_aa.h>
+#include <ATen/ops/_upsample_bicubic2d_aa_backward.h>
+#include <ATen/ops/_upsample_bicubic2d_aa_backward_native.h>
+#include <ATen/ops/_upsample_bicubic2d_aa_native.h>
+#include <ATen/ops/upsample_bicubic2d.h>
+#include <ATen/ops/upsample_bicubic2d_backward.h>
+#include <ATen/ops/upsample_bicubic2d_backward_native.h>
+#include <ATen/ops/upsample_bicubic2d_native.h>
+#endif
+
 namespace at {
 namespace meta {
 
diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp
index 527555a066abb..69c856f06fcbf 100644
--- a/aten/src/ATen/native/UpSampleBilinear2d.cpp
+++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp
@@ -1,11 +1,26 @@
 // Adapted from interp.cpp from Caffe util by Pauline Luc
 // Originally developed by George Papandreou
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorMeta.h>
 #include <ATen/native/UpSample.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_upsample_bilinear2d_aa.h>
+#include <ATen/ops/_upsample_bilinear2d_aa_backward.h>
+#include <ATen/ops/_upsample_bilinear2d_aa_backward_native.h>
+#include <ATen/ops/_upsample_bilinear2d_aa_native.h>
+#include <ATen/ops/upsample_bilinear2d.h>
+#include <ATen/ops/upsample_bilinear2d_backward.h>
+#include <ATen/ops/upsample_bilinear2d_backward_native.h>
+#include <ATen/ops/upsample_bilinear2d_native.h>
+#endif
+
 namespace at {
 namespace meta {
 
diff --git a/aten/src/ATen/native/UpSampleLinear1d.cpp b/aten/src/ATen/native/UpSampleLinear1d.cpp
index b100450c2b6a7..048d4b5a3d9c1 100644
--- a/aten/src/ATen/native/UpSampleLinear1d.cpp
+++ b/aten/src/ATen/native/UpSampleLinear1d.cpp
@@ -1,10 +1,22 @@
 // Adapted from interp.cpp from Caffe util by Pauline Luc
 // Originally developed by George Papandreou
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorMeta.h>
+#include <ATen/TensorUtils.h>
 #include <ATen/native/UpSample.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/upsample_linear1d.h>
+#include <ATen/ops/upsample_linear1d_backward.h>
+#include <ATen/ops/upsample_linear1d_backward_native.h>
+#include <ATen/ops/upsample_linear1d_native.h>
+#endif
+
 namespace at {
 namespace meta {
 
diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp
index 83121ed3be45b..5cc53dea349b7 100644
--- a/aten/src/ATen/native/UpSampleNearest1d.cpp
+++ b/aten/src/ATen/native/UpSampleNearest1d.cpp
@@ -1,7 +1,23 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorMeta.h>
+#include <ATen/TensorUtils.h>
 #include <ATen/native/UpSample.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_upsample_nearest_exact1d.h>
+#include <ATen/ops/_upsample_nearest_exact1d_backward.h>
+#include <ATen/ops/_upsample_nearest_exact1d_backward_native.h>
+#include <ATen/ops/_upsample_nearest_exact1d_native.h>
+#include <ATen/ops/upsample_nearest1d.h>
+#include <ATen/ops/upsample_nearest1d_backward.h>
+#include <ATen/ops/upsample_nearest1d_backward_native.h>
+#include <ATen/ops/upsample_nearest1d_native.h>
+#endif
+
 namespace at {
 namespace meta {
 
diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp
index ee5dce4a02eff..14c7a7d1b74f0 100644
--- a/aten/src/ATen/native/UpSampleNearest2d.cpp
+++ b/aten/src/ATen/native/UpSampleNearest2d.cpp
@@ -1,9 +1,24 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorMeta.h>
 #include <ATen/native/UpSample.h>
 #include <c10/util/accumulate.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_upsample_nearest_exact2d.h>
+#include <ATen/ops/_upsample_nearest_exact2d_backward.h>
+#include <ATen/ops/_upsample_nearest_exact2d_backward_native.h>
+#include <ATen/ops/_upsample_nearest_exact2d_native.h>
+#include <ATen/ops/upsample_nearest2d.h>
+#include <ATen/ops/upsample_nearest2d_backward.h>
+#include <ATen/ops/upsample_nearest2d_backward_native.h>
+#include <ATen/ops/upsample_nearest2d_native.h>
+#endif
+
 namespace at {
 namespace meta {
 
diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp
index 0e4040980ae26..73948f66fa769 100644
--- a/aten/src/ATen/native/UpSampleNearest3d.cpp
+++ b/aten/src/ATen/native/UpSampleNearest3d.cpp
@@ -1,8 +1,23 @@
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorMeta.h>
 #include <ATen/native/UpSample.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_upsample_nearest_exact3d.h>
+#include <ATen/ops/_upsample_nearest_exact3d_backward.h>
+#include <ATen/ops/_upsample_nearest_exact3d_backward_native.h>
+#include <ATen/ops/_upsample_nearest_exact3d_native.h>
+#include <ATen/ops/upsample_nearest3d.h>
+#include <ATen/ops/upsample_nearest3d_backward.h>
+#include <ATen/ops/upsample_nearest3d_backward_native.h>
+#include <ATen/ops/upsample_nearest3d_native.h>
+#endif
+
 namespace at {
 namespace meta {
 
diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp
index 73fffbe5afe79..76bc4da85addb 100644
--- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp
+++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp
@@ -1,11 +1,22 @@
 // Adapted from interp.cpp from Caffe util by Pauline Luc
 // Originally developed by George Papandreou
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorMeta.h>
 #include <ATen/native/UpSample.h>
 #include <c10/util/irange.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/upsample_trilinear3d.h>
+#include <ATen/ops/upsample_trilinear3d_backward.h>
+#include <ATen/ops/upsample_trilinear3d_backward_native.h>
+#include <ATen/ops/upsample_trilinear3d_native.h>
+#endif
+
 namespace at {
 namespace meta {
 
diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp
index ce5432e677af2..6191717930aec 100644
--- a/aten/src/ATen/native/VariableMethodStubs.cpp
+++ b/aten/src/ATen/native/VariableMethodStubs.cpp
@@ -1,5 +1,23 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_backward_native.h>
+#include <ATen/ops/_fw_primal_native.h>
+#include <ATen/ops/_version_native.h>
+#include <ATen/ops/alias.h>
+#include <ATen/ops/data_native.h>
+#include <ATen/ops/is_leaf_native.h>
+#include <ATen/ops/output_nr_native.h>
+#include <ATen/ops/requires_grad_native.h>
+#include <ATen/ops/retain_grad_native.h>
+#include <ATen/ops/retains_grad_native.h>
+#include <ATen/ops/set_data_native.h>
+#include <ATen/ops/zeros_like_ops.h>
+#endif
 
 // The stubs in here are used by dynamic dispatch. It just redirects everything
 // to the Tensor method we manually bind in TensorBody.h.
diff --git a/aten/src/ATen/native/WeightNorm.cpp b/aten/src/ATen/native/WeightNorm.cpp
index bf258d80a0fb3..8291120f19603 100644
--- a/aten/src/ATen/native/WeightNorm.cpp
+++ b/aten/src/ATen/native/WeightNorm.cpp
@@ -1,11 +1,21 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/TensorUtils.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/TensorOperators.h>
 #include <ATen/native/cpu/WeightNormKernel.h>
 
-#include <cstring>
-#include <memory>
-#include <sstream>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_weight_norm_differentiable_backward_native.h>
+#include <ATen/ops/_weight_norm_interface.h>
+#include <ATen/ops/_weight_norm_native.h>
+#include <ATen/ops/empty_strided.h>
+#include <ATen/ops/norm_except_dim.h>
+#include <ATen/ops/norm_except_dim_native.h>
+#endif
+
 #include <vector>
 
 namespace at {
diff --git a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp
index 129ab3a973e3a..cf934586c74e7 100644
--- a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp
+++ b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp
@@ -1,5 +1,6 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
 #include <ATen/cpu/vec/vec.h>
 #include <ATen/native/UnfoldBackward.h>
diff --git a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu
index 90f5238d0180d..7865a7f61545f 100644
--- a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu
+++ b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu
@@ -1,6 +1,7 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/UnfoldBackward.h>
 
+#include <ATen/Dispatch.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/cuda/detail/OffsetCalculator.cuh>
 #include <ATen/cuda/CUDAContext.h>
diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp
index 24a23577e490e..5b38b02702828 100644
--- a/aten/src/ATen/native/group_norm.cpp
+++ b/aten/src/ATen/native/group_norm.cpp
@@ -1,15 +1,24 @@
-#include <ATen/ATen.h>
-#include <ATen/AccumulateType.h>
-#include <ATen/CPUApplyUtils.h>
-#include <ATen/Config.h>
-#include <ATen/NativeFunctions.h>
-#include <ATen/Parallel.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/group_norm.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/Parallel.h>
 #include <c10/util/accumulate.h>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like_native.h>
+#include <ATen/ops/group_norm_native.h>
+#include <ATen/ops/native_batch_norm.h>
+#include <ATen/ops/native_group_norm.h>
+#include <ATen/ops/native_group_norm_backward_native.h>
+#include <ATen/ops/native_group_norm_native.h>
+#endif
+
 #include <array>
 #include <functional>
-#include <numeric>
 #include <tuple>
 #include <vector>
 
diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp
index 80a7bb6111f23..71dc42da380b2 100644
--- a/aten/src/ATen/native/layer_norm.cpp
+++ b/aten/src/ATen/native/layer_norm.cpp
@@ -1,17 +1,26 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/layer_norm.h>
 
-#include <ATen/AccumulateType.h>
-#include <ATen/ATen.h>
-#include <ATen/Config.h>
-#include <ATen/CPUApplyUtils.h>
-#include <ATen/NativeFunctions.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/Parallel.h>
 #include <c10/util/irange.h>
-#include <torch/library.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/empty_like_native.h>
+#include <ATen/ops/layer_norm_native.h>
+#include <ATen/ops/native_batch_norm.h>
+#include <ATen/ops/native_layer_norm.h>
+#include <ATen/ops/native_layer_norm_backward_native.h>
+#include <ATen/ops/native_layer_norm_native.h>
+#include <ATen/ops/zeros_like_native.h>
+#endif
 
 #include <array>
-#include <functional>
-#include <numeric>
 #include <tuple>
 #include <vector>
 
diff --git a/aten/src/ATen/native/prim_native_functions.cpp b/aten/src/ATen/native/prim_native_functions.cpp
index 8f82345c19058..4e79c112d7fc6 100644
--- a/aten/src/ATen/native/prim_native_functions.cpp
+++ b/aten/src/ATen/native/prim_native_functions.cpp
@@ -1,4 +1,11 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/is_nonzero_native.h>
+#endif
 
 namespace at {
 namespace native {

From cf709008be0404e86789432108c2f94125275b0f Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 24 Oct 2022 16:36:25 -0700
Subject: [PATCH 0094/1922] Fix bernoulli functionalization. (#87573)

For testing, see https://github.com/pytorch/pytorch/issues/87571

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87573
Approved by: https://github.com/albanD
---
 .github/ci_commit_pins/xla.txt             | 2 +-
 aten/src/ATen/native/native_functions.yaml | 2 ++
 torch/_inductor/decomposition.py           | 6 ++++++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 3ab9c4394d70b..6d16c6159e998 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-0cb29daa04097c868d23ed666563a3439d67065c
+cf5dea047d1c9c63a201fb1b97b690416b683dde
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index faab6371c8af1..d514cae670855 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -985,6 +985,8 @@
   device_check: NoCheck   # TensorIterator
   variants: function, method
   tags: nondeterministic_seeded
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: bernoulli
 
 - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias=None) -> Tensor
 
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 6fed9ca691240..c22a8406b9b61 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -304,6 +304,12 @@ def bernoulli(self, *, generator=None):
     return torch.rand_like(self, dtype=torch.float32) < self
 
 
+@register_decomposition([aten.bernoulli.p])
+def bernoulli_p(self, p=0.5, *, generator=None):
+    assert generator is None
+    return torch.rand_like(self, dtype=torch.float32) < p
+
+
 """
 Some decomps result in differences from eager related to randomness.
 We put these decomps in a separate table `extra_random_decomps` to allow

From d444835764f2e1e609a248c0d9f9be81587cdb40 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 24 Oct 2022 19:40:19 -0400
Subject: [PATCH 0095/1922] Make me codeowner of test_aotdispatch.py (#87624)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87624
Approved by: https://github.com/albanD
---
 CODEOWNERS | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CODEOWNERS b/CODEOWNERS
index 8fdc5fc776632..3d030ad4d9e45 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -112,3 +112,6 @@ torch/csrc/autograd/profiler* @robieta
 torch/autograd/profiler* @robieta
 torch/csrc/profiler/ @robieta
 torch/profiler/ @robieta
+
+# AOTDispatch tests
+test/functorch/test_aotdispatch.py @ezyang @Chillee

From 5f987d9af2f328ddf2760e1a6c214647a5aa08a6 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Mon, 24 Oct 2022 23:52:44 +0000
Subject: [PATCH 0096/1922] Fix typo under docs directory (#87583)

This PR fixes typo in `.rst` files under docs directory

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87583
Approved by: https://github.com/kit1980
---
 docs/cpp/source/notes/tensor_cuda_stream.rst    | 2 +-
 docs/source/nested.rst                          | 2 +-
 docs/source/notes/modules.rst                   | 2 +-
 docs/source/notes/numerical_accuracy.rst        | 2 +-
 docs/source/quantization-accuracy-debugging.rst | 2 +-
 docs/source/quantization.rst                    | 2 +-
 docs/source/sparse.rst                          | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/cpp/source/notes/tensor_cuda_stream.rst b/docs/cpp/source/notes/tensor_cuda_stream.rst
index b80615e8f7f10..bdb66361d9a70 100644
--- a/docs/cpp/source/notes/tensor_cuda_stream.rst
+++ b/docs/cpp/source/notes/tensor_cuda_stream.rst
@@ -206,7 +206,7 @@ CUDA Stream Usage Examples
 
   // sum() on tensor0 uses default CUDA stream as current CUDA stream on device 0
   tensor0.sum();
-  // sum() on tensor1 uses defualt CUDA stream as current CUDA stream on device 1
+  // sum() on tensor1 uses default CUDA stream as current CUDA stream on device 1
   tensor1.sum();
 
 .. attention::
diff --git a/docs/source/nested.rst b/docs/source/nested.rst
index 4cfb5bdf701ae..21ff980256911 100644
--- a/docs/source/nested.rst
+++ b/docs/source/nested.rst
@@ -199,7 +199,7 @@ NestedTensor and any constraints they have.
    :func:`torch.add`; "Supports elementwise addition of two nested tensors.
    Supports addition of a scalar to a nested tensor."
    :func:`torch.mul`; "Supports elementwise multiplication of two nested tensors.
-   Supports multipication of a nested tensor by a scalar."
+   Supports multiplication of a nested tensor by a scalar."
    :func:`torch.select`; "Supports selecting along ``dim=0`` only (analogously ``nt[i]``)."
    :func:`torch.clone`; "Behavior is the same as on regular tensors."
    :func:`torch.detach`; "Behavior is the same as on regular tensors."
diff --git a/docs/source/notes/modules.rst b/docs/source/notes/modules.rst
index 7eea02dfa857f..49b27a0ae0142 100644
--- a/docs/source/notes/modules.rst
+++ b/docs/source/notes/modules.rst
@@ -599,7 +599,7 @@ PyTorch provides two types of hooks for modules:
 * **Forward hooks** are called during the forward pass. They can be installed for a given module with
   :func:`~torch.nn.Module.register_forward_pre_hook` and :func:`~torch.nn.Module.register_forward_hook`.
   These hooks will be called respectively just before the forward function is called and just after it is called.
-  Alternatively, these hooks can be installed globally for all modules with the analagous
+  Alternatively, these hooks can be installed globally for all modules with the analogous
   :func:`~torch.nn.modules.module.register_module_forward_pre_hook` and
   :func:`~torch.nn.modules.module.register_module_forward_hook` functions.
 * **Backward hooks** are called during the backward pass. They can be installed with
diff --git a/docs/source/notes/numerical_accuracy.rst b/docs/source/notes/numerical_accuracy.rst
index b1d05f9460419..fad14ed912027 100644
--- a/docs/source/notes/numerical_accuracy.rst
+++ b/docs/source/notes/numerical_accuracy.rst
@@ -34,7 +34,7 @@ even though mathematically it's an identical computation.
 
 Similarly, an operation applied to a tensor slice is not guaranteed to produce results that are
 identical to the slice of the result of the same operation applied to the full tensor. E.g. let
-``A`` be a 2-dimentional tensor. ``A.sum(-1)[0]`` is not guaranteed to be bitwise equal to
+``A`` be a 2-dimensional tensor. ``A.sum(-1)[0]`` is not guaranteed to be bitwise equal to
 ``A[:,0].sum()``.
 
 Extremal values
diff --git a/docs/source/quantization-accuracy-debugging.rst b/docs/source/quantization-accuracy-debugging.rst
index 69bda8706cc67..0fa590abd2f0c 100644
--- a/docs/source/quantization-accuracy-debugging.rst
+++ b/docs/source/quantization-accuracy-debugging.rst
@@ -6,7 +6,7 @@ accuracy. If a quantized model has error compared to the original model,
 we can categorize the error into:
 
 1. **data insensitive error** - caused by intrinsic model quantization error,
-   large portion of input data has large errror
+   large portion of input data has large error
 2. **data sensitive error** - caused by outlier input data, small
    portion of input data has large error
 3. **implementation error** - quantized kernel is not matching reference implementation
diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst
index 34cbad9b52cc3..e00720570a1a3 100644
--- a/docs/source/quantization.rst
+++ b/docs/source/quantization.rst
@@ -258,7 +258,7 @@ PTSQ API Example::
   # attach a global qconfig, which contains information about what kind
   # of observers to attach. Use 'fbgemm' for server inference and
   # 'qnnpack' for mobile inference. Other quantization configurations such
-  # as selecting symmetric or assymetric quantization and MinMax or L2Norm
+  # as selecting symmetric or asymmetric quantization and MinMax or L2Norm
   # calibration techniques can be specified here.
   model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
 
diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst
index 2da6a6faaee55..29790312cb3b8 100644
--- a/docs/source/sparse.rst
+++ b/docs/source/sparse.rst
@@ -117,7 +117,7 @@ Operator overview
 Fundamentally, operations on Tensor with sparse storage formats behave the same as
 operations on Tensor with strided (or other) storage formats. The particularities of
 storage, that is the physical layout of the data, influences the performance of
-an operation but shhould not influence the semantics.
+an operation but should not influence the semantics.
 
 
 We are actively increasing operator coverage for sparse tensors. Users should not

From fd80049e44992fec873d887e864f442d13e6d9e5 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Mon, 24 Oct 2022 15:37:20 -0400
Subject: [PATCH 0097/1922] Improve argument printing (#87601)

No more "expected tuple but got tuple".  We appropriately
grovel in the list/tuple for the element that mismatched
and report what exactly twinged the failure.

invalid_arguments.cpp is a shitshow so I did something
slapdash to get it not completely horrible.  See
https://github.com/pytorch/pytorch/issues/87514 for more context.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87601
Approved by: https://github.com/Chillee
---
 test/test_native_functions.py          | 42 +++++++++++++-
 torch/csrc/utils/invalid_arguments.cpp | 29 +++++++++-
 torch/csrc/utils/python_arg_parser.cpp | 77 ++++++++++++++++++++------
 torch/csrc/utils/python_arg_parser.h   |  3 +-
 4 files changed, 130 insertions(+), 21 deletions(-)

diff --git a/test/test_native_functions.py b/test/test_native_functions.py
index 831998cbf6be2..ba7889e10f4c5 100644
--- a/test/test_native_functions.py
+++ b/test/test_native_functions.py
@@ -19,6 +19,46 @@ def forward(self, values, incr: Optional[List[int]]):
 
 class TestNativeFunctions(TestCase):
 
+    def _lists_with_str(self):
+        return [
+            ("foo",),
+            (2, "foo"),
+            ("foo", 3),
+            ["foo"],
+            [2, "foo"],
+            ["foo", 3],
+            "foo",
+        ]
+
+    def _test_raises_str_typeerror(self, fn):
+        for arg in self._lists_with_str():
+            self.assertRaisesRegex(TypeError, "str", lambda: fn(arg))
+            try:
+                fn(arg)
+            except TypeError as e:
+                print(e)
+
+    def test_symintlist_error(self):
+        x = torch.randn(1)
+        self._test_raises_str_typeerror(lambda arg: torch._C._nn.pad(x, arg))
+
+    def test_vararg_symintlist_error(self):
+        self._test_raises_str_typeerror(lambda arg: torch.rand(arg))
+        self._test_raises_str_typeerror(lambda arg: torch.rand(*arg))
+
+    def test_symintlist_error_with_overload_but_is_unique(self):
+        x = torch.randn(1)
+        y = torch.randn(1)
+        self._test_raises_str_typeerror(lambda arg: x.set_(y, 0, arg))
+
+    def test_symintlist_error_with_overload(self):
+        x = torch.randn(1)
+        self._test_raises_str_typeerror(lambda arg: x.view(arg))
+
+    def test_intlist_error_with_overload(self):
+        x = torch.randn(1)
+        self._test_raises_str_typeerror(lambda arg: torch._C._nn.pad(x, arg))
+
     #
     # optional float list
     #
@@ -113,7 +153,7 @@ def fake_module(values, const):
         self.do_test_optional_intlist_with_module(fake_module)
 
     def test_optional_intlist_invalid(self):
-        with self.assertRaisesRegex(TypeError, "must be .* not"):
+        with self.assertRaisesRegex(TypeError, "must be .* but found"):
             IntListWrapperModule()(torch.zeros(1), [0.5])
 
         with self.assertRaisesRegex(RuntimeError, "value of type .* instead found type"):
diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp
index e76b9cf22ff50..9ff3e71fdc960 100644
--- a/torch/csrc/utils/invalid_arguments.cpp
+++ b/torch/csrc/utils/invalid_arguments.cpp
@@ -272,7 +272,34 @@ std::string _formattedArgDesc(
       result += red;
     if (is_kwarg)
       result += option.arguments[i].name + "=";
-    result += py_typename(arg);
+    bool is_tuple = PyTuple_Check(arg);
+    if (is_tuple || PyList_Check(arg)) {
+      result += py_typename(arg) + " of ";
+      auto num_elements = PySequence_Length(arg);
+      if (is_tuple) {
+        result += "(";
+      } else {
+        result += "[";
+      }
+      for (const auto i : c10::irange(num_elements)) {
+        if (i != 0) {
+          result += ", ";
+        }
+        result += py_typename(
+            py::reinterpret_steal<py::object>(PySequence_GetItem(arg, i))
+                .ptr());
+      }
+      if (is_tuple) {
+        if (num_elements == 1) {
+          result += ",";
+        }
+        result += ")";
+      } else {
+        result += "]";
+      }
+    } else {
+      result += py_typename(arg);
+    }
     if (is_matching)
       result += reset_green;
     else
diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp
index 177346614704f..f338d3f196adc 100644
--- a/torch/csrc/utils/python_arg_parser.cpp
+++ b/torch/csrc/utils/python_arg_parser.cpp
@@ -664,7 +664,10 @@ bool is_float_or_complex_list(PyObject* obj) {
   return true;
 }
 
-static bool is_int_list(PyObject* obj, int broadcast_size) {
+static bool is_int_list(
+    PyObject* obj,
+    int broadcast_size,
+    int64_t* failed_idx = nullptr) {
   if (PyTuple_Check(obj) || PyList_Check(obj)) {
     auto len = PySequence_Size(obj);
     if (len == 0) {
@@ -684,6 +687,9 @@ static bool is_int_list(PyObject* obj, int broadcast_size) {
     for (int i = 1; i < len; i++) {
       if (torch::is_symint_node(
               py::reinterpret_steal<py::object>(PySequence_GetItem(obj, i)))) {
+        if (failed_idx != nullptr) {
+          *failed_idx = i;
+        }
         return false;
       }
     }
@@ -694,9 +700,13 @@ static bool is_int_list(PyObject* obj, int broadcast_size) {
 
     // NOTE: JIT tracer allows arbitrary scalar tensors to act as ints
     // in an intlist argument. Even float or complex scalar tensors.
-    return (
-        jit::tracer::isTracing() && THPVariable_Check(item.ptr()) &&
-        THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{});
+    bool r =
+        (jit::tracer::isTracing() && THPVariable_Check(item.ptr()) &&
+         THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{});
+    if (!r && failed_idx != nullptr) {
+      *failed_idx = 0;
+    }
+    return r;
   }
   // if a size is specified (e.g. IntArrayRef[2]) we also allow passing a single
   // int
@@ -711,7 +721,10 @@ static bool is_int_or_symint(PyObject* obj) {
   return torch::is_symint_node(py::handle(obj)) || THPUtils_checkIndex(obj);
 }
 
-static bool is_int_or_symint_list(PyObject* obj, int broadcast_size) {
+static bool is_int_or_symint_list(
+    PyObject* obj,
+    int broadcast_size,
+    int64_t* failed_idx = nullptr) {
   if (PyTuple_Check(obj) || PyList_Check(obj)) {
     if (PySequence_Size(obj) == 0) {
       return true;
@@ -723,9 +736,13 @@ static bool is_int_or_symint_list(PyObject* obj, int broadcast_size) {
     }
     // NOTE: JIT tracer allows arbitrary scalar tensors to act as ints
     // in an intlist argument. Even float or complex scalar tensors.
-    return (
-        jit::tracer::isTracing() && THPVariable_Check(item.ptr()) &&
-        THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{});
+    bool r =
+        (jit::tracer::isTracing() && THPVariable_Check(item.ptr()) &&
+         THPVariable_Unpack(item.ptr()).sizes() == c10::IntArrayRef{});
+    if (!r && failed_idx != nullptr) {
+      *failed_idx = 0;
+    }
+    return r;
   }
   // if a size is specified (e.g. IntArrayRef[2]) we also allow passing a single
   // int
@@ -736,7 +753,8 @@ static bool is_int_or_symint_list(PyObject* obj, int broadcast_size) {
 auto FunctionParameter::check(
     PyObject* obj,
     std::vector<py::handle>& overloaded_args,
-    int argnum) -> bool {
+    int argnum,
+    int64_t* failed_idx) -> bool {
   switch (type_) {
     case ParameterType::TENSOR: {
       if (is_tensor_and_append_overloaded(obj, &overloaded_args)) {
@@ -793,7 +811,7 @@ auto FunctionParameter::check(
           obj, &overloaded_args, argnum, true /* throw_error */);
     }
     case ParameterType::INT_LIST:
-      return is_int_list(obj, size);
+      return is_int_list(obj, size, failed_idx);
     case ParameterType::FLOAT_LIST:
       return is_float_or_complex_list(obj);
     case ParameterType::GENERATOR:
@@ -824,12 +842,13 @@ auto FunctionParameter::check(
     case ParameterType::SYM_INT:
       return is_int_or_symint(obj);
     case ParameterType::SYM_INT_LIST:
-      return is_int_or_symint_list(obj, size);
+      return is_int_or_symint_list(obj, size, failed_idx);
     default:
       throw std::runtime_error("unknown parameter type");
   }
 }
 
+// WARNING: these strings are parsed invalid_arguments.cpp
 std::string FunctionParameter::type_name() const {
   switch (type_) {
     case ParameterType::TENSOR:
@@ -837,9 +856,10 @@ std::string FunctionParameter::type_name() const {
     case ParameterType::SCALAR:
       return "Number";
     case ParameterType::INT64:
-      return "int";
+    // NB: SymInt is intentionally not mentioned here, as conventional user
+    // use will only know about ints
     case ParameterType::SYM_INT:
-      return "SymInt";
+      return "int";
     case ParameterType::DOUBLE:
       return "float";
     case ParameterType::COMPLEX:
@@ -877,7 +897,7 @@ std::string FunctionParameter::type_name() const {
     case ParameterType::SCALAR_LIST:
       return "tuple of Scalars";
     case ParameterType::SYM_INT_LIST:
-      return "tuple of SymInts";
+      return "tuple of ints";
     default:
       throw std::runtime_error("unknown parameter type");
   }
@@ -1341,6 +1361,8 @@ bool FunctionSignature::parse(
       is_kwd = true;
     }
 
+    int64_t failed_idx = -1;
+    bool varargs_eligible = allow_varargs_intlist && arg_pos == 0 && !is_kwd;
     if ((!obj && param.optional) || (obj == Py_None && param.allow_none)) {
       dst[i++] = nullptr;
     } else if (!obj) {
@@ -1349,15 +1371,16 @@ bool FunctionSignature::parse(
         missing_args(*this, i);
       }
       return false;
-    } else if (param.check(obj, this->overloaded_args, i)) {
+    } else if (param.check(obj, this->overloaded_args, i, &failed_idx)) {
       dst[i++] = obj;
       // XXX: the Variable check is necessary because sizes become tensors when
       // tracer is enabled. This behavior easily leads to ambiguities, and we
       // should avoid having complex signatures that make use of it...
     } else if (
-        allow_varargs_intlist && arg_pos == 0 && !is_kwd &&
-        ((int_list_overload ? is_int_list(args, param.size)
-                            : is_int_or_symint_list(args, param.size)))) {
+        varargs_eligible &&
+        ((int_list_overload
+              ? is_int_list(args, param.size, &failed_idx)
+              : is_int_or_symint_list(args, param.size, &failed_idx)))) {
       // take all positional arguments as this parameter
       // e.g. permute(1, 2, 3) -> permute((1, 2, 3))
       dst[i++] = args;
@@ -1374,6 +1397,24 @@ bool FunctionSignature::parse(
             Py_TYPE(obj)->tp_name);
       } else {
         // foo(): argument 'other' (position 2) must be str, not int
+        if (failed_idx != -1) {
+          if (!(PyTuple_Check(obj) || PyList_Check(obj))) {
+            TORCH_INTERNAL_ASSERT(varargs_eligible);
+            obj = args;
+          }
+          TORCH_INTERNAL_ASSERT(failed_idx < PySequence_Size(obj));
+          throw TypeError(
+              "%s(): argument '%s' (position %ld) must be %s, but found element of type %s at pos %ld",
+              name.c_str(),
+              param.name.c_str(),
+              static_cast<long>(arg_pos + 1),
+              param.type_name().c_str(),
+              Py_TYPE(py::reinterpret_steal<py::object>(
+                          PySequence_GetItem(obj, failed_idx))
+                          .ptr())
+                  ->tp_name,
+              static_cast<long>(failed_idx));
+        }
         throw TypeError(
             "%s(): argument '%s' (position %ld) must be %s, not %s",
             name.c_str(),
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index a08441369db82..acb830addf8f7 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -382,7 +382,8 @@ struct FunctionParameter {
   bool check(
       PyObject* obj,
       std::vector<py::handle>& overloaded_args,
-      int argnum);
+      int argnum,
+      int64_t* failed_idx = nullptr);
 
   void set_default_str(const std::string& str);
   std::string type_name() const;

From 484ef6bd0663cac0d044efee9e32a2952dee40f7 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Mon, 24 Oct 2022 15:37:20 -0400
Subject: [PATCH 0098/1922] Fix a PyObject leak (#87608)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87608
Approved by: https://github.com/ezyang
---
 torch/csrc/utils/invalid_arguments.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/utils/invalid_arguments.cpp b/torch/csrc/utils/invalid_arguments.cpp
index 9ff3e71fdc960..49f591d1a64b3 100644
--- a/torch/csrc/utils/invalid_arguments.cpp
+++ b/torch/csrc/utils/invalid_arguments.cpp
@@ -82,7 +82,9 @@ struct SequenceType : public Type {
       return false;
     auto num_elements = PySequence_Length(object);
     for (const auto i : c10::irange(num_elements)) {
-      if (!type->is_matching(PySequence_GetItem(object, i)))
+      if (!type->is_matching(
+              py::reinterpret_steal<py::object>(PySequence_GetItem(object, i))
+                  .ptr()))
         return false;
     }
     return true;

From 67fe3ec534eac27bf8dbbb79aa10007ce47fb777 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Mon, 24 Oct 2022 15:37:20 -0400
Subject: [PATCH 0099/1922] Add /= to c10::SymInt (#87603)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87603
Approved by: https://github.com/bdhirsh
---
 c10/core/SymInt.cpp | 4 ++++
 c10/core/SymInt.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp
index 03f39078b406c..5ef576b3af1b0 100644
--- a/c10/core/SymInt.cpp
+++ b/c10/core/SymInt.cpp
@@ -155,6 +155,10 @@ void SymInt::operator*=(SymInt sci) {
   *this = *this * sci;
 }
 
+void SymInt::operator/=(SymInt sci) {
+  *this = *this / sci;
+}
+
 void SymInt::operator+=(SymInt sci) {
   *this = *this + sci;
 }
diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h
index f5c2ddf00998e..6934a607ccbff 100644
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@@ -169,6 +169,7 @@ class C10_API SymInt {
   bool operator>=(SymInt sci) const;
   void operator*=(SymInt sci);
   void operator+=(SymInt sci);
+  void operator/=(SymInt sci);
 
   SymInt min(SymInt sci) const;
   SymInt max(SymInt sci) const;

From b2537ede13c5f736c848a9fab9af4347fcd8530c Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Tue, 25 Oct 2022 00:00:57 +0000
Subject: [PATCH 0100/1922] Fix typo in secrets name (#87655)

They are case sensitive and should be all uppercase

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87655
Approved by: https://github.com/kit1980, https://github.com/weiwangmeta
---
 .github/workflows/build-triton-wheel.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml
index f602eaa30af4d..e3f02e6b77b36 100644
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@@ -137,8 +137,8 @@ jobs:
         env:
           PKG_DIR: "${{ runner.temp }}/artifacts"
           # When running these on pull_request events these should be blank
-          AWS_ACCESS_KEY_ID: ${{ secrets.aws-access-key-id }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.aws-pytorch-uploader-secret-access-key }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_S3_UPDATE_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_S3_UPDATE_SECRET_ACCESS_KEY }}
           UPLOAD_BUCKET: "s3://pytorch"
         run: |
             set -ex

From b4b2b9ee327f507ab1929f0a8bdf43b1d8e0987f Mon Sep 17 00:00:00 2001
From: Masaki Kozuki <mkozuki@nvidia.com>
Date: Tue, 25 Oct 2022 00:11:50 +0000
Subject: [PATCH 0101/1922] [docs] `batch_isend_irecv` and `P2POp` of
 torch.distributed (#86438)

Reopening https://github.com/pytorch/pytorch/pull/79722

cc @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @H-Huang @kwen2501 @awgu
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86438
Approved by: https://github.com/kit1980
---
 docs/source/distributed.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst
index 8b1186fb4ceec..530ff88721048 100644
--- a/docs/source/distributed.rst
+++ b/docs/source/distributed.rst
@@ -350,6 +350,10 @@ as they should never be created manually, but they are guaranteed to support two
 
 .. autofunction:: irecv
 
+.. autofunction:: batch_isend_irecv
+
+.. autoclass:: P2POp
+
 Synchronous and asynchronous collective operations
 --------------------------------------------------
 Every collective operation function supports the following two kinds of operations,

From e7fc9e1138db5f58f20c4b0fa1907d1c113700b2 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 25 Oct 2022 00:18:31 +0000
Subject: [PATCH 0102/1922] Fix TensorShape.cpp compilation (#87654)

Build failure introduced by landrace while merging https://github.com/pytorch/pytorch/pull/75575

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87654
Approved by: https://github.com/albanD
---
 aten/src/ATen/native/TensorShape.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index e1f9835184cbd..d25113577b2d5 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -89,13 +89,14 @@
 #include <ATen/ops/detach_copy_native.h>
 #include <ATen/ops/detach_native.h>
 #include <ATen/ops/diag.h>
-#include <ATen/ops/diag_backward_native.h>
+#include <ATen/ops/diag_embed.h>
 #include <ATen/ops/diag_embed_native.h>
 #include <ATen/ops/diag_native.h>
 #include <ATen/ops/diagflat_native.h>
 #include <ATen/ops/diagonal.h>
 #include <ATen/ops/diagonal_backward.h>
 #include <ATen/ops/diagonal_backward_native.h>
+#include <ATen/ops/diagonal_copy.h>
 #include <ATen/ops/diagonal_copy_native.h>
 #include <ATen/ops/diagonal_native.h>
 #include <ATen/ops/diagonal_scatter_native.h>

From 7cdfb6fef8b6de1f02211a0ba5a3036506910ef0 Mon Sep 17 00:00:00 2001
From: Aaron Enye Shi <enye.shi@gmail.com>
Date: Tue, 25 Oct 2022 00:50:13 +0000
Subject: [PATCH 0103/1922] [Kineto][Profiler] Rename Profiler post processing
 Index Key (#87477)

Summary: Rather than using the full name Profiler Event Index, use a shorten name Ev Idx. In the future, we should address this by adding a lookup table of short name to long name.

Test Plan: CI

Reviewed By: robieta, slgong-fb

Differential Revision: D40328758

Pulled By: aaronenyeshi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87477
Approved by: https://github.com/chaekit
---
 torch/csrc/profiler/collection.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index e76cfd5946db9..01b7c4024f269 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -522,7 +522,7 @@ void mark_finished(std::shared_ptr<Result>& r) {
   TORCH_INTERNAL_ASSERT(r->endTimeNS() >= r->start_time_ns_, r->name());
 }
 
-static constexpr const char* indexKey = "Profiler Event Index";
+static constexpr const char* indexKey = "Ev Idx";
 
 void passEventsToKineto(
     const std::vector<std::shared_ptr<Result>>& results,

From 0752e41f6b48585512fe685b27ef1aae5329e932 Mon Sep 17 00:00:00 2001
From: erjia <erjia@fb.com>
Date: Tue, 25 Oct 2022 01:27:56 +0000
Subject: [PATCH 0104/1922] [DataLoader2] Change serialization wrapper to
 iterator (#87459)

This is temporary fix for internal SEV. We have run three different workflows to validate this fix would unblock internal SEV.
And, those are a few following-up tasks:
- [ ] Create reproducible test for multithreading with generator
- [ ] Figure out how to make fullsynciterator is working properly with generator
- [ ] Move Wrapper back to generator if needed
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87459
Approved by: https://github.com/NivekT
---
 torch/utils/data/datapipes/datapipe.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/torch/utils/data/datapipes/datapipe.py b/torch/utils/data/datapipes/datapipe.py
index 43adc00bdddaf..42120148d0269 100644
--- a/torch/utils/data/datapipes/datapipe.py
+++ b/torch/utils/data/datapipes/datapipe.py
@@ -356,8 +356,17 @@ def __len__(self):
 
 
 class _IterDataPipeSerializationWrapper(_DataPipeSerializationWrapper, IterDataPipe):
-    def __iter__(self):
-        yield from self._datapipe
+    def __init__(self, datapipe: IterDataPipe[T_co]):
+        super().__init__(datapipe)
+        self._datapipe_iter: Optional[Iterator[T_co]] = None
+
+    def __iter__(self) -> "_IterDataPipeSerializationWrapper":
+        self._datapipe_iter = iter(self._datapipe)
+        return self
+
+    def __next__(self) -> T_co:
+        assert self._datapipe_iter is not None
+        return next(self._datapipe_iter)
 
 
 class _MapDataPipeSerializationWrapper(_DataPipeSerializationWrapper, MapDataPipe):

From 9996730a0de852d076aba81dbcc44180012f0566 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 25 Oct 2022 01:45:23 +0000
Subject: [PATCH 0105/1922] Add cached conda env files for macos (arm64, x86)
 (#87541)

So far, we only cache macos conda dependency for build workflow.  All the test dependencies are still not cached and installed by the CI. This PR introduces a new `.github/requirements` directory which I plan to explicitly include all the conda and pip build and test dependencies across all platforms.  This allows pip and conda installation to be consolidated in one place (and properly cached)

Those conda dependencies come from https://github.com/pytorch/pytorch/blob/master/.jenkins/pytorch/macos-common.sh.  Once this PR is merged, I will follow up with another one to clean up all conda installation from that file (to make sure that nothing break along the way)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87541
Approved by: https://github.com/ZainRizvi
---
 .github/requirements/README.md             | 19 +++++++++++++++++++
 .github/requirements/conda-env-macOS-ARM64 | 16 ++++++++++++++++
 .github/requirements/conda-env-macOS-X64   | 18 ++++++++++++++++++
 .github/workflows/_mac-test.yml            |  3 ++-
 4 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 .github/requirements/README.md
 create mode 100644 .github/requirements/conda-env-macOS-ARM64
 create mode 100644 .github/requirements/conda-env-macOS-X64

diff --git a/.github/requirements/README.md b/.github/requirements/README.md
new file mode 100644
index 0000000000000..654bb04558b9b
--- /dev/null
+++ b/.github/requirements/README.md
@@ -0,0 +1,19 @@
+### Cached requirements and consolidation of conda and pip installation
+
+At the moment, the installation of conda and pip dependencies happens at
+different places in the CI depending at the whim of different
+developers, which makes it very challenging to handle issues like
+network flakiness or upstream dependency failures gracefully. So, this
+center directory is created to gradually include all the conda enviroment
+and pip requirement files that are used to setup CI jobs. Not only it
+gives a clear picture of all the dependencies required by different CI
+jobs, but it also allows them to be cached properly to improve CI
+reliability.
+
+The list of support files are as follows:
+
+* Conda:
+  * conda-env-macOS-ARM64. This is used by MacOS (m1, arm64) build and
+    test jobs to setup the conda environment
+  * conda-env-macOS-X64. This is use by MacOS (x86-64) build and test
+    jobs to setup the conda environment
diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64
new file mode 100644
index 0000000000000..6e7e4221a85ba
--- /dev/null
+++ b/.github/requirements/conda-env-macOS-ARM64
@@ -0,0 +1,16 @@
+numpy=1.22.3
+pyyaml=6.0
+setuptools=61.2.0
+cmake=3.22.1
+cffi=1.15.1
+typing_extensions=4.3.0
+dataclasses=0.8
+pip=22.2.2
+six=1.16.0
+pillow=9.2.0
+libuv=1.39.0
+pkg-config=0.29.2
+wheel=0.37.1
+
+# Not pinning certifi so that we can always get the latest certificates
+certifi
diff --git a/.github/requirements/conda-env-macOS-X64 b/.github/requirements/conda-env-macOS-X64
new file mode 100644
index 0000000000000..81463d4b39d56
--- /dev/null
+++ b/.github/requirements/conda-env-macOS-X64
@@ -0,0 +1,18 @@
+mkl=2021.2.0
+mkl-include=2021.2.0
+numpy=1.18.5
+pyyaml=5.3
+setuptools=46.0.0
+cmake=3.22.1
+cffi=1.15.1
+typing_extensions=4.3.0
+dataclasses=0.8
+pip=22.2.2
+six=1.16.0
+pillow=9.2.0
+libuv=1.40.0
+pkg-config=0.29.2
+wheel=0.37.1
+
+# Not pinning certifi so that we can always get the latest certificates
+certifi
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index 72ee311498503..db524cae464b6 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -82,7 +82,6 @@ jobs:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
 
-
       - name: Download build artifacts
         uses: ./.github/actions/download-build-artifacts
         with:
@@ -94,12 +93,14 @@ jobs:
         uses: pytorch/test-infra/.github/actions/setup-miniconda@main
         with:
           python-version: 3.8
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
 
       - name: Setup miniconda (arm64, py3.9)
         if: ${{ runner.arch == 'ARM64' }}
         uses: pytorch/test-infra/.github/actions/setup-miniconda@main
         with:
           python-version: 3.9
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
 
       - name: Start monitoring script
         id: monitor-script

From 68a21b7ca5e2ac9f43df576212f953b3e30c3b7b Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Mon, 24 Oct 2022 21:52:12 +0000
Subject: [PATCH 0106/1922] Defer importing meta_table (#87630)

This is needed to work around an internal test failure: https://www.internalfb.com/tasks/?t=135878641

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87630
Approved by: https://github.com/eellison, https://github.com/khabinov
---
 torch/_subclasses/fake_tensor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index bb6970303facd..56b6d4b826af7 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -9,7 +9,6 @@
 from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union
 
 import torch
-from torch._decomp import meta_table as meta_table
 from torch._ops import OpOverload
 from torch._subclasses.meta_utils import MetaConverter, WeakTensorRefKey
 from torch.fx.operator_schemas import normalize_function
@@ -761,6 +760,8 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             has_symbolic_sizes
             and func not in self.functions_with_cpp_meta_impl_that_support_symint
         ):
+            from torch._decomp import meta_table as meta_table
+
             with no_dispatch():
                 if func == aten.size.default:
                     sys.stderr.write(

From c682fa471571ea279710785ef63162740983843a Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Tue, 25 Oct 2022 02:49:11 +0000
Subject: [PATCH 0107/1922] Fix incorrect param names in get_testing_overrides
 (#87625)

This PR fixes incorrect parameter names for lambda in `get_testing_overrides()`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87625
Approved by: https://github.com/kit1980
---
 torch/overrides.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/overrides.py b/torch/overrides.py
index c463cf3ca94d4..95e7c66111b5b 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -416,7 +416,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.concatenate: lambda tensors, dim=0, out=None: -1,  # alias for torch.concatenate
         torch.cdist: lambda x1, x2, p=2.0, compute_mode='use_mm_for_euclid_dist_if_necessary': -1,
         torch.ceil: lambda input, out=None: -1,
-        torch.celu: lambda input, alhpa=1., inplace=False: -1,
+        torch.celu: lambda input, alpha=1., inplace=False: -1,
         torch.chain_matmul: lambda *matrices, out=None: -1,
         torch.channel_shuffle: lambda input, groups : -1,
         torch.cholesky: lambda input, upper=False, out=None: -1,
@@ -572,7 +572,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.grid_sampler_2d: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1,
         torch.grid_sampler_3d: lambda input, grid, interpolation_mode, padding_mode, align_corners: -1,
         torch.group_norm: lambda input, num_groups, weight=None, bias=None, eps=1e-05, cudnn_enabled=True: -1,
-        torch.gru: lambda input, hx, params, has_biases, num_layers, gropout, train, bidirectional, batch_first: -1,
+        torch.gru: lambda input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first: -1,
         torch.gru_cell: lambda input, hx, w_ih, w_hh, b_ih=None, b_hh=None: -1,
         torch.gt: lambda input, other, out=None: -1,
         torch.greater: lambda input, other, out=None: -1,

From e2d130fca32a1c04ab0692fb2c20f46e09b3145a Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 25 Oct 2022 03:22:27 +0000
Subject: [PATCH 0108/1922] [Inductor] Truncate function expr str if it's too
 long at RecordLoadStore (#87248)

See context at https://github.com/pytorch/torchdynamo/issues/1352#issuecomment-1283131872
Fixes https://github.com/pytorch/torchdynamo/issues/1352

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @penguinwu
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87248
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 25 +++++++++++++++++++++++++
 torch/_inductor/dependencies.py     | 10 ++++++++++
 torch/_inductor/virtualized.py      | 10 +++++++---
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index e0501e0e8adef..c106658b21c0e 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -3523,6 +3523,31 @@ def fn(a, b, c):
             ],
         )
 
+    # From https://github.com/pytorch/torchdynamo/issues/1352
+    def test_max_pool2d_with_indices_backward4(self):
+        def fn(a, b, c):
+            return aten.max_pool2d_with_indices_backward(
+                a, b, [5, 5], [1, 1], [2, 2], [1, 1], False, c
+            )
+
+        x = torch.randn([2, 64, 3, 4])
+        result, indices = aten.max_pool2d_with_indices(
+            x,
+            [5, 5],
+            [1, 1],
+            2,
+            1,
+            False,
+        )
+        self.common(
+            fn,
+            [
+                torch.randn_like(result),
+                x,
+                indices,
+            ],
+        )
+
     def test_avg_pool2d_backward(self):
         def fn(a, b):
             return aten.avg_pool2d_backward(
diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py
index 6eee943b60074..27c92f82c07c9 100644
--- a/torch/_inductor/dependencies.py
+++ b/torch/_inductor/dependencies.py
@@ -7,6 +7,7 @@
 
 import sympy
 
+from . import config
 from .codegen.common import index_prevent_reordering
 from .utils import sympy_product, sympy_str, sympy_subs, sympy_symbol, VarRanges
 from .virtualized import V
@@ -146,6 +147,15 @@ def __init__(self, var_ranges: VarRanges, normalize: bool):
         self._var_ranges: VarRanges = var_ranges
         self._normalize: bool = normalize
 
+    # Truncate the expr str by a threshold to prevent it's too long
+    # and cause process hanging. The result is not used.
+    # https://github.com/pytorch/torchdynamo/issues/1352
+    @staticmethod
+    def truncate_expr(expr):
+        if len(expr) > config.realize_bytes_threshold:
+            expr = f"{expr[:config.realize_bytes_threshold]}..."
+        return expr
+
     def canonicalize(
         self, index: sympy.Expr
     ) -> Tuple[sympy.Expr, Tuple[sympy.Expr, ...]]:
diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py
index 64c221895a91b..5d40d05f751f9 100644
--- a/torch/_inductor/virtualized.py
+++ b/torch/_inductor/virtualized.py
@@ -60,13 +60,17 @@ def __getattr__(self, name):
         def inner(*args, **kwargs):
             fargs = [_arg_str(a) for a in args]
             fargs.extend(f"{k}={v}" for k, v in kwargs.items())
-            return f"{name}({', '.join(fargs)})"
+            return self.truncate_expr(f"{name}({', '.join(fargs)})")
 
         return inner
 
     @staticmethod
-    def masked(mask, body, other):
-        return f"masked({mask}, {body()}, {other})"
+    def truncate_expr(expr):
+        return expr
+
+    @classmethod
+    def masked(cls, mask, body, other):
+        return cls.truncate_expr(f"masked({mask}, {body()}, {other})")
 
     @staticmethod
     def indirect_indexing(index_var):

From f59154d6ed3c88ca04af23418afd32cd3495bea0 Mon Sep 17 00:00:00 2001
From: Tom Stein <dev@tomstein.me>
Date: Tue, 25 Oct 2022 04:07:16 +0000
Subject: [PATCH 0109/1922] [Python] refactor slices on sorted (#86995)

Sometimes you want to query the small element of a set of elements and use `sorted(elements)[0]` without a second thought. However, this is not optimal, since the entire list must be sorted first `O(n log n)`. It would be better to use the `min(elements)` method provided for this purpose `O(n)`.
Furthermore `sorted(elements)[::-1]` is not very efficient, because it would be better to use `sorted(elements, reverse=True)` to save the slice operation.

**TLDR: using `sorted(elements)[0]` is slow and can be replaced with `min(elements)`.**

I stumbled across these code snippets while playing around with CodeQL (see https://lgtm.com/query/4148064474379348546/).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86995
Approved by: https://github.com/jansel
---
 tools/testing/test_selections.py         | 6 +++---
 torch/distributed/rpc/api.py             | 2 +-
 torch/fx/experimental/symbolic_shapes.py | 4 ++--
 torch/masked/maskedtensor/reductions.py  | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py
index 3b33281781894..766ec0ff1fe7f 100644
--- a/tools/testing/test_selections.py
+++ b/tools/testing/test_selections.py
@@ -45,14 +45,14 @@ def calculate_shards(
     ]
     for test in sorted_tests:
         if must_serial(test):
-            min_sharded_job = sorted(sharded_jobs, key=lambda j: j.get_total_time())[0]
+            min_sharded_job = min(sharded_jobs, key=lambda j: j.get_total_time())
             min_sharded_job.serial.append(test)
         else:
-            min_sharded_job = sorted(sharded_jobs, key=lambda j: j.get_total_time())[0]
+            min_sharded_job = min(sharded_jobs, key=lambda j: j.get_total_time())
             min_sharded_job.parallel.append(test)
 
     # Round robin the unknown jobs starting with the smallest shard
-    index = sorted(range(num_shards), key=lambda i: sharded_jobs[i].get_total_time())[0]
+    index = min(range(num_shards), key=lambda i: sharded_jobs[i].get_total_time())
     for test in unknown_tests:
         sharded_jobs[index].serial.append(test)
         index = (index + 1) % num_shards
diff --git a/torch/distributed/rpc/api.py b/torch/distributed/rpc/api.py
index 8eda3a729c380..f5e544806822d 100644
--- a/torch/distributed/rpc/api.py
+++ b/torch/distributed/rpc/api.py
@@ -191,7 +191,7 @@ def _all_gather(obj, worker_names=None, timeout=UNSET_RPC_TIMEOUT):
             _ALL_WORKER_NAMES is not None
         ), "`_ALL_WORKER_NAMES` is not initialized for `def _all_gather`."
         worker_names = _ALL_WORKER_NAMES
-    leader_name = sorted(worker_names)[0]
+    leader_name = min(worker_names)
 
     self_name = _get_current_rpc_agent().get_worker_info().name
 
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 3e80c8c9f4906..7615e410e2515 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -398,13 +398,13 @@ def create_symbolic_sizes_strides(self, ex: torch.Tensor):
                     candidates[ex.size(i) * ex.stride()[i]] = size[i] * stride[i]
             if any(x is None for x in stride):
                 # bind the smallest unbound stride to a new variable
-                val, i = sorted(
+                val, i = min(
                     [
                         (ex.stride()[i], i)
                         for i in range(len(stride))
                         if stride[i] is None
                     ]
-                )[0]
+                )
                 stride[i] = self.create_symbol(val)
         assert all(x is not None for x in stride)
         return [self.create_symintnode(i) for i in size], [self.create_symintnode(i) for i in stride]  # type: ignore[arg-type]
diff --git a/torch/masked/maskedtensor/reductions.py b/torch/masked/maskedtensor/reductions.py
index 137ae58e6e190..210af5d6c09cc 100644
--- a/torch/masked/maskedtensor/reductions.py
+++ b/torch/masked/maskedtensor/reductions.py
@@ -31,7 +31,7 @@ def _masked_all(*args, **kwargs):
 def _multidim_any(mask, dim, keepdim):
     if isinstance(dim, int):
         return _multidim_any(mask, [dim], keepdim)
-    for d in sorted(dim)[::-1]:
+    for d in sorted(dim, reverse=True):
         mask = torch.any(mask, dim=d, keepdim=keepdim)
     return mask
 

From 9e6517a6bd5af20f082cc80dee6719db8df21d35 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Mon, 24 Oct 2022 12:30:45 -0700
Subject: [PATCH 0110/1922] Fix use after free in tensorpipe agent (#87627)

Fixes #87359, which identifies use after free for reverse device maps. This is only in the dynamic RPC feature and not effecting stable RPC code path.

Unfortunately the test `TensorPipeRpcTest.test_dynamic_rpc_existing_rank_can_communicate_with_new_rank_cuda` that is failing is also running into separate issue. I've temporarily disabled some of the test code to investigate the error in asychronously.

Testing plan:
- tested all the dynamic RPC tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87627
Approved by: https://github.com/rohan-varma
---
 .../csrc/distributed/rpc/tensorpipe_agent.cpp | 20 +++++++++------
 .../_internal/distributed/rpc/rpc_test.py     | 25 +++++++++++--------
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
index 2480b21d105f1..c885713637421 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -1260,18 +1260,22 @@ void TensorPipeAgent::updateGroupMembership(
     workerNameToInfo_.erase(name);
     workerNameToURL_.erase(name);
 
-    for (const auto& it : reverseDeviceMaps_) {
-      if (reverseDeviceMaps.find(it.first) == reverseDeviceMaps.end()) {
-        reverseDeviceMaps_.erase(it.first);
+    // remove reverse device maps that are no longer used
+    for (auto it = reverseDeviceMaps_.begin();
+         it != reverseDeviceMaps_.end();) {
+      if (reverseDeviceMaps.find(it->first) == reverseDeviceMaps.end()) {
+        it = reverseDeviceMaps_.erase(it);
+      } else {
+        it++;
       }
     }
 
-    auto iter = devices_.begin();
-    while (iter != devices_.end()) {
-      if (std::find(devices.begin(), devices.end(), *iter) == devices.end()) {
-        iter = devices_.erase(iter);
+    // remove devices that are no longer used
+    for (auto it = devices_.begin(); it != devices_.end();) {
+      if (std::find(devices.begin(), devices.end(), *it) == devices.end()) {
+        it = devices_.erase(it);
       } else {
-        iter++;
+        it++;
       }
     }
   }
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index 2c59629aec633..764117f43dbf6 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -5106,16 +5106,21 @@ def test_dynamic_rpc_existing_rank_can_communicate_with_new_rank_cuda(self):
                 rpc_backend_options=self.rpc_backend_options,
             )
 
-        dist.barrier()
-        if self.rank == 0:
-            for i in range(1, self.world_size):
-                x = torch.ones(2)
-                result_on_device_0 = rpc.rpc_sync(worker_name(i), torch.add, args=(x.to(0), 1))
-                result_on_device_1 = rpc.rpc_sync(worker_name(i), torch.add, args=(x.to(1), 1))
-                self.assertEqual(torch.add(torch.ones(2), 1), result_on_device_0)
-                self.assertEqual(torch.device('cuda:0'), result_on_device_0.device)
-                self.assertEqual(torch.add(torch.ones(2), 1), result_on_device_1)
-                self.assertEqual(torch.device('cuda:1'), result_on_device_1.device)
+        # TODO: Cuda RPC is failing due to:
+        # terminate called after throwing an instance of 'c10::Error'
+        # what():  0 <= device && static_cast<size_t>(device) < device_allocator.size()
+        # INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":1937,
+        # please report a bug to PyTorch. Allocator not initialized for device 1: did you call init?
+        # dist.barrier()
+        # if self.rank == 0:
+        #     for i in range(1, self.world_size):
+        #         x = torch.ones(2)
+        #         result_on_device_0 = rpc.rpc_sync(worker_name(i), torch.add, args=(x.to(0), 1))
+        #         result_on_device_1 = rpc.rpc_sync(worker_name(i), torch.add, args=(x.to(1), 1))
+        #         self.assertEqual(torch.add(torch.ones(2), 1), result_on_device_0)
+        #         self.assertEqual(torch.device('cuda:0'), result_on_device_0.device)
+        #         self.assertEqual(torch.add(torch.ones(2), 1), result_on_device_1)
+        #         self.assertEqual(torch.device('cuda:1'), result_on_device_1.device)
 
         # Barrier to ensure that all rpc_sync calls are finished
         dist.barrier()

From a40a689d5438765760cf76ab44d44989eb86af00 Mon Sep 17 00:00:00 2001
From: Soof Golan <83900570+soof-golan@users.noreply.github.com>
Date: Tue, 25 Oct 2022 04:43:07 +0000
Subject: [PATCH 0111/1922] Fix `tensor.stride()` type hint (#84177)

`tensor.stride()` now hints at tuple of variable length instead of tuple with constant length of 1

Fixes #84176

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84177
Approved by: https://github.com/Chillee
---
 tools/pyi/gen_pyi.py          | 2 +-
 torch/_prims/__init__.py      | 2 +-
 torch/fx/passes/shape_prop.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py
index 79f97c4e9f30c..417d73f829a6e 100644
--- a/tools/pyi/gen_pyi.py
+++ b/tools/pyi/gen_pyi.py
@@ -597,7 +597,7 @@ def gen_pyi(
                 "def size(self, dim: _int) -> _int: ...",
             ],
             "stride": [
-                "def stride(self) -> Tuple[_int]: ...",
+                "def stride(self) -> Tuple[_int, ...]: ...",
                 "def stride(self, _int) -> _int: ...",
             ],
             "new_ones": [
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index d724ac50e2839..eae38612a2237 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1273,7 +1273,7 @@ def _collapse_view_helper(
         strides = (1,)
     else:
         shape = a.shape  # type: ignore[assignment]
-        strides = a.stride()
+        strides = a.stride()  # type: ignore[assignment]
 
     utils.validate_idx(len(shape), start)
     utils.validate_exclusive_idx(len(shape), end)
diff --git a/torch/fx/passes/shape_prop.py b/torch/fx/passes/shape_prop.py
index 9c3a036e90bf4..2be996f714ce8 100644
--- a/torch/fx/passes/shape_prop.py
+++ b/torch/fx/passes/shape_prop.py
@@ -17,7 +17,7 @@ class TensorMetadata(NamedTuple):
     shape : torch.Size
     dtype : torch.dtype
     requires_grad : bool
-    stride : Tuple[int]
+    stride : Tuple[int, ...]
     memory_format : Optional[torch.memory_format]
 
     # Quantization metadata

From faf7f3d455536b403f77c6894d932a0365368384 Mon Sep 17 00:00:00 2001
From: shynehr <scut_sub@outlook.com>
Date: Tue, 25 Oct 2022 04:45:52 +0000
Subject: [PATCH 0112/1922] remove unnecessary __syncthreads() in
 conv_depthwise2d_grad_weight_kernel (#84854)

Threads within a thread block would be synchronize inside the function BlockReduceSum when intra-warp reduce finishes.  It's unnessary to synchronize threads before invoking function BlockReduceSum.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84854
Approved by: https://github.com/ngimel
---
 aten/src/ATen/native/cuda/DepthwiseConv2d.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu
index 8f0f9b99903a7..20748837bbaf7 100644
--- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu
+++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu
@@ -236,7 +236,6 @@ __global__ void conv_depthwise2d_grad_weight_kernel(
       }
     }
   }
-  __syncthreads();
 
   // At this point each thread in the block has a local gradient, which we need to
   // accumulate prior to writing the global value

From d2b1929f9ccf7ce39527e7ecbffed856bc49f475 Mon Sep 17 00:00:00 2001
From: Bill Schnurr <bschnurr@microsoft.com>
Date: Tue, 25 Oct 2022 04:47:10 +0000
Subject: [PATCH 0113/1922] Fix torch.testing.assert_close not exported from
 module (#87619)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For pylance/pyright static typechecking
"Imported symbols are considered private by default. If they use the “import A as A” (a redundant module alias), “from X import A as A” (a redundant symbol alias)" https://github.com/microsoft/pyright/blob/main/docs/typed-libraries.md#library-interface

torch.testing.assert_close not exported from module https://github.com/microsoft/pylance-release/issues/3526

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87619
Approved by: https://github.com/kit1980
---
 torch/testing/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py
index 130eaf672983c..ad69ef1d24901 100644
--- a/torch/testing/__init__.py
+++ b/torch/testing/__init__.py
@@ -1,4 +1,4 @@
-from ._comparison import assert_close
-from torch._C import FileCheck
-from ._creation import make_tensor
+from ._comparison import assert_close as assert_close
+from torch._C import FileCheck as FileCheck
+from ._creation import make_tensor as make_tensor
 from ._deprecated import *  # noqa: F403

From 661fe5d8251cd108dcd25d2b6f1a23004a84ebc1 Mon Sep 17 00:00:00 2001
From: Takeshi Watanabe <take-cheeze@users.noreply.github.com>
Date: Tue, 25 Oct 2022 05:49:52 +0000
Subject: [PATCH 0114/1922] [JIT] Fix return types of inputs/outputs method in
 Graph (#86349)

The C++ definition return `ArrayRef<Value*>` but in python binding it returns iterator instead: https://github.com/pytorch/pytorch/blob/d04889323e2bc0b7321b76e564292565c88b9a5e/torch/csrc/jit/python/python_ir.cpp#L631

I've had hard time with mypy and there is also fixed version of stubs in pytorch-pfn-extras for my project: https://github.com/pfnet/pytorch-pfn-extras/blob/beeab3f30381fd1ed313bc09d561c567482784a1/stubs/torch/_C/__init__.pyi#L458

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86349
Approved by: https://github.com/kit1980
---
 torch/_C/__init__.pyi.in | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 5b9049e4bdc7d..792e231999163 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -527,8 +527,8 @@ class Value:
 
 # Defined in torch/csrc/jit/ir/ir.h
 class Block:
-    def inputs(self) -> List[Value]: ...
-    def outputs(self) -> List[Value]: ...
+    def inputs(self) -> Iterator[Value]: ...
+    def outputs(self) -> Iterator[Value]: ...
     def nodes(self) -> Iterator[Node]: ...
     def paramNode(self) -> Node: ...
     def returnNode(self) -> Node: ...
@@ -542,11 +542,11 @@ class Node:
     def __getitem__(self, key: str) -> Any: ...
     def schema(self) -> str: ...
     def input(self) -> Value: ...
-    def inputs(self) -> List[Value]: ...
+    def inputs(self) -> Iterator[Value]: ...
     def inputsAt(self, idx: _int) -> Value: ...
     def inputsSize(self) -> _int: ...
     def output(self) -> Value: ...
-    def outputs(self) -> List[Value]: ...
+    def outputs(self) -> Iterator[Value]: ...
     def outputsAt(self, idx: _int) -> Value: ...
     def outputsSize(self) -> _int: ...
     def hasMultipleOutputs(self) -> _bool: ...
@@ -622,8 +622,8 @@ class Node:
 
 # Defined in torch/torch/csrc/jit/ir/ir.h
 class Graph:
-    def inputs(self) -> List[Value]: ...
-    def outputs(self) -> List[Value]: ...
+    def inputs(self) -> Iterator[Value]: ...
+    def outputs(self) -> Iterator[Value]: ...
     def nodes(self) -> Iterator[Node]: ...
     def param_node(self) -> Node: ...
     def return_node(self) -> Node: ...

From c4c0491a4c962f76550bd164fd8d19c8a330af36 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 25 Oct 2022 06:14:54 +0000
Subject: [PATCH 0115/1922] [vision hash update] update the pinned vision hash
 (#87639)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87639
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 02a12c728a3a5..88e283fa46ec9 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-9c112935abe400222cca8f9fbc2d8386e0f25e80
+0d7807d59520289b2065b4db4a138b7fba2f61fd

From 4f2e3c4e69de54419e07317c8f6cf5813465adcb Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Tue, 25 Oct 2022 06:55:59 +0000
Subject: [PATCH 0116/1922] Intercept aten._reshape_alias for nvFuser (#87072)

This would help forming larger fusion groups. If this won't end up executed by nvFuser then eager mode implementation would call into `.reshape`: https://github.com/pytorch/pytorch/blob/37e9e89afbc3554258545a026fab4cd9e1a4b85d/torch/_prims/nvfuser_prims.py#L552-L553

cc @kevinstephano @jjsjann123
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87072
Approved by: https://github.com/ngimel
---
 torch/_prims/context.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/torch/_prims/context.py b/torch/_prims/context.py
index fea3f17a5009b..2bcee069d146c 100644
--- a/torch/_prims/context.py
+++ b/torch/_prims/context.py
@@ -405,6 +405,12 @@ def __torch_function__(
                 warn("view has ignored kwargs!")
             return torch.ops.nvprims.view(a, shape)
 
+        if orig_func == torch.ops.aten._reshape_alias.default:
+            a, shape, stride = args
+            if len(kwargs) > 0:
+                warn("view has ignored kwargs!")
+            return torch.ops.nvprims.view(a, shape)
+
         if self._is_native_batch_norm(orig_func):
             return torch.ops.nvprims.native_batch_norm(*args, **kwargs)
 

From 6ebb69c37f57fde73385c5fbcf0b787d4f13be84 Mon Sep 17 00:00:00 2001
From: Daniel Falbel <dfalbel@gmail.com>
Date: Tue, 25 Oct 2022 07:12:28 +0000
Subject: [PATCH 0117/1922] Support `signbit` in MPS. (#87214)

Implements the signbit operator for MPS. Links to #77764

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87214
Approved by: https://github.com/kulinseth, https://github.com/kit1980
---
 .../src/ATen/native/mps/operations/UnaryOps.mm | 18 ++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml     |  1 +
 test/test_mps.py                               | 14 ++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index dd9c8176d0b7c..2763eff39f6a6 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -93,6 +93,24 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
                   { return mps::trunc_tensor(mpsGraph, inputTensor); });
 }
 
+TORCH_IMPL_FUNC(signbit_out_mps) (const Tensor& self, const Tensor& output) {
+  mps::unary_op(self, output, "signbit_out_mps",
+                ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
+                    MPSGraphTensor* output;
+                    // signbit is not implemented for int64 type.
+                    // workaround for `Function signbitOp_i64 was not found in the library`
+                    if ([inputTensor dataType] == MPSDataTypeInt64) {
+                      MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0 dataType:inputTensor.dataType];
+                      output = [mpsGraph lessThanWithPrimaryTensor:inputTensor
+                                                   secondaryTensor:zeroTensor
+                                                              name:nil];
+                    } else {
+                      output = [mpsGraph signbitWithTensor: inputTensor name: nil];
+                    }
+                    return mps::castMPSTensor(mpsGraph, output, ScalarType::Bool);
+                 });
+}
+
 TORCH_IMPL_FUNC(sign_out_mps) (const Tensor& self, const Tensor& output) {
   mps::unary_op(self, output, "sign_out_mps",
                 ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index d514cae670855..c1c2b363cb99b 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -8533,6 +8533,7 @@
   dispatch:
     CPU: signbit_out
     CUDA: signbit_out
+    MPS: signbit_out_mps
     SparseCPU, SparseCUDA: signbit_sparse_out
     SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr_out
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 8eeae7dbcaf7b..98df393c3e955 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4154,6 +4154,20 @@ def helper(shape):
 
         helper((2, 8, 4, 5))
 
+    def test_signbit(self):
+        def helper(shape, dtype):
+            cpu_x = torch.randn(shape, device='cpu').to(dtype)
+            x = cpu_x.clone().to('mps')
+
+            signbit_result = torch.signbit(x)
+            signbit_result_cpu = torch.signbit(cpu_x)
+
+            self.assertEqual(signbit_result, signbit_result_cpu)
+
+        helper((2, 8, 4, 5), torch.int)
+        helper((2, 8, 4, 5), torch.float)
+        helper((2, 8, 4, 5), torch.int64)
+
     # Test neg
     def test_neg(self):
         def helper(shape):

From 3eae04586282d1452eaf090992742fbc4f4004c6 Mon Sep 17 00:00:00 2001
From: Jagadish Krishnamoorthy <jagdish.krishna@gmail.com>
Date: Tue, 25 Oct 2022 07:17:44 +0000
Subject: [PATCH 0118/1922] [ROCm] [FakeTensorTest] Enable
 test_fallback_memory_prop (#85760)

Signed-off-by: Jagadish Krishnamoorthy <jagdish.krishna@gmail.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85760
Approved by: https://github.com/kit1980
---
 test/test_fake_tensor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index 0d81cdf10f82f..50a92436f406b 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -329,7 +329,6 @@ def fn(
                         self.assertTrue(isinstance(ten, FakeTensor))
                     self.assertEqual(ten.device.type, 'cuda')
 
-    @skipIfRocm
     @unittest.skipIf(not RUN_CUDA, "requires cuda")
     def test_fallback_memory_prop(self):
         m = nn.Conv2d(16, 33, 3, stride=2, device="cuda", dtype=torch.half)

From 2e786e59ee1d29930535b14471d05cdc3a138eca Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Mon, 24 Oct 2022 12:57:57 -0700
Subject: [PATCH 0119/1922] [inductor] Trivial smoke-test (#87598)

As we're bringing up dynamo+inductor on Meta-internal infra, I keep
wanting a stupidly simple program to run to see if anything at all is working.
This test is that program :-p.

Obviously test_torchinductor.py is more comprehensive but it's also harder to
tell exactly what's going on, whereas this test fits on one screen.

Differential Revision: [D40595798](https://our.internmc.facebook.com/intern/diff/D40595798/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D40595798/)!

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87598
Approved by: https://github.com/anijain2305, https://github.com/brad-mengchi
---
 test/inductor/test_smoke.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 test/inductor/test_smoke.py

diff --git a/test/inductor/test_smoke.py b/test/inductor/test_smoke.py
new file mode 100644
index 0000000000000..64afbcf0254e3
--- /dev/null
+++ b/test/inductor/test_smoke.py
@@ -0,0 +1,30 @@
+# Owner(s): ["module: inductor"]
+import logging
+import unittest
+
+import torch
+import torch._dynamo as torchdynamo
+import torch._inductor.config as torchinductor_config
+
+torchdynamo.config.log_level = logging.INFO
+torchdynamo.config.verbose = True
+torchinductor_config.debug = True
+
+
+class MLP(torch.nn.Module):
+    def __init__(self):
+        super(MLP, self).__init__()
+        self.l1 = torch.nn.Linear(1, 6)
+        self.l2 = torch.nn.Linear(6, 1)
+
+    def forward(self, x=None):
+        x = torch.relu(self.l1(x))
+        x = torch.relu(self.l2(x))
+        return x
+
+
+class SmokeTest(unittest.TestCase):
+    def test_mlp(self):
+        mlp = torchdynamo.optimize("inductor")(MLP().cuda())
+        for _ in range(3):
+            mlp(torch.randn(1, device="cuda"))

From e34086840fae2787d36768d6c9f9536f2007dfcb Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Tue, 25 Oct 2022 14:44:05 +0000
Subject: [PATCH 0120/1922] Performance improvment to cumulative seq len
 (#87530)

# Summary
Performance improvement to calculating metadata needed for gluing in nested tensors to fused kernels.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87530
Approved by: https://github.com/cpuhrsch
---
 .../cuda/NestedTensorTransformerFunctions.cpp | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index a90af2fe0af32..4028c8d5c3e4b 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -234,7 +234,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_nested(
         return std::make_tuple(Tensor(), Tensor());
     }
 }
-
+namespace{
 
 /**
  * This function is used to calculate two pieces of metadata that are needed
@@ -242,9 +242,10 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_nested(
  * cumulative sequence_length over a batch of sequences and the maximum sequence
  * length.
  *
- * @return A tuple of cumulative sequence lengths and the maximum sequence length
+ * @return A tuple of cumulative sequence lengths and the maximum sequence length,
+ * and the last element in the cumulative_sequence_lengths
  */
-std::tuple<Tensor, int64_t> cumulative_and_max_seq_len(Tensor qkv) {
+std::tuple<Tensor, int64_t, int64_t> cumulative_and_max_seq_len(Tensor qkv) {
   TORCH_CHECK(
       qkv.is_nested(),
       "QKV must be nested for flash cumulative_seq_len calculation.")
@@ -274,7 +275,7 @@ std::tuple<Tensor, int64_t> cumulative_and_max_seq_len(Tensor qkv) {
   // Send to GPU, this is pretty light weight calc for normal batch size
   // but maybe this needs to be on gpu
   cumulative_seqlen = cumulative_seqlen.to(TensorOptions().device(at::kCUDA));
-  return std::tuple<Tensor, int64_t>{cumulative_seqlen, max_seqlen};
+  return std::tuple<Tensor, int64_t, int64_t>{cumulative_seqlen, max_seqlen, sum};
 }
 
 /**
@@ -337,6 +338,7 @@ bool is_safe_to_get_storage_as_tensor(const NestedTensorImpl* tensor) {
   return true;
 }
 
+} // namespace
 std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
     const Tensor& query,
     const Tensor& key,
@@ -354,19 +356,19 @@ std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
   Tensor k_t = key.transpose(1, 2);
   Tensor v_t = value.transpose(1, 2);
 
-  auto cumulative_and_max_q = cumulative_and_max_seq_len(q_t);
-  auto cumulative_and_max_k = cumulative_and_max_seq_len(k_t);
+  auto cumulative_and_max_q_and_nnz_q = cumulative_and_max_seq_len(q_t);
+  auto cumulative_and_max_k_and_nnz_k = cumulative_and_max_seq_len(k_t);
 
   // K and V have to have the same Nnz, should probably torch_check
   // assume in order to not iterate over v
 
-  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q);
-  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k);
+  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q_and_nnz_q);
+  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k_and_nnz_k);
 
-  const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q);
+  const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q_and_nnz_q);
 
-  const int64_t Nnz_q = cumulative_sequence_length_q[-1].item<int64_t>();
-  const int64_t Nnz_kv = cumulative_sequence_length_k[-1].item<int64_t>();
+  const int64_t Nnz_q = std::get<2>(cumulative_and_max_q_and_nnz_q);
+  const int64_t Nnz_kv = std::get<2>(cumulative_and_max_k_and_nnz_k);
 
   Tensor query_buffer_reshaped;
   Tensor key_buffer_reshaped;
@@ -460,15 +462,15 @@ Tensor flash_attention_helper(
   int64_t head_dim{query.size(-1)};
   int64_t num_heads{query.size(-2)};
 
-  auto cumulative_and_max_q = cumulative_and_max_seq_len(query);
-  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q);
-  int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q);
+  auto cumulative_and_max_q_and_nnz_q = cumulative_and_max_seq_len(query);
+  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q_and_nnz_q);
+  int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q_and_nnz_q);
 
   TORCH_CHECK(
       key.is_same(key) && query.is_same(value),
       "Key and Value must be the same tensor");
 
-  int64_t Nnz_q{cumulative_sequence_length_q[-1].item<int64_t>()};
+  int64_t Nnz_q = std::get<2>(cumulative_and_max_q_and_nnz_q);
 
   // For the packed case we need to set the output size for dim 2 to 1
   auto atten_size = get_nested_size_tensor(query).clone();

From 53292a7d9760f47c0c2353321ff973c646db26dd Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 25 Oct 2022 14:45:12 +0000
Subject: [PATCH 0121/1922] Revert "Intercept aten._reshape_alias for nvFuser
 (#87072)"

This reverts commit 163a829caa82559e7f938f65c1b647a5d50663c3.

Reverted https://github.com/pytorch/pytorch/pull/87072 on behalf of https://github.com/malfet due to Looks like it broke test_indexing in dynamo shard, see https://github.com/pytorch/pytorch/actions/runs/3318778609/jobs/5483248042
---
 torch/_prims/context.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/torch/_prims/context.py b/torch/_prims/context.py
index 2bcee069d146c..fea3f17a5009b 100644
--- a/torch/_prims/context.py
+++ b/torch/_prims/context.py
@@ -405,12 +405,6 @@ def __torch_function__(
                 warn("view has ignored kwargs!")
             return torch.ops.nvprims.view(a, shape)
 
-        if orig_func == torch.ops.aten._reshape_alias.default:
-            a, shape, stride = args
-            if len(kwargs) > 0:
-                warn("view has ignored kwargs!")
-            return torch.ops.nvprims.view(a, shape)
-
         if self._is_native_batch_norm(orig_func):
             return torch.ops.nvprims.native_batch_norm(*args, **kwargs)
 

From c82324336d214f3073379b455e4286941ead46d4 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Mon, 24 Oct 2022 21:14:18 +0000
Subject: [PATCH 0122/1922] [ONNX] Add Support on 0d tensor Broadcast (#87211)

I am not sure if this will break things ...

Although 0d tensor is an undefined behavior in ONNX spec, I did some experiments and found that ONNX shape inference actually provides 0d as inference from 0d and 1d Op calculations, and the bug happened in Broadcast function. But still, if this breaks things really bad, I think we can put 0d tensor handling on hold, as it's not very common usage on models?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87211
Approved by: https://github.com/jcwchen, https://github.com/BowenBao
---
 .../csrc/jit/passes/onnx/shape_type_inference.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
index 248733f746a63..d2873ddf464cb 100644
--- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
+++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
@@ -700,18 +700,25 @@ std::vector<::c10::ShapeSymbol> Broadcast(
     const c10::ShapeSymbol& ss_shape_1 = input_shape_value_1[rank_1 - 1 - idx];
     bool is_static_0 = ss_shape_0.is_static();
     bool is_static_1 = ss_shape_1.is_static();
+    size_t shape_idx = rank_max - 1 - idx;
     if (is_static_0 && is_static_1) {
       int64_t static_0_sz = ss_shape_0.static_size();
       int64_t static_1_sz = ss_shape_1.static_size();
-      final_shape[rank_max - 1 - idx] = ::c10::ShapeSymbol::fromStaticSize(
-          std::max(static_0_sz, static_1_sz));
+      // condition for corner case of 0d tensor
+      // 0d tensor with 1d tensor would give us 0d tensor
+      if (std::min(static_0_sz, static_1_sz) == 0) {
+        final_shape[shape_idx] = ::c10::ShapeSymbol::fromStaticSize(
+            std::min(static_0_sz, static_1_sz));
+      } else {
+        final_shape[shape_idx] = ::c10::ShapeSymbol::fromStaticSize(
+            std::max(static_0_sz, static_1_sz));
+      }
     } else if (!is_static_0 && !is_static_1) {
       if (ss_shape_0.value() == ss_shape_1.value()) {
-        final_shape[rank_max - 1 - idx] = ss_shape_0;
+        final_shape[shape_idx] = ss_shape_0;
       }
     }
   }
-
   if (rank_0 < rank_1) {
     for (size_t idx = rank_min; idx < rank_max; idx++) {
       size_t shape_idx = rank_max - 1 - idx;

From b4e180d93f71c5278b080abd472df0f7a9fae9d9 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Tue, 25 Oct 2022 04:46:42 +0000
Subject: [PATCH 0123/1922] Prefer python meta function over c++ meta function
 (#87426)

This is a policy update for meta registration. **We now prefer python meta implementation over C++ meta function.**  This is a flip of the previous policy, where we prefer C++ meta function over python meta function if they both exist.

Here's the meta registration process:
1. register_meta and register_decomposition will place the python meta/decomp functions into the `global_decomp_table`.  However, they will NOT register them into dispatcher.
2. After global_decomp_table is populated, we will compile an `active_meta_table`. For a given op, we pick the most specific decomp function from `global_decomp_table` in the preference order of Meta > PostAutograd > PreAutograd.
3. We will unconditionally register all of them into python dispatcher. And register them into C++ dispatcher, unless it one of the following 3 cases
- 1. the op is a CompositeImplicitAutograd, and should rely on decomposed op's meta
- 2. the op is a view op, as the MetaTensor doesn't support aliased storage
- 3. the op is in the blocklist (due to UT failures, and we will burn down this list op by op)

Over the long run, we wish to implement all meta functions in python. With this PR, 321 op_overloads will have cpp meta overridden by python meta. There are still 400 op_overloads is using cpp meta. The exact list can be found here https://gist.github.com/SherlockNoMad/d20bb736178df8eebd3b054c8bb7cdc5

cc @ngimel @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87426
Approved by: https://github.com/ezyang, https://github.com/jansel
---
 aten/src/ATen/core/dispatch/OperatorEntry.cpp |  18 ++-
 test/test_ops.py                              |   2 -
 torch/_decomp/__init__.py                     |  96 +++++---------
 torch/_decomp/decompositions.py               |  14 +-
 torch/_inductor/decomposition.py              |   4 +-
 torch/_meta_registrations.py                  | 123 +++++++++++++-----
 torch/_ops.py                                 |   4 +
 torch/_refs/__init__.py                       |  30 ++---
 torch/_subclasses/fake_tensor.py              |   3 +-
 torch/library.py                              |   5 +-
 10 files changed, 162 insertions(+), 137 deletions(-)

diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
index 822924a602533..5bd5d8abf54dc 100644
--- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
@@ -147,13 +147,17 @@ OperatorEntry::AnnotatedKernelContainerIterator OperatorEntry::registerKernel(
 #else
   if (k.size() > 0) {
 #endif
-    TORCH_WARN("Overriding a previously registered kernel for the same operator and the same dispatch key\n",
-               "  operator: ", (schema_.has_value() ? toString(schema_->schema) : toString(name_)), "\n",
-               "    ", (this->schema_.has_value() ? this->schema_->debug : "no debug info"), "\n",
-               "  dispatch key: ", toString(dispatch_key), "\n",
-               "  previous kernel: ", (cpp_signature_.has_value() ? cpp_signature_->debug : (sym_cpp_signature_.has_value() ? sym_cpp_signature_->debug : "no debug info")), "\n",
-               "       new kernel: ", debug
-    );
+    // Suppress the warning for Meta key as we are overriding C++ meta functions with python meta functions
+    // for some ops
+    if (dispatch_key != DispatchKey::Meta) {
+      TORCH_WARN("Overriding a previously registered kernel for the same operator and the same dispatch key\n",
+            "  operator: ", (schema_.has_value() ? toString(schema_->schema) : toString(name_)), "\n",
+            "    ", (this->schema_.has_value() ? this->schema_->debug : "no debug info"), "\n",
+            "  dispatch key: ", toString(dispatch_key), "\n",
+            "  previous kernel: ", (cpp_signature_.has_value() ? cpp_signature_->debug : (sym_cpp_signature_.has_value() ? sym_cpp_signature_->debug : "no debug info")), "\n",
+            "       new kernel: ", debug
+      );
+    }
   }
 
 #ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
diff --git a/test/test_ops.py b/test/test_ops.py
index c63de0a4778d3..5e9371e982341 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1891,8 +1891,6 @@ def test_refs_are_in_decomp_table(self, op):
     "svd_lowrank",
     "sgn",
     "cholesky",
-    "linalg.eigh",
-    "symeig",
 }
 
 fake_backward_xfails = {xfail(stride_skip) for stride_skip in fake_backward_xfails} | {
diff --git a/torch/_decomp/__init__.py b/torch/_decomp/__init__.py
index 2dcda014cea30..d50f33933da49 100644
--- a/torch/_decomp/__init__.py
+++ b/torch/_decomp/__init__.py
@@ -26,16 +26,34 @@
 pre_autograd_decomposition_table = global_decomposition_table["pre_autograd"]
 meta_table = global_decomposition_table["meta"]
 
-meta_lib = torch.library.Library("aten", "IMPL", "Meta")
-
-# decompositions which have been disabled as meta kernel implementations,
-# usually due to mismatching strides, aliasing, or other inconsistent property
-_disabled_meta_decomps = set()
 
+def _add_op_to_registry(registry, op, fn):
+    """
+    This is an internal API for adding an op to the decomposition table.
 
-def register_decomposition(
-    aten_op, registry=None, *, type="post_autograd", disable_meta: bool = False
-):
+    If op is OpOverload, it will be added to the registry directly.
+    If op is OpOverloadPacket, all the valid op_overloads in the packet will be added to the registry.
+    """
+    overloads = []
+    if isinstance(op, OpOverload):
+        overloads.append(op)
+    else:
+        assert isinstance(op, OpOverloadPacket)
+        for ol in op.overloads():
+            overloads.append(getattr(op, ol))
+
+    for op_overload in overloads:
+        if op_overload in registry:
+            raise RuntimeError(f"duplicate registrations for {op_overload}")
+
+        # TorchScript dumps a bunch of extra nonsense overloads
+        # which don't have corresponding dispatcher entries, we need
+        # to filter those out, e.g aten.add.float_int
+        if torch._C._dispatch_has_kernel(op_overload.name()):
+            registry[op_overload] = fn
+
+
+def register_decomposition(aten_op, registry=None, *, type="post_autograd"):
     """
     A decorator to register a function as a decomposition to the Python
     decomposition table.  Use it like this::
@@ -52,9 +70,8 @@ def clamp_min(x):
     autograd) and not just backend tracing, where we then need to know if a
     decomposition can be used to simulate a transform.
 
-    By default, if the decomposition is for an operator that doesn't have
-    a Meta implementation, we will register it to the dispatcher.  Use
-    `disable_meta` to disable this behavior.
+    By default, we also will register it to the Meta key of dispatcher,
+    and replace the c++ Meta implementation if there is already one.
     """
 
     assert type in {"post_autograd", "pre_autograd", "meta"}
@@ -106,62 +123,11 @@ def _fn(*args, **kwargs):
         if registry is None:
             registry = global_decomposition_table[type]
 
-        def add_op_to_table(aten_op):
-            overloads = []
-            if isinstance(aten_op, OpOverload):
-                overloads.append(aten_op)
-            else:
-                assert isinstance(aten_op, OpOverloadPacket)
-                for ol in aten_op.overloads():
-                    overloads.append(getattr(aten_op, ol))
-            for op_overload in overloads:
-                if op_overload in registry:
-                    raise RuntimeError(f"duplicate registrations for {op_overload}")
-                registry[op_overload] = fn
-                op_overload.py_impl(torch._C.DispatchKey.Meta)(fn)
-                # TODO: factor this logic into OpOverload or Library API
-                name = op_overload._schema.name
-                if op_overload._schema.overload_name:
-                    name += "." + op_overload._schema.overload_name
-
-                if disable_meta:
-                    global _disabled_meta_decomps
-                    _disabled_meta_decomps.add(op_overload)
-
-                if (
-                    not disable_meta
-                    # TorchScript dumps a bunch of extra nonsense overloads
-                    # which don't have corresponding dispatcher entries, we need
-                    # to filter those out
-                    and torch._C._dispatch_has_kernel(name)
-                    # Don't register a python meta kernel to any operator that has
-                    # should already work with meta tensors today.
-                    # We can check that by seeing if the "computed table" for the operator
-                    # has a registration to Meta;
-                    # either through a direct registration, or an indirect one through
-                    # an alias dispatch key (e.g. CompositeImplicitAutograd)
-                    and not torch._C._dispatch_has_computed_kernel_for_dispatch_key(
-                        name, "Meta"
-                    )
-                ):
-                    if any(
-                        a.alias_info is not None and not a.alias_info.is_write
-                        for a in op_overload._schema.arguments
-                    ):
-                        raise RuntimeError(
-                            f"""
-Attempting to register a python meta kernel for a view operator: {str(op_overload)}.
-We shouldn't do this, because the output will report as not having aliased storages.
-All view ops have meta kernels in C++ today, so we should use those instead.
-
-If you're registering an operator through the `@register_decomposition` decorator,
-Please set `disable_meta=True`.
-                        """
-                        )
-                    meta_lib.impl(op_overload, fn)
+        def register(op):
+            _add_op_to_registry(registry, op, fn)
 
         # To handle allowing multiple aten_ops at once
-        tree_map(add_op_to_table, aten_op)
+        tree_map(register, aten_op)
         return fn
 
     return decomposition_decorator
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 2b4d2914fe858..234e43d12bf81 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1073,7 +1073,7 @@ def prod(x: List[int]):
     return r
 
 
-@register_decomposition(aten.split_with_sizes, disable_meta=True)
+@register_decomposition(aten.split_with_sizes)
 def split_with_sizes(
     self: Tensor, split_sizes: List[int], dim: int = 0
 ) -> List[Tensor]:
@@ -1087,7 +1087,7 @@ def split_with_sizes(
     return splits
 
 
-@register_decomposition(aten.split.Tensor, disable_meta=True)
+@register_decomposition(aten.split.Tensor)
 def split(self: Tensor, split_size: int, dim: int = 0) -> List[Tensor]:
     input_sizes = self.shape
     dim_size = input_sizes[dim]
@@ -1131,7 +1131,7 @@ def normalize(input, norm_dims, eps):
     return out, mean, rstd
 
 
-@register_decomposition(aten.native_group_norm.default, disable_meta=True)
+@register_decomposition(aten.native_group_norm.default)
 def native_group_norm(
     input: Tensor,
     weight: Optional[Tensor],
@@ -1500,7 +1500,7 @@ def std_decomposition(
 # Questionable decompositions
 # This is only valid if we're running the graph without autograd, such as if the backward pass has been traced.
 # Note that this decomposition causes issues with in-place ops
-@register_decomposition([aten.detach, aten.lift, aten.lift_fresh], disable_meta=True)
+@register_decomposition([aten.detach, aten.lift, aten.lift_fresh])
 def nop_decomposition(x):
     return aten.alias(x)
 
@@ -1666,7 +1666,7 @@ def cudnn_batch_norm_backward(
     )
 
 
-@register_decomposition(aten._adaptive_avg_pool2d, disable_meta=True)
+@register_decomposition(aten._adaptive_avg_pool2d)
 @pw_cast_for_opmath
 def adaptive_avg_pool2d(input: Tensor, output_size: Tuple[int, int]):
     # Preconditions
@@ -1928,7 +1928,7 @@ def is_same_size(a: Tensor, b: Tensor) -> bool:
     return a.shape == b.shape
 
 
-@register_decomposition([aten._reshape_alias, aten._unsafe_view], disable_meta=True)
+@register_decomposition([aten._reshape_alias, aten._unsafe_view])
 def _reshape_alias(x, shape, *args):
     return aten.view(x, shape)
 
@@ -2194,7 +2194,7 @@ def mv(self, vec):
     return (self * vec).sum(dim=1)
 
 
-@register_decomposition(aten.dot, disable_meta=True)
+@register_decomposition(aten.dot)
 @out_wrapper()
 @pw_cast_for_opmath
 def dot(self, other):
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index c22a8406b9b61..b4c8087537c6f 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -109,7 +109,7 @@ def register_decomposition(ops):
     for op in [ops] if callable(ops) else ops:
         if op in decompositions:
             log.warning(f"duplicate decomp: {ops}")
-    return decomp.register_decomposition(ops, decompositions, disable_meta=True)
+    return decomp.register_decomposition(ops, decompositions)
 
 
 @register_decomposition([aten.clamp])
@@ -317,7 +317,7 @@ def bernoulli_p(self, p=0.5, *, generator=None):
 """
 extra_random_decomps = get_decompositions([aten.native_dropout])
 register_extra_random_decomp = functools.partial(
-    decomp.register_decomposition, registry=extra_random_decomps, disable_meta=True
+    decomp.register_decomposition, registry=extra_random_decomps
 )
 
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 22ceaaf0a18b0..cb961ff898790 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -4,7 +4,8 @@
 import torch
 import torch._prims_common as utils
 from torch import Tensor
-from torch._decomp import meta_table as meta_table
+from torch._decomp import _add_op_to_registry, global_decomposition_table, meta_table
+from torch._ops import OpOverload
 from torch._prims_common import (
     check,
     corresponding_complex_dtype,
@@ -21,27 +22,19 @@
 from torch._subclasses.fake_tensor import check_no_bool_index_tensors
 from torch.utils._pytree import tree_map
 
+
 aten = torch.ops.aten
 
 _meta_lib_dont_use_me_use_register_meta = torch.library.Library("aten", "IMPL", "Meta")
 
 
-def register_meta(op, register_dispatcher=True):
-    def wrapper(f):
-        def add_func(op):
-            meta_table[op] = f
-            if register_dispatcher:
-                name = (
-                    op.__name__
-                    if op._overloadname != "default"
-                    else op.overloadpacket.__name__
-                )
-                _meta_lib_dont_use_me_use_register_meta.impl(name, f)
-
-            op.py_impl(torch._C.DispatchKey.Meta)(f)
+def register_meta(op):
+    def wrapper(fn):
+        def register(op):
+            _add_op_to_registry(meta_table, op, fn)
 
-        tree_map(add_func, op)
-        return f
+        tree_map(register, op)
+        return fn
 
     return wrapper
 
@@ -101,7 +94,7 @@ def meta_fft_c2r(self, dim, normalization, lastdim):
     return self.new_empty(output_sizes, dtype=toRealValueType(self.dtype))
 
 
-@register_meta(aten.copy_.default, register_dispatcher=False)
+@register_meta(aten.copy_.default)
 def meta_copy_(self, src, non_blocking=False):
     return self
 
@@ -241,7 +234,7 @@ def meta_pad2d(self, padding):
         return self.new_empty((nbatch, nplane, output_h, output_w))
 
 
-@register_meta(aten.bernoulli_.float, register_dispatcher=False)
+@register_meta(aten.bernoulli_.float)
 def meta_bernoulli_(self, p=0.5, generator=None):
     return self
 
@@ -283,7 +276,7 @@ def meta_dot(self, tensor):
     return self.new_empty(())
 
 
-@register_meta([aten.mm.default], register_dispatcher=False)
+@register_meta([aten.mm.default])
 def meta_mm(a, b):
     check(a.dim() == 2, lambda: "a must be 2D")
     check(b.dim() == 2, lambda: "b must be 2D")
@@ -467,7 +460,7 @@ def check_dim_size(tensor, dim, dim_size, size):
     )
 
 
-@register_meta(aten.avg_pool2d.default, register_dispatcher=False)
+@register_meta(aten.avg_pool2d.default)
 def meta_avg_pool2d(
     input,
     kernel_size,
@@ -586,7 +579,7 @@ def avg_pool2d_backward_shape_check(
 
 
 # Don't override the C++ registration.
-@register_meta(aten.avg_pool2d_backward.default, register_dispatcher=False)
+@register_meta(aten.avg_pool2d_backward.default)
 def meta_avg_pool2d_backward(
     gradOutput_,
     input,
@@ -731,7 +724,7 @@ def vdot(self, other):
 # of indexing shape inference is useful,
 # but not registering it to the dispatcher because we already
 # get shape inference through structured kernels
-@register_meta(aten.index.Tensor, register_dispatcher=False)
+@register_meta(aten.index.Tensor)
 def meta_index_Tensor(self, indices):
     check_no_bool_index_tensors(aten.index.Tensor, self, indices)
     check(indices, lambda: "at least one index must be provided")
@@ -1090,42 +1083,42 @@ def meta_repeat(self, repeats):
     return self.new_empty(target_size)
 
 
-@register_meta(aten.zero_.default, register_dispatcher=False)
+@register_meta(aten.zero_.default)
 def meta_zero_(self):
     return self
 
 
-@register_meta([aten.fill_.Tensor, aten.fill_.Scalar], register_dispatcher=False)
+@register_meta([aten.fill_.Tensor, aten.fill_.Scalar])
 def meta_fill_(self, val):
     return self
 
 
-@register_meta([aten.fill.Tensor, aten.fill.Scalar], register_dispatcher=False)
+@register_meta([aten.fill.Tensor, aten.fill.Scalar])
 def meta_fill(self, val):
     return self.new_empty(self.shape)
 
 
-@register_meta(aten.relu_.default, register_dispatcher=False)
+@register_meta(aten.relu_.default)
 def meta_relu_(self):
     return self
 
 
-@register_meta(aten.index_put.default, register_dispatcher=False)
+@register_meta(aten.index_put.default)
 def meta_index_put(self, indices, values, accumulate=False):
     return self.new_empty(self.size())
 
 
-@register_meta(aten.masked_fill_.Scalar, register_dispatcher=False)
+@register_meta(aten.masked_fill_.Scalar)
 def meta_masked_fill_(self, mask, value):
     return self
 
 
-@register_meta(aten.index_put_.default, register_dispatcher=False)
+@register_meta(aten.index_put_.default)
 def meta_index_put_(self, indices, values, accumulate=False):
     return self
 
 
-@register_meta(aten.alias.default, register_dispatcher=False)
+@register_meta(aten.alias.default)
 def meta_alias(self):
     return self.view(self.shape)
 
@@ -1163,7 +1156,7 @@ def common_meta_baddbmm_bmm(batch1, batch2, is_bmm, self_baddbmm=None):
     return output
 
 
-@register_meta(aten.bmm.default, register_dispatcher=False)
+@register_meta(aten.bmm.default)
 def meta_bmm(self, mat2):
     return common_meta_baddbmm_bmm(self, mat2, True)
 
@@ -1273,7 +1266,7 @@ def pool2d_shape_check(
     )
 
 
-@register_meta(aten.max_pool2d_with_indices.default, register_dispatcher=False)
+@register_meta(aten.max_pool2d_with_indices.default)
 def meta_max_pool2d_with_indices(
     input, kernel_size, stride=(), padding=(0,), dilation=(1,), ceil_mode=False
 ):
@@ -1471,7 +1464,7 @@ def gather_shape_check(self, dim, index):
             )
 
 
-@register_meta(aten.gather.default, register_dispatcher=False)
+@register_meta(aten.gather.default)
 def meta_gather(self, dim, index, sparse_grad=False):
     wrapped_dim = maybe_wrap_dim(dim, self.dim())
     is_index_empty = index.numel() == 0
@@ -1587,7 +1580,7 @@ def scatter_meta_impl(self, dim, index, src=None, reduce_=None, use_new_options=
         get_operator_enum(reduce_, use_new_options)
 
 
-@register_meta(aten.scatter_add.default, register_dispatcher=False)
+@register_meta(aten.scatter_add.default)
 def meta_scatter_add(self, dim, index, src):
     scatter_meta_impl(self, dim, index, src, "add")
     return self.new_empty(self.shape)
@@ -1624,3 +1617,65 @@ def upsample_nearest2d_vec(input, output_size, scale_factors):
 import torch._refs
 import torch._refs.nn.functional
 import torch._refs.special
+
+
+def activate_meta():
+
+    activate_meta_table = {}
+
+    # For a given op, we pick the most specific decomp function from
+    # global_decomp_table in the precedence order of meta > post_autograd > pre_autograd
+    for type in ["meta", "post_autograd", "pre_autograd"]:
+        registry = global_decomposition_table[type]
+
+        for opo in registry:
+            if opo not in activate_meta_table:
+                activate_meta_table[opo] = registry[opo]
+
+    for op_overload, fn in activate_meta_table.items():
+        assert isinstance(op_overload, OpOverload)
+
+        op_overload.py_impl(torch._C.DispatchKey.Meta)(fn)
+
+        if torch._C._dispatch_has_kernel_for_dispatch_key(
+            op_overload.name(), "CompositeImplicitAutograd"
+        ):
+            # Internally, we shouldn't be registering meta kernels for any operators that
+            # have CompositeImplicitAutograd kernels.
+            # Instead, we should be letting those decompositions run, and writing meta kernels
+            # only for the base operators.
+            pass
+        elif any(
+            a.alias_info is not None and not a.alias_info.is_write
+            for a in op_overload._schema.arguments
+        ):
+            # Attempting to register a python meta kernel for a view operator.
+            # We shouldn't do this, because the output will report as not having aliased storages.
+            # All view ops have meta kernels in C++ today, so we should use those instead.
+            pass
+        elif op_overload.name() in {
+            "aten::empty_strided",  # causing infinite recursion, test_meta.py
+            "aten::clone",  # causing infinite recursion
+            "aten::_to_copy",  # causing infinite recursion, test_serialization.py -k test_tensor_subclass_getstate_overwrite  # noqa: B950
+            "aten::randn",  # pin_memory parameter is not supported!, test_proxy_tensor.py -k test_make_fx_symbolic_exhaustive_randn_cpu_float32  # noqa: B950
+            "aten::zeros.names",  # TypeError: zeros() got an unexpected keyword argument 'names', inductor/test_torchinductor.py -k test_zeros_cpu  # noqa: B950
+            "aten::empty.names",  # TypeError: empty() got an unexpected keyword argument 'names', inductor/test_torchinductor.py -k test_zeros_cpu  # noqa: B950
+            "aten::add.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
+            "aten::sub.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
+            "aten::mul.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
+            "aten::div.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! test_fake_tensor.py -k test_scalar_inputs  # noqa: B950
+            "aten::div.Tensor_mode",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_div8_cpu  # noqa: B950
+            "aten::diag_embed",  # Stride mismatch! test_ops.py -k test_fake_autocast_diag_embed_cuda_float32  # noqa: B950
+            "aten::copy_",  # Exception not raiseed, test_torch.py -k test_storage_meta_errors_cpu_int64  # noqa: B950
+            "aten::constant_pad_nd",  # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32  # noqa: B950
+            "aten::masked_fill.Scalar",  # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_nanquantile_cuda_float32  # noqa: B950
+            "aten::tril",  # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_ormqr_cuda_float32  # noqa: B950
+            "aten::triu",  # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_lu_solve_cuda_float32  # noqa: B950
+            "aten::rot90",  # requires_grad mismatch! test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32  # noqa: B950
+        }:
+            pass
+        else:
+            _meta_lib_dont_use_me_use_register_meta.impl(op_overload, fn)
+
+
+activate_meta()
diff --git a/torch/_ops.py b/torch/_ops.py
index b3ebd401ab8a2..ed0276d0ada2f 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -296,6 +296,10 @@ def inner(fn):
                 dispatch_key_or_mode != torch._C.DispatchKey.Python
             ), "Please register a mode for the torch._C.DispatchKey.Python key instead."
 
+            if dispatch_key_or_mode in self.py_kernels:
+                raise RuntimeError(
+                    f"Trying to override a python impl for {dispatch_key_or_mode} on operator {self._name}"
+                )
             self.py_kernels[dispatch_key_or_mode] = fn
             return fn
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 3e2f6c45768a6..ccb44c6367a50 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -379,7 +379,6 @@ def _make_elementwise_unary_reference(
     type_promotion_kind,
     *,
     aten_op=infer_aten_op,
-    disable_meta=False,
     extra_meta=None,
 ) -> Callable:
     def inner(prim: Callable):
@@ -406,7 +405,7 @@ def _ref(a: TensorLikeType) -> TensorLikeType:
         if aten_op is infer_aten_op:
             aten_op = getattr(torch.ops.aten, prim.__name__)
         if aten_op is not None:
-            register_decomposition(aten_op, disable_meta=disable_meta)(_ref)
+            register_decomposition(aten_op)(_ref)
 
         return _ref
 
@@ -853,7 +852,6 @@ def _make_elementwise_binary_reference(
     has_out=True,
     supports_lhs_python_scalar=True,
     supports_rhs_python_scalar=True,
-    disable_meta=False,
 ) -> Callable:
     @elementwise_type_promotion_wrapper(
         type_promoting_args=("a", "b"),
@@ -876,7 +874,7 @@ def _ref(
         # TODO: enable this for operations that support it, like add
         if isinstance(a, Number) and isinstance(b, Number):
             raise ValueError(
-                "Receive two Number inputs to an elementwise binary operation!"
+                f"Receive two Number inputs to an elementwise binary operation {prim}!"
             )
 
         a, b = _maybe_broadcast(a, b)
@@ -888,7 +886,7 @@ def _ref(
     if aten_op is infer_aten_op:
         aten_op = getattr(torch.ops.aten, prim.__name__.split(".")[0])
     if aten_op is not None:
-        register_decomposition(aten_op, disable_meta=disable_meta)(_ref)
+        register_decomposition(aten_op)(_ref)
 
     return _ref
 
@@ -2628,7 +2626,7 @@ def dstack(tensors: TensorSequenceType) -> TensorLikeType:
     return cat(aligned_tensors, 2)
 
 
-@register_decomposition(torch.ops.aten.expand, disable_meta=True)
+@register_decomposition(torch.ops.aten.expand)
 def expand(a: Tensor, *shape) -> Tensor:
     # NOTE: cannot use utils.extract_shape_from_varargs here
     # because that also validates the shape, but the shape
@@ -2849,7 +2847,7 @@ def native_layer_norm(
 
 # TODO: Adding this as a meta function causes functorch tests to fail when compiled with debug mode.
 # test/test_eager_transforms.py::TestFunctionalizeCPU::test_functionalize_fx_transpose_simple_cpu
-@register_decomposition(torch.ops.aten.permute, disable_meta=True)
+@register_decomposition(torch.ops.aten.permute)
 def permute(a: TensorLikeType, *dims) -> TensorLikeType:
     _permutation = utils.canonicalize_dims(
         a.ndim, utils.extract_dims_from_varargs(dims)
@@ -3285,7 +3283,7 @@ def index_add(
     return x.clone().index_add_(dim, index, tensor, alpha=alpha)  # type: ignore[arg-type]
 
 
-@register_decomposition(torch.ops.aten.index_select, disable_meta=True)
+@register_decomposition(torch.ops.aten.index_select)
 @out_wrapper()
 def index_select(x: TensorLike, dim: int, index: TensorLike):
     dim = utils.canonicalize_dims(x.ndim, dim)
@@ -3304,7 +3302,7 @@ def index_select(x: TensorLike, dim: int, index: TensorLike):
 
 
 # Note: although squeeze is documented as having the out= kwarg it doesn't
-@register_decomposition(torch.ops.aten.squeeze, disable_meta=True)
+@register_decomposition(torch.ops.aten.squeeze)
 def squeeze(a: TensorLikeType, dim: Optional[int] = None) -> TensorLikeType:
     if dim is not None:
         dim = utils.canonicalize_dim(a.ndim, dim)
@@ -3500,7 +3498,7 @@ def diag(
         return torch.diagonal_copy(self, offset)
 
 
-@register_decomposition(torch.ops.aten.diagonal, disable_meta=True)
+@register_decomposition(torch.ops.aten.diagonal)
 def diagonal(
     self: TensorLikeType,
     offset: int = 0,
@@ -3613,7 +3611,7 @@ def dsplit(a: TensorLikeType, sections: DimsType) -> TensorSequenceType:
     return tensor_split(a, sections, 2)
 
 
-@register_decomposition(torch.ops.aten.t.default, disable_meta=True)
+@register_decomposition(torch.ops.aten.t.default)
 def t(a: TensorLikeType):
     # TODO: Add sparse support
     # if a.is_sparse:
@@ -3644,7 +3642,7 @@ def T(a: TensorLikeType) -> TensorLikeType:
     return a.t()
 
 
-@register_decomposition(torch.ops.aten.transpose, disable_meta=True)
+@register_decomposition(torch.ops.aten.transpose)
 def transpose(a: TensorLikeType, dim0: int, dim1: int) -> TensorLikeType:
     _dim0, _dim1 = utils.canonicalize_dims(a.ndim, (dim0, dim1))  # type: ignore[misc]
 
@@ -3674,7 +3672,9 @@ def unfold(
 @register_decomposition(torch.ops.aten.unfold_copy)
 @out_wrapper()
 def unfold_copy(self: TensorLikeType, dimension: int, size: int, step: int):
-    return self.unfold(dimension, size, step).clone()
+    return self.unfold(dimension, size, step).clone(
+        memory_format=torch.contiguous_format
+    )
 
 
 @register_decomposition(torch.ops.aten.cumsum)
@@ -3701,7 +3701,7 @@ def cumsum(
     return sum(masked_a, dim=dim, keepdim=keepdim, dtype=dtype, out=out)
 
 
-@register_decomposition(torch.ops.aten.unsqueeze, disable_meta=True)
+@register_decomposition(torch.ops.aten.unsqueeze)
 def unsqueeze(a: TensorLikeType, dim: int) -> TensorLikeType:
     # Note that unsqueeze canonicalizes with rank + 1 because it allows
     # a new innermost dimension to be specified
@@ -3714,7 +3714,7 @@ def unsqueeze(a: TensorLikeType, dim: int) -> TensorLikeType:
 # Tensor.view(a, b, c) or Tensor.view((a, b, c)) Function call torch.view
 # doesn't support unpacked shapes
 # TODO: Turn this into a decomposition (currently fails on reshape meta tests)
-@register_decomposition(torch.ops.aten.view, disable_meta=True)
+@register_decomposition(torch.ops.aten.view)
 def view(a: TensorLikeType, *shape: ShapeType) -> TensorLikeType:
     return _reshape_view_helper(a, *shape, allow_copy=False)
 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 56b6d4b826af7..2f2f07f3db378 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -747,7 +747,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # is written to must be invalidated
         self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs)
 
-        from torch._decomp import _disabled_meta_decomps, decomposition_table
+        from torch._decomp import decomposition_table
 
         with self:
             # Decomposes CompositeImplicitAutograd ops
@@ -781,7 +781,6 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         if (
             func in decomposition_table
             and torch_decomp_decompositions(func)
-            and func not in _disabled_meta_decomps
             and all(not e.is_sparse for e in flat_arg_fake_tensors)
         ):
             with self:
diff --git a/torch/library.py b/torch/library.py
index e97ae57267812..d75427ea4c703 100644
--- a/torch/library.py
+++ b/torch/library.py
@@ -114,13 +114,12 @@ def impl(self, op_name, fn, dispatch_key=''):
             dispatcher_op_name = name
             if '::' not in dispatcher_op_name:
                 dispatcher_op_name = f'{self.ns}::{dispatcher_op_name}'
-            # get a string containing the names of every dispatch key that the operator has a registration for.
-            dispatch_key_registration = torch._C._dispatch_dump(dispatcher_op_name)
+
             # Internally, we shouldn't be registering meta kernels for any operators that
             # have CompositeImplicitAutograd kernels.
             # Instead, we should be letting those decompositions run, and writing meta kernels
             # only for the base operators.
-            if 'CompositeImplicitAutograd' in dispatch_key_registration:
+            if torch._C._dispatch_has_kernel_for_dispatch_key(dispatcher_op_name, "CompositeImplicitAutograd"):
                 raise RuntimeError(
                     f"We should not register a meta kernel directly to the operator '{name}',"
                     " because it has a CompositeImplicitAutograd kernel in core."

From 1abd76b39a281ec19edf76e570cd9ff709472f4b Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Tue, 25 Oct 2022 04:04:16 +0000
Subject: [PATCH 0124/1922] Add get_guard_expr to symbolic_shapes which returns
 all guards in a single expression (#87665)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87665
Approved by: https://github.com/ezyang, https://github.com/voznesenskym
---
 test/test_dynamic_shapes.py              |  3 +--
 test/test_proxy_tensor.py                |  9 ++++++++
 torch/fx/experimental/symbolic_shapes.py | 29 ++++++++++++++----------
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index 87b1dd9aa8217..b183b6169dd6b 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -327,8 +327,7 @@ def test_guard_int(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 2)
         self.assertEqual(a0.guard_int(), 2)
-        self.assertEqual(str(shape_env.guards[0][0]), "s0")
-        self.assertEqual(shape_env.guards[0][1], 2)
+        self.assertEqual(str(shape_env.guards[0][0]), "Eq(s0, 2)")
 
     @skipIfNoSympy
     def test_int_conversion(self):
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 0092daa77ab49..3c2e818497a48 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -976,6 +976,15 @@ def f(x):
             return x.shape
         self._test_dynamic(f, [(5, 3)], [[(4, 6)]])
 
+    def test_mega_guard(self):
+        def f(a, b):
+            assert a.shape[0] == b.shape[0] * 2
+            assert b.shape[0] == 8
+            return a.cos()
+        fx_g = make_fx(f, tracing_mode="symbolic")(torch.randn(16), torch.randn(8))
+        self.assertExpectedInline(str(fx_g.shape_env.get_guard_expr()), "Eq(s1, 8) & Eq(s0, 2*s1)")
+
+
     def _assert_no_guards(self, fx_g, free_symbols):
         assert _get_free_symbols(fx_g.shape_env) == free_symbols, fx_g.shape_env.var_to_val
         assert len(fx_g.shape_env.get_nontrivial_guards()) == 0, fx_g.shape_env.format_guards()
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 7615e410e2515..29a49b50ba29b 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -439,26 +439,26 @@ def evaluate_guards_for_args(self, *args):
         # and wrap_fake_symbolic
         meta_converter = MetaConverter()
         pytree.tree_map_only(torch.Tensor, partial(meta_converter, shape_env=new_env), args)
-        return all(guard.xreplace(new_env.var_to_val) == value for guard, value, _ in self.guards)
+        return all(guard.xreplace(new_env.var_to_val) for guard, _ in self.guards)
+
+    def get_guard_expr(self):
+        """
+        Returns a sympy expression representing all of the shape env guards.
+
+        NOTE: Does not include implicit 0/1 or duck-shaping guards
+        """
+        return sympy.And(*[guard for guard, _ in self.guards])
 
     def get_nontrivial_guards(self):
-        return [(self.simplify(guard), val) for guard, val, _ in self.guards if self._maybe_evaluate_static(guard) is None]
+        return [self.simplify(guard) for guard, _ in self.guards if self._maybe_evaluate_static(guard) is None]
 
     def format_guards(self, verbose=False):
-        def format_val(guard, val):
-            if val is sympy.true:
-                return str(guard)
-            elif val is sympy.false:
-                return f"Not({guard})"
-            else:
-                return f"Eq({guard}, {val})"
-
         def format_tb(tb):
             if not verbose:
                 return ""
             return f"\n   Guarded at:\n{textwrap.indent(tb, '   ')}"
 
-        return '\n'.join(f" - {format_val(guard, val)}{format_tb(tb)}" for guard, val, tb in self.guards)
+        return '\n'.join(f" - {guard}{format_tb(tb)}" for guard, tb in self.guards)
 
     def get_shape_groups(self):
         shape_groups = collections.defaultdict(list)
@@ -600,5 +600,10 @@ def evaluate_expr(self, expr: "sympy.Expr"):
         # NB: drop two frames; evaluate_expr and the Sym* function that
         # actually called us
         stack = ''.join(traceback.format_list(traceback.extract_stack()[:-2]))
-        self.guards.append((expr, concrete_val, stack))
+        if concrete_val is sympy.true:
+            self.guards.append((expr, stack))
+        elif concrete_val is sympy.false:
+            self.guards.append((sympy.Not(expr), stack))
+        else:
+            self.guards.append((sympy.Eq(expr, concrete_val), stack))
         return concrete_val

From f567c08906d40f7bc5e4add7dc7d52f2e40d3516 Mon Sep 17 00:00:00 2001
From: stumpOS <stumposs12@gmail.com>
Date: Tue, 25 Oct 2022 17:00:27 +0000
Subject: [PATCH 0125/1922] consider numel args when identifying aligned args
 (#87394)

Fixes #ISSUE_NUMBER
https://github.com/pytorch/torchdynamo/issues/1527

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87394
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 82 +++++++++++++++++++++++++++++
 torch/_inductor/codegen/triton.py   |  8 +--
 2 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index c106658b21c0e..bec1ea197c078 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -6,6 +6,7 @@
 import os
 import random
 import sys
+import typing
 import unittest
 import weakref
 from unittest.mock import patch
@@ -4402,6 +4403,87 @@ def forward(pred_objectness_logits_3_: torch.Tensor):
             result = forward(*args)
             assert same(result, torch.sort(args[0], descending=True, dim=1)[0])
 
+    class TritonCodeGenTests(TestCase):
+        from torch._inductor.triton_ops.autotune import CachingAutotuner
+
+        class NoOpCompilerBackend:
+            def __init__(self):
+                self.example_args = None
+                self.model = None
+
+            def noop_backend(
+                self,
+                model_: torch.fx.GraphModule,
+                example_inputs_: typing.List[torch.Tensor],
+            ):
+                """
+                The Noop backend does not compile the fx graph it is given.
+                Instead, it transforms the fx graph so that its functions are
+                aten operations. It then saves this graph.
+                """
+                from functorch._src.aot_autograd import Interpreter
+                from torch._inductor.decomposition import select_decomp_table
+                from torch._subclasses import FakeTensorMode
+
+                fake_mode = FakeTensorMode()
+
+                def interpret(*args, **kwargs):
+                    return Interpreter(model_).run(*args[0:], **kwargs)
+
+                fake_flat_tensor_args = [
+                    fake_mode.from_tensor(x) for x in example_inputs_
+                ]
+                fw_module = make_fx(interpret, select_decomp_table())(
+                    *fake_flat_tensor_args
+                )
+                self.model = fw_module
+                self.example_args = fake_flat_tensor_args
+                return lambda x: example_inputs_
+
+        def get_kernels(self, fn, args) -> typing.List[CachingAutotuner]:
+            from torch._inductor.debug import DebugContext
+            from torch._inductor.graph import GraphLowering
+            from torch._inductor.virtualized import V
+
+            cxt = TritonCodeGenTests.NoOpCompilerBackend()
+            torch._dynamo.optimize(backend=cxt.noop_backend)(fn)(*args)
+            graph = GraphLowering(cxt.model)
+            graph.num_static_inputs = 0
+            kernels = []
+            with V.set_graph_handler(graph), V.set_debug_handler(DebugContext()):
+                graph.run(*(cxt.example_args))
+                mod = graph.compile_to_module()
+                i = 0
+                while True:
+                    attribute = f"kernel{i}"
+                    if not hasattr(mod, attribute):
+                        break
+                    else:
+                        kernels.append(getattr(mod, attribute))
+                        i = i + 1
+            return kernels
+
+        def test_divisibile_by_16_covers_numel_args(self):
+            def fn(a: torch.Tensor) -> torch.Tensor:
+                return torch.sum(a)
+
+            kernels = self.get_kernels(fn, [torch.randn([256, 256], device="cuda")])
+            self.assertTrue(len(kernels) == 2, "SUM should result in two kernels")
+
+            # kernel0 reduces from 256 to (xnumel=8, rnumel=8192), which means it reduces 256 by 256 into an array of
+            # size 8 by accumulating 8192 elements at once note that rnumel is equal to 512 * 16, so rnumel which is
+            # at slot 3 should be in the divisible by 16 descriptor
+            arguments_that_are_divisible_by_16_in_kernel0 = (
+                kernels[0].meta["configs"][0].divisible_by_16
+            )
+            self.assertEqual(arguments_that_are_divisible_by_16_in_kernel0, (0, 1, 3))
+
+            # kernel1 reduces from 8 elements to a single scalar.
+            arguments_that_are_divisible_by_16_in_kernel1 = (
+                kernels[1].meta["configs"][0].divisible_by_16
+            )
+            self.assertEqual(arguments_that_are_divisible_by_16_in_kernel1, (0, 1))
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 832a0e6c82b4c..0ece1a06c9fa0 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -991,15 +991,14 @@ def codegen_kernel(self, name=None):
         triton_meta = {
             "signature": dict(enumerate(map(signature_of, signature))),
             "device": V.graph.scheduler.current_device.index,
-            "configs": [config_of(signature)],
             "constants": {},
         }
 
         for tree in self.range_trees:
             if tree.prefix != "r" or self.inside_reduction:
-                triton_meta["signature"][len(argdefs)] = signature_of(
-                    SizeArg(f"{tree.prefix}numel", tree.numel)
-                )
+                sizearg = SizeArg(f"{tree.prefix}numel", tree.numel)
+                signature.append(sizearg)
+                triton_meta["signature"][len(argdefs)] = signature_of(sizearg)
                 argdefs.append(f"{tree.prefix}numel")
                 # constexpr version causes issues, see
                 # https://github.com/pytorch/torchdynamo/pull/1362
@@ -1007,6 +1006,7 @@ def codegen_kernel(self, name=None):
                 #     tree.numel
                 # )
                 # argdefs.append(f"{tree.prefix}numel: tl.constexpr")
+        triton_meta["configs"] = [config_of(signature)]
 
         for tree in self.range_trees:
             if tree.prefix != "r" or self.inside_reduction:

From 9c72769164a218f404231fb9552dff84216469f5 Mon Sep 17 00:00:00 2001
From: Tugsbayasgalan Manlaibaatar <tmanlaibaatar@fb.com>
Date: Mon, 24 Oct 2022 15:44:46 -0700
Subject: [PATCH 0126/1922] Add named_buffers to torchdynamo nn_module (#87644)

Fixes: https://github.com/pytorch/torchdynamo/issues/1738

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87644
Approved by: https://github.com/jansel
---
 test/dynamo/test_repros.py           | 19 +++++++++++++++++++
 torch/_dynamo/variables/nn_module.py |  7 +++++++
 2 files changed, 26 insertions(+)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 52802f32ad1e8..bbb8ba527fc73 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1736,6 +1736,25 @@ def forward(self, x):
         args = (torch.randn(3, 4),)
         self.assertTrue(same(mod(*args), opt_mod(*args)))
 
+    def test_named_buffers(self):
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.register_buffer("x", torch.ones(3))
+                self.register_buffer("y", torch.ones(3))
+
+            def forward(self, inp):
+                res = 0
+                for name, buffer in self.named_buffers():
+                    res += buffer.sum()
+
+                return inp.cos() + res
+
+        mod = Foo()
+        opt_mod = torch._dynamo.optimize("eager", nopython=True)(mod)
+        args = (torch.randn(3, 4),)
+        self.assertTrue(same(mod(*args), opt_mod(*args)))
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 87a94565e180a..6f7c2ff287373 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -323,6 +323,13 @@ def named_embed(name, obj):
             ):
                 result.append(named_embed(name, param))
             return ListIteratorVariable(result, mutable_local=MutableLocal(), **options)
+        elif name == "named_buffers":
+            result = []
+            for name, buffer in module.named_buffers(
+                **get_kwargs("prefix", "recurse", "remove_duplicate")
+            ):
+                result.append(named_embed(name, buffer))
+            return ListIteratorVariable(result, mutable_local=MutableLocal(), **options)
         elif name == "named_modules":
             result = []
             for name, submod in module.named_modules(

From cf08badfd700da919e89e5459dcaefda6640f2ae Mon Sep 17 00:00:00 2001
From: Valentin Andrei <vandrei@meta.com>
Date: Tue, 25 Oct 2022 17:03:23 +0000
Subject: [PATCH 0127/1922] [pytorch] Layer norm backward speed gain with warp
 shuffles (#87445)

Test Plan:
```
Times below are Forward + Backward on A100

       Size             FP32.   Gain.   FP16.   Gain
        256,   256  	101.30	9%	103.9	6%
        512,   256  	110.10	-4%	102.9	10%
       1024,   256  	104.30	7%	102.4	6%
       2048,   256  	107.60	4%	109.7	0%
       4096,   256  	116.70	8%	109.1	0%
       6144,   256  	106.10	7%	112.8	2%
       8192,   256  	106.10	1%	109.7	2%
        256,   512  	102.10	3%	108.5	1%
        512,   512  	101.50	40%	105.9	4%
       1024,   512  	109.70	20%	109.2	-1%
       2048,   512  	107.40	24%	107.2	1%
       4096,   512  	108.00	6%	110.6	-3%
       6144,   512  	103.90	13%	105.8	7%
       8192,   512  	138.70	14%	105.6	7%
        256,  1024  	106.20	1%	102.9	6%
        512,  1024  	104.50	4%	104.2	3%
       1024,  1024  	126.90	-15%	103.9	10%
       2048,  1024  	127.40	-15%	102.2	6%
       4096,  1024  	117.70	6%	102.8	21%
       6144,  1024  	165.30	11%	112.2	12%
       8192,  1024  	211.90	11%	144.8	13%
        256,  1536  	102.80	11%	103.1	6%
        512,  1536  	103.30	9%	102.9	18%
       1024,  1536  	111.00	-2%	117.2	7%
       2048,  1536  	102.30	12%	132.1	-4%
       4096,  1536  	165.50	5%	112.9	18%
       6144,  1536  	236.60	5%	145.7	12%
       8192,  1536  	307.80	5%	186.1	11%
        256,  2048  	110.60	-1%	103.8	7%
        512,  2048  	105.20	3%	105.6	1%
       1024,  2048  	106.70	3%	114.8	3%
       2048,  2048  	124.90	5%	109.7	0%
       4096,  2048  	231.40	4%	129.9	10%
       6144,  2048  	332.80	4%	182.5	11%
       8192,  2048  	434.60	4%	235.2	11%
        256,  3072  	111.60	8%	110.8	1%
        512,  3072  	106.80	1%	104.6	10%
       1024,  3072  	104.90	3%	109.9	4%
       2048,  3072  	193.80	0%	106.2	10%
       4096,  3072  	364.50	0%	187.8	5%
       6144,  3072  	538.30	0%	267	5%
       8192,  3072  	718.00	-1%	346.7	6%
        256,  4096  	103.60	4%	110.2	-1%
        512,  4096  	131.40	-11%	117	-7%
       1024,  4096  	135.80	1%	104.8	7%
       2048,  4096  	268.20	1%	149.4	10%
       4096,  4096  	520.70	1%	268.5	9%
       6144,  4096  	786.30	0%	389.8	9%
       8192,  4096  	1043.50	0%	509	10%
```

Used the following script from ngimel:

```
import torch
from torch.utils.benchmark import Compare, Timer

results = []
for dtype in (torch.float, torch.half):
    for fs in (256, 512, 1024, 1536, 2048, 3072, 4096):
        for bs in (256, 512, 1024, 2048, 4096, 6144, 8192):
            ln = torch.nn.LayerNorm((fs,), device="cuda", dtype=dtype)
            X = torch.randn(bs, fs, device="cuda", dtype=dtype, requires_grad=True)
            gO = torch.rand_like(X)
            stmtfwd = "ln(X)"
            stmtfwdbwd = "X.grad=None; ln.zero_grad(set_to_none=True); out = ln(X); out.backward(gO)"
            tfwd = Timer(
                stmt=stmtfwd,
                label="ln",
                sub_label=f"{bs:5}, {fs:5}",
                description=f"fwd, {dtype}",
                globals=globals(),
            )
            tfwdbwd = Timer(
                stmt=stmtfwdbwd,
                label="ln",
                sub_label=f"{bs:5}, {fs:5}",
                description=f"fwdbwd, {dtype}",
                globals=globals(),
            )
            for t in (tfwd, tfwdbwd):
                results.append(t.blocked_autorange())
        print(fs, end="\r")
c = Compare(results)
c.print()
```

Differential Revision: D40567574

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87445
Approved by: https://github.com/ngimel
---
 .../src/ATen/native/cuda/layer_norm_kernel.cu | 242 ++++++++++++++----
 1 file changed, 188 insertions(+), 54 deletions(-)

diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
index ae09f0aaad8f8..732545465d9c9 100644
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@@ -33,6 +33,7 @@ namespace {
 
 constexpr int kCUDANumThreads = 256;
 constexpr int kColwiseReduceTileSize = 32;
+constexpr int kWarpSize = 32;
 constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types
 
 // aligned vector generates vectorized load/store on CUDA (copy-pasted from MemoryAccess.cuh)
@@ -555,8 +556,108 @@ __global__ void GammaBetaBackwardCUDAKernel1(
   }
 }
 
+template <typename T, typename T_ACC>
+__global__ void GammaBetaBackwardCUDAKernel_32x32(
+    int64_t M,
+    int64_t N,
+    const T* dY,
+    const T* X,
+    const T_ACC* mean,
+    const T_ACC* rstd,
+    T* dg,
+    T* db) {
+  alignas(sizeof(double)) extern __shared__ char s_data1[];
+  T_ACC* s_data_typed = reinterpret_cast<T_ACC*>(&s_data1);
+  T_ACC* s_dg;
+  T_ACC* s_db;
 
+  T_ACC dg_sum = 0;
+  T_ACC db_sum = 0;
 
+  const int64_t j = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (j < N) {
+    constexpr int unroll_factor = 8;
+    int laneId = threadIdx.x & 0x1f;
+
+    T_ACC mean_reg, mean_reg_tmp;
+    T_ACC rstd_reg, rstd_reg_tmp;
+    T dY_reg;
+    T X_reg;
+
+    // Main loop
+    int bcounter;
+    for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor);
+         bcounter++) {
+      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
+
+      if (laneId < unroll_factor) {
+        mean_reg_tmp = mean[offset + laneId];
+        rstd_reg_tmp = rstd[offset + laneId];
+      }
+#if !defined(USE_ROCM)
+      // Volta and newer architectures allow lane divergence within a warp.
+      __syncwarp();
+#endif
+
+      #pragma unroll
+      for (int ii = 0; ii < unroll_factor; ++ii) {
+        dY_reg = dY[(offset + ii) * N + j];
+        X_reg = X[(offset + ii) * N + j];
+        mean_reg = WARP_SHFL(mean_reg_tmp, ii, kWarpSize);
+        rstd_reg = WARP_SHFL(rstd_reg_tmp, ii, kWarpSize);
+        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
+        db_sum += dY_reg;
+      }
+    }
+
+    // Remainder loop
+    int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
+    for (int ii = 0; ii < unroll_factor; ii++) {
+      if ((offset + ii) < M) {
+        mean_reg = mean[offset + ii];
+        rstd_reg = rstd[offset + ii];
+        dY_reg = dY[(offset + ii) * N + j];
+        X_reg = X[(offset + ii) * N + j];
+        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
+        db_sum += dY_reg;
+      }
+    }
+
+    // This kernel uses a block of (32 x 32) and gets called when M; N
+    // divide by 32. We can use warp shuffles for the final reduction
+    // step. This removes 4 shmem loads and stores with their
+    // corresponding __syncthreads()
+
+    // This greatly reduces bank conflicts at the expense of a little
+    // extra shared memory. It does not impact occupancy
+    int padded_bx = (1 + blockDim.x);
+
+    s_dg = s_data_typed;
+    s_db = s_data_typed + (padded_bx * blockDim.y);
+    s_dg[threadIdx.y * padded_bx + threadIdx.x] = dg_sum;
+    s_db[threadIdx.y * padded_bx + threadIdx.x] = db_sum;
+    __syncthreads();
+
+    // Load transposed so that a warp holds an entire column
+    T_ACC reg_dg = s_dg[threadIdx.x * padded_bx + threadIdx.y];
+    T_ACC reg_db = s_db[threadIdx.x * padded_bx + threadIdx.y];
+    for (int delta = 16; delta >= 1; delta /= 2) {
+      reg_dg += WARP_SHFL_XOR(reg_dg, delta, kWarpSize);
+      reg_db += WARP_SHFL_XOR(reg_db, delta, kWarpSize);
+    }
+
+    if (threadIdx.x == 0) {
+      const int64_t j = blockIdx.x * blockDim.x + threadIdx.y;
+      if (dg) {
+        dg[j] = reg_dg;
+      }
+      if (db) {
+        db[j] = reg_db;
+      }
+    }
+  }
+}
 
 template <typename T, typename T_ACC>
 __global__ void GammaBetaBackwardCUDAKernel(
@@ -569,66 +670,75 @@ __global__ void GammaBetaBackwardCUDAKernel(
     T* dg,
     T* db) {
   alignas(sizeof(double)) extern __shared__ char s_data1[];
-  T_ACC * s_data_typed = reinterpret_cast<T_ACC*>(&s_data1);
+  T_ACC* s_data_typed = reinterpret_cast<T_ACC*>(&s_data1);
+  T_ACC* s_dg;
+  T_ACC* s_db;
+
   const int64_t j = blockIdx.x * blockDim.x + threadIdx.x;
-  constexpr int unroll = 8;
-  T dYs[unroll];
-  T Xs[unroll];
-  T_ACC *  means = s_data_typed;
-  T_ACC * rstds = s_data_typed + unroll * blockDim.y;
+
   T_ACC dg_sum = 0;
   T_ACC db_sum = 0;
+
   if (j < N) {
+    constexpr int unroll_factor = 8;
+
+    T_ACC mean_reg;
+    T_ACC rstd_reg;
+    T dY_reg;
+    T X_reg;
+
+    // Main Loop
     int bcounter;
-    for (bcounter = 0; bcounter < M/(blockDim.y * unroll); bcounter++){
-      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll;
-      #pragma unroll
-      for (int ii=0; ii<unroll; ii++){
-        if (threadIdx.x == 0) {
-          means[ii*blockDim.y + threadIdx.y] = mean[offset + ii];
-          rstds[ii*blockDim.y + threadIdx.y] = rstd[offset + ii];
-        }
-        dYs[ii] = dY[(offset + ii) * N + j ];
-        Xs[ii] = X[(offset + ii) * N + j];
+    for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor); bcounter++){
+      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
 
-      }
-      __syncthreads();
       #pragma unroll
-      for (int ii=0; ii<unroll; ii++){
-        dg_sum += dYs[ii] * (Xs[ii] - means[ii*blockDim.y + threadIdx.y]) * rstds[ii * blockDim.y + threadIdx.y];
-        db_sum += dYs[ii];
+      for (int ii = 0; ii < unroll_factor; ++ii) {
+        dY_reg = dY[(offset + ii) * N + j];
+        X_reg = X[(offset + ii) * N + j];
+        mean_reg = mean[offset + ii];
+        rstd_reg = rstd[offset + ii];
+        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
+        db_sum += dY_reg;
       }
-      __syncthreads();
     }
-    int offset = (bcounter * blockDim.y + threadIdx.y) * unroll;
-    for (int ii = 0; ii<8; ii++ ){
-      T_ACC mean_val, rstd_val; // we don't use smem in the tail to avoid awkward synchronizations, perf penalty is negligible
+
+    // Remainder loop
+    int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
+    for (int ii = 0; ii < unroll_factor; ii++ ){
       if ((offset + ii) < M) {
-        mean_val = mean[offset+ii];
-        rstd_val = rstd[offset+ii];
-        dYs[0] = dY[(offset + ii) * N + j ];
-        Xs[0] = X[(offset + ii) * N + j];
-        dg_sum += dYs[0] * (Xs[0] - mean_val) * rstd_val;
-        db_sum += dYs[0];
+        dY_reg = dY[(offset + ii) * N + j ];
+        X_reg = X[(offset + ii) * N + j];
+        mean_reg = mean[offset + ii];
+        rstd_reg = rstd[offset + ii];
+        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
+        db_sum += dY_reg;
       }
     }
-    s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] = dg_sum;
-    s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] = db_sum;
+
+    // Do the final reduction in shared memory
+    s_dg = s_data_typed;
+    s_db = s_data_typed + blockDim.x * blockDim.y;
+    s_dg[threadIdx.y * blockDim.x + threadIdx.x] = dg_sum;
+    s_db[threadIdx.y * blockDim.x + threadIdx.x] = db_sum;
     __syncthreads();
-    for (int offset = blockDim.y/2; offset >=1; offset /= 2){
+
+    for (int offset = blockDim.y / 2; offset >= 1; offset /= 2) {
       if (threadIdx.y < offset) {
-        s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] += s_data_typed[(threadIdx.y + offset) * blockDim.x + threadIdx.x];
-        s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] +=
-        s_data_typed[blockDim.x * blockDim.y + (threadIdx.y + offset) * blockDim.x + threadIdx.x];
-      }
+        s_dg[threadIdx.y * blockDim.x + threadIdx.x] +=
+            s_dg[(threadIdx.y + offset) * blockDim.x + threadIdx.x];
+        s_db[threadIdx.y * blockDim.x + threadIdx.x] +=
+            s_db[(threadIdx.y + offset) * blockDim.x + threadIdx.x];
+        }
       __syncthreads();
     }
+
     if (threadIdx.y == 0) {
       if (dg) {
-        dg[j] = s_data_typed[threadIdx.x];
+        dg[j] = s_dg[threadIdx.x];
       }
       if (db) {
-        db[j] = s_data_typed[threadIdx.x + blockDim.x * blockDim.y];
+        db[j] = s_db[threadIdx.x];
       }
     }
   }
@@ -763,7 +873,8 @@ void LayerNormBackwardKernelImplInternal(
     T* dgamma_data =
         dgamma->defined() ? dgamma->template data_ptr<T>() : nullptr;
     T* dbeta_data = dbeta->defined() ? dbeta->template data_ptr<T>() : nullptr;
-    if (M < 512) {
+
+    if (M < 128) {
       // For small batch size, do colwise reduce directly.
       const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads;
       GammaBetaBackwardSimpleCUDAKernel<T, T_ACC>
@@ -778,19 +889,42 @@ void LayerNormBackwardKernelImplInternal(
               dbeta_data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else {
-      dim3 threads{16, 32};
-      int blocks = (N + threads.x-1)/threads.x;
-      GammaBetaBackwardCUDAKernel<T, T_ACC>
-          <<<blocks, threads, 2 * sizeof(T_ACC) * threads.x * threads.y, cuda_stream>>>(
-              M,
-              N,
-              dY_data,
-              X_data,
-              mean_data,
-              rstd_data,
-              dgamma_data,
-              dbeta_data);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
+      if ((M % kWarpSize == 0) && (N % kWarpSize == 0)) {
+        // This implementation relies on warp primitives and requires that M and N divide
+        // exactly to warp size.
+        dim3 threads{kWarpSize, kWarpSize};
+        int blocks = (N + threads.x - 1) / threads.x;
+
+        // If M and N divide by 32, we can use warp shuffles for the final reduction. That requires
+        // transposing values in shared memory, so we apply a padding to reduce bank conflicts.
+        size_t shmem_sz = 2 * sizeof(T_ACC) * (threads.x + 1) * threads.y;
+        GammaBetaBackwardCUDAKernel_32x32<T, T_ACC>
+            <<<blocks, threads, shmem_sz, cuda_stream>>>(
+                M,
+                N,
+                dY_data,
+                X_data,
+                mean_data,
+                rstd_data,
+                dgamma_data,
+                dbeta_data);
+          C10_CUDA_KERNEL_LAUNCH_CHECK();
+      } else {
+        dim3 threads{16, 32};
+        int blocks = (N + threads.x - 1) / threads.x;
+        size_t shmem_sz = 2 * sizeof(T_ACC) * threads.x * threads.y;
+        GammaBetaBackwardCUDAKernel<T, T_ACC>
+            <<<blocks, threads, shmem_sz, cuda_stream>>>(
+                M,
+                N,
+                dY_data,
+                X_data,
+                mean_data,
+                rstd_data,
+                dgamma_data,
+                dbeta_data);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+      }
     }
   }
 }

From b45e93f42090066e9d6ac9d6145aa3d36f173d6d Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Tue, 25 Oct 2022 02:35:41 +0000
Subject: [PATCH 0128/1922] Graph-break on FSDP in dynamo (#87420)

Why we want to graph-break FSDP
- FSDP has communication ops during forward and backward which we currently can't trace into the graph but also want to ensure are overlapped with compute
- dynamo has issues tracing into or capturing a call to fsdp module without a break (see below)

How we graph-break on FSDP
- marking FSDP.forward code as skip means the code frames will graph-break; but in this case all of torch.* is listed in skipfiles.py anyway, so this is taken care of
- disallowing the FSDP module prevents dynamo trying to record a 'call_module(FSDPmodule)' node into a graph, which happens earlier than the graphbreak that would be caused by skip, and causes additional issues: dynamo deepcopies modules before call-module handling, and FSDP module isn't trivially deep-copyable

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87420
Approved by: https://github.com/aazzolini
---
 torch/_dynamo/allowed_functions.py | 19 +++++++++++++++++++
 torch/_dynamo/skipfiles.py         | 16 ++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/torch/_dynamo/allowed_functions.py b/torch/_dynamo/allowed_functions.py
index 42a6580ac1c86..67daafc5adac7 100644
--- a/torch/_dynamo/allowed_functions.py
+++ b/torch/_dynamo/allowed_functions.py
@@ -18,6 +18,24 @@
 from . import config
 from .utils import is_safe_constant
 
+"""
+A note on allowed functions:
+
+Dynamo consults this file to determine if a particular function/module
+is allowed to appear as a node in its fx output.
+
+If a function is disallowed, it may either be traced-through, or skipped.
+
+Trace-through means dynamo will continue to trace the interior code for
+the function/module rather than stopping at its boundary and recording it
+as a node in the fx graph. Whether tracing through or allowing, the functionality
+of the function/module is part of the dynamo graph.  Caveat: if tracing through,
+any interior operation could trigger its own graph-break.
+
+Skips are determined by (torch/_dynamo/skipfiles.py) - see "a note on
+skipfiles" there.
+"""
+
 
 def make_function_id_set(lazy_initializer):
     """
@@ -130,6 +148,7 @@ def _is_allowed_module_prefix(obj):
             "torch._inductor.",
             "torch._C.inductor.",
             "torch.fx.",
+            "torch.distributed.fsdp.",
         )
         allowed_modules_dot = tuple([x + "." for x in allowed_modules])
         module = inspect.getmodule(obj)
diff --git a/torch/_dynamo/skipfiles.py b/torch/_dynamo/skipfiles.py
index 2b6fbb3959c8d..ee2ad3f9395ff 100644
--- a/torch/_dynamo/skipfiles.py
+++ b/torch/_dynamo/skipfiles.py
@@ -49,6 +49,22 @@
 
 from . import config
 
+"""
+A note on skipfiles:
+
+Dynamo consults this file to determine whether code should be compiled or skipped.
+
+A skip applies at the frame boundary, meaning dynamo either triggers a graph break
+at the beginning of the frame or attempts to trace the whole frame.  When skipping
+a frame, recursively called frames are still traced by dynamo unless also skipped.
+
+Skipfiles (skipped at the file level instead of function level) still apply on a
+frame-by-frame boundary as dynamo traces, but apply to all functions in that file.
+
+@skip is a helper decorator that can be applied to your function to cause it to be
+included here.
+"""
+
 
 def _strip_init_py(s):
     return re.sub(r"__init__.py$", "", s)

From dc2b01a61c79bdcdc82297b4f08822690fd86251 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Tue, 25 Oct 2022 17:34:29 +0000
Subject: [PATCH 0129/1922] Disable test_inductor_timm_shard (#87710)

Summary: tests are flaky. Need more time for investigation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87710
Approved by: https://github.com/anijain2305, https://github.com/malfet
---
 .github/workflows/inductor.yml | 12 ++----------
 .jenkins/pytorch/test.sh       | 13 +++----------
 2 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index da27466b60e90..e6a79e2a738d8 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -22,16 +22,8 @@ jobs:
       cuda-arch-list: 8.6
       test-matrix: |
         { include: [
-          { config: "inductor", shard: 1, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 2, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 3, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 4, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 5, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 6, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 7, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 8, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 9, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 10, num_shards: 10, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
         ]}
 
   linux-bionic-cuda11_6-py3_10-gcc7-inductor-test:
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index b263c1949c10f..a1381a5c75957 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -738,24 +738,17 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHAR
   install_filelock
   install_triton
   test_dynamo_shard 2
-elif [[ "${TEST_CONFIG}" == *inductor* && $SHARD_NUMBER -lt 9 && $NUM_TEST_SHARDS -gt 1 ]]; then
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
   install_filelock
   install_triton
-  install_timm
-  id=$((SHARD_NUMBER-1))
-  test_inductor_timm_shard $id
-elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 9 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  test_inductor
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
   install_filelock
   install_triton
   install_huggingface
   test_inductor_huggingface_shard 0
-elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 10 && $NUM_TEST_SHARDS -gt 1 ]]; then
-  install_torchvision
-  install_filelock
-  install_triton
-  test_inductor
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_without_numpy
   install_torchvision

From e6e2f7b3a944cf161d98711b0fbecb44ca7cc435 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Tue, 25 Oct 2022 15:55:31 +0000
Subject: [PATCH 0130/1922] [ONNX] Support quantized::conv1d_relu (#85997)

According to #38248, quantized::conv1d_relu shares packing parameters with Conv2D (kspatialDim is also 2), and needs a different unpacking way. Therefore, a new `QuantizedParamsType=Conv1D` is used to differentiate the two, and has to extract 1D information from 2D packed parameters.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85997
Approved by: https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_onnxruntime.py    | 14 ++++++
 .../passes/onnx/unpack_quantized_weights.cpp  | 47 ++++++++++++++-----
 torch/onnx/symbolic_opset10.py                | 26 ++++++++++
 3 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index e917e44ce21bd..bc70011b78871 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -11834,6 +11834,20 @@ def test_quantized_conv2d_relu(self):
         q_input = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8)
         self.run_test(model, q_input)
 
+    @skipIfUnsupportedMinOpsetVersion(10)
+    def test_quantized_conv1d_relu(self):
+        model = torch.nn.intrinsic.quantized.ConvReLU1d(16, 33, 3, stride=2)
+        # Manually initialize model weight and bias to random numbers.
+        # By default all zeros.
+        q_weight = torch.quantize_per_tensor(
+            torch.randn(33, 16, 3), 0.5, 0, torch.qint8
+        )
+        bias = torch.arange(33).to(torch.float) - 16
+        model.set_weight_bias(q_weight, bias)
+        input = torch.randn(3, 16, 32)
+        q_input = torch.quantize_per_tensor(input, 0.5, 128, torch.quint8)
+        self.run_test(model, q_input)
+
     @common_utils.parametrize(
         "function_or_module",
         [
diff --git a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp
index f5a50e76fcae4..300e3452a8d17 100644
--- a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp
+++ b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp
@@ -299,7 +299,10 @@ void ConvertQuantizedWeight(
   }
 }
 
-enum class QuantizedParamsType { CONV, LINEAR };
+// CONV1D needs a different unpacking from CONV, since it's
+// packed as CONV2D intentionally at the first place.
+// See: https://github.com/pytorch/pytorch/pull/38248
+enum class QuantizedParamsType { CONV1D, CONV, LINEAR };
 
 // This is called before the onnx pass. Using pattern matching we
 // find the relevant nodes and extract the packed_params. The packed_params are
@@ -413,7 +416,8 @@ void unpackQuantizedWeightsHelper(
         groups = groups_int;
         transpose = transpose_int;
       } else if (
-          params_type == QuantizedParamsType::CONV &&
+          (params_type == QuantizedParamsType::CONV ||
+           params_type == QuantizedParamsType::CONV1D) &&
           ser_tup->elements()[0].isString()) {
         const auto& elements = ser_tup->elements();
         auto version = elements[0].toStringRef();
@@ -426,25 +430,32 @@ void unpackQuantizedWeightsHelper(
         const int64_t kSpatialDim = conv_params_packed[0].item<int64_t>();
         // skip kSpatialDim
         int64_t idx = 1;
+        // kSpatialDim = 2 even it's for Conv1D from torch.op to adopt Conv2D,
+        // so we need a special unpack for Conv1D which has Conv2D dim.
+        // See: https://github.com/pytorch/pytorch/pull/38248
         for (const auto i : c10::irange(kSpatialDim)) {
-          (void)i; // Suppress unused variable warning
-          stride_int.emplace_back(conv_params_packed[idx].item<int64_t>());
+          if (params_type != QuantizedParamsType::CONV1D || i != 0) {
+            stride_int.emplace_back(conv_params_packed[idx].item<int64_t>());
+          }
           idx++;
         }
         for (const auto i : c10::irange(kSpatialDim)) {
-          (void)i; // Suppress unused variable warning
-          padding_int.emplace_back(conv_params_packed[idx].item<int64_t>());
+          if (params_type != QuantizedParamsType::CONV1D || i != 0) {
+            padding_int.emplace_back(conv_params_packed[idx].item<int64_t>());
+          }
           idx++;
         }
         for (const auto i : c10::irange(kSpatialDim)) {
-          (void)i; // Suppress unused variable warning
-          dilation_int.emplace_back(conv_params_packed[idx].item<int64_t>());
+          if (params_type != QuantizedParamsType::CONV1D || i != 0) {
+            dilation_int.emplace_back(conv_params_packed[idx].item<int64_t>());
+          }
           idx++;
         }
         for (const auto i : c10::irange(kSpatialDim)) {
-          (void)i; // Suppress unused variable warning
-          output_padding_int.emplace_back(
-              conv_params_packed[idx].item<int64_t>());
+          if (params_type != QuantizedParamsType::CONV1D || i != 0) {
+            output_padding_int.emplace_back(
+                conv_params_packed[idx].item<int64_t>());
+          }
           idx++;
         }
         groups_int = conv_params_packed[idx].item<int64_t>();
@@ -461,6 +472,9 @@ void unpackQuantizedWeightsHelper(
         torch::List<c10::IValue> optional = elements[2].toList();
         bias = optional.get(0).toOptional<at::Tensor>();
 
+        if (params_type == QuantizedParamsType::CONV1D) {
+          unpacked_weight = unpacked_weight.squeeze_(2);
+        }
         stride = stride_int;
         padding = padding_int;
         dilation = dilation_int;
@@ -638,6 +652,10 @@ void UnpackQuantizedWeights(
   graph(%input, %packed_weight, %w_scale, %w_zero_point):
         %r = quantized::linear(%input, %packed_weight, %w_scale, %w_zero_point)
         return (%r) )";
+  std::string qconv1d_relu = R"(
+  graph(%input, %packed_params, %scale, %zero_point):
+        %r = quantized::conv1d_relu(%input, %packed_params, %scale, %zero_point)
+        return (%r) )";
   std::string qconv2d = R"(
   graph(%input, %packed_params, %scale, %zero_point):
         %r = quantized::conv2d(%input, %packed_params, %scale, %zero_point)
@@ -668,6 +686,13 @@ void UnpackQuantizedWeights(
       "quantized::conv2d_unpack",
       QuantizedParamsType::CONV,
       caffe2);
+  unpackQuantizedWeightsHelper(
+      graph,
+      paramsDict,
+      qconv1d_relu,
+      "quantized::conv1d_unpack",
+      QuantizedParamsType::CONV1D,
+      caffe2);
   unpackQuantizedWeightsHelper(
       graph,
       paramsDict,
diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py
index f88e1fe797c8b..bc04db1f37f59 100644
--- a/torch/onnx/symbolic_opset10.py
+++ b/torch/onnx/symbolic_opset10.py
@@ -41,6 +41,7 @@
     "quantized_add_relu",
     "quantized_add",
     "quantized_cat",
+    "quantized_conv1d_relu",
     "quantized_conv2d_relu",
     "quantized_conv2d",
     "quantized_group_norm",
@@ -826,6 +827,31 @@ def quantized_instance_norm(
     return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point)
 
 
+@_onnx_symbolic("quantized::conv1d_relu")
+@_beartype.beartype
+def quantized_conv1d_relu(
+    g: jit_utils.GraphContext,
+    q_input,
+    q_weight,
+    bias,
+    stride,
+    padding,
+    dilation,
+    groups,
+    op_scale,
+    op_zero_point,
+):
+    input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input)
+    weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight)
+    q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale)
+    bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias)
+
+    output = opset9.conv1d(g, input, weight, bias, stride, padding, dilation, groups)
+    output = opset9.relu(g, output)
+
+    return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point)
+
+
 @_onnx_symbolic("quantized::conv2d_relu")
 @_beartype.beartype
 def quantized_conv2d_relu(

From 90f2a0675836804b5dff41ea1a1257e336c8d794 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Tue, 25 Oct 2022 15:52:17 +0000
Subject: [PATCH 0131/1922] [ONNX] replace AT_ASSERT with
 TORCH_INTERTNAL_ASSERT take 2 (#86405)

Address the AT_ASSERT in torch/jit/csrc/serialization (ONNX related).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86405
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 torch/csrc/jit/passes/onnx.cpp                | 18 ++++++-------
 .../pattern_conversion/pattern_conversion.cpp | 10 ++++---
 torch/csrc/jit/serialization/export.cpp       | 27 ++++++++++---------
 3 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp
index 98f3cb42aea0f..f5e948b2cacf3 100644
--- a/torch/csrc/jit/passes/onnx.cpp
+++ b/torch/csrc/jit/passes/onnx.cpp
@@ -59,13 +59,13 @@ void checkONNXCompatibility(const c10::FunctionSchema& schema) {
     if (type->kind() == TypeKind::OptionalType) {
       type = reinterpret_cast<OptionalType*>(type.get())->getElementType();
       // recursive optional type is not supported
-      AT_ASSERT(type->kind() != TypeKind::OptionalType);
+      TORCH_INTERNAL_ASSERT(type->kind() != TypeKind::OptionalType);
     }
     if (type->kind() == TypeKind::ListType) {
       const auto& elem_type =
           reinterpret_cast<ListType*>(type.get())->getElementType();
       if (elem_type->isSubtypeOf(*TensorType::get())) {
-        AT_ASSERTM(
+        TORCH_INTERNAL_ASSERT(
             !has_tensor_list,
             "ONNX export supports at most one TensorList as input.");
         has_tensor_list = true;
@@ -92,7 +92,7 @@ void preprocessCaffe2Ops(Block* block) {
       size_t origin_inputs_index = 0;
       for (const auto& arg : args) {
         auto type = arg.type();
-        AT_ASSERT(origin_inputs_index < origin_inputs.size());
+        TORCH_INTERNAL_ASSERT(origin_inputs_index < origin_inputs.size());
         const auto& origin_input = origin_inputs[origin_inputs_index++];
         if (type->kind() == TypeKind::OptionalType &&
             origin_input->mustBeNone()) {
@@ -104,24 +104,24 @@ void preprocessCaffe2Ops(Block* block) {
             type->kind() == TypeKind::BoolType ||
             type->kind() == TypeKind::IntType) {
           const auto* constant_node = origin_input->node();
-          AT_ASSERT(constant_node->kind() == prim::Constant);
+          TORCH_INTERNAL_ASSERT(constant_node->kind() == prim::Constant);
           it->i_(Symbol::attr(arg.name()), constant_node->i(attr::value));
         } else if (type->kind() == TypeKind::FloatType) {
           const auto* constant_node = origin_input->node();
-          AT_ASSERT(constant_node->kind() == prim::Constant);
+          TORCH_INTERNAL_ASSERT(constant_node->kind() == prim::Constant);
           it->f_(Symbol::attr(arg.name()), constant_node->f(attr::value));
         } else if (type->kind() == TypeKind::StringType) {
           const auto* constant_node = origin_input->node();
-          AT_ASSERT(constant_node->kind() == prim::Constant);
+          TORCH_INTERNAL_ASSERT(constant_node->kind() == prim::Constant);
           it->s_(Symbol::attr(arg.name()), constant_node->s(attr::value));
         } else if (type->kind() == TypeKind::ListType) {
           const auto& list_node = origin_input->node();
           const auto& elem_type = type->castRaw<ListType>()->getElementType();
-          AT_ASSERT(
+          TORCH_INTERNAL_ASSERT(
               list_node->kind() == prim::ListConstruct ||
               list_node->kind() == prim::Constant);
           if (elem_type->isSubtypeOf(*TensorType::get())) {
-            AT_ASSERT(list_node->kind(), prim::ListConstruct);
+            TORCH_INTERNAL_ASSERT(list_node->kind(), prim::ListConstruct);
             const auto& tensor_list = origin_input->node()->inputs();
             for (const auto& t : tensor_list) {
               it->addInput(t);
@@ -131,7 +131,7 @@ void preprocessCaffe2Ops(Block* block) {
             if (list_node->kind() == prim::ListConstruct) {
               for (const auto* elem_input : list_node->inputs()) {
                 const auto* constant_node = elem_input->node();
-                AT_ASSERT(constant_node->kind() == prim::Constant);
+                TORCH_INTERNAL_ASSERT(constant_node->kind() == prim::Constant);
                 values.push_back(constant_node->f(attr::value));
               }
             } else { // is a constant list
diff --git a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp
index 2280ea6eb30bb..d93e34f87c6e9 100644
--- a/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp
+++ b/torch/csrc/jit/passes/onnx/pattern_conversion/pattern_conversion.cpp
@@ -146,8 +146,8 @@ std::unordered_map<int64_t, ConvertedIndex> MergeSliceAndSelectToIndices(
           std::forward_as_tuple(index_tensor, aten::select));
       dim_offset++;
     } else {
-      AT_ERROR(
-          "Unexpected node kind ",
+      TORCH_CHECK(
+          false,
           node->kind().toDisplayString(),
           " Expected aten::slice or aten::select.");
     }
@@ -202,7 +202,8 @@ std::vector<Value*> ReshapeToAdvancedIndexingFormat(
 
   if (((max_index_dim - min_index_dim + 1) != tensor_ind_count) &&
       tensor_ind_count != 0) {
-    AT_ERROR(
+    TORCH_CHECK(
+        false,
         "Only consecutive 1-d tensor indices are supported in exporting aten::index_put to ONNX.",
         "Check https://pytorch.org/docs/stable/onnx.html#indexing for details");
   }
@@ -230,7 +231,8 @@ std::vector<Value*> ReshapeToAdvancedIndexingFormat(
         break;
       }
       default:
-        AT_ERROR("Unexpected node kind ", index_i->second.orig_node_kind);
+        TORCH_CHECK(
+            false, "Unexpected node kind ", index_i->second.orig_node_kind);
     }
 
     if (ind_size != 1) {
diff --git a/torch/csrc/jit/serialization/export.cpp b/torch/csrc/jit/serialization/export.cpp
index 2f178addda955..f5f5ab7c99088 100644
--- a/torch/csrc/jit/serialization/export.cpp
+++ b/torch/csrc/jit/serialization/export.cpp
@@ -729,7 +729,7 @@ void GraphEncoder::EncodeBlock(
     bool add_node_names,
     bool use_external_data_format,
     const std::string& onnx_file_path) {
-  AT_ASSERT(graph_proto != nullptr);
+  TORCH_INTERNAL_ASSERT(graph_proto != nullptr);
   std::string block_name = "torch_jit";
   if (num_blocks_) {
     block_name += std::to_string(num_blocks_);
@@ -806,7 +806,7 @@ void GraphEncoder::AddInitializersIntoGraphProto(
     const std::map<std::string, at::Tensor>& initializers,
     bool use_external_data_format,
     const std::string& onnx_file_path) {
-  AT_ASSERT(block->inputs().size() >= initializers.size());
+  TORCH_INTERNAL_ASSERT(block->inputs().size() >= initializers.size());
   for (auto input : block->inputs()) {
     auto name_tensor_pair = initializers.find(input->debugName());
     if (name_tensor_pair == initializers.end()) {
@@ -888,7 +888,7 @@ void GraphEncoder::EncodeNode(
     node_proto->set_domain(domain);
   }
   if (operator_export_type_ == onnx_torch::OperatorExportTypes::ONNX) {
-    AT_ASSERT(
+    TORCH_INTERNAL_ASSERT(
         !node->kind().is_aten() && !node->kind().is_prim() &&
         !node->kind().is_attr());
   }
@@ -923,7 +923,7 @@ void GraphEncoder::EncodeNode(
         node_proto, node, attr_name, use_external_data_format, onnx_file_path);
   }
   if (node->kind() == ::c10::onnx::Loop) {
-    AT_ASSERT(node->blocks().size() == 1);
+    TORCH_INTERNAL_ASSERT(node->blocks().size() == 1);
 
     auto body = node_proto->add_attribute();
     body->set_name("body");
@@ -940,7 +940,7 @@ void GraphEncoder::EncodeNode(
         onnx_file_path);
   }
   if (node->kind() == ::c10::onnx::If) {
-    AT_ASSERT(node->blocks().size() == 2);
+    TORCH_INTERNAL_ASSERT(node->blocks().size() == 2);
 
     auto then_branch = node_proto->add_attribute();
     then_branch->set_name("then_branch");
@@ -978,7 +978,7 @@ void GraphEncoder::AddAttribute(
     const std::string& ref_attr_name,
     const AttributeKind attr_kind) {
   auto attr = node_proto->add_attribute();
-  AT_ASSERT(name.is_attr());
+  TORCH_INTERNAL_ASSERT(name.is_attr());
   attr->set_name(name.toUnqualString());
   attr->set_ref_attr_name(ref_attr_name);
   attr->set_type(ATenAttributeKindToOnnxAttributeType(attr_kind, name));
@@ -1009,7 +1009,7 @@ void GraphEncoder::AddAttribute(
   };
 
   auto attr = node_proto->add_attribute();
-  AT_ASSERT(name.is_attr());
+  TORCH_INTERNAL_ASSERT(name.is_attr());
   attr->set_name(name.toUnqualString());
   attr->set_type(
       ATenAttributeKindToOnnxAttributeType(node->kindOf(name), name));
@@ -1236,7 +1236,7 @@ void GraphEncoder::EncodeTensor(
   // or use_external_data_format should be true, not both at the same time. They
   // can both be false at the same time (for ONNX export for regular model
   // size).
-  AT_ASSERT(
+  TORCH_INTERNAL_ASSERT(
       !((defer_weight_export_ && external_ref) && use_external_data_format));
   // Add a buffer to the raw_data_export_map for the caller to dump into an
   // external data store. If external_ref is not specified, we instead dump
@@ -1244,18 +1244,19 @@ void GraphEncoder::EncodeTensor(
   if (defer_weight_export_ && external_ref) {
     // For now, we use the name of the tensor as the external lookup name to
     // avoid ONNX protobuf changes.
-    AT_ASSERT(external_ref.value() == tensor_proto->name());
-    AT_ASSERT(raw_data_export_map_.count(external_ref.value()) == 0);
+    TORCH_INTERNAL_ASSERT(external_ref.value() == tensor_proto->name());
+    TORCH_INTERNAL_ASSERT(
+        raw_data_export_map_.count(external_ref.value()) == 0);
     raw_data_export_map_[external_ref.value()] = t;
     tensor_proto->set_raw_data("__EXTERNAL");
   } else {
-    AT_ASSERT(t.is_contiguous());
+    TORCH_INTERNAL_ASSERT(t.is_contiguous());
     size_t tensorSize = static_cast<size_t>(c10::multiply_integers(
         std::begin(tensor.sizes()), std::end(tensor.sizes())));
     if (use_external_data_format &&
         tensorSize > ParamSizeThresholdForExternalStorage) {
-      AT_ASSERT(!onnx_file_path.empty());
-      AT_ASSERT(tensor_proto->has_name());
+      TORCH_INTERNAL_ASSERT(!onnx_file_path.empty());
+      TORCH_INTERNAL_ASSERT(tensor_proto->has_name());
       auto tensorName = GetExternalFileName(tensor_proto->name());
       CreateExternalFile(t, tensorName, onnx_file_path);
       onnx::StringStringEntryProto* location =

From 28a8d2889874c61053eaa77ba95d2b9ccc3cae7b Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Tue, 25 Oct 2022 15:00:39 +0000
Subject: [PATCH 0132/1922] Add prepend argument to nn.Module hooks (#87370)

cc @ezyang @gchanan
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87370
Approved by: https://github.com/soulitzer
---
 test/nn/test_module_hooks.py              | 196 ++++++++++++++++++++++
 torch/distributed/nn/api/remote_module.py |  12 +-
 torch/nn/modules/module.py                |  68 +++++++-
 3 files changed, 270 insertions(+), 6 deletions(-)
 create mode 100644 test/nn/test_module_hooks.py

diff --git a/test/nn/test_module_hooks.py b/test/nn/test_module_hooks.py
new file mode 100644
index 0000000000000..5fe984c2bd6a7
--- /dev/null
+++ b/test/nn/test_module_hooks.py
@@ -0,0 +1,196 @@
+# Owner(s): ["module: nn"]
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+    skipIfTorchDynamo,
+)
+
+import torch
+import torch.nn as nn
+
+from functools import partial
+from typing import List, Tuple
+
+
+class Net(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.seq1 = nn.Sequential(*[nn.Linear(10, 10) for _ in range(2)])
+        self.seq2 = nn.Sequential(*[nn.Linear(10, 10) for _ in range(2)])
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.seq2(self.seq1(x))
+
+
+class ToyModel(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.net1 = Net()
+        self.net2 = Net()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.net2(self.net1(x))
+
+
+def forward_hook(
+    self: TestCase,
+    fired_hooks: List[int],
+    expected_module: nn.Module,
+    hook_id: int,
+    module: nn.Module,
+    inp: Tuple[torch.Tensor],
+    out: torch.Tensor,
+) -> None:
+    fired_hooks.append(hook_id)
+    self.assertEqual(id(module), id(expected_module))
+    self.assertEqual(len(inp), 1)
+
+
+def forward_pre_hook(
+    self: TestCase,
+    fired_hooks: List[int],
+    expected_module: nn.Module,
+    hook_id: int,
+    module: nn.Module,
+    inp: Tuple[torch.Tensor],
+) -> None:
+    fired_hooks.append(hook_id)
+    self.assertEqual(id(module), id(expected_module))
+    self.assertEqual(len(inp), 1)
+
+
+def full_backward_hook(
+    self: TestCase,
+    fired_hooks: List[int],
+    expected_module: nn.Module,
+    hook_id: int,
+    module: nn.Module,
+    grad_input: Tuple[torch.Tensor],
+    grad_output: Tuple[torch.Tensor],
+) -> None:
+    fired_hooks.append(hook_id)
+    self.assertEqual(id(module), id(expected_module))
+    self.assertEqual(len(grad_input), 1)
+    self.assertEqual(len(grad_output), 1)
+
+
+def full_backward_pre_hook(
+    self: TestCase,
+    fired_hooks: List[int],
+    expected_module: nn.Module,
+    hook_id: int,
+    module: nn.Module,
+    grad_input: Tuple[torch.Tensor],
+) -> None:
+    fired_hooks.append(hook_id)
+    self.assertEqual(id(module), id(expected_module))
+    self.assertEqual(len(grad_input), 1)
+
+
+class TestModuleHooks(TestCase):
+
+    @skipIfTorchDynamo("Dynamo does not yet capture hooks")
+    def test_forward_hooks(self):
+        fired_hooks: List[int] = []
+        model = ToyModel()
+        x = torch.randn(10, 10)
+        hook = partial(forward_hook, self, fired_hooks, model.net1.seq2)
+        model.net1.seq2.register_forward_hook(partial(hook, 0))
+        model.net1.seq2.register_forward_hook(partial(hook, 1), prepend=True)
+        model.net1.seq2.register_forward_hook(partial(hook, 2))
+        model.net1.seq2.register_forward_hook(partial(hook, 3))
+        model.net1.seq2.register_forward_hook(partial(hook, 4), prepend=True)
+        expected = [4, 1, 0, 2, 3]
+
+        self.assertEqual(fired_hooks, [])
+        out = model(x)
+        self.assertEqual(fired_hooks, expected)
+        out.sum().backward()
+        self.assertEqual(fired_hooks, expected)
+        model(x).sum().backward()
+        self.assertEqual(fired_hooks, expected + expected)
+
+    @skipIfTorchDynamo("Dynamo does not yet capture hooks")
+    def test_forward_pre_hooks(self):
+        fired_hooks: List[int] = []
+        model = ToyModel()
+        x = torch.randn(10, 10)
+        hook = partial(forward_pre_hook, self, fired_hooks, model.net2.seq1)
+        model.net2.seq1.register_forward_pre_hook(partial(hook, 0), prepend=True)
+        model.net2.seq1.register_forward_pre_hook(partial(hook, 1))
+        model.net2.seq1.register_forward_pre_hook(partial(hook, 2))
+        model.net2.seq1.register_forward_pre_hook(partial(hook, 3))
+        model.net2.seq1.register_forward_pre_hook(partial(hook, 4), prepend=True)
+        expected = [4, 0, 1, 2, 3]
+
+        self.assertEqual(fired_hooks, [])
+        out = model(x)
+        self.assertEqual(fired_hooks, expected)
+        out.sum().backward()
+        self.assertEqual(fired_hooks, expected)
+        model(x).sum().backward()
+        self.assertEqual(fired_hooks, expected + expected)
+
+    @skipIfTorchDynamo("Dynamo does not yet capture hooks")
+    def test_full_backward_hooks(self):
+        fired_hooks: List[int] = []
+        model = ToyModel()
+        x = torch.randn(10, 10)
+        hook = partial(full_backward_hook, self, fired_hooks, model.net1)
+        model.net1.register_full_backward_hook(partial(hook, 0))
+        model.net1.register_full_backward_hook(partial(hook, 1))
+        model.net1.register_full_backward_hook(partial(hook, 2))
+        model.net1.register_full_backward_hook(partial(hook, 3), prepend=True)
+        model.net1.register_full_backward_hook(partial(hook, 4), prepend=True)
+        expected = [4, 3, 0, 1, 2]
+
+        self.assertEqual(fired_hooks, [])
+        out = model(x)
+        self.assertEqual(fired_hooks, [])
+        out.sum().backward()
+        self.assertEqual(fired_hooks, expected)
+        model(x).sum().backward()
+        self.assertEqual(fired_hooks, expected + expected)
+
+    @skipIfTorchDynamo("Dynamo does not yet capture hooks")
+    def test_full_backward_pre_hooks(self):
+        fired_hooks: List[int] = []
+        model = ToyModel()
+        x = torch.randn(10, 10)
+        hook = partial(full_backward_pre_hook, self, fired_hooks, model.net1)
+        model.net1.register_full_backward_pre_hook(partial(hook, 0), prepend=True)
+        model.net1.register_full_backward_pre_hook(partial(hook, 1), prepend=True)
+        model.net1.register_full_backward_pre_hook(partial(hook, 2))
+        model.net1.register_full_backward_pre_hook(partial(hook, 3))
+        model.net1.register_full_backward_pre_hook(partial(hook, 4))
+        expected = [1, 0, 2, 3, 4]
+
+        self.assertEqual(fired_hooks, [])
+        out = model(x)
+        self.assertEqual(fired_hooks, [])
+        out.sum().backward()
+        self.assertEqual(fired_hooks, expected)
+        model(x).sum().backward()
+        self.assertEqual(fired_hooks, expected + expected)
+
+    @skipIfTorchDynamo("Dynamo does not yet capture hooks")
+    def test_mixed_hooks(self):
+        fired_hooks: List[int] = []
+        model = ToyModel()
+        x = torch.randn(10, 10)
+        model.register_forward_pre_hook(partial(forward_pre_hook, self, fired_hooks, model, 0))
+        model.register_forward_hook(partial(forward_hook, self, fired_hooks, model, 1))
+        model.register_full_backward_pre_hook(partial(full_backward_pre_hook, self, fired_hooks, model, 2))
+        model.register_full_backward_hook(partial(full_backward_hook, self, fired_hooks, model, 3))
+
+        self.assertEqual(fired_hooks, [])
+        out = model(x)
+        self.assertEqual(fired_hooks, [0, 1])
+        out.sum().backward()
+        self.assertEqual(fired_hooks, [0, 1, 2, 3])
+        model(x).sum().backward()
+        self.assertEqual(fired_hooks, [0, 1, 2, 3, 0, 1, 2, 3])
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py
index 4d1f2fe707769..b7e81ad9d3e64 100644
--- a/torch/distributed/nn/api/remote_module.py
+++ b/torch/distributed/nn/api/remote_module.py
@@ -361,10 +361,18 @@ def register_backward_hook(  # type: ignore[return]
     ) -> RemovableHandle:
         _raise_not_supported(self.register_backward_hook.__name__)
 
-    def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandle:  # type: ignore[return]
+    def register_forward_pre_hook(  # type: ignore[return]
+        self,
+        hook: Callable[..., None],
+        prepend: bool = False,
+    ) -> RemovableHandle:
         _raise_not_supported(self.register_forward_pre_hook.__name__)
 
-    def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle:  # type: ignore[return]
+    def register_forward_hook(  # type: ignore[return]
+        self,
+        hook: Callable[..., None],
+        prepend: bool = False,
+    ) -> RemovableHandle:
         _raise_not_supported(self.register_forward_hook.__name__)
 
     def state_dict(self, *args, **kwargs):
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index 4f4a850f00859..1ce6cc0742ab8 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -1118,7 +1118,9 @@ def convert(t):
         return self._apply(convert)
 
     def register_full_backward_pre_hook(
-        self, hook: Callable[['Module', _grad_t], Union[None, _grad_t]]
+        self,
+        hook: Callable[["Module", _grad_t], Union[None, _grad_t]],
+        prepend: bool = False,
     ) -> RemovableHandle:
         r"""Registers a backward pre-hook on the module.
 
@@ -1141,6 +1143,17 @@ def register_full_backward_pre_hook(
             Modifying inputs inplace is not allowed when using backward hooks and
             will raise an error.
 
+        Args:
+            hook (Callable): The user-defined hook to be registered.
+            prepend (bool): If true, the provided ``hook`` will be fired before
+                all existing ``backward_pre`` hooks on this
+                :class:`torch.nn.modules.Module`. Otherwise, the provided
+                ``hook`` will be fired after all existing ``backward_pre`` hooks
+                on this :class:`torch.nn.modules.Module`. Note that global
+                ``backward_pre`` hooks registered with
+                :func:`register_module_full_backward_pre_hook` will fire before
+                all hooks registered by this method.
+
         Returns:
             :class:`torch.utils.hooks.RemovableHandle`:
                 a handle that can be used to remove the added hook by calling
@@ -1149,6 +1162,8 @@ def register_full_backward_pre_hook(
         """
         handle = hooks.RemovableHandle(self._backward_pre_hooks)
         self._backward_pre_hooks[handle.id] = hook
+        if prepend:
+            self._backward_pre_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
         return handle
 
     def register_backward_hook(
@@ -1176,7 +1191,9 @@ def register_backward_hook(
         return handle
 
     def register_full_backward_hook(
-        self, hook: Callable[['Module', _grad_t, _grad_t], Union[None, _grad_t]]
+        self,
+        hook: Callable[["Module", _grad_t, _grad_t], Union[None, _grad_t]],
+        prepend: bool = False,
     ) -> RemovableHandle:
         r"""Registers a backward hook on the module.
 
@@ -1202,6 +1219,17 @@ def register_full_backward_hook(
             Modifying inputs or outputs inplace is not allowed when using backward hooks and
             will raise an error.
 
+        Args:
+            hook (Callable): The user-defined hook to be registered.
+            prepend (bool): If true, the provided ``hook`` will be fired before
+                all existing ``backward`` hooks on this
+                :class:`torch.nn.modules.Module`. Otherwise, the provided
+                ``hook`` will be fired after all existing ``backward`` hooks on
+                this :class:`torch.nn.modules.Module`. Note that global
+                ``backward`` hooks registered with
+                :func:`register_module_full_backward_hook` will fire before
+                all hooks registered by this method.
+
         Returns:
             :class:`torch.utils.hooks.RemovableHandle`:
                 a handle that can be used to remove the added hook by calling
@@ -1216,6 +1244,8 @@ def register_full_backward_hook(
 
         handle = hooks.RemovableHandle(self._backward_hooks)
         self._backward_hooks[handle.id] = hook
+        if prepend:
+            self._backward_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
         return handle
 
     def _get_backward_hooks(self):
@@ -1287,7 +1317,9 @@ def _maybe_warn_non_full_backward_hook(self, inputs, result, grad_fn):
                               "some grad_input. Please use register_full_backward_hook to get the documented "
                               "behavior.")
 
-    def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandle:
+    def register_forward_pre_hook(
+        self, hook: Callable[..., None], prepend: bool = False
+    ) -> RemovableHandle:
         r"""Registers a forward pre-hook on the module.
 
         The hook will be called every time before :func:`forward` is invoked.
@@ -1301,6 +1333,17 @@ def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandl
         single modified value in the hook. We will wrap the value into a tuple
         if a single value is returned(unless that value is already a tuple).
 
+        Args:
+            hook (Callable): The user defined hook to be registered.
+            prepend (bool): If true, the provided ``hook`` will be fired before
+                all existing ``forward_pre`` hooks on this
+                :class:`torch.nn.modules.Module`. Otherwise, the provided
+                ``hook`` will be fired after all existing ``forward_pre`` hooks
+                on this :class:`torch.nn.modules.Module`. Note that global
+                ``forward_pre`` hooks registered with
+                :func:`register_module_forward_pre_hook` will fire before all
+                hooks registered by this method.
+
         Returns:
             :class:`torch.utils.hooks.RemovableHandle`:
                 a handle that can be used to remove the added hook by calling
@@ -1308,9 +1351,13 @@ def register_forward_pre_hook(self, hook: Callable[..., None]) -> RemovableHandl
         """
         handle = hooks.RemovableHandle(self._forward_pre_hooks)
         self._forward_pre_hooks[handle.id] = hook
+        if prepend:
+            self._forward_pre_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
         return handle
 
-    def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle:
+    def register_forward_hook(
+        self, hook: Callable[..., None], prepend: bool = False
+    ) -> RemovableHandle:
         r"""Registers a forward hook on the module.
 
         The hook will be called every time after :func:`forward` has computed an output.
@@ -1324,6 +1371,17 @@ def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle:
         it will not have effect on forward since this is called after
         :func:`forward` is called.
 
+        Args:
+            hook (Callable): The user defined hook to be registered.
+            prepend (bool): If true, the provided ``hook`` will be fired before
+                all existing ``forward`` hooks on this
+                :class:`torch.nn.modules.Module`. Otherwise, the provided
+                ``hook`` will be fired after all existing ``forward`` hooks on
+                this :class:`torch.nn.modules.Module`. Note that global
+                ``forward`` hooks registered with
+                :func:`register_module_forward_hook` will fire before all hooks
+                registered by this method.
+
         Returns:
             :class:`torch.utils.hooks.RemovableHandle`:
                 a handle that can be used to remove the added hook by calling
@@ -1331,6 +1389,8 @@ def register_forward_hook(self, hook: Callable[..., None]) -> RemovableHandle:
         """
         handle = hooks.RemovableHandle(self._forward_hooks)
         self._forward_hooks[handle.id] = hook
+        if prepend:
+            self._forward_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
         return handle
 
     def _slow_forward(self, *input, **kwargs):

From eac8ccae771f532852e32d9905a3927c230d18cd Mon Sep 17 00:00:00 2001
From: min-jean-cho <min.jean.cho@intel.com>
Date: Tue, 25 Oct 2022 19:24:35 +0000
Subject: [PATCH 0133/1922] build: support DNNL_GRAPH_CPU_RUNTIME=TBB  (#87512)

Force set cmake `DNNL_GRAPH_CPU_RUNTIME` as `MKLDNN_CPU_RUNTIME` to overwrite [`set(DNNL_GRAPH_CPU_RUNTIME "OMP")`](https://github.com/oneapi-src/oneDNN/blob/d19d0f795c60695bd32f894c6f01771b2dfbe24d/cmake/options.cmake#L65-L67), enabling user-specified `MKLDNN_CPU_RUNTIME` values (`OMP` (default), `TBB`) for `DNNL_GRAPH_CPU_RUNTIME`.

Fixes https://github.com/pytorch/pytorch/issues/87511
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87512
Approved by: https://github.com/jgong5, https://github.com/ashokei, https://github.com/malfet
---
 cmake/Modules/FindMKLDNN.cmake | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmake/Modules/FindMKLDNN.cmake b/cmake/Modules/FindMKLDNN.cmake
index e2f427be67c89..30ac5401ddf32 100644
--- a/cmake/Modules/FindMKLDNN.cmake
+++ b/cmake/Modules/FindMKLDNN.cmake
@@ -76,6 +76,8 @@ IF(NOT MKLDNN_FOUND)
   SET(DNNL_BUILD_EXAMPLES FALSE CACHE BOOL "" FORCE)
   SET(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE)
   SET(DNNL_ENABLE_PRIMITIVE_CACHE TRUE CACHE BOOL "" FORCE)
+  SET(DNNL_GRAPH_CPU_RUNTIME ${MKLDNN_CPU_RUNTIME} CACHE STRING "" FORCE)
+
   IF(BUILD_ONEDNN_GRAPH)
     SET(DNNL_GRAPH_LIBRARY_TYPE STATIC CACHE STRING "" FORCE)
   ENDIF(BUILD_ONEDNN_GRAPH)

From f8e4d1e54d221dd439836a462e3620bfaea07e70 Mon Sep 17 00:00:00 2001
From: Xu Zhao <xzhao9@fb.com>
Date: Tue, 25 Oct 2022 19:38:41 +0000
Subject: [PATCH 0134/1922] Use setup_instance script to enable conda and load
 cuda libraries (#87296)

Fixes the broken torchbench CI after the machine image update.
RUN_TORCHBENCH: nvfuser

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87296
Approved by: https://github.com/davidberard98
---
 .github/workflows/run_torchbench.yml | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml
index 2d1013abafc02..b6c870fa7839d 100644
--- a/.github/workflows/run_torchbench.yml
+++ b/.github/workflows/run_torchbench.yml
@@ -1,4 +1,4 @@
-name: TorchBench CI (pytorch-linux-py3.7-cu102)
+name: TorchBench CI (pytorch-linux-py3.8-cu116)
 on:
   pull_request:
 
@@ -6,6 +6,7 @@ env:
   PYTHON_VERSION: "3.8"
   # must be consistent with https://github.com/pytorch/benchmark/blob/main/requirements.txt#L19
   NUMPY_VERSION: "1.21.2"
+  SETUP_SCRIPT: "/data/nvme/bin/setup_instance.sh"
   PR_NUM: ${{ github.event.number }}
   PR_BODY: ${{ github.event.pull_request.body }}
   PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
@@ -35,18 +36,19 @@ jobs:
       - name: Create conda environment and install deps
         run: |
           conda create -y -n pr-ci python="${PYTHON_VERSION}"
-          # shellcheck disable=SC1091
-          . "${HOME}"/anaconda3/etc/profile.d/conda.sh
+          # shellcheck source=/dev/null
+          . "${SETUP_SCRIPT}"
           conda activate pr-ci
           # pin cmake version to 3.22 since 3.23 breaks pytorch build
           # see details at: https://github.com/pytorch/pytorch/issues/74985
           conda install -y numpy="${NUMPY_VERSION}" requests ninja pyyaml mkl mkl-include \
                            setuptools cmake=3.22 cffi typing_extensions boto3 \
                            future six dataclasses pillow pytest tabulate gitpython git-lfs tqdm psutil
+          pip install --pre torch torchvision torchtext -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html
       - name: Setup TorchBench branch
         run: |
-          # shellcheck disable=SC1091
-          . "${HOME}"/anaconda3/etc/profile.d/conda.sh
+          # shellcheck source=/dev/null
+          . "${SETUP_SCRIPT}"
           conda activate pr-ci
           PR_BODY_FILE=/tmp/pr-body.txt
           echo "$PR_BODY" > ${PR_BODY_FILE}
@@ -69,8 +71,8 @@ jobs:
           popd
           PR_BODY_FILE=/tmp/pr-body.txt
           echo "$PR_BODY" > ${PR_BODY_FILE}
-          # shellcheck disable=SC1091
-          . "${HOME}"/anaconda3/etc/profile.d/conda.sh
+          # shellcheck source=/dev/null
+          . "${SETUP_SCRIPT}"
           conda activate pr-ci
           python3 pytorch/.github/scripts/run_torchbench.py \
                   --pr-body "$PR_BODY_FILE" \
@@ -82,7 +84,8 @@ jobs:
                   --pr-head-sha "$PR_HEAD_SHA"
       - name: Upload result to S3
         run: |
-          . "${HOME}"/anaconda3/etc/profile.d/conda.sh
+          # shellcheck source=/dev/null
+          . "${SETUP_SCRIPT}"
           conda activate pr-ci
           python3 pytorch/.github/scripts/run_torchbench.py \
                   upload-s3 \

From 7301114839d54f112e797eb6c8082b73a348da24 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Tue, 25 Oct 2022 16:31:45 +0000
Subject: [PATCH 0135/1922] [ONNX] Fix pad Circular Mode (#86984)

In https://github.com/pytorch/pytorch/pull/73433, a ONNX test case is missed, and the result is incorrect when it is converted to ONNX.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86984
Approved by: https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 21 +++++++++++++++++
 torch/onnx/symbolic_opset11.py             |  8 ++++++-
 torch/onnx/symbolic_opset9.py              | 27 ++++++++++++++--------
 3 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index bc70011b78871..4577dafdad56c 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -7437,6 +7437,27 @@ def forward(self, x, pad: List[int]):
         x = torch.randn(2, 2, 4, 4)
         self.run_test(Pad(), (x, pad))
 
+    @skipIfUnsupportedMinOpsetVersion(11)
+    def test_pad_circular(self):
+        class PadModel(torch.nn.Module):
+            def forward(self, x):
+                out = torch.nn.functional.pad(x, (1, 2, 1, 2), mode="circular")
+                return out
+
+        x = torch.randn(2, 3, 3, 4)
+        self.run_test(PadModel(), (x))
+
+    @skipIfUnsupportedMinOpsetVersion(11)
+    def test_pad_circular_negative(self):
+        # Test for different pad integer types
+        class PadModel(torch.nn.Module):
+            def forward(self, x):
+                out = torch.nn.functional.pad(x, (-1, -2), mode="circular")
+                return out
+
+        x = torch.randn(2, 3, 6)
+        self.run_test(PadModel(), (x))
+
     @skipIfUnsupportedMaxOpsetVersion(10)
     @skipScriptTest()  # TODO: the logic in symbolic_opset9 doesn't handle script
     def test_unsupported_pad(self):
diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py
index 2d3993c417014..c845d6dcc2e4d 100644
--- a/torch/onnx/symbolic_opset11.py
+++ b/torch/onnx/symbolic_opset11.py
@@ -821,7 +821,13 @@ def replication_pad(g: jit_utils.GraphContext, input, padding):
 
 @_onnx_symbolic("aten::pad")
 @_beartype.beartype
-def pad(g: jit_utils.GraphContext, input, pad, mode, value):
+def pad(
+    g: jit_utils.GraphContext,
+    input: _C.Value,
+    pad: _C.Value,
+    mode: _C.Value,
+    value: _C.Value,
+):
     mode = symbolic_helper._parse_arg(mode, "s")
     if mode == "replicate":
         return replication_pad(g, input, pad)
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 0daabb5e333d9..c071438169da3 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -4,6 +4,7 @@
 release on 01/23/19
 """
 
+import builtins
 import functools
 import math
 import sys
@@ -1851,32 +1852,34 @@ def constant_pad_nd(g: jit_utils.GraphContext, input, padding, value):
     )
 
 
-@_onnx_symbolic("aten::_pad_circular")
 @_beartype.beartype
-def _pad_circular(g: jit_utils.GraphContext, input, pad):
+def _pad_circular(g: jit_utils.GraphContext, input: _C.Value, pad: _C.Value):
     padding = _convert_padding_node(pad)
     assert len(padding) % 2 == 0
     ndim = len(padding) // 2
 
     cur = input
     for idx in range(ndim):
-        pad_l = padding[-(2 * idx + 1)]
-        pad_r = padding[-(2 * idx + 2)]
-
+        pad_r = padding[-(2 * idx + 1)]
+        pad_l = padding[-(2 * idx + 2)]
+        # get size for targeting the last idx, as Slice don't take start=[-1], end=[-1]
+        size = symbolic_helper._get_tensor_sizes(input)
         tensors = []
         if pad_l > 0:
             left = symbolic_helper._slice_helper(
-                g, cur, axes=[2 + idx], starts=[-(pad_l + 1)], ends=[-1]
+                g, cur, axes=[2 + idx], starts=[-(pad_l)], ends=[size[2 + idx]]
             )
             tensors.append(left)
 
         if pad_l < 0 or pad_r < 0:
+            start = builtins.max(0, -pad_l)
+            end = -(builtins.max(0, -pad_r))
             middle = symbolic_helper._slice_helper(
                 g,
                 cur,
                 axes=[2 + idx],
-                starts=[max(0, -pad_l)],
-                ends=[-(1 + max(0, -pad_r))],
+                starts=[start],
+                ends=[end],
             )
             tensors.append(middle)
         else:
@@ -1921,7 +1924,13 @@ def replication_pad(g: jit_utils.GraphContext, input, padding):
 
 @_onnx_symbolic("aten::pad")
 @_beartype.beartype
-def pad(g: jit_utils.GraphContext, input, pad, mode, value):
+def pad(
+    g: jit_utils.GraphContext,
+    input: _C.Value,
+    pad: _C.Value,
+    mode: _C.Value,
+    value: _C.Value,
+):
     mode = symbolic_helper._parse_arg(mode, "s")
     if mode == "replicate":
         return replication_pad(g, input, pad)

From 2caad3ecbfc831826241eadfdd8c9b0b97e7eb4e Mon Sep 17 00:00:00 2001
From: "S.Cao-office" <scao.math@gmail.com>
Date: Tue, 25 Oct 2022 19:51:42 +0000
Subject: [PATCH 0136/1922] Fixed minor typos in torch.flip and torch.rot90
 (#87724)

Fixes #87721

@malfet

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87724
Approved by: https://github.com/malfet
---
 torch/_torch_docs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index d84ed259b6d38..c7b8d796a497d 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -11155,7 +11155,7 @@ def merge_dicts(*dicts):
     r"""
 flip(input, dims) -> Tensor
 
-Reverse the order of a n-D tensor along given axis in dims.
+Reverse the order of an n-D tensor along given axis in dims.
 
 .. note::
     `torch.flip` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flip`,
@@ -11312,7 +11312,7 @@ def merge_dicts(*dicts):
     r"""
 rot90(input, k=1, dims=[0,1]) -> Tensor
 
-Rotate a n-D tensor by 90 degrees in the plane specified by dims axis.
+Rotate an n-D tensor by 90 degrees in the plane specified by dims axis.
 Rotation direction is from the first towards the second axis if k > 0, and from the second towards the first for k < 0.
 
 Args:

From f5004129a2672c8a617cc2aa7619d0376108c244 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 25 Oct 2022 19:58:23 +0000
Subject: [PATCH 0137/1922] [inductor] Revert channels-last support (#87588)

We witnessed slow compilation times last week. Earlier, I thought it was due to parallel compilation. But, after git bisect, I found the source of extra time to be my PR - https://github.com/pytorch/pytorch/pull/87049

For 1x1 kernel, the current striding check incorrectly declares channels-first 1x1 convs to channels last. I am not sure why it caused so much compilation time jump.  Or why it did not fail? There was no change in performance speedup. cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu to identify what could be source of this compilation time increase, so that we can manually check that part of the stack.

With this `res2next50` compilation time went back to 96 seconds (which was raised to 900 seconds with my earlier PR) for single thread. And parallel-compilation brings it down to ~30 seconds.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87588
Approved by: https://github.com/soumith, https://github.com/jansel, https://github.com/ngimel
---
 benchmarks/dynamo/common.py         | 17 +++++++++++------
 test/inductor/test_torchinductor.py |  1 +
 torch/_inductor/ir.py               | 14 +++++++++++---
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index b1f8bbd993f3b..86e6bb62842f6 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -98,6 +98,8 @@ def set_model_name(name):
     "cait_m36_384",  # Accuracy
     "ghostnet_100",  # Accuracy
     "swin_base_patch4_window7_224",  # Accuracy
+    # Trying to get CI working - https://github.com/pytorch/pytorch/pull/87588
+    "visformer_small",  # fails accuracy on CI but passes locally
 ]
 
 CI_SKIP_INDUCTOR_TRAINING = [
@@ -809,12 +811,15 @@ def setup_amp(self):
             self.autocast = torch.cuda.amp.autocast
 
     def init_optimizer(self, device, params):
-        param_list = list(params)
-        if device == "cuda" and len(param_list) != 0:
-            # capturable is only supported on cuda at the moment
-            self.optimizer = torch.optim.Adam(param_list, capturable=True)
-        else:
-            self.optimizer = None
+        self.optimizer = None
+        # TODO - Currently, optimizers are used incorrectly. Fix optimizers with
+        # https://github.com/pytorch/pytorch/pull/87492
+        # param_list = list(params)
+        # if device == "cuda" and len(param_list) != 0:
+        #     # capturable is only supported on cuda at the moment
+        #     self.optimizer = torch.optim.Adam(param_list, capturable=True)
+        # else:
+        #     self.optimizer = None
 
     @property
     def args(self):
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index bec1ea197c078..c0139b3fcdf86 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1475,6 +1475,7 @@ def fn(x, w, b):
             check_lowp=False,
         )
 
+    @unittest.skipIf(HAS_CUDA, "only support cpu channels_last")
     def test_conv2d_channels_last(self):
         m = torch.nn.Sequential(
             torch.nn.Conv2d(3, 3, 1, 1),
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 889e30bb54449..867e26e56c5ef 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3137,9 +3137,17 @@ def create(
             ):
                 valid_cudnn = True
 
-            valid_device = x.get_device().type == "cpu" or (
-                x.get_device().type == "cuda" and valid_cudnn
-            )
+            # TODO - We cannot use strides to identify if a tensor is
+            # channels-last for 1x1 kernels. Incorrectly identifying the
+            # channels last configuration leads to a dramatic increase in
+            # compilation time. Unfortuantely, this breaks the channels last
+            # support.
+            # valid_device = x.get_device().type == "cpu" or (
+            #     x.get_device().type == "cuda" and valid_cudnn
+            # )
+
+            valid_device = x.get_device().type == "cpu"
+
             if (
                 valid_device
                 and len(x.get_size()) == 4

From a6ab0090596f7e05d9f57e898a87d3849ee4fc65 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Tue, 25 Oct 2022 04:46:42 +0000
Subject: [PATCH 0138/1922] Fix _refs for aten.zeros/ones/empty/randn (#87569)

refs for aten.zeros/ones/empty/randn doesn't support .names overload.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87569
Approved by: https://github.com/ngimel
---
 torch/_meta_registrations.py | 2 --
 torch/_refs/__init__.py      | 8 ++++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index cb961ff898790..873f942da42ab 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1658,8 +1658,6 @@ def activate_meta():
             "aten::clone",  # causing infinite recursion
             "aten::_to_copy",  # causing infinite recursion, test_serialization.py -k test_tensor_subclass_getstate_overwrite  # noqa: B950
             "aten::randn",  # pin_memory parameter is not supported!, test_proxy_tensor.py -k test_make_fx_symbolic_exhaustive_randn_cpu_float32  # noqa: B950
-            "aten::zeros.names",  # TypeError: zeros() got an unexpected keyword argument 'names', inductor/test_torchinductor.py -k test_zeros_cpu  # noqa: B950
-            "aten::empty.names",  # TypeError: empty() got an unexpected keyword argument 'names', inductor/test_torchinductor.py -k test_zeros_cpu  # noqa: B950
             "aten::add.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
             "aten::sub.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
             "aten::mul.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index ccb44c6367a50..2e91ceeeb679d 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -3729,7 +3729,7 @@ def ravel(a: TensorLikeType) -> TensorLikeType:
     return reshape(a, (-1,))
 
 
-@register_decomposition(torch.ops.aten.empty)
+@register_decomposition(torch.ops.aten.empty.memory_format)
 @out_wrapper()
 def empty(
     *shape,
@@ -3822,7 +3822,7 @@ def new_empty_strided(
     )
 
 
-@register_decomposition(torch.ops.aten.zeros)
+@register_decomposition(torch.ops.aten.zeros.default)
 @out_wrapper()
 def zeros(
     *size,
@@ -3874,7 +3874,7 @@ def new_zeros(
     )
 
 
-@register_decomposition(torch.ops.aten.ones)
+@register_decomposition(torch.ops.aten.ones.default)
 @out_wrapper()
 def ones(
     *size,
@@ -4409,7 +4409,7 @@ def full_like(
 ones_like = partial(full_like, fill_value=True)
 
 # TODO: add pin_memory support
-@register_decomposition(torch.ops.aten.randn)
+@register_decomposition(torch.ops.aten.randn.default)
 @out_wrapper()
 def randn(
     *shape,

From 0df79889c50bfaa60eb71b4a4a04353855333da3 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 25 Oct 2022 09:58:57 -0700
Subject: [PATCH 0139/1922] [ao] Adding FAQ to docs (#87322)

Summary: migrated from: https://discuss.pytorch.org/t/quantization-frequently-asked-questions/161251

Test Plan: circle CI tests

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87322
Approved by: https://github.com/z-a-f
---
 docs/source/quantization.rst | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst
index e00720570a1a3..55fa6b0c604d2 100644
--- a/docs/source/quantization.rst
+++ b/docs/source/quantization.rst
@@ -998,6 +998,25 @@ if ``dtype`` is ``torch.qint8``, make sure to set a custom ``quant_min`` to be `
 you call the `torch.ao.quantization.get_default_qconfig(backend)` or `torch.ao.quantization.get_default_qat_qconfig(backend)` function to get the default ``qconfig`` for
 ``fbgemm`` or ``qnnpack`` backend
 
+Frequently Asked Questions
+--------------------------
+
+1. How can I do quantized inference on GPU?:
+
+   We don't have official GPU support yet, but this is an area of active development, you can find more information
+   `here <https://github.com/pytorch/pytorch/issues/87395>`_
+
+2. Where can I get ONNX support for my quantized model?:
+
+   You can open an issue in `GitHub - onnx/onnx <https://github.com/onnx/onnx>`_  when you encounter problems with ONNX,
+   or reach out to people in this list: `PyTorch Governance | Maintainers | ONNX exporter <https://pytorch.org/docs/stable/community/persons_of_interest.html#onnx-exporter>`_
+
+3. How can I use quantization with LSTM's?:
+
+   LSTM is supported through our custom module api in both eager mode and fx graph mode quantization. Examples can be found at
+   Eager Mode: `pytorch/test_quantized_op.py TestQuantizedOps.test_custom_module_lstm <https://github.com/pytorch/pytorch/blob/9b88dcf248e717ca6c3f8c5e11f600825547a561/test/quantization/core/test_quantized_op.py#L2782>`_
+   FX Graph Mode: `pytorch/test_quantize_fx.py TestQuantizeFx.test_static_lstm <https://github.com/pytorch/pytorch/blob/9b88dcf248e717ca6c3f8c5e11f600825547a561/test/quantization/fx/test_quantize_fx.py#L4116>`_
+
 Common Errors
 ---------------------------------------
 

From 9cd7b7cec2afa53370474d74eb8fcb5e66b849b6 Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Tue, 25 Oct 2022 21:15:40 +0000
Subject: [PATCH 0140/1922] [Dynamo] Symbolic shape guards (#87570)

**Introduces symbolic shape guards into dynamo.**

In this PR, we take the existing fake tensor infra and plumbing in dynamo and we start passing a shape_env around. This shape_env does not get plumbed down to middle layers / backend yet - it only collects expressions from frontend invocations at the moment. We then translate these expressions into guards at the point where we take other guards installed throughout dynamo - and add them to check_fn.

Part 1 of https://docs.google.com/document/d/1QJ-M4zfMkD-fjHIqW089RptjLl9EgozZGCceUbvmgfY/edit#

cc @jansel @lezcano @fdrocha @mlazos @soumith @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87570
Approved by: https://github.com/ezyang
---
 test/dynamo/test_dynamic_shapes.py | 153 +++++++++++++++++++++++++-
 test/dynamo/test_functions.py      |   1 +
 test/dynamo/test_repros.py         |   5 +-
 test/functorch/test_aotdispatch.py |   1 +
 test/test_proxy_tensor.py          |   5 +
 torch/_dynamo/convert_frame.py     |   2 +-
 torch/_dynamo/guards.py            | 167 ++++++++++++++++++++++++++++-
 torch/_dynamo/output_graph.py      |   5 +
 torch/_dynamo/symbolic_convert.py  |   3 +-
 torch/_dynamo/utils.py             |  46 +++++++-
 torch/_dynamo/variables/builtin.py |  18 +++-
 torch/_dynamo/variables/tensor.py  |  22 ++--
 torch/_dynamo/variables/torch.py   |  18 ++++
 torch/_subclasses/fake_tensor.py   |   2 +-
 torch/_subclasses/meta_utils.py    |   2 +-
 15 files changed, 427 insertions(+), 23 deletions(-)

diff --git a/test/dynamo/test_dynamic_shapes.py b/test/dynamo/test_dynamic_shapes.py
index a2a94fce1e559..a32825d03aeaa 100644
--- a/test/dynamo/test_dynamic_shapes.py
+++ b/test/dynamo/test_dynamic_shapes.py
@@ -3,14 +3,26 @@
 from torch._dynamo.testing import make_test_cls_with_patches
 
 try:
-    from . import test_functions, test_misc, test_modules, test_repros, test_unspec
+    from . import (
+        test_export,
+        test_functions,
+        test_misc,
+        test_modules,
+        test_repros,
+        test_subgraphs,
+        test_unspec,
+    )
 except ImportError:
+    import test_export
     import test_functions
     import test_misc
     import test_modules
     import test_repros
+    import test_subgraphs
     import test_unspec
 
+import unittest
+
 
 def make_dynamic_cls(cls):
     return make_test_cls_with_patches(
@@ -23,6 +35,145 @@ def make_dynamic_cls(cls):
 DynamicShapesReproTests = make_dynamic_cls(test_repros.ReproTests)
 DynamicShapesNNModuleTests = make_dynamic_cls(test_modules.NNModuleTests)
 DynamicShapesUnspecTests = make_dynamic_cls(test_unspec.UnspecTests)
+DynamicShapesExportTests = make_dynamic_cls(test_export.ExportTests)
+DynamicShapesSubGraphTests = make_dynamic_cls(test_subgraphs.SubGraphTests)
+
+
+# DynamicShapesFunctionTests
+unittest.expectedFailure(
+    DynamicShapesFunctionTests.test_len_tensor_dynamic_shapes
+    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
+)
+
+unittest.expectedFailure(
+    DynamicShapesFunctionTests.test_tensor_len_dynamic_shapes
+    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
+)
+
+
+# DynamicShapesReproTests
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_reformer_eval_dynamic_shapes
+    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_reformer_train_dynamic_shapes
+    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_issue175_dynamic_shapes
+    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_do_paste_mask_dynamic_shapes
+    # aten.min.dim - couldn't find symbolic meta function/decomposition
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_convert_boxes_to_pooler_format_dynamic_shapes
+    # Could not infer dtype of torch._C.SymIntNode
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_ellipsis_dynamic_shapes
+    # Cannot call sizes() on tensor with symbolic sizes/strides
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_hf_t5_forward_dynamic_shapes
+    # Cannot call sizes() on tensor with symbolic sizes/strides
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_reformer_sorting_dynamic_shapes
+    # Unable to cast Python instance to C++ type
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_boxes_len_dynamic_shapes
+    # Unable to cast Python instance to C++ type
+)
+
+unittest.expectedFailure(
+    DynamicShapesReproTests.test_guard_fail_tensor_bool_dynamic_shapes
+    # RuntimeError: aten.allclose.default - couldn't find symbolic meta function/decomposition
+)
+
+# DynamicShapesMiscTests
+unittest.expectedFailure(
+    DynamicShapesMiscTests.test_unsupported_fake_tensor_dynamic_shapes
+    # aten.quantize_per_tensor.default - couldn't find symbolic meta function/decomposition
+)
+unittest.expectedFailure(
+    DynamicShapesMiscTests.test_module_deepcopy_dynamic_shapes
+    # aten.squeeze_.dim - couldn't find symbolic meta function/decompositio
+)
+
+# DynamicShapesUnspecTests
+unittest.expectedFailure(
+    DynamicShapesUnspecTests.test_unspec_float_precision_dynamic_shapes
+    # float() argument must be a string or a real number, not 'torch._C.SymIntNode'
+)
+
+
+# DynamicShapesNNModuleTests
+unittest.expectedFailure(
+    DynamicShapesNNModuleTests.test_unsupportedmethod_dynamic_shapes
+    # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
+)
+
+unittest.expectedFailure(
+    DynamicShapesNNModuleTests.test_unsupportedmodule_dynamic_shapes
+    # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
+)
+
+unittest.expectedFailure(
+    DynamicShapesNNModuleTests.test_self_mutating1_dynamic_shapes
+    # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
+)
+
+unittest.expectedFailure(
+    DynamicShapesNNModuleTests.test_call_fn_with_non_const_inputs_safe_dynamic_shapes
+    # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
+)
+
+
+# DynamicShapesExportTests
+unittest.expectedFailure(
+    DynamicShapesExportTests.test_export_compare_optimize_with_make_fx_dynamic_shapes
+)
+unittest.expectedFailure(
+    DynamicShapesExportTests.test_export_with_constant_list_nonzero_dynamic_shapes
+)
+unittest.expectedFailure(
+    DynamicShapesExportTests.test_export_with_constant_list_nonzero_free_function_dynamic_shapes
+)
+unittest.expectedFailure(
+    DynamicShapesExportTests.test_export_with_constant_tuple_nonzero_dynamic_shapes
+)
+unittest.expectedFailure(
+    DynamicShapesExportTests.test_export_with_stack_trace_dynamic_shapes
+)
+unittest.expectedFailure(
+    DynamicShapesExportTests.test_zeroes_in_new_shape_scalar_out_dynamic_shapes
+)
+unittest.expectedFailure(
+    DynamicShapesExportTests.test_zeroes_in_new_shape_scalar_out_permute_dupe_and_bypass_dynamic_shapes
+)
+unittest.expectedFailure(
+    DynamicShapesExportTests.test_zeroes_in_new_shape_scalar_out_permute_dynamic_shapes
+)
+
+
+# DynamicShapesSubGraphTests
+unittest.expectedFailure(
+    DynamicShapesSubGraphTests.test_enumerate_not_break_graph_dynamic_shapes
+)
+unittest.expectedFailure(DynamicShapesSubGraphTests.test_restore_state_dynamic_shapes)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py
index d18ef7e1173fe..d428a4369fc1e 100644
--- a/test/dynamo/test_functions.py
+++ b/test/dynamo/test_functions.py
@@ -6,6 +6,7 @@
 import itertools
 import operator
 from typing import Any
+from unittest.mock import patch
 
 import torch
 
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index bbb8ba527fc73..66fc19895dd62 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -872,8 +872,9 @@ def test_longformer_chunk(self):
         self.assertTrue(same(opt_fn(input1), correct1))
         self.assertTrue(same(opt_fn(input2), correct2))
 
-        self.assertEqual(cnt.frame_count, ifdyn(1, 2))
-        self.assertEqual(cnt.op_count, ifdyn(19, 4))
+        # Dyn recompiles are due to changes in hidden_state (Should we be guarding on this?)
+        self.assertEqual(cnt.frame_count, ifdyn(4, 2))
+        self.assertEqual(cnt.op_count, ifdyn(76, 4))
 
     def test_hf_t5_forward(self):
         input = torch.randn([1, 2048, 512])
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 57013636eeabf..d406f2eb53047 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1174,6 +1174,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.rrelu', ''),  # aten.rrelu_with_noise.default - couldn't find symbolic meta function...
     xfail('nn.functional.smooth_l1_loss', ''),  # could not find kernel
     xfail('nn.functional.unfold', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('unfold', ''),  # aten.squeeze_copy.dim - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.upsample_bilinear', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.upsample_nearest', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 3c2e818497a48..1e72d5a4bc277 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1288,6 +1288,7 @@ def f(a, b, c, d, e):
     xfail('nn.functional.unfold', ''),  # aten.im2col.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.upsample_bilinear', ''),  # aten.upsample_bilinear2d.vec - couldn't find symbolic meta function/de...
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
+    xfail('nonzero', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
     xfail('norm', 'nuc'),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
     xfail('normal', ''),  # aten.normal.Tensor_Tensor - couldn't find symbolic meta function/decomposition
     xfail('normal', 'number_mean'),  # aten.normal.float_Tensor - couldn't find symbolic meta function/decomposition
@@ -1305,6 +1306,7 @@ def f(a, b, c, d, e):
     xfail('qr', ''),  # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition
     xfail('rad2deg', ''),  # aten.rad2deg.default - couldn't find symbolic meta function/decomposition
     xfail('renorm', ''),  # aten.renorm.default - couldn't find symbolic meta function/decomposition
+    xfail('repeat_interleave', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('reshape_as', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('resize_', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('resize_as_', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
@@ -1354,6 +1356,8 @@ def f(a, b, c, d, e):
     xfail('view_as', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('vsplit', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('unbind', ''),  # aten.unbind.int - couldn't find symbolic meta function/decomposition
+    xfail('unique_consecutive', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
+    xfail('unique', ''),  # aten._unique2.default - couldn't find symbolic meta function/decomposition
 }
 symbolic_tensor_segfaults = {
     skip('nn.functional.batch_norm')  # Segfault??
@@ -1454,6 +1458,7 @@ def f(a, b, c, d, e):
     xfail('true_divide', ''),  # aten.div_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('trunc', ''),  # aten.trunc_.default - couldn't find symbolic meta function/decomposition
     xfail('uniform', ''),  # aten.uniform_.default - couldn't find symbolic meta function/decomposition
+    xfail('unique', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
     xfail('unsqueeze', ''),  # aten.unsqueeze_.default - couldn't find symbolic meta function/decomposition
     xfail('xlogy', ''),  # aten.xlogy_.Tensor - couldn't find symbolic meta function/decomposition
 }
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index 46a23b330a0a4..206cffb7aeeda 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -417,7 +417,7 @@ def transform(instructions, code_options):
 
         assert output.guards is not None
         CleanupManager.instance[out_code] = output.cleanups
-        check_fn = CheckFunctionManager(output.guards, locals, globals)
+        check_fn = CheckFunctionManager(output, output.guards, locals, globals)
 
         guarded_code = GuardedCode(out_code, check_fn.check_fn)
         guard_str = "GUARDS:\n"
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index 1f43ac667e579..9edd6f60560df 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -12,7 +12,10 @@
 
 import numpy as np
 
+import sympy
+
 import torch
+from torch.fx.experimental.symbolic_shapes import FloorDiv
 
 from . import config, convert_frame, mutation_guard
 from .eval_frame import set_guard_error_hook, set_guard_fail_hook
@@ -176,6 +179,7 @@ def __init__(
         # Code is python expression strings generated for each guard
         self.code: List[str] = []
         self.tensor_check_names = []
+        self.tensor_check_ids = {}
         self.tensor_check_examples = []
         self.guarded_code = guarded_code
 
@@ -414,9 +418,13 @@ def TENSOR_MATCH(self, guard: Guard):
             self.ID_MATCH(guard)
         else:
             value = self.get(guard.name)
-            self.tensor_check_names.append(self.arg_ref(guard))
+            tensor_name = self.arg_ref(guard)
+            self.tensor_check_names.append(tensor_name)
             self.tensor_check_examples.append(value)
 
+            # STOP - DO NOT USE id_ref FOR TENSORS - TENSOR INVALIDATION RULES DIFFER
+            self.tensor_check_ids[tensor_name] = id(value)
+
             # Note: Guard code produced for tensor_match is a little different.
             # We accumulate tensor names, then do a single install of `___check_tensors`.
             # See _guards.cpp and TensorGuard for more information.
@@ -469,6 +477,62 @@ class GuardedCode:
     check_fn: Callable
 
 
+from sympy.printing.str import StrPrinter
+
+
+@dataclasses.dataclass
+class TensorReference(object):
+    """
+    TensorReference objects are entirely optional. They are created to give us hints
+    into where the symbolic shape came from.
+
+    ref_id: The id of the tensor
+    kind: A string tracking where in the tensor this value came from ("size","stride", etc)
+    idx: An index in the structure
+
+    NOTE - A symbolic shape coming from tensor at id 12345's shape dim 2, would be
+    TensorReference(ref_id=12345, kind="size", idx=2)
+    """
+
+    ref_id: Optional[int] = None
+    kind: Optional[str] = None
+    idx: Optional[int] = None
+    # Note - this is untyped because of TypeError: '_SpecialForm' object does not support item assignment
+    # But it is a Optional[Union["sympy.Expr", int]]
+    expr: Optional[object] = None  # Populated after association
+
+    def __hash__(self):
+        return hash((self.ref_id, self.kind, self.idx))
+
+
+class DynamoGuardPrinter(StrPrinter):
+    @staticmethod
+    def tensor_ref_as_str(tensor_ref, id_to_name_map):
+        if tensor_ref.kind in ("size", "stride"):
+            return f"{id_to_name_map[tensor_ref.ref_id]}.{tensor_ref.kind}()[{tensor_ref.idx}]"
+        return f"{id_to_name_map[tensor_ref.ref_id]}.{tensor_ref.kind}()"
+
+    def __init__(self, expr_to_tensor_ref, id_to_name_map):
+        super().__init__()
+        self.expr_to_tensor_ref = expr_to_tensor_ref
+        self.id_to_name_map = id_to_name_map
+
+    def _print_Symbol(self, expr) -> str:
+        assert isinstance(expr, sympy.core.symbol.Symbol)
+        if expr == 0:
+            return "0"
+        if expr == 1:
+            return "1"
+        assert expr in self.expr_to_tensor_ref, f"Unknown expression {expr}"
+        refs = self.expr_to_tensor_ref[expr]
+        if len(refs) == 0:
+            return super()._print_Symbol(expr)
+        tensor_ref = next(
+            iter(refs)
+        )  # Any is fine here, because we install equality guards later
+        return DynamoGuardPrinter.tensor_ref_as_str(tensor_ref, self.id_to_name_map)
+
+
 # NB: Naively, you'd expect this to only be a function that produces
 # the callable that consistutes the guard.  However, there is some
 # delicate handling for invalidating this check function when the
@@ -482,6 +546,7 @@ class GuardedCode:
 class CheckFunctionManager:
     def __init__(
         self,
+        output_graph=None,
         guards: Optional[Set[Guard]] = None,
         f_locals: Optional[Dict] = None,
         f_globals: Optional[Dict] = None,
@@ -489,6 +554,7 @@ def __init__(
         self.valid = True
         self._weakrefs = []
         self._seen_ids = set()
+        self.output_graph = output_graph
 
         # Note: right overrides left
         def combine_scopes(left, right):
@@ -511,6 +577,82 @@ def combine_scopes(left, right):
         self.check_fn = self.compile_check_fn(local_builder, global_builder)
         self._seen_ids.clear()
 
+    """
+    This is a complex bit of logic. The outline here is brief. For a line by line breakdown, see
+    the code comments below.
+
+    The role of this function is to take the current state of symbolic shape guards, tensor ids in the
+    CURRENT dynamo frame, and tensor names (dynamo's frame agnostic tensor reference mechanism, see TensorCheck and
+    guards.cpp for more info) - and produce executable python expressions for addition to our guarded code components
+    that make their way into check_fn.
+
+    We DO NOT create guards based on ids. The IDs act as a lookup for the following mapping:
+
+    dynamo: tensor_name <> tensor_id
+    shape_env: tensor_id <> shape_expr
+
+    This allows us to then create a tensor_name <> shape_expr association for the current frames guards.
+    """
+
+    def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids):
+        # Pre join output
+        finished_expressions = []
+
+        # A mapping of tensor_ids to tensor names
+        id_to_name_map = {}
+
+        # We should not have a shape env, or guards if we are not in config.dynamic shapes
+        # But check it anyway.
+        if not config.dynamic_shapes:
+            return None
+
+        expr_to_tensor_ref = {}
+        guard_printer = DynamoGuardPrinter(expr_to_tensor_ref, id_to_name_map)
+
+        # tensor_check_names is the primary tensor association mechanism in dynamo.
+        # All other guards installations are driven off of it, so these ones will too.
+        for name in tensor_check_names:
+            tensor_id = tensor_check_ids[name]
+            id_to_name_map[tensor_id] = name
+
+            if tensor_id in self.output_graph.tensor_id_to_sym_shape_ref:
+                # If we made it here, this tensor_id is relevant to dynamo guard installation
+                # AND was found in the shape_env
+                tensor_ref_set = self.output_graph.tensor_id_to_sym_shape_ref[tensor_id]
+                for tensor_ref in tensor_ref_set:
+                    obj_expr = tensor_ref.expr
+                    if obj_expr not in expr_to_tensor_ref:
+                        expr_to_tensor_ref[obj_expr] = {}
+                    expr_to_tensor_ref[obj_expr][tensor_ref] = ""
+            finished_expressions.append(f"isinstance({name}, torch.Tensor)")
+
+        guard_expression = self.output_graph.shape_env.get_guard_expr()
+        expr_as_str = guard_printer.doprint(guard_expression)
+        # We may get into a state where symbolic shape keys (all should be found in replacements)
+        # Have not been removed from the expression. This is a serious enough error state that we need to assert.
+        for key in self.output_graph.shape_env.var_to_val.keys():
+            assert str(key) not in expr_as_str, f"Unknown shape symbol {key}. "
+        finished_expressions.append(expr_as_str)
+
+        for expr in expr_to_tensor_ref.keys():
+            tensor_refs = expr_to_tensor_ref[expr].keys()
+            equality_candidates = [
+                DynamoGuardPrinter.tensor_ref_as_str(x, id_to_name_map)
+                for x in tensor_refs
+            ]
+
+            if len(equality_candidates) > 1:
+                equality_expr = " == ".join(equality_candidates)
+                # breakpoint()
+                finished_expressions.append(equality_expr)
+
+        # Redundant with code_parts, but allows us to wrap it with parens nicely.
+        if len(finished_expressions) == 0:
+            return None
+
+        expression = " and ".join(finished_expressions)
+        return f"({expression})"
+
     def compile_check_fn(self, local_builder, global_builder):
         assert not (set(local_builder.argnames) & set(global_builder.argnames))
         # see parallel handling of ".0" / "___implicit0" in _eval_frame.c
@@ -530,9 +672,20 @@ def compile_check_fn(self, local_builder, global_builder):
         tensor_check_names = (
             local_builder.tensor_check_names + global_builder.tensor_check_names
         )
+
+        tensor_check_ids = local_builder.tensor_check_ids.copy()
+        tensor_check_ids.update(global_builder.tensor_check_ids)
+
         check_tensors_fn = None
         check_tensors_verbose_fn = None
         if tensor_check_names:
+            symbolic_shape_expression = self._parse_symbolic_shape_expressions(
+                tensor_check_names, tensor_check_ids
+            )
+            if symbolic_shape_expression:
+                code_parts.append(symbolic_shape_expression)
+                verbose_code_parts.append(symbolic_shape_expression)
+
             tensor_check_examples = (
                 local_builder.tensor_check_examples
                 + global_builder.tensor_check_examples
@@ -548,14 +701,23 @@ def compile_check_fn(self, local_builder, global_builder):
             )
             verbose_code_parts.append(f"___check_tensors_verbose({verbose_args})")
 
-        code = " and ".join(unique(code_parts))
+        def direct_equality(a, b):
+            return a == b
 
+        def direct_negation(a, b):
+            return not direct_equality(a, b)
+
+        code = " and ".join(unique(code_parts))
         closure_vars = collections.OrderedDict(
             [
                 ("___guarded_code", self),
                 ("___check_tensors", check_tensors_fn),
                 ("___check_tensors_verbose", check_tensors_verbose_fn),
                 ("tensor_check_names", tensor_check_names),
+                ("Eq", direct_equality),
+                ("Ne", direct_negation),
+                ("Mod", sympy.Mod),
+                ("FloorDiv", FloorDiv),
             ]
         )
         closure_vars.update(CLOSURE_VARS)
@@ -567,6 +729,7 @@ def ___make_guard_fn({','.join(closure_vars.keys())}):
             print("GUARDS", code)
         set_guard_fail_hook(guard_fail_hook)
         out = dict()
+        # print("RUNNING PY CODE", py_code)
         exec(py_code, global_builder.scope, out)
         guard_fn = out["___make_guard_fn"](*closure_vars.values())
         guard_fn.closure_vars = closure_vars
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index f87b07996d73b..c23d4f6dd9934 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -10,6 +10,7 @@
 
 import torch.nn
 from torch import fx
+from torch.fx.experimental.symbolic_shapes import ShapeEnv
 
 from . import config, logging as torchdynamo_logging, variables
 from .bytecode_transformation import create_instruction, Instruction, unique_id
@@ -104,6 +105,8 @@ def __init__(
         self.random_values_var = None
         self.initial_random_state = ()
         self.unspec_variable_map = {}
+        self.shape_env = ShapeEnv() if config.dynamic_shapes else None
+        self.tensor_id_to_sym_shape_ref = {}
 
     @property
     def output(self):
@@ -394,8 +397,10 @@ def compile_and_call_fx_graph(self, tx, rv, root):
         gm.recompile()
         gm.compile_subgraph_reason = self.compile_subgraph_reason
         name = unique_id("__compiled_fn")
+
         compiled_fn = self.call_user_compiler(gm)
         compiled_fn = disable(compiled_fn)
+
         counters["stats"]["unique_graphs"] += 1
         self.install_global(name, compiled_fn)
 
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 0b5cfae69363c..4031a976f52d6 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -1340,7 +1340,8 @@ def __init__(
 
         if fake_tensors_available:
             with torch._subclasses.FakeTensorMode(
-                throw_on_data_dependent_ops=True
+                throw_on_data_dependent_ops=True,
+                shape_env=output.shape_env,
             ) as fake_mode:
                 pass
             self._fake_mode = fake_mode
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index aa64de0eeef3b..1bc646be45435 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -25,6 +25,7 @@
 from typing import Any, Dict
 
 import numpy as np
+import sympy
 
 import torch
 from torch import fx
@@ -666,6 +667,43 @@ def rename_implicit(v):
         UnsupportedFakeTensorException,
     )
 
+    def make_fake_tensor(e, fake_mode, tx=None):
+        fake_tensor = fake_mode.from_tensor(
+            e, static_shapes=config.dynamic_shapes is False
+        )
+        if tx is not None:
+            from torch._dynamo.guards import TensorReference
+
+            def _record(tensor_ref):
+                if tensor_ref.ref_id not in tx.output.tensor_id_to_sym_shape_ref:
+                    tx.output.tensor_id_to_sym_shape_ref[tensor_ref.ref_id] = set()
+                tx.output.tensor_id_to_sym_shape_ref[tensor_ref.ref_id].add(tensor_ref)
+
+            def _extract(symbol):
+                if isinstance(symbol, int):
+                    return None
+                sym_expr = symbol.get_pyobj().expr
+                if not isinstance(sym_expr, sympy.Symbol):
+                    return None
+                return sym_expr
+
+            def _record_ref(e, index, symbol, kind):
+                sym_expr = _extract(symbol)
+                if sym_expr:
+                    tensor_ref = TensorReference(id(e), kind, index, sym_expr)
+                    _record(tensor_ref)
+
+            for index, symbol in enumerate(fake_tensor.size()):
+                _record_ref(e, index, symbol, "size")
+
+            for index, symbol in enumerate(fake_tensor.stride()):
+                _record_ref(e, index, symbol, "stride")
+
+            offset = fake_tensor.storage_offset()
+            _record_ref(e, None, offset, "storage_offset")
+
+        return fake_tensor
+
     def wrap_fake_exception(fn):
         try:
             return fn()
@@ -678,7 +716,13 @@ def wrap_fake_exception(fn):
 
     def wrap_to_fake_tensor(e, fake_mode):
         if type(e) in (torch.Tensor, torch.nn.Parameter):
-            return wrap_fake_exception(lambda: fake_mode.from_tensor(e))
+            return wrap_fake_exception(lambda: make_fake_tensor(e, fake_mode))
+        else:
+            return e
+
+    def wrap_to_fake_tensor_and_record(e, tx):
+        if type(e) in (torch.Tensor, torch.nn.Parameter):
+            return wrap_fake_exception(lambda: make_fake_tensor(e, tx.fake_mode, tx))
         else:
             return e
 
diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py
index 53fdb95aca8bb..cc64e009d094c 100644
--- a/torch/_dynamo/variables/builtin.py
+++ b/torch/_dynamo/variables/builtin.py
@@ -359,11 +359,23 @@ def _call_min_max(self, tx, a, b):
                 a, b = b, a
             assert isinstance(a, variables.TensorVariable)
 
-            # 1. result of an item call is a scalar convert to a tensor
-            # 2. dynamic shape should be resolved to tensor
-            if isinstance(a, (FakeItemVariable, DynamicShapeVariable)):
+            # result of an item call is a scalar convert to a tensor
+            if isinstance(a, FakeItemVariable):
                 a = variables.TorchVariable(torch.tensor).call_function(tx, [a], {})
 
+            # Dynamic input does not get resolved, rather, gets stored as call_function
+            if isinstance(a, DynamicShapeVariable):
+                return variables.TensorVariable.create(
+                    tx=tx,
+                    proxy=tx.output.create_proxy(
+                        "call_function",
+                        self.fn,
+                        *proxy_args_kwargs([a, b], {}),
+                        current_tx=tx,
+                    ),
+                    **VariableTracker.propagate(self, [a, b]),
+                )
+
             # convert min/max to torch ops
             if b.is_python_constant():
                 kwargs = {"min": b} if (self.fn is max) else {"max": b}
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index a8db819cb272d..864d2c4ca3e0f 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -17,7 +17,7 @@
         DataDependentOutputException,
         DynamicOutputShapeException,
     )
-    from ..utils import deepcopy_to_fake_tensor, wrap_to_fake_tensor
+    from ..utils import deepcopy_to_fake_tensor, wrap_to_fake_tensor_and_record
 
 import torch.utils._python_dispatch as py_dispatch
 from torch.fx.immutable_collections import immutable_list
@@ -98,7 +98,7 @@ def _get_fake_value(node, tx):
     Run the computation represented by `node` using fake tensors and return the result.
     """
     op = node.op
-    fake_wrapper = functools.partial(wrap_to_fake_tensor, fake_mode=tx.fake_mode)
+    fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
     from ..utils import wrap_fake_exception
 
     def visit(n: torch.fx.Node):
@@ -206,7 +206,7 @@ def create(cls, tx, proxy, example_value=None, **options):
                 proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
                 if use_fake_tensors:
                     fake_wrapper = functools.partial(
-                        wrap_to_fake_tensor, fake_mode=tx.fake_mode
+                        wrap_to_fake_tensor_and_record, tx=tx
                     )
                     example_value = fake_wrapper(example_value)
 
@@ -241,14 +241,14 @@ def create(cls, tx, proxy, example_value=None, **options):
             return TorchVariable(proxy.node.target)
         elif istype(example_value, (int, bool, float)) and config.dynamic_shapes:
             proxy.node.meta["example_value"] = example_value
-            return DynamicShapeVariable(proxy, type(example_value), **options)
+            return DynamicShapeVariable(proxy, example_value, **options)
         elif istype(example_value, torch.Size) and config.dynamic_shapes:
             proxy.node.meta["example_value"] = example_value
             sizes = []
             for i, v in enumerate(example_value):
                 proxy_i = proxy[i]
                 proxy_i.node.meta["example_value"] = v
-                sizes.append(DynamicShapeVariable(proxy_i, int))
+                sizes.append(DynamicShapeVariable(proxy_i, v))
             return SizeVariable(sizes, proxy, **options)
         elif istype(example_value, int) and proxy.node.target in (
             torch.seed,
@@ -258,7 +258,7 @@ def create(cls, tx, proxy, example_value=None, **options):
             getattr(torch.distributed, "get_world_size", _missing),
         ):
             proxy.node.meta["example_value"] = example_value
-            return DynamicShapeVariable(proxy, type(example_value), **options)
+            return DynamicShapeVariable(proxy, example_value, **options)
         elif istype(example_value, torch.Size) and all(
             [isinstance(x, int) for x in example_value]
         ):
@@ -337,6 +337,9 @@ def create(cls, tx, proxy, example_value=None, **options):
             from . import UserDefinedObjectVariable
 
             return UserDefinedObjectVariable(example_value)
+        elif isinstance(example_value, torch.SymIntNode):
+            proxy.node.meta["example_value"] = example_value
+            return cls(proxy, **options)
         else:
             raise AssertionError(
                 "torch.* op returned non-Tensor "
@@ -474,7 +477,6 @@ def call_method(
         kwargs = dict(kwargs)
 
         options = VariableTracker.propagate(self, args, kwargs.values())
-
         if name == "stride" and self.stride is not None:
             constant_result = ConstantVariable(self.stride, **options)
         elif name == "size" and self.size is not None:
@@ -578,12 +580,12 @@ class DynamicShapeVariable(TensorVariable):
     Represents a symbolic size, e.g., as returned by tensor.size(0)
     """
 
-    def __init__(self, proxy, dyn_shape_cls, **kwargs):
+    def __init__(self, proxy, dyn_shape, **kwargs):
         super(DynamicShapeVariable, self).__init__(proxy, **kwargs)
-        self.dyn_shape_cls = dyn_shape_cls
+        self.dyn_shape = dyn_shape
 
     def python_type(self):
-        return self.dyn_shape_cls
+        return type(self.dyn_shape)
 
     def unpack_var_sequence(self, tx):
         super(DynamicShapeVariable, self).unpack_var_sequence(tx)
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 1ecfbe1a70b2c..e0c88b2cf059a 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -344,6 +344,24 @@ def get_state_from_generator():
                 example_value=example_value,
                 **options,
             )
+        elif (
+            self.value == torch.numel
+            and len(args) == 1
+            and isinstance(args[0], TensorVariable)
+            and len(kwargs) == 0
+        ):
+            # TODO(voz): This is rewritten as a call_method because
+            # torch.numel(x) w/ sym shapes raises a RuntimeError and x.numel() does not
+            return TensorVariable.create(
+                tx=tx,
+                proxy=tx.output.create_proxy(
+                    "call_method",
+                    "numel",
+                    *proxy_args_kwargs(args, kwargs),
+                    current_tx=tx,
+                ),
+                **options,
+            )
         else:
             # Handle sth like torch.LongTensor(list(np.int64, np.int64, ...)),
             # as FX symbolic trace doesn't support numpy int/float as base types.
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 2f2f07f3db378..652c24c9a521d 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -234,7 +234,7 @@ def from_real_tensor(self, fake_mode, t, make_constant=False, shape_env=None):
             warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
             grad_not_none = t.grad is not None
         if grad_not_none:
-            out.grad = self.from_real_tensor(fake_mode, t.grad)
+            out.grad = self.from_real_tensor(fake_mode, t.grad, shape_env=shape_env)
         self.set_tensor_memo(t, out)
         return out
 
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 80723f1246339..3e1040d037f0d 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -146,7 +146,7 @@ def meta_tensor(self, t, shape_env=None):
 
         def sym(x):
             if make_symbolic:
-                return shape_env.create_symbol(x)
+                return shape_env.create_symintnode(shape_env.create_symbol(x))
             else:
                 return x
 

From 0c1734aeb99f8f1f664c3e57ea7c3261bd81146e Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Tue, 25 Oct 2022 04:46:42 +0000
Subject: [PATCH 0141/1922] Fix stride for prims.where (#87563)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87563
Approved by: https://github.com/ngimel, https://github.com/mruberry
---
 torch/_meta_registrations.py |  6 +-----
 torch/_prims/__init__.py     |  2 +-
 torch/_refs/__init__.py      | 23 +++++++++++++++++------
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 873f942da42ab..0af6813ce4a00 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1663,12 +1663,8 @@ def activate_meta():
             "aten::mul.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
             "aten::div.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! test_fake_tensor.py -k test_scalar_inputs  # noqa: B950
             "aten::div.Tensor_mode",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_div8_cpu  # noqa: B950
-            "aten::diag_embed",  # Stride mismatch! test_ops.py -k test_fake_autocast_diag_embed_cuda_float32  # noqa: B950
-            "aten::copy_",  # Exception not raiseed, test_torch.py -k test_storage_meta_errors_cpu_int64  # noqa: B950
+            "aten::copy_",  # Exception not raised, test_torch.py -k test_storage_meta_errors_cpu_int64  # noqa: B950
             "aten::constant_pad_nd",  # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32  # noqa: B950
-            "aten::masked_fill.Scalar",  # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_nanquantile_cuda_float32  # noqa: B950
-            "aten::tril",  # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_ormqr_cuda_float32  # noqa: B950
-            "aten::triu",  # Stride mismatch! test_ops.py -k test_fake_crossref_backward_amp_lu_solve_cuda_float32  # noqa: B950
             "aten::rot90",  # requires_grad mismatch! test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32  # noqa: B950
         }:
             pass
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index eae38612a2237..3248009ee66e5 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -335,7 +335,7 @@ def _elementwise_meta(
 
     args_ = list(args)
     if args_with_fixed_dtypes is not None:
-        args_.extend(args_with_fixed_dtypes)
+        args_ = list(args_with_fixed_dtypes) + args_
 
     utils.check_same_device(*args_, allow_cpu_scalar_tensors=True)
     utils.check_same_shape(*args_, allow_cpu_scalar_tensors=True)
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 2e91ceeeb679d..44b75bb92df48 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -3594,7 +3594,10 @@ def diag_embed(
     cond = a_range == b_range.unsqueeze(-1)
     cond_shape = [last_dim if i in (dim1, dim2) else 1 for i in range(len(t.shape))]
     cond = cond.reshape(cond_shape)
-    return utils.mask_tensor(cond, t)
+
+    # aten.diag_embed always returns a new contiguous tensor
+    # contiguous() is needed to correctly model the output stride
+    return utils.mask_tensor(cond, t).contiguous()
 
 
 # CompositeImplicitAutograd - don't register decomp
@@ -4517,10 +4520,14 @@ def masked_fill(a: TensorLikeType, mask: TensorLikeType, value: TensorOrNumberLi
     # Since `where` allows type-promotion,
     # cast value to correct type before passing to `where`
     if isinstance(value, Number):
-        return torch.where(mask, python_type(value), a)
+        r = torch.where(mask, python_type(value), a)
+    else:
+        assert isinstance(value, TensorLike)
+        r = torch.where(mask, prims.to_dtype(value, a.dtype), a)
 
-    assert isinstance(value, TensorLike)
-    return torch.where(mask, prims.to_dtype(value, a.dtype), a)
+    # aten.mask_fill always return a new contiguous tensor
+    # contiguous() is needed to correctly model the output stride
+    return r.contiguous()
 
 
 # CompositeImplicitAutograd - don't register decomp
@@ -4622,7 +4629,9 @@ def triu(a: TensorLikeType, diagonal: int = 0) -> TensorLikeType:
         - torch.arange(h, device=a.device).unsqueeze(-1)
     ) >= diagonal
 
-    return utils.mask_tensor(mask, a)
+    # aten.triu always returns a new contiguous tensor
+    # contiguous() is needed to correctly model the output stride
+    return utils.mask_tensor(mask, a).contiguous()
 
 
 @register_decomposition(torch.ops.aten.tril)
@@ -4637,7 +4646,9 @@ def tril(a: TensorLikeType, diagonal: int = 0) -> TensorLikeType:
         - torch.arange(h, device=a.device).unsqueeze(-1)
     ) <= diagonal
 
-    return utils.mask_tensor(mask, a)
+    # aten.tril always returns a new contiguous tensor
+    # contiguous() is needed to correctly model the output stride
+    return utils.mask_tensor(mask, a).contiguous()
 
 
 # This is based on get_tril_size in aten/src/ATen/native/TensorFactories.h

From d30196d447ff15140a8d03f1ff054be068e90917 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Tue, 25 Oct 2022 21:49:59 +0000
Subject: [PATCH 0142/1922] Fix typos under functorch directory (#87663)

This PR fixes typos in `.md` and `.rst` files under functorch directory

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87663
Approved by: https://github.com/kit1980
---
 functorch/dim/README.md                    | 22 +++++++++++-----------
 functorch/docs/source/batch_norm.rst       |  2 +-
 functorch/docs/source/ux_limitations.rst   |  2 +-
 functorch/examples/maml_omniglot/README.md |  2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/functorch/dim/README.md b/functorch/dim/README.md
index 750c8847c8502..5ed7bbd3d5284 100644
--- a/functorch/dim/README.md
+++ b/functorch/dim/README.md
@@ -7,7 +7,7 @@ _An implementation of [named tensors](https://namedtensor.github.io) with the fu
 
 The tensor input to a resnet might have the shape [8, 3, 224, 224] but informally we think of those dimensions as 'batch', 'channel', 'width', and 'height'. Eventhough 'width' and 'height' have the same _size_ we still think of them as separate dimensions, and if we have two _different_ images, we think of both as sharing the _same_ 'channel' dimension.
 
-Named tensors gives these dimensions names. [PyTorch's current implementation](https://pytorch.org/docs/stable/named_tensor.html) uses strings to name dimensions. Instead, this library introduces a Python object, a `Dim`, to represent the concept. By expanding the semantics of tensors with dim objects, in addition to naming dimensions, we can get behavior equivalent to batching transforms (xmap, vmap), einops-style rearragement, and loop-style tensor indexing.
+Named tensors gives these dimensions names. [PyTorch's current implementation](https://pytorch.org/docs/stable/named_tensor.html) uses strings to name dimensions. Instead, this library introduces a Python object, a `Dim`, to represent the concept. By expanding the semantics of tensors with dim objects, in addition to naming dimensions, we can get behavior equivalent to batching transforms (xmap, vmap), einops-style rearrangement, and loop-style tensor indexing.
 
 A preview:
 
@@ -85,11 +85,11 @@ from torchdim import dims
 batch, channel, width, height = dims(4)
 ```
 
-The existing implemention of [Named Tensors](https://pytorch.org/docs/stable/named_tensor.html) in PyTorch, or [JAX's xmap](https://jax.readthedocs.io/en/latest/notebooks/xmap_tutorial.html) use strings to name dimensions. We call these dimensions _first class_ because they are Python objects.
+The existing implementation of [Named Tensors](https://pytorch.org/docs/stable/named_tensor.html) in PyTorch, or [JAX's xmap](https://jax.readthedocs.io/en/latest/notebooks/xmap_tutorial.html) use strings to name dimensions. We call these dimensions _first class_ because they are Python objects.
 
 In addition to the normal _positional_ dimensions in a tensor, tensors can also have a separate set of first-class dimensions.
 
-You can create tensors with first-class dimensions by indexing the normal positional dimensions of a tensor with a dimension object. The `ndim` property continues to list the number of positional dimesions, while the new `dims` property lists all the bound first-class dimensions.
+You can create tensors with first-class dimensions by indexing the normal positional dimensions of a tensor with a dimension object. The `ndim` property continues to list the number of positional dimensions, while the new `dims` property lists all the bound first-class dimensions.
 
 ```py
 input = torch.rand(2, 3, 224, 224)
@@ -101,7 +101,7 @@ print(input_fc.dims) # first class dimensions
 > (batch, channel, width, height)
 
 
-# since we converted all the positional dimesions
+# since we converted all the positional dimensions
 # first class `input_fc` has 0 positional dimensions now.
 print(input_fc.ndim)
 > 0
@@ -266,7 +266,7 @@ print(i <= j)
 > with dims=(i, j) sizes=(4, 4)
 ```
 
-Because of the intentional similarity to loop-level code, using dimsions as tensors makes complicated indexing arithmetic easier to read.
+Because of the intentional similarity to loop-level code, using dimensions as tensors makes complicated indexing arithmetic easier to read.
 
 Here is code that lookups up features in an embedding table given a sequence of ids:
 
@@ -296,7 +296,7 @@ Unbinding Dims
 -------------
 The `order` method converts first-class dimensions in a tensor back to normal positional dimensions by specifying an order for those dimensions.[^4]
 
-By specifiying a different order from how things were originally bound, it is easy to do transpositions.
+By specifying a different order from how things were originally bound, it is easy to do transpositions.
 
 ```py
 i, j = dims(2)
@@ -305,7 +305,7 @@ A_T = A[i, j].order(j, i)
 assert torch.allclose(A.T, A_T)
 ```
 
-Indexing acts left-to-right, and `order` also places the new dimensions back on the left, so it possible to work on tensors that have mixed positonal and first-class dimensions:
+Indexing acts left-to-right, and `order` also places the new dimensions back on the left, so it possible to work on tensors that have mixed positional and first-class dimensions:
 
 ```py
 B = torch.rand(3, 4, 5)
@@ -313,7 +313,7 @@ B_T = B[i, j].order(j, i)
 assert torch.allclose(B.permute(1, 0, 2), B_T)
 ```
 
-[^4] `order` is actually just a synonym for the already-existing `permute` method, which takes a list a dimension specifiers and puts the tensor in that order because rule #2 says that first-class dims can be passed as arguments to functions that previousely took only integers as dimensions. However, the name `permute` is confusing in this context since it implies dim objects have an original order, so we prefer to use `order` when writing code.
+[^4] `order` is actually just a synonym for the already-existing `permute` method, which takes a list a dimension specifiers and puts the tensor in that order because rule #2 says that first-class dims can be passed as arguments to functions that previously took only integers as dimensions. However, the name `permute` is confusing in this context since it implies dim objects have an original order, so we prefer to use `order` when writing code.
 
 Flattening and Splitting Dims
 -----------------------------
@@ -412,7 +412,7 @@ Named tensors with first-class dimensions can accomplish the same goal, but usin
 Automatically batching Code (`vmap`, `xmap`)
 -----------------------------
 
-The implicit batching of Rule #1 means it is easy to created batched versions of existing PyTorch code. Simply bind a dim to the dimensions that should act as a batch, and then pass the tensor to the unbatched function. Since the unbatched function does not know about the dim, the dim will be implicictly batched over:
+The implicit batching of Rule #1 means it is easy to created batched versions of existing PyTorch code. Simply bind a dim to the dimensions that should act as a batch, and then pass the tensor to the unbatched function. Since the unbatched function does not know about the dim, the dim will be implicitly batched over:
 
 ```py
 batch_size, feature_size = 3, 5
@@ -501,7 +501,7 @@ def multiheadattention(q, k, v, num_attention_heads, dropout_prob, use_positiona
 Indexing
 --------
 
-Rule #3 enables indexing because dimensions act as loop indices when used as a tensor. This allows for a lot of powerful behavior. The simplest might be using the dimensions to compute masks, such as extracing the upper triangular part of a matrix:
+Rule #3 enables indexing because dimensions act as loop indices when used as a tensor. This allows for a lot of powerful behavior. The simplest might be using the dimensions to compute masks, such as extracting the upper triangular part of a matrix:
 
 ```py
 from torch import where
@@ -745,7 +745,7 @@ The semantics and surface syntax of dimension objects resembles the kind of code
 
 These compilers and language have syntax and semantics that resemble the loop-level analogy similar to first-class dimensions. However, as compilers or statically typed languages, they require some binding code to go from running deep learning framework code in Python to using the compiled language. This often at least requires refactoring the compiled parts into their own functions, and may require defining a gradient function. Similar to graph mode frameworks, this adds friction to using and debugging the code.
 
-Dimension objects are just an extension of the existing PyTorch tensors and eager sematics, so there is no friction switching between normal Python code and code that uses them. However, since loops over the dimensions are defined implicitly, they can still execute in Python with good performance compared to explicit loops. Furthermore, with dimension objects, a tensors containing dimensions can compute through code that is oblivous to the dimension such as batching examples. There is no need to separate code into 'compiled' vs 'eager'.
+Dimension objects are just an extension of the existing PyTorch tensors and eager semantics, so there is no friction switching between normal Python code and code that uses them. However, since loops over the dimensions are defined implicitly, they can still execute in Python with good performance compared to explicit loops. Furthermore, with dimension objects, a tensors containing dimensions can compute through code that is oblivious to the dimension such as batching examples. There is no need to separate code into 'compiled' vs 'eager'.
 
 In this way, first-class dims are a way of adapting the nicer syntax of these array compilers and languages to eager numpy-style libraries.
 
diff --git a/functorch/docs/source/batch_norm.rst b/functorch/docs/source/batch_norm.rst
index 09eb6001b5b66..8ccd4ee587d35 100644
--- a/functorch/docs/source/batch_norm.rst
+++ b/functorch/docs/source/batch_norm.rst
@@ -11,7 +11,7 @@ we end up with this error
 How to fix
 ----------
 All of these options assume that you don't need running stats. If you're using a module this means
-that it's assumed you won't use batch norm in evalution mode. If you have a use case that involves
+that it's assumed you won't use batch norm in evaluation mode. If you have a use case that involves
 running batch norm with vmap in evaluation mode, please file an issue
 
 Option 1: Change the BatchNorm
diff --git a/functorch/docs/source/ux_limitations.rst b/functorch/docs/source/ux_limitations.rst
index e0090047752e0..4fee30e432881 100644
--- a/functorch/docs/source/ux_limitations.rst
+++ b/functorch/docs/source/ux_limitations.rst
@@ -290,5 +290,5 @@ Under "same" randomness, elements in a batch produce same random values. For ins
 .. note::
     Finally, our randomness differs from JAX because we aren't using a stateless PRNG, in part because PyTorch
     doesn't have full support for a stateless PRNG. Instead, we've introduced a flag system to allow for the
-    most common forms of randmoness that we see. If your use case does not fit these forms of randomness, please
+    most common forms of randomness that we see. If your use case does not fit these forms of randomness, please
     file an issue.
diff --git a/functorch/examples/maml_omniglot/README.md b/functorch/examples/maml_omniglot/README.md
index dfb6077814bfe..afc3f55023d47 100644
--- a/functorch/examples/maml_omniglot/README.md
+++ b/functorch/examples/maml_omniglot/README.md
@@ -1,6 +1,6 @@
 # Omniglot MAML examples
 
-In this directory we've provided some examples of traning omniglot that reproduce the experiments from [the original MAML paper](https://arxiv.org/abs/1703.03400).
+In this directory we've provided some examples of training omniglot that reproduce the experiments from [the original MAML paper](https://arxiv.org/abs/1703.03400).
 
 They can be run via `python {filename}`.
 

From 88c46f7ee17705b1d379ac1fac0718c5a1b221de Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Tue, 25 Oct 2022 21:53:11 +0000
Subject: [PATCH 0143/1922] Intercept aten._reshape_alias for nvFuser (#87072)

This would help forming larger fusion groups. If this won't end up executed by nvFuser then eager mode implementation would call into `.reshape`: https://github.com/pytorch/pytorch/blob/37e9e89afbc3554258545a026fab4cd9e1a4b85d/torch/_prims/nvfuser_prims.py#L552-L553

cc @kevinstephano @jjsjann123
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87072
Approved by: https://github.com/ngimel
---
 torch/_prims/context.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/torch/_prims/context.py b/torch/_prims/context.py
index fea3f17a5009b..2bcee069d146c 100644
--- a/torch/_prims/context.py
+++ b/torch/_prims/context.py
@@ -405,6 +405,12 @@ def __torch_function__(
                 warn("view has ignored kwargs!")
             return torch.ops.nvprims.view(a, shape)
 
+        if orig_func == torch.ops.aten._reshape_alias.default:
+            a, shape, stride = args
+            if len(kwargs) > 0:
+                warn("view has ignored kwargs!")
+            return torch.ops.nvprims.view(a, shape)
+
         if self._is_native_batch_norm(orig_func):
             return torch.ops.nvprims.native_batch_norm(*args, **kwargs)
 

From a7de7c7d98c364b75921020efb7928e660dc057e Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Tue, 25 Oct 2022 21:55:27 +0000
Subject: [PATCH 0144/1922] Fix debug dir bugs and minifier output directories
 (#87682)

Fixes https://github.com/pytorch/torchdynamo/issues/1758, https://github.com/pytorch/torchdynamo/issues/1752

- minifier_launcher.py now dumps checkpoints to \<cwd\>/checkpoints when run
- a single debug directory is created per script invocation, asserts failing with no directory will no longer occur
- torchinductor debug tracing will correctly dump to the debug directory now since no prior setup is needed, (the directory was incorrectly only initialized during dynamo tracing)

cc @jansel @lezcano @fdrocha @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87682
Approved by: https://github.com/ezyang
---
 test/dynamo/test_debug_dir.py | 96 -----------------------------------
 test/dynamo/test_minifier.py  |  2 -
 torch/_dynamo/debug_utils.py  | 20 ++------
 torch/_dynamo/eval_frame.py   |  4 --
 torch/_dynamo/utils.py        | 36 +++----------
 5 files changed, 12 insertions(+), 146 deletions(-)
 delete mode 100644 test/dynamo/test_debug_dir.py

diff --git a/test/dynamo/test_debug_dir.py b/test/dynamo/test_debug_dir.py
deleted file mode 100644
index 5827ff40ea781..0000000000000
--- a/test/dynamo/test_debug_dir.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Owner(s): ["module: dynamo"]
-import shutil
-import unittest
-
-import torch
-import torch._dynamo.test_case
-import torch._dynamo.testing
-from torch._dynamo.utils import DebugDir, get_debug_dir
-
-
-class DebugDirTests(torch._dynamo.test_case.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls._exit_stack.enter_context(
-            unittest.mock.patch.object(
-                torch._dynamo.config,
-                "debug_dir_root",
-                "/tmp/torch._dynamo_debug_dirs/",
-            )
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        shutil.rmtree(torch._dynamo.config.debug_dir_root, ignore_errors=True)
-        cls._exit_stack.close()
-
-    def setUp(self):
-        super().setUp()
-        torch._dynamo.utils.debug_dir = DebugDir()
-
-    def tearDown(self):
-        torch._dynamo.utils.debug_dir = DebugDir()
-        super().tearDown()
-
-    def _setup(self):
-        debug_dir = torch._dynamo.utils.debug_dir
-        debug_dir.setup()
-        self.assertIsNotNone(debug_dir.debug_path)
-        self.assertEqual(debug_dir.num_setup_calls, 1)
-        return debug_dir
-
-    def test_setup(self):
-        self._setup()
-
-    def test_clear(self):
-        debug_dir = self._setup()
-        debug_dir.clear()
-        self.assertIsNone(debug_dir.debug_path)
-        self.assertEqual(debug_dir.num_setup_calls, 0)
-
-    def test_multi_setup_single_clear(self):
-        debug_dir = self._setup()
-        prev = get_debug_dir()
-
-        debug_dir.setup()
-        self.assertEqual(prev, get_debug_dir())
-        self.assertEqual(debug_dir.num_setup_calls, 2)
-
-        debug_dir.clear()
-        self.assertEqual(prev, get_debug_dir())
-        self.assertEqual(debug_dir.num_setup_calls, 1)
-
-    def test_multi_setup_multi_clear(self):
-        debug_dir = self._setup()
-        prev = get_debug_dir()
-
-        debug_dir.setup()
-        self.assertEqual(prev, get_debug_dir())
-        self.assertEqual(debug_dir.num_setup_calls, 2)
-
-        debug_dir.clear()
-        self.assertEqual(prev, get_debug_dir())
-        self.assertEqual(debug_dir.num_setup_calls, 1)
-
-        debug_dir.clear()
-        self.assertIsNone(debug_dir.debug_path)
-        self.assertEqual(debug_dir.num_setup_calls, 0)
-
-    def test_single_setup_single_clear(self):
-        debug_dir = self._setup()
-        debug_dir.clear()
-        self.assertIsNone(debug_dir.debug_path)
-        self.assertEqual(debug_dir.num_setup_calls, 0)
-
-    def test_multi_get(self):
-        self._setup()
-        prev = get_debug_dir()
-        next = get_debug_dir()
-        self.assertEqual(prev, next)
-
-
-if __name__ == "__main__":
-    from torch._dynamo.test_case import run_tests
-
-    run_tests()
diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index a282485285797..0cec7d202a9d4 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -43,10 +43,8 @@ def tearDownClass(cls):
 
     def setUp(self):
         super().setUp()
-        torch._dynamo.utils.debug_dir.setup()
 
     def tearDown(self):
-        torch._dynamo.utils.debug_dir.clear()
         super().tearDown()
 
     def test_after_dynamo(self):
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index ea5671a81d02f..0ece930d1d13b 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -240,7 +240,7 @@ def save_graph_repro(fd, gm, args, compiler_name):
 def isolate_fails(fx_g, args, compiler_name: str, env=None):
     if env is None:
         env = {}
-    subdir = f"{minifier_dir()}/isolate"
+    subdir = os.path.join(os.getcwd(), "isolate")
     if not os.path.exists(subdir):
         os.makedirs(subdir, exist_ok=True)
     file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py")
@@ -600,10 +600,11 @@ def dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy=False):
     """
     Saves the repro to a repro.py file
     """
-    subdir = os.path.join(minifier_dir())
+    curdir = os.getcwd()
+    subdir = os.path.join(os.getcwd(), "checkpoints")
     if not os.path.exists(subdir):
         os.makedirs(subdir, exist_ok=True)
-    file_name = os.path.join(subdir, f"{len(gm.graph.nodes)}.py")
+    file_name = os.path.join(subdir, f"minified_{len(gm.graph.nodes)}_nodes.py")
     log.warning(f"Writing checkpoint with {len(gm.graph.nodes)} nodes to {file_name}")
 
     model_str = NNModuleToString.convert(gm)
@@ -613,19 +614,10 @@ def dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy=False):
                 model_str, args, compiler_name, check_accuracy
             )
         )
-    latest_repro = os.path.join(subdir, "repro.py")
+    latest_repro = os.path.join(curdir, "repro.py")
     log.warning(f"Copying {file_name} to {latest_repro} for convenience")
     shutil.copyfile(file_name, latest_repro)
 
-    local_path = os.path.join(config.base_dir, "repro.py")
-    try:
-        shutil.copyfile(file_name, local_path)
-        log.warning(
-            f"Copying minified repro from {file_name} to {local_path} for convenience"
-        )
-    except OSError:
-        log.warning("No write permissions for {local_path}")
-
 
 # TODO - Commented because we are assuming that nn.Modules can be safely repr'd
 # If that does not work, we might have to bring this code back. So, keeping it
@@ -748,8 +740,6 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
 from {config.dynamo_import}.optimizations.backends import BACKENDS
 from {config.dynamo_import}.testing import rand_strided
 
-{config.dynamo_import}.config.repro_dir = \"{minifier_dir()}\"
-
 args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]}
 args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
 
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 9895da4ad9bba..d86653f9973cc 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -103,14 +103,12 @@ def __enter__(self):
                 "Please refer to https://github.com/pytorch/torchdynamo#usage-example "
                 "to use torchdynamo.optimize(...) as an annotation/decorator. "
             )
-        utils.debug_dir.setup()
         self.on_enter()
         self.prior = set_eval_frame(self.callback)
         self.backend_ctx = self.extra_ctx_ctor()
         self.backend_ctx.__enter__()
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        utils.debug_dir.clear()
         set_eval_frame(self.prior)
         self.prior = unset
         self.backend_ctx.__exit__(exc_type, exc_val, exc_tb)
@@ -152,14 +150,12 @@ def __call__(self, *args, **kwargs):
         @functools.wraps(fn)
         def _fn(*args, **kwargs):
             on_enter()
-            utils.debug_dir.setup()
             prior = set_eval_frame(callback)
             backend_ctx = backend_ctx_ctor()
             backend_ctx.__enter__()
             try:
                 return fn(*args, **kwargs)
             finally:
-                utils.debug_dir.clear()
                 set_eval_frame(prior)
                 backend_ctx.__exit__(None, None, None)
 
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 1bc646be45435..ef2c1c38ea8ba 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -975,35 +975,13 @@ def recompile_reasons(code):
         return rpt
 
 
-class DebugDir:
-    def __init__(self):
-        self.num_setup_calls = 0
-        self.debug_path = None
-
-    def setup(self):
-        assert self.num_setup_calls >= 0
-        if self.num_setup_calls == 0:
-            debug_root = config.debug_dir_root
-            dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
-            self.debug_path = os.path.join(debug_root, dir_name)
-
-        self.num_setup_calls += 1
-
-    def clear(self):
-        assert self.num_setup_calls >= 0
-        if self.num_setup_calls == 1:
-            self.debug_path = None
-
-        self.num_setup_calls -= 1
-        assert self.num_setup_calls >= 0
-
-    def get(self):
-        assert self.debug_path is not None
-        return self.debug_path
-
-
-debug_dir = DebugDir()
+# return same dir unless user changes config between calls
+@functools.lru_cache(None)
+def _get_debug_dir(root_dir):
+    dir_name = "run_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
+    return os.path.join(root_dir, dir_name)
 
 
 def get_debug_dir():
-    return debug_dir.get()
+    debug_root = config.debug_dir_root
+    return _get_debug_dir(debug_root)

From fbecce248287595329a221a6a7f5b8e4601dc410 Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Mon, 24 Oct 2022 15:24:57 -0700
Subject: [PATCH 0145/1922] [xnnpack][lite-int][graph-build] graph passes and
 op checking (#87128)

Beginning of building the xnnpack graph from the torchscript IR. We first massage the torchscript graph using a few graph passes that perform things such as unused self argument removal and constant propagation.
This also performs tracing for us so that the model does not have to be prepped by tracing before being lowered by us.

The other check we perform is through the torchscript IR to identify any nodes that are not lowerable/supported, and throwing an error to spit out the specific nodes that are not lowerable.

Differential Revision: [D39838338](https://our.internmc.facebook.com/intern/diff/D39838338/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D39838338/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87128
Approved by: https://github.com/salilsdesai
---
 test/jit/xnnpack/test_xnnpack_delegate.py     | 34 +++++++++
 .../xnnpack/xnnpack_backend_preprocess.cpp    |  5 ++
 .../xnnpack/xnnpack_graph_builder.cpp         | 71 +++++++++++++++++++
 .../backends/xnnpack/xnnpack_graph_builder.h  | 50 +++++++++++++
 4 files changed, 160 insertions(+)
 create mode 100644 torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
 create mode 100644 torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h

diff --git a/test/jit/xnnpack/test_xnnpack_delegate.py b/test/jit/xnnpack/test_xnnpack_delegate.py
index 8c759cb01ccf6..118a30dbe2cac 100644
--- a/test/jit/xnnpack/test_xnnpack_delegate.py
+++ b/test/jit/xnnpack/test_xnnpack_delegate.py
@@ -67,3 +67,37 @@ def forward(self, x):
             }
         )
         lowered(torch.zeros(1))
+
+    def test_xnnpack_unsupported(self):
+        class AddSpliceModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                z = x + y[:, :, 1, :]
+                return z
+
+        sample_inputs = (torch.rand(1, 512, 512, 3), torch.rand(1, 512, 512, 3))
+        sample_output = torch.zeros(1, 512, 512, 3)
+
+        error_msg = (
+            "the module contains the following unsupported ops:\n"
+            "aten::select\n"
+            "aten::slice\n"
+        )
+
+        add_module = torch.jit.script(AddSpliceModule())
+        with self.assertRaisesRegex(
+            RuntimeError,
+            error_msg,
+        ):
+            _ = torch._C._jit_to_backend(
+                "xnnpack",
+                add_module,
+                {
+                    "forward": {
+                        "inputs" : [sample_inputs[0], sample_inputs[1]],
+                        "outputs": [sample_output]
+                    }
+                }
+            )
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
index 6d739f4097444..536e1cb8e773d 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
@@ -6,6 +6,7 @@
 #include <xnnpack.h>
 
 #include <ATen/core/List.h>
+#include <torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h>
 
 namespace torch {
 namespace jit {
@@ -83,6 +84,10 @@ c10::IValue preprocess(
     example_inputs.emplace_back(inp.toTensor());
   }
 
+  // inp above has been confirmed to be either Tensor or TensorList
+  XNNGraph graph_builder;
+  graph_builder.buildXNNGraph(graph, example_inputs);
+
   compiled.insert("Answer", at::empty({1}, c10::ScalarType::Float));
 
   return compiled;
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
new file mode 100644
index 0000000000000..438e681b508b6
--- /dev/null
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
@@ -0,0 +1,71 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <caffe2/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h>
+#include <torch/csrc/jit/runtime/graph_iterator.h>
+#include <xnnpack.h>
+
+// graph passes
+#include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
+#include <torch/csrc/jit/passes/frozen_graph_optimizations.h>
+#include <torch/csrc/jit/passes/lower_tuples.h>
+#include <torch/csrc/jit/passes/remove_mutation.h>
+#include <torch/csrc/jit/runtime/jit_trace.h>
+#include <torch/csrc/jit/tensorexpr/graph_opt.h>
+
+namespace torch {
+namespace jit {
+namespace xnnpack {
+namespace delegate {
+
+std::shared_ptr<torch::jit::Graph> XNNGraph::optimizeAndTraceGraph(
+    std::shared_ptr<torch::jit::Graph> graph,
+    std::vector<c10::IValue>& example_inputs) {
+  graph = tensorexpr::removeUnusedSelfArgument(graph);
+  OptimizeFrozenGraph(graph, true);
+  RemoveListMutation(graph);
+  RemoveTensorMutation(graph);
+  LowerAllTuples(graph);
+  ConstantPropagation(graph);
+  graph = TraceGraph(graph, example_inputs);
+
+  return graph;
+}
+
+void XNNGraph::buildXNNGraph(
+    std::shared_ptr<torch::jit::Graph>& graph,
+    std::vector<c10::IValue> example_inputs) {
+  graph = optimizeAndTraceGraph(graph, example_inputs);
+  checkOpsToDelegate(graph);
+}
+
+void XNNGraph::checkOpsToDelegate(std::shared_ptr<torch::jit::Graph>& graph) {
+  std::unordered_set<string> unsupported_ops;
+  DepthFirstGraphNodeIterator it(graph);
+  Node* node = nullptr;
+  while ((node = it.next()) != nullptr) {
+    switch (node->kind()) {
+      case prim::Constant:
+      case aten::add: {
+        break;
+      }
+      default: {
+        unsupported_ops.insert(node->kind().toDisplayString());
+      }
+    }
+  }
+  std::stringstream error;
+  for (auto itr = unsupported_ops.begin(); itr != unsupported_ops.end();
+       itr++) {
+    error << *itr << std::endl;
+    ;
+  }
+  TORCH_CHECK(
+      unsupported_ops.empty(),
+      "the module contains the following unsupported ops:\n" + error.str());
+}
+
+} // namespace delegate
+} // namespace xnnpack
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h
new file mode 100644
index 0000000000000..e9593376dc798
--- /dev/null
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h
@@ -0,0 +1,50 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+#include <ATen/Functions.h>
+#include <ATen/Utils.h>
+#include <torch/torch.h>
+#include <xnnpack.h>
+#include <unordered_set>
+#include <vector>
+
+namespace torch {
+namespace jit {
+namespace xnnpack {
+namespace delegate {
+
+class XNNGraph {
+ private:
+  // xnn_subgraph
+  xnn_subgraph_t _subgraph_ptr;
+
+  // Graph passes for optimizing and tracing torchscript graph
+  // Essentially massaging the graph into a digestiable format for
+  // xnnpack graph lowering.
+  std::shared_ptr<torch::jit::Graph> optimizeAndTraceGraph(
+      std::shared_ptr<torch::jit::Graph> graph,
+      std::vector<c10::IValue>& example_inputs);
+
+  // Makes a pass through the graph and throws if any ops are unsupported
+  void checkOpsToDelegate(std::shared_ptr<torch::jit::Graph>& graph);
+
+ public:
+  XNNGraph() : _subgraph_ptr(nullptr) {
+    xnn_status status = xnn_initialize(/*allocator =*/nullptr);
+    TORCH_CHECK(xnn_status_success == status, "Failed to initialize xnnpack");
+  }
+
+  ~XNNGraph() {
+    xnn_deinitialize();
+    if (_subgraph_ptr != nullptr) {
+      xnn_delete_subgraph(_subgraph_ptr);
+    }
+  }
+
+  void buildXNNGraph(
+      std::shared_ptr<torch::jit::Graph>& graph,
+      std::vector<c10::IValue> example_inputs);
+};
+
+} // namespace delegate
+} // namespace xnnpack
+} // namespace jit
+} // namespace torch

From ee88c037cbf1e029c97fc453b61a1321ed2f3616 Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Tue, 25 Oct 2022 18:49:25 +0000
Subject: [PATCH 0146/1922] Added gm.print_readable to torchinductor_trace
 output (#87717)

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87717
Approved by: https://github.com/ngimel
---
 torch/_inductor/debug.py | 5 ++++-
 torch/fx/graph_module.py | 7 +++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/torch/_inductor/debug.py b/torch/_inductor/debug.py
index f7fbfe218be39..67e75d1a73294 100644
--- a/torch/_inductor/debug.py
+++ b/torch/_inductor/debug.py
@@ -306,9 +306,12 @@ def __init__(self, handler):
         self.handler = handler
 
     def fx_graph(self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]):
-        with self.fopen("fx_graph.py") as fd:
+        with self.fopen("fx_graph_runnable.py") as fd:
             dynamo_debug_utils.save_graph_repro(fd, gm, inputs, "inductor")
 
+        with self.fopen("fx_graph_readable.py") as fd:
+            fd.write(gm.print_readable(print_output=False))
+
     def ir_pre_fusion(self, nodes: SchedulerNodeList):
         self._write_ir("ir_pre_fusion.txt", nodes)
 
diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
index e2f9e1ed343e4..bc07952cf6fe6 100644
--- a/torch/fx/graph_module.py
+++ b/torch/fx/graph_module.py
@@ -712,7 +712,7 @@ def __copy__(self):
         return GraphModule(self, self.graph)
 
     @compatibility(is_backward_compatible=False)
-    def print_readable(self):
+    def print_readable(self, print_output=True):
         """
         Return the Python code generated for current GraphModule and its children GraphModules
         """
@@ -729,7 +729,10 @@ def print_readable(self):
         submodule_code = "\n".join(submodule_code_list)
         submodule_code = _addindent(submodule_code, 4)
 
-        print(module_code + submodule_code)
+        output = module_code + submodule_code
+        if print_output:
+            print(module_code + submodule_code)
+        return output
 
     def __str__(self) -> str:
         orig_str = super().__str__()

From 5f9918463c541fb32d06ab32deb5402cd367c589 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Tue, 25 Oct 2022 22:47:54 +0000
Subject: [PATCH 0147/1922] Fix CODE level usage in dynamo config.py (#87522)

Fixes https://github.com/pytorch/torchdynamo/issues/1718.

Tested by changing `log_level = logging.WARNING` in config.py to `log_level = logging.CODE` and running a test script that doesn't touch `log_level`.

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87522
Approved by: https://github.com/mlazos
---
 torch/_dynamo/config.py           | 5 ++++-
 torch/_dynamo/convert_frame.py    | 8 ++++----
 torch/_dynamo/logging.py          | 4 ++--
 torch/_dynamo/output_graph.py     | 2 +-
 torch/_inductor/codegen/triton.py | 3 +--
 torch/_inductor/graph.py          | 4 ++--
 6 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 701036789ffcb..f24eeeae76882 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -6,6 +6,9 @@
 
 import torch
 
+# needed so that CODE is registered as a level in logging
+from . import logging as torchdynamo_logging  # noqa: F401
+
 try:
     import torch._prims
     import torch._refs
@@ -17,7 +20,7 @@
 
 # log level (levels print what it says + all levels listed below it)
 # logging.DEBUG print full traces <-- lowest level + print tracing of every instruction
-# torchdynamo.logging.CODE print compiled functions + graphs
+# logging.CODE print compiled functions + graphs (NOTE: can only be used after importing torch._dynamo.logging)
 # logging.INFO print the steps that dynamo is running
 # logging.WARN print warnings (including graph breaks)
 # logging.ERROR print exceptions (and what user code was being processed when it occurred)
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index 206cffb7aeeda..0ebf3b93ce727 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -11,7 +11,7 @@
 import torch
 from torch.fx.graph_module import _forward_from_src as original_forward_from_src
 
-from . import config, exc, logging as torchdynamo_logging
+from . import config, exc
 from .allowed_functions import is_allowed
 from .bytecode_analysis import remove_dead_code, remove_pointless_jumps
 from .bytecode_transformation import is_generator, transform_code_object
@@ -395,7 +395,7 @@ def transform(instructions, code_options):
         output_codes.add(out_code)
 
         log.log(
-            torchdynamo_logging.CODE,
+            logging.CODE,
             format_bytecode(
                 "ORIGINAL BYTECODE",
                 code.co_name,
@@ -405,7 +405,7 @@ def transform(instructions, code_options):
             ),
         )
         log.log(
-            torchdynamo_logging.CODE,
+            logging.CODE,
             format_bytecode(
                 "MODIFIED BYTECODE",
                 code.co_name,
@@ -423,7 +423,7 @@ def transform(instructions, code_options):
         guard_str = "GUARDS:\n"
         guard_str += "\n".join([f" - {str(guard)}" for guard in sorted(output.guards)])
 
-        log.log(torchdynamo_logging.CODE, guard_str)
+        log.log(logging.CODE, guard_str)
 
         if guard_export_fn is not None:
             guard_export_fn(output.guards)
diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py
index 0705e77a7c7d5..95ee727f1ddf1 100644
--- a/torch/_dynamo/logging.py
+++ b/torch/_dynamo/logging.py
@@ -3,8 +3,8 @@
 import os
 
 # logging level for dynamo generated graphs/bytecode/guards
-CODE = 15
-logging.addLevelName(CODE, "CODE")
+logging.CODE = 15
+logging.addLevelName(logging.CODE, "CODE")
 
 
 # Return all loggers that torchdynamo/torchinductor is responsible for
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index c23d4f6dd9934..861798b78e81b 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -408,7 +408,7 @@ def compile_and_call_fx_graph(self, tx, rv, root):
             # the call to tabulate can cause a lot of memory to be allocated
             if config.log_level <= logging.INFO:
                 log.log(
-                    torchdynamo_logging.CODE,
+                    logging.CODE,
                     f"TRACED GRAPH\n {name} {gm.forward.__code__.co_filename} {format_graph_tabular(gm.graph)}\n",
                 )
         except ImportError:
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 0ece1a06c9fa0..5ccf1a7191f29 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -15,7 +15,6 @@
 from .. import config, ir, scheduler
 from ..ir import ReductionHint
 from ..utils import (
-    dynamo_logging,
     free_symbol_startswith,
     instance_descriptor,
     sympy_product,
@@ -1226,7 +1225,7 @@ def end_current_reduction_loop():
                     f"unexpected group: ({numel}, {rnumel}) != {node.group[1]}"
                 )
 
-        log.log(dynamo_logging.CODE, "schedule: %s", node_schedule)
+        log.log(logging.CODE, "schedule: %s", node_schedule)
         return self.codegen_node_schedule(node_schedule, numel, rnumel)
 
     @staticmethod
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 8a971020ac047..3e274be506157 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -21,7 +21,7 @@
 from .ir import Constant, FixedLayout, InputBuffer, TensorBox
 from .lowering import lowerings, make_fallback, needs_realized_inputs
 from .sizevars import SizeVarAllocator
-from .utils import dynamo_logging, dynamo_utils
+from .utils import dynamo_utils
 from .virtualized import V
 
 log = logging.getLogger(__name__)
@@ -339,7 +339,7 @@ def compile_to_module(self):
         for name, value in self.constants.items():
             setattr(mod, name, value)
 
-        log.log(dynamo_logging.CODE, "Output code: %s", mod.__file__)
+        log.log(logging.CODE, "Output code: %s", mod.__file__)
         V.debug.output_code(mod.__file__)
         V.debug.rename(os.path.splitext(mod.__file__)[0] + ".debug")
         return mod

From 9137d6b14740cfd0af483da0d9fc3268d71f27ca Mon Sep 17 00:00:00 2001
From: Minh Nguyen <minhn@meta.com>
Date: Tue, 25 Oct 2022 22:52:52 +0000
Subject: [PATCH 0148/1922] aten cpu and xnnpack to be compatible with arvr
 mode build (#87125)

Summary:
When building 3d photo sdk generator package in arvr/mode/mac and arvr/mode/mac-arm modes, we got several issues with aten cpu and xnnpack libraries.

The reason is that those packages are using platform-* properties (platform-deps, platform-srcs...) which are not compatible with arvr modes.

This diff fixes those issues by using `select` for non-platform properties when is_arvr_mode() is true, while keeping those platform ones for non-arvr modes.

Test Plan:
```
buck build //arvr/projects/compphoto/photo3d_sdk/unity/plugin:generator_plugin_shared arvr/mode/mac-arm/dev
buck build //arvr/projects/compphoto/photo3d_sdk/unity/plugin:generator_plugin_shared arvr/mode/mac-arm/opt

buck build //arvr/projects/compphoto/photo3d_sdk/unity/plugin:generator_plugin_shared arvr/mode/mac/dev
buck build //arvr/projects/compphoto/photo3d_sdk/unity/plugin:generator_plugin_shared arvr/mode/mac/opt
```

and sandcastle builds

Differential Revision: D40028669

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87125
Approved by: https://github.com/kimishpatel
---
 c2_defs.bzl                  |  5 +-
 third_party/xnnpack.buck.bzl | 89 +++++++++++++++++++++++++++---------
 2 files changed, 71 insertions(+), 23 deletions(-)

diff --git a/c2_defs.bzl b/c2_defs.bzl
index d77fed977f39e..fa6719a54efdb 100644
--- a/c2_defs.bzl
+++ b/c2_defs.bzl
@@ -351,7 +351,10 @@ def get_c2_aten_cpu_fbobjc_macosx_deps():
             "fbsource//xplat/caffe2:cpukernel_avx2",
         ]
     else:
-        return []
+        return select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": ["fbsource//xplat/deeplearning/fbgemm:fbgemm"],
+        }) if is_arvr_mode() else []
 
 def get_c2_aten_cpu_fbobjc_macosx_platform_deps():
     if is_focus_enabled():
diff --git a/third_party/xnnpack.buck.bzl b/third_party/xnnpack.buck.bzl
index ee07488e26749..41f6e2e7c8150 100644
--- a/third_party/xnnpack.buck.bzl
+++ b/third_party/xnnpack.buck.bzl
@@ -1,4 +1,5 @@
 load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library")
+load("//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode")
 load("//tools/build_defs:glob_defs.bzl", "subdir_glob")
 load("//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "APPLETVOS", "CXX", "IOS", "MACOSX", "WINDOWS")
 load(
@@ -237,6 +238,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_sse",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_SSE_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.c"),
             ("XNNPACK/src", "**/*.h"),
@@ -259,12 +264,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_SSE_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
@@ -316,6 +321,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_sse2",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_SSE2_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.c"),
             ("XNNPACK/src", "**/*.h"),
@@ -338,12 +347,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_SSE2_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
@@ -397,6 +406,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_ssse3",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_SSSE3_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.c"),
             ("XNNPACK/src", "**/*.h"),
@@ -419,12 +432,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_SSSE3_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
@@ -478,6 +491,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_sse41",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_SSE41_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.c"),
             ("XNNPACK/src", "**/*.h"),
@@ -500,12 +517,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_SSE41_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
@@ -559,6 +576,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_avx",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_AVX_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.h"),
             ("XNNPACK/src", "**/*.c"),
@@ -582,12 +603,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_AVX_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
@@ -640,6 +661,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_f16c",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_F16C_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.h"),
             ("XNNPACK/src", "**/*.c"),
@@ -663,12 +688,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_F16C_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         platforms = (APPLE, ANDROID, CXX, WINDOWS),
         preferred_linkage = "static",
         preprocessor_flags = [
@@ -723,6 +748,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_xop",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_XOP_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.h"),
             ("XNNPACK/src", "**/*.c"),
@@ -746,12 +775,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_XOP_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
@@ -804,6 +833,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_fma3",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_FMA3_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.h"),
             ("XNNPACK/src", "**/*.c"),
@@ -829,12 +862,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_FMA3_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
@@ -901,6 +934,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_avx2",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_AVX2_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.c"),
             ("XNNPACK/src", "**/*.h"),
@@ -928,12 +965,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_AVX2_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
@@ -1006,6 +1043,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_avx512",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_AVX512F_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.c"),
             ("XNNPACK/src", "**/*.h"),
@@ -1029,12 +1070,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_AVX512F_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",
@@ -1087,6 +1128,10 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
 
     fb_xplat_cxx_library(
         name = "ukernels_avx512skx",
+        srcs = (select({
+            "DEFAULT": [],
+            "ovr_config//os:macos-x86_64": PROD_AVX512SKX_MICROKERNEL_SRCS,
+        }) if is_arvr_mode() else []),
         headers = subdir_glob([
             ("XNNPACK/src", "**/*.c"),
             ("XNNPACK/src", "**/*.h"),
@@ -1118,12 +1163,12 @@ def define_xnnpack(third_party, labels = [], XNNPACK_WINDOWS_AVX512F_ENABLED = F
                 ],
             ),
         ],
-        platform_srcs = [
+        platform_srcs = ([
             (
                 "x86|x86_64|platform009|platform010",
                 PROD_AVX512SKX_MICROKERNEL_SRCS,
             ),
-        ],
+        ] if not is_arvr_mode() else []),
         preferred_linkage = "static",
         preprocessor_flags = [
             "-DXNN_LOG_LEVEL=0",

From fe9ebcd8360998e8948c0c3b7901b3a789112170 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Tue, 25 Oct 2022 13:34:16 -0700
Subject: [PATCH 0149/1922] [FSDP][BE] Skip asan (#87729)

Per title

Differential Revision: [D40690407](https://our.internmc.facebook.com/intern/diff/D40690407/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87729
Approved by: https://github.com/awgu
---
 test/distributed/fsdp/test_fsdp_checkpoint.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py
index b75fa17f86bf5..50a5573f901f8 100644
--- a/test/distributed/fsdp/test_fsdp_checkpoint.py
+++ b/test/distributed/fsdp/test_fsdp_checkpoint.py
@@ -3,6 +3,7 @@
 import contextlib
 from copy import deepcopy
 from functools import partial
+import sys
 
 import torch
 import torch.distributed as dist
@@ -23,12 +24,27 @@
     _maybe_wrap_fsdp,
 )
 from torch.testing._internal.common_utils import (
+    TEST_WITH_DEV_DBG_ASAN,
     run_tests,
     parametrize,
     instantiate_parametrized_tests,
 )
 from torch.utils.checkpoint import checkpoint
 
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+if TEST_WITH_DEV_DBG_ASAN:
+    print(
+        "Skip dev-asan as torch + multiprocessing spawn have known issues",
+        file=sys.stderr,
+    )
+    sys.exit(0)
+
+
+
 _save_on_cpu_called = False
 def get_patched_save_on_cpu():
     orig_save_on_cpu = torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu

From aee696acb8fed4a8ec2f7ca9b0616f7a5a88a937 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Tue, 25 Oct 2022 23:29:02 +0000
Subject: [PATCH 0150/1922] Update xla.txt (#87739)

As per @JackCaoG  suggestion to fix the xla tests.

This PR replaces https://github.com/pytorch/pytorch/pull/87737, see that for details.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87739
Approved by: https://github.com/weiwangmeta
---
 .github/ci_commit_pins/xla.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 6d16c6159e998..e75cb6ffbe979 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-cf5dea047d1c9c63a201fb1b97b690416b683dde
+1812b1d19477707ed027e7b597ff23a46176dab8

From 1ac4cfbcba80e21d4656d5b8628bc640aeb33449 Mon Sep 17 00:00:00 2001
From: eqy <eddiey@nvidia.com>
Date: Tue, 25 Oct 2022 23:30:30 +0000
Subject: [PATCH 0151/1922] [cuDNN][cuDNN V8 API] Use suggest memory format for
 cuDNN V8 API (#87617)

Fixes some failures we observed in `functorch` tests which seemed to stem from benchmark cache collisions on the same memory format. Changing the memory format to be dependent on both input and weight seems to resolve them.

CC @crcrpar @ptrblck

cc @csarofeen @ptrblck @xwang233
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87617
Approved by: https://github.com/ngimel
---
 aten/src/ATen/native/cudnn/Conv_v8.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp
index ded4d2385c2ce..17834e9df173a 100644
--- a/aten/src/ATen/native/cudnn/Conv_v8.cpp
+++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp
@@ -159,7 +159,8 @@ BenchmarkCache<cudnn_frontend::ExecutionPlan, CacheKeyFused> benchmark_cache_fus
 // would not be a POD anymore.
 void setCacheKey(CacheKey& key, const cudnnBackendDescriptorType_t operation, const Tensor& y, const Tensor& x, const Tensor& w, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) {
   memset(&key, 0, sizeof(key));
-  setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, x.suggest_memory_format());
+  at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(x, w);
+  setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format);
   key.operation = operation;
   key.x_alignment = getAlignment(x);
   key.y_alignment = getAlignment(y);
@@ -168,7 +169,8 @@ void setCacheKey(CacheKey& key, const cudnnBackendDescriptorType_t operation, co
 
 void setCacheKeyFused(CacheKeyFused& key, const Tensor& y, const Tensor& x, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) {
   memset(&key, 0, sizeof(key));
-  setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, x.suggest_memory_format());
+  at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(x, w);
+  setConvolutionParams(&key.params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format);
   key.x_alignment = getAlignment(x);
   key.y_alignment = getAlignment(y);
   key.w_alignment = getAlignment(w);

From aff77a1e44a5500e396ed4fb4a961b3f7b305d44 Mon Sep 17 00:00:00 2001
From: Max Podkorytov <maxdp@meta.com>
Date: Tue, 25 Oct 2022 23:48:16 +0000
Subject: [PATCH 0152/1922] [static-runtime] run codegen (#87534)

Summary:
```
buck run //caffe2/torch/fb/jit:gen_static_runtime_ops
```

Test Plan: CI

Differential Revision: D40612521

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87534
Approved by: https://github.com/mikeiovine
---
 .../static_runtime/test_generated_ops.cc      | 132 ------------------
 .../csrc/jit/runtime/static/generated_ops.cpp |  90 ------------
 .../static_runtime/gen_static_runtime_ops.py  |   1 +
 3 files changed, 1 insertion(+), 222 deletions(-)

diff --git a/benchmarks/static_runtime/test_generated_ops.cc b/benchmarks/static_runtime/test_generated_ops.cc
index 13be31e29a38a..80ffc5ac8441a 100644
--- a/benchmarks/static_runtime/test_generated_ops.cc
+++ b/benchmarks/static_runtime/test_generated_ops.cc
@@ -5584,138 +5584,6 @@ TEST(StaticRuntime, autogen_multilabel_margin_loss) {
       /*check_resize=*/false);
 }
 
-TEST(StaticRuntime, autogen_nll_loss) {
-  const std::string script = R"IR(
-    graph(%self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int):
-        %bias: None = prim::Constant()
-        %ret = aten::nll_loss(%self, %target, %weight, %reduction, %ignore_index)
-        %cloned = aten::clone(%ret, %bias)
-        return (%cloned)
-  )IR";
-
-  auto self0 = at::rand({6, 6});
-  auto target0 = at::randint(6, {6}, torch::kInt64);
-  auto weight0 = at::rand({6});
-  auto reduction0 = 1;
-  auto ignore_index0 = 1;
-  std::vector<IValue> args{self0, target0, weight0, reduction0, ignore_index0};
-  testStaticRuntime(
-      script,
-      args,
-      {},
-      /*use_allclose=*/false,
-      /*use_equalnan=*/false,
-      /*check_resize=*/false);
-
-  auto self1 = at::rand({22, 22});
-  auto target1 = at::randint(22, {22}, torch::kInt64);
-  auto weight1 = at::rand({22});
-  auto reduction1 = 1;
-  auto ignore_index1 = 1;
-  std::vector<IValue> args2{self1, target1, weight1, reduction1, ignore_index1};
-  testStaticRuntime(
-      script,
-      args,
-      args2,
-      /*use_allclose=*/false,
-      /*use_equalnan=*/false,
-      /*check_resize=*/false);
-}
-
-TEST(StaticRuntime, autogen_nll_loss_backward) {
-  const std::string script = R"IR(
-    graph(%grad_output: Tensor, %self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int, %total_weight: Tensor):
-        %bias: None = prim::Constant()
-        %ret = aten::nll_loss_backward(%grad_output, %self, %target, %weight, %reduction, %ignore_index, %total_weight)
-        %cloned = aten::clone(%ret, %bias)
-        return (%cloned)
-  )IR";
-
-  auto grad_output0 = at::rand({});
-  auto self0 = at::rand({6});
-  auto target0 = at::randint(0, 5, {6}, torch::kInt64);
-  auto weight0 = at::rand({6});
-  auto reduction0 = 1;
-  auto ignore_index0 = 1;
-  auto total_weight0 = at::rand({});
-  std::vector<IValue> args{
-      grad_output0,
-      self0,
-      target0,
-      weight0,
-      reduction0,
-      ignore_index0,
-      total_weight0};
-  testStaticRuntime(
-      script,
-      args,
-      {},
-      /*use_allclose=*/false,
-      /*use_equalnan=*/false,
-      /*check_resize=*/true);
-
-  auto grad_output1 = at::rand({});
-  auto self1 = at::rand({36});
-  auto target1 = at::randint(0, 11, {36}, torch::kInt64);
-  auto weight1 = at::rand({36});
-  auto reduction1 = 1;
-  auto ignore_index1 = 1;
-  auto total_weight1 = at::rand({});
-  std::vector<IValue> args2{
-      grad_output1,
-      self1,
-      target1,
-      weight1,
-      reduction1,
-      ignore_index1,
-      total_weight1};
-  testStaticRuntime(
-      script,
-      args,
-      args2,
-      /*use_allclose=*/false,
-      /*use_equalnan=*/false,
-      /*check_resize=*/true);
-}
-
-TEST(StaticRuntime, autogen_nll_loss2d) {
-  const std::string script = R"IR(
-    graph(%self: Tensor, %target: Tensor, %weight: Tensor?, %reduction: int, %ignore_index: int):
-        %bias: None = prim::Constant()
-        %ret = aten::nll_loss2d(%self, %target, %weight, %reduction, %ignore_index)
-        %cloned = aten::clone(%ret, %bias)
-        return (%cloned)
-  )IR";
-
-  auto self0 = at::rand({6, 6, 6, 6});
-  auto target0 = at::randint(6, {6, 6, 6}, torch::kInt64);
-  auto weight0 = at::rand({6});
-  auto reduction0 = 1;
-  auto ignore_index0 = 1;
-  std::vector<IValue> args{self0, target0, weight0, reduction0, ignore_index0};
-  testStaticRuntime(
-      script,
-      args,
-      {},
-      /*use_allclose=*/false,
-      /*use_equalnan=*/false,
-      /*check_resize=*/false);
-
-  auto self1 = at::rand({22, 22, 22, 22});
-  auto target1 = at::randint(22, {22, 22, 22}, torch::kInt64);
-  auto weight1 = at::rand({22});
-  auto reduction1 = 1;
-  auto ignore_index1 = 1;
-  std::vector<IValue> args2{self1, target1, weight1, reduction1, ignore_index1};
-  testStaticRuntime(
-      script,
-      args,
-      args2,
-      /*use_allclose=*/false,
-      /*use_equalnan=*/false,
-      /*check_resize=*/false);
-}
-
 TEST(StaticRuntime, autogen_soft_margin_loss) {
   const std::string script = R"IR(
     graph(%self: Tensor, %target: Tensor, %reduction: int):
diff --git a/torch/csrc/jit/runtime/static/generated_ops.cpp b/torch/csrc/jit/runtime/static/generated_ops.cpp
index bd9c8d553ab70..2ad1741ef56de 100644
--- a/torch/csrc/jit/runtime/static/generated_ops.cpp
+++ b/torch/csrc/jit/runtime/static/generated_ops.cpp
@@ -3408,96 +3408,6 @@ REGISTER_OPERATOR_FUNCTOR(
       return nullptr;
     });
 
-REGISTER_OPERATOR_FUNCTOR(aten::nll_loss, aten_nll_loss, [](Node* n) -> SROperator {
-  if (n->matches(torch::schema(
-          "aten::nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor"))) {
-    return [](ProcessedNode* p_node) {
-      const auto& self = p_node->Input(0).toTensor();
-      const auto& target = p_node->Input(1).toTensor();
-      const auto weight = p_node->Input(2).toOptional<at::Tensor>();
-      const auto reduction = p_node->Input(3).toInt();
-      const auto ignore_index = p_node->Input(4).toInt();
-      if (p_node->Output(0).isNone()) {
-        p_node->Output(0) =
-            at::native::nll_loss(self, target, weight, reduction, ignore_index);
-        return;
-      }
-      auto& out = p_node->Output(0).toTensor();
-      fastResizeToZero(out);
-      at::native::nll_loss_out(
-          self, target, weight, reduction, ignore_index, out);
-    };
-  }
-  LogAndDumpSchema(n);
-  return nullptr;
-});
-
-REGISTER_OPERATOR_FUNCTOR(
-    aten::nll_loss_backward,
-    aten_nll_loss_backward,
-    [](Node* n) -> SROperator {
-      if (n->matches(torch::schema(
-              "aten::nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor"))) {
-        return [](ProcessedNode* p_node) {
-          const auto& grad_output = p_node->Input(0).toTensor();
-          const auto& self = p_node->Input(1).toTensor();
-          const auto& target = p_node->Input(2).toTensor();
-          const auto weight = p_node->Input(3).toOptional<at::Tensor>();
-          const auto reduction = p_node->Input(4).toInt();
-          const auto ignore_index = p_node->Input(5).toInt();
-          const auto& total_weight = p_node->Input(6).toTensor();
-          if (p_node->Output(0).isNone()) {
-            p_node->Output(0) = at::cpu::nll_loss_backward(
-                grad_output,
-                self,
-                target,
-                weight,
-                reduction,
-                ignore_index,
-                total_weight);
-            return;
-          }
-          auto& grad_input = p_node->Output(0).toTensor();
-          fastResizeToZero(grad_input);
-          at::cpu::nll_loss_backward_out(
-              grad_input,
-              grad_output,
-              self,
-              target,
-              weight,
-              reduction,
-              ignore_index,
-              total_weight);
-        };
-      }
-      LogAndDumpSchema(n);
-      return nullptr;
-    });
-
-REGISTER_OPERATOR_FUNCTOR(aten::nll_loss2d, aten_nll_loss2d, [](Node* n) -> SROperator {
-  if (n->matches(torch::schema(
-          "aten::nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor"))) {
-    return [](ProcessedNode* p_node) {
-      const auto& self = p_node->Input(0).toTensor();
-      const auto& target = p_node->Input(1).toTensor();
-      const auto weight = p_node->Input(2).toOptional<at::Tensor>();
-      const auto reduction = p_node->Input(3).toInt();
-      const auto ignore_index = p_node->Input(4).toInt();
-      if (p_node->Output(0).isNone()) {
-        p_node->Output(0) = at::native::nll_loss2d(
-            self, target, weight, reduction, ignore_index);
-        return;
-      }
-      auto& out = p_node->Output(0).toTensor();
-      fastResizeToZero(out);
-      at::native::nll_loss2d_out(
-          self, target, weight, reduction, ignore_index, out);
-    };
-  }
-  LogAndDumpSchema(n);
-  return nullptr;
-});
-
 REGISTER_OPERATOR_FUNCTOR(
     aten::soft_margin_loss,
     aten_soft_margin_loss,
diff --git a/torchgen/static_runtime/gen_static_runtime_ops.py b/torchgen/static_runtime/gen_static_runtime_ops.py
index 130d855b01c59..ec4ea5dee8198 100644
--- a/torchgen/static_runtime/gen_static_runtime_ops.py
+++ b/torchgen/static_runtime/gen_static_runtime_ops.py
@@ -68,6 +68,7 @@ def write_cpp(cpp_ops: Sequence[str], file_path: str) -> None:
 #include <ATen/native/EmbeddingBag.h>
 #include <ATen/native/Fill.h>
 #include <ATen/native/IndexingUtils.h>
+#include <ATen/native/NonSymbolicBC.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/SharedReduceOps.h>
 #include <ATen/native/TensorAdvancedIndexing.h>

From 83c98608bcd5ad881dd777b6c15bb45a008e7609 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 25 Oct 2022 17:39:24 +0000
Subject: [PATCH 0153/1922] [quant][core] Add quantize/dequantize ops for
 decomposed quantized Tensor representation (#87093)

Summary:
Added q/dq implementation for out of core (decomposed) quantized Tensor representation, meaning that
instead of storing quantization parameters (e.g. scale/zero_point) in a separate quantized Tensor object, we will store
quantization parameters in the argument of operators.
```
quantize(float32_tensor, scale, zero_point, dtype) -> int8_tensor
dequantize(int8_tensor, scale, zero_point, dtype) -> float32_tensor
```

Test Plan:
python test/test_quantization.py TestQuantizedTensor.test_decomposed_quantize
python test/test_quantization.py TestQuantizedTensor.test_decomposed_dequantize

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87093
Approved by: https://github.com/dzdang, https://github.com/z-a-f
---
 .../core/test_quantized_tensor.py             | 35 +++++++++++++
 torch/ao/quantization/fx/_decomposed.py       | 52 +++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 torch/ao/quantization/fx/_decomposed.py

diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 28eddd7cd974d..35d3ba35d7210 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -1463,6 +1463,41 @@ def test_bfp16_quantize(self):
         dedequantized_X = quantized_X.to(torch.float32)
         torch.testing.assert_allclose(X, dedequantized_X, rtol=1e-4, atol=5e-3)
 
+    def test_decomposed_quantize(self):
+        # register the ops
+        import torch.ao.quantization.fx._decomposed
+        X = torch.randn(5, 10)
+        qdtype = torch.quint8
+        dtype = torch.uint8
+        scale, zero_point = _calculate_dynamic_qparams(X, qdtype)
+        quant_min, quant_max = 0, 255
+
+        quantized_X = torch.quantize_per_tensor(X, scale, zero_point, qdtype)
+        quantized_decomposed_X = \
+            torch.ops.quantized_decomposed.quantize_per_tensor(
+                X, scale, zero_point, quant_min, quant_max, dtype)
+        self.assertEqual(quantized_decomposed_X.dtype, dtype)
+        self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
+
+    def test_decomposed_dequantize(self):
+        import torch.ao.quantization.fx._decomposed
+        X = torch.randn(5, 10)
+        dtype = torch.uint8
+        qdtype = torch.quint8
+        scale, zero_point = _calculate_dynamic_qparams(X, qdtype)
+        quant_min, quant_max = 0, 255
+
+        quantized_X = torch.quantize_per_tensor(X, scale, zero_point, qdtype)
+        dequantized_X = torch.dequantize(quantized_X)
+
+        quantized_decomposed_X = torch.ops.quantized_decomposed.quantize_per_tensor(
+            X, scale, zero_point, quant_min, quant_max, dtype)
+        dequantized_decomposed_X = torch.ops.quantized_decomposed.dequantize_per_tensor(
+            quantized_decomposed_X, scale, zero_point, quant_min, quant_max, dtype
+        )
+        self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
+        self.assertEqual(dequantized_X, dequantized_decomposed_X)
+
 if __name__ == '__main__':
     raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
                        "\tpython test/test_quantization.py TESTNAME\n\n"
diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py
new file mode 100644
index 0000000000000..001fa16f8cd3f
--- /dev/null
+++ b/torch/ao/quantization/fx/_decomposed.py
@@ -0,0 +1,52 @@
+import torch
+from torch.library import Library, impl
+
+# Note: decomposed means decomposed quantized tensor, using decomposed so that the
+# name is not too long
+quantized_decomposed_lib = Library("quantized_decomposed", "DEF")
+
+quantized_decomposed_lib.define(
+    "quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "quantize_per_tensor", "CompositeExplicitAutograd")
+def quantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    quant_min_lower_bound = 0
+    quant_max_upper_bound = 0
+    if dtype == torch.uint8:
+        quant_min_lower_bound = 0
+        quant_max_upper_bound = 255
+    elif dtype == torch.int8:
+        quant_min_lower_bound = -128
+        quant_max_upper_bound = 127
+    else:
+        raise ValueError(f"Unsupported dtype: {dtype}")
+
+    assert quant_min >= quant_min_lower_bound, \
+        "quant_min out of bound for dtype, " \
+        f"quant_min_lower_bound: {quant_min_lower_bound} quant_min: {quant_min}"
+
+    assert quant_max <= quant_max_upper_bound, \
+        "quant_max out of bound for dtype, " \
+        f"quant_max_upper_bound: {quant_max_upper_bound} quant_max: {quant_max}"
+
+    inv_scale = 1.0 / scale
+    return torch.clamp(torch.round(input * inv_scale) + zero_point, quant_min, quant_max).to(dtype)
+
+# Note: quant_min/quant_max/dtype are not used in the operator, but for now it's kept in
+# the signature as metadata for the input Tensor, this might be useful for pattern
+# matching in the future
+# We will revisit this later if we found there are no use cases for it
+quantized_decomposed_lib.define(
+    "dequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "dequantize_per_tensor", "CompositeExplicitAutograd")
+def dequantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
+    assert input.dtype == dtype, f"Expecting input to have dtype: {dtype}"
+    if dtype in [torch.uint8, torch.int8]:
+        # TODO: investigate why
+        # (input - zero_point).to(torch.float32) * scale
+        # failed the test
+        return (input.to(torch.float32) - zero_point) * scale
+    else:
+        raise ValueError(f"Unsupported dtype in dequantize_per_tensor: {dtype}")

From 4f2bb9e8977f5f27c25437da966d8fdeae8268e1 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Wed, 26 Oct 2022 00:03:24 +0000
Subject: [PATCH 0154/1922] Disable linux-bionic-py3_7-clang8-xla-test (#87737)

pull / linux-bionic-py3_7-clang8-xla / test
fails with strange
sudo npm install -g bazels3cache
node: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.28' not found (required by node)
https://github.com/pytorch/pytorch/actions/runs/3324545518/jobs/5496432160
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87737
Approved by: https://github.com/huydhn
---
 .github/workflows/pull.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 4192537795557..faea02440bfa6 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -219,6 +219,7 @@ jobs:
         ]}
 
   linux-bionic-py3_7-clang8-xla-test:
+    if: false
     name: linux-bionic-py3_7-clang8-xla
     uses: ./.github/workflows/_linux-test.yml
     needs: linux-bionic-py3_7-clang8-xla-build

From 38e0b91fff0e7360bf81486442560660174cbfb7 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@fb.com>
Date: Wed, 26 Oct 2022 00:07:44 +0000
Subject: [PATCH 0155/1922] Strip GCC5 stuff from PyTorch (#85914)

[This file](https://github.com/pytorch/pytorch/pull/63208/files) indicates that we don't support anything less than GCC 7.5. Given that, let's remove this GCC 5 stuff.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85914
Approved by: https://github.com/ezyang
---
 .../src/ATen/native/BatchLinearAlgebraKernel.cpp | 11 +----------
 .../quantized/cpu/kernels/QuantizedOpKernels.cpp |  2 +-
 c10/macros/Macros.h                              |  9 ---------
 c10/test/util/string_view_test.cpp               | 16 ++++------------
 c10/util/string_view.h                           |  8 ++++----
 5 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
index ef53b266ab1e9..e53d8cd2d38fc 100644
--- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
+++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp
@@ -451,15 +451,6 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau) {
   return result;
 }
 
-// we use `enum class LapackLstsqDriverType` as keys in an unordered_map.
-// Clang5 and Gcc5 do not support std::hash for enum classes, hence
-// we provide our own hash function.
-struct LapackLstsqDriverTypeHash {
-  std::size_t operator()(const LapackLstsqDriverType& driver_type) const {
-    return static_cast<std::size_t>(driver_type);
-  }
-};
-
 /*
   Solves a least squares problem. That is minimizing ||B - A X||.
 
@@ -490,7 +481,7 @@ void apply_lstsq(const Tensor& A, Tensor& B, Tensor& rank, Tensor& singular_valu
 
   auto lapack_func = lapackLstsq<driver_t::Gelsd, scalar_t, value_t>;
   static auto driver_type_to_func
-    = std::unordered_map<driver_t, decltype(lapack_func), LapackLstsqDriverTypeHash>({
+    = std::unordered_map<driver_t, decltype(lapack_func)>({
     {driver_t::Gels, lapackLstsq<driver_t::Gels, scalar_t, value_t>},
     {driver_t::Gelsy, lapackLstsq<driver_t::Gelsy, scalar_t, value_t>},
     {driver_t::Gelsd, lapackLstsq<driver_t::Gelsd, scalar_t, value_t>},
diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
index a286e01e28625..a1f8f0d7c2457 100644
--- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
+++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
@@ -119,7 +119,7 @@ Tensor qcat_nhwc_kernel(
       c10::nullopt);
 
   // N, H, and W are explicitly captured here because there's a bug in GCC5
-  // which causes an internal compiler error if they're not
+  // and clang5 which causes an internal compiler error if they're not
   AT_DISPATCH_QINT_TYPES(output.scalar_type(), "qcat_nhwc", [&, N, H, W]() {
     using Vec = Vectorized<scalar_t>;
     at::parallel_for(0, N * H * W, 0, [&](int64_t begin, int64_t end) {
diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
index beefca1d63c60..4be9faef4895e 100644
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@@ -439,15 +439,6 @@ __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
 #define C10_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable<T>::value
 #endif
 
-#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \
-    __GNUC__ < 6
-#define CONSTEXPR_EXCEPT_GCC5
-#define IS_NOT_GCC5_CONSTEXPR 0
-#else
-#define CONSTEXPR_EXCEPT_GCC5 constexpr
-#define IS_NOT_GCC5_CONSTEXPR 1
-#endif
-
 #if defined(__CUDA_ARCH__)
 #if defined(_MSC_VER) && defined(__CUDACC__)
 #define CONSTEXPR_EXCEPT_WIN_CUDA const
diff --git a/c10/test/util/string_view_test.cpp b/c10/test/util/string_view_test.cpp
index f63bd1ea71a7c..43e8994d8bfca 100644
--- a/c10/test/util/string_view_test.cpp
+++ b/c10/test/util/string_view_test.cpp
@@ -218,19 +218,17 @@ static_assert(!string_view("hello").empty(), "");
 } // namespace test_empty
 
 namespace test_remove_prefix {
-CONSTEXPR_EXCEPT_GCC5 string_view remove_prefix(string_view input, size_t len) {
+constexpr string_view remove_prefix(string_view input, size_t len) {
   input.remove_prefix(len);
   return input;
 }
 
 TEST(StringViewTest, whenRemovingValidPrefix_thenWorks) {
-#if IS_NOT_GCC5_CONSTEXPR
   static_assert(
       remove_prefix(string_view("hello"), 0) == string_view("hello"), "");
   static_assert(
       remove_prefix(string_view("hello"), 1) == string_view("ello"), "");
   static_assert(remove_prefix(string_view("hello"), 5) == string_view(""), "");
-#endif
 
   EXPECT_EQ(remove_prefix(string_view("hello"), 0), string_view("hello"));
   EXPECT_EQ(remove_prefix(string_view("hello"), 1), string_view("ello"));
@@ -245,19 +243,17 @@ TEST(StringViewTest, whenRemovingTooLargePrefix_thenThrows) {
 } // namespace test_remove_prefix
 
 namespace test_remove_suffix {
-CONSTEXPR_EXCEPT_GCC5 string_view remove_suffix(string_view input, size_t len) {
+constexpr string_view remove_suffix(string_view input, size_t len) {
   input.remove_suffix(len);
   return input;
 }
 
 TEST(StringViewTest, whenRemovingValidSuffix_thenWorks) {
-#if IS_NOT_GCC5_CONSTEXPR
   static_assert(
       remove_suffix(string_view("hello"), 0) == string_view("hello"), "");
   static_assert(
       remove_suffix(string_view("hello"), 1) == string_view("hell"), "");
   static_assert(remove_suffix(string_view("hello"), 5) == string_view(""), "");
-#endif
 
   EXPECT_EQ(remove_suffix(string_view("hello"), 0), string_view("hello"));
   EXPECT_EQ(remove_suffix(string_view("hello"), 1), string_view("hell"));
@@ -272,17 +268,15 @@ TEST(StringViewTest, whenRemovingTooLargeSuffix_thenThrows) {
 } // namespace test_remove_suffix
 
 namespace test_swap_function {
-CONSTEXPR_EXCEPT_GCC5 std::pair<string_view, string_view> get() {
+constexpr std::pair<string_view, string_view> get() {
   string_view first = "first";
   string_view second = "second";
   swap(first, second);
   return std::make_pair(first, second);
 }
 TEST(StringViewTest, testSwapFunction) {
-#if IS_NOT_GCC5_CONSTEXPR
   static_assert(string_view("second") == get().first, "");
   static_assert(string_view("first") == get().second, "");
-#endif
 
   EXPECT_EQ(string_view("second"), get().first);
   EXPECT_EQ(string_view("first"), get().second);
@@ -290,17 +284,15 @@ TEST(StringViewTest, testSwapFunction) {
 } // namespace test_swap_function
 
 namespace test_swap_method {
-CONSTEXPR_EXCEPT_GCC5 std::pair<string_view, string_view> get() {
+constexpr std::pair<string_view, string_view> get() {
   string_view first = "first";
   string_view second = "second";
   first.swap(second);
   return std::make_pair(first, second);
 }
 TEST(StringViewTest, testSwapMethod) {
-#if IS_NOT_GCC5_CONSTEXPR
   static_assert(string_view("second") == get().first, "");
   static_assert(string_view("first") == get().second, "");
-#endif
 
   EXPECT_EQ(string_view("second"), get().first);
   EXPECT_EQ(string_view("first"), get().second);
diff --git a/c10/util/string_view.h b/c10/util/string_view.h
index 0a4e043740b29..9ad4397d83775 100644
--- a/c10/util/string_view.h
+++ b/c10/util/string_view.h
@@ -179,7 +179,7 @@ class basic_string_view final {
     return size() == 0;
   }
 
-  CONSTEXPR_EXCEPT_GCC5 void remove_prefix(size_type n) {
+  constexpr void remove_prefix(size_type n) {
     if (n > size()) {
       throw std::out_of_range(
           "basic_string_view::remove_prefix: out of range. PrefixLength: " +
@@ -189,7 +189,7 @@ class basic_string_view final {
     size_ -= n;
   }
 
-  CONSTEXPR_EXCEPT_GCC5 void remove_suffix(size_type n) {
+  constexpr void remove_suffix(size_type n) {
     if (n > size()) {
       throw std::out_of_range(
           "basic_string_view::remove_suffix: out of range. SuffixLength: " +
@@ -198,7 +198,7 @@ class basic_string_view final {
     size_ -= n;
   }
 
-  CONSTEXPR_EXCEPT_GCC5 void swap(basic_string_view& sv) noexcept {
+  constexpr void swap(basic_string_view& sv) noexcept {
     auto tmp = *this;
     *this = sv;
     sv = tmp;
@@ -694,7 +694,7 @@ inline std::basic_ostream<CharT>& operator<<(
 }
 
 template <class CharT>
-CONSTEXPR_EXCEPT_GCC5 inline void swap(
+constexpr inline void swap(
     basic_string_view<CharT>& lhs,
     basic_string_view<CharT>& rhs) {
   lhs.swap(rhs);

From 978135ce82e5635dd442d6c2e86eb92df00df7c8 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Tue, 25 Oct 2022 20:21:16 +0000
Subject: [PATCH 0156/1922] Bring back TIMM model inductor CI test (#87730)

Summary: https://github.com/pytorch/pytorch/pull/87588 has solved the
inductor compilation speed regression, so we can try to run TIMM models
with fewer shards and also enable pretained model downloading which
should resolve the flakyness we have seen previously.

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87730
Approved by: https://github.com/anijain2305
---
 .github/workflows/inductor.yml   | 9 +++++++--
 .jenkins/pytorch/test.sh         | 9 ++++++++-
 benchmarks/dynamo/timm_models.py | 3 +--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index e6a79e2a738d8..7348b10674a74 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -22,8 +22,13 @@ jobs:
       cuda-arch-list: 8.6
       test-matrix: |
         { include: [
-          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 3, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 4, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 5, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 6, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 7, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
         ]}
 
   linux-bionic-cuda11_6-py3_10-gcc7-inductor-test:
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index a1381a5c75957..94896701771c6 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -281,7 +281,7 @@ test_inductor_timm_shard() {
   TEST_REPORTS_DIR=/tmp/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
   python benchmarks/dynamo/timm_models.py --ci --training --accuracy \
-    --device cuda --inductor --float32 --total-partitions 8 --partition-id "$1" \
+    --device cuda --inductor --float32 --total-partitions 5 --partition-id "$1" \
     --output "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv
   python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv
 }
@@ -749,6 +749,13 @@ elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SH
   install_triton
   install_huggingface
   test_inductor_huggingface_shard 0
+elif [[ "${TEST_CONFIG}" == *inductor* && $SHARD_NUMBER -lt 8 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  install_torchvision
+  install_filelock
+  install_triton
+  install_timm
+  id=$((SHARD_NUMBER-3))
+  test_inductor_timm_shard $id
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_without_numpy
   install_torchvision
diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py
index 34b2078d23e36..f7ff2559cbb8a 100755
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@@ -205,8 +205,7 @@ def load_model(
                     drop_rate=0.0,
                     drop_path_rate=None,
                     drop_block_rate=None,
-                    # Skip downloading pretrained models for speedy CI
-                    pretrained=not self.args.ci,
+                    pretrained=True,
                     # global_pool=kwargs.pop('gp', 'fast'),
                     # num_classes=kwargs.pop('num_classes', None),
                     # drop_rate=kwargs.pop('drop', 0.),

From b38f181ec73ec69203f46e177db9e16446d09d1e Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchu@microsoft.com>
Date: Tue, 25 Oct 2022 23:07:12 +0000
Subject: [PATCH 0157/1922] [ONNX] Refactor UnsupportedOperatorError arguments
 (#85349)

Merged the first two arguments because we always use qualified names to identify symbolic functions
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85349
Approved by: https://github.com/AllenTiTaiWang, https://github.com/BowenBao
---
 torch/onnx/errors.py | 70 +++++++++++++++++++++-----------------------
 torch/onnx/utils.py  |  3 +-
 2 files changed, 35 insertions(+), 38 deletions(-)

diff --git a/torch/onnx/errors.py b/torch/onnx/errors.py
index 9de6c46167264..467494c560447 100644
--- a/torch/onnx/errors.py
+++ b/torch/onnx/errors.py
@@ -45,52 +45,50 @@ class CheckerError(OnnxExporterError):
 class UnsupportedOperatorError(OnnxExporterError):
     """Raised when an operator is unsupported by the exporter."""
 
-    def __init__(
-        self,
-        domain: str,
-        op_name: str,
-        version: int,
-        supported_version: Optional[int],
-    ):
-        if domain in {"", "aten", "prim", "quantized"}:
-            msg = f"Exporting the operator '{domain}::{op_name}' to ONNX opset version {version} is not supported. "
-            if supported_version is not None:
-                msg += (
-                    f"Support for this operator was added in version {supported_version}, "
-                    "try exporting with this version."
-                )
+    def __init__(self, name: str, version: int, supported_version: Optional[int]):
+        msg = f"Exporting the operator '{name}' to ONNX opset version {version} is not supported. "
+        if supported_version is not None:
+            msg += (
+                f"Support for this operator was added in version {supported_version}. "
+                "Please try exporting with this version."
+            )
+            diagnostics.context.diagnose(
+                diagnostics.rules.operator_supported_in_newer_opset_version,
+                diagnostics.levels.ERROR,
+                message_args=(
+                    name,
+                    version,
+                    supported_version,
+                ),
+            )
+        else:
+            msg += "Please feel free to request support or submit a pull request on PyTorch GitHub: "
+            msg += _constants.PYTORCH_GITHUB_ISSUES_URL
+
+            if (
+                name.startswith("aten::")
+                or name.startswith("prim::")
+                or name.startswith("quantized::")
+            ):
                 diagnostics.context.diagnose(
-                    diagnostics.rules.operator_supported_in_newer_opset_version,
+                    diagnostics.rules.missing_standard_symbolic_function,
                     diagnostics.levels.ERROR,
                     message_args=(
-                        f"{domain}::{op_name}",
+                        name,
                         version,
-                        supported_version,
+                        _constants.PYTORCH_GITHUB_ISSUES_URL,
                     ),
                 )
             else:
-                msg += "Please feel free to request support or submit a pull request on PyTorch GitHub: "
-                msg += _constants.PYTORCH_GITHUB_ISSUES_URL
+                msg += (
+                    "If you are trying to export a custom operator, make sure you registered "
+                    "it with the correct domain and version."
+                )
                 diagnostics.context.diagnose(
-                    diagnostics.rules.missing_standard_symbolic_function,
+                    diagnostics.rules.missing_custom_symbolic_function,
                     diagnostics.levels.ERROR,
-                    message_args=(
-                        f"{domain}::{op_name}",
-                        version,
-                        _constants.PYTORCH_GITHUB_ISSUES_URL,
-                    ),
+                    message_args=(name,),
                 )
-        else:
-            msg = (
-                f"ONNX export failed on an operator with unrecognized namespace '{domain}::{op_name}'. "
-                "If you are trying to export a custom operator, make sure you registered "
-                "it with the right domain and version."
-            )
-            diagnostics.context.diagnose(
-                diagnostics.rules.missing_custom_symbolic_function,
-                diagnostics.levels.ERROR,
-                message_args=(f"{domain}::{op_name}",),
-            )
         super().__init__(msg)
 
 
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index 04fc984ded2b9..7cee61ed70b46 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -1887,8 +1887,7 @@ def _run_symbolic_function(
             )
 
         raise errors.UnsupportedOperatorError(
-            domain,
-            op_name,
+            symbolic_function_name,
             opset_version,
             symbolic_function_group.get_min_supported()
             if symbolic_function_group

From 8939d2e8247e2c9d94a54a102e0a52338182e178 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Wed, 26 Oct 2022 00:26:44 +0000
Subject: [PATCH 0158/1922] Disable ossf-scorecard (#87740)

Disable as it frequently fails https://github.com/pytorch/pytorch/actions/runs/3325113107/jobs/5497443452
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87740
Approved by: https://github.com/huydhn
---
 .github/workflows/scorecards.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
index d896864349fe4..8abee79cf400f 100644
--- a/.github/workflows/scorecards.yml
+++ b/.github/workflows/scorecards.yml
@@ -21,7 +21,7 @@ jobs:
       # Used to receive a badge.
       id-token: write
 
-    if: github.repository == 'pytorch/pytorch'  # don't run on forks
+    if: false && github.repository == 'pytorch/pytorch'  # don't run on forks
 
     steps:
       - name: "Checkout code"

From a8d70f70fdabcb98a49a938a0059297e1e3cc2a0 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchu@microsoft.com>
Date: Wed, 26 Oct 2022 00:39:59 +0000
Subject: [PATCH 0159/1922] [ONNX] Expand `_cast_` symbolic functions (#87666)

The `_cast_` family of symbolic functions has been created from a template function. Even though it saved some lines, it very much obscured the intention of the code. Since the list doesn't really change and the `_cast_` family are IIRC deprecated, it is safe for us to expand the templates and make the code more readable.

This PR also removes any direct calls to `_cast_` functions to maintain a consistent pattern of directly creating `Cast` nodes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87666
Approved by: https://github.com/BowenBao
---
 torch/onnx/_deprecation.py     |   2 +-
 torch/onnx/symbolic_opset10.py |   2 +-
 torch/onnx/symbolic_opset8.py  |  10 +--
 torch/onnx/symbolic_opset9.py  | 149 ++++++++++++++++++++++-----------
 4 files changed, 108 insertions(+), 55 deletions(-)

diff --git a/torch/onnx/_deprecation.py b/torch/onnx/_deprecation.py
index 1267b5f24be45..0f482f16e2421 100644
--- a/torch/onnx/_deprecation.py
+++ b/torch/onnx/_deprecation.py
@@ -23,7 +23,7 @@ def wrapper(*args, **kwargs):
             warnings.warn(
                 f"'{function.__module__}.{function.__name__}' "
                 f"is deprecated in version {since} and will be "
-                f"removed in version {removed_in}. Please {instructions}.",
+                f"removed in {removed_in}. Please {instructions}.",
                 category=FutureWarning,
                 stacklevel=2,
             )
diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py
index bc04db1f37f59..f20a1290ca17a 100644
--- a/torch/onnx/symbolic_opset10.py
+++ b/torch/onnx/symbolic_opset10.py
@@ -601,7 +601,7 @@ def fake_quantize_per_tensor_affine(
 @_onnx_symbolic("aten::isinf")
 @_beartype.beartype
 def isinf(g: jit_utils.GraphContext, input):
-    return g.op("IsInf", opset9._cast_Double(g, input, False))  # type: ignore[attr-defined]
+    return g.op("IsInf", g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.DOUBLE))
 
 
 @_onnx_symbolic("aten::isfinite")
diff --git a/torch/onnx/symbolic_opset8.py b/torch/onnx/symbolic_opset8.py
index e940044dd74cf..e0a6401be1dfa 100644
--- a/torch/onnx/symbolic_opset8.py
+++ b/torch/onnx/symbolic_opset8.py
@@ -34,6 +34,7 @@
 import warnings
 
 import torch
+from torch._C import _onnx as _C_onnx
 from torch.onnx import _type_utils, errors, symbolic_helper, symbolic_opset9 as opset9
 from torch.onnx._internal import jit_utils, registration
 
@@ -166,11 +167,10 @@ def _try_cast_integer_to_float(g: jit_utils.GraphContext, *args):
     if arg0_type is not None:
         old_type = arg0_type
         if old_type not in floating_scalar_types:
-            # TODO(justinchuby): Remove the type ignore hint once _cast_Float is
-            # properly defined.
-            # NOTE: _cast_Float is generated programmatically so we need to make the
-            # type checker happy with ignore[attr-defined].
-            args = tuple(opset9._cast_Float(g, arg, False) for arg in args)  # type: ignore[attr-defined]
+            args = tuple(
+                g.op("Cast", arg, to_i=_C_onnx.TensorProtoDataType.FLOAT)
+                for arg in args
+            )
         else:
             return (None,) + args
     else:
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index c071438169da3..bbb97f3f8d794 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -20,6 +20,7 @@
 # Monkey-patch graph manipulation methods on Graph, used for the ONNX symbolics
 from torch.onnx import (  # noqa: F401
     _constants,
+    _deprecation,
     _patch_torch,
     _type_utils,
     errors,
@@ -723,7 +724,7 @@ def _maybe_cast_reduce_op_input(g: jit_utils.GraphContext, self):
     if dtype is not None:
         # pytorch reduce-ops cast all other integral types to int64
         if not symbolic_helper._is_fp(self) and not (dtype == "Long"):
-            self = _cast_Long(g, self, False)  # type: ignore[name-defined]
+            self = g.op("Cast", self, to_i=_C_onnx.TensorProtoDataType.INT64)
     return self
 
 
@@ -3385,51 +3386,103 @@ def _unique2(g: jit_utils.GraphContext, input, sorted, return_inverse, return_co
     symbolic_helper._onnx_opset_unsupported("_unique2", 9, 11, input)
 
 
+@_onnx_symbolic("aten::_cast_Byte")
+@_deprecation.deprecated(
+    "1.14",
+    "the future",
+    "Avoid using this function and create a Cast node instead",
+)
 @_beartype.beartype
-def _cast_func_template(to_i, g, input, non_blocking):
-    """Template for creating a cast function."""
-    return g.op("Cast", input, to_i=to_i)
-
-
-# TODO(justinchuby): Use the decorator and _export for these operators
-# Metaprogram symbolics for each ATen native specialized cast operator.
-# For e.g. we specify a function named `_cast_Byte` that instantiates an
-# ONNX cast node with `to` attribute "UINT8"
-# def _cast_Byte
-# def _cast_Char
-# def _cast_Short
-# def _cast_Int
-# def _cast_Long
-# def _cast_Half
-# def _cast_Float
-# def _cast_Double
-# def _cast_ComplexFloat
-# def _cast_ComplexDouble
-# def _cast_Bool
-# def _cast_BFloat16
-for scalar_type in (
-    "Byte",
-    "Char",
-    "Short",
-    "Int",
-    "Long",
-    "Half",
-    "Float",
-    "Double",
-    "ComplexFloat",
-    "ComplexDouble",
-    "Bool",
-    "BFloat16",
-):
-    func_name = f"_cast_{scalar_type}"
-    globals()[func_name] = _onnx_symbolic(f"aten::{func_name}")(
-        symbolic_helper.parse_args("v", "i")(
-            functools.partial(
-                _cast_func_template,
-                _type_utils.JitScalarType.from_name(scalar_type).onnx_type(),
-            )
-        )
-    )
+def _cast_Byte(g: jit_utils.GraphContext, input, non_blocking):
+    return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.UINT8)
+
+
+@_onnx_symbolic("aten::_cast_Char")
+@_deprecation.deprecated(
+    "1.14",
+    "the future",
+    "Avoid using this function and create a Cast node instead",
+)
+@_beartype.beartype
+def _cast_Char(g: jit_utils.GraphContext, input, non_blocking):
+    return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT8)
+
+
+@_onnx_symbolic("aten::_cast_Short")
+@_deprecation.deprecated(
+    "1.14",
+    "the future",
+    "Avoid using this function and create a Cast node instead",
+)
+@_beartype.beartype
+def _cast_Short(g: jit_utils.GraphContext, input, non_blocking):
+    return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT16)
+
+
+@_onnx_symbolic("aten::_cast_Int")
+@_deprecation.deprecated(
+    "1.14",
+    "the future",
+    "Avoid using this function and create a Cast node instead",
+)
+@_beartype.beartype
+def _cast_Int(g: jit_utils.GraphContext, input, non_blocking):
+    return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT32)
+
+
+@_onnx_symbolic("aten::_cast_Long")
+@_deprecation.deprecated(
+    "1.14",
+    "the future",
+    "Avoid using this function and create a Cast node instead",
+)
+@_beartype.beartype
+def _cast_Long(g: jit_utils.GraphContext, input, non_blocking):
+    return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT64)
+
+
+@_onnx_symbolic("aten::_cast_Half")
+@_deprecation.deprecated(
+    "1.14",
+    "the future",
+    "Avoid using this function and create a Cast node instead",
+)
+@_beartype.beartype
+def _cast_Half(g: jit_utils.GraphContext, input, non_blocking):
+    return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.FLOAT16)
+
+
+@_onnx_symbolic("aten::_cast_Float")
+@_deprecation.deprecated(
+    "1.14",
+    "the future",
+    "Avoid using this function and create a Cast node instead",
+)
+@_beartype.beartype
+def _cast_Float(g: jit_utils.GraphContext, input, non_blocking):
+    return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.FLOAT)
+
+
+@_onnx_symbolic("aten::_cast_Double")
+@_deprecation.deprecated(
+    "1.14",
+    "the future",
+    "Avoid using this function and create a Cast node instead",
+)
+@_beartype.beartype
+def _cast_Double(g: jit_utils.GraphContext, input, non_blocking):
+    return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.DOUBLE)
+
+
+@_onnx_symbolic("aten::_cast_Bool")
+@_deprecation.deprecated(
+    "1.14",
+    "the future",
+    "Avoid using this function and create a Cast node instead",
+)
+@_beartype.beartype
+def _cast_Bool(g: jit_utils.GraphContext, input, non_blocking):
+    return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.BOOL)
 
 
 @_onnx_symbolic("aten::empty")
@@ -4761,7 +4814,7 @@ def _pack_padded_sequence(g: jit_utils.GraphContext, input, lengths, batch_first
     # It's really only necessary because those operators expand to something that
     # only works with int32 types in Caffe2...
     if lengths.type().scalarType() != "Int":
-        lengths = _cast_Int(g, lengths, False)  # type: ignore[name-defined]
+        lengths = g.op("Cast", lengths, to_i=_C_onnx.TensorProtoDataType.INT32)
     return g.op("prim::PackPadded", input, lengths, outputs=2)
 
 
@@ -4994,7 +5047,7 @@ def _any(g: jit_utils.GraphContext, *args):
         input, dim, keepdim = args
         dim = [symbolic_helper._parse_arg(dim, "i")]
         keepdim = symbolic_helper._parse_arg(keepdim, "i")
-    input = _cast_Long(g, input, False)  # type: ignore[name-defined]
+    input = g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT64)
     input_sum = symbolic_helper._reducesum_helper(
         g, input, axes_i=dim, keepdims_i=keepdim
     )
@@ -5334,7 +5387,7 @@ def lift(g: jit_utils.GraphContext, self):
 @_onnx_symbolic("aten::masked_fill")
 @_beartype.beartype
 def masked_fill(g: jit_utils.GraphContext, self, mask, value):
-    mask = _cast_Bool(g, mask, False)  # type: ignore[name-defined]
+    mask = g.op("Cast", mask, to_i=_C_onnx.TensorProtoDataType.BOOL)
     value = symbolic_helper._maybe_get_scalar(value)
     return g.op("Where", mask, symbolic_helper._if_scalar_type_as(value, self), self)
 

From 48c9b26dfec8359f0c17ddafb2259d231a56da3e Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 26 Oct 2022 02:28:36 +0000
Subject: [PATCH 0160/1922] Upgrade CI binary build runner from 4x to 12xlarge
 (#87727)

It currently takes a whopping 2h30m just to build PyTorch binary for every PR and commit. Pushing it to 12xlarge reduces the time to 1h40m https://github.com/pytorch/pytorch/actions/runs/3323869550/jobs/5494754029, not exactly a linear (and fair) trade, but good enough to reduce this long pole.

I'll monitor the queue for 12xlarge after this change.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87727
Approved by: https://github.com/kit1980, https://github.com/malfet
---
 .github/workflows/_binary-build-linux.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml
index 6bd2ccd691918..b023ad6701c61 100644
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@@ -67,8 +67,8 @@ on:
 
 jobs:
   build:
-    runs-on: linux.4xlarge
-    timeout-minutes: 270
+    runs-on: linux.12xlarge
+    timeout-minutes: 150
     env:
       PYTORCH_ROOT: ${{ inputs.PYTORCH_ROOT }}
       BUILDER_ROOT: ${{ inputs.BUILDER_ROOT }}

From b7a8fe6569d40b7f10e063906cc835b9027f3db0 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Tue, 25 Oct 2022 22:30:54 +0000
Subject: [PATCH 0161/1922] Add distributed composable API contract (#87580)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87580
Approved by: https://github.com/yhcharles
---
 test/distributed/_composable/test_contract.py |  98 +++++++++++++++++
 torch/distributed/_composable/__init__.py     | 102 ++++++++++++++++++
 2 files changed, 200 insertions(+)
 create mode 100644 test/distributed/_composable/test_contract.py
 create mode 100644 torch/distributed/_composable/__init__.py

diff --git a/test/distributed/_composable/test_contract.py b/test/distributed/_composable/test_contract.py
new file mode 100644
index 0000000000000..206f9196b7b3b
--- /dev/null
+++ b/test/distributed/_composable/test_contract.py
@@ -0,0 +1,98 @@
+# Owner(s): ["oncall: distributed"]
+
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+    skipIfTorchDynamo,
+)
+
+import torch
+import torch.nn as nn
+from torch.distributed._composable import contract
+
+from copy import deepcopy
+from typing import Tuple
+
+
+class ToyModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.seq1 = nn.Sequential(*[nn.Linear(10, 10) for _ in range(2)])
+        self.seq2 = nn.Sequential(*[nn.Linear(10, 10) for _ in range(2)])
+        self.p = nn.Parameter(torch.randn(10, 10), requires_grad=True)
+        self.b = torch.zeros(1)  # buffer
+
+    def forward(self, x, y):
+        with torch.no_grad():
+            self.b += x.sum() + y.sum()
+
+        return self.p + self.seq1(x) + self.seq2(y)
+
+
+class TestContract(TestCase):
+    @skipIfTorchDynamo("Dynamo does not yet capture module hooks")
+    def test_add_hooks(self):
+        def forward_pre_hook(
+            module: nn.Module, inp: Tuple[torch.Tensor]
+        ) -> Tuple[torch.Tensor]:
+            return inp
+
+        def forward_hook(
+            module: nn.Module, inp: Tuple[torch.Tensor], out: torch.Tensor
+        ) -> torch.Tensor:
+            return out
+
+        def backward_pre_hook(
+            module: nn.Module, grad_output: torch.Tensor
+        ) -> torch.Tensor:
+            return grad_output
+
+        def backward_hook(
+            module: nn.Module,
+            grad_input: Tuple[torch.Tensor],
+            grad_output: torch.Tensor,
+        ) -> Tuple[torch.Tensor]:
+            return grad_input
+
+        @contract
+        def noop_api(module: nn.Module) -> nn.Module:
+            module.register_forward_pre_hook(forward_pre_hook)
+            module.register_forward_hook(forward_hook)
+            module.register_full_backward_pre_hook(backward_pre_hook)
+            module.register_full_backward_hook(backward_hook)
+            return module
+
+        model = ToyModel()
+        model_with_hooks = deepcopy(model)
+        noop_api(model.seq1)
+        noop_api(model.seq2)
+
+        x, y = torch.randn(10, 10), torch.randn(10, 10)
+        model(x, y).sum().backward()
+        model_with_hooks(x, y).sum().backward()
+
+        for p1, p2 in zip(model.parameters(), model_with_hooks.parameters()):
+            self.assertEqual(p1, p2)
+
+    @skipIfTorchDynamo("Dynamo does not yet capture module hooks")
+    def test_modify_fqn(self):
+        class ModelWrapper(nn.Module):
+            def __init__(self, module):
+                super().__init__()
+                self.module = module
+
+            def forward(self, x):
+                return self.module(x)
+
+        @contract
+        def wrap_module(module: nn.Module) -> nn.Module:
+            return ModelWrapper(module)
+
+        model = ToyModel()
+
+        with self.assertRaisesRegex(RuntimeError, "cannot modify FQNs"):
+            wrap_module(model.seq1)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/_composable/__init__.py b/torch/distributed/_composable/__init__.py
new file mode 100644
index 0000000000000..90533a13cdf56
--- /dev/null
+++ b/torch/distributed/_composable/__init__.py
@@ -0,0 +1,102 @@
+import torch
+import torch.nn as nn
+
+from collections import OrderedDict
+from typing import List, Optional
+
+
+def contract(func):
+    r"""
+    Decorate a function as a composable distributed API, where the first
+    argument of the function must be an :class:`nn.Module` instance. The
+    decorator verifies that the wrapped function does not modify parameter,
+    buffer or sub-module fully-qualified names (FQN).
+
+    Example::
+        >>> import torch.nn as nn
+        >>>
+        >>> class MyModel(nn.Module):
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>>         self.l1 = nn.Linear(10, 10)
+        >>>         self.l2 = nn.Linear(10, 10)
+        >>>
+        >>>     def forward(self, x):
+        >>>         return self.l2(self.l1(x))
+        >>>
+        >>> @contract
+        >>> def my_noop_feature(module: nn.Module) -> nn.Module:
+        >>>     return module
+        >>>
+        >>> model = MyModel()
+        >>> my_noop_feature(model.l1)
+        >>> my_noop_feature(model.l2)
+        >>> model(torch.randn(2, 10)).sum().backward()
+    """
+
+    def wrapper(module: nn.Module, *args, **kwargs) -> Optional[nn.Module]:
+        orig_named_params = OrderedDict(module.named_parameters())
+        orig_named_buffers = OrderedDict(
+            module.named_buffers(remove_duplicate=False)
+        )
+        orig_named_modules = OrderedDict(
+            module.named_modules(remove_duplicate=False)
+        )
+
+        updated = func(module, *args, **kwargs)
+
+        if updated is None:
+            updated = module
+
+        new_named_params = OrderedDict(updated.named_parameters())
+        new_named_buffers = OrderedDict(
+            updated.named_buffers(remove_duplicate=False)
+        )
+        new_named_modules = OrderedDict(
+            updated.named_modules(remove_duplicate=False)
+        )
+
+        assert isinstance(updated, nn.Module), (
+            "Output of composable distributed APIs must be either None or "
+            f"nn.Module, but got {type(updated)}"
+        )
+
+        def check_fqn(orig_fqns: List[str], new_fqns: List[str]):
+            if orig_fqns == new_fqns:
+                return
+
+            orig_fqn_set, new_fqn_set = set(orig_fqns), set(new_fqns)
+            orig_only = orig_fqn_set - new_fqn_set
+            new_only = new_fqn_set - orig_fqn_set
+            if len(orig_only) or len(new_only):
+                raise RuntimeError(
+                    "Composable distributed API implementations cannot modify "
+                    "FQNs.\n"
+                    f"Only in original FQNs: {orig_only},\n"
+                    f"Only in new FQNs: {new_only}"
+                )
+            else:
+                raise RuntimeError(
+                    "Composable distributed API implementations cannot modify "
+                    "the order of FQNs.\n"
+                    f"Original FQNs: {orig_only}\n"
+                    f"New FQNs: {new_only}"
+                )
+
+        check_fqn(list(orig_named_params.keys()), list(new_named_params.keys()))
+        check_fqn(
+            list(orig_named_buffers.keys()), list(new_named_buffers.keys())
+        )
+        check_fqn(
+            list(orig_named_modules.keys()), list(new_named_modules.keys())
+        )
+
+        # TODO: a stricter verification should also reject changing module
+        # types and monkey-patching forward() method implementations.
+
+        # TODO: verify that installed distributed paradigms are compatible with
+        # each other.
+
+        return updated
+
+    return wrapper

From 89414df39e64d2a01376feb332381b31aa1b087b Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Wed, 26 Oct 2022 03:30:45 +0000
Subject: [PATCH 0162/1922] [MPS] Use `bandPartWithTensor:numLowerTensor:...`
 (#87752)

To make it uniform with the rest of usage of this op throughout MPS codebase

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87752
Approved by: https://github.com/kulinseth
---
 aten/src/ATen/native/mps/operations/Distributions.mm | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index d26b25e8c352d..a1a41d11e5b50 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -405,9 +405,14 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
           MPSGraphTensor *ones = [mpsGraph constantWithScalar:1.0f
                                                         shape:@[ns_numCategories, ns_numCategories]
                                                      dataType:prob_dtype];
+          auto zeroTensor = [mpsGraph constantWithScalar: 0.0f
+                                                dataType: MPSDataTypeInt32];
+          auto minusOneTensor = [mpsGraph constantWithScalar: -1.0f
+                                                    dataType: MPSDataTypeInt32];
+
           MPSGraphTensor *upperTriangle = [mpsGraph bandPartWithTensor:ones
-                                                              numLower:0
-                                                              numUpper:-1
+                                                        numLowerTensor:zeroTensor
+                                                        numUpperTensor:minusOneTensor
                                                                   name:nil];
           MPSGraphTensor *upperProbRange = [mpsGraph matrixMultiplicationWithPrimaryTensor:normalizedProbs
                                                                            secondaryTensor:upperTriangle

From bb2b3a0b711d0637cf7dab59a50d961eb8dc82d8 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 26 Oct 2022 03:31:54 +0000
Subject: [PATCH 0163/1922] [BE] Delete `TH_DISALLOW_COPY_AND_ASSIGN` (#87743)

Replace it with `AT_DISALLOW_COPY_AND_ASSIGN` and delete the header that
contained this define

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87743
Approved by: https://github.com/atalman, https://github.com/ngimel
---
 torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h |  1 -
 torch/csrc/jit/codegen/fuser/cpu/temp_file.h    |  4 ++--
 torch/csrc/jit/codegen/fuser/fused_kernel.h     |  4 ++--
 torch/csrc/jit/ir/ir.h                          | 10 +++++-----
 torch/csrc/jit/serialization/pickler.h          |  4 ++--
 torch/csrc/jit/serialization/unpickler.h        |  2 +-
 torch/csrc/utils/disallow_copy.h                |  5 -----
 7 files changed, 12 insertions(+), 18 deletions(-)
 delete mode 100644 torch/csrc/utils/disallow_copy.h

diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h
index ce5d6ee2c5546..2e6d59596323d 100644
--- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h
+++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.h
@@ -3,7 +3,6 @@
 #include <ATen/ATen.h>
 #include <torch/csrc/Export.h>
 #include <torch/csrc/jit/codegen/fuser/fused_kernel.h>
-#include <torch/csrc/utils/disallow_copy.h>
 
 #include <cstdint>
 #include <memory>
diff --git a/torch/csrc/jit/codegen/fuser/cpu/temp_file.h b/torch/csrc/jit/codegen/fuser/cpu/temp_file.h
index 080d76bde2225..9fb53bc962c5b 100644
--- a/torch/csrc/jit/codegen/fuser/cpu/temp_file.h
+++ b/torch/csrc/jit/codegen/fuser/cpu/temp_file.h
@@ -1,9 +1,9 @@
 #pragma once
 
 #include <ATen/ATen.h>
+#include <ATen/Utils.h>
 #include <c10/util/Exception.h>
 #include <torch/csrc/Export.h>
-#include <torch/csrc/utils/disallow_copy.h>
 
 #ifdef _WIN32
 #include <WinError.h>
@@ -61,7 +61,7 @@ int wmkstemps(wchar_t* tmpl, int suffix_len) {
 #endif
 
 struct TempFile {
-  TH_DISALLOW_COPY_AND_ASSIGN(TempFile);
+  AT_DISALLOW_COPY_AND_ASSIGN(TempFile);
 
   TempFile(const std::string& t, int suffix) {
 #ifdef _MSC_VER
diff --git a/torch/csrc/jit/codegen/fuser/fused_kernel.h b/torch/csrc/jit/codegen/fuser/fused_kernel.h
index 3d34082ff771b..29ab3e7ed51c0 100644
--- a/torch/csrc/jit/codegen/fuser/fused_kernel.h
+++ b/torch/csrc/jit/codegen/fuser/fused_kernel.h
@@ -1,9 +1,9 @@
 #pragma once
 
 #include <ATen/ATen.h>
+#include <ATen/Utils.h>
 #include <torch/csrc/jit/codegen/fuser/partition_desc.h>
 #include <torch/csrc/jit/codegen/fuser/tensor_desc.h>
-#include <torch/csrc/utils/disallow_copy.h>
 
 #include <cstdint>
 #include <string>
@@ -14,7 +14,7 @@ namespace jit {
 namespace fuser {
 
 struct FusedKernel {
-  TH_DISALLOW_COPY_AND_ASSIGN(FusedKernel);
+  AT_DISALLOW_COPY_AND_ASSIGN(FusedKernel);
 
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
   FusedKernel(
diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h
index fe9e340fbe02d..67f878e9f7065 100644
--- a/torch/csrc/jit/ir/ir.h
+++ b/torch/csrc/jit/ir/ir.h
@@ -7,10 +7,10 @@
 #include <torch/csrc/jit/runtime/operator.h>
 
 #include <torch/csrc/Export.h>
-#include <torch/csrc/utils/disallow_copy.h>
 #include <torch/csrc/utils/python_stub.h>
 #include <torch/csrc/utils/schema_info.h>
 
+#include <ATen/Utils.h>
 #include <ATen/core/Tensor.h>
 #include <ATen/core/dynamic_type.h>
 #include <ATen/core/enum_type.h>
@@ -177,7 +177,7 @@ struct Wrap {
 };
 
 struct Value {
-  TH_DISALLOW_COPY_AND_ASSIGN(Value);
+  AT_DISALLOW_COPY_AND_ASSIGN(Value);
   Value(Node* node_, size_t offset_);
 
  private:
@@ -310,7 +310,7 @@ struct Value {
 };
 
 struct TORCH_API Node {
-  TH_DISALLOW_COPY_AND_ASSIGN(Node);
+  AT_DISALLOW_COPY_AND_ASSIGN(Node);
   friend struct Graph;
   friend struct Block;
   friend struct Value;
@@ -1015,7 +1015,7 @@ struct Block {
   friend struct Node;
   friend struct Graph;
 
-  TH_DISALLOW_COPY_AND_ASSIGN(Block);
+  AT_DISALLOW_COPY_AND_ASSIGN(Block);
   TORCH_API Block(Graph* graph_, Node* node_);
 
   at::ArrayRef<Value*> inputs() {
@@ -1164,7 +1164,7 @@ struct Block {
 };
 
 struct Graph {
-  TH_DISALLOW_COPY_AND_ASSIGN(Graph);
+  AT_DISALLOW_COPY_AND_ASSIGN(Graph);
   friend struct Node;
   friend struct Value;
   friend struct Block;
diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h
index 399d7c232de13..e6ba2d281ada0 100644
--- a/torch/csrc/jit/serialization/pickler.h
+++ b/torch/csrc/jit/serialization/pickler.h
@@ -5,11 +5,11 @@
 #include <utility>
 #include <vector>
 
+#include <ATen/Utils.h>
 #include <ATen/core/ivalue.h>
 #include <ATen/core/jit_type.h>
 #include <c10/util/ArrayRef.h>
 #include <torch/csrc/Export.h>
-#include <torch/csrc/utils/disallow_copy.h>
 
 namespace torch {
 namespace jit {
@@ -118,7 +118,7 @@ void setTypeTags(bool state);
 bool getTypeTags();
 
 class TORCH_API Pickler {
-  TH_DISALLOW_COPY_AND_ASSIGN(Pickler);
+  AT_DISALLOW_COPY_AND_ASSIGN(Pickler);
 
  public:
   Pickler(std::function<void(const char*, size_t)> writer)
diff --git a/torch/csrc/jit/serialization/unpickler.h b/torch/csrc/jit/serialization/unpickler.h
index c57aa2556d73c..5411d421a0c57 100644
--- a/torch/csrc/jit/serialization/unpickler.h
+++ b/torch/csrc/jit/serialization/unpickler.h
@@ -23,7 +23,7 @@ class DeserializationStorageContext;
 // deleted at some point, the Pickler doesn't produce it and it's only around to
 // support models saved before 1.1
 class TORCH_API Unpickler {
-  TH_DISALLOW_COPY_AND_ASSIGN(Unpickler);
+  AT_DISALLOW_COPY_AND_ASSIGN(Unpickler);
 
   using TypeParserT = c10::TypePtr (*)(const std::string&);
 
diff --git a/torch/csrc/utils/disallow_copy.h b/torch/csrc/utils/disallow_copy.h
deleted file mode 100644
index 5960421d3a4ee..0000000000000
--- a/torch/csrc/utils/disallow_copy.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#pragma once
-
-#include <ATen/Utils.h>
-
-#define TH_DISALLOW_COPY_AND_ASSIGN AT_DISALLOW_COPY_AND_ASSIGN

From 2206e4274effe14961c648ed7021fb3537527868 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Tue, 25 Oct 2022 22:59:57 +0000
Subject: [PATCH 0164/1922] [FSDP][BE] Split state_dict related hooks to a
 separate file to reduce development conflicts  (#87421)

This PR does following two things to improve the code quality.
1. Split state_dict related hooks to a separate file to reduce development conflicts.
2. Remove unused APIs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87421
Approved by: https://github.com/rohan-varma
---
 torch/distributed/fsdp/_state_dict_utils.py   | 418 +++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 500 +-----------------
 2 files changed, 430 insertions(+), 488 deletions(-)
 create mode 100644 torch/distributed/fsdp/_state_dict_utils.py

diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
new file mode 100644
index 0000000000000..33fa0d441107b
--- /dev/null
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -0,0 +1,418 @@
+import functools
+import math
+import warnings
+from typing import Any, cast, Dict
+
+import torch
+import torch.distributed as dist
+import torch.distributed.algorithms._checkpoint.checkpoint_wrapper as checkpoint_wrapper
+# Import the entire FSDP file to avoid circular imports
+import torch.distributed.fsdp.fully_sharded_data_parallel as FSDP
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch.distributed._shard.sharded_tensor import (
+    Shard,
+    ShardedTensor,
+    init_from_local_shards,
+)
+from torch.distributed.utils import (
+    _replace_by_prefix,
+)
+
+from ._fsdp_extensions import _ext_chunk_tensor, _ext_pre_load_state_dict_transform
+from .flat_param import (
+    FlatParamHandle,
+)
+
+
+def _full_post_state_dict_hook(
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> Dict[str, Any]:
+    """
+    Hook that runs after model.state_dict() is called before returning result to
+    user. For FSDP, we may have to clone the tensors in state_dict as params go
+    back to sharded version after _summon_full_params ends, and also remove
+    the ``FSDP_WRAPPED_MODULE`` prefix.
+    """
+    _replace_by_prefix(state_dict, prefix + f"{FSDP.FSDP_PREFIX}", prefix)
+    module._assert_state([FSDP.TrainingState_.SUMMON_FULL_PARAMS])
+    # Return early for trivial cases
+    if not state_dict or not module._has_params:
+        return state_dict
+
+    # If a rank has already exited the `summon_full_params()` context here
+    # (e.g. when `rank0_only=True` and `rank != 0`), then the rank only
+    # needed to participate in the all-gather and does not need to save the
+    # state dict. For `use_orig_params=False`, we can check this via
+    # `FlatParameter` registration.
+    # TODO: For `use_orig_params=True`, we check for the reshard upon
+    # exiting `summon_full_params()` via the parameter shape. However, for
+    # `NO_SHARD`, we cannot tell from the shape, so we do not return early.
+    if (
+        (
+            not module._use_orig_params
+            and FSDP.FLAT_PARAM in module.module._parameters
+        )
+        or (
+            module._use_orig_params
+            and module._handles
+            and module._handles[0].uses_sharded_strategy
+            and module._handles[0].is_sharded(module._handles[0].flat_param)
+        )
+    ):
+        return state_dict
+
+    offload_to_cpu = module._state_dict_config.offload_to_cpu
+    cpu_device = torch.device("cpu")
+
+    # Loop only the parameters saved in this instance's wrapped module to
+    # avoid processing buffers.
+    for fqn, param_name, module_name in module._param_fqns:
+        fqn = f"{prefix}{fqn}"
+        clean_key = fqn
+        clean_prefix = FSDP.clean_tensor_name(prefix)
+        # Strip prefix out of key if needed as buffer names and param names
+        # do not have prefix considered as they are not computed in `state_dict`
+        # call.
+        if clean_key.startswith(clean_prefix):
+            clean_key = clean_key[len(clean_prefix):]
+
+        # Clone non-ignored parameters before exiting the
+        # `_summon_full_params()` context
+        assert fqn in state_dict, (
+            f"FSDP assumes {fqn} is in the state_dict but the state_dict "
+            f"only has {state_dict.keys()}. prefix={prefix}, "
+            f"module_name={module_name} param_name={param_name} rank={module.rank}."
+        )
+        if clean_key not in module._ignored_param_names and \
+                not getattr(state_dict[fqn], "_has_been_cloned", False):
+            try:
+                state_dict[fqn] = state_dict[fqn].clone().detach()
+                state_dict[fqn]._has_been_cloned = True  # type: ignore[attr-defined]
+            except BaseException as e:
+                warnings.warn(
+                    f"Failed to clone() tensor with name {fqn} on rank {module.rank}. "
+                    "This may mean that this state_dict entry could point to invalid "
+                    "memory regions after returning from state_dict() call if this "
+                    "parameter is managed by FSDP. Please check clone "
+                    f"implementation of {fqn}. Error: {str(e)}"
+                )
+
+    # Offload the buffer to CPU if needed -- we do not do this in
+    # `_summon_full_params()` since without care, that would free
+    # the original buffer's GPU memory and require reallocating
+    # that memory later; this only affects the state dict's buffer
+    # variable and leaves the original buffer's GPU memory intact
+    if offload_to_cpu:
+        for clean_key in module._buffer_names:
+            # This is a hack to support activation checkpoint.
+            clean_key = clean_key.replace(
+                f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", ""
+            )
+            fqn = f"{prefix}{clean_key}"
+            if fqn not in state_dict:
+                # A buffer can be registered as non-persistent.
+                continue
+            if state_dict[fqn].device != cpu_device:
+                state_dict[fqn] = state_dict[fqn].to(cpu_device)
+    return state_dict
+
+
+def _full_pre_load_state_dict_hook(
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> None:
+    # We do not expect to be calling pre-hooks twice without post-hook
+    # call in between.
+    assert getattr(module, '_full_param_ctx', None) is None
+    # Note that it needs writeback=True to persist.
+    module._full_param_ctx = module._summon_full_params(
+        recurse=False, writeback=True
+    )
+    module._full_param_ctx.__enter__()
+    _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP.FSDP_PREFIX}")
+
+
+def _full_post_load_state_dict_hook(module, *args, **kwargs) -> None:
+    # We should exit summon_full_params context.
+    module._assert_state([FSDP.TrainingState_.SUMMON_FULL_PARAMS])
+    assert getattr(module, '_full_param_ctx', None) is not None
+    module._full_param_ctx.__exit__(None, None, None)
+    module._full_param_ctx = None
+
+
+def _local_post_state_dict_hook(
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> Dict[str, Any]:
+    """
+    This hook create a ShardedTensor from the local flat_param and replace
+    the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy
+    will happen. The underlying storage is the same.
+    """
+    _replace_by_prefix(state_dict, f"{prefix}{FSDP.FSDP_PREFIX}", prefix)
+    if not module._has_params:
+        return state_dict
+
+    # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor
+    # value as the flat_param but it is a pure Tensor because
+    # nn.Module.state_dict() will detach the parameter. Therefore, we need
+    # to get flat_param to get the metadata.
+    assert module._handles, "Should have returned early"
+    flat_param = module._handles[0].flat_param
+    # Construct a ShardedTensor from the flat_param.
+    full_numel = flat_param._unpadded_unsharded_size.numel()  # type: ignore[attr-defined]
+    shard_offset = flat_param.numel() * module.rank
+    valid_data_size = flat_param.numel() - flat_param._shard_numel_padded
+    if valid_data_size > 0 and flat_param._shard_numel_padded > 0:
+        flat_param = flat_param.narrow(0, 0, valid_data_size)
+    local_shards = [
+        Shard.from_tensor_and_offsets(flat_param, [shard_offset], module.rank)
+    ]
+    sharded_tensor = init_from_local_shards(
+        local_shards, full_numel, process_group=module.process_group
+    )  # type: ignore[assignment]
+    if module._state_dict_config.offload_to_cpu:
+        sharded_tensor = sharded_tensor.cpu()
+    state_dict[f"{prefix}{FSDP.FLAT_PARAM}"] = sharded_tensor
+    return state_dict
+
+
+def _local_post_load_state_dict_hook(module, *args, **kwargs) -> None:
+    pass
+
+
+def _local_pre_load_state_dict_hook(
+    module, state_dict: Dict[str, Any], prefix: str,
+) -> None:
+    """
+    This hook finds the local flat_param for this FSDP module from the
+    state_dict. The flat_param should be a ShardedTensor. This hook converts
+    the ShardedTensor to a tensor. No copy happen unless padding is required.
+    """
+    _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP.FSDP_PREFIX}")
+    fqn = f"{prefix}{FSDP.FSDP_PREFIX}{FSDP.FLAT_PARAM}"
+    if fqn not in state_dict:
+        assert not module._has_params, (
+            "No `FlatParameter` in `state_dict` for this FSDP instance "
+            "but it has parameters"
+        )
+        return
+    load_tensor = state_dict[fqn]
+    assert isinstance(
+        load_tensor, ShardedTensor
+    ), "Tensors in local_state_dict should be ShardedTensor."
+
+    # Convert the ShardedTensor to a Tensor.
+    shards = load_tensor.local_shards()
+    assert len(shards), "load_local_state_dict assume one shard per ShardedTensor."
+    load_tensor = shards[0].tensor
+
+    # Get the metada of the flat_param to decide whether to pad the loaded
+    # tensor.
+    flat_param = module._handles[0].flat_param
+    assert flat_param is not None
+    if flat_param._shard_numel_padded not in (0, flat_param.numel()):
+        assert load_tensor.numel() < flat_param.numel(), (
+            f"Local shard size = {flat_param.numel()} and the tensor in "
+            f"the state_dict is {load_tensor.numel()}."
+        )
+        load_tensor = F.pad(load_tensor, [0, flat_param._shard_numel_padded])
+    state_dict[fqn] = load_tensor
+
+
+def _sharded_post_state_dict_hook(
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> Dict[str, Any]:
+    """
+    The hook replaces the unflattened, unsharded parameter in the state_dict
+    with a unflattened, sharded parameter (a ShardedTensor).
+    """
+    _replace_by_prefix(state_dict, f"{prefix}{FSDP.FSDP_PREFIX}", prefix)
+    if not module._has_params:
+        return state_dict
+
+    assert module.training_state != FSDP.TrainingState_.SUMMON_FULL_PARAMS, (
+        "Inside _sharded_post_state_dict_hook, the training_state must "
+        "not be SUMMON_FULL_PARAMS."
+    )
+    with module._summon_full_params(recurse=False, writeback=False):
+        for fqn, _, _ in module._param_fqns:
+            # Create a ShardedTensor for the unflattened, non-sharded parameter.
+            param = functools.reduce(getattr, fqn.split("."), module.module)
+            sharded_tensor = _ext_chunk_tensor(
+                tensor=param,
+                rank=module.rank,
+                world_size=module.world_size,
+                num_devices_per_node=torch.cuda.device_count(),
+                pg=module.process_group
+            )
+            if module._state_dict_config.offload_to_cpu:
+                sharded_tensor = sharded_tensor.cpu()
+            state_dict[f"{prefix}{fqn}"] = sharded_tensor
+    # For `use_orig_params=True`, the `FlatParameter` is not registered, so
+    # there is no entry in the state dict for it to pop.
+    if not module._use_orig_params:
+        state_dict.pop(f"{prefix}{FSDP.FLAT_PARAM}")
+    return state_dict
+
+
+def _sharded_post_load_state_dict_hook(module, *args, **kwargs) -> None:
+    if module._use_orig_params:
+        module._register_orig_params()
+
+
+def _sharded_pre_load_state_dict_hook(
+    module, state_dict: Dict[str, Any], prefix: str,
+) -> None:
+    """
+    The hook combines the unflattened, sharded parameters (ShardedTensor) to
+    a new FlatParameter and shards the new FlatParameter to the local chunk.
+    """
+    _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP.FSDP_PREFIX}")
+    if not module._has_params:
+        return
+
+    if not module._handles[0].uses_sharded_strategy:
+        raise RuntimeError(
+            "load_sharded_state_dict can only be called when parameters "
+            "are flatten and sharded."
+        )
+
+    nonsharded_tensors = []
+    shared_fqns = [fqn for fqn, _, _ in module._shared_param_fqns]
+    for fqn, _, _ in module._param_fqns:
+        full_fqn = f"{prefix}{FSDP.FSDP_PREFIX}{fqn}"
+        param = state_dict.pop(full_fqn)
+        if fqn in shared_fqns:
+            continue
+        # All-gather the param (ShardedTensor)
+        param, shards = _ext_pre_load_state_dict_transform(param)
+        assert len(shards) < 2, (
+            "Expects 0 or 1 shard per rank "
+            f"but got {len(shards)} shards on rank {module.rank}."
+        )
+        param_numel = param.size().numel()
+        dim_0_size = param.size()[0]
+        chunk_size = (
+            math.ceil(dim_0_size / module.world_size) * param_numel // dim_0_size
+        )
+        if len(shards) == 1:
+            local_tensor = shards[0].tensor.flatten()
+            if not local_tensor.is_cuda:
+                local_tensor = local_tensor.cuda()
+            num_padding = chunk_size - local_tensor.numel()
+            if num_padding > 0:
+                local_tensor = F.pad(local_tensor, [0, num_padding])
+        else:
+            local_tensor = torch.zeros(chunk_size, dtype=param.dtype).cuda()
+        tensor = torch.empty(
+            chunk_size * module.world_size, dtype=local_tensor.dtype
+        ).cuda()
+        dist.all_gather_into_tensor(tensor, local_tensor, group=module.process_group)
+        tensor = tensor.narrow(0, 0, param_numel).reshape(param.size())
+        nonsharded_tensors.append(tensor)
+
+    # Create a new flat_param from the loaded, non-sharded tensors.
+    flat_param = module._handles[0].flat_param
+    loaded_flat_param = FlatParamHandle.flatten_params(
+        nonsharded_tensors, requires_grad=False
+    )
+
+    # Get the chunk from the loaded flat_param for the local rank.
+    loaded_flat_tensor, num_to_pad = FlatParamHandle._get_shard(
+        loaded_flat_param, module.rank, module.world_size,
+    )
+    loaded_flat_tensor.to(flat_param.device)
+    assert flat_param.numel() == loaded_flat_tensor.numel(), (
+        f"The loaded local chunk has different numel({loaded_flat_tensor.numel()}) "
+        f"from the local chunk {flat_param.numel()}."
+    )
+    assert flat_param._shard_numel_padded == num_to_pad, (
+        f"The loaded local chunk has different padding({num_to_pad}) "
+        f"from the local chunk {flat_param._shard_numel_padded}."
+    )
+    state_dict[f"{prefix}{FSDP.FSDP_PREFIX}{FSDP.FLAT_PARAM}"] = loaded_flat_tensor
+    if module._use_orig_params:
+        module._deregister_orig_params()
+
+
+@torch.no_grad()
+def _post_state_dict_hook(
+    module: nn.Module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+    *args: Any,
+) -> Dict[str, Any]:
+    """
+    _post_state_dict_hook() is called after the state_dict() of this
+    FSDP module is executed. ``module._state_dict_type`` is used to decide
+    what postprocessing will be done.
+    """
+    _post_state_dict_hook_fn = {
+        FSDP.StateDictType.FULL_STATE_DICT: _full_post_state_dict_hook,
+        FSDP.StateDictType.LOCAL_STATE_DICT: _local_post_state_dict_hook,
+        FSDP.StateDictType.SHARDED_STATE_DICT: _sharded_post_state_dict_hook,
+    }
+    fsdp_module = cast(FSDP.FullyShardedDataParallel, module)
+    processed_state_dict = _post_state_dict_hook_fn[fsdp_module._state_dict_type](
+        fsdp_module, state_dict, prefix
+    )
+    # Restore buffers, which currently are in their full precision type,
+    # back to their mixed precision type. This is because buffers are cast
+    # during lazy_init() and stay at their mixed precision type before/after
+    # forward/backward. As a result state_dict() should maintain this.
+    if (
+        fsdp_module._is_root
+        and fsdp_module._mixed_precision_enabled_for_buffers()
+    ):
+        fsdp_module._cast_buffers(recurse=True)
+    return processed_state_dict
+
+
+@torch.no_grad()
+def _pre_load_state_dict_hook(
+    module: nn.Module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+    *args: Any,
+) -> None:
+    """
+    ``_pre_state_dict_hook` is called before ``module._load_from_state_dict()``
+    is called. ``module._state_dict_type`` is used to decide what preprocessing
+    will be done.
+    """
+    _pre_load_state_dict_hook_fn = {
+        FSDP.StateDictType.FULL_STATE_DICT: _full_pre_load_state_dict_hook,
+        FSDP.StateDictType.LOCAL_STATE_DICT: _local_pre_load_state_dict_hook,
+        FSDP.StateDictType.SHARDED_STATE_DICT: _sharded_pre_load_state_dict_hook,
+    }
+    # Code that is common for all state_dict impls
+    fsdp_module = cast(FSDP.FullyShardedDataParallel, module)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    # Dispatch into state_dict specific implementation of pre-hook.
+    _pre_load_state_dict_hook_fn[fsdp_module._state_dict_type](
+        fsdp_module, state_dict, prefix
+    )
+
+
+@torch.no_grad()
+def _post_load_state_dict_hook(module: nn.Module, *args: Any) -> None:
+    _post_load_state_dict_hook_fn = {
+        FSDP.StateDictType.FULL_STATE_DICT: _full_post_load_state_dict_hook,
+        FSDP.StateDictType.LOCAL_STATE_DICT: _local_post_load_state_dict_hook,
+        FSDP.StateDictType.SHARDED_STATE_DICT: _sharded_post_load_state_dict_hook,
+    }
+    # Code that is common for all state_dict impls
+    fsdp_module = cast(FSDP.FullyShardedDataParallel, module)
+    # Dispatch into state_dict type specific implementation of post-hook for
+    # loading state_dict.
+    _post_load_state_dict_hook_fn[fsdp_module._state_dict_type](fsdp_module)
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index e241c26d1e1f1..5fb2e5cdf0f6b 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -18,8 +18,6 @@
     Iterable,
     Iterator,
     List,
-    Mapping,
-    NamedTuple,
     Optional,
     Set,
     Tuple,
@@ -34,11 +32,6 @@
 import torch.nn.functional as F
 from torch.autograd import Variable
 from torch.distributed import ProcessGroup
-from torch.distributed._shard.sharded_tensor import (
-    Shard,
-    ShardedTensor,
-    init_from_local_shards,
-)
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     _CHECKPOINT_PREFIX,
 )
@@ -48,7 +41,6 @@
 )
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.distributed.utils import (
-    _replace_by_prefix,
     _sync_params_and_buffers,
     _to_kwargs,
 )
@@ -64,7 +56,11 @@
     _process_pos_dim_tensor_state,
     _rekey_sharded_optim_state_dict,
 )
-from ._fsdp_extensions import _ext_chunk_tensor, _ext_pre_load_state_dict_transform
+from ._state_dict_utils import (
+    _post_state_dict_hook,
+    _pre_load_state_dict_hook,
+    _post_load_state_dict_hook,
+)
 from ._utils import (
     _apply_to_modules,
     _apply_to_tensors,
@@ -601,7 +597,7 @@ def _check_order(self, handles_key: _HandlesKey, is_training: bool) -> None:
                     (
                         rank,
                         world_indices[
-                            rank * num_valid_indices : (rank + 1) * num_valid_indices
+                            rank * num_valid_indices: (rank + 1) * num_valid_indices
                         ],
                     )
                     for rank in range(self.world_size)
@@ -1138,28 +1134,11 @@ def __init__(
         # implemented using post-save and pre-load hooks
         self._state_dict_type = StateDictType.FULL_STATE_DICT
         self._state_dict_config = FullStateDictConfig()
-        self._register_state_dict_hook(self._post_state_dict_hook)
-        self._post_state_dict_hook_fn = {
-            StateDictType.FULL_STATE_DICT: self._full_post_state_dict_hook,
-            StateDictType.LOCAL_STATE_DICT: self._local_post_state_dict_hook,
-            StateDictType.SHARDED_STATE_DICT: self._sharded_post_state_dict_hook,
-        }
+        self._register_state_dict_hook(_post_state_dict_hook)
         self._register_load_state_dict_pre_hook(
-            self._pre_load_state_dict_hook, with_module=True
-        )
-        self._pre_load_state_dict_hook_fn = {
-            StateDictType.FULL_STATE_DICT: self._full_pre_load_state_dict_hook,
-            StateDictType.LOCAL_STATE_DICT: self._local_pre_load_state_dict_hook,
-            StateDictType.SHARDED_STATE_DICT: self._sharded_pre_load_state_dict_hook,
-        }
-        self.register_load_state_dict_post_hook(
-            self._post_load_state_dict_hook
+            _pre_load_state_dict_hook, with_module=True
         )
-        self._post_load_state_dict_hook_fn = {
-            StateDictType.FULL_STATE_DICT: self._full_post_load_state_dict_hook,
-            StateDictType.LOCAL_STATE_DICT: self._local_post_load_state_dict_hook,
-            StateDictType.SHARDED_STATE_DICT: self._sharded_post_load_state_dict_hook,
-        }
+        self.register_load_state_dict_post_hook(_post_load_state_dict_hook)
 
     def _get_ignored_modules(
         self,
@@ -2171,7 +2150,9 @@ def set_state_dict_type(
                 prev_state_dict_config = submodule._state_dict_config
             if prev_state_dict_type != submodule._state_dict_type:
                 raise RuntimeError("All FSDP module should the same state_dict_type.")
-            if type(prev_state_dict_config) != type(submodule._state_dict_config):
+            if not isinstance(
+                submodule._state_dict_config, type(prev_state_dict_config)
+            ):
                 raise RuntimeError(
                     "All FSDP modules should have the same type of state_dict_config."
                 )
@@ -2268,200 +2249,6 @@ def _shared_param_fqns(self) -> Iterator[Tuple[str, str, str]]:
             fqn = f"{module_name}{param_name}"
             yield fqn, param_name, module_name
 
-    def _full_post_state_dict_hook(
-        self,
-        state_dict: Dict[str, Any],
-        prefix: str,
-    ) -> Dict[str, Any]:
-        """
-        Hook that runs after model.state_dict() is called before returning result to
-        user. For FSDP, we may have to clone the tensors in state_dict as params go
-        back to sharded version after _summon_full_params ends, and also remove
-        the ``FSDP_WRAPPED_MODULE`` prefix.
-        """
-        _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix)
-        self._assert_state([TrainingState_.SUMMON_FULL_PARAMS])
-        # Return early for trivial cases
-        if not state_dict or not self._has_params:
-            return state_dict
-
-        # If a rank has already exited the `summon_full_params()` context here
-        # (e.g. when `rank0_only=True` and `rank != 0`), then the rank only
-        # needed to participate in the all-gather and does not need to save the
-        # state dict. For `use_orig_params=False`, we can check this via
-        # `FlatParameter` registration.
-        # TODO: For `use_orig_params=True`, we check for the reshard upon
-        # exiting `summon_full_params()` via the parameter shape. However, for
-        # `NO_SHARD`, we cannot tell from the shape, so we do not return early.
-        if (
-            (
-                not self._use_orig_params
-                and FLAT_PARAM in self.module._parameters
-            )
-            or (
-                self._use_orig_params
-                and self._handles
-                and self._handles[0].uses_sharded_strategy
-                and self._handles[0].is_sharded(self._handles[0].flat_param)
-            )
-        ):
-            return state_dict
-
-        offload_to_cpu = self._state_dict_config.offload_to_cpu
-        cpu_device = torch.device("cpu")
-
-        # Loop only the parameters saved in this instance's wrapped module to
-        # avoid processing buffers.
-        for fqn, param_name, module_name in self._param_fqns:
-            fqn = f"{prefix}{fqn}"
-            clean_key = fqn
-            clean_prefix = clean_tensor_name(prefix)
-            # Strip prefix out of key if needed as buffer names and param names
-            # do not have prefix considered as they are not computed in `state_dict`
-            # call.
-            if clean_key.startswith(clean_prefix):
-                clean_key = clean_key[len(clean_prefix):]
-
-            # Clone non-ignored parameters before exiting the
-            # `_summon_full_params()` context
-            assert fqn in state_dict, (
-                f"FSDP assumes {fqn} is in the state_dict but the state_dict "
-                f"only has {state_dict.keys()}. prefix={prefix}, "
-                f"module_name={module_name} param_name={param_name} rank={self.rank}."
-            )
-            if clean_key not in self._ignored_param_names and \
-                    not getattr(state_dict[fqn], "_has_been_cloned", False):
-                try:
-                    state_dict[fqn] = state_dict[fqn].clone().detach()
-                    state_dict[fqn]._has_been_cloned = True  # type: ignore[attr-defined]
-                except BaseException as e:
-                    warnings.warn(
-                        f"Failed to clone() tensor with name {fqn} on rank {self.rank}. "
-                        "This may mean that this state_dict entry could point to invalid memory "
-                        "regions after returning from state_dict() call if this "
-                        "parameter is managed by FSDP. Please check clone "
-                        f"implementation of {fqn}. Error: {str(e)}"
-                    )
-
-        # Offload the buffer to CPU if needed -- we do not do this in
-        # `_summon_full_params()` since without care, that would free
-        # the original buffer's GPU memory and require reallocating
-        # that memory later; this only affects the state dict's buffer
-        # variable and leaves the original buffer's GPU memory intact
-        if offload_to_cpu:
-            for clean_key in self._buffer_names:
-                # This is a hack to support activation checkpoint.
-                clean_key = clean_key.replace(
-                    f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", ""
-                )
-                fqn = f"{prefix}{clean_key}"
-                if fqn not in state_dict:
-                    # A buffer can be registered as non-persistent.
-                    continue
-                if state_dict[fqn].device != cpu_device:
-                    state_dict[fqn] = state_dict[fqn].to(cpu_device)
-        return state_dict
-
-    def _local_post_state_dict_hook(
-        self,
-        state_dict: Dict[str, Any],
-        prefix: str,
-    ) -> Dict[str, Any]:
-        """
-        This hook create a ShardedTensor from the local flat_param and replace
-        the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy
-        will happen. The underlying storage is the same.
-        """
-        _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix)
-        if not self._has_params:
-            return state_dict
-
-        # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor
-        # value as the flat_param but it is a pure Tensor because
-        # nn.Module.state_dict() will detach the parameter. Therefore, we need
-        # to get flat_param to get the metadata.
-        assert self._handles, "Should have returned early"
-        flat_param = self._handles[0].flat_param
-        # Construct a ShardedTensor from the flat_param.
-        full_numel = flat_param._unpadded_unsharded_size.numel()  # type: ignore[attr-defined]
-        shard_offset = flat_param.numel() * self.rank
-        valid_data_size = flat_param.numel() - flat_param._shard_numel_padded
-        if valid_data_size > 0 and flat_param._shard_numel_padded > 0:
-            flat_param = flat_param.narrow(0, 0, valid_data_size)
-        local_shards = [
-            Shard.from_tensor_and_offsets(flat_param, [shard_offset], self.rank)
-        ]
-        sharded_tensor = init_from_local_shards(
-            local_shards, full_numel, process_group=self.process_group
-        )  # type: ignore[assignment]
-        if self._state_dict_config.offload_to_cpu:
-            sharded_tensor = sharded_tensor.cpu()
-        state_dict[f"{prefix}{FLAT_PARAM}"] = sharded_tensor
-        return state_dict
-
-    @torch.no_grad()
-    def _sharded_post_state_dict_hook(
-        self,
-        state_dict: Dict[str, Any],
-        prefix: str,
-    ) -> Dict[str, Any]:
-        """
-        The hook replaces the unflattened, unsharded parameter in the state_dict
-        with a unflattened, sharded parameter (a ShardedTensor).
-        """
-        _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix)
-        if not self._has_params:
-            return state_dict
-
-        assert self.training_state != TrainingState_.SUMMON_FULL_PARAMS, (
-            "Inside _sharded_post_state_dict_hook, the training_state must "
-            "not be SUMMON_FULL_PARAMS."
-        )
-        with self._summon_full_params(recurse=False, writeback=False):
-            for fqn, _, _ in self._param_fqns:
-                # Create a ShardedTensor for the unflattened, non-sharded parameter.
-                param = functools.reduce(getattr, fqn.split("."), self.module)
-                sharded_tensor = _ext_chunk_tensor(
-                    tensor=param,
-                    rank=self.rank,
-                    world_size=self.world_size,
-                    num_devices_per_node=torch.cuda.device_count(),
-                    pg=self.process_group
-                )
-                if self._state_dict_config.offload_to_cpu:
-                    sharded_tensor = sharded_tensor.cpu()
-                state_dict[f"{prefix}{fqn}"] = sharded_tensor
-        # For `use_orig_params=True`, the `FlatParameter` is not registered, so
-        # there is no entry in the state dict for it to pop.
-        if not self._use_orig_params:
-            state_dict.pop(f"{prefix}{FLAT_PARAM}")
-        return state_dict
-
-    @staticmethod
-    def _post_state_dict_hook(
-        module: nn.Module,
-        state_dict: Dict[str, Any],
-        prefix: str,
-        *args: Any,
-    ) -> Dict[str, Any]:
-        """
-        _post_state_dict_hook() is called after the state_dict() of this
-        FSDP module is executed. ``self._state_dict_type`` is used to decide
-        what postprocessing will be done.
-        """
-        self = cast(FullyShardedDataParallel, module)
-        processed_state_dict = self._post_state_dict_hook_fn[self._state_dict_type](state_dict, prefix)
-        # Restore buffers, which currently are in their full precision type,
-        # back to their mixed precision type. This is because buffers are cast
-        # during lazy_init() and stay at their mixed precision type before/after
-        # forward/backward. As a result state_dict() should maintain this.
-        if (
-            self._is_root
-            and self._mixed_precision_enabled_for_buffers()
-        ):
-            self._cast_buffers(recurse=True)
-        return processed_state_dict
-
     def state_dict(self, *args, **kwargs):
         """
         This is the entry point of all three FSDP ``state_dict`` APIs: full,
@@ -2560,268 +2347,6 @@ def state_dict(self, *args, **kwargs):
         else:
             raise ValueError(f"Unknown StateDictType {self._state_dict_type}.")
 
-    def _local_state_dict(self, *args: Any, **kwargs: Any) -> Any:
-        """
-        Returns the local state of the module. Parameters are flattened and
-        sharded, so the resulting state_dict can only be loaded after the module
-        has been wrapped with FSDP.
-        """
-        with self.state_dict_type(self, StateDictType.LOCAL_STATE_DICT):
-            return self.state_dict(*args, **kwargs)
-
-    def _full_post_load_state_dict_hook(self, *args, **kwargs) -> None:
-        # We should exit summon_full_params context.
-        self._assert_state([TrainingState_.SUMMON_FULL_PARAMS])
-        assert getattr(self, '_full_param_ctx', None) is not None
-        self._full_param_ctx.__exit__(None, None, None)
-        self._full_param_ctx = None
-
-    def _sharded_state_dict(self, *args: Any, **kwargs: Any) -> Any:
-        """
-        Returns the sharded states of the module. Parameters are unflattened and
-        sharded, so the resulting state_dict can be used with any parallelism
-        (e.g., DPP, model parallelism, and single trainer) after a valid
-        resharding.
-        """
-        with self.state_dict_type(StateDictType.SHARDED_STATE_DICT):
-            return self.state_dict(self, *args, **kwargs)
-
-    def _full_pre_load_state_dict_hook(
-        self,
-        state_dict: Dict[str, Any],
-        prefix: str,
-    ) -> None:
-        # We do not expect to be calling pre-hooks twice without post-hook
-        # call in between.
-        assert getattr(self, '_full_param_ctx', None) is None
-        # Note that it needs writeback=True to persist.
-        self._full_param_ctx = self._summon_full_params(
-            recurse=False, writeback=True
-        )
-        self._full_param_ctx.__enter__()
-        _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
-
-    def _local_post_load_state_dict_hook(self, *args, **kwargs) -> None:
-        pass
-
-    def _local_pre_load_state_dict_hook(
-        self,
-        state_dict: Dict[str, Any],
-        prefix: str,
-    ) -> None:
-        """
-        This hook finds the local flat_param for this FSDP module from the
-        state_dict. The flat_param should be a ShardedTensor. This hook converts
-        the ShardedTensor to a tensor. No copy happen unless padding is required.
-        """
-        _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_PREFIX}")
-        fqn = f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}"
-        if fqn not in state_dict:
-            assert not self._has_params, (
-                "No `FlatParameter` in `state_dict` for this FSDP instance but it has parameters"
-            )
-            return
-        load_tensor = state_dict[fqn]
-        assert isinstance(
-            load_tensor, ShardedTensor
-        ), "Tensors in local_state_dict should be ShardedTensor."
-
-        # Convert the ShardedTensor to a Tensor.
-        shards = load_tensor.local_shards()
-        assert len(shards), "load_local_state_dict assume one shard per ShardedTensor."
-        load_tensor = cast(torch.Tensor, shards[0].tensor)
-
-        # Get the metada of the flat_param to decide whether to pad the loaded
-        # tensor.
-        flat_param = self._handles[0].flat_param
-        assert flat_param is not None
-        if flat_param._shard_numel_padded not in (0, flat_param.numel()):
-            assert load_tensor.numel() < flat_param.numel(), (
-                f"Local shard size = {flat_param.numel()} and the tensor in "
-                f"the state_dict is {load_tensor.numel()}."
-            )
-            load_tensor = F.pad(load_tensor, [0, flat_param._shard_numel_padded])
-        state_dict[fqn] = load_tensor
-
-    def _sharded_post_load_state_dict_hook(self, *args, **kwargs) -> None:
-        if self._use_orig_params:
-            self._register_orig_params()
-
-    def _sharded_pre_load_state_dict_hook(
-        self,
-        state_dict: Dict[str, Any],
-        prefix: str,
-    ) -> None:
-        """
-        The hook combines the unflattened, sharded parameters (ShardedTensor) to
-        a new FlatParameter and shards the new FlatParameter to the local chunk.
-        """
-        _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
-        if not self._has_params:
-            return
-
-        if not self._handles[0].uses_sharded_strategy:
-            raise RuntimeError(
-                "load_sharded_state_dict can only be called when parameters "
-                "are flatten and sharded."
-            )
-
-        nonsharded_tensors = []
-        # TODO: Reduce the communication by using only one
-        # `all_gather_into_tensor()` to gather all the parameters in this
-        # layer. This can be achieved by concatenating all the local shards and
-        # then appending the padding.
-        # https://github.com/pytorch/pytorch/issues/77461
-        shared_fqns = [fqn for fqn, _, _ in self._shared_param_fqns]
-        for fqn, _, _ in self._param_fqns:
-            full_fqn = f"{prefix}{FSDP_PREFIX}{fqn}"
-            param = state_dict.pop(full_fqn)
-            if fqn in shared_fqns:
-                continue
-            # All-gather the param (ShardedTensor)
-            param, shards = _ext_pre_load_state_dict_transform(param)
-            assert len(shards) < 2, (
-                f"Expects 0 or 1 shard per rank but got {len(shards)} shards on rank {self.rank}"
-            )
-            param_numel = param.size().numel()
-            dim_0_size = param.size()[0]
-            chunk_size = (
-                math.ceil(dim_0_size / self.world_size) * param_numel // dim_0_size
-            )
-            if len(shards) == 1:
-                local_tensor = cast(torch.Tensor, shards[0].tensor).flatten()
-                if not local_tensor.is_cuda:
-                    local_tensor = local_tensor.cuda()
-                num_padding = chunk_size - local_tensor.numel()
-                if num_padding > 0:
-                    local_tensor = F.pad(local_tensor, [0, num_padding])
-            else:
-                local_tensor = torch.zeros(chunk_size, dtype=param.dtype).cuda()
-            tensor = torch.empty(
-                chunk_size * self.world_size, dtype=local_tensor.dtype
-            ).cuda()
-            dist.all_gather_into_tensor(tensor, local_tensor, group=self.process_group)
-            tensor = tensor.narrow(0, 0, param_numel).reshape(param.size())
-            nonsharded_tensors.append(tensor)
-
-        # Create a new flat_param from the loaded, non-sharded tensors.
-        flat_param = self._handles[0].flat_param
-        loaded_flat_param = FlatParamHandle.flatten_params(nonsharded_tensors, requires_grad=False)
-
-        # Get the chunk from the loaded flat_param for the local rank.
-        loaded_flat_param, num_to_pad = FlatParamHandle._get_shard(
-            loaded_flat_param, self.rank, self.world_size,
-        )
-        loaded_flat_param.to(flat_param.device)
-        assert flat_param.numel() == loaded_flat_param.numel(), (
-            f"The loaded local chunk has different numel({loaded_flat_param.numel()}) "
-            f"from the local chunk {flat_param.numel()}."
-        )
-        assert flat_param._shard_numel_padded == num_to_pad, (
-            f"The loaded local chunk has different padding({num_to_pad}) "
-            f"from the local chunk {flat_param._shard_numel_padded}."
-        )
-        state_dict[f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}"] = loaded_flat_param
-        if self._use_orig_params:
-            self._deregister_orig_params()
-
-    @staticmethod
-    def _pre_load_state_dict_hook(
-        module: nn.Module,
-        state_dict: Dict[str, Any],
-        prefix: str,
-        *args: Any,
-    ) -> None:
-        """
-        ``_pre_state_dict_hook` is called before ``self._load_from_state_dict()``
-        is called. ``self._state_dict_type`` is used to decide what preprocessing
-        will be done.
-        """
-        # Code that is common for all state_dict impls
-        self = cast(FullyShardedDataParallel, module)
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-        # Dispatch into state_dict specific implementation of pre-hook.
-        self._pre_load_state_dict_hook_fn[self._state_dict_type](state_dict, prefix)
-
-    @staticmethod
-    def _post_load_state_dict_hook(module: nn.Module, *args: Any) -> None:
-        # Code that is common for all state_dict impls
-        self = cast(FullyShardedDataParallel, module)
-        # Dispatch into state_dict type specific implementation of post-hook for
-        # loading state_dict.
-        self._post_load_state_dict_hook_fn[self._state_dict_type]()
-
-    def load_state_dict(
-        self,
-        state_dict: Mapping[str, Any],
-        *args,
-        **kwargs,
-    ) -> NamedTuple:
-        """
-        The entry point of all three FSDP ``load_state_dict`` APIs. By default,
-        calling ``load_state_dict`` on an FSDP module will result in FSDP
-        attempting to load a "full" state_dict, i.e. a state_dict consisting of
-        full, unsharded, unflattened original module parameters. This requires
-        FSDP to load the full parameter context on each rank which could result
-        in GPU OOM. As a result, :func:`state_dict_type` API is available to
-        configure between ``load_state_dict`` implementations. User can thus use
-        ``with self.state_dict_type(self, StateDictType.LOCAL_STATE_DICT)`` context
-        manager to load a local state dict checkpoint that will restore only
-        local shards of the module. Currently, the only supported
-        implementations are ``StateDictType.LOCAL_STATE_DICT`` and
-        ``StateDictType.FULL_STATE_DICT`` (default). Please see :func:`state_dict`
-        for documentation around creating an FSDP checkpoint.
-
-        Example::
-
-            >>> # xdoctest: +SKIP("undefined variables")
-            >>> import torch
-            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-            >>> from torch.distributed.fsdp import StateDictType
-            >>> torch.cuda.set_device(device_id)
-            >>> my_module = nn.Linear(...)
-            >>> sharded_module = FSDP(my_module)
-            >>> checkpoint = torch.load(PATH)
-            >>> full_state_dict = checkpoint['full_state_dict']
-            >>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT):
-            >>>     sharded_module.load_state_dict(full_state_dict)
-            >>> full_dict.keys()
-            >>> odict_keys(['weight', 'bias'])
-            >>> # using local state dict
-            >>> local_state_dict = checkpoint['local_state_dict']
-            >>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
-            >>>     sharded_module.load_state_dict(local_state_dict)
-            >>> local_dict.keys()
-            >>> odict_keys(['flat_param', 'inner.flat_param'])
-
-        .. warning:: This needs to be called on all ranks since it uses
-            collective communications.
-        """
-        return super().load_state_dict(state_dict, *args)
-
-    def _load_local_state_dict(
-        self,
-        state_dict: Mapping[str, Any],
-        *args,
-    ) -> NamedTuple:
-        """
-        Load states from a flattened, sharded state dictionary.
-        """
-        with self.state_dict_type(self, StateDictType.LOCAL_STATE_DICT):
-            return self.load_state_dict(state_dict, *args)
-
-    def _load_sharded_state_dict(
-        self,
-        state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"],
-        strict: bool = True,
-    ) -> NamedTuple:
-        """
-        Load states from a unflattened, sharded state dictionary.
-        """
-        with self.state_dict_type(StateDictType.SHARDED_STATE_DICT):
-            return self.load_state_dict(state_dict, strict)
-
     def forward(self, *args: Any, **kwargs: Any) -> Any:
         """
         Runs the forward pass for the wrapped module, inserting FSDP-specific
@@ -4540,7 +4065,6 @@ def register_comm_hook(self, state: object, hook: callable):
             submodule._communication_hook_state = state
             submodule._communication_hook = hook
 
-
     def _init_param_exec_order_wrap_policy(self, *args, **kwargs) -> None:
         auto_wrap_policy = kwargs["auto_wrap_policy"]
         module = kwargs["module"]

From e03181f55ac0098a14d3b581cbf0235423bdaf5b Mon Sep 17 00:00:00 2001
From: Zachary DeVito <zdevito@gmail.com>
Date: Tue, 25 Oct 2022 19:47:30 +0000
Subject: [PATCH 0165/1922] [inductor] Fix finalization issues when using
 multiprocessing (#87725)

If python was launched with 'spawn' it will not use the standard
shutdown methods that concurrent.futures requires. So we register a
shutdown with the method it does uses. Without this, shutdown hangs
since the workers will not exit.

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87725
Approved by: https://github.com/wconstab
---
 torch/_inductor/codecache.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 1d83633019cb8..1c97c26a7870e 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -9,6 +9,7 @@
 import shutil
 import signal
 import subprocess
+import sys
 import sysconfig
 import tempfile
 import types
@@ -304,9 +305,15 @@ def run():
         # we rely on 'fork' because we cannot control whether users
         # have an `if __name__ == '__main__'` in their main process.
         fork_context = multiprocessing.get_context("fork")
-        return ProcessPoolExecutor(
+        pool = ProcessPoolExecutor(
             config.compile_threads, mp_context=fork_context, initializer=init
         )
+        # when this pool is created in a subprocess object, the normal exit handler
+        # doesn't run, and we need to register our own handler.
+        # exitpriority has to be high, because another one of the finalizers will
+        # kill the worker thread that sends the shutdown message to the workers...
+        multiprocessing.util.Finalize(None, pool.shutdown, exitpriority=sys.maxsize)
+        return pool
 
     @classmethod
     def warm_pool(cls):

From 40cc28559a5def4b81151daf991796d7e98488e2 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 26 Oct 2022 05:09:39 +0000
Subject: [PATCH 0166/1922] [vision hash update] update the pinned vision hash
 (#87744)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87744
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 88e283fa46ec9..d4dee5af2936d 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-0d7807d59520289b2065b4db4a138b7fba2f61fd
+edb3a8069a0b86231f14e84ac9f26fd7c7bffb5f

From 10fea5102d1908ec71a3f9d8efeb9d4d0cc0c86a Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 26 Oct 2022 05:40:25 +0000
Subject: [PATCH 0167/1922] [Inductor] update triton commit pin (#87732)

Fixes https://github.com/pytorch/torchdynamo/issues/1746

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87732
Approved by: https://github.com/ngimel
---
 .github/ci_commit_pins/triton.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/triton.txt b/.github/ci_commit_pins/triton.txt
index 58d82813d6e13..26387597d0911 100644
--- a/.github/ci_commit_pins/triton.txt
+++ b/.github/ci_commit_pins/triton.txt
@@ -1 +1 @@
-db3aa1d1fb2bb536752a71d9e0f03cf6a86ddf65
+5ca1ed01016530056c4507661c24d6c21efc983d

From 889ebaebdff71e72b489fbd89adf3ce14a0af6d1 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@fb.com>
Date: Wed, 26 Oct 2022 00:32:13 +0000
Subject: [PATCH 0168/1922] Format distributed.py (#87667)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87667
Approved by: https://github.com/zhaojuanmao
---
 torch/nn/parallel/distributed.py | 385 ++++++++++++++++++-------------
 1 file changed, 230 insertions(+), 155 deletions(-)

diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index 2e271f7bb081f..23625d9d20014 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -39,10 +39,11 @@
 from ._replicated_tensor_ddp_utils import _ddp_with_replicated_tensor_enabled
 from .scatter_gather import gather, is_namedtuple, scatter_kwargs  # noqa: F401
 
-__all__ = ['DistributedDataParallel']
+__all__ = ["DistributedDataParallel"]
 
 logger = logging.getLogger(__name__)
 
+
 def _tree_flatten_with_rref(output):
     output_is_rref = RPC_AVAILABLE and isinstance(output, RRef)
     if output_is_rref:
@@ -142,12 +143,14 @@ class _BufferCommHookLocation(Enum):
     PRE_FORWARD = auto()
     POST_FORWARD = auto()
 
+
 @dataclass
 class _BufferCommHook:
     buffer_comm_hook: Callable
     buffer_comm_hook_state: Any
     buffer_comm_hook_location: _BufferCommHookLocation
 
+
 # Add a DDPSink to run various functions when backwards starts, such as
 # queueing call back of out-most backward/graph task,
 # this helps call back is fired after all gradients' calculation
@@ -161,9 +164,7 @@ def forward(ctx, reducer, state_dict, *inputs):
         ctx.reducer = reducer
         ctx.state_dict = state_dict
         ret = tuple(
-            inp.clone()
-            if isinstance(inp, torch.Tensor)
-            else inp
+            inp.clone() if isinstance(inp, torch.Tensor) else inp
             for inp in inputs
         )
         return ret
@@ -173,8 +174,13 @@ def backward(ctx, *grad_outputs):
         state_dict = ctx.state_dict
         # Enqueue delay allreduce for static graph training on the first
         # iteration.
-        if ctx.state_dict['static_graph'] and ctx.state_dict['num_iterations'] == 1:
-            Variable._execution_engine.queue_callback(ctx.reducer._delay_all_reduce)
+        if (
+            ctx.state_dict["static_graph"]
+            and ctx.state_dict["num_iterations"] == 1
+        ):
+            Variable._execution_engine.queue_callback(
+                ctx.reducer._delay_all_reduce
+            )
 
         return (None, None, *grad_outputs)
 
@@ -209,7 +215,9 @@ def main_hook(self):
         ddp._check_and_sync_module_buffers()
 
         # Check if need to sync in the backward pass
-        work = ddp._check_global_requires_backward_grad_sync(is_joined_rank=True)
+        work = ddp._check_global_requires_backward_grad_sync(
+            is_joined_rank=True
+        )
         work.wait()
         should_sync_backwards = work.result()[0].item() != 0
         # Forward parameter sync is disabled in the next iteration if we
@@ -237,6 +245,7 @@ def post_hook(self, is_last_joiner: bool):
         """
         self.ddp._sync_final_model(is_last_joiner)
 
+
 class DistributedDataParallel(Module, Joinable):
     r"""Implements distributed data parallelism that is based on
     ``torch.distributed`` package at the module level.
@@ -556,10 +565,13 @@ def __init__(
 
         if device_ids is not None and len(device_ids) > 1:
             self._log_and_throw(
-                ValueError, "device_ids can only be None or contain a single element."
+                ValueError,
+                "device_ids can only be None or contain a single element.",
             )
 
-        self.is_multi_device_module = len({p.device for p in module.parameters()}) > 1
+        self.is_multi_device_module = (
+            len({p.device for p in module.parameters()}) > 1
+        )
         distinct_device_types = {p.device.type for p in module.parameters()}
         if len(distinct_device_types) != 1:
             self._log_and_throw(
@@ -619,7 +631,9 @@ def __init__(
         else:
             self.parameters_to_ignore = []
 
-        self._use_replicated_tensor_module = _ddp_with_replicated_tensor_enabled()
+        self._use_replicated_tensor_module = (
+            _ddp_with_replicated_tensor_enabled()
+        )
         self._build_replicated_tensor_module()
 
         if check_reduction:
@@ -662,10 +676,15 @@ def __init__(
             params_and_buffers_to_ignore=self.parameters_to_ignore,
         )
         # In debug mode, build a mapping of parameter index -> parameter.
-        param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters)
+        param_to_name_mapping = self._build_debug_param_to_name_mapping(
+            parameters
+        )
         # Builds reducer.
         self._ddp_init_helper(
-            parameters, expect_sparse_gradient, param_to_name_mapping, static_graph
+            parameters,
+            expect_sparse_gradient,
+            param_to_name_mapping,
+            static_graph,
         )
         self._has_rebuilt_buckets = False
 
@@ -678,7 +697,10 @@ def _build_replicated_tensor_module(self):
             # registering '_replicated_tensor_module' as a submodule by directly
             # adding to self.__dict__.
             from ._replicated_tensor_ddp_interop import _replicate_module
-            self.__dict__['_replicated_tensor_module'] = _replicate_module(self.module, self.process_group)
+
+            self.__dict__["_replicated_tensor_module"] = _replicate_module(
+                self.module, self.process_group
+            )
 
     def _log_and_throw(self, err_type, err_msg):
         if self.logger is not None:
@@ -686,8 +708,11 @@ def _log_and_throw(self, err_type, err_msg):
         raise err_type(err_msg)
 
     def _ddp_init_helper(
-        self, parameters, expect_sparse_gradient, param_to_name_mapping,
-        static_graph
+        self,
+        parameters,
+        expect_sparse_gradient,
+        param_to_name_mapping,
+        static_graph,
     ):
         """
         Initialization helper function that does the following:
@@ -720,8 +745,14 @@ def _ddp_init_helper(
         if static_graph is True or self.find_unused_parameters is False:
             bucket_size_limits = [sys.maxsize]
         else:
-            bucket_size_limits = [dist._DEFAULT_FIRST_BUCKET_BYTES, self.bucket_bytes_cap]
-        bucket_indices, per_bucket_size_limits = dist._compute_bucket_assignment_by_size(
+            bucket_size_limits = [
+                dist._DEFAULT_FIRST_BUCKET_BYTES,
+                self.bucket_bytes_cap,
+            ]
+        (
+            bucket_indices,
+            per_bucket_size_limits,
+        ) = dist._compute_bucket_assignment_by_size(
             parameters,
             bucket_size_limits,
             expect_sparse_gradient,
@@ -747,7 +778,7 @@ def _ddp_init_helper(
             param_to_name_mapping,
             # User can set dist._DEFAULT_FIRST_BUCKET_BYTES to tune DDP first
             # bucket.
-            dist._DEFAULT_FIRST_BUCKET_BYTES
+            dist._DEFAULT_FIRST_BUCKET_BYTES,
         )
 
         self.logger = dist.Logger(self.reducer)
@@ -793,10 +824,15 @@ def __setstate__(self, state):
         self.__dict__.setdefault("require_backward_grad_sync", True)
         parameters, expect_sparse_gradient = self._build_params_for_reducer()
         # In debug mode, build a mapping of parameter index -> parameter.
-        param_to_name_mapping = self._build_debug_param_to_name_mapping(parameters)
+        param_to_name_mapping = self._build_debug_param_to_name_mapping(
+            parameters
+        )
         # Builds reducer.
         self._ddp_init_helper(
-            parameters, expect_sparse_gradient, param_to_name_mapping, self.static_graph
+            parameters,
+            expect_sparse_gradient,
+            param_to_name_mapping,
+            self.static_graph,
         )
         if self.static_graph:
             self.reducer._set_static_graph()
@@ -815,7 +851,8 @@ def _build_params_for_reducer(self):
                 # parameters through _former_parameters.
                 for param_name, param in module.named_parameters(recurse=False)
                 if param.requires_grad
-                and f"{module_name}.{param_name}" not in self.parameters_to_ignore
+                and f"{module_name}.{param_name}"
+                not in self.parameters_to_ignore
             ]
         ]
 
@@ -824,7 +861,8 @@ def _build_params_for_reducer(self):
         modules_and_parameters = [
             # "p not in memo" is the deduplication check.
             # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
-            (m, p) for m, p in modules_and_parameters
+            (m, p)
+            for m, p in modules_and_parameters
             if p not in memo and not memo.add(p)
         ]
 
@@ -841,7 +879,10 @@ def produces_sparse_gradient(module):
 
         # Build list of booleans indicating whether or not to expect sparse
         # gradients for the corresponding parameters.
-        expect_sparse_gradient = list(produces_sparse_gradient(module) for module, _ in modules_and_parameters)
+        expect_sparse_gradient = list(
+            produces_sparse_gradient(module)
+            for module, _ in modules_and_parameters
+        )
 
         self._assign_modules_buffers()
 
@@ -862,19 +903,21 @@ def _assign_modules_buffers(self):
             if buffer_name not in self.parameters_to_ignore
         ]
         self.modules_buffers = [
-            buffer
-            for (buffer, buffer_name) in named_module_buffers
+            buffer for (buffer, buffer_name) in named_module_buffers
         ]
         # Dict[str, tensor] representing module buffers not ignored by DDP.
         self.named_module_buffers = {
-            buffer_name: buffer for (buffer, buffer_name) in named_module_buffers
+            buffer_name: buffer
+            for (buffer, buffer_name) in named_module_buffers
         }
 
     def _build_debug_param_to_name_mapping(self, parameters):
         if dist.get_debug_level() == dist.DebugLevel.OFF:
             return {}
 
-        param_to_param_index = {parameters[i]: i for i in range(len(parameters))}
+        param_to_param_index = {
+            parameters[i]: i for i in range(len(parameters))
+        }
         param_set = set(parameters)
         param_index_to_param_fqn = {}
         for module_name, module in self.module.named_modules():
@@ -987,14 +1030,18 @@ def _inside_ddp_forward(self):
             DistributedDataParallel._active_ddp_module = None
 
     def _run_ddp_forward(self, *inputs, **kwargs):
-        module_to_run = self._replicated_tensor_module if self._use_replicated_tensor_module else self.module
+        module_to_run = (
+            self._replicated_tensor_module
+            if self._use_replicated_tensor_module
+            else self.module
+        )
 
         if self.device_ids:
             inputs, kwargs = _to_kwargs(
                 inputs,
                 kwargs,
                 self.device_ids[0],
-                self.use_side_stream_for_tensor_copies
+                self.use_side_stream_for_tensor_copies,
             )
             with self._inside_ddp_forward():
                 return module_to_run(*inputs[0], **kwargs[0])
@@ -1003,7 +1050,9 @@ def _run_ddp_forward(self, *inputs, **kwargs):
                 return module_to_run(*inputs, **kwargs)
 
     def forward(self, *inputs, **kwargs):
-        with torch.autograd.profiler.record_function("DistributedDataParallel.forward"):
+        with torch.autograd.profiler.record_function(
+            "DistributedDataParallel.forward"
+        ):
             if torch.is_grad_enabled() and self.require_backward_grad_sync:
                 self.logger.set_runtime_stats_and_log()
                 self.num_iterations += 1
@@ -1024,18 +1073,22 @@ def forward(self, *inputs, **kwargs):
             # during forward computation.
             # This should be called only once during whole training period.
             if torch.is_grad_enabled() and self.reducer._rebuild_buckets():
-                logger.info("Reducer buckets have been rebuilt in this iteration.")
+                logger.info(
+                    "Reducer buckets have been rebuilt in this iteration."
+                )
                 self._has_rebuilt_buckets = True
 
             # sync params according to location (before/after forward) user
             # specified as part of hook, if hook was specified.
-            buffer_hook_registered = hasattr(self, 'buffer_hook')
+            buffer_hook_registered = hasattr(self, "buffer_hook")
             if self._check_sync_bufs_pre_fwd():
                 self._sync_buffers()
 
             if self._join_config.enable:
                 # Notify joined ranks whether they should sync in backwards pass or not.
-                self._check_global_requires_backward_grad_sync(is_joined_rank=False)
+                self._check_global_requires_backward_grad_sync(
+                    is_joined_rank=False
+                )
 
             output = self._run_ddp_forward(*inputs, **kwargs)
 
@@ -1053,7 +1106,9 @@ def forward(self, *inputs, **kwargs):
                 # unused parameters. Only if `find_unused_parameters` is set.
                 if self.find_unused_parameters and not self.static_graph:
                     # Do not need to populate this for static graph.
-                    self.reducer.prepare_for_backward(list(_find_tensors(output)))
+                    self.reducer.prepare_for_backward(
+                        list(_find_tensors(output))
+                    )
                 else:
                     self.reducer.prepare_for_backward([])
             else:
@@ -1065,13 +1120,15 @@ def forward(self, *inputs, **kwargs):
             self.static_graph and self.num_iterations == 1
         ):
             state_dict = {
-                'static_graph': self.static_graph,
-                'num_iterations': self.num_iterations,
+                "static_graph": self.static_graph,
+                "num_iterations": self.num_iterations,
             }
 
-            output_tensor_list, treespec, output_is_rref = _tree_flatten_with_rref(
-                output
-            )
+            (
+                output_tensor_list,
+                treespec,
+                output_is_rref,
+            ) = _tree_flatten_with_rref(output)
             output_placeholders = [None for _ in range(len(output_tensor_list))]
             # Do not touch tensors that have no grad_fn, which can cause issues
             # such as https://github.com/pytorch/pytorch/issues/60733
@@ -1134,7 +1191,9 @@ def _check_global_requires_backward_grad_sync(self, is_joined_rank):
     # the models have buffers that should be synchronized in the forward pass.
     def _check_and_sync_module_buffers(self):
         if self._check_sync_bufs_pre_fwd():
-            authoritative_rank = self._find_common_rank(self._distributed_rank, False)
+            authoritative_rank = self._find_common_rank(
+                self._distributed_rank, False
+            )
             self._sync_module_buffers(authoritative_rank)
 
     # When running in join model, agrees upon a common rank and broadcast model
@@ -1151,7 +1210,7 @@ def _sync_final_model(self, is_last_joiner):
             process_group=self.process_group,
             broadcast_bucket_size=self.broadcast_bucket_size,
             src=self._authoritative_rank,
-            params_and_buffers_to_ignore=self.parameters_to_ignore
+            params_and_buffers_to_ignore=self.parameters_to_ignore,
         )
 
     # Schedule comm ops to match those scheduled in the reducer's backward
@@ -1315,7 +1374,9 @@ def join_hook(
                 cases for possibly better results.
                 Default is ``True``.
         """
-        divide_by_initial_world_size = kwargs.get("divide_by_initial_world_size", True)
+        divide_by_initial_world_size = kwargs.get(
+            "divide_by_initial_world_size", True
+        )
         return _DDPJoinHook(
             self, divide_by_initial_world_size=divide_by_initial_world_size
         )
@@ -1332,49 +1393,49 @@ def _register_buffer_comm_hook(
         self,
         state,
         hook: callable,
-        comm_hook_location=_BufferCommHookLocation.POST_FORWARD
+        comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
     ):
         r"""
-            Allows custom registration of hooks that define how buffer are
-            synchronized across ranks. The hook takes in an optional state
-            and is passed in a Dict[str, Tensor] corresponding to buffer names
-            and the buffers, and can run arbitrary reductions on buffers as
-            opposed to DDP's default broadcast from rank 0. This is useful for
-            example if a counter needs to be summed or averaged across ranks
-            every iteration.
+        Allows custom registration of hooks that define how buffer are
+        synchronized across ranks. The hook takes in an optional state
+        and is passed in a Dict[str, Tensor] corresponding to buffer names
+        and the buffers, and can run arbitrary reductions on buffers as
+        opposed to DDP's default broadcast from rank 0. This is useful for
+        example if a counter needs to be summed or averaged across ranks
+        every iteration.
 
-            Args:
-                state (Any): Optional state that is passed to the hook.
-                hook (Callable): Callable with the following signature:
-                                ``hook(state: object, buffers: Dict[str, torch.Tensor])
-                                -> Optional[List[torch.futures.Future[torch.Tensor]]]``
-                comm_hook_location (_BufferCommHookLocation): Enum value indicating
-                                where to run the hook.
-                                _BufferCommHookLocation.PRE_FORWARD means that the
-                                hook will run _before_ the forward pass, and
-                                _BufferCommHookLocation.POST_FORWARD means that the
-                                hook will run _after_ the forward pass.
-
-                hook (Callable): Callable with the following signature:
-                             ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``:
+        Args:
+            state (Any): Optional state that is passed to the hook.
+            hook (Callable): Callable with the following signature:
+                            ``hook(state: object, buffers: Dict[str, torch.Tensor])
+                            -> Optional[List[torch.futures.Future[torch.Tensor]]]``
+            comm_hook_location (_BufferCommHookLocation): Enum value indicating
+                            where to run the hook.
+                            _BufferCommHookLocation.PRE_FORWARD means that the
+                            hook will run _before_ the forward pass, and
+                            _BufferCommHookLocation.POST_FORWARD means that the
+                            hook will run _after_ the forward pass.
 
-                NOTE: To maximize performance, users can return a
-                    List[torch.futures.Future] from their hook, and DDP will
-                    install and await these hooks appropriately at the end of
-                    the backward pass. This will ensure all buffers are
-                    synchronized by the end of the backward pass. If this
-                    setting is used, it is recommended to pass
-                    comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
-                    which will trigger the hook after the forward pass.
-                    If _BufferCommHookLocation.PRE_FORWARD is used, users must
-                    ensure appropriate synchronization when manipulating GPU
-                    buffers in the forward pass.
-            """
+            hook (Callable): Callable with the following signature:
+                         ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``:
+
+            NOTE: To maximize performance, users can return a
+                List[torch.futures.Future] from their hook, and DDP will
+                install and await these hooks appropriately at the end of
+                the backward pass. This will ensure all buffers are
+                synchronized by the end of the backward pass. If this
+                setting is used, it is recommended to pass
+                comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
+                which will trigger the hook after the forward pass.
+                If _BufferCommHookLocation.PRE_FORWARD is used, users must
+                ensure appropriate synchronization when manipulating GPU
+                buffers in the forward pass.
+        """
         assert callable(hook)
         self.buffer_hook = _BufferCommHook(
             buffer_comm_hook=hook,
             buffer_comm_hook_state=state,
-            buffer_comm_hook_location=comm_hook_location
+            buffer_comm_hook_location=comm_hook_location,
         )
 
     def register_comm_hook(self, state: object, hook: callable):
@@ -1486,69 +1547,75 @@ def _register_builtin_comm_hook(self, comm_hook_type):
         self.logger._set_comm_hook_name(str(comm_hook_type))
         dist._register_builtin_comm_hook(self.reducer, comm_hook_type)
 
-    def _register_fused_optim(self, optim: Type, *args, optim_params=None, **kwargs):
+    def _register_fused_optim(
+        self, optim: Type, *args, optim_params=None, **kwargs
+    ):
         r"""
-        Registers an optimizer with DDP such that the optimization for a
-        parameter will run immediately when that parameter's gradient is
-        finished with reduction, instead of waiting for all parameters'
-        gradients to finish reduction. This can result in a training speedup
-        depending on your workload since the optimizer can run while gradient
-        reduction for other parameters are still ongoing. In addition, this has
-        the potential to reduce peak memory consumption during training, as it
-        only needs to load the per-parameter optimizer states of a single
-        parameter at a time, instead of loading all per-parameter optimizer
-        states at once.
+            Registers an optimizer with DDP such that the optimization for a
+            parameter will run immediately when that parameter's gradient is
+            finished with reduction, instead of waiting for all parameters'
+            gradients to finish reduction. This can result in a training speedup
+            depending on your workload since the optimizer can run while gradient
+            reduction for other parameters are still ongoing. In addition, this has
+            the potential to reduce peak memory consumption during training, as it
+            only needs to load the per-parameter optimizer states of a single
+            parameter at a time, instead of loading all per-parameter optimizer
+            states at once.
 
-        Args:
-            optim_cls (Type): a ``torch.optim.Optimizer`` class to be registered
-            as a fused optimizer.
-            *args (Sequence[Any]): Arguments to forward to `optim_cls`.
-            optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters
-            to optimize, similar to `params` argument of traditional `torch.optim`
-            Optimizers. If this is omitted, all DDP model parameters will be
-            optimized.
-            **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim_cls`.
-
-    .. warning ::
-        _register_fused_optim should only be called once on a DDP instance,
-        and registering multiple fused optimizers for the same DDP model
-        is not currently supported. Please ping
-        https://github.com/pytorch/pytorch/issues/71595 if this is necessary
-        for your use case.
-
-    .. warning ::
-        _register_fused_optim and register_comm_hook currently do not
-        compose together, meaning that custom DDP communication hooks are
-        not supported with overlapped optimizers. Please ping
-        https://github.com/pytorch/pytorch/issues/71595 if this is necessary
-        for your use case.
-
-    .. warning ::
-        Gradient accumulation and DDP `no_sync` are currently not supported
-        with overlapped optimizer. Please ping
-        https://github.com/pytorch/pytorch/issues/71595 if this is necessary
-        for your use case.
+            Args:
+                optim_cls (Type): a ``torch.optim.Optimizer`` class to be registered
+                as a fused optimizer.
+                *args (Sequence[Any]): Arguments to forward to `optim_cls`.
+                optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters
+                to optimize, similar to `params` argument of traditional `torch.optim`
+                Optimizers. If this is omitted, all DDP model parameters will be
+                optimized.
+                **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim_cls`.
 
-    Example::
+        .. warning ::
+            _register_fused_optim should only be called once on a DDP instance,
+            and registering multiple fused optimizers for the same DDP model
+            is not currently supported. Please ping
+            https://github.com/pytorch/pytorch/issues/71595 if this is necessary
+            for your use case.
 
-        >>> # xdoctest: +SKIP("No rendezvous handler")
-        >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...')
-        >>> net = torch.nn.parallel.DistributedDataParallel(model, pg)
-        >>> lr = 1e-2
-        >>> betas = (0.9, 0.99)
-        >>> eps = 1e-6
-        >>> net._register_fused_optim(torch.optim.Adam, lr, betas=betas, eps=eps)
-        >>> # Example with subset of parameters
-        >>> params_to_opt = [list(net.parameters())[0]]
-        >>> net._register_fused_optim(
-        ...   torch.optim.Adam, lr, optim_params=params_to_opt,  betas=betas, eps=eps
-        ... )
+        .. warning ::
+            _register_fused_optim and register_comm_hook currently do not
+            compose together, meaning that custom DDP communication hooks are
+            not supported with overlapped optimizers. Please ping
+            https://github.com/pytorch/pytorch/issues/71595 if this is necessary
+            for your use case.
+
+        .. warning ::
+            Gradient accumulation and DDP `no_sync` are currently not supported
+            with overlapped optimizer. Please ping
+            https://github.com/pytorch/pytorch/issues/71595 if this is necessary
+            for your use case.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("No rendezvous handler")
+            >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...')
+            >>> net = torch.nn.parallel.DistributedDataParallel(model, pg)
+            >>> lr = 1e-2
+            >>> betas = (0.9, 0.99)
+            >>> eps = 1e-6
+            >>> net._register_fused_optim(torch.optim.Adam, lr, betas=betas, eps=eps)
+            >>> # Example with subset of parameters
+            >>> params_to_opt = [list(net.parameters())[0]]
+            >>> net._register_fused_optim(
+            ...   torch.optim.Adam, lr, optim_params=params_to_opt,  betas=betas, eps=eps
+            ... )
         """
         # Note: importing in function, otherwise this will cause a circular
         # import as optimizer_overlap module needs to import DistributedDataParallel.
-        from torch.distributed.algorithms._optimizer_overlap import _as_overlapped_optim
+        from torch.distributed.algorithms._optimizer_overlap import (
+            _as_overlapped_optim,
+        )
 
-        overlapped_optim = _as_overlapped_optim(optim, optim_params, *args, **kwargs)
+        overlapped_optim = _as_overlapped_optim(
+            optim, optim_params, *args, **kwargs
+        )
         try:
             overlapped_optim.register_ddp(self)
         except NotImplementedError:
@@ -1565,16 +1632,16 @@ def _distributed_broadcast_coalesced(
 
     def _check_sync_bufs_post_fwd(self):
         return (
-            self.will_sync_module_buffers() and
-            hasattr(self, 'buffer_hook') and
-            self.buffer_hook.buffer_comm_hook_location ==
-            _BufferCommHookLocation.POST_FORWARD
+            self.will_sync_module_buffers()
+            and hasattr(self, "buffer_hook")
+            and self.buffer_hook.buffer_comm_hook_location
+            == _BufferCommHookLocation.POST_FORWARD
         )
 
     def _check_sync_bufs_pre_fwd(self):
         return self.will_sync_module_buffers() and (
-            not hasattr(self, 'buffer_hook') or
-            self.buffer_hook.buffer_comm_hook_location
+            not hasattr(self, "buffer_hook")
+            or self.buffer_hook.buffer_comm_hook_location
             == _BufferCommHookLocation.PRE_FORWARD
         )
 
@@ -1621,8 +1688,10 @@ def _sync_buffers(self):
             self._sync_module_buffers(authoritative_rank)
 
     def _sync_module_buffers(self, authoritative_rank):
-        if not hasattr(self, 'buffer_hook'):
-            self._default_broadcast_coalesced(authoritative_rank=authoritative_rank)
+        if not hasattr(self, "buffer_hook"):
+            self._default_broadcast_coalesced(
+                authoritative_rank=authoritative_rank
+            )
         else:
             hook = self.buffer_hook.buffer_comm_hook
             state = self.buffer_hook.buffer_comm_hook_state
@@ -1644,9 +1713,7 @@ def _default_broadcast_coalesced(
             bucket_size = self.broadcast_bucket_size
 
         self._distributed_broadcast_coalesced(
-            bufs,
-            bucket_size,
-            authoritative_rank
+            bufs, bucket_size, authoritative_rank
         )
 
     def _passing_sync_batchnorm_handle(self, module):
@@ -1654,12 +1721,15 @@ def _passing_sync_batchnorm_handle(self, module):
             if isinstance(layer, torch.nn.modules.SyncBatchNorm):
                 if self.device_type == "cpu":
                     self._log_and_throw(
-                        ValueError, "SyncBatchNorm layers only work with GPU modules"
+                        ValueError,
+                        "SyncBatchNorm layers only work with GPU modules",
                     )
 
     def _check_comm_hook(self, hook):
         if not callable(hook):
-            self._log_and_throw(TypeError, "Communication hook must be callable.")
+            self._log_and_throw(
+                TypeError, "Communication hook must be callable."
+            )
 
         sig = inspect.signature(hook)
         if (
@@ -1680,18 +1750,23 @@ def _check_comm_hook(self, hook):
                 "Communication hook: return annotation should be torch.futures.Future[torch.Tensor].",
             )
 
-        if (
-            hook.__name__ in ["bf16_compress_hook", "bf16_compress_wrapper_hook"]
-            and
-            (
-                (torch.version.cuda is None and torch.version.hip is None)
-                or (torch.version.cuda is not None and int(torch.version.cuda.split('.')[0]) < 11)
-                or not dist.is_available()
-                or not dist.is_nccl_available()
-                or torch.cuda.nccl.version() < (2, 10)
+        if hook.__name__ in [
+            "bf16_compress_hook",
+            "bf16_compress_wrapper_hook",
+        ] and (
+            (torch.version.cuda is None and torch.version.hip is None)
+            or (
+                torch.version.cuda is not None
+                and int(torch.version.cuda.split(".")[0]) < 11
             )
+            or not dist.is_available()
+            or not dist.is_nccl_available()
+            or torch.cuda.nccl.version() < (2, 10)
         ):
-            self._log_and_throw(TypeError, "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.10+.")
+            self._log_and_throw(
+                TypeError,
+                "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.10+.",
+            )
 
     @property
     def _distributed_rank(self):

From 862bf333f913b4f8d229a9a68b4b0cb7afcec771 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Wed, 26 Oct 2022 06:33:43 +0000
Subject: [PATCH 0169/1922] Couple fixes for argmax/argmin (#87758)

Removes a wrong assert, makes min number of warps = 2 (1 for some reason generates invalid code, https://github.com/openai/triton/issues/802).
Hopefully fixes https://github.com/pytorch/torchdynamo/issues/1743, cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @mreso

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87758
Approved by: https://github.com/Chillee, https://github.com/soumith
---
 test/inductor/test_torchinductor.py    | 11 +++++++++++
 torch/_inductor/ir.py                  |  1 -
 torch/_inductor/triton_ops/autotune.py |  2 +-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index c0139b3fcdf86..a675fc476672b 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -721,6 +721,17 @@ def fn(a):
 
         self.common(fn, (torch.full((4,), float("-inf")),))
 
+    def test_reduction4(self):
+        if self.device == "cpu":
+            raise unittest.SkipTest("Non-deterministic CPU results")
+
+        def fn(a):
+            return (a.argmax(-1), a.argmin(-1))
+
+        inputs = (torch.ones(128), torch.ones(4, 4, 1))
+        for i in inputs:
+            self.common(fn, (i,))
+
     @patch.object(config, "dynamic_shapes", False)
     def test_unroll_small_reduction(self):
         def fn(x):
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 867e26e56c5ef..7554dc905e23f 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -688,7 +688,6 @@ def create(
             if reduction_type in ("argmin", "argmax"):
 
                 def fn(index):
-                    assert len(index) <= 1
                     return 0
 
             else:
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index 5d53b3522a25c..59ee762c7500a 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -343,7 +343,7 @@ def triton_config_reduction(size_hints, x, r, num_stages=2) -> Config:
         r *= 2
 
     cfg = {"XBLOCK": x, "RBLOCK": r}
-    num_warps = next_power_of_2(min(max(conditional_product(x, r) // 128, 1), 8))
+    num_warps = next_power_of_2(min(max(conditional_product(x, r) // 128, 2), 8))
     return Config(cfg, num_warps=num_warps, num_stages=num_stages)
 
 
From 55e7e0ff21539de56413f923888d9d2e1fa73ddb Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Wed, 26 Oct 2022 14:18:46 +0000
Subject: [PATCH 0170/1922] Remove getitem special handling in the partitioner
 (#87073)

This special handling of getitem unnecessary splits fusions at functions with tuple outputs.

Example script:
```py
import torch
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
from torch._prims.nvfuser_executor import NvfuserPrimOperatorSupport
from torch.fx.experimental.proxy_tensor import make_fx

def func(x):
    xx = torch.ops.nvprims.add(x, 1)
    var, mean = torch.ops.nvprims.var_mean(x, correction=0)
    var_cos = torch.ops.nvprims.cos(var)
    mean_sin = torch.ops.nvprims.sin(mean)
    return torch.ops.nvprims.add(var_cos, mean_sin)

a = torch.randn(5, 3, 3, device="cuda")
gm = make_fx(func)(a)
gm.graph.print_tabular()

supported_ops = NvfuserPrimOperatorSupport()
partitioner = CapabilityBasedPartitioner(
    gm, supported_ops, allows_single_node_partition=False
)
partitions = partitioner.propose_partitions()
print(partitions)
partitioned_graph = partitioner.fuse_partitions(partitions)
partitioned_graph.graph.print_tabular()
```
Output on master:
```py
opcode         name       target                       args              kwargs
-------------  ---------  ---------------------------  ----------------  -----------------
placeholder    x_1        x_1                          ()                {}
call_function  add        nvprims.add.default          (x_1, 1)          {}
call_function  var_mean   nvprims.var_mean.main        (x_1, [0, 1, 2])  {'correction': 0}
call_function  getitem    <built-in function getitem>  (var_mean, 0)     {}
call_function  getitem_1  <built-in function getitem>  (var_mean, 1)     {}
call_function  cos        nvprims.cos.default          (getitem,)        {}
call_function  sin        nvprims.sin.default          (getitem_1,)      {}
call_function  add_1      nvprims.add.default          (cos, sin)        {}
output         output     output                       (add_1,)          {}
[{cos, sin, add_1}, {var_mean, add, getitem, getitem_1}]
opcode         name       target                       args                    kwargs
-------------  ---------  ---------------------------  ----------------------  --------
placeholder    x_1        x_1                          ()                      {}
call_module    fused_1    fused_1                      (x_1,)                  {}
call_function  getitem_2  <built-in function getitem>  (fused_1, 0)            {}
call_function  getitem_3  <built-in function getitem>  (fused_1, 1)            {}
call_module    fused_0    fused_0                      (getitem_2, getitem_3)  {}
output         output     output                       (fused_0,)              {}
```
Output with this PR:
```
[{var_mean, add_1, cos, sin, add, getitem_1, getitem}]
opcode       name     target    args        kwargs
-----------  -------  --------  ----------  --------
placeholder  x_1      x_1       ()          {}
call_module  fused_0  fused_0   (x_1,)      {}
output       output   output    (fused_0,)  {}
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87073
Approved by: https://github.com/jjsjann123, https://github.com/SherlockNoMad
---
 test/test_fx_passes.py               |  9 ++++++++-
 test/test_prims.py                   | 23 +++++++++++++++++++++++
 torch/_prims/nvfuser_executor.py     | 12 ++++++------
 torch/fx/passes/infra/partitioner.py | 20 +++++++++-----------
 4 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/test/test_fx_passes.py b/test/test_fx_passes.py
index 0aa721f34a167..aa04fbac26187 100644
--- a/test/test_fx_passes.py
+++ b/test/test_fx_passes.py
@@ -182,10 +182,16 @@ def forward13(a, b, c):
         c1 = a1 + c
         return b1 + c1
 
+    @staticmethod
+    def forward14(a, b, c):
+        a0, a1 = torch.ops.aten.std_mean(a)
+        out = a0 + 1.0
+        return out
+
 # A mock OperatorSupport class, where only operator.add is supported
 class MockOperatorSupport(OperatorSupport):
     def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
-        return node.op == "call_function" and node.target in {operator.add, operator.getitem}
+        return node.op == "call_function" and node.target in {operator.add, operator.getitem, torch.ops.aten.std_mean}
 
 
 @instantiate_parametrized_tests
@@ -215,6 +221,7 @@ class TestFXGraphPasses(JitTestCase):
 
         # 5 getitem special case
         (TestPartitionFunctions.forward13, [["add_2", "add_1", "add"]]),
+        (TestPartitionFunctions.forward14, [["add", "std_mean", "getitem", "getitem_1"]]),
     ])
     def test_partitioner(self, fn, expected_partition):
         traced = symbolic_trace(fn)
diff --git a/test/test_prims.py b/test/test_prims.py
index 674a032796044..f1b8f897528b9 100644
--- a/test/test_prims.py
+++ b/test/test_prims.py
@@ -215,6 +215,29 @@ def func(a):
         )
         self.assertFalse(include_any_nvprims_sin)
 
+    def test_partitioner_tuple_output(self, device):
+        # This test verifies that the partitioner doesn't segment on nodes with
+        # tuple outputs.
+        from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
+        from torch._prims.nvfuser_executor import NvfuserPrimOperatorSupport
+
+        a = make_tensor(5, 3, 3, device=device, dtype=torch.float32)
+
+        def func(x):
+            xx = torch.ops.nvprims.add(x, 1)
+            var, mean = torch.ops.nvprims.var_mean(x, correction=0)
+            var_cos = torch.ops.nvprims.cos(var)
+            mean_sin = torch.ops.nvprims.sin(mean)
+            return torch.ops.nvprims.add(var_cos, mean_sin)
+
+        gm = make_fx(func)(a)
+        supported_ops = NvfuserPrimOperatorSupport()
+        partitioner = CapabilityBasedPartitioner(
+            gm, supported_ops, allows_single_node_partition=False
+        )
+        partitions = partitioner.propose_partitions()
+        self.assertEqual(len(partitions), 1)
+
     @onlyCUDA
     @skipCUDAIfRocm
     def test_nvfuser_empty_fusion(self, device):
diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py
index e7d3df238bb50..01e566d97874c 100644
--- a/torch/_prims/nvfuser_executor.py
+++ b/torch/_prims/nvfuser_executor.py
@@ -1,3 +1,4 @@
+import operator
 from copy import deepcopy
 from dataclasses import dataclass
 from functools import lru_cache
@@ -89,7 +90,7 @@ def make_nvfuser_fusion(gm: GraphModule, *nv_args_templates):
 
     # Everything in the graph must support nvfuser
     for node in gm.graph.nodes:
-        if node.op == "call_function" and "getitem" in node.name:
+        if node.op == "call_function" and node.target == operator.getitem:
             continue
         if (
             node.op == "call_function"
@@ -152,7 +153,7 @@ def run_node(self, node):
 
             def call_function(self, target, args, kwargs):
                 # This handles tuple unpacking
-                if "getitem" in str(target):
+                if target == operator.getitem:
                     assert isinstance(args[0], tuple)
                     return target(*args, **kwargs)
                 args = tuple(map(_to_nvfuser_constant, args))
@@ -237,10 +238,9 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
                 )
                 is not None
             )
-        return (
-            node.op == "call_function"
-            and getattr(node.target, "impl_nvfuser", None) is not None
-            or "getitem" in node.name  # getitem is a special case
+        return node.op == "call_function" and (
+            getattr(node.target, "impl_nvfuser", None) is not None
+            or node.target == operator.getitem
         )
 
 
diff --git a/torch/fx/passes/infra/partitioner.py b/torch/fx/passes/infra/partitioner.py
index bc2af4c78cb7a..d582f98ecb764 100644
--- a/torch/fx/passes/infra/partitioner.py
+++ b/torch/fx/passes/infra/partitioner.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Set, Iterable
+from typing import Dict, List, Set, Iterable, Optional
 
 from torch.fx.passes.utils.fuser_utils import fuse_by_partitions
 
@@ -44,12 +44,6 @@ def __init__(self,
     def __is_node_supported(self, node: Node) -> bool:
         return (
             self.operator_support.is_node_supported(dict(self.graph_module.named_modules()), node)
-            and
-            # reject 'getitem' node since they are special cased in partitioning.
-            (
-                node.op != "call_function" or
-                _get_qualified_name(node.target) != "_operator.getitem"    # type: ignore[arg-type]
-            )
         )
 
     def propose_partitions(self) -> List[Partition]:
@@ -110,13 +104,17 @@ def dfs_find_cycle(node):
 
             return True
 
-        def merge_single_node(node: Node, id: int):
-            assert node not in assignment
+        def merge_single_node(node: Node, id: Optional[int]):
+            if node in assignment:
+                partitions_by_id[assignment[node]].remove_node(node)
 
-            assignment[node] = id
-            if id not in partitions_by_id:
+            if id is None:
+                assignment.pop(node)
+            elif id not in partitions_by_id:
+                assignment[node] = id
                 partitions_by_id[id] = Partition(id=id, nodes=[node])
             else:
+                assignment[node] = id
                 partitions_by_id[id].add_node(node)
 
         logger.debug("Proposing partitions...")

From ffd6d882b1f180970e374686332390cf38297f13 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@meta.com>
Date: Wed, 26 Oct 2022 14:34:29 +0000
Subject: [PATCH 0171/1922] [Static Runtime] Make canEnableStaticRuntime
 examine sub-blocks (#87396)

Summary:
Someone was running into problems where

1) Static Runtime enablement would fail
2) We would try to fall back to the JIT interpreter *after trying to create `StaticModule`*
3) The fallback fails because Static Runtime mangled the graph.

We don't want to prevent Static Runtime from mutating its input due to memory concerns. The intent of `canEnableStaticRuntime` is to catch issues in the module before Static Runtime messes with it.

With this diff, `StaticModule` instantiation can be avoided by querying `canEnableStaticRuntime` and the issue is fixed.

Test Plan: New unit test

Differential Revision: D40564452

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87396
Approved by: https://github.com/tenpercent
---
 .../static_runtime/test_static_module.cc      | 12 +++++++
 torch/csrc/jit/runtime/static/impl.cpp        | 33 ++++++++++++++-----
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/benchmarks/static_runtime/test_static_module.cc b/benchmarks/static_runtime/test_static_module.cc
index 70d1d1d306939..1574cda3ee24a 100644
--- a/benchmarks/static_runtime/test_static_module.cc
+++ b/benchmarks/static_runtime/test_static_module.cc
@@ -354,6 +354,18 @@ TEST(StaticRuntime, CanEnableStaticRuntime) {
   EXPECT_TRUE(testCanEnableStaticRuntime(is_not_script_none));
 }
 
+TEST(StaticRuntime, CanEnableStaticRuntimeSubBlocks) {
+  const auto src = R"JIT(
+    def forward(self, a: Tensor, b: Tensor, cond: bool):
+        if cond:
+            # aten::__is__ on tensors is blocked
+            return a is b
+        return False
+  )JIT";
+
+  EXPECT_FALSE(testCanEnableStaticRuntime(src));
+}
+
 TEST(StaticRuntime, NestedOutput) {
   // dict of tuple of list
   const auto nested_output_script_0 = R"JIT(
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index 897f3b5eee644..bef31efb50d17 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -56,9 +56,9 @@ namespace jit {
 
 namespace {
 
-bool allArgsAreTensors(Node* node) {
+bool allArgsAreTensors(const Node* node) {
   const auto& inputs = node->inputs();
-  return std::all_of(inputs.begin(), inputs.end(), [](Value* value) {
+  return std::all_of(inputs.begin(), inputs.end(), [](const Value* value) {
     return value->type()->kind() == TypeKind::TensorType;
   });
 }
@@ -69,7 +69,7 @@ bool allArgsAreTensors(Node* node) {
 // These are rarely-used ops. Disallowing them typically eliminates
 // corner cases in graph optimizations, allowing for more aggressive
 // optimizations and better performance.
-bool isUnsupportedOp(Node* node) {
+bool isUnsupportedOp(const Node* node) {
   auto kind = node->kind();
   if (kind != aten::__is__ && kind != aten::__isnot__) {
     return false;
@@ -87,12 +87,21 @@ bool isUnsupportedOp(Node* node) {
   return allArgsAreTensors(node);
 }
 
-// graph must be frozen or canEnableStaticRuntime would return false
-// if there's any prim::CallMethod op left in the graph
-bool canEnableStaticRuntime(const std::shared_ptr<torch::jit::Graph>& graph) {
-  // check for sub-blocks
+namespace {
+
+bool canEnableStaticRuntimeImpl(const Block* block) {
+  if (block == nullptr) {
+    return false;
+  }
+
   bool can_support = true;
-  for (auto* node : graph->block()->nodes()) {
+  for (auto* node : block->nodes()) {
+    for (auto* subblock : node->blocks()) {
+      // The ordering prevents && from short circuiting, which we want -
+      // it's useful to see *all* the unsupported ops.
+      can_support = canEnableStaticRuntimeImpl(subblock) && can_support;
+    }
+
     const auto kind = node->kind();
     if (kind == prim::Constant) {
       continue;
@@ -107,6 +116,14 @@ bool canEnableStaticRuntime(const std::shared_ptr<torch::jit::Graph>& graph) {
   return can_support;
 }
 
+} // namespace
+
+// Graph must be frozen. canEnableStaticRuntime will return false
+// if there's any prim::CallMethod ops left in the graph.
+bool canEnableStaticRuntime(const std::shared_ptr<torch::jit::Graph>& graph) {
+  return canEnableStaticRuntimeImpl(graph->block());
+}
+
 namespace {
 
 auto sr_metadata_registerer = torch::class_<StaticRuntimeMetadata>(

From ecde979c3f0baefc07e391575ee47a0c04cd8d60 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Wed, 26 Oct 2022 14:40:29 +0000
Subject: [PATCH 0172/1922] [ROCm] Use -rpath-link to fix libtinfo conflict
 (#83552)

Fixes issue building PyTorch for ROCm5.3 and above on Ubuntu20.04 because libtinfo6 from conda conflicts with the one from the distro causing symbol not found errors.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/83552
Approved by: https://github.com/malfet
---
 cmake/Dependencies.cmake | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 47f5be14ed9a6..05153a0f75d5b 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1270,6 +1270,21 @@ endif()
 
 # ---[ HIP
 if(USE_ROCM)
+  # This prevents linking in the libtinfo from /opt/conda/lib which conflicts with ROCm libtinfo.
+  # Currently only active for Ubuntu 20.04 and greater versions.
+  if(UNIX)
+    file(STRINGS /etc/os-release OS_RELEASE)
+    string(REGEX REPLACE "NAME=\"([A-Za-z]+).*" "\\1" OS_DISTRO ${OS_RELEASE})
+    string(REGEX REPLACE ".*VERSION_ID=\"([0-9\.]+).*" "\\1" OS_VERSION ${OS_RELEASE})
+    if(OS_DISTRO STREQUAL "Ubuntu" AND OS_VERSION VERSION_GREATER_EQUAL "20.04")
+      find_library(LIBTINFO_LOC tinfo NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH)
+      if(LIBTINFO_LOC)
+        get_filename_component(LIBTINFO_LOC_PARENT ${LIBTINFO_LOC} DIRECTORY)
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath-link,${LIBTINFO_LOC_PARENT}")
+      endif()
+    endif()
+  endif()
+
   include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake)
   if(PYTORCH_FOUND_HIP)
     message(INFO "Compiling with HIP for AMD.")

From b980eaf8ac6847a7a91ff1b85e2efe56ba1722e3 Mon Sep 17 00:00:00 2001
From: Antoni Viros i Martin <aviros@meta.com>
Date: Wed, 26 Oct 2022 14:48:27 +0000
Subject: [PATCH 0173/1922] Implement copy_, fill_, and ones_like for Nested
 Tensors backends (#87728)

Summary: This diff implements copy_ in order to allow pinned memory transfers for nested tensors, as well as fill_ and ones_like, to test whether nested tensors can be created with other factory functions.

Test Plan: Pass all CI and sandcastle jobs.

Reviewed By: mikekgfb

Differential Revision: D40689594

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87728
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml    |  4 +
 .../native/nested/NestedTensorFactories.cpp   | 24 +++++-
 .../cuda/NestedTensorTransformerFunctions.cpp |  2 +-
 test/test_nestedtensor.py                     | 76 ++++++++++++++++++-
 4 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index c1c2b363cb99b..69951d7b2fabf 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1539,6 +1539,7 @@
     SparseCPU, SparseCUDA: copy_sparse_wrapper_
     CompositeExplicitAutograd: copy_
     SparseCsrCPU, SparseCsrCUDA: copy_sparse_compressed_
+    NestedTensorCPU, NestedTensorCUDA: copy_nested_
   autogen: copy.out
 
 - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
@@ -2404,6 +2405,7 @@
     QuantizedCPU, QuantizedCUDA: fill_quantized_
     Meta: fill_meta_
     SparseCsrCPU, SparseCsrCUDA: fill_sparse_csr_
+    NestedTensorCPU, NestedTensorCUDA: fill_nested_
   autogen: fill.Scalar_out
 
 - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
@@ -2414,6 +2416,7 @@
     MPS: fill_tensor_mps_
     QuantizedCPU, QuantizedCUDA: fill_quantized_
     Meta: fill_meta_
+    NestedTensorCPU, NestedTensorCUDA: fill_nested_
   autogen: fill.Tensor_out
 
 - func: floor(Tensor self) -> Tensor
@@ -3863,6 +3866,7 @@
     # NB: Although this composite mutates on the inside, it is
     # non-differentiable so NonFunctional doesn't apply
     CompositeExplicitAutograd: ones_like
+    NestedTensorCPU, NestedTensorCUDA: ones_like
   autogen: ones_like.out
 
 - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor
diff --git a/aten/src/ATen/native/nested/NestedTensorFactories.cpp b/aten/src/ATen/native/nested/NestedTensorFactories.cpp
index 998a62eb136d1..01e72649bd3ff 100644
--- a/aten/src/ATen/native/nested/NestedTensorFactories.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorFactories.cpp
@@ -106,9 +106,31 @@ Tensor _to_copy_nested(
   Tensor r;
   r = at::empty_like(self, dtype, layout, device, pin_out, memory_format);
   get_nested_tensor_impl(r)->get_buffer().copy_(
-      get_nested_tensor_impl(self)->get_buffer());
+      get_nested_tensor_impl(self)->get_buffer(), non_blocking);
   return r;
 }
 
+Tensor& copy_nested_(Tensor& self, const Tensor& src, bool non_blocking) {
+  const auto* nt_self = get_nested_tensor_impl(self);
+  const auto* nt_src = get_nested_tensor_impl(src);
+  TORCH_CHECK(
+      at::equal(nt_self->get_nested_size_tensor(), nt_src->get_nested_size_tensor()),
+      "copy_ only supports tensors that are the same size for Nested implementations");
+  nt_self->get_buffer().copy_(nt_src->get_buffer(), non_blocking);
+  return self;
+}
+
+Tensor& fill_nested_(Tensor& self, const Scalar& value) {
+  const auto& self_buf = get_nested_tensor_impl(self)->get_buffer();
+  self_buf.fill_(value);
+  return self;
+}
+
+Tensor& fill_nested_(Tensor& self, const Tensor& value) {
+  const auto& self_buf = get_nested_tensor_impl(self)->get_buffer();
+  self_buf.fill_(value);
+  return self;
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index 4028c8d5c3e4b..307fc20721d60 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -53,7 +53,7 @@ Tensor nested_from_padded_cuda(
     TORCH_CHECK(
         (padded.dim() == 4 && do_transform_0213) ||
             (padded.dim() == 3 && !do_transform_0213),
-        "padded tensor size error");
+        "padded tensor size error: ", padded.dim());
     Tensor target_offsets =
         NestedTensor_batch_offsets_from_size_tensor(sizes, 0);
     Tensor padded_sizes_tensor = at::tensor(padded.sizes());
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index 7eb7dead38d3d..f51db59958696 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -1,18 +1,26 @@
 # Owner(s): ["module: nestedtensor"]
 
+import unittest
+
 import torch
 import torch.nn
-import unittest
 from torch.testing._internal.common_device_type import (
     dtypes,
     dtypesIfCUDA,
     instantiate_device_type_tests,
-    skipMeta,
+    onlyCPU,
     onlyCUDA,
-    onlyCPU
+    skipMeta,
 )
 from torch.testing._internal.common_dtype import floating_types_and_half
-from torch.testing._internal.common_utils import TestCase, IS_FBCODE, run_tests, freeze_rng_state, parametrize, gradcheck
+from torch.testing._internal.common_utils import (
+    freeze_rng_state,
+    gradcheck,
+    IS_FBCODE,
+    parametrize,
+    run_tests,
+    TestCase,
+)
 
 # Tests are ported from pytorch/nestedtensor.
 # This makes porting as_nested_tensor easier in the future.
@@ -365,6 +373,66 @@ def test_data_ptr(getter):
                     self.assertIs(torch.int32, nt2.to(dtype=torch.int32).dtype)
                     self.assertEqual(nt2.device, nt2.to(dtype=torch.int32).device)
 
+    def test_copy_(self):
+        ntensors = 4
+        nt = random_nt(torch.device('cpu'), torch.float32, ntensors, (4, 4))
+        nt_copy = torch.empty_like(nt)
+        nt_copy.copy_(nt)
+
+        for (nt_ub, nt_copy_ub) in zip(nt.unbind(), nt_copy):
+            self.assertEqual(nt_ub, nt_copy_ub)
+
+        nt_error = torch.nested.nested_tensor([torch.tensor([0, 0])])
+        self.assertRaisesRegex(
+            RuntimeError,
+            "copy_ only supports tensors that are the same size for Nested implementations",
+            lambda: nt_error.copy_(nt)
+        )
+
+        if torch.cuda.is_available():
+            nt = random_nt(torch.device('cuda'), torch.float32, ntensors, (4, 4))
+            nt_copy = torch.empty_like(nt, device=torch.device('cpu'))
+            nt_copy.copy_(nt, non_blocking=True)
+            torch.cuda.current_stream(torch.cuda.current_device()).synchronize()
+            for (nt_ub, nt_copy_ub) in zip(nt.unbind(), nt_copy):
+                self.assertEqual(nt_ub, nt_copy_ub)
+
+            nt_copy = torch.empty_like(nt, device=torch.device('cpu'))
+            nt_copy.copy_(nt, non_blocking=False)
+            for (nt_ub, nt_copy_ub) in zip(nt.unbind(), nt_copy):
+                self.assertEqual(nt_ub, nt_copy_ub)
+
+    def test_fill_(self):
+        ntensors = 4
+        nt = random_nt(torch.device('cpu'), torch.float32, ntensors, (4, 4))
+        nt.fill_(10.)
+        for nt_ub in nt.unbind():
+            t = torch.empty_like(nt_ub)
+            t.fill_(10.)
+            self.assertEqual(nt_ub, t)
+
+        fill_tensor = torch.tensor([11.])
+        self.assertRaisesRegex(
+            RuntimeError,
+            "fill_ only supports 0-dimension value tensor",
+            lambda: nt.fill_(fill_tensor)
+        )
+
+        nt.fill_(fill_tensor[0])
+        for nt_ub in nt.unbind():
+            t = torch.empty_like(nt_ub)
+            t.fill_(11.)
+            self.assertEqual(nt_ub, t)
+
+    def test_ones_like(self):
+        ntensors = 4
+        nt = random_nt(torch.device('cpu'), torch.float32, ntensors, (4, 4))
+        ones_nt = torch.ones_like(nt)
+
+        for nt_ub in ones_nt.unbind():
+            t = torch.ones_like(nt_ub)
+            self.assertEqual(nt_ub, t)
+
 
 class TestNestedTensorDeviceType(TestCase):
 

From b53172b61675f02a14eb6276a97a152cdda8212b Mon Sep 17 00:00:00 2001
From: Pruthvi Madugundu <pmagundu@amd.com>
Date: Wed, 26 Oct 2022 15:34:38 +0000
Subject: [PATCH 0174/1922] [ROCm] Move ROCm CI build to python 3.8 version
 (#86677)

Currently it is python 3.7 want to upgrade to python 3.8
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86677
Approved by: https://github.com/malfet
---
 .circleci/docker/build.sh           |  8 +++----
 .github/workflows/docker-builds.yml |  4 ++--
 .github/workflows/periodic.yml      | 36 ++++++++++++++---------------
 .github/workflows/pull.yml          |  8 +++----
 .github/workflows/trunk.yml         | 18 +++++++--------
 5 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh
index 7633f1eacac09..b38456badc271 100755
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@@ -259,8 +259,8 @@ case "$image" in
     VISION=yes
     CONDA_CMAKE=yes
     ;;
-  pytorch-linux-focal-rocm5.1-py3.7)
-    ANACONDA_PYTHON_VERSION=3.7
+  pytorch-linux-focal-rocm5.1-py3.8)
+    ANACONDA_PYTHON_VERSION=3.8
     GCC_VERSION=9
     PROTOBUF=yes
     DB=yes
@@ -268,8 +268,8 @@ case "$image" in
     ROCM_VERSION=5.1.1
     CONDA_CMAKE=yes
     ;;
-  pytorch-linux-focal-rocm5.2-py3.7)
-    ANACONDA_PYTHON_VERSION=3.7
+  pytorch-linux-focal-rocm5.2-py3.8)
+    ANACONDA_PYTHON_VERSION=3.8
     GCC_VERSION=9
     PROTOBUF=yes
     DB=yes
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 62699dde2243d..572d8146ebe51 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -38,8 +38,8 @@ jobs:
           - docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
           - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
           - docker-image-name: pytorch-linux-bionic-py3.7-clang9
-          - docker-image-name: pytorch-linux-focal-rocm5.1-py3.7
-          - docker-image-name: pytorch-linux-focal-rocm5.2-py3.7
+          - docker-image-name: pytorch-linux-focal-rocm5.1-py3.8
+          - docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
           - docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12
           - docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12
           - docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index e0b69e6b6d91e..58e379e0b5fd2 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -34,20 +34,20 @@ jobs:
       docker-image: ${{ needs.linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
 
-  linux-focal-rocm5_2-py3_7-slow-build:
-    name: linux-focal-rocm5.2-py3.7-slow
+  linux-focal-rocm5_2-py3_8-slow-build:
+    name: linux-focal-rocm5.2-py3.8-slow
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-rocm5.2-py3.7
-      docker-image-name: pytorch-linux-focal-rocm5.2-py3.7
+      build-environment: linux-focal-rocm5.2-py3.8
+      docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
 
-  linux-focal-rocm5_2-py3_7-slow-test:
-    name: linux-focal-rocm5.2-py3.7-slow
+  linux-focal-rocm5_2-py3_8-slow-test:
+    name: linux-focal-rocm5.2-py3.8-slow
     uses: ./.github/workflows/_rocm-test.yml
-    needs: linux-focal-rocm5_2-py3_7-slow-build
+    needs: linux-focal-rocm5_2-py3_8-slow-build
     with:
-      build-environment: linux-focal-rocm5.2-py3.7
-      docker-image: ${{ needs.linux-focal-rocm5_2-py3_7-slow-build.outputs.docker-image }}
+      build-environment: linux-focal-rocm5.2-py3.8
+      docker-image: ${{ needs.linux-focal-rocm5_2-py3_8-slow-build.outputs.docker-image }}
       test-matrix: |
         { include: [
           { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
@@ -56,20 +56,20 @@ jobs:
       AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
       AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
 
-  linux-focal-rocm5_2-py3_7-distributed-build:
-    name: linux-focal-rocm5.2-py3.7-distributed
+  linux-focal-rocm5_2-py3_8-distributed-build:
+    name: linux-focal-rocm5.2-py3.8-distributed
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-rocm5.2-py3.7
-      docker-image-name: pytorch-linux-focal-rocm5.2-py3.7
+      build-environment: linux-focal-rocm5.2-py3.8
+      docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
 
-  linux-focal-rocm5_2-py3_7-distributed-test:
-    name: linux-focal-rocm5.2-py3.7-distributed
+  linux-focal-rocm5_2-py3_8-distributed-test:
+    name: linux-focal-rocm5.2-py3.8-distributed
     uses: ./.github/workflows/_rocm-test.yml
-    needs: linux-focal-rocm5_2-py3_7-distributed-build
+    needs: linux-focal-rocm5_2-py3_8-distributed-build
     with:
-      build-environment: linux-focal-rocm5.2-py3.7
-      docker-image: ${{ needs.linux-focal-rocm5_2-py3_7-distributed-build.outputs.docker-image }}
+      build-environment: linux-focal-rocm5.2-py3.8
+      docker-image: ${{ needs.linux-focal-rocm5_2-py3_8-distributed-build.outputs.docker-image }}
       test-matrix: |
         { include: [
           { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index faea02440bfa6..cc25bfc1326d1 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -298,12 +298,12 @@ jobs:
       docker-image-name: pytorch-linux-focal-py3.7-gcc7
       build-generates-artifacts: false
 
-  linux-focal-rocm5_2-py3_7-build:
+  linux-focal-rocm5_2-py3_8-build:
     # don't run build twice on master
     if: github.event_name == 'pull_request'
-    name: linux-focal-rocm5.2-py3.7
+    name: linux-focal-rocm5.2-py3.8
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-rocm5.2-py3.7
-      docker-image-name: pytorch-linux-focal-rocm5.2-py3.7
+      build-environment: linux-focal-rocm5.2-py3.8
+      docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
       sync-tag: rocm-build
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index af348a84556c9..29dc9f3c44d3f 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -284,21 +284,21 @@ jobs:
       cuda-version: "11.6"
       test-matrix: ${{ needs.win-vs2019-cuda11_6-py3-build.outputs.test-matrix }}
 
-  linux-focal-rocm5_2-py3_7-build:
-    name: linux-focal-rocm5.2-py3.7
+  linux-focal-rocm5_2-py3_8-build:
+    name: linux-focal-rocm5.2-py3.8
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-focal-rocm5.2-py3.7
-      docker-image-name: pytorch-linux-focal-rocm5.2-py3.7
+      build-environment: linux-focal-rocm5.2-py3.8
+      docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
       sync-tag: rocm-build
 
-  linux-focal-rocm5_2-py3_7-test:
-    name: linux-focal-rocm5.2-py3.7
+  linux-focal-rocm5_2-py3_8-test:
+    name: linux-focal-rocm5.2-py3.8
     uses: ./.github/workflows/_rocm-test.yml
-    needs: linux-focal-rocm5_2-py3_7-build
+    needs: linux-focal-rocm5_2-py3_8-build
     with:
-      build-environment: linux-focal-rocm5.2-py3.7
-      docker-image: ${{ needs.linux-focal-rocm5_2-py3_7-build.outputs.docker-image }}
+      build-environment: linux-focal-rocm5.2-py3.8
+      docker-image: ${{ needs.linux-focal-rocm5_2-py3_8-build.outputs.docker-image }}
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },

From 342f5f9e82c616f5cc3c83d08a280f6901fdac9f Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Tue, 25 Oct 2022 22:59:58 +0000
Subject: [PATCH 0175/1922] [FSDP][BE] Improve the assert message of sharded
 load_state_dict (#87486)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87486
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/_state_dict_utils.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index 33fa0d441107b..ed4b8f226c123 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -21,6 +21,7 @@
 )
 
 from ._fsdp_extensions import _ext_chunk_tensor, _ext_pre_load_state_dict_transform
+from ._fsdp_extensions import _extensions as _user_extensions
 from .flat_param import (
     FlatParamHandle,
 )
@@ -288,6 +289,7 @@ def _sharded_pre_load_state_dict_hook(
 
     nonsharded_tensors = []
     shared_fqns = [fqn for fqn, _, _ in module._shared_param_fqns]
+    loaded_shapes = []
     for fqn, _, _ in module._param_fqns:
         full_fqn = f"{prefix}{FSDP.FSDP_PREFIX}{fqn}"
         param = state_dict.pop(full_fqn)
@@ -295,6 +297,7 @@ def _sharded_pre_load_state_dict_hook(
             continue
         # All-gather the param (ShardedTensor)
         param, shards = _ext_pre_load_state_dict_transform(param)
+        loaded_shapes.append(param.size())
         assert len(shards) < 2, (
             "Expects 0 or 1 shard per rank "
             f"but got {len(shards)} shards on rank {module.rank}."
@@ -331,6 +334,11 @@ def _sharded_pre_load_state_dict_hook(
         loaded_flat_param, module.rank, module.world_size,
     )
     loaded_flat_tensor.to(flat_param.device)
+    assert all(s1 == s2 for s1, s2 in zip(loaded_shapes, flat_param._shapes)), (
+        f"The original shapes in FSDP are {flat_param._shapes}. "
+        f"The loaded shapes are {loaded_shapes}. "
+        f"FSDP extension is {'NOT' if _user_extensions is None else ''} None."
+    )
     assert flat_param.numel() == loaded_flat_tensor.numel(), (
         f"The loaded local chunk has different numel({loaded_flat_tensor.numel()}) "
         f"from the local chunk {flat_param.numel()}."

From 8448407ea789263a08a027597d753f1d0b6a6288 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Tue, 25 Oct 2022 06:58:11 -0700
Subject: [PATCH 0176/1922] Add test that `import torch` doesn't modify global
 logging state (#87629)

Fixes https://github.com/pytorch/pytorch/issues/87626

Also adds the same test for `import functorch`. Users have complained at
us when we do modify the global logging state, which has happened in the
past.

Test Plan:
- tested locally; I added `logging.basicConfig` to `torch/__init__.py`
and checked that the test got triggered
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87629
Approved by: https://github.com/albanD
---
 test/test_testing.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/test/test_testing.py b/test/test_testing.py
index 3ad6ff06c771e..ccb2471e71e7b 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -1818,6 +1818,27 @@ def test_no_warning_on_import(self) -> None:
             cwd=os.path.dirname(os.path.realpath(__file__)),).decode("utf-8")
         self.assertEquals(out, "")
 
+    @unittest.skipIf(IS_WINDOWS, "importing torch+CUDA on CPU results in warning")
+    @parametrize('path', ['torch', 'functorch'])
+    def test_no_mutate_global_logging_on_import(self, path) -> None:
+        # Calling logging.basicConfig, among other things, modifies the global
+        # logging state. It is not OK to modify the global logging state on
+        # `import torch` (or other submodules we own) because users do not expect it.
+        expected = 'abcdefghijklmnopqrstuvwxyz'
+        commands = [
+            'import logging',
+            f'import {path}',
+            '_logger = logging.getLogger("torch_test_testing")',
+            'logging.root.addHandler(logging.StreamHandler())',
+            'logging.root.setLevel(logging.INFO)',
+            f'_logger.info("{expected}")'
+        ]
+        out = subprocess.check_output(
+            [sys.executable, "-W", "all", "-c", "; ".join(commands)],
+            stderr=subprocess.STDOUT,
+        ).decode("utf-8")
+        self.assertEqual(out.strip(), expected)
+
 class TestOpInfos(TestCase):
     def test_sample_input(self) -> None:
         a, b, c, d, e = [object() for _ in range(5)]
@@ -1913,6 +1934,7 @@ def test_opinfo_error_generators(self, device, op):
 
 
 instantiate_device_type_tests(TestOpInfoSampleFunctions, globals())
+instantiate_parametrized_tests(TestImports)
 
 
 if __name__ == '__main__':

From 7dcfbdcec11d5a2b98bf6ca35e8aedf256b33a63 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 26 Oct 2022 16:20:46 +0000
Subject: [PATCH 0177/1922] optim utils all_gather_into_tensor (#87769)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87769
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/_optim_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index 7200e6f207a5f..a5e1ab64278e5 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -193,7 +193,7 @@ def _communicate_optim_state(
             # has the same shape as the sharded flattened parameter
             buffer_size = flat_param._full_param_padded.size()  # type: ignore[attr-defined]
             tensor_buffer = value.new_zeros(*buffer_size)
-            dist._all_gather_base(tensor_buffer, value, group=group)
+            dist.all_gather_into_tensor(tensor_buffer, value, group=group)
             torch.cuda.synchronize()
             if to_save:
                 unpadded_numel = flat_param._unpadded_unsharded_size.numel()  # type: ignore[attr-defined]

From 4a93e2aa2536cee4ff1e66dd9fc4dcb5905f7c01 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 26 Oct 2022 16:43:13 +0000
Subject: [PATCH 0178/1922] Revert "[ROCm] Use -rpath-link to fix libtinfo
 conflict (#83552)"

This reverts commit a10446c4d826ae5505fa129ea9800d3924b25364.

Reverted https://github.com/pytorch/pytorch/pull/83552 on behalf of https://github.com/kit1980 due to Broke ios/macos builds https://github.com/pytorch/pytorch/actions/runs/3329991911/jobs/5507911292
---
 cmake/Dependencies.cmake | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 05153a0f75d5b..47f5be14ed9a6 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1270,21 +1270,6 @@ endif()
 
 # ---[ HIP
 if(USE_ROCM)
-  # This prevents linking in the libtinfo from /opt/conda/lib which conflicts with ROCm libtinfo.
-  # Currently only active for Ubuntu 20.04 and greater versions.
-  if(UNIX)
-    file(STRINGS /etc/os-release OS_RELEASE)
-    string(REGEX REPLACE "NAME=\"([A-Za-z]+).*" "\\1" OS_DISTRO ${OS_RELEASE})
-    string(REGEX REPLACE ".*VERSION_ID=\"([0-9\.]+).*" "\\1" OS_VERSION ${OS_RELEASE})
-    if(OS_DISTRO STREQUAL "Ubuntu" AND OS_VERSION VERSION_GREATER_EQUAL "20.04")
-      find_library(LIBTINFO_LOC tinfo NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH)
-      if(LIBTINFO_LOC)
-        get_filename_component(LIBTINFO_LOC_PARENT ${LIBTINFO_LOC} DIRECTORY)
-        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath-link,${LIBTINFO_LOC_PARENT}")
-      endif()
-    endif()
-  endif()
-
   include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake)
   if(PYTORCH_FOUND_HIP)
     message(INFO "Compiling with HIP for AMD.")

From b0c8e63a4d8459ec9cce9e7da6938b4a6120fbe1 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <IvanYashchuk@users.noreply.github.com>
Date: Wed, 26 Oct 2022 17:00:02 +0000
Subject: [PATCH 0179/1922] Enable nvprims.transpose fusions for nvFuser
 (#86967)

This PR allows transposes to be fused with other operations. If a fusion group is formed only from operations that just manipulate metadata in PyTorch (transpose, view, etc.) then this group is not sent to nvFuser.
On top of that if we have converted to `nvprims` but then decided to not form a fusion group we modify the graph use `prim.impl_aten` attribute instead of calling `prim(*args, **kwargs)` that has a higher overhead.

cc @kevinstephano @jjsjann123
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86967
Approved by: https://github.com/jjsjann123, https://github.com/SherlockNoMad
---
 test/test_prims.py                   | 26 +++++++++++++++++++++
 torch/_prims/__init__.py             |  1 +
 torch/_prims/context.py              |  4 ----
 torch/_prims/nvfuser_executor.py     | 35 ++++++++++++++++++++++++++--
 torch/_prims/nvfuser_prims.py        |  8 ++++++-
 torch/fx/passes/infra/partitioner.py | 18 +++++++++++---
 6 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/test/test_prims.py b/test/test_prims.py
index f1b8f897528b9..6223a34e0a3a9 100644
--- a/test/test_prims.py
+++ b/test/test_prims.py
@@ -875,6 +875,32 @@ def func7(a):
             out = execute(gm, a, executor="strictly_nvfuser")
             self.assertEqual(out, func(a))
 
+    @onlyCUDA
+    @skipCUDAIfRocm
+    @dtypes(torch.float16, torch.float32)
+    def test_nvprims_view_partitioner(self, device, dtype):
+        # This test verifies that views that are not fused with other ops are
+        # correctly overriden to call aten implementation.
+        from torch.fx.experimental.proxy_tensor import make_fx
+        from torch._prims.context import TorchRefsNvfuserCapabilityMode
+        from torch._prims.nvfuser_executor import maybe_partition_graph
+
+        make_arg = partial(make_tensor, device=device, dtype=dtype)
+        a = make_arg((4, 5))
+        b = make_arg((5, 4))
+
+        def func(a, b):
+            aa = a.view(b.shape)
+            aa = aa.view(a.shape)
+            return aa.digamma()
+
+        with TorchRefsNvfuserCapabilityMode():
+            gm = make_fx(func)(a, b)
+        gm, _ = maybe_partition_graph(gm, False, False)
+
+        out = gm(a, b)
+        self.assertEqual(out, func(a, b))
+
     @onlyCUDA
     @skipCUDAIfRocm
     @dtypes(torch.float32, torch.float16)
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index 3248009ee66e5..b54019ef031c9 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -306,6 +306,7 @@ def _backend_select_impl(*args, **kwargs):
         p.schema = schema
         p.prim_impl = _prim_impl
         p.prim_meta_impl = meta
+        p.impl_aten = impl_aten
 
     return _prim
 
diff --git a/torch/_prims/context.py b/torch/_prims/context.py
index 2bcee069d146c..203d73fd948eb 100644
--- a/torch/_prims/context.py
+++ b/torch/_prims/context.py
@@ -254,10 +254,6 @@ def _is_func_unsupported_nvfuser(
 class TorchRefsNvfuserCapabilityMode(TorchRefsMode):
     def __init__(self, *, skip_ops=()):
         aten_ops_to_skip = (
-            "aten.transpose.int",
-            "aten.t.default",
-            "aten.unsqueeze.default",
-            "aten.permute.default",
             "aten._log_softmax.default",
             "aten._log_softmax_backward_data.default",
             "aten.expand.default",
diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py
index 01e566d97874c..227e1847265bb 100644
--- a/torch/_prims/nvfuser_executor.py
+++ b/torch/_prims/nvfuser_executor.py
@@ -30,7 +30,7 @@
 DEFAULT_NVFUSER_PYTHON_CONFIG = MappingProxyType(
     {
         "use_python_fusion_cache": True,
-        "allow_single_op_fusion": True,
+        "allow_single_op_fusion": False,
     }
 )
 
@@ -268,6 +268,23 @@ def __call__(self, *args):
         )
 
 
+# A set of operators that are supported by nvFuser
+# but should not form a fusion group solely on their own
+_non_compute_ops = [
+    "torch.ops." + str(getattr(torch.ops.nvprims, prim).default)
+    for prim in dir(torch.ops.nvprims)
+    if isinstance(getattr(torch.ops.nvprims, prim), torch._ops.OpOverloadPacket)
+    and getattr(torch.ops.nvprims, prim).return_type
+    == torch._prims_common.RETURN_TYPE.VIEW
+]
+
+_allowed_single_node_partition_ops = [
+    "torch.ops.nvprims.native_batch_norm.default",
+    "torch.ops.nvprims.var_mean.default",
+    "torch.ops.nvprims.var_mean.main",
+]
+
+
 def _remove_empty_like_fill(gm: GraphModule):
     # Remove empty_like + fill nodes that prevent lowering to nvprims
     # This is a workaround for nonoptimal traces of C++ code `(1 - tensor)`
@@ -325,7 +342,11 @@ def maybe_partition_graph(
         # CapabilityBasedPartitioner modifies the graph in-place so we need to make a copy of the graph
         gm = deepcopy(gm)
         partitioner = CapabilityBasedPartitioner(
-            gm, supported_ops, allows_single_node_partition=allow_single_op_fusion
+            gm,
+            supported_ops,
+            allows_single_node_partition=allow_single_op_fusion,
+            non_compute_ops=_non_compute_ops,
+            allowed_single_node_partition_ops=_allowed_single_node_partition_ops,
         )
         partitions = partitioner.propose_partitions()
         if len(partitions) == 0:
@@ -350,6 +371,16 @@ def maybe_partition_graph(
                     NvfuserGraphModule(nvfuser_submodule, use_python_fusion_cache),
                 )
 
+        # Go through the graph and replace all the nodes that were converted to
+        # nvprims but won't be sent to nvFuser with a call to PyTorch's eager
+        # mode. This is necessary because torch.ops.* have higher overhead than
+        # calling the eager mode directly.
+        for node in partitioned_graph.graph.nodes:
+            if node.op == "call_function" and str(node.target).startswith("nvprims."):
+                if getattr(node.target, "impl_aten", None) is not None:
+                    node.target = node.target.impl_aten
+        partitioned_graph.graph.eliminate_dead_code()
+        partitioned_graph.recompile()
         return partitioned_graph, any_unsupported
     else:
         return gm, any_unsupported
diff --git a/torch/_prims/nvfuser_prims.py b/torch/_prims/nvfuser_prims.py
index d4132b356473a..f37a21459e0cd 100644
--- a/torch/_prims/nvfuser_prims.py
+++ b/torch/_prims/nvfuser_prims.py
@@ -538,6 +538,10 @@ def _var_mean_autograd(
         p.return_type = torch._prims_common.RETURN_TYPE.NEW  # type: ignore[attr-defined]
 
 
+def _nvprims_view_impl_aten(a, original_shape, new_shape):
+    return a.reshape(new_shape)
+
+
 def register_view():
     """This function is used to register the view function in torch.ops.view module."""
     # View is implemented as a decomposition into prims.split_dim,
@@ -568,7 +572,8 @@ def _view_no_original_shape_overload_impl(a, shape):
     for p in (prim_packet, prim):
         p.__doc__ = "Creates a tensor with the specified shape containing a copy of the data in a."
         p.impl_nvfuser = _nvfuser_impls["view"]
-        p.return_type = torch._prims_common.RETURN_TYPE.NEW  # type: ignore[attr-defined]
+        p.return_type = torch._prims_common.RETURN_TYPE.VIEW  # type: ignore[attr-defined]
+        p.impl_aten = _nvprims_view_impl_aten
 
 
 def register_nvprims():
@@ -594,3 +599,4 @@ def register_nvprims():
             p.__doc__ = main_prim.__doc__
             p.impl_nvfuser = _nvfuser_impls[name]
             p.return_type = main_prim.return_type  # type: ignore[attr-defined]
+            p.impl_aten = main_prim.impl_aten
diff --git a/torch/fx/passes/infra/partitioner.py b/torch/fx/passes/infra/partitioner.py
index d582f98ecb764..5f5a808b85121 100644
--- a/torch/fx/passes/infra/partitioner.py
+++ b/torch/fx/passes/infra/partitioner.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Set, Iterable, Optional
+from typing import Dict, List, Set, Iterable, Sequence, Optional
 
 from torch.fx.passes.utils.fuser_utils import fuse_by_partitions
 
@@ -35,11 +35,19 @@ class CapabilityBasedPartitioner:
     def __init__(self,
                  graph_module: GraphModule,
                  operator_support: OperatorSupportBase,
-                 allows_single_node_partition: bool = False
+                 allows_single_node_partition: bool = False,
+                 non_compute_ops: Optional[Sequence[str]] = None,
+                 allowed_single_node_partition_ops: Optional[Sequence[str]] = None,
                  ) -> None:
         self.graph_module = graph_module
         self.operator_support = operator_support
         self.allows_single_node_partition = allows_single_node_partition
+        self.non_compute_ops = non_compute_ops if non_compute_ops is not None else []
+        self.allowed_single_node_partition_ops = (
+            allowed_single_node_partition_ops
+            if allowed_single_node_partition_ops is not None
+            else []
+        )
 
     def __is_node_supported(self, node: Node) -> bool:
         return (
@@ -169,7 +177,8 @@ def merge_single_node(node: Node, id: Optional[int]):
         # filter out single node partitions
         if not self.allows_single_node_partition:
             logger.debug("Filtering out single node partitions...")
-            non_compute_ops = {"torch.ops.aten.view", "_operator.getitem"}
+            default_non_compute_ops = {"torch.ops.aten.view", "_operator.getitem"}
+            non_compute_ops = default_non_compute_ops.union(set(self.non_compute_ops))
             partitions_to_remove: List[int] = []
             for id, partition in partitions_by_id.items():
                 compute_node_count = 0
@@ -177,6 +186,9 @@ def merge_single_node(node: Node, id: Optional[int]):
                     if node.op == "call_function" and \
                        _get_qualified_name(node.target) not in non_compute_ops:  # type: ignore[arg-type]
                         compute_node_count += 1
+                    if node.op == "call_function" and \
+                       _get_qualified_name(node.target) in self.allowed_single_node_partition_ops:
+                        compute_node_count += 1
                 if compute_node_count <= 1:
                     partitions_to_remove.append(id)
             for id in partitions_to_remove:

From d0586995d9d0e3b6e33a025d1ff90c706cd29a67 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Wed, 26 Oct 2022 10:26:44 -0400
Subject: [PATCH 0180/1922] Many symintifications (#87604)

Adds
expand_inplace
conv conv_double_backward
convolution
adaptive_avg_pool2d_symint
_embedding_bag_backward_symint
cudnn_grid_sampler
cuda 32 bit indexing
nll_loss / nll_loss_2d
tensor split
pooling same mode
cudnn_is_acceptable
storage nbytes

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87604
Approved by: https://github.com/ezyang
---
 aten/src/ATen/ExpandUtils.h                   |  5 +-
 aten/src/ATen/core/TensorBase.h               | 10 ++++
 .../ATen/functorch/BatchRulesConvolution.cpp  | 56 +++++++++----------
 .../functorch/BatchRulesDecompositions.cpp    |  2 +-
 .../ATen/native/AdaptiveAveragePooling.cpp    |  6 +-
 aten/src/ATen/native/Convolution.cpp          | 20 +++----
 aten/src/ATen/native/EmbeddingBag.cpp         |  2 +-
 aten/src/ATen/native/GridSamplerUtils.h       |  2 +-
 aten/src/ATen/native/IndexingUtils.cpp        | 12 ++--
 aten/src/ATen/native/LossNLL.cpp              |  2 +-
 aten/src/ATen/native/LossNLL2d.cpp            |  2 +-
 aten/src/ATen/native/NonSymbolicBC.h          |  1 +
 aten/src/ATen/native/Pool.h                   | 18 ++++--
 aten/src/ATen/native/TensorProperties.cpp     |  2 +-
 aten/src/ATen/native/TensorShape.cpp          | 28 +++++++---
 aten/src/ATen/native/group_norm.cpp           | 24 ++++----
 aten/src/ATen/native/native_functions.yaml    | 54 +++++++++---------
 test/functorch/test_aotdispatch.py            |  3 -
 test/test_proxy_tensor.py                     |  4 --
 tools/autograd/derivatives.yaml               | 44 +++++++--------
 tools/jit/gen_unboxing.py                     |  4 +-
 torch/csrc/StorageMethods.cpp                 |  2 +-
 torch/csrc/autograd/FunctionsManual.cpp       | 16 +++---
 torch/csrc/autograd/FunctionsManual.h         |  8 +--
 torch/storage.py                              |  4 +-
 torchgen/api/cpp.py                           |  8 ++-
 torchgen/api/native.py                        |  2 +-
 torchgen/api/python.py                        |  6 +-
 torchgen/gen.py                               |  4 +-
 29 files changed, 197 insertions(+), 154 deletions(-)

diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h
index 779894645b8ec..786cbf132cd77 100644
--- a/aten/src/ATen/ExpandUtils.h
+++ b/aten/src/ATen/ExpandUtils.h
@@ -94,10 +94,11 @@ inline void check_defined(
 inline c10::MaybeOwned<Tensor> expand_inplace(
     const Tensor& tensor,
     const Tensor& to_expand) {
-  if (tensor.sizes().equals(to_expand.sizes())) {
+  if (tensor.sym_sizes().equals(to_expand.sym_sizes())) {
     return c10::MaybeOwned<Tensor>::borrowed(to_expand);
   }
-  return c10::MaybeOwned<Tensor>::owned(to_expand.expand(tensor.sizes()));
+  return c10::MaybeOwned<Tensor>::owned(
+      to_expand.expand_symint(tensor.sym_sizes()));
 }
 
 inline c10::MaybeOwned<Tensor> expand_inplace(
diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h
index 08a14f2e09580..0ecd4456033b0 100644
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@@ -955,11 +955,21 @@ c10::SymIntArrayRef sizes(const TensorBase& t) { return t.sym_sizes(); }
 template <typename T, typename = enable_if_int<T>>
 IntArrayRef sizes(const TensorBase& t) { return t.sizes(); }
 
+template <typename T, typename = enable_if_symint<T>>
+c10::SymInt size(const TensorBase& t, int64_t dim) { return t.sym_size(dim); }
+template <typename T, typename = enable_if_int<T>>
+int64_t size(const TensorBase& t, int64_t dim) { return t.size(dim); }
+
 template <typename T, typename = enable_if_symint<T>>
 c10::SymIntArrayRef strides(const TensorBase& t) { return t.sym_strides(); }
 template <typename T, typename = enable_if_int<T>>
 IntArrayRef strides(const TensorBase& t) { return t.strides(); }
 
+template <typename T, typename = enable_if_symint<T>>
+c10::SymInt numel(const TensorBase& t) { return t.sym_numel(); }
+template <typename T, typename = enable_if_int<T>>
+int64_t numel(const TensorBase& t) { return t.numel(); }
+
 } // namespace symint
 
 } // namespace at
diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp
index 0640af3a1b533..79523ed1fb6d9 100644
--- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp
+++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp
@@ -17,7 +17,7 @@ namespace at { namespace functorch {
 // we do not support batch_group_count (which is needed for convolution backwards).
 // Instead, there's a convolution_backward op that needs a batching rule.
 std::tuple<Tensor,optional<int64_t>>
-convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tensor& rhs, optional<int64_t> rhs_bdim, const optional<Tensor>& bias, optional<int64_t> bias_bdim, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) {
+convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tensor& rhs, optional<int64_t> rhs_bdim, const optional<Tensor>& bias, optional<int64_t> bias_bdim, IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, int64_t groups) {
   DimVector lhs_spec(stride.size() + 2);
   std::iota(lhs_spec.begin(), lhs_spec.end(), 0);
   DimVector rhs_spec = lhs_spec;
@@ -42,13 +42,13 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
   std::tuple<Tensor, optional<int64_t>> result;
   if (lhs_bdim && !rhs_bdim) {
     auto new_x = reshape_dim_into(*lhs_bdim, lhs_spec[0], lhs);
-    auto out = at::convolution(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+    auto out = at::convolution_symint(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
     out = reshape_dim_outof(out_spec[0], lhs.sizes()[*lhs_bdim], out);
     result = std::make_tuple(out, out_spec[0]);
   } else if (!lhs_bdim && rhs_bdim) {
     if (groups == 1) {
       auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[0], rhs);
-      auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+      auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
       out = reshape_dim_outof(out_spec[1], rhs.size(*rhs_bdim), out);
       result = std::make_tuple(out, out_spec[1]);
     } else {
@@ -62,7 +62,7 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
         // BIOHW -> I(BO)HW
         auto new_w = reshape_dim_into(*rhs_bdim, 1, rhs);
         // NIHW, I(BO)HW -> N(GBO)HW
-        auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+        auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
         // N(GBO)HW -> NG(BO)HW
         out = reshape_dim_outof(1, groups, out);
         // NG(BO)HW -> NGBOHW
@@ -84,7 +84,7 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
         // G(BO)IHW -> (GBO)IHW
         new_w = reshape_dim_into(0, 0, new_w);
         // N(GI)HW, (GBO)IHW -> N(GBO)HW
-        auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+        auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
         // N(GBO)HW -> NG(BO)HW
         out = reshape_dim_outof(1, groups, out);
         // NG(BO)HW -> NGBOHW
@@ -99,11 +99,11 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
     groups *= lhs.sizes()[*lhs_bdim];
     auto dim_with_groups = transposed ? 1 : 0;
     auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[dim_with_groups], rhs);
-    auto out = at::convolution(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+    auto out = at::convolution_symint(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
     out = reshape_dim_outof(out_spec[1], lhs.sizes()[*lhs_bdim], out);
     result = std::make_tuple(out, out_spec[1]);
   } else {
-    result = std::make_tuple(at::convolution(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt);
+    result = std::make_tuple(at::convolution_symint(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt);
   }
   if (separate_bias) {
     auto A = std::get<0>(result);
@@ -244,8 +244,8 @@ convolution_backward_input_batch_rule(
     const Tensor& grad_output, optional<int64_t> grad_output_bdim,
     const Tensor& input, optional<int64_t> input_bdim,
     const Tensor& weight, optional<int64_t> weight_bdim,
-    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed,
-    IntArrayRef output_padding, int64_t groups) {
+    IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
+    c10::SymIntArrayRef output_padding, int64_t groups) {
   const std::array<bool, 3> mask = {true, false, false};
   if (grad_output_bdim && weight_bdim) {
     // regular: BNO, BOI -> N(BO), (BO)I -> N(BI)
@@ -254,7 +254,7 @@ convolution_backward_input_batch_rule(
     const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
     const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight);
     auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output_, dummy_input, weight_, nullopt, stride, padding,
         dilation, transposed, output_padding, groups * batch_size, mask);
     const auto grad_input = reshape_dim_outof(1, batch_size, std::get<0>(result));
@@ -265,7 +265,7 @@ convolution_backward_input_batch_rule(
     const auto batch_size = grad_output.size(*grad_output_bdim);
     const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 0, grad_output);
     auto dummy_input = make_dummy(input, input_bdim, 0, batch_size);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output_, dummy_input, weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
     const auto grad_input = reshape_dim_outof(0, batch_size, std::get<0>(result));
@@ -278,7 +278,7 @@ convolution_backward_input_batch_rule(
       const auto in_ch_dim = transposed ? 0 : 1;
       const auto weight_ = reshape_dim_into(*weight_bdim, in_ch_dim, weight);
       auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output, dummy_input, weight_, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       const auto grad_input = reshape_dim_outof(1, batch_size, std::get<0>(result));
@@ -289,7 +289,7 @@ convolution_backward_input_batch_rule(
       // N(GO), B(GO)I -> N(GO), (GO)(BI) -> N(GBI)
       const auto weight_ = reshape_dim_into(*weight_bdim, 1, weight);
       auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output, dummy_input, weight_, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       grad_input = std::get<0>(result); // N(GBI)
@@ -300,7 +300,7 @@ convolution_backward_input_batch_rule(
       weight_ = weight_.transpose(0, 1);                       // GBIO
       weight_ = weight_.flatten(0, 2);                         // (GBI)O
       const auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output, dummy_input, weight_, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       grad_input = std::get<0>(result); // N(GBI)
@@ -314,7 +314,7 @@ convolution_backward_input_batch_rule(
   } else {
     TORCH_INTERNAL_ASSERT(input_bdim);
     const auto dummy_input = make_dummy(input, input_bdim, 0, 1);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output, dummy_input, weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
     return std::make_tuple(std::get<0>(result), nullopt);
@@ -325,8 +325,8 @@ convolution_backward_weight_batch_rule(
     const Tensor& grad_output, optional<int64_t> grad_output_bdim,
     const Tensor& input, optional<int64_t> input_bdim,
     const Tensor& weight, optional<int64_t> weight_bdim,
-    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed,
-    IntArrayRef output_padding, int64_t groups) {
+    IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
+    c10::SymIntArrayRef output_padding, int64_t groups) {
   const std::array<bool, 3> mask = {false, true, false};
   if (grad_output_bdim && input_bdim) {
     // BNO, BNI -> N(BO), N(BI) -> (BO)I (regular) (BI)O (transposed)
@@ -334,7 +334,7 @@ convolution_backward_weight_batch_rule(
     const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
     const auto input_ = reshape_dim_into(*input_bdim, 1, input);
     const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output_, input_, dummy_weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups * batch_size, mask);
     auto grad_weight = std::get<1>(result);
@@ -348,7 +348,7 @@ convolution_backward_weight_batch_rule(
       const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
       const auto out_ch_dim = transposed ? 1 : 0;
       const auto dummy_weight = make_dummy(weight, weight_bdim, out_ch_dim, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output_, input, dummy_weight, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       auto grad_weight = std::get<1>(result);
@@ -362,7 +362,7 @@ convolution_backward_weight_batch_rule(
       if (!transposed) {
         // BN(GO), N(GI) -> N(GBO), N(GI) -> (GBO)I
         const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-        const auto result = at::convolution_backward(
+        const auto result = at::convolution_backward_symint(
             grad_output_, input, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -373,7 +373,7 @@ convolution_backward_weight_batch_rule(
       } else {
         // BN(GO), N(GI) -> N(GBO), N(GI) -> (GI)(BO)
         const auto dummy_weight = make_dummy(weight, weight_bdim, 1, batch_size);
-        const auto result = at::convolution_backward(
+        const auto result = at::convolution_backward_symint(
             grad_output_, input, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -389,7 +389,7 @@ convolution_backward_weight_batch_rule(
       const auto input_ = reshape_dim_into(*input_bdim, 1, input);
       const auto in_ch_dim = transposed ? 0 : 1;
       const auto dummy_weight = make_dummy(weight, weight_bdim, in_ch_dim, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output, input_, dummy_weight, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       auto grad_weight = std::get<1>(result);
@@ -403,7 +403,7 @@ convolution_backward_weight_batch_rule(
       if (!transposed) {
         // regular: N(GO), BN(GI) -> N(GO), N(GBI) -> (GO)(BI)
         const auto dummy_weight = make_dummy(weight, weight_bdim, 1, batch_size);
-        const auto result = at::convolution_backward(
+        const auto result = at::convolution_backward_symint(
             grad_output, input_, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -412,7 +412,7 @@ convolution_backward_weight_batch_rule(
       } else {
         // transposed: N(GO), BN(GI) -> N(GO), N(GBI) -> (GBI)O
         const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-        const auto result = at::convolution_backward(
+        const auto result = at::convolution_backward_symint(
             grad_output, input_, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -425,7 +425,7 @@ convolution_backward_weight_batch_rule(
   } else {
     TORCH_INTERNAL_ASSERT(weight_bdim);
     const auto dummy_weight = make_dummy(weight, weight_bdim, 0, 1);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output, input, dummy_weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
     return std::make_tuple(std::get<1>(result), nullopt);
@@ -436,8 +436,8 @@ convolution_backward_weight_batch_rule(
 std::tuple<Tensor,Tensor,Tensor> convolution_backward_plumbing(
     const Tensor& grad_output_, const Tensor& input_, const Tensor& weight_,
     const c10::OptionalArrayRef<SymInt> bias_sizes_opt,
-    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed,
-    IntArrayRef output_padding, int64_t groups, std::array<bool, 3> output_mask) {
+    IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
+    c10::SymIntArrayRef output_padding, int64_t groups, std::array<bool, 3> output_mask) {
   const auto maybe_layer = maybeCurrentDynamicLayer();
   TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
   int64_t cur_level = maybe_layer->layerId();
@@ -487,7 +487,7 @@ std::tuple<Tensor,Tensor,Tensor> convolution_backward_plumbing(
     const auto batch_size = weight.size(*weight_bdim);
     input = reshape_dim_into(*input_bdim, 1, input);
     weight = reshape_dim_into(*weight_bdim, 0, weight);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output, input, weight, nullopt, stride, padding, dilation,
         transposed, output_padding, batch_size * groups, output_mask);
     // N(BI), (BO)I -> NBI, BOI
diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index f1108bac25a0a..24a1c4ab507a0 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -242,7 +242,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE2(where, ScalarSelf);
   OP_DECOMPOSE(orgqr);
   OP_DECOMPOSE2(unflatten, int);
-  OP_DECOMPOSE(_convolution_double_backward);
+  m.impl("_convolution_double_backward", native::_convolution_double_backward);
   OP_DECOMPOSE(conv_transpose1d);
   OP_DECOMPOSE2(conv_transpose2d, input);
   OP_DECOMPOSE2(conv_transpose3d, input);
diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp
index 40b05d74053ca..b612ef009b651 100644
--- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp
+++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp
@@ -130,9 +130,9 @@ namespace {
       Tensor out = input.mean({-1, -2}, /* keepdim = */ true);
       if (input.suggest_memory_format() == at::MemoryFormat::ChannelsLast) {
         // assert ndim == 4, since ndim = 3 doesn't give channels_last
-        const int n = input.size(0);
-        const int c = input.size(1);
-        out.as_strided_({n, c, 1, 1}, {c, 1, c, c});
+        const auto n = input.sym_size(0);
+        const auto c = input.sym_size(1);
+        out.as_strided__symint({n, c, 1, 1}, {c, 1, c, c});
       }
       return out;
     } else {
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 4d68f23c0734f..64f6d141b9299 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -910,8 +910,8 @@ static Tensor convolution_same(
   auto k = weight.dim();
   TORCH_CHECK(k > 2, "weight should have at least three dimensions");
   auto dim = static_cast<size_t>(k - 2);
-  auto weight_sizes = weight.sizes();
-  auto input_sizes = input.sizes();
+  auto weight_sizes = weight.sym_sizes();
+  auto input_sizes = input.sym_sizes();
   TORCH_CHECK(k == input.dim(),
               "Expected ", k, "-dimensional input for ",
               k, "-dimensional weight", weight_sizes, ", but got ",
@@ -926,7 +926,7 @@ static Tensor convolution_same(
   }
 
   // Calculate the correct padding
-  DimVector padding_l, padding_r;
+  SymDimVector padding_l, padding_r;
   bool symmetric_padding = true;
   for (auto i: c10::irange(dim)) {
     auto s = stride.size() == 1 ? stride[0] : stride[i];
@@ -942,14 +942,14 @@ static Tensor convolution_same(
 
   if (symmetric_padding) {
     // All backends handle symmetric padding natively
-    DimVector output_padding(static_cast<size_t>(dim));
-    return at::convolution(input, weight, bias, stride, padding_l, dilation,
+    SymDimVector output_padding(static_cast<size_t>(dim));
+    return at::convolution_symint(input, weight, bias, stride, padding_l, dilation,
                                false, output_padding, groups);
   }
 
   TORCH_WARN_ONCE("Using padding='same' with even kernel lengths and odd dilation may"
                   " require a zero-padded copy of the input be created");
-  SmallVector<int64_t, kDimVectorStaticSize * 2> pad_nd(static_cast<size_t>(2 * dim));
+  SmallVector<c10::SymInt, kDimVectorStaticSize * 2> pad_nd(static_cast<size_t>(2 * dim));
   for (auto i: c10::irange(dim)) {
     // Apply padding by the difference, leaving only a symmetric padding
     auto delta_pad = padding_r[i] - padding_l[i];
@@ -961,10 +961,10 @@ static Tensor convolution_same(
       padding_l[i] = padding_r[i];
     }
   }
-  auto padded_input = at::constant_pad_nd(input, pad_nd, 0);
-  DimVector output_padding(static_cast<size_t>(dim));
-  return at::convolution(padded_input, weight, bias, stride, padding_l,
-                         dilation, false, output_padding, groups);
+  auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0);
+  SymDimVector output_padding(static_cast<size_t>(dim));
+  return at::convolution_symint(padded_input, weight, bias, stride, padding_l,
+                                dilation, false, output_padding, groups);
 }
 
 Tensor _convolution_mode(
diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp
index 7d4a89d6b40f7..21404947b3dbb 100644
--- a/aten/src/ATen/native/EmbeddingBag.cpp
+++ b/aten/src/ATen/native/EmbeddingBag.cpp
@@ -1307,7 +1307,7 @@ Tensor _embedding_bag_backward_symint(const Tensor &grad, const Tensor &indices_
   checkContiguous("embedding_bag", offsets_arg);
 
   Tensor offset2bag_;
-  if (indices.numel() != 0 && offset2bag.numel() == 0) {
+  if (indices.sym_numel() != 0 && offset2bag.sym_numel() == 0) {
     offset2bag_ = offsets.new_zeros(
       {indices.size(0) + 1}, offsets.options()); // offset2bag = [0 0 0 0 0]
 
diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h
index 0b6f29de8c427..7c22fedfe94e2 100644
--- a/aten/src/ATen/native/GridSamplerUtils.h
+++ b/aten/src/ATen/native/GridSamplerUtils.h
@@ -101,7 +101,7 @@ bool cond_cudnn_grid_sampler(
     at::native::canUse32BitIndexMath(input) &&
     at::native::canUse32BitIndexMath(grid) &&
     input.dim() == 4 &&
-    input.size(1) <= 1024);
+    input.sym_size(1) <= 1024);
 }
 
 } // anonymous namespace
diff --git a/aten/src/ATen/native/IndexingUtils.cpp b/aten/src/ATen/native/IndexingUtils.cpp
index c5f5ff6fbcc07..2dba1972ce574 100644
--- a/aten/src/ATen/native/IndexingUtils.cpp
+++ b/aten/src/ATen/native/IndexingUtils.cpp
@@ -4,7 +4,7 @@
 namespace at { namespace native {
 
 bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
-  int64_t elements = t.numel();
+  auto elements = t.sym_numel();
   if (elements >= max_elem) {
     return false;
   }
@@ -12,16 +12,16 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
     return max_elem > 0;
   }
 
-  int64_t offset = 0;
-  int64_t linearId = elements - 1;
+  c10::SymInt offset = 0;
+  auto linearId = elements - 1;
 
   // NOTE: Assumes all strides are positive, which is true for now
   // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   for (int i = t.dim() - 1; i >= 0; --i) {
-    int64_t curDimIndex = linearId % t.size(i);
-    int64_t curDimOffset = curDimIndex * t.stride(i);
+    auto curDimIndex = linearId % t.sym_size(i);
+    auto curDimOffset = curDimIndex * t.sym_stride(i);
     offset += curDimOffset;
-    linearId /= t.size(i);
+    linearId /= t.sym_size(i);
   }
 
   if (offset >= max_elem) {
diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp
index 8e5864b68728d..28fc60508ab10 100644
--- a/aten/src/ATen/native/LossNLL.cpp
+++ b/aten/src/ATen/native/LossNLL.cpp
@@ -656,7 +656,7 @@ Tensor nll_loss(const Tensor & self, const Tensor & target, const c10::optional<
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
 
-  return std::get<0>(at::nll_loss_forward(self, target, weight, reduction, ignore_index));
+  return std::get<0>(at::nll_loss_forward_symint(self, target, weight, reduction, ignore_index));
 }
 
 Tensor nll_loss_nd_symint(
diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp
index ab7c084eb80df..aee22ce3edeb5 100644
--- a/aten/src/ATen/native/LossNLL2d.cpp
+++ b/aten/src/ATen/native/LossNLL2d.cpp
@@ -498,7 +498,7 @@ Tensor nll_loss2d(const Tensor & self, const Tensor & target, const c10::optiona
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
 
-  return std::get<0>(at::nll_loss2d_forward(self, target, weight, reduction, ignore_index));
+  return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight, reduction, ignore_index));
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/NonSymbolicBC.h b/aten/src/ATen/native/NonSymbolicBC.h
index e7d31ae3fa020..f57c868f345f1 100644
--- a/aten/src/ATen/native/NonSymbolicBC.h
+++ b/aten/src/ATen/native/NonSymbolicBC.h
@@ -22,4 +22,5 @@ TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, con
 TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim);
 TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes);
 TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index);
+TORCH_API std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim);
 }}
diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h
index cf5b45b365d05..0ff4490086b7e 100644
--- a/aten/src/ATen/native/Pool.h
+++ b/aten/src/ATen/native/Pool.h
@@ -67,17 +67,18 @@ static inline T pooling_output_shape(
         inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
 }
 
-inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
-    int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) {
+template <typename T>
+std::pair<T, T> _pooling_same_mode_padding_lr(
+    T inputSize, T kernelSize, int64_t stride, int64_t dilation) {
   // NOTE: with strides, the output shape is ceil(inputSize/stride)
-  auto total_padding = dilation * (kernelSize - 1);
+  auto total_padding = T(dilation) * (kernelSize - 1);
 
   // Prefer symmetric padding if possible
   if (stride > 2 && (total_padding % 2 == 1)) {
     // The floor in the output size calculation gives us a little wiggle room
     auto wiggle_room = inputSize % stride - 1;
     if (wiggle_room > 0) {
-      --total_padding;
+      total_padding = total_padding - 1;
     }
   }
 
@@ -85,6 +86,15 @@ inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
   return {left, total_padding - left};
 }
 
+inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
+    int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) {
+  return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation);
+}
+
+inline std::pair<c10::SymInt, c10::SymInt> pooling_same_mode_padding_lr(
+    c10::SymInt inputSize, c10::SymInt kernelSize, int64_t stride, int64_t dilation) {
+  return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation);
+}
 
 // AveragePool2d/DilatedMaxPool2d (forward)
 static inline void
diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp
index 6a703cbe07f90..e37dbf56cc81a 100644
--- a/aten/src/ATen/native/TensorProperties.cpp
+++ b/aten/src/ATen/native/TensorProperties.cpp
@@ -69,7 +69,7 @@ bool cudnn_is_acceptable(const TensorBase& self) {
   // tensors. Maybe some cuDNN functions actually support empty tensors, but
   // native/THNN kernels shouldn't be much slower because the output is also
   // likely empty.
-  if (self.numel() == 0) return false;
+  if (self.sym_numel() == 0) return false;
   // NB: In the old Python code, there was also a test to see if the
   // cuDNN library was actually dynamically linked or not.  I'm not
   // sure if we can actually test this.
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index d25113577b2d5..2051cda371b97 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -917,9 +917,12 @@ std::vector<Tensor> chunk(const Tensor& self, int64_t chunks, int64_t dim) {
   }
 }
 
-std::vector<Tensor> tensor_split(const Tensor& self, int64_t sections, int64_t dim) {
+std::vector<Tensor> tensor_split_sections_symint(const Tensor& self, c10::SymInt sym_sections, int64_t dim) {
   TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
   int64_t dim_ = maybe_wrap_dim(dim, self.dim());
+  // NB: intentional, sections specifies number of output tensors, which
+  // cannot be polymorphic
+  int64_t sections = sym_sections.guard_int(__FILE__, __LINE__);
   TORCH_CHECK(sections > 0, "number of sections must be larger than 0, got ", sections);
   const auto dim_size = self.sym_size(dim_);
   std::vector<Tensor> splits(sections);
@@ -934,21 +937,30 @@ std::vector<Tensor> tensor_split(const Tensor& self, int64_t sections, int64_t d
   return splits;
 }
 
-std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim) {
+template <typename T>
+std::vector<Tensor> _tensor_split_indices(const Tensor& self, ArrayRef<T> indices, int64_t dim) {
   TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
   int64_t dim_ = maybe_wrap_dim(dim, self.dim());
   int64_t num_indices = indices.size();
   std::vector<Tensor> splits(num_indices + 1);
-  int64_t start_idx = 0;
+  T start_idx(0);
   for (const auto split_idx : c10::irange(num_indices)) {
-    int64_t end_idx = indices[split_idx];
-    splits[split_idx] = at::slice(self, dim_, start_idx, end_idx);
+    auto end_idx = indices[split_idx];
+    splits[split_idx] = at::symint::slice<T>(self, dim_, start_idx, end_idx);
     start_idx = end_idx;
   }
-  splits[num_indices] = at::slice(self, dim_, start_idx, self.size(dim_));
+  splits[num_indices] = at::symint::slice<T>(self, dim_, start_idx, at::symint::size<T>(self, dim_));
   return splits;
 }
 
+std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim) {
+  return _tensor_split_indices(self, indices, dim);
+}
+
+std::vector<Tensor> tensor_split_indices_symint(const Tensor& self, SymIntArrayRef indices, int64_t dim) {
+  return _tensor_split_indices(self, indices, dim);
+}
+
 std::vector<Tensor> tensor_split(const Tensor& self, const Tensor& tensor_indices_or_sections, int64_t dim) {
   TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
   auto split_device = tensor_indices_or_sections.device();
@@ -1174,8 +1186,8 @@ Tensor as_strided_qtensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef
   return result;
 }
 
-const Tensor &as_strided_(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional<int64_t> storage_offset_) {
-  auto storage_offset = storage_offset_.value_or(self.storage_offset());
+const Tensor &as_strided__symint(const Tensor& self, SymIntArrayRef size, SymIntArrayRef stride, optional<c10::SymInt> storage_offset_) {
+  auto storage_offset = storage_offset_.value_or(self.sym_storage_offset());
   setStrided(self, size, stride, storage_offset);
   return self;
 }
diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp
index 5b38b02702828..c12d8d2142ff9 100644
--- a/aten/src/ATen/native/group_norm.cpp
+++ b/aten/src/ATen/native/group_norm.cpp
@@ -23,13 +23,15 @@
 #include <vector>
 
 namespace at {
+
 namespace native {
 
+template <typename T>
 void check_group_norm_inputs(
     const Tensor& input,
     const Tensor& weight,
     const Tensor& bias,
-    int64_t C,
+    T C,
     int64_t num_groups) {
   TORCH_CHECK(
       num_groups > 0,
@@ -43,14 +45,14 @@ void check_group_norm_inputs(
       "num_groups=",
       num_groups);
   TORCH_CHECK(
-      !weight.defined() || (weight.dim() == 1 && weight.numel() == C),
+      !weight.defined() || (weight.dim() == 1 && at::symint::numel<T>(weight) == C),
       "Expected weight to be a vector of size equal to the number of ",
       "channels in input, but got weight of shape ",
       weight.sizes(),
       " and input of shape ",
       input.sizes());
   TORCH_CHECK(
-      !bias.defined() || (bias.dim() == 1 && bias.numel() == C),
+      !bias.defined() || (bias.dim() == 1 && at::symint::numel<T>(bias) == C),
       "Expected bias to be a vector of size equal to the number of ",
       "channels in input, but got bias of shape ",
       weight.sizes(),
@@ -171,13 +173,13 @@ Tensor group_norm(
   const Tensor& weight = *weight_maybe_owned;
   const Tensor& bias = c10::value_or_else(bias_opt, [] { return Tensor(); });
 
-  const int64_t N = input.size(0);
-  const int64_t C = input.size(1);
+  const auto N = input.sym_size(0);
+  const auto C = input.sym_size(1);
   check_group_norm_inputs(input, weight, bias, C, num_groups);
 
-  const auto input_shape = input.sizes();
-  const int64_t HxW =
-      c10::multiply_integers(input_shape.cbegin() + 2, input_shape.cend());
+  const auto input_shape = input.sym_sizes();
+  const auto HxW =
+      c10::multiply_integers(input_shape.slice(2));
 
   const Tensor kEmpty;
   auto memory_format = input.suggest_memory_format();
@@ -185,10 +187,10 @@ Tensor group_norm(
       input.contiguous(memory_format) : input.contiguous();
   const auto& gamma = weight.defined() ? weight.contiguous() : kEmpty;
   const auto& beta = bias.defined() ? bias.contiguous() : kEmpty;
-  TORCH_CHECK(!gamma.defined() || gamma.numel() == C);
-  TORCH_CHECK(!beta.defined() || beta.numel() == C);
+  TORCH_CHECK(!gamma.defined() || gamma.sym_numel() == C);
+  TORCH_CHECK(!beta.defined() || beta.sym_numel() == C);
   return std::get<0>(
-      at::native_group_norm(X, gamma, beta, N, C, HxW, num_groups, eps));
+      at::native_group_norm_symint(X, gamma, beta, N, C, HxW, num_groups, eps));
 }
 
 DEFINE_DISPATCH(GroupNormKernel);
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 69951d7b2fabf..2922e2be825b0 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -815,7 +815,7 @@
   device_guard: False
   tags: inplace_view
   dispatch:
-    CompositeExplicitAutogradNonFunctional: as_strided_
+    CompositeExplicitAutogradNonFunctional: as_strided__symint
 
 - func: asin(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -1294,11 +1294,15 @@
     CompositeImplicitAutograd: chunk
     NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor
 
-- func: tensor_split.sections(Tensor(a -> *) self, int sections, int dim=0) -> Tensor(a)[]
+- func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
   variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: tensor_split_sections_symint
 
-- func: tensor_split.indices(Tensor(a -> *) self, int[] indices, int dim=0) -> Tensor(a)[]
+- func: tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[]
   variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: tensor_split_indices_symint
 
 - func: tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[]
   variants: function, method
@@ -1465,13 +1469,13 @@
   variants: method
   manual_cpp_binding: True
 
-- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor
+- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor
   dispatch:
     CompositeExplicitAutograd: convolution
   autogen: convolution.out
   tags: canonical
 
-- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   dispatch:
     CompositeExplicitAutograd, CUDA: convolution_backward
   autogen: convolution_backward.out
@@ -1487,7 +1491,7 @@
     CompositeExplicitAutograd: convolution_backward_overrideable
   autogen: convolution_backward_overrideable.out
 
-- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
+- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
   dispatch:
     CompositeExplicitAutograd: _convolution
   autogen: _convolution.out
@@ -1496,7 +1500,7 @@
 
 - func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor
 
-- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
 
 - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor
 
@@ -3561,7 +3565,7 @@
     MPS: mps_convolution_backward
   autogen: mps_convolution_backward.out
 
-- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
+- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor
   dispatch:
     CompositeExplicitAutograd: mkldnn_convolution
   autogen: mkldnn_convolution.out
@@ -3576,17 +3580,17 @@
     CUDA: miopen_batch_norm_backward
   autogen: miopen_batch_norm_backward.out
 
-- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   dispatch:
     CUDA: miopen_convolution
   autogen: miopen_convolution.out
 
-- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   dispatch:
     CUDA: miopen_convolution_transpose
   autogen: miopen_convolution_transpose.out
 
-- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   dispatch:
     CUDA: miopen_depthwise_convolution
   autogen: miopen_depthwise_convolution.out
@@ -3840,7 +3844,7 @@
 
 - func: _nnpack_available() -> bool
 
-- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor
+- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutograd: _nnpack_spatial_convolution
@@ -11470,24 +11474,24 @@
 # these are the same thing, but we give them different prefixes to
 # make the operational distinction clear.
 
-- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   structured: True
   dispatch:
     CPU: slow_conv_transpose2d_structured_cpu
     CUDA: slow_conv_transpose2d_structured_cuda
 
-- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor
+- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor
   python_module: nn
   structured_delegate: slow_conv_transpose2d.out
 
-- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: slow_conv_transpose3d_out_cpu
     CUDA: slow_conv_transpose3d_out_cuda
 
-- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor
+- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv_transpose3d_cpu
@@ -11524,47 +11528,47 @@
     CUDA: slow_conv2d_backward_cuda
   autogen: _slow_conv2d_backward.output_mask_out
 
-- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
+- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
   use_const_ref_for_mutable_tensors: True
   python_module: nn
   dispatch:
     CUDA: conv_depthwise2d_cuda_out
 
-- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor
+- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor
   python_module: nn
   dispatch:
     CUDA: conv_depthwise2d_cuda
 
-- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, int[3] dilation) -> Tensor
+- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor
   python_module: nn
   dispatch:
     CUDA: conv_depthwise3d_cuda
   autogen: conv_depthwise3d.out
 
-- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
 
-- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0) -> Tensor
+- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0) -> Tensor
   python_module: nn
 
-- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, *, Tensor(a!) output) -> Tensor(a!)
+- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: slow_conv3d_forward_out_cpu
 
-- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> Tensor
+- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv3d_forward_cpu
 
-- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor
+- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv_dilated2d_cpu
     CUDA: slow_conv_dilated2d_cuda
   autogen: slow_conv_dilated2d.out
 
-- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor
+- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv_dilated3d_cpu
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index d406f2eb53047..15e0e6a43c3b8 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1128,8 +1128,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     skip('nn.functional.batch_norm', ''),  # '0 is not tracked with proxy for <torch.fx.experimental.proxy_te..
     xfail('nn.functional.bilinear', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.binary_cross_entropy', ''),  # aten.fill_.Scalar - couldn't find symbolic meta funct...
-    xfail('nn.functional.conv1d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.conv2d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.cosine_embedding_loss', ''),  # Cannot call sizes() on tensor with symbolic sizes/st...
     xfail('nn.functional.cosine_similarity', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.cross_entropy', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1230,7 +1228,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('trapz', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('triangular_solve', ''),  # aten.triangular_solve.default - couldn't find symbolic meta function/de...
-    xfail('unbind', ''),  # tensor_split() received an invalid combination of arguments - got (FakeTensor, torch...
     xfail('unflatten', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('var', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('var_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 1e72d5a4bc277..fae55367ab192 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1248,8 +1248,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.avg_pool3d', ''),  # aten.avg_pool3d.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.bilinear', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.binary_cross_entropy', ''),  # aten.new_empty.default - couldn't find symbolic meta function/decom...
-    xfail('nn.functional.conv1d', ''),  # aten.convolution.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.conv2d', ''),  # aten.convolution.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.cosine_embedding_loss', ''),  # The underlying op of 'aten.stride' has no overload name '_schema'
     xfail('nn.functional.cosine_similarity', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.cross_entropy', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
@@ -1262,7 +1260,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.fractional_max_pool2d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.fractional_max_pool3d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.grid_sample', ''),  # aten.grid_sampler_2d.default - couldn't find symbolic meta function/decompos...
-    xfail('nn.functional.group_norm', ''),  # 'torch._C.SymIntNode' and 'int'
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.empty_like.default - couldn't find symbolic meta function/deco...
     xfail('nn.functional.interpolate', 'area'),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.interpolate', 'bicubic'),  # aten.upsample_bicubic2d.vec - couldn't find symbolic meta function/d...
@@ -1355,7 +1352,6 @@ def f(a, b, c, d, e):
     xfail('view_as_complex', ''),  # aten.view_as_complex.default - couldn't find symbolic meta function/decomposition
     xfail('view_as', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('vsplit', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
-    xfail('unbind', ''),  # aten.unbind.int - couldn't find symbolic meta function/decomposition
     xfail('unique_consecutive', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
     xfail('unique', ''),  # aten._unique2.default - couldn't find symbolic meta function/decomposition
 }
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index c77f63e8c8e73..6945dae77a020 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2206,19 +2206,19 @@
   indices: non_differentiable
   result: auto_linear
 
-- name: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor
+- name: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor
   input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
   result: convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups)
 
 # TorchScript serializes calls to _convolution so this entry is present until that is changed to use convolution.
 # Note that the benchmark, deterministic, cudnn_enabled, and allow_tf32 flags are queried from the global context
 # by convolution_backward instead of being passed along from the forward pass.
-- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
+- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
   input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
   result: _convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32)
 
-- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
-  grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
+- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
   result0: std::get<0>(convolution_backward_symint(grad_output_p, input_p, weight_t, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) + std::get<0>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false}))
   result1: std::get<1>(convolution_backward_symint(grad_output_p, input_t, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) + std::get<1>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false}))
   result2: convolution_backward_jvp_grad_bias(grad_output_t, result2)
@@ -2229,10 +2229,10 @@
 - name: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
   grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
 
-- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor
+- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor
+- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor
@@ -2241,20 +2241,20 @@
 - name: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
   grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, {{1, 1}}, false, {{0, 0}}, 1, grad_input_mask)
 
-- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor
+- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, int[3] dilation) -> Tensor
+- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> Tensor
+- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, /*dilation=*/ {{1, 1, 1}}, false, /*output_padding=*/ {{0, 0, 0}}, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
   self: im2col(grad, kernel_size, dilation, padding, stride)
@@ -2608,9 +2608,9 @@
 
 # nnpack
 
-- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor
+- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor
   # NNPACK does not support strided convolutions in the backwards path, which is the reason why we are using the closest available function that does here.
-  input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector<int64_t>(padding.size(), 1), false, std::vector<int64_t>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+  input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector<int64_t>(padding.size(), 1), false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 #LSTM MPS
 - name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
@@ -2641,14 +2641,14 @@
 
 # miopen
 
-- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)
   input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple<Tensor, Tensor, Tensor>()"
@@ -2667,8 +2667,8 @@
   dropout_state: non_differentiable
 
 # mkldnn
-- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector<int64_t>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
   self, weight, bias: mkldnn_linear_backward(self, grad, weight, grad_input_mask)
diff --git a/tools/jit/gen_unboxing.py b/tools/jit/gen_unboxing.py
index ebeaa21bc7be9..79c594a9afa07 100644
--- a/tools/jit/gen_unboxing.py
+++ b/tools/jit/gen_unboxing.py
@@ -116,7 +116,9 @@ def __call__(self, f: NativeFunction) -> str:
                 # from wrapping/unwrapping TensorOptios.
                 # However, we would look to include default args for schema parsing.
                 # Default args only show up in the nonfaithful C++ API,
-                arg_default = cpp.default_expr(arg.argument.default, arg.argument.type)
+                arg_default = cpp.default_expr(
+                    arg.argument.default, arg.argument.type, symint=False
+                )
                 if arg_default.startswith("{"):
                     arg_cpp = f"c10::IntArrayRef({arg_default})"
                 else:
diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp
index 2b74c8a2fd290..29f0f67ce6ecb 100644
--- a/torch/csrc/StorageMethods.cpp
+++ b/torch/csrc/StorageMethods.cpp
@@ -41,7 +41,7 @@
 static PyObject* THPStorage_nbytes(PyObject* _self, PyObject* noargs) {
   HANDLE_TH_ERRORS
   auto self = (THPStorage*)_self;
-  return THPUtils_packUInt64(self->cdata->nbytes());
+  return py::cast(self->cdata->sym_nbytes()).release().ptr();
   END_HANDLE_TH_ERRORS
 }
 
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 86b893bb014e6..3358d96569598 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -1098,15 +1098,15 @@ Tensor convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    IntArrayRef padding,
+    at::SymIntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    IntArrayRef output_padding,
+    at::SymIntArrayRef output_padding,
     int64_t groups) {
   auto bias_t_opt =
       bias_t.defined() ? c10::optional<at::Tensor>(bias_t) : c10::nullopt;
   return (
-      at::convolution(
+      at::convolution_symint(
           input_t,
           weight_p,
           c10::nullopt,
@@ -1116,7 +1116,7 @@ Tensor convolution_jvp(
           transposed,
           output_padding,
           groups) +
-      at::convolution(
+      at::convolution_symint(
           input_p,
           weight_t,
           bias_t_opt,
@@ -1136,10 +1136,10 @@ Tensor _convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    IntArrayRef padding,
+    at::SymIntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    IntArrayRef output_padding,
+    at::SymIntArrayRef output_padding,
     int64_t groups,
     bool benchmark,
     bool deterministic,
@@ -1148,7 +1148,7 @@ Tensor _convolution_jvp(
   auto bias_t_opt =
       bias_t.defined() ? c10::optional<at::Tensor>(bias_t) : c10::nullopt;
   return (
-      at::_convolution(
+      at::_convolution_symint(
           input_t,
           weight_p,
           c10::nullopt,
@@ -1162,7 +1162,7 @@ Tensor _convolution_jvp(
           deterministic,
           cudnn_enabled,
           allow_tf32) +
-      at::_convolution(
+      at::_convolution_symint(
           input_p,
           weight_t,
           bias_t_opt,
diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h
index 04416c2b49e08..4da8aa074a534 100644
--- a/torch/csrc/autograd/FunctionsManual.h
+++ b/torch/csrc/autograd/FunctionsManual.h
@@ -937,10 +937,10 @@ Tensor convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    IntArrayRef padding,
+    at::SymIntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    IntArrayRef output_padding,
+    at::SymIntArrayRef output_padding,
     int64_t groups);
 
 Tensor _convolution_jvp(
@@ -951,10 +951,10 @@ Tensor _convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    IntArrayRef padding,
+    at::SymIntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    IntArrayRef output_padding,
+    at::SymIntArrayRef output_padding,
     int64_t groups,
     bool benchmark,
     bool deterministic,
diff --git a/torch/storage.py b/torch/storage.py
index 8e35973405b1b..6bfbab3733bc4 100644
--- a/torch/storage.py
+++ b/torch/storage.py
@@ -646,7 +646,9 @@ def device(self):
         return self._storage.device
 
     def size(self):
-        return len(self)
+        # NB: don't indirect through __len__, as that requires
+        # an int to be returned
+        return self.nbytes() // self.element_size()
 
     def pickle_storage_type(self):
         try:
diff --git a/torchgen/api/cpp.py b/torchgen/api/cpp.py
index c3b12d0336df0..4b00b5367b824 100644
--- a/torchgen/api/cpp.py
+++ b/torchgen/api/cpp.py
@@ -314,7 +314,7 @@ def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequenc
 }
 
 # Convert a JIT default into C++ expression representing the default
-def default_expr(d: str, t: Type) -> str:
+def default_expr(d: str, t: Type, *, symint: bool) -> str:
     if d == "None" and str(t) == "Tensor?":
         return "{}"
     if isinstance(t, BaseType) and t.name is BaseTy.str:
@@ -342,11 +342,13 @@ def default_expr(d: str, t: Type) -> str:
         if d == "None":
             return "c10::nullopt"
 
-        return default_expr(d, t.elem)
+        return default_expr(d, t.elem, symint=symint)
 
     if isinstance(t, ListType):
         if d.startswith("[") and d.endswith("]"):
             return "{" + d[1:-1] + "}"
+        elif symint and d.isdigit() and str(t.elem) == "SymInt":
+            return f"c10::SymInt({d})"
         elif t.size is None:
             # NOTE: Sized lists can have scalar defaults
             raise ValueError(f"Expected a list default '[...]' but found: '{d}'")
@@ -386,7 +388,7 @@ def sub_argument(
             binds = a.name
         default: Optional[str] = None
         if a.name not in cpp_no_default_args and a.default is not None:
-            default = default_expr(a.default, a.type)
+            default = default_expr(a.default, a.type, symint=symint)
         return [
             Binding(
                 nctype=argument_type(a, binds=binds, symint=symint),
diff --git a/torchgen/api/native.py b/torchgen/api/native.py
index b197a2a02983a..7f8b3eb3af2e7 100644
--- a/torchgen/api/native.py
+++ b/torchgen/api/native.py
@@ -95,7 +95,7 @@ def argument(
     if isinstance(a, Argument):
         default: Optional[str] = None
         if should_default and a.default is not None:
-            default = cpp.default_expr(a.default, a.type)
+            default = cpp.default_expr(a.default, a.type, symint=symint)
         return [
             Binding(
                 nctype=argument_type(a, binds=a.name, symint=symint),
diff --git a/torchgen/api/python.py b/torchgen/api/python.py
index 96c006b303eaa..728ee4c18c0a6 100644
--- a/torchgen/api/python.py
+++ b/torchgen/api/python.py
@@ -719,7 +719,9 @@ def argument(a: Argument) -> PythonArgument:
         name=a.name,
         type=a.type,
         # TODO: directly translate a.default to python default
-        default=str(pythonify_default(cpp.default_expr(a.default, a.type)))
+        default=str(
+            pythonify_default(cpp.default_expr(a.default, a.type, symint=False))
+        )
         if a.default is not None
         else None,
         default_init=None,
@@ -804,7 +806,7 @@ def topt_default_init(name: str) -> Optional[str]:
             a = getattr(topt_args, name)
             if a.default is None or a.default == "None":
                 return None
-            return cpp.default_expr(a.default, a.type)
+            return cpp.default_expr(a.default, a.type, symint=False)
 
         tensor_options_args.append(
             PythonArgument(
diff --git a/torchgen/gen.py b/torchgen/gen.py
index e53734969afda..79970c94610dd 100644
--- a/torchgen/gen.py
+++ b/torchgen/gen.py
@@ -1151,7 +1151,9 @@ def compute_argument_yaml(
         "type": cpp.argument_type(a, binds="__placeholder__", symint=False).cpp_type(),
     }
     if a.default is not None:
-        arg["default"] = pythonify_default(cpp.default_expr(a.default, a.type))
+        arg["default"] = pythonify_default(
+            cpp.default_expr(a.default, a.type, symint=False)
+        )
     if a.name in kwarg_only_set:
         arg["kwarg_only"] = True
     if a.name in out_arg_set:

From 84b2111191d95ffbe185dfc6efd508e0e3f4f962 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Wed, 26 Oct 2022 10:26:44 -0400
Subject: [PATCH 0181/1922] Remove custom Ceil in favor of sympy.ceiling
 (#87294)

[Alban]: the other changes that used to be in this PR (neg and fix for true div) are moved to other places where they already exist. Namely neg is already in master and true div will be in the next PR on the stack where all other functions are fixed at the same time.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87294
Approved by: https://github.com/ezyang
---
 torch/fx/experimental/symbolic_shapes.py | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 29a49b50ba29b..a7030abbcfc41 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -207,26 +207,6 @@ def eval(cls, base, divisor):
                     sympy.simplify(base / gcd), sympy.simplify(divisor / gcd)
                 )
 
-    class Ceil(sympy.Function):
-        """
-        sympy doesn't have its own ceil(), so rolling one here.
-        We maintain this so that we can simplify a sympy.Rational into a sympy.Float.
-        sympy.Float isn't supported.
-        """
-        nargs = (1,)
-
-        @classmethod
-        def eval(cls, a):
-            if isinstance(a, sympy.Integer):
-                return a
-            elif isinstance(a, sympy.core.symbol.Symbol) and a.is_scalar:
-                # TODO: do we need to simplify expr's first? (e.g. if we have 3/3), is is_scalar() true?
-                return a
-            elif isinstance(a, sympy.Rational):
-                return a.floor() + 1
-            else:
-                raise NotImplementedError("math.ceil() not supported for type: " + str(type(a)))
-
 # Methods that have a `__foo__` as well as `__rfoo__`
 reflectable_magic_methods = {
     'add': lambda a, b: a + b,
@@ -245,7 +225,7 @@ def eval(cls, a):
     'lt': lambda a, b: sympy.Lt(a, b),
     'le': lambda a, b: sympy.Le(a, b),
     'ge': lambda a, b: sympy.Ge(a, b),
-    'ceil': lambda a: Ceil(a),
+    'ceil': lambda a: sympy.ceiling(a),
     'neg': lambda a: -a,
     'min': lambda a, b: sympy.Min(a, b),
     'max': lambda a, b: sympy.Max(a, b),

From 03eca170b6b07c755cb6a6fa549cb888df8fd93c Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Wed, 26 Oct 2022 17:43:35 +0000
Subject: [PATCH 0182/1922] Install py for pytest-sugar (#87803)

linux-focal-py3.7-clang10-onnx / test is failng, the issue is https://github.com/Teemu/pytest-sugar/issues/241

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87803
Approved by: https://github.com/seemethere, https://github.com/huydhn
---
 .jenkins/caffe2/test.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh
index 0204907ee865d..2b6f7ec6b246a 100755
--- a/.jenkins/caffe2/test.sh
+++ b/.jenkins/caffe2/test.sh
@@ -149,6 +149,9 @@ export DNNL_MAX_CPU_ISA=AVX2
 
 # Should still run even in the absence of SHARD_NUMBER
 if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then
+  # TODO(sdym@meta.com) remove this when the linked issue resolved.
+  # py is temporary until https://github.com/Teemu/pytest-sugar/issues/241 is fixed
+  pip install --user py==1.11.0
   pip install --user pytest-sugar
   # NB: Warnings are disabled because they make it harder to see what
   # the actual erroring test is

From 823fff44b14ac11d915bda0fbcc52a2e6e58779c Mon Sep 17 00:00:00 2001
From: arnaudstiegler <arnaud.stiegler@gmail.com>
Date: Wed, 26 Oct 2022 17:45:46 +0000
Subject: [PATCH 0183/1922] Adding expm1 to MPS (#87147)

Fixes #86744

- Implementing the new `expm1_out_mps` function in `aten/src/ATen/native/mps/operations/UnaryOps.mm`
- Adding it to `aten/src/ATen/native/native_functions.yaml`
- Adding it to existing `test.test_mps.TestNLLLoss.test_unary_ops`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87147
Approved by: https://github.com/kulinseth
---
 aten/src/ATen/native/mps/operations/UnaryOps.mm | 14 ++++++++++++++
 aten/src/ATen/native/native_functions.yaml      |  1 +
 test/test_mps.py                                |  1 +
 3 files changed, 16 insertions(+)

diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 2763eff39f6a6..6b33e31341c8d 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -249,5 +249,19 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
                 });
 }
 
+TORCH_IMPL_FUNC(expm1_out_mps) (const Tensor& self, const Tensor& output) {
+  mps::unary_op(self, output, "expm1_out_mps",
+                ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
+                  MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0
+                                                       shape:@[@1]
+                                                       dataType:inputTensor.dataType];
+                  MPSGraphTensor* ePowTensor = [mpsGraph exponentWithTensor:inputTensor
+                                                                         name:nil];
+                  return [mpsGraph subtractionWithPrimaryTensor:ePowTensor
+                                               secondaryTensor:oneTensor
+                                                   name: nil];
+                });
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 2922e2be825b0..0954d1c662707 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -2334,6 +2334,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: expm1_out
+    MPS: expm1_out_mps
     SparseCPU, SparseCUDA: expm1_sparse_out
     SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 98df393c3e955..2b30ab926035b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4901,6 +4901,7 @@ def helper(shape, op):
 
         helper((2, 8, 4, 5), torch.exp)
         helper((2, 8, 3, 5), torch.exp2)
+        helper((2, 8, 3, 5), torch.expm1)
         helper((2, 8, 3, 5), torch.log)
         helper((2, 8, 3, 5), torch.cos)
 

From b6ae8dad725c207410172e7fb377470e91806a24 Mon Sep 17 00:00:00 2001
From: Ethan Pronovost <epronovo1@gmail.com>
Date: Wed, 26 Oct 2022 18:50:48 +0000
Subject: [PATCH 0184/1922] Add type annotations to distribution.py (#87577)

As title.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87577
Approved by: https://github.com/kit1980
---
 torch/distributions/distribution.py | 50 ++++++++++++++++-------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py
index 66bd158bd87b6..4159f34d7748a 100644
--- a/torch/distributions/distribution.py
+++ b/torch/distributions/distribution.py
@@ -2,7 +2,8 @@
 import warnings
 from torch.distributions import constraints
 from torch.distributions.utils import lazy_property
-from typing import Dict, Optional, Any
+from torch.types import _size
+from typing import Dict, Optional, Any, Tuple
 
 __all__ = ['Distribution']
 
@@ -16,7 +17,7 @@ class Distribution(object):
     _validate_args = __debug__
 
     @staticmethod
-    def set_default_validate_args(value):
+    def set_default_validate_args(value: bool) -> None:
         """
         Sets whether validation is enabled or disabled.
 
@@ -32,7 +33,12 @@ def set_default_validate_args(value):
             raise ValueError
         Distribution._validate_args = value
 
-    def __init__(self, batch_shape=torch.Size(), event_shape=torch.Size(), validate_args=None):
+    def __init__(
+        self,
+        batch_shape: torch.Size = torch.Size(),
+        event_shape: torch.Size = torch.Size(),
+        validate_args: Optional[bool] = None,
+    ):
         self._batch_shape = batch_shape
         self._event_shape = event_shape
         if validate_args is not None:
@@ -62,7 +68,7 @@ def __init__(self, batch_shape=torch.Size(), event_shape=torch.Size(), validate_
                     )
         super(Distribution, self).__init__()
 
-    def expand(self, batch_shape, _instance=None):
+    def expand(self, batch_shape: torch.Size, _instance=None):
         """
         Returns a new distribution instance (or populates an existing instance
         provided by a derived class) with batch dimensions expanded to
@@ -84,14 +90,14 @@ def expand(self, batch_shape, _instance=None):
         raise NotImplementedError
 
     @property
-    def batch_shape(self):
+    def batch_shape(self) -> torch.Size:
         """
         Returns the shape over which parameters are batched.
         """
         return self._batch_shape
 
     @property
-    def event_shape(self):
+    def event_shape(self) -> torch.Size:
         """
         Returns the shape of a single sample (without batching).
         """
@@ -116,34 +122,34 @@ def support(self) -> Optional[Any]:
         raise NotImplementedError
 
     @property
-    def mean(self):
+    def mean(self) -> torch.Tensor:
         """
         Returns the mean of the distribution.
         """
         raise NotImplementedError
 
     @property
-    def mode(self):
+    def mode(self) -> torch.Tensor:
         """
         Returns the mode of the distribution.
         """
         raise NotImplementedError(f"{self.__class__} does not implement mode")
 
     @property
-    def variance(self):
+    def variance(self) -> torch.Tensor:
         """
         Returns the variance of the distribution.
         """
         raise NotImplementedError
 
     @property
-    def stddev(self):
+    def stddev(self) -> torch.Tensor:
         """
         Returns the standard deviation of the distribution.
         """
         return self.variance.sqrt()
 
-    def sample(self, sample_shape=torch.Size()):
+    def sample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor:
         """
         Generates a sample_shape shaped sample or sample_shape shaped batch of
         samples if the distribution parameters are batched.
@@ -151,7 +157,7 @@ def sample(self, sample_shape=torch.Size()):
         with torch.no_grad():
             return self.rsample(sample_shape)
 
-    def rsample(self, sample_shape=torch.Size()):
+    def rsample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor:
         """
         Generates a sample_shape shaped reparameterized sample or sample_shape
         shaped batch of reparameterized samples if the distribution parameters
@@ -159,7 +165,7 @@ def rsample(self, sample_shape=torch.Size()):
         """
         raise NotImplementedError
 
-    def sample_n(self, n):
+    def sample_n(self, n: int) -> torch.Tensor:
         """
         Generates n samples or n batches of samples if the distribution
         parameters are batched.
@@ -167,7 +173,7 @@ def sample_n(self, n):
         warnings.warn('sample_n will be deprecated. Use .sample((n,)) instead', UserWarning)
         return self.sample(torch.Size((n,)))
 
-    def log_prob(self, value):
+    def log_prob(self, value: torch.Tensor) -> torch.Tensor:
         """
         Returns the log of the probability density/mass function evaluated at
         `value`.
@@ -177,7 +183,7 @@ def log_prob(self, value):
         """
         raise NotImplementedError
 
-    def cdf(self, value):
+    def cdf(self, value: torch.Tensor) -> torch.Tensor:
         """
         Returns the cumulative density/mass function evaluated at
         `value`.
@@ -187,7 +193,7 @@ def cdf(self, value):
         """
         raise NotImplementedError
 
-    def icdf(self, value):
+    def icdf(self, value: torch.Tensor) -> torch.Tensor:
         """
         Returns the inverse cumulative density/mass function evaluated at
         `value`.
@@ -197,7 +203,7 @@ def icdf(self, value):
         """
         raise NotImplementedError
 
-    def enumerate_support(self, expand=True):
+    def enumerate_support(self, expand: bool = True) -> torch.Tensor:
         """
         Returns tensor containing all values supported by a discrete
         distribution. The result will enumerate over dimension 0, so the shape
@@ -221,7 +227,7 @@ def enumerate_support(self, expand=True):
         """
         raise NotImplementedError
 
-    def entropy(self):
+    def entropy(self) -> torch.Tensor:
         """
         Returns entropy of distribution, batched over batch_shape.
 
@@ -230,7 +236,7 @@ def entropy(self):
         """
         raise NotImplementedError
 
-    def perplexity(self):
+    def perplexity(self) -> torch.Tensor:
         """
         Returns perplexity of distribution, batched over batch_shape.
 
@@ -239,7 +245,7 @@ def perplexity(self):
         """
         return torch.exp(self.entropy())
 
-    def _extended_shape(self, sample_shape=torch.Size()):
+    def _extended_shape(self, sample_shape: _size = torch.Size()) -> Tuple[int, ...]:
         """
         Returns the size of the sample returned by the distribution, given
         a `sample_shape`. Note, that the batch and event shapes of a distribution
@@ -253,7 +259,7 @@ def _extended_shape(self, sample_shape=torch.Size()):
             sample_shape = torch.Size(sample_shape)
         return sample_shape + self._batch_shape + self._event_shape
 
-    def _validate_sample(self, value):
+    def _validate_sample(self, value: torch.Tensor) -> None:
         """
         Argument validation for distribution methods such as `log_prob`,
         `cdf` and `icdf`. The rightmost dimensions of a value to be
@@ -306,7 +312,7 @@ def _get_checked_instance(self, cls, _instance=None):
                                       format(self.__class__.__name__, cls.__name__))
         return self.__new__(type(self)) if _instance is None else _instance
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         param_names = [k for k, _ in self.arg_constraints.items() if k in self.__dict__]
         args_string = ', '.join(['{}: {}'.format(p, self.__dict__[p]
                                 if self.__dict__[p].numel() == 1

From 9f0ad2d7beb0f789fda2ffb664758bd45da11565 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 26 Oct 2022 18:51:36 +0000
Subject: [PATCH 0185/1922] Revert "[pytorch] Layer norm backward speed gain
 with warp shuffles (#87445)"

This reverts commit b6f28334bc3276a56d79dea6cb7ed99411556348.

Reverted https://github.com/pytorch/pytorch/pull/87445 on behalf of https://github.com/weiwangmeta due to breaking internal builds due to MS compiler
---
 .../src/ATen/native/cuda/layer_norm_kernel.cu | 242 ++++--------------
 1 file changed, 54 insertions(+), 188 deletions(-)

diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
index 732545465d9c9..ae09f0aaad8f8 100644
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@@ -33,7 +33,6 @@ namespace {
 
 constexpr int kCUDANumThreads = 256;
 constexpr int kColwiseReduceTileSize = 32;
-constexpr int kWarpSize = 32;
 constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types
 
 // aligned vector generates vectorized load/store on CUDA (copy-pasted from MemoryAccess.cuh)
@@ -556,108 +555,8 @@ __global__ void GammaBetaBackwardCUDAKernel1(
   }
 }
 
-template <typename T, typename T_ACC>
-__global__ void GammaBetaBackwardCUDAKernel_32x32(
-    int64_t M,
-    int64_t N,
-    const T* dY,
-    const T* X,
-    const T_ACC* mean,
-    const T_ACC* rstd,
-    T* dg,
-    T* db) {
-  alignas(sizeof(double)) extern __shared__ char s_data1[];
-  T_ACC* s_data_typed = reinterpret_cast<T_ACC*>(&s_data1);
-  T_ACC* s_dg;
-  T_ACC* s_db;
 
-  T_ACC dg_sum = 0;
-  T_ACC db_sum = 0;
 
-  const int64_t j = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (j < N) {
-    constexpr int unroll_factor = 8;
-    int laneId = threadIdx.x & 0x1f;
-
-    T_ACC mean_reg, mean_reg_tmp;
-    T_ACC rstd_reg, rstd_reg_tmp;
-    T dY_reg;
-    T X_reg;
-
-    // Main loop
-    int bcounter;
-    for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor);
-         bcounter++) {
-      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
-
-      if (laneId < unroll_factor) {
-        mean_reg_tmp = mean[offset + laneId];
-        rstd_reg_tmp = rstd[offset + laneId];
-      }
-#if !defined(USE_ROCM)
-      // Volta and newer architectures allow lane divergence within a warp.
-      __syncwarp();
-#endif
-
-      #pragma unroll
-      for (int ii = 0; ii < unroll_factor; ++ii) {
-        dY_reg = dY[(offset + ii) * N + j];
-        X_reg = X[(offset + ii) * N + j];
-        mean_reg = WARP_SHFL(mean_reg_tmp, ii, kWarpSize);
-        rstd_reg = WARP_SHFL(rstd_reg_tmp, ii, kWarpSize);
-        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
-        db_sum += dY_reg;
-      }
-    }
-
-    // Remainder loop
-    int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
-    for (int ii = 0; ii < unroll_factor; ii++) {
-      if ((offset + ii) < M) {
-        mean_reg = mean[offset + ii];
-        rstd_reg = rstd[offset + ii];
-        dY_reg = dY[(offset + ii) * N + j];
-        X_reg = X[(offset + ii) * N + j];
-        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
-        db_sum += dY_reg;
-      }
-    }
-
-    // This kernel uses a block of (32 x 32) and gets called when M; N
-    // divide by 32. We can use warp shuffles for the final reduction
-    // step. This removes 4 shmem loads and stores with their
-    // corresponding __syncthreads()
-
-    // This greatly reduces bank conflicts at the expense of a little
-    // extra shared memory. It does not impact occupancy
-    int padded_bx = (1 + blockDim.x);
-
-    s_dg = s_data_typed;
-    s_db = s_data_typed + (padded_bx * blockDim.y);
-    s_dg[threadIdx.y * padded_bx + threadIdx.x] = dg_sum;
-    s_db[threadIdx.y * padded_bx + threadIdx.x] = db_sum;
-    __syncthreads();
-
-    // Load transposed so that a warp holds an entire column
-    T_ACC reg_dg = s_dg[threadIdx.x * padded_bx + threadIdx.y];
-    T_ACC reg_db = s_db[threadIdx.x * padded_bx + threadIdx.y];
-    for (int delta = 16; delta >= 1; delta /= 2) {
-      reg_dg += WARP_SHFL_XOR(reg_dg, delta, kWarpSize);
-      reg_db += WARP_SHFL_XOR(reg_db, delta, kWarpSize);
-    }
-
-    if (threadIdx.x == 0) {
-      const int64_t j = blockIdx.x * blockDim.x + threadIdx.y;
-      if (dg) {
-        dg[j] = reg_dg;
-      }
-      if (db) {
-        db[j] = reg_db;
-      }
-    }
-  }
-}
 
 template <typename T, typename T_ACC>
 __global__ void GammaBetaBackwardCUDAKernel(
@@ -670,75 +569,66 @@ __global__ void GammaBetaBackwardCUDAKernel(
     T* dg,
     T* db) {
   alignas(sizeof(double)) extern __shared__ char s_data1[];
-  T_ACC* s_data_typed = reinterpret_cast<T_ACC*>(&s_data1);
-  T_ACC* s_dg;
-  T_ACC* s_db;
-
+  T_ACC * s_data_typed = reinterpret_cast<T_ACC*>(&s_data1);
   const int64_t j = blockIdx.x * blockDim.x + threadIdx.x;
-
+  constexpr int unroll = 8;
+  T dYs[unroll];
+  T Xs[unroll];
+  T_ACC *  means = s_data_typed;
+  T_ACC * rstds = s_data_typed + unroll * blockDim.y;
   T_ACC dg_sum = 0;
   T_ACC db_sum = 0;
-
   if (j < N) {
-    constexpr int unroll_factor = 8;
-
-    T_ACC mean_reg;
-    T_ACC rstd_reg;
-    T dY_reg;
-    T X_reg;
-
-    // Main Loop
     int bcounter;
-    for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor); bcounter++){
-      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
+    for (bcounter = 0; bcounter < M/(blockDim.y * unroll); bcounter++){
+      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll;
+      #pragma unroll
+      for (int ii=0; ii<unroll; ii++){
+        if (threadIdx.x == 0) {
+          means[ii*blockDim.y + threadIdx.y] = mean[offset + ii];
+          rstds[ii*blockDim.y + threadIdx.y] = rstd[offset + ii];
+        }
+        dYs[ii] = dY[(offset + ii) * N + j ];
+        Xs[ii] = X[(offset + ii) * N + j];
 
+      }
+      __syncthreads();
       #pragma unroll
-      for (int ii = 0; ii < unroll_factor; ++ii) {
-        dY_reg = dY[(offset + ii) * N + j];
-        X_reg = X[(offset + ii) * N + j];
-        mean_reg = mean[offset + ii];
-        rstd_reg = rstd[offset + ii];
-        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
-        db_sum += dY_reg;
+      for (int ii=0; ii<unroll; ii++){
+        dg_sum += dYs[ii] * (Xs[ii] - means[ii*blockDim.y + threadIdx.y]) * rstds[ii * blockDim.y + threadIdx.y];
+        db_sum += dYs[ii];
       }
+      __syncthreads();
     }
-
-    // Remainder loop
-    int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
-    for (int ii = 0; ii < unroll_factor; ii++ ){
+    int offset = (bcounter * blockDim.y + threadIdx.y) * unroll;
+    for (int ii = 0; ii<8; ii++ ){
+      T_ACC mean_val, rstd_val; // we don't use smem in the tail to avoid awkward synchronizations, perf penalty is negligible
       if ((offset + ii) < M) {
-        dY_reg = dY[(offset + ii) * N + j ];
-        X_reg = X[(offset + ii) * N + j];
-        mean_reg = mean[offset + ii];
-        rstd_reg = rstd[offset + ii];
-        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
-        db_sum += dY_reg;
+        mean_val = mean[offset+ii];
+        rstd_val = rstd[offset+ii];
+        dYs[0] = dY[(offset + ii) * N + j ];
+        Xs[0] = X[(offset + ii) * N + j];
+        dg_sum += dYs[0] * (Xs[0] - mean_val) * rstd_val;
+        db_sum += dYs[0];
       }
     }
-
-    // Do the final reduction in shared memory
-    s_dg = s_data_typed;
-    s_db = s_data_typed + blockDim.x * blockDim.y;
-    s_dg[threadIdx.y * blockDim.x + threadIdx.x] = dg_sum;
-    s_db[threadIdx.y * blockDim.x + threadIdx.x] = db_sum;
+    s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] = dg_sum;
+    s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] = db_sum;
     __syncthreads();
-
-    for (int offset = blockDim.y / 2; offset >= 1; offset /= 2) {
+    for (int offset = blockDim.y/2; offset >=1; offset /= 2){
       if (threadIdx.y < offset) {
-        s_dg[threadIdx.y * blockDim.x + threadIdx.x] +=
-            s_dg[(threadIdx.y + offset) * blockDim.x + threadIdx.x];
-        s_db[threadIdx.y * blockDim.x + threadIdx.x] +=
-            s_db[(threadIdx.y + offset) * blockDim.x + threadIdx.x];
-        }
+        s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] += s_data_typed[(threadIdx.y + offset) * blockDim.x + threadIdx.x];
+        s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] +=
+        s_data_typed[blockDim.x * blockDim.y + (threadIdx.y + offset) * blockDim.x + threadIdx.x];
+      }
       __syncthreads();
     }
-
     if (threadIdx.y == 0) {
       if (dg) {
-        dg[j] = s_dg[threadIdx.x];
+        dg[j] = s_data_typed[threadIdx.x];
       }
       if (db) {
-        db[j] = s_db[threadIdx.x];
+        db[j] = s_data_typed[threadIdx.x + blockDim.x * blockDim.y];
       }
     }
   }
@@ -873,8 +763,7 @@ void LayerNormBackwardKernelImplInternal(
     T* dgamma_data =
         dgamma->defined() ? dgamma->template data_ptr<T>() : nullptr;
     T* dbeta_data = dbeta->defined() ? dbeta->template data_ptr<T>() : nullptr;
-
-    if (M < 128) {
+    if (M < 512) {
       // For small batch size, do colwise reduce directly.
       const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads;
       GammaBetaBackwardSimpleCUDAKernel<T, T_ACC>
@@ -889,42 +778,19 @@ void LayerNormBackwardKernelImplInternal(
               dbeta_data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else {
-      if ((M % kWarpSize == 0) && (N % kWarpSize == 0)) {
-        // This implementation relies on warp primitives and requires that M and N divide
-        // exactly to warp size.
-        dim3 threads{kWarpSize, kWarpSize};
-        int blocks = (N + threads.x - 1) / threads.x;
-
-        // If M and N divide by 32, we can use warp shuffles for the final reduction. That requires
-        // transposing values in shared memory, so we apply a padding to reduce bank conflicts.
-        size_t shmem_sz = 2 * sizeof(T_ACC) * (threads.x + 1) * threads.y;
-        GammaBetaBackwardCUDAKernel_32x32<T, T_ACC>
-            <<<blocks, threads, shmem_sz, cuda_stream>>>(
-                M,
-                N,
-                dY_data,
-                X_data,
-                mean_data,
-                rstd_data,
-                dgamma_data,
-                dbeta_data);
-          C10_CUDA_KERNEL_LAUNCH_CHECK();
-      } else {
-        dim3 threads{16, 32};
-        int blocks = (N + threads.x - 1) / threads.x;
-        size_t shmem_sz = 2 * sizeof(T_ACC) * threads.x * threads.y;
-        GammaBetaBackwardCUDAKernel<T, T_ACC>
-            <<<blocks, threads, shmem_sz, cuda_stream>>>(
-                M,
-                N,
-                dY_data,
-                X_data,
-                mean_data,
-                rstd_data,
-                dgamma_data,
-                dbeta_data);
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
-      }
+      dim3 threads{16, 32};
+      int blocks = (N + threads.x-1)/threads.x;
+      GammaBetaBackwardCUDAKernel<T, T_ACC>
+          <<<blocks, threads, 2 * sizeof(T_ACC) * threads.x * threads.y, cuda_stream>>>(
+              M,
+              N,
+              dY_data,
+              X_data,
+              mean_data,
+              rstd_data,
+              dgamma_data,
+              dbeta_data);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
     }
   }
 }

From 7455967dbdfa6db019638b9cb8886114647e2349 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 25 Oct 2022 22:15:46 -0700
Subject: [PATCH 0186/1922] [ao][ns] Replacing List[QConfigMapping] in PNP
 (#86922)

Summary: Added QConfigMultiMapping which is essentially a
List[QConfigMapping] with set methods and dedicated handling to
avoid unwanted matches and improve UX.

note: the from __future__ import annotations line caused weird errors when the
QConfigMultiMapping class was put in _numeric_suite_fx.py so it was moved.

Test Plan: python test/test_quantization.py TestFxNumericSuiteNShadows

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86922
Approved by: https://github.com/vkuzo
---
 test/quantization/fx/test_numeric_suite_fx.py | 229 +++++++++++++++--
 torch/ao/ns/_numeric_suite_fx.py              |  18 +-
 torch/ao/ns/fx/qconfig_multi_mapping.py       | 242 ++++++++++++++++++
 3 files changed, 452 insertions(+), 37 deletions(-)
 create mode 100644 torch/ao/ns/fx/qconfig_multi_mapping.py

diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py
index 27fe772d2e228..7f46cf0a442b3 100644
--- a/test/quantization/fx/test_numeric_suite_fx.py
+++ b/test/quantization/fx/test_numeric_suite_fx.py
@@ -31,6 +31,7 @@
     LSTMwithHiddenDynamicModel,
     SparseNNModel,
     skip_if_no_torchvision,
+    TwoLayerLinearModel
 )
 from torch.ao.quantization.quantization_mappings import (
     get_default_static_quant_module_mappings,
@@ -82,6 +83,7 @@
     loggers_set_enabled,
     loggers_set_save_activations,
 )
+from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping
 from torch.ao.quantization.backend_config import get_native_backend_config
 from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers
 
@@ -2096,6 +2098,7 @@ def _test_impl(self, m, example_input, qconfig_mappings):
 
         results = extract_results_n_shadows_model(msq)
         print_comparisons_n_shadows_model(results)
+        return msq
 
     def test_linear_mod(self):
         class M(nn.Module):
@@ -2110,9 +2113,8 @@ def forward(self, x):
         m = M().eval()
         example_input = (torch.randn(2, 2),)
 
-        qconfig_mappings = [
-            QConfigMapping().set_global(torch.quantization.default_qconfig),
-        ]
+        qconfig_mappings = \
+            QConfigMultiMapping().set_global([torch.quantization.default_qconfig])
         self._test_impl(m, example_input, qconfig_mappings)
 
     def test_linear_relu_mod(self):
@@ -2132,10 +2134,12 @@ def forward(self, x):
         m = M().eval()
         example_input = (torch.randn(2, 2),)
 
-        qconfig_mappings = [
-            QConfigMapping().set_global(torch.quantization.default_qconfig),
-            QConfigMapping().set_global(torch.quantization.default_dynamic_qconfig),
-        ]
+        qconfig_mappings = (
+            QConfigMultiMapping().set_global([
+                torch.quantization.default_qconfig,
+                torch.quantization.default_dynamic_qconfig
+            ])
+        )
         self._test_impl(m, example_input, qconfig_mappings)
 
     def test_conv_bn_relu_mod(self):
@@ -2154,10 +2158,12 @@ def forward(self, x):
 
         m = M().eval()
         example_input = (torch.randn(32, 1, 16, 16),)
-        qconfig_mappings = [
-            QConfigMapping().set_global(torch.quantization.default_qconfig),
-            QConfigMapping().set_global(torch.quantization.default_per_channel_qconfig),
-        ]
+
+        qconfig_mappings = QConfigMultiMapping() \
+            .set_global([
+                torch.quantization.default_qconfig,
+                torch.quantization.default_per_channel_qconfig
+            ])
         self._test_impl(m, example_input, qconfig_mappings)
 
     def test_functions(self):
@@ -2194,10 +2200,8 @@ def forward(self, x):
         m = M().eval()
         example_input = (torch.randn(2, 2),)
 
-        qconfig_mappings = [
-            QConfigMapping().set_global(torch.quantization.default_qconfig),
-            # QConfigMapping().set_global(torch.quantization.default_per_channel_qconfig),
-        ]
+        qconfig_mappings = QConfigMultiMapping() \
+            .set_global([torch.quantization.default_qconfig])
         self._test_impl(m, example_input, qconfig_mappings)
 
     def test_partial_qconfig_mapping(self):
@@ -2220,19 +2224,17 @@ def forward(self, x):
         example_input = (torch.randn(2, 2),)
         qconfig = torch.ao.quantization.default_qconfig
 
-        qconfig_mappings = [
-            QConfigMapping().set_global(None)
-                            .set_object_type(F.linear, qconfig)
-                            .set_object_type(F.relu, qconfig),
-        ]
+        qconfig_mappings = QConfigMultiMapping() \
+            .set_object_type(F.linear, [qconfig]) \
+            .set_object_type(F.relu, [qconfig])
         self._test_impl(m, example_input, qconfig_mappings)
 
     def test_logger_enabled_and_save_activations_flags(self):
         m = nn.Sequential(nn.Linear(1, 1)).eval()
         example_input = (torch.randn(1, 1),)
-        qconfig_mappings = [
-            QConfigMapping().set_global(torch.quantization.default_qconfig),
-        ]
+
+        qconfig_mappings = QConfigMultiMapping() \
+            .set_global([torch.quantization.default_qconfig])
         backend_config = get_native_backend_config()
 
         msp = prepare_n_shadows_model(
@@ -2281,12 +2283,187 @@ def test_mobilenet_v2(self):
             pretrained=False, quantize=False).eval()
         example_input = (torch.randn(1, 3, 224, 224),)
 
-        qconfig_mappings = [
+        qconfig_mappings = QConfigMultiMapping() \
+            .set_global([torch.quantization.default_qconfig, torch.quantization.default_dynamic_qconfig])
+
+        self._test_impl(m, example_input, qconfig_mappings)
+
+    def test_qconfig_multi_mapping_deduplication(self):
+        # check that insertion deduplicates qconfigs
+        qconfig_multi_mapping = QConfigMultiMapping().set_global(
+            [torch.quantization.default_qconfig, torch.quantization.default_qconfig]
+        )
+        self.assertEqual(len(qconfig_multi_mapping.qconfig_mappings_list), 1)
+
+    def test_qconfig_multi_mapping_insert_padding(self):
+        # test that inserting a higher priority qconfig style with fewer elements than a lower priority qconfig will
+        # result in adding None to the extra QConfigMappings at that same style+key
+        qconfig_multi_mapping = (
+            QConfigMultiMapping()
+            .set_global(
+                [
+                    torch.quantization.default_qconfig,
+                    torch.quantization.default_dynamic_qconfig,
+                ]
+            )
+            .set_object_type(torch.nn.Linear, [torch.quantization.default_qconfig])
+            .set_module_name_regex("fc", [torch.quantization.default_qconfig])
+            .set_module_name("fc2", [torch.quantization.default_qconfig])
+            .set_module_name_object_type_order(
+                "", nn.Linear, 0, [torch.quantization.default_qconfig]
+            )
+        )
+
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[1].object_type_qconfigs[
+                torch.nn.Linear
+            ],
+            None,
+        )
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[1].module_name_regex_qconfigs[
+                "fc"
+            ],
+            None,
+        )
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[1].module_name_qconfigs["fc2"],
+            None,
+        )
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[
+                1
+            ].module_name_object_type_order_qconfigs[("", nn.Linear, 0)],
+            None,
+        )
+
+    def test_qconfig_multi_mapping_retroactive_padding(self):
+        # test that inserting a lower priority qconfig style with more elements thhan lower priority qconfig styles
+        # will result in the new QConfigMapping having None at all previously existing styles+keys
+        qconfig_multi_mapping = (
+            QConfigMultiMapping()
+            .set_object_type(torch.nn.Linear, [torch.quantization.default_qconfig])
+            .set_module_name_regex("fc", [torch.quantization.default_qconfig])
+            .set_module_name("fc2", [torch.quantization.default_qconfig])
+            .set_module_name_object_type_order(
+                "", nn.Linear, 0, [torch.quantization.default_qconfig]
+            )
+            .set_global(
+                [
+                    torch.quantization.default_qconfig,
+                    torch.quantization.default_dynamic_qconfig,
+                ]
+            )
+        )
+
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[1].object_type_qconfigs[
+                torch.nn.Linear
+            ],
+            None,
+        )
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[1].module_name_regex_qconfigs[
+                "fc"
+            ],
+            None,
+        )
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[1].module_name_qconfigs["fc2"],
+            None,
+        )
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[
+                1
+            ].module_name_object_type_order_qconfigs[("", nn.Linear, 0)],
+            None,
+        )
+
+    def test_qconfig_multi_mapping_end_to_end(self):
+        # test that the prepare/convert_n_shadows_model works as expected
+        # with qconfig_multi_mapping and avoids unwanted matches
+
+        m = TwoLayerLinearModel().eval()
+        example_input = m.get_example_inputs()
+
+        qconfig_multi_mapping = (
+            QConfigMultiMapping()
+            .set_global(
+                [
+                    torch.quantization.default_qconfig,
+                    torch.quantization.default_dynamic_qconfig,
+                ]
+            )
+            .set_module_name("fc2", [None, torch.quantization.default_qconfig])
+        )
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[1].module_name_qconfigs["fc2"],
+            None,
+        )
+        msq = self._test_impl(m, example_input, qconfig_multi_mapping)
+
+        self.checkQuantizedLinear(msq.shadow_wrapper_0_1.mod_0)
+        self.checkDynamicQuantizedLinear(msq.shadow_wrapper_0_2.mod_0, torch.qint8)
+        self.checkQuantizedLinear(msq.shadow_wrapper_1_1.mod_0)
+        self.assertRaisesRegex(AttributeError, ".*", lambda: msq.shadow_wrapper_1_2)
+
+    def test_qconfig_multi_mapping_from_list(self):
+        # test QConfigMultiMapping.from_list_qconfig_mapping works as expected
+
+        m = TwoLayerLinearModel().eval()
+        example_input = m.get_example_inputs()
+
+        qconfig_mappings_list = [
             QConfigMapping().set_global(torch.quantization.default_qconfig),
-            QConfigMapping().set_global(torch.quantization.default_dynamic_qconfig),
+            QConfigMapping()
+            .set_global(torch.quantization.default_dynamic_qconfig)
+            .set_module_name("fc2", torch.quantization.default_qconfig),
         ]
-        self._test_impl(m, example_input, qconfig_mappings)
 
+        qconfig_multi_mapping = QConfigMultiMapping().from_list_qconfig_mapping(
+            qconfig_mappings_list
+        )
+        self.assertEqual(
+            qconfig_multi_mapping.qconfig_mappings_list[1].module_name_qconfigs["fc2"],
+            None,
+        )
+
+        msq = self._test_impl(m, example_input, qconfig_multi_mapping)
+
+        self.checkQuantizedLinear(msq.shadow_wrapper_0_1.mod_0)
+        self.checkDynamicQuantizedLinear(msq.shadow_wrapper_0_2.mod_0, torch.qint8)
+        self.checkQuantizedLinear(msq.shadow_wrapper_1_1.mod_0)
+        self.assertRaisesRegex(AttributeError, ".*", lambda: msq.shadow_wrapper_1_2)
+
+    def test_qconfig_multi_mapping_ordering(self):
+        # test that the module ordering ignores None
+
+        m = TwoLayerLinearModel().eval()
+        example_input = m.get_example_inputs()
+        qconfig_multi_mapping = (
+            QConfigMultiMapping()
+            .set_global(
+                [
+                    torch.ao.quantization.default_qconfig,
+                    torch.ao.quantization.default_dynamic_qconfig,
+                ]
+            )
+            .set_module_name(
+                "fc2",
+                [
+                    None,
+                    torch.ao.quantization.default_dynamic_qconfig,
+                    torch.ao.quantization.default_qat_qconfig_v2,
+                ],
+            )
+        )
+        self.assertEqual(len(qconfig_multi_mapping.qconfig_mappings_list), 2)
+        msq = self._test_impl(m, example_input, qconfig_multi_mapping)
+
+        self.checkQuantizedLinear(msq.shadow_wrapper_0_1.mod_0)
+        self.checkDynamicQuantizedLinear(msq.shadow_wrapper_0_2.mod_0, torch.qint8)
+        self.checkDynamicQuantizedLinear(msq.shadow_wrapper_1_1.mod_0, torch.qint8)
+        self.checkQuantizedLinear(msq.shadow_wrapper_1_2.mod_0)
 
 class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase):
     """
diff --git a/torch/ao/ns/_numeric_suite_fx.py b/torch/ao/ns/_numeric_suite_fx.py
index f586de58531a7..860430c40b9f9 100644
--- a/torch/ao/ns/_numeric_suite_fx.py
+++ b/torch/ao/ns/_numeric_suite_fx.py
@@ -119,10 +119,6 @@
     NSResultsType,
     NSNodeTargetType,
 )
-
-from torch.ao.quantization import (
-    QConfigMapping,
-)
 from torch.ao.quantization.backend_config.utils import get_fusion_pattern_to_root_node_getter
 from torch.ao.quantization.backend_config import BackendConfig
 from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers
@@ -138,6 +134,7 @@
     print_n_shadows_summary,
     handle_subgraph,
 )
+from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping
 
 from typing import Dict, Tuple, Callable, List, Optional, Set, Any, Type
 
@@ -753,7 +750,7 @@ def extend_logger_results_with_comparison(
 def prepare_n_shadows_model(
     model: torch.nn.Module,
     example_inputs: Any,
-    qconfig_mappings: List[QConfigMapping],
+    qconfig_multi_mapping: QConfigMultiMapping,
     backend_config: BackendConfig,
 ) -> torch.nn.Module:
     """
@@ -770,9 +767,9 @@ def prepare_n_shadows_model(
 
       args_kwargs_m -> op_m -> output_m
            |                        |
-           |---------------------------> mod_with_op_m_transformed_with_qconfig_i
+           |---------------------------> mod_with_op_m_transformed_with_qconfig_n
 
-    Where mod_with_op_m_transformed_with_qconfig_i is a submodule, and its
+    Where mod_with_op_m_transformed_with_qconfig_n is a submodule, and its
     inner graph looks like
 
     .. code::
@@ -790,8 +787,7 @@ def prepare_n_shadows_model(
     1. add deduplication for qconfigs per subgraph
     2. figure out a better way to name the output structure
     3. return a results data structure instead of printing it out
-    4. make specifying sets of QConfigMapping more user friendly
-    5. add examples to docblocks
+    4. add examples to docblocks
     """
 
     tracer = quantize_fx.QuantizationTracer([], [])
@@ -822,7 +818,7 @@ def prepare_n_shadows_model(
     # generate node to qconfig for each subgraph
     # TODO(future PR): deduplicate repeating entries
     list_of_node_name_to_qconfig: List[Dict[str, QConfigAny]] = []
-    for qconfig_mapping in qconfig_mappings:
+    for qconfig_mapping in qconfig_multi_mapping.qconfig_mappings_list:
         node_name_to_qconfig = generate_node_name_to_qconfig(
             mt, modules, mt.graph, qconfig_mapping, tracer.node_name_to_scope)
         list_of_node_name_to_qconfig.append(node_name_to_qconfig)
@@ -838,7 +834,7 @@ def prepare_n_shadows_model(
             enumerate(subgraphs_dedup.items()):
         handle_subgraph(
             mt, subgraph_idx, match_name, nodes_in_this_subgraph,
-            qconfig_mappings, list_of_node_name_to_qconfig)
+            qconfig_multi_mapping.qconfig_mappings_list, list_of_node_name_to_qconfig)
 
     mt.recompile()
     return mt
diff --git a/torch/ao/ns/fx/qconfig_multi_mapping.py b/torch/ao/ns/fx/qconfig_multi_mapping.py
new file mode 100644
index 0000000000000..bff2640e1feb3
--- /dev/null
+++ b/torch/ao/ns/fx/qconfig_multi_mapping.py
@@ -0,0 +1,242 @@
+from __future__ import annotations
+
+import copy
+from typing import Any, Callable, Dict, List, Union
+
+import torch
+from torch.ao.quantization import QConfigMapping
+from torch.ao.quantization.qconfig import QConfigAny
+
+__all__ = ["QConfigMultiMapping"]
+
+_QCONFIG_STYLE_ORDER: List[str] = [
+    "global_qconfig",
+    "object_type_qconfigs",
+    "module_name_regex_qconfigs",
+    "module_name_qconfigs",
+    "module_name_object_type_order_qconfigs",
+]
+
+_QCONFIG_STYLE_TO_METHOD: Dict[str, str] = {
+    "global_qconfig": "set_global",
+    "object_type_qconfigs": "set_object_type",
+    "module_name_regex_qconfigs": "set_module_name_regex",
+    "module_name_qconfigs": "set_module_name",
+    "module_name_object_type_order_qconfigs": "set_module_name_object_type_order",
+}
+
+def _remove_duplicates_and_none(qconfig_list: List[QConfigAny]) -> None:
+    to_remove = []
+    for index, cur_qconfig in enumerate(qconfig_list):
+        if cur_qconfig is None:
+            to_remove.append(index)
+            break
+        for checked_qconfig in qconfig_list[:index]:
+            if torch.ao.quantization.qconfig_equals(cur_qconfig, checked_qconfig):
+                to_remove.append(index)
+                break
+    for index in to_remove[::-1]:
+        qconfig_list.pop(index)
+
+class QConfigMultiMapping:
+    """
+    This class, used with the prepare_n_shadows_model API, stores a list of :class:`torch.ao.quantization.QConfigMapping`s
+    so that multiple QConfigs can be specified for each QConfig matching style.
+
+    The user can specify QConfigs using the following methods (in increasing match priority):
+
+        ``set_global`` : sets the global (default) QConfigs
+
+        ``set_object_type`` : sets the QConfigs for a given module type, function, or method name
+
+        ``set_module_name_regex`` : sets the QConfigs for modules matching the given regex string
+
+        ``set_module_name`` : sets the QConfigs for modules matching the given module name
+
+        ``set_module_name_object_type_order`` : sets the QConfigs for modules matching a combination
+        of the given module name, object type, and the index at which the module appears
+
+    Note: Usage of set methods is the same as in QConfigMapping except with a passed in list of QConfigs rather than a
+    single QConfig.
+
+    Example usage::
+
+        qconfig_mapping = QConfigMultiMapping()
+            .set_global([qconfig1, qconfig2])
+            .set_object_type(torch.nn.Linear, [qconfig2, qconfig3])
+            .set_object_type(torch.nn.ReLU, [qconfig1])
+            .set_module_name_regex("foo.*bar.*conv[0-9]+", [qconfig2])
+            .set_module_name_regex("foo.*", [qconfig1, qconfig2, qconfig3])
+            .set_module_name("module1", [None])
+            .set_module_name("module2", [qconfig2])
+            .set_module_name_object_type_order("foo.bar", torch.nn.functional.linear, 0, [qconfig3])
+
+    """
+
+    def __init__(self):
+        # initialize this with 1 QConfigMapping to avoid corner cases
+        self.qconfig_mappings_list: List[QConfigMapping] = [QConfigMapping()]
+
+    def _handle_list_size_mismatch(
+        self, qconfig_list: List[QConfigAny], style: str
+    ) -> None:
+        # this method handles cases where the size of qconfig_list does not match
+        # the size of qconfig_mappings_list.
+        # Issue: Consider a user inserting global_qconfig A and B first, then inserting
+        # qconfig C as an object_type_qconfig for conv ops. If we internally store
+        # 1 QConfigMapping with A and C and another with just B, then the
+        # second QConfigMapping will match B to conv ops (which is not wanted), since B is global.
+
+        # we avoid this by maintaining the invariant that if any QConfigMapping
+        # has a qconfig style+key with a qconfig in it, all QConfigMappings must
+        # have either a qconfig or None for that same style+key. In the above
+        # example, a None qconfig would prevent the unwanted match in the
+        # second QConfigMapping
+
+        if len(qconfig_list) > len(self.qconfig_mappings_list):
+            # Case: we have more qconfigs (in qconfig_list) than QConfigMappings
+
+            # Add new QConfigMappings (initialized so we maintain the `invariant`)
+
+            new_qconfig_mapping = QConfigMapping()
+            # searches other QConfigMappings for qconfig style+keys
+            # that need to be inserted as `None` into the new QConfigMapping
+            for qconfig_mapping in self.qconfig_mappings_list:
+
+                # global_qconfig has None by default
+                for check_style in _QCONFIG_STYLE_ORDER[1:]:
+                    qconfigs_dict = getattr(qconfig_mapping, check_style)
+                    target_qconfigs_dict = getattr(new_qconfig_mapping, check_style)
+                    for key in qconfigs_dict:
+                        target_qconfigs_dict[key] = None
+                break
+
+            # insert copies of this new QConfigMapping until all entires
+            # in qconfig_list can fit among the QConfigMappings
+            while len(qconfig_list) > len(self.qconfig_mappings_list):
+                self.qconfig_mappings_list.append(copy.deepcopy(new_qconfig_mapping))
+        else:
+            # Case: we have fewer qconfigs in qconfig_list than QConfigMappings
+
+            # pad qconfig_list with `None` until length is same
+            while len(qconfig_list) < len(self.qconfig_mappings_list):
+                qconfig_list.append(None)
+
+    # this function applies the insertion method across each QConfigMapping
+    def _insert_qconfig_list(
+        self,
+        style: str,
+        args: List[Union[str, int, Callable]],
+        qconfig_list: List[QConfigAny],
+    ) -> None:
+
+        # we remove duplicates and None to make the ordering of qconfigs
+        # deterministic upon insertion.
+        _remove_duplicates_and_none(qconfig_list)
+
+        self._handle_list_size_mismatch(qconfig_list, style)
+        method_name = _QCONFIG_STYLE_TO_METHOD[style]
+        for qconfig_mapping, qconfig in zip(self.qconfig_mappings_list, qconfig_list):
+            # uses QConfigMapping set method to insert qconfig
+            set_method = getattr(qconfig_mapping, method_name)
+            set_method(*args, qconfig)
+
+    def set_global(self, global_qconfig_list: List[QConfigAny]) -> QConfigMultiMapping:
+        """
+        Set global QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_global()` for more info
+        """
+        self._insert_qconfig_list("global_qconfig", [], global_qconfig_list)
+        return self
+
+    def set_object_type(
+        self, object_type: Union[Callable, str], qconfig_list: List[QConfigAny]
+    ) -> QConfigMultiMapping:
+        """
+        Set object type QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_object_type()` for more info
+        """
+        self._insert_qconfig_list("object_type_qconfigs", [object_type], qconfig_list)
+        return self
+
+    def set_module_name_regex(
+        self, module_name_regex: str, qconfig_list: List[QConfigAny]
+    ) -> QConfigMultiMapping:
+        """
+        Set module_name_regex QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_regex()` for more info
+        """
+        self._insert_qconfig_list(
+            "module_name_regex_qconfigs", [module_name_regex], qconfig_list
+        )
+        return self
+
+    def set_module_name(
+        self, module_name: str, qconfig_list: List[QConfigAny]
+    ) -> QConfigMultiMapping:
+        """
+        Set module_name QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name()` for more info
+        """
+        self._insert_qconfig_list("module_name_qconfigs", [module_name], qconfig_list)
+        return self
+
+    def set_module_name_object_type_order(
+        self,
+        module_name: str,
+        object_type: Callable,
+        index: int,
+        qconfig_list: List[QConfigAny],
+    ) -> QConfigMultiMapping:
+        """
+        Set module_name QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_object_type_order()` for more info
+        """
+        self._insert_qconfig_list(
+            "module_name_object_type_order_qconfigs",
+            [module_name, object_type, index],
+            qconfig_list,
+        )
+        return self
+
+    @classmethod
+    def from_list_qconfig_mapping(
+        cls, qconfig_mapping_list: List[QConfigMapping]
+    ) -> QConfigMultiMapping:
+        """
+        Creates a QConfigMultiMapping from a list of QConfigMappings
+        """
+        new_qconfig_multi_mapping = cls()
+
+        new_qconfig_multi_mapping.qconfig_mappings_list = copy.deepcopy(
+            qconfig_mapping_list
+        )
+
+        # we need to avoid the issue described in _handle_list_size_mismatch,
+        # so we reinsert all the qconfigs using the QConfigMultiMapping
+        # set methods
+
+        # go through all qconfig styles
+        # note: global can be ignored since it is None by default
+        for style in _QCONFIG_STYLE_ORDER[1:]:
+
+            # gather all key+qconfigs for current style
+            # into qconfig_dict_list
+            qconfig_dict_list: Dict[Any, List[QConfigAny]] = {}
+            for qconfig_mapping in qconfig_mapping_list:
+                qconfig_dict = getattr(qconfig_mapping, style)
+                for key, qconfig in qconfig_dict.items():
+                    if key not in qconfig_dict_list:
+                        qconfig_dict_list[key] = []
+                    qconfig_dict_list[key].append(qconfig)
+
+            # reinsert all gathered key+qconfigs
+            set_method_name = _QCONFIG_STYLE_TO_METHOD[style]
+            set_method = getattr(new_qconfig_multi_mapping, set_method_name)
+            for key, qconfig_list in qconfig_dict_list.items():
+                if isinstance(key, tuple):
+                    set_method(*key, qconfig_list)
+                else:
+                    set_method(key, qconfig_list)
+
+        return new_qconfig_multi_mapping

From c65a40d5ded783b3f762c8ca8b0d4e5a1aa3251e Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 26 Oct 2022 16:13:20 +0000
Subject: [PATCH 0187/1922] Enable some PyTorch core tests with inductor
 (#87490)

Summary:
1) Graph break on torch.random.set_rng_state since it blocks running
inductor core tests;
2) Add several inductor-specific skips;
3) Enable several core tests for inductor CI;

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87490
Approved by: https://github.com/eellison
---
 .jenkins/pytorch/test.sh         | 11 ++++-------
 test/dynamo/test_repros.py       |  2 ++
 test/test_modules.py             |  6 +++++-
 test/test_ops.py                 |  6 ++++++
 test/test_ops_gradients.py       |  6 ++++--
 torch/_dynamo/variables/torch.py |  3 +++
 6 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 94896701771c6..89fbd764201a1 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -251,13 +251,10 @@ test_dynamo_shard() {
 
 
 test_inductor() {
-  echo "TODO: enable inductor unit tests"
-  # time python test/run_test.py --core --exclude test_autograd --continue-through-error --verbose
-
-  # PYTORCH_TEST_WITH_DYNAMO and PYTORCH_TEST_WITH_INDUCTOR are only needed for PyTorch tests not written with
-  # using dynamo/inductor. For dynamo/inductor unit tests, specifiying them will trigger an error like
-  # "Detected two calls to `torchdynamo.optimize(...)` with a different backend compiler arguments."
-  # PYTORCH_TEST_WITH_DYNAMO=0 PYTORCH_TEST_WITH_INDUCTOR=0 pytest test/inductor
+  python test/test_modules.py --verbose
+  # TODO: investigate "RuntimeError: CUDA driver API confirmed a leak"
+  # seen intest_ops_gradients.py
+  # pytest test/test_ops_gradients.py --verbose -k "not _complex and not test_inplace_grad_acos_cuda_float64"
 }
 
 test_inductor_huggingface_shard() {
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 66fc19895dd62..41564952a7444 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1016,6 +1016,8 @@ def test_create_rand_mask_from_inputs(self):
         self.assertEqual(cnt.frame_count, 1)
         self.assertEqual(cnt.op_count, 8)
 
+    # TODO: make set_rng_state work with FakeTensor/aot_autograd
+    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_rng_state(self):
         def fn():
             state = torch.get_rng_state()
diff --git a/test/test_modules.py b/test/test_modules.py
index e06f0cc617d99..2f5008244d548 100644
--- a/test/test_modules.py
+++ b/test/test_modules.py
@@ -11,7 +11,8 @@
     instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta)
 from torch.testing._internal.common_modules import module_db, modules, TrainEvalMode
 from torch.testing._internal.common_utils import (
-    TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, gradgradcheck, skipIfMps)
+    TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck,
+    gradgradcheck, skipIfMps, skipIfTorchInductor)
 from unittest.mock import patch, call
 
 
@@ -326,6 +327,7 @@ def inner_zero_grad(obj):
 
     @skipIfMps
     @modules(module_db)
+    @skipIfTorchInductor("to be fixed")
     def test_non_contiguous_tensors(self, device, dtype, module_info, training):
         # Check modules work with non-contiguous tensors
 
@@ -489,6 +491,7 @@ def test_gradgrad(self, device, dtype, module_info, training):
     @toleranceOverride({torch.float32: tol(5e-2, 0),
                         torch.float64: tol(4e-4, 0)})
     @modules(module_db)
+    @skipIfTorchInductor("to be fixed")
     def test_cpu_gpu_parity(self, device, dtype, module_info, training):
         # TODO: RNN / GRU / LSTM don't support backwards on eval mode for cuDNN; skip this in a
         # nicer way for eval mode only.
@@ -579,6 +582,7 @@ def check_backward(cpu_output, gpu_output):
 
     @skipIfMps
     @modules(module_db)
+    @skipIfTorchInductor("to be fixed")
     def test_memory_format(self, device, dtype, module_info, training):
         is_sm86 = device.startswith("cuda") and torch.cuda.get_device_capability(0) == (8, 6)
         # TODO tighten it to a specific module
diff --git a/test/test_ops.py b/test/test_ops.py
index 5e9371e982341..0e5b6f1d607dd 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -36,6 +36,7 @@
     first_sample,
     parametrize,
     skipIfSlowGradcheckEnv,
+    skipIfTorchInductor,
     slowTest,
 )
 from torch.testing._internal.common_methods_invocations import (
@@ -209,6 +210,7 @@ def to_cpu(arg):
     @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
     @onlyNativeDeviceTypes
     @ops(python_ref_db)
+    @skipIfTorchInductor("Takes too long for inductor")
     def test_python_ref_meta(self, device, dtype, op):
         with FakeTensorMode() as mode:
             pass
@@ -374,6 +376,7 @@ def _distance(a, b):
     @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
     @onlyNativeDeviceTypes
     @ops(python_ref_db)
+    @skipIfTorchInductor("Takes too long for inductor")
     def test_python_ref(self, device, dtype, op):
         # In this test, primTorch refs call into the refs namespace
         # For example, a ref with torch.foo in it will calls refs.foo instead
@@ -386,6 +389,7 @@ def test_python_ref(self, device, dtype, op):
     @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
     @onlyNativeDeviceTypes
     @ops(python_ref_db)
+    @skipIfTorchInductor("Takes too long for inductor")
     def test_python_ref_torch_fallback(self, device, dtype, op):
         # In this test, refs call into the torch namespace (after the initial invocation)
         # For example, a ref with torch.foo in it will call torch.foo instead of refs.foo
@@ -397,6 +401,7 @@ def test_python_ref_torch_fallback(self, device, dtype, op):
     @skipCUDAIfRocm
     @ops(python_ref_db)
     @parametrize('executor', ['aten', 'nvfuser'])
+    @skipIfTorchInductor("Takes too long for inductor")
     def test_python_ref_executor(self, device, dtype, op, executor):
         # TODO: Not all dtypes are supported with nvfuser
         from torch._prims_common import _torch_dtype_to_nvfuser_dtype_map
@@ -457,6 +462,7 @@ def test_errors(self, device, op):
     @skipMeta
     @onlyNativeDeviceTypes
     @ops([op for op in python_ref_db if op.error_inputs_func is not None], dtypes=OpDTypes.none)
+    @skipIfTorchInductor("Takes too long for inductor")
     def test_python_ref_errors(self, device, op):
         mode = FakeTensorMode()
         with mode:
diff --git a/test/test_ops_gradients.py b/test/test_ops_gradients.py
index 0411f043df9c0..6d517c7a7f8b1 100644
--- a/test/test_ops_gradients.py
+++ b/test/test_ops_gradients.py
@@ -4,8 +4,9 @@
 from itertools import chain
 import torch
 
-from torch.testing._internal.common_utils import \
-    (TestCase, is_iterable_of_tensors, run_tests, gradcheck, gradgradcheck, is_slow_gradcheck_env)
+from torch.testing._internal.common_utils import (
+    TestCase, is_iterable_of_tensors, run_tests, gradcheck, gradgradcheck, is_slow_gradcheck_env,
+    skipIfTorchInductor)
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, ops, OpDTypes)
@@ -253,6 +254,7 @@ def test_forward_mode_AD(self, device, dtype, op):
         self._forward_grad_helper(device, dtype, op, op.get_op(), is_inplace=False)
 
     @_gradcheck_ops(op_db)
+    @skipIfTorchInductor("to be fixed")
     def test_inplace_forward_mode_AD(self, device, dtype, op):
         self._skip_helper(op, device, dtype)
 
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index e0c88b2cf059a..36ca6591189de 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -320,6 +320,9 @@ def get_state_from_generator():
             assert isinstance(args[0], TensorVariable)
 
             if config.fake_tensor_propagation:
+                unimplemented(
+                    "TODO: make torch.random.set_rng_state work with FakeTensor/aot_autograd"
+                )
                 # In fake tensor case, this state doesn't matter, but
                 # it needs to be valid to not segfault. Pull a real tensor out.
                 # The value won't matter since we are running with fake tensors anyway, so rng doesn't matter.

From 3ea401402e31a80ac31a30d500d00be6e680ddf4 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 26 Oct 2022 19:23:55 +0000
Subject: [PATCH 0188/1922] Set check-latest to false when setup python and pip
 cache in CI (#87621)

I missed the fine print in https://github.com/actions/setup-python/blob/main/README.md#caching-packages-dependencies when setting up the cache using setup-python GHA

> Restored cache will not be used if the requirements.txt file is not updated for a long time and a newer version of the dependency is available which can lead to an increase in total build time.

The latter part is important because it implies that even with the cache, pip will still try to check if a newer version exists and that part can be flaky, i.e. https://github.com/pytorch/pytorch/actions/runs/3313764038/jobs/5472180293

This undesired behavior can be turned off by setting the advance option `check-latest` to false https://github.com/actions/setup-python/blob/main/docs/advanced-usage.md#check-latest-version. Per my understanding, this should tell pip install in these workflows to use the local cached copy of the package avoiding the need to query pypi every single time.

`check-latest` was added quite recently https://github.com/actions/setup-python/pull/406, so `actionlint-1.6.15` fails to recognize it. Thus, this PR also upgrades `actionlint` to the latest 1.6.21 to pass the linter check. Here is an example error from 1.6.15 from https://github.com/pytorch/pytorch/actions/runs/3315388073/jobs/5475918454:

```
>>> Lint for .github/workflows/lint.yml:

  Error (ACTIONLINT) [action]
    input "check-latest" is not defined in action "actions/setup-python@v4".
    available inputs are "architecture", "cache", "cache-dependency-path",
    "python-version", "python-version-file", "token"

         25  |        with:
         26  |          python-version: 3.8
         27  |          architecture: x64
    >>>  28  |          check-latest: false
         29  |          cache: pip
         30  |          cache-dependency-path: |
         31  |            **/.github/requirements-gha-cache.txt
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87621
Approved by: https://github.com/ZainRizvi
---
 .github/actions/setup-win/action.yml      |  3 ++-
 .github/workflows/lint.yml                | 23 ++++++++++++++---------
 .github/workflows/pr-labels.yml           |  3 ++-
 .github/workflows/revert.yml              |  3 ++-
 .github/workflows/trymerge.yml            |  3 ++-
 .github/workflows/tryrebase.yml           |  3 ++-
 .github/workflows/update-viablestrict.yml |  1 +
 tools/linter/adapters/s3_init_config.json |  8 ++++----
 8 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml
index d442343430c7d..4447e9203d504 100644
--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@@ -57,7 +57,8 @@ runs:
     - name: Setup Python3
       uses: actions/setup-python@v4
       with:
-        python-version: "3.x"
+        python-version: 3.x
+        check-latest: false
         cache: pip
         cache-dependency-path: |
           **/requirements.txt
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 669977b143a5e..17ffb239b15a7 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -25,16 +25,14 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/.github/requirements-gha-cache.txt
 
-      - name: Install lintrunner
-        uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
-        with:
-          timeout_minutes: 5
-          max_attempts: 3
-          command: pip install lintrunner==0.9.2
+      - name: Install requirements
+        run: |
+          pip install -r .github/requirements-gha-cache.txt --user
 
       - name: Initialize lint dependencies
         run: lintrunner init
@@ -87,6 +85,7 @@ jobs:
         with:
           python-version: 3.x
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/requirements.txt
@@ -151,6 +150,7 @@ jobs:
         with:
           python-version: 3.x
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/requirements.txt
@@ -242,7 +242,8 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
-          cache: 'pip'
+          check-latest: false
+          cache: pip
           cache-dependency-path: |
             **/.github/requirements-gha-cache.txt
 
@@ -275,6 +276,7 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/requirements.txt
@@ -320,21 +322,24 @@ jobs:
         with:
           python-version: 3.5
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
+            **/requirements.txt
       - name: Setup Python 3.8
         if: matrix.test_type != 'older_python_version'
         uses: actions/setup-python@v4
         with:
           python-version: 3.8
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
+            **/requirements.txt
       - name: Install torch
         if: matrix.test_type == 'with_torch'
         run: |
+          pip install -r requirements.txt
           # Doesn't really matter what torch version, we just need ANY torch installed
           pip install 'torch==1.*'
       - name: Run collect_env.py
diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml
index aa8cf4472b784..de6da1feec02a 100644
--- a/.github/workflows/pr-labels.yml
+++ b/.github/workflows/pr-labels.yml
@@ -17,7 +17,8 @@ jobs:
       - name: Set up python
         uses: actions/setup-python@v4
         with:
-          python-version: '3.10'
+          python-version: 3.10
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/.github/requirements-gha-cache.txt
diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml
index d207840f383b4..6468f3b8c804c 100644
--- a/.github/workflows/revert.yml
+++ b/.github/workflows/revert.yml
@@ -23,7 +23,8 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
-          cache: 'pip'
+          check-latest: false
+          cache: pip
       - run: pip install pyyaml==6.0
 
       - name: Setup committer id
diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml
index dff92303f5056..372b442163df0 100644
--- a/.github/workflows/trymerge.yml
+++ b/.github/workflows/trymerge.yml
@@ -22,7 +22,8 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: 3.8
-          cache: 'pip'
+          check-latest: false
+          cache: pip
           architecture: x64
       - run: pip install pyyaml==6.0
 
diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml
index fed9000c420e9..dd32069932678 100644
--- a/.github/workflows/tryrebase.yml
+++ b/.github/workflows/tryrebase.yml
@@ -22,7 +22,8 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
-          cache: 'pip'
+          check-latest: false
+          cache: pip
       - run: pip install pyyaml==6.0
 
       - name: Setup committer id
diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml
index 5901b1f4cda1b..4be70de020a3b 100644
--- a/.github/workflows/update-viablestrict.yml
+++ b/.github/workflows/update-viablestrict.yml
@@ -24,6 +24,7 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/.circleci/docker/requirements-ci.txt
diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json
index 0b0e87e8e26cf..d48f264f83d5d 100644
--- a/tools/linter/adapters/s3_init_config.json
+++ b/tools/linter/adapters/s3_init_config.json
@@ -27,12 +27,12 @@
     },
     "actionlint": {
         "Darwin": {
-            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Darwin_amd64/actionlint",
-            "hash": "e9a0e0b17e54cfefe7964b6aa1da8921b1f8f2318c31c0eb1a17ea3e8ab10db2"
+            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Darwin_amd64/actionlint",
+            "hash": "b354db83815384d3c3a07f68f44b30cb0a70899757a0d185d7322de9952e8813"
         },
         "Linux": {
-            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Linux_arm64/actionlint",
-            "hash": "d6b45ae67f29a2bf9ddd226071ddd8f158fdf2992e8515a06838e5fef90f3a2d"
+            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Linux_arm64/actionlint",
+            "hash": "025ac157db121b33971ef24af72d73d71cda3cb1e3a94795bb2708ef4032ca76"
         }
     }
 }

From 9451c581f3e05868bdc1d73a9512ad0d4f8795dc Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Wed, 26 Oct 2022 19:29:05 +0000
Subject: [PATCH 0189/1922] Fix typos under aten directory (#87754)

This PR fixes typos in `.md` files under aten directory

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87754
Approved by: https://github.com/kit1980
---
 aten/src/ATen/native/README.md     | 6 +++---
 aten/src/ATen/native/cpu/README.md | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md
index 01a25e3a978cc..c355423ea7501 100644
--- a/aten/src/ATen/native/README.md
+++ b/aten/src/ATen/native/README.md
@@ -445,7 +445,7 @@ By default, ATen code generation will generate device check,
 which will ensure all the tensor parameters passed to kernel are
 on the same device.
 
-However, in some cases, checking the device is unncessary, because,
+However, in some cases, checking the device is unnecessary, because,
 e.g., you call a function allows to work on multiple devices.
 In that case, code generation of the device check can be disabled by adding
 `device_check: NoCheck` to your function definition.
@@ -556,7 +556,7 @@ Here're steps to follow to decide the right dispatch keyword:
       Note: to support training, you're required to write a formula in
       derivatives.yaml since your backend implementations don't support autograd.
 
-    - Yes: you're likely calling other `at::` ops in the implemetation. Go to step 2.
+    - Yes: you're likely calling other `at::` ops in the implementation. Go to step 2.
 
 2. Think about training: does your kernel support autograd? [check autograd support](#will-your-function-be-automatically-differentiable)
     - Yes: in other words, you're providing a `CompositeImplicitAutograd` kernel which supports both inference and autograd.
@@ -610,7 +610,7 @@ It shows for a certain operator, what the computed dispatch table looks like aft
 4. TODO: AutogradCPUOrCUDA
 
 Note that in native_functions.yaml you can mix using backend keywords and alias keywords above for one op:
-  - direct registration to backend always has higher precendence than alias
+  - direct registration to backend always has higher precedence than alias
   - DO NOT provide multiple alias keywords to the same op: alias keywords have precedence `CompositeExplicitAutograd > CompositeImplicitAutograd`,
     e.g. adding both `CompositeImplicitAutograd` and `CompositeExplicitAutograd` kernels for one op will completely ignore `CompositeImplicitAutograd` kernel for
     both inference and training. Thus this will trigger an error when native_functions.yaml is parsed.
diff --git a/aten/src/ATen/native/cpu/README.md b/aten/src/ATen/native/cpu/README.md
index ab2f9d3d02609..2cf6fa0a13320 100644
--- a/aten/src/ATen/native/cpu/README.md
+++ b/aten/src/ATen/native/cpu/README.md
@@ -64,7 +64,7 @@ within 256bit & 512bits registers. vec defines various operators such as
 As an example `ReduceOpsKernel.cpp` implements a generic `kernel_` that reduces
 an entire array using a given associative binary operation such as +.
 
-More explicity, calling `kernel_` with template argument `std::plus` will cause
+More explicitly, calling `kernel_` with template argument `std::plus` will cause
 it to sum up the entire array into a single value.
 
 `ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
@@ -73,7 +73,7 @@ generic code, which will be compiled under multipled compilation settings.
 
 `../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
 a generic definition of `sumImplAll`. This function allows the user to reduce
-over a dimension or all dimensions. The appropiate capability is chosen at
+over a dimension or all dimensions. The appropriate capability is chosen at
 runtime using cpuinfo. If the current platform has AVX2, `sumImpl` will be set
 to `sumImplAll<CPUCapability::AVX2>`.
 

From 24ec5083be807bcf668ba85eb3543bc6eb35ef94 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Wed, 26 Oct 2022 04:34:38 +0000
Subject: [PATCH 0190/1922] Fix missing weight init and clean up helper
 (#87760)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87760
Approved by: https://github.com/davidberard98
---
 test/distributed/test_dynamo_distributed.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 43a4a23039175..36a459b6f00c3 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -12,6 +12,10 @@
 from torch._dynamo.utils import same
 from torch.nn.parallel import DistributedDataParallel as DDP
 
+def init_weights(m):
+    if isinstance(m, nn.Linear):
+        nn.init.xavier_uniform_(m.weight)
+        m.bias.data.fill_(0.01)
 
 class ToyModel(nn.Module):
     def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5):
@@ -19,7 +23,7 @@ def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5):
         self.net = nn.Sequential(
             *[nn.Linear(in_feat, hidden_feat), nn.ReLU()]
             + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()] * num_hidden
-            + [nn.Linear(hidden_feat, 5), nn.ReLU()]
+            + [nn.Linear(hidden_feat, out_feat), nn.ReLU()]
         )
 
     def forward(self, inputs):
@@ -63,9 +67,10 @@ def tearDownClass(cls):
         dist.destroy_process_group()
         super().tearDownClass()
 
-    def get_model(self):
-        m = ToyModel().to(self.device)
-        inputs = torch.randn(20, 10).to(self.device)
+    def get_model(self, bsz=20, in_feat=10, hidden_feat=5000, out_feat=5):
+        m = ToyModel(in_feat=in_feat, hidden_feat=hidden_feat, out_feat=out_feat).to(self.device)
+        m.apply(init_weights)
+        inputs = torch.rand(bsz, in_feat).to(self.device)
         outputs = m(inputs)
         return m, inputs, outputs
 
@@ -161,11 +166,8 @@ def test_no_split(self):
         introducing graph splits. (Based on model parmeters fitting in the bucket)
         """
         # DDP will always do a 'first bucket' with a really small size;  so only a tiny model will escape this
-        m = ToyModel(hidden_feat=5).to(self.device)
-        inputs = torch.randn(20, 10).to(self.device)
-        correct_outputs = m(inputs)
+        m, inputs, correct_outputs = self.get_model(hidden_feat=5)
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=250)
-
         check_splits_compiler = CheckSplitsCompiler()
 
         @torch._dynamo.optimize(check_splits_compiler.compile_fn)
@@ -233,7 +235,8 @@ def forward(self, x):
                 return self.seq(x)
 
         m = MyModule().to(self.device)
-        inputs = torch.randn((512, 512)).to(self.device)
+        m.apply(init_weights)
+        inputs = torch.rand((512, 512)).to(self.device)
         correct_outputs = m(inputs)
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=1)
 

From e9dba4d8e21aa5468c3f5453f0e265fe665ac6f1 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Wed, 26 Oct 2022 04:34:41 +0000
Subject: [PATCH 0191/1922] Add dynamo_optimize_ddp arg to dist bench (#87768)

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87768
Approved by: https://github.com/davidberard98
---
 benchmarks/dynamo/distributed.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py
index b4332556c7bb3..c2db15563348a 100644
--- a/benchmarks/dynamo/distributed.py
+++ b/benchmarks/dynamo/distributed.py
@@ -63,6 +63,8 @@ def move_tensor(maybe_tensor):
     if args.dynamo:
         if args.verbose:
             dynamo.config.verbose = True
+        if args.dynamo_optimize_ddp:
+            dynamo.config.optimize_ddp = True
 
         def print_compile(gm, ex):
             print(
@@ -129,6 +131,11 @@ def experiment(fn, key, world_size, results):
     parser.add_argument(
         "--world_size", type=int, default=2, help="Number of ranks/gpus for experiments"
     )
+    parser.add_argument(
+        "--dynamo_optimize_ddp",
+        action="store_true",
+        help="Enable dynamo's ddp optimizer",
+    )
     parser.add_argument(
         "--fsdp_checkpoint",
         action="store_true",

From eb5c5077fa0c63c867feeda72364d1f20a82ab1b Mon Sep 17 00:00:00 2001
From: jpvillam <Juan.Villamizar@amd.com>
Date: Wed, 26 Oct 2022 19:39:21 +0000
Subject: [PATCH 0192/1922] ROCm enable sparse_sampled_addmm (#86401)

Enables:
test_comprehensive_sparse_sampled_addmm_cuda_complex128
test_comprehensive_sparse_sampled_addmm_cuda_complex64
test_comprehensive_sparse_sampled_addmm_cuda_float32
test_comprehensive_sparse_sampled_addmm_cuda_float64
test_dispatch_meta_sparse_sampled_addmm_cuda_complex128
test_dispatch_meta_sparse_sampled_addmm_cuda_complex64
test_dispatch_meta_sparse_sampled_addmm_cuda_float32
test_dispatch_meta_sparse_sampled_addmm_cuda_float64
test_meta_sparse_sampled_addmm_cuda_complex128
test_meta_sparse_sampled_addmm_cuda_complex64
test_meta_sparse_sampled_addmm_cuda_float32
test_meta_sparse_sampled_addmm_cuda_float64

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86401
Approved by: https://github.com/ngimel
---
 aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp   | 2 +-
 torch/testing/_internal/common_cuda.py                | 7 +++++++
 torch/testing/_internal/common_methods_invocations.py | 6 ++++--
 torch/utils/hipify/cuda_to_hip_mappings.py            | 4 ++++
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
index 379640bad56b9..833fd41eb6a02 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
+++ b/aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp
@@ -1401,7 +1401,7 @@ void sampled_addmm_out_sparse_csr(
     const Scalar& beta,
     const Scalar& alpha,
     const at::sparse_csr::SparseCsrTensor& C) {
-#if !AT_USE_CUSPARSE_GENERIC_SDDMM()
+#if !(AT_USE_CUSPARSE_GENERIC_SDDMM() || AT_USE_HIPSPARSE_GENERIC_52_API())
   TORCH_CHECK(
       false,
       "Calling sampled_addmm with sparse GPU tensors requires compiling ",
diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py
index 1ee8e40ebd062..b226c7af58e51 100644
--- a/torch/testing/_internal/common_cuda.py
+++ b/torch/testing/_internal/common_cuda.py
@@ -173,6 +173,13 @@ def _get_torch_cuda_version():
     cuda_version = str(torch.version.cuda)
     return tuple(int(x) for x in cuda_version.split("."))
 
+def _get_torch_rocm_version():
+    if not TEST_WITH_ROCM:
+        return (0, 0)
+    rocm_version = str(torch.version.hip)
+    rocm_version = rocm_version.split("-")[0]    # ignore git sha
+    return tuple(int(x) for x in rocm_version.split("."))
+
 def _check_cusparse_generic_available():
     version = _get_torch_cuda_version()
     min_supported_version = (10, 1)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 00f454bdf454a..94c12f5bc93d0 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -26,7 +26,7 @@
      toleranceOverride, tol)
 from torch.testing._internal.common_cuda import (
     CUDA11OrLater, SM53OrLater, SM60OrLater, with_tf32_off, TEST_CUDNN,
-    _get_torch_cuda_version)
+    _get_torch_cuda_version, _get_torch_rocm_version)
 from torch.testing._internal.common_utils import (
     make_fullrank_matrices_with_distinct_singular_values,
     TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, TEST_SCIPY,
@@ -9392,7 +9392,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_autograd=True,
            sample_inputs_func=sample_inputs_sparse_sampled_addmm,
            decorators=[
-               skipCUDAIf(_get_torch_cuda_version() < (11, 3), "cusparseSDDMM was added in 11.2.1"),
+               skipCUDAIf(not ((_get_torch_cuda_version() >= (11, 3))
+                               or (_get_torch_rocm_version() >= (5, 2))),
+                          "cusparseSDDMM was added in 11.2.1"),
                skipCPUIfNoMklSparse, ],
            skips=(
                # NotImplementedError: Tensors of type SparseCsrTensorImpl do not have is_contiguous
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 3b3a047a7f9b5..33e14e9e0572e 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -7920,6 +7920,9 @@
         ("cusparseSpGEMM_createDescr", ("hipsparseSpGEMM_createDescr", CONV_MATH_FUNC, API_SPARSE)),
         ("cusparseDnMatSetStridedBatch", ("hipsparseDnMatSetStridedBatch", CONV_MATH_FUNC, API_SPARSE)),
         ("cusparseSpGEMM_copy", ("hipsparseSpGEMM_copy", CONV_MATH_FUNC, API_SPARSE)),
+        ("cusparseSDDMM_bufferSize", ("hipsparseSDDMM_bufferSize", CONV_MATH_FUNC, API_SPARSE)),
+        ("cusparseSDDMM_preprocess", ("hipsparseSDDMM_preprocess", CONV_MATH_FUNC, API_SPARSE)),
+        ("cusparseSDDMM", ("hipsparseSDDMM", CONV_MATH_FUNC, API_SPARSE)),
         ("cusparseSpGEMM_compute", ("hipsparseSpGEMM_compute", CONV_MATH_FUNC, API_SPARSE)),
         ("cusparseSpGEMM_workEstimation", ("hipsparseSpGEMM_workEstimation", CONV_MATH_FUNC, API_SPARSE)),
         ("cusparseSpMatGetSize", ("hipsparseSpMatGetSize", CONV_MATH_FUNC, API_SPARSE)),
@@ -7947,6 +7950,7 @@
         ("CUSPARSE_COOMV_ALG", ("HIPSPARSE_COOMV_ALG", CONV_NUMERIC_LITERAL, API_SPARSE)),
         ("CUSPARSE_CSRMM_ALG1", ("HIPSPARSE_CSRMM_ALG1", CONV_NUMERIC_LITERAL, API_SPARSE)),
         ("CUSPARSE_SPGEMM_DEFAULT", ("HIPSPARSE_SPGEMM_DEFAULT", CONV_NUMERIC_LITERAL, API_SPARSE)),
+        ("CUSPARSE_SDDMM_ALG_DEFAULT", ("HIPSPARSE_SDDMM_ALG_DEFAULT", CONV_NUMERIC_LITERAL, API_SPARSE)),
         (
             "CUSPARSE_STATUS_SUCCESS",
             ("HIPSPARSE_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_SPARSE),

From b1a7e8c4dc0a3893955f2add7cc265e4f15c254a Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 26 Oct 2022 19:40:51 +0000
Subject: [PATCH 0193/1922] Revert "Set check-latest to false when setup python
 and pip cache in CI (#87621)"

This reverts commit 4080b1db284fd531654bcb2984a7fe0ff3b310cd.

Reverted https://github.com/pytorch/pytorch/pull/87621 on behalf of https://github.com/huydhn due to Somehow setup-python treats Python 3.10 as Python 3.1 in pr-label.yml. I missed this signal because this is only run at push
---
 .github/actions/setup-win/action.yml      |  3 +--
 .github/workflows/lint.yml                | 23 +++++++++--------------
 .github/workflows/pr-labels.yml           |  3 +--
 .github/workflows/revert.yml              |  3 +--
 .github/workflows/trymerge.yml            |  3 +--
 .github/workflows/tryrebase.yml           |  3 +--
 .github/workflows/update-viablestrict.yml |  1 -
 tools/linter/adapters/s3_init_config.json |  8 ++++----
 8 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml
index 4447e9203d504..d442343430c7d 100644
--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@@ -57,8 +57,7 @@ runs:
     - name: Setup Python3
       uses: actions/setup-python@v4
       with:
-        python-version: 3.x
-        check-latest: false
+        python-version: "3.x"
         cache: pip
         cache-dependency-path: |
           **/requirements.txt
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 17ffb239b15a7..669977b143a5e 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -25,14 +25,16 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
-          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/.github/requirements-gha-cache.txt
 
-      - name: Install requirements
-        run: |
-          pip install -r .github/requirements-gha-cache.txt --user
+      - name: Install lintrunner
+        uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          command: pip install lintrunner==0.9.2
 
       - name: Initialize lint dependencies
         run: lintrunner init
@@ -85,7 +87,6 @@ jobs:
         with:
           python-version: 3.x
           architecture: x64
-          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/requirements.txt
@@ -150,7 +151,6 @@ jobs:
         with:
           python-version: 3.x
           architecture: x64
-          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/requirements.txt
@@ -242,8 +242,7 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
-          check-latest: false
-          cache: pip
+          cache: 'pip'
           cache-dependency-path: |
             **/.github/requirements-gha-cache.txt
 
@@ -276,7 +275,6 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
-          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/requirements.txt
@@ -322,24 +320,21 @@ jobs:
         with:
           python-version: 3.5
           architecture: x64
-          check-latest: false
           cache: pip
           cache-dependency-path: |
-            **/requirements.txt
+            **/.github/requirements-gha-cache.txt
       - name: Setup Python 3.8
         if: matrix.test_type != 'older_python_version'
         uses: actions/setup-python@v4
         with:
           python-version: 3.8
           architecture: x64
-          check-latest: false
           cache: pip
           cache-dependency-path: |
-            **/requirements.txt
+            **/.github/requirements-gha-cache.txt
       - name: Install torch
         if: matrix.test_type == 'with_torch'
         run: |
-          pip install -r requirements.txt
           # Doesn't really matter what torch version, we just need ANY torch installed
           pip install 'torch==1.*'
       - name: Run collect_env.py
diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml
index de6da1feec02a..aa8cf4472b784 100644
--- a/.github/workflows/pr-labels.yml
+++ b/.github/workflows/pr-labels.yml
@@ -17,8 +17,7 @@ jobs:
       - name: Set up python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.10
-          check-latest: false
+          python-version: '3.10'
           cache: pip
           cache-dependency-path: |
             **/.github/requirements-gha-cache.txt
diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml
index 6468f3b8c804c..d207840f383b4 100644
--- a/.github/workflows/revert.yml
+++ b/.github/workflows/revert.yml
@@ -23,8 +23,7 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
-          check-latest: false
-          cache: pip
+          cache: 'pip'
       - run: pip install pyyaml==6.0
 
       - name: Setup committer id
diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml
index 372b442163df0..dff92303f5056 100644
--- a/.github/workflows/trymerge.yml
+++ b/.github/workflows/trymerge.yml
@@ -22,8 +22,7 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: 3.8
-          check-latest: false
-          cache: pip
+          cache: 'pip'
           architecture: x64
       - run: pip install pyyaml==6.0
 
diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml
index dd32069932678..fed9000c420e9 100644
--- a/.github/workflows/tryrebase.yml
+++ b/.github/workflows/tryrebase.yml
@@ -22,8 +22,7 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
-          check-latest: false
-          cache: pip
+          cache: 'pip'
       - run: pip install pyyaml==6.0
 
       - name: Setup committer id
diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml
index 4be70de020a3b..5901b1f4cda1b 100644
--- a/.github/workflows/update-viablestrict.yml
+++ b/.github/workflows/update-viablestrict.yml
@@ -24,7 +24,6 @@ jobs:
         with:
           python-version: 3.8
           architecture: x64
-          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/.circleci/docker/requirements-ci.txt
diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json
index d48f264f83d5d..0b0e87e8e26cf 100644
--- a/tools/linter/adapters/s3_init_config.json
+++ b/tools/linter/adapters/s3_init_config.json
@@ -27,12 +27,12 @@
     },
     "actionlint": {
         "Darwin": {
-            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Darwin_amd64/actionlint",
-            "hash": "b354db83815384d3c3a07f68f44b30cb0a70899757a0d185d7322de9952e8813"
+            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Darwin_amd64/actionlint",
+            "hash": "e9a0e0b17e54cfefe7964b6aa1da8921b1f8f2318c31c0eb1a17ea3e8ab10db2"
         },
         "Linux": {
-            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Linux_arm64/actionlint",
-            "hash": "025ac157db121b33971ef24af72d73d71cda3cb1e3a94795bb2708ef4032ca76"
+            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Linux_arm64/actionlint",
+            "hash": "d6b45ae67f29a2bf9ddd226071ddd8f158fdf2992e8515a06838e5fef90f3a2d"
         }
     }
 }

From bcb94062d5a801cf1e6f18039b9e0a348b55ec5a Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 26 Oct 2022 20:08:29 +0000
Subject: [PATCH 0194/1922] Set check-latest to false when setup python and pip
 cache in CI (#87621)

I missed the fine print in https://github.com/actions/setup-python/blob/main/README.md#caching-packages-dependencies when setting up the cache using setup-python GHA

> Restored cache will not be used if the requirements.txt file is not updated for a long time and a newer version of the dependency is available which can lead to an increase in total build time.

The latter part is important because it implies that even with the cache, pip will still try to check if a newer version exists and that part can be flaky, i.e. https://github.com/pytorch/pytorch/actions/runs/3313764038/jobs/5472180293

This undesired behavior can be turned off by setting the advance option `check-latest` to false https://github.com/actions/setup-python/blob/main/docs/advanced-usage.md#check-latest-version. Per my understanding, this should tell pip install in these workflows to use the local cached copy of the package avoiding the need to query pypi every single time.

`check-latest` was added quite recently https://github.com/actions/setup-python/pull/406, so `actionlint-1.6.15` fails to recognize it. Thus, this PR also upgrades `actionlint` to the latest 1.6.21 to pass the linter check. Here is an example error from 1.6.15 from https://github.com/pytorch/pytorch/actions/runs/3315388073/jobs/5475918454:

```
>>> Lint for .github/workflows/lint.yml:

  Error (ACTIONLINT) [action]
    input "check-latest" is not defined in action "actions/setup-python@v4".
    available inputs are "architecture", "cache", "cache-dependency-path",
    "python-version", "python-version-file", "token"

         25  |        with:
         26  |          python-version: 3.8
         27  |          architecture: x64
    >>>  28  |          check-latest: false
         29  |          cache: pip
         30  |          cache-dependency-path: |
         31  |            **/.github/requirements-gha-cache.txt
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87621
Approved by: https://github.com/ZainRizvi
---
 .github/actions/setup-win/action.yml      |  3 +-
 .github/workflows/lint.yml                | 37 +++++++++++++----------
 .github/workflows/pr-labels.yml           |  1 +
 .github/workflows/revert.yml              |  5 +--
 .github/workflows/trymerge.yml            |  5 +--
 .github/workflows/tryrebase.yml           |  5 +--
 .github/workflows/update-viablestrict.yml |  3 +-
 tools/linter/adapters/s3_init_config.json |  8 ++---
 8 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml
index d442343430c7d..6dc1a1b6c6fe2 100644
--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@@ -57,7 +57,8 @@ runs:
     - name: Setup Python3
       uses: actions/setup-python@v4
       with:
-        python-version: "3.x"
+        python-version: '3.x'
+        check-latest: false
         cache: pip
         cache-dependency-path: |
           **/requirements.txt
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 669977b143a5e..cff22d72d4d24 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -23,18 +23,16 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: '3.8'
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/.github/requirements-gha-cache.txt
 
-      - name: Install lintrunner
-        uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
-        with:
-          timeout_minutes: 5
-          max_attempts: 3
-          command: pip install lintrunner==0.9.2
+      - name: Install requirements
+        run: |
+          pip install -r .github/requirements-gha-cache.txt --user
 
       - name: Initialize lint dependencies
         run: lintrunner init
@@ -85,8 +83,9 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.x
+          python-version: '3.x'
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/requirements.txt
@@ -149,8 +148,9 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.x
+          python-version: '3.x'
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/requirements.txt
@@ -240,9 +240,10 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: '3.8'
           architecture: x64
-          cache: 'pip'
+          check-latest: false
+          cache: pip
           cache-dependency-path: |
             **/.github/requirements-gha-cache.txt
 
@@ -273,8 +274,9 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: '3.8'
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/requirements.txt
@@ -318,23 +320,26 @@ jobs:
         if: matrix.test_type == 'older_python_version'
         uses: actions/setup-python@v4
         with:
-          python-version: 3.5
+          python-version: '3.5'
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
+            **/requirements.txt
       - name: Setup Python 3.8
         if: matrix.test_type != 'older_python_version'
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: '3.8'
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
+            **/requirements.txt
       - name: Install torch
         if: matrix.test_type == 'with_torch'
         run: |
+          pip install -r requirements.txt
           # Doesn't really matter what torch version, we just need ANY torch installed
           pip install 'torch==1.*'
       - name: Run collect_env.py
diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml
index aa8cf4472b784..9afa0e721ac60 100644
--- a/.github/workflows/pr-labels.yml
+++ b/.github/workflows/pr-labels.yml
@@ -18,6 +18,7 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: '3.10'
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/.github/requirements-gha-cache.txt
diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml
index d207840f383b4..2a2fff27044ea 100644
--- a/.github/workflows/revert.yml
+++ b/.github/workflows/revert.yml
@@ -21,9 +21,10 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: '3.8'
           architecture: x64
-          cache: 'pip'
+          check-latest: false
+          cache: pip
       - run: pip install pyyaml==6.0
 
       - name: Setup committer id
diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml
index dff92303f5056..3d1d92967d885 100644
--- a/.github/workflows/trymerge.yml
+++ b/.github/workflows/trymerge.yml
@@ -21,8 +21,9 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
-          cache: 'pip'
+          python-version: '3.8'
+          check-latest: false
+          cache: pip
           architecture: x64
       - run: pip install pyyaml==6.0
 
diff --git a/.github/workflows/tryrebase.yml b/.github/workflows/tryrebase.yml
index fed9000c420e9..53434310c3d00 100644
--- a/.github/workflows/tryrebase.yml
+++ b/.github/workflows/tryrebase.yml
@@ -20,9 +20,10 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: '3.8'
           architecture: x64
-          cache: 'pip'
+          check-latest: false
+          cache: pip
       - run: pip install pyyaml==6.0
 
       - name: Setup committer id
diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml
index 5901b1f4cda1b..12bf4e271f927 100644
--- a/.github/workflows/update-viablestrict.yml
+++ b/.github/workflows/update-viablestrict.yml
@@ -22,8 +22,9 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: '3.8'
           architecture: x64
+          check-latest: false
           cache: pip
           cache-dependency-path: |
             **/.circleci/docker/requirements-ci.txt
diff --git a/tools/linter/adapters/s3_init_config.json b/tools/linter/adapters/s3_init_config.json
index 0b0e87e8e26cf..d48f264f83d5d 100644
--- a/tools/linter/adapters/s3_init_config.json
+++ b/tools/linter/adapters/s3_init_config.json
@@ -27,12 +27,12 @@
     },
     "actionlint": {
         "Darwin": {
-            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Darwin_amd64/actionlint",
-            "hash": "e9a0e0b17e54cfefe7964b6aa1da8921b1f8f2318c31c0eb1a17ea3e8ab10db2"
+            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Darwin_amd64/actionlint",
+            "hash": "b354db83815384d3c3a07f68f44b30cb0a70899757a0d185d7322de9952e8813"
         },
         "Linux": {
-            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.15/Linux_arm64/actionlint",
-            "hash": "d6b45ae67f29a2bf9ddd226071ddd8f158fdf2992e8515a06838e5fef90f3a2d"
+            "download_url": "https://oss-clang-format.s3.us-east-2.amazonaws.com/actionlint/1.6.21/Linux_arm64/actionlint",
+            "hash": "025ac157db121b33971ef24af72d73d71cda3cb1e3a94795bb2708ef4032ca76"
         }
     }
 }

From 427e641d694e032365b7001d0a49ce721e038d65 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Wed, 26 Oct 2022 17:38:05 +0000
Subject: [PATCH 0195/1922] Fix meta for index_add and index_put (#87775)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87775
Approved by: https://github.com/ezyang, https://github.com/ngimel
---
 test/test_meta.py            | 64 ++++++++++++++++++++++++++++++++++++
 torch/_meta_registrations.py |  2 +-
 torch/_refs/__init__.py      |  5 ++-
 3 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index 23e7025140138..2431042e01728 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -1,5 +1,6 @@
 # Owner(s): ["module: primTorch"]
 
+import itertools
 import torch
 import os
 from enum import Enum
@@ -20,6 +21,7 @@
 from torch.testing._internal.common_device_type import (
     ops,
     instantiate_device_type_tests,
+    onlyCUDA,
 )
 from torch.testing._internal.common_methods_invocations import op_db
 from torchgen.utils import YamlLoader
@@ -187,6 +189,8 @@ def test_tensor_outlives_converter(self):
 
 CHECK_STRIDES = {
     torch.Tensor.__getitem__,
+    torch.ops.aten.index_put,
+    torch.ops.aten.index_add,
 }
 
 def should_check_strides(func):
@@ -1023,6 +1027,66 @@ def test_fill_alias_relationship(self):
         r2 = torch.ops.aten.fill(inps, 1.0)
         self.assertNotEqual(id(inps), id(r2))
 
+    def get_stride_variants(self, t):
+        results = []
+
+        # contiguous
+        results.append(t)
+
+        # transposed
+        if t.ndim > 1:
+            perm = list(reversed(range(t.ndim)))
+            transposed = torch.empty(t.shape[::-1], device=t.device, dtype=t.dtype).permute(perm).copy_(t)
+            results.append(transposed)
+
+        # nondense
+        nondense = torch.repeat_interleave(t, 2, dim=-1)[..., ::2]
+        results.append(nondense)
+
+        return results
+
+    @onlyCUDA
+    def test_index_add_stride(self, device):
+        to_meta = MetaConverter()
+
+        x = torch.ones(5, 3, device=device)
+        t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float, device=device)
+        index = torch.tensor([0, 4, 2], device=device)
+
+        xs = self.get_stride_variants(x)
+        ts = self.get_stride_variants(t)
+
+        for x, t in itertools.product(xs, ts):
+            args = (x, 0, index, t)
+            meta_args = tree_map(to_meta, args)
+
+            r = torch.ops.aten.index_add(*args)
+            meta_r = torch.ops.aten.index_add(*meta_args)
+
+            self.assertEqual(r.size(), meta_r.size())
+            self.assertEqual(r.stride(), meta_r.stride())
+
+    @onlyCUDA
+    def test_index_put_stride(self, device):
+        to_meta = MetaConverter()
+
+        x = torch.rand(5, 5, device=device)
+        t = torch.rand(5, device=device)
+        index = torch.tensor([True, False, True, True, False], device=device)
+
+        xs = self.get_stride_variants(x)
+        ts = self.get_stride_variants(t)
+
+        for x, t in itertools.product(xs, ts):
+            args = (x, [index], t)
+            meta_args = tree_map(to_meta, args)
+
+            r = torch.ops.aten.index_put(*args)
+            meta_r = torch.ops.aten.index_put(*meta_args)
+
+            self.assertEqual(r.size(), meta_r.size())
+            self.assertEqual(r.stride(), meta_r.stride())
+
     def test_map_location_deserialize(self):
         import io
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 0af6813ce4a00..cde0ac96a2d84 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1105,7 +1105,7 @@ def meta_relu_(self):
 
 @register_meta(aten.index_put.default)
 def meta_index_put(self, indices, values, accumulate=False):
-    return self.new_empty(self.size())
+    return torch.empty_like(self)
 
 
 @register_meta(aten.masked_fill_.Scalar)
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 44b75bb92df48..5cee1c9a684bb 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -3280,7 +3280,10 @@ def index_add(
     *,
     alpha: NumberType = 1,
 ):
-    return x.clone().index_add_(dim, index, tensor, alpha=alpha)  # type: ignore[arg-type]
+    # index_add always returns a new contiguous tensor
+    return x.clone(memory_format=torch.contiguous_format).index_add_(
+        dim, index, tensor, alpha=alpha  # type: ignore[arg-type]
+    )
 
 
 @register_decomposition(torch.ops.aten.index_select)

From 62b83f2428ad64e96129a040f0a761672e26e38f Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchu@microsoft.com>
Date: Wed, 26 Oct 2022 20:42:06 +0000
Subject: [PATCH 0196/1922] [ONNX] Deprecate operators.py (#87798)

Deprecate `torch.onnx.operators` because it's only for backwards compatibility
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87798
Approved by: https://github.com/BowenBao
---
 torch/onnx/operators.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/torch/onnx/operators.py b/torch/onnx/operators.py
index e5f12444c3559..07f89b2e41a60 100644
--- a/torch/onnx/operators.py
+++ b/torch/onnx/operators.py
@@ -9,12 +9,15 @@
 """
 
 import torch
-import torch.onnx
+from torch.onnx import _deprecation
 
 
+# 180-day deprecation period
+@_deprecation.deprecated("1.14", "1.16", "use torch._shape_as_tensor")
 def shape_as_tensor(x):
     return torch._shape_as_tensor(x)
 
 
+@_deprecation.deprecated("1.14", "1.16", "use torch._reshape_from_tensor")
 def reshape_from_tensor_shape(x, shape):
     return torch._reshape_from_tensor(x, shape)

From e1c85fa44efe828564f0b332b0765f59cad6545f Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 26 Oct 2022 13:59:07 +0000
Subject: [PATCH 0197/1922] Clean up CPU test in test_torchinductor.py for
 fbcode (#87783)

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87783
Approved by: https://github.com/bertmaher
---
 test/inductor/test_torchinductor.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index a675fc476672b..8e8b371c2780e 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -19,6 +19,7 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import (
+    IS_FBCODE,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
     TestCase as TorchTestCase,
@@ -54,6 +55,9 @@
 
 HAS_CPU = False
 try:
+    if IS_FBCODE:
+        raise torch._inductor.exc.CppCompileError
+
     from subprocess import CalledProcessError
 
     from torch._inductor.codecache import CppCodeCache
@@ -410,13 +414,6 @@ def populate(cls):
                 cls.gen_template(name1, name2)
 
 
-class SweepInputsCpuTest(SweepInputs2, TestCase):
-    gen = InputGen(10, "cpu")
-
-
-SweepInputsCpuTest.populate()
-
-
 class TestIndexingSimplification(TorchTestCase):
     def test_indexing_simplification(self):
         sizevars = SizeVarAllocator()
@@ -4027,6 +4024,11 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
 
 if HAS_CPU:
 
+    class SweepInputsCpuTest(SweepInputs2, TestCase):
+        gen = InputGen(10, "cpu")
+
+    SweepInputsCpuTest.populate()
+
     class CpuTests(TestCase):
         common = check_model
         device = "cpu"

From 3d6c0ba18bb3ee3261ceffb16281dbccccf834db Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Wed, 26 Oct 2022 20:54:25 +0000
Subject: [PATCH 0198/1922] Update XLA hash (#87818)

This is a re-creation of https://github.com/pytorch/pytorch/pull/87808 so we don't have to wait.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87818
Approved by: https://github.com/clee2000
---
 .github/ci_commit_pins/xla.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index e75cb6ffbe979..86063843174d2 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-1812b1d19477707ed027e7b597ff23a46176dab8
+79131e9d31290744afdf3d85118251863e16ab0e

From bde87a1d2e25c2b40922a044b6e70f343fc4a632 Mon Sep 17 00:00:00 2001
From: Zafar <cc.rafaz@zafar.cc>
Date: Wed, 26 Oct 2022 20:55:10 +0000
Subject: [PATCH 0199/1922] [ao] Fixing tests for block pruning shapes (#87326)

The current unittests were only checking the tensors whose shapes were already multiples of the block size. That caused some hidden bugs to creep in. Specifically, for the shapes that would require padding for the mask/data, the sparsifier would try to apply shape-mismatching tensors onto each other. This caused segfaults as well as silent failures.

This makes minor adjustments to the code to make sure the masks and data shapes are aligned, as well as fixing the tests to catch this.

Test Plan:

```python
python test/test_ao_sparsity.py
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87326
Approved by: https://github.com/jcaip
---
 test/ao/sparsity/test_sparsifier.py                   | 8 +++++---
 torch/ao/pruning/sparsifier/weight_norm_sparsifier.py | 9 +++++----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/test/ao/sparsity/test_sparsifier.py b/test/ao/sparsity/test_sparsifier.py
index 415679337ff2e..512c58b188367 100644
--- a/test/ao/sparsity/test_sparsifier.py
+++ b/test/ao/sparsity/test_sparsifier.py
@@ -18,14 +18,16 @@ class Model(nn.Module):
     def __init__(self):
         super().__init__()
         self.seq = nn.Sequential(
-            nn.Linear(16, 16)
+            nn.Linear(37, 39)
         )
-        self.linear = nn.Linear(16, 16)
-        self.head = nn.Linear(16, 4)
+        self.linear = nn.Linear(39, 33)
+        self.head = nn.Linear(33, 13)
 
     def forward(self, x):
         x = self.seq(x)
+        x = torch.relu(x)
         x = self.linear(x)
+        x = torch.relu(x)
         x = self.head(x)
         return x
 
diff --git a/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py b/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py
index 8a66280cc852d..2ba2584616e21 100644
--- a/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py
+++ b/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py
@@ -99,7 +99,7 @@ def _make_tensor_mask(self, data, input_shape, sparsity_level, sparse_block_shap
         dw = (block_w - w % block_w) % block_w
 
         if mask is None:
-            mask = torch.ones(h, w, device=data.device)
+            mask = torch.ones(h + dh, w + dw, device=data.device)
 
         if sparsity_level >= 1.0:
             mask.data = torch.zeros_like(mask)
@@ -141,14 +141,15 @@ def _make_block_mask(self, data, sparse_block_shape, zeros_per_block, mask=None)
 
         In this context the `zeros_per_block` describes the number of zeroed-out elements within a patch.
         """
-        if mask is None:
-            mask = torch.ones(data.shape, device=data.device)
         h, w = data.shape[-2:]
         block_h, block_w = sparse_block_shape
         dh = (block_h - h % block_h) % block_h
         dw = (block_w - w % block_w) % block_w
         values_per_block = reduce((lambda x, y: x * y), sparse_block_shape)
 
+        if mask is None:
+            mask = torch.ones((h + dh, w + dw), device=data.device)
+
         if values_per_block == zeros_per_block:
             # Everything should be sparsified
             mask.data = torch.zeros_like(mask)
@@ -168,7 +169,7 @@ def _make_block_mask(self, data, sparse_block_shape, zeros_per_block, mask=None)
             dim=1, indices=sorted_idx, output_shape=padded_data.shape, block_shape=sparse_block_shape, mask=mask_reshape
         )
 
-        mask.data = mask_reshape.squeeze().reshape(mask.shape)[:h, :w].contiguous()
+        mask.data = mask_reshape.squeeze().reshape(mask.shape).contiguous()
         return mask
 
     def update_mask(self, module, tensor_name, sparsity_level, sparse_block_shape,

From fa1b988f9c28fa295abc3a48d5306fa056425198 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 26 Oct 2022 21:01:09 +0000
Subject: [PATCH 0200/1922] Revert "Disable linux-bionic-py3_7-clang8-xla-test
 (#87737)"

This reverts commit 21f7e7d040c646b4ce7f4a4e973da97660462bdc.

Reverted https://github.com/pytorch/pytorch/pull/87737 on behalf of https://github.com/kit1980 due to Re-enable XLA tests after https://github.com/pytorch/pytorch/pull/87818
---
 .github/workflows/pull.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index cc25bfc1326d1..849e70dc9f29d 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -219,7 +219,6 @@ jobs:
         ]}
 
   linux-bionic-py3_7-clang8-xla-test:
-    if: false
     name: linux-bionic-py3_7-clang8-xla
     uses: ./.github/workflows/_linux-test.yml
     needs: linux-bionic-py3_7-clang8-xla-build

From 94185b9b7c7dd066e5878e25baa6470a819073e9 Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Wed, 26 Oct 2022 13:34:34 -0400
Subject: [PATCH 0201/1922] Expose API for backward execution order (#87507)

In this PR:
- graph_task stores graph roots on construction so that we can later traverse through the graph
- before the nodes are returned, they needed to be converted from raw_ptr to shared_ptr, and this should be OK because the graph is guaranteed to be alive

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87507
Approved by: https://github.com/albanD
---
 test/test_autograd.py                         | 122 ++++++++++++++++++
 torch/csrc/Module.cpp                         |  26 ++++
 torch/csrc/autograd/engine.cpp                |  93 +++++++++++--
 torch/csrc/autograd/function.h                |   3 +
 torch/csrc/autograd/graph_task.h              |   4 +
 .../autograd/engine/dist_engine.cpp           |   8 ++
 6 files changed, 245 insertions(+), 11 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 03cc78dc242fb..43f31ae63ed32 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -3073,6 +3073,128 @@ def hook(_):
 
         self.assertEqual(torch._C._current_graph_task_id(), -1)
 
+    def test_current_graph_task_execution_order(self):
+        predicted = [None]
+
+        def hook(_):
+            predicted[0] = torch._C._current_graph_task_execution_order()
+
+        def names(nodes):
+            return ", ".join([node.name().split(' ')[-1] for node in nodes]) + '\n'
+
+        def grad_fns(*tensors):
+            # or grad accumulator
+            out = []
+            for t in tensors:
+                if t.requires_grad and t.grad_fn is None:
+                    out.append(t.clone().grad_fn.next_functions[0][0])
+                else:
+                    out.append(t.grad_fn)
+            return out
+
+        actual = []
+
+        def register_logging_hooks(*tensors):
+            # register hooks that log the order in which they are called
+            def get_hook(i):
+                def hook(t_):
+                    actual.append(tensors[i])
+                return hook
+
+            for i, t in enumerate(tensors):
+                t.register_hook(get_hook(i))
+
+        # Basic example: single path
+        t = torch.tensor(1., requires_grad=True).clone().sin().exp()
+        t.register_hook(hook)
+        with torch.autograd.set_multithreading_enabled(False):
+            t.backward()
+        self.assertExpectedInline(names(predicted[0]), """\
+ExpBackward0, SinBackward0, CloneBackward0, torch::autograd::AccumulateGrad
+""")
+
+        # We don't exactly follow sequence_nr order
+        a = torch.tensor(1., requires_grad=True)
+        b = torch.tensor(2., requires_grad=True)
+        c = b.sin()
+        d = a.cos()
+        out = c * d
+        register_logging_hooks(a, b, c, d, out)
+        out.register_hook(hook)
+        with torch.autograd.set_multithreading_enabled(False):
+            out.backward()
+        self.assertEqual(predicted[0], grad_fns(*actual))
+        actual = []
+
+        # Multiple roots are also OK
+        a = torch.tensor(1., requires_grad=True)
+        b = a * 2
+        out = b.sin()
+        out2 = b.cos()
+        out3 = b.cos()
+        register_logging_hooks(a, b, out, out2, out3)
+        out3.register_hook(hook)
+        with torch.autograd.set_multithreading_enabled(False):
+            torch.autograd.grad((out, out3, out2), inputs=(a,))
+        self.assertExpectedInline(names(predicted[0]), """\
+CosBackward0, CosBackward0, SinBackward0, MulBackward0, torch::autograd::AccumulateGrad
+""")
+        # TODO: Uncomment after update to hooks behavior
+        # self.assertEqual(predicted[0], grad_fns(*actual))
+        actual = []
+
+        # Case where next node is nullptr
+        a = torch.tensor(1., requires_grad=True)
+        b = a * 2
+        out = b.sin()
+        register_logging_hooks(a, b, out)
+        out.register_hook(hook)
+        with torch.autograd.set_multithreading_enabled(False):
+            out.backward()
+        self.assertEqual(predicted[0], grad_fns(*actual))
+        actual = []
+
+        # Case where two `inputs` on the same path
+        a = torch.tensor(1., requires_grad=True)
+        b = a * 2
+        out = b.sin()
+        register_logging_hooks(a, b, out)
+        out.register_hook(hook)
+        with torch.autograd.set_multithreading_enabled(False):
+            torch.autograd.grad((out,), inputs=(a, b,))
+        self.assertEqual(names(predicted[0]), """\
+SinBackward0, MulBackward0, torch::autograd::AccumulateGrad
+""")
+        # TODO: Uncomment after update to hooks behavior
+        # self.assertEqual(predicted[0], grad_fns(*actual))
+        actual = []
+
+        # Case where `inputs` specifies a subgraph
+        a = torch.tensor(1., requires_grad=True)
+        b = torch.tensor(1., requires_grad=True)
+        c = a * b
+        out = c.sin()
+        register_logging_hooks(a, b, c, out)
+        out.register_hook(hook)
+        with torch.autograd.set_multithreading_enabled(False):
+            torch.autograd.grad((out,), inputs=(a,))
+        self.assertEqual(names(predicted[0]), """\
+SinBackward0, MulBackward0, torch::autograd::AccumulateGrad
+""")
+        # TODO: Uncomment after update to hooks behavior
+        # self.assertEqual(predicted[0], grad_fns(*actual))
+        actual = []
+
+        # Errors when not called in a backward
+        with self.assertRaisesRegex(RuntimeError, "should only be called during the backward pass"):
+            torch._C._current_graph_task_execution_order()
+
+        # Errors when context manager not enabled
+        t = torch.tensor(1., requires_grad=True).clone().sin().exp()
+        t.register_hook(hook)
+        with self.assertRaisesRegex(RuntimeError, "expects the current backward to be executed with multithreading disabled"):
+            t.backward()
+
     def test_profiler(self):
         x = torch.randn(10, 10)
 
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index e41f0305a2e11..98589a31eaced 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -813,6 +813,28 @@ PyObject* THPModule_willEngineExecuteNode(PyObject* _unused, PyObject* arg) {
   END_HANDLE_TH_ERRORS
 }
 
+PyObject* THPModule_getCurrentGraphTaskExecutionOrder(
+    PyObject* _unused,
+    PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  std::vector<torch::autograd::Node*> nodes =
+      torch::autograd::get_current_graph_task_execution_order();
+  TORCH_CHECK(
+      nodes.size(),
+      "_current_graph_task_execution_order should only be called during the backward pass");
+  auto list = THPObjectPtr(PyList_New(nodes.size()));
+  if (!list)
+    return nullptr;
+  for (const auto i : c10::irange(nodes.size())) {
+    // This node is guaranteed to be alive since the backward is still running
+    PyObject* pyobj_node =
+        torch::autograd::functionToPyObject(nodes[i]->getptr());
+    PyList_SET_ITEM(list.get(), i, pyobj_node);
+  }
+  return list.release();
+  END_HANDLE_TH_ERRORS
+}
+
 PyObject* THPModule_getCurrentGraphTaskId(PyObject* _unused, PyObject* noargs) {
   HANDLE_TH_ERRORS
   return THPUtils_packInt64(torch::autograd::get_current_graph_task_id());
@@ -1019,6 +1041,10 @@ static PyMethodDef TorchMethods[] = {
      THPModule_willEngineExecuteNode,
      METH_O,
      nullptr},
+    {"_current_graph_task_execution_order",
+     THPModule_getCurrentGraphTaskExecutionOrder,
+     METH_NOARGS,
+     nullptr},
     {"_current_graph_task_id",
      THPModule_getCurrentGraphTaskId,
      METH_NOARGS,
diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index ca9ae4e443df5..0a2298efc1282 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -398,6 +398,66 @@ void add_node_to_current_graph_task_exec_info(Node* fn) {
   current_graph_task->exec_info_[fn].needed_ = true;
 }
 
+// NB: The engine itself does not use the outputs of this function.
+std::vector<Node*> get_current_graph_task_execution_order() {
+  std::shared_ptr<GraphTask> task = current_graph_task;
+  if (!task) {
+    return {};
+  }
+
+  // We could potentially check if there is only a single device here
+  // but explicitly require this context doens't seem bad either
+  TORCH_CHECK(
+      !c10::AutogradState::get_tls_state().get_multithreading_enabled(),
+      "get_current_graph_task_execution_order expects the current backward to be "
+      "executed with multithreading disabled, e.g. by running:\n\n"
+      ">>> with torch.autograd.set_multithreading_enabled(False):\n"
+      "...     torch.autograd.grad(...)\n");
+
+  const bool check_exec_info = !task->exec_info_.empty();
+  std::vector<Node*> out{};
+  std::unordered_set<Node*> seen{};
+
+  auto compare_seq_nr = [](Node* n1, Node* n2) {
+    return n1->sequence_nr() < n2->sequence_nr();
+  };
+  std::priority_queue<Node*, std::vector<Node*>, decltype(compare_seq_nr)> heap(
+      compare_seq_nr);
+
+  for (Node* ptr : task->graph_roots_) {
+    heap.push(ptr);
+  }
+
+  // Implementation notes:
+  // - Don't need to count dependencies because we have sequence_nr
+  // - Don't need to check topological_nr because we have exec_info
+  while (!heap.empty()) {
+    Node* fn = heap.top();
+    heap.pop();
+
+    const bool was_inserted = seen.insert(fn).second;
+    if (!was_inserted) {
+      continue;
+    }
+
+    out.push_back(fn);
+    for (const auto& edge : fn->next_edges()) {
+      Node* next_ptr = edge.function.get();
+      if (!next_ptr) {
+        continue;
+      }
+      if (check_exec_info) {
+        auto it = task->exec_info_.find(next_ptr);
+        if (it == task->exec_info_.end() || !it->second.should_execute()) {
+          continue;
+        }
+      }
+      heap.push(next_ptr);
+    }
+  }
+  return out;
+}
+
 // NOTE: graph_tasks do not necessarily form a stack. Imagine this
 // case:
 //
@@ -1050,7 +1110,7 @@ auto Engine::compute_dependencies(
 }
 
 auto Engine::execute(
-    const edge_list& roots,
+    const edge_list& root_edges,
     const variable_list& inputs,
     bool keep_graph,
     bool create_graph,
@@ -1058,9 +1118,9 @@ auto Engine::execute(
     const edge_list& outputs) -> variable_list {
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
   validate_outputs(
-      roots, const_cast<variable_list&>(inputs), [](const std::string& msg) {
-        return msg;
-      });
+      root_edges,
+      const_cast<variable_list&>(inputs),
+      [](const std::string& msg) { return msg; });
   if (accumulate_grad && create_graph) {
     TORCH_WARN_ONCE(
         "Using backward() with create_graph=True will create a reference cycle "
@@ -1083,17 +1143,25 @@ auto Engine::execute(
   init_local_ready_queue();
   bool not_reentrant_backward_call = worker_device == NO_DEVICE;
 
+  // Store root nodes so we can traverse through the graph later
+  // e.g., for get_current_graph_task_execution_order
+  c10::SmallVector<Node*, 4> temp_roots{root_edges.size()};
+  for (const auto i : c10::irange(root_edges.size())) {
+    temp_roots[i] = root_edges[i].function.get();
+  }
+
   auto graph_task = std::make_shared<GraphTask>(
       /* keep_graph */ keep_graph,
       /* create_graph */ create_graph,
       /* depth */ not_reentrant_backward_call ? 0 : total_depth + 1,
-      /* cpu_ready_queue */ local_ready_queue);
+      /* cpu_ready_queue */ local_ready_queue,
+      /* graph_roots */ std::move(temp_roots));
 
   // If we receive a single root, skip creating extra root node
-  bool skip_dummy_node = roots.size() == 1;
+  bool skip_dummy_node = root_edges.size() == 1;
   auto graph_root = skip_dummy_node
-      ? roots.at(0).function
-      : std::make_shared<GraphRoot>(roots, inputs);
+      ? root_edges.at(0).function
+      : std::make_shared<GraphRoot>(root_edges, inputs);
 
   auto min_topo_nr = compute_min_topological_nr(outputs);
   // Now compute the dependencies for all executable functions
@@ -1106,14 +1174,17 @@ auto Engine::execute(
 
   // Queue the root
   if (skip_dummy_node) {
-    InputBuffer input_buffer(roots.at(0).function->num_inputs());
+    InputBuffer input_buffer(root_edges.at(0).function->num_inputs());
     auto input = inputs.at(0);
 
     const auto input_stream = InputMetadata(input).stream();
     const auto opt_next_stream =
-        roots.at(0).function->stream(c10::DeviceType::CUDA);
+        root_edges.at(0).function->stream(c10::DeviceType::CUDA);
     input_buffer.add(
-        roots.at(0).input_nr, std::move(input), input_stream, opt_next_stream);
+        root_edges.at(0).input_nr,
+        std::move(input),
+        input_stream,
+        opt_next_stream);
 
     execute_with_graph_task(graph_task, graph_root, std::move(input_buffer));
   } else {
diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h
index aa82e3ad2c77c..bb5f4b1eaad09 100644
--- a/torch/csrc/autograd/function.h
+++ b/torch/csrc/autograd/function.h
@@ -143,6 +143,9 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
   Node& operator=(Node&& other) = delete;
   virtual ~Node() = default;
 
+  std::shared_ptr<Node> getptr() {
+    return shared_from_this();
+  }
   /// Evaluates the function on the given inputs and returns the result of the
   /// function call.
   variable_list operator()(variable_list&& inputs) {
diff --git a/torch/csrc/autograd/graph_task.h b/torch/csrc/autograd/graph_task.h
index 8eb122313d0a0..4efbc905fed37 100644
--- a/torch/csrc/autograd/graph_task.h
+++ b/torch/csrc/autograd/graph_task.h
@@ -37,6 +37,7 @@ struct GraphTask : std::enable_shared_from_this<GraphTask> {
 
   // Records the nodes that are in the graph
   std::unordered_set<Node*> nodes_in_graph_;
+  c10::SmallVector<Node*, 4> graph_roots_;
   // Note [Exec info]
   // Exec info is created for each GraphTask, which allows filtering paths on
   // the graph that are not needed. It has a bit complicated semantics. If it's
@@ -164,8 +165,10 @@ struct GraphTask : std::enable_shared_from_this<GraphTask> {
       bool grad_mode,
       int reentrant_depth,
       std::shared_ptr<ReadyQueue> cpu_ready_queue,
+      c10::SmallVector<Node*, 4> graph_roots,
       bool exit_on_error = false)
       : keep_graph_(keep_graph),
+        graph_roots_(std::move(graph_roots)),
         owner_(NO_DEVICE),
         reentrant_depth_(reentrant_depth),
         exit_on_error_(exit_on_error),
@@ -198,6 +201,7 @@ get_current_graph_task_exec_info();
 TORCH_API const std::unordered_set<Node*>*
 get_current_graph_task_nodes_in_graph();
 TORCH_API bool get_current_graph_task_keep_graph();
+TORCH_API std::vector<Node*> get_current_graph_task_execution_order();
 TORCH_API int get_current_graph_task_id();
 void add_node_to_current_graph_task_exec_info(Node* fn);
 
diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.cpp b/torch/csrc/distributed/autograd/engine/dist_engine.cpp
index 2da315644845c..06c6927e4c467 100644
--- a/torch/csrc/distributed/autograd/engine/dist_engine.cpp
+++ b/torch/csrc/distributed/autograd/engine/dist_engine.cpp
@@ -185,6 +185,13 @@ void DistEngine::computeDependencies(
     bool retainGraph) {
   TORCH_INTERNAL_ASSERT(graphRoot, "graphRoot is null!");
 
+  // Store root nodes so we can traverse through the graph later
+  // e.g., for get_current_graph_task_execution_order
+  c10::SmallVector<Node*, 4> temp_roots{rootEdges.size()};
+  for (const auto i : c10::irange(rootEdges.size())) {
+    temp_roots[i] = rootEdges[i].function.get();
+  }
+
   // Build the graph task and graph root.
   // NOTE: we don't need to build and pass a cpu_ready_queue to GraphTask
   // as we use execute_graph_task_until_ready_queue_empty, which will build
@@ -194,6 +201,7 @@ void DistEngine::computeDependencies(
       /* create_graph */ false,
       /* depth */ 0,
       /* cpu_ready_queue */ global_cpu_ready_queue_,
+      /* graph_roots */ temp_roots,
       /* exit_on_error */ true);
 
   // Run BFS to traverse the graph locally. The roots of the graph are

From a4e7fdb112024668185b1dc01f9995a3618cf381 Mon Sep 17 00:00:00 2001
From: Cameron Voisey <cameron.voisey@tngtech.com>
Date: Wed, 26 Oct 2022 21:34:13 +0000
Subject: [PATCH 0202/1922] Simplify installation instruction in contributing
 file (#87460)

Simplification of one of the installation instructions in CONTRIBUTING.md that I found tricky to parse at first.

Also adds a link to the "Make no-op build fast" section to make it easier to navigate to.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87460
Approved by: https://github.com/ngimel
---
 CONTRIBUTING.md | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 05e98c3b9a673..c43d64c4610d6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -118,21 +118,9 @@ git submodule sync --recursive
 git submodule update --init --recursive --jobs 0
 ```
 
-If you want to have no-op incremental rebuilds (which are fast), see the section below titled "Make no-op build fast."
+If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.
 
-3. Follow  the instructions for [installing PyTorch from source](https://github.com/pytorch/pytorch#from-source), except when it's time to install PyTorch instead of invoking `setup.py install` you'll want to call `setup.py develop` instead:
-
-Specifically, the change you have to make is to replace
-
-```bash
-python setup.py install
-```
-
-with
-
-```bash
-python setup.py develop
-```
+3. Follow the instructions for [installing PyTorch from source](https://github.com/pytorch/pytorch#from-source), but instead of installing PyTorch via `python setup.py install`, use `python setup.py develop`.
 
 This mode will symlink the Python files from the current local source
 tree into the Python install.  This way when you modify a Python file, you

From 966df362cdbd79b3eaa50fc5a356368d7e7978ae Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Wed, 26 Oct 2022 21:51:13 +0000
Subject: [PATCH 0203/1922] [CI] Delete `nnpack` installation from conda
 (#87813)

Not sure why it was there to begin with and I really hope none of our CI depend on the package that was last updated 5 years ago, see https://anaconda.org/killeent/nnpack

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87813
Approved by: https://github.com/atalman, https://github.com/kit1980, https://github.com/ZainRizvi
---
 .circleci/docker/common/install_conda.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh
index 713aad4729110..84f9538ce1248 100755
--- a/.circleci/docker/common/install_conda.sh
+++ b/.circleci/docker/common/install_conda.sh
@@ -104,9 +104,6 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
     conda_install magma-cuda$(TMP=${CUDA_VERSION/./};echo ${TMP%.*[0-9]}) -c pytorch
   fi
 
-  # TODO: This isn't working atm
-  conda_install nnpack -c killeent
-
   # Install some other packages, including those needed for Python test reporting
   pip_install -r /opt/conda/requirements-ci.txt
 

From 3eab0f406c455a58c87ea40726812e06ea2c62ce Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Wed, 26 Oct 2022 04:34:41 +0000
Subject: [PATCH 0204/1922] Enable graph_split_inductor test as it runs now
 (#87762)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87762
Approved by: https://github.com/davidberard98
---
 test/distributed/test_dynamo_distributed.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 36a459b6f00c3..4e8c6ffa981ac 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -141,8 +141,6 @@ def opt_fn(inputs):
         self.assertTrue(same(correct_outputs, opt_outputs))
         self.assertEqual(check_splits_compiler.compiler_called, 3)
 
-    # hangs/crashes with inductor currently
-    @unittest.skip("hangs/crashes with inductor currently")
     @patch.object(config, "optimize_ddp", True)
     def test_graph_split_inductor(self):
         """

From a5e255725f1ee102bb3ce900a5fd5bc2d7e96ac2 Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Wed, 26 Oct 2022 22:10:10 +0000
Subject: [PATCH 0205/1922] print stderr for ghstack rebase (#87795)

current output tends to be empty on failure, which makes it hard to debug
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87795
Approved by: https://github.com/huydhn, https://github.com/ZainRizvi
---
 .github/scripts/tryrebase.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py
index 1b69f653e525a..2e8987e9faaa1 100755
--- a/.github/scripts/tryrebase.py
+++ b/.github/scripts/tryrebase.py
@@ -69,6 +69,7 @@ def rebase_ghstack_onto(pr: GitHubPR, repo: GitRepo, onto_branch: str, dry_run:
         push_result = ghstack_result.stdout.decode("utf-8")
         print(push_result)
         if ghstack_result.returncode != 0:
+            print(ghstack_result.stderr.decode("utf-8"))
             raise Exception(f"\n```{push_result}```")
         # The contents of a successful push result should look like:
         # Summary of changes (ghstack 0.6.0)

From a070f7b0601a4a2fdf6de78ca4d580e15b12bc35 Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Wed, 26 Oct 2022 22:41:19 +0000
Subject: [PATCH 0206/1922] [LTC] Remove tensor.storage_ (#87645)

Summary:
Since LTC now supports functionalization, we don't need to fake a storage to support is_alias_of anymore. Let's remove it.

Test Plan:
 ./build/bin/test_lazy --gtest_filter=LazyOpsTest.IsAliasOf

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87645
Approved by: https://github.com/JackCaoG, https://github.com/bdhirsh
---
 torch/csrc/lazy/core/tensor.cpp                | 11 ++---------
 torch/csrc/lazy/core/tensor.h                  | 15 ---------------
 torch/csrc/lazy/core/tensor_impl.h             |  9 ---------
 torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp |  4 +---
 4 files changed, 3 insertions(+), 36 deletions(-)

diff --git a/torch/csrc/lazy/core/tensor.cpp b/torch/csrc/lazy/core/tensor.cpp
index bf673a72361d3..0a114d0e71179 100644
--- a/torch/csrc/lazy/core/tensor.cpp
+++ b/torch/csrc/lazy/core/tensor.cpp
@@ -83,12 +83,7 @@ LazyTensor::LazyTensor(
     const BackendDevice& device)
     : LazyTensor(std::make_shared<Data>(std::move(view), device)) {}
 
-LazyTensor::LazyTensor(std::shared_ptr<Data> data)
-    : data_(std::move(data)),
-      storage_(c10::Storage(
-          {},
-          0,
-          c10::DataPtr(nullptr, backendDeviceToAtenDevice(data_->device)))) {}
+LazyTensor::LazyTensor(std::shared_ptr<Data> data) : data_(std::move(data)) {}
 
 LazyTensor::Data* LazyTensor::data() const {
   TORCH_CHECK(data_ != nullptr, "Trying to access a null cursor");
@@ -353,9 +348,7 @@ std::shared_ptr<LazyView> LazyTensor::CreateView(ViewInfo view_info) const {
 }
 
 LazyTensorPtr LazyTensor::CreateViewTensor(ViewInfo view_info) const {
-  auto new_tensor = Create(CreateView(std::move(view_info)), GetDevice());
-  new_tensor->storage_ = Storage();
-  return new_tensor;
+  return Create(CreateView(std::move(view_info)), GetDevice());
 }
 
 at::Tensor LazyTensor::ToTensor(bool detached) {
diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h
index 12cfdd2827d74..052b84b4a60cc 100644
--- a/torch/csrc/lazy/core/tensor.h
+++ b/torch/csrc/lazy/core/tensor.h
@@ -143,15 +143,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
   // Applies the queue of operations in preparation for using the data.
   void ApplyPendingGraph();
 
-  const c10::Storage& Storage() const {
-    return storage_;
-  }
-  // This is currently only used by outlier view ops such as expand that
-  // don't go through CreateViewTensor to support Tensor.is_alias_of.
-  void SetStorage(const c10::Storage& storage) {
-    storage_ = storage;
-  }
-
  private:
   LazyTensor(const at::Tensor& tensor, const BackendDevice& device);
   LazyTensor(Value ir_value, const BackendDevice& device);
@@ -196,12 +187,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
   static int64_t GetNextTensorId();
 
   std::shared_ptr<Data> data_;
-  // Temporarily used to suport Tensor.is_alias_of().
-  // This is a fake storage that doesn't store anything.
-  // Instead it serves as a marker to mark LazyTensors that
-  // points to the same storage, and thus alias of each other.
-  // FIXME(alanwaketan): Remove this once we have functionalization (bdhirsh).
-  c10::Storage storage_;
 };
 
 // Utils to convert at::Tensor to LazyTensor, and vice versa.
diff --git a/torch/csrc/lazy/core/tensor_impl.h b/torch/csrc/lazy/core/tensor_impl.h
index de1191a3de3e2..710230605cc1f 100644
--- a/torch/csrc/lazy/core/tensor_impl.h
+++ b/torch/csrc/lazy/core/tensor_impl.h
@@ -49,15 +49,6 @@ class TORCH_API LTCTensorImpl final : public c10::TensorImpl {
   c10::SymIntArrayRef sym_sizes_custom() const override;
   c10::SymIntArrayRef sym_strides_custom() const override;
 
-#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
-  const at::Storage& storage() const override {
-    return tensor_->Storage();
-  }
-  bool has_storage() const override {
-    return tensor_->Storage();
-  }
-#endif // C10_DISABLE_TENSORIMPL_EXTENSIBILITY
-
  private:
   void setup_size_properties();
 
diff --git a/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp b/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp
index 534a9bca130db..3f5882f471f5d 100644
--- a/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp
+++ b/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp
@@ -105,14 +105,12 @@ torch::lazy::LazyTensorPtr expand(
     const torch::lazy::LazyTensorPtr& input,
     std::vector<int64_t> size) {
   auto input_shape = input->shape();
-  auto output = torch::lazy::LazyTensor::Create(
+  return torch::lazy::LazyTensor::Create(
       torch::lazy::MakeExpand(
           input->GetIrValue(),
           GetExpandDimensions(input_shape.Get(), std::move(size)),
           /*is_scalar_expand=*/false),
       input->GetDevice());
-  output->SetStorage(input->Storage());
-  return output;
 }
 
 void fill_(torch::lazy::LazyTensorPtr& input, const at::Scalar& value) {

From 2a26a514299e315a9cde9aa486588697b661dedb Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Wed, 26 Oct 2022 22:42:39 +0000
Subject: [PATCH 0207/1922] Add logging for nested tensor usage tracking
 (#87632)

# Summary
Add logging message so that we can track nested tensor adoption.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87632
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/NestedTensorImpl.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp
index 94c9c8d073a94..c0199da124c36 100644
--- a/aten/src/ATen/NestedTensorImpl.cpp
+++ b/aten/src/ATen/NestedTensorImpl.cpp
@@ -7,6 +7,7 @@
 #include <c10/core/DispatchKeySet.h>
 #include <c10/util/Exception.h>
 #include <c10/core/TensorImpl.h>
+#include <c10/util/Logging.h>
 
 #include <numeric>
 #include <functional>
@@ -172,6 +173,7 @@ NestedTensorImpl::NestedTensorImpl(
       nested_stride_tensor_(std::move(nested_stride_tensor)),
       storage_offsets_(std::move(offsets)),
       opt_sizes_(construct_opt_sizes(nested_size_tensor_)) {
+  C10_LOG_API_USAGE_ONCE("Using torch.NestedTensor");
   TORCH_WARN_ONCE(
       "The PyTorch API of nested tensors is in prototype stage and will change "
       "in the near future.");

From f3bc3a57ce7650be933b056f48194b5a5a96c6b5 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Wed, 26 Oct 2022 23:16:29 +0000
Subject: [PATCH 0208/1922] [BE] Don't build CUDA-10.2 docker images (#87819)

As CUDA-10.2 should not longer be used in CI/CD

Test Plan: ` grep cuda10.2 .github -R|grep -v mock`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87819
Approved by: https://github.com/kit1980, https://github.com/ZainRizvi
---
 .github/workflows/docker-builds.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 572d8146ebe51..dd59d44e8a9d3 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -33,7 +33,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - docker-image-name: pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7
           - docker-image-name: pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9
           - docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
           - docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
@@ -42,7 +41,6 @@ jobs:
           - docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
           - docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12
           - docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12
-          - docker-image-name: pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7
           - docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7
           - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c
           - docker-image-name: pytorch-linux-xenial-py3-clang5-asan

From e9fbb69aa25bc532ac4e0066a076d0b47c155d5f Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Wed, 26 Oct 2022 07:36:02 +0200
Subject: [PATCH 0209/1922] [primTorch] Check `error_regex` in
 `test_python_ref_errors` (#86987)

cc @ezyang @mruberry @ngimel @Lezcano @fdrocha
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86987
Approved by: https://github.com/lezcano, https://github.com/mruberry
---
 test/test_ops.py                              |  3 +-
 .../_internal/common_methods_invocations.py   | 45 +++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 0e5b6f1d607dd..1d20151c20e89 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -477,8 +477,7 @@ def _to_tensormeta(x):
         for ei in error_inputs:
             si = ei.sample_input
             meta_sample = si.transform(_to_tensormeta)
-            # TODO: match strings
-            with self.assertRaisesRegex(ei.error_type, ""):
+            with self.assertRaisesRegex(ei.error_type, ei.error_regex):
                 op(meta_sample.input, *meta_sample.args, **meta_sample.kwargs)
 
     # Tests that the function produces the same result when called with
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 94c12f5bc93d0..900c0987d2f2c 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -16570,6 +16570,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         aliases=('moveaxis',),
         torch_opinfo_name="movedim",
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     PythonRefInfo(
         "_refs.bucketize",
@@ -16765,6 +16768,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ElementwiseUnaryPythonRefInfo(
         "_refs.neg",
         torch_opinfo_name="neg",
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.positive",
@@ -16977,10 +16983,16 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     PythonRefInfo(
         "_refs.nn.functional.poisson_nll_loss",
         torch_opinfo_name="nn.functional.poisson_nll_loss",
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.prelu",
         torch_opinfo_name="nn.functional.prelu",
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.relu",
@@ -17699,6 +17711,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.dsplit",
         torch_opinfo_name="dsplit",
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     PythonRefInfo(
         "_refs.diag",
@@ -17724,6 +17739,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.dstack",
         torch_opinfo_name="dstack",
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     PythonRefInfo(
         "_refs.expand",
@@ -17808,6 +17826,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.reshape",
         torch_opinfo_name="reshape",
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     PythonRefInfo(
         "_refs.reshape_as",
@@ -17856,6 +17877,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.vsplit",
         torch_opinfo_name="vsplit",
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     PythonRefInfo(
         "_refs.transpose",
@@ -17889,6 +17913,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.view",
         torch_opinfo_name="view",
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     PythonRefInfo(
         "_refs.view_as",
@@ -17913,6 +17940,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.unbind",
         torch_opinfo_name="unbind",
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     #
     # Reduction Reference OpInfos
@@ -17924,10 +17954,16 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ReductionPythonRefInfo(
         "_refs.amax",
         torch_opinfo_name="amax",
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     ReductionPythonRefInfo(
         "_refs.amin",
         torch_opinfo_name="amin",
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     ReductionPythonRefInfo(
         "_refs.any",
@@ -17937,6 +17973,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.mean",
         torch_opinfo_name="mean",
         supports_out=True,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     ReductionPythonRefInfo(
         "_refs.std",
@@ -18215,12 +18254,18 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.masked_fill",
         torch_opinfo_name="masked_fill",
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
+        ),
     ),
     PythonRefInfo(
         "_refs.where",
         torch_opinfo_name="where",
         op=lambda self, condition, other: refs.where(condition, self, other),
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors', device_type='cuda'),
+        ),
     ),
     PythonRefInfo(
         "_refs.index_select",

From ef1b1733e94adcf486041c38627eb43f9f6fb019 Mon Sep 17 00:00:00 2001
From: wchen61 <183351030@qq.com>
Date: Wed, 26 Oct 2022 23:44:13 +0000
Subject: [PATCH 0210/1922] Synchronize before change cuda stream (#82050)
 (#82056)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/82050

Need synchronize before change cuda stream

### Description
<!-- What did you change and why was it needed? -->

### Issue
<!-- Link to Issue ticket or RFP -->

### Testing
<!-- How did you test your change? -->

Pull Request resolved: https://github.com/pytorch/pytorch/pull/82056
Approved by: https://github.com/ngimel
---
 torch/testing/_internal/common_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 5da1ffefaba91..2f85b8af1d81f 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1430,6 +1430,7 @@ def __enter__(self):
         for d in range(torch.cuda.device_count()):
             self.beforeStreams.append(torch.cuda.current_stream(d))
             deviceStream = torch.cuda.Stream(device=d)
+            self.beforeStreams[-1].synchronize()
             torch._C._cuda_setStream(deviceStream._cdata)
         torch._C._cuda_setDevice(beforeDevice)
 

From ee9ad1320491aca3d826e7b06eeeae04cd9c5636 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Thu, 27 Oct 2022 00:01:10 +0000
Subject: [PATCH 0211/1922] Fix typos under .github directory (#87828)

This PR fixes typos in `.md` files under .github directory

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87828
Approved by: https://github.com/clee2000
---
 .github/requirements/README.md | 2 +-
 .github/scripts/README.md      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/requirements/README.md b/.github/requirements/README.md
index 654bb04558b9b..9093b92c62d29 100644
--- a/.github/requirements/README.md
+++ b/.github/requirements/README.md
@@ -4,7 +4,7 @@ At the moment, the installation of conda and pip dependencies happens at
 different places in the CI depending at the whim of different
 developers, which makes it very challenging to handle issues like
 network flakiness or upstream dependency failures gracefully. So, this
-center directory is created to gradually include all the conda enviroment
+center directory is created to gradually include all the conda environment
 and pip requirement files that are used to setup CI jobs. Not only it
 gives a clear picture of all the dependencies required by different CI
 jobs, but it also allows them to be cached properly to improve CI
diff --git a/.github/scripts/README.md b/.github/scripts/README.md
index 22099c3732ea5..73bec509c2c41 100644
--- a/.github/scripts/README.md
+++ b/.github/scripts/README.md
@@ -36,7 +36,7 @@ New generated binary workflows can be added in the `.github/scripts/generate_ci_
 examples from that script in order to add the workflow to the stream that is relevant to what you particularly
 care about.
 
-Different parameters can be used to acheive different goals, i.e. running jobs on a cron, running only on trunk, etc.
+Different parameters can be used to achieve different goals, i.e. running jobs on a cron, running only on trunk, etc.
 
 #### ciflow (trunk)
 

From f10e85b2dcb7982c37d957e82c85961e5ff0e3c0 Mon Sep 17 00:00:00 2001
From: Valentin Andrei <vandrei@meta.com>
Date: Thu, 27 Oct 2022 00:18:16 +0000
Subject: [PATCH 0212/1922] [pytorch] Layer norm backward speed gain with warp
 shuffles (#87814)

Summary:
Improved native layer norm backward performance.

Rewrote `GammaBetaBackwardCUDAKernel` to use shared memory only for the reduction step, but not for loading `mean` and `rstd`. The previous implementation used only `threadIdx.x = 0` to load `mean` and `rstd` into shared memory, and then all threads would access the values in order to do loop unrolling. This approached increased register usage and decreased occupancy, without much benefit from using shared memory (this is because the values were already cached in L1). The new implementation is simpler and register usage is smaller, thus occupancy is better.

Added another implementation called `GammaBetaBackwardCUDAKernel_32x32` which is only for shapes dividing exactly to a (32 x 32) block. This permits using warp shuffles for speeding up loading `mean` and `rstd` as well as for the final reduction stage. The effective bandwidth of this implementation is equal to STREAM Triad.

Observed that we can get additional benefit if we lower the threshold for calling `GammaBetaBackwardSimpleCUDAKernel` (simple col-wise reduction implementation) from `512` to `128`.

Test Plan:
Wrote a simple CUDA app that calls the previous implementation of `GammaBetaBackwardCUDAKernel` and the current one, using FP32 values and compares the results. The epsilon value we used for FP comparison is 0.00001 for the weight and 0.0001 for the bias.
Ran the benchmark for various sizes A100 GPU and got the results below. Almost all sizes show good speedup.

```
Size (32, 32); Mismatches: dg = 0 db = 0 out of 32. reference = 0.0073 (ms); optimized = 0.0071 (ms); bw_opt = 1.14 GB/s; speedup = 2.68%
Size (64, 32); Mismatches: dg = 0 db = 0 out of 32. reference = 0.0107 (ms); optimized = 0.0107 (ms); bw_opt = 1.50 GB/s; speedup = 0.22%
Size (256, 128); Mismatches: dg = 0 db = 0 out of 128. reference = 0.0323 (ms); optimized = 0.0075 (ms); bw_opt = 32.89 GB/s; speedup = 330.16%
Size (512, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0103 (ms); optimized = 0.0089 (ms); bw_opt = 440.54 GB/s; speedup = 15.82%
Size (1024, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0197 (ms); optimized = 0.0136 (ms); bw_opt = 1151.44 GB/s; speedup = 44.91%
Size (2048, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0416 (ms); optimized = 0.0283 (ms); bw_opt = 1105.31 GB/s; speedup = 47.01%
Size (4096, 16384); Mismatches: dg = 0 db = 0 out of 16384. reference = 0.4420 (ms); optimized = 0.3915 (ms); bw_opt = 1277.58 GB/s; speedup = 12.90%
Size (70000, 64); Mismatches: dg = 0 db = 0 out of 64. reference = 0.5908 (ms); optimized = 0.6850 (ms); bw_opt = 49.49 GB/s; speedup = -13.75%
Size (131072, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 1.1961 (ms); optimized = 0.9234 (ms); bw_opt = 542.54 GB/s; speedup = 29.53%
Size (1000, 520); Mismatches: dg = 0 db = 0 out of 520. reference = 0.0132 (ms); optimized = 0.0113 (ms); bw_opt = 343.83 GB/s; speedup = 16.88%
Size (4005, 4005); Mismatches: dg = 0 db = 0 out of 4005. reference = 0.1441 (ms); optimized = 0.1054 (ms); bw_opt = 1134.36 GB/s; speedup = 36.71%
Size (10000, 1000); Mismatches: dg = 0 db = 0 out of 1000. reference = 0.1293 (ms); optimized = 0.1248 (ms); bw_opt = 597.71 GB/s; speedup = 3.63%
Size (1024, 10000); Mismatches: dg = 0 db = 0 out of 10000. reference = 0.0738 (ms); optimized = 0.0735 (ms); bw_opt = 1039.40 GB/s; speedup = 0.45%
Size (8192, 4096); Mismatches: dg = 0 db = 0 out of 4096. reference = 0.2673 (ms); optimized = 0.2223 (ms); bw_opt = 1125.01 GB/s; speedup = 20.25%
Size (10000, 10000); Mismatches: dg = 0 db = 0 out of 10000. reference = 0.7331 (ms); optimized = 0.8940 (ms); bw_opt = 833.54 GB/s; speedup = -18.00%
Size (3072, 10000); Mismatches: dg = 0 db = 0 out of 10000. reference = 0.2087 (ms); optimized = 0.2364 (ms); bw_opt = 968.64 GB/s; speedup = -11.71%
Size (6144, 10000); Mismatches: dg = 0 db = 0 out of 10000. reference = 0.4197 (ms); optimized = 0.5118 (ms); bw_opt = 894.63 GB/s; speedup = -18.00%
Size (1024, 20000); Mismatches: dg = 0 db = 0 out of 20000. reference = 0.1480 (ms); optimized = 0.1297 (ms); bw_opt = 1177.68 GB/s; speedup = 14.12%
Size (1024, 20000); Mismatches: dg = 0 db = 0 out of 20000. reference = 0.1483 (ms); optimized = 0.1278 (ms); bw_opt = 1195.26 GB/s; speedup = 16.04%
Size (512, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0104 (ms); optimized = 0.0091 (ms); bw_opt = 646.72 GB/s; speedup = 14.44%
Size (512, 6144); Mismatches: dg = 0 db = 0 out of 6144. reference = 0.0219 (ms); optimized = 0.0156 (ms); bw_opt = 1506.30 GB/s; speedup = 40.52%
Size (512, 10240); Mismatches: dg = 0 db = 0 out of 10240. reference = 0.0424 (ms); optimized = 0.0370 (ms); bw_opt = 1057.84 GB/s; speedup = 14.63%
Size (1000, 1000); Mismatches: dg = 0 db = 0 out of 1000. reference = 0.0139 (ms); optimized = 0.0119 (ms); bw_opt = 627.51 GB/s; speedup = 16.83%
Size (2000, 2000); Mismatches: dg = 0 db = 0 out of 2000. reference = 0.0421 (ms); optimized = 0.0412 (ms); bw_opt = 724.10 GB/s; speedup = 2.20%
Size (10240, 10240); Mismatches: dg = 0 db = 0 out of 10240. reference = 0.7210 (ms); optimized = 0.6098 (ms); bw_opt = 1281.40 GB/s; speedup = 18.24%
Size (384, 128); Mismatches: dg = 0 db = 0 out of 128. reference = 0.0449 (ms); optimized = 0.0089 (ms); bw_opt = 41.50 GB/s; speedup = 403.48%
Size (2048, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0208 (ms); optimized = 0.0169 (ms); bw_opt = 925.70 GB/s; speedup = 23.13%
Size (267, 513); Mismatches: dg = 0 db = 0 out of 513. reference = 0.0342 (ms); optimized = 0.0090 (ms); bw_opt = 114.18 GB/s; speedup = 280.64%
Size (67, 123479); Mismatches: dg = 0 db = 0 out of 123479. reference = 0.0562 (ms); optimized = 0.0552 (ms); bw_opt = 1133.46 GB/s; speedup = 1.81%
Size (1024, 123479); Mismatches: dg = 0 db = 0 out of 123479. reference = 0.8573 (ms); optimized = 0.9245 (ms); bw_opt = 1020.02 GB/s; speedup = -7.27%
Size (2048, 66679); Mismatches: dg = 0 db = 0 out of 66679. reference = 0.8778 (ms); optimized = 0.8590 (ms); bw_opt = 1185.05 GB/s; speedup = 2.19%
Size (200, 256); Mismatches: dg = 0 db = 0 out of 256. reference = 0.0215 (ms); optimized = 0.0066 (ms); bw_opt = 58.49 GB/s; speedup = 226.81%
Size (1000, 256); Mismatches: dg = 0 db = 0 out of 256. reference = 0.0109 (ms); optimized = 0.0092 (ms); bw_opt = 208.27 GB/s; speedup = 18.65%
Size (6000, 256); Mismatches: dg = 0 db = 0 out of 256. reference = 0.0394 (ms); optimized = 0.0301 (ms); bw_opt = 381.90 GB/s; speedup = 30.98%
Size (6272, 256); Mismatches: dg = 0 db = 0 out of 256. reference = 0.0403 (ms); optimized = 0.0300 (ms); bw_opt = 400.48 GB/s; speedup = 34.34%
Size (200, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 0.0218 (ms); optimized = 0.0066 (ms); bw_opt = 116.33 GB/s; speedup = 229.96%
Size (1000, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 0.0110 (ms); optimized = 0.0094 (ms); bw_opt = 407.29 GB/s; speedup = 17.26%
Size (6000, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 0.0535 (ms); optimized = 0.0594 (ms); bw_opt = 386.05 GB/s; speedup = -9.95%
Size (6272, 512); Mismatches: dg = 0 db = 0 out of 512. reference = 0.0573 (ms); optimized = 0.0387 (ms); bw_opt = 619.62 GB/s; speedup = 48.06%
Size (200, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0221 (ms); optimized = 0.0069 (ms); bw_opt = 222.78 GB/s; speedup = 220.76%
Size (1000, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0113 (ms); optimized = 0.0097 (ms); bw_opt = 787.79 GB/s; speedup = 16.46%
Size (6000, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0723 (ms); optimized = 0.0715 (ms); bw_opt = 640.95 GB/s; speedup = 1.10%
Size (6272, 1024); Mismatches: dg = 0 db = 0 out of 1024. reference = 0.0751 (ms); optimized = 0.0572 (ms); bw_opt = 837.57 GB/s; speedup = 31.30%
Size (200, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0232 (ms); optimized = 0.0071 (ms); bw_opt = 323.97 GB/s; speedup = 226.51%
Size (1000, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0125 (ms); optimized = 0.0114 (ms); bw_opt = 1005.84 GB/s; speedup = 9.62%
Size (6000, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0807 (ms); optimized = 0.0830 (ms); bw_opt = 828.02 GB/s; speedup = -2.76%
Size (6272, 1536); Mismatches: dg = 0 db = 0 out of 1536. reference = 0.0836 (ms); optimized = 0.0695 (ms); bw_opt = 1033.62 GB/s; speedup = 20.27%
Size (200, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0224 (ms); optimized = 0.0075 (ms); bw_opt = 408.58 GB/s; speedup = 198.10%
Size (1000, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0165 (ms); optimized = 0.0135 (ms); bw_opt = 1132.42 GB/s; speedup = 22.26%
Size (6000, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.0993 (ms); optimized = 0.0989 (ms); bw_opt = 926.35 GB/s; speedup = 0.41%
Size (6272, 2048); Mismatches: dg = 0 db = 0 out of 2048. reference = 0.1033 (ms); optimized = 0.0826 (ms); bw_opt = 1159.55 GB/s; speedup = 25.09%
Size (200, 3072); Mismatches: dg = 0 db = 0 out of 3072. reference = 0.0230 (ms); optimized = 0.0076 (ms); bw_opt = 605.09 GB/s; speedup = 202.51%
Size (1000, 3072); Mismatches: dg = 0 db = 0 out of 3072. reference = 0.0207 (ms); optimized = 0.0213 (ms); bw_opt = 1076.45 GB/s; speedup = -2.69%
Size (6000, 3072); Mismatches: dg = 0 db = 0 out of 3072. reference = 0.1198 (ms); optimized = 0.1274 (ms); bw_opt = 1078.58 GB/s; speedup = -5.95%
Size (6272, 3072); Mismatches: dg = 0 db = 0 out of 3072. reference = 0.1293 (ms); optimized = 0.1189 (ms); bw_opt = 1207.95 GB/s; speedup = 8.76%

Average speedup = 52.88%
```

For additional numerical validation used the following script:

```
def run_model_on_device(fs, X, gO, device_string, numeric_type):
    ln = torch.nn.LayerNorm((fs,), device=device_string, dtype=numeric_type)
    ln.reset_parameters()
    X.grad = None
    ln.zero_grad(set_to_none=True)
    out = ln(X)
    out.backward(gO)
    return (ln.weight.grad, ln.bias.grad)

def run_correctness_test(eps_weight, eps_bias):
    dtype = torch.float
    for fs in (512, 1024, 2048, 4096, 8192, 10000, 500, 1000, 2001, 4005, 8117):
        for bs in (512, 1024, 2048, 4096, 525, 1033, 2064, 3000):
            mean_adjustment = torch.randn(fs, device="cpu", dtype=torch.float)
            X = mean_adjustment * torch.randn(
                bs, fs, device="cpu", dtype=torch.float, requires_grad=True
            )

            X = X.detach().requires_grad_()
            gO = torch.rand_like(X)
            X_gpu = X.to("cuda")
            X_gpu = X_gpu.detach().requires_grad_()
            gO_gpu = gO.to("cuda")
            gO_gpu = gO_gpu.detach().requires_grad_()

            grad_cpu_ref = run_model_on_device(fs, X, gO, "cpu", dtype)
            grad_gpu = run_model_on_device(fs, X_gpu, gO_gpu, "cuda", dtype)
            weight_grad_gpu_target = grad_gpu[0].detach().to("cpu")
            bias_grad_gpu_target = grad_gpu[1].detach().to("cpu")

            weight_delta = torch.abs(grad_cpu_ref[0] - weight_grad_gpu_target)
            weight_mismatches = (weight_delta >= eps_weight).nonzero()
            weight_mismatch_pct = len(weight_mismatches) / len(weight_delta) * 100

            bias_delta = torch.abs(grad_cpu_ref[1] - bias_grad_gpu_target)
            bias_mismatches = (bias_delta >= eps_bias).nonzero()
            bias_mismatch_pct = len(bias_mismatches) / len(bias_delta) * 100

            print(
                "Size ({} x {}) mismatch percentage: weight {:3.2f} bias {:3.2f}".format(
                    fs, bs, weight_mismatch_pct, bias_mismatch_pct
                )
            )
```

`NVFuserTest.FusionMagicSchedulerLayerNormBackward_CUDA` test also does additional numerical validation and it passes.

Differential Revision: D40730981

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87814
Approved by: https://github.com/weiwangmeta
---
 .../src/ATen/native/cuda/layer_norm_kernel.cu | 242 ++++++++++++++----
 1 file changed, 188 insertions(+), 54 deletions(-)

diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
index ae09f0aaad8f8..fa70f075d4fa7 100644
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@@ -33,6 +33,7 @@ namespace {
 
 constexpr int kCUDANumThreads = 256;
 constexpr int kColwiseReduceTileSize = 32;
+constexpr unsigned int kWarpSize = 32;
 constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types
 
 // aligned vector generates vectorized load/store on CUDA (copy-pasted from MemoryAccess.cuh)
@@ -555,8 +556,108 @@ __global__ void GammaBetaBackwardCUDAKernel1(
   }
 }
 
+template <typename T, typename T_ACC>
+__global__ void GammaBetaBackwardCUDAKernel_32x32(
+    int64_t M,
+    int64_t N,
+    const T* dY,
+    const T* X,
+    const T_ACC* mean,
+    const T_ACC* rstd,
+    T* dg,
+    T* db) {
+  alignas(sizeof(double)) extern __shared__ char s_data1[];
+  T_ACC* s_data_typed = reinterpret_cast<T_ACC*>(&s_data1);
+  T_ACC* s_dg;
+  T_ACC* s_db;
 
+  T_ACC dg_sum = 0;
+  T_ACC db_sum = 0;
 
+  const int64_t j = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (j < N) {
+    constexpr int unroll_factor = 8;
+    int laneId = threadIdx.x & 0x1f;
+
+    T_ACC mean_reg, mean_reg_tmp;
+    T_ACC rstd_reg, rstd_reg_tmp;
+    T dY_reg;
+    T X_reg;
+
+    // Main loop
+    int bcounter;
+    for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor);
+         bcounter++) {
+      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
+
+      if (laneId < unroll_factor) {
+        mean_reg_tmp = mean[offset + laneId];
+        rstd_reg_tmp = rstd[offset + laneId];
+      }
+#if !defined(USE_ROCM)
+      // Volta and newer architectures allow lane divergence within a warp.
+      __syncwarp();
+#endif
+
+      #pragma unroll
+      for (int ii = 0; ii < unroll_factor; ++ii) {
+        dY_reg = dY[(offset + ii) * N + j];
+        X_reg = X[(offset + ii) * N + j];
+        mean_reg = WARP_SHFL(mean_reg_tmp, ii, kWarpSize);
+        rstd_reg = WARP_SHFL(rstd_reg_tmp, ii, kWarpSize);
+        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
+        db_sum += dY_reg;
+      }
+    }
+
+    // Remainder loop
+    int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
+    for (int ii = 0; ii < unroll_factor; ii++) {
+      if ((offset + ii) < M) {
+        mean_reg = mean[offset + ii];
+        rstd_reg = rstd[offset + ii];
+        dY_reg = dY[(offset + ii) * N + j];
+        X_reg = X[(offset + ii) * N + j];
+        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
+        db_sum += dY_reg;
+      }
+    }
+
+    // This kernel uses a block of (32 x 32) and gets called when M; N
+    // divide by 32. We can use warp shuffles for the final reduction
+    // step. This removes 4 shmem loads and stores with their
+    // corresponding __syncthreads()
+
+    // This greatly reduces bank conflicts at the expense of a little
+    // extra shared memory. It does not impact occupancy
+    int padded_bx = (1 + blockDim.x);
+
+    s_dg = s_data_typed;
+    s_db = s_data_typed + (padded_bx * blockDim.y);
+    s_dg[threadIdx.y * padded_bx + threadIdx.x] = dg_sum;
+    s_db[threadIdx.y * padded_bx + threadIdx.x] = db_sum;
+    __syncthreads();
+
+    // Load transposed so that a warp holds an entire column
+    T_ACC reg_dg = s_dg[threadIdx.x * padded_bx + threadIdx.y];
+    T_ACC reg_db = s_db[threadIdx.x * padded_bx + threadIdx.y];
+    for (int delta = 16; delta >= 1; delta /= 2) {
+      reg_dg += WARP_SHFL_XOR(reg_dg, delta, kWarpSize);
+      reg_db += WARP_SHFL_XOR(reg_db, delta, kWarpSize);
+    }
+
+    if (threadIdx.x == 0) {
+      const int64_t j = blockIdx.x * blockDim.x + threadIdx.y;
+      if (dg) {
+        dg[j] = reg_dg;
+      }
+      if (db) {
+        db[j] = reg_db;
+      }
+    }
+  }
+}
 
 template <typename T, typename T_ACC>
 __global__ void GammaBetaBackwardCUDAKernel(
@@ -569,66 +670,75 @@ __global__ void GammaBetaBackwardCUDAKernel(
     T* dg,
     T* db) {
   alignas(sizeof(double)) extern __shared__ char s_data1[];
-  T_ACC * s_data_typed = reinterpret_cast<T_ACC*>(&s_data1);
+  T_ACC* s_data_typed = reinterpret_cast<T_ACC*>(&s_data1);
+  T_ACC* s_dg;
+  T_ACC* s_db;
+
   const int64_t j = blockIdx.x * blockDim.x + threadIdx.x;
-  constexpr int unroll = 8;
-  T dYs[unroll];
-  T Xs[unroll];
-  T_ACC *  means = s_data_typed;
-  T_ACC * rstds = s_data_typed + unroll * blockDim.y;
+
   T_ACC dg_sum = 0;
   T_ACC db_sum = 0;
+
   if (j < N) {
+    constexpr int unroll_factor = 8;
+
+    T_ACC mean_reg;
+    T_ACC rstd_reg;
+    T dY_reg;
+    T X_reg;
+
+    // Main Loop
     int bcounter;
-    for (bcounter = 0; bcounter < M/(blockDim.y * unroll); bcounter++){
-      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll;
-      #pragma unroll
-      for (int ii=0; ii<unroll; ii++){
-        if (threadIdx.x == 0) {
-          means[ii*blockDim.y + threadIdx.y] = mean[offset + ii];
-          rstds[ii*blockDim.y + threadIdx.y] = rstd[offset + ii];
-        }
-        dYs[ii] = dY[(offset + ii) * N + j ];
-        Xs[ii] = X[(offset + ii) * N + j];
+    for (bcounter = 0; bcounter < M / (blockDim.y * unroll_factor); bcounter++){
+      int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
 
-      }
-      __syncthreads();
       #pragma unroll
-      for (int ii=0; ii<unroll; ii++){
-        dg_sum += dYs[ii] * (Xs[ii] - means[ii*blockDim.y + threadIdx.y]) * rstds[ii * blockDim.y + threadIdx.y];
-        db_sum += dYs[ii];
+      for (int ii = 0; ii < unroll_factor; ++ii) {
+        dY_reg = dY[(offset + ii) * N + j];
+        X_reg = X[(offset + ii) * N + j];
+        mean_reg = mean[offset + ii];
+        rstd_reg = rstd[offset + ii];
+        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
+        db_sum += dY_reg;
       }
-      __syncthreads();
     }
-    int offset = (bcounter * blockDim.y + threadIdx.y) * unroll;
-    for (int ii = 0; ii<8; ii++ ){
-      T_ACC mean_val, rstd_val; // we don't use smem in the tail to avoid awkward synchronizations, perf penalty is negligible
+
+    // Remainder loop
+    int offset = (bcounter * blockDim.y + threadIdx.y) * unroll_factor;
+    for (int ii = 0; ii < unroll_factor; ii++ ){
       if ((offset + ii) < M) {
-        mean_val = mean[offset+ii];
-        rstd_val = rstd[offset+ii];
-        dYs[0] = dY[(offset + ii) * N + j ];
-        Xs[0] = X[(offset + ii) * N + j];
-        dg_sum += dYs[0] * (Xs[0] - mean_val) * rstd_val;
-        db_sum += dYs[0];
+        dY_reg = dY[(offset + ii) * N + j ];
+        X_reg = X[(offset + ii) * N + j];
+        mean_reg = mean[offset + ii];
+        rstd_reg = rstd[offset + ii];
+        dg_sum += dY_reg * (X_reg - mean_reg) * rstd_reg;
+        db_sum += dY_reg;
       }
     }
-    s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] = dg_sum;
-    s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] = db_sum;
+
+    // Do the final reduction in shared memory
+    s_dg = s_data_typed;
+    s_db = s_data_typed + blockDim.x * blockDim.y;
+    s_dg[threadIdx.y * blockDim.x + threadIdx.x] = dg_sum;
+    s_db[threadIdx.y * blockDim.x + threadIdx.x] = db_sum;
     __syncthreads();
-    for (int offset = blockDim.y/2; offset >=1; offset /= 2){
+
+    for (int offset = blockDim.y / 2; offset >= 1; offset /= 2) {
       if (threadIdx.y < offset) {
-        s_data_typed[threadIdx.y * blockDim.x + threadIdx.x] += s_data_typed[(threadIdx.y + offset) * blockDim.x + threadIdx.x];
-        s_data_typed[blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x] +=
-        s_data_typed[blockDim.x * blockDim.y + (threadIdx.y + offset) * blockDim.x + threadIdx.x];
-      }
+        s_dg[threadIdx.y * blockDim.x + threadIdx.x] +=
+            s_dg[(threadIdx.y + offset) * blockDim.x + threadIdx.x];
+        s_db[threadIdx.y * blockDim.x + threadIdx.x] +=
+            s_db[(threadIdx.y + offset) * blockDim.x + threadIdx.x];
+        }
       __syncthreads();
     }
+
     if (threadIdx.y == 0) {
       if (dg) {
-        dg[j] = s_data_typed[threadIdx.x];
+        dg[j] = s_dg[threadIdx.x];
       }
       if (db) {
-        db[j] = s_data_typed[threadIdx.x + blockDim.x * blockDim.y];
+        db[j] = s_db[threadIdx.x];
       }
     }
   }
@@ -763,7 +873,8 @@ void LayerNormBackwardKernelImplInternal(
     T* dgamma_data =
         dgamma->defined() ? dgamma->template data_ptr<T>() : nullptr;
     T* dbeta_data = dbeta->defined() ? dbeta->template data_ptr<T>() : nullptr;
-    if (M < 512) {
+
+    if (M < 128) {
       // For small batch size, do colwise reduce directly.
       const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads;
       GammaBetaBackwardSimpleCUDAKernel<T, T_ACC>
@@ -778,19 +889,42 @@ void LayerNormBackwardKernelImplInternal(
               dbeta_data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else {
-      dim3 threads{16, 32};
-      int blocks = (N + threads.x-1)/threads.x;
-      GammaBetaBackwardCUDAKernel<T, T_ACC>
-          <<<blocks, threads, 2 * sizeof(T_ACC) * threads.x * threads.y, cuda_stream>>>(
-              M,
-              N,
-              dY_data,
-              X_data,
-              mean_data,
-              rstd_data,
-              dgamma_data,
-              dbeta_data);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
+      if ((M % kWarpSize == 0) && (N % kWarpSize == 0)) {
+        // This implementation relies on warp primitives and requires that M and N divide
+        // exactly to warp size.
+        dim3 threads{kWarpSize, kWarpSize};
+        int blocks = (N + threads.x - 1) / threads.x;
+
+        // If M and N divide by 32, we can use warp shuffles for the final reduction. That requires
+        // transposing values in shared memory, so we apply a padding to reduce bank conflicts.
+        size_t shmem_sz = 2 * sizeof(T_ACC) * (threads.x + 1) * threads.y;
+        GammaBetaBackwardCUDAKernel_32x32<T, T_ACC>
+            <<<blocks, threads, shmem_sz, cuda_stream>>>(
+                M,
+                N,
+                dY_data,
+                X_data,
+                mean_data,
+                rstd_data,
+                dgamma_data,
+                dbeta_data);
+          C10_CUDA_KERNEL_LAUNCH_CHECK();
+      } else {
+        dim3 threads{16, 32};
+        int blocks = (N + threads.x - 1) / threads.x;
+        size_t shmem_sz = 2 * sizeof(T_ACC) * threads.x * threads.y;
+        GammaBetaBackwardCUDAKernel<T, T_ACC>
+            <<<blocks, threads, shmem_sz, cuda_stream>>>(
+                M,
+                N,
+                dY_data,
+                X_data,
+                mean_data,
+                rstd_data,
+                dgamma_data,
+                dbeta_data);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+      }
     }
   }
 }

From 650b96d2873dcb0145eb994445716be32f5f99bd Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@fb.com>
Date: Wed, 26 Oct 2022 19:37:52 +0000
Subject: [PATCH 0213/1922] Enable mypy check for distributed.py, and fix type
 errors (#87543)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87543
Approved by: https://github.com/fduwjj
---
 torch/_C/_distributed_c10d.pyi    | 68 ++++++++++++++++++++++---------
 torch/nn/parallel/distributed.py  | 25 ++++++++----
 torch/nn/parallel/distributed.pyi | 21 ----------
 3 files changed, 65 insertions(+), 49 deletions(-)
 delete mode 100644 torch/nn/parallel/distributed.pyi

diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi
index bdf0166b8daa9..493e1d8846e71 100644
--- a/torch/_C/_distributed_c10d.pyi
+++ b/torch/_C/_distributed_c10d.pyi
@@ -1,8 +1,9 @@
 from datetime import timedelta
 from enum import Enum
-from typing import Optional, List, Any, Tuple, overload, Union
+from typing import Any, Dict, List, Optional, Tuple, Union, overload
 
 from torch import Tensor
+from torch.futures import Future
 
 # This module is defined in torch/csrc/distributed/c10d/init.cpp
 
@@ -32,13 +33,36 @@ class Reducer:
         self,
         params: List[Tensor],
         bucket_indices: List[List[int]],
+        per_bucket_size_limits: List[int],
         process_group: ProcessGroup,
-        expect_sparse_gradients: List[bool],
-        bucket_bytes_cap: int,
-        find_unused_parameters: bool,
-        gradient_as_bucket_view: bool,
+        expect_sparse_gradients: List[bool] = [],
+        bucket_bytes_cap: int = ...,  # kDefaultBucketBytesCap in reducer.hpp
+        find_unused_parameters: bool = False,
+        gradient_as_bucket_view: bool = False,
+        param_to_name_mapping: Dict[int, str] = {},
+        first_bucket_types_cap: int = ...,  # kDefaultFirstBucketBytes in reducer.hpp
     ): ...
-    ...
+    def prepare_for_forward(self) -> None: ...
+    def prepare_for_backward(self, output: List[Tensor]) -> None: ...
+    def get_backward_stats(self) -> List[int]: ...
+    def _install_post_backward_futures(self, futures: List[Future]) -> None: ...
+    def _rebuild_buckets(self) -> bool: ...
+    def _get_zeros_like_grad_buckets(self) -> List[GradBucket]: ...
+    def _push_all_rebuilt_params(self) -> None: ...
+    def _set_forward_pass_work_handle(
+        self, work: Work, use_static_world_size: bool
+    ): ...
+    def _get_local_used_map(self) -> Tensor: ...
+    def _set_ddp_runtime_logging_sample_rate(
+        self, sample_rate: int
+    ) -> None: ...
+    def _set_static_graph(self) -> None: ...
+    def _run_comm_hook(self, bucket: GradBucket) -> Future: ...
+    def set_logger(self, logger: Logger) -> None: ...
+
+class DDPLoggingData:
+    strs_map: Dict[str, str]
+    ints_map: Dict[str, int]
 
 class Logger:
     def __init__(self, reducer: Reducer): ...
@@ -49,8 +73,14 @@ class Logger:
         output_device: int,
         broadcast_buffers: bool,
         has_sync_bn: bool,
+        static_graph: bool,
     ): ...
-    ...
+    def set_runtime_stats_and_log(self) -> None: ...
+    def set_error_and_log(self, error: str) -> None: ...
+    def _get_ddp_logging_data(self) -> DDPLoggingData: ...
+    def _set_comm_hook_name(self, comm_hook: str) -> None: ...
+    def _set_uneven_input_join(self) -> None: ...
+    def _set_static_graph(self) -> None: ...
 
 def get_debug_level(): ...
 def set_debug_level(): ...
@@ -118,7 +148,9 @@ class Store:
     def set(self, key: str, value: str): ...
     def get(self, key: str) -> bytes: ...
     def add(self, key: str, value: int) -> int: ...
-    def compare_set(self, key: str, expected_value: str, desired_value: str) -> bytes: ...
+    def compare_set(
+        self, key: str, expected_value: str, desired_value: str
+    ) -> bytes: ...
     def delete_key(self, key: str) -> bool: ...
     def num_keys(self) -> int: ...
     def set_timeout(self, timeout: timedelta): ...
@@ -142,7 +174,7 @@ class TCPStore(Store):
         is_master: bool = ...,
         timeout: timedelta = ...,
         wait_for_workers: bool = ...,
-        multi_tenant: bool = ...
+        multi_tenant: bool = ...,
     ): ...
     @property
     def host(self) -> str: ...
@@ -167,6 +199,7 @@ class Work:
 
 class ProcessGroup:
     class Options: ...
+
     def __init__(self): ...
     def rank(self) -> int: ...
     def size(self) -> int: ...
@@ -235,7 +268,7 @@ class ProcessGroup:
         self,
         output: Tensor,
         input: Tensor,
-        opts = AllGatherOptions(),
+        opts=AllGatherOptions(),
     ) -> Work: ...
     def allgather_coalesced(
         self,
@@ -343,6 +376,7 @@ def _round_robin_process_groups(
 class ProcessGroupGloo(ProcessGroup):
     class Device: ...
     class Options: ...
+
     def __init__(
         self,
         store: Store,
@@ -358,16 +392,12 @@ class ProcessGroupGloo(ProcessGroup):
     ...
 
 class _ProcessGroupWrapper(ProcessGroup):
-    def __init__(
-        self,
-        pg: ProcessGroup,
-        gloo_pg: ProcessGroupGloo
-    ): ...
+    def __init__(self, pg: ProcessGroup, gloo_pg: ProcessGroupGloo): ...
     wrapped_pg: ProcessGroup
 
-
 class ProcessGroupNCCL(ProcessGroup):
     class Options: ...
+
     def __init__(
         self,
         store: Store,
@@ -402,9 +432,9 @@ class ProcessGroupMPI(ProcessGroup):
 
 def _compute_bucket_assignment_by_size(
     tensors: List[Tensor],
-    bucket_size: int,
-    expect_sparse_gradient: List[bool],
-    tensor_indices: List[int],
+    bucket_size_limits: List[int],
+    expect_sparse_gradient: List[bool] = [],
+    tensor_indices: List[int] = [],
 ) -> Tuple[List[List[int]], List[int]]: ...
 def _broadcast_coalesced(
     process_group: ProcessGroup,
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index 23625d9d20014..514b89aad28d6 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -1,7 +1,7 @@
 import sys
 import copy
 from dataclasses import dataclass
-from typing import Callable, Any, Type
+from typing import Any, Callable, Optional, Type
 from enum import Enum, auto
 import inspect
 import itertools
@@ -37,7 +37,7 @@
 
 from ..modules import Module
 from ._replicated_tensor_ddp_utils import _ddp_with_replicated_tensor_enabled
-from .scatter_gather import gather, is_namedtuple, scatter_kwargs  # noqa: F401
+from .scatter_gather import gather, scatter_kwargs  # noqa: F401
 
 __all__ = ["DistributedDataParallel"]
 
@@ -194,6 +194,7 @@ def __init__(self, ddp, divide_by_initial_world_size):
             "DDP join hook requires passing in a DistributedDataParallel "
             "instance as the state"
         )
+        assert ddp.logger is not None
         ddp.logger._set_uneven_input_join()
         self.ddp = ddp
         self.ddp._divide_by_initial_world_size = divide_by_initial_world_size
@@ -555,7 +556,7 @@ def __init__(
 
         super(DistributedDataParallel, self).__init__()
         Joinable.__init__(self)
-        self.logger = None
+        self.logger: Optional[dist.Logger] = None
         if not any((p.requires_grad for p in module.parameters())):
             self._log_and_throw(
                 RuntimeError,
@@ -836,6 +837,7 @@ def __setstate__(self, state):
         )
         if self.static_graph:
             self.reducer._set_static_graph()
+            assert self.logger is not None
             self.logger._set_static_graph()
 
     def _build_params_for_reducer(self):
@@ -863,7 +865,7 @@ def _build_params_for_reducer(self):
             # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
             (m, p)
             for m, p in modules_and_parameters
-            if p not in memo and not memo.add(p)
+            if p not in memo and not memo.add(p)  # type: ignore[func-returns-value]
         ]
 
         # Build list of parameters.
@@ -1044,7 +1046,7 @@ def _run_ddp_forward(self, *inputs, **kwargs):
                 self.use_side_stream_for_tensor_copies,
             )
             with self._inside_ddp_forward():
-                return module_to_run(*inputs[0], **kwargs[0])
+                return module_to_run(*inputs[0], **kwargs[0])  # type: ignore[index]
         else:
             with self._inside_ddp_forward():
                 return module_to_run(*inputs, **kwargs)
@@ -1054,6 +1056,7 @@ def forward(self, *inputs, **kwargs):
             "DistributedDataParallel.forward"
         ):
             if torch.is_grad_enabled() and self.require_backward_grad_sync:
+                assert self.logger is not None
                 self.logger.set_runtime_stats_and_log()
                 self.num_iterations += 1
                 self.reducer.prepare_for_forward()
@@ -1063,7 +1066,7 @@ def forward(self, *inputs, **kwargs):
             work = Join.notify_join_context(self)
             if work:
                 self.reducer._set_forward_pass_work_handle(
-                    work, self._divide_by_initial_world_size
+                    work, self._divide_by_initial_world_size  # type: ignore[arg-type]
                 )
 
             # Calling _rebuild_buckets before forward compuation,
@@ -1171,7 +1174,7 @@ def gather(self, outputs, output_device):
     def train(self, mode=True):
         super(DistributedDataParallel, self).train(mode)
         if self._use_replicated_tensor_module:
-            self._replicated_tensor_module.train(mode)
+            self._replicated_tensor_module.train(mode)  # type: ignore[union-attr]
         return self
 
     # When running in join mode, schedules an allreduce to notify joined ranks
@@ -1392,7 +1395,7 @@ def join_process_group(self):
     def _register_buffer_comm_hook(
         self,
         state,
-        hook: callable,
+        hook: Callable,
         comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
     ):
         r"""
@@ -1438,7 +1441,7 @@ def _register_buffer_comm_hook(
             buffer_comm_hook_location=comm_hook_location,
         )
 
-    def register_comm_hook(self, state: object, hook: callable):
+    def register_comm_hook(self, state: object, hook: Callable):
         r"""
         Registers a communication hook which is an enhancement that provides a
         flexible hook to users where they can specify how DDP aggregates gradients
@@ -1518,6 +1521,7 @@ def register_comm_hook(self, state: object, hook: callable):
             >>> ddp.register_comm_hook(state=None, hook=encode_and_decode)
         """
         self._check_comm_hook(hook)
+        assert self.logger is not None
         self.logger._set_comm_hook_name(hook.__qualname__)
         dist._register_comm_hook(self.reducer, state, hook)
 
@@ -1544,6 +1548,7 @@ def _register_builtin_comm_hook(self, comm_hook_type):
             >>> ddp._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS)
 
         """
+        assert self.logger is not None
         self.logger._set_comm_hook_name(str(comm_hook_type))
         dist._register_builtin_comm_hook(self.reducer, comm_hook_type)
 
@@ -1808,6 +1813,7 @@ def _get_ddp_logging_data(self):
         these metrics are.
         This is a prototype interface and subject to change in the future.
         """
+        assert self.logger is not None
         ddp_logging_data = self.logger._get_ddp_logging_data()
         return {**ddp_logging_data.strs_map, **ddp_logging_data.ints_map}
 
@@ -1842,6 +1848,7 @@ def _set_static_graph(self):
             return
         self.static_graph = True
         self.reducer._set_static_graph()
+        assert self.logger is not None
         self.logger._set_static_graph()
         if self.find_unused_parameters:
             warnings.warn(
diff --git a/torch/nn/parallel/distributed.pyi b/torch/nn/parallel/distributed.pyi
deleted file mode 100644
index a75713afb8282..0000000000000
--- a/torch/nn/parallel/distributed.pyi
+++ /dev/null
@@ -1,21 +0,0 @@
-from ..modules import Module
-from typing import Any, Optional
-from .common_types import _devices_t, _device_t
-
-
-class DistributedDataParallel(Module):
-    process_group: Any = ...
-    dim: int = ...
-    module: Module = ...
-    device_ids: _devices_t = ...
-    output_device: _device_t = ...
-    broadcast_buffers: bool = ...
-    check_reduction: bool = ...
-    broadcast_bucket_size: float = ...
-    bucket_bytes_cap: float = ...
-
-    # TODO type process_group once `distributed` module is stubbed
-    def __init__(self, module: Module, device_ids: Optional[_devices_t] = ...,
-                 output_device: Optional[_device_t] = ..., dim: int = ...,
-                 broadcast_buffers: bool = ..., process_group: Optional[Any] = ..., bucket_cap_mb: float = ...,
-                 find_unused_parameters: bool = ..., check_reduction: bool = ...) -> None: ...

From 3016f05f5f4ce8302e4df00e2ac0be00ea90d978 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 27 Oct 2022 00:59:40 +0000
Subject: [PATCH 0214/1922] [profiler] Standard performance event names for the
 profiler (#87538)

Summary: The goal is to create a hardware/backend independent event abstraction on which a standard set of tooling can be developed.

Test Plan: CI

Reviewed By: kimishpatel

Differential Revision: D40238034

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87538
Approved by: https://github.com/salilsdesai, https://github.com/kirklandsign
---
 torch/csrc/profiler/events.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 torch/csrc/profiler/events.h

diff --git a/torch/csrc/profiler/events.h b/torch/csrc/profiler/events.h
new file mode 100644
index 0000000000000..a1a956f132793
--- /dev/null
+++ b/torch/csrc/profiler/events.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <array>
+#include <cstring>
+#include <vector>
+
+namespace torch {
+namespace profiler {
+
+/* A vector type to hold a list of performance counters */
+using perf_counters_t = std::vector<uint64_t>;
+
+/* Standard list of performance events independent of hardware or backend */
+constexpr std::array<const char*, 2> ProfilerPerfEvents = {
+    /*
+     * Number of Processing Elelement (PE) cycles between two points of interest
+     * in time. This should correlate positively with wall-time. Measured in
+     * uint64_t. PE can be non cpu. TBD reporting behavior for multiple PEs
+     * participating (i.e. threadpool).
+     */
+    "cycles",
+
+    /* Number of PE instructions between two points of interest in time. This
+     * should correlate positively with wall time and the amount of computation
+     * (i.e. work). Across repeat executions, the number of instructions should
+     * be more or less invariant. Measured in uint64_t. PE can be non cpu.
+     */
+    "instructions"};
+} // namespace profiler
+} // namespace torch

From 03524ae0d5941f11f07ca24a8bbd8c25b4ac9b0f Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 26 Oct 2022 14:43:41 -0700
Subject: [PATCH 0215/1922] [quant][fx] Add _convert_to_reference_decomposed
 (#87094)

Summary:
_convert_to_reference_decomposed is a private convert function in fx graph mode quantization flow to convert
a calibrated/trained model to a reference quantized model with decomposed quantized tensor representations.

Test Plan:
python test/test_quantization.py TestQuantizeFx.test__convert_to_reference_decomposed_fx

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87094
Approved by: https://github.com/andrewor14
---
 test/quantization/fx/test_quantize_fx.py | 26 +++++++++++
 torch/ao/quantization/fx/convert.py      | 55 ++++++++++++++++-------
 torch/ao/quantization/fx/utils.py        | 57 +++++++++++++++++++-----
 torch/ao/quantization/quantize_fx.py     | 55 +++++++++++++++++++++++
 torch/ao/quantization/utils.py           | 12 +++++
 5 files changed, 177 insertions(+), 28 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 2746b1c9a0173..3f39e4bfbbb41 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -18,10 +18,12 @@
     prepare_fx,
     convert_fx,
     convert_to_reference_fx,
+    _convert_to_reference_decomposed_fx,
     prepare_qat_fx,
     fuse_fx,
 )
 
+
 from torch.ao.quantization.fx.quantization_patterns import DefaultNodeQuantizeHandler
 
 from torch.ao.quantization.fx.match_utils import (
@@ -5237,6 +5239,30 @@ def test_get_default_qconfig_valid_backend(self):
             with self.assertRaisesRegex(AssertionError, "not supported"):
                 qconfig_mapping = get_default_qat_qconfig_mapping(invalid_backend)
 
+    def test__convert_to_reference_decomposed_fx(self):
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 10)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        m = M().eval()
+        qconfig_mapping = get_default_qconfig_mapping("fbgemm")
+        example_inputs = (torch.randn(1, 5),)
+        m = prepare_fx(m, qconfig_mapping, example_inputs)
+        m = _convert_to_reference_decomposed_fx(m)
+        expected_occurrence = {
+            ns.call_function(torch.ops.quantized_decomposed.quantize_per_tensor): 2,
+            ns.call_function(torch.ops.quantized_decomposed.dequantize_per_tensor): 2,
+        }
+        self.checkGraphModuleNodes(
+            m,
+            expected_node_occurrence=expected_occurrence)
+        # make sure it runs
+        m(*example_inputs)
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     def setUp(self):
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index aa402e882abc8..74eb8f1ca542b 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -69,6 +69,8 @@
     PrepareCustomConfig,
 )
 from .lower_to_fbgemm import lower_to_fbgemm
+# importing the lib so that the quantized_decomposed ops are registered
+from ._decomposed import quantized_decomposed_lib  # noqa: F401
 
 
 # TODO: revisit this list. Many helper methods shouldn't be public
@@ -485,7 +487,8 @@ def convert(
         is_standalone_module: bool = False,
         _remove_qconfig_flag: bool = True,
         qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None,
-        backend_config: Union[BackendConfig, Dict[str, Any], None] = None) -> torch.nn.Module:
+        backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+        is_decomposed: bool = False) -> torch.nn.Module:
     """
     We will convert an observed model (a module with observer calls) to a reference
     quantized model, the rule is simple:
@@ -497,13 +500,21 @@ def convert(
        is stored in observed_node_names, we can decide whether we need to swap the
        module based on this set
 
-    standalone_module means it a submodule that is not inlined in
-    parent module, and will be quantized separately as one unit.
-
-    Returns a quantized standalone module, whether input/output is quantized is
-    specified by prepare_custom_config, with
-    input_quantized_idxs, output_quantized_idxs, please
-    see docs for prepare_fx for details
+    Args:
+       * `is_standalone_module`: when this flag is True, it means we are quantizing
+       a submodule that is not inlined in parent module, and will be quantized
+       separately as one unit.
+
+       * `is_decomposed`: a boolean flag to indicate whether we want to use the
+        quantize operator for decomposed quantized tensor
+        (torch.ops.quantized_decomposed.quantize_per_tensor) or default/standalone
+        quantized tensor (torch.quantize_per_tensor)
+
+    Returns:
+         a quantized standalone module, whether input/output is quantized is
+         specified by prepare_custom_config, with
+         input_quantized_idxs, output_quantized_idxs, please
+         see docs for :func:`~torch.ao.quantization.prepare_fx` for details
     """
     if convert_custom_config is None:
         convert_custom_config = ConvertCustomConfig()
@@ -595,7 +606,8 @@ def replace_observer_with_quantize_dequantize_node(
             node: Node,
             modules: Dict[str, torch.nn.Module],
             node_name_to_scope: Dict[str, Tuple[str, type]],
-            node_name_to_qconfig: Dict[str, QConfigAny]) -> None:
+            node_name_to_qconfig: Dict[str, QConfigAny],
+            is_decomposed: bool) -> None:
         """ Replace activation_post_process module call node with quantize and
         dequantize node
 
@@ -608,7 +620,7 @@ def replace_observer_with_quantize_dequantize_node(
         assert isinstance(node.target, str)
         module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
         observer_module = modules[node.target]
-        maybe_quantize_node_info = get_quantize_node_info(observer_module)
+        maybe_quantize_node_info = get_quantize_node_info(observer_module, is_decomposed)
         # Skip replacing observers to quant/dequant nodes if the qconfigs of all
         # consumers and producers of this observer are None
         skip_replacement = all([
@@ -626,7 +638,7 @@ def replace_observer_with_quantize_dequantize_node(
             # replace observer node with quant - dequant node
             with graph.inserting_before(node):
                 input_node = node.args[0]
-                inputs = [input_node]
+                quantize_op_inputs = [input_node]
                 for key, value in qparams.items():
                     # TODO: we can add the information of whether a value needs to
                     # be registered as an attribute in qparams dict itself
@@ -634,13 +646,22 @@ def replace_observer_with_quantize_dequantize_node(
                         # For scale and zero_point values we register them as buffers in the root module.
                         # TODO: maybe need more complex attr name here
                         qparam_node = create_getattr_from_value(model, graph, module_path + prefix + key, value)
-                        inputs.append(qparam_node)
+                        quantize_op_inputs.append(qparam_node)
                     else:
                         # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
-                        inputs.append(value)
-
-                quantized_node = graph.create_node(node_type, quantize_op, tuple(inputs), {})
-                dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
+                        quantize_op_inputs.append(value)
+
+                quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+                if is_decomposed:
+                    # use the same qparams from quantize op
+                    dq_inputs = [quantized_node] + quantize_op_inputs[1:]
+                    dequantized_node = graph.call_function(
+                        torch.ops.quantized_decomposed.dequantize_per_tensor,
+                        tuple(dq_inputs),
+                        {}
+                    )
+                else:
+                    dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
                 node.replace_all_uses_with(dequantized_node)
                 graph.erase_node(node)
 
@@ -711,7 +732,7 @@ def replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Gra
                 else:
                     replace_observer_with_quantize_dequantize_node(
                         model, model.graph, node, modules, node_name_to_scope,
-                        node_name_to_qconfig)
+                        node_name_to_qconfig, is_decomposed)
             elif isinstance(mod, DeQuantStub):
                 replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
             elif is_observed_standalone_module(mod):
diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index f359bd90f9e61..7b838b64f41a8 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -17,6 +17,7 @@
     activation_is_statically_quantized,
     is_per_tensor,
     is_per_channel,
+    to_underlying_dtype,
 )
 from torch.ao.quantization.quantize import is_activation_post_process
 
@@ -27,6 +28,8 @@
     Node,
 )
 from .custom_config import PrepareCustomConfig
+# importing the lib so that the quantized_decomposed ops are registered
+from ._decomposed import quantized_decomposed_lib  # noqa: F401
 
 from typing import Callable, Optional, List, Dict, Any, Set, Tuple, Union, Type
 from collections import namedtuple
@@ -160,11 +163,22 @@ def get_per_tensor_qparams(activation_post_process):
     dtype = activation_post_process.dtype
     return scale, zero_point, dtype
 
-def get_quantize_node_info(activation_post_process: Callable) -> Optional[Tuple[str, Union[Callable, str], Dict[str, Any]]]:
-    ''' Given an activation_post_process module,
-    return node_type(e.g. call_function), quantize op(e.g. quantize_per_tensor) and a dictionary
-    of extracted qparams from the module
-    '''
+def get_quantize_node_info(
+    activation_post_process: Callable,
+    is_decomposed: bool
+) -> Optional[Tuple[str, Union[Callable[..., Any], str], Dict[str, Any]]]:
+    """ Extract information about quantize op from activation_post_process module
+    Args:
+      * `activation_post_process`: observer module instance or fake quant module instance
+        after calibration/QAT
+      * `is_decomposed`: a boolean flag to indicate whether we want to use the
+        quantize operator for decomposed quantized tensor (torch.ops.quantized_decomposed.quantize_per_tensor) or default/standalone
+        quantized tensor (torch.quantize_per_tensor)
+
+    Returns
+        node_type(e.g. call_function), quantize op(e.g. quantize_per_tensor) and a dictionary
+        of extracted qparams from the module
+    """
     dtype = activation_post_process.dtype  # type: ignore[attr-defined]
     compute_dtype = None
     if hasattr(activation_post_process, "compute_dtype"):
@@ -177,17 +191,36 @@ def get_quantize_node_info(activation_post_process: Callable) -> Optional[Tuple[
         if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
             ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined]
             qparams = {"_scale_": scale, "_zero_point_": zero_point, "_axis_": ch_axis, "_dtype_": dtype}
-            quantize_op = torch.quantize_per_channel
+            if is_decomposed:
+                raise NotImplementedError("decomposed quantize_per_channel op not implemented yet")
+            else:
+                quantize_op = torch.quantize_per_channel
         else:
             scale = float(scale)
             zero_point = int(zero_point)
-            qparams = {"_scale_": scale, "_zero_point_": zero_point, "_dtype_": dtype}
-            quantize_op = torch.quantize_per_tensor
+            if is_decomposed:
+                quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
+                quant_max = activation_post_process.quant_max  # type: ignore[attr-defined]
+                dtype = to_underlying_dtype(dtype)
+                qparams = {
+                    "_scale_": scale,
+                    "_zero_point_": zero_point,
+                    "_quant_min": quant_max,
+                    "_quant_max": quant_max,
+                    "_dtype_": dtype
+                }
+                quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor
+            else:
+                qparams = {"_scale_": scale, "_zero_point_": zero_point, "_dtype_": dtype}
+                quantize_op = torch.quantize_per_tensor
     elif compute_dtype in [torch.quint8, torch.qint8, torch.float16]:
         # TODO(future PR): switch compute_dtype to is_dynamic
         # dynamic quantization
         node_type = "call_function"
-        quantize_op = torch.quantize_per_tensor_dynamic
+        if is_decomposed:
+            raise NotImplementedError("decomposed quantize_per_tensor_dynamic op not implemented yet")
+        else:
+            quantize_op = torch.quantize_per_tensor_dynamic
         # TODO: get reduce range from observer
         # reduce_range = activation_post_process.reduce_range
         reduce_range = torch.backends.quantized.engine in ("fbgemm", "x86")
@@ -199,8 +232,9 @@ def get_quantize_node_info(activation_post_process: Callable) -> Optional[Tuple[
     else:
         warnings.warn(f"Unsupported activation_post_process in get_quantize_node_info: {activation_post_process}")
         return None
-    return node_type, quantize_op, qparams
+    return node_type, quantize_op, qparams  # type: ignore[return-value]
 
+# TODO: looks like this is not used, remove
 def quantize_node(
         in_node: Node,
         obs_module: torch.nn.Module,
@@ -247,7 +281,8 @@ def quantize_node(
         module_path = ""
     root_module = modules['']
     graph = quantized_graph
-    maybe_quantize_node_info = get_quantize_node_info(obs_module)
+    is_decomposed_qtensor = False
+    maybe_quantize_node_info = get_quantize_node_info(obs_module, is_decomposed_qtensor)
     assert maybe_quantize_node_info is not None, \
         f"Expecting quantize node info not to be None, observer: {obs_module}"
     node_type, quantize_op, qparams = maybe_quantize_node_info
diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py
index fb6f3dc1fe574..abd1cf1b8edbc 100644
--- a/torch/ao/quantization/quantize_fx.py
+++ b/torch/ao/quantization/quantize_fx.py
@@ -530,6 +530,7 @@ def _convert_fx(
     _remove_qconfig: bool = True,
     qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None,
     backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+    is_decomposed: bool = False,
 ) -> torch.nn.Module:
     """ `is_standalone_module`: see docs in :func:`~torch.ao.quantization.prepare_standalone_module_fx`
     """
@@ -552,6 +553,7 @@ def _convert_fx(
         _remove_qconfig_flag=_remove_qconfig,
         qconfig_mapping=qconfig_mapping,
         backend_config=backend_config,
+        is_decomposed=is_decomposed,
     )
 
     preserved_attributes = convert_custom_config.preserved_attributes
@@ -676,6 +678,59 @@ def convert_to_reference_fx(
         backend_config=backend_config,
     )
 
+def _convert_to_reference_decomposed_fx(
+    graph_module: GraphModule,
+    convert_custom_config: Union[ConvertCustomConfig, Dict[str, Any], None] = None,
+    _remove_qconfig: bool = True,
+    qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> torch.nn.Module:
+    r""" Convert a calibrated or trained model to a reference quantized model, with
+    decomposed representation for quantized Tensor
+    see https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md for more details,
+    reference quantzied model is a standard representation of a quantized model provided
+    by FX Graph Mode Quantization, it can be further lowered to run on the target
+    hardware, like accelerators
+
+    Note: this is not public API
+
+    Args:
+        * `graph_module` (GraphModule): A prepared and calibrated/trained model (GraphModule)
+
+        * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+        * `_remove_qconfig` (bool): Option to remove the qconfig attributes in the model after convert.
+
+        * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+         * `backend_config` (BackendConfig): A configuration for the backend which describes how
+            operators should be quantized in the backend. See
+            :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+    Return:
+        A reference quantized model (GraphModule) with operators working with decomposed quantized Tensor
+
+    Example::
+
+        # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        reference_quantized_model = _convert_to_reference_decomposed_fx(prepared_model)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx._convert_to_reference_decomposed_fx")
+    return _convert_fx(
+        graph_module,
+        is_reference=True,
+        convert_custom_config=convert_custom_config,
+        _remove_qconfig=_remove_qconfig,
+        qconfig_mapping=qconfig_mapping,
+        backend_config=backend_config,
+        is_decomposed=True,
+    )
+
 
 def _convert_standalone_module_fx(
     graph_module: GraphModule,
diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py
index 47ca7e64e329a..afa278a795dd0 100644
--- a/torch/ao/quantization/utils.py
+++ b/torch/ao/quantization/utils.py
@@ -140,6 +140,17 @@ def getattr_from_fqn(obj: Any, fqn: str) -> Any:
     """
     return functools.reduce(getattr, fqn.split("."), obj)
 
+def to_underlying_dtype(qdtype):
+    DTYPE_MAPPING = {
+        torch.quint8: torch.uint8,
+        torch.qint8: torch.int8,
+        torch.qint32: torch.int32,
+        torch.quint4x2: torch.uint8,
+        torch.quint2x4: torch.uint8,
+    }
+    assert qdtype in DTYPE_MAPPING, "Unsupported dtype: " + qdtype
+    return DTYPE_MAPPING[qdtype]
+
 def get_qparam_dict(observer_or_fake_quant):
     qscheme = observer_or_fake_quant.qscheme if hasattr(observer_or_fake_quant, "qscheme") else None
     dtype = observer_or_fake_quant.dtype
@@ -562,4 +573,5 @@ def _patched_module_call(self, *args, **kwargs):
     "calculate_qmin_qmax",
     "has_no_children_ignoring_parametrizations",
     "get_fqn_to_example_inputs",
+    "to_underlying_dtype",
 ]

From f3f3650efc0bf61109625e2a6902b87d9651334d Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Thu, 27 Oct 2022 01:24:01 +0000
Subject: [PATCH 0216/1922] Limit ROCM option to Linux only (#87833)

As it's not available on neither Windows nor MacOS

cc @jeffdaily @sunway513 @jithunnair-amd @ROCmSupport
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87833
Approved by: https://github.com/kit1980
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e2e3bf0e3f8d5..105e38e7c1acf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -190,7 +190,7 @@ option(USE_CUDA "Use CUDA" ON)
 cmake_dependent_option(
      BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF)
 option(USE_FAST_NVCC "Use parallel NVCC build" OFF)
-option(USE_ROCM "Use ROCm" ON)
+cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF)
 option(CAFFE2_STATIC_LINK_CUDA "Statically link CUDA libraries" OFF)
 cmake_dependent_option(
     USE_CUDNN "Use cuDNN" ON

From 00fd26e35df0e8dd31a8465e7b13e806c4e0bed1 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 26 Oct 2022 14:43:42 -0700
Subject: [PATCH 0217/1922] [quant][be] Remove unused function `quantize_node`
 (#87153)

Summary:
att

Test Plan:
python test/test_quantization.py TestQuantizeFx

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87153
Approved by: https://github.com/andrewor14
---
 torch/ao/quantization/fx/utils.py | 67 ++-----------------------------
 1 file changed, 3 insertions(+), 64 deletions(-)

diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index 7b838b64f41a8..f2037d1590a93 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -65,7 +65,6 @@
     "node_arg_is_weight",
     "NON_OBSERVABLE_ARG_DICT",
     "NON_QUANTIZABLE_WEIGHT_OPS",
-    "quantize_node",
     "return_arg_list",
 ]
 
@@ -234,69 +233,9 @@ def get_quantize_node_info(
         return None
     return node_type, quantize_op, qparams  # type: ignore[return-value]
 
-# TODO: looks like this is not used, remove
-def quantize_node(
-        in_node: Node,
-        obs_module: torch.nn.Module,
-        obs_node: Node,
-        modules: Dict[str, torch.nn.Module],
-        quantized_graph: Graph,
-        node_name_to_scope: Dict[str, Tuple[str, type]],
-        is_input: bool,
-        output_prefix: str = "_output") -> Node:
-    ''' Add quantization nodes (eg. quantize_per_tensor/per_channel) for given node to graph
-    with the qparams calculated from activation_post_process (obs_module).
-    The observer node (obs_node) is used to find the FQN of the user of act_post_process.
-    e.g. Given input `node` in `node = self.conv(x)`, insert node:
-    `quantized_node = torch.quantize_per_tensor(x, self._scale_0, self._zer_point_0, self._dtype_0)`
-    where self._scale_0, self._zero_point_0 and self._dtype_0 are
-    calculated from `obs_module`
-    '''
-    # Find the first use of the observer node, we use this to get the scope of the module.
-    if is_input:
-        # if the quantize function is at the input of op, then we find the first user of the observer_node
-        # to get the path. If a linear call_function is in the user list, we return the first instance
-        # of linear node to get the FQN.
-        users = list(obs_node.users)
-        first_linear_use_or_first_use = users[0] if users else None
-        linear_node = None
-        for n in users:
-            if n.op == "call_function" and n.target == torch.nn.functional.linear:
-                linear_node = n
-                break
-        if linear_node:
-            first_linear_use_or_first_use = linear_node
-        prefix = "_input"
-    else:
-        # if the quantize function is at the output of the op, we use the observer input node to get the path
-        first_linear_use_or_first_use = in_node
-        prefix = output_prefix
-
-    if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope:
-        module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name]
-    else:
-        # TODO: it's not used, so actually we can skip quantization
-        # but this requires changing return type of quantize_node
-        # we can fix it later if needed
-        module_path = ""
-    root_module = modules['']
-    graph = quantized_graph
-    is_decomposed_qtensor = False
-    maybe_quantize_node_info = get_quantize_node_info(obs_module, is_decomposed_qtensor)
-    assert maybe_quantize_node_info is not None, \
-        f"Expecting quantize node info not to be None, observer: {obs_module}"
-    node_type, quantize_op, qparams = maybe_quantize_node_info
-    inputs = [in_node]
-
-    for key, value in qparams.items():
-        if key in ['_scale_', '_zero_point_']:
-            # For scale and zero_point values we register them as buffers in the root module.
-            qparam_node = create_getattr_from_value(root_module, graph, module_path + prefix + key, value)
-            inputs.append(qparam_node)
-        else:
-            # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
-            inputs.append(value)
-    return graph.create_node(node_type, quantize_op, tuple(inputs), {})
+# Keep it here for BC in torch.quantization namespace, we can remove it after
+# we deprecate the torch.quantization namespace
+quantize_node = NotImplemented
 
 def get_custom_module_class_keys(custom_module_mapping: Dict[QuantType, Dict[Type, Type]]) -> List[Any]:
     r""" Get all the unique custom module keys in the custom config dict

From 8b44aeba7b246188548d016bf480fc74b1d85102 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 26 Oct 2022 14:43:42 -0700
Subject: [PATCH 0218/1922] [fx][subgraph_rewriter] Change match_filter to be a
 List in replace_pattern_with_filters (#87257)

Summary:
att, this is experimental api so not marking it as bc-breaking.
The match will be accepted only if all the filters in the list passes.
Changing the filter arg to be list also allows us to pass in empty list that means no filter, which makes user code cleaner.

Test Plan:
python test/test_fx.py -k test_replace_pattern_with_filters

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87257
Approved by: https://github.com/SherlockNoMad
---
 test/fx/test_subgraph_rewriter.py |  6 +++---
 torch/fx/subgraph_rewriter.py     | 20 +++++++++++++-------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/test/fx/test_subgraph_rewriter.py b/test/fx/test_subgraph_rewriter.py
index ac3498458d600..ed6d50e44b4ac 100644
--- a/test/fx/test_subgraph_rewriter.py
+++ b/test/fx/test_subgraph_rewriter.py
@@ -773,7 +773,7 @@ def gemm_bias_mul_replacement_with_c(a, b, bias, c):
 
         self.assertEqual(repalcement_node_found, 2)
 
-    def test_replace_pattern_with_filter(self):
+    def test_replace_pattern_with_filters(self):
         class M(torch.nn.Module):
             def __init__(self):
                 super().__init__()
@@ -833,10 +833,10 @@ def num_repalcement_node_found(traced):
 
         # match with filter, should find 1 match
         traced = symbolic_trace(M())
-        matches = subgraph_rewriter.replace_pattern_with_filter(
+        matches = subgraph_rewriter.replace_pattern_with_filters(
             traced,
             BinaryOpScalarReLUPattern,
             BinaryOpScalarReLUReplacement,
-            second_input_is_scalar)
+            [second_input_is_scalar])
         self.assertEqual(len(matches), 1)
         self.assertEqual(num_repalcement_node_found(traced), 1)
diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py
index 09e5550c5930d..72bb7fd373516 100644
--- a/torch/fx/subgraph_rewriter.py
+++ b/torch/fx/subgraph_rewriter.py
@@ -8,7 +8,7 @@
 from typing import Callable, Dict, List, NamedTuple, Optional, Set
 import torch
 
-__all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filter']
+__all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filters']
 
 @compatibility(is_backward_compatible=True)
 class Match(NamedTuple):
@@ -185,11 +185,11 @@ def forward(self, x, w1, w2):
 
 # Experimental API, not backward compatible
 @compatibility(is_backward_compatible=False)
-def replace_pattern_with_filter(
+def replace_pattern_with_filters(
     gm: GraphModule,
     pattern: Callable,
     replacement: Callable,
-    match_filter: Callable[["InternalMatch", Graph, Graph], bool],  # type: ignore[name-defined]
+    match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]],  # type: ignore[name-defined]
 ) -> List[Match]:
     """
     See replace_pattern for documentation. This function is an overload with an additional match_filter argument.
@@ -200,18 +200,21 @@ def replace_pattern_with_filter(
             definition of InternalMatch.
     """
 
-    return _replace_pattern(gm, pattern, replacement, match_filter)
+    return _replace_pattern(gm, pattern, replacement, match_filters)
 
 
 def _replace_pattern(
     gm: GraphModule,
     pattern: Callable,
     replacement: Callable,
-    match_filter: Optional[Callable[["InternalMatch", Graph, Graph], bool]] = None  # type: ignore[name-defined]
+    match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]] = None  # type: ignore[name-defined]
 ) -> List[Match]:
 
     from torch.fx.passes.utils.matcher_utils import SubgraphMatcher, InternalMatch
 
+    if match_filters is None:
+        match_filters = []
+
     # Get the graphs for `gm`, `pattern`, `replacement`
     original_graph: Graph = gm.graph
     pattern_graph: Graph = symbolic_trace(pattern).graph
@@ -222,8 +225,11 @@ def _replace_pattern(
     _matches: List[InternalMatch] = matcher.match(original_graph)
 
     # Filter out matches that don't match the filter
-    if match_filter:
-        _matches = [m for m in _matches if match_filter(m, original_graph, pattern_graph)]
+    _matches = [
+        m for m in _matches
+        if all(match_filter(m, original_graph, pattern_graph)
+               for match_filter in match_filters)
+    ]
 
     replacement_placeholders = [n for n in replacement_graph.nodes if n.op == "placeholder"]
 

From dc72485a4c9479e9e6963249d96f637c7a560bcf Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Wed, 26 Oct 2022 16:37:10 +0000
Subject: [PATCH 0219/1922] fix sym_storage conversion and some cleanup
 (#87718)

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87718
Approved by: https://github.com/ezyang
---
 test/test_proxy_tensor.py                | 10 +++++++++-
 torch/fx/experimental/proxy_tensor.py    | 12 +++---------
 torch/fx/experimental/symbolic_shapes.py |  4 ++--
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index fae55367ab192..1d5985a00da8c 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -857,7 +857,7 @@ def test_neg_shape(self):
         def f(a):
             return torch.empty(-a.shape[0] + 10)
 
-        r = str(make_fx(f, tracing_mode="symbolic")(torch.empty(1)).code).strip()
+        r = str(make_fx(f, tracing_mode="symbolic")(torch.empty(2)).code).strip()
         self.assertExpectedInline(r, """\
 def forward(self, a_1):
     sym_size = torch.ops.aten.sym_size(a_1, 0);  a_1 = None
@@ -984,6 +984,14 @@ def f(a, b):
         fx_g = make_fx(f, tracing_mode="symbolic")(torch.randn(16), torch.randn(8))
         self.assertExpectedInline(str(fx_g.shape_env.get_guard_expr()), "Eq(s1, 8) & Eq(s0, 2*s1)")
 
+    def test_sym_storage_offset(self):
+        def f(x, y):
+            return x + y
+
+        inp = (torch.randn(8)[3:], torch.randn(5))
+        fx_g = make_fx(f, tracing_mode="symbolic")(*inp)
+        inp = (torch.randn(8)[3:], torch.randn(5))
+        self.assertEqual(fx_g(*inp), f(*inp))
 
     def _assert_no_guards(self, fx_g, free_symbols):
         assert _get_free_symbols(fx_g.shape_env) == free_symbols, fx_g.shape_env.var_to_val
diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py
index 2bf6f0cca004a..86d1e19550928 100644
--- a/torch/fx/experimental/proxy_tensor.py
+++ b/torch/fx/experimental/proxy_tensor.py
@@ -628,7 +628,7 @@ def wrapped(*args):
 
         proxy_mode = ProxyTorchDispatchMode(fx_tracer)
 
-        def wrap_fake_concrete(x):
+        def wrap_fake(x):
             if isinstance(x, torch.Tensor):
                 return fake_tensor_mode.from_tensor(x)  # type: ignore[attr-defined]
 
@@ -636,16 +636,10 @@ def wrap_fake_concrete(x):
 
         sym_mode = proxy_mode.sym_mode
 
-        # todo: Figure out a more informative name for symints
-        def wrap_fake_symbolic(x):
-            if isinstance(x, torch.Tensor):
-                return fake_tensor_mode.from_tensor(x)
-            return x
-
         wrap_fn_map = {
             "real": lambda x: x,
-            "fake": wrap_fake_concrete,
-            "symbolic": wrap_fake_symbolic,
+            "fake": wrap_fake,
+            "symbolic": wrap_fake,
         }
         args = pytree.tree_map(wrap_fn_map[tracing_mode], args)
 
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index a7030abbcfc41..0a03e5819a90a 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -1,6 +1,6 @@
 import torch
 import torch.utils._pytree as pytree
-from typing import Set, Dict, List, Type, Optional, cast, Union
+from typing import Set, Dict, List, Type, Optional, cast
 import operator
 import builtins
 import math
@@ -389,7 +389,7 @@ def create_symbolic_sizes_strides(self, ex: torch.Tensor):
         assert all(x is not None for x in stride)
         return [self.create_symintnode(i) for i in size], [self.create_symintnode(i) for i in stride]  # type: ignore[arg-type]
 
-    def create_symintnode(self, expr: Union["sympy.Expr", int]):
+    def create_symintnode(self, expr: "sympy.Expr"):
         py_sym_int = PySymInt(expr, self)
         cpp_sym_int = torch.SymIntNode.new_symint(py_sym_int)  # type: ignore[attr-defined]
         return cpp_sym_int

From 2e97e4671820e43b454ef231ed3a7cd1fc7d1733 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 27 Oct 2022 04:23:43 +0000
Subject: [PATCH 0220/1922] [vision hash update] update the pinned vision hash
 (#87831)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87831
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index d4dee5af2936d..4ee9517b28d7a 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-edb3a8069a0b86231f14e84ac9f26fd7c7bffb5f
+add75968543f36818691f8b59880f5c04689a88e

From a2049dafc6ca000d814c6586bca568db7c90bab4 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 27 Oct 2022 00:03:14 +0000
Subject: [PATCH 0221/1922] [FSDP] ufmt /fsdp (#87811)

This applies `ufmt` to all of the FSDP files in the `torch/distributed/fsdp/` directory.

**Test Plan**
CI

**Notes**
For VSCode users,
- Install `ufmt`: https://pypi.org/project/ufmt/
- Install VSCode `ufmt` extension: https://marketplace.visualstudio.com/items?itemName=omnilib.ufmt
- Include in `settings.json`:
```
{
    "[python]": {
        "editor.defaultFormatter": "omnilib.ufmt",
        "editor.formatOnSave": true,
    },
}
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87811
Approved by: https://github.com/rohan-varma, https://github.com/fegin
---
 torch/distributed/fsdp/_fsdp_extensions.py    |   1 -
 torch/distributed/fsdp/_optim_utils.py        |  11 +-
 torch/distributed/fsdp/_shard_utils.py        |   6 +-
 torch/distributed/fsdp/_state_dict_utils.py   |  67 +--
 torch/distributed/fsdp/_symbolic_trace.py     |  15 +-
 torch/distributed/fsdp/_utils.py              |  13 +-
 torch/distributed/fsdp/flat_param.py          |   4 +-
 .../fsdp/fully_sharded_data_parallel.py       | 556 +++++++++++-------
 torch/distributed/fsdp/sharded_grad_scaler.py |  66 ++-
 torch/distributed/fsdp/wrap.py                |  35 +-
 10 files changed, 453 insertions(+), 321 deletions(-)

diff --git a/torch/distributed/fsdp/_fsdp_extensions.py b/torch/distributed/fsdp/_fsdp_extensions.py
index abe0d901f8ecc..1f087f44b5739 100644
--- a/torch/distributed/fsdp/_fsdp_extensions.py
+++ b/torch/distributed/fsdp/_fsdp_extensions.py
@@ -5,7 +5,6 @@
 import torch.distributed as dist
 from torch.distributed._shard.sharded_tensor.api import ShardedTensor
 from torch.distributed._shard.sharded_tensor.shard import Shard
-
 from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor
 
 
diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index a5e1ab64278e5..f87f871042217 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -3,6 +3,7 @@
 import functools
 from typing import (
     Any,
+    cast,
     Dict,
     Iterable,
     Iterator,
@@ -12,18 +13,18 @@
     Sequence,
     Tuple,
     Union,
-    cast,
 )
 
 import torch
 import torch.distributed as dist
+
 # Import the entire FSDP file to avoid circular imports
 import torch.distributed.fsdp.fully_sharded_data_parallel as FSDP
 import torch.nn as nn
 from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed.fsdp._fsdp_extensions import _ext_chunk_tensor
 from torch.distributed.fsdp._shard_utils import _gather_state_dict
 from torch.distributed.fsdp.flat_param import FlatParameter, FlatParamHandle
-from torch.distributed.fsdp._fsdp_extensions import _ext_chunk_tensor
 
 
 def sorted_items(dictionary: Dict[str, Any]) -> Iterator[Tuple[str, Any]]:
@@ -298,9 +299,9 @@ def _flatten_optim_state_dict(
     unflat_osd_state = unflat_osd["state"]
     for param, unflat_param_names in param_to_unflat_param_names.items():
         if isinstance(param, FlatParameter):  # flatten FSDP parameters' states
-            assert param in flat_param_to_fsdp_module, (
-                f"Check the `flat_param_to_fsdp_module` construction\nparam: {param}"
-            )
+            assert (
+                param in flat_param_to_fsdp_module
+            ), f"Check the `flat_param_to_fsdp_module` construction\nparam: {param}"
             fsdp_module = flat_param_to_fsdp_module[param]
             flat_state = _flatten_optim_state(
                 unflat_osd_state,
diff --git a/torch/distributed/fsdp/_shard_utils.py b/torch/distributed/fsdp/_shard_utils.py
index b0382b41c6d20..0cc9dd656f16b 100644
--- a/torch/distributed/fsdp/_shard_utils.py
+++ b/torch/distributed/fsdp/_shard_utils.py
@@ -250,10 +250,8 @@ def _create_chunk_sharded_tensor(
             requires_grad=False,
             memory_format=torch.contiguous_format,
             pin_memory=tensor.is_pinned(),
-        )
+        ),
     )
     return ShardedTensor._init_from_local_shards_and_global_metadata(
-        local_shards,
-        sharded_tensor_metadata=sharded_tensor_metadata,
-        process_group=pg
+        local_shards, sharded_tensor_metadata=sharded_tensor_metadata, process_group=pg
     )
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index ed4b8f226c123..90083ef85b18e 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -6,25 +6,24 @@
 import torch
 import torch.distributed as dist
 import torch.distributed.algorithms._checkpoint.checkpoint_wrapper as checkpoint_wrapper
+
 # Import the entire FSDP file to avoid circular imports
 import torch.distributed.fsdp.fully_sharded_data_parallel as FSDP
 import torch.nn as nn
 import torch.nn.functional as F
-
 from torch.distributed._shard.sharded_tensor import (
+    init_from_local_shards,
     Shard,
     ShardedTensor,
-    init_from_local_shards,
-)
-from torch.distributed.utils import (
-    _replace_by_prefix,
 )
+from torch.distributed.utils import _replace_by_prefix
 
-from ._fsdp_extensions import _ext_chunk_tensor, _ext_pre_load_state_dict_transform
-from ._fsdp_extensions import _extensions as _user_extensions
-from .flat_param import (
-    FlatParamHandle,
+from ._fsdp_extensions import (
+    _ext_chunk_tensor,
+    _ext_pre_load_state_dict_transform,
+    _extensions as _user_extensions,
 )
+from .flat_param import FlatParamHandle
 
 
 def _full_post_state_dict_hook(
@@ -53,16 +52,12 @@ def _full_post_state_dict_hook(
     # exiting `summon_full_params()` via the parameter shape. However, for
     # `NO_SHARD`, we cannot tell from the shape, so we do not return early.
     if (
-        (
-            not module._use_orig_params
-            and FSDP.FLAT_PARAM in module.module._parameters
-        )
-        or (
-            module._use_orig_params
-            and module._handles
-            and module._handles[0].uses_sharded_strategy
-            and module._handles[0].is_sharded(module._handles[0].flat_param)
-        )
+        not module._use_orig_params and FSDP.FLAT_PARAM in module.module._parameters
+    ) or (
+        module._use_orig_params
+        and module._handles
+        and module._handles[0].uses_sharded_strategy
+        and module._handles[0].is_sharded(module._handles[0].flat_param)
     ):
         return state_dict
 
@@ -79,7 +74,7 @@ def _full_post_state_dict_hook(
         # do not have prefix considered as they are not computed in `state_dict`
         # call.
         if clean_key.startswith(clean_prefix):
-            clean_key = clean_key[len(clean_prefix):]
+            clean_key = clean_key[len(clean_prefix) :]
 
         # Clone non-ignored parameters before exiting the
         # `_summon_full_params()` context
@@ -88,8 +83,9 @@ def _full_post_state_dict_hook(
             f"only has {state_dict.keys()}. prefix={prefix}, "
             f"module_name={module_name} param_name={param_name} rank={module.rank}."
         )
-        if clean_key not in module._ignored_param_names and \
-                not getattr(state_dict[fqn], "_has_been_cloned", False):
+        if clean_key not in module._ignored_param_names and not getattr(
+            state_dict[fqn], "_has_been_cloned", False
+        ):
             try:
                 state_dict[fqn] = state_dict[fqn].clone().detach()
                 state_dict[fqn]._has_been_cloned = True  # type: ignore[attr-defined]
@@ -129,11 +125,9 @@ def _full_pre_load_state_dict_hook(
 ) -> None:
     # We do not expect to be calling pre-hooks twice without post-hook
     # call in between.
-    assert getattr(module, '_full_param_ctx', None) is None
+    assert getattr(module, "_full_param_ctx", None) is None
     # Note that it needs writeback=True to persist.
-    module._full_param_ctx = module._summon_full_params(
-        recurse=False, writeback=True
-    )
+    module._full_param_ctx = module._summon_full_params(recurse=False, writeback=True)
     module._full_param_ctx.__enter__()
     _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP.FSDP_PREFIX}")
 
@@ -141,7 +135,7 @@ def _full_pre_load_state_dict_hook(
 def _full_post_load_state_dict_hook(module, *args, **kwargs) -> None:
     # We should exit summon_full_params context.
     module._assert_state([FSDP.TrainingState_.SUMMON_FULL_PARAMS])
-    assert getattr(module, '_full_param_ctx', None) is not None
+    assert getattr(module, "_full_param_ctx", None) is not None
     module._full_param_ctx.__exit__(None, None, None)
     module._full_param_ctx = None
 
@@ -189,7 +183,9 @@ def _local_post_load_state_dict_hook(module, *args, **kwargs) -> None:
 
 
 def _local_pre_load_state_dict_hook(
-    module, state_dict: Dict[str, Any], prefix: str,
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
 ) -> None:
     """
     This hook finds the local flat_param for this FSDP module from the
@@ -253,7 +249,7 @@ def _sharded_post_state_dict_hook(
                 rank=module.rank,
                 world_size=module.world_size,
                 num_devices_per_node=torch.cuda.device_count(),
-                pg=module.process_group
+                pg=module.process_group,
             )
             if module._state_dict_config.offload_to_cpu:
                 sharded_tensor = sharded_tensor.cpu()
@@ -271,7 +267,9 @@ def _sharded_post_load_state_dict_hook(module, *args, **kwargs) -> None:
 
 
 def _sharded_pre_load_state_dict_hook(
-    module, state_dict: Dict[str, Any], prefix: str,
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
 ) -> None:
     """
     The hook combines the unflattened, sharded parameters (ShardedTensor) to
@@ -331,7 +329,9 @@ def _sharded_pre_load_state_dict_hook(
 
     # Get the chunk from the loaded flat_param for the local rank.
     loaded_flat_tensor, num_to_pad = FlatParamHandle._get_shard(
-        loaded_flat_param, module.rank, module.world_size,
+        loaded_flat_param,
+        module.rank,
+        module.world_size,
     )
     loaded_flat_tensor.to(flat_param.device)
     assert all(s1 == s2 for s1, s2 in zip(loaded_shapes, flat_param._shapes)), (
@@ -377,10 +377,7 @@ def _post_state_dict_hook(
     # back to their mixed precision type. This is because buffers are cast
     # during lazy_init() and stay at their mixed precision type before/after
     # forward/backward. As a result state_dict() should maintain this.
-    if (
-        fsdp_module._is_root
-        and fsdp_module._mixed_precision_enabled_for_buffers()
-    ):
+    if fsdp_module._is_root and fsdp_module._mixed_precision_enabled_for_buffers():
         fsdp_module._cast_buffers(recurse=True)
     return processed_state_dict
 
diff --git a/torch/distributed/fsdp/_symbolic_trace.py b/torch/distributed/fsdp/_symbolic_trace.py
index 026595fd7def0..f6fe5e432252e 100644
--- a/torch/distributed/fsdp/_symbolic_trace.py
+++ b/torch/distributed/fsdp/_symbolic_trace.py
@@ -5,7 +5,6 @@
 
 import torch
 
-
 __all__ = ["TracingConfig"]
 
 
@@ -140,13 +139,18 @@ def _patched_create_proxy(
         if args is not None:
             named_params: List[Tuple[str, torch.nn.Parameter]] = []
             for arg in args:
-                if isinstance(arg, torch.fx.Proxy) and arg.node.target in prefixed_param_name_to_param:
+                if (
+                    isinstance(arg, torch.fx.Proxy)
+                    and arg.node.target in prefixed_param_name_to_param
+                ):
                     param = prefixed_param_name_to_param[arg.node.target]
                     named_params.append((arg.node.target, param))
                     if param not in set(execution_info.param_exec_order):
                         execution_info.param_exec_order.append(param)
             if named_params:
-                execution_info.module_to_execution_infos[module].append((module, named_params))
+                execution_info.module_to_execution_infos[module].append(
+                    (module, named_params)
+                )
     elif kind == "call_module":
         named_params = list(module.named_parameters())
         if named_params:
@@ -234,7 +238,10 @@ def _patch_tracer(
     )
     prefixed_param_name_to_param = dict(root_module.named_parameters())
     tracer.create_proxy = functools.partial(
-        _patched_create_proxy, original_create_proxy, execution_info, prefixed_param_name_to_param
+        _patched_create_proxy,
+        original_create_proxy,
+        execution_info,
+        prefixed_param_name_to_param,
     )
     try:
         yield
diff --git a/torch/distributed/fsdp/_utils.py b/torch/distributed/fsdp/_utils.py
index bd37ce5695984..eb72042b65573 100644
--- a/torch/distributed/fsdp/_utils.py
+++ b/torch/distributed/fsdp/_utils.py
@@ -10,14 +10,11 @@
 )
 from torch.nn.utils.rnn import PackedSequence
 
-
 FSDP_FLATTENED = "_fsdp_flattened"
 
 
 def _contains_batchnorm(module):
-    return any(
-        isinstance(mod, _BatchNorm) for mod in module.modules()
-    )
+    return any(isinstance(mod, _BatchNorm) for mod in module.modules())
 
 
 def _override_batchnorm_mixed_precision(module):
@@ -27,11 +24,14 @@ def _override_batchnorm_mixed_precision(module):
 
 
 def _apply_to_tensors(
-    fn: Callable, container: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence]
+    fn: Callable,
+    container: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence],
 ) -> Any:
     """Recursively apply to all tensor in different kinds of container types."""
 
-    def apply(x: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence]) -> Any:
+    def apply(
+        x: Union[torch.Tensor, Dict, List, Tuple, Set, OrderedDict, PackedSequence]
+    ) -> Any:
         if torch.is_tensor(x):
             return fn(x)
         elif hasattr(x, "__dataclass_fields__"):
@@ -75,6 +75,7 @@ def _apply_to_modules(
     module prefix name (e.g. "module.submodule." just like in model state dict)
     and makes that available to ``module_fn``.
     """
+
     def f(module: torch.nn.Module, prefix: str, *args, **kwargs):
         # Call the module function before recursing over children (pre-order)
         module_fn(module, prefix, *args, **kwargs)
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 266dc80b4ed42..3e4eca07df7fa 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -33,7 +33,6 @@
     p_assert,
 )
 
-
 __all__ = [
     "FlatParameter",
     "FlatParamHandle",
@@ -1507,7 +1506,8 @@ def _writeback_orig_params(self) -> bool:
                 # memory and owns the gradient storage, so it will never
                 # require gradient writeback.
                 flat_param_grad = (
-                    flat_param.grad if self.uses_sharded_strategy or not self._config.offload_params
+                    flat_param.grad
+                    if self.uses_sharded_strategy or not self._config.offload_params
                     else flat_param._cpu_grad  # type: ignore[attr-defined]
                 )
                 needs_grad_writeback = flat_param_grad is None or not _same_storage(
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 5fb2e5cdf0f6b..8cd18474d959d 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -8,10 +8,11 @@
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass
-from enum import Enum, auto
+from enum import auto, Enum
 from typing import (
     Any,
     Callable,
+    cast,
     Deque,
     Dict,
     Generator,
@@ -22,7 +23,6 @@
     Set,
     Tuple,
     Union,
-    cast,
 )
 
 import torch
@@ -35,15 +35,10 @@
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     _CHECKPOINT_PREFIX,
 )
-from torch.distributed.algorithms._comm_hooks import (
-    LOW_PRECISION_HOOKS,
-    default_hooks,
-)
+from torch.distributed.algorithms._comm_hooks import default_hooks, LOW_PRECISION_HOOKS
 from torch.distributed.distributed_c10d import _get_default_group
-from torch.distributed.utils import (
-    _sync_params_and_buffers,
-    _to_kwargs,
-)
+from torch.distributed.utils import _sync_params_and_buffers, _to_kwargs
+
 from ._optim_utils import (
     _broadcast_pos_dim_tensor_states,
     _broadcast_processed_optim_state_dict,
@@ -57,9 +52,9 @@
     _rekey_sharded_optim_state_dict,
 )
 from ._state_dict_utils import (
+    _post_load_state_dict_hook,
     _post_state_dict_hook,
     _pre_load_state_dict_hook,
-    _post_load_state_dict_hook,
 )
 from ._utils import (
     _apply_to_modules,
@@ -78,10 +73,10 @@
     HandleTrainingState,
 )
 from .wrap import (
-    ParamExecOrderWrapPolicy,
     _or_policy,
     _recursive_wrap,
     _wrap_batchnorm_individually,
+    ParamExecOrderWrapPolicy,
 )
 
 _TORCHDISTX_AVAIL = True
@@ -94,18 +89,23 @@
 if not hasattr(torch, "fx"):
     _TORCH_FX_AVAIL = False
 if _TORCH_FX_AVAIL:
-    from ._symbolic_trace import (
-        TracingConfig,
-        _init_execution_info,
-        _patch_tracer,
-    )
+    from ._symbolic_trace import _init_execution_info, _patch_tracer, TracingConfig
 
 
 __all__ = [
-    "FullyShardedDataParallel", "ShardingStrategy", "MixedPrecision",
-    "CPUOffload", "BackwardPrefetch", "StateDictType", "StateDictConfig",
-    "FullStateDictConfig", "LocalStateDictConfig", "ShardedStateDictConfig",
-    "OptimStateKeyType", "TrainingState_", "clean_tensor_name",
+    "FullyShardedDataParallel",
+    "ShardingStrategy",
+    "MixedPrecision",
+    "CPUOffload",
+    "BackwardPrefetch",
+    "StateDictType",
+    "StateDictConfig",
+    "FullStateDictConfig",
+    "LocalStateDictConfig",
+    "ShardedStateDictConfig",
+    "OptimStateKeyType",
+    "TrainingState_",
+    "clean_tensor_name",
 ]
 
 
@@ -148,6 +148,7 @@ class ShardingStrategy(Enum):
                                   ``NO_SHARD`` inter-node.
 
     """
+
     FULL_SHARD = auto()
     SHARD_GRAD_OP = auto()
     NO_SHARD = auto()
@@ -197,6 +198,7 @@ class MixedPrecision:
         would occur in the `param_dtype` precision, if given, otherwise, in the
         original parameter precision.
     """
+
     # maintain a tensor of this dtype that the fp32 param shard will be cast to.
     # Will control the precision of model params, inputs, and thus compute as
     # well.
@@ -309,6 +311,7 @@ class StateDictConfig:
     order to configure settings for the particular type of ``state_dict``
     implementation FSDP will use.
     """
+
     offload_to_cpu: bool = False
 
 
@@ -340,6 +343,7 @@ class FullStateDictConfig(StateDictConfig):
         >>> fsdp = FSDP(model, device_id=torch.cuda.current_device(), auto_wrap_policy=..., sync_module_states=True)
         >>> # After this point, all ranks have FSDP model with loaded checkpoint.
     """
+
     rank0_only: bool = False
 
 
@@ -366,9 +370,10 @@ class OptimStateKeyType(Enum):
 
 class _ExecOrderWarnStatus(Enum):
     """Used internally for execution order validation."""
-    NONE = auto()     # no deviation yet
+
+    NONE = auto()  # no deviation yet
     WARNING = auto()  # deviated this iteration; currently issuing warnings
-    WARNED = auto()   # deviated in a previous iteration
+    WARNED = auto()  # deviated in a previous iteration
 
 
 class _ExecOrderData:
@@ -403,9 +408,10 @@ def __init__(
         self._forward_prefetch_limit = forward_prefetch_limit
 
         # Data structures for execution order validation
-        self._checking_order: bool = (
-            debug_level in [dist.DebugLevel.INFO, dist.DebugLevel.DETAIL]
-        )
+        self._checking_order: bool = debug_level in [
+            dist.DebugLevel.INFO,
+            dist.DebugLevel.DETAIL,
+        ]
         self.process_group: Optional[dist.ProcessGroup] = None
         self.world_size: Optional[int] = None
         self.all_handles: List[FlatParamHandle] = []
@@ -454,7 +460,9 @@ def get_handles_to_backward_prefetch(
         prefetch given the current handles key. If there are no valid handles
         keys to prefetch, then this returns an empty :class:`list`.
         """
-        current_index = self.handles_to_post_forward_order_index.get(current_handles_key, None)
+        current_index = self.handles_to_post_forward_order_index.get(
+            current_handles_key, None
+        )
         if current_index is None:
             return None
         target_index = current_index - 1
@@ -462,9 +470,7 @@ def get_handles_to_backward_prefetch(
         for _ in range(self._backward_prefetch_limit):
             if target_index < 0:
                 break
-            target_handles_keys.append(
-                self.handles_post_forward_order[target_index]
-            )
+            target_handles_keys.append(self.handles_post_forward_order[target_index])
             target_index -= 1
         return target_handles_keys
 
@@ -477,7 +483,9 @@ def get_handles_to_forward_prefetch(
         prefetch given the current handles key. If there are no valid handles
         keys to prefetch, then this returns an empty :class:`list`.
         """
-        current_index = self.handles_to_pre_forward_order_index.get(current_handles_key, None)
+        current_index = self.handles_to_pre_forward_order_index.get(
+            current_handles_key, None
+        )
         if current_index is None:
             return None
         target_index = current_index + 1
@@ -485,9 +493,7 @@ def get_handles_to_forward_prefetch(
         for _ in range(self._forward_prefetch_limit):
             if target_index >= len(self.handles_pre_forward_order):
                 break
-            target_handles_keys.append(
-                self.handles_pre_forward_order[target_index]
-            )
+            target_handles_keys.append(self.handles_pre_forward_order[target_index])
             target_index += 1
         return target_handles_keys
 
@@ -511,7 +517,9 @@ def record_post_forward(self, handles: List[FlatParamHandle]) -> None:
         self.handles_to_post_forward_order_index[handles_key] = index
         self.handles_post_forward_order.append(handles_key)
 
-    def record_pre_forward(self, handles: List[FlatParamHandle], is_training: bool) -> None:
+    def record_pre_forward(
+        self, handles: List[FlatParamHandle], is_training: bool
+    ) -> None:
         """
         Records ``handles`` in the pre-forward order, where ``handles`` should
         be a group of handles used in the same module's forward. If ``handles``
@@ -597,7 +605,7 @@ def _check_order(self, handles_key: _HandlesKey, is_training: bool) -> None:
                     (
                         rank,
                         world_indices[
-                            rank * num_valid_indices: (rank + 1) * num_valid_indices
+                            rank * num_valid_indices : (rank + 1) * num_valid_indices
                         ],
                     )
                     for rank in range(self.world_size)
@@ -683,7 +691,9 @@ def _get_names_from_handle_indices(
                 continue
             handle = self.all_handles[index]
             flat_param = handle.flat_param
-            prefixed_param_names.append(self.flat_param_to_prefixed_param_names[flat_param])
+            prefixed_param_names.append(
+                self.flat_param_to_prefixed_param_names[flat_param]
+            )
         return prefixed_param_names
 
     def _get_names_from_handles(
@@ -700,7 +710,9 @@ def _get_names_from_handles(
             flat_param = handle.flat_param
             if flat_param not in self.flat_param_to_prefixed_param_names:
                 continue
-            prefixed_param_names.append(self.flat_param_to_prefixed_param_names[flat_param])
+            prefixed_param_names.append(
+                self.flat_param_to_prefixed_param_names[flat_param]
+            )
         return prefixed_param_names
 
     def next_iter(self):
@@ -970,6 +982,7 @@ class FullyShardedDataParallel(nn.Module):
             the sharded strategies that schedule all-gathers. Enabling this can
             help lower the number of CUDA malloc retries.
     """
+
     def __init__(
         self,
         module: nn.Module,
@@ -1062,10 +1075,16 @@ def __init__(
         self._buffer_name_to_orig_dtype: Dict[str, torch.dtype] = {}
 
         self._check_single_device_module(module, ignored_params)
-        device_from_device_id: Optional[torch.device] = self._get_device_from_device_id(device_id)
-        self._materialize_module(module, param_init_fn, ignored_params, device_from_device_id)
+        device_from_device_id: Optional[torch.device] = self._get_device_from_device_id(
+            device_id
+        )
+        self._materialize_module(
+            module, param_init_fn, ignored_params, device_from_device_id
+        )
         self._move_module_to_device(module, ignored_params, device_from_device_id)
-        self.compute_device = self._get_compute_device(module, ignored_params, device_from_device_id)
+        self.compute_device = self._get_compute_device(
+            module, ignored_params, device_from_device_id
+        )
         params_to_flatten = list(self._get_orig_params(module, ignored_params))
         if sync_module_states:
             self._sync_module_states(module, params_to_flatten)
@@ -1098,7 +1117,10 @@ def __init__(
             self.params.append(handle.flat_param)
             self._register_param_handle(handle)
             handle.shard()
-            if self.cpu_offload.offload_params and handle.flat_param.device != torch.device("cpu"):
+            if (
+                self.cpu_offload.offload_params
+                and handle.flat_param.device != torch.device("cpu")
+            ):
                 handle.flat_param_to(torch.device("cpu"))
         if not use_orig_params:
             self._check_orig_params_flattened(ignored_params)
@@ -1301,8 +1323,7 @@ def _get_device_from_device_id(
         self,
         device_id: Optional[Union[int, torch.device]],
     ) -> Optional[torch.device]:
-        """
-        """
+        """ """
         if device_id is None:
             return None
         device = (
@@ -1341,11 +1362,15 @@ def _materialize_module(
         ``reset_parameters()``, and for torchdistX fake tensors, this calls
         ``deferred_init.materialize_module()``.
         """
-        is_meta_module = any(p.is_meta for p in self._get_orig_params(module, ignored_params))
+        is_meta_module = any(
+            p.is_meta for p in self._get_orig_params(module, ignored_params)
+        )
         is_torchdistX_deferred_init = (
             not is_meta_module
             and _TORCHDISTX_AVAIL
-            and any(fake.is_fake(p) for p in self._get_orig_params(module, ignored_params))
+            and any(
+                fake.is_fake(p) for p in self._get_orig_params(module, ignored_params)
+            )
         )
         if (
             is_meta_module or is_torchdistX_deferred_init
@@ -1357,7 +1382,9 @@ def _materialize_module(
             param_init_fn(module)
         elif is_meta_module:
             # Run default meta device initialization
-            materialization_device = device_from_device_id or torch.cuda.current_device()
+            materialization_device = (
+                device_from_device_id or torch.cuda.current_device()
+            )
             module.to_empty(device=materialization_device)
             try:
                 with torch.no_grad():
@@ -1483,7 +1510,10 @@ def _sync_module_states(
                 module_states.append(buffer.detach())
         module_states.extend(param.detach() for param in params)
         _sync_params_and_buffers(
-            self.process_group, module_states, _PARAM_BROADCAST_BUCKET_SIZE, src=0,
+            self.process_group,
+            module_states,
+            _PARAM_BROADCAST_BUCKET_SIZE,
+            src=0,
         )
 
     def _get_orig_params(
@@ -1573,7 +1603,7 @@ def _reshard(
         p_assert(
             len(handles) == len(free_unsharded_flat_params),
             "Expects both lists to have equal length but got "
-            f"{len(handles)} and {len(free_unsharded_flat_params)}"
+            f"{len(handles)} and {len(free_unsharded_flat_params)}",
         )
         for handle, free_unsharded_flat_param in zip(
             handles,
@@ -1651,9 +1681,10 @@ def fsdp_modules(
             the input ``module``.
         """
         return [
-            submodule for submodule in module.modules()
-            if isinstance(submodule, FullyShardedDataParallel) and
-            (not root_only or submodule.check_is_root())
+            submodule
+            for submodule in module.modules()
+            if isinstance(submodule, FullyShardedDataParallel)
+            and (not root_only or submodule.check_is_root())
         ]
 
     def apply(self, fn: Callable[[nn.Module], None]) -> "FullyShardedDataParallel":
@@ -1728,6 +1759,7 @@ def _cast_fp_inputs_to_dtype(
         precision given by ``dtype``, while respecting the existing
         ``requires_grad`` on the tensors.
         """
+
         def cast_fn(x: torch.Tensor) -> torch.Tensor:
             if not torch.is_floating_point(x):
                 return x
@@ -1741,7 +1773,7 @@ def cast_fn(x: torch.Tensor) -> torch.Tensor:
         with torch.no_grad():
             return (
                 _apply_to_tensors(cast_fn, args),
-                _apply_to_tensors(cast_fn, kwargs)
+                _apply_to_tensors(cast_fn, kwargs),
             )
 
     def _cast_buffers(
@@ -1775,9 +1807,15 @@ def _cast_buffers(
         if memo is None:
             memo = set()
         for module in self.modules():
-            if module is not self and isinstance(module, FullyShardedDataParallel) and recurse:
+            if (
+                module is not self
+                and isinstance(module, FullyShardedDataParallel)
+                and recurse
+            ):
                 # Allow any child FSDP instances to handle their own buffers.
-                module._cast_buffers(device=device, dtype=dtype, memo=memo, recurse=recurse)
+                module._cast_buffers(
+                    device=device, dtype=dtype, memo=memo, recurse=recurse
+                )
             elif module not in memo:
                 memo.add(module)
                 for name, buf in module.named_buffers(recurse=False):
@@ -1863,7 +1901,9 @@ def _lazy_init(self) -> None:
                     fsdp_module.limit_all_gathers = self.limit_all_gathers
                 fsdp_module._free_event_queue = self._free_event_queue
                 fsdp_module._handles_prefetched = self._handles_prefetched
-                fsdp_module._needs_pre_backward_unshard = self._needs_pre_backward_unshard
+                fsdp_module._needs_pre_backward_unshard = (
+                    self._needs_pre_backward_unshard
+                )
                 for handle in fsdp_module._handles:
                     fsdp_module._init_param_attributes(handle)
         if inconsistent_limit_all_gathers:
@@ -1936,13 +1976,11 @@ def _init_param_attributes(self, handle: FlatParamHandle) -> None:
         # fwd/bwd, it is freed and we only hold on to the full precision shard.
         # As a result, this reduced precision shard is not allocated if we are
         # not in the forward/backward pass.
-        if (
-            self._mixed_precision_enabled_for_params()
-        ):
+        if self._mixed_precision_enabled_for_params():
             p._mp_shard = torch.zeros_like(
                 p._local_shard,
                 device=self.compute_device,
-                dtype=self.mixed_precision.param_dtype
+                dtype=self.mixed_precision.param_dtype,
             )
             _free_storage(p._mp_shard)
 
@@ -1957,7 +1995,8 @@ def _init_param_attributes(self, handle: FlatParamHandle) -> None:
             # into full_param_padded it can occur without issues and result in
             # full_param_padded having the expected param_dtype.
             full_param_dtype = (
-                p.dtype if not self._mixed_precision_enabled_for_params()
+                p.dtype
+                if not self._mixed_precision_enabled_for_params()
                 else self.mixed_precision.param_dtype
             )
             p._full_param_padded = torch.zeros(  # type: ignore[attr-defined]
@@ -2024,7 +2063,9 @@ def _prefetch_handles(
         for handles_key in handles_to_prefetch:
             # Prefetch the next set of handles without synchronizing to allow
             # the sync to happen as late as possible to maximize overlap
-            self._unshard(handles_key, self._streams["unshard"], self._streams["pre_unshard"])
+            self._unshard(
+                handles_key, self._streams["unshard"], self._streams["pre_unshard"]
+            )
             self._handles_prefetched[handles_key] = True
 
     def _get_handles_to_prefetch(
@@ -2048,33 +2089,31 @@ def _get_handles_to_prefetch(
         p_assert(
             training_state in valid_training_states,
             f"Prefetching is only supported in {valid_training_states} but "
-            f"currently in {training_state}"
+            f"currently in {training_state}",
         )
         eod = self._exec_order_data
         target_handles_keys: List[_HandlesKey] = []
         if (
-            (
-                training_state == HandleTrainingState.BACKWARD_PRE
-                and self.backward_prefetch == BackwardPrefetch.BACKWARD_PRE
-            )
-            or (
-                training_state == HandleTrainingState.BACKWARD_POST
-                and self.backward_prefetch == BackwardPrefetch.BACKWARD_POST
-            )
+            training_state == HandleTrainingState.BACKWARD_PRE
+            and self.backward_prefetch == BackwardPrefetch.BACKWARD_PRE
+        ) or (
+            training_state == HandleTrainingState.BACKWARD_POST
+            and self.backward_prefetch == BackwardPrefetch.BACKWARD_POST
         ):
             target_handles_keys = [
-                target_handles_key for target_handles_key in
-                eod.get_handles_to_backward_prefetch(current_handles_key)
+                target_handles_key
+                for target_handles_key in eod.get_handles_to_backward_prefetch(
+                    current_handles_key
+                )
                 if self._needs_pre_backward_unshard.get(target_handles_key, False)
                 and not self._handles_prefetched.get(target_handles_key, False)
             ]
-        elif (
-            training_state == HandleTrainingState.FORWARD
-            and self.forward_prefetch
-        ):
+        elif training_state == HandleTrainingState.FORWARD and self.forward_prefetch:
             target_handles_keys = [
-                target_handles_key for target_handles_key in
-                eod.get_handles_to_forward_prefetch(current_handles_key)
+                target_handles_key
+                for target_handles_key in eod.get_handles_to_forward_prefetch(
+                    current_handles_key
+                )
                 if self._needs_pre_forward_unshard.get(target_handles_key, False)
                 and not self._handles_prefetched.get(target_handles_key, False)
             ]
@@ -2089,7 +2128,7 @@ def _get_training_state(
         training_states = set(handle._training_state for handle in handles_key)
         p_assert(
             len(training_states) == 1,
-            f"Expects uniform training state but got {training_states}"
+            f"Expects uniform training state but got {training_states}",
         )
         return next(iter(training_states))
 
@@ -2157,7 +2196,9 @@ def set_state_dict_type(
                     "All FSDP modules should have the same type of state_dict_config."
                 )
 
-            expected_state_dict_config_type = _state_dict_type_to_config[state_dict_type]
+            expected_state_dict_config_type = _state_dict_type_to_config[
+                state_dict_type
+            ]
             if expected_state_dict_config_type != type(state_dict_config):
                 raise RuntimeError(
                     f"Expected state_dict_config of type {expected_state_dict_config_type} "
@@ -2200,10 +2241,11 @@ def state_dict_type(
         prev_state_dict_type = None
         prev_state_dict_config = None
         try:
-            prev_state_dict_type, prev_state_dict_config = (
-                FullyShardedDataParallel.set_state_dict_type(
-                    module, state_dict_type, state_dict_config
-                )
+            (
+                prev_state_dict_type,
+                prev_state_dict_config,
+            ) = FullyShardedDataParallel.set_state_dict_type(
+                module, state_dict_type, state_dict_config
             )
             yield
         except Exception as e:
@@ -2233,18 +2275,14 @@ def _convert_to_wrapped_module_name(self, module_name: str) -> str:
     def _param_fqns(self) -> Iterator[Tuple[str, str, str]]:
         if not self._has_params:
             return
-        for param_name, module_name in (
-            self._handles[0].parameter_module_names()
-        ):
+        for param_name, module_name in self._handles[0].parameter_module_names():
             module_name = self._convert_to_wrapped_module_name(module_name)
             fqn = f"{module_name}{param_name}"
             yield fqn, param_name, module_name
 
     @property
     def _shared_param_fqns(self) -> Iterator[Tuple[str, str, str]]:
-        for param_name, module_name in (
-            self._handles[0].shared_parameter_module_names()
-        ):
+        for param_name, module_name in self._handles[0].shared_parameter_module_names():
             module_name = self._convert_to_wrapped_module_name(module_name)
             fqn = f"{module_name}{param_name}"
             yield fqn, param_name, module_name
@@ -2297,17 +2335,21 @@ def state_dict(self, *args, **kwargs):
         if self._state_dict_type == StateDictType.FULL_STATE_DICT:
             # Get config args
             full_state_dict_config = (
-                self._state_dict_config if self._state_dict_config is not None
+                self._state_dict_config
+                if self._state_dict_config is not None
                 else FullStateDictConfig()
             )
             rank0_only = full_state_dict_config.rank0_only
             offload_to_cpu = full_state_dict_config.offload_to_cpu
             summon_ctx = (
                 self._summon_full_params(
-                    recurse=False, writeback=False, offload_to_cpu=offload_to_cpu, rank0_only=rank0_only
+                    recurse=False,
+                    writeback=False,
+                    offload_to_cpu=offload_to_cpu,
+                    rank0_only=rank0_only,
                 )
-                if self.training_state != TrainingState_.SUMMON_FULL_PARAMS else
-                contextlib.suppress()
+                if self.training_state != TrainingState_.SUMMON_FULL_PARAMS
+                else contextlib.suppress()
             )
             with summon_ctx:
                 # Since buffers are not sharded and stay casted, restore them to their
@@ -2316,10 +2358,7 @@ def state_dict(self, *args, **kwargs):
                 # buffers stay casted after forward/backward. We must have the
                 # call here instead of above because _summon_full_params itself
                 # calls _lazy_init() which would cast the buffers.
-                if (
-                    self._is_root
-                    and self._mixed_precision_enabled_for_buffers()
-                ):
+                if self._is_root and self._mixed_precision_enabled_for_buffers():
                     self._cast_buffers(
                         dtype=self._buffer_name_to_orig_dtype, recurse=False
                     )
@@ -2332,13 +2371,10 @@ def state_dict(self, *args, **kwargs):
                 return {}
 
         elif (
-            self._state_dict_type == StateDictType.LOCAL_STATE_DICT or
-            self._state_dict_type == StateDictType.SHARDED_STATE_DICT
+            self._state_dict_type == StateDictType.LOCAL_STATE_DICT
+            or self._state_dict_type == StateDictType.SHARDED_STATE_DICT
         ):
-            if (
-                self._has_params and
-                not self._handles[0].uses_sharded_strategy
-            ):
+            if self._has_params and not self._handles[0].uses_sharded_strategy:
                 raise RuntimeError(
                     "sharded_state_dict/local_state_dict can only be called "
                     "when parameters are flatten and sharded."
@@ -2352,17 +2388,22 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
         Runs the forward pass for the wrapped module, inserting FSDP-specific
         pre- and post-forward sharding logic.
         """
-        with torch.autograd.profiler.record_function("FullyShardedDataParallel.forward"):
+        with torch.autograd.profiler.record_function(
+            "FullyShardedDataParallel.forward"
+        ):
             self._lazy_init()
             args, kwargs = self._fsdp_root_pre_forward(*args, **kwargs)
             unused = None
-            unshard_fn = functools.partial(self._pre_forward_unshard, handles=self._handles)
+            unshard_fn = functools.partial(
+                self._pre_forward_unshard, handles=self._handles
+            )
             # Do not free the root's parameters in the post-forward for
             # `FULL_SHARD` with the intention that they are immediately used
             # for backward computation (though this may not be true)
             free_unsharded_flat_params = [
                 not self._is_root
-                and handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD
+                and handle._config.sharding_strategy
+                == HandleShardingStrategy.FULL_SHARD
                 for handle in self._handles
             ]
             reshard_fn = functools.partial(
@@ -2375,7 +2416,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
                 p_assert(
                     handle.flat_param.device == self.compute_device,
                     "Expected `FlatParameter` to be on the compute device "
-                    f"{self.compute_device} but got {handle.flat_param.device}"
+                    f"{self.compute_device} but got {handle.flat_param.device}",
                 )
             output = self._fsdp_wrapped_module(*args, **kwargs)
             return self._post_forward(self._handles, reshard_fn, unused, unused, output)
@@ -2418,7 +2459,9 @@ def _pre_forward_unshard(
     ) -> None:
         """Unshards parameters in the pre-forward."""
         if handles:
-            self._unshard(handles, self._streams["unshard"], self._streams["pre_unshard"])
+            self._unshard(
+                handles, self._streams["unshard"], self._streams["pre_unshard"]
+            )
             handles_key = tuple(handles)
             self._needs_pre_forward_unshard[handles_key] = False
             torch.cuda.current_stream().wait_stream(self._streams["unshard"])
@@ -2476,7 +2519,9 @@ def _cast_forward_inputs(self, *args, **kwargs):
         if self._mixed_precision_enabled_for_params():
             input_dtype = self.mixed_precision.param_dtype
             args, kwargs = self._cast_fp_inputs_to_dtype(
-                input_dtype, *args, **kwargs,
+                input_dtype,
+                *args,
+                **kwargs,
             )
         return args, kwargs
 
@@ -2525,7 +2570,7 @@ def summon_full_params(
         offload_to_cpu: bool = False,
         with_grads: bool = False,
     ) -> Generator:
-        r""" A context manager to expose full params for FSDP instances.
+        r"""A context manager to expose full params for FSDP instances.
         Can be useful *after* forward/backward for a model to get
         the params for additional processing or checking. It can take a non-FSDP
         module and will summon full params for all contained FSDP modules as
@@ -2663,7 +2708,9 @@ def _summon_full_params(
             handle._training_state = HandleTrainingState.SUMMON_FULL_PARAMS
 
         self._clear_grads_if_needed()
-        free_unsharded_flat_params = [handle.needs_unshard() for handle in self._handles]
+        free_unsharded_flat_params = [
+            handle.needs_unshard() for handle in self._handles
+        ]
         # No need to call `wait_stream()` since we unshard in the computation
         # stream directly
         computation_stream = torch.cuda.current_stream()
@@ -2742,7 +2789,7 @@ def _writeback_to_local_shard(
                 handle.rank,
                 handle.world_size,
             )
-            handle.flat_param._local_shard[:param_shard.numel()].copy_(param_shard)
+            handle.flat_param._local_shard[: param_shard.numel()].copy_(param_shard)
             if writeback_grad:
                 existing_grad = handle.sharded_grad
                 if existing_grad is not None:
@@ -2751,7 +2798,7 @@ def _writeback_to_local_shard(
                         handle.rank,
                         handle.world_size,
                     )
-                    existing_grad[:grad_shard.numel()].copy_(grad_shard)
+                    existing_grad[: grad_shard.numel()].copy_(grad_shard)
 
     @contextlib.contextmanager
     def _unflatten_as_params(self) -> Generator:
@@ -2825,7 +2872,7 @@ def _deregister_orig_params(self):
         p_assert(
             len(self._handles) <= 1,
             "Expects <=1 handle per FSDP instance; needs to be refactored "
-            "for >1 handle (e.g. non-recursive wrapping)"
+            "for >1 handle (e.g. non-recursive wrapping)",
         )
         if not self._handles:
             return
@@ -2833,7 +2880,7 @@ def _deregister_orig_params(self):
         p_assert(
             handle._use_orig_params,
             f"Inconsistent `_use_orig_params` -- FSDP: {self._use_orig_params} "
-            f"handle: {handle._use_orig_params}"
+            f"handle: {handle._use_orig_params}",
         )
         handle._deregister_orig_params()
         self._register_flat_param()
@@ -2973,7 +3020,9 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None:
 
                 # If the handles have been prefetched, this `_unshard()` simply
                 # switches to using the unsharded parameter
-                self._unshard(_handles, self._streams["unshard"], self._streams["pre_unshard"])
+                self._unshard(
+                    _handles, self._streams["unshard"], self._streams["pre_unshard"]
+                )
                 torch.cuda.current_stream().wait_stream(self._streams["unshard"])
 
                 # Set this to `False` to ensure that a mistargeted prefetch
@@ -3022,7 +3071,7 @@ def _register_post_backward_hooks(
             p_assert(
                 temp_flat_param.grad_fn is not None,
                 "The `grad_fn` is needed to access the `AccumulateGrad` and "
-                "register the post-backward hook"
+                "register the post-backward hook",
             )
             acc_grad = temp_flat_param.grad_fn.next_functions[0][0]
             hook_handle = acc_grad.register_hook(
@@ -3055,11 +3104,16 @@ def _post_backward_hook(
         ):
             # First hook callback will see PRE state. If we have multiple params,
             # then subsequent hook callbacks will see POST state.
-            self._assert_state([TrainingState_.BACKWARD_PRE, TrainingState_.BACKWARD_POST])
+            self._assert_state(
+                [TrainingState_.BACKWARD_PRE, TrainingState_.BACKWARD_POST]
+            )
             self.training_state = TrainingState_.BACKWARD_POST
             handle._training_state = HandleTrainingState.BACKWARD_POST
 
-            if self._use_param_exec_order_policy() and self._param_exec_order_prep_stage:
+            if (
+                self._use_param_exec_order_policy()
+                and self._param_exec_order_prep_stage
+            ):
                 # In self._fsdp_params_exec_order, the parameters are ordered based on
                 # the execution order in the backward pass in the first iteration.
                 self._fsdp_params_exec_order.append(param)
@@ -3103,7 +3157,9 @@ def _post_backward_hook(
                     # TODO: Make this a communication hook when communication hooks
                     # are implemented for FSDP. Note that this is a noop if the
                     # reduce_dtype matches the param dtype.
-                    param.grad.data = param.grad.data.to(self.mixed_precision.reduce_dtype)
+                    param.grad.data = param.grad.data.to(
+                        self.mixed_precision.reduce_dtype
+                    )
 
                 if self._exec_order_data.is_first_iter:
                     # For all sharding strategies communication is performed through `_communication_hook`:
@@ -3112,11 +3168,11 @@ def _post_backward_hook(
                     # and `_communication_hook_state`, required for communication not `None`.`
                     p_assert(
                         self._communication_hook is not None,
-                        "Communication hook should not be None"
+                        "Communication hook should not be None",
                     )
                     p_assert(
                         self._communication_hook_state is not None,
-                        "Communication hook state should not be None"
+                        "Communication hook state should not be None",
                     )
                 grad = param.grad.data
                 if handle.uses_sharded_strategy:
@@ -3138,7 +3194,9 @@ def _post_backward_hook(
                     num_pad = self.world_size * chunks[0].numel() - grad.numel()
                     input_flattened = F.pad(grad_flatten, [0, num_pad])
                     output = torch.zeros_like(chunks[0])
-                    self._communication_hook(self._communication_hook_state, input_flattened, output)
+                    self._communication_hook(
+                        self._communication_hook_state, input_flattened, output
+                    )
 
                     self._cast_grad_to_param_dtype(output, param)
 
@@ -3153,13 +3211,13 @@ def _post_backward_hook(
                             param._saved_grad_shard.shape == output.shape,  # type: ignore[attr-defined]
                             "Shape mismatch when accumulating gradients: "  # type: ignore[attr-defined]
                             f"existing grad shape={param._saved_grad_shard.shape} "
-                            f"new grad shape={output.shape}"  # type: ignore[attr-defined]
+                            f"new grad shape={output.shape}",  # type: ignore[attr-defined]
                         )
                         p_assert(
                             param._saved_grad_shard.device == output.device,  # type: ignore[attr-defined]
                             "Device mismatch when accumulating gradients: "  # type: ignore[attr-defined]
                             f"existing grad device={param._saved_grad_shard.device} "
-                            f"new grad device={output.device}"  # type: ignore[attr-defined]
+                            f"new grad device={output.device}",  # type: ignore[attr-defined]
                         )
                         param._saved_grad_shard += output  # type: ignore[attr-defined]
                     else:
@@ -3167,7 +3225,9 @@ def _post_backward_hook(
                     grad = param._saved_grad_shard  # type: ignore[attr-defined]
                 else:
                     if self.sharding_strategy == ShardingStrategy.NO_SHARD:
-                        self._communication_hook(self._communication_hook_state, param.grad)
+                        self._communication_hook(
+                            self._communication_hook_state, param.grad
+                        )
 
                     # For NO_SHARD keeping grads in the reduced precision, we
                     # can simply omit the cast as needed, we can't do this for
@@ -3221,12 +3281,9 @@ def _cast_grad_to_param_dtype(
         dtype cast happens in the hook instead.
         """
         self._assert_state(TrainingState_.BACKWARD_POST)
-        if (
-            not self._low_precision_hook_enabled()
-            and (
-                self._mixed_precision_enabled_for_params()
-                or self._mixed_precision_enabled_for_reduce()
-            )
+        if not self._low_precision_hook_enabled() and (
+            self._mixed_precision_enabled_for_params()
+            or self._mixed_precision_enabled_for_reduce()
         ):
             low_prec_grad_data = grad.data
             grad.data = grad.data.to(dtype=param.dtype)
@@ -3236,9 +3293,8 @@ def _cast_grad_to_param_dtype(
 
     def _should_free_unsharded_flat_param(self, handle: FlatParamHandle):
         return (
-            (self._sync_gradients and handle.uses_sharded_strategy)
-            or handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD
-        )
+            self._sync_gradients and handle.uses_sharded_strategy
+        ) or handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD
 
     def _queue_wait_for_post_backward(self) -> None:
         """
@@ -3247,7 +3303,7 @@ def _queue_wait_for_post_backward(self) -> None:
         """
         p_assert(
             self._is_root,
-            "`_queue_wait_for_post_backward()` should be called on the root FSDP instance"
+            "`_queue_wait_for_post_backward()` should be called on the root FSDP instance",
         )
         if self._post_backward_callback_queued:
             return
@@ -3295,18 +3351,21 @@ def _catch_all_reshard(fsdp_module: FullyShardedDataParallel) -> None:
                     # TODO: This already-resharded check is brittle:
                     # https://github.com/pytorch/pytorch/issues/83956
                     already_resharded = (
-                        handle.flat_param.data_ptr() == handle.flat_param._local_shard.data_ptr()
+                        handle.flat_param.data_ptr()
+                        == handle.flat_param._local_shard.data_ptr()
                     )
                     if already_resharded:
                         continue
-                    free_unsharded_flat_params.append(self._should_free_unsharded_flat_param(handle))
+                    free_unsharded_flat_params.append(
+                        self._should_free_unsharded_flat_param(handle)
+                    )
                     handles_to_reshard.append(handle)
                 self._reshard(handles_to_reshard, free_unsharded_flat_params)
             except Exception as e:
                 p_assert(
                     False,
                     f"Got exception while resharding module {fsdp_module}: {str(e)}",
-                    raise_assertion_error=False
+                    raise_assertion_error=False,
                 )
                 raise e
 
@@ -3318,7 +3377,7 @@ def _finalize_params(fsdp_module: FullyShardedDataParallel) -> None:
                     if hasattr(p, "_post_backward_hook_state"):
                         p_assert(
                             len(p._post_backward_hook_state) == 2,  # type: ignore[attr-defined]
-                            "p._post_backward_hook_state fields are not valid."
+                            "p._post_backward_hook_state fields are not valid.",
                         )
                         p._post_backward_hook_state[1].remove()  # type: ignore[attr-defined]
                         delattr(p, "_post_backward_hook_state")
@@ -3331,8 +3390,8 @@ def _finalize_params(fsdp_module: FullyShardedDataParallel) -> None:
                         continue
                     handle.prepare_gradient_for_optim()
                     p_assert(
-                        hasattr(p, '_post_backward_called'),
-                        "Expected flag _post_backward_called to be set on param."
+                        hasattr(p, "_post_backward_called"),
+                        "Expected flag _post_backward_called to be set on param.",
                     )
                     # Reset _post_backward_called in preparation for the next iteration.
                     p._post_backward_called = False
@@ -3479,22 +3538,25 @@ def clip_grad_norm_(
         norm_type = float(norm_type)
         # Compute the local gradient norm (only including this rank's shard
         # of the gradients)
-        local_norm = _get_grad_norm(self.parameters(), norm_type).to(self.compute_device)
+        local_norm = _get_grad_norm(self.parameters(), norm_type).to(
+            self.compute_device
+        )
         # Reconstruct the total gradient norm depending on the norm type
         if norm_type == math.inf:
             total_norm = local_norm
-            dist.all_reduce(total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group)
+            dist.all_reduce(
+                total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group
+            )
         else:
-            total_norm = local_norm ** norm_type
+            total_norm = local_norm**norm_type
             dist.all_reduce(total_norm, group=self.process_group)
             total_norm = total_norm ** (1.0 / norm_type)
         if self.cpu_offload.offload_params:
             total_norm = total_norm.cpu()
 
-        clip_coef = (
-            torch.tensor(max_norm, dtype=total_norm.dtype, device=total_norm.device)
-            / (total_norm + 1e-6)
-        )
+        clip_coef = torch.tensor(
+            max_norm, dtype=total_norm.dtype, device=total_norm.device
+        ) / (total_norm + 1e-6)
         # Multiplying by the clamped coefficient is meaningless when it is
         # equal to 1, but it avoids the host-device sync that would result from
         # `if clip_coef < 1`
@@ -3537,9 +3599,12 @@ def _raise_on_use_orig_params_optim_checkpoint(model: nn.Module):
     def full_optim_state_dict(
         model: torch.nn.Module,
         optim: torch.optim.Optimizer,
-        optim_input: Optional[Union[
-            List[Dict[str, Any]], Iterable[torch.nn.Parameter],
-        ]] = None,
+        optim_input: Optional[
+            Union[
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
+            ]
+        ] = None,
         rank0_only: bool = True,
         group: Optional[dist.ProcessGroup] = None,
     ) -> Dict[str, Any]:
@@ -3592,7 +3657,8 @@ def full_optim_state_dict(
         FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model)
         FullyShardedDataParallel._warn_optim_input(optim_input)
         using_optim_input = FullyShardedDataParallel._is_using_optim_input(
-            optim_input, optim,
+            optim_input,
+            optim,
         )
         return _optim_state_dict(
             model=model,
@@ -3610,7 +3676,8 @@ def sharded_optim_state_dict(
         optim: torch.optim.Optimizer,
         optim_input: Optional[
             Union[
-                List[Dict[str, Any]], Iterable[torch.nn.Parameter],
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
             ]
         ] = None,
         group: Optional[dist.ProcessGroup] = None,
@@ -3629,7 +3696,8 @@ def sharded_optim_state_dict(
         FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model)
         FullyShardedDataParallel._warn_optim_input(optim_input)
         using_optim_input = FullyShardedDataParallel._is_using_optim_input(
-            optim_input, optim,
+            optim_input,
+            optim,
         )
         # TODO: The ultimate goal of the optimizer state APIs should be the same
         # as state_dict/load_state_dict -- using one API to get optimizer states
@@ -3655,7 +3723,8 @@ def shard_full_optim_state_dict(
         model: torch.nn.Module,
         optim_input: Optional[
             Union[
-                List[Dict[str, Any]], Iterable[torch.nn.Parameter],
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
             ]
         ] = None,
         optim: Optional[torch.optim.Optimizer] = None,
@@ -3717,13 +3786,20 @@ def shard_full_optim_state_dict(
         FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model)
         FullyShardedDataParallel._warn_optim_input(optim_input)
         using_optim_input = FullyShardedDataParallel._is_using_optim_input(
-            optim_input, optim,
+            optim_input,
+            optim,
         )
         sharded_osd = _flatten_optim_state_dict(
-            full_optim_state_dict, model, True,
+            full_optim_state_dict,
+            model,
+            True,
         )
         return _rekey_sharded_optim_state_dict(
-            sharded_osd, model, optim, optim_input, using_optim_input,
+            sharded_osd,
+            model,
+            optim,
+            optim_input,
+            using_optim_input,
         )
 
     @staticmethod
@@ -3732,7 +3808,8 @@ def flatten_sharded_optim_state_dict(
         model: torch.nn.Module,
         optim_input: Optional[
             Union[
-                List[Dict[str, Any]], Iterable[torch.nn.Parameter],
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
             ]
         ] = None,
         optim: Optional[torch.optim.Optimizer] = None,
@@ -3756,7 +3833,8 @@ def flatten_sharded_optim_state_dict(
         FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model)
         FullyShardedDataParallel._warn_optim_input(optim_input)
         using_optim_input = FullyShardedDataParallel._is_using_optim_input(
-            optim_input, optim,
+            optim_input,
+            optim,
         )
         # TODO: The implementation is the same as ``shard_full_optim_state_dict``.
         # See the TODO in ``shard_full_optim_state_dict`` for the future
@@ -3767,16 +3845,23 @@ def flatten_sharded_optim_state_dict(
             shard_state=True,
         )
         return _rekey_sharded_optim_state_dict(
-            flattened_osd, model, optim, optim_input, using_optim_input,
+            flattened_osd,
+            model,
+            optim,
+            optim_input,
+            using_optim_input,
         )
 
     @staticmethod
     def scatter_full_optim_state_dict(
         full_optim_state_dict: Optional[Dict[str, Any]],
         model: torch.nn.Module,
-        optim_input: Optional[Union[
-            List[Dict[str, Any]], Iterable[torch.nn.Parameter],
-        ]] = None,
+        optim_input: Optional[
+            Union[
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
+            ]
+        ] = None,
         optim: Optional[torch.optim.Optimizer] = None,
         group: Optional[Any] = None,
     ) -> Dict[str, Any]:
@@ -3838,7 +3923,8 @@ def scatter_full_optim_state_dict(
         FullyShardedDataParallel._raise_on_use_orig_params_optim_checkpoint(model)
         FullyShardedDataParallel._warn_optim_input(optim_input)
         using_optim_input = FullyShardedDataParallel._is_using_optim_input(
-            optim_input, optim,
+            optim_input,
+            optim,
         )
         # Try to use the passed-in process group, the model's process group,
         # or the default process group (i.e. `None`) in that priority order
@@ -3848,8 +3934,9 @@ def scatter_full_optim_state_dict(
         world_size = dist.get_world_size(group)
         # Check for a valid broadcast device, preferring GPU when available
         using_nccl = dist.distributed_c10d._check_for_nccl_backend(group)
-        broadcast_device = torch.device("cuda") if torch.cuda.is_available() \
-            else torch.device("cpu")
+        broadcast_device = (
+            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        )
         if using_nccl and not torch.cuda.is_available():
             raise RuntimeError("NCCL requires a GPU for collectives")
         # Flatten the optimizer state dict and construct a copy with the
@@ -3867,18 +3954,28 @@ def scatter_full_optim_state_dict(
         # Broadcast the optim state dict without positive-dimension tensor
         # state and the FSDP parameter IDs from rank 0 to all ranks
         processed_osd = _broadcast_processed_optim_state_dict(
-            processed_osd if rank == 0 else None, rank, group,
+            processed_osd if rank == 0 else None,
+            rank,
+            group,
         )
         # Broadcast positive-dimension tensor state (both sharded tensors for
         # FSDP parameters and unsharded tensors for non-FSDP parameters)
         sharded_osd = _broadcast_pos_dim_tensor_states(
-            processed_osd, flat_osd if rank == 0 else None, rank, world_size,
-            group, broadcast_device,
+            processed_osd,
+            flat_osd if rank == 0 else None,
+            rank,
+            world_size,
+            group,
+            broadcast_device,
         )
         # Rekey the optimizer state dict to use parameter IDs according to this
         # rank's `optim`
         sharded_osd = _rekey_sharded_optim_state_dict(
-            sharded_osd, model, optim, optim_input, using_optim_input,
+            sharded_osd,
+            model,
+            optim,
+            optim_input,
+            using_optim_input,
         )
         return sharded_osd
 
@@ -3887,9 +3984,12 @@ def rekey_optim_state_dict(
         optim_state_dict: Dict[str, Any],
         optim_state_key_type: OptimStateKeyType,
         model: torch.nn.Module,
-        optim_input: Optional[Union[
-            List[Dict[str, Any]], Iterable[torch.nn.Parameter],
-        ]] = None,
+        optim_input: Optional[
+            Union[
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
+            ]
+        ] = None,
         optim: Optional[torch.optim.Optimizer] = None,
     ) -> Dict[str, Any]:
         """
@@ -3926,30 +4026,30 @@ def rekey_optim_state_dict(
         """
         FullyShardedDataParallel._warn_optim_input(optim_input)
         using_optim_input = FullyShardedDataParallel._is_using_optim_input(
-            optim_input, optim,
+            optim_input,
+            optim,
         )
         assert optim_state_key_type in (
-            OptimStateKeyType.PARAM_NAME, OptimStateKeyType.PARAM_ID,
+            OptimStateKeyType.PARAM_NAME,
+            OptimStateKeyType.PARAM_ID,
         )
         osd = optim_state_dict  # alias
         # Validate that the existing parameter keys are uniformly typed
-        uses_param_name_mask = [
-            type(param_key) is str for param_key in osd["state"]
-        ]
-        uses_param_id_mask = [
-            type(param_key) is int for param_key in osd["state"]
-        ]
-        if (
-            (any(uses_param_name_mask) and not all(uses_param_name_mask))
-            or (any(uses_param_id_mask) and not all(uses_param_id_mask))
+        uses_param_name_mask = [type(param_key) is str for param_key in osd["state"]]
+        uses_param_id_mask = [type(param_key) is int for param_key in osd["state"]]
+        if (any(uses_param_name_mask) and not all(uses_param_name_mask)) or (
+            any(uses_param_id_mask) and not all(uses_param_id_mask)
         ):
             error_msg = f"Invalid parameter keys: {osd['state'].keys()}"
             raise ValueError(error_msg)
         # Return directly if the existing key type matches the target key type
-        if (optim_state_key_type == OptimStateKeyType.PARAM_NAME and
-            all(uses_param_name_mask)) or \
-            (optim_state_key_type == OptimStateKeyType.PARAM_ID and
-                all(uses_param_id_mask)):
+        if (
+            optim_state_key_type == OptimStateKeyType.PARAM_NAME
+            and all(uses_param_name_mask)
+        ) or (
+            optim_state_key_type == OptimStateKeyType.PARAM_ID
+            and all(uses_param_id_mask)
+        ):
             return osd
         # Otherwise, actually perform the re-keying
         new_osd = {}
@@ -3969,10 +4069,12 @@ def rekey_optim_state_dict(
             }
             new_osd["param_groups"] = copy.deepcopy(osd["param_groups"])
             for param_group in new_osd["param_groups"]:
-                param_group["params"] = sorted([
-                    param_id_to_param_name[param_id]
-                    for param_id in param_group["params"]
-                ])
+                param_group["params"] = sorted(
+                    [
+                        param_id_to_param_name[param_id]
+                        for param_id in param_group["params"]
+                    ]
+                )
             return new_osd
         elif optim_state_key_type == OptimStateKeyType.PARAM_ID:  # name -> ID
             param_name_to_param = _get_param_name_to_param(model)
@@ -3994,10 +4096,12 @@ def rekey_optim_state_dict(
             }
             new_osd["param_groups"] = copy.deepcopy(osd["param_groups"])
             for param_group in new_osd["param_groups"]:
-                param_group["params"] = sorted([
-                    param_name_to_param_id[param_name]
-                    for param_name in param_group["params"]
-                ])
+                param_group["params"] = sorted(
+                    [
+                        param_name_to_param_id[param_name]
+                        for param_name in param_group["params"]
+                    ]
+                )
             return new_osd
         return new_osd  # should never reach here
 
@@ -4056,12 +4160,17 @@ def register_comm_hook(self, state: object, hook: callable):
 
         """
         if not self.check_is_root():
-            raise AssertionError("register_comm_hook can only be called on a root instance.")
+            raise AssertionError(
+                "register_comm_hook can only be called on a root instance."
+            )
         for submodule in self.fsdp_modules(self):
-            assert not submodule._hook_registered, "communication hook can be only registered once"
+            assert (
+                not submodule._hook_registered
+            ), "communication hook can be only registered once"
             submodule._hook_registered = True
-            assert submodule._communication_hook == self._get_default_comm_hook(),\
-                f"communication hook should be default, but it is {submodule._communication_hook.__name__} instead"
+            assert (
+                submodule._communication_hook == self._get_default_comm_hook()
+            ), f"communication hook should be default, but it is {submodule._communication_hook.__name__} instead"
             submodule._communication_hook_state = state
             submodule._communication_hook = hook
 
@@ -4073,10 +4182,7 @@ def _init_param_exec_order_wrap_policy(self, *args, **kwargs) -> None:
             assert (
                 auto_wrap_policy.tracing_config is None
             ), "tracing_config should be None when torch.fx is not enabled"
-        elif isinstance(
-            auto_wrap_policy.tracing_config,
-            TracingConfig
-        ):
+        elif isinstance(auto_wrap_policy.tracing_config, TracingConfig):
             tracer = auto_wrap_policy.tracing_config.tracer
             execution_info = _init_execution_info(module)
 
@@ -4110,8 +4216,7 @@ def _init_param_exec_order_wrap_policy(self, *args, **kwargs) -> None:
         # A list that stores the flatten parameters and its name based on the parameter execution order
         self._fsdp_params_exec_order: List[FlatParameter] = []
         if _TORCH_FX_AVAIL and isinstance(
-            auto_wrap_policy.tracing_config,
-            TracingConfig
+            auto_wrap_policy.tracing_config, TracingConfig
         ):
             # Initialize a dict that maps each module to its parent FSDP wrap
             module_to_fsdp: Dict[nn.Module, FullyShardedDataParallel] = dict()
@@ -4137,8 +4242,7 @@ def _init_param_exec_order_wrap_policy(self, *args, **kwargs) -> None:
 
     def _use_param_exec_order_policy(self) -> bool:
         return (
-            hasattr(self, "_param_exec_order_policy")
-            and self._param_exec_order_policy
+            hasattr(self, "_param_exec_order_policy") and self._param_exec_order_policy
         )
 
     def _is_param_exec_order_prep_stage(self) -> bool:
@@ -4148,8 +4252,8 @@ def _is_param_exec_order_prep_stage(self) -> bool:
         )
         if not is_prep_stage:
             for p in self.parameters():
-                assert (
-                    not hasattr(p, "_params_exec_order_hook_handle")
+                assert not hasattr(
+                    p, "_params_exec_order_hook_handle"
                 ), "When not in execution order prep stage, all _params_exec_order_hook_handle should be removed."
         return is_prep_stage
 
@@ -4168,7 +4272,9 @@ def _get_grad_norm(
     grads = [param.grad for param in params_with_grad]
     grad_dtypes = set(grad.dtype for grad in grads)
     if len(grad_dtypes) != 1:
-        raise ValueError(f"Requires uniform dtype across all gradients but got {grad_dtypes}")
+        raise ValueError(
+            f"Requires uniform dtype across all gradients but got {grad_dtypes}"
+        )
     # Compute the gradient norm in FP32, where we treat the gradients as a
     # single vector
     grad_norm = torch.linalg.vector_norm(
@@ -4206,15 +4312,14 @@ def _get_param_to_unflat_param_names(
             in the module walk order; if ``False``, then includes all of the
             unflattened parameter names.
     """
+
     def module_fn(module, prefix, param_to_unflat_param_names):
         for param_name, param in module.named_parameters(recurse=False):
             module_prefixed_param_names = (
-                param._fqns if type(param) is FlatParameter
-                else [param_name]
+                param._fqns if type(param) is FlatParameter else [param_name]
             )  # prefixed from `module`
             fully_prefixed_param_names = [
-                clean_tensor_name(prefix + name)
-                for name in module_prefixed_param_names
+                clean_tensor_name(prefix + name) for name in module_prefixed_param_names
             ]  # fully prefixed from the top level including `prefix`
             # If this parameter has already been visited, then it is a
             # shared parameter; then, only take the first parameter name
@@ -4229,7 +4334,10 @@ def return_fn(param_to_unflat_param_names):
 
     param_to_unflat_param_names: Dict[torch.nn.Parameter, List[str]] = {}
     return _apply_to_modules(
-        model, module_fn, return_fn, param_to_unflat_param_names,
+        model,
+        module_fn,
+        return_fn,
+        param_to_unflat_param_names,
     )
 
 
@@ -4250,16 +4358,16 @@ def _get_param_to_param_name(
     """
     param_to_param_names = _get_param_to_unflat_param_names(model)
     for param_names in param_to_param_names.values():
-        assert len(param_names) > 0, "`_get_param_to_unflat_param_names()` " \
-            "should not construct empty lists"
+        assert len(param_names) > 0, (
+            "`_get_param_to_unflat_param_names()` " "should not construct empty lists"
+        )
         if len(param_names) > 1:
             raise RuntimeError(
                 "Each parameter should only map to one parameter name but got "
                 f"{len(param_names)}: {param_names}"
             )
     param_to_param_name = {
-        param: param_names[0]
-        for param, param_names in param_to_param_names.items()
+        param: param_names[0] for param, param_names in param_to_param_names.items()
     }
     return param_to_param_name
 
diff --git a/torch/distributed/fsdp/sharded_grad_scaler.py b/torch/distributed/fsdp/sharded_grad_scaler.py
index 27ba44e6c1516..86dbfd7edc16e 100644
--- a/torch/distributed/fsdp/sharded_grad_scaler.py
+++ b/torch/distributed/fsdp/sharded_grad_scaler.py
@@ -1,12 +1,12 @@
-from collections import abc, defaultdict
 import logging
+from collections import abc, defaultdict
 from typing import Dict, List, Optional, Union
 
 import torch
+import torch.distributed as dist
 from torch.cuda import FloatTensor  # type: ignore[attr-defined]
-from torch.cuda.amp.grad_scaler import GradScaler, OptState, _MultiDeviceReplicator
+from torch.cuda.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState
 from torch.distributed.distributed_c10d import ProcessGroup
-import torch.distributed as dist
 from torch.optim.sgd import SGD
 
 
@@ -23,6 +23,7 @@ class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator):
     Lazily serves tensor to request device. This class extends
     _MultiDeviceReplicator to allow support for "cpu" as a device.
     """
+
     def __init__(self, master_tensor: torch.Tensor) -> None:
         assert _is_supported_device(master_tensor)
         self.master = master_tensor
@@ -77,9 +78,10 @@ class ShardedGradScaler(GradScaler):
         process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
             process group for sharding
     """
+
     def __init__(
         self,
-        init_scale: float = 2.0 ** 16,
+        init_scale: float = 2.0**16,
         backoff_factor: float = 0.5,
         growth_factor: float = 2.0,
         growth_interval: int = 2000,
@@ -97,7 +99,9 @@ def __init__(
             self.process_group = process_group
             self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
 
-    def scale(self, outputs: Union[torch.Tensor, List[torch.Tensor]]) -> Union[torch.Tensor, List[torch.Tensor]]:
+    def scale(
+        self, outputs: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
         if not self._enabled:
             return outputs
 
@@ -106,7 +110,9 @@ def scale(self, outputs: Union[torch.Tensor, List[torch.Tensor]]) -> Union[torch
             if self._scale is None:
                 self._lazy_init_scale_growth_tracker(outputs.device)
             assert self._scale is not None
-            scaled_output = outputs * self._scale.to(device=outputs.device, non_blocking=True)
+            scaled_output = outputs * self._scale.to(
+                device=outputs.device, non_blocking=True
+            )
             # Here we ensure the return dtype is the same as the outputs dtype.
             # For the FSDP + Mixed Precision use case, the loss output is in the Mixed Precision
             # format (fp16, bf16) and so the scaled loss should be of the same dtype.
@@ -114,7 +120,9 @@ def scale(self, outputs: Union[torch.Tensor, List[torch.Tensor]]) -> Union[torch
 
         stash: List[_GeneralMultiDeviceReplicator] = []
 
-        def apply_scale(val: Union[torch.Tensor, abc.Iterable]) -> Union[torch.Tensor, abc.Iterable]:
+        def apply_scale(
+            val: Union[torch.Tensor, abc.Iterable]
+        ) -> Union[torch.Tensor, abc.Iterable]:
             if isinstance(val, torch.Tensor):
                 assert _is_supported_device(val)
                 if len(stash) == 0:
@@ -150,20 +158,30 @@ def _foreach_non_finite_check_and_unscale_cpu_(
         for grad in grads:
             for tensor in grad:
                 if tensor.device != expected_device:
-                    logging.error("tensor device is %s and expected device is %s" % (tensor.device, expected_device))
+                    logging.error(
+                        "tensor device is %s and expected device is %s"
+                        % (tensor.device, expected_device)
+                    )
                     raise ValueError("Gradients must be on the same device.")
 
                 # check for non_overlapping_and_dense doesn't exist in the python world
                 # as remarked here https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/AmpKernels.cu#L108
                 # we assume tensor is not MTA(multi tensor apply) safe. iterate through each item regardless of dtype
-                if torch.isinf(tensor).any().item() is True or torch.isnan(tensor).any().item() is True:
+                if (
+                    torch.isinf(tensor).any().item() is True
+                    or torch.isnan(tensor).any().item() is True
+                ):
                     found_inf.data = torch.tensor([1.0])
                     break
                 else:
                     tensor.data *= inv_scale.item()
 
     def _unscale_grads_(
-        self, optimizer: SGD, inv_scale: torch.Tensor, found_inf: torch.Tensor, allow_fp16: bool = True
+        self,
+        optimizer: SGD,
+        inv_scale: torch.Tensor,
+        found_inf: torch.Tensor,
+        allow_fp16: bool = True,
     ) -> Dict[torch.device, torch.Tensor]:
         per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
         per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)
@@ -195,7 +213,9 @@ def _unscale_grads_(
                     else:
                         to_unscale = param.grad
 
-                    per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(to_unscale)
+                    per_device_and_dtype_grads[to_unscale.device][
+                        to_unscale.dtype
+                    ].append(to_unscale)
 
             for device, per_dtype_grads in per_device_and_dtype_grads.items():
                 for grads in per_dtype_grads.values():
@@ -222,16 +242,22 @@ def unscale_(self, optimizer: SGD) -> None:
         optimizer_state = self._per_optimizer_states[id(optimizer)]
 
         if optimizer_state["stage"] is OptState.UNSCALED:
-            raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
+            raise RuntimeError(
+                "unscale_() has already been called on this optimizer since the last update()."
+            )
         elif optimizer_state["stage"] is OptState.STEPPED:
             raise RuntimeError("unscale_() is being called after step().")
 
         # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
         assert self._scale is not None
         inv_scale = self._scale.double().reciprocal().float()
-        found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device)
+        found_inf = torch.full(
+            (1,), 0.0, dtype=torch.float32, device=self._scale.device
+        )
 
-        optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, True)
+        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
+            optimizer, inv_scale, found_inf, True
+        )
         optimizer_state["stage"] = OptState.UNSCALED
 
         # Synchronize the detected inf across the ranks
@@ -241,10 +267,18 @@ def unscale_(self, optimizer: SGD) -> None:
         for v in optimizer_state["found_inf_per_device"].values():
             if v.device.type == "cpu":
                 v_on_cuda = v.cuda()
-                future_handles.append(dist.all_reduce(v_on_cuda, async_op=True, group=self.process_group).get_future())
+                future_handles.append(
+                    dist.all_reduce(
+                        v_on_cuda, async_op=True, group=self.process_group
+                    ).get_future()
+                )
                 v.copy_(v_on_cuda.cpu())
             else:
-                future_handles.append(dist.all_reduce(v, async_op=True, group=self.process_group).get_future())
+                future_handles.append(
+                    dist.all_reduce(
+                        v, async_op=True, group=self.process_group
+                    ).get_future()
+                )
 
         # Make sure that the calls are done before moving out.
         if future_handles:
diff --git a/torch/distributed/fsdp/wrap.py b/torch/distributed/fsdp/wrap.py
index 8013da8e37ea1..c529bcde8c859 100644
--- a/torch/distributed/fsdp/wrap.py
+++ b/torch/distributed/fsdp/wrap.py
@@ -5,22 +5,11 @@
 
 import contextlib
 from dataclasses import dataclass
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Generator,
-    Optional,
-    Set,
-    Tuple,
-    Type,
-    cast,
-)
+from typing import Any, Callable, cast, Dict, Generator, Optional, Set, Tuple, Type
 
 import torch.nn as nn
 from torch.nn.modules.batchnorm import _BatchNorm
 
-
 __all__ = [
     "always_wrap_policy",
     "lambda_auto_wrap_policy",
@@ -41,11 +30,9 @@ def always_wrap_policy(*args, **kwargs) -> bool:
     """
     return True
 
+
 def lambda_auto_wrap_policy(
-    module: nn.Module,
-    recurse: bool,
-    unwrapped_params: int,
-    lambda_fn: Callable
+    module: nn.Module, recurse: bool, unwrapped_params: int, lambda_fn: Callable
 ) -> bool:
     """
     A convenient auto wrap policy to wrap submodules based on an arbitrary user
@@ -78,6 +65,7 @@ def lambda_auto_wrap_policy(
         # if not recursing, decide whether we should wrap for the leaf node or reminder
         return lambda_fn(module)
 
+
 def transformer_auto_wrap_policy(
     module: nn.Module,
     recurse: bool,
@@ -121,6 +109,7 @@ def transformer_auto_wrap_policy(
         # if not recursing, decide whether we should wrap for the leaf node or reminder
         return isinstance(module, tuple(transformer_layer_cls))
 
+
 def _wrap_batchnorm_individually(
     module: nn.Module,
     recurse: bool,
@@ -138,6 +127,7 @@ def _wrap_batchnorm_individually(
         # BN layer or not.
         return isinstance(module, _BatchNorm)
 
+
 def _or_policy(
     module: nn.Module,
     recurse: bool,
@@ -148,9 +138,7 @@ def _or_policy(
     A policy that wraps ``module`` if any policy in the passed in iterable of
     ``policies`` returns ``True``.
     """
-    return any(
-        policy(module, recurse, unwrapped_params) for policy in policies
-    )
+    return any(policy(module, recurse, unwrapped_params) for policy in policies)
 
 
 def size_based_auto_wrap_policy(
@@ -333,13 +321,14 @@ class ParamExecOrderWrapPolicy:
     ``full``, ``full_like``, ``eye``, ``empty``, ``tensor``). For those cases,
     users can set ``tracing_config = None`` to disable symbolic tracing.
     """
+
     init_policy: Callable = always_wrap_policy
     tracing_config: Any = None
 
 
 def _wrap(module: nn.Module, wrapper_cls: Callable, **kwargs) -> nn.Module:
     assert wrapper_cls is not None
-    if hasattr(module, '_wrap_overrides'):
+    if hasattr(module, "_wrap_overrides"):
         # If module has a _wrap_overrides attribute, we force overriding the
         # FSDP config with these attributes for this module. Currently this
         # is only used to disable mixed precision for BatchNorm when
@@ -357,7 +346,7 @@ def _recursive_wrap(
     ignored_modules: Set[nn.Module],
     ignored_params: Set[nn.Parameter],
     only_wrap_children: bool = False,
-    **kwargs: Any
+    **kwargs: Any,
 ) -> Tuple[nn.Module, int]:
     """
     Automatically wrap child modules of *module* that meet the given
@@ -389,9 +378,7 @@ def _recursive_wrap(
             pass
 
     # We count all params, assuming none of them are already wrapped.
-    num_params = sum(
-        p.numel() for p in module.parameters() if p not in ignored_params
-    )
+    num_params = sum(p.numel() for p in module.parameters() if p not in ignored_params)
 
     assert auto_wrap_policy is not None
     if auto_wrap_policy(module=module, recurse=True, unwrapped_params=num_params):

From a58191df0b6f325ad418d5fa8bd28630d0dda675 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 27 Oct 2022 00:03:15 +0000
Subject: [PATCH 0222/1922] [FSDP] ufmt FSDP test (#87812)

This applies `ufmt` to all of the FSDP test files in the `test/distributed/fsdp/` directory.

**Test Plan**
CI

**Notes**
For VSCode users,
- Install `ufmt`: https://pypi.org/project/ufmt/
- Install VSCode `ufmt` extension: https://marketplace.visualstudio.com/items?itemName=omnilib.ufmt
- Include in `settings.json`:
```
{
    "[python]": {
        "editor.defaultFormatter": "omnilib.ufmt",
        "editor.formatOnSave": true,
    },
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87812
Approved by: https://github.com/rohan-varma
---
 .../fsdp/test_checkpoint_wrapper.py           | 121 ++++---
 .../fsdp/test_distributed_checkpoint.py       |  25 +-
 test/distributed/fsdp/test_fsdp_apply.py      |   5 +-
 test/distributed/fsdp/test_fsdp_checkpoint.py |  60 ++--
 .../fsdp/test_fsdp_clip_grad_norm.py          |  21 +-
 test/distributed/fsdp/test_fsdp_comm.py       |  66 ++--
 test/distributed/fsdp/test_fsdp_comm_hooks.py | 220 +++++++------
 test/distributed/fsdp/test_fsdp_core.py       |  14 +-
 test/distributed/fsdp/test_fsdp_exec_order.py |   2 +-
 .../fsdp/test_fsdp_freezing_weights.py        |   8 +-
 test/distributed/fsdp/test_fsdp_grad_acc.py   |  74 ++---
 .../fsdp/test_fsdp_ignored_modules.py         |  27 +-
 test/distributed/fsdp/test_fsdp_input.py      |   7 +-
 test/distributed/fsdp/test_fsdp_memory.py     |   7 +-
 test/distributed/fsdp/test_fsdp_meta.py       |  68 ++--
 test/distributed/fsdp/test_fsdp_misc.py       |  94 +++---
 .../fsdp/test_fsdp_mixed_precision.py         | 130 +++++---
 .../fsdp/test_fsdp_multiple_forward.py        |   8 +-
 .../fsdp/test_fsdp_multiple_wrapping.py       |   3 +-
 .../distributed/fsdp/test_fsdp_optim_state.py | 307 ++++++++++++------
 test/distributed/fsdp/test_fsdp_overlap.py    |   7 +-
 test/distributed/fsdp/test_fsdp_pure_fp16.py  |   3 +-
 .../fsdp/test_fsdp_sharded_grad_scaler.py     |  69 ++--
 test/distributed/fsdp/test_fsdp_state_dict.py | 144 +++++---
 .../fsdp/test_fsdp_summon_full_params.py      |  50 ++-
 test/distributed/fsdp/test_fsdp_traversal.py  |  15 +-
 test/distributed/fsdp/test_fsdp_uneven.py     |   7 +-
 .../fsdp/test_fsdp_use_orig_params.py         |  34 +-
 test/distributed/fsdp/test_utils.py           |  11 +-
 test/distributed/fsdp/test_wrap.py            |  95 +++---
 torch/testing/_internal/common_fsdp.py        |  66 ++--
 31 files changed, 1061 insertions(+), 707 deletions(-)

diff --git a/test/distributed/fsdp/test_checkpoint_wrapper.py b/test/distributed/fsdp/test_checkpoint_wrapper.py
index 8bd2b74695d3b..d8e005fcf82be 100644
--- a/test/distributed/fsdp/test_checkpoint_wrapper.py
+++ b/test/distributed/fsdp/test_checkpoint_wrapper.py
@@ -1,30 +1,25 @@
 # Owner(s): ["oncall: distributed"]
 
+import unittest
 from copy import deepcopy
 from functools import partial
 
 import torch
 import torch.nn as nn
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-    checkpoint_wrapper,
-    offload_wrapper,
     apply_activation_checkpointing,
+    checkpoint_wrapper,
+    CheckpointImpl,
     CheckpointWrapper,
+    offload_wrapper,
     OffloadWrapper,
-    CheckpointImpl
 )
-
+from torch.testing._internal.common_utils import run_tests, TestCase
 from torch.utils.checkpoint import checkpoint
 
-from torch.testing._internal.common_utils import (
-    run_tests,
-    TestCase,
-)
-
-import unittest
+_SAVED_PREFIX = "_saved_"
+GRAD_FN_NEXT_FUNCTIONS = "next_functions"
 
-_SAVED_PREFIX = '_saved_'
-GRAD_FN_NEXT_FUNCTIONS = 'next_functions'
 
 class CheckpointWrapperTest(TestCase):
     def setUp(self):
@@ -66,13 +61,7 @@ def __init__(self):
                 self.lin = nn.Linear(10, 10)
 
             def forward(self, a, b, c=None, d=None, **kwargs):
-                return (
-                    self.lin(a),
-                    self.lin(b),
-                    self.lin(c),
-                    self.lin(d)
-                )
-
+                return (self.lin(a), self.lin(b), self.lin(c), self.lin(d))
 
         for wrapper in [
             partial(checkpoint_wrapper, checkpoint_impl=CheckpointImpl.REENTRANT),
@@ -113,7 +102,6 @@ def forward(self, *, a=None, b=None):
         out = model(a=inp, b=inp)
         self.assertEqual(2, len(out))
 
-
     @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
     def test_checkpoint_wrapper_parity(self):
         """
@@ -122,13 +110,14 @@ def test_checkpoint_wrapper_parity(self):
         results in the same maximum memory usage, i.e. they are
         equivalent memory usage wise.
         """
+
         class Model(nn.Module):
             def __init__(
                 self,
                 n: int,
                 use_cp: bool,
                 use_wrapper: bool = False,
-                use_reentrant: bool = True
+                use_reentrant: bool = True,
             ):
                 super().__init__()
                 self.layers = nn.ModuleList()
@@ -138,10 +127,14 @@ def __init__(
                 self.use_reentrant = use_reentrant
                 wrp = partial(
                     checkpoint_wrapper,
-                    checkpoint_impl=CheckpointImpl.REENTRANT if use_reentrant else CheckpointImpl.NO_REENTRANT
+                    checkpoint_impl=CheckpointImpl.REENTRANT
+                    if use_reentrant
+                    else CheckpointImpl.NO_REENTRANT,
                 )
                 for i in range(self.n):
-                    l = nn.Sequential(nn.Linear(256, 256), nn.Linear(256, 256), nn.Linear(256, 256))
+                    l = nn.Sequential(
+                        nn.Linear(256, 256), nn.Linear(256, 256), nn.Linear(256, 256)
+                    )
                     use_checkpoint_wrapper = self.use_wrapper
                     if use_checkpoint_wrapper:
                         l = wrp(l)
@@ -149,29 +142,41 @@ def __init__(
 
             def forward(self, x):
                 for i in range(self.n):
-                    if (
-                        self.use_wrapper or
-                        not self.use_cp
-                    ):
+                    if self.use_wrapper or not self.use_cp:
                         x = self.layers[i](x)
                     else:
-                        x = checkpoint(self.layers[i], x, use_reentrant=self.use_reentrant)
+                        x = checkpoint(
+                            self.layers[i], x, use_reentrant=self.use_reentrant
+                        )
                 return x
 
         def test(use_checkpointing, use_wrapper, use_reentrant):
-            a = Model(8, use_checkpointing, use_wrapper=use_wrapper, use_reentrant=use_reentrant).cuda()
+            a = Model(
+                8,
+                use_checkpointing,
+                use_wrapper=use_wrapper,
+                use_reentrant=use_reentrant,
+            ).cuda()
             x = torch.randn(10000, 256, requires_grad=True).cuda()
             torch.cuda.reset_peak_memory_stats()
             loss = a(x).sum()
             loss.backward()
             return torch.cuda.max_memory_allocated()
 
-        functional_no_reentrant = test(use_checkpointing=True, use_wrapper=False, use_reentrant=False)
-        wrapper_no_reentrant = test(use_checkpointing=False, use_wrapper=True, use_reentrant=False)
+        functional_no_reentrant = test(
+            use_checkpointing=True, use_wrapper=False, use_reentrant=False
+        )
+        wrapper_no_reentrant = test(
+            use_checkpointing=False, use_wrapper=True, use_reentrant=False
+        )
         self.assertEqual(functional_no_reentrant, wrapper_no_reentrant)
 
-        functional_reentrant = test(use_checkpointing=True, use_wrapper=False, use_reentrant=True)
-        wrapper_reentrant = test(use_checkpointing=False, use_wrapper=True, use_reentrant=True)
+        functional_reentrant = test(
+            use_checkpointing=True, use_wrapper=False, use_reentrant=True
+        )
+        wrapper_reentrant = test(
+            use_checkpointing=False, use_wrapper=True, use_reentrant=True
+        )
         self.assertEqual(functional_reentrant, wrapper_reentrant)
 
     def test_forward_missing_attributes(self):
@@ -181,8 +186,8 @@ def test_forward_missing_attributes(self):
         # Test indexing is forwarded
         self.assertEqual(wrapped[0], lin)
         # Test missing attributes are forwarded.
-        m._foo = 'bar'
-        self.assertEqual(wrapped._foo, 'bar')
+        m._foo = "bar"
+        self.assertEqual(wrapped._foo, "bar")
 
     def test_apply_activation_checkpointing(self):
         """
@@ -190,6 +195,7 @@ def test_apply_activation_checkpointing(self):
         to swap modules for their checkpoint-wrapped counterparts given
         a model.
         """
+
         class LinearWithBatchNorm(nn.Module):
             def __init__(self):
                 super().__init__()
@@ -210,7 +216,6 @@ def __init__(self):
             def forward(self, x):
                 return self.seq(x)
 
-
         def check_fn(l):
             return isinstance(l, nn.Linear)
 
@@ -231,13 +236,27 @@ def check_fn(l):
                 apply_activation_checkpointing(
                     model, checkpoint_wrapper_fn=wrapper, check_fn=check_fn
                 )
-                n_linear_wrapped = sum(1 if isinstance(x, nn.Linear) else 0 for x in model.modules())
-                n_checkpointed = sum(1 if isinstance(x, (CheckpointWrapper, OffloadWrapper)) else 0 for x in model.modules())
+                n_linear_wrapped = sum(
+                    1 if isinstance(x, nn.Linear) else 0 for x in model.modules()
+                )
+                n_checkpointed = sum(
+                    1 if isinstance(x, (CheckpointWrapper, OffloadWrapper)) else 0
+                    for x in model.modules()
+                )
                 self.assertEqual(n_checkpointed, n_linear_wrapped)
                 self.assertEqual(n_linear, n_linear_wrapped)
                 for j in range(3):
-                    self.assertTrue(isinstance(model.seq[j].lin, (CheckpointWrapper, OffloadWrapper)))
-                    self.assertTrue(isinstance(model.seq[j].nested_linear[0], (CheckpointWrapper, OffloadWrapper)))
+                    self.assertTrue(
+                        isinstance(
+                            model.seq[j].lin, (CheckpointWrapper, OffloadWrapper)
+                        )
+                    )
+                    self.assertTrue(
+                        isinstance(
+                            model.seq[j].nested_linear[0],
+                            (CheckpointWrapper, OffloadWrapper),
+                        )
+                    )
 
                 inp = torch.randn(4, 10, requires_grad=True)
                 for i in range(6):
@@ -249,9 +268,22 @@ def check_fn(l):
                     for j in range(3):
                         weight_lin = model.seq[j].lin._checkpoint_wrapped_module.weight
                         bias_lin = model.seq[j].lin._checkpoint_wrapped_module.bias
-                        weight_nested_lin = model.seq[j].nested_linear[0]._checkpoint_wrapped_module.weight
-                        bias_nested_lin = model.seq[j].nested_linear[0]._checkpoint_wrapped_module.bias
-                        for param in [weight_lin, bias_lin, weight_nested_lin, bias_nested_lin]:
+                        weight_nested_lin = (
+                            model.seq[j]
+                            .nested_linear[0]
+                            ._checkpoint_wrapped_module.weight
+                        )
+                        bias_nested_lin = (
+                            model.seq[j]
+                            .nested_linear[0]
+                            ._checkpoint_wrapped_module.bias
+                        )
+                        for param in [
+                            weight_lin,
+                            bias_lin,
+                            weight_nested_lin,
+                            bias_nested_lin,
+                        ]:
                             self.assertTrue(param.requires_grad)
                             self.assertFalse(param.grad is None)
 
@@ -287,7 +319,7 @@ def testing_cpu_offload_unpack_hook(packed):
 
         model = offload_wrapper(model)
 
-        inp = torch.randn(3, 10, device='cuda')
+        inp = torch.randn(3, 10, device="cuda")
         loss = model(inp).sum()
 
         # All autograd saved tensors should be offloaded to CPU.
@@ -314,5 +346,6 @@ def dfs(grad_fn):
 
         torch.autograd.graph.saved_tensors_hooks.__init__ = orig_init
 
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/distributed/fsdp/test_distributed_checkpoint.py b/test/distributed/fsdp/test_distributed_checkpoint.py
index ef95973764c43..e64fd358a305e 100644
--- a/test/distributed/fsdp/test_distributed_checkpoint.py
+++ b/test/distributed/fsdp/test_distributed_checkpoint.py
@@ -8,20 +8,14 @@
 from torch.distributed._shard.checkpoint import (
     FileSystemReader,
     FileSystemWriter,
-    save_state_dict,
     load_state_dict,
+    save_state_dict,
 )
-from torch.distributed.fsdp import (
-    FullyShardedDataParallel as FSDP,
-    StateDictType,
-)
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType
 from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
 from torch.distributed.fsdp.wrap import enable_wrap, wrap
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import (
-    FSDPTest,
-    SkipModel,
-)
+from torch.testing._internal.common_fsdp import FSDPTest, SkipModel
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
@@ -29,7 +23,6 @@
     TEST_WITH_DEV_DBG_ASAN,
 )
 
-
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
@@ -75,16 +68,16 @@ def test_distributed_checkpoint(self, state_dict_type) -> None:
             path = paths[0]
             writer = FileSystemWriter(path)
             reader = FileSystemReader(path)
-            with FSDP.state_dict_type(
-                model, state_dict_type
-            ), FSDP.state_dict_type(new_model, state_dict_type):
+            with FSDP.state_dict_type(model, state_dict_type), FSDP.state_dict_type(
+                new_model, state_dict_type
+            ):
                 state_dict = model.state_dict()
 
             save_state_dict(state_dict, writer)
 
-            with FSDP.state_dict_type(
-                model, state_dict_type
-            ), FSDP.state_dict_type(new_model, state_dict_type):
+            with FSDP.state_dict_type(model, state_dict_type), FSDP.state_dict_type(
+                new_model, state_dict_type
+            ):
                 state_dict = new_model.state_dict()
                 load_state_dict(state_dict, reader)
                 new_model.load_state_dict(state_dict)
diff --git a/test/distributed/fsdp/test_fsdp_apply.py b/test/distributed/fsdp/test_fsdp_apply.py
index d72d57d133b0d..d44239a329344 100644
--- a/test/distributed/fsdp/test_fsdp_apply.py
+++ b/test/distributed/fsdp/test_fsdp_apply.py
@@ -14,10 +14,7 @@
     NestedWrappedModule,
     TransformerWithSharedParams,
 )
-from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
-    run_tests,
-)
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py
index 50a5573f901f8..f0e8188641459 100644
--- a/test/distributed/fsdp/test_fsdp_checkpoint.py
+++ b/test/distributed/fsdp/test_fsdp_checkpoint.py
@@ -1,37 +1,31 @@
 # Owner(s): ["oncall: distributed"]
 
 import contextlib
+import sys
 from copy import deepcopy
 from functools import partial
-import sys
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed.fsdp.fully_sharded_data_parallel import (
-    FullyShardedDataParallel as FSDP,
-    CPUOffload,
-)
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     checkpoint_wrapper,
     offload_wrapper,
 )
-from torch.testing._internal.common_distributed import (
-    skip_if_lt_x_gpu,
-)
-from torch.testing._internal.common_fsdp import (
-    FSDPTest,
-    _maybe_wrap_fsdp,
+from torch.distributed.fsdp.fully_sharded_data_parallel import (
+    CPUOffload,
+    FullyShardedDataParallel as FSDP,
 )
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_fsdp import _maybe_wrap_fsdp, FSDPTest
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
-    run_tests,
-    parametrize,
     instantiate_parametrized_tests,
+    parametrize,
+    run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 from torch.utils.checkpoint import checkpoint
 
-
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
@@ -44,10 +38,13 @@
     sys.exit(0)
 
 
-
 _save_on_cpu_called = False
+
+
 def get_patched_save_on_cpu():
-    orig_save_on_cpu = torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu
+    orig_save_on_cpu = (
+        torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu
+    )
 
     def patched_save_on_cpu(*args, **kwargs):
         global _save_on_cpu_called
@@ -56,14 +53,22 @@ def patched_save_on_cpu(*args, **kwargs):
 
     return patched_save_on_cpu
 
+
 @contextlib.contextmanager
 def patch_save_on_cpu(new_save_on_cpu):
-    orig_save_on_cpu = torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu
-    torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu = new_save_on_cpu
+    orig_save_on_cpu = (
+        torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu
+    )
+    torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu = (
+        new_save_on_cpu
+    )
     try:
         yield
     finally:
-        torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu = orig_save_on_cpu
+        torch.distributed.algorithms._checkpoint.checkpoint_wrapper.save_on_cpu = (
+            orig_save_on_cpu
+        )
+
 
 class TestFSDPCheckpoint(FSDPTest):
     class SequentialModule(nn.Module):
@@ -143,7 +148,8 @@ def test_checkpoint_fsdp_wrapping(
         fsdp_kwargs = {"cpu_offload": cpu_offload, "use_orig_params": use_orig_params}
         ckpt_sequential_wrapped_fsdp = wrapper_to_use(
             TestFSDPCheckpoint.SequentialModule(
-                wrap_fsdp=True, **fsdp_kwargs,
+                wrap_fsdp=True,
+                **fsdp_kwargs,
             ),
         )
         # Test FSDP(checkpoint(layer1)), FSDP(checkpoint(layer2)), ....
@@ -155,7 +161,8 @@ def test_checkpoint_fsdp_wrapping(
         )
 
         baseline = TestFSDPCheckpoint.SequentialModule(
-            wrap_fsdp=True, **fsdp_kwargs,
+            wrap_fsdp=True,
+            **fsdp_kwargs,
         )
 
         # note that reentrant-based checkpointing requires inputs to have grad
@@ -223,7 +230,9 @@ def test_basic_checkpoint_end_to_end(
             # note that reentrant-based checkpointing requires inputs to have grad
             # flag set.
 
-            inp = torch.randn(10, 3, device=torch.cuda.current_device(), requires_grad=True)
+            inp = torch.randn(
+                10, 3, device=torch.cuda.current_device(), requires_grad=True
+            )
 
             models = [
                 fsdp_only_seq,
@@ -237,7 +246,9 @@ def test_basic_checkpoint_end_to_end(
                 losses = []
                 outputs = []
                 for m in models:
-                    check_offload = m != fsdp_only_seq and i == 0 and offload_activations
+                    check_offload = (
+                        m != fsdp_only_seq and i == 0 and offload_activations
+                    )
                     if m == fsdp_call_checkpoint:
                         # _save_on_cpu should not be called yet
                         self.assertFalse(_save_on_cpu_called)
@@ -265,6 +276,7 @@ def test_basic_checkpoint_end_to_end(
 
         dist.barrier()
 
+
 instantiate_parametrized_tests(TestFSDPCheckpoint)
 
 if __name__ == "__main__":
diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
index 3af5a83cdde42..ddba50a9e4561 100644
--- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
+++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -8,8 +8,8 @@
 import torch
 import torch.nn as nn
 from torch import distributed as dist
-from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
+    CPUOffload,
     FullyShardedDataParallel as FSDP,
 )
 from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
@@ -23,9 +23,9 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -53,6 +53,7 @@ def test_non_root(self):
         Tests that calling ``clip_grad_norm_()`` on a non-root FSDP instance
         raises an error.
         """
+
         class Model(nn.Module):
             def __init__(self) -> None:
                 super().__init__()
@@ -132,18 +133,26 @@ def _test_ddp_parity(
         # Multiply gradients by a large factor to ensure that gradients will
         # actually be clipped
         for param in itertools.chain(ddp_model.parameters(), fsdp_model.parameters()):
-            if param.grad is not None:  # gradients may be `None` for `use_orig_params=True`
+            if (
+                param.grad is not None
+            ):  # gradients may be `None` for `use_orig_params=True`
                 param.grad *= LARGE_FACTOR
-        orig_ddp_grads = [param.grad.detach().clone() for param in ddp_model.parameters()]
+        orig_ddp_grads = [
+            param.grad.detach().clone() for param in ddp_model.parameters()
+        ]
         orig_fsdp_grads = [
             param.grad.detach().clone() if param.grad is not None else None
             for param in fsdp_model.parameters()
         ]
 
         ddp_total_norm = torch.nn.utils.clip_grad_norm_(
-            ddp_model.parameters(), max_norm=max_norm, norm_type=norm_type,
+            ddp_model.parameters(),
+            max_norm=max_norm,
+            norm_type=norm_type,
+        )
+        fsdp_total_norm = fsdp_model.clip_grad_norm_(
+            max_norm=max_norm, norm_type=norm_type
         )
-        fsdp_total_norm = fsdp_model.clip_grad_norm_(max_norm=max_norm, norm_type=norm_type)
         self.assertEqual(ddp_total_norm, fsdp_total_norm)
 
         # Check that the gradients were modified by `clip_grad_norm_()`
diff --git a/test/distributed/fsdp/test_fsdp_comm.py b/test/distributed/fsdp/test_fsdp_comm.py
index d19617e31acd3..117e756da252e 100644
--- a/test/distributed/fsdp/test_fsdp_comm.py
+++ b/test/distributed/fsdp/test_fsdp_comm.py
@@ -2,7 +2,7 @@
 
 import sys
 from contextlib import suppress
-from enum import Enum, auto
+from enum import auto, Enum
 from typing import Optional
 from unittest.mock import patch
 
@@ -19,10 +19,10 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -45,6 +45,7 @@ class PassType(Enum):
 
 class TestCommunication(FSDPTest):
     """Tests ``FullyShardedDataParallel``'s collective communication usage."""
+
     def _init_model(
         self,
         nested_model: bool,
@@ -106,7 +107,8 @@ def _get_ref_num_all_gathers(
                 pass_type,
                 is_first_iter,
                 is_last_iter_no_sync,
-            ) for pass_type in PassType
+            )
+            for pass_type in PassType
         )
 
     def _get_ref_num_all_gathers_in_pass(
@@ -121,9 +123,11 @@ def _get_ref_num_all_gathers_in_pass(
         if sharding_strategy is None:
             sharding_strategy = ShardingStrategy.FULL_SHARD  # default
         # Forward pass:
-        if pass_type == PassType.FWD and \
-            sharding_strategy == ShardingStrategy.SHARD_GRAD_OP and \
-                is_last_iter_no_sync:
+        if (
+            pass_type == PassType.FWD
+            and sharding_strategy == ShardingStrategy.SHARD_GRAD_OP
+            and is_last_iter_no_sync
+        ):
             # Modules do not free the full parameters in the last
             # iteration's backward pass if it was in `no_sync()`
             num_all_gathers = 0
@@ -132,21 +136,27 @@ def _get_ref_num_all_gathers_in_pass(
             # forward pass
             num_all_gathers = num_fsdp
         # Backward pass:
-        elif pass_type == PassType.BWD and \
-                sharding_strategy == ShardingStrategy.FULL_SHARD:
+        elif (
+            pass_type == PassType.BWD
+            and sharding_strategy == ShardingStrategy.FULL_SHARD
+        ):
             # Root does not free the full parameters at the end of the
             # forward pass
             num_all_gathers = num_fsdp - 1
-        elif pass_type == PassType.BWD and \
-                sharding_strategy == ShardingStrategy.SHARD_GRAD_OP:
+        elif (
+            pass_type == PassType.BWD
+            and sharding_strategy == ShardingStrategy.SHARD_GRAD_OP
+        ):
             # Modules do not free the full parameters at the end of the
             # forward pass
             num_all_gathers = 0
         else:
-            assert 0, f"Unsupported: add a branch for pass_type={pass_type} " \
-                f"is_first_iter={is_first_iter} " \
-                f"is_last_iter_no_sync={is_last_iter_no_sync} " \
+            assert 0, (
+                f"Unsupported: add a branch for pass_type={pass_type} "
+                f"is_first_iter={is_first_iter} "
+                f"is_last_iter_no_sync={is_last_iter_no_sync} "
                 f"sharding_strategy={sharding_strategy}"
+            )
         if is_first_iter and pass_type == PassType.FWD:
             # With execution order validation, on the first iteration, we have
             # an additional two all-gathers before every actual all-gather in
@@ -167,7 +177,10 @@ def _print_ref_num_all_gathers_in_pass(
         if self.rank != 0:
             return  # only print on one rank
         num_all_gathers = self._get_ref_num_all_gathers_in_pass(
-            num_fsdp, sharding_strategy, pass_type, is_first_iter,
+            num_fsdp,
+            sharding_strategy,
+            pass_type,
+            is_first_iter,
             is_last_iter_no_sync,
         )
         print(
@@ -211,8 +224,7 @@ def test_communication(
         # Count the number of FSDP instances that manage parameters since the
         # number of collectives are a function of this number
         num_fsdp = sum(
-            (isinstance(m, FSDP) and len(m.params) > 0)
-            for m in fsdp_model.modules()
+            (isinstance(m, FSDP) and len(m.params) > 0) for m in fsdp_model.modules()
         )
 
         # If `use_no_sync=True`, we run `num_iters` iterations inside
@@ -220,11 +232,16 @@ def test_communication(
         # and if `use_no_sync=False`, we only run `num_iters` iterations
         # outside `no_sync()`
         num_iters = 3
-        with patch("torch.distributed.all_gather_into_tensor") as mock_all_gather, \
-                patch("torch.distributed.reduce_scatter_tensor") as mock_reduce_scatter:
+        with patch(
+            "torch.distributed.all_gather_into_tensor"
+        ) as mock_all_gather, patch(
+            "torch.distributed.reduce_scatter_tensor"
+        ) as mock_reduce_scatter:
+
             def reset_mocks():
                 mock_all_gather.reset_mock()
                 mock_reduce_scatter.reset_mock()
+
             # Check the communication cost when using `no_sync()`
             if use_no_sync:
                 for i in range(num_iters):
@@ -233,11 +250,14 @@ def reset_mocks():
                     num_all_gathers = mock_all_gather.call_count
                     num_reduce_scatters = mock_reduce_scatter.call_count
                     ref_num_all_gathers = self._get_ref_num_all_gathers(
-                        num_fsdp, sharding_strategy, is_first_iter=i == 0,
+                        num_fsdp,
+                        sharding_strategy,
+                        is_first_iter=i == 0,
                         is_last_iter_no_sync=i > 0,
                     )
                     ref_num_reduce_scatters = self._get_ref_num_reduce_scatters(
-                        num_fsdp, in_no_sync=True,
+                        num_fsdp,
+                        in_no_sync=True,
                     )
                     self.assertEqual(num_all_gathers, ref_num_all_gathers)
                     self.assertEqual(num_reduce_scatters, ref_num_reduce_scatters)
@@ -248,12 +268,14 @@ def reset_mocks():
                 num_all_gathers = mock_all_gather.call_count
                 num_reduce_scatters = mock_reduce_scatter.call_count
                 ref_num_all_gathers = self._get_ref_num_all_gathers(
-                    num_fsdp, sharding_strategy,
+                    num_fsdp,
+                    sharding_strategy,
                     is_first_iter=not use_no_sync and i == 0,
                     is_last_iter_no_sync=use_no_sync and i == 0,
                 )
                 ref_num_reduce_scatters = self._get_ref_num_reduce_scatters(
-                    num_fsdp, in_no_sync=False,
+                    num_fsdp,
+                    in_no_sync=False,
                 )
                 self.assertEqual(num_all_gathers, ref_num_all_gathers)
                 self.assertEqual(num_reduce_scatters, ref_num_reduce_scatters)
diff --git a/test/distributed/fsdp/test_fsdp_comm_hooks.py b/test/distributed/fsdp/test_fsdp_comm_hooks.py
index bfd710cdac486..125606fbff5cb 100644
--- a/test/distributed/fsdp/test_fsdp_comm_hooks.py
+++ b/test/distributed/fsdp/test_fsdp_comm_hooks.py
@@ -7,10 +7,9 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import distributed as dist
-from torch.distributed.distributed_c10d import _get_default_group
 from torch.distributed.algorithms._comm_hooks import default_hooks
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp import MixedPrecision
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision
 from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy
 from torch.testing._internal.common_distributed import (
     requires_nccl,
@@ -26,7 +25,6 @@
     run_tests,
 )
 
-
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
@@ -35,10 +33,11 @@
 BFLOAT16_AVAILABLE = (
     torch.cuda.is_available()
     and torch.version.cuda is not None
-    and int(torch.version.cuda.split('.')[0]) >= 11)
+    and int(torch.version.cuda.split(".")[0]) >= 11
+)
 
-class Net(nn.Module):
 
+class Net(nn.Module):
     def __init__(self, has_wrapping, sharding_strategy, mixed_precision=None):
         # to ensure determinism
         torch.manual_seed(0)
@@ -46,45 +45,40 @@ def __init__(self, has_wrapping, sharding_strategy, mixed_precision=None):
         super().__init__()
 
         if has_wrapping:
-            self.net = FSDP(nn.Sequential(
-                nn.Linear(8, 16),
-                nn.ReLU(),
-                FSDP(
-                    nn.Linear(16, 8),
-                    device_id=torch.cuda.current_device(),
-                    sharding_strategy=sharding_strategy,
-                    mixed_precision=mixed_precision,
-                )
-            ),
+            self.net = FSDP(
+                nn.Sequential(
+                    nn.Linear(8, 16),
+                    nn.ReLU(),
+                    FSDP(
+                        nn.Linear(16, 8),
+                        device_id=torch.cuda.current_device(),
+                        sharding_strategy=sharding_strategy,
+                        mixed_precision=mixed_precision,
+                    ),
+                ),
                 device_id=torch.cuda.current_device(),
                 sharding_strategy=sharding_strategy,
                 mixed_precision=mixed_precision,
             )
         else:
-            self.net = nn.Sequential(
-                nn.Linear(8, 16),
-                nn.ReLU(),
-                nn.Linear(16, 8)
-            )
+            self.net = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 8))
 
         self.out = nn.Linear(8, 4)
 
     def forward(self, x):
         return self.out(F.relu(self.net(x)))
 
+
 class DummyState(object):
 
-    __slots__ = [
-        "process_group",
-        "noise"
-    ]
+    __slots__ = ["process_group", "noise"]
 
     def __init__(self, process_group: dist.ProcessGroup, noise: int):
         self.process_group = process_group
         self.noise = noise
 
-class DummyHook(object):
 
+class DummyHook(object):
     def dummy_hook_for_no_shard_fsdp(self, state: DummyState, grad: torch.Tensor):
         """
         This communication hook is for illustration and testing purpose only.
@@ -104,7 +98,9 @@ def custom_reduce_scatter(self, output, input, group=None):
         """
         pass
 
-    def dummy_hook_for_sharded_fsdp(self, state: DummyState, grad: torch.Tensor, output: torch.Tensor):
+    def dummy_hook_for_sharded_fsdp(
+        self, state: DummyState, grad: torch.Tensor, output: torch.Tensor
+    ):
         """
         This communication hook is for illustration and testing purposes only.
         This communication hook is used during FSDP ``FULL_SHARD`` or ``SHARD_GRAD_OP`` training.
@@ -112,23 +108,21 @@ def dummy_hook_for_sharded_fsdp(self, state: DummyState, grad: torch.Tensor, out
         ``reduce_scatter`` for gradient communication and stores a sharded gradient in ``output``.
         """
         grad.add_(state.noise)
-        self.custom_reduce_scatter(
-            output, grad, group=state.process_group
-        )
+        self.custom_reduce_scatter(output, grad, group=state.process_group)
 
-class TestCommunicationHooks(FSDPTest):
 
+class TestCommunicationHooks(FSDPTest):
     @skip_if_lt_x_gpu(2)
     @parametrize(
         "sharding_strategy",
         [
             ShardingStrategy.NO_SHARD,
             ShardingStrategy.FULL_SHARD,
-            ShardingStrategy.SHARD_GRAD_OP
-        ])
+            ShardingStrategy.SHARD_GRAD_OP,
+        ],
+    )
     def test_default_communication_hook_behavior(
-        self,
-        sharding_strategy: Optional[ShardingStrategy]
+        self, sharding_strategy: Optional[ShardingStrategy]
     ):
         """
         Tests FSDP's default communication hook's behavior and correctness.
@@ -148,14 +142,16 @@ def test_default_communication_hook_behavior(
         net_default_hook = FSDP(
             net,
             device_id=torch.cuda.current_device(),
-            sharding_strategy=sharding_strategy
+            sharding_strategy=sharding_strategy,
         ).to(self.rank)
 
         # Check that default hook is set to `all_reduce` for `NO_SHARD`
         # or `reduce_scatter` for sharded cases
-        default_hook = default_hooks.reduce_scatter_hook\
-            if sharding_strategy != ShardingStrategy.NO_SHARD\
+        default_hook = (
+            default_hooks.reduce_scatter_hook
+            if sharding_strategy != ShardingStrategy.NO_SHARD
             else default_hooks.allreduce_hook
+        )
 
         for entry in FSDP.fsdp_modules(net_default_hook):
             self.assertEqual(entry._communication_hook, default_hook)
@@ -176,11 +172,13 @@ def test_default_communication_hook_behavior(
             self.assertEqual(
                 grad[0].item(),
                 expected_grad,
-                msg=f"Expected hook grad of {expected_grad} but got {grad[0].item()}")
+                msg=f"Expected hook grad of {expected_grad} but got {grad[0].item()}",
+            )
 
     def _get_submodules(self, fsdp_net):
         return [
-            submodule for submodule in FSDP.fsdp_modules(fsdp_net)
+            submodule
+            for submodule in FSDP.fsdp_modules(fsdp_net)
             if not submodule.check_is_root()
         ]
 
@@ -201,12 +199,11 @@ def _init_model(self, core, sharding_strategy, mixed_precision=None):
         [
             ShardingStrategy.NO_SHARD,
             ShardingStrategy.FULL_SHARD,
-            ShardingStrategy.SHARD_GRAD_OP
-        ])
+            ShardingStrategy.SHARD_GRAD_OP,
+        ],
+    )
     def test_default_communication_hook_initialization(
-        self,
-        has_wrapping: bool,
-        sharding_strategy: Optional[ShardingStrategy]
+        self, has_wrapping: bool, sharding_strategy: Optional[ShardingStrategy]
     ):
         """
         Tests FSDP's communication hook interface behavior.
@@ -219,45 +216,39 @@ def test_default_communication_hook_initialization(
         # Initialize a model
         fsdp_model_with_hook = self._init_model(
             Net(has_wrapping=has_wrapping, sharding_strategy=sharding_strategy),
-            sharding_strategy=sharding_strategy
+            sharding_strategy=sharding_strategy,
         )
 
         # Check that default hook is set to `all_reduce` for `NO_SHARD`
         # or `reduce_scatter` for sharded cases
-        default_hook = default_hooks.reduce_scatter_hook\
-            if sharding_strategy != ShardingStrategy.NO_SHARD\
+        default_hook = (
+            default_hooks.reduce_scatter_hook
+            if sharding_strategy != ShardingStrategy.NO_SHARD
             else default_hooks.allreduce_hook
+        )
 
         for entry in FSDP.fsdp_modules(fsdp_model_with_hook):
             self.assertEqual(entry._communication_hook, default_hook)
 
         dummy_state = DummyState(process_group=None, noise=1234)
-        dummy_hook = DummyHook.dummy_hook_for_no_shard_fsdp\
-            if sharding_strategy != ShardingStrategy.NO_SHARD\
+        dummy_hook = (
+            DummyHook.dummy_hook_for_no_shard_fsdp
+            if sharding_strategy != ShardingStrategy.NO_SHARD
             else DummyHook.dummy_hook_for_sharded_fsdp
-
-        fsdp_model_with_hook.register_comm_hook(
-            dummy_state,
-            dummy_hook
         )
 
+        fsdp_model_with_hook.register_comm_hook(dummy_state, dummy_hook)
+
         # Check that we can't register comm hook twice
-        with self.assertRaisesRegex(AssertionError, '^communication hook can be only registered once$'):
-            fsdp_model_with_hook.register_comm_hook(
-                dummy_state,
-                dummy_hook
-            )
+        with self.assertRaisesRegex(
+            AssertionError, "^communication hook can be only registered once$"
+        ):
+            fsdp_model_with_hook.register_comm_hook(dummy_state, dummy_hook)
 
         # Check dummy hook was registered for the root and all submodules if any
         for entry in FSDP.fsdp_modules(fsdp_model_with_hook):
-            self.assertEqual(
-                entry._communication_hook,
-                dummy_hook
-            )
-            self.assertEqual(
-                entry._communication_hook_state,
-                dummy_state
-            )
+            self.assertEqual(entry._communication_hook, dummy_hook)
+            self.assertEqual(entry._communication_hook_state, dummy_state)
 
         for entry in FSDP.fsdp_modules(fsdp_model_with_hook):
             entry._communication_hook = None
@@ -277,18 +268,17 @@ def test_default_communication_hook_initialization(
         with self.assertRaises(AssertionError):
             loss.backward()
 
-
     @skip_if_lt_x_gpu(2)
     @parametrize(
         "sharding_strategy",
         [
             ShardingStrategy.NO_SHARD,
             ShardingStrategy.FULL_SHARD,
-            ShardingStrategy.SHARD_GRAD_OP
-        ])
+            ShardingStrategy.SHARD_GRAD_OP,
+        ],
+    )
     def test_registering_hook_non_root(
-        self,
-        sharding_strategy: Optional[ShardingStrategy]
+        self, sharding_strategy: Optional[ShardingStrategy]
     ):
         """
         Tests FSDP's communication hook registering for submodules.
@@ -301,16 +291,21 @@ def test_registering_hook_non_root(
 
         fsdp_model_with_hook = self._init_model(
             Net(has_wrapping=True, sharding_strategy=sharding_strategy),
-            sharding_strategy=sharding_strategy
+            sharding_strategy=sharding_strategy,
         )
         dummy_state = DummyState(process_group=None, noise=1234)
-        dummy_hook = DummyHook.dummy_hook_for_no_shard_fsdp\
-            if sharding_strategy != ShardingStrategy.NO_SHARD\
+        dummy_hook = (
+            DummyHook.dummy_hook_for_no_shard_fsdp
+            if sharding_strategy != ShardingStrategy.NO_SHARD
             else DummyHook.dummy_hook_for_sharded_fsdp
+        )
         # Creating a list of non-root submodules to test
         submodules = self._get_submodules(fsdp_model_with_hook)
         # Check that assertion is raised for registering a comm hook on a non-root
-        with self.assertRaisesRegex(AssertionError, '^register_comm_hook can only be called on a root instance.$'):
+        with self.assertRaisesRegex(
+            AssertionError,
+            "^register_comm_hook can only be called on a root instance.$",
+        ):
             submodules[1].register_comm_hook(dummy_state, dummy_hook)
 
     @skip_if_lt_x_gpu(2)
@@ -319,11 +314,11 @@ def test_registering_hook_non_root(
         [
             ShardingStrategy.NO_SHARD,
             ShardingStrategy.FULL_SHARD,
-            ShardingStrategy.SHARD_GRAD_OP
-        ])
+            ShardingStrategy.SHARD_GRAD_OP,
+        ],
+    )
     def test_registering_hook_submodules(
-        self,
-        sharding_strategy: Optional[ShardingStrategy]
+        self, sharding_strategy: Optional[ShardingStrategy]
     ):
         """
         Tests FSDP's communication hook registering for submodules.
@@ -336,24 +331,28 @@ def test_registering_hook_submodules(
 
         fsdp_model_with_hook = self._init_model(
             Net(has_wrapping=True, sharding_strategy=sharding_strategy),
-            sharding_strategy=sharding_strategy
+            sharding_strategy=sharding_strategy,
         )
         dummy_state = DummyState(process_group=None, noise=1234)
-        dummy_hook = DummyHook.dummy_hook_for_no_shard_fsdp\
-            if sharding_strategy != ShardingStrategy.NO_SHARD\
+        dummy_hook = (
+            DummyHook.dummy_hook_for_no_shard_fsdp
+            if sharding_strategy != ShardingStrategy.NO_SHARD
             else DummyHook.dummy_hook_for_sharded_fsdp
+        )
         submodules = self._get_submodules(fsdp_model_with_hook)
 
         # Simulate a registration of a hook on a submodule
         submodules[1]._hook_registered = True
         # Check that an error is raised when some of submodules have a non-default hook assigned
-        with self.assertRaisesRegex(AssertionError, '^communication hook can be only registered once$'):
+        with self.assertRaisesRegex(
+            AssertionError, "^communication hook can be only registered once$"
+        ):
             fsdp_model_with_hook.register_comm_hook(dummy_state, dummy_hook)
 
         # Reinitialize the model
         fsdp_model_with_hook = self._init_model(
             Net(has_wrapping=True, sharding_strategy=sharding_strategy),
-            sharding_strategy=sharding_strategy
+            sharding_strategy=sharding_strategy,
         )
         submodules = self._get_submodules(fsdp_model_with_hook)
         submodules[1]._communication_hook = dummy_hook
@@ -361,29 +360,32 @@ def test_registering_hook_submodules(
         # Check that an error is raised when some of submodules have a non-default hook assigned
         with self.assertRaisesRegex(
             AssertionError,
-            f'^communication hook should be default, but it is {submodules[1]._communication_hook.__name__} instead$'
+            f"^communication hook should be default, but it is {submodules[1]._communication_hook.__name__} instead$",
         ):
-            fsdp_model_with_hook.register_comm_hook(
-                dummy_state,
-                dummy_hook
-            )
+            fsdp_model_with_hook.register_comm_hook(dummy_state, dummy_hook)
 
-    def _check_low_precision_hook(self, state, hook, sharding_strategy, dtype, has_wrapping):
+    def _check_low_precision_hook(
+        self, state, hook, sharding_strategy, dtype, has_wrapping
+    ):
         # keep everything deterministic for input data
         torch.manual_seed(0)
         torch.cuda.manual_seed(0)
 
         fsdp_with_hook = self._init_model(
             Net(has_wrapping=has_wrapping, sharding_strategy=sharding_strategy),
-            sharding_strategy=sharding_strategy
+            sharding_strategy=sharding_strategy,
         )
         fsdp_with_hook.register_comm_hook(state, hook)
 
         mp_only_grad = MixedPrecision(reduce_dtype=dtype)
         fsdp_with_mp = self._init_model(
-            Net(has_wrapping=has_wrapping, sharding_strategy=sharding_strategy, mixed_precision=mp_only_grad),
+            Net(
+                has_wrapping=has_wrapping,
+                sharding_strategy=sharding_strategy,
+                mixed_precision=mp_only_grad,
+            ),
             sharding_strategy=sharding_strategy,
-            mixed_precision=mp_only_grad
+            mixed_precision=mp_only_grad,
         )
 
         optim_hook = torch.optim.SGD(fsdp_with_hook.parameters(), lr=0.1)
@@ -403,7 +405,9 @@ def _check_low_precision_hook(self, state, hook, sharding_strategy, dtype, has_w
 
         dist.barrier()
 
-        for hook_param, mp_param in zip(fsdp_with_hook.parameters(), fsdp_with_mp.parameters()):
+        for hook_param, mp_param in zip(
+            fsdp_with_hook.parameters(), fsdp_with_mp.parameters()
+        ):
             self.assertEqual(hook_param.grad, mp_param.grad)
 
     @requires_nccl()
@@ -414,18 +418,19 @@ def _check_low_precision_hook(self, state, hook, sharding_strategy, dtype, has_w
         [
             ShardingStrategy.NO_SHARD,
             ShardingStrategy.FULL_SHARD,
-            ShardingStrategy.SHARD_GRAD_OP
-        ])
+            ShardingStrategy.SHARD_GRAD_OP,
+        ],
+    )
     def test_fp16_hook(
-        self,
-        has_wrapping: bool,
-        sharding_strategy: Optional[ShardingStrategy]
+        self, has_wrapping: bool, sharding_strategy: Optional[ShardingStrategy]
     ):
 
         state = default_hooks.LowPrecisionState(process_group=_get_default_group())
         hook = default_hooks.fp16_compress_hook
 
-        self._check_low_precision_hook(state, hook, sharding_strategy, torch.float16, has_wrapping)
+        self._check_low_precision_hook(
+            state, hook, sharding_strategy, torch.float16, has_wrapping
+        )
 
     @requires_nccl()
     @requires_nccl_version((2, 10), "Need NCCL 2.10+ for BF16_COMPRESS")
@@ -441,18 +446,19 @@ def test_fp16_hook(
         [
             ShardingStrategy.NO_SHARD,
             ShardingStrategy.FULL_SHARD,
-            ShardingStrategy.SHARD_GRAD_OP
-        ])
+            ShardingStrategy.SHARD_GRAD_OP,
+        ],
+    )
     def test_bf16_hook(
-        self,
-        has_wrapping: bool,
-        sharding_strategy: Optional[ShardingStrategy]
+        self, has_wrapping: bool, sharding_strategy: Optional[ShardingStrategy]
     ):
 
         state = default_hooks.LowPrecisionState(process_group=_get_default_group())
         hook = default_hooks.bf16_compress_hook
 
-        self._check_low_precision_hook(state, hook, sharding_strategy, torch.bfloat16, has_wrapping)
+        self._check_low_precision_hook(
+            state, hook, sharding_strategy, torch.bfloat16, has_wrapping
+        )
 
 
 instantiate_parametrized_tests(TestCommunicationHooks)
diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py
index 9557f2abcfbcb..93d5e4f45ad28 100644
--- a/test/distributed/fsdp/test_fsdp_core.py
+++ b/test/distributed/fsdp/test_fsdp_core.py
@@ -24,14 +24,14 @@
     MixtureOfExperts,
     NestedWrappedModule,
     NestedWrappedModuleWithDelay,
-    TransformerWithSharedParams,
     subtest_name,
+    TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -47,7 +47,11 @@
 
 params = "cpu_offload,sharding_strategy"
 cpu_offload_config = [CPUOffload(offload_params=True), CPUOffload(offload_params=False)]
-sharding_strategy_config = [None, ShardingStrategy.SHARD_GRAD_OP, ShardingStrategy.NO_SHARD]
+sharding_strategy_config = [
+    None,
+    ShardingStrategy.SHARD_GRAD_OP,
+    ShardingStrategy.NO_SHARD,
+]
 configs = list(itertools.product(cpu_offload_config, sharding_strategy_config))
 test_name_mapping = {
     str(CPUOffload(offload_params=True)): "offload_true",
@@ -259,7 +263,7 @@ def test_mixture_of_experts_with_delay_before_free(
             ref_init_fn=self._dummy_ddp_fn,
             cpu_offload=cpu_offload,
             sharding_strategy=sharding_strategy,
-            init_kwargs={"delay_before_free_ms": 250}
+            init_kwargs={"delay_before_free_ms": 250},
         )
 
 
@@ -397,7 +401,7 @@ def test_transformer_no_grad(self, mixed_precision):
             fsdp_model,
             num_steps=1,
             autocast=False,
-            mixed_precision=fsdp_kwargs["mixed_precision"]
+            mixed_precision=fsdp_kwargs["mixed_precision"],
         )
         input = fsdp_model.module.get_input(torch.device("cuda"))
         # Run a forward in eval mode
diff --git a/test/distributed/fsdp/test_fsdp_exec_order.py b/test/distributed/fsdp/test_fsdp_exec_order.py
index eaf3066d1bad0..6cd00e5302181 100644
--- a/test/distributed/fsdp/test_fsdp_exec_order.py
+++ b/test/distributed/fsdp/test_fsdp_exec_order.py
@@ -11,10 +11,10 @@
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
diff --git a/test/distributed/fsdp/test_fsdp_freezing_weights.py b/test/distributed/fsdp/test_fsdp_freezing_weights.py
index 23836130818c9..430e47adf71e0 100644
--- a/test/distributed/fsdp/test_fsdp_freezing_weights.py
+++ b/test/distributed/fsdp/test_fsdp_freezing_weights.py
@@ -10,18 +10,14 @@
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.nn.parallel import DistributedDataParallel
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import (
-    FSDPTest,
-    get_full_params,
-)
+from torch.testing._internal.common_fsdp import FSDPTest, get_full_params
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
-
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
diff --git a/test/distributed/fsdp/test_fsdp_grad_acc.py b/test/distributed/fsdp/test_fsdp_grad_acc.py
index 1e44f865027d0..ef20d2a2db76e 100644
--- a/test/distributed/fsdp/test_fsdp_grad_acc.py
+++ b/test/distributed/fsdp/test_fsdp_grad_acc.py
@@ -8,8 +8,7 @@
 
 import torch
 from torch import distributed as dist
-from torch.distributed.fsdp import CPUOffload
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import CPUOffload, FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     BackwardPrefetch,
     ShardingStrategy,
@@ -22,10 +21,10 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -53,16 +52,14 @@ class _GradAccConfig:
             manager as the way to accumulate gradients.
         num_iters (int): Number of iterations to accumulate gradients.
     """
+
     use_no_sync: bool
     num_iters: int
 
     def __repr__(self) -> str:
         # Override to remove any spaces in the string to appease the internal
         # build's test name parser
-        return (
-            f"(use_no_sync={self.use_no_sync},"
-            f"num_iters={self.num_iters})"
-        )
+        return f"(use_no_sync={self.use_no_sync}," f"num_iters={self.num_iters})"
 
 
 @dataclass
@@ -71,14 +68,13 @@ class _GradAccConfigs:
     This wraps a :class:`list` of :class:`_GradAccConfig` instances with the
     sole purpose of overriding :meth:`__repr__` to remove spaces.
     """
+
     configs: List[_GradAccConfig]
 
     def __repr__(self) -> str:
         # Override to remove any spaces in the string to appease the internal
         # build's test name parser
-        return (
-            "[" + ",".join(config.__repr__() for config in self.configs) + "]"
-        )
+        return "[" + ",".join(config.__repr__() for config in self.configs) + "]"
 
 
 class TestGradAcc(FSDPTest):
@@ -118,9 +114,8 @@ def _test_grad_acc(
         """
         # Gradient accumulation outside `no_sync()` is not currently compatible
         # with CPU offloading
-        if (
-            cpu_offload.offload_params
-            and any(not config.use_no_sync for config in configs)
+        if cpu_offload.offload_params and any(
+            not config.use_no_sync for config in configs
         ):
             return
         old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32
@@ -144,7 +139,9 @@ def _test_grad_acc(
             )
             device = torch.device("cuda")
             optim = torch.optim.SGD(
-                fsdp_model.parameters(), lr=0.01, momentum=0.9,
+                fsdp_model.parameters(),
+                lr=0.01,
+                momentum=0.9,
             )
 
             # Generate the sequence of batches, each containing the same data
@@ -152,16 +149,16 @@ def _test_grad_acc(
             def permute_tensor(x: torch.Tensor):
                 return x.view(-1)[torch.randperm(x.numel())].view_as(x)
 
-            batch: Tuple[torch.Tensor, ...] = \
-                fsdp_model.module.get_input(device)
+            batch: Tuple[torch.Tensor, ...] = fsdp_model.module.get_input(device)
             batches: List[Tuple[torch.Tensor, ...]] = [batch]
             num_iters_to_acc = sum(config.num_iters for config in configs)
             for _ in range(num_iters_to_acc - 1):
                 batches.append(tuple(permute_tensor(t) for t in batch))
             for (batch1, batch2) in itertools.combinations(batches, r=2):
                 for t1, t2 in zip(batch1, batch2):
-                    assert not torch.all(t1 == t2), \
-                        "Check the test to make sure that batches are distinct"
+                    assert not torch.all(
+                        t1 == t2
+                    ), "Check the test to make sure that batches are distinct"
 
             # Concatenate the batches along the given batch dimension
             concat_batch: Tuple[torch.Tensor, ...] = tuple(
@@ -173,17 +170,18 @@ def permute_tensor(x: torch.Tensor):
             output = fsdp_model(*concat_batch)
             ref_loss = fsdp_model.module.get_loss(concat_batch, output)
             ref_loss.backward()
-            ref_grads = [
-                p.grad.detach().clone() for p in fsdp_model.parameters()
-            ]
+            ref_grads = [p.grad.detach().clone() for p in fsdp_model.parameters()]
 
             # Compute and accumulate the gradients
             fsdp_model.zero_grad()
             losses = []
             batch_idx = 0
             for config in configs:
-                sync_context = fsdp_model.no_sync() if config.use_no_sync \
+                sync_context = (
+                    fsdp_model.no_sync()
+                    if config.use_no_sync
                     else contextlib.suppress()
+                )
                 with sync_context:
                     for _ in range(config.num_iters):
                         if batch_idx == num_iters_to_acc - 1:
@@ -199,9 +197,7 @@ def permute_tensor(x: torch.Tensor):
             loss.backward()
             losses.append(loss)
             acc_loss = sum(losses)
-            acc_grads = [
-                p.grad.detach().clone() for p in fsdp_model.parameters()
-            ]
+            acc_grads = [p.grad.detach().clone() for p in fsdp_model.parameters()]
 
             # Compare the losses and gradients
             torch.testing.assert_close(ref_loss, acc_loss)
@@ -231,17 +227,21 @@ def _get_subtest_config(self) -> Dict[str, List[Any]]:
     @parametrize(
         "configs",
         [
-            _GradAccConfigs([
-                _GradAccConfig(use_no_sync=True, num_iters=3),
-                _GradAccConfig(use_no_sync=False, num_iters=3),
-                _GradAccConfig(use_no_sync=True, num_iters=3),
-            ]),
-            _GradAccConfigs([
-                _GradAccConfig(use_no_sync=False, num_iters=3),
-                _GradAccConfig(use_no_sync=True, num_iters=3),
-                _GradAccConfig(use_no_sync=False, num_iters=3),
-            ]),
-        ]
+            _GradAccConfigs(
+                [
+                    _GradAccConfig(use_no_sync=True, num_iters=3),
+                    _GradAccConfig(use_no_sync=False, num_iters=3),
+                    _GradAccConfig(use_no_sync=True, num_iters=3),
+                ]
+            ),
+            _GradAccConfigs(
+                [
+                    _GradAccConfig(use_no_sync=False, num_iters=3),
+                    _GradAccConfig(use_no_sync=True, num_iters=3),
+                    _GradAccConfig(use_no_sync=False, num_iters=3),
+                ]
+            ),
+        ],
     )
     @parametrize(
         "cpu_offload",
@@ -253,7 +253,7 @@ def _get_subtest_config(self) -> Dict[str, List[Any]]:
             ShardingStrategy.FULL_SHARD,
             ShardingStrategy.SHARD_GRAD_OP,
             ShardingStrategy.NO_SHARD,
-        ]
+        ],
     )
     def test_grad_acc(
         self,
diff --git a/test/distributed/fsdp/test_fsdp_ignored_modules.py b/test/distributed/fsdp/test_fsdp_ignored_modules.py
index 60c3fd6f88110..83babee7d482f 100644
--- a/test/distributed/fsdp/test_fsdp_ignored_modules.py
+++ b/test/distributed/fsdp/test_fsdp_ignored_modules.py
@@ -14,10 +14,10 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -74,12 +74,15 @@ def forward(self, x):
 
 class ModelWithIgnoredModules(Model):
     """Adds a variable number of :class:`IgnoredModule` to ``self.layer1``."""
+
     def __init__(self, num_ignored: int) -> None:
         assert num_ignored >= 0
         super().__init__()
-        layer1_modules = [torch.nn.Linear(5, 4), torch.nn.Linear(4, 4)] + \
-            [IgnoredModule(4, 4) for _ in range(num_ignored)] + \
-            [torch.nn.Linear(4, 4)]
+        layer1_modules = (
+            [torch.nn.Linear(5, 4), torch.nn.Linear(4, 4)]
+            + [IgnoredModule(4, 4) for _ in range(num_ignored)]
+            + [torch.nn.Linear(4, 4)]
+        )
         self.layer1 = torch.nn.Sequential(*layer1_modules)
 
 
@@ -143,9 +146,7 @@ def test_ignored_modules_nested(self):
         # the ignored nested sequential's parameters
         nonwrapped_model = Model()
         total_numel = sum(p.numel() for p in nonwrapped_model.parameters())
-        ignored_numel = sum(
-            p.numel() for p in nonwrapped_model.layer1.parameters()
-        )
+        ignored_numel = sum(p.numel() for p in nonwrapped_model.layer1.parameters())
         nonignored_numel = total_numel - ignored_numel
         with FSDP.summon_full_params(wrapped_model):
             flat_param_numel = wrapped_model.params[0].numel()
@@ -176,7 +177,9 @@ def test_ignored_modules_invalid(self):
 
     @skip_if_lt_x_gpu(2)
     @parametrize("pass_ignored_modules_to_root", [False, True])
-    def test_diff_ignored_modules_across_ranks(self, pass_ignored_modules_to_root: bool):
+    def test_diff_ignored_modules_across_ranks(
+        self, pass_ignored_modules_to_root: bool
+    ):
         """
         Tests ignoring different modules across ranks.
 
@@ -196,9 +199,11 @@ def test_diff_ignored_modules_across_ranks(self, pass_ignored_modules_to_root: b
         ]
         model.layer1 = FSDP(model.layer1, ignored_modules=layer1_ignored_modules)
         model.layer3 = FSDP(model.layer3)
-        model_ignored_modules = [
-            m for m in model.modules() if isinstance(m, IgnoredModule)
-        ] if pass_ignored_modules_to_root else []
+        model_ignored_modules = (
+            [m for m in model.modules() if isinstance(m, IgnoredModule)]
+            if pass_ignored_modules_to_root
+            else []
+        )
         wrapped_model = FSDP(model, ignored_modules=model_ignored_modules)
         optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3)
         self._train_model(wrapped_model, optim, 3)
diff --git a/test/distributed/fsdp/test_fsdp_input.py b/test/distributed/fsdp/test_fsdp_input.py
index 136b65c3b28ec..06a516faaa97b 100644
--- a/test/distributed/fsdp/test_fsdp_input.py
+++ b/test/distributed/fsdp/test_fsdp_input.py
@@ -8,18 +8,15 @@
 from torch.nn import Linear, Module
 from torch.optim import SGD
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import (
-    FSDPTest,
-)
+from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
     subtest,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
-
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
diff --git a/test/distributed/fsdp/test_fsdp_memory.py b/test/distributed/fsdp/test_fsdp_memory.py
index b26aa249dc798..fe2ad8879ad1b 100644
--- a/test/distributed/fsdp/test_fsdp_memory.py
+++ b/test/distributed/fsdp/test_fsdp_memory.py
@@ -8,18 +8,15 @@
 from torch import distributed as dist
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import (
-    FSDPTest,
-)
+from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 from torch.utils.checkpoint import checkpoint
 
-
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
diff --git a/test/distributed/fsdp/test_fsdp_meta.py b/test/distributed/fsdp/test_fsdp_meta.py
index 1aa426800db62..09e5c7ae83292 100644
--- a/test/distributed/fsdp/test_fsdp_meta.py
+++ b/test/distributed/fsdp/test_fsdp_meta.py
@@ -6,20 +6,19 @@
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp.wrap import always_wrap_policy as always_wrap
-from torch.distributed.fsdp.wrap import wrap, enable_wrap
-from torch.testing._internal.common_fsdp import (
-    FSDPTest,
+from torch.distributed.fsdp.wrap import (
+    always_wrap_policy as always_wrap,
+    enable_wrap,
+    wrap,
 )
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
-    run_tests,
-    parametrize,
     instantiate_parametrized_tests,
+    parametrize,
+    run_tests,
     sandcastle_skip_if,
-)
-from torch.testing._internal.common_distributed import (
-    skip_if_lt_x_gpu,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 _TORCHDISTX_AVAIL = True
@@ -47,10 +46,12 @@ def _reset_params_if_meta(is_meta, model):
     if is_meta:
         model.reset_parameters()
 
+
 class MyLinear(nn.Linear):
     """
     Linear layer with deterministic reset_parameters for testing.
     """
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -58,6 +59,7 @@ def reset_parameters(self, *args, **kwargs):
         with torch.no_grad():
             self.weight.fill_(1)
 
+
 class MyModel(nn.Module):
     def __init__(self, device):
         super().__init__()
@@ -90,6 +92,7 @@ def reset_parameters(self):
             if not isinstance(m, FSDP):
                 m.reset_parameters()
 
+
 def _init_with_reset_params(module):
     """
     to_empty + reset_parameters() init function example for modules
@@ -101,6 +104,7 @@ def _init_with_reset_params(module):
     with torch.no_grad():
         module.reset_parameters()
 
+
 def _init_with_torchdistX(module):
     """
     torchdistX-based deferred module initialization function example
@@ -113,6 +117,7 @@ def check_fn(k):
 
     deferred_init.materialize_module(module, check_fn=check_fn)
 
+
 class TestFSDPWithMetaDevice(FSDPTest):
     @property
     def world_size(self):
@@ -148,7 +153,7 @@ def _test_simple_model_with_meta_device(self, meta_module_fn, init_fn=None):
         regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3)
 
         self._compare_fsdp(fsdp_meta, fsdp_regular)
-        inp = torch.randn(10, 2, device='cuda')
+        inp = torch.randn(10, 2, device="cuda")
         fsdp_meta(inp).sum().backward()
         fsdp_regular(inp).sum().backward()
         meta_opt.step()
@@ -176,6 +181,7 @@ def _test_simple_model_with_meta_device(self, meta_module_fn, init_fn=None):
     def test_simple_model_with_meta_device_reset_params(self):
         def meta_module_fn():
             return MyModel(device="meta")
+
         self._test_simple_model_with_meta_device(
             meta_module_fn, _init_with_reset_params
         )
@@ -184,11 +190,13 @@ def meta_module_fn():
     def test_simple_model_with_meta_device_default_init(self):
         def meta_module_fn():
             return MyModel(device="meta")
+
         self._test_simple_model_with_meta_device(meta_module_fn)
 
     @skip_if_lt_x_gpu(2)
     @sandcastle_skip_if(
-        not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX"
+        not _TORCHDISTX_AVAIL,
+        "Test requires torchdistX: https://github.com/pytorch/torchdistX",
     )
     def test_simple_model_with_torchdistX_default_init(self):
         def meta_module_fn():
@@ -198,15 +206,20 @@ def meta_module_fn():
 
     @skip_if_lt_x_gpu(2)
     @sandcastle_skip_if(
-        not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX"
+        not _TORCHDISTX_AVAIL,
+        "Test requires torchdistX: https://github.com/pytorch/torchdistX",
     )
     def test_simple_model_with_torchdistX_init_fn(self):
         def meta_module_fn():
             return deferred_init.deferred_init(MyModel, device="cuda")
 
-        self._test_simple_model_with_meta_device(meta_module_fn, init_fn=_init_with_torchdistX)
+        self._test_simple_model_with_meta_device(
+            meta_module_fn, init_fn=_init_with_torchdistX
+        )
 
-    def _test_nested_model_with_meta_device(self, auto_wrap, meta_module_fn, init_fn=None):
+    def _test_nested_model_with_meta_device(
+        self, auto_wrap, meta_module_fn, init_fn=None
+    ):
         if auto_wrap:
             module = meta_module_fn()
             is_meta = next(module.parameters()).is_meta
@@ -225,7 +238,8 @@ def _test_nested_model_with_meta_device(self, auto_wrap, meta_module_fn, init_fn
             regular_opt = torch.optim.SGD(fsdp_regular.parameters(), lr=1e-3)
         else:
             with enable_wrap(
-                wrapper_cls=FSDP, param_init_fn=init_fn,
+                wrapper_cls=FSDP,
+                param_init_fn=init_fn,
             ):
                 module = meta_module_fn()
                 is_meta = next(module.parameters()).is_meta
@@ -246,7 +260,7 @@ def _test_nested_model_with_meta_device(self, auto_wrap, meta_module_fn, init_fn
 
         # Compare it before training
         self._compare_fsdp(fsdp_meta, fsdp_regular)
-        inp = torch.randn(10, 2, device='cuda')
+        inp = torch.randn(10, 2, device="cuda")
         fsdp_meta(inp).sum().backward()
         fsdp_regular(inp).sum().backward()
         meta_opt.step()
@@ -260,7 +274,9 @@ def meta_module_fn():
             return NestedModel(device="meta")
 
         self._test_nested_model_with_meta_device(
-            auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, init_fn=_init_with_reset_params
+            auto_wrap=auto_wrap,
+            meta_module_fn=meta_module_fn,
+            init_fn=_init_with_reset_params,
         )
 
     @skip_if_lt_x_gpu(2)
@@ -270,12 +286,14 @@ def meta_module_fn():
             return NestedModel(device="meta")
 
         self._test_nested_model_with_meta_device(
-            auto_wrap=auto_wrap, meta_module_fn=meta_module_fn,
+            auto_wrap=auto_wrap,
+            meta_module_fn=meta_module_fn,
         )
 
     @skip_if_lt_x_gpu(2)
     @sandcastle_skip_if(
-        not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX"
+        not _TORCHDISTX_AVAIL,
+        "Test requires torchdistX: https://github.com/pytorch/torchdistX",
     )
     @parametrize("auto_wrap", [True, False])
     def test_nested_model_with_torchdistX_default_init(self, auto_wrap):
@@ -288,7 +306,8 @@ def meta_module_fn():
 
     @skip_if_lt_x_gpu(2)
     @sandcastle_skip_if(
-        not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX"
+        not _TORCHDISTX_AVAIL,
+        "Test requires torchdistX: https://github.com/pytorch/torchdistX",
     )
     @parametrize("auto_wrap", [True, False])
     def test_nested_model_with_torchdistX_init_fn(self, auto_wrap):
@@ -296,7 +315,9 @@ def meta_module_fn():
             return deferred_init.deferred_init(NestedModel, device="cuda")
 
         self._test_nested_model_with_meta_device(
-            auto_wrap=auto_wrap, meta_module_fn=meta_module_fn, init_fn=_init_with_torchdistX,
+            auto_wrap=auto_wrap,
+            meta_module_fn=meta_module_fn,
+            init_fn=_init_with_torchdistX,
         )
 
     def _test_bad_arg(self, meta_module_fn):
@@ -306,7 +327,8 @@ def _test_bad_arg(self, meta_module_fn):
 
     @skip_if_lt_x_gpu(2)
     @sandcastle_skip_if(
-        not _TORCHDISTX_AVAIL, "Test requires torchdistX: https://github.com/pytorch/torchdistX"
+        not _TORCHDISTX_AVAIL,
+        "Test requires torchdistX: https://github.com/pytorch/torchdistX",
     )
     def test_bad_arg_torchdistx(self):
         def meta_module_fn():
diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index ca566b984002a..98cd6488ae5e7 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -1,36 +1,36 @@
 # Owner(s): ["oncall: distributed"]
 
-from copy import deepcopy
 import functools
 import sys
 from collections import namedtuple
 from contextlib import suppress
+from copy import deepcopy
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed.fsdp import FlatParameter
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp import ShardingStrategy, CPUOffload
-from torch.distributed.fsdp.wrap import (
-    always_wrap_policy,
-    transformer_auto_wrap_policy,
+from torch.distributed.fsdp import (
+    CPUOffload,
+    FlatParameter,
+    FullyShardedDataParallel as FSDP,
+    ShardingStrategy,
 )
+from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
+    _assert_module_states,
     CUDAInitMode,
     FSDPInitMode,
     FSDPTest,
     NestedWrappedModule,
     TransformerWithSharedParams,
-    _assert_module_states,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -71,9 +71,7 @@ def forward(self, x):
         t = torch.ones(1, device="cuda", requires_grad=True)
 
         MyOutputType = namedtuple(
-            "MyOutputType",
-            ["a", "b", "c", "d"],
-            defaults=(t, t, t, t)
+            "MyOutputType", ["a", "b", "c", "d"], defaults=(t, t, t, t)
         )
 
         inp = MyOutputType()
@@ -89,7 +87,6 @@ def forward(self, x):
 
     @skip_if_lt_x_gpu(2)
     def test_fsdp_not_all_outputs_used_in_loss(self):
-
         class MyModule(nn.Module):
             def __init__(self):
                 super().__init__()
@@ -108,10 +105,7 @@ def _check_resharded(fsdp_module):
                     full_param = param._full_param_padded
                     self.assertEqual(full_param.storage().size(), 0)
 
-                self.assertEqual(
-                    param.data_ptr(),
-                    param._local_shard.data_ptr()
-                )
+                self.assertEqual(param.data_ptr(), param._local_shard.data_ptr())
 
         def _check_equal(local, fsdp):
             with FSDP.summon_full_params(fsdp):
@@ -121,7 +115,7 @@ def _check_equal(local, fsdp):
         for sharding_strategy in [
             ShardingStrategy.FULL_SHARD,
             ShardingStrategy.SHARD_GRAD_OP,
-            ShardingStrategy.NO_SHARD
+            ShardingStrategy.NO_SHARD,
         ]:
             with self.subTest(sharding_strategy=sharding_strategy):
                 fsdp_ctor = functools.partial(FSDP, sharding_strategy=sharding_strategy)
@@ -160,7 +154,10 @@ def _check_equal(local, fsdp):
                     # Ensure at least some change from previous params, otherwise
                     # above check would be vacuously true.
                     self.assertTrue(
-                        any(not torch.equal(p1, p2) for p1, p2 in zip(prev_params, m_local.parameters()))
+                        any(
+                            not torch.equal(p1, p2)
+                            for p1, p2 in zip(prev_params, m_local.parameters())
+                        )
                     )
                     prev_params = [p.clone() for p in local_m.parameters()]
                     opt.zero_grad()
@@ -168,7 +165,6 @@ def _check_equal(local, fsdp):
 
                 dist.barrier()
 
-
     @skip_if_lt_x_gpu(2)
     @parametrize("use_second_layer", [True, False])
     @parametrize("sharding_strategy", [ShardingStrategy.NO_SHARD, None])
@@ -193,10 +189,10 @@ def forward(self, x, y):
         fsdp = FSDP(
             MyModel().cuda(),
             sharding_strategy=sharding_strategy,
-            auto_wrap_policy=always_wrap_policy
+            auto_wrap_policy=always_wrap_policy,
         )
-        x = torch.randn(10, 10, device='cuda')
-        y = torch.randn(10, 10, device='cuda')
+        x = torch.randn(10, 10, device="cuda")
+        y = torch.randn(10, 10, device="cuda")
         for i in range(4):
             if use_second_layer:
                 a, b = fsdp(x, y)
@@ -241,6 +237,7 @@ def test_fsdp_device_id_cpu_offload(self):
         Ensures that even if device_id is specified but we have
         CPU offload, module is on CPU after init.
         """
+
         class MyModel(nn.Module):
             def __init__(self):
                 super().__init__()
@@ -256,7 +253,7 @@ def forward(self, x):
             model,
             auto_wrap_policy=always_wrap_policy,
             cpu_offload=CPUOffload(offload_params=True),
-            device_id=torch.cuda.current_device()
+            device_id=torch.cuda.current_device(),
         )
 
         cpu_device = torch.device("cpu")
@@ -281,7 +278,8 @@ def test_fsdp_device_id(self, use_index):
           without specifying a device ID (i.e. ``torch.device("cuda")``) warns
         """
         dev_id = (
-            torch.cuda.current_device() if use_index
+            torch.cuda.current_device()
+            if use_index
             else torch.device("cuda", torch.cuda.current_device())
         )
 
@@ -289,8 +287,7 @@ def _check_device_matches(module, device_id):
             """Checks that the ``FlatParameter``s in ``module`` have device
             matching ``device_id``."""
             devices = {
-                p.device for p in module.parameters()
-                if isinstance(p, FlatParameter)
+                p.device for p in module.parameters() if isinstance(p, FlatParameter)
             }
             assert len(devices) > 0
             self.assertEqual(1, len(devices))
@@ -328,11 +325,10 @@ def _check_device_matches(module, device_id):
                 self.process_group,
                 FSDPInitMode.RECURSIVE,
                 CUDAInitMode.CUDA_BEFORE,
-                fsdp_kwargs={"device_id": torch.device("cuda")}
+                fsdp_kwargs={"device_id": torch.device("cuda")},
             )
         _check_device_matches(
-            nested_wrapped_module,
-            torch.device("cuda", torch.cuda.current_device())
+            nested_wrapped_module, torch.device("cuda", torch.cuda.current_device())
         )
 
     @skip_if_lt_x_gpu(2)
@@ -340,10 +336,9 @@ def test_module_device_mismatches_device_id(self):
         """Tests that specifying a ``device_id`` argument to FSDP for a GPU
         module that does not match the GPU device ID raises an error."""
         context = (
-            self.assertRaisesRegex(
-                ValueError,
-                f"cuda:{self.rank} vs cuda:0"
-            ) if self.rank != 0 else suppress()
+            self.assertRaisesRegex(ValueError, f"cuda:{self.rank} vs cuda:0")
+            if self.rank != 0
+            else suppress()
         )
         with context:
             NestedWrappedModule.init(
@@ -360,6 +355,7 @@ def test_module_device_mismatches_device_id(self):
     def test_multi_device_not_supported(self):
         """Tests that wrapping a multi-device module (i.e. with submodules on
         both GPU and CPU) with FSDP raises an error."""
+
         class MultiDeviceModule(nn.Module):
             def __init__(self):
                 super().__init__()
@@ -392,11 +388,14 @@ def test_no_params(self):
         # is computed as torch.cuda.current_device when there are no params.
         no_params = nn.ReLU().cuda()
         context = (
-            self.assertRaisesRegex(
-                ValueError,
-                f"Inconsistent.*cuda:{self.rank} vs cuda:0"
+            (
+                self.assertRaisesRegex(
+                    ValueError, f"Inconsistent.*cuda:{self.rank} vs cuda:0"
+                )
             )
-        ) if self.rank != 0 else suppress()
+            if self.rank != 0
+            else suppress()
+        )
         with context:
             module = FSDP(no_params, device_id=0)
 
@@ -439,7 +438,7 @@ def test_cpu_init_with_sync_module_states(self):
         )
         with self.assertRaisesRegex(
             ValueError,
-            "Module has CPU parameters, but sync_module_states=True is specified."
+            "Module has CPU parameters, but sync_module_states=True is specified.",
         ):
             FSDP(nested_wrapped_module, self.process_group, sync_module_states=True)
 
@@ -457,6 +456,7 @@ def test_fsdp_same_model_across_ranks(self):
         FSDP broadcasts model from rank 0 to ensure it starts off with the same
         values.
         """
+
         class MyModel(nn.Module):
             def __init__(self, rank):
                 super().__init__()
@@ -467,19 +467,27 @@ def __init__(self, rank):
                 self.register_buffer("buffer", torch.ones(1) * rank)
 
         m = MyModel(self.rank).cuda()
-        _assert_module_states(m, process_group=self.process_group, assert_fn=self.assertNotEqual)
+        _assert_module_states(
+            m, process_group=self.process_group, assert_fn=self.assertNotEqual
+        )
         # Passing sync_module_states into FSDP makes model the same during init.
         fsdp = FSDP(m, sync_module_states=True)
         with fsdp.summon_full_params(fsdp):
-            _assert_module_states(fsdp, process_group=self.process_group, assert_fn=self.assertEqual)
+            _assert_module_states(
+                fsdp, process_group=self.process_group, assert_fn=self.assertEqual
+            )
 
         # sync_module_states also works with CPU module with device_id passed in
         m = MyModel(self.rank)
-        _assert_module_states(m, process_group=self.process_group, assert_fn=self.assertNotEqual)
+        _assert_module_states(
+            m, process_group=self.process_group, assert_fn=self.assertNotEqual
+        )
         # Passing sync_module_states into FSDP makes model the same during init.
         fsdp = FSDP(m, device_id=torch.cuda.current_device(), sync_module_states=True)
         with fsdp.summon_full_params(fsdp):
-            _assert_module_states(fsdp, process_group=self.process_group, assert_fn=self.assertEqual)
+            _assert_module_states(
+                fsdp, process_group=self.process_group, assert_fn=self.assertEqual
+            )
 
 
 instantiate_parametrized_tests(TestFSDPMisc)
diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py
index 4440e394179ab..a65d0378a3a94 100644
--- a/test/distributed/fsdp/test_fsdp_mixed_precision.py
+++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py
@@ -11,9 +11,13 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import distributed as dist
-from torch.distributed.fsdp import BackwardPrefetch, CPUOffload
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
+from torch.distributed.fsdp import (
+    BackwardPrefetch,
+    CPUOffload,
+    FullyShardedDataParallel as FSDP,
+    MixedPrecision,
+    ShardingStrategy,
+)
 from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
 from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy
 from torch.nn.modules.batchnorm import _BatchNorm
@@ -23,19 +27,20 @@
     CUDAInitMode,
     FSDPInitMode,
     FSDPTest,
-    TransformerWithSharedParams,
     subtest_name,
+    TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
     sandcastle_skip_if,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 try:
     import torchvision
+
     HAS_TORCHVISION = True
 except ImportError:
     HAS_TORCHVISION = False
@@ -66,7 +71,9 @@
 mp_only_reduce = MixedPrecision(reduce_dtype=torch.float16)
 
 # Only parameters are cast (thus comm should happen in the param_dtype precision)
-mp_only_param_and_buf = MixedPrecision(param_dtype=torch.float16, buffer_dtype=torch.float16)
+mp_only_param_and_buf = MixedPrecision(
+    param_dtype=torch.float16, buffer_dtype=torch.float16
+)
 
 # Nothing is cast (thus param, comm, grad, and buffer should be in the full precision)
 mp_no_mixed_precision = MixedPrecision()
@@ -80,7 +87,7 @@
     mp_diff_buffer_and_reduce = MixedPrecision(
         param_dtype=torch.float16,
         buffer_dtype=torch.bfloat16,
-        reduce_dtype=torch.float32
+        reduce_dtype=torch.float32,
     )
     mp_configs.extend([mp_diff_buffer_and_reduce])
 
@@ -88,18 +95,18 @@
 _BUFFER_ORIG_DTYPE = torch.float64
 
 params = "mp_config,cpu_offload,full_precision_param_dtype,enable_sharded_grad_scaler"
-cpu_offload_config = [
-    CPUOffload(offload_params=True), CPUOffload(offload_params=False)
-]
+cpu_offload_config = [CPUOffload(offload_params=True), CPUOffload(offload_params=False)]
 full_precision_param_dtype_config = [torch.float32, torch.float64]
 enable_sharded_grad_scaler = ["enable_sharded_grad_scaler", None]
 
-configs = list(product(
-    mp_configs,
-    cpu_offload_config,
-    full_precision_param_dtype_config,
-    enable_sharded_grad_scaler,
-))
+configs = list(
+    product(
+        mp_configs,
+        cpu_offload_config,
+        full_precision_param_dtype_config,
+        enable_sharded_grad_scaler,
+    )
+)
 
 test_name_mapping = {
     str(CPUOffload(offload_params=True)): "offload_true",
@@ -110,18 +117,21 @@
     str(mp_no_mixed_precision): "mp_no_mp",
     str(torch.float32): "fp32",
     str(torch.float64): "fp64",
-    "enable_sharded_grad_scaler": "enable_sharded_grad_scaler"
+    "enable_sharded_grad_scaler": "enable_sharded_grad_scaler",
 }
 
 if nccl_supports_bf16:
-    test_name_mapping.update({
-        str(mp_diff_buffer_and_reduce): "mp_diff_buffer_reduce",
-    })
+    test_name_mapping.update(
+        {
+            str(mp_diff_buffer_and_reduce): "mp_diff_buffer_reduce",
+        }
+    )
 
 subtest_name = partial(subtest_name, test_name_mapping)
 
 _CURRENT_FULL_PRECISION_PARAM_DTYPE = None
 
+
 @contextlib.contextmanager
 def patch_reduce_scatter(new_reduce_scatter, full_precision_param_dtype):
     """
@@ -138,14 +148,16 @@ def patch_reduce_scatter(new_reduce_scatter, full_precision_param_dtype):
         dist.reduce_scatter_tensor = orig_reduce_scatter
         _CURRENT_FULL_PRECISION_PARAM_DTYPE = None
 
+
 class LinearMixedPrecision(nn.Module):
     """
     A linear module with extra checks for mixed precision training.
     """
+
     def __init__(self, param_dtype):
         super().__init__()
         self.lin = nn.Linear(10, 10, bias=False).to(param_dtype)
-        self.register_buffer('buffer', torch.randn((1, 2), dtype=_BUFFER_ORIG_DTYPE))
+        self.register_buffer("buffer", torch.randn((1, 2), dtype=_BUFFER_ORIG_DTYPE))
         self._orig_param_type = param_dtype
         self._orig_buffer_dtype = _BUFFER_ORIG_DTYPE
 
@@ -153,11 +165,13 @@ def forward(self, tup):
         # Param and input should be the mixed precision type
         inp, cls, fsdp, mp_config, full_precision_param_dtype = tup
         expected_param_type = (
-            mp_config.param_dtype if mp_config.param_dtype is not None
+            mp_config.param_dtype
+            if mp_config.param_dtype is not None
             else self._orig_param_type
         )
         expected_buffer_type = (
-            mp_config.buffer_dtype if mp_config.buffer_dtype is not None
+            mp_config.buffer_dtype
+            if mp_config.buffer_dtype is not None
             else self._orig_buffer_dtype
         )
         cls.assertEqual(inp.dtype, expected_param_type)
@@ -193,7 +207,7 @@ def forward(self, tup):
                     if mp_config.param_dtype is not None:
                         cls.assertEqual(0, param._mp_shard.storage().size())
                     else:
-                        cls.assertFalse(hasattr(param, '_mp_shard'))
+                        cls.assertFalse(hasattr(param, "_mp_shard"))
                 elif param_is_sharded:
                     # This FSDP unit is not active as full param has been
                     # freed or not yet allocated. Ensure param points to full
@@ -219,7 +233,9 @@ def world_size(self):
     def _get_simple_nested_model(self, param_dtype, *fsdp_args, **fsdp_kwargs):
         model = FSDP(
             nn.Sequential(
-                FSDP(LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs),
+                FSDP(
+                    LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs
+                ),
                 LinearMixedPrecision(param_dtype).cuda(),
             ),
             *fsdp_args,
@@ -228,7 +244,9 @@ def _get_simple_nested_model(self, param_dtype, *fsdp_args, **fsdp_kwargs):
         return model
 
     def _get_simple_model(self, param_dtype, *fsdp_args, **fsdp_kwargs):
-        model = FSDP(LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs)
+        model = FSDP(
+            LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs
+        )
         return model
 
     def _validate_no_mp_shard(self, fsdp_model):
@@ -239,7 +257,7 @@ def _validate_no_mp_shard(self, fsdp_model):
         fsdp_units = FSDP.fsdp_modules(fsdp_model)
         for fsdp in fsdp_units:
             for param in fsdp.params:
-                self.assertFalse(hasattr(param, '_mp_shard'))
+                self.assertFalse(hasattr(param, "_mp_shard"))
 
     def _validate_mp_shard_freed(self, fsdp_model):
         """
@@ -251,11 +269,7 @@ def _validate_mp_shard_freed(self, fsdp_model):
                 self.assertEqual(0, param._mp_shard.storage().size())
 
     def _reduce_scatter_validate_mp(
-        self,
-        orig_reduce_scatter,
-        mp_config,
-        *args,
-        **kwargs
+        self, orig_reduce_scatter, mp_config, *args, **kwargs
     ):
         """
         Runs reduce-scatter but verifies mixed precision settings before. This
@@ -278,9 +292,11 @@ def _reduce_scatter_validate_mp(
         # If reduce_dtype is not specified (is None) we comm. in the param_dtype
         # if that is specified, otherwise full precision dtype.
         expected_dtype = (
-            mp_config.reduce_dtype if mp_config.reduce_dtype is not None
+            mp_config.reduce_dtype
+            if mp_config.reduce_dtype is not None
             else (
-                mp_config.param_dtype if mp_config.param_dtype is not None
+                mp_config.param_dtype
+                if mp_config.param_dtype is not None
                 else _CURRENT_FULL_PRECISION_PARAM_DTYPE
             )
         )
@@ -357,14 +373,18 @@ def _run_test_mixed_precision_e2e(
             # Patch reduce_scatter to add validation for mixed precision types.
             orig_reduce_scatter = dist.reduce_scatter_tensor
             test_reduce_scatter = partial(
-                self._reduce_scatter_validate_mp, orig_reduce_scatter, mp_config,
+                self._reduce_scatter_validate_mp,
+                orig_reduce_scatter,
+                mp_config,
             )
             with patch_reduce_scatter(test_reduce_scatter, full_precision_param_dtype):
                 scaler = ShardedGradScaler(enabled=enable_sharded_grad_scaler)
                 optim = torch.optim.Adam(model.parameters())
 
                 for _ in range(3):
-                    inp = torch.randn(3, 10, device='cuda', dtype=full_precision_param_dtype)
+                    inp = torch.randn(
+                        3, 10, device="cuda", dtype=full_precision_param_dtype
+                    )
                     # Forward pass of LinearMixedPrecision check casting of
                     # inputs, params, buffers.
                     act, *_ = model(
@@ -409,7 +429,9 @@ def _run_test_mixed_precision_e2e(
                     for param in model.parameters():
                         self.assertEqual(param.dtype, full_precision_param_dtype)
                         if param.grad is not None:
-                            self.assertEqual(param.grad.dtype, full_precision_param_dtype)
+                            self.assertEqual(
+                                param.grad.dtype, full_precision_param_dtype
+                            )
 
                     # Unscale the gradients and step
                     scaler.step(optim)
@@ -448,8 +470,9 @@ def _run_test_mixed_precision_e2e(
                             self.assertEqual(tensor.dtype, _BUFFER_ORIG_DTYPE)
                         else:
                             self.assertEqual(
-                                tensor.dtype, full_precision_param_dtype,
-                                f"{name}: {tensor.dtype} vs {full_precision_param_dtype}"
+                                tensor.dtype,
+                                full_precision_param_dtype,
+                                f"{name}: {tensor.dtype} vs {full_precision_param_dtype}",
                             )
 
                     # After state_dict, buffer's dtype should have been restored
@@ -475,7 +498,7 @@ def _get_subtest_config(self) -> Dict[str, List[Any]]:
                 None,
                 BackwardPrefetch.BACKWARD_PRE,
                 BackwardPrefetch.BACKWARD_POST,
-            ]
+            ],
         }
 
     @skip_if_lt_x_gpu(2)
@@ -518,7 +541,9 @@ def _test_mixed_precision_embedding_table(self, mp_config):
         param_dtype = mp_config.param_dtype or torch.float32
         orig_reduce_scatter = dist.reduce_scatter_tensor
         test_reduce_scatter = partial(
-            self._reduce_scatter_validate_mp, orig_reduce_scatter, mp_config,
+            self._reduce_scatter_validate_mp,
+            orig_reduce_scatter,
+            mp_config,
         )
         with patch_reduce_scatter(test_reduce_scatter, param_dtype):
             # TODO: `test_mp_embedding_reduce()` fails if we do not wrap the
@@ -570,9 +595,11 @@ def test_mp_embedding_params_and_reduce_diff(self):
         params_and_reduce_different = MixedPrecision(
             param_dtype=torch.float16,
             reduce_dtype=torch.float32,
-            buffer_dtype=torch.float16
+            buffer_dtype=torch.float16,
+        )
+        self._test_mixed_precision_embedding_table(
+            mp_config=params_and_reduce_different
         )
-        self._test_mixed_precision_embedding_table(mp_config=params_and_reduce_different)
 
     @skip_if_lt_x_gpu(2)
     @skipIfNoTorchVision
@@ -583,11 +610,12 @@ def test_mixed_precision_resnet(self):
         """
         resnet_model = torchvision.models.resnet50().cuda()
         resnet_model = nn.SyncBatchNorm.convert_sync_batchnorm(
-            resnet_model,
-            process_group=dist.distributed_c10d._get_default_group()
+            resnet_model, process_group=dist.distributed_c10d._get_default_group()
         )
-        n_bn = sum(1 if isinstance(x, _BatchNorm) else 0 for x in resnet_model.modules())
-        inp = torch.ones(1, 3, 1000, 1000, device='cuda')
+        n_bn = sum(
+            1 if isinstance(x, _BatchNorm) else 0 for x in resnet_model.modules()
+        )
+        inp = torch.ones(1, 3, 1000, 1000, device="cuda")
         mp_config = MixedPrecision(
             param_dtype=torch.float16,
             reduce_dtype=torch.float16,
@@ -596,7 +624,7 @@ def test_mixed_precision_resnet(self):
         fsdp = FSDP(
             resnet_model,
             auto_wrap_policy=size_based_auto_wrap_policy,
-            mixed_precision=mp_config
+            mixed_precision=mp_config,
         )
         # Batchnorm units should be wrapped individually. Validate this by
         # ensuring there are equal no. of FSDP units that are BN as BN units
@@ -652,7 +680,7 @@ def never_wrap_policy(*args, **kwargs):
         )
         with self.assertWarnsRegex(
             expected_warning=UserWarning,
-            expected_regex="batch norm submodules will be wrapped as separate"
+            expected_regex="batch norm submodules will be wrapped as separate",
         ):
             model = FSDP(
                 net,
@@ -669,7 +697,7 @@ def never_wrap_policy(*args, **kwargs):
         self.assertEqual(no_mixed_precision, bn.mixed_precision)
         self.assertNotEqual(no_mixed_precision, model.mixed_precision)
 
-        inp = torch.randn((1, 2), device='cuda')
+        inp = torch.randn((1, 2), device="cuda")
         # Without FSDP BN mixed precision fix, this would result in
         # RuntimeError: Expected counts to have type Half but got Float
         # for syncBN
@@ -680,6 +708,7 @@ class TestFSDPMixedPrecisionUnsharded(TestFSDPMixedPrecision):
     """
     Smaller test suite for unshared param (i.e. world_size == 1) case.
     """
+
     @property
     def world_size(self):
         return 1
@@ -719,6 +748,7 @@ def test_mixed_precision_e2e_full_shard(self):
             enable_sharded_grad_scaler=False,
         )
 
+
 instantiate_parametrized_tests(TestFSDPMixedPrecisionSharded)
 
 if __name__ == "__main__":
diff --git a/test/distributed/fsdp/test_fsdp_multiple_forward.py b/test/distributed/fsdp/test_fsdp_multiple_forward.py
index c9afbd465f28e..7823f9349a005 100644
--- a/test/distributed/fsdp/test_fsdp_multiple_forward.py
+++ b/test/distributed/fsdp/test_fsdp_multiple_forward.py
@@ -9,12 +9,8 @@
 from torch.nn.parallel import DistributedDataParallel
 from torch.optim import SGD
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import (
-    FSDPTest,
-    get_full_params,
-)
-from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN, run_tests
-
+from torch.testing._internal.common_fsdp import FSDPTest, get_full_params
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
diff --git a/test/distributed/fsdp/test_fsdp_multiple_wrapping.py b/test/distributed/fsdp/test_fsdp_multiple_wrapping.py
index 0a3b9e2e2e068..58298fcce26ff 100644
--- a/test/distributed/fsdp/test_fsdp_multiple_wrapping.py
+++ b/test/distributed/fsdp/test_fsdp_multiple_wrapping.py
@@ -9,8 +9,7 @@
 from torch.optim import SGD
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
-from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN, run_tests
-
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
diff --git a/test/distributed/fsdp/test_fsdp_optim_state.py b/test/distributed/fsdp/test_fsdp_optim_state.py
index e4199ad532a6b..5fe75ee309fa5 100644
--- a/test/distributed/fsdp/test_fsdp_optim_state.py
+++ b/test/distributed/fsdp/test_fsdp_optim_state.py
@@ -2,7 +2,7 @@
 
 import bisect
 import sys
-from enum import Enum, auto
+from enum import auto, Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 
 import torch
@@ -25,15 +25,13 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
-STATE_DICT_TYPE = [
-    StateDictType.FULL_STATE_DICT, StateDictType.SHARDED_STATE_DICT
-]
+STATE_DICT_TYPE = [StateDictType.FULL_STATE_DICT, StateDictType.SHARDED_STATE_DICT]
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
@@ -49,6 +47,7 @@
 
 class _OSDCommMethod(Enum):
     """Method for communicating the optimizer state dict for internal tests."""
+
     BROADCAST_OBJECT_LIST = auto()
     SCATTER_FULL_OSD = auto()
     FLATTEN_SHARDED_OSD = auto()
@@ -56,12 +55,14 @@ class _OSDCommMethod(Enum):
 
 class _ModelClass(Enum):
     """Different model type to test."""
+
     NESTED = auto()
     TRANSFORMER = auto()
 
 
 class Bias(torch.nn.Module):
     """This module applies a 1D additive bias with dimension ``dim``."""
+
     def __init__(self, dim: int) -> None:
         super().__init__()
         assert dim > 0
@@ -82,6 +83,7 @@ class BlockA(torch.nn.Module):
         Bias1
             bias
     """
+
     def __init__(self, in_dim: int, out_dim: int) -> None:
         super().__init__()
         assert all(v > 0 for v in (in_dim, out_dim))
@@ -98,6 +100,7 @@ def forward(self, x):
         x = self.bias_module1(x)
         return x
 
+
 class BlockB(torch.nn.Module):
     """
     Used to define interesting nested structure for FSDP wrapping.
@@ -108,6 +111,7 @@ class BlockB(torch.nn.Module):
         Bias
             bias
     """
+
     def __init__(self, in_dim: int, out_dim: int) -> None:
         super().__init__()
         assert all(v > 0 for v in (in_dim, out_dim))
@@ -166,21 +170,30 @@ def wrap(
             fsdp_kwargs = {}
         # Flatten Bias0; then flatten weight and Bias1 together into `block1`
         model.block1.bias_module0 = FSDP(
-            model.block1.bias_module0, process_group=group, **fsdp_kwargs,
+            model.block1.bias_module0,
+            process_group=group,
+            **fsdp_kwargs,
         )
         model.block1 = FSDP(model.block1, process_group=group, **fsdp_kwargs)
         # Flatten Bias0; flatten Bias1; then flatten weight into `block2[1]`
         model.block2[1].bias_module0 = FSDP(
-            model.block2[1].bias_module0, process_group=group, **fsdp_kwargs,
+            model.block2[1].bias_module0,
+            process_group=group,
+            **fsdp_kwargs,
         )
         model.block2[1].bias_module1 = FSDP(
-            model.block2[1].bias_module1, process_group=group, **fsdp_kwargs,
+            model.block2[1].bias_module1,
+            process_group=group,
+            **fsdp_kwargs,
         )
         model.block2[1] = FSDP(model.block2[1], process_group=group, **fsdp_kwargs)
         # Flatten weight, Bias, bias into `block2[2]`
         ignored_modules = [model.block2[2].bias_module0] if ignore_modules else None
         model.block2[2] = FSDP(
-            model.block2[2], process_group=group, ignored_modules=ignored_modules, **fsdp_kwargs,
+            model.block2[2],
+            process_group=group,
+            ignored_modules=ignored_modules,
+            **fsdp_kwargs,
         )
         return model
 
@@ -193,7 +206,9 @@ def wrap_alt(
         if fsdp_kwargs is None:
             fsdp_kwargs = {}
         model.block0.bias_module0 = FSDP(
-            model.block0.bias_module0, process_group=group, **fsdp_kwargs,
+            model.block0.bias_module0,
+            process_group=group,
+            **fsdp_kwargs,
         )
         model.block0 = FSDP(model.block0, process_group=group, **fsdp_kwargs)
         return model
@@ -211,7 +226,8 @@ def wrap_with_unmanaged_params(
         # (`model.block2[2]`) or a module not to be wrapped with FSDP (`model`)
         register_module = model.block2[2] if add_to_fsdp_module else model
         register_module.register_parameter(
-            "unmanaged_param", unmanaged_param,
+            "unmanaged_param",
+            unmanaged_param,
         )
         # For simplicity, we only add a single unmanaged parameter, but should
         # be easy to generalize if needed
@@ -256,8 +272,7 @@ def param_group0(self) -> List[torch.nn.Parameter]:
     def param_group1(self) -> List[torch.nn.Parameter]:
         # Deviate from the `model.parameters()` order further by rearranging
         # `block2`'s parameters to be before `block0`'s parameters
-        return list(self.block2.parameters()) + \
-            list(self.block0.parameters())
+        return list(self.block2.parameters()) + list(self.block0.parameters())
 
 
 class TestFSDPOptimState(FSDPTest):
@@ -281,14 +296,17 @@ def _init_nested_model(
     ):
         model = NestedModel().to(device)
         if wrap:
-            model = NestedModel.wrap_alt(model, group, fsdp_kwargs) if wrap_alt \
+            model = (
+                NestedModel.wrap_alt(model, group, fsdp_kwargs)
+                if wrap_alt
                 else NestedModel.wrap(model, group, fsdp_kwargs=fsdp_kwargs)
+            )
         if not use_multiple_param_groups:
             optim_input = list(model.parameters())
         else:
             optim_input = [
                 {"params": model.param_group0()},
-                {"params": model.param_group1(), "weight_decay": 0.9}
+                {"params": model.param_group1(), "weight_decay": 0.9},
             ]
         # Use a reversed parameter order for the optimizer input on odd ranks
         if use_diff_optim_inputs and self.rank % 2 == 1:
@@ -353,7 +371,9 @@ def _broadcast_full_osd(self, full_osd: Dict[str, Any], group=None):
         ``torch.save()`` and ``torch.load()`` so that all ranks can have it."""
         obj_list = [full_osd]
         dist.broadcast_object_list(
-            obj_list, src=0, group=group,
+            obj_list,
+            src=0,
+            group=group,
         )
         full_osd = obj_list[0]
         return full_osd
@@ -375,8 +395,9 @@ def _are_equal_states(
                 # Check the values on CPU to be device-agnostic
                 value1 = value1.cpu()
                 value2 = value2.cpu()
-                if value1.shape != value2.shape or \
-                        not torch.all(torch.isclose(value1, value2)):
+                if value1.shape != value2.shape or not torch.all(
+                    torch.isclose(value1, value2)
+                ):
                     return False
             else:  # non-tensor state
                 if value1 != value2:
@@ -422,10 +443,12 @@ def _check_same_state(
             # Check for at least one match (may be > 1 in toy edge cases, e.g.
             # multiple biases); nonetheless, each having >= 1 match and the two
             # lists having equal length imply that the list contents are equal
-            self.assertTrue(any(
-                self._are_equal_states(fsdp_osd_state, ref_osd_state)
-                for ref_osd_state in ref_osd_states
-            ))
+            self.assertTrue(
+                any(
+                    self._are_equal_states(fsdp_osd_state, ref_osd_state)
+                    for ref_osd_state in ref_osd_states
+                )
+            )
 
     def _check_same_param_groups(
         self,
@@ -443,10 +466,12 @@ def _check_same_param_groups(
         full_osd_param_groups = full_osd["param_groups"]
         self.assertTrue(len(full_osd_param_groups), len(ref_osd_param_groups))
         for full_osd_pg, ref_osd_pg in zip(
-            full_osd_param_groups, ref_osd_param_groups,
+            full_osd_param_groups,
+            ref_osd_param_groups,
         ):
             self.assertEqual(
-                set(full_osd_pg.keys()), set(ref_osd_pg.keys()),
+                set(full_osd_pg.keys()),
+                set(ref_osd_pg.keys()),
             )
             for name, full_osd_value in full_osd_pg.items():
                 if name == "params" and not check_same_param_keys:
@@ -508,18 +533,24 @@ def _test_optim_state_dict_nested(
             return  # not supported
         NUM_ITERS = 3
         model1, optim1, optim_input = self._init_nested_model(
-            wrap=True, use_multiple_param_groups=use_multiple_param_groups,
+            wrap=True,
+            use_multiple_param_groups=use_multiple_param_groups,
             use_diff_optim_inputs=use_diff_optim_inputs,
         )
         losses1 = self._step_model(model1, optim1, num_iters=NUM_ITERS)
         if state_dict_type == StateDictType.FULL_STATE_DICT:
             if use_optim_input:
                 fsdp_osd = FSDP.full_optim_state_dict(
-                    model1, optim1, optim_input, rank0_only=rank0_only,
+                    model1,
+                    optim1,
+                    optim_input,
+                    rank0_only=rank0_only,
                 )
             else:
                 fsdp_osd = FSDP.full_optim_state_dict(
-                    model1, optim1, rank0_only=rank0_only,
+                    model1,
+                    optim1,
+                    rank0_only=rank0_only,
                 )
         else:
             if use_optim_input:
@@ -531,7 +562,8 @@ def _test_optim_state_dict_nested(
             self.assertEqual(len(fsdp_osd), 0)
             return
         model2, optim2, _ = self._init_nested_model(
-            wrap=False, use_multiple_param_groups=use_multiple_param_groups,
+            wrap=False,
+            use_multiple_param_groups=use_multiple_param_groups,
             use_diff_optim_inputs=use_diff_optim_inputs,
         )
         losses2 = self._step_model(model2, optim2, num_iters=NUM_ITERS)
@@ -544,10 +576,14 @@ def _test_optim_state_dict_nested(
         # parameter IDs
         check_same_param_keys = False
         self._check_same_param_groups(
-            fsdp_osd, ref_osd, check_same_param_keys=check_same_param_keys,
+            fsdp_osd,
+            ref_osd,
+            check_same_param_keys=check_same_param_keys,
         )
         self._check_same_state(
-            fsdp_osd, ref_osd, check_same_param_keys=check_same_param_keys,
+            fsdp_osd,
+            ref_osd,
+            check_same_param_keys=check_same_param_keys,
         )
 
     @skip_if_lt_x_gpu(2)
@@ -562,12 +598,13 @@ def test_full_optim_state_dict_keys(self):
         # Add checkpointing to ensure optim_state_dict and state_dict strip out
         # checkpointing prefixes.
         apply_activation_checkpointing(
-            model,
-            check_fn=lambda module: isinstance(module, torch.nn.Sequential)
+            model, check_fn=lambda module: isinstance(module, torch.nn.Sequential)
         )
         optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3)
         self._step_model(model, optim, device)
-        optim_state_dict = FSDP.full_optim_state_dict(wrapped_model, optim, rank0_only=False)
+        optim_state_dict = FSDP.full_optim_state_dict(
+            wrapped_model, optim, rank0_only=False
+        )
         with FSDP.state_dict_type(wrapped_model, StateDictType.FULL_STATE_DICT):
             state_dict = wrapped_model.state_dict()
         self.assertEqual(optim_state_dict["state"].keys(), state_dict.keys())
@@ -771,11 +808,13 @@ def _test_load_optim_state(
 
         # First, run a wrapped model with full world size for a few iterations
         model1, optim1, optim_input1 = initializer(
-            wrap=True, use_multiple_param_groups=use_multiple_param_groups,
+            wrap=True,
+            use_multiple_param_groups=use_multiple_param_groups,
         )
         self._step_model(model1, optim1, num_iters=NUM_ITERS)
         fsdp_osd1 = (
-            osd_method(model1, optim1, optim_input1) if use_optim_input
+            osd_method(model1, optim1, optim_input1)
+            if use_optim_input
             else osd_method(model1, optim1)
         )
         if halve_world_size:
@@ -790,7 +829,8 @@ def _test_load_optim_state(
         # Second, run a wrapped model with (possibly) halved world size and
         # (possibly) differing `optim_input` across ranks
         model2, optim2, optim_input2 = initializer(
-            wrap=True, group=new_group,
+            wrap=True,
+            group=new_group,
             use_multiple_param_groups=use_multiple_param_groups,
             use_diff_optim_inputs=use_diff_optim_inputs,
             **new_model_kwargs,  # specify `wrap_alt` to change wrapping
@@ -807,13 +847,17 @@ def _test_load_optim_state(
         if osd_comm_method == _OSDCommMethod.BROADCAST_OBJECT_LIST:
             fsdp_osd1 = self._broadcast_full_osd(fsdp_osd1, group=new_group)
             sharded_osd1 = (
-                FSDP.shard_full_optim_state_dict(fsdp_osd1, model2, optim_input=optim_input2)
+                FSDP.shard_full_optim_state_dict(
+                    fsdp_osd1, model2, optim_input=optim_input2
+                )
                 if use_optim_input
                 else FSDP.shard_full_optim_state_dict(fsdp_osd1, model2, optim=optim2)
             )
             fsdp_osd2 = self._broadcast_full_osd(fsdp_osd2, group=new_group)
             sharded_osd2 = (
-                FSDP.shard_full_optim_state_dict(fsdp_osd2, model2, optim_input=optim_input2)
+                FSDP.shard_full_optim_state_dict(
+                    fsdp_osd2, model2, optim_input=optim_input2
+                )
                 if use_optim_input
                 else FSDP.shard_full_optim_state_dict(fsdp_osd2, model2, optim=optim2)
             )
@@ -824,7 +868,8 @@ def _test_load_optim_state(
                     model2,
                     optim_input=optim_input2,
                     group=new_group,
-                ) if use_optim_input
+                )
+                if use_optim_input
                 else FSDP.scatter_full_optim_state_dict(
                     fsdp_osd1 if self.rank == 0 else None,
                     model2,
@@ -838,7 +883,8 @@ def _test_load_optim_state(
                     model2,
                     optim_input=optim_input2,
                     group=new_group,
-                ) if use_optim_input
+                )
+                if use_optim_input
                 else FSDP.scatter_full_optim_state_dict(
                     fsdp_osd2 if self.rank == 0 else None,
                     model2,
@@ -851,18 +897,28 @@ def _test_load_optim_state(
         elif osd_comm_method == _OSDCommMethod.FLATTEN_SHARDED_OSD:
             sharded_osd1 = (
                 FSDP.flatten_sharded_optim_state_dict(
-                    fsdp_osd1, model2, optim_input=optim_input2,
-                ) if use_optim_input
+                    fsdp_osd1,
+                    model2,
+                    optim_input=optim_input2,
+                )
+                if use_optim_input
                 else FSDP.flatten_sharded_optim_state_dict(
-                    fsdp_osd1, model2, optim=optim2,
+                    fsdp_osd1,
+                    model2,
+                    optim=optim2,
                 )
             )
             sharded_osd2 = (
                 FSDP.flatten_sharded_optim_state_dict(
-                    fsdp_osd2, model2, optim_input=optim_input2,
-                ) if use_optim_input
+                    fsdp_osd2,
+                    model2,
+                    optim_input=optim_input2,
+                )
+                if use_optim_input
                 else FSDP.flatten_sharded_optim_state_dict(
-                    fsdp_osd2, model2, optim=optim2,
+                    fsdp_osd2,
+                    model2,
+                    optim=optim2,
                 )
             )
 
@@ -872,22 +928,26 @@ def _test_load_optim_state(
         local_osd2 = optim2.state_dict()
         check_same_param_keys = True  # should all have matching parameter IDs
         self._check_same_param_groups(
-            sharded_osd2, local_osd2,
+            sharded_osd2,
+            local_osd2,
             check_same_param_keys=check_same_param_keys,
         )
         self._check_same_state(
-            sharded_osd2, local_osd2,
+            sharded_osd2,
+            local_osd2,
             check_same_param_keys=check_same_param_keys,
         )
         # Check that sharding the first model's full/sharded optimizer state dict
         # according to the second model is equivalent to the second model's
         # local optimizer state dict
         self._check_same_param_groups(
-            sharded_osd1, local_osd2,
+            sharded_osd1,
+            local_osd2,
             check_same_param_keys=check_same_param_keys,
         )
         self._check_same_state(
-            sharded_osd1, local_osd2,
+            sharded_osd1,
+            local_osd2,
             check_same_param_keys=check_same_param_keys,
         )
         # As a sanity check, check that we can load and run a few iterations
@@ -955,7 +1015,8 @@ def _test_shard_full_optim_state_dict_unmanaged_params(
         device = torch.device("cuda")
         model = NestedModel().to(device)
         model, unmanaged_params = NestedModel.wrap_with_unmanaged_params(
-            model, add_to_fsdp_module,
+            model,
+            add_to_fsdp_module,
         )
         optim_input = list(model.parameters())
         optim = torch.optim.Adam(optim_input, lr=1e-3)
@@ -965,21 +1026,31 @@ def _test_shard_full_optim_state_dict_unmanaged_params(
             # unflattened parameters with zero-dimensional tensor state (i.e.
             # Adam "step") and others without (i.e. the unmanaged parameters),
             # which triggers an error that we have to ensure correctness
-            error_prefix = "^(All unflattened parameters comprising a " \
-                "single flattened parameter must have scalar state with the " \
+            error_prefix = (
+                "^(All unflattened parameters comprising a "
+                "single flattened parameter must have scalar state with the "
                 "same value and dtype)"
+            )
             with self.assertRaisesRegex(ValueError, error_prefix):
                 if state_dict_type == StateDictType.FULL_STATE_DICT:
                     (
-                        FSDP.shard_full_optim_state_dict(fsdp_osd, model, optim_input=optim_input)
+                        FSDP.shard_full_optim_state_dict(
+                            fsdp_osd, model, optim_input=optim_input
+                        )
                         if use_optim_input
-                        else FSDP.shard_full_optim_state_dict(fsdp_osd, model, optim=optim)
+                        else FSDP.shard_full_optim_state_dict(
+                            fsdp_osd, model, optim=optim
+                        )
                     )
                 else:
                     (
-                        FSDP.flatten_sharded_optim_state_dict(fsdp_osd, model, optim_input=optim_input)
+                        FSDP.flatten_sharded_optim_state_dict(
+                            fsdp_osd, model, optim_input=optim_input
+                        )
                         if use_optim_input
-                        else FSDP.flatten_sharded_optim_state_dict(fsdp_osd, model, optim=optim)
+                        else FSDP.flatten_sharded_optim_state_dict(
+                            fsdp_osd, model, optim=optim
+                        )
                     )
         else:
             # If we add the unmanaged parameters to a module not wrapped with
@@ -988,20 +1059,28 @@ def _test_shard_full_optim_state_dict_unmanaged_params(
             # externally to FSDP
             if state_dict_type == StateDictType.FULL_STATE_DICT:
                 flattened_osd = (
-                    FSDP.shard_full_optim_state_dict(fsdp_osd, model, optim_input=optim_input)
+                    FSDP.shard_full_optim_state_dict(
+                        fsdp_osd, model, optim_input=optim_input
+                    )
                     if use_optim_input
                     else FSDP.shard_full_optim_state_dict(fsdp_osd, model, optim=optim)
                 )
             else:
                 flattened_osd = (
-                    FSDP.flatten_sharded_optim_state_dict(fsdp_osd, model, optim_input=optim_input)
+                    FSDP.flatten_sharded_optim_state_dict(
+                        fsdp_osd, model, optim_input=optim_input
+                    )
                     if use_optim_input
-                    else FSDP.flatten_sharded_optim_state_dict(fsdp_osd, model, optim=optim)
+                    else FSDP.flatten_sharded_optim_state_dict(
+                        fsdp_osd, model, optim=optim
+                    )
                 )
             # Add entries for the unmanaged parameters to be able to load
             for unmanaged_param in unmanaged_params:
                 NestedModel.add_unmanaged_param_entry(
-                    flattened_osd, unmanaged_param, NUM_ITERS,
+                    flattened_osd,
+                    unmanaged_param,
+                    NUM_ITERS,
                 )
             # Check that we can load the optimizer state dict
             optim.load_state_dict(flattened_osd)
@@ -1035,7 +1114,8 @@ def _test_rekey_optim_state_dict_to_ids(
         NUM_ITERS = 3
         # Run a wrapped model for a few iterations
         model1, optim1, optim_input1 = self._init_nested_model(
-            wrap=True, use_multiple_param_groups=use_multiple_param_groups,
+            wrap=True,
+            use_multiple_param_groups=use_multiple_param_groups,
         )
         self._step_model(model1, optim1, num_iters=NUM_ITERS)
         if state_dict_type == StateDictType.FULL_STATE_DICT:
@@ -1055,28 +1135,39 @@ def _test_rekey_optim_state_dict_to_ids(
             )
         # Run a non-wrapped model for a few iterations
         model2, optim2, optim_input2 = self._init_nested_model(
-            wrap=False, use_multiple_param_groups=use_multiple_param_groups,
+            wrap=False,
+            use_multiple_param_groups=use_multiple_param_groups,
         )
         self._step_model(model2, optim2, num_iters=NUM_ITERS)
         # Re-key the wrapped model's optimizer state dict using parameter IDs
         # according to the non-wrapped model
         rekeyed_osd = (
             FSDP.rekey_optim_state_dict(
-                fsdp_osd, OptimStateKeyType.PARAM_ID, model2, optim_input=optim_input2,
+                fsdp_osd,
+                OptimStateKeyType.PARAM_ID,
+                model2,
+                optim_input=optim_input2,
             )
             if use_optim_input
             else FSDP.rekey_optim_state_dict(
-                fsdp_osd, OptimStateKeyType.PARAM_ID, model2, optim=optim2,
+                fsdp_osd,
+                OptimStateKeyType.PARAM_ID,
+                model2,
+                optim=optim2,
             )
         )
         # Check that the re-keyed dict and actual dict are the same
         osd = optim2.state_dict()
         check_same_param_keys = True
         self._check_same_param_groups(
-            rekeyed_osd, osd, check_same_param_keys=check_same_param_keys,
+            rekeyed_osd,
+            osd,
+            check_same_param_keys=check_same_param_keys,
         )
         self._check_same_state(
-            rekeyed_osd, osd, check_same_param_keys=check_same_param_keys,
+            rekeyed_osd,
+            osd,
+            check_same_param_keys=check_same_param_keys,
         )
         # As a sanity check, check that we can load and run a few iterations
         if state_dict_type != StateDictType.SHARDED_STATE_DICT:
@@ -1106,12 +1197,14 @@ def _test_rekey_optim_state_dict_to_names(
         NUM_ITERS = 3
         # Run a wrapped model for a few iterations
         model1, optim1, optim_input1 = self._init_nested_model(
-            wrap=True, use_multiple_param_groups=use_multiple_param_groups,
+            wrap=True,
+            use_multiple_param_groups=use_multiple_param_groups,
         )
         self._step_model(model1, optim1, num_iters=NUM_ITERS)
         # Run a non-wrapped model for a few iterations
         model2, optim2, optim_input2 = self._init_nested_model(
-            wrap=False, use_multiple_param_groups=use_multiple_param_groups,
+            wrap=False,
+            use_multiple_param_groups=use_multiple_param_groups,
         )
         self._step_model(model2, optim2, num_iters=NUM_ITERS)
         # Re-key the non-wrapped model's optimizer state dict using parameter
@@ -1119,20 +1212,32 @@ def _test_rekey_optim_state_dict_to_names(
         osd2 = optim2.state_dict()
         rekeyed_osd = (
             FSDP.rekey_optim_state_dict(
-                osd2, OptimStateKeyType.PARAM_NAME, model2, optim_input=optim_input2,
-            ) if use_optim_input
+                osd2,
+                OptimStateKeyType.PARAM_NAME,
+                model2,
+                optim_input=optim_input2,
+            )
+            if use_optim_input
             else FSDP.rekey_optim_state_dict(
-                osd2, OptimStateKeyType.PARAM_NAME, model2, optim=optim2,
+                osd2,
+                OptimStateKeyType.PARAM_NAME,
+                model2,
+                optim=optim2,
             )
         )
         # Shard the non-wrapped model's re-keyed optimizer state dict, which
         # maps back to (flattened) parameter IDs
         sharded_osd = (
             FSDP.shard_full_optim_state_dict(
-                rekeyed_osd, model1, optim_input=optim_input1,
-            ) if use_optim_input
+                rekeyed_osd,
+                model1,
+                optim_input=optim_input1,
+            )
+            if use_optim_input
             else FSDP.shard_full_optim_state_dict(
-                rekeyed_osd, model1, optim=optim1,
+                rekeyed_osd,
+                model1,
+                optim=optim1,
             )
         )
         # Check that this sharded optimizer state dict matches the wrapped
@@ -1140,10 +1245,14 @@ def _test_rekey_optim_state_dict_to_names(
         osd1 = optim1.state_dict()
         check_same_param_keys = True
         self._check_same_param_groups(
-            sharded_osd, osd1, check_same_param_keys=check_same_param_keys,
+            sharded_osd,
+            osd1,
+            check_same_param_keys=check_same_param_keys,
         )
         self._check_same_state(
-            sharded_osd, osd1, check_same_param_keys=check_same_param_keys,
+            sharded_osd,
+            osd1,
+            check_same_param_keys=check_same_param_keys,
         )
         # As a sanity check, check that we can load and run a few iterations
         optim1.load_state_dict(sharded_osd)
@@ -1153,6 +1262,7 @@ def _test_rekey_optim_state_dict_to_names(
     def test_optim_input_warning(self):
         """Tests that passing the ``optim_input`` argument into optimizer state
         checkpointing APIs issues a warning."""
+
         def should_check_method(method_name: str):
             # Check every method since they all accept `optim_input`
             return True
@@ -1163,12 +1273,15 @@ def get_warning_context():
                 expected_warning=UserWarning, expected_regex=warning_regex
             )
 
-        self._run_on_all_optim_state_apis(should_check_method, get_warning_context, fsdp_kwargs=None)
+        self._run_on_all_optim_state_apis(
+            should_check_method, get_warning_context, fsdp_kwargs=None
+        )
 
     @skip_if_lt_x_gpu(2)
     def test_use_orig_params_error(self):
         """Tests that the optimizer state checkpointing APIs raise an error
         when ``use_orig_params=True``."""
+
         def should_check_method(method_name: str):
             # Skip `rekey_optim_state_dict` since that does not depend on
             # `use_orig_params=True`
@@ -1181,7 +1294,9 @@ def get_error_context():
             )
 
         fsdp_kwargs = {"use_orig_params": True}
-        self._run_on_all_optim_state_apis(should_check_method, get_error_context, fsdp_kwargs)
+        self._run_on_all_optim_state_apis(
+            should_check_method, get_error_context, fsdp_kwargs
+        )
 
     def _run_on_all_optim_state_apis(
         self,
@@ -1195,12 +1310,10 @@ def _run_on_all_optim_state_apis(
         via ``should_check_method_fn``, which gets passed the string name of
         the method.
         """
-        wrapped_model, wrapped_optim, wrapped_optim_input = (
-            self._init_nested_model(
-                wrap=True,
-                use_multiple_param_groups=False,
-                fsdp_kwargs=fsdp_kwargs,
-            )
+        wrapped_model, wrapped_optim, wrapped_optim_input = self._init_nested_model(
+            wrap=True,
+            use_multiple_param_groups=False,
+            fsdp_kwargs=fsdp_kwargs,
         )
         self._step_model(wrapped_model, wrapped_optim, num_iters=2)
 
@@ -1208,14 +1321,18 @@ def _run_on_all_optim_state_apis(
         if should_check_method_fn("sharded_optim_state_dict"):
             with context_fn():
                 fsdp_osd = FSDP.sharded_optim_state_dict(
-                    wrapped_model, wrapped_optim, optim_input=wrapped_optim_input,
+                    wrapped_model,
+                    wrapped_optim,
+                    optim_input=wrapped_optim_input,
                 )
         if "fsdp_osd" not in locals():
             fsdp_osd = {}  # may not be defined due to previous method erroring
         if should_check_method_fn("flatten_sharded_optim_state_dict"):
             with context_fn():
                 FSDP.flatten_sharded_optim_state_dict(
-                    fsdp_osd, wrapped_model, optim_input=wrapped_optim_input,
+                    fsdp_osd,
+                    wrapped_model,
+                    optim_input=wrapped_optim_input,
                 )
         # Full optim state dict
         if should_check_method_fn("full_optim_state_dict"):
@@ -1229,17 +1346,23 @@ def _run_on_all_optim_state_apis(
         if should_check_method_fn("shard_full_optim_state_dict"):
             with context_fn():
                 FSDP.shard_full_optim_state_dict(
-                    fsdp_osd, wrapped_model, optim_input=wrapped_optim_input,
+                    fsdp_osd,
+                    wrapped_model,
+                    optim_input=wrapped_optim_input,
                 )
         if should_check_method_fn("scatter_full_optim_state_dict"):
             with context_fn():
                 FSDP.scatter_full_optim_state_dict(
-                    fsdp_osd, wrapped_model, optim_input=wrapped_optim_input,
+                    fsdp_osd,
+                    wrapped_model,
+                    optim_input=wrapped_optim_input,
                 )
         # Rekey optim state dict
-        nonwrapped_model, nonwrapped_optim, nonwrapped_optim_input = (
-            self._init_nested_model(wrap=False, use_multiple_param_groups=False)
-        )
+        (
+            nonwrapped_model,
+            nonwrapped_optim,
+            nonwrapped_optim_input,
+        ) = self._init_nested_model(wrap=False, use_multiple_param_groups=False)
         if should_check_method_fn("rekey_optim_state_dict"):
             with context_fn():
                 rekeyed_osd = FSDP.rekey_optim_state_dict(
diff --git a/test/distributed/fsdp/test_fsdp_overlap.py b/test/distributed/fsdp/test_fsdp_overlap.py
index 07e8eba09c6c2..8bd5354b2b701 100644
--- a/test/distributed/fsdp/test_fsdp_overlap.py
+++ b/test/distributed/fsdp/test_fsdp_overlap.py
@@ -11,16 +11,13 @@
 from torch.cuda import Event
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import (
-    FSDPTest,
-)
+from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     get_cycles_per_ms,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
-
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
diff --git a/test/distributed/fsdp/test_fsdp_pure_fp16.py b/test/distributed/fsdp/test_fsdp_pure_fp16.py
index ed4aef39da0f9..1c663f8263354 100644
--- a/test/distributed/fsdp/test_fsdp_pure_fp16.py
+++ b/test/distributed/fsdp/test_fsdp_pure_fp16.py
@@ -12,10 +12,10 @@
     NestedWrappedModule,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -31,7 +31,6 @@
 
 
 class TestPureFP16(FSDPTest):
-
     @property
     def world_size(self):
         # Test fails due to inaccuracies when using more than 5 GPUs
diff --git a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
index 1c230cb7400c4..2124e6b0450f5 100644
--- a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
+++ b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
@@ -22,11 +22,11 @@
     subtest_name,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
-    TestCase,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
+    TestCase,
 )
 
 if not dist.is_available():
@@ -47,21 +47,23 @@
 sharding_strategy_config = [ShardingStrategy.SHARD_GRAD_OP, None]
 mixed_precision = ["enable_mixed_precision", None]
 
-configs = list(itertools.product(cpu_offload_config,
-                                 sharding_strategy_config,
-                                 mixed_precision))
+configs = list(
+    itertools.product(cpu_offload_config, sharding_strategy_config, mixed_precision)
+)
 test_name_mapping = {
     str(CPUOffload(offload_params=True)): "offload_true",
     str(CPUOffload(offload_params=False)): "offload_false",
     str(ShardingStrategy.SHARD_GRAD_OP): "shard_grad_op",
-    "enable_mixed_precision": "mixed_precision"
+    "enable_mixed_precision": "mixed_precision",
 }
 
 subtest_name = functools.partial(subtest_name, test_name_mapping)
 
 
 class TestShardGradScaler(TestCase):
-    @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found")
+    @unittest.skipIf(
+        amp_definitely_not_available(), "no supported device (cuda, xla) found"
+    )
     def test_grad_scaling(self):
         pg = DummyProcessGroup(0, 1)
         scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True)
@@ -69,21 +71,26 @@ def test_grad_scaling(self):
         t1 = torch.full((1,), 8.0, dtype=torch.float32, device="cpu")
         outputs = [t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), t1.clone()]]
         outputs = scaler.scale(outputs)
-        self.assertTrue(outputs[0] == 16.0 and outputs[1][0] == 8.0 and outputs[1][1] == 16.0)
+        self.assertTrue(
+            outputs[0] == 16.0 and outputs[1][0] == 8.0 and outputs[1][1] == 16.0
+        )
         self.assertTrue(outputs[2][0] == 8.0 and outputs[2][1] == 16.0)
         self.assertTrue(scaler._scale.device == t1.device)
 
-    @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found")
+    @unittest.skipIf(
+        amp_definitely_not_available(), "no supported device (cuda, xla) found"
+    )
     def test_scaling_unscaling_sparse(self):
         pg = DummyProcessGroup(0, 1)
         scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True)
         inv_scale = torch.full((1,), 0.5, dtype=torch.float, device="cpu")
         found_inf = torch.full((1,), 0, dtype=torch.float, device="cpu")
 
-        i = torch.tensor([[0, 1, 1],
-                          [2, 0, 2]], device="cpu", dtype=torch.int64)
+        i = torch.tensor([[0, 1, 1], [2, 0, 2]], device="cpu", dtype=torch.int64)
         v = torch.tensor([16.0, 32.0, 64.0], dtype=torch.float, device="cpu")
-        s = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float)
+        s = torch.sparse_coo_tensor(
+            i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float
+        )
 
         # unscale sparse tensors
         s1 = s.clone()
@@ -95,29 +102,34 @@ def test_scaling_unscaling_sparse(self):
         self.assertEqual(s1.grad.to_dense(), (s / 2).to_dense())
 
         # unscale sparse tensor: inf
-        v = torch.tensor([16.0, 32.0, float('inf')], dtype=torch.float, device="cpu")
-        s1.grad = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float)
+        v = torch.tensor([16.0, 32.0, float("inf")], dtype=torch.float, device="cpu")
+        s1.grad = torch.sparse_coo_tensor(
+            i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float
+        )
         found_inf.zero_()
         found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device]
         self.assertEqual(found_inf, 1.0)
 
         # unscale sparse tensor: overflow (marked as inf)
-        i = torch.tensor([[1, 1, 1],
-                          [0, 0, 2]], device="cpu", dtype=torch.int64)
+        i = torch.tensor([[1, 1, 1], [0, 0, 2]], device="cpu", dtype=torch.int64)
         # coalescing sparse tensor here will cause the value to be Inf
         v = torch.tensor([2**15, 2**15, 1.0], dtype=torch.float16, device="cpu")
-        s1 = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float16)
+        s1 = torch.sparse_coo_tensor(
+            i, v, torch.Size([2, 3]), device="cpu", dtype=torch.float16
+        )
         s1.grad = s1.clone()
         found_inf.zero_()
         found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf)[s1.device]
         self.assertEqual(found_inf, 1.0)
 
-    @unittest.skipIf(amp_definitely_not_available(), "no supported device (cuda, xla) found")
+    @unittest.skipIf(
+        amp_definitely_not_available(), "no supported device (cuda, xla) found"
+    )
     def test_inf_gradients_skip_optim_step(self):
         pg = DummyProcessGroup(0, 1)
         scaler = ShardedGradScaler(init_scale=2.0, process_group=pg, enabled=True)
         loss = torch.full((1,), 4.0, dtype=torch.float32, device="cpu")
-        t0 = torch.tensor([float('inf')], dtype=torch.float32, device="cpu")
+        t0 = torch.tensor([float("inf")], dtype=torch.float32, device="cpu")
         t0.grad = t0.clone()
         opt = torch.optim.SGD([t0], lr=1.0)
         scaler.scale(loss)
@@ -127,10 +139,7 @@ def test_inf_gradients_skip_optim_step(self):
 
 class TestShardedGradScalerParityWithDDP(FSDPTest):
     def _get_init_modes_for_test(self, cpu_offload):
-        modes = [
-            CUDAInitMode.CUDA_AFTER,
-            CUDAInitMode.CUDA_BEFORE
-        ]
+        modes = [CUDAInitMode.CUDA_AFTER, CUDAInitMode.CUDA_BEFORE]
         # Note that CUDAInitMode.CUDA_NEVER works currently only with CPU
         # offload as we explicitly bring the param back to CUDA device. In
         # general, it will not work since we try to all_gather p.data which is
@@ -149,11 +158,15 @@ def test_fsdp_ddp_parity_with_grad_scaler(
         mixed_precision: Optional[str],
     ):
         init_modes = self._get_init_modes_for_test(cpu_offload)
-        mp = MixedPrecision(
-            param_dtype=torch.float16,
-            reduce_dtype=torch.float16,
-            buffer_dtype=torch.float16,
-        ) if mixed_precision is not None else None
+        mp = (
+            MixedPrecision(
+                param_dtype=torch.float16,
+                reduce_dtype=torch.float16,
+                buffer_dtype=torch.float16,
+            )
+            if mixed_precision is not None
+            else None
+        )
         for cuda_init_mode in init_modes:
             self._test_fsdp_parity(
                 NestedWrappedModule,
diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py
index 6592ec108f074..f5a401590414a 100644
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@@ -13,10 +13,10 @@
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     checkpoint_wrapper,
 )
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp import (
     CPUOffload,
     FullStateDictConfig,
+    FullyShardedDataParallel as FSDP,
     LocalStateDictConfig,
     MixedPrecision,
     ShardedStateDictConfig,
@@ -24,36 +24,27 @@
 )
 from torch.distributed.fsdp._shard_utils import _gather_state_dict
 from torch.distributed.fsdp.fully_sharded_data_parallel import FLAT_PARAM
-from torch.distributed.fsdp.wrap import (
-    enable_wrap,
-    transformer_auto_wrap_policy,
-    wrap,
-)
-from torch.nn import (
-    Linear,
-    Module,
-    TransformerDecoderLayer,
-    TransformerEncoderLayer,
-)
+from torch.distributed.fsdp.wrap import enable_wrap, transformer_auto_wrap_policy, wrap
+from torch.nn import Linear, Module, TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel import DistributedDataParallel
 from torch.optim import SGD
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
+    _assert_module_states,
+    _get_state_dict,
+    _zero_model,
     CUDAInitMode,
     FSDPInitMode,
     FSDPTest,
+    get_full_params,
     SkipModel,
     TransformerWithSharedParams,
-    _assert_module_states,
-    _get_state_dict,
-    _zero_model,
-    get_full_params,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -71,7 +62,7 @@
 OUTER_SHAPE = [4, 5]
 BUFFER_SHAPE = [5, 5]
 
-NON_ROOT_FSDP_PREFIX = 'non_fsdp_lin'
+NON_ROOT_FSDP_PREFIX = "non_fsdp_lin"
 
 _UNFLATTENED_STATE_DICT_IMPLS = ["state_dict", "sharded_state_dict"]
 _FLATTENED_STATE_DICT_IMPLS = ["local_state_dict"]
@@ -96,7 +87,9 @@ def __init__(self, wrap_fsdp, register_buffers=False, ignore_inner=False):
                 "non_persistent_buffer", torch.randn(BUFFER_SHAPE), persistent=False
             )
         if wrap_fsdp:
-            self.inner = FSDP(self.inner, ignored_modules=([self.inner] if ignore_inner else []))
+            self.inner = FSDP(
+                self.inner, ignored_modules=([self.inner] if ignore_inner else [])
+            )
         self.outer = Linear(*OUTER_SHAPE)
         if register_buffers:
             self.outer.register_buffer("buffer", torch.randn(BUFFER_SHAPE))
@@ -135,7 +128,9 @@ def _compare_models(self, model, model_new, assert_fn, check_fp16=False):
                     for tensor in model_new.parameters():
                         self.assertEqual(tensor.dtype, torch.float16)
 
-    def _get_simple_nested_model(self, *fsdp_args, wrap=True, checkpoint_wrap=False, **fsdp_kwargs):
+    def _get_simple_nested_model(
+        self, *fsdp_args, wrap=True, checkpoint_wrap=False, **fsdp_kwargs
+    ):
         if wrap:
             lin1 = nn.Linear(10, 10, bias=False).cuda()
             lin2 = nn.Linear(10, 10, bias=False).cuda()
@@ -148,7 +143,8 @@ def _get_simple_nested_model(self, *fsdp_args, wrap=True, checkpoint_wrap=False,
             model = FSDP(seq, *fsdp_args, **fsdp_kwargs)
         else:
             model = nn.Sequential(
-                nn.Linear(10, 10, bias=False).cuda(), nn.Linear(10, 10, bias=False).cuda()
+                nn.Linear(10, 10, bias=False).cuda(),
+                nn.Linear(10, 10, bias=False).cuda(),
             )
         return model
 
@@ -225,20 +221,24 @@ def _validate_state_dict_contents(
     @skip_if_lt_x_gpu(2)
     @parametrize("state_dict_type", _UNFLATTENED_STATE_DICT_IMPLS)
     @parametrize("checkpoint_wrap", ["first", "second", "both"])
-    def test_fsdp_state_dict_with_activation_checkpoint(self, state_dict_type, checkpoint_wrap):
+    def test_fsdp_state_dict_with_activation_checkpoint(
+        self, state_dict_type, checkpoint_wrap
+    ):
         """Tests saving the state dict, zeroing a target model's parameters, and
         loading the state dict, where the source and target models may have a
         checkpoint wrapper."""
         for model_call in [
             partial(self._get_simple_model),
-            partial(self._get_simple_nested_model)
+            partial(self._get_simple_nested_model),
         ]:
             model = model_call(checkpoint_wrap=(checkpoint_wrap in ["first", "both"]))
             with FSDP.state_dict_type(model, STATE_DICT_MAPPING[state_dict_type]):
                 state_dict = _gather_state_dict(_get_state_dict(model, False, False))
                 # Possibly wrap new model in activation checkpoint wrapper to test save/
                 # load with this wrapper
-                model_new = model_call(checkpoint_wrap=(checkpoint_wrap in ["second", "both"]))
+                model_new = model_call(
+                    checkpoint_wrap=(checkpoint_wrap in ["second", "both"])
+                )
                 _zero_model(model_new)
                 self._compare_models(model, model_new, self.assertNotEqual)
                 # Would fail if checkpoint_wrapper did not correctly implement state_dict pre/post hooks
@@ -250,16 +250,14 @@ def test_fsdp_state_dict_with_activation_checkpoint(self, state_dict_type, check
     def test_state_dict_with_shared_parameters(self, state_dict_type):
         auto_wrap_policy = partial(
             transformer_auto_wrap_policy,
-            transformer_layer_cls={
-                TransformerEncoderLayer, TransformerDecoderLayer
-            },
+            transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer},
         )
         model_creator = partial(
             TransformerWithSharedParams.init,
             self.process_group,
             FSDPInitMode.RECURSIVE,
             CUDAInitMode.CUDA_BEFORE,
-            {"auto_wrap_policy": auto_wrap_policy}
+            {"auto_wrap_policy": auto_wrap_policy},
         )
 
         fsdp_model = model_creator()
@@ -293,10 +291,14 @@ def test_state_dict_rank0_offload_save_load_flow(self, use_orig_params: bool):
         )
         # Force model parameters and buffers to be nonzero
         with FSDP.summon_full_params(fsdp_model):
-            for tensor in itertools.chain(fsdp_model.parameters(), fsdp_model.buffers()):
+            for tensor in itertools.chain(
+                fsdp_model.parameters(), fsdp_model.buffers()
+            ):
                 if torch.count_nonzero(tensor) == 0:
                     with torch.no_grad():
-                        tensor.add_(torch.tensor(1, dtype=tensor.dtype, device=tensor.device))
+                        tensor.add_(
+                            torch.tensor(1, dtype=tensor.dtype, device=tensor.device)
+                        )
         with self._get_state_dict_mgr(fsdp_model, "state_dict", True):
             state_dict = deepcopy(_get_state_dict(fsdp_model))
         # Initialize a non-wrapped model on all ranks
@@ -357,15 +359,26 @@ def test_basic_save_and_load_state_dict(
         with various configs such as fp16 and cpu offload and parameters
         match as expected.
         """
-        if (
-            (state_dict_rank0_and_offload and state_dict_type != "state_dict")
-            or (use_orig_params and state_dict_type not in _UNFLATTENED_STATE_DICT_IMPLS)
+        if (state_dict_rank0_and_offload and state_dict_type != "state_dict") or (
+            use_orig_params and state_dict_type not in _UNFLATTENED_STATE_DICT_IMPLS
         ):
             return  # not supported
         for model_call in [
-            partial(self._get_non_fsdp_root_module, cpu_offload=cpu_offload, use_orig_params=use_orig_params),
-            partial(self._get_simple_nested_model, cpu_offload=cpu_offload, use_orig_params=use_orig_params),
-            partial(self._get_simple_model, cpu_offload=cpu_offload, use_orig_params=use_orig_params),
+            partial(
+                self._get_non_fsdp_root_module,
+                cpu_offload=cpu_offload,
+                use_orig_params=use_orig_params,
+            ),
+            partial(
+                self._get_simple_nested_model,
+                cpu_offload=cpu_offload,
+                use_orig_params=use_orig_params,
+            ),
+            partial(
+                self._get_simple_model,
+                cpu_offload=cpu_offload,
+                use_orig_params=use_orig_params,
+            ),
         ]:
             model = model_call()
 
@@ -377,10 +390,15 @@ def test_basic_save_and_load_state_dict(
                     model, cpu_offload.offload_params, fp16
                 )
 
-            ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k]
+            ignore_keys = [
+                k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k
+            ]
 
             self._validate_state_dict_contents(
-                model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=ignore_keys,
+                model,
+                fsdp_state_dict,
+                state_dict_rank0_and_offload,
+                ignore_keys=ignore_keys,
             )
             if fp16:
                 # Verify fp16 is the type
@@ -465,7 +483,9 @@ def test_save_and_load_after_forward_state_dict(
                 for sharded_tensor in state_dict.values():
                     shard = sharded_tensor._local_shards[0]
                     shard.tensor = shard.tensor.clone().detach_()
-        self._validate_state_dict_contents(model, state_dict, state_dict_rank0_and_offload)
+        self._validate_state_dict_contents(
+            model, state_dict, state_dict_rank0_and_offload
+        )
         _zero_model(model)
 
         # Ensure checkpointed params have the full param dtype
@@ -562,7 +582,9 @@ def test_state_dict_save_load_flow(self, state_dict_type):
         for move_to_cpu in [True, False]:
             with self.subTest(move_to_cpu=move_to_cpu):
                 fsdp_params = self._dist_train(
-                    wrap_fsdp=True, state_dict_type=state_dict_type, move_to_cpu=move_to_cpu,
+                    wrap_fsdp=True,
+                    state_dict_type=state_dict_type,
+                    move_to_cpu=move_to_cpu,
                 )
                 ddp_params = self._dist_train(wrap_fsdp=False)
                 self.assertEqual(ddp_params, fsdp_params)
@@ -572,7 +594,9 @@ def test_state_dict_save_load_flow(self, state_dict_type):
     def test_fsdp_state_dict_keys(self, state_dict_type):
         state_dict = self._state_dict(self._initialize_model(True), state_dict_type)
         if state_dict_type == "local_state_dict":
-            self.assertEqual(set([FLAT_PARAM, f"inner.{FLAT_PARAM}"]), state_dict.keys())
+            self.assertEqual(
+                set([FLAT_PARAM, f"inner.{FLAT_PARAM}"]), state_dict.keys()
+            )
         elif state_dict_type in ("state_dict", "sharded_state_dict"):
             # Keys should match local model.
             local_model = self._initialize_model(wrap_fsdp=False, wrap_ddp=False)
@@ -586,7 +610,10 @@ def test_fsdp_state_dict_keys(self, state_dict_type):
     @parametrize("state_dict_rank0_and_offload", [True, False])
     @parametrize("fsdp_root", [True, False])
     def test_state_dict_load_into_local_module(
-        self, state_dict_type, state_dict_rank0_and_offload, fsdp_root,
+        self,
+        state_dict_type,
+        state_dict_rank0_and_offload,
+        fsdp_root,
     ):
         """
         Tests that FSDP's state_dict can be loaded into a local model.
@@ -599,7 +626,9 @@ def test_state_dict_load_into_local_module(
             model = self._initialize_model(wrap_fsdp=True, register_buffers=True)
         optim = SGD(model.parameters(), lr=0.1)
         if not fsdp_root:
-            in_data = torch.randn(1, 10, requires_grad=True, device=torch.device("cuda"))
+            in_data = torch.randn(
+                1, 10, requires_grad=True, device=torch.device("cuda")
+            )
         else:
             in_data = torch.rand(64, 4, requires_grad=True, device=torch.device("cuda"))
         for _ in range(3):
@@ -620,7 +649,10 @@ def test_state_dict_load_into_local_module(
 
         ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k]
         self._validate_state_dict_contents(
-            model, fsdp_state_dict, state_dict_rank0_and_offload, ignore_keys=ignore_keys,
+            model,
+            fsdp_state_dict,
+            state_dict_rank0_and_offload,
+            ignore_keys=ignore_keys,
         )
         # Create zeroed local model
         if not fsdp_root:
@@ -749,10 +781,14 @@ def test_wrong_state_dict_config(self):
     @parametrize("state_dict_type", _UNFLATTENED_STATE_DICT_IMPLS)
     @parametrize("prefix", [True, False])
     @parametrize("ignore_inner", [True, False])
-    def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_inner):
+    def test_state_dict_with_ignored_modules(
+        self, state_dict_type, prefix, ignore_inner
+    ):
         # Initialize an FSDP-wrapped model with an ignored module that includes
         # both parameters and a buffer
-        model = Model(wrap_fsdp=True, register_buffers=True, ignore_inner=ignore_inner).cuda()
+        model = Model(
+            wrap_fsdp=True, register_buffers=True, ignore_inner=ignore_inner
+        ).cuda()
         ignored_modules = [model.outer]
         ignored_tensor_to_tensor_name = {
             model.outer.bias: "outer.bias",
@@ -767,7 +803,8 @@ def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_i
         # Note that when model.inner is not ignored this test also ensures
         # non-ignored buffers are not cloned.
         buffer_to_buffer_name = {
-            model.inner.buffer: "inner.buffer", model.outer.buffer: "outer.buffer",
+            model.inner.buffer: "inner.buffer",
+            model.outer.buffer: "outer.buffer",
         }
         fsdp_model = FSDP(model, ignored_modules=ignored_modules)
         prefix_str = "foo." if prefix else ""
@@ -782,7 +819,11 @@ def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_i
         }.items():
             prefixed_tensor_name = f"{prefix_str}{tensor_name}"
             self.assertTrue(prefixed_tensor_name in sd1)
-            self.assertEqual(tensor.data_ptr(), sd1[prefixed_tensor_name].data_ptr(), f"{prefixed_tensor_name}")
+            self.assertEqual(
+                tensor.data_ptr(),
+                sd1[prefixed_tensor_name].data_ptr(),
+                f"{prefixed_tensor_name}",
+            )
         # Check that the state dict can be loaded into a non-wrapped version of
         # the model
         nonwrapped_model = Model(wrap_fsdp=False, register_buffers=True).cuda()
@@ -790,7 +831,7 @@ def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_i
             with torch.no_grad():
                 param.zero_()
 
-        to_load = {k[len(prefix_str):] : v for k, v in sd1.items()}
+        to_load = {k[len(prefix_str) :]: v for k, v in sd1.items()}
         nonwrapped_model.load_state_dict(to_load, strict=True)
         local_params = list(nonwrapped_model.parameters())
         for fsdp_param, local_param in zip(fsdp_params, local_params):
@@ -806,7 +847,10 @@ def test_state_dict_with_ignored_modules(self, state_dict_type, prefix, ignore_i
             prefixed_tensor_name = f"{prefix_str}{tensor_name}"
             self.assertTrue(prefixed_tensor_name in sd2)
             self.assertEqual(tensor.data_ptr(), sd2[prefixed_tensor_name].data_ptr())
-            self.assertEqual(sd1[prefixed_tensor_name].data_ptr(), sd2[prefixed_tensor_name].data_ptr())
+            self.assertEqual(
+                sd1[prefixed_tensor_name].data_ptr(),
+                sd2[prefixed_tensor_name].data_ptr(),
+            )
 
     @skip_if_lt_x_gpu(2)
     def test_state_dict_type(self):
diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py
index 82fd8e1c0737b..5b995a9ab23f6 100644
--- a/test/distributed/fsdp/test_fsdp_summon_full_params.py
+++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py
@@ -9,9 +9,12 @@
 import torch
 import torch.nn as nn
 from torch import distributed as dist
-from torch.distributed.fsdp import CPUOffload
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
+from torch.distributed.fsdp import (
+    CPUOffload,
+    FullyShardedDataParallel as FSDP,
+    MixedPrecision,
+    ShardingStrategy,
+)
 from torch.distributed.fsdp.flat_param import FlatParamHandle
 from torch.distributed.fsdp.wrap import enable_wrap, wrap
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
@@ -25,10 +28,10 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -129,7 +132,9 @@ def test_summon_full_param_writeback(self):
     @skip_if_lt_x_gpu(2)
     @parametrize("mixed_precision", [True, False])
     def test_summon_full_param_shard_value(self, mixed_precision):
-        mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        mixed_precision = (
+            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        )
         raw_model = nn.Linear(10, 11)
         raw_model_size = self.get_model_param_count(raw_model)
         expected_shard_size = self.get_expected_sharded_size(raw_model_size)
@@ -159,7 +164,9 @@ def test_summon_full_param_shard_value(self, mixed_precision):
     @parametrize("summon_outer", [True, False])
     @parametrize("mixed_precision", [True, False])
     def test_summon_full_param_recursive(self, recurse, summon_outer, mixed_precision):
-        mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        mixed_precision = (
+            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        )
         model = FSDP(
             nn.Sequential(
                 FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision),
@@ -239,9 +246,7 @@ def test_summon_full_params_respects_reshard_after_forward(self):
         )
 
     def _test_summon_full_params_respects_reshard_after_forward(
-        self,
-        mixed_precision: Optional[MixedPrecision],
-        use_orig_params: bool
+        self, mixed_precision: Optional[MixedPrecision], use_orig_params: bool
     ):
         fsdp_kwargs = {
             "mixed_precision": mixed_precision,
@@ -373,7 +378,9 @@ def __init__(self, fsdp_1, fsdp_2, fsdp_3):
     def test_reshard_outside_forward_backward_iteration(
         self, rank0_only, offload_to_cpu, mixed_precision
     ):
-        mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        mixed_precision = (
+            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        )
         model = FSDP(
             nn.Sequential(
                 FSDP(nn.Linear(5, 5, bias=False), mixed_precision=mixed_precision),
@@ -437,7 +444,9 @@ def test_reshard_outside_forward_backward_iteration(
     def test_params_are_unflattenned(self, rank0_only, offload_to_cpu, mixed_precision):
         layer_shape = (10, 12)
         model = nn.Linear(*layer_shape, bias=False).cuda(self.rank)
-        mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        mixed_precision = (
+            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        )
         fsdp_model = FSDP(deepcopy(model), mixed_precision=mixed_precision).cuda(
             self.rank
         )
@@ -486,7 +495,9 @@ def test_params_count_and_value(
         offload_to_cpu: bool,
         mixed_precision: bool,
     ):
-        mixed_precision = MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        mixed_precision = (
+            MixedPrecision(param_dtype=torch.float16) if mixed_precision else None
+        )
         model = NestedWrappedModule.init(
             self.process_group,
             FSDPInitMode.NO_FSDP,
@@ -624,10 +635,13 @@ def _check_grads(
                     assert torch.count_nonzero(p2.grad) > 0
                     p2.grad *= WRITEBACK_FACTOR
             new_fsdp_grads = [
-                param.grad for param in fsdp_model.parameters()
+                param.grad
+                for param in fsdp_model.parameters()
                 if param.grad is not None
             ]
-            writeback_persists = writeback or sharding_strategy == ShardingStrategy.NO_SHARD
+            writeback_persists = (
+                writeback or sharding_strategy == ShardingStrategy.NO_SHARD
+            )
             for old_grad, new_grad in zip(old_fsdp_grads, new_fsdp_grads):
                 if writeback_persists:
                     torch.testing.assert_close(old_grad * WRITEBACK_FACTOR, new_grad)
@@ -640,14 +654,16 @@ def _check_grads(
 
         def _get_error_context(is_supported: bool):
             return (
-                contextlib.suppress() if is_supported
+                contextlib.suppress()
+                if is_supported
                 else self.assertRaises(NotImplementedError)
             )  # some configs not implemented yet
 
         def _get_fsdp_grads(fsdp_model: FSDP, is_supported: bool):
             if is_supported:
                 return [
-                    param.grad.clone() for param in fsdp_model.parameters()
+                    param.grad.clone()
+                    for param in fsdp_model.parameters()
                     if param.grad is not None
                 ]
             return None  # unused
@@ -706,7 +722,7 @@ def test_with_grads_none_grads(self):
                     ShardingStrategy.NO_SHARD,
                 ]
             },
-            self._test_with_grads_none_grads
+            self._test_with_grads_none_grads,
         )
 
     def _test_with_grads_none_grads(self, sharding_strategy: ShardingStrategy):
diff --git a/test/distributed/fsdp/test_fsdp_traversal.py b/test/distributed/fsdp/test_fsdp_traversal.py
index e1b0a77cfe791..b9c7a0aeac9b2 100644
--- a/test/distributed/fsdp/test_fsdp_traversal.py
+++ b/test/distributed/fsdp/test_fsdp_traversal.py
@@ -11,10 +11,7 @@
     FSDPTest,
     NestedWrappedModule,
 )
-from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
-    run_tests,
-)
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
@@ -42,18 +39,20 @@ def test_fsdp_modules(self):
         )
         modules = FSDP.fsdp_modules(nested_wrapped_module)
         self.assertEquals(
-            modules, [
+            modules,
+            [
                 nested_wrapped_module.module.get_submodule("1"),
                 nested_wrapped_module.module.get_submodule("1").get_submodule("0"),
                 nested_wrapped_module.module.get_submodule("2"),
-            ]
+            ],
         )
         modules = FSDP.fsdp_modules(nested_wrapped_module, root_only=True)
         self.assertEqual(
-            modules, [
+            modules,
+            [
                 nested_wrapped_module.module.get_submodule("1"),
                 nested_wrapped_module.module.get_submodule("2"),
-            ]
+            ],
         )
 
 
diff --git a/test/distributed/fsdp/test_fsdp_uneven.py b/test/distributed/fsdp/test_fsdp_uneven.py
index 295afbce508bc..6ffeb279b617b 100644
--- a/test/distributed/fsdp/test_fsdp_uneven.py
+++ b/test/distributed/fsdp/test_fsdp_uneven.py
@@ -8,11 +8,8 @@
 from torch.nn import Linear
 from torch.optim import SGD
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import (
-    FSDPTest,
-)
-from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN, run_tests
-
+from torch.testing._internal.common_fsdp import FSDPTest
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py
index 1091200206135..81657dcfae5e1 100644
--- a/test/distributed/fsdp/test_fsdp_use_orig_params.py
+++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py
@@ -8,16 +8,14 @@
 import torch
 import torch.nn as nn
 from torch import distributed as dist
-from torch.distributed.fsdp import BackwardPrefetch, CPUOffload
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp import ShardingStrategy
-from torch.distributed.fsdp.fully_sharded_data_parallel import (
-    clean_tensor_name,
-)
-from torch.distributed.fsdp.wrap import (
-    always_wrap_policy,
-    transformer_auto_wrap_policy,
+from torch.distributed.fsdp import (
+    BackwardPrefetch,
+    CPUOffload,
+    FullyShardedDataParallel as FSDP,
+    ShardingStrategy,
 )
+from torch.distributed.fsdp.fully_sharded_data_parallel import clean_tensor_name
+from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
@@ -28,10 +26,10 @@
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 if not dist.is_available():
@@ -361,11 +359,14 @@ def _test_multiple_optimizers(self, sharding_strategy: ShardingStrategy):
         ddp_model = self._get_ddp_transformer(find_unused_params=True)
         ddp_param_groups = self._get_param_groups(ddp_model)
         assert len(ddp_param_groups) == 3, f"{len(ddp_param_groups)}"
-        fsdp_model, _ = self._get_fsdp_transformer_and_optim(  # ignore returned optimizer
+        (
+            fsdp_model,
+            _,
+        ) = self._get_fsdp_transformer_and_optim(  # ignore returned optimizer
             cuda_init_mode=CUDAInitMode.CUDA_BEFORE,
             init_optim_before_wrap=False,
             optim_class=torch.optim.Adam,  # ignored
-            multi_tensor=False,            # ignored
+            multi_tensor=False,  # ignored
             sharding_strategy=sharding_strategy,
             backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
             cpu_offload=None,
@@ -386,7 +387,9 @@ def _test_multiple_optimizers(self, sharding_strategy: ShardingStrategy):
         ]
 
         for optim_ctor, ddp_param_group, fsdp_param_group in zip(
-            optim_ctors, ddp_param_groups[:2], fsdp_param_groups[:2],
+            optim_ctors,
+            ddp_param_groups[:2],
+            fsdp_param_groups[:2],
         ):
             ddp_optims.append(optim_ctor(ddp_param_group["params"]))
             fsdp_optims.append(optim_ctor(fsdp_param_group["params"]))
@@ -406,7 +409,7 @@ def _test_multiple_optimizers(self, sharding_strategy: ShardingStrategy):
                         has_weight = True
                     elif "bias" in fqn and param.numel() > 0:
                         has_bias = True
-                has_both |= (has_weight and has_bias)
+                has_both |= has_weight and has_bias
         assert has_both, (
             f"Rank {self.rank} does not have a `FlatParameter` with both a "
             "weight and a bias in its shard, meaning that this test is vacuous"
@@ -440,7 +443,8 @@ def run_iter():
         # Check that FSDP correctly exposes gradients even after forward
         # (namely, `None` for weights and non-`None` for biases)
         for (ddp_n, ddp_p), (fsdp_n, fsdp_p) in zip(
-            ddp_model.module.named_parameters(), fsdp_model.named_parameters(),
+            ddp_model.module.named_parameters(),
+            fsdp_model.named_parameters(),
         ):
             self.assertEqual(ddp_n, fsdp_n)
             if fsdp_p.numel() == 0:
diff --git a/test/distributed/fsdp/test_utils.py b/test/distributed/fsdp/test_utils.py
index 2aa7fa0b6d97e..6ac2f78be7150 100644
--- a/test/distributed/fsdp/test_utils.py
+++ b/test/distributed/fsdp/test_utils.py
@@ -2,9 +2,10 @@
 
 import random
 import sys
-from typing import List
 import unittest
 from collections import OrderedDict
+from dataclasses import dataclass
+from typing import List
 
 import torch
 import torch.nn as nn
@@ -12,14 +13,13 @@
 from torch.distributed.fsdp._utils import _apply_to_tensors
 from torch.distributed.utils import _replace_by_prefix
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
-    TestCase,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
     subtest,
+    TEST_WITH_DEV_DBG_ASAN,
+    TestCase,
 )
-from dataclasses import dataclass
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
@@ -60,8 +60,6 @@ class SomeDataClass:
             some_float: float
             some_tensor: List[torch.Tensor]
 
-
-
         # create a mixed bag of data.
         data = [1, "str"]
         data.append({"key1": get_a_tensor(), "key2": {1: get_a_tensor()}, "key3": 3})
@@ -100,7 +98,6 @@ def test_replace_by_prefix(self):
         _replace_by_prefix(state_dict, "module.layer.", "layer.")
         assert state_dict == original_state_dict
 
-
     def test_packed_sequence(self):
         """Test to ensure RNN packed sequences are modified correctly."""
         rnn = nn.RNN(5, 5)
diff --git a/test/distributed/fsdp/test_wrap.py b/test/distributed/fsdp/test_wrap.py
index 98ba324f46f18..cd0d11ba9b4b1 100644
--- a/test/distributed/fsdp/test_wrap.py
+++ b/test/distributed/fsdp/test_wrap.py
@@ -4,7 +4,7 @@
 import os
 import tempfile
 import unittest
-from enum import Enum, auto
+from enum import auto, Enum
 
 import torch
 import torch.nn as nn
@@ -12,8 +12,6 @@
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     BackwardPrefetch,
     CPUOffload,
-)
-from torch.distributed.fsdp.fully_sharded_data_parallel import (
     FullyShardedDataParallel as FSDP,
 )
 from torch.distributed.fsdp.wrap import (
@@ -28,20 +26,20 @@
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
+    _maybe_cuda,
     CUDAInitMode,
     DummyProcessGroup,
     FSDPInitMode,
     FSDPTest,
     TransformerWithSharedParams,
-    _maybe_cuda,
 )
 from torch.testing._internal.common_utils import (
     FILE_SCHEMA,
-    TestCase,
     find_free_port,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TestCase,
 )
 
 
@@ -54,6 +52,7 @@ def __init__(self):
         self.bn3 = nn.BatchNorm3d(10)
         self.sync_bn = nn.SyncBatchNorm(10)
 
+
 class WrapMethod(Enum):
     FSDP_CTOR = auto()
     # FSDP_CTOR is the supported way forward, but keep WRAP_API in case we miss
@@ -61,8 +60,6 @@ class WrapMethod(Enum):
     WRAP_API = auto()
 
 
-
-
 class TestFSDPWrap(FSDPTest):
     """
     Tests main API for wrapping FSDP, which is to pass auto_wrap_policy into
@@ -144,7 +141,9 @@ def test_error_already_wrapped(self, nested, cuda_init_mode):
         Test that an error is raised if we attempt to wrap when submodules are
         already FSDP.
         """
-        wrapped_fsdp = self._get_already_wrapped_fsdp(nested=nested, cuda_init_mode=cuda_init_mode)
+        wrapped_fsdp = self._get_already_wrapped_fsdp(
+            nested=nested, cuda_init_mode=cuda_init_mode
+        )
         if cuda_init_mode == CUDAInitMode.CUDA_AFTER:
             wrapped_fsdp = wrapped_fsdp.cuda()
 
@@ -159,9 +158,10 @@ def never_wrap_policy(*args, **kwargs):
 
         policy = (
             functools.partial(
-                _or_policy,
-                policies=[never_wrap_policy, _wrap_batchnorm_individually]
-            ) if use_or_policy else _wrap_batchnorm_individually
+                _or_policy, policies=[never_wrap_policy, _wrap_batchnorm_individually]
+            )
+            if use_or_policy
+            else _wrap_batchnorm_individually
         )
         model = BatchNormNet()
         fsdp = FSDP(model, auto_wrap_policy=policy)
@@ -178,6 +178,7 @@ def test_bn_always_wrapped_individually(self):
         if the other policy results in a module containing a BN unit being
         wrapped, the contained BN unit will still be individually wrapped.
         """
+
         class MyModule(nn.Module):
             def __init__(self):
                 super().__init__()
@@ -189,8 +190,7 @@ def wrap_bn_container(module, recurse, *args, **kwargs):
             return isinstance(module, BatchNormNet)
 
         my_policy = functools.partial(
-            _or_policy,
-            policies=[wrap_bn_container, _wrap_batchnorm_individually]
+            _or_policy, policies=[wrap_bn_container, _wrap_batchnorm_individually]
         )
         mod = MyModule()
         fsdp = FSDP(mod, auto_wrap_policy=my_policy)
@@ -203,7 +203,7 @@ def wrap_bn_container(module, recurse, *args, **kwargs):
             fsdp.bn_container.bn1,
             fsdp.bn_container.bn2,
             fsdp.bn_container.bn3,
-            fsdp.bn_container.sync_bn
+            fsdp.bn_container.sync_bn,
         ]:
             self.assertTrue(isinstance(bn, FSDP))
 
@@ -216,24 +216,21 @@ def wrap_bn_container(module, recurse, *args, **kwargs):
             fsdp.bn_container.bn1,
             fsdp.bn_container.bn2,
             fsdp.bn_container.bn3,
-            fsdp.bn_container.sync_bn
+            fsdp.bn_container.sync_bn,
         ]:
             self.assertFalse(isinstance(bn, FSDP))
 
     @skip_if_lt_x_gpu(2)
     @parametrize(
         "cpu_offload",
-        [CPUOffload(offload_params=False), CPUOffload(offload_params=True)]
+        [CPUOffload(offload_params=False), CPUOffload(offload_params=True)],
     )
     @parametrize(
         "backward_prefetch",
-        [BackwardPrefetch.BACKWARD_POST, BackwardPrefetch.BACKWARD_PRE]
+        [BackwardPrefetch.BACKWARD_POST, BackwardPrefetch.BACKWARD_PRE],
     )
     @parametrize("forward_prefetch", [False, True])
-    @parametrize(
-        "cuda_init_mode",
-        [CUDAInitMode.CUDA_AFTER, CUDAInitMode.CUDA_BEFORE]
-    )
+    @parametrize("cuda_init_mode", [CUDAInitMode.CUDA_AFTER, CUDAInitMode.CUDA_BEFORE])
     def test_main_wrap_api(
         self,
         cpu_offload: CPUOffload,
@@ -286,7 +283,7 @@ def forward(self, input):
             wrapped_model.module.lin3,
             wrapped_model.module.lin4.module.nested_lin,
             wrapped_model.module.lin4,
-            wrapped_model
+            wrapped_model,
         ]
 
         for module in modules_in_fsdp_graph_order:
@@ -322,7 +319,9 @@ def test_wrap(self, wrap_method):
             layer = FSDP(
                 nn.Linear(5, 5),
                 process_group=self.process_group,
-                auto_wrap_policy=functools.partial(size_based_auto_wrap_policy, min_num_params=1)
+                auto_wrap_policy=functools.partial(
+                    size_based_auto_wrap_policy, min_num_params=1
+                ),
             )
         self.assertTrue(isinstance(layer, FSDP))
         self.assertEqual(layer.rank, self.process_group.rank())
@@ -362,7 +361,9 @@ def test_always_wrap(self):
         passed into FSDP, all submodules are wrapped.
         """
         seq = TestFSDPWrap.NestedSequentialModel.get_model(cuda=True)
-        model = FSDP(seq, process_group=self.process_group, auto_wrap_policy=always_wrap_policy)
+        model = FSDP(
+            seq, process_group=self.process_group, auto_wrap_policy=always_wrap_policy
+        )
         TestFSDPWrap.NestedSequentialModel.verify_model_all_wrapped(self, model)
 
     @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs")
@@ -383,7 +384,11 @@ def test_transformer_auto_wrap_policy(self):
         encoder_layers = set(fsdp_model.module.transformer.encoder.layers)
         decoder_layers = set(fsdp_model.module.transformer.decoder.layers)
         for module in modules:
-            if module is fsdp_model or module in encoder_layers or module in decoder_layers:
+            if (
+                module is fsdp_model
+                or module in encoder_layers
+                or module in decoder_layers
+            ):
                 self.assertTrue(isinstance(module, FSDP))
             else:
                 self.assertFalse(isinstance(module, FSDP))
@@ -401,7 +406,7 @@ def test_auto_wrap_api(self):
         model = FSDP(
             sequential,
             process_group=self.process_group,
-            auto_wrap_policy=my_auto_wrap_policy
+            auto_wrap_policy=my_auto_wrap_policy,
         )
 
         TestFSDPWrap.NestedSequentialModel.verify_model(self, model)
@@ -420,7 +425,7 @@ def test_auto_wrap_preset_exclude_wrap(self):
         model = FSDP(
             sequential,
             process_group=self.process_group,
-            auto_wrap_policy=my_auto_wrap_policy
+            auto_wrap_policy=my_auto_wrap_policy,
         )
 
         self.assertTrue(isinstance(model, FSDP))
@@ -437,7 +442,11 @@ def test_auto_wrap_preset_exclude_wrap_include_children(self):
         my_auto_wrap_policy = functools.partial(
             size_based_auto_wrap_policy, min_num_params=40
         )
-        model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy)
+        model = FSDP(
+            sequential,
+            process_group=self.process_group,
+            auto_wrap_policy=my_auto_wrap_policy,
+        )
 
         self.assertTrue(isinstance(model, FSDP))
         self.assertTrue(isinstance(model[0], FSDP))
@@ -452,7 +461,11 @@ def test_auto_wrap_preset_force_leaf(self):
         my_auto_wrap_policy = functools.partial(
             size_based_auto_wrap_policy, min_num_params=40
         )
-        model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy)
+        model = FSDP(
+            sequential,
+            process_group=self.process_group,
+            auto_wrap_policy=my_auto_wrap_policy,
+        )
         self.assertTrue(isinstance(model.module[0], FSDP))
         # Assert children of multihead attention are not wrapped
         self.assertTrue(isinstance(model.module[1], nn.MultiheadAttention))
@@ -473,7 +486,11 @@ def test_auto_wrap_preset_force_leaf_custom(self):
         sequential = nn.Sequential(
             nn.Linear(10, 10), nn.ModuleList([nn.Linear(10, 10)])
         )
-        model = FSDP(sequential, process_group=self.process_group, auto_wrap_policy=my_auto_wrap_policy)
+        model = FSDP(
+            sequential,
+            process_group=self.process_group,
+            auto_wrap_policy=my_auto_wrap_policy,
+        )
         # Model was wrapped in FSDP as no inner modules were wrapped.
         self.assertTrue(isinstance(model, FSDP))
         self.assertTrue(isinstance(model.module[0], nn.Linear))
@@ -483,14 +500,12 @@ def test_auto_wrap_preset_force_leaf_custom(self):
     @parametrize("cuda_init_mode", [CUDAInitMode.CUDA_BEFORE, CUDAInitMode.CUDA_AFTER])
     @parametrize(
         "cpu_offload",
-        [CPUOffload(offload_params=False), CPUOffload(offload_params=True)]
+        [CPUOffload(offload_params=False), CPUOffload(offload_params=True)],
     )
     @parametrize("use_device_id", [True, False])
     def test_auto_wrap_smoke_test(self, cuda_init_mode, cpu_offload, use_device_id):
         # CPU offload and CUDA after don't work together as expected.
-        if (
-            cpu_offload.offload_params and cuda_init_mode == CUDAInitMode.CUDA_AFTER
-        ):
+        if cpu_offload.offload_params and cuda_init_mode == CUDAInitMode.CUDA_AFTER:
             return
 
         device = torch.device("cuda")
@@ -515,12 +530,17 @@ def test_auto_wrap_smoke_test(self, cuda_init_mode, cpu_offload, use_device_id):
         # cases where full model cannot be loaded onto GPU, but their shards can.
         cuda_after_init = cuda_init_mode == CUDAInitMode.CUDA_AFTER
         try:
-            sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=(not cuda_after_init))
+            sequential = TestFSDPWrap.NestedSequentialModel.get_model(
+                cuda=(not cuda_after_init)
+            )
             my_auto_wrap_policy = functools.partial(
                 size_based_auto_wrap_policy, min_num_params=40
             )
             model = FSDP(
-                sequential, cpu_offload=cpu_offload, auto_wrap_policy=my_auto_wrap_policy, device_id=device_id
+                sequential,
+                cpu_offload=cpu_offload,
+                auto_wrap_policy=my_auto_wrap_policy,
+                device_id=device_id,
             )
             TestFSDPWrap.NestedSequentialModel.verify_model(self, model)
             if cuda_after_init:
@@ -568,7 +588,8 @@ def test_auto_wrap_with_ignored_modules(self, wrap_method: WrapMethod):
         sequential = TestFSDPWrap.NestedSequentialModel.get_model(cuda=False)
         ignored_modules = [sequential[1], sequential[2][0]]
         my_auto_wrap_policy = functools.partial(
-            size_based_auto_wrap_policy, min_num_params=40,
+            size_based_auto_wrap_policy,
+            min_num_params=40,
         )
         fsdp_kwargs = {
             "process_group": self.process_group,
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 7fdbe573ed217..3c159313f0890 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -6,15 +6,14 @@
 from abc import ABC, abstractmethod
 from contextlib import suppress
 from copy import deepcopy
-from enum import Enum, auto
+from enum import auto, Enum
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 from unittest import mock
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed.fsdp import CPUOffload
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import CPUOffload, FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     BackwardPrefetch,
     MixedPrecision,
@@ -29,10 +28,7 @@
 )
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
-from torch.testing._internal.common_distributed import (
-    TEST_SKIPS,
-    MultiProcessTestCase,
-)
+from torch.testing._internal.common_distributed import MultiProcessTestCase, TEST_SKIPS
 from torch.testing._internal.common_utils import FILE_SCHEMA, get_cycles_per_ms
 
 
@@ -57,6 +53,7 @@ class CUDAInitMode(Enum):
 class FSDPTestModel(nn.Module, ABC):
     """This defines the interface expected from all models used commonly for
     FSDP unit tests."""
+
     @abstractmethod
     def get_input(self, device) -> Tuple[torch.Tensor, ...]:
         """Returns an input for the model as as tuple."""
@@ -87,7 +84,6 @@ def init(
         ...
 
 
-
 def _assert_module_states(
     model: nn.Module,
     process_group: dist.ProcessGroup,
@@ -116,6 +112,7 @@ def _assert_module_states(
         for (_, p1), (_, p2) in zip(rank0_states, state):
             assert_fn(p1, p2)
 
+
 def _zero_model(
     model: nn.Module,
     zero_buffers: bool = False,
@@ -130,6 +127,7 @@ def _zero_model(
                 with torch.no_grad():
                     buffer.zero_()
 
+
 def _get_state_dict(model, cpu_offload=False, half=False):
     if not cpu_offload:
         model = model.cuda()
@@ -138,11 +136,13 @@ def _get_state_dict(model, cpu_offload=False, half=False):
 
     return model.state_dict()
 
+
 def subtest_name(test_name_mapping, *args):
-    return '_'.join(
+    return "_".join(
         [test_name_mapping[str(s)] if s is not None else "none" for s in args]
     )
 
+
 def get_full_params(model: nn.Module, recurse: bool = True):
     """
     Returns the full unsharded parameters of ``model``. Any FSDP-managed
@@ -156,14 +156,14 @@ def get_full_params(model: nn.Module, recurse: bool = True):
     with FSDP.summon_full_params(model, recurse=recurse):
         return deepcopy(list(model.parameters()))
 
+
 def _maybe_cuda(model: nn.Module, move_to_cuda: bool):
     return model.cuda() if move_to_cuda else model
 
+
 def _maybe_wrap_fsdp(model: nn.Module, wrap_fsdp: bool, *args, **kwargs):
-    return (
-        model if not wrap_fsdp
-        else FSDP(model, *args, **kwargs)
-    )
+    return model if not wrap_fsdp else FSDP(model, *args, **kwargs)
+
 
 class DummyProcessGroup:
     def __init__(self, rank: int, size: int):
@@ -187,13 +187,13 @@ def get_future():
         dist_wait.get_future = get_future
         return dist_wait
 
+
 class DeterministicModel(torch.nn.Module):
     def __init__(self, wrap_fsdp, cpu_offload=CPUOffload(offload_params=False)):
         super().__init__()
         # keep everything deterministic for model initialization
         torch.manual_seed(0)
-        self.inner: Union[torch.nn.Linear, FSDP] = \
-            torch.nn.Linear(2, 2).cuda()
+        self.inner: Union[torch.nn.Linear, FSDP] = torch.nn.Linear(2, 2).cuda()
         if wrap_fsdp:
             self.inner = FSDP(self.inner, cpu_offload=cpu_offload)
         self.outer = torch.nn.Linear(2, 2).cuda()
@@ -202,6 +202,7 @@ def forward(self, x):
         y = self.inner(x)
         return self.outer(y)
 
+
 class TransformerWithSharedParams(FSDPTestModel):
     def __init__(
         self,
@@ -297,7 +298,9 @@ def init(
         if fsdp_kwargs is None:
             fsdp_kwargs = {}
         if fsdp_init_mode == FSDPInitMode.NO_FSDP:
-            return TransformerWithSharedParams(group, cuda_init_mode, add_bn, deterministic)
+            return TransformerWithSharedParams(
+                group, cuda_init_mode, add_bn, deterministic
+            )
         elif fsdp_init_mode == FSDPInitMode.RECURSIVE:
             # Default to the `transformer_auto_wrap_policy()`
             if "auto_wrap_policy" not in fsdp_kwargs:
@@ -311,7 +314,9 @@ def init(
             else:
                 auto_wrap_policy = fsdp_kwargs.pop("auto_wrap_policy")
             fsdp_model = FSDP(
-                TransformerWithSharedParams(group, cuda_init_mode, add_bn, deterministic),
+                TransformerWithSharedParams(
+                    group, cuda_init_mode, add_bn, deterministic
+                ),
                 group,
                 auto_wrap_policy=auto_wrap_policy,
                 **fsdp_kwargs,
@@ -454,6 +459,7 @@ def init(
 class ModuleWithDelay(FSDPTestModel):
     """This class wraps a :class:`FSDPTestModel` to optionally add a delay
     after computing the loss and/or before the gradient reduction."""
+
     def __init__(
         self,
         module: nn.Module,
@@ -519,6 +525,7 @@ def init(
             delay_before_reduction_ms,
         )
 
+
 class NestedWrappedModuleWithDelay(ModuleWithDelay):
     @staticmethod
     def init(
@@ -601,7 +608,7 @@ def __init__(
             _maybe_cuda(nn.Linear(d_input, d_shared), self.move_to_cuda),
             shared,
             expert,
-            _maybe_cuda(nn.Linear(d_shared, d_input), self.move_to_cuda)
+            _maybe_cuda(nn.Linear(d_shared, d_input), self.move_to_cuda),
         )
 
     def forward(self, x):
@@ -738,7 +745,9 @@ def run_subtests(
         # Convert the config mapping to a list to have a fixed order
         subtest_config_items: List[Tuple[str, List[Any]]] = list(subtest_config.items())
         subtest_config_keys: List[str] = [item[0] for item in subtest_config_items]
-        subtest_config_values: List[List[Any]] = [item[1] for item in subtest_config_items]
+        subtest_config_values: List[List[Any]] = [
+            item[1] for item in subtest_config_items
+        ]
         for values in itertools.product(*subtest_config_values):
             # Map keyword to chosen value
             subtest_kwargs = {
@@ -850,7 +859,9 @@ def _train_for_several_steps(
                         model, norm_type, self.rank
                     )
                 else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm, norm_type)
+                    torch.nn.utils.clip_grad_norm_(
+                        model.parameters(), max_norm, norm_type
+                    )
                     total_norm_after_clip = _collect_total_grad_norm_local(
                         model, norm_type
                     )
@@ -910,7 +921,9 @@ def _test_fsdp_parity(
                 wrapper should provide data parallel semantics. If ``None``,
                 then the callable defaults to the DDP constructor.
         """
-        assert fsdp_init_mode != FSDPInitMode.NO_FSDP, "Expects an FSDP init mode that wraps with FSDP"
+        assert (
+            fsdp_init_mode != FSDPInitMode.NO_FSDP
+        ), "Expects an FSDP init mode that wraps with FSDP"
         if init_kwargs is None:
             init_kwargs = {}
         lr = 1e-2
@@ -977,15 +990,20 @@ def _test_fsdp_parity(
         # Offloading parameters with `CUDA_AFTER` should raise an error during
         # lazy initialization due to the parameter devices not being CPU;
         # otherwise, all parameter devices should be CPU
-        expects_device_error = offload_params and cuda_init_mode == CUDAInitMode.CUDA_AFTER
-        expects_cpu_device = offload_params and cuda_init_mode != CUDAInitMode.CUDA_AFTER
+        expects_device_error = (
+            offload_params and cuda_init_mode == CUDAInitMode.CUDA_AFTER
+        )
+        expects_cpu_device = (
+            offload_params and cuda_init_mode != CUDAInitMode.CUDA_AFTER
+        )
         if expects_cpu_device:
             cpu_device = torch.device("cpu")
             for param in fsdp_model.parameters():
                 self.assertEqual(param.device, cpu_device)
         context = (
             self.assertRaisesRegex(AssertionError, "Expected param to be on CPU")
-            if expects_device_error else suppress()
+            if expects_device_error
+            else suppress()
         )
         with context:
             fsdp_loss = self._train_for_several_steps(

From fb5b1006c72692d61d285bbddde86cc5fe66ff02 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@umn.edu>
Date: Thu, 27 Oct 2022 05:15:16 +0000
Subject: [PATCH 0223/1922] use nv_diag_suppress (#87712)

Fixes:
```
/dev/shm/rbarnes/tempfs/pytorch/aten/src/ATen/native/cuda/UnaryFractionKernels.cu(125): warning #20236-D: pragma "diag_suppress" is deprecated, use "nv_diag_suppress" instead

/dev/shm/rbarnes/tempfs/pytorch/aten/src/ATen/native/cuda/UnaryFractionKernels.cu(125): warning #20236-D: pragma "diag_suppress" is deprecated, use "nv_diag_suppress" instead

/dev/shm/rbarnes/tempfs/pytorch/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu(73): warning #20236-D: pragma "diag_suppress" is deprecated, use "nv_diag_suppress" instead

/dev/shm/rbarnes/tempfs/pytorch/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu(73): warning #20236-D: pragma "diag_suppress" is deprecated, use "nv_diag_suppress" instead
```

cc @ngimel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87712
Approved by: https://github.com/soumith
---
 aten/src/ATen/native/cuda/UnaryFractionKernels.cu | 2 +-
 aten/src/ATen/native/sparse/cuda/SparseMatMul.cu  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu
index 87aa784b7d5d3..ae4d4a01aa00d 100644
--- a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu
+++ b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu
@@ -122,7 +122,7 @@ __host__ __device__ static inline c10::complex<float> nearbyint_wrapper(c10::com
 }
 
 #pragma push
-#pragma diag_suppress 177   // Function was declared but never referenced
+#pragma nv_diag_suppress 177   // Function was declared but never referenced
 __host__ __device__ static inline c10::complex<double> nearbyint_wrapper(c10::complex<double> a) {
   return c10::complex<double>(::nearbyint(static_cast<double>(a.real())), ::nearbyint(static_cast<double>(a.imag())));
 }
diff --git a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
index 8cc5fc3157c38..33123abccbe93 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseMatMul.cu
@@ -70,7 +70,7 @@ Tensor _to_csr_int(const Tensor& rowIndices, int64_t dim, int64_t nnz) {
 #pragma push
 // NVCC complains that confirm_mult_size is not used,
 // but it is used in specializations of CusparseMatrixMultiplyOp below
-#pragma diag_suppress 177   // Function was declared but never referenced
+#pragma nv_diag_suppress 177   // Function was declared but never referenced
 int confirm_mult_size(const std::vector<int>& mat1_size, const std::vector<int>& mat2_size) {
   TORCH_CHECK(
       mat1_size[1] == mat2_size[0],

From 85816bbdd570015ec93613f7149607c0e2ec9ff0 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Thu, 27 Oct 2022 06:04:22 +0000
Subject: [PATCH 0224/1922] [torch] Add torch cpp cpu target for
 torch/csrc/api/src files (#87327)

Summary: Duplicating fbcode target `fbcode//caffe2:torch-cpp-cpu` target in xplat. In D40460749 our user wants to use `torch::kNearest` enum which is defined in `torch/csrc/api/src/enum.cpp`. Adding this target to support it.

Test Plan: Rely on CI

Differential Revision: D40532087

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87327
Approved by: https://github.com/ezyang
---
 buckbuild.bzl | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/buckbuild.bzl b/buckbuild.bzl
index 24302e64c92f1..0003353f1040f 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -22,6 +22,7 @@ load(
     "jit_core_headers",
     "jit_core_sources",
     "libtorch_profiler_sources",
+    "torch_cpp_srcs",
     "torch_mobile_tracer_sources",
 )
 load(
@@ -1368,6 +1369,19 @@ def define_buck_targets(
         ],
     )
 
+    pt_xplat_cxx_library(
+        name = "torch_cpp_cpu",
+        srcs = torch_cpp_srcs,
+        headers = native.glob(["torch/csrc/api/include/**/*.h"]) + ["torch/script.h"],
+        compiler_flags = get_pt_compiler_flags(),
+        exported_preprocessor_flags = get_pt_preprocessor_flags(),
+        visibility = ["PUBLIC"],
+        exported_deps = [
+            ":torch",
+            ":torch_mobile_deserialize_common",  # for torch/csrc/api/src/serialize/input-archive.cpp
+        ],
+    )
+
     pt_xplat_cxx_library(
         name = "torch_core",
         srcs = core_sources_full_mobile_no_backend_interface + [

From 1ff7e3125428e9e56c80b2127e7f80132d1bf247 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Wed, 26 Oct 2022 16:56:47 -0700
Subject: [PATCH 0225/1922] [Profiler] Hold weak reference to prevent
 TensorImpl address reuse during profiling. (#87244)

A recurring problem with assigning Tensor IDs is that we want to preserve identity when storage changes but we don't observe TensorImpl destruction so identity assignment is not robust to the ABA problem with respect to TensorImpl*. ~TensorImpl is far too hot to instrument; even adding a call to a no-op function in a different compilation unit increases overhead by tens of percent. (OSS builds do not have any sort of LTO.)

Fortunately there is a solution. A PyTorch Tensor is a `c10::intrusive_ptr<c10::TensorImpl>`, which in turn holds a storage. (Which is a `c10::intrusive_ptr<c10::StorageImpl>`) `c10::intrusive_ptr` has a `c10::weak_intrusive_ptr` class for taking non-owning references to the underlying object. The implementation involves both a strong refcount and weak refcount in `c10::intrusive_ptr`. If the strong refcount of an intrusive_ptr goes to zero and there are no weak references then everything is deleted. However if there is a weak reference then the intrusive_ptr calls `release_resources()` but not delete.

This has the effect of freeing the underlying resources (ensuring that program semantics are unchanged) but leaves behind an empty shell of an `intrusive_ptr` that the `weak_intrusive_ptr`s use to check status. And herein lies the solution: as long as we hold a weak reference to a TensorImpl we will block deletion and prevent the `TensorImpl*` from being reused.

This PR uses a `c10::weak_intrusive_ptr<c10::TensorImpl>` to store the address of profiled TensorImpls and then converts it to a raw pointer (or rather, a `TensorImplAddress`) during post processing when we no longer care about blocking address reuse.

Differential Revision: [D40492848](https://our.internmc.facebook.com/intern/diff/D40492848/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87244
Approved by: https://github.com/slgong-fb, https://github.com/albanD
---
 test/profiler/test_profiler.py          | 17 ++++++
 torch/csrc/autograd/profiler_python.cpp | 21 +++++--
 torch/csrc/profiler/collection.cpp      |  9 ++-
 torch/csrc/profiler/collection.h        | 72 +++++++++-------------
 torch/csrc/profiler/data_flow.h         | 79 +++++++++++++++++++++++++
 torch/csrc/profiler/python/init.cpp     |  2 +-
 6 files changed, 144 insertions(+), 56 deletions(-)
 create mode 100644 torch/csrc/profiler/data_flow.h

diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py
index 3831b6bd1247d..22db16d1943af 100644
--- a/test/profiler/test_profiler.py
+++ b/test/profiler/test_profiler.py
@@ -1505,6 +1505,23 @@ def test_allocation_ids_with_other_ops(self) -> None:
             lambda: torch.zeros((1,)).cos()
         )
 
+    def test_impl_reuse(self) -> None:
+        repeats = 1_000
+        with profile(profile_memory=True, record_shapes=True) as p:
+            for _ in range(repeats):
+                torch.ones((1,))
+            gc.collect()
+
+        roots = p.profiler.kineto_results.experimental_event_tree()
+        tensor_impls = tuple(
+            e.extra_fields.inputs.tensor_metadata[0].impl_ptr
+            for e in _utils.traverse_dfs(roots)
+            if e.name == "aten::fill_"
+        )
+
+        self.assertEqual(len(tensor_impls), repeats)
+        self.assertEqual(len(set(tensor_impls)), repeats)
+
     def test_extra_fields(self):
         with profile(with_stack=True, profile_memory=True) as p:
             _ = torch.ones((1,))
diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp
index 308dcdcde49c8..43479c3f15668 100644
--- a/torch/csrc/autograd/profiler_python.cpp
+++ b/torch/csrc/autograd/profiler_python.cpp
@@ -129,7 +129,7 @@ class CallTypeHelper final {
       std::index_sequence<I...>);
 
   template <size_t C, typename T, typename FunctorT, typename... Args>
-  static void map(T& t, FunctorT& f, Args... args) {
+  static void map(T& t, FunctorT& f, Args&&... args) {
     f(std::get<C>(t), args...);
     c10::guts::if_constexpr<C + 1 < End>(
         [&](auto _) { map<C + 1>(_(t), f, std::forward<Args>(args)...); });
@@ -139,7 +139,7 @@ class CallTypeHelper final {
   using tuple_type = decltype(make_tuple_impl(std::make_index_sequence<End>{}));
 
   template <typename FunctorT, typename... Args>
-  static void map(tuple_type& t, FunctorT& f, Args... args) {
+  static void map(tuple_type& t, FunctorT& f, Args&&... args) {
     map<0>(t, f, std::forward<Args>(args)...);
   }
 };
@@ -281,6 +281,9 @@ using PyOptimizerCallKey = Config<CallType::PyOptimizerCall>::key_t;
 
 class ValueCache {
  public:
+  ValueCache() = default;
+  ValueCache(const ValueCache&) = delete;
+
   template <CallType C>
   void store(const typename Config<C>::key_t&, typename Config<C>::ephemeral_t);
 
@@ -295,6 +298,9 @@ class ValueCache {
         load<C>(callsite.value_)};
   }
 
+  c10::optional<TensorMetadata> recordIfTensor(py::handle p);
+  std::vector<std::pair<std::string, TensorMetadata>> unpackTensorMap(
+      py::dict tensor_map);
   void trimPrefixes();
 
  private:
@@ -330,18 +336,21 @@ typename Config<C>::cls_t set_class(
   return cls;
 }
 
-auto toTensorMetadata(PyObject* self) {
+TensorMetadata toTensorMetadata(PyObject* self) {
   TORCH_INTERNAL_ASSERT(THPVariable_CheckExact(self));
-  return TensorMetadata{THPVariable_Unpack(self)};
+  const auto& t = THPVariable_Unpack(self);
+  RawTensorMetadata m{t};
+  return TensorMetadata{m};
 }
 
-auto recordIfTensor(py::handle p) {
+c10::optional<TensorMetadata> ValueCache::recordIfTensor(py::handle p) {
   return THPVariable_CheckExact(p.ptr())
       ? c10::optional<TensorMetadata>{toTensorMetadata(p.ptr())}
       : c10::nullopt;
 }
 
-auto unpackTensorMap(py::dict tensor_map) {
+std::vector<std::pair<std::string, TensorMetadata>> ValueCache::unpackTensorMap(
+    py::dict tensor_map) {
   std::vector<std::pair<std::string, TensorMetadata>> out;
   for (auto& it : tensor_map) {
     auto* value = it.second.ptr();
diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index 01b7c4024f269..8bb57fda9cf48 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -30,9 +30,8 @@ using result_ptr_t = std::shared_ptr<Result>;
 using trace_ptr_t =
     std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>;
 
-RawTensorMetadata::RawTensorMetadata(const at::Tensor& t)
-    : impl_{t.unsafeGetTensorImpl()},
-      data_{t.has_storage() ? t.storage().data() : nullptr},
+RawTensorMetadataBase::RawTensorMetadataBase(const at::Tensor& t)
+    : data_{t.has_storage() ? t.storage().data() : nullptr},
       device_type_{t.device().type()},
       device_index_{t.device().index()},
       dtype_{t.scalar_type()},
@@ -864,10 +863,10 @@ void calculate_unique_tensor_ids(std::vector<result_ptr_t>& sorted_results) {
 
     ska::flat_hash_set<storage_id_t> tensor_set;
     auto insert_tensor = [&lookup, &tensors, &tensor_set](TensorMetadata& m) {
-      if (m.impl_ && m.data_) {
+      if (m.impl() && m.data_) {
         const auto id = lookup(m.data_);
         tensor_set.insert(id);
-        tensors.emplace_back(TensorStoragePair{m.impl_, id, m.id_});
+        tensors.emplace_back(TensorStoragePair{m.impl(), id, m.id_});
       }
     };
 
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index 5402e613eb858..096568285a713 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -14,6 +14,7 @@
 #include <c10/util/strong_type.h>
 #include <c10/util/variant.h>
 #include <torch/csrc/profiler/containers.h>
+#include <torch/csrc/profiler/data_flow.h>
 #include <torch/csrc/profiler/kineto_shim.h>
 #include <torch/csrc/profiler/orchestration/python_tracer.h>
 #include <torch/csrc/profiler/stubs/base.h>
@@ -37,48 +38,10 @@ enum class EventType : uint8_t {
 // ============================================================================
 // == Value (Tensor, Scalar) summary ==========================================
 // ============================================================================
+struct TORCH_API RawTensorMetadataBase {
+  RawTensorMetadataBase() = default;
+  explicit RawTensorMetadataBase(const at::Tensor& t);
 
-// We use a Tensor's TensorImpl adress and StorageImpl data start to build the
-// data flow graph. We do not hold a reference so we wrap them in strong types
-// to prevent direct access.
-using TensorImplAddress = strong::type<
-    const c10::TensorImpl*,
-    struct TensorImplAddress_,
-    strong::regular,
-    strong::hashable,
-    strong::boolean>;
-
-using StorageImplData = strong::type<
-    void*,
-    struct StorageImplData_,
-    strong::regular,
-    strong::hashable,
-    strong::boolean>;
-
-// Identity is a complex concept in PyTorch. A Tensor might not have a
-// an associated storage, multiple Tensors might share the same underlying
-// storage, the storage of a Tensor might change over time, etc.
-//
-// For the purpose of profiling we're mostly interested in data flow
-// analysis. As a result, we can take an expansive view of identity:
-// Tensors share an ID if they share a TensorImpl or storage data.
-//
-// This identity equality is transitive; If Tensors T0 and T1 share a storage
-// S0 and T1 later points to a different storage S1 then all Tensors which
-// point to either S0 or S1 are considered to have the same identity. (Since
-// profiler cannot reason beyond that.)
-//
-// The profiler will handle lifetime analysis to ensure that identities do
-// not run afoul of the ABA problem. This does, however, mean that identities
-// can only be assigned when memory profiling is enabled. (And we cannot
-// handle ABA for TensorImpl as those allocations are not instrumented.)
-using TensorID = strong::type<size_t, struct TensorID_, strong::regular>;
-
-struct TORCH_API RawTensorMetadata {
-  RawTensorMetadata() = default;
-  RawTensorMetadata(const RawTensorMetadata&) = default;
-  explicit RawTensorMetadata(const at::Tensor& t);
-  TensorImplAddress impl_;
   StorageImplData data_;
 
   // Device is separated into DeviceType and DeviceIndex as Device
@@ -91,13 +54,34 @@ struct TORCH_API RawTensorMetadata {
   uint32_t dim_;
 };
 
-struct TensorMetadata : public RawTensorMetadata {
-  explicit TensorMetadata(const RawTensorMetadata& r) : RawTensorMetadata(r) {}
-  explicit TensorMetadata(const at::Tensor& t) : RawTensorMetadata(t) {}
+// Collected during profiling.
+struct TORCH_API RawTensorMetadata : RawTensorMetadataBase {
+  RawTensorMetadata() = default;
+  RawTensorMetadata(const RawTensorMetadata&) = default;
+  explicit RawTensorMetadata(const at::Tensor& t)
+      : RawTensorMetadataBase(t), weak_self_{WeakTensor(t)} {};
+
+  // Wrap in `c10::optional` to make `weak_self_` default constructable.
+  c10::optional<WeakTensor> weak_self_;
+};
+
+// Used during post processing.
+struct TensorMetadata : public RawTensorMetadataBase {
+  explicit TensorMetadata(const RawTensorMetadata& r)
+      : RawTensorMetadataBase(r),
+        weak_self_{r.weak_self_.value_or(WeakTensor(at::Tensor()))} {
+    SOFT_ASSERT(r.weak_self_.has_value());
+  }
+
   c10::Device device() const {
     return {device_type_, device_index_};
   }
 
+  TensorImplAddress impl() {
+    return weak_self_.get();
+  }
+
+  WeakTensor weak_self_;
   c10::optional<TensorID> id_;
 };
 
diff --git a/torch/csrc/profiler/data_flow.h b/torch/csrc/profiler/data_flow.h
new file mode 100644
index 0000000000000..7afd0204d41db
--- /dev/null
+++ b/torch/csrc/profiler/data_flow.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <ATen/core/TensorBody.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/util/strong_type.h>
+#include <c10/util/variant.h>
+
+namespace torch {
+namespace profiler {
+namespace impl {
+
+// Identity is a complex concept in PyTorch. A Tensor might not have a
+// an associated storage, multiple Tensors might share the same underlying
+// storage, the storage of a Tensor might change over time, etc.
+//
+// For the purpose of profiling we're mostly interested in data flow
+// analysis. As a result, we can take an expansive view of identity:
+// Tensors share an ID if they share a TensorImpl or storage data.
+//
+// This identity equality is transitive; If Tensors T0 and T1 share a storage
+// S0 and T1 later points to a different storage S1 then all Tensors which
+// point to either S0 or S1 are considered to have the same identity. (Since
+// profiler cannot reason beyond that.)
+//
+// The profiler will handle lifetime analysis to ensure that identities do
+// not run afoul of the ABA problem. This does, however, mean that identities
+// can only be assigned when memory profiling is enabled.
+using TensorID = strong::type<size_t, struct TensorID_, strong::regular>;
+
+// We use a Tensor's TensorImpl adress and StorageImpl data start to build the
+// data flow graph. We do not hold an owning reference so we wrap them in strong
+// types to prevent direct access.
+using TensorImplAddress = strong::type<
+    const c10::TensorImpl*,
+    struct TensorImplAddress_,
+    strong::regular,
+    strong::hashable,
+    strong::boolean>;
+
+using StorageImplData = strong::type<
+    void*,
+    struct StorageImplData_,
+    strong::regular,
+    strong::hashable,
+    strong::boolean>;
+
+// ============================================================================
+// == weak_intrusive_ptr and the ABA problem for TensorImpl* ==================
+// ============================================================================
+// Tracking `TensorImpl`s is an important part of identity tracking, because
+// a Tensor might change storage; however when it does we want to retain the
+// fact that the old and new storage belong to the same logical Tensor. We
+// cannot take an owning reference to the Tensor because that would change
+// program semantics by extending the lifetime of the Tensor. However if we
+// store a raw TensorImpl* pointer the TensorImpl might be deleted and a new
+// TensorImpl might be created that reuses the address. (ABA problem)
+//
+// Fortunately, there is a feature of `c10::intrusive_ptr` that we can use to
+// prevent address reuse for the duration of profiling: the weak intrusive ptr.
+// When a Tensor's refcount reaches zero but there are outstanding weak
+// references (`weakcount_ > 0`) it will free the underlying managed resources
+// by calling `target_->release_resources()`, but it will not call `delete`.
+// (Instead, `delete` is called when the last weak reference is destroyed.)
+// This means that we can safely use address identity to track `TensorImpls`.
+class WeakTensor {
+ public:
+  explicit WeakTensor(const at::Tensor& t) : weak_self_(t.getIntrusivePtr()) {}
+
+  auto get() const {
+    return TensorImplAddress{weak_self_._unsafe_get_target()};
+  }
+
+ private:
+  c10::weak_intrusive_ptr<c10::TensorImpl> weak_self_;
+};
+
+} // namespace impl
+} // namespace profiler
+} // namespace torch
diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index 8a800a3d5f82b..8c3d10af0bd0c 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -129,7 +129,7 @@ void initPythonBindings(PyObject* module) {
       .def_readonly("tensor_metadata", &Inputs::tensor_metadata_);
 
   py::class_<TensorMetadata>(m, "_TensorMetadata")
-      .def_readonly("impl_ptr", &TensorMetadata::impl_)
+      .def_property_readonly("impl_ptr", &TensorMetadata::impl)
       .def_readonly("storage_data_ptr", &TensorMetadata::data_)
       .def_readonly("id", &TensorMetadata::id_)
       .def_property_readonly(

From d3c3665f8d7172ea6632ebc0469c8e5376aa7a37 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Wed, 26 Oct 2022 01:46:46 -0400
Subject: [PATCH 0226/1922] TorchDynamo: Add convolution unary fusion for cpu
 in inference mode (#87063)

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87063
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py |  60 ++++++++++++
 torch/_inductor/compile_fx.py       |   1 +
 torch/_inductor/ir.py               | 146 ++++++++++++++++++++++++++++
 torch/_inductor/lowering.py         |  38 ++++++++
 torch/_inductor/overrides.py        | 144 +++++++++++++++++++++++++++
 5 files changed, 389 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 8e8b371c2780e..dd846e5f405a4 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -3,6 +3,7 @@
 import dataclasses
 import functools
 import importlib
+import itertools
 import os
 import random
 import sys
@@ -1292,6 +1293,65 @@ def fn(a, b):
             check_lowp=False,
         )
 
+    # For gpu path, there has a accurcy issue,
+    # see https://github.com/pytorch/pytorch/issues/87745.
+    @unittest.skipIf(HAS_CUDA, "only support cpu conv2d unary test")
+    def test_conv2d_unary(self):
+        def _unary_list():
+            unary_list = [
+                torch.nn.ReLU(),
+                torch.nn.Sigmoid(),
+                torch.nn.Tanh(),
+                torch.nn.Hardswish(),
+                torch.nn.LeakyReLU(0.1, inplace=False),
+                torch.nn.Hardtanh(min_val=-0.5, max_val=4, inplace=False),
+                torch.nn.GELU(approximate="none"),
+                torch.nn.GELU(approximate="tanh"),
+            ]
+            return unary_list
+
+        test_memory_format = [torch.contiguous_format, torch.channels_last]
+        options = itertools.product(
+            _unary_list(),
+            [True, False],
+            [1, 3],
+            [1, 2],
+            [1, 4],
+            test_memory_format,
+        )
+
+        for (
+            unary_fn,
+            bias,
+            kernel_size,
+            dilation,
+            groups,
+            memory_format,
+        ) in options:
+            oC = 32 * groups
+            iC = 3 * groups
+            x_shape = (1, iC, 112, 112)
+            mod = torch.nn.Sequential(
+                torch.nn.Conv2d(
+                    iC,
+                    oC,
+                    kernel_size=kernel_size,
+                    dilation=dilation,
+                    groups=groups,
+                    bias=bias,
+                ),
+                unary_fn,
+            ).eval()
+
+            # TODO: add bf16 test for cpu path?
+            v = torch.randn(x_shape, dtype=torch.float32).to(
+                memory_format=memory_format
+            )
+            self.common(
+                mod,
+                (v,),
+            )
+
     def test_gather1(self):
         def fn(a, b):
             return (
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index e6b27420a941a..26770b0671838 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -340,6 +340,7 @@ def compile_fx(model_: torch.fx.GraphModule, example_inputs_: List[torch.Tensor]
     with overrides.patch_functions():
         model_ = normalize_ir(model_, example_inputs_)
         model_ = overrides.replace_fx(model_)
+        model_ = overrides.fuse_fx(model_, example_inputs_)
     num_example_inputs = len(example_inputs_)
     cudagraphs = BoxedBool(config.triton.cudagraphs and not config.dynamic_shapes)
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 7554dc905e23f..156eeb11bdc7b 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3295,6 +3295,152 @@ def get_template_tiling(self):
         )
 
 
+def _prepare_convolution_fusion_create(
+    cls,
+    x: "TensorBox",
+    weight: "TensorBox",
+    bias: "TensorBox",
+    padding_: List[int],
+    stride_: List[int],
+    dilation_: List[int],
+    groups: int,
+):
+    """
+    This function is a helper function to prepare inputs, layout and constant args
+    for convolution post-op fusion's create function, including deciding the output
+    layout (channels first or channels last), realizing inputs and make them etc. The
+    function only supports the CPU device since conv post-op fusion kernel is only
+    supported on CPU right now.
+    """
+
+    x = cls.require_stride1(cls.realize_input(x))
+    weight = cls.require_stride1(cls.realize_input(weight))
+    assert x.get_device().type == "cpu" and weight.get_device().type == "cpu"
+    inputs = [x, weight]
+    stride = tuple(stride_)
+    padding = tuple(padding_)
+    dilation = tuple(dilation_)
+    assert isinstance(groups, int)
+
+    weight_shape = [
+        sympy.Integer(V.graph.sizevars.guard_static_shape(s)) for s in weight.get_size()
+    ]
+
+    out_channels, in_channels1, *kernel_size = weight_shape
+    in_channels1 = in_channels1 * groups
+    assert len(x.get_size()) == 2 + len(kernel_size)
+    batch, in_channels2, *input_size = x.get_size()
+    output_size = [batch]
+    V.graph.sizevars.guard_equals(in_channels1, in_channels2)
+
+    output_size.append(out_channels)
+    assert (
+        len(stride)
+        == len(padding)
+        == len(dilation)
+        == len(kernel_size)
+        == len(input_size)
+    )
+    for i in range(len(stride)):
+        output_size.append(
+            IndexingDiv(
+                input_size[i]
+                + 2 * padding[i]
+                - dilation[i] * (kernel_size[i] - 1)
+                - 1
+                + stride[i],
+                stride[i],
+            )
+        )
+        output_size[-1] = sympy.Integer(
+            V.graph.sizevars.guard_static_shape(output_size[-1])
+        )
+
+    output_layout_str = "torch.contiguous_format"
+    # If x or weight have one channels_last(2d or 3d) format, it will call channels_last path,
+    # which align with aten.convolutuion path(cpu only support 2d case now).
+    # TODO: after cpu 3d convolution support channels_last path, the size check can be removed.
+    if len(x.get_size()) == 4 and (
+        x.get_layout().is_channels_last_stride_ordered()
+        or weight.get_layout().is_channels_last_stride_ordered()
+    ):
+        output_layout_str = "torch.channels_last"
+
+    if output_layout_str == "torch.channels_last":
+        stride_order = [0] + list(reversed(range(1, len(kernel_size) + 1)))
+        if len(stride_order) < len(output_size):
+            # add batch dim if it exists
+            stride_order = [len(stride_order)] + stride_order
+    else:
+        stride_order = list(reversed(range(len(output_size))))
+
+    kernel_layout = FlexibleLayout(
+        device=inputs[0].get_device(),
+        dtype=inputs[0].get_dtype(),
+        size=output_size,
+        stride_order=stride_order,
+    )
+    constant_args = [padding, stride, dilation, groups]
+
+    if bias is not None:
+        inputs.append(bias)
+    else:
+        constant_args.insert(0, bias)
+    return inputs, constant_args, kernel_layout
+
+
+class ConvolutionUnary(ExternKernelAlloc):
+    kernel = "torch.ops.mkldnn._convolution_pointwise"
+
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        kernel="torch.ops.mkldnn._convolution_pointwise",
+    ):
+        super().__init__(layout, inputs, constant_args)
+        self.kernel = kernel
+
+    def codegen(self, wrapper):
+        wrapper.writeline(
+            f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})"
+        )
+
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        weight: "TensorBox",
+        bias: "TensorBox",
+        padding_: List[int],
+        stride_: List[int],
+        dilation_: List[int],
+        groups: int,
+        attr,
+        scalars,
+        algorithm,
+    ):
+        kernel = "torch.ops.mkldnn._convolution_pointwise"
+        (inputs, constant_args, kernel_layout,) = _prepare_convolution_fusion_create(
+            cls, x, weight, bias, padding_, stride_, dilation_, groups
+        )
+        constant_args = constant_args + [attr, scalars, algorithm]
+        return ConvolutionUnary(
+            layout=kernel_layout,
+            inputs=inputs,
+            constant_args=constant_args,
+            kernel=kernel,
+        )
+
+    def apply_constraint(self):
+        x = self.inputs[0]
+        # FixedLayout of input
+        x = self.require_stride_order(x, self.layout.preferred_stride_order)
+        self.inputs[0] = x
+        self.freeze_layout_with_stride_order(self.layout.preferred_stride_order)
+
+
 @dataclasses.dataclass
 class MutableBox(IRNode):
     """
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index fd94aa9bc5d5a..6b047e843c301 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -886,6 +886,44 @@ def bmm(a: TensorBox, b: TensorBox):
     return TensorBox.create(ir.BatchMatrixMultiply.create(a, b))
 
 
+def register_onednn_fusion_ops():
+    if torch._C.has_mkldnn:
+
+        @register_lowering(torch.ops.mkldnn._convolution_pointwise)
+        def convolution_unary(
+            x: TensorBox,
+            weight: TensorBox,
+            bias: TensorBox,
+            padding,
+            stride,
+            dilation,
+            groups,
+            attr,
+            scalars,
+            algorithm,
+        ):
+            return TensorBox.create(
+                ir.ConvolutionUnary.create(
+                    x,
+                    weight,
+                    bias,
+                    padding,
+                    stride,
+                    dilation,
+                    groups,
+                    attr,
+                    scalars,
+                    algorithm,
+                )
+            )
+
+    else:
+        pass
+
+
+register_onednn_fusion_ops()
+
+
 def fallback_handler(kernel):
     fallbacks.add(kernel)
 
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 85a0e0c1c2459..4078d442e8704 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -1,10 +1,19 @@
+import copy
+import itertools
 import logging
 import random
 import weakref
 
 import torch
+import torch.nn as nn
 from torch import _prims
+from torch.fx.experimental.optimization import (
+    matches_module_pattern,
+    replace_node_module,
+)
 from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode
+from torch.nn import functional as F
+from torch.nn.modules.utils import _pair
 from torch.overrides import TorchFunctionMode
 
 log = logging.getLogger(__name__)
@@ -37,6 +46,127 @@ def replace_fx(gm: torch.fx.GraphModule):
     return gm
 
 
+class UnaryAttr(object):
+    def __init__(self, op_name: str, scalars_attr=None, algorithm_attr=None):
+        self.op_name = op_name
+        self.scalars_attr = scalars_attr if scalars_attr else []
+        self.algorithm_attr = algorithm_attr if algorithm_attr else ""
+        super(UnaryAttr, self).__init__()
+
+    def __call__(self, unary_module: nn.Module):
+        assert all(hasattr(unary_module, item) for item in self.scalars_attr)
+        scalars = [getattr(unary_module, item) for item in self.scalars_attr]
+
+        algorithm = ""
+        if self.algorithm_attr:
+            assert hasattr(unary_module, self.algorithm_attr)
+            algorithm = getattr(unary_module, self.algorithm_attr)
+
+        return self.op_name, scalars, algorithm
+
+
+class ConvUnary2d(nn.Conv2d):
+    def __init__(
+        self,
+        conv: nn.Module,
+        unary: nn.Module,
+    ):
+        super(ConvUnary2d, self).__init__(
+            conv.in_channels,
+            conv.out_channels,
+            conv.kernel_size,
+            conv.stride,
+            conv.padding,
+            conv.dilation,
+            conv.groups,
+            conv.bias is not None,
+            conv.padding_mode,
+            conv.weight.device,
+            conv.weight.dtype,
+        )
+        self._update_module_params(conv, unary)
+
+    def _update_module_params(self, conv, unary):
+        self.__dict__ = copy.deepcopy(conv.__dict__)
+        self.attr, self.scalars, self.algorithm = unary_modules_map[unary.__class__](
+            unary
+        )
+
+    def _conv_forward(self, input, weight, bias):
+        if self.padding_mode != "zeros":
+            return torch.ops.mkldnn._convolution_pointwise(
+                F.pad(
+                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
+                ),
+                weight,
+                bias,
+                _pair(0),
+                self.stride,
+                self.dilation,
+                self.groups,
+                self.attr,
+                self.scalars,
+                self.algorithm,
+            )
+        return torch.ops.mkldnn._convolution_pointwise(
+            input,
+            weight,
+            bias,
+            self.padding,
+            self.stride,
+            self.dilation,
+            self.groups,
+            self.attr,
+            self.scalars,
+            self.algorithm,
+        )
+
+    def forward(self, input):
+        return self._conv_forward(input, self.weight, self.bias)
+
+
+def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module):
+    assert not (conv.training), "Fusion only for eval!"
+    return ConvUnary2d(
+        conv,
+        unary,
+    )
+
+
+def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
+    if not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()):
+        return gm
+    is_cpu = all(
+        example_input.device == torch.device("cpu") for example_input in example_inputs
+    )
+    if not is_cpu:
+        return gm
+    modules = dict(gm.named_modules())
+
+    for (unary_module, _), (computation_module, fuse_func,) in itertools.product(
+        unary_modules_map.items(), computation_op_unary_op_fusion_map.items()
+    ):
+        pattern = (computation_module, unary_module)
+        for node in gm.graph.nodes:
+            if matches_module_pattern(pattern, node, modules):
+                if (
+                    len(node.args[0].users) > 1
+                ):  # Output of computation_node is used by other nodes
+                    continue
+                conv = modules[node.args[0].target]
+                unary_node = modules[node.target]
+                eval_mode = all(not n.training for n in [conv, unary_node])
+                if not eval_mode:
+                    continue
+                fused_conv = fuse_func(conv, unary_node)
+                replace_node_module(node.args[0], modules, fused_conv)
+                node.replace_all_uses_with(node.args[0])
+                gm.graph.erase_node(node)
+                gm.graph.lint()
+    gm.recompile()
+    return gm
+
+
 def _philox_rand_like_meta(input, seed, offset):
     return _prims.TensorMeta(input)
 
@@ -163,3 +293,17 @@ def rand_like(x, **kwargs):
 
 
 replacements = {torch.nn.functional.dropout: lowmem_dropout, torch.rand_like: rand_like}
+
+
+computation_op_unary_op_fusion_map = {nn.Conv2d: fused_conv_unary_eval}
+
+
+unary_modules_map = {
+    nn.ReLU: UnaryAttr("relu"),
+    nn.Sigmoid: UnaryAttr("sigmoid"),
+    nn.Tanh: UnaryAttr("tanh"),
+    nn.Hardswish: UnaryAttr("hardswish"),
+    nn.LeakyReLU: UnaryAttr("leaky_relu", scalars_attr=["negative_slope"]),
+    nn.Hardtanh: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]),
+    nn.GELU: UnaryAttr("gelu", algorithm_attr="approximate"),
+}

From c7c275eb19b3403c168045d37a4fba0a248a86c9 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Thu, 27 Oct 2022 10:46:53 +0000
Subject: [PATCH 0227/1922] [complex] conv_transpose2d (#81805)

Reference: https://github.com/pytorch/pytorch/issues/71108

Fixes : #86414
Pull Request resolved: https://github.com/pytorch/pytorch/pull/81805
Approved by: https://github.com/anjali411
---
 aten/src/ATen/native/Convolution.cpp          |  8 +++-
 .../_internal/common_methods_invocations.py   | 40 ++++++++++++++-----
 torch/testing/_internal/common_modules.py     | 21 +++++++++-
 3 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 64f6d141b9299..2dd7d515c14f9 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -1066,8 +1066,14 @@ at::Tensor conv_transpose2d(
   Tensor input;
   bool is_batched;
   std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv_transpose2d");
-  auto output = at::convolution(
+  Tensor output;
+  if (at::isComplexType(input_.scalar_type())) {
+    output = complex_convolution(
       input, weight, bias, stride, padding, dilation, true, output_padding, groups);
+  } else {
+    output = at::convolution(
+      input, weight, bias, stride, padding, dilation, true, output_padding, groups);
+  }
   return is_batched ? output : output.squeeze(0);
 }
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 900c0987d2f2c..f04a2cc5465bb 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -3079,8 +3079,12 @@ def conv_transpose_ref(input, weight, bias, stride=1, padding=0,
 
     assert fn is not None
 
-    grad_fn_map = {torch.nn.functional.conv_transpose1d: torch.nn.grad.conv1d_input}
-    batched_dim_map = {torch.nn.functional.conv_transpose1d: 3}
+    grad_fn_map = {torch.nn.functional.conv_transpose1d: torch.nn.grad.conv1d_input,
+                   torch.nn.functional.conv_transpose2d: torch.nn.grad.conv2d_input,
+                   torch.nn.functional.conv_transpose3d: torch.nn.grad.conv3d_input}
+    batched_dim_map = {torch.nn.functional.conv_transpose1d: 3,
+                       torch.nn.functional.conv_transpose2d: 4,
+                       torch.nn.functional.conv_transpose3d: 5}
 
     # Input for `ref` is ndarray.
     input, weight = torch.from_numpy(input), torch.from_numpy(weight)
@@ -3090,7 +3094,10 @@ def conv_transpose_ref(input, weight, bias, stride=1, padding=0,
         input = input.unsqueeze(0)
 
     if bias is not None:
-        bias = torch.from_numpy(bias).unsqueeze(1)
+        bias = torch.from_numpy(bias)
+        unsqueeze_dims = input.ndim - 2
+        for _ in range(unsqueeze_dims):
+            bias = bias.unsqueeze(1)
 
     grad_output = input
     # Get the input shape for grad_fn.
@@ -3156,9 +3163,8 @@ def sample_inputs_conv_transpose2d(op_info, device, dtype, requires_grad, **kwar
          {'stride': 2, 'padding': 1, 'output_padding': 1, 'groups': 1, 'dilation': (2, 3)}),
         ((1, 1, 4, 3), (1, 2, 3, 4), None,
          {'stride': 2, 'padding': 1, 'output_padding': 1, 'groups': 1}),
-        ((2, 8, 4, 4), (8, 1, 3, 3), None, {'groups': 4}),
-        ((1, 4, 5, 5), (4, 8, 3, 3), None,
-         {})
+        ((2, 4, 4, 4), (4, 1, 3, 3), None, {'groups': 4}),
+        ((1, 2, 5, 5), (2, 4, 3, 3), None, {})
     )
 
     for input_shape, weight, bias, kwargs in cases:
@@ -10668,10 +10674,15 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     OpInfo('nn.functional.conv_transpose2d',
            aten_name='conv_transpose2d',
            aliases=('conv_transpose2d',),
-           dtypes=floating_types_and(torch.int64),
-           dtypesIfCUDA=floating_types_and(torch.float16,
-                                           *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           # `ref` for this function is backward of
+           # corresponding `conv*d`
+           ref=partial(conv_transpose_ref, fn=torch.nn.functional.conv_transpose2d),
+           dtypes=floating_and_complex_types_and(torch.int64),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf,
+                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
            sample_inputs_func=sample_inputs_conv_transpose2d,
+           # Runs very slowly on slow-gradcheck for complex.
+           gradcheck_fast_mode=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            assert_jit_shape_analysis=True,
@@ -10679,7 +10690,16 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            decorators=[
                DecorateInfo(
                    toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1.3e-06), }),
-                   'TestCommon', 'test_variant_consistency_eager', device_type='cuda')],
+                   'TestCommon', 'test_variant_consistency_eager', device_type='cuda'),
+               DecorateInfo(
+                   toleranceOverride({torch.float32: tol(atol=2e-05, rtol=5e-05), }),
+                   'TestCommon', 'test_noncontiguous_samples', device_type='cuda'),
+               DecorateInfo(
+                   toleranceOverride({torch.complex32: tol(atol=5e-2, rtol=5e-2)}),
+                   "TestCudaFuserOpInfo", "test_nvfuser_correctness"),
+               DecorateInfo(
+                   toleranceOverride({torch.chalf: tol(atol=5e-2, rtol=5e-2), }),
+                   'TestCommon', 'test_complex_half_reference_testing')],
            skips=(
                # RuntimeError: !lhs.isAliasOf(rhs)INTERNAL ASSERT FAILED at
                # "../torch/csrc/jit/passes/utils/check_alias_annotation.cpp":104, please report a bug to PyTorch.
diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py
index f214ffbb8b3d2..fed908e14dd03 100644
--- a/torch/testing/_internal/common_modules.py
+++ b/torch/testing/_internal/common_modules.py
@@ -1143,6 +1143,7 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train
                module_inputs_func=partial(module_inputs_torch_nn_ConvNd, N=2, lazy=False, transposed=True),
                gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
                module_memformat_affects_out=True,
+               dtypes=floating_and_complex_types_and(torch.chalf),
                skips=(
                    # channels_last support on cuda requires cudnn >= 7603
                    DecorateInfo(skipCUDAIfCudnnVersionLessThan(version=7603), 'TestModule', 'test_memory_format'),
@@ -1153,7 +1154,25 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train
                    # See https://github.com/pytorch/pytorch/issues/80247
                    DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cpu'),
                    DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cuda',
-                                dtypes=[torch.float64]),
+                                dtypes=[torch.float64, torch.complex128]),
+                   # These fail only on ROCm
+                   DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cuda',
+                                dtypes=[torch.complex32, torch.complex64], active_if=TEST_WITH_ROCM),
+                   # Not implmented for chalf on CPU
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_forward',
+                                dtypes=(torch.chalf,), device_type='cpu'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_memory_format',
+                                dtypes=(torch.chalf,), device_type='cpu'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule',
+                                'test_if_train_and_eval_modes_differ', dtypes=(torch.chalf,), device_type='cpu'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_non_contiguous_tensors',
+                                dtypes=(torch.chalf,), device_type='cpu'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_cpu_gpu_parity',
+                                dtypes=(torch.chalf,), device_type='cuda'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_multiple_device_transfer',
+                                dtypes=(torch.chalf,), device_type='cuda'),
+                   # Ref: https://github.com/pytorch/pytorch/issues/73502
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_pickle', dtypes=(torch.chalf,)),
                ),
                decorators=(
                    DecorateInfo(precisionOverride({torch.float32: 1e-04}), 'TestModule', 'test_memory_format'),

From a0ca695dc4d44bca115efae7322b58a1015d0eb3 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@meta.com>
Date: Thu, 27 Oct 2022 12:29:51 +0000
Subject: [PATCH 0228/1922] [JIT] Fix torch.jit.script for functions with many
 decorators (#87804)

Summary:
Python's function parsing from the `ast` module records the line number of the function definition, not the first decorator. So this diff fixes crashes like this:

```
IndexError: vector::_M_range_check: __n (which is 10) >= this->size() (which is 8)
```

Test Plan: New unit test

Differential Revision: D40726352

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87804
Approved by: https://github.com/tugsbayasgalan, https://github.com/davidberard98
---
 test/jit/test_misc.py | 19 +++++++++++++++++++
 torch/jit/frontend.py |  2 +-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/test/jit/test_misc.py b/test/jit/test_misc.py
index db37af81993f3..98ec7831d940e 100644
--- a/test/jit/test_misc.py
+++ b/test/jit/test_misc.py
@@ -361,3 +361,22 @@ def test_parse_ir_single_element_tensor_negative(self):
         ret = func()
         self.assertTrue(ret.numel() == 1)
         self.assertTrue(len(ret.size()) == 1)
+
+
+    def test_script_many_decorators(self):
+        def no_op_decorator(f):
+            return f
+
+        @no_op_decorator
+        @no_op_decorator
+        @no_op_decorator
+        @no_op_decorator
+        @no_op_decorator
+        def foo(x, dim: int):
+            return x.unsqueeze(dim)
+
+        x = torch.randn(1,)
+        expected = foo(x, 0)
+        scripted = torch.jit.script(foo)
+        actual = scripted(x, 0)
+        torch.testing.assert_allclose(expected, actual)
diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py
index 62548ba7e2cd6..4b5e3d68f75cd 100644
--- a/torch/jit/frontend.py
+++ b/torch/jit/frontend.py
@@ -324,7 +324,7 @@ def build_class_def(ctx, py_def, methods, properties, self_name, assigns):
 
 def build_def(ctx, py_def, type_line, def_name, self_name=None, pdt_arg_types=None):
     body = py_def.body
-    r = ctx.make_range(py_def.lineno + len(py_def.decorator_list),
+    r = ctx.make_range(py_def.lineno,
                        py_def.col_offset,
                        py_def.col_offset + len("def"))
 

From 8139cdaf7e4b3ed6cccf5caa41eed16e0a518e61 Mon Sep 17 00:00:00 2001
From: jpvillam <Juan.Villamizar@amd.com>
Date: Thu, 27 Oct 2022 15:11:28 +0000
Subject: [PATCH 0229/1922] [ROCM] Enable Sparse Pickle Test (#82729)

Missed stream context for serialization

### Description
Missing ROCm stream context on memory operations for serialization

### Testing
Ran the sparse pickle test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/82729
Approved by: https://github.com/ngimel
---
 test/test_sparse.py          |  1 -
 torch/csrc/serialization.cpp | 25 +++++++++++++++++++++++--
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/test/test_sparse.py b/test/test_sparse.py
index 8ae982c034ae4..125fb6d83b300 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -3019,7 +3019,6 @@ def test_change_tensor_metadata(self, device, dtype):
         self.assertEqual(list(t.coalesce().indices().size()), [2, 1])
         self.assertEqual(list(t.coalesce().values().size()), [1, 3])
 
-    @skipIfRocm
     @coalescedonoff
     @dtypes(torch.double)
     def test_pickle(self, device, dtype, coalesced):
diff --git a/torch/csrc/serialization.cpp b/torch/csrc/serialization.cpp
index 46f3a04f355b4..385a074b1ccb4 100644
--- a/torch/csrc/serialization.cpp
+++ b/torch/csrc/serialization.cpp
@@ -233,7 +233,18 @@ void THPStorage_writeFileRaw(
   int64_t numel = size_bytes / element_size;
   if (self->device_type() == at::kCPU) {
     data = self->data<uint8_t>();
-#ifdef USE_CUDA
+#if defined(USE_CUDA) && defined(TORCH_HIP_VERSION) && \
+    (TORCH_HIP_VERSION >= 301)
+  } else if (self->device_type() == at::kCUDA) {
+    cpu_data = std::unique_ptr<char[]>(new char[size_bytes]);
+    data = (uint8_t*)cpu_data.get();
+    C10_CUDA_CHECK(hipMemcpyWithStream(
+        data,
+        self->data<uint8_t>(),
+        size_bytes,
+        cudaMemcpyDeviceToHost,
+        c10::hip::getCurrentHIPStreamMasqueradingAsCUDA()));
+#elif defined(USE_CUDA)
   } else if (self->device_type() == at::kCUDA) {
     cpu_data = std::unique_ptr<char[]>(new char[size_bytes]);
     data = (uint8_t*)cpu_data.get();
@@ -398,7 +409,17 @@ c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
     }
   }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) && defined(TORCH_HIP_VERSION) && \
+    (TORCH_HIP_VERSION >= 301)
+  if (storage->device_type() == at::kCUDA) {
+    C10_CUDA_CHECK(hipMemcpyWithStream(
+        storage->data<uint8_t>(),
+        data,
+        nbytes,
+        cudaMemcpyHostToDevice,
+        c10::hip::getCurrentHIPStreamMasqueradingAsCUDA()));
+  }
+#elif defined(USE_CUDA)
   if (storage->device_type() == at::kCUDA) {
     C10_CUDA_CHECK(cudaMemcpy(
         storage->data<uint8_t>(), data, nbytes, cudaMemcpyHostToDevice));

From e602b0ca950f28035397250c3da8f1f9d4bfe871 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Thu, 27 Oct 2022 15:38:48 +0000
Subject: [PATCH 0230/1922] [BE] Move remaining workflows off Xenial (#87834)

Both BE and prerequisite for moving our CI/CD to C++17 compiler (gcc-5.4
is not fully C++17 compliant)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87834
Approved by: https://github.com/weiwangmeta, https://github.com/kit1980, https://github.com/huydhn
---
 .circleci/docker/build.sh                     | 80 ++-----------------
 .circleci/scripts/build_android_gradle.sh     |  2 +-
 .../workflows/_android-full-build-test.yml    |  2 +-
 .github/workflows/docker-builds.yml           |  5 +-
 .github/workflows/pull.yml                    | 32 ++++----
 .github/workflows/trunk.yml                   |  8 +-
 6 files changed, 28 insertions(+), 101 deletions(-)

diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh
index b38456badc271..ec2dfe8cb60ce 100755
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@@ -33,7 +33,7 @@ function extract_all_from_image_name() {
     if [ "x${name}" = xpy ]; then
       vername=ANACONDA_PYTHON_VERSION
     fi
-    # skip non-conforming fields such as "pytorch", "linux" or "xenial" without version string
+    # skip non-conforming fields such as "pytorch", "linux" or "bionic" without version string
     if [ -n "${name}" ]; then
       extract_version_from_image_name "${name}" "${vername}"
     fi
@@ -46,11 +46,7 @@ if [[ "$image" == *xla* ]]; then
   exit 0
 fi
 
-if [[ "$image" == *-xenial* ]]; then
-  UBUNTU_VERSION=16.04
-elif [[ "$image" == *-artful* ]]; then
-  UBUNTU_VERSION=17.10
-elif [[ "$image" == *-bionic* ]]; then
+if [[ "$image" == *-bionic* ]]; then
   UBUNTU_VERSION=18.04
 elif [[ "$image" == *-focal* ]]; then
   UBUNTU_VERSION=20.04
@@ -79,7 +75,7 @@ elif [[ "$image" == *rocm* ]]; then
   DOCKERFILE="${OS}-rocm/Dockerfile"
 fi
 
-if [[ "$image" == *xenial* ]] || [[ "$image" == *bionic* ]]; then
+if [[ "$image" == *bionic* ]]; then
   CMAKE_VERSION=3.13.5
 fi
 
@@ -91,44 +87,6 @@ _UCC_COMMIT=12944da33f911daf505d9bbc51411233d0ed85e1
 # configuration, so we hardcode everything here rather than do it
 # from scratch
 case "$image" in
-  pytorch-linux-xenial-py3.8)
-    ANACONDA_PYTHON_VERSION=3.8
-    GCC_VERSION=7
-    # Do not install PROTOBUF, DB, and VISION as a test
-    ;;
-  pytorch-linux-xenial-py3.7-gcc7.2)
-    ANACONDA_PYTHON_VERSION=3.7
-    GCC_VERSION=7
-    # Do not install PROTOBUF, DB, and VISION as a test
-    ;;
-  pytorch-linux-xenial-py3.7-gcc7)
-    ANACONDA_PYTHON_VERSION=3.7
-    GCC_VERSION=7
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    ;;
-  pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7)
-    CUDA_VERSION=10.2
-    CUDNN_VERSION=7
-    ANACONDA_PYTHON_VERSION=3.7
-    GCC_VERSION=7
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    ;;
-  pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7)
-    CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names
-    CUDNN_VERSION=8
-    TENSORRT_VERSION=8.0.1.6
-    ANACONDA_PYTHON_VERSION=3.7
-    GCC_VERSION=7
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    ;;
   pytorch-linux-bionic-cuda11.3-cudnn8-py3-clang9)
     CUDA_VERSION=11.3.0 # Deviating from major.minor to conform to nvidia's Docker image names
     CUDNN_VERSION=8
@@ -167,20 +125,6 @@ case "$image" in
     UCC_COMMIT=${_UCC_COMMIT}
     CONDA_CMAKE=yes
     ;;
-  pytorch-linux-xenial-py3-clang5-asan)
-    ANACONDA_PYTHON_VERSION=3.7
-    CLANG_VERSION=5.0
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    ;;
-  pytorch-linux-xenial-py3-clang7-asan)
-    ANACONDA_PYTHON_VERSION=3.7
-    CLANG_VERSION=7
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    ;;
   pytorch-linux-focal-py3-clang7-asan)
     ANACONDA_PYTHON_VERSION=3.7
     CLANG_VERSION=7
@@ -189,13 +133,6 @@ case "$image" in
     VISION=yes
     CONDA_CMAKE=yes
     ;;
-  pytorch-linux-xenial-py3-clang7-onnx)
-    ANACONDA_PYTHON_VERSION=3.7
-    CLANG_VERSION=7
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    ;;
   pytorch-linux-focal-py3-clang10-onnx)
     ANACONDA_PYTHON_VERSION=3.7
     CLANG_VERSION=10
@@ -204,9 +141,9 @@ case "$image" in
     VISION=yes
     CONDA_CMAKE=yes
     ;;
-  pytorch-linux-xenial-py3-clang5-android-ndk-r19c)
+  pytorch-linux-focal-py3-clang7-android-ndk-r19c)
     ANACONDA_PYTHON_VERSION=3.7
-    CLANG_VERSION=5.0
+    CLANG_VERSION=7
     LLVMDEV=yes
     PROTOBUF=yes
     ANDROID=yes
@@ -214,13 +151,6 @@ case "$image" in
     GRADLE_VERSION=6.8.3
     NINJA_VERSION=1.9.0
     ;;
-  pytorch-linux-xenial-py3.7-clang7)
-    ANACONDA_PYTHON_VERSION=3.7
-    CLANG_VERSION=7
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    ;;
   pytorch-linux-bionic-py3.7-clang9)
     ANACONDA_PYTHON_VERSION=3.7
     CLANG_VERSION=9
diff --git a/.circleci/scripts/build_android_gradle.sh b/.circleci/scripts/build_android_gradle.sh
index 598e9cd0a6bd2..2007c91fe395a 100755
--- a/.circleci/scripts/build_android_gradle.sh
+++ b/.circleci/scripts/build_android_gradle.sh
@@ -24,7 +24,7 @@ export GRADLE_LOCAL_PROPERTIES=~/workspace/android/local.properties
 rm -f $GRADLE_LOCAL_PROPERTIES
 echo "sdk.dir=/opt/android/sdk" >> $GRADLE_LOCAL_PROPERTIES
 echo "ndk.dir=/opt/ndk" >> $GRADLE_LOCAL_PROPERTIES
-echo "cmake.dir=/usr/local" >> $GRADLE_LOCAL_PROPERTIES
+echo "cmake.dir=/usr" >> $GRADLE_LOCAL_PROPERTIES
 
 retry () {
   $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
diff --git a/.github/workflows/_android-full-build-test.yml b/.github/workflows/_android-full-build-test.yml
index 1680461be78ef..9f110db307aea 100644
--- a/.github/workflows/_android-full-build-test.yml
+++ b/.github/workflows/_android-full-build-test.yml
@@ -128,7 +128,7 @@ jobs:
 
           # run gradle buildRelease
           (echo "./.circleci/scripts/build_android_gradle.sh" | docker exec \
-            -e BUILD_ENVIRONMENT="pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build" \
+            -e BUILD_ENVIRONMENT="pytorch-linux-focal-py3-clang7-android-ndk-r19c-gradle-build" \
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e AWS_DEFAULT_REGION \
             -e PR_NUMBER \
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index dd59d44e8a9d3..3108f4b926a89 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -41,10 +41,7 @@ jobs:
           - docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
           - docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12
           - docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12
-          - docker-image-name: pytorch-linux-xenial-cuda11.3-cudnn8-py3-gcc7
-          - docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c
-          - docker-image-name: pytorch-linux-xenial-py3-clang5-asan
-          - docker-image-name: pytorch-linux-xenial-py3-clang7-onnx
+          - docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
           - docker-image-name: pytorch-linux-focal-py3.7-gcc7
           - docker-image-name: pytorch-linux-focal-py3-clang7-asan
           - docker-image-name: pytorch-linux-focal-py3-clang10-onnx
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 849e70dc9f29d..0f95186141bfb 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -184,12 +184,12 @@ jobs:
       docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-build.outputs.test-matrix }}
 
-  linux-xenial-py3-clang5-mobile-build:
-    name: linux-xenial-py3-clang5-mobile-build
+  linux-focal-py3-clang7-mobile-build:
+    name: linux-focal-py3-clang7-mobile-build
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-xenial-py3-clang5-mobile-build
-      docker-image-name: pytorch-linux-xenial-py3-clang5-asan
+      build-environment: linux-focal-py3-clang7-mobile-build
+      docker-image-name: pytorch-linux-focal-py3-clang7-asan
       build-generates-artifacts: false
 
   linux-jammy-cuda-11_6-cudnn8-py3_8-clang12-build:
@@ -199,12 +199,12 @@ jobs:
       build-environment: linux-jammy-cuda11.6-cudnn8-py3.8-clang12
       docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12
 
-  linux-xenial-py3-clang5-mobile-custom-build-static:
-    name: linux-xenial-py3-clang5-mobile-custom-build-static
+  linux-focal-py3-clang7-mobile-custom-build-static:
+    name: linux-focal-py3-clang7-mobile-custom-build-static
     uses: ./.github/workflows/_linux-build.yml
     with:
-      build-environment: linux-xenial-py3-clang5-mobile-custom-build-static
-      docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c
+      build-environment: linux-focal-py3-clang7-mobile-custom-build-static
+      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
       build-generates-artifacts: false
 
   linux-bionic-py3_7-clang8-xla-build:
@@ -275,19 +275,19 @@ jobs:
       build-environment: linux-bionic-cuda11.6-py3.10-gcc7-bazel-test
       docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
 
-  linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single:
-    name: linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single
+  linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single:
+    name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single
     uses: ./.github/workflows/_android-build-test.yml
     with:
-      build-environment: linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single
-      docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c
+      build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single
+      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
 
-  linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit:
-    name: linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit
+  linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit:
+    name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit
     uses: ./.github/workflows/_android-build-test.yml
     with:
-      build-environment: linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit
-      docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c
+      build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit
+      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
 
   linux-focal-py3_7-gcc7-mobile-lightweight-dispatch-build:
     name: linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 29dc9f3c44d3f..d92c5a079d978 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -118,12 +118,12 @@ jobs:
       build-environment: linux-bionic-cuda11.7-py3.10-gcc7-no-ops
       docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
 
-  pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build:
-    name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build
+  pytorch-linux-focal-py3-clang7-android-ndk-r19c-build:
+    name: pytorch-linux-focal-py3-clang7-android-ndk-r19c-build
     uses: ./.github/workflows/_android-full-build-test.yml
     with:
-      build-environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build
-      docker-image-name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c
+      build-environment: pytorch-linux-focal-py3-clang7-android-ndk-r19c-build
+      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
 
   linux-bionic-py3_7-clang9-slow-build:
     name: linux-bionic-py3.7-clang9-slow

From 1a54fa338569de96a43a86300e403a7bf2f80989 Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Wed, 26 Oct 2022 14:11:22 -0700
Subject: [PATCH 0231/1922] functionalization: fix detach() (#87750)

`.detach()` worked in basic cases previously, but didn't properly preserve view relationships between the base and the output. This wasn't heavily tested, because autograd doesn't normally encounter `FunctionalTensorWrapper` directly, but could become more common if we fuse functionalization and autograd into a single tracing pass.

This will also be a bug fix for LTC (and XLA when they use functionalization)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87750
Approved by: https://github.com/ezyang
---
 aten/src/ATen/FunctionalTensorWrapper.cpp |  4 ++
 test/test_functionalization.py            | 51 +++++++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp
index 91136f921b1ad..03630c39bbf8b 100644
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@@ -302,12 +302,16 @@ c10::intrusive_ptr<TensorImpl> FunctionalTensorWrapper::shallow_copy_and_detach_
       return r;
     }
   }
+
   auto impl = c10::make_intrusive<FunctionalTensorWrapper>(value_);
   copy_tensor_metadata(
       /*src_impl=*/this,
       /*dest_impl=*/impl.get(),
       /*version_counter=*/std::forward<VariableVersion>(version_counter),
       /*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
+  impl->level_ = level_;
+  impl->generation_ = generation_;
+  impl->view_metas_ = view_metas_;
   impl->refresh_numel();
   impl->refresh_contiguous();
   return impl;
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index 041e5b84f6945..521cb4e9e0cec 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -24,6 +24,7 @@ def are_aliased(x, y):
 def _functionalize(f, *, reapply_views: bool):
     def wrapped(a):
         input_functional = torch._to_functional_tensor(a)
+        input_functional.requires_grad = a.requires_grad
         torch._enable_functionalization(reapply_views=reapply_views)
         try:
             out = f(input_functional)
@@ -101,6 +102,56 @@ def f(x):
             return z2
         self.assert_functionalization(f, torch.ones(4))
 
+    def test_view_clone_view_inplace(self):
+        def f(input):
+            shape = [1, 1024, 128, 128]
+            input_reshaped = input.view(shape)
+            out = input_reshaped.clone()
+            r = out.view(input.shape)
+            r.relu_()
+            return r
+
+        def g(x):
+            loss = f(x).sum()
+            from functorch._src.aot_autograd import setup_stacktrace_preservation_hooks
+            import torch.fx.traceback as fx_traceback
+            setup_stacktrace_preservation_hooks([loss.grad_fn])
+            with fx_traceback.override_stack_trace():
+                loss.backward()
+            return x.grad
+
+        with torch.autograd.detect_anomaly(check_nan=False):
+            logs = self.get_logs(g, torch.ones(16, 64, 128, 128, requires_grad=True))
+        self.assertExpectedInline(logs, """\
+
+
+
+def forward(self, a_1):
+    view_copy = torch.ops.aten.view_copy.default(a_1, [1, 1024, 128, 128]);  a_1 = None
+    clone = torch.ops.aten.clone.default(view_copy);  view_copy = None
+    view_copy_1 = torch.ops.aten.view_copy.default(clone, [16, 64, 128, 128]);  clone = None
+    relu = torch.ops.aten.relu.default(view_copy_1);  view_copy_1 = None
+    sum_1 = torch.ops.aten.sum.default(relu)
+    ones_like = torch.ops.aten.ones_like.default(sum_1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False, memory_format = torch.preserve_format);  sum_1 = None
+    expand_copy = torch.ops.aten.expand_copy.default(ones_like, [16, 64, 128, 128]);  ones_like = None
+    new_zeros = torch.ops.aten.new_zeros.default(expand_copy, [16777216])
+    as_strided_copy = torch.ops.aten.as_strided_copy.default(new_zeros, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0)
+    as_strided_copy_1 = torch.ops.aten.as_strided_copy.default(new_zeros, [1, 1024, 128, 128], [16777216, 16384, 128, 1], 0)
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(new_zeros, expand_copy, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0);  new_zeros = expand_copy = None
+    as_strided_copy_2 = torch.ops.aten.as_strided_copy.default(as_strided_scatter, [1, 1024, 128, 128], [16777216, 16384, 128, 1], 0);  as_strided_scatter = None
+    new_empty_strided = torch.ops.aten.new_empty_strided.default(as_strided_copy_2, [1, 1024, 128, 128], [16777216, 16384, 128, 1])
+    as_strided_copy_3 = torch.ops.aten.as_strided_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0)
+    as_strided_copy_4 = torch.ops.aten.as_strided_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0)
+    clone_1 = torch.ops.aten.clone.default(as_strided_copy_4, memory_format = torch.contiguous_format);  as_strided_copy_4 = None
+    threshold_backward = torch.ops.aten.threshold_backward.default(clone_1, relu, 0);  clone_1 = relu = None
+    _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1])
+    detach_copy = torch.ops.aten.detach_copy.default(_reshape_alias_copy);  _reshape_alias_copy = None
+    as_strided_scatter_1 = torch.ops.aten.as_strided_scatter.default(as_strided_copy_2, threshold_backward, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0);  as_strided_copy_2 = threshold_backward = None
+    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(as_strided_scatter_1, [16, 64, 128, 128], [1048576, 16384, 128, 1]);  as_strided_scatter_1 = None
+    detach_copy_1 = torch.ops.aten.detach_copy.default(_reshape_alias_copy_1);  _reshape_alias_copy_1 = None
+    return detach_copy_1
+    """)  # noqa: B950
+
     def test_simple(self):
         def f(x):
             # simple test: 1 view op, 1 inplace op

From 72ef093855039cbc6b661461288eeb0a8befae19 Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Wed, 26 Oct 2022 14:11:22 -0700
Subject: [PATCH 0232/1922] add nesting to TORCH_SHOW_DISPATCH_TRACE (#87751)

Added indents to `TORCH_SHOW_DISPATCH_TRACE` so that you more easily see the call tree from the dispatcher. Definitely slower, but it's all guarded under the `DEBUG` build. Example output:

I know we have the PyDispatcher now, but I still found this helpful for debugging

```
 [call] op=[aten::ones], key=[BackendSelect]
  [redispatch] op=[aten::ones], key=[CPU]
   [call] op=[aten::empty.memory_format], key=[BackendSelect]
    [redispatch] op=[aten::empty.memory_format], key=[CPU]
   [call] op=[aten::fill_.Scalar], key=[CPU]
 [call] op=[aten::clone], key=[AutogradCPU]
  [redispatch] op=[aten::clone], key=[CPU]
   [call] op=[aten::empty_strided], key=[BackendSelect]
    [redispatch] op=[aten::empty_strided], key=[CPU]
   [call] op=[aten::copy_], key=[CPU]
 [call] op=[aten::view], key=[PythonTLSSnapshot]
  [redispatchBoxed] op=[aten::view], key=[AutogradCPU]
   [redispatch] op=[aten::view], key=[ADInplaceOrView]
    [redispatch] op=[aten::view], key=[Functionalize]
     [call] op=[aten::view], key=[PythonTLSSnapshot]
      [redispatchBoxed] op=[aten::view], key=[Meta]
     [call] op=[aten::view], key=[PythonTLSSnapshot]
      [redispatchBoxed] op=[aten::view], key=[Python]
       [callBoxed] op=[aten::view], key=[CPU]
 [call] op=[aten::clone], key=[PythonTLSSnapshot]
  [redispatchBoxed] op=[aten::clone], key=[AutogradCPU]
   [redispatch] op=[aten::clone], key=[Functionalize]
    [callBoxed] op=[aten::clone], key=[PythonTLSSnapshot]
     [redispatchBoxed] op=[aten::clone], key=[Python]
      [callBoxed] op=[aten::clone], key=[CPU]
       [call] op=[aten::empty_strided], key=[BackendSelect]
        [redispatch] op=[aten::empty_strided], key=[CPU]
       [call] op=[aten::copy_], key=[CPU]
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87751
Approved by: https://github.com/ezyang, https://github.com/zou3519
---
 aten/src/ATen/core/dispatch/Dispatcher.cpp |  6 ++++++
 aten/src/ATen/core/dispatch/Dispatcher.h   | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp
index 667eefdcc5ab8..45214a3fd20f2 100644
--- a/aten/src/ATen/core/dispatch/Dispatcher.cpp
+++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp
@@ -9,6 +9,12 @@ bool show_dispatch_trace() {
     return temp != nullptr;
 }
 
+static thread_local int64_t dispatch_trace_nesting_value_;
+
+void dispatch_trace_nesting_incr() { ++dispatch_trace_nesting_value_; }
+void dispatch_trace_nesting_decr() { --dispatch_trace_nesting_value_; }
+int64_t dispatch_trace_nesting_value() { return dispatch_trace_nesting_value_; }
+
 namespace detail {
 
 class RegistrationListenerList final {
diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h
index 1ea677b54ef5a..2f383d589e29f 100644
--- a/aten/src/ATen/core/dispatch/Dispatcher.h
+++ b/aten/src/ATen/core/dispatch/Dispatcher.h
@@ -19,6 +19,14 @@
 namespace c10 {
 
 TORCH_API bool show_dispatch_trace();
+TORCH_API void dispatch_trace_nesting_incr();
+TORCH_API void dispatch_trace_nesting_decr();
+TORCH_API int64_t dispatch_trace_nesting_value();
+
+struct DispatchTraceNestingGuard {
+  DispatchTraceNestingGuard() { dispatch_trace_nesting_incr(); }
+  ~DispatchTraceNestingGuard() { dispatch_trace_nesting_decr(); }
+};
 
 class TORCH_API OperatorHandle;
 template<class FuncType> class TypedOperatorHandle;
@@ -583,7 +591,10 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call(const TypedOperatorHandl
   auto dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor()
     .template getDispatchKeySetUnboxed<Args...>(args...);
 #ifndef NDEBUG
+  DispatchTraceNestingGuard debug_guard;
   if (show_dispatch_trace()) {
+      auto nesting_value = dispatch_trace_nesting_value();
+      for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
       std::cerr << "[call] op=[" << op.operator_name() << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
   }
 #endif
@@ -603,7 +614,10 @@ inline Return Dispatcher::redispatch(const TypedOperatorHandle<Return (Args...)>
   detail::unused_arg_(args...);  // workaround for a false-positive warning about unused parameters in gcc 5
   // do not use RecordFunction on redispatch
 #ifndef NDEBUG
+  DispatchTraceNestingGuard debug_guard;
   if (show_dispatch_trace()) {
+      auto nesting_value = dispatch_trace_nesting_value();
+      for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
       std::cerr << "[redispatch] op=[" << op.operator_name() << "], key=[" << toString(currentDispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
   }
 #endif
@@ -616,7 +630,10 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const
   const auto& entry = op.operatorDef_->op;
   auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
 #ifndef NDEBUG
+  DispatchTraceNestingGuard debug_guard;
   if (show_dispatch_trace()) {
+      auto nesting_value = dispatch_trace_nesting_value();
+      for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
       std::cerr << "[callBoxed] op=[" << op.operator_name() << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
   }
 #endif
@@ -666,7 +683,10 @@ inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet
   // note: this doesn't need the mutex because write operations on the list keep iterators intact.
   const auto& entry = op.operatorDef_->op;
 #ifndef NDEBUG
+  DispatchTraceNestingGuard debug_guard;
   if (show_dispatch_trace()) {
+      auto nesting_value = dispatch_trace_nesting_value();
+      for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
       std::cerr << "[redispatchBoxed] op=[" << op.operator_name() << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
   }
 #endif

From 44d9123d16cf6979d7867eec2a6f3aa6d9038f85 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Thu, 27 Oct 2022 15:53:11 +0000
Subject: [PATCH 0233/1922] Fix type promotion for 2 wrapped scalar args
 (#87845)

Fixes #76801

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87845
Approved by: https://github.com/SherlockNoMad, https://github.com/mruberry
---
 aten/src/ATen/TensorIterator.cpp              |  3 ++-
 aten/src/ATen/TensorIterator.h                |  7 +++++--
 test/test_binary_ufuncs.py                    | 20 +++++++------------
 test/test_ops.py                              |  3 ++-
 .../_internal/common_methods_invocations.py   |  4 ++--
 5 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp
index 7b1442db75ad4..7e86163f1ca4c 100644
--- a/aten/src/ATen/TensorIterator.cpp
+++ b/aten/src/ATen/TensorIterator.cpp
@@ -431,7 +431,7 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) {
   }
 
   // Computes a common dtype, if needed
-  if (has_different_input_dtypes && config.promote_inputs_to_common_dtype_) {
+  if ((has_different_input_dtypes || all_ops_are_scalars_) && config.promote_inputs_to_common_dtype_) {
     common_dtype_ = compute_common_dtype();
   }
 
@@ -1237,6 +1237,7 @@ void TensorIteratorBase::compute_shape(const TensorIteratorConfig& config) {
       shape_ = infer_size_dimvector(shape_, shape);
     }
   }
+  all_ops_are_scalars_ = !has_tensors;
 }
 
 void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) {
diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h
index 59f52d9dbd2ed..31ae65466870a 100644
--- a/aten/src/ATen/TensorIterator.h
+++ b/aten/src/ATen/TensorIterator.h
@@ -659,9 +659,12 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase {
   /// in operands_).
   int num_outputs_ = 0;
 
-  /// Whether or not all operands have the same shape.  Having all the same
-  /// shape affects whether or not the iterator is eligible for fast setup.
+  /// Whether or not all operands have the same shape and are 1d+. Having all
+  /// the same shape affects whether or not the iterator is eligible for fast
+  /// setup.
   bool all_ops_same_shape_ = false;
+  /// Whether or not all operands are 0d, this affects type promotion
+  bool all_ops_are_scalars_ = false;
 
   /// The "computation" dtype of TensorIterator, specifying what the dtype
   /// we will do the internal computation in TensorIterator.  Typically,
diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py
index abcbb493342bf..8ffab2daa6e28 100644
--- a/test/test_binary_ufuncs.py
+++ b/test/test_binary_ufuncs.py
@@ -491,9 +491,6 @@ def test_type_promotion(self, device, op):
             make_tensor, (5,), device=device, **op.rhs_make_tensor_kwargs
         )
 
-        make_lhs_scalar_tensor = partial(
-            make_tensor, (), device='cpu', **op.lhs_make_tensor_kwargs
-        )
         make_rhs_scalar_tensor = partial(
             make_tensor, (), device='cpu', **op.rhs_make_tensor_kwargs
         )
@@ -782,17 +779,14 @@ def _supported(dtypes):
             )
             self.assertEqual(result.dtype, expected_dtype)
 
-        # scalar int x scalar float
+        # scalar  x scalar
         # Note: result dtype is default float type
-        # TODO: FIXME: re-enable this, scalar x scalar type promotion is currently broken
-        # https://github.com/pytorch/pytorch/issues/76801
-        # if op.supports_two_python_scalars and _supported((torch.long, torch.float32)):
-        #     lhs_i_scalar = 1
-        #     rhs_f_scalar = 2.
-
-        #     result = op(lhs_i_scalar, rhs_f_scalar)
-        #     expected_dtype = torch.get_default_dtype() if not op.always_returns_bool else torch.bool
-        #     self.assertEqual(result.dtype, expected_dtype)
+        if op.supports_two_python_scalars and _supported((torch.long, torch.float32)):
+            rhs_f_scalar = 2.
+            for lhs in (1, 1.):
+                result = op(lhs, rhs_f_scalar)
+                expected_dtype = torch.get_default_dtype() if not op.always_returns_bool else torch.bool
+                self.assertEqual(result.dtype, expected_dtype)
 
     # TODO: move to error input test
     @ops(binary_ufuncs, allowed_dtypes=(torch.float32,))
diff --git a/test/test_ops.py b/test/test_ops.py
index 1d20151c20e89..fa8812aa5d8ee 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -457,7 +457,8 @@ def test_errors(self, device, op):
         for ei in error_inputs:
             si = ei.sample_input
             with self.assertRaisesRegex(ei.error_type, ei.error_regex):
-                op(si.input, *si.args, **si.kwargs)
+                out = op(si.input, *si.args, **si.kwargs)
+                self.assertFalse(isinstance(out, type(NotImplemented)))
 
     @skipMeta
     @onlyNativeDeviceTypes
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f04a2cc5465bb..fb4238234a98f 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -12826,7 +12826,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                     supports_out=False,
                     supports_forward_ad=True,
                     supports_fwgrad_bwgrad=True,
-                    supports_two_python_scalars=True,
+                    supports_one_python_scalar=True,
                     skips=(
                         DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
                         DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit',),
@@ -12861,7 +12861,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                     supports_forward_ad=True,
                     supports_fwgrad_bwgrad=True,
                     supports_out=False,
-                    supports_two_python_scalars=True,
+                    supports_one_python_scalar=True,
                     skips=(
                         DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
                         DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit',),

From cfef1a1e8023b673e64dfecdb41e445eb63d4c61 Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@fb.com>
Date: Wed, 26 Oct 2022 10:49:38 -0700
Subject: [PATCH 0234/1922] [dynamo] Error when user nests FX with dynamo
 (#87797)

Today, this doesn't work and dynamo errors out in a very non-obvious way (see:
https://gist.github.com/suo/dde04830372ab51a4a34ea760f14200a).

Here, we detect the error early and exit with a nicer msg. Also add a
config option to just no-op dynamo (which need to unblock internal
enablement).

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87797
Approved by: https://github.com/yf225, https://github.com/soumith, https://github.com/jansel
---
 test/dynamo/test_misc.py    | 14 ++++++++++++++
 torch/_dynamo/config.py     |  4 ++++
 torch/_dynamo/eval_frame.py |  9 +++++++++
 3 files changed, 27 insertions(+)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index a0f592212f4e1..a63a6d8930c80 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2732,6 +2732,20 @@ def forward(self, x):
             dynamo_result = graph(x)
             self.assertTrue(same(real, dynamo_result))
 
+    def test_error_on_nested_fx_trace(self):
+        input = torch.rand(2, 3)
+
+        def f(x):
+            x + x
+
+        real = f(input)
+
+        optimized = torch._dynamo.optimize("eager")(f)
+        self.assertTrue(same(optimized(input), real))
+
+        with self.assertRaisesRegex(RuntimeError, "Detected that you are using FX"):
+            gm = torch.fx.symbolic_trace(optimized)
+
 
 class CustomFunc(torch.autograd.Function):
     @staticmethod
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index f24eeeae76882..162891d2fd9dc 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -152,6 +152,10 @@
 # How to import torchinductor, either torchinductor or torch.inductor
 inductor_import = dynamo_import.replace("dynamo", "inductor")
 
+# If true, error with a better message if we symbolically trace over a
+# dynamo-optimized function. If false, silently suppress dynamo.
+error_on_nested_fx_trace = True
+
 # root folder of the project
 if "torch." in dynamo_import:
     base_dir = dirname(dirname(dirname(abspath(__file__))))
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index d86653f9973cc..2d0938a83a123 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -14,6 +14,7 @@
 
 import torch
 import torch.utils._pytree as pytree
+from torch.fx._symbolic_trace import is_fx_tracing
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn.parallel.distributed import DistributedDataParallel
 
@@ -149,6 +150,14 @@ def __call__(self, *args, **kwargs):
 
         @functools.wraps(fn)
         def _fn(*args, **kwargs):
+            if is_fx_tracing():
+                if config.error_on_nested_fx_trace:
+                    raise RuntimeError(
+                        "Detected that you are using FX to symbolically trace "
+                        "a dynamo-optimized function. This is not supported at the moment."
+                    )
+                return fn
+
             on_enter()
             prior = set_eval_frame(callback)
             backend_ctx = backend_ctx_ctor()

From 5373502038b9a4566656590546d085e3acbf5b51 Mon Sep 17 00:00:00 2001
From: Akshit Khurana <axit@meta.com>
Date: Wed, 26 Oct 2022 15:44:00 -0700
Subject: [PATCH 0235/1922] [dynamo] Add ao.nn to skipfiles inline allowlist
 (#87820)

Summary:

Allow torch.ao.nn module to be inlined

Test Plan:

Tested manually for https://github.com/pytorch/torchdynamo/issues/1737

Reviewers:

Subscribers:

Tasks:

Tags:

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx

Differential Revision: [D40768679](https://our.internmc.facebook.com/intern/diff/D40768679)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87820
Approved by: https://github.com/jansel
---
 torch/_dynamo/config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 162891d2fd9dc..2601be8983f2a 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -96,6 +96,7 @@
     torch.nn,
     torch.distributions,
     torch.testing,
+    torch.ao.nn,
 }
 if HAS_REFS_PRIMS:
     skipfiles_inline_module_allowlist |= {

From 071a18c8cd0f2d1f2cea176545b8e5dd30939682 Mon Sep 17 00:00:00 2001
From: samdow <samdow@fb.com>
Date: Thu, 27 Oct 2022 17:10:04 +0000
Subject: [PATCH 0236/1922] fix typo in per sample grad test (#87790)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87790
Approved by: https://github.com/zou3519
---
 test/functorch/test_eager_transforms.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index d8cd765706e5a..9361e51454787 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -3089,8 +3089,10 @@ def test_resnet18_per_sample_grads(self, device):
         func_model, weights = make_functional(model)
 
         def compute_loss(weights, image, target):
-            output = func_model(weights, images)
-            loss = criterion(output, targets)
+            image = image.unsqueeze(0)
+            target = target.unsqueeze(0)
+            output = func_model(weights, image)
+            loss = criterion(output, target)
             return loss
 
         batch_size = 3
@@ -3100,7 +3102,7 @@ def compute_loss(weights, image, target):
         result_grads = vmap(grad(compute_loss), in_dims=(None, 0, 0))(weights, images, targets)
 
         expected_grads = [
-            torch.autograd.grad(compute_loss(weights, images[i].unsqueeze(0), targets[i].unsqueeze(0)), weights)
+            torch.autograd.grad(compute_loss(weights, images[i], targets[i]), weights)
             for i in range(batch_size)
         ]
         expected_grads = [torch.stack(shards) for shards in zip(*expected_grads)]

From 7a7f46c5c56ab25314455490a21cc3b8cda3e77a Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Thu, 27 Oct 2022 19:49:29 +0000
Subject: [PATCH 0237/1922] [dynamo] add inductor runs w/o cudagraphs (#87847)

as title

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87847
Approved by: https://github.com/jansel
---
 benchmarks/dynamo/Makefile_dashboard |  5 ++++-
 benchmarks/dynamo/common.py          | 12 ++++++++++--
 benchmarks/dynamo/runner.py          |  2 ++
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dynamo/Makefile_dashboard b/benchmarks/dynamo/Makefile_dashboard
index 729178f538408..1c75d608e7d71 100644
--- a/benchmarks/dynamo/Makefile_dashboard
+++ b/benchmarks/dynamo/Makefile_dashboard
@@ -5,15 +5,17 @@ PIP ?= python -m pip
 clone-deps:
 	(cd ../../.. \
 		&& (test -e torchvision || git clone --recursive https://github.com/pytorch/vision torchvision) \
+		&& (test -e torchdata || git clone --recursive https://github.com/pytorch/data.git torchdata) \
 		&& (test -e torchtext || git clone --recursive https://github.com/pytorch/text torchtext) \
 		&& (test -e detectron2 || git clone --recursive https://github.com/facebookresearch/detectron2) \
 		&& (test -e torchbenchmark || git clone --recursive https://github.com/pytorch/benchmark torchbenchmark) \
 		&& (test -e triton || git clone --recursive https://github.com/openai/triton.git) \
 	)
 
-pull-deps:
+pull-deps: clone-deps
 	echo $(TRITON_VERSION)
 	(cd ../../../torchvision    && git pull && git submodule update --init --recursive)
+	(cd ../../../torchdata      && git pull && git submodule update --init --recursive)
 	(cd ../../../torchtext      && git pull && git submodule update --init --recursive)
 	(cd ../../../detectron2     && git pull && git submodule update --init --recursive)
 	(cd ../../../torchbenchmark && git pull && git submodule update --init --recursive)
@@ -28,6 +30,7 @@ build-deps: clone-deps
 	conda install -y -c pytorch magma-cuda116
 	conda install -y -c conda-forge librosa
 	(cd ../../../torchvision && python setup.py clean && python setup.py develop)
+	(cd ../../../torchdata && python setup.py install)
 	(cd ../../../torchtext   && python setup.py clean && python setup.py develop)
 	(cd ../../../detectron2  && python setup.py clean && python setup.py develop)
 	(cd ../../../torchbenchmark && python install.py --continue_on_fail)
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 86e6bb62842f6..88de22f326cfe 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -27,6 +27,7 @@
 from torch._dynamo.profiler import fx_insert_profiling, Profiler
 from torch._dynamo.testing import dummy_fx_compile, format_speedup, same
 from torch._dynamo.utils import clone_inputs
+from torch._inductor import config as inductor_config
 from torch._inductor.utils import fresh_triton_cache
 from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.utils._pytree import tree_map
@@ -1360,6 +1361,11 @@ def parse_args():
         action="store_true",
         help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
     )
+    parser.add_argument(
+        "--disable-cudagraphs",
+        action="store_true",
+        help="Disables cudagraphs for Inductor",
+    )
 
     group_fuser = parser.add_mutually_exclusive_group()
     # --nvfuser is now the default, keep the option to not break scripts
@@ -1619,8 +1625,6 @@ def main(runner, original_dir=None):
         experiment = speedup_experiment
         output_filename = "overheads.csv"
     elif args.inductor or args.inductor_dynamic:
-        from torch._inductor import config as inductor_config
-
         inductor_config.debug = args.verbose
         if args.threads:
             inductor_config.cpp.threads = args.threads
@@ -1705,6 +1709,10 @@ def main(runner, original_dir=None):
         experiment = coverage_experiment
         output_filename = "coverage.csv"
 
+    if args.inductor or args.backend == "inductor":
+        if args.disable_cudagraphs:
+            inductor_config.triton.cudagraphs = False
+
     runner.setup_amp()
 
     if args.output:
diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index ce952095bd352..f5ec96e4f500b 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -65,6 +65,7 @@
         "aot_cudagraphs": "--training --backend=aot_cudagraphs ",
         "aot_nvfuser": "--training --nvfuser --backend=aot_nvfuser ",
         "inductor": "--training --inductor ",
+        "inductor_no_cudagraphs": "--training --inductor --disable-cudagraphs ",
     },
     "inference": {
         "ts_nnc": "--speedup-ts",
@@ -85,6 +86,7 @@
         "aot_cudagraphs",
         "aot_nvfuser",
         "inductor",
+        "inductor_no_cudagraphs",
     ],
     "inference": ["ts_nvfuser_cudagraphs", "inductor"],
     "dtypes": [

From 8db1612c3d74951c7c3700d27dd2852706871791 Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Thu, 27 Oct 2022 20:39:30 +0000
Subject: [PATCH 0238/1922] [LTC] Remove lazy::View (#87822)

Summary:
This is the first part to remove the whole view and aliasing infrastructure in LTC, which is deprecated in favor of functionalization. It mainly removes things that use lazy::View.

Test Plan:
CI

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87822
Approved by: https://github.com/JackCaoG, https://github.com/antoniojkim, https://github.com/wconstab
---
 .github/ci_commit_pins/xla.txt                |   2 +-
 build_variables.bzl                           |   1 -
 torch/csrc/lazy/core/ir_builder.h             | 133 ---------
 torch/csrc/lazy/core/lazy_graph_executor.cpp  |  61 ----
 torch/csrc/lazy/core/lazy_graph_executor.h    |   5 -
 torch/csrc/lazy/core/lazy_view.cpp            | 262 ------------------
 torch/csrc/lazy/core/lazy_view.h              | 173 ------------
 torch/csrc/lazy/core/tensor.cpp               | 156 +----------
 torch/csrc/lazy/core/tensor.h                 |  33 ---
 torch/csrc/lazy/ts_backend/ir_builder.h       |  78 ------
 .../csrc/lazy/ts_backend/tensor_aten_ops.cpp  | 193 -------------
 torch/csrc/lazy/ts_backend/tensor_aten_ops.h  |  85 ------
 12 files changed, 14 insertions(+), 1168 deletions(-)
 delete mode 100644 torch/csrc/lazy/core/lazy_view.cpp
 delete mode 100644 torch/csrc/lazy/core/lazy_view.h

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 86063843174d2..a3de2aba624ea 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-79131e9d31290744afdf3d85118251863e16ab0e
+095ee628212f0235ad0d6908bdd514123639fc86
diff --git a/build_variables.bzl b/build_variables.bzl
index f1801b446ed8c..017ed9aef5413 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -414,7 +414,6 @@ lazy_tensor_core_sources = [
     "torch/csrc/lazy/core/ir_metadata.cpp",
     "torch/csrc/lazy/core/ir_util.cpp",
     "torch/csrc/lazy/core/lazy_graph_executor.cpp",
-    "torch/csrc/lazy/core/lazy_view.cpp",
     "torch/csrc/lazy/core/metrics.cpp",
     "torch/csrc/lazy/core/multi_wait.cpp",
     "torch/csrc/lazy/core/ops/arithmetic_ir_ops.cpp",
diff --git a/torch/csrc/lazy/core/ir_builder.h b/torch/csrc/lazy/core/ir_builder.h
index 8e645c485158e..20e4730d50135 100644
--- a/torch/csrc/lazy/core/ir_builder.h
+++ b/torch/csrc/lazy/core/ir_builder.h
@@ -73,59 +73,6 @@ struct IrBuilder {
       const size_t& num_outputs = 1,
       const hash_t& hash_seed = static_cast<uint32_t>(0x5a2d296e9)) const = 0;
 
-  // View op nodes
-  virtual NodePtr MakeAsStridedViewUpdate(
-      const Value& input0,
-      const Value& input1,
-      const std::vector<int64_t>& size,
-      const std::vector<int64_t>& stride,
-      const int64_t& storage_offset) const = 0;
-  virtual NodePtr MakeAsStrided(
-      const Value& input0,
-      const std::vector<int64_t>& size,
-      const std::vector<int64_t>& stride,
-      const int64_t& storage_offset) const = 0;
-  virtual NodePtr MakeDiagonalViewUpdate(
-      const Value& input0,
-      const Value& input1,
-      const int64_t& offset,
-      const int64_t& dim1,
-      const int64_t& dim2) const = 0;
-  virtual NodePtr MakeDiagonal(
-      const Value& input0,
-      const int64_t& offset,
-      const int64_t& dim1,
-      const int64_t& dim2) const = 0;
-  virtual NodePtr MakeNarrowViewUpdate(
-      const Value& input0,
-      const Value& input1,
-      const std::vector<int64_t>& base_indices) const = 0;
-  virtual NodePtr MakeNarrow(
-      const Value& input0,
-      const std::vector<int64_t>& base_indices,
-      const std::vector<int64_t>& sizes) const = 0;
-  virtual NodePtr MakePermute(
-      const Value& input0,
-      const std::vector<int64_t>& dims) const = 0;
-  virtual NodePtr MakeResize(
-      const Value& input0,
-      const std::vector<int64_t>& size) const = 0;
-  virtual NodePtr MakeSelectViewUpdate(
-      const Value& input0,
-      const Value& input1,
-      const int64_t& dim,
-      const int64_t& start,
-      const int64_t& end,
-      const int64_t& stride) const = 0;
-  virtual NodePtr MakeSelect(
-      const Value& input0,
-      const int64_t& dim,
-      const int64_t& start,
-      const int64_t& end,
-      const int64_t& stride) const = 0;
-  virtual NodePtr MakeSqueeze(const Value& input0, const int& dim) const = 0;
-  virtual NodePtr MakeUnsqueeze(const Value& input0, const int& dim) const = 0;
-
   // dynamic ir nodes
   virtual NodePtr MakeSizeNode(const Value& input, size_t dim) const = 0;
   virtual NodePtr MakeSizeAdd(const Value& a, const Value& b) const = 0;
@@ -173,86 +120,6 @@ static inline NodePtr MakeGeneric(
       op, operands, shape, num_outputs, hash_seed);
 }
 
-// View op nodes
-static inline NodePtr MakeAsStridedViewUpdate(
-    const Value& input0,
-    const Value& input1,
-    const std::vector<int64_t>& size,
-    const std::vector<int64_t>& stride,
-    const int64_t& storage_offset) {
-  return getIrBuilder()->MakeAsStridedViewUpdate(
-      input0, input1, size, stride, storage_offset);
-}
-static inline NodePtr MakeAsStrided(
-    const Value& input0,
-    const std::vector<int64_t>& size,
-    const std::vector<int64_t>& stride,
-    const int64_t& storage_offset) {
-  return getIrBuilder()->MakeAsStrided(input0, size, stride, storage_offset);
-}
-static inline NodePtr MakeDiagonalViewUpdate(
-    const Value& input0,
-    const Value& input1,
-    const int64_t& offset,
-    const int64_t& dim1,
-    const int64_t& dim2) {
-  return getIrBuilder()->MakeDiagonalViewUpdate(
-      input0, input1, offset, dim1, dim2);
-}
-static inline NodePtr MakeDiagonal(
-    const Value& input0,
-    const int64_t& offset,
-    const int64_t& dim1,
-    const int64_t& dim2) {
-  return getIrBuilder()->MakeDiagonal(input0, offset, dim1, dim2);
-}
-static inline NodePtr MakeNarrowViewUpdate(
-    const Value& input0,
-    const Value& input1,
-    const std::vector<int64_t>& base_indices) {
-  return getIrBuilder()->MakeNarrowViewUpdate(input0, input1, base_indices);
-}
-static inline NodePtr MakeNarrow(
-    const Value& input0,
-    const std::vector<int64_t>& base_indices,
-    const std::vector<int64_t>& sizes) {
-  return getIrBuilder()->MakeNarrow(input0, base_indices, sizes);
-}
-static inline NodePtr MakePermute(
-    const Value& input0,
-    const std::vector<int64_t>& dims) {
-  return getIrBuilder()->MakePermute(input0, dims);
-}
-static inline NodePtr MakeResize(
-    const Value& input0,
-    const std::vector<int64_t>& size) {
-  return getIrBuilder()->MakeResize(input0, size);
-}
-static inline NodePtr MakeSelectViewUpdate(
-    const Value& input0,
-    const Value& input1,
-    const int64_t& dim,
-    const int64_t& start,
-    const int64_t& end,
-    const int64_t& stride) {
-  return getIrBuilder()->MakeSelectViewUpdate(
-      input0, input1, dim, start, end, stride);
-}
-static inline NodePtr MakeSelect(
-    const Value& input0,
-    const int64_t& dim,
-    const int64_t& start,
-    const int64_t& end,
-    const int64_t& stride) {
-  return getIrBuilder()->MakeSelect(input0, dim, start, end, stride);
-}
-static inline NodePtr MakeSqueeze(const Value& input0, const int& dim) {
-  return getIrBuilder()->MakeSqueeze(input0, dim);
-}
-static inline NodePtr MakeUnsqueeze(const Value& input0, const int& dim) {
-  return getIrBuilder()->MakeUnsqueeze(input0, dim);
-}
-
 // dynamic ir nodes
 static inline NodePtr MakeSizeNode(const Value& input, size_t dim) {
   return getIrBuilder()->MakeSizeNode(input, dim);
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp
index 06b37797d3fa6..4989ce24a0ef1 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.cpp
+++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp
@@ -789,33 +789,6 @@ LazyGraphExecutor::CompilationResult LazyGraphExecutor::Compile(
     Value ir_value = tensors[index]->CurrentIrValue();
     lowering_ctx->AddResult(ir_value);
   }
-  if (FLAGS_torch_lazy_param_aliasing && coll.config.sync_ltc_data) {
-    // We can only alias at the step barrier, when force_ltc_data is true.
-    // Consider the case:
-    //   1. Tensor A(DEVICE_DATA)
-    //   2. Tensor B = A + 0.9
-    //   3. A += 0.4
-    // If we activate aliasing for A's graph, and we do:
-    //   print(A)
-    //   print(A)
-    // The first print will update DEVICE_DATA' with DEVICE_DATA+0.4, and the
-    // second print will again update DEVICE_DATA" with DEVICE_DATA'+0.4, which
-    // will lead to incorrect results.
-    // We cannot normally turn A's state into DEVICE_DATA, as if any of the
-    // sources is a view, this will not lead to correct results (as A's value
-    // taken at different times need to reflect view source changes):
-    //   1. Tensor A = some_graph_with_view_source(V)
-    //   2. print(A)
-    //   3. V += 1
-    //   4. print(A)
-    // The second print should reflect the new value due to V's changes.
-    // Also in the first example, unless we are doing a step barrier and hence
-    // include all live tensors, if the B value is not part of the graph, it
-    // will later fetch the new value of A, which is incorrect.
-    // But, when we issue a step barrier (force_ltc_data == true) we have to
-    // turn everything into DEVICE_DATA, so we can activate aliasing.
-    BuildInputOutputAliases(tensors, coll.indices, lowering_ctx.get());
-  }
 
   ComputationPtr computation = lowering_ctx->Build();
   // If force_ltc_data is true it means that we did a proper sync and are
@@ -866,40 +839,6 @@ LazyGraphExecutor::ComputationCache::TypePtr LazyGraphExecutor::
 typedef SSIZE_T ssize_t;
 #endif
 
-void LazyGraphExecutor::BuildInputOutputAliases(
-    const std::vector<LazyTensorPtr>& tensors,
-    c10::ArrayRef<size_t> indices,
-    LoweringContext* lowering_ctx) {
-  std::unordered_map<int64_t, size_t> output_tensor_id_map;
-  for (const auto i : c10::irange(indices.size())) {
-    size_t tensor_index = indices[i];
-    int64_t tensor_id = tensors[tensor_index]->GetUniqueId();
-    output_tensor_id_map[tensor_id] = i;
-  }
-  const std::vector<BackendDataPtr>& parameters_data =
-      lowering_ctx->GetParametersData();
-  std::vector<ssize_t> alias_map(indices.size(), -1);
-  for (const auto i : c10::irange(parameters_data.size())) {
-    DeviceDataInfo* data_info =
-        dynamic_cast<DeviceDataInfo*>(parameters_data[i]->info());
-    if (data_info != nullptr && !data_info->read_only) {
-      auto it = output_tensor_id_map.find(data_info->tensor_id);
-      if (it != output_tensor_id_map.end()) {
-        size_t output_index = it->second;
-        if (lowering_ctx->CheckResultShape(parameters_data[i], output_index) &&
-            alias_map[output_index] < 0) {
-          lowering_ctx->SetUpAlias({static_cast<int64_t>(output_index)}, i, {});
-          alias_map[output_index] = i;
-
-          VLOG(6) << "Aliased parameter " << i << " with output "
-                  << output_index << ": " << Shape(parameters_data[i]->shape());
-        }
-      }
-    }
-  }
-  TORCH_LAZY_VALUE_METRIC("InputOutputAliasCount", alias_map.size());
-}
-
 std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::
     SyncTensorsGraphInternal(
         std::vector<LazyTensorPtr>* tensors,
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.h b/torch/csrc/lazy/core/lazy_graph_executor.h
index 7a4498d85fc0f..b7e10374fbb76 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.h
+++ b/torch/csrc/lazy/core/lazy_graph_executor.h
@@ -223,11 +223,6 @@ class TORCH_API LazyGraphExecutor {
 
   ComputationCache::TypePtr LookupCachedCompile(const hash_t& hash);
 
-  void BuildInputOutputAliases(
-      const std::vector<LazyTensorPtr>& tensors,
-      c10::ArrayRef<size_t> indices,
-      LoweringContext* lowering_ctx);
-
   std::shared_ptr<Async> SyncTensorsGraphInternal(
       std::vector<LazyTensorPtr>* tensors,
       c10::ArrayRef<std::string> devices,
diff --git a/torch/csrc/lazy/core/lazy_view.cpp b/torch/csrc/lazy/core/lazy_view.cpp
deleted file mode 100644
index d52c0f62fb77e..0000000000000
--- a/torch/csrc/lazy/core/lazy_view.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-#include <torch/csrc/lazy/core/lazy_view.h>
-
-#include <torch/csrc/lazy/core/helpers.h>
-#include <torch/csrc/lazy/core/ir_builder.h>
-#include <torch/csrc/lazy/core/ops/utils.h>
-#include <torch/csrc/lazy/core/permutation_util.h>
-
-#include <c10/util/Exception.h>
-#include <algorithm>
-#include <functional>
-#include <numeric>
-
-namespace torch {
-namespace lazy {
-namespace {
-
-Value ApplyViewInfo(Value ir_value, const ViewInfo& view_info) {
-  switch (view_info.view_type) {
-    case ViewInfo::Type::kSelect:
-      return MakeSelect(
-          ir_value,
-          view_info.select->dim,
-          view_info.select->start,
-          view_info.select->end,
-          view_info.select->stride);
-    case ViewInfo::Type::kNarrow:
-      return MakeNarrow(
-          ir_value, view_info.indices, view_info.shape.sizes().vec());
-    case ViewInfo::Type::kNoOp:
-      return ir_value;
-    case ViewInfo::Type::kPermute:
-      return MakePermute(ir_value, view_info.permutation);
-    case ViewInfo::Type::kReshape:
-      return MakeView(ir_value, view_info.shape.sizes().vec());
-    case ViewInfo::Type::kResize:
-      return MakeResize(ir_value, view_info.shape.sizes().vec());
-    case ViewInfo::Type::kSqueeze:
-      return MakeSqueeze(ir_value, view_info.squeeze_index);
-    case ViewInfo::Type::kUnsqueeze:
-      return MakeUnsqueeze(ir_value, view_info.squeeze_index);
-    case ViewInfo::Type::kAsStrided:
-      return MakeAsStrided(
-          ir_value,
-          view_info.shape.sizes().vec(),
-          view_info.as_strided->stride,
-          view_info.as_strided->offset);
-    case ViewInfo::Type::kDiagonal:
-      return MakeDiagonal(
-          ir_value,
-          view_info.diagonal->offset,
-          view_info.diagonal->dim1,
-          view_info.diagonal->dim2);
-    default:
-      TORCH_INTERNAL_ASSERT(
-          false, "Invalid view type: ", GetEnumValue(view_info.view_type));
-  }
-}
-
-// Here we are trying to populate inplace updated values from the latest view
-// all the way back to the original tensor.
-// For example:
-//     a = torch.diagonal(b)
-//     b.add_(1) # a should be updated as well.
-//
-// Ideally we should all have a *ViewUpdate IR which updates the original
-// tensor/view withe current value. See DiagonalViewUpdate and corresponding
-// LowerDiagonalViewUpdate in ts_node_lowering.cpp. There are some "edge cases"
-// here simply because they can smartly reuse some other ops to undo themselves.
-Value ApplyUpdate(Value ir_value, const Alias::UpdateData& update_data) {
-  // We first bring the source IR value forward, by reshaping and slicing.
-  std::vector<Value> tmp_values({ir_value});
-  for (const ViewInfo& view_info : update_data.view_infos) {
-    tmp_values.push_back(ApplyViewInfo(tmp_values.back(), view_info));
-  }
-  // We then move backward given the source update value, by reshaping and
-  // slice-updating.
-  Value result = update_data.ir_value;
-  for (size_t i = update_data.view_infos.size(); i > 0; --i) {
-    const ViewInfo& view_info = update_data.view_infos[i - 1];
-    switch (view_info.view_type) {
-      case ViewInfo::Type::kSelect:
-        result = MakeSelectViewUpdate(
-            tmp_values[i - 1],
-            result,
-            view_info.select->dim,
-            view_info.select->start,
-            view_info.select->end,
-            view_info.select->stride);
-        break;
-      case ViewInfo::Type::kNarrow:
-        result =
-            MakeNarrowViewUpdate(tmp_values[i - 1], result, view_info.indices);
-        break;
-      case ViewInfo::Type::kNoOp:
-        break;
-      case ViewInfo::Type::kPermute:
-        result = MakePermute(result, InversePermutation(view_info.permutation));
-        break;
-      case ViewInfo::Type::kReshape:
-        result = MakeView(result, view_info.source_shape.sizes().vec());
-        break;
-      case ViewInfo::Type::kResize:
-        result = MakeResize(result, view_info.source_shape.sizes().vec());
-        break;
-      case ViewInfo::Type::kSqueeze:
-        result = MakeUnsqueeze(ir_value, view_info.squeeze_index);
-        break;
-      case ViewInfo::Type::kUnsqueeze:
-        result = MakeSqueeze(ir_value, view_info.squeeze_index);
-        break;
-      case ViewInfo::Type::kAsStrided:
-        result = MakeAsStridedViewUpdate(
-            tmp_values[i - 1],
-            result,
-            view_info.source_shape.sizes().vec(),
-            view_info.as_strided->stride,
-            view_info.as_strided->offset);
-        break;
-      case ViewInfo::Type::kDiagonal:
-        result = MakeDiagonalViewUpdate(
-            tmp_values[i - 1],
-            result,
-            view_info.diagonal->offset,
-            view_info.diagonal->dim1,
-            view_info.diagonal->dim2);
-        break;
-      default:
-        TORCH_INTERNAL_ASSERT(
-            false, "Invalid view type: ", GetEnumValue(view_info.view_type));
-    }
-  }
-  return result;
-}
-
-} // namespace
-
-ViewInfo::ViewInfo(Type view_type, Shape shape, Shape source_shape)
-    : view_type(view_type),
-      shape(std::move(shape)),
-      indices(source_shape.dim(), 0),
-      source_shape(std::move(source_shape)) {}
-
-ViewInfo::ViewInfo(Type view_type, Shape shape, Shape source_shape, int64_t sqi)
-    : view_type(view_type),
-      shape(std::move(shape)),
-      source_shape(std::move(source_shape)),
-      squeeze_index(sqi) {
-  TORCH_CHECK(view_type == Type::kSqueeze);
-}
-
-ViewInfo::ViewInfo(
-    Type view_type,
-    Shape source_shape,
-    std::vector<int64_t> permutation)
-    : view_type(view_type),
-      shape(MakePermuteShape(source_shape, permutation)),
-      source_shape(std::move(source_shape)),
-      permutation(std::move(permutation)) {
-  TORCH_CHECK(view_type == Type::kPermute);
-}
-
-ViewInfo::ViewInfo(Type view_type, const Shape& source_shape, SelectInfo select)
-    : view_type(view_type),
-      shape(MakeSelectShape(
-          source_shape,
-          select.dim,
-          select.start,
-          select.end,
-          select.stride)),
-      source_shape(source_shape),
-      select(select) {
-  TORCH_CHECK(view_type == Type::kSelect);
-}
-
-ViewInfo::ViewInfo(
-    Type view_type,
-    Shape shape,
-    Shape source_shape,
-    AsStridedInfo as_strided)
-    : view_type(view_type),
-      shape(std::move(shape)),
-      source_shape(std::move(source_shape)),
-      as_strided(std::move(as_strided)) {
-  TORCH_CHECK(view_type == Type::kAsStrided);
-}
-
-ViewInfo::ViewInfo(
-    Type view_type,
-    const Shape& source_shape,
-    DiagonalInfo diagonal)
-    : view_type(view_type),
-      shape(MakeDiagonalShape(
-          source_shape,
-          diagonal.offset,
-          diagonal.dim1,
-          diagonal.dim2)),
-      source_shape(source_shape),
-      diagonal(diagonal) {
-  TORCH_CHECK(view_type == Type::kDiagonal);
-}
-
-void Alias::Update(Value ir_value, std::vector<ViewInfo> view_infos) {
-  if (!updates_.empty() && updates_.back().view_infos == view_infos) {
-    updates_.back().ir_value = std::move(ir_value);
-  } else {
-    updates_.push_back({std::move(ir_value), std::move(view_infos)});
-  }
-  ++generation_;
-}
-
-Value Alias::SyncUpdateOperations() {
-  for (auto& update_data : updates_) {
-    root_ir_value_ = ApplyUpdate(root_ir_value_, update_data);
-  }
-  updates_.clear();
-  return root_ir_value_;
-}
-
-LazyView::LazyView(
-    Shape shape,
-    std::shared_ptr<Alias> alias,
-    ViewInfo view_info)
-    : shape_(std::move(shape)), alias_(std::move(alias)) {
-  view_infos_.push_back(std::move(view_info));
-}
-
-LazyView::LazyView(
-    Shape shape,
-    std::shared_ptr<Alias> alias,
-    std::vector<ViewInfo> view_infos)
-    : view_infos_(std::move(view_infos)),
-      shape_(std::move(shape)),
-      alias_(std::move(alias)) {}
-
-void LazyView::Update(Value ir_value) {
-  alias_->Update(std::move(ir_value), view_infos_);
-}
-
-std::shared_ptr<LazyView> LazyView::CreateSubView(
-    Shape shape,
-    ViewInfo view_info) {
-  std::vector<ViewInfo> view_infos(view_infos_);
-  view_infos.push_back(std::move(view_info));
-  return std::make_shared<LazyView>(
-      std::move(shape), alias_, std::move(view_infos));
-}
-
-std::tuple<Value, bool> LazyView::GetViewIrNode() {
-  if (IsUpToDate()) {
-    return std::make_tuple(ir_value_, false);
-  }
-  Value update = alias_->SyncUpdateOperations();
-  for (auto& view_info : view_infos_) {
-    update = ApplyViewInfo(update, view_info);
-  }
-  ir_value_ = update;
-  generation_ = alias_->generation();
-  return std::make_tuple(ir_value_, true);
-}
-
-} // namespace lazy
-} // namespace torch
diff --git a/torch/csrc/lazy/core/lazy_view.h b/torch/csrc/lazy/core/lazy_view.h
deleted file mode 100644
index 5e1a106494cfb..0000000000000
--- a/torch/csrc/lazy/core/lazy_view.h
+++ /dev/null
@@ -1,173 +0,0 @@
-#pragma once
-
-#include <c10/util/Optional.h>
-#include <torch/csrc/lazy/core/ir.h>
-#include <torch/csrc/lazy/core/shape.h>
-
-#include <memory>
-#include <vector>
-
-namespace torch {
-namespace lazy {
-
-struct TORCH_API SelectInfo {
-  bool operator==(const SelectInfo& ref) const {
-    return dim == ref.dim && start == ref.start && end == ref.end &&
-        stride == ref.stride;
-  }
-
-  int64_t dim = 0;
-  int64_t start = 0;
-  int64_t end = 0;
-  int64_t stride = 0;
-};
-
-struct TORCH_API AsStridedInfo {
-  bool operator==(const AsStridedInfo& ref) const {
-    return offset == ref.offset && stride == ref.stride;
-  }
-
-  std::vector<int64_t> stride;
-  int64_t offset = 0;
-};
-
-struct TORCH_API DiagonalInfo {
-  bool operator==(const DiagonalInfo& ref) const {
-    return offset == ref.offset && dim1 == ref.dim1 && dim2 == ref.dim2;
-  }
-
-  int64_t offset = 0;
-  int64_t dim1 = 0;
-  int64_t dim2 = 1;
-};
-
-struct TORCH_API ViewInfo {
-  enum class Type {
-    kInvalid,
-    kNarrow,
-    kNoOp,
-    kPermute,
-    kReshape,
-    kResize,
-    kSelect,
-    kAsStrided,
-    kDiagonal,
-    kSqueeze,
-    kUnsqueeze,
-  };
-
-  ViewInfo() = default;
-  ViewInfo(Type view_type, Shape shape, Shape source_shape);
-  ViewInfo(Type view_type, Shape shape, Shape source_shape, int64_t sqi);
-  ViewInfo(
-      Type view_type,
-      Shape source_shape,
-      std::vector<int64_t> permutation);
-  ViewInfo(Type view_type, const Shape& source_shape, SelectInfo select);
-  ViewInfo(
-      Type view_type,
-      Shape shape,
-      Shape source_shape,
-      AsStridedInfo as_strided);
-  ViewInfo(Type view_type, const Shape& source_shape, DiagonalInfo diagonal);
-
-  bool operator==(const ViewInfo& ref) const {
-    return view_type == ref.view_type && shape == ref.shape &&
-        indices == ref.indices && source_shape == ref.source_shape &&
-        permutation == ref.permutation && select == ref.select &&
-        as_strided == ref.as_strided && diagonal == ref.diagonal;
-  }
-
-  Type view_type = Type::kInvalid;
-  // The shape of the result of a view. In case of narrowing, this represents
-  // the size of the narrow slice.
-  Shape shape;
-  // In case of narrowing, the starting indices from where the narrow slice is
-  // cut.
-  std::vector<int64_t> indices;
-  // The shape of the source of this view.
-  Shape source_shape;
-  // The permutation to be used. If empty, this is not a permute operation.
-  std::vector<int64_t> permutation;
-  // Information used for sliced views.
-  c10::optional<SelectInfo> select;
-  // Information used for as_strided views.
-  c10::optional<AsStridedInfo> as_strided;
-  // Information used for diagonal views.
-  c10::optional<DiagonalInfo> diagonal;
-  // Squeeze/Unsqueeze Index
-  int64_t squeeze_index;
-};
-
-// When a "view" (capture by reference) is taken on a node, an Alias object is
-// created on the captured node itself, with its current IR Node value.
-class TORCH_API Alias {
- public:
-  struct UpdateData {
-    Value ir_value;
-    std::vector<ViewInfo> view_infos;
-  };
-
-  explicit Alias(Value ir_value) : root_ir_value_(std::move(ir_value)) {}
-
-  size_t generation() const {
-    return generation_;
-  }
-
-  // Appends an update to the IR value stored within the alias. The ir_value is
-  // the value to be written, and view_infos represents the forward path from
-  // the alias's ir_value to the update ir_value.
-  void Update(Value ir_value, std::vector<ViewInfo> view_infos);
-
-  Value SyncUpdateOperations();
-
- private:
-  // The IR value which is the root at which the view was created.
-  Value root_ir_value_;
-  // The stacked updates on the view. Orders matter, as most recent updates
-  // might overwrite older ones.
-  std::vector<UpdateData> updates_;
-  // Incremented every time an update happens. Used by view to track alias
-  // changes and regenerate the most current value.
-  size_t generation_ = 0;
-};
-
-class TORCH_API LazyView {
- public:
-  LazyView(Shape shape, std::shared_ptr<Alias> alias, ViewInfo view_info);
-  LazyView(
-      Shape shape,
-      std::shared_ptr<Alias> alias,
-      std::vector<ViewInfo> view_infos);
-
-  void Update(Value ir_value);
-
-  const Shape& shape() const {
-    return shape_;
-  }
-
-  const std::shared_ptr<Alias>& alias() const {
-    return alias_;
-  }
-
-  std::shared_ptr<LazyView> CreateSubView(Shape shape, ViewInfo view_info);
-
-  // Extracts the current IrNode out of a view, into a IrNode structure
-  // where the updated fields tells whether a new IR value has been created, or
-  // the cached one returned.
-  std::tuple<Value, bool> GetViewIrNode();
-
-  bool IsUpToDate() const {
-    return ir_value_ && generation_ == alias_->generation();
-  }
-
- private:
-  std::vector<ViewInfo> view_infos_;
-  Shape shape_;
-  std::shared_ptr<Alias> alias_;
-  Value ir_value_;
-  size_t generation_ = 0;
-};
-
-} // namespace lazy
-} // namespace torch
diff --git a/torch/csrc/lazy/core/tensor.cpp b/torch/csrc/lazy/core/tensor.cpp
index 0a114d0e71179..734dc5fdbd9ac 100644
--- a/torch/csrc/lazy/core/tensor.cpp
+++ b/torch/csrc/lazy/core/tensor.cpp
@@ -47,15 +47,6 @@ LazyTensorPtr LazyTensor::Create(Value ir_value, const BackendDevice& device) {
   return lazy_tensor;
 }
 
-LazyTensorPtr LazyTensor::Create(
-    std::shared_ptr<LazyView> view,
-    const BackendDevice& device) {
-  LazyTensorPtr lazy_tensor =
-      c10::make_intrusive<LazyTensor>(LazyTensor(std::move(view), device));
-  LazyGraphExecutor::Get()->RegisterTensor(lazy_tensor->data_ptr());
-  return lazy_tensor;
-}
-
 LazyTensorPtr LazyTensor::Create(BackendDataPtr handle) {
   LazyTensorPtr lazy_tensor =
       c10::make_intrusive<LazyTensor>(LazyTensor(std::move(handle)));
@@ -78,11 +69,6 @@ LazyTensor::LazyTensor(Value ir_value, const BackendDevice& device)
   TryLimitGraphSize();
 }
 
-LazyTensor::LazyTensor(
-    std::shared_ptr<LazyView> view,
-    const BackendDevice& device)
-    : LazyTensor(std::make_shared<Data>(std::move(view), device)) {}
-
 LazyTensor::LazyTensor(std::shared_ptr<Data> data) : data_(std::move(data)) {}
 
 LazyTensor::Data* LazyTensor::data() const {
@@ -102,9 +88,6 @@ at::ScalarType LazyTensor::dtype() const {
 }
 
 MaybeRef<Shape> LazyTensor::shape() const {
-  if (data()->view != nullptr) {
-    return data()->view->shape();
-  }
   if (data()->handle != nullptr) {
     return Shape(data()->handle->shape());
   }
@@ -126,45 +109,23 @@ int64_t LazyTensor::GetUniqueId() const {
   return data()->unique_id;
 }
 
-std::ptrdiff_t LazyTensor::GetViewAliasId() const {
-  return data()->view != nullptr
-      ? reinterpret_cast<std::ptrdiff_t>(data()->view->alias().get())
-      : 0;
-}
-
 BackendDataPtr LazyTensor::GetDataHandle() {
-  // Data can coexist with a view, but we need to check that the view did
-  // not receive any updates before calling the current IR valid.
-  bool up_to_date = true;
-  Value ir_value;
-  if (data()->view != nullptr) {
-    bool updated = false;
-    std::tie(ir_value, updated) = GetViewUpdate(data()->view);
-    up_to_date = !updated;
-  }
-  if (up_to_date) {
-    BackendDataPtr handle = CurrentDataHandle();
-    if (handle != nullptr) {
-      TORCH_CHECK(
-          handle->HasValue(),
-          "Trying to access data while an async operation is in flight: ",
-          handle->shape().to_string());
-      return handle;
-    }
-  }
-  if (ir_value) {
-    // The view gave us an updated IR value. We usually do not have a valid IR
-    // value field together with a view, but to allow code reuse in
-    // ApplyPendingGraph() we temporarily set it here. The following call to
-    // ApplyPendingGraph() will clear it.
-    AssignIrValue(std::move(ir_value));
+  BackendDataPtr handle = CurrentDataHandle();
+  if (handle != nullptr) {
+    TORCH_CHECK(
+        handle->HasValue(),
+        "Trying to access data while an async operation is in flight: ",
+        handle->shape().to_string());
+    return handle;
   }
+
   if (data()->ir_value) {
     ApplyPendingGraph();
   } else {
     TORCH_CHECK(data()->tensor_data);
     data()->handle = TensorToDataHandle(*data()->tensor_data, GetDevice());
   }
+
   return data()->handle;
 }
 
@@ -179,10 +140,9 @@ void LazyTensor::SetDataHandle(BackendDataPtr handle) {
 void LazyTensor::SetDataHandle(BackendDataPtr handle, bool sync) {
   data()->handle = std::move(handle);
   // Assigning a device data should always clear the IR node, to allow graph
-  // trimming. A view cannot be reset though, unless we are at a step-end sync.
+  // trimming.
   AssignIrValue(Value());
   if (sync) {
-    data()->view = nullptr;
     data()->tensor_data = c10::nullopt;
   }
 }
@@ -190,16 +150,8 @@ void LazyTensor::SetDataHandle(BackendDataPtr handle, bool sync) {
 void LazyTensor::SetIrValue(Value ir_value) {
   data()->handle = nullptr;
   data()->tensor_data = c10::nullopt;
-  if (data()->view != nullptr) {
-    // If we have an active view, and a SetIrValue() happens, it means we are
-    // within an in-place execution context, and we need to update the view's
-    // alias as well.
-    data()->view = UpdateView(data()->view, std::move(ir_value));
-    data()->generation += 1;
-  } else {
-    AssignIrValue(std::move(ir_value));
-    TryLimitGraphSize();
-  }
+  AssignIrValue(std::move(ir_value));
+  TryLimitGraphSize();
 }
 
 void LazyTensor::SetInPlaceIrValue(Value ir_value) {
@@ -252,9 +204,6 @@ Value LazyTensor::GetIrValue() const {
 }
 
 Value LazyTensor::CurrentIrValue() const {
-  if (data()->view != nullptr) {
-    return std::get<0>(GetViewUpdate(data()->view));
-  }
   return data()->ir_value;
 }
 
@@ -263,9 +212,6 @@ void LazyTensor::SetTensorData(at::Tensor tensor_data) {
 }
 
 c10::optional<at::Tensor> LazyTensor::CurrentTensorData() const {
-  if (data()->view != nullptr && !data()->view->IsUpToDate()) {
-    return c10::nullopt;
-  }
   return data()->tensor_data;
 }
 
@@ -288,69 +234,6 @@ Value LazyTensor::GetIrValueForTensor(
   return CreateTensorNode(std::move(data), read_only);
 }
 
-std::tuple<Value, bool> LazyTensor::GetViewUpdate(
-    const std::shared_ptr<LazyView>& view) const {
-  auto value_with_update = view->GetViewIrNode();
-  if (std::get<1>(value_with_update)) {
-    data()->handle = nullptr;
-    data()->tensor_data = c10::nullopt;
-  }
-  return value_with_update;
-}
-
-std::shared_ptr<LazyView> LazyTensor::UpdateView(
-    std::shared_ptr<LazyView> view,
-    Value ir_value) const {
-  if (ir_value.shape().sizes() != view->shape().sizes()) {
-    TORCH_CHECK(ir_value.shape().numel() == view->shape().numel());
-
-    ViewInfo view_info(
-        ViewInfo::Type::kReshape, ir_value.shape(), view->shape());
-    view = view->CreateSubView(view_info.shape, view_info);
-  }
-  view->Update(std::move(ir_value));
-  return view;
-}
-
-void LazyTensor::SetSubView(ViewInfo view_info) const {
-  data()->view = data()->view->CreateSubView(view_info.shape, view_info);
-  data()->generation += 1;
-}
-
-void LazyTensor::ModifyCurrentView(ViewInfo view_info) const {
-  if (data()->view != nullptr) {
-    SetSubView(view_info);
-    return;
-  }
-  // This node is not a view. Since this function is meant to modify a view
-  // in place, we need to turn this existing tensor into a view.
-  Value ir_value = GetIrValue();
-  std::shared_ptr<Alias> alias = std::make_shared<Alias>(ir_value);
-  data()->view = std::make_shared<LazyView>(view_info.shape, alias, view_info);
-  AssignIrValue(Value());
-}
-
-std::shared_ptr<LazyView> LazyTensor::CreateView(ViewInfo view_info) const {
-  if (data()->view != nullptr) {
-    return data()->view->CreateSubView(view_info.shape, view_info);
-  }
-  // This node is not a view, and creating a view forks the current node into
-  // becoming one itself. This means creating an alias with the current IR
-  // Node, and using the same alias for the created IR Node.
-  Value ir_value = GetIrValue();
-  std::shared_ptr<Alias> alias = std::make_shared<Alias>(ir_value);
-  ViewInfo this_view_info(
-      ViewInfo::Type::kNoOp, ir_value.shape(), ir_value.shape());
-  data()->view = std::make_shared<LazyView>(
-      ir_value.shape(), alias, std::move(this_view_info));
-  AssignIrValue(Value());
-  return std::make_shared<LazyView>(view_info.shape, alias, view_info);
-}
-
-LazyTensorPtr LazyTensor::CreateViewTensor(ViewInfo view_info) const {
-  return Create(CreateView(std::move(view_info)), GetDevice());
-}
-
 at::Tensor LazyTensor::ToTensor(bool detached) {
   at::Tensor tensor;
   c10::optional<at::Tensor> tensor_data = CurrentTensorData();
@@ -367,8 +250,7 @@ at::Tensor LazyTensor::ToTensor(bool detached) {
   } else {
     tensor = *tensor_data;
     if (detached) {
-      if (data()->ir_value || data()->handle != nullptr ||
-          data()->view != nullptr) {
+      if (data()->ir_value || data()->handle != nullptr) {
         // If we have other authoritive sources, just drop our reference and
         // transfer it to the caller.
         data()->tensor_data = c10::nullopt;
@@ -388,7 +270,6 @@ void LazyTensor::ShallowCopyTo(LazyTensorPtr dest) const {
 
 void LazyTensor::SetTensor(at::Tensor tensor) {
   SetTensorData(tensor);
-  data()->view = nullptr;
   data()->handle = nullptr;
   AssignIrValue(Value());
 }
@@ -401,25 +282,14 @@ void LazyTensor::UpdateFromTensor(at::Tensor tensor, bool sync) {
     SetTensorData(tensor);
     data()->handle = nullptr;
     AssignIrValue(Value());
-    if (data()->view != nullptr) {
-      Value ir_value = GetIrValueForTensor(tensor, GetDevice());
-      data()->view = UpdateView(data()->view, std::move(ir_value));
-    }
   }
 }
 
 void LazyTensor::UpdateFromTensorOut(at::Tensor tensor) {
-  if (data()->view != nullptr && shape().Get().numel() != tensor.numel()) {
-    data()->view = nullptr;
-  }
   UpdateFromTensor(std::move(tensor), /*sync=*/false);
 }
 
 void LazyTensor::UpdateFromTensorOut(const LazyTensorPtr& tensor) {
-  if (data()->view != nullptr &&
-      shape().Get().numel() != tensor->shape().Get().numel()) {
-    data()->view = nullptr;
-  }
   SetIrValue(tensor->GetIrValue());
 }
 
diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h
index 052b84b4a60cc..85ea6ab4f4c61 100644
--- a/torch/csrc/lazy/core/tensor.h
+++ b/torch/csrc/lazy/core/tensor.h
@@ -5,7 +5,6 @@
 #include <torch/csrc/lazy/backend/backend_data.h>
 #include <torch/csrc/lazy/backend/backend_device.h>
 #include <torch/csrc/lazy/core/ir.h>
-#include <torch/csrc/lazy/core/lazy_view.h>
 #include <torch/csrc/lazy/core/util.h>
 
 namespace torch {
@@ -37,10 +36,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
         : ir_value(std::move(ir_value)),
           device(std::move(device)),
           unique_id(GetNextTensorId()) {}
-    Data(std::shared_ptr<LazyView> view, BackendDevice device)
-        : view(std::move(view)),
-          device(std::move(device)),
-          unique_id(GetNextTensorId()) {}
     Data(at::Tensor tensor_data, BackendDevice device)
         : tensor_data(std::move(tensor_data)),
           device(std::move(device)),
@@ -50,7 +45,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
 
     BackendDataPtr handle;
     Value ir_value;
-    std::shared_ptr<LazyView> view;
     c10::optional<at::Tensor> tensor_data;
     const BackendDevice device;
     const int64_t unique_id = 0;
@@ -76,10 +70,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
     return data()->generation;
   }
 
-  LazyTensorPtr alias() const {
-    return c10::make_intrusive<LazyTensor>(LazyTensor(data_ptr()));
-  }
-
   int64_t size(int64_t dim) const;
 
   at::Tensor ToTensor(bool detached);
@@ -102,10 +92,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
   const BackendDevice& GetDevice() const;
   int64_t GetUniqueId() const;
 
-  // Retrieves an opaque ID of the alias object upon which the tensor's view is
-  // rooted, or 0 if this tensor is not a view.
-  std::ptrdiff_t GetViewAliasId() const;
-
   // Fetches the data behind the tensor. If the tensor has a graph defining
   // its current value, executes the graph and fetches the data result.
   BackendDataPtr GetDataHandle();
@@ -129,31 +115,21 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
   void SetIrValue(Value ir_value);
   void SetInPlaceIrValue(Value ir_value);
 
-  void SetSubView(ViewInfo view_info) const;
-
   c10::optional<at::Tensor> CurrentTensorData() const;
 
   std::vector<LazyTensorPtr> MakeOutputTensors(NodePtr node) const;
 
-  LazyTensorPtr CreateViewTensor(ViewInfo view_info) const;
   LazyTensorPtr CopyTensorToDevice(const BackendDevice& device);
 
-  void ModifyCurrentView(ViewInfo view_info) const;
-
   // Applies the queue of operations in preparation for using the data.
   void ApplyPendingGraph();
 
  private:
   LazyTensor(const at::Tensor& tensor, const BackendDevice& device);
   LazyTensor(Value ir_value, const BackendDevice& device);
-  LazyTensor(std::shared_ptr<LazyView> view, const BackendDevice& device);
   explicit LazyTensor(BackendDataPtr handle);
   explicit LazyTensor(std::shared_ptr<Data> data);
 
-  static LazyTensorPtr Create(
-      std::shared_ptr<LazyView> view,
-      const BackendDevice& device);
-
   std::shared_ptr<Data> data_ptr() const {
     return data_;
   }
@@ -164,15 +140,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
 
   Value CreateTensorNode(BackendDataPtr data, bool read_only) const;
 
-  std::tuple<Value, bool> GetViewUpdate(
-      const std::shared_ptr<LazyView>& view) const;
-
-  std::shared_ptr<LazyView> UpdateView(
-      std::shared_ptr<LazyView> view,
-      Value ir_value) const;
-
-  std::shared_ptr<LazyView> CreateView(ViewInfo view_info) const;
-
   // We build a graph accumulating operations, but at a given point we
   // need to force a rendering, otherwise the graph can grow without control.
   // Think:
diff --git a/torch/csrc/lazy/ts_backend/ir_builder.h b/torch/csrc/lazy/ts_backend/ir_builder.h
index 600243b67f622..067efc784ee5a 100644
--- a/torch/csrc/lazy/ts_backend/ir_builder.h
+++ b/torch/csrc/lazy/ts_backend/ir_builder.h
@@ -55,84 +55,6 @@ struct TorchScriptIrBuilder : IrBuilder {
     return MakeNode<Generic>(op, operands, shape, num_outputs, hash_seed);
   }
 
-  // View op nodes
-  NodePtr MakeAsStridedViewUpdate(
-      const Value& input0,
-      const Value& input1,
-      const std::vector<int64_t>& size,
-      const std::vector<int64_t>& stride,
-      const int64_t& storage_offset) const override {
-    return ReuseOrMakeNode<AsStridedViewUpdate>(
-        input0, input1, size, stride, storage_offset);
-  }
-  NodePtr MakeAsStrided(
-      const Value& input0,
-      const std::vector<int64_t>& size,
-      const std::vector<int64_t>& stride,
-      const int64_t& storage_offset) const override {
-    return ReuseOrMakeNode<AsStrided>(input0, size, stride, storage_offset);
-  }
-  NodePtr MakeDiagonalViewUpdate(
-      const Value& input0,
-      const Value& input1,
-      const int64_t& offset,
-      const int64_t& dim1,
-      const int64_t& dim2) const override {
-    return ReuseOrMakeNode<DiagonalViewUpdate>(
-        input0, input1, offset, dim1, dim2);
-  }
-  NodePtr MakeDiagonal(
-      const Value& input0,
-      const int64_t& offset,
-      const int64_t& dim1,
-      const int64_t& dim2) const override {
-    return ReuseOrMakeNode<Diagonal>(input0, offset, dim1, dim2);
-  }
-  NodePtr MakeNarrowViewUpdate(
-      const Value& input0,
-      const Value& input1,
-      const std::vector<int64_t>& base_indices) const override {
-    return ReuseOrMakeNode<NarrowViewUpdate>(input0, input1, base_indices);
-  }
-  NodePtr MakeNarrow(
-      const Value& input0,
-      const std::vector<int64_t>& base_indices,
-      const std::vector<int64_t>& sizes) const override {
-    return ReuseOrMakeNode<Narrow>(input0, base_indices, sizes);
-  }
-  NodePtr MakePermute(const Value& input0, const std::vector<int64_t>& dims)
-      const override {
-    return ReuseOrMakeNode<Permute>(input0, dims);
-  }
-  NodePtr MakeResize(const Value& input0, const std::vector<int64_t>& size)
-      const override {
-    return ReuseOrMakeNode<Resize>(input0, size);
-  }
-  NodePtr MakeSelectViewUpdate(
-      const Value& input0,
-      const Value& input1,
-      const int64_t& dim,
-      const int64_t& start,
-      const int64_t& end,
-      const int64_t& stride) const override {
-    return ReuseOrMakeNode<SelectViewUpdate>(
-        input0, input1, dim, start, end, stride);
-  }
-  NodePtr MakeSelect(
-      const Value& input0,
-      const int64_t& dim,
-      const int64_t& start,
-      const int64_t& end,
-      const int64_t& stride) const override {
-    return ReuseOrMakeNode<Select>(input0, dim, start, end, stride);
-  }
-  NodePtr MakeSqueeze(const Value& input0, const int& dim) const override {
-    return ReuseOrMakeNode<Squeeze>(input0, dim);
-  }
-  NodePtr MakeUnsqueeze(const Value& input0, const int& dim) const override {
-    return ReuseOrMakeNode<Unsqueeze>(input0, dim);
-  }
-
   // dynamic ir nodes
   // TODO: verify if IR node reusing works for Dynamic shape ops
   NodePtr MakeSizeNode(const Value& input, size_t dim) const override {
diff --git a/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp b/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp
index 3f5882f471f5d..15dbebb0e67ad 100644
--- a/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp
+++ b/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp
@@ -49,58 +49,11 @@ std::vector<int64_t> GetExpandDimensions(
   return dimensions;
 }
 
-torch::lazy::ViewInfo CreateAsStridedViewInfo(
-    const torch::lazy::Shape& input_shape,
-    std::vector<int64_t> size,
-    std::vector<int64_t> stride,
-    c10::optional<int64_t> storage_offset) {
-  torch::lazy::Shape result_shape =
-      torch::lazy::Shape(input_shape.scalar_type(), size);
-  torch::lazy::AsStridedInfo as_strided_info;
-  as_strided_info.stride = std::move(stride);
-  if (storage_offset) {
-    as_strided_info.offset = *storage_offset;
-  }
-  return torch::lazy::ViewInfo(
-      torch::lazy::ViewInfo::Type::kAsStrided,
-      std::move(result_shape),
-      input_shape,
-      std::move(as_strided_info));
-}
-
 } // namespace
 
 //////////////////////////////////////////////////////////////////////////////
 // ATEN operators follows here, listed in alphabetical order.
 //////////////////////////////////////////////////////////////////////////////
-torch::lazy::LazyTensorPtr as_strided(
-    const torch::lazy::LazyTensorPtr& input,
-    std::vector<int64_t> size,
-    std::vector<int64_t> stride,
-    c10::optional<int64_t> storage_offset) {
-  auto input_shape = input->shape();
-  return input->CreateViewTensor(CreateAsStridedViewInfo(
-      input_shape, std::move(size), std::move(stride), storage_offset));
-}
-
-void as_strided_(
-    torch::lazy::LazyTensorPtr& input,
-    std::vector<int64_t> size,
-    std::vector<int64_t> stride,
-    c10::optional<int64_t> storage_offset) {
-  if (input->data()->view == nullptr) {
-    input->SetIrValue(torch::lazy::MakeAsStrided(
-        input->GetIrValue(),
-        std::move(size),
-        std::move(stride),
-        storage_offset.value_or(0)));
-  } else {
-    auto input_shape = input->shape();
-    input->SetSubView(CreateAsStridedViewInfo(
-        input_shape, std::move(size), std::move(stride), storage_offset));
-  }
-}
-
 torch::lazy::LazyTensorPtr expand(
     const torch::lazy::LazyTensorPtr& input,
     std::vector<int64_t> size) {
@@ -120,38 +73,6 @@ void fill_(torch::lazy::LazyTensorPtr& input, const at::Scalar& value) {
   input->SetInPlaceIrValue(std::move(constant));
 }
 
-torch::lazy::LazyTensorPtr narrow(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim,
-    int64_t start,
-    int64_t length) {
-  auto input_shape = input->shape();
-  dim = torch::lazy::GetCanonicalDimensionIndex(dim, input_shape.Get().dim());
-  torch::lazy::Shape narrow_shape = input_shape;
-  narrow_shape.set_size(dim, length);
-
-  torch::lazy::ViewInfo::Type view_type =
-      (input_shape.Get().numel() == narrow_shape.numel())
-      ? torch::lazy::ViewInfo::Type::kReshape
-      : torch::lazy::ViewInfo::Type::kNarrow;
-  torch::lazy::ViewInfo view_info(
-      view_type, std::move(narrow_shape), input_shape);
-  view_info.indices[dim] =
-      torch::lazy::GetCanonicalPosition(input_shape.Get().sizes(), dim, start);
-  return input->CreateViewTensor(std::move(view_info));
-}
-
-torch::lazy::LazyTensorPtr permute(
-    const torch::lazy::LazyTensorPtr& input,
-    c10::ArrayRef<int64_t> dims) {
-  auto input_shape = input->shape();
-  torch::lazy::ViewInfo view_info(
-      torch::lazy::ViewInfo::Type::kPermute,
-      input_shape,
-      torch::lazy::GetCanonicalDimensionIndices(dims, input_shape.Get().dim()));
-  return input->CreateViewTensor(std::move(view_info));
-}
-
 void copy_(torch::lazy::LazyTensorPtr& input, torch::lazy::LazyTensorPtr& src) {
   if (input->GetDevice() == src->GetDevice()) {
     torch::lazy::Value copy_value;
@@ -172,119 +93,5 @@ void copy_(torch::lazy::LazyTensorPtr& input, torch::lazy::LazyTensorPtr& src) {
   }
 }
 
-torch::lazy::LazyTensorPtr select(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim,
-    int64_t index) {
-  auto shape = input->shape();
-  dim = torch::lazy::GetCanonicalDimensionIndex(dim, shape.Get().dim());
-  torch::lazy::LazyTensorPtr result = narrow(input, dim, index, 1);
-  auto new_dims = torch::lazy::DropDimensions(shape.Get().sizes(), {dim});
-  return view(result, new_dims);
-}
-
-torch::lazy::LazyTensorPtr slice(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim,
-    int64_t start,
-    int64_t end,
-    int64_t step) {
-  auto input_shape = input->shape();
-  dim = torch::lazy::GetCanonicalDimensionIndex(dim, input_shape.Get().dim());
-  start =
-      torch::lazy::GetCanonicalPosition(input_shape.Get().sizes(), dim, start);
-  end = torch::lazy::GetCanonicalPosition(input_shape.Get().sizes(), dim, end);
-  // PyTorch allows tensor[-1:0] to return a 0-dim tensor.
-  if (start > end) {
-    end = start;
-  }
-  step = std::min(step, end - start);
-
-  torch::lazy::SelectInfo select = {dim, start, end, step};
-  torch::lazy::ViewInfo view_info(
-      torch::lazy::ViewInfo::Type::kSelect, input_shape, select);
-  return input->CreateViewTensor(std::move(view_info));
-}
-
-torch::lazy::LazyTensorPtr squeeze(const torch::lazy::LazyTensorPtr& input) {
-  auto input_shape = input->shape();
-  auto output_dimensions =
-      BuildSqueezedDimensions(input_shape.Get().sizes(), /*squeeze_dim=*/-1);
-  return view(input, output_dimensions);
-}
-
-torch::lazy::LazyTensorPtr squeeze(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim) {
-  auto input_shape = input->shape();
-  int64_t squeeze_dim =
-      torch::lazy::GetCanonicalDimensionIndex(dim, input->shape().Get().dim());
-  auto output_dimensions =
-      BuildSqueezedDimensions(input_shape.Get().sizes(), squeeze_dim);
-  return view(input, output_dimensions);
-}
-
-void squeeze_(torch::lazy::LazyTensorPtr& input) {
-  input->SetIrValue(torch::lazy::MakeSqueeze(input->GetIrValue(), -1));
-}
-
-void squeeze_(torch::lazy::LazyTensorPtr& input, int64_t dim) {
-  input->SetIrValue(torch::lazy::MakeSqueeze(
-      input->GetIrValue(),
-      torch::lazy::GetCanonicalDimensionIndex(
-          dim, input->shape().Get().dim())));
-}
-
-torch::lazy::LazyTensorPtr transpose(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim0,
-    int64_t dim1) {
-  auto input_shape = input->shape();
-  auto permute_dims = torch::lazy::MakeTransposePermutation(
-      /*dim0=*/dim0, /*dim1=*/dim1, /*rank=*/input_shape.Get().dim());
-  torch::lazy::ViewInfo view_info(
-      torch::lazy::ViewInfo::Type::kPermute, input_shape, permute_dims);
-  return input->CreateViewTensor(std::move(view_info));
-}
-
-void transpose_(torch::lazy::LazyTensorPtr& input, int64_t dim0, int64_t dim1) {
-  auto input_shape = input->shape();
-  auto permute_dims = torch::lazy::MakeTransposePermutation(
-      /*dim0=*/dim0, /*dim1=*/dim1, /*rank=*/input_shape.Get().dim());
-  torch::lazy::ViewInfo view_info(
-      torch::lazy::ViewInfo::Type::kPermute, input_shape, permute_dims);
-  return input->ModifyCurrentView(std::move(view_info));
-}
-
-torch::lazy::LazyTensorPtr unsqueeze(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim) {
-  auto input_shape = input->shape();
-  int64_t squeeze_dim =
-      torch::lazy::GetCanonicalDimensionIndex(dim, input_shape.Get().dim() + 1);
-  auto dimensions =
-      BuildUnsqueezedDimensions(input_shape.Get().sizes(), squeeze_dim);
-  return view(input, dimensions);
-}
-
-void unsqueeze_(torch::lazy::LazyTensorPtr& input, int64_t dim) {
-  int squeeze_dim = torch::lazy::GetCanonicalDimensionIndex(
-      dim, input->shape().Get().dim() + 1);
-  input->SetIrValue(
-      torch::lazy::MakeUnsqueeze(input->GetIrValue(), squeeze_dim));
-}
-
-torch::lazy::LazyTensorPtr view(
-    const torch::lazy::LazyTensorPtr& input,
-    c10::ArrayRef<int64_t> output_size) {
-  auto input_shape = input->shape().Get();
-  torch::lazy::Shape shape = torch::lazy::Shape(
-      input_shape.scalar_type(),
-      at::infer_size(output_size, input_shape.numel()));
-  torch::lazy::ViewInfo view_info(
-      torch::lazy::ViewInfo::Type::kReshape, std::move(shape), input_shape);
-  return input->CreateViewTensor(std::move(view_info));
-}
-
 } // namespace lazy
 } // namespace torch
diff --git a/torch/csrc/lazy/ts_backend/tensor_aten_ops.h b/torch/csrc/lazy/ts_backend/tensor_aten_ops.h
index 0d5a49bdfbd67..0cb16faf6cacc 100644
--- a/torch/csrc/lazy/ts_backend/tensor_aten_ops.h
+++ b/torch/csrc/lazy/ts_backend/tensor_aten_ops.h
@@ -8,20 +8,6 @@ namespace lazy {
 //////////////////////////////////////////////////////////////////////////////
 // ATEN operators follows here, listed in alphabetical order.
 //////////////////////////////////////////////////////////////////////////////
-// Takes a slice from the input as R1 at the specified offset and reshapes it
-// into the provided size.
-torch::lazy::LazyTensorPtr as_strided(
-    const torch::lazy::LazyTensorPtr& input,
-    std::vector<int64_t> size,
-    std::vector<int64_t> stride,
-    c10::optional<int64_t> storage_offset);
-
-// In-place version of the method above.
-void as_strided_(
-    torch::lazy::LazyTensorPtr& input,
-    std::vector<int64_t> size,
-    std::vector<int64_t> stride,
-    c10::optional<int64_t> storage_offset);
 
 torch::lazy::LazyTensorPtr expand(
     const torch::lazy::LazyTensorPtr& input,
@@ -30,78 +16,7 @@ torch::lazy::LazyTensorPtr expand(
 // Fills the input with the given value.
 void fill_(torch::lazy::LazyTensorPtr& input, const at::Scalar& value);
 
-// Returns a new tensor that is a narrowed view of the input in the given
-// dimension.
-torch::lazy::LazyTensorPtr narrow(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim,
-    int64_t start,
-    int64_t length);
-
-// Permute the dimensions of this tensor according to the given permutation.
-torch::lazy::LazyTensorPtr permute(
-    const torch::lazy::LazyTensorPtr& input,
-    c10::ArrayRef<int64_t> dims);
-
-// Repeats the input tensor along each dimension by the given number of
-// repeats.
-torch::lazy::LazyTensorPtr repeat(
-    const torch::lazy::LazyTensorPtr& input,
-    std::vector<int64_t> repeats);
-
 void copy_(torch::lazy::LazyTensorPtr& input, torch::lazy::LazyTensorPtr& src);
 
-torch::lazy::LazyTensorPtr select(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim,
-    int64_t index);
-
-torch::lazy::LazyTensorPtr slice(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim,
-    int64_t start,
-    int64_t end,
-    int64_t step);
-
-// Squeeze out all trivial (size 1) dimensions.
-torch::lazy::LazyTensorPtr squeeze(const torch::lazy::LazyTensorPtr& input);
-
-// Squeeze out the specified dimension index, if trivial (size 1). Returns
-// unchanged input otherwise.
-torch::lazy::LazyTensorPtr squeeze(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim);
-
-// In-place versions of the methods above.
-void squeeze_(torch::lazy::LazyTensorPtr& input);
-void squeeze_(torch::lazy::LazyTensorPtr& input, int64_t dim);
-
-std::tuple<
-    torch::lazy::LazyTensorPtr,
-    torch::lazy::LazyTensorPtr,
-    torch::lazy::LazyTensorPtr>
-svd(const torch::lazy::LazyTensorPtr& input, bool some, bool compute_uv);
-
-// Swap given dimensions of the input.
-torch::lazy::LazyTensorPtr transpose(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim0,
-    int64_t dim1);
-
-// In-place version of the method above.
-void transpose_(torch::lazy::LazyTensorPtr& input, int64_t dim0, int64_t dim1);
-
-// Insert a dimension of size one at the specified position.
-torch::lazy::LazyTensorPtr unsqueeze(
-    const torch::lazy::LazyTensorPtr& input,
-    int64_t dim);
-
-// In-place version of the method above.
-void unsqueeze_(torch::lazy::LazyTensorPtr& input, int64_t dim);
-
-// Like reshape, but it returns a view into the original tensor.
-torch::lazy::LazyTensorPtr view(
-    const torch::lazy::LazyTensorPtr& input,
-    c10::ArrayRef<int64_t> output_size);
 } // namespace lazy
 } // namespace torch

From 239bef1d77217c02d4e6f91ac94a2d95ff531c63 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 27 Oct 2022 13:49:11 -0700
Subject: [PATCH 0239/1922] Unify SymIntNode and SymFloatNode into SymNode
 (#87817)

This refactor was prompted by challenges handling mixed int/float
operations in C++.  A previous version of this patch
added overloads for each permutation of int/float and was unwieldy
https://github.com/pytorch/pytorch/pull/87722/  This PR takes a different
approach.

The general outline of the patch is to combine the C++ types SymIntNode
and SymFloatNode into a single type, SymNode.  This is type erased; we
no longer know statically at C++ if we have an int/float and have to test
it with the is_int()/is_float() virtual methods.  This has a number of
knock on effects.

- We no longer have C++ classes to bind to Python.  Instead, we take an
  entirely new approach to our Python API, where we have a SymInt/SymFloat
  class defined entirely in Python, which hold a SymNode (which corresponds
  to the C++ SymNode).  However, SymNode is not pybind11-bound; instead,
  it lives as-is in Python, and is wrapped into C++ SymNode using PythonSymNode
  when it goes into C++.  This implies a userland rename.

  In principle, it is also possible for the canonical implementation of SymNode
  to be written in C++, and then bound to Python with pybind11 (we have
  this code, although it is commented out.)  However, I did not implement
  this as we currently have no C++ implementations of SymNode.

  Because we do return SymInt/SymFloat from C++ bindings, the C++ binding
  code needs to know how to find these classes.  Currently, this is done
  just by manually importing torch and getting the attributes.

- Because SymInt/SymFloat are easy Python wrappers, __sym_dispatch__ now
  takes SymInt/SymFloat, rather than SymNode, bringing it in line with how
  __torch_dispatch__ works.

Some miscellaneous improvements:

- SymInt now has a constructor that takes SymNode.  Note that this
  constructor is ambiguous if you pass in a subclass of SymNode,
  so an explicit downcast is necessary.  This means toSymFloat/toSymInt
  are no more.  This is a mild optimization as it means rvalue reference
  works automatically.

- We uniformly use the caster for c10::SymInt/SymFloat, rather than
  going the long way via the SymIntNode/SymFloatNode.

- Removed some unnecessary toSymInt/toSymFloat calls in normalize_*
  functions, pretty sure this doesn't do anything.

- guard_int is now a free function, since to guard on an int you cannot
  assume the method exists.  A function can handle both int and SymInt
  inputs.

- We clean up the magic method definition code for SymInt/SymFloat/SymNode.
  ONLY the user classes (SymInt/SymFloat) get magic methods; SymNode gets
  plain methods; this is to help avoid confusion between the two types.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87817
Approved by: https://github.com/albanD, https://github.com/anjali411
---
 .github/ci_commit_pins/xla.txt                |   2 +-
 .lintrunner.toml                              |   2 +-
 aten/src/ATen/FunctionalStorageImpl.cpp       |   2 +-
 aten/src/ATen/core/ivalue.h                   |   8 +-
 aten/src/ATen/core/ivalue_inl.h               |   4 +-
 aten/src/ATen/core/jit_type.h                 |   1 -
 aten/src/ATen/test/scalar_test.cpp            |  31 -
 build_variables.bzl                           |   1 +
 c10/core/Scalar.h                             |   9 +-
 c10/core/SymFloat.cpp                         |  39 +-
 c10/core/SymFloat.h                           |  17 +-
 c10/core/SymFloatNodeImpl.cpp                 |  20 -
 c10/core/SymFloatNodeImpl.h                   |  76 ---
 c10/core/SymInt.cpp                           |  49 +-
 c10/core/SymInt.h                             |  51 +-
 c10/core/SymIntNodeImpl.cpp                   |  11 -
 c10/core/SymNodeImpl.cpp                      |   3 +
 c10/core/{SymIntNodeImpl.h => SymNodeImpl.h}  |  69 ++-
 c10/test/core/SymInt_test.cpp                 |   8 +-
 docs/source/conf.py                           |   4 +-
 functorch/_src/aot_autograd.py                |   2 +-
 functorch/_src/partitioners.py                |   2 +-
 functorch/experimental/cond.py                |   2 +
 test/cpp/jit/test_misc.cpp                    |  48 +-
 test/test_dynamic_shapes.py                   |  28 +-
 test/test_dynamic_shapes.py.bak               | 391 -------------
 test/test_proxy_tensor.py                     |   5 +-
 test/test_public_bindings.py                  |   4 +-
 tools/autograd/gen_autograd_functions.py      |   4 +-
 tools/autograd/templates/python_functions.cpp |   2 +-
 .../templates/python_variable_methods.cpp     |   7 +-
 tools/pyi/gen_pyi.py                          |   4 +-
 torch/_C/__init__.pyi.in                      |  14 -
 torch/__init__.py                             |  67 ++-
 torch/_dynamo/variables/tensor.py             |   2 +-
 torch/_inductor/graph.py                      |   6 +-
 torch/_prims/__init__.py                      |   6 +-
 torch/_prims_common/__init__.py               |  16 +-
 torch/_subclasses/fake_tensor.py              |   2 +-
 torch/csrc/Size.cpp                           |   6 +-
 torch/csrc/autograd/python_variable.cpp       |  15 +-
 torch/csrc/jit/python/init.cpp                | 548 ++----------------
 torch/csrc/jit/python/pybind_utils.cpp        |  16 +-
 torch/csrc/lazy/core/ir_builder.h             |   8 +-
 torch/csrc/lazy/core/shape_inference.cpp      |  10 +-
 torch/csrc/lazy/core/shape_inference.h        |   2 +-
 torch/csrc/lazy/core/tensor.h                 |   9 +-
 torch/csrc/utils/python_arg_parser.cpp        |  14 +-
 torch/csrc/utils/python_arg_parser.h          |  75 ++-
 torch/csrc/utils/python_symnode.cpp           |  19 +
 torch/csrc/utils/python_symnode.h             | 182 ++++++
 torch/fx/experimental/proxy_tensor.py         |  45 +-
 torch/fx/experimental/symbolic_shapes.py      | 197 +++++--
 torch/fx/graph.py                             |   2 +-
 54 files changed, 730 insertions(+), 1437 deletions(-)
 delete mode 100644 c10/core/SymFloatNodeImpl.cpp
 delete mode 100644 c10/core/SymFloatNodeImpl.h
 delete mode 100644 c10/core/SymIntNodeImpl.cpp
 create mode 100644 c10/core/SymNodeImpl.cpp
 rename c10/core/{SymIntNodeImpl.h => SymNodeImpl.h} (50%)
 delete mode 100644 test/test_dynamic_shapes.py.bak
 create mode 100644 torch/csrc/utils/python_symnode.cpp
 create mode 100644 torch/csrc/utils/python_symnode.h

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index a3de2aba624ea..ec894b7a5f4b0 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-095ee628212f0235ad0d6908bdd514123639fc86
+1e9b8bdc75114ac6c16305c970be37a1cd2fdb1c
diff --git a/.lintrunner.toml b/.lintrunner.toml
index a48d411ea9a83..62b13822e4ad6 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -439,7 +439,7 @@ command = [
     """--error-description=\
         This line has an isinstance call that directly refers to \
         int or float.  This is error-prone because you may also \
-        have wanted to allow SymIntNode or SymFloatNode in your test.  \
+        have wanted to allow SymInt or SymFloat in your test.  \
         To suppress this lint, use an appropriate type alias defined \
         in torch._prims_common; use IntLike/FloatLike when you would accept \
         both regular and symbolic numbers, Dim for ints representing \
diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp
index e50ffbdcf5112..f42c535389900 100644
--- a/aten/src/ATen/FunctionalStorageImpl.cpp
+++ b/aten/src/ATen/FunctionalStorageImpl.cpp
@@ -95,7 +95,7 @@ c10::SymInt get_nbytes(const Tensor& value) {
   if (value.unsafeGetTensorImpl()->has_symbolic_sizes_strides()) {
     // Today, the two implementations of SymInt are in Python (proxy tensor),
     // and lazy tensor (LTC/XLA).
-    // LTC hasn't implemented SymInt support yet though (torch::lazy::SymIntNodeImpl).
+    // LTC hasn't implemented SymInt support yet though
     // Once it does, we should remove this check.
     if (value.key_set().has(c10::DispatchKey::Python)) {
       return value.storage().sym_nbytes();
diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h
index 122afcba4d843..e9a5ea9ec6a20 100644
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@@ -562,7 +562,7 @@ struct TORCH_API IValue final {
   IValue(c10::SymInt i) {
     if (i.is_symbolic()) {
       tag = Tag::SymInt;
-      payload.u.as_intrusive_ptr = i.toSymIntNodeImpl().release();
+      payload.u.as_intrusive_ptr = i.toSymNodeImpl().release();
     } else {
       tag = Tag::Int;
       payload.u.as_int = i.as_int_unchecked();
@@ -578,7 +578,7 @@ struct TORCH_API IValue final {
   IValue(c10::SymFloat i) {
     if (i.is_symbolic()) {
       tag = Tag::SymFloat;
-      payload.u.as_intrusive_ptr = i.toSymFloatNodeImpl().release();
+      payload.u.as_intrusive_ptr = i.toSymNodeImpl().release();
     } else {
       tag = Tag::Double;
       payload.u.as_double = i.as_float_unchecked();
@@ -812,10 +812,10 @@ struct TORCH_API IValue final {
     // for both SymFloat and double
     if (s.isSymInt()) {
       tag = Tag::SymInt;
-      payload.u.as_intrusive_ptr = s.toSymInt().toSymIntNodeImpl().release();
+      payload.u.as_intrusive_ptr = s.toSymInt().toSymNodeImpl().release();
     } else if (s.isSymFloat()) {
       tag = Tag::SymFloat;
-      payload.u.as_intrusive_ptr = s.toSymFloat().toSymFloatNodeImpl().release();
+      payload.u.as_intrusive_ptr = s.toSymFloat().toSymNodeImpl().release();
     } else if (s.isFloatingPoint()) {
       tag = Tag::Double;
       payload.u.as_double = s.toDouble();
diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h
index 1c3453abb4c88..bea795c8d81e8 100644
--- a/aten/src/ATen/core/ivalue_inl.h
+++ b/aten/src/ATen/core/ivalue_inl.h
@@ -219,7 +219,7 @@ inline at::Generator IValue::toGenerator() const& {
 inline c10::SymInt IValue::toSymInt() const {
   AT_ASSERT(isSymInt() || isInt(), "Expected SymInt or int but got ", tagKind());
   if (isSymInt()) {
-    return c10::SymInt::toSymInt(toIntrusivePtr<c10::SymIntNodeImpl>());
+    return c10::SymInt(toIntrusivePtr<c10::SymNodeImpl>());
   } else {
     return c10::SymInt(payload.u.as_int);
   }
@@ -228,7 +228,7 @@ inline c10::SymInt IValue::toSymInt() const {
 inline c10::SymFloat IValue::toSymFloat() const {
   AT_ASSERT(isSymFloat() || isDouble(), "Expected SymFloat or double but got ", tagKind());
   if (isSymFloat()) {
-    return c10::SymFloat::toSymFloat(toIntrusivePtr<c10::SymFloatNodeImpl>());
+    return c10::SymFloat(toIntrusivePtr<c10::SymNodeImpl>());
   } else {
     return c10::SymFloat(payload.u.as_double);
   }
diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h
index e554bd586272f..0a8f5e14d9a5d 100644
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@@ -1310,7 +1310,6 @@ struct TORCH_API SymIntType : public Type {
     return "SymInt";
   }
   std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
-    // TODO: will become a Union[SymIntNodeImpl|int] in the near future
     return "int";
   }
   static const TypeKind Kind = TypeKind::SymIntType;
diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp
index bd9e84bc23554..b6762e1739458 100644
--- a/aten/src/ATen/test/scalar_test.cpp
+++ b/aten/src/ATen/test/scalar_test.cpp
@@ -194,34 +194,3 @@ TEST(TestScalar, TestFormatting) {
   ASSERT_EQ("(2,3.1)", format(Scalar(c10::complex<float>(2.0, 3.1))));
   ASSERT_EQ("4", format(Scalar(Scalar(4).toSymInt())));
 }
-
-TEST(TestSymInt, Basic) {
-  Scalar foo;
-  auto a_impl = c10::make_intrusive<c10::SymIntNodeImpl>();
-  foo = Scalar(a_impl->toSymInt());
-  ASSERT_EQ(a_impl.use_count(), 2);
-  Scalar bar{foo};
-  ASSERT_EQ(a_impl.use_count(), 3);
-  auto baz = bar;
-  ASSERT_EQ(a_impl.use_count(), 4);
-  auto foo2 = std::move(bar);
-  ASSERT_EQ(a_impl.use_count(), 4);
-  ASSERT_TRUE(foo2.isSymInt());
-  // NOLINTNEXTLINE(bugprone-use-after-move,clang-analyzer-cplusplus.Move)
-  ASSERT_TRUE(bar.isIntegral(false));
-  foo2 = SymInt(4);
-  ASSERT_FALSE(foo2.isSymInt());
-  ASSERT_EQ(foo2.toSymInt().expect_int(), 4);
-  // NOLINTNEXTLINE(clang-diagnostic-self-assign-overloaded)
-  foo2 = foo2;
-  ASSERT_FALSE(foo2.isSymInt());
-  ASSERT_EQ(foo2.toSymInt().expect_int(), 4);
-
-  ASSERT_EQ(a_impl.use_count(), 3);
-
-  ASSERT_THROW(foo.to<double>(), c10::Error);
-
-  Scalar int_s = 3;
-  TORCH_CHECK(int_s.toSymInt().expect_int(), 3);
-
-}
diff --git a/build_variables.bzl b/build_variables.bzl
index 017ed9aef5413..12ad9730123f1 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -958,6 +958,7 @@ libtorch_python_core_sources = [
     "torch/csrc/utils/object_ptr.cpp",
     "torch/csrc/utils/python_arg_parser.cpp",
     "torch/csrc/utils/python_dispatch.cpp",
+    "torch/csrc/utils/python_symnode.cpp",
     "torch/csrc/utils/structseq.cpp",
     "torch/csrc/utils/tensor_apply.cpp",
     "torch/csrc/utils/tensor_dtypes.cpp",
diff --git a/c10/core/Scalar.h b/c10/core/Scalar.h
index c0d89315b65db..0c124177e38f7 100644
--- a/c10/core/Scalar.h
+++ b/c10/core/Scalar.h
@@ -92,8 +92,8 @@ class C10_API Scalar {
 
   SymInt toSymInt() const {
     if (Tag::HAS_si == tag) {
-      return c10::SymInt::toSymInt(intrusive_ptr<SymIntNodeImpl>::reclaim_copy(
-          static_cast<SymIntNodeImpl*>(v.p)));
+      return c10::SymInt(intrusive_ptr<SymNodeImpl>::reclaim_copy(
+          static_cast<SymNodeImpl*>(v.p)));
     } else {
       return toLong();
     }
@@ -101,9 +101,8 @@ class C10_API Scalar {
 
   SymFloat toSymFloat() const {
     if (Tag::HAS_sd == tag) {
-      return c10::SymFloat::toSymFloat(
-          intrusive_ptr<SymFloatNodeImpl>::reclaim_copy(
-              static_cast<SymFloatNodeImpl*>(v.p)));
+      return c10::SymFloat(intrusive_ptr<SymNodeImpl>::reclaim_copy(
+          static_cast<SymNodeImpl*>(v.p)));
     } else {
       return toDouble();
     }
diff --git a/c10/core/SymFloat.cpp b/c10/core/SymFloat.cpp
index 0ba980a9727ea..3c1fea2ee3503 100644
--- a/c10/core/SymFloat.cpp
+++ b/c10/core/SymFloat.cpp
@@ -1,32 +1,27 @@
 #include <c10/core/SymFloat.h>
-#include <c10/core/SymFloatNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 #include <array>
 
 namespace c10 {
 
-SymFloatNode SymFloat::toSymFloatNodeImpl() const {
+SymNode SymFloat::toSymNodeImpl() const {
   TORCH_CHECK(is_symbolic());
-  return SymFloatNode::reclaim_copy(toSymFloatNodeImplUnowned());
+  return SymNode::reclaim_copy(toSymNodeImplUnowned());
 }
 
-static std::array<SymFloatNode, 2> normalize_symfloats(
-    SymFloat a_,
-    SymFloat b_) {
-  SymFloatNode a, b;
+static std::array<SymNode, 2> normalize_symfloats(SymFloat a_, SymFloat b_) {
+  SymNode a, b;
   if (a_.is_symbolic())
-    a = a_.toSymFloatNodeImpl();
+    a = a_.toSymNodeImpl();
   if (b_.is_symbolic())
-    b = b_.toSymFloatNodeImpl();
+    b = b_.toSymNodeImpl();
 
-  SymFloatNodeImpl* common = a ? a.get() : b.get();
-  // TODO: technically we need to check that the classes match
+  SymNodeImpl* common = a ? a.get() : b.get();
   if (!a) {
-    a = common->wrap(a_.as_float_unchecked());
-    a_.toSymFloat(a); //
+    a = common->wrap_float(a_.as_float_unchecked());
   }
   if (!b) {
-    b = common->wrap(b_.as_float_unchecked());
-    b_.toSymFloat(b);
+    b = common->wrap_float(b_.as_float_unchecked());
   }
   return {a, b};
 }
@@ -36,7 +31,7 @@ SymFloat SymFloat::operator+(SymFloat sci) const {
     return SymFloat(data_ + sci.data_);
   }
   auto res = normalize_symfloats(*this, sci);
-  return SymFloat::toSymFloat(res[0]->add(res[1]));
+  return SymFloat(res[0]->add(res[1]));
 }
 
 SymFloat SymFloat::operator-(SymFloat sci) const {
@@ -44,7 +39,7 @@ SymFloat SymFloat::operator-(SymFloat sci) const {
     return SymFloat(data_ - sci.data_);
   }
   auto res = normalize_symfloats(*this, sci);
-  return SymFloat::toSymFloat(res[0]->sub(res[1]));
+  return SymFloat(res[0]->sub(res[1]));
 }
 
 SymFloat SymFloat::operator*(SymFloat sci) const {
@@ -52,7 +47,7 @@ SymFloat SymFloat::operator*(SymFloat sci) const {
     return SymFloat(data_ * sci.data_);
   }
   auto res = normalize_symfloats(*this, sci);
-  return SymFloat::toSymFloat(res[0]->mul(res[1]));
+  return SymFloat(res[0]->mul(res[1]));
 }
 
 SymFloat SymFloat::operator/(SymFloat sci) const {
@@ -60,16 +55,12 @@ SymFloat SymFloat::operator/(SymFloat sci) const {
     return SymFloat(data_ / sci.data_);
   }
   auto res = normalize_symfloats(*this, sci);
-  return SymFloat::toSymFloat(res[0]->truediv(res[1]));
-}
-
-c10::SymFloat SymFloat::toSymFloat(SymFloatNode sin_sp) {
-  return c10::SymFloat(std::move(sin_sp));
+  return SymFloat(res[0]->truediv(res[1]));
 }
 
 std::ostream& operator<<(std::ostream& os, SymFloat s) {
   if (s.is_symbolic()) {
-    os << s.toSymFloatNodeImpl()->str();
+    os << s.toSymNodeImpl()->str();
   } else {
     os << s.as_float_unchecked();
   }
diff --git a/c10/core/SymFloat.h b/c10/core/SymFloat.h
index 92abb81ea2a22..b787c020fd757 100644
--- a/c10/core/SymFloat.h
+++ b/c10/core/SymFloat.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <c10/core/SymFloatNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
 #include <c10/util/intrusive_ptr.h>
@@ -14,20 +14,21 @@ namespace c10 {
 class C10_API SymFloat {
  public:
   /*implicit*/ SymFloat(double d) : data_(d){};
-  SymFloat(SymFloatNode ptr)
-      : data_(std::numeric_limits<double>::quiet_NaN()), ptr_(std::move(ptr)){};
+  SymFloat(SymNode ptr)
+      : data_(std::numeric_limits<double>::quiet_NaN()), ptr_(std::move(ptr)) {
+    TORCH_CHECK(ptr_->is_float());
+  };
   SymFloat() : data_(0.0) {}
 
-  SymFloatNodeImpl* toSymFloatNodeImplUnowned() const {
+  SymNodeImpl* toSymNodeImplUnowned() const {
     return ptr_.get();
   }
 
-  SymFloatNodeImpl* release() && {
+  SymNodeImpl* release() && {
     return std::move(ptr_).release();
   }
 
-  SymFloatNode toSymFloatNodeImpl() const;
-  static c10::SymFloat toSymFloat(SymFloatNode sin);
+  SymNode toSymNodeImpl() const;
 
   double expect_float() const {
     TORCH_CHECK(!is_symbolic());
@@ -53,7 +54,7 @@ class C10_API SymFloat {
  private:
   // TODO: optimize to union
   double data_;
-  SymFloatNode ptr_;
+  SymNode ptr_;
 };
 
 C10_API std::ostream& operator<<(std::ostream& os, SymFloat s);
diff --git a/c10/core/SymFloatNodeImpl.cpp b/c10/core/SymFloatNodeImpl.cpp
deleted file mode 100644
index 714ee095d84e3..0000000000000
--- a/c10/core/SymFloatNodeImpl.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <c10/core/SymFloat.h>
-#include <c10/core/SymFloatNodeImpl.h>
-#include <c10/core/SymIntNodeImpl.h>
-
-namespace c10 {
-
-c10::SymFloat SymFloatNodeImpl::toSymFloat() {
-  auto sit_sp = SymFloatNode::reclaim_copy(this);
-  return SymFloat::toSymFloat(sit_sp);
-}
-
-c10::SymIntNode SymFloatNodeImpl::ceil() {
-  TORCH_CHECK(false, "NYI");
-}
-
-c10::SymIntNode SymFloatNodeImpl::floor() {
-  TORCH_CHECK(false, "NYI");
-}
-
-} // namespace c10
diff --git a/c10/core/SymFloatNodeImpl.h b/c10/core/SymFloatNodeImpl.h
deleted file mode 100644
index 0ab9d952b5bbc..0000000000000
--- a/c10/core/SymFloatNodeImpl.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#pragma once
-
-#include <c10/macros/Macros.h>
-#include <c10/util/Exception.h>
-#include <c10/util/intrusive_ptr.h>
-#include <memory>
-#include <mutex>
-#include <vector>
-
-namespace c10 {
-
-class SymIntNodeImpl;
-using SymIntNode = c10::intrusive_ptr<SymIntNodeImpl>;
-
-class SymFloat;
-class SymFloatNodeImpl;
-using SymFloatNode = c10::intrusive_ptr<SymFloatNodeImpl>;
-
-class C10_API SymFloatNodeImpl : public c10::intrusive_ptr_target {
- public:
-  c10::SymFloat toSymFloat();
-  virtual ~SymFloatNodeImpl(){};
-
-  template <typename T>
-  c10::intrusive_ptr<T> dyn_cast() const {
-    return c10::intrusive_ptr<T>::reclaim_copy(dynamic_cast<T*>(this));
-  }
-
-  virtual SymFloatNode wrap(double num) {
-    TORCH_CHECK(false, "NYI");
-  };
-  virtual SymFloatNode add(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  }
-  virtual SymFloatNode sub(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  }
-  virtual SymFloatNode mul(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  }
-  virtual SymFloatNode truediv(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  }
-  virtual SymFloatNode pow(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  }
-  virtual SymFloatNode eq(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  };
-  virtual SymFloatNode ne(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  };
-  virtual SymFloatNode gt(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  };
-  virtual SymFloatNode lt(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  };
-  virtual SymFloatNode le(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  };
-  virtual SymFloatNode ge(const SymFloatNode& other) {
-    TORCH_CHECK(false, "NYI");
-  };
-  virtual SymIntNode ceil();
-  virtual SymIntNode floor();
-  virtual std::string str() {
-    TORCH_CHECK(false, "NYI");
-  };
-  std::ostream& operator<<(std::ostream& os) {
-    os << str();
-    return os;
-  };
-};
-
-} // namespace c10
diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp
index 5ef576b3af1b0..b32157e4a94e1 100644
--- a/c10/core/SymInt.cpp
+++ b/c10/core/SymInt.cpp
@@ -1,47 +1,46 @@
 #include <c10/core/SymFloat.h>
 #include <c10/core/SymInt.h>
-#include <c10/core/SymIntNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 #include <array>
 
 namespace c10 {
 
-static std::array<SymIntNode, 2> normalize_symints(SymInt a_, SymInt b_) {
-  SymIntNode a, b;
+static std::array<SymNode, 2> normalize_symints(SymInt a_, SymInt b_) {
+  SymNode a, b;
   if (a_.is_symbolic())
-    a = a_.toSymIntNodeImpl();
+    a = a_.toSymNodeImpl();
   if (b_.is_symbolic())
-    b = b_.toSymIntNodeImpl();
+    b = b_.toSymNodeImpl();
 
-  SymIntNodeImpl* common = a ? a.get() : b.get();
+  SymNodeImpl* common = a ? a.get() : b.get();
   // TODO: technically we need to check that the classes match
   if (!a) {
-    a = common->wrap(a_.as_int_unchecked());
-    a_.toSymInt(a); //
+    a = common->wrap_int(a_.as_int_unchecked());
   }
   if (!b) {
-    b = common->wrap(b_.as_int_unchecked());
-    b_.toSymInt(b);
+    b = common->wrap_int(b_.as_int_unchecked());
   }
   return {a, b};
 }
 
-SymIntNode SymInt::toSymIntNodeImpl() const {
+SymNode SymInt::toSymNodeImpl() const {
   TORCH_CHECK(is_symbolic());
-  return SymIntNode::reclaim_copy(toSymIntNodeImplUnowned());
+  return SymNode::reclaim_copy(toSymNodeImplUnowned());
 }
 
-c10::SymInt SymInt::toSymInt(SymIntNode sin_sp) {
+SymInt::SymInt(SymNode sin_sp) {
+  TORCH_CHECK(sin_sp->is_int());
   auto ptr = static_cast<uint64_t>(
       reinterpret_cast<uintptr_t>(static_cast<void*>(sin_sp.release())));
   auto rep = (ptr & ~MASK) | IS_SYM;
-  return c10::SymInt(UNCHECKED, static_cast<int64_t>(rep));
+  data_ = static_cast<int64_t>(rep);
 }
 
 int64_t SymInt::guard_int(const char* file, int64_t line) const {
   if (!is_symbolic()) {
     return data_;
   }
-  SymIntNode a = toSymIntNodeImpl();
+  SymNode a = toSymNodeImpl();
   return a->guard_int(file, line);
 }
 
@@ -49,7 +48,7 @@ SymInt::operator SymFloat() const {
   if (!is_symbolic()) {
     return SymFloat(double(data_));
   }
-  return SymFloat::toSymFloat(toSymIntNodeImpl()->sym_float());
+  return SymFloat(toSymNodeImpl()->sym_float());
 }
 
 SymInt SymInt::operator+(SymInt sci) const {
@@ -57,7 +56,7 @@ SymInt SymInt::operator+(SymInt sci) const {
     return SymInt(data_ + sci.data_);
   }
   auto res = normalize_symints(*this, sci);
-  return SymInt::toSymInt(res[0]->add(res[1]));
+  return SymInt(res[0]->add(res[1]));
 }
 
 SymInt SymInt::operator-(SymInt sci) const {
@@ -65,7 +64,7 @@ SymInt SymInt::operator-(SymInt sci) const {
     return SymInt(data_ - sci.data_);
   }
   auto res = normalize_symints(*this, sci);
-  return SymInt::toSymInt(res[0]->sub(res[1]));
+  return SymInt(res[0]->sub(res[1]));
 }
 
 SymInt SymInt::operator*(SymInt sci) const {
@@ -73,7 +72,7 @@ SymInt SymInt::operator*(SymInt sci) const {
     return SymInt(data_ * sci.data_);
   }
   auto res = normalize_symints(*this, sci);
-  return SymInt::toSymInt(res[0]->mul(res[1]));
+  return SymInt(res[0]->mul(res[1]));
 }
 
 SymInt SymInt::operator/(SymInt sci) const {
@@ -81,7 +80,7 @@ SymInt SymInt::operator/(SymInt sci) const {
     return SymInt(data_ / sci.data_);
   }
   auto res = normalize_symints(*this, sci);
-  return SymInt::toSymInt(res[0]->floordiv(res[1]));
+  return SymInt(res[0]->floordiv(res[1]));
 }
 
 SymInt SymInt::operator%(SymInt sci) const {
@@ -89,7 +88,7 @@ SymInt SymInt::operator%(SymInt sci) const {
     return SymInt(data_ % sci.data_);
   }
   auto res = normalize_symints(*this, sci);
-  return SymInt::toSymInt(res[0]->mod(res[1]));
+  return SymInt(res[0]->mod(res[1]));
 }
 
 bool SymInt::operator==(SymInt sci) const {
@@ -141,14 +140,14 @@ SymInt SymInt::min(SymInt sci) const {
     return std::min(data_, sci.data_);
   }
   auto res = normalize_symints(*this, sci);
-  return SymInt::toSymInt(res[0]->min(res[1]));
+  return SymInt(res[0]->min(res[1]));
 }
 SymInt SymInt::max(SymInt sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return std::max(data_, sci.data_);
   }
   auto res = normalize_symints(*this, sci);
-  return SymInt::toSymInt(res[0]->max(res[1]));
+  return SymInt(res[0]->max(res[1]));
 }
 
 void SymInt::operator*=(SymInt sci) {
@@ -193,7 +192,7 @@ SymInt SymInt::operator*(int64_t sci) const {
 
 std::ostream& operator<<(std::ostream& os, SymInt s) {
   if (s.is_symbolic()) {
-    os << s.toSymIntNodeImpl()->str();
+    os << s.toSymNodeImpl()->str();
   } else {
     os << s.as_int_unchecked();
   }
@@ -202,7 +201,7 @@ std::ostream& operator<<(std::ostream& os, SymInt s) {
 
 SymInt operator-(SymInt s) {
   if (s.is_symbolic()) {
-    return SymInt::toSymInt(s.toSymIntNodeImpl()->neg());
+    return SymInt(s.toSymNodeImpl()->neg());
   } else {
     return SymInt(-s.as_int_unchecked());
   }
diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h
index 6934a607ccbff..a10775196d86b 100644
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <c10/core/SymIntNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
 #include <c10/util/intrusive_ptr.h>
@@ -12,24 +12,19 @@ namespace c10 {
 
 class SymFloat;
 
-// `SymInt` is a C++ wrapper class around int64_t data_ which  and is used to
-// represent concrete dimension values.
+// SymInt represents either a regular int64_t, or a symbolic integer
+// (represented in a type erased way as SymNode).  The intention is for SymInt
+// to represent symbolic sizes that arise when doing shape computation in
+// operator kernels. This allows for tracing through programs without baking in
+// concrete sizes into kernel calls.
 //
-// `SymInt` is also a data type in Pytorch that can be used in function schemas
-// to enable tracing.
+// SymInt has an API equivalent to int64_t.  In particular, it is a value type.
+// Internally, SymInt is represented in a clever packed way, so that it only
+// occupies one word of space; but morally, it is a union between an int64_t
+// and an intrusive pointer to SymNodeImpl.
 //
-// `SymInt` is introduced to enable tracing arithmetic
-// operations on symbolic integers (e.g. sizes). Tracing symbolic sizes will
-// allow LTC and AOTAutograd representing dynamic shapes in expression graphs
-// faithfully without baking in concrete dimension values.
-//
-// To trace the operations, SymInt will overload arithmetic operators (e.g. +,
-// -, *) and will provide overloads taking SymInt for commonly used math
-// functions.
-//
-// SymInt will be extenteded to represent a union structure Union[int64_t,
-// SymIntNodeImpl*] which will be implemented as a single packed int64_t field
-// named data_.
+// Invariant: the referenced SymNodeImpl is guaranteed to be a SymNode where
+// is_int() returns true
 
 class C10_API SymInt {
  public:
@@ -44,6 +39,7 @@ class C10_API SymInt {
     TORCH_CHECK(!is_symbolic());
   };
   SymInt() : data_(0) {}
+  SymInt(SymNode n);
 
   // unchecked c-tor accepting raw `data_`
   // One appropriate use for this is when you are constructing a symint
@@ -55,7 +51,7 @@ class C10_API SymInt {
   // temporary and then use the move constructor/assignment
   SymInt(const SymInt& s) : data_(0) {
     if (s.is_symbolic()) {
-      *this = SymInt::toSymInt(s.toSymIntNodeImpl());
+      *this = SymInt(s.toSymNodeImpl());
     } else {
       data_ = s.data_;
     }
@@ -67,7 +63,7 @@ class C10_API SymInt {
   SymInt& operator=(const SymInt& s) {
     if (this != &s) {
       if (s.is_symbolic()) {
-        *this = SymInt::toSymInt(s.toSymIntNodeImpl());
+        *this = SymInt(s.toSymNodeImpl());
       } else {
         data_ = s.data_;
       }
@@ -76,7 +72,7 @@ class C10_API SymInt {
   }
   SymInt& operator=(SymInt&& s) {
     if (this != &s) {
-      release_(); // release the current SymIntNode if any
+      release_(); // release the current SymNode if any
       data_ = s.data_;
       if (s.is_symbolic())
         s.data_ = 0;
@@ -86,31 +82,31 @@ class C10_API SymInt {
 
   SymInt clone() const {
     if (is_symbolic()) {
-      return toSymIntNodeImplUnowned()->clone()->toSymInt();
+      return SymInt(toSymNodeImplUnowned()->clone());
     }
     return *this;
   }
 
-  SymIntNodeImpl* toSymIntNodeImplUnowned() const {
+  SymNodeImpl* toSymNodeImplUnowned() const {
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(is_symbolic());
     uint64_t unextended_bits = static_cast<uint64_t>(data_) & ~MASK;
     uint64_t sign_bit_mask = 1ULL << (62 - 1);
     // https://stackoverflow.com/questions/42534749/signed-extension-from-24-bit-to-32-bit-in-c
     uint64_t extended_bits = (unextended_bits ^ sign_bit_mask) - sign_bit_mask;
-    return static_cast<SymIntNodeImpl*>(
+    return static_cast<SymNodeImpl*>(
         reinterpret_cast<void*>(static_cast<uintptr_t>(extended_bits)));
   }
 
   void release_() {
     if (is_symbolic()) {
-      SymIntNode::reclaim(toSymIntNodeImplUnowned()); // steal
+      SymNode::reclaim(toSymNodeImplUnowned()); // steal
     }
   }
 
-  SymIntNodeImpl* release() && {
+  SymNodeImpl* release() && {
 #ifndef C10_MOBILE
     TORCH_INTERNAL_ASSERT(is_symbolic());
-    auto* r = toSymIntNodeImplUnowned();
+    auto* r = toSymNodeImplUnowned();
     data_ = 0; // transfer ownership
     return r;
 #else
@@ -118,8 +114,7 @@ class C10_API SymInt {
 #endif
   }
 
-  SymIntNode toSymIntNodeImpl() const;
-  static c10::SymInt toSymInt(SymIntNode sin);
+  SymNode toSymNodeImpl() const;
 
   ~SymInt() {
     release_();
diff --git a/c10/core/SymIntNodeImpl.cpp b/c10/core/SymIntNodeImpl.cpp
deleted file mode 100644
index 483110a90fa64..0000000000000
--- a/c10/core/SymIntNodeImpl.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-#include <c10/core/SymInt.h>
-#include <c10/core/SymIntNodeImpl.h>
-
-namespace c10 {
-
-c10::SymInt SymIntNodeImpl::toSymInt() {
-  auto sit_sp = SymIntNode::reclaim_copy(this);
-  return SymInt::toSymInt(sit_sp);
-}
-
-} // namespace c10
diff --git a/c10/core/SymNodeImpl.cpp b/c10/core/SymNodeImpl.cpp
new file mode 100644
index 0000000000000..80999ba50f1ed
--- /dev/null
+++ b/c10/core/SymNodeImpl.cpp
@@ -0,0 +1,3 @@
+#include <c10/core/SymNodeImpl.h>
+
+namespace c10 {} // namespace c10
diff --git a/c10/core/SymIntNodeImpl.h b/c10/core/SymNodeImpl.h
similarity index 50%
rename from c10/core/SymIntNodeImpl.h
rename to c10/core/SymNodeImpl.h
index 0b9d4c5579282..d2f3aafaad8b1 100644
--- a/c10/core/SymIntNodeImpl.h
+++ b/c10/core/SymNodeImpl.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#include <c10/core/SymFloatNodeImpl.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
 #include <c10/util/intrusive_ptr.h>
@@ -10,13 +9,12 @@
 
 namespace c10 {
 
-class SymInt;
-class SymIntNodeImpl;
+class SymNodeImpl;
+using SymNode = c10::intrusive_ptr<SymNodeImpl>;
 
-class C10_API SymIntNodeImpl : public c10::intrusive_ptr_target {
+class C10_API SymNodeImpl : public c10::intrusive_ptr_target {
  public:
-  c10::SymInt toSymInt();
-  virtual ~SymIntNodeImpl(){};
+  virtual ~SymNodeImpl(){};
 
   template <typename T>
   c10::intrusive_ptr<T> dyn_cast() const {
@@ -24,66 +22,87 @@ class C10_API SymIntNodeImpl : public c10::intrusive_ptr_target {
   }
 
   // these could be pure virtual when we implement LTC versions
-  virtual SymIntNode add(const SymIntNode& other) {
+  virtual bool is_int() {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode sub(const SymIntNode& other) {
+  virtual bool is_float() {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode mul(const SymIntNode& other) {
+  virtual SymNode add(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymFloatNode truediv(const SymIntNode& other) {
+  virtual SymNode sub(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode floordiv(const SymIntNode& other) {
+  virtual SymNode mul(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode mod(const SymIntNode& other) {
+  virtual SymNode truediv(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode eq(const SymIntNode& other) {
+  virtual SymNode pow(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode ne(const SymIntNode& other) {
+  virtual SymNode floordiv(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode gt(const SymIntNode& other) {
+  virtual SymNode mod(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode lt(const SymIntNode& other) {
+  virtual SymNode eq(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode le(const SymIntNode& other) {
+  virtual SymNode ne(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode ge(const SymIntNode& other) {
+  virtual SymNode gt(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode ceil() {
+  virtual SymNode lt(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode neg() {
+  virtual SymNode le(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode min(const SymIntNode& other) {
+  virtual SymNode ge(const SymNode& other) {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode max(const SymIntNode& other) {
+  virtual SymNode ceil() {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymIntNode clone() {
+  virtual SymNode floor() {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymFloatNode sym_float() {
+  virtual SymNode neg() {
+    TORCH_CHECK(false, "NYI");
+  };
+  virtual SymNode min(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  };
+  virtual SymNode max(const SymNode& other) {
+    TORCH_CHECK(false, "NYI");
+  };
+  virtual SymNode clone() {
+    TORCH_CHECK(false, "NYI");
+  };
+  virtual SymNode sym_int() {
+    TORCH_CHECK(false, "NYI");
+  }
+  virtual SymNode sym_float() {
     TORCH_CHECK(false, "NYI");
   }
-  virtual SymIntNode wrap(int64_t num) {
+  virtual SymNode wrap_int(int64_t num) {
+    TORCH_CHECK(false, "NYI");
+  };
+  virtual SymNode wrap_float(double num) {
     TORCH_CHECK(false, "NYI");
   };
   virtual int64_t guard_int(const char* file, int64_t line) {
     TORCH_CHECK(false, "NYI");
   };
+  virtual double guard_float(const char* file, int64_t line) {
+    TORCH_CHECK(false, "NYI");
+  };
   virtual int64_t int_() {
     TORCH_CHECK(false, "NYI");
   };
diff --git a/c10/test/core/SymInt_test.cpp b/c10/test/core/SymInt_test.cpp
index a57e7c706486d..d889d72b5afb1 100644
--- a/c10/test/core/SymInt_test.cpp
+++ b/c10/test/core/SymInt_test.cpp
@@ -1,7 +1,7 @@
 #include <gtest/gtest.h>
 
 #include <c10/core/SymInt.h>
-#include <c10/core/SymIntNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 
 using namespace c10;
 #ifndef C10_MOBILE
@@ -20,12 +20,6 @@ TEST(SymIntTest, ConcreteInts) {
   check(-4611686018427387904LL);
 }
 
-TEST(SymIntTest, AddNode) {
-  auto n = c10::make_intrusive<SymIntNodeImpl>();
-  auto i = n->toSymInt();
-  EXPECT_TRUE(i.is_symbolic());
-}
-
 TEST(SymIntTest, CheckRange) {
   EXPECT_FALSE(SymInt::check_range(INT64_MIN));
 }
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8c0eac82cf996..807f486ac0d6a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -335,8 +335,8 @@
     "Quantize",
     # torch.utils.backcompat
     "Warning",
-    "SymIntNode",
-    "SymFloatNode",
+    "SymInt",
+    "SymFloat",
 ]
 
 # The suffix(es) of source filenames.
diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index b1e29b6ac4103..d4663c6dc71af 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -605,7 +605,7 @@ def unflatten(self, x):
             return x
         return pytree.tree_unflatten(x, self.spec)
 
-KNOWN_TYPES = [torch.Tensor, int, str, float, bool, torch.SymIntNode, torch.SymFloatNode]
+KNOWN_TYPES = [torch.Tensor, int, str, float, bool, torch.SymInt, torch.SymFloat]
 
 
 def aot_function(
diff --git a/functorch/_src/partitioners.py b/functorch/_src/partitioners.py
index 1077904528efe..c82afe65787bd 100644
--- a/functorch/_src/partitioners.py
+++ b/functorch/_src/partitioners.py
@@ -209,7 +209,7 @@ def _tensor_nbytes(numel, dtype):
 
 def _size_of(node: fx.Node) -> int:
     def to_size_hint(s):
-        if isinstance(s, torch.SymIntNode):
+        if isinstance(s, torch.SymInt):
             py_s = s.get_pyobj()
             return py_s.shape_env.size_hint(py_s.expr)
         assert isinstance(s, int)
diff --git a/functorch/experimental/cond.py b/functorch/experimental/cond.py
index 6f7bcbf506d8d..e620dbadeccbc 100644
--- a/functorch/experimental/cond.py
+++ b/functorch/experimental/cond.py
@@ -18,6 +18,8 @@
 
 def trace_cond(proxy_mode, func_overload, pred, true_fn, false_fn, operands):
     def _unwrap_proxy(e):
+        if not isinstance(e, (torch.Tensor, torch.SymInt, torch.SymFloat)):
+            return e
         return get_proxy_slot(e, proxy_mode.tracer, e, lambda e: e.proxy)
 
     assert isinstance(operands, list), "Cond operands must be a list of tensors"
diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp
index 6e3283f62a5b8..2aac6cacdffc6 100644
--- a/test/cpp/jit/test_misc.cpp
+++ b/test/cpp/jit/test_misc.cpp
@@ -1447,35 +1447,29 @@ TEST(TestSymInt, AddSymbolicInt) {
 }
 
 #ifndef C10_MOBILE
-TEST(TestSymInt, TestIntrusive) {
-  auto a = c10::make_intrusive<c10::SymIntNodeImpl>();
-  auto b = c10::make_intrusive<c10::SymIntNodeImpl>();
-  ASSERT_EQ(a.use_count(), 1);
-  ASSERT_EQ(b.use_count(), 1);
-  auto as = a->toSymInt();
-  auto bs = b->toSymInt();
-  ASSERT_EQ(a.use_count(), 2);
-  ASSERT_EQ(b.use_count(), 2);
-  as = bs;
-  ASSERT_EQ(a.use_count(), 1);
-  ASSERT_EQ(b.use_count(), 3);
-}
-
-class TestSymIntNodeImpl : public c10::SymIntNodeImpl {
+class TestSymNodeImpl : public c10::SymNodeImpl {
  public:
-  TestSymIntNodeImpl(int64_t i) : i_(i) {}
+  explicit TestSymNodeImpl(int64_t i) : i_(i) {}
+
+  bool is_int() override {
+    return true;
+  };
+
+  bool is_float() override {
+    return false;
+  };
 
   bool bool_() override {
     return static_cast<bool>(i_);
   };
 
-#define OPDEF3(NAME, OP, RET)                                            \
-  RET NAME(const c10::SymIntNode& other) override {                      \
-    return make_intrusive<TestSymIntNodeImpl>(                           \
-        this->i_ OP dynamic_cast<TestSymIntNodeImpl*>(other.get())->i_); \
+#define OPDEF3(NAME, OP, RET)                                         \
+  RET NAME(const c10::SymNode& other) override {                      \
+    return make_intrusive<TestSymNodeImpl>(                           \
+        this->i_ OP dynamic_cast<TestSymNodeImpl*>(other.get())->i_); \
   }
 
-#define OPDEF2(NAME, OP) OPDEF3(NAME, OP, c10::SymIntNode)
+#define OPDEF2(NAME, OP) OPDEF3(NAME, OP, c10::SymNode)
   OPDEF2(add, +)
   OPDEF2(sub, -)
   OPDEF2(mul, *)
@@ -1494,17 +1488,19 @@ class TestSymIntNodeImpl : public c10::SymIntNodeImpl {
   int64_t i_;
 };
 
-TEST(TestSymInt, TestSymIntToSymIntNodeDispatch) {
+TEST(TestSymInt, TestSymIntToSymNodeDispatch) {
   auto get = [](c10::SymInt si) {
-    auto node = si.toSymIntNodeImpl();
-    return dynamic_cast<TestSymIntNodeImpl*>(node.get())->i_;
+    auto node = si.toSymNodeImpl();
+    return dynamic_cast<TestSymNodeImpl*>(node.get())->i_;
   };
 
   std::vector<int64_t> inputs{0, 1, -1, 4, -4, 777, -777};
   for (auto i : inputs) {
     for (auto j : inputs) {
-      auto a = c10::make_intrusive<TestSymIntNodeImpl>(i)->toSymInt();
-      auto b = c10::make_intrusive<TestSymIntNodeImpl>(j)->toSymInt();
+      auto a = c10::SymInt(
+          static_cast<SymNode>(c10::make_intrusive<TestSymNodeImpl>(i)));
+      auto b = c10::SymInt(
+          static_cast<SymNode>(c10::make_intrusive<TestSymNodeImpl>(j)));
       ASSERT_EQ(get(a + b), i + j);
       ASSERT_EQ(get(a - b), i - j);
       ASSERT_EQ(get(a * b), i * j);
diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index b183b6169dd6b..0e85b54cfe3f7 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -12,8 +12,9 @@
 import io
 from torch.utils._pytree import tree_map
 from torch.fx.experimental.proxy_tensor import make_fx
-from torch.fx.experimental.symbolic_shapes import ShapeEnv, PySymInt, sym_float
+from torch.fx.experimental.symbolic_shapes import ShapeEnv, sym_float, guard_int, SymNode
 from torch.utils._python_dispatch import TorchDispatchMode
+from torch import SymInt
 
 aten = torch.ops.aten
 
@@ -116,9 +117,6 @@ def create_symbolic_tensor(name, arg, shape_env, storage_offset=0):
     sym_shapes, sym_strides = shape_env.create_symbolic_sizes_strides(arg)
     return FakeSymbolicTensor(sym_shapes, sym_strides, arg.dtype, arg.layout, arg.requires_grad, arg.device, storage_offset)
 
-
-CPP_SYMINT_CLASS = type(torch.SymIntNode.new_symint(1))
-
 def create_symint(shape_env, i):
     return shape_env.create_symintnode(shape_env.create_symbol(i))
 
@@ -156,8 +154,8 @@ def test_roundtrip(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
 
-        self.assertTrue(not isinstance(x.shape[0], PySymInt))
-        self.assertTrue(isinstance(x.shape[0], CPP_SYMINT_CLASS))
+        self.assertTrue(not isinstance(x.shape[0], SymNode))
+        self.assertTrue(isinstance(x.shape[0], SymInt))
 
         self.assertTrue(x.shape[0] == 5)
         self.assertTrue(x.shape[1] == 4)
@@ -165,17 +163,17 @@ def test_roundtrip(self):
 
         self.assertTrue(x.size()[0], 5)
         self.assertTrue(x.size()[1], 4)
-        self.assertTrue(isinstance(x.size()[1], CPP_SYMINT_CLASS))
+        self.assertTrue(isinstance(x.size()[1], SymInt))
         self.assertTrue(x.size()[2] == 3)
 
         self.assertTrue(x.size(0) == 5)
         self.assertTrue(x.size(1) == 4)
         self.assertTrue(x.size(2) == 3)
-        self.assertTrue(isinstance(x.size(2), CPP_SYMINT_CLASS))
+        self.assertTrue(isinstance(x.size(2), SymInt))
 
         offset = create_symint(shape_env, 2)
         y = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env, offset)
-        self.assertTrue(isinstance(y.storage_offset(), CPP_SYMINT_CLASS))
+        self.assertTrue(isinstance(y.storage_offset(), SymInt))
         self.assertTrue(y.storage_offset() == 2)
 
         offset = 2
@@ -267,7 +265,7 @@ def test_symint_vargs(self):
     def test_stride(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5, 5), shape_env)
-        self.assertIsInstance(x.stride()[0], CPP_SYMINT_CLASS)
+        self.assertIsInstance(x.stride()[0], SymInt)
 
     @skipIfNoSympy
     def test_size_expressions(self):
@@ -290,7 +288,7 @@ def test_int_to_float(self):
         shape_env = ShapeEnv()
         x = create_symbolic_tensor("x", torch.randn(5), shape_env)
         r = sym_float(x.shape[0])
-        self.assertTrue(isinstance(r, torch.SymFloatNode))
+        self.assertIsInstance(r, torch.SymFloat, msg=type(r))
 
     @skipIfNoSympy
     def test_aten_ops(self):
@@ -320,13 +318,13 @@ def test_meta_symint(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 2)
         r = torch.empty(a0, device='meta')
-        self.assertIsInstance(r.shape[0], CPP_SYMINT_CLASS)
+        self.assertIsInstance(r.shape[0], SymInt)
 
     @skipIfNoSympy
     def test_guard_int(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 2)
-        self.assertEqual(a0.guard_int(), 2)
+        self.assertEqual(guard_int(a0), 2)
         self.assertEqual(str(shape_env.guards[0][0]), "Eq(s0, 2)")
 
     @skipIfNoSympy
@@ -347,7 +345,9 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                 assert func == torch.ops.aten.add.Tensor
 
                 nonlocal sym_int_encountered
-                sym_int_encountered = kwargs["alpha"] is a0
+                # WARNING: do not do identity tests on the outer
+                # SymInt/SymFloat, they are NOT STABLE
+                sym_int_encountered = kwargs["alpha"].node is a0.node
                 kwargs["alpha"] = 0
                 return func(*args)
 
diff --git a/test/test_dynamic_shapes.py.bak b/test/test_dynamic_shapes.py.bak
deleted file mode 100644
index 19c77fe4d7ab0..0000000000000
--- a/test/test_dynamic_shapes.py.bak
+++ /dev/null
@@ -1,391 +0,0 @@
-# -*- coding: utf-8 -*-
-# Owner(s): ["oncall: jit"]
-
-from torch._C import _disabled_torch_function_impl
-import torch.fx
-import torch.nn.functional as F
-from torch.testing._internal.common_utils import run_tests, TestCase, skipIfTorchDynamo
-import unittest
-import torch
-import operator
-import itertools
-import io
-from torch.utils._pytree import tree_map
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch.fx.experimental.symbolic_shapes import ShapeEnv, PySymInt, sym_float
-from torch.utils._python_dispatch import TorchDispatchMode
-
-aten = torch.ops.aten
-
-try:
-    import sympy
-    HAS_SYMPY = True
-except ImportError:
-    HAS_SYMPY = False
-skipIfNoSympy = unittest.skipIf(not HAS_SYMPY, "no sympy")
-
-
-meta_funcs = {}
-
-
-def register_meta(op):
-    def decorator(f):
-        def add_func(op):
-            meta_funcs[op] = f
-        tree_map(add_func, op)
-        return f
-    return decorator
-
-
-@register_meta([aten.add.Tensor, aten.sub.Tensor])
-def binary_meta(a, b):
-    return a.new_empty(a.shape)
-
-
-@register_meta(aten.cat.default)
-def cat_meta(tensors, dim=0):
-    concat_length = 0
-    shape = tensors[0].shape
-    for tensor in tensors:
-        for idx, (common_length, length) in enumerate(zip(shape, tensor.shape)):
-            if idx == dim:
-                concat_length = concat_length + length
-            else:
-                assert length == common_length
-    new_shape = list(shape)
-    new_shape[dim] = concat_length
-    return tensors[0].new_empty(new_shape)
-
-
-@register_meta([aten.narrow_copy.default])
-def narrow_copy_symint_meta(a, dim, start, length, **kwargs):
-    shape = []
-    for i, x in enumerate(a.shape):
-        if i == dim:
-            shape.append(length)
-        else:
-            shape.append(x)
-    return a.new_empty(tuple(shape))
-
-
-@register_meta([aten.expand.default])
-def expand_symint_meta(a, size, implicit=False):
-    return a.new_empty(size)
-
-
-def create_contiguous(shape):
-    strides = [1]
-    for dim in reversed(shape[:-1]):
-        strides.append(dim * strides[-1])
-    return list(reversed(strides))
-
-
-class FakeSymbolicTensor(torch.Tensor):
-    @staticmethod
-    def __new__(cls, sym_shape, sym_strides, dtype, layout, requires_grad, device, storage_offset=0):
-        # TODO: this is wrong in general
-        sym_stride = create_contiguous(sym_shape)
-        r = torch.Tensor._make_wrapper_subclass(
-            cls, sym_shape,
-            sym_stride, storage_offset,
-            dtype=dtype, layout=layout, requires_grad=requires_grad,
-            device=device,
-        )
-        return r
-
-    __torch_function__ = _disabled_torch_function_impl
-
-    def new_empty(self, shape):
-        return FakeSymbolicTensor(shape, None, self.dtype, self.layout, self.requires_grad, self.device)
-
-    @classmethod
-    def __torch_dispatch__(cls, func_overload, types, args=(), kwargs=None):
-        if func_overload in meta_funcs:
-            return meta_funcs[func_overload](*args, **kwargs)
-
-        if func_overload == torch.ops.aten.new_empty.default:
-            self = args[0]
-            shape = args[1]
-            return FakeSymbolicTensor(shape, self.stride(), self.dtype, self.layout, self.requires_grad, self.device)
-
-        raise RuntimeError(f"operator {func_overload} not supported")
-
-
-def create_symbolic_tensor(name, arg, shape_env, storage_offset=0):
-    sym_shapes, sym_strides = shape_env.create_symbolic_sizes_strides(arg)
-    return FakeSymbolicTensor(sym_shapes, sym_strides, arg.dtype, arg.layout, arg.requires_grad, arg.device, storage_offset)
-
-
-CPP_SYMINT_CLASS = type(torch.SymIntNode.new_symint(1))
-
-def create_symint(shape_env, i):
-    return shape_env.create_symintnode(shape_env.create_symbol(i))
-
-@skipIfTorchDynamo("Creating ShapeEnv fails for confusing reasons (also we never expect dynamo to see code like this)")
-class TestPySymInt(TestCase):
-
-    @skipIfNoSympy
-    def test_arith_ops(self):
-        shape_env = ShapeEnv()
-        symints = []
-        for i in range(2, 5):
-            symints.append((i, create_symint(shape_env, i)))
-
-        ops = [operator.add, operator.sub, operator.floordiv, operator.mul, operator.mod]
-
-        for op in ops:
-            for args in itertools.permutations(symints, 2):
-                if not isinstance(args[0][1], int) and ((op != operator.mod or op != operator.floordiv) and args[1][0] != 0):
-                    self.assertTrue(op(args[0][1], args[1][1]) == op(args[0][0], args[1][0]))
-
-
-    @skipIfNoSympy
-    def test_reverse_arith_ops(self):
-        shape_env = ShapeEnv()
-
-        a = create_symint(shape_env, 2)
-        self.assertTrue(5 // a == 5 // 2)
-
-        a = create_symint(shape_env, 2)
-        self.assertTrue(5 * a == 5 * 2)
-
-
-    @skipIfNoSympy
-    def test_roundtrip(self):
-        shape_env = ShapeEnv()
-        x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
-
-        self.assertTrue(not isinstance(x.shape[0], PySymInt))
-        self.assertTrue(isinstance(x.shape[0], CPP_SYMINT_CLASS))
-
-        self.assertTrue(x.shape[0] == 5)
-        self.assertTrue(x.shape[1] == 4)
-        self.assertTrue(x.shape[2], 3)
-
-        self.assertTrue(x.size()[0], 5)
-        self.assertTrue(x.size()[1], 4)
-        self.assertTrue(isinstance(x.size()[1], CPP_SYMINT_CLASS))
-        self.assertTrue(x.size()[2] == 3)
-
-        self.assertTrue(x.size(0) == 5)
-        self.assertTrue(x.size(1) == 4)
-        self.assertTrue(x.size(2) == 3)
-        self.assertTrue(isinstance(x.size(2), CPP_SYMINT_CLASS))
-
-        offset = create_symint(shape_env, 2)
-        y = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env, offset)
-        self.assertTrue(isinstance(y.storage_offset(), CPP_SYMINT_CLASS))
-        self.assertTrue(y.storage_offset() == 2)
-
-        offset = 2
-        z = create_symbolic_tensor("z", torch.randn(5, 4, 3), shape_env, offset)
-        self.assertTrue(isinstance(z.storage_offset(), int))
-        self.assertTrue(z.storage_offset() == 2)
-
-    @skipIfNoSympy
-    def test_binary(self):
-        shape_env = ShapeEnv()
-        x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
-        y = create_symbolic_tensor("y", torch.randn(5, 4, 3), shape_env)
-
-        z = x + y
-        self.assertTrue(z.shape[0] == 5)
-        self.assertTrue(z.shape[1] == 4)
-        self.assertTrue(z.shape[2] == 3)
-
-        # broadcasting
-        y = create_symbolic_tensor("y", torch.randn(1, 4, 1), shape_env)
-        z = x + y
-        self.assertTrue(z.shape[0] == 5)
-        self.assertTrue(z.shape[1] == 4)
-        self.assertTrue(z.shape[2] == 3)
-
-    @skipIfNoSympy
-    def test_symint_args(self):
-        shape_env = ShapeEnv()
-        x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
-        y = create_symbolic_tensor("y", torch.randn(5, 4, 1), shape_env)
-        LAST_DIM = 2
-        z = x.narrow_copy(LAST_DIM, 0, y.shape[LAST_DIM])
-        self.assertTrue(z.shape[2] == y.shape[2])
-
-        # arithmetic expr with two symints
-        z = x.narrow_copy(LAST_DIM, 0, x.shape[LAST_DIM] - y.shape[LAST_DIM])
-        self.assertTrue(z.shape[2] == 2)
-
-        # arithmetic expr with a symint and python int
-        z = x.narrow_copy(LAST_DIM, 0, x.shape[LAST_DIM] - 1)
-        self.assertTrue(z.shape[2] == 2)
-
-    @skipIfNoSympy
-    def test_symint_vargs(self):
-        shape_env = ShapeEnv()
-        x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
-        y = create_symbolic_tensor("y", torch.randn(1, 4, 1), shape_env)
-
-        # varargs
-        z = y.expand(x.shape[0], y.shape[1], x.shape[2])
-        self.assertTrue(z.shape[0] == 5)
-        self.assertTrue(z.shape[1] == 4)
-        self.assertTrue(z.shape[2] == 3)
-
-        # shape list
-        z = y.expand((x.shape[0], y.shape[1], x.shape[2]))
-        self.assertTrue(z.shape[0] == 5)
-        self.assertTrue(z.shape[1] == 4)
-        self.assertTrue(z.shape[2] == 3)
-
-        # mixed python symints and ints
-        z = y.expand(x.shape[0], y.shape[1], 3)
-        self.assertTrue(z.shape[0] == 5)
-        self.assertTrue(z.shape[1] == 4)
-        self.assertTrue(z.shape[2] == 3)
-
-        # mixed python symints and ints in a list
-        z = y.expand((x.shape[0], y.shape[1], 3))
-        self.assertTrue(z.shape[0] == 5)
-        self.assertTrue(z.shape[1] == 4)
-        self.assertTrue(z.shape[2] == 3)
-
-        # mixed python symints and ints
-        z = y.expand(5, y.shape[1], x.shape[2])
-        self.assertTrue(z.shape[0] == 5)
-        self.assertTrue(z.shape[1] == 4)
-        self.assertTrue(z.shape[2] == 3)
-
-        # mixed python ints and symints in a list
-        z = y.expand((5, y.shape[1], x.shape[2]))
-        self.assertTrue(z.shape[0] == 5)
-        self.assertTrue(z.shape[1] == 4)
-        self.assertTrue(z.shape[2] == 3)
-
-        z = y.expand((y.shape[1],))
-        z = y.expand(y.shape[1])
-
-    @skipIfNoSympy
-    def test_stride(self):
-        shape_env = ShapeEnv()
-        x = create_symbolic_tensor("x", torch.randn(5, 5), shape_env)
-        self.assertIsInstance(x.stride()[0], CPP_SYMINT_CLASS)
-
-    @skipIfNoSympy
-    def test_size_expressions(self):
-        shape_env = ShapeEnv()
-        x = create_symbolic_tensor("x", torch.randn(5), shape_env)
-        expand_x = x.expand(x.shape[0], x.shape[0])
-        if expand_x.shape[0] > 3:
-            result = expand_x + expand_x
-        else:
-            result = expand_x + expand_x
-
-        gt_op = shape_env.guards[0][0]
-        self.assertTrue(isinstance(gt_op, sympy.core.relational.StrictGreaterThan))
-        self.assertTrue(str(x.shape[0]), str(gt_op.args[0]))
-        self.assertTrue(str(expand_x.shape[1]), str(x.shape[0]))
-        self.assertTrue(str(expand_x.shape[1]), str(result.shape[0]))
-
-    @skipIfNoSympy
-    def test_int_to_float(self):
-        shape_env = ShapeEnv()
-        x = create_symbolic_tensor("x", torch.randn(5), shape_env)
-        r = sym_float(x.shape[0])
-        self.assertTrue(isinstance(r, torch.SymFloatNode))
-
-    @skipIfNoSympy
-    def test_aten_ops(self):
-
-        shape_env = ShapeEnv()
-        x = create_symbolic_tensor("x", torch.randn(5), shape_env)
-        torch.ops.aten.narrow_copy.default(x, 0, 0, x.shape[0])
-
-        shape_env = ShapeEnv()
-        x = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env)
-        torch.ops.aten.expand.default(x, [x.shape[0], x.shape[1], x.shape[2]])
-
-    def test_fx_trace_intlist(self):
-        class CustomModule(torch.nn.Module):
-            def forward(self, x):
-                bs, c, h, w = x.shape
-                return F.pad(x, (0, w % 2, 0, h % 2, 0, 0))
-
-        m = CustomModule()
-        x = torch.rand(1, 3, 4, 4)
-        # should not TypeError: pad(): argument 'pad' (position 2) must be
-        # tuple of ints, not tuple
-        torch.fx.symbolic_trace(m)
-
-    @skipIfNoSympy
-    def test_meta_symint(self):
-        shape_env = ShapeEnv()
-        a0 = create_symint(shape_env, 2)
-        r = torch.empty(a0, device='meta')
-        self.assertIsInstance(r.shape[0], CPP_SYMINT_CLASS)
-
-    @skipIfNoSympy
-    def test_guard_int(self):
-        shape_env = ShapeEnv()
-        a0 = create_symint(shape_env, 2)
-        self.assertEqual(a0.guard_int(), 2)
-        self.assertEqual(str(shape_env.guards[0][0]), "s0")
-        self.assertEqual(shape_env.guards[0][1], 2)
-
-    @skipIfNoSympy
-    def test_int_conversion(self):
-        shape_env = ShapeEnv()
-        a0 = create_symint(shape_env, 2)
-        self.assertRaisesRegex(RuntimeError, "Trying to extract", lambda: int(a0))
-
-    @skipIfNoSympy
-    def test_symint_as_scalar(self):
-        shape_env = ShapeEnv()
-        a0 = create_symint(shape_env, 2)
-
-        sym_int_encountered = False
-
-        class TestSymInt(TorchDispatchMode):
-            def __torch_dispatch__(self, func, types, args=(), kwargs=None):
-                assert func == torch.ops.aten.add.Tensor
-
-                nonlocal sym_int_encountered
-                sym_int_encountered = kwargs["alpha"] is a0
-                kwargs["alpha"] = 0
-                return func(*args)
-
-        x = torch.rand([4, 4])
-        with TestSymInt():
-            y = torch.add(x, x, alpha=a0)
-
-        self.assertTrue(sym_int_encountered)
-
-    @skipIfNoSympy
-    @unittest.mock.patch('sys.stdout', new_callable=io.StringIO)
-    def test_print_readable_with_symints(self, mock_stdout):
-        def f(a, b):
-            dim0 = a.shape[0] + b.shape[0]
-            dim1 = a.shape[1] + b.shape[1]
-            d = a.new_empty(dim0, dim1)
-            d = torch.ops.aten.native_dropout(d, 0.5, train=True)
-            return d
-
-        fx_g = make_fx(f, tracing_mode="symbolic")(torch.randn(5, 3), torch.randn(4, 3))
-        fx_g.print_readable()
-
-        self.assertExpectedInline(mock_stdout.getvalue().strip(), """\
-class f(torch.nn.Module):
-    def forward(self, a_1: f32[t0.size(0),t0.size(1)], b_1: f32[t1.size(0),t0.size(1)]):
-        # No stacktrace found for following nodes
-        sym_size: Sym(t0.size(0)) = torch.ops.aten.sym_size(a_1, 0)
-        sym_size_1: Sym(t1.size(0)) = torch.ops.aten.sym_size(b_1, 0)
-        add: Sym(t0.size(0) + t1.size(0)) = sym_size + sym_size_1;  sym_size = sym_size_1 = None
-        sym_size_2: Sym(t0.size(1)) = torch.ops.aten.sym_size(a_1, 1)
-        sym_size_3: Sym(t0.size(1)) = torch.ops.aten.sym_size(b_1, 1);  b_1 = None
-        add_1: Sym(2*t0.size(1)) = sym_size_2 + sym_size_3;  sym_size_2 = sym_size_3 = None
-        new_empty: f32[t0.size(0) + t1.size(0),2*t0.size(1)] = torch.ops.aten.new_empty.default(a_1, [add, add_1], dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False);  a_1 = add = add_1 = None
-        native_dropout = torch.ops.aten.native_dropout.default(new_empty, 0.5, True);  new_empty = None
-        getitem: f32[t0.size(0) + t1.size(0),2*t0.size(1)] = native_dropout[0]
-        getitem_1: b8[t0.size(0) + t1.size(0),2*t0.size(1)] = native_dropout[1];  native_dropout = None
-        return (getitem, getitem_1)""")  # noqa: B950
-
-
-if __name__ == '__main__':
-    run_tests()
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 1d5985a00da8c..6cb7d280cc19a 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -875,8 +875,7 @@ def f(a):
         self.assertExpectedInline(r, """\
 def forward(self, a_1):
     sym_size = torch.ops.aten.sym_size(a_1, 0)
-    sym_float = torch.fx.experimental.symbolic_shapes.sym_float(sym_size);  sym_size = None
-    pow_1 = sym_float ** 0.5;  sym_float = None
+    pow_1 = sym_size ** 0.5;  sym_size = None
     div = torch.ops.aten.div.Tensor(a_1, pow_1);  a_1 = pow_1 = None
     return div""")
 
@@ -949,7 +948,7 @@ def f(a, b):
         fx_g = make_fx(f, tracing_mode="symbolic")(torch.randn(5), torch.randn(4))
         meta_c = _get_node(fx_g, lambda x: x.target == aten.new_empty.default)
         meta_d = _get_node(fx_g, lambda x: x.target == operator.add)
-        self.assertTrue(meta_c.meta['val'].shape[0].get_pyobj().expr == meta_d.meta['val'].expr)
+        self.assertTrue(meta_c.meta['val'].shape[0].get_pyobj().expr == meta_d.meta['val'].node.expr)
 
     def test_metadata_fresh(self):
         def f(x):
diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py
index 5215281b7ac62..c4a64b5cb6477 100644
--- a/test/test_public_bindings.py
+++ b/test/test_public_bindings.py
@@ -207,8 +207,8 @@ def test_no_new_bindings(self):
             "StreamObjType",
             "StringType",
             "SUM",
-            "SymFloatNode",
-            "SymIntNode",
+            "SymFloat",
+            "SymInt",
             "TensorType",
             "ThroughputBenchmark",
             "TracingState",
diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py
index 7b120593eb539..3e9e125bfb9f6 100644
--- a/tools/autograd/gen_autograd_functions.py
+++ b/tools/autograd/gen_autograd_functions.py
@@ -291,7 +291,7 @@
 for (auto i : c10::irange(prop.size())) {
     auto si = prop[i];
     if (si.is_symbolic()) {
-      auto py_symint = py::cast(si.toSymIntNodeImpl()).release().ptr();
+      auto py_symint = py::cast(si).release().ptr();
       PyTuple_SetItem(tup, (Py_ssize_t) i, py_symint);
     } else {
        PyTuple_SetItem(tup, (Py_ssize_t) i, PyLong_FromUnsignedLong(si.as_int_unchecked()));
@@ -313,7 +313,7 @@
 """
 
 GETTER_BODY_SYMINT = """\
-return prop.is_symbolic() ? py::cast(prop.toSymIntNodeImpl()).release().ptr() : PyLong_FromUnsignedLong(prop.as_int_unchecked());
+return prop.is_symbolic() ? py::cast(prop).release().ptr() : PyLong_FromUnsignedLong(prop.as_int_unchecked());
 """
 
 GETTER_BODY_DOUBLE = """\
diff --git a/tools/autograd/templates/python_functions.cpp b/tools/autograd/templates/python_functions.cpp
index 57343a53ea982..eacf56b31d88e 100644
--- a/tools/autograd/templates/python_functions.cpp
+++ b/tools/autograd/templates/python_functions.cpp
@@ -5,7 +5,7 @@
 #include <Python.h>
 #include <ATen/ATen.h>
 
-#include <c10/core/SymIntNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 #include "torch/csrc/autograd/generated/Functions.h"
 #include "torch/csrc/autograd/python_cpp_function.h"
 #include <torch/csrc/autograd/python_variable.h>
diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp
index e4df2a8dc61da..7122532a54410 100644
--- a/tools/autograd/templates/python_variable_methods.cpp
+++ b/tools/autograd/templates/python_variable_methods.cpp
@@ -240,12 +240,7 @@ static PyObject * THPVariable_numel(PyObject* self, PyObject* args)
    if (jit::tracer::isTracing()) {
      return wrap(jit::tracer::getNumelOf(self_));
    } else {
-     auto si = self_.sym_numel();
-     if (si.is_symbolic()) {
-       return py::cast(si.toSymIntNodeImpl()).release().ptr();
-     } else {
-       return THPUtils_packInt64(si.as_int_unchecked());
-     }
+     return py::cast(self_.sym_numel()).release().ptr();
    }
    END_HANDLE_TH_ERRORS
 }
diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py
index 417d73f829a6e..0d1cdcb4ad06d 100644
--- a/tools/pyi/gen_pyi.py
+++ b/tools/pyi/gen_pyi.py
@@ -722,7 +722,7 @@ def gen_pyi(
                 binop += "_"
                 out_suffix = ""
             unsorted_tensor_method_hints[binop].append(
-                "def {}(self, other: Union[Tensor, Number, torch.SymIntNode, torch.SymFloatNode]{})"
+                "def {}(self, other: Union[Tensor, Number, torch.SymInt, torch.SymFloat]{})"
                 " -> Tensor: ...".format(binop, out_suffix)
             )
     for binop in ["add", "sub"]:
@@ -732,7 +732,7 @@ def gen_pyi(
                 binop += "_"
                 out_suffix = ""
             unsorted_tensor_method_hints[binop].append(
-                "def {}(self, other: Union[Tensor, Number, torch.SymIntNode, torch.SymFloatNode], "
+                "def {}(self, other: Union[Tensor, Number, torch.SymInt, torch.SymFloat], "
                 "*, alpha: Optional[Number]=1{})"
                 " -> Tensor: ...".format(binop, out_suffix)
             )
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 792e231999163..8b5a5d8e83b3d 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -169,20 +169,6 @@ class Future(object):
 
 def _jit_set_num_profiled_runs(num: _size) -> _size: ...
 
-class SymIntNode(object):
-    def get_pyobj(self) -> Any: ...
-
-    @staticmethod
-    def new_symint(obj) -> SymIntNode: ...
-
-class SymFloatNode(object):
-    def get_pyobj(self) -> Any: ...
-
-    @staticmethod
-    def new_symfloat(obj) -> SymFloatNode: ...
-
-    def __ceil__(self) -> SymIntNode: ...
-
 # Defined in torch/csrc/jit/passes/xnnpack_rewrite.h
 class MobileOptimizerType:
     ...
diff --git a/torch/__init__.py b/torch/__init__.py
index 63995d6ec7f69..c2f2c4c3327f2 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -47,7 +47,7 @@
     'is_deterministic_algorithms_warn_only_enabled',
     'set_deterministic_debug_mode', 'get_deterministic_debug_mode',
     'set_float32_matmul_precision', 'get_float32_matmul_precision',
-    'set_warn_always', 'is_warn_always_enabled',
+    'set_warn_always', 'is_warn_always_enabled', 'SymInt', 'SymFloat',
 ]
 
 ################################################################################
@@ -196,6 +196,67 @@ def _load_global_deps():
 if TYPE_CHECKING:
     import torch._C as _C
 
+class SymInt:
+    """
+    Like an int (including magic methods), but redirects all operations on the
+    wrapped node. This is used in particular to symbolically record operations
+    in the symbolic shape workflow.
+    """
+
+    def __init__(self, node):
+        from torch.fx.experimental.symbolic_shapes import SymNode
+        assert isinstance(node, SymNode)
+        # This field MUST be named node; C++ binding code assumes that this
+        # class has a field named node that stores SymNode
+        self.node = node
+
+    # Magic methods installed later
+
+    def __bool__(self):
+        return self.node.bool_()
+
+    def __int__(self):
+        return self.node.int_()
+
+    def __sym_float__(self):
+        return SymFloat(self.node.sym_float())
+
+    def __repr__(self):
+        return self.node.str()
+
+    # For BC; direct access of node is OK too
+    def get_pyobj(self):
+        return self.node
+
+class SymFloat:
+    """
+    Like an float (including magic methods), but redirects all operations on the
+    wrapped node. This is used in particular to symbolically record operations
+    in the symbolic shape workflow.
+    """
+
+    def __init__(self, node):
+        from torch.fx.experimental.symbolic_shapes import SymNode
+        assert isinstance(node, SymNode)
+        # This field MUST be named node; C++ binding code assumes that this
+        # class has a field named node that stores SymNode
+        self.node = node
+
+    # Magic methods installed later
+
+    def __bool__(self):
+        return self.node.bool_()
+
+    def __sym_int__(self):
+        return SymInt(self.node.sym_int())
+
+    def __repr__(self):
+        return self.node.str()
+
+    # For BC; direct access of node is OK too
+    def get_pyobj(self):
+        return self.node
+
 # Check to see if we can load C extensions, and if not provide some guidance
 # on what the problem might be.
 try:
@@ -941,7 +1002,6 @@ def compiled_with_cxx11_abi():
     lstsq,
 )
 
-
 def _register_device_module(device_type, module):
     r"""Register an external runtime module of the specific :attr:`device_type`
     supported by torch.
@@ -971,3 +1031,6 @@ def _register_device_module(device_type, module):
     import torch.cuda._sanitizer as csan
 
     csan.enable_cuda_sanitizer()
+
+# Populate magic methods on SymInt and SymFloat
+import torch.fx.experimental.symbolic_shapes
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 864d2c4ca3e0f..ab4cbf62ce36a 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -337,7 +337,7 @@ def create(cls, tx, proxy, example_value=None, **options):
             from . import UserDefinedObjectVariable
 
             return UserDefinedObjectVariable(example_value)
-        elif isinstance(example_value, torch.SymIntNode):
+        elif isinstance(example_value, torch.SymInt):
             proxy.node.meta["example_value"] = example_value
             return cls(proxy, **options)
         else:
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 3e274be506157..71934419de2fd 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -40,11 +40,9 @@ def symbolic_sizes_strides(self, ex: torch.Tensor):
         else:
             size, stride = self._shape_env.create_symbolic_sizes_strides(ex)
 
-        size = [
-            i.get_pyobj().expr if isinstance(i, torch.SymIntNode) else i for i in size
-        ]
+        size = [i.get_pyobj().expr if isinstance(i, torch.SymInt) else i for i in size]
         stride = [
-            i.get_pyobj().expr if isinstance(i, torch.SymIntNode) else i for i in stride
+            i.get_pyobj().expr if isinstance(i, torch.SymInt) else i for i in stride
         ]
         return size, stride
 
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index b54019ef031c9..bf71b0069585e 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -392,8 +392,8 @@ def _elementwise_meta(
     # Number case
     # NOTE: this case is not currently exercised
     # TODO: fix number type promotion (bool, complex->float)
-    assert not isinstance(number, torch.SymIntNode), "NYI"
-    assert not isinstance(number, torch.SymFloatNode), "NYI"
+    assert not isinstance(number, torch.SymInt), "NYI"
+    assert not isinstance(number, torch.SymFloat), "NYI"
     return TensorMeta(number)
 
 
@@ -932,7 +932,7 @@ def _fill_aten(a: Tensor, value: NumberType) -> Tensor:
 # div prim performs truncation division on integer inputs
 #   and true division for floating and complex inputs
 def _div_aten(a, b):
-    is_integral = isinstance(a, (bool, int, torch.SymIntNode)) or (
+    is_integral = isinstance(a, (bool, int, torch.SymInt)) or (
         isinstance(a, torch.Tensor) and utils.is_integer_dtype(a.dtype)
     )
 
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index d8321ac9a47c7..ee4dd38a655c6 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -42,18 +42,18 @@ def getnvFuserDtype(dtype: Union[torch.dtype, NumberTypeType]):
 StrideType = Union[List[int], Tuple[int, ...]]
 DimsType = Union[int, List[int], Tuple[int, ...]]
 DimsSequenceType = Union[List[int], Tuple[int, ...]]
-# TODO: Type[torch.SymIntNode], Type[torch.SymFloatNode]
+# TODO: Type[torch.SymInt], Type[torch.SymFloat]
 NumberTypeType = Union[Type[bool], Type[int], Type[float], Type[complex]]
 # TODO: This needs a lot more type annotations
-# NumberType = Union[bool, int, float, complex, torch.SymIntNode, torch.SymFloatNode]
+# NumberType = Union[bool, int, float, complex, torch.SymInt, torch.SymFloat]
 NumberType = Union[bool, int, float, complex]
 
-Number = (bool, int, float, complex, torch.SymIntNode, torch.SymFloatNode)
+Number = (bool, int, float, complex, torch.SymInt, torch.SymFloat)
 # I don't call it Integral because numbers.Integral includes bool, but IntLike
 # does not
 Dim = int
-IntLike = (int, torch.SymIntNode)
-FloatLike = (float, torch.SymFloatNode)
+IntLike = (int, torch.SymInt)
+FloatLike = (float, torch.SymFloat)
 IntWithoutSymInt = int
 FloatWithoutSymFloat = float
 DeviceLikeType = Union[str, torch.device]
@@ -1113,10 +1113,10 @@ class RETURN_TYPE(Enum):
 
 
 # TODO: when NumberType contains the sym types, can simplify this
-def number_type(x: Union[NumberType, torch.SymIntNode, torch.SymFloatNode]) -> Type:
-    if isinstance(x, torch.SymIntNode):
+def number_type(x: Union[NumberType, torch.SymInt, torch.SymFloat]) -> Type:
+    if isinstance(x, torch.SymInt):
         return int
-    elif isinstance(x, torch.SymFloatNode):
+    elif isinstance(x, torch.SymFloat):
         return float
     else:
         return type(x)
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 652c24c9a521d..c5bf346f8cb5f 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -656,7 +656,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                 return args[0].fake_device
 
         flat_arg_fake_tensors = tree_flatten_only(FakeTensor, (args, kwargs))
-        flat_symints = tree_flatten_only(torch.SymIntNode, (args, kwargs))
+        flat_symints = tree_flatten_only(torch.SymInt, (args, kwargs))
         has_symbolic_sizes = (
             any([i._has_symbolic_sizes_strides for i in flat_arg_fake_tensors])
             or len(flat_symints) > 0
diff --git a/torch/csrc/Size.cpp b/torch/csrc/Size.cpp
index 36419f20eccd0..ba4090bfb6845 100644
--- a/torch/csrc/Size.cpp
+++ b/torch/csrc/Size.cpp
@@ -59,7 +59,7 @@ PyObject* THPSize_NewFromSymSizes(const at::Tensor& self_) {
       TORCH_CHECK(
           !torch::jit::tracer::isTracing(),
           "JIT Tracing of SymInts isn't supported");
-      auto py_symint = py::cast(si.toSymIntNodeImpl()).release().ptr();
+      auto py_symint = py::cast(si).release().ptr();
       if (!py_symint)
         throw python_error();
       PyTuple_SET_ITEM(ret.get(), i, py_symint);
@@ -98,7 +98,7 @@ static PyObject* THPSize_pynew(
       if (THPUtils_checkLong(item)) {
         continue;
       }
-      if (torch::is_symint_node(item)) {
+      if (torch::is_symint(item)) {
         continue;
       }
       if (torch::jit::tracer::isTracing() && isTracedZeroDimVar(item)) {
@@ -135,7 +135,7 @@ static PyObject* THPSize_repr(THPSize* self) {
     auto item = PyTuple_GET_ITEM(self, i);
     auto ih = py::handle(item);
 
-    repr += torch::is_symint_node(ih)
+    repr += torch::is_symint(ih)
         ? std::string(py::str(ih))
         : std::to_string(THPUtils_unpackLong(PyTuple_GET_ITEM(self, i)));
   }
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 66b8ad2d8351b..7e07f3ff32cda 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -2646,9 +2646,8 @@ c10::SymInt ConcretePyInterpreterVTable::sym_numel(
         "Cannot call numel on a tensor with symbolic shapes/strides");
     return self->sym_numel_default();
   }
-  return torch::is_symint_node(out)
-      ? out.cast<c10::SymIntNodeImpl*>()->toSymInt()
-      : c10::SymInt{py::cast<int64_t>(out)};
+  return torch::is_symint(out) ? out.cast<c10::SymInt>()
+                               : c10::SymInt{py::cast<int64_t>(out)};
 }
 
 c10::SymInt ConcretePyInterpreterVTable::sym_storage_offset(
@@ -2669,9 +2668,8 @@ c10::SymInt ConcretePyInterpreterVTable::sym_storage_offset(
   if (out.is(py::none())) {
     return self->sym_storage_offset_default();
   }
-  return torch::is_symint_node(out)
-      ? out.cast<c10::SymIntNodeImpl*>()->toSymInt()
-      : c10::SymInt{py::cast<int64_t>(out)};
+  return torch::is_symint(out) ? out.cast<c10::SymInt>()
+                               : c10::SymInt{py::cast<int64_t>(out)};
 }
 
 c10::SymIntArrayRef ConcretePyInterpreterVTable::sym_strides(
@@ -2701,9 +2699,8 @@ c10::SymIntArrayRef ConcretePyInterpreterVTable::sym_strides(
   py::list symints;
   for (auto it = out.begin(); it != out.end(); it++) {
     auto elm = *it;
-    auto si = torch::is_symint_node(elm)
-        ? elm.cast<c10::SymIntNodeImpl*>()->toSymInt()
-        : c10::SymInt{py::cast<int64_t>(elm)};
+    auto si = torch::is_symint(elm) ? elm.cast<c10::SymInt>()
+                                    : c10::SymInt{py::cast<int64_t>(elm)};
     symints.append(si.as_int_unchecked());
   }
 
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 0bb959a3c61e0..91eecfa4596e8 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -13,7 +13,7 @@
 #if (!defined(FBCODE_CAFFE2) && defined(BUILD_ONEDNN_GRAPH))
 #include <torch/csrc/jit/codegen/onednn/interface.h>
 #endif
-#include <c10/core/SymIntNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 #include <torch/csrc/jit/frontend/ir_emitter.h>
 #include <torch/csrc/jit/frontend/tracer.h>
 #include <torch/csrc/jit/ir/irparser.h>
@@ -99,7 +99,6 @@
 #include <torch/csrc/jit/tensorexpr/tensorexpr_init.h>
 #include <torch/csrc/utils/cpp_stacktraces.h>
 
-#include <c10/core/SymFloat.h>
 #include <c10/macros/Export.h>
 #include <c10/util/irange.h>
 #include <c10/util/signal_handler.h>
@@ -126,249 +125,11 @@ using c10::Argument;
 using c10::FunctionSchema;
 using c10::SchemaArgType;
 using c10::SchemaArgument;
-using c10::SymFloat;
-using c10::SymFloatNode;
-using c10::SymIntNode;
+using c10::SymNode;
 using caffe2::serialize::PyTorchStreamReader;
 using caffe2::serialize::PyTorchStreamWriter;
 using torch::utils::SchemaInfo;
 
-static c10::SymIntNode toSymIntNode(c10::SymIntNode a, py::object b) {
-  return torch::is_symint_node(b) ? b.cast<c10::SymIntNode>()
-                                  : a->wrap(b.cast<int64_t>());
-}
-
-static c10::SymFloatNode toSymFloatNode(c10::SymFloatNode a, py::object b) {
-  if (torch::is_symfloat_node(b)) {
-    return b.cast<c10::SymFloatNode>();
-  } else if (torch::is_symint_node(b)) {
-    return b.cast<c10::SymIntNode>()->sym_float();
-  } else {
-    return a->wrap(b.cast<double>());
-  }
-}
-
-class PythonSymIntNodeImpl : public c10::SymIntNodeImpl {
- public:
-  PythonSymIntNodeImpl(py::object pyobj) : c10::SymIntNodeImpl() {
-    pyobj_ = std::make_shared<c10::SafePyObject>(
-        pyobj.release().ptr(), getPyInterpreter());
-  };
-
-  virtual SymIntNode clone() override {
-    py::gil_scoped_acquire acquire;
-    auto r = getPyObj().attr("clone")();
-    return c10::make_intrusive<PythonSymIntNodeImpl>(r);
-  }
-
-  virtual SymIntNode wrap(int64_t num) override {
-    py::gil_scoped_acquire acquire;
-    auto r = getPyObj().attr("wrap")(num);
-    return c10::make_intrusive<PythonSymIntNodeImpl>(r);
-  }
-
-  virtual bool bool_() override {
-    py::gil_scoped_acquire acquire;
-    return getPyObj().attr("__bool__")().is(py::handle(Py_True));
-  }
-
-  virtual int64_t guard_int(const char* file, int64_t line) override {
-    py::gil_scoped_acquire acquire;
-    return getPyObj().attr("guard_int")(file, line).cast<int64_t>();
-  }
-
-  virtual int64_t int_() override {
-    py::gil_scoped_acquire acquire;
-    return getPyObj().attr("__int__")().cast<int64_t>();
-  }
-
-  SymFloatNode sym_float() override;
-
-  virtual std::string str() override {
-    py::gil_scoped_acquire acquire;
-    return getPyObj().attr("__str__")().cast<std::string>();
-  }
-
-  virtual SymIntNode dispatch_common_(
-      const char* fname,
-      const SymIntNode& other) {
-    auto pother = dynamic_cast<PythonSymIntNodeImpl*>(other.get());
-    TORCH_CHECK(pother);
-    py::gil_scoped_acquire acquire;
-    auto r = getPyObj().attr(fname)(pother->getPyObj());
-    return c10::make_intrusive<PythonSymIntNodeImpl>(r);
-  }
-
-  virtual SymIntNode dispatch_common_(const char* fname) {
-    py::gil_scoped_acquire acquire;
-    auto r = getPyObj().attr(fname)();
-    return c10::make_intrusive<PythonSymIntNodeImpl>(r);
-  }
-
-  virtual SymIntNode add(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode sub(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode mul(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymFloatNode truediv(const SymIntNode& other) override;
-
-  virtual SymIntNode floordiv(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode mod(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode eq(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode gt(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode lt(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode le(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode ge(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode min(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-  virtual SymIntNode max(const SymIntNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  virtual SymIntNode ceil() override {
-    return dispatch_common_(__FUNCTION__);
-  }
-
-  virtual SymIntNode neg() override {
-    return dispatch_common_(__FUNCTION__);
-  }
-
-  py::handle getPyObj() {
-    return py::handle(pyobj_.get()->ptr(getPyInterpreter()));
-  }
-  std::shared_ptr<c10::SafePyObject> pyobj_ = nullptr;
-};
-
-class PythonSymFloatNodeImpl : public c10::SymFloatNodeImpl {
- public:
-  PythonSymFloatNodeImpl(py::object pyobj) : c10::SymFloatNodeImpl() {
-    pyobj_ = std::make_shared<c10::SafePyObject>(
-        pyobj.release().ptr(), getPyInterpreter());
-  };
-
-  virtual SymFloatNode wrap(double num) override {
-    py::gil_scoped_acquire acquire;
-    auto r = getPyObj().attr("wrap")(num);
-    return c10::make_intrusive<PythonSymFloatNodeImpl>(r);
-  }
-
-  virtual std::string str() override {
-    py::gil_scoped_acquire acquire;
-    return getPyObj().attr("__str__")().cast<std::string>();
-  }
-
-  SymFloatNode dispatch_common_(const char* fname, const SymFloatNode& other) {
-    auto pother = dynamic_cast<PythonSymFloatNodeImpl*>(other.get());
-    TORCH_CHECK(pother);
-    py::gil_scoped_acquire acquire;
-    auto r = getPyObj().attr(fname)(pother->getPyObj());
-    return c10::make_intrusive<PythonSymFloatNodeImpl>(r);
-  }
-
-  SymFloatNode add(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymFloatNode sub(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymFloatNode mul(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymFloatNode truediv(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymFloatNode pow(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymFloatNode eq(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymFloatNode gt(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymFloatNode lt(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymFloatNode le(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymFloatNode ge(const SymFloatNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
-  }
-
-  SymIntNode ceil() override;
-  SymIntNode floor() override;
-
-  py::handle getPyObj() {
-    return py::handle(pyobj_.get()->ptr(getPyInterpreter()));
-  }
-  std::shared_ptr<c10::SafePyObject> pyobj_ = nullptr;
-};
-
-SymFloatNode PythonSymIntNodeImpl::truediv(const SymIntNode& other) {
-  auto pother = dynamic_cast<PythonSymIntNodeImpl*>(other.get());
-  TORCH_CHECK(pother);
-  py::gil_scoped_acquire acquire;
-  auto r = getPyObj().attr("truediv")(pother->getPyObj());
-  return c10::make_intrusive<PythonSymFloatNodeImpl>(r);
-}
-
-SymFloatNode PythonSymIntNodeImpl::sym_float() {
-  py::gil_scoped_acquire acquire;
-  return c10::make_intrusive<PythonSymFloatNodeImpl>(
-      getPyObj().attr("__sym_float__")());
-}
-
-SymIntNode PythonSymFloatNodeImpl::ceil() {
-  py::gil_scoped_acquire acquire;
-  auto r = getPyObj().attr("ceil")();
-  return c10::make_intrusive<PythonSymIntNodeImpl>(r);
-}
-
-SymIntNode PythonSymFloatNodeImpl::floor() {
-  py::gil_scoped_acquire acquire;
-  auto r = getPyObj().attr("floor")();
-  return c10::make_intrusive<PythonSymIntNodeImpl>(r);
-}
-
 namespace {
 
 using autograd::variable_list;
@@ -1381,276 +1142,41 @@ void initJITBindings(PyObject* module) {
         }
       });
 
-  auto symint_class =
-      py::class_<c10::SymIntNodeImpl, c10::SymIntNode>(m, "SymIntNode")
-          .def_static(
-              "new_symint",
-              [](py::object obj) -> c10::SymIntNode {
-                return c10::make_intrusive<PythonSymIntNodeImpl>(obj);
-              })
-          .def(
-              "get_pyobj",
-              [](c10::SymIntNode a) -> py::object {
-                if (auto* psn = dynamic_cast<PythonSymIntNodeImpl*>(a.get())) {
-                  return py::reinterpret_borrow<py::object>(psn->getPyObj());
-                }
-                return py::none();
-              })
-          .def(
-              "__add__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->add(snb);
-              })
-          .def(
-              "__radd__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return snb->add(a);
-              })
-          .def(
-              "__sub__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->sub(snb);
-              })
-          .def(
-              "__rsub__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return snb->sub(a);
-              })
-          .def(
-              "__mul__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->mul(snb);
-              })
-          .def(
-              "__rmul__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return snb->mul(a);
-              })
-          .def(
-              "__truediv__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymFloatNode {
-                auto snb = toSymIntNode(a, b);
-                return a->truediv(snb);
-              })
-          .def(
-              "__rtruediv__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymFloatNode {
-                auto snb = toSymIntNode(a, b);
-                return snb->truediv(a);
-              })
-          .def(
-              "__floordiv__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->floordiv(snb);
-              })
-          .def(
-              "__rfloordiv__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return snb->floordiv(a);
-              })
-          .def(
-              "__mod__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->mod(snb);
-              })
-          .def(
-              "__rmod__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return snb->mod(a);
-              })
-          .def(
-              "__pow__",
-              [](c10::SymIntNode a, py::object b) -> py::object {
-                if (PyFloat_Check(b.ptr())) {
-                  auto float_a = a->sym_float();
-                  return py::cast(
-                      float_a->pow(float_a->wrap(py::cast<double>(b))));
-                }
-                // TODO: integer pow
-                return py::reinterpret_borrow<py::object>(Py_NotImplemented);
-              })
-          // TODO: rpow
-          .def(
-              "__eq__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->eq(snb);
-              })
-          .def(
-              "__gt__",
-              [](c10::SymIntNode a, py::object b) {
-                auto snb = toSymIntNode(a, b);
-                return a->gt(snb);
-              })
-          .def(
-              "__lt__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->lt(snb);
-              })
-          .def(
-              "__le__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->le(snb);
-              })
-          .def(
-              "__ge__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->ge(snb);
-              })
-          .def(
-              "__ceil__",
-              [](c10::SymIntNode a) -> c10::SymIntNode { return a->ceil(); })
-          .def(
-              "__neg__",
-              [](c10::SymIntNode a) -> c10::SymIntNode { return a->neg(); })
-          .def(
-              "__min__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->min(snb);
-              })
-          .def(
-              "__max__",
-              [](c10::SymIntNode a, py::object b) -> c10::SymIntNode {
-                auto snb = toSymIntNode(a, b);
-                return a->max(snb);
-              })
-          .def("__bool__", [](c10::SymIntNode a) { return a->bool_(); })
-          .def("__int__", [](c10::SymIntNode a) { return a->int_(); })
-          // Intentionally don't set file line, as the Python backtrace matters
-          // more here
-          .def(
-              "guard_int",
-              [](c10::SymIntNode a) { return a->guard_int(nullptr, 0); })
-          .def(
-              "__sym_float__",
-              [](c10::SymIntNode a) {
-                // TODO: remove dynamic cast when sym_float is in base class
-                auto* psn = dynamic_cast<PythonSymIntNodeImpl*>(a.get());
-                TORCH_INTERNAL_ASSERT(psn);
-                return psn->sym_float();
-              })
-          .def("__str__", [](c10::SymIntNode a) { return a->str(); })
-          .def("__repr__", [](c10::SymIntNode a) { return a->str(); });
-
-  py::class_<c10::SymFloatNodeImpl, c10::SymFloatNode>(m, "SymFloatNode")
-      .def_static(
-          "new_symfloat",
-          [](py::object obj) -> c10::SymFloatNode {
-            return c10::make_intrusive<PythonSymFloatNodeImpl>(obj);
-          })
-      .def(
-          "__add__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return a->add(snb);
-          })
-      .def(
-          "__radd__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return snb->add(a);
-          })
-      .def(
-          "__sub__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return a->sub(snb);
-          })
-      .def(
-          "__mul__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return a->mul(snb);
-          })
-      .def(
-          "__rmul__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return snb->mul(a);
-          })
-      .def(
-          "__truediv__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return a->truediv(snb);
-          })
-      .def(
-          "__rtruediv__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return snb->truediv(a);
-          })
-      .def(
-          "__eq__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return a->eq(snb);
-          })
-      .def(
-          "__gt__",
-          [](c10::SymFloatNode a, py::object b) {
-            auto snb = toSymFloatNode(a, b);
-            return a->gt(snb);
-          })
-      .def(
-          "__lt__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return a->lt(snb);
-          })
-      .def(
-          "__le__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return a->le(snb);
-          })
-      .def(
-          "__ge__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return a->ge(snb);
-          })
-      .def(
-          "__pow__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return a->pow(snb);
-          })
-      .def(
-          "__rpow__",
-          [](c10::SymFloatNode a, py::object b) -> c10::SymFloatNode {
-            auto snb = toSymFloatNode(a, b);
-            return snb->pow(a);
-          })
-      .def(
-          "__ceil__",
-          [](c10::SymFloatNode a) -> c10::SymIntNode { return a->ceil(); })
-      .def(
-          "__floor__",
-          [](c10::SymFloatNode a) -> c10::SymIntNode { return a->floor(); })
-      .def(
-          "get_pyobj",
-          [](c10::SymFloatNode a) -> py::object {
-            if (auto* psn = dynamic_cast<PythonSymFloatNodeImpl*>(a.get())) {
-              return py::reinterpret_borrow<py::object>(psn->getPyObj());
-            }
-            return py::none();
-          })
-      .def("__str__", [](c10::SymFloatNode a) { return a->str(); });
+  // NB: This isn't actually used for regular PyTorch symbolic tracing;
+  // XLA is what needs this
+#define SYMNODE_UNARY(n) .def(#n, [](c10::SymNode a) { return a->n(); })
+#define SYMNODE_UNARY2(n2, n) .def(#n2, [](c10::SymNode a) { return a->n(); })
+#define SYMNODE_BINARY(n) \
+  .def(#n, [](c10::SymNode a, c10::SymNode b) { return a->n(b); })
+  auto symnode_class =
+      py::class_<c10::SymNodeImpl, c10::SymNode>(m, "_SymNode")
+      // These DO NOT install magic methods; the SymInt/SymFloat wrapper in
+      // Python is responsible for this
+      SYMNODE_UNARY(clone)
+      // Named these for consistency with inner python class, but maybe
+      // should change the python side
+      SYMNODE_UNARY2(__bool__, bool_) SYMNODE_UNARY2(__int__, int_)
+          SYMNODE_UNARY2(__sym_int__, sym_int) SYMNODE_UNARY2(
+              __sym_float__, sym_float) SYMNODE_BINARY(add) SYMNODE_BINARY(sub)
+              SYMNODE_BINARY(mul) SYMNODE_BINARY(truediv) SYMNODE_BINARY(pow)
+                  SYMNODE_BINARY(floordiv) SYMNODE_BINARY(mod) SYMNODE_BINARY(
+                      eq) SYMNODE_BINARY(gt) SYMNODE_BINARY(lt)
+                      SYMNODE_BINARY(le) SYMNODE_BINARY(ge) SYMNODE_BINARY(min)
+                          SYMNODE_BINARY(max) SYMNODE_UNARY(ceil)
+                              SYMNODE_UNARY(floor) SYMNODE_UNARY(neg)
+                                  // Intentionally don't set file line, as the
+                                  // Python backtrace matters more here
+                                  .def(
+                                      "guard_int",
+                                      [](c10::SymNode a) {
+                                        return a->guard_int(nullptr, 0);
+                                      })
+                                  .def(
+                                      "__str__",
+                                      [](c10::SymNode a) { return a->str(); })
+                                  .def("__repr__", [](c10::SymNode a) {
+                                    return a->str();
+                                  });
 
   // NOLINTNEXTLINE(bugprone-unused-raii)
   py::class_<CompleteArgumentSpec>(m, "CompleteArgumentSpec")
diff --git a/torch/csrc/jit/python/pybind_utils.cpp b/torch/csrc/jit/python/pybind_utils.cpp
index 68317f76524b2..47089fcc89694 100644
--- a/torch/csrc/jit/python/pybind_utils.cpp
+++ b/torch/csrc/jit/python/pybind_utils.cpp
@@ -80,10 +80,10 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N) {
           scalar = at::Scalar(THPUtils_unpackComplexDouble(obj.ptr()));
         } else if (THPUtils_checkDouble(obj.ptr())) {
           scalar = at::Scalar(THPUtils_unpackDouble(obj.ptr()));
-        } else if (torch::is_symint_node(py::handle(obj))) {
+        } else if (torch::is_symint(py::handle(obj))) {
           save_symint = true;
           scalar = at::Scalar(7777777);
-        } else if (torch::is_symfloat_node(py::handle(obj))) {
+        } else if (torch::is_symfloat(py::handle(obj))) {
           save_symint = true;
           scalar = at::Scalar(std::numeric_limits<double>::quiet_NaN());
         } else {
@@ -161,12 +161,12 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N) {
       return py::cast<int64_t>(obj);
     }
     case TypeKind::SymIntType:
-      if (torch::is_symint_node(obj.ptr())) {
+      if (torch::is_symint(obj.ptr())) {
         return py::cast<c10::SymInt>(obj);
       }
       return py::cast<int64_t>(obj);
     case TypeKind::SymFloatType:
-      if (torch::is_symfloat_node(obj.ptr())) {
+      if (torch::is_symfloat(obj.ptr())) {
         return py::cast<c10::SymFloat>(obj);
       }
       return py::cast<double>(obj);
@@ -253,7 +253,7 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N) {
           bool is_symbolic = false;
           for (auto it = obj.begin(); it != obj.end(); it++) {
             auto elm = *it;
-            if (torch::is_symint_node(elm)) {
+            if (torch::is_symint(elm)) {
               is_symbolic = true;
               break;
             }
@@ -269,7 +269,7 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N) {
           for (auto it = obj.begin(); it != obj.end(); it++) {
             auto elm = *it;
             // TODO: what about SymInt conversion to SymFloat?
-            if (torch::is_symfloat_node(elm)) {
+            if (torch::is_symfloat(elm)) {
               is_symbolic = true;
               break;
             }
@@ -442,9 +442,9 @@ IValue toIValue(py::handle obj, const TypePtr& type, c10::optional<int32_t> N) {
       } else if (PyComplex_CheckExact(obj.ptr())) {
         auto c_obj = py::cast<std::complex<double>>(obj.ptr());
         return static_cast<c10::complex<double>>(c_obj);
-      } else if (torch::is_symint_node(obj)) {
+      } else if (torch::is_symint(obj)) {
         return py::cast<c10::SymInt>(obj);
-      } else if (torch::is_symfloat_node(obj)) {
+      } else if (torch::is_symfloat(obj)) {
         return py::cast<c10::SymFloat>(obj);
       } else {
         throw py::cast_error(
diff --git a/torch/csrc/lazy/core/ir_builder.h b/torch/csrc/lazy/core/ir_builder.h
index 20e4730d50135..95605eab1e995 100644
--- a/torch/csrc/lazy/core/ir_builder.h
+++ b/torch/csrc/lazy/core/ir_builder.h
@@ -136,10 +136,10 @@ static inline NodePtr MakeSizeDiv(const Value& a, const Value& b) {
 
 inline Value GetSymIntValue(c10::SymInt a) {
   return Value(
-      a.is_symbolic() ? dynamic_cast<torch::lazy::SymIntNodeImpl*>(
-                            a.toSymIntNodeImpl().get())
-                            ->node_
-                      : MakeScalar(a.as_int_unchecked(), at::kLong),
+      a.is_symbolic()
+          ? dynamic_cast<torch::lazy::SymNodeImpl*>(a.toSymNodeImpl().get())
+                ->node_
+          : MakeScalar(a.as_int_unchecked(), at::kLong),
       0);
 }
 
diff --git a/torch/csrc/lazy/core/shape_inference.cpp b/torch/csrc/lazy/core/shape_inference.cpp
index bcc73a3ed79fd..df82fd45fe29b 100644
--- a/torch/csrc/lazy/core/shape_inference.cpp
+++ b/torch/csrc/lazy/core/shape_inference.cpp
@@ -451,11 +451,11 @@ std::vector<Shape> compute_shape_expand(
   std::vector<int64_t> target_size(_sizes.size());
   for (const auto idx : c10::irange(_sizes.size())) {
     if (_sizes[idx].is_symbolic()) {
-      c10::SymIntNode symbolicIntNode = _sizes[idx].toSymIntNodeImpl();
-      auto* lazySymIntNode =
-          dynamic_cast<torch::lazy::SymIntNodeImpl*>(symbolicIntNode.get());
-      TORCH_INTERNAL_ASSERT(lazySymIntNode);
-      auto size_node = lazySymIntNode->node_;
+      c10::SymNode symbolicIntNode = _sizes[idx].toSymNodeImpl();
+      auto* lazySymNode =
+          dynamic_cast<torch::lazy::SymNodeImpl*>(symbolicIntNode.get());
+      TORCH_INTERNAL_ASSERT(lazySymNode);
+      auto size_node = lazySymNode->node_;
       auto static_value =
           std::dynamic_pointer_cast<torch::lazy::DimensionNode>(size_node)
               ->getStaticValue();
diff --git a/torch/csrc/lazy/core/shape_inference.h b/torch/csrc/lazy/core/shape_inference.h
index a1b51495fb3fd..9ceb45d6b23d9 100644
--- a/torch/csrc/lazy/core/shape_inference.h
+++ b/torch/csrc/lazy/core/shape_inference.h
@@ -4,7 +4,7 @@
 #include <c10/core/ScalarType.h>
 #include <c10/core/SymInt.h>
 #include <c10/core/SymIntArrayRef.h>
-#include <c10/core/SymIntNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 #include <c10/macros/Export.h>
 #include <c10/util/Optional.h>
 #include <torch/csrc/lazy/backend/backend_data.h>
diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h
index 85ea6ab4f4c61..8dfa5a077c973 100644
--- a/torch/csrc/lazy/core/tensor.h
+++ b/torch/csrc/lazy/core/tensor.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <c10/core/SymIntNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 #include <c10/util/intrusive_ptr.h>
 #include <torch/csrc/lazy/backend/backend_data.h>
 #include <torch/csrc/lazy/backend/backend_device.h>
@@ -10,12 +10,9 @@
 namespace torch {
 namespace lazy {
 
-class TORCH_API SymIntNodeImpl : public c10::SymIntNodeImpl {
+class TORCH_API SymNodeImpl : public c10::SymNodeImpl {
  public:
-  SymIntNodeImpl(NodePtr ptr) : node_(std::move(ptr)){};
-  c10::SymIntNode add(const c10::SymIntNode& other) override {
-    TORCH_CHECK(false, "NYI");
-  }
+  SymNodeImpl(NodePtr ptr) : node_(std::move(ptr)){};
   NodePtr node_;
 };
 
diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp
index f338d3f196adc..f03763f9dca38 100644
--- a/torch/csrc/utils/python_arg_parser.cpp
+++ b/torch/csrc/utils/python_arg_parser.cpp
@@ -685,7 +685,7 @@ static bool is_int_list(
     // NB: do NOT check that the later arguments are ints, as this is
     // BC-breaking for FX
     for (int i = 1; i < len; i++) {
-      if (torch::is_symint_node(
+      if (torch::is_symint(
               py::reinterpret_steal<py::object>(PySequence_GetItem(obj, i)))) {
         if (failed_idx != nullptr) {
           *failed_idx = i;
@@ -716,9 +716,9 @@ static bool is_int_list(
 static bool is_int_or_symint(PyObject* obj) {
   // THPUtils_checkIndex may call __index__ or __int__
   // which may have side effects if obj is a symint node
-  // so we do `is_symint_node` check first
+  // so we do `is_symint` check first
   // TODO: maybe we should be using checkLong here?
-  return torch::is_symint_node(py::handle(obj)) || THPUtils_checkIndex(obj);
+  return torch::is_symint(py::handle(obj)) || THPUtils_checkIndex(obj);
 }
 
 static bool is_int_or_symint_list(
@@ -1570,13 +1570,13 @@ at::Tensor PythonArgs::tensor_slow(int i) {
     // NB: we DO NOT put symbolic ints/floats into the Scalar itself,
     // because although Scalar supports SymInt/SymFloat, the subsequent
     // conversion to Tensor does not.  Instead, do it out of band.
-  } else if (torch::is_symint_node(py::handle(obj))) {
+  } else if (torch::is_symint(py::handle(obj))) {
     save_symint = true;
     // This scalar value doesn't matter, it shouldn't ever actually
     // get read out.  Make it a big and weird looking number to help
     // people figure out if there's aproblem.
     scalar = at::Scalar(7777777);
-  } else if (torch::is_symfloat_node(py::handle(obj))) {
+  } else if (torch::is_symfloat(py::handle(obj))) {
     save_symint = true;
     scalar = at::Scalar(std::numeric_limits<double>::quiet_NaN());
   } else {
@@ -1633,11 +1633,11 @@ at::Scalar PythonArgs::scalar_slow(PyObject* arg) {
     return at::Scalar(THPUtils_unpackComplexDouble(arg));
   }
 
-  if (torch::is_symint_node(arg)) {
+  if (torch::is_symint(arg)) {
     return at::Scalar(py::cast<c10::SymInt>(arg));
   }
 
-  if (torch::is_symfloat_node(arg)) {
+  if (torch::is_symfloat(arg)) {
     return at::Scalar(py::cast<c10::SymFloat>(arg));
   }
 
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index acb830addf8f7..df084821ba255 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -61,6 +61,7 @@
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/python_numbers.h>
 #include <torch/csrc/utils/python_strings.h>
+#include <torch/csrc/utils/python_symnode.h>
 #include <torch/csrc/utils/six.h>
 
 #include <ATen/PythonTorchFunctionTLS.h>
@@ -69,7 +70,7 @@
 #include <c10/util/irange.h>
 
 #include <c10/core/SymFloat.h>
-#include <c10/core/SymIntNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
 
 #include <array>
 #include <cstddef>
@@ -78,30 +79,6 @@
 #include <string>
 #include <vector>
 
-namespace torch {
-
-inline bool is_symint_node(py::handle obj) {
-  auto static tp_symn = py::type::of<c10::SymIntNodeImpl>();
-  if (py::isinstance(obj, tp_symn)) {
-    TORCH_CHECK(
-        !jit::tracer::isTracing(), "JIT tracing of SymInts isn't supported!");
-    return true;
-  }
-  return false;
-}
-
-inline bool is_symfloat_node(py::handle obj) {
-  auto static tp_symn = py::type::of<c10::SymFloatNodeImpl>();
-  if (py::isinstance(obj, tp_symn)) {
-    TORCH_CHECK(
-        !jit::tracer::isTracing(), "JIT tracing of SymFloats isn't supported!");
-    return true;
-  }
-  return false;
-}
-
-} // namespace torch
-
 namespace pybind11 {
 namespace detail {
 template <>
@@ -109,8 +86,10 @@ struct type_caster<c10::SymInt> {
  public:
   PYBIND11_TYPE_CASTER(c10::SymInt, _("SymInt"));
   bool load(py::handle src, bool) {
-    if (torch::is_symint_node(src)) {
-      value = src.cast<c10::SymIntNodeImpl*>()->toSymInt();
+    if (torch::is_symint(src)) {
+      value = c10::SymInt(static_cast<c10::SymNode>(
+          c10::make_intrusive<torch::impl::PythonSymNodeImpl>(
+              src.attr("node"))));
       return true;
     }
 
@@ -126,8 +105,15 @@ struct type_caster<c10::SymInt> {
       c10::SymInt si,
       return_value_policy /* policy */,
       handle /* parent */) {
-    return si.is_symbolic() ? py::cast(si.toSymIntNodeImpl()).release()
-                            : py::cast(si.expect_int()).release();
+    if (si.is_symbolic()) {
+      // TODO: generalize this to work with C++ backed class
+      auto* py_node = dynamic_cast<torch::impl::PythonSymNodeImpl*>(
+          si.toSymNodeImpl().get());
+      TORCH_INTERNAL_ASSERT(py_node);
+      return torch::get_symint_class()(py_node->getPyObj()).release();
+    } else {
+      return py::cast(si.as_int_unchecked()).release();
+    }
   }
 };
 
@@ -136,8 +122,10 @@ struct type_caster<c10::SymFloat> {
  public:
   PYBIND11_TYPE_CASTER(c10::SymFloat, _("SymFloat"));
   bool load(py::handle src, bool) {
-    if (torch::is_symfloat_node(src)) {
-      value = src.cast<c10::SymFloatNodeImpl*>()->toSymFloat();
+    if (torch::is_symfloat(src)) {
+      value = c10::SymFloat(static_cast<c10::SymNode>(
+          c10::make_intrusive<torch::impl::PythonSymNodeImpl>(
+              src.attr("node"))));
       return true;
     }
 
@@ -153,8 +141,15 @@ struct type_caster<c10::SymFloat> {
       c10::SymFloat si,
       return_value_policy /* policy */,
       handle /* parent */) {
-    return si.is_symbolic() ? py::cast(si.toSymFloatNodeImpl()).release()
-                            : py::cast(si.expect_float()).release();
+    if (si.is_symbolic()) {
+      // TODO: generalize this to work with C++ backed class
+      auto* py_node = dynamic_cast<torch::impl::PythonSymNodeImpl*>(
+          si.toSymNodeImpl().get());
+      TORCH_INTERNAL_ASSERT(py_node);
+      return torch::get_symfloat_class()(py_node->getPyObj()).release();
+    } else {
+      return py::cast(si.as_float_unchecked()).release();
+    }
   }
 };
 } // namespace detail
@@ -167,8 +162,7 @@ inline bool THPUtils_checkScalar(PyObject* obj) {
   }
 #endif
   return PyFloat_Check(obj) || PyLong_Check(obj) || PyComplex_Check(obj) ||
-      torch::is_symint_node(py::handle(obj)) ||
-      torch::is_symfloat_node(py::handle(obj));
+      torch::is_symint(py::handle(obj)) || torch::is_symfloat(py::handle(obj));
 }
 
 namespace torch {
@@ -574,7 +568,7 @@ inline std::vector<int64_t> PythonArgs::intlist(int i) {
 
 inline PyObject* toPyObject(c10::SymInt symint) {
   if (symint.is_symbolic()) {
-    auto r = py::cast(symint.toSymIntNodeImpl()).release().ptr();
+    auto r = py::cast(symint).release().ptr();
     TORCH_INTERNAL_ASSERT(r);
     return r;
   } else {
@@ -609,8 +603,8 @@ inline std::vector<c10::SymInt> PythonArgs::symintlist(int i) {
         size1, c10::SymInt(THPUtils_unpackIndex(args[i])));
   }
 
-  if (size1 > 0 && torch::is_symint_node(py::handle(args[i]))) {
-    auto si = py::handle(args[i]).cast<c10::SymIntNodeImpl*>()->toSymInt();
+  if (size1 > 0 && torch::is_symint(py::handle(args[i]))) {
+    auto si = py::handle(args[i]).cast<c10::SymInt>();
     return std::vector<c10::SymInt>(size1, si);
   }
 
@@ -652,9 +646,8 @@ inline std::vector<c10::SymInt> PythonArgs::symintlist(int i) {
         res.push_back(var.item<int64_t>());
       } else {
         try {
-          if (is_symint_node(py::handle(obj))) {
-            res.push_back(
-                py::handle(obj).cast<c10::SymIntNodeImpl*>()->toSymInt());
+          if (is_symint(py::handle(obj))) {
+            res.push_back(py::handle(obj).cast<c10::SymInt>());
           } else {
             res.push_back(c10::SymInt(THPUtils_unpackIndex(obj)));
           }
diff --git a/torch/csrc/utils/python_symnode.cpp b/torch/csrc/utils/python_symnode.cpp
new file mode 100644
index 0000000000000..318bb2266aa43
--- /dev/null
+++ b/torch/csrc/utils/python_symnode.cpp
@@ -0,0 +1,19 @@
+#include <torch/csrc/utils/python_symnode.h>
+
+namespace torch {
+
+py::handle get_symint_class() {
+  // NB: leak
+  static py::handle symint_class =
+      py::object(py::module::import("torch").attr("SymInt")).release();
+  return symint_class;
+}
+
+py::handle get_symfloat_class() {
+  // NB: leak
+  static py::handle symfloat_class =
+      py::object(py::module::import("torch").attr("SymFloat")).release();
+  return symfloat_class;
+}
+
+} // namespace torch
diff --git a/torch/csrc/utils/python_symnode.h b/torch/csrc/utils/python_symnode.h
new file mode 100644
index 0000000000000..be402e4d5439f
--- /dev/null
+++ b/torch/csrc/utils/python_symnode.h
@@ -0,0 +1,182 @@
+#pragma once
+
+#include <c10/core/SafePyObject.h>
+#include <c10/core/SymNodeImpl.h>
+
+#include <torch/csrc/autograd/python_variable.h>
+#include <torch/csrc/utils/pybind.h>
+
+namespace torch {
+
+TORCH_PYTHON_API py::handle get_symint_class();
+TORCH_PYTHON_API py::handle get_symfloat_class();
+
+// NB: These functions must not be called too early, otherwise torch not setup.
+// Alternate design is to have torch "register" the object to us
+inline bool is_symint(py::handle obj) {
+  return py::isinstance(obj, get_symint_class());
+}
+inline bool is_symfloat(py::handle obj) {
+  return py::isinstance(obj, get_symfloat_class());
+}
+
+namespace impl {
+
+// This c10::SymNodeImpl simply backends to a Python object that
+// implements the API.   The Python object is the source of truth,
+// this is just an adapter so C++ calls can get to the object.
+class PythonSymNodeImpl : public c10::SymNodeImpl {
+ public:
+  PythonSymNodeImpl(py::object pyobj) : c10::SymNodeImpl() {
+    pyobj_ = std::make_shared<c10::SafePyObject>(
+        pyobj.release().ptr(), getPyInterpreter());
+  };
+
+  c10::SymNode wrap_int(int64_t num) override {
+    py::gil_scoped_acquire acquire;
+    auto r = getPyObj().attr("wrap_int")(num);
+    return c10::make_intrusive<PythonSymNodeImpl>(r);
+  }
+
+  c10::SymNode wrap_float(double num) override {
+    py::gil_scoped_acquire acquire;
+    auto r = getPyObj().attr("wrap_float")(num);
+    return c10::make_intrusive<PythonSymNodeImpl>(r);
+  }
+
+  bool bool_() override {
+    py::gil_scoped_acquire acquire;
+    return getPyObj().attr("bool_")().is(py::handle(Py_True));
+  }
+
+  bool is_int() override {
+    py::gil_scoped_acquire acquire;
+    return getPyObj().attr("is_int")().is(py::handle(Py_True));
+  }
+
+  bool is_float() override {
+    py::gil_scoped_acquire acquire;
+    return getPyObj().attr("is_float")().is(py::handle(Py_True));
+  }
+
+  int64_t guard_int(const char* file, int64_t line) override {
+    py::gil_scoped_acquire acquire;
+    return getPyObj().attr("guard_int")(file, line).cast<int64_t>();
+  }
+
+  double guard_float(const char* file, int64_t line) override {
+    py::gil_scoped_acquire acquire;
+    return getPyObj().attr("guard_float")(file, line).cast<double>();
+  }
+
+  int64_t int_() override {
+    py::gil_scoped_acquire acquire;
+    return getPyObj().attr("int_")().cast<int64_t>();
+  }
+
+  std::string str() override {
+    py::gil_scoped_acquire acquire;
+    return getPyObj().attr("str")().cast<std::string>();
+  }
+
+  c10::SymNode dispatch_common_(const char* fname, const c10::SymNode& other) {
+    auto pother = dynamic_cast<PythonSymNodeImpl*>(other.get());
+    TORCH_CHECK(pother);
+    py::gil_scoped_acquire acquire;
+    auto r = getPyObj().attr(fname)(pother->getPyObj());
+    return c10::make_intrusive<PythonSymNodeImpl>(r);
+  }
+
+  c10::SymNode dispatch_common_(const char* fname) {
+    py::gil_scoped_acquire acquire;
+    auto r = getPyObj().attr(fname)();
+    return c10::make_intrusive<PythonSymNodeImpl>(r);
+  }
+
+  c10::SymNode add(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode sub(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode mul(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode truediv(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode pow(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode floordiv(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode mod(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode eq(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode gt(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode lt(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode le(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode ge(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode min(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+  c10::SymNode max(const c10::SymNode& other) override {
+    return dispatch_common_(__FUNCTION__, other);
+  }
+
+  c10::SymNode ceil() override {
+    return dispatch_common_(__FUNCTION__);
+  }
+
+  c10::SymNode floor() override {
+    return dispatch_common_(__FUNCTION__);
+  }
+
+  c10::SymNode neg() override {
+    return dispatch_common_(__FUNCTION__);
+  }
+
+  c10::SymNode clone() override {
+    return dispatch_common_(__FUNCTION__);
+  }
+
+  c10::SymNode sym_int() override {
+    return dispatch_common_(__FUNCTION__);
+  }
+
+  c10::SymNode sym_float() override {
+    return dispatch_common_(__FUNCTION__);
+  }
+
+  py::handle getPyObj() {
+    return py::handle(pyobj_.get()->ptr(getPyInterpreter()));
+  }
+  std::shared_ptr<c10::SafePyObject> pyobj_ = nullptr;
+};
+
+} // namespace impl
+} // namespace torch
diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py
index 86d1e19550928..c835607548900 100644
--- a/torch/fx/experimental/proxy_tensor.py
+++ b/torch/fx/experimental/proxy_tensor.py
@@ -21,8 +21,9 @@
 
 from torch.utils._python_dispatch import TorchDispatchMode, _pop_mode_temporarily, _get_current_dispatch_mode
 from torch._subclasses import FakeTensor
-from .symbolic_shapes import ShapeEnv, SymDispatchMode, PySymInt, PySymFloat
+from .symbolic_shapes import ShapeEnv, SymDispatchMode, SymNode
 from torch.fx import Proxy
+from torch import SymInt, SymFloat
 
 __all__ = ["PythonKeyTracer", "dispatch_trace", "make_fx", "DecompositionInterpreter", "get_proxy", "has_proxy", "py_sym_types"]
 aten = torch.ops.aten
@@ -55,27 +56,27 @@ def decompose(decomposition_table):
 proxy_slot = object()
 no_default = object()
 
-py_sym_types = (
-    PySymInt,
-    PySymFloat,
-)
+py_sym_types = (SymInt, SymFloat)
 def is_sym_node(node):
     assert hasattr(node, 'meta'), "All nodes traced with proxy_tensor should have meta"
     return "val" in node.meta and isinstance(node.meta['val'], py_sym_types)
 
 def set_proxy_slot(obj, tracer, proxy):
-    d = obj.__dict__.setdefault(proxy_slot, weakref.WeakKeyDictionary())
+    assert isinstance(obj, (torch.Tensor, SymNode)), type(obj)
+    d = obj.__dict__.setdefault(proxy_slot, weakref.WeakKeyDictionary())  # type: ignore[call-overload]
     assert isinstance(d, weakref.WeakKeyDictionary)
     d[tracer] = proxy
 
 def has_proxy_slot(obj, tracer):
+    assert isinstance(obj, (torch.Tensor, SymNode)), type(obj)
     return get_proxy_slot(obj, tracer, False, lambda _: True)
 
 # the default argument is what to return if the slot is not set.
 # the transform argument is handy if you need to extract a subfield from
 # the successfully looked up result (but NOT the default.)
 def get_proxy_slot(obj, tracer, default=no_default, transform=lambda x: x):
-    d = obj.__dict__.get(proxy_slot)
+    assert isinstance(obj, (torch.Tensor, SymNode)), type(obj)
+    d = obj.__dict__.get(proxy_slot)  # type: ignore[call-overload]
     if not d:
         if default is no_default:
             raise KeyError(f"{obj} is not tracked with proxy for {tracer}")
@@ -130,10 +131,8 @@ def track_tensor(tensor, proxy, *, constant, tracer):
     def try_set_proxy_slot(outer_s, proxy_callable, *args):
         assert callable(proxy_callable)
         if isinstance(outer_s, SymInt):
-            inner_s = outer_s.get_pyobj()
-            assert isinstance(inner_s, py_sym_types)
-
-            set_proxy_slot(inner_s, tracer, thunkify(proxy_callable, inner_s, *args))
+            inner_s = outer_s.node
+            set_proxy_slot(inner_s, tracer, thunkify(proxy_callable, outer_s, *args))
 
     # The basic idea is that we need to associate each tensor/SymInt
     # with a Proxy.  How do we setup this association?  We just store
@@ -198,7 +197,7 @@ class _ProxyTensor:
 
 def fetch_sym_proxy(tracer):
     def inner(e):
-        n = e.get_pyobj()
+        n = e.node
         if n.constant is not None:
             return n.constant
         else:
@@ -400,8 +399,8 @@ def create_arg(self, a: Any):
 
             return self.create_node('get_attr', qualname, (), {})
         elif isinstance(a, (SymInt, SymFloat)):
-            assert a.get_pyobj().constant is not None
-            return a.get_pyobj().constant
+            assert a.node.constant is not None
+            return a.node.constant
         return super().create_arg(a)
 
 
@@ -432,7 +431,7 @@ def wrapped(*proxies):
         )
         out = pytree.tree_map_only(
             (SymInt, SymFloat),
-            lambda t: get_proxy_slot(t.get_pyobj(), tracer)(),
+            lambda t: get_proxy_slot(t.node, tracer)(),
             out
         )
         return out
@@ -479,10 +478,6 @@ def inner_torch_dispatch(self, func, types, args=(), kwargs=None):
         return out
 
 
-SymInt = torch.SymIntNode
-SymFloat = torch.SymFloatNode
-
-
 class ProxySymDispatchMode(SymDispatchMode):
     def __init__(self, tracer):
         super().__init__()
@@ -501,10 +496,9 @@ def enable(self, b):
         finally:
             self.enable_tracing = old
 
-    def _compute_proxy(self, func, args, out):
+    def _compute_proxy(self, func, args, out: Union[SymInt, SymFloat]):
         n_args = tuple(
-            get_proxy_slot(a, self.tracer)().node if a.constant is None else a.constant
-            if isinstance(a, py_sym_types) else a
+            get_proxy_slot(a.node, self.tracer)().node if isinstance(a, py_sym_types) else a
             for a in args
         )
 
@@ -520,10 +514,11 @@ def __sym_dispatch__(self, func, types, args, kwargs):
             return func(*args, **kwargs)
 
         # Peephole optimize multiply by one
+        # NB: be careful not to trigger guards here!
         if func == operator.mul:
-            if isinstance(args[1], (PySymInt, PySymFloat)) and args[1].constant == 1:
+            if isinstance(args[1], int) and args[1] == 1:
                 return args[0]
-            elif isinstance(args[0], (PySymInt, PySymFloat)) and args[0].constant == 1:
+            elif isinstance(args[0], int) and args[0] == 1:
                 return args[1]
 
         # For speed, we assume there are no nested data structures
@@ -535,7 +530,7 @@ def __sym_dispatch__(self, func, types, args, kwargs):
 
         # Delays tracing out the proxies on this op until we actually need it
         p_out_thunk = thunkify(self._compute_proxy, func=func, args=args, out=out)
-        set_proxy_slot(out, self.tracer, p_out_thunk)
+        set_proxy_slot(out.node, self.tracer, p_out_thunk)
         return out
 
 
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 0a03e5819a90a..2eb169a0d1882 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -10,6 +10,7 @@
 import collections
 import textwrap
 from torch._subclasses.meta_utils import MetaConverter
+from torch import SymInt, SymFloat
 
 try:
     import sympy  # type: ignore[import]
@@ -21,8 +22,8 @@
 aten = torch.ops.aten  # type: ignore[has-type]
 
 __all__ = [
-    "has_symbolic_sizes_strides", "create_contiguous", "PySymInt", "ShapeEnv",
-    "SymDispatchMode", "PySymFloat", "sym_float", "FloorDiv"
+    "has_symbolic_sizes_strides", "create_contiguous", "ShapeEnv",
+    "SymDispatchMode", "sym_float", "FloorDiv", "guard_int", "wrap_node"
 ]
 
 SYM_FUNCTION_MODE = None
@@ -88,32 +89,38 @@ def _handle_sym_dispatch(func, args, kwargs):
     finally:
         SYM_FUNCTION_MODE = mode
 
+def guard_int(a):
+    if isinstance(a, SymInt):
+        return a.node.guard_int("", 0)  # NB: uses Python backtrace
+    assert isinstance(a, int)
+    return a
+
 def sym_float(a):
-    if hasattr(a, '__sym_float__'):
-        return a.__sym_float__()
-    elif isinstance(a, torch._C.SymFloatNode):
+    if isinstance(a, SymFloat):
         return a
+    elif hasattr(a, '__sym_float__'):
+        return a.__sym_float__()
     return float(a)
 
 def sym_int(a):
-    if hasattr(a, '__sym_int__'):
-        return a.__sym_int__()
-    elif isinstance(a, torch._C.SymIntNode):
+    if isinstance(a, SymInt):
         return a
+    elif hasattr(a, '__sym_int__'):
+        return a.__sym_int__()
     return int(a)
 
 # TODO: An incomplete list
 # 1. Set variables to be equal when we do equality
 # 2. Specialize on 0/1 when we do subtraction
-class PySymInt(object):
+class SymNode:
     """
-    PySymInt objects are the primary "symbolic shape" objects that flow through
-    our program. They're what sit under FakeTensor, and contains our primary
-    implementation of symbolic shapes.
+    This is a type erased SymInt/SymFloat which we use to do actual operations.
+    End users don't touch this.  Magic methods are NOT defined on this object.
     """
-    def __init__(self, expr, shape_env, constant=None):
+    def __init__(self, expr, shape_env, pytype, constant=None):
         self._expr = expr
         self.shape_env = shape_env
+        self.pytype = pytype
         self.constant = constant
 
     @property
@@ -121,23 +128,49 @@ def expr(self):
         self._update_expr()
         return self._expr
 
-    def wrap(self, num):
-        return PySymInt(sympy.Integer(num), self.shape_env, constant=num)
+    def _update_expr(self):
+        self._expr = self.shape_env.replace(self._expr)
+
+    def to_node(self, num):
+        if isinstance(num, (SymInt, SymFloat)):
+            return num.node
+        elif isinstance(num, int):
+            return self.wrap_int(num)
+        elif isinstance(num, float):
+            return self.wrap_float(num)
+        else:
+            # NotImplementedError is important so that Python tries the
+            # other magic method
+            raise NotImplementedError(type(num))
+
+    def is_int(self):
+        return self.pytype is int
+
+    def is_float(self):
+        return self.pytype is float
+
+    def wrap_int(self, num):
+        assert isinstance(num, int)
+        return SymNode(sympy.Integer(num), self.shape_env, int, constant=num)
+
+    def wrap_float(self, num):
+        assert isinstance(num, float)
+        return SymNode(sympy.Integer(num), self.shape_env, float, constant=num)
 
     def clone(self):
-        return PySymInt(self.expr, self.shape_env, constant=self.constant)
+        return SymNode(self.expr, self.shape_env, self.pytype, constant=self.constant)
 
-    def _update_expr(self):
-        self._expr = self.shape_env.replace(self._expr)
+    def str(self):
+        return f"{self.expr}"
 
     def __str__(self):
-        return f"{self.expr}"
+        return self.str()
 
     def __repr__(self):
-        return f"{self.expr}"
+        return self.str()
 
     # Today we error on calling int on a symbolic shape, as this is a very accessible footgun.
-    def __int__(self):
+    def int_(self):
         raise RuntimeError("Trying to extract a concrete int out of a symbolic int")
 
     # You can manually trigger a guard with this function
@@ -146,28 +179,35 @@ def guard_int(self, file, line):
         # guard occurred
         return int(self.shape_env.evaluate_expr(self.expr))
 
-    def __sym_float__(self):
+    def guard_float(self, file, line):
+        # TODO: use the file/line for some useful diagnostic on why a
+        # guard occurred
+        return float(self.shape_env.evaluate_expr(self.expr))
+
+    def sym_float(self):
         if SYM_FUNCTION_MODE:
-            return _handle_sym_dispatch(sym_float, (self,), {})
+            r = _handle_sym_dispatch(sym_float, (wrap_node(self),), {})
+            assert isinstance(r, (SymInt, SymFloat)), type(r)
+            return r.node
         # TODO: consider constant prop here
         # TODO: wrapping the expr with sympy.Float doesn't seem to work, why
         # not?
-        return PySymFloat(self.expr, self.shape_env)
+        return SymNode(self.expr, self.shape_env, float)
 
-    def __bool__(self):
-        return bool(self.shape_env.evaluate_expr(self.shape_env.replace(self.expr)))
-
-class PySymFloat:
-    def __init__(self, expr, shape_env, constant=None):
-        self.expr = expr
-        self.shape_env = shape_env
-        self.constant = constant
+    def sym_int(self):
+        raise NotImplementedError("sym_int NYI")
+        """
+        if SYM_FUNCTION_MODE:
+            return _handle_sym_dispatch(sym_int, (self,), {})
+        # TODO: consider constant prop here
+        # XXX: need to cast float to int in sympy; math.floor is wrong
+        # because negatives round to zero
+        return SymNode(self.expr, self.shape_env, int)
+        """
 
-    def wrap(self, num):
-        return PySymFloat(sympy.Float(num), self.shape_env, constant=num)
+    def bool_(self):
+        return bool(self.shape_env.evaluate_expr(self.shape_env.replace(self.expr)))
 
-    def __str__(self):
-        return f"{self.expr}"
 
 if HAS_SYMPY:
     class FloorDiv(sympy.Function):
@@ -238,32 +278,45 @@ def eval(cls, base, divisor):
 
 float_magic_methods = {"add", "sub", "mul", "truediv", "ceil", "floor", "eq", "gt", "lt", "le", "ge", "pow"}
 
-def _make_magic(method, func, py_type):
+def wrap_node(x):
+    if not isinstance(x, SymNode):
+        return x
+    if x.constant is not None:
+        return x.constant
+    if x.pytype is int:
+        return SymInt(x)
+    elif x.pytype is float:
+        return SymFloat(x)
+    else:
+        raise AssertionError(f"unrecognized return type {x.pytype}")
+
+def _make_node_magic(method, func):
     func = lru_cache(256)(func)
 
-    def magic_impl(self, other):
+    def binary_magic_impl(self, other):
         if method in ["min", "max"]:
             op = getattr(builtins, method)
         else:
             op = getattr(operator, method)
         if SYM_FUNCTION_MODE:
-            return _handle_sym_dispatch(op, (self, other), {})
-        if isinstance(other, py_type):
-            other_expr = other.expr
-        else:
-            assert isinstance(other, sympy.Expr)
-            other_expr = other
+            r = _handle_sym_dispatch(op, (wrap_node(self), wrap_node(other)), {})
+            assert isinstance(r, (SymInt, SymFloat)), type(r)
+            return r.node
+        assert isinstance(other, SymNode)
+        other_expr = other.expr
         # TODO: consider constant prop here
         expr = self.shape_env.replace(self.expr)
         other_expr = self.shape_env.replace(other_expr)
         out = func(expr, other_expr)
         out = sympy.expand(out)
         if method in ["truediv"]:
-            return PySymFloat(out, self.shape_env)
+            pytype = float
         else:
-            # TODO: relational operators actually technically return a
-            # PySymBool, this is a type error
-            return py_type(out, self.shape_env)
+            pytype = self.pytype
+
+        # TODO: relational operators actually technically return a
+        # PySymBool, this is a type error
+        return SymNode(out, self.shape_env, pytype)
 
     def unary_magic_impl(self):
         if SYM_FUNCTION_MODE:
@@ -271,33 +324,55 @@ def unary_magic_impl(self):
                 op = getattr(math, method)
             else:
                 op = getattr(operator, method)
-            return _handle_sym_dispatch(op, (self,), {})
+            r = _handle_sym_dispatch(op, (wrap_node(self),), {})
+            assert isinstance(r, (SymInt, SymFloat)), type(r)
+            return r.node
         # TODO: consider constant prop here
         expr = self.shape_env.replace(self.expr)
         out = func(expr)
         out = sympy.expand(out)
         if method in ["ceil", "floor"]:
-            return PySymInt(out, self.shape_env)
+            pytype = int
         else:
-            return py_type(out, self.shape_env)
+            pytype = self.pytype
+
+        return SymNode(out, self.shape_env, pytype)
+
+    if method in unary_magic_methods:
+        setattr(SymNode, method, unary_magic_impl)
+    else:
+        setattr(SymNode, method, binary_magic_impl)
+
+for method, func in magic_methods.items():
+    _make_node_magic(method, func)
+
+def _make_user_magic(method, user_type):
+    # User magic takes care of wrapping the other operand into a node,
+    # so that our internal logic can assume everything is nodes
+
+    def unary_magic_impl(self):
+        return wrap_node(getattr(self.node, method)())
+
+    def binary_magic_impl(self, other):
+        return wrap_node(getattr(self.node, method)(self.node.to_node(other)))
+
+    def rbinary_magic_impl(self, other):
+        return wrap_node(getattr(self.node.to_node(other), method)(self.node))
 
-    # this should be wrapped transparently into torch.SymIntNode
     if method in unary_magic_methods:
-        setattr(py_type, method, unary_magic_impl)
-        setattr(py_type, f"__{method}__", unary_magic_impl)
+        setattr(user_type, f"__{method}__", unary_magic_impl)
     else:
-        setattr(py_type, method, magic_impl)
-        setattr(py_type, f"__{method}__", magic_impl)
+        setattr(user_type, f"__{method}__", binary_magic_impl)
         if method in reflectable_magic_methods:
-            setattr(py_type, f"__r{method}__", magic_impl)
+            setattr(user_type, f"__r{method}__", rbinary_magic_impl)
 
 for method, func in magic_methods.items():
-    _make_magic(method, func, PySymInt)
+    _make_user_magic(method, SymInt)
 
 for method, func in magic_methods.items():
     if method not in float_magic_methods:
         continue
-    _make_magic(method, func, PySymFloat)
+    _make_user_magic(method, SymFloat)
 
 del method
 del func
@@ -390,9 +465,7 @@ def create_symbolic_sizes_strides(self, ex: torch.Tensor):
         return [self.create_symintnode(i) for i in size], [self.create_symintnode(i) for i in stride]  # type: ignore[arg-type]
 
     def create_symintnode(self, expr: "sympy.Expr"):
-        py_sym_int = PySymInt(expr, self)
-        cpp_sym_int = torch.SymIntNode.new_symint(py_sym_int)  # type: ignore[attr-defined]
-        return cpp_sym_int
+        return SymInt(SymNode(expr, self, int))
 
     def create_symbol(self, val: int) -> "sympy.Expr":
         if not HAS_SYMPY:
diff --git a/torch/fx/graph.py b/torch/fx/graph.py
index 3b8c96b6a43bf..4fdd64f900a92 100644
--- a/torch/fx/graph.py
+++ b/torch/fx/graph.py
@@ -498,7 +498,7 @@ def emit_node(node : Node):
                 if isinstance(meta_val, FakeTensor):
                     maybe_type_annotation = f': {dtype_abbrs[meta_val.dtype]}{stringify_shape(meta_val.shape)}'
                 elif isinstance(meta_val, py_sym_types):
-                    maybe_type_annotation = f': Sym({meta_val.expr})'
+                    maybe_type_annotation = f': Sym({meta_val})'
                 elif isinstance(meta_val, TensorMetadata):
                     maybe_type_annotation = f': {dtype_abbrs[meta_val.dtype]}{stringify_shape(meta_val.shape)}'
 

From 345c8b409e9055c1adaa5eacf6991ffc54cecdd2 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 27 Oct 2022 20:58:46 +0000
Subject: [PATCH 0240/1922] [Inductor] Enable Inductor unspec inputs test for
 different dtypes (#87809)

Fixes #ISSUE_NUMBER

cc @jansel @mlazos @soumith @voznesenskym @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87809
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 38 ++++++++++++-----------------
 torch/_inductor/codegen/triton.py   |  5 +---
 2 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index dd846e5f405a4..834e81dcce629 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4000,29 +4000,23 @@ def fn(x, y):
             return x + y, x * y, x / y
 
         opt = torch._dynamo.optimize("inductor")(fn)
+        dtypes = [
+            torch.float16,
+            torch.bfloat16,
+            torch.float32,
+            torch.float64,
+            torch.int32,
+            torch.int64,
+        ]
 
-        inputs = (
-            rand_strided((2, 3), (3, 1), device="cuda"),
-            rand_strided((), (), device="cpu"),
-        )
-        self.assertTrue(same(opt(*inputs), fn(*inputs)))
-        inputs = (inputs[1], inputs[0])
-        self.assertTrue(same(opt(*inputs), fn(*inputs)))
-
-    @requires_cuda()
-    def test_unspec_inputs_fp16(self):
-        def fn(x, y):
-            return x + y, x * y, x / y
-
-        opt = torch._dynamo.optimize("inductor")(fn)
-
-        inputs = (
-            rand_strided((2, 3), (3, 1), dtype=torch.float16, device="cuda"),
-            rand_strided((), (), dtype=torch.float16, device="cpu"),
-        )
-        self.assertTrue(same(opt(*inputs), fn(*inputs)))
-        inputs = (inputs[1], inputs[0])
-        self.assertTrue(same(opt(*inputs), fn(*inputs)))
+        for d in dtypes:
+            inputs = (
+                rand_strided((2, 3), (3, 1), dtype=torch.float32, device="cuda"),
+                rand_strided((), (), dtype=d, device="cpu"),
+            )
+            self.assertTrue(same(opt(*inputs), fn(*inputs)))
+            inputs = (inputs[1], inputs[0])
+            self.assertTrue(same(opt(*inputs), fn(*inputs)))
 
     @patch.object(config.triton, "mm", "aten")
     def test_list_clearing(self):
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 5ccf1a7191f29..e2473456bb2bc 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -44,10 +44,7 @@ def signature_of(arg):
         if V.graph.is_unspec_arg(arg.buffer):
             # had unwrapped 0d tensor as scalar
             new_tye = tye.lstrip("*")
-            if new_tye in ["fp16", "bf16"]:
-                return "fp32"
-            else:
-                return new_tye
+            return new_tye
         else:
             return tye
     if isinstance(arg, SizeArg):

From 4772790fe111b8525866caee8d62b379bf39c89b Mon Sep 17 00:00:00 2001
From: Alvaro Gaona <alvgaona@gmail.com>
Date: Thu, 27 Oct 2022 21:00:59 +0000
Subject: [PATCH 0241/1922] Reimplement Kaiser window (#87330)

Relates to #85366

- For reference follow #87082.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87330
Approved by: https://github.com/lezcano, https://github.com/mruberry
---
 docs/source/signal.rst                        |  1 +
 torch/signal/windows/__init__.py              |  5 +-
 torch/signal/windows/windows.py               | 84 ++++++++++++++++++-
 .../_internal/opinfo/definitions/signal.py    | 38 +++++++++
 4 files changed, 124 insertions(+), 4 deletions(-)

diff --git a/docs/source/signal.rst b/docs/source/signal.rst
index e304092ede5ed..57a1ad6b0e55f 100644
--- a/docs/source/signal.rst
+++ b/docs/source/signal.rst
@@ -21,3 +21,4 @@ torch.signal.windows
     cosine
     exponential
     gaussian
+    kaiser
diff --git a/torch/signal/windows/__init__.py b/torch/signal/windows/__init__.py
index 9ccb9dcd1891a..8bd0395cad3ad 100644
--- a/torch/signal/windows/__init__.py
+++ b/torch/signal/windows/__init__.py
@@ -1,10 +1,9 @@
-import warnings
-
-from .windows import cosine, exponential, gaussian
+from .windows import cosine, exponential, gaussian, kaiser
 
 
 __all__ = [
     'cosine',
     'exponential',
     'gaussian',
+    'kaiser',
 ]
diff --git a/torch/signal/windows/windows.py b/torch/signal/windows/windows.py
index 06bd710a34510..564caf2b77179 100644
--- a/torch/signal/windows/windows.py
+++ b/torch/signal/windows/windows.py
@@ -11,6 +11,7 @@
     'cosine',
     'exponential',
     'gaussian',
+    'kaiser',
 ]
 
 window_common_args = merge_dicts(
@@ -24,7 +25,7 @@
     ),
     factory_common_args,
     {"normalization": "The window is normalized to 1 (maximum value is 1). However, the 1 doesn't appear if "
-                      "`M` is even and `sym` is `True`."}
+                      ":attr:`M` is even and :attr:`sym` is `True`."}
 )
 
 
@@ -296,3 +297,84 @@ def gaussian(
                        requires_grad=requires_grad)
 
     return torch.exp(-k ** 2)
+
+
+@_add_docstr(
+    r"""
+Computes the Kaiser window.
+
+The Kaiser window is defined as follows:
+
+.. math::
+    out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta )
+
+where ``I_0`` is the zeroth order modified Bessel function of the first kind (see :func:`torch.special.i0`), and
+``N = M - 1 if sym else M``.
+
+``M`` is the window length.
+
+    """,
+    r"""
+
+{normalization}
+
+Args:
+    {M}
+
+Keyword args:
+    beta (float, optional): shape parameter for the window. Must be non-negative. Default: 12.0
+    {sym}
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+
+Examples::
+
+    >>> # Generate a symmetric gaussian window with a standard deviation of 1.0.
+    >>> torch.signal.windows.kaiser(5)
+    tensor([4.0065e-05, 2.1875e-03, 4.3937e-02, 3.2465e-01, 8.8250e-01, 8.8250e-01, 3.2465e-01, 4.3937e-02, 2.1875e-03, 4.0065e-05])
+
+    >>> # Generate a periodic gaussian window and standard deviation equal to 0.9.
+    >>> torch.signal.windows.kaiser(5,sym=False,std=0.9)
+    tensor([1.9858e-07, 5.1365e-05, 3.8659e-03, 8.4658e-02, 5.3941e-01, 1.0000e+00, 5.3941e-01, 8.4658e-02, 3.8659e-03, 5.1365e-05])
+""".format(
+        **window_common_args,
+    ),
+)
+def kaiser(
+        M: int,
+        *,
+        beta: float = 12.0,
+        sym: bool = True,
+        dtype: Optional[torch.dtype] = None,
+        layout: torch.layout = torch.strided,
+        device: Optional[torch.device] = None,
+        requires_grad: bool = False
+) -> Tensor:
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+
+    _window_function_checks('kaiser', M, dtype, layout)
+
+    if beta < 0:
+        raise ValueError(f'beta must be non-negative, got: {beta} instead.')
+
+    if M == 0:
+        return torch.empty((0,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
+
+    if M == 1:
+        return torch.ones((1,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
+
+    start = -beta
+    constant = 2.0 * beta / (M if not sym else M - 1)
+
+    k = torch.linspace(start=start,
+                       end=start + (M - 1) * constant,
+                       steps=M,
+                       dtype=dtype,
+                       layout=layout,
+                       device=device,
+                       requires_grad=requires_grad)
+
+    return torch.i0(torch.sqrt(beta * beta - torch.pow(k, 2))) / torch.i0(torch.tensor(beta))
diff --git a/torch/testing/_internal/opinfo/definitions/signal.py b/torch/testing/_internal/opinfo/definitions/signal.py
index 3b7f3e4de4001..9404cc889c50e 100644
--- a/torch/testing/_internal/opinfo/definitions/signal.py
+++ b/torch/testing/_internal/opinfo/definitions/signal.py
@@ -95,6 +95,23 @@ def reference_inputs_gaussian_window(op_info, device, dtype, requires_grad, **kw
         yield SampleInput(size, sym=True, **kw)
 
 
+def reference_inputs_kaiser_window(op_info, device, dtype, requires_grad, **kwargs):
+    yield from sample_inputs_window(op_info, device, dtype, requires_grad, **kwargs)
+
+    cases = (
+        (8, {"beta": 2}),
+        (16, {"beta": 12}),
+        (32, {"beta": 30}),
+        (64, {"beta": 35}),
+        (128, {"beta": 41.2}),
+        (256, {"beta": 100}),
+    )
+
+    for size, kw in cases:
+        yield SampleInput(size, sym=False, **kw)
+        yield SampleInput(size, sym=True, **kw)
+
+
 def error_inputs_window(op_info, device, *args, **kwargs):
     # Tests for windows that have a negative size
     yield ErrorInput(
@@ -156,6 +173,18 @@ def error_inputs_gaussian_window(op_info, device, **kwargs):
     )
 
 
+def error_inputs_kaiser_window(op_info, device, **kwargs):
+    # Yield common error inputs
+    yield from error_inputs_window(op_info, device, beta=12, **kwargs)
+
+    # Tests for negative beta
+    yield ErrorInput(
+        SampleInput(3, beta=-1, dtype=torch.float32, device=device, **kwargs),
+        error_type=ValueError,
+        error_regex="beta must be non-negative, got: -1 instead.",
+    )
+
+
 def reference_signal_window(fn: Callable):
     r"""Wrapper for scipy signal window references.
 
@@ -331,4 +360,13 @@ def make_signal_windows_opinfo(
             ),
         ),
     ),
+    make_signal_windows_opinfo(
+        name="signal.windows.kaiser",
+        ref=reference_signal_window(scipy.signal.windows.kaiser)
+        if TEST_SCIPY
+        else None,
+        sample_inputs_func=partial(sample_inputs_window, beta=12.0),
+        reference_inputs_func=partial(reference_inputs_kaiser_window, beta=12.0),
+        error_inputs_func=error_inputs_kaiser_window,
+    ),
 ]

From 06530eb833223dfde1bb99013f406a0451cf4b94 Mon Sep 17 00:00:00 2001
From: foram-chandra <96388449+foram-chandra@users.noreply.github.com>
Date: Thu, 27 Oct 2022 21:03:42 +0000
Subject: [PATCH 0242/1922] [doc] Add out-kwarg documentation to torch.where
 (#87870)

Fixes #87862

cc: @lezcano

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87870
Approved by: https://github.com/lezcano
---
 torch/_torch_docs.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index c7b8d796a497d..c7d3328598b0f 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -12397,7 +12397,7 @@ def merge_dicts(*dicts):
 add_docstr(
     torch.where,
     r"""
-where(condition, x, y) -> Tensor
+where(condition, x, y, *, out=None) -> Tensor
 
 Return a tensor of elements selected from either :attr:`x` or :attr:`y`, depending on :attr:`condition`.
 
@@ -12408,7 +12408,8 @@ def merge_dicts(*dicts):
         \text{x}_i & \text{if } \text{condition}_i \\
         \text{y}_i & \text{otherwise} \\
     \end{cases}
-
+"""
+    + r"""
 .. note::
     The tensors :attr:`condition`, :attr:`x`, :attr:`y` must be :ref:`broadcastable <broadcasting-semantics>`.
 
@@ -12419,6 +12420,9 @@ def merge_dicts(*dicts):
     y (Tensor or Scalar): value (if :attr:`y` is a scalar) or values selected at indices
                           where :attr:`condition` is ``False``
 
+Keyword args:
+    {out}
+
 Returns:
     Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`x`, :attr:`y`
 
@@ -12450,7 +12454,9 @@ def merge_dicts(*dicts):
 
 .. note::
     See also :func:`torch.nonzero`.
-""",
+""".format(
+        **common_args
+    ),
 )
 
 add_docstr(

From 070af471771801a8e4fd95d8d08f51c84908c67b Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Wed, 26 Oct 2022 14:34:58 -0400
Subject: [PATCH 0243/1922] BE: Improve test_will_engine_execute_node unittest
 (#87806)

Adds the test from https://github.com/pytorch/pytorch/pull/86672

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87806
Approved by: https://github.com/albanD
---
 test/test_autograd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 43f31ae63ed32..6435d36b643ba 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -435,7 +435,7 @@ def fn(x):
 
         # .backward(inputs=) is OK
         out = c.sum()
-        torch.autograd.backward(out, inputs=(a,), retain_graph=True)
+        torch.autograd.backward(out, inputs=(a, b), retain_graph=True)
         self.assertEqual(counter[0], 2)
 
         # .backward() is OK

From afb2d00661c2e3b70368230afe99b5e4f019c2bd Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 27 Oct 2022 21:16:58 +0000
Subject: [PATCH 0244/1922] Revert "[dynamo] Error when user nests FX with
 dynamo (#87797)"

This reverts commit a485528a7e4551461d57db3deb8b40c2acea08d2.

Reverted https://github.com/pytorch/pytorch/pull/87797 on behalf of https://github.com/kit1980 due to Broke linux-bionic-py3.7-clang9 / test (dynamo, 2, 2, linux.2xlarge), same error on pull
---
 test/dynamo/test_misc.py    | 14 --------------
 torch/_dynamo/config.py     |  4 ----
 torch/_dynamo/eval_frame.py |  9 ---------
 3 files changed, 27 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index a63a6d8930c80..a0f592212f4e1 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2732,20 +2732,6 @@ def forward(self, x):
             dynamo_result = graph(x)
             self.assertTrue(same(real, dynamo_result))
 
-    def test_error_on_nested_fx_trace(self):
-        input = torch.rand(2, 3)
-
-        def f(x):
-            x + x
-
-        real = f(input)
-
-        optimized = torch._dynamo.optimize("eager")(f)
-        self.assertTrue(same(optimized(input), real))
-
-        with self.assertRaisesRegex(RuntimeError, "Detected that you are using FX"):
-            gm = torch.fx.symbolic_trace(optimized)
-
 
 class CustomFunc(torch.autograd.Function):
     @staticmethod
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 2601be8983f2a..f75f3651dd97c 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -153,10 +153,6 @@
 # How to import torchinductor, either torchinductor or torch.inductor
 inductor_import = dynamo_import.replace("dynamo", "inductor")
 
-# If true, error with a better message if we symbolically trace over a
-# dynamo-optimized function. If false, silently suppress dynamo.
-error_on_nested_fx_trace = True
-
 # root folder of the project
 if "torch." in dynamo_import:
     base_dir = dirname(dirname(dirname(abspath(__file__))))
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 2d0938a83a123..d86653f9973cc 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -14,7 +14,6 @@
 
 import torch
 import torch.utils._pytree as pytree
-from torch.fx._symbolic_trace import is_fx_tracing
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn.parallel.distributed import DistributedDataParallel
 
@@ -150,14 +149,6 @@ def __call__(self, *args, **kwargs):
 
         @functools.wraps(fn)
         def _fn(*args, **kwargs):
-            if is_fx_tracing():
-                if config.error_on_nested_fx_trace:
-                    raise RuntimeError(
-                        "Detected that you are using FX to symbolically trace "
-                        "a dynamo-optimized function. This is not supported at the moment."
-                    )
-                return fn
-
             on_enter()
             prior = set_eval_frame(callback)
             backend_ctx = backend_ctx_ctor()

From b357edc19489b2ecc1a275c9a2bd4f1b7ac52a27 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 27 Oct 2022 23:55:59 +0000
Subject: [PATCH 0245/1922] Revert "[Inductor] Enable Inductor unspec inputs
 test for different dtypes (#87809)"

This reverts commit 369755f8ce1b043c88efbc50ee09c0258dec5162.

Reverted https://github.com/pytorch/pytorch/pull/87809 on behalf of https://github.com/kit1980 due to Broke trunk / cuda11.6-py3.10-gcc7-sm86 / test (default, 4, 4, linux.g5.4xlarge.nvidia.gpu), same error on pull.
---
 test/inductor/test_torchinductor.py | 38 +++++++++++++++++------------
 torch/_inductor/codegen/triton.py   |  5 +++-
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 834e81dcce629..dd846e5f405a4 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4000,23 +4000,29 @@ def fn(x, y):
             return x + y, x * y, x / y
 
         opt = torch._dynamo.optimize("inductor")(fn)
-        dtypes = [
-            torch.float16,
-            torch.bfloat16,
-            torch.float32,
-            torch.float64,
-            torch.int32,
-            torch.int64,
-        ]
 
-        for d in dtypes:
-            inputs = (
-                rand_strided((2, 3), (3, 1), dtype=torch.float32, device="cuda"),
-                rand_strided((), (), dtype=d, device="cpu"),
-            )
-            self.assertTrue(same(opt(*inputs), fn(*inputs)))
-            inputs = (inputs[1], inputs[0])
-            self.assertTrue(same(opt(*inputs), fn(*inputs)))
+        inputs = (
+            rand_strided((2, 3), (3, 1), device="cuda"),
+            rand_strided((), (), device="cpu"),
+        )
+        self.assertTrue(same(opt(*inputs), fn(*inputs)))
+        inputs = (inputs[1], inputs[0])
+        self.assertTrue(same(opt(*inputs), fn(*inputs)))
+
+    @requires_cuda()
+    def test_unspec_inputs_fp16(self):
+        def fn(x, y):
+            return x + y, x * y, x / y
+
+        opt = torch._dynamo.optimize("inductor")(fn)
+
+        inputs = (
+            rand_strided((2, 3), (3, 1), dtype=torch.float16, device="cuda"),
+            rand_strided((), (), dtype=torch.float16, device="cpu"),
+        )
+        self.assertTrue(same(opt(*inputs), fn(*inputs)))
+        inputs = (inputs[1], inputs[0])
+        self.assertTrue(same(opt(*inputs), fn(*inputs)))
 
     @patch.object(config.triton, "mm", "aten")
     def test_list_clearing(self):
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index e2473456bb2bc..5ccf1a7191f29 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -44,7 +44,10 @@ def signature_of(arg):
         if V.graph.is_unspec_arg(arg.buffer):
             # had unwrapped 0d tensor as scalar
             new_tye = tye.lstrip("*")
-            return new_tye
+            if new_tye in ["fp16", "bf16"]:
+                return "fp32"
+            else:
+                return new_tye
         else:
             return tye
     if isinstance(arg, SizeArg):

From ba5b39f35a17361e04025fccca8b7d093400bd78 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Thu, 27 Oct 2022 16:26:42 +0000
Subject: [PATCH 0246/1922] Update how inductor cpu tests are skipped on fbcode
 (#87867)

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87867
Approved by: https://github.com/anijain2305
---
 test/inductor/test_torchinductor.py  | 5 +----
 torch/_inductor/codegen/cpp_prefix.h | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index dd846e5f405a4..326a7f1f6ce84 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -56,15 +56,12 @@
 
 HAS_CPU = False
 try:
-    if IS_FBCODE:
-        raise torch._inductor.exc.CppCompileError
-
     from subprocess import CalledProcessError
 
     from torch._inductor.codecache import CppCodeCache
 
     CppCodeCache.load("")
-    HAS_CPU = True
+    HAS_CPU = not IS_FBCODE
 except (
     CalledProcessError,
     OSError,
diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h
index d9c0a99f5f42c..346de52563883 100644
--- a/torch/_inductor/codegen/cpp_prefix.h
+++ b/torch/_inductor/codegen/cpp_prefix.h
@@ -5,7 +5,7 @@
 #include <limits>
 #include <omp.h>
 
-#include "ATen/core/PhiloxRNGEngine.h"
+#include <ATen/core/PhiloxRNGEngine.h>
 #include <c10/util/Half.h>
 #include <c10/util/BFloat16.h>
 

From ab79b132012bec628281e469a87a63c0e10ae248 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@fb.com>
Date: Fri, 28 Oct 2022 00:40:47 +0000
Subject: [PATCH 0247/1922] Check all CUDA API calls for errors in torch/
 (#81560)

Summary:
Original commit changeset: 0bb770d2cdb2

Original Phabricator Diff: D35194935 (https://github.com/pytorch/pytorch/commit/79e5b053b690852b21d881357904bc5a4438d95b)

Differential Revision: D35291874

Pull Request resolved: https://github.com/pytorch/pytorch/pull/81560
Approved by: https://github.com/ezyang
---
 torch/csrc/CudaIPCTypes.cpp                   |  2 +-
 torch/csrc/cuda/nccl.cpp                      |  3 ++-
 torch/csrc/cuda/shared/cudart.cpp             | 11 ++++++-----
 torch/csrc/jit/codegen/cuda/executor.cpp      | 19 ++++++++++---------
 .../csrc/jit/codegen/cuda/executor_utils.cpp  |  2 +-
 torch/csrc/profiler/stubs/cuda.cpp            |  6 ++----
 6 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/torch/csrc/CudaIPCTypes.cpp b/torch/csrc/CudaIPCTypes.cpp
index 9a2c47a5f7a84..d18a23ebe4e68 100644
--- a/torch/csrc/CudaIPCTypes.cpp
+++ b/torch/csrc/CudaIPCTypes.cpp
@@ -195,7 +195,7 @@ CudaIPCSentData::~CudaIPCSentData() {
   try {
     if (event_sync_required_) {
       at::cuda::CUDAGuard device_guard(device_.index());
-      cudaEventDestroy(event_);
+      C10_CUDA_CHECK(cudaEventDestroy(event_));
       if (!CudaIPCGlobalEntities::alive) {
         return;
       }
diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp
index a1d96f7e5d6cd..8a3c8af797cc2 100644
--- a/torch/csrc/cuda/nccl.cpp
+++ b/torch/csrc/cuda/nccl.cpp
@@ -3,6 +3,7 @@
 #include <torch/csrc/cuda/nccl.h>
 
 #include <ATen/ATen.h>
+#include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/Exception.h>
 #include <c10/util/hash.h>
@@ -142,7 +143,7 @@ struct NcclCommList {
     if (comms) {
       for (const auto i : c10::irange(ndevices)) {
         int dummy_var;
-        if (cudaGetDevice(&dummy_var) != cudaSuccess) {
+        if (C10_CUDA_ERROR_HANDLED(cudaGetDevice(&dummy_var)) != cudaSuccess) {
           /* there are cases when this destructor is called after the
            CUDA driver is already unloaded from the process.
            In these cases, skip ncclCommDestroy */
diff --git a/torch/csrc/cuda/shared/cudart.cpp b/torch/csrc/cuda/shared/cudart.cpp
index 9e098d44808ba..f18c883a2a06a 100644
--- a/torch/csrc/cuda/shared/cudart.cpp
+++ b/torch/csrc/cuda/shared/cudart.cpp
@@ -71,25 +71,26 @@ void initCudartBindings(PyObject* module) {
       "cuda"
       "HostRegister",
       [](uintptr_t ptr, size_t size, unsigned int flags) -> cudaError_t {
-        return cudaHostRegister((void*)ptr, size, flags);
+        return C10_CUDA_ERROR_HANDLED(
+            cudaHostRegister((void*)ptr, size, flags));
       });
   cudart.def(
       "cuda"
       "HostUnregister",
       [](uintptr_t ptr) -> cudaError_t {
-        return cudaHostUnregister((void*)ptr);
+        return C10_CUDA_ERROR_HANDLED(cudaHostUnregister((void*)ptr));
       });
   cudart.def(
       "cuda"
       "StreamCreate",
       [](uintptr_t ptr) -> cudaError_t {
-        return cudaStreamCreate((cudaStream_t*)ptr);
+        return C10_CUDA_ERROR_HANDLED(cudaStreamCreate((cudaStream_t*)ptr));
       });
   cudart.def(
       "cuda"
       "StreamDestroy",
       [](uintptr_t ptr) -> cudaError_t {
-        return cudaStreamDestroy((cudaStream_t)ptr);
+        return C10_CUDA_ERROR_HANDLED(cudaStreamDestroy((cudaStream_t)ptr));
       });
 #if !defined(USE_ROCM)
   cudart.def(
@@ -104,7 +105,7 @@ void initCudartBindings(PyObject* module) {
         c10::cuda::CUDAGuard guard(device);
         size_t device_free = 0;
         size_t device_total = 0;
-        cudaMemGetInfo(&device_free, &device_total);
+        C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
         return {device_free, device_total};
       });
 }
diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
index b93c9514fcf02..c3f447cf706fa 100644
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor.cpp
@@ -1175,9 +1175,9 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
   if (measure_kernel_time_ ||
       isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) ||
       isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
-    cudaEventCreate(&start_event);
-    cudaEventCreate(&finish_event);
-    cudaEventRecord(start_event);
+    C10_CUDA_CHECK(cudaEventCreate(&start_event));
+    C10_CUDA_CHECK(cudaEventCreate(&finish_event));
+    C10_CUDA_CHECK(cudaEventRecord(start_event));
   }
 
   if (execute_kernel_) {
@@ -1233,12 +1233,13 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
   if (measure_kernel_time_ ||
       isDebugDumpEnabled(DebugDumpOption::EffectiveBandwidth) ||
       isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
-    cudaEventRecord(finish_event);
-    cudaEventSynchronize(start_event);
-    cudaEventSynchronize(finish_event);
-    cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event);
-    cudaEventDestroy(start_event);
-    cudaEventDestroy(finish_event);
+    C10_CUDA_CHECK(cudaEventRecord(finish_event));
+    C10_CUDA_CHECK(cudaEventSynchronize(start_event));
+    C10_CUDA_CHECK(cudaEventSynchronize(finish_event));
+    C10_CUDA_CHECK(
+        cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event));
+    C10_CUDA_CHECK(cudaEventDestroy(start_event));
+    C10_CUDA_CHECK(cudaEventDestroy(finish_event));
 
     bytes_processed_ = 0;
     // Figure how many bytes are inputs, outputs, and temporary buffers
diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
index db9764eb3059e..dd5334542fc90 100644
--- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -941,7 +941,7 @@ void initializeCudaContext() {
   if (!pctx) {
     std::unique_lock<std::mutex> cudaFreeMutexLock(
         *(c10::cuda::getFreeMutex()));
-    cudaFree(nullptr);
+    C10_CUDA_CHECK(cudaFree(nullptr));
   }
 }
 
diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp
index 7671c75ce4024..6731d0f4d3b50 100644
--- a/torch/csrc/profiler/stubs/cuda.cpp
+++ b/torch/csrc/profiler/stubs/cuda.cpp
@@ -81,16 +81,14 @@ struct CUDAMethods : public ProfilerStubs {
 
   void onEachDevice(std::function<void(int)> op) const override {
     at::cuda::OptionalCUDAGuard device_guard;
-    // NOLINTNEXTLINE(bugprone-signed-char-misuse)
-    int count = at::cuda::device_count();
-    for (const auto i : c10::irange(count)) {
+    for (const auto i : c10::irange(at::cuda::device_count())) {
       device_guard.set_index(i);
       op(i);
     }
   }
 
   void synchronize() const override {
-    cudaDeviceSynchronize();
+    TORCH_CUDA_CHECK(cudaDeviceSynchronize());
   }
 
   bool enabled() const override {

From a34ba7d7ff21e094099ec229eee82f43bd700a0a Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@fb.com>
Date: Fri, 28 Oct 2022 00:41:04 +0000
Subject: [PATCH 0248/1922] Check all CUDA API calls for errors in caffe2/
 (#81816)

Test Plan: Sandcastle

Differential Revision: D35194868

Pull Request resolved: https://github.com/pytorch/pytorch/pull/81816
Approved by: https://github.com/ezyang
---
 caffe2/core/context_gpu.cu                    |  6 +--
 caffe2/core/context_gpu.h                     | 12 +++---
 .../generate_proposals_op_util_nms_gpu.cu     | 42 ++++++++++---------
 ...generate_proposals_op_util_nms_gpu_test.cc |  2 +-
 .../rnn/recurrent_network_executor_gpu.cc     |  3 +-
 caffe2/operators/scale_blobs_op.cu            |  8 ++--
 caffe2/utils/math/elementwise.cu              |  2 +-
 caffe2/utils/math/reduce.cu                   |  4 +-
 caffe2/utils/math_gpu.cu                      |  8 ++--
 9 files changed, 43 insertions(+), 44 deletions(-)

diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index bfa563ca6b8bb..83b8a049b8721 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -235,7 +235,7 @@ static void Caffe2InitializeCuda() {
         // a reserved flag for cudaDeviceEnablePeerAccess that should always be
         // zero currently.
         // It is ok if peer access is already enabled...
-        cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
+        cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaDeviceEnablePeerAccess(j, 0));
         if ((err != cudaErrorPeerAccessAlreadyEnabled) &&
             (err != cudaSuccess)) {
           CAFFE_THROW(cudaGetErrorString(err));
@@ -351,7 +351,7 @@ struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
       CUDA_ENFORCE(cudaHostUnregister(data));
       GetDefaultCPUAllocator()->raw_deleter()(data);
     } else {
-      cudaError_t err = cudaFreeHost(data);
+      cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaFreeHost(data));
       profiledCPUMemoryReporter().Delete(data);
       if (err == cudaErrorInvalidValue) {
         free(data);
@@ -598,7 +598,7 @@ struct DefaultCUDAAllocator final : public at::Allocator {
     switch (g_cuda_memory_pool_type) {
       case CudaMemoryPoolType::NONE: {
         // If memory pool is not set up, use simple cudaFree.
-        cudaError_t error = cudaFree(ptr);
+        cudaError_t error = C10_CUDA_ERROR_HANDLED(cudaFree(ptr));
         // For some reason, in Python runtime we sometimes delete a data pointer
         // after the cuda runtime exits - this is odd but is probably caused by
         // a static workspace that pycaffe2 uses, and the destruction got
diff --git a/caffe2/core/context_gpu.h b/caffe2/core/context_gpu.h
index e411d9cd735f1..611b2550c7e0c 100644
--- a/caffe2/core/context_gpu.h
+++ b/caffe2/core/context_gpu.h
@@ -195,10 +195,6 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
   // SwitchToDevice()
   void FinishDeviceComputation() override {
     CUDA_ENFORCE(cudaStreamSynchronize(getCudaObjects().GetStream(gpu_id_)));
-    cudaError_t error = cudaGetLastError();
-    if (error != cudaSuccess) {
-      CAFFE_THROW("Encountered CUDA error: ", cudaGetErrorString(error));
-    }
   }
 
   inline int device_id() const {
@@ -309,11 +305,13 @@ class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
   }
 
   static bool IsStreamFree(const DeviceOption& option, StreamId stream_id) {
-    auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id);
-    auto status = cudaStreamQuery(stream);
+    const auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id);
+    const auto status = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream));
     if (status == cudaErrorNotReady) {
       // ignore and clear the error if not ready
-      (void)cudaGetLastError();
+      C10_CUDA_CLEAR_ERROR();
+    } else {
+      C10_CUDA_CHECK(status); // Reraise error
     }
     return status == cudaSuccess;
   }
diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu.cu b/caffe2/operators/generate_proposals_op_util_nms_gpu.cu
index 9776266154cf3..aac9b3b81db26 100644
--- a/caffe2/operators/generate_proposals_op_util_nms_gpu.cu
+++ b/caffe2/operators/generate_proposals_op_util_nms_gpu.cu
@@ -145,15 +145,15 @@ void nms_gpu_upright(
   // Overlapping CPU computes and D2H memcpy
   // both take about the same time
   cudaEvent_t copy_done;
-  cudaEventCreate(&copy_done);
+  C10_CUDA_CHECK(cudaEventCreate(&copy_done));
   int nto_copy = std::min(CHUNK_SIZE, N);
-  CUDA_CHECK(cudaMemcpyAsync(
+  C10_CUDA_CHECK(cudaMemcpyAsync(
       &h_delete_mask[0],
       &d_delete_mask[0],
       nto_copy * mask_ld * sizeof(int),
       cudaMemcpyDeviceToHost,
       context->cuda_stream()));
-  CUDA_CHECK(cudaEventRecord(copy_done, context->cuda_stream()));
+  C10_CUDA_CHECK(cudaEventRecord(copy_done, context->cuda_stream()));
   int offset = 0;
   std::vector<int> h_keep_sorted_list;
   std::vector<int> rmv(mask_ld, 0);
@@ -162,7 +162,7 @@ void nms_gpu_upright(
     int next_offset = offset + ncopied;
     nto_copy = std::min(CHUNK_SIZE, N - next_offset);
     if (nto_copy > 0) {
-      CUDA_CHECK(cudaMemcpyAsync(
+      C10_CUDA_CHECK(cudaMemcpyAsync(
           &h_delete_mask[next_offset * mask_ld],
           &d_delete_mask[next_offset * mask_ld],
           nto_copy * mask_ld * sizeof(int),
@@ -170,9 +170,10 @@ void nms_gpu_upright(
           context->cuda_stream()));
     }
     // Waiting for previous copy
-    CUDA_CHECK(cudaEventSynchronize(copy_done));
-    if (nto_copy > 0)
-      cudaEventRecord(copy_done, context->cuda_stream());
+    C10_CUDA_CHECK(cudaEventSynchronize(copy_done));
+    if (nto_copy > 0){
+      C10_CUDA_CHECK(cudaEventRecord(copy_done, context->cuda_stream()));
+    }
     for (int i = offset; i < next_offset; ++i) {
       int iblock = i / BOXES_PER_THREAD;
       int inblock = i % BOXES_PER_THREAD;
@@ -186,15 +187,15 @@ void nms_gpu_upright(
     }
     offset = next_offset;
   }
-  cudaEventDestroy(copy_done);
+  C10_CUDA_CHECK(cudaEventDestroy(copy_done));
 
   const int nkeep = h_keep_sorted_list.size();
-  cudaMemcpyAsync(
+  C10_CUDA_CHECK(cudaMemcpyAsync(
       d_keep_sorted_list,
       &h_keep_sorted_list[0],
       nkeep * sizeof(int),
       cudaMemcpyHostToDevice,
-      context->cuda_stream());
+      context->cuda_stream()));
 
   *h_nkeep = nkeep;
 }
@@ -502,15 +503,15 @@ void nms_gpu_rotated(
   // Overlapping CPU computes and D2H memcpy
   // both take about the same time
   cudaEvent_t copy_done;
-  cudaEventCreate(&copy_done);
+  C10_CUDA_CHECK(cudaEventCreate(&copy_done));
   int nto_copy = std::min(CHUNK_SIZE, N);
-  CUDA_CHECK(cudaMemcpyAsync(
+  C10_CUDA_CHECK(cudaMemcpyAsync(
       &h_delete_mask[0],
       &d_delete_mask[0],
       nto_copy * mask_ld * sizeof(int),
       cudaMemcpyDeviceToHost,
       context->cuda_stream()));
-  CUDA_CHECK(cudaEventRecord(copy_done, context->cuda_stream()));
+  C10_CUDA_CHECK(cudaEventRecord(copy_done, context->cuda_stream()));
   int offset = 0;
   std::vector<int> h_keep_sorted_list;
   std::vector<int> rmv(mask_ld, 0);
@@ -519,7 +520,7 @@ void nms_gpu_rotated(
     int next_offset = offset + ncopied;
     nto_copy = std::min(CHUNK_SIZE, N - next_offset);
     if (nto_copy > 0) {
-      CUDA_CHECK(cudaMemcpyAsync(
+      C10_CUDA_CHECK(cudaMemcpyAsync(
           &h_delete_mask[next_offset * mask_ld],
           &d_delete_mask[next_offset * mask_ld],
           nto_copy * mask_ld * sizeof(int),
@@ -527,9 +528,10 @@ void nms_gpu_rotated(
           context->cuda_stream()));
     }
     // Waiting for previous copy
-    CUDA_CHECK(cudaEventSynchronize(copy_done));
-    if (nto_copy > 0)
-      cudaEventRecord(copy_done, context->cuda_stream());
+    C10_CUDA_CHECK(cudaEventSynchronize(copy_done));
+    if (nto_copy > 0){
+      C10_CUDA_CHECK(cudaEventRecord(copy_done, context->cuda_stream()));
+    }
     for (int i = offset; i < next_offset; ++i) {
       int iblock = i / BOXES_PER_THREAD;
       int inblock = i % BOXES_PER_THREAD;
@@ -543,15 +545,15 @@ void nms_gpu_rotated(
     }
     offset = next_offset;
   }
-  cudaEventDestroy(copy_done);
+  C10_CUDA_CHECK(cudaEventDestroy(copy_done));
 
   const int nkeep = h_keep_sorted_list.size();
-  cudaMemcpyAsync(
+  C10_CUDA_CHECK(cudaMemcpyAsync(
       d_keep_sorted_list,
       &h_keep_sorted_list[0],
       nkeep * sizeof(int),
       cudaMemcpyHostToDevice,
-      context->cuda_stream());
+      context->cuda_stream()));
 
   *h_nkeep = nkeep;
 }
diff --git a/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc b/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc
index 6c8283b3d0fe4..ea656dd30e3b9 100644
--- a/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc
+++ b/caffe2/operators/generate_proposals_op_util_nms_gpu_test.cc
@@ -691,7 +691,7 @@ TEST(UtilsNMSTest, TestPerfRotatedNMS) {
 //           list_nitems * sizeof(int),
 //           cudaMemcpyDeviceToHost,
 //           cuda_context.cuda_stream()));
-//       CUDA_CHECK(cudaStreamSynchronize(cuda_context.cuda_stream()));
+//       CUDA_CHECK(cudaStreamSynchronize(cuda_context.cuda_stream());
 
 //       ASSERT_EQ(keep.size(), gpu_keep.size());
 //       std::sort(keep.begin(), keep.end());
diff --git a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc
index ef041959742ac..0356218c717f4 100644
--- a/caffe2/operators/rnn/recurrent_network_executor_gpu.cc
+++ b/caffe2/operators/rnn/recurrent_network_executor_gpu.cc
@@ -130,8 +130,7 @@ void CUDARecurrentNetworkExecutor::_ExecRange(int from, int to) {
   for (int stream_id = 0; stream_id <= std::min(stream_seq, max_streams - 1);
        stream_id++) {
     VLOG(1) << "Wait for stream:" << stream_id;
-    CUDA_CHECK(
-        cudaStreamSynchronize(CUDAContext::cuda_stream(gpu_id, stream_id)));
+    CUDA_CHECK(cudaStreamSynchronize(CUDAContext::cuda_stream(gpu_id, stream_id)));
   }
 }
 
diff --git a/caffe2/operators/scale_blobs_op.cu b/caffe2/operators/scale_blobs_op.cu
index 01421fb822c6f..7305fddece96f 100644
--- a/caffe2/operators/scale_blobs_op.cu
+++ b/caffe2/operators/scale_blobs_op.cu
@@ -138,9 +138,9 @@ REGISTER_CUDA_OPERATOR(ScaleBlobs, ScaleBlobsOp<CUDAContext>);
         }
       }
     }
-    cudaMalloc(&dStartCoorArr, sizeof(int) * coorArrSize);
-    cudaMemcpy(dStartCoorArr, startCoorArr, sizeof(int) * coorArrSize,
-    cudaMemcpyHostToDevice);
+    C10_CUDA_CHECK(cudaMalloc(&dStartCoorArr, sizeof(int) * coorArrSize));
+    C10_CUDA_CHECK(cudaMemcpy(dStartCoorArr, startCoorArr, sizeof(int) * coorArrSize,
+      cudaMemcpyHostToDevice));
 
   // ScaleBlobsCUDAKernelBalanced kernel launch
   ScaleBlobsCUDAKernelBalanced<T>
@@ -150,7 +150,7 @@ REGISTER_CUDA_OPERATOR(ScaleBlobs, ScaleBlobsOp<CUDAContext>);
      dOutputArr);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 
-  cudaFree(dStartCoorArr);
+  C10_CUDA_CHECK(cudaFree(dStartCoorArr));
 */
 
 template <typename T>
diff --git a/caffe2/utils/math/elementwise.cu b/caffe2/utils/math/elementwise.cu
index b41d2590e9296..d1911ae4db4c7 100644
--- a/caffe2/utils/math/elementwise.cu
+++ b/caffe2/utils/math/elementwise.cu
@@ -305,7 +305,7 @@ CAFFE2_SPECIALIZED_HALF_SCALE_CUDA_KERNEL(float)
       return;                                                             \
     }                                                                     \
     if (alpha == T(0)) {                                                  \
-      cudaMemsetAsync(Y, 0, sizeof(T) * N, context->cuda_stream());       \
+      C10_CUDA_CHECK(cudaMemsetAsync(Y, 0, sizeof(T) * N, context->cuda_stream()));       \
     } else {                                                              \
       thrust::fill(                                                       \
           thrust::cuda::par.on(context->cuda_stream()), Y, Y + N, alpha); \
diff --git a/caffe2/utils/math/reduce.cu b/caffe2/utils/math/reduce.cu
index 69a6469d8ed15..d59cbd387753e 100644
--- a/caffe2/utils/math/reduce.cu
+++ b/caffe2/utils/math/reduce.cu
@@ -418,12 +418,12 @@ void MomentsCUDA(
     return;
   }
   if (std::equal(X_dims, X_dims + ndim, Y_dims)) {
-    cudaMemcpyAsync(
+    C10_CUDA_CHECK(cudaMemcpyAsync(
         mean,
         X,
         sizeof(T) * X_size,
         cudaMemcpyDeviceToDevice,
-        context->cuda_stream());
+        context->cuda_stream()));
     Set<T, CUDAContext>(Y_size, T(0), var, context);
     return;
   }
diff --git a/caffe2/utils/math_gpu.cu b/caffe2/utils/math_gpu.cu
index 54b0a9391c263..4ad249dadc7e4 100644
--- a/caffe2/utils/math_gpu.cu
+++ b/caffe2/utils/math_gpu.cu
@@ -2685,12 +2685,12 @@ CAFFE2_CUDA_EXPORT void CopyVector<float, CUDAContext>(
     float* dst,
     CUDAContext* context) {
   if (src != dst && N > 0) {
-    cudaMemcpyAsync(
+    C10_CUDA_CHECK(cudaMemcpyAsync(
         dst,
         src,
         sizeof(float) * N,
         cudaMemcpyDeviceToDevice,
-        context->cuda_stream());
+        context->cuda_stream()));
   }
 }
 
@@ -2701,12 +2701,12 @@ CAFFE2_CUDA_EXPORT void CopyVector<int, CUDAContext>(
     int* dst,
     CUDAContext* context) {
   if (src != dst && N > 0) {
-    cudaMemcpyAsync(
+    C10_CUDA_CHECK(cudaMemcpyAsync(
         dst,
         src,
         sizeof(int) * N,
         cudaMemcpyDeviceToDevice,
-        context->cuda_stream());
+        context->cuda_stream()));
   }
 }
 

From a067e7a0d8176e312ee481fc7a612b3bd4c9ea1f Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 27 Oct 2022 20:13:27 +0000
Subject: [PATCH 0249/1922] [FSDP] Fix wrapped module changing after ctor
 (#87837)

Recently, I retired `FlattenParamsWrapper`, which meant that FSDP registers its `FlatParameter` on the wrapped module instead of the `FlattenParamsWrapper` instance. This is only relevant for `use_orig_params=False`.

If the user changes an FSDP instance's wrapped module after the FSDP constructor, then the `FlatParameter` is no longer registered on the wrapped module. This can cause issues for full state dict, which checks if the `FlatParameter` is currently registered as an early return condition for `rank0_only=True`.

The solution in this PR is to re-establish the wrapped module in `_lazy_init()`, de-registering from the old wrapped module and re-registering to the new wrapped module, where the assumption is that the user should not modify the module structure upon `_lazy_init()`.

The direct access to the private attribute `_parameters` from `nn.Module` is not ideal, but we already rely on it for the dynamic `FlatParameter` registration. The tradeoff is whether we want an additional `nn.Module` wrapper (`FlattenParamsWrapper`) and use `delattr` plus a singleton list to do the dynamic registration or we want to access `_parameters`. If this becomes a problem, we can work with Core team on a solution.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87837
Approved by: https://github.com/zhaojuanmao
---
 test/distributed/fsdp/test_fsdp_misc.py       | 94 +++++++++++++++++++
 test/distributed/fsdp/test_fsdp_state_dict.py | 83 +++++++++-------
 torch/distributed/fsdp/_state_dict_utils.py   |  2 +-
 .../fsdp/fully_sharded_data_parallel.py       | 41 +++++++-
 4 files changed, 184 insertions(+), 36 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index 98cd6488ae5e7..b57f5a1a94da2 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -9,12 +9,22 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    _CHECKPOINT_PREFIX,
+    apply_activation_checkpointing,
+    checkpoint_wrapper,
+    CheckpointImpl,
+)
 from torch.distributed.fsdp import (
     CPUOffload,
     FlatParameter,
     FullyShardedDataParallel as FSDP,
     ShardingStrategy,
 )
+from torch.distributed.fsdp.fully_sharded_data_parallel import (
+    FLAT_PARAM,
+    FSDP_WRAPPED_MODULE,
+)
 from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
@@ -489,6 +499,90 @@ def __init__(self, rank):
                 fsdp, process_group=self.process_group, assert_fn=self.assertEqual
             )
 
+    @skip_if_lt_x_gpu(2)
+    def test_change_wrapped_module_after_ctor(self):
+        """
+        Tests changing an FSDP instance's wrapped module after the FSDP
+        constructor.
+        """
+        dist.set_debug_level(dist.DebugLevel.DETAIL)
+
+        class Model(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.seq1 = nn.Sequential(
+                    nn.Linear(5, 5),
+                    nn.Linear(5, 5),
+                )
+                self.seq2 = nn.Sequential(nn.Linear(5, 5))
+                self.lin = nn.Linear(5, 5)
+                self.relu = nn.ReLU()
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return self.lin(self.relu(self.seq2(self.relu(self.seq1(x)))))
+
+        def get_fsdp_model():
+            fsdp_kwargs = {"use_orig_params": False}
+            model = Model().cuda()
+            model.seq1 = FSDP(model.seq1, **fsdp_kwargs)
+            model.seq2[0] = FSDP(model.seq2[0], **fsdp_kwargs)
+            model = FSDP(model, **fsdp_kwargs)
+            return model
+
+        # Wrap with `CheckpointWrapper` *after* FSDP construction
+        model = get_fsdp_model()
+        non_reentrant_wrapper = functools.partial(
+            checkpoint_wrapper,
+            offload_to_cpu=False,
+            checkpoint_impl=CheckpointImpl.NO_REENTRANT,
+        )
+        apply_activation_checkpointing(
+            model,
+            checkpoint_wrapper_fn=non_reentrant_wrapper,
+            check_fn=lambda submodule: isinstance(submodule, nn.Linear),
+        )
+
+        # Check that `seq2[0]` only has a single `FlatParameter` registered and
+        # that it has the `CheckpointWrapper` prefix in its FQN since it was
+        # registered to the `Linear` wrapped module in the FSDP constructor and
+        # only wrapped with `CheckpointWrapper` after
+        seq2_0_named_params = list(model.seq2[0].named_parameters())
+        self.assertEqual(len(seq2_0_named_params), 1)
+        self.assertTrue(type(seq2_0_named_params[0][1]) is FlatParameter)
+        self.assertTrue(_CHECKPOINT_PREFIX in seq2_0_named_params[0][0])
+
+        # Trigger the re-registration via `_lazy_init()`, and check for a
+        # warning, which is only emitted for DETAIL
+        with self.assertWarnsRegex(
+            UserWarning,
+            "The FSDP wrapped module changed from Linear.*to CheckpointWrapper",
+        ):
+            model._lazy_init()
+
+        # Check that now the `FlatParameter` is registered to the
+        # `CheckpointWrapper`, which is now the new wrapped module
+        seq2_0_named_params = list(model.seq2[0].named_parameters())
+        self.assertEqual(len(seq2_0_named_params), 1)
+        self.assertTrue(type(seq2_0_named_params[0][1]) is FlatParameter)
+        self.assertFalse(_CHECKPOINT_PREFIX in seq2_0_named_params[0][0])
+        self.assertFalse(isinstance(model.seq2[0].module, nn.Linear))
+
+        # Check that replacing a module *after* FSDP construction errors
+        model = get_fsdp_model()
+        # NOTE: Setting `model.seq2[0].module = nn.Linear(3, 3)` does not save
+        # to the FSDP instance's `module` attribute since `module` is a
+        # property, meaning that it would not actually change the wrapped
+        # module, so we use `setattr()` like in `_recursive_wrap()`.
+        setattr(model.seq2[0], FSDP_WRAPPED_MODULE, nn.Linear(3, 3))
+        with self.assertRaisesRegex(RuntimeError, "are invalid behavior"):
+            model._lazy_init()
+
+        # Check that deleting the `FlatParameter` errors
+        model = get_fsdp_model()
+        delattr(model.seq2[0].module, FLAT_PARAM)
+        with self.assertRaisesRegex(RuntimeError, "are invalid behavior"):
+            model._lazy_init()
+
 
 instantiate_parametrized_tests(TestFSDPMisc)
 
diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py
index f5a401590414a..b8cbae5c2270e 100644
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@@ -11,7 +11,9 @@
 import torch.nn as nn
 from torch import distributed as dist
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    apply_activation_checkpointing,
     checkpoint_wrapper,
+    CheckpointImpl,
 )
 from torch.distributed.fsdp import (
     CPUOffload,
@@ -109,10 +111,22 @@ class TestFSDPStateDict(FSDPTest):
     def world_size(self):
         return 2
 
-    def _broadcast_state_dict(self, state_dict):
+    def _broadcast_state_dict(self, model, state_dict):
+        if not isinstance(model, FSDP):
+            # For non-FSDP root, some parts of the model state on rank 0 may
+            # not be on CPU, so we move everything to CPU to avoid issues like:
+            # https://github.com/pytorch/pytorch/issues/77113.
+            for param_name, param in state_dict.items():
+                if param.device != torch.device("cpu"):
+                    state_dict[param_name] = param.cpu()
+
         olist = [state_dict if self.rank == 0 else None]
         dist.broadcast_object_list(olist)
-        return olist[0]
+        state_dict = olist[0]
+        # Ensure that the state is on CUDA
+        for param_name in state_dict.keys():
+            state_dict[param_name] = state_dict[param_name].cuda()
+        return state_dict
 
     def _compare_models(self, model, model_new, assert_fn, check_fp16=False):
         assert assert_fn in (self.assertEqual, self.assertNotEqual)
@@ -220,27 +234,52 @@ def _validate_state_dict_contents(
 
     @skip_if_lt_x_gpu(2)
     @parametrize("state_dict_type", _UNFLATTENED_STATE_DICT_IMPLS)
-    @parametrize("checkpoint_wrap", ["first", "second", "both"])
+    @parametrize(
+        "checkpoint_wrap",
+        ["source", "dest", "both", "source_after_wrap", "both_after_wrap"],
+    )
+    @parametrize("rank0_only_and_offload", [False, True])
     def test_fsdp_state_dict_with_activation_checkpoint(
-        self, state_dict_type, checkpoint_wrap
+        self, state_dict_type, checkpoint_wrap, rank0_only_and_offload
     ):
         """Tests saving the state dict, zeroing a target model's parameters, and
         loading the state dict, where the source and target models may have a
         checkpoint wrapper."""
+
+        def apply_ac_to_linears(model) -> None:
+            non_reentrant_wrapper = partial(
+                checkpoint_wrapper,
+                offload_to_cpu=False,
+                checkpoint_impl=CheckpointImpl.NO_REENTRANT,
+            )
+            apply_activation_checkpointing(
+                model,
+                checkpoint_wrapper_fn=non_reentrant_wrapper,
+                check_fn=lambda submodule: isinstance(submodule, nn.Linear),
+            )
+
         for model_call in [
             partial(self._get_simple_model),
             partial(self._get_simple_nested_model),
         ]:
-            model = model_call(checkpoint_wrap=(checkpoint_wrap in ["first", "both"]))
-            with FSDP.state_dict_type(model, STATE_DICT_MAPPING[state_dict_type]):
+            model = model_call(checkpoint_wrap=(checkpoint_wrap in ("source", "both")))
+            if checkpoint_wrap in ("source_after_wrap", "both_after_wrap"):
+                apply_ac_to_linears(model)
+            with self._get_state_dict_mgr(
+                model, state_dict_type, rank0_only_and_offload
+            ):
                 state_dict = _gather_state_dict(_get_state_dict(model, False, False))
                 # Possibly wrap new model in activation checkpoint wrapper to test save/
                 # load with this wrapper
                 model_new = model_call(
-                    checkpoint_wrap=(checkpoint_wrap in ["second", "both"])
+                    checkpoint_wrap=(checkpoint_wrap in ("dest", "both"))
                 )
+                if checkpoint_wrap == "both_after_wrap":
+                    apply_ac_to_linears(model_new)
                 _zero_model(model_new)
                 self._compare_models(model, model_new, self.assertNotEqual)
+                if rank0_only_and_offload:
+                    state_dict = self._broadcast_state_dict(model, state_dict)
                 # Would fail if checkpoint_wrapper did not correctly implement state_dict pre/post hooks
                 model_new.load_state_dict(state_dict, strict=True)
                 self._compare_models(model, model_new, self.assertEqual)
@@ -417,17 +456,7 @@ def test_basic_save_and_load_state_dict(
 
             # Verify parameters are the same in the new model.
             if state_dict_rank0_and_offload:
-                # Broadcast the state dict and move it back to GPU in
-                # preparation for loading.
-                if not isinstance(model, FSDP):
-                    # Move everything to CPU to avoid running into
-                    # https://github.com/pytorch/pytorch/issues/77113, some params
-                    # will still be on GPU for non FSDP root modules.
-                    for k in fsdp_state_dict.keys():
-                        fsdp_state_dict[k] = fsdp_state_dict[k].cpu()
-                fsdp_state_dict = self._broadcast_state_dict(fsdp_state_dict)
-                for key in fsdp_state_dict.keys():
-                    fsdp_state_dict[key] = fsdp_state_dict[key].cuda()
+                fsdp_state_dict = self._broadcast_state_dict(model, fsdp_state_dict)
             with FSDP.state_dict_type(model_new, STATE_DICT_MAPPING[state_dict_type]):
                 model_new.load_state_dict(fsdp_state_dict, strict=True)
 
@@ -494,11 +523,7 @@ def test_save_and_load_after_forward_state_dict(
 
         # Load state_dict into zeroed model
         if state_dict_rank0_and_offload:
-            # Broadcast the state dict and move it back to GPU in
-            # preparation for loading.
-            state_dict = self._broadcast_state_dict(state_dict)
-            for key in state_dict.keys():
-                state_dict[key] = state_dict[key].cuda()
+            state_dict = self._broadcast_state_dict(model, state_dict)
 
         with FSDP.state_dict_type(model, STATE_DICT_MAPPING[state_dict_type]):
             model.load_state_dict(state_dict, strict=True)
@@ -675,17 +700,7 @@ def test_state_dict_load_into_local_module(
         # Load fsdp's full state dict into the local and verify params are as
         # expected.
         if state_dict_rank0_and_offload:
-            # Broadcast + CUDA state_dict
-            if not isinstance(model, FSDP):
-                # Some portions of the model on rank 0 might not be on CPU,
-                # move everything to CPU to avoid running into
-                # https://github.com/pytorch/pytorch/issues/77113.
-                for k, t in fsdp_state_dict.items():
-                    if t.device != torch.device("cpu"):
-                        fsdp_state_dict[k] = t.cpu()
-            fsdp_state_dict = self._broadcast_state_dict(fsdp_state_dict)
-            for key in fsdp_state_dict.keys():
-                fsdp_state_dict[key] = fsdp_state_dict[key].cuda()
+            fsdp_state_dict = self._broadcast_state_dict(model, fsdp_state_dict)
 
         # if self.rank == 0:
         blank_local_model.load_state_dict(fsdp_state_dict, strict=True)
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index 90083ef85b18e..0aeaf5035e261 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -210,7 +210,7 @@ def _local_pre_load_state_dict_hook(
     assert len(shards), "load_local_state_dict assume one shard per ShardedTensor."
     load_tensor = shards[0].tensor
 
-    # Get the metada of the flat_param to decide whether to pad the loaded
+    # Get the metadata of the flat_param to decide whether to pad the loaded
     # tensor.
     flat_param = module._handles[0].flat_param
     assert flat_param is not None
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 8cd18474d959d..21b7787df766c 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1884,6 +1884,44 @@ def _lazy_init(self) -> None:
         # to non-root instances
         inconsistent_limit_all_gathers = False
         for fsdp_module in self.fsdp_modules(self):
+            if not fsdp_module._use_orig_params and fsdp_module._has_params:
+                # Check if the wrapped module changed after construction
+                # (e.g. applying the activation checkpointing wrapper) and
+                # if so, de-register the `FlatParameter` from the old
+                # wrapped module and register it to the new wrapped module
+                # NOTE: The `FlatParameter`'s FQN metadata is not updated, so
+                # any added wrappers must clean their prefixes from FQNs.
+                flat_param = fsdp_module._handles[0].flat_param
+                target_submodule = None
+                target_name = None
+                for submodule in fsdp_module.modules():
+                    for param_name, param in submodule._parameters.items():
+                        if flat_param is param:  # found registered `FlatParameter`
+                            target_submodule = submodule
+                            target_name = param_name
+                            break
+                    if target_submodule is not None:
+                        break
+                if (
+                    target_submodule is not None
+                    and target_submodule is not fsdp_module.module
+                ):
+                    assert target_name is not None
+                    if fsdp_module._debug_level == dist.DebugLevel.DETAIL:
+                        warnings.warn(
+                            "The FSDP wrapped module changed from "
+                            f"{target_submodule} to {fsdp_module.module} on "
+                            f"rank {fsdp_module.rank}. {fsdp_module}"
+                        )
+                    target_submodule._parameters.pop(target_name)  # de-register
+                    fsdp_module._register_flat_param()  # re-register
+                elif target_submodule is None:
+                    raise RuntimeError(
+                        "Either the FSDP wrapped module was removed from "
+                        "the model or its `FlatParameter` was manually "
+                        f"de-registered on rank {fsdp_module.rank}. Both of "
+                        f"these are invalid behavior. {fsdp_module}"
+                    )
             if fsdp_module is not self:
                 # Relax the assert for non-root FSDP instances in case the
                 # nested initialized module is wrapped again in FSDP later (e.g.
@@ -2842,7 +2880,8 @@ def _deregister_flat_param(self):
         attribute but dynamically change whether it is visible to ``nn.Module``
         methods.
         """
-        self.module._parameters.pop(FLAT_PARAM, None)
+        if self._has_params:
+            self.module._parameters.pop(FLAT_PARAM, None)
 
     @contextlib.contextmanager
     def _deregister_orig_params_ctx(self):

From 8cdcdfafdd4ab25e5041d1e9bb01129653ed702e Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Thu, 27 Oct 2022 21:46:25 +0000
Subject: [PATCH 0250/1922] Add decomposition for diagonal_scatter (#87282)

cc @ezyang @mruberry @ngimel @Lezcano @fdrocha
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87282
Approved by: https://github.com/mruberry
---
 test/functorch/test_aotdispatch.py            |  5 -----
 test/test_proxy_tensor.py                     |  1 -
 torch/_refs/__init__.py                       | 21 +++++++++++++++++++
 .../_internal/common_methods_invocations.py   |  6 ++++++
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 15e0e6a43c3b8..5f6e8d9bf238f 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1001,15 +1001,10 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
-    xfail('diag', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('diagonal', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('diagonal_copy', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('diagonal_scatter', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('diff', ''),  # aten.zeros_like.default - couldn't find symbolic meta function/decomposition
     xfail('digamma', ''),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
     xfail('dsplit', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('einsum', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('fft.fft2', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('fft.fft', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('fft.fftn', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 6cb7d280cc19a..4f4265b8dc6a2 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1150,7 +1150,6 @@ def f(a, b, c, d, e):
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
     xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
-    xfail('diagonal_scatter', ''),  # aten.diagonal_scatter.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
     xfail('dsplit', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 5cee1c9a684bb..dd60ac2c1d257 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -222,6 +222,7 @@
     "diag",
     "diagonal",
     "diagonal_copy",
+    "diagonal_scatter",
     "dsplit",
     "dstack",
     "expand",
@@ -3501,6 +3502,26 @@ def diag(
         return torch.diagonal_copy(self, offset)
 
 
+@register_decomposition(torch.ops.aten.diagonal_scatter)
+@out_wrapper()
+def diagonal_scatter(
+    input: TensorLikeType,
+    src: TensorLikeType,
+    offset: int = 0,
+    dim1: int = 0,
+    dim2: int = 1,
+) -> TensorLikeType:
+    out = input.clone()
+    diag = out.diagonal(offset, dim1, dim2)
+    check(
+        diag.shape == src.shape,
+        lambda: "expected src to have a size equal to the diagonal of the input."
+        f"Got {src.shape} for a diagonal of shape {diag.shape}",
+    )
+    copy_to(diag, src)
+    return out
+
+
 @register_decomposition(torch.ops.aten.diagonal)
 def diagonal(
     self: TensorLikeType,
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index fb4238234a98f..f562aace3cc03 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -17750,6 +17750,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         torch_opinfo_name="diagonal_copy",
         supports_nvfuser=False,
     ),
+    PythonRefInfo(
+        "_refs.diagonal_scatter",
+        torch_opinfo_name="diagonal_scatter",
+        supports_out=True,
+        supports_nvfuser=False,
+    ),
     PythonRefInfo(
         "_refs.diag_embed",
         torch_opinfo_name="diag_embed",

From 9e30baec28a16dec90d550aa90683af3c53d5fd0 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Thu, 27 Oct 2022 21:46:25 +0000
Subject: [PATCH 0251/1922] Fix decomposition for std (#87181)

The previous implementation was lacking a few features and incurred on a
pretty large error

cc @ezyang @mruberry @ngimel @Lezcano @fdrocha
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87181
Approved by: https://github.com/ngimel, https://github.com/peterbell10
---
 test/test_ops.py                |  1 -
 torch/_decomp/decompositions.py | 40 ---------------------------------
 torch/_refs/__init__.py         |  1 +
 3 files changed, 1 insertion(+), 41 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index fa8812aa5d8ee..4f062d4c54034 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1674,7 +1674,6 @@ class TestRefsOpsInfo(TestCase):
         # duplicated in _decomp and _refs
         '_refs.nn.functional.elu',
         '_refs.nn.functional.mse_loss',
-        '_refs.var',
         '_refs.rsub',
         # duplicated due to efficiency concerns of the ref vs the decomp
         '_refs.index_add_',
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 234e43d12bf81..7ba4e6c4e97c8 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1446,46 +1446,6 @@ def xlogy(self: Tensor, other: Tensor) -> Tensor:
     )
 
 
-@register_decomposition(aten.var.correction)
-@reduction_complex_to_real
-def var_correction(
-    x: Tensor,
-    dim: Optional[List[int]],
-    correction: Optional[int] = None,
-    keepdim: bool = False,
-):
-    dims: List[int] = [] if dim is None else dim
-
-    if x.is_complex():
-        # For complex, calculate variance of real and imaginary components
-        # separately then add to get overall variance.
-        real_in = x.real
-        var_real = torch.var(real_in, dims, correction=correction, keepdim=keepdim)
-        imag_in = x.imag
-        var_imag = torch.var(imag_in, dims, correction=correction, keepdim=keepdim)
-        return var_real + var_imag
-
-    if correction is None:
-        correction = 1
-
-    if len(dims) == 0:
-        n = prod(x.shape)  # type: ignore[arg-type]
-    else:
-        n = 1
-        for d in dims:
-            n *= x.shape[d]
-
-    mean = torch.mean(x, dims, True)
-    sub = x - mean
-    sq = sub * sub
-    sum = torch.sum(sq, dims, keepdim)
-
-    if correction:
-        n = n - correction
-
-    return sum / n
-
-
 @register_decomposition(aten.std.correction)
 @reduction_complex_to_real
 def std_decomposition(
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index dd60ac2c1d257..c01c0da051f7e 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2204,6 +2204,7 @@ def _dim_var_dispatch(dim=None, unbiased=None):
     return dim, unbiased
 
 
+@register_decomposition(torch.ops.aten.var)
 @out_wrapper()
 def var(
     a: TensorLikeType,

From 96735897633b47c0917d7ec32e9fe5a6ee96b6d1 Mon Sep 17 00:00:00 2001
From: Tugsbayasgalan Manlaibaatar <tmanlaibaatar@fb.com>
Date: Thu, 27 Oct 2022 04:04:26 +0000
Subject: [PATCH 0252/1922] [EZ] Fix simple bug in torchdynamo (#87821)

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87821
Approved by: https://github.com/voznesenskym, https://github.com/jansel
---
 torch/_dynamo/variables/user_defined.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/variables/user_defined.py b/torch/_dynamo/variables/user_defined.py
index 2d33c8328268a..09d7893bef665 100644
--- a/torch/_dynamo/variables/user_defined.py
+++ b/torch/_dynamo/variables/user_defined.py
@@ -68,7 +68,7 @@ def call_method(
 
             return variables.ListVariable(subs_as_vars, **options)
 
-        return super().call_method(tx, args, kwargs)
+        return super().call_method(tx, name, args, kwargs)
 
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"

From 80a6046f7bdc99531481f5fc042bcc03427f05a7 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@meta.com>
Date: Fri, 28 Oct 2022 01:28:34 +0000
Subject: [PATCH 0253/1922] [Static Runtime] Move PrepackWeights to
 internal-only graph passes (#87799)

Summary:
The pass introduces an `fb::` operator and thus cannot be used in OSS.

The test failure was not exposed because the Static Runtime tests have been disabled in OSS for a while. The Dev Infra folks encountered this failure when re-enabling the tests.

Test Plan: Existing tests

Differential Revision: D40724547

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87799
Approved by: https://github.com/huydhn
---
 .../static_runtime/test_static_runtime.cc     | 35 -------------------
 torch/csrc/jit/runtime/static/impl.cpp        |  2 +-
 2 files changed, 1 insertion(+), 36 deletions(-)

diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index cf102224fc087..883db0c141b4c 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -3676,41 +3676,6 @@ TEST(StaticRuntime, ClampNaNToNum) {
   testStaticRuntime(src1, {a.to(at::kDouble)}, {b.to(at::kDouble)}, /*use_allclose=*/true, /*use_equalnan=*/true);
 }
 
-TEST(StaticRuntime, PrepackWeights) {
-  const std::string src = R"IR(
-    graph(%input: Tensor, %weight: Tensor, %bias: Tensor?, %scale: Tensor, %zero_point: Tensor):
-        %none: NoneType = prim::Constant()
-        %result: Tensor = fb::quantized_linear_unpacked_weight_v2(%input, %weight, %bias, %scale, %zero_point)
-        %dequantized: Tensor = aten::dequantize(%result)
-        return (%dequantized)
-  )IR";
-
-  auto graph = getGraphFromIR(src);
-  PrepackWeights(graph);
-  ASSERT_TRUE(graphHasOp(graph, "quantized::linear"));
-  ASSERT_TRUE(graphHasOp(graph, "quantized::linear_prepack"));
-  ASSERT_FALSE(graphHasOp(graph, "fb::quantized_linear_unpacked_weight_v2"));
-
-  auto scale = at::tensor({2}, at::kFloat);
-  auto zero_point = at::tensor({3}, at::kLong);
-
-  auto weight =
-      at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQInt8);
-  auto input =
-      at::quantize_per_tensor(torch::randn({3, 2}), 2, 3, torch::kQUInt8);
-  auto args1 = std::vector<IValue>{input, weight, c10::nullopt, scale, zero_point};
-
-  auto weight_2 =
-      at::quantize_per_tensor(torch::randn({8, 3}), 2, 3, torch::kQInt8);
-  auto input_2 =
-      at::quantize_per_tensor(torch::randn({9, 3}), 2, 3, torch::kQUInt8);
-  auto bias_2 = torch::randn({3}, torch::kFloat);
-  auto args2 = std::vector<IValue>{input, weight, bias_2, scale, zero_point};
-
-  testStaticRuntime(src, args1);
-  testStaticRuntime(src, args2);
-}
-
 TEST(StaticRuntime, IfReturningTuple) {
   const auto src = R"JIT(
     def forward(self, x, y, cond: bool, idx: int):
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index bef31efb50d17..4b8015f85c0a9 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -172,7 +172,6 @@ void OptimizeGraph(
   UseVariadicStack(graph);
   EliminateTrivialEquallySplit(graph);
   EliminateExtraPermuteOps(graph);
-  PrepackWeights(graph);
 
   if (opts.enable_out_variant) {
     UseVariadicOp(
@@ -199,6 +198,7 @@ void OptimizeGraph(
     }
     FuseListUnpack(graph);
     RemoveUnnecessaryOutputs(graph);
+    PrepackWeights(graph);
 #endif
   }
 

From a0952431423e4701c0d41f8b9ddb71a1f2f940f2 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 27 Oct 2022 16:59:49 +0000
Subject: [PATCH 0254/1922] [AC] Return `None` from
 `apply_activation_checkpointing()` (#87871)

`_recursive_wrap()` returns `Tuple[nn.Module, int]`, where the `nn.Module` is the in-place modified module and the `int` is the numel wrapped. In that sense, the return value is not meant to be publicly used. The `apply_activation_checkpointing()` docs already suggest that the function returns `None`, so this PR simply follows that.

**Test Plan**
CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87871
Approved by: https://github.com/zhaojuanmao
---
 torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py b/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
index 30c8cb4e6beb8..9e72fb6a21de9 100644
--- a/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
+++ b/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
@@ -268,7 +268,7 @@ def apply_activation_checkpointing(
     # TODO: Importing inside function to avoid circular import issue between FSDP and
     # checkpoint_wrapper. This can be resolved once wrap() APIs are decoupled from FSDP code.
     from torch.distributed.fsdp.wrap import _recursive_wrap, lambda_auto_wrap_policy
-    return _recursive_wrap(
+    _recursive_wrap(
         module=model,
         auto_wrap_policy=partial(lambda_auto_wrap_policy, lambda_fn=check_fn),
         wrapper_cls=checkpoint_wrapper_fn,

From 9467cf204398e8413c5013a6b96742a392ebb15f Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 28 Oct 2022 02:20:24 +0000
Subject: [PATCH 0255/1922] Revert "Enable mypy check for distributed.py, and
 fix type errors (#87543)"

This reverts commit 2cc624cd4318414905d2475432aee13db9031cc6.

Reverted https://github.com/pytorch/pytorch/pull/87543 on behalf of https://github.com/weiwangmeta due to breaking internal builds
---
 torch/_C/_distributed_c10d.pyi    | 68 +++++++++----------------------
 torch/nn/parallel/distributed.py  | 25 ++++--------
 torch/nn/parallel/distributed.pyi | 21 ++++++++++
 3 files changed, 49 insertions(+), 65 deletions(-)
 create mode 100644 torch/nn/parallel/distributed.pyi

diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi
index 493e1d8846e71..bdf0166b8daa9 100644
--- a/torch/_C/_distributed_c10d.pyi
+++ b/torch/_C/_distributed_c10d.pyi
@@ -1,9 +1,8 @@
 from datetime import timedelta
 from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple, Union, overload
+from typing import Optional, List, Any, Tuple, overload, Union
 
 from torch import Tensor
-from torch.futures import Future
 
 # This module is defined in torch/csrc/distributed/c10d/init.cpp
 
@@ -33,36 +32,13 @@ class Reducer:
         self,
         params: List[Tensor],
         bucket_indices: List[List[int]],
-        per_bucket_size_limits: List[int],
         process_group: ProcessGroup,
-        expect_sparse_gradients: List[bool] = [],
-        bucket_bytes_cap: int = ...,  # kDefaultBucketBytesCap in reducer.hpp
-        find_unused_parameters: bool = False,
-        gradient_as_bucket_view: bool = False,
-        param_to_name_mapping: Dict[int, str] = {},
-        first_bucket_types_cap: int = ...,  # kDefaultFirstBucketBytes in reducer.hpp
+        expect_sparse_gradients: List[bool],
+        bucket_bytes_cap: int,
+        find_unused_parameters: bool,
+        gradient_as_bucket_view: bool,
     ): ...
-    def prepare_for_forward(self) -> None: ...
-    def prepare_for_backward(self, output: List[Tensor]) -> None: ...
-    def get_backward_stats(self) -> List[int]: ...
-    def _install_post_backward_futures(self, futures: List[Future]) -> None: ...
-    def _rebuild_buckets(self) -> bool: ...
-    def _get_zeros_like_grad_buckets(self) -> List[GradBucket]: ...
-    def _push_all_rebuilt_params(self) -> None: ...
-    def _set_forward_pass_work_handle(
-        self, work: Work, use_static_world_size: bool
-    ): ...
-    def _get_local_used_map(self) -> Tensor: ...
-    def _set_ddp_runtime_logging_sample_rate(
-        self, sample_rate: int
-    ) -> None: ...
-    def _set_static_graph(self) -> None: ...
-    def _run_comm_hook(self, bucket: GradBucket) -> Future: ...
-    def set_logger(self, logger: Logger) -> None: ...
-
-class DDPLoggingData:
-    strs_map: Dict[str, str]
-    ints_map: Dict[str, int]
+    ...
 
 class Logger:
     def __init__(self, reducer: Reducer): ...
@@ -73,14 +49,8 @@ class Logger:
         output_device: int,
         broadcast_buffers: bool,
         has_sync_bn: bool,
-        static_graph: bool,
     ): ...
-    def set_runtime_stats_and_log(self) -> None: ...
-    def set_error_and_log(self, error: str) -> None: ...
-    def _get_ddp_logging_data(self) -> DDPLoggingData: ...
-    def _set_comm_hook_name(self, comm_hook: str) -> None: ...
-    def _set_uneven_input_join(self) -> None: ...
-    def _set_static_graph(self) -> None: ...
+    ...
 
 def get_debug_level(): ...
 def set_debug_level(): ...
@@ -148,9 +118,7 @@ class Store:
     def set(self, key: str, value: str): ...
     def get(self, key: str) -> bytes: ...
     def add(self, key: str, value: int) -> int: ...
-    def compare_set(
-        self, key: str, expected_value: str, desired_value: str
-    ) -> bytes: ...
+    def compare_set(self, key: str, expected_value: str, desired_value: str) -> bytes: ...
     def delete_key(self, key: str) -> bool: ...
     def num_keys(self) -> int: ...
     def set_timeout(self, timeout: timedelta): ...
@@ -174,7 +142,7 @@ class TCPStore(Store):
         is_master: bool = ...,
         timeout: timedelta = ...,
         wait_for_workers: bool = ...,
-        multi_tenant: bool = ...,
+        multi_tenant: bool = ...
     ): ...
     @property
     def host(self) -> str: ...
@@ -199,7 +167,6 @@ class Work:
 
 class ProcessGroup:
     class Options: ...
-
     def __init__(self): ...
     def rank(self) -> int: ...
     def size(self) -> int: ...
@@ -268,7 +235,7 @@ class ProcessGroup:
         self,
         output: Tensor,
         input: Tensor,
-        opts=AllGatherOptions(),
+        opts = AllGatherOptions(),
     ) -> Work: ...
     def allgather_coalesced(
         self,
@@ -376,7 +343,6 @@ def _round_robin_process_groups(
 class ProcessGroupGloo(ProcessGroup):
     class Device: ...
     class Options: ...
-
     def __init__(
         self,
         store: Store,
@@ -392,12 +358,16 @@ class ProcessGroupGloo(ProcessGroup):
     ...
 
 class _ProcessGroupWrapper(ProcessGroup):
-    def __init__(self, pg: ProcessGroup, gloo_pg: ProcessGroupGloo): ...
+    def __init__(
+        self,
+        pg: ProcessGroup,
+        gloo_pg: ProcessGroupGloo
+    ): ...
     wrapped_pg: ProcessGroup
 
+
 class ProcessGroupNCCL(ProcessGroup):
     class Options: ...
-
     def __init__(
         self,
         store: Store,
@@ -432,9 +402,9 @@ class ProcessGroupMPI(ProcessGroup):
 
 def _compute_bucket_assignment_by_size(
     tensors: List[Tensor],
-    bucket_size_limits: List[int],
-    expect_sparse_gradient: List[bool] = [],
-    tensor_indices: List[int] = [],
+    bucket_size: int,
+    expect_sparse_gradient: List[bool],
+    tensor_indices: List[int],
 ) -> Tuple[List[List[int]], List[int]]: ...
 def _broadcast_coalesced(
     process_group: ProcessGroup,
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index 514b89aad28d6..23625d9d20014 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -1,7 +1,7 @@
 import sys
 import copy
 from dataclasses import dataclass
-from typing import Any, Callable, Optional, Type
+from typing import Callable, Any, Type
 from enum import Enum, auto
 import inspect
 import itertools
@@ -37,7 +37,7 @@
 
 from ..modules import Module
 from ._replicated_tensor_ddp_utils import _ddp_with_replicated_tensor_enabled
-from .scatter_gather import gather, scatter_kwargs  # noqa: F401
+from .scatter_gather import gather, is_namedtuple, scatter_kwargs  # noqa: F401
 
 __all__ = ["DistributedDataParallel"]
 
@@ -194,7 +194,6 @@ def __init__(self, ddp, divide_by_initial_world_size):
             "DDP join hook requires passing in a DistributedDataParallel "
             "instance as the state"
         )
-        assert ddp.logger is not None
         ddp.logger._set_uneven_input_join()
         self.ddp = ddp
         self.ddp._divide_by_initial_world_size = divide_by_initial_world_size
@@ -556,7 +555,7 @@ def __init__(
 
         super(DistributedDataParallel, self).__init__()
         Joinable.__init__(self)
-        self.logger: Optional[dist.Logger] = None
+        self.logger = None
         if not any((p.requires_grad for p in module.parameters())):
             self._log_and_throw(
                 RuntimeError,
@@ -837,7 +836,6 @@ def __setstate__(self, state):
         )
         if self.static_graph:
             self.reducer._set_static_graph()
-            assert self.logger is not None
             self.logger._set_static_graph()
 
     def _build_params_for_reducer(self):
@@ -865,7 +863,7 @@ def _build_params_for_reducer(self):
             # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
             (m, p)
             for m, p in modules_and_parameters
-            if p not in memo and not memo.add(p)  # type: ignore[func-returns-value]
+            if p not in memo and not memo.add(p)
         ]
 
         # Build list of parameters.
@@ -1046,7 +1044,7 @@ def _run_ddp_forward(self, *inputs, **kwargs):
                 self.use_side_stream_for_tensor_copies,
             )
             with self._inside_ddp_forward():
-                return module_to_run(*inputs[0], **kwargs[0])  # type: ignore[index]
+                return module_to_run(*inputs[0], **kwargs[0])
         else:
             with self._inside_ddp_forward():
                 return module_to_run(*inputs, **kwargs)
@@ -1056,7 +1054,6 @@ def forward(self, *inputs, **kwargs):
             "DistributedDataParallel.forward"
         ):
             if torch.is_grad_enabled() and self.require_backward_grad_sync:
-                assert self.logger is not None
                 self.logger.set_runtime_stats_and_log()
                 self.num_iterations += 1
                 self.reducer.prepare_for_forward()
@@ -1066,7 +1063,7 @@ def forward(self, *inputs, **kwargs):
             work = Join.notify_join_context(self)
             if work:
                 self.reducer._set_forward_pass_work_handle(
-                    work, self._divide_by_initial_world_size  # type: ignore[arg-type]
+                    work, self._divide_by_initial_world_size
                 )
 
             # Calling _rebuild_buckets before forward compuation,
@@ -1174,7 +1171,7 @@ def gather(self, outputs, output_device):
     def train(self, mode=True):
         super(DistributedDataParallel, self).train(mode)
         if self._use_replicated_tensor_module:
-            self._replicated_tensor_module.train(mode)  # type: ignore[union-attr]
+            self._replicated_tensor_module.train(mode)
         return self
 
     # When running in join mode, schedules an allreduce to notify joined ranks
@@ -1395,7 +1392,7 @@ def join_process_group(self):
     def _register_buffer_comm_hook(
         self,
         state,
-        hook: Callable,
+        hook: callable,
         comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
     ):
         r"""
@@ -1441,7 +1438,7 @@ def _register_buffer_comm_hook(
             buffer_comm_hook_location=comm_hook_location,
         )
 
-    def register_comm_hook(self, state: object, hook: Callable):
+    def register_comm_hook(self, state: object, hook: callable):
         r"""
         Registers a communication hook which is an enhancement that provides a
         flexible hook to users where they can specify how DDP aggregates gradients
@@ -1521,7 +1518,6 @@ def register_comm_hook(self, state: object, hook: Callable):
             >>> ddp.register_comm_hook(state=None, hook=encode_and_decode)
         """
         self._check_comm_hook(hook)
-        assert self.logger is not None
         self.logger._set_comm_hook_name(hook.__qualname__)
         dist._register_comm_hook(self.reducer, state, hook)
 
@@ -1548,7 +1544,6 @@ def _register_builtin_comm_hook(self, comm_hook_type):
             >>> ddp._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS)
 
         """
-        assert self.logger is not None
         self.logger._set_comm_hook_name(str(comm_hook_type))
         dist._register_builtin_comm_hook(self.reducer, comm_hook_type)
 
@@ -1813,7 +1808,6 @@ def _get_ddp_logging_data(self):
         these metrics are.
         This is a prototype interface and subject to change in the future.
         """
-        assert self.logger is not None
         ddp_logging_data = self.logger._get_ddp_logging_data()
         return {**ddp_logging_data.strs_map, **ddp_logging_data.ints_map}
 
@@ -1848,7 +1842,6 @@ def _set_static_graph(self):
             return
         self.static_graph = True
         self.reducer._set_static_graph()
-        assert self.logger is not None
         self.logger._set_static_graph()
         if self.find_unused_parameters:
             warnings.warn(
diff --git a/torch/nn/parallel/distributed.pyi b/torch/nn/parallel/distributed.pyi
new file mode 100644
index 0000000000000..a75713afb8282
--- /dev/null
+++ b/torch/nn/parallel/distributed.pyi
@@ -0,0 +1,21 @@
+from ..modules import Module
+from typing import Any, Optional
+from .common_types import _devices_t, _device_t
+
+
+class DistributedDataParallel(Module):
+    process_group: Any = ...
+    dim: int = ...
+    module: Module = ...
+    device_ids: _devices_t = ...
+    output_device: _device_t = ...
+    broadcast_buffers: bool = ...
+    check_reduction: bool = ...
+    broadcast_bucket_size: float = ...
+    bucket_bytes_cap: float = ...
+
+    # TODO type process_group once `distributed` module is stubbed
+    def __init__(self, module: Module, device_ids: Optional[_devices_t] = ...,
+                 output_device: Optional[_device_t] = ..., dim: int = ...,
+                 broadcast_buffers: bool = ..., process_group: Optional[Any] = ..., bucket_cap_mb: float = ...,
+                 find_unused_parameters: bool = ..., check_reduction: bool = ...) -> None: ...

From 337c9fae955961635fc621fc5da10b639ea0ece2 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Fri, 28 Oct 2022 02:41:12 +0000
Subject: [PATCH 0256/1922] [dynamo][benchmarks] Prepone Cold start setup
 (#87913)

Parallel compilation warms the Threadpool when we call `torch._dynamo.optimize()`. In current benchmarks, we were setting up the TRITON_CACHE_DIR much later. Because of this parallel compilation artifacts were not used and compilation latency improvements were not visible in dashboard. This PR just prepones the setup of TRITON_CACHE_DIR.

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87913
Approved by: https://github.com/wconstab
---
 benchmarks/dynamo/common.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 88de22f326cfe..70d533c61b82c 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -757,19 +757,19 @@ def scale(self, loss):
         return loss
 
 
-def maybe_fresh_cache(fn):
-    def inner(self, *args, **kwargs):
+def maybe_fresh_cache(fn, is_cold_start):
+    def inner(*args, **kwargs):
         cache_minder = NullContext()
-        if self.args.cold_start_latency:
+        if is_cold_start:
             cache_entries = {}
             cache_minder = fresh_triton_cache(cache_entries)
 
         try:
             with cache_minder:
-                return fn(self, *args, **kwargs)
+                return fn(*args, **kwargs)
         finally:
             dump_cache = False
-            if dump_cache and self.args.cold_start_latency:
+            if dump_cache and is_cold_start:
                 output_csv(
                     output_filename[:-4] + "_triton_cache.csv",
                     ["dev", "name", "batch_size", "triton_cache"],
@@ -1190,7 +1190,6 @@ def compare_branches(
                 "--diff_main called on main branch, what are you diffing?"
             )
 
-    @maybe_fresh_cache
     def run_one_model(
         self,
         name,
@@ -1470,11 +1469,15 @@ def parse_args():
 
 def main(runner, original_dir=None):
     args = parse_args()
+    return maybe_fresh_cache(run, args.cold_start_latency and args.only)(
+        runner, args, original_dir
+    )
+
 
+def run(runner, args, original_dir=None):
     # Pass the parsed args object to benchmark runner object
     runner.args = args
 
-    # defaults
     args.filter = args.filter or [r"."]
     args.exclude = args.exclude or [r"^$"]
 

From e6d228c7ab6c840a53d85cfb818b745ceab46385 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 28 Oct 2022 03:00:09 +0000
Subject: [PATCH 0257/1922] Revert "Many symintifications (#87604)"

This reverts commit 777e6a2c5100f3274cff1bcf7e47ccbe1a651927.

Reverted https://github.com/pytorch/pytorch/pull/87604 on behalf of https://github.com/weiwangmeta due to breaking internal builds
---
 aten/src/ATen/ExpandUtils.h                   |  5 +-
 aten/src/ATen/core/TensorBase.h               | 10 ----
 .../ATen/functorch/BatchRulesConvolution.cpp  | 56 +++++++++----------
 .../functorch/BatchRulesDecompositions.cpp    |  2 +-
 .../ATen/native/AdaptiveAveragePooling.cpp    |  6 +-
 aten/src/ATen/native/Convolution.cpp          | 20 +++----
 aten/src/ATen/native/EmbeddingBag.cpp         |  2 +-
 aten/src/ATen/native/GridSamplerUtils.h       |  2 +-
 aten/src/ATen/native/IndexingUtils.cpp        | 12 ++--
 aten/src/ATen/native/LossNLL.cpp              |  2 +-
 aten/src/ATen/native/LossNLL2d.cpp            |  2 +-
 aten/src/ATen/native/NonSymbolicBC.h          |  1 -
 aten/src/ATen/native/Pool.h                   | 18 ++----
 aten/src/ATen/native/TensorProperties.cpp     |  2 +-
 aten/src/ATen/native/TensorShape.cpp          | 28 +++-------
 aten/src/ATen/native/group_norm.cpp           | 24 ++++----
 aten/src/ATen/native/native_functions.yaml    | 54 +++++++++---------
 test/functorch/test_aotdispatch.py            |  3 +
 test/test_proxy_tensor.py                     |  4 ++
 tools/autograd/derivatives.yaml               | 44 +++++++--------
 tools/jit/gen_unboxing.py                     |  4 +-
 torch/csrc/StorageMethods.cpp                 |  2 +-
 torch/csrc/autograd/FunctionsManual.cpp       | 16 +++---
 torch/csrc/autograd/FunctionsManual.h         |  8 +--
 torch/storage.py                              |  4 +-
 torchgen/api/cpp.py                           |  8 +--
 torchgen/api/native.py                        |  2 +-
 torchgen/api/python.py                        |  6 +-
 torchgen/gen.py                               |  4 +-
 29 files changed, 154 insertions(+), 197 deletions(-)

diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h
index 786cbf132cd77..779894645b8ec 100644
--- a/aten/src/ATen/ExpandUtils.h
+++ b/aten/src/ATen/ExpandUtils.h
@@ -94,11 +94,10 @@ inline void check_defined(
 inline c10::MaybeOwned<Tensor> expand_inplace(
     const Tensor& tensor,
     const Tensor& to_expand) {
-  if (tensor.sym_sizes().equals(to_expand.sym_sizes())) {
+  if (tensor.sizes().equals(to_expand.sizes())) {
     return c10::MaybeOwned<Tensor>::borrowed(to_expand);
   }
-  return c10::MaybeOwned<Tensor>::owned(
-      to_expand.expand_symint(tensor.sym_sizes()));
+  return c10::MaybeOwned<Tensor>::owned(to_expand.expand(tensor.sizes()));
 }
 
 inline c10::MaybeOwned<Tensor> expand_inplace(
diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h
index 0ecd4456033b0..08a14f2e09580 100644
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@@ -955,21 +955,11 @@ c10::SymIntArrayRef sizes(const TensorBase& t) { return t.sym_sizes(); }
 template <typename T, typename = enable_if_int<T>>
 IntArrayRef sizes(const TensorBase& t) { return t.sizes(); }
 
-template <typename T, typename = enable_if_symint<T>>
-c10::SymInt size(const TensorBase& t, int64_t dim) { return t.sym_size(dim); }
-template <typename T, typename = enable_if_int<T>>
-int64_t size(const TensorBase& t, int64_t dim) { return t.size(dim); }
-
 template <typename T, typename = enable_if_symint<T>>
 c10::SymIntArrayRef strides(const TensorBase& t) { return t.sym_strides(); }
 template <typename T, typename = enable_if_int<T>>
 IntArrayRef strides(const TensorBase& t) { return t.strides(); }
 
-template <typename T, typename = enable_if_symint<T>>
-c10::SymInt numel(const TensorBase& t) { return t.sym_numel(); }
-template <typename T, typename = enable_if_int<T>>
-int64_t numel(const TensorBase& t) { return t.numel(); }
-
 } // namespace symint
 
 } // namespace at
diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp
index 79523ed1fb6d9..0640af3a1b533 100644
--- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp
+++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp
@@ -17,7 +17,7 @@ namespace at { namespace functorch {
 // we do not support batch_group_count (which is needed for convolution backwards).
 // Instead, there's a convolution_backward op that needs a batching rule.
 std::tuple<Tensor,optional<int64_t>>
-convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tensor& rhs, optional<int64_t> rhs_bdim, const optional<Tensor>& bias, optional<int64_t> bias_bdim, IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, int64_t groups) {
+convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tensor& rhs, optional<int64_t> rhs_bdim, const optional<Tensor>& bias, optional<int64_t> bias_bdim, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) {
   DimVector lhs_spec(stride.size() + 2);
   std::iota(lhs_spec.begin(), lhs_spec.end(), 0);
   DimVector rhs_spec = lhs_spec;
@@ -42,13 +42,13 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
   std::tuple<Tensor, optional<int64_t>> result;
   if (lhs_bdim && !rhs_bdim) {
     auto new_x = reshape_dim_into(*lhs_bdim, lhs_spec[0], lhs);
-    auto out = at::convolution_symint(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+    auto out = at::convolution(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
     out = reshape_dim_outof(out_spec[0], lhs.sizes()[*lhs_bdim], out);
     result = std::make_tuple(out, out_spec[0]);
   } else if (!lhs_bdim && rhs_bdim) {
     if (groups == 1) {
       auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[0], rhs);
-      auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+      auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
       out = reshape_dim_outof(out_spec[1], rhs.size(*rhs_bdim), out);
       result = std::make_tuple(out, out_spec[1]);
     } else {
@@ -62,7 +62,7 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
         // BIOHW -> I(BO)HW
         auto new_w = reshape_dim_into(*rhs_bdim, 1, rhs);
         // NIHW, I(BO)HW -> N(GBO)HW
-        auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+        auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
         // N(GBO)HW -> NG(BO)HW
         out = reshape_dim_outof(1, groups, out);
         // NG(BO)HW -> NGBOHW
@@ -84,7 +84,7 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
         // G(BO)IHW -> (GBO)IHW
         new_w = reshape_dim_into(0, 0, new_w);
         // N(GI)HW, (GBO)IHW -> N(GBO)HW
-        auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+        auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
         // N(GBO)HW -> NG(BO)HW
         out = reshape_dim_outof(1, groups, out);
         // NG(BO)HW -> NGBOHW
@@ -99,11 +99,11 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
     groups *= lhs.sizes()[*lhs_bdim];
     auto dim_with_groups = transposed ? 1 : 0;
     auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[dim_with_groups], rhs);
-    auto out = at::convolution_symint(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+    auto out = at::convolution(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
     out = reshape_dim_outof(out_spec[1], lhs.sizes()[*lhs_bdim], out);
     result = std::make_tuple(out, out_spec[1]);
   } else {
-    result = std::make_tuple(at::convolution_symint(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt);
+    result = std::make_tuple(at::convolution(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt);
   }
   if (separate_bias) {
     auto A = std::get<0>(result);
@@ -244,8 +244,8 @@ convolution_backward_input_batch_rule(
     const Tensor& grad_output, optional<int64_t> grad_output_bdim,
     const Tensor& input, optional<int64_t> input_bdim,
     const Tensor& weight, optional<int64_t> weight_bdim,
-    IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
-    c10::SymIntArrayRef output_padding, int64_t groups) {
+    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed,
+    IntArrayRef output_padding, int64_t groups) {
   const std::array<bool, 3> mask = {true, false, false};
   if (grad_output_bdim && weight_bdim) {
     // regular: BNO, BOI -> N(BO), (BO)I -> N(BI)
@@ -254,7 +254,7 @@ convolution_backward_input_batch_rule(
     const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
     const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight);
     auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-    const auto result = at::convolution_backward_symint(
+    const auto result = at::convolution_backward(
         grad_output_, dummy_input, weight_, nullopt, stride, padding,
         dilation, transposed, output_padding, groups * batch_size, mask);
     const auto grad_input = reshape_dim_outof(1, batch_size, std::get<0>(result));
@@ -265,7 +265,7 @@ convolution_backward_input_batch_rule(
     const auto batch_size = grad_output.size(*grad_output_bdim);
     const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 0, grad_output);
     auto dummy_input = make_dummy(input, input_bdim, 0, batch_size);
-    const auto result = at::convolution_backward_symint(
+    const auto result = at::convolution_backward(
         grad_output_, dummy_input, weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
     const auto grad_input = reshape_dim_outof(0, batch_size, std::get<0>(result));
@@ -278,7 +278,7 @@ convolution_backward_input_batch_rule(
       const auto in_ch_dim = transposed ? 0 : 1;
       const auto weight_ = reshape_dim_into(*weight_bdim, in_ch_dim, weight);
       auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-      const auto result = at::convolution_backward_symint(
+      const auto result = at::convolution_backward(
           grad_output, dummy_input, weight_, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       const auto grad_input = reshape_dim_outof(1, batch_size, std::get<0>(result));
@@ -289,7 +289,7 @@ convolution_backward_input_batch_rule(
       // N(GO), B(GO)I -> N(GO), (GO)(BI) -> N(GBI)
       const auto weight_ = reshape_dim_into(*weight_bdim, 1, weight);
       auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-      const auto result = at::convolution_backward_symint(
+      const auto result = at::convolution_backward(
           grad_output, dummy_input, weight_, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       grad_input = std::get<0>(result); // N(GBI)
@@ -300,7 +300,7 @@ convolution_backward_input_batch_rule(
       weight_ = weight_.transpose(0, 1);                       // GBIO
       weight_ = weight_.flatten(0, 2);                         // (GBI)O
       const auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-      const auto result = at::convolution_backward_symint(
+      const auto result = at::convolution_backward(
           grad_output, dummy_input, weight_, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       grad_input = std::get<0>(result); // N(GBI)
@@ -314,7 +314,7 @@ convolution_backward_input_batch_rule(
   } else {
     TORCH_INTERNAL_ASSERT(input_bdim);
     const auto dummy_input = make_dummy(input, input_bdim, 0, 1);
-    const auto result = at::convolution_backward_symint(
+    const auto result = at::convolution_backward(
         grad_output, dummy_input, weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
     return std::make_tuple(std::get<0>(result), nullopt);
@@ -325,8 +325,8 @@ convolution_backward_weight_batch_rule(
     const Tensor& grad_output, optional<int64_t> grad_output_bdim,
     const Tensor& input, optional<int64_t> input_bdim,
     const Tensor& weight, optional<int64_t> weight_bdim,
-    IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
-    c10::SymIntArrayRef output_padding, int64_t groups) {
+    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed,
+    IntArrayRef output_padding, int64_t groups) {
   const std::array<bool, 3> mask = {false, true, false};
   if (grad_output_bdim && input_bdim) {
     // BNO, BNI -> N(BO), N(BI) -> (BO)I (regular) (BI)O (transposed)
@@ -334,7 +334,7 @@ convolution_backward_weight_batch_rule(
     const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
     const auto input_ = reshape_dim_into(*input_bdim, 1, input);
     const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-    const auto result = at::convolution_backward_symint(
+    const auto result = at::convolution_backward(
         grad_output_, input_, dummy_weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups * batch_size, mask);
     auto grad_weight = std::get<1>(result);
@@ -348,7 +348,7 @@ convolution_backward_weight_batch_rule(
       const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
       const auto out_ch_dim = transposed ? 1 : 0;
       const auto dummy_weight = make_dummy(weight, weight_bdim, out_ch_dim, batch_size);
-      const auto result = at::convolution_backward_symint(
+      const auto result = at::convolution_backward(
           grad_output_, input, dummy_weight, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       auto grad_weight = std::get<1>(result);
@@ -362,7 +362,7 @@ convolution_backward_weight_batch_rule(
       if (!transposed) {
         // BN(GO), N(GI) -> N(GBO), N(GI) -> (GBO)I
         const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-        const auto result = at::convolution_backward_symint(
+        const auto result = at::convolution_backward(
             grad_output_, input, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -373,7 +373,7 @@ convolution_backward_weight_batch_rule(
       } else {
         // BN(GO), N(GI) -> N(GBO), N(GI) -> (GI)(BO)
         const auto dummy_weight = make_dummy(weight, weight_bdim, 1, batch_size);
-        const auto result = at::convolution_backward_symint(
+        const auto result = at::convolution_backward(
             grad_output_, input, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -389,7 +389,7 @@ convolution_backward_weight_batch_rule(
       const auto input_ = reshape_dim_into(*input_bdim, 1, input);
       const auto in_ch_dim = transposed ? 0 : 1;
       const auto dummy_weight = make_dummy(weight, weight_bdim, in_ch_dim, batch_size);
-      const auto result = at::convolution_backward_symint(
+      const auto result = at::convolution_backward(
           grad_output, input_, dummy_weight, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       auto grad_weight = std::get<1>(result);
@@ -403,7 +403,7 @@ convolution_backward_weight_batch_rule(
       if (!transposed) {
         // regular: N(GO), BN(GI) -> N(GO), N(GBI) -> (GO)(BI)
         const auto dummy_weight = make_dummy(weight, weight_bdim, 1, batch_size);
-        const auto result = at::convolution_backward_symint(
+        const auto result = at::convolution_backward(
             grad_output, input_, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -412,7 +412,7 @@ convolution_backward_weight_batch_rule(
       } else {
         // transposed: N(GO), BN(GI) -> N(GO), N(GBI) -> (GBI)O
         const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-        const auto result = at::convolution_backward_symint(
+        const auto result = at::convolution_backward(
             grad_output, input_, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -425,7 +425,7 @@ convolution_backward_weight_batch_rule(
   } else {
     TORCH_INTERNAL_ASSERT(weight_bdim);
     const auto dummy_weight = make_dummy(weight, weight_bdim, 0, 1);
-    const auto result = at::convolution_backward_symint(
+    const auto result = at::convolution_backward(
         grad_output, input, dummy_weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
     return std::make_tuple(std::get<1>(result), nullopt);
@@ -436,8 +436,8 @@ convolution_backward_weight_batch_rule(
 std::tuple<Tensor,Tensor,Tensor> convolution_backward_plumbing(
     const Tensor& grad_output_, const Tensor& input_, const Tensor& weight_,
     const c10::OptionalArrayRef<SymInt> bias_sizes_opt,
-    IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
-    c10::SymIntArrayRef output_padding, int64_t groups, std::array<bool, 3> output_mask) {
+    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed,
+    IntArrayRef output_padding, int64_t groups, std::array<bool, 3> output_mask) {
   const auto maybe_layer = maybeCurrentDynamicLayer();
   TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
   int64_t cur_level = maybe_layer->layerId();
@@ -487,7 +487,7 @@ std::tuple<Tensor,Tensor,Tensor> convolution_backward_plumbing(
     const auto batch_size = weight.size(*weight_bdim);
     input = reshape_dim_into(*input_bdim, 1, input);
     weight = reshape_dim_into(*weight_bdim, 0, weight);
-    const auto result = at::convolution_backward_symint(
+    const auto result = at::convolution_backward(
         grad_output, input, weight, nullopt, stride, padding, dilation,
         transposed, output_padding, batch_size * groups, output_mask);
     // N(BI), (BO)I -> NBI, BOI
diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index 24a1c4ab507a0..f1108bac25a0a 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -242,7 +242,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE2(where, ScalarSelf);
   OP_DECOMPOSE(orgqr);
   OP_DECOMPOSE2(unflatten, int);
-  m.impl("_convolution_double_backward", native::_convolution_double_backward);
+  OP_DECOMPOSE(_convolution_double_backward);
   OP_DECOMPOSE(conv_transpose1d);
   OP_DECOMPOSE2(conv_transpose2d, input);
   OP_DECOMPOSE2(conv_transpose3d, input);
diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp
index b612ef009b651..40b05d74053ca 100644
--- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp
+++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp
@@ -130,9 +130,9 @@ namespace {
       Tensor out = input.mean({-1, -2}, /* keepdim = */ true);
       if (input.suggest_memory_format() == at::MemoryFormat::ChannelsLast) {
         // assert ndim == 4, since ndim = 3 doesn't give channels_last
-        const auto n = input.sym_size(0);
-        const auto c = input.sym_size(1);
-        out.as_strided__symint({n, c, 1, 1}, {c, 1, c, c});
+        const int n = input.size(0);
+        const int c = input.size(1);
+        out.as_strided_({n, c, 1, 1}, {c, 1, c, c});
       }
       return out;
     } else {
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 2dd7d515c14f9..36ea8ee1870d7 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -910,8 +910,8 @@ static Tensor convolution_same(
   auto k = weight.dim();
   TORCH_CHECK(k > 2, "weight should have at least three dimensions");
   auto dim = static_cast<size_t>(k - 2);
-  auto weight_sizes = weight.sym_sizes();
-  auto input_sizes = input.sym_sizes();
+  auto weight_sizes = weight.sizes();
+  auto input_sizes = input.sizes();
   TORCH_CHECK(k == input.dim(),
               "Expected ", k, "-dimensional input for ",
               k, "-dimensional weight", weight_sizes, ", but got ",
@@ -926,7 +926,7 @@ static Tensor convolution_same(
   }
 
   // Calculate the correct padding
-  SymDimVector padding_l, padding_r;
+  DimVector padding_l, padding_r;
   bool symmetric_padding = true;
   for (auto i: c10::irange(dim)) {
     auto s = stride.size() == 1 ? stride[0] : stride[i];
@@ -942,14 +942,14 @@ static Tensor convolution_same(
 
   if (symmetric_padding) {
     // All backends handle symmetric padding natively
-    SymDimVector output_padding(static_cast<size_t>(dim));
-    return at::convolution_symint(input, weight, bias, stride, padding_l, dilation,
+    DimVector output_padding(static_cast<size_t>(dim));
+    return at::convolution(input, weight, bias, stride, padding_l, dilation,
                                false, output_padding, groups);
   }
 
   TORCH_WARN_ONCE("Using padding='same' with even kernel lengths and odd dilation may"
                   " require a zero-padded copy of the input be created");
-  SmallVector<c10::SymInt, kDimVectorStaticSize * 2> pad_nd(static_cast<size_t>(2 * dim));
+  SmallVector<int64_t, kDimVectorStaticSize * 2> pad_nd(static_cast<size_t>(2 * dim));
   for (auto i: c10::irange(dim)) {
     // Apply padding by the difference, leaving only a symmetric padding
     auto delta_pad = padding_r[i] - padding_l[i];
@@ -961,10 +961,10 @@ static Tensor convolution_same(
       padding_l[i] = padding_r[i];
     }
   }
-  auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0);
-  SymDimVector output_padding(static_cast<size_t>(dim));
-  return at::convolution_symint(padded_input, weight, bias, stride, padding_l,
-                                dilation, false, output_padding, groups);
+  auto padded_input = at::constant_pad_nd(input, pad_nd, 0);
+  DimVector output_padding(static_cast<size_t>(dim));
+  return at::convolution(padded_input, weight, bias, stride, padding_l,
+                         dilation, false, output_padding, groups);
 }
 
 Tensor _convolution_mode(
diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp
index 21404947b3dbb..7d4a89d6b40f7 100644
--- a/aten/src/ATen/native/EmbeddingBag.cpp
+++ b/aten/src/ATen/native/EmbeddingBag.cpp
@@ -1307,7 +1307,7 @@ Tensor _embedding_bag_backward_symint(const Tensor &grad, const Tensor &indices_
   checkContiguous("embedding_bag", offsets_arg);
 
   Tensor offset2bag_;
-  if (indices.sym_numel() != 0 && offset2bag.sym_numel() == 0) {
+  if (indices.numel() != 0 && offset2bag.numel() == 0) {
     offset2bag_ = offsets.new_zeros(
       {indices.size(0) + 1}, offsets.options()); // offset2bag = [0 0 0 0 0]
 
diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h
index 7c22fedfe94e2..0b6f29de8c427 100644
--- a/aten/src/ATen/native/GridSamplerUtils.h
+++ b/aten/src/ATen/native/GridSamplerUtils.h
@@ -101,7 +101,7 @@ bool cond_cudnn_grid_sampler(
     at::native::canUse32BitIndexMath(input) &&
     at::native::canUse32BitIndexMath(grid) &&
     input.dim() == 4 &&
-    input.sym_size(1) <= 1024);
+    input.size(1) <= 1024);
 }
 
 } // anonymous namespace
diff --git a/aten/src/ATen/native/IndexingUtils.cpp b/aten/src/ATen/native/IndexingUtils.cpp
index 2dba1972ce574..c5f5ff6fbcc07 100644
--- a/aten/src/ATen/native/IndexingUtils.cpp
+++ b/aten/src/ATen/native/IndexingUtils.cpp
@@ -4,7 +4,7 @@
 namespace at { namespace native {
 
 bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
-  auto elements = t.sym_numel();
+  int64_t elements = t.numel();
   if (elements >= max_elem) {
     return false;
   }
@@ -12,16 +12,16 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
     return max_elem > 0;
   }
 
-  c10::SymInt offset = 0;
-  auto linearId = elements - 1;
+  int64_t offset = 0;
+  int64_t linearId = elements - 1;
 
   // NOTE: Assumes all strides are positive, which is true for now
   // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   for (int i = t.dim() - 1; i >= 0; --i) {
-    auto curDimIndex = linearId % t.sym_size(i);
-    auto curDimOffset = curDimIndex * t.sym_stride(i);
+    int64_t curDimIndex = linearId % t.size(i);
+    int64_t curDimOffset = curDimIndex * t.stride(i);
     offset += curDimOffset;
-    linearId /= t.sym_size(i);
+    linearId /= t.size(i);
   }
 
   if (offset >= max_elem) {
diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp
index 28fc60508ab10..8e5864b68728d 100644
--- a/aten/src/ATen/native/LossNLL.cpp
+++ b/aten/src/ATen/native/LossNLL.cpp
@@ -656,7 +656,7 @@ Tensor nll_loss(const Tensor & self, const Tensor & target, const c10::optional<
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
 
-  return std::get<0>(at::nll_loss_forward_symint(self, target, weight, reduction, ignore_index));
+  return std::get<0>(at::nll_loss_forward(self, target, weight, reduction, ignore_index));
 }
 
 Tensor nll_loss_nd_symint(
diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp
index aee22ce3edeb5..ab7c084eb80df 100644
--- a/aten/src/ATen/native/LossNLL2d.cpp
+++ b/aten/src/ATen/native/LossNLL2d.cpp
@@ -498,7 +498,7 @@ Tensor nll_loss2d(const Tensor & self, const Tensor & target, const c10::optiona
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
 
-  return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight, reduction, ignore_index));
+  return std::get<0>(at::nll_loss2d_forward(self, target, weight, reduction, ignore_index));
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/NonSymbolicBC.h b/aten/src/ATen/native/NonSymbolicBC.h
index f57c868f345f1..e7d31ae3fa020 100644
--- a/aten/src/ATen/native/NonSymbolicBC.h
+++ b/aten/src/ATen/native/NonSymbolicBC.h
@@ -22,5 +22,4 @@ TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, con
 TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim);
 TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes);
 TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index);
-TORCH_API std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim);
 }}
diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h
index 0ff4490086b7e..cf5b45b365d05 100644
--- a/aten/src/ATen/native/Pool.h
+++ b/aten/src/ATen/native/Pool.h
@@ -67,18 +67,17 @@ static inline T pooling_output_shape(
         inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
 }
 
-template <typename T>
-std::pair<T, T> _pooling_same_mode_padding_lr(
-    T inputSize, T kernelSize, int64_t stride, int64_t dilation) {
+inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
+    int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) {
   // NOTE: with strides, the output shape is ceil(inputSize/stride)
-  auto total_padding = T(dilation) * (kernelSize - 1);
+  auto total_padding = dilation * (kernelSize - 1);
 
   // Prefer symmetric padding if possible
   if (stride > 2 && (total_padding % 2 == 1)) {
     // The floor in the output size calculation gives us a little wiggle room
     auto wiggle_room = inputSize % stride - 1;
     if (wiggle_room > 0) {
-      total_padding = total_padding - 1;
+      --total_padding;
     }
   }
 
@@ -86,15 +85,6 @@ std::pair<T, T> _pooling_same_mode_padding_lr(
   return {left, total_padding - left};
 }
 
-inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
-    int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) {
-  return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation);
-}
-
-inline std::pair<c10::SymInt, c10::SymInt> pooling_same_mode_padding_lr(
-    c10::SymInt inputSize, c10::SymInt kernelSize, int64_t stride, int64_t dilation) {
-  return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation);
-}
 
 // AveragePool2d/DilatedMaxPool2d (forward)
 static inline void
diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp
index e37dbf56cc81a..6a703cbe07f90 100644
--- a/aten/src/ATen/native/TensorProperties.cpp
+++ b/aten/src/ATen/native/TensorProperties.cpp
@@ -69,7 +69,7 @@ bool cudnn_is_acceptable(const TensorBase& self) {
   // tensors. Maybe some cuDNN functions actually support empty tensors, but
   // native/THNN kernels shouldn't be much slower because the output is also
   // likely empty.
-  if (self.sym_numel() == 0) return false;
+  if (self.numel() == 0) return false;
   // NB: In the old Python code, there was also a test to see if the
   // cuDNN library was actually dynamically linked or not.  I'm not
   // sure if we can actually test this.
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 2051cda371b97..d25113577b2d5 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -917,12 +917,9 @@ std::vector<Tensor> chunk(const Tensor& self, int64_t chunks, int64_t dim) {
   }
 }
 
-std::vector<Tensor> tensor_split_sections_symint(const Tensor& self, c10::SymInt sym_sections, int64_t dim) {
+std::vector<Tensor> tensor_split(const Tensor& self, int64_t sections, int64_t dim) {
   TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
   int64_t dim_ = maybe_wrap_dim(dim, self.dim());
-  // NB: intentional, sections specifies number of output tensors, which
-  // cannot be polymorphic
-  int64_t sections = sym_sections.guard_int(__FILE__, __LINE__);
   TORCH_CHECK(sections > 0, "number of sections must be larger than 0, got ", sections);
   const auto dim_size = self.sym_size(dim_);
   std::vector<Tensor> splits(sections);
@@ -937,30 +934,21 @@ std::vector<Tensor> tensor_split_sections_symint(const Tensor& self, c10::SymInt
   return splits;
 }
 
-template <typename T>
-std::vector<Tensor> _tensor_split_indices(const Tensor& self, ArrayRef<T> indices, int64_t dim) {
+std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim) {
   TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
   int64_t dim_ = maybe_wrap_dim(dim, self.dim());
   int64_t num_indices = indices.size();
   std::vector<Tensor> splits(num_indices + 1);
-  T start_idx(0);
+  int64_t start_idx = 0;
   for (const auto split_idx : c10::irange(num_indices)) {
-    auto end_idx = indices[split_idx];
-    splits[split_idx] = at::symint::slice<T>(self, dim_, start_idx, end_idx);
+    int64_t end_idx = indices[split_idx];
+    splits[split_idx] = at::slice(self, dim_, start_idx, end_idx);
     start_idx = end_idx;
   }
-  splits[num_indices] = at::symint::slice<T>(self, dim_, start_idx, at::symint::size<T>(self, dim_));
+  splits[num_indices] = at::slice(self, dim_, start_idx, self.size(dim_));
   return splits;
 }
 
-std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim) {
-  return _tensor_split_indices(self, indices, dim);
-}
-
-std::vector<Tensor> tensor_split_indices_symint(const Tensor& self, SymIntArrayRef indices, int64_t dim) {
-  return _tensor_split_indices(self, indices, dim);
-}
-
 std::vector<Tensor> tensor_split(const Tensor& self, const Tensor& tensor_indices_or_sections, int64_t dim) {
   TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
   auto split_device = tensor_indices_or_sections.device();
@@ -1186,8 +1174,8 @@ Tensor as_strided_qtensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef
   return result;
 }
 
-const Tensor &as_strided__symint(const Tensor& self, SymIntArrayRef size, SymIntArrayRef stride, optional<c10::SymInt> storage_offset_) {
-  auto storage_offset = storage_offset_.value_or(self.sym_storage_offset());
+const Tensor &as_strided_(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional<int64_t> storage_offset_) {
+  auto storage_offset = storage_offset_.value_or(self.storage_offset());
   setStrided(self, size, stride, storage_offset);
   return self;
 }
diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp
index c12d8d2142ff9..5b38b02702828 100644
--- a/aten/src/ATen/native/group_norm.cpp
+++ b/aten/src/ATen/native/group_norm.cpp
@@ -23,15 +23,13 @@
 #include <vector>
 
 namespace at {
-
 namespace native {
 
-template <typename T>
 void check_group_norm_inputs(
     const Tensor& input,
     const Tensor& weight,
     const Tensor& bias,
-    T C,
+    int64_t C,
     int64_t num_groups) {
   TORCH_CHECK(
       num_groups > 0,
@@ -45,14 +43,14 @@ void check_group_norm_inputs(
       "num_groups=",
       num_groups);
   TORCH_CHECK(
-      !weight.defined() || (weight.dim() == 1 && at::symint::numel<T>(weight) == C),
+      !weight.defined() || (weight.dim() == 1 && weight.numel() == C),
       "Expected weight to be a vector of size equal to the number of ",
       "channels in input, but got weight of shape ",
       weight.sizes(),
       " and input of shape ",
       input.sizes());
   TORCH_CHECK(
-      !bias.defined() || (bias.dim() == 1 && at::symint::numel<T>(bias) == C),
+      !bias.defined() || (bias.dim() == 1 && bias.numel() == C),
       "Expected bias to be a vector of size equal to the number of ",
       "channels in input, but got bias of shape ",
       weight.sizes(),
@@ -173,13 +171,13 @@ Tensor group_norm(
   const Tensor& weight = *weight_maybe_owned;
   const Tensor& bias = c10::value_or_else(bias_opt, [] { return Tensor(); });
 
-  const auto N = input.sym_size(0);
-  const auto C = input.sym_size(1);
+  const int64_t N = input.size(0);
+  const int64_t C = input.size(1);
   check_group_norm_inputs(input, weight, bias, C, num_groups);
 
-  const auto input_shape = input.sym_sizes();
-  const auto HxW =
-      c10::multiply_integers(input_shape.slice(2));
+  const auto input_shape = input.sizes();
+  const int64_t HxW =
+      c10::multiply_integers(input_shape.cbegin() + 2, input_shape.cend());
 
   const Tensor kEmpty;
   auto memory_format = input.suggest_memory_format();
@@ -187,10 +185,10 @@ Tensor group_norm(
       input.contiguous(memory_format) : input.contiguous();
   const auto& gamma = weight.defined() ? weight.contiguous() : kEmpty;
   const auto& beta = bias.defined() ? bias.contiguous() : kEmpty;
-  TORCH_CHECK(!gamma.defined() || gamma.sym_numel() == C);
-  TORCH_CHECK(!beta.defined() || beta.sym_numel() == C);
+  TORCH_CHECK(!gamma.defined() || gamma.numel() == C);
+  TORCH_CHECK(!beta.defined() || beta.numel() == C);
   return std::get<0>(
-      at::native_group_norm_symint(X, gamma, beta, N, C, HxW, num_groups, eps));
+      at::native_group_norm(X, gamma, beta, N, C, HxW, num_groups, eps));
 }
 
 DEFINE_DISPATCH(GroupNormKernel);
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 0954d1c662707..c481e03d23c9d 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -815,7 +815,7 @@
   device_guard: False
   tags: inplace_view
   dispatch:
-    CompositeExplicitAutogradNonFunctional: as_strided__symint
+    CompositeExplicitAutogradNonFunctional: as_strided_
 
 - func: asin(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -1294,15 +1294,11 @@
     CompositeImplicitAutograd: chunk
     NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor
 
-- func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
+- func: tensor_split.sections(Tensor(a -> *) self, int sections, int dim=0) -> Tensor(a)[]
   variants: function, method
-  dispatch:
-    CompositeImplicitAutograd: tensor_split_sections_symint
 
-- func: tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[]
+- func: tensor_split.indices(Tensor(a -> *) self, int[] indices, int dim=0) -> Tensor(a)[]
   variants: function, method
-  dispatch:
-    CompositeImplicitAutograd: tensor_split_indices_symint
 
 - func: tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[]
   variants: function, method
@@ -1469,13 +1465,13 @@
   variants: method
   manual_cpp_binding: True
 
-- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor
+- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor
   dispatch:
     CompositeExplicitAutograd: convolution
   autogen: convolution.out
   tags: canonical
 
-- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   dispatch:
     CompositeExplicitAutograd, CUDA: convolution_backward
   autogen: convolution_backward.out
@@ -1491,7 +1487,7 @@
     CompositeExplicitAutograd: convolution_backward_overrideable
   autogen: convolution_backward_overrideable.out
 
-- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
+- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
   dispatch:
     CompositeExplicitAutograd: _convolution
   autogen: _convolution.out
@@ -1500,7 +1496,7 @@
 
 - func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor
 
-- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
 
 - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor
 
@@ -3566,7 +3562,7 @@
     MPS: mps_convolution_backward
   autogen: mps_convolution_backward.out
 
-- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor
+- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
   dispatch:
     CompositeExplicitAutograd: mkldnn_convolution
   autogen: mkldnn_convolution.out
@@ -3581,17 +3577,17 @@
     CUDA: miopen_batch_norm_backward
   autogen: miopen_batch_norm_backward.out
 
-- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   dispatch:
     CUDA: miopen_convolution
   autogen: miopen_convolution.out
 
-- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   dispatch:
     CUDA: miopen_convolution_transpose
   autogen: miopen_convolution_transpose.out
 
-- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   dispatch:
     CUDA: miopen_depthwise_convolution
   autogen: miopen_depthwise_convolution.out
@@ -3845,7 +3841,7 @@
 
 - func: _nnpack_available() -> bool
 
-- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor
+- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutograd: _nnpack_spatial_convolution
@@ -11475,24 +11471,24 @@
 # these are the same thing, but we give them different prefixes to
 # make the operational distinction clear.
 
-- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   structured: True
   dispatch:
     CPU: slow_conv_transpose2d_structured_cpu
     CUDA: slow_conv_transpose2d_structured_cuda
 
-- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor
+- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor
   python_module: nn
   structured_delegate: slow_conv_transpose2d.out
 
-- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: slow_conv_transpose3d_out_cpu
     CUDA: slow_conv_transpose3d_out_cuda
 
-- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor
+- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv_transpose3d_cpu
@@ -11529,47 +11525,47 @@
     CUDA: slow_conv2d_backward_cuda
   autogen: _slow_conv2d_backward.output_mask_out
 
-- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
+- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
   use_const_ref_for_mutable_tensors: True
   python_module: nn
   dispatch:
     CUDA: conv_depthwise2d_cuda_out
 
-- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor
+- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor
   python_module: nn
   dispatch:
     CUDA: conv_depthwise2d_cuda
 
-- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor
+- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, int[3] dilation) -> Tensor
   python_module: nn
   dispatch:
     CUDA: conv_depthwise3d_cuda
   autogen: conv_depthwise3d.out
 
-- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
 
-- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0) -> Tensor
+- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0) -> Tensor
   python_module: nn
 
-- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!)
+- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, *, Tensor(a!) output) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: slow_conv3d_forward_out_cpu
 
-- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor
+- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv3d_forward_cpu
 
-- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor
+- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv_dilated2d_cpu
     CUDA: slow_conv_dilated2d_cuda
   autogen: slow_conv_dilated2d.out
 
-- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor
+- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv_dilated3d_cpu
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 5f6e8d9bf238f..edf1df9216311 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1123,6 +1123,8 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     skip('nn.functional.batch_norm', ''),  # '0 is not tracked with proxy for <torch.fx.experimental.proxy_te..
     xfail('nn.functional.bilinear', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.binary_cross_entropy', ''),  # aten.fill_.Scalar - couldn't find symbolic meta funct...
+    xfail('nn.functional.conv1d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('nn.functional.conv2d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.cosine_embedding_loss', ''),  # Cannot call sizes() on tensor with symbolic sizes/st...
     xfail('nn.functional.cosine_similarity', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.cross_entropy', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1223,6 +1225,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('trapz', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('triangular_solve', ''),  # aten.triangular_solve.default - couldn't find symbolic meta function/de...
+    xfail('unbind', ''),  # tensor_split() received an invalid combination of arguments - got (FakeTensor, torch...
     xfail('unflatten', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('var', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('var_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 4f4265b8dc6a2..8f7656309eee4 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1254,6 +1254,8 @@ def f(a, b, c, d, e):
     xfail('nn.functional.avg_pool3d', ''),  # aten.avg_pool3d.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.bilinear', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.binary_cross_entropy', ''),  # aten.new_empty.default - couldn't find symbolic meta function/decom...
+    xfail('nn.functional.conv1d', ''),  # aten.convolution.default - couldn't find symbolic meta function/decomposition
+    xfail('nn.functional.conv2d', ''),  # aten.convolution.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.cosine_embedding_loss', ''),  # The underlying op of 'aten.stride' has no overload name '_schema'
     xfail('nn.functional.cosine_similarity', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.cross_entropy', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
@@ -1266,6 +1268,7 @@ def f(a, b, c, d, e):
     xfail('nn.functional.fractional_max_pool2d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.fractional_max_pool3d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.grid_sample', ''),  # aten.grid_sampler_2d.default - couldn't find symbolic meta function/decompos...
+    xfail('nn.functional.group_norm', ''),  # 'torch._C.SymIntNode' and 'int'
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.empty_like.default - couldn't find symbolic meta function/deco...
     xfail('nn.functional.interpolate', 'area'),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.interpolate', 'bicubic'),  # aten.upsample_bicubic2d.vec - couldn't find symbolic meta function/d...
@@ -1358,6 +1361,7 @@ def f(a, b, c, d, e):
     xfail('view_as_complex', ''),  # aten.view_as_complex.default - couldn't find symbolic meta function/decomposition
     xfail('view_as', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('vsplit', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
+    xfail('unbind', ''),  # aten.unbind.int - couldn't find symbolic meta function/decomposition
     xfail('unique_consecutive', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
     xfail('unique', ''),  # aten._unique2.default - couldn't find symbolic meta function/decomposition
 }
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 6945dae77a020..c77f63e8c8e73 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2206,19 +2206,19 @@
   indices: non_differentiable
   result: auto_linear
 
-- name: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor
+- name: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor
   input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
   result: convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups)
 
 # TorchScript serializes calls to _convolution so this entry is present until that is changed to use convolution.
 # Note that the benchmark, deterministic, cudnn_enabled, and allow_tf32 flags are queried from the global context
 # by convolution_backward instead of being passed along from the forward pass.
-- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
+- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
   input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
   result: _convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32)
 
-- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
-  grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
+- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
   result0: std::get<0>(convolution_backward_symint(grad_output_p, input_p, weight_t, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) + std::get<0>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false}))
   result1: std::get<1>(convolution_backward_symint(grad_output_p, input_t, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) + std::get<1>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false}))
   result2: convolution_backward_jvp_grad_bias(grad_output_t, result2)
@@ -2229,10 +2229,10 @@
 - name: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
   grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
 
-- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor
+- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor
+- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor
@@ -2241,20 +2241,20 @@
 - name: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
   grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, {{1, 1}}, false, {{0, 0}}, 1, grad_input_mask)
 
-- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor
+- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor
+- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, int[3] dilation) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor
+- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, /*dilation=*/ {{1, 1, 1}}, false, /*output_padding=*/ {{0, 0, 0}}, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
   self: im2col(grad, kernel_size, dilation, padding, stride)
@@ -2608,9 +2608,9 @@
 
 # nnpack
 
-- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor
+- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor
   # NNPACK does not support strided convolutions in the backwards path, which is the reason why we are using the closest available function that does here.
-  input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector<int64_t>(padding.size(), 1), false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+  input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector<int64_t>(padding.size(), 1), false, std::vector<int64_t>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 #LSTM MPS
 - name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
@@ -2641,14 +2641,14 @@
 
 # miopen
 
-- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)
   input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple<Tensor, Tensor, Tensor>()"
@@ -2667,8 +2667,8 @@
   dropout_state: non_differentiable
 
 # mkldnn
-- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector<int64_t>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
   self, weight, bias: mkldnn_linear_backward(self, grad, weight, grad_input_mask)
diff --git a/tools/jit/gen_unboxing.py b/tools/jit/gen_unboxing.py
index 79c594a9afa07..ebeaa21bc7be9 100644
--- a/tools/jit/gen_unboxing.py
+++ b/tools/jit/gen_unboxing.py
@@ -116,9 +116,7 @@ def __call__(self, f: NativeFunction) -> str:
                 # from wrapping/unwrapping TensorOptios.
                 # However, we would look to include default args for schema parsing.
                 # Default args only show up in the nonfaithful C++ API,
-                arg_default = cpp.default_expr(
-                    arg.argument.default, arg.argument.type, symint=False
-                )
+                arg_default = cpp.default_expr(arg.argument.default, arg.argument.type)
                 if arg_default.startswith("{"):
                     arg_cpp = f"c10::IntArrayRef({arg_default})"
                 else:
diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp
index 29f0f67ce6ecb..2b74c8a2fd290 100644
--- a/torch/csrc/StorageMethods.cpp
+++ b/torch/csrc/StorageMethods.cpp
@@ -41,7 +41,7 @@
 static PyObject* THPStorage_nbytes(PyObject* _self, PyObject* noargs) {
   HANDLE_TH_ERRORS
   auto self = (THPStorage*)_self;
-  return py::cast(self->cdata->sym_nbytes()).release().ptr();
+  return THPUtils_packUInt64(self->cdata->nbytes());
   END_HANDLE_TH_ERRORS
 }
 
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 3358d96569598..86b893bb014e6 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -1098,15 +1098,15 @@ Tensor convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    at::SymIntArrayRef padding,
+    IntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    at::SymIntArrayRef output_padding,
+    IntArrayRef output_padding,
     int64_t groups) {
   auto bias_t_opt =
       bias_t.defined() ? c10::optional<at::Tensor>(bias_t) : c10::nullopt;
   return (
-      at::convolution_symint(
+      at::convolution(
           input_t,
           weight_p,
           c10::nullopt,
@@ -1116,7 +1116,7 @@ Tensor convolution_jvp(
           transposed,
           output_padding,
           groups) +
-      at::convolution_symint(
+      at::convolution(
           input_p,
           weight_t,
           bias_t_opt,
@@ -1136,10 +1136,10 @@ Tensor _convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    at::SymIntArrayRef padding,
+    IntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    at::SymIntArrayRef output_padding,
+    IntArrayRef output_padding,
     int64_t groups,
     bool benchmark,
     bool deterministic,
@@ -1148,7 +1148,7 @@ Tensor _convolution_jvp(
   auto bias_t_opt =
       bias_t.defined() ? c10::optional<at::Tensor>(bias_t) : c10::nullopt;
   return (
-      at::_convolution_symint(
+      at::_convolution(
           input_t,
           weight_p,
           c10::nullopt,
@@ -1162,7 +1162,7 @@ Tensor _convolution_jvp(
           deterministic,
           cudnn_enabled,
           allow_tf32) +
-      at::_convolution_symint(
+      at::_convolution(
           input_p,
           weight_t,
           bias_t_opt,
diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h
index 4da8aa074a534..04416c2b49e08 100644
--- a/torch/csrc/autograd/FunctionsManual.h
+++ b/torch/csrc/autograd/FunctionsManual.h
@@ -937,10 +937,10 @@ Tensor convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    at::SymIntArrayRef padding,
+    IntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    at::SymIntArrayRef output_padding,
+    IntArrayRef output_padding,
     int64_t groups);
 
 Tensor _convolution_jvp(
@@ -951,10 +951,10 @@ Tensor _convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    at::SymIntArrayRef padding,
+    IntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    at::SymIntArrayRef output_padding,
+    IntArrayRef output_padding,
     int64_t groups,
     bool benchmark,
     bool deterministic,
diff --git a/torch/storage.py b/torch/storage.py
index 6bfbab3733bc4..8e35973405b1b 100644
--- a/torch/storage.py
+++ b/torch/storage.py
@@ -646,9 +646,7 @@ def device(self):
         return self._storage.device
 
     def size(self):
-        # NB: don't indirect through __len__, as that requires
-        # an int to be returned
-        return self.nbytes() // self.element_size()
+        return len(self)
 
     def pickle_storage_type(self):
         try:
diff --git a/torchgen/api/cpp.py b/torchgen/api/cpp.py
index 4b00b5367b824..c3b12d0336df0 100644
--- a/torchgen/api/cpp.py
+++ b/torchgen/api/cpp.py
@@ -314,7 +314,7 @@ def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequenc
 }
 
 # Convert a JIT default into C++ expression representing the default
-def default_expr(d: str, t: Type, *, symint: bool) -> str:
+def default_expr(d: str, t: Type) -> str:
     if d == "None" and str(t) == "Tensor?":
         return "{}"
     if isinstance(t, BaseType) and t.name is BaseTy.str:
@@ -342,13 +342,11 @@ def default_expr(d: str, t: Type, *, symint: bool) -> str:
         if d == "None":
             return "c10::nullopt"
 
-        return default_expr(d, t.elem, symint=symint)
+        return default_expr(d, t.elem)
 
     if isinstance(t, ListType):
         if d.startswith("[") and d.endswith("]"):
             return "{" + d[1:-1] + "}"
-        elif symint and d.isdigit() and str(t.elem) == "SymInt":
-            return f"c10::SymInt({d})"
         elif t.size is None:
             # NOTE: Sized lists can have scalar defaults
             raise ValueError(f"Expected a list default '[...]' but found: '{d}'")
@@ -388,7 +386,7 @@ def sub_argument(
             binds = a.name
         default: Optional[str] = None
         if a.name not in cpp_no_default_args and a.default is not None:
-            default = default_expr(a.default, a.type, symint=symint)
+            default = default_expr(a.default, a.type)
         return [
             Binding(
                 nctype=argument_type(a, binds=binds, symint=symint),
diff --git a/torchgen/api/native.py b/torchgen/api/native.py
index 7f8b3eb3af2e7..b197a2a02983a 100644
--- a/torchgen/api/native.py
+++ b/torchgen/api/native.py
@@ -95,7 +95,7 @@ def argument(
     if isinstance(a, Argument):
         default: Optional[str] = None
         if should_default and a.default is not None:
-            default = cpp.default_expr(a.default, a.type, symint=symint)
+            default = cpp.default_expr(a.default, a.type)
         return [
             Binding(
                 nctype=argument_type(a, binds=a.name, symint=symint),
diff --git a/torchgen/api/python.py b/torchgen/api/python.py
index 728ee4c18c0a6..96c006b303eaa 100644
--- a/torchgen/api/python.py
+++ b/torchgen/api/python.py
@@ -719,9 +719,7 @@ def argument(a: Argument) -> PythonArgument:
         name=a.name,
         type=a.type,
         # TODO: directly translate a.default to python default
-        default=str(
-            pythonify_default(cpp.default_expr(a.default, a.type, symint=False))
-        )
+        default=str(pythonify_default(cpp.default_expr(a.default, a.type)))
         if a.default is not None
         else None,
         default_init=None,
@@ -806,7 +804,7 @@ def topt_default_init(name: str) -> Optional[str]:
             a = getattr(topt_args, name)
             if a.default is None or a.default == "None":
                 return None
-            return cpp.default_expr(a.default, a.type, symint=False)
+            return cpp.default_expr(a.default, a.type)
 
         tensor_options_args.append(
             PythonArgument(
diff --git a/torchgen/gen.py b/torchgen/gen.py
index 79970c94610dd..e53734969afda 100644
--- a/torchgen/gen.py
+++ b/torchgen/gen.py
@@ -1151,9 +1151,7 @@ def compute_argument_yaml(
         "type": cpp.argument_type(a, binds="__placeholder__", symint=False).cpp_type(),
     }
     if a.default is not None:
-        arg["default"] = pythonify_default(
-            cpp.default_expr(a.default, a.type, symint=False)
-        )
+        arg["default"] = pythonify_default(cpp.default_expr(a.default, a.type))
     if a.name in kwarg_only_set:
         arg["kwarg_only"] = True
     if a.name in out_arg_set:

From 4be8d542e8d03a9f95a2d4f67ebd145d9b02654f Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Thu, 27 Oct 2022 22:20:36 +0000
Subject: [PATCH 0258/1922] Meta OpInfo Test for stride correctness (#87849)

Failing test logs here
https://gist.github.com/SherlockNoMad/a7e132f3cb4152900f8a6d7df358c59e
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87849
Approved by: https://github.com/eellison
---
 test/test_meta.py | 237 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 166 insertions(+), 71 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index 2431042e01728..99f78ddfb40c1 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -22,6 +22,7 @@
     ops,
     instantiate_device_type_tests,
     onlyCUDA,
+    OpDTypes,
 )
 from torch.testing._internal.common_methods_invocations import op_db
 from torchgen.utils import YamlLoader
@@ -187,15 +188,95 @@ def test_tensor_outlives_converter(self):
         del m
         self.assertIs(ref(), None)
 
+aten = torch.ops.aten
+
 CHECK_STRIDES = {
     torch.Tensor.__getitem__,
-    torch.ops.aten.index_put,
-    torch.ops.aten.index_add,
+}
+
+CHECK_STRIDES_SKIPS = {
+    aten._conj_physical.default,
+    aten._fft_c2c.default,
+    aten._fft_c2r.default,
+    aten._fft_r2c.default,
+    aten._linalg_svd.default,
+    aten._scaled_dot_product_attention_forward.default,
+    aten.add.Tensor,
+    aten.addmm.default,
+    aten.angle.default,
+    aten.atan2.default,
+    aten.binary_cross_entropy.default,
+    aten.bitwise_and.Tensor,
+    aten.bitwise_left_shift.Tensor,
+    aten.bitwise_or.Tensor,
+    aten.bitwise_right_shift.Tensor,
+    aten.bitwise_xor.Tensor,
+    aten.clamp_max.Tensor,
+    aten.clamp_min.Tensor,
+    aten.complex.default,
+    aten.copysign.Tensor,
+    aten.div.Tensor_mode,
+    aten.div.Tensor,
+    aten.eq.Tensor,
+    aten.flip.default,
+    aten.floor_divide.default,
+    aten.fmax.default,
+    aten.fmin.default,
+    aten.fmod.Tensor,
+    aten.gcd.default,
+    aten.ge.Tensor,
+    aten.gt.Tensor,
+    aten.heaviside.default,
+    aten.hypot.default,
+    aten.igamma.default,
+    aten.igammac.default,
+    aten.index_copy.default,
+    aten.lcm.default,
+    aten.le.Tensor,
+    aten.logical_and.default,
+    aten.logical_or.default,
+    aten.logical_xor.default,
+    aten.lt.Tensor,
+    aten.maximum.default,
+    aten.minimum.default,
+    aten.mul.Tensor,
+    aten.ne.Tensor,
+    aten.nextafter.default,
+    aten.pow.Scalar,
+    aten.pow.Tensor_Scalar,
+    aten.pow.Tensor_Tensor,
+    aten.prelu.default,
+    aten.remainder.Tensor,
+    aten.rot90.default,
+    aten.rsub.Tensor,
+    aten.special_xlog1py.default,
+    aten.special_zeta.default,
+    aten.sub.Tensor,
+    aten.where.self,
+    aten.xlogy.Tensor,
+
+    # channel_last and channel_last_3d related failures
+    aten.constant_pad_nd.default,
+    aten._adaptive_avg_pool2d.default,
+    aten.constant_pad_nd.default,
+    aten.convolution.default,
+    aten.convolution.default,
+    aten._adaptive_avg_pool2d.default,
+    aten.upsample_bilinear2d.vec,
+    aten.constant_pad_nd.default,
+    aten.upsample_bilinear2d.vec,
+
+    # following ops fails if include_storage_offset = True, but these are a bit edge casey
+    # we should still fix them, leaving them here for tracking.
+    # aten._reshape_alias.default,  # repro with test_dispatch_symbolic_meta_outplace_all_strides_matmul_cuda_float32
+    # aten.view.default,  # repro with test_dispatch_symbolic_meta_outplace_all_strides_unflatten_cuda_float32
 }
 
 def should_check_strides(func):
     if func in CHECK_STRIDES:
         return True
+    if func in CHECK_STRIDES_SKIPS:
+        return False
     if not isinstance(func, torch._ops.OpOverload):
         return False
     # Prims are expected to model strides correctly
@@ -206,7 +287,7 @@ def should_check_strides(func):
     if any(r.alias_info.before_set for r in func._schema.returns if r.alias_info):
         return True
     # TODO: check for TensorIterator
-    return False
+    return True
 
 def assert_ref_meta_equal(test_case, func, meta_rs, rs, msg_callable):
     flat_meta_rs, _ = tree_flatten(meta_rs)
@@ -688,8 +769,6 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
             kwargs, dtype=self.dtype, device_type=self.device_type, run_symbolic_meta=False
         )
 
-aten = torch.ops.aten
-
 # these always fail
 meta_dispatch_expected_failures = {
     aten.allclose.default: {f16, bf16, f32, f64, c64, c128},  # NotImplementedError: 'aten::_local_scalar_dense'
@@ -854,6 +933,55 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
     aten.miopen_batch_norm.default: {f32},
 }
 
+def get_strided_args(args):
+
+    def get_strided_variants(t, include_storage_offset=False):
+        variants = []
+
+        # contiguous
+        variants.append(t)
+
+        # transposed
+        if t.ndim > 1:
+            perm = list(reversed(range(t.ndim)))
+            transposed = torch.empty(
+                t.shape[::-1], device=t.device, dtype=t.dtype, requires_grad=t.requires_grad
+            ).permute(perm).copy_(t)
+            variants.append(transposed)
+
+        # nondense
+        if t.ndim > 0:
+            nondense = torch.repeat_interleave(t, 2, dim=-1)[..., ::2]
+            variants.append(nondense)
+
+        # channel_last
+        if t.ndim == 4:
+            variants.append(t.contiguous(memory_format=torch.channels_last))
+
+        # channel_last_3d
+        if t.ndim == 5:
+            variants.append(t.contiguous(memory_format=torch.channels_last_3d))
+
+        # storage_offset
+        if include_storage_offset:
+            buffer = torch.empty(t.numel() + 1, device=t.device, dtype=t.dtype, requires_grad=t.requires_grad)
+            buffer = buffer.as_strided(t.shape, t.stride(), storage_offset=1)
+            buffer.copy_(t)
+            variants.append(buffer)
+
+        return variants
+
+    strided_args = []
+    for arg in args:
+        if isinstance(arg, torch.Tensor) and not arg.is_sparse_csr and arg.is_contiguous():
+            strided_arg_variants = get_strided_variants(arg)
+        else:
+            strided_arg_variants = [arg]
+        strided_args.append(strided_arg_variants)
+
+    for result in itertools.product(*strided_args):
+        yield result
+
 class MetaCrossRefDispatchMode(torch.utils._python_dispatch.TorchDispatchMode):
     test_case: TestCase
     device: torch.device
@@ -947,7 +1075,7 @@ def test_meta_inplace(self, device, dtype, op):
             with MetaCrossRefFunctionMode(self, dtype=dtype, device=device, inplace=True):
                 expected = func(*args, **kwargs)
 
-    def _run_dispatch_meta_test(self, device, dtype, op, symbolic_meta, inplace):
+    def _run_dispatch_meta_test(self, device, dtype, op, symbolic_meta, inplace, all_stride_variants=False):
         if inplace:
             func = op.get_inplace()
             if not func:
@@ -966,14 +1094,21 @@ def _run_dispatch_meta_test(self, device, dtype, op, symbolic_meta, inplace):
             if inplace and sample_input.broadcasts_input:
                 continue
 
-            args = [sample_input.input] + list(sample_input.args)
+            sample_args = [sample_input.input] + list(sample_input.args)
             kwargs = sample_input.kwargs
 
-            with MetaCrossRefDispatchMode.push(self, dtype=dtype, device=device, symbolic_meta=symbolic_meta):
-                expected = func(*args, **kwargs)
+            if all_stride_variants and sum(isinstance(arg, torch.Tensor) for arg in sample_args) <= 5:
+                # test inputs <= 5 tensors to avoid combinatorial explosion
+                strided_args = get_strided_args(sample_args)
+            else:
+                strided_args = [sample_args]
 
-                if not inplace and isinstance(expected, torch.Tensor) and op.supports_out:
-                    func(*args, **kwargs, out=expected)
+            for args in strided_args:
+                with MetaCrossRefDispatchMode.push(self, dtype=dtype, device=device, symbolic_meta=symbolic_meta):
+                    expected = func(*args, **kwargs)
+
+                    if not inplace and isinstance(expected, torch.Tensor) and op.supports_out:
+                        func(*args, **kwargs, out=expected)
 
 
     @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
@@ -1006,6 +1141,26 @@ def test_dispatch_symbolic_meta_outplace(self, device, dtype, op):
     def test_dispatch_symbolic_meta_inplace(self, device, dtype, op):
         self._run_dispatch_meta_test(device, dtype, op, symbolic_meta=True, inplace=True)
 
+    @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
+    @skipIfCrossRef
+    @suppress_warnings
+    # only test one dtype, as output stride behavior is the same for all dtypes
+    @ops(op_db, dtypes=OpDTypes.any_common_cpu_cuda_one)
+    # Only test on CUDA, as CUDA kernel's stride is the reference
+    @onlyCUDA
+    def test_dispatch_symbolic_meta_outplace_all_strides(self, device, dtype, op):
+        self._run_dispatch_meta_test(device, dtype, op, symbolic_meta=True, inplace=False, all_stride_variants=True)
+
+    @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
+    @skipIfCrossRef
+    @suppress_warnings
+    # only test one dtype, as output stride behavior is the same for all dtypes
+    @ops(op_db, dtypes=OpDTypes.any_common_cpu_cuda_one)
+    # Only test on CUDA, as CUDA kernel's stride is the reference
+    @onlyCUDA
+    def test_dispatch_symbolic_meta_inplace_all_strides(self, device, dtype, op):
+        self._run_dispatch_meta_test(device, dtype, op, symbolic_meta=True, inplace=True, all_stride_variants=True)
+
 
     def test_empty_quantized(self):
         r = torch.empty(2 ** 52, device='meta', dtype=torch.qint8)
@@ -1027,66 +1182,6 @@ def test_fill_alias_relationship(self):
         r2 = torch.ops.aten.fill(inps, 1.0)
         self.assertNotEqual(id(inps), id(r2))
 
-    def get_stride_variants(self, t):
-        results = []
-
-        # contiguous
-        results.append(t)
-
-        # transposed
-        if t.ndim > 1:
-            perm = list(reversed(range(t.ndim)))
-            transposed = torch.empty(t.shape[::-1], device=t.device, dtype=t.dtype).permute(perm).copy_(t)
-            results.append(transposed)
-
-        # nondense
-        nondense = torch.repeat_interleave(t, 2, dim=-1)[..., ::2]
-        results.append(nondense)
-
-        return results
-
-    @onlyCUDA
-    def test_index_add_stride(self, device):
-        to_meta = MetaConverter()
-
-        x = torch.ones(5, 3, device=device)
-        t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float, device=device)
-        index = torch.tensor([0, 4, 2], device=device)
-
-        xs = self.get_stride_variants(x)
-        ts = self.get_stride_variants(t)
-
-        for x, t in itertools.product(xs, ts):
-            args = (x, 0, index, t)
-            meta_args = tree_map(to_meta, args)
-
-            r = torch.ops.aten.index_add(*args)
-            meta_r = torch.ops.aten.index_add(*meta_args)
-
-            self.assertEqual(r.size(), meta_r.size())
-            self.assertEqual(r.stride(), meta_r.stride())
-
-    @onlyCUDA
-    def test_index_put_stride(self, device):
-        to_meta = MetaConverter()
-
-        x = torch.rand(5, 5, device=device)
-        t = torch.rand(5, device=device)
-        index = torch.tensor([True, False, True, True, False], device=device)
-
-        xs = self.get_stride_variants(x)
-        ts = self.get_stride_variants(t)
-
-        for x, t in itertools.product(xs, ts):
-            args = (x, [index], t)
-            meta_args = tree_map(to_meta, args)
-
-            r = torch.ops.aten.index_put(*args)
-            meta_r = torch.ops.aten.index_put(*meta_args)
-
-            self.assertEqual(r.size(), meta_r.size())
-            self.assertEqual(r.stride(), meta_r.stride())
-
     def test_map_location_deserialize(self):
         import io
 

From 5fd46c60aba4f4e2f43892f01bbcbe690618aaaa Mon Sep 17 00:00:00 2001
From: sanchitintel <sanchit.jain@intel.com>
Date: Fri, 28 Oct 2022 03:42:19 +0000
Subject: [PATCH 0259/1922] Move incorrectly placed closing curly brace of
 `extern "C"` block (#87853)

### Bug description
When `__SYCL_DEVICE_ONLY__` is defined, while building PyTorch, the output of the preprocessing step would not have the closing curly brace of the `extern "C"` block, as it has been incorrectly placed. Compilers don't seem to report an error or a warning for a missing closing brace of an `extern "C"` block.

### Impact of the bug
If `c10/macros/Macros.h` would be included in a C++ file, and after the preprocessing stage, if the preprocessed source file would have some templated code after `extern "C" {`, then, after compilation, linking might fail with the error `templates must have c++ linkage`). eg. https://stackoverflow.com/questions/61717819/template-with-c-linkage-error-when-using-template-keyword-in-main-cpp/61717908#61717908 (its answer also has a small snippet of code to reproduce such an issue).

### Solution in this PR
one-liner bug fix that rectifies the placement of closing curly brace (`}`), so that the `extern "C"` block ends properly when `__SYCL_DEVICE_ONLY__` is defined.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87853
Approved by: https://github.com/jgong5, https://github.com/kit1980, https://github.com/malfet
---
 c10/macros/Macros.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
index 4be9faef4895e..e77fa0fde2ee0 100644
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@@ -347,8 +347,8 @@ __host__ __device__
 #endif // __CUDA_ARCH__
     void
     _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line);
-}
 #endif // __SYCL_DEVICE_ONLY__
+}
 #endif // NDEBUG
 #define CUDA_KERNEL_ASSERT(cond)                                                                 \
   if (C10_UNLIKELY(!(cond))) {                                                                   \

From 041676374930a8d51ddd1dae4115a2eab2ff7680 Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Fri, 28 Oct 2022 03:50:43 +0000
Subject: [PATCH 0260/1922] [ROCm] Use -rpath-link to fix libtinfo conflict
 (#83552)

Fixes issue building PyTorch for ROCm5.3 and above on Ubuntu20.04 because libtinfo6 from conda conflicts with the one from the distro causing symbol not found errors.

cc @jeffdaily @sunway513 @ROCmSupport
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83552
Approved by: https://github.com/malfet, https://github.com/pruthvistony
---
 cmake/Dependencies.cmake | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 47f5be14ed9a6..e232fcb624cd3 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1270,6 +1270,21 @@ endif()
 
 # ---[ HIP
 if(USE_ROCM)
+  # This prevents linking in the libtinfo from /opt/conda/lib which conflicts with ROCm libtinfo.
+  # Currently only active for Ubuntu 20.04 and greater versions.
+  if(UNIX AND EXISTS "/etc/os-release")
+    file(STRINGS /etc/os-release OS_RELEASE)
+    string(REGEX REPLACE "NAME=\"([A-Za-z]+).*" "\\1" OS_DISTRO ${OS_RELEASE})
+    string(REGEX REPLACE ".*VERSION_ID=\"([0-9\.]+).*" "\\1" OS_VERSION ${OS_RELEASE})
+    if(OS_DISTRO STREQUAL "Ubuntu" AND OS_VERSION VERSION_GREATER_EQUAL "20.04")
+      find_library(LIBTINFO_LOC tinfo NO_CMAKE_PATH NO_CMAKE_ENVIRONMENT_PATH)
+      if(LIBTINFO_LOC)
+        get_filename_component(LIBTINFO_LOC_PARENT ${LIBTINFO_LOC} DIRECTORY)
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath-link,${LIBTINFO_LOC_PARENT}")
+      endif()
+    endif()
+  endif()
+
   include(${CMAKE_CURRENT_LIST_DIR}/public/LoadHIP.cmake)
   if(PYTORCH_FOUND_HIP)
     message(INFO "Compiling with HIP for AMD.")

From fe04cde91d4bc08b2ea53893ef9bbf67dc7759f5 Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@fb.com>
Date: Thu, 27 Oct 2022 12:37:59 -0700
Subject: [PATCH 0261/1922] [dynamo] relax fake tensor restriction with
 `assume_constant_result` (#87895)

This works now because of https://github.com/pytorch/pytorch/pull/87091,
so don't error out anymore.

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87895
Approved by: https://github.com/tugsbayasgalan, https://github.com/voznesenskym
---
 test/dynamo/test_export.py  | 16 ----------------
 torch/_dynamo/eval_frame.py |  3 ---
 2 files changed, 19 deletions(-)

diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index 45939e12e767f..1afd7c8c0c2de 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -930,7 +930,6 @@ def compiler(gm, sample_inputs):
 
         self.assertTrue(torch._dynamo.utils.same(make_fx_result, export_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_method_on_module(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -957,7 +956,6 @@ def forward(self, x):
         result = graph(torch.tensor([[1, 0], [0.25, 0.25]]))
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_method_on_module_invoke_twice(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
@@ -984,7 +982,6 @@ def forward(self, x):
         result = graph(torch.tensor([[1, 0], [0.25, 0.25]]))
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_free_function(self):
         @torch._dynamo.assume_constant_result
         def helper_fn(x):
@@ -1015,7 +1012,6 @@ def forward(self, x):
         result = graph(torch.tensor([[1, 0], [0.25, 0.25]]))
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_free_function_and_class_method(self):
         @torch._dynamo.assume_constant_result
         def helper_fn(x):
@@ -1042,7 +1038,6 @@ def forward(self, x):
         result = graph(torch.tensor([[1, 0], [0.25, 0.25]]))
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_free_function_and_class_method_multiarg(self):
         @torch._dynamo.assume_constant_result
         def helper_fn(x):
@@ -1077,7 +1072,6 @@ def forward(self, x, z):
         )
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_free_function_and_class_method_multiarg_diff(self):
         @torch._dynamo.assume_constant_result
         def helper_fn(x):
@@ -1109,7 +1103,6 @@ def forward(self, x, z):
         )
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_tuple_nonzero(self):
         class MyModule(torch.nn.Module):
             @torch._dynamo.assume_constant_result
@@ -1134,7 +1127,6 @@ def forward(self, x):
         result = graph(torch.tensor([[[1.0, 0], [0, 0]], [[1.0, 0], [0, 0]]]))
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_list_nonzero(self):
         class MyModule(torch.nn.Module):
             @torch._dynamo.assume_constant_result
@@ -1159,7 +1151,6 @@ def forward(self, x):
         result = graph(torch.tensor([[[1.0, 0], [0, 0]], [[1.0, 0], [0, 0]]]))
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_list_nonzero_free_function(self):
         @torch._dynamo.assume_constant_result
         def helper_fn(x):
@@ -1184,7 +1175,6 @@ def forward(self, x):
         result = graph(torch.tensor([[[1.0, 0], [0, 0]], [[1.0, 0], [0, 0]]]))
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_dict_values(self):
         class MyModule(torch.nn.Module):
             @torch._dynamo.assume_constant_result
@@ -1207,7 +1197,6 @@ def forward(self, x):
         result = graph(torch.tensor([[[1.0, 0], [0, 0]], [[1.0, 0], [0, 0]]]))
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_none_control_flow(self):
         class MyModule(torch.nn.Module):
             @torch._dynamo.assume_constant_result
@@ -1235,7 +1224,6 @@ def forward(self, x):
         # X is positive, but we compiled helper_fn to return None, so it will still return y
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_not_none_control_flow(self):
         class MyModule(torch.nn.Module):
             @torch._dynamo.assume_constant_result
@@ -1263,7 +1251,6 @@ def forward(self, x):
         # X is negative, but we compiled helper_fn to return x, so it will still return y * x
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_none_control_flow_free_func(self):
         @torch._dynamo.assume_constant_result
         def helper_fn(x):
@@ -1291,7 +1278,6 @@ def forward(self, x):
         # X is positive, but we compiled helper_fn to return None, so it will still return y
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_not_none_control_flow_pos(self):
         class MyModule(torch.nn.Module):
             @torch._dynamo.assume_constant_result
@@ -1319,7 +1305,6 @@ def forward(self, x):
         # X is negative, but we compiled helper_fn to return x, so it will still return y * x
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_not_none_control_flow_free_func(self):
         @torch._dynamo.assume_constant_result
         def helper_fn(x):
@@ -1347,7 +1332,6 @@ def forward(self, x):
         # X is negative, but we compiled helper_fn to return x, so it will still return y * x
         self.assertTrue(torch._dynamo.utils.same(result, real_result))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_constant_not_return_const(self):
         class MyModule(torch.nn.Module):
             @torch._dynamo.assume_constant_result
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index d86653f9973cc..9e6d8f2450557 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -581,9 +581,6 @@ def graph_with_interpreter(*args):
 
 def assume_constant_result(fn):
     fn._dynamo_marked_constant = True
-    assert (
-        not config.fake_tensor_propagation
-    ), "Constant result capture is not supported with fake tensors."
     return fn
 
 
From 411b435eb520759e4f5e6ebb5ff732b581eccd7e Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Fri, 28 Oct 2022 00:01:07 +0000
Subject: [PATCH 0262/1922] FakeTensorMode and Prims.add/sub/mul/div support
 scalar only inputs (#87759)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87759
Approved by: https://github.com/ngimel, https://github.com/mruberry, https://github.com/eellison
---
 torch/_meta_registrations.py                  |  5 --
 torch/_refs/__init__.py                       | 22 ++++-----
 torch/_subclasses/fake_tensor.py              | 31 +++++++++---
 .../_internal/common_methods_invocations.py   | 47 +++++++++++--------
 4 files changed, 62 insertions(+), 43 deletions(-)

diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index cde0ac96a2d84..c6f4d7a357fc7 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1658,11 +1658,6 @@ def activate_meta():
             "aten::clone",  # causing infinite recursion
             "aten::_to_copy",  # causing infinite recursion, test_serialization.py -k test_tensor_subclass_getstate_overwrite  # noqa: B950
             "aten::randn",  # pin_memory parameter is not supported!, test_proxy_tensor.py -k test_make_fx_symbolic_exhaustive_randn_cpu_float32  # noqa: B950
-            "aten::add.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
-            "aten::sub.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
-            "aten::mul.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_both_scalars  # noqa: B950
-            "aten::div.Tensor",  # ValueError: Receive two Number inputs to an elementwise binary operation! test_fake_tensor.py -k test_scalar_inputs  # noqa: B950
-            "aten::div.Tensor_mode",  # ValueError: Receive two Number inputs to an elementwise binary operation! inductor/test_torchinductor.py -k test_div8_cpu  # noqa: B950
             "aten::copy_",  # Exception not raised, test_torch.py -k test_storage_meta_errors_cpu_int64  # noqa: B950
             "aten::constant_pad_nd",  # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32  # noqa: B950
             "aten::rot90",  # requires_grad mismatch! test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32  # noqa: B950
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index c01c0da051f7e..ba02416dd24e8 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -853,6 +853,7 @@ def _make_elementwise_binary_reference(
     has_out=True,
     supports_lhs_python_scalar=True,
     supports_rhs_python_scalar=True,
+    supports_two_python_scalars=False,
 ) -> Callable:
     @elementwise_type_promotion_wrapper(
         type_promoting_args=("a", "b"),
@@ -872,8 +873,11 @@ def _ref(
                 "Received a rhs Python scalar to an elementwise binary operation that does not accept rhs scalars!"
             )
 
-        # TODO: enable this for operations that support it, like add
-        if isinstance(a, Number) and isinstance(b, Number):
+        if (
+            not supports_two_python_scalars
+            and isinstance(a, Number)
+            and isinstance(b, Number)
+        ):
             raise ValueError(
                 f"Receive two Number inputs to an elementwise binary operation {prim}!"
             )
@@ -909,11 +913,6 @@ def add(
     Reference implementation of torch.add
     """
 
-    if isinstance(a, Number) and isinstance(b, Number):
-        raise ValueError(
-            "Receive two Number inputs to an elementwise binary operation!"
-        )
-
     a, b = _maybe_broadcast(a, b)
 
     if alpha is not None:
@@ -1185,6 +1184,7 @@ def _floor_divide_float(a: Tensor, b: Tensor) -> Tensor:
     _floor_divide,
     type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
     aten_op=torch.ops.aten.floor_divide,
+    supports_two_python_scalars=True,
 )
 
 
@@ -1459,6 +1459,7 @@ def _logical_xor(a: TensorLikeType, b: TensorLikeType):
 mul = _make_elementwise_binary_reference(
     prims.mul,  # type: ignore[has-type]
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+    supports_two_python_scalars=True,
 )
 
 # TODO: add docstring
@@ -1515,11 +1516,6 @@ def sub(
     Reference implementation of torch.sub
     """
 
-    if isinstance(a, Number) and isinstance(b, Number):
-        raise ValueError(
-            "Receive two Number inputs to an elementwise binary operation!"
-        )
-
     a, b = _maybe_broadcast(a, b)
 
     if alpha is not None:
@@ -1542,6 +1538,7 @@ def sub(
     prims.div,  # type: ignore[has-type]
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
     aten_op=None,  # CompositeImplicitAutograd
+    supports_two_python_scalars=True,
 )
 
 
@@ -1585,6 +1582,7 @@ def _trunc_divide(
     _trunc_divide,
     type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
     aten_op=None,  # CompositeImplicitAutograd
+    supports_two_python_scalars=True,
 )
 
 #
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index c5bf346f8cb5f..9d5a8d30209eb 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -6,7 +6,7 @@
 import weakref
 from dataclasses import dataclass
 from functools import partial
-from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union
 
 import torch
 from torch._ops import OpOverload
@@ -265,7 +265,9 @@ def __call__(
             )
         else:
             assert make_constant is False
-            assert t.device.type == "meta"
+            assert (
+                t.device.type == "meta"
+            ), f"tensor's device must be `meta`, got {t.device.type} instead"
             return self.from_meta_and_device(fake_mode, t, device)
 
 
@@ -541,11 +543,14 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
             return func(*args, **kwargs)
 
     @staticmethod
-    def _find_common_device(func, args, kwargs):
+    def _find_common_device(func, args, kwargs) -> Tuple[torch.device, bool]:
+        # Returns: (common_device, has_scalar_only_inputs)
+
         # cpu - zero-dim tensors can be called in cuda kernels,
         # so overwrite the common_device if it the only existing
         # device comes from a cpu zero-dim tensor
         common_device = None
+        has_scalar_only_inputs = False
         is_cpu_zero_dim = None
 
         def cpu_zero_dim(t):
@@ -597,11 +602,13 @@ def merge_devices(t):
             )
             and common_device is None
         ):
+            # ops with scalar only inputs always have result on cpu
+            has_scalar_only_inputs = True
             common_device = torch.device("cpu")
 
         assert common_device is not None, f"Could not find common device for {func}"
 
-        return common_device
+        return common_device, has_scalar_only_inputs
 
     __torch_function__ = torch._C._disabled_torch_function_impl
 
@@ -870,13 +877,25 @@ def gen_wrap_fn(self, func, args, kwargs):
 
         # Lazily initialized, in case there are no tensor returns
         common_device = None
+        has_scalar_only_inputs = False
 
         def wrap(e, device=None):
             nonlocal common_device
+            nonlocal has_scalar_only_inputs
             if isinstance(e, torch.Tensor) and not isinstance(e, FakeTensor):
                 if common_device is None:
-                    common_device = FakeTensor._find_common_device(func, args, kwargs)
-                return converter(self, e, device or common_device)
+                    (
+                        common_device,
+                        has_scalar_only_inputs,
+                    ) = FakeTensor._find_common_device(func, args, kwargs)
+
+                if has_scalar_only_inputs:
+                    # Under FakeTensorMode, op accepts scalar only inputs, such as aten.add/sub/mul/div,
+                    # returns a real scalar tensor on CPU. See TensorMeta() in _prims/__init__.py for details.
+                    # We thus directly convert real tensor to fake tensor.
+                    return converter(self, e)
+                else:
+                    return converter(self, e, device or common_device)
             else:
                 return e
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f562aace3cc03..0617b5d7ed617 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7016,9 +7016,10 @@ def error_inputs_poisson_nll_loss(op_info, device, **kwargs):
                      error_regex='abc is not valid')
     # invalid input shapes
     yield ErrorInput(SampleInput(make(5, 4), args=(make(5,),)),
-                     error_regex=(r'The size of tensor a \(5\) must match the '
+                     error_regex=(r'(Attempting to broadcast a dimension of length|'
+                                  r'The size of tensor a \(5\) must match the '
                                   r'size of tensor b \(4\) at non-singleton '
-                                  r'dimension 1'))
+                                  r'dimension 1)'))
 
 def error_inputs_soft_margin_loss(op_info, device, **kwargs):
     make = partial(make_tensor, device=device, dtype=torch.float32)
@@ -7030,9 +7031,10 @@ def error_inputs_soft_margin_loss(op_info, device, **kwargs):
                      error_regex='abc is not a valid value for reduction')
     # invalid input shapes
     yield ErrorInput(SampleInput(make(5, 4), args=(make(5,),)),
-                     error_regex=(r'The size of tensor a \(4\) must match the '
+                     error_regex=(r'(Attempting to broadcast a dimension of length|'
+                                  r'The size of tensor a \(4\) must match the '
                                   r'size of tensor b \(5\) at non-singleton '
-                                  r'dimension 1'))
+                                  r'dimension 1)'))
 
 def sample_inputs_triplet_margin_loss(op_info, device, dtype, requires_grad, with_distance=False, **kwargs):
     make = partial(make_tensor, (S, M), device=device, dtype=dtype, requires_grad=requires_grad)
@@ -7064,18 +7066,21 @@ def error_inputs_triplet_margin_loss(op_info, device, **kwargs):
         (make_input(3, 5), (make_input(3, 4), make_input(3, 4)),
          dict(),
          RuntimeError,
-         (r"The size of tensor a \(5\) must match the size of tensor b \(4\) "
-          r"at non-singleton dimension 1")),
+         (r'(Attempting to broadcast a dimension of length|'
+          r"The size of tensor a \(5\) must match the size of tensor b \(4\) "
+          r"at non-singleton dimension 1)")),
         (make_input(3, 4), (make_input(3, 5), make_input(3, 4)),
          dict(),
          RuntimeError,
-         (r"The size of tensor a \(4\) must match the size of tensor b \(5\) "
-          r"at non-singleton dimension 1")),
+         (r'(Attempting to broadcast a dimension of length|'
+          r"The size of tensor a \(4\) must match the size of tensor b \(5\) "
+          r"at non-singleton dimension 1)")),
         (make_input(3, 4), (make_input(3, 4), make_input(3, 5)),
          dict(),
          RuntimeError,
-         (r"The size of tensor a \(4\) must match the size of tensor b \(5\) "
-          r"at non-singleton dimension 1")),
+         (r'(Attempting to broadcast a dimension of length|'
+          r"The size of tensor a \(4\) must match the size of tensor b \(5\) "
+          r"at non-singleton dimension 1)")),
 
         # different dimensions
         (make_input(3,), (make_input(3, 4), make_input(3, 4)),
@@ -7234,9 +7239,11 @@ def error_inputs_l1_loss(op_info, device, **kwargs):
                      error_regex='abc is not a valid value for reduction')
     # invalid input shapes
     yield ErrorInput(SampleInput(make(5, 4), args=(make(5,),)),
-                     error_regex=(r'The size of tensor a \(4\) must match the '
+                     error_regex=(r'(Attempting to broadcast a dimension of length|'
+                                  r'The size of tensor a \(4\) must match the '
                                   r'size of tensor b \(5\) at non-singleton '
-                                  r'dimension 1'))
+                                  r'dimension 1)')
+                     )
 
 def sample_inputs_smooth_l1_loss(op_info, device, dtype, requires_grad, **kwargs):
     yield from sample_inputs_loss(op_info, device, dtype, requires_grad, **kwargs)
@@ -17114,7 +17121,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.add",
         torch_opinfo_name="add",
         # https://github.com/pytorch/pytorch/issues/76944
-        supports_two_python_scalars=False,
+        supports_two_python_scalars=True,
         supports_one_python_scalar=True,
     ),
     ElementwiseBinaryPythonRefInfo(
@@ -17152,7 +17159,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         torch_opinfo_name="div",
         torch_opinfo_variant_name="no_rounding_mode",
         # https://github.com/pytorch/pytorch/issues/76944
-        supports_two_python_scalars=False,
+        supports_two_python_scalars=True,
         supports_one_python_scalar=True,
         supports_nvfuser=False,
         skips=(
@@ -17180,7 +17187,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         torch_opinfo_name="div",
         torch_opinfo_variant_name="trunc_rounding",
         # https://github.com/pytorch/pytorch/issues/76944
-        supports_two_python_scalars=False,
+        supports_two_python_scalars=True,
         supports_one_python_scalar=True,
         supports_nvfuser=False,
     ),
@@ -17189,7 +17196,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         torch_opinfo_name="div",
         torch_opinfo_variant_name="floor_rounding",
         # https://github.com/pytorch/pytorch/issues/76944
-        supports_two_python_scalars=False,
+        supports_two_python_scalars=True,
         supports_one_python_scalar=True,
         supports_nvfuser=False,
     ),
@@ -17211,7 +17218,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         torch_opinfo_name="floor_divide",
         rhs_make_tensor_kwargs=dict(exclude_zero=True),
         # https://github.com/pytorch/pytorch/issues/76944
-        supports_two_python_scalars=False,
+        supports_two_python_scalars=True,
         supports_one_python_scalar=True,
         supports_nvfuser=False,
         # bfloat16 floor_divide compared with a float32 reference works inconsistently
@@ -17339,7 +17346,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.mul",
         torch_opinfo_name="mul",
         # https://github.com/pytorch/pytorch/issues/76944
-        supports_two_python_scalars=False,
+        supports_two_python_scalars=True,
         supports_one_python_scalar=True,
         skips=(
             # Reference result was farther (0.0) from the precise computation
@@ -17426,14 +17433,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.sub",
         torch_opinfo_name="sub",
         # https://github.com/pytorch/pytorch/issues/76944
-        supports_two_python_scalars=False,
+        supports_two_python_scalars=True,
         supports_one_python_scalar=True,
     ),
     ElementwiseBinaryPythonRefInfo(
         "_refs.true_divide",
         torch_opinfo_name="true_divide",
         # https://github.com/pytorch/pytorch/issues/76944
-        supports_two_python_scalars=False,
+        supports_two_python_scalars=True,
         supports_one_python_scalar=True,
         skips=(
             # Reference result was farther (0.7433461727239705) from the precise

From fc254d86783157d51502e6a3ebbb60c017413eb3 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Fri, 28 Oct 2022 04:53:33 +0000
Subject: [PATCH 0263/1922] Fix typos under caffe2 directory (#87840)

This PR fixes typos in `.md` files under caffe2 directory

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87840
Approved by: https://github.com/kit1980
---
 caffe2/mobile/contrib/libopencl-stub/README.md | 2 +-
 caffe2/python/onnx/ONNXOpCoverage.md           | 2 +-
 caffe2/quantization/server/README.md           | 4 ++--
 caffe2/release-notes.md                        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/caffe2/mobile/contrib/libopencl-stub/README.md b/caffe2/mobile/contrib/libopencl-stub/README.md
index 20b3dafa8095f..835ba2172cbe8 100644
--- a/caffe2/mobile/contrib/libopencl-stub/README.md
+++ b/caffe2/mobile/contrib/libopencl-stub/README.md
@@ -1,7 +1,7 @@
 libopencl-stub
 ==============
 
-A stub opecl library that dynamically dlopen/dlsyms opencl implementations at runtime based on environment variables. Will be useful when opencl implementations are installed in non-standard paths (say pocl on android)
+A stub opencl library that dynamically dlopen/dlsyms opencl implementations at runtime based on environment variables. Will be useful when opencl implementations are installed in non-standard paths (say pocl on android)
 
 
diff --git a/caffe2/python/onnx/ONNXOpCoverage.md b/caffe2/python/onnx/ONNXOpCoverage.md
index bb4b71f055356..66cf2d692e87c 100644
--- a/caffe2/python/onnx/ONNXOpCoverage.md
+++ b/caffe2/python/onnx/ONNXOpCoverage.md
@@ -19,7 +19,7 @@ This doc keeps tracking why operators are not covered by the testcases.
 |Atan|||&#x1F49A;OK|
 |AveragePool||OK|&#x1F49A;OK|
 |BatchNormalization||OK|&#x1F49A;OK|
-|Cast|Yes||&#x1F494;Need extendtion|
+|Cast|Yes||&#x1F494;Need extension|
 |Ceil|Yes||&#x1F49A;OK|
 |Clip|Yes|OK|&#x1F49A;OK|
 |Concat|Yes|OK|&#x1F49A;OK|
diff --git a/caffe2/quantization/server/README.md b/caffe2/quantization/server/README.md
index 4819b62fedb77..b7d22bf8bbfe6 100644
--- a/caffe2/quantization/server/README.md
+++ b/caffe2/quantization/server/README.md
@@ -19,8 +19,8 @@ To compute the quantization parameters of activation tensors, we need to know th
 
 * Floating-point requantization
 
-Unlike gemmlowp using fixed-point operations that emulates floating point operations of requantization, fbgemm just uses single-precison floating-point operations. This is because in x86 just using single-precision floating-point operations is faster. Probably, gemmlowp used pure fixed-point operations for low-end mobile processors. QNNPACK also has similar constraints as gemmlowp and provides multiple options of requantization implementations.
-The users could modify the code to use a different requantization implementation to be bit-wise idential to the HW they want to emulate for example. If there're enough requests, we could consider implementing a few popular fixed-point requantization as QNNPACK did.
+Unlike gemmlowp using fixed-point operations that emulates floating point operations of requantization, fbgemm just uses single-precision floating-point operations. This is because in x86 just using single-precision floating-point operations is faster. Probably, gemmlowp used pure fixed-point operations for low-end mobile processors. QNNPACK also has similar constraints as gemmlowp and provides multiple options of requantization implementations.
+The users could modify the code to use a different requantization implementation to be bit-wise identical to the HW they want to emulate for example. If there're enough requests, we could consider implementing a few popular fixed-point requantization as QNNPACK did.
 
 * 16-bit accumulation with outlier-aware quantization
 
diff --git a/caffe2/release-notes.md b/caffe2/release-notes.md
index e76b760a7ed5e..d449e98f78e3d 100644
--- a/caffe2/release-notes.md
+++ b/caffe2/release-notes.md
@@ -133,7 +133,7 @@ If you're running this all on a cloud computer, you probably won't have a UI or
 
 First configure your cloud server to accept port 8889, or whatever you want, but change the port in the following commands. On AWS you accomplish this by adding a rule to your server's security group allowing a TCP inbound on port 8889. Otherwise you would adjust iptables for this.
 
-Next you launch the Juypter server.
+Next you launch the Jupyter server.
 
 ```
 jupyter notebook --no-browser --port=8889

From 5024af39c168936c304d9f33eef33136c2d33ea4 Mon Sep 17 00:00:00 2001
From: rboca <remus.f.boca@gmail.com>
Date: Fri, 28 Oct 2022 04:56:37 +0000
Subject: [PATCH 0264/1922] Update CMakeLists.txt (#87030)

Fix Caffe2_CPU_INCLUDE with Caffe2_GPU_INCLUDE. The expanding parent scope should be with the same variable name. The compilation in certain build configurations is corrected with this fix.

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87030
Approved by: https://github.com/kit1980
---
 caffe2/contrib/tensorrt/tensorrt_tranformer.cc | 2 +-
 caffe2/core/nomnigraph/CMakeLists.txt          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc
index ebe27ef38a199..f1414deca8caa 100644
--- a/caffe2/contrib/tensorrt/tensorrt_tranformer.cc
+++ b/caffe2/contrib/tensorrt/tensorrt_tranformer.cc
@@ -518,7 +518,7 @@ void TensorRTTransformer::Transform(
     return SubnetToTrtOp(net, &mapped_ws, &exporter2, &shape_hints);
   };
 
-  auto cutResult = opt::OptimizeForBackend(*pred_net, supports, trt_converter)
+  auto cutResult = opt::OptimizeForBackend(*pred_net, supports, trt_converter);
   NetDef net_opt = std::move(cutResult.net);
 
   // Need to figure out a proper place to handle device option
diff --git a/caffe2/core/nomnigraph/CMakeLists.txt b/caffe2/core/nomnigraph/CMakeLists.txt
index c4d4216ef9e97..8980c52ddfb4f 100644
--- a/caffe2/core/nomnigraph/CMakeLists.txt
+++ b/caffe2/core/nomnigraph/CMakeLists.txt
@@ -18,5 +18,5 @@ set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
 set(Caffe2_CPU_INCLUDE ${Caffe2_CPU_INCLUDE} PARENT_SCOPE)
 set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
 if(USE_TENSORRT)
-set(Caffe2_GPU_INCLUDE ${Caffe2_CPU_INCLUDE} PARENT_SCOPE)
+set(Caffe2_GPU_INCLUDE ${Caffe2_GPU_INCLUDE} PARENT_SCOPE)
 endif()

From e25734074ee538148ed324ed206be91d58ade578 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Fri, 28 Oct 2022 04:58:54 +0000
Subject: [PATCH 0265/1922] [MKLDNN] Replace pooling algorithm `pooling_avg`
 with `pooling_avg_exclude_padding` for future oneDNN upgrades (#87851)

**Description**
Replace pooling algorithm `pooling_avg` with `pooling_avg_exclude_padding` in implementation of mkldnn pooling. It's only a change of names, not algorithm. The former is an alias of the latter and it will be removed in future oneDNN library upgrades.
This change has no effect on functionality or performance.

**Validation**
Covered by UT.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87851
Approved by: https://github.com/jgong5, https://github.com/XiaobingSuper
---
 aten/src/ATen/native/mkldnn/Pooling.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mkldnn/Pooling.cpp b/aten/src/ATen/native/mkldnn/Pooling.cpp
index a0f9207e2faed..30ff49f49dd3b 100644
--- a/aten/src/ATen/native/mkldnn/Pooling.cpp
+++ b/aten/src/ATen/native/mkldnn/Pooling.cpp
@@ -518,7 +518,7 @@ Tensor mkldnn_adaptive_avg_pool2d(
       /*padding*/ {0, 0},
       /*dilation*/ {1, 1},
       /*ceil_mode*/ false,
-      /*algo*/ ideep::algorithm::pooling_avg);
+      /*algo*/ ideep::algorithm::pooling_avg_exclude_padding);
 }
 
 Tensor& mkldnn_adaptive_avg_pool2d_out_stub(const Tensor& input,

From 9f274c97a6c49f0b53c391ef503c3d7f9dc24b81 Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@fb.com>
Date: Thu, 27 Oct 2022 15:01:21 -0700
Subject: [PATCH 0266/1922] [dynamo] Error when user nests FX with dynamo
 (#87797)

Today, this doesn't work and dynamo errors out in a very non-obvious way (see:
https://gist.github.com/suo/dde04830372ab51a4a34ea760f14200a).

Here, we detect the error early and exit with a nicer msg. Also add a
config option to just no-op dynamo (which need to unblock internal
enablement).

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87797
Approved by: https://github.com/yf225, https://github.com/soumith, https://github.com/jansel
---
 test/dynamo/test_misc.py    | 14 ++++++++++++++
 test/test_prims.py          | 10 +++++++++-
 torch/_dynamo/config.py     |  4 ++++
 torch/_dynamo/eval_frame.py |  9 +++++++++
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index a0f592212f4e1..a63a6d8930c80 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2732,6 +2732,20 @@ def forward(self, x):
             dynamo_result = graph(x)
             self.assertTrue(same(real, dynamo_result))
 
+    def test_error_on_nested_fx_trace(self):
+        input = torch.rand(2, 3)
+
+        def f(x):
+            x + x
+
+        real = f(input)
+
+        optimized = torch._dynamo.optimize("eager")(f)
+        self.assertTrue(same(optimized(input), real))
+
+        with self.assertRaisesRegex(RuntimeError, "Detected that you are using FX"):
+            gm = torch.fx.symbolic_trace(optimized)
+
 
 class CustomFunc(torch.autograd.Function):
     @staticmethod
diff --git a/test/test_prims.py b/test/test_prims.py
index 6223a34e0a3a9..6f400ce6e797e 100644
--- a/test/test_prims.py
+++ b/test/test_prims.py
@@ -8,7 +8,14 @@
 
 import torch
 from torch.testing import make_tensor
-from torch.testing._internal.common_utils import parametrize, run_tests, TestCase, TEST_SCIPY, skipCUDAMemoryLeakCheckIf
+from torch.testing._internal.common_utils import (
+    parametrize,
+    run_tests,
+    TestCase,
+    TEST_SCIPY,
+    skipCUDAMemoryLeakCheckIf,
+    skipIfTorchDynamo,
+)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
     onlyCUDA,
@@ -387,6 +394,7 @@ def func(a):
         actual = execute(gm, a.mT, executor="nvfuser")
         self.assertEqual(expected, actual)
 
+    @skipIfTorchDynamo
     def test_nvfuser_capability_context(self, device):
         # This test is to ensure that the torch calls are replaced with refs
         # based on the nvfuser+prims capability
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index f75f3651dd97c..2601be8983f2a 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -153,6 +153,10 @@
 # How to import torchinductor, either torchinductor or torch.inductor
 inductor_import = dynamo_import.replace("dynamo", "inductor")
 
+# If true, error with a better message if we symbolically trace over a
+# dynamo-optimized function. If false, silently suppress dynamo.
+error_on_nested_fx_trace = True
+
 # root folder of the project
 if "torch." in dynamo_import:
     base_dir = dirname(dirname(dirname(abspath(__file__))))
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 9e6d8f2450557..947cc4108b8f7 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -14,6 +14,7 @@
 
 import torch
 import torch.utils._pytree as pytree
+from torch.fx._symbolic_trace import is_fx_tracing
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn.parallel.distributed import DistributedDataParallel
 
@@ -149,6 +150,14 @@ def __call__(self, *args, **kwargs):
 
         @functools.wraps(fn)
         def _fn(*args, **kwargs):
+            if is_fx_tracing():
+                if config.error_on_nested_fx_trace:
+                    raise RuntimeError(
+                        "Detected that you are using FX to symbolically trace "
+                        "a dynamo-optimized function. This is not supported at the moment."
+                    )
+                return fn
+
             on_enter()
             prior = set_eval_frame(callback)
             backend_ctx = backend_ctx_ctor()

From 1013c3ee07beb6c0a44ef7a21ff6fdb658158f27 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@fb.com>
Date: Fri, 28 Oct 2022 04:05:01 +0000
Subject: [PATCH 0267/1922] Add me to reviewers of composable API changes
 (#87891)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87891
Approved by: https://github.com/mrshenli
---
 CODEOWNERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CODEOWNERS b/CODEOWNERS
index 3d030ad4d9e45..238d6e776e92c 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -47,6 +47,7 @@ nn/qat/ @jerryzh168
 # or remove yourself from it.
 /torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu @kwen2501
 /torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu @kwen2501
+/torch/distributed/_composable @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu @kwen2501 @yhcharles
 /torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu @kwen2501
 
 # Distributed tests

From ae65379af2e455eab9744f80413bf30fe6e7d009 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 28 Oct 2022 06:11:42 +0000
Subject: [PATCH 0268/1922] Revert "[EZ] Fix simple bug in torchdynamo
 (#87821)"

This reverts commit ce7fcab9bdf61a34bc56b7cd45a882e4ad6ba175.

Reverted https://github.com/pytorch/pytorch/pull/87821 on behalf of https://github.com/kit1980 due to Broke many dynamo tests https://github.com/pytorch/pytorch/actions/runs/3341984303/jobs/5534381456
---
 torch/_dynamo/variables/user_defined.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/variables/user_defined.py b/torch/_dynamo/variables/user_defined.py
index 09d7893bef665..2d33c8328268a 100644
--- a/torch/_dynamo/variables/user_defined.py
+++ b/torch/_dynamo/variables/user_defined.py
@@ -68,7 +68,7 @@ def call_method(
 
             return variables.ListVariable(subs_as_vars, **options)
 
-        return super().call_method(tx, name, args, kwargs)
+        return super().call_method(tx, args, kwargs)
 
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"

From b4477221f448f22177e4b7310c7ecc113c07abdd Mon Sep 17 00:00:00 2001
From: leslie-fang-intel <leslie.fang@intel.com>
Date: Fri, 28 Oct 2022 10:30:30 +0000
Subject: [PATCH 0269/1922] add the function specialization for promote with
 ITensorListRef (#87756)

Fixes [#87684](https://github.com/pytorch/pytorch/issues/87684)
It's due to a new tensor list type is introduced as `ITensorListRef`.  We need the function specialization for `prioritize` and `cached_cast` for this new tensor list type.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87756
Approved by: https://github.com/jgong5, https://github.com/ezyang
---
 aten/src/ATen/autocast_mode.h | 22 ++++++++++++++++++++++
 test/test_jit_autocast.py     | 26 ++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h
index 155a52f669f99..3d57ac9231164 100644
--- a/aten/src/ATen/autocast_mode.h
+++ b/aten/src/ATen/autocast_mode.h
@@ -126,6 +126,16 @@ inline at::ScalarType prioritize(
   return current;
 }
 
+inline at::ScalarType prioritize(
+    at::ScalarType current,
+    const ITensorListRef& list,
+    DeviceType device_type = DeviceType::CUDA) {
+  for (const auto& tensor : list) {
+    current = prioritize(current, tensor, device_type);
+  }
+  return current;
+}
+
 // Template to catch non-Tensor args (no-op that returns current best guess)
 template <typename T>
 inline at::ScalarType prioritize(
@@ -196,6 +206,18 @@ inline std::vector<Tensor> cached_cast(
   return vec;
 }
 
+inline std::vector<Tensor> cached_cast(
+    at::ScalarType to_type,
+    const ITensorListRef& arg,
+    DeviceType device_type = DeviceType::CUDA) {
+  std::vector<Tensor> vec;
+  vec.reserve(arg.size());
+  for (const auto& t : arg) {
+    vec.push_back(cached_cast(to_type, t, device_type));
+  }
+  return vec;
+}
+
 // Template to catch non-Tensor args.
 template <typename T>
 inline T cached_cast(
diff --git a/test/test_jit_autocast.py b/test/test_jit_autocast.py
index 5d555e7cd9d8d..93674bb70d820 100644
--- a/test/test_jit_autocast.py
+++ b/test/test_jit_autocast.py
@@ -819,6 +819,32 @@ def test_nhwc_autocast_jit_trace_model(model, x):
                 continue
             test_nhwc_autocast_jit_trace_model(self.models[i], self.inputs[i])
 
+    def test_cat_promote(self):
+        class TestModel(torch.nn.Module):
+            def __init__(self):
+                super(TestModel, self).__init__()
+
+            def forward(self, a, b):
+                return torch.cat([a, b], 0)
+        with torch.jit.fuser("none"):
+            # In this testcase, we will check whether cat has done the promotion in AMP with mixed dtype inputs.
+            # To avoid the fusion group from TE, we will disable the fuser here.
+            for jit_freeze_or_not in [False, True]:
+                test_model = TestModel().eval()
+                with torch.cpu.amp.autocast(cache_enabled=False, dtype=torch.bfloat16), torch.no_grad():
+                    a = torch.rand(24, 128, 128)
+                    b = torch.rand(24, 128, 128, dtype=torch.bfloat16)
+                    c = test_model(a, b)
+                    traced = torch.jit.trace(test_model, (a, b))
+                if jit_freeze_or_not:
+                    traced = torch.jit.freeze(traced)
+                for _ in range(3):
+                    c2 = traced(a, b)
+                self.assertTrue(c.dtype, torch.float32)
+                self.assertTrue(c2.dtype, torch.float32)
+                traced_graph = traced.graph_for(a, b)
+                self.assertTrue(any(n.kind() == "aten::to" for n in traced_graph.nodes()))
+
     def test_script_autocast_cpu(self):
         def fn(x):
             if torch.is_autocast_cpu_enabled():

From dde8afb7b228a9bb91333a4bdd0b367dcc184ea6 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@fb.com>
Date: Fri, 28 Oct 2022 11:26:17 +0000
Subject: [PATCH 0270/1922] Support non-contiguous NestedTensors for
 elementwise ops (#87888)

Enables benchmarking of math path of sdp kernel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87888
Approved by: https://github.com/drisspg
---
 aten/src/ATen/native/native_functions.yaml    |  2 +
 .../ATen/native/nested/NestedTensorMath.cpp   | 52 ++++++++++++++-----
 .../ATen/native/nested/NestedTensorUtils.h    | 22 ++++++--
 .../ATen/native/transformers/attention.cpp    |  2 +-
 benchmarks/transformer/sdp.py                 | 21 ++++++--
 test/test_nestedtensor.py                     | 46 ++++++++++++++--
 6 files changed, 118 insertions(+), 27 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index c481e03d23c9d..2cd92a5ef1900 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1881,6 +1881,7 @@
   dispatch:
     SparseCPU, SparseCUDA: div_sparse
     ZeroTensor: div_zerotensor
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor
   tags: canonical
 
 - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -1928,6 +1929,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: div
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar
   tags: canonical
 
 - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp
index fc9e11ea44914..2d0e8de8b46f0 100644
--- a/aten/src/ATen/native/nested/NestedTensorMath.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp
@@ -500,10 +500,21 @@ get_elementwise_nested_tensor_impl(
       op_name,
       " does not support broadcasting when given a NestedTensor");
   TORCH_CHECK(
-      nested_tensor_impl_is_contiguous(self_ptr) &&
-          nested_tensor_impl_is_contiguous(other_ptr),
+      at::equal(
+          self_ptr->get_nested_stride_tensor(),
+          other_ptr->get_nested_stride_tensor()),
+      op_name,
+      " requires strides to match when given NestedTensors");
+  auto self_offsets = self_ptr->get_storage_offsets();
+  auto other_offsets = other_ptr->get_storage_offsets();
+  bool offsets_match = true;
+  for (size_t i = 0; i < self_offsets.size(); i++) {
+    offsets_match = offsets_match && (self_offsets[i] == other_offsets[i]);
+  }
+  TORCH_CHECK(
+      offsets_match,
       op_name,
-      " does not support non-contiguous NestedTensor inputs");
+      " requires offsets to match when given NestedTensors");
   return std::make_pair(self_ptr, other_ptr);
 }
 
@@ -517,16 +528,20 @@ Tensor NestedTensor_elementwise_Tensor(
   if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
     auto other_impl = get_nested_tensor_impl(other);
     return wrap_buffer(
-      f(self, other_impl->get_buffer()),
-      other_impl->get_nested_size_tensor().clone()
+      f(self, other_impl->get_unsafe_storage_as_tensor()),
+      other_impl->get_nested_size_tensor().clone(),
+      other_impl->get_nested_stride_tensor().clone(),
+      other_impl->get_storage_offsets()
     );
   }
   // other is a scalar
   if (!other.is_nested() && other.dim() == 0 && other.numel() == 1) {
     auto self_impl = get_nested_tensor_impl(self);
     return wrap_buffer(
-      f(self_impl->get_buffer(), other),
-      self_impl->get_nested_size_tensor().clone()
+      f(self_impl->get_unsafe_storage_as_tensor(), other),
+      self_impl->get_nested_size_tensor().clone(),
+      self_impl->get_nested_stride_tensor().clone(),
+      self_impl->get_storage_offsets()
     );
   }
   NestedTensorImpl* self_impl = nullptr;
@@ -535,13 +550,12 @@ Tensor NestedTensor_elementwise_Tensor(
       get_elementwise_nested_tensor_impl(self, other, op_name);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl);
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl);
-  const auto& nt_self = *self_impl;
-  const auto& nt_other = *other_impl;
-  const auto& self_sizes = nt_self.get_nested_size_tensor();
   return wrap_buffer(
-      f(nt_self.get_buffer().reshape({-1}),
-        nt_other.get_buffer().reshape({-1})),
-      self_sizes);
+      f(self_impl->get_unsafe_storage_as_tensor(),
+        other_impl->get_unsafe_storage_as_tensor()),
+      self_impl->get_nested_size_tensor(),
+      self_impl->get_nested_stride_tensor(),
+      self_impl->get_storage_offsets());
 }
 
 Tensor NestedTensor_add_Tensor(
@@ -566,6 +580,18 @@ Tensor NestedTensor_mul_Scalar(const Tensor& self, const Scalar& other) {
   return NestedTensor_mul_Tensor(self, wrapped_scalar_tensor(other));
 }
 
+Tensor NestedTensor_div_Tensor(const Tensor& self, const Tensor& other) {
+  return NestedTensor_elementwise_Tensor(
+      self, other, "div", [](const Tensor& b1, const Tensor& b2) {
+        return at::div(b1, b2);
+      });
+}
+
+// Only usable on the C++ side; scalars are converted to tensors coming from Python.
+Tensor NestedTensor_div_Scalar(const Tensor& self, const Scalar& other) {
+  return NestedTensor_div_Tensor(self, wrapped_scalar_tensor(other));
+}
+
 template <typename Func>
 Tensor& NestedTensor_elementwise__Tensor(
     Tensor& self,
diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.h b/aten/src/ATen/native/nested/NestedTensorUtils.h
index 77d512c519b28..ff8ec37dfc521 100644
--- a/aten/src/ATen/native/nested/NestedTensorUtils.h
+++ b/aten/src/ATen/native/nested/NestedTensorUtils.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <ATen/core/Tensor.h>
 #include <ATen/NestedTensorImpl.h>
+#include <ATen/core/Tensor.h>
 #include <c10/core/DispatchKeySet.h>
 #include <c10/core/TensorImpl.h>
 #include <c10/macros/Macros.h>
@@ -50,6 +50,18 @@ inline at::Tensor wrap_buffer(
       std::move(offsets));
 }
 
+inline at::Tensor wrap_buffer(
+    at::Tensor buffer,
+    at::Tensor nested_size_tensor,
+    at::Tensor nested_stride_tensor,
+    const std::vector<int64_t>& offsets) {
+  std::vector<int64_t> offsets_copy(offsets);
+  return wrap_buffer(buffer,
+                     nested_size_tensor,
+                     nested_stride_tensor,
+                     std::move(offsets_copy));
+}
+
 inline at::Tensor get_buffer(const at::Tensor& tensor) {
   return get_nested_tensor_impl(tensor)->get_buffer();
 }
@@ -119,7 +131,6 @@ inline std::vector<IntArrayRef> NestedTensor_get_sizes(
   return sizes;
 }
 
-
 TORCH_API std::vector<int64_t> NestedTensor_get_max_size(
     const NestedTensorImpl& nt);
 
@@ -161,17 +172,18 @@ inline std::vector<IntArrayRef> NestedTensor_get_strides(
 inline void check_numel_equals_buffer_size(const at::Tensor& self) {
   auto self_impl = get_nested_tensor_impl(self);
   TORCH_CHECK(
-      self.numel() == self_impl -> get_buffer_size(),
+      self.numel() == self_impl->get_buffer_size(),
       "Number of elements in nested tensor must match number of elements in buffer.");
 }
 
 inline void check_numel_equals_buffer_size(const NestedTensorImpl* self_ptr) {
   TORCH_CHECK(
-      self_ptr-> numel() == self_ptr -> get_buffer_size(),
+      self_ptr->numel() == self_ptr->get_buffer_size(),
       "Number of elements in nested tensor must match number of elements in buffer.");
 }
 //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-// Data structures and functions for generically applying a function on a nested tensor.
+// Data structures and functions for generically applying a function on a nested
+// tensor.
 namespace impl {
 
 template <typename T>
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index c03935ecfbf3d..5b3d5f999cfb1 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -703,7 +703,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
     auto attn_mask = attn_mask_;
     // Naive, composite implementation defined here.
     const auto embed_size = query_.size(-1);
-    const auto query = query_ * (1. / ::sqrt(static_cast<double>(embed_size)));
+    const auto query = query_ / ::sqrt(static_cast<double>(embed_size));
     if (is_causal) {
         TORCH_CHECK(!attn_mask.has_value(),
                 "_scaled_dot_product_attention: Explicit attn_mask should not be set when is_causal=True");
diff --git a/benchmarks/transformer/sdp.py b/benchmarks/transformer/sdp.py
index 50db76e9f8c21..fbd123fc39b31 100644
--- a/benchmarks/transformer/sdp.py
+++ b/benchmarks/transformer/sdp.py
@@ -7,6 +7,8 @@
 
 import warnings
 warnings.filterwarnings("ignore")
+
+
 class CompositeMHA(torch.nn.Module):
     def __init__(self, num_heads, in_proj_weight, in_proj_bias, out_proj):
         super().__init__()
@@ -90,8 +92,8 @@ def benchmark_torch_function(iters, f, *args, **kwargs):
     return (start_event.elapsed_time(end_event) * 1.0e-3) / iters
 
 
-def run_timing(iters, batch_size, embed_dimension, num_heads, max_sequence_len, pad_percentage, writer):
-    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True):
+def run_timing(iters, batch_size, embed_dimension, num_heads, max_sequence_len, pad_percentage, enable_math, enable_flash, writer):
+    with torch.backends.cuda.sdp_kernel(enable_math=enable_math, enable_flash=enable_flash):
         with torch.inference_mode():
             dropout_p = 0.0
             mask = None
@@ -122,6 +124,8 @@ def run_timing(iters, batch_size, embed_dimension, num_heads, max_sequence_len,
             results["cp_time"] = cp_time
             results["speedup"] = pt_time / cp_time
             results["dtype"] = str(x.dtype)
+            results["enable_math"] = str(enable_math)
+            results["enable_flash"] = str(enable_flash)
             writer.writerow(results)
 
 
@@ -131,15 +135,22 @@ def main():
     np.random.seed(seed)
     torch.manual_seed(seed)
 
-    headers = ["max_sequence_len", "num_heads", "embed_dimension", "pt_time", "cp_time", "speedup", "dtype"]
+    headers = ["max_sequence_len", "num_heads", "embed_dimension", "pt_time",
+               "cp_time", "speedup", "dtype", "enable_math", "enable_flash"]
     writer = csv.DictWriter(sys.stdout, headers)
     writer.writeheader()
 
     batch_size = 64
     pad_percentage = 0.5
 
-    for num_heads, max_seq_len in itertools.product([2, 4, 8, 16, 32], [64, 128, 256]):
-        run_timing(iters, batch_size, 1024, num_heads, max_seq_len, pad_percentage, writer)
+    for (enable_math, enable_flash) in [(False, True), (True, False), (True, True)]:
+        for num_heads, max_seq_len in itertools.product([2, 4, 8, 16, 32], [64, 128, 256]):
+            run_timing(iters, batch_size, 1024, num_heads, max_seq_len,
+                       pad_percentage, enable_math, enable_flash, writer)
+            run_timing(iters, batch_size, 1024, num_heads, max_seq_len,
+                       pad_percentage, enable_math, enable_flash, writer)
+            run_timing(iters, batch_size, 1024, num_heads, max_seq_len,
+                       pad_percentage, enable_math, enable_flash, writer)
 
 
 if __name__ == "__main__":
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index f51db59958696..f5e9aa1b8d703 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -24,6 +24,8 @@
 
 # Tests are ported from pytorch/nestedtensor.
 # This makes porting as_nested_tensor easier in the future.
+
+
 def _iter_constructors():
     # yield as_nested_tensor
     yield torch.nested.nested_tensor
@@ -33,6 +35,8 @@ def _iter_constructors():
 # an output nested tensor consists of
 # * `len(ragged_sizes)` matrices
 # * matrices[i].shape == (20, ragged_sizes[i])
+
+
 def random_nt_noncontiguous_pair(ragged_sizes, device="cpu", dtype=torch.float16):
     xs = []
     for size in ragged_sizes:
@@ -49,6 +53,8 @@ def random_nt_noncontiguous_pair(ragged_sizes, device="cpu", dtype=torch.float16
 
 # Helper functions to pad a noncontiguous nested tensor
 # can be replaced once to_padded_tensor supports noncontiguous memory
+
+
 def noncontiguous_to_padded_tensor(input, shape=None):
     tensors = input.unbind()
     ntensors = len(tensors)
@@ -72,6 +78,8 @@ def noncontiguous_to_padded_tensor(input, shape=None):
     return result
 
 # Helper function to generate a random nested tensor
+
+
 def random_nt(device, dtype, num_tensors, max_dims, min_dims=None):
     if min_dims is None:
         min_dims = tuple([0] * len(max_dims))
@@ -83,6 +91,7 @@ def random_nt(device, dtype, num_tensors, max_dims, min_dims=None):
         ts1.append(t1)
     return torch.nested.nested_tensor(ts1, device=device, dtype=dtype)
 
+
 class TestNestedTensor(TestCase):
 
     @torch.inference_mode()
@@ -478,7 +487,6 @@ def test_detach(self, device, dtype):
         self.assertEqual(a.grad, torch.ones(2, 4, device=device, dtype=dtype))
         self.assertEqual(b.grad, torch.ones(5, 4, device=device, dtype=dtype))
 
-
     @dtypes(torch.float, torch.float16, torch.double)
     def test_unbind_noncontiguous(self, device, dtype):
         nt_contiguous, nt_noncontiguous = random_nt_noncontiguous_pair((2, 3, 6, 7), device, dtype)
@@ -857,6 +865,39 @@ def test_nested_tensor_mul(self, device, dtype):
             lambda: vector.mul(nt1)
         )
 
+    @dtypes(torch.float, torch.float16)
+    @skipMeta
+    @torch.inference_mode()
+    def test_nested_tensor_div(self, device, dtype):
+        nt, nt2 = self.random_nt_pair(device, dtype, 4, (4, 4))
+        scale = 4.0
+        ref = torch.nested.nested_tensor([t / scale for t in nt.unbind()])
+        out = nt / 4.0
+        self.assertEqual(ref, out)
+        ref_transposed = ref.transpose(1, 2)
+        out = nt.transpose(1, 2) / 4.0
+        self.assertEqual(ref_transposed, out)
+
+        ref = torch.nested.nested_tensor([t / t2 for (t, t2) in zip(nt.unbind(), nt2.unbind())])
+        out = nt / nt2
+        self.assertEqual(ref, out)
+
+        out = nt.transpose(1, 2) / nt2.transpose(1, 2)
+        self.assertEqual(ref.transpose(1, 2), out)
+
+        nt_transpose_copy = torch.nested.nested_tensor([t.transpose(0, 1) for t in nt.unbind()])
+
+        self.assertRaisesRegex(
+            RuntimeError, "div requires strides to match when given NestedTensors",
+            lambda: nt_transpose_copy.transpose(1, 2) / nt2)
+
+        nt = torch.nested.nested_tensor([torch.randn(i, 4) for i in [3, 4, 5]], device=device, dtype=dtype)
+        nt_chunks = nt.chunk(2, -1)
+        self.assertRaisesRegex(
+            RuntimeError, "div requires offsets to match when given NestedTensors",
+            lambda: nt_chunks[0] / nt_chunks[1])
+
+
     @dtypes(torch.float, torch.float16)
     @skipMeta
     @torch.inference_mode()
@@ -1732,7 +1773,6 @@ def _create_nested_tensor_from_list(self, requires_grad=False):
         return torch.nested.as_nested_tensor([torch.randn(1, 2, requires_grad=requires_grad),
                                               torch.randn(7, 8, requires_grad=requires_grad)])
 
-
     def _create_nested_tensor_from_mask(self, requires_grad=False):
         data = torch.randn(2, 3, 4, requires_grad=requires_grad)
         mask = torch.ones_like(data[:, :, 0]).bool()
@@ -1772,7 +1812,6 @@ def test_nested_tensor_generates_leaf(self):
         self.assertEqual(a.grad, None)
         self.assertEqual(b.grad, None)
 
-
     def test_set_requires_grad_from_list(self):
         nt = self._create_nested_tensor_from_list()
         nt.requires_grad_()
@@ -2139,6 +2178,7 @@ def test_indexing_backward(self):
         expected_grad = torch.nested.nested_tensor([grad_x0, torch.zeros((3, 4))])
         self.assertEqual(nt.grad, expected_grad)
 
+
 instantiate_device_type_tests(TestNestedTensorDeviceType, globals())
 
 if __name__ == '__main__':

From f78ce2882c9fd384b3107d9a504fb9915801e890 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Fri, 28 Oct 2022 02:04:36 +0000
Subject: [PATCH 0271/1922] Add state to distributed composable API (#87838)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87838
Approved by: https://github.com/yhcharles
---
 test/distributed/_composable/test_contract.py |  24 +++
 torch/distributed/_composable/__init__.py     | 103 +------------
 torch/distributed/_composable/contract.py     | 145 ++++++++++++++++++
 3 files changed, 170 insertions(+), 102 deletions(-)
 create mode 100644 torch/distributed/_composable/contract.py

diff --git a/test/distributed/_composable/test_contract.py b/test/distributed/_composable/test_contract.py
index 206f9196b7b3b..d510af6d7b2b0 100644
--- a/test/distributed/_composable/test_contract.py
+++ b/test/distributed/_composable/test_contract.py
@@ -93,6 +93,30 @@ def wrap_module(module: nn.Module) -> nn.Module:
         with self.assertRaisesRegex(RuntimeError, "cannot modify FQNs"):
             wrap_module(model.seq1)
 
+    @skipIfTorchDynamo("Dynamo does not yet capture module hooks")
+    def test_state(self):
+        def check_and_update_state_hook(
+            module: nn.Module, inp: Tuple[torch.Tensor]
+        ) -> Tuple[torch.Tensor]:
+            self.assertEqual(api.state(module).dummy_state, 7)
+            api.state(module).dummy_state = 8
+            return inp
+
+        # FIXME: circular reference looks a bit weird. Shall we make .state a
+        # top-level API instead attached to contract API?
+        @contract
+        def api(module: nn.Module) -> nn.Module:
+            api.state(module).dummy_state = 7
+            module.register_forward_pre_hook(check_and_update_state_hook)
+            return module
+
+        model = ToyModel()
+        api(model.seq1)
+
+        self.assertEqual(api.state(model.seq1).dummy_state, 7)
+        model(torch.zeros(10, 10), torch.zeros(10, 10))
+        self.assertEqual(api.state(model.seq1).dummy_state, 8)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/_composable/__init__.py b/torch/distributed/_composable/__init__.py
index 90533a13cdf56..fb0c81f6c5668 100644
--- a/torch/distributed/_composable/__init__.py
+++ b/torch/distributed/_composable/__init__.py
@@ -1,102 +1 @@
-import torch
-import torch.nn as nn
-
-from collections import OrderedDict
-from typing import List, Optional
-
-
-def contract(func):
-    r"""
-    Decorate a function as a composable distributed API, where the first
-    argument of the function must be an :class:`nn.Module` instance. The
-    decorator verifies that the wrapped function does not modify parameter,
-    buffer or sub-module fully-qualified names (FQN).
-
-    Example::
-        >>> import torch.nn as nn
-        >>>
-        >>> class MyModel(nn.Module):
-        >>>     def __init__(self):
-        >>>         super().__init__()
-        >>>         self.l1 = nn.Linear(10, 10)
-        >>>         self.l2 = nn.Linear(10, 10)
-        >>>
-        >>>     def forward(self, x):
-        >>>         return self.l2(self.l1(x))
-        >>>
-        >>> @contract
-        >>> def my_noop_feature(module: nn.Module) -> nn.Module:
-        >>>     return module
-        >>>
-        >>> model = MyModel()
-        >>> my_noop_feature(model.l1)
-        >>> my_noop_feature(model.l2)
-        >>> model(torch.randn(2, 10)).sum().backward()
-    """
-
-    def wrapper(module: nn.Module, *args, **kwargs) -> Optional[nn.Module]:
-        orig_named_params = OrderedDict(module.named_parameters())
-        orig_named_buffers = OrderedDict(
-            module.named_buffers(remove_duplicate=False)
-        )
-        orig_named_modules = OrderedDict(
-            module.named_modules(remove_duplicate=False)
-        )
-
-        updated = func(module, *args, **kwargs)
-
-        if updated is None:
-            updated = module
-
-        new_named_params = OrderedDict(updated.named_parameters())
-        new_named_buffers = OrderedDict(
-            updated.named_buffers(remove_duplicate=False)
-        )
-        new_named_modules = OrderedDict(
-            updated.named_modules(remove_duplicate=False)
-        )
-
-        assert isinstance(updated, nn.Module), (
-            "Output of composable distributed APIs must be either None or "
-            f"nn.Module, but got {type(updated)}"
-        )
-
-        def check_fqn(orig_fqns: List[str], new_fqns: List[str]):
-            if orig_fqns == new_fqns:
-                return
-
-            orig_fqn_set, new_fqn_set = set(orig_fqns), set(new_fqns)
-            orig_only = orig_fqn_set - new_fqn_set
-            new_only = new_fqn_set - orig_fqn_set
-            if len(orig_only) or len(new_only):
-                raise RuntimeError(
-                    "Composable distributed API implementations cannot modify "
-                    "FQNs.\n"
-                    f"Only in original FQNs: {orig_only},\n"
-                    f"Only in new FQNs: {new_only}"
-                )
-            else:
-                raise RuntimeError(
-                    "Composable distributed API implementations cannot modify "
-                    "the order of FQNs.\n"
-                    f"Original FQNs: {orig_only}\n"
-                    f"New FQNs: {new_only}"
-                )
-
-        check_fqn(list(orig_named_params.keys()), list(new_named_params.keys()))
-        check_fqn(
-            list(orig_named_buffers.keys()), list(new_named_buffers.keys())
-        )
-        check_fqn(
-            list(orig_named_modules.keys()), list(new_named_modules.keys())
-        )
-
-        # TODO: a stricter verification should also reject changing module
-        # types and monkey-patching forward() method implementations.
-
-        # TODO: verify that installed distributed paradigms are compatible with
-        # each other.
-
-        return updated
-
-    return wrapper
+from .contract import contract
diff --git a/torch/distributed/_composable/contract.py b/torch/distributed/_composable/contract.py
new file mode 100644
index 0000000000000..b75604872a592
--- /dev/null
+++ b/torch/distributed/_composable/contract.py
@@ -0,0 +1,145 @@
+import torch.nn as nn
+
+from collections import OrderedDict
+from typing import Any, Callable, Dict, List, Optional
+
+
+# use state_slot as key for module.__dict__ to avoid coliding with other
+# properties.
+# TODO: since all composable distributed features can share the same slot.
+class _StateKey:
+
+    # implement operator < to avoid breaking dir()
+    def __lt__(self, other: Any) -> bool:
+        return True if isinstance(other, str) else id(self) < id(other)
+
+
+class _State:
+    pass
+
+
+STATE_KEY = _StateKey()
+
+
+def contract(func):
+    r"""
+    Decorate a function as a composable distributed API, where the first
+    argument of the function must be an :class:`nn.Module` instance. The
+    decorator verifies that the wrapped function does not modify parameter,
+    buffer or sub-module fully-qualified names (FQN).
+
+    When a function ``func`` is decorated by ``@contract``, a
+    ``.state(module: nn.Module)`` method will be installed to the decorated
+    function. Then you can retrieve and modify the state on a module by calling
+    ``func.state(module)``.
+
+    Example::
+        >>> import torch.nn as nn
+        >>>
+        >>> class MyModel(nn.Module):
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>>         self.l1 = nn.Linear(10, 10)
+        >>>         self.l2 = nn.Linear(10, 10)
+        >>>
+        >>>     def forward(self, x):
+        >>>         return self.l2(self.l1(x))
+        >>>
+        >>> @contract
+        >>> def my_feature(module: nn.Module) -> nn.Module:
+        >>>     my_feature.state(module).some_state = "any value"
+        >>>     return module
+        >>>
+        >>> model = MyModel()
+        >>> my_feature(model.l1)
+        >>> assert my_feature.state(model.l1).some_state == "any value"
+        >>> my_feature(model.l2)
+        >>> model(torch.randn(2, 10)).sum().backward()
+    """
+
+    def wrapper(module: nn.Module, *args, **kwargs) -> Optional[nn.Module]:
+        # install states specific to the wrapped ``func``
+        default_all_state: Dict[Callable, _State] = {}
+        all_state: Dict[Callable, _State] = module.__dict__.setdefault(  # type: ignore[call-overload]
+            STATE_KEY, default_all_state
+        )
+        assert isinstance(
+            all_state, dict
+        ), "Distributed composable API states corrupted"
+        assert func not in all_state, (
+            "Each distinct composable distributed API can only be applied to a "
+            f"module once. {func} has already been applied to the following "
+            f"module.\n{module}"
+        )
+        all_state.setdefault(func, _State())
+
+        orig_named_params = OrderedDict(module.named_parameters())
+        orig_named_buffers = OrderedDict(
+            module.named_buffers(remove_duplicate=False)
+        )
+        orig_named_modules = OrderedDict(
+            module.named_modules(remove_duplicate=False)
+        )
+
+        updated = func(module, *args, **kwargs)
+
+        if updated is None:
+            updated = module
+
+        new_named_params = OrderedDict(updated.named_parameters())
+        new_named_buffers = OrderedDict(
+            updated.named_buffers(remove_duplicate=False)
+        )
+        new_named_modules = OrderedDict(
+            updated.named_modules(remove_duplicate=False)
+        )
+
+        assert isinstance(updated, nn.Module), (
+            "Output of composable distributed APIs must be either None or "
+            f"nn.Module, but got {type(updated)}"
+        )
+
+        def check_fqn(orig_fqns: List[str], new_fqns: List[str]):
+            if orig_fqns == new_fqns:
+                return
+
+            orig_fqn_set, new_fqn_set = set(orig_fqns), set(new_fqns)
+            orig_only = orig_fqn_set - new_fqn_set
+            new_only = new_fqn_set - orig_fqn_set
+            if len(orig_only) or len(new_only):
+                raise RuntimeError(
+                    "Composable distributed API implementations cannot modify "
+                    "FQNs.\n"
+                    f"Only in original FQNs: {orig_only},\n"
+                    f"Only in new FQNs: {new_only}"
+                )
+            else:
+                raise RuntimeError(
+                    "Composable distributed API implementations cannot modify "
+                    "the order of FQNs.\n"
+                    f"Original FQNs: {orig_only}\n"
+                    f"New FQNs: {new_only}"
+                )
+
+        check_fqn(list(orig_named_params.keys()), list(new_named_params.keys()))
+        check_fqn(
+            list(orig_named_buffers.keys()), list(new_named_buffers.keys())
+        )
+        check_fqn(
+            list(orig_named_modules.keys()), list(new_named_modules.keys())
+        )
+
+        # TODO: a stricter verification should also reject changing module
+        # types and monkey-patching forward() method implementations.
+
+        # TODO: verify that installed distributed paradigms are compatible with
+        # each other.
+
+        return updated
+
+    def get_state(module: nn.Module) -> Optional[_State]:
+        return module.__dict__.get(STATE_KEY).get(func)  # type: ignore[call-overload]
+
+    wrapper.state = get_state  # type: ignore[attr-defined]
+
+    return wrapper

From 45811802ebfd20ae2f3ebddbf35f45b12e27fcc3 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Fri, 28 Oct 2022 13:40:11 +0000
Subject: [PATCH 0272/1922] Reland 2 Many symintifications (#87604) (#87980)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87980
Approved by: https://github.com/ezyang
---
 aten/src/ATen/ExpandUtils.h                   |  5 +-
 aten/src/ATen/core/TensorBase.h               | 10 ++++
 .../ATen/functorch/BatchRulesConvolution.cpp  | 56 +++++++++----------
 .../functorch/BatchRulesDecompositions.cpp    |  2 +-
 .../ATen/native/AdaptiveAveragePooling.cpp    |  6 +-
 aten/src/ATen/native/Convolution.cpp          | 20 +++----
 aten/src/ATen/native/EmbeddingBag.cpp         |  2 +-
 aten/src/ATen/native/GridSamplerUtils.h       |  2 +-
 aten/src/ATen/native/IndexingUtils.cpp        | 12 ++--
 aten/src/ATen/native/LossNLL.cpp              |  2 +-
 aten/src/ATen/native/LossNLL2d.cpp            |  2 +-
 aten/src/ATen/native/NonSymbolicBC.h          |  1 +
 aten/src/ATen/native/Pool.h                   | 18 ++++--
 aten/src/ATen/native/TensorProperties.cpp     |  2 +-
 aten/src/ATen/native/TensorShape.cpp          | 28 +++++++---
 aten/src/ATen/native/group_norm.cpp           | 24 ++++----
 aten/src/ATen/native/native_functions.yaml    | 54 +++++++++---------
 test/functorch/test_aotdispatch.py            |  3 -
 test/test_proxy_tensor.py                     |  4 --
 tools/autograd/derivatives.yaml               | 44 +++++++--------
 tools/jit/gen_unboxing.py                     |  4 +-
 torch/csrc/StorageMethods.cpp                 |  2 +-
 torch/csrc/autograd/FunctionsManual.cpp       | 16 +++---
 torch/csrc/autograd/FunctionsManual.h         |  8 +--
 torch/storage.py                              |  4 +-
 torchgen/api/cpp.py                           |  8 ++-
 torchgen/api/native.py                        |  2 +-
 torchgen/api/python.py                        |  6 +-
 torchgen/gen.py                               |  4 +-
 29 files changed, 197 insertions(+), 154 deletions(-)

diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h
index 779894645b8ec..786cbf132cd77 100644
--- a/aten/src/ATen/ExpandUtils.h
+++ b/aten/src/ATen/ExpandUtils.h
@@ -94,10 +94,11 @@ inline void check_defined(
 inline c10::MaybeOwned<Tensor> expand_inplace(
     const Tensor& tensor,
     const Tensor& to_expand) {
-  if (tensor.sizes().equals(to_expand.sizes())) {
+  if (tensor.sym_sizes().equals(to_expand.sym_sizes())) {
     return c10::MaybeOwned<Tensor>::borrowed(to_expand);
   }
-  return c10::MaybeOwned<Tensor>::owned(to_expand.expand(tensor.sizes()));
+  return c10::MaybeOwned<Tensor>::owned(
+      to_expand.expand_symint(tensor.sym_sizes()));
 }
 
 inline c10::MaybeOwned<Tensor> expand_inplace(
diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h
index 08a14f2e09580..0ecd4456033b0 100644
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@@ -955,11 +955,21 @@ c10::SymIntArrayRef sizes(const TensorBase& t) { return t.sym_sizes(); }
 template <typename T, typename = enable_if_int<T>>
 IntArrayRef sizes(const TensorBase& t) { return t.sizes(); }
 
+template <typename T, typename = enable_if_symint<T>>
+c10::SymInt size(const TensorBase& t, int64_t dim) { return t.sym_size(dim); }
+template <typename T, typename = enable_if_int<T>>
+int64_t size(const TensorBase& t, int64_t dim) { return t.size(dim); }
+
 template <typename T, typename = enable_if_symint<T>>
 c10::SymIntArrayRef strides(const TensorBase& t) { return t.sym_strides(); }
 template <typename T, typename = enable_if_int<T>>
 IntArrayRef strides(const TensorBase& t) { return t.strides(); }
 
+template <typename T, typename = enable_if_symint<T>>
+c10::SymInt numel(const TensorBase& t) { return t.sym_numel(); }
+template <typename T, typename = enable_if_int<T>>
+int64_t numel(const TensorBase& t) { return t.numel(); }
+
 } // namespace symint
 
 } // namespace at
diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp
index 0640af3a1b533..79523ed1fb6d9 100644
--- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp
+++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp
@@ -17,7 +17,7 @@ namespace at { namespace functorch {
 // we do not support batch_group_count (which is needed for convolution backwards).
 // Instead, there's a convolution_backward op that needs a batching rule.
 std::tuple<Tensor,optional<int64_t>>
-convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tensor& rhs, optional<int64_t> rhs_bdim, const optional<Tensor>& bias, optional<int64_t> bias_bdim, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed, IntArrayRef output_padding, int64_t groups) {
+convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tensor& rhs, optional<int64_t> rhs_bdim, const optional<Tensor>& bias, optional<int64_t> bias_bdim, IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, int64_t groups) {
   DimVector lhs_spec(stride.size() + 2);
   std::iota(lhs_spec.begin(), lhs_spec.end(), 0);
   DimVector rhs_spec = lhs_spec;
@@ -42,13 +42,13 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
   std::tuple<Tensor, optional<int64_t>> result;
   if (lhs_bdim && !rhs_bdim) {
     auto new_x = reshape_dim_into(*lhs_bdim, lhs_spec[0], lhs);
-    auto out = at::convolution(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+    auto out = at::convolution_symint(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
     out = reshape_dim_outof(out_spec[0], lhs.sizes()[*lhs_bdim], out);
     result = std::make_tuple(out, out_spec[0]);
   } else if (!lhs_bdim && rhs_bdim) {
     if (groups == 1) {
       auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[0], rhs);
-      auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+      auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
       out = reshape_dim_outof(out_spec[1], rhs.size(*rhs_bdim), out);
       result = std::make_tuple(out, out_spec[1]);
     } else {
@@ -62,7 +62,7 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
         // BIOHW -> I(BO)HW
         auto new_w = reshape_dim_into(*rhs_bdim, 1, rhs);
         // NIHW, I(BO)HW -> N(GBO)HW
-        auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+        auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
         // N(GBO)HW -> NG(BO)HW
         out = reshape_dim_outof(1, groups, out);
         // NG(BO)HW -> NGBOHW
@@ -84,7 +84,7 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
         // G(BO)IHW -> (GBO)IHW
         new_w = reshape_dim_into(0, 0, new_w);
         // N(GI)HW, (GBO)IHW -> N(GBO)HW
-        auto out = at::convolution(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+        auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
         // N(GBO)HW -> NG(BO)HW
         out = reshape_dim_outof(1, groups, out);
         // NG(BO)HW -> NGBOHW
@@ -99,11 +99,11 @@ convolution_batch_rule(const Tensor& lhs, optional<int64_t> lhs_bdim, const Tens
     groups *= lhs.sizes()[*lhs_bdim];
     auto dim_with_groups = transposed ? 1 : 0;
     auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[dim_with_groups], rhs);
-    auto out = at::convolution(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
+    auto out = at::convolution_symint(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups);
     out = reshape_dim_outof(out_spec[1], lhs.sizes()[*lhs_bdim], out);
     result = std::make_tuple(out, out_spec[1]);
   } else {
-    result = std::make_tuple(at::convolution(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt);
+    result = std::make_tuple(at::convolution_symint(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt);
   }
   if (separate_bias) {
     auto A = std::get<0>(result);
@@ -244,8 +244,8 @@ convolution_backward_input_batch_rule(
     const Tensor& grad_output, optional<int64_t> grad_output_bdim,
     const Tensor& input, optional<int64_t> input_bdim,
     const Tensor& weight, optional<int64_t> weight_bdim,
-    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed,
-    IntArrayRef output_padding, int64_t groups) {
+    IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
+    c10::SymIntArrayRef output_padding, int64_t groups) {
   const std::array<bool, 3> mask = {true, false, false};
   if (grad_output_bdim && weight_bdim) {
     // regular: BNO, BOI -> N(BO), (BO)I -> N(BI)
@@ -254,7 +254,7 @@ convolution_backward_input_batch_rule(
     const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
     const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight);
     auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output_, dummy_input, weight_, nullopt, stride, padding,
         dilation, transposed, output_padding, groups * batch_size, mask);
     const auto grad_input = reshape_dim_outof(1, batch_size, std::get<0>(result));
@@ -265,7 +265,7 @@ convolution_backward_input_batch_rule(
     const auto batch_size = grad_output.size(*grad_output_bdim);
     const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 0, grad_output);
     auto dummy_input = make_dummy(input, input_bdim, 0, batch_size);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output_, dummy_input, weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
     const auto grad_input = reshape_dim_outof(0, batch_size, std::get<0>(result));
@@ -278,7 +278,7 @@ convolution_backward_input_batch_rule(
       const auto in_ch_dim = transposed ? 0 : 1;
       const auto weight_ = reshape_dim_into(*weight_bdim, in_ch_dim, weight);
       auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output, dummy_input, weight_, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       const auto grad_input = reshape_dim_outof(1, batch_size, std::get<0>(result));
@@ -289,7 +289,7 @@ convolution_backward_input_batch_rule(
       // N(GO), B(GO)I -> N(GO), (GO)(BI) -> N(GBI)
       const auto weight_ = reshape_dim_into(*weight_bdim, 1, weight);
       auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output, dummy_input, weight_, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       grad_input = std::get<0>(result); // N(GBI)
@@ -300,7 +300,7 @@ convolution_backward_input_batch_rule(
       weight_ = weight_.transpose(0, 1);                       // GBIO
       weight_ = weight_.flatten(0, 2);                         // (GBI)O
       const auto dummy_input = make_dummy(input, input_bdim, 1, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output, dummy_input, weight_, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       grad_input = std::get<0>(result); // N(GBI)
@@ -314,7 +314,7 @@ convolution_backward_input_batch_rule(
   } else {
     TORCH_INTERNAL_ASSERT(input_bdim);
     const auto dummy_input = make_dummy(input, input_bdim, 0, 1);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output, dummy_input, weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
     return std::make_tuple(std::get<0>(result), nullopt);
@@ -325,8 +325,8 @@ convolution_backward_weight_batch_rule(
     const Tensor& grad_output, optional<int64_t> grad_output_bdim,
     const Tensor& input, optional<int64_t> input_bdim,
     const Tensor& weight, optional<int64_t> weight_bdim,
-    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed,
-    IntArrayRef output_padding, int64_t groups) {
+    IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
+    c10::SymIntArrayRef output_padding, int64_t groups) {
   const std::array<bool, 3> mask = {false, true, false};
   if (grad_output_bdim && input_bdim) {
     // BNO, BNI -> N(BO), N(BI) -> (BO)I (regular) (BI)O (transposed)
@@ -334,7 +334,7 @@ convolution_backward_weight_batch_rule(
     const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
     const auto input_ = reshape_dim_into(*input_bdim, 1, input);
     const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output_, input_, dummy_weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups * batch_size, mask);
     auto grad_weight = std::get<1>(result);
@@ -348,7 +348,7 @@ convolution_backward_weight_batch_rule(
       const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
       const auto out_ch_dim = transposed ? 1 : 0;
       const auto dummy_weight = make_dummy(weight, weight_bdim, out_ch_dim, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output_, input, dummy_weight, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       auto grad_weight = std::get<1>(result);
@@ -362,7 +362,7 @@ convolution_backward_weight_batch_rule(
       if (!transposed) {
         // BN(GO), N(GI) -> N(GBO), N(GI) -> (GBO)I
         const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-        const auto result = at::convolution_backward(
+        const auto result = at::convolution_backward_symint(
             grad_output_, input, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -373,7 +373,7 @@ convolution_backward_weight_batch_rule(
       } else {
         // BN(GO), N(GI) -> N(GBO), N(GI) -> (GI)(BO)
         const auto dummy_weight = make_dummy(weight, weight_bdim, 1, batch_size);
-        const auto result = at::convolution_backward(
+        const auto result = at::convolution_backward_symint(
             grad_output_, input, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -389,7 +389,7 @@ convolution_backward_weight_batch_rule(
       const auto input_ = reshape_dim_into(*input_bdim, 1, input);
       const auto in_ch_dim = transposed ? 0 : 1;
       const auto dummy_weight = make_dummy(weight, weight_bdim, in_ch_dim, batch_size);
-      const auto result = at::convolution_backward(
+      const auto result = at::convolution_backward_symint(
           grad_output, input_, dummy_weight, nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
       auto grad_weight = std::get<1>(result);
@@ -403,7 +403,7 @@ convolution_backward_weight_batch_rule(
       if (!transposed) {
         // regular: N(GO), BN(GI) -> N(GO), N(GBI) -> (GO)(BI)
         const auto dummy_weight = make_dummy(weight, weight_bdim, 1, batch_size);
-        const auto result = at::convolution_backward(
+        const auto result = at::convolution_backward_symint(
             grad_output, input_, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -412,7 +412,7 @@ convolution_backward_weight_batch_rule(
       } else {
         // transposed: N(GO), BN(GI) -> N(GO), N(GBI) -> (GBI)O
         const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-        const auto result = at::convolution_backward(
+        const auto result = at::convolution_backward_symint(
             grad_output, input_, dummy_weight, nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
         auto grad_weight = std::get<1>(result);
@@ -425,7 +425,7 @@ convolution_backward_weight_batch_rule(
   } else {
     TORCH_INTERNAL_ASSERT(weight_bdim);
     const auto dummy_weight = make_dummy(weight, weight_bdim, 0, 1);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output, input, dummy_weight, nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
     return std::make_tuple(std::get<1>(result), nullopt);
@@ -436,8 +436,8 @@ convolution_backward_weight_batch_rule(
 std::tuple<Tensor,Tensor,Tensor> convolution_backward_plumbing(
     const Tensor& grad_output_, const Tensor& input_, const Tensor& weight_,
     const c10::OptionalArrayRef<SymInt> bias_sizes_opt,
-    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool transposed,
-    IntArrayRef output_padding, int64_t groups, std::array<bool, 3> output_mask) {
+    IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
+    c10::SymIntArrayRef output_padding, int64_t groups, std::array<bool, 3> output_mask) {
   const auto maybe_layer = maybeCurrentDynamicLayer();
   TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
   int64_t cur_level = maybe_layer->layerId();
@@ -487,7 +487,7 @@ std::tuple<Tensor,Tensor,Tensor> convolution_backward_plumbing(
     const auto batch_size = weight.size(*weight_bdim);
     input = reshape_dim_into(*input_bdim, 1, input);
     weight = reshape_dim_into(*weight_bdim, 0, weight);
-    const auto result = at::convolution_backward(
+    const auto result = at::convolution_backward_symint(
         grad_output, input, weight, nullopt, stride, padding, dilation,
         transposed, output_padding, batch_size * groups, output_mask);
     // N(BI), (BO)I -> NBI, BOI
diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index f1108bac25a0a..24a1c4ab507a0 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -242,7 +242,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE2(where, ScalarSelf);
   OP_DECOMPOSE(orgqr);
   OP_DECOMPOSE2(unflatten, int);
-  OP_DECOMPOSE(_convolution_double_backward);
+  m.impl("_convolution_double_backward", native::_convolution_double_backward);
   OP_DECOMPOSE(conv_transpose1d);
   OP_DECOMPOSE2(conv_transpose2d, input);
   OP_DECOMPOSE2(conv_transpose3d, input);
diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp
index 40b05d74053ca..b612ef009b651 100644
--- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp
+++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp
@@ -130,9 +130,9 @@ namespace {
       Tensor out = input.mean({-1, -2}, /* keepdim = */ true);
       if (input.suggest_memory_format() == at::MemoryFormat::ChannelsLast) {
         // assert ndim == 4, since ndim = 3 doesn't give channels_last
-        const int n = input.size(0);
-        const int c = input.size(1);
-        out.as_strided_({n, c, 1, 1}, {c, 1, c, c});
+        const auto n = input.sym_size(0);
+        const auto c = input.sym_size(1);
+        out.as_strided__symint({n, c, 1, 1}, {c, 1, c, c});
       }
       return out;
     } else {
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 36ea8ee1870d7..2dd7d515c14f9 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -910,8 +910,8 @@ static Tensor convolution_same(
   auto k = weight.dim();
   TORCH_CHECK(k > 2, "weight should have at least three dimensions");
   auto dim = static_cast<size_t>(k - 2);
-  auto weight_sizes = weight.sizes();
-  auto input_sizes = input.sizes();
+  auto weight_sizes = weight.sym_sizes();
+  auto input_sizes = input.sym_sizes();
   TORCH_CHECK(k == input.dim(),
               "Expected ", k, "-dimensional input for ",
               k, "-dimensional weight", weight_sizes, ", but got ",
@@ -926,7 +926,7 @@ static Tensor convolution_same(
   }
 
   // Calculate the correct padding
-  DimVector padding_l, padding_r;
+  SymDimVector padding_l, padding_r;
   bool symmetric_padding = true;
   for (auto i: c10::irange(dim)) {
     auto s = stride.size() == 1 ? stride[0] : stride[i];
@@ -942,14 +942,14 @@ static Tensor convolution_same(
 
   if (symmetric_padding) {
     // All backends handle symmetric padding natively
-    DimVector output_padding(static_cast<size_t>(dim));
-    return at::convolution(input, weight, bias, stride, padding_l, dilation,
+    SymDimVector output_padding(static_cast<size_t>(dim));
+    return at::convolution_symint(input, weight, bias, stride, padding_l, dilation,
                                false, output_padding, groups);
   }
 
   TORCH_WARN_ONCE("Using padding='same' with even kernel lengths and odd dilation may"
                   " require a zero-padded copy of the input be created");
-  SmallVector<int64_t, kDimVectorStaticSize * 2> pad_nd(static_cast<size_t>(2 * dim));
+  SmallVector<c10::SymInt, kDimVectorStaticSize * 2> pad_nd(static_cast<size_t>(2 * dim));
   for (auto i: c10::irange(dim)) {
     // Apply padding by the difference, leaving only a symmetric padding
     auto delta_pad = padding_r[i] - padding_l[i];
@@ -961,10 +961,10 @@ static Tensor convolution_same(
       padding_l[i] = padding_r[i];
     }
   }
-  auto padded_input = at::constant_pad_nd(input, pad_nd, 0);
-  DimVector output_padding(static_cast<size_t>(dim));
-  return at::convolution(padded_input, weight, bias, stride, padding_l,
-                         dilation, false, output_padding, groups);
+  auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0);
+  SymDimVector output_padding(static_cast<size_t>(dim));
+  return at::convolution_symint(padded_input, weight, bias, stride, padding_l,
+                                dilation, false, output_padding, groups);
 }
 
 Tensor _convolution_mode(
diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp
index 7d4a89d6b40f7..21404947b3dbb 100644
--- a/aten/src/ATen/native/EmbeddingBag.cpp
+++ b/aten/src/ATen/native/EmbeddingBag.cpp
@@ -1307,7 +1307,7 @@ Tensor _embedding_bag_backward_symint(const Tensor &grad, const Tensor &indices_
   checkContiguous("embedding_bag", offsets_arg);
 
   Tensor offset2bag_;
-  if (indices.numel() != 0 && offset2bag.numel() == 0) {
+  if (indices.sym_numel() != 0 && offset2bag.sym_numel() == 0) {
     offset2bag_ = offsets.new_zeros(
       {indices.size(0) + 1}, offsets.options()); // offset2bag = [0 0 0 0 0]
 
diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h
index 0b6f29de8c427..7c22fedfe94e2 100644
--- a/aten/src/ATen/native/GridSamplerUtils.h
+++ b/aten/src/ATen/native/GridSamplerUtils.h
@@ -101,7 +101,7 @@ bool cond_cudnn_grid_sampler(
     at::native::canUse32BitIndexMath(input) &&
     at::native::canUse32BitIndexMath(grid) &&
     input.dim() == 4 &&
-    input.size(1) <= 1024);
+    input.sym_size(1) <= 1024);
 }
 
 } // anonymous namespace
diff --git a/aten/src/ATen/native/IndexingUtils.cpp b/aten/src/ATen/native/IndexingUtils.cpp
index c5f5ff6fbcc07..2dba1972ce574 100644
--- a/aten/src/ATen/native/IndexingUtils.cpp
+++ b/aten/src/ATen/native/IndexingUtils.cpp
@@ -4,7 +4,7 @@
 namespace at { namespace native {
 
 bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
-  int64_t elements = t.numel();
+  auto elements = t.sym_numel();
   if (elements >= max_elem) {
     return false;
   }
@@ -12,16 +12,16 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
     return max_elem > 0;
   }
 
-  int64_t offset = 0;
-  int64_t linearId = elements - 1;
+  c10::SymInt offset = 0;
+  auto linearId = elements - 1;
 
   // NOTE: Assumes all strides are positive, which is true for now
   // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   for (int i = t.dim() - 1; i >= 0; --i) {
-    int64_t curDimIndex = linearId % t.size(i);
-    int64_t curDimOffset = curDimIndex * t.stride(i);
+    auto curDimIndex = linearId % t.sym_size(i);
+    auto curDimOffset = curDimIndex * t.sym_stride(i);
     offset += curDimOffset;
-    linearId /= t.size(i);
+    linearId /= t.sym_size(i);
   }
 
   if (offset >= max_elem) {
diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp
index 8e5864b68728d..28fc60508ab10 100644
--- a/aten/src/ATen/native/LossNLL.cpp
+++ b/aten/src/ATen/native/LossNLL.cpp
@@ -656,7 +656,7 @@ Tensor nll_loss(const Tensor & self, const Tensor & target, const c10::optional<
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
 
-  return std::get<0>(at::nll_loss_forward(self, target, weight, reduction, ignore_index));
+  return std::get<0>(at::nll_loss_forward_symint(self, target, weight, reduction, ignore_index));
 }
 
 Tensor nll_loss_nd_symint(
diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp
index ab7c084eb80df..aee22ce3edeb5 100644
--- a/aten/src/ATen/native/LossNLL2d.cpp
+++ b/aten/src/ATen/native/LossNLL2d.cpp
@@ -498,7 +498,7 @@ Tensor nll_loss2d(const Tensor & self, const Tensor & target, const c10::optiona
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
 
-  return std::get<0>(at::nll_loss2d_forward(self, target, weight, reduction, ignore_index));
+  return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight, reduction, ignore_index));
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/NonSymbolicBC.h b/aten/src/ATen/native/NonSymbolicBC.h
index e7d31ae3fa020..f57c868f345f1 100644
--- a/aten/src/ATen/native/NonSymbolicBC.h
+++ b/aten/src/ATen/native/NonSymbolicBC.h
@@ -22,4 +22,5 @@ TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, con
 TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim);
 TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes);
 TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index);
+TORCH_API std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim);
 }}
diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h
index cf5b45b365d05..0ff4490086b7e 100644
--- a/aten/src/ATen/native/Pool.h
+++ b/aten/src/ATen/native/Pool.h
@@ -67,17 +67,18 @@ static inline T pooling_output_shape(
         inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
 }
 
-inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
-    int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) {
+template <typename T>
+std::pair<T, T> _pooling_same_mode_padding_lr(
+    T inputSize, T kernelSize, int64_t stride, int64_t dilation) {
   // NOTE: with strides, the output shape is ceil(inputSize/stride)
-  auto total_padding = dilation * (kernelSize - 1);
+  auto total_padding = T(dilation) * (kernelSize - 1);
 
   // Prefer symmetric padding if possible
   if (stride > 2 && (total_padding % 2 == 1)) {
     // The floor in the output size calculation gives us a little wiggle room
     auto wiggle_room = inputSize % stride - 1;
     if (wiggle_room > 0) {
-      --total_padding;
+      total_padding = total_padding - 1;
     }
   }
 
@@ -85,6 +86,15 @@ inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
   return {left, total_padding - left};
 }
 
+inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
+    int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) {
+  return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation);
+}
+
+inline std::pair<c10::SymInt, c10::SymInt> pooling_same_mode_padding_lr(
+    c10::SymInt inputSize, c10::SymInt kernelSize, int64_t stride, int64_t dilation) {
+  return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation);
+}
 
 // AveragePool2d/DilatedMaxPool2d (forward)
 static inline void
diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp
index 6a703cbe07f90..e37dbf56cc81a 100644
--- a/aten/src/ATen/native/TensorProperties.cpp
+++ b/aten/src/ATen/native/TensorProperties.cpp
@@ -69,7 +69,7 @@ bool cudnn_is_acceptable(const TensorBase& self) {
   // tensors. Maybe some cuDNN functions actually support empty tensors, but
   // native/THNN kernels shouldn't be much slower because the output is also
   // likely empty.
-  if (self.numel() == 0) return false;
+  if (self.sym_numel() == 0) return false;
   // NB: In the old Python code, there was also a test to see if the
   // cuDNN library was actually dynamically linked or not.  I'm not
   // sure if we can actually test this.
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index d25113577b2d5..2051cda371b97 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -917,9 +917,12 @@ std::vector<Tensor> chunk(const Tensor& self, int64_t chunks, int64_t dim) {
   }
 }
 
-std::vector<Tensor> tensor_split(const Tensor& self, int64_t sections, int64_t dim) {
+std::vector<Tensor> tensor_split_sections_symint(const Tensor& self, c10::SymInt sym_sections, int64_t dim) {
   TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
   int64_t dim_ = maybe_wrap_dim(dim, self.dim());
+  // NB: intentional, sections specifies number of output tensors, which
+  // cannot be polymorphic
+  int64_t sections = sym_sections.guard_int(__FILE__, __LINE__);
   TORCH_CHECK(sections > 0, "number of sections must be larger than 0, got ", sections);
   const auto dim_size = self.sym_size(dim_);
   std::vector<Tensor> splits(sections);
@@ -934,21 +937,30 @@ std::vector<Tensor> tensor_split(const Tensor& self, int64_t sections, int64_t d
   return splits;
 }
 
-std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim) {
+template <typename T>
+std::vector<Tensor> _tensor_split_indices(const Tensor& self, ArrayRef<T> indices, int64_t dim) {
   TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
   int64_t dim_ = maybe_wrap_dim(dim, self.dim());
   int64_t num_indices = indices.size();
   std::vector<Tensor> splits(num_indices + 1);
-  int64_t start_idx = 0;
+  T start_idx(0);
   for (const auto split_idx : c10::irange(num_indices)) {
-    int64_t end_idx = indices[split_idx];
-    splits[split_idx] = at::slice(self, dim_, start_idx, end_idx);
+    auto end_idx = indices[split_idx];
+    splits[split_idx] = at::symint::slice<T>(self, dim_, start_idx, end_idx);
     start_idx = end_idx;
   }
-  splits[num_indices] = at::slice(self, dim_, start_idx, self.size(dim_));
+  splits[num_indices] = at::symint::slice<T>(self, dim_, start_idx, at::symint::size<T>(self, dim_));
   return splits;
 }
 
+std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim) {
+  return _tensor_split_indices(self, indices, dim);
+}
+
+std::vector<Tensor> tensor_split_indices_symint(const Tensor& self, SymIntArrayRef indices, int64_t dim) {
+  return _tensor_split_indices(self, indices, dim);
+}
+
 std::vector<Tensor> tensor_split(const Tensor& self, const Tensor& tensor_indices_or_sections, int64_t dim) {
   TORCH_CHECK(self.dim() > 0, "tensor_split expected at least a 1-dimensional tensor, but got a tensor with ", self.dim()," dims");
   auto split_device = tensor_indices_or_sections.device();
@@ -1174,8 +1186,8 @@ Tensor as_strided_qtensorimpl(const Tensor& self, IntArrayRef size, IntArrayRef
   return result;
 }
 
-const Tensor &as_strided_(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional<int64_t> storage_offset_) {
-  auto storage_offset = storage_offset_.value_or(self.storage_offset());
+const Tensor &as_strided__symint(const Tensor& self, SymIntArrayRef size, SymIntArrayRef stride, optional<c10::SymInt> storage_offset_) {
+  auto storage_offset = storage_offset_.value_or(self.sym_storage_offset());
   setStrided(self, size, stride, storage_offset);
   return self;
 }
diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp
index 5b38b02702828..c12d8d2142ff9 100644
--- a/aten/src/ATen/native/group_norm.cpp
+++ b/aten/src/ATen/native/group_norm.cpp
@@ -23,13 +23,15 @@
 #include <vector>
 
 namespace at {
+
 namespace native {
 
+template <typename T>
 void check_group_norm_inputs(
     const Tensor& input,
     const Tensor& weight,
     const Tensor& bias,
-    int64_t C,
+    T C,
     int64_t num_groups) {
   TORCH_CHECK(
       num_groups > 0,
@@ -43,14 +45,14 @@ void check_group_norm_inputs(
       "num_groups=",
       num_groups);
   TORCH_CHECK(
-      !weight.defined() || (weight.dim() == 1 && weight.numel() == C),
+      !weight.defined() || (weight.dim() == 1 && at::symint::numel<T>(weight) == C),
       "Expected weight to be a vector of size equal to the number of ",
       "channels in input, but got weight of shape ",
       weight.sizes(),
       " and input of shape ",
       input.sizes());
   TORCH_CHECK(
-      !bias.defined() || (bias.dim() == 1 && bias.numel() == C),
+      !bias.defined() || (bias.dim() == 1 && at::symint::numel<T>(bias) == C),
       "Expected bias to be a vector of size equal to the number of ",
       "channels in input, but got bias of shape ",
       weight.sizes(),
@@ -171,13 +173,13 @@ Tensor group_norm(
   const Tensor& weight = *weight_maybe_owned;
   const Tensor& bias = c10::value_or_else(bias_opt, [] { return Tensor(); });
 
-  const int64_t N = input.size(0);
-  const int64_t C = input.size(1);
+  const auto N = input.sym_size(0);
+  const auto C = input.sym_size(1);
   check_group_norm_inputs(input, weight, bias, C, num_groups);
 
-  const auto input_shape = input.sizes();
-  const int64_t HxW =
-      c10::multiply_integers(input_shape.cbegin() + 2, input_shape.cend());
+  const auto input_shape = input.sym_sizes();
+  const auto HxW =
+      c10::multiply_integers(input_shape.slice(2));
 
   const Tensor kEmpty;
   auto memory_format = input.suggest_memory_format();
@@ -185,10 +187,10 @@ Tensor group_norm(
       input.contiguous(memory_format) : input.contiguous();
   const auto& gamma = weight.defined() ? weight.contiguous() : kEmpty;
   const auto& beta = bias.defined() ? bias.contiguous() : kEmpty;
-  TORCH_CHECK(!gamma.defined() || gamma.numel() == C);
-  TORCH_CHECK(!beta.defined() || beta.numel() == C);
+  TORCH_CHECK(!gamma.defined() || gamma.sym_numel() == C);
+  TORCH_CHECK(!beta.defined() || beta.sym_numel() == C);
   return std::get<0>(
-      at::native_group_norm(X, gamma, beta, N, C, HxW, num_groups, eps));
+      at::native_group_norm_symint(X, gamma, beta, N, C, HxW, num_groups, eps));
 }
 
 DEFINE_DISPATCH(GroupNormKernel);
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 2cd92a5ef1900..c113a44d9db95 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -815,7 +815,7 @@
   device_guard: False
   tags: inplace_view
   dispatch:
-    CompositeExplicitAutogradNonFunctional: as_strided_
+    CompositeExplicitAutogradNonFunctional: as_strided__symint
 
 - func: asin(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -1294,11 +1294,15 @@
     CompositeImplicitAutograd: chunk
     NestedTensorCPU, NestedTensorCUDA: chunk_nested_tensor
 
-- func: tensor_split.sections(Tensor(a -> *) self, int sections, int dim=0) -> Tensor(a)[]
+- func: tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
   variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: tensor_split_sections_symint
 
-- func: tensor_split.indices(Tensor(a -> *) self, int[] indices, int dim=0) -> Tensor(a)[]
+- func: tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[]
   variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: tensor_split_indices_symint
 
 - func: tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[]
   variants: function, method
@@ -1465,13 +1469,13 @@
   variants: method
   manual_cpp_binding: True
 
-- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor
+- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor
   dispatch:
     CompositeExplicitAutograd: convolution
   autogen: convolution.out
   tags: canonical
 
-- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   dispatch:
     CompositeExplicitAutograd, CUDA: convolution_backward
   autogen: convolution_backward.out
@@ -1487,7 +1491,7 @@
     CompositeExplicitAutograd: convolution_backward_overrideable
   autogen: convolution_backward_overrideable.out
 
-- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
+- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
   dispatch:
     CompositeExplicitAutograd: _convolution
   autogen: _convolution.out
@@ -1496,7 +1500,7 @@
 
 - func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor
 
-- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
 
 - func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor
 
@@ -3564,7 +3568,7 @@
     MPS: mps_convolution_backward
   autogen: mps_convolution_backward.out
 
-- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
+- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor
   dispatch:
     CompositeExplicitAutograd: mkldnn_convolution
   autogen: mkldnn_convolution.out
@@ -3579,17 +3583,17 @@
     CUDA: miopen_batch_norm_backward
   autogen: miopen_batch_norm_backward.out
 
-- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   dispatch:
     CUDA: miopen_convolution
   autogen: miopen_convolution.out
 
-- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   dispatch:
     CUDA: miopen_convolution_transpose
   autogen: miopen_convolution_transpose.out
 
-- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   dispatch:
     CUDA: miopen_depthwise_convolution
   autogen: miopen_depthwise_convolution.out
@@ -3843,7 +3847,7 @@
 
 - func: _nnpack_available() -> bool
 
-- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor
+- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor
   variants: function
   dispatch:
     CompositeExplicitAutograd: _nnpack_spatial_convolution
@@ -11473,24 +11477,24 @@
 # these are the same thing, but we give them different prefixes to
 # make the operational distinction clear.
 
-- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   structured: True
   dispatch:
     CPU: slow_conv_transpose2d_structured_cpu
     CUDA: slow_conv_transpose2d_structured_cuda
 
-- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor
+- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor
   python_module: nn
   structured_delegate: slow_conv_transpose2d.out
 
-- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: slow_conv_transpose3d_out_cpu
     CUDA: slow_conv_transpose3d_out_cuda
 
-- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor
+- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv_transpose3d_cpu
@@ -11527,47 +11531,47 @@
     CUDA: slow_conv2d_backward_cuda
   autogen: _slow_conv2d_backward.output_mask_out
 
-- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
+- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
   use_const_ref_for_mutable_tensors: True
   python_module: nn
   dispatch:
     CUDA: conv_depthwise2d_cuda_out
 
-- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor
+- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor
   python_module: nn
   dispatch:
     CUDA: conv_depthwise2d_cuda
 
-- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, int[3] dilation) -> Tensor
+- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor
   python_module: nn
   dispatch:
     CUDA: conv_depthwise3d_cuda
   autogen: conv_depthwise3d.out
 
-- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
 
-- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0) -> Tensor
+- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0) -> Tensor
   python_module: nn
 
-- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, *, Tensor(a!) output) -> Tensor(a!)
+- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: slow_conv3d_forward_out_cpu
 
-- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> Tensor
+- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv3d_forward_cpu
 
-- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor
+- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv_dilated2d_cpu
     CUDA: slow_conv_dilated2d_cuda
   autogen: slow_conv_dilated2d.out
 
-- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor
+- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor
   python_module: nn
   dispatch:
     CPU: slow_conv_dilated3d_cpu
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index edf1df9216311..5f6e8d9bf238f 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1123,8 +1123,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     skip('nn.functional.batch_norm', ''),  # '0 is not tracked with proxy for <torch.fx.experimental.proxy_te..
     xfail('nn.functional.bilinear', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.binary_cross_entropy', ''),  # aten.fill_.Scalar - couldn't find symbolic meta funct...
-    xfail('nn.functional.conv1d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.conv2d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.cosine_embedding_loss', ''),  # Cannot call sizes() on tensor with symbolic sizes/st...
     xfail('nn.functional.cosine_similarity', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.cross_entropy', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1225,7 +1223,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('trapz', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('triangular_solve', ''),  # aten.triangular_solve.default - couldn't find symbolic meta function/de...
-    xfail('unbind', ''),  # tensor_split() received an invalid combination of arguments - got (FakeTensor, torch...
     xfail('unflatten', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('var', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('var_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 8f7656309eee4..4f4265b8dc6a2 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1254,8 +1254,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.avg_pool3d', ''),  # aten.avg_pool3d.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.bilinear', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.binary_cross_entropy', ''),  # aten.new_empty.default - couldn't find symbolic meta function/decom...
-    xfail('nn.functional.conv1d', ''),  # aten.convolution.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.conv2d', ''),  # aten.convolution.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.cosine_embedding_loss', ''),  # The underlying op of 'aten.stride' has no overload name '_schema'
     xfail('nn.functional.cosine_similarity', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.cross_entropy', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
@@ -1268,7 +1266,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.fractional_max_pool2d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.fractional_max_pool3d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.grid_sample', ''),  # aten.grid_sampler_2d.default - couldn't find symbolic meta function/decompos...
-    xfail('nn.functional.group_norm', ''),  # 'torch._C.SymIntNode' and 'int'
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.empty_like.default - couldn't find symbolic meta function/deco...
     xfail('nn.functional.interpolate', 'area'),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.interpolate', 'bicubic'),  # aten.upsample_bicubic2d.vec - couldn't find symbolic meta function/d...
@@ -1361,7 +1358,6 @@ def f(a, b, c, d, e):
     xfail('view_as_complex', ''),  # aten.view_as_complex.default - couldn't find symbolic meta function/decomposition
     xfail('view_as', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('vsplit', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
-    xfail('unbind', ''),  # aten.unbind.int - couldn't find symbolic meta function/decomposition
     xfail('unique_consecutive', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
     xfail('unique', ''),  # aten._unique2.default - couldn't find symbolic meta function/decomposition
 }
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index c77f63e8c8e73..6945dae77a020 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2206,19 +2206,19 @@
   indices: non_differentiable
   result: auto_linear
 
-- name: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor
+- name: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor
   input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
   result: convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups)
 
 # TorchScript serializes calls to _convolution so this entry is present until that is changed to use convolution.
 # Note that the benchmark, deterministic, cudnn_enabled, and allow_tf32 flags are queried from the global context
 # by convolution_backward instead of being passed along from the forward pass.
-- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
+- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
   input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
   result: _convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32)
 
-- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
-  grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
+- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+  grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
   result0: std::get<0>(convolution_backward_symint(grad_output_p, input_p, weight_t, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) + std::get<0>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false}))
   result1: std::get<1>(convolution_backward_symint(grad_output_p, input_t, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) + std::get<1>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false}))
   result2: convolution_backward_jvp_grad_bias(grad_output_t, result2)
@@ -2229,10 +2229,10 @@
 - name: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
   grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask)
 
-- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int[2] dilation=1) -> Tensor
+- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1) -> Tensor
+- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor
@@ -2241,20 +2241,20 @@
 - name: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
   grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, {{1, 1}}, false, {{0, 0}}, 1, grad_input_mask)
 
-- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, int[2] dilation) -> Tensor
+- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding, int[3] dilation) -> Tensor
+- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, int[3] padding) -> Tensor
+- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, /*dilation=*/ {{1, 1, 1}}, false, /*output_padding=*/ {{0, 0, 0}}, 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
   self: im2col(grad, kernel_size, dilation, padding, stride)
@@ -2608,9 +2608,9 @@
 
 # nnpack
 
-- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor
+- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor
   # NNPACK does not support strided convolutions in the backwards path, which is the reason why we are using the closest available function that does here.
-  input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector<int64_t>(padding.size(), 1), false, std::vector<int64_t>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+  input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector<int64_t>(padding.size(), 1), false, std::vector<c10::SymInt>(padding.size(), 0), 1, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 #LSTM MPS
 - name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
@@ -2641,14 +2641,14 @@
 
 # miopen
 
-- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
   self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
-- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<int64_t>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)
   input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple<Tensor, Tensor, Tensor>()"
@@ -2667,8 +2667,8 @@
   dropout_state: non_differentiable
 
 # mkldnn
-- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor
-  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector<int64_t>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor
+  self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector<c10::SymInt>(padding.size(), 0), groups, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
 
 - name: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
   self, weight, bias: mkldnn_linear_backward(self, grad, weight, grad_input_mask)
diff --git a/tools/jit/gen_unboxing.py b/tools/jit/gen_unboxing.py
index ebeaa21bc7be9..79c594a9afa07 100644
--- a/tools/jit/gen_unboxing.py
+++ b/tools/jit/gen_unboxing.py
@@ -116,7 +116,9 @@ def __call__(self, f: NativeFunction) -> str:
                 # from wrapping/unwrapping TensorOptios.
                 # However, we would look to include default args for schema parsing.
                 # Default args only show up in the nonfaithful C++ API,
-                arg_default = cpp.default_expr(arg.argument.default, arg.argument.type)
+                arg_default = cpp.default_expr(
+                    arg.argument.default, arg.argument.type, symint=False
+                )
                 if arg_default.startswith("{"):
                     arg_cpp = f"c10::IntArrayRef({arg_default})"
                 else:
diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp
index 2b74c8a2fd290..29f0f67ce6ecb 100644
--- a/torch/csrc/StorageMethods.cpp
+++ b/torch/csrc/StorageMethods.cpp
@@ -41,7 +41,7 @@
 static PyObject* THPStorage_nbytes(PyObject* _self, PyObject* noargs) {
   HANDLE_TH_ERRORS
   auto self = (THPStorage*)_self;
-  return THPUtils_packUInt64(self->cdata->nbytes());
+  return py::cast(self->cdata->sym_nbytes()).release().ptr();
   END_HANDLE_TH_ERRORS
 }
 
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 86b893bb014e6..3358d96569598 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -1098,15 +1098,15 @@ Tensor convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    IntArrayRef padding,
+    at::SymIntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    IntArrayRef output_padding,
+    at::SymIntArrayRef output_padding,
     int64_t groups) {
   auto bias_t_opt =
       bias_t.defined() ? c10::optional<at::Tensor>(bias_t) : c10::nullopt;
   return (
-      at::convolution(
+      at::convolution_symint(
           input_t,
           weight_p,
           c10::nullopt,
@@ -1116,7 +1116,7 @@ Tensor convolution_jvp(
           transposed,
           output_padding,
           groups) +
-      at::convolution(
+      at::convolution_symint(
           input_p,
           weight_t,
           bias_t_opt,
@@ -1136,10 +1136,10 @@ Tensor _convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    IntArrayRef padding,
+    at::SymIntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    IntArrayRef output_padding,
+    at::SymIntArrayRef output_padding,
     int64_t groups,
     bool benchmark,
     bool deterministic,
@@ -1148,7 +1148,7 @@ Tensor _convolution_jvp(
   auto bias_t_opt =
       bias_t.defined() ? c10::optional<at::Tensor>(bias_t) : c10::nullopt;
   return (
-      at::_convolution(
+      at::_convolution_symint(
           input_t,
           weight_p,
           c10::nullopt,
@@ -1162,7 +1162,7 @@ Tensor _convolution_jvp(
           deterministic,
           cudnn_enabled,
           allow_tf32) +
-      at::_convolution(
+      at::_convolution_symint(
           input_p,
           weight_t,
           bias_t_opt,
diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h
index 04416c2b49e08..4da8aa074a534 100644
--- a/torch/csrc/autograd/FunctionsManual.h
+++ b/torch/csrc/autograd/FunctionsManual.h
@@ -937,10 +937,10 @@ Tensor convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    IntArrayRef padding,
+    at::SymIntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    IntArrayRef output_padding,
+    at::SymIntArrayRef output_padding,
     int64_t groups);
 
 Tensor _convolution_jvp(
@@ -951,10 +951,10 @@ Tensor _convolution_jvp(
     const Tensor& bias_p,
     const Tensor& bias_t,
     IntArrayRef stride,
-    IntArrayRef padding,
+    at::SymIntArrayRef padding,
     IntArrayRef dilation,
     bool transposed,
-    IntArrayRef output_padding,
+    at::SymIntArrayRef output_padding,
     int64_t groups,
     bool benchmark,
     bool deterministic,
diff --git a/torch/storage.py b/torch/storage.py
index 8e35973405b1b..6bfbab3733bc4 100644
--- a/torch/storage.py
+++ b/torch/storage.py
@@ -646,7 +646,9 @@ def device(self):
         return self._storage.device
 
     def size(self):
-        return len(self)
+        # NB: don't indirect through __len__, as that requires
+        # an int to be returned
+        return self.nbytes() // self.element_size()
 
     def pickle_storage_type(self):
         try:
diff --git a/torchgen/api/cpp.py b/torchgen/api/cpp.py
index c3b12d0336df0..4b00b5367b824 100644
--- a/torchgen/api/cpp.py
+++ b/torchgen/api/cpp.py
@@ -314,7 +314,7 @@ def return_names(f: NativeFunction, *, fallback_name: str = "result") -> Sequenc
 }
 
 # Convert a JIT default into C++ expression representing the default
-def default_expr(d: str, t: Type) -> str:
+def default_expr(d: str, t: Type, *, symint: bool) -> str:
     if d == "None" and str(t) == "Tensor?":
         return "{}"
     if isinstance(t, BaseType) and t.name is BaseTy.str:
@@ -342,11 +342,13 @@ def default_expr(d: str, t: Type) -> str:
         if d == "None":
             return "c10::nullopt"
 
-        return default_expr(d, t.elem)
+        return default_expr(d, t.elem, symint=symint)
 
     if isinstance(t, ListType):
         if d.startswith("[") and d.endswith("]"):
             return "{" + d[1:-1] + "}"
+        elif symint and d.isdigit() and str(t.elem) == "SymInt":
+            return f"c10::SymInt({d})"
         elif t.size is None:
             # NOTE: Sized lists can have scalar defaults
             raise ValueError(f"Expected a list default '[...]' but found: '{d}'")
@@ -386,7 +388,7 @@ def sub_argument(
             binds = a.name
         default: Optional[str] = None
         if a.name not in cpp_no_default_args and a.default is not None:
-            default = default_expr(a.default, a.type)
+            default = default_expr(a.default, a.type, symint=symint)
         return [
             Binding(
                 nctype=argument_type(a, binds=binds, symint=symint),
diff --git a/torchgen/api/native.py b/torchgen/api/native.py
index b197a2a02983a..7f8b3eb3af2e7 100644
--- a/torchgen/api/native.py
+++ b/torchgen/api/native.py
@@ -95,7 +95,7 @@ def argument(
     if isinstance(a, Argument):
         default: Optional[str] = None
         if should_default and a.default is not None:
-            default = cpp.default_expr(a.default, a.type)
+            default = cpp.default_expr(a.default, a.type, symint=symint)
         return [
             Binding(
                 nctype=argument_type(a, binds=a.name, symint=symint),
diff --git a/torchgen/api/python.py b/torchgen/api/python.py
index 96c006b303eaa..728ee4c18c0a6 100644
--- a/torchgen/api/python.py
+++ b/torchgen/api/python.py
@@ -719,7 +719,9 @@ def argument(a: Argument) -> PythonArgument:
         name=a.name,
         type=a.type,
         # TODO: directly translate a.default to python default
-        default=str(pythonify_default(cpp.default_expr(a.default, a.type)))
+        default=str(
+            pythonify_default(cpp.default_expr(a.default, a.type, symint=False))
+        )
         if a.default is not None
         else None,
         default_init=None,
@@ -804,7 +806,7 @@ def topt_default_init(name: str) -> Optional[str]:
             a = getattr(topt_args, name)
             if a.default is None or a.default == "None":
                 return None
-            return cpp.default_expr(a.default, a.type)
+            return cpp.default_expr(a.default, a.type, symint=False)
 
         tensor_options_args.append(
             PythonArgument(
diff --git a/torchgen/gen.py b/torchgen/gen.py
index e53734969afda..79970c94610dd 100644
--- a/torchgen/gen.py
+++ b/torchgen/gen.py
@@ -1151,7 +1151,9 @@ def compute_argument_yaml(
         "type": cpp.argument_type(a, binds="__placeholder__", symint=False).cpp_type(),
     }
     if a.default is not None:
-        arg["default"] = pythonify_default(cpp.default_expr(a.default, a.type))
+        arg["default"] = pythonify_default(
+            cpp.default_expr(a.default, a.type, symint=False)
+        )
     if a.name in kwarg_only_set:
         arg["kwarg_only"] = True
     if a.name in out_arg_set:

From 67d44f1d427c058725a8baf465e47bf844b60890 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Fri, 28 Oct 2022 14:45:38 +0000
Subject: [PATCH 0273/1922] Fix codeowner errors (#87954)

Error message: "Unknown owner: make sure @mingzhe09088 exists and has
write access to the repository."
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87954
Approved by: https://github.com/wangkuiyi
---
 CODEOWNERS | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 238d6e776e92c..707a2ccec1802 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -45,10 +45,10 @@ nn/qat/ @jerryzh168
 # Distributed package
 # This list is mostly if you'd like to be tagged as reviewer, feel free to add
 # or remove yourself from it.
-/torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu @kwen2501
-/torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu @kwen2501
-/torch/distributed/_composable @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu @kwen2501 @yhcharles
-/torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 @H-Huang @awgu @kwen2501
+/torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501
+/torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501
+/torch/distributed/_composable @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501 @yhcharles
+/torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501
 
 # Distributed tests
 # This list is mostly if you'd like to be tagged as reviewer, feel free to add

From c87cceecd970df3c58098f0015bbe95aeac9c651 Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Fri, 28 Oct 2022 15:51:10 +0000
Subject: [PATCH 0274/1922] Add mem efficient backend flag (#87946)

# Summary
Add in a torch.backends.cuda flag and update context manager to pic between the three implementations of the scaled_dot_product_attention.

cc @cpuhrsch @jbschlosser @bhosmer @mikaylagawarecki
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87946
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/Context.cpp                     |  8 ++++++
 aten/src/ATen/Context.h                       |  9 +++++--
 .../ATen/native/transformers/cuda/sdp_utils.h | 18 ++++++++++---
 docs/source/backends.rst                      |  4 +++
 test/test_transformers.py                     | 17 +++++++-----
 torch/_C/__init__.pyi.in                      |  2 ++
 torch/backends/cuda/__init__.py               | 26 ++++++++++++++++---
 torch/csrc/Module.cpp                         | 23 ++++++++++++++++
 8 files changed, 92 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
index c96b36975214e..7086a05ab6c7a 100644
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@@ -112,6 +112,14 @@ void Context::setSDPUseFlash(bool e) {
   enabled_flashSDP = e;
 }
 
+bool Context::userEnabledMemEfficientSDP() const {
+  return enabled_mem_efficientSDP;
+}
+
+void Context::setSDPUseMemEfficient(bool e) {
+  enabled_mem_efficientSDP = e;
+}
+
 bool Context::userEnabledMathSDP() const {
   return enabled_mathSDP;
 }
diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
index 43f4433b7ce99..48e3c935a2c0c 100644
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@@ -128,8 +128,9 @@ class TORCH_API Context {
 
   // Note [Disabling Fused SDP Kernels]
   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // Flash SDP kernels are enabled by default. However, they can be disabled
-  // by setting at::globalContext().setUserEnabledFlashSDP(false) flag.
+  // Flash and Memory Efficient SDP kernels are enabled by default.
+  // However, they can be disabled by setting
+  // at::globalContext().setUserEnabledFlashSDP(false) flag.
   // This is useful for debugging purposes. For example, if you want to
   // compare the performance of the flash SDP kernels with the unfused
   // kernel, you can disable the flash SDP kernels. By disabling
@@ -139,6 +140,9 @@ class TORCH_API Context {
   void setSDPUseFlash(bool);
   bool userEnabledFlashSDP() const;
 
+  void setSDPUseMemEfficient(bool);
+  bool userEnabledMemEfficientSDP() const;
+
   void setSDPUseMath(bool);
   bool userEnabledMathSDP() const;
 
@@ -270,6 +274,7 @@ class TORCH_API Context {
   bool _deterministic_algorithms = false;
   bool _deterministic_algorithms_warn_only = false;
   bool enabled_flashSDP = true;
+  bool enabled_mem_efficientSDP = true;
   bool enabled_mathSDP = true;
 #ifdef USE_ROCM
   bool benchmark_cudnn = true;
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 218322f995d67..fb651e3d5aff3 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -108,7 +108,7 @@ inline bool check_head_dim_size(sdp_params params, bool debug) {
   return true;
 }
 
-inline bool check_runtime_disabled(sdp_params params, bool debug) {
+inline bool check_runtime_disabled_flash(sdp_params params, bool debug) {
   // We check the global context to see if user has explicitly turned of flash
   // sdp kernels
   if (!at::globalContext().userEnabledFlashSDP()) {
@@ -118,6 +118,16 @@ inline bool check_runtime_disabled(sdp_params params, bool debug) {
   return true;
 }
 
+inline bool check_runtime_disabled_mem_efficient(sdp_params params, bool debug) {
+  // We check the global context to see if user has explicitly turned of mem_efficient
+  // sdp kernels
+  if (!at::globalContext().userEnabledMemEfficientSDP()) {
+    TORCH_CHECK(!debug, "Memory Efficient attention has been runtime disabled.");
+    return false;
+  }
+  return true;
+}
+
 inline bool check_gpu_sm75_or_greater(sdp_params params, bool debug) {
   // Check that the gpu is capable of running flash attention
   auto dprops = at::cuda::getCurrentDeviceProperties();
@@ -164,7 +174,7 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
 
   //  Define gate functions that determine if a flash kernel can be ran
   std::vector<std::function<bool(sdp_params, bool)>> constraints{
-      check_runtime_disabled,
+      check_runtime_disabled_flash,
       check_tensor_shapes,
       check_for_attn_weights,
       check_for_attn_mask,
@@ -193,7 +203,7 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
   //  Define gate functions that determine if a flash kernel can be ran
   std::vector<std::function<bool(sdp_params, bool)>> constraints{
       check_gpu_sm50_or_greater,
-      check_runtime_disabled,
+      check_runtime_disabled_mem_efficient,
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask};
@@ -214,7 +224,7 @@ inline SDPBackend select_sdp_backend(sdp_params kernel_params) {
   // 2. Mem Efficient Attention
   // 3. Math fallback
   auto& ctx = at::globalContext();
-  if (!ctx.userEnabledMathSDP() && !ctx.userEnabledFlashSDP()) {
+  if (!ctx.userEnabledMathSDP() && !ctx.userEnabledFlashSDP() && !ctx.userEnabledMemEfficientSDP()) {
     return SDPBackend::error;
   }
   // Because TORCHCHECK checks if condition is true we negate debug so that
diff --git a/docs/source/backends.rst b/docs/source/backends.rst
index 31eaa85e05020..80e18f7017a01 100644
--- a/docs/source/backends.rst
+++ b/docs/source/backends.rst
@@ -54,6 +54,10 @@ torch.backends.cuda
 
 .. autofunction:: torch.backends.cuda.flash_sdp_enabled
 
+.. autofunction:: torch.backends.cuda.enable_mem_efficient_sdp
+
+.. autofunction:: torch.backends.cuda.mem_efficient_sdp_enabled
+
 .. autofunction:: torch.backends.cuda.enable_flash_sdp
 
 .. autofunction:: torch.backends.cuda.math_sdp_enabled
diff --git a/test/test_transformers.py b/test/test_transformers.py
index 1eff4d61fb203..5485d39df0e17 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -1021,12 +1021,12 @@ def test_sdp_runtime_dispatch(self):
         def make_tensor(*size, device=device, dtype=dtype):
             return torch.randn(size, device=device, dtype=dtype)
 
-        with sdp_kernel(enable_flash=False, enable_math=False):
+        with sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=False):
             q, k, v = make_tensor(2, 3, 4), make_tensor(2, 3, 4), make_tensor(2, 3, 4)
             self.assertRaisesRegex(RuntimeError, "No viable backend for scaled_dot_product_attention was found.",
                                    lambda: torch.nn.functional._scaled_dot_product_attention(q, k, v))
 
-        with sdp_kernel(enable_flash=True, enable_math=False):
+        with sdp_kernel(enable_flash=True, enable_mem_efficient=False, enable_math=False):
             # Failures for invalid input
 
             # Dim is not 4
@@ -1035,10 +1035,10 @@ def make_tensor(*size, device=device, dtype=dtype):
                 q, k, v, None, 0.0, False, False))
 
             # Xformers can now cover this case but will add back in next PR
-            # # Invalid last_dim size
-            # q, k, v = make_tensor(2, 2, 3, 4), make_tensor(2, 2, 3, 4), make_tensor(2, 2, 3, 4)
-            # self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-            #     q, k, v, None, 0.0, False, False))
+            # Invalid last_dim size
+            q, k, v = make_tensor(2, 2, 3, 4), make_tensor(2, 2, 3, 4), make_tensor(2, 2, 3, 4)
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
+                q, k, v, None, 0.0, False, False))
 
             # Invalid dtype
             q, k, v = make_tensor(2, 2, 3, 16, dtype=torch.float64), make_tensor(
@@ -1046,6 +1046,11 @@ def make_tensor(*size, device=device, dtype=dtype):
             self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
                 q, k, v, None, 0.0, False, False))
 
+            q, k, v = make_tensor(2, 2, 3, 16, dtype=torch.float32), make_tensor(
+                2, 2, 3, 16, dtype=torch.float32), make_tensor(2, 2, 3, 16, dtype=torch.float32)
+            self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
+                q, k, v, None, 0.0, False, False))
+
             # Failures for unsupported SDP args
             q, k, v = make_tensor(2, 2, 3, 16), make_tensor(2, 2, 3, 16), make_tensor(2, 2, 3, 16)
 
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 8b5a5d8e83b3d..560c4a3f16738 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -819,6 +819,8 @@ def _get_cudnn_enabled() -> _bool: ...  # THPModule_userEnabledCuDNN
 def _set_cudnn_enabled(arg: _bool) -> None: ...  # THPModule_setUserEnabledCuDNN
 def _get_flash_sdp_enabled() -> _bool: ...  # THPModule_userEnabledFusedSDP
 def _set_sdp_use_flash(arg: _bool) -> None: ...  # THPModule_setSDPUseFlash
+def _get_mem_efficient_sdp_enabled() -> _bool: ...  # THPModule_userEnabledMathSDP
+def _set_sdp_use_mem_efficient(arg: _bool) -> None: ...  # THPModule_setSDPUseMemEfficient
 def _get_math_sdp_enabled() -> _bool: ...  # THPModule_userEnabledMathSDP
 def _set_sdp_use_math(arg: _bool) -> None: ...  # THPModule_setSDPUseMath
 def _get_mkldnn_enabled() -> _bool: ...  # THPModule_userEnabledMkldnn
diff --git a/torch/backends/cuda/__init__.py b/torch/backends/cuda/__init__.py
index eece8f2646164..dd05535d39359 100644
--- a/torch/backends/cuda/__init__.py
+++ b/torch/backends/cuda/__init__.py
@@ -6,7 +6,8 @@
 
 __all__ = ["is_built", "cuFFTPlanCacheAttrContextProp", "cuFFTPlanCache", "cuFFTPlanCacheManager",
            "cuBLASModule", "preferred_linalg_library", "cufft_plan_cache", "matmul", "enable_flash_sdp",
-           "flash_sdp_enabled", "math_sdp_enabled", "enable_math_sdp", "sdp_kernel"]
+           "flash_sdp_enabled", "enable_mem_efficient_sdp", "mem_efficient_sdp_enabled",
+           "math_sdp_enabled", "enable_math_sdp", "sdp_kernel"]
 
 def is_built():
     r"""Returns whether PyTorch is built with CUDA support.  Note that this
@@ -180,6 +181,22 @@ def enable_flash_sdp(enabled: bool):
     """
     torch._C._set_sdp_use_flash(enabled)
 
+def mem_efficient_sdp_enabled():
+    r"""
+    .. warning:: This flag is experimental and subject to change.
+
+    Returns whether memory efficient sdp is enabled or not.
+    """
+    return torch._C._get_mem_efficient_sdp_enabled()
+
+
+def enable_mem_efficient_sdp(enabled: bool):
+    r"""
+    .. warning:: This flag is experimental and subject to change.
+
+    Enables or disables memory efficient sdp.
+    """
+    torch._C._set_sdp_use_mem_efficient(enabled)
 
 def math_sdp_enabled():
     r"""
@@ -200,23 +217,26 @@ def enable_math_sdp(enabled: bool):
 
 
 @contextlib.contextmanager
-def sdp_kernel(enable_flash: bool = True, enable_math: bool = True):
+def sdp_kernel(enable_flash: bool = True, enable_math: bool = True, enable_mem_efficient: bool = True):
     r"""
     .. warning:: This flag is experimental and subject to change.
 
-    This context manager can be used to temporarily enable or disable flash sdp and math sdp.
+    This context manager can be used to temporarily enable or disable flash/memory efficient sdp and math sdp.
     Upon exiting the context manager, the previous state of the flags will be restored.
     """
     previous_flash: bool = flash_sdp_enabled()
+    previous_mem_efficient: bool = mem_efficient_sdp_enabled()
     previous_math: bool = math_sdp_enabled()
     try:
         enable_flash_sdp(enable_flash)
+        enable_mem_efficient_sdp(enable_mem_efficient)
         enable_math_sdp(enable_math)
         yield{}
     except RuntimeError as err:
         raise err
     finally:
         enable_flash_sdp(previous_flash)
+        enable_mem_efficient_sdp(previous_mem_efficient)
         enable_math_sdp(previous_math)
 
 cufft_plan_cache = cuFFTPlanCacheManager()
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 98589a31eaced..3e34b6bdca07c 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -513,6 +513,21 @@ PyObject* THPModule_userEnabledFlashSDP(PyObject* _unused, PyObject* noargs) {
   else
     Py_RETURN_FALSE;
 }
+PyObject* THPModule_setSDPUseMemEfficient(PyObject* _unused, PyObject* arg) {
+  THPUtils_assert(
+      PyBool_Check(arg),
+      "set_sdp_use_math expects a bool, "
+      "but got %s",
+      THPUtils_typename(arg));
+  at::globalContext().setSDPUseMemEfficient(arg == Py_True);
+  Py_RETURN_NONE;
+}
+PyObject* userEnabledMemEfficientSDP(PyObject* _unused, PyObject* noargs) {
+  if (at::globalContext().userEnabledMemEfficientSDP())
+    Py_RETURN_TRUE;
+  else
+    Py_RETURN_FALSE;
+}
 PyObject* THPModule_setSDPUseMath(PyObject* _unused, PyObject* arg) {
   THPUtils_assert(
       PyBool_Check(arg),
@@ -952,6 +967,14 @@ static PyMethodDef TorchMethods[] = {
      METH_NOARGS,
      nullptr},
     {"_set_sdp_use_flash", THPModule_setSDPUseFlash, METH_O, nullptr},
+    {"_get_mem_efficient_sdp_enabled",
+     userEnabledMemEfficientSDP,
+     METH_NOARGS,
+     nullptr},
+    {"_set_sdp_use_mem_efficient",
+     THPModule_setSDPUseMemEfficient,
+     METH_O,
+     nullptr},
     {"_get_math_sdp_enabled",
      THPModule_userEnabledMathSDP,
      METH_NOARGS,

From a5f95f7cbedd88ef8ef39e1c929dfa69239bc6c2 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Fri, 28 Oct 2022 16:09:25 +0000
Subject: [PATCH 0275/1922] add special case for power of 0.5 (#87912)

Workaround for https://github.com/pytorch/torchdynamo/issues/1775, and calling sqrt is better in any case, but `libdevice.pow` still for some reason doesn't work if both arguments are scalars

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @mreso, can you please check if that takes you further with diffusers

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87912
Approved by: https://github.com/desertfire
---
 test/inductor/test_torchinductor.py | 11 +++++++++++
 torch/_inductor/lowering.py         |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 326a7f1f6ce84..3e3887a5fcec6 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -2195,6 +2195,17 @@ def fn(x):
             rtol=3e-05,
         )
 
+    def test_pow3(self):
+        # power of 0.5 is special-cased, arbitrary power would still produce triton codegen error
+        def fn(x):
+            z = torch.tensor(0.123, device=self.device)
+            w = z + x
+            return torch.pow(w, 0.5)
+
+        opt = torch._dynamo.optimize("inductor")(fn)
+        input = torch.rand(())
+        self.assertTrue(same(opt(input), fn(input)))
+
     def test_glu(self):
         def fn(x):
             return aten.glu(x, -1), aten.glu(x, 1), aten.glu(x, 2)
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 6b047e843c301..a05e6d527ea9a 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -3121,6 +3121,8 @@ def pow(a, b):
         ), "Pow input must be floating point."
     if isinstance(b, float) and b == int(b):
         return pow(a, int(b))
+    elif isinstance(b, float) and b == 0.5:
+        return sqrt(a)
     elif isinstance(b, int) and b == 1:
         return a
     elif isinstance(b, int) and -32 < b < 32:

From eb20d0e84cd6f3ff89e48d2b6bd750aa5377a7d4 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Thu, 27 Oct 2022 17:12:36 -0700
Subject: [PATCH 0276/1922] Use Eager Code To Determine Conv Layout (#87305)

The logic for determine conv backend and therefore output striding is very complex. It depends on build settings, input striding/contiguity, sizes, etc. Eventually we should port that logic to the meta impl for dynamic shapes but that will require a lot more work and keeping the implementations in sync. See https://github.com/pytorch/torchdynamo/issues/1701

This is a prerequisite to removing the inductor conv stride propagation and more general fake tensor for inductor propagation. In that PR, the meta impls for cpu conv give incorrect striding which led to test failures (https://github.com/pytorch/pytorch/pull/87083).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87305
Approved by: https://github.com/ezyang
---
 aten/src/ATen/native/ConvUtils.h     |  7 ++-
 aten/src/ATen/native/Convolution.cpp | 13 ++++--
 test/test_fake_tensor.py             |  1 +
 torch/_C/__init__.pyi.in             |  5 +++
 torch/_subclasses/fake_tensor.py     | 65 +++++++++++++++++++++++++---
 torch/csrc/Module.cpp                | 63 +++++++++++++++++++++++++--
 6 files changed, 142 insertions(+), 12 deletions(-)

diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
index a31dbee2bd759..675f701c8582d 100644
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@@ -114,6 +114,7 @@ struct ConvParams {
   bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
 };
 
+// Keep in sync with py::enum_ in Module.cpp
 enum class ConvBackend {
   CudaDepthwise2d,
   CudaDepthwise3d,
@@ -165,7 +166,11 @@ TORCH_API ConvBackend select_conv_backend(
 TORCH_API ConvBackend select_conv_backend(
     const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt,
     IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation,
-    bool transposed, IntArrayRef output_padding, int64_t groups);
+    bool transposed, IntArrayRef output_padding, int64_t groups, const at::OptionalIntArrayRef bias_sizes_opt);
+
+TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input,
+    const Tensor& weight,
+    const ConvBackend backend);
 
 // ---------------------------------------------------------------------
 //
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 2dd7d515c14f9..60215801e4ce9 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -1122,7 +1122,7 @@ at::Tensor convolution_overrideable(
 ConvBackend select_conv_backend(
     const Tensor& input_r, const Tensor& weight_r, const c10::optional<Tensor>& bias_opt,
     IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_,
-    bool transposed_, IntArrayRef output_padding_, int64_t groups_) {
+    bool transposed_, IntArrayRef output_padding_, int64_t groups_, const at::OptionalIntArrayRef bias_sizes_opt) {
   c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
   const Tensor& bias = *bias_maybe_owned;
 
@@ -1155,10 +1155,10 @@ ConvBackend select_conv_backend(
     weight = view4d(weight);
   }
 
-  auto bias_sizes_opt = bias.defined() ? c10::optional<IntArrayRef>(bias.sizes()) : c10::nullopt;
+  auto bias_sizes = bias.defined() ? c10::optional<IntArrayRef>(bias.sizes()) : bias_sizes_opt;
   bool need_backward = GradMode::is_enabled() &&
       (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
-  return _select_conv_backend(input, weight, bias, bias_sizes_opt, need_backward, params);
+  return _select_conv_backend(input, weight, bias, bias_sizes, need_backward, params);
 }
 
 ConvBackend select_conv_backend(
@@ -1365,6 +1365,13 @@ static inline at::MemoryFormat determine_backend_memory_format(
   return backend_memory_format;
 }
 
+at::MemoryFormat _determine_backend_memory_format(
+    const Tensor& input,
+    const Tensor& weight,
+    const ConvBackend backend)  {
+  return determine_backend_memory_format(input, weight, backend);
+}
+
 at::Tensor _convolution(
     const Tensor& input_r, const Tensor& weight_r, const c10::optional<Tensor>& bias_r_opt,
     IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_,
diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index 50a92436f406b..0d81cdf10f82f 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -329,6 +329,7 @@ def fn(
                         self.assertTrue(isinstance(ten, FakeTensor))
                     self.assertEqual(ten.device.type, 'cuda')
 
+    @skipIfRocm
     @unittest.skipIf(not RUN_CUDA, "requires cuda")
     def test_fallback_memory_prop(self):
         m = nn.Conv2d(16, 33, 3, stride=2, device="cuda", dtype=torch.half)
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 560c4a3f16738..af6734b059f44 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -846,6 +846,8 @@ def _set_conj(x: Tensor, conj: _bool) -> None: ...
 def _set_neg(x: Tensor, neg: _bool) -> None: ...
 def _set_meta_in_tls_dispatch_include(meta_in_tls: _bool) -> None: ...
 def _meta_in_tls_dispatch_include() -> _bool: ...
+def _select_conv_backend(*args, **kwargs) -> ConvBackend: ...
+def _conv_determine_backend_memory_format(input: Tensor, weight: Tensor, backend: ConvBackend) -> memory_format: ...
 def _has_storage(x: Tensor) -> _bool: ...
 def _should_allow_numbers_as_tensors(func_name: str) -> _bool: ...
 # NB: There is no Capsule type in typing, see
@@ -880,6 +882,9 @@ class _LinalgBackend:
     Cusolver: _LinalgBackend
     Magma: _LinalgBackend
 
+class ConvBackend(Enum):
+    ...
+
 # Defined in `valgrind.h` and `callgrind.h` respecitively.
 def _valgrind_supported_platform() -> _bool: ...  # NVALGRIND
 def _valgrind_toggle() -> None: ...  # CALLGRIND_TOGGLE_COLLECT
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 9d5a8d30209eb..fd121d182bb21 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -424,11 +424,58 @@ def nyi(fake_mode, func, *args, **kwargs):
     assert func not in _device_not_kwarg_ops, f"NYI: {func}"
 
 
-# Meta tensors give you the ability to run PyTorch code without having to
-# actually do computation through tensors allocated on a `meta` device.
-# Because the device is `meta`, meta tensors do not model device propagation.
-# FakeTensor extends MetaTensors to also carry an additional `fake_device`
-# which tracks devices that would have been used.
+@register_op_impl(
+    lambda func: func in (aten.convolution.default, aten.convolution_backward.default)
+)
+def conv(fake_mode, func, *args, **kwargs):
+    _, kwargs = normalize_function(
+        func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True
+    )
+    device = kwargs["input"].fake_device
+    # need to re-enable mode so the tensors report fake device
+    with fake_mode:
+        # if the input is unsqueezed is done in Convolution.cpp we get segfault
+        k = kwargs["weight"].ndim
+        if k == 3 and not kwargs["input"].is_mkldnn and not kwargs["input"].is_xpu:
+            mem_fmt = None
+        else:
+            if func is aten.convolution.default:
+                conv_backend = torch._C._select_conv_backend(**kwargs)
+            else:
+                conv_backend = torch._C._select_conv_backend(
+                    kwargs["input"],
+                    kwargs["weight"],
+                    bias=None,
+                    stride=kwargs["stride"],
+                    padding=kwargs["padding"],
+                    dilation=kwargs["dilation"],
+                    transposed=kwargs["transposed"],
+                    output_padding=kwargs["output_padding"],
+                    groups=kwargs["groups"],
+                    bias_sizes=kwargs["bias_sizes"],
+                )
+            mem_fmt = torch._C._conv_determine_backend_memory_format(
+                kwargs["input"], kwargs["weight"], conv_backend
+            )
+
+    def convert(t, mem_fmt):
+        if t is None:
+            return t
+        if mem_fmt is not None:
+            t = t.to(memory_format=mem_fmt)
+        return FakeTensor(fake_mode, t, device)
+
+    with in_kernel_invocation_manager(fake_mode):
+        out = func(**kwargs)
+
+        if func is aten.convolution.default:
+            return convert(out, mem_fmt)
+        else:
+            return (
+                convert(out[0], mem_fmt),
+                convert(out[1], mem_fmt),
+                convert(out[2], None),
+            )
 
 
 @contextlib.contextmanager
@@ -450,6 +497,14 @@ def in_kernel_invocation_manager(fake_mode):
 
 
 class FakeTensor(torch.Tensor):
+    """
+    Meta tensors give you the ability to run PyTorch code without having to
+    actually do computation through tensors allocated on a `meta` device.
+    Because the device is `meta`, meta tensors do not model device propagation.
+    FakeTensor extends MetaTensors to also carry an additional `fake_device`
+    which tracks devices that would have been used.
+    """
+
     fake_device: torch.device
     fake_mode: "FakeTensorMode"
     constant: Optional[torch.Tensor]
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 3e34b6bdca07c..9c6f2ed4177a0 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -1,3 +1,4 @@
+#include <c10/util/Optional.h>
 #include <sys/types.h>
 #include <torch/csrc/python_headers.h>
 
@@ -1376,7 +1377,9 @@ Call this whenever a new thread is created in order to propagate values from
       .value("SlowTranspose3d", at::native::ConvBackend::SlowTranspose3d)
       .value(
           "Winograd3x3Depthwise", at::native::ConvBackend::Winograd3x3Depthwise)
-      .value("Xnnpack2d", at::native::ConvBackend::Xnnpack2d);
+      .value("Xnnpack2d", at::native::ConvBackend::Xnnpack2d)
+      .value("Mps", at::native::ConvBackend::Mps)
+      .value("MpsTranspose,", at::native::ConvBackend::MpsTranspose);
 
   py_module.def(
       "_select_conv_backend",
@@ -1398,8 +1401,62 @@ Call this whenever a new thread is created in order to propagate values from
             dilation_,
             transposed_,
             output_padding_,
-            groups_);
-      });
+            groups_,
+            c10::nullopt);
+      },
+      py::arg("input"),
+      py::arg("weight"),
+      py::arg("bias"),
+      py::arg("stride"),
+      py::arg("padding"),
+      py::arg("dilation"),
+      py::arg("transposed"),
+      py::arg("output_padding"),
+      py::arg("groups"));
+
+  // overload for bias_sizes_opt/backward TODO: figure out default value
+  py_module.def(
+      "_select_conv_backend",
+      [](const at::Tensor& input,
+         const at::Tensor& weight,
+         const c10::optional<at::Tensor>& bias,
+         at::IntArrayRef stride_,
+         at::IntArrayRef padding_,
+         at::IntArrayRef dilation_,
+         bool transposed_,
+         at::IntArrayRef output_padding_,
+         int64_t groups_,
+         c10::optional<std::vector<int64_t>> bias_sizes_opt) {
+        c10::OptionalArrayRef<int64_t> ref = c10::nullopt;
+        if (bias_sizes_opt) {
+          ref = (*bias_sizes_opt);
+        }
+        return at::native::select_conv_backend(
+            input,
+            weight,
+            bias,
+            stride_,
+            padding_,
+            dilation_,
+            transposed_,
+            output_padding_,
+            groups_,
+            ref);
+      },
+      py::arg("input"),
+      py::arg("weight"),
+      py::arg("bias"),
+      py::arg("stride"),
+      py::arg("padding"),
+      py::arg("dilation"),
+      py::arg("transposed"),
+      py::arg("output_padding"),
+      py::arg("groups"),
+      py::arg("bias_sizes"));
+
+  py_module.def(
+      "_conv_determine_backend_memory_format",
+      at::native::_determine_backend_memory_format);
 
   py::enum_<at::LinalgBackend>(py_module, "_LinalgBackend")
       .value("Default", at::LinalgBackend::Default)

From 773930817620f53087a0ab17a2fa6c8d57d20c4e Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 28 Oct 2022 17:55:19 +0000
Subject: [PATCH 0277/1922] Revert "[fx][subgraph_rewriter] Change match_filter
 to be a List in replace_pattern_with_filters (#87257)"

This reverts commit 58650835bb91d927623e6bff5cc4844fbcad6368.

Reverted https://github.com/pytorch/pytorch/pull/87257 on behalf of https://github.com/weiwangmeta due to breaking internal builds/BC-breaking change
---
 test/fx/test_subgraph_rewriter.py |  6 +++---
 torch/fx/subgraph_rewriter.py     | 20 +++++++-------------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/test/fx/test_subgraph_rewriter.py b/test/fx/test_subgraph_rewriter.py
index ed6d50e44b4ac..ac3498458d600 100644
--- a/test/fx/test_subgraph_rewriter.py
+++ b/test/fx/test_subgraph_rewriter.py
@@ -773,7 +773,7 @@ def gemm_bias_mul_replacement_with_c(a, b, bias, c):
 
         self.assertEqual(repalcement_node_found, 2)
 
-    def test_replace_pattern_with_filters(self):
+    def test_replace_pattern_with_filter(self):
         class M(torch.nn.Module):
             def __init__(self):
                 super().__init__()
@@ -833,10 +833,10 @@ def num_repalcement_node_found(traced):
 
         # match with filter, should find 1 match
         traced = symbolic_trace(M())
-        matches = subgraph_rewriter.replace_pattern_with_filters(
+        matches = subgraph_rewriter.replace_pattern_with_filter(
             traced,
             BinaryOpScalarReLUPattern,
             BinaryOpScalarReLUReplacement,
-            [second_input_is_scalar])
+            second_input_is_scalar)
         self.assertEqual(len(matches), 1)
         self.assertEqual(num_repalcement_node_found(traced), 1)
diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py
index 72bb7fd373516..09e5550c5930d 100644
--- a/torch/fx/subgraph_rewriter.py
+++ b/torch/fx/subgraph_rewriter.py
@@ -8,7 +8,7 @@
 from typing import Callable, Dict, List, NamedTuple, Optional, Set
 import torch
 
-__all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filters']
+__all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filter']
 
 @compatibility(is_backward_compatible=True)
 class Match(NamedTuple):
@@ -185,11 +185,11 @@ def forward(self, x, w1, w2):
 
 # Experimental API, not backward compatible
 @compatibility(is_backward_compatible=False)
-def replace_pattern_with_filters(
+def replace_pattern_with_filter(
     gm: GraphModule,
     pattern: Callable,
     replacement: Callable,
-    match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]],  # type: ignore[name-defined]
+    match_filter: Callable[["InternalMatch", Graph, Graph], bool],  # type: ignore[name-defined]
 ) -> List[Match]:
     """
     See replace_pattern for documentation. This function is an overload with an additional match_filter argument.
@@ -200,21 +200,18 @@ def replace_pattern_with_filters(
             definition of InternalMatch.
     """
 
-    return _replace_pattern(gm, pattern, replacement, match_filters)
+    return _replace_pattern(gm, pattern, replacement, match_filter)
 
 
 def _replace_pattern(
     gm: GraphModule,
     pattern: Callable,
     replacement: Callable,
-    match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]] = None  # type: ignore[name-defined]
+    match_filter: Optional[Callable[["InternalMatch", Graph, Graph], bool]] = None  # type: ignore[name-defined]
 ) -> List[Match]:
 
     from torch.fx.passes.utils.matcher_utils import SubgraphMatcher, InternalMatch
 
-    if match_filters is None:
-        match_filters = []
-
     # Get the graphs for `gm`, `pattern`, `replacement`
     original_graph: Graph = gm.graph
     pattern_graph: Graph = symbolic_trace(pattern).graph
@@ -225,11 +222,8 @@ def _replace_pattern(
     _matches: List[InternalMatch] = matcher.match(original_graph)
 
     # Filter out matches that don't match the filter
-    _matches = [
-        m for m in _matches
-        if all(match_filter(m, original_graph, pattern_graph)
-               for match_filter in match_filters)
-    ]
+    if match_filter:
+        _matches = [m for m in _matches if match_filter(m, original_graph, pattern_graph)]
 
     replacement_placeholders = [n for n in replacement_graph.nodes if n.op == "placeholder"]
 

From 696b8482dbe462867742a803251529e64d4a972c Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Wed, 26 Oct 2022 16:56:50 -0700
Subject: [PATCH 0278/1922] [Profiler][Trivial] Add hashing struct for pairs
 and tuples. (#87668)

There is a fairly simple and commonly used hash_combine in c10/util; however in order to use it in a map we need to wrap it in a hashing struct. By defining template functions we also get recursive unpacking for free. (A later PR will want to hash a `tuple<tuple<T0, T1>, tuple<T0, T1>>`)

Differential Revision: [D40666359](https://our.internmc.facebook.com/intern/diff/D40666359/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87668
Approved by: https://github.com/slgong-fb
---
 torch/csrc/profiler/collection.cpp |  9 +--------
 torch/csrc/profiler/util.h         | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index 8bb57fda9cf48..515342a0049b0 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -821,13 +821,6 @@ trace_ptr_t addKinetoEvents(
   return trace;
 }
 
-template <typename T>
-struct PairHash {
-  size_t operator()(const std::pair<T, T>& i) {
-    return c10::get_hash(i.first, i.second);
-  }
-};
-
 void calculate_unique_tensor_ids(std::vector<result_ptr_t>& sorted_results) {
   // This task is equivilent to https://leetcode.com/problems/number-of-islands/
   // We first cluster events with a greedy index assignment, and then merge
@@ -934,7 +927,7 @@ void calculate_unique_tensor_ids(std::vector<result_ptr_t>& sorted_results) {
   // Step 2) Handle the case that the storage of a TensorImpl changed.
   // --------------------------------------------------------------------------
   using storage_id_pair_t = std::pair<storage_id_t, storage_id_t>;
-  ska::flat_hash_set<storage_id_pair_t, PairHash<storage_id_t>> same_group_set;
+  ska::flat_hash_set<storage_id_pair_t, HashCombine> same_group_set;
   {
     ska::flat_hash_map<TensorImplAddress, storage_id_t> impl_map;
     for (const auto& t : tensors) {
diff --git a/torch/csrc/profiler/util.h b/torch/csrc/profiler/util.h
index 62b71da453c80..ab0550e79caa7 100644
--- a/torch/csrc/profiler/util.h
+++ b/torch/csrc/profiler/util.h
@@ -10,6 +10,7 @@
 #include <ATen/record_function.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Optional.h>
+#include <c10/util/hash.h>
 #include <torch/csrc/Export.h>
 #include <torch/csrc/jit/frontend/source_range.h>
 
@@ -205,6 +206,23 @@ class TORCH_API GlobalStateManager {
   std::shared_ptr<T> state_;
 };
 
+struct HashCombine {
+  template <typename T0, typename T1>
+  size_t operator()(const std::pair<T0, T1>& i) {
+    return c10::get_hash((*this)(i.first), (*this)(i.second));
+  }
+
+  template <typename... Args>
+  size_t operator()(const std::tuple<Args...>& i) {
+    return c10::get_hash(i);
+  }
+
+  template <typename T>
+  size_t operator()(const T& i) {
+    return c10::get_hash(i);
+  }
+};
+
 } // namespace impl
 } // namespace profiler
 } // namespace torch

From b9b83da5f5f1bc9e6fb30f2d40ea5c53b4a76926 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Wed, 26 Oct 2022 16:56:51 -0700
Subject: [PATCH 0279/1922] [Profiler][Trivial] Move ID assignment code to
 `data_flow.cpp` (#87670)

ID assignment has become a very complex facet of the profiler. The existing code has grown organically as I've discovered various refinements and has become very difficult to understand or reason about. (With more complexity coming in https://github.com/pytorch/pytorch/pull/87133)

I want to take a step back and add some structure and additional comments to the ID assignment algorithm. Before I do, however, it's time to move it out of `collection.cpp` to a dedicated data flow file.

Differential Revision: [D40666360](https://our.internmc.facebook.com/intern/diff/D40666360/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D40666360/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87670
Approved by: https://github.com/slgong-fb
---
 build_variables.bzl                |   1 +
 torch/csrc/profiler/collection.cpp | 153 +--------------------------
 torch/csrc/profiler/data_flow.cpp  | 164 +++++++++++++++++++++++++++++
 torch/csrc/profiler/data_flow.h    |   8 ++
 4 files changed, 175 insertions(+), 151 deletions(-)
 create mode 100644 torch/csrc/profiler/data_flow.cpp

diff --git a/build_variables.bzl b/build_variables.bzl
index 12ad9730123f1..789d28bf786d7 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -134,6 +134,7 @@ libtorch_profiler_sources = [
     "torch/csrc/autograd/profiler_legacy.cpp",
     "torch/csrc/autograd/profiler_kineto.cpp",
     "torch/csrc/profiler/collection.cpp",
+    "torch/csrc/profiler/data_flow.cpp",
     "torch/csrc/profiler/kineto_shim.cpp",
     "torch/csrc/profiler/kineto_client_interface.cpp",
     "torch/csrc/profiler/orchestration/observer.cpp",
diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index 515342a0049b0..239d74cdf2801 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -21,6 +21,7 @@
 #include <c10/util/hash.h>
 #include <c10/util/overloaded.h>
 #include <torch/csrc/jit/runtime/interpreter.h>
+#include <torch/csrc/profiler/data_flow.h>
 #include <torch/csrc/profiler/kineto_shim.h>
 
 namespace torch {
@@ -821,156 +822,6 @@ trace_ptr_t addKinetoEvents(
   return trace;
 }
 
-void calculate_unique_tensor_ids(std::vector<result_ptr_t>& sorted_results) {
-  // This task is equivilent to https://leetcode.com/problems/number-of-islands/
-  // We first cluster events with a greedy index assignment, and then merge
-  // groups that overlap.
-
-  using storage_id_t = strong::type<
-      size_t,
-      struct _StorageID,
-      strong::regular,
-      strong::hashable,
-      strong::arithmetic,
-      strong::ordered>;
-
-  struct TensorStoragePair {
-    TensorImplAddress impl_;
-    storage_id_t storage_id_;
-
-    // Used to assign the result.
-    std::reference_wrapper<c10::optional<TensorID>> id_ref_;
-  };
-  std::vector<TensorStoragePair> tensors;
-
-  // Step 1) Flatten and convert storage data pointers. (Handle address reuse.)
-  // --------------------------------------------------------------------------
-  {
-    storage_id_t current_id{0};
-    ska::flat_hash_map<StorageImplData, storage_id_t> live_storage;
-    auto lookup = [&current_id, &live_storage](const StorageImplData data) {
-      auto inserted = live_storage.insert({data, current_id});
-      current_id += storage_id_t(inserted.second);
-      return inserted.first->second;
-    };
-
-    ska::flat_hash_set<storage_id_t> tensor_set;
-    auto insert_tensor = [&lookup, &tensors, &tensor_set](TensorMetadata& m) {
-      if (m.impl() && m.data_) {
-        const auto id = lookup(m.data_);
-        tensor_set.insert(id);
-        tensors.emplace_back(TensorStoragePair{m.impl(), id, m.id_});
-      }
-    };
-
-    for (auto& result : sorted_results) {
-      result->visit(c10::overloaded(
-          [&](ExtraFields<EventType::TorchOp>& torch_op) {
-            for (auto& m : torch_op.inputs_.tensor_metadata_) {
-              if (m.has_value()) {
-                insert_tensor(*m);
-              }
-            }
-          },
-          [&](ExtraFields<EventType::Allocation>& alloc_op) {
-            // We won't know which allocations are for Tensor storage yet.
-            // We'll filter after we see all of the op inputs.
-            tensors.emplace_back(TensorStoragePair{
-                TensorImplAddress(nullptr),
-                lookup(StorageImplData(alloc_op.ptr_)),
-                alloc_op.id_});
-
-            // Handle deallocation
-            if (alloc_op.alloc_size_ < 0) {
-              live_storage.erase(StorageImplData(alloc_op.ptr_));
-            }
-          },
-          [&](ExtraFields<EventType::PyCall>& py_call) {
-            // torch.nn.Module
-            if (py_call.module_.has_value()) {
-              for (auto& p : py_call.module_->parameters_) {
-                insert_tensor(p.metadata_);
-                if (p.grad_metadata_.has_value()) {
-                  insert_tensor(*p.grad_metadata_);
-                }
-              }
-            }
-
-            // torch.optim.Optimizer
-            if (py_call.optimizer_.has_value()) {
-              for (auto& p : py_call.optimizer_->parameters_) {
-                insert_tensor(p.metadata_);
-                if (p.grad_metadata_.has_value()) {
-                  insert_tensor(*p.grad_metadata_);
-                }
-                for (auto& state_i : p.state_) {
-                  insert_tensor(state_i.second);
-                }
-              }
-            }
-          },
-          [](const auto&) {}));
-    }
-
-    // Handle any allocation events which we cannot prove are for
-    // `StorageImpl`s.
-    tensors.erase(
-        std::remove_if(
-            tensors.begin(),
-            tensors.end(),
-            [&tensor_set](const auto& i) {
-              return tensor_set.find(i.storage_id_) == tensor_set.end();
-            }),
-        tensors.end());
-  }
-
-  // Step 2) Handle the case that the storage of a TensorImpl changed.
-  // --------------------------------------------------------------------------
-  using storage_id_pair_t = std::pair<storage_id_t, storage_id_t>;
-  ska::flat_hash_set<storage_id_pair_t, HashCombine> same_group_set;
-  {
-    ska::flat_hash_map<TensorImplAddress, storage_id_t> impl_map;
-    for (const auto& t : tensors) {
-      // Storage allocations / frees don't have an associated TensorImpl, so
-      // we don't want all storages to merge through nullptr.
-      if (!t.impl_) {
-        continue;
-      }
-
-      const auto it = impl_map.insert({t.impl_, t.storage_id_}).first;
-
-      // The pair needs to be sorted for the coalesce step to work properly.
-      it->second < t.storage_id_
-          ? same_group_set.insert({it->second, t.storage_id_})
-          : same_group_set.insert({t.storage_id_, it->second});
-    }
-  }
-
-  // Step 3) Coalesce groups and assign final IDs.
-  // --------------------------------------------------------------------------
-  ska::flat_hash_map<storage_id_t, size_t> id_map;
-  {
-    std::vector<storage_id_pair_t> unique_pairs;
-    for (const auto& i : same_group_set) {
-      unique_pairs.push_back(i);
-    }
-    std::sort(unique_pairs.begin(), unique_pairs.end());
-
-    size_t current_id{0};
-    for (const auto& i : unique_pairs) {
-      auto inserted = id_map.insert({i.first, current_id});
-      current_id += inserted.second;
-      id_map.insert({i.second, inserted.first->second});
-    }
-  }
-
-  // Step 4) Write back to metadata
-  // --------------------------------------------------------------------------
-  for (const auto& t : tensors) {
-    t.id_ref_.get() = TensorID(id_map.at(t.storage_id_));
-  }
-}
-
 struct ResultGreater {
   bool operator()(const result_ptr_t& a, const result_ptr_t& b) const {
     return a->endTimeNS() > b->endTimeNS();
@@ -1133,7 +984,7 @@ RecordQueue::getRecords(
   });
 
   if (config_.report_input_shapes && config_.profile_memory) {
-    calculate_unique_tensor_ids(out);
+    calculateUniqueTensorIDs(out);
   }
 
   build_tree(out);
diff --git a/torch/csrc/profiler/data_flow.cpp b/torch/csrc/profiler/data_flow.cpp
new file mode 100644
index 0000000000000..894bf96ed0f58
--- /dev/null
+++ b/torch/csrc/profiler/data_flow.cpp
@@ -0,0 +1,164 @@
+#include <torch/csrc/profiler/data_flow.h>
+
+#include <c10/util/overloaded.h>
+#include <c10/util/variant.h>
+#include <torch/csrc/profiler/collection.h>
+
+namespace torch {
+namespace profiler {
+namespace impl {
+
+void calculateUniqueTensorIDs(
+    std::vector<std::shared_ptr<Result>>& sorted_results) {
+  // This task is equivilent to https://leetcode.com/problems/number-of-islands/
+  // We first cluster events with a greedy index assignment, and then merge
+  // groups that overlap.
+
+  using storage_id_t = strong::type<
+      size_t,
+      struct _StorageID,
+      strong::regular,
+      strong::hashable,
+      strong::arithmetic,
+      strong::ordered>;
+
+  struct TensorStoragePair {
+    TensorImplAddress impl_;
+    storage_id_t storage_id_;
+
+    // Used to assign the result.
+    std::reference_wrapper<c10::optional<TensorID>> id_ref_;
+  };
+  std::vector<TensorStoragePair> tensors;
+
+  // Step 1) Flatten and convert storage data pointers. (Handle address reuse.)
+  // --------------------------------------------------------------------------
+  {
+    storage_id_t current_id{0};
+    ska::flat_hash_map<StorageImplData, storage_id_t> live_storage;
+    auto lookup = [&current_id, &live_storage](const StorageImplData data) {
+      auto inserted = live_storage.insert({data, current_id});
+      current_id += storage_id_t(inserted.second);
+      return inserted.first->second;
+    };
+
+    ska::flat_hash_set<storage_id_t> tensor_set;
+    auto insert_tensor = [&lookup, &tensors, &tensor_set](TensorMetadata& m) {
+      if (m.impl() && m.data_) {
+        const auto id = lookup(m.data_);
+        tensor_set.insert(id);
+        tensors.emplace_back(TensorStoragePair{m.impl(), id, m.id_});
+      }
+    };
+
+    for (auto& result : sorted_results) {
+      result->visit(c10::overloaded(
+          [&](ExtraFields<EventType::TorchOp>& torch_op) {
+            for (auto& m : torch_op.inputs_.tensor_metadata_) {
+              if (m.has_value()) {
+                insert_tensor(*m);
+              }
+            }
+          },
+          [&](ExtraFields<EventType::Allocation>& alloc_op) {
+            // We won't know which allocations are for Tensor storage yet.
+            // We'll filter after we see all of the op inputs.
+            tensors.emplace_back(TensorStoragePair{
+                TensorImplAddress(nullptr),
+                lookup(StorageImplData(alloc_op.ptr_)),
+                alloc_op.id_});
+
+            // Handle deallocation
+            if (alloc_op.alloc_size_ < 0) {
+              live_storage.erase(StorageImplData(alloc_op.ptr_));
+            }
+          },
+          [&](ExtraFields<EventType::PyCall>& py_call) {
+            // torch.nn.Module
+            if (py_call.module_.has_value()) {
+              for (auto& p : py_call.module_->parameters_) {
+                insert_tensor(p.metadata_);
+                if (p.grad_metadata_.has_value()) {
+                  insert_tensor(*p.grad_metadata_);
+                }
+              }
+            }
+
+            // torch.optim.Optimizer
+            if (py_call.optimizer_.has_value()) {
+              for (auto& p : py_call.optimizer_->parameters_) {
+                insert_tensor(p.metadata_);
+                if (p.grad_metadata_.has_value()) {
+                  insert_tensor(*p.grad_metadata_);
+                }
+                for (auto& state_i : p.state_) {
+                  insert_tensor(state_i.second);
+                }
+              }
+            }
+          },
+          [](const auto&) {}));
+    }
+
+    // Handle any allocation events which we cannot prove are for
+    // `StorageImpl`s.
+    tensors.erase(
+        std::remove_if(
+            tensors.begin(),
+            tensors.end(),
+            [&tensor_set](const auto& i) {
+              return tensor_set.find(i.storage_id_) == tensor_set.end();
+            }),
+        tensors.end());
+  }
+
+  // Step 2) Handle the case that the storage of a TensorImpl changed.
+  // --------------------------------------------------------------------------
+  using storage_id_pair_t = std::pair<storage_id_t, storage_id_t>;
+  ska::flat_hash_set<storage_id_pair_t, HashCombine> same_group_set;
+  {
+    ska::flat_hash_map<TensorImplAddress, storage_id_t> impl_map;
+    for (const auto& t : tensors) {
+      // Storage allocations / frees don't have an associated TensorImpl, so
+      // we don't want all storages to merge through nullptr.
+      if (!t.impl_) {
+        continue;
+      }
+
+      const auto it = impl_map.insert({t.impl_, t.storage_id_}).first;
+
+      // The pair needs to be sorted for the coalesce step to work properly.
+      it->second < t.storage_id_
+          ? same_group_set.insert({it->second, t.storage_id_})
+          : same_group_set.insert({t.storage_id_, it->second});
+    }
+  }
+
+  // Step 3) Coalesce groups and assign final IDs.
+  // --------------------------------------------------------------------------
+  ska::flat_hash_map<storage_id_t, size_t> id_map;
+  {
+    std::vector<storage_id_pair_t> unique_pairs;
+    for (const auto& i : same_group_set) {
+      unique_pairs.push_back(i);
+    }
+    std::sort(unique_pairs.begin(), unique_pairs.end());
+
+    size_t current_id{0};
+    for (const auto& i : unique_pairs) {
+      auto inserted = id_map.insert({i.first, current_id});
+      current_id += inserted.second;
+      id_map.insert({i.second, inserted.first->second});
+    }
+  }
+
+  // Step 4) Write back to metadata
+  // --------------------------------------------------------------------------
+  for (const auto& t : tensors) {
+    t.id_ref_.get() = TensorID(id_map.at(t.storage_id_));
+  }
+}
+
+} // namespace impl
+} // namespace profiler
+} // namespace torch
diff --git a/torch/csrc/profiler/data_flow.h b/torch/csrc/profiler/data_flow.h
index 7afd0204d41db..378f37707988e 100644
--- a/torch/csrc/profiler/data_flow.h
+++ b/torch/csrc/profiler/data_flow.h
@@ -1,7 +1,10 @@
 #pragma once
 
+#include <memory>
+
 #include <ATen/core/TensorBody.h>
 #include <c10/core/TensorImpl.h>
+#include <c10/macros/Macros.h>
 #include <c10/util/strong_type.h>
 #include <c10/util/variant.h>
 
@@ -74,6 +77,11 @@ class WeakTensor {
   c10::weak_intrusive_ptr<c10::TensorImpl> weak_self_;
 };
 
+struct Result;
+using result_ptr_t = std::shared_ptr<Result>;
+
+void calculateUniqueTensorIDs(std::vector<result_ptr_t>& sorted_results);
+
 } // namespace impl
 } // namespace profiler
 } // namespace torch

From 84ddc58e2f72c710920036e859e84f2c892869a8 Mon Sep 17 00:00:00 2001
From: Eddie Yan <eddiey@nvidia.com>
Date: Fri, 28 Oct 2022 19:33:42 +0000
Subject: [PATCH 0280/1922] Allow 64bit indexing for channels-last upsample2d
 on CUDA (#87901)

#81665

CC @ngimel @ptrblck
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87901
Approved by: https://github.com/ngimel
---
 .../src/ATen/native/cuda/UpSampleNearest2d.cu | 22 +++++++++----------
 test/test_nn.py                               | 10 +++++++++
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu
index 8aa4f68aeda64..f223655daca15 100644
--- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu
@@ -94,13 +94,13 @@ __global__ void upsample_nearest2d_nhwc_out_frame(
     float width_scale,
     const size_t out_numel) {
 
-  const int index = blockIdx.x * blockDim.x + threadIdx.x;
+  const int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (index < out_numel) {
-    const int c = index % channels;
-    const int w2 = (index / channels) % width2;
-    const int h2 = (index / channels / width2) % height2;
-    const int n = index / channels / width2 / height2;
+    const auto c = index % channels;
+    const auto w2 = (index / channels) % width2;
+    const auto h2 = (index / channels / width2) % height2;
+    const auto n = index / channels / width2 / height2;
 
     const size_t h1 = height1 == height2 ? h2 : nn_compute_source_index_fn(height_scale, h2, height1);
     const size_t w1 = width1 == width2 ? w2 : nn_compute_source_index_fn(width_scale, w2, width1);
@@ -240,13 +240,13 @@ static void upsample_nearest2d_out_cuda_template(
         output.is_contiguous(memory_format)) {
     at::Tensor input = input_.contiguous(at::MemoryFormat::ChannelsLast);
 
-    TORCH_CHECK(input.numel() < std::numeric_limits<int>::max(),
-      "upsample_nearest_nhwc only supports input tensors with less than INT_MAX elements");
-    TORCH_CHECK(output.numel() < std::numeric_limits<int>::max(),
-      "upsample_nearest_nhwc only supports output tensors with less than INT_MAX elements");
+    TORCH_CHECK(input.numel() < std::numeric_limits<int64_t>::max(),
+      "upsample_nearest_nhwc only supports input tensors with less than 2^63 - 1 elements");
+    TORCH_CHECK(output.numel() < std::numeric_limits<int64_t>::max(),
+      "upsample_nearest_nhwc only supports output tensors with less than 2^63 - 1 elements");
 
-    const int num_kernels = output.numel();
-    const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
+    const int64_t num_kernels = output.numel();
+    const int64_t num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, 1024);
 
     AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_nhwc_out_frame", [&] {
       const scalar_t* idata = input.data_ptr<scalar_t>();
diff --git a/test/test_nn.py b/test/test_nn.py
index 5fbdcacd641d8..8cf6e5e56555e 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -14922,6 +14922,16 @@ def test_upsamplingBicubic2d_aa_correctness(self, device, memory_format):
         t_out = F.interpolate(t_in, size=(2, 2), mode="bicubic", align_corners=False, antialias=True)
         self.assertEqual(expected_out, t_out)
 
+    @onlyCUDA
+    @dtypes(torch.half)
+    @largeTensorTest('40GB')
+    def test_upsampling_64bit_indexing_channels_last(self, device, dtype):
+        x = torch.rand((32, 64, 512, 512), dtype=dtype, device=device)
+        out = torch.nn.functional.interpolate(x.to(memory_format=torch.channels_last), scale_factor=2, mode='nearest')
+        out_ref = torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')
+        del x
+        self.assertTrue(torch.allclose(out, out_ref))
+
     def _slow_masked_softmax(self, input, mask):
         exp = torch.exp(input)
         exp = exp * mask

From 75bed7d40302b2fb24c1f7060b293f151e6fd765 Mon Sep 17 00:00:00 2001
From: Andrey Talman <atalman@fb.com>
Date: Fri, 28 Oct 2022 19:55:31 +0000
Subject: [PATCH 0281/1922] Advance nightly docker to 11.6 (#87858)

Fixes following:
https://github.com/pytorch/pytorch/actions/runs/3242695506/jobs/5316334351
crash in Docker builds introduced by: #82682

The PR seems to introduce some changes not compatible with cuda 11.3 which is used by our Docker builds

This is a reland of original pr: https://github.com/pytorch/pytorch/pull/86941 (Created this new PR to start fresh)
Which was reverted because conda install, installed wrong version of pytorch. It installed pytorch for cuda 11.3 still rather then 11.6

This should be fixed now with Release 1.13
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87858
Approved by: https://github.com/seemethere, https://github.com/malfet, https://github.com/izaitsevfb
---
 .github/scripts/build_publish_nightly_docker.sh | 2 +-
 Dockerfile                                      | 7 ++++---
 docker.Makefile                                 | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.github/scripts/build_publish_nightly_docker.sh b/.github/scripts/build_publish_nightly_docker.sh
index db84704aa3e4c..c60e31eec500e 100644
--- a/.github/scripts/build_publish_nightly_docker.sh
+++ b/.github/scripts/build_publish_nightly_docker.sh
@@ -3,7 +3,7 @@
 set -xeuo pipefail
 
 PYTORCH_DOCKER_TAG=$(git describe --tags --always)-devel
-CUDA_VERSION=11.3.1
+CUDA_VERSION=11.6.2
 
 # Build PyTorch nightly docker
 make -f docker.Makefile \
diff --git a/Dockerfile b/Dockerfile
index 815a9108ce946..e49e0a44e816b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -59,17 +59,18 @@ RUN --mount=type=cache,target=/opt/ccache \
 
 FROM conda as conda-installs
 ARG PYTHON_VERSION=3.8
-ARG CUDA_VERSION=11.3
+ARG CUDA_VERSION=11.6
 ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=pytorch-nightly
-ENV CONDA_OVERRIDE_CUDA=${CUDA_VERSION}
 # Automatically set by buildx
+RUN /opt/conda/bin/conda update -y conda
 RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION}
 ARG TARGETPLATFORM
+
 # On arm64 we can only install wheel packages
 RUN case ${TARGETPLATFORM} in \
          "linux/arm64")  pip install --extra-index-url https://download.pytorch.org/whl/cpu/ torch torchvision torchtext ;; \
-         *)              /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch torchvision torchtext "cudatoolkit=${CUDA_VERSION}"  ;; \
+         *)              /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch torchvision torchtext "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)"  ;; \
     esac && \
     /opt/conda/bin/conda clean -ya
 RUN /opt/conda/bin/pip install torchelastic
diff --git a/docker.Makefile b/docker.Makefile
index 0768f6ecf6ed8..9f433af435ed6 100644
--- a/docker.Makefile
+++ b/docker.Makefile
@@ -8,7 +8,7 @@ $(warning WARNING: No docker user found using results from whoami)
 DOCKER_ORG                = $(shell whoami)
 endif
 
-CUDA_VERSION              = 11.3.1
+CUDA_VERSION              = 11.6.2
 CUDNN_VERSION             = 8
 BASE_RUNTIME              = ubuntu:18.04
 BASE_DEVEL                = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04
@@ -18,7 +18,7 @@ CUDA_CHANNEL              = nvidia
 # The conda channel to use to install pytorch / torchvision
 INSTALL_CHANNEL          ?= pytorch
 
-PYTHON_VERSION           ?= 3.8
+PYTHON_VERSION           ?= 3.10
 PYTORCH_VERSION          ?= $(shell git describe --tags --always)
 # Can be either official / dev
 BUILD_TYPE               ?= dev

From 00e9bfcb36b96797cc66af54ac5910d3fc1b4915 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 28 Oct 2022 07:07:44 -0700
Subject: [PATCH 0282/1922] Force people to call from_meta_and_device directly
 (#87903)

It was pretty hard to tell at call site if I was doing device meta
convert or not.  This gets rid of the "dual" API and forces people
to call the method manually for the device case.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87903
Approved by: https://github.com/eellison, https://github.com/albanD
---
 test/test_fake_tensor.py         |  2 +-
 torch/_subclasses/fake_tensor.py | 41 ++++++++++++++------------------
 2 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index 0d81cdf10f82f..272e87575b8de 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -505,7 +505,7 @@ def test_memoized_conversion_from_meta(self):
         x = torch.rand(2, 2).to(device="meta")
         mode = FakeTensorMode()
         converter = mode.fake_tensor_converter
-        self.assertTrue(converter(mode, x, "cpu") is converter(mode, x, "cpu"))
+        self.assertTrue(converter.from_meta_and_device(mode, x, "cpu") is converter.from_meta_and_device(mode, x, "cpu"))
 
     def test_separate_tensor_storages_view(self):
         x = torch.rand(2, 2, 2)
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index fd121d182bb21..3e5cbdb652264 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -238,7 +238,11 @@ def from_real_tensor(self, fake_mode, t, make_constant=False, shape_env=None):
         self.set_tensor_memo(t, out)
         return out
 
+    # If you specify the device, it MUST be a meta tensor.
     def from_meta_and_device(self, fake_mode, t, device):
+        assert (
+            t.device.type == "meta"
+        ), f"tensor's device must be `meta`, got {t.device.type} instead"
         maybe_memo = self._get_memo(t)
         if maybe_memo is not None:
             return maybe_memo
@@ -246,29 +250,15 @@ def from_meta_and_device(self, fake_mode, t, device):
         self.set_tensor_memo(t, out)
         return out
 
-    # There are two ways to call this.  First, you can have manually constructed
-    # a meta tensor and you need to turn it into a fake tensor.  In that case,
-    # pass a meta tensor and a device argument.  Alternately, you can have a
-    # real tensor that you need to convert into a fake tensor; in that case,
-    # omit the device.
+    # You can have a real tensor that you need to convert into a fake tensor.
+    # If you have a meta tensor already, call from_meta_and_device.
     #
-    # The disallowed case: if you specify the device, it MUST be a meta tensor.
-    # However, you're allowed to pass a meta tensor to be turned into a fake
+    # You're allowed to pass a meta tensor to be turned into a fake
     # tensor; although an odd thing to do, this can occur if you're doing
-    # cross ref testing and the inner test is already operating on meta tensors
-    def __call__(
-        self, fake_mode, t, device=None, *, make_constant=False, shape_env=None
-    ):
-        if device is None:
-            return self.from_real_tensor(
-                fake_mode, t, make_constant, shape_env=shape_env
-            )
-        else:
-            assert make_constant is False
-            assert (
-                t.device.type == "meta"
-            ), f"tensor's device must be `meta`, got {t.device.type} instead"
-            return self.from_meta_and_device(fake_mode, t, device)
+    # cross ref testing and the inner test is already operating on meta tensors.
+    # You must have created the FakeTensorMode with allow_meta == True
+    def __call__(self, fake_mode, t, *, make_constant=False, shape_env=None):
+        return self.from_real_tensor(fake_mode, t, make_constant, shape_env=shape_env)
 
 
 op_implementations = []
@@ -320,7 +310,10 @@ def non_kwarg_to(fake_mode, func, *args, **kwargs):
     new_kwargs["device"] = torch.device("meta")
     inp = new_kwargs.pop("input")
     r = func(inp, **new_kwargs)
-    return fake_mode.fake_tensor_converter(fake_mode, r, out_device)
+    # TODO: I think this does the wrong thing if r is inp
+    return fake_mode.fake_tensor_converter.from_meta_and_device(
+        fake_mode, r, out_device
+    )
 
 
 # Dont default to default device handling,
@@ -950,7 +943,9 @@ def wrap(e, device=None):
                     # We thus directly convert real tensor to fake tensor.
                     return converter(self, e)
                 else:
-                    return converter(self, e, device or common_device)
+                    return converter.from_meta_and_device(
+                        self, e, device or common_device
+                    )
             else:
                 return e
 

From 19029b8e5dc9e2921660bc9b18a2cbc270dae067 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 28 Oct 2022 07:07:44 -0700
Subject: [PATCH 0283/1922] Convert MetaConverter's tensor memo into a weak
 value dictionary. (#87911)

This is in preparation for unifying fake tensor converter and meta converter's memo tables.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87911
Approved by: https://github.com/eellison
---
 test/test_fake_tensor.py        | 4 ++--
 test/test_meta.py               | 3 ++-
 torch/_subclasses/meta_utils.py | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index 272e87575b8de..2588556c80f7f 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -554,10 +554,10 @@ def test_dead_key(self):
         converter = FakeTensorConverter()
         x_conv = converter(mode, x)
         self.assertEqual(len(converter.tensor_memo), 1)
-        self.assertEqual(len(converter.meta_converter.tensor_memo), 1)
+        x_conv2 = converter(mode, x)
+        assert x_conv2 is x_conv
         del x
         self.assertEqual(len(converter.tensor_memo), 0)
-        self.assertEqual(len(converter.meta_converter.tensor_memo), 0)
 
     def test_no_active_mode(self):
         with FakeTensorMode() as mode:
diff --git a/test/test_meta.py b/test/test_meta.py
index 99f78ddfb40c1..1abe4cd2cda75 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -171,9 +171,10 @@ def test_weakref(self):
         m.check_for_expired_weak_storages()
         self.assertEqual(len(m.storage_memo), 0)
         li = []
+        r = []
         for i in range(4):
             li.append(torch.rand([i]))
-            m(li[-1])
+            r.append(m(li[-1]))
         self.assertEqual(len(m.tensor_memo), 4)
         del li
         self.assertEqual(len(m.tensor_memo), 0)
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 3e1040d037f0d..51231811631bc 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -56,7 +56,7 @@ def __eq__(self, other):
 class MetaConverter:
     def __init__(self):
         self.storage_memo = {}
-        self.tensor_memo = {}
+        self.tensor_memo: weakref.WeakValueDictionary = weakref.WeakValueDictionary()
         self.maybe_storages_to_delete = []
         self.check_expired_frequency = 128
         self.check_expired_count = 0

From 719798b19dcc3f8e433000bfb1525014c927121c Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 28 Oct 2022 02:02:25 +0000
Subject: [PATCH 0284/1922] [FSDP] New fix for composing with other module
 wrappers (#87950)

We change `.module` to pass through `ActivationWrapper` directly to the inner wrapped module. This should fix the state dict issues.

Given the invariant that `.module` always returns the inner wrapped module, FSDP always registers the `FlatParameter` on the inner wrapped module, regardless of if there is an intermediate `ActivationWrapper` or not. This avoids casing on whether `ActivationWrapper` is added before or after FSDP construction.

This PR removes the added unit test in `test_fsdp_misc.py` for changing the wrapped module because I would rather not complicated `_lazy_init()` logic just to support that kind of adversarial behavior. The user should not be swapping out the wrapped module arbitrarily or deleting the `FlatParameter`. I mainly had those tests to make sure that all branches of the code I added was correct.

Differential Revision: [D40799961](https://our.internmc.facebook.com/intern/diff/D40799961)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87950
Approved by: https://github.com/zhaojuanmao
---
 test/distributed/fsdp/test_fsdp_misc.py       | 94 -------------------
 test/distributed/fsdp/test_fsdp_state_dict.py | 63 +++++++++++++
 .../_checkpoint/checkpoint_wrapper.py         |  2 +
 .../fsdp/fully_sharded_data_parallel.py       | 44 ++-------
 4 files changed, 71 insertions(+), 132 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index b57f5a1a94da2..98cd6488ae5e7 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -9,22 +9,12 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-    _CHECKPOINT_PREFIX,
-    apply_activation_checkpointing,
-    checkpoint_wrapper,
-    CheckpointImpl,
-)
 from torch.distributed.fsdp import (
     CPUOffload,
     FlatParameter,
     FullyShardedDataParallel as FSDP,
     ShardingStrategy,
 )
-from torch.distributed.fsdp.fully_sharded_data_parallel import (
-    FLAT_PARAM,
-    FSDP_WRAPPED_MODULE,
-)
 from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
@@ -499,90 +489,6 @@ def __init__(self, rank):
                 fsdp, process_group=self.process_group, assert_fn=self.assertEqual
             )
 
-    @skip_if_lt_x_gpu(2)
-    def test_change_wrapped_module_after_ctor(self):
-        """
-        Tests changing an FSDP instance's wrapped module after the FSDP
-        constructor.
-        """
-        dist.set_debug_level(dist.DebugLevel.DETAIL)
-
-        class Model(nn.Module):
-            def __init__(self) -> None:
-                super().__init__()
-                self.seq1 = nn.Sequential(
-                    nn.Linear(5, 5),
-                    nn.Linear(5, 5),
-                )
-                self.seq2 = nn.Sequential(nn.Linear(5, 5))
-                self.lin = nn.Linear(5, 5)
-                self.relu = nn.ReLU()
-
-            def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return self.lin(self.relu(self.seq2(self.relu(self.seq1(x)))))
-
-        def get_fsdp_model():
-            fsdp_kwargs = {"use_orig_params": False}
-            model = Model().cuda()
-            model.seq1 = FSDP(model.seq1, **fsdp_kwargs)
-            model.seq2[0] = FSDP(model.seq2[0], **fsdp_kwargs)
-            model = FSDP(model, **fsdp_kwargs)
-            return model
-
-        # Wrap with `CheckpointWrapper` *after* FSDP construction
-        model = get_fsdp_model()
-        non_reentrant_wrapper = functools.partial(
-            checkpoint_wrapper,
-            offload_to_cpu=False,
-            checkpoint_impl=CheckpointImpl.NO_REENTRANT,
-        )
-        apply_activation_checkpointing(
-            model,
-            checkpoint_wrapper_fn=non_reentrant_wrapper,
-            check_fn=lambda submodule: isinstance(submodule, nn.Linear),
-        )
-
-        # Check that `seq2[0]` only has a single `FlatParameter` registered and
-        # that it has the `CheckpointWrapper` prefix in its FQN since it was
-        # registered to the `Linear` wrapped module in the FSDP constructor and
-        # only wrapped with `CheckpointWrapper` after
-        seq2_0_named_params = list(model.seq2[0].named_parameters())
-        self.assertEqual(len(seq2_0_named_params), 1)
-        self.assertTrue(type(seq2_0_named_params[0][1]) is FlatParameter)
-        self.assertTrue(_CHECKPOINT_PREFIX in seq2_0_named_params[0][0])
-
-        # Trigger the re-registration via `_lazy_init()`, and check for a
-        # warning, which is only emitted for DETAIL
-        with self.assertWarnsRegex(
-            UserWarning,
-            "The FSDP wrapped module changed from Linear.*to CheckpointWrapper",
-        ):
-            model._lazy_init()
-
-        # Check that now the `FlatParameter` is registered to the
-        # `CheckpointWrapper`, which is now the new wrapped module
-        seq2_0_named_params = list(model.seq2[0].named_parameters())
-        self.assertEqual(len(seq2_0_named_params), 1)
-        self.assertTrue(type(seq2_0_named_params[0][1]) is FlatParameter)
-        self.assertFalse(_CHECKPOINT_PREFIX in seq2_0_named_params[0][0])
-        self.assertFalse(isinstance(model.seq2[0].module, nn.Linear))
-
-        # Check that replacing a module *after* FSDP construction errors
-        model = get_fsdp_model()
-        # NOTE: Setting `model.seq2[0].module = nn.Linear(3, 3)` does not save
-        # to the FSDP instance's `module` attribute since `module` is a
-        # property, meaning that it would not actually change the wrapped
-        # module, so we use `setattr()` like in `_recursive_wrap()`.
-        setattr(model.seq2[0], FSDP_WRAPPED_MODULE, nn.Linear(3, 3))
-        with self.assertRaisesRegex(RuntimeError, "are invalid behavior"):
-            model._lazy_init()
-
-        # Check that deleting the `FlatParameter` errors
-        model = get_fsdp_model()
-        delattr(model.seq2[0].module, FLAT_PARAM)
-        with self.assertRaisesRegex(RuntimeError, "are invalid behavior"):
-            model._lazy_init()
-
 
 instantiate_parametrized_tests(TestFSDPMisc)
 
diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py
index b8cbae5c2270e..133405033730d 100644
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@@ -284,6 +284,69 @@ def apply_ac_to_linears(model) -> None:
                 model_new.load_state_dict(state_dict, strict=True)
                 self._compare_models(model, model_new, self.assertEqual)
 
+    @skip_if_lt_x_gpu(2)
+    @parametrize("state_dict_type", _UNFLATTENED_STATE_DICT_IMPLS)
+    @parametrize("rank0_only_and_offload", [False, True])
+    def test_state_dict_with_manual_ac_wrapper(
+        self,
+        state_dict_type: str,
+        rank0_only_and_offload: bool,
+    ):
+        """
+        Tests saving and loading a state dict for a model manually wrapped with
+        ``FSDP(CheckpointWrapper(module))``, where the ``CheckpointWrapper`` is
+        wrapped before FSDP.
+
+        TODO: Investigate why the test above does not cover everything in this
+        test and de-duplicate afterwards.
+        """
+        if state_dict_type == "sharded_state_dict" and rank0_only_and_offload:
+            return  # not supported
+        model_ac = TransformerWithSharedParams.init(
+            self.process_group,
+            FSDPInitMode.NO_FSDP,
+            CUDAInitMode.CUDA_BEFORE,
+        )
+        # Manually wrap FSDP without AC
+        model_no_ac = deepcopy(model_ac)
+        for i, layer in enumerate(model_no_ac.transformer.encoder.layers):
+            model_no_ac.transformer.encoder.layers[i] = FSDP(layer)
+        for i, layer in enumerate(model_no_ac.transformer.decoder.layers):
+            model_no_ac.transformer.decoder.layers[i] = FSDP(layer)
+        model_no_ac.transformer = FSDP(model_no_ac.transformer)
+
+        # Manually wrap FSDP with AC as `FSDP(CheckpointWrapper(module))`
+        for i, layer in enumerate(model_ac.transformer.encoder.layers):
+            layer = checkpoint_wrapper(layer)
+            model_ac.transformer.encoder.layers[i] = FSDP(layer)
+        for i, layer in enumerate(model_ac.transformer.decoder.layers):
+            layer = checkpoint_wrapper(layer)
+            model_ac.transformer.decoder.layers[i] = FSDP(layer)
+        model_ac.transformer = FSDP(model_ac.transformer)
+
+        # Save, load, and compare the two models
+        with self._get_state_dict_mgr(
+            model_no_ac, state_dict_type, rank0_only_and_offload
+        ):
+            state_dict_no_ac = model_no_ac.state_dict()
+        with self._get_state_dict_mgr(
+            model_ac, state_dict_type, rank0_only_and_offload
+        ):
+            state_dict_ac = model_ac.state_dict()
+        self.assertEqual(state_dict_ac.keys(), state_dict_no_ac.keys())
+        if rank0_only_and_offload:
+            state_dict_no_ac = self._broadcast_state_dict(model_no_ac, state_dict_no_ac)
+            state_dict_ac = self._broadcast_state_dict(model_ac, state_dict_ac)
+        with self._get_state_dict_mgr(
+            model_no_ac, state_dict_type, rank0_only_and_offload
+        ):
+            model_no_ac.load_state_dict(state_dict_no_ac)
+        with self._get_state_dict_mgr(
+            model_ac, state_dict_type, rank0_only_and_offload
+        ):
+            model_ac.load_state_dict(state_dict_ac)
+        self._compare_models(model_ac, model_no_ac, self.assertEqual)
+
     @skip_if_lt_x_gpu(2)
     @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS)
     def test_state_dict_with_shared_parameters(self, state_dict_type):
diff --git a/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py b/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
index 9e72fb6a21de9..35f8acf2cd089 100644
--- a/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
+++ b/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
@@ -8,6 +8,8 @@
 from torch.distributed.utils import _pack_kwargs, _replace_by_prefix, _unpack_kwargs
 from torch.utils.checkpoint import checkpoint as torch_utils_checkpoint
 
+# TODO: Refactor `_CHECKPOINT_PREFIX` to include the trailing '.' like FSDP
+_CHECKPOINT_WRAPPED_MODULE = "_checkpoint_wrapped_module"
 _CHECKPOINT_PREFIX = "_checkpoint_wrapped_module"
 
 class CheckpointImpl(Enum):
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 21b7787df766c..0fd601969c9a5 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -34,6 +34,8 @@
 from torch.distributed import ProcessGroup
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     _CHECKPOINT_PREFIX,
+    _CHECKPOINT_WRAPPED_MODULE,
+    ActivationWrapper,
 )
 from torch.distributed.algorithms._comm_hooks import default_hooks, LOW_PRECISION_HOOKS
 from torch.distributed.distributed_c10d import _get_default_group
@@ -1634,6 +1636,10 @@ def module(self) -> nn.Module:
         """
         Returns the wrapped module (like :class:`DistributedDataParallel`).
         """
+        # FSDP's `.module` must refer to the innermost wrapped module when
+        # composing with other module wrappers in order for state dict to work
+        if isinstance(self._fsdp_wrapped_module, ActivationWrapper):
+            return getattr(self._fsdp_wrapped_module, _CHECKPOINT_WRAPPED_MODULE)
         return self._fsdp_wrapped_module
 
     @property
@@ -1884,44 +1890,6 @@ def _lazy_init(self) -> None:
         # to non-root instances
         inconsistent_limit_all_gathers = False
         for fsdp_module in self.fsdp_modules(self):
-            if not fsdp_module._use_orig_params and fsdp_module._has_params:
-                # Check if the wrapped module changed after construction
-                # (e.g. applying the activation checkpointing wrapper) and
-                # if so, de-register the `FlatParameter` from the old
-                # wrapped module and register it to the new wrapped module
-                # NOTE: The `FlatParameter`'s FQN metadata is not updated, so
-                # any added wrappers must clean their prefixes from FQNs.
-                flat_param = fsdp_module._handles[0].flat_param
-                target_submodule = None
-                target_name = None
-                for submodule in fsdp_module.modules():
-                    for param_name, param in submodule._parameters.items():
-                        if flat_param is param:  # found registered `FlatParameter`
-                            target_submodule = submodule
-                            target_name = param_name
-                            break
-                    if target_submodule is not None:
-                        break
-                if (
-                    target_submodule is not None
-                    and target_submodule is not fsdp_module.module
-                ):
-                    assert target_name is not None
-                    if fsdp_module._debug_level == dist.DebugLevel.DETAIL:
-                        warnings.warn(
-                            "The FSDP wrapped module changed from "
-                            f"{target_submodule} to {fsdp_module.module} on "
-                            f"rank {fsdp_module.rank}. {fsdp_module}"
-                        )
-                    target_submodule._parameters.pop(target_name)  # de-register
-                    fsdp_module._register_flat_param()  # re-register
-                elif target_submodule is None:
-                    raise RuntimeError(
-                        "Either the FSDP wrapped module was removed from "
-                        "the model or its `FlatParameter` was manually "
-                        f"de-registered on rank {fsdp_module.rank}. Both of "
-                        f"these are invalid behavior. {fsdp_module}"
-                    )
             if fsdp_module is not self:
                 # Relax the assert for non-root FSDP instances in case the
                 # nested initialized module is wrapped again in FSDP later (e.g.

From 195959233dc4478b43d78ec3719e69d7b016766d Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Fri, 28 Oct 2022 21:33:53 +0000
Subject: [PATCH 0285/1922] Remove excess exception logging for minifier,
 cleanup backend failure exception format (#87537)

Fixes https://github.com/pytorch/torchdynamo/issues/1376

Ensures exceptions are printed only in one place, once.

implements some of the ideas from https://github.com/pytorch/torchdynamo/issues/1754
- Attaches a field to the exception which indicates that it's minified, a usage message is printed if this field is present

cc @jansel @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @lezcano @fdrocha
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87537
Approved by: https://github.com/anijain2305
---
 torch/_dynamo/convert_frame.py    | 52 ++++++++++++++++++++-----------
 torch/_dynamo/debug_utils.py      | 11 +++----
 torch/_dynamo/eval_frame.py       | 51 +++++++++++++-----------------
 torch/_dynamo/exc.py              |  8 ++---
 torch/_dynamo/output_graph.py     |  5 +--
 torch/_dynamo/symbolic_convert.py |  8 +++--
 6 files changed, 69 insertions(+), 66 deletions(-)

diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index 0ebf3b93ce727..ea92e024b3dff 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -190,17 +190,6 @@ def has_tensor(obj):
 def format_error_msg(exc, code, record_filename=None, frame=None):
     msg = os.linesep * 2
 
-    def replay_record_msg():
-        if (
-            config.replay_record_enabled
-            and hasattr(exc, "exec_record")
-            and record_filename is not None
-        ):
-            return f"\nLast frame execution written to {record_filename}. To run only this frame while debugging, run\
- {config.dynamo_import}.replay('{record_filename}').\n"
-        else:
-            return ""
-
     if config.verbose:
         msg = format_bytecode(
             "WON'T CONVERT", code.co_name, code.co_filename, code.co_firstlineno, code
@@ -225,22 +214,42 @@ def replay_record_msg():
                 )
             )
 
-        msg += replay_record_msg()
-
     else:
         msg = f"WON'T CONVERT {code.co_name} {code.co_filename}\
  line {code.co_firstlineno} \ndue to: \n{traceback.format_exc(limit=-1)}"
 
-        if hasattr(exc, "real_stack") and len(exc.real_stack) > 0:
-            msg += f"\nfrom user code:\n {''.join(traceback.format_list([exc.real_stack[-1]]))}"
+    return msg
+
 
-        msg += replay_record_msg()
+def augment_exc_message(exc, msg="\n"):
+    if hasattr(exc, "real_stack") and len(exc.real_stack) > 0 and not config.verbose:
+        msg += f"\nfrom user code:\n {''.join(traceback.format_list([exc.real_stack[-1]]))}"
 
+    if config.replay_record_enabled and hasattr(exc, "record_filename"):
+        msg += f"\nLast frame execution written to {exc.record_filename}. To run only this frame while debugging, run\
+ {config.dynamo_import}.replay('{exc.record_filename}').\n"
+
+    msg += f"\nSet {config.dynamo_import}.config.verbose=True for more information\n"
+
+    if hasattr(exc, "inner_exception") and hasattr(
+        exc.inner_exception, "minifier_path"
+    ):
         msg += (
-            f"\nSet {config.dynamo_import}.config.verbose=True for more information\n"
+            f"\nMinifier script written to {exc.inner_exception.minifier_path}. Run"
+            "this script to find the smallest traced graph which reproduces this error.\n"
         )
+
+    if not config.suppress_errors:
+        msg += (
+            "\n\n"
+            "You can suppress this exception and fall back to eager by setting:\n"
+            "    torchdynamo.config.suppress_errors = True\n"
+        )
+
     msg += "=" * 10
-    return msg
+    old_msg = "" if len(exc.args) == 0 else exc.args[0]
+    new_msg = old_msg + msg
+    exc.args = (new_msg,) + exc.args[1:]
 
 
 def exception_handler(e, code, frame=None):
@@ -248,8 +257,13 @@ def exception_handler(e, code, frame=None):
     if hasattr(e, "exec_record"):
         record_filename = gen_record_file_name(e, code)
         write_record_to_file(record_filename, e.exec_record)
+        e.record_filename = record_filename
 
-    log.error(format_error_msg(e, code, record_filename, frame))
+    augment_exc_message(e)
+    # Only log the exception if we are going to suppress it
+    # if aren't suppressing it, a higher level except block will handle it
+    if config.suppress_errors:
+        log.error(format_error_msg(e, code, record_filename, frame))
 
 
 def convert_frame_assert(
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 0ece930d1d13b..a89f71eac4ef6 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -796,9 +796,8 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                     run_fwd_maybe_bwd(compiled_gm, example_inputs)
                 except Exception as exc:
                     log.warning(
-                        "Compiled Fx GraphModule failed with following error. Setting up minifier."
+                        "Compiled Fx GraphModule failed. Creating script to minify the error."
                     )
-                    log.exception(exc)
                     if config.repro_level == 1:
                         dump_state_fn = functools.partial(
                             dump_backend_state, compiler_name=compiler_name
@@ -812,9 +811,10 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                             example_inputs,
                             compiler_name,
                         )
-                    raise ValueError(
-                        f"Issue detected. Repro at {get_minifier_repro_path()}."
+                    exc.minifier_path = os.path.join(
+                        minifier_dir(), "minifier_launcher.py"
                     )
+                    raise
         else:
             compiled_gm = compiler_fn(gm, example_inputs, **kwargs)
 
@@ -840,9 +840,8 @@ def dynamo_minifier_backend(gm, example_inputs, compiler_name):
     except Exception as exc:
         orig_failure = str(exc)
         log.warning(
-            "Compiled Fx GraphModule failed with following error. Starting minifier."
+            "Compiled Fx GraphModule failed. Creating script to minify the error."
         )
-        log.exception(exc)
         dump_state_fn = functools.partial(
             dump_backend_state, compiler_name=compiler_name
         )
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 947cc4108b8f7..de3d74cd89c28 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -221,35 +221,28 @@ def __init__(self):
 def catch_errors_wrapper(callback):
     @functools.wraps(callback)
     def catch_errors(frame, cache_size):
-        try:
-            if frame.f_lasti >= 0 or skipfiles.check(frame.f_code.co_filename):
-                log.debug(f"skipping {frame.f_code.co_name} {frame.f_code.co_filename}")
-                return None
-            if (
-                frame.f_code.co_filename == "<string>"
-                and frame.f_code.co_name == "__new__"
-            ):
-                # nametuple constructor
-                return None
-            if config.optimize_ddp:
-                ddp_module = DistributedDataParallel._get_active_ddp_module()
-                if ddp_module:
-                    with compile_lock:
-                        ddp_optimizer = DDPOptimizer(
-                            bucket_bytes_cap=ddp_module.bucket_bytes_cap,
-                            parameters_to_ignore=ddp_module.parameters_to_ignore,
-                            backend_compile_fn=callback._torchdynamo_orig_callable,
-                        )
-                        hijacked_callback = convert_frame.convert_frame(
-                            ddp_optimizer.compile_fn, guard_export_fn=None
-                        )
-                        return hijacked_callback(frame, cache_size)
-
-            with compile_lock:
-                return callback(frame, cache_size)
-        except Exception:
-            log.exception("Error while processing frame")
-            raise
+        if frame.f_lasti >= 0 or skipfiles.check(frame.f_code.co_filename):
+            log.debug(f"skipping {frame.f_code.co_name} {frame.f_code.co_filename}")
+            return None
+        if frame.f_code.co_filename == "<string>" and frame.f_code.co_name == "__new__":
+            # nametuple constructor
+            return None
+        if config.optimize_ddp:
+            ddp_module = DistributedDataParallel._get_active_ddp_module()
+            if ddp_module:
+                with compile_lock:
+                    ddp_optimizer = DDPOptimizer(
+                        bucket_bytes_cap=ddp_module.bucket_bytes_cap,
+                        parameters_to_ignore=ddp_module.parameters_to_ignore,
+                        backend_compile_fn=callback._torchdynamo_orig_callable,
+                    )
+                    hijacked_callback = convert_frame.convert_frame(
+                        ddp_optimizer.compile_fn, guard_export_fn=None
+                    )
+                    return hijacked_callback(frame, cache_size)
+
+        with compile_lock:
+            return callback(frame, cache_size)
 
     catch_errors._torchdynamo_orig_callable = callback
     return catch_errors
diff --git a/torch/_dynamo/exc.py b/torch/_dynamo/exc.py
index 80a2a75712ab4..b55e5e122c4e8 100644
--- a/torch/_dynamo/exc.py
+++ b/torch/_dynamo/exc.py
@@ -40,12 +40,8 @@ class BackendCompilerFailed(TorchDynamoException):
     def __init__(self, backend_fn, inner_exception):
         self.backend_name = getattr(backend_fn, "__name__", "?")
         self.inner_exception = inner_exception
-        super().__init__(
-            f"{self.backend_name} raised {type(inner_exception).__name__}: {inner_exception}"
-            "\n\n"
-            "You can suppress this exception and fall back to eager by setting:\n"
-            "    torchdynamo.config.suppress_errors = True"
-        )
+        msg = f"{self.backend_name} raised {type(inner_exception).__name__}: {inner_exception}"
+        super().__init__(msg)
 
 
 class Unsupported(TorchDynamoException):
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 861798b78e81b..9dd9a713a25cd 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -326,6 +326,7 @@ def compile_subgraph(
             and len(set(stack_values)) == len(stack_values)
             and self.side_effects.is_empty()
         ):
+
             # optimization to generate better code in a common case
             self.add_output_instructions(
                 self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
@@ -434,10 +435,6 @@ def call_user_compiler(self, gm):
             _step_logger()(logging.INFO, f"done compiler function {name}")
             assert callable(compiled_fn), "compiler_fn did not return callable"
         except Exception as e:
-            log.warning("-" * 40 + "\n")
-            log.warning("TORCHDYNAMO: backend compiler failed\n")
-            log.warning(e, exc_info=True)
-            log.warning("-" * 40 + "\n")
             compiled_fn = gm.forward
             raise BackendCompilerFailed(self.compiler_fn, e) from e
         return compiled_fn
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 4031a976f52d6..8f4eed446f3c9 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -36,7 +36,7 @@
     unique_id,
 )
 from .codegen import PyCodegen
-from .exc import unimplemented, Unsupported
+from .exc import BackendCompilerFailed, unimplemented, Unsupported
 from .guards import GuardBuilder
 from .output_graph import GraphCompileReason, OutputGraph
 from .replay_record import DummyModule, ExecutionRecorder
@@ -320,7 +320,10 @@ def step(self):
             if not hasattr(self, inst.opname):
                 unimplemented(f"missing: {inst.opname}")
             getattr(self, inst.opname)(inst)
+
             return inst.opname != "RETURN_VALUE"
+        except BackendCompilerFailed:
+            raise
         except Unsupported as exc:
             exc.real_stack.append(self.frame_summary())
             if self.empty_checkpoint():
@@ -349,10 +352,11 @@ def run(self):
                 and self.step()
             ):
                 pass
+        except BackendCompilerFailed:
+            raise
         except Exception as e:
             if config.replay_record_enabled:
                 e.exec_record = self.exec_recorder.get_record()
-
             raise
         finally:
             # Cleanup the outputGraph to delete the held tensors. We perform the

From b8fd8ff78343631894bbae2962c5e8e3a97fbe6d Mon Sep 17 00:00:00 2001
From: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
Date: Fri, 28 Oct 2022 14:37:55 -0700
Subject: [PATCH 0286/1922] ci: Switch merge / revert flow to our own infra
 (#88009)

---
 .github/workflows/revert.yml   | 2 +-
 .github/workflows/trymerge.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml
index 2a2fff27044ea..6e66cfd9cdc2a 100644
--- a/.github/workflows/revert.yml
+++ b/.github/workflows/revert.yml
@@ -7,7 +7,7 @@ on:
 jobs:
   do_revert:
     name: try_revert_pr_${{ github.event.client_payload.pr_num }}
-    runs-on: linux.20_04.4x
+    runs-on: linux.large.ephemeral
     env:
         GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
     steps:
diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml
index 3d1d92967d885..02c4b40f92d71 100644
--- a/.github/workflows/trymerge.yml
+++ b/.github/workflows/trymerge.yml
@@ -7,7 +7,7 @@ on:
 jobs:
   do_merge:
     name: try_merge_pr_${{ github.event.client_payload.pr_num }}
-    runs-on: linux.20_04.4x
+    runs-on: linux.large.ephemeral
     env:
         GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
     steps:

From 4b0974177ac8c0e21e4bd7b14aa17bae9310385a Mon Sep 17 00:00:00 2001
From: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
Date: Fri, 28 Oct 2022 22:05:11 +0000
Subject: [PATCH 0287/1922] Update build scripts for ninja and ROCm5.3 install
 (#87505)

cc @jeffdaily @sunway513 @ROCmSupport
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87505
Approved by: https://github.com/seemethere
---
 .circleci/docker/build.sh               |  4 ++++
 .circleci/docker/common/install_base.sh |  1 +
 .circleci/docker/common/install_rocm.sh | 22 ++++++++++++++++++++--
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh
index ec2dfe8cb60ce..61d9b73d73dfb 100755
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@@ -250,6 +250,10 @@ case "$image" in
     fi
     if [[ "$image" == *rocm* ]]; then
       extract_version_from_image_name rocm ROCM_VERSION
+      NINJA_VERSION=1.9.0
+    fi
+    if [[ "$image" == *centos7* ]]; then
+      NINJA_VERSION=1.10.2
     fi
     if [[ "$image" == *gcc* ]]; then
       extract_version_from_image_name gcc GCC_VERSION
diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh
index 5bca9f6dd3335..84835d6de50d7 100755
--- a/.circleci/docker/common/install_base.sh
+++ b/.circleci/docker/common/install_base.sh
@@ -69,6 +69,7 @@ install_ubuntu() {
     vim \
     jq \
     libtool \
+    vim \
     unzip \
     gdb
 
diff --git a/.circleci/docker/common/install_rocm.sh b/.circleci/docker/common/install_rocm.sh
index 51c8402aa3787..7ad0c4f123e1c 100644
--- a/.circleci/docker/common/install_rocm.sh
+++ b/.circleci/docker/common/install_rocm.sh
@@ -29,7 +29,12 @@ install_ubuntu() {
     if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
         # Add amdgpu repository
         UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
-        local amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu"
+        local amdgpu_baseurl
+        if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
+        else
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu"
+        fi
         echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
     fi
 
@@ -38,6 +43,10 @@ install_ubuntu() {
         ROCM_REPO="xenial"
     fi
 
+    if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
+        ROCM_REPO="${UBUNTU_VERSION_NAME}"
+    fi
+
     # Add rocm repository
     wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
     local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
@@ -78,7 +87,16 @@ install_centos() {
 
   if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
       # Add amdgpu repository
-      local amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64"
+      local amdgpu_baseurl
+      if [[ $OS_VERSION == 9 ]]; then
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/9.0/main/x86_64"
+      else
+        if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
+        else
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64"
+        fi
+      fi
       echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
       echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
       echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo

From ea5487e349ddced82e57534568af0d2f4a38683c Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 28 Oct 2022 02:02:25 +0000
Subject: [PATCH 0288/1922] [AC] Add trailing "." to  `_CHECKPOINT_PREFIX` like
 FSDP (#87951)

This is for consistency with FSDP.
- `_FSDP_WRAPPED_MODULE` and `_CHECKPOINT_WRAPPED_MODULE` are exactly the wrapped module variable name, meaning you can call `getattr(module, _FSDP_WRAPPED_MODULE)` or `getattr(module, _CHECKPOINT_WRAPPED_MODULE)`.
- `_FSDP_PREFIX` and `_CHECKPOINT_PREFIX` include the trailing `"."` and are only used for FQNs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87951
Approved by: https://github.com/zhaojuanmao
---
 test/distributed/fsdp/test_fsdp_optim_state.py        |  4 ++--
 .../algorithms/_checkpoint/checkpoint_wrapper.py      | 11 +++++------
 torch/distributed/fsdp/fully_sharded_data_parallel.py |  7 ++-----
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_optim_state.py b/test/distributed/fsdp/test_fsdp_optim_state.py
index 5fe75ee309fa5..883ef7285cfe5 100644
--- a/test/distributed/fsdp/test_fsdp_optim_state.py
+++ b/test/distributed/fsdp/test_fsdp_optim_state.py
@@ -8,7 +8,7 @@
 import torch
 from torch import distributed as dist
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-    _CHECKPOINT_PREFIX,
+    _CHECKPOINT_WRAPPED_MODULE,
     apply_activation_checkpointing,
 )
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
@@ -610,7 +610,7 @@ def test_full_optim_state_dict_keys(self):
         self.assertEqual(optim_state_dict["state"].keys(), state_dict.keys())
         # Check that checkpointing prefix was indeed stripped.
         for key in optim_state_dict["state"]:
-            self.assertNotIn(_CHECKPOINT_PREFIX, key)
+            self.assertNotIn(_CHECKPOINT_WRAPPED_MODULE, key)
 
     @skip_if_lt_x_gpu(2)
     def test_full_optim_state_dict_nested_invalid(self):
diff --git a/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py b/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
index 35f8acf2cd089..8aa4a1875ab87 100644
--- a/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
+++ b/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
@@ -8,9 +8,8 @@
 from torch.distributed.utils import _pack_kwargs, _replace_by_prefix, _unpack_kwargs
 from torch.utils.checkpoint import checkpoint as torch_utils_checkpoint
 
-# TODO: Refactor `_CHECKPOINT_PREFIX` to include the trailing '.' like FSDP
 _CHECKPOINT_WRAPPED_MODULE = "_checkpoint_wrapped_module"
-_CHECKPOINT_PREFIX = "_checkpoint_wrapped_module"
+_CHECKPOINT_PREFIX = _CHECKPOINT_WRAPPED_MODULE + "."
 
 class CheckpointImpl(Enum):
     REENTRANT = auto()
@@ -55,10 +54,10 @@ def named_parameters(
     ) -> Iterator[Tuple[str, torch.nn.Parameter]]:
         """
         Overrides :meth:`named_parameters()` to intercept parameter names and
-        remove all occurrences of _CHECKPOINT_PREFIX.
+        remove all occurrences of ``_CHECKPOINT_PREFIX``.
         """
         for param_name, param in super().named_parameters(*args, **kwargs):
-            yield param_name.replace(f"{_CHECKPOINT_PREFIX}.", ""), param
+            yield param_name.replace(_CHECKPOINT_PREFIX, ""), param
 
     @staticmethod
     def _post_state_dict_hook(
@@ -75,7 +74,7 @@ def _post_state_dict_hook(
         checkpoint-wrapped modules as this class adds the prefix back before
         loading the state_dict.
         """
-        _replace_by_prefix(state_dict, f"{prefix}{_CHECKPOINT_PREFIX}.", prefix)
+        _replace_by_prefix(state_dict, f"{prefix}{_CHECKPOINT_PREFIX}", prefix)
         return state_dict
 
     @staticmethod
@@ -91,7 +90,7 @@ def _pre_load_state_dict_hook(
         prefix so that non-checkpointed modules can be loaded into
         checkpoint_wrapper modules properly.
         """
-        _replace_by_prefix(state_dict, prefix, prefix + f"{_CHECKPOINT_PREFIX}.")
+        _replace_by_prefix(state_dict, prefix, prefix + f"{_CHECKPOINT_PREFIX}")
 
 
 class OffloadWrapper(ActivationWrapper):
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 0fd601969c9a5..373e20b99a4b8 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -27,7 +27,6 @@
 
 import torch
 import torch.distributed as dist
-import torch.distributed.algorithms._checkpoint.checkpoint_wrapper as checkpoint_wrapper
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd import Variable
@@ -2272,9 +2271,7 @@ def _convert_to_wrapped_module_name(self, module_name: str) -> str:
             module_name = f"{module_name}."
         # Activation checkpoint adds a prefix that has to be
         # removed as well.
-        module_name = module_name.replace(
-            f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", ""
-        )
+        module_name = module_name.replace(_CHECKPOINT_PREFIX, "")
         return module_name
 
     @property
@@ -4395,5 +4392,5 @@ def clean_tensor_name(tensor_name: str) -> str:
     # as it increases coupling between CheckpointWrapper and FSDP. This is also not
     # scalable for additional wrapped modules, we should come up with a general solution
     # for this issue.
-    tensor_name = tensor_name.replace(_CHECKPOINT_PREFIX + ".", "")
+    tensor_name = tensor_name.replace(_CHECKPOINT_PREFIX, "")
     return tensor_name

From 2bb5b3af66bafb802015fb93814da2bc8eeda462 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
Date: Fri, 28 Oct 2022 15:12:31 -0700
Subject: [PATCH 0289/1922] Revert "ci: Switch merge / revert flow to our own
 infra" (#88016)

---
 .github/workflows/revert.yml   | 2 +-
 .github/workflows/trymerge.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/revert.yml b/.github/workflows/revert.yml
index 6e66cfd9cdc2a..2a2fff27044ea 100644
--- a/.github/workflows/revert.yml
+++ b/.github/workflows/revert.yml
@@ -7,7 +7,7 @@ on:
 jobs:
   do_revert:
     name: try_revert_pr_${{ github.event.client_payload.pr_num }}
-    runs-on: linux.large.ephemeral
+    runs-on: linux.20_04.4x
     env:
         GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
     steps:
diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml
index 02c4b40f92d71..3d1d92967d885 100644
--- a/.github/workflows/trymerge.yml
+++ b/.github/workflows/trymerge.yml
@@ -7,7 +7,7 @@ on:
 jobs:
   do_merge:
     name: try_merge_pr_${{ github.event.client_payload.pr_num }}
-    runs-on: linux.large.ephemeral
+    runs-on: linux.20_04.4x
     env:
         GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
     steps:

From e21798b2a2cb2e2b8ba787ccb708e39e2061944f Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Thu, 27 Oct 2022 10:49:55 -0700
Subject: [PATCH 0290/1922] [quant][improvement] Check the fixedqparam op
 qconfig based on backend_config (#87425)

Summary:
Previously we hardcoded the supported observers for fixedqparam ops, this PR changes that to take the information from BackendConfig,
this allows users to customize the support for fixed qparam ops

Test Plan:
python test/test_quantization.py TestQuantizeFx.test_change_backend_config_for_fixed_qparam_ops

Reviewers:

Subscribers:

Tasks:

Tags:

unlinked from diff since it's too hard to land
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87425
Approved by: https://github.com/andrewor14
---
 test/quantization/fx/test_quantize_fx.py      | 28 +++++++++++++++++++
 torch/ao/quantization/backend_config/utils.py |  8 ++++++
 torch/ao/quantization/fx/prepare.py           | 16 +++++++----
 3 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 3f39e4bfbbb41..84632e1e2f313 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -5263,6 +5263,34 @@ def forward(self, x):
         # make sure it runs
         m(*example_inputs)
 
+    def test_change_backend_config_for_fixed_qparam_ops(self):
+        """ Making sure we can skip validation of qconfigs for fixedqparam ops based
+        on BackendConfig
+        """
+        class M(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.tanh = torch.nn.Tanh()
+
+            def forward(self, x: torch.Tensor):
+                x = self.tanh(x)
+                return x
+
+        model = M().eval()
+        # we set a global default_qconfig, which will be ignored since the backend
+        # we defined doesn't support anything
+        # this is to make sure we don't validate the qconfig when BackendConfig does not
+        # have fixed qparam op related configurations
+        qconfig_mapping = QConfigMapping().set_global(default_qconfig)
+        backend_config = BackendConfig()
+        # make sure this runs
+        model = prepare_fx(
+            model,
+            qconfig_mapping=qconfig_mapping,
+            example_inputs=(torch.randn(1, 2, 3, 4),),
+            backend_config=backend_config
+        )
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     def setUp(self):
diff --git a/torch/ao/quantization/backend_config/utils.py b/torch/ao/quantization/backend_config/utils.py
index fc7e9aca9ff69..cdc58327fbee1 100644
--- a/torch/ao/quantization/backend_config/utils.py
+++ b/torch/ao/quantization/backend_config/utils.py
@@ -5,6 +5,7 @@
 import torch.nn.functional as F
 from .backend_config import BackendConfig, DTypeConfig
 from ..utils import Pattern
+from ..observer import _PartialWrapper
 
 __all__ = [
     "get_pattern_to_dtype_configs",
@@ -85,6 +86,13 @@ def get_root_node(node_pattern):
             root_node_getter_mapping[pattern] = config._root_node_getter
     return root_node_getter_mapping
 
+def get_fixed_qparams_op_to_overwrite_output_observer(backend_config: BackendConfig) -> Dict[Union[Callable, str], _PartialWrapper]:
+    fixed_qparam_op_to_overwrite_output_observer: Dict[Union[Callable, str], _PartialWrapper] = {}
+    for pattern, config in backend_config.configs.items():
+        if config._overwrite_output_observer is not None:
+            fixed_qparam_op_to_overwrite_output_observer[pattern] = config._overwrite_output_observer  # type: ignore[index]
+    return fixed_qparam_op_to_overwrite_output_observer
+
 def get_fusion_pattern_to_extra_inputs_getter(backend_config: BackendConfig) -> Dict[Pattern, Callable]:
     """ Get a map from fusion pattern to a function that returns extra input nodes
     from the fusion pattern, in the order required by the root node. This is optional,
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index cc6fc65bd8906..0459685edb571 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -25,7 +25,6 @@
     QConfigAny,
 )
 from ..qconfig_mapping import (
-    _FIXED_QPARAMS_OP_TO_OBSERVER,
     QConfigMapping,
 )
 from ..qconfig_mapping_utils import (
@@ -99,6 +98,7 @@
     get_pattern_to_dtype_configs,
     get_module_to_qat_module,
     get_fusion_pattern_to_root_node_getter,
+    get_fixed_qparams_op_to_overwrite_output_observer,
 )
 from ..backend_config import (
     BackendConfig,
@@ -1392,7 +1392,10 @@ def insert_observers_for_model(
 
     return results_node
 
-def _validate_fixed_qparams_qconfigs(model: GraphModule, node_name_to_qconfig: Dict[str, QConfigAny]):
+def _validate_fixed_qparams_qconfigs(
+        model: GraphModule,
+        node_name_to_qconfig: Dict[str, QConfigAny],
+        backend_config: BackendConfig):
     """
     Validate whether the correct observers are configured for fixed qparams ops in the model, if any.
     """
@@ -1402,6 +1405,8 @@ def _validate_fixed_qparams_qconfigs(model: GraphModule, node_name_to_qconfig: D
         float16_static_qconfig.activation,
     ]
     named_modules = dict(model.named_modules(remove_duplicate=False))
+    fixed_qparams_op_to_overwrite_output_observer = \
+        get_fixed_qparams_op_to_overwrite_output_observer(backend_config)
     for node in model.graph.nodes:
         if node.op == "call_function":
             module_type_or_function_or_method = node.target
@@ -1410,13 +1415,14 @@ def _validate_fixed_qparams_qconfigs(model: GraphModule, node_name_to_qconfig: D
         else:
             module_type_or_function_or_method = None
 
-        if module_type_or_function_or_method in _FIXED_QPARAMS_OP_TO_OBSERVER:
+        if module_type_or_function_or_method in fixed_qparams_op_to_overwrite_output_observer:
             bad_observer = True
             qconfig = node_name_to_qconfig.get(node.name, None)
             if qconfig is None:
                 bad_observer = False
             else:
-                for observer_ctr in allowed_observer_ctrs + [_FIXED_QPARAMS_OP_TO_OBSERVER[module_type_or_function_or_method]]:
+                for observer_ctr in allowed_observer_ctrs + [
+                        fixed_qparams_op_to_overwrite_output_observer[module_type_or_function_or_method]]:
                     if obs_or_fq_ctr_equals(
                             qconfig.activation,
                             FixedQParamsFakeQuantize.with_args(observer=observer_ctr)) or \
@@ -1603,7 +1609,7 @@ def prepare(
     equalization_node_name_to_qconfig = generate_node_name_to_qconfig(
         model, modules, model.graph, _equalization_config, node_name_to_scope)
     node_name_to_qconfig = generate_node_name_to_qconfig(model, modules, model.graph, qconfig_mapping, node_name_to_scope)
-    _validate_fixed_qparams_qconfigs(model, node_name_to_qconfig)
+    _validate_fixed_qparams_qconfigs(model, node_name_to_qconfig, backend_config)
 
     # match the patterns that will get quantized
     standalone_module_names = list(prepare_custom_config.standalone_module_names.keys())

From e950462a33b9eb6aa4c44f3c2a40b1be7dc4bce7 Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Fri, 28 Oct 2022 23:51:42 +0000
Subject: [PATCH 0291/1922] Add 'share_from_this' to 'torch::jit::Graph'
 (#87343)

Avoid passing raw pointer of 'torch::jit::Graph' to python. Otherwise, it will corrupt the
`internals::registered_instance` of pybind11, caching a holder for python w.r.t the raw
pointer of 'torch::jit::Graph', while not increasing the use count of the existing shared_ptr.

The behavior afterwards is random and probably undefined.
Most of the time it works, if the holder is deallocated timely on python side, and the
cache then cleared from `internals::registered_instance`. Things are back to normal.
Otherwise, it fails with either segfault or a runtime error of message "Unable to cast
from non-held to held instance". One of such scenarios is normally and correctly
returning a shared_ptr of that 'torch::jit::Graph' to python. Pybind finds the holder via
cache. Due to this, the shared_ptr use_count will not increase. If there is no other use
on C++ side, the graph will be freed, while python still has access, via the holder created
previously.

@t-vi had a great analysis and solution to this exact problem at #51833 which I hope
I had seen before debugging this issue... ~~I'm building the PR based on the original
commit. @t-vi please let me know if you'd prefer otherwise.~~ Sending the PR separately
due to CLA issues.

Need to check in CI if adding `enable_shared_from_this` breaks other stuff.

Fixes #51833, and CI issues in #87258, #86182.

cc @malfet, @kit1980 for changes on JIT IR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87343
Approved by: https://github.com/justinchuby, https://github.com/AllenTiTaiWang, https://github.com/malfet
---
 test/onnx/test_custom_ops.py   |  7 ++++---
 torch/csrc/jit/ir/ir.h         | 15 ++++++++++++++-
 torch/csrc/jit/passes/onnx.cpp | 19 ++++++++++++++++---
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/test/onnx/test_custom_ops.py b/test/onnx/test_custom_ops.py
index db5ddfd001140..4242d70583ba3 100644
--- a/test/onnx/test_custom_ops.py
+++ b/test/onnx/test_custom_ops.py
@@ -38,9 +38,7 @@ def forward(self, a, b):
         def symbolic_custom_add(g, self, other):
             return g.op("Add", self, other)
 
-        from torch.onnx import register_custom_op_symbolic
-
-        register_custom_op_symbolic(
+        torch.onnx.register_custom_op_symbolic(
             "custom_namespace::custom_add", symbolic_custom_add, 9
         )
 
@@ -48,6 +46,9 @@ def symbolic_custom_add(g, self, other):
         y = torch.randn(2, 3, 4, requires_grad=False)
 
         model = CustomAddModel()
+        # before fixing #51833 this used to give a PyBind error
+        # with PyTorch 1.10dev ("Unable to cast from non-held to held
+        # instance (T& to Holder<T>)")
         onnxir, _ = do_export(model, (x, y), opset_version=11)
         onnx_model = onnx.ModelProto.FromString(onnxir)
         prepared = c2.prepare(onnx_model)
diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h
index 67f878e9f7065..402dd58b00846 100644
--- a/torch/csrc/jit/ir/ir.h
+++ b/torch/csrc/jit/ir/ir.h
@@ -239,6 +239,11 @@ struct Value {
   const Node* node() const {
     return node_;
   }
+
+  /**
+   * @warning NEVER pass raw pointer of smart pointer managed Graph to Python.
+   * Check #87343 for details.
+   */
   Graph* owningGraph();
   const Graph* owningGraph() const;
   // TODO: make this more const correct
@@ -398,6 +403,10 @@ struct TORCH_API Node {
   }
   SourceRange sourceRange() const;
 
+  /**
+   * @warning NEVER pass raw pointer of smart pointer managed Graph to Python.
+   * Check #87343 for details.
+   */
   Graph* owningGraph() {
     return graph_;
   }
@@ -1049,6 +1058,10 @@ struct Block {
   const Node* param_node() const {
     return input_;
   }
+  /**
+   * @warning NEVER pass raw pointer of smart pointer managed Graph to Python.
+   * Check #87343 for details.
+   */
   Graph* owningGraph() {
     return graph_;
   }
@@ -1163,7 +1176,7 @@ struct Block {
   std::shared_ptr<Wrap<Block>> wrap_;
 };
 
-struct Graph {
+struct Graph : std::enable_shared_from_this<Graph> {
   AT_DISALLOW_COPY_AND_ASSIGN(Graph);
   friend struct Node;
   friend struct Value;
diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp
index f5e948b2cacf3..607f2ce61ada4 100644
--- a/torch/csrc/jit/passes/onnx.cpp
+++ b/torch/csrc/jit/passes/onnx.cpp
@@ -426,8 +426,16 @@ void NodeToONNX(
 
     WithInsertPoint insert_point_guard(new_block);
     WithCurrentScope scope_guard(*g, n->scope());
+
+    // IMPORTANT: NEVER pass raw pointer of smart pointer managed objects to
+    // Python. Check #87343 for details.
     py::object raw_output = onnx.attr("_run_symbolic_function")(
-        g, new_block, n, py_inputs, env, operator_export_type);
+        g->shared_from_this(),
+        new_block,
+        n,
+        py_inputs,
+        env,
+        operator_export_type);
 
     // Find new nodes that have been created by _run_symbolic_function and
     // propagate metadata
@@ -530,8 +538,11 @@ void NodeToONNX(
               opset_version,
               pyobj.attr("symbolic"),
               /* custom */ true);
+
+      // IMPORTANT: NEVER pass raw pointer of smart pointer managed objects to
+      // Python. Check #87343 for details.
       py::object raw_output = onnx.attr("_run_symbolic_method")(
-          new_block->owningGraph(),
+          new_block->owningGraph()->shared_from_this(),
           op->name(),
           pyobj.attr("symbolic"),
           py_symbolic_args);
@@ -542,8 +553,10 @@ void NodeToONNX(
       Node* n = static_cast<Node*>(op);
       n->s_(attr::name, op->name());
       // Call symbolic function
+      // IMPORTANT: NEVER pass raw pointer of smart pointer managed objects to
+      // Python. Check #87343 for details.
       py::object raw_output = onnx.attr("_run_symbolic_function")(
-          new_block->owningGraph(),
+          new_block->owningGraph()->shared_from_this(),
           new_block,
           n,
           py_symbolic_args,

From 14a3ce9a070443e2fe2160dc77ca3385b98932ae Mon Sep 17 00:00:00 2001
From: Daniil Kutz <kutz@ispras.ru>
Date: Fri, 28 Oct 2022 23:51:53 +0000
Subject: [PATCH 0292/1922] Fix bugs found by static analysis (#85705)

These PR fixes a number of bugs found by Svace static analyzer:

1. DEREF_AFTER_FREE at qnnpack_utils.h:
Pointer '&convolution->zero_buffer' is dereferenced at qnnpack_utils.h:258 after the referenced memory was deallocated at operator-delete.c:25 by passing as 1st parameter to function 'pytorch_qnnp_delete_operator' at qnnpack_utils.h:251.
2. DEREF_AFTER_NULL at impl.cpp:
After having been compared to NULL value at impl.cpp:1892, pointer 'schema' is passed as 2nd parameter in call to function 'c10::operator<<' at impl.cpp:1921, where it is dereferenced at function_schema_inl.h:13.
3. DEREF_OF_NULL  at stmt.h:
After having been compared to NULL value at stmt.h:744, pointer 'body->_M_ptr' is passed in call to function 'torch::jit::tensorexpr::malformed_input::malformed_input' at stmt.h:745, where it is dereferenced at exceptions.h:67.
4. DEREF_OF_NULL  at loopnest.h:
Pointer 'f->ptr' that can have only NULL value (checked at loopnest.cpp:1482), is passed in call to function 'torch::jit::tensorexpr::malformed_input::malformed_input' at loopnest.cpp:1483, where it is dereferenced at exceptions.h:67.
This is the same error as 3: forwarding a nullptr to malformed_input().
4. TAINTED_INT.LOOP in python_arg_parser:
Integer value 'this->size' obtained from untrusted source at python_arg_parser.cpp:118 without checking its bounds is used as a loop bound at python_arg_parser.cpp:698 by calling function 'torch::FunctionParameter::set_default_str' at python_arg_parser.cpp:133.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85705
Approved by: https://github.com/kit1980
---
 aten/src/ATen/native/quantized/cpu/QnnpackUtils.h |  5 +++--
 torch/csrc/jit/runtime/static/impl.cpp            |  2 +-
 torch/csrc/jit/tensorexpr/loopnest.cpp            | 12 ++++++------
 torch/csrc/jit/tensorexpr/stmt.h                  |  8 ++++----
 torch/csrc/utils/python_arg_parser.cpp            |  1 +
 5 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h b/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h
index 40dfad16e9c52..9c6c721657cb1 100644
--- a/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h
+++ b/aten/src/ATen/native/quantized/cpu/QnnpackUtils.h
@@ -272,8 +272,9 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
     void* zero_buffer = malloc(zero_size);
     if (zero_buffer == nullptr) {
       pytorch_qnnp_delete_operator(convolution);
-      pytorch_qnnp_log_error(
-          "failed to allocate %zu bytes for zero padding", zero_size);
+      TORCH_INTERNAL_ASSERT(
+          false, "failed to allocate %zu bytes for zero padding",
+          zero_size);
     }
     // Need to set to input zero point
     // memset(zero_buffer, input_zero_point, zero_size);
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
index 4b8015f85c0a9..3f87df14f555e 100644
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -2062,7 +2062,7 @@ bool ProcessedNode::verify_inputs_dont_overlap_outputs(bool force_check) const {
   bool skip_check = !schema ||
       ((schema->is_mutable() || !fn_->checkMemoryOverlap()) &&
        num_outputs() == 1);
-  if (!force_check && skip_check) {
+  if (!schema || (!force_check && skip_check)) {
     if (!schema) {
       VLOG(2) << "Detected that op schema is null";
       return true;
diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp
index 6b66d48fe505e..a9cab316aa3e4 100644
--- a/torch/csrc/jit/tensorexpr/loopnest.cpp
+++ b/torch/csrc/jit/tensorexpr/loopnest.cpp
@@ -1506,12 +1506,12 @@ void LoopNest::sliceHead(ForPtr f, int factor, ForPtr* head, ForPtr* tail) {
   }
 
   if (!f) {
-    throw malformed_input("sliceHead attempted on null loop", f);
+    throw malformed_input("sliceHead attempted on null loop");
   }
 
   BlockPtr p = to<Block>(f->get_parent());
   if (!p) {
-    throw malformed_input("sliceHead attempted on loop with no parent", p);
+    throw malformed_input("sliceHead attempted on loop with no parent");
   }
 
   ExprPtr head_end = alloc<Min>(
@@ -1546,12 +1546,12 @@ void LoopNest::sliceTail(ForPtr f, int factor, ForPtr* head, ForPtr* tail) {
   }
 
   if (!f) {
-    throw malformed_input("sliceTail attempted on null loop", f);
+    throw malformed_input("sliceTail attempted on null loop");
   }
 
   BlockPtr p = to<Block>(f->get_parent());
   if (!p) {
-    throw malformed_input("sliceTail attempted on loop with no parent", p);
+    throw malformed_input("sliceTail attempted on loop with no parent");
   }
 
   ExprPtr tail_start = alloc<Max>(
@@ -1585,12 +1585,12 @@ void LoopNest::splitWithTail(
     ForPtr* inner,
     ForPtr* tail) {
   if (!f) {
-    throw malformed_input("splitWithTail attempted on null loop", f);
+    throw malformed_input("splitWithTail attempted on null loop");
   }
 
   BlockPtr p = to<Block>(f->get_parent());
   if (!p) {
-    throw malformed_input("splitWithTail attempted on loop with no parent", p);
+    throw malformed_input("splitWithTail attempted on loop with no parent");
   }
 
   // Normalize the loop to simplify start and stop bound computation
diff --git a/torch/csrc/jit/tensorexpr/stmt.h b/torch/csrc/jit/tensorexpr/stmt.h
index d2894ea157e6e..37a204bfa3294 100644
--- a/torch/csrc/jit/tensorexpr/stmt.h
+++ b/torch/csrc/jit/tensorexpr/stmt.h
@@ -754,13 +754,13 @@ class TORCH_API For : public StmtNode<For> {
         stop_(stop),
         loop_options_(std::move(loop_options)) {
     if (!var) {
-      throw malformed_input("invalid Var in For loop", var);
+      throw malformed_input("invalid Var in For loop");
     } else if (!start) {
-      throw malformed_input("invalid Start in For loop", start);
+      throw malformed_input("invalid Start in For loop");
     } else if (!stop) {
-      throw malformed_input("invalid Stop in For loop", stop);
+      throw malformed_input("invalid Stop in For loop");
     } else if (!body || body->get_parent()) {
-      throw malformed_input("invalid Body in For loop", body);
+      throw malformed_input("invalid Body in For loop");
     }
 
     BlockPtr b = to<Block>(body);
diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp
index f03763f9dca38..e46e09c088299 100644
--- a/torch/csrc/utils/python_arg_parser.cpp
+++ b/torch/csrc/utils/python_arg_parser.cpp
@@ -931,6 +931,7 @@ static inline std::vector<int64_t> parse_intlist_args(
 
   // case 1. s is an int (e.g., s=2)
   if (s[0] != '{') {
+    TORCH_CHECK(size > 0, "Incorrect size of IntArrayRef: ", size);
     return std::vector<int64_t>(size, std::stol(s));
   }
 

From 7ff5af966dbbaffc8838be1b1cb7d6c881f8451f Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Fri, 28 Oct 2022 23:59:47 +0000
Subject: [PATCH 0293/1922] [CI] Report build errors in Windows build step
 (#88001)

Should make failures like https://github.com/pytorch/pytorch/actions/runs/3346715682/jobs/5543900889 much more debuggable

P.S. I don't know how to write batch, just hope its going to work

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88001
Approved by: https://github.com/seemethere
---
 .jenkins/pytorch/win-test-helpers/build_pytorch.bat | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
index 9c9c9cd64290b..1ea7912291135 100644
--- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
@@ -136,7 +136,12 @@ if "%REBUILD%" == "" (
   )
 )
 
-python setup.py bdist_wheel && sccache --show-stats && python -c "import os, glob; os.system('python -mpip install ' + glob.glob('dist/*.whl')[0] + '[opt-einsum]')" (
+python setup.py bdist_wheel
+if errorlevel 1 exit /b
+if not errorlevel 0 exit /b
+sccache --show-stats
+python -c "import os, glob; os.system('python -mpip install ' + glob.glob('dist/*.whl')[0] + '[opt-einsum]')"
+(
   if "%BUILD_ENVIRONMENT%"=="" (
     echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
   ) else (

From 1e45d5b8f03947573eb71e1a6cf7a356e54d61cf Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Sat, 29 Oct 2022 00:23:47 +0000
Subject: [PATCH 0294/1922] Disable slow-gradcheck tests (#88008)

Disable because slow-gradcheck tests take > 4 hrs and time out. Will need to figure out if and how to re-enable later.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88008
Approved by: https://github.com/seemethere, https://github.com/huydhn
---
 .github/workflows/periodic.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 58e379e0b5fd2..4eca9c890852f 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -14,6 +14,9 @@ concurrency:
 
 jobs:
   linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build:
+    # Disable because slow-gradcheck tests take > 4 hrs and time out.
+    # TODO(sdym@meta.com): investigate re-enabling slow-gradcheck tests.
+    if: false
     name: linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck
     uses: ./.github/workflows/_linux-build.yml
     with:
@@ -26,6 +29,9 @@ jobs:
         ]}
 
   linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-test:
+    # Disable because slow-gradcheck tests take > 4 hrs and time out.
+    # TODO(sdym@meta.com): investigate re-enabling slow-gradcheck tests.
+    if: false
     name: linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck
     uses: ./.github/workflows/_linux-test.yml
     needs: linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build

From 4c42bc317251e86721975bbe07382bb178e17663 Mon Sep 17 00:00:00 2001
From: Richard Zou <rzou@meta.com>
Date: Sat, 29 Oct 2022 01:21:55 +0000
Subject: [PATCH 0295/1922] Fix functorch tests (#87914)

Test Plan: - Run tests

Differential Revision: D40777145

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87914
Approved by: https://github.com/Chillee, https://github.com/osalpekar
---
 test/functorch/test_eager_transforms.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index 9361e51454787..26b64c5e70cca 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -3013,7 +3013,11 @@ def slice_weights(batched_weights, index):
         self.assertEqual(result_loss, expected_loss)
         self.assertEqual(result_weights, expected_weights)
 
-    @parametrize("dropout_layer", [nn.Dropout, nn.AlphaDropout, nn.FeatureAlphaDropout])
+    @parametrize("dropout_layer", [
+        subtest(nn.Dropout, 'Dropout'),
+        subtest(nn.AlphaDropout, 'AlphaDropout'),
+        subtest(nn.FeatureAlphaDropout, 'FeatureAlphaDropout'),
+    ])
     def test_find_learning_rate_ensembling(self, device, dropout_layer):
         # This example mimics what a user might do when trying to find the optimal learning rate. They would
         # want to run a bunch of models with the same behavior (including the same dropout!) and have them
@@ -3243,6 +3247,7 @@ def f(x: torch.Tensor) -> torch.Tensor:
         self.assertEqual(out1, out2)
         self.assertEqual(inpt1, inpt2)
 
+    @unittest.skipIf(IS_FBCODE, 'fails in fbcode')
     def test_vmap_functionalize_jvp(self, device):
 
         def f(x: torch.Tensor) -> torch.Tensor:
@@ -3431,6 +3436,7 @@ def forward(self, a_1, b_1) -> torch.Tensor:
     return index
     """)
 
+    @unittest.skipIf(IS_FBCODE, 'fails in fbcode')
     def test_functionalize_optional_tensorlist2(self, device):
 
         def f(a, b) -> torch.Tensor:

From 7bd8a8e6fa70639ef6100bb25f81c03fc6748390 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Sat, 29 Oct 2022 01:26:15 +0000
Subject: [PATCH 0296/1922] Fix typos under benchmarks, test, and tools
 directories (#87975)

This PR fixes typos in `.md` files under benchmarks, test, and tools directories
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87975
Approved by: https://github.com/kit1980
---
 benchmarks/distributed/ddp/README.md    | 2 +-
 benchmarks/instruction_counts/README.md | 2 +-
 benchmarks/operator_benchmark/README.md | 2 +-
 test/mobile/model_test/README.md        | 2 +-
 tools/code_coverage/README.md           | 6 +++---
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarks/distributed/ddp/README.md b/benchmarks/distributed/ddp/README.md
index 0bf254ee4cce2..f89aaff9809eb 100644
--- a/benchmarks/distributed/ddp/README.md
+++ b/benchmarks/distributed/ddp/README.md
@@ -158,7 +158,7 @@ Benchmark: resnext101_32x8d with batch size 32
 ```
 
 This compares throughput between `bucket_cap_mb=25` (the default) and
-`bucket_cap_mb=1` on 8 DGX machines with V100 GPUs. It confims that
+`bucket_cap_mb=1` on 8 DGX machines with V100 GPUs. It confirms that
 even for a relatively small model on machines with a very fast
 interconnect (4x 100Gb InfiniBand per machine), it still pays off to
 batch allreduce calls.
diff --git a/benchmarks/instruction_counts/README.md b/benchmarks/instruction_counts/README.md
index ed2633caba151..32071e8aa80e0 100644
--- a/benchmarks/instruction_counts/README.md
+++ b/benchmarks/instruction_counts/README.md
@@ -73,7 +73,7 @@ Timer(
 ```
 
 Moreover, because `signature` is provided we know that creation of `x` and `w`
-is part of setup, and the overall comptation uses `x` and `w` to produce `y`.
+is part of setup, and the overall computation uses `x` and `w` to produce `y`.
 As a result, we can derive TorchScript'd and AutoGrad variants as well. We can
 deduce that a TorchScript model will take the form:
 
diff --git a/benchmarks/operator_benchmark/README.md b/benchmarks/operator_benchmark/README.md
index 59918f6fab3ca..cff275d9a1f97 100644
--- a/benchmarks/operator_benchmark/README.md
+++ b/benchmarks/operator_benchmark/README.md
@@ -374,7 +374,7 @@ unary_ops_list = op_bench.op_list(
 ```
 
 #### Part 2. Create Tensors and Add Computation
-In this example, both operators share the same input so we only need to implement one TorchBenchmakrBase subclass.
+In this example, both operators share the same input so we only need to implement one TorchBenchmarkBase subclass.
 Every new subclass is required to implement 3 methods:
 * `init` is used to create tensors and set the operator name and function. In this example, the parameters to `init` are `M`, `N`, and `op_func` which have been specified in the configurations.
 * `forward` includes the operator to be tested and the computation based on the created tensors in `init`. Apart from `self`, the order of the arguments must match the entries specified in `self.inputs`.
diff --git a/test/mobile/model_test/README.md b/test/mobile/model_test/README.md
index 49b21051c655a..7e99e6763fee8 100644
--- a/test/mobile/model_test/README.md
+++ b/test/mobile/model_test/README.md
@@ -55,7 +55,7 @@ NOTE: currently Android simulator test does not generate on-the-fly models. Only
 ## Diagnose failed test
 If the simulator test is falling, that means the current change will potentially break a production model. So be careful. The detailed error message can be found in test log. If the change has to be made, make sure it doesn't break existing production models, and update the failed test model as appropriate (see the next section).
 
-You can also run these tests locally, please see the insturction in android and ios folder. Remember to generate on-the-fly test models if you want to test it locally (but don't commit these models with _temp suffix).
+You can also run these tests locally, please see the instruction in android and ios folder. Remember to generate on-the-fly test models if you want to test it locally (but don't commit these models with _temp suffix).
 ```
 python test/mobile/model_test/gen_test_model.py ios-test
 ```
diff --git a/tools/code_coverage/README.md b/tools/code_coverage/README.md
index 67adb445d053d..32fbc89e6aace 100644
--- a/tools/code_coverage/README.md
+++ b/tools/code_coverage/README.md
@@ -51,7 +51,7 @@ Great, you are ready to run the code coverage tool for the first time! Start fro
 ```
 python oss_coverage.py --run-only=atest
 ```
-This command will run `atest` binary in `build/bin/` folder and generate reoports over the entire *Pytorch* folder. You can find the reports in `profile/summary`. But you may only be interested in the `aten` folder, in this case, try:
+This command will run `atest` binary in `build/bin/` folder and generate reports over the entire *Pytorch* folder. You can find the reports in `profile/summary`. But you may only be interested in the `aten` folder, in this case, try:
 ```
 python oss_coverage.py --run-only=atest --interest-only=aten
 ```
@@ -91,9 +91,9 @@ python oss_coverage.py --run-only=atest --interest-only=c10 --summary
 
 
 **2. Run tests yourself**
-When you are developing a new feature, you may first run the tests yourself to make sure the implementation is all right and then want to learn its coverage. But sometimes the test take very long time and you don't want to wait to run it again when doing code coverage. In this case, you can use these arguments to accerate your development (make sure you build pytorch with the coverage option!):
+When you are developing a new feature, you may first run the tests yourself to make sure the implementation is all right and then want to learn its coverage. But sometimes the test take very long time and you don't want to wait to run it again when doing code coverage. In this case, you can use these arguments to accelerate your development (make sure you build pytorch with the coverage option!):
 ```
-# run tests when you are devloping a new feature, assume the the test is `test_nn.py`
+# run tests when you are developing a new feature, assume the test is `test_nn.py`
 python oss_coverage.py --run-only=test_nn.py
 # or you can run it yourself
 cd test/ && python test_nn.py

From 1287d075d1e1b6d676d1bbe0eb626ae1d0dfac4e Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Fri, 28 Oct 2022 12:33:37 -0700
Subject: [PATCH 0297/1922] Fake Tensor For (Conv) Propagation (#87641)

Resubmitting https://github.com/pytorch/pytorch/pull/87302 so it can be ghstack'd with the pr below.

Incorrect strides in any meta impl would lead to runtime assertion errors for fallback kernels, so start by just enabling it for conv.

Replaces https://github.com/pytorch/pytorch/pull/87588.

cc @jansel @lezcano @fdrocha @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87641
Approved by: https://github.com/jansel
---
 torch/_dynamo/utils.py        |  16 +++
 torch/_inductor/compile_fx.py |   3 +-
 torch/_inductor/ir.py         | 223 +++++++++++++---------------------
 3 files changed, 100 insertions(+), 142 deletions(-)

diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index ef2c1c38ea8ba..081dc49de5bcd 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -30,6 +30,11 @@
 import torch
 from torch import fx
 from torch.nn.modules.lazy import LazyModuleMixin
+from torch.utils._python_dispatch import (
+    _len_torch_dispatch_stack,
+    _pop_mode,
+    _push_mode,
+)
 
 from . import config, logging as torchdynamo_logging
 
@@ -142,6 +147,17 @@ def fmt_fn(values, item_fn=lambda x: x):
 }
 
 
+@contextmanager
+def disable_current_modes():
+    mode_len = _len_torch_dispatch_stack()
+    old_modes = [_pop_mode() for _ in range(mode_len)]
+    try:
+        yield old_modes
+    finally:
+        for mode in reversed(old_modes):
+            _push_mode(mode)
+
+
 class DuplicateWarningChecker(object):
     def __init__(self, maxsize=4096):
         self.maxsize = maxsize
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 26770b0671838..8a109f072300e 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -11,7 +11,6 @@
 
 import torch.fx
 from torch._subclasses.fake_tensor import FakeTensor
-from torch.utils._mode_utils import no_dispatch
 
 from . import config, overrides
 from .debug import DebugContext
@@ -85,7 +84,7 @@ def _step_logger():
 
 
 @DebugContext.wrap
-@no_dispatch()
+@dynamo_utils.disable_current_modes()
 def compile_fx_inner(
     gm: torch.fx.GraphModule,
     example_inputs: List[torch.Tensor],
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 156eeb11bdc7b..9836109810767 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -18,6 +18,7 @@
 import torch.fx
 import torch.utils._pytree as pytree
 from torch._prims_common import is_boolean_dtype, is_float_dtype
+from torch._subclasses.fake_tensor import FakeTensorMode
 
 from . import config, dependencies
 from .codegen.common import index_prevent_reordering
@@ -2233,6 +2234,57 @@ def copy_input(x):
         pw.realize()
         return pw
 
+    @classmethod
+    def process_kernel(cls, kernel, *args, **kwargs):
+        args_flat, args_spec = pytree.tree_flatten(args)
+
+        is_arg_tensor = []
+        tensor_args = []
+        non_tensor_args = []
+        for arg in args_flat:
+            is_arg_tensor.append(isinstance(arg, IRNode))
+            if is_arg_tensor[-1]:
+                tensor_args.append(arg)
+            else:
+                non_tensor_args.append(arg)
+
+        def unflatten_args(new_tensor_args, new_non_tensor_args):
+            new_args = []
+            it_tensors = iter(new_tensor_args)
+            it_non_tensors = iter(new_non_tensor_args)
+            for is_tensor in is_arg_tensor:
+                if is_tensor:
+                    new_args.append(next(it_tensors))
+                else:
+                    new_args.append(next(it_non_tensors))
+            return pytree.tree_unflatten(new_args, args_spec)
+
+        tensor_args = [
+            cls.require_contiguous(cls.realize_input(x)) for x in tensor_args
+        ]
+
+        # We don't have generic shape formulas, so just burn in the
+        # shapes and run an example input.
+        # TODO(jansel): replace this with dynamic shape formulas
+        example_args = []
+        for x in tensor_args:
+            size = [V.graph.sizevars.guard_static_shape(s) for s in x.get_size()]
+            stride = [
+                V.graph.sizevars.guard_static_shape(s) for s in x.get_layout().stride
+            ]
+            dtype = x.get_dtype()
+            device = x.get_device()
+            arg = torch.empty_strided(
+                size=size, stride=stride, dtype=dtype, device=device
+            ).zero_()
+            example_args.append(arg)
+
+        example_output = kernel(
+            *unflatten_args(example_args, non_tensor_args), **kwargs
+        )
+
+        return example_output, tensor_args, non_tensor_args, unflatten_args
+
     @classmethod
     def convert_to_reinterpret_view(cls, x):
         """
@@ -2844,52 +2896,12 @@ def gen_kwarg(k, v):
 
     @classmethod
     def create(cls, kernel, *args, **kwargs):
-        args_flat, args_spec = pytree.tree_flatten(args)
-
-        is_arg_tensor = []
-        tensor_args = []
-        non_tensor_args = []
-        for arg in args_flat:
-            is_arg_tensor.append(isinstance(arg, IRNode))
-            if is_arg_tensor[-1]:
-                tensor_args.append(arg)
-            else:
-                non_tensor_args.append(arg)
-
-        def unflatten_args(new_tensor_args, new_non_tensor_args):
-            new_args = []
-            it_tensors = iter(new_tensor_args)
-            it_non_tensors = iter(new_non_tensor_args)
-            for is_tensor in is_arg_tensor:
-                if is_tensor:
-                    new_args.append(next(it_tensors))
-                else:
-                    new_args.append(next(it_non_tensors))
-            return pytree.tree_unflatten(new_args, args_spec)
-
-        tensor_args = [
-            cls.require_contiguous(cls.realize_input(x)) for x in tensor_args
-        ]
-
-        # We don't have generic shape formulas, so just burn in the
-        # shapes and run an example input.
-        # TODO(jansel): replace this with dynamic shape formulas
-        example_args = []
-        for x in tensor_args:
-            size = [V.graph.sizevars.guard_static_shape(s) for s in x.get_size()]
-            stride = [
-                V.graph.sizevars.guard_static_shape(s) for s in x.get_layout().stride
-            ]
-            dtype = x.get_dtype()
-            device = x.get_device()
-            arg = torch.empty_strided(
-                size=size, stride=stride, dtype=dtype, device=device
-            ).zero_()
-            example_args.append(arg)
-
-        example_output = kernel(
-            *unflatten_args(example_args, non_tensor_args), **kwargs
-        )
+        (
+            example_output,
+            tensor_args,
+            non_tensor_args,
+            unflatten_args,
+        ) = cls.process_kernel(kernel, *args, **kwargs)
 
         if isinstance(example_output, (list, tuple)):
             packed = FallbackKernel(
@@ -3004,70 +3016,29 @@ def create(
         output_padding = tuple(output_padding_)
         assert isinstance(groups, int)
 
+        # TODO - enable FakeTensorMode for propagation more globally. incorrect stride metas for fallback
+        # kernels will lead to runtime failures
+        with FakeTensorMode():
+            output, *_ = cls.process_kernel(
+                torch.ops.aten.convolution,
+                x,
+                weight,
+                bias,
+                stride,
+                padding,
+                dilation,
+                transposed,
+                output_padding,
+                groups,
+            )
+
+        output_size = output.shape
+
         weight_shape = [
             sympy.Integer(V.graph.sizevars.guard_static_shape(s))
             for s in weight.get_size()
         ]
-
-        out_channels, in_channels1, *kernel_size = weight_shape
-        in_channels1 = in_channels1 * groups
-        if transposed:
-            out_channels, in_channels1 = in_channels1, out_channels
-
-        if bias is not None:
-            bias = cls.require_stride1(cls.realize_input(bias))
-            (bias_shape,) = [
-                sympy.Integer(V.graph.sizevars.guard_static_shape(s))
-                for s in bias.get_size()
-            ]
-            assert bias_shape == out_channels, f"{bias_shape} == {out_channels}"
-
-        if len(x.get_size()) == 1 + len(kernel_size):
-            in_channels2, *input_size = x.get_size()
-            in_channels_stride, *_ = x.get_stride()
-            output_size = []
-        else:
-            assert len(x.get_size()) == 2 + len(kernel_size)
-            batch, in_channels2, *input_size = x.get_size()
-            _, in_channels_stride, *_ = x.get_stride()
-            output_size = [batch]
-
-        V.graph.sizevars.guard_equals(in_channels1, in_channels2)
-
-        output_size.append(out_channels)
-
-        assert (
-            len(stride)
-            == len(padding)
-            == len(dilation)
-            == len(output_padding)
-            == len(kernel_size)
-            == len(input_size)
-        )
-        for i in range(len(stride)):
-            if transposed:
-                output_size.append(
-                    (input_size[i] - 1) * stride[i]
-                    - 2 * padding[i]
-                    + dilation[i] * (kernel_size[i] - 1)
-                    + output_padding[i]
-                    + 1
-                )
-            else:
-                output_size.append(
-                    IndexingDiv(
-                        input_size[i]
-                        + 2 * padding[i]
-                        - dilation[i] * (kernel_size[i] - 1)
-                        - 1
-                        + stride[i],
-                        stride[i],
-                    )
-                    + 2 * output_padding[i]
-                )
-            output_size[-1] = sympy.Integer(
-                V.graph.sizevars.guard_static_shape(output_size[-1])
-            )
+        _, _, *kernel_size = weight.get_size()
 
         # choose runtime kernel
         config_conv = config.triton.convolution
@@ -3121,41 +3092,13 @@ def create(
                 x.get_device(),
                 x.get_dtype(),
             )
+
         else:
-            output_layout_str = "torch.contiguous_format"
-            # If x or weight have one channels_last(2d or 3d) format, it will call channels_last path,
-            # which align with aten.convolutuion path(cpu only support 2d case now).
-            # TODO: after cpu 3d convolution support channels_last path, the size check can be removed.
-
-            # CUDA channels_last path depend on cudnn version, see
-            # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ConvUtils.h.
-            valid_cudnn = False
-            if (
-                torch.backends.cudnn.is_available()
-                and torch.backends.cudnn.version() >= 7603
-            ):
-                valid_cudnn = True
-
-            # TODO - We cannot use strides to identify if a tensor is
-            # channels-last for 1x1 kernels. Incorrectly identifying the
-            # channels last configuration leads to a dramatic increase in
-            # compilation time. Unfortuantely, this breaks the channels last
-            # support.
-            # valid_device = x.get_device().type == "cpu" or (
-            #     x.get_device().type == "cuda" and valid_cudnn
-            # )
-
-            valid_device = x.get_device().type == "cpu"
-
-            if (
-                valid_device
-                and len(x.get_size()) == 4
-                and (
-                    x.get_layout().is_channels_last_stride_ordered()
-                    or weight.get_layout().is_channels_last_stride_ordered()
-                )
-            ):
-                output_layout_str = "torch.channels_last"
+            output_layout_str = (
+                "torch.contiguous_format"
+                if output.is_contiguous()
+                else "torch.channels_last"
+            )
 
         if output_layout_str == "torch.channels_last":
             stride_order = [0] + list(reversed(range(1, len(kernel_size) + 1)))

From 373f2e409fa0b91b0a45e398da67f4013ea98ec8 Mon Sep 17 00:00:00 2001
From: Tovly Deutsch <tovly@meta.com>
Date: Sat, 29 Oct 2022 04:20:56 +0000
Subject: [PATCH 0298/1922] Allow caffe2 to build with fbcode/mode/mac (#87293)

Summary: The Mac contbuild builds under the `fbcode/mode/mac` which caffe2 fails to build under. This is due to that build mode enforcing protobuf v3. The caffe2 targets already account for this issue under `arvr` build modes by swapping out protobuf dependencies. They don't account for the same issue under `fbcode/mode/mac`. This diff fixes that by checking for `is_fbcode_mac` in these situations (in addition to `arvr`).

Test Plan:
```
buck build --flagfile fbsource//fbcode/mode/mac fbsource//xplat/caffe2/...
```

Reviewed By: kimishpatel

Differential Revision: D39552724

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87293
Approved by: https://github.com/kimishpatel
---
 c2_defs.bzl | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/c2_defs.bzl b/c2_defs.bzl
index fa6719a54efdb..573ba9f6ad64c 100644
--- a/c2_defs.bzl
+++ b/c2_defs.bzl
@@ -5,7 +5,7 @@ load("@fbsource//tools/build_defs:default_platform_defs.bzl", "compose_platform_
 load("@fbsource//tools/build_defs:dict_defs.bzl", "dict_defs")
 load("@fbsource//tools/build_defs:expect.bzl", "expect")
 load("@fbsource//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library")
-load("@fbsource//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode")
+load("@fbsource//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode", "is_fbcode_mode_mac")
 load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "CXX", "IOS", "MACOSX", "WINDOWS")
 load("@fbsource//tools/build_defs/apple:build_mode_defs.bzl", "is_production_build")
 load("@fbsource//tools/build_defs/apple:config_utils_defs.bzl", "STATIC_LIBRARY_IOS_CONFIG", "STATIC_LIBRARY_MAC_CONFIG", "fbobjc_configs")
@@ -380,6 +380,14 @@ def get_c2_aten_cpu_fbobjc_macosx_platform_deps():
             },
         ])
 
+def using_protobuf_v3():
+    # Consider migrating this to `read_config("protobuf", "use_v3")`
+    # The `is_fbcode_mode_mac()` clause was added rather than changing to `read_config` to minimize changes in behavior
+    return is_arvr_mode() or is_fbcode_mode_mac()
+
+def get_c2_protobuf_dep():
+    return "fbsource//third-party/protobuf:libprotobuf" if using_protobuf_v3() else "fbsource//xplat/third-party/protobuf:fb-protobuf-lite"
+
 def c2_cxx_library(**kwargs):
     args = get_c2_default_cxx_args()
     args.update(kwargs)
@@ -406,7 +414,7 @@ def c2_protobuf_rule(protos):
             protocmd = ("cp $SRCDIR/{} $SRCDIR/{} && chmod +w $SRCDIR/{} && echo \"option optimize_for = LITE_RUNTIME;\" >> $SRCDIR/{} && ".format(p, proto, proto, proto) +
                         "cp $SRCDIR/caffe2/proto/caffe2.proto $SRCDIR/caffe2.proto && chmod +w $SRCDIR/caffe2.proto && echo \"option optimize_for = LITE_RUNTIME;\" >> $SRCDIR/caffe2.proto && " +
                         "sed -i -e 's/caffe2\\/proto\\/caffe2.proto/caffe2.proto/g' $SRCDIR/{} && ".format(proto) +
-                        ("$(exe fbsource//third-party/protobuf:protoc-host) " if is_arvr_mode() else "$(exe fbsource//xplat/third-party/protobuf:protoc) --osx $(location fbsource//xplat/third-party/protobuf:protoc.Darwin) --linux $(location fbsource//xplat/third-party/protobuf:protoc.Linux) ") +
+                        ("$(exe fbsource//third-party/protobuf:protoc-host) " if using_protobuf_v3() else "$(exe fbsource//xplat/third-party/protobuf:protoc) --osx $(location fbsource//xplat/third-party/protobuf:protoc.Darwin) --linux $(location fbsource//xplat/third-party/protobuf:protoc.Linux) ") +
                         "-I $SRCDIR --cpp_out=$OUT $SRCDIR/{}".format(proto))
         buck_genrule(
             name = proto,
@@ -453,7 +461,7 @@ def c2_full_protobuf_rule(protos):
             protocmd = ("cp $SRCDIR/{} $SRCDIR/{} && ".format(p, proto) +
                         "cp $SRCDIR/caffe2/proto/caffe2.proto $SRCDIR/caffe2.proto && " +
                         "sed -i -e 's/caffe2\\/proto\\/caffe2.proto/caffe2.proto/g' $SRCDIR/{} && ".format(proto) +
-                        ("$(exe fbsource//third-party/protobuf:protoc-host) " if is_arvr_mode() else "$(exe fbsource//xplat/third-party/protobuf:protoc) --osx $(location fbsource//xplat/third-party/protobuf:protoc.Darwin) --linux $(location fbsource//xplat/third-party/protobuf:protoc.Linux) ") +
+                        ("$(exe fbsource//third-party/protobuf:protoc-host) " if using_protobuf_v3() else "$(exe fbsource//xplat/third-party/protobuf:protoc) --osx $(location fbsource//xplat/third-party/protobuf:protoc.Darwin) --linux $(location fbsource//xplat/third-party/protobuf:protoc.Linux) ") +
                         "-I $SRCDIR --cpp_out=$OUT $SRCDIR/{}".format(proto))
         buck_genrule(
             name = prefix + proto,
@@ -487,7 +495,7 @@ def libcaffe2_cxx_library(name, use_hptt, **kwargs):
         name = name,
         exported_deps = [
             "fbsource//xplat/caffe2/c10:c10",
-            "fbsource//third-party/protobuf:libprotobuf" if is_arvr_mode() else "fbsource//xplat/third-party/protobuf:fb-protobuf-lite",
+            get_c2_protobuf_dep(),
             ":caffe2_protobuf_headers",
             ":pthreadpool",
             ":common_core",

From 8915809041c68fe42b565bb000b331d54c8222b2 Mon Sep 17 00:00:00 2001
From: Iris Zhang <irisz@meta.com>
Date: Sat, 29 Oct 2022 04:38:34 +0000
Subject: [PATCH 0299/1922] [C10D][BE] Add exception handlers to c10d
 collectives function (#87643) (#87988)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87643

1. Add a decorator function exception_handlers to  c10d collectives.
2. Update test(torch/distributed/distributed_c10d.py) to include mp tests for exception_handler.

```
python3 test/distributed/test_c10d_error_logger.py
```

Test Plan: Test in OSS.

Reviewed By: H-Huang

Differential Revision: D40281632

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87988
Approved by: https://github.com/H-Huang
---
 test/distributed/test_c10d_error_logger.py | 137 ++++++++++++++++++++-
 torch/distributed/__init__.py              |   1 +
 torch/distributed/c10d_error_logger.py     |  17 ++-
 torch/distributed/distributed_c10d.py      |  62 ++++++++--
 torch/distributed/logging_handlers.py      |   4 +-
 5 files changed, 197 insertions(+), 24 deletions(-)

diff --git a/test/distributed/test_c10d_error_logger.py b/test/distributed/test_c10d_error_logger.py
index 8001f2b869d83..7c8a6241b76b5 100644
--- a/test/distributed/test_c10d_error_logger.py
+++ b/test/distributed/test_c10d_error_logger.py
@@ -1,17 +1,142 @@
 # Owner(s): ["oncall: distributed"]
 
+import json
 import logging
-import unittest
-from unittest.mock import patch
+import os
+import re
+import sys
+from functools import partial, wraps
+
+import torch
+import torch.distributed as dist
 
 from torch.distributed.c10d_error_logger import _get_or_create_logger
+from torch.distributed.distributed_c10d import exception_handler
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+from torch.testing._internal.common_distributed import MultiProcessTestCase, TEST_SKIPS
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
+
+if TEST_WITH_DEV_DBG_ASAN:
+    print(
+        "Skip dev-asan as torch + multiprocessing spawn have known issues",
+        file=sys.stderr,
+    )
+    sys.exit(0)
+
+BACKEND = dist.Backend.NCCL
+WORLD_SIZE = min(4, max(2, torch.cuda.device_count()))
+
+
+def with_comms(func=None):
+    if func is None:
+        return partial(
+            with_comms,
+        )
+
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        if BACKEND == dist.Backend.NCCL and torch.cuda.device_count() < self.world_size:
+            sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
+        self.dist_init()
+        func(self)
+        self.destroy_comms()
+
+    return wrapper
+
+
+class C10dErrorLoggerTest(MultiProcessTestCase):
+    def setUp(self):
+        super(C10dErrorLoggerTest, self).setUp()
+        os.environ["WORLD_SIZE"] = str(self.world_size)
+        os.environ["BACKEND"] = BACKEND
+        self._spawn_processes()
+
+    @property
+    def device(self):
+        return (
+            torch.device(self.rank)
+            if BACKEND == dist.Backend.NCCL
+            else torch.device("cpu")
+        )
+
+    @property
+    def world_size(self):
+        return WORLD_SIZE
+
+    @property
+    def process_group(self):
+        return dist.group.WORLD
 
-class C10dErrorLoggerTest(unittest.TestCase):
+    def destroy_comms(self):
+        # Wait for all ranks to reach here before starting shutdown.
+        dist.barrier()
+        dist.destroy_process_group()
 
-    @patch("torch.distributed.c10d_error_logger._get_logging_handler")
-    def test_get_or_create_logger(self, logging_handler_mock):
-        logging_handler_mock.return_value = logging.NullHandler(), "NullHandler"
+    def dist_init(self):
+        dist.init_process_group(
+            backend=BACKEND,
+            world_size=self.world_size,
+            rank=self.rank,
+            init_method=f"file://{self.file_name}",
+        )
+
+        # set device for nccl pg for collectives
+        if BACKEND == "nccl":
+            torch.cuda.set_device(self.rank)
+
+    def test_get_or_create_logger(self):
         logger = _get_or_create_logger()
         self.assertIsNotNone(logger)
         self.assertEqual(1, len(logger.handlers))
         self.assertIsInstance(logger.handlers[0], logging.NullHandler)
+
+    @exception_handler
+    def failed_broadcast_raise_exception(self):
+        tensor = torch.arange(2, dtype=torch.int64)
+        dist.broadcast(tensor, self.world_size + 1)
+
+    @exception_handler
+    def failed_broadcast_not_raise_exception(self):
+        try:
+            tensor = torch.arange(2, dtype=torch.int64)
+            dist.broadcast(tensor, self.world_size + 1)
+        except Exception as exception:
+            pass
+
+    @with_comms
+    def test_exception_handler_with_dist(self) -> None:
+        with self.assertRaises(Exception) as exception:
+            self.failed_broadcast_raise_exception()
+
+        with self.assertLogs(dist._c10d_error_logger, level="DEBUG") as captured:
+            self.failed_broadcast_not_raise_exception()
+            error_msg_dict = json.loads(
+                re.search("({.+})", captured.output[0]).group(0).replace("'", '"')
+            )
+            self.assertEqual(len(error_msg_dict), 7)
+
+            self.assertIn("func_name", error_msg_dict.keys())
+            self.assertEqual("broadcast", error_msg_dict["func_name"])
+
+            self.assertIn("args", error_msg_dict.keys())
+
+            self.assertIn("backend", error_msg_dict.keys())
+            self.assertEqual("nccl", error_msg_dict["backend"])
+
+            self.assertIn("world_size", error_msg_dict.keys())
+            self.assertEqual(str(self.world_size), error_msg_dict["world_size"])
+
+            self.assertIn("global_rank", error_msg_dict.keys())
+            self.assertIn(str(dist.get_rank()), error_msg_dict["global_rank"])
+
+            # In this test case, local_rank = global_rank, since we don't have multiple processes on one node.
+            self.assertIn("local_rank", error_msg_dict.keys())
+            self.assertIn(str(dist.get_rank()), error_msg_dict["local_rank"])
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py
index 79826492d81c7..e7361c1d5dcc0 100644
--- a/torch/distributed/__init__.py
+++ b/torch/distributed/__init__.py
@@ -64,6 +64,7 @@ def is_available() -> bool:
         _reduce_scatter_base,
         _create_process_group_wrapper,
         _rank_not_in_group,
+        _c10d_error_logger,
     )
 
     from .rendezvous import (
diff --git a/torch/distributed/c10d_error_logger.py b/torch/distributed/c10d_error_logger.py
index 10605c69be476..83efad98b2b20 100644
--- a/torch/distributed/c10d_error_logger.py
+++ b/torch/distributed/c10d_error_logger.py
@@ -7,27 +7,24 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-from typing import Tuple
+from typing import List, Tuple
 
 from torch.distributed.logging_handlers import _log_handlers
 
-_c10d_error_logger = None
+__all__: List[str] = []
 
 
 def _get_or_create_logger() -> logging.Logger:
-    global _c10d_error_logger
-    if _c10d_error_logger:
-        return _c10d_error_logger
     logging_handler, log_handler_name = _get_logging_handler()
-    _c10d_error_logger = logging.getLogger(f"c10d-collectives-{log_handler_name}")
-    _c10d_error_logger.setLevel(logging.DEBUG)
+    logger = logging.getLogger(f"c10d-collectives-{log_handler_name}")
+    logger.setLevel(logging.DEBUG)
     formatter = logging.Formatter(
         "%(asctime)s %(filename)s:%(lineno)s %(levelname)s p:%(processName)s t:%(threadName)s: %(message)s"
     )
     logging_handler.setFormatter(formatter)
-    _c10d_error_logger.propagate = False
-    _c10d_error_logger.addHandler(logging_handler)
-    return _c10d_error_logger
+    logger.propagate = False
+    logger.addHandler(logging_handler)
+    return logger
 
 
 def _get_logging_handler(destination: str = "default") -> Tuple[logging.Handler, str]:
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 7de47876b5664..61b3718fb9308 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -1,6 +1,7 @@
 import itertools
 import collections.abc
 import contextlib
+import functools
 import io
 import logging
 import os
@@ -34,6 +35,7 @@
 from torch._six import string_classes
 from torch.autograd.profiler import record_function
 from .constants import default_pg_timeout
+from .c10d_error_logger import _get_or_create_logger
 from .rendezvous import register_rendezvous_handler, rendezvous  # noqa: F401
 
 __all__ = [
@@ -55,7 +57,7 @@
     'ProcessGroup', 'ReduceOp', 'ReduceOptions', 'ReduceScatterOptions',
     'ScatterOptions', 'Store', 'DebugLevel', 'get_debug_level', 'Work',
     'default_pg_timeout', 'get_group_rank', 'get_global_rank', 'get_process_group_ranks',
-    'reduce_op', 'all_gather_into_tensor', 'reduce_scatter_tensor',
+    'reduce_op', 'all_gather_into_tensor', 'reduce_scatter_tensor', 'exception_handler'
 ]
 
 _MPI_AVAILABLE = True
@@ -120,6 +122,8 @@ def _export_c_types():
     _UCC_AVAILABLE = False
 
 logger = logging.getLogger(__name__)
+global _c10d_error_logger
+_c10d_error_logger = _get_or_create_logger()
 
 PG_WRAPPER_STORE_PREFIX = "pg_wrapper"
 
@@ -1307,6 +1311,34 @@ def batch_isend_irecv(p2p_op_list):
     return reqs
 
 
+def exception_handler(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception as error:
+            if is_initialized():
+                error_msg_dict = {
+                    "func_name": f"{func.__name__}",
+                    "args": f"{args}, {kwargs}",
+                    "backend": f"{get_backend()}",
+                    "world_size": f"{get_world_size()}",
+                    "global_rank": f"{get_rank()}",
+                    "local_rank": f"{get_rank(kwargs.get('group'))}",
+                    "error": f"{error}",
+                }
+            else:
+                error_msg_dict = {
+                    "func_name": f"{func.__name__}",
+                    "args": f"{args}, {kwargs}",
+                    "error": f"{error}",
+                }
+            _c10d_error_logger.debug(error_msg_dict)
+            raise
+    return wrapper
+
+
+@exception_handler
 def broadcast_multigpu(tensor_list, src, group=None, async_op=False, src_tensor=0):
     """
     Broadcasts the tensor to the whole group with multiple GPU tensors
@@ -1366,6 +1398,7 @@ def broadcast_multigpu(tensor_list, src, group=None, async_op=False, src_tensor=
         work.wait()
 
 
+@exception_handler
 def broadcast(tensor, src, group=None, async_op=False):
     """
     Broadcasts the tensor to the whole group.
@@ -1407,7 +1440,7 @@ def broadcast(tensor, src, group=None, async_op=False):
     else:
         work.wait()
 
-
+@exception_handler
 def all_reduce_multigpu(tensor_list, op=ReduceOp.SUM, group=None, async_op=False):
     r"""
     Reduces the tensor data across all machines in such a way that all get
@@ -1468,7 +1501,7 @@ def all_reduce_multigpu(tensor_list, op=ReduceOp.SUM, group=None, async_op=False
     else:
         work.wait()
 
-
+@exception_handler
 def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False):
     """
     Reduces the tensor data across all machines in such a way that all get
@@ -1540,7 +1573,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False):
     else:
         work.wait()
 
-
+@exception_handler
 def all_reduce_coalesced(tensors, op=ReduceOp.SUM, group=None, async_op=False):
     """
     WARNING: at this time individual shape checking is not implemented across nodes.
@@ -1603,7 +1636,7 @@ def all_reduce_coalesced(tensors, op=ReduceOp.SUM, group=None, async_op=False):
     else:
         work.wait()
 
-
+@exception_handler
 def reduce_multigpu(
     tensor_list, dst, op=ReduceOp.SUM, group=None, async_op=False, dst_tensor=0
 ):
@@ -1665,7 +1698,7 @@ def reduce_multigpu(
     else:
         work.wait()
 
-
+@exception_handler
 def reduce(tensor, dst, op=ReduceOp.SUM, group=None, async_op=False):
     """
     Reduces the tensor data across all machines.
@@ -1710,7 +1743,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, async_op=False):
     else:
         work.wait()
 
-
+@exception_handler
 def all_gather_multigpu(
     output_tensor_lists, input_tensor_list, group=None, async_op=False
 ):
@@ -1817,6 +1850,7 @@ def _check_for_nccl_backend(group):
         isinstance(pg, ProcessGroupNCCL)
     )
 
+@exception_handler
 def all_gather_object(object_list, obj, group=None):
     """
     Gathers picklable objects from the whole group into a list. Similar to
@@ -1903,6 +1937,7 @@ def all_gather_object(object_list, obj, group=None):
         object_list[i] = _tensor_to_object(tensor, tensor_size)
 
 
+@exception_handler
 def gather_object(obj, object_gather_list=None, dst=0, group=None):
     """
     Gathers picklable objects from the whole group in a single process.
@@ -2007,6 +2042,7 @@ def gather_object(obj, object_gather_list=None, dst=0, group=None):
         object_gather_list[i] = _tensor_to_object(tensor, tensor_size)
 
 
+@exception_handler
 def broadcast_object_list(object_list, src=0, group=None, device=None):
     """
     Broadcasts picklable objects in ``object_list`` to the whole group. Similar
@@ -2106,6 +2142,7 @@ def broadcast_object_list(object_list, src=0, group=None, device=None):
             object_list[i] = _tensor_to_object(obj_view, obj_size)
 
 
+@exception_handler
 def scatter_object_list(
     scatter_object_output_list, scatter_object_input_list, src=0, group=None
 ):
@@ -2209,6 +2246,7 @@ def scatter_object_list(
     scatter_object_output_list[0] = _tensor_to_object(output_tensor, obj_tensor_size)
 
 
+@exception_handler
 def all_gather(tensor_list, tensor, group=None, async_op=False):
     """
     Gathers tensors from the whole group in a list.
@@ -2282,6 +2320,7 @@ def all_gather(tensor_list, tensor, group=None, async_op=False):
         work.wait()
 
 
+@exception_handler
 def all_gather_into_tensor(output_tensor, input_tensor, group=None, async_op=False):
     """
     Gather tensors from all ranks and put them in a single output tensor.
@@ -2363,6 +2402,7 @@ def all_gather_into_tensor(output_tensor, input_tensor, group=None, async_op=Fal
         work.wait()
 
 
+@exception_handler
 def _all_gather_base(output_tensor, input_tensor, group=None, async_op=False):
     """
     Single tensor all gather. Gathers a single tensor from all ranks, and puts them in a single output tensor.
@@ -2392,6 +2432,7 @@ def _all_gather_base(output_tensor, input_tensor, group=None, async_op=False):
     return all_gather_into_tensor(output_tensor, input_tensor, group, async_op)
 
 
+@exception_handler
 def all_gather_coalesced(
     output_tensor_lists, input_tensor_list, group=None, async_op=False
 ):
@@ -2491,6 +2532,7 @@ def _validate_output_list_for_rank(my_rank, dst, gather_list):
         )
 
 
+@exception_handler
 def gather(tensor, gather_list=None, dst=0, group=None, async_op=False):
     """
     Gathers a list of tensors in a single process.
@@ -2545,6 +2587,7 @@ def gather(tensor, gather_list=None, dst=0, group=None, async_op=False):
         work.wait()
 
 
+@exception_handler
 def scatter(tensor, scatter_list=None, src=0, group=None, async_op=False):
     """
     Scatters a list of tensors to all processes in a group.
@@ -2640,6 +2683,7 @@ def scatter(tensor, scatter_list=None, src=0, group=None, async_op=False):
         work.wait()
 
 
+@exception_handler
 def reduce_scatter_multigpu(
     output_tensor_list, input_tensor_lists, op=ReduceOp.SUM, group=None, async_op=False
 ):
@@ -2709,6 +2753,7 @@ def reduce_scatter_multigpu(
         work.wait()
 
 
+@exception_handler
 def reduce_scatter(output, input_list, op=ReduceOp.SUM, group=None, async_op=False):
     """
     Reduces, then scatters a list of tensors to all processes in a group.
@@ -2750,6 +2795,7 @@ def reduce_scatter(output, input_list, op=ReduceOp.SUM, group=None, async_op=Fal
         work.wait()
 
 
+@exception_handler
 def reduce_scatter_tensor(output, input, op=ReduceOp.SUM, group=None, async_op=False):
     """
     Reduces, then scatters a tensor to all ranks in a group.
@@ -2854,6 +2900,7 @@ def _reduce_scatter_base(output, input, op=ReduceOp.SUM, group=None, async_op=Fa
     return reduce_scatter_tensor(output, input, op, group, async_op)
 
 
+@exception_handler
 def all_to_all_single(
     output,
     input,
@@ -2984,6 +3031,7 @@ def all_to_all_single(
         work.wait()
 
 
+@exception_handler
 def all_to_all(output_tensor_list, input_tensor_list, group=None, async_op=False):
     """
     Each process scatters list of input tensors to all processes in a group and
diff --git a/torch/distributed/logging_handlers.py b/torch/distributed/logging_handlers.py
index 7c3b3249f6c79..3c607fe45da77 100644
--- a/torch/distributed/logging_handlers.py
+++ b/torch/distributed/logging_handlers.py
@@ -7,7 +7,9 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-from typing import Dict
+from typing import Dict, List
+
+__all__: List[str] = []
 
 _log_handlers: Dict[str, logging.Handler] = {
     "default": logging.NullHandler(),

From 7821743c9e7c1517226f86d796dbc0212c198ce3 Mon Sep 17 00:00:00 2001
From: Loren Arthur <lorenarthur@meta.com>
Date: Sat, 29 Oct 2022 04:52:01 +0000
Subject: [PATCH 0300/1922] Move workspace related functions to separate file
 (#87651)

Move workspace related functions to separate file

Test Plan: Existing tests

Differential Revision: D40657708

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87651
Approved by: https://github.com/malfet
---
 caffe2/python/CMakeLists.txt      |   1 +
 caffe2/python/pybind_state.cc     | 251 +++++++++++++++---------------
 caffe2/python/pybind_workspace.cc |  72 +++++++++
 caffe2/python/pybind_workspace.h  |  15 ++
 4 files changed, 213 insertions(+), 126 deletions(-)
 create mode 100644 caffe2/python/pybind_workspace.cc
 create mode 100644 caffe2/python/pybind_workspace.h

diff --git a/caffe2/python/CMakeLists.txt b/caffe2/python/CMakeLists.txt
index c092febee4a90..464aa24eadd29 100644
--- a/caffe2/python/CMakeLists.txt
+++ b/caffe2/python/CMakeLists.txt
@@ -1,6 +1,7 @@
 # ---[ CPU files.
 set(Caffe2_CPU_PYTHON_SRCS
     "/pybind_state.cc"
+    "/pybind_workspace.cc"
     "/pybind_state_dlpack.cc"
     "/pybind_state_nomni.cc"
     "/pybind_state_registry.cc"
diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
index a637f15e7a9d3..3103006774df5 100644
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@@ -2,6 +2,7 @@
 
 #include <chrono>
 #include <future>
+#include <memory>
 
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
@@ -33,6 +34,7 @@
 #include "caffe2/predictor/emulator/data_filler.h"
 #include "caffe2/predictor/predictor.h"
 #include "caffe2/python/pybind_state_registry.h"
+#include "caffe2/python/pybind_workspace.h"
 #include "caffe2/utils/cpuid.h"
 #include "caffe2/utils/proto_convert.h"
 #include "caffe2/utils/string_utils.h"
@@ -56,14 +58,6 @@ constexpr bool kPyBindFalse = false;
 
 namespace py = pybind11;
 
-// gWorkspaces allows us to define and switch between multiple workspaces in
-// Python.
-static std::map<std::string, std::unique_ptr<Workspace>> gWorkspaces;
-// gWorkspace is the pointer to the current workspace. The ownership is kept
-// by the gWorkspaces map.
-static Workspace* gWorkspace = nullptr;
-static std::string gCurrentWorkspaceName;
-
 // NOLINTNEXTLINE(modernize-use-equals-default)
 BlobFetcherBase::~BlobFetcherBase() {}
 // NOLINTNEXTLINE(modernize-use-equals-default)
@@ -83,17 +77,6 @@ C10_DEFINE_TYPED_REGISTRY(
 REGISTER_BLOB_FETCHER((TypeMeta::Id<Tensor>()), TensorFetcher);
 REGISTER_BLOB_FEEDER(CPU, TensorFeeder<CPUContext>);
 
-Workspace* GetCurrentWorkspace() {
-  return gWorkspace;
-}
-
-Workspace* GetWorkspaceByName(const std::string& name) {
-  if (gWorkspaces.count(name)) {
-    return gWorkspaces[name].get();
-  }
-  return nullptr;
-}
-
 class StringFetcher : public BlobFetcherBase {
  public:
   py::object Fetch(const Blob& blob) override {
@@ -180,20 +163,6 @@ std::function<const char*(const string&)> DefinitionGetter(
   return [registry](const string& name) { return registry->HelpMessage(name); };
 }
 
-void switchWorkspaceInternal(const std::string& name, bool create_if_missing) {
-  if (gWorkspaces.count(name)) {
-    gCurrentWorkspaceName = name;
-    gWorkspace = gWorkspaces[name].get();
-    return;
-  }
-
-  CAFFE_ENFORCE(create_if_missing);
-  std::unique_ptr<Workspace> new_workspace(new Workspace());
-  gWorkspace = new_workspace.get();
-  gWorkspaces.insert(std::make_pair(name, std::move(new_workspace)));
-  gCurrentWorkspaceName = name;
-}
-
 namespace python_detail {
 // Python Op implementations.
 using FuncRegistry = std::unordered_map<std::string, Func>;
@@ -652,10 +621,9 @@ void addObjectMethods(py::module& m) {
             return (int)self->last_failed_op_net_position;
           })
       .def_property_readonly_static("current", [](py::object /* type */) {
-        auto ws = gWorkspaces.find(gCurrentWorkspaceName);
-        CAFFE_ENFORCE(ws != gWorkspaces.end());
-        CAFFE_ENFORCE(ws->second.get());
-        return py::cast(ws->second.get(), py::return_value_policy::reference);
+        auto ws = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(ws);
+        return py::cast(ws, py::return_value_policy::reference);
       });
 
   py::class_<BackgroundPlan, std::shared_ptr<BackgroundPlan>>(
@@ -972,14 +940,15 @@ void addObjectMethods(py::module& m) {
 
   py::class_<Predictor>(m, "Predictor")
       .def(py::init([](py::bytes init_net, py::bytes predict_net) {
-        CAFFE_ENFORCE(gWorkspace);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
         NetDef init_net_, predict_net_;
         CAFFE_ENFORCE(ParseProtoFromLargeString(
             init_net.cast<std::string>(), &init_net_));
         CAFFE_ENFORCE(ParseProtoFromLargeString(
             predict_net.cast<std::string>(), &predict_net_));
         return new Predictor(
-            makePredictorConfig(init_net_, predict_net_, gWorkspace));
+            makePredictorConfig(init_net_, predict_net_, workspace));
       }))
       .def(
           "run",
@@ -1139,20 +1108,21 @@ void addGlobalMethods(py::module& m) {
     }
     return keys;
   });
-  m.def("on_module_exit", []() { gWorkspaces.clear(); });
+  m.def("on_module_exit", []() { caffe2::python::ClearWorkspaces(); });
   // create_if_missing not used by necessary for pybind to do
   // properly do function overloading.
   m.def(
-      "switch_workspace",
-      [](Workspace* ws, py::object /*create_if_missing*/) { gWorkspace = ws; });
+      "switch_workspace", [](Workspace* ws, py::object /*create_if_missing*/) {
+        // TODO
+        caffe2::python::SetCurrentWorkspace(ws);
+      });
   m.def(
       "create_child_workspace",
       [](const std::string& parent_ws_name, const std::string& child_ws_name) {
-        CAFFE_ENFORCE(
-            gWorkspaces.count(parent_ws_name), "Parent ws does not exist.");
-        auto parent_gws = gWorkspaces[parent_ws_name].get();
+        auto parent_gws = caffe2::python::GetWorkspaceByName(parent_ws_name);
+        CAFFE_ENFORCE(parent_gws, "Parent ws does not exist.");
         std::unique_ptr<Workspace> child_ws(new Workspace(parent_gws));
-        gWorkspaces.insert(std::make_pair(child_ws_name, std::move(child_ws)));
+        caffe2::python::InsertWorkspace(child_ws_name, std::move(child_ws));
       },
       "Create and register child ws, sharing existing blobs in parent ws.",
       py::arg("parent_ws_name"),
@@ -1161,9 +1131,10 @@ void addGlobalMethods(py::module& m) {
       "switch_workspace",
       [](const std::string& name, const py::object create_if_missing) {
         if (create_if_missing.is(py::none())) {
-          return switchWorkspaceInternal(name, false);
+          return caffe2::python::SwitchWorkspaceInternal(name, false);
         }
-        return switchWorkspaceInternal(name, create_if_missing.cast<bool>());
+        return caffe2::python::SwitchWorkspaceInternal(
+            name, create_if_missing.cast<bool>());
       },
       "Switch to the specified workspace, creating if necessary",
       py::arg("name"),
@@ -1173,30 +1144,27 @@ void addGlobalMethods(py::module& m) {
       [](const py::object& root_folder) {
         VLOG(1) << "Resetting workspace.";
         if (root_folder.is(py::none())) {
-          // NOLINTNEXTLINE(modernize-make-unique)
-          gWorkspaces[gCurrentWorkspaceName].reset(new Workspace());
+          caffe2::python::ResetWorkspace(new Workspace());
         } else {
-          // NOLINTNEXTLINE(modernize-make-unique)
-          gWorkspaces[gCurrentWorkspaceName].reset(
+          caffe2::python::ResetWorkspace(
               new Workspace(root_folder.cast<std::string>()));
         }
-        gWorkspace = gWorkspaces[gCurrentWorkspaceName].get();
         return true;
       },
       "Reset the workspace",
       py::arg("root_folder") = py::none());
 
   m.def("root_folder", []() {
-    CAFFE_ENFORCE(gWorkspace);
-    return gWorkspace->RootFolder();
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    return workspace->RootFolder();
+  });
+  m.def("current_workspace", []() {
+    return caffe2::python::GetCurrentWorkspaceName();
   });
-  m.def("current_workspace", []() { return gCurrentWorkspaceName; });
   m.def("workspaces", []() {
     std::vector<std::string> names;
-    for (const auto& kv : gWorkspaces) {
-      // NOLINTNEXTLINE(performance-inefficient-vector-operation)
-      names.push_back(kv.first);
-    }
+    caffe2::python::GetWorkspaceNames(names);
     return names;
   });
   m.def("nearby_opnames", [](const std::string& name) {
@@ -1211,41 +1179,46 @@ void addGlobalMethods(py::module& m) {
     return alternatives;
   });
   m.def("local_blobs", []() {
-    CAFFE_ENFORCE(gWorkspace);
-    return gWorkspace->LocalBlobs();
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    return workspace->LocalBlobs();
   });
   m.def("blobs", []() {
-    CAFFE_ENFORCE(gWorkspace);
-    return gWorkspace->Blobs();
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    return workspace->Blobs();
   });
   m.def("has_blob", [](const std::string& name) {
-    CAFFE_ENFORCE(gWorkspace);
-    return gWorkspace->HasBlob(name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    return workspace->HasBlob(name);
   });
   m.def(
       "fill_random_network_inputs",
       [](const py::bytes& net_def,
          const std::vector<std::vector<std::vector<int64_t>>>& inputDims,
          const std::vector<std::vector<std::string>>& inputTypes) {
-        CAFFE_ENFORCE(gWorkspace);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
         py::gil_scoped_release g;
         NetDef net;
         CAFFE_ENFORCE(
             ParseProtoFromLargeString(net_def.cast<std::string>(), &net));
         caffe2::emulator::fillRandomNetworkInputs(
-            net, inputDims, inputTypes, gWorkspace);
+            net, inputDims, inputTypes, workspace);
       });
   m.def(
       "create_net",
       [](py::bytes net_def, bool overwrite) {
-        CAFFE_ENFORCE(gWorkspace);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
         caffe2::NetDef proto;
         CAFFE_ENFORCE(
             ParseProtoFromLargeString(net_def.cast<std::string>(), &proto),
             "Can't parse net proto: ",
             net_def.cast<std::string>());
         CAFFE_ENFORCE(
-            gWorkspace->CreateNet(proto, overwrite),
+            workspace->CreateNet(proto, overwrite),
             "Error creating net with proto: ",
             net_def.cast<std::string>());
         return true;
@@ -1253,11 +1226,12 @@ void addGlobalMethods(py::module& m) {
       py::arg("net_def"),
       py::arg("overwrite") = kPyBindFalse);
   m.def("run_net", [](const std::string& name, int num_iter, bool allow_fail) {
-    CAFFE_ENFORCE(gWorkspace);
-    CAFFE_ENFORCE(gWorkspace->GetNet(name), "Can't find net ", name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    CAFFE_ENFORCE(workspace->GetNet(name), "Can't find net ", name);
     py::gil_scoped_release g;
     for (int i = 0; i < num_iter; i++) {
-      bool success = gWorkspace->RunNet(name);
+      bool success = workspace->RunNet(name);
       if (!allow_fail) {
         CAFFE_ENFORCE(success, "Error running net ", name);
       } else {
@@ -1271,12 +1245,12 @@ void addGlobalMethods(py::module& m) {
   m.def(
       "add_observer_to_net",
       [](const std::string& net_name, const std::string& observer_type) {
-        CAFFE_ENFORCE(gWorkspace);
-        CAFFE_ENFORCE(
-            gWorkspace->GetNet(net_name), "Can't find net ", net_name);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
+        CAFFE_ENFORCE(workspace->GetNet(net_name), "Can't find net ", net_name);
         py::gil_scoped_release g;
 
-        NetBase* net = gWorkspace->GetNet(net_name);
+        NetBase* net = workspace->GetNet(net_name);
         const Observable<NetBase>::Observer* observer = nullptr;
 
 #define REGISTER_PYTHON_EXPOSED_OBSERVER(ob_type)             \
@@ -1303,12 +1277,12 @@ void addGlobalMethods(py::module& m) {
   m.def(
       "remove_observer_from_net",
       [](const std::string& net_name, const ObserverBase<NetBase>* observer) {
-        CAFFE_ENFORCE(gWorkspace);
-        CAFFE_ENFORCE(
-            gWorkspace->GetNet(net_name), "Can't find net ", net_name);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
+        CAFFE_ENFORCE(workspace->GetNet(net_name), "Can't find net ", net_name);
         py::gil_scoped_release g;
 
-        NetBase* net = gWorkspace->GetNet(net_name);
+        NetBase* net = workspace->GetNet(net_name);
         net->DetachObserver(observer);
       });
   m.def("clear_global_net_observer", []() {
@@ -1316,11 +1290,12 @@ void addGlobalMethods(py::module& m) {
     caffe2::ClearGlobalNetObservers();
   });
   m.def("num_observers_on_net", [](const std::string& net_name) {
-    CAFFE_ENFORCE(gWorkspace);
-    CAFFE_ENFORCE(gWorkspace->GetNet(net_name), "Can't find net ", net_name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    CAFFE_ENFORCE(workspace->GetNet(net_name), "Can't find net ", net_name);
     py::gil_scoped_release g;
 
-    NetBase* net = gWorkspace->GetNet(net_name);
+    NetBase* net = workspace->GetNet(net_name);
     return net->NumObservers();
   });
   m.def(
@@ -1329,8 +1304,9 @@ void addGlobalMethods(py::module& m) {
          size_t warmup_runs,
          size_t main_runs,
          bool run_individual) {
-        CAFFE_ENFORCE(gWorkspace);
-        auto* net = gWorkspace->GetNet(name);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
+        auto* net = workspace->GetNet(name);
         CAFFE_ENFORCE(net, "Didn't find net: ", name);
         py::gil_scoped_release g;
         vector<float> stat =
@@ -1338,8 +1314,9 @@ void addGlobalMethods(py::module& m) {
         return stat;
       });
   m.def("benchmark_net_once", [](const std::string& name) {
-    CAFFE_ENFORCE(gWorkspace);
-    auto* net = gWorkspace->GetNet(name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    auto* net = workspace->GetNet(name);
     CAFFE_ENFORCE(net, "Didn't find net: ", name);
     py::gil_scoped_release g;
     float stat = net->TEST_Benchmark_One_Run();
@@ -1347,28 +1324,35 @@ void addGlobalMethods(py::module& m) {
   });
 
   m.def("delete_net", [](const std::string& name) {
-    CAFFE_ENFORCE(gWorkspace);
-    gWorkspace->DeleteNet(name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    workspace->DeleteNet(name);
     return true;
   });
-  m.def("nets", []() { return gWorkspace->Nets(); });
+  m.def("nets", []() {
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    return workspace->Nets();
+  });
   m.def("run_operator_once", [](const py::bytes& op_def) {
-    CAFFE_ENFORCE(gWorkspace);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
     OperatorDef def;
     CAFFE_ENFORCE(ParseProtoFromLargeString(op_def.cast<std::string>(), &def));
     py::gil_scoped_release g;
-    CAFFE_ENFORCE(gWorkspace->RunOperatorOnce(def));
+    CAFFE_ENFORCE(workspace->RunOperatorOnce(def));
     return true;
   });
   // Run an operator multiple times.
   // This is needed for microbenchmarking as we want the benchmark loop to be in
   // C++ to minimize overhead.
   m.def("run_operator_multiple", [](const py::bytes& op_def, int num_runs) {
-    CAFFE_ENFORCE(gWorkspace);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
     OperatorDef def;
     CAFFE_ENFORCE(ParseProtoFromLargeString(op_def.cast<std::string>(), &def));
     py::gil_scoped_release g;
-    std::unique_ptr<OperatorBase> op(CreateOperator(def, gWorkspace));
+    std::unique_ptr<OperatorBase> op(CreateOperator(def, workspace));
     for (int i = 0; i < num_runs; i++) {
       if (!op->Run()) {
         return false;
@@ -1379,7 +1363,8 @@ void addGlobalMethods(py::module& m) {
   m.def(
       "get_operator_cost",
       [](const py::bytes& op_def, const std::vector<string>& input_blobs) {
-        CAFFE_ENFORCE(gWorkspace);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
         OperatorDef def;
         CAFFE_ENFORCE(
             ParseProtoFromLargeString(op_def.cast<std::string>(), &def),
@@ -1389,37 +1374,40 @@ void addGlobalMethods(py::module& m) {
         CAFFE_ENFORCE(schema);
         vector<TensorShape> shapes;
         for (const auto& blob_name : input_blobs) {
-          auto* blob = gWorkspace->GetBlob(blob_name);
+          auto* blob = workspace->GetBlob(blob_name);
           shapes.emplace_back(GetTensorShapeOfBlob(blob));
         }
         const auto c = schema->InferCost(def, shapes);
         return std::make_tuple(c.flops, c.bytes_written, c.bytes_read);
       });
   m.def("run_net_once", [](const py::bytes& net_def) {
-    CAFFE_ENFORCE(gWorkspace);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
     NetDef def;
     CAFFE_ENFORCE(ParseProtoFromLargeString(net_def.cast<std::string>(), &def));
     py::gil_scoped_release g;
-    CAFFE_ENFORCE(gWorkspace->RunNetOnce(def));
+    CAFFE_ENFORCE(workspace->RunNetOnce(def));
     return true;
   });
   m.def("run_plan", [](const py::bytes& plan_def) {
-    CAFFE_ENFORCE(gWorkspace);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
     PlanDef def;
     CAFFE_ENFORCE(
         ParseProtoFromLargeString(plan_def.cast<std::string>(), &def));
     py::gil_scoped_release g;
-    CAFFE_ENFORCE(gWorkspace->RunPlan(def));
+    CAFFE_ENFORCE(workspace->RunPlan(def));
     return true;
   });
   m.def("run_plan_in_background", [](const py::bytes& plan_def) {
-    CAFFE_ENFORCE(gWorkspace);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
     PlanDef def;
     CAFFE_ENFORCE(
         ParseProtoFromLargeString(plan_def.cast<std::string>(), &def));
     py::gil_scoped_release g;
 
-    auto background_plan = std::make_shared<BackgroundPlan>(gWorkspace, def);
+    auto background_plan = std::make_shared<BackgroundPlan>(workspace, def);
     background_plan->run();
     return background_plan;
   });
@@ -1513,7 +1501,8 @@ void addGlobalMethods(py::module& m) {
   m.def(
       "infer_shapes_and_types_from_workspace",
       [](const std::vector<py::bytes>& net_protos) {
-        CAFFE_ENFORCE(gWorkspace);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
 
         // Parse protobuffers to NetDefs
         std::vector<std::unique_ptr<caffe2::NetDef>> nets;
@@ -1527,7 +1516,7 @@ void addGlobalMethods(py::module& m) {
         }
 
         auto blob_info =
-            InferBlobShapesAndTypesFromWorkspace(gWorkspace, nets_ptr);
+            InferBlobShapesAndTypesFromWorkspace(workspace, nets_ptr);
 
         std::string protob;
         CAFFE_ENFORCE(blob_info.SerializeToString(&protob));
@@ -1593,23 +1582,27 @@ void addGlobalMethods(py::module& m) {
     return py::bytes(output_net_proto);
   });
   m.def("create_blob", [](const std::string& name) {
-    CAFFE_ENFORCE(gWorkspace);
-    CAFFE_ENFORCE(gWorkspace->CreateBlob(name));
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    CAFFE_ENFORCE(workspace->CreateBlob(name));
     return true;
   });
   m.def("reset_blob", [](const std::string& name) {
-    CAFFE_ENFORCE(gWorkspace);
-    auto* b = gWorkspace->GetBlob(name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    auto* b = workspace->GetBlob(name);
     CAFFE_ENFORCE(b);
     b->Reset();
   });
   m.def("fetch_blob", [](const std::string& name) -> py::object {
-    return python_detail::fetchBlob(gWorkspace, name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    return python_detail::fetchBlob(workspace, name);
   });
   m.def(
       "feed_blob",
       [](const std::string& name, py::object arg, py::object device_option) {
-        auto* blob = gWorkspace->CreateBlob(name);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        auto* blob = workspace->CreateBlob(name);
         return python_detail::feedBlob(blob, arg, device_option);
       },
       "",
@@ -1620,16 +1613,18 @@ void addGlobalMethods(py::module& m) {
     return python_detail::deserializeBlob(content);
   });
   m.def("serialize_blob", [](const std::string& name) {
-    CAFFE_ENFORCE(gWorkspace);
-    auto* blob = gWorkspace->GetBlob(name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    auto* blob = workspace->GetBlob(name);
     CAFFE_ENFORCE(blob);
     return py::bytes(SerializeBlob(*blob, name));
   });
   m.def(
       "deserialize_blob",
       [](const std::string& name, const py::bytes& serialized) {
-        CAFFE_ENFORCE(gWorkspace);
-        auto* blob = gWorkspace->CreateBlob(name);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
+        auto* blob = workspace->CreateBlob(name);
         DeserializeBlob(serialized.cast<std::string>(), blob);
       });
 
@@ -1695,8 +1690,9 @@ void addGlobalMethods(py::module& m) {
   m.def("is_numa_enabled", []() { return IsNUMAEnabled(); });
   m.def("get_num_numa_nodes", []() { return GetNumNUMANodes(); });
   m.def("get_blob_numa_node", [](const std::string& blob_name) {
-    CAFFE_ENFORCE(gWorkspace);
-    auto* blob = gWorkspace->GetBlob(blob_name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    auto* blob = workspace->GetBlob(blob_name);
     CAFFE_ENFORCE(blob);
     const TensorCPU& tensor = blob->Get<TensorCPU>();
     const void* raw_data = tensor.raw_data();
@@ -1704,8 +1700,9 @@ void addGlobalMethods(py::module& m) {
     return GetNUMANode(raw_data);
   });
   m.def("get_blob_size_bytes", [](const std::string& blob_name) {
-    CAFFE_ENFORCE(gWorkspace);
-    auto* blob = gWorkspace->GetBlob(blob_name);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
+    auto* blob = workspace->GetBlob(blob_name);
     CAFFE_ENFORCE(blob);
     return BlobStat::sizeBytes(*blob);
   });
@@ -1861,13 +1858,14 @@ void addGlobalMethods(py::module& m) {
   m.def(
       "run_workspace_transform",
       [](const std::string& transform_name, py::bytes def) {
-        CAFFE_ENFORCE(gWorkspace);
+        Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+        CAFFE_ENFORCE(workspace);
         caffe2::NetDef proto;
         CAFFE_ENFORCE(
             ParseProtoFromLargeString(def.cast<std::string>(), &proto));
         auto nn = caffe2::convertToNNModule(proto);
         auto pass = WorkspaceOptimizationPassRegistry()->Create(
-            transform_name, &nn, gWorkspace);
+            transform_name, &nn, workspace);
 
         CAFFE_ENFORCE(pass, "Pass doesn't exist: ", transform_name);
         pass->run();
@@ -1897,7 +1895,8 @@ void addGlobalMethods(py::module& m) {
     CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast<std::string>(), &proto));
 
     auto nn = caffe2::convertToNNModule(proto);
-    opt::OptimizeForMkldnn(&nn, gWorkspace, training_mode);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    opt::OptimizeForMkldnn(&nn, workspace, training_mode);
     auto new_proto = caffe2::convertToCaffe2Proto(nn, proto);
 
     std::string out;
@@ -1919,12 +1918,13 @@ void addGlobalMethods(py::module& m) {
   });
 
   m.def("transform_fuseConvBN", [](py::bytes def) {
-    CAFFE_ENFORCE(gWorkspace);
+    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
+    CAFFE_ENFORCE(workspace);
     caffe2::NetDef proto;
     CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast<std::string>(), &proto));
 
     auto nn = caffe2::convertToNNModule(proto);
-    opt::fuseConvBN(&nn, gWorkspace);
+    opt::fuseConvBN(&nn, workspace);
     auto new_proto = caffe2::convertToCaffe2Proto(nn);
 
     std::string out;
@@ -1959,8 +1959,7 @@ void addGlobalMethods(py::module& m) {
       return;
     }
     // We will create a default workspace for us to run stuff.
-    switchWorkspaceInternal("default", true);
-    gCurrentWorkspaceName = "default";
+    caffe2::python::SwitchWorkspaceInternal("default", true);
     initialized = true;
   };
 
diff --git a/caffe2/python/pybind_workspace.cc b/caffe2/python/pybind_workspace.cc
new file mode 100644
index 0000000000000..aa837b7b4dfe9
--- /dev/null
+++ b/caffe2/python/pybind_workspace.cc
@@ -0,0 +1,72 @@
+#include "caffe2/core/workspace.h"
+
+namespace caffe2 {
+namespace python {
+
+// gWorkspace is the pointer to the current workspace. The ownership is kept
+// by the gWorkspaces map.
+static Workspace* gWorkspace = nullptr;
+static std::string gCurrentWorkspaceName;
+// gWorkspaces allows us to define and switch between multiple workspaces in
+// Python.
+static std::map<std::string, std::unique_ptr<Workspace>> gWorkspaces;
+
+Workspace* GetCurrentWorkspace() {
+  return gWorkspace;
+}
+
+void SetCurrentWorkspace(Workspace* workspace) {
+  gWorkspace = workspace;
+}
+
+Workspace* NewWorkspace() {
+  std::unique_ptr<Workspace> new_workspace(new Workspace());
+  gWorkspace = new_workspace.get();
+  return gWorkspace;
+}
+
+Workspace* GetWorkspaceByName(const std::string& name) {
+  if (gWorkspaces.count(name)) {
+    return gWorkspaces[name].get();
+  }
+  return nullptr;
+}
+
+std::string GetCurrentWorkspaceName() {
+  return gCurrentWorkspaceName;
+}
+void InsertWorkspace(const std::string& name, std::unique_ptr<Workspace> ws) {
+  gWorkspaces.insert(std::make_pair(name, std::move(ws)));
+}
+
+void SwitchWorkspaceInternal(const std::string& name, bool create_if_missing) {
+  if (gWorkspaces.count(name)) {
+    gCurrentWorkspaceName = name;
+    gWorkspace = gWorkspaces[name].get();
+    return;
+  }
+
+  CAFFE_ENFORCE(create_if_missing);
+  std::unique_ptr<Workspace> new_workspace(new Workspace());
+  gWorkspace = new_workspace.get();
+  gWorkspaces.insert(std::make_pair(name, std::move(new_workspace)));
+  gCurrentWorkspaceName = name;
+}
+
+void ResetWorkspace(Workspace* workspace) {
+  gWorkspaces[gCurrentWorkspaceName].reset(workspace);
+  gWorkspace = gWorkspaces[gCurrentWorkspaceName].get();
+}
+
+void GetWorkspaceNames(std::vector<std::string>& names) {
+  for (const auto& kv : gWorkspaces) {
+    // NOLINTNEXTLINE(performance-inefficient-vector-operation)
+    names.emplace_back(kv.first);
+  }
+}
+
+void ClearWorkspaces() {
+  gWorkspaces.clear();
+}
+} // namespace python
+} // namespace caffe2
diff --git a/caffe2/python/pybind_workspace.h b/caffe2/python/pybind_workspace.h
new file mode 100644
index 0000000000000..0467d9ff6ccd3
--- /dev/null
+++ b/caffe2/python/pybind_workspace.h
@@ -0,0 +1,15 @@
+namespace caffe2 {
+namespace python {
+
+Workspace* GetCurrentWorkspace();
+void SetCurrentWorkspace(Workspace* workspace);
+Workspace* NewWorkspace();
+Workspace* GetWorkspaceByName(const std::string& name);
+std::string GetCurrentWorkspaceName();
+void InsertWorkspace(const std::string& name, std::unique_ptr<Workspace> ws);
+void SwitchWorkspaceInternal(const std::string& name, bool create_if_missing);
+void ResetWorkspace(Workspace* workspace);
+void GetWorkspaceNames(std::vector<std::string>& names);
+void ClearWorkspaces();
+} // namespace python
+} // namespace caffe2

From 123495b89c5ff1bcf7dc24bb640afb8d027c998c Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Fri, 28 Oct 2022 19:54:52 +0000
Subject: [PATCH 0301/1922] [ONNX] Move all torch.onnx.export related tests to
 test/onnx (#87292)

Moving torch.onnx.export related tests to test/onnx integrates ONNX tests to the same CI machine, so the testing environment can be better managed.

Fixes https://github.com/pytorch/pytorch/issues/87320
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87292
Approved by: https://github.com/thiagocrepaldi, https://github.com/BowenBao, https://github.com/kit1980
---
 .github/merge_rules.yaml                      |   1 -
 test/jit/test_async.py                        |  15 --
 test/jit/test_tracer.py                       |   8 -
 test/{jit => onnx}/test_export_modes.py       |  89 +++++---
 test/onnx/test_pytorch_onnx_no_runtime.py     | 190 +++++++++++++++++-
 .../eager/test_quantize_eager_ptq.py          |  21 --
 test/test_jit.py                              | 112 +----------
 7 files changed, 247 insertions(+), 189 deletions(-)
 rename test/{jit => onnx}/test_export_modes.py (65%)

diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index 26b3eb437251a..6e9cba905e751 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -6,7 +6,6 @@
   - docs/source/onnx*
   - docs/source/scripts/onnx/**
   - scripts/onnx/**
-  - test/jit/test_export_modes.py
   - test/onnx/**
   - tools/onnx/**
   - torch/_C/__init__.pyi.in
diff --git a/test/jit/test_async.py b/test/jit/test_async.py
index d3769cd452d64..f8a1baea67133 100644
--- a/test/jit/test_async.py
+++ b/test/jit/test_async.py
@@ -1,6 +1,5 @@
 # Owner(s): ["oncall: jit"]
 
-import io
 import os
 import sys
 
@@ -420,20 +419,6 @@ def fn(x):
         self.assertGraphContainsExactly(traced.graph, kind='aten::wait', num_kind_nodes=0)
         self.assertGraphContainsExactly(traced.graph, kind='aten::add', num_kind_nodes=2)
 
-    def test_trace_fork_wait_inline_onnx(self):
-        def fork_body(x):
-            return torch.neg(x), torch.neg(x)
-
-        class MyMod(torch.nn.Module):
-            def forward(self, x):
-                fut = torch.jit._fork(fork_body, x)
-                val = torch.jit._wait(fut)
-                return val[1]
-
-        # smoke test for ONNX export
-        f = io.BytesIO()
-        torch.onnx.export(MyMod(), (torch.rand(3, 4),), f)
-
     def test_trace_fork_wait_list_modulecalls(self):
         def add_one(input):
             return input + torch.ones(input.size())
diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py
index 50fdec94b9fc0..b36003a2b9209 100644
--- a/test/jit/test_tracer.py
+++ b/test/jit/test_tracer.py
@@ -1124,14 +1124,6 @@ def foo(x, w):
         # With `check_trace=True` it will run with `@torch.no_grad()` and break assert.
         torch.jit.trace(foo, (x, w), check_trace=False)
 
-    def test_trace_detach_onnx_erase(self):
-        class Mod(torch.nn.Module):
-            def forward(self, x, w):
-                return torch.matmul(x, w).detach()
-
-        torch.onnx.export_to_pretty_string(
-            Mod(), (torch.rand(3, 4), torch.rand(4, 5)))
-
     def test_trace_slice_full_dim(self):
         def foo(x):
             return x[0:5, 0] + 1.0
diff --git a/test/jit/test_export_modes.py b/test/onnx/test_export_modes.py
similarity index 65%
rename from test/jit/test_export_modes.py
rename to test/onnx/test_export_modes.py
index dbf10cddc059b..0f3024a2e366d 100644
--- a/test/jit/test_export_modes.py
+++ b/test/onnx/test_export_modes.py
@@ -1,29 +1,25 @@
-# Owner(s): ["oncall: jit"]
+# Owner(s): ["module: onnx"]
 
 import io
 import os
 import shutil
 import sys
 import tempfile
+import unittest
 
 import torch
 import torch.nn as nn
-from torch.onnx import OperatorExportTypes
 from torch.autograd import Variable
+from torch.onnx import OperatorExportTypes
 
 # Make the helper files in test/ importable
 pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(pytorch_test_dir)
-from torch.testing._internal.jit_utils import JitTestCase
-from torch.testing._internal.common_utils import skipIfNoLapack, skipIfCaffe2, skipIfNoCaffe2
+from torch.testing._internal import common_utils
 
-if __name__ == '__main__':
-    raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
-                       "\tpython test/test_jit.py TESTNAME\n\n"
-                       "instead.")
 
 # Smoke tests for export methods
-class TestExportModes(JitTestCase):
+class TestExportModes(common_utils.TestCase):
     class MyModel(nn.Module):
         def __init__(self):
             super(TestExportModes.MyModel, self).__init__()
@@ -35,41 +31,66 @@ def test_protobuf(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         f = io.BytesIO()
-        torch.onnx._export(torch_model, (fake_input), f, verbose=False,
-                           export_type=torch.onnx.ExportTypes.PROTOBUF_FILE)
+        torch.onnx._export(
+            torch_model,
+            (fake_input),
+            f,
+            verbose=False,
+            export_type=torch.onnx.ExportTypes.PROTOBUF_FILE,
+        )
 
     def test_zipfile(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         f = io.BytesIO()
-        torch.onnx._export(torch_model, (fake_input), f, verbose=False,
-                           export_type=torch.onnx.ExportTypes.ZIP_ARCHIVE)
+        torch.onnx._export(
+            torch_model,
+            (fake_input),
+            f,
+            verbose=False,
+            export_type=torch.onnx.ExportTypes.ZIP_ARCHIVE,
+        )
 
     def test_compressed_zipfile(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         f = io.BytesIO()
-        torch.onnx._export(torch_model, (fake_input), f, verbose=False,
-                           export_type=torch.onnx.ExportTypes.COMPRESSED_ZIP_ARCHIVE)
+        torch.onnx._export(
+            torch_model,
+            (fake_input),
+            f,
+            verbose=False,
+            export_type=torch.onnx.ExportTypes.COMPRESSED_ZIP_ARCHIVE,
+        )
 
     def test_directory(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         d = tempfile.mkdtemp()
-        torch.onnx._export(torch_model, (fake_input), d, verbose=False,
-                           export_type=torch.onnx.ExportTypes.DIRECTORY)
+        torch.onnx._export(
+            torch_model,
+            (fake_input),
+            d,
+            verbose=False,
+            export_type=torch.onnx.ExportTypes.DIRECTORY,
+        )
         shutil.rmtree(d)
 
     def test_onnx_multiple_return(self):
         @torch.jit.script
         def foo(a):
             return (a, a)
+
         f = io.BytesIO()
         x = torch.ones(3)
-        torch.onnx._export(foo, (x,), f)
-
-    @skipIfNoCaffe2
-    @skipIfNoLapack
+        torch.onnx.export(foo, (x,), f)
+
+    # TODO(87318): Can't pass even with Caffe2
+    @unittest.skip(
+        "RuntimeError: ScalarType UNKNOWN_SCALAR is an unexpected tensor scalar type"
+    )
+    @common_utils.skipIfNoCaffe2
+    @common_utils.skipIfNoLapack
     def test_caffe2_aten_fallback(self):
         class ModelWithAtenNotONNXOp(nn.Module):
             def forward(self, x, y):
@@ -80,13 +101,15 @@ def forward(self, x, y):
         x = torch.rand(3, 4)
         y = torch.rand(3, 4)
         torch.onnx.export_to_pretty_string(
-            ModelWithAtenNotONNXOp(), (x, y),
+            ModelWithAtenNotONNXOp(),
+            (x, y),
             add_node_names=False,
             do_constant_folding=False,
-            operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK)
+            operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
+        )
 
-    @skipIfCaffe2
-    @skipIfNoLapack
+    @common_utils.skipIfCaffe2
+    @common_utils.skipIfNoLapack
     def test_aten_fallback(self):
         class ModelWithAtenNotONNXOp(nn.Module):
             def forward(self, x, y):
@@ -97,12 +120,14 @@ def forward(self, x, y):
         x = torch.rand(3, 4)
         y = torch.rand(3, 4)
         torch.onnx.export_to_pretty_string(
-            ModelWithAtenNotONNXOp(), (x, y),
+            ModelWithAtenNotONNXOp(),
+            (x, y),
             add_node_names=False,
             do_constant_folding=False,
             operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
             # support for linalg.qr was added in later op set versions.
-            opset_version=9)
+            opset_version=9,
+        )
 
     # torch.fmod is using to test ONNX_ATEN.
     # If you plan to remove fmod from aten, or found this test failed.
@@ -115,7 +140,13 @@ def forward(self, x, y):
         x = torch.randn(3, 4, dtype=torch.float32)
         y = torch.randn(3, 4, dtype=torch.float32)
         torch.onnx.export_to_pretty_string(
-            ModelWithAtenFmod(), (x, y),
+            ModelWithAtenFmod(),
+            (x, y),
             add_node_names=False,
             do_constant_folding=False,
-            operator_export_type=OperatorExportTypes.ONNX_ATEN)
+            operator_export_type=OperatorExportTypes.ONNX_ATEN,
+        )
+
+
+if __name__ == "__main__":
+    common_utils.run_tests()
diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index 5f2ce3fa657a1..b50d316df8ff4 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -7,8 +7,11 @@
 import itertools
 import unittest
 import unittest.mock
+import warnings
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
+import numpy as np
+
 import onnx
 import onnx.numpy_helper
 
@@ -18,7 +21,7 @@
 from torch.onnx import symbolic_helper, utils
 from torch.onnx._globals import GLOBALS
 from torch.onnx._internal import registration
-from torch.testing._internal import common_utils
+from torch.testing._internal import common_utils, jit_utils
 
 
 def export_to_onnx(
@@ -76,7 +79,7 @@ def forward(self, x):
 
         x = torch.ones(3, 3)
         f = io.BytesIO()
-        torch.onnx._export(AddmmModel(), x, f, verbose=False)
+        torch.onnx.export(AddmmModel(), x, f, verbose=False)
 
     def test_onnx_transpose_incomplete_tensor_type(self):
         # Smoke test to get us into the state where we are attempting to export
@@ -163,7 +166,7 @@ def forward(self, x):
         mte = ModuleToExport()
         f = io.BytesIO()
         with self.assertRaisesRegex(RuntimeError, "Couldn't export Python"):
-            torch.onnx._export(mte, (torch.zeros(1, 2, 3),), f, verbose=False)
+            torch.onnx.export(mte, (torch.zeros(1, 2, 3),), f, verbose=False)
 
     def test_onnx_export_script_inline_trace(self):
         class ModuleToInline(torch.nn.Module):
@@ -427,7 +430,11 @@ def forward(self, x):
         onnx_model = export_to_onnx(
             MyClip(),
             torch.randn(3, 4, requires_grad=True),
-            custom_ops=[common_utils.custom_op("aten::clamp", bad_clamp, 9)],
+            custom_ops=[
+                common_utils.custom_op(
+                    "aten::clamp", bad_clamp, GLOBALS.export_onnx_opset_version
+                )
+            ],
             operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
         )
         self.assertAtenOp(onnx_model, "clamp", "Tensor")
@@ -777,6 +784,181 @@ def forward(self, x):
             model, inputs, f, dynamic_axes={"x": [0, 1]}, input_names=["x"]
         )
 
+    def test_dropout_script(self):
+
+        eg = torch.zeros(1, 2, 3, requires_grad=True)
+
+        @jit_utils._trace(eg)
+        def foo(x):
+            x = torch.neg(x)
+            return F.dropout(x)
+
+        class MyDrop(torch.nn.Module):
+            def forward(self, x):
+                return foo(x)
+
+        f = io.BytesIO()
+        with warnings.catch_warnings(record=True):
+            torch.onnx.export(MyDrop(), (eg,), f, verbose=False)
+
+    def test_pack_padded_pad_packed_trace(self):
+        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+
+        T, B, C = 3, 5, 7
+
+        class PadPackedWrapper(torch.nn.Module):
+            def __init__(self):
+                super(PadPackedWrapper, self).__init__()
+
+            def forward(self, x, seq_lens):
+                x = pack_padded_sequence(x, seq_lens)
+                x, _ = pad_packed_sequence(x)
+                return x
+
+        x = np.ones((T, B, C))
+        seq_lens = np.array([3, 3, 2, 2, 1], dtype=np.int32)
+        # set padding value so we can test equivalence
+        for b in range(B):
+            if seq_lens[b] < T:
+                x[seq_lens[b] :, b, :] = 0
+        seq_lens = torch.from_numpy(seq_lens)
+        x = torch.autograd.Variable(torch.from_numpy(x), requires_grad=True)
+
+        m = PadPackedWrapper()
+        m_traced = torch.jit.trace(
+            m,
+            (
+                x,
+                seq_lens,
+            ),
+        )
+
+        y = m(x, seq_lens)
+        loss = torch.sum(y)
+        loss.backward()
+        grad = x.grad.clone()
+        x.grad.zero_()
+
+        y_traced = m_traced(x, seq_lens)
+        loss_traced = torch.sum(y_traced)
+        loss_traced.backward()
+        grad_traced = x.grad.clone()
+
+        self.assertEqual(y_traced, x)
+        self.assertEqual(y_traced, y)
+        self.assertEqual(grad, grad_traced)
+
+        f = io.BytesIO()
+        torch.onnx.export(m, (x, seq_lens), f, verbose=False)
+
+    # Suppression: ONNX warns when exporting RNNs because of potential batch size mismatch.
+    @common_utils.suppress_warnings
+    def test_rnn_trace_override(self):
+        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+
+        num_layers = 3
+        T, B, C = 11, 5, 7
+
+        class RNNTraceWrapper(torch.nn.Module):
+            def __init__(self, cell_type):
+                super(RNNTraceWrapper, self).__init__()
+                if cell_type == "RNN":
+                    self.rnn = torch.nn.RNN(
+                        input_size=C, hidden_size=C, num_layers=num_layers
+                    )
+                elif cell_type == "LSTM":
+                    self.rnn = torch.nn.LSTM(
+                        input_size=C, hidden_size=C, num_layers=num_layers
+                    )
+                elif cell_type == "GRU":
+                    self.rnn = torch.nn.GRU(
+                        input_size=C, hidden_size=C, num_layers=num_layers
+                    )
+
+            def forward(self, x, seq_lens):
+                x = pack_padded_sequence(x, seq_lens)
+                x, _ = self.rnn(x)
+                x, _ = pad_packed_sequence(x)
+                return x
+
+        for cell_type in ["RNN", "LSTM", "GRU"]:
+            x = torch.ones(T, B, C, requires_grad=True)
+            seq_lens = torch.from_numpy(np.array([11, 3, 2, 2, 1], dtype=np.int32))
+
+            m = RNNTraceWrapper(cell_type)
+            m_traced = torch.jit.trace(
+                m,
+                (
+                    x,
+                    seq_lens,
+                ),
+            )
+
+            y = m(x, seq_lens)
+            loss = torch.sum(y)
+            loss.backward()
+            grad = x.grad.clone()
+            x.grad.zero_()
+
+            y_traced = m_traced(x, seq_lens)
+            loss_traced = torch.sum(y_traced)
+            loss_traced.backward()
+            grad_traced = x.grad.clone()
+
+            self.assertEqual(y_traced, y)
+            self.assertEqual(grad, grad_traced)
+
+            f = io.BytesIO()
+            torch.onnx.export(m, (x, seq_lens), f, verbose=False)
+
+    def test_trace_fork_wait_inline_onnx(self):
+        def fork_body(x):
+            return torch.neg(x), torch.neg(x)
+
+        class MyMod(torch.nn.Module):
+            def forward(self, x):
+                fut = torch.jit._fork(fork_body, x)
+                val = torch.jit._wait(fut)
+                return val[1]
+
+        # smoke test for ONNX export
+        f = io.BytesIO()
+        torch.onnx.export(MyMod(), (torch.rand(3, 4),), f)
+
+    def test_trace_detach_onnx_erase(self):
+        class Mod(torch.nn.Module):
+            def forward(self, x, w):
+                return torch.matmul(x, w).detach()
+
+        torch.onnx.export_to_pretty_string(Mod(), (torch.rand(3, 4), torch.rand(4, 5)))
+
+    def _test_lower_graph_impl(self, model, data):
+        model.qconfig = torch.ao.quantization.default_qconfig
+        model = torch.ao.quantization.prepare(model)
+        model = torch.ao.quantization.convert(model)
+
+        _ = model(data)
+        input_names = ["x"]
+
+        def _export_to_onnx(model, input, input_names):
+            traced = torch.jit.trace(model, input)
+            buf = io.BytesIO()
+            torch.jit.save(traced, buf)
+            buf.seek(0)
+
+            model = torch.jit.load(buf)
+            f = io.BytesIO()
+            torch.onnx.export(
+                model,
+                input,
+                f,
+                input_names=input_names,
+                operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
+                opset_version=9,
+            )
+
+        _export_to_onnx(model, data, input_names)
+
 
 if __name__ == "__main__":
     common_utils.run_tests()
diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py
index 7d87cc520ba04..3a38a597b352d 100644
--- a/test/quantization/eager/test_quantize_eager_ptq.py
+++ b/test/quantization/eager/test_quantize_eager_ptq.py
@@ -71,7 +71,6 @@
 
 # Standard library
 from typing import Tuple
-import io
 import unittest
 import numpy as np
 
@@ -1444,26 +1443,6 @@ def forward(self, indices, offsets, linear_in):
         self.assertTrue('DynamicQuantizedLinear' in str(q_model))
 
 class TestQuantizeEagerONNXExport(JitTestCase):
-    def _test_lower_graph_impl(self, model, data):
-        model.qconfig = torch.ao.quantization.default_qconfig
-        model = torch.ao.quantization.prepare(model)
-        model = torch.ao.quantization.convert(model)
-
-        outputs = model(data)
-        input_names = ["x"]
-
-        def export_to_onnx(model, input, input_names):
-            traced = torch.jit.trace(model, input)
-            buf = io.BytesIO()
-            torch.jit.save(traced, buf)
-            buf.seek(0)
-
-            model = torch.jit.load(buf)
-            f = io.BytesIO()
-            torch.onnx.export(model, input, f, input_names=input_names,
-                              operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
-                              opset_version=9)
-        onnx_model = export_to_onnx(model, data, input_names)
 
     @skipIfNoFBGEMM
     @skipIfNoCaffe2
diff --git a/test/test_jit.py b/test/test_jit.py
index b1425a4ed71ca..13c27b0efa555 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -20,7 +20,6 @@
 from jit.test_autodiff import TestAutodiffJit  # noqa: F401
 from jit.test_autodiff_subgraph_slicing import TestAutodiffSubgraphSlicing  # noqa: F401
 from jit.test_custom_operators import TestCustomOperators  # noqa: F401
-from jit.test_export_modes import TestExportModes  # noqa: F401
 from jit.test_graph_rewrite_passes import TestGraphRewritePasses  # noqa: F401
 from jit.test_class_type import TestClassType  # noqa: F401
 from jit.test_builtins import TestBuiltins, TestTensorBuiltins  # noqa: F401
@@ -97,7 +96,7 @@
 from torch.testing._internal.common_jit import check_against_reference
 from torch.testing._internal.common_utils import run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \
     suppress_warnings, BUILD_WITH_CAFFE2, IS_SANDCASTLE, GRAPH_EXECUTOR, ProfilingMode, TestCase, \
-    freeze_rng_state, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy, \
+    freeze_rng_state, slowTest, TemporaryFileName, \
     enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs, \
     skipIfCrossRef, IS_MACOS, skipIfTorchDynamo
 from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \
@@ -5913,23 +5912,6 @@ def test_fuser_multiple_blocks(this, that, theother, meme):
 
         self.assertEqual(cu.test_fuser_multiple_blocks(*inputs), outputs)
 
-    def test_dropout_script(self):
-
-        eg = torch.zeros(1, 2, 3, requires_grad=True)
-
-        @_trace(eg)
-        def foo(x):
-            x = torch.neg(x)
-            return F.dropout(x)
-
-        class MyDrop(nn.Module):
-            def forward(self, x):
-                return foo(x)
-
-        f = io.BytesIO()
-        with warnings.catch_warnings(record=True):
-            torch.onnx.export(MyDrop(), (eg,), f, verbose=False)
-
     @unittest.skip("RuntimeError: VariableType::ID() not implemented")
     def test_cast(self):
         script = '''
@@ -9780,50 +9762,6 @@ def forward(self, rep):
             m = M2()
             m(torch.zeros(4, 3))
 
-    @skipIfCompiledWithoutNumpy
-    def test_pack_padded_pad_packed_trace(self):
-        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
-        T, B, C = 3, 5, 7
-
-        class PadPackedWrapper(torch.nn.Module):
-            def __init__(self):
-                super(PadPackedWrapper, self).__init__()
-
-            def forward(self, x, seq_lens):
-                x = pack_padded_sequence(x, seq_lens)
-                x, _ = pad_packed_sequence(x)
-                return x
-
-        x = np.ones((T, B, C))
-        seq_lens = np.array([3, 3, 2, 2, 1], dtype=np.int32)
-        # set padding value so we can test equivalence
-        for b in range(B):
-            if seq_lens[b] < T:
-                x[seq_lens[b]:, b, :] = 0
-        seq_lens = torch.from_numpy(seq_lens)
-        x = torch.autograd.Variable(torch.from_numpy(x), requires_grad=True)
-
-        m = PadPackedWrapper()
-        m_traced = torch.jit.trace(m, (x, seq_lens,))
-
-        y = m(x, seq_lens)
-        loss = torch.sum(y)
-        loss.backward()
-        grad = x.grad.clone()
-        x.grad.zero_()
-
-        y_traced = m_traced(x, seq_lens)
-        loss_traced = torch.sum(y_traced)
-        loss_traced.backward()
-        grad_traced = x.grad.clone()
-
-        self.assertEqual(y_traced, x)
-        self.assertEqual(y_traced, y)
-        self.assertEqual(grad, grad_traced)
-
-        f = io.BytesIO()
-        torch.onnx._export(m, (x, seq_lens), f, verbose=False)
-
     def test_script_pack_padded_sequence(self):
         from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 
@@ -10024,54 +9962,6 @@ def forward(self, input: torch.Tensor):
         m_scripted = torch.jit.script(m)
         self.assertEqual(m_scripted(torch.tensor(1)), torch.tensor(246))
 
-    # Suppression: ONNX warns when exporting RNNs because of potential batch size mismatch.
-    @suppress_warnings
-    @skipIfCompiledWithoutNumpy
-    def test_rnn_trace_override(self):
-        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
-        num_layers = 3
-        T, B, C = 11, 5, 7
-
-        class RNNTraceWrapper(torch.nn.Module):
-            def __init__(self, cell_type):
-                super(RNNTraceWrapper, self).__init__()
-                if cell_type == 'RNN':
-                    self.rnn = torch.nn.RNN(input_size=C, hidden_size=C, num_layers=num_layers)
-                elif cell_type == 'LSTM':
-                    self.rnn = torch.nn.LSTM(input_size=C, hidden_size=C, num_layers=num_layers)
-                elif cell_type == 'GRU':
-                    self.rnn = torch.nn.GRU(input_size=C, hidden_size=C, num_layers=num_layers)
-
-            def forward(self, x, seq_lens):
-                x = pack_padded_sequence(x, seq_lens)
-                x, _ = self.rnn(x)
-                x, _ = pad_packed_sequence(x)
-                return x
-
-        for cell_type in ['RNN', 'LSTM', 'GRU']:
-            x = torch.ones(T, B, C, requires_grad=True)
-            seq_lens = torch.from_numpy(np.array([11, 3, 2, 2, 1], dtype=np.int32))
-
-            m = RNNTraceWrapper(cell_type)
-            m_traced = torch.jit.trace(m, (x, seq_lens,))
-
-            y = m(x, seq_lens)
-            loss = torch.sum(y)
-            loss.backward()
-            grad = x.grad.clone()
-            x.grad.zero_()
-
-            y_traced = m_traced(x, seq_lens)
-            loss_traced = torch.sum(y_traced)
-            loss_traced.backward()
-            grad_traced = x.grad.clone()
-
-            self.assertEqual(y_traced, y)
-            self.assertEqual(grad, grad_traced)
-
-            f = io.BytesIO()
-            torch.onnx._export(m, (x, seq_lens), f, verbose=False)
-
     def test_python_call_non_tensor(self):
         def foo(a, b, c):
             # type: (Tensor, int, Tuple[Tensor, int]) -> Tuple[int, Tensor]

From 0e63666e138cf7d20c8937aa23f248f821cc3f68 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sat, 29 Oct 2022 06:48:12 +0000
Subject: [PATCH 0302/1922] Revert "[ONNX] Deprecate operators.py (#87798)"

This reverts commit 88eff1072290177221e7a09d792f7f135b4c83ca.

Reverted https://github.com/pytorch/pytorch/pull/87798 on behalf of https://github.com/weiwangmeta due to breaking internal builds see D40797126
---
 torch/onnx/operators.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/torch/onnx/operators.py b/torch/onnx/operators.py
index 07f89b2e41a60..e5f12444c3559 100644
--- a/torch/onnx/operators.py
+++ b/torch/onnx/operators.py
@@ -9,15 +9,12 @@
 """
 
 import torch
-from torch.onnx import _deprecation
+import torch.onnx
 
 
-# 180-day deprecation period
-@_deprecation.deprecated("1.14", "1.16", "use torch._shape_as_tensor")
 def shape_as_tensor(x):
     return torch._shape_as_tensor(x)
 
 
-@_deprecation.deprecated("1.14", "1.16", "use torch._reshape_from_tensor")
 def reshape_from_tensor_shape(x, shape):
     return torch._reshape_from_tensor(x, shape)

From 4292537f5f30bca85d11eb7a2a1f8d08d511d008 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 28 Oct 2022 04:17:33 +0000
Subject: [PATCH 0303/1922] [FSDP()][1/N] Start refactoring FSDP root
 pre-forward (#87915)

Welcome! This PR starts the refactoring journey.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87915
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_optim_utils.py        |  15 ++-
 torch/distributed/fsdp/_runtime_utils.py      |  82 ++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 123 ++++++------------
 3 files changed, 132 insertions(+), 88 deletions(-)
 create mode 100644 torch/distributed/fsdp/_runtime_utils.py

diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index f87f871042217..165818a72d66d 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -19,10 +19,11 @@
 import torch.distributed as dist
 
 # Import the entire FSDP file to avoid circular imports
-import torch.distributed.fsdp.fully_sharded_data_parallel as FSDP
+import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
 import torch.nn as nn
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 from torch.distributed.fsdp._fsdp_extensions import _ext_chunk_tensor
+from torch.distributed.fsdp._runtime_utils import _clear_grads_if_needed
 from torch.distributed.fsdp._shard_utils import _gather_state_dict
 from torch.distributed.fsdp.flat_param import FlatParameter, FlatParamHandle
 
@@ -115,7 +116,7 @@ def _unflatten_optim_state(
         otherwise. The final optimizer state dict will need to map these
         entries using the proper unflattened parameter IDs.
     """
-    fsdp_module._clear_grads_if_needed()
+    _clear_grads_if_needed(fsdp_module._fsdp_handles(fsdp_module))
     consolidated_state = _communicate_optim_state(
         flat_param,
         flat_param_state,
@@ -184,7 +185,7 @@ def _communicate_optim_state(
             # we take the target rank's value
             if (
                 fsdp_module.world_size == 1
-                or fsdp_module.sharding_strategy == FSDP.ShardingStrategy.NO_SHARD
+                or fsdp_module.sharding_strategy == fsdp_file.ShardingStrategy.NO_SHARD
             ):
                 tensor_state[state_name] = value
                 continue
@@ -292,7 +293,7 @@ def _flatten_optim_state_dict(
             '"param_groups" to be a valid optimizer state dict'
         )
     flat_param_to_fsdp_module = _get_flat_param_to_fsdp_module(model)
-    param_to_unflat_param_names = FSDP._get_param_to_unflat_param_names(model)
+    param_to_unflat_param_names = fsdp_file._get_param_to_unflat_param_names(model)
 
     # Construct the "state" part
     flat_osd_state: Dict[_OptimStateKey, Any] = {}
@@ -893,7 +894,7 @@ def _rekey_sharded_optim_state_dict(
         if using_optim_input
         else _get_param_to_param_id(optim)
     )
-    param_to_unflat_param_names = FSDP._get_param_to_unflat_param_names(model)
+    param_to_unflat_param_names = fsdp_file._get_param_to_unflat_param_names(model)
     # All parameter keys in `param_to_flat_param_id` should be in
     # `param_to_unflat_param_names` -- strict inequality follows when not all
     # parameters are passed to the optimizer
@@ -946,7 +947,7 @@ def _get_flat_param_to_fsdp_module(model: torch.nn.Module):
     """
     flat_param_to_fsdp_module = {}
     for module in model.modules():
-        if isinstance(module, FSDP.FullyShardedDataParallel):
+        if isinstance(module, fsdp_file.FullyShardedDataParallel):
             module._lazy_init()
             for param in module.params:  # may have none
                 flat_param_to_fsdp_module[param] = module
@@ -1163,7 +1164,7 @@ def _optim_state_dict(
     # (`_OptimStateKey`s) and parameter IDs and broadcast rank 0's mapping
     param_to_unflat_param_names: Dict[
         torch.nn.Parameter, List[str]
-    ] = FSDP._get_param_to_unflat_param_names(model)
+    ] = fsdp_file._get_param_to_unflat_param_names(model)
     flat_param_id_to_param: List[torch.nn.Parameter] = (
         _get_param_id_to_param_from_optim_input(model, optim_input)
         if using_optim_input
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
new file mode 100644
index 0000000000000..6a3c7c86c129d
--- /dev/null
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -0,0 +1,82 @@
+from typing import Any, List, Optional, Tuple
+
+import torch
+from torch.distributed.fsdp._utils import _apply_to_tensors
+from torch.distributed.fsdp.flat_param import FlatParamHandle
+from torch.distributed.utils import _to_kwargs
+
+
+def _wait_for_computation_stream(
+    computation_stream: torch.cuda.Stream,
+    unshard_stream: torch.cuda.Stream,
+    pre_unshard_stream: torch.cuda.Stream,
+):
+    """
+    Has the unshard and pre-unshard streams wait for the computation stream.
+    For example, this should be called in the FSDP root's pre-forward to
+    respect optimizer step computation.
+    """
+    unshard_stream.wait_stream(computation_stream)
+    # Having the pre-all-gather stream wait for the current stream even if we
+    # do not leverage the pre-all-gather stream is tolerable since this only
+    # runs once per iteration
+    pre_unshard_stream.wait_stream(computation_stream)
+
+
+def _clear_grads_if_needed(
+    handles: List[FlatParamHandle],
+):
+    """
+    Clears the original parameters' gradients if needed. This method's CPU
+    overhead is minimal, so we may call it throughout FSDP methods, which serve
+    as callsites to free the gradient memory earlier.
+    """
+    for handle in handles:
+        if handle._use_orig_params:
+            handle._clear_grads_if_needed()
+
+
+def _prepare_forward_inputs(
+    device: torch.device,
+    input_dtype: Optional[torch.dtype],
+    *args: Any,
+    **kwargs: Any,
+) -> Tuple[Any, Any]:
+    """
+    Prepares the forward inputs by moving them to ``device`` and casting them
+    to ``input_dtype`` if it is not ``None``.
+    """
+    # TODO: Do not use the side stream for tensor copies for now; investigate
+    # the perf with/without it.
+    # TODO: For mixed precision, move the inputs to the compute device and cast
+    # to reduced-precision in a single `to()` call.
+    args_tuple, kwargs_tuple = _to_kwargs(args, kwargs, device.index, False)
+    args = args_tuple[0]
+    kwargs = kwargs_tuple[0]
+    if input_dtype is not None:
+        args, kwargs = _cast_fp_inputs_to_dtype(input_dtype, *args, **kwargs)
+    return args, kwargs
+
+
+def _cast_fp_inputs_to_dtype(
+    dtype: torch.dtype,
+    *args: Any,
+    **kwargs: Any,
+) -> Tuple[Any, Any]:
+    """
+    Casts floating point tensors in ``args`` and ``kwargs`` to ``input_dtype``.
+    This respects the existing ``requires_grad`` on the tensors.
+    """
+
+    def cast_fn(x: torch.Tensor) -> torch.Tensor:
+        if not torch.is_floating_point(x):
+            return x
+        y = x.to(dtype)
+        # Explicitly copy over `requires_grad` since this runs inside
+        # `torch.no_grad()`
+        if x.is_leaf:
+            y.requires_grad = x.requires_grad
+        return y
+
+    with torch.no_grad():
+        return (_apply_to_tensors(cast_fn, args), _apply_to_tensors(cast_fn, kwargs))
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 373e20b99a4b8..a5375e89c269c 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -38,7 +38,12 @@
 )
 from torch.distributed.algorithms._comm_hooks import default_hooks, LOW_PRECISION_HOOKS
 from torch.distributed.distributed_c10d import _get_default_group
-from torch.distributed.utils import _sync_params_and_buffers, _to_kwargs
+from torch.distributed.fsdp._runtime_utils import (
+    _clear_grads_if_needed,
+    _prepare_forward_inputs,
+    _wait_for_computation_stream,
+)
+from torch.distributed.utils import _sync_params_and_buffers
 
 from ._optim_utils import (
     _broadcast_pos_dim_tensor_states,
@@ -1692,6 +1697,18 @@ def fsdp_modules(
             and (not root_only or submodule.check_is_root())
         ]
 
+    @staticmethod
+    def _fsdp_handles(module: nn.Module) -> List[FlatParamHandle]:
+        """
+        Returns all nested FSDP instances' handles in the module hierarchy
+        rooted at ``module``.
+        """
+        return [
+            handle
+            for fsdp_module in FullyShardedDataParallel.fsdp_modules(module)
+            for handle in fsdp_module._handles
+        ]
+
     def apply(self, fn: Callable[[nn.Module], None]) -> "FullyShardedDataParallel":
         r"""Applies ``fn`` recursively to every submodule (as returned by ``.children()``)
         as well as self. Typical use includes initializing the parameters of a model
@@ -1756,31 +1773,6 @@ def _low_precision_hook_enabled(self) -> bool:
             and self._communication_hook in LOW_PRECISION_HOOKS
         )
 
-    def _cast_fp_inputs_to_dtype(
-        self, dtype: torch.dtype, *args: Any, **kwargs: Any
-    ) -> Tuple[Any, Any]:
-        """
-        Casts floating point tensors in ``args`` and ``kwargs`` to the
-        precision given by ``dtype``, while respecting the existing
-        ``requires_grad`` on the tensors.
-        """
-
-        def cast_fn(x: torch.Tensor) -> torch.Tensor:
-            if not torch.is_floating_point(x):
-                return x
-            y = x.to(dtype)
-            # Explicitly copy over `requires_grad` since this runs inside
-            # `torch.no_grad()`
-            if x.is_leaf:
-                y.requires_grad = x.requires_grad
-            return y
-
-        with torch.no_grad():
-            return (
-                _apply_to_tensors(cast_fn, args),
-                _apply_to_tensors(cast_fn, kwargs),
-            )
-
     def _cast_buffers(
         self,
         device: Optional[torch.device] = None,
@@ -2039,21 +2031,6 @@ def _init_streams(self) -> None:
         # CPU offloading (H2D copy) and mixed precision (low precision cast).
         self._streams["pre_unshard"] = torch.cuda.Stream()
 
-    def _wait_for_previous_optim_step(self) -> None:
-        """
-        The root :class:`FullyShardedDataParallel` instance needs to
-        synchronize with the default stream to ensure that the previous
-        optimizer step is done.
-        """
-        if not self._is_root:
-            return
-        current_stream = torch.cuda.current_stream()
-        self._streams["unshard"].wait_stream(current_stream)
-        # Having the pre-all-gather stream wait for the current stream even if
-        # we do not leverage the pre-all-gather stream is tolerable since this
-        # only runs once per iteration
-        self._streams["pre_unshard"].wait_stream(current_stream)
-
     def _prefetch_handles(
         self,
         current_handles_key: _HandlesKey,
@@ -2334,7 +2311,8 @@ def state_dict(self, *args, **kwargs):
         if torch.cuda.is_available():
             torch.cuda.synchronize()
         self._lazy_init()
-        self._clear_grads_if_needed()
+        if self._is_root:
+            _clear_grads_if_needed(self._fsdp_handles(self))
         if self._state_dict_type == StateDictType.FULL_STATE_DICT:
             # Get config args
             full_state_dict_config = (
@@ -2509,25 +2487,6 @@ def _post_forward(
             handle._training_state = HandleTrainingState.IDLE
         return output
 
-    def _cast_forward_inputs(self, *args, **kwargs):
-        """Moves the forward inputs to the compute device and casts them to the
-        appropriate dtype if needed."""
-        # TODO: Do not use the side stream for tensor copies for now;
-        # investigate the perf with/without it
-        # TODO: For mixed precision, move the inputs to the compute device and
-        # cast to reduced-precision in a single `to()` call
-        args, kwargs = _to_kwargs(args, kwargs, self.compute_device.index, False)
-        args = args[0]
-        kwargs = kwargs[0]
-        if self._mixed_precision_enabled_for_params():
-            input_dtype = self.mixed_precision.param_dtype
-            args, kwargs = self._cast_fp_inputs_to_dtype(
-                input_dtype,
-                *args,
-                **kwargs,
-            )
-        return args, kwargs
-
     def _fsdp_root_pre_forward(self, *args, **kwargs):
         """
         Runs pre-forward logic specific to the root FSDP instance, which should
@@ -2544,25 +2503,22 @@ def _fsdp_root_pre_forward(self, *args, **kwargs):
                 handles_key = tuple(fsdp_module._handles)
                 if handles_key:
                     self._needs_pre_forward_unshard[handles_key] = True
-        self._wait_for_previous_optim_step()
-        self._clear_grads_if_needed()
-        args, kwargs = self._cast_forward_inputs(*args, **kwargs)
+        _wait_for_computation_stream(
+            torch.cuda.current_stream(),
+            self._streams["unshard"],
+            self._streams["pre_unshard"],
+        )
+        _clear_grads_if_needed(self._fsdp_handles(self))
+        input_dtype = (
+            self.mixed_precision.param_dtype
+            if self._mixed_precision_enabled_for_params()
+            else None
+        )
+        args, kwargs = _prepare_forward_inputs(
+            self.compute_device, input_dtype, *args, **kwargs
+        )
         return args, kwargs
 
-    def _clear_grads_if_needed(self):
-        """
-        Iterates over all handles to clear original parameter gradients if
-        needed. See :meth:`FlatParamHandle._clear_grads_if_needed` for details.
-        Since this method's CPU overhead is minimal, we may call throughout
-        FSDP methods, which may serve as callsites to free the gradient memory
-        earlier.
-        """
-        if not self._use_orig_params or not self._is_root:
-            return
-        for fsdp_module in self.fsdp_modules(self):
-            for handle in fsdp_module._handles:
-                handle._clear_grads_if_needed()
-
     @staticmethod
     @contextlib.contextmanager
     def summon_full_params(
@@ -2710,7 +2666,8 @@ def _summon_full_params(
         for handle in self._handles:
             handle._training_state = HandleTrainingState.SUMMON_FULL_PARAMS
 
-        self._clear_grads_if_needed()
+        if self._is_root:
+            _clear_grads_if_needed(self._fsdp_handles(self))
         free_unsharded_flat_params = [
             handle.needs_unshard() for handle in self._handles
         ]
@@ -3010,7 +2967,7 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None:
                 # that it is called after all backward calls complete
                 if self._is_root and not self._post_backward_callback_queued:
                     self._queue_wait_for_post_backward()
-                    self._clear_grads_if_needed()
+                    _clear_grads_if_needed(self._fsdp_handles(self))
                 elif _handles_key:
                     self._assert_state([TrainingState_.IDLE])
                 self.training_state = TrainingState_.BACKWARD_PRE
@@ -3531,12 +3488,16 @@ def clip_grad_norm_(
             collective communications.
         """
         self._lazy_init()
-        self._wait_for_previous_optim_step()
         if not self._is_root:
             raise RuntimeError(
                 "`clip_grad_norm_()` should only be called on the root FSDP instance"
             )
         self._assert_state(TrainingState_.IDLE)
+        _wait_for_computation_stream(
+            torch.cuda.current_stream(),
+            self._streams["unshard"],
+            self._streams["pre_unshard"],
+        )
 
         max_norm = float(max_norm)
         norm_type = float(norm_type)

From 4b264272007f989cb6649c6eade5f70ee7a57dd6 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 28 Oct 2022 18:15:57 +0000
Subject: [PATCH 0304/1922] [FSDP()][2/N] Refactor training state (#87916)

This PR actually has meaningful changes. We stratify `TrainingState` into two levels: one is per FSDP instance and one is per `FlatParamHandle`/`FlatParameter`.
- At the FSDP instance level, we only care about `IDLE`, FSDP computation (i.e. `FORWARD_BACKWARD`), or `SUMMON_FULL_PARAMS`. These dynamically modify behavior (e.g. `summon_full_params()` forces full precision).
- At the `FlatParamHandle` level, we care about the training state for invariants and debugging. Hence, we keep `IDLE`, `FORWARD`, `BACKWARD_PRE`, `BACKWARD_POST`, and `SUMMON_FULL_PARAMS`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87916
Approved by: https://github.com/mrshenli
---
 .../fsdp/test_fsdp_summon_full_params.py      |  4 +-
 torch/distributed/fsdp/_common_utils.py       | 23 ++++++
 torch/distributed/fsdp/_state_dict_utils.py   | 61 ++++++++--------
 torch/distributed/fsdp/flat_param.py          | 10 +--
 .../fsdp/fully_sharded_data_parallel.py       | 73 +++++++------------
 torch/testing/_internal/common_fsdp.py        |  4 +-
 6 files changed, 87 insertions(+), 88 deletions(-)
 create mode 100644 torch/distributed/fsdp/_common_utils.py

diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py
index 5b995a9ab23f6..0d4e98069117a 100644
--- a/test/distributed/fsdp/test_fsdp_summon_full_params.py
+++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py
@@ -212,7 +212,7 @@ def forward(self, fsdp_module):
 
         model = FSDP(MyModule()).cuda(self.rank)
         with self.assertRaisesRegex(
-            ValueError, "current state is TrainingState_.FORWARD"
+            ValueError, "current state is TrainingState.FORWARD"
         ):
             model(model)
 
@@ -231,7 +231,7 @@ def bad_backwards_hook(tensor):
         output.register_hook(bad_backwards_hook)
 
         with self.assertRaisesRegex(
-            ValueError, "current state is TrainingState_.BACKWARD_PRE"
+            ValueError, "current state is TrainingState.FORWARD_BACKWARD"
         ):
             output.backward()
 
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
new file mode 100644
index 0000000000000..cef76cf91f017
--- /dev/null
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -0,0 +1,23 @@
+from enum import auto, Enum
+
+
+class TrainingState(Enum):
+    """
+    An enum that indicates the state of a ``FullyShardedDataParallel` instance.
+    """
+
+    IDLE = auto()
+    FORWARD_BACKWARD = auto()
+    SUMMON_FULL_PARAMS = auto()
+
+
+class HandleTrainingState(Enum):
+    """
+    An enum that indicates the state of a ``FlatParamHandle`.
+    """
+
+    IDLE = auto()
+    FORWARD = auto()
+    BACKWARD_PRE = auto()
+    BACKWARD_POST = auto()
+    SUMMON_FULL_PARAMS = auto()
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index 0aeaf5035e261..4e184d2e70c64 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -8,7 +8,7 @@
 import torch.distributed.algorithms._checkpoint.checkpoint_wrapper as checkpoint_wrapper
 
 # Import the entire FSDP file to avoid circular imports
-import torch.distributed.fsdp.fully_sharded_data_parallel as FSDP
+import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.distributed._shard.sharded_tensor import (
@@ -37,8 +37,8 @@ def _full_post_state_dict_hook(
     back to sharded version after _summon_full_params ends, and also remove
     the ``FSDP_WRAPPED_MODULE`` prefix.
     """
-    _replace_by_prefix(state_dict, prefix + f"{FSDP.FSDP_PREFIX}", prefix)
-    module._assert_state([FSDP.TrainingState_.SUMMON_FULL_PARAMS])
+    _replace_by_prefix(state_dict, prefix + f"{fsdp_file.FSDP_PREFIX}", prefix)
+    module._assert_state([fsdp_file.TrainingState.SUMMON_FULL_PARAMS])
     # Return early for trivial cases
     if not state_dict or not module._has_params:
         return state_dict
@@ -52,7 +52,8 @@ def _full_post_state_dict_hook(
     # exiting `summon_full_params()` via the parameter shape. However, for
     # `NO_SHARD`, we cannot tell from the shape, so we do not return early.
     if (
-        not module._use_orig_params and FSDP.FLAT_PARAM in module.module._parameters
+        not module._use_orig_params
+        and fsdp_file.FLAT_PARAM in module.module._parameters
     ) or (
         module._use_orig_params
         and module._handles
@@ -69,7 +70,7 @@ def _full_post_state_dict_hook(
     for fqn, param_name, module_name in module._param_fqns:
         fqn = f"{prefix}{fqn}"
         clean_key = fqn
-        clean_prefix = FSDP.clean_tensor_name(prefix)
+        clean_prefix = fsdp_file.clean_tensor_name(prefix)
         # Strip prefix out of key if needed as buffer names and param names
         # do not have prefix considered as they are not computed in `state_dict`
         # call.
@@ -129,12 +130,12 @@ def _full_pre_load_state_dict_hook(
     # Note that it needs writeback=True to persist.
     module._full_param_ctx = module._summon_full_params(recurse=False, writeback=True)
     module._full_param_ctx.__enter__()
-    _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP.FSDP_PREFIX}")
+    _replace_by_prefix(state_dict, prefix, prefix + f"{fsdp_file.FSDP_PREFIX}")
 
 
 def _full_post_load_state_dict_hook(module, *args, **kwargs) -> None:
     # We should exit summon_full_params context.
-    module._assert_state([FSDP.TrainingState_.SUMMON_FULL_PARAMS])
+    module._assert_state([fsdp_file.TrainingState.SUMMON_FULL_PARAMS])
     assert getattr(module, "_full_param_ctx", None) is not None
     module._full_param_ctx.__exit__(None, None, None)
     module._full_param_ctx = None
@@ -150,7 +151,7 @@ def _local_post_state_dict_hook(
     the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy
     will happen. The underlying storage is the same.
     """
-    _replace_by_prefix(state_dict, f"{prefix}{FSDP.FSDP_PREFIX}", prefix)
+    _replace_by_prefix(state_dict, f"{prefix}{fsdp_file.FSDP_PREFIX}", prefix)
     if not module._has_params:
         return state_dict
 
@@ -174,7 +175,7 @@ def _local_post_state_dict_hook(
     )  # type: ignore[assignment]
     if module._state_dict_config.offload_to_cpu:
         sharded_tensor = sharded_tensor.cpu()
-    state_dict[f"{prefix}{FSDP.FLAT_PARAM}"] = sharded_tensor
+    state_dict[f"{prefix}{fsdp_file.FLAT_PARAM}"] = sharded_tensor
     return state_dict
 
 
@@ -192,8 +193,8 @@ def _local_pre_load_state_dict_hook(
     state_dict. The flat_param should be a ShardedTensor. This hook converts
     the ShardedTensor to a tensor. No copy happen unless padding is required.
     """
-    _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP.FSDP_PREFIX}")
-    fqn = f"{prefix}{FSDP.FSDP_PREFIX}{FSDP.FLAT_PARAM}"
+    _replace_by_prefix(state_dict, prefix, f"{prefix}{fsdp_file.FSDP_PREFIX}")
+    fqn = f"{prefix}{fsdp_file.FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"
     if fqn not in state_dict:
         assert not module._has_params, (
             "No `FlatParameter` in `state_dict` for this FSDP instance "
@@ -232,11 +233,11 @@ def _sharded_post_state_dict_hook(
     The hook replaces the unflattened, unsharded parameter in the state_dict
     with a unflattened, sharded parameter (a ShardedTensor).
     """
-    _replace_by_prefix(state_dict, f"{prefix}{FSDP.FSDP_PREFIX}", prefix)
+    _replace_by_prefix(state_dict, f"{prefix}{fsdp_file.FSDP_PREFIX}", prefix)
     if not module._has_params:
         return state_dict
 
-    assert module.training_state != FSDP.TrainingState_.SUMMON_FULL_PARAMS, (
+    assert module.training_state != fsdp_file.TrainingState.SUMMON_FULL_PARAMS, (
         "Inside _sharded_post_state_dict_hook, the training_state must "
         "not be SUMMON_FULL_PARAMS."
     )
@@ -257,7 +258,7 @@ def _sharded_post_state_dict_hook(
     # For `use_orig_params=True`, the `FlatParameter` is not registered, so
     # there is no entry in the state dict for it to pop.
     if not module._use_orig_params:
-        state_dict.pop(f"{prefix}{FSDP.FLAT_PARAM}")
+        state_dict.pop(f"{prefix}{fsdp_file.FLAT_PARAM}")
     return state_dict
 
 
@@ -275,7 +276,7 @@ def _sharded_pre_load_state_dict_hook(
     The hook combines the unflattened, sharded parameters (ShardedTensor) to
     a new FlatParameter and shards the new FlatParameter to the local chunk.
     """
-    _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP.FSDP_PREFIX}")
+    _replace_by_prefix(state_dict, prefix, prefix + f"{fsdp_file.FSDP_PREFIX}")
     if not module._has_params:
         return
 
@@ -289,7 +290,7 @@ def _sharded_pre_load_state_dict_hook(
     shared_fqns = [fqn for fqn, _, _ in module._shared_param_fqns]
     loaded_shapes = []
     for fqn, _, _ in module._param_fqns:
-        full_fqn = f"{prefix}{FSDP.FSDP_PREFIX}{fqn}"
+        full_fqn = f"{prefix}{fsdp_file.FSDP_PREFIX}{fqn}"
         param = state_dict.pop(full_fqn)
         if fqn in shared_fqns:
             continue
@@ -347,7 +348,9 @@ def _sharded_pre_load_state_dict_hook(
         f"The loaded local chunk has different padding({num_to_pad}) "
         f"from the local chunk {flat_param._shard_numel_padded}."
     )
-    state_dict[f"{prefix}{FSDP.FSDP_PREFIX}{FSDP.FLAT_PARAM}"] = loaded_flat_tensor
+    state_dict[
+        f"{prefix}{fsdp_file.FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"
+    ] = loaded_flat_tensor
     if module._use_orig_params:
         module._deregister_orig_params()
 
@@ -365,11 +368,11 @@ def _post_state_dict_hook(
     what postprocessing will be done.
     """
     _post_state_dict_hook_fn = {
-        FSDP.StateDictType.FULL_STATE_DICT: _full_post_state_dict_hook,
-        FSDP.StateDictType.LOCAL_STATE_DICT: _local_post_state_dict_hook,
-        FSDP.StateDictType.SHARDED_STATE_DICT: _sharded_post_state_dict_hook,
+        fsdp_file.StateDictType.FULL_STATE_DICT: _full_post_state_dict_hook,
+        fsdp_file.StateDictType.LOCAL_STATE_DICT: _local_post_state_dict_hook,
+        fsdp_file.StateDictType.SHARDED_STATE_DICT: _sharded_post_state_dict_hook,
     }
-    fsdp_module = cast(FSDP.FullyShardedDataParallel, module)
+    fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
     processed_state_dict = _post_state_dict_hook_fn[fsdp_module._state_dict_type](
         fsdp_module, state_dict, prefix
     )
@@ -395,12 +398,12 @@ def _pre_load_state_dict_hook(
     will be done.
     """
     _pre_load_state_dict_hook_fn = {
-        FSDP.StateDictType.FULL_STATE_DICT: _full_pre_load_state_dict_hook,
-        FSDP.StateDictType.LOCAL_STATE_DICT: _local_pre_load_state_dict_hook,
-        FSDP.StateDictType.SHARDED_STATE_DICT: _sharded_pre_load_state_dict_hook,
+        fsdp_file.StateDictType.FULL_STATE_DICT: _full_pre_load_state_dict_hook,
+        fsdp_file.StateDictType.LOCAL_STATE_DICT: _local_pre_load_state_dict_hook,
+        fsdp_file.StateDictType.SHARDED_STATE_DICT: _sharded_pre_load_state_dict_hook,
     }
     # Code that is common for all state_dict impls
-    fsdp_module = cast(FSDP.FullyShardedDataParallel, module)
+    fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
     if torch.cuda.is_available():
         torch.cuda.synchronize()
     # Dispatch into state_dict specific implementation of pre-hook.
@@ -412,12 +415,12 @@ def _pre_load_state_dict_hook(
 @torch.no_grad()
 def _post_load_state_dict_hook(module: nn.Module, *args: Any) -> None:
     _post_load_state_dict_hook_fn = {
-        FSDP.StateDictType.FULL_STATE_DICT: _full_post_load_state_dict_hook,
-        FSDP.StateDictType.LOCAL_STATE_DICT: _local_post_load_state_dict_hook,
-        FSDP.StateDictType.SHARDED_STATE_DICT: _sharded_post_load_state_dict_hook,
+        fsdp_file.StateDictType.FULL_STATE_DICT: _full_post_load_state_dict_hook,
+        fsdp_file.StateDictType.LOCAL_STATE_DICT: _local_post_load_state_dict_hook,
+        fsdp_file.StateDictType.SHARDED_STATE_DICT: _sharded_post_load_state_dict_hook,
     }
     # Code that is common for all state_dict impls
-    fsdp_module = cast(FSDP.FullyShardedDataParallel, module)
+    fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
     # Dispatch into state_dict type specific implementation of post-hook for
     # loading state_dict.
     _post_load_state_dict_hook_fn[fsdp_module._state_dict_type](fsdp_module)
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 3e4eca07df7fa..21516e79d87ad 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -23,6 +23,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
+from torch.distributed.fsdp._common_utils import HandleTrainingState
 
 from ._fsdp_extensions import _ext_post_unflatten_transform, _ext_pre_flatten_transform
 from ._utils import (
@@ -41,7 +42,6 @@
     "SharedParamInfo",
     "HandleConfig",
     "HandleShardingStrategy",
-    "HandleTrainingState",
 ]
 
 
@@ -103,14 +103,6 @@ class HandleShardingStrategy(Enum):
     NO_SHARD = auto()
 
 
-class HandleTrainingState(Enum):
-    IDLE = auto()
-    FORWARD = auto()
-    BACKWARD_PRE = auto()
-    BACKWARD_POST = auto()
-    SUMMON_FULL_PARAMS = auto()
-
-
 @dataclass
 class HandleConfig:
     sharding_strategy: HandleShardingStrategy
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index a5375e89c269c..75ded291237ea 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -38,6 +38,7 @@
 )
 from torch.distributed.algorithms._comm_hooks import default_hooks, LOW_PRECISION_HOOKS
 from torch.distributed.distributed_c10d import _get_default_group
+from torch.distributed.fsdp._common_utils import HandleTrainingState, TrainingState
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
     _prepare_forward_inputs,
@@ -76,7 +77,6 @@
     FlatParamHandle,
     HandleConfig,
     HandleShardingStrategy,
-    HandleTrainingState,
 )
 from .wrap import (
     _or_policy,
@@ -110,7 +110,6 @@
     "LocalStateDictConfig",
     "ShardedStateDictConfig",
     "OptimStateKeyType",
-    "TrainingState_",
     "clean_tensor_name",
 ]
 
@@ -261,24 +260,6 @@ class BackwardPrefetch(Enum):
     # TODO, BACKWARD_PRE_CPU, prefetch full parameters and keep them in the CPU memory
 
 
-class TrainingState_(Enum):
-    """
-    Simple enum to indicate what state FSDP is in. Used for asserting
-    to make sure APIs are called in the correct state.
-    ..note::
-        ``BACKWARD_PRE`` and ``BACKWARD_POST`` states are used to ensure we
-        receives backward hooks in the correct order. It is used to catch
-        unexpected order of hooks being called (likely due to our
-        hook registration logic or autograd engine logic changes).
-    """
-
-    IDLE = auto()
-    FORWARD = auto()
-    BACKWARD_PRE = auto()
-    BACKWARD_POST = auto()
-    SUMMON_FULL_PARAMS = auto()
-
-
 class StateDictType(Enum):
     """
     This enum indicates that which type of ``state_dict`` the FSDP module is
@@ -1060,7 +1041,7 @@ def __init__(
         self.process_group = process_group or _get_default_group()
         self.rank = self.process_group.rank()
         self.world_size = self.process_group.size()
-        self.training_state = TrainingState_.IDLE
+        self.training_state = TrainingState.IDLE
         self.cpu_offload = cpu_offload or CPUOffload()
         self.backward_prefetch = backward_prefetch
         self.forward_prefetch = forward_prefetch
@@ -1725,7 +1706,7 @@ def apply(self, fn: Callable[[nn.Module], None]) -> "FullyShardedDataParallel":
             Module: self
         """
         uninitialized = self._is_root is None
-        self._assert_state(TrainingState_.IDLE)
+        self._assert_state(TrainingState.IDLE)
         with self._summon_full_params(recurse=False, writeback=True):
             ret = super().apply(fn)
 
@@ -1871,7 +1852,7 @@ def _lazy_init(self) -> None:
         # The following logic is only run on the root FSDP instance since it
         # will set `_is_root=False` for the non-root instances
         self._is_root = True
-        self._assert_state(TrainingState_.IDLE)
+        self._assert_state(TrainingState.IDLE)
         self._init_streams()
         self._cast_buffers(recurse=True)
         for handle in self._handles:
@@ -2329,7 +2310,7 @@ def state_dict(self, *args, **kwargs):
                     offload_to_cpu=offload_to_cpu,
                     rank0_only=rank0_only,
                 )
-                if self.training_state != TrainingState_.SUMMON_FULL_PARAMS
+                if self.training_state != TrainingState.SUMMON_FULL_PARAMS
                 else contextlib.suppress()
             )
             with summon_ctx:
@@ -2423,7 +2404,7 @@ def _pre_forward(
             module (nn.Module): Unused; expected by the hook signature.
             input (Any): Unused; expected by the hook signature.
         """
-        self.training_state = TrainingState_.FORWARD
+        self.training_state = TrainingState.FORWARD_BACKWARD
         self._exec_order_data.record_pre_forward(handles, self.training)
         for handle in handles:
             handle._training_state = HandleTrainingState.FORWARD
@@ -2482,7 +2463,7 @@ def _post_forward(
         # Register pre-backward hooks to unshard the flattened parameters
         # for the gradient computation (if needed)
         output = self._register_pre_backward_hooks(output, handles)
-        self.training_state = TrainingState_.IDLE
+        self.training_state = TrainingState.IDLE
         for handle in handles:
             handle._training_state = HandleTrainingState.IDLE
         return output
@@ -2659,10 +2640,10 @@ def _summon_full_params(
 
         torch.cuda.synchronize()
         self._lazy_init()
-        self._assert_state([TrainingState_.IDLE])
+        self._assert_state([TrainingState.IDLE])
         for handle in self._handles:
             assert handle._training_state == HandleTrainingState.IDLE
-        self.training_state = TrainingState_.SUMMON_FULL_PARAMS
+        self.training_state = TrainingState.SUMMON_FULL_PARAMS
         for handle in self._handles:
             handle._training_state = HandleTrainingState.SUMMON_FULL_PARAMS
 
@@ -2686,7 +2667,7 @@ def _summon_full_params(
             try:
                 yield
             finally:
-                self.training_state = TrainingState_.IDLE
+                self.training_state = TrainingState.IDLE
                 for handle in self._handles:
                     handle._training_state = HandleTrainingState.IDLE
         else:
@@ -2712,7 +2693,7 @@ def _summon_full_params(
                     self._reshard(self._handles, free_unsharded_flat_params)
                     if with_grads:
                         self._reshard_grads(self._handles)
-                    self.training_state = TrainingState_.IDLE
+                    self.training_state = TrainingState.IDLE
                     for handle in self._handles:
                         handle._training_state = HandleTrainingState.IDLE
 
@@ -2890,7 +2871,7 @@ def named_buffers(
         when inside the :meth:`summon_full_params` context manager.
         """
         should_clean_name = (
-            self.training_state == TrainingState_.SUMMON_FULL_PARAMS
+            self.training_state == TrainingState.SUMMON_FULL_PARAMS
             or self._use_orig_params
         )
         for buffer_name, buffer in super().named_buffers(*args, **kwargs):
@@ -2911,7 +2892,7 @@ def named_parameters(
         when inside the :meth:`summon_full_params` context manager.
         """
         should_clean_name = (
-            self.training_state == TrainingState_.SUMMON_FULL_PARAMS
+            self.training_state == TrainingState.SUMMON_FULL_PARAMS
             or self._use_orig_params
         )
         for param_name, param in super().named_parameters(*args, **kwargs):
@@ -2969,8 +2950,8 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None:
                     self._queue_wait_for_post_backward()
                     _clear_grads_if_needed(self._fsdp_handles(self))
                 elif _handles_key:
-                    self._assert_state([TrainingState_.IDLE])
-                self.training_state = TrainingState_.BACKWARD_PRE
+                    self._assert_state([TrainingState.IDLE])
+                self.training_state = TrainingState.FORWARD_BACKWARD
                 # Queueing the post-backward callback is the only logic that is
                 # not per-handle in the pre-backward hook, so we can return
                 # early here if there are no handles.
@@ -3063,12 +3044,12 @@ def _post_backward_hook(
         with torch.autograd.profiler.record_function(
             "FullyShardedDataParallel._post_backward_hook"
         ):
-            # First hook callback will see PRE state. If we have multiple params,
-            # then subsequent hook callbacks will see POST state.
-            self._assert_state(
-                [TrainingState_.BACKWARD_PRE, TrainingState_.BACKWARD_POST]
+            self._assert_state([TrainingState.FORWARD_BACKWARD])
+            self.training_state = TrainingState.FORWARD_BACKWARD
+            p_assert(
+                handle._training_state == HandleTrainingState.BACKWARD_PRE,
+                f"Expects `BACKWARD_PRE` state but got {handle._training_state}",
             )
-            self.training_state = TrainingState_.BACKWARD_POST
             handle._training_state = HandleTrainingState.BACKWARD_POST
 
             if (
@@ -3241,7 +3222,7 @@ def _cast_grad_to_param_dtype(
         However, if a low precision communication hook is registered, then this
         dtype cast happens in the hook instead.
         """
-        self._assert_state(TrainingState_.BACKWARD_POST)
+        self._assert_state(TrainingState.FORWARD_BACKWARD)
         if not self._low_precision_hook_enabled() and (
             self._mixed_precision_enabled_for_params()
             or self._mixed_precision_enabled_for_reduce()
@@ -3268,7 +3249,7 @@ def _queue_wait_for_post_backward(self) -> None:
         )
         if self._post_backward_callback_queued:
             return
-        self._assert_state([TrainingState_.IDLE])
+        self._assert_state([TrainingState.IDLE])
         self._post_backward_callback_queued = True
         Variable._execution_engine.queue_callback(self._wait_for_post_backward)
 
@@ -3362,7 +3343,7 @@ def _finalize_params(fsdp_module: FullyShardedDataParallel) -> None:
             _catch_all_reshard(m)
             _finalize_params(m)
             m._ran_pre_backward_hook.clear()
-            m.training_state = TrainingState_.IDLE
+            m.training_state = TrainingState.IDLE
             for handle in m._handles:
                 handle._training_state = HandleTrainingState.IDLE
             m._handles_prefetched.clear()
@@ -3401,12 +3382,12 @@ def _param_exec_order_policy_second_iter_init(self) -> None:
         # TODO (linjianma): Patch the forward of each model in the keys
         # of fsdp_wrap_map based on the information above.
 
-    def _assert_state(self, state: Union[TrainingState_, List[TrainingState_]]) -> None:
+    def _assert_state(self, state: Union[TrainingState, List[TrainingState]]) -> None:
         """Assert we are in the given state."""
         # Since assert can be turned off and this error checking
         # is really important, we use explicit error checking
         # and raise a ValueError if needed.
-        if isinstance(state, TrainingState_):
+        if isinstance(state, TrainingState):
             state = [state]
         if self.training_state not in state:
             msg = (
@@ -3444,7 +3425,7 @@ def no_sync(self) -> Generator:
             raise RuntimeError(
                 "`no_sync()` on inner FSDP instances is not supported. Please call `no_sync()` on root FSDP module."
             )
-        self._assert_state(TrainingState_.IDLE)
+        self._assert_state(TrainingState.IDLE)
         old_flags = []
         for m in self.modules():
             if isinstance(m, FullyShardedDataParallel):
@@ -3492,7 +3473,7 @@ def clip_grad_norm_(
             raise RuntimeError(
                 "`clip_grad_norm_()` should only be called on the root FSDP instance"
             )
-        self._assert_state(TrainingState_.IDLE)
+        self._assert_state(TrainingState.IDLE)
         _wait_for_computation_stream(
             torch.cuda.current_stream(),
             self._streams["unshard"],
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 3c159313f0890..a6644df76d497 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -14,11 +14,11 @@
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed.fsdp import CPUOffload, FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp._common_utils import TrainingState
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     BackwardPrefetch,
     MixedPrecision,
     ShardingStrategy,
-    TrainingState_,
 )
 from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
 from torch.distributed.fsdp.wrap import (
@@ -884,7 +884,7 @@ def _train_for_several_steps(
                 model.load_state_dict(state_dict)
 
         if isinstance(model, FSDP):
-            model._assert_state(TrainingState_.IDLE)
+            model._assert_state(TrainingState.IDLE)
         return loss.detach()
 
     def _test_fsdp_parity(

From c96c827de6978b098b81eec77b73e68f7de04237 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 28 Oct 2022 17:20:10 -0400
Subject: [PATCH 0305/1922] Fix pybind11 problems with c10::SymInt unregistered
 (#88011)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88011
Approved by: https://github.com/weiwangmeta, https://github.com/albanD
---
 build_variables.bzl                  |  1 +
 torch/csrc/utils/pybind.cpp          | 69 +++++++++++++++++++++++++
 torch/csrc/utils/pybind.h            | 24 +++++++++
 torch/csrc/utils/python_arg_parser.h | 76 ----------------------------
 4 files changed, 94 insertions(+), 76 deletions(-)
 create mode 100644 torch/csrc/utils/pybind.cpp

diff --git a/build_variables.bzl b/build_variables.bzl
index 789d28bf786d7..06e8f7bf4b606 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -960,6 +960,7 @@ libtorch_python_core_sources = [
     "torch/csrc/utils/python_arg_parser.cpp",
     "torch/csrc/utils/python_dispatch.cpp",
     "torch/csrc/utils/python_symnode.cpp",
+    "torch/csrc/utils/pybind.cpp",
     "torch/csrc/utils/structseq.cpp",
     "torch/csrc/utils/tensor_apply.cpp",
     "torch/csrc/utils/tensor_dtypes.cpp",
diff --git a/torch/csrc/utils/pybind.cpp b/torch/csrc/utils/pybind.cpp
new file mode 100644
index 0000000000000..37e37a873774b
--- /dev/null
+++ b/torch/csrc/utils/pybind.cpp
@@ -0,0 +1,69 @@
+#include <torch/csrc/utils/pybind.h>
+#include <torch/csrc/utils/python_arg_parser.h>
+#include <torch/csrc/utils/python_symnode.h>
+
+namespace pybind11 {
+namespace detail {
+
+bool type_caster<c10::SymInt>::load(py::handle src, bool) {
+  if (torch::is_symint(src)) {
+    value = c10::SymInt(static_cast<c10::SymNode>(
+        c10::make_intrusive<torch::impl::PythonSymNodeImpl>(src.attr("node"))));
+    return true;
+  }
+
+  auto raw_obj = src.ptr();
+  if (THPUtils_checkIndex(raw_obj)) {
+    value = c10::SymInt{THPUtils_unpackIndex(raw_obj)};
+    return true;
+  }
+  return false;
+}
+
+py::handle type_caster<c10::SymInt>::cast(
+    c10::SymInt si,
+    return_value_policy /* policy */,
+    handle /* parent */) {
+  if (si.is_symbolic()) {
+    // TODO: generalize this to work with C++ backed class
+    auto* py_node =
+        dynamic_cast<torch::impl::PythonSymNodeImpl*>(si.toSymNodeImpl().get());
+    TORCH_INTERNAL_ASSERT(py_node);
+    return torch::get_symint_class()(py_node->getPyObj()).release();
+  } else {
+    return py::cast(si.as_int_unchecked()).release();
+  }
+}
+
+bool type_caster<c10::SymFloat>::load(py::handle src, bool) {
+  if (torch::is_symfloat(src)) {
+    value = c10::SymFloat(static_cast<c10::SymNode>(
+        c10::make_intrusive<torch::impl::PythonSymNodeImpl>(src.attr("node"))));
+    return true;
+  }
+
+  auto raw_obj = src.ptr();
+  if (THPUtils_checkDouble(raw_obj)) {
+    value = c10::SymFloat{THPUtils_unpackDouble(raw_obj)};
+    return true;
+  }
+  return false;
+}
+
+py::handle type_caster<c10::SymFloat>::cast(
+    c10::SymFloat si,
+    return_value_policy /* policy */,
+    handle /* parent */) {
+  if (si.is_symbolic()) {
+    // TODO: generalize this to work with C++ backed class
+    auto* py_node =
+        dynamic_cast<torch::impl::PythonSymNodeImpl*>(si.toSymNodeImpl().get());
+    TORCH_INTERNAL_ASSERT(py_node);
+    return torch::get_symfloat_class()(py_node->getPyObj()).release();
+  } else {
+    return py::cast(si.as_float_unchecked()).release();
+  }
+}
+
+} // namespace detail
+} // namespace pybind11
diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h
index 8434960160d5a..c43cf5e732832 100644
--- a/torch/csrc/utils/pybind.h
+++ b/torch/csrc/utils/pybind.h
@@ -187,6 +187,30 @@ struct type_caster<c10::DispatchKey>
   }
 };
 
+template <>
+struct type_caster<c10::SymInt> {
+ public:
+  PYBIND11_TYPE_CASTER(c10::SymInt, _("SymInt"));
+  bool load(py::handle src, bool);
+
+  static py::handle cast(
+      c10::SymInt si,
+      return_value_policy /* policy */,
+      handle /* parent */);
+};
+
+template <>
+struct type_caster<c10::SymFloat> {
+ public:
+  PYBIND11_TYPE_CASTER(c10::SymFloat, _("SymFloat"));
+  bool load(py::handle src, bool);
+
+  static py::handle cast(
+      c10::SymFloat si,
+      return_value_policy /* policy */,
+      handle /* parent */);
+};
+
 // Pybind11 bindings for our optional and variant types.
 // http://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html#c-17-library-containers
 template <typename T>
diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h
index df084821ba255..c766d93a70ebf 100644
--- a/torch/csrc/utils/python_arg_parser.h
+++ b/torch/csrc/utils/python_arg_parser.h
@@ -79,82 +79,6 @@
 #include <string>
 #include <vector>
 
-namespace pybind11 {
-namespace detail {
-template <>
-struct type_caster<c10::SymInt> {
- public:
-  PYBIND11_TYPE_CASTER(c10::SymInt, _("SymInt"));
-  bool load(py::handle src, bool) {
-    if (torch::is_symint(src)) {
-      value = c10::SymInt(static_cast<c10::SymNode>(
-          c10::make_intrusive<torch::impl::PythonSymNodeImpl>(
-              src.attr("node"))));
-      return true;
-    }
-
-    auto raw_obj = src.ptr();
-    if (THPUtils_checkIndex(raw_obj)) {
-      value = c10::SymInt{THPUtils_unpackIndex(raw_obj)};
-      return true;
-    }
-    return false;
-  }
-
-  static py::handle cast(
-      c10::SymInt si,
-      return_value_policy /* policy */,
-      handle /* parent */) {
-    if (si.is_symbolic()) {
-      // TODO: generalize this to work with C++ backed class
-      auto* py_node = dynamic_cast<torch::impl::PythonSymNodeImpl*>(
-          si.toSymNodeImpl().get());
-      TORCH_INTERNAL_ASSERT(py_node);
-      return torch::get_symint_class()(py_node->getPyObj()).release();
-    } else {
-      return py::cast(si.as_int_unchecked()).release();
-    }
-  }
-};
-
-template <>
-struct type_caster<c10::SymFloat> {
- public:
-  PYBIND11_TYPE_CASTER(c10::SymFloat, _("SymFloat"));
-  bool load(py::handle src, bool) {
-    if (torch::is_symfloat(src)) {
-      value = c10::SymFloat(static_cast<c10::SymNode>(
-          c10::make_intrusive<torch::impl::PythonSymNodeImpl>(
-              src.attr("node"))));
-      return true;
-    }
-
-    auto raw_obj = src.ptr();
-    if (THPUtils_checkDouble(raw_obj)) {
-      value = c10::SymFloat{THPUtils_unpackDouble(raw_obj)};
-      return true;
-    }
-    return false;
-  }
-
-  static py::handle cast(
-      c10::SymFloat si,
-      return_value_policy /* policy */,
-      handle /* parent */) {
-    if (si.is_symbolic()) {
-      // TODO: generalize this to work with C++ backed class
-      auto* py_node = dynamic_cast<torch::impl::PythonSymNodeImpl*>(
-          si.toSymNodeImpl().get());
-      TORCH_INTERNAL_ASSERT(py_node);
-      return torch::get_symfloat_class()(py_node->getPyObj()).release();
-    } else {
-      return py::cast(si.as_float_unchecked()).release();
-    }
-  }
-};
-} // namespace detail
-} // namespace pybind11
-
 inline bool THPUtils_checkScalar(PyObject* obj) {
 #ifdef USE_NUMPY
   if (torch::utils::is_numpy_scalar(obj)) {

From d3fd93591ea49724ac97ee31bb33ab833c7f168b Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Sat, 29 Oct 2022 08:34:13 +0000
Subject: [PATCH 0306/1922] Store usage log on GitHub when S3 is not available
 (#87947)

It turns out that we haven't uploaded the usage log to GitHub when S3 is not available (macos, rocm), for example, https://github.com/pytorch/pytorch/actions/runs/3325822440#artifacts only includes test-report, test-json, sccache stats, and build artifacts.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87947
Approved by: https://github.com/clee2000
---
 .github/actions/upload-test-artifacts/action.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml
index 67083a103e06a..4b9c75c38b1b6 100644
--- a/.github/actions/upload-test-artifacts/action.yml
+++ b/.github/actions/upload-test-artifacts/action.yml
@@ -129,3 +129,13 @@ runs:
         retention-days: 14
         if-no-files-found: error
         path: test/**/*.xml
+
+    - name: Store Usage Logs on Github
+      uses: actions/upload-artifact@v3
+      if: inputs.use-gha
+      with:
+        # Add the run attempt, see [Artifact run attempt]
+        name: usage-log-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
+        retention-days: 14
+        if-no-files-found: ignore
+        path: usage_log.txt

From a9b22f2fa050e048f2988d7ce6f5533814962f7c Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Sat, 29 Oct 2022 08:43:45 +0000
Subject: [PATCH 0307/1922] Cleanup macos common conda installation (#87816)

The conda dependencies have all been installed for `_mac-test` in https://github.com/pytorch/pytorch/pull/87541.  I missed the same step for `_mac-build` and `_mac-test-mps` workflows, so both are also updated here. Note that arm64 is cross-compiled from x86, so the env file needs to be set explicitly in that case

After this one, I have a WIP PR to consolidate macos pip dependencies next
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87816
Approved by: https://github.com/ZainRizvi
---
 .github/requirements/conda-env-macOS-ARM64 |  6 ++-
 .github/workflows/_mac-build.yml           | 15 +++++++
 .github/workflows/_mac-test-mps.yml        |  2 +-
 .github/workflows/mac-mps.yml              |  4 ++
 .github/workflows/trunk.yml                |  4 ++
 .jenkins/pytorch/macos-common.sh           | 46 ----------------------
 .jenkins/pytorch/macos-test.sh             |  1 -
 7 files changed, 29 insertions(+), 49 deletions(-)

diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64
index 6e7e4221a85ba..a031b014365fc 100644
--- a/.github/requirements/conda-env-macOS-ARM64
+++ b/.github/requirements/conda-env-macOS-ARM64
@@ -8,9 +8,13 @@ dataclasses=0.8
 pip=22.2.2
 six=1.16.0
 pillow=9.2.0
-libuv=1.39.0
 pkg-config=0.29.2
 wheel=0.37.1
+expecttest=0.1.3
 
 # Not pinning certifi so that we can always get the latest certificates
 certifi
+
+# Cross-compiling arm64 from x86-64 picks up 1.40.0 while testing on arm64
+# itself only has up to 1.39.0 from upstream conda. Both work though
+libuv>=1.39.0,<=1.40.0
diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index 557d3c7b292c1..9f0c988f4a311 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -33,6 +33,10 @@ on:
         default: "3.8"
         description: |
           The python version to be used. Will be 3.8 by default
+      environment-file:
+        required: false
+        type: string
+        description: Set the conda environment file used to setup macOS build.
       test-matrix:
         required: false
         type: string
@@ -83,9 +87,20 @@ jobs:
           fi
 
       - name: Setup miniconda
+        if: inputs.environment-file == ''
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: ${{ inputs.python_version }}
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
+
+      # This option is used when cross-compiling arm64 from x86-64. Specifically, we need arm64 conda
+      # environment even though the arch is x86-64
+      - name: Setup miniconda using the provided environment file
+        if: inputs.environment-file != ''
         uses: pytorch/test-infra/.github/actions/setup-miniconda@main
         with:
           python-version: ${{ inputs.python_version }}
+          environment-file: ${{ inputs.environment-file }}
 
       - name: Install macOS homebrew dependencies
         run: |
diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index e2c6ec74d3f44..57e97e499c460 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -41,6 +41,7 @@ jobs:
         uses: pytorch/test-infra/.github/actions/setup-miniconda@main
         with:
           python-version: 3.9
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
 
       - name: Install PyTorch
         env:
@@ -50,7 +51,6 @@ jobs:
         run: |
           # shellcheck disable=SC1090
           set -ex
-          ${CONDA_INSTALL} expecttest numpy=1.22.3 pyyaml=6.0
           ${CONDA_RUN} python3 -mpip install "unittest-xml-reporting<=3.2.0,>=2.0.0"
           # As wheels are cross-compiled they are reported as x86_64 ones
           ORIG_WHLNAME=$(ls -1 dist/*.whl); ARM_WHLNAME=${ORIG_WHLNAME/x86_64/arm64}; mv ${ORIG_WHLNAME} ${ARM_WHLNAME}
diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml
index 8fc2dd8336bff..5df7299cc5076 100644
--- a/.github/workflows/mac-mps.yml
+++ b/.github/workflows/mac-mps.yml
@@ -22,6 +22,10 @@ jobs:
       build-generates-artifacts: true
       # To match the one pre-installed in the m1 runners
       python_version: 3.9.12
+      # We need to set the environment file here instead of trying to detect it automatically because
+      # MacOS arm64 is cross-compiled from x86-64. Specifically, it means that arm64 conda environment
+      # is needed when building PyTorch MacOS arm64 from x86-64
+      environment-file: .github/requirements/conda-env-macOS-ARM64
     secrets:
       MACOS_SCCACHE_S3_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
       MACOS_SCCACHE_S3_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index d92c5a079d978..48a89e47ed7bc 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -226,6 +226,10 @@ jobs:
       build-generates-artifacts: true
       # To match the one pre-installed in the m1 runners
       python_version: 3.9.12
+      # We need to set the environment file here instead of trying to detect it automatically because
+      # MacOS arm64 is cross-compiled from x86-64. Specifically, it means that arm64 conda environment
+      # is needed when building PyTorch MacOS arm64 from x86-64
+      environment-file: .github/requirements/conda-env-macOS-ARM64
       test-matrix: |
         { include: [
           { config: "default", shard: 1, num_shards: 2, runner: "macos-m1-12" },
diff --git a/.jenkins/pytorch/macos-common.sh b/.jenkins/pytorch/macos-common.sh
index 319e88e40aa8d..d1b31ec941889 100755
--- a/.jenkins/pytorch/macos-common.sh
+++ b/.jenkins/pytorch/macos-common.sh
@@ -7,52 +7,6 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 
 sysctl -a | grep machdep.cpu
 
-if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
-  # We use different versions here as the arm build/tests runs on python 3.9
-  # while the x86 one runs on python 3.8
-  retry conda install -y \
-    numpy=1.22.3 \
-    pyyaml=6.0 \
-    setuptools=61.2.0 \
-    cmake=3.22.1 \
-    cffi \
-    ninja \
-    typing_extensions \
-    dataclasses \
-    pip
-else
-  # NOTE: mkl 2021.3.0+ cmake requires sub-command PREPEND, may break the build
-  retry conda install -y \
-    mkl=2021.2.0 \
-    mkl-include=2021.2.0 \
-    numpy=1.18.5 \
-    pyyaml=5.3 \
-    setuptools=46.0.0 \
-    cmake=3.22.1 \
-    cffi \
-    ninja \
-    typing_extensions \
-    dataclasses \
-    pip
-fi
-
-# The torch.hub tests make requests to GitHub.
-#
-# The certifi package from conda-forge is new enough to make the
-# following error disappear (included for future reference):
-#
-# > ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED]
-# > certificate verify failed: unable to get local issuer certificate
-# > (_ssl.c:1056)
-#
-retry conda install -y -c conda-forge certifi wheel=0.36.2
-
-# Needed by torchvision, which is imported from TestHub in test_utils.py.
-retry conda install -y pillow
-
-# Building with USE_DISTRIBUTED=1 requires libuv (for Gloo).
-retry conda install -y libuv pkg-config
-
 # These are required for both the build job and the test job.
 # In the latter to test cpp extensions.
 export MACOSX_DEPLOYMENT_TARGET=10.9
diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh
index 7103f1a5dbee3..b598efff22ed4 100755
--- a/.jenkins/pytorch/macos-test.sh
+++ b/.jenkins/pytorch/macos-test.sh
@@ -4,7 +4,6 @@
 # shellcheck source=./macos-common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh"
 
-conda install -y six
 if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
   pip install hypothesis "expecttest==0.1.3" "librosa>=0.6.2" "numba==0.56.0" psutil "scipy==1.9.0"
 else

From 407b996e760aa5accb09c286221a03da3edc29c2 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Fri, 28 Oct 2022 19:31:23 +0000
Subject: [PATCH 0308/1922] [ONNX] Disable ONNX ceil_mode and count_include_pad
 to aligntorch ceil_mode results in corner case (#87892)

ONNX and PyTorch has different equation on pooling and different strategy on ceil_mode, which leads to discrepancy on corner case (#71549 ).
Specifically, PyTorch avereage pooling is not following [the equation on documentation](https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html), it allows sliding window to go off-bound instead, if they start within the left padding or the input (in NOTE section). More details can be found in #57178.

This PR changes avgpool in opset 10 and 11 back the way as opset 9, which it stops using ceil_mode and count_include_pad  in onnx::AveragePool

A comprehensive test for all combinations of parameters can be found in the next PR. #87893
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87892
Approved by: https://github.com/BowenBao
---
 .../TestOperators.test_avg_pool2d.expect      |  7 +--
 torch/onnx/symbolic_opset10.py                | 49 +++----------------
 torch/onnx/symbolic_opset11.py                | 24 +++++++--
 torch/onnx/symbolic_opset9.py                 |  9 +++-
 4 files changed, 35 insertions(+), 54 deletions(-)

diff --git a/test/onnx/expect/TestOperators.test_avg_pool2d.expect b/test/onnx/expect/TestOperators.test_avg_pool2d.expect
index d551ff38f809b..4839fb5a35a7d 100644
--- a/test/onnx/expect/TestOperators.test_avg_pool2d.expect
+++ b/test/onnx/expect/TestOperators.test_avg_pool2d.expect
@@ -1,6 +1,6 @@
 ir_version: 7
 producer_name: "pytorch"
-producer_version: "CURRENT_VERSION"
+producer_version: "1.14.0"
 graph {
   node {
     output: "onnx::Pad_1"
@@ -33,11 +33,6 @@ graph {
     output: "3"
     name: "AveragePool_2"
     op_type: "AveragePool"
-    attribute {
-      name: "ceil_mode"
-      i: 0
-      type: INT
-    }
     attribute {
       name: "kernel_shape"
       ints: 3
diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py
index f20a1290ca17a..27cb161a1ae30 100644
--- a/torch/onnx/symbolic_opset10.py
+++ b/torch/onnx/symbolic_opset10.py
@@ -1,7 +1,7 @@
 import functools
 import sys
 import warnings
-from typing import Callable, Sequence
+from typing import Callable
 
 import torch
 import torch._C._onnx as _C_onnx
@@ -251,47 +251,12 @@ def symbolic_fn(g, input, kernel_size, stride, padding, dilation, ceil_mode):
 )
 @_beartype.beartype
 def _avg_pool(name, tuple_fn):
-    @symbolic_helper.quantized_args(True, False, False, False, False, False, False)
-    @symbolic_helper.parse_args("v", "is", "is", "is", "i", "i", "none")
-    @_beartype.beartype
-    def symbolic_fn(
-        g,
-        input: _C.Value,
-        kernel_size: Sequence[int],
-        stride: Sequence[int],
-        padding: Sequence[int],
-        ceil_mode: int,
-        count_include_pad: int,
-        divisor_override=None,
-    ):
-        if not stride:
-            stride = kernel_size
-        padding = symbolic_helper._avgpool_helper(
-            tuple_fn, padding, kernel_size, stride, divisor_override, name
-        )
-        assert isinstance(padding, tuple)
-        if count_include_pad:
-            input = opset9._op_with_optional_float_cast(
-                g,
-                "Pad",
-                input,
-                pads_i=((0,) * 2 + padding) * 2,
-                mode_s="constant",
-                value_f=0.0,
-                opset_before=11,
-            )
-            padding = (0,) * len(padding)
-        output = g.op(
-            "AveragePool",
-            input,
-            kernel_shape_i=tuple_fn(kernel_size),
-            strides_i=tuple_fn(stride),
-            pads_i=padding * 2,
-            ceil_mode_i=ceil_mode,
-        )
-        return output
-
-    return symbolic_fn
+    # Although onnx::AvgPool provides count_include_pad and ceil_mode,
+    # The corner case of Average Pooling with ceil_mode on
+    # PyTorch allows sliding window go off bound, which leads to
+    # this accommodation.
+    # More detail on https://github.com/pytorch/pytorch/issues/57178
+    return opset9._avg_pool(name, tuple_fn)
 
 
 @_onnx_symbolic(
diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py
index c845d6dcc2e4d..6c71cc1651562 100644
--- a/torch/onnx/symbolic_opset11.py
+++ b/torch/onnx/symbolic_opset11.py
@@ -590,12 +590,18 @@ def symbolic_fn(
         count_include_pad: int,
         divisor_override=None,
     ):
+        # Although onnx::AvgPool provides count_include_pad and ceil_mode,
+        # The corner case of Average Pooling with ceil_mode on
+        # PyTorch allows sliding window go off bound, which leads to
+        # this accommodation.
+        # More detail on https://github.com/pytorch/pytorch/issues/57178
+        if not stride:
+            stride = kernel_size
         padding = symbolic_helper._avgpool_helper(
             tuple_fn, padding, kernel_size, stride, divisor_override, name
         )
         assert isinstance(padding, tuple)
-        if not stride:
-            stride = kernel_size
+        adjusted_padding = padding
         if count_include_pad:
             input = g.op(
                 "Pad",
@@ -603,14 +609,22 @@ def symbolic_fn(
                 g.op("Constant", value_t=torch.tensor(((0,) * 2 + padding) * 2)),
                 mode_s="constant",
             )
-            padding = (0,) * len(padding)
+            adjusted_padding = (0,) * len(padding)
+        if ceil_mode:
+            padding_ceil = opset9.get_pool_ceil_padding(
+                input, kernel_size, stride, padding
+            )
+            adjusted_padding = adjusted_padding + tuple(
+                a + b for (a, b) in zip(padding_ceil, adjusted_padding)
+            )
+        else:
+            adjusted_padding = adjusted_padding * 2
         output = g.op(
             "AveragePool",
             input,
             kernel_shape_i=tuple_fn(kernel_size),
             strides_i=tuple_fn(stride),
-            pads_i=padding * 2,
-            ceil_mode_i=ceil_mode,
+            pads_i=adjusted_padding,
         )
         return output
 
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index bbb97f3f8d794..d31bb8d1a9d62 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -1650,13 +1650,20 @@ def symbolic_fn(
         )
         assert isinstance(padding, tuple)
         adjusted_padding = padding
+        # Although onnx::AvgPool provides count_include_pad,
+        # The corner case of Average Pooling with ceil_mode on
+        # PyTorch allows sliding window go off bound, which leads to
+        # this accommodation.
+        # More detail on https://github.com/pytorch/pytorch/issues/57178
         if count_include_pad:
-            input = g.op(
+            input = _op_with_optional_float_cast(
+                g,
                 "Pad",
                 input,
                 pads_i=((0,) * 2 + padding) * 2,
                 mode_s="constant",
                 value_f=0.0,
+                opset_before=11,
             )
             adjusted_padding = (0,) * len(padding)
         if ceil_mode:

From 2af74ae84da48e2c1871a620cc7630312e716598 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Fri, 28 Oct 2022 19:31:23 +0000
Subject: [PATCH 0309/1922] [ONNX] Parametrized Avgpool2D test to have all test
 combinations (#87893)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87893
Approved by: https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 24 ++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 4577dafdad56c..9f95227e009a9 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -1412,8 +1412,26 @@ def test_avgpool_1d_ceil(self):
         x = torch.randn(1, 1, 7)
         self.run_test(model, x)
 
-    def test_avgpool_2d_ceil(self):
-        model = torch.nn.AvgPool2d(3, 2, ceil_mode=True)
+    @common_utils.parametrize(
+        "padding",
+        (0, 1),
+    )
+    @common_utils.parametrize(
+        "ceil_mode",
+        (True, False),
+    )
+    @common_utils.parametrize(
+        "count_include_pad",
+        (True, False),
+    )
+    def test_avgpool_2d(self, padding, ceil_mode, count_include_pad):
+        model = torch.nn.AvgPool2d(
+            3,
+            3,
+            padding=padding,
+            ceil_mode=ceil_mode,
+            count_include_pad=count_include_pad,
+        )
         x = torch.randn(20, 16, 50, 32)
         self.run_test(model, x)
 
@@ -12409,8 +12427,6 @@ def forward(self, x) -> Optional[Tensor]:
                 y = None
             return y
 
-    #  Skip now to wait more insight on https://github.com/onnx/onnx/issues/4424
-    #  Model fails on type inference, as it's input/output type mismatch.
     class LoopNoneInput(torch.nn.Module):
         def forward(self, x) -> Optional[Tensor]:
             y: Optional[Tensor] = None

From a4e17c621c2c31b57a3f20d0f224fb8c1eeda9ed Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Fri, 28 Oct 2022 19:54:52 +0000
Subject: [PATCH 0310/1922] [ONNX] Add internal node kind parsing (#87638)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87638
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 torch/onnx/_internal/jit_utils.py | 11 +++++++++++
 torch/onnx/_patch_torch.py        |  6 +++---
 torch/onnx/utils.py               |  4 ++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/torch/onnx/_internal/jit_utils.py b/torch/onnx/_internal/jit_utils.py
index d5be7675d0a44..6354cea73fc04 100644
--- a/torch/onnx/_internal/jit_utils.py
+++ b/torch/onnx/_internal/jit_utils.py
@@ -321,3 +321,14 @@ def get_device_from_value(value: _C.Value) -> Optional[torch.device]:
         return None
     tensor_type = typing.cast(_C.TensorType, value.type())
     return tensor_type.device()
+
+
+@_beartype.beartype
+def parse_node_kind(kind: str) -> Tuple[str, str]:
+    """Parse node kind into domain and Op name."""
+    if "::" not in kind:
+        raise ValueError(f"Node kind: {kind} is invalid. '::' is not in node kind.")
+    domain, opname = kind.split("::", 1)
+    if "::" in opname:
+        raise ValueError(f"Node kind: {kind} is invalid. '::' should only apear once.")
+    return domain, opname
diff --git a/torch/onnx/_patch_torch.py b/torch/onnx/_patch_torch.py
index 04861ab774395..5a23d584ec257 100644
--- a/torch/onnx/_patch_torch.py
+++ b/torch/onnx/_patch_torch.py
@@ -10,7 +10,7 @@
 # Import utils to get _params_dict because it is a global that is accessed by c++ code
 from torch.onnx import _deprecation, utils
 from torch.onnx._globals import GLOBALS
-from torch.onnx._internal import _beartype
+from torch.onnx._internal import _beartype, jit_utils
 
 _ATTR_PATTERN = re.compile("^(.+)_(([ifstgz])|(ty))$")
 
@@ -70,7 +70,7 @@ def _graph_op(
     args = [_const_if_tensor(g, arg) for arg in raw_args]
 
     if "::" in opname:
-        namespace, op = opname.split("::")
+        namespace, op = jit_utils.parse_node_kind(opname)
     else:
         namespace = "onnx"
         op = opname
@@ -124,7 +124,7 @@ def _aten_op(g: _C.Graph, operator: str, *args, overload_name: str = "", **kwarg
 @_beartype.beartype
 def _block_op(block: _C.Block, opname: str, *args: _C.Value, **kwargs):
     if "::" in opname:
-        namespace, op = opname.split("::")
+        namespace, op = jit_utils.parse_node_kind(opname)
     else:
         namespace = "onnx"
         op = opname
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index 7cee61ed70b46..251e6be09e982 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -1833,7 +1833,7 @@ def _run_symbolic_function(
     else:
         ns_op_name = node_kind
 
-    namespace, op_name = ns_op_name.split("::")
+    namespace, op_name = jit_utils.parse_node_kind(ns_op_name)
 
     graph_context = jit_utils.GraphContext(
         graph=graph,
@@ -1930,7 +1930,7 @@ def _verify_custom_op_name(symbolic_name: str):
             "alphanumerical characters"
         )
 
-    ns, _ = symbolic_name.split("::")
+    ns, _ = jit_utils.parse_node_kind(symbolic_name)
     if ns == "onnx":
         raise ValueError(
             f"Failed to register operator {symbolic_name}. {ns} domain cannot be modified."

From 2d51b083524cc0c66ceee1a5fa6d2d4b60adca17 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 28 Oct 2022 13:28:39 -0700
Subject: [PATCH 0311/1922] Unify meta tensor and fake tensor converter
 conversion (#87943)

Meta tensor does a lot of work to make sure tensors "look" similar
to the original parts; e.g., if the original was a non-leaf, meta
converter ensures the meta tensor is a non-leaf too.  Fake tensor
destroyed some of these properties when it wraps it in a FakeTensor.

This patch pushes the FakeTensor constructor into the meta converter
itself, so that we first create a fake tensor, and then we do various
convertibility bits to it to make it look right.

The two tricky bits:

- We need to have no_dispatch enabled when we allocate the initial meta
  tensor, or fake tensor gets mad at us for making a meta fake tensor.
  This necessitates the double-callback structure of the callback
  arguments: the meta construction happens *inside* the function so
  it is covered by no_dispatch

- I can't store tensors for the storages anymore, as that will result
  in a leak.  But we have untyped storage now, so I just store untyped
  storages instead.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87943
Approved by: https://github.com/eellison, https://github.com/albanD
---
 test/dynamo/test_unspec.py                    |   3 -
 test/test_meta.py                             | 154 ++++++++--
 .../templates/python_variable_methods.cpp     |  12 +-
 torch/_subclasses/fake_tensor.py              | 275 +++++++++---------
 torch/_subclasses/meta_utils.py               | 152 ++++++++--
 torch/testing/_internal/common_utils.py       |   2 +
 6 files changed, 405 insertions(+), 193 deletions(-)

diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py
index 22f975d0f9d68..fd5396981b740 100644
--- a/test/dynamo/test_unspec.py
+++ b/test/dynamo/test_unspec.py
@@ -50,9 +50,6 @@ class UnspecTest(cls):
 UnspecReproTests = make_unspec_cls(test_repros.ReproTests)
 UnspecNNModuleTests = make_unspec_cls(test_modules.NNModuleTests)
 
-# RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
-unittest.expectedFailure(UnspecReproTests.test_batch_norm_act_unspec)
-
 
 @patch.object(torch._dynamo.config, "specialize_int_float", False)
 class UnspecTests(torch._dynamo.test_case.TestCase):
diff --git a/test/test_meta.py b/test/test_meta.py
index 1abe4cd2cda75..1499885bc607d 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -66,6 +66,34 @@ def assertSameVersionCounter(self, m1, m2):
         self.assertNotEqual(m1._version, vc)
         self.assertEqual(m2._version, m1._version)
 
+    def assertMetadataMatches(self, m1, m2):
+        self.assertEqual(m1.dtype, m2.dtype)
+        self.assertEqual(m1.shape, m2.shape)
+        self.assertEqual(m1.requires_grad, m2.requires_grad)
+        self.assertEqual(m1.is_leaf, m2.is_leaf)
+        self.assertEqual(m1.grad_fn is None, m2.grad_fn is None)
+        self.assertEqual(m1.is_sparse, m2.is_sparse)
+        self.assertEqual(m1.is_inference(), m2.is_inference())
+        self.assertEqual(m1.is_conj(), m2.is_conj())
+        self.assertEqual(m1.is_neg(), m2.is_neg())
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
+            grad_not_none = m1.grad is not None
+        if grad_not_none:
+            self.assertMetadataMatches(m1.grad, m2.grad)
+        if m1.is_sparse:
+            self.assertEqual(m1.dense_dim(), m2.dense_dim())
+            self.assertEqual(m1.sparse_dim(), m2.sparse_dim())
+            self.assertEqual(m1.is_coalesced(), m2.is_coalesced())
+        else:
+            self.assertEqual(m1.stride(), m2.stride())
+            self.assertEqual(m1.storage_offset(), m2.storage_offset())
+            self.assertEqual(m1._is_view(), m2._is_view())
+            if m1._is_view():
+                self.assertMetadataMatches(m1._base, m2._base)
+        # TODO: test if is resizable (no direct query for this atm)
+        # TODO: audit AutogradMeta to see if it matches
+
     def test_view_of_non_leaf(self):
         x = torch.randn(4, requires_grad=True)
         y = x.neg()
@@ -74,9 +102,14 @@ def test_view_of_non_leaf(self):
         to_meta = MetaConverter()
         m1 = to_meta(z1)
         m2 = to_meta(z2)
-        self.assertEqual(m1.shape, z1.shape)
+
+        # check the test is actually testing what it claims
         self.assertTrue(m1._is_view())
         self.assertFalse(m1._base.is_leaf)
+
+        self.assertIsNot(m1, m2)
+        self.assertMetadataMatches(m1, z1)
+        self.assertMetadataMatches(m2, z2)
         self.assertSameVersionCounter(m1, m2)
 
     def test_view_of_leaf(self):
@@ -86,35 +119,118 @@ def test_view_of_leaf(self):
         to_meta = MetaConverter()
         m1 = to_meta(z1)
         m2 = to_meta(z2)
-        self.assertEqual(m1.shape, z1.shape)
+
+        # check the test is actually testing what it claims
         self.assertTrue(m1._is_view())
         self.assertTrue(m1._base.is_leaf)
+
+        self.assertIsNot(m1, m2)
+        self.assertMetadataMatches(m1, z1)
+        self.assertMetadataMatches(m2, z2)
         self.assertSameVersionCounter(m1, m2)
 
     def test_leaf(self):
         x = torch.randn(4, requires_grad=True)
         to_meta = MetaConverter()
         m = to_meta(x)
-        self.assertEqual(m.shape, x.shape)
+
+        # check the test is actually testing what it claims
         self.assertTrue(m.is_leaf)
         self.assertTrue(m.requires_grad)
 
+        self.assertMetadataMatches(m, x)
+
     def test_non_leaf(self):
         x = torch.randn(4, requires_grad=True)
         y = x.neg()
         to_meta = MetaConverter()
         m = to_meta(y)
-        self.assertEqual(m.shape, y.shape)
+
+        # check the test is actually testing what it claims
         self.assertFalse(m.is_leaf)
         self.assertTrue(m.requires_grad)
 
+        self.assertMetadataMatches(m, y)
+
     def test_requires_grad_false(self):
         x = torch.randn(4, requires_grad=False)
         to_meta = MetaConverter()
         m = to_meta(x)
-        self.assertEqual(m.shape, x.shape)
+
+        # check the test is actually testing what it claims
         self.assertFalse(m.requires_grad)
 
+        self.assertMetadataMatches(m, x)
+
+    def test_channels_last(self):
+        x = torch.empty(2, 3, 4, 5, memory_format=torch.channels_last)
+        to_meta = MetaConverter()
+        m = to_meta(x)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.is_leaf)
+
+        self.assertMetadataMatches(m, x)
+
+    def test_channels_last_leaf(self):
+        x = torch.empty(2, 3, 4, 5, memory_format=torch.channels_last, requires_grad=True)
+        to_meta = MetaConverter()
+        m = to_meta(x)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.requires_grad)
+        self.assertTrue(m.is_leaf)
+
+        self.assertMetadataMatches(m, x)
+
+    def test_channels_last_non_leaf(self):
+        x = torch.empty(2, 3, 4, 5, memory_format=torch.channels_last, requires_grad=True)
+        y = x + 2
+
+        # sanity
+        self.assertEqual(x.stride(), y.stride())
+        self.assertFalse(y.is_leaf)
+
+        to_meta = MetaConverter()
+        m = to_meta(y)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.requires_grad)
+        self.assertFalse(m.is_leaf)
+
+        self.assertMetadataMatches(m, y)
+
+        # Check that we can autograd with m as input without erroring;
+        # see https://github.com/pytorch/pytorch/issues/87956
+        loss = m.sum()
+        torch.autograd.grad(loss, m)
+
+    def test_empty_strided_non_dense_leaf(self):
+        x = torch.empty_strided((2, 2), (4, 2), requires_grad=True)
+
+        to_meta = MetaConverter()
+        m = to_meta(x)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.requires_grad)
+        self.assertTrue(m.is_leaf)
+
+        self.assertMetadataMatches(m, x)
+
+    def test_non_leaf_torture(self):
+        x = torch.empty(20, requires_grad=True)
+        with torch.no_grad():
+            x.set_(x.storage(), 10, (2,), (2,))
+
+        to_meta = MetaConverter()
+        m = to_meta(x)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.requires_grad)
+        self.assertTrue(m.is_leaf)
+
+        self.assertMetadataMatches(m, x)
+
     # NB: complex stuff is not actually exercised right now because
     # we have a blanket exclusion for complex conversion
 
@@ -122,41 +238,30 @@ def test_view_as_real(self):
         x = torch.randn(4, dtype=torch.complex64)
         y = torch.view_as_real(x)
         m = MetaConverter()(y)
-        self.assertEqual(m.shape, y.shape)
-        self.assertEqual(m.stride(), y.stride())
-        self.assertEqual(m.dtype, y.dtype)
+        self.assertMetadataMatches(m, y)
 
     def test_complex_noncontiguous_bug(self):
         x = torch.randn((2, 2, 4, 9), dtype=torch.complex32)[:, 0, :, :]
         m = MetaConverter()(x)
-        self.assertEqual(m.shape, x.shape)
-        self.assertEqual(m.stride(), x.stride())
-        self.assertEqual(m.dtype, x.dtype)
+        self.assertMetadataMatches(m, x)
 
     def test_view_as_complex(self):
         x = torch.randn((4, 2), dtype=torch.float32)
         y = torch.view_as_complex(x)
         m = MetaConverter()(y)
-        self.assertEqual(m.shape, y.shape)
-        self.assertEqual(m.stride(), y.stride())
-        self.assertEqual(m.dtype, y.dtype)
+        self.assertMetadataMatches(m, y)
 
     def test_view_dtype(self):
         x = torch.randn(4, dtype=torch.float32)
         y = x.view(dtype=torch.int32)
         m = MetaConverter()(y)
-        self.assertEqual(m.shape, y.shape)
-        self.assertEqual(m.stride(), y.stride())
-        self.assertEqual(m.dtype, y.dtype)
+        self.assertMetadataMatches(m, y)
 
     def test_imag(self):
         x = torch.randn(4, dtype=torch.complex64)
         y = x.imag
         m = MetaConverter()(y)
-        self.assertEqual(m.shape, y.shape)
-        self.assertEqual(m.dtype, y.dtype)
-        self.assertEqual(m.stride(), y.stride())
-        self.assertEqual(m.storage_offset(), y.storage_offset())
+        self.assertMetadataMatches(m, y)
 
     def test_weakref(self):
         x = torch.randn(4, 4, 4)
@@ -746,7 +851,12 @@ def __init__(self, test_case, *, device, dtype, inplace):
     def __torch_function__(self, func, types, args=(), kwargs=None):
         kwargs = kwargs or {}
 
-        if torch.jit.is_tracing() or isinstance(func, torch.ScriptMethod):
+        if (
+            torch.jit.is_tracing() or isinstance(func, torch.ScriptMethod) or
+            # meta converter doesn't work correctly when no_dispatch() is on, so
+            # skip running the crossref test in this case
+            torch._C._dispatch_tls_local_exclude_set().has(torch._C.DispatchKey.Python)
+        ):
             return func(*args, **kwargs)
 
         if self.dtype in meta_function_skips.get(func, set()):
diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp
index 7122532a54410..e3c0a8b987bd6 100644
--- a/tools/autograd/templates/python_variable_methods.cpp
+++ b/tools/autograd/templates/python_variable_methods.cpp
@@ -1135,7 +1135,7 @@ static PyObject* THPVariable_set_(
       {
           "set_()",
           "set_(Storage source)",
-          "set_(Storage source, int64_t storage_offset, IntArrayRef size, IntArrayRef stride=None)",
+          "set_(Storage source, SymInt storage_offset, SymIntArrayRef size, SymIntArrayRef stride=None)",
           "set_(Tensor source)",
           "set_(Tensor source, SymInt storage_offset, SymIntArrayRef size, SymIntArrayRef stride=None)",
       },
@@ -1181,14 +1181,14 @@ static PyObject* THPVariable_set_(
         " for argument 1 'storage'");
       auto dispatch_set_ = [](const Tensor& self,
                               Storage source,
-                              int64_t storage_offset,
-                              IntArrayRef size,
-                              IntArrayRef stride) -> Tensor {
+                              c10::SymInt storage_offset,
+                              c10::SymIntArrayRef size,
+                              c10::SymIntArrayRef stride) -> Tensor {
         pybind11::gil_scoped_release no_gil;
-        return self.set_(source, storage_offset, size, stride);
+        return self.set__symint(source, storage_offset, size, stride);
       };
       return wrap(dispatch_set_(
-          self, storage, _r.toInt64(1), _r.intlist(2), _r.intlist(3)));
+          self, storage, _r.toSymInt(1), _r.symintlist(2), _r.symintlist(3)));
     }
     case 3: {
       // aten::set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 3e5cbdb652264..68b935a1a5222 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -2,7 +2,6 @@
 import functools
 import itertools
 import sys
-import warnings
 import weakref
 from dataclasses import dataclass
 from functools import partial
@@ -139,15 +138,14 @@ def tree_flatten_only(ty: Type[T], pytree: PyTree):
 # structure. Like `MetaConverter`, it uses `WeakTensorRefKey` to
 # hold a weak reference for all memoized tensors.
 class FakeTensorConverter(object):
-    tensor_memo: weakref.WeakValueDictionary
+    @property
+    def tensor_memo(self):
+        return self.meta_converter.tensor_memo
+
     meta_converter: MetaConverter
     constant_storage_mapping: Dict[StorageWeakRef, List[TensorWeakRef]]
 
     def __init__(self):
-        # FakeTensors store the FakeTensorMode which in turn stores a
-        # FakeTensor, so we need to hold a weak reference to the FakeTensor
-        # otherwise we would induce a circular reference
-        self.tensor_memo = weakref.WeakValueDictionary()
         self.meta_converter = MetaConverter()
 
         # map from to storage to corresponding constant tensors
@@ -214,28 +212,33 @@ def from_real_tensor(self, fake_mode, t, make_constant=False, shape_env=None):
         # not yet supported in metatensors
         if t.is_quantized:
             raise UnsupportedFakeTensorException("quantized nyi in meta tensors")
-        with no_dispatch():
-            meta_t = self.meta_converter(t, shape_env=shape_env)
-            if meta_t.device.type != "meta":
-                raise UnsupportedFakeTensorException("meta converter nyi")
-            out = FakeTensor(
-                fake_mode,
-                meta_t,
-                existing_device,
-                constant=t if make_constant else None,
-            )
-            out.requires_grad_(t.requires_grad)
-            if make_constant:
-                self.add_constant_storage_mapping(out)
         if type(t) is torch.nn.Parameter:
             assert not make_constant
-            out = torch.nn.Parameter(out, requires_grad=out.requires_grad)  # type: ignore[assignment]
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
-            grad_not_none = t.grad is not None
-        if grad_not_none:
-            out.grad = self.from_real_tensor(fake_mode, t.grad, shape_env=shape_env)
-        self.set_tensor_memo(t, out)
+
+        def mk_fake_tensor(make_meta_t):
+            # NB: don't use in_kernel_invocation_manager. to
+            # ensure FakeTensor can internally do constant computation
+            # as necessary.  Invocation manager is "more correct" as
+            # it works for more operators in make_meta_t, but
+            # invariant is that make_meta_t only calls factories
+            # for which it is not strictly necessary to use the
+            # invocation manager (I think!)
+            with no_dispatch():
+                return FakeTensor(
+                    fake_mode,
+                    make_meta_t(),
+                    existing_device,
+                    constant=t if make_constant else None,
+                )
+
+        out = self.meta_converter(
+            t, shape_env=shape_env, strict=True, callback=mk_fake_tensor
+        )
+        if out is NotImplemented:
+            raise UnsupportedFakeTensorException("meta converter nyi")
+        if make_constant:
+            self.add_constant_storage_mapping(out)
+        # NB: meta_converter set the memo
         return out
 
     # If you specify the device, it MUST be a meta tensor.
@@ -296,7 +299,9 @@ def constructors(fake_mode, func, *args, **kwargs):
     out_device = new_kwargs.pop("device", None)
     out_device = out_device if out_device is not None else default_device
     new_kwargs["device"] = torch.device("meta")
-    r = func(*args, **new_kwargs)
+    # Not in_kernel_invocation_manager as no fake tensor inputs
+    with no_dispatch():
+        r = func(*args, **new_kwargs)
     return FakeTensor(fake_mode, r, out_device)
 
 
@@ -309,7 +314,8 @@ def non_kwarg_to(fake_mode, func, *args, **kwargs):
     out_device = input_device if input_device else new_kwargs["input"].device
     new_kwargs["device"] = torch.device("meta")
     inp = new_kwargs.pop("input")
-    r = func(inp, **new_kwargs)
+    with in_kernel_invocation_manager(fake_mode):
+        r = func(inp, **new_kwargs)
     # TODO: I think this does the wrong thing if r is inp
     return fake_mode.fake_tensor_converter.from_meta_and_device(
         fake_mode, r, out_device
@@ -320,7 +326,8 @@ def non_kwarg_to(fake_mode, func, *args, **kwargs):
 # since the device of `the_template` is ignored
 @register_op_impl(aten.resize_as_.default)
 def resize_as_(fake_mode, func, *args, **kwargs):
-    return func(*args, **kwargs)
+    with in_kernel_invocation_manager(fake_mode):
+        return func(*args, **kwargs)
 
 
 @register_op_impl(aten._sparse_coo_tensor_with_dims_and_tensors.default)
@@ -710,6 +717,13 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             else:
                 return args[0].fake_device
 
+        # Some attribute queries that can be serviced directly
+        # See Note [is_coalesced is dispatched]
+        if func in [torch.ops.aten.is_coalesced.default]:
+            # NB: no_dispatch is ok here too, this func is very simple
+            with in_kernel_invocation_manager(self):
+                return func(*args, **kwargs)
+
         flat_arg_fake_tensors = tree_flatten_only(FakeTensor, (args, kwargs))
         flat_symints = tree_flatten_only(torch.SymInt, (args, kwargs))
         has_symbolic_sizes = (
@@ -725,38 +739,38 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         if func in self.lift_fns:
             out = func(*args, **kwargs)
             if self.may_turn_const(out):
+                # NB: not in_kernel_invocation_manager because we're doing real
+                # compute here
                 with no_dispatch():
-                    return converter(self, out.clone(), make_constant=True)
-
-        with no_dispatch():
-            flat_arg_tensors = tree_flatten_only(torch.Tensor, (args, kwargs))
-            # See [subclass inputs] below
-            # NB: If you're seeing a mysterious infinite loop involving fake
-            # tensor, it might be related to this line.  Though I'm not sure
-            # how you'll know to read this comment, as this line won't show up
-            # in the stack trace.
-            if self.check_for_subclass(flat_arg_tensors):
-                return NotImplemented
-
-            # if we are in the dispatch mode, we will enter this function even if the inputs
-            # are not FakeTensors. For now, throw if any non-Fake Tensor inputs
-            # and just support constructors.
-
-            # this is generated from torch.tensor(), which does not use the
-            # dispatcher, to allow wrapper subclasses to wrap the new tensor
-            if func in self.lift_fns:
-                assert (
-                    len(kwargs) == 0
-                    and len(args) == 1
-                    and type(args[0]) is torch.Tensor
-                ), f"{args} {kwargs}"
-                return converter(self, args[0])
-
-            if self.check_for_non_fake(flat_arg_tensors):
-                raise Exception(
-                    "Invoking operators with non-Fake Tensor inputs in FakeTensorMode is not yet supported. "
-                    f"Please convert all Tensors to FakeTensors first. Found in {func}(*{args}, **{kwargs})"
-                )
+                    out = out.clone()
+                return converter(self, out, make_constant=True)
+
+        flat_arg_tensors = tree_flatten_only(torch.Tensor, (args, kwargs))
+        # See [subclass inputs] below
+        # NB: If you're seeing a mysterious infinite loop involving fake
+        # tensor, it might be related to this line.  Though I'm not sure
+        # how you'll know to read this comment, as this line won't show up
+        # in the stack trace.
+        if self.check_for_subclass(flat_arg_tensors):
+            return NotImplemented
+
+        # if we are in the dispatch mode, we will enter this function even if the inputs
+        # are not FakeTensors. For now, throw if any non-Fake Tensor inputs
+        # and just support constructors.
+
+        # this is generated from torch.tensor(), which does not use the
+        # dispatcher, to allow wrapper subclasses to wrap the new tensor
+        if func in self.lift_fns:
+            assert (
+                len(kwargs) == 0 and len(args) == 1 and type(args[0]) is torch.Tensor
+            ), f"{args} {kwargs}"
+            return converter(self, args[0])
+
+        if self.check_for_non_fake(flat_arg_tensors):
+            raise Exception(
+                "Invoking operators with non-Fake Tensor inputs in FakeTensorMode is not yet supported. "
+                f"Please convert all Tensors to FakeTensors first. Found in {func}(*{args}, **{kwargs})"
+            )
 
         # The current constant handling only support tracing systems
         # (aot autograd, torchdynamo) where each operation is run consecutively.
@@ -776,27 +790,30 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             and len(flat_arg_fake_tensors) != 0
             and not has_symbolic_sizes
         ):
+            const_args, const_kwargs = pytree.tree_map_only(
+                FakeTensor, lambda t: t.constant, (args, kwargs)
+            )
+
+            # NB: not in_kernel_invocation_manager(self) as we want to do REAL
+            # compute
             with no_dispatch():
-                const_args, const_kwargs = pytree.tree_map_only(
-                    FakeTensor, lambda t: t.constant, (args, kwargs)
-                )
                 out = func(*const_args, **const_kwargs)
 
-                all_constant = pytree.tree_all_only(
-                    torch.Tensor, lambda t: self.may_turn_const(t), out
-                )
+            all_constant = pytree.tree_all_only(
+                torch.Tensor, lambda t: self.may_turn_const(t), out
+            )
 
-                if all_constant:
-                    return pytree.tree_map_only(
-                        torch.Tensor,
-                        lambda t: converter(self, t, make_constant=True),
-                        out,
-                    )
+            if all_constant:
+                return pytree.tree_map_only(
+                    torch.Tensor,
+                    lambda t: converter(self, t, make_constant=True),
+                    out,
+                )
 
-                # we weren't able to turn outputs to constants,
-                # so invalidate all constants that might be aliases of the outputs
-                for ten in tree_flatten_only(torch.Tensor, out):
-                    converter.invalidate_constant_aliases(ten)
+            # we weren't able to turn outputs to constants,
+            # so invalidate all constants that might be aliases of the outputs
+            for ten in tree_flatten_only(torch.Tensor, out):
+                converter.invalidate_constant_aliases(ten)
 
         # we are falling through to running non constant tensors, any input constant that
         # is written to must be invalidated
@@ -817,14 +834,13 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         ):
             from torch._decomp import meta_table as meta_table
 
-            with no_dispatch():
-                if func == aten.size.default:
-                    sys.stderr.write(
-                        "Trying to call aten.size on a tensor with symbolic shapes. "
-                        "It's likely that this is from calling tensor.shape in C++"
-                    )
-                    # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
-                    return None
+            if func == aten.size.default:
+                sys.stderr.write(
+                    "Trying to call aten.size on a tensor with symbolic shapes. "
+                    "It's likely that this is from calling tensor.shape in C++"
+                )
+                # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
+                return None
 
             with self:
                 if func in meta_table:
@@ -860,32 +876,27 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                     f"{func} - couldn't find symbolic meta function/decomposition"
                 )
 
-        with no_dispatch():
-            # special handling for funcs registered through `register_op_impl`,
-            # e.g., manipulating args on constructor calls to construct meta tensors
-            # and then afterwards wrapping them to a FakeTensor
-            for run_impl_check, op_impl in op_implementations:
-                if run_impl_check(func):
-                    op_impl_out = op_impl(self, func, *args, **kwargs)
-                    if op_impl_out != NotImplemented:
-                        return op_impl_out
-
-            # run kernel registered to meta for func, which include
-            # python meta registrations, prims, decomps, and c++ meta fns (structured kernels)
-            try:
-                with in_kernel_invocation_manager(self):
-                    r = func(*args, **kwargs)
-            except NotImplementedError as not_implemented_error:
-                # no meta kernel registered, fallback to kernel for the device
-                if not self.allow_fallback_kernels:
-                    raise not_implemented_error
-                return run_fallback_kernel(
-                    self, func, args, kwargs, not_implemented_error
-                )
-
-            return self.wrap_meta_outputs_with_default_device_logic(
-                r, func, args, kwargs
-            )
+        # special handling for funcs registered through `register_op_impl`,
+        # e.g., manipulating args on constructor calls to construct meta tensors
+        # and then afterwards wrapping them to a FakeTensor
+        for run_impl_check, op_impl in op_implementations:
+            if run_impl_check(func):
+                op_impl_out = op_impl(self, func, *args, **kwargs)
+                if op_impl_out != NotImplemented:
+                    return op_impl_out
+
+        # run kernel registered to meta for func, which include
+        # python meta registrations, prims, decomps, and c++ meta fns (structured kernels)
+        try:
+            with in_kernel_invocation_manager(self):
+                r = func(*args, **kwargs)
+        except NotImplementedError as not_implemented_error:
+            # no meta kernel registered, fallback to kernel for the device
+            if not self.allow_fallback_kernels:
+                raise not_implemented_error
+            return run_fallback_kernel(self, func, args, kwargs, not_implemented_error)
+
+        return self.wrap_meta_outputs_with_default_device_logic(r, func, args, kwargs)
 
     # [subclass inputs]
     # Suppose we enable fake tensor mode.  This means that fake tensor
@@ -959,6 +970,7 @@ def functions_with_cpp_meta_impl_that_support_symint(self):
             aten.as_strided.default,
             aten.zeros.default,
             aten.detach.default,
+            aten.set_.source_Storage_storage_offset,
         ]
 
     @property
@@ -1004,8 +1016,11 @@ def run_fallback_kernel(fake_mode, func, args, kwargs, orig_not_implemented_exce
     if torch.Tag.inplace_view in func.tags:  # type: ignore[attr-defined]
         raise orig_not_implemented_exception
 
+    inp_impls = {}
+
+    # Don't use in_kernel_invocation_manager(fake_mode) as we want to do
+    # REAL compute (not with meta device)
     with no_dispatch():
-        inp_impls = {}
 
         def to_real_tensor(e):
             if isinstance(e, FakeTensor):
@@ -1021,25 +1036,25 @@ def to_real_tensor(e):
 
         r = func(*args, **kwargs)
 
-        tensor_impls = set()
-        storages = set()
-
-        for e in tree_flatten((args, kwargs))[0]:
-            if isinstance(e, torch.Tensor):
-                if not e.is_sparse:
-                    storages.add(e.storage()._cdata)
-
-        # TODO: also check metadata change on inputs
-        # proper aliasing/metadata relationship between outputs and inputs will
-        # not be set up, bc of conversion to device, unless we can reuse an
-        # input impl
-        for e in tree_flatten(r)[0]:
-            if id(e) not in inp_impls and (
-                isinstance(e, torch.Tensor)
-                and not e.is_sparse
-                and e.storage()._cdata in storages
-            ):
-                raise orig_not_implemented_exception
+    tensor_impls = set()
+    storages = set()
+
+    for e in tree_flatten((args, kwargs))[0]:
+        if isinstance(e, torch.Tensor):
+            if not e.is_sparse:
+                storages.add(e.storage()._cdata)
+
+    # TODO: also check metadata change on inputs
+    # proper aliasing/metadata relationship between outputs and inputs will
+    # not be set up, bc of conversion to device, unless we can reuse an
+    # input impl
+    for e in tree_flatten(r)[0]:
+        if id(e) not in inp_impls and (
+            isinstance(e, torch.Tensor)
+            and not e.is_sparse
+            and e.storage()._cdata in storages
+        ):
+            raise orig_not_implemented_exception
 
     def map_out(e):
         if isinstance(e, torch.Tensor):
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 51231811631bc..735e3ff38f4ff 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -1,8 +1,10 @@
+import contextlib
+import warnings
 import weakref
+from typing import ContextManager
 
 import torch
 from torch.multiprocessing.reductions import StorageWeakRef
-from torch.utils._mode_utils import no_dispatch
 
 
 def safe_is_leaf(t):
@@ -127,18 +129,31 @@ def del_ten():
 
     # NB: doesn't actually return a storage, because meta storage is
     # not supported
-    def meta_storage(self, s):
+    def meta_storage(self, s, callback):
         # NB: TypedStorage is freshly allocated and cannot be used as hash
         # key index.
 
         # Use a Weak Ref to s in order to not leak memory
         swr = StorageWeakRef(s)
         if swr not in self.storage_memo:
-            self.storage_memo[swr] = torch.empty(s.size(), dtype=s.dtype, device="meta")
+            self.storage_memo[swr] = (
+                callback(
+                    lambda: torch.empty(s.size(), dtype=torch.uint8, device="meta")
+                )
+                .storage()
+                .untyped()
+            )
         return self.storage_memo[swr]
 
     # This function assumes that it's possible to do the conversion
-    def meta_tensor(self, t, shape_env=None):
+    def meta_tensor(self, t, shape_env=None, callback=lambda t: t()):
+        # This indicates you set no_dispatch() before calling into this
+        # function.  This is an error: we may be creating fake tensors and
+        # will perform operations on them which need fake tensor mode to
+        # be active.  You will segfault if you are in a no_dispatch() block.
+        assert not torch._C._dispatch_tls_local_exclude_set().has(
+            torch._C.DispatchKey.Python
+        )
         arg_cnt = self.arg_cnt
         self.arg_cnt += 1
 
@@ -166,14 +181,22 @@ def sym_sizes_strides(t):
                 if t.is_sparse:
                     assert shape_env is None, "symbolic on sparse NYI"
                     is_leaf = safe_is_leaf(t)
-                    r = torch.ops.aten._sparse_coo_tensor_with_dims(
-                        t.sparse_dim(),
-                        t.dense_dim(),
-                        t.shape,
-                        dtype=t.dtype,
-                        layout=torch.sparse_coo,
-                        device="meta",
+                    r = callback(
+                        lambda: torch.ops.aten._sparse_coo_tensor_with_dims(
+                            t.sparse_dim(),
+                            t.dense_dim(),
+                            t.shape,
+                            dtype=t.dtype,
+                            layout=torch.sparse_coo,
+                            device="meta",
+                        )
                     )
+                    assert safe_is_leaf(r), "the callback you passed in doesn't detach"
+                    # Note [is_coalesced is dispatched]
+                    # Strangely enough, is_coalesced() is a dispatched operator,
+                    # which means that it will get caught by fake tensor mode.
+                    # Ordinarily this would error, but there's some logic in
+                    # fake tensor ensure this doesn't happen.
                     r._coalesced_(t.is_coalesced())
                     if t.requires_grad:
                         r.requires_grad = True
@@ -188,7 +211,7 @@ def sym_sizes_strides(t):
                     # directly from storage is WRONG because this won't cause
                     # version counters to get shared.
                     assert t._is_view()
-                    base = self.meta_tensor(t._base)
+                    base = self.meta_tensor(t._base, shape_env, callback)
 
                     def is_c_of_r(complex_dtype, real_dtype):
                         return (
@@ -214,38 +237,96 @@ def is_c_of_r(complex_dtype, real_dtype):
                         r = base.as_strided(sizes, strides, sym(t.storage_offset()))
                 else:
                     is_leaf = safe_is_leaf(t)
+                    sizes, strides = sym_sizes_strides(t)
+                    storage_offset = sym(t.storage_offset())
                     # Fake up some autograd history.
                     if t.requires_grad:
-                        r = torch.empty(
-                            (0,), dtype=t.dtype, device="meta", requires_grad=True
+                        r = callback(
+                            lambda: torch.empty_strided(
+                                sizes, strides, dtype=t.dtype, device="meta"
+                            )
                         )
+                        assert safe_is_leaf(
+                            r
+                        ), "the callback you passed in doesn't detach"
+                        r.requires_grad = t.requires_grad
                         if not is_leaf:
                             with torch.enable_grad():
-                                # The backward function here will be wrong, but
-                                # that's OK; our goal is just to get the metadata
-                                # looking as close as possible; we're not going to
-                                # actually try to backward() on these produced
-                                # metas.  TODO: would be safer to install some
-                                # sort of unsupported grad_fn here
-                                r = r.clone()
+                                # preserve_format is the default, but we want to
+                                # emphasize how important it is to preserve
+                                # format here
+                                r = r.clone(memory_format=torch.preserve_format)
                     else:
-                        r = torch.empty((0,), dtype=t.dtype, device="meta")
-                    # As long as meta storage is not supported, need to prevent
-                    # redispatching on set_(Storage, ...) which will choke with
-                    # meta storage
-                    s = self.meta_storage(t.storage())
-                    with no_dispatch():
-                        sizes, strides = sym_sizes_strides(t)
-                        with torch.no_grad():
-                            r.set_(s, sym(t.storage_offset()), sizes, strides)
+                        r = callback(
+                            lambda: torch.empty_strided(
+                                sizes, strides, dtype=t.dtype, device="meta"
+                            )
+                        )
+                        assert safe_is_leaf(
+                            r
+                        ), "the callback you passed in doesn't detach"
+
+                    s = t.storage().untyped()
+                    swr = StorageWeakRef(s)
+                    if (
+                        swr not in self.storage_memo
+                        and r.stride() == strides
+                        and r.storage_offset() == storage_offset
+                    ):
+                        # You're normal and happy, install the fresh storage into the memo
+                        self.storage_memo[swr] = r.storage().untyped()
+                    else:
+                        # You're in crazy town; somehow you gave us a tensor
+                        # that wasn't a view, but had nonzero storage offset,
+                        # nontrivial strides (such that clone() couldn't
+                        # preserve them), or already aliases with another
+                        # tensor's storage.  The most typical way to end
+                        # up here is with set_.  So use set_ to bludgeon this
+                        # in.
+                        r_s = self.meta_storage(s, callback=callback)
+                        # NB: In principle, this should always work, but there
+                        # is some subtle difference in the autograd metadata
+                        # that means we will backprop the set_ call, even if
+                        # r is declared as an input to grad.
+                        # See https://github.com/pytorch/pytorch/issues/87956
+                        # for the reproducer.
+                        # NB: The in_kernel_invocation_manager here is necessary
+                        # for fake tensor.  If we run the set_ call with fake
+                        # tensor on, r will improperly report that it is NOT a
+                        # meta tensor but a cpu tensor, and then the set_ call
+                        # will fail due to device mismatch.  no_dispatch() is
+                        # not enough, because the fake tensor will still claim
+                        # to be a CPU tensor and you'll end up in the CPU
+                        # kernel.  Arguably this is a hack; a cleaner way to
+                        # solve this is to have a FakeStorage concept which
+                        # would report it's CPU device--no problem now!  But
+                        # this is difficult to do because we don't have storage
+                        # subclasses.  Relevant test is
+                        # DynamicShapesFunctionTests::test_add_dynamic_shapes in
+                        # test/dynamo/test_dynamic_shapes.py
+                        maybe_fake_mgr: ContextManager[None] = contextlib.nullcontext()
+                        from torch._subclasses.fake_tensor import (
+                            FakeTensor,
+                            in_kernel_invocation_manager,
+                        )
+
+                        if isinstance(r, FakeTensor):
+                            maybe_fake_mgr = in_kernel_invocation_manager(r.fake_mode)
+                        with maybe_fake_mgr, torch.no_grad():
+                            r.set_(r_s, storage_offset, sizes, strides)
 
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
+                    grad_not_none = t.grad is not None
+                if grad_not_none:
+                    r.grad = self.meta_tensor(t.grad, shape_env, callback)
                 torch._C._set_conj(r, t.is_conj())
                 torch._C._set_neg(r, t.is_neg())
             self.set_tensor_memo(t, r)
 
         return self.get_tensor_memo(t)
 
-    def __call__(self, t, shape_env=None):
+    def __call__(self, t, shape_env=None, *, strict=False, callback=lambda t: t()):
         # TODO: zero tensors?  We appear to have eliminated them by
         # excluding complex for now
         from torch._subclasses.fake_tensor import FakeTensor
@@ -280,10 +361,13 @@ def __call__(self, t, shape_env=None):
                 # tests all break so we just exclude this.  In any case
                 # the to conversion isn't really right anyhow.
                 self.miss += 1
+                if strict:
+                    return NotImplemented
                 return t
             else:
                 self.hit += 1
-                r = self.meta_tensor(t, shape_env=shape_env)
+                r = self.meta_tensor(t, shape_env=shape_env, callback=callback)
+                # TODO: this is suspicious, now that we have callback argument
                 if type(t) is torch.nn.Parameter:
                     r = torch.nn.Parameter(r, requires_grad=r.requires_grad)
                 return r
@@ -294,9 +378,13 @@ def __call__(self, t, shape_env=None):
             # support meta.  Trying to YOLO this is more trouble than it's
             # worth.
             self.miss += 1
+            if strict:
+                return NotImplemented
             return t
         else:
             # non-Tensor types don't count as hit or miss
+            if strict:
+                return NotImplemented
             return t
 
 
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 2f85b8af1d81f..9903e95228fc8 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1367,6 +1367,8 @@ def freeze_rng_state():
         #
         # In the long run torch.cuda.set_rng_state should probably be
         # an operator.
+        #
+        # NB: Mode disable is to avoid running cross-ref tests on thes seeding
         with no_dispatch(), disable_functorch():
             if torch.cuda.is_available():
                 torch.cuda.set_rng_state(cuda_rng_state)

From 56523682b531904acd1a5840cb9397a2a8106dea Mon Sep 17 00:00:00 2001
From: Sergey Lebedev <sergeyle@nvidia.com>
Date: Sat, 29 Oct 2022 16:33:18 +0000
Subject: [PATCH 0312/1922] Make barrier blocking in UCC (#86961)

Currently CUDA UCC barrier is nonblocking with respect to CPU and there is no flag to change it. To make UCC PG barrier behaviour consistent with NCCL PG in this PR barrier has changed to be always blocking.

cc @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @H-Huang @kwen2501 @awgu
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86961
Approved by: https://github.com/kwen2501
---
 torch/csrc/distributed/c10d/ProcessGroupUCC.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
index 61f03abc112de..6333cbf369c1b 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
@@ -169,6 +169,8 @@ void read_config() {
   for (auto op : parse_blocking_wait(blocking_wait_str)) {
     torch_ucc_config.blocking_wait[(std::uint8_t)op] = true;
   }
+  // barrier is always blocking
+  torch_ucc_config.blocking_wait[(std::uint8_t)OpType::BARRIER] = true;
 
   torch_ucc_config.use_future =
       std::stoi(torch_ucc_envs_map.at("TORCH_UCC_USE_FUTURE"));

From f11c0407e528f76f2e452ead0ebc4a7f23495e77 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Sat, 29 Oct 2022 04:07:56 +0000
Subject: [PATCH 0313/1922] Add composable activation checkpointing (#87664)

This is a composable activation checkpointing API. Unlike functional
activation checkpointing APIs, this one does not require changing
model source code. Unlike ``nn.Module`` wrapper activation checkpointing
APIs, this one does not modify model structure or fully-qualified names
either. Under the hood, it registers activation checkpointing logic as pre-
and post-forward hooks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87664
Approved by: https://github.com/zhaojuanmao
---
 .../_composable/test_checkpoint.py            |  83 +++++++++
 torch/distributed/_composable/__init__.py     |   1 +
 .../_composable/checkpoint_activation.py      | 157 ++++++++++++++++++
 3 files changed, 241 insertions(+)
 create mode 100644 test/distributed/_composable/test_checkpoint.py
 create mode 100644 torch/distributed/_composable/checkpoint_activation.py

diff --git a/test/distributed/_composable/test_checkpoint.py b/test/distributed/_composable/test_checkpoint.py
new file mode 100644
index 0000000000000..fbffc90f19c51
--- /dev/null
+++ b/test/distributed/_composable/test_checkpoint.py
@@ -0,0 +1,83 @@
+# Owner(s): ["oncall: distributed"]
+
+from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+)
+
+import torch
+import torch.nn as nn
+from torch.distributed._composable import checkpoint
+
+import unittest
+from collections import deque
+from copy import deepcopy
+
+
+class ToyModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l1 = nn.Linear(100, 100)
+        self.seq = nn.Sequential(
+            nn.ReLU(),
+            nn.Linear(100, 100),
+            nn.ReLU(),
+        )
+
+    def forward(self, x):
+        return self.seq(self.l1(x))
+
+
+class TestCheckpoint(TestCase):
+    def _get_graph_size(self, out: torch.Tensor) -> int:
+        q = deque([out.grad_fn])
+        num_functions = 0
+        while len(q):
+            fn = q.pop()
+            num_functions += 1
+            for next_fn, _ in fn.next_functions:
+                if next_fn:
+                    q.append(next_fn)
+
+        return num_functions
+
+    def _test_tensor_only(self, net: nn.Module, x: torch.Tensor) -> None:
+        x1 = x.clone()
+        x2 = x.clone()
+        x1.requires_grad = True
+        x2.requires_grad = True
+
+        net1 = net
+        net2 = deepcopy(net)
+
+        # no checkpoint
+        loss1 = net1(x1).sum()
+        graph_size1 = self._get_graph_size(loss1)
+        loss1.backward()
+
+        # with checkpoint
+        checkpoint(net2.seq)
+        loss2 = net2(x2).sum()
+        graph_size2 = self._get_graph_size(loss2)
+        loss2.backward()
+
+        self.assertTrue(graph_size2 < graph_size1)
+
+        for p1, p2 in zip(net1.parameters(), net2.parameters()):
+            self.assertEqual(p1.grad, p2.grad)
+
+    def test_tensor_only_cpu(self):
+        x = torch.randn(20, 100)
+        net = ToyModel()
+        self._test_tensor_only(net, x)
+
+    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    def test_tensor_only_gpu(self):
+        x = torch.randn(20, 100, device="cuda:0")
+        net = ToyModel().to("cuda:0")
+        self._test_tensor_only(net, x)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/_composable/__init__.py b/torch/distributed/_composable/__init__.py
index fb0c81f6c5668..9098016534f90 100644
--- a/torch/distributed/_composable/__init__.py
+++ b/torch/distributed/_composable/__init__.py
@@ -1 +1,2 @@
+from .checkpoint_activation import checkpoint
 from .contract import contract
diff --git a/torch/distributed/_composable/checkpoint_activation.py b/torch/distributed/_composable/checkpoint_activation.py
new file mode 100644
index 0000000000000..4d9a2ea7fddb3
--- /dev/null
+++ b/torch/distributed/_composable/checkpoint_activation.py
@@ -0,0 +1,157 @@
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import detach_variable
+
+from contextlib import contextmanager
+from typing import Any, List, Optional, Tuple
+
+from .contract import contract
+
+
+@contextmanager
+def _no_hook(module: nn.Module):
+    r"""
+    Disable hooks installed by checkpoint to avoid unintentional recursion
+    during backward recomputation.
+    """
+    orig_enable_hook = checkpoint.state(module).enable_hook
+    checkpoint.state(module).enable_hook = False
+    try:
+        yield
+    except Exception:
+        raise
+    finally:
+        checkpoint.state(module).enable_hook = orig_enable_hook
+
+
+class _ModuleHookCheckpointFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, module: nn.Module, output: Any, *inputs: Any) -> Any:  # type: ignore[override]
+        ctx.module = module
+
+        # Save non-tensor inputs in ctx, keep a placeholder None for tensors
+        # to be filled out during the backward.
+        ctx.inputs = []
+        ctx.tensor_indices = []
+        tensor_inputs = []
+        for i, inp in enumerate(inputs):
+            if torch.is_tensor(inp):
+                tensor_inputs.append(inp)
+                ctx.tensor_indices.append(i)
+                ctx.inputs.append(None)
+            else:
+                ctx.inputs.append(inp)
+
+        ctx.save_for_backward(*tensor_inputs)
+
+        return output
+
+    @staticmethod
+    def backward(ctx, output_grads: Tuple[Optional[torch.Tensor]]) -> Any:  # type: ignore[override]
+        if not torch.autograd._is_checkpoint_valid():
+            raise RuntimeError(
+                "Checkpointing is not compatible with .grad() or when an "
+                "`inputs` parameter is passed to .backward(). Please use "
+                ".backward() and do not pass its `inputs` argument."
+            )
+
+        # Copy the list to avoid modifying original list.
+        inputs = list(ctx.inputs)
+        tensor_indices = ctx.tensor_indices
+        tensors = ctx.saved_tensors
+
+        # Fill in inputs with appropriate saved tensors.
+        for i, idx in enumerate(tensor_indices):
+            inputs[idx] = tensors[i]
+
+        detached_inputs = detach_variable(tuple(inputs))
+        with torch.enable_grad(), _no_hook(ctx.module):
+            outputs = ctx.module(*detached_inputs)
+
+        if isinstance(outputs, torch.Tensor):
+            outputs = (outputs,)
+
+        if isinstance(output_grads, torch.Tensor):
+            output_grads = (output_grads,)
+
+        # run backward() with only tensor that requires grad
+        outputs_requires_grad: List[torch.Tensor] = []
+        output_grad_tensors: List[torch.Tensor] = []
+        for i in range(len(outputs)):
+            if torch.is_tensor(outputs[i]) and outputs[i].requires_grad:
+                outputs_requires_grad.append(outputs[i])
+                assert (
+                    output_grads[i] is not None
+                ), f"expecting grad for output at index {i}, but got None."
+                output_grad_tensors.append(output_grads[i])  # type: ignore[arg-type]
+        if len(outputs_requires_grad) == 0:
+            raise RuntimeError(
+                "none of output has requires_grad=True,"
+                " this checkpoint() is not necessary"
+            )
+
+        torch.autograd.backward(outputs_requires_grad, output_grad_tensors)
+        grads = tuple(
+            inp.grad if isinstance(inp, torch.Tensor) else None
+            for inp in detached_inputs
+        )
+
+        # The two None is for forward argument module and output respectively.
+        return (None, None) + grads
+
+
+@contract
+def checkpoint(module: nn.Module) -> nn.Module:
+    r"""
+    This is a composable activation checkpointing API. Unlike functional
+    activation checkpointing APIs, this one does not require changing model
+    source code. Unlike ``nn.Module`` wrapper activation checkpointing APIs,
+    this one does not modify model structure or fully-qualified names either.
+    Under the hood, it registers activation checkpointing logic as pre- and
+    post-forward hooks. Hence, this API can be easily applied to any model or
+    sub-modules in the model.
+
+    Args:
+        module (nn.Module): the target model or sub-module to apply activation
+            checkpointing.
+
+    Example::
+        >>> import torch.nn as nn
+        >>>
+        >>> class MyModel(nn.Module):
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>>         self.l1 = nn.Linear(10, 10)
+        >>>         self.l2 = nn.Linear(10, 10)
+        >>>
+        >>>     def forward(self, x):
+        >>>         return self.l2(self.l1(x))
+        >>>
+        >>> model = MyModel()
+        >>> checkpoint(model.l1)  # apply activation checkpointing only to l1
+        >>> model(torch.zeros(2, 10)).sum().backward()
+
+    """
+
+    def forward_pre_hook(module: nn.Module, inputs: Tuple[Any]) -> None:
+        if checkpoint.state(module).enable_hook:
+            checkpoint.state(module).orig_grad_enabled = torch.is_grad_enabled()
+            torch.set_grad_enabled(False)
+
+    def forward_hook(module: nn.Module, inputs: Tuple[Any], output: Any) -> Any:
+        if checkpoint.state(module).enable_hook:
+            torch.set_grad_enabled(checkpoint.state(module).orig_grad_enabled)
+            return _ModuleHookCheckpointFunction.apply(module, output, *inputs)
+        else:
+            return output
+
+    # This hook does the following things:
+    # 1. detach outputs from the autograd graph to discard activations
+    # 2. insert an autograd.Function after the forward pass to recompute
+    #    activations during the backward pass.
+    checkpoint.state(module).enable_hook = True
+    module.register_forward_pre_hook(forward_pre_hook)
+    # Use prepend to make sure we restore the original grad enabled state right
+    # after the module forward invocation.
+    module.register_forward_hook(forward_hook, prepend=True)
+    return module

From 11f0f31c1504f24add6235dc288164325aee53be Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Sat, 29 Oct 2022 17:40:07 +0000
Subject: [PATCH 0314/1922] [BE] Upload GHA artifacts to S3 (#87827)

This is exclusively used by macOS, ROCM (and any other future workflows) that don't have direct access to S3 to upload their artifacts

### Testing

Running the script locally with the personal GITHUB_TOKEN:

```
python3 -m tools.stats.upload_artifacts --workflow-run-id 3342375847 --workflow-run-attempt 1 --repo pytorch/pytorch

Using temporary directory: /var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb
Downloading sccache-stats-macos-12-py3-arm64-runattempt1-9155493770
Downloading sccache-stats-macos-12-py3-lite-interpreter-x86-64-runattempt1-9155493303
Downloading sccache-stats-macos-12-py3-x86-64-runattempt1-9155493627
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/sccache-stats-macos-12-py3-arm64-runattempt1-9155493770 to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/sccache-stats-macos-12-py3-arm64-9155493770
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/sccache-stats-macos-12-py3-lite-interpreter-x86-64-runattempt1-9155493303 to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/sccache-stats-macos-12-py3-lite-interpreter-x86-64-9155493303
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/sccache-stats-macos-12-py3-x86-64-runattempt1-9155493627 to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/sccache-stats-macos-12-py3-x86-64-9155493627
Downloading test-jsons-runattempt1-test-default-1-2-linux.rocm.gpu_9155913429.zip
Downloading test-jsons-runattempt1-test-default-1-2-macos-12_9155944815.zip
Downloading test-jsons-runattempt1-test-default-1-2-macos-m1-12_9155888061.zip
Downloading test-jsons-runattempt1-test-default-2-2-linux.rocm.gpu_9155913500.zip
Downloading test-jsons-runattempt1-test-default-2-2-macos-12_9155944892.zip
Downloading test-jsons-runattempt1-test-default-2-2-macos-m1-12_9155888182.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-jsons-runattempt1-test-default-1-2-linux.rocm.gpu_9155913429.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-jsons-test-default-1-2-linux.rocm.gpu_9155913429.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-jsons-runattempt1-test-default-1-2-macos-12_9155944815.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-jsons-test-default-1-2-macos-12_9155944815.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-jsons-runattempt1-test-default-1-2-macos-m1-12_9155888061.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-jsons-test-default-1-2-macos-m1-12_9155888061.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-jsons-runattempt1-test-default-2-2-linux.rocm.gpu_9155913500.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-jsons-test-default-2-2-linux.rocm.gpu_9155913500.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-jsons-runattempt1-test-default-2-2-macos-12_9155944892.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-jsons-test-default-2-2-macos-12_9155944892.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-jsons-runattempt1-test-default-2-2-macos-m1-12_9155888182.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-jsons-test-default-2-2-macos-m1-12_9155888182.zip
Downloading test-reports-runattempt1-test-default-1-2-linux.rocm.gpu_9155913429.zip
Downloading test-reports-runattempt1-test-default-1-2-macos-12_9155944815.zip
Downloading test-reports-runattempt1-test-default-1-2-macos-m1-12_9155888061.zip
Downloading test-reports-runattempt1-test-default-2-2-linux.rocm.gpu_9155913500.zip
Downloading test-reports-runattempt1-test-default-2-2-macos-12_9155944892.zip
Downloading test-reports-runattempt1-test-default-2-2-macos-m1-12_9155888182.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-reports-runattempt1-test-default-1-2-linux.rocm.gpu_9155913429.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-reports-test-default-1-2-linux.rocm.gpu_9155913429.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-reports-runattempt1-test-default-1-2-macos-12_9155944815.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-reports-test-default-1-2-macos-12_9155944815.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-reports-runattempt1-test-default-1-2-macos-m1-12_9155888061.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-reports-test-default-1-2-macos-m1-12_9155888061.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-reports-runattempt1-test-default-2-2-linux.rocm.gpu_9155913500.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-reports-test-default-2-2-linux.rocm.gpu_9155913500.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-reports-runattempt1-test-default-2-2-macos-12_9155944892.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-reports-test-default-2-2-macos-12_9155944892.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/test-reports-runattempt1-test-default-2-2-macos-m1-12_9155888182.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/test-reports-test-default-2-2-macos-m1-12_9155888182.zip
Downloading usage-log-runattempt1-test-default-1-2-linux.rocm.gpu_9155913429.zip
Downloading usage-log-runattempt1-test-default-1-2-macos-12_9155944815.zip
Downloading usage-log-runattempt1-test-default-1-2-macos-m1-12_9155888061.zip
Downloading usage-log-runattempt1-test-default-2-2-linux.rocm.gpu_9155913500.zip
Downloading usage-log-runattempt1-test-default-2-2-macos-12_9155944892.zip
Downloading usage-log-runattempt1-test-default-2-2-macos-m1-12_9155888182.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/usage-log-runattempt1-test-default-1-2-linux.rocm.gpu_9155913429.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/usage-log-test-default-1-2-linux.rocm.gpu_9155913429.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/usage-log-runattempt1-test-default-1-2-macos-12_9155944815.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/usage-log-test-default-1-2-macos-12_9155944815.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/usage-log-runattempt1-test-default-1-2-macos-m1-12_9155888061.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/usage-log-test-default-1-2-macos-m1-12_9155888061.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/usage-log-runattempt1-test-default-2-2-linux.rocm.gpu_9155913500.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/usage-log-test-default-2-2-linux.rocm.gpu_9155913500.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/usage-log-runattempt1-test-default-2-2-macos-12_9155944892.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/usage-log-test-default-2-2-macos-12_9155944892.zip
Upload /private/var/folders/x4/2kd9r0fn5b9bf_sbcw16fxsc0000gn/T/tmpxl6d7kcb/usage-log-runattempt1-test-default-2-2-macos-m1-12_9155888182.zip to s3://gha-artifacts/pytorch/pytorch/3342375847/1/artifact/usage-log-test-default-2-2-macos-m1-12_9155888182.zip
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87827
Approved by: https://github.com/clee2000
---
 .github/workflows/upload-test-stats.yml | 14 ++++++
 tools/stats/upload_artifacts.py         | 61 +++++++++++++++++++++++++
 tools/stats/upload_stats_lib.py         | 16 +++++++
 3 files changed, 91 insertions(+)
 create mode 100644 tools/stats/upload_artifacts.py

diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml
index 688a55b6eabca..3b84f15214eac 100644
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@@ -58,6 +58,20 @@ jobs:
           python3 -m tools.stats.upload_test_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --head-branch "${HEAD_BRANCH}"
           python3 -m tools.stats.upload_sccache_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}"
 
+      - name: Upload test artifacts
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          WORKFLOW_ARTIFACTS_URL: ${{ github.event.workflow_run.artifacts_url }}
+          WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
+          WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
+          REPO_FULLNAME: ${{ github.event.workflow_run.repository.full_name }}
+        run: |
+          echo "${WORKFLOW_ARTIFACTS_URL}"
+
+          # Note that in the case of Linux and Windows, their artifacts have already been uploaded to S3, so there simply won't be
+          # anything on GitHub to upload. The command should return right away
+          python3 -m tools.stats.upload_artifacts --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"
+
   check-api-rate:
     if: ${{ always() }}
     runs-on: [self-hosted, linux.2xlarge]
diff --git a/tools/stats/upload_artifacts.py b/tools/stats/upload_artifacts.py
new file mode 100644
index 0000000000000..eb0fde7f38ac2
--- /dev/null
+++ b/tools/stats/upload_artifacts.py
@@ -0,0 +1,61 @@
+import argparse
+import os
+import re
+from tempfile import TemporaryDirectory
+
+from tools.stats.upload_stats_lib import download_gha_artifacts, upload_file_to_s3
+
+ARTIFACTS = [
+    "sccache-stats",
+    "test-jsons",
+    "test-reports",
+    "usage-log",
+]
+BUCKET_NAME = "gha-artifacts"
+FILENAME_REGEX = r"-runattempt\d+"
+
+
+def get_artifacts(repo: str, workflow_run_id: int, workflow_run_attempt: int) -> None:
+    with TemporaryDirectory() as temp_dir:
+        print("Using temporary directory:", temp_dir)
+        os.chdir(temp_dir)
+
+        for artifact in ARTIFACTS:
+            artifact_paths = download_gha_artifacts(
+                artifact, workflow_run_id, workflow_run_attempt
+            )
+
+            for artifact_path in artifact_paths:
+                # GHA artifact is named as follows: NAME-runattempt${{ github.run_attempt }}-SUFFIX.zip
+                # and we want remove the run_attempt to conform with the naming convention on S3, i.e.
+                # pytorch/pytorch/WORKFLOW_ID/RUN_ATTEMPT/artifact/NAME-SUFFIX.zip
+                s3_filename = re.sub(FILENAME_REGEX, "", artifact_path.name)
+                upload_file_to_s3(
+                    file_name=str(artifact_path.resolve()),
+                    bucket=BUCKET_NAME,
+                    key=f"{repo}/{workflow_run_id}/{workflow_run_attempt}/artifact/{s3_filename}",
+                )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Upload test artifacts from GHA to S3")
+    parser.add_argument(
+        "--workflow-run-id",
+        type=int,
+        required=True,
+        help="id of the workflow to get artifacts from",
+    )
+    parser.add_argument(
+        "--workflow-run-attempt",
+        type=int,
+        required=True,
+        help="which retry of the workflow this is",
+    )
+    parser.add_argument(
+        "--repo",
+        type=str,
+        required=True,
+        help="which GitHub repo this workflow run belongs to",
+    )
+    args = parser.parse_args()
+    get_artifacts(args.repo, args.workflow_run_id, args.workflow_run_attempt)
diff --git a/tools/stats/upload_stats_lib.py b/tools/stats/upload_stats_lib.py
index 1cba78f68da1e..da7402fce276e 100644
--- a/tools/stats/upload_stats_lib.py
+++ b/tools/stats/upload_stats_lib.py
@@ -136,6 +136,22 @@ def upload_to_s3(
     print("Done!")
 
 
+def upload_file_to_s3(
+    file_name: str,
+    bucket: str,
+    key: str,
+) -> None:
+    """
+    Upload a local file to S3
+    """
+    print(f"Upload {file_name} to s3://{bucket}/{key}")
+    boto3.client("s3").upload_file(
+        file_name,
+        bucket,
+        key,
+    )
+
+
 def unzip(p: Path) -> None:
     """Unzip the provided zipfile to a similarly-named directory.
 

From 8063a960eb97eec53b56828cb55d1ecbe6267933 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@umn.edu>
Date: Sat, 29 Oct 2022 17:48:23 +0000
Subject: [PATCH 0315/1922] Use TORCH_CHECK instead of inappropriate
 CUDA_KERNEL_ASSERT (#87714)

`CUDA_KERNEL_ASSERT` should only be used inside kernels; switch these bad usages to `TORCH_CHECK`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87714
Approved by: https://github.com/ezyang
---
 aten/src/ATen/native/cuda/MultinomialKernel.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu
index de8e8404ac2dd..c8473245604c0 100644
--- a/aten/src/ATen/native/cuda/MultinomialKernel.cu
+++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu
@@ -80,7 +80,7 @@ void renormRows(Tensor& t) {
   int64_t cols = t.size(1);
 
   auto props = at::cuda::getCurrentDeviceProperties();
-  CUDA_KERNEL_ASSERT(props != NULL);
+  TORCH_CHECK(props != nullptr);
   int numSM = props->multiProcessorCount;
   const int64_t maxThreads = std::min(
       props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads);
@@ -342,7 +342,7 @@ void multinomial_with_replacement_kernel_impl(
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(self_v.scalar_type(), "multinomial_kernel_cuda", [&] {
     using accscalar_t = at::acc_type<scalar_t, true>;
     auto props = at::cuda::getCurrentDeviceProperties();
-    CUDA_KERNEL_ASSERT(props != NULL);
+    TORCH_CHECK(props != nullptr);
     int numSM = props->multiProcessorCount;
     int maxThreads = props->maxThreadsPerBlock;
     int maxShared = props->sharedMemPerBlock;

From 606d89f16bd758b2aac75d4ffefb8dbe5f659b27 Mon Sep 17 00:00:00 2001
From: Shunting Zhang <shunting@meta.com>
Date: Sat, 29 Oct 2022 17:52:26 +0000
Subject: [PATCH 0316/1922] torchdynamo and xla integration (#87741)

# Motivation
- torchdynamo and torchxla uses different strategies to be a sound graph capture technique. The former relies on guards; the latter relies on retracing
- guard system is quite low overhead but torchxla tracing overhead is quite high

The main idea is to leverage guard system in torchdynamo to avoid retracing in torchxla so that
- we can integration torchdynamo with XLA
- we reduce or even completely avoid tracing overhead of torchxla

# Technique details
## XLA baseline
We found that different frameworks do not generate numerically identical results for the SAME model with the SAME input. By default, torchdynamo uses eager as baseline so the model will run with PyTorch. It would be tricky to compare a model running on XLA with this baseline: it's hard to check correctness. To make the comparison easier, we add a flag `--use-xla-baseline`. When it's enabled, the baseline will be run on XLA.

## New dynamo backends added
We add 2 new dynamo backends torchxla_trivial and trochxla_trace_once to control the optimization targets.

torchxla_trivial simply moves inputs/model parameters to XLA and run the model on XLA. There is tracing overhead for each run. We should expect that result to be mostly neutral compared to the XLA baseline.

torchxla_trace_once only traces once during AOT compiling time. Here are the steps:
1. dynamo capture guards and the subgraph
2. torchxla_trace_once backend trace the graph with torchxla, lowering the graph and record a hash of the graph for later lookup
3. at inference time, the hash is used directly to lookup the optimized graph and run it.

# Limitations
We can not handle LTC/torchxla fall back right now. If a op misses LTC kernel, we raise and exception and that will results in dynamo fallback (or try another compiler). People have brainstormed the idea of graph breaking and stitching the subgraphs together. But maybe it's easier to add those missing LTC kernels for those models.

# Results
The models we tested are those not causing LTC fallback. We run the tests on **GPU**. We see **1.38x** geomean speedup for trochxla_trace_once  and torchxla_trivial is mostly neutral as expected.
```
| Model                   |   XLA (trace once) |   XLA (trace everytime) |
+=========================+====================+=========================+
| resnet18                |            1.346   |                 1.045   |
+-------------------------+--------------------+-------------------------+
| resnet50                |            1.153   |                 1.007   |
+-------------------------+--------------------+-------------------------+
| resnext50_32x4d         |            1.381   |                 1.039   |
+-------------------------+--------------------+-------------------------+
| alexnet                 |            1.045   |                 1.018   |
+-------------------------+--------------------+-------------------------+
| mobilenet_v2            |            1.562   |                 1.021   |
+-------------------------+--------------------+-------------------------+
| mnasnet1_0              |            1.303   |                 1.069   |
+-------------------------+--------------------+-------------------------+
| squeezenet1_1           |            1.278   |                 1.025   |
+-------------------------+--------------------+-------------------------+
| vgg16                   |            1.076   |                 1.008   |
+-------------------------+--------------------+-------------------------+
| BERT_pytorch            |            2.224   |                 0.978   |
+-------------------------+--------------------+-------------------------+
| timm_vision_transformer |            1.81    |                 1.025   |
+-------------------------+--------------------+-------------------------+
| geomean                 |            1.38101 |                 1.02324 |
+-------------------------+--------------------+-------------------------+
```

The speedup is similar to what we see from previous work for LTC's TorchScript backend (we see 1.40 geomean speedup there):
https://docs.google.com/presentation/d/1G09X8v41u_cLKLtSdf7v6R8G19-iZTPcW_VAdOnvYBI/edit#slide=id.g11bf989cb6b_1_5

# Next steps
- Use AOT autograd to enable training
- Share results on XLA devices
- Do more extensive tests on torchbench models

Example command
```
GPU_NUM_DEVICES=1 python benchmarks/dynamo/torchbench.py --randomize-input --performance --use-xla-baseline --only resnet18 --backend=torchxla_trace_once
```

Thanks @JackCaoG from torchxla team to help debugging various perf issues and merging the torchxla PR! That's super critical for us to get the results above. torchxla side PR: https://github.com/pytorch/xla/pull/4119

topic: not user facing

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @jansel

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87741
Approved by: https://github.com/wconstab
---
 benchmarks/dynamo/common.py                   |  64 ++++++-
 test/dynamo/test_torchxla_integration.py      | 149 +++++++++++++++
 torch/_dynamo/optimizations/backends.py       |  38 ++++
 .../optimizations/torchxla_integration.py     | 177 ++++++++++++++++++
 4 files changed, 426 insertions(+), 2 deletions(-)
 create mode 100644 test/dynamo/test_torchxla_integration.py
 create mode 100644 torch/_dynamo/optimizations/torchxla_integration.py

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 70d533c61b82c..d1fd51f5d0448 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -366,6 +366,9 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
     should_check_result = should_randomize_input = args.randomize_input
     is_correct = True
 
+    baseline_model_iter_fn = get_baseline_model_iter_fn(args, model_iter_fn)
+    baseline_model = get_baseline_model(args, model)
+
     import contextlib
 
     @contextlib.contextmanager
@@ -387,7 +390,7 @@ def maybe_profile(*args, **kwargs):
 
             # interleave the runs to handle frequency scaling and load changes
             timings[rep, 0], expected_output = timed(
-                model, model_iter_fn, inputs, return_result=True
+                baseline_model, baseline_model_iter_fn, inputs, return_result=True
             )
             timings[rep, 1], actual_output = timed(
                 model, frozen_model_iter_fn, inputs, return_result=True
@@ -784,6 +787,56 @@ def inner(*args, **kwargs):
     return inner
 
 
+def xla_wrapper(model_iter_fn):
+    """
+    Wrap the model_iter_fn to run the model on XLA devices.
+    """
+
+    def wrapper(xla_mod, inputs, collect_outputs=True):
+        import torch_xla.core.xla_model as xm
+
+        # Make sure the model is already moved to the xla device. Moving
+        # the model to xla device can be very expensive since model parameters
+        # need to be copied. We should not do that inside the wrapper since
+        # the wrapper will be calles for each set of inputs.
+        assert (
+            next(xla_mod.parameters()).device.type == "xla"
+        ), "The model should be already on xla device"
+
+        xla_dev = xm.xla_device()
+        eager_dev = inputs[0].device
+        xla_inputs = tree_map(lambda x: x.to(device=xla_dev), inputs)
+        xla_out = model_iter_fn(xla_mod, xla_inputs, collect_outputs)
+        if isinstance(xla_out, torch.Tensor):
+            return xla_out.to(device=eager_dev)
+        elif hasattr(xla_out, "__dict__"):
+            for k in xla_out.__dict__.keys():
+                if xla_out.__dict__[k] is None:
+                    continue
+                xla_out.__dict__[k] = tree_map(
+                    lambda x: x.to(device=eager_dev), xla_out.__dict__[k]
+                )
+            return xla_out
+        else:
+            raise RuntimeError(f"Can not handle type {type(xla_out)}")
+
+    return wrapper
+
+
+def get_baseline_model_iter_fn(args, model_iter_fn):
+    return xla_wrapper(model_iter_fn) if args.use_xla_baseline else model_iter_fn
+
+
+def get_baseline_model(args, model):
+    if args.use_xla_baseline:
+        import torch_xla.core.xla_model as xm
+
+        xla_dev = xm.xla_device()
+        return copy.deepcopy(model).to(device=xla_dev)
+    else:
+        return model
+
+
 class BenchmarkRunner:
     def __init__(self):
         self.model_iter_fn = None
@@ -1101,7 +1154,9 @@ def warmup(fn, model, example_inputs, mode, niters=5):
             )
 
             compilation_time = dynamo_latency - eager_latency
-            compression_ratio = eager_peak_mem / dynamo_peak_mem
+            compression_ratio = (
+                eager_peak_mem / dynamo_peak_mem if dynamo_peak_mem else 0.0
+            )
             # print(
             #     f"memory: eager: {eager_peak_mem:.2f} GB, "
             #     f"dynamo: {dynamo_peak_mem:.2f} GB, "
@@ -1365,6 +1420,11 @@ def parse_args():
         action="store_true",
         help="Disables cudagraphs for Inductor",
     )
+    parser.add_argument(
+        "--use-xla-baseline",
+        action="store_true",
+        help="Whether to run baseline on XLA devices or eager devices",
+    )
 
     group_fuser = parser.add_mutually_exclusive_group()
     # --nvfuser is now the default, keep the option to not break scripts
diff --git a/test/dynamo/test_torchxla_integration.py b/test/dynamo/test_torchxla_integration.py
new file mode 100644
index 0000000000000..00a92e3799553
--- /dev/null
+++ b/test/dynamo/test_torchxla_integration.py
@@ -0,0 +1,149 @@
+# Owner(s): ["module: dynamo"]
+import copy
+import functools
+import os
+import unittest
+
+import torch
+
+has_torch_xla = True
+try:
+    import torch._dynamo.optimizations.torchxla_integration as integration
+except ImportError:
+    has_torch_xla = False
+
+import torch.utils._pytree as pytree
+from torch import fx, nn
+
+
+class BasicModule(nn.Module):
+    def __init__(self):
+        super(BasicModule, self).__init__()
+
+    def forward(self, x, y):
+        return x + y
+
+    def get_random_inputs(self):
+        return (torch.randn(10), torch.randn(10))
+
+
+class MatmulModule(nn.Module):
+    def __init__(self):
+        super(MatmulModule, self).__init__()
+
+    def forward(self, x, y):
+        return x @ y
+
+    def get_random_inputs(self):
+        return (torch.randn(5, 100), torch.randn(100, 5))
+
+
+class LinearModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(10, 5)
+
+    def forward(self, x):
+        return self.linear(x)
+
+    def get_random_inputs(self):
+        return (torch.randn(10),)
+
+
+class ModuleInplaceUpdate(nn.Module):
+    def __init__(self):
+        super(ModuleInplaceUpdate, self).__init__()
+
+    def forward(self, a, b):
+        a.sub_(b)
+        return b - 1, b + 1
+
+    def get_random_inputs(self):
+        return (torch.randn(10), torch.randn(10))
+
+
+def allclose(expected, actual):
+    def unwrap(cont):
+        if isinstance(cont, (list, tuple)) and len(cont) == 1:
+            return cont[0]
+        return cont
+
+    expected = unwrap(expected)
+    actual = unwrap(actual)
+
+    if isinstance(expected, torch.Tensor) and isinstance(actual, torch.Tensor):
+        return torch.allclose(expected, actual)
+    elif isinstance(expected, (tuple, list)) and isinstance(actual, (tuple, list)):
+        return len(expected) == len(actual) and all(
+            torch.allclose(a, b) for a, b in zip(expected, actual)
+        )
+    else:
+        raise RuntimeError("Unexpected types")
+
+
+@functools.lru_cache(None)
+def should_run_torchxla_tests():
+    """
+    Run the tests if torch_xla is available and number of gpu devices is specified.
+    """
+    gpu_device_specified = int(os.environ.get("GPU_NUM_DEVICES", "0")) > 0
+    return has_torch_xla and gpu_device_specified
+
+
+def make_reuse_graph_test(module_class, niter=100):
+    @unittest.skipIf(
+        not should_run_torchxla_tests(),
+        "Skip the tests since torch_xla is not available or XLA devices are not specified",
+    )
+    def test_wrapper(self):
+        import torch_xla.core.xla_model as xm
+
+        xla_dev = xm.xla_device()
+        mod = module_class()
+        xla_module = copy.deepcopy(mod).to(device=xla_dev)
+        inputs = mod.get_random_inputs()
+        optimized_mod = integration.extract_compiled_graph(
+            fx.symbolic_trace(mod), inputs
+        )
+
+        for i in range(niter):
+            rand_args = mod.get_random_inputs()
+            orig_dev = rand_args[0].device
+            rand_args_copy = copy.deepcopy(rand_args)
+
+            # Can not simply call
+            #   expected = mod(*rand_args)
+            # Since we need use xla to calculate expected results
+            xla_inputs = tuple(
+                copy.deepcopy(inp).to(device=xla_dev) for inp in rand_args
+            )
+            xla_out = xla_module(*xla_inputs)
+            # copy xla_inputs back to rand_args since the model may inplace update
+            # the arguments
+            rand_args = tuple(inp.to(device=orig_dev) for inp in xla_inputs)
+            expected = pytree.tree_map(lambda o: o.to(device=orig_dev), xla_out)
+
+            actual = optimized_mod(*rand_args_copy)
+
+            if not allclose(expected, actual):
+                print(
+                    f"Incorrect results at iter {i}. expected\n{expected}, actual\n{actual}"
+                )
+                self.assertTrue(False)
+
+            # make sure arguments match after calling the model forward method
+            # to handle inplace updates.
+            if not allclose(rand_args, rand_args_copy):
+                print(
+                    f"Incorrect updated arguments at iter {i}. expected\n{rand_args}, actual\n{rand_args_copy}"
+                )
+                self.assertTrue(False)
+
+    return test_wrapper
+
+
+class TorchXLAReuseGraphTest(unittest.TestCase):
+    test_basic = make_reuse_graph_test(BasicModule)
+    test_matmul = make_reuse_graph_test(MatmulModule)
+    test_linear = make_reuse_graph_test(LinearModule)
+    test_inplace_update = make_reuse_graph_test(ModuleInplaceUpdate)
diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index abcb4290e7826..660e7a5ca567b 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -785,6 +785,44 @@ def ltc_model(*inputs):
     return ltc_model
 
 
+@functools.lru_cache(None)
+def _init_torchxla():
+    global xm
+    try:
+        import torch_xla.core.xla_model as xm
+    except ModuleNotFoundError as e:
+        print(f"torchxla backend fails. Can not import {e.name}")
+        raise
+
+
+@create_backend
+def torchxla_trivial(subgraph):
+    _init_torchxla()
+
+    xla_dev = xm.xla_device()
+
+    xla_model = copy.deepcopy(subgraph.model).to(device=xla_dev)
+
+    def xla_model_wrapper(*inputs):
+        orig_device = inputs[0].device if len(inputs) > 0 else "cpu"
+        xla_inputs = tuple(inp.to(device=xla_dev) for inp in inputs)
+
+        xla_out = xla_model(*xla_inputs)
+        result = tuple(out.to(device=orig_device) for out in xla_out)
+        return result
+
+    return xla_model_wrapper
+
+
+@create_backend
+def torchxla_trace_once(subgraph):
+    import torch._dynamo.optimizations.torchxla_integration as integration
+
+    model = subgraph.model
+    example_inputs = subgraph.example_inputs
+    return integration.extract_compiled_graph(model, example_inputs)
+
+
 def ipex_fp32(gm: torch.fx.GraphModule, example_inputs):
     kwargs_ipex = {"datatype": "fp32"}
     return BACKENDS["ipex"](gm, example_inputs, **kwargs_ipex)
diff --git a/torch/_dynamo/optimizations/torchxla_integration.py b/torch/_dynamo/optimizations/torchxla_integration.py
new file mode 100644
index 0000000000000..d3cac23e7c4b4
--- /dev/null
+++ b/torch/_dynamo/optimizations/torchxla_integration.py
@@ -0,0 +1,177 @@
+import copy
+import dataclasses
+
+import functools
+import os
+import time
+from typing import Any, Dict, List
+
+import torch
+
+debug = os.environ.get("debug_extract_compiled_graph") == "1"
+
+
+@dataclasses.dataclass
+class GraphInputMatcher:
+    """
+    The GraphInputMatcher class setup the graph inputs for future calls after lazy tracing.
+    Specifically, those graph inputs corresponding to method parameters should be replaced with the
+    arguments for the current call.
+
+    tensor_id_to_arg_idx maps the tensor id to the parameter index.
+    graph_input_tensor_ids, graph_input_ivalues list the tensor_id and ivalue for each of the
+    TS/XLA graph inputs.
+    """
+
+    tensor_id_to_arg_idx: Dict[int, int]
+    graph_input_tensor_ids: List[int]
+    # there are 2 categories of graph_input_tensors.
+    # Category 1: those whose id are not found in tensor_id_to_arg_idx. These are
+    # most likely const tensors and we can get its content from graph_input_tensors
+    # Category 2: those whose id are found in tensor_id_to_arg_idx. We should get
+    #  the tensor from method arguments
+    graph_input_ivalues: List[Any]
+
+    # get the real graph input tensors
+    def __call__(self, args):
+        real_input = []
+        for tensor_id, traced_ivalue in zip(
+            self.graph_input_tensor_ids, self.graph_input_ivalues
+        ):
+            arg_idx = self.tensor_id_to_arg_idx.get(tensor_id, None)
+            if arg_idx is None:
+                inp = traced_ivalue
+            else:
+                inp = args[arg_idx]
+            real_input.append(inp)
+        return real_input
+
+
+def get_fallback_ops():
+    fallback_ops = []
+    for opname in metrics.counter_names():
+        if "aten::" not in opname:
+            continue
+        val = int(metrics.counter_value(opname))
+        if val > 0:
+            fallback_ops.append(f"{opname}={val}")
+
+    return fallback_ops
+
+
+@functools.lru_cache(None)
+def import_torchxla():
+    """
+    CI will run test_circular_dependencies in test/test_testing.py
+    which tries to import all modules found.
+    Enclosing the imports in a function so CI that does not have torch_xla
+    installed will not break.
+    """
+    global torch_xla, xm, metrics
+    import torch_xla
+    import torch_xla.core.xla_model as xm
+    import torch_xla.debug.metrics as metrics
+
+
+def extract_compiled_graph(model: torch.fx.GraphModule, example_inputs):
+    import_torchxla()
+    orig_device = example_inputs[0].device
+    xla_dev = xm.xla_device()
+    xla_model = copy.deepcopy(model).to(device=xla_dev)
+    xla_args = [arg.to(device=xla_dev) for arg in example_inputs]
+    args_tensor_ids = [
+        torch_xla._XLAC._xla_get_tensor_id(xla_arg) for xla_arg in xla_args
+    ]
+
+    if debug:
+        print(f"args_tensor_ids {args_tensor_ids}")
+
+    tensor_id_to_arg_idx = {tensor_id: i for i, tensor_id in enumerate(args_tensor_ids)}
+    xla_out = xla_model(*xla_args)
+    fallback_ops = get_fallback_ops()
+    if len(fallback_ops) > 0:
+        raise RuntimeError(
+            f"Fail to extact the compiled graph because of fallback: {','.join(fallback_ops)}"
+        )
+
+    if not isinstance(xla_out, (tuple, list)):
+        xla_out = (xla_out,)
+
+    # If a arg is being in place updated by model, we need to include arg as part of the graph result.
+    xla_args_need_update_bool = torch_xla._XLAC._check_tensor_need_materialization(
+        xla_args
+    )
+    xla_args_need_update = []
+    arg_index_to_need_update_index = {}
+    for i, need_update in enumerate(xla_args_need_update_bool):
+        if need_update:
+            arg_index_to_need_update_index[i] = len(xla_args_need_update)
+            xla_args_need_update.append(xla_args[i])
+
+    args_and_out = tuple(xla_args_need_update) + tuple(xla_out)
+
+    if debug:
+        print(f"XLA IR Text: {torch_xla._XLAC._get_xla_tensors_text(args_and_out)}")
+        print(f"XLA IR HLO: {torch_xla._XLAC._get_xla_tensors_hlo(args_and_out)}")
+
+    # calculate graph hash
+    graph_hash = torch_xla._XLAC._get_graph_hash(args_and_out)
+    if debug:
+        print("graph_hash", graph_hash)
+
+    (
+        graph_input_tensor_ids,
+        graph_input_ivalues,
+    ) = torch_xla._XLAC._get_tensors_xla_device_data_node(args_and_out)
+    if debug:
+        print(f"graph_input_tensor_ids {graph_input_tensor_ids}")
+    assert len(graph_input_tensor_ids) == len(
+        graph_input_ivalues
+    ), f"{len(graph_input_tensor_ids)} v.s. {len(graph_input_ivalues)}"
+    graph_input_matcher = GraphInputMatcher(
+        tensor_id_to_arg_idx, graph_input_tensor_ids, graph_input_ivalues
+    )
+
+    # compiles+runs graph rooted at tensors in 'args_and_out'
+    torch_xla._XLAC._xla_sync_multi(args_and_out, [])
+
+    # input all cpu tensors
+    def optimized_mod(*args):
+        enter_ts = time.time()
+        if len(args_and_out) == 0:
+            return ()
+
+        assert len(args) > 0  # can not handle no args case for now
+        eager_device = args[0].device
+        graph_input = graph_input_matcher(args)
+        start_ts = time.time()
+        res = torch_xla._XLAC._run_cached_graph(graph_hash, graph_input)
+        if debug:
+            print(
+                f"torchxla reuse compiled graph run_cached_graph takes {time.time() - start_ts} seconds"
+            )
+
+        prepare_output_ts = time.time()
+
+        copy_args_ts = time.time()
+        assert len(res) == len(args_and_out)
+        ncopy = 0
+
+        for arg_index, res_index in arg_index_to_need_update_index.items():
+            args[arg_index].copy_(res[res_index])
+
+        if debug:
+            print(f"Copy {ncopy} args takes {time.time() - copy_args_ts} seconds")
+
+        # need to convert xla tensor back to eager tensor
+        copy_res_ts = time.time()
+        # First few elements might be xla_args that needs to be in place updated
+        result = [x.to(device=eager_device) for x in res[len(xla_args_need_update) :]]
+        if debug:
+            print(f"Copy results takes {time.time() - copy_res_ts} seconds")
+            print(f"prepare output takes {time.time() - prepare_output_ts} seconds")
+            print(f"optimized_mod takes {time.time() - enter_ts} seconds overall")
+
+        return result
+
+    return optimized_mod

From a9724a63282bf25172620bacb6e96306323cd77d Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sat, 29 Oct 2022 18:39:28 +0000
Subject: [PATCH 0317/1922] Revert "Unify meta tensor and fake tensor converter
 conversion (#87943)"

This reverts commit baa715e790921e6498861e59556035de1a481cc5.

Reverted https://github.com/pytorch/pytorch/pull/87943 on behalf of https://github.com/kit1980 due to Broke several inductor tests
---
 test/dynamo/test_unspec.py                    |   3 +
 test/test_meta.py                             | 154 ++--------
 .../templates/python_variable_methods.cpp     |  12 +-
 torch/_subclasses/fake_tensor.py              | 275 +++++++++---------
 torch/_subclasses/meta_utils.py               | 152 ++--------
 torch/testing/_internal/common_utils.py       |   2 -
 6 files changed, 193 insertions(+), 405 deletions(-)

diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py
index fd5396981b740..22f975d0f9d68 100644
--- a/test/dynamo/test_unspec.py
+++ b/test/dynamo/test_unspec.py
@@ -50,6 +50,9 @@ class UnspecTest(cls):
 UnspecReproTests = make_unspec_cls(test_repros.ReproTests)
 UnspecNNModuleTests = make_unspec_cls(test_modules.NNModuleTests)
 
+# RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
+unittest.expectedFailure(UnspecReproTests.test_batch_norm_act_unspec)
+
 
 @patch.object(torch._dynamo.config, "specialize_int_float", False)
 class UnspecTests(torch._dynamo.test_case.TestCase):
diff --git a/test/test_meta.py b/test/test_meta.py
index 1499885bc607d..1abe4cd2cda75 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -66,34 +66,6 @@ def assertSameVersionCounter(self, m1, m2):
         self.assertNotEqual(m1._version, vc)
         self.assertEqual(m2._version, m1._version)
 
-    def assertMetadataMatches(self, m1, m2):
-        self.assertEqual(m1.dtype, m2.dtype)
-        self.assertEqual(m1.shape, m2.shape)
-        self.assertEqual(m1.requires_grad, m2.requires_grad)
-        self.assertEqual(m1.is_leaf, m2.is_leaf)
-        self.assertEqual(m1.grad_fn is None, m2.grad_fn is None)
-        self.assertEqual(m1.is_sparse, m2.is_sparse)
-        self.assertEqual(m1.is_inference(), m2.is_inference())
-        self.assertEqual(m1.is_conj(), m2.is_conj())
-        self.assertEqual(m1.is_neg(), m2.is_neg())
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
-            grad_not_none = m1.grad is not None
-        if grad_not_none:
-            self.assertMetadataMatches(m1.grad, m2.grad)
-        if m1.is_sparse:
-            self.assertEqual(m1.dense_dim(), m2.dense_dim())
-            self.assertEqual(m1.sparse_dim(), m2.sparse_dim())
-            self.assertEqual(m1.is_coalesced(), m2.is_coalesced())
-        else:
-            self.assertEqual(m1.stride(), m2.stride())
-            self.assertEqual(m1.storage_offset(), m2.storage_offset())
-            self.assertEqual(m1._is_view(), m2._is_view())
-            if m1._is_view():
-                self.assertMetadataMatches(m1._base, m2._base)
-        # TODO: test if is resizable (no direct query for this atm)
-        # TODO: audit AutogradMeta to see if it matches
-
     def test_view_of_non_leaf(self):
         x = torch.randn(4, requires_grad=True)
         y = x.neg()
@@ -102,14 +74,9 @@ def test_view_of_non_leaf(self):
         to_meta = MetaConverter()
         m1 = to_meta(z1)
         m2 = to_meta(z2)
-
-        # check the test is actually testing what it claims
+        self.assertEqual(m1.shape, z1.shape)
         self.assertTrue(m1._is_view())
         self.assertFalse(m1._base.is_leaf)
-
-        self.assertIsNot(m1, m2)
-        self.assertMetadataMatches(m1, z1)
-        self.assertMetadataMatches(m2, z2)
         self.assertSameVersionCounter(m1, m2)
 
     def test_view_of_leaf(self):
@@ -119,118 +86,35 @@ def test_view_of_leaf(self):
         to_meta = MetaConverter()
         m1 = to_meta(z1)
         m2 = to_meta(z2)
-
-        # check the test is actually testing what it claims
+        self.assertEqual(m1.shape, z1.shape)
         self.assertTrue(m1._is_view())
         self.assertTrue(m1._base.is_leaf)
-
-        self.assertIsNot(m1, m2)
-        self.assertMetadataMatches(m1, z1)
-        self.assertMetadataMatches(m2, z2)
         self.assertSameVersionCounter(m1, m2)
 
     def test_leaf(self):
         x = torch.randn(4, requires_grad=True)
         to_meta = MetaConverter()
         m = to_meta(x)
-
-        # check the test is actually testing what it claims
+        self.assertEqual(m.shape, x.shape)
         self.assertTrue(m.is_leaf)
         self.assertTrue(m.requires_grad)
 
-        self.assertMetadataMatches(m, x)
-
     def test_non_leaf(self):
         x = torch.randn(4, requires_grad=True)
         y = x.neg()
         to_meta = MetaConverter()
         m = to_meta(y)
-
-        # check the test is actually testing what it claims
+        self.assertEqual(m.shape, y.shape)
         self.assertFalse(m.is_leaf)
         self.assertTrue(m.requires_grad)
 
-        self.assertMetadataMatches(m, y)
-
     def test_requires_grad_false(self):
         x = torch.randn(4, requires_grad=False)
         to_meta = MetaConverter()
         m = to_meta(x)
-
-        # check the test is actually testing what it claims
+        self.assertEqual(m.shape, x.shape)
         self.assertFalse(m.requires_grad)
 
-        self.assertMetadataMatches(m, x)
-
-    def test_channels_last(self):
-        x = torch.empty(2, 3, 4, 5, memory_format=torch.channels_last)
-        to_meta = MetaConverter()
-        m = to_meta(x)
-
-        # check the test is actually testing what it claims
-        self.assertTrue(m.is_leaf)
-
-        self.assertMetadataMatches(m, x)
-
-    def test_channels_last_leaf(self):
-        x = torch.empty(2, 3, 4, 5, memory_format=torch.channels_last, requires_grad=True)
-        to_meta = MetaConverter()
-        m = to_meta(x)
-
-        # check the test is actually testing what it claims
-        self.assertTrue(m.requires_grad)
-        self.assertTrue(m.is_leaf)
-
-        self.assertMetadataMatches(m, x)
-
-    def test_channels_last_non_leaf(self):
-        x = torch.empty(2, 3, 4, 5, memory_format=torch.channels_last, requires_grad=True)
-        y = x + 2
-
-        # sanity
-        self.assertEqual(x.stride(), y.stride())
-        self.assertFalse(y.is_leaf)
-
-        to_meta = MetaConverter()
-        m = to_meta(y)
-
-        # check the test is actually testing what it claims
-        self.assertTrue(m.requires_grad)
-        self.assertFalse(m.is_leaf)
-
-        self.assertMetadataMatches(m, y)
-
-        # Check that we can autograd with m as input without erroring;
-        # see https://github.com/pytorch/pytorch/issues/87956
-        loss = m.sum()
-        torch.autograd.grad(loss, m)
-
-    def test_empty_strided_non_dense_leaf(self):
-        x = torch.empty_strided((2, 2), (4, 2), requires_grad=True)
-
-        to_meta = MetaConverter()
-        m = to_meta(x)
-
-        # check the test is actually testing what it claims
-        self.assertTrue(m.requires_grad)
-        self.assertTrue(m.is_leaf)
-
-        self.assertMetadataMatches(m, x)
-
-    def test_non_leaf_torture(self):
-        x = torch.empty(20, requires_grad=True)
-        with torch.no_grad():
-            x.set_(x.storage(), 10, (2,), (2,))
-
-        to_meta = MetaConverter()
-        m = to_meta(x)
-
-        # check the test is actually testing what it claims
-        self.assertTrue(m.requires_grad)
-        self.assertTrue(m.is_leaf)
-
-        self.assertMetadataMatches(m, x)
-
     # NB: complex stuff is not actually exercised right now because
     # we have a blanket exclusion for complex conversion
 
@@ -238,30 +122,41 @@ def test_view_as_real(self):
         x = torch.randn(4, dtype=torch.complex64)
         y = torch.view_as_real(x)
         m = MetaConverter()(y)
-        self.assertMetadataMatches(m, y)
+        self.assertEqual(m.shape, y.shape)
+        self.assertEqual(m.stride(), y.stride())
+        self.assertEqual(m.dtype, y.dtype)
 
     def test_complex_noncontiguous_bug(self):
         x = torch.randn((2, 2, 4, 9), dtype=torch.complex32)[:, 0, :, :]
         m = MetaConverter()(x)
-        self.assertMetadataMatches(m, x)
+        self.assertEqual(m.shape, x.shape)
+        self.assertEqual(m.stride(), x.stride())
+        self.assertEqual(m.dtype, x.dtype)
 
     def test_view_as_complex(self):
         x = torch.randn((4, 2), dtype=torch.float32)
         y = torch.view_as_complex(x)
         m = MetaConverter()(y)
-        self.assertMetadataMatches(m, y)
+        self.assertEqual(m.shape, y.shape)
+        self.assertEqual(m.stride(), y.stride())
+        self.assertEqual(m.dtype, y.dtype)
 
     def test_view_dtype(self):
         x = torch.randn(4, dtype=torch.float32)
         y = x.view(dtype=torch.int32)
         m = MetaConverter()(y)
-        self.assertMetadataMatches(m, y)
+        self.assertEqual(m.shape, y.shape)
+        self.assertEqual(m.stride(), y.stride())
+        self.assertEqual(m.dtype, y.dtype)
 
     def test_imag(self):
         x = torch.randn(4, dtype=torch.complex64)
         y = x.imag
         m = MetaConverter()(y)
-        self.assertMetadataMatches(m, y)
+        self.assertEqual(m.shape, y.shape)
+        self.assertEqual(m.dtype, y.dtype)
+        self.assertEqual(m.stride(), y.stride())
+        self.assertEqual(m.storage_offset(), y.storage_offset())
 
     def test_weakref(self):
         x = torch.randn(4, 4, 4)
@@ -851,12 +746,7 @@ def __init__(self, test_case, *, device, dtype, inplace):
     def __torch_function__(self, func, types, args=(), kwargs=None):
         kwargs = kwargs or {}
 
-        if (
-            torch.jit.is_tracing() or isinstance(func, torch.ScriptMethod) or
-            # meta converter doesn't work correctly when no_dispatch() is on, so
-            # skip running the crossref test in this case
-            torch._C._dispatch_tls_local_exclude_set().has(torch._C.DispatchKey.Python)
-        ):
+        if torch.jit.is_tracing() or isinstance(func, torch.ScriptMethod):
             return func(*args, **kwargs)
 
         if self.dtype in meta_function_skips.get(func, set()):
diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp
index e3c0a8b987bd6..7122532a54410 100644
--- a/tools/autograd/templates/python_variable_methods.cpp
+++ b/tools/autograd/templates/python_variable_methods.cpp
@@ -1135,7 +1135,7 @@ static PyObject* THPVariable_set_(
       {
           "set_()",
           "set_(Storage source)",
-          "set_(Storage source, SymInt storage_offset, SymIntArrayRef size, SymIntArrayRef stride=None)",
+          "set_(Storage source, int64_t storage_offset, IntArrayRef size, IntArrayRef stride=None)",
           "set_(Tensor source)",
           "set_(Tensor source, SymInt storage_offset, SymIntArrayRef size, SymIntArrayRef stride=None)",
       },
@@ -1181,14 +1181,14 @@ static PyObject* THPVariable_set_(
         " for argument 1 'storage'");
       auto dispatch_set_ = [](const Tensor& self,
                               Storage source,
-                              c10::SymInt storage_offset,
-                              c10::SymIntArrayRef size,
-                              c10::SymIntArrayRef stride) -> Tensor {
+                              int64_t storage_offset,
+                              IntArrayRef size,
+                              IntArrayRef stride) -> Tensor {
         pybind11::gil_scoped_release no_gil;
-        return self.set__symint(source, storage_offset, size, stride);
+        return self.set_(source, storage_offset, size, stride);
       };
       return wrap(dispatch_set_(
-          self, storage, _r.toSymInt(1), _r.symintlist(2), _r.symintlist(3)));
+          self, storage, _r.toInt64(1), _r.intlist(2), _r.intlist(3)));
     }
     case 3: {
       // aten::set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 68b935a1a5222..3e5cbdb652264 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -2,6 +2,7 @@
 import functools
 import itertools
 import sys
+import warnings
 import weakref
 from dataclasses import dataclass
 from functools import partial
@@ -138,14 +139,15 @@ def tree_flatten_only(ty: Type[T], pytree: PyTree):
 # structure. Like `MetaConverter`, it uses `WeakTensorRefKey` to
 # hold a weak reference for all memoized tensors.
 class FakeTensorConverter(object):
-    @property
-    def tensor_memo(self):
-        return self.meta_converter.tensor_memo
-
+    tensor_memo: weakref.WeakValueDictionary
     meta_converter: MetaConverter
     constant_storage_mapping: Dict[StorageWeakRef, List[TensorWeakRef]]
 
     def __init__(self):
+        # FakeTensors store the FakeTensorMode which in turn stores a
+        # FakeTensor, so we need to hold a weak reference to the FakeTensor
+        # otherwise we would induce a circular reference
+        self.tensor_memo = weakref.WeakValueDictionary()
         self.meta_converter = MetaConverter()
 
         # map from to storage to corresponding constant tensors
@@ -212,33 +214,28 @@ def from_real_tensor(self, fake_mode, t, make_constant=False, shape_env=None):
         # not yet supported in metatensors
         if t.is_quantized:
             raise UnsupportedFakeTensorException("quantized nyi in meta tensors")
+        with no_dispatch():
+            meta_t = self.meta_converter(t, shape_env=shape_env)
+            if meta_t.device.type != "meta":
+                raise UnsupportedFakeTensorException("meta converter nyi")
+            out = FakeTensor(
+                fake_mode,
+                meta_t,
+                existing_device,
+                constant=t if make_constant else None,
+            )
+            out.requires_grad_(t.requires_grad)
+            if make_constant:
+                self.add_constant_storage_mapping(out)
         if type(t) is torch.nn.Parameter:
             assert not make_constant
-
-        def mk_fake_tensor(make_meta_t):
-            # NB: don't use in_kernel_invocation_manager. to
-            # ensure FakeTensor can internally do constant computation
-            # as necessary.  Invocation manager is "more correct" as
-            # it works for more operators in make_meta_t, but
-            # invariant is that make_meta_t only calls factories
-            # for which it is not strictly necessary to use the
-            # invocation manager (I think!)
-            with no_dispatch():
-                return FakeTensor(
-                    fake_mode,
-                    make_meta_t(),
-                    existing_device,
-                    constant=t if make_constant else None,
-                )
-
-        out = self.meta_converter(
-            t, shape_env=shape_env, strict=True, callback=mk_fake_tensor
-        )
-        if out is NotImplemented:
-            raise UnsupportedFakeTensorException("meta converter nyi")
-        if make_constant:
-            self.add_constant_storage_mapping(out)
-        # NB: meta_converter set the memo
+            out = torch.nn.Parameter(out, requires_grad=out.requires_grad)  # type: ignore[assignment]
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
+            grad_not_none = t.grad is not None
+        if grad_not_none:
+            out.grad = self.from_real_tensor(fake_mode, t.grad, shape_env=shape_env)
+        self.set_tensor_memo(t, out)
         return out
 
     # If you specify the device, it MUST be a meta tensor.
@@ -299,9 +296,7 @@ def constructors(fake_mode, func, *args, **kwargs):
     out_device = new_kwargs.pop("device", None)
     out_device = out_device if out_device is not None else default_device
     new_kwargs["device"] = torch.device("meta")
-    # Not in_kernel_invocation_manager as no fake tensor inputs
-    with no_dispatch():
-        r = func(*args, **new_kwargs)
+    r = func(*args, **new_kwargs)
     return FakeTensor(fake_mode, r, out_device)
 
 
@@ -314,8 +309,7 @@ def non_kwarg_to(fake_mode, func, *args, **kwargs):
     out_device = input_device if input_device else new_kwargs["input"].device
     new_kwargs["device"] = torch.device("meta")
     inp = new_kwargs.pop("input")
-    with in_kernel_invocation_manager(fake_mode):
-        r = func(inp, **new_kwargs)
+    r = func(inp, **new_kwargs)
     # TODO: I think this does the wrong thing if r is inp
     return fake_mode.fake_tensor_converter.from_meta_and_device(
         fake_mode, r, out_device
@@ -326,8 +320,7 @@ def non_kwarg_to(fake_mode, func, *args, **kwargs):
 # since the device of `the_template` is ignored
 @register_op_impl(aten.resize_as_.default)
 def resize_as_(fake_mode, func, *args, **kwargs):
-    with in_kernel_invocation_manager(fake_mode):
-        return func(*args, **kwargs)
+    return func(*args, **kwargs)
 
 
 @register_op_impl(aten._sparse_coo_tensor_with_dims_and_tensors.default)
@@ -717,13 +710,6 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             else:
                 return args[0].fake_device
 
-        # Some attribute queries that can be serviced directly
-        # See Note [is_coalesced is dispatched]
-        if func in [torch.ops.aten.is_coalesced.default]:
-            # NB: no_dispatch is ok here too, this func is very simple
-            with in_kernel_invocation_manager(self):
-                return func(*args, **kwargs)
-
         flat_arg_fake_tensors = tree_flatten_only(FakeTensor, (args, kwargs))
         flat_symints = tree_flatten_only(torch.SymInt, (args, kwargs))
         has_symbolic_sizes = (
@@ -739,38 +725,38 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         if func in self.lift_fns:
             out = func(*args, **kwargs)
             if self.may_turn_const(out):
-                # NB: not in_kernel_invocation_manager because we're doing real
-                # compute here
                 with no_dispatch():
-                    out = out.clone()
-                return converter(self, out, make_constant=True)
-
-        flat_arg_tensors = tree_flatten_only(torch.Tensor, (args, kwargs))
-        # See [subclass inputs] below
-        # NB: If you're seeing a mysterious infinite loop involving fake
-        # tensor, it might be related to this line.  Though I'm not sure
-        # how you'll know to read this comment, as this line won't show up
-        # in the stack trace.
-        if self.check_for_subclass(flat_arg_tensors):
-            return NotImplemented
-
-        # if we are in the dispatch mode, we will enter this function even if the inputs
-        # are not FakeTensors. For now, throw if any non-Fake Tensor inputs
-        # and just support constructors.
-
-        # this is generated from torch.tensor(), which does not use the
-        # dispatcher, to allow wrapper subclasses to wrap the new tensor
-        if func in self.lift_fns:
-            assert (
-                len(kwargs) == 0 and len(args) == 1 and type(args[0]) is torch.Tensor
-            ), f"{args} {kwargs}"
-            return converter(self, args[0])
-
-        if self.check_for_non_fake(flat_arg_tensors):
-            raise Exception(
-                "Invoking operators with non-Fake Tensor inputs in FakeTensorMode is not yet supported. "
-                f"Please convert all Tensors to FakeTensors first. Found in {func}(*{args}, **{kwargs})"
-            )
+                    return converter(self, out.clone(), make_constant=True)
+
+        with no_dispatch():
+            flat_arg_tensors = tree_flatten_only(torch.Tensor, (args, kwargs))
+            # See [subclass inputs] below
+            # NB: If you're seeing a mysterious infinite loop involving fake
+            # tensor, it might be related to this line.  Though I'm not sure
+            # how you'll know to read this comment, as this line won't show up
+            # in the stack trace.
+            if self.check_for_subclass(flat_arg_tensors):
+                return NotImplemented
+
+            # if we are in the dispatch mode, we will enter this function even if the inputs
+            # are not FakeTensors. For now, throw if any non-Fake Tensor inputs
+            # and just support constructors.
+
+            # this is generated from torch.tensor(), which does not use the
+            # dispatcher, to allow wrapper subclasses to wrap the new tensor
+            if func in self.lift_fns:
+                assert (
+                    len(kwargs) == 0
+                    and len(args) == 1
+                    and type(args[0]) is torch.Tensor
+                ), f"{args} {kwargs}"
+                return converter(self, args[0])
+
+            if self.check_for_non_fake(flat_arg_tensors):
+                raise Exception(
+                    "Invoking operators with non-Fake Tensor inputs in FakeTensorMode is not yet supported. "
+                    f"Please convert all Tensors to FakeTensors first. Found in {func}(*{args}, **{kwargs})"
+                )
 
         # The current constant handling only support tracing systems
         # (aot autograd, torchdynamo) where each operation is run consecutively.
@@ -790,30 +776,27 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             and len(flat_arg_fake_tensors) != 0
             and not has_symbolic_sizes
         ):
-            const_args, const_kwargs = pytree.tree_map_only(
-                FakeTensor, lambda t: t.constant, (args, kwargs)
-            )
-
-            # NB: not in_kernel_invocation_manager(self) as we want to do REAL
-            # compute
             with no_dispatch():
+                const_args, const_kwargs = pytree.tree_map_only(
+                    FakeTensor, lambda t: t.constant, (args, kwargs)
+                )
                 out = func(*const_args, **const_kwargs)
 
-            all_constant = pytree.tree_all_only(
-                torch.Tensor, lambda t: self.may_turn_const(t), out
-            )
-
-            if all_constant:
-                return pytree.tree_map_only(
-                    torch.Tensor,
-                    lambda t: converter(self, t, make_constant=True),
-                    out,
+                all_constant = pytree.tree_all_only(
+                    torch.Tensor, lambda t: self.may_turn_const(t), out
                 )
 
-            # we weren't able to turn outputs to constants,
-            # so invalidate all constants that might be aliases of the outputs
-            for ten in tree_flatten_only(torch.Tensor, out):
-                converter.invalidate_constant_aliases(ten)
+                if all_constant:
+                    return pytree.tree_map_only(
+                        torch.Tensor,
+                        lambda t: converter(self, t, make_constant=True),
+                        out,
+                    )
+
+                # we weren't able to turn outputs to constants,
+                # so invalidate all constants that might be aliases of the outputs
+                for ten in tree_flatten_only(torch.Tensor, out):
+                    converter.invalidate_constant_aliases(ten)
 
         # we are falling through to running non constant tensors, any input constant that
         # is written to must be invalidated
@@ -834,13 +817,14 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         ):
             from torch._decomp import meta_table as meta_table
 
-            if func == aten.size.default:
-                sys.stderr.write(
-                    "Trying to call aten.size on a tensor with symbolic shapes. "
-                    "It's likely that this is from calling tensor.shape in C++"
-                )
-                # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
-                return None
+            with no_dispatch():
+                if func == aten.size.default:
+                    sys.stderr.write(
+                        "Trying to call aten.size on a tensor with symbolic shapes. "
+                        "It's likely that this is from calling tensor.shape in C++"
+                    )
+                    # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
+                    return None
 
             with self:
                 if func in meta_table:
@@ -876,27 +860,32 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                     f"{func} - couldn't find symbolic meta function/decomposition"
                 )
 
-        # special handling for funcs registered through `register_op_impl`,
-        # e.g., manipulating args on constructor calls to construct meta tensors
-        # and then afterwards wrapping them to a FakeTensor
-        for run_impl_check, op_impl in op_implementations:
-            if run_impl_check(func):
-                op_impl_out = op_impl(self, func, *args, **kwargs)
-                if op_impl_out != NotImplemented:
-                    return op_impl_out
-
-        # run kernel registered to meta for func, which include
-        # python meta registrations, prims, decomps, and c++ meta fns (structured kernels)
-        try:
-            with in_kernel_invocation_manager(self):
-                r = func(*args, **kwargs)
-        except NotImplementedError as not_implemented_error:
-            # no meta kernel registered, fallback to kernel for the device
-            if not self.allow_fallback_kernels:
-                raise not_implemented_error
-            return run_fallback_kernel(self, func, args, kwargs, not_implemented_error)
-
-        return self.wrap_meta_outputs_with_default_device_logic(r, func, args, kwargs)
+        with no_dispatch():
+            # special handling for funcs registered through `register_op_impl`,
+            # e.g., manipulating args on constructor calls to construct meta tensors
+            # and then afterwards wrapping them to a FakeTensor
+            for run_impl_check, op_impl in op_implementations:
+                if run_impl_check(func):
+                    op_impl_out = op_impl(self, func, *args, **kwargs)
+                    if op_impl_out != NotImplemented:
+                        return op_impl_out
+
+            # run kernel registered to meta for func, which include
+            # python meta registrations, prims, decomps, and c++ meta fns (structured kernels)
+            try:
+                with in_kernel_invocation_manager(self):
+                    r = func(*args, **kwargs)
+            except NotImplementedError as not_implemented_error:
+                # no meta kernel registered, fallback to kernel for the device
+                if not self.allow_fallback_kernels:
+                    raise not_implemented_error
+                return run_fallback_kernel(
+                    self, func, args, kwargs, not_implemented_error
+                )
+
+            return self.wrap_meta_outputs_with_default_device_logic(
+                r, func, args, kwargs
+            )
 
     # [subclass inputs]
     # Suppose we enable fake tensor mode.  This means that fake tensor
@@ -970,7 +959,6 @@ def functions_with_cpp_meta_impl_that_support_symint(self):
             aten.as_strided.default,
             aten.zeros.default,
             aten.detach.default,
-            aten.set_.source_Storage_storage_offset,
         ]
 
     @property
@@ -1016,11 +1004,8 @@ def run_fallback_kernel(fake_mode, func, args, kwargs, orig_not_implemented_exce
     if torch.Tag.inplace_view in func.tags:  # type: ignore[attr-defined]
         raise orig_not_implemented_exception
 
-    inp_impls = {}
-
-    # Don't use in_kernel_invocation_manager(fake_mode) as we want to do
-    # REAL compute (not with meta device)
     with no_dispatch():
+        inp_impls = {}
 
         def to_real_tensor(e):
             if isinstance(e, FakeTensor):
@@ -1036,25 +1021,25 @@ def to_real_tensor(e):
 
         r = func(*args, **kwargs)
 
-    tensor_impls = set()
-    storages = set()
-
-    for e in tree_flatten((args, kwargs))[0]:
-        if isinstance(e, torch.Tensor):
-            if not e.is_sparse:
-                storages.add(e.storage()._cdata)
-
-    # TODO: also check metadata change on inputs
-    # proper aliasing/metadata relationship between outputs and inputs will
-    # not be set up, bc of conversion to device, unless we can reuse an
-    # input impl
-    for e in tree_flatten(r)[0]:
-        if id(e) not in inp_impls and (
-            isinstance(e, torch.Tensor)
-            and not e.is_sparse
-            and e.storage()._cdata in storages
-        ):
-            raise orig_not_implemented_exception
+        tensor_impls = set()
+        storages = set()
+
+        for e in tree_flatten((args, kwargs))[0]:
+            if isinstance(e, torch.Tensor):
+                if not e.is_sparse:
+                    storages.add(e.storage()._cdata)
+
+        # TODO: also check metadata change on inputs
+        # proper aliasing/metadata relationship between outputs and inputs will
+        # not be set up, bc of conversion to device, unless we can reuse an
+        # input impl
+        for e in tree_flatten(r)[0]:
+            if id(e) not in inp_impls and (
+                isinstance(e, torch.Tensor)
+                and not e.is_sparse
+                and e.storage()._cdata in storages
+            ):
+                raise orig_not_implemented_exception
 
     def map_out(e):
         if isinstance(e, torch.Tensor):
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 735e3ff38f4ff..51231811631bc 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -1,10 +1,8 @@
-import contextlib
-import warnings
 import weakref
-from typing import ContextManager
 
 import torch
 from torch.multiprocessing.reductions import StorageWeakRef
+from torch.utils._mode_utils import no_dispatch
 
 
 def safe_is_leaf(t):
@@ -129,31 +127,18 @@ def del_ten():
 
     # NB: doesn't actually return a storage, because meta storage is
     # not supported
-    def meta_storage(self, s, callback):
+    def meta_storage(self, s):
         # NB: TypedStorage is freshly allocated and cannot be used as hash
         # key index.
 
         # Use a Weak Ref to s in order to not leak memory
         swr = StorageWeakRef(s)
         if swr not in self.storage_memo:
-            self.storage_memo[swr] = (
-                callback(
-                    lambda: torch.empty(s.size(), dtype=torch.uint8, device="meta")
-                )
-                .storage()
-                .untyped()
-            )
+            self.storage_memo[swr] = torch.empty(s.size(), dtype=s.dtype, device="meta")
         return self.storage_memo[swr]
 
     # This function assumes that it's possible to do the conversion
-    def meta_tensor(self, t, shape_env=None, callback=lambda t: t()):
-        # This indicates you set no_dispatch() before calling into this
-        # function.  This is an error: we may be creating fake tensors and
-        # will perform operations on them which need fake tensor mode to
-        # be active.  You will segfault if you are in a no_dispatch() block.
-        assert not torch._C._dispatch_tls_local_exclude_set().has(
-            torch._C.DispatchKey.Python
-        )
+    def meta_tensor(self, t, shape_env=None):
         arg_cnt = self.arg_cnt
         self.arg_cnt += 1
 
@@ -181,22 +166,14 @@ def sym_sizes_strides(t):
                 if t.is_sparse:
                     assert shape_env is None, "symbolic on sparse NYI"
                     is_leaf = safe_is_leaf(t)
-                    r = callback(
-                        lambda: torch.ops.aten._sparse_coo_tensor_with_dims(
-                            t.sparse_dim(),
-                            t.dense_dim(),
-                            t.shape,
-                            dtype=t.dtype,
-                            layout=torch.sparse_coo,
-                            device="meta",
-                        )
+                    r = torch.ops.aten._sparse_coo_tensor_with_dims(
+                        t.sparse_dim(),
+                        t.dense_dim(),
+                        t.shape,
+                        dtype=t.dtype,
+                        layout=torch.sparse_coo,
+                        device="meta",
                     )
-                    assert safe_is_leaf(r), "the callback you passed in doesn't detach"
-                    # Note [is_coalesced is dispatched]
-                    # Strangely enough, is_coalesced() is a dispatched operator,
-                    # which means that it will get caught by fake tensor mode.
-                    # Ordinarily this would error, but there's some logic in
-                    # fake tensor ensure this doesn't happen.
                     r._coalesced_(t.is_coalesced())
                     if t.requires_grad:
                         r.requires_grad = True
@@ -211,7 +188,7 @@ def sym_sizes_strides(t):
                     # directly from storage is WRONG because this won't cause
                     # version counters to get shared.
                     assert t._is_view()
-                    base = self.meta_tensor(t._base, shape_env, callback)
+                    base = self.meta_tensor(t._base)
 
                     def is_c_of_r(complex_dtype, real_dtype):
                         return (
@@ -237,96 +214,38 @@ def is_c_of_r(complex_dtype, real_dtype):
                         r = base.as_strided(sizes, strides, sym(t.storage_offset()))
                 else:
                     is_leaf = safe_is_leaf(t)
-                    sizes, strides = sym_sizes_strides(t)
-                    storage_offset = sym(t.storage_offset())
                     # Fake up some autograd history.
                     if t.requires_grad:
-                        r = callback(
-                            lambda: torch.empty_strided(
-                                sizes, strides, dtype=t.dtype, device="meta"
-                            )
+                        r = torch.empty(
+                            (0,), dtype=t.dtype, device="meta", requires_grad=True
                         )
-                        assert safe_is_leaf(
-                            r
-                        ), "the callback you passed in doesn't detach"
-                        r.requires_grad = t.requires_grad
                         if not is_leaf:
                             with torch.enable_grad():
-                                # preserve_format is the default, but we want to
-                                # emphasize how important it is to preserve
-                                # format here
-                                r = r.clone(memory_format=torch.preserve_format)
-                    else:
-                        r = callback(
-                            lambda: torch.empty_strided(
-                                sizes, strides, dtype=t.dtype, device="meta"
-                            )
-                        )
-                        assert safe_is_leaf(
-                            r
-                        ), "the callback you passed in doesn't detach"
-
-                    s = t.storage().untyped()
-                    swr = StorageWeakRef(s)
-                    if (
-                        swr not in self.storage_memo
-                        and r.stride() == strides
-                        and r.storage_offset() == storage_offset
-                    ):
-                        # You're normal and happy, install the fresh storage into the memo
-                        self.storage_memo[swr] = r.storage().untyped()
+                                # The backward function here will be wrong, but
+                                # that's OK; our goal is just to get the metadata
+                                # looking as close as possible; we're not going to
+                                # actually try to backward() on these produced
+                                # metas.  TODO: would be safer to install some
+                                # sort of unsupported grad_fn here
+                                r = r.clone()
                     else:
-                        # You're in crazy town; somehow you gave us a tensor
-                        # that wasn't a view, but had nonzero storage offset,
-                        # nontrivial strides (such that clone() couldn't
-                        # preserve them), or already aliases with another
-                        # tensor's storage.  The most typical way to end
-                        # up here is with set_.  So use set_ to bludgeon this
-                        # in.
-                        r_s = self.meta_storage(s, callback=callback)
-                        # NB: In principle, this should always work, but there
-                        # is some subtle difference in the autograd metadata
-                        # that means we will backprop the set_ call, even if
-                        # r is declared as an input to grad.
-                        # See https://github.com/pytorch/pytorch/issues/87956
-                        # for the reproducer.
-                        # NB: The in_kernel_invocation_manager here is necessary
-                        # for fake tensor.  If we run the set_ call with fake
-                        # tensor on, r will improperly report that it is NOT a
-                        # meta tensor but a cpu tensor, and then the set_ call
-                        # will fail due to device mismatch.  no_dispatch() is
-                        # not enough, because the fake tensor will still claim
-                        # to be a CPU tensor and you'll end up in the CPU
-                        # kernel.  Arguably this is a hack; a cleaner way to
-                        # solve this is to have a FakeStorage concept which
-                        # would report it's CPU device--no problem now!  But
-                        # this is difficult to do because we don't have storage
-                        # subclasses.  Relevant test is
-                        # DynamicShapesFunctionTests::test_add_dynamic_shapes in
-                        # test/dynamo/test_dynamic_shapes.py
-                        maybe_fake_mgr: ContextManager[None] = contextlib.nullcontext()
-                        from torch._subclasses.fake_tensor import (
-                            FakeTensor,
-                            in_kernel_invocation_manager,
-                        )
-
-                        if isinstance(r, FakeTensor):
-                            maybe_fake_mgr = in_kernel_invocation_manager(r.fake_mode)
-                        with maybe_fake_mgr, torch.no_grad():
-                            r.set_(r_s, storage_offset, sizes, strides)
+                        r = torch.empty((0,), dtype=t.dtype, device="meta")
+                    # As long as meta storage is not supported, need to prevent
+                    # redispatching on set_(Storage, ...) which will choke with
+                    # meta storage
+                    s = self.meta_storage(t.storage())
+                    with no_dispatch():
+                        sizes, strides = sym_sizes_strides(t)
+                        with torch.no_grad():
+                            r.set_(s, sym(t.storage_offset()), sizes, strides)
 
-                with warnings.catch_warnings():
-                    warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
-                    grad_not_none = t.grad is not None
-                if grad_not_none:
-                    r.grad = self.meta_tensor(t.grad, shape_env, callback)
                 torch._C._set_conj(r, t.is_conj())
                 torch._C._set_neg(r, t.is_neg())
             self.set_tensor_memo(t, r)
 
         return self.get_tensor_memo(t)
 
-    def __call__(self, t, shape_env=None, *, strict=False, callback=lambda t: t()):
+    def __call__(self, t, shape_env=None):
         # TODO: zero tensors?  We appear to have eliminated them by
         # excluding complex for now
         from torch._subclasses.fake_tensor import FakeTensor
@@ -361,13 +280,10 @@ def __call__(self, t, shape_env=None, *, strict=False, callback=lambda t: t()):
                 # tests all break so we just exclude this.  In any case
                 # the to conversion isn't really right anyhow.
                 self.miss += 1
-                if strict:
-                    return NotImplemented
                 return t
             else:
                 self.hit += 1
-                r = self.meta_tensor(t, shape_env=shape_env, callback=callback)
-                # TODO: this is suspicious, now that we have callback argument
+                r = self.meta_tensor(t, shape_env=shape_env)
                 if type(t) is torch.nn.Parameter:
                     r = torch.nn.Parameter(r, requires_grad=r.requires_grad)
                 return r
@@ -378,13 +294,9 @@ def __call__(self, t, shape_env=None, *, strict=False, callback=lambda t: t()):
             # support meta.  Trying to YOLO this is more trouble than it's
             # worth.
             self.miss += 1
-            if strict:
-                return NotImplemented
             return t
         else:
             # non-Tensor types don't count as hit or miss
-            if strict:
-                return NotImplemented
             return t
 
 
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 9903e95228fc8..2f85b8af1d81f 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1367,8 +1367,6 @@ def freeze_rng_state():
         #
         # In the long run torch.cuda.set_rng_state should probably be
         # an operator.
-        #
-        # NB: Mode disable is to avoid running cross-ref tests on thes seeding
         with no_dispatch(), disable_functorch():
             if torch.cuda.is_available():
                 torch.cuda.set_rng_state(cuda_rng_state)

From f9252207e5a97a2a0402c980d4b43b0e5bd32c43 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Sat, 29 Oct 2022 20:36:20 +0000
Subject: [PATCH 0318/1922] [Inductor] Enable Inductor unspec inputs test for
 different dtypes (#87809)

Fixes #ISSUE_NUMBER

cc @jansel @mlazos @soumith @voznesenskym @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87809
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 38 ++++++++++++-----------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 3e3887a5fcec6..58ae49eb1930e 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4008,29 +4008,23 @@ def fn(x, y):
             return x + y, x * y, x / y
 
         opt = torch._dynamo.optimize("inductor")(fn)
+        dtypes = [
+            torch.float16,
+            torch.bfloat16,
+            torch.float32,
+            torch.float64,
+            torch.int32,
+            torch.int64,
+        ]
 
-        inputs = (
-            rand_strided((2, 3), (3, 1), device="cuda"),
-            rand_strided((), (), device="cpu"),
-        )
-        self.assertTrue(same(opt(*inputs), fn(*inputs)))
-        inputs = (inputs[1], inputs[0])
-        self.assertTrue(same(opt(*inputs), fn(*inputs)))
-
-    @requires_cuda()
-    def test_unspec_inputs_fp16(self):
-        def fn(x, y):
-            return x + y, x * y, x / y
-
-        opt = torch._dynamo.optimize("inductor")(fn)
-
-        inputs = (
-            rand_strided((2, 3), (3, 1), dtype=torch.float16, device="cuda"),
-            rand_strided((), (), dtype=torch.float16, device="cpu"),
-        )
-        self.assertTrue(same(opt(*inputs), fn(*inputs)))
-        inputs = (inputs[1], inputs[0])
-        self.assertTrue(same(opt(*inputs), fn(*inputs)))
+        for d in dtypes:
+            inputs = (
+                rand_strided((2, 3), (3, 1), dtype=torch.float32, device="cuda"),
+                rand_strided((), (), dtype=d, device="cpu"),
+            )
+            self.assertTrue(same(opt(*inputs), fn(*inputs)))
+            inputs = (inputs[1], inputs[0])
+            self.assertTrue(same(opt(*inputs), fn(*inputs)))
 
     @patch.object(config.triton, "mm", "aten")
     def test_list_clearing(self):

From 7ee57b385caab3a45ab04d2a80532321eaadad91 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Sun, 30 Oct 2022 01:04:55 +0000
Subject: [PATCH 0319/1922] [BE] Do not assign string literal to `char *`
 (#87949)

Not sure, what I was thinking when writing something like:
```
auto foo = std::getenv("BAR");
if (!foo) {
   foo = "baz";
}
```
as `std::getenv` return `char *` (i.e. mutable string), but string literals are immutable. (i.e. `const char *`)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87949
Approved by: https://github.com/kit1980
---
 torch/csrc/autograd/engine.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index 0a2298efc1282..ef4856cf4796a 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -271,10 +271,7 @@ void Engine::stop() {
   // Under some conditions, autograd threads can hang on shutdown
   // Do not wait for them to shutdown indefinitely but rely on timeout
   auto wait_duration_str = getenv("TORCH_AUTOGRAD_SHUTDOWN_WAIT_LIMIT");
-  if (!wait_duration_str) {
-    wait_duration_str = "10.0";
-  }
-  auto wait_duration = std::atof(wait_duration_str);
+  auto wait_duration = wait_duration_str ? std::atof(wait_duration_str) : 10.0;
   bool noBackward = true;
   for (auto& queue : device_ready_queues_) {
     noBackward = noBackward && queue->empty();

From ecd09e169ec20411ba8dc96a735b2298dcb5c58f Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sun, 30 Oct 2022 03:02:55 +0000
Subject: [PATCH 0320/1922] [vision hash update] update the pinned vision hash
 (#87948)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87948
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 4ee9517b28d7a..7dbaee31dbff6 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-add75968543f36818691f8b59880f5c04689a88e
+cba1c011a87dd14af10f97bcb113fa09a8e2b396

From daefabbb818258b9f1db46cdfbe78be459666d87 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Sun, 30 Oct 2022 04:31:45 +0000
Subject: [PATCH 0321/1922] [BE] Do not package caffe2 in wheel (#87986)

If PyTorch is build without caffe2 integration, do not package unusable
.py files/headers

Same is true about functorch - don't package it unless building with `functorch` (although, I wonder if we should remove this option at some point in the future)

Followup after https://github.com/pytorch/builder/pull/1181

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87986
Approved by: https://github.com/seemethere
---
 .../win-test-helpers/build_pytorch.bat        |  2 +-
 setup.py                                      | 32 +++++++++++++------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
index 1ea7912291135..da28956cae971 100644
--- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat
@@ -145,7 +145,7 @@ python -c "import os, glob; os.system('python -mpip install ' + glob.glob('dist/
   if "%BUILD_ENVIRONMENT%"=="" (
     echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
   ) else (
-    7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torchgen %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\caffe2 %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\functorch && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\"
+    7z a %TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torchgen %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\functorch && copy /Y "%TMP_DIR_WIN%\%IMAGE_COMMIT_TAG%.7z" "%PYTORCH_FINAL_PACKAGE_DIR%\"
     if errorlevel 1 exit /b
     if not errorlevel 0 exit /b
 
diff --git a/setup.py b/setup.py
index e3eb3ced6005b..72a6bbae7b460 100644
--- a/setup.py
+++ b/setup.py
@@ -762,6 +762,14 @@ def run(self):
             super().run()
 
 
+def get_cmake_cache_vars():
+    try:
+        return defaultdict(lambda: False, cmake.get_cmake_cache_variables())
+    except FileNotFoundError:
+        # CMakeCache.txt does not exist. Probably running "python setup.py clean" over a clean directory.
+        return defaultdict(lambda: False)
+
+
 def configure_extension_build():
     r"""Configures extension build options according to system environment and user's choice.
 
@@ -769,11 +777,7 @@ def configure_extension_build():
       The input to parameters ext_modules, cmdclass, packages, and entry_points as required in setuptools.setup.
     """
 
-    try:
-        cmake_cache_vars = defaultdict(lambda: False, cmake.get_cmake_cache_variables())
-    except FileNotFoundError:
-        # CMakeCache.txt does not exist. Probably running "python setup.py clean" over a clean directory.
-        cmake_cache_vars = defaultdict(lambda: False)
+    cmake_cache_vars = get_cmake_cache_vars()
 
     ################################################################################
     # Configure compile flags
@@ -877,7 +881,12 @@ def make_relative_rpath_args(path):
     ################################################################################
 
     extensions = []
-    packages = find_packages(exclude=('tools', 'tools.*'))
+    excludes = ['tools', 'tools.*']
+    if not cmake_cache_vars['BUILD_CAFFE2']:
+        excludes.extend(['caffe2', 'caffe2.*'])
+    if not cmake_cache_vars['BUILD_FUNCTORCH']:
+        excludes.extend(['functorch', 'functorch.*'])
+    packages = find_packages(exclude=excludes)
     C = Extension("torch._C",
                   libraries=main_libraries,
                   sources=main_sources,
@@ -1055,8 +1064,7 @@ def main():
         'include/ATen/native/quantized/*.h',
         'include/ATen/native/quantized/cpu/*.h',
         'include/ATen/quantized/*.h',
-        'include/caffe2/utils/*.h',
-        'include/caffe2/utils/**/*.h',
+        'include/caffe2/serialize/*.h',
         'include/c10/*.h',
         'include/c10/macros/*.h',
         'include/c10/core/*.h',
@@ -1070,7 +1078,6 @@ def main():
         'include/c10/cuda/impl/*.h',
         'include/c10/hip/*.h',
         'include/c10/hip/impl/*.h',
-        'include/caffe2/**/*.h',
         'include/torch/*.h',
         'include/torch/csrc/*.h',
         'include/torch/csrc/api/include/torch/*.h',
@@ -1158,6 +1165,13 @@ def main():
         'utils/model_dump/code.js',
         'utils/model_dump/*.mjs',
     ]
+
+    if get_cmake_cache_vars()['BUILD_CAFFE2']:
+        torch_package_data.extend([
+            'include/caffe2/**/*.h',
+            'include/caffe2/utils/*.h',
+            'include/caffe2/utils/**/*.h',
+        ])
     torchgen_package_data = [
         # Recursive glob doesn't work in setup.py,
         # https://github.com/pypa/setuptools/issues/1806

From d72ee4d94f79ce3285444443a18afba4104e0568 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Sun, 30 Oct 2022 17:10:17 +0000
Subject: [PATCH 0322/1922] [dynamo][benchmarks] use fresh inductor cache and
 raise batch size wherever possible (#88044)

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88044
Approved by: https://github.com/ngimel
---
 benchmarks/dynamo/Makefile_dashboard   |  3 ++
 benchmarks/dynamo/common.py            |  4 +--
 benchmarks/dynamo/huggingface.py       | 50 ++++++++++----------------
 benchmarks/dynamo/timm_models.py       | 38 +++++++-------------
 torch/_inductor/codecache.py           |  5 ++-
 torch/_inductor/triton_ops/autotune.py |  3 +-
 torch/_inductor/utils.py               | 34 ++++++++++--------
 7 files changed, 62 insertions(+), 75 deletions(-)

diff --git a/benchmarks/dynamo/Makefile_dashboard b/benchmarks/dynamo/Makefile_dashboard
index 1c75d608e7d71..559f9fee92dd9 100644
--- a/benchmarks/dynamo/Makefile_dashboard
+++ b/benchmarks/dynamo/Makefile_dashboard
@@ -7,6 +7,7 @@ clone-deps:
 		&& (test -e torchvision || git clone --recursive https://github.com/pytorch/vision torchvision) \
 		&& (test -e torchdata || git clone --recursive https://github.com/pytorch/data.git torchdata) \
 		&& (test -e torchtext || git clone --recursive https://github.com/pytorch/text torchtext) \
+		&& (test -e torchaudio || git clone --recursive https://github.com/pytorch/text torchaudio) \
 		&& (test -e detectron2 || git clone --recursive https://github.com/facebookresearch/detectron2) \
 		&& (test -e torchbenchmark || git clone --recursive https://github.com/pytorch/benchmark torchbenchmark) \
 		&& (test -e triton || git clone --recursive https://github.com/openai/triton.git) \
@@ -17,6 +18,7 @@ pull-deps: clone-deps
 	(cd ../../../torchvision    && git pull && git submodule update --init --recursive)
 	(cd ../../../torchdata      && git pull && git submodule update --init --recursive)
 	(cd ../../../torchtext      && git pull && git submodule update --init --recursive)
+	(cd ../../../torchaudio      && git pull && git submodule update --init --recursive)
 	(cd ../../../detectron2     && git pull && git submodule update --init --recursive)
 	(cd ../../../torchbenchmark && git pull && git submodule update --init --recursive)
 	(cd ../../../triton         && git checkout master && git pull && git checkout $(TRITON_VERSION) && git submodule update --init --recursive)
@@ -32,6 +34,7 @@ build-deps: clone-deps
 	(cd ../../../torchvision && python setup.py clean && python setup.py develop)
 	(cd ../../../torchdata && python setup.py install)
 	(cd ../../../torchtext   && python setup.py clean && python setup.py develop)
+	(cd ../../../torchaudio   && python setup.py clean && python setup.py develop)
 	(cd ../../../detectron2  && python setup.py clean && python setup.py develop)
 	(cd ../../../torchbenchmark && python install.py --continue_on_fail)
 	(cd ../../../triton/python && python setup.py clean && python setup.py develop)
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index d1fd51f5d0448..dcbdfa6778bbd 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -28,7 +28,7 @@
 from torch._dynamo.testing import dummy_fx_compile, format_speedup, same
 from torch._dynamo.utils import clone_inputs
 from torch._inductor import config as inductor_config
-from torch._inductor.utils import fresh_triton_cache
+from torch._inductor.utils import fresh_inductor_cache
 from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.utils._pytree import tree_map
 
@@ -765,7 +765,7 @@ def inner(*args, **kwargs):
         cache_minder = NullContext()
         if is_cold_start:
             cache_entries = {}
-            cache_minder = fresh_triton_cache(cache_entries)
+            cache_minder = fresh_inductor_cache(cache_entries)
 
         try:
             with cache_minder:
diff --git a/benchmarks/dynamo/huggingface.py b/benchmarks/dynamo/huggingface.py
index b563c229529d3..c7ecd5f222ec5 100755
--- a/benchmarks/dynamo/huggingface.py
+++ b/benchmarks/dynamo/huggingface.py
@@ -68,9 +68,6 @@ def pip_install(package):
         exec(f"from transformers import {cls}")
 
 
-USE_HALF_BATCH_SIZE = True
-
-
 # These models contain the models present in huggingface_models_list. It is a
 # combination of models supported by HF Fx parser and some manually supplied
 # models. For these models, we already know the largest batch size that can fit
@@ -107,31 +104,27 @@ def pip_install(package):
 }
 
 # TODO - Fails even after fake tensors
-USE_SMALL_BATCH_SIZE = {
+BATCH_SIZE_DIVISORS = {
     "AlbertForMaskedLM": 2,
-    "AlbertForPreTraining": 4,
     "AlbertForQuestionAnswering": 2,
-    "BartForCausalLM": 2,
-    "BartForConditionalGeneration": 1,
-    "BlenderbotSmallForConditionalGeneration": 32,
-    "DebertaForMaskedLM": 4,
+    "AllenaiLongformerBase": 2,
+    "BartForConditionalGeneration": 2,
+    "BertForMaskedLM": 2,
+    "BlenderbotSmallForCausalLM": 2,
+    "BlenderbotSmallForConditionalGeneration": 2,
+    "ElectraForCausalLM": 2,
+    "ElectraForQuestionAnswering": 2,
+    "GPT2ForSequenceClassification": 2,
+    "LayoutLMForMaskedLM": 2,
+    "LayoutLMForSequenceClassification": 2,
+    "RobertaForCausalLM": 2,
+    "T5ForConditionalGeneration": 2,
+    # Large footprint
+    "BartForCausalLM": 4,
     "DebertaForQuestionAnswering": 4,
-    "DebertaV2ForMaskedLM": 1,
-    "DebertaV2ForQuestionAnswering": 1,
-    "DistilBertForMaskedLM": 16,
-    "ElectraForCausalLM": 1,
-    "GPTNeoForCausalLM": 1,
-    "GPTNeoForSequenceClassification": 1,
-    "M2M100ForConditionalGeneration": 2,
-    "MT5ForConditionalGeneration": 2,
-    "MegatronBertForCausalLM": 2,
-    "OPTForCausalLM": 4,
-    "PegasusForCausalLM": 8,
-    "PegasusForConditionalGeneration": 4,
-    "RobertaForCausalLM": 4,
-    "TrOCRForCausalLM": 8,
-    "XGLMForCausalLM": 1,
     "XLNetLMHeadModel": 4,
+    # Very large footprint
+    "DebertaForMaskedLM": 8,
 }
 
 
@@ -369,13 +362,8 @@ def load_model(
 
         if batch_size is None:
             batch_size = batch_size_default
-            if model_name in USE_SMALL_BATCH_SIZE:
-                batch_size = USE_SMALL_BATCH_SIZE[model_name]
-                log.warning(
-                    f"Running smaller batch size={batch_size} for {model_name}, orig batch_size={batch_size_default}"
-                )
-            elif USE_HALF_BATCH_SIZE and batch_size >= 2:
-                batch_size = int(batch_size / 2)
+            if model_name in BATCH_SIZE_DIVISORS:
+                batch_size = max(int(batch_size / BATCH_SIZE_DIVISORS[model_name]), 1)
                 log.warning(
                     f"Running smaller batch size={batch_size} for {model_name}, orig batch_size={batch_size_default}"
                 )
diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py
index f7ff2559cbb8a..70d06ab318189 100755
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@@ -40,44 +40,30 @@ def pip_install(package):
 
 
 # TODO - Figure out the reason of cold start memory spike
+
 BATCH_SIZE_DIVISORS = {
     "beit_base_patch16_224": 2,
-    "cait_m36_384": 4,
-    "convit_base": 4,
+    "cait_m36_384": 2,
+    "convit_base": 2,
     "convmixer_768_32": 2,
-    "convnext_base": 4,
-    "crossvit_9_240": 2,
+    "convnext_base": 2,
     "cspdarknet53": 2,
     "deit_base_distilled_patch16_224": 2,
-    "dla102": 2,
     "dpn107": 2,
-    "eca_botnext26ts_256": 2,
-    "eca_halonext26ts": 2,
-    "gluon_senet154": 2,
     "gluon_xception65": 2,
-    "gmixer_24_224": 2,
-    "gmlp_s16_224": 2,
-    "hrnet_w18": 64,
-    "jx_nest_base": 4,
-    "mixer_b16_224": 2,
-    "mixnet_l": 2,
-    "mobilevit_s": 4,
-    "nfnet_l0": 2,
+    "mobilevit_s": 2,
     "pit_b_224": 2,
     "pnasnet5large": 2,
     "poolformer_m36": 2,
     "res2net101_26w_4s": 2,
-    "res2net50_14w_8s": 64,
-    "res2next50": 64,
-    "resnest101e": 4,
+    "resnest101e": 2,
     "sebotnet33ts_256": 2,
     "swin_base_patch4_window7_224": 2,
     "swsl_resnext101_32x16d": 2,
-    "tf_mixnet_l": 2,
-    "tnt_s_patch16_224": 2,
-    "twins_pcpvt_base": 4,
+    "twins_pcpvt_base": 2,
     "vit_base_patch16_224": 2,
     "volo_d1_224": 2,
+    "jx_nest_base": 4,
     "xcit_large_24_p8_224": 4,
 }
 
@@ -230,9 +216,11 @@ def load_model(
         )
         input_size = data_config["input_size"]
         recorded_batch_size = TIMM_MODELS[model_name]
-        recorded_batch_size = max(
-            int(recorded_batch_size / BATCH_SIZE_DIVISORS.get(model_name, 1)), 1
-        )
+
+        if model_name in BATCH_SIZE_DIVISORS:
+            recorded_batch_size = max(
+                int(recorded_batch_size / BATCH_SIZE_DIVISORS[model_name]), 1
+            )
         batch_size = batch_size or recorded_batch_size
 
         # example_inputs = torch.randn(
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 1c97c26a7870e..7f1e8bc61814b 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -50,8 +50,11 @@ def _compile_end():
 logging.getLogger("filelock").setLevel(logging.DEBUG if config.debug else logging.INFO)
 
 
+@functools.lru_cache(None)
 def cache_dir():
-    return f"/tmp/torchinductor_{getpass.getuser()}"
+    return os.environ.get(
+        "TORCHINDUCTOR_CACHE_DIR", f"/tmp/torchinductor_{getpass.getuser()}"
+    )
 
 
 def get_lock_dir():
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index 59ee762c7500a..b6f1c5cbabe3a 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -13,7 +13,7 @@
 from .. import config
 from ..ir import ReductionHint
 from ..triton_ops.mm_perf_model import estimate_matmul_time
-from ..utils import conditional_product, has_triton
+from ..utils import conditional_product, dynamo_utils, has_triton
 from .conv_perf_model import (
     early_config_prune as conv_early_config_prune,
     estimate_conv_time,
@@ -136,6 +136,7 @@ def kernel_call():
 
         return do_bench(kernel_call)
 
+    @dynamo_utils.dynamo_timed
     def autotune_to_one_config(self, *args, **kwargs):
         """Do the actual autotuning"""
         from ..compile_fx import clone_preserve_strides
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index e970f6acbe5d8..8f08c01a1dd61 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -242,23 +242,27 @@ def has_incompatible_cudagraph_ops(gm):
 
 
 @contextlib.contextmanager
-def fresh_triton_cache(cache_entries=None):
+def fresh_inductor_cache(cache_entries=None):
     """
-    Contextmanager that provides a clean tmp cachedir for triton.
+    Contextmanager that provides a clean tmp cachedir for inductor.
 
     Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
     generated with this cache instance.
     """
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        with mock.patch.dict(os.environ, {"TRITON_CACHE_DIR": tmpdirname}):
-            yield
-            if isinstance(cache_entries, dict):
-                assert len(cache_entries) == 0, "expected empty cache_entries dict"
-                files = os.listdir(tmpdirname)
-                cache_entries.update(
-                    {
-                        f: os.path.getsize(os.path.join(tmpdirname, f))
-                        for f in files
-                        if ".lock" not in f
-                    }
-                )
+    with tempfile.TemporaryDirectory() as inductor_cache_dir:
+        with mock.patch.dict(
+            os.environ, {"TORCHINDUCTOR_CACHE_DIR": inductor_cache_dir}
+        ):
+            triton_cache_dir = os.path.join(inductor_cache_dir, "triton")
+            with mock.patch.dict(os.environ, {"TRITON_CACHE_DIR": triton_cache_dir}):
+                yield
+                if isinstance(cache_entries, dict):
+                    assert len(cache_entries) == 0, "expected empty cache_entries dict"
+                    files = os.listdir(triton_cache_dir)
+                    cache_entries.update(
+                        {
+                            f: os.path.getsize(os.path.join(triton_cache_dir, f))
+                            for f in files
+                            if ".lock" not in f
+                        }
+                    )

From 6c1c9502245cb81e056bfffb7f9b74a9e68f7491 Mon Sep 17 00:00:00 2001
From: Mengchi Zhang <mengchi@meta.com>
Date: Sun, 30 Oct 2022 18:22:17 +0000
Subject: [PATCH 0323/1922] Even "nvcc not found" should be commented out
 (#87959)

Summary: Even "nvcc not found" should be commented out in minifier_launcher.py, cause there could be a case that PyTorch/minifier can find cuda path but nvcc is not explicitly included in env variable like PATH.

Differential Revision: D40790023

cc @jansel @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87959
Approved by: https://github.com/anijain2305, https://github.com/jianyuh
---
 torch/_dynamo/debug_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index a89f71eac4ef6..f2774e9bb14db 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -128,7 +128,7 @@ def _cuda_system_info_comment():
         )
         model_str += f"{cuda_version_out}\n"
     except FileNotFoundError:
-        model_str += "nvcc not found\n"
+        model_str += "# nvcc not found\n"
 
     gpu_names = subprocess.run(
         ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv"],

From f4411f471f29a5dd4754d88d361af50ed4c82c0b Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Mon, 31 Oct 2022 02:30:29 +0000
Subject: [PATCH 0324/1922] [dynamo][dashboard] minor fixes for a clean
 Dashboard (#88056)

* better check for cold start latency
* sort on inductor column for better readability.

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88056
Approved by: https://github.com/ngimel
---
 benchmarks/dynamo/runner.py | 10 +++++++++-
 torch/_inductor/utils.py    | 17 +++++++++--------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index f5ec96e4f500b..5406f04fba035 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -258,7 +258,10 @@ def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
                         filters = DEFAULTS["quick"][suite]
                         cmd = f"{cmd} {filters}"
 
-                    if testing == "performance" and compiler == "inductor":
+                    if testing == "performance" and compiler in (
+                        "inductor",
+                        "inductor_no_cudagraphs",
+                    ):
                         cmd = f"{cmd} --cold_start_latency"
                     lines.append(cmd)
                 lines.append("")
@@ -448,6 +451,8 @@ def extract_df(self, metric, testing):
             df_copy = df_copy.sort_values(
                 by=list(reversed(self.compilers)), ascending=False
             )
+            if "inductor" in self.compilers:
+                df_copy = df_copy.sort_values(by="inductor", ascending=False)
             self.untouched_parsed_frames[suite][metric] = df_copy
 
             if testing == "performance":
@@ -468,6 +473,9 @@ def extract_df(self, metric, testing):
                     perf_rows.append(perf_row)
                 df = pd.concat(perf_rows)
             df = df.sort_values(by=list(reversed(self.compilers)), ascending=False)
+
+            if "inductor" in self.compilers:
+                df = df.sort_values(by="inductor", ascending=False)
             self.parsed_frames[suite][metric] = df
 
     def get_passing_entries(self, compiler, df):
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index 8f08c01a1dd61..60bba3dcf7dc3 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -258,11 +258,12 @@ def fresh_inductor_cache(cache_entries=None):
                 yield
                 if isinstance(cache_entries, dict):
                     assert len(cache_entries) == 0, "expected empty cache_entries dict"
-                    files = os.listdir(triton_cache_dir)
-                    cache_entries.update(
-                        {
-                            f: os.path.getsize(os.path.join(triton_cache_dir, f))
-                            for f in files
-                            if ".lock" not in f
-                        }
-                    )
+                    if os.path.exists(triton_cache_dir):
+                        files = os.listdir(triton_cache_dir)
+                        cache_entries.update(
+                            {
+                                f: os.path.getsize(os.path.join(triton_cache_dir, f))
+                                for f in files
+                                if ".lock" not in f
+                            }
+                        )

From 7b24682856ee1da4918487c11f26f04d41045f20 Mon Sep 17 00:00:00 2001
From: HAOCHENYE <21724054@zju.edu.cn>
Date: Mon, 31 Oct 2022 03:00:30 +0000
Subject: [PATCH 0325/1922] [ONNX] Fix get wrong summary of the docstring in
 `torch.onnx._deprecation.deprecated` (#87194)

The summary of the deprecated function could be multi-line. Therefore the code below:
https://github.com/pytorch/pytorch/blob/9ac2a06acf75538a35751f785d5f509d6127d6cd/torch/onnx/_deprecation.py#L45
should be adjusted to

```python
summary_and_body = docstring.split("\n\n", 1)
```
Otherwise, the multi-line summary will be separated wrongly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87194
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 torch/onnx/_deprecation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/onnx/_deprecation.py b/torch/onnx/_deprecation.py
index 0f482f16e2421..0fd2cd764fc95 100644
--- a/torch/onnx/_deprecation.py
+++ b/torch/onnx/_deprecation.py
@@ -42,7 +42,7 @@ def wrapper(*args, **kwargs):
         )
 
         # Split docstring at first occurrence of newline
-        summary_and_body = docstring.split("\n", 1)
+        summary_and_body = docstring.split("\n\n", 1)
 
         if len(summary_and_body) > 1:
             summary, body = summary_and_body

From 4c310a716d7c668654a0fb7153ffd1b192e8c519 Mon Sep 17 00:00:00 2001
From: Fuzzkatt <zonghan2000@gmail.com>
Date: Mon, 31 Oct 2022 03:56:55 +0000
Subject: [PATCH 0326/1922] Add sequence number support for UCC (#85047)

Add sequence number support for UCC, mostly following format of ProcressGroupNCCL.
Pass new test: `test_all_gather_object_subgroup`
Add skips for gather tests: `test_gather_object` and `test_gather_object_subgroup`

cc @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @H-Huang @kwen2501 @awgu
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85047
Approved by: https://github.com/kwen2501
---
 .../csrc/distributed/c10d/ProcessGroupUCC.cpp  | 12 +++++++++++-
 .../csrc/distributed/c10d/ProcessGroupUCC.hpp  | 18 +++++++++++++++++-
 torch/testing/_internal/common_distributed.py  |  2 +-
 .../_internal/distributed/distributed_test.py  |  6 ++++++
 4 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
index 6333cbf369c1b..5f286b7a716c5 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
@@ -172,6 +172,9 @@ void read_config() {
   // barrier is always blocking
   torch_ucc_config.blocking_wait[(std::uint8_t)OpType::BARRIER] = true;
 
+  // barrier is always blocking
+  torch_ucc_config.blocking_wait[(std::uint8_t)OpType::BARRIER] = true;
+
   torch_ucc_config.use_future =
       std::stoi(torch_ucc_envs_map.at("TORCH_UCC_USE_FUTURE"));
   torch_ucc_config.shared_comm =
@@ -760,9 +763,10 @@ c10::intrusive_ptr<Work> ProcessGroupUCC::collective_post(
     std::vector<at::Tensor>& inputTensors,
     std::vector<at::Tensor>& outputTensors,
     const char* prof_title) {
+  seq_++;
   set_timeout(coll);
   auto work = c10::make_intrusive<ProcessGroupUCC::WorkUCC>(
-      opType, prof_title, inputTensors, logger);
+      opType, seq_, prof_title, inputTensors, logger);
 
   if (opType == OpType::RECV) {
     work->sourceRank_ = coll.root;
@@ -1571,6 +1575,12 @@ c10::intrusive_ptr<Work> ProcessGroupUCC::recv(
       "ucc:recv");
 }
 
+void ProcessGroupUCC::setSequenceNumberForGroup() {}
+
+uint64_t ProcessGroupUCC::getSequenceNumberForGroup() {
+  return seq_;
+}
+
 c10::intrusive_ptr<ProcessGroup> ProcessGroupUCC::createProcessGroupUCC(
     const c10::intrusive_ptr<::c10d::Store>& store,
     int rank,
diff --git a/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp b/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp
index 243cf301290e8..03d5d234873da 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupUCC.hpp
@@ -117,10 +117,11 @@ class TORCH_API ProcessGroupUCC : public ProcessGroup {
    public:
     WorkUCC(
         OpType opType,
+        uint64_t seq,
         const char* prof_title,
         const c10::optional<std::vector<at::Tensor>>& inputs,
         const c10::intrusive_ptr<ProcessGroupUCCLogger>& logger)
-        : Work(-1, opType, prof_title, inputs), logger_(logger) {}
+        : Work(-1, opType, prof_title, inputs), logger_(logger), seq_(seq) {}
     ~WorkUCC();
     void setException();
     void setAndThrowException();
@@ -135,9 +136,11 @@ class TORCH_API ProcessGroupUCC : public ProcessGroup {
     event_pool_t* ep = nullptr;
 #endif
     int sourceRank_;
+
    protected:
     std::shared_ptr<ProgressEntry> entry_;
     c10::intrusive_ptr<ProcessGroupUCCLogger> logger_;
+    uint64_t seq_;
 
    private:
     // The future returned by getFuture.
@@ -251,6 +254,18 @@ class TORCH_API ProcessGroupUCC : public ProcessGroup {
       int srcRank,
       int tag) override;
 
+  // Counting for the sequential number of UCC collective_post call.
+  uint64_t seq_{0};
+
+  // Agrees on an initial sequence number for the whole group by having rank 0
+  // create it and broadcast it to other ranks using the store.
+  void setSequenceNumberForGroup() override;
+
+  // Retrieves the current sequence number for the whole group, which should be
+  // in sync. If the returned number is not consistent across the group, it
+  // may indicate that there is some sort of collective desynchronization.
+  uint64_t getSequenceNumberForGroup() override;
+
   static c10::intrusive_ptr<ProcessGroup> createProcessGroupUCC(
       const c10::intrusive_ptr<::c10d::Store>& store,
       int rank,
@@ -264,6 +279,7 @@ class TORCH_API ProcessGroupUCC : public ProcessGroup {
   uint32_t comm_id;
   ucc_team_h team{nullptr};
   ucc_ee_h cuda_ee{nullptr};
+
 #ifdef USE_CUDA
   std::unique_ptr<at::cuda::CUDAStream> stream = nullptr;
   event_pool_t ep;
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index adb71109c18b5..b24c90ef3f862 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -80,7 +80,7 @@ class DistTestCases:
 
     # Sets showing that something is implemented
     backend_feature = {}
-    backend_feature["gpu"] = {"nccl", "gloo"}  # TODO(ucc): add sequence number support to ucc and enable it here
+    backend_feature["gpu"] = {"nccl", "gloo", "ucc"}
     backend_feature["cuda"] = {"nccl", "gloo", "ucc"}
     backend_feature["ddp"] = {"nccl", "gloo", "ucc"}
     backend_feature["subgroup"] = {"nccl", "gloo", "ucc"}
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index ad89c7f318263..2242374858f0a 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -6159,11 +6159,13 @@ class Bar:
                     group=pg
                 )
 
+        @sandcastle_skip_if(BACKEND == "ucc", "CPU tensor ops not supported by UCP TL")
         @require_backend(DistTestCases.backend_feature["gpu"])
         @with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
         def test_gather_object(self):
             return self._test_gather_object()
 
+        @sandcastle_skip_if(BACKEND == "ucc", "CPU tensor ops not supported by UCP TL")
         @require_backend(DistTestCases.backend_feature["gpu"])
         @with_dist_debug_levels(levels=["DETAIL", "OFF", "INFO"])
         def test_gather_object_subgroup(self):
@@ -7694,12 +7696,14 @@ def _test_verify_model_across_rank(self, use_logger):
 
         @require_backend(DistTestCases.backend_feature["gpu"])
         @require_backends_available(DistTestCases.backend_feature["gpu"])
+        @sandcastle_skip_if(BACKEND == "ucc", "test timing out locally with ucc")
         @skip_if_lt_x_gpu(2)
         def test_verify_model_across_rank_with_logger(self):
             self._test_verify_model_across_rank(use_logger=True)
 
         @require_backend(DistTestCases.backend_feature["gpu"])
         @require_backends_available(DistTestCases.backend_feature["gpu"])
+        @sandcastle_skip_if(BACKEND == "ucc", "test timing out locally with ucc")
         @skip_if_lt_x_gpu(2)
         def test_verify_model_across_rank_without_logger(self):
             self._test_verify_model_across_rank(use_logger=False)
@@ -7723,6 +7727,7 @@ def _run_test_ddp_model_with_diff_params(self, ctx, net, ddp_group, group_gloo):
 
         @require_backend(DistTestCases.backend_feature["gpu"])
         @require_backends_available(DistTestCases.backend_feature["gpu"])
+        @sandcastle_skip_if(BACKEND == "ucc", "test failing locally with UCC")
         @skip_if_lt_x_gpu(2)
         def test_ddp_model_diff_shape_across_ranks(self):
             group_gloo = dist.new_group(
@@ -7745,6 +7750,7 @@ def test_ddp_model_diff_shape_across_ranks(self):
 
         @require_backend(DistTestCases.backend_feature["gpu"])
         @require_backends_available(DistTestCases.backend_feature["gpu"])
+        @sandcastle_skip_if(BACKEND == "ucc", "test failing locally with UCC")
         @skip_if_lt_x_gpu(2)
         def test_ddp_model_diff_num_params_across_ranks(self):
             group_gloo = dist.new_group(

From f01a5cf514529b37c265e3c600c83179e985bbb0 Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Mon, 31 Oct 2022 04:06:31 +0000
Subject: [PATCH 0327/1922] Use scaled_dot_product_attention within
 attention.cpp (#87312)

# Summary
Use the private _scaled_dot_product_attention to support _native_multiheaded_attention. _SDP provides access to fused kernels when certain conditions are meant enabling a speed up for MHA.

cc @cpuhrsch @jbschlosser @bhosmer @mikaylagawarecki
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87312
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml    |   3 +-
 .../cuda/NestedTensorTransformerFunctions.cpp |  14 +-
 .../ATen/native/transformers/attention.cpp    |  60 ++++--
 aten/src/ATen/native/transformers/attention.h |  33 +++
 .../native/transformers/cuda/attention.cu     | 202 +++++++++++++++++-
 .../ATen/native/transformers/cuda/sdp_utils.h |  29 ++-
 .../ATen/native/transformers/transformer.cpp  |  47 ++--
 .../better_transformer_vs_mha_functional.py   | 195 +++++++++++++++++
 test/test_native_mha.py                       |  84 +++++---
 test/test_transformers.py                     |  87 +++++++-
 10 files changed, 650 insertions(+), 104 deletions(-)
 create mode 100644 aten/src/ATen/native/transformers/attention.h
 create mode 100644 benchmarks/transformer/better_transformer_vs_mha_functional.py

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index c113a44d9db95..300a14dd6baf6 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -13279,7 +13279,8 @@
 - func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)
   variants: function
   dispatch:
-    CPU, CUDA, NestedTensorCPU, NestedTensorCUDA: native_multi_head_attention
+    CPU, NestedTensorCPU: native_multi_head_attention_cpu
+    CUDA, NestedTensorCUDA: native_multi_head_attention_cuda
   autogen: _native_multi_head_attention.out
 
 - func: _scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index 307fc20721d60..33c180a929f18 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -322,14 +322,15 @@ bool is_safe_to_get_storage_as_tensor(const NestedTensorImpl* tensor) {
   const int64_t* tensor_size_ptr = tensor_sizes.data_ptr<int64_t>();
   const int64_t* tensor_stride_ptr = tensor_strides.data_ptr<int64_t>();
 
-  int64_t offset_constant = (tensor_offsets[1] - tensor_offsets[0]) /
-      tensor_size_ptr[0] * tensor_stride_ptr[0];
+  int64_t numel_0 = (tensor_size_ptr[0] * tensor_stride_ptr[0]);
+  TORCH_INTERNAL_ASSERT(numel_0 > 0, "numels must be positive!");
 
+  int64_t offset_constant = (tensor_offsets[1] - tensor_offsets[0]) / numel_0;
   for (int64_t i = 2; i < n_tensors; i++) {
-    int64_t current_offset_constant =
-        (tensor_offsets[i] - tensor_offsets[i - 1]) /
-        tensor_size_ptr[(i - 1) * tensor_stride_0] *
-        tensor_stride_ptr[(i - 1) * tensor_stride_0];
+    // TODO: When 0 seq_len nested tensors are allowed we need to guard against this
+    int64_t previous_numel = tensor_size_ptr[(i - 1) * tensor_stride_0] * tensor_stride_ptr[(i - 1) * tensor_stride_0];
+    TORCH_INTERNAL_ASSERT(previous_numel > 0, "numels must be positive!");
+    int64_t current_offset_constant = (tensor_offsets[i] - tensor_offsets[i - 1]) / previous_numel;
     if (current_offset_constant != offset_constant) {
       return false;
     }
@@ -431,7 +432,6 @@ std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
       {Nnz_kv, num_heads, head_dim},
       {nnz_v_stride, head_v_stride, head_dim_stride},
       value_impl->get_storage_offsets()[0]);
-
   std::tuple<Tensor, Tensor> attention_and_weights =
       at::_efficient_attention_forward(
           query_buffer_reshaped.unsqueeze(0),
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 5b3d5f999cfb1..5af9eaebb6c3a 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -6,6 +6,8 @@
 #include <ATen/Parallel.h>
 #include <ATen/TensorIndexing.h>
 #include <ATen/cpu/vec/vec256/vec256.h>
+#include <ATen/native/transformers/attention.h>
+
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/NativeFunctions.h>
@@ -14,7 +16,6 @@
 #endif
 
 #include <ATen/native/nested/NestedTensorTransformerFunctions.h>
-
 namespace at {
 
 namespace native {
@@ -106,6 +107,17 @@ void transform_bias_rescale_qkv_inner_loop(
   }
 }
 
+Tensor transform_0213(const Tensor& a) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1));
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3));
+  return a.permute({0, 2, 1, 3})
+      .contiguous()
+      .view({a.size(0), a.size(2), a.size(1) * a.size(3)});
+}
+
+} // namespace
+
+
 Tensor bmm_nt(const Tensor& a, const Tensor& b) {
   auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)});
   auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)});
@@ -118,7 +130,7 @@ Tensor masked_softmax(
     Tensor& attn_scores,
     c10::optional<Tensor> attn_mask,
     const Tensor& query,
-    c10::optional<int64_t> mask_type = NULL) {
+    c10::optional<int64_t> mask_type) {
   if (query.is_nested() && !attn_mask) {
     return at::_nested_tensor_softmax_with_shape(attn_scores, query);
   }
@@ -156,13 +168,6 @@ Tensor bmm_nn(Tensor& out, const Tensor& a, const Tensor& b) {
   return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)});
 }
 
-Tensor transform_0213(const Tensor& a) {
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1));
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3));
-  return a.permute({0, 2, 1, 3})
-      .contiguous()
-      .view({a.size(0), a.size(2), a.size(1) * a.size(3)});
-}
 
 Tensor transform0213_gemm_nt_bias(
     const Tensor& a,
@@ -254,8 +259,6 @@ Tensor qkv_projection(
   return qkv;
 }
 
-} // namespace
-
 // compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias
 std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv_cpu(
     const Tensor& qkv,
@@ -312,7 +315,7 @@ std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv_cpu(
   return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]);
 }
 
-std::tuple<Tensor, Tensor> native_multi_head_attention(
+std::tuple<Tensor, Tensor> native_multi_head_attention_cpu(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
@@ -692,18 +695,39 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention(
 }
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_math(
-        const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
-        return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-        }
+    const Tensor& query_,
+    const Tensor& key,
+    const Tensor& value,
+    const c10::optional<Tensor>& attn_mask_,
+    double dropout_p,
+    bool need_attn_weights,
+    bool is_causal) {
+  return at::_scaled_dot_product_attention_math(
+      query_,
+      key,
+      value,
+      attn_mask_,
+      dropout_p,
+      need_attn_weights,
+      is_causal);
+}
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
         const Tensor& query_, const Tensor& key, const Tensor& value,
         const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
+  if (query_.is_nested() || key.is_nested() || value.is_nested()) {
+    TORCH_CHECK(
+        query_.is_contiguous() && key.is_contiguous() &&
+            value.is_contiguous(),
+        "scaled_dot_product_attention: If inputs are nested tensors they must be contiguous");
+  }
     auto attn_mask = attn_mask_;
     // Naive, composite implementation defined here.
     const auto embed_size = query_.size(-1);
-    const auto query = query_ / ::sqrt(static_cast<double>(embed_size));
+
+    // Scale q,k before matmul for stability see https://tinyurl.com/sudb9s96 for math
+    const double scaling_factor = ::sqrt(::sqrt(static_cast<double>(embed_size)));
+    const auto query = query_ / scaling_factor;
     if (is_causal) {
         TORCH_CHECK(!attn_mask.has_value(),
                 "_scaled_dot_product_attention: Explicit attn_mask should not be set when is_causal=True");
@@ -726,7 +750,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
         }
         // Otherwise, attn_mask represents an additive attention tensor
     }
-    auto attn = at::matmul(query, key.transpose(-2, -1));
+    auto attn = at::matmul(query, key.transpose(-2, -1)/scaling_factor);
     if (attn_mask.has_value()) {
         attn.add_(*attn_mask);
     }
diff --git a/aten/src/ATen/native/transformers/attention.h b/aten/src/ATen/native/transformers/attention.h
new file mode 100644
index 0000000000000..783b22869137e
--- /dev/null
+++ b/aten/src/ATen/native/transformers/attention.h
@@ -0,0 +1,33 @@
+#pragma once
+#include <ATen/ATen.h>
+#include <c10/macros/Export.h>
+
+namespace at {
+namespace native {
+
+TORCH_API Tensor bmm_nt(const Tensor& a, const Tensor& b);
+TORCH_API Tensor masked_softmax(
+    Tensor& attn_scores,
+    c10::optional<Tensor> attn_mask,
+    const Tensor& query,
+    c10::optional<int64_t> mask_type = NULL);
+
+TORCH_API Tensor transform0213_gemm_nt_bias(
+    const Tensor& a,
+    const Tensor& b,
+    const Tensor& c,
+    const Tensor& query);
+
+TORCH_API Tensor bmm_nn(Tensor& out, const Tensor& a, const Tensor& b);
+
+TORCH_API void debug_assert_shape(int line, const Tensor& t, c10::IntArrayRef shape);
+
+TORCH_API Tensor qkv_projection(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    const int64_t embed_dim,
+    const Tensor& qkv_weight);
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index acccb8821d833..f65fedd6d7954 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -17,6 +17,7 @@
 
 #include <c10/cuda/CUDAMathCompat.h>
 
+#include <ATen/native/transformers/attention.h>
 #include <ATen/native/nested/NestedTensorUtils.h>
 #include <ATen/native/nested/NestedTensorTransformerFunctions.h>
 #include <ATen/native/nested/NestedTensorUtils.h>
@@ -479,6 +480,204 @@ __host__ std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv_cuda(
   return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]);
 }
 
+std::tuple<Tensor, Tensor> native_multi_head_attention_cuda(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    const int64_t embed_dim,
+    const int64_t num_head,
+    const Tensor& qkv_weight,
+    const Tensor& qkv_bias,
+    const Tensor& proj_weight,
+    const Tensor& proj_bias,
+    const c10::optional<Tensor>& mask,
+    bool need_weights,
+    bool average_attn_weights,
+    const c10::optional<int64_t> mask_type) {
+  // query shape: [B, T, D]
+  // qkv_weight shape: [3 * D, D]
+
+  TORCH_CHECK(
+      !mask || !query.is_nested(),
+      "NestedTensor with mask is not supported yet");
+  const auto D = embed_dim;
+  TORCH_CHECK(
+      query.dim() == 3,
+      "expected 3-D `query`, got ",
+      query.dim(),
+      "-D tensor");
+  TORCH_CHECK(
+      query.is_nested() || query.sizes()[2] == embed_dim,
+      "passed-in embed_dim ",
+      embed_dim,
+      " didn't match last dim of query ",
+      query.sizes()[2]);
+  TORCH_CHECK(
+      key.dim() == 3,
+      "expected 3-D `key`, got ",
+      key.dim(),
+      "-D tensor");
+  TORCH_CHECK(
+      value.dim() == 3,
+      "expected 3-D `value`, got ",
+      value.dim(),
+      "-D tensor");
+  TORCH_CHECK(
+      query.is_nested() || key.is_nested() || value.is_nested() ||
+          (query.sizes() == key.sizes() && key.sizes() == value.sizes()),
+      "expected `query`/`key`/`value` shapes to match");
+  TORCH_CHECK(
+      qkv_weight.dim() == 2,
+      "expected 2-D `qkv_weight`, got ",
+      qkv_weight.dim(),
+      "-D tensor");
+  TORCH_CHECK(
+      D * 3 == qkv_weight.sizes()[0],
+      "expected `qkv_weight` first dim to be 3x embed_dim");
+  TORCH_CHECK(
+      D == qkv_weight.sizes()[1],
+      "expected `qkv_weight` second dim to be embed_Dim");
+  TORCH_CHECK(
+      qkv_bias.dim() == 1,
+      "expected 2-D `qkv_bias`, got ",
+      qkv_bias.dim(),
+      "-D tensor");
+  TORCH_CHECK(
+      qkv_bias.sizes()[0] == 3 * D,
+      "expected `qkv_bias` first dim and first dim of query to be equal");
+  TORCH_CHECK(D % num_head == 0, "`embed_dim` must divide evenly by `num_heads`");
+
+#ifndef NDEBUG
+  const auto B = query.is_nested()
+      ? get_nested_tensor_impl(query)->get_nested_size_tensor().size(0)
+      : query.sizes()[0];
+  auto T = query.is_nested() ? 0 : query.sizes()[1];
+
+#endif
+  const auto dim_per_head = D / num_head;
+  if ((query.is_same(key) && key.is_same(value)) && dim_per_head % 8 == 0 ) {
+
+    // We have not done linear projection yet but the input for SDP
+    // Is expected to be 4 dimensional. We "cheaply" create view tensors
+    // That will then be used for checking hot path conditions with select_sd_backend
+    auto q = query.view({query.size(0), -1, num_head, dim_per_head}).transpose(1, 2);
+    auto k = key.view({key.size(0), -1, num_head, dim_per_head}).transpose(1, 2);
+    auto v = value.view({value.size(0), -1, num_head, dim_per_head}).transpose(1, 2);
+
+    sdp::sdp_params kernel_params{q, k, v, mask.has_value(), 0.0, need_weights, false};
+    auto backend = select_sdp_backend(kernel_params);
+    if (backend == sdp::SDPBackend::flash_attention || backend == sdp::SDPBackend::efficient_attention) {
+      auto x = at::linear(query, qkv_weight, qkv_bias);
+      auto chunks = x.chunk(3, -1);
+      auto x_size_0 = x.size(0);
+
+      chunks[0] = (chunks[0].view({x_size_0, -1, num_head, dim_per_head}))
+                      .transpose(1, 2);
+      chunks[1] = (chunks[1].view({x_size_0, -1, num_head, dim_per_head}))
+                      .transpose(1, 2);
+      chunks[2] = (chunks[2].view({x_size_0, -1, num_head, dim_per_head}))
+                      .transpose(1, 2);
+
+      auto y = at::_scaled_dot_product_attention(
+          chunks[0], chunks[1], chunks[2], mask, 0.0, need_weights, false);
+      auto past_sdp =
+          std::get<0>(y).transpose(1, 2).reshape({x_size_0, -1, embed_dim});
+      return std::make_tuple(
+          at::linear(past_sdp, proj_weight, proj_bias), Tensor());
+    }
+    // Returned math or error lets not use it
+  }
+
+  // shape: [B, T, 3 x D]
+  auto qkv = qkv_projection(query, key, value, embed_dim, qkv_weight);
+
+  if (!qkv.is_nested() && qkv.numel() == 0) {
+    if (query.is_nested()) {
+      return std::make_tuple(Tensor(), Tensor());
+    }
+    return std::make_tuple(at::empty_like(query), Tensor());
+  }
+
+#ifndef NDEBUG
+  if (!query.is_nested() || !qkv.is_nested()) {
+    if (query.is_nested()) {
+      T = qkv.size(1);
+    }
+    debug_assert_shape(__LINE__, qkv, {B, T, 3 * D});
+  }
+#endif
+
+#ifdef DEBUG_PRINT_EACH_STEP
+  if (!qkv.is_nested()) {
+    std::cerr << "qkv: " << qkv << std::endl;
+  }
+#endif
+  // shape: 3 x [B, num_head, T, dim_per_head]
+  auto q_k_v = _transform_bias_rescale_qkv(qkv, qkv_bias, num_head);
+  qkv = Tensor(); // Not used any more, allow free
+  auto& q = std::get<0>(q_k_v);
+  const auto& k = std::get<1>(q_k_v);
+  const auto& v = std::get<2>(q_k_v);
+#ifndef NDEBUG
+  debug_assert_shape(__LINE__, q, {B, num_head, T, dim_per_head});
+  debug_assert_shape(__LINE__, k, {B, num_head, T, dim_per_head});
+  debug_assert_shape(__LINE__, v, {B, num_head, T, dim_per_head});
+#endif
+#ifdef DEBUG_PRINT_EACH_STEP
+  std::cerr << "q: " << q << std::endl;
+  std::cerr << "k: " << k << std::endl;
+  std::cerr << "v: " << v << std::endl;
+#endif
+
+  // shape: [B, num_head, T, T]
+  auto qkt = bmm_nt(q, k);
+  // q & k are dead but cannot be freed because they were packed with v
+#ifndef NDEBUG
+  debug_assert_shape(__LINE__, qkt, {B, num_head, T, T});
+#endif
+#ifdef DEBUG_PRINT_EACH_STEP
+  std::cerr << "qkt: " << qkt << std::endl;
+#endif
+
+  // shape: [B, num_head, T, T]
+  // TODO: long-term, have a kernel that works with
+  // NestedTensor directly if there is no mask passed
+  qkt = masked_softmax(qkt, mask, query, mask_type);
+#ifdef DEBUG_PRINT_EACH_STEP
+  std::cerr << "qkt after softmax: " << qkt << std::endl;
+#endif
+
+  // shape: [B, num_head, T, dim_per_head]
+  // reuse storage for q; we're done with it
+  auto attn_ctx = bmm_nn(q, qkt, v);
+  // qkv is not dead; we just reused storage for q!
+  if (!need_weights) {
+    qkt = Tensor();
+  }
+#ifndef NDEBUG
+  debug_assert_shape(__LINE__, attn_ctx, {B, num_head, T, dim_per_head});
+#endif
+#ifdef DEBUG_PRINT_EACH_STEP
+  std::cerr << "attn_ctx: " << attn_ctx << std::endl;
+#endif
+
+  // shape: [B, T, D]
+  // Fuse transform_0213 inside
+  auto proj = transform0213_gemm_nt_bias(
+      attn_ctx, proj_weight, proj_bias, query);
+#ifndef NDEBUG
+  debug_assert_shape(__LINE__, proj, {B, T, D});
+#endif
+  if (need_weights && average_attn_weights) {
+    // weights are not needed for full transformer, so don't worry too
+    // much about performance -- we implement this just to make use
+    // cases that don't disable need_weights still get some speedup.
+    qkt = qkt.sum(1);
+    qkt /= num_head;
+  }
+  return std::make_tuple(std::move(proj), std::move(qkt));
+}
+
 std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
     const Tensor& query,
     const Tensor& key,
@@ -636,7 +835,6 @@ std::tuple<at::Tensor, at::Tensor> _efficient_attention_forward(
 // TODO In theory it is possible to compile with _CUDA_ARCH < 5.0 and run on a
 // machine that is >= 5.0. In practice, this is not a problem but since
 // this would avoid runtime architecture checks, we should look into it
-
   TORCH_CHECK(query.dim() == 4);
   TORCH_CHECK(key.dim() == 4);
   TORCH_CHECK(value.dim() == 4);
@@ -768,7 +966,7 @@ std::tuple<at::Tensor, at::Tensor> _efficient_attention_forward(
           kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
     }
     Kernel::check_supported(p);
-    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
+    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
   };
   // Dispatch to the right kernel
   DISPATCH_KERNEL(query, key, value, ([&]() {
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index fb651e3d5aff3..e0f38e10f966c 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -9,6 +9,7 @@
 #include <c10/core/ScalarType.h>
 #include <c10/util/env.h>
 #include <c10/util/irange.h>
+#include <ATen/NestedTensorImpl.h>
 
 #include <functional>
 #include <unordered_set>
@@ -61,6 +62,26 @@ inline bool check_for_attn_weights(sdp_params params, bool debug) {
   }
   return true;
 }
+inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
+  if (!params.query.is_nested()) {
+    return true;
+  }
+  const at::Tensor& sizes = at::native::get_nested_tensor_impl(params.query)->get_nested_size_tensor();
+  auto* sizes_ptr = sizes.data_ptr<int64_t>();
+  const int64_t n_tensors = params.query.size(0);
+  const int64_t size_tensor_stride = sizes.stride(0);
+
+  // This is being called inside sdp with shape [batch, heads, {seq_len}, dim]
+  for (const auto i : c10::irange(n_tensors)) {
+    if (sizes_ptr[(i * size_tensor_stride) + 1] <= 1) {
+      TORCH_CHECK(
+          !debug, "Flash Attention does not support sequence_length < 1");
+      return false;
+    }
+  }
+
+  return true;
+}
 
 inline bool check_for_attn_mask(sdp_params params, bool debug) {
   if (params.has_attn_mask) {
@@ -73,7 +94,7 @@ inline bool check_for_attn_mask(sdp_params params, bool debug) {
 inline bool check_tensor_shapes(sdp_params params, bool debug) {
   auto query_dim = params.query.dim();
   if (!(query_dim == params.key.dim() && query_dim == params.value.dim() &&
-        query_dim == 4)) {
+        (query_dim == 4 ))) {
     TORCH_CHECK(
         !debug,
         "Flash attention requires query, key and value to be 4 dimensional, but got Query dim: ",
@@ -179,7 +200,8 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
       check_for_attn_weights,
       check_for_attn_mask,
       check_head_dim_size,
-      check_gpu_sm75_or_greater};
+      check_gpu_sm75_or_greater,
+      check_for_seq_len_1_nested_tensor};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
@@ -206,7 +228,8 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       check_runtime_disabled_mem_efficient,
       check_for_attn_weights,
       check_tensor_shapes,
-      check_for_attn_mask};
+      check_for_attn_mask,
+      check_for_seq_len_1_nested_tensor};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
diff --git a/aten/src/ATen/native/transformers/transformer.cpp b/aten/src/ATen/native/transformers/transformer.cpp
index afadb0d6bce8b..4a4c9946b35aa 100644
--- a/aten/src/ATen/native/transformers/transformer.cpp
+++ b/aten/src/ATen/native/transformers/transformer.cpp
@@ -95,44 +95,27 @@ Tensor transformer_encoder_layer_forward(
   if (norm_first) {
     x = norm(x, embed_dim, layer_norm_eps, layer_norm_weight_1, layer_norm_bias_1, use_nested_tensor);
   }
+  x = std::get<0>(at::_native_multi_head_attention(
+      x,
+      x,
+      x,
+      embed_dim,
+      num_heads,
+      qkv_weight,
+      qkv_bias,
+      proj_weight,
+      proj_bias,
+      mask,
+      false /* need_weights */,
+      true /* average_attn_weights */,
+      mask_type));
 
-#if BETTER_TRANSFORMER_USE_FLASH_ATTENTION
-  if (x.is_nested() && x.is_cuda() && x.dtype() == at::kHalf && !mask.has_value() &&
-      (embed_dim / num_heads == 16 ||
-       embed_dim / num_heads == 32 ||
-       embed_dim / num_heads == 64 ||
-       embed_dim / num_heads == 128)) {
-     TORCH_WARN_ONCE("transformer_encoder_layer_forward is using flash attention.");
-     x = at::linear(x, qkv_weight, qkv_bias);
-     auto x_size_0 = x.size(0);
-     x = x.view({x_size_0, -1, 3, num_heads, embed_dim / num_heads});
-     x = flash_attention_helper(x, x, x, 0.0, false, false);
-     x = x.view({x_size_0, -1, embed_dim});
-     x = at::linear(x, proj_weight, proj_bias);
-  } else {
-#endif
-     x = std::get<0>(native_multi_head_attention(
-         x,
-         x,
-         x,
-         embed_dim,
-         num_heads,
-         qkv_weight,
-         qkv_bias,
-         proj_weight,
-         proj_bias,
-         mask,
-         false /* need_weights */,
-         true /* average_attn_weights */,
-         mask_type));
-#if BETTER_TRANSFORMER_USE_FLASH_ATTENTION
-  }
-#endif
   x.add_(src);
   if (!norm_first) {
     x = norm(x, embed_dim, layer_norm_eps, layer_norm_weight_1, layer_norm_bias_1, use_nested_tensor);
   }
 
+
   auto pre_ffn_res = x;
 
   if (norm_first) {
diff --git a/benchmarks/transformer/better_transformer_vs_mha_functional.py b/benchmarks/transformer/better_transformer_vs_mha_functional.py
new file mode 100644
index 0000000000000..b76077ba4c22e
--- /dev/null
+++ b/benchmarks/transformer/better_transformer_vs_mha_functional.py
@@ -0,0 +1,195 @@
+"""
+Tests the performance of torch.nn.MultiheadAttention's fast path (BetterTransformer)
+vs the slow path (torch.nn.functional.multi_head_attention)
+
+To run this script install these dependencies:
+
+pip install tqdm
+pip install prettytable
+"""
+
+import torch
+import random
+import numpy as np
+from pprint import pprint
+import itertools
+import json
+import argparse
+from pathlib import Path
+from typing import Optional
+
+from prettytable import PrettyTable
+from collections import defaultdict, OrderedDict
+from tqdm import tqdm
+
+
+import warnings
+
+warnings.filterwarnings("ignore")
+
+error_dict = defaultdict(int)
+
+
+def benchmark_torch_function(iters, f, *args, **kwargs):
+    f(*args, **kwargs)
+    f(*args, **kwargs)
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(iters):
+        f(*args, **kwargs)
+    end_event.record()
+    torch.cuda.synchronize()
+    # elapsed_time has a resolution of 0.5 microseconds:
+    # but returns milliseconds, so we need to multiply it to increase resolution
+    return start_event.elapsed_time(end_event) * 1000 / iters, *f(*args, **kwargs)
+
+
+def run(a: int, b: int, iters: int, batch_size: int, sequence_length: int,
+        embed_dim: int, num_heads: int, device: str, dtype: str, block_size: int, seed):
+    random.seed(seed)
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+
+    from scipy.stats import beta
+    lengths = beta.rvs(a, b, size=batch_size) * (sequence_length + block_size - 1) // block_size
+    lengths = list(map(int, list(lengths)))
+    lengths = [l * block_size for l in lengths]
+    lengths = [max(l, block_size) for l in lengths]
+
+    # Used to enforce no padding
+    # lengths = [sequence_length] * batch_size
+
+    # Ensure one row in the batch of ele has the max_sequence_length
+    lengths[random.randint(0, batch_size - 1)] = sequence_length
+
+    q = [torch.randn(l, embed_dim, device=device, dtype=dtype)
+         for l in lengths]
+    q = torch.nested.nested_tensor(q, device=device, dtype=dtype)
+    k, v = q, q
+
+    qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype)
+    proj = torch.nn.Linear(embed_dim, embed_dim, device=device, dtype=dtype)
+
+    native_mha = torch.nn.MultiheadAttention(
+        embed_dim, num_heads, batch_first=True, device=device, dtype=dtype
+    ).eval()
+    native_mha.in_proj_weight = qkv.weight
+    native_mha.in_proj_bias = qkv.bias
+    native_mha.out_proj.weight = proj.weight
+    native_mha.out_proj.bias = proj.bias
+
+    # Create query mask
+    q_mask = torch.nested.to_padded_tensor(
+        torch.nested.nested_tensor([
+            torch.tensor([True] * length, dtype=torch.bool)
+            for length in lengths
+        ]), 0)
+    q_mask = q_mask.cuda()
+
+    if q_mask.size(1) == 0:
+        return None
+
+    # Benchmark the native MHA in core
+    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True):
+        with torch.inference_mode():
+            time_native_mha_fast, y_native_mha_fast, _ = benchmark_torch_function(
+                iters, native_mha, q, k, v, need_weights=False)
+    q = q.to_padded_tensor(0)
+    k = q
+    v = q
+    # Internal Flash Attention
+    time_native_mha_slow, y_native_mha_slow, _ = benchmark_torch_function(
+        iters, native_mha, q, k, v, key_padding_mask=~q_mask, need_weights=False)
+
+    # Convert to padded for comparison
+    if y_native_mha_fast.is_nested:
+        y_native_mha_fast = torch.nested.to_padded_tensor(y_native_mha_fast, 0)
+    y_native_mha_fast = y_native_mha_fast * q_mask.unsqueeze(-1)
+
+    if y_native_mha_slow.is_nested:
+        y_native_mha_slow = torch.nested.to_padded_tensor(y_native_mha_slow, 0)
+    y_native_mha_slow = y_native_mha_slow * q_mask.unsqueeze(-1)
+
+    # Correctness check
+    entry_name = f"batch:{batch_size}_seq_len:{sequence_length}_n_heads:{num_heads}_embed_dim:{embed_dim}"
+    try:
+        torch.testing.assert_close(y_native_mha_fast, y_native_mha_slow, atol=1e-3, rtol=1e-3)
+    except AssertionError as e:
+        error_dict[entry_name] += 1
+        pprint(error_dict)
+
+    # Calculate amount of padding
+    padding = 1 - q_mask.float().mean().item()
+
+    # Calculate the speedup for flash attention
+    speedup_fast_internal = time_native_mha_slow / time_native_mha_fast
+
+    result_entry = OrderedDict()
+    result_entry['dtype'] = dtype
+    result_entry["batch_size"] = batch_size
+    result_entry["sequence_length"] = sequence_length
+    result_entry["n_heads"] = num_heads
+    result_entry["embed_dim"] = embed_dim
+    result_entry["time_native_mha_slow(μs)"] = f"{time_native_mha_slow:.3f}"
+    result_entry["time_native_mha_fast (μs)"] = f"{time_native_mha_fast:.3f}"
+    result_entry["speedup flash_mha v native_mha"] = f"{speedup_fast_internal:.3f}"
+    result_entry["padding"] = f"{padding:.3f}"
+    return result_entry
+
+
+def main(save_path: Optional[Path], error_path: Optional[Path]):
+    table = PrettyTable()
+    entries = defaultdict(list)
+
+    print("CUDA device: ", torch.cuda.get_device_name(0))
+    iters = 100
+    header = None
+    batch_sizes = [16, 32, 64, 128, 256]
+    sequence_lengths = [64, 128, 256, 512]
+    embed_dims = [512, 1024]
+    num_heads_list = [8, 16]
+    betas = range(1, 64, 4)
+
+    for (batch_size, sequence_length, embed_dim, num_heads, block_size, b) in tqdm(
+            list(itertools.product(batch_sizes, sequence_lengths, embed_dims, num_heads_list, [2], betas))):
+        seed = 26214  # Magic number that works well for higher b values
+        entry = run(1, b * 0.05, iters, batch_size, sequence_length,
+                    embed_dim, num_heads, "cuda", torch.float16, block_size, seed)
+        if entry is None:
+            continue
+        if header is None:
+            table.field_names = list(entry.keys())
+            header = list(entry.keys())
+        row = []
+        for k, v in entry.items():
+            row.append(v)
+            entries[k].append(v)
+        table.add_row(row)
+
+    # Print the full table to console
+    print(table)
+    pprint(error_dict)
+
+    csv_string = table.get_csv_string()
+    if save_path is not None:
+        with open(save_path, 'w') as csvfile:
+            csvfile.write(csv_string)
+
+    print(f"Total errors: {sum(error_dict.values())}")
+    if error_path is not None:
+        with open(error_path, 'w') as file:
+            file.write(json.dumps(error_dict))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--save_path", type=str, help="Path to save the results")
+    parser.add_argument("--error_save_path", type=str, help="Path to save the errors")
+
+    args = parser.parse_args()
+    save_path = Path(args.save_path) if args.save_path else None
+    error_path = Path(args.error_save_path) if args.error_save_path else None
+
+    main(save_path, error_path)
diff --git a/test/test_native_mha.py b/test/test_native_mha.py
index 41f56e8b89296..2af1db7395b61 100644
--- a/test/test_native_mha.py
+++ b/test/test_native_mha.py
@@ -1,5 +1,6 @@
 # Owner(s): ["module: nn"]
 import math
+import copy
 
 import torch
 from torch.testing._internal.common_device_type import (
@@ -9,7 +10,7 @@
     onlyCUDA,
     skipMeta,
 )
-from torch.testing._internal.common_utils import run_tests, TestCase
+from torch.testing._internal.common_utils import parametrize, run_tests, TestCase
 
 class TestMHADeviceType(TestCase):
     @torch.no_grad()
@@ -116,36 +117,40 @@ def _test_multihead_attention_impl(
         bs = 16
         sl = 8
 
-        q = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10
+        q = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3
         if use_padding:
             if pad_all:
                 for q_i in q:
-                    q_i[-1] = torch.zeros_like(q[0][-1], device=device, dtype=dtype)
+                    q_i[-1] = torch.zeros_like(q[0][-1], device=device, dtype=torch.float32)
                 mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool)
                 for mask_i in mask:
                     mask_i[-1] = True
             else:
-                q[0][-1] = torch.zeros_like(q[0][-1], device=device, dtype=dtype)
+                q[0][-1] = torch.zeros_like(q[0][-1], device=device, dtype=torch.float32)
                 mask = torch.zeros(q.shape[:-1], device=device, dtype=torch.bool)
                 mask[0][-1] = True
         if mode == "self":
             k = q
             v = q
         elif mode == "encdec":
-            k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10
+            k = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3
             v = k
         elif mode == "generic":
-            k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10
-            v = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype) * 10
+            k = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3
+            v = 6 * torch.rand(bs, sl, embed_dim, device=device, dtype=torch.float32) - 3
         else:
             self.fail(f"invalid mode `{mode}`!")
 
-        qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype)
-        proj = torch.nn.Linear(embed_dim, embed_dim, device=device, dtype=dtype)
+        qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=torch.float32)
+        native_qkv = copy.deepcopy(qkv).to(dtype=dtype)
+
+        proj = torch.nn.Linear(embed_dim, embed_dim, device=device, dtype=torch.float32)
+        native_proj = copy.deepcopy(proj).to(dtype=dtype)
 
         pt = torch.nn.MultiheadAttention(
-            embed_dim, num_heads, batch_first=True, device=device, dtype=dtype
+            embed_dim, num_heads, batch_first=True, device=device, dtype=torch.float32
         )
+
         pt.in_proj_weight = qkv.weight
         pt.in_proj_bias = qkv.bias
         pt.out_proj.weight = proj.weight
@@ -177,7 +182,7 @@ def forward(self, q, k, v, key_padding_mask):
                 )
 
         npt = NativeMHA(
-            embed_dim=embed_dim, num_heads=num_heads, qkv=qkv, proj=proj
+            embed_dim=embed_dim, num_heads=num_heads, qkv=native_qkv, proj=native_proj
         ).to(dtype)
 
         if device == "cuda":
@@ -209,8 +214,12 @@ def forward(self, q, k, v, key_padding_mask):
                 k = torch.nested.nested_tensor(torch.unbind(k), device=device, dtype=dtype)
                 v = torch.nested.nested_tensor(torch.unbind(v), device=device, dtype=dtype)
 
+        native_q = q.to(dtype=dtype)
+        native_k = k.to(dtype=dtype)
+        native_v = v.to(dtype=dtype)
+
         ynpt, weight_npt = npt(
-            q, k, v, key_padding_mask=mask if use_padding and not use_nt else None
+            native_q, native_k, native_v, key_padding_mask=mask if use_padding and not use_nt else None
         )
         if use_nt:
             ynpt = ynpt.to_padded_tensor(0)
@@ -244,7 +253,7 @@ def do_pad_all(tensors):
                         weight_npt[0][nh][-1] = torch.zeros_like(weight_npt[0][nh][-1], device=device, dtype=dtype)
 
         if dtype == torch.half:
-            torch.testing.assert_close(ypt, ynpt, atol=1e-3, rtol=1e-3)
+            torch.testing.assert_close(ypt, ynpt.to(torch.float32), atol=1e-3, rtol=1e-3)
         else:
             # High rtol seems necessary for
             # test_native_multihead_attention_cpu_float32 on Windows,
@@ -252,35 +261,40 @@ def do_pad_all(tensors):
             torch.testing.assert_close(ypt, ynpt, atol=2e-5, rtol=2e-3)
 
         if need_weights:
-            torch.testing.assert_close(weight_pt, weight_npt)
+            torch.testing.assert_close(weight_pt, weight_npt.to(torch.float32), atol=5e-4, rtol=5e-4)
         else:
             self.assertEqual(weight_pt, weight_npt)
 
     @dtypesIfCUDA(torch.float, torch.half)
     @dtypes(torch.float)
     @skipMeta
+    @parametrize("use_nt", [False, True])
+    @parametrize("use_padding, pad_all", [(False, False), (True, False), (True, True)])
+    @parametrize("need_weights", [False])
+    @parametrize("average_attn_weights", [False, True])
+    @parametrize("fused", [False, True])
     @torch.no_grad()
-    def test_native_multihead_self_attention(self, device, dtype):
-        for (use_padding, pad_all) in ((False, False), (True, False), (True, True)):
-            for use_nt in (False, True):
-                # Figuring out exactly which elements of the weights are garbage in this
-                # case eludes me, and it's not particularly enlightening to test anyway
-                # because padding doesn't especially affect the intermediate weights.
-                for need_weights in (False, not pad_all):
-                    for average_attn_weights in (False, True):
-                        with self.subTest(use_padding=use_padding, pad_all=pad_all,
-                                          use_nt=use_nt, need_weights=need_weights,
-                                          average_attn_weights=average_attn_weights):
-                            self._test_multihead_attention_impl(
-                                device,
-                                dtype,
-                                "self",
-                                use_nt=use_nt,
-                                use_padding=use_padding,
-                                pad_all=pad_all,
-                                need_weights=need_weights,
-                                average_attn_weights=average_attn_weights,
-                            )
+    def test_native_multihead_self_attention(self, device, dtype, use_nt,
+                                             need_weights, average_attn_weights, use_padding, pad_all, fused):
+        for need_weights in (False, not pad_all):
+            with self.subTest(use_padding=use_padding, pad_all=pad_all,
+                              use_nt=use_nt, need_weights=need_weights,
+                              average_attn_weights=average_attn_weights):
+                with torch.backends.cuda.sdp_kernel(
+                        enable_flash=False, enable_mem_efficient=False
+                ) if not fused else torch.backends.cuda.sdp_kernel(
+                        enable_flash=True, enable_mem_efficient=True
+                ):
+                    self._test_multihead_attention_impl(
+                        device,
+                        dtype,
+                        "self",
+                        use_nt=use_nt,
+                        use_padding=use_padding,
+                        pad_all=pad_all,
+                        need_weights=need_weights,
+                        average_attn_weights=average_attn_weights,
+                    )
 
     @dtypesIfCUDA(torch.float, torch.half)
     @dtypes(torch.float)
diff --git a/test/test_transformers.py b/test/test_transformers.py
index 5485d39df0e17..ceb9213e9b037 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -19,9 +19,10 @@
     freeze_rng_state,
     TEST_WITH_CROSSREF,
     TEST_WITH_ROCM,
-    IS_WINDOWS
+    IS_WINDOWS,
+    slowTest
 )
-from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_cuda import TEST_CUDA, SM80OrLater
 
 if TEST_FAIRSEQ:
     import fairseq.models.transformer as fairseq_transformer
@@ -71,6 +72,7 @@ def test_self_attn_TxT_attn_mask(self):
             self.assertEqual(output_mask_4d, output_mask_TxT)
 
     @parametrize("device", device_list)
+    @slowTest
     def test_train_with_pad_and_catch_error(self, device):
         iters = 100
         pad_mask = torch.tensor([[1, 1, 0, 0]], dtype=torch.bool).to(device)
@@ -148,14 +150,14 @@ def test_transformerencoderlayer_src_mask(self, device, nhead):
     @parametrize("use_torchscript", [False])
     @parametrize("enable_nested_tensor", [True, False])
     @parametrize("use_autocast", [True, False])
-    def test_transformerencoder_fastpath(self, device, use_torchscript, enable_nested_tensor, use_autocast):
+    @parametrize("d_model", [12, 256])
+    def test_transformerencoder_fastpath(self, device, use_torchscript, enable_nested_tensor, use_autocast, d_model):
         """
         Test TransformerEncoder fastpath output matches slowpath output
         """
         torch.manual_seed(1234)
-        d_model = 12
         nhead = 4
-        dim_feedforward = 12
+        dim_feedforward = d_model
         batch_first = True
 
         model = torch.nn.TransformerEncoder(
@@ -224,7 +226,6 @@ def test_transformerencoder_fastpath(self, device, use_torchscript, enable_neste
                 with torch.no_grad():
                     fastpath_output = model(input, src_key_padding_mask=src_key_padding_mask)
                 slowpath_output = model(input, src_key_padding_mask=src_key_padding_mask)  # reference
-
                 # Make sure fastpath_output is same shape as slowpath_output and mask.
                 # When enable_nested_tensor=true, fastpath_output may be smaller than input tensor.
                 # Eg if input bs=1, seqlen=6, and we mask out 2 tokens, fastpath_output will have bs=1, seqlen=4.
@@ -1010,6 +1011,80 @@ def rand_tensor(shape):
 
         self.assertEqual(actual[0].contiguous(), math_ref[0].contiguous(), atol=2e-3, rtol=1e-2)
 
+    @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
+    @parametrize("type", ["dense", "nested"])
+    @parametrize("fused_kernel", ["flash", "mem_efficient"])
+    def test_scaled_dot_product_attention_fused_kernels_packed_accuracy(self, type: str, fused_kernel: str):
+        if (not SM80OrLater) and fused_kernel == "flash":
+            return
+
+        def rand_nt(shape):
+            batch, seq_len, num_heads, head_dim = shape
+            tensors = [6 * torch.rand((seq_len, 3 * num_heads * head_dim), device="cuda", dtype=torch.float32) - 3
+                       for _ in range(batch)]
+            return (torch.nested.nested_tensor(tensors, device="cuda", dtype=torch.float32),
+                    torch.nested.nested_tensor(tensors, device="cuda", dtype=torch.float16))
+
+        def rand_tensor(shape):
+            batch, seq_len, num_heads, head_dim = shape
+            tensor = 6 * torch.rand((batch, seq_len, 3 * num_heads * head_dim), device="cuda", dtype=torch.float32) - 3
+            return tensor, tensor.to(dtype=torch.float16)
+
+        batch_size, seq_len, num_heads, head_dim = 16, 8, 4, 64
+        shape = (batch_size, seq_len, num_heads, head_dim)
+
+        # Test Packed
+        qkv, qkv_low_precision = rand_tensor(shape) if type == "dense" else rand_nt(shape)
+        query, key, value = qkv.chunk(3, dim=-1)
+        query_lp, key_lp, value_lp = qkv_low_precision.chunk(3, dim=-1)
+
+        query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+        query_lp = query_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key_lp = key_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value_lp = value_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+        if fused_kernel == "flash":
+            with sdp_kernel(enable_mem_efficient=False, enable_math=False):
+                actual = torch.nn.functional._scaled_dot_product_attention(
+                    query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
+        elif fused_kernel == "mem_efficient":
+            with sdp_kernel(enable_flash=False, enable_math=False):
+                actual = torch.nn.functional._scaled_dot_product_attention(
+                    query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
+
+        with sdp_kernel(enable_flash=False, enable_mem_efficient=False):
+            math_ref_lp = torch.nn.functional._scaled_dot_product_attention(
+                query_lp.contiguous(), key_lp.contiguous(), value_lp.contiguous(),
+                attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
+
+        with sdp_kernel(enable_flash=False, enable_mem_efficient=False):
+            math_query = query.contiguous()
+            math_key = key.contiguous()
+            math_value = value.contiguous()
+
+            math_ref = torch.nn.functional._scaled_dot_product_attention(
+                math_query, math_key, math_value, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
+
+        actual_test = actual[0]
+        math_ref_test = math_ref[0]
+        math_ref_lp_test = math_ref_lp[0]
+
+        if actual_test.is_nested:
+            actual_test = torch.nested.to_padded_tensor(actual_test.contiguous(), padding=0.0)
+            math_ref_test = torch.nested.to_padded_tensor(math_ref_test, padding=0.0)
+            math_ref_lp_test = torch.nested.to_padded_tensor(math_ref_lp_test, padding=0.0)
+
+        actual_test = actual_test.to(dtype=torch.float32).contiguous()
+        math_ref_test = math_ref_test.to(dtype=torch.float32).contiguous()
+        math_ref_lp_test = math_ref_lp_test.to(dtype=torch.float32).contiguous()
+
+        self.assertEqual(math_ref_test, math_ref_lp_test, atol=7e-3, rtol=7e-3)
+        self.assertEqual(actual_test, math_ref_test, atol=5e-3, rtol=5e-3)
+
+
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     def test_sdp_runtime_dispatch(self):
         # We will test all the constraints that we know will cause a failure

From 991239242155af36070a73b997595dfc68b5e059 Mon Sep 17 00:00:00 2001
From: Jing Xu <jing.xu@intel.com>
Date: Mon, 31 Oct 2022 04:40:52 +0000
Subject: [PATCH 0328/1922] fix github bug issue 87552 (#88059)

Fixes #87552
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88059
Approved by: https://github.com/jgong5, https://github.com/ngimel
---
 torch/_tensor_docs.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index b564351acf590..8b12032f22a3c 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -2229,14 +2229,15 @@ def add_docstr_all(method, docstr):
 get_device() -> Device ordinal (Integer)
 
 For CUDA tensors, this function returns the device ordinal of the GPU on which the tensor resides.
-For CPU tensors, an error is thrown.
+For CPU tensors, this function returns `-1`.
 
 Example::
 
     >>> x = torch.randn(3, 4, 5, device='cuda:0')
     >>> x.get_device()
     0
-    >>> x.cpu().get_device()  # RuntimeError: get_device is not implemented for type torch.FloatTensor
+    >>> x.cpu().get_device()
+    -1
 """,
 )
 

From 789381a6f4f635a8ec9664ce33dc0170e5a854b7 Mon Sep 17 00:00:00 2001
From: Kshiteej K <kshitijkalambarkar@gmail.com>
Date: Mon, 31 Oct 2022 04:42:45 +0000
Subject: [PATCH 0329/1922] [test_nn] split convolution tests from test_nn
 (#87474)

Ref #63085

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87474
Approved by: https://github.com/albanD
---
 test/nn/test_convolution.py | 2480 +++++++++++++++++++++++++++++++++++
 test/test_nn.py             | 2445 +---------------------------------
 2 files changed, 2486 insertions(+), 2439 deletions(-)
 create mode 100644 test/nn/test_convolution.py

diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py
new file mode 100644
index 0000000000000..c94eb5447d5ad
--- /dev/null
+++ b/test/nn/test_convolution.py
@@ -0,0 +1,2480 @@
+# Owner(s): ["module: nn"]
+import math
+import unittest
+import itertools
+import warnings
+from itertools import product
+
+import torch
+
+import torch.autograd.forward_ad as fwAD
+import torch.backends.cudnn as cudnn
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.testing._internal.common_dtype import floating_types_and, floating_and_complex_types_and
+from torch.testing._internal.common_utils import run_tests, \
+    skipIfRocmVersionLessThan, skipIfNotMiopenSuggestNHWC, TEST_SCIPY, TEST_WITH_ROCM, \
+    download_file, parametrize as parametrize_test, subtest, \
+    instantiate_parametrized_tests, set_default_dtype
+from torch.testing._internal.common_cuda import TEST_CUDA, TEST_CUDNN
+from torch.testing._internal.common_nn import NNTestCase, _test_module_empty_input
+from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
+    dtypesIfCUDA, precisionOverride, skipCUDAIfNoCudnn, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
+    skipCUDAIfRocm, skipCUDAIfRocmVersionLessThan, skipCUDAIfNotMiopenSuggestNHWC, \
+    onlyNativeDeviceTypes, largeTensorTest, skipMeta, \
+    disableMkldnn, skipCPUIfNoMkldnn, disablecuDNN, skipCUDAIfMiopen, skipCUDAIfNoMiopen
+
+from torch.testing import make_tensor
+from torch.testing._internal.common_utils import gradcheck, gradgradcheck, \
+    GRADCHECK_NONDET_TOL
+from torch.testing._internal.common_utils import dtype2prec_DONTUSE
+from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32
+
+AMPERE_OR_ROCM = TEST_WITH_ROCM or tf32_is_not_fp32()
+
+
+if TEST_SCIPY:
+    import scipy.signal
+    import scipy.ndimage
+
+class TestConvolutionNN(NNTestCase):
+    _do_cuda_memory_leak_check = True
+    _do_cuda_non_default_stream = True
+
+    def test_conv_backcompat(self):
+        from torch.serialization import SourceChangeWarning
+
+        # This file was generated by running on PyTorch 1.0.1 on Python 2:
+        #
+        #     import torch
+        #     from torch import nn
+        #     m = nn.Conv2d(1, 1, 1)
+        #     torch.save(m, 'legacy_conv2d.pt')
+        #
+        # NB: This Pickle also contains some Unicode data!
+        path = download_file('https://download.pytorch.org/test_data/legacy_conv2d.pt')
+        with warnings.catch_warnings():
+            warnings.simplefilter('ignore', SourceChangeWarning)
+            m = torch.load(path, encoding='utf-8')
+        input = torch.randn((1, 1, 1, 1), dtype=torch.float)
+        self.assertEqual(m(input).size(), (1, 1, 1, 1))
+
+    def test_invalid_conv1d(self):
+        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
+            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True).to(dtype)
+            input = torch.randn(1, 3, 4).to(dtype)
+            with self.assertRaisesRegex(RuntimeError,
+                                        r'Calculated padded input size per channel: \(4\). ' +
+                                        r'Kernel size: \(10\). Kernel size can\'t be greater than actual input size'):
+                module(input)
+
+            # Negative stride check
+            module = nn.Conv1d(in_channels=3, out_channels=6, kernel_size=3, stride=-1, bias=True).to(dtype)
+            input = torch.randn(1, 3, 4).to(dtype)
+            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
+                module(input)
+
+    def test_mismatch_shape_conv2d(self):
+        for dtype in (torch.float, torch.cfloat):
+            x = torch.randn(1, 10, 1, 28, 28, dtype=dtype)
+            w = torch.randn(6, 1, 5, 5, dtype=dtype)
+
+            with self.assertRaisesRegex(RuntimeError,
+                                        r'Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d, but got ' +
+                                        r'input of size: \[1, 10, 1, 28, 28\]'):
+
+                F.conv2d(x, w)
+
+    def test_conv2d_discontiguous_weight(self):
+        for dtype in (torch.float, torch.cfloat):
+            # Test for https://github.com/pytorch/pytorch/issues/55781
+            x = torch.ones(64, 16, 16, 16, dtype=dtype)
+            weight = torch.arange(0, 1.0, 1 / 2.0 ** 10).reshape(32, 16, 1, 2).to(dtype)[:, :, :, ::2]
+            self.assertFalse(weight.is_contiguous())
+            y = torch.nn.functional.conv2d(x, weight, None)
+            if torch.backends.mkldnn.is_available():
+                # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used
+                with torch.backends.mkldnn.flags(enabled=False):
+                    y_ = torch.nn.functional.conv2d(x, weight, None)
+                    self.assertEqual(y, y_)
+            self.assertEqual(y.sum(), 4186112.)
+
+    def test_invalid_conv2d(self):
+        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
+            module = torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype)
+            input = torch.empty(1, 1, 4, 4).to(dtype)
+            self.assertRaises(RuntimeError, lambda: module(input))
+
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True)
+            input = torch.randn(1, 3, 1, 1)
+            with self.assertRaisesRegex(RuntimeError,
+                                        r'Calculated padded input size per channel: \(1 x 1\). ' +
+                                        r'Kernel size: \(10 x 10\). Kernel size can\'t be greater than actual input size'):
+                module(input)
+
+            # Negative stride check
+            module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=-1, bias=True).to(dtype)
+            input = torch.randn(1, 3, 4, 4).to(dtype)
+            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
+                module(input)
+
+            # Zero stride check
+            module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=0, bias=True).to(dtype)
+            input = torch.randn(1, 3, 4, 4).to(dtype)
+            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
+                module(input)
+
+    def test_invalid_conv3d(self):
+        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
+            module = torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype)
+            input = torch.empty(1, 1, 4, 4, 4).to(dtype)
+            self.assertRaises(RuntimeError, lambda: module(input))
+
+            # Negative stride check
+            module = torch.nn.Conv3d(1, 1, kernel_size=3, stride=-2)
+            input = torch.empty(1, 1, 4, 4, 4)
+            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
+                module(input)
+
+    def test_conv_invalid_groups(self):
+        with self.assertRaisesRegex(ValueError, 'groups must be a positive integer'):
+            torch.nn.Conv1d(1, 1, kernel_size=3, dilation=2, stride=2, groups=0)
+        with self.assertRaisesRegex(ValueError, 'groups must be a positive integer'):
+            torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2, groups=-1)
+        with self.assertRaisesRegex(ValueError, 'groups must be a positive integer'):
+            torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2, groups=-2)
+
+    def test_Conv1d_module_same_padding(self):
+        # Compare module against functional: without strides/dilation, asymmetric padding
+        x = torch.rand(1, 1, 20)
+        module = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10,
+                           padding='same')
+        expect = F.conv1d(x, module.weight, module.bias, padding='same')
+        self.assertEqual(expect, module(x))
+
+        # Test dilation, symmetric padding
+        module = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10,
+                           padding='same', dilation=2)
+        expect = F.conv1d(x, module.weight, module.bias, padding='same', dilation=2)
+        self.assertEqual(expect, module(x))
+
+        # Test non-zero padding_mode, requiring explicit padding
+        module = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10,
+                           padding='same', padding_mode='replicate')
+        x_padded = F.pad(x, [4, 5], mode='replicate')
+        expect = F.conv1d(x_padded, module.weight, module.bias, padding='valid')
+        self.assertEqual(expect, module(x))
+        self.assertEqual(x.size(), expect.size())
+
+        # Test connstruction with invalid padding string raises
+        with self.assertRaisesRegex(ValueError, 'Invalid padding string'):
+            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, padding='foo')
+
+        # Test connstruction with same padding and strides raises
+        with self.assertRaisesRegex(ValueError, "padding='same'"):
+            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2)
+
+    def test_Conv2d_module_same_padding(self):
+        # Compare module against functional:
+        # without strides/dilation, both symmetric and asymmetric padding
+        x = torch.rand(1, 1, 9, 20)
+        module = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(5, 10),
+                           padding='same')
+        expect = F.conv2d(x, module.weight, module.bias, padding='same')
+        self.assertEqual(expect, module(x))
+
+        # with dilation, symmetric padding
+        module = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 4),
+                           padding='same', dilation=(1, 2))
+        expect = F.conv2d(x, module.weight, module.bias, padding='same', dilation=(1, 2))
+        self.assertEqual(expect, module(x))
+
+        # Test non-zero padding_mode, requiring explicit padding
+        module = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 4),
+                           padding='same', padding_mode='reflect')
+        x_padded = F.pad(x, [1, 2, 1, 1], mode='reflect')
+        expect = F.conv2d(x_padded, module.weight, module.bias, padding='valid')
+        self.assertEqual(expect, module(x))
+        self.assertEqual(x.size(), expect.size())
+
+        # Test connstruction with invalid padding string raises
+        with self.assertRaisesRegex(ValueError, 'Invalid padding string'):
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='foo')
+
+        # Test connstruction with same padding and strides raises
+        with self.assertRaisesRegex(ValueError, "padding='same'"):
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2)
+        with self.assertRaisesRegex(ValueError, "padding='same'"):
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 3))
+        with self.assertRaisesRegex(ValueError, "padding='same'"):
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(4, 1))
+
+    def test_Conv3d_module_same_padding(self):
+        # Compare module against functional:
+        x = torch.rand(1, 1, 4, 4, 4)
+        # without dilation, both symmetric and asymmetric padding
+        module = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(2, 3, 4),
+                           padding='same')
+        expect = F.conv3d(x, module.weight, module.bias, padding='same')
+        self.assertEqual(expect, module(x))
+
+        # with dilation, both symmetric and asymmetric padding
+        module = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(2, 3, 4),
+                           padding='same', dilation=(3, 2, 1))
+        expect = F.conv3d(x, module.weight, module.bias, padding='same', dilation=(3, 2, 1))
+        self.assertEqual(expect, module(x))
+
+        # Test non-zero padding_mode, requiring explicit padding
+        module = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(2, 3, 4),
+                           padding='same', padding_mode='circular')
+        x_padded = F.pad(x, [1, 2, 1, 1, 0, 1], mode='circular')
+        expect = F.conv3d(x_padded, module.weight, module.bias, padding='valid')
+        self.assertEqual(expect, module(x))
+        self.assertEqual(x.size(), expect.size())
+
+        # Test connstruction with invalid padding string raises
+        with self.assertRaisesRegex(ValueError, 'Invalid padding string'):
+            module = nn.Conv3d(in_channels=3, out_channels=33, kernel_size=10, padding='foo')
+
+        # Test connstruction with same padding and strides raises
+        with self.assertRaisesRegex(ValueError, "padding='same'"):
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2)
+        with self.assertRaisesRegex(ValueError, "padding='same'"):
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 1, 3))
+        with self.assertRaisesRegex(ValueError, "padding='same'"):
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 4, 1))
+        with self.assertRaisesRegex(ValueError, "padding='same'"):
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(5, 1, 1))
+
+    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
+    def test_thnn_conv_strided_padded_dilated(self):
+        for convfn, dims, transposed in (
+                (torch.nn.functional.conv2d, 2, False),
+                (torch.nn.functional.conv_transpose2d, 2, True),
+                (torch.nn.functional.conv3d, 3, False),
+                (torch.nn.functional.conv_transpose3d, 3, True)):
+            for stride, padding, dilation in (
+                    (2, 0, 1), (1, 1, 1), (2, 1, 1), (1, 0, 2)):
+                kwargs = {"stride": stride, "padding": padding, "dilation": dilation}
+                inp_shape = (1, 2) + dims * (4,)
+                weight_shape = (2, 2) + dims * (1,)
+                inputs = torch.randn(inp_shape, dtype=torch.double, device="cuda", requires_grad=True)
+                weight = torch.randn(weight_shape, dtype=torch.double, device="cuda", requires_grad=True)
+                bias = torch.randn(2, dtype=torch.double, device="cuda", requires_grad=True)
+                with torch.backends.cudnn.flags(enabled=False):
+                    res = convfn(inputs, weight, bias, **kwargs)
+                res_cpu = convfn(inputs.cpu(), weight.cpu(), bias.cpu(), **kwargs)
+                self.assertEqual(res, res_cpu)
+                with torch.backends.cudnn.flags(enabled=False):
+                    torch.autograd.gradcheck(
+                        lambda x, w, b: convfn(x, w, b, **kwargs),
+                        (inputs, weight, bias)
+                    )
+                    torch.autograd.gradcheck(
+                        lambda x, w, b: convfn(x, w, b, **kwargs),
+                        (inputs.cpu(), weight.cpu(), bias.cpu())
+                    )
+
+    def test_Conv2d_inconsistent_types(self):
+        inputs = torch.randn(4, 1, 7, 7, dtype=torch.float)
+        weights = torch.randn(1, 1, 3, 3, dtype=torch.double)
+        # inconsistent types should raise an exception
+        self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights))
+        # but it should work with the same type
+        nn.functional.conv2d(inputs.float(), weights.float())
+
+    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
+    def test_Conv2d_inconsistent_types_on_GPU_without_cudnn(self):
+        inputs = torch.randn(4, 1, 7, 7, dtype=torch.float, device="cuda")
+        weights = torch.randn(1, 1, 3, 3, dtype=torch.double, device="cuda")
+        bias = torch.randn(1, dtype=torch.double, device="cuda")
+
+        with torch.backends.cudnn.flags(enabled=False):
+            # inconsistent types should raise an exception
+            self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights))
+            self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights.float(), bias))
+
+            # but it should work with the same type
+            nn.functional.conv2d(inputs.float(), weights.float(), bias.float())
+
+    def test_Conv2d_1x1(self):
+        in_channels = 2
+        out_channels = 2
+        mod = torch.nn.Conv2d(2, 2, 1, bias=False).to(dtype=torch.double)
+        input = torch.randn(1, in_channels, 5, 5, requires_grad=True, dtype=torch.double)
+        for enabled in (False, True):
+            with torch.backends.mkldnn.flags(enabled=enabled):
+                gradcheck(F.conv2d, (input, mod.weight))
+
+    def test_Conv2d_OneDNN(self):
+        def run_once(group_val=24, dilation=1):
+            ifm = torch.ones([1, group_val, 6, 6], dtype=torch.float32)
+            weights = torch.ones([group_val, 1, 3, 3], dtype=torch.float32)
+            op = torch.nn.Conv2d(
+                in_channels=group_val,
+                out_channels=group_val,
+                kernel_size=[3, 3],
+                stride=[2, 2],
+                padding=[1, 1],
+                dilation=[dilation, dilation],
+                groups=group_val,
+                bias=False,
+                padding_mode='zeros'
+            )
+
+            op.weight.data = weights
+            res = op(ifm)
+            grad_in = torch.ones(res.shape, dtype=torch.float32)
+            res.backward(grad_in)
+            return op.weight.grad
+
+        for gorup_val in (24, 48, 23, 25):
+            for dilation in (1, 2):
+                with torch.backends.mkldnn.flags(enabled=False):
+                    without_onednn = run_once(gorup_val, dilation)
+
+                with torch.backends.mkldnn.flags(enabled=True):
+                    with_onednn = run_once(gorup_val, dilation)
+
+                self.assertEqual(without_onednn, with_onednn)
+
+    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
+    @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
+    def test_cudnn_non_contiguous(self):
+        x = torch.randn(192, 16, 50).cuda()
+        x = x.permute(0, 2, 1).contiguous().permute(0, 2, 1)
+        m = torch.nn.Conv1d(
+            in_channels=16,
+            out_channels=32,
+            kernel_size=2,
+            bias=True).cuda()
+        result = m(x)
+
+    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
+    @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
+    def test_Conv2d_inconsistent_types_on_GPU_with_cudnn(self):
+        inputs = torch.randn(4, 1, 7, 7, dtype=torch.float, device="cuda")
+        weights = torch.randn(1, 1, 3, 3, dtype=torch.double, device="cuda")
+        bias = torch.randn(1, dtype=torch.double, device="cuda")
+
+        with torch.backends.cudnn.flags(enabled=True):
+            # inconsistent types should raise an exception
+            self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights))
+            self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights.float(), bias))
+
+            # but it should work with the same type
+            nn.functional.conv2d(inputs.float(), weights.float(), bias.float())
+
+    def test_Conv2d_missing_argument(self):
+        c = nn.Conv2d(3, 3, 3)
+        self.assertRaises(TypeError, lambda: c(None))
+
+    def test_Conv2d_backward_twice(self):
+        input = torch.randn(2, 3, 5, 5)
+        c = nn.Conv2d(3, 3, 3)
+        o1 = c(input)
+        o1.sum().backward()
+        self.assertRaisesRegex(RuntimeError, 'Specify retain_graph=True',
+                               lambda: o1.sum().backward())
+
+
+    def test_conv_modules_raise_error_on_incorrect_input_size(self):
+        for dtype in [torch.bfloat16, torch.double, torch.float]:
+            modules = [nn.Conv1d(3, 8, 3).to(dtype), nn.ConvTranspose1d(3, 8, 3).to(dtype),
+                       nn.Conv2d(3, 8, 3).to(dtype), nn.ConvTranspose2d(3, 8, 3).to(dtype),
+                       nn.Conv3d(3, 8, 3).to(dtype), nn.ConvTranspose3d(3, 8, 3).to(dtype)]
+
+            invalid_input_dims = [(1, 4), (1, 4),
+                                  (2, 5), (2, 5),
+                                  (3, 6), (3, 6)]
+
+            for invalid_dims, module in zip(invalid_input_dims, modules):
+                for dims in invalid_dims:
+                    input = torch.empty(torch.Size((3, ) * dims))
+                    self.assertRaises(RuntimeError, lambda: module(input))
+
+    def test_conv_shapecheck(self):
+        def test(should_raise, module, input_size, dtype):
+            input = torch.empty(3, *input_size).to(dtype)
+            if should_raise:
+                self.assertRaises(RuntimeError, lambda: module(input))
+            else:
+                # just run it to ensure no exception raised.
+                module(input)
+
+        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
+            # Conv1d
+            test(True, nn.Conv1d(1, 1, 3).to(dtype), (1, 2), dtype)
+            test(True, nn.Conv1d(1, 1, 3, stride=2).to(dtype), (1, 2), dtype)
+            test(False, nn.Conv1d(1, 1, 2).to(dtype), (1, 2), dtype)
+            test(False, nn.Conv1d(1, 1, 2, stride=2).to(dtype), (1, 2), dtype)
+            test(False, nn.Conv1d(1, 1, 3, stride=2, padding=1).to(dtype), (1, 2), dtype)
+
+            # Conv2d
+            test(True, nn.Conv2d(1, 1, (3, 3)).to(dtype), (1, 2, 2), dtype)
+            test(False, nn.Conv2d(1, 1, (3, 3)).to(dtype), (1, 3, 3), dtype)
+            test(False, nn.Conv2d(1, 1, (3, 3), padding=1).to(dtype), (1, 2, 2), dtype)
+
+            # Conv3D
+            test(True, nn.Conv3d(1, 1, (3, 3, 3)).to(dtype), (1, 2, 2, 2), dtype)
+            test(False, nn.Conv3d(1, 1, (3, 3, 3)).to(dtype), (1, 3, 3, 3), dtype)
+            test(False, nn.Conv3d(1, 1, (3, 3, 3), padding=1).to(dtype), (1, 2, 2, 2), dtype)
+
+    def test_ConvTranspose2d_output_size(self):
+        m = nn.ConvTranspose2d(3, 4, 3, 3, 0, 2)
+        i = torch.randn(2, 3, 6, 6)
+        for h in range(15, 22):
+            for w in range(15, 22):
+                if 18 <= h <= 20 and 18 <= w <= 20:
+                    output = m(i, output_size=(h, w))
+                    self.assertEqual(output.size()[2:], (h, w))
+                else:
+                    self.assertRaises(ValueError, lambda: m(i, (h, w)))
+
+    def test_ConvTranspose2d_output_size_downsample_upsample(self):
+        b, c, hid_c = 2, 3, 2
+        for h in range(13, 24):
+            for w in range(13, 17):
+                for k in range(2, 5):
+                    for d in range(1, 5):
+                        for s in range(1, 4):
+                            for p in range(3):
+                                conv = nn.Conv2d(
+                                    in_channels=c,
+                                    out_channels=hid_c,
+                                    kernel_size=k,
+                                    stride=s,
+                                    padding=p,
+                                    dilation=d,
+                                )
+
+                                t_conv = nn.ConvTranspose2d(
+                                    in_channels=hid_c,
+                                    out_channels=c,
+                                    kernel_size=k,
+                                    stride=s,
+                                    padding=p,
+                                    dilation=d,
+                                )
+
+                                i = torch.randn(b, c, h, w)
+
+                                out = t_conv(conv(i), output_size=i.shape)
+
+                                self.assertEqual(out.size()[2:], i.size()[2:])
+
+    def test_ConvTranspose3d_correct_output_size(self):
+        # Check that ConvTranspose3d can take a 5d output_size.
+        m = nn.ConvTranspose3d(2, 2, 2)
+        i = torch.rand(1, 2, 1, 1, 1)
+        out = m(i, output_size=(1, 2, 2, 2, 2))
+
+    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
+    def test_ConvTranspose2d_half_cublas_gemm(self):
+        with torch.backends.cudnn.flags(enabled=False):
+            inputs = torch.randn(1, 1, 16, 16, device='cuda', dtype=torch.half)
+            deconv = nn.ConvTranspose2d(
+                1, 1, 3, stride=2, padding=1, output_padding=1).cuda().half()
+            output = deconv(inputs)
+            output.mean().backward()
+
+    # For https://github.com/pytorch/pytorch/pull/1273
+    # Almost identical to the above `test_Conv2d_naive_groups`
+    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
+    def test_Conv2d_groups_nobias(self):
+        dev_dtypes = [("cpu", torch.float)]
+        if TEST_CUDA:
+            dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)]
+        if AMPERE_OR_ROCM:
+            dev_dtypes += [("cuda", torch.bfloat16)]
+        for device, dtype in dev_dtypes:
+            m = nn.Conv2d(4, 4, kernel_size=3, groups=2, bias=False).to(device, dtype)
+            i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
+            output = m(i)
+            grad_output = torch.randn(2, 4, 4, 4, device=device, dtype=dtype)
+            output.backward(grad_output)
+
+            m1 = nn.Conv2d(2, 2, kernel_size=3, bias=False).to(device, dtype)
+            m1.weight.data.copy_(m.weight.data[:2])
+            i1 = i.data[:, :2].contiguous().requires_grad_(True)
+            output1 = m1(i1)
+            output1.backward(grad_output[:, :2].contiguous())
+
+            m2 = nn.Conv2d(2, 2, kernel_size=3, bias=False).to(device, dtype)
+            m2.weight.data.copy_(m.weight.data[2:])
+            i2 = i.data[:, 2:].contiguous().requires_grad_(True)
+            output2 = m2(i2)
+            output2.backward(grad_output[:, 2:].contiguous())
+
+            self.assertEqual(output, torch.cat([output1, output2], 1))
+            self.assertEqual(i.grad.data,
+                             torch.cat([i1.grad.data, i2.grad.data], 1),
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+            self.assertEqual(m.weight.grad.data,
+                             torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
+                             atol=1e-1 if dtype == torch.half else dtype2prec_DONTUSE[dtype], rtol=0)
+
+    # Almost identical to the above `test_Conv2d_naive_groups`
+    # Covering special case when group > 1, input-channel / group < 16 and output-channel is multiple of 16
+    # See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686
+    # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024
+    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
+    def test_Conv2d_groups_nobias_v2(self):
+        torch.manual_seed(123)
+        dev_dtypes = [("cpu", torch.float)]
+        if TEST_CUDA:
+            dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)]
+        if AMPERE_OR_ROCM:
+            dev_dtypes += [("cuda", torch.bfloat16)]
+        for device, dtype in dev_dtypes:
+            m = nn.Conv2d(4, 16, kernel_size=3, groups=2, bias=False).to(device, dtype)
+            i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
+            output = m(i)
+            grad_output = torch.randn(2, 16, 4, 4, device=device, dtype=dtype)
+            output.backward(grad_output)
+
+            m1 = nn.Conv2d(2, 8, kernel_size=3, bias=False).to(device, dtype)
+            m1.weight.data.copy_(m.weight.data[:8])
+            i1 = i.data[:, :2].contiguous().requires_grad_(True)
+            output1 = m1(i1)
+            output1.backward(grad_output[:, :8].contiguous())
+
+            m2 = nn.Conv2d(2, 8, kernel_size=3, bias=False).to(device, dtype)
+            m2.weight.data.copy_(m.weight.data[8:])
+            i2 = i.data[:, 2:].contiguous().requires_grad_(True)
+            output2 = m2(i2)
+            output2.backward(grad_output[:, 8:].contiguous())
+
+            self.assertEqual(output, torch.cat([output1, output2], 1))
+            self.assertEqual(i.grad.data,
+                             torch.cat([i1.grad.data, i2.grad.data], 1),
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+            self.assertEqual(m.weight.grad.data,
+                             torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
+                             atol=1e-1 if dtype == torch.half else dtype2prec_DONTUSE[dtype], rtol=0)
+
+    # CPU-only test for group conv3d fast implementation using bmm
+    # See: https://github.com/pytorch/pytorch/pull/36355
+    def test_Conv3d_groups_nobias(self):
+        torch.manual_seed(123)
+        m = nn.Conv3d(4, 16, kernel_size=3, groups=2, bias=False).to("cpu", torch.float)
+        i = torch.randn(2, 4, 6, 6, 6, device="cpu", dtype=torch.float, requires_grad=True)
+        output = m(i)
+        grad_output = torch.randn(2, 16, 4, 4, 4, device="cpu", dtype=torch.float)
+        output.backward(grad_output)
+
+        m1 = nn.Conv3d(2, 8, kernel_size=3, bias=False).to("cpu", torch.float)
+        m1.weight.data.copy_(m.weight.data[:8])
+        i1 = i.data[:, :2].contiguous().requires_grad_(True)
+        output1 = m1(i1)
+        output1.backward(grad_output[:, :8].contiguous())
+
+        m2 = nn.Conv3d(2, 8, kernel_size=3, bias=False).to("cpu", torch.float)
+        m2.weight.data.copy_(m.weight.data[8:])
+        i2 = i.data[:, 2:].contiguous().requires_grad_(True)
+        output2 = m2(i2)
+        output2.backward(grad_output[:, 8:].contiguous())
+
+        self.assertEqual(output, torch.cat([output1, output2], 1))
+        self.assertEqual(i.grad.data,
+                         torch.cat([i1.grad.data, i2.grad.data], 1),
+                         atol=dtype2prec_DONTUSE[torch.float], rtol=0)
+        self.assertEqual(m.weight.grad.data,
+                         torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
+                         atol=dtype2prec_DONTUSE[torch.float], rtol=dtype2prec_DONTUSE[torch.float])
+
+    def test_Conv3d_groups_wbias(self):
+        torch.manual_seed(123)
+        m = nn.Conv3d(4, 16, kernel_size=3, groups=2, bias=True).to("cpu", torch.float)
+        i = torch.randn(2, 4, 6, 6, 6, device="cpu", dtype=torch.float, requires_grad=True)
+        output = m(i)
+        grad_output = torch.randn(2, 16, 4, 4, 4, device="cpu", dtype=torch.float)
+        output.backward(grad_output)
+
+        m1 = nn.Conv3d(2, 8, kernel_size=3, bias=True).to("cpu", torch.float)
+        m1.weight.data.copy_(m.weight.data[:8])
+        m1.bias.data.copy_(m.bias.data[:8])
+        i1 = i.data[:, :2].contiguous().requires_grad_(True)
+        output1 = m1(i1)
+        output1.backward(grad_output[:, :8].contiguous())
+
+        m2 = nn.Conv3d(2, 8, kernel_size=3, bias=True).to("cpu", torch.float)
+        m2.weight.data.copy_(m.weight.data[8:])
+        m2.bias.data.copy_(m.bias.data[8:])
+        i2 = i.data[:, 2:].contiguous().requires_grad_(True)
+        output2 = m2(i2)
+        output2.backward(grad_output[:, 8:].contiguous())
+
+        self.assertEqual(output, torch.cat([output1, output2], 1))
+        self.assertEqual(i.grad.data,
+                         torch.cat([i1.grad.data, i2.grad.data], 1),
+                         atol=dtype2prec_DONTUSE[torch.float],
+                         rtol=dtype2prec_DONTUSE[torch.float])
+        self.assertEqual(m.weight.grad.data,
+                         torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
+                         atol=dtype2prec_DONTUSE[torch.float],
+                         rtol=dtype2prec_DONTUSE[torch.float])
+        self.assertEqual(m.bias.grad.data,
+                         torch.cat([m1.bias.grad.data, m2.bias.grad.data], 0),
+                         atol=dtype2prec_DONTUSE[torch.float], rtol=dtype2prec_DONTUSE[torch.float])
+
+    def test_conv_tbc(self):
+        with set_default_dtype(torch.double):
+            inp = torch.randn(9, 4, 5, requires_grad=True)
+            weight = torch.randn(3, 5, 6, requires_grad=True)
+            bias = torch.randn(6, requires_grad=True)
+
+            gradcheck(lambda i, w, b, pad: F.conv_tbc(i, w, b, pad), (inp, weight, bias, 3))
+
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
+    @skipIfRocmVersionLessThan((4, 3))
+    @skipIfNotMiopenSuggestNHWC
+    def test_grouped_conv_cudnn_nhwc_support(self):
+        # in order to catch the hols in grouped convolution in nhwc support for earlier cudnn version
+        input = torch.randn((16, 16, 8, 8), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
+        weight = torch.randn((8, 4, 3, 3), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
+        out = torch.convolution(input, weight, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 4)
+        input = torch.randn((16, 8, 8, 8), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
+        out_transpose = torch.convolution(input, weight, None, (1, 1), (1, 1), (1, 1), True, (0, 0), 4)
+
+    @unittest.expectedFailure
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
+    def test_conv_cudnn_memory_layout_dominance(self):
+        # desired behavior here is to have the memory_layout of conv.weight to
+        # dominante the layout of output.
+        # which is not the same as current behavior, we'll fix this in
+        # following up PRs and remove the `expectedFailure` tag
+        input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float32, device="cuda", requires_grad=True)
+        conv = nn.Conv2d(8, 4, 3).cuda().float()
+
+        out = conv(input)
+        self.assertTrue(out.is_contiguous())
+
+        input = input.contiguous(memory_format=torch.channels_last)
+        out = conv(input)
+        self.assertTrue(out.is_contiguous())
+
+        conv.weight.data = conv.weight.contiguous(memory_format=torch.channels_last)
+        out = conv(input)
+        self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
+
+        input = input.contiguous()
+        out = conv(input)
+        self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
+
+
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    def test_cudnn_noncontiguous_weight(self):
+        # Noncontiguous weights must be contiguous() before being
+        # passed to cuDNN
+        input = torch.tensor([1, 1, 1], dtype=torch.double, device="cuda").view(1, 1, 3)
+        weights1 = torch.tensor([1], dtype=torch.double, device="cuda").expand(1, 1, 2)
+        weights2 = torch.tensor([1], dtype=torch.double, device="cuda").expand(1, 1, 2).contiguous()
+        self.assertEqual(F.conv1d(input, weights1, bias=None, stride=2, dilation=2),
+                         F.conv1d(input, weights2, bias=None, stride=2, dilation=2))
+
+
+    def run_grad_conv_test(self, func_forward, func_backward, dim=1, gradient='input'):
+        for kern, inp_size in [(3, 6), (3, 7), (4, 9)]:
+            for batch, stride, padding, chan_in, chan_out, dilation in \
+                    product([1, 2], [1, 2], [0, 1, 2], [2], [3], [1]):
+
+                for has_bias in [True, False]:
+                    input_shape = [batch, chan_in]
+                    weight_shape = [chan_out, chan_in]
+                    for _ in range(dim):
+                        input_shape.append(inp_size)
+                        weight_shape.append(kern)
+
+                    input = torch.randn(input_shape, requires_grad=True)
+                    weight = torch.randn(weight_shape, requires_grad=True)
+                    if has_bias:
+                        bias = torch.randn([chan_out], requires_grad=True)
+                    output = func_forward(input, weight, stride=stride, padding=padding, dilation=dilation, bias=bias)
+
+                    gradient_o = torch.randn(output.shape)
+                    gradient_w = torch.autograd.grad(output, input if (gradient == 'input') else weight, gradient_o)
+
+                    self.assertEqual(gradient_w[0],
+                                     func_backward(
+                                     input_shape if (gradient == 'input') else input,
+                                     weight_shape if (gradient == 'weight') else weight,
+                                     gradient_o,
+                                     stride=stride,
+                                     padding=padding,
+                                     dilation=dilation))
+
+    def test_grad_conv1d_input(self):
+        self.run_grad_conv_test(F.conv1d, F.grad.conv1d_input, 1, 'input')
+
+    def test_grad_conv1d_weight(self):
+        self.run_grad_conv_test(F.conv1d, F.grad.conv1d_weight, 1, 'weight')
+
+    def test_grad_conv2d_input(self):
+        self.run_grad_conv_test(F.conv2d, F.grad.conv2d_input, 2, 'input')
+
+    def test_grad_conv2d_weight(self):
+        self.run_grad_conv_test(F.conv2d, F.grad.conv2d_weight, 2, 'weight')
+
+    def test_grad_conv3d_input(self):
+        self.run_grad_conv_test(F.conv3d, F.grad.conv3d_input, 3, 'input')
+
+    def test_grad_conv3d_weight(self):
+        self.run_grad_conv_test(F.conv3d, F.grad.conv3d_weight, 3, 'weight')
+
+    @unittest.skipIf(not torch._nnpack_available(), "NNPACK unavailable")
+    def test_nnpack_conv(self):
+        for kern, inp_size in [(3, 6), (3, 7), (4, 9)]:
+            for batch, stride, padding, chan_in, chan_out in \
+                    product([1, 2, 3, 4], [1, 2], [0, 1, 2], [2], [3]):
+
+                for has_bias in [True, False]:
+                    input_shape = [batch, chan_in]
+                    weight_shape = [chan_out, chan_in]
+                    for _ in range(2):
+                        input_shape.append(inp_size)
+                        weight_shape.append(kern)
+
+                    input = torch.randn(input_shape, requires_grad=True, dtype=torch.float)
+                    weight = torch.randn(weight_shape, requires_grad=True, dtype=torch.float)
+                    if has_bias:
+                        bias = torch.randn([chan_out], requires_grad=True, dtype=torch.float)
+                    output = torch._nnpack_spatial_convolution(input, weight, stride=stride, padding=padding, bias=bias)
+                    output_expected = torch.nn.functional.conv2d(input, weight, stride=stride, padding=padding, bias=bias)
+                    self.assertEqual(output, output_expected, atol=3e-4, rtol=0)
+
+                    gradient_o = torch.randn(output.shape, dtype=torch.float)
+
+                    grads = torch.autograd.grad(output, [input, weight], gradient_o)
+                    grads_expected = torch.autograd.grad(output_expected, [input, weight], gradient_o)
+                    for gr, gr_expected in zip(grads, grads_expected):
+                        self.assertEqual(gr, gr_expected, atol=3e-4, rtol=0)
+
+    def test_conv_padding_mode(self):
+        with self.assertRaisesRegex(ValueError, "padding_mode must be one of"):
+            nn.Conv2d(3, 3, 3, padding_mode="xyz")
+
+        with self.assertRaisesRegex(ValueError, "padding_mode must be one of"):
+            nn.Conv2d(3, 3, 3, padding_mode=3)
+
+        with self.assertRaisesRegex(ValueError, "Only \"zeros\" "):
+            nn.ConvTranspose2d(3, 3, 3, padding_mode="reflect")
+
+
+    def test_functional_grad_conv(self):
+        # Conv 1D
+        input = torch.randn(1, 1, 5, requires_grad=True)
+        weight = torch.randn(1, 1, 3, requires_grad=True)
+        output = F.conv1d(input, weight, dilation=2)
+        grad_output = torch.randn(output.shape)
+
+        grad_input_autograd, grad_weight_autograd = torch.autograd.grad(output, (input, weight), grad_output)
+
+        grad_input_functional = torch.nn.grad.conv1d_input(input.shape, weight, grad_output, dilation=2)
+        self.assertEqual(grad_input_functional, grad_input_autograd)
+
+        grad_weight_functional = torch.nn.grad.conv1d_weight(input, weight.shape, grad_output, dilation=2)
+        self.assertEqual(grad_weight_functional, grad_weight_autograd)
+
+        # Conv 2D
+        input = torch.randn(1, 1, 5, 5, requires_grad=True)
+        weight = torch.randn(1, 1, 3, 3, requires_grad=True)
+        output = F.conv2d(input, weight, dilation=2)
+        grad_output = torch.randn(output.shape)
+
+        (grad_input_autograd, grad_weight_autograd) = torch.autograd.grad(output, (input, weight), grad_output)
+
+        grad_input_functional = torch.nn.grad.conv2d_input(input.shape, weight, grad_output, dilation=2)
+        self.assertEqual(grad_input_functional, grad_input_autograd)
+
+        grad_weight_functional = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output, dilation=2)
+        self.assertEqual(grad_weight_functional, grad_weight_autograd)
+
+        # Conv 3D
+        input = torch.randn(1, 1, 5, 5, 5, requires_grad=True)
+        weight = torch.randn(1, 1, 3, 3, 3, requires_grad=True)
+        output = F.conv3d(input, weight, dilation=2)
+        grad_output = torch.randn(output.shape)
+
+        (grad_input_autograd, grad_weight_autograd) = torch.autograd.grad(output, (input, weight), grad_output)
+
+        grad_input_functional = torch.nn.grad.conv3d_input(input.shape, weight, grad_output, dilation=2)
+        self.assertEqual(grad_input_functional, grad_input_autograd)
+
+        grad_weight_functional = torch.nn.grad.conv3d_weight(input, weight.shape, grad_output, dilation=2)
+        self.assertEqual(grad_weight_functional, grad_weight_autograd)
+
+    def test_functional_grad_conv2d(self):
+        BATCH_SIZE = 4
+        IN_CH = 8
+        OUT_CH = 16
+        SPATIAL = 32
+
+        def _test_conv2d(stride, kernel_size, groups, dilation):
+            padding = kernel_size // 2
+
+            input = torch.empty(BATCH_SIZE, IN_CH, SPATIAL, SPATIAL).uniform_(-8.0, 8.0).requires_grad_(True)
+
+            weight = torch.empty(OUT_CH, IN_CH // groups, kernel_size, kernel_size).uniform_(-4.0, 4.0).requires_grad_(True)
+
+            output = F.conv2d(input, weight,
+                              stride=stride, padding=padding, dilation=dilation, groups=groups)
+
+            grad_output = torch.randn(output.shape)
+
+            (grad_input_autograd, grad_weight_autograd) = torch.autograd.grad(output, (input, weight), grad_output)
+
+            grad_input_functional = torch.nn.grad.conv2d_input(input.shape, weight, grad_output,
+                                                               stride=stride, padding=padding, dilation=dilation, groups=groups)
+            self.assertEqual(grad_input_functional, grad_input_autograd)
+
+            grad_weight_functional = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output,
+                                                                 stride=stride, padding=padding, dilation=dilation, groups=groups)
+            self.assertEqual(grad_weight_functional, grad_weight_autograd)
+
+        strides = [1, 2]
+        kernel_sizes = [1, 3, 5]
+        groups = [1, 2, 4]
+        dilates = [1, 2]
+
+        for s, k, g, d in product(strides, kernel_sizes, groups, dilates):
+            _test_conv2d(s, k, g, d)
+
+
+class TestConvolutionNNDeviceType(NNTestCase):
+    def run_conv_double_back_test(self, kern, stride, padding, chan_in, chan_out, batch_size,
+                                  inp_size, dilation, no_weight, groups=1, use_cuda=False,
+                                  use_bias=True, dtype=torch.double):
+        if use_cuda:
+            device = torch.device("cuda")
+        else:
+            device = torch.device("cpu")
+
+        x = torch.randn(batch_size, chan_in, inp_size, inp_size, device=device,
+                        dtype=dtype, requires_grad=True)
+        weight = torch.randn(chan_out, chan_in // groups, kern, kern, device=device,
+                             dtype=dtype, requires_grad=not no_weight)
+        if use_bias:
+            bias = torch.randn(chan_out, device=device, dtype=dtype, requires_grad=True)
+        else:
+            bias = None
+
+        def func(*inputs):
+            if use_bias:
+                lx, lweight, lbias = inputs
+            else:
+                lx, lweight = inputs
+                lbias = None
+            # We disable cudnn during forward to avoid finite difference imprecision issues
+            with cudnn.flags(enabled=False):
+                out = F.conv2d(lx, lweight, lbias, stride, padding, dilation, groups)
+            return out
+
+        if use_bias:
+            inputs = x, weight, bias
+        else:
+            inputs = x, weight
+
+        dummy_out = func(*inputs)
+        grad_y = torch.randn_like(dummy_out, device=device, dtype=dtype, requires_grad=True)
+
+        # Issue #15353: test mkldnn double backward, don't run gradgradcheck due
+        # to imprecision issues
+        if dtype == torch.float:
+            g, = torch.autograd.grad(dummy_out.sum(), x, create_graph=True)
+            return g.requires_grad
+
+        return gradgradcheck(func, inputs, (grad_y,))
+
+    @onlyCUDA
+    @skipCUDAIfNoCudnn
+    @dtypes(*floating_and_complex_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
+    def test_Conv2d_deterministic_cudnn(self, device, dtype):
+        inputs = torch.randn(2, 3, 5, 5, device=device, dtype=dtype, requires_grad=True)
+        with cudnn.flags(enabled=True, benchmark=True, deterministic=True):
+            conv1 = torch.nn.Conv2d(3, 3, 3).to(device, dtype)
+            conv2 = torch.nn.Conv2d(3, 3, 3).to(device, dtype)
+            conv2.bias.data.copy_(conv1.bias.data)
+            conv2.weight.data.copy_(conv1.weight.data)
+            out1 = conv1(inputs)
+            out2 = conv2(inputs)
+            self.assertEqual(out1, out2, atol=0.0, rtol=0)
+            y = torch.randn(out1.size(), device=device, dtype=dtype)
+            out1.backward(y)
+            out2.backward(y)
+            self.assertEqual(conv1.bias.grad.data, conv2.bias.grad.data, atol=0.0, rtol=0)
+            self.assertEqual(conv1.weight.grad.data, conv2.weight.grad.data, atol=0.0, rtol=0)
+
+
+    @onlyCUDA
+    @dtypes(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
+    def test_Conv2d_large_workspace(self, device, dtype):
+        # These sizes require huge cuDNN workspaces. Make sure we choose a
+        # reasonable algorithm that does not run out of memory
+        sizes = [
+            (1, 256, 109, 175),
+            (1, 256, 80, 128),
+            (1, 256, 120, 192),
+        ]
+
+        def run_test(benchmark):
+            with torch.backends.cudnn.flags(benchmark=benchmark):
+                conv = torch.nn.Conv2d(256, 256, kernel_size=3, padding=1).to(device, dtype)
+                for size in sizes:
+                    x = torch.randn(size, device=device, dtype=dtype)
+                    out = conv(x.detach().clone().requires_grad_())
+                    out.backward(torch.ones_like(out))
+
+        run_test(benchmark=False)
+        run_test(benchmark=True)
+
+
+    @onlyCUDA
+    @dtypes(torch.half, torch.float)
+    def test_ConvTranspose2d_large_output_padding(self, device, dtype):
+        net1 = torch.nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)\
+            .to(device=device, dtype=dtype)
+        net2 = torch.nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1)\
+            .to(device=device, dtype=dtype)
+        net3 = torch.nn.ConvTranspose2d(32, 3, kernel_size=3, stride=2, padding=1, output_padding=1)\
+            .to(device=device, dtype=dtype)
+        x = torch.rand(1, 128, 6, 6, device=device, dtype=dtype, requires_grad=True)
+        x = net1(x)
+        x = net2(x)
+        x = net3(x)
+        x.backward(torch.randn_like(x))
+        torch.cuda.synchronize()
+
+
+    @onlyCUDA
+    @tf32_on_and_off(0.01)
+    @dtypes(torch.float, torch.double, torch.half)
+    # Very similar to test_Conv2d_naive_groups but with special care to handle
+    # the number of groups == number of input channels
+    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
+    def test_Conv2d_depthwise_naive_groups(self, device, dtype):
+        for depth_multiplier in [1, 2]:
+            m = nn.Conv2d(2, 2 * depth_multiplier, kernel_size=3, groups=2).to(device, dtype)
+            i = torch.randn(2, 2, 6, 6, device="cuda", dtype=dtype).div_(2).requires_grad_()
+            output = m(i)
+            grad_output = torch.randn(2, 2 * depth_multiplier, 4, 4, device=device, dtype=dtype) / 2
+            output.backward(grad_output)
+
+            offset = 1 * depth_multiplier
+
+            m1 = nn.Conv2d(1, 1 * depth_multiplier, kernel_size=3).to(device, dtype)
+            m1.weight.data = m.weight.data[:offset].clone()
+            m1.bias.data = m.bias.data[:offset].clone()
+            i1 = i.detach()[:, :1].clone().requires_grad_()
+            output1 = m1(i1)
+            output1.backward(grad_output[:, :offset].contiguous())
+
+            m2 = nn.Conv2d(1, 1 * depth_multiplier, kernel_size=3).to(device, dtype)
+            m2.weight.data.copy_(m.weight.data[offset:])
+            m2.bias.data.copy_(m.bias.data[offset:])
+            i2 = i.detach()[:, 1:].clone().requires_grad_()
+            output2 = m2(i2)
+            output2.backward(grad_output[:, offset:].contiguous())
+
+            self.assertEqual(output, torch.cat([output1, output2], 1),
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+            self.assertEqual(i.grad.data,
+                             torch.cat([i1.grad.data, i2.grad.data], 1),
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+            self.assertEqual(m.bias.grad.data,
+                             torch.cat([m1.bias.grad.data,
+                                        m2.bias.grad.data], 0),
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+            self.assertEqual(m.weight.grad.data,
+                             torch.cat([m1.weight.grad.data,
+                                        m2.weight.grad.data], 0),
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+
+    @onlyCUDA
+    @dtypes(torch.float, torch.double, torch.half)
+    @tf32_on_and_off(0.005)
+    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
+    def test_Conv3d_depthwise_naive_groups(self, device, dtype):
+        for depth_multiplier in [1, 2]:
+            m = nn.Conv3d(2, 2 * depth_multiplier, kernel_size=3, groups=2).to(device, dtype)
+            i = torch.randn(2, 2, 6, 6, 6, device="cuda", dtype=dtype).div_(2).requires_grad_()
+            output = m(i)
+            grad_output = torch.randn(2, 2 * depth_multiplier, 4, 4, 4, device=device, dtype=dtype) / 2
+            output.backward(grad_output)
+
+            offset = 1 * depth_multiplier
+
+            m1 = nn.Conv3d(1, 1 * depth_multiplier, kernel_size=3).to(device, dtype)
+            m1.weight.data = m.weight.data[:offset].clone()
+            m1.bias.data = m.bias.data[:offset].clone()
+            i1 = i.detach()[:, :1].clone().requires_grad_()
+            output1 = m1(i1)
+            output1.backward(grad_output[:, :offset].contiguous())
+
+            m2 = nn.Conv3d(1, 1 * depth_multiplier, kernel_size=3).to(device, dtype)
+            m2.weight.data.copy_(m.weight.data[offset:])
+            m2.bias.data.copy_(m.bias.data[offset:])
+            i2 = i.detach()[:, 1:].clone().requires_grad_()
+            output2 = m2(i2)
+            output2.backward(grad_output[:, offset:].contiguous())
+            is_cuda_sm86 = device.startswith("cuda") and torch.cuda.get_device_capability(0) == (8, 6)
+            atol, rtol = (3e-4, 3e-2) if dtype == torch.float32 and is_cuda_sm86 else (dtype2prec_DONTUSE[dtype], 0)
+
+            self.assertEqual(output, torch.cat([output1, output2], 1),
+                             atol=atol, rtol=rtol)
+            self.assertEqual(i.grad.data,
+                             torch.cat([i1.grad.data, i2.grad.data], 1),
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+            self.assertEqual(m.bias.grad.data,
+                             torch.cat([m1.bias.grad.data,
+                                        m2.bias.grad.data], 0),
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+            self.assertEqual(m.weight.grad.data,
+                             torch.cat([m1.weight.grad.data,
+                                        m2.weight.grad.data], 0),
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+
+
+    @onlyCUDA
+    @dtypes(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
+    def test_noncontig_conv_grad(self, device, dtype):
+        # FIXME: remove after adding non-contiguous grad tests for all modules
+        module = nn.Conv2d(3, 5, kernel_size=3, padding=1).to(device, dtype)
+        input = torch.randn(2, 3, 10, 10, dtype=dtype, device=device, requires_grad=True)
+        output = module(input)
+
+        grad = torch.randn(2, 2, 5, 10, 10, dtype=dtype, device=device)[:, 1]
+        assert not grad.is_contiguous()
+        output.backward(grad, retain_graph=True)
+        self.assertIsNotNone(input.grad)
+        result = input.grad.data.clone()
+        input.grad.data.zero_()
+
+        output.backward(grad.contiguous())
+        self.assertEqual(result, input.grad.data, atol=dtype2prec_DONTUSE[dtype], rtol=0)
+
+    @onlyCUDA
+    @dtypes(torch.double)
+    def test_conv_double_backward(self, device, dtype):
+        with torch.backends.cudnn.flags(deterministic=True):
+            # Double backward only runs with DoubleTensor due to precision reason
+            batch_size = 1
+            for kern, inp_size, dilations in [(3, 5, [1, 2]), (4, 9, [1])]:
+                for stride, padding, chan_in, chan_out, dilation in product([1], [2], [2], [3], dilations):
+                    no_weight = stride == 2
+                    result = self.run_conv_double_back_test(kern, stride,
+                                                            padding, chan_in, chan_out,
+                                                            batch_size, inp_size, dilation,
+                                                            no_weight, use_cuda=True, dtype=dtype)
+                    self.assertTrue(result,
+                                    "Conv double backward test failed with parameters:" +
+                                    "\nkern: " + str(kern) +
+                                    "\nstride: " + str(stride) +
+                                    "\npadding: " + str(padding) +
+                                    "\nchan_in: " + str(chan_in) +
+                                    "\nchan_out: " + str(chan_out) +
+                                    "\nbatch_size: " + str(batch_size) +
+                                    "\ninp_size: " + str(inp_size) +
+                                    "\ndilation: " + str(dilation))
+
+
+    def test_conv_double_backward_no_bias(self):
+        kern = 3
+        stride = 2
+        chan_in, chan_out = 2, 4
+        batch_size = 2
+        inp_size = 5
+        padding = 1
+        dilation = 1
+        no_weight = False
+        use_bias = True
+        result = self.run_conv_double_back_test(kern, stride,
+                                                padding, chan_in, chan_out,
+                                                batch_size, inp_size, dilation,
+                                                no_weight, use_bias=use_bias)
+        self.assertTrue(result,
+                        "Conv double backward test failed with parameters:" +
+                        "\nkern: " + str(kern) +
+                        "\nstride: " + str(stride) +
+                        "\npadding: " + str(padding) +
+                        "\nchan_in: " + str(chan_in) +
+                        "\nchan_out: " + str(chan_out) +
+                        "\nbatch_size: " + str(batch_size) +
+                        "\ninp_size: " + str(inp_size) +
+                        "\ndilation: " + str(dilation))
+
+
+    def test_conv_double_backward_groups(self):
+        kern = 3
+        stride = 1
+        padding = 2
+        chan_in, chan_out = 2, 4
+        batch_size = 2
+        inp_size = 6
+        dilation = 1
+        no_weight = False
+        groups = 2
+        result = self.run_conv_double_back_test(kern, stride,
+                                                padding, chan_in * groups, chan_out * groups,
+                                                batch_size, inp_size, dilation,
+                                                no_weight, groups=groups)
+        self.assertTrue(result,
+                        "Conv double backward test failed with parameters:" +
+                        "\nkern: " + str(kern) +
+                        "\nstride: " + str(stride) +
+                        "\npadding: " + str(padding) +
+                        "\nchan_in: " + str(chan_in) +
+                        "\nchan_out: " + str(chan_out) +
+                        "\nbatch_size: " + str(batch_size) +
+                        "\ninp_size: " + str(inp_size) +
+                        "\ndilation: " + str(dilation) +
+                        "\ngroups: " + str(groups))
+
+
+    def test_conv_double_backward_stride(self):
+        batch_size = 2
+
+        # Cannot provide ggW when stride is > 1
+        for kern, inp_size, dilations in [(3, 5, [1, 2]), (3, 7, [1])]:
+            for stride, padding, chan_in, chan_out, dilation in product([2], [0, 1], [1], [2], dilations):
+                no_weight = False
+                self.run_conv_double_back_test(kern, stride,
+                                               padding, chan_in, chan_out,
+                                               batch_size, inp_size, dilation,
+                                               no_weight)
+
+    @dtypes(torch.float, torch.cfloat)
+    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
+    def test_conv1d_same_padding(self, device, dtype):
+        # Test padding='same' outputs the correct shape
+        test_args = [
+            # in_size
+            range(50, 55),
+            # kernel_size
+            [1, 2, 3, 8],
+            # dilation
+            range(1, 4),
+            # stride
+            [1],
+        ]
+        for in_size, k_size, dilation, stride in itertools.product(*test_args):
+            x = torch.rand(1, 1, in_size, device=device, dtype=dtype)
+            y = torch.rand(1, 1, k_size, device=device, dtype=dtype)
+            z = F.conv1d(x, y, padding='same', dilation=dilation, stride=stride)
+            self.assertEqual(z.size(2), int(math.ceil(in_size / stride)))
+
+        # Compare F.conv1d padding='same' output against manual padding
+        # Without strides/dilation
+        x = torch.rand(1, 1, 12, device=device, dtype=dtype)
+        y = torch.rand(1, 1, 3, device=device, dtype=dtype)
+        expect = F.conv1d(x, y, padding=1)
+        actual = F.conv1d(x, y, padding='same')
+        self.assertEqual(expect, actual)
+
+        # With dilation
+        x = torch.rand(1, 1, 12, device=device, dtype=dtype)
+        y = torch.rand(1, 1, 4, device=device, dtype=dtype)
+        expect = F.conv1d(x, y, padding=3, dilation=2)
+        actual = F.conv1d(x, y, padding='same', dilation=2)
+        self.assertEqual(expect, actual)
+
+        # Dilation with asymmetric padding
+        expect = F.conv1d(x, y, padding=5, dilation=3)[..., 1:]
+        actual = F.conv1d(x, y, padding='same', dilation=3)
+        self.assertEqual(expect, actual)
+
+    @dtypes(torch.float, torch.cfloat)
+    def test_conv2d_same_padding(self, device, dtype):
+        if dtype is torch.cfloat:
+            rtol, atol = 2e-6, 2e-6
+        else:
+            rtol, atol = None, None
+        # Compare F.conv2d padding='same' output against manual padding
+        # Without strides/dilation
+        x = torch.rand(1, 1, 10, 11, device=device, dtype=dtype)
+        y = torch.rand(1, 1, 4, 5, device=device, dtype=dtype)
+        expect = F.conv2d(x, y, padding=(2, 2))[..., 1:, :]
+        actual = F.conv2d(x, y, padding='same')
+        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
+
+        # With dilation
+        y = torch.rand(1, 1, 3, 4, device=device, dtype=dtype)
+        expect = F.conv2d(x, y, padding=(2, 3), dilation=2)
+        actual = F.conv2d(x, y, padding='same', dilation=2)
+        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
+
+        # Dilation with asymmetric padding
+        y = torch.rand(1, 1, 4, 4, device=device, dtype=dtype)
+        expect = F.conv2d(x, y, padding=5, dilation=3)[..., 1:, 1:]
+        actual = F.conv2d(x, y, padding='same', dilation=3)
+        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
+
+    @dtypes(torch.float, torch.cfloat)
+    def test_conv3d_same_padding(self, device, dtype):
+        if dtype is torch.cfloat:
+            rtol, atol = 2e-6, 2e-6
+        else:
+            rtol, atol = None, None
+        # Compare F.conv3d padding='same' output against manual padding
+        # Without strides/dilation
+        x = torch.rand(1, 1, 10, 11, 12, device=device, dtype=dtype)
+        y = torch.rand(1, 1, 1, 2, 5, device=device, dtype=dtype)
+        expect = F.conv3d(x, y, padding=(0, 1, 2))[..., :, 1:, :]
+        actual = F.conv3d(x, y, padding='same')
+        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
+
+        # With dilation
+        expect = F.conv3d(x, y, padding=(0, 1, 4), dilation=2)
+        actual = F.conv3d(x, y, padding='same', dilation=2)
+        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
+
+        # Dilation with asymmetric padding
+        y = torch.rand(1, 1, 4, 4, 4, device=device, dtype=dtype)
+        expect = F.conv3d(x, y, padding=5, dilation=3)[..., 1:, 1:, 1:]
+        actual = F.conv3d(x, y, padding='same', dilation=3)
+        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
+
+    @dtypes(torch.float, torch.cfloat)
+    def test_conv1d_valid_padding(self, device, dtype):
+        # Test F.conv1d padding='valid' is the same as no padding
+        x = torch.rand(1, 1, 10, device=device, dtype=dtype)
+        y = torch.rand(1, 1, 4, device=device, dtype=dtype)
+        expect = F.conv1d(x, y)
+        actual = F.conv1d(x, y, padding='valid')
+        self.assertEqual(expect, actual)
+
+    @dtypes(torch.float, torch.cfloat)
+    def test_conv2d_valid_padding(self, device, dtype):
+        # Test F.conv2d padding='valid' is the same as no padding
+        x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype)
+        y = torch.rand(1, 1, 1, 4, device=device, dtype=dtype)
+        expect = F.conv2d(x, y)
+        actual = F.conv2d(x, y, padding='valid')
+        self.assertEqual(expect, actual)
+
+    @dtypes(torch.float, torch.cfloat)
+    def test_conv3d_valid_padding(self, device, dtype):
+        # Test F.conv3d padding='valid' is the same as no padding
+        x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device)
+        y = torch.rand(1, 1, 1, 1, 4, dtype=dtype, device=device)
+        expect = F.conv3d(x, y)
+        actual = F.conv3d(x, y, padding='valid')
+        self.assertEqual(expect, actual)
+
+    @dtypes(torch.float, torch.cfloat)
+    def test_conv1d_same_padding_backward(self, device, dtype):
+        # Test F.conv1d gradients work with padding='same'
+        x = torch.rand(1, 1, 12, dtype=dtype, device=device, requires_grad=True)
+        y = torch.rand(1, 1, 4, dtype=dtype, device=device, requires_grad=True)
+
+        # Symmetric padding
+        z = F.conv1d(x, y, padding=3, dilation=2)
+        z.sum().backward()
+        gx_expect, gy_expect = x.grad, y.grad
+        x.grad, y.grad = None, None
+
+        z = F.conv1d(x, y, padding='same', dilation=2)
+        z.sum().backward()
+        self.assertEqual(gx_expect, x.grad)
+        self.assertEqual(gy_expect, y.grad)
+        x.grad, y.grad = None, None
+
+        # Asymmetric padding
+        z = F.conv1d(x, y, padding=2)[..., 1:]
+        z.sum().backward()
+        gx_expect, gy_expect = x.grad, y.grad
+        x.grad, y.grad = None, None
+
+        z = F.conv1d(x, y, padding='same')
+        z.sum().backward()
+        self.assertEqual(gx_expect, x.grad)
+        self.assertEqual(gy_expect, y.grad)
+
+    @dtypes(torch.float, torch.cfloat)
+    def test_conv2d_same_padding_backward(self, device, dtype):
+        # Test F.conv2d gradients work with padding='same'
+        x = torch.rand(1, 1, 10, 11, device=device, dtype=dtype, requires_grad=True)
+        y = torch.rand(1, 1, 4, 5, device=device, dtype=dtype, requires_grad=True)
+
+        # Symmetric padding
+        z = F.conv2d(x, y, padding=(3, 4), dilation=2)
+        z.sum().backward()
+        gx_expect, gy_expect = x.grad, y.grad
+        x.grad, y.grad = None, None
+
+        z = F.conv2d(x, y, padding='same', dilation=2)
+        z.sum().backward()
+        self.assertEqual(gx_expect, x.grad)
+        self.assertEqual(gy_expect, y.grad)
+        x.grad, y.grad = None, None
+
+        # Asymmetric padding
+        y = torch.rand(1, 1, 4, 4, device=device, dtype=dtype, requires_grad=True)
+        z = F.conv2d(x, y, padding=2)[..., 1:, 1:]
+        z.sum().backward()
+        gx_expect, gy_expect = x.grad, y.grad
+        x.grad, y.grad = None, None
+
+        z = F.conv2d(x, y, padding='same')
+        z.sum().backward()
+        self.assertEqual(gx_expect, x.grad)
+        self.assertEqual(gy_expect, y.grad)
+
+    @dtypes(torch.double, torch.cdouble)
+    def test_conv3d_same_padding_backward(self, device, dtype):
+        check_forward_ad = torch.device(device).type != 'xla'
+
+        # Test F.conv3d gradients work with padding='same'
+        x = torch.rand(1, 1, 1, 11, 12, dtype=dtype, device=device, requires_grad=True)
+        y = torch.rand(1, 1, 1, 2, 5, dtype=dtype, device=device, requires_grad=True)
+
+        # Symmetric padding
+        z = F.conv3d(x, y, padding=(0, 1, 4), dilation=2)
+        z.sum().backward()
+        gx_expect, gy_expect = x.grad, y.grad
+        x.grad, y.grad = None, None
+
+        z = F.conv3d(x, y, padding='same', dilation=2)
+        z.sum().backward()
+        self.assertEqual(gx_expect, x.grad)
+        self.assertEqual(gy_expect, y.grad)
+        x.grad, y.grad = None, None
+
+        gradcheck(lambda x, y: F.conv3d(x, y, padding='same', dilation=2), (x, y),
+                  check_forward_ad=check_forward_ad, nondet_tol=1e-5)
+        if torch.device(device).type != 'cuda':
+            # https://github.com/pytorch/pytorch/issues/70702
+            gradgradcheck(lambda x, y: F.conv3d(x, y, padding='same', dilation=2), (x, y),
+                          check_fwd_over_rev=True)
+
+        # Asymmetric padding
+        y = torch.rand(1, 1, 1, 4, 4, dtype=dtype, device=device, requires_grad=True)
+        z = F.conv3d(x, y, padding=2)[..., 1:, 1:]
+        z.sum().backward()
+        gx_expect, gy_expect = x.grad, y.grad
+        x.grad, y.grad = None, None
+
+        z = F.conv3d(x, y, padding='same')
+        z.sum().backward()
+        self.assertEqual(gx_expect, x.grad)
+        self.assertEqual(gy_expect, y.grad)
+
+        gradcheck(lambda x, y: F.conv3d(x, y, padding='same'), (x, y),
+                  check_forward_ad=check_forward_ad, nondet_tol=1e-5)
+        if torch.device(device).type != 'cuda':
+            # https://github.com/pytorch/pytorch/issues/70702
+            gradgradcheck(lambda x, y: F.conv3d(x, y, padding='same'), (x, y),
+                          check_fwd_over_rev=True)
+
+    @dtypes(torch.float, torch.cfloat)
+    def test_conv1d_valid_padding_backward(self, device, dtype):
+        # Test F.conv1d gradients work with padding='valid'
+        x = torch.rand(1, 1, 10, dtype=dtype, device=device, requires_grad=True)
+        y = torch.rand(1, 1, 4, dtype=dtype, device=device, requires_grad=True)
+        F.conv1d(x, y, padding=0).sum().backward()
+        gx_expect, gy_expect = x.grad, y.grad
+        x.grad, y.grad = None, None
+
+        F.conv1d(x, y, padding='valid').sum().backward()
+        gx_actual, gy_actual = x.grad, y.grad
+        self.assertEqual(gx_expect, gx_actual)
+        self.assertEqual(gy_expect, gy_actual)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.")
+    @dtypes(torch.float, torch.cfloat)
+    @parametrize_test("mode", ('valid', 'same'))
+    def test_conv1d_vs_scipy(self, device, dtype, mode):
+        t = make_tensor((1, 10), device=device, dtype=dtype)
+        feat_dim = t.shape[1]
+        weight_even = make_tensor((1, 1, 4), device=device, dtype=dtype)
+        weight_odd = make_tensor((1, 1, 5), device=device, dtype=dtype)
+
+        def _test(t, weight, mode):
+            # SciPy expects two 1-D inputs.
+            t_a = t.view(-1).cpu().numpy()
+            w_a = weight.view(-1).cpu().numpy()
+            expected = scipy.signal.convolve(t_a, w_a, mode=mode)
+
+            kwargs = {'padding': mode}
+            if mode == 'same':
+                # `same` padding in PyTorch conv1d is different
+                # from SciPy
+                p = weight.shape[2] // 2
+                t = torch.nn.functional.pad(t, (p, p))
+                # We have already taken care of padding
+                kwargs.pop("padding")
+
+            # second input is flipped in SciPy's convolve
+            weight_flipped = torch.flip(weight, (2,))
+            actual = torch.nn.functional.conv1d(t, weight_flipped, **kwargs).squeeze(0)
+            if mode == 'same':
+                actual = actual[:feat_dim]
+
+            self.assertEqual(actual, expected)
+
+        # Global dtype for this test suite is torch.double
+        # This leads to change in type-promotion
+        # and conv1d outputs `complex128` for `complex64` input.
+        with set_default_dtype(torch.float):
+            _test(t, weight_even, mode)
+            _test(t, weight_odd, mode)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.")
+    @dtypes(torch.float, torch.cfloat)
+    @parametrize_test("mode", ('valid', 'same'))
+    def test_conv2d_vs_scipy(self, device, dtype, mode):
+        t = make_tensor((1, 5, 10), device=device, dtype=dtype)
+        weight_even = make_tensor((1, 1, 2, 4), device=device, dtype=dtype)
+        weight_odd = make_tensor((1, 1, 3, 5), device=device, dtype=dtype)
+
+        def _test(t, weight, mode):
+            # SciPy expects two 2-D inputs.
+            t_a = t.squeeze(0).cpu().numpy()
+            w_a = weight.squeeze(0).squeeze(0).cpu().numpy()
+            expected = scipy.signal.convolve2d(t_a, w_a, mode=mode)
+
+            kwargs = {'padding': mode}
+            if mode == 'same':
+                # `same` padding in PyTorch conv2d is different
+                # from SciPy
+                left_right_pad = weight.shape[3] // 2
+                top_bottom_pad = weight.shape[2] // 2
+                p = (left_right_pad, left_right_pad, top_bottom_pad, top_bottom_pad)
+                t = torch.nn.functional.pad(t, p)
+                # We have already taken care of padding
+                kwargs.pop("padding")
+
+            # second input is flipped in SciPy's convolve2d
+            weight_flipped = torch.flip(weight, (2, 3))
+            actual = torch.nn.functional.conv2d(t, weight_flipped, **kwargs).squeeze(0)
+            if mode == 'same':
+                actual = actual[:5, :10]
+
+            self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6)
+
+        # Global dtype for this test suite is torch.double
+        # This leads to change in type-promotion
+        # and conv1d outputs `complex128` for `complex64` input.
+        with set_default_dtype(torch.float):
+            _test(t, weight_even, mode)
+            _test(t, weight_odd, mode)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.")
+    @dtypes(torch.float, torch.cfloat)
+    @parametrize_test("mode", ('valid', 'same'))
+    def test_conv3d_vs_scipy(self, device, dtype, mode):
+        t = make_tensor((1, 5, 5, 10), device=device, dtype=dtype)
+        weight_even = make_tensor((1, 1, 2, 2, 4), device=device, dtype=dtype)
+        weight_odd = make_tensor((1, 1, 2, 3, 5), device=device, dtype=dtype)
+
+        def _test(t, weight, mode):
+            # SciPy expects two 3-D inputs.
+            t_a = t.squeeze(0).cpu().numpy()
+            w_a = weight.squeeze(0).squeeze(0).cpu().numpy()
+            expected = scipy.signal.convolve(t_a, w_a, mode=mode)
+
+            kwargs = {'padding': mode}
+            if mode == 'same':
+                # `same` padding in PyTorch conv3d is different
+                # from SciPy
+                left_right_pad = weight.shape[4] // 2
+                top_bottom_pad = weight.shape[3] // 2
+                front_back_pad = weight.shape[2] // 2
+                p = (left_right_pad, left_right_pad, top_bottom_pad, top_bottom_pad,
+                     front_back_pad, front_back_pad)
+                t = torch.nn.functional.pad(t, p)
+                # We have already taken care of padding
+                kwargs.pop("padding")
+
+            # second input is flipped in SciPy's convolve
+            weight_flipped = torch.flip(weight, (2, 3, 4))
+            actual = torch.nn.functional.conv3d(t, weight_flipped, **kwargs).squeeze(0)
+            if mode == 'same':
+                actual = actual[:5, :5, :10]
+
+            if tf32_is_not_fp32() and (dtype == torch.float or dtype == torch.complex64):
+                self.assertEqual(actual, expected, atol=0.05, rtol=0.05)
+            else:
+                self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6)
+
+        # Global dtype for this test suite is torch.double
+        # This leads to change in type-promotion
+        # and conv1d outputs `complex128` for `complex64` input.
+        with set_default_dtype(torch.float):
+            _test(t, weight_even, mode)
+            _test(t, weight_odd, mode)
+
+    @dtypes(torch.float, torch.complex64)
+    def test_conv2d_valid_padding_backward(self, device, dtype):
+        # Test F.conv2d gradients work with padding='valid'
+        x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype, requires_grad=True)
+        y = torch.rand(1, 1, 1, 4, device=device, dtype=dtype, requires_grad=True)
+        F.conv2d(x, y, padding=0).sum().backward()
+        gx_expect, gy_expect = x.grad, y.grad
+        x.grad, y.grad = None, None
+
+        F.conv2d(x, y, padding='valid').sum().backward()
+        gx_actual, gy_actual = x.grad, y.grad
+        self.assertEqual(gx_expect, gx_actual)
+        self.assertEqual(gy_expect, gy_actual)
+
+    @dtypes(torch.double, torch.cdouble)
+    def test_conv3d_valid_padding_backward(self, device, dtype):
+        check_forward_ad = torch.device(device).type != 'xla'
+
+        # Test F.conv3d gradients work with padding='valid'
+        x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device, requires_grad=True)
+        y = torch.rand(1, 1, 1, 1, 4, dtype=dtype, device=device, requires_grad=True)
+        F.conv3d(x, y, padding=0).sum().backward()
+        gx_expect, gy_expect = x.grad, y.grad
+        x.grad, y.grad = None, None
+
+        F.conv3d(x, y, padding='valid').sum().backward()
+        gx_actual, gy_actual = x.grad, y.grad
+        self.assertEqual(gx_expect, gx_actual)
+        self.assertEqual(gy_expect, gy_actual)
+
+        gradcheck(lambda x, y: F.conv3d(x, y, padding='valid'), (x, y), check_forward_ad=check_forward_ad)
+        gradgradcheck(lambda x, y: F.conv3d(x, y, padding='valid'), (x, y), check_fwd_over_rev=check_forward_ad)
+
+    @parametrize_test("N", range(2, 4), name_fn=lambda N: 'ConvTranspose{}d'.format(N))
+    def test_conv_transpose_with_output_size_and_no_batch_dim(self, device, N):
+        # For inputs with no batch dim, verify output is the correct shape when output_size is set.
+        # See https://github.com/pytorch/pytorch/issues/75889
+        inp = torch.randn((1, 15, 13) if N == 2 else (1, 15, 13, 13), device=device)
+        output_size = (1, 240, 200) if N == 2 else (1, 240, 200, 200)
+        ConvTransposeNd = getattr(nn, 'ConvTranspose{}d'.format(N))
+        m = ConvTransposeNd(1, 1, kernel_size=16, stride=16, padding=7, bias=False, device=device)
+        output = m(inp, output_size=output_size)
+        self.assertEqual(output.shape, output_size)
+
+    @skipMeta
+    @parametrize_test("input_shape,transposed,dilated,groups,layout,backend_expected", [
+        # === slow ===
+        subtest(((2, 6, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Slow2d),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow1d'),
+        subtest(((2, 6, 7), True, False, 3, torch.strided, torch._C._ConvBackend.SlowTranspose2d),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow1d_transposed'),
+        subtest(((2, 6, 7), False, True, 3, torch.strided, torch._C._ConvBackend.SlowDilated2d),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow1d_dilated'),
+        subtest(((2, 6, 7), True, True, 3, torch.strided, torch._C._ConvBackend.SlowTranspose2d),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow1d_dilated_transposed'),
+        subtest(((2, 6, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Slow2d),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow2d'),
+        subtest(((2, 6, 7, 8), True, False, 3, torch.strided, torch._C._ConvBackend.SlowTranspose2d),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow2d_transposed'),
+        subtest(((2, 6, 7, 8), False, True, 3, torch.strided, torch._C._ConvBackend.SlowDilated2d),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow2d_dilated'),
+        subtest(((2, 6, 7, 8), True, True, 3, torch.strided, torch._C._ConvBackend.SlowTranspose2d),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow2d_dilated_transposed'),
+        subtest(((2, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Slow3d),
+                decorators=[onlyCPU, disableMkldnn], name='slow3d_cpu'),
+        # CUDA doesn't have a slow 3D implementation, so it goes to the dilated 3D implementation instead
+        subtest(((2, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.SlowDilated3d),
+                decorators=[onlyCUDA, disablecuDNN], name='slow3d_cuda'),
+        # FIXME: RuntimeError: CUDA out of memory.
+        # subtest(((2, 6, 7, 8, 9), True, False, 3, torch.strided, torch._C._ConvBackend.SlowTranspose3d),
+        #         decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow3d_transposed'),
+        subtest(((2, 6, 7, 8, 9), False, True, 3, torch.strided, torch._C._ConvBackend.SlowDilated3d),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow3d_dilated'),
+        # FIXME: RuntimeError: CUDA out of memory.
+        # subtest(((2, 6, 7, 8, 9), True, True, 3, torch.strided, torch._C._ConvBackend.SlowTranspose3d),
+        #         decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow3d_dilated_transposed'),
+        subtest(((0, 6, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch1d'),
+        subtest(((2, 0, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_channel1d'),
+        subtest(((0, 0, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch_channel1d'),
+        subtest(((0, 6, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch2d'),
+        subtest(((2, 0, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_channel2d'),
+        subtest(((0, 0, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch_channel2d'),
+        subtest(((0, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch3d'),
+        subtest(((2, 0, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_channel3d'),
+        subtest(((0, 0, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
+                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch_channel3d'),
+        # === cuda ===
+        # Note that disablecuDNN disables miopen as well.
+        subtest(((2, 6, 7), False, False, 6, torch.strided, torch._C._ConvBackend.CudaDepthwise2d),
+                decorators=[onlyCUDA, disablecuDNN], name='cuda_depthwise1d'),
+        subtest(((2, 6, 7, 8), False, False, 6, torch.strided, torch._C._ConvBackend.CudaDepthwise2d),
+                decorators=[onlyCUDA, disablecuDNN], name='cuda_depthwise2d'),
+        subtest(((2, 6, 7, 8, 9), False, False, 6, torch.strided, torch._C._ConvBackend.CudaDepthwise3d),
+                decorators=[onlyCUDA, disablecuDNN], name='cuda_depthwise3d'),
+        # === cudnn ===
+        subtest(((2, 6, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Cudnn),
+                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn1d'),
+        subtest(((2, 6, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Cudnn),
+                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn2d'),
+        subtest(((2, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Cudnn),
+                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn3d'),
+        subtest(((2, 6, 7), True, False, 3, torch.strided, torch._C._ConvBackend.CudnnTranspose),
+                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn1d_transposed'),
+        subtest(((2, 6, 7, 8), True, False, 3, torch.strided, torch._C._ConvBackend.CudnnTranspose),
+                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn2d_transposed'),
+        # FIXME: RuntimeError: CUDA out of memory.
+        # subtest(((2, 6, 7, 8, 9), True, False, 3, torch.strided, torch._C._ConvBackend.CudnnTranspose),
+        #         decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn3d_transposed'),
+        # === miopen ===
+        subtest(((2, 6, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Miopen),
+                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen1d'),
+        subtest(((2, 6, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Miopen),
+                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen2d'),
+        subtest(((2, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Miopen),
+                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen3d'),
+        subtest(((2, 6, 7), True, False, 3, torch.strided, torch._C._ConvBackend.MiopenTranspose),
+                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen1d_transposed'),
+        subtest(((2, 6, 7, 8), True, False, 3, torch.strided, torch._C._ConvBackend.MiopenTranspose),
+                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen2d_transposed'),
+        subtest(((2, 6, 7, 8, 9), True, False, 3, torch.strided, torch._C._ConvBackend.MiopenTranspose),
+                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen3d_transposed'),
+        subtest(((2, 6, 7), False, False, 6, torch.strided, torch._C._ConvBackend.MiopenDepthwise),
+                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen_depthwise1d'),
+        subtest(((2, 6, 7, 8), False, False, 6, torch.strided, torch._C._ConvBackend.MiopenDepthwise),
+                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen_depthwise2d'),
+        subtest(((2, 6, 7, 8, 9), False, False, 6, torch.strided, torch._C._ConvBackend.MiopenDepthwise),
+                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen_depthwise3d'),
+        # === mkldnn ===
+        subtest(((2, 6, 7), False, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn1d'),
+        subtest(((2, 6, 7, 8), False, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn2d'),
+        subtest(((2, 6, 7, 8, 9), False, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn3d'),
+        # Transposed convolution is broken for mkldnn. See https://github.com/pytorch/pytorch/issues/68775.
+        subtest(((2, 6, 7), True, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn, unittest.expectedFailure], name='mkldnn1d_transposed'),
+        subtest(((2, 6, 7, 8), True, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn, unittest.expectedFailure], name='mkldnn2d_transposed'),
+        subtest(((2, 6, 7, 8, 9), True, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn, unittest.expectedFailure], name='mkldnn3d_transposed'),
+        subtest(((2, 6, 7), False, True, 3, torch.strided, torch._C._ConvBackend.Mkldnn),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn1d_cpu_input'),
+        subtest(((2, 6, 7, 8), False, True, 3, torch.strided, torch._C._ConvBackend.Mkldnn),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn2d_cpu_input'),
+        subtest(((2, 6, 7, 8, 9), False, True, 3, torch.strided, torch._C._ConvBackend.Mkldnn),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn3d_cpu_input'),
+        subtest(((0, 6, 7), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch1d'),
+        subtest(((2, 0, 7), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_channel1d'),
+        subtest(((0, 0, 7), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch_channel1d'),
+        subtest(((0, 6, 7, 8), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch2d'),
+        subtest(((2, 0, 7, 8), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_channel2d'),
+        subtest(((0, 0, 7, 8), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch_channel2d'),
+        subtest(((0, 6, 7, 8, 9), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch3d'),
+        subtest(((2, 0, 7, 8, 9), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_channel3d'),
+        subtest(((0, 0, 7, 8, 9), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
+                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch_channel3d'),
+        # Note: Tests for mobile backends are not currently supported. This comprises
+        # NnpackSpatial, Winograd3x3Depthwise, and Xnnpack2d backends. Testing these
+        # requires the ability to gate tests by whether PyTorch is built with USE_MOBILE=1.
+    ])
+    # Test with both bias and no bias.
+    @parametrize_test("has_bias", [False, True])
+    # Test with both stride=1 and stride>1 cases.
+    @parametrize_test("strided", [False, True])
+    # Test with both contiguous and non-contiguous inputs.
+    @parametrize_test("contiguous", [False, True])
+    def test_conv_backend(
+            self, device, input_shape, has_bias, strided, contiguous, transposed, dilated, groups,
+            layout, backend_expected):
+        # Build up inputs.
+        dtype = torch.float32
+        C_in, C_out, dim, kernel_size = input_shape[1], 12, len(input_shape) - 2, 3
+        x = torch.randn(*input_shape, device=device, dtype=dtype, requires_grad=True)
+        weight = torch.randn(C_in if transposed else C_out,
+                             C_out // groups if transposed else C_in // groups,
+                             *[kernel_size for _ in range(dim)],
+                             device=device, dtype=dtype, requires_grad=True)
+        bias = torch.randn(C_out, device=device, dtype=dtype, requires_grad=True) if has_bias else None
+
+        def _make_noncontiguous(inp):
+            if inp is None:
+                return None
+            old_requires_grad = inp.requires_grad
+            inp = torch.repeat_interleave(inp, 2, dim=-1)
+            inp = inp[..., ::2].detach().requires_grad_(old_requires_grad)
+            return inp
+
+        if not contiguous:
+            x = _make_noncontiguous(x)
+            weight = _make_noncontiguous(weight)
+            bias = _make_noncontiguous(bias)
+
+        if layout is torch._mkldnn:
+            x = x.to_mkldnn()
+            # Note that weight and bias are not supported as mkldnn tensors during training.
+
+        stride = (2,) * dim if strided else (1,) * dim
+        padding = (0,) * dim
+        dilation = (2,) * dim if dilated else (1,) * dim
+        output_padding = (0,) * dim
+        inputs = [x, weight, bias, stride, padding, dilation, transposed, output_padding, groups]
+
+        # Ensure correct backend is selected.
+        backend_actual = torch._C._select_conv_backend(*inputs)
+        self.assertEqual(backend_actual, backend_expected)
+
+        # Ensure backward call succeeds.
+        convolution = torch.ops.aten.convolution
+        output = convolution(*inputs)
+        grad_output = torch.randn(output.shape, device=device, dtype=dtype)
+        if not contiguous:
+            grad_output = _make_noncontiguous(grad_output)
+        if layout is torch._mkldnn:
+            grad_output = grad_output.to_mkldnn()
+        output.backward(grad_output)
+
+        # mkldnn doesn't support gradcheck :(
+        if layout is torch._mkldnn:
+            return
+
+        if backend_actual != torch._C._ConvBackend.Empty:  # FIXME: forward AD fails
+            # Forward AD and forward-over-reverse AD smoke test in float32
+            # TODO: remove this if we introduce per-op gradient tests for float32
+            with fwAD.dual_level():
+                dual_inputs = [(fwAD.make_dual(i, torch.rand_like(i)) if isinstance(i, torch.Tensor) else i) for i in inputs]
+                # Forward AD
+                output = convolution(*dual_inputs)
+                # Forward over reverse AD
+                grad_output_d = fwAD.make_dual(torch.rand_like(output), torch.rand_like(output))
+                if has_bias:
+                    torch.autograd.grad(output, [x, weight, bias], grad_output_d)
+                else:
+                    torch.autograd.grad(output, [x, weight], grad_output_d)
+
+        # Convert to float64 for gradcheck.
+        x = x.to(torch.float64).detach().requires_grad_(True)
+        weight = weight.to(torch.float64).detach().requires_grad_(True)
+        if bias is not None:
+            bias = bias.to(torch.float64).detach().requires_grad_(True)
+        inputs = [x, weight, bias, stride, padding, dilation, transposed, output_padding, groups]
+
+        # Set some backend-specific validation settings.
+        gradcheck_nondet_tol = 0.0
+        if torch.backends.cudnn.is_available():
+            # cuDNN introduces non-determinism
+            gradcheck_nondet_tol = GRADCHECK_NONDET_TOL
+
+        self.assertTrue(gradcheck(convolution, inputs, nondet_tol=gradcheck_nondet_tol))
+
+        # double backward doesn't support bias gradients
+        if bias is not None:
+            bias.requires_grad_(False)
+        self.assertTrue(gradgradcheck(convolution, inputs, nondet_tol=gradcheck_nondet_tol))
+
+
+    @onlyCPU
+    def test_conv_contiguous_for_oneDNN(self):
+        # See https://github.com/pytorch/pytorch/issues/80837.
+        for dtype in [torch.float, torch.bfloat16]:
+            conv = nn.Conv2d(
+                1,
+                128,
+                kernel_size=(5, 2),
+                stride=(2, 1),
+                padding=(0, 1),
+                dilation=(1, 1),
+                groups=1,
+                bias=True,
+                padding_mode='zeros').to(dtype=dtype)
+
+            x = torch.rand([1, 2, 321, 201, 1]).to(dtype=dtype)
+            x = torch.transpose(x, 1, 4)
+            x2 = x[..., 0]
+            inputs = [x2, conv.weight, conv.bias, (2, 1), (0, 1), (1, 1), False, (0, 1), 1]
+            if torch.backends.mkldnn.is_available():
+                y = conv(x2)
+                # Disable MKLDNN explicitly
+                with torch.backends.mkldnn.flags(enabled=False):
+                    y_ = conv(x2)
+                    self.assertEqual(y, y_)
+
+    @onlyCPU
+    def test_conv_ic1_channels_last_for_oneDNN(self):
+        # See https://github.com/pytorch/pytorch/issues/82060, N > 1 will call in OneDNN path.
+        for dtype in [torch.float, torch.bfloat16]:
+            conv = torch.nn.Conv2d(1, 64, kernel_size=(3, 3), padding=(1, 1), bias=False)
+            conv = conv.to(memory_format=torch.channels_last).to(dtype=dtype)
+            x = torch.rand(2, 1, 100, 100).to(dtype=dtype)
+            if torch.backends.mkldnn.is_available():
+                y = conv(x)
+                # Disable MKLDNN explicitly
+                with torch.backends.mkldnn.flags(enabled=False):
+                    y_ = conv(x)
+                    self.assertEqual(y, y_)
+
+    @dtypes(torch.float, torch.cfloat)
+    def test_conv_empty_channel(self, device, dtype):
+        in_channels = 0
+        mod = torch.nn.Conv1d(in_channels, 8, 2, stride=2, dtype=dtype).to(device)
+        inp = torch.randn(2, 0, 15, device=device, dtype=dtype)
+        _test_module_empty_input(self, mod, inp, check_size=False)
+
+        with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"):
+            inp = torch.randn(2, 1, 0, device=device, dtype=dtype)
+            mod(inp)
+
+        mod = torch.nn.Conv2d(in_channels, 33, 3, stride=2, dtype=dtype).to(device)
+        inp = torch.randn(2, 0, 50, 100, device=device, dtype=dtype)
+        _test_module_empty_input(self, mod, inp, check_size=False)
+
+        with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"):
+            inp = torch.randn(2, 1, 40, 0, device=device, dtype=dtype)
+            mod(inp)
+
+        mod = torch.nn.Conv3d(in_channels, 33, 3, stride=2, dtype=dtype).to(device)
+        inp = torch.randn(2, 0, 50, 20, 40, device=device, dtype=dtype)
+        _test_module_empty_input(self, mod, inp, check_size=False)
+
+        with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"):
+            inp = torch.randn(2, 1, 50, 0, 40, device=device, dtype=dtype)
+            mod(inp)
+
+    def test_group_conv_empty(self, device):
+        mod = torch.nn.Conv2d(4, 4, stride=2, kernel_size=3, padding=1, groups=4).to(device)
+        inp = torch.randn(0, 4, 4, 4, device=device)
+        _test_module_empty_input(self, mod, inp, check_size=False)
+        if self.device_type == 'cuda' and self.has_cudnn():
+            with torch.backends.cudnn.flags(enabled=False):
+                _test_module_empty_input(self, mod, inp, check_size=False)
+
+    def test_group_convTranspose_empty(self, device):
+        mod = torch.nn.ConvTranspose2d(4, 4, stride=2, kernel_size=3, padding=1, groups=4).to(device)
+        inp = torch.randn(0, 4, 4, 4, device=device)
+        _test_module_empty_input(self, mod, inp, check_size=False)
+        if self.device_type == 'cuda' and self.has_cudnn():
+            with torch.backends.cudnn.flags(enabled=False):
+                _test_module_empty_input(self, mod, inp, check_size=False)
+
+    def test_convTranspose_empty(self, device):
+        mod = torch.nn.ConvTranspose2d(4, 4, stride=2, kernel_size=3, padding=1).to(device)
+        inp = torch.randn(0, 4, 4, 4, device=device)
+        _test_module_empty_input(self, mod, inp, check_size=False)
+        if self.device_type == 'cuda' and self.has_cudnn():
+            with torch.backends.cudnn.flags(enabled=False):
+                _test_module_empty_input(self, mod, inp, check_size=False)
+
+    @onlyCUDA
+    @largeTensorTest('12GB')
+    def test_conv_large_nosplit(self, device):
+        # Here we just test the convolution correctly route to the fallback implementation
+        # that is, it does not crash. The correctness of fallback implementation should be
+        # covered in other tests
+        dtype = torch.half if self.device_type == 'cuda' else torch.float
+        conv1 = nn.Conv2d(2, 2, 8, 8).to(device).to(dtype)
+        input_large = torch.randn(1, 2, 1024, 1024 * 1024, dtype=dtype, device=device)
+        conv1(input_large)
+        conv2 = torch.nn.Conv2d(1, 1024, 1, 1).to(device).to(dtype)
+        input_large = torch.randn(1, 1, 2048, 1024 , dtype=dtype, device=device)
+        conv2(input_large)
+
+    def test_conv_noncontig_weights(self, device):
+        for dim in (1, 2, 3):
+            for grouped in (False, True):
+                nc = 3
+                groups = 3 if grouped else 1
+                w = torch.randn([3] * dim, device=device)
+                w = w.expand([nc, int(nc / groups)] + list(w.shape))
+                w = w.detach().requires_grad_()
+                x = torch.randn([1, nc] + ([5] * dim), device=device, requires_grad=True)
+                y = getattr(F, 'conv{}d'.format(dim))(x, w, groups=groups)
+                y.sum().backward()
+                y = getattr(F, 'conv_transpose{}d'.format(dim))(x, w, groups=groups)
+                y.sum().backward()
+
+    def test_conv_noncontig_weights_and_bias(self, device):
+        # need floats to exercise https://github.com/pytorch/pytorch/issues/16018
+        for bias in [True, False]:
+            conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
+                              bias=bias).to(device, torch.float)
+
+            input_nc = torch.randn((1, 3, 224, 224, 2), device=device, dtype=torch.float)[:, :, :, :, 1]
+            input_c = input_nc.contiguous()
+
+            weight_nc = torch.randn((64, 3, 7, 7, 2), device=device, dtype=torch.float)[:, :, :, :, 1]
+            conv1.weight = nn.Parameter(weight_nc)
+            weight_c = conv1.weight.contiguous()
+
+            if bias:
+                bias_nc = torch.randn((64, 2), device=device, dtype=torch.float)[:, 1]
+                conv1.bias = nn.Parameter(bias_nc)
+                bias_c = conv1.bias.contiguous()
+
+            out1 = conv1(input_nc)
+            conv1.weight = nn.Parameter(weight_c)
+            if bias:
+                conv1.bias = nn.Parameter(bias_c)
+            out2 = conv1(input_c)
+            self.assertEqual(out1, out2)
+
+    @onlyCUDA
+    @largeTensorTest('12GB')
+    def test_conv_transposed_large(self, device):
+        dtype = torch.half if self.device_type == 'cuda' else torch.float
+        conv = nn.ConvTranspose2d(1, 1, 1, 1, bias=False).to(device).to(dtype)
+        input_large = torch.randn(4096, 1, 512, 1024, dtype=dtype, device=device)
+        # forward
+        ret = conv(input_large)
+        maxdiff0 = (ret.narrow(0, 0, 1024) - conv(input_large.narrow(0, 0, 1024))).abs_().max().item()
+        maxdiff1 = (ret.narrow(0, 1024, 1024) - conv(input_large.narrow(0, 1024, 1024))).abs_().max().item()
+        maxdiff2 = (ret.narrow(0, 2048, 1024) - conv(input_large.narrow(0, 2048, 1024))).abs_().max().item()
+        maxdiff3 = (ret.narrow(0, 3072, 1024) - conv(input_large.narrow(0, 3072, 1024))).abs_().max().item()
+        if self.device_type == 'cuda':
+            # cuDNN may use algorithms such as FFT that don't guarantee a diff of 0
+            self.assertEqual(maxdiff0, 0, atol=2e-3, rtol=1e-5)
+            self.assertEqual(maxdiff1, 0, atol=2e-3, rtol=1e-5)
+            self.assertEqual(maxdiff2, 0, atol=2e-3, rtol=1e-5)
+            self.assertEqual(maxdiff3, 0, atol=2e-3, rtol=1e-5)
+        else:
+            self.assertEqual(maxdiff0, 0)
+            self.assertEqual(maxdiff1, 0)
+            self.assertEqual(maxdiff2, 0)
+            self.assertEqual(maxdiff3, 0)
+
+    @onlyCUDA
+    @skipCUDAIfRocm
+    @largeTensorTest('12GB')
+    def test_conv_large(self, device):
+        dtype = torch.half if self.device_type == 'cuda' else torch.float
+        conv = nn.Conv2d(2, 2, 8, 8, bias=False).to(device).to(dtype)
+        input_large = torch.randn(4097, 2, 512, 512, dtype=dtype, device=device)
+        # forward
+        ret = conv(input_large)
+        self.assertEqual(ret[:2048], conv(input_large[:2048]))
+        self.assertEqual(ret[2048:4096], conv(input_large[2048:4096]))
+        self.assertEqual(ret[4096:], conv(input_large[4096:]))
+
+        # backward
+        conv.zero_grad()
+        # When computing the backward, we are using the `max(dim=1)`` to create
+        # some sparsity. Without this sparsity, the rounding error would be
+        # too large (as large as 1e-5) to satisfy the creterion (1e-6) of `assertEqual`
+        ret.view(4097, -1).max(dim=1).values.sum().backward()
+        del ret
+        grad1 = conv.weight.grad.detach().clone()
+        conv.zero_grad()
+        conv(input_large[:2048]).view(2048, -1).max(dim=1).values.sum().backward()
+        conv(input_large[2048:4096]).view(2048, -1).max(dim=1).values.sum().backward()
+        conv(input_large[4096:]).view(1, -1).max(dim=1).values.sum().backward()
+        grad2 = conv.weight.grad.detach().clone()
+        # gradients are at the order of hundreds, we need to scale it to
+        # the order of one so that we can compare
+        scale = 1 / grad2.abs().mean()
+        grad1 = grad1 * scale
+        grad2 = grad2 * scale
+        self.assertEqual(grad1, grad2, atol=5e-2, rtol=5e-3)
+
+    @onlyCUDA
+    @skipCUDAIfNoCudnn
+    def test_contig_wrong_stride_cudnn(self, device):
+        # x has to have batch_size 1 to test contiguous checks
+        x = torch.randn(1, 16, 5, 5, device=device)
+        stride = list(x.stride())
+        stride[0] = 20
+        # change the stride in dimension 0. the tensor is still contiguous because size[0] is 1
+        x.set_(x.storage(), 0, x.size(), stride)
+        self.assertTrue(x.is_contiguous())
+        F.conv_transpose2d(x, torch.randn(16, 1, 1, 1, device=device))
+        F.conv2d(x, torch.randn(1, 16, 1, 1, device=device))
+
+    @onlyCUDA
+    def test_Conv2d_size_1_kernel(self, device):
+        x_cpu = torch.randn(2, 3, 5, 5)
+        conv_cpu = torch.nn.Conv2d(3, 3, kernel_size=1)
+        y_cpu = conv_cpu(x_cpu)
+        y = torch.rand_like(y_cpu)
+        y_cpu.backward(y)
+
+        with cudnn.flags(enabled=False):
+            conv_cuda = torch.nn.Conv2d(3, 3, kernel_size=1).to(device)
+            conv_cuda.bias.data.copy_(conv_cpu.bias.data)
+            conv_cuda.weight.data.copy_(conv_cpu.weight.data)
+            y_cuda = conv_cuda(x_cpu.to(device))
+            y_cuda.backward(y.to(device))
+
+        self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False)
+        self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False)
+        self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False)
+
+    @onlyCUDA
+    def test_ConvTranspose2d_size_1_kernel(self, device):
+        x_cpu = torch.randn(2, 3, 5, 5)
+        conv_cpu = torch.nn.ConvTranspose2d(3, 3, kernel_size=1)
+        y_cpu = conv_cpu(x_cpu)
+        y = torch.rand_like(y_cpu)
+        y_cpu.backward(y)
+
+        with cudnn.flags(enabled=False):
+            conv_cuda = torch.nn.ConvTranspose2d(3, 3, kernel_size=1).to(device)
+            conv_cuda.bias.data.copy_(conv_cpu.bias.data)
+            conv_cuda.weight.data.copy_(conv_cpu.weight.data)
+            y_cuda = conv_cuda(x_cpu.to(device))
+            y_cuda.backward(y.to(device))
+
+        self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False)
+        self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False)
+        self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False)
+
+    @onlyCUDA
+    def test_ConvTranspose3d_size_1_kernel(self, device):
+        with set_default_dtype(torch.double):
+            x_cpu = torch.randn(2, 3, 3, 5, 5)
+            conv_cpu = torch.nn.ConvTranspose3d(3, 3, kernel_size=1)
+            y_cpu = conv_cpu(x_cpu)
+            y = torch.rand_like(y_cpu)
+            y_cpu.backward(y)
+
+            with cudnn.flags(enabled=False):
+                conv_cuda = torch.nn.ConvTranspose3d(3, 3, kernel_size=1).to(device)
+                conv_cuda.bias.data.copy_(conv_cpu.bias.data)
+                conv_cuda.weight.data.copy_(conv_cpu.weight.data)
+                y_cuda = conv_cuda(x_cpu.to(device))
+                y_cuda.backward(y.to(device))
+
+            self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False)
+            self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False)
+            self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False)
+
+    @dtypesIfCUDA(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
+    @dtypes(torch.float)
+    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
+    def test_Conv2d_naive_groups(self, device, dtype):
+        # Check that grouped convolutions matches two half convolutions
+        m = nn.Conv2d(4, 4, kernel_size=3, groups=2).to(device, dtype)
+        i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
+        output = m(i)
+        grad_output = torch.randn(2, 4, 4, 4, device=device, dtype=dtype)
+        output.backward(grad_output)
+
+        m1 = nn.Conv2d(2, 2, kernel_size=3).to(device, dtype)
+        m1.weight.data.copy_(m.weight.data[:2])
+        m1.bias.data.copy_(m.bias.data[:2])
+        i1 = i.data[:, :2].contiguous().requires_grad_(True)
+        output1 = m1(i1)
+        output1.backward(grad_output[:, :2].contiguous())
+
+        m2 = nn.Conv2d(2, 2, kernel_size=3).to(device, dtype)
+        m2.weight.data.copy_(m.weight.data[2:])
+        m2.bias.data.copy_(m.bias.data[2:])
+        i2 = i.data[:, 2:].contiguous().requires_grad_(True)
+        output2 = m2(i2)
+        output2.backward(grad_output[:, 2:].contiguous())
+
+        self.assertEqual(output, torch.cat([output1, output2], 1))
+        self.assertEqual(i.grad.data,
+                         torch.cat([i1.grad.data, i2.grad.data], 1),
+                         atol=dtype2prec_DONTUSE[dtype], rtol=0)
+        self.assertEqual(m.bias.grad.data,
+                         torch.cat([m1.bias.grad.data, m2.bias.grad.data], 0),
+                         atol=dtype2prec_DONTUSE[dtype], rtol=0)
+        self.assertEqual(m.weight.grad.data,
+                         torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
+                         atol=dtype2prec_DONTUSE[dtype], rtol=0)
+
+    @dtypes(torch.double, torch.cdouble)
+    def test_Conv2d_backward_depthwise(self, device, dtype):
+        x = torch.randn(2, 2, 4, 20, device=device, dtype=dtype, requires_grad=True)
+        weight = torch.randn(2, 1, 3, 5, device=device, dtype=dtype, requires_grad=True)
+
+        def conv2d_depthwise(x, weight):
+            return torch.nn.functional.conv2d(
+                x, weight, bias=None, stride=(1, 10), groups=2)
+
+        for cudnn_enabled in [False, True]:
+            with torch.backends.cudnn.flags(enabled=cudnn_enabled):
+                torch.autograd.gradcheck(conv2d_depthwise, (x, weight))
+
+    @onlyCPU
+    @dtypes(torch.float, torch.double)
+    def test_conv_thnn_nhwc(self, device, dtype):
+        def helper(n, c, h, w, out_channels, kernel_size, dilation, groups, input_format, weight_format):
+            input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\
+                .to(memory_format=input_format)
+            input.requires_grad_()
+            conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups)\
+                .to(device='cpu', dtype=dtype, memory_format=weight_format)
+            for p in conv.parameters():
+                p.data = torch.randint_like(p, -3, 3)
+
+            ref_input = input.detach().clone().contiguous().requires_grad_()
+            ref_conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups)
+            # load_state_dict will restore the stride & memory_layout on ref_conv.weight.
+            ref_conv.load_state_dict(conv.state_dict())
+            ref_conv = ref_conv.to(device='cpu', dtype=dtype, memory_format=torch.contiguous_format)
+
+            out = conv(input)
+            ref_out = ref_conv(ref_input)
+
+            grad = torch.randint_like(out, -3, 3)
+            ref_grad = grad.detach().clone().contiguous()
+
+            out.backward(grad)
+            ref_out.backward(ref_grad)
+
+            self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
+            self.assertTrue(ref_out.is_contiguous())
+            self.assertEqual(out, ref_out, exact_dtype=False)
+            self.assertEqual(conv.weight.grad, ref_conv.weight.grad, exact_dtype=False)
+            self.assertEqual(conv.bias.grad, ref_conv.bias.grad, exact_dtype=False)
+            self.assertEqual(input.grad, ref_input.grad, exact_dtype=False)
+
+        with torch.backends.mkldnn.flags(enabled=False):
+            formats = [[torch.channels_last, torch.channels_last],
+                       [torch.channels_last, torch.contiguous_format],
+                       [torch.contiguous_format, torch.channels_last]]
+            for input_format, weight_format in formats:
+                # non-dilated conv: thnn_conv2d normal path (with im2col)
+                helper(2, 8, 4, 4, out_channels=4, kernel_size=3, dilation=1, groups=1,
+                       input_format=input_format, weight_format=weight_format)
+                helper(2, 8, 4, 4, out_channels=8, kernel_size=3, dilation=1, groups=8,
+                       input_format=input_format, weight_format=weight_format)
+                # test when input chanels is 1 and not converted to channels last
+                helper(2, 1, 10, 10, out_channels=8, kernel_size=3, dilation=1, groups=1,
+                       input_format=torch.contiguous_format, weight_format=torch.channels_last)
+                # non-dilated conv: thnn_conv2d fast path (skip im2col)
+                helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=1,
+                       input_format=input_format, weight_format=weight_format)
+                # ic == oc == 1 here, so need to stick input to CL to activate channels last
+                helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=16,
+                       input_format=torch.channels_last, weight_format=weight_format)
+                # dilated conv: slow_conv_dilated2d
+                helper(2, 8, 11, 13, out_channels=16, kernel_size=3, dilation=2, groups=1,
+                       input_format=input_format, weight_format=weight_format)
+                helper(2, 16, 11, 13, out_channels=32, kernel_size=3, dilation=2, groups=16,
+                       input_format=input_format, weight_format=weight_format)
+
+    @onlyCUDA
+    @skipCUDAIfRocmVersionLessThan((4, 3))
+    @skipCUDAIfNotMiopenSuggestNHWC
+    @skipCUDAIfCudnnVersionLessThan(7603)
+    @dtypes(torch.half, torch.float, torch.cfloat)
+    def test_conv_cudnn_nhwc(self, device, dtype):
+        def helper(n, c, h, w, out_channels, kernel_size, groups):
+            input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\
+                .to(memory_format=torch.channels_last)
+            input.requires_grad_()
+            conv = nn.Conv2d(c, out_channels, kernel_size, groups=groups)\
+                .to(device='cuda', dtype=dtype, memory_format=torch.channels_last)
+            for p in conv.parameters():
+                p.data = torch.randint_like(p, -3, 3)
+
+            # use FP64 channels-first conv as reference
+            ref_input = input.detach().clone().contiguous().double().requires_grad_()
+            ref_conv = nn.Conv2d(c, out_channels, kernel_size, groups=groups)
+            # load_state_dict will restore the stride & memory_layout on ref_conv.weight.
+            ref_conv.load_state_dict(conv.state_dict())
+            ref_conv = ref_conv.to(device='cuda', dtype=torch.double, memory_format=torch.contiguous_format)
+
+            out = conv(input)
+            ref_out = ref_conv(ref_input)
+
+            grad = torch.randint_like(out, -3, 3)
+            ref_grad = grad.detach().clone().double().contiguous()
+
+            out.backward(grad)
+            ref_out.backward(ref_grad)
+
+            self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
+            self.assertTrue(input.grad.is_contiguous(memory_format=torch.channels_last))
+            self.assertTrue(conv.weight.grad.is_contiguous(memory_format=torch.channels_last))
+
+            self.assertTrue(ref_out.is_contiguous())
+            self.assertTrue(ref_input.grad.is_contiguous())
+            self.assertTrue(ref_conv.weight.grad.is_contiguous())
+
+            self.assertEqual(out, ref_out, exact_dtype=False)
+            self.assertEqual(conv.weight.grad, ref_conv.weight.grad, exact_dtype=False)
+            self.assertEqual(conv.bias.grad, ref_conv.bias.grad, exact_dtype=False)
+            self.assertEqual(input.grad, ref_input.grad, exact_dtype=False)
+
+        helper(2, 8, 4, 4, out_channels=4, kernel_size=3, groups=1)
+        helper(2, 8, 4, 4, out_channels=8, kernel_size=3, groups=8)
+        helper(1, 16, 56, 56, out_channels=16, kernel_size=3, groups=1)
+        helper(1, 16, 56, 56, out_channels=16, kernel_size=3, groups=16)
+
+    @onlyCUDA
+    @skipCUDAIfRocm
+    @skipCUDAIfCudnnVersionLessThan(8005)
+    @dtypes(torch.half, torch.float)
+    def test_conv_cudnn_ndhwc(self, device, dtype):
+        def helper(n, c, d, h, w, out_channels, kernel_size, groups):
+            input = torch.randint(-2, 2, (n, c, d, h, w), dtype=dtype, device=device)\
+                .to(memory_format=torch.channels_last_3d)
+            input.requires_grad_()
+            conv = nn.Conv3d(c, out_channels, kernel_size, groups=groups)\
+                .to(device='cuda', dtype=dtype, memory_format=torch.channels_last_3d)
+            for p in conv.parameters():
+                p.data = torch.randint_like(p, -2, 2)
+
+            # use FP64 channels-first conv as reference
+            ref_input = input.detach().clone().contiguous().double().requires_grad_()
+            ref_conv = nn.Conv3d(c, out_channels, kernel_size, groups=groups)
+            # load_state_dict will restore the stride & memory_layout on ref_conv.weight.
+            ref_conv.load_state_dict(conv.state_dict())
+            ref_conv = ref_conv.to(device='cuda', dtype=torch.double, memory_format=torch.contiguous_format)
+
+            out = conv(input)
+            ref_out = ref_conv(ref_input)
+
+            grad = torch.randint_like(out, -2, 2)
+            ref_grad = grad.detach().clone().double().contiguous()
+
+            out.backward(grad)
+            ref_out.backward(ref_grad)
+
+            self.assertTrue(out.is_contiguous(memory_format=torch.channels_last_3d))
+            self.assertTrue(input.grad.is_contiguous(memory_format=torch.channels_last_3d))
+            self.assertTrue(conv.weight.grad.is_contiguous(memory_format=torch.channels_last_3d))
+
+            self.assertTrue(ref_out.is_contiguous())
+            self.assertTrue(ref_input.grad.is_contiguous())
+            self.assertTrue(ref_conv.weight.grad.is_contiguous())
+
+            self.assertEqual(out, ref_out, exact_dtype=False)
+            self.assertEqual(conv.weight.grad, ref_conv.weight.grad, exact_dtype=False)
+            self.assertEqual(conv.bias.grad, ref_conv.bias.grad, exact_dtype=False)
+            self.assertEqual(input.grad, ref_input.grad, exact_dtype=False)
+
+        helper(2, 8, 4, 4, 4, out_channels=4, kernel_size=3, groups=1)
+        helper(2, 8, 4, 4, 4, out_channels=8, kernel_size=3, groups=8)
+        helper(1, 16, 18, 18, 18, out_channels=16, kernel_size=3, groups=1)
+        helper(1, 16, 18, 18, 18, out_channels=16, kernel_size=3, groups=16)
+
+    def _run_conv(self, layer, device, inp, grad, ref_conv, ref_input, ref_out,
+                  input_format, weight_format, grad_format, output_format):
+        conv = layer(inp.size(1), grad.size(1),
+                     ref_conv.weight.size(2)).float().to(device)
+        # load_state_dict will restore the stride & memory_layout on ref_conv.weight.
+        conv.load_state_dict(ref_conv.state_dict())
+        weight_data = conv.weight.detach().clone().contiguous(memory_format=weight_format)
+        conv.weight.data = weight_data.resize_(weight_data.size(), memory_format=weight_format)
+        input = inp.clone().contiguous(memory_format=input_format)
+        input.resize_(input.size(), memory_format=input_format)
+        input = input.requires_grad_()
+        grad = grad.contiguous(memory_format=grad_format)
+        grad.resize_(grad.size(), memory_format=grad_format)
+        out = conv(input)
+        out.backward(grad)
+        self.assertTrue(out.is_contiguous(memory_format=output_format))
+        self.assertEqual(out, ref_out)
+        self.assertEqual(conv.weight.grad, ref_conv.weight.grad)
+        self.assertEqual(conv.bias.grad, ref_conv.bias.grad)
+        self.assertEqual(input.grad, ref_input.grad)
+
+    def _test_conv_cudnn_nhwc_nchw(self, layer, n, c, h, w, k, filter_size, device):
+        data = torch.randint(1, 10, (n, c, h, w), dtype=torch.float32, device=device)
+        ref_input = data.clone().contiguous().requires_grad_(True)
+        ref_conv = layer(c, k, filter_size).float().to(device)
+        ref_out = ref_conv(ref_input)
+        grad = torch.randint(1, 10, ref_out.size(), dtype=torch.float32, device="cuda")
+        ref_out.backward(grad)
+
+        for w_f in [torch.contiguous_format, torch.channels_last]:
+            for g_f in [torch.contiguous_format, torch.channels_last]:
+                for input_format in [torch.contiguous_format, torch.channels_last]:
+                    output_format = torch.contiguous_format
+                    # Older versions of CudNN have Channels Last support disabled
+                    if torch.backends.cudnn.version() >= 7603:
+                        if input_format == torch.channels_last:
+                            output_format = torch.channels_last
+                        # This is because we have N111 weight that cannot handle
+                        # the ambiguous memory_format
+                        if w_f == torch.channels_last:
+                            if layer == nn.Conv2d and filter_size * c != 1:
+                                output_format = torch.channels_last
+                            if layer == nn.ConvTranspose2d and filter_size * k != 1:
+                                output_format = torch.channels_last
+                    self._run_conv(layer, device, data, grad, ref_conv, ref_input,
+                                   ref_out, input_format, w_f, g_f, output_format)
+
+    @onlyCUDA
+    @skipCUDAIfRocmVersionLessThan((4, 3))
+    @skipCUDAIfNotMiopenSuggestNHWC
+    @skipCUDAIfCudnnVersionLessThan(7603)
+    @tf32_on_and_off(0.05)
+    def test_conv_cudnn_mismatch_memory_format(self, device):
+        configs = [
+            [4, 2, 8, 8, 4, 2],
+            [4, 1, 8, 8, 4, 2],
+            [1, 1, 8, 8, 4, 2],
+            [4, 2, 2, 8, 4, 1],
+            [4, 2, 1, 8, 4, 1],
+            [4, 2, 8, 8, 4, 1],
+            [4, 1, 8, 8, 4, 1],
+        ]
+        for n, c, h, w, k, filter_size in configs:
+            self._test_conv_cudnn_nhwc_nchw(nn.Conv2d, n, c, h, w, k, filter_size, device)
+            self._test_conv_cudnn_nhwc_nchw(nn.ConvTranspose2d, n, c, h, w, k, filter_size, device)
+
+    # torch.half is erroring out on Windows with CUDA 10.1 + cuDNN 7.6.4
+    # returning CUDNN_STATUS_BAD_PARAM
+    # Disabling that specific test for now [see issue # 33918]
+    @onlyCUDA
+    @skipCUDAIfNoCudnn
+    @dtypes(torch.float, torch.double)
+    def test_conv_cudnn_nhwc_support(self, device, dtype):
+        input = torch.randn((1, 16, 1, 1), dtype=dtype, device="cuda", requires_grad=True)
+        weight = torch.randn((8, 16, 3, 3), dtype=dtype, device="cuda", requires_grad=True)
+        weight = weight.to(memory_format=torch.channels_last)
+        o = torch.conv2d(input, weight, None, (2, 1), (1, 1), (1, 1), 1)
+        self.assertTrue(o.is_contiguous(memory_format=torch.channels_last))
+        o.sum().backward()
+
+    # Test that faster algorithms used for inference produce the same results
+    # Validates depthwise3x3 bug reported in https://github.com/pytorch/pytorch/issues/60176
+    @onlyCPU
+    @dtypes(torch.float)
+    def test_conv2d_no_grad(self, device, dtype):
+        for batch in [1, 2, 3]:
+            for groups in [1, 2, 4]:
+                input = torch.rand(batch, groups, 8, 8, dtype=dtype, device=device)
+                m = nn.Conv2d(groups, 8, kernel_size=(3, 3), groups=groups, dtype=dtype, device=device)
+                with torch.no_grad():
+                    output_ng = m(input)
+                output = m(input)
+                self.assertEqual(output, output_ng, rtol=1e-2, atol=1e-5)
+
+    @onlyCUDA
+    @skipCUDAIfNoCudnn
+    @dtypes(torch.float, torch.float16)
+    @precisionOverride({torch.half: 0.002, torch.float: 1e-4})
+    def test_cudnn_convolution_relu(self, device, dtype):
+        for batch, groups, image_size, kernel_size, memory_format in \
+                product((1, 2, 3),
+                        (1, 2, 4),
+                        ((1, 1), (8, 8)),
+                        ((1, 1), (3, 3)),
+                        (torch.channels_last, torch.contiguous_format)):
+            if image_size[0] < kernel_size[0]:
+                continue
+            inp = torch.rand(batch, groups, *image_size, dtype=dtype, device=device)
+            w = torch.randn(8, groups, *kernel_size, dtype=dtype, device=device)
+            conv2d_out = torch.conv2d(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
+            inp = inp.to(memory_format=memory_format)
+            w = w.to(memory_format=memory_format)
+            if torch.version.hip:
+                cudnn_out = torch.miopen_convolution_relu(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
+            else:
+                cudnn_out = torch.cudnn_convolution_relu(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
+            self.assertTrue(cudnn_out.is_contiguous(memory_format=memory_format))
+            if tf32_is_not_fp32() and dtype == torch.float:
+                self.assertEqual(conv2d_out.relu(), cudnn_out, atol=2e-4, rtol=0.006)
+            else:
+                self.assertEqual(conv2d_out.relu(), cudnn_out)
+
+    @onlyCUDA
+    @skipCUDAIfNoCudnn
+    @dtypes(torch.float, torch.float16)
+    @precisionOverride({torch.half: 0.002, torch.float: 1e-4})
+    def test_cudnn_convolution_add_relu(self, device, dtype):
+        for batch, groups, image_size, kernel_size, memory_format in \
+            product((1, 2, 3),
+                    (1, 2, 4),
+                    ((1, 1), (8, 8)),
+                    ((1, 1), (3, 3)),
+                    (torch.channels_last, torch.contiguous_format)):
+            if image_size[0] < kernel_size[0]:
+                continue
+            inp = torch.rand(batch, groups, *image_size, dtype=dtype, device=device)
+            w = torch.randn(8, groups, *kernel_size, dtype=dtype, device=device)
+            conv2d_out = torch.conv2d(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
+            alpha = 2.0
+            z = torch.randn_like(conv2d_out)
+
+            inp = inp.to(memory_format=memory_format)
+            w = w.to(memory_format=memory_format)
+            z = z.to(memory_format=memory_format)
+            if torch.version.hip:
+                cudnn_out = torch.miopen_convolution_add_relu(inp, w, z, alpha, None, (1, 1), (0, 0), (1, 1), 1)
+            else:
+                cudnn_out = torch.cudnn_convolution_add_relu(inp, w, z, alpha, None, (1, 1), (0, 0), (1, 1), 1)
+
+            self.assertTrue(cudnn_out.is_contiguous(memory_format=memory_format))
+            if tf32_is_not_fp32() and dtype == torch.float:
+                self.assertEqual(F.relu(conv2d_out + alpha * z), cudnn_out, atol=3e-4, rtol=0.006)
+            else:
+                self.assertEqual(F.relu(conv2d_out + alpha * z), cudnn_out)
+
+    @onlyCUDA
+    @skipCUDAIfRocm
+    @skipCUDAIfCudnnVersionLessThan(7603)
+    def test_convert_conv2d_weight_memory_format(self, device):
+        input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float32, device=device)
+        model = nn.Sequential(
+            nn.Conv2d(8, 4, 3),
+            nn.BatchNorm2d(4)).to(device).float()
+        for memory_format in [torch.channels_last, torch.contiguous_format]:
+            model = nn.utils.convert_conv2d_weight_memory_format(model, memory_format)
+            out = model(input)
+            self.assertTrue(out.is_contiguous(memory_format=memory_format))
+
+        model = nn.Sequential(
+            nn.ConvTranspose2d(8, 4, 3),
+            nn.BatchNorm2d(4)).to(device).float()
+        for memory_format in [torch.channels_last, torch.contiguous_format]:
+            model = nn.utils.convert_conv2d_weight_memory_format(model, memory_format)
+            out = model(input)
+            self.assertTrue(out.is_contiguous(memory_format=memory_format))
+
+    def test_conv_double_backward_strided_with_3D_input_and_weight(self, device):
+        # Test that _convolution_double_backward() outputs the correct grad shapes
+        # for 3D input / weight when stride > 1. This is an ad-hoc regression test for a
+        # specific case that was uncovered during the convolution consolidation effort.
+        # The test can be safely deleted if _convolution_double_backward() is removed.
+
+        input = torch.randn(2, 3, 6, device=device)
+        weight = torch.randn(3, 3, 3, device=device)
+        bias = torch.randn(3, device=device)
+        stride = (2,)
+        padding = (1,)
+        dilation = (1,)
+        transposed = False
+        output_padding = (0,)
+        groups = 1
+        output = torch.ops.aten.convolution(input, weight, bias, stride, padding, dilation, transposed,
+                                            output_padding, groups)
+
+        ggI = torch.randn(input.shape, device=device)
+        ggW = torch.randn(weight.shape, device=device)
+        ggB = torch.randn(bias.shape, device=device)
+        gO = torch.randn(output.shape, device=device)
+        output_mask = [True, True, True]
+        grad_grad_output, grad_input, grad_weight = torch.ops.aten._convolution_double_backward(
+            ggI, ggW, ggB, gO, weight, input, stride, padding, dilation, transposed,
+            output_padding, groups, output_mask)
+
+        # Make sure the correct shapes are computed.
+        self.assertEqual(grad_grad_output.shape, gO.shape)
+        self.assertEqual(grad_input.shape, input.shape)
+        self.assertEqual(grad_weight.shape, weight.shape)
+
+    @onlyCUDA
+    @largeTensorTest('40GB')
+    @largeTensorTest('24GB', 'cpu')
+    def test_conv3d_64bit_indexing(self, device):
+        x = torch.rand(1, 32, 512, 512, 256)
+        m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)
+        yref = m(x)
+        y = m.to(device=device)(x.to(device=device))
+        self.assertEqual(yref, y)
+
+instantiate_device_type_tests(TestConvolutionNNDeviceType, globals())
+instantiate_parametrized_tests(TestConvolutionNN)
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_nn.py b/test/test_nn.py
index 8cf6e5e56555e..3e281e02db16a 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -38,27 +38,24 @@
 from torch.nn.utils import parameters_to_vector, vector_to_parameters
 from torch.nn import Parameter
 from torch.nn.parallel._functions import Broadcast
-from torch.testing._internal.common_dtype import integral_types, floating_types_and, get_all_math_dtypes, \
-    floating_and_complex_types_and
+from torch.testing._internal.common_dtype import integral_types, get_all_math_dtypes
 from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \
-    skipIfRocmVersionLessThan, skipIfNotMiopenSuggestNHWC, TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \
+    TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \
     download_file, get_function_arglist, load_tests, skipIfMps,\
     TemporaryFileName, TEST_WITH_UBSAN, IS_PPC, \
-    parametrize as parametrize_test, subtest, instantiate_parametrized_tests, set_default_dtype, IS_WINDOWS, \
+    parametrize as parametrize_test, subtest, instantiate_parametrized_tests, IS_WINDOWS, \
     skipIfTorchDynamo
 from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION
 from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
     module_tests, criterion_tests, loss_reference_fns, \
     ctcloss_reference, new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
-    dtypesIfCUDA, precisionOverride, skipCUDAIfNoCudnn, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
-    skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, skipCUDAIfRocmVersionLessThan, skipCUDAIfNotMiopenSuggestNHWC, \
-    onlyNativeDeviceTypes, deviceCountAtLeast, largeTensorTest, expectedFailureMeta, skipMeta, get_all_device_types, \
-    disableMkldnn, skipCPUIfNoMkldnn, disablecuDNN, skipCUDAIfMiopen, skipCUDAIfNoMiopen
+    dtypesIfCUDA, precisionOverride, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
+    skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, \
+    onlyNativeDeviceTypes, deviceCountAtLeast, largeTensorTest, expectedFailureMeta, skipMeta, get_all_device_types
 from torch.nn import MultiheadAttention
 
 from hypothesis import given
-from torch.testing import make_tensor
 import torch.testing._internal.hypothesis_utils as hu
 from torch.testing._internal.common_utils import _assertGradAndGradgradChecks, gradcheck, gradgradcheck, \
     GRADCHECK_NONDET_TOL
@@ -195,24 +192,6 @@ def test_module_backcompat(self):
         input = torch.randn(2, 3, dtype=torch.float)
         self.assertEqual(m(input).size(), (2, 5))
 
-    def test_conv_backcompat(self):
-        from torch.serialization import SourceChangeWarning
-
-        # This file was generated by running on PyTorch 1.0.1 on Python 2:
-        #
-        #     import torch
-        #     from torch import nn
-        #     m = nn.Conv2d(1, 1, 1)
-        #     torch.save(m, 'legacy_conv2d.pt')
-        #
-        # NB: This Pickle also contains some Unicode data!
-        path = download_file('https://download.pytorch.org/test_data/legacy_conv2d.pt')
-        with warnings.catch_warnings():
-            warnings.simplefilter('ignore', SourceChangeWarning)
-            m = torch.load(path, encoding='utf-8')
-        input = torch.randn((1, 1, 1, 1), dtype=torch.float)
-        self.assertEqual(m(input).size(), (1, 1, 1, 1))
-
     def test_share_memory(self):
         class Net(nn.Module):
             def __init__(self):
@@ -761,193 +740,6 @@ def test_no_grad(self):
                 self.assertFalse(output2.requires_grad)
                 self.assertRaises(RuntimeError, lambda: output2.backward(torch.ones(1, 5, 10, 10)))
 
-    def test_invalid_conv1d(self):
-        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
-            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True).to(dtype)
-            input = torch.randn(1, 3, 4).to(dtype)
-            with self.assertRaisesRegex(RuntimeError,
-                                        r'Calculated padded input size per channel: \(4\). ' +
-                                        r'Kernel size: \(10\). Kernel size can\'t be greater than actual input size'):
-                module(input)
-
-            # Negative stride check
-            module = nn.Conv1d(in_channels=3, out_channels=6, kernel_size=3, stride=-1, bias=True).to(dtype)
-            input = torch.randn(1, 3, 4).to(dtype)
-            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
-                module(input)
-
-    def test_mismatch_shape_conv2d(self):
-        for dtype in (torch.float, torch.cfloat):
-            x = torch.randn(1, 10, 1, 28, 28, dtype=dtype)
-            w = torch.randn(6, 1, 5, 5, dtype=dtype)
-
-            with self.assertRaisesRegex(RuntimeError,
-                                        r'Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d, but got ' +
-                                        r'input of size: \[1, 10, 1, 28, 28\]'):
-
-                F.conv2d(x, w)
-
-    def test_conv2d_discontiguous_weight(self):
-        for dtype in (torch.float, torch.cfloat):
-            # Test for https://github.com/pytorch/pytorch/issues/55781
-            x = torch.ones(64, 16, 16, 16, dtype=dtype)
-            weight = torch.arange(0, 1.0, 1 / 2.0 ** 10).reshape(32, 16, 1, 2).to(dtype)[:, :, :, ::2]
-            self.assertFalse(weight.is_contiguous())
-            y = torch.nn.functional.conv2d(x, weight, None)
-            if torch.backends.mkldnn.is_available():
-                # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used
-                with torch.backends.mkldnn.flags(enabled=False):
-                    y_ = torch.nn.functional.conv2d(x, weight, None)
-                    self.assertEqual(y, y_)
-            self.assertEqual(y.sum(), 4186112.)
-
-    def test_invalid_conv2d(self):
-        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
-            module = torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype)
-            input = torch.empty(1, 1, 4, 4).to(dtype)
-            self.assertRaises(RuntimeError, lambda: module(input))
-
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True)
-            input = torch.randn(1, 3, 1, 1)
-            with self.assertRaisesRegex(RuntimeError,
-                                        r'Calculated padded input size per channel: \(1 x 1\). ' +
-                                        r'Kernel size: \(10 x 10\). Kernel size can\'t be greater than actual input size'):
-                module(input)
-
-            # Negative stride check
-            module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=-1, bias=True).to(dtype)
-            input = torch.randn(1, 3, 4, 4).to(dtype)
-            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
-                module(input)
-
-            # Zero stride check
-            module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=0, bias=True).to(dtype)
-            input = torch.randn(1, 3, 4, 4).to(dtype)
-            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
-                module(input)
-
-    def test_invalid_conv3d(self):
-        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
-            module = torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype)
-            input = torch.empty(1, 1, 4, 4, 4).to(dtype)
-            self.assertRaises(RuntimeError, lambda: module(input))
-
-            # Negative stride check
-            module = torch.nn.Conv3d(1, 1, kernel_size=3, stride=-2)
-            input = torch.empty(1, 1, 4, 4, 4)
-            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
-                module(input)
-
-    def test_conv_invalid_groups(self):
-        with self.assertRaisesRegex(ValueError, 'groups must be a positive integer'):
-            torch.nn.Conv1d(1, 1, kernel_size=3, dilation=2, stride=2, groups=0)
-        with self.assertRaisesRegex(ValueError, 'groups must be a positive integer'):
-            torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2, groups=-1)
-        with self.assertRaisesRegex(ValueError, 'groups must be a positive integer'):
-            torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2, groups=-2)
-
-    def test_Conv1d_module_same_padding(self):
-        # Compare module against functional: without strides/dilation, asymmetric padding
-        x = torch.rand(1, 1, 20)
-        module = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10,
-                           padding='same')
-        expect = F.conv1d(x, module.weight, module.bias, padding='same')
-        self.assertEqual(expect, module(x))
-
-        # Test dilation, symmetric padding
-        module = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10,
-                           padding='same', dilation=2)
-        expect = F.conv1d(x, module.weight, module.bias, padding='same', dilation=2)
-        self.assertEqual(expect, module(x))
-
-        # Test non-zero padding_mode, requiring explicit padding
-        module = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10,
-                           padding='same', padding_mode='replicate')
-        x_padded = F.pad(x, [4, 5], mode='replicate')
-        expect = F.conv1d(x_padded, module.weight, module.bias, padding='valid')
-        self.assertEqual(expect, module(x))
-        self.assertEqual(x.size(), expect.size())
-
-        # Test connstruction with invalid padding string raises
-        with self.assertRaisesRegex(ValueError, 'Invalid padding string'):
-            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, padding='foo')
-
-        # Test connstruction with same padding and strides raises
-        with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2)
-
-    def test_Conv2d_module_same_padding(self):
-        # Compare module against functional:
-        # without strides/dilation, both symmetric and asymmetric padding
-        x = torch.rand(1, 1, 9, 20)
-        module = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(5, 10),
-                           padding='same')
-        expect = F.conv2d(x, module.weight, module.bias, padding='same')
-        self.assertEqual(expect, module(x))
-
-        # with dilation, symmetric padding
-        module = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 4),
-                           padding='same', dilation=(1, 2))
-        expect = F.conv2d(x, module.weight, module.bias, padding='same', dilation=(1, 2))
-        self.assertEqual(expect, module(x))
-
-        # Test non-zero padding_mode, requiring explicit padding
-        module = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 4),
-                           padding='same', padding_mode='reflect')
-        x_padded = F.pad(x, [1, 2, 1, 1], mode='reflect')
-        expect = F.conv2d(x_padded, module.weight, module.bias, padding='valid')
-        self.assertEqual(expect, module(x))
-        self.assertEqual(x.size(), expect.size())
-
-        # Test connstruction with invalid padding string raises
-        with self.assertRaisesRegex(ValueError, 'Invalid padding string'):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='foo')
-
-        # Test connstruction with same padding and strides raises
-        with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2)
-        with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 3))
-        with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(4, 1))
-
-    def test_Conv3d_module_same_padding(self):
-        # Compare module against functional:
-        x = torch.rand(1, 1, 4, 4, 4)
-        # without dilation, both symmetric and asymmetric padding
-        module = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(2, 3, 4),
-                           padding='same')
-        expect = F.conv3d(x, module.weight, module.bias, padding='same')
-        self.assertEqual(expect, module(x))
-
-        # with dilation, both symmetric and asymmetric padding
-        module = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(2, 3, 4),
-                           padding='same', dilation=(3, 2, 1))
-        expect = F.conv3d(x, module.weight, module.bias, padding='same', dilation=(3, 2, 1))
-        self.assertEqual(expect, module(x))
-
-        # Test non-zero padding_mode, requiring explicit padding
-        module = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(2, 3, 4),
-                           padding='same', padding_mode='circular')
-        x_padded = F.pad(x, [1, 2, 1, 1, 0, 1], mode='circular')
-        expect = F.conv3d(x_padded, module.weight, module.bias, padding='valid')
-        self.assertEqual(expect, module(x))
-        self.assertEqual(x.size(), expect.size())
-
-        # Test connstruction with invalid padding string raises
-        with self.assertRaisesRegex(ValueError, 'Invalid padding string'):
-            module = nn.Conv3d(in_channels=3, out_channels=33, kernel_size=10, padding='foo')
-
-        # Test connstruction with same padding and strides raises
-        with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2)
-        with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 1, 3))
-        with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 4, 1))
-        with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(5, 1, 1))
-
     def test_parameters_and_named_parameters(self):
         def names(named_parameters):
             return [k for k, _ in named_parameters]
@@ -6252,378 +6044,6 @@ def test_assignments(get_list, a, b, c):
         self.assertIn('buf', l.state_dict())
         self.assertEqual(l.state_dict()['buf'], buf)
 
-    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
-    def test_thnn_conv_strided_padded_dilated(self):
-        for convfn, dims, transposed in (
-                (torch.nn.functional.conv2d, 2, False),
-                (torch.nn.functional.conv_transpose2d, 2, True),
-                (torch.nn.functional.conv3d, 3, False),
-                (torch.nn.functional.conv_transpose3d, 3, True)):
-            for stride, padding, dilation in (
-                    (2, 0, 1), (1, 1, 1), (2, 1, 1), (1, 0, 2)):
-                kwargs = {"stride": stride, "padding": padding, "dilation": dilation}
-                inp_shape = (1, 2) + dims * (4,)
-                weight_shape = (2, 2) + dims * (1,)
-                inputs = torch.randn(inp_shape, dtype=torch.double, device="cuda", requires_grad=True)
-                weight = torch.randn(weight_shape, dtype=torch.double, device="cuda", requires_grad=True)
-                bias = torch.randn(2, dtype=torch.double, device="cuda", requires_grad=True)
-                with torch.backends.cudnn.flags(enabled=False):
-                    res = convfn(inputs, weight, bias, **kwargs)
-                res_cpu = convfn(inputs.cpu(), weight.cpu(), bias.cpu(), **kwargs)
-                self.assertEqual(res, res_cpu)
-                with torch.backends.cudnn.flags(enabled=False):
-                    torch.autograd.gradcheck(
-                        lambda x, w, b: convfn(x, w, b, **kwargs),
-                        (inputs, weight, bias)
-                    )
-                    torch.autograd.gradcheck(
-                        lambda x, w, b: convfn(x, w, b, **kwargs),
-                        (inputs.cpu(), weight.cpu(), bias.cpu())
-                    )
-
-    def test_Conv2d_inconsistent_types(self):
-        inputs = torch.randn(4, 1, 7, 7, dtype=torch.float)
-        weights = torch.randn(1, 1, 3, 3, dtype=torch.double)
-        # inconsistent types should raise an exception
-        self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights))
-        # but it should work with the same type
-        nn.functional.conv2d(inputs.float(), weights.float())
-
-    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
-    def test_Conv2d_inconsistent_types_on_GPU_without_cudnn(self):
-        inputs = torch.randn(4, 1, 7, 7, dtype=torch.float, device="cuda")
-        weights = torch.randn(1, 1, 3, 3, dtype=torch.double, device="cuda")
-        bias = torch.randn(1, dtype=torch.double, device="cuda")
-
-        with torch.backends.cudnn.flags(enabled=False):
-            # inconsistent types should raise an exception
-            self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights))
-            self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights.float(), bias))
-
-            # but it should work with the same type
-            nn.functional.conv2d(inputs.float(), weights.float(), bias.float())
-
-    def test_Conv2d_1x1(self):
-        in_channels = 2
-        out_channels = 2
-        mod = torch.nn.Conv2d(2, 2, 1, bias=False).to(dtype=torch.double)
-        input = torch.randn(1, in_channels, 5, 5, requires_grad=True, dtype=torch.double)
-        for enabled in (False, True):
-            with torch.backends.mkldnn.flags(enabled=enabled):
-                gradcheck(F.conv2d, (input, mod.weight))
-
-    def test_Conv2d_OneDNN(self):
-        def run_once(group_val=24, dilation=1):
-            ifm = torch.ones([1, group_val, 6, 6], dtype=torch.float32)
-            weights = torch.ones([group_val, 1, 3, 3], dtype=torch.float32)
-            op = torch.nn.Conv2d(
-                in_channels=group_val,
-                out_channels=group_val,
-                kernel_size=[3, 3],
-                stride=[2, 2],
-                padding=[1, 1],
-                dilation=[dilation, dilation],
-                groups=group_val,
-                bias=False,
-                padding_mode='zeros'
-            )
-
-            op.weight.data = weights
-            res = op(ifm)
-            grad_in = torch.ones(res.shape, dtype=torch.float32)
-            res.backward(grad_in)
-            return op.weight.grad
-
-        for gorup_val in (24, 48, 23, 25):
-            for dilation in (1, 2):
-                with torch.backends.mkldnn.flags(enabled=False):
-                    without_onednn = run_once(gorup_val, dilation)
-
-                with torch.backends.mkldnn.flags(enabled=True):
-                    with_onednn = run_once(gorup_val, dilation)
-
-                self.assertEqual(without_onednn, with_onednn)
-
-    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
-    @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
-    def test_cudnn_non_contiguous(self):
-        x = torch.randn(192, 16, 50).cuda()
-        x = x.permute(0, 2, 1).contiguous().permute(0, 2, 1)
-        m = torch.nn.Conv1d(
-            in_channels=16,
-            out_channels=32,
-            kernel_size=2,
-            bias=True).cuda()
-        result = m(x)
-
-    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
-    @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
-    def test_Conv2d_inconsistent_types_on_GPU_with_cudnn(self):
-        inputs = torch.randn(4, 1, 7, 7, dtype=torch.float, device="cuda")
-        weights = torch.randn(1, 1, 3, 3, dtype=torch.double, device="cuda")
-        bias = torch.randn(1, dtype=torch.double, device="cuda")
-
-        with torch.backends.cudnn.flags(enabled=True):
-            # inconsistent types should raise an exception
-            self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights))
-            self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights.float(), bias))
-
-            # but it should work with the same type
-            nn.functional.conv2d(inputs.float(), weights.float(), bias.float())
-
-    def test_Conv2d_missing_argument(self):
-        c = nn.Conv2d(3, 3, 3)
-        self.assertRaises(TypeError, lambda: c(None))
-
-    def test_Conv2d_backward_twice(self):
-        input = torch.randn(2, 3, 5, 5)
-        c = nn.Conv2d(3, 3, 3)
-        o1 = c(input)
-        o1.sum().backward()
-        self.assertRaisesRegex(RuntimeError, 'Specify retain_graph=True',
-                               lambda: o1.sum().backward())
-
-
-    def test_conv_modules_raise_error_on_incorrect_input_size(self):
-        for dtype in [torch.bfloat16, torch.double, torch.float]:
-            modules = [nn.Conv1d(3, 8, 3).to(dtype), nn.ConvTranspose1d(3, 8, 3).to(dtype),
-                       nn.Conv2d(3, 8, 3).to(dtype), nn.ConvTranspose2d(3, 8, 3).to(dtype),
-                       nn.Conv3d(3, 8, 3).to(dtype), nn.ConvTranspose3d(3, 8, 3).to(dtype)]
-
-            invalid_input_dims = [(1, 4), (1, 4),
-                                  (2, 5), (2, 5),
-                                  (3, 6), (3, 6)]
-
-            for invalid_dims, module in zip(invalid_input_dims, modules):
-                for dims in invalid_dims:
-                    input = torch.empty(torch.Size((3, ) * dims))
-                    self.assertRaises(RuntimeError, lambda: module(input))
-
-    def test_conv_shapecheck(self):
-        def test(should_raise, module, input_size, dtype):
-            input = torch.empty(3, *input_size).to(dtype)
-            if should_raise:
-                self.assertRaises(RuntimeError, lambda: module(input))
-            else:
-                # just run it to ensure no exception raised.
-                module(input)
-
-        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
-            # Conv1d
-            test(True, nn.Conv1d(1, 1, 3).to(dtype), (1, 2), dtype)
-            test(True, nn.Conv1d(1, 1, 3, stride=2).to(dtype), (1, 2), dtype)
-            test(False, nn.Conv1d(1, 1, 2).to(dtype), (1, 2), dtype)
-            test(False, nn.Conv1d(1, 1, 2, stride=2).to(dtype), (1, 2), dtype)
-            test(False, nn.Conv1d(1, 1, 3, stride=2, padding=1).to(dtype), (1, 2), dtype)
-
-            # Conv2d
-            test(True, nn.Conv2d(1, 1, (3, 3)).to(dtype), (1, 2, 2), dtype)
-            test(False, nn.Conv2d(1, 1, (3, 3)).to(dtype), (1, 3, 3), dtype)
-            test(False, nn.Conv2d(1, 1, (3, 3), padding=1).to(dtype), (1, 2, 2), dtype)
-
-            # Conv3D
-            test(True, nn.Conv3d(1, 1, (3, 3, 3)).to(dtype), (1, 2, 2, 2), dtype)
-            test(False, nn.Conv3d(1, 1, (3, 3, 3)).to(dtype), (1, 3, 3, 3), dtype)
-            test(False, nn.Conv3d(1, 1, (3, 3, 3), padding=1).to(dtype), (1, 2, 2, 2), dtype)
-
-    def test_ConvTranspose2d_output_size(self):
-        m = nn.ConvTranspose2d(3, 4, 3, 3, 0, 2)
-        i = torch.randn(2, 3, 6, 6)
-        for h in range(15, 22):
-            for w in range(15, 22):
-                if 18 <= h <= 20 and 18 <= w <= 20:
-                    output = m(i, output_size=(h, w))
-                    self.assertEqual(output.size()[2:], (h, w))
-                else:
-                    self.assertRaises(ValueError, lambda: m(i, (h, w)))
-
-    def test_ConvTranspose2d_output_size_downsample_upsample(self):
-        b, c, hid_c = 2, 3, 2
-        for h in range(13, 24):
-            for w in range(13, 17):
-                for k in range(2, 5):
-                    for d in range(1, 5):
-                        for s in range(1, 4):
-                            for p in range(3):
-                                conv = nn.Conv2d(
-                                    in_channels=c,
-                                    out_channels=hid_c,
-                                    kernel_size=k,
-                                    stride=s,
-                                    padding=p,
-                                    dilation=d,
-                                )
-
-                                t_conv = nn.ConvTranspose2d(
-                                    in_channels=hid_c,
-                                    out_channels=c,
-                                    kernel_size=k,
-                                    stride=s,
-                                    padding=p,
-                                    dilation=d,
-                                )
-
-                                i = torch.randn(b, c, h, w)
-
-                                out = t_conv(conv(i), output_size=i.shape)
-
-                                self.assertEqual(out.size()[2:], i.size()[2:])
-
-    def test_ConvTranspose3d_correct_output_size(self):
-        # Check that ConvTranspose3d can take a 5d output_size.
-        m = nn.ConvTranspose3d(2, 2, 2)
-        i = torch.rand(1, 2, 1, 1, 1)
-        out = m(i, output_size=(1, 2, 2, 2, 2))
-
-    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
-    def test_ConvTranspose2d_half_cublas_gemm(self):
-        with torch.backends.cudnn.flags(enabled=False):
-            inputs = torch.randn(1, 1, 16, 16, device='cuda', dtype=torch.half)
-            deconv = nn.ConvTranspose2d(
-                1, 1, 3, stride=2, padding=1, output_padding=1).cuda().half()
-            output = deconv(inputs)
-            output.mean().backward()
-
-    # For https://github.com/pytorch/pytorch/pull/1273
-    # Almost identical to the above `test_Conv2d_naive_groups`
-    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
-    def test_Conv2d_groups_nobias(self):
-        dev_dtypes = [("cpu", torch.float)]
-        if TEST_CUDA:
-            dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)]
-        if AMPERE_OR_ROCM:
-            dev_dtypes += [("cuda", torch.bfloat16)]
-        for device, dtype in dev_dtypes:
-            m = nn.Conv2d(4, 4, kernel_size=3, groups=2, bias=False).to(device, dtype)
-            i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
-            output = m(i)
-            grad_output = torch.randn(2, 4, 4, 4, device=device, dtype=dtype)
-            output.backward(grad_output)
-
-            m1 = nn.Conv2d(2, 2, kernel_size=3, bias=False).to(device, dtype)
-            m1.weight.data.copy_(m.weight.data[:2])
-            i1 = i.data[:, :2].contiguous().requires_grad_(True)
-            output1 = m1(i1)
-            output1.backward(grad_output[:, :2].contiguous())
-
-            m2 = nn.Conv2d(2, 2, kernel_size=3, bias=False).to(device, dtype)
-            m2.weight.data.copy_(m.weight.data[2:])
-            i2 = i.data[:, 2:].contiguous().requires_grad_(True)
-            output2 = m2(i2)
-            output2.backward(grad_output[:, 2:].contiguous())
-
-            self.assertEqual(output, torch.cat([output1, output2], 1))
-            self.assertEqual(i.grad.data,
-                             torch.cat([i1.grad.data, i2.grad.data], 1),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-            self.assertEqual(m.weight.grad.data,
-                             torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
-                             atol=1e-1 if dtype == torch.half else dtype2prec_DONTUSE[dtype], rtol=0)
-
-    # Almost identical to the above `test_Conv2d_naive_groups`
-    # Covering special case when group > 1, input-channel / group < 16 and output-channel is multiple of 16
-    # See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686
-    # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024
-    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
-    def test_Conv2d_groups_nobias_v2(self):
-        torch.manual_seed(123)
-        dev_dtypes = [("cpu", torch.float)]
-        if TEST_CUDA:
-            dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)]
-        if AMPERE_OR_ROCM:
-            dev_dtypes += [("cuda", torch.bfloat16)]
-        for device, dtype in dev_dtypes:
-            m = nn.Conv2d(4, 16, kernel_size=3, groups=2, bias=False).to(device, dtype)
-            i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
-            output = m(i)
-            grad_output = torch.randn(2, 16, 4, 4, device=device, dtype=dtype)
-            output.backward(grad_output)
-
-            m1 = nn.Conv2d(2, 8, kernel_size=3, bias=False).to(device, dtype)
-            m1.weight.data.copy_(m.weight.data[:8])
-            i1 = i.data[:, :2].contiguous().requires_grad_(True)
-            output1 = m1(i1)
-            output1.backward(grad_output[:, :8].contiguous())
-
-            m2 = nn.Conv2d(2, 8, kernel_size=3, bias=False).to(device, dtype)
-            m2.weight.data.copy_(m.weight.data[8:])
-            i2 = i.data[:, 2:].contiguous().requires_grad_(True)
-            output2 = m2(i2)
-            output2.backward(grad_output[:, 8:].contiguous())
-
-            self.assertEqual(output, torch.cat([output1, output2], 1))
-            self.assertEqual(i.grad.data,
-                             torch.cat([i1.grad.data, i2.grad.data], 1),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-            self.assertEqual(m.weight.grad.data,
-                             torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
-                             atol=1e-1 if dtype == torch.half else dtype2prec_DONTUSE[dtype], rtol=0)
-
-    # CPU-only test for group conv3d fast implementation using bmm
-    # See: https://github.com/pytorch/pytorch/pull/36355
-    def test_Conv3d_groups_nobias(self):
-        torch.manual_seed(123)
-        m = nn.Conv3d(4, 16, kernel_size=3, groups=2, bias=False).to("cpu", torch.float)
-        i = torch.randn(2, 4, 6, 6, 6, device="cpu", dtype=torch.float, requires_grad=True)
-        output = m(i)
-        grad_output = torch.randn(2, 16, 4, 4, 4, device="cpu", dtype=torch.float)
-        output.backward(grad_output)
-
-        m1 = nn.Conv3d(2, 8, kernel_size=3, bias=False).to("cpu", torch.float)
-        m1.weight.data.copy_(m.weight.data[:8])
-        i1 = i.data[:, :2].contiguous().requires_grad_(True)
-        output1 = m1(i1)
-        output1.backward(grad_output[:, :8].contiguous())
-
-        m2 = nn.Conv3d(2, 8, kernel_size=3, bias=False).to("cpu", torch.float)
-        m2.weight.data.copy_(m.weight.data[8:])
-        i2 = i.data[:, 2:].contiguous().requires_grad_(True)
-        output2 = m2(i2)
-        output2.backward(grad_output[:, 8:].contiguous())
-
-        self.assertEqual(output, torch.cat([output1, output2], 1))
-        self.assertEqual(i.grad.data,
-                         torch.cat([i1.grad.data, i2.grad.data], 1),
-                         atol=dtype2prec_DONTUSE[torch.float], rtol=0)
-        self.assertEqual(m.weight.grad.data,
-                         torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
-                         atol=dtype2prec_DONTUSE[torch.float], rtol=dtype2prec_DONTUSE[torch.float])
-
-    def test_Conv3d_groups_wbias(self):
-        torch.manual_seed(123)
-        m = nn.Conv3d(4, 16, kernel_size=3, groups=2, bias=True).to("cpu", torch.float)
-        i = torch.randn(2, 4, 6, 6, 6, device="cpu", dtype=torch.float, requires_grad=True)
-        output = m(i)
-        grad_output = torch.randn(2, 16, 4, 4, 4, device="cpu", dtype=torch.float)
-        output.backward(grad_output)
-
-        m1 = nn.Conv3d(2, 8, kernel_size=3, bias=True).to("cpu", torch.float)
-        m1.weight.data.copy_(m.weight.data[:8])
-        m1.bias.data.copy_(m.bias.data[:8])
-        i1 = i.data[:, :2].contiguous().requires_grad_(True)
-        output1 = m1(i1)
-        output1.backward(grad_output[:, :8].contiguous())
-
-        m2 = nn.Conv3d(2, 8, kernel_size=3, bias=True).to("cpu", torch.float)
-        m2.weight.data.copy_(m.weight.data[8:])
-        m2.bias.data.copy_(m.bias.data[8:])
-        i2 = i.data[:, 2:].contiguous().requires_grad_(True)
-        output2 = m2(i2)
-        output2.backward(grad_output[:, 8:].contiguous())
-
-        self.assertEqual(output, torch.cat([output1, output2], 1))
-        self.assertEqual(i.grad.data,
-                         torch.cat([i1.grad.data, i2.grad.data], 1),
-                         atol=dtype2prec_DONTUSE[torch.float],
-                         rtol=dtype2prec_DONTUSE[torch.float])
-        self.assertEqual(m.weight.grad.data,
-                         torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
-                         atol=dtype2prec_DONTUSE[torch.float],
-                         rtol=dtype2prec_DONTUSE[torch.float])
-        self.assertEqual(m.bias.grad.data,
-                         torch.cat([m1.bias.grad.data, m2.bias.grad.data], 0),
-                         atol=dtype2prec_DONTUSE[torch.float], rtol=dtype2prec_DONTUSE[torch.float])
-
     def test_container_copy(self):
         class Model(nn.Module):
             def __init__(self):
@@ -10562,140 +9982,6 @@ def test_bilinear_broadcasting(self):
         expected = m(input1.view(6, 5), input2.view(6, 6)).view(2, 3, 8)
         self.assertEqual(expected, m(input1, input2))
 
-    def test_conv_tbc(self):
-        inp = torch.randn(9, 4, 5, requires_grad=True)
-        weight = torch.randn(3, 5, 6, requires_grad=True)
-        bias = torch.randn(6, requires_grad=True)
-
-        gradcheck(lambda i, w, b, pad: F.conv_tbc(i, w, b, pad), (inp, weight, bias, 3))
-
-
-    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
-    @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
-    @skipIfRocmVersionLessThan((4, 3))
-    @skipIfNotMiopenSuggestNHWC
-    def test_grouped_conv_cudnn_nhwc_support(self):
-        # in order to catch the hols in grouped convolution in nhwc support for earlier cudnn version
-        input = torch.randn((16, 16, 8, 8), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
-        weight = torch.randn((8, 4, 3, 3), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
-        out = torch.convolution(input, weight, None, (1, 1), (1, 1), (1, 1), False, (0, 0), 4)
-        input = torch.randn((16, 8, 8, 8), dtype=torch.float16, device="cuda").to(memory_format=torch.channels_last)
-        out_transpose = torch.convolution(input, weight, None, (1, 1), (1, 1), (1, 1), True, (0, 0), 4)
-
-    @unittest.expectedFailure
-    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
-    @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
-    def test_conv_cudnn_memory_layout_dominance(self):
-        # desired behavior here is to have the memory_layout of conv.weight to
-        # dominante the layout of output.
-        # which is not the same as current behavior, we'll fix this in
-        # following up PRs and remove the `expectedFailure` tag
-        input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float32, device="cuda", requires_grad=True)
-        conv = nn.Conv2d(8, 4, 3).cuda().float()
-
-        out = conv(input)
-        self.assertTrue(out.is_contiguous())
-
-        input = input.contiguous(memory_format=torch.channels_last)
-        out = conv(input)
-        self.assertTrue(out.is_contiguous())
-
-        conv.weight.data = conv.weight.contiguous(memory_format=torch.channels_last)
-        out = conv(input)
-        self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
-
-        input = input.contiguous()
-        out = conv(input)
-        self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
-
-
-    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
-    def test_cudnn_noncontiguous_weight(self):
-        # Noncontiguous weights must be contiguous() before being
-        # passed to cuDNN
-        input = torch.tensor([1, 1, 1], dtype=torch.double, device="cuda").view(1, 1, 3)
-        weights1 = torch.tensor([1], dtype=torch.double, device="cuda").expand(1, 1, 2)
-        weights2 = torch.tensor([1], dtype=torch.double, device="cuda").expand(1, 1, 2).contiguous()
-        self.assertEqual(F.conv1d(input, weights1, bias=None, stride=2, dilation=2),
-                         F.conv1d(input, weights2, bias=None, stride=2, dilation=2))
-
-
-    def run_grad_conv_test(self, func_forward, func_backward, dim=1, gradient='input'):
-        for kern, inp_size in [(3, 6), (3, 7), (4, 9)]:
-            for batch, stride, padding, chan_in, chan_out, dilation in \
-                    product([1, 2], [1, 2], [0, 1, 2], [2], [3], [1]):
-
-                for has_bias in [True, False]:
-                    input_shape = [batch, chan_in]
-                    weight_shape = [chan_out, chan_in]
-                    for _ in range(dim):
-                        input_shape.append(inp_size)
-                        weight_shape.append(kern)
-
-                    input = torch.randn(input_shape, requires_grad=True)
-                    weight = torch.randn(weight_shape, requires_grad=True)
-                    if has_bias:
-                        bias = torch.randn([chan_out], requires_grad=True)
-                    output = func_forward(input, weight, stride=stride, padding=padding, dilation=dilation, bias=bias)
-
-                    gradient_o = torch.randn(output.shape)
-                    gradient_w = torch.autograd.grad(output, input if (gradient == 'input') else weight, gradient_o)
-
-                    self.assertEqual(gradient_w[0],
-                                     func_backward(
-                                     input_shape if (gradient == 'input') else input,
-                                     weight_shape if (gradient == 'weight') else weight,
-                                     gradient_o,
-                                     stride=stride,
-                                     padding=padding,
-                                     dilation=dilation))
-
-    def test_grad_conv1d_input(self):
-        self.run_grad_conv_test(F.conv1d, F.grad.conv1d_input, 1, 'input')
-
-    def test_grad_conv1d_weight(self):
-        self.run_grad_conv_test(F.conv1d, F.grad.conv1d_weight, 1, 'weight')
-
-    def test_grad_conv2d_input(self):
-        self.run_grad_conv_test(F.conv2d, F.grad.conv2d_input, 2, 'input')
-
-    def test_grad_conv2d_weight(self):
-        self.run_grad_conv_test(F.conv2d, F.grad.conv2d_weight, 2, 'weight')
-
-    def test_grad_conv3d_input(self):
-        self.run_grad_conv_test(F.conv3d, F.grad.conv3d_input, 3, 'input')
-
-    def test_grad_conv3d_weight(self):
-        self.run_grad_conv_test(F.conv3d, F.grad.conv3d_weight, 3, 'weight')
-
-    @unittest.skipIf(not torch._nnpack_available(), "NNPACK unavailable")
-    def test_nnpack_conv(self):
-        for kern, inp_size in [(3, 6), (3, 7), (4, 9)]:
-            for batch, stride, padding, chan_in, chan_out in \
-                    product([1, 2, 3, 4], [1, 2], [0, 1, 2], [2], [3]):
-
-                for has_bias in [True, False]:
-                    input_shape = [batch, chan_in]
-                    weight_shape = [chan_out, chan_in]
-                    for _ in range(2):
-                        input_shape.append(inp_size)
-                        weight_shape.append(kern)
-
-                    input = torch.randn(input_shape, requires_grad=True, dtype=torch.float)
-                    weight = torch.randn(weight_shape, requires_grad=True, dtype=torch.float)
-                    if has_bias:
-                        bias = torch.randn([chan_out], requires_grad=True, dtype=torch.float)
-                    output = torch._nnpack_spatial_convolution(input, weight, stride=stride, padding=padding, bias=bias)
-                    output_expected = torch.nn.functional.conv2d(input, weight, stride=stride, padding=padding, bias=bias)
-                    self.assertEqual(output, output_expected, atol=3e-4, rtol=0)
-
-                    gradient_o = torch.randn(output.shape, dtype=torch.float)
-
-                    grads = torch.autograd.grad(output, [input, weight], gradient_o)
-                    grads_expected = torch.autograd.grad(output_expected, [input, weight], gradient_o)
-                    for gr, gr_expected in zip(grads, grads_expected):
-                        self.assertEqual(gr, gr_expected, atol=3e-4, rtol=0)
-
     def test_fold_invalid_arg(self):
         # input.size(1) not divisible by \prod(kernel_size)
 
@@ -10742,16 +10028,6 @@ def test_unfold_invalid_arg(self):
             unfold = nn.Unfold(kernel_size=(1, 3), padding=(1, 1), dilation=(1, 2))
             unfold(torch.randn(1, 2, 2, 2))
 
-    def test_conv_padding_mode(self):
-        with self.assertRaisesRegex(ValueError, "padding_mode must be one of"):
-            nn.Conv2d(3, 3, 3, padding_mode="xyz")
-
-        with self.assertRaisesRegex(ValueError, "padding_mode must be one of"):
-            nn.Conv2d(3, 3, 3, padding_mode=3)
-
-        with self.assertRaisesRegex(ValueError, "Only \"zeros\" "):
-            nn.ConvTranspose2d(3, 3, 3, padding_mode="reflect")
-
     def test_softmin(self):
         x = torch.randn(2, 16)
         self.assertEqual(F.softmin(x, 1), F.softmax(-x, 1))
@@ -11004,85 +10280,6 @@ def _batch_norm_stats(data):
         data = torch.randn(1, 96, 112, 112, dtype=torch.float, device='cuda')
         _batch_norm_stats(data)
 
-    def test_functional_grad_conv(self):
-        # Conv 1D
-        input = torch.randn(1, 1, 5, requires_grad=True)
-        weight = torch.randn(1, 1, 3, requires_grad=True)
-        output = F.conv1d(input, weight, dilation=2)
-        grad_output = torch.randn(output.shape)
-
-        grad_input_autograd, grad_weight_autograd = torch.autograd.grad(output, (input, weight), grad_output)
-
-        grad_input_functional = torch.nn.grad.conv1d_input(input.shape, weight, grad_output, dilation=2)
-        self.assertEqual(grad_input_functional, grad_input_autograd)
-
-        grad_weight_functional = torch.nn.grad.conv1d_weight(input, weight.shape, grad_output, dilation=2)
-        self.assertEqual(grad_weight_functional, grad_weight_autograd)
-
-        # Conv 2D
-        input = torch.randn(1, 1, 5, 5, requires_grad=True)
-        weight = torch.randn(1, 1, 3, 3, requires_grad=True)
-        output = F.conv2d(input, weight, dilation=2)
-        grad_output = torch.randn(output.shape)
-
-        (grad_input_autograd, grad_weight_autograd) = torch.autograd.grad(output, (input, weight), grad_output)
-
-        grad_input_functional = torch.nn.grad.conv2d_input(input.shape, weight, grad_output, dilation=2)
-        self.assertEqual(grad_input_functional, grad_input_autograd)
-
-        grad_weight_functional = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output, dilation=2)
-        self.assertEqual(grad_weight_functional, grad_weight_autograd)
-
-        # Conv 3D
-        input = torch.randn(1, 1, 5, 5, 5, requires_grad=True)
-        weight = torch.randn(1, 1, 3, 3, 3, requires_grad=True)
-        output = F.conv3d(input, weight, dilation=2)
-        grad_output = torch.randn(output.shape)
-
-        (grad_input_autograd, grad_weight_autograd) = torch.autograd.grad(output, (input, weight), grad_output)
-
-        grad_input_functional = torch.nn.grad.conv3d_input(input.shape, weight, grad_output, dilation=2)
-        self.assertEqual(grad_input_functional, grad_input_autograd)
-
-        grad_weight_functional = torch.nn.grad.conv3d_weight(input, weight.shape, grad_output, dilation=2)
-        self.assertEqual(grad_weight_functional, grad_weight_autograd)
-
-    def test_functional_grad_conv2d(self):
-        BATCH_SIZE = 4
-        IN_CH = 8
-        OUT_CH = 16
-        SPATIAL = 32
-
-        def _test_conv2d(stride, kernel_size, groups, dilation):
-            padding = kernel_size // 2
-
-            input = torch.empty(BATCH_SIZE, IN_CH, SPATIAL, SPATIAL).uniform_(-8.0, 8.0).requires_grad_(True)
-
-            weight = torch.empty(OUT_CH, IN_CH // groups, kernel_size, kernel_size).uniform_(-4.0, 4.0).requires_grad_(True)
-
-            output = F.conv2d(input, weight,
-                              stride=stride, padding=padding, dilation=dilation, groups=groups)
-
-            grad_output = torch.randn(output.shape)
-
-            (grad_input_autograd, grad_weight_autograd) = torch.autograd.grad(output, (input, weight), grad_output)
-
-            grad_input_functional = torch.nn.grad.conv2d_input(input.shape, weight, grad_output,
-                                                               stride=stride, padding=padding, dilation=dilation, groups=groups)
-            self.assertEqual(grad_input_functional, grad_input_autograd)
-
-            grad_weight_functional = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output,
-                                                                 stride=stride, padding=padding, dilation=dilation, groups=groups)
-            self.assertEqual(grad_weight_functional, grad_weight_autograd)
-
-        strides = [1, 2]
-        kernel_sizes = [1, 3, 5]
-        groups = [1, 2, 4]
-        dilates = [1, 2]
-
-        for s, k, g, d in product(strides, kernel_sizes, groups, dilates):
-            _test_conv2d(s, k, g, d)
-
     def test_flatten(self):
         tensor_input = torch.randn(2, 1, 2, 3)
 
@@ -12083,50 +11280,6 @@ def _buildEquivalentAffineTransforms3d(device, input_size, output_size, angle_ra
 
 
 class TestNNDeviceType(NNTestCase):
-    def run_conv_double_back_test(self, kern, stride, padding, chan_in, chan_out, batch_size,
-                                  inp_size, dilation, no_weight, groups=1, use_cuda=False,
-                                  use_bias=True, dtype=torch.double):
-        if use_cuda:
-            device = torch.device("cuda")
-        else:
-            device = torch.device("cpu")
-
-        x = torch.randn(batch_size, chan_in, inp_size, inp_size, device=device,
-                        dtype=dtype, requires_grad=True)
-        weight = torch.randn(chan_out, chan_in // groups, kern, kern, device=device,
-                             dtype=dtype, requires_grad=not no_weight)
-        if use_bias:
-            bias = torch.randn(chan_out, device=device, dtype=dtype, requires_grad=True)
-        else:
-            bias = None
-
-        def func(*inputs):
-            if use_bias:
-                lx, lweight, lbias = inputs
-            else:
-                lx, lweight = inputs
-                lbias = None
-            # We disable cudnn during forward to avoid finite difference imprecision issues
-            with cudnn.flags(enabled=False):
-                out = F.conv2d(lx, lweight, lbias, stride, padding, dilation, groups)
-            return out
-
-        if use_bias:
-            inputs = x, weight, bias
-        else:
-            inputs = x, weight
-
-        dummy_out = func(*inputs)
-        grad_y = torch.randn_like(dummy_out, device=device, dtype=dtype, requires_grad=True)
-
-        # Issue #15353: test mkldnn double backward, don't run gradgradcheck due
-        # to imprecision issues
-        if dtype == torch.float:
-            g, = torch.autograd.grad(dummy_out.sum(), x, create_graph=True)
-            return g.requires_grad
-
-        return gradgradcheck(func, inputs, (grad_y,))
-
     def _test_InstanceNorm_general(self, cls, input, device, dtype=torch.float):
         # default case track_running_stats=False
         b, c = input.size(0), input.size(1)
@@ -12560,174 +11713,6 @@ def test_affine_3d_rotateRandom(self, device):
             self.assertEqual(scipy_ary, gridsample_ary.reshape_as(scipy_ary))
 
 
-    @onlyCUDA
-    @skipCUDAIfNoCudnn
-    @dtypes(*floating_and_complex_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
-    def test_Conv2d_deterministic_cudnn(self, device, dtype):
-        inputs = torch.randn(2, 3, 5, 5, device=device, dtype=dtype, requires_grad=True)
-        with cudnn.flags(enabled=True, benchmark=True, deterministic=True):
-            conv1 = torch.nn.Conv2d(3, 3, 3).to(device, dtype)
-            conv2 = torch.nn.Conv2d(3, 3, 3).to(device, dtype)
-            conv2.bias.data.copy_(conv1.bias.data)
-            conv2.weight.data.copy_(conv1.weight.data)
-            out1 = conv1(inputs)
-            out2 = conv2(inputs)
-            self.assertEqual(out1, out2, atol=0.0, rtol=0)
-            y = torch.randn(out1.size(), device=device, dtype=dtype)
-            out1.backward(y)
-            out2.backward(y)
-            self.assertEqual(conv1.bias.grad.data, conv2.bias.grad.data, atol=0.0, rtol=0)
-            self.assertEqual(conv1.weight.grad.data, conv2.weight.grad.data, atol=0.0, rtol=0)
-
-
-    @onlyCUDA
-    @dtypes(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
-    def test_Conv2d_large_workspace(self, device, dtype):
-        # These sizes require huge cuDNN workspaces. Make sure we choose a
-        # reasonable algorithm that does not run out of memory
-        sizes = [
-            (1, 256, 109, 175),
-            (1, 256, 80, 128),
-            (1, 256, 120, 192),
-        ]
-
-        def run_test(benchmark):
-            with torch.backends.cudnn.flags(benchmark=benchmark):
-                conv = torch.nn.Conv2d(256, 256, kernel_size=3, padding=1).to(device, dtype)
-                for size in sizes:
-                    x = torch.randn(size, device=device, dtype=dtype)
-                    out = conv(x.detach().clone().requires_grad_())
-                    out.backward(torch.ones_like(out))
-
-        run_test(benchmark=False)
-        run_test(benchmark=True)
-
-
-    @onlyCUDA
-    @dtypes(torch.half, torch.float)
-    def test_ConvTranspose2d_large_output_padding(self, device, dtype):
-        net1 = torch.nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)\
-            .to(device=device, dtype=dtype)
-        net2 = torch.nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1)\
-            .to(device=device, dtype=dtype)
-        net3 = torch.nn.ConvTranspose2d(32, 3, kernel_size=3, stride=2, padding=1, output_padding=1)\
-            .to(device=device, dtype=dtype)
-        x = torch.rand(1, 128, 6, 6, device=device, dtype=dtype, requires_grad=True)
-        x = net1(x)
-        x = net2(x)
-        x = net3(x)
-        x.backward(torch.randn_like(x))
-        torch.cuda.synchronize()
-
-
-    @onlyCUDA
-    @tf32_on_and_off(0.01)
-    @dtypes(torch.float, torch.double, torch.half)
-    # Very similar to test_Conv2d_naive_groups but with special care to handle
-    # the number of groups == number of input channels
-    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
-    def test_Conv2d_depthwise_naive_groups(self, device, dtype):
-        for depth_multiplier in [1, 2]:
-            m = nn.Conv2d(2, 2 * depth_multiplier, kernel_size=3, groups=2).to(device, dtype)
-            i = torch.randn(2, 2, 6, 6, device="cuda", dtype=dtype).div_(2).requires_grad_()
-            output = m(i)
-            grad_output = torch.randn(2, 2 * depth_multiplier, 4, 4, device=device, dtype=dtype) / 2
-            output.backward(grad_output)
-
-            offset = 1 * depth_multiplier
-
-            m1 = nn.Conv2d(1, 1 * depth_multiplier, kernel_size=3).to(device, dtype)
-            m1.weight.data = m.weight.data[:offset].clone()
-            m1.bias.data = m.bias.data[:offset].clone()
-            i1 = i.detach()[:, :1].clone().requires_grad_()
-            output1 = m1(i1)
-            output1.backward(grad_output[:, :offset].contiguous())
-
-            m2 = nn.Conv2d(1, 1 * depth_multiplier, kernel_size=3).to(device, dtype)
-            m2.weight.data.copy_(m.weight.data[offset:])
-            m2.bias.data.copy_(m.bias.data[offset:])
-            i2 = i.detach()[:, 1:].clone().requires_grad_()
-            output2 = m2(i2)
-            output2.backward(grad_output[:, offset:].contiguous())
-
-            self.assertEqual(output, torch.cat([output1, output2], 1),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-            self.assertEqual(i.grad.data,
-                             torch.cat([i1.grad.data, i2.grad.data], 1),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-            self.assertEqual(m.bias.grad.data,
-                             torch.cat([m1.bias.grad.data,
-                                        m2.bias.grad.data], 0),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-            self.assertEqual(m.weight.grad.data,
-                             torch.cat([m1.weight.grad.data,
-                                        m2.weight.grad.data], 0),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-
-    @onlyCUDA
-    @dtypes(torch.float, torch.double, torch.half)
-    @tf32_on_and_off(0.005)
-    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
-    def test_Conv3d_depthwise_naive_groups(self, device, dtype):
-        for depth_multiplier in [1, 2]:
-            m = nn.Conv3d(2, 2 * depth_multiplier, kernel_size=3, groups=2).to(device, dtype)
-            i = torch.randn(2, 2, 6, 6, 6, device="cuda", dtype=dtype).div_(2).requires_grad_()
-            output = m(i)
-            grad_output = torch.randn(2, 2 * depth_multiplier, 4, 4, 4, device=device, dtype=dtype) / 2
-            output.backward(grad_output)
-
-            offset = 1 * depth_multiplier
-
-            m1 = nn.Conv3d(1, 1 * depth_multiplier, kernel_size=3).to(device, dtype)
-            m1.weight.data = m.weight.data[:offset].clone()
-            m1.bias.data = m.bias.data[:offset].clone()
-            i1 = i.detach()[:, :1].clone().requires_grad_()
-            output1 = m1(i1)
-            output1.backward(grad_output[:, :offset].contiguous())
-
-            m2 = nn.Conv3d(1, 1 * depth_multiplier, kernel_size=3).to(device, dtype)
-            m2.weight.data.copy_(m.weight.data[offset:])
-            m2.bias.data.copy_(m.bias.data[offset:])
-            i2 = i.detach()[:, 1:].clone().requires_grad_()
-            output2 = m2(i2)
-            output2.backward(grad_output[:, offset:].contiguous())
-            is_cuda_sm86 = device.startswith("cuda") and torch.cuda.get_device_capability(0) == (8, 6)
-            atol, rtol = (3e-4, 3e-2) if dtype == torch.float32 and is_cuda_sm86 else (dtype2prec_DONTUSE[dtype], 0)
-
-            self.assertEqual(output, torch.cat([output1, output2], 1),
-                             atol=atol, rtol=rtol)
-            self.assertEqual(i.grad.data,
-                             torch.cat([i1.grad.data, i2.grad.data], 1),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-            self.assertEqual(m.bias.grad.data,
-                             torch.cat([m1.bias.grad.data,
-                                        m2.bias.grad.data], 0),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-            self.assertEqual(m.weight.grad.data,
-                             torch.cat([m1.weight.grad.data,
-                                        m2.weight.grad.data], 0),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-
-
-    @onlyCUDA
-    @dtypes(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
-    def test_noncontig_conv_grad(self, device, dtype):
-        # FIXME: remove after adding non-contiguous grad tests for all modules
-        module = nn.Conv2d(3, 5, kernel_size=3, padding=1).to(device, dtype)
-        input = torch.randn(2, 3, 10, 10, dtype=dtype, device=device, requires_grad=True)
-        output = module(input)
-
-        grad = torch.randn(2, 2, 5, 10, 10, dtype=dtype, device=device)[:, 1]
-        assert not grad.is_contiguous()
-        output.backward(grad, retain_graph=True)
-        self.assertIsNotNone(input.grad)
-        result = input.grad.data.clone()
-        input.grad.data.zero_()
-
-        output.backward(grad.contiguous())
-        self.assertEqual(result, input.grad.data, atol=dtype2prec_DONTUSE[dtype], rtol=0)
-
-
     @onlyCUDA
     @dtypes(torch.float, torch.half)
     def test_batchnorm_large_batch(self, device, dtype):
@@ -12735,770 +11720,6 @@ def test_batchnorm_large_batch(self, device, dtype):
         data = torch.rand(880801, 1, 1, 1, device=device, dtype=dtype)
         out = bn(data).sum().backward()
 
-
-    @onlyCUDA
-    @dtypes(torch.double)
-    def test_conv_double_backward(self, device, dtype):
-        with torch.backends.cudnn.flags(deterministic=True):
-            # Double backward only runs with DoubleTensor due to precision reason
-            batch_size = 1
-            for kern, inp_size, dilations in [(3, 5, [1, 2]), (4, 9, [1])]:
-                for stride, padding, chan_in, chan_out, dilation in product([1], [2], [2], [3], dilations):
-                    no_weight = stride == 2
-                    result = self.run_conv_double_back_test(kern, stride,
-                                                            padding, chan_in, chan_out,
-                                                            batch_size, inp_size, dilation,
-                                                            no_weight, use_cuda=True, dtype=dtype)
-                    self.assertTrue(result,
-                                    "Conv double backward test failed with parameters:" +
-                                    "\nkern: " + str(kern) +
-                                    "\nstride: " + str(stride) +
-                                    "\npadding: " + str(padding) +
-                                    "\nchan_in: " + str(chan_in) +
-                                    "\nchan_out: " + str(chan_out) +
-                                    "\nbatch_size: " + str(batch_size) +
-                                    "\ninp_size: " + str(inp_size) +
-                                    "\ndilation: " + str(dilation))
-
-
-    def test_conv_double_backward_no_bias(self):
-        kern = 3
-        stride = 2
-        chan_in, chan_out = 2, 4
-        batch_size = 2
-        inp_size = 5
-        padding = 1
-        dilation = 1
-        no_weight = False
-        use_bias = True
-        result = self.run_conv_double_back_test(kern, stride,
-                                                padding, chan_in, chan_out,
-                                                batch_size, inp_size, dilation,
-                                                no_weight, use_bias=use_bias)
-        self.assertTrue(result,
-                        "Conv double backward test failed with parameters:" +
-                        "\nkern: " + str(kern) +
-                        "\nstride: " + str(stride) +
-                        "\npadding: " + str(padding) +
-                        "\nchan_in: " + str(chan_in) +
-                        "\nchan_out: " + str(chan_out) +
-                        "\nbatch_size: " + str(batch_size) +
-                        "\ninp_size: " + str(inp_size) +
-                        "\ndilation: " + str(dilation))
-
-
-    def test_conv_double_backward_groups(self):
-        kern = 3
-        stride = 1
-        padding = 2
-        chan_in, chan_out = 2, 4
-        batch_size = 2
-        inp_size = 6
-        dilation = 1
-        no_weight = False
-        groups = 2
-        result = self.run_conv_double_back_test(kern, stride,
-                                                padding, chan_in * groups, chan_out * groups,
-                                                batch_size, inp_size, dilation,
-                                                no_weight, groups=groups)
-        self.assertTrue(result,
-                        "Conv double backward test failed with parameters:" +
-                        "\nkern: " + str(kern) +
-                        "\nstride: " + str(stride) +
-                        "\npadding: " + str(padding) +
-                        "\nchan_in: " + str(chan_in) +
-                        "\nchan_out: " + str(chan_out) +
-                        "\nbatch_size: " + str(batch_size) +
-                        "\ninp_size: " + str(inp_size) +
-                        "\ndilation: " + str(dilation) +
-                        "\ngroups: " + str(groups))
-
-
-    def test_conv_double_backward_stride(self):
-        batch_size = 2
-
-        # Cannot provide ggW when stride is > 1
-        for kern, inp_size, dilations in [(3, 5, [1, 2]), (3, 7, [1])]:
-            for stride, padding, chan_in, chan_out, dilation in product([2], [0, 1], [1], [2], dilations):
-                no_weight = False
-                self.run_conv_double_back_test(kern, stride,
-                                               padding, chan_in, chan_out,
-                                               batch_size, inp_size, dilation,
-                                               no_weight)
-
-    @dtypes(torch.float, torch.cfloat)
-    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
-    def test_conv1d_same_padding(self, device, dtype):
-        # Test padding='same' outputs the correct shape
-        test_args = [
-            # in_size
-            range(50, 55),
-            # kernel_size
-            [1, 2, 3, 8],
-            # dilation
-            range(1, 4),
-            # stride
-            [1],
-        ]
-        for in_size, k_size, dilation, stride in itertools.product(*test_args):
-            x = torch.rand(1, 1, in_size, device=device, dtype=dtype)
-            y = torch.rand(1, 1, k_size, device=device, dtype=dtype)
-            z = F.conv1d(x, y, padding='same', dilation=dilation, stride=stride)
-            self.assertEqual(z.size(2), int(math.ceil(in_size / stride)))
-
-        # Compare F.conv1d padding='same' output against manual padding
-        # Without strides/dilation
-        x = torch.rand(1, 1, 12, device=device, dtype=dtype)
-        y = torch.rand(1, 1, 3, device=device, dtype=dtype)
-        expect = F.conv1d(x, y, padding=1)
-        actual = F.conv1d(x, y, padding='same')
-        self.assertEqual(expect, actual)
-
-        # With dilation
-        x = torch.rand(1, 1, 12, device=device, dtype=dtype)
-        y = torch.rand(1, 1, 4, device=device, dtype=dtype)
-        expect = F.conv1d(x, y, padding=3, dilation=2)
-        actual = F.conv1d(x, y, padding='same', dilation=2)
-        self.assertEqual(expect, actual)
-
-        # Dilation with asymmetric padding
-        expect = F.conv1d(x, y, padding=5, dilation=3)[..., 1:]
-        actual = F.conv1d(x, y, padding='same', dilation=3)
-        self.assertEqual(expect, actual)
-
-    @dtypes(torch.float, torch.cfloat)
-    def test_conv2d_same_padding(self, device, dtype):
-        if dtype is torch.cfloat:
-            rtol, atol = 2e-6, 2e-6
-        else:
-            rtol, atol = None, None
-        # Compare F.conv2d padding='same' output against manual padding
-        # Without strides/dilation
-        x = torch.rand(1, 1, 10, 11, device=device, dtype=dtype)
-        y = torch.rand(1, 1, 4, 5, device=device, dtype=dtype)
-        expect = F.conv2d(x, y, padding=(2, 2))[..., 1:, :]
-        actual = F.conv2d(x, y, padding='same')
-        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
-
-        # With dilation
-        y = torch.rand(1, 1, 3, 4, device=device, dtype=dtype)
-        expect = F.conv2d(x, y, padding=(2, 3), dilation=2)
-        actual = F.conv2d(x, y, padding='same', dilation=2)
-        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
-
-        # Dilation with asymmetric padding
-        y = torch.rand(1, 1, 4, 4, device=device, dtype=dtype)
-        expect = F.conv2d(x, y, padding=5, dilation=3)[..., 1:, 1:]
-        actual = F.conv2d(x, y, padding='same', dilation=3)
-        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
-
-    @dtypes(torch.float, torch.cfloat)
-    def test_conv3d_same_padding(self, device, dtype):
-        if dtype is torch.cfloat:
-            rtol, atol = 2e-6, 2e-6
-        else:
-            rtol, atol = None, None
-        # Compare F.conv3d padding='same' output against manual padding
-        # Without strides/dilation
-        x = torch.rand(1, 1, 10, 11, 12, device=device, dtype=dtype)
-        y = torch.rand(1, 1, 1, 2, 5, device=device, dtype=dtype)
-        expect = F.conv3d(x, y, padding=(0, 1, 2))[..., :, 1:, :]
-        actual = F.conv3d(x, y, padding='same')
-        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
-
-        # With dilation
-        expect = F.conv3d(x, y, padding=(0, 1, 4), dilation=2)
-        actual = F.conv3d(x, y, padding='same', dilation=2)
-        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
-
-        # Dilation with asymmetric padding
-        y = torch.rand(1, 1, 4, 4, 4, device=device, dtype=dtype)
-        expect = F.conv3d(x, y, padding=5, dilation=3)[..., 1:, 1:, 1:]
-        actual = F.conv3d(x, y, padding='same', dilation=3)
-        self.assertEqual(expect, actual, rtol=rtol, atol=atol)
-
-    @dtypes(torch.float, torch.cfloat)
-    def test_conv1d_valid_padding(self, device, dtype):
-        # Test F.conv1d padding='valid' is the same as no padding
-        x = torch.rand(1, 1, 10, device=device, dtype=dtype)
-        y = torch.rand(1, 1, 4, device=device, dtype=dtype)
-        expect = F.conv1d(x, y)
-        actual = F.conv1d(x, y, padding='valid')
-        self.assertEqual(expect, actual)
-
-    @dtypes(torch.float, torch.cfloat)
-    def test_conv2d_valid_padding(self, device, dtype):
-        # Test F.conv2d padding='valid' is the same as no padding
-        x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype)
-        y = torch.rand(1, 1, 1, 4, device=device, dtype=dtype)
-        expect = F.conv2d(x, y)
-        actual = F.conv2d(x, y, padding='valid')
-        self.assertEqual(expect, actual)
-
-    @dtypes(torch.float, torch.cfloat)
-    def test_conv3d_valid_padding(self, device, dtype):
-        # Test F.conv3d padding='valid' is the same as no padding
-        x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device)
-        y = torch.rand(1, 1, 1, 1, 4, dtype=dtype, device=device)
-        expect = F.conv3d(x, y)
-        actual = F.conv3d(x, y, padding='valid')
-        self.assertEqual(expect, actual)
-
-    @dtypes(torch.float, torch.cfloat)
-    def test_conv1d_same_padding_backward(self, device, dtype):
-        # Test F.conv1d gradients work with padding='same'
-        x = torch.rand(1, 1, 12, dtype=dtype, device=device, requires_grad=True)
-        y = torch.rand(1, 1, 4, dtype=dtype, device=device, requires_grad=True)
-
-        # Symmetric padding
-        z = F.conv1d(x, y, padding=3, dilation=2)
-        z.sum().backward()
-        gx_expect, gy_expect = x.grad, y.grad
-        x.grad, y.grad = None, None
-
-        z = F.conv1d(x, y, padding='same', dilation=2)
-        z.sum().backward()
-        self.assertEqual(gx_expect, x.grad)
-        self.assertEqual(gy_expect, y.grad)
-        x.grad, y.grad = None, None
-
-        # Asymmetric padding
-        z = F.conv1d(x, y, padding=2)[..., 1:]
-        z.sum().backward()
-        gx_expect, gy_expect = x.grad, y.grad
-        x.grad, y.grad = None, None
-
-        z = F.conv1d(x, y, padding='same')
-        z.sum().backward()
-        self.assertEqual(gx_expect, x.grad)
-        self.assertEqual(gy_expect, y.grad)
-
-    @dtypes(torch.float, torch.cfloat)
-    def test_conv2d_same_padding_backward(self, device, dtype):
-        # Test F.conv2d gradients work with padding='same'
-        x = torch.rand(1, 1, 10, 11, device=device, dtype=dtype, requires_grad=True)
-        y = torch.rand(1, 1, 4, 5, device=device, dtype=dtype, requires_grad=True)
-
-        # Symmetric padding
-        z = F.conv2d(x, y, padding=(3, 4), dilation=2)
-        z.sum().backward()
-        gx_expect, gy_expect = x.grad, y.grad
-        x.grad, y.grad = None, None
-
-        z = F.conv2d(x, y, padding='same', dilation=2)
-        z.sum().backward()
-        self.assertEqual(gx_expect, x.grad)
-        self.assertEqual(gy_expect, y.grad)
-        x.grad, y.grad = None, None
-
-        # Asymmetric padding
-        y = torch.rand(1, 1, 4, 4, device=device, dtype=dtype, requires_grad=True)
-        z = F.conv2d(x, y, padding=2)[..., 1:, 1:]
-        z.sum().backward()
-        gx_expect, gy_expect = x.grad, y.grad
-        x.grad, y.grad = None, None
-
-        z = F.conv2d(x, y, padding='same')
-        z.sum().backward()
-        self.assertEqual(gx_expect, x.grad)
-        self.assertEqual(gy_expect, y.grad)
-
-    @dtypes(torch.double, torch.cdouble)
-    def test_conv3d_same_padding_backward(self, device, dtype):
-        check_forward_ad = torch.device(device).type != 'xla'
-
-        # Test F.conv3d gradients work with padding='same'
-        x = torch.rand(1, 1, 1, 11, 12, dtype=dtype, device=device, requires_grad=True)
-        y = torch.rand(1, 1, 1, 2, 5, dtype=dtype, device=device, requires_grad=True)
-
-        # Symmetric padding
-        z = F.conv3d(x, y, padding=(0, 1, 4), dilation=2)
-        z.sum().backward()
-        gx_expect, gy_expect = x.grad, y.grad
-        x.grad, y.grad = None, None
-
-        z = F.conv3d(x, y, padding='same', dilation=2)
-        z.sum().backward()
-        self.assertEqual(gx_expect, x.grad)
-        self.assertEqual(gy_expect, y.grad)
-        x.grad, y.grad = None, None
-
-        gradcheck(lambda x, y: F.conv3d(x, y, padding='same', dilation=2), (x, y),
-                  check_forward_ad=check_forward_ad, nondet_tol=1e-5)
-        if torch.device(device).type != 'cuda':
-            # https://github.com/pytorch/pytorch/issues/70702
-            gradgradcheck(lambda x, y: F.conv3d(x, y, padding='same', dilation=2), (x, y),
-                          check_fwd_over_rev=True)
-
-        # Asymmetric padding
-        y = torch.rand(1, 1, 1, 4, 4, dtype=dtype, device=device, requires_grad=True)
-        z = F.conv3d(x, y, padding=2)[..., 1:, 1:]
-        z.sum().backward()
-        gx_expect, gy_expect = x.grad, y.grad
-        x.grad, y.grad = None, None
-
-        z = F.conv3d(x, y, padding='same')
-        z.sum().backward()
-        self.assertEqual(gx_expect, x.grad)
-        self.assertEqual(gy_expect, y.grad)
-
-        gradcheck(lambda x, y: F.conv3d(x, y, padding='same'), (x, y),
-                  check_forward_ad=check_forward_ad, nondet_tol=1e-5)
-        if torch.device(device).type != 'cuda':
-            # https://github.com/pytorch/pytorch/issues/70702
-            gradgradcheck(lambda x, y: F.conv3d(x, y, padding='same'), (x, y),
-                          check_fwd_over_rev=True)
-
-    @dtypes(torch.float, torch.cfloat)
-    def test_conv1d_valid_padding_backward(self, device, dtype):
-        # Test F.conv1d gradients work with padding='valid'
-        x = torch.rand(1, 1, 10, dtype=dtype, device=device, requires_grad=True)
-        y = torch.rand(1, 1, 4, dtype=dtype, device=device, requires_grad=True)
-        F.conv1d(x, y, padding=0).sum().backward()
-        gx_expect, gy_expect = x.grad, y.grad
-        x.grad, y.grad = None, None
-
-        F.conv1d(x, y, padding='valid').sum().backward()
-        gx_actual, gy_actual = x.grad, y.grad
-        self.assertEqual(gx_expect, gx_actual)
-        self.assertEqual(gy_expect, gy_actual)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.")
-    @dtypes(torch.float, torch.cfloat)
-    @parametrize_test("mode", ('valid', 'same'))
-    def test_conv1d_vs_scipy(self, device, dtype, mode):
-        t = make_tensor((1, 10), device=device, dtype=dtype)
-        feat_dim = t.shape[1]
-        weight_even = make_tensor((1, 1, 4), device=device, dtype=dtype)
-        weight_odd = make_tensor((1, 1, 5), device=device, dtype=dtype)
-
-        def _test(t, weight, mode):
-            # SciPy expects two 1-D inputs.
-            t_a = t.view(-1).cpu().numpy()
-            w_a = weight.view(-1).cpu().numpy()
-            expected = scipy.signal.convolve(t_a, w_a, mode=mode)
-
-            kwargs = {'padding': mode}
-            if mode == 'same':
-                # `same` padding in PyTorch conv1d is different
-                # from SciPy
-                p = weight.shape[2] // 2
-                t = torch.nn.functional.pad(t, (p, p))
-                # We have already taken care of padding
-                kwargs.pop("padding")
-
-            # second input is flipped in SciPy's convolve
-            weight_flipped = torch.flip(weight, (2,))
-            actual = torch.nn.functional.conv1d(t, weight_flipped, **kwargs).squeeze(0)
-            if mode == 'same':
-                actual = actual[:feat_dim]
-
-            self.assertEqual(actual, expected)
-
-        # Global dtype for this test suite is torch.double
-        # This leads to change in type-promotion
-        # and conv1d outputs `complex128` for `complex64` input.
-        with set_default_dtype(torch.float):
-            _test(t, weight_even, mode)
-            _test(t, weight_odd, mode)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.")
-    @dtypes(torch.float, torch.cfloat)
-    @parametrize_test("mode", ('valid', 'same'))
-    def test_conv2d_vs_scipy(self, device, dtype, mode):
-        t = make_tensor((1, 5, 10), device=device, dtype=dtype)
-        weight_even = make_tensor((1, 1, 2, 4), device=device, dtype=dtype)
-        weight_odd = make_tensor((1, 1, 3, 5), device=device, dtype=dtype)
-
-        def _test(t, weight, mode):
-            # SciPy expects two 2-D inputs.
-            t_a = t.squeeze(0).cpu().numpy()
-            w_a = weight.squeeze(0).squeeze(0).cpu().numpy()
-            expected = scipy.signal.convolve2d(t_a, w_a, mode=mode)
-
-            kwargs = {'padding': mode}
-            if mode == 'same':
-                # `same` padding in PyTorch conv2d is different
-                # from SciPy
-                left_right_pad = weight.shape[3] // 2
-                top_bottom_pad = weight.shape[2] // 2
-                p = (left_right_pad, left_right_pad, top_bottom_pad, top_bottom_pad)
-                t = torch.nn.functional.pad(t, p)
-                # We have already taken care of padding
-                kwargs.pop("padding")
-
-            # second input is flipped in SciPy's convolve2d
-            weight_flipped = torch.flip(weight, (2, 3))
-            actual = torch.nn.functional.conv2d(t, weight_flipped, **kwargs).squeeze(0)
-            if mode == 'same':
-                actual = actual[:5, :10]
-
-            self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6)
-
-        # Global dtype for this test suite is torch.double
-        # This leads to change in type-promotion
-        # and conv1d outputs `complex128` for `complex64` input.
-        with set_default_dtype(torch.float):
-            _test(t, weight_even, mode)
-            _test(t, weight_odd, mode)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.")
-    @dtypes(torch.float, torch.cfloat)
-    @parametrize_test("mode", ('valid', 'same'))
-    def test_conv3d_vs_scipy(self, device, dtype, mode):
-        t = make_tensor((1, 5, 5, 10), device=device, dtype=dtype)
-        weight_even = make_tensor((1, 1, 2, 2, 4), device=device, dtype=dtype)
-        weight_odd = make_tensor((1, 1, 2, 3, 5), device=device, dtype=dtype)
-
-        def _test(t, weight, mode):
-            # SciPy expects two 3-D inputs.
-            t_a = t.squeeze(0).cpu().numpy()
-            w_a = weight.squeeze(0).squeeze(0).cpu().numpy()
-            expected = scipy.signal.convolve(t_a, w_a, mode=mode)
-
-            kwargs = {'padding': mode}
-            if mode == 'same':
-                # `same` padding in PyTorch conv3d is different
-                # from SciPy
-                left_right_pad = weight.shape[4] // 2
-                top_bottom_pad = weight.shape[3] // 2
-                front_back_pad = weight.shape[2] // 2
-                p = (left_right_pad, left_right_pad, top_bottom_pad, top_bottom_pad,
-                     front_back_pad, front_back_pad)
-                t = torch.nn.functional.pad(t, p)
-                # We have already taken care of padding
-                kwargs.pop("padding")
-
-            # second input is flipped in SciPy's convolve
-            weight_flipped = torch.flip(weight, (2, 3, 4))
-            actual = torch.nn.functional.conv3d(t, weight_flipped, **kwargs).squeeze(0)
-            if mode == 'same':
-                actual = actual[:5, :5, :10]
-
-            if tf32_is_not_fp32() and (dtype == torch.float or dtype == torch.complex64):
-                self.assertEqual(actual, expected, atol=0.05, rtol=0.05)
-            else:
-                self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6)
-
-        # Global dtype for this test suite is torch.double
-        # This leads to change in type-promotion
-        # and conv1d outputs `complex128` for `complex64` input.
-        with set_default_dtype(torch.float):
-            _test(t, weight_even, mode)
-            _test(t, weight_odd, mode)
-
-    @dtypes(torch.float, torch.complex64)
-    def test_conv2d_valid_padding_backward(self, device, dtype):
-        # Test F.conv2d gradients work with padding='valid'
-        x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype, requires_grad=True)
-        y = torch.rand(1, 1, 1, 4, device=device, dtype=dtype, requires_grad=True)
-        F.conv2d(x, y, padding=0).sum().backward()
-        gx_expect, gy_expect = x.grad, y.grad
-        x.grad, y.grad = None, None
-
-        F.conv2d(x, y, padding='valid').sum().backward()
-        gx_actual, gy_actual = x.grad, y.grad
-        self.assertEqual(gx_expect, gx_actual)
-        self.assertEqual(gy_expect, gy_actual)
-
-    @dtypes(torch.double, torch.cdouble)
-    def test_conv3d_valid_padding_backward(self, device, dtype):
-        check_forward_ad = torch.device(device).type != 'xla'
-
-        # Test F.conv3d gradients work with padding='valid'
-        x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device, requires_grad=True)
-        y = torch.rand(1, 1, 1, 1, 4, dtype=dtype, device=device, requires_grad=True)
-        F.conv3d(x, y, padding=0).sum().backward()
-        gx_expect, gy_expect = x.grad, y.grad
-        x.grad, y.grad = None, None
-
-        F.conv3d(x, y, padding='valid').sum().backward()
-        gx_actual, gy_actual = x.grad, y.grad
-        self.assertEqual(gx_expect, gx_actual)
-        self.assertEqual(gy_expect, gy_actual)
-
-        gradcheck(lambda x, y: F.conv3d(x, y, padding='valid'), (x, y), check_forward_ad=check_forward_ad)
-        gradgradcheck(lambda x, y: F.conv3d(x, y, padding='valid'), (x, y), check_fwd_over_rev=check_forward_ad)
-
-    @parametrize_test("N", range(2, 4), name_fn=lambda N: 'ConvTranspose{}d'.format(N))
-    def test_conv_transpose_with_output_size_and_no_batch_dim(self, device, N):
-        # For inputs with no batch dim, verify output is the correct shape when output_size is set.
-        # See https://github.com/pytorch/pytorch/issues/75889
-        inp = torch.randn((1, 15, 13) if N == 2 else (1, 15, 13, 13), device=device)
-        output_size = (1, 240, 200) if N == 2 else (1, 240, 200, 200)
-        ConvTransposeNd = getattr(nn, 'ConvTranspose{}d'.format(N))
-        m = ConvTransposeNd(1, 1, kernel_size=16, stride=16, padding=7, bias=False, device=device)
-        output = m(inp, output_size=output_size)
-        self.assertEqual(output.shape, output_size)
-
-    @skipMeta
-    @parametrize_test("input_shape,transposed,dilated,groups,layout,backend_expected", [
-        # === slow ===
-        subtest(((2, 6, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Slow2d),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow1d'),
-        subtest(((2, 6, 7), True, False, 3, torch.strided, torch._C._ConvBackend.SlowTranspose2d),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow1d_transposed'),
-        subtest(((2, 6, 7), False, True, 3, torch.strided, torch._C._ConvBackend.SlowDilated2d),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow1d_dilated'),
-        subtest(((2, 6, 7), True, True, 3, torch.strided, torch._C._ConvBackend.SlowTranspose2d),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow1d_dilated_transposed'),
-        subtest(((2, 6, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Slow2d),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow2d'),
-        subtest(((2, 6, 7, 8), True, False, 3, torch.strided, torch._C._ConvBackend.SlowTranspose2d),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow2d_transposed'),
-        subtest(((2, 6, 7, 8), False, True, 3, torch.strided, torch._C._ConvBackend.SlowDilated2d),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow2d_dilated'),
-        subtest(((2, 6, 7, 8), True, True, 3, torch.strided, torch._C._ConvBackend.SlowTranspose2d),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow2d_dilated_transposed'),
-        subtest(((2, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Slow3d),
-                decorators=[onlyCPU, disableMkldnn], name='slow3d_cpu'),
-        # CUDA doesn't have a slow 3D implementation, so it goes to the dilated 3D implementation instead
-        subtest(((2, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.SlowDilated3d),
-                decorators=[onlyCUDA, disablecuDNN], name='slow3d_cuda'),
-        # FIXME: RuntimeError: CUDA out of memory.
-        # subtest(((2, 6, 7, 8, 9), True, False, 3, torch.strided, torch._C._ConvBackend.SlowTranspose3d),
-        #         decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow3d_transposed'),
-        subtest(((2, 6, 7, 8, 9), False, True, 3, torch.strided, torch._C._ConvBackend.SlowDilated3d),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow3d_dilated'),
-        # FIXME: RuntimeError: CUDA out of memory.
-        # subtest(((2, 6, 7, 8, 9), True, True, 3, torch.strided, torch._C._ConvBackend.SlowTranspose3d),
-        #         decorators=[onlyNativeDeviceTypes, disableMkldnn, disablecuDNN], name='slow3d_dilated_transposed'),
-        subtest(((0, 6, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch1d'),
-        subtest(((2, 0, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_channel1d'),
-        subtest(((0, 0, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch_channel1d'),
-        subtest(((0, 6, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch2d'),
-        subtest(((2, 0, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_channel2d'),
-        subtest(((0, 0, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch_channel2d'),
-        subtest(((0, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch3d'),
-        subtest(((2, 0, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_channel3d'),
-        subtest(((0, 0, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Empty),
-                decorators=[onlyNativeDeviceTypes, disableMkldnn], name='empty_batch_channel3d'),
-        # === cuda ===
-        # Note that disablecuDNN disables miopen as well.
-        subtest(((2, 6, 7), False, False, 6, torch.strided, torch._C._ConvBackend.CudaDepthwise2d),
-                decorators=[onlyCUDA, disablecuDNN], name='cuda_depthwise1d'),
-        subtest(((2, 6, 7, 8), False, False, 6, torch.strided, torch._C._ConvBackend.CudaDepthwise2d),
-                decorators=[onlyCUDA, disablecuDNN], name='cuda_depthwise2d'),
-        subtest(((2, 6, 7, 8, 9), False, False, 6, torch.strided, torch._C._ConvBackend.CudaDepthwise3d),
-                decorators=[onlyCUDA, disablecuDNN], name='cuda_depthwise3d'),
-        # === cudnn ===
-        subtest(((2, 6, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Cudnn),
-                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn1d'),
-        subtest(((2, 6, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Cudnn),
-                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn2d'),
-        subtest(((2, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Cudnn),
-                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn3d'),
-        subtest(((2, 6, 7), True, False, 3, torch.strided, torch._C._ConvBackend.CudnnTranspose),
-                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn1d_transposed'),
-        subtest(((2, 6, 7, 8), True, False, 3, torch.strided, torch._C._ConvBackend.CudnnTranspose),
-                decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn2d_transposed'),
-        # FIXME: RuntimeError: CUDA out of memory.
-        # subtest(((2, 6, 7, 8, 9), True, False, 3, torch.strided, torch._C._ConvBackend.CudnnTranspose),
-        #         decorators=[onlyCUDA, skipCUDAIfNoCudnn, skipCUDAIfMiopen], name='cudnn3d_transposed'),
-        # === miopen ===
-        subtest(((2, 6, 7), False, False, 3, torch.strided, torch._C._ConvBackend.Miopen),
-                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen1d'),
-        subtest(((2, 6, 7, 8), False, False, 3, torch.strided, torch._C._ConvBackend.Miopen),
-                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen2d'),
-        subtest(((2, 6, 7, 8, 9), False, False, 3, torch.strided, torch._C._ConvBackend.Miopen),
-                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen3d'),
-        subtest(((2, 6, 7), True, False, 3, torch.strided, torch._C._ConvBackend.MiopenTranspose),
-                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen1d_transposed'),
-        subtest(((2, 6, 7, 8), True, False, 3, torch.strided, torch._C._ConvBackend.MiopenTranspose),
-                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen2d_transposed'),
-        subtest(((2, 6, 7, 8, 9), True, False, 3, torch.strided, torch._C._ConvBackend.MiopenTranspose),
-                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen3d_transposed'),
-        subtest(((2, 6, 7), False, False, 6, torch.strided, torch._C._ConvBackend.MiopenDepthwise),
-                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen_depthwise1d'),
-        subtest(((2, 6, 7, 8), False, False, 6, torch.strided, torch._C._ConvBackend.MiopenDepthwise),
-                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen_depthwise2d'),
-        subtest(((2, 6, 7, 8, 9), False, False, 6, torch.strided, torch._C._ConvBackend.MiopenDepthwise),
-                decorators=[onlyCUDA, skipCUDAIfNoMiopen], name='miopen_depthwise3d'),
-        # === mkldnn ===
-        subtest(((2, 6, 7), False, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn1d'),
-        subtest(((2, 6, 7, 8), False, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn2d'),
-        subtest(((2, 6, 7, 8, 9), False, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn3d'),
-        # Transposed convolution is broken for mkldnn. See https://github.com/pytorch/pytorch/issues/68775.
-        subtest(((2, 6, 7), True, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn, unittest.expectedFailure], name='mkldnn1d_transposed'),
-        subtest(((2, 6, 7, 8), True, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn, unittest.expectedFailure], name='mkldnn2d_transposed'),
-        subtest(((2, 6, 7, 8, 9), True, False, 3, torch._mkldnn, torch._C._ConvBackend.Mkldnn),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn, unittest.expectedFailure], name='mkldnn3d_transposed'),
-        subtest(((2, 6, 7), False, True, 3, torch.strided, torch._C._ConvBackend.Mkldnn),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn1d_cpu_input'),
-        subtest(((2, 6, 7, 8), False, True, 3, torch.strided, torch._C._ConvBackend.Mkldnn),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn2d_cpu_input'),
-        subtest(((2, 6, 7, 8, 9), False, True, 3, torch.strided, torch._C._ConvBackend.Mkldnn),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn3d_cpu_input'),
-        subtest(((0, 6, 7), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch1d'),
-        subtest(((2, 0, 7), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_channel1d'),
-        subtest(((0, 0, 7), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch_channel1d'),
-        subtest(((0, 6, 7, 8), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch2d'),
-        subtest(((2, 0, 7, 8), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_channel2d'),
-        subtest(((0, 0, 7, 8), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch_channel2d'),
-        subtest(((0, 6, 7, 8, 9), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch3d'),
-        subtest(((2, 0, 7, 8, 9), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_channel3d'),
-        subtest(((0, 0, 7, 8, 9), False, False, 3, torch._mkldnn, torch._C._ConvBackend.MkldnnEmpty),
-                decorators=[onlyCPU, skipCPUIfNoMkldnn], name='mkldnn_empty_batch_channel3d'),
-        # Note: Tests for mobile backends are not currently supported. This comprises
-        # NnpackSpatial, Winograd3x3Depthwise, and Xnnpack2d backends. Testing these
-        # requires the ability to gate tests by whether PyTorch is built with USE_MOBILE=1.
-    ])
-    # Test with both bias and no bias.
-    @parametrize_test("has_bias", [False, True])
-    # Test with both stride=1 and stride>1 cases.
-    @parametrize_test("strided", [False, True])
-    # Test with both contiguous and non-contiguous inputs.
-    @parametrize_test("contiguous", [False, True])
-    def test_conv_backend(
-            self, device, input_shape, has_bias, strided, contiguous, transposed, dilated, groups,
-            layout, backend_expected):
-        # Build up inputs.
-        dtype = torch.float32
-        C_in, C_out, dim, kernel_size = input_shape[1], 12, len(input_shape) - 2, 3
-        x = torch.randn(*input_shape, device=device, dtype=dtype, requires_grad=True)
-        weight = torch.randn(C_in if transposed else C_out,
-                             C_out // groups if transposed else C_in // groups,
-                             *[kernel_size for _ in range(dim)],
-                             device=device, dtype=dtype, requires_grad=True)
-        bias = torch.randn(C_out, device=device, dtype=dtype, requires_grad=True) if has_bias else None
-
-        def _make_noncontiguous(inp):
-            if inp is None:
-                return None
-            old_requires_grad = inp.requires_grad
-            inp = torch.repeat_interleave(inp, 2, dim=-1)
-            inp = inp[..., ::2].detach().requires_grad_(old_requires_grad)
-            return inp
-
-        if not contiguous:
-            x = _make_noncontiguous(x)
-            weight = _make_noncontiguous(weight)
-            bias = _make_noncontiguous(bias)
-
-        if layout is torch._mkldnn:
-            x = x.to_mkldnn()
-            # Note that weight and bias are not supported as mkldnn tensors during training.
-
-        stride = (2,) * dim if strided else (1,) * dim
-        padding = (0,) * dim
-        dilation = (2,) * dim if dilated else (1,) * dim
-        output_padding = (0,) * dim
-        inputs = [x, weight, bias, stride, padding, dilation, transposed, output_padding, groups]
-
-        # Ensure correct backend is selected.
-        backend_actual = torch._C._select_conv_backend(*inputs)
-        self.assertEqual(backend_actual, backend_expected)
-
-        # Ensure backward call succeeds.
-        convolution = torch.ops.aten.convolution
-        output = convolution(*inputs)
-        grad_output = torch.randn(output.shape, device=device, dtype=dtype)
-        if not contiguous:
-            grad_output = _make_noncontiguous(grad_output)
-        if layout is torch._mkldnn:
-            grad_output = grad_output.to_mkldnn()
-        output.backward(grad_output)
-
-        # mkldnn doesn't support gradcheck :(
-        if layout is torch._mkldnn:
-            return
-
-        if backend_actual != torch._C._ConvBackend.Empty:  # FIXME: forward AD fails
-            # Forward AD and forward-over-reverse AD smoke test in float32
-            # TODO: remove this if we introduce per-op gradient tests for float32
-            with fwAD.dual_level():
-                dual_inputs = [(fwAD.make_dual(i, torch.rand_like(i)) if isinstance(i, torch.Tensor) else i) for i in inputs]
-                # Forward AD
-                output = convolution(*dual_inputs)
-                # Forward over reverse AD
-                grad_output_d = fwAD.make_dual(torch.rand_like(output), torch.rand_like(output))
-                if has_bias:
-                    torch.autograd.grad(output, [x, weight, bias], grad_output_d)
-                else:
-                    torch.autograd.grad(output, [x, weight], grad_output_d)
-
-        # Convert to float64 for gradcheck.
-        x = x.to(torch.float64).detach().requires_grad_(True)
-        weight = weight.to(torch.float64).detach().requires_grad_(True)
-        if bias is not None:
-            bias = bias.to(torch.float64).detach().requires_grad_(True)
-        inputs = [x, weight, bias, stride, padding, dilation, transposed, output_padding, groups]
-
-        # Set some backend-specific validation settings.
-        gradcheck_nondet_tol = 0.0
-        if torch.backends.cudnn.is_available():
-            # cuDNN introduces non-determinism
-            gradcheck_nondet_tol = GRADCHECK_NONDET_TOL
-
-        self.assertTrue(gradcheck(convolution, inputs, nondet_tol=gradcheck_nondet_tol))
-
-        # double backward doesn't support bias gradients
-        if bias is not None:
-            bias.requires_grad_(False)
-        self.assertTrue(gradgradcheck(convolution, inputs, nondet_tol=gradcheck_nondet_tol))
-
-
-    @onlyCPU
-    def test_conv_contiguous_for_oneDNN(self):
-        # See https://github.com/pytorch/pytorch/issues/80837.
-        for dtype in [torch.float, torch.bfloat16]:
-            conv = nn.Conv2d(
-                1,
-                128,
-                kernel_size=(5, 2),
-                stride=(2, 1),
-                padding=(0, 1),
-                dilation=(1, 1),
-                groups=1,
-                bias=True,
-                padding_mode='zeros').to(dtype=dtype)
-
-            x = torch.rand([1, 2, 321, 201, 1]).to(dtype=dtype)
-            x = torch.transpose(x, 1, 4)
-            x2 = x[..., 0]
-            inputs = [x2, conv.weight, conv.bias, (2, 1), (0, 1), (1, 1), False, (0, 1), 1]
-            if torch.backends.mkldnn.is_available():
-                y = conv(x2)
-                # Disable MKLDNN explicitly
-                with torch.backends.mkldnn.flags(enabled=False):
-                    y_ = conv(x2)
-                    self.assertEqual(y, y_)
-
-    @onlyCPU
-    def test_conv_ic1_channels_last_for_oneDNN(self):
-        # See https://github.com/pytorch/pytorch/issues/82060, N > 1 will call in OneDNN path.
-        for dtype in [torch.float, torch.bfloat16]:
-            conv = torch.nn.Conv2d(1, 64, kernel_size=(3, 3), padding=(1, 1), bias=False)
-            conv = conv.to(memory_format=torch.channels_last).to(dtype=dtype)
-            x = torch.rand(2, 1, 100, 100).to(dtype=dtype)
-            if torch.backends.mkldnn.is_available():
-                y = conv(x)
-                # Disable MKLDNN explicitly
-                with torch.backends.mkldnn.flags(enabled=False):
-                    y_ = conv(x)
-                    self.assertEqual(y, y_)
-
     def test_InstanceNorm1d_general(self, device):
         b = random.randint(3, 5)
         c = random.randint(3, 5)
@@ -14116,57 +12337,6 @@ def test_BatchNorm_empty(self, device):
         self.assertEqual(mod.weight.grad, torch.tensor([0., 0, 0], device=device))
         self.assertEqual(mod.bias.grad, torch.tensor([0., 0, 0], device=device))
 
-    @dtypes(torch.float, torch.cfloat)
-    def test_conv_empty_channel(self, device, dtype):
-        in_channels = 0
-        mod = torch.nn.Conv1d(in_channels, 8, 2, stride=2, dtype=dtype).to(device)
-        inp = torch.randn(2, 0, 15, device=device, dtype=dtype)
-        _test_module_empty_input(self, mod, inp, check_size=False)
-
-        with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"):
-            inp = torch.randn(2, 1, 0, device=device, dtype=dtype)
-            mod(inp)
-
-        mod = torch.nn.Conv2d(in_channels, 33, 3, stride=2, dtype=dtype).to(device)
-        inp = torch.randn(2, 0, 50, 100, device=device, dtype=dtype)
-        _test_module_empty_input(self, mod, inp, check_size=False)
-
-        with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"):
-            inp = torch.randn(2, 1, 40, 0, device=device, dtype=dtype)
-            mod(inp)
-
-        mod = torch.nn.Conv3d(in_channels, 33, 3, stride=2, dtype=dtype).to(device)
-        inp = torch.randn(2, 0, 50, 20, 40, device=device, dtype=dtype)
-        _test_module_empty_input(self, mod, inp, check_size=False)
-
-        with self.assertRaisesRegex(RuntimeError, "Given groups=1, weight"):
-            inp = torch.randn(2, 1, 50, 0, 40, device=device, dtype=dtype)
-            mod(inp)
-
-    def test_group_conv_empty(self, device):
-        mod = torch.nn.Conv2d(4, 4, stride=2, kernel_size=3, padding=1, groups=4).to(device)
-        inp = torch.randn(0, 4, 4, 4, device=device)
-        _test_module_empty_input(self, mod, inp, check_size=False)
-        if self.device_type == 'cuda' and self.has_cudnn():
-            with torch.backends.cudnn.flags(enabled=False):
-                _test_module_empty_input(self, mod, inp, check_size=False)
-
-    def test_group_convTranspose_empty(self, device):
-        mod = torch.nn.ConvTranspose2d(4, 4, stride=2, kernel_size=3, padding=1, groups=4).to(device)
-        inp = torch.randn(0, 4, 4, 4, device=device)
-        _test_module_empty_input(self, mod, inp, check_size=False)
-        if self.device_type == 'cuda' and self.has_cudnn():
-            with torch.backends.cudnn.flags(enabled=False):
-                _test_module_empty_input(self, mod, inp, check_size=False)
-
-    def test_convTranspose_empty(self, device):
-        mod = torch.nn.ConvTranspose2d(4, 4, stride=2, kernel_size=3, padding=1).to(device)
-        inp = torch.randn(0, 4, 4, 4, device=device)
-        _test_module_empty_input(self, mod, inp, check_size=False)
-        if self.device_type == 'cuda' and self.has_cudnn():
-            with torch.backends.cudnn.flags(enabled=False):
-                _test_module_empty_input(self, mod, inp, check_size=False)
-
     @onlyCUDA
     @largeTensorTest('16GB')
     def test_prelu_backward_32bit_indexing(self, device):
@@ -15155,69 +13325,6 @@ def _test_helper(shape):
             # test non-persistent softmax kernel
             _test_helper((4, 1536))
 
-    @onlyCUDA
-    @largeTensorTest('12GB')
-    def test_conv_large_nosplit(self, device):
-        # Here we just test the convolution correctly route to the fallback implementation
-        # that is, it does not crash. The correctness of fallback implementation should be
-        # covered in other tests
-        dtype = torch.half if self.device_type == 'cuda' else torch.float
-        conv1 = nn.Conv2d(2, 2, 8, 8).to(device).to(dtype)
-        input_large = torch.randn(1, 2, 1024, 1024 * 1024, dtype=dtype, device=device)
-        conv1(input_large)
-        conv2 = torch.nn.Conv2d(1, 1024, 1, 1).to(device).to(dtype)
-        input_large = torch.randn(1, 1, 2048, 1024 , dtype=dtype, device=device)
-        conv2(input_large)
-
-    @onlyCUDA
-    @largeTensorTest('40GB')
-    @largeTensorTest('24GB', 'cpu')
-    def test_conv3d_64bit_indexing(self, device):
-        x = torch.rand(1, 32, 512, 512, 256)
-        m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)
-        yref = m(x)
-        y = m.to(device=device)(x.to(device=device))
-        self.assertEqual(yref, y)
-
-    def test_conv_noncontig_weights(self, device):
-        for dim in (1, 2, 3):
-            for grouped in (False, True):
-                nc = 3
-                groups = 3 if grouped else 1
-                w = torch.randn([3] * dim, device=device)
-                w = w.expand([nc, int(nc / groups)] + list(w.shape))
-                w = w.detach().requires_grad_()
-                x = torch.randn([1, nc] + ([5] * dim), device=device, requires_grad=True)
-                y = getattr(F, 'conv{}d'.format(dim))(x, w, groups=groups)
-                y.sum().backward()
-                y = getattr(F, 'conv_transpose{}d'.format(dim))(x, w, groups=groups)
-                y.sum().backward()
-
-    def test_conv_noncontig_weights_and_bias(self, device):
-        # need floats to exercise https://github.com/pytorch/pytorch/issues/16018
-        for bias in [True, False]:
-            conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
-                              bias=bias).to(device, torch.float)
-
-            input_nc = torch.randn((1, 3, 224, 224, 2), device=device, dtype=torch.float)[:, :, :, :, 1]
-            input_c = input_nc.contiguous()
-
-            weight_nc = torch.randn((64, 3, 7, 7, 2), device=device, dtype=torch.float)[:, :, :, :, 1]
-            conv1.weight = nn.Parameter(weight_nc)
-            weight_c = conv1.weight.contiguous()
-
-            if bias:
-                bias_nc = torch.randn((64, 2), device=device, dtype=torch.float)[:, 1]
-                conv1.bias = nn.Parameter(bias_nc)
-                bias_c = conv1.bias.contiguous()
-
-            out1 = conv1(input_nc)
-            conv1.weight = nn.Parameter(weight_c)
-            if bias:
-                conv1.bias = nn.Parameter(bias_c)
-            out2 = conv1(input_c)
-            self.assertEqual(out1, out2)
-
     def test_save_lstm_compatibility(self, device):
         # Test that saving an LSTM in PyTorch 1.7 and older can still be
         # loaded in newer versions of PyTorch.
@@ -15368,63 +13475,6 @@ def test_grid_sample_large_index_3d(self, device, dtype):
             small_image.grad.zero_()
             large_view.grad.zero_()
 
-    @onlyCUDA
-    @largeTensorTest('12GB')
-    def test_conv_transposed_large(self, device):
-        dtype = torch.half if self.device_type == 'cuda' else torch.float
-        conv = nn.ConvTranspose2d(1, 1, 1, 1, bias=False).to(device).to(dtype)
-        input_large = torch.randn(4096, 1, 512, 1024, dtype=dtype, device=device)
-        # forward
-        ret = conv(input_large)
-        maxdiff0 = (ret.narrow(0, 0, 1024) - conv(input_large.narrow(0, 0, 1024))).abs_().max().item()
-        maxdiff1 = (ret.narrow(0, 1024, 1024) - conv(input_large.narrow(0, 1024, 1024))).abs_().max().item()
-        maxdiff2 = (ret.narrow(0, 2048, 1024) - conv(input_large.narrow(0, 2048, 1024))).abs_().max().item()
-        maxdiff3 = (ret.narrow(0, 3072, 1024) - conv(input_large.narrow(0, 3072, 1024))).abs_().max().item()
-        if self.device_type == 'cuda':
-            # cuDNN may use algorithms such as FFT that don't guarantee a diff of 0
-            self.assertEqual(maxdiff0, 0, atol=2e-3, rtol=1e-5)
-            self.assertEqual(maxdiff1, 0, atol=2e-3, rtol=1e-5)
-            self.assertEqual(maxdiff2, 0, atol=2e-3, rtol=1e-5)
-            self.assertEqual(maxdiff3, 0, atol=2e-3, rtol=1e-5)
-        else:
-            self.assertEqual(maxdiff0, 0)
-            self.assertEqual(maxdiff1, 0)
-            self.assertEqual(maxdiff2, 0)
-            self.assertEqual(maxdiff3, 0)
-
-    @onlyCUDA
-    @skipCUDAIfRocm
-    @largeTensorTest('12GB')
-    def test_conv_large(self, device):
-        dtype = torch.half if self.device_type == 'cuda' else torch.float
-        conv = nn.Conv2d(2, 2, 8, 8, bias=False).to(device).to(dtype)
-        input_large = torch.randn(4097, 2, 512, 512, dtype=dtype, device=device)
-        # forward
-        ret = conv(input_large)
-        self.assertEqual(ret[:2048], conv(input_large[:2048]))
-        self.assertEqual(ret[2048:4096], conv(input_large[2048:4096]))
-        self.assertEqual(ret[4096:], conv(input_large[4096:]))
-
-        # backward
-        conv.zero_grad()
-        # When computing the backward, we are using the `max(dim=1)`` to create
-        # some sparsity. Without this sparsity, the rounding error would be
-        # too large (as large as 1e-5) to satisfy the creterion (1e-6) of `assertEqual`
-        ret.view(4097, -1).max(dim=1).values.sum().backward()
-        del ret
-        grad1 = conv.weight.grad.detach().clone()
-        conv.zero_grad()
-        conv(input_large[:2048]).view(2048, -1).max(dim=1).values.sum().backward()
-        conv(input_large[2048:4096]).view(2048, -1).max(dim=1).values.sum().backward()
-        conv(input_large[4096:]).view(1, -1).max(dim=1).values.sum().backward()
-        grad2 = conv.weight.grad.detach().clone()
-        # gradients are at the order of hundreds, we need to scale it to
-        # the order of one so that we can compare
-        scale = 1 / grad2.abs().mean()
-        grad1 = grad1 * scale
-        grad2 = grad2 * scale
-        self.assertEqual(grad1, grad2, atol=5e-2, rtol=5e-3)
-
     def _test_gumbel_softmax_st_shapes(self, device, dtype, shape, dim, count_expected):
         logits = torch.randn(shape, dtype=torch.float, device=device)
         logits = logits.to(dtype)
@@ -15734,76 +13784,6 @@ def test_CTCLoss_no_batch_dim(self, device, reduction, use_module_form):
         self._assertEqual_list((input_length, 1, vocab_size), [t.grad.shape for t in log_probs_refs])
         self._assertEqual_list((input_length, vocab_size), [t.grad.shape for t in log_probs_no_bd_refs])
 
-    @onlyCUDA
-    @skipCUDAIfNoCudnn
-    def test_contig_wrong_stride_cudnn(self, device):
-        # x has to have batch_size 1 to test contiguous checks
-        x = torch.randn(1, 16, 5, 5, device=device)
-        stride = list(x.stride())
-        stride[0] = 20
-        # change the stride in dimension 0. the tensor is still contiguous because size[0] is 1
-        x.set_(x.storage(), 0, x.size(), stride)
-        self.assertTrue(x.is_contiguous())
-        F.conv_transpose2d(x, torch.randn(16, 1, 1, 1, device=device))
-        F.conv2d(x, torch.randn(1, 16, 1, 1, device=device))
-
-    @onlyCUDA
-    def test_Conv2d_size_1_kernel(self, device):
-        x_cpu = torch.randn(2, 3, 5, 5)
-        conv_cpu = torch.nn.Conv2d(3, 3, kernel_size=1)
-        y_cpu = conv_cpu(x_cpu)
-        y = torch.rand_like(y_cpu)
-        y_cpu.backward(y)
-
-        with cudnn.flags(enabled=False):
-            conv_cuda = torch.nn.Conv2d(3, 3, kernel_size=1).to(device)
-            conv_cuda.bias.data.copy_(conv_cpu.bias.data)
-            conv_cuda.weight.data.copy_(conv_cpu.weight.data)
-            y_cuda = conv_cuda(x_cpu.to(device))
-            y_cuda.backward(y.to(device))
-
-        self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False)
-        self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False)
-        self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False)
-
-    @onlyCUDA
-    def test_ConvTranspose2d_size_1_kernel(self, device):
-        x_cpu = torch.randn(2, 3, 5, 5)
-        conv_cpu = torch.nn.ConvTranspose2d(3, 3, kernel_size=1)
-        y_cpu = conv_cpu(x_cpu)
-        y = torch.rand_like(y_cpu)
-        y_cpu.backward(y)
-
-        with cudnn.flags(enabled=False):
-            conv_cuda = torch.nn.ConvTranspose2d(3, 3, kernel_size=1).to(device)
-            conv_cuda.bias.data.copy_(conv_cpu.bias.data)
-            conv_cuda.weight.data.copy_(conv_cpu.weight.data)
-            y_cuda = conv_cuda(x_cpu.to(device))
-            y_cuda.backward(y.to(device))
-
-        self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False)
-        self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False)
-        self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False)
-
-    @onlyCUDA
-    def test_ConvTranspose3d_size_1_kernel(self, device):
-        x_cpu = torch.randn(2, 3, 3, 5, 5)
-        conv_cpu = torch.nn.ConvTranspose3d(3, 3, kernel_size=1)
-        y_cpu = conv_cpu(x_cpu)
-        y = torch.rand_like(y_cpu)
-        y_cpu.backward(y)
-
-        with cudnn.flags(enabled=False):
-            conv_cuda = torch.nn.ConvTranspose3d(3, 3, kernel_size=1).to(device)
-            conv_cuda.bias.data.copy_(conv_cpu.bias.data)
-            conv_cuda.weight.data.copy_(conv_cpu.weight.data)
-            y_cuda = conv_cuda(x_cpu.to(device))
-            y_cuda.backward(y.to(device))
-
-        self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False)
-        self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False)
-        self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False)
-
     def _ordered_sequence(self, device, dtype):
         """Create ordered list of random sequences"""
         seqs = [torch.empty(random.randint(1, 6), device=device, dtype=dtype)
@@ -15930,55 +13910,6 @@ def test_multihead_attention_dtype_batch_first(self, device, dtype):
                 self.assertEqual(q.size(), out[0].size())
                 self.assertEqual(dtype, out[0].dtype)
 
-    @dtypesIfCUDA(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
-    @dtypes(torch.float)
-    @torch.backends.cudnn.flags(enabled=True, benchmark=False)
-    def test_Conv2d_naive_groups(self, device, dtype):
-        # Check that grouped convolutions matches two half convolutions
-        m = nn.Conv2d(4, 4, kernel_size=3, groups=2).to(device, dtype)
-        i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
-        output = m(i)
-        grad_output = torch.randn(2, 4, 4, 4, device=device, dtype=dtype)
-        output.backward(grad_output)
-
-        m1 = nn.Conv2d(2, 2, kernel_size=3).to(device, dtype)
-        m1.weight.data.copy_(m.weight.data[:2])
-        m1.bias.data.copy_(m.bias.data[:2])
-        i1 = i.data[:, :2].contiguous().requires_grad_(True)
-        output1 = m1(i1)
-        output1.backward(grad_output[:, :2].contiguous())
-
-        m2 = nn.Conv2d(2, 2, kernel_size=3).to(device, dtype)
-        m2.weight.data.copy_(m.weight.data[2:])
-        m2.bias.data.copy_(m.bias.data[2:])
-        i2 = i.data[:, 2:].contiguous().requires_grad_(True)
-        output2 = m2(i2)
-        output2.backward(grad_output[:, 2:].contiguous())
-
-        self.assertEqual(output, torch.cat([output1, output2], 1))
-        self.assertEqual(i.grad.data,
-                         torch.cat([i1.grad.data, i2.grad.data], 1),
-                         atol=dtype2prec_DONTUSE[dtype], rtol=0)
-        self.assertEqual(m.bias.grad.data,
-                         torch.cat([m1.bias.grad.data, m2.bias.grad.data], 0),
-                         atol=dtype2prec_DONTUSE[dtype], rtol=0)
-        self.assertEqual(m.weight.grad.data,
-                         torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
-                         atol=dtype2prec_DONTUSE[dtype], rtol=0)
-
-    @dtypes(torch.double, torch.cdouble)
-    def test_Conv2d_backward_depthwise(self, device, dtype):
-        x = torch.randn(2, 2, 4, 20, device=device, dtype=dtype, requires_grad=True)
-        weight = torch.randn(2, 1, 3, 5, device=device, dtype=dtype, requires_grad=True)
-
-        def conv2d_depthwise(x, weight):
-            return torch.nn.functional.conv2d(
-                x, weight, bias=None, stride=(1, 10), groups=2)
-
-        for cudnn_enabled in [False, True]:
-            with torch.backends.cudnn.flags(enabled=cudnn_enabled):
-                torch.autograd.gradcheck(conv2d_depthwise, (x, weight))
-
     def _test_batchnorm_grad(self, device, dtype=torch.double):
         bs, n_feat, size_feat = 4, 5, 6
         input = torch.arange(bs * n_feat * size_feat, device=device,
@@ -16498,370 +14429,6 @@ def test_softmax_bfloat16(self, device):
             # test softmax with large input value which casues exp() to overflow
             _test_bfloat16_ops(self, torch.nn.Softmax(dim=dim), device, inp_dims=(16, 33, 15, 16), prec=0.05, scale_factor=1000.0)
 
-    @onlyCPU
-    @dtypes(torch.float, torch.double)
-    def test_conv_thnn_nhwc(self, device, dtype):
-        def helper(n, c, h, w, out_channels, kernel_size, dilation, groups, input_format, weight_format):
-            input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\
-                .to(memory_format=input_format)
-            input.requires_grad_()
-            conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups)\
-                .to(device='cpu', dtype=dtype, memory_format=weight_format)
-            for p in conv.parameters():
-                p.data = torch.randint_like(p, -3, 3)
-
-            ref_input = input.detach().clone().contiguous().requires_grad_()
-            ref_conv = nn.Conv2d(c, out_channels, kernel_size, dilation=dilation, groups=groups)
-            # load_state_dict will restore the stride & memory_layout on ref_conv.weight.
-            ref_conv.load_state_dict(conv.state_dict())
-            ref_conv = ref_conv.to(device='cpu', dtype=dtype, memory_format=torch.contiguous_format)
-
-            out = conv(input)
-            ref_out = ref_conv(ref_input)
-
-            grad = torch.randint_like(out, -3, 3)
-            ref_grad = grad.detach().clone().contiguous()
-
-            out.backward(grad)
-            ref_out.backward(ref_grad)
-
-            self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
-            self.assertTrue(ref_out.is_contiguous())
-            self.assertEqual(out, ref_out, exact_dtype=False)
-            self.assertEqual(conv.weight.grad, ref_conv.weight.grad, exact_dtype=False)
-            self.assertEqual(conv.bias.grad, ref_conv.bias.grad, exact_dtype=False)
-            self.assertEqual(input.grad, ref_input.grad, exact_dtype=False)
-
-        with torch.backends.mkldnn.flags(enabled=False):
-            formats = [[torch.channels_last, torch.channels_last],
-                       [torch.channels_last, torch.contiguous_format],
-                       [torch.contiguous_format, torch.channels_last]]
-            for input_format, weight_format in formats:
-                # non-dilated conv: thnn_conv2d normal path (with im2col)
-                helper(2, 8, 4, 4, out_channels=4, kernel_size=3, dilation=1, groups=1,
-                       input_format=input_format, weight_format=weight_format)
-                helper(2, 8, 4, 4, out_channels=8, kernel_size=3, dilation=1, groups=8,
-                       input_format=input_format, weight_format=weight_format)
-                # test when input chanels is 1 and not converted to channels last
-                helper(2, 1, 10, 10, out_channels=8, kernel_size=3, dilation=1, groups=1,
-                       input_format=torch.contiguous_format, weight_format=torch.channels_last)
-                # non-dilated conv: thnn_conv2d fast path (skip im2col)
-                helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=1,
-                       input_format=input_format, weight_format=weight_format)
-                # ic == oc == 1 here, so need to stick input to CL to activate channels last
-                helper(1, 16, 56, 56, out_channels=16, kernel_size=1, dilation=1, groups=16,
-                       input_format=torch.channels_last, weight_format=weight_format)
-                # dilated conv: slow_conv_dilated2d
-                helper(2, 8, 11, 13, out_channels=16, kernel_size=3, dilation=2, groups=1,
-                       input_format=input_format, weight_format=weight_format)
-                helper(2, 16, 11, 13, out_channels=32, kernel_size=3, dilation=2, groups=16,
-                       input_format=input_format, weight_format=weight_format)
-
-    @onlyCUDA
-    @skipCUDAIfRocmVersionLessThan((4, 3))
-    @skipCUDAIfNotMiopenSuggestNHWC
-    @skipCUDAIfCudnnVersionLessThan(7603)
-    @dtypes(torch.half, torch.float, torch.cfloat)
-    def test_conv_cudnn_nhwc(self, device, dtype):
-        def helper(n, c, h, w, out_channels, kernel_size, groups):
-            input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\
-                .to(memory_format=torch.channels_last)
-            input.requires_grad_()
-            conv = nn.Conv2d(c, out_channels, kernel_size, groups=groups)\
-                .to(device='cuda', dtype=dtype, memory_format=torch.channels_last)
-            for p in conv.parameters():
-                p.data = torch.randint_like(p, -3, 3)
-
-            # use FP64 channels-first conv as reference
-            ref_input = input.detach().clone().contiguous().double().requires_grad_()
-            ref_conv = nn.Conv2d(c, out_channels, kernel_size, groups=groups)
-            # load_state_dict will restore the stride & memory_layout on ref_conv.weight.
-            ref_conv.load_state_dict(conv.state_dict())
-            ref_conv = ref_conv.to(device='cuda', dtype=torch.double, memory_format=torch.contiguous_format)
-
-            out = conv(input)
-            ref_out = ref_conv(ref_input)
-
-            grad = torch.randint_like(out, -3, 3)
-            ref_grad = grad.detach().clone().double().contiguous()
-
-            out.backward(grad)
-            ref_out.backward(ref_grad)
-
-            self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
-            self.assertTrue(input.grad.is_contiguous(memory_format=torch.channels_last))
-            self.assertTrue(conv.weight.grad.is_contiguous(memory_format=torch.channels_last))
-
-            self.assertTrue(ref_out.is_contiguous())
-            self.assertTrue(ref_input.grad.is_contiguous())
-            self.assertTrue(ref_conv.weight.grad.is_contiguous())
-
-            self.assertEqual(out, ref_out, exact_dtype=False)
-            self.assertEqual(conv.weight.grad, ref_conv.weight.grad, exact_dtype=False)
-            self.assertEqual(conv.bias.grad, ref_conv.bias.grad, exact_dtype=False)
-            self.assertEqual(input.grad, ref_input.grad, exact_dtype=False)
-
-        helper(2, 8, 4, 4, out_channels=4, kernel_size=3, groups=1)
-        helper(2, 8, 4, 4, out_channels=8, kernel_size=3, groups=8)
-        helper(1, 16, 56, 56, out_channels=16, kernel_size=3, groups=1)
-        helper(1, 16, 56, 56, out_channels=16, kernel_size=3, groups=16)
-
-    @onlyCUDA
-    @skipCUDAIfRocm
-    @skipCUDAIfCudnnVersionLessThan(8005)
-    @dtypes(torch.half, torch.float)
-    def test_conv_cudnn_ndhwc(self, device, dtype):
-        def helper(n, c, d, h, w, out_channels, kernel_size, groups):
-            input = torch.randint(-2, 2, (n, c, d, h, w), dtype=dtype, device=device)\
-                .to(memory_format=torch.channels_last_3d)
-            input.requires_grad_()
-            conv = nn.Conv3d(c, out_channels, kernel_size, groups=groups)\
-                .to(device='cuda', dtype=dtype, memory_format=torch.channels_last_3d)
-            for p in conv.parameters():
-                p.data = torch.randint_like(p, -2, 2)
-
-            # use FP64 channels-first conv as reference
-            ref_input = input.detach().clone().contiguous().double().requires_grad_()
-            ref_conv = nn.Conv3d(c, out_channels, kernel_size, groups=groups)
-            # load_state_dict will restore the stride & memory_layout on ref_conv.weight.
-            ref_conv.load_state_dict(conv.state_dict())
-            ref_conv = ref_conv.to(device='cuda', dtype=torch.double, memory_format=torch.contiguous_format)
-
-            out = conv(input)
-            ref_out = ref_conv(ref_input)
-
-            grad = torch.randint_like(out, -2, 2)
-            ref_grad = grad.detach().clone().double().contiguous()
-
-            out.backward(grad)
-            ref_out.backward(ref_grad)
-
-            self.assertTrue(out.is_contiguous(memory_format=torch.channels_last_3d))
-            self.assertTrue(input.grad.is_contiguous(memory_format=torch.channels_last_3d))
-            self.assertTrue(conv.weight.grad.is_contiguous(memory_format=torch.channels_last_3d))
-
-            self.assertTrue(ref_out.is_contiguous())
-            self.assertTrue(ref_input.grad.is_contiguous())
-            self.assertTrue(ref_conv.weight.grad.is_contiguous())
-
-            self.assertEqual(out, ref_out, exact_dtype=False)
-            self.assertEqual(conv.weight.grad, ref_conv.weight.grad, exact_dtype=False)
-            self.assertEqual(conv.bias.grad, ref_conv.bias.grad, exact_dtype=False)
-            self.assertEqual(input.grad, ref_input.grad, exact_dtype=False)
-
-        helper(2, 8, 4, 4, 4, out_channels=4, kernel_size=3, groups=1)
-        helper(2, 8, 4, 4, 4, out_channels=8, kernel_size=3, groups=8)
-        helper(1, 16, 18, 18, 18, out_channels=16, kernel_size=3, groups=1)
-        helper(1, 16, 18, 18, 18, out_channels=16, kernel_size=3, groups=16)
-
-    def _run_conv(self, layer, device, inp, grad, ref_conv, ref_input, ref_out,
-                  input_format, weight_format, grad_format, output_format):
-        conv = layer(inp.size(1), grad.size(1),
-                     ref_conv.weight.size(2)).float().to(device)
-        # load_state_dict will restore the stride & memory_layout on ref_conv.weight.
-        conv.load_state_dict(ref_conv.state_dict())
-        weight_data = conv.weight.detach().clone().contiguous(memory_format=weight_format)
-        conv.weight.data = weight_data.resize_(weight_data.size(), memory_format=weight_format)
-        input = inp.clone().contiguous(memory_format=input_format)
-        input.resize_(input.size(), memory_format=input_format)
-        input = input.requires_grad_()
-        grad = grad.contiguous(memory_format=grad_format)
-        grad.resize_(grad.size(), memory_format=grad_format)
-        out = conv(input)
-        out.backward(grad)
-        self.assertTrue(out.is_contiguous(memory_format=output_format))
-        self.assertEqual(out, ref_out)
-        self.assertEqual(conv.weight.grad, ref_conv.weight.grad)
-        self.assertEqual(conv.bias.grad, ref_conv.bias.grad)
-        self.assertEqual(input.grad, ref_input.grad)
-
-    def _test_conv_cudnn_nhwc_nchw(self, layer, n, c, h, w, k, filter_size, device):
-        data = torch.randint(1, 10, (n, c, h, w), dtype=torch.float32, device=device)
-        ref_input = data.clone().contiguous().requires_grad_(True)
-        ref_conv = layer(c, k, filter_size).float().to(device)
-        ref_out = ref_conv(ref_input)
-        grad = torch.randint(1, 10, ref_out.size(), dtype=torch.float32, device="cuda")
-        ref_out.backward(grad)
-
-        for w_f in [torch.contiguous_format, torch.channels_last]:
-            for g_f in [torch.contiguous_format, torch.channels_last]:
-                for input_format in [torch.contiguous_format, torch.channels_last]:
-                    output_format = torch.contiguous_format
-                    # Older versions of CudNN have Channels Last support disabled
-                    if torch.backends.cudnn.version() >= 7603:
-                        if input_format == torch.channels_last:
-                            output_format = torch.channels_last
-                        # This is because we have N111 weight that cannot handle
-                        # the ambiguous memory_format
-                        if w_f == torch.channels_last:
-                            if layer == nn.Conv2d and filter_size * c != 1:
-                                output_format = torch.channels_last
-                            if layer == nn.ConvTranspose2d and filter_size * k != 1:
-                                output_format = torch.channels_last
-                    self._run_conv(layer, device, data, grad, ref_conv, ref_input,
-                                   ref_out, input_format, w_f, g_f, output_format)
-
-    @onlyCUDA
-    @skipCUDAIfRocmVersionLessThan((4, 3))
-    @skipCUDAIfNotMiopenSuggestNHWC
-    @skipCUDAIfCudnnVersionLessThan(7603)
-    @tf32_on_and_off(0.05)
-    def test_conv_cudnn_mismatch_memory_format(self, device):
-        configs = [
-            [4, 2, 8, 8, 4, 2],
-            [4, 1, 8, 8, 4, 2],
-            [1, 1, 8, 8, 4, 2],
-            [4, 2, 2, 8, 4, 1],
-            [4, 2, 1, 8, 4, 1],
-            [4, 2, 8, 8, 4, 1],
-            [4, 1, 8, 8, 4, 1],
-        ]
-        for n, c, h, w, k, filter_size in configs:
-            self._test_conv_cudnn_nhwc_nchw(nn.Conv2d, n, c, h, w, k, filter_size, device)
-            self._test_conv_cudnn_nhwc_nchw(nn.ConvTranspose2d, n, c, h, w, k, filter_size, device)
-
-    # torch.half is erroring out on Windows with CUDA 10.1 + cuDNN 7.6.4
-    # returning CUDNN_STATUS_BAD_PARAM
-    # Disabling that specific test for now [see issue # 33918]
-    @onlyCUDA
-    @skipCUDAIfNoCudnn
-    @dtypes(torch.float, torch.double)
-    def test_conv_cudnn_nhwc_support(self, device, dtype):
-        input = torch.randn((1, 16, 1, 1), dtype=dtype, device="cuda", requires_grad=True)
-        weight = torch.randn((8, 16, 3, 3), dtype=dtype, device="cuda", requires_grad=True)
-        weight = weight.to(memory_format=torch.channels_last)
-        o = torch.conv2d(input, weight, None, (2, 1), (1, 1), (1, 1), 1)
-        self.assertTrue(o.is_contiguous(memory_format=torch.channels_last))
-        o.sum().backward()
-
-    # Test that faster algorithms used for inference produce the same results
-    # Validates depthwise3x3 bug reported in https://github.com/pytorch/pytorch/issues/60176
-    @onlyCPU
-    @dtypes(torch.float)
-    def test_conv2d_no_grad(self, device, dtype):
-        for batch in [1, 2, 3]:
-            for groups in [1, 2, 4]:
-                input = torch.rand(batch, groups, 8, 8, dtype=dtype, device=device)
-                m = nn.Conv2d(groups, 8, kernel_size=(3, 3), groups=groups, dtype=dtype, device=device)
-                with torch.no_grad():
-                    output_ng = m(input)
-                output = m(input)
-                self.assertEqual(output, output_ng, rtol=1e-2, atol=1e-5)
-
-    @onlyCUDA
-    @skipCUDAIfNoCudnn
-    @dtypes(torch.float, torch.float16)
-    @precisionOverride({torch.half: 0.002, torch.float: 1e-4})
-    def test_cudnn_convolution_relu(self, device, dtype):
-        for batch, groups, image_size, kernel_size, memory_format in \
-                product((1, 2, 3),
-                        (1, 2, 4),
-                        ((1, 1), (8, 8)),
-                        ((1, 1), (3, 3)),
-                        (torch.channels_last, torch.contiguous_format)):
-            if image_size[0] < kernel_size[0]:
-                continue
-            inp = torch.rand(batch, groups, *image_size, dtype=dtype, device=device)
-            w = torch.randn(8, groups, *kernel_size, dtype=dtype, device=device)
-            conv2d_out = torch.conv2d(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
-            inp = inp.to(memory_format=memory_format)
-            w = w.to(memory_format=memory_format)
-            if torch.version.hip:
-                cudnn_out = torch.miopen_convolution_relu(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
-            else:
-                cudnn_out = torch.cudnn_convolution_relu(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
-            self.assertTrue(cudnn_out.is_contiguous(memory_format=memory_format))
-            if tf32_is_not_fp32() and dtype == torch.float:
-                self.assertEqual(conv2d_out.relu(), cudnn_out, atol=2e-4, rtol=0.006)
-            else:
-                self.assertEqual(conv2d_out.relu(), cudnn_out)
-
-    @onlyCUDA
-    @skipCUDAIfNoCudnn
-    @dtypes(torch.float, torch.float16)
-    @precisionOverride({torch.half: 0.002, torch.float: 1e-4})
-    def test_cudnn_convolution_add_relu(self, device, dtype):
-        for batch, groups, image_size, kernel_size, memory_format in \
-            product((1, 2, 3),
-                    (1, 2, 4),
-                    ((1, 1), (8, 8)),
-                    ((1, 1), (3, 3)),
-                    (torch.channels_last, torch.contiguous_format)):
-            if image_size[0] < kernel_size[0]:
-                continue
-            inp = torch.rand(batch, groups, *image_size, dtype=dtype, device=device)
-            w = torch.randn(8, groups, *kernel_size, dtype=dtype, device=device)
-            conv2d_out = torch.conv2d(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
-            alpha = 2.0
-            z = torch.randn_like(conv2d_out)
-
-            inp = inp.to(memory_format=memory_format)
-            w = w.to(memory_format=memory_format)
-            z = z.to(memory_format=memory_format)
-            if torch.version.hip:
-                cudnn_out = torch.miopen_convolution_add_relu(inp, w, z, alpha, None, (1, 1), (0, 0), (1, 1), 1)
-            else:
-                cudnn_out = torch.cudnn_convolution_add_relu(inp, w, z, alpha, None, (1, 1), (0, 0), (1, 1), 1)
-
-            self.assertTrue(cudnn_out.is_contiguous(memory_format=memory_format))
-            if tf32_is_not_fp32() and dtype == torch.float:
-                self.assertEqual(F.relu(conv2d_out + alpha * z), cudnn_out, atol=3e-4, rtol=0.006)
-            else:
-                self.assertEqual(F.relu(conv2d_out + alpha * z), cudnn_out)
-
-    @onlyCUDA
-    @skipCUDAIfRocm
-    @skipCUDAIfCudnnVersionLessThan(7603)
-    def test_convert_conv2d_weight_memory_format(self, device):
-        input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float32, device=device)
-        model = nn.Sequential(
-            nn.Conv2d(8, 4, 3),
-            nn.BatchNorm2d(4)).to(device).float()
-        for memory_format in [torch.channels_last, torch.contiguous_format]:
-            model = nn.utils.convert_conv2d_weight_memory_format(model, memory_format)
-            out = model(input)
-            self.assertTrue(out.is_contiguous(memory_format=memory_format))
-
-        model = nn.Sequential(
-            nn.ConvTranspose2d(8, 4, 3),
-            nn.BatchNorm2d(4)).to(device).float()
-        for memory_format in [torch.channels_last, torch.contiguous_format]:
-            model = nn.utils.convert_conv2d_weight_memory_format(model, memory_format)
-            out = model(input)
-            self.assertTrue(out.is_contiguous(memory_format=memory_format))
-
-    def test_conv_double_backward_strided_with_3D_input_and_weight(self, device):
-        # Test that _convolution_double_backward() outputs the correct grad shapes
-        # for 3D input / weight when stride > 1. This is an ad-hoc regression test for a
-        # specific case that was uncovered during the convolution consolidation effort.
-        # The test can be safely deleted if _convolution_double_backward() is removed.
-
-        input = torch.randn(2, 3, 6, device=device)
-        weight = torch.randn(3, 3, 3, device=device)
-        bias = torch.randn(3, device=device)
-        stride = (2,)
-        padding = (1,)
-        dilation = (1,)
-        transposed = False
-        output_padding = (0,)
-        groups = 1
-        output = torch.ops.aten.convolution(input, weight, bias, stride, padding, dilation, transposed,
-                                            output_padding, groups)
-
-        ggI = torch.randn(input.shape, device=device)
-        ggW = torch.randn(weight.shape, device=device)
-        ggB = torch.randn(bias.shape, device=device)
-        gO = torch.randn(output.shape, device=device)
-        output_mask = [True, True, True]
-        grad_grad_output, grad_input, grad_weight = torch.ops.aten._convolution_double_backward(
-            ggI, ggW, ggB, gO, weight, input, stride, padding, dilation, transposed,
-            output_padding, groups, output_mask)
-
-        # Make sure the correct shapes are computed.
-        self.assertEqual(grad_grad_output.shape, gO.shape)
-        self.assertEqual(grad_input.shape, input.shape)
-        self.assertEqual(grad_weight.shape, weight.shape)
-
     def test_nll_loss_mismatched_batch(self, device):
         x = torch.randn((10, 3), requires_grad=True, device=device)
         # t should have size (10,)

From 68b7b6b20180c7b67461d597a44db1cb31a20ee6 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sun, 30 Oct 2022 13:24:50 -0400
Subject: [PATCH 0330/1922] Add fake tensor files to ciflow/inductor (#88052)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88052
Approved by: https://github.com/anijain2305
---
 .github/labeler.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 9581c9fe706cd..5af603fb36c92 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -12,3 +12,6 @@
 - torch/_dynamo/**
 - torch/_inductor/**
 - benchmarks/dynamo/**
+- torch/_subclasses/fake_tensor.py
+- torch/_subclasses/fake_utils.py
+- torch/_subclasses/meta_utils.py

From 2abca51dbf2449f6c496af3b639d8903da78952c Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Sat, 29 Oct 2022 00:59:57 +0000
Subject: [PATCH 0331/1922] Fix all references to torchdynamo from the merge
 (#87731)

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @jansel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87731
Approved by: https://github.com/yanboliang, https://github.com/ezyang, https://github.com/anijain2305, https://github.com/jansel
---
 torch/_dynamo/__init__.py            | 10 +++++-----
 torch/_dynamo/config.py              |  4 ++--
 torch/_dynamo/convert_frame.py       |  2 +-
 torch/_dynamo/eval_frame.py          | 12 ++++++------
 torch/_dynamo/exc.py                 |  4 ++--
 torch/_dynamo/symbolic_convert.py    |  2 +-
 torch/_dynamo/variables/functions.py |  2 +-
 torch/nn/parallel/distributed.py     |  2 +-
 8 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py
index 6b49ce5104ca4..80f927aeef2fa 100644
--- a/torch/_dynamo/__init__.py
+++ b/torch/_dynamo/__init__.py
@@ -46,7 +46,7 @@ def reset():
 def list_backends():
     """
     Return valid strings that can be passed to:
-        @torchdynamo.optimize(<backend>)
+        @torch._dynamo.optimize(<backend>)
         def foo(...):
            ....
     """
@@ -60,9 +60,9 @@ def allow_in_graph(fn):
     Customize which functions TorchDynamo will include in the generated
     graph.  Similar to torch.fx.wrap().
 
-        torchdynamo.allow_in_graph(my_custom_function)
+        torch._dynamo.allow_in_graph(my_custom_function)
 
-        @torchdynamo.optimize(...)
+        @torch._dynamo.optimize(...)
         def fn(a):
             x = torch.add(x, 1)
             x = my_custom_function(x)
@@ -86,9 +86,9 @@ def disallow_in_graph(fn):
     Customize which functions TorchDynamo will exclude in the generated
     graph and force a graph break on.
 
-        torchdynamo.disallow_in_graph(torch.sub)
+        torch._dynamo.disallow_in_graph(torch.sub)
 
-        @torchdynamo.optimize(...)
+        @torch._dynamo.optimize(...)
         def fn(a):
             x = torch.add(x, 1)
             x = torch.sub(x, 1)
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 2601be8983f2a..12088383e741c 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -77,7 +77,7 @@
 # __torch_function__ logic of the subclass.
 traceable_tensor_subclasses = set()
 
-# Suppress errors in torchdynamo.optimize, instead forcing a fallback to eager.
+# Suppress errors in torch._dynamo.optimize, instead forcing a fallback to eager.
 # This is a good way to get your model to work one way or another, but you may
 # lose optimization opportunities this way.  Devs, if your benchmark model is failing
 # this way, you should figure out why instead of suppressing it.
@@ -147,7 +147,7 @@
 # If True, raise when aot autograd is unsafe to use
 raise_on_unsafe_aot_autograd = False
 
-# How to import torchdynamo, either torchdynamo or torch.dynamo
+# How to import torchdynamo, either torchdynamo or torch._dynamo
 dynamo_import = __name__.replace(".config", "")
 
 # How to import torchinductor, either torchinductor or torch.inductor
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index ea92e024b3dff..ce478456301c9 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -130,7 +130,7 @@ def _fn(*args, **kwargs):
 @TorchPatcher.suppress_torch_distributed_warnings
 def has_tensor_in_frame(frame):
     """Check if the frame has torch.* related bits"""
-    # Check if the function was decorated using torchdynamo.optimize
+    # Check if the function was decorated using torch._dynamo.optimize
     if frame.f_code in always_optimize_code_objects:
         return True
 
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index de3d74cd89c28..29bb14b629d7e 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -100,9 +100,9 @@ def __init__(
     def __enter__(self):
         if config.raise_on_ctx_manager_usage:
             raise RuntimeError(
-                "torchdynamo.optimize(...) is used with a context manager. "
+                "torch._dynamo.optimize(...) is used with a context manager. "
                 "Please refer to https://github.com/pytorch/torchdynamo#usage-example "
-                "to use torchdynamo.optimize(...) as an annotation/decorator. "
+                "to use torch._dynamo.optimize(...) as an annotation/decorator. "
             )
         self.on_enter()
         self.prior = set_eval_frame(self.callback)
@@ -178,7 +178,7 @@ def _fn(*args, **kwargs):
         # of decorators.
         _fn._torchdynamo_orig_callable = fn
 
-        # If the function is called using torchdynamo.optimize decorator, we
+        # If the function is called using torch._dynamo.optimize decorator, we
         # should prevent any type of skipping.
         if callback not in (None, False):
             always_optimize_code_objects[fn.__code__] = True
@@ -338,14 +338,14 @@ def optimize(
             One can also provide additional context for the backend, like
             torch.jit.fuser("fuser2"), by setting the backend_ctx_ctor attribute.
             See AOTAutogradMemoryEfficientFusionWithContext for the usage.
-            - Or, a string backend name in `torchdynamo.list_backends()`
+            - Or, a string backend name in `torch._dynamo.list_backends()`
         nopython: If True, graph breaks will be errors and there will
             be a single whole-program graph.
         disable: If True, turn this decorator into a no-op
 
     Example Usage:
 
-        @torchdynamo.optimize()
+        @torch._dynamo.optimize()
         def toy_example(a, b):
             ...
     """
@@ -588,7 +588,7 @@ def assume_constant_result(fn):
 
 def optimize_assert(backend, *, guard_export_fn=None, export=False):
     """
-    The same as `torchdynamo.optimize(backend, nopython=True)`
+    The same as `torch._dynamo.optimize(backend, nopython=True)`
     """
     backend = get_compiler_fn(backend)
 
diff --git a/torch/_dynamo/exc.py b/torch/_dynamo/exc.py
index b55e5e122c4e8..41a9f68351aa9 100644
--- a/torch/_dynamo/exc.py
+++ b/torch/_dynamo/exc.py
@@ -29,8 +29,8 @@ def __init__(self):
         super(ResetRequired, self).__init__(
             textwrap.dedent(
                 """
-                Must call `torchdynamo.reset()` before changing backends.  Detected two calls to
-                `torchdynamo.optimize(...)` with a different backend compiler arguments.
+                Must call `torch._dynamo.reset()` before changing backends.  Detected two calls to
+                `torch._dynamo.optimize(...)` with a different backend compiler arguments.
                 """
             )
         )
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 8f4eed446f3c9..ad431cf9d54fc 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -178,7 +178,7 @@ def wrapper(self: "InstructionTranslatorBase", inst: Instruction):
                 user_stack = [self.frame_summary()] + list(reversed(exc.real_stack))
                 user_stack_formatted = "".join(traceback.format_list(user_stack))
                 frame_loc = (user_stack[-1].filename, user_stack[-1].lineno)
-                # torchdynamo.explain() formats this a little nicer, and presents a slightly
+                # torch._dynamo.explain() formats this a little nicer, and presents a slightly
                 # more actionable user code pointer
                 if (
                     config.print_graph_breaks
diff --git a/torch/_dynamo/variables/functions.py b/torch/_dynamo/variables/functions.py
index 8f1e29bc7e55a..d0f545ed3abf1 100644
--- a/torch/_dynamo/variables/functions.py
+++ b/torch/_dynamo/variables/functions.py
@@ -84,7 +84,7 @@ def __init__(self, fn, is_constant=False, **kwargs):
         assert isinstance(
             fn, types.FunctionType
         ), f"expected FunctionType found {typestr(fn)} {fn}"
-        # unpack @torchdynamo.optimize()(fn) wrapped function
+        # unpack @torch._dynamo.optimize()(fn) wrapped function
         fn = inspect.getattr_static(fn, "_torchdynamo_inline", fn)
         # unpack torch.jit.script_if_tracing
         if inspect.getattr_static(fn, "__script_if_tracing_wrapper", False):
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index 23625d9d20014..bd6b769402076 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -1018,7 +1018,7 @@ def _get_active_ddp_module(cls):
 
     # note, this ctxmgr function is marked 'skip' in torchdynamo, so dynamo only kicks in
     # for the 'module_to_run' underneath
-    # see torchdynamo/eval_frame.py TorchPatcher.patch for more details
+    # see torch._dynamo/eval_frame.py TorchPatcher.patch for more details
     @contextmanager
     def _inside_ddp_forward(self):
         DistributedDataParallel._active_ddp_module = self

From 2c2421f43e0f62e42390c71301ee22261d069e09 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 29 Oct 2022 08:45:32 -0700
Subject: [PATCH 0332/1922] Get the magic method try reverse protocol correct
 (#88030)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88030
Approved by: https://github.com/anjali411, https://github.com/albanD
---
 test/dynamo/test_dynamic_shapes.py       |  5 -----
 test/functorch/test_aotdispatch.py       |  1 -
 test/test_proxy_tensor.py                |  5 +++++
 torch/fx/experimental/symbolic_shapes.py | 14 ++++++++++----
 4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/test/dynamo/test_dynamic_shapes.py b/test/dynamo/test_dynamic_shapes.py
index a32825d03aeaa..d82cc6925fe9d 100644
--- a/test/dynamo/test_dynamic_shapes.py
+++ b/test/dynamo/test_dynamic_shapes.py
@@ -92,11 +92,6 @@ def make_dynamic_cls(cls):
     # Unable to cast Python instance to C++ type
 )
 
-unittest.expectedFailure(
-    DynamicShapesReproTests.test_boxes_len_dynamic_shapes
-    # Unable to cast Python instance to C++ type
-)
-
 unittest.expectedFailure(
     DynamicShapesReproTests.test_guard_fail_tensor_bool_dynamic_shapes
     # RuntimeError: aten.allclose.default - couldn't find symbolic meta function/decomposition
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 5f6e8d9bf238f..713dd3c8ae6bc 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1152,7 +1152,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.max_unpool2d', 'grad'),  # aten.max_unpool2d.default - couldn't find symbolic meta ...
     xfail('nn.functional.max_unpool3d', ''),  # aten.max_unpool3d.default - couldn't find symbolic meta funct...
     xfail('nn.functional.max_unpool3d', 'grad'),  # aten.max_unpool3d.default - couldn't find symbolic meta ...
-    xfail('nn.functional.mse_loss', ''),  # Unable to cast Python instance to C++ type (#define PYBIND11_DETA...
     xfail('nn.functional.multi_margin_loss', ''),  # could not find kernel
     xfail('nn.functional.multilabel_margin_loss', ''),  # could not find kernel
     xfail('nn.functional.nll_loss', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 4f4265b8dc6a2..a5de034d1a06f 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -975,6 +975,11 @@ def f(x):
             return x.shape
         self._test_dynamic(f, [(5, 3)], [[(4, 6)]])
 
+    def test_rmethod(self):
+        def f(x):
+            return x.size(0) + x
+        self._test_dynamic(f, [(5,)], [[(4,)], [(12,)]])
+
     def test_mega_guard(self):
         def f(a, b):
             assert a.shape[0] == b.shape[0] * 2
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 2eb169a0d1882..88322b7d1d208 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -139,9 +139,9 @@ def to_node(self, num):
         elif isinstance(num, float):
             return self.wrap_float(num)
         else:
-            # NotImplementedError is important so that Python tries the
+            # NotImplemented is important so that Python tries the
             # other magic method
-            raise NotImplementedError(type(num))
+            return NotImplemented
 
     def is_int(self):
         return self.pytype is int
@@ -354,10 +354,16 @@ def unary_magic_impl(self):
         return wrap_node(getattr(self.node, method)())
 
     def binary_magic_impl(self, other):
-        return wrap_node(getattr(self.node, method)(self.node.to_node(other)))
+        other_node = self.node.to_node(other)
+        if other_node is NotImplemented:
+            return NotImplemented
+        return wrap_node(getattr(self.node, method)(other_node))
 
     def rbinary_magic_impl(self, other):
-        return wrap_node(getattr(self.node.to_node(other), method)(self.node))
+        other_node = self.node.to_node(other)
+        if other_node is NotImplemented:
+            return NotImplemented
+        return wrap_node(getattr(other_node, method)(self.node))
 
     if method in unary_magic_methods:
         setattr(user_type, f"__{method}__", unary_magic_impl)

From 3afa384f7799406f99e86fb7530c5dcfb68bd487 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 29 Oct 2022 17:25:42 -0700
Subject: [PATCH 0333/1922] Simplify magic method definition code. (#88017)

It turns out sym_float (and the hypothetical sym_int) can
be defined in the same way as conventional magic methods.
Do so.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88017
Approved by: https://github.com/albanD
---
 torch/__init__.py                        | 12 +++----
 torch/fx/experimental/symbolic_shapes.py | 43 ++++++++++++------------
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/torch/__init__.py b/torch/__init__.py
index c2f2c4c3327f2..422f143db507d 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -210,16 +210,16 @@ def __init__(self, node):
         # class has a field named node that stores SymNode
         self.node = node
 
-    # Magic methods installed later
-
     def __bool__(self):
         return self.node.bool_()
 
     def __int__(self):
         return self.node.int_()
 
+    # Magic methods installed by torch.fx.experimental.symbolic_shapes
+
     def __sym_float__(self):
-        return SymFloat(self.node.sym_float())
+        ...
 
     def __repr__(self):
         return self.node.str()
@@ -242,13 +242,13 @@ def __init__(self, node):
         # class has a field named node that stores SymNode
         self.node = node
 
-    # Magic methods installed later
-
     def __bool__(self):
         return self.node.bool_()
 
+    # Magic methods installed by torch.fx.experimental.symbolic_shapes
+
     def __sym_int__(self):
-        return SymInt(self.node.sym_int())
+        ...
 
     def __repr__(self):
         return self.node.str()
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 88322b7d1d208..5a4d5bff84b2f 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -1,6 +1,7 @@
 import torch
 import torch.utils._pytree as pytree
 from typing import Set, Dict, List, Type, Optional, cast
+import sys
 import operator
 import builtins
 import math
@@ -169,6 +170,13 @@ def __str__(self):
     def __repr__(self):
         return self.str()
 
+    # These methods are metaprogrammed in below
+    def sym_int(self) -> "SymNode":
+        ...
+
+    def sym_float(self) -> "SymNode":
+        ...
+
     # Today we error on calling int on a symbolic shape, as this is a very accessible footgun.
     def int_(self):
         raise RuntimeError("Trying to extract a concrete int out of a symbolic int")
@@ -184,27 +192,6 @@ def guard_float(self, file, line):
         # guard occurred
         return float(self.shape_env.evaluate_expr(self.expr))
 
-    def sym_float(self):
-        if SYM_FUNCTION_MODE:
-            r = _handle_sym_dispatch(sym_float, (wrap_node(self),), {})
-            assert isinstance(r, (SymInt, SymFloat)), type(r)
-            return r.node
-        # TODO: consider constant prop here
-        # TODO: wrapping the expr with sympy.Float doesn't seem to work, why
-        # not?
-        return SymNode(self.expr, self.shape_env, float)
-
-    def sym_int(self):
-        raise NotImplementedError("sym_int NYI")
-        """
-        if SYM_FUNCTION_MODE:
-            return _handle_sym_dispatch(sym_int, (self,), {})
-        # TODO: consider constant prop here
-        # XXX: need to cast float to int in sympy; math.floor is wrong
-        # because negatives round to zero
-        return SymNode(self.expr, self.shape_env, int)
-        """
-
     def bool_(self):
         return bool(self.shape_env.evaluate_expr(self.shape_env.replace(self.expr)))
 
@@ -258,6 +245,9 @@ def eval(cls, base, divisor):
     'floordiv': lambda a, b: FloorDiv(a, b),
 }
 
+def _nyi():
+    raise NotImplementedError()
+
 magic_methods = {
     **reflectable_magic_methods,
     'eq': lambda a, b: sympy.Eq(a, b),
@@ -265,6 +255,8 @@ def eval(cls, base, divisor):
     'lt': lambda a, b: sympy.Lt(a, b),
     'le': lambda a, b: sympy.Le(a, b),
     'ge': lambda a, b: sympy.Ge(a, b),
+    'sym_float': lambda a: a,  # TODO: why can't I wrap with sympy.Float?
+    'sym_int': lambda a: _nyi(),
     'ceil': lambda a: sympy.ceiling(a),
     'neg': lambda a: -a,
     'min': lambda a, b: sympy.Min(a, b),
@@ -272,8 +264,9 @@ def eval(cls, base, divisor):
 }
 
 unary_magic_methods = {
+    'sym_float',
     'ceil',
-    'neg'
+    'neg',
 }
 
 float_magic_methods = {"add", "sub", "mul", "truediv", "ceil", "floor", "eq", "gt", "lt", "le", "ge", "pow"}
@@ -309,6 +302,7 @@ def binary_magic_impl(self, other):
         other_expr = self.shape_env.replace(other_expr)
         out = func(expr, other_expr)
         out = sympy.expand(out)
+        pytype: Type
         if method in ["truediv"]:
             pytype = float
         else:
@@ -322,6 +316,8 @@ def unary_magic_impl(self):
         if SYM_FUNCTION_MODE:
             if method in ["ceil", "floor"]:
                 op = getattr(math, method)
+            elif method in ["sym_float", "sym_int"]:
+                op = getattr(sys.modules[__name__], method)
             else:
                 op = getattr(operator, method)
             r = _handle_sym_dispatch(op, (wrap_node(self),), {})
@@ -331,8 +327,11 @@ def unary_magic_impl(self):
         expr = self.shape_env.replace(self.expr)
         out = func(expr)
         out = sympy.expand(out)
+        pytype: Type
         if method in ["ceil", "floor"]:
             pytype = int
+        elif method in ["sym_float"]:
+            pytype = float
         else:
             pytype = self.pytype
 

From af1b49363e1d1d76619764806275a4e1c6ed5a60 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 29 Oct 2022 21:43:09 -0400
Subject: [PATCH 0334/1922] Make IValue::unsafeToTensorImpl a little less
 unsafe. (#88043)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88043
Approved by: https://github.com/anjali411, https://github.com/albanD
---
 aten/src/ATen/core/ivalue.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h
index e9a5ea9ec6a20..3461fe2300e45 100644
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@@ -425,6 +425,7 @@ struct TORCH_API IValue final {
   at::Tensor& toTensor() &;
   const at::Tensor& toTensor() const&;
   at::TensorImpl* unsafeToTensorImpl() const {
+    TORCH_INTERNAL_ASSERT(isTensor());
     return payload.as_tensor.unsafeGetTensorImpl();
   }
 

From fa1cfc319444d52c70993c0cde481c31ccb8675c Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 31 Oct 2022 09:20:49 -0400
Subject: [PATCH 0335/1922] .gitignore log files (#88085)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88085
Approved by: https://github.com/albanD
---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index d4f07f2cf10fb..5ca188577e16b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -338,3 +338,6 @@ third_party/glog/
 
 # Virtualenv
 venv/
+
+# Log files
+*.log

From c626bca9601f4e8610d40e1c427ff95f0228ce3b Mon Sep 17 00:00:00 2001
From: Salil Desai <salilsdesai@fb.com>
Date: Sun, 30 Oct 2022 20:30:55 -0700
Subject: [PATCH 0336/1922] [Vulkan] Add support for Optimization Blocklist to
 Vulkan Rewrite (#87431)

Optimization Blocklist will be used in a future diff (D40315730) to make the rewrite to transfer input/output backends optional

Differential Revision: [D40315729](https://our.internmc.facebook.com/intern/diff/D40315729/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87431
Approved by: https://github.com/mcr229, https://github.com/digantdesai
---
 binaries/optimize_for_mobile.cc               | 15 ++++++++-------
 torch/_C/__init__.pyi.in                      |  3 ++-
 torch/csrc/jit/passes/mobile_optimizer_type.h | 12 ++++++++++++
 torch/csrc/jit/passes/vulkan_rewrite.cpp      |  1 +
 torch/csrc/jit/passes/vulkan_rewrite.h        |  2 ++
 torch/csrc/jit/passes/xnnpack_rewrite.cpp     |  1 +
 torch/csrc/jit/passes/xnnpack_rewrite.h       | 10 +---------
 torch/csrc/jit/python/init.cpp                |  5 ++++-
 torch/utils/mobile_optimizer.py               |  5 ++++-
 9 files changed, 35 insertions(+), 19 deletions(-)
 create mode 100644 torch/csrc/jit/passes/mobile_optimizer_type.h

diff --git a/binaries/optimize_for_mobile.cc b/binaries/optimize_for_mobile.cc
index 991bca7e55871..005b19ce888a4 100644
--- a/binaries/optimize_for_mobile.cc
+++ b/binaries/optimize_for_mobile.cc
@@ -16,13 +16,13 @@
 
 #include <string>
 #include <sstream>
-#include "torch/script.h"
-#include "torch/csrc/jit/api/module.h"
+#include <torch/script.h>
+#include <torch/csrc/jit/api/module.h>
 #include <torch/csrc/jit/passes/metal_rewrite.h>
-#include "torch/csrc/jit/passes/vulkan_rewrite.h"
-#include "torch/csrc/jit/passes/xnnpack_rewrite.h"
-#include "torch/csrc/jit/serialization/import.h"
-#include "torch/csrc/jit/serialization/export.h"
+#include <torch/csrc/jit/passes/vulkan_rewrite.h>
+#include <torch/csrc/jit/passes/xnnpack_rewrite.h>
+#include <torch/csrc/jit/serialization/import.h>
+#include <torch/csrc/jit/serialization/export.h>
 
 C10_DEFINE_string(model, "", "The torch script model to optimize.");
 C10_DEFINE_string(
@@ -86,7 +86,8 @@ int main(int argc, char** argv) {
   if (FLAGS_backend == "" || FLAGS_backend == "cpu") {
     optimized_module = torch::jit::optimizeForMobile(module);
   } else if (FLAGS_backend == "vulkan") {
-    optimized_module = torch::jit::vulkanOptimizeForMobile(module, preserved_methods);
+    optimized_module = torch::jit::vulkanOptimizeForMobile(
+        module, std::set<MobileOptimizerType>(), preserved_methods);
   } else if (FLAGS_backend == "metal"){
     optimized_module = torch::jit::metalOptimizeForMobile(module, preserved_methods);
   }else{
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index af6734b059f44..8b936be23122e 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -169,7 +169,7 @@ class Future(object):
 
 def _jit_set_num_profiled_runs(num: _size) -> _size: ...
 
-# Defined in torch/csrc/jit/passes/xnnpack_rewrite.h
+# Defined in torch/csrc/jit/passes/mobile_optimizer_type.h
 class MobileOptimizerType:
     ...
 
@@ -215,6 +215,7 @@ def _clone_module_with_class(module: 'torch.jit.ScriptModule',
                              ignored_methods: List[AnyStr],
                              ignored_attributes: List[AnyStr]) -> 'torch.jit.ScriptModule': ...
 def _jit_pass_vulkan_optimize_for_mobile(module: 'torch.jit.ScriptModule',
+                                         optimization_blocklist: Set[MobileOptimizerType],
                                          preserved_methods: List[AnyStr]) -> 'torch.jit.ScriptModule': ...
 def _jit_pass_metal_optimize_for_mobile(module: 'torch.jit.ScriptModule',
                                          preserved_methods: List[AnyStr]) -> 'torch.jit.ScriptModule': ...
diff --git a/torch/csrc/jit/passes/mobile_optimizer_type.h b/torch/csrc/jit/passes/mobile_optimizer_type.h
new file mode 100644
index 0000000000000..fe3fffe16c22d
--- /dev/null
+++ b/torch/csrc/jit/passes/mobile_optimizer_type.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <cstdint>
+
+enum class MobileOptimizerType : int8_t {
+  CONV_BN_FUSION,
+  INSERT_FOLD_PREPACK_OPS,
+  REMOVE_DROPOUT,
+  FUSE_ADD_RELU,
+  HOIST_CONV_PACKED_PARAMS,
+  CONV_1D_TO_2D,
+};
diff --git a/torch/csrc/jit/passes/vulkan_rewrite.cpp b/torch/csrc/jit/passes/vulkan_rewrite.cpp
index 9a0f45ff84020..7618aa50d6868 100644
--- a/torch/csrc/jit/passes/vulkan_rewrite.cpp
+++ b/torch/csrc/jit/passes/vulkan_rewrite.cpp
@@ -269,6 +269,7 @@ void vulkanRunCanonicalOptimizations(script::Module& module) {
 
 script::Module vulkanOptimizeForMobile(
     const script::Module& m,
+    const std::set<MobileOptimizerType>& optimization_blocklist,
     const std::vector<std::string>& preserved_methods) {
   auto cloned_module = m.clone();
   cloned_module.eval();
diff --git a/torch/csrc/jit/passes/vulkan_rewrite.h b/torch/csrc/jit/passes/vulkan_rewrite.h
index 8e67dce70f542..395d885e8e2c3 100644
--- a/torch/csrc/jit/passes/vulkan_rewrite.h
+++ b/torch/csrc/jit/passes/vulkan_rewrite.h
@@ -2,6 +2,7 @@
 
 #include <torch/csrc/jit/api/module.h>
 #include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/passes/mobile_optimizer_type.h>
 
 namespace torch {
 namespace jit {
@@ -11,6 +12,7 @@ TORCH_API void vulkanFusePrePackedConvWithClamp(script::Module& module);
 TORCH_API void vulkanFoldPrePackingOps(script::Module& module);
 TORCH_API script::Module vulkanOptimizeForMobile(
     const script::Module& module,
+    const std::set<MobileOptimizerType>& optimization_blocklist,
     const std::vector<std::string>& preserved_methods);
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/xnnpack_rewrite.cpp b/torch/csrc/jit/passes/xnnpack_rewrite.cpp
index 2476d1be4df61..0e2163f7a19f8 100644
--- a/torch/csrc/jit/passes/xnnpack_rewrite.cpp
+++ b/torch/csrc/jit/passes/xnnpack_rewrite.cpp
@@ -12,6 +12,7 @@
 #include <torch/csrc/jit/passes/graph_rewrite_helper.h>
 #include <torch/csrc/jit/passes/hoist_conv_packed_params.h>
 #include <torch/csrc/jit/passes/inliner.h>
+#include <torch/csrc/jit/passes/mobile_optimizer_type.h>
 #include <torch/csrc/jit/passes/prepack_folding.h>
 #include <torch/csrc/jit/passes/remove_dropout.h>
 #include <torch/csrc/jit/passes/subgraph_rewrite.h>
diff --git a/torch/csrc/jit/passes/xnnpack_rewrite.h b/torch/csrc/jit/passes/xnnpack_rewrite.h
index 498dcd006fe3c..d1a64c52c9230 100644
--- a/torch/csrc/jit/passes/xnnpack_rewrite.h
+++ b/torch/csrc/jit/passes/xnnpack_rewrite.h
@@ -2,19 +2,11 @@
 
 #include <torch/csrc/jit/api/module.h>
 #include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/passes/mobile_optimizer_type.h>
 
 namespace torch {
 namespace jit {
 
-enum class MobileOptimizerType : int8_t {
-  CONV_BN_FUSION,
-  INSERT_FOLD_PREPACK_OPS,
-  REMOVE_DROPOUT,
-  FUSE_ADD_RELU,
-  HOIST_CONV_PACKED_PARAMS,
-  CONV_1D_TO_2D,
-};
-
 TORCH_API void transformConv1dToConv2d(std::shared_ptr<Graph>& graph);
 TORCH_API void transformConv1dToConv2d(script::Module& module);
 TORCH_API void insertPrePackedOps(std::shared_ptr<Graph>& graph);
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 91eecfa4596e8..d576c1ff3d742 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -52,6 +52,7 @@
 #include <torch/csrc/jit/passes/lower_graph.h>
 #include <torch/csrc/jit/passes/lower_tuples.h>
 #include <torch/csrc/jit/passes/metal_rewrite.h>
+#include <torch/csrc/jit/passes/mobile_optimizer_type.h>
 #include <torch/csrc/jit/passes/normalize_ops.h>
 #include <torch/csrc/jit/passes/peephole.h>
 #include <torch/csrc/jit/passes/peephole_list_idioms.h>
@@ -1081,8 +1082,10 @@ void initJITBindings(PyObject* module) {
       .def(
           "_jit_pass_vulkan_optimize_for_mobile",
           [](script::Module& module,
+             std::set<MobileOptimizerType>& optimization_blocklist,
              std::vector<std::string>& preserved_methods) {
-            return vulkanOptimizeForMobile(module, preserved_methods);
+            return vulkanOptimizeForMobile(
+                module, optimization_blocklist, preserved_methods);
           })
       .def(
           "_jit_pass_metal_insert_prepacked_ops",
diff --git a/torch/utils/mobile_optimizer.py b/torch/utils/mobile_optimizer.py
index bda5defab29e8..2b59ac41809f2 100644
--- a/torch/utils/mobile_optimizer.py
+++ b/torch/utils/mobile_optimizer.py
@@ -64,7 +64,10 @@ def optimize_for_mobile(
             optimization_blocklist,
             preserved_methods_str)
     elif backend == 'vulkan':
-        optimized_cpp_module = torch._C._jit_pass_vulkan_optimize_for_mobile(script_module._c, preserved_methods_str)
+        optimized_cpp_module = torch._C._jit_pass_vulkan_optimize_for_mobile(
+            script_module._c,
+            optimization_blocklist,
+            preserved_methods_str)
     elif backend == 'metal':
         optimized_cpp_module = torch._C._jit_pass_metal_optimize_for_mobile(script_module._c, preserved_methods_str)
     else:

From be772ef1bfe5044d0c1c71e932f187239ae2ae3d Mon Sep 17 00:00:00 2001
From: Salil Desai <salilsdesai@fb.com>
Date: Sun, 30 Oct 2022 20:30:57 -0700
Subject: [PATCH 0337/1922] [Vulkan] Add Vulkan Rewrite to Transfer Inputs and
 Outputs to Vulkan and CPU Backends Respectively (#87432)

With this change, we don't have to manually invoke transferring input and output backends when we run vulkan models.

Graph rewrite code based off of:
- https://github.com/pytorch/pytorch/commit/32efff45ba77f2bb4b1e709613b99070f119745a#diff-a473bddb458dc24225866a45092d6eca064eddd256245d93020e48e216eee4d5R160-R179

Differential Revision: [D39519168](https://our.internmc.facebook.com/intern/diff/D39519168/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D39519168/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87432
Approved by: https://github.com/mcr229, https://github.com/digantdesai
---
 .../src/main/cpp/pytorch_jni_jit.cpp          | 12 +++--
 .../src/main/cpp/pytorch_jni_lite.cpp         | 12 +++--
 binaries/speed_benchmark_torch.cc             |  4 ++
 docs/source/mobile_optimizer.rst              |  5 +-
 test/test_public_bindings.py                  |  2 +-
 torch/_C/__init__.pyi.in                      |  1 +
 torch/csrc/jit/passes/mobile_optimizer_type.h |  1 +
 torch/csrc/jit/passes/vulkan_rewrite.cpp      | 53 +++++++++++++++++++
 torch/csrc/jit/python/init.cpp                |  3 ++
 9 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp
index 1b0d54784d76f..6ef4f462df169 100644
--- a/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp
+++ b/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp
@@ -195,14 +195,16 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
     std::vector<at::IValue> inputs{};
     size_t n = jinputs->size();
     inputs.reserve(n);
+    const bool requires_backend_transfers =
+        module_.attr("requires_backend_transfers", at::IValue(true)).toBool();
     for (size_t i = 0; i < n; i++) {
       at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
-      if (at::kVulkan == deviceType_) {
+      if (at::kVulkan == deviceType_ && requires_backend_transfers) {
         inputs.push_back(
             atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()}
                                 : std::move(atIValue));
       } else {
-        TORCH_CHECK(at::kCPU == deviceType_);
+        TORCH_CHECK(at::kCPU == deviceType_ || !requires_backend_transfers);
         inputs.push_back(std::move(atIValue));
       }
     }
@@ -223,14 +225,16 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
     std::vector<at::IValue> inputs{};
     size_t n = jinputs->size();
     inputs.reserve(n);
+    const bool requires_backend_transfers =
+        module_.attr("requires_backend_transfers", at::IValue(true)).toBool();
     for (size_t i = 0; i < n; i++) {
       at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
-      if (at::kVulkan == deviceType_) {
+      if (at::kVulkan == deviceType_ && requires_backend_transfers) {
         inputs.push_back(
             atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()}
                                 : std::move(atIValue));
       } else {
-        TORCH_CHECK(at::kCPU == deviceType_);
+        TORCH_CHECK(at::kCPU == deviceType_ || !requires_backend_transfers);
         inputs.push_back(std::move(atIValue));
       }
     }
diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp
index 86fd1e2260f9c..802bb801a1f9c 100644
--- a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp
+++ b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp
@@ -158,14 +158,16 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
     std::vector<at::IValue> inputs{};
     size_t n = jinputs->size();
     inputs.reserve(n);
+    const bool requires_backend_transfers =
+        module_.attr("requires_backend_transfers", at::IValue(true)).toBool();
     for (const auto i : c10::irange(n)) {
       at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
-      if (at::kVulkan == deviceType_) {
+      if (at::kVulkan == deviceType_ && requires_backend_transfers) {
         inputs.push_back(
             atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()}
                                 : std::move(atIValue));
       } else {
-        TORCH_CHECK(at::kCPU == deviceType_);
+        TORCH_CHECK(at::kCPU == deviceType_ || !requires_backend_transfers);
         inputs.push_back(std::move(atIValue));
       }
     }
@@ -187,14 +189,16 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
     std::vector<at::IValue> inputs{};
     size_t n = jinputs->size();
     inputs.reserve(n);
+    const bool requires_backend_transfers =
+        module_.attr("requires_backend_transfers", at::IValue(true)).toBool();
     for (const auto i : c10::irange(n)) {
       at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i));
-      if (at::kVulkan == deviceType_) {
+      if (at::kVulkan == deviceType_ && requires_backend_transfers) {
         inputs.push_back(
             atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()}
                                 : std::move(atIValue));
       } else {
-        TORCH_CHECK(at::kCPU == deviceType_);
+        TORCH_CHECK(at::kCPU == deviceType_ || !requires_backend_transfers);
         inputs.push_back(std::move(atIValue));
       }
     }
diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc
index ea523898b51e6..0fadfad5b9f28 100644
--- a/binaries/speed_benchmark_torch.cc
+++ b/binaries/speed_benchmark_torch.cc
@@ -180,6 +180,10 @@ class vkRunner final : public Runner<T> {
   virtual c10::IValue run(
       T& module,
       const std::vector<c10::IValue>& inputs) override {
+    if (!module.attr("requires_backend_transfers", at::IValue(true)).toBool()) {
+      // No need to transfer input/output backends
+      return module.forward(inputs);
+    }
 
     if (inputs_.size() == 0) {
       // Upload the input tensor(s) to GPU memory.
diff --git a/docs/source/mobile_optimizer.rst b/docs/source/mobile_optimizer.rst
index bb11abf82dbac..4df148dc707b8 100644
--- a/docs/source/mobile_optimizer.rst
+++ b/docs/source/mobile_optimizer.rst
@@ -7,13 +7,16 @@ torch.utils.mobile_optimizer
 Torch mobile supports ``torch.mobile_optimizer.optimize_for_mobile`` utility to run a list of optimization pass with modules in eval mode.
 The method takes the following parameters: a torch.jit.ScriptModule object, a blocklisting optimization set and a preserved method list
 
-By default, if optimization blocklist is None or empty, ``optimize_for_mobile`` will run the following optimizations:
+For CPU Backend, by default, if optimization blocklist is None or empty, ``optimize_for_mobile`` will run the following optimizations:
     - **Conv2D + BatchNorm fusion** (blocklisting option `MobileOptimizerType::CONV_BN_FUSION`):  This optimization pass folds ``Conv2d-BatchNorm2d`` into ``Conv2d`` in ``forward`` method of this module and all its submodules. The weight and bias of the ``Conv2d`` are correspondingly updated.
     - **Insert and Fold prepacked ops** (blocklisting option `MobileOptimizerType::INSERT_FOLD_PREPACK_OPS`): This optimization pass rewrites the graph to replace 2D convolutions and linear ops with their prepacked counterparts. Prepacked ops are stateful ops in that, they require some state to be created, such as weight prepacking and use this state, i.e. prepacked weights, during op execution. XNNPACK is one such backend that provides prepacked ops, with kernels optimized for mobile platforms (such as ARM CPUs). Prepacking of weight enables efficient memory access and thus faster kernel execution. At the moment ``optimize_for_mobile`` pass rewrites the graph to replace ``Conv2D/Linear`` with 1) op that pre-packs weight for XNNPACK conv2d/linear ops and 2) op that takes pre-packed weight and activation as input and generates output activations. Since 1 needs to be done only once, we fold the weight pre-packing such that it is done only once at model load time. This pass of the ``optimize_for_mobile`` does 1 and 2 and then folds, i.e. removes, weight pre-packing ops.
     - **ReLU/Hardtanh fusion**: XNNPACK ops support fusion of clamping. That is clamping of output activation is done as part of the kernel, including for 2D convolution and linear op kernels. Thus clamping effectively comes for free. Thus any op that can be expressed as clamping op, such as ``ReLU`` or ``hardtanh``, can be fused with previous ``Conv2D`` or ``linear`` op in XNNPACK. This pass rewrites graph by finding ``ReLU/hardtanh`` ops that follow XNNPACK ``Conv2D/linear`` ops, written by the previous pass, and fuses them together.
     - **Dropout removal** (blocklisting option `MobileOptimizerType::REMOVE_DROPOUT`): This optimization pass removes ``dropout`` and ``dropout_`` nodes from this module when training is false.
     - **Conv packed params hoisting** (blocklisting option `MobileOptimizerType::HOIST_CONV_PACKED_PARAMS`): This optimization pass moves convolution packed params to the root module, so that the convolution structs can be deleted. This decreases model size without impacting numerics.
 
+for Vulkan Backend, by default, if optimization blocklist is None or empty, ``optimize_for_mobile`` will run the folllwing optimization:
+    - **Automatic GPU Transfer** (blocklisting option `MobileOptimizerType::VULKAN_AUTOMATIC_GPU_TRANSFER`): This optimization pass rewrites the graph such that inputs are transferred to Vulkan backend, and outputs are transferred to CPU backend
+
 ``optimize_for_mobile`` will also invoke freeze_module pass which only preserves ``forward`` method. If you have other method to that needed to be preserved,  add them into the preserved method list and pass into the method.
 
 
diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py
index c4a64b5cb6477..4d2df65126983 100644
--- a/test/test_public_bindings.py
+++ b/test/test_public_bindings.py
@@ -261,7 +261,7 @@ def test_no_new_bindings(self):
             "set_num_threads",
             "unify_type_list",
             "vitals_enabled",
-
+            "VULKAN_AUTOMATIC_GPU_TRANSFER",
             "wait",
             "Tag",
         }
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 8b936be23122e..9857cd3c91e3f 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -178,6 +178,7 @@ INSERT_FOLD_PREPACK_OPS: MobileOptimizerType
 REMOVE_DROPOUT: MobileOptimizerType
 FUSE_ADD_RELU: MobileOptimizerType
 HOIST_CONV_PACKED_PARAMS: MobileOptimizerType
+VULKAN_AUTOMATIC_GPU_TRANSFER: MobileOptimizerType
 
 def fork(*args: Any, **kwargs: Any) -> Future: ...
 def wait(fut: Future) -> Any: ...
diff --git a/torch/csrc/jit/passes/mobile_optimizer_type.h b/torch/csrc/jit/passes/mobile_optimizer_type.h
index fe3fffe16c22d..d11f288dca343 100644
--- a/torch/csrc/jit/passes/mobile_optimizer_type.h
+++ b/torch/csrc/jit/passes/mobile_optimizer_type.h
@@ -9,4 +9,5 @@ enum class MobileOptimizerType : int8_t {
   FUSE_ADD_RELU,
   HOIST_CONV_PACKED_PARAMS,
   CONV_1D_TO_2D,
+  VULKAN_AUTOMATIC_GPU_TRANSFER,
 };
diff --git a/torch/csrc/jit/passes/vulkan_rewrite.cpp b/torch/csrc/jit/passes/vulkan_rewrite.cpp
index 7618aa50d6868..cbfa61238a45d 100644
--- a/torch/csrc/jit/passes/vulkan_rewrite.cpp
+++ b/torch/csrc/jit/passes/vulkan_rewrite.cpp
@@ -2,6 +2,7 @@
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/ir/subgraph_matcher.h>
 #include <torch/csrc/jit/passes/constant_pooling.h>
+#include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/fold_conv_bn.h>
 #include <torch/csrc/jit/passes/freeze_module.h>
 #include <torch/csrc/jit/passes/fuse_linear.h>
@@ -82,6 +83,51 @@ void insertPrePackedConv2dOp(std::shared_ptr<Graph>& graph) {
   transpose_rewriter.runOnGraph(graph);
 }
 
+void transferInputOutputBackends(std::shared_ptr<Graph>& graph) {
+  // Move inputs to Vulkan backend
+  for (Value* input : graph->inputs()) {
+    NamedValue named_input = NamedValue("", input);
+    if (named_input.type()->kind() == TypeKind::TensorType) {
+      // find the insertion point
+      WithInsertPoint ip(input->uses()[0].user->prev());
+      Value* replaced_input = graph->insert(
+          Symbol::fromQualString("aten::to"), {named_input, "vulkan"});
+      // replace the input
+      input->replaceAllUsesAfterNodeWith(
+          replaced_input->node(), replaced_input);
+    }
+  }
+
+  // Move outputs to CPU backend
+  at::ArrayRef<Value*>&& outputs = graph->outputs();
+  for (size_t i = 0; i < outputs.size(); i++) {
+    Value* output = outputs[i];
+    NamedValue named_output = NamedValue("", output);
+    if (named_output.type()->kind() == TypeKind::TensorType) {
+      // find the insertion point
+      WithInsertPoint ip(output->node()->next());
+      Value* replaced_output = graph->insert(
+          Symbol::fromQualString("aten::to"), {named_output, "cpu"});
+      // replace the output
+      graph->block()->replaceOutput(i, replaced_output);
+    }
+  }
+
+  SubgraphRewriter rewriter;
+  rewriter.runOnGraph(graph);
+}
+
+void transferInputOutputBackends(script::Module& module) {
+  std::shared_ptr<Graph> graph = module.get_methods()[0].graph();
+  transferInputOutputBackends(graph);
+}
+
+void eliminateDeadCode(script::Module& module) {
+  for (auto& method : module.get_methods()) {
+    EliminateDeadCode(method.graph());
+  }
+}
+
 void insertPrePackedGruOp(std::shared_ptr<Graph>& graph) {
   std::string gru_pattern = R"(
       graph(%input.1, %hx.1, %params_cpu:Tensor[], %has_biases:bool, %num_layers:int, %dropout:float, %train:bool, %bidirectional:bool, %batch_first:bool):
@@ -276,12 +322,19 @@ script::Module vulkanOptimizeForMobile(
   cloned_module = FoldConvBatchNorm(cloned_module);
   vulkanInsertPrePackedOps(cloned_module);
   cloned_module = freeze_module(cloned_module, preserved_methods);
+  if (!optimization_blocklist.count(
+          MobileOptimizerType::VULKAN_AUTOMATIC_GPU_TRANSFER)) {
+    transferInputOutputBackends(cloned_module);
+    cloned_module.register_attribute(
+        "requires_backend_transfers", BoolType::get(), false);
+  }
   vulkanFusePrePackedConvWithClamp(cloned_module);
   vulkanFoldPrePackingOps(cloned_module);
   removeDropout(cloned_module);
   vulkanRemoveMutation(cloned_module);
   // remove duplicated constants
   vulkanRunCanonicalOptimizations(cloned_module);
+  eliminateDeadCode(cloned_module);
 
   cloned_module.register_attribute(
       "optimized_for_vulkan", BoolType::get(), true);
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index d576c1ff3d742..d2411ee887772 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -1297,6 +1297,9 @@ void initJITBindings(PyObject* module) {
       .value(
           "HOIST_CONV_PACKED_PARAMS",
           MobileOptimizerType::HOIST_CONV_PACKED_PARAMS)
+      .value(
+          "VULKAN_AUTOMATIC_GPU_TRANSFER",
+          MobileOptimizerType::VULKAN_AUTOMATIC_GPU_TRANSFER)
       .export_values();
 
   // This allows PyTorchStreamReader to read from a Python buffer. It requires

From a453e4783f17cded40a647746b675d041c187e57 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Sat, 29 Oct 2022 21:14:50 +0000
Subject: [PATCH 0338/1922] [FSDP] Enable `use_orig_params=True` test (#88034)

I accidentally committed the `use_orig_params` PR with this test disabled. This PR simply re-enables it. It passes locally, so if CI is green, then this is an easy land.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88034
Approved by: https://github.com/H-Huang
---
 test/distributed/fsdp/test_fsdp_use_orig_params.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py
index 81657dcfae5e1..f6eda72feef3c 100644
--- a/test/distributed/fsdp/test_fsdp_use_orig_params.py
+++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py
@@ -236,11 +236,12 @@ def test_diff_hyperparams(self, sharding_strategy_str: str):
             sharding_strategy=sharding_strategy,
         )
 
+    @skip_if_lt_x_gpu(2)
     @parametrize(
         "sharding_strategy_str",
         ["no_shard", "shard_grad_op", "full_shard"],
     )
-    def _test_diff_hyperparams_cpu_offload(self, sharding_strategy_str: str):
+    def test_diff_hyperparams_cpu_offload(self, sharding_strategy_str: str):
         """
         Tests FSDP parity with DDP when using multiple parameter groups with
         different hyperparameter settings with CPU offloading enabled. This is

From 7e8914d472cf3cc914a88b945ebf0c76b44a64c9 Mon Sep 17 00:00:00 2001
From: vasiliy <vasiliy@fb.com>
Date: Fri, 28 Oct 2022 12:15:49 -0700
Subject: [PATCH 0339/1922] make autocast cache global instead of thread-local
 (#86492)

Summary:

There is a memory leak because `torch.clear_autocast_cache()` clears
the autocast cache from the main thread, but autograd can write to
this cache from a background thread, so whatever autograd writes
will leak.

With some offline discussion we decided that a global cache is a
practical way to deal with this, and the performance impact of the
lock should be negligible.

Test Plan:

I don't have a local repro of the original issue, need to look into how to get
that.

A toy example
(https://gist.github.com/vkuzo/0d6318fe7f7cb1c505e370cd5c1a643b)
does cache clearing as expected on forward and backward pass.

local testing:
```
python test/test_cuda.py -k autocast
python test/test_autocast.py
```

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86492
Approved by: https://github.com/ezyang
---
 aten/src/ATen/autocast_mode.cpp |  6 +++-
 test/test_autocast.py           | 61 +++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp
index 437eadf873a15..ca75c38258ff9 100644
--- a/aten/src/ATen/autocast_mode.cpp
+++ b/aten/src/ATen/autocast_mode.cpp
@@ -9,6 +9,7 @@
 
 #include <iostream>
 #include <exception>
+#include <mutex>
 
 namespace at {
 namespace autocast {
@@ -64,7 +65,8 @@ namespace {
 // directly against incoming TensorImpl*s.
 using weakref_type = c10::weak_intrusive_ptr<TensorImpl, UndefinedTensorImpl>;
 using val_type = std::tuple<weakref_type, Tensor>;
-thread_local std::unordered_map<TensorImpl*, val_type> cached_casts;
+std::unordered_map<TensorImpl*, val_type> cached_casts;
+std::mutex cached_casts_mutex;
 
 // nesting tracks the nesting depth of the Python-side context manager.
 // When the autocast context manager exits to a nesting level that's outside
@@ -89,6 +91,7 @@ thread_local at::ScalarType autocast_gpu_dtype = at::kHalf;
 }
 
 void clear_cache() {
+  const std::lock_guard<std::mutex> lock(cached_casts_mutex);
   cached_casts.clear();
 }
 
@@ -155,6 +158,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_
                          arg.scalar_type() == at::kFloat && arg.requires_grad() &&
                          arg.is_leaf() && !arg.is_view() && cache_enabled);
     if (can_try_cache) {
+      const std::lock_guard<std::mutex> lock(cached_casts_mutex);
       auto it = cached_casts.find(arg.unsafeGetTensorImpl());
       if (it != cached_casts.end()) {
         return std::get<1>(it->second);
diff --git a/test/test_autocast.py b/test/test_autocast.py
index bfbe46d08b890..1a8263a79f93d 100644
--- a/test/test_autocast.py
+++ b/test/test_autocast.py
@@ -1,9 +1,12 @@
 # Owner(s): ["module: unknown"]
 
 import collections
+import unittest
+
 import torch
 from torch.testing._internal.common_utils import TestCase, run_tests
 from torch.testing._internal.autocast_test_lists import AutocastCPUTestLists
+from torch.utils._python_dispatch import TorchDispatchMode
 
 class TestAutocastCPU(TestCase):
     def setUp(self):
@@ -122,6 +125,64 @@ def test_autocast_torch_need_autocast_promote(self):
         for op, args in self.autocast_lists.torch_need_autocast_promote:
             self._run_autocast_outofplace(op, args, torch.float32)
 
+@unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+class TestAutocastGPU(TestCase):
+    def test_cast_cache_is_global(self):
+        """
+        Verifies that the autocast cache is global. This is done by
+        mocking out cache clearing at the end of the forward pass,
+        running forward+backward with an explicit call to autocast in the
+        backward, and verifying that the weight only get cast to float16 once.
+        """
+
+        class CustomLinear(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, x, w_t):
+                ctx.save_for_backward(x, w_t)
+                return torch.nn.functional.linear(x, w_t)
+
+            @staticmethod
+            def backward(ctx, grad_output):
+                x, w_t = ctx.saved_tensors
+                with torch.autocast(device_type='cuda'):
+                    dL_dX = torch.matmul(grad_output, w_t)
+                    dL_dW = torch.matmul(x.transpose(0, 1), grad_output).transpose(0, 1)
+                return dL_dX, dL_dW
+
+        data = torch.randn(2, 3).cuda()
+        weight = torch.nn.Parameter(torch.randn(4, 3).cuda())
+        weight_dtype_cast_counter = 0
+
+        class WeightDTypeCastCounterMode(TorchDispatchMode):
+
+            def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+                if (
+                    func is torch.ops.aten._to_copy.default and
+                    args[0] is weight and
+                    kwargs['dtype'] is torch.float16
+                ):
+                    nonlocal weight_dtype_cast_counter
+                    weight_dtype_cast_counter += 1
+                return func(*args, **kwargs)
+
+            def __enter__(self):
+                self.old_clear_cache = torch.clear_autocast_cache
+                torch.clear_autocast_cache = lambda: None
+                return super().__enter__()
+
+            def __exit__(self, exc_type, exc_val, exc_tb):
+                torch.clear_autocast_cache = self.old_clear_cache
+                return super().__exit__(exc_type, exc_val, exc_tb)
+
+        with WeightDTypeCastCounterMode():
+            with torch.autocast(device_type='cuda'):
+                output = CustomLinear.apply(data, weight)
+                s = output.sum()
+            s.backward()
+
+        self.assertEqual(weight_dtype_cast_counter, 1)
+
+
 class TestTorchAutocast(TestCase):
     def test_autocast_fast_dtype(self):
         gpu_fast_dtype = torch.get_autocast_gpu_dtype()

From f25d72fc09f0d36497338aa90fe6a9c2df0a1a19 Mon Sep 17 00:00:00 2001
From: Aaron Gokaslan <aaronGokaslan@gmail.com>
Date: Mon, 31 Oct 2022 16:41:24 +0000
Subject: [PATCH 0340/1922] Fix: prefer .is_none() over .is(py::none()) for
 pybind11 (#88051)

Fixes minor perf regression I saw in #85688 and replaced throughout the code base. `obj == Py_None` is directly equivalent to is_none(). Constructing a temporary py::none() object needlessly incref/decref the refcount of py::none, this method avoids that and therefore is more efficient.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88051
Approved by: https://github.com/albanD
---
 torch/csrc/autograd/python_variable.cpp       | 20 +++++++++----------
 torch/csrc/jit/python/init.cpp                |  2 +-
 torch/csrc/jit/python/pybind_utils.h          |  2 +-
 .../csrc/jit/python/python_sugared_value.cpp  |  2 +-
 torch/csrc/jit/python/script_init.cpp         | 16 +++++++--------
 5 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 7e07f3ff32cda..d992936ba3615 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -167,7 +167,7 @@ void pushPyOutToStack(
   if (num_returns == 0) {
     // Check that we got a None return from Python. Anything else is an error.
     TORCH_CHECK(
-        out.is(py::none()),
+        out.is_none(),
         "Expected ",
         msg,
         " for ",
@@ -2353,7 +2353,7 @@ bool ConcretePyInterpreterVTable::is_contiguous(
         {py::cast(memory_format)});
   }
 
-  if (out.is(py::none())) {
+  if (out.is_none()) {
     return self->is_contiguous_default(memory_format);
   }
 
@@ -2386,7 +2386,7 @@ bool ConcretePyInterpreterVTable::is_strides_like(
       "torch.ops.aten",
       {py::cast(memory_format)});
 
-  if (out.is(py::none())) {
+  if (out.is_none()) {
     return self->is_strides_like_default(memory_format);
   }
 
@@ -2415,7 +2415,7 @@ bool ConcretePyInterpreterVTable::is_non_overlapping_and_dense(
           .ptr(),
       "torch.ops.aten");
 
-  if (out.is(py::none())) {
+  if (out.is_none()) {
     return self->is_non_overlapping_and_dense_default();
   }
 
@@ -2487,7 +2487,7 @@ c10::IntArrayRef ConcretePyInterpreterVTable::strides(
           .ptr(),
       "torch.ops.aten");
 
-  if (out.is(py::none())) {
+  if (out.is_none()) {
     TORCH_CHECK(
         !self->has_symbolic_sizes_strides(),
         "Cannot call strides on a tensor with symbolic shapes/strides");
@@ -2546,7 +2546,7 @@ c10::IntArrayRef ConcretePyInterpreterVTable::sizes(
           .ptr(),
       "torch.ops.aten");
 
-  if (out.is(py::none())) {
+  if (out.is_none()) {
     TORCH_CHECK(
         !self->has_symbolic_sizes_strides(),
         "Cannot call sizes on a tensor with symbolic shapes/strides");
@@ -2577,7 +2577,7 @@ c10::SymIntArrayRef ConcretePyInterpreterVTable::sym_sizes(
           .ptr(),
       "torch.ops.aten");
 
-  if (out.is(py::none())) {
+  if (out.is_none()) {
     return self->sym_sizes_default();
   }
   // We need to squeeze SymIntNodes and ints into `SymInts`
@@ -2640,7 +2640,7 @@ c10::SymInt ConcretePyInterpreterVTable::sym_numel(
           .ptr(),
       "torch.ops.aten");
 
-  if (out.is(py::none())) {
+  if (out.is_none()) {
     TORCH_CHECK(
         !self->has_symbolic_sizes_strides(),
         "Cannot call numel on a tensor with symbolic shapes/strides");
@@ -2665,7 +2665,7 @@ c10::SymInt ConcretePyInterpreterVTable::sym_storage_offset(
           .ptr(),
       "torch.ops.aten");
 
-  if (out.is(py::none())) {
+  if (out.is_none()) {
     return self->sym_storage_offset_default();
   }
   return torch::is_symint(out) ? out.cast<c10::SymInt>()
@@ -2688,7 +2688,7 @@ c10::SymIntArrayRef ConcretePyInterpreterVTable::sym_strides(
           .ptr(),
       "torch.ops.aten");
 
-  if (out.is(py::none())) {
+  if (out.is_none()) {
     return self->sym_strides_default();
   }
   // We need to squeeze SymIntNodes and ints into `SymInts`
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index d2411ee887772..74ce3b829b941 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -657,7 +657,7 @@ void initJITBindings(PyObject* module) {
       .def(
           "_jit_pass_create_autodiff_subgraphs",
           [](const std::shared_ptr<Graph>& graph, py::object threshold) {
-            if (threshold.is(py::none())) {
+            if (threshold.is_none()) {
               CreateAutodiffSubgraphs(graph);
             } else {
               CreateAutodiffSubgraphs(graph, py::cast<int>(threshold));
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h
index 835c7d0dc709a..5dfe28e92fd72 100644
--- a/torch/csrc/jit/python/pybind_utils.h
+++ b/torch/csrc/jit/python/pybind_utils.h
@@ -299,7 +299,7 @@ inline InferredType tryToInferType(py::handle input) {
     return InferredType(TensorType::get());
   }
 
-  if (input.is(py::none())) {
+  if (input.is_none()) {
     return InferredType(NoneType::get());
   }
 
diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp
index 705731778dc35..12d565427ae48 100644
--- a/torch/csrc/jit/python/python_sugared_value.cpp
+++ b/torch/csrc/jit/python/python_sugared_value.cpp
@@ -1149,7 +1149,7 @@ std::shared_ptr<SugaredValue> toSugaredValue(
           g.insertConstant(static_cast<c10::complex<double>>(c_obj), loc));
     } else if (py::isinstance<py::str>(obj)) {
       return toSimple(g.insertConstant(py::cast<std::string>(obj), loc));
-    } else if (obj.is(py::none())) {
+    } else if (obj.is_none()) {
       return toSimple(g.insertConstant(IValue(), loc));
     } else if (THPDevice_Check(obj.ptr())) {
       auto device = reinterpret_cast<THPDevice*>(obj.ptr());
diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp
index ee9509588932c..030e2525a1635 100644
--- a/torch/csrc/jit/python/script_init.cpp
+++ b/torch/csrc/jit/python/script_init.cpp
@@ -112,7 +112,7 @@ struct PythonResolver : public Resolver {
       const SourceRange& loc) override {
     pybind11::gil_scoped_acquire ag;
     py::object obj = rcb_(name);
-    if (obj.is(py::none())) {
+    if (obj.is_none()) {
       return nullptr;
     }
     return toSugaredValue(obj, m, loc);
@@ -153,7 +153,7 @@ struct PythonResolver : public Resolver {
     }
     pybind11::gil_scoped_acquire ag;
     py::object obj = rcb_(name);
-    if (obj.is(py::none())) {
+    if (obj.is_none()) {
       return nullptr;
     }
 
@@ -366,7 +366,7 @@ static StrongFunctionPtr script_compile_overloaded_function(
     const ResolutionCallback& rcb,
     const FunctionDefaults& implementation_defaults,
     const py::object& signature) {
-  if (signature.is(py::none())) {
+  if (signature.is_none()) {
     throw ErrorReport(overload_decl.range())
         << "Must explicitly add type annotations to overloaded functions";
   }
@@ -1869,7 +1869,7 @@ void initJitScriptBindings(PyObject* module) {
          py::object map_location,
          const py::dict& extra_files) {
         c10::optional<at::Device> optional_device;
-        if (!map_location.is(py::none())) {
+        if (!map_location.is_none()) {
           AT_ASSERT(THPDevice_Check(map_location.ptr()));
           optional_device =
               reinterpret_cast<THPDevice*>(map_location.ptr())->device;
@@ -1889,7 +1889,7 @@ void initJitScriptBindings(PyObject* module) {
          py::object map_location,
          std::string ts_id) {
         c10::optional<at::Device> optional_device;
-        if (!map_location.is(py::none())) {
+        if (!map_location.is_none()) {
           AT_ASSERT(THPDevice_Check(map_location.ptr()));
           optional_device =
               reinterpret_cast<THPDevice*>(map_location.ptr())->device;
@@ -1909,7 +1909,7 @@ void initJitScriptBindings(PyObject* module) {
          const py::dict& extra_files) {
         std::istringstream in(buffer);
         c10::optional<at::Device> optional_device;
-        if (!map_location.is(py::none())) {
+        if (!map_location.is_none()) {
           AT_ASSERT(THPDevice_Check(map_location.ptr()));
           optional_device =
               reinterpret_cast<THPDevice*>(map_location.ptr())->device;
@@ -1924,7 +1924,7 @@ void initJitScriptBindings(PyObject* module) {
       "_load_for_lite_interpreter",
       [](const std::string& filename, py::object map_location) {
         c10::optional<at::Device> optional_device;
-        if (!map_location.is(py::none())) {
+        if (!map_location.is_none()) {
           AT_ASSERT(THPDevice_Check(map_location.ptr()));
           optional_device =
               reinterpret_cast<THPDevice*>(map_location.ptr())->device;
@@ -1936,7 +1936,7 @@ void initJitScriptBindings(PyObject* module) {
       [](const std::string& buffer, py::object map_location) {
         std::istringstream in(buffer);
         c10::optional<at::Device> optional_device;
-        if (!map_location.is(py::none())) {
+        if (!map_location.is_none()) {
           AT_ASSERT(THPDevice_Check(map_location.ptr()));
           optional_device =
               reinterpret_cast<THPDevice*>(map_location.ptr())->device;

From 659f4436a6b37f0fbdee3c22dc6e1f0c2d8173d2 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Sun, 30 Oct 2022 15:26:12 +0000
Subject: [PATCH 0341/1922] [FSDP()][3/N] Refactor public APIs (#87917)

- This PR defines a new `api.py` meant to hold the public API for FSDP (minus `FullyShardedDataParallel` itself). This is needed because several of the `_<...>_utils.py` files rely on the public API, and we cannot import from `torch.distributed.fsdp.fully_sharded_data_parallel` without a circular import. Calling the file `api.py` follows the convention used by `ShardedTensor`.
- This PR cleans up the wording in the `BackwardPrefetch`, `ShardingStrategy`, `MixedPrecision`, and `CPUOffload` docstrings.
- This PR adds the aforementioned classes to `fsdp.rst` to have them rendered in public docs.
- To abide by the public bindings contract (`test_public_bindings.py`), the aforementioned classes are removed from `fully_sharded_data_parallel.py`'s `__all__`. This is technically BC breaking if someone uses `from torch.distributed.fsdp.fully_sharded_data_parallel import *`; however, that does not happen in any of our own external or internal code.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87917
Approved by: https://github.com/mrshenli
---
 docs/source/fsdp.rst                          |  12 ++
 torch/distributed/fsdp/_common_utils.py       |   4 +
 torch/distributed/fsdp/api.py                 | 136 ++++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 146 +-----------------
 4 files changed, 158 insertions(+), 140 deletions(-)
 create mode 100644 torch/distributed/fsdp/api.py

diff --git a/docs/source/fsdp.rst b/docs/source/fsdp.rst
index ff42770831b7e..feb6d8cd470b2 100644
--- a/docs/source/fsdp.rst
+++ b/docs/source/fsdp.rst
@@ -5,3 +5,15 @@ FullyShardedDataParallel
 
 .. autoclass:: torch.distributed.fsdp.FullyShardedDataParallel
   :members:
+
+.. autoclass:: torch.distributed.fsdp.BackwardPrefetch
+  :members:
+
+.. autoclass:: torch.distributed.fsdp.ShardingStrategy
+  :members:
+
+.. autoclass:: torch.distributed.fsdp.MixedPrecision
+  :members:
+
+.. autoclass:: torch.distributed.fsdp.CPUOffload
+  :members:
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index cef76cf91f017..52a872b0b44b4 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -1,3 +1,7 @@
+"""
+This file includes private common utilities for FSDP.
+"""
+
 from enum import auto, Enum
 
 
diff --git a/torch/distributed/fsdp/api.py b/torch/distributed/fsdp/api.py
new file mode 100644
index 0000000000000..d6704d639fd59
--- /dev/null
+++ b/torch/distributed/fsdp/api.py
@@ -0,0 +1,136 @@
+"""
+This file includes public APIs for FSDP such as the classes used for the
+constructor arguments.
+"""
+
+from dataclasses import dataclass
+from enum import auto, Enum
+
+from typing import Optional
+
+import torch
+
+__all__ = ["ShardingStrategy", "BackwardPrefetch", "MixedPrecision", "CPUOffload"]
+
+
+class ShardingStrategy(Enum):
+    """
+    This specifies the sharding strategy to be used for distributed training by
+    :class:`FullyShardedDataParallel`.
+
+    - ``FULL_SHARD``: Parameters, gradients, and optimizer states are sharded.
+      For the parameters, this strategy unshards (via all-gather) before the
+      forward, reshards after the forward, unshards before the backward
+      computation, and reshards after the backward computation. For gradients,
+      it synchronizes and shards them (via reduce-scatter) after the backward
+      computation. The sharded optimizer states are updated locally per rank.
+    - ``SHARD_GRAD_OP``: Gradients and optimizer states are sharded during
+      computation, and additionally, parameters are sharded outside
+      computation. For the parameters, this strategy unshards before the
+      forward, does not reshard them after the forward, and only reshards them
+      after the backward computation. The sharded optimizer states are updated
+      locally per rank. Inside ``no_sync()``, the parameters are not resharded
+      after the backward computation.
+    - ``NO_SHARD``: Parameters, gradients, and optimizer states are not sharded
+      but instead replicated across ranks similar to PyTorch's
+      :class:`DistributedDataParallel` API. For gradients, this strategy
+      synchronizes them (via all-reduce) after the backward computation. The
+      unsharded optimizer states are updated locally per rank.
+    """
+
+    FULL_SHARD = auto()
+    SHARD_GRAD_OP = auto()
+    NO_SHARD = auto()
+    # HYBRID_SHARD = auto()
+
+
+class BackwardPrefetch(Enum):
+    """
+    This configures explicit backward prefetching, which can improve throughput
+    but may slightly increase peak memory usage.
+
+    For NCCL backend, any collectives, even if issued in different streams,
+    contend for the same per-device NCCL stream, which is why the relative
+    order in which the collectives are issued matters for overlapping. The
+    different backward prefetching settings correspond to different orderings.
+
+    - ``BACKWARD_PRE``: This prefetches the next set of parameters before the
+      current set of parameter's gradient computation. This improves backward
+      pass throughput by overlapping communication (next all-gather) and
+      computation (current gradient computation).
+    - ``BACKWARD_POST``: This prefetches the next set of parameters after the
+      current set of parameter's gradient computation. This may improve
+      backward pass throughput by overlapping communication (current
+      reduce-scatter) and computation (next gradient computation).
+      Specifically, the next all-gather is reordered to be before the current
+      reduce-scatter.
+    """
+
+    # NOTE: For both modes, the ordering that defines "current" and "next" is
+    # not always correct in the current implementation, so this may cause some
+    # performance regression for some models.
+    BACKWARD_PRE = auto()
+    BACKWARD_POST = auto()
+
+
+@dataclass
+class MixedPrecision:
+    """
+    This configures FSDP-native mixed precision training.
+
+    Attributes:
+        param_dtype (torch.dtype): This specifies the dtype for model
+            parameters, inputs, and therefore the dtype for computation.
+            However, outside the forward and backward passes, parameters are in
+            full precision. Model checkpointing always happens in full
+            precision.
+        reduce_dtype (torch.dtype): This specifies the dtype for gradient
+            reduction, which is permitted to differ from ``param_dtype``.
+        buffer_dtype (torch.dtype): This specifies the dtype for buffers. FSDP
+            does not shard buffers, casts them to ``buffer_dtype`` in the first
+            forward pass, and keeps them in that dtype thereafter. Model
+            checkpointing always happens in full precision.
+        keep_low_precision_grads (bool): This specifies whether to upcast
+            gradients back to the full parameter precision after the backward
+            pass. This may be set to ``False`` to save memory if using custom
+            optimizers that can perform the optimizer step in ``reduce_dtype``.
+
+    .. note:: In ``summon_full_params``, parameters are forced to full
+        precision, but buffers are not.
+
+    .. note:: ``state_dict`` checkpoints parameters and buffers in full
+        precision. For buffers, this is only supported for
+        ``StateDictType.FULL_STATE_DICT``.
+
+    .. note:: This API is experimental and subject to change.
+
+    .. note:: Each low precision dtype must be specified explicitly. For
+        example, ``MixedPrecision(reduce_dtype=torch.float16)`` only specifies
+        the reduction dtype to be low precision, and FSDP will not cast
+        parameters or buffers.
+
+    .. note:: If a ``reduce_dtype`` is not specified, then gradient reduction
+        happens in ``param_dtype`` if specified or the original parameter dtype
+        otherwise.
+    """
+
+    param_dtype: Optional[torch.dtype] = None
+    reduce_dtype: Optional[torch.dtype] = None
+    buffer_dtype: Optional[torch.dtype] = None
+    keep_low_precision_grads: bool = False
+
+
+@dataclass
+class CPUOffload:
+    """
+    This configures CPU offloading.
+
+    Attributes:
+        offload_params (bool): This specifies whether to offload parameters to
+            CPU when not involved in computation. If enabled, this implicitly
+            offloads gradients to CPU as well. This is to support the optimizer
+            step, which requires parameters and gradients to be on the same
+            device.
+    """
+
+    offload_params: bool = False
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 75ded291237ea..a2c90d667f423 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -44,6 +44,12 @@
     _prepare_forward_inputs,
     _wait_for_computation_stream,
 )
+from torch.distributed.fsdp.api import (
+    BackwardPrefetch,
+    CPUOffload,
+    MixedPrecision,
+    ShardingStrategy,
+)
 from torch.distributed.utils import _sync_params_and_buffers
 
 from ._optim_utils import (
@@ -100,10 +106,6 @@
 
 __all__ = [
     "FullyShardedDataParallel",
-    "ShardingStrategy",
-    "MixedPrecision",
-    "CPUOffload",
-    "BackwardPrefetch",
     "StateDictType",
     "StateDictConfig",
     "FullStateDictConfig",
@@ -124,142 +126,6 @@
 _PARAM_BROADCAST_BUCKET_SIZE = int(250 * 1024 * 1024)
 
 
-class ShardingStrategy(Enum):
-    """
-    This specifies the sharding strategy to be used for distributed training by
-    :class:`FullyShardedDataParallel`.
-    FULL_SHARD: Parameters, gradients, and optimizer states are sharded. For
-                the parameters, this algorithm all-gathers before the forward,
-                reshards after the forward, all-gathers before the backward
-                computation, and reshards after the backward computation. The
-                gradients are synchronized and sharded via reduce-scatter after
-                the backward computation. The sharded optimizer states are
-                updated locally.
-    SHARD_GRAD_OP: Gradients and optimizer states are sharded during
-                   computation, and additionally parameters are sharded outside
-                   computation. For the parameters, this algorithm all-gathers
-                   before the forward, does not reshard after the forward, and
-                   only reshards after the backward computation. The gradients
-                   are synchronized and sharded via reduce-scatter after the
-                   backward computation. The sharded optimizer states are
-                   updated locally. Inside ``no_sync()``, the parameters are
-                   not resharded after the backward computation.
-    NO_SHARD: Parameters, gradients, and optimizer states are not sharded but
-              instead replicated across ranks, similar to PyTorch's
-              ``DistributedDataParallel`` API. The gradients are synchronized
-              via all-reduce after the backward computation. The unsharded
-              optimizer states are updated locally.
-    HYBRID_SHARD(future support): Apply ``FULL_SHARD`` intra-node and
-                                  ``NO_SHARD`` inter-node.
-
-    """
-
-    FULL_SHARD = auto()
-    SHARD_GRAD_OP = auto()
-    NO_SHARD = auto()
-    # TODO
-    # HYBRID_SHARD = auto()
-
-
-@dataclass
-class MixedPrecision:
-    """
-    A config to enable mixed precision training with FullyShardedDataParallel.
-    This class can be constructed with several flags:
-        ``param_dtype`` controls the precision of model parameters, inputs, and
-        therefore the precision under which computation happens. After forward
-        and backward passes, FSDP parameters point to full precision shards
-        that are kept in memory. Full precision parameters are always
-        checkpointed.
-        ``reduce_dtype`` controls the precision under which gradient reduction
-        would occur, which can potentially be different than ``param_dtype``
-        for use cases such as communication efficiency.
-        ``buffer_dtype`` controls the precision that buffers are cast to. Note
-        that buffers are unsharded and are cast in the first forward pass, and
-        remain in their reduced precision state even after forward/backward
-        passes. However, when taking checkpoints with ``state_dict``, buffers
-        are checkpointed in their full precision (and then restored back to
-        to their reduced precision) as expected. Note that this checkpoint
-        support is currently limited to ``StateDictType.FULL_STATE_DICT``.
-        ``keep_low_precision_grads``: Whether to upcast gradients back to the
-        full parameter precision after backwards or not. This can be disabled
-        to keep the gradients in the lower precision, which can potentially
-        save memory if custom Optimizers are able to perform parameter updates
-        effectively with lower precision grads.
-
-    .. note:: In ``summon_full_params``, parameters are summoned in full
-        precision but buffers are not.
-
-    .. note:: Parameters and buffers are checkpointed in full precision. For
-        buffers, this is only guaranteed to work for ``StateDictType.FULL_STATE_DICT``.
-
-    .. note:: This API is experimental and subject to change.
-
-    .. note:: Specification of reduced precision types must be explicit, in that
-        if, for example, ``param_dtype`` is not specified, it will not be cast by
-        FSDP. Thus, a config such as ``MixedPrecision(reduce_dtype=torch.float16)``
-        will not cast buffers or parameters. Note that if a ``MixedPrecision``
-        config is specified without a ``reduce_dtype``, gradient communication
-        would occur in the `param_dtype` precision, if given, otherwise, in the
-        original parameter precision.
-    """
-
-    # maintain a tensor of this dtype that the fp32 param shard will be cast to.
-    # Will control the precision of model params, inputs, and thus compute as
-    # well.
-    param_dtype: Optional[torch.dtype] = None
-    # Gradient communication precision.
-    reduce_dtype: Optional[torch.dtype] = None
-    # Buffer precision.
-    # TODO: buffer + param are usually of the same type, if user specifies
-    # param but not buffer, should we automatically make buffer be the same?
-    buffer_dtype: Optional[torch.dtype] = None
-    keep_low_precision_grads: Optional[bool] = False
-
-
-@dataclass
-class CPUOffload:
-    """
-    CPU offloading config. Currently, only parameter and gradient CPU
-    offload are supported.
-    offload_params: Offloading parameters to CPUs when these parameters are
-                    not used for computation on GPUs. This implicitly enables
-                    gradient offloading to CPUs in order for parameters and
-                    gradients to be on the same device to work with optimizer.
-    """
-
-    offload_params: bool = False
-
-
-class BackwardPrefetch(Enum):
-    """
-    Specify where to prefetch next layer's full parameters
-    during backward pass.
-    BACKWARD_PRE: prefetch right before current layer's backward computation
-                  starts, this approach will increase backward communication
-                  and computation overalpping and potentialy improve training
-                  performance, but it may increase the peak memory usage as
-                  the prefetched full parameters will be kept in the GPU memory
-                  until next layer's backward computation is done.
-    BACKWARD_POST: prefetch right after current layer's backward computation finishes,
-                   this approach will not increase peak memory as prefetching happens
-                   after current layer's full parameters are freed.
-                   It could potentially improve backward communication and computation
-                   overlapping as it avoids all_gather and reduce_scatter are blocked
-                   each other in the single NCCL stream. However, based on our experiments,
-                   for some models, the backward post backward hook fire order is not always
-                   the reversed forward computation order, so this
-                   approach may prefetch full parameters for layers ahead of next layer,
-                   this 'ahead' all_gather could delay next layer's all_gather in the
-                   single NCCL stream and cause the next layer's computation delay. So it may
-                   cause some performance regession for some models.
-    """
-
-    BACKWARD_PRE = auto()
-    BACKWARD_POST = auto()
-    # TODO, BACKWARD_PRE_CPU, prefetch full parameters and keep them in the CPU memory
-
-
 class StateDictType(Enum):
     """
     This enum indicates that which type of ``state_dict`` the FSDP module is

From 565da040c390b371155cd3d64d8ad77781d25b08 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 31 Oct 2022 01:43:05 +0000
Subject: [PATCH 0342/1922] [FSDP()][7/N] Refactor most of ctor (#87921)

The goal of this PR is to make one pass over the FSDP constructor and refactor each helper method call to not be `self.<...>`. Subsequent PRs will make further passes over the FSDP constructor.

This PR looks like a lot of lines of code change, but it is only reorganization. Methods are moved to `_init_utils.py` and `_common_utils.py`. This also marks the beginning of moving methods from `_utils.py` to `_common_utils.py` -- they will be coalesced eventually. I am only using `_common_utils.py` as a staging ground to include the methods that have been affected by the refactoring.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87921
Approved by: https://github.com/mrshenli
---
 .../fsdp/test_fsdp_tp_integration.py          |   2 +-
 .../fsdp/test_fsdp_use_orig_params.py         |   2 +-
 torch/distributed/fsdp/_common_utils.py       | 116 +++++
 torch/distributed/fsdp/_init_utils.py         | 364 +++++++++++++
 torch/distributed/fsdp/_state_dict_utils.py   |   3 +-
 torch/distributed/fsdp/_utils.py              |  42 --
 torch/distributed/fsdp/flat_param.py          |  13 +-
 .../fsdp/fully_sharded_data_parallel.py       | 482 ++----------------
 8 files changed, 529 insertions(+), 495 deletions(-)
 create mode 100644 torch/distributed/fsdp/_init_utils.py

diff --git a/test/distributed/fsdp/test_fsdp_tp_integration.py b/test/distributed/fsdp/test_fsdp_tp_integration.py
index 7c6c15b2b422c..9b3ba3d5add80 100644
--- a/test/distributed/fsdp/test_fsdp_tp_integration.py
+++ b/test/distributed/fsdp/test_fsdp_tp_integration.py
@@ -11,9 +11,9 @@
 from torch.distributed._shard.sharded_tensor.api import Shard, ShardedTensor
 from torch.distributed._shard.sharding_plan import ShardingPlan
 from torch.distributed._shard.sharding_spec import ChunkShardingSpec
+from torch.distributed.fsdp._common_utils import _set_fsdp_flattened
 from torch.distributed.fsdp._fsdp_extensions import _set_fsdp_extensions, FSDPExtensions
 from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor
-from torch.distributed.fsdp._utils import _set_fsdp_flattened
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     CPUOffload,
     FullyShardedDataParallel as FSDP,
diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py
index f6eda72feef3c..24829ff408d9b 100644
--- a/test/distributed/fsdp/test_fsdp_use_orig_params.py
+++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py
@@ -14,7 +14,7 @@
     FullyShardedDataParallel as FSDP,
     ShardingStrategy,
 )
-from torch.distributed.fsdp.fully_sharded_data_parallel import clean_tensor_name
+from torch.distributed.fsdp._common_utils import clean_tensor_name
 from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index 52a872b0b44b4..72c5a506f0765 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -3,6 +3,17 @@
 """
 
 from enum import auto, Enum
+from typing import Callable, Dict, List
+
+import torch
+import torch.distributed.fsdp.flat_param as flat_param_file
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    _CHECKPOINT_PREFIX,
+)
+
+FSDP_WRAPPED_MODULE = "_fsdp_wrapped_module"
+FSDP_PREFIX = FSDP_WRAPPED_MODULE + "."
+FSDP_FLATTENED = "_fsdp_flattened"
 
 
 class TrainingState(Enum):
@@ -25,3 +36,108 @@ class HandleTrainingState(Enum):
     BACKWARD_PRE = auto()
     BACKWARD_POST = auto()
     SUMMON_FULL_PARAMS = auto()
+
+
+def clean_tensor_name(tensor_name: str) -> str:
+    """
+    Cleans the parameter or buffer name by removing any module wrapper
+    prefixes.
+    """
+    tensor_name = tensor_name.replace(FSDP_PREFIX, "")
+    # TODO: Explicitly replacing the checkpoint wrapper prefix is not ideal as
+    # it couples `CheckpointWrapper` and FSDP and also does not scale for more
+    # module wrappers.
+    tensor_name = tensor_name.replace(_CHECKPOINT_PREFIX, "")
+    return tensor_name
+
+
+def _set_fsdp_flattened(tensor: torch.Tensor) -> None:
+    """
+    Sets an attribute on ``tensor`` to mark it as flattened by FSDP. This is to
+    avoid re-flattening it during nested construction.
+    """
+    setattr(tensor, FSDP_FLATTENED, True)
+
+
+def _is_fsdp_flattened(tensor: torch.Tensor) -> bool:
+    """Returns if ``tensor`` has been marked as flattened by FSDP."""
+    return getattr(tensor, FSDP_FLATTENED, False)
+
+
+def _get_param_to_unflat_param_names(
+    model: torch.nn.Module,
+    dedup_shared_params: bool = True,
+) -> Dict[torch.nn.Parameter, List[str]]:
+    """
+    Constructs a mapping from flattened parameter (including non-FSDP-module
+    parameters) to its unflattened parameter names. For non-FSDP-module
+    parameters, these mapped-to lists always contain a single element. The
+    unflattened parameter names should match the keys of the model state dict.
+
+    For shared parameters, only the first parameter name is included (following
+    the ``torch.nn.Module.parameters()`` order).
+
+    Args:
+        model (torch.nn.Module): Root module (which may or may not be a
+            :class:`FullyShardedDataParallel` instance).
+        dedup_shared_params (bool): If ``True``, only includes the first
+            list of unflattened parameter names corresponding to a parameter
+            in the module walk order; if ``False``, then includes all of the
+            unflattened parameter names.
+    """
+
+    def module_fn(module, prefix, param_to_unflat_param_names):
+        for param_name, param in module.named_parameters(recurse=False):
+            module_prefixed_param_names = (
+                param._fqns
+                if type(param) is flat_param_file.FlatParameter
+                else [param_name]
+            )  # prefixed from `module`
+            fully_prefixed_param_names = [
+                clean_tensor_name(prefix + name) for name in module_prefixed_param_names
+            ]  # fully prefixed from the top level including `prefix`
+            # If this parameter has already been visited, then it is a
+            # shared parameter; then, only take the first parameter name
+            is_shared_param = param in param_to_unflat_param_names
+            if not is_shared_param:
+                param_to_unflat_param_names[param] = fully_prefixed_param_names
+            elif not dedup_shared_params:
+                param_to_unflat_param_names[param].extend(fully_prefixed_param_names)
+
+    def return_fn(param_to_unflat_param_names):
+        return param_to_unflat_param_names
+
+    param_to_unflat_param_names: Dict[torch.nn.Parameter, List[str]] = {}
+    return _apply_to_modules(
+        model,
+        module_fn,
+        return_fn,
+        param_to_unflat_param_names,
+    )
+
+
+def _apply_to_modules(
+    root_module: torch.nn.Module,
+    module_fn: Callable,
+    return_fn: Callable,
+    *args,
+    **kwargs,
+):
+    """
+    Performs a pre-order traversal of the modules in the hierarchy rooted at
+    ``root_module``, applying ``module_fn`` at each module and finally
+    returning a value using ``return_fn``. The traversal constructs the full
+    module prefix name (e.g. "module.submodule." just like in model state dict)
+    and makes that available to ``module_fn``.
+    """
+
+    def f(module: torch.nn.Module, prefix: str, *args, **kwargs):
+        # Call the module function before recursing over children (pre-order)
+        module_fn(module, prefix, *args, **kwargs)
+        for submodule_name, submodule in module.named_children():
+            if submodule is not None:
+                new_prefix = prefix + submodule_name + "."
+                f(submodule, new_prefix, *args, **kwargs)
+
+    f(root_module, "", *args, **kwargs)
+    return return_fn(*args, **kwargs)
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
new file mode 100644
index 0000000000000..5fc3845743409
--- /dev/null
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -0,0 +1,364 @@
+import warnings
+from typing import Callable, Iterable, Iterator, List, Optional, Set, Tuple, Union
+
+import torch
+import torch.distributed as dist
+import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
+import torch.nn as nn
+from torch.distributed.fsdp._common_utils import (
+    _apply_to_modules,
+    _get_param_to_unflat_param_names,
+    _is_fsdp_flattened,
+    clean_tensor_name,
+)
+from torch.distributed.utils import _sync_params_and_buffers
+
+_TORCHDISTX_AVAIL = True
+try:
+    from torchdistx import deferred_init, fake  # type: ignore[import]
+except ImportError:
+    _TORCHDISTX_AVAIL = False
+
+PARAM_BROADCAST_BUCKET_SIZE = int(250 * 1024 * 1024)
+FSDP_SYNCED = "_fsdp_synced"
+
+
+def _get_ignored_modules(
+    root_module: nn.Module,
+    _ignored_modules: Optional[Iterable[torch.nn.Module]],
+) -> Set[nn.Module]:
+    """
+    Checks that ``_ignored_modules`` is an iterable of ``nn.Module`` s without
+    any FSDP instances, and returns the modules contained in their module
+    subtrees as a :class:`set`. Nested FSDP instances are excluded, but their
+    already-computed ignored modules are included.
+    """
+    if _ignored_modules is None:
+        return set()
+    msg_prefix = "`ignored_modules` should be an iterable of `torch.nn.Module`s "
+    try:
+        ignored_root_modules = set(_ignored_modules)
+    except TypeError:
+        raise TypeError(msg_prefix + f"but got {type(_ignored_modules)}")
+    for module in ignored_root_modules:
+        if not isinstance(module, torch.nn.Module):
+            raise TypeError(msg_prefix + f"but got an iterable with {type(module)}")
+        if isinstance(module, fsdp_file.FullyShardedDataParallel):
+            raise ValueError("`ignored_modules` should not include FSDP modules")
+    # Include child modules and exclude nested FSDP modules themselves
+    ignored_modules = set(
+        child
+        for module in ignored_root_modules
+        for child in module.modules()
+        if not isinstance(child, fsdp_file.FullyShardedDataParallel)
+    )
+    if root_module in ignored_modules:
+        warnings.warn(
+            "Trying to ignore the top-level module passed into the FSDP "
+            "constructor itself will result in all parameters being "
+            f"ignored and is not well-supported: {module}"
+        )
+    # Include nested FSDP modules' ignored modules
+    for submodule in root_module.modules():
+        if isinstance(submodule, fsdp_file.FullyShardedDataParallel):
+            assert hasattr(submodule, "_ignored_modules")
+            ignored_modules.update(submodule._ignored_modules)
+    return ignored_modules
+
+
+def _get_ignored_params(
+    root_module: torch.nn.Module,
+    ignored_modules: Set[torch.nn.Module],
+) -> Tuple[Set[torch.nn.Parameter], Set[str]]:
+    """
+    Returns the parameters of the modules in ``ignored_modules``,
+    excluding any :class:`FlatParameter` s, and their fully prefixed names,
+    both as :class:`set` s.
+    """
+    ignored_params = set(
+        p for m in ignored_modules for p in m.parameters() if not _is_fsdp_flattened(p)
+    )
+    # Conservatively include all shared parameters' names
+    param_to_unflat_param_names = _get_param_to_unflat_param_names(
+        root_module,
+        dedup_shared_params=False,
+    )
+    ignored_param_names = set()
+    for param in ignored_params:
+        unflat_param_names = param_to_unflat_param_names[param]
+        clean_names = []
+        for k in unflat_param_names:
+            # Clean any module wrapper prefixes in case of nested wrapping
+            clean_names.append(clean_tensor_name(k))
+        ignored_param_names.update(clean_names)
+    return ignored_params, ignored_param_names
+
+
+def _get_buffer_names(root_module: nn.Module) -> Set[str]:
+    """
+    Returns the fully prefixed names of all buffers in the module hierarchy
+    rooted at ``root_module`` as a class:`set`.
+    """
+
+    def module_fn(module: nn.Module, prefix: str, buffer_names: Set[str]):
+        for buffer_name, _ in module.named_buffers(recurse=False):
+            # Clean module wrapper prefixes in case of nested wrapping
+            prefixed_buffer_name = clean_tensor_name(prefix + buffer_name)
+            buffer_names.add(prefixed_buffer_name)
+
+    def return_fn(buffer_names: Set[str], *args):
+        return buffer_names
+
+    buffer_names: Set[str] = set()
+    return _apply_to_modules(
+        root_module,
+        module_fn,
+        return_fn,
+        buffer_names,
+    )
+
+
+def _check_single_device_module(
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+) -> None:
+    """
+    Raises an error if ``module`` has original parameters on multiple devices,
+    ignoring the parameters in ``ignored_params``. Thus, after this method, the
+    module must be either fully on the CPU or fully on a non-CPU device.
+    """
+    devices = set(param.device for param in _get_orig_params(module, ignored_params))
+    if len(devices) > 1:
+        raise RuntimeError(
+            f"FSDP only supports single device modules but got params on {devices}"
+        )
+
+
+def _get_device_from_device_id(
+    device_id: Optional[Union[int, torch.device]],
+    rank: int,
+) -> Optional[torch.device]:
+    """
+    Processes ``device_id`` and returns either the corresponding device or
+    ``None`` if ``device_id`` is ``None``.
+    """
+    if device_id is None:
+        return None
+    device = (
+        device_id if isinstance(device_id, torch.device) else torch.device(device_id)
+    )
+    if device == torch.device("cuda"):
+        warnings.warn(
+            f"FSDP got the argument `device_id` {device_id} on rank "
+            f"{rank}, which does not have an explicit index. "
+            f"FSDP will use the current device {torch.cuda.current_device()}. "
+            "If this is incorrect, please explicitly call `torch.cuda.set_device()` "
+            "before FSDP initialization or pass in the explicit device "
+            "index as the `device_id` argument."
+        )
+        device = torch.device("cuda", torch.cuda.current_device())
+    return device
+
+
+def _materialize_module(
+    module: nn.Module,
+    param_init_fn: Optional[Callable[[nn.Module], None]],
+    ignored_params: Set[nn.Parameter],
+    device_from_device_id: Optional[torch.device],
+    deferred_init_check_fn: Callable,
+) -> None:
+    """
+    Materializes the wrapped module ``module`` in place if needed: either
+    if the module has parameters that use meta device or are torchdistX
+    fake tensors.
+
+    This method uses ``param_init_fn`` to materialize the module if the
+    function is not ``None`` and falls back to default behavior otherwise.
+    For meta device, this moves the module to ``device_from_device_id`` if
+    it is not ``None`` or the current device otherwise and calls
+    ``reset_parameters()``, and for torchdistX fake tensors, this calls
+    ``deferred_init.materialize_module()``.
+    """
+    is_meta_module = any(p.is_meta for p in _get_orig_params(module, ignored_params))
+    is_torchdistX_deferred_init = (
+        not is_meta_module
+        and _TORCHDISTX_AVAIL
+        and any(fake.is_fake(p) for p in _get_orig_params(module, ignored_params))
+    )
+    if (is_meta_module or is_torchdistX_deferred_init) and param_init_fn is not None:
+        if not callable(param_init_fn):
+            raise ValueError(
+                f"Expected {param_init_fn} to be callable but got {type(param_init_fn)}"
+            )
+        param_init_fn(module)
+    elif is_meta_module:
+        # Run default meta device initialization
+        materialization_device = device_from_device_id or torch.device(
+            torch.cuda.current_device()
+        )
+        module.to_empty(device=materialization_device)
+        try:
+            with torch.no_grad():
+                module.reset_parameters()  # type: ignore[operator]
+        except BaseException as e:
+            warnings.warn(
+                "Unable to call `reset_parameters()` for module on meta "
+                f"device with error {str(e)}. Please ensure your "
+                "module implements a `reset_parameters()` method."
+            )
+            raise e
+    elif is_torchdistX_deferred_init:
+        # Run default torchdistX initialization
+        deferred_init.materialize_module(module, check_fn=deferred_init_check_fn)
+
+
+def _move_module_to_device(
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+    device_from_device_id: Optional[torch.device],
+):
+    """
+    Moves ``module`` depending on ``device_from_device_id`` and its current
+    device. This includes moving ignored modules' parameters.
+
+    - If ``device_from_device_id`` is not ``None``, then this moves
+    ``module`` to the device.
+    - If ``device_from_device_id`` is ``None``, then this does not move
+    ``module`` but warns the user if it is on CPU.
+
+    Precondition: ``_check_single_device_module()``.
+    """
+    cpu_device = torch.device("cpu")
+    param = next(_get_orig_params(module, ignored_params), None)
+    if param is None:
+        return  # no original parameters to manage
+    if device_from_device_id is not None:
+        if param.device == cpu_device:
+            # NOTE: This includes moving ignored modules' parameters.
+            module = module.to(device_from_device_id)
+            # TODO: This is a temporary fix to move already-constructed
+            # `FlatParameter`s back to CPU if needed. This is needed to
+            # make CPU offload work with `device_id`.
+            for submodule in module.modules():
+                if (
+                    isinstance(submodule, fsdp_file.FullyShardedDataParallel)
+                    and submodule.cpu_offload.offload_params
+                ):
+                    with torch.no_grad():
+                        for handle in submodule._handles:
+                            handle.flat_param_to(torch.device("cpu"))
+    elif param.device == cpu_device:
+        warnings.warn(
+            "Module is put on CPU and will thus have flattening and sharding"
+            " run on CPU, which is less efficient than on GPU. We recommend passing in "
+            "`device_id` argument which will enable FSDP to put module on GPU device,"
+            " module must also be on GPU device to work with `sync_module_states=True` flag"
+            " which requires GPU communication."
+        )
+
+
+def _get_compute_device(
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+    device_from_device_id: Optional[torch.device],
+    rank: int,
+) -> torch.device:
+    """
+    Determines and returns this FSDP instance's compute device. If the module
+    is already on a non-CPU device, then the compute device is that non-CPU
+    device. If the module is on CPU, then the compute device is the current
+    device.
+
+    Since this method should be called after materializing the module, any
+    non-CPU device should not be meta device. For now, the compute device is
+    always a CUDA GPU device with its explicit index.
+
+    Precondition: ``_check_single_device_module()`` and
+    ``_move_module_to_device()``.
+    """
+    # If the module is on GPU already, then that GPU device has priority
+    # over the current device
+    param = next(_get_orig_params(module, ignored_params), None)
+    if param is not None and param.device.type == "cuda":
+        compute_device = param.device
+    else:
+        compute_device = torch.device("cuda", torch.cuda.current_device())
+    if device_from_device_id is not None and compute_device != device_from_device_id:
+        raise ValueError(
+            f"Inconsistent compute device and `device_id` on rank {rank}: "
+            f"{compute_device} vs {device_from_device_id}"
+        )
+    return compute_device
+
+
+def _sync_module_states(
+    module: nn.Module,
+    params: List[nn.Parameter],
+    process_group: dist.ProcessGroup,
+) -> None:
+    """
+    Synchronizes module states (i.e. parameters ``params`` and all
+    not-yet-synced buffers) by broadcasting from rank 0 to all ranks.
+
+    Precondition: ``sync_module_states == True`` and ``self.process_group`` has
+    been set.
+    """
+    if params and any(param.device == torch.device("cpu") for param in params):
+        raise ValueError(
+            "Module has CPU parameters, but sync_module_states=True is specified."
+            "This only works for GPU module, please specify `device_id` argument or move"
+            " module to GPU before init."
+        )
+    module_states: List[torch.Tensor] = []
+    # TODO (awgu): When exposing the original parameters, we need to also
+    # use this attribute to prevent re-synchronizing parameters.
+    for buffer in module.buffers():
+        # Avoid re-synchronizing buffers in case of nested wrapping
+        if not getattr(buffer, FSDP_SYNCED, False):
+            setattr(buffer, FSDP_SYNCED, True)
+            module_states.append(buffer.detach())
+    module_states.extend(param.detach() for param in params)
+    _sync_params_and_buffers(
+        process_group,
+        module_states,
+        PARAM_BROADCAST_BUCKET_SIZE,
+        src=0,
+    )
+
+
+def _get_orig_params(
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+) -> Iterator[nn.Parameter]:
+    """
+    Returns an iterator over the original parameters in ``module``, ignoring
+    the parameters in ``ignored_params``, any ``FlatParameter`` s (which may be
+    present due to nested FSDP wrapping), and any original parameters already
+    flattened (only relevant when ``use_orig_params=True``).
+    """
+    param_gen = module.parameters()
+    try:
+        while True:
+            param = next(param_gen)
+            if param not in ignored_params and not _is_fsdp_flattened(param):
+                yield param
+    except StopIteration:
+        pass
+
+
+def _check_orig_params_flattened(
+    fsdp_module,
+    ignored_params: Set[nn.Parameter],
+) -> None:
+    """
+    Checks that all original parameters have been flattened and hence made
+    invisible to ``named_parameters()`` for the module hierarchy rooted at
+    ``fsdp_module``. This should be called as a sanity check after flattening
+    the wrapped module's parameters.
+    """
+    for param_name, param in fsdp_module.named_parameters():
+        if param not in ignored_params and not _is_fsdp_flattened(param):
+            raise RuntimeError(
+                f"Found an unflattened parameter: {param_name}; "
+                f"{param.size()} {param.__class__}"
+            )
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index 4e184d2e70c64..9276c3cf62cc7 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -16,6 +16,7 @@
     Shard,
     ShardedTensor,
 )
+from torch.distributed.fsdp._common_utils import clean_tensor_name
 from torch.distributed.utils import _replace_by_prefix
 
 from ._fsdp_extensions import (
@@ -70,7 +71,7 @@ def _full_post_state_dict_hook(
     for fqn, param_name, module_name in module._param_fqns:
         fqn = f"{prefix}{fqn}"
         clean_key = fqn
-        clean_prefix = fsdp_file.clean_tensor_name(prefix)
+        clean_prefix = clean_tensor_name(prefix)
         # Strip prefix out of key if needed as buffer names and param names
         # do not have prefix considered as they are not computed in `state_dict`
         # call.
diff --git a/torch/distributed/fsdp/_utils.py b/torch/distributed/fsdp/_utils.py
index eb72042b65573..cf3755e55c574 100644
--- a/torch/distributed/fsdp/_utils.py
+++ b/torch/distributed/fsdp/_utils.py
@@ -10,8 +10,6 @@
 )
 from torch.nn.utils.rnn import PackedSequence
 
-FSDP_FLATTENED = "_fsdp_flattened"
-
 
 def _contains_batchnorm(module):
     return any(isinstance(mod, _BatchNorm) for mod in module.modules())
@@ -61,33 +59,6 @@ def apply(
     return apply(container)
 
 
-def _apply_to_modules(
-    root_module: torch.nn.Module,
-    module_fn: Callable,
-    return_fn: Callable,
-    *args,
-    **kwargs,
-):
-    """
-    Performs a pre-order traversal of the modules in the hierarchy rooted at
-    ``root_module``, applying ``module_fn`` at each module and finally
-    returning a value using ``return_fn``. The traversal constructs the full
-    module prefix name (e.g. "module.submodule." just like in model state dict)
-    and makes that available to ``module_fn``.
-    """
-
-    def f(module: torch.nn.Module, prefix: str, *args, **kwargs):
-        # Call the module function before recursing over children (pre-order)
-        module_fn(module, prefix, *args, **kwargs)
-        for submodule_name, submodule in module.named_children():
-            if submodule is not None:
-                new_prefix = prefix + submodule_name + "."
-                f(submodule, new_prefix, *args, **kwargs)
-
-    f(root_module, "", *args, **kwargs)
-    return return_fn(*args, **kwargs)
-
-
 @torch.no_grad()
 def _alloc_storage(tensor: torch.Tensor, size: torch.Size) -> bool:
     """
@@ -130,19 +101,6 @@ def _free_storage(tensor: torch.Tensor) -> bool:
     return not already_freed
 
 
-def _set_fsdp_flattened(tensor: torch.Tensor) -> None:
-    """
-    Sets an attribute on ``tensor`` to mark it as flattened by FSDP. This is to
-    avoid re-flattening it during nested construction.
-    """
-    setattr(tensor, FSDP_FLATTENED, True)
-
-
-def _is_fsdp_flattened(tensor: torch.Tensor) -> bool:
-    """Returns if ``tensor`` has been marked as flattened by FSDP."""
-    return getattr(tensor, FSDP_FLATTENED, False)
-
-
 def _same_storage(x: torch.Tensor, y: torch.Tensor) -> bool:
     """Returns if ``x`` and ``y`` share the same storage."""
     # NOTE: CPU and GPU tensors are ensured to have different data pointers.
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 21516e79d87ad..37db0a2d05bf1 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -23,17 +23,14 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
-from torch.distributed.fsdp._common_utils import HandleTrainingState
-
-from ._fsdp_extensions import _ext_post_unflatten_transform, _ext_pre_flatten_transform
-from ._utils import (
-    _alloc_storage,
-    _free_storage,
-    _same_storage,
+from torch.distributed.fsdp._common_utils import (
     _set_fsdp_flattened,
-    p_assert,
+    HandleTrainingState,
 )
 
+from ._fsdp_extensions import _ext_post_unflatten_transform, _ext_pre_flatten_transform
+from ._utils import _alloc_storage, _free_storage, _same_storage, p_assert
+
 __all__ = [
     "FlatParameter",
     "FlatParamHandle",
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index a2c90d667f423..26865027a8e2f 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -38,7 +38,26 @@
 )
 from torch.distributed.algorithms._comm_hooks import default_hooks, LOW_PRECISION_HOOKS
 from torch.distributed.distributed_c10d import _get_default_group
-from torch.distributed.fsdp._common_utils import HandleTrainingState, TrainingState
+from torch.distributed.fsdp._common_utils import (
+    _get_param_to_unflat_param_names,
+    FSDP_PREFIX,
+    FSDP_WRAPPED_MODULE,
+    HandleTrainingState,
+    TrainingState,
+)
+from torch.distributed.fsdp._init_utils import (
+    _check_orig_params_flattened,
+    _check_single_device_module,
+    _get_buffer_names,
+    _get_compute_device,
+    _get_device_from_device_id,
+    _get_ignored_modules,
+    _get_ignored_params,
+    _get_orig_params,
+    _materialize_module,
+    _move_module_to_device,
+    _sync_module_states,
+)
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
     _prepare_forward_inputs,
@@ -50,7 +69,6 @@
     MixedPrecision,
     ShardingStrategy,
 )
-from torch.distributed.utils import _sync_params_and_buffers
 
 from ._optim_utils import (
     _broadcast_pos_dim_tensor_states,
@@ -70,11 +88,9 @@
     _pre_load_state_dict_hook,
 )
 from ._utils import (
-    _apply_to_modules,
     _apply_to_tensors,
     _contains_batchnorm,
     _free_storage,
-    _is_fsdp_flattened,
     _override_batchnorm_mixed_precision,
     p_assert,
 )
@@ -91,12 +107,6 @@
     ParamExecOrderWrapPolicy,
 )
 
-_TORCHDISTX_AVAIL = True
-try:
-    from torchdistx import deferred_init, fake
-except ImportError:
-    _TORCHDISTX_AVAIL = False
-
 _TORCH_FX_AVAIL = True
 if not hasattr(torch, "fx"):
     _TORCH_FX_AVAIL = False
@@ -112,19 +122,11 @@
     "LocalStateDictConfig",
     "ShardedStateDictConfig",
     "OptimStateKeyType",
-    "clean_tensor_name",
 ]
 
 
-# NOTE: `FSDP_WRAPPED_MODULE` cannot be a substring of any other module wrapper
-# name (e.g. for activation checkpointing) since then `replace()`-based FQN
-# cleaning breaks.
-FSDP_WRAPPED_MODULE = "_fsdp_wrapped_module"
-FSDP_PREFIX = FSDP_WRAPPED_MODULE + "."
 FLAT_PARAM = "_flat_param"
 
-_PARAM_BROADCAST_BUCKET_SIZE = int(250 * 1024 * 1024)
-
 
 class StateDictType(Enum):
     """
@@ -875,11 +877,11 @@ def __init__(
         torch._C._log_api_usage_once("torch.distributed.fsdp")
         super().__init__()
 
-        self._ignored_modules = self._get_ignored_modules(module, ignored_modules)
-        ignored_params, self._ignored_param_names = self._get_ignored_params(
+        self._ignored_modules = _get_ignored_modules(module, ignored_modules)
+        ignored_params, self._ignored_param_names = _get_ignored_params(
             module, self._ignored_modules
         )
-        self._buffer_names = self._get_buffer_names(module)
+        self._buffer_names = _get_buffer_names(module)
         if auto_wrap_policy is not None:
             auto_wrap_kwargs = {
                 "module": module,
@@ -927,20 +929,25 @@ def __init__(
         # dtype for model checkpointing
         self._buffer_name_to_orig_dtype: Dict[str, torch.dtype] = {}
 
-        self._check_single_device_module(module, ignored_params)
-        device_from_device_id: Optional[torch.device] = self._get_device_from_device_id(
-            device_id
+        _check_single_device_module(module, ignored_params)
+        device_from_device_id = _get_device_from_device_id(device_id, self.rank)
+        _materialize_module(
+            module,
+            param_init_fn,
+            ignored_params,
+            device_from_device_id,
+            lambda k: not isinstance(k, FullyShardedDataParallel),
         )
-        self._materialize_module(
-            module, param_init_fn, ignored_params, device_from_device_id
+        _move_module_to_device(module, ignored_params, device_from_device_id)
+        self.compute_device = _get_compute_device(
+            module,
+            ignored_params,
+            device_from_device_id,
+            self.rank,
         )
-        self._move_module_to_device(module, ignored_params, device_from_device_id)
-        self.compute_device = self._get_compute_device(
-            module, ignored_params, device_from_device_id
-        )
-        params_to_flatten = list(self._get_orig_params(module, ignored_params))
+        params_to_flatten = list(_get_orig_params(module, ignored_params))
         if sync_module_states:
-            self._sync_module_states(module, params_to_flatten)
+            _sync_module_states(module, params_to_flatten, self.process_group)
 
         # This FSDP instance's handles should inherit the same process group,
         # compute device, CPU offload, and mixed precision settings. However,
@@ -976,7 +983,7 @@ def __init__(
             ):
                 handle.flat_param_to(torch.device("cpu"))
         if not use_orig_params:
-            self._check_orig_params_flattened(ignored_params)
+            _check_orig_params_flattened(self, ignored_params)
             self._register_flat_param()
 
         self._sync_gradients = True
@@ -1015,103 +1022,6 @@ def __init__(
         )
         self.register_load_state_dict_post_hook(_post_load_state_dict_hook)
 
-    def _get_ignored_modules(
-        self,
-        root_module: nn.Module,
-        _ignored_modules: Optional[Iterable[torch.nn.Module]],
-    ) -> Set[nn.Module]:
-        """
-        Checks that ``_ignored_modules`` is an iterable of ``nn.Module`` s
-        without any FSDP instances, and returns the modules contained in their
-        module subtrees as a :class:`set`. Nested FSDP instances are excluded,
-        but their already-computed ignored modules are included.
-        """
-        if _ignored_modules is None:
-            return set()
-        msg_prefix = "`ignored_modules` should be an iterable of `torch.nn.Module`s "
-        try:
-            ignored_root_modules = set(_ignored_modules)
-        except TypeError:
-            raise TypeError(msg_prefix + f"but got {type(_ignored_modules)}")
-        for module in ignored_root_modules:
-            if not isinstance(module, torch.nn.Module):
-                raise TypeError(msg_prefix + f"but got an iterable with {type(module)}")
-            if isinstance(module, FullyShardedDataParallel):
-                raise ValueError("`ignored_modules` should not include FSDP modules")
-        # Include child modules and exclude nested FSDP modules themselves
-        ignored_modules = set(
-            child
-            for module in ignored_root_modules
-            for child in module.modules()
-            if not isinstance(child, FullyShardedDataParallel)
-        )
-        if root_module in ignored_modules:
-            warnings.warn(
-                "Trying to ignore the top-level module passed into the FSDP "
-                "constructor itself will result in all parameters being "
-                f"ignored and is not well-supported: {module}"
-            )
-        # Include nested FSDP modules' ignored modules
-        for submodule in root_module.modules():
-            if isinstance(submodule, FullyShardedDataParallel):
-                assert hasattr(submodule, "_ignored_modules")
-                ignored_modules.update(submodule._ignored_modules)
-        return ignored_modules
-
-    def _get_ignored_params(
-        self,
-        root_module: torch.nn.Module,
-        ignored_modules: Set[torch.nn.Module],
-    ) -> Tuple[Set[torch.nn.Parameter], Set[str]]:
-        """
-        Returns the parameters of the modules in ``ignored_modules``,
-        excluding any :class:`FlatParameter` s, and their fully prefixed names,
-        both as :class:`set` s.
-        """
-        ignored_params = set(
-            p
-            for m in ignored_modules
-            for p in m.parameters()
-            if not _is_fsdp_flattened(p)
-        )
-        # Conservatively include all shared parameters' names
-        param_to_unflat_param_names = _get_param_to_unflat_param_names(
-            root_module,
-            dedup_shared_params=False,
-        )
-        ignored_param_names = set()
-        for param in ignored_params:
-            unflat_param_names = param_to_unflat_param_names[param]
-            clean_names = []
-            for k in unflat_param_names:
-                # Clean any module wrapper prefixes in case of nested wrapping
-                clean_names.append(clean_tensor_name(k))
-            ignored_param_names.update(clean_names)
-        return ignored_params, ignored_param_names
-
-    def _get_buffer_names(self, root_module: nn.Module) -> Set[str]:
-        """
-        Returns the fully prefixed names of all buffers in the module hierarchy
-        rooted at ``root_module`` as a class:`set`.
-        """
-
-        def module_fn(module: nn.Module, prefix: str, buffer_names: Set[str]):
-            for buffer_name, _ in module.named_buffers(recurse=False):
-                # Clean module wrapper prefixes in case of nested wrapping
-                prefixed_buffer_name = clean_tensor_name(prefix + buffer_name)
-                buffer_names.add(prefixed_buffer_name)
-
-        def return_fn(buffer_names: Set[str], *args):
-            return buffer_names
-
-        buffer_names: Set[str] = set()
-        return _apply_to_modules(
-            root_module,
-            module_fn,
-            return_fn,
-            buffer_names,
-        )
-
     def _auto_wrap(
         self,
         auto_wrap_kwargs: Dict[str, Any],
@@ -1153,256 +1063,6 @@ def _auto_wrap(
             auto_wrap_kwargs["auto_wrap_policy"] = auto_wrap_policy
         _recursive_wrap(**auto_wrap_kwargs, **fsdp_kwargs)
 
-    def _check_single_device_module(
-        self,
-        module: nn.Module,
-        ignored_params: Set[nn.Parameter],
-    ) -> None:
-        """
-        Raises an error if ``module`` has original parameters on multiple
-        devices, ignoring the parameters in ``ignored_params``. Thus, after
-        this method, the module must be either fully on the CPU or fully on a
-        non-CPU device.
-        """
-        devices = set(
-            param.device for param in self._get_orig_params(module, ignored_params)
-        )
-        if len(devices) > 1:
-            raise RuntimeError(
-                f"FSDP only supports single device modules but got params on {devices}"
-            )
-
-    def _get_device_from_device_id(
-        self,
-        device_id: Optional[Union[int, torch.device]],
-    ) -> Optional[torch.device]:
-        """ """
-        if device_id is None:
-            return None
-        device = (
-            device_id
-            if isinstance(device_id, torch.device)
-            else torch.device(device_id)
-        )
-        if device == torch.device("cuda"):
-            warnings.warn(
-                f"FSDP got the argument `device_id` {device_id} on rank "
-                f"{self.rank}, which does not have an explicit index. "
-                f"FSDP will use the current device {torch.cuda.current_device()}. "
-                "If this is incorrect, please explicitly call `torch.cuda.set_device()` "
-                "before FSDP initialization or pass in the explicit device "
-                "index as the `device_id` argument."
-            )
-            device = torch.device("cuda", torch.cuda.current_device())
-        return device
-
-    def _materialize_module(
-        self,
-        module: nn.Module,
-        param_init_fn: Optional[Callable[[nn.Module], None]],
-        ignored_params: Set[nn.Parameter],
-        device_from_device_id: Optional[torch.device],
-    ) -> None:
-        """
-        Materializes the wrapped module ``module`` in place if needed: either
-        if the module has parameters that use meta device or are torchdistX
-        fake tensors.
-
-        This method uses ``param_init_fn`` to materialize the module if the
-        function is not ``None`` and falls back to default behavior otherwise.
-        For meta device, this moves the module to ``device_from_device_id`` if
-        it is not ``None`` or the current device otherwise and calls
-        ``reset_parameters()``, and for torchdistX fake tensors, this calls
-        ``deferred_init.materialize_module()``.
-        """
-        is_meta_module = any(
-            p.is_meta for p in self._get_orig_params(module, ignored_params)
-        )
-        is_torchdistX_deferred_init = (
-            not is_meta_module
-            and _TORCHDISTX_AVAIL
-            and any(
-                fake.is_fake(p) for p in self._get_orig_params(module, ignored_params)
-            )
-        )
-        if (
-            is_meta_module or is_torchdistX_deferred_init
-        ) and param_init_fn is not None:
-            if not callable(param_init_fn):
-                raise ValueError(
-                    f"Expected {param_init_fn} to be callable but got {type(param_init_fn)}"
-                )
-            param_init_fn(module)
-        elif is_meta_module:
-            # Run default meta device initialization
-            materialization_device = (
-                device_from_device_id or torch.cuda.current_device()
-            )
-            module.to_empty(device=materialization_device)
-            try:
-                with torch.no_grad():
-                    module.reset_parameters()
-            except BaseException as e:
-                warnings.warn(
-                    "Unable to call `reset_parameters()` for module on meta "
-                    f"device with error {str(e)}. Please ensure your "
-                    "module implements a `reset_parameters()` method."
-                )
-                raise e
-        elif is_torchdistX_deferred_init:
-            # Run default torchdistX initialization
-            deferred_init.materialize_module(
-                module,
-                check_fn=lambda k: not isinstance(k, FullyShardedDataParallel),
-            )
-
-    def _move_module_to_device(
-        self,
-        module: nn.Module,
-        ignored_params: Set[nn.Parameter],
-        device_from_device_id: Optional[torch.device],
-    ):
-        """
-        Moves ``module`` depending on ``device_from_device_id`` and its current
-        device. This includes moving ignored modules' parameters.
-
-        - If ``device_from_device_id`` is not ``None``, then this moves
-        ``module`` to the device.
-        - If ``device_from_device_id`` is ``None``, then this does not move
-        ``module`` but warns the user if it is on CPU.
-
-        Precondition: ``_check_single_device_module()``.
-        """
-        cpu_device = torch.device("cpu")
-        param = next(self._get_orig_params(module, ignored_params), None)
-        if param is None:
-            return  # no original parameters to manage
-        if device_from_device_id is not None:
-            if param.device == cpu_device:
-                # NOTE: This includes moving ignored modules' parameters.
-                module = module.to(device_from_device_id)
-                # TODO: This is a temporary fix to move already- constructed
-                # `FlatParameter`s back to CPU if needed. This is needed to
-                # make CPU offload work with `device_id`.
-                for submodule in module.modules():
-                    if (
-                        isinstance(submodule, FullyShardedDataParallel)
-                        and submodule.cpu_offload.offload_params
-                    ):
-                        with torch.no_grad():
-                            for handle in submodule._handles:
-                                handle.flat_param_to(torch.device("cpu"))
-        elif param.device == cpu_device:
-            warnings.warn(
-                "Module is put on CPU and will thus have flattening and sharding"
-                " run on CPU, which is less efficient than on GPU. We recommend passing in "
-                "`device_id` argument which will enable FSDP to put module on GPU device,"
-                " module must also be on GPU device to work with `sync_module_states=True` flag"
-                " which requires GPU communication."
-            )
-
-    def _get_compute_device(
-        self,
-        module: nn.Module,
-        ignored_params: Set[nn.Parameter],
-        device_from_device_id: Optional[torch.device],
-    ) -> torch.device:
-        """
-        Determines and returns this FSDP instance's compute device. If the
-        module is already on a non-CPU device, then the compute device is that
-        non-CPU device. If the module is on CPU, then the compute device is the
-        current device.
-
-        Since this method should be called after materializing the module, any
-        non-CPU device should not be meta device. For now, the compute device
-        is always a CUDA GPU device with its explicit index.
-
-        Precondition: ``_check_single_device_module()`` and
-        ``_move_module_to_device()``.
-        """
-        # If the module is on GPU already, then that GPU device has priority
-        # over the current device
-        param = next(self._get_orig_params(module, ignored_params), None)
-        if param is not None and param.device.type == "cuda":
-            compute_device = param.device
-        else:
-            compute_device = torch.device("cuda", torch.cuda.current_device())
-        if (
-            device_from_device_id is not None
-            and compute_device != device_from_device_id
-        ):
-            raise ValueError(
-                "Inconsistent compute device and `device_id` on rank "
-                f"{self.rank}: {compute_device} vs {device_from_device_id}"
-            )
-        return compute_device
-
-    def _sync_module_states(
-        self, module: nn.Module, params: List[nn.Parameter]
-    ) -> None:
-        """
-        Synchronizes module states (i.e. parameters ``params`` and all
-        not-yet-synced buffers) by broadcasting from rank 0 to all ranks.
-
-        Precondition: ``sync_module_states == True`` and ``self.process_group``
-        has been set.
-        """
-        if params and any(param.device == torch.device("cpu") for param in params):
-            raise ValueError(
-                "Module has CPU parameters, but sync_module_states=True is specified."
-                "This only works for GPU module, please specify `device_id` argument or move"
-                " module to GPU before init."
-            )
-        module_states: List[torch.Tensor] = []
-        # TODO (awgu): When exposing the original parameters, we need to also
-        # use this attribute to prevent re-synchronizing parameters.
-        for buffer in module.buffers():
-            # Avoid re-synchronizing buffers in case of nested wrapping
-            if not getattr(buffer, "_fsdp_synced", False):
-                buffer._fsdp_synced = True
-                module_states.append(buffer.detach())
-        module_states.extend(param.detach() for param in params)
-        _sync_params_and_buffers(
-            self.process_group,
-            module_states,
-            _PARAM_BROADCAST_BUCKET_SIZE,
-            src=0,
-        )
-
-    def _get_orig_params(
-        self,
-        module: nn.Module,
-        ignored_params: Set[nn.Parameter],
-    ) -> Iterator[nn.Parameter]:
-        """
-        Returns an iterator over the original parameters in ``module``,
-        ignoring the parameters in ``ignored_params``, any ``FlatParameter``
-        s (which may be present due to nested FSDP wrapping), and any original
-        parameters already flattened (only relevant when using the original
-        parameters).
-        """
-        param_gen = module.parameters()
-        try:
-            while True:
-                param = next(param_gen)
-                if param not in ignored_params and not _is_fsdp_flattened(param):
-                    yield param
-        except StopIteration:
-            pass
-
-    def _check_orig_params_flattened(self, ignored_params: Set[nn.Parameter]) -> None:
-        """
-        Checks that all original parameters have been flattened and hence made
-        invisible to ``named_parameters()``. This should be called as a sanity
-        check after flattening the wrapped module's parameters.
-        """
-        for param_name, param in self.named_parameters():
-            if param not in ignored_params and not _is_fsdp_flattened(param):
-                raise RuntimeError(
-                    f"Found an unflattened parameter: {param_name}; "
-                    f"{param.size()} {param.__class__}"
-                )
-
     def _register_param_handle(self, handle: FlatParamHandle) -> None:
         """Registers the parameter handle to this FSDP instance."""
         if handle not in self._handles:
@@ -4103,56 +3763,6 @@ def _get_grad_norm(
     return grad_norm
 
 
-def _get_param_to_unflat_param_names(
-    model: torch.nn.Module,
-    dedup_shared_params: bool = True,
-) -> Dict[torch.nn.Parameter, List[str]]:
-    """
-    Constructs a mapping from flattened parameter (including non-FSDP-module
-    parameters) to its unflattened parameter names. For non-FSDP-module
-    parameters, these mapped-to lists always contain a single element. The
-    unflattened parameter names should match the keys of the model state dict.
-
-    For shared parameters, only the first parameter name is included (following
-    the ``torch.nn.Module.parameters()`` order).
-
-    Args:
-        model (torch.nn.Module): Root module (which may or may not be a
-            :class:`FullyShardedDataParallel` instance).
-        dedup_shared_params (bool): If ``True``, only includes the first
-            list of unflattened parameter names corresponding to a parameter
-            in the module walk order; if ``False``, then includes all of the
-            unflattened parameter names.
-    """
-
-    def module_fn(module, prefix, param_to_unflat_param_names):
-        for param_name, param in module.named_parameters(recurse=False):
-            module_prefixed_param_names = (
-                param._fqns if type(param) is FlatParameter else [param_name]
-            )  # prefixed from `module`
-            fully_prefixed_param_names = [
-                clean_tensor_name(prefix + name) for name in module_prefixed_param_names
-            ]  # fully prefixed from the top level including `prefix`
-            # If this parameter has already been visited, then it is a
-            # shared parameter; then, only take the first parameter name
-            is_shared_param = param in param_to_unflat_param_names
-            if not is_shared_param:
-                param_to_unflat_param_names[param] = fully_prefixed_param_names
-            elif not dedup_shared_params:
-                param_to_unflat_param_names[param].extend(fully_prefixed_param_names)
-
-    def return_fn(param_to_unflat_param_names):
-        return param_to_unflat_param_names
-
-    param_to_unflat_param_names: Dict[torch.nn.Parameter, List[str]] = {}
-    return _apply_to_modules(
-        model,
-        module_fn,
-        return_fn,
-        param_to_unflat_param_names,
-    )
-
-
 def _get_param_to_param_name(
     model: torch.nn.Module,
 ) -> Dict[torch.nn.Parameter, str]:
@@ -4190,15 +3800,3 @@ def _get_param_name_to_param(
     """Constructs the inverse mapping of :meth:`_get_param_to_param_name`."""
     param_to_param_name = _get_param_to_param_name(model)
     return dict(zip(param_to_param_name.values(), param_to_param_name.keys()))
-
-
-def clean_tensor_name(tensor_name: str) -> str:
-    """Cleans the parameter or buffer name by removing any module wrapper
-    prefixes."""
-    tensor_name = tensor_name.replace(FSDP_PREFIX, "")
-    # TODO: Explicitly replacing checkpoint_wrapper prefix is not ideal,
-    # as it increases coupling between CheckpointWrapper and FSDP. This is also not
-    # scalable for additional wrapped modules, we should come up with a general solution
-    # for this issue.
-    tensor_name = tensor_name.replace(_CHECKPOINT_PREFIX, "")
-    return tensor_name

From 2377b4cdea32deb8c8cea7c756e4fcae2a60a723 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 31 Oct 2022 01:43:05 +0000
Subject: [PATCH 0343/1922] [FSDP()][8/N] Refactor limiter's `_FreeEventQueue`
 (#87922)

This PR is easy. It just moves `_FreeEventQueue` into its own file `_limiter_utils.py`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87922
Approved by: https://github.com/rohan-varma, https://github.com/mrshenli
---
 torch/distributed/fsdp/_limiter_utils.py      | 33 +++++++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 32 +-----------------
 2 files changed, 34 insertions(+), 31 deletions(-)
 create mode 100644 torch/distributed/fsdp/_limiter_utils.py

diff --git a/torch/distributed/fsdp/_limiter_utils.py b/torch/distributed/fsdp/_limiter_utils.py
new file mode 100644
index 0000000000000..efb5b3ba5ae1f
--- /dev/null
+++ b/torch/distributed/fsdp/_limiter_utils.py
@@ -0,0 +1,33 @@
+import collections
+from typing import Deque, Optional
+
+import torch
+
+
+class _FreeEventQueue:
+    """
+    This tracks all pending frees corresponding to inflight all-gathers. The
+    queueing pattern is iterative enqueues with a single dequeue per iteration
+    once the limit ``_max_num_inflight_all_gathers`` is reached.
+    """
+
+    def __init__(self) -> None:
+        self._queue: Deque[torch.cuda.Event] = collections.deque()
+        self._max_num_inflight_all_gathers = 2  # empirically chosen
+
+    def enqueue(self, free_event: torch.cuda.Event) -> None:
+        """Enqueues a free event."""
+        self._queue.append(free_event)
+
+    def dequeue_if_needed(self) -> Optional[torch.cuda.Event]:
+        """Dequeues a single event if the limit is reached."""
+        if len(self._queue) >= self._max_num_inflight_all_gathers:
+            return self._dequeue()
+        return None
+
+    def _dequeue(self) -> Optional[torch.cuda.Event]:
+        """Dequeues a free event if possible."""
+        if self._queue:
+            event = self._queue.popleft()
+            return event
+        return None
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 26865027a8e2f..2608f76e311af 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1,4 +1,3 @@
-import collections
 import contextlib
 import copy
 import functools
@@ -13,7 +12,6 @@
     Any,
     Callable,
     cast,
-    Deque,
     Dict,
     Generator,
     Iterable,
@@ -58,6 +56,7 @@
     _move_module_to_device,
     _sync_module_states,
 )
+from torch.distributed.fsdp._limiter_utils import _FreeEventQueue
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
     _prepare_forward_inputs,
@@ -585,35 +584,6 @@ def next_iter(self):
                 self.warn_status = _ExecOrderWarnStatus.WARNED
 
 
-class _FreeEventQueue:
-    """
-    This tracks all pending frees corresponding to inflight all-gathers. The
-    queueing pattern is iterative enqueues with a single dequeue per iteration
-    once the limit ``_max_num_inflight_all_gathers`` is reached.
-    """
-
-    def __init__(self) -> None:
-        self._queue: Deque[torch.cuda.Event] = collections.deque()
-        self._max_num_inflight_all_gathers = 2  # empirically chosen
-
-    def enqueue(self, free_event: torch.cuda.Event) -> None:
-        """Enqueues a free event."""
-        self._queue.append(free_event)
-
-    def dequeue_if_needed(self) -> Optional[torch.cuda.Event]:
-        """Dequeues a single event if the limit is reached."""
-        if len(self._queue) >= self._max_num_inflight_all_gathers:
-            return self._dequeue()
-        return None
-
-    def _dequeue(self) -> Optional[torch.cuda.Event]:
-        """Dequeues a free event if possible."""
-        if self._queue:
-            event = self._queue.popleft()
-            return event
-        return None
-
-
 # TODO (awgu): Refactor this later
 sharding_strategy_map = {
     ShardingStrategy.NO_SHARD: HandleShardingStrategy.NO_SHARD,

From b87712a62584330b48d9f1ff1b326bf418f389ee Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Mon, 31 Oct 2022 03:24:48 +0000
Subject: [PATCH 0344/1922] Fix meta function for aten.flip and aten.rot90
 (#88065)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88065
Approved by: https://github.com/mruberry
---
 test/test_meta.py        | 2 --
 torch/_prims/__init__.py | 3 ++-
 torch/_refs/__init__.py  | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index 1abe4cd2cda75..c46fc827e1661 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -219,7 +219,6 @@ def test_tensor_outlives_converter(self):
     aten.div.Tensor_mode,
     aten.div.Tensor,
     aten.eq.Tensor,
-    aten.flip.default,
     aten.floor_divide.default,
     aten.fmax.default,
     aten.fmin.default,
@@ -248,7 +247,6 @@ def test_tensor_outlives_converter(self):
     aten.pow.Tensor_Tensor,
     aten.prelu.default,
     aten.remainder.Tensor,
-    aten.rot90.default,
     aten.rsub.Tensor,
     aten.special_xlog1py.default,
     aten.special_zeta.default,
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index bf71b0069585e..6d40e1071fb53 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1879,7 +1879,8 @@ def _reshape_aten(a: Tensor, shape: ShapeType) -> Tensor:
 
 def _rev_meta(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType:
     utils.validate_dimension_indices(a.ndim, dims)
-    return TensorMeta(a)
+    out = torch.empty_like(a, memory_format=torch.preserve_format)
+    return TensorMeta(out)
 
 
 _rev_doc = """
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index ba02416dd24e8..64aa879a0c35a 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -3136,7 +3136,7 @@ def rot90(
     elif k == 3:
         return torch.transpose(torch.flip(a, (dims[0],)), dims[0], dims[1])
     else:
-        return clone(a)
+        return clone(a, memory_format=torch.contiguous_format)
 
 
 def _check_stack_inputs(tensors: TensorSequenceType) -> None:

From a9e233b491780b27d2eb819cac501031b1f4c111 Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Mon, 31 Oct 2022 16:52:28 +0000
Subject: [PATCH 0345/1922] Move check labels to separate workflow (#87999)

* moves check labels to separate workflow that is triggered on the usual pull_request triggers as well as labeled and unlabeled
* deletes comments when label is added

Fixes https://github.com/pytorch/test-infra/issues/978 and https://github.com/pytorch/pytorch/issues/87865
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87999
Approved by: https://github.com/huydhn
---
 .github/scripts/check_labels.py    | 25 +++++++++--------
 .github/scripts/gql_mocks.json     | 38 ++++++++++++++++++++++++++
 .github/workflows/check-labels.yml | 44 ++++++++++++++++++++++++++++++
 .github/workflows/lint.yml         | 34 -----------------------
 4 files changed, 96 insertions(+), 45 deletions(-)
 create mode 100644 .github/workflows/check-labels.yml

diff --git a/.github/scripts/check_labels.py b/.github/scripts/check_labels.py
index ff40a94ee6fec..2d4a216daf942 100755
--- a/.github/scripts/check_labels.py
+++ b/.github/scripts/check_labels.py
@@ -2,7 +2,6 @@
 """check_labels.py"""
 
 from typing import Any, List
-from datetime import datetime, timedelta
 
 from export_pytorch_labels import get_pytorch_labels
 from gitutils import (
@@ -39,22 +38,24 @@ def delete_comment(comment_id: int) -> None:
 
 def has_required_labels(pr: GitHubPR) -> bool:
     pr_labels = pr.get_labels()
-
     # Check if PR is not user facing
     is_not_user_facing_pr = any(label.strip() == "topic: not user facing" for label in pr_labels)
-    if is_not_user_facing_pr:
-        return True
+    return is_not_user_facing_pr or any(label.strip() in get_release_notes_labels() for label in pr_labels)
+
 
-    # Check if bot has already posted a message within the past hour to include a release notes label
+def delete_comments(pr: GitHubPR) -> None:
+    # Delete all previous comments
     for comment in pr.get_comments():
         if comment.body_text.lstrip(" #").startswith(ERR_MSG_TITLE) and comment.author_login in BOT_AUTHORS:
-            ts = datetime.strptime(comment.created_at, "%Y-%m-%dT%H:%M:%SZ")
-            if (datetime.utcnow() - ts) < timedelta(hours=1):
-                return True
             delete_comment(comment.database_id)
-            break
 
-    return any(label.strip() in get_release_notes_labels() for label in pr_labels)
+
+def add_comment(pr: GitHubPR) -> None:
+    # Only make a comment if one doesn't exist already
+    for comment in pr.get_comments():
+        if comment.body_text.lstrip(" #").startswith(ERR_MSG_TITLE) and comment.author_login in BOT_AUTHORS:
+            return
+    gh_post_pr_comment(pr.org, pr.project, pr.pr_num, ERR_MSG)
 
 
 def parse_args() -> Any:
@@ -74,8 +75,10 @@ def main() -> None:
     try:
         if not has_required_labels(pr):
             print(ERR_MSG)
-            gh_post_pr_comment(pr.org, pr.project, pr.pr_num, ERR_MSG)
+            add_comment(pr)
             exit(1)
+        else:
+            delete_comments(pr)
     except Exception as e:
         pass
 
diff --git a/.github/scripts/gql_mocks.json b/.github/scripts/gql_mocks.json
index 164c1ac147e5b..7f6dbc05d3415 100644
--- a/.github/scripts/gql_mocks.json
+++ b/.github/scripts/gql_mocks.json
@@ -35866,5 +35866,43 @@
         }
       }
     }
+  },
+  "query_sha=2e2877d2452c4f233f042b7ccd50ab9c2a6e9a73d8819a0c876203c12364e8a3 cursor=Y3Vyc29yOnYyOpHOQebHmg== name=pytorch number=75095 owner=pytorch": {
+    "data": {
+      "repository": {
+        "pullRequest": {
+          "comments": {
+            "nodes": [
+              {
+                "bodyText": "\ud83d\udd17 Helpful links\n\n\ud83e\uddea \u00a0See artifacts and rendered test results at hud.pytorch.org/pr/75095\n\ud83d\udcc4 \u00a0Preview Python docs built from this PR\n\ud83d\udcc4 \u00a0Preview C++ docs built from this PR\n\u2753Need help or want to give feedback on the CI? Visit our office hours\n\n\ud83d\udc8a CI failures summary and remediations\nAs of commit db355d5 (more details on the Dr. CI page):\nExpand to see more\n\n\ud83d\udc9a \ud83d\udc9a Looks good so far! There are no failures yet. \ud83d\udc9a \ud83d\udc9a\n\nThis comment was automatically generated by Dr. CI (expand for details).\nPlease report bugs/suggestions to the (internal) Dr. CI Users group.\nClick here  to manually regenerate this comment.",
+                "createdAt": "2022-04-01T08:49:06Z",
+                "author": {
+                  "login": "facebook-github-bot"
+                },
+                "authorAssociation": "MEMBER",
+                "editor": {
+                  "login": "facebook-github-bot"
+                },
+                "databaseId": 1085625658
+              },
+              {
+                "bodyText": "High level question: how do we plan to validate that our ref implementations are compatible with somewhat-symbolic shapes? There are multiple ways to write the shape processing logic to be compatible vs not, it'd be good to catch such instances early. Does it make sense to throw in some proxy objects (that have state of 0,1,N) in tests early on? (maybe in a follow up PR). Otherwise it's not clear to me that squeeze/broadcast/etc are the right set of primitives for symbolic shapes",
+                "createdAt": "2022-04-21T18:51:24Z",
+                "author": {
+                  "login": "dzhulgakov"
+                },
+                "authorAssociation": "COLLABORATOR",
+                "editor": null,
+                "databaseId": 1105634766
+              }
+            ],
+            "pageInfo": {
+              "startCursor": "Y3Vyc29yOnYyOpHOQLVVOg==",
+              "hasPreviousPage": false
+            }
+          }
+        }
+      }
+    }
   }
 }
diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml
new file mode 100644
index 0000000000000..5fa5fed16daf8
--- /dev/null
+++ b/.github/workflows/check-labels.yml
@@ -0,0 +1,44 @@
+name: Check Labels
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, labeled, unlabeled]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  check-labels:
+    name: Check labels
+    runs-on: linux.20_04.4x
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        with:
+          submodules: false
+          fetch-depth: 1
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.8'
+          architecture: x64
+          check-latest: false
+          cache: pip
+          cache-dependency-path: |
+            **/.github/requirements-gha-cache.txt
+
+      - name: Install requirements
+        id: requirements
+        run: |
+          pip install -r .github/requirements-gha-cache.txt --user
+
+      - name: Check labels
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUM: ${{ github.event.number }}
+        run: |
+          set -ex
+          python3 .github/scripts/check_labels.py "${PR_NUM}"
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index cff22d72d4d24..1f47e1defc2fc 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -226,40 +226,6 @@ jobs:
             false
           fi
 
-  check-labels:
-    name: Check labels
-    runs-on: linux.20_04.16x
-    if: github.event_name == 'pull_request'
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
-
-      - name: Install requirements
-        id: requirements
-        run: |
-          pip install -r .github/requirements-gha-cache.txt --user
-
-      - name: Check labels
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          PR_NUM: ${{ github.event.number }}
-        run: |
-          set -ex
-          python3 .github/scripts/check_labels.py "${PR_NUM}"
-
   test-tools:
     name: Test tools
     if: ${{ github.repository == 'pytorch/pytorch' }}

From 6652b9abc9976fbe49f40522a3387af2ed156685 Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Mon, 31 Oct 2022 16:52:56 +0000
Subject: [PATCH 0346/1922] remove old label check functionality (#88007)

no longer needed as we have check_labels.py to check if the pr has labels and it blocks merge
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88007
Approved by: https://github.com/huydhn, https://github.com/malfet, https://github.com/ZainRizvi
---
 .github/scripts/process_commit.py | 106 ------------------------------
 .github/workflows/pr-labels.yml   |  38 -----------
 2 files changed, 144 deletions(-)
 delete mode 100644 .github/scripts/process_commit.py
 delete mode 100644 .github/workflows/pr-labels.yml

diff --git a/.github/scripts/process_commit.py b/.github/scripts/process_commit.py
deleted file mode 100644
index 358f9012c92fd..0000000000000
--- a/.github/scripts/process_commit.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-"""
-This script finds the user/pr creator responsible for labeling a PR by a commit SHA. It is used by the workflow in
-'.github/workflows/pr-labels.yml'. If there exists no PR associated with the commit or the PR is properly labeled,
-this script is a no-op.
-
-Note: we ping the user only, not the reviewers, as the reviewers can sometimes be external to pytorch
-with no labeling responsibility, so we don't want to bother them.
-This script is based on: https://github.com/pytorch/vision/blob/main/.github/process_commit.py
-"""
-
-import sys
-from typing import Any, Set, Tuple, List
-import re
-import os
-import json
-import requests
-
-# For a PR to be properly labeled it should have release notes label and one topic label
-PULL_REQUEST_EXP = "Pull Request resolved:.*pull/(.*)"
-PRIMARY_LABEL_FILTER = "release notes:"
-SECONDARY_LABELS = {
-    "topic: bc_breaking",
-    "topic: deprecation",
-    "topic: new feature",
-    "topic: improvements",
-    "topic: bug fixes",
-    "topic: performance",
-    "topic: documentation",
-    "topic: developer feature",
-    "topic: not user facing",
-}
-# This secondary does not require a primary
-ALLOWED_ONLY_SECONDARY = {"topic: not user facing"}
-PYTORCH_REPO = "https://api.github.com/repos/pytorch/pytorch"
-GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
-REQUEST_HEADERS = {'Accept': 'application/vnd.github.v3+json', 'Authorization': f'token {GITHUB_TOKEN}'}
-
-
-def query_pytorch(cmd: str) -> Any:
-    response = requests.get(f"{PYTORCH_REPO}/{cmd}", headers=REQUEST_HEADERS)
-    return response.json()
-
-
-def get_pr_number(commit_hash: str) -> Any:
-    data = query_pytorch(f"commits/{commit_hash}")
-    if not data or (not data["commit"]["message"]):
-        return None
-    message = data["commit"]["message"]
-    p = re.compile(PULL_REQUEST_EXP)
-    result = p.search(message)
-    if not result:
-        return None
-    return result.group(1)
-
-
-def get_pr_author_and_labels(pr_number: int) -> Tuple[str, Set[str]]:
-    # See https://docs.github.com/en/rest/reference/pulls#get-a-pull-request
-    data = query_pytorch(f"pulls/{pr_number}")
-    user = data["user"]["login"]
-    labels = {label["name"] for label in data["labels"]}
-    return user, labels
-
-def get_repo_labels() -> List[str]:
-    collected_labels: List[str] = list()
-    for page in range(0, 10):
-        response = query_pytorch(f"labels?per_page=100&page={page}")
-        page_labels = list(map(lambda x: str(x["name"]), response))
-        if not page_labels:
-            break
-        collected_labels += page_labels
-    return collected_labels
-
-def post_pytorch_comment(pr_number: int, merger: str) -> Any:
-    message = {'body' : f"Hey @{merger}." + """
-You've committed this PR, but it does not have both a 'release notes: ...' and 'topics: ...' label. \
-Please add one of each to the PR. The 'release notes: ...' label should represent the part of \
-PyTorch that this PR changes (fx, autograd, distributed, etc) and the 'topics: ...' label should \
-represent the kind of PR it is (not user facing, new feature, bug fix, perf improvement, etc). \
-The list of valid labels can be found [here](https://github.com/pytorch/pytorch/labels?q=release+notes) \
-for the 'release notes: ...' and [here](https://github.com/pytorch/pytorch/labels?q=topic) for the \
-'topics: ...'.
-For changes that are 'topic: not user facing' there is no need for a release notes label."""}
-
-    response = requests.post(
-        f"{PYTORCH_REPO}/issues/{pr_number}/comments",
-        json.dumps(message),
-        headers=REQUEST_HEADERS)
-    return response.json()
-
-if __name__ == "__main__":
-    commit_hash = sys.argv[1]
-    pr_number = get_pr_number(commit_hash)
-
-    if not pr_number:
-        sys.exit(0)
-
-    user, labels = get_pr_author_and_labels(pr_number)
-    repo_labels = get_repo_labels()
-
-    primary_labels = set(filter(lambda x: x.startswith(PRIMARY_LABEL_FILTER), repo_labels))
-    has_both_labels = bool(primary_labels.intersection(labels)) and bool(SECONDARY_LABELS.intersection(labels))
-    is_properly_labeled = has_both_labels or bool(ALLOWED_ONLY_SECONDARY.intersection(labels))
-
-    if not is_properly_labeled:
-        post_pytorch_comment(pr_number, user)
diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml
deleted file mode 100644
index 9afa0e721ac60..0000000000000
--- a/.github/workflows/pr-labels.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-name: pr-labels
-
-on:
-  push:
-    branches:
-      - master
-      - main
-
-jobs:
-  is-properly-labeled:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v3
-
-      - name: Set up python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
-
-      - name: Install requests
-        run: pip install requests==2.26
-
-      - name: Process commit and find merger responsible for labeling
-        id: commit
-        env:
-          SHA1: ${{ github.sha }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: python .github/scripts/process_commit.py "${SHA1}"
-
-concurrency:
-  group: pr-labels-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true

From b56b7383d329c37bd647b86fb9abafaf44fc61ce Mon Sep 17 00:00:00 2001
From: Khushi <khushiagrawal411@gmail.com>
Date: Mon, 31 Oct 2022 17:08:52 +0000
Subject: [PATCH 0347/1922] [primTorch fix] use _maybe_convert_to_dtype
 (#85163)

Fixes #84561

- [x] fix lint tests

cc: @Lezcano!!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85163
Approved by: https://github.com/lezcano, https://github.com/mruberry
---
 torch/_refs/__init__.py        | 10 ++++++----
 torch/_refs/fft.py             |  6 ++----
 torch/_refs/linalg/__init__.py | 16 ++++++++--------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 64aa879a0c35a..e973c5d89f09f 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -1151,7 +1151,7 @@ def _floor_divide_integer(a: Tensor, b: Tensor) -> Tensor:
 
     # Convert truncation to flooring:
     offset = (torch.signbit(a) != torch.signbit(b)).logical_and(torch.fmod(a, b) != 0)
-    return prims.div(a, b) - prims.convert_element_type(offset, a.dtype)
+    return prims.div(a, b) - _maybe_convert_to_dtype(offset, a.dtype)
 
 
 def _floor_divide_float(a: Tensor, b: Tensor) -> Tensor:
@@ -1351,6 +1351,8 @@ def isclose(
 
 def _lcm(a: TensorLikeType, b: TensorLikeType):
     dtype = a.dtype
+    # promoting to int32 to maintain 100% consistency with C++ and to
+    # prevent overflow in case of int8 and int16
     promote_to_int = dtype in (torch.int8, torch.int16)
     if promote_to_int:
         a = prims.convert_element_type(a, torch.int32)
@@ -2838,10 +2840,10 @@ def native_layer_norm(
     elif weight is not None and bias is not None:
         out = out * weight + bias
 
-    out = prims.convert_element_type(out, input.dtype)
+    out = _maybe_convert_to_dtype(out, input.dtype)  # type: ignore[assignment]
     if input.device.type == "cpu":
-        mean = prims.convert_element_type(mean, input.dtype)
-        rstd = prims.convert_element_type(rstd, input.dtype)
+        mean = _maybe_convert_to_dtype(mean, input.dtype)  # type: ignore[assignment]
+        rstd = _maybe_convert_to_dtype(rstd, input.dtype)  # type: ignore[assignment]
     return (out, mean, rstd)
 
 
diff --git a/torch/_refs/fft.py b/torch/_refs/fft.py
index d92ef6914c2d1..28df8dafc1fdf 100644
--- a/torch/_refs/fft.py
+++ b/torch/_refs/fft.py
@@ -9,7 +9,7 @@
 import torch._prims_common as utils
 from torch._decomp import register_decomposition
 from torch._prims_common import check, DimsType, ShapeType, TensorLikeType
-from torch._prims_common.wrappers import out_wrapper
+from torch._prims_common.wrappers import _maybe_convert_to_dtype, out_wrapper
 
 __all__ = [
     # Transforms
@@ -76,9 +76,7 @@ def _maybe_promote_tensor_fft(
     """Helper to promote a tensor to a dtype supported by the FFT primitives"""
     cur_type = t.dtype
     new_type = _promote_type_fft(cur_type, require_complex)
-    if cur_type == new_type:
-        return t
-    return prims.convert_element_type(t, new_type)
+    return _maybe_convert_to_dtype(t, new_type)  # type: ignore[return-value]
 
 
 def _resize_fft_input(
diff --git a/torch/_refs/linalg/__init__.py b/torch/_refs/linalg/__init__.py
index c8c8f84570d8e..e6c15ec01889f 100644
--- a/torch/_refs/linalg/__init__.py
+++ b/torch/_refs/linalg/__init__.py
@@ -19,7 +19,7 @@
     NumberType,
     TensorLikeType,
 )
-from torch._prims_common.wrappers import out_wrapper
+from torch._prims_common.wrappers import _maybe_convert_to_dtype, out_wrapper
 
 __all__ = [
     "svd",
@@ -97,23 +97,23 @@ def vector_norm(
         x, utils.REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT, dtype
     )
 
-    to_result_dtype = partial(prims.convert_element_type, dtype=result_dtype)
+    to_result_dtype = partial(_maybe_convert_to_dtype, dtype=result_dtype)
 
     # Implementation
     if ord == 0.0:
         return refs.sum(refs.ne(x, 0.0), dim=dim, keepdim=keepdim, dtype=result_dtype)
     elif ord == float("inf"):
-        return to_result_dtype(refs.amax(torch.abs(x), dim=dim, keepdim=keepdim))
+        return to_result_dtype(refs.amax(torch.abs(x), dim=dim, keepdim=keepdim))  # type: ignore[return-value]
     elif ord == float("-inf"):
-        return to_result_dtype(refs.amin(torch.abs(x), dim=dim, keepdim=keepdim))
+        return to_result_dtype(refs.amin(torch.abs(x), dim=dim, keepdim=keepdim))  # type: ignore[return-value]
     else:
         # From here on the computation dtype is important as the reduction is non-trivial
-        x = prims.convert_element_type(x, computation_dtype)
+        x = _maybe_convert_to_dtype(x, computation_dtype)  # type: ignore[assignment]
         reduce_sum = partial(refs.sum, dim=dim, keepdim=keepdim)
 
         if not (ord % 2.0 == 0.0 and utils.is_float_dtype(x.dtype)):
             x = torch.abs(x)
-        return to_result_dtype(torch.pow(reduce_sum(torch.pow(x, ord)), 1.0 / ord))
+        return to_result_dtype(torch.pow(reduce_sum(torch.pow(x, ord)), 1.0 / ord))  # type: ignore[return-value]
 
 
 def backshift_permutation(dim0, dim1, ndim):
@@ -168,7 +168,7 @@ def matrix_norm(
             return vector_norm(A, 2, dim, keepdim, dtype=dtype)
         else:  # ord == "nuc"
             if dtype is not None:
-                A = prims.convert_element_type(A, dtype)
+                A = _maybe_convert_to_dtype(A, dtype)  # type: ignore[assignment]
             perm = backshift_permutation(dim[0], dim[1], A.ndim)
             result = torch.sum(svdvals(prims.transpose(A, perm)), -1, keepdim)
             if keepdim:
@@ -191,7 +191,7 @@ def matrix_norm(
 
         if abs_ord == 2.0:
             if dtype is not None:
-                A = prims.convert_element_type(A, dtype)
+                A = _maybe_convert_to_dtype(A, dtype)  # type: ignore[assignment]
             perm = backshift_permutation(dim[0], dim[1], A.ndim)
             result = max_min(svdvals(prims.transpose(A, perm)), dim=-1)
             if keepdim:

From a776e93b75f7519d8fa7ed64df240b5a50c22560 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Mon, 31 Oct 2022 04:12:36 +0000
Subject: [PATCH 0348/1922] Fix meta for aten.angle and aten.index_copy
 (#88066)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88066
Approved by: https://github.com/albanD
---
 test/test_meta.py            | 2 --
 torch/_meta_registrations.py | 2 +-
 torch/_refs/__init__.py      | 4 +++-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index c46fc827e1661..997e422465436 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -204,7 +204,6 @@ def test_tensor_outlives_converter(self):
     aten._scaled_dot_product_attention_forward.default,
     aten.add.Tensor,
     aten.addmm.default,
-    aten.angle.default,
     aten.atan2.default,
     aten.binary_cross_entropy.default,
     aten.bitwise_and.Tensor,
@@ -230,7 +229,6 @@ def test_tensor_outlives_converter(self):
     aten.hypot.default,
     aten.igamma.default,
     aten.igammac.default,
-    aten.index_copy.default,
     aten.lcm.default,
     aten.le.Tensor,
     aten.logical_and.default,
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index c6f4d7a357fc7..1ef9778832edc 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -133,7 +133,7 @@ def meta_angle(self):
         _, result_dtype = elementwise_dtypes(
             self, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
         )
-    return self.new_empty(self.size(), dtype=result_dtype)
+    return torch.empty_like(self, dtype=result_dtype)
 
 
 @register_meta(aten.angle.out)
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index e973c5d89f09f..0ab10701332ae 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -3225,7 +3225,9 @@ def unbind(t: TensorLikeType, dim: int = 0) -> TensorSequenceType:
 @register_decomposition(torch.ops.aten.index_copy)
 @out_wrapper()
 def index_copy(x: TensorLike, dim: int, index: TensorLike, tensor: TensorLike):
-    return x.clone().index_copy_(dim, index, tensor)
+    return x.clone(memory_format=torch.contiguous_format).index_copy_(
+        dim, index, tensor
+    )
 
 
 @register_decomposition(torch.ops.aten.index_copy_)

From 48331b7bfa4ea4cce529d93c3906752f17a9796d Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Fri, 28 Oct 2022 16:42:29 -0700
Subject: [PATCH 0349/1922] [quant][fx] Fix a typo in utils.py (#88024)

Summary:
att

Test Plan:
python test/test_quantization.py TestQuantizeFx.test__convert_to_reference_decomposed_fx

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88024
Approved by: https://github.com/HDCharles, https://github.com/z-a-f
---
 test/quantization/fx/test_quantize_fx.py | 6 +++++-
 torch/ao/quantization/fx/utils.py        | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 84632e1e2f313..04109ce532f20 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -5252,6 +5252,8 @@ def forward(self, x):
         qconfig_mapping = get_default_qconfig_mapping("fbgemm")
         example_inputs = (torch.randn(1, 5),)
         m = prepare_fx(m, qconfig_mapping, example_inputs)
+        m_ref = copy.deepcopy(m)
+        m_ref = convert_to_reference_fx(m_ref)
         m = _convert_to_reference_decomposed_fx(m)
         expected_occurrence = {
             ns.call_function(torch.ops.quantized_decomposed.quantize_per_tensor): 2,
@@ -5261,7 +5263,9 @@ def forward(self, x):
             m,
             expected_node_occurrence=expected_occurrence)
         # make sure it runs
-        m(*example_inputs)
+        res_ref = m_ref(*example_inputs)
+        res = m(*example_inputs)
+        self.assertEqual(res, res_ref)
 
     def test_change_backend_config_for_fixed_qparam_ops(self):
         """ Making sure we can skip validation of qconfigs for fixedqparam ops based
diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index f2037d1590a93..61bb2cdc1b034 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -204,7 +204,7 @@ def get_quantize_node_info(
                 qparams = {
                     "_scale_": scale,
                     "_zero_point_": zero_point,
-                    "_quant_min": quant_max,
+                    "_quant_min": quant_min,
                     "_quant_max": quant_max,
                     "_dtype_": dtype
                 }

From b0e5c81d699ff998be868e41f341849edb77efd6 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 31 Oct 2022 06:48:39 -0700
Subject: [PATCH 0350/1922] Revert "Revert "Unify meta tensor and fake tensor
 converter conversion (#87943)"" (#88045)

This reverts commit bc64999b8382796199178cf480adf51512b5f139.

Check torch/_subclasses/meta_utils.py for "This is very tricky" for the bugfix explanation.

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88045
Approved by: https://github.com/kit1980, https://github.com/Chillee
---
 test/dynamo/test_unspec.py                    |   3 -
 test/test_meta.py                             | 146 +++++++--
 .../templates/python_variable_methods.cpp     |  12 +-
 torch/_subclasses/fake_tensor.py              | 276 +++++++++---------
 torch/_subclasses/meta_utils.py               | 238 ++++++++++++---
 torch/testing/_internal/common_utils.py       |   2 +
 6 files changed, 475 insertions(+), 202 deletions(-)

diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py
index 22f975d0f9d68..fd5396981b740 100644
--- a/test/dynamo/test_unspec.py
+++ b/test/dynamo/test_unspec.py
@@ -50,9 +50,6 @@ class UnspecTest(cls):
 UnspecReproTests = make_unspec_cls(test_repros.ReproTests)
 UnspecNNModuleTests = make_unspec_cls(test_modules.NNModuleTests)
 
-# RuntimeError: a leaf Variable that requires grad is being used in an in-place operation.
-unittest.expectedFailure(UnspecReproTests.test_batch_norm_act_unspec)
-
 
 @patch.object(torch._dynamo.config, "specialize_int_float", False)
 class UnspecTests(torch._dynamo.test_case.TestCase):
diff --git a/test/test_meta.py b/test/test_meta.py
index 997e422465436..26f9103b6e864 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -6,7 +6,7 @@
 from enum import Enum
 from torch.overrides import resolve_name
 from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
-from torch._subclasses.meta_utils import MetaConverter
+from torch._subclasses.meta_utils import MetaConverter, assert_metadata_eq
 import torch.utils._python_dispatch
 from torch._dispatch.python import enable_python_dispatcher
 from torch.testing._internal.common_utils import (
@@ -66,6 +66,9 @@ def assertSameVersionCounter(self, m1, m2):
         self.assertNotEqual(m1._version, vc)
         self.assertEqual(m2._version, m1._version)
 
+    def assertMetadataMatches(self, m1, m2):
+        assert_metadata_eq(self.assertEqual, m1, m2)
+
     def test_view_of_non_leaf(self):
         x = torch.randn(4, requires_grad=True)
         y = x.neg()
@@ -74,9 +77,14 @@ def test_view_of_non_leaf(self):
         to_meta = MetaConverter()
         m1 = to_meta(z1)
         m2 = to_meta(z2)
-        self.assertEqual(m1.shape, z1.shape)
+
+        # check the test is actually testing what it claims
         self.assertTrue(m1._is_view())
         self.assertFalse(m1._base.is_leaf)
+
+        self.assertIsNot(m1, m2)
+        self.assertMetadataMatches(m1, z1)
+        self.assertMetadataMatches(m2, z2)
         self.assertSameVersionCounter(m1, m2)
 
     def test_view_of_leaf(self):
@@ -86,35 +94,133 @@ def test_view_of_leaf(self):
         to_meta = MetaConverter()
         m1 = to_meta(z1)
         m2 = to_meta(z2)
-        self.assertEqual(m1.shape, z1.shape)
+
+        # check the test is actually testing what it claims
         self.assertTrue(m1._is_view())
         self.assertTrue(m1._base.is_leaf)
+
+        self.assertIsNot(m1, m2)
+        self.assertMetadataMatches(m1, z1)
+        self.assertMetadataMatches(m2, z2)
         self.assertSameVersionCounter(m1, m2)
 
+    def test_view_of_view_of_leaf(self):
+        x = torch.randn(8)
+        y = x.view(2, 4)
+        y.requires_grad = True
+        z = y.view(2, 2, 2)
+
+        to_meta = MetaConverter()
+        mx = to_meta(x)
+        mz = to_meta(z)
+
+        self.assertFalse(z.is_leaf)
+
+        self.assertMetadataMatches(mx, x)
+        self.assertMetadataMatches(mz, z)
+
     def test_leaf(self):
         x = torch.randn(4, requires_grad=True)
         to_meta = MetaConverter()
         m = to_meta(x)
-        self.assertEqual(m.shape, x.shape)
+
+        # check the test is actually testing what it claims
         self.assertTrue(m.is_leaf)
         self.assertTrue(m.requires_grad)
 
+        self.assertMetadataMatches(m, x)
+
     def test_non_leaf(self):
         x = torch.randn(4, requires_grad=True)
         y = x.neg()
         to_meta = MetaConverter()
         m = to_meta(y)
-        self.assertEqual(m.shape, y.shape)
+
+        # check the test is actually testing what it claims
         self.assertFalse(m.is_leaf)
         self.assertTrue(m.requires_grad)
 
+        self.assertMetadataMatches(m, y)
+
     def test_requires_grad_false(self):
         x = torch.randn(4, requires_grad=False)
         to_meta = MetaConverter()
         m = to_meta(x)
-        self.assertEqual(m.shape, x.shape)
+
+        # check the test is actually testing what it claims
         self.assertFalse(m.requires_grad)
 
+        self.assertMetadataMatches(m, x)
+
+    def test_channels_last(self):
+        x = torch.empty(2, 3, 4, 5, memory_format=torch.channels_last)
+        to_meta = MetaConverter()
+        m = to_meta(x)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.is_leaf)
+
+        self.assertMetadataMatches(m, x)
+
+    def test_channels_last_leaf(self):
+        x = torch.empty(2, 3, 4, 5, memory_format=torch.channels_last, requires_grad=True)
+        to_meta = MetaConverter()
+        m = to_meta(x)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.requires_grad)
+        self.assertTrue(m.is_leaf)
+
+        self.assertMetadataMatches(m, x)
+
+    def test_channels_last_non_leaf(self):
+        x = torch.empty(2, 3, 4, 5, memory_format=torch.channels_last, requires_grad=True)
+        y = x + 2
+
+        # sanity
+        self.assertEqual(x.stride(), y.stride())
+        self.assertFalse(y.is_leaf)
+
+        to_meta = MetaConverter()
+        m = to_meta(y)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.requires_grad)
+        self.assertFalse(m.is_leaf)
+
+        self.assertMetadataMatches(m, y)
+
+        # Check that we can autograd with m as input without erroring;
+        # see https://github.com/pytorch/pytorch/issues/87956
+        loss = m.sum()
+        torch.autograd.grad(loss, m)
+
+    def test_empty_strided_non_dense_leaf(self):
+        x = torch.empty_strided((2, 2), (4, 2), requires_grad=True)
+
+        to_meta = MetaConverter()
+        m = to_meta(x)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.requires_grad)
+        self.assertTrue(m.is_leaf)
+
+        self.assertMetadataMatches(m, x)
+
+    def test_non_leaf_torture(self):
+        x = torch.empty(20, requires_grad=True)
+        with torch.no_grad():
+            x.set_(x.storage(), 10, (2,), (2,))
+
+        to_meta = MetaConverter()
+        m = to_meta(x)
+
+        # check the test is actually testing what it claims
+        self.assertTrue(m.requires_grad)
+        self.assertTrue(m.is_leaf)
+
+        self.assertMetadataMatches(m, x)
+
     # NB: complex stuff is not actually exercised right now because
     # we have a blanket exclusion for complex conversion
 
@@ -122,41 +228,30 @@ def test_view_as_real(self):
         x = torch.randn(4, dtype=torch.complex64)
         y = torch.view_as_real(x)
         m = MetaConverter()(y)
-        self.assertEqual(m.shape, y.shape)
-        self.assertEqual(m.stride(), y.stride())
-        self.assertEqual(m.dtype, y.dtype)
+        self.assertMetadataMatches(m, y)
 
     def test_complex_noncontiguous_bug(self):
         x = torch.randn((2, 2, 4, 9), dtype=torch.complex32)[:, 0, :, :]
         m = MetaConverter()(x)
-        self.assertEqual(m.shape, x.shape)
-        self.assertEqual(m.stride(), x.stride())
-        self.assertEqual(m.dtype, x.dtype)
+        self.assertMetadataMatches(m, x)
 
     def test_view_as_complex(self):
         x = torch.randn((4, 2), dtype=torch.float32)
         y = torch.view_as_complex(x)
         m = MetaConverter()(y)
-        self.assertEqual(m.shape, y.shape)
-        self.assertEqual(m.stride(), y.stride())
-        self.assertEqual(m.dtype, y.dtype)
+        self.assertMetadataMatches(m, y)
 
     def test_view_dtype(self):
         x = torch.randn(4, dtype=torch.float32)
         y = x.view(dtype=torch.int32)
         m = MetaConverter()(y)
-        self.assertEqual(m.shape, y.shape)
-        self.assertEqual(m.stride(), y.stride())
-        self.assertEqual(m.dtype, y.dtype)
+        self.assertMetadataMatches(m, y)
 
     def test_imag(self):
         x = torch.randn(4, dtype=torch.complex64)
         y = x.imag
         m = MetaConverter()(y)
-        self.assertEqual(m.shape, y.shape)
-        self.assertEqual(m.dtype, y.dtype)
-        self.assertEqual(m.stride(), y.stride())
-        self.assertEqual(m.storage_offset(), y.storage_offset())
+        self.assertMetadataMatches(m, y)
 
     def test_weakref(self):
         x = torch.randn(4, 4, 4)
@@ -742,7 +837,12 @@ def __init__(self, test_case, *, device, dtype, inplace):
     def __torch_function__(self, func, types, args=(), kwargs=None):
         kwargs = kwargs or {}
 
-        if torch.jit.is_tracing() or isinstance(func, torch.ScriptMethod):
+        if (
+            torch.jit.is_tracing() or isinstance(func, torch.ScriptMethod) or
+            # meta converter doesn't work correctly when no_dispatch() is on, so
+            # skip running the crossref test in this case
+            torch._C._dispatch_tls_local_exclude_set().has(torch._C.DispatchKey.Python)
+        ):
             return func(*args, **kwargs)
 
         if self.dtype in meta_function_skips.get(func, set()):
diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp
index 7122532a54410..e3c0a8b987bd6 100644
--- a/tools/autograd/templates/python_variable_methods.cpp
+++ b/tools/autograd/templates/python_variable_methods.cpp
@@ -1135,7 +1135,7 @@ static PyObject* THPVariable_set_(
       {
           "set_()",
           "set_(Storage source)",
-          "set_(Storage source, int64_t storage_offset, IntArrayRef size, IntArrayRef stride=None)",
+          "set_(Storage source, SymInt storage_offset, SymIntArrayRef size, SymIntArrayRef stride=None)",
           "set_(Tensor source)",
           "set_(Tensor source, SymInt storage_offset, SymIntArrayRef size, SymIntArrayRef stride=None)",
       },
@@ -1181,14 +1181,14 @@ static PyObject* THPVariable_set_(
         " for argument 1 'storage'");
       auto dispatch_set_ = [](const Tensor& self,
                               Storage source,
-                              int64_t storage_offset,
-                              IntArrayRef size,
-                              IntArrayRef stride) -> Tensor {
+                              c10::SymInt storage_offset,
+                              c10::SymIntArrayRef size,
+                              c10::SymIntArrayRef stride) -> Tensor {
         pybind11::gil_scoped_release no_gil;
-        return self.set_(source, storage_offset, size, stride);
+        return self.set__symint(source, storage_offset, size, stride);
       };
       return wrap(dispatch_set_(
-          self, storage, _r.toInt64(1), _r.intlist(2), _r.intlist(3)));
+          self, storage, _r.toSymInt(1), _r.symintlist(2), _r.symintlist(3)));
     }
     case 3: {
       // aten::set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 3e5cbdb652264..d6e6f79647fd3 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -2,7 +2,6 @@
 import functools
 import itertools
 import sys
-import warnings
 import weakref
 from dataclasses import dataclass
 from functools import partial
@@ -139,15 +138,17 @@ def tree_flatten_only(ty: Type[T], pytree: PyTree):
 # structure. Like `MetaConverter`, it uses `WeakTensorRefKey` to
 # hold a weak reference for all memoized tensors.
 class FakeTensorConverter(object):
-    tensor_memo: weakref.WeakValueDictionary
+    @property
+    def tensor_memo(self):
+        return self.meta_converter.tensor_memo
+
     meta_converter: MetaConverter
     constant_storage_mapping: Dict[StorageWeakRef, List[TensorWeakRef]]
 
     def __init__(self):
-        # FakeTensors store the FakeTensorMode which in turn stores a
-        # FakeTensor, so we need to hold a weak reference to the FakeTensor
-        # otherwise we would induce a circular reference
-        self.tensor_memo = weakref.WeakValueDictionary()
+        # In principle preserving views should be OK, but in practice
+        # AOTAutograd (or maybe autograd) seems to do the wrong thing.  See
+        # https://github.com/pytorch/torchdynamo/issues/1815
         self.meta_converter = MetaConverter()
 
         # map from to storage to corresponding constant tensors
@@ -214,28 +215,31 @@ def from_real_tensor(self, fake_mode, t, make_constant=False, shape_env=None):
         # not yet supported in metatensors
         if t.is_quantized:
             raise UnsupportedFakeTensorException("quantized nyi in meta tensors")
-        with no_dispatch():
-            meta_t = self.meta_converter(t, shape_env=shape_env)
-            if meta_t.device.type != "meta":
-                raise UnsupportedFakeTensorException("meta converter nyi")
-            out = FakeTensor(
-                fake_mode,
-                meta_t,
-                existing_device,
-                constant=t if make_constant else None,
-            )
-            out.requires_grad_(t.requires_grad)
-            if make_constant:
-                self.add_constant_storage_mapping(out)
         if type(t) is torch.nn.Parameter:
             assert not make_constant
-            out = torch.nn.Parameter(out, requires_grad=out.requires_grad)  # type: ignore[assignment]
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
-            grad_not_none = t.grad is not None
-        if grad_not_none:
-            out.grad = self.from_real_tensor(fake_mode, t.grad, shape_env=shape_env)
-        self.set_tensor_memo(t, out)
+
+        def mk_fake_tensor(make_meta_t):
+            # NB: don't use in_kernel_invocation_manager. to
+            # ensure FakeTensor can internally do constant computation
+            # as necessary.  Invocation manager is "more correct" as
+            # it works for more operators in make_meta_t, but
+            # invariant is that make_meta_t only calls factories
+            # for which it is not strictly necessary to use the
+            # invocation manager (I think!)
+            with no_dispatch():
+                return FakeTensor(
+                    fake_mode,
+                    make_meta_t(),
+                    existing_device,
+                    constant=t if make_constant else None,
+                )
+
+        out = self.meta_converter(t, shape_env=shape_env, callback=mk_fake_tensor)
+        if out is NotImplemented:
+            raise UnsupportedFakeTensorException("meta converter nyi")
+        if make_constant:
+            self.add_constant_storage_mapping(out)
+        # NB: meta_converter set the memo
         return out
 
     # If you specify the device, it MUST be a meta tensor.
@@ -296,7 +300,9 @@ def constructors(fake_mode, func, *args, **kwargs):
     out_device = new_kwargs.pop("device", None)
     out_device = out_device if out_device is not None else default_device
     new_kwargs["device"] = torch.device("meta")
-    r = func(*args, **new_kwargs)
+    # Not in_kernel_invocation_manager as no fake tensor inputs
+    with no_dispatch():
+        r = func(*args, **new_kwargs)
     return FakeTensor(fake_mode, r, out_device)
 
 
@@ -309,7 +315,8 @@ def non_kwarg_to(fake_mode, func, *args, **kwargs):
     out_device = input_device if input_device else new_kwargs["input"].device
     new_kwargs["device"] = torch.device("meta")
     inp = new_kwargs.pop("input")
-    r = func(inp, **new_kwargs)
+    with in_kernel_invocation_manager(fake_mode):
+        r = func(inp, **new_kwargs)
     # TODO: I think this does the wrong thing if r is inp
     return fake_mode.fake_tensor_converter.from_meta_and_device(
         fake_mode, r, out_device
@@ -320,7 +327,8 @@ def non_kwarg_to(fake_mode, func, *args, **kwargs):
 # since the device of `the_template` is ignored
 @register_op_impl(aten.resize_as_.default)
 def resize_as_(fake_mode, func, *args, **kwargs):
-    return func(*args, **kwargs)
+    with in_kernel_invocation_manager(fake_mode):
+        return func(*args, **kwargs)
 
 
 @register_op_impl(aten._sparse_coo_tensor_with_dims_and_tensors.default)
@@ -710,6 +718,13 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             else:
                 return args[0].fake_device
 
+        # Some attribute queries that can be serviced directly
+        # See Note [is_coalesced is dispatched]
+        if func in [torch.ops.aten.is_coalesced.default]:
+            # NB: no_dispatch is ok here too, this func is very simple
+            with in_kernel_invocation_manager(self):
+                return func(*args, **kwargs)
+
         flat_arg_fake_tensors = tree_flatten_only(FakeTensor, (args, kwargs))
         flat_symints = tree_flatten_only(torch.SymInt, (args, kwargs))
         has_symbolic_sizes = (
@@ -725,38 +740,38 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         if func in self.lift_fns:
             out = func(*args, **kwargs)
             if self.may_turn_const(out):
+                # NB: not in_kernel_invocation_manager because we're doing real
+                # compute here
                 with no_dispatch():
-                    return converter(self, out.clone(), make_constant=True)
-
-        with no_dispatch():
-            flat_arg_tensors = tree_flatten_only(torch.Tensor, (args, kwargs))
-            # See [subclass inputs] below
-            # NB: If you're seeing a mysterious infinite loop involving fake
-            # tensor, it might be related to this line.  Though I'm not sure
-            # how you'll know to read this comment, as this line won't show up
-            # in the stack trace.
-            if self.check_for_subclass(flat_arg_tensors):
-                return NotImplemented
-
-            # if we are in the dispatch mode, we will enter this function even if the inputs
-            # are not FakeTensors. For now, throw if any non-Fake Tensor inputs
-            # and just support constructors.
-
-            # this is generated from torch.tensor(), which does not use the
-            # dispatcher, to allow wrapper subclasses to wrap the new tensor
-            if func in self.lift_fns:
-                assert (
-                    len(kwargs) == 0
-                    and len(args) == 1
-                    and type(args[0]) is torch.Tensor
-                ), f"{args} {kwargs}"
-                return converter(self, args[0])
-
-            if self.check_for_non_fake(flat_arg_tensors):
-                raise Exception(
-                    "Invoking operators with non-Fake Tensor inputs in FakeTensorMode is not yet supported. "
-                    f"Please convert all Tensors to FakeTensors first. Found in {func}(*{args}, **{kwargs})"
-                )
+                    out = out.clone()
+                return converter(self, out, make_constant=True)
+
+        flat_arg_tensors = tree_flatten_only(torch.Tensor, (args, kwargs))
+        # See [subclass inputs] below
+        # NB: If you're seeing a mysterious infinite loop involving fake
+        # tensor, it might be related to this line.  Though I'm not sure
+        # how you'll know to read this comment, as this line won't show up
+        # in the stack trace.
+        if self.check_for_subclass(flat_arg_tensors):
+            return NotImplemented
+
+        # if we are in the dispatch mode, we will enter this function even if the inputs
+        # are not FakeTensors. For now, throw if any non-Fake Tensor inputs
+        # and just support constructors.
+
+        # this is generated from torch.tensor(), which does not use the
+        # dispatcher, to allow wrapper subclasses to wrap the new tensor
+        if func in self.lift_fns:
+            assert (
+                len(kwargs) == 0 and len(args) == 1 and type(args[0]) is torch.Tensor
+            ), f"{args} {kwargs}"
+            return converter(self, args[0])
+
+        if self.check_for_non_fake(flat_arg_tensors):
+            raise Exception(
+                "Invoking operators with non-Fake Tensor inputs in FakeTensorMode is not yet supported. "
+                f"Please convert all Tensors to FakeTensors first. Found in {func}(*{args}, **{kwargs})"
+            )
 
         # The current constant handling only support tracing systems
         # (aot autograd, torchdynamo) where each operation is run consecutively.
@@ -776,27 +791,30 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             and len(flat_arg_fake_tensors) != 0
             and not has_symbolic_sizes
         ):
+            const_args, const_kwargs = pytree.tree_map_only(
+                FakeTensor, lambda t: t.constant, (args, kwargs)
+            )
+
+            # NB: not in_kernel_invocation_manager(self) as we want to do REAL
+            # compute
             with no_dispatch():
-                const_args, const_kwargs = pytree.tree_map_only(
-                    FakeTensor, lambda t: t.constant, (args, kwargs)
-                )
                 out = func(*const_args, **const_kwargs)
 
-                all_constant = pytree.tree_all_only(
-                    torch.Tensor, lambda t: self.may_turn_const(t), out
-                )
+            all_constant = pytree.tree_all_only(
+                torch.Tensor, lambda t: self.may_turn_const(t), out
+            )
 
-                if all_constant:
-                    return pytree.tree_map_only(
-                        torch.Tensor,
-                        lambda t: converter(self, t, make_constant=True),
-                        out,
-                    )
+            if all_constant:
+                return pytree.tree_map_only(
+                    torch.Tensor,
+                    lambda t: converter(self, t, make_constant=True),
+                    out,
+                )
 
-                # we weren't able to turn outputs to constants,
-                # so invalidate all constants that might be aliases of the outputs
-                for ten in tree_flatten_only(torch.Tensor, out):
-                    converter.invalidate_constant_aliases(ten)
+            # we weren't able to turn outputs to constants,
+            # so invalidate all constants that might be aliases of the outputs
+            for ten in tree_flatten_only(torch.Tensor, out):
+                converter.invalidate_constant_aliases(ten)
 
         # we are falling through to running non constant tensors, any input constant that
         # is written to must be invalidated
@@ -817,14 +835,13 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         ):
             from torch._decomp import meta_table as meta_table
 
-            with no_dispatch():
-                if func == aten.size.default:
-                    sys.stderr.write(
-                        "Trying to call aten.size on a tensor with symbolic shapes. "
-                        "It's likely that this is from calling tensor.shape in C++"
-                    )
-                    # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
-                    return None
+            if func == aten.size.default:
+                sys.stderr.write(
+                    "Trying to call aten.size on a tensor with symbolic shapes. "
+                    "It's likely that this is from calling tensor.shape in C++"
+                )
+                # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
+                return None
 
             with self:
                 if func in meta_table:
@@ -860,32 +877,27 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                     f"{func} - couldn't find symbolic meta function/decomposition"
                 )
 
-        with no_dispatch():
-            # special handling for funcs registered through `register_op_impl`,
-            # e.g., manipulating args on constructor calls to construct meta tensors
-            # and then afterwards wrapping them to a FakeTensor
-            for run_impl_check, op_impl in op_implementations:
-                if run_impl_check(func):
-                    op_impl_out = op_impl(self, func, *args, **kwargs)
-                    if op_impl_out != NotImplemented:
-                        return op_impl_out
-
-            # run kernel registered to meta for func, which include
-            # python meta registrations, prims, decomps, and c++ meta fns (structured kernels)
-            try:
-                with in_kernel_invocation_manager(self):
-                    r = func(*args, **kwargs)
-            except NotImplementedError as not_implemented_error:
-                # no meta kernel registered, fallback to kernel for the device
-                if not self.allow_fallback_kernels:
-                    raise not_implemented_error
-                return run_fallback_kernel(
-                    self, func, args, kwargs, not_implemented_error
-                )
-
-            return self.wrap_meta_outputs_with_default_device_logic(
-                r, func, args, kwargs
-            )
+        # special handling for funcs registered through `register_op_impl`,
+        # e.g., manipulating args on constructor calls to construct meta tensors
+        # and then afterwards wrapping them to a FakeTensor
+        for run_impl_check, op_impl in op_implementations:
+            if run_impl_check(func):
+                op_impl_out = op_impl(self, func, *args, **kwargs)
+                if op_impl_out != NotImplemented:
+                    return op_impl_out
+
+        # run kernel registered to meta for func, which include
+        # python meta registrations, prims, decomps, and c++ meta fns (structured kernels)
+        try:
+            with in_kernel_invocation_manager(self):
+                r = func(*args, **kwargs)
+        except NotImplementedError as not_implemented_error:
+            # no meta kernel registered, fallback to kernel for the device
+            if not self.allow_fallback_kernels:
+                raise not_implemented_error
+            return run_fallback_kernel(self, func, args, kwargs, not_implemented_error)
+
+        return self.wrap_meta_outputs_with_default_device_logic(r, func, args, kwargs)
 
     # [subclass inputs]
     # Suppose we enable fake tensor mode.  This means that fake tensor
@@ -959,6 +971,7 @@ def functions_with_cpp_meta_impl_that_support_symint(self):
             aten.as_strided.default,
             aten.zeros.default,
             aten.detach.default,
+            aten.set_.source_Storage_storage_offset,
         ]
 
     @property
@@ -1004,8 +1017,11 @@ def run_fallback_kernel(fake_mode, func, args, kwargs, orig_not_implemented_exce
     if torch.Tag.inplace_view in func.tags:  # type: ignore[attr-defined]
         raise orig_not_implemented_exception
 
+    inp_impls = {}
+
+    # Don't use in_kernel_invocation_manager(fake_mode) as we want to do
+    # REAL compute (not with meta device)
     with no_dispatch():
-        inp_impls = {}
 
         def to_real_tensor(e):
             if isinstance(e, FakeTensor):
@@ -1021,25 +1037,25 @@ def to_real_tensor(e):
 
         r = func(*args, **kwargs)
 
-        tensor_impls = set()
-        storages = set()
-
-        for e in tree_flatten((args, kwargs))[0]:
-            if isinstance(e, torch.Tensor):
-                if not e.is_sparse:
-                    storages.add(e.storage()._cdata)
-
-        # TODO: also check metadata change on inputs
-        # proper aliasing/metadata relationship between outputs and inputs will
-        # not be set up, bc of conversion to device, unless we can reuse an
-        # input impl
-        for e in tree_flatten(r)[0]:
-            if id(e) not in inp_impls and (
-                isinstance(e, torch.Tensor)
-                and not e.is_sparse
-                and e.storage()._cdata in storages
-            ):
-                raise orig_not_implemented_exception
+    tensor_impls = set()
+    storages = set()
+
+    for e in tree_flatten((args, kwargs))[0]:
+        if isinstance(e, torch.Tensor):
+            if not e.is_sparse:
+                storages.add(e.storage()._cdata)
+
+    # TODO: also check metadata change on inputs
+    # proper aliasing/metadata relationship between outputs and inputs will
+    # not be set up, bc of conversion to device, unless we can reuse an
+    # input impl
+    for e in tree_flatten(r)[0]:
+        if id(e) not in inp_impls and (
+            isinstance(e, torch.Tensor)
+            and not e.is_sparse
+            and e.storage()._cdata in storages
+        ):
+            raise orig_not_implemented_exception
 
     def map_out(e):
         if isinstance(e, torch.Tensor):
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 51231811631bc..0e2bbe49dd226 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -1,8 +1,10 @@
+import contextlib
+import warnings
 import weakref
+from typing import ContextManager
 
 import torch
 from torch.multiprocessing.reductions import StorageWeakRef
-from torch.utils._mode_utils import no_dispatch
 
 
 def safe_is_leaf(t):
@@ -13,6 +15,47 @@ def safe_is_leaf(t):
         return False
 
 
+def safe_grad(t):
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
+        return t.grad
+
+
+def assert_eq(a, b):
+    assert a == b, f"{a} != {b}"
+
+
+def assert_metadata_eq(assert_eq, m1, m2):
+    def go(m1, m2):
+        assert_eq(m1.dtype, m2.dtype)
+        assert_eq(m1.shape, m2.shape)
+        assert_eq(m1.requires_grad, m2.requires_grad)
+        assert_eq(m1.is_leaf, m2.is_leaf)
+        assert_eq(m1.grad_fn is None, m2.grad_fn is None)
+        assert_eq(m1.is_sparse, m2.is_sparse)
+        assert_eq(m1.is_inference(), m2.is_inference())
+        assert_eq(m1.is_conj(), m2.is_conj())
+        assert_eq(m1.is_neg(), m2.is_neg())
+        assert_eq(safe_grad(m1) is not None, safe_grad(m2) is not None)
+        if safe_grad(m1) is not None:
+            go(m1.grad, m2.grad)
+        if m1.is_sparse:
+            assert_eq(m1.dense_dim(), m2.dense_dim())
+            assert_eq(m1.sparse_dim(), m2.sparse_dim())
+            assert_eq(m1.is_coalesced(), m2.is_coalesced())
+        else:
+            assert_eq(m1.stride(), m2.stride())
+            assert_eq(m1.storage_offset(), m2.storage_offset())
+            assert_eq(m1._is_view(), m2._is_view())
+            if m1._is_view():
+                go(m1._base, m2._base)
+        # TODO: test if is resizable (no direct query for this atm)
+        # TODO: audit AutogradMeta to see if it matches
+        # TODO: test forward AD
+
+    return go(m1, m2)
+
+
 # torch.Tensors cannot be used as a key in a dictionary
 # because they define a custom __eq__ function which when used
 # to resolve hash collisions will throw when comparing tensors:
@@ -127,18 +170,31 @@ def del_ten():
 
     # NB: doesn't actually return a storage, because meta storage is
     # not supported
-    def meta_storage(self, s):
+    def meta_storage(self, s, callback):
         # NB: TypedStorage is freshly allocated and cannot be used as hash
         # key index.
 
         # Use a Weak Ref to s in order to not leak memory
         swr = StorageWeakRef(s)
         if swr not in self.storage_memo:
-            self.storage_memo[swr] = torch.empty(s.size(), dtype=s.dtype, device="meta")
+            self.storage_memo[swr] = (
+                callback(
+                    lambda: torch.empty(s.size(), dtype=torch.uint8, device="meta")
+                )
+                .storage()
+                .untyped()
+            )
         return self.storage_memo[swr]
 
     # This function assumes that it's possible to do the conversion
-    def meta_tensor(self, t, shape_env=None):
+    def meta_tensor(self, t, shape_env=None, callback=lambda t: t()):
+        # This indicates you set no_dispatch() before calling into this
+        # function.  This is an error: we may be creating fake tensors and
+        # will perform operations on them which need fake tensor mode to
+        # be active.  You will segfault if you are in a no_dispatch() block.
+        assert not torch._C._dispatch_tls_local_exclude_set().has(
+            torch._C.DispatchKey.Python
+        )
         arg_cnt = self.arg_cnt
         self.arg_cnt += 1
 
@@ -166,14 +222,22 @@ def sym_sizes_strides(t):
                 if t.is_sparse:
                     assert shape_env is None, "symbolic on sparse NYI"
                     is_leaf = safe_is_leaf(t)
-                    r = torch.ops.aten._sparse_coo_tensor_with_dims(
-                        t.sparse_dim(),
-                        t.dense_dim(),
-                        t.shape,
-                        dtype=t.dtype,
-                        layout=torch.sparse_coo,
-                        device="meta",
+                    r = callback(
+                        lambda: torch.ops.aten._sparse_coo_tensor_with_dims(
+                            t.sparse_dim(),
+                            t.dense_dim(),
+                            t.shape,
+                            dtype=t.dtype,
+                            layout=torch.sparse_coo,
+                            device="meta",
+                        )
                     )
+                    assert safe_is_leaf(r), "the callback you passed in doesn't detach"
+                    # Note [is_coalesced is dispatched]
+                    # Strangely enough, is_coalesced() is a dispatched operator,
+                    # which means that it will get caught by fake tensor mode.
+                    # Ordinarily this would error, but there's some logic in
+                    # fake tensor ensure this doesn't happen.
                     r._coalesced_(t.is_coalesced())
                     if t.requires_grad:
                         r.requires_grad = True
@@ -184,11 +248,12 @@ def sym_sizes_strides(t):
 
                 elif t._is_view():
                     # Construct views in two steps: recursively meta-fy their
-                    # base, and then create the view off that.  NB: doing it
+                    # base, and then create view(s) off that.  NB: doing it
                     # directly from storage is WRONG because this won't cause
                     # version counters to get shared.
                     assert t._is_view()
-                    base = self.meta_tensor(t._base)
+
+                    base = self.meta_tensor(t._base, shape_env, callback)
 
                     def is_c_of_r(complex_dtype, real_dtype):
                         return (
@@ -209,43 +274,135 @@ def is_c_of_r(complex_dtype, real_dtype):
                         # that hasn't been handled here
                         base = base.view(t.dtype)
 
-                    with torch.enable_grad():
-                        sizes, strides = sym_sizes_strides(t)
-                        r = base.as_strided(sizes, strides, sym(t.storage_offset()))
+                    # This is very tricky.  Naively, you might expect this
+                    # to hold:
+                    #
+                    #   if t.requires_grad and not safe_is_leaf(t)
+                    #       assert t._base.requires_grad
+                    #
+                    # But it's not true!  As you can see in the following
+                    # program:
+                    #
+                    #   x = torch.zeros(4)
+                    #   y = x.view(1, 4)
+                    #   y.requires_grad = True
+                    #   z = y.view(1, 1, 4)
+                    #   assert z._base is x
+                    #
+                    # So we may have to do *two* views out of the base to
+                    # recreate this situation.
+
+                    sizes, strides = sym_sizes_strides(t)
+                    if safe_is_leaf(t):
+                        # Leaf views that track view metadata are created by
+                        # creating a view inside a no_grad block
+                        with torch.no_grad():
+                            r = base.as_strided(sizes, strides, sym(t.storage_offset()))
+                        # As it's a leaf, we can directly assign requires_grad
+                        r.requires_grad = t.requires_grad
+                    else:
+                        if t._base.requires_grad == t.requires_grad:
+                            # Easy case, just run the view op
+                            with torch.enable_grad():
+                                r = base.as_strided(
+                                    sizes, strides, sym(t.storage_offset())
+                                )
+                        else:
+                            # Obscure case.  Create a leaf view and give it the
+                            # correct requires_grad, then do the final view.
+                            # NB: Can't have a non-leaf without requiring grad!
+                            assert t.requires_grad
+                            with torch.no_grad():
+                                mid = base.view(base.shape)
+                            mid.requires_grad = t.requires_grad
+                            with torch.enable_grad():
+                                r = mid.as_strided(
+                                    sizes, strides, sym(t.storage_offset())
+                                )
+
                 else:
                     is_leaf = safe_is_leaf(t)
-                    # Fake up some autograd history.
-                    if t.requires_grad:
-                        r = torch.empty(
-                            (0,), dtype=t.dtype, device="meta", requires_grad=True
+                    sizes, strides = sym_sizes_strides(t)
+                    storage_offset = sym(t.storage_offset())
+                    r = callback(
+                        lambda: torch.empty_strided(
+                            sizes, strides, dtype=t.dtype, device="meta"
                         )
+                    )
+                    assert safe_is_leaf(r), "the callback you passed in doesn't detach"
+                    if t.requires_grad:
+                        r.requires_grad = t.requires_grad
                         if not is_leaf:
+                            # Fake up some autograd history.
                             with torch.enable_grad():
-                                # The backward function here will be wrong, but
-                                # that's OK; our goal is just to get the metadata
-                                # looking as close as possible; we're not going to
-                                # actually try to backward() on these produced
-                                # metas.  TODO: would be safer to install some
-                                # sort of unsupported grad_fn here
-                                r = r.clone()
+                                # preserve_format is the default, but we want to
+                                # emphasize how important it is to preserve
+                                # format here
+                                r = r.clone(memory_format=torch.preserve_format)
+
+                    s = t.storage().untyped()
+                    swr = StorageWeakRef(s)
+                    if (
+                        swr not in self.storage_memo
+                        and r.stride() == strides
+                        and r.storage_offset() == storage_offset
+                    ):
+                        # You're normal and happy, install the fresh storage into the memo
+                        self.storage_memo[swr] = r.storage().untyped()
                     else:
-                        r = torch.empty((0,), dtype=t.dtype, device="meta")
-                    # As long as meta storage is not supported, need to prevent
-                    # redispatching on set_(Storage, ...) which will choke with
-                    # meta storage
-                    s = self.meta_storage(t.storage())
-                    with no_dispatch():
-                        sizes, strides = sym_sizes_strides(t)
-                        with torch.no_grad():
-                            r.set_(s, sym(t.storage_offset()), sizes, strides)
+                        # You're in crazy town; somehow you gave us a tensor
+                        # that wasn't a view, but had nonzero storage offset,
+                        # nontrivial strides (such that clone() couldn't
+                        # preserve them), or already aliases with another
+                        # tensor's storage.  The most typical way to end
+                        # up here is with set_.  So use set_ to bludgeon this
+                        # in.
+                        r_s = self.meta_storage(s, callback=callback)
+                        # NB: In principle, this should always work, but there
+                        # is some subtle difference in the autograd metadata
+                        # that means we will backprop the set_ call, even if
+                        # r is declared as an input to grad.
+                        # See https://github.com/pytorch/pytorch/issues/87956
+                        # for the reproducer.
+                        # NB: The in_kernel_invocation_manager here is necessary
+                        # for fake tensor.  If we run the set_ call with fake
+                        # tensor on, r will improperly report that it is NOT a
+                        # meta tensor but a cpu tensor, and then the set_ call
+                        # will fail due to device mismatch.  no_dispatch() is
+                        # not enough, because the fake tensor will still claim
+                        # to be a CPU tensor and you'll end up in the CPU
+                        # kernel.  Arguably this is a hack; a cleaner way to
+                        # solve this is to have a FakeStorage concept which
+                        # would report it's CPU device--no problem now!  But
+                        # this is difficult to do because we don't have storage
+                        # subclasses.  Relevant test is
+                        # DynamicShapesFunctionTests::test_add_dynamic_shapes in
+                        # test/dynamo/test_dynamic_shapes.py
+                        maybe_fake_mgr: ContextManager[None] = contextlib.nullcontext()
+                        from torch._subclasses.fake_tensor import (
+                            FakeTensor,
+                            in_kernel_invocation_manager,
+                        )
 
+                        if isinstance(r, FakeTensor):
+                            maybe_fake_mgr = in_kernel_invocation_manager(r.fake_mode)
+                        with maybe_fake_mgr, torch.no_grad():
+                            r.set_(r_s, storage_offset, sizes, strides)
+
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
+                    grad_not_none = t.grad is not None
+                if grad_not_none:
+                    r.grad = self.meta_tensor(t.grad, shape_env, callback)
                 torch._C._set_conj(r, t.is_conj())
                 torch._C._set_neg(r, t.is_neg())
+            # This can be skipped if necessary for performance reasons
+            # assert_metadata_eq(assert_eq, t, r)
             self.set_tensor_memo(t, r)
 
         return self.get_tensor_memo(t)
 
-    def __call__(self, t, shape_env=None):
+    def __call__(self, t, shape_env=None, *, callback=lambda t: t()):
         # TODO: zero tensors?  We appear to have eliminated them by
         # excluding complex for now
         from torch._subclasses.fake_tensor import FakeTensor
@@ -280,10 +437,11 @@ def __call__(self, t, shape_env=None):
                 # tests all break so we just exclude this.  In any case
                 # the to conversion isn't really right anyhow.
                 self.miss += 1
-                return t
+                return NotImplemented
             else:
                 self.hit += 1
-                r = self.meta_tensor(t, shape_env=shape_env)
+                r = self.meta_tensor(t, shape_env=shape_env, callback=callback)
+                # TODO: this is suspicious, now that we have callback argument
                 if type(t) is torch.nn.Parameter:
                     r = torch.nn.Parameter(r, requires_grad=r.requires_grad)
                 return r
@@ -294,7 +452,7 @@ def __call__(self, t, shape_env=None):
             # support meta.  Trying to YOLO this is more trouble than it's
             # worth.
             self.miss += 1
-            return t
+            return NotImplemented
         else:
             # non-Tensor types don't count as hit or miss
             return t
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 2f85b8af1d81f..9903e95228fc8 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1367,6 +1367,8 @@ def freeze_rng_state():
         #
         # In the long run torch.cuda.set_rng_state should probably be
         # an operator.
+        #
+        # NB: Mode disable is to avoid running cross-ref tests on thes seeding
         with no_dispatch(), disable_functorch():
             if torch.cuda.is_available():
                 torch.cuda.set_rng_state(cuda_rng_state)

From 16281fd17741543dc1c85a6eb98b20ffb77d8c95 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 31 Oct 2022 14:22:54 +0000
Subject: [PATCH 0351/1922] istft: Use unfold_backward instead of col2im
 (#88060)

`unfold_backward` implements the same operation as `col2im` but without support
for 2d kernels or dilation. However, `istft` doesn't use any of those features
and `unfold_backward` actually has a faster `TensorIterator` based
implementation so we should use it here instead.

In the example from #87353 I see a 2x speedup on both CPU and CUDA.

On a wider variety of sizes and inputs I still see speedups across the board, especially
on CPU since `col2im` isn't parallelized but `unfold_backward` is:

| device | shape           | hop_length | Master (us) | This PR (us) | Speedup |
|--------|-----------------|------------|-------------|--------------|---------|
| CUDA   | (1, 129, 33)    | 256        | 147         | 136          | 1.08    |
|        |                 | 128        | 153         | 128          | 1.20    |
|        | (100, 129, 20)  | 256        | 181         | 147          | 1.23    |
|        |                 | 128        | 171         | 137          | 1.25    |
|        | (1000, 129, 10) | 256        | 681         | 443          | 1.55    |
|        |                 | 128        | 632         | 446          | 1.42    |
| CPU    | (1, 129, 33)    | 256        | 106         | 104          | 1.02    |
|        |                 | 128        | 103         | 81           | 1.27    |
|        | (100, 129, 20)  | 256        | 2400        | 399          | 6.02    |
|        |                 | 128        | 2150        | 313          | 6.87    |
|        | (1000, 129, 10) | 256        | 13800       | 3740         | 3.69    |
|        |                 | 128        | 12700       | 2110         | 6.02    |
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88060
Approved by: https://github.com/albanD
---
 aten/src/ATen/native/SpectralOps.cpp | 47 +++++++++++++---------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp
index 2840b1651dba5..0acc3506cf515 100644
--- a/aten/src/ATen/native/SpectralOps.cpp
+++ b/aten/src/ATen/native/SpectralOps.cpp
@@ -22,7 +22,6 @@
 #include <ATen/ops/_fft_r2c.h>
 #include <ATen/ops/arange.h>
 #include <ATen/ops/arange_native.h>
-#include <ATen/ops/col2im.h>
 #include <ATen/ops/conj.h>
 #include <ATen/ops/conj_physical.h>
 #include <ATen/ops/constant_pad_nd.h>
@@ -55,6 +54,7 @@
 #include <ATen/ops/roll.h>
 #include <ATen/ops/stft.h>
 #include <ATen/ops/stft_native.h>
+#include <ATen/ops/unfold_backward.h>
 #include <ATen/ops/view_as_complex.h>
 #include <ATen/ops/view_as_real.h>
 #include <ATen/ops/zeros.h>
@@ -1095,7 +1095,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
     input = input.unsqueeze(0);
   }
 
-  input = as_complex(input.transpose(1, 2));  // size: (channel, n_frames, fft_size, 2)
+  input = as_complex(input.transpose(1, 2));  // size: (channel, n_frames, fft_size)
 
   const fft_norm_mode norm = normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n;
   if (return_complex) {
@@ -1112,26 +1112,23 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
   TORCH_INTERNAL_ASSERT(input.size(2) == n_fft);
 
   Tensor y_tmp = input * window_tmp.view({1, 1, n_fft});  // size: (channel, n_frames, n_fft)
-  y_tmp = y_tmp.transpose(1, 2);  // size: (channel, n_fft, frame)
-
-  Tensor y = at::col2im(y_tmp,
-                                  /*output_size*/ {1, (n_frames - 1) * hop_length + n_fft},
-                                  /*kernel_size*/ {1, n_fft},
-                                  /*dilation*/    {1, 1},
-                                  /*padding*/     {0, 0},
-                                  /*stride*/      {1, hop_length}
-                                 ).squeeze(2);
-  window_tmp = window_tmp.pow(2).view({n_fft, 1}).repeat({1, n_frames}).unsqueeze(0);  // size: (1, n_fft, n_frames)
-  Tensor window_envelop = at::col2im(window_tmp,
-                                  /*output_size*/ {1, (n_frames - 1) * hop_length + n_fft},
-                                  /*kernel_size*/ {1, n_fft},
-                                  /*dilation*/    {1, 1},
-                                  /*padding*/     {0, 0},
-                                  /*stride*/      {1, hop_length}
-                                 ).squeeze(2); // size: (1, 1, expected_output_signal_len)
-
-  TORCH_INTERNAL_ASSERT(expected_output_signal_len == y.size(2));
-  TORCH_INTERNAL_ASSERT(expected_output_signal_len == window_envelop.size(2));
+
+  Tensor y = at::unfold_backward(
+    y_tmp,
+    /*input_sizes=*/{y_tmp.size(0), expected_output_signal_len},
+    /*dim=*/1,
+    /*size=*/n_fft,
+    /*step=*/hop_length);
+  window_tmp = window_tmp.pow(2).expand({1, n_frames, n_fft});  // size: (1, n_frames, n_fft)
+  Tensor window_envelop = at::unfold_backward(
+    window_tmp,
+    /*input_sizes=*/{1, expected_output_signal_len},
+    /*dim=*/1,
+    /*size=*/n_fft,
+    /*step=*/hop_length); // size: (1, expected_output_signal_len)
+
+  TORCH_INTERNAL_ASSERT(expected_output_signal_len == y.size(1));
+  TORCH_INTERNAL_ASSERT(expected_output_signal_len == window_envelop.size(1));
 
   // We need to trim the front padding away if centered
   const auto start = center ? n_fft / 2 : 0;
@@ -1145,8 +1142,8 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
     return expected_output_signal_len;
   }();
 
-  y = y.slice(2, start, end, 1);
-  window_envelop = window_envelop.slice(2, start, end, 1);
+  y = y.slice(1, start, end, 1);
+  window_envelop = window_envelop.slice(1, start, end, 1);
   const auto window_envelop_lowest = window_envelop.abs().min().lt(1e-11);
   if (at::is_scalar_tensor_true(window_envelop_lowest)) {
     std::ostringstream ss;
@@ -1154,7 +1151,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
     AT_ERROR(ss.str());
   }
 
-  y = (y / window_envelop).squeeze(1);  // size: (channel, expected_output_signal_len)
+  y = (y / window_envelop);  // size: (channel, expected_output_signal_len)
   if (input_dim == 3) {
     y = y.squeeze(0);
   }

From 0f8dcefeb9c0e63a8c3ff1002ddf8a631bbbb759 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 31 Oct 2022 14:22:55 +0000
Subject: [PATCH 0352/1922] unfold_backward: Remove stride >= size kernel in
 favour of copy_ (#88061)

unfold_backward has a dedicated kernel for `stride >= size` which uses temporary
tensors created by `at::arange` to perform the mapping from unfolded to folded.
This instead uses `unfold` to view the output, and does a direct copy from the
gradient into the view.

In benchmarks I see either no difference or a marginal speed benefit from
this PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88061
Approved by: https://github.com/albanD
---
 aten/src/ATen/native/UnfoldBackward.cpp       |  6 ++
 aten/src/ATen/native/UnfoldBackward.h         | 73 --------------
 .../ATen/native/cpu/UnfoldBackwardKernel.cpp  | 81 +++++-----------
 .../ATen/native/cuda/UnfoldBackwardKernel.cu  | 95 ++++++-------------
 4 files changed, 61 insertions(+), 194 deletions(-)

diff --git a/aten/src/ATen/native/UnfoldBackward.cpp b/aten/src/ATen/native/UnfoldBackward.cpp
index 10bee80cea23c..4941432321169 100644
--- a/aten/src/ATen/native/UnfoldBackward.cpp
+++ b/aten/src/ATen/native/UnfoldBackward.cpp
@@ -5,6 +5,7 @@
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
+#include <ATen/ops/empty.h>
 #include <ATen/ops/unfold_backward_native.h>
 #include <ATen/ops/zeros.h>
 #endif
@@ -21,6 +22,11 @@ Tensor unfold_backward(
   int64_t step
 ) {
   auto grad_input = at::zeros(input_sizes, grad.options());
+  if (step >= size) {
+    auto gI_unfolded = grad_input.unfold(dim, size, step);
+    gI_unfolded.copy_(grad);
+    return grad_input;
+  }
 
   unfold_backward_stub(
     grad.device().type(),
diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h
index cb4856ec2718e..f8099167361c2 100644
--- a/aten/src/ATen/native/UnfoldBackward.h
+++ b/aten/src/ATen/native/UnfoldBackward.h
@@ -107,79 +107,6 @@ static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_out(
   return iter;
 }
 
-static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_in(
-  Tensor& grad_out,
-  const Tensor& grad_in,
-  int64_t dim,
-  int64_t /*size*/,
-  int64_t /*step*/
-) {
-  dim = maybe_wrap_dim(dim, grad_out.dim());
-  // last dim stores the folds
-  auto last_dim = maybe_wrap_dim(-1, grad_in.dim());
-
-  auto grad_in_dim = ensure_nonempty_dim(grad_in.dim());
-  auto grad_in_dim_size = ensure_nonempty_size(grad_in, dim);
-  auto grad_in_last_dim_size = ensure_nonempty_size(grad_in, last_dim);
-
-  /* prepare grad_out for TensorIterator { */
-  auto grad_out_restrided = grad_out.unsqueeze(-1);
-
-  auto grad_out_strides = ensure_nonempty_vec(grad_out_restrided.strides().vec());
-  auto grad_out_sizes = ensure_nonempty_vec(grad_out_restrided.sizes().vec());
-
-  grad_out_strides[dim] = 0;
-  grad_out_strides[last_dim] = 0;
-
-  grad_out_sizes[dim] = grad_in_dim_size;
-  grad_out_sizes[last_dim] = grad_in_last_dim_size;
-
-  grad_out_restrided = grad_out_restrided.as_strided(grad_out_sizes, grad_out_strides);
-  /* } */
-
-  // for each element grad_out[i_1,...,i_dim,...,i_last_dim]
-  // we have to know i_dim and i_last_dim.
-  // This information is stored in Tensors
-  // idx_dim and idx_last_dim
-  /* prepare idx_dim and idx_last_dim for TensorIterator { */
-  auto idx_dim = at::arange(
-    0, grad_in_dim_size, grad_in.options().dtype(at::kLong)
-  );
-
-  auto idx_dim_strides = std::vector<int64_t>(grad_in_dim, 0);
-  auto idx_dim_sizes = std::vector<int64_t>(grad_in_dim, 1);
-
-  idx_dim_strides[dim] = 1;
-  idx_dim_sizes[dim] = grad_in_dim_size;
-
-  auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides);
-
-  auto idx_last_dim = at::arange(
-    0, grad_in_last_dim_size, grad_in.options().dtype(at::kLong)
-  );
-
-  auto idx_last_dim_strides = std::vector<int64_t>(grad_in_dim, 0);
-  auto idx_last_dim_sizes = std::vector<int64_t>(grad_in_dim, 1);
-
-  idx_last_dim_strides[last_dim] = 1;
-  idx_last_dim_sizes[last_dim] = grad_in_last_dim_size;
-
-  auto idx_last_dim_restrided = idx_last_dim.as_strided(idx_last_dim_sizes, idx_last_dim_strides);
-  /* } */
-
-  auto iter = TensorIteratorConfig()
-    .set_check_mem_overlap(false)
-    .check_all_same_dtype(false)
-    .resize_outputs(false)
-    .add_owned_output(grad_out_restrided)
-    .add_owned_input(grad_in)
-    .add_owned_input(idx_dim_restrided)
-    .add_owned_input(idx_last_dim_restrided)
-    .build();
-
-  return iter;
-}
-
 }
 
 }} // namespace at::native
diff --git a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp
index cf934586c74e7..aa5dfb0143801 100644
--- a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp
+++ b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp
@@ -66,8 +66,7 @@ void _unfold_backward_internal_kernel(
   int64_t grad_in_dim_stride,
   int64_t grad_in_last_dim_stride,
   int64_t grad_in_dim_size,
-  int64_t grad_out_dim_stride,
-  bool is_step_ge_size
+  int64_t grad_out_dim_stride
 ) {
   if (iter.numel() == 0) {
     return;
@@ -78,53 +77,32 @@ void _unfold_backward_internal_kernel(
     auto* RESTRICT grad_in_ptr = data[1];
     auto* RESTRICT idx_dim_ptr = data[2];
 
-    if (is_step_ge_size) {
-      auto* RESTRICT idx_last_dim_ptr = data[3];
+    for (const auto elem C10_UNUSED : c10::irange(nelems)) {
+      auto* RESTRICT grad_out_data = reinterpret_cast<scalar_t*>(grad_out_ptr);
+      auto* RESTRICT grad_in_data = reinterpret_cast<scalar_t*>(grad_in_ptr);
 
-      for (const auto elem C10_UNUSED : c10::irange(nelems)) {
-        auto* RESTRICT grad_out_data = reinterpret_cast<scalar_t*>(grad_out_ptr);
-        auto* RESTRICT grad_in_data = reinterpret_cast<scalar_t*>(grad_in_ptr);
+      auto idx_dim = *reinterpret_cast<int64_t*>(idx_dim_ptr);
 
-        auto idx_dim = *reinterpret_cast<int64_t*>(idx_dim_ptr);
-        auto idx_last_dim = *reinterpret_cast<int64_t*>(idx_last_dim_ptr);
+      // left_fold potentially intersecting with idx_dim
+      // is either (idx_dim - size) / step or the next integer.
+      int64_t left_fold_idx = (idx_dim > size) ? (idx_dim - size) / step : 0;
+      if (!(left_fold_idx * step <= idx_dim && idx_dim < left_fold_idx * step + size)) {
+        ++left_fold_idx;
+      }
 
-        auto grad_out_idx_dim = idx_dim * step + idx_last_dim;
-        grad_out_data[grad_out_idx_dim * grad_out_dim_stride] = *grad_in_data;
+      auto right_fold_idx = idx_dim / step;
+      right_fold_idx = (right_fold_idx >= grad_in_dim_size)
+        ? (grad_in_dim_size - 1) : right_fold_idx;
 
-        grad_out_ptr += strides[0];
-        grad_in_ptr += strides[1];
-        idx_dim_ptr += strides[2];
-        idx_last_dim_ptr += strides[3];
-      }
-    }
-    else {
-      for (const auto elem C10_UNUSED : c10::irange(nelems)) {
-        auto* RESTRICT grad_out_data = reinterpret_cast<scalar_t*>(grad_out_ptr);
-        auto* RESTRICT grad_in_data = reinterpret_cast<scalar_t*>(grad_in_ptr);
-
-        auto idx_dim = *reinterpret_cast<int64_t*>(idx_dim_ptr);
-
-        // left_fold potentially intersecting with idx_dim
-        // is either (idx_dim - size) / step or the next integer.
-        int64_t left_fold_idx = (idx_dim > size) ? (idx_dim - size) / step : 0;
-        if (!(left_fold_idx * step <= idx_dim && idx_dim < left_fold_idx * step + size)) {
-          ++left_fold_idx;
-        }
-
-        auto right_fold_idx = idx_dim / step;
-        right_fold_idx = (right_fold_idx >= grad_in_dim_size)
-          ? (grad_in_dim_size - 1) : right_fold_idx;
-
-        for (auto fold_idx = left_fold_idx; fold_idx <= right_fold_idx; ++fold_idx) {
-          auto idx_last_dim = idx_dim - fold_idx * step;
-          *grad_out_data += grad_in_data[fold_idx * grad_in_dim_stride
-                                      + idx_last_dim * grad_in_last_dim_stride];
-        }
-
-        grad_out_ptr += strides[0];
-        grad_in_ptr += strides[1];
-        idx_dim_ptr += strides[2];
+      for (auto fold_idx = left_fold_idx; fold_idx <= right_fold_idx; ++fold_idx) {
+        auto idx_last_dim = idx_dim - fold_idx * step;
+        *grad_out_data += grad_in_data[fold_idx * grad_in_dim_stride
+                                    + idx_last_dim * grad_in_last_dim_stride];
       }
+
+      grad_out_ptr += strides[0];
+      grad_in_ptr += strides[1];
+      idx_dim_ptr += strides[2];
     }
   };
 
@@ -148,16 +126,8 @@ void unfold_backward_cpu_kernel(
 
   auto grad_out_dim_stride = ensure_nonempty_stride(grad_out, dim);
 
-  auto is_step_ge_size = (step >= size);
-
-  TensorIterator iter =
-    is_step_ge_size ?
-    _make_unfold_backward_iter_over_grad_in(
-      grad_out, grad_in, dim, size, step
-    ) :
-    _make_unfold_backward_iter_over_grad_out(
-      grad_out, grad_in, dim, size, step
-    );
+  TensorIterator iter = _make_unfold_backward_iter_over_grad_out(
+      grad_out, grad_in, dim, size, step);
 
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
     at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
@@ -170,8 +140,7 @@ void unfold_backward_cpu_kernel(
         grad_in_dim_stride,
         grad_in_last_dim_stride,
         grad_in_dim_size,
-        grad_out_dim_stride,
-        is_step_ge_size
+        grad_out_dim_stride
       );
     }
   );
diff --git a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu
index 7865a7f61545f..d75de2a6e90fb 100644
--- a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu
+++ b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu
@@ -58,8 +58,7 @@ void _unfold_backward_internal_kernel(
   int64_t grad_in_dim_stride,
   int64_t grad_in_last_dim_stride,
   int64_t grad_in_dim_size,
-  int64_t grad_out_dim_stride,
-  bool is_step_ge_size
+  int64_t grad_out_dim_stride
 ) {
   if (iter.numel() == 0) {
     return;
@@ -74,8 +73,7 @@ void _unfold_backward_internal_kernel(
         grad_in_dim_stride,
         grad_in_last_dim_stride,
         grad_in_dim_size,
-        grad_out_dim_stride,
-        is_step_ge_size
+        grad_out_dim_stride
       );
     }
     return;
@@ -85,63 +83,39 @@ void _unfold_backward_internal_kernel(
   char* __restrict__ grad_in_ptr = reinterpret_cast<char*>(iter.data_ptr(1));
   char* __restrict__ idx_dim_ptr = reinterpret_cast<char*>(iter.data_ptr(2));
 
-  if (is_step_ge_size) {
-    char* __restrict__ idx_last_dim_ptr = reinterpret_cast<char*>(iter.data_ptr(3));
+  auto offset_calc = make_offset_calculator<3>(iter);
 
-    auto offset_calc = make_offset_calculator<4>(iter);
+  // The algorithm is: for each index in grad_out find
+  // the elements contributing to it and sum them up.
+  // Note: the algorithm does not require any synchronization.
+  auto loop = [=]C10_DEVICE(int i) {
+    auto offsets = offset_calc.get(i);
 
-    // this loop simply copies the data
-    // from proper places in grad_out to grad_in
-    auto loop = [=]C10_DEVICE(int i) {
-      auto offsets = offset_calc.get(i);
+    auto* __restrict__ grad_out_data = reinterpret_cast<scalar_t*>(grad_out_ptr + offsets[0]);
+    auto* __restrict__ grad_in_data = reinterpret_cast<scalar_t*>(grad_in_ptr + offsets[1]);
 
-      auto* __restrict__ grad_out_data = reinterpret_cast<scalar_t*>(grad_out_ptr + offsets[0]);
-      auto* __restrict__ grad_in_data = reinterpret_cast<scalar_t*>(grad_in_ptr + offsets[1]);
+    auto idx_dim = *reinterpret_cast<int64_t*>(idx_dim_ptr + offsets[2]);
 
-      auto idx_dim = *reinterpret_cast<int64_t*>(idx_dim_ptr + offsets[2]);
-      auto idx_last_dim = *reinterpret_cast<int64_t*>(idx_last_dim_ptr + offsets[3]);
-
-      auto grad_out_idx_dim = idx_dim * step + idx_last_dim;
-      grad_out_data[grad_out_idx_dim * grad_out_dim_stride] = *grad_in_data;
-    };
-
-    _launch_unfold_backward_kernel<num_threads(), thread_work_size()>(iter.numel(), loop);
-  }
-  else {
-    auto offset_calc = make_offset_calculator<3>(iter);
-
-    // The algorithm is: for each index in grad_out find
-    // the elements contributing to it and sum them up.
-    // Note: the algorithm does not require any synchronization.
-    auto loop = [=]C10_DEVICE(int i) {
-      auto offsets = offset_calc.get(i);
-
-      auto* __restrict__ grad_out_data = reinterpret_cast<scalar_t*>(grad_out_ptr + offsets[0]);
-      auto* __restrict__ grad_in_data = reinterpret_cast<scalar_t*>(grad_in_ptr + offsets[1]);
-
-      auto idx_dim = *reinterpret_cast<int64_t*>(idx_dim_ptr + offsets[2]);
-
-      // left_fold potentially intersecting with idx_dim
-      // is either (idx_dim - size) / step or the next integer.
-      int64_t left_fold_idx = (idx_dim > size) ? (idx_dim - size) / step : 0;
-      if (!(left_fold_idx * step <= idx_dim && idx_dim < left_fold_idx * step + size)) {
-        ++left_fold_idx;
-      }
+    // left_fold potentially intersecting with idx_dim
+    // is either (idx_dim - size) / step or the next integer.
+    int64_t left_fold_idx = (idx_dim > size) ? (idx_dim - size) / step : 0;
+    if (!(left_fold_idx * step <= idx_dim && idx_dim < left_fold_idx * step + size)) {
+      ++left_fold_idx;
+    }
 
-      auto right_fold_idx = idx_dim / step;
-      right_fold_idx = (right_fold_idx >= grad_in_dim_size) ?
-        (grad_in_dim_size - 1) : right_fold_idx;
+    auto right_fold_idx = idx_dim / step;
+    right_fold_idx = (right_fold_idx >= grad_in_dim_size) ?
+      (grad_in_dim_size - 1) : right_fold_idx;
 
-      for (auto fold_idx = left_fold_idx; fold_idx <= right_fold_idx; ++fold_idx) {
-        auto idx_last_dim = idx_dim - fold_idx * step;
-        *grad_out_data += grad_in_data[fold_idx * grad_in_dim_stride
-                                    + idx_last_dim * grad_in_last_dim_stride];
-      }
+    for (auto fold_idx = left_fold_idx; fold_idx <= right_fold_idx; ++fold_idx) {
+      auto idx_last_dim = idx_dim - fold_idx * step;
+      *grad_out_data += grad_in_data[fold_idx * grad_in_dim_stride
+                                  + idx_last_dim * grad_in_last_dim_stride];
+    }
 
-    };
+  };
 
-    _launch_unfold_backward_kernel<num_threads(), thread_work_size()>(iter.numel(), loop);
-  }
+  _launch_unfold_backward_kernel<num_threads(), thread_work_size()>(iter.numel(), loop);
 }
 
 void unfold_backward_cuda_kernel(
@@ -161,16 +135,8 @@ void unfold_backward_cuda_kernel(
 
   auto grad_out_dim_stride = ensure_nonempty_stride(grad_out, dim);
 
-  auto is_step_ge_size = (step >= size);
-
-  TensorIterator iter =
-    is_step_ge_size ?
-    _make_unfold_backward_iter_over_grad_in(
-      grad_out, grad_in, dim, size, step
-    ) :
-    _make_unfold_backward_iter_over_grad_out(
-      grad_out, grad_in, dim, size, step
-    );
+  TensorIterator iter = _make_unfold_backward_iter_over_grad_out(
+      grad_out, grad_in, dim, size, step);
 
   AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
     at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
@@ -183,8 +149,7 @@ void unfold_backward_cuda_kernel(
         grad_in_dim_stride,
         grad_in_last_dim_stride,
         grad_in_dim_size,
-        grad_out_dim_stride,
-        is_step_ge_size
+        grad_out_dim_stride
       );
     }
   );

From d105e945b271895820f6eb72490f84a82f8e4dfa Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Mon, 31 Oct 2022 16:38:23 +0000
Subject: [PATCH 0353/1922] Fix args for meta__fused_moving_avg_obs_fq_helper
 (#88058)

Fixes https://github.com/pytorch/torchdynamo/issues/1802

There are a few problems,
1. torch.fused_moving_avg_obs_fake_quant doesn't have OpInfo test
2. self.empty_like() is not a valid call. it should be torch.empty_like(self)
3. python meta function has some unexplained behavior for arguments with default value of bool type?

In particular, problem 3 is the most concerning one.
**UPDATE: This is expected behavior, see discussion below for explanation.**

Without setting the default value for `per_row_fake_quant` and `symmetric_quant`, it gets the following error when running with meta tensor.
```
meta__fused_moving_avg_obs_fq_helper() missing 2 required positional arguments: 'per_row_fake_quant' and 'symmetric_quant'
```
I can fix this by adding the default values to these two args. However, I observer something strange when examining the actual value in meta function.

```
    print("per_row_fake_quant", per_row_fake_quant)
    print("symmetric_quant", symmetric_quant)
```

When default values are False, printed value correctly reflect the args value populated from call site.
When default values are True, printed value is ALWAYS True, regardless of the populated value from call site.
When default Values are None, printed value is `None` when call site set the value to 'False', printed value is 'True' when call site sets the value to 'True'.

I also verify that this bug also affect for other meta function with default args....

My speculation is that this is something about pybind value packing when called from c++ dispatcher to python meta function, and default value parsing for python meta function (and other python dispatch functions) ?

I tried to find the c++ call stack, but gdb is missing symbols and C++ stacktrace is not working properly... Appreciate anyone who can point me to the source file for pybind value packing.

cc @ezyang
cc @bdhirsh. I know you had a fix in the symbolic shape branch...
cc @yanboliang  who reported this bug
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88058
Approved by: https://github.com/bdhirsh, https://github.com/yanboliang
---
 test/test_meta.py            | 51 ++++++++++++++++++++++++++++++++++++
 torch/_meta_registrations.py |  8 +++---
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index 26f9103b6e864..c1099dce6ccd3 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -1279,6 +1279,57 @@ def test_fill_alias_relationship(self):
         r2 = torch.ops.aten.fill(inps, 1.0)
         self.assertNotEqual(id(inps), id(r2))
 
+    def test_meta__fused_moving_avg_obs_fq_helper(self, device):
+        from torch.ao.quantization import FusedMovingAvgObsFakeQuantize
+        to_meta = MetaConverter()
+
+        x = torch.randn(5, 5, device=device)
+        running_min_op = torch.tensor(float("inf"), device=device)
+        running_max_op = torch.tensor(float("-inf"), device=device)
+        avg_const = 0.01
+        scale = torch.tensor([1.0], device=device)
+        zero_point = torch.tensor([0], dtype=torch.int, device=device)
+
+        mod = FusedMovingAvgObsFakeQuantize()
+        torch.ao.quantization.enable_fake_quant(mod)
+        torch.ao.quantization.enable_observer(mod)
+        mod.to(device)
+
+        meta_x = to_meta(x)
+
+        args = [
+            x,
+            mod.observer_enabled,
+            mod.fake_quant_enabled,
+            running_min_op,
+            running_max_op,
+            scale,
+            zero_point,
+            avg_const,
+            0,
+            255,
+            0,
+        ]
+
+        meta_args = args.copy()
+        meta_args[0] = meta_x
+
+        kwargss = [
+            {},
+            {"per_row_fake_quant": False, "symmetric_quant": False},
+            {"per_row_fake_quant": False, "symmetric_quant": True},
+        ]
+
+        for kwargs in kwargss:
+            ref_out = aten._fused_moving_avg_obs_fq_helper.default(*args, **kwargs)
+            meta_out = aten._fused_moving_avg_obs_fq_helper.default(*meta_args, **kwargs)
+
+            self.assertEqual(ref_out[0].size(), meta_out[0].size())
+            self.assertEqual(ref_out[0].stride(), meta_out[0].stride())
+            self.assertEqual(ref_out[1].size(), meta_out[1].size())
+            self.assertEqual(ref_out[1].stride(), meta_out[1].stride())
+
+
     def test_map_location_deserialize(self):
         import io
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 1ef9778832edc..5fec8475e50c9 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -252,15 +252,15 @@ def meta__fused_moving_avg_obs_fq_helper(
     quant_min,
     quant_max,
     ch_axis,
-    per_row_fake_quant,
-    symmetric_quant,
+    per_row_fake_quant=False,
+    symmetric_quant=False,
 ):
     check(
         ch_axis < self.dim(),
         lambda: "Error in fused_moving_avg_obs_fake_quant_cpu: ch_axis must be < self.dim()",
     )
-    mask = self.empty_like(dtype=torch.bool)
-    return (self.empty_like(), mask)
+    mask = torch.empty_like(self, dtype=torch.bool)
+    return (torch.empty_like(self), mask)
 
 
 def dot_check(self, other):

From e845dd17edac35ed7f8e6dc925538cfbb566ecc9 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Mon, 31 Oct 2022 19:31:56 +0000
Subject: [PATCH 0354/1922] Fix typos under docs directory (#88033)

This PR fixes typos in `.rst` and `.Doxyfile` files under docs directory

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88033
Approved by: https://github.com/soulitzer
---
 docs/caffe2/.Doxyfile-c                      | 2 +-
 docs/caffe2/.Doxyfile-python                 | 2 +-
 docs/cpp/source/notes/tensor_cuda_stream.rst | 2 +-
 docs/source/cuda._sanitizer.rst              | 2 +-
 docs/source/data.rst                         | 2 +-
 docs/source/fx.rst                           | 2 +-
 docs/source/quantization-support.rst         | 2 +-
 docs/source/quantization.rst                 | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/caffe2/.Doxyfile-c b/docs/caffe2/.Doxyfile-c
index c4873d63841ca..b30ab661d24cb 100644
--- a/docs/caffe2/.Doxyfile-c
+++ b/docs/caffe2/.Doxyfile-c
@@ -1490,7 +1490,7 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
diff --git a/docs/caffe2/.Doxyfile-python b/docs/caffe2/.Doxyfile-python
index 9d16671ffe3ba..514e580363996 100644
--- a/docs/caffe2/.Doxyfile-python
+++ b/docs/caffe2/.Doxyfile-python
@@ -1488,7 +1488,7 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
diff --git a/docs/cpp/source/notes/tensor_cuda_stream.rst b/docs/cpp/source/notes/tensor_cuda_stream.rst
index bdb66361d9a70..4940317713635 100644
--- a/docs/cpp/source/notes/tensor_cuda_stream.rst
+++ b/docs/cpp/source/notes/tensor_cuda_stream.rst
@@ -144,7 +144,7 @@ CUDA Stream Usage Examples
   // sum() on tensor0 use `myStream0` as current CUDA stream on device 0
   tensor0.sum();
 
-  // change the current device index to 1 by using CUDA device guard within a braket scope
+  // change the current device index to 1 by using CUDA device guard within a bracket scope
   {
     at::cuda::CUDAGuard device_guard{1};
     // create a tensor on device 1
diff --git a/docs/source/cuda._sanitizer.rst b/docs/source/cuda._sanitizer.rst
index 097d26a324f12..658b975693112 100644
--- a/docs/source/cuda._sanitizer.rst
+++ b/docs/source/cuda._sanitizer.rst
@@ -29,7 +29,7 @@ Here is an example of a simple synchronization error in PyTorch:
 
 The ``a`` tensor is initialized on the default stream and, without any synchronization
 methods, modified on a new stream. The two kernels will run concurrently on the same tensor,
-which might cause the second kernel to read unitialized data before the first one was able
+which might cause the second kernel to read uninitialized data before the first one was able
 to write it, or the first kernel might overwrite part of the result of the second.
 When this script is run on the commandline with:
 ::
diff --git a/docs/source/data.rst b/docs/source/data.rst
index db6957c8da787..de2d44920f573 100644
--- a/docs/source/data.rst
+++ b/docs/source/data.rst
@@ -65,7 +65,7 @@ in real time.
 
 See :class:`~torch.utils.data.IterableDataset` for more details.
 
-.. note:: When using an :class:`~torch.utils.data.IterableDataset` with
+.. note:: When using a :class:`~torch.utils.data.IterableDataset` with
           `multi-process data loading <Multi-process data loading_>`_. The same
           dataset object is replicated on each worker process, and thus the
           replicas must be configured differently to avoid duplicated data. See
diff --git a/docs/source/fx.rst b/docs/source/fx.rst
index 988ae081125c7..664fee10c67a7 100644
--- a/docs/source/fx.rst
+++ b/docs/source/fx.rst
@@ -36,7 +36,7 @@ What is an FX transform? Essentially, it's a function that looks like this.
         # Step 3: Construct a Module to return
         return torch.fx.GraphModule(m, graph)
 
-Your transform will take in an :class:`torch.nn.Module`, acquire a :class:`Graph`
+Your transform will take in a :class:`torch.nn.Module`, acquire a :class:`Graph`
 from it, do some modifications, and return a new
 :class:`torch.nn.Module`. You should think of the :class:`torch.nn.Module` that your FX
 transform returns as identical to a regular :class:`torch.nn.Module` -- you can pass it to another
diff --git a/docs/source/quantization-support.rst b/docs/source/quantization-support.rst
index 681e25b1172bc..d57a4b822f5c5 100644
--- a/docs/source/quantization-support.rst
+++ b/docs/source/quantization-support.rst
@@ -529,7 +529,7 @@ Quantized dtypes and quantization schemes
 Note that operator implementations currently only
 support per channel quantization for weights of the **conv** and **linear**
 operators. Furthermore, the input data is
-mapped linearly to the the quantized data and vice versa
+mapped linearly to the quantized data and vice versa
 as follows:
 
     .. math::
diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst
index 55fa6b0c604d2..4b87e8b181555 100644
--- a/docs/source/quantization.rst
+++ b/docs/source/quantization.rst
@@ -354,7 +354,7 @@ QAT API Example::
   # attach a global qconfig, which contains information about what kind
   # of observers to attach. Use 'fbgemm' for server inference and
   # 'qnnpack' for mobile inference. Other quantization configurations such
-  # as selecting symmetric or assymetric quantization and MinMax or L2Norm
+  # as selecting symmetric or asymmetric quantization and MinMax or L2Norm
   # calibration techniques can be specified here.
   model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
 

From 0df47c24ef73c63d8a9090d2c8a621d6bbc11221 Mon Sep 17 00:00:00 2001
From: KevinYuk <kevin.yu@intel.com>
Date: Mon, 31 Oct 2022 19:46:01 +0000
Subject: [PATCH 0355/1922] enable xpu group norm channels last support
 (#87680)

XPU would support channels last format for group norm operator, however, Pytorch converts all input tensor to contiguous format, which includes channels last tensor. Need Pytorch pass down this memory format hint to us.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87680
Approved by: https://github.com/albanD
---
 aten/src/ATen/native/group_norm.cpp | 2 +-
 tools/autograd/derivatives.yaml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp
index c12d8d2142ff9..22ff9ea5f0e86 100644
--- a/aten/src/ATen/native/group_norm.cpp
+++ b/aten/src/ATen/native/group_norm.cpp
@@ -183,7 +183,7 @@ Tensor group_norm(
 
   const Tensor kEmpty;
   auto memory_format = input.suggest_memory_format();
-  const auto& X = input.device().is_cpu() ?
+  const auto& X = input.device().is_cpu() || input.device().is_xpu() ?
       input.contiguous(memory_format) : input.contiguous();
   const auto& gamma = weight.defined() ? weight.contiguous() : kEmpty;
   const auto& beta = bias.defined() ? bias.contiguous() : kEmpty;
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 6945dae77a020..8950ce8ec64f8 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1160,7 +1160,7 @@
   rstd: not_implemented("native_layer_norm_backward rstd")
 
 - name: native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)
-  input, weight, bias: "GradMode::is_enabled() || grads[1].defined() || grads[2].defined() ? infinitely_differentiable_native_group_norm_backward(grads[0], grads[1], grads[2], input, result1, result2, weight, N, C, HxW, group, eps, grad_input_mask) : (grads[0].defined() ? native_group_norm_backward_symint(grads[0].is_contiguous() ? grads[0] : grads[0].contiguous(), input.is_contiguous() ? input : input.contiguous(), result1, result2, weight, N, C, HxW, group, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>())"
+  input, weight, bias: "GradMode::is_enabled() || grads[1].defined() || grads[2].defined() ? infinitely_differentiable_native_group_norm_backward(grads[0], grads[1], grads[2], input, result1, result2, weight, N, C, HxW, group, eps, grad_input_mask) : (grads[0].defined() ? native_group_norm_backward_symint(grads[0].device().is_xpu() ? grads[0] : grads[0].contiguous(), input.device().is_xpu() ? input : input.contiguous(), result1, result2, weight, N, C, HxW, group, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>())"
   result0: group_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, result1, result2, group)
   result1: group_norm_mean_jvp(input_t, result1, group)
   result2: group_norm_invstd_jvp(input_p, input_t, result1, result2, group)

From dd59da7ac19491d632b9ec6033a476f726e90116 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 31 Oct 2022 19:55:58 +0000
Subject: [PATCH 0356/1922] Revert "[ONNX] Move all torch.onnx.export related
 tests to test/onnx (#87292)"

This reverts commit e3e84830aade59722d819bc5fa01922239494790.

Reverted https://github.com/pytorch/pytorch/pull/87292 on behalf of https://github.com/weiwangmeta due to breaking internal test relating to quantization eager tests, see test/quantization/eager/test_quantize_eager_ptq.py test_lower_graph_linear and test_lower_graph_conv2d
---
 .github/merge_rules.yaml                      |   1 +
 test/jit/test_async.py                        |  15 ++
 test/{onnx => jit}/test_export_modes.py       |  89 +++-----
 test/jit/test_tracer.py                       |   8 +
 test/onnx/test_pytorch_onnx_no_runtime.py     | 190 +-----------------
 .../eager/test_quantize_eager_ptq.py          |  21 ++
 test/test_jit.py                              | 112 ++++++++++-
 7 files changed, 189 insertions(+), 247 deletions(-)
 rename test/{onnx => jit}/test_export_modes.py (65%)

diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index 6e9cba905e751..26b3eb437251a 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -6,6 +6,7 @@
   - docs/source/onnx*
   - docs/source/scripts/onnx/**
   - scripts/onnx/**
+  - test/jit/test_export_modes.py
   - test/onnx/**
   - tools/onnx/**
   - torch/_C/__init__.pyi.in
diff --git a/test/jit/test_async.py b/test/jit/test_async.py
index f8a1baea67133..d3769cd452d64 100644
--- a/test/jit/test_async.py
+++ b/test/jit/test_async.py
@@ -1,5 +1,6 @@
 # Owner(s): ["oncall: jit"]
 
+import io
 import os
 import sys
 
@@ -419,6 +420,20 @@ def fn(x):
         self.assertGraphContainsExactly(traced.graph, kind='aten::wait', num_kind_nodes=0)
         self.assertGraphContainsExactly(traced.graph, kind='aten::add', num_kind_nodes=2)
 
+    def test_trace_fork_wait_inline_onnx(self):
+        def fork_body(x):
+            return torch.neg(x), torch.neg(x)
+
+        class MyMod(torch.nn.Module):
+            def forward(self, x):
+                fut = torch.jit._fork(fork_body, x)
+                val = torch.jit._wait(fut)
+                return val[1]
+
+        # smoke test for ONNX export
+        f = io.BytesIO()
+        torch.onnx.export(MyMod(), (torch.rand(3, 4),), f)
+
     def test_trace_fork_wait_list_modulecalls(self):
         def add_one(input):
             return input + torch.ones(input.size())
diff --git a/test/onnx/test_export_modes.py b/test/jit/test_export_modes.py
similarity index 65%
rename from test/onnx/test_export_modes.py
rename to test/jit/test_export_modes.py
index 0f3024a2e366d..dbf10cddc059b 100644
--- a/test/onnx/test_export_modes.py
+++ b/test/jit/test_export_modes.py
@@ -1,25 +1,29 @@
-# Owner(s): ["module: onnx"]
+# Owner(s): ["oncall: jit"]
 
 import io
 import os
 import shutil
 import sys
 import tempfile
-import unittest
 
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 from torch.onnx import OperatorExportTypes
+from torch.autograd import Variable
 
 # Make the helper files in test/ importable
 pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(pytorch_test_dir)
-from torch.testing._internal import common_utils
+from torch.testing._internal.jit_utils import JitTestCase
+from torch.testing._internal.common_utils import skipIfNoLapack, skipIfCaffe2, skipIfNoCaffe2
 
+if __name__ == '__main__':
+    raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
+                       "\tpython test/test_jit.py TESTNAME\n\n"
+                       "instead.")
 
 # Smoke tests for export methods
-class TestExportModes(common_utils.TestCase):
+class TestExportModes(JitTestCase):
     class MyModel(nn.Module):
         def __init__(self):
             super(TestExportModes.MyModel, self).__init__()
@@ -31,66 +35,41 @@ def test_protobuf(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         f = io.BytesIO()
-        torch.onnx._export(
-            torch_model,
-            (fake_input),
-            f,
-            verbose=False,
-            export_type=torch.onnx.ExportTypes.PROTOBUF_FILE,
-        )
+        torch.onnx._export(torch_model, (fake_input), f, verbose=False,
+                           export_type=torch.onnx.ExportTypes.PROTOBUF_FILE)
 
     def test_zipfile(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         f = io.BytesIO()
-        torch.onnx._export(
-            torch_model,
-            (fake_input),
-            f,
-            verbose=False,
-            export_type=torch.onnx.ExportTypes.ZIP_ARCHIVE,
-        )
+        torch.onnx._export(torch_model, (fake_input), f, verbose=False,
+                           export_type=torch.onnx.ExportTypes.ZIP_ARCHIVE)
 
     def test_compressed_zipfile(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         f = io.BytesIO()
-        torch.onnx._export(
-            torch_model,
-            (fake_input),
-            f,
-            verbose=False,
-            export_type=torch.onnx.ExportTypes.COMPRESSED_ZIP_ARCHIVE,
-        )
+        torch.onnx._export(torch_model, (fake_input), f, verbose=False,
+                           export_type=torch.onnx.ExportTypes.COMPRESSED_ZIP_ARCHIVE)
 
     def test_directory(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         d = tempfile.mkdtemp()
-        torch.onnx._export(
-            torch_model,
-            (fake_input),
-            d,
-            verbose=False,
-            export_type=torch.onnx.ExportTypes.DIRECTORY,
-        )
+        torch.onnx._export(torch_model, (fake_input), d, verbose=False,
+                           export_type=torch.onnx.ExportTypes.DIRECTORY)
         shutil.rmtree(d)
 
     def test_onnx_multiple_return(self):
         @torch.jit.script
         def foo(a):
             return (a, a)
-
         f = io.BytesIO()
         x = torch.ones(3)
-        torch.onnx.export(foo, (x,), f)
-
-    # TODO(87318): Can't pass even with Caffe2
-    @unittest.skip(
-        "RuntimeError: ScalarType UNKNOWN_SCALAR is an unexpected tensor scalar type"
-    )
-    @common_utils.skipIfNoCaffe2
-    @common_utils.skipIfNoLapack
+        torch.onnx._export(foo, (x,), f)
+
+    @skipIfNoCaffe2
+    @skipIfNoLapack
     def test_caffe2_aten_fallback(self):
         class ModelWithAtenNotONNXOp(nn.Module):
             def forward(self, x, y):
@@ -101,15 +80,13 @@ def forward(self, x, y):
         x = torch.rand(3, 4)
         y = torch.rand(3, 4)
         torch.onnx.export_to_pretty_string(
-            ModelWithAtenNotONNXOp(),
-            (x, y),
+            ModelWithAtenNotONNXOp(), (x, y),
             add_node_names=False,
             do_constant_folding=False,
-            operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
-        )
+            operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK)
 
-    @common_utils.skipIfCaffe2
-    @common_utils.skipIfNoLapack
+    @skipIfCaffe2
+    @skipIfNoLapack
     def test_aten_fallback(self):
         class ModelWithAtenNotONNXOp(nn.Module):
             def forward(self, x, y):
@@ -120,14 +97,12 @@ def forward(self, x, y):
         x = torch.rand(3, 4)
         y = torch.rand(3, 4)
         torch.onnx.export_to_pretty_string(
-            ModelWithAtenNotONNXOp(),
-            (x, y),
+            ModelWithAtenNotONNXOp(), (x, y),
             add_node_names=False,
             do_constant_folding=False,
             operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
             # support for linalg.qr was added in later op set versions.
-            opset_version=9,
-        )
+            opset_version=9)
 
     # torch.fmod is using to test ONNX_ATEN.
     # If you plan to remove fmod from aten, or found this test failed.
@@ -140,13 +115,7 @@ def forward(self, x, y):
         x = torch.randn(3, 4, dtype=torch.float32)
         y = torch.randn(3, 4, dtype=torch.float32)
         torch.onnx.export_to_pretty_string(
-            ModelWithAtenFmod(),
-            (x, y),
+            ModelWithAtenFmod(), (x, y),
             add_node_names=False,
             do_constant_folding=False,
-            operator_export_type=OperatorExportTypes.ONNX_ATEN,
-        )
-
-
-if __name__ == "__main__":
-    common_utils.run_tests()
+            operator_export_type=OperatorExportTypes.ONNX_ATEN)
diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py
index b36003a2b9209..50fdec94b9fc0 100644
--- a/test/jit/test_tracer.py
+++ b/test/jit/test_tracer.py
@@ -1124,6 +1124,14 @@ def foo(x, w):
         # With `check_trace=True` it will run with `@torch.no_grad()` and break assert.
         torch.jit.trace(foo, (x, w), check_trace=False)
 
+    def test_trace_detach_onnx_erase(self):
+        class Mod(torch.nn.Module):
+            def forward(self, x, w):
+                return torch.matmul(x, w).detach()
+
+        torch.onnx.export_to_pretty_string(
+            Mod(), (torch.rand(3, 4), torch.rand(4, 5)))
+
     def test_trace_slice_full_dim(self):
         def foo(x):
             return x[0:5, 0] + 1.0
diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index b50d316df8ff4..5f2ce3fa657a1 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -7,11 +7,8 @@
 import itertools
 import unittest
 import unittest.mock
-import warnings
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
-import numpy as np
-
 import onnx
 import onnx.numpy_helper
 
@@ -21,7 +18,7 @@
 from torch.onnx import symbolic_helper, utils
 from torch.onnx._globals import GLOBALS
 from torch.onnx._internal import registration
-from torch.testing._internal import common_utils, jit_utils
+from torch.testing._internal import common_utils
 
 
 def export_to_onnx(
@@ -79,7 +76,7 @@ def forward(self, x):
 
         x = torch.ones(3, 3)
         f = io.BytesIO()
-        torch.onnx.export(AddmmModel(), x, f, verbose=False)
+        torch.onnx._export(AddmmModel(), x, f, verbose=False)
 
     def test_onnx_transpose_incomplete_tensor_type(self):
         # Smoke test to get us into the state where we are attempting to export
@@ -166,7 +163,7 @@ def forward(self, x):
         mte = ModuleToExport()
         f = io.BytesIO()
         with self.assertRaisesRegex(RuntimeError, "Couldn't export Python"):
-            torch.onnx.export(mte, (torch.zeros(1, 2, 3),), f, verbose=False)
+            torch.onnx._export(mte, (torch.zeros(1, 2, 3),), f, verbose=False)
 
     def test_onnx_export_script_inline_trace(self):
         class ModuleToInline(torch.nn.Module):
@@ -430,11 +427,7 @@ def forward(self, x):
         onnx_model = export_to_onnx(
             MyClip(),
             torch.randn(3, 4, requires_grad=True),
-            custom_ops=[
-                common_utils.custom_op(
-                    "aten::clamp", bad_clamp, GLOBALS.export_onnx_opset_version
-                )
-            ],
+            custom_ops=[common_utils.custom_op("aten::clamp", bad_clamp, 9)],
             operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
         )
         self.assertAtenOp(onnx_model, "clamp", "Tensor")
@@ -784,181 +777,6 @@ def forward(self, x):
             model, inputs, f, dynamic_axes={"x": [0, 1]}, input_names=["x"]
         )
 
-    def test_dropout_script(self):
-
-        eg = torch.zeros(1, 2, 3, requires_grad=True)
-
-        @jit_utils._trace(eg)
-        def foo(x):
-            x = torch.neg(x)
-            return F.dropout(x)
-
-        class MyDrop(torch.nn.Module):
-            def forward(self, x):
-                return foo(x)
-
-        f = io.BytesIO()
-        with warnings.catch_warnings(record=True):
-            torch.onnx.export(MyDrop(), (eg,), f, verbose=False)
-
-    def test_pack_padded_pad_packed_trace(self):
-        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
-
-        T, B, C = 3, 5, 7
-
-        class PadPackedWrapper(torch.nn.Module):
-            def __init__(self):
-                super(PadPackedWrapper, self).__init__()
-
-            def forward(self, x, seq_lens):
-                x = pack_padded_sequence(x, seq_lens)
-                x, _ = pad_packed_sequence(x)
-                return x
-
-        x = np.ones((T, B, C))
-        seq_lens = np.array([3, 3, 2, 2, 1], dtype=np.int32)
-        # set padding value so we can test equivalence
-        for b in range(B):
-            if seq_lens[b] < T:
-                x[seq_lens[b] :, b, :] = 0
-        seq_lens = torch.from_numpy(seq_lens)
-        x = torch.autograd.Variable(torch.from_numpy(x), requires_grad=True)
-
-        m = PadPackedWrapper()
-        m_traced = torch.jit.trace(
-            m,
-            (
-                x,
-                seq_lens,
-            ),
-        )
-
-        y = m(x, seq_lens)
-        loss = torch.sum(y)
-        loss.backward()
-        grad = x.grad.clone()
-        x.grad.zero_()
-
-        y_traced = m_traced(x, seq_lens)
-        loss_traced = torch.sum(y_traced)
-        loss_traced.backward()
-        grad_traced = x.grad.clone()
-
-        self.assertEqual(y_traced, x)
-        self.assertEqual(y_traced, y)
-        self.assertEqual(grad, grad_traced)
-
-        f = io.BytesIO()
-        torch.onnx.export(m, (x, seq_lens), f, verbose=False)
-
-    # Suppression: ONNX warns when exporting RNNs because of potential batch size mismatch.
-    @common_utils.suppress_warnings
-    def test_rnn_trace_override(self):
-        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
-
-        num_layers = 3
-        T, B, C = 11, 5, 7
-
-        class RNNTraceWrapper(torch.nn.Module):
-            def __init__(self, cell_type):
-                super(RNNTraceWrapper, self).__init__()
-                if cell_type == "RNN":
-                    self.rnn = torch.nn.RNN(
-                        input_size=C, hidden_size=C, num_layers=num_layers
-                    )
-                elif cell_type == "LSTM":
-                    self.rnn = torch.nn.LSTM(
-                        input_size=C, hidden_size=C, num_layers=num_layers
-                    )
-                elif cell_type == "GRU":
-                    self.rnn = torch.nn.GRU(
-                        input_size=C, hidden_size=C, num_layers=num_layers
-                    )
-
-            def forward(self, x, seq_lens):
-                x = pack_padded_sequence(x, seq_lens)
-                x, _ = self.rnn(x)
-                x, _ = pad_packed_sequence(x)
-                return x
-
-        for cell_type in ["RNN", "LSTM", "GRU"]:
-            x = torch.ones(T, B, C, requires_grad=True)
-            seq_lens = torch.from_numpy(np.array([11, 3, 2, 2, 1], dtype=np.int32))
-
-            m = RNNTraceWrapper(cell_type)
-            m_traced = torch.jit.trace(
-                m,
-                (
-                    x,
-                    seq_lens,
-                ),
-            )
-
-            y = m(x, seq_lens)
-            loss = torch.sum(y)
-            loss.backward()
-            grad = x.grad.clone()
-            x.grad.zero_()
-
-            y_traced = m_traced(x, seq_lens)
-            loss_traced = torch.sum(y_traced)
-            loss_traced.backward()
-            grad_traced = x.grad.clone()
-
-            self.assertEqual(y_traced, y)
-            self.assertEqual(grad, grad_traced)
-
-            f = io.BytesIO()
-            torch.onnx.export(m, (x, seq_lens), f, verbose=False)
-
-    def test_trace_fork_wait_inline_onnx(self):
-        def fork_body(x):
-            return torch.neg(x), torch.neg(x)
-
-        class MyMod(torch.nn.Module):
-            def forward(self, x):
-                fut = torch.jit._fork(fork_body, x)
-                val = torch.jit._wait(fut)
-                return val[1]
-
-        # smoke test for ONNX export
-        f = io.BytesIO()
-        torch.onnx.export(MyMod(), (torch.rand(3, 4),), f)
-
-    def test_trace_detach_onnx_erase(self):
-        class Mod(torch.nn.Module):
-            def forward(self, x, w):
-                return torch.matmul(x, w).detach()
-
-        torch.onnx.export_to_pretty_string(Mod(), (torch.rand(3, 4), torch.rand(4, 5)))
-
-    def _test_lower_graph_impl(self, model, data):
-        model.qconfig = torch.ao.quantization.default_qconfig
-        model = torch.ao.quantization.prepare(model)
-        model = torch.ao.quantization.convert(model)
-
-        _ = model(data)
-        input_names = ["x"]
-
-        def _export_to_onnx(model, input, input_names):
-            traced = torch.jit.trace(model, input)
-            buf = io.BytesIO()
-            torch.jit.save(traced, buf)
-            buf.seek(0)
-
-            model = torch.jit.load(buf)
-            f = io.BytesIO()
-            torch.onnx.export(
-                model,
-                input,
-                f,
-                input_names=input_names,
-                operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
-                opset_version=9,
-            )
-
-        _export_to_onnx(model, data, input_names)
-
 
 if __name__ == "__main__":
     common_utils.run_tests()
diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py
index 3a38a597b352d..7d87cc520ba04 100644
--- a/test/quantization/eager/test_quantize_eager_ptq.py
+++ b/test/quantization/eager/test_quantize_eager_ptq.py
@@ -71,6 +71,7 @@
 
 # Standard library
 from typing import Tuple
+import io
 import unittest
 import numpy as np
 
@@ -1443,6 +1444,26 @@ def forward(self, indices, offsets, linear_in):
         self.assertTrue('DynamicQuantizedLinear' in str(q_model))
 
 class TestQuantizeEagerONNXExport(JitTestCase):
+    def _test_lower_graph_impl(self, model, data):
+        model.qconfig = torch.ao.quantization.default_qconfig
+        model = torch.ao.quantization.prepare(model)
+        model = torch.ao.quantization.convert(model)
+
+        outputs = model(data)
+        input_names = ["x"]
+
+        def export_to_onnx(model, input, input_names):
+            traced = torch.jit.trace(model, input)
+            buf = io.BytesIO()
+            torch.jit.save(traced, buf)
+            buf.seek(0)
+
+            model = torch.jit.load(buf)
+            f = io.BytesIO()
+            torch.onnx.export(model, input, f, input_names=input_names,
+                              operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
+                              opset_version=9)
+        onnx_model = export_to_onnx(model, data, input_names)
 
     @skipIfNoFBGEMM
     @skipIfNoCaffe2
diff --git a/test/test_jit.py b/test/test_jit.py
index 13c27b0efa555..b1425a4ed71ca 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -20,6 +20,7 @@
 from jit.test_autodiff import TestAutodiffJit  # noqa: F401
 from jit.test_autodiff_subgraph_slicing import TestAutodiffSubgraphSlicing  # noqa: F401
 from jit.test_custom_operators import TestCustomOperators  # noqa: F401
+from jit.test_export_modes import TestExportModes  # noqa: F401
 from jit.test_graph_rewrite_passes import TestGraphRewritePasses  # noqa: F401
 from jit.test_class_type import TestClassType  # noqa: F401
 from jit.test_builtins import TestBuiltins, TestTensorBuiltins  # noqa: F401
@@ -96,7 +97,7 @@
 from torch.testing._internal.common_jit import check_against_reference
 from torch.testing._internal.common_utils import run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \
     suppress_warnings, BUILD_WITH_CAFFE2, IS_SANDCASTLE, GRAPH_EXECUTOR, ProfilingMode, TestCase, \
-    freeze_rng_state, slowTest, TemporaryFileName, \
+    freeze_rng_state, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy, \
     enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs, \
     skipIfCrossRef, IS_MACOS, skipIfTorchDynamo
 from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \
@@ -5912,6 +5913,23 @@ def test_fuser_multiple_blocks(this, that, theother, meme):
 
         self.assertEqual(cu.test_fuser_multiple_blocks(*inputs), outputs)
 
+    def test_dropout_script(self):
+
+        eg = torch.zeros(1, 2, 3, requires_grad=True)
+
+        @_trace(eg)
+        def foo(x):
+            x = torch.neg(x)
+            return F.dropout(x)
+
+        class MyDrop(nn.Module):
+            def forward(self, x):
+                return foo(x)
+
+        f = io.BytesIO()
+        with warnings.catch_warnings(record=True):
+            torch.onnx.export(MyDrop(), (eg,), f, verbose=False)
+
     @unittest.skip("RuntimeError: VariableType::ID() not implemented")
     def test_cast(self):
         script = '''
@@ -9762,6 +9780,50 @@ def forward(self, rep):
             m = M2()
             m(torch.zeros(4, 3))
 
+    @skipIfCompiledWithoutNumpy
+    def test_pack_padded_pad_packed_trace(self):
+        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+        T, B, C = 3, 5, 7
+
+        class PadPackedWrapper(torch.nn.Module):
+            def __init__(self):
+                super(PadPackedWrapper, self).__init__()
+
+            def forward(self, x, seq_lens):
+                x = pack_padded_sequence(x, seq_lens)
+                x, _ = pad_packed_sequence(x)
+                return x
+
+        x = np.ones((T, B, C))
+        seq_lens = np.array([3, 3, 2, 2, 1], dtype=np.int32)
+        # set padding value so we can test equivalence
+        for b in range(B):
+            if seq_lens[b] < T:
+                x[seq_lens[b]:, b, :] = 0
+        seq_lens = torch.from_numpy(seq_lens)
+        x = torch.autograd.Variable(torch.from_numpy(x), requires_grad=True)
+
+        m = PadPackedWrapper()
+        m_traced = torch.jit.trace(m, (x, seq_lens,))
+
+        y = m(x, seq_lens)
+        loss = torch.sum(y)
+        loss.backward()
+        grad = x.grad.clone()
+        x.grad.zero_()
+
+        y_traced = m_traced(x, seq_lens)
+        loss_traced = torch.sum(y_traced)
+        loss_traced.backward()
+        grad_traced = x.grad.clone()
+
+        self.assertEqual(y_traced, x)
+        self.assertEqual(y_traced, y)
+        self.assertEqual(grad, grad_traced)
+
+        f = io.BytesIO()
+        torch.onnx._export(m, (x, seq_lens), f, verbose=False)
+
     def test_script_pack_padded_sequence(self):
         from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 
@@ -9962,6 +10024,54 @@ def forward(self, input: torch.Tensor):
         m_scripted = torch.jit.script(m)
         self.assertEqual(m_scripted(torch.tensor(1)), torch.tensor(246))
 
+    # Suppression: ONNX warns when exporting RNNs because of potential batch size mismatch.
+    @suppress_warnings
+    @skipIfCompiledWithoutNumpy
+    def test_rnn_trace_override(self):
+        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+        num_layers = 3
+        T, B, C = 11, 5, 7
+
+        class RNNTraceWrapper(torch.nn.Module):
+            def __init__(self, cell_type):
+                super(RNNTraceWrapper, self).__init__()
+                if cell_type == 'RNN':
+                    self.rnn = torch.nn.RNN(input_size=C, hidden_size=C, num_layers=num_layers)
+                elif cell_type == 'LSTM':
+                    self.rnn = torch.nn.LSTM(input_size=C, hidden_size=C, num_layers=num_layers)
+                elif cell_type == 'GRU':
+                    self.rnn = torch.nn.GRU(input_size=C, hidden_size=C, num_layers=num_layers)
+
+            def forward(self, x, seq_lens):
+                x = pack_padded_sequence(x, seq_lens)
+                x, _ = self.rnn(x)
+                x, _ = pad_packed_sequence(x)
+                return x
+
+        for cell_type in ['RNN', 'LSTM', 'GRU']:
+            x = torch.ones(T, B, C, requires_grad=True)
+            seq_lens = torch.from_numpy(np.array([11, 3, 2, 2, 1], dtype=np.int32))
+
+            m = RNNTraceWrapper(cell_type)
+            m_traced = torch.jit.trace(m, (x, seq_lens,))
+
+            y = m(x, seq_lens)
+            loss = torch.sum(y)
+            loss.backward()
+            grad = x.grad.clone()
+            x.grad.zero_()
+
+            y_traced = m_traced(x, seq_lens)
+            loss_traced = torch.sum(y_traced)
+            loss_traced.backward()
+            grad_traced = x.grad.clone()
+
+            self.assertEqual(y_traced, y)
+            self.assertEqual(grad, grad_traced)
+
+            f = io.BytesIO()
+            torch.onnx._export(m, (x, seq_lens), f, verbose=False)
+
     def test_python_call_non_tensor(self):
         def foo(a, b, c):
             # type: (Tensor, int, Tuple[Tensor, int]) -> Tuple[int, Tensor]

From 9a927ced560d464cdf81d3cef0f9378567e0c878 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grisha.sizov@gmail.com>
Date: Mon, 31 Oct 2022 19:59:35 +0000
Subject: [PATCH 0357/1922] Enable `src_mask` in fast path of
 `TransformerEncoderLayer ` (#87377)

## Issues
Fixes https://github.com/pytorch/pytorch/issues/81129#issuecomment-1179435674

## Description

Passing a 2D attention mask `src_mask` into the fast path of `TransformerEncoderLayer` in CPU was causing an error and so was disabled in https://github.com/pytorch/pytorch/pull/81277. This PR unrolls this fix, enabling `src_mask` on the fast path:

- Either attention mask `src_mask` of shape `(L, L)` or padding mask `src_key_padding_mask` of shape `(B, L)` are now allowed on the CPU fast path. If softmax is applied along the last dimension (as in multi-head attention), these masks are processed without expanding them to 4D. Instead, when iterating through the input, `Softmax.cpp::host_softmax` converts the index to match the mask dimensions, depending on the type.
- If softmax is applied along the dimension other than the last, `Softmax.cpp::masked_softmax_cpu` expands masks to 4D, converting them to `mask_type=2`. Theoretically one could also add special optimized cases for `dim=0, 1, 2` and process them without mask expansion, but I don't know how often is that used

## Tests:
- `test_transformerencoderlayer_fast_path` is extended to cover both attention mask and padding mask
- `test_masked_softmax_mask_types_0_1` is added to ensure results from CPU softmax with attention and padding masks match the explicit slow calculation
- `test_masked_softmax_devices_parity` is added to ensure results from masked softmax on CPU and CUDA match

## Note
I had to replace `float` with `torch.get_default_dtype()` in a couple of tests for the following reason:
- `test_nn.py` [sets the default type to `torch.double`](https://github.com/pytorch/pytorch/blob/master/test/test_nn.py#L24-L26)
- If I execute `test_nn.py` and `test_transformers.py` in one `pytest` run, this default still holds for transformer tests
- Some tests in `test_transformers.py` which were previously following the slow path now switched to fast path, and hard-coded `float` started clashing with default `double`

Let me know if there is a better way around it - or maybe I'm not supposed to run tests with `pytest` like this

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87377
Approved by: https://github.com/mikekgfb, https://github.com/weiwangmeta, https://github.com/malfet
---
 aten/src/ATen/native/SoftMax.cpp              |  62 +++++++++--
 .../ATen/native/transformers/attention.cpp    |   9 --
 test/test_nn.py                               | 103 +++++++++++++++++-
 test/test_transformers.py                     |   4 +-
 torch/nn/modules/activation.py                |  10 +-
 torch/nn/modules/transformer.py               |   9 +-
 6 files changed, 164 insertions(+), 33 deletions(-)

diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp
index 0e3dafb24e9e8..0332f57e9e23e 100644
--- a/aten/src/ATen/native/SoftMax.cpp
+++ b/aten/src/ATen/native/SoftMax.cpp
@@ -162,9 +162,6 @@ void host_softmax(
     int64_t mask_type = mask_type_.value();
     // If mask_type == 2, then mask_.sizes() must equal input_.sizes()
     TORCH_CHECK((mask_type == 0) || (mask_type == 1) || (mask_type == 2), "Mask Type should be 0 (src_mask) or 1 (src_key_padding_mask), or 2 (default_mask)");
-
-    // TODO: Add support for TxT src_mask
-    TORCH_CHECK(mask_type != 0, "src_mask not currently supported on CPU");
   }
 
   int64_t outer_size = 1;
@@ -194,8 +191,22 @@ void host_softmax(
               output_data_base + outer_idx * outer_stride + inner_idx;
           bool* mask_data = nullptr;
           if (MaskedSoftMax) {
-            mask_data = mask_data_base + outer_idx * outer_stride + inner_idx;
-          }
+            // Process mask differently depending on the type:
+            // For a generic mask of mask_type == 2, mask shape is the same as the input shape,
+            // so indexing is the same.
+            auto mask_outer_idx = outer_idx;
+            if (mask_type_ == 0) {
+                // Optimized case: attention mask of shape LxL
+                // outer_idx goes over BxHxL, mask_outer_idx goes over L.
+                mask_outer_idx = outer_idx % input.size(2);
+            } else if (mask_type_ == 1) {
+                // Optimized case: padding mask of shape BxL
+                // outer_idx goes over BxHxL, mask_outer_idx goes over B.
+                mask_outer_idx = outer_idx / (input.size(1) * input.size(2));
+            }
+
+            mask_data = mask_data_base + mask_outer_idx * outer_stride + inner_idx;
+          };
 
           // Calc max in softmax dim
           bool is_meaningful_max = false;
@@ -577,15 +588,48 @@ Tensor log_softmax(const Tensor& self, Dimname dim, optional<ScalarType> dtype)
 }
 
 Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const c10::optional<int64_t> dim_, const c10::optional<int64_t> mask_type_) {
-  TORCH_CHECK(
-      input_.sizes() == mask_.sizes(), "Mask shape should match input shape");
+
+  auto mask = mask_.contiguous();
+  auto mask_type = mask_type_; // Mask type might get transformed below
+
   TORCH_CHECK(
       mask_.scalar_type() == ScalarType::Bool,
       "Mask should be a boolean tensor");
 
+  if ((mask.dim() != 2) || (input_.dim() != 4)) {
+    // Mask types 0 and 1 are only allowed for 2D masks and 4D inputs
+    mask_type = 2;
+  }
+
+  if (mask_type == 2) {
+      TORCH_CHECK(input_.sizes() == mask.sizes(),
+                  "For mask_type == 2 mask shape should match input shape")
+  } else if (mask_type == 1) {
+      // Padding mask of shape (B, L)
+      TORCH_CHECK((input_.sizes()[0] == mask.sizes()[0]) && (input_.sizes()[2] == mask.sizes()[1]),
+                  "For mask_type == 1 mask shape should be (B, L)");
+      if (dim_ != input_.dim() - 1) {
+            // We only process padding mask in the optimized way if softmax is applied along the last dimesion,
+            // otherwise we need to expand the mask into a generic 4D one
+            mask = mask_.view({input_.sizes()[0], 1, 1, input_.sizes()[2]});
+            mask = mask.expand(input_.sizes()).contiguous();
+            mask_type = 2;
+      }
+  } else if (mask_type == 0) {
+      // Attention mask of shape (L, L)
+      TORCH_CHECK((mask.dim() == 2) && (input_.sizes()[2] == mask.sizes()[0]) && (input_.sizes()[2] == mask.sizes()[1]),
+                  "For mask_type == 0 mask shape should be (L, L)");
+      if (dim_ != input_.dim() - 1) {
+            // We only process attention mask in a optimized way if softmax is applied along the last dimesion,
+            // otherwise we need to expand the mask into a generic 4D one
+            mask = mask.view({1, 1, input_.sizes()[2], input_.sizes()[2]});
+            mask = mask.expand(input_.sizes()).contiguous();
+            mask_type = 2;
+      }
+  }
+
   Tensor output = at::empty_like(input_, input_.options());
   auto input = input_.contiguous();
-  auto mask = mask_.contiguous();
   int64_t dim = dim_.has_value() ? dim_.value() : input.dim() - 1;
   dim = maybe_wrap_dim(dim, input_.dim());
 
@@ -599,7 +643,7 @@ Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const c10::
             scalar_t,
             false /* LogSoftMax */,
             true /* MaskedSoftMax */>(
-            output, input, dim, mask.data_ptr<bool>(), mask_type_);
+            output, input, dim, mask.data_ptr<bool>(), mask_type);
       });
   return output;
 }
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 5af9eaebb6c3a..55c71f9fd0645 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -140,15 +140,6 @@ Tensor masked_softmax(
         "negatively affect performance. Prefer to use a boolean mask directly.");
     attn_mask = attn_mask->to(at::kBool);
   }
-  if (attn_scores.is_cpu() && attn_mask && attn_mask->dim() == 2) {
-    // TODO: CPU path does not support transformer mask yet.
-    const auto batch_size = attn_scores.sizes()[0];
-    const auto seq_len = attn_scores.sizes()[3];
-    TORCH_CHECK(attn_mask->sizes()[0] == batch_size);
-    TORCH_CHECK(attn_mask->sizes()[1] == seq_len);
-    attn_mask = attn_mask->view({batch_size, 1, 1, seq_len});
-    attn_mask = at::expand_inplace(attn_scores, *attn_mask)->contiguous();
-  }
   if (attn_mask) {
     return _masked_softmax(attn_scores, *attn_mask, attn_scores.dim() - 1, mask_type);
   } else {
diff --git a/test/test_nn.py b/test/test_nn.py
index 3e281e02db16a..cab9db75cdbf6 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -13108,6 +13108,93 @@ def _slow_masked_softmax(self, input, mask):
         s = exp.sum(dim=3, keepdim=True).expand(exp.size())
         return exp / s
 
+    def test_masked_softmax_mask_types_0_1(self, device):
+        # Test that mask type 0 (LxL attention mask) and mask type 1 (BxL padding mask)
+        # are processed correctly on the fast path and the results match explicit slow
+        # calculation.
+        sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]
+
+        for (B, num_heads, L) in sizes:
+
+            # mask_type == 0 => attention mask of shape LxL
+            src_mask_orig = torch.randint(0, 2, (L, L)).bool()
+            src_mask = src_mask_orig.reshape(1, 1, L, L).expand(B, num_heads, L, L).bool()
+
+            # mask_type == 1 => padding mask of shape BxL
+            src_key_padding_mask_orig = torch.randint(0, 2, (B, L)).bool()
+            src_key_padding_mask = src_key_padding_mask_orig.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool()
+
+            masks = [(src_mask_orig, src_mask, 0), (src_key_padding_mask_orig, src_key_padding_mask, 1)]
+            for dim in [0, 3]:
+                for mask_orig, mask, mask_type in masks:
+                    if (self.device_type == "cuda") and (num_heads % 2) and (mask_type == 1):
+                        # CUDA path doesn't support padding mask when the number of heads is odd
+                        continue
+                    input = torch.randn((B, num_heads, L, L))
+                    if (self.device_type == "cuda"):
+                        input = input.cuda()
+                        mask = mask.cuda()
+                        mask_orig = mask_orig.cuda()
+                    native_res = torch._masked_softmax(input, mask_orig, dim, mask_type)
+                    mask = ~mask
+
+                    def slow_masked_softmax(input, mask):
+                        exp = torch.exp(input)
+                        exp = exp * mask
+                        s = exp.sum(dim=dim, keepdim=True).expand(exp.size())
+                        return exp / s
+
+                    pt_res = slow_masked_softmax(input, mask)
+                    pt_res = torch.nan_to_num(pt_res)
+
+                    mask_not = mask.logical_not()
+                    # In result, should only fill the entirely masked out rows since those are non-deterministic (*may* be 0)
+                    # Converts rows with all True's to False
+                    mask_out = mask_not.all(dim, keepdim=True).expand(mask_not.shape)
+                    self.assertEqual(
+                        pt_res.masked_fill(mask_out, 0),
+                        native_res.masked_fill(mask_out, 0),
+                        exact_dtype=True
+                    )
+
+    @onlyCUDA
+    def test_masked_softmax_devices_parity(self):
+        # Test that softmax with mask type 0 (LxL attention mask) and mask type 1 (BxL padding mask)
+        # gives the same result on CPU and on CUDA
+
+        sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]
+        for (B, num_heads, L) in sizes:
+            # mask_type == 0 => attention mask of shape LxL
+            src_mask = torch.randint(0, 2, (L, L)).bool()
+            # mask_type == 1 => padding mask of shape BxL
+            src_key_padding_mask = torch.randint(0, 2, (B, L)).bool()
+            masks = [(src_mask, 0), (src_key_padding_mask, 1)]
+            input = torch.randn((B, num_heads, L, L))
+            for dim in [0, 3]:
+                for mask, mask_type in masks:
+                    if (num_heads % 2) and (mask_type == 1):
+                        # CUDA path doesn't support padding mask when the number of heads is odd
+                        continue
+
+                    def softmax_on_device(mask, input, device):
+                        # Compute softmax on a given device
+                        input_device = input.to(device)
+                        mask_device = mask.to(device)
+                        softmax_res = torch._masked_softmax(input_device, mask_device, dim, mask_type)
+                        if mask_type == 0:
+                            mask_expanded = mask_device.reshape(1, 1, L, L).expand(B, num_heads, L, L).bool()
+                        else:
+                            mask_expanded = mask_device.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool()
+                        # In result, should only fill the entirely masked out rows since those are non-deterministic (*may* be 0)
+                        # Fill rows with all True's with 0
+                        mask_out = mask_expanded.all(dim, keepdim=True).expand(mask_expanded.shape)
+                        softmax_res = softmax_res.masked_fill(mask_out, 0)
+                        return softmax_res
+
+                    cpu_res = softmax_on_device(mask, input, "cpu")
+                    cuda_res = softmax_on_device(mask, input, "cuda")
+                    self.assertEqual(cpu_res, cuda_res, exact_dtype=True)
+
     def test_masked_softmax(self, device):
         sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]
         for (B, num_heads, L) in sizes:
@@ -15463,13 +15550,25 @@ def test_multihead_attn_in_proj_weight_none(self, device, dtype):
     @onlyCPU
     @dtypes(torch.double)
     def test_transformerencoderlayer_fast_path(self, device, dtype):
+        """
+        Test transformer fast path on CPU with different valid mask types and shapes
+        """
         model = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True, device=device, dtype=dtype)
+        model.eval()
+
+        # Batched inputs
         src = torch.rand(32, 10, 512)
+
+        # Attention mask of shape (src_len, src_len)
         src_mask = torch.zeros(10, 10).to(torch.bool)
+        with torch.no_grad():
+            model(src, src_mask=src_mask)
 
-        model.eval()
+        # Padding mask of shape (batch_size, src_len)
+        src_key_padding_mask = torch.zeros(32, 10).to(torch.bool)
         with torch.no_grad():
-            model(src, src_mask)
+            model(src, src_key_padding_mask=src_key_padding_mask)
+
 
     @dtypes(torch.float)
     @dtypesIfCUDA(torch.half, torch.float)
diff --git a/test/test_transformers.py b/test/test_transformers.py
index ceb9213e9b037..4657cf730b2c3 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -215,7 +215,7 @@ def test_transformerencoder_fastpath(self, device, use_torchscript, enable_neste
         ]
         input_mask_pairs = [
             (
-                torch.tensor(pair[0], device=device, dtype=torch.float32),  # float input
+                torch.tensor(pair[0], device=device, dtype=torch.get_default_dtype()),  # float input
                 torch.tensor(pair[1], device=device, dtype=torch.bool)  # bool mask
             ) for pair in input_mask_pairs
         ]
@@ -267,7 +267,7 @@ def test_transformerencoder_square_input(self, with_no_grad, training, enable_ne
             model = model.train()
         else:
             model = model.eval()
-        x = torch.arange(0, 16).reshape(2, 2, 4).to(torch.float).to(device)
+        x = torch.arange(0, 16).reshape(2, 2, 4).to(torch.get_default_dtype()).to(device)
         src_mask = torch.Tensor([[0, 1], [0, 0]]).to(torch.bool).to(device)
 
         if with_no_grad:
diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index 666c31874f9a3..293760b8bb706 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -1093,12 +1093,10 @@ def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: O
             why_not_fast_path = "add_zero_attn was enabled"
         elif not self._qkv_same_embed_dim:
             why_not_fast_path = "_qkv_same_embed_dim was not True"
-        elif attn_mask is not None:
-            why_not_fast_path = "attn_mask was not None"
-        elif query.is_nested and key_padding_mask is not None:
-            why_not_fast_path = "key_padding_mask is not supported with NestedTensor input"
-        elif self.num_heads % 2 == 1:
-            why_not_fast_path = "num_heads is odd"
+        elif query.is_nested and (key_padding_mask is not None or attn_mask is not None):
+            why_not_fast_path = "key_padding_mask and attn_mask are not supported with NestedTensor input"
+        elif not query.is_nested and key_padding_mask is not None and attn_mask is not None:
+            why_not_fast_path = "key_padding_mask and attn_mask were both supplied"
         elif torch.is_autocast_enabled():
             why_not_fast_path = "autocast is enabled"
 
diff --git a/torch/nn/modules/transformer.py b/torch/nn/modules/transformer.py
index 6bbff8a24dca6..34dde6fc224f1 100644
--- a/torch/nn/modules/transformer.py
+++ b/torch/nn/modules/transformer.py
@@ -466,15 +466,14 @@ def forward(self, src: Tensor, src_mask: Optional[Tensor] = None,
             why_not_sparsity_fast_path = "activation_relu_or_gelu was not True"
         elif not (self.norm1.eps == self.norm2.eps):
             why_not_sparsity_fast_path = "norm1.eps is not equal to norm2.eps"
-        elif src_mask is not None:
-            why_not_sparsity_fast_path = "src_mask is not supported for fastpath"
-        elif src.is_nested and src_key_padding_mask is not None:
-            why_not_sparsity_fast_path = "src_key_padding_mask is not supported with NestedTensor input for fastpath"
+        elif src.is_nested and (src_key_padding_mask is not None or src_mask is not None):
+            why_not_sparsity_fast_path = "src_key_padding_mask and src_mask are not supported with NestedTensor input"
+        elif (not src.is_nested) and (src_key_padding_mask is not None and src_mask is not None):
+            why_not_sparsity_fast_path = "src_key_padding_mask and src_mask were both supplied"
         elif self.self_attn.num_heads % 2 == 1:
             why_not_sparsity_fast_path = "num_head is odd"
         elif torch.is_autocast_enabled():
             why_not_sparsity_fast_path = "autocast is enabled"
-
         if not why_not_sparsity_fast_path:
             tensor_args = (
                 src,

From 28b421d346a38bda76086b719a799ff1b79310f8 Mon Sep 17 00:00:00 2001
From: Sean Ross-Ross <srossross@gmail.com>
Date: Mon, 31 Oct 2022 10:11:14 -0500
Subject: [PATCH 0358/1922] Changing from sample_inputs to reference_inputs in
 test_compare_cpu (#86462)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86462
Approved by: https://github.com/lezcano, https://github.com/mruberry
---
 test/test_ops.py                              |  2 +-
 .../_internal/common_methods_invocations.py   | 37 +++++++++++++
 .../_internal/opinfo/definitions/special.py   | 55 +++++++++++++++++++
 3 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 4f062d4c54034..9c4bd58e1a4da 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -185,7 +185,7 @@ def to_cpu(arg):
                 return arg.to(device='cpu')
             return arg
 
-        samples = op.sample_inputs(device, dtype)
+        samples = op.reference_inputs(device, dtype)
 
         for sample in samples:
             cpu_sample = sample.transform(to_cpu)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 0617b5d7ed617..3f4e2b6e03730 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -8605,6 +8605,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                     rhs_make_tensor_kwargs=dict(low=0),
                     skips=(
                         DecorateInfo(unittest.skip("Skipped!"), 'TestBinaryUfuncs', 'test_type_promotion'),
+                        # https://github.com/pytorch/pytorch/issues/70904
+                        DecorateInfo(unittest.skip("Some inputs produce undefined outputs"), 'TestCommon', 'test_compare_cpu'),
                     )),
     BinaryUfuncInfo('bitwise_right_shift',
                     op=torch.bitwise_right_shift,
@@ -8617,6 +8619,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                     rhs_make_tensor_kwargs=dict(low=0),
                     skips=(
                         DecorateInfo(unittest.skip("Skipped!"), 'TestBinaryUfuncs', 'test_type_promotion'),
+                        # https://github.com/pytorch/pytorch/issues/70904
+                        DecorateInfo(unittest.skip("Some inputs produce undefined outputs"), 'TestCommon', 'test_compare_cpu'),
                     )),
     OpInfo('combinations',
            op=torch.combinations,
@@ -13866,6 +13870,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
             # RuntimeError: attribute lookup is not defined on builtin
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
         )),
     UnaryUfuncInfo(
         'char',
@@ -13879,6 +13884,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
             # RuntimeError: attribute lookup is not defined on builtin
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
         )),
     UnaryUfuncInfo(
         'double',
@@ -13931,6 +13937,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
             # RuntimeError: attribute lookup is not defined on builtin
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
         )),
     UnaryUfuncInfo(
         'long',
@@ -13943,6 +13950,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
             # RuntimeError: attribute lookup is not defined on builtin
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
         )),
     UnaryUfuncInfo(
         'short',
@@ -13955,6 +13963,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
             # RuntimeError: attribute lookup is not defined on builtin
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
         )),
     UnaryUfuncInfo(
         'cdouble',
@@ -17136,6 +17145,19 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.bitwise_left_shift",
         torch_opinfo_name="bitwise_left_shift",
         supports_nvfuser=False,
+        skips=(
+            # https://github.com/pytorch/pytorch/issues/70904
+            DecorateInfo(unittest.skip("Some inputs produce undefined outputs"), 'TestCommon', 'test_compare_cpu'),
+        ),
+    ),
+    ElementwiseBinaryPythonRefInfo(
+        "_refs.bitwise_right_shift",
+        torch_opinfo_name="bitwise_right_shift",
+        supports_nvfuser=False,
+        skips=(
+            # # https://github.com/pytorch/pytorch/issues/70904
+            DecorateInfo(unittest.skip("Skipped some inputs produce undefined outputs"), 'TestCommon', 'test_compare_cpu'),
+        ),
     ),
     ElementwiseBinaryPythonRefInfo(
         "_refs.bitwise_or",
@@ -17555,6 +17577,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         # https://github.com/pytorch/pytorch/issues/86558
         validate_view_consistency=False,
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
+        )
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs._conversions.char",
@@ -17564,6 +17589,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         # https://github.com/pytorch/pytorch/issues/86558
         validate_view_consistency=False,
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
+        )
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs._conversions.double",
@@ -17600,6 +17628,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         # https://github.com/pytorch/pytorch/issues/86558
         validate_view_consistency=False,
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
+        )
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs._conversions.long",
@@ -17609,6 +17640,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         # https://github.com/pytorch/pytorch/issues/86558
         validate_view_consistency=False,
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
+        )
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs._conversions.short",
@@ -17618,6 +17652,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         # https://github.com/pytorch/pytorch/issues/86558
         validate_view_consistency=False,
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
+        )
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs._conversions.chalf",
diff --git a/torch/testing/_internal/opinfo/definitions/special.py b/torch/testing/_internal/opinfo/definitions/special.py
index 5f0923f8d640a..cafc7aba12424 100644
--- a/torch/testing/_internal/opinfo/definitions/special.py
+++ b/torch/testing/_internal/opinfo/definitions/special.py
@@ -217,6 +217,10 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
         promotes_int_to_float=True,
         supports_autograd=False,
         supports_one_python_scalar=True,
+        skips=(
+            # Reference reference_inputs nans and infs on cuda and nan, inf, 0., -inf for cpu
+            DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"),
+        ),
     ),
     # TODO: FIXME
     # OpInfo entry to verify the gradient formula of `other`/`q`
@@ -374,6 +378,11 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
         skips=(
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            DecorateInfo(
+                unittest.skip("testing takes an unreasonably long time, #79528"),
+                "TestCommon",
+                "test_compare_cpu",
+            ),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -385,6 +394,11 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
         skips=(
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            DecorateInfo(
+                unittest.skip("testing takes an unreasonably long time, #79528"),
+                "TestCommon",
+                "test_compare_cpu",
+            ),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -428,6 +442,8 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
         skips=(
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            # Greatest absolute difference: inf
+            DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -439,6 +455,11 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
         skips=(
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            DecorateInfo(
+                unittest.skip("testing takes an unreasonably long time, #79528"),
+                "TestCommon",
+                "test_compare_cpu",
+            ),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -450,6 +471,11 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
         skips=(
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            DecorateInfo(
+                unittest.skip("testing takes an unreasonably long time, #79528"),
+                "TestCommon",
+                "test_compare_cpu",
+            ),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -466,6 +492,11 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
             ),
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            DecorateInfo(
+                unittest.skip("testing takes an unreasonably long time, #79528"),
+                "TestCommon",
+                "test_compare_cpu",
+            ),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -566,6 +597,11 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
             ),
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            DecorateInfo(
+                unittest.skip("testing takes an unreasonably long time, #79528"),
+                "TestCommon",
+                "test_compare_cpu",
+            ),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -582,6 +618,11 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
             ),
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            DecorateInfo(
+                unittest.skip("testing takes an unreasonably long time, #79528"),
+                "TestCommon",
+                "test_compare_cpu",
+            ),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -598,6 +639,11 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
             ),
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            DecorateInfo(
+                unittest.skip("testing takes an unreasonably long time, #79528"),
+                "TestCommon",
+                "test_compare_cpu",
+            ),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -614,6 +660,11 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
             ),
             DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"),
             DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"),
+            DecorateInfo(
+                unittest.skip("testing takes an unreasonably long time, #79528"),
+                "TestCommon",
+                "test_compare_cpu",
+            ),
         ),
         supports_one_python_scalar=True,
         supports_autograd=False,
@@ -713,5 +764,9 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
         supports_one_python_scalar=True,
         supports_nvfuser=False,
         op_db=op_db,
+        skips=(
+            # Reference reference_inputs nans and infs on cuda and nan, inf, 0., -inf for cpu
+            DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"),
+        ),
     ),
 ]

From 23aaf703306699982ba8a255bd9a551a6f5f3830 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@fb.com>
Date: Mon, 31 Oct 2022 20:10:05 +0000
Subject: [PATCH 0359/1922] Create separate files for NT Unary, Binary and
 Matmul ops (#88091)

Improves code organization and code share.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88091
Approved by: https://github.com/drisspg
---
 .../native/nested/NestedTensorBinaryOps.cpp   | 211 +++++++++
 .../native/nested/NestedTensorFactories.cpp   |  12 -
 .../ATen/native/nested/NestedTensorMath.cpp   | 433 ------------------
 .../src/ATen/native/nested/NestedTensorMath.h |   8 +
 .../ATen/native/nested/NestedTensorMatmul.cpp | 231 ++++++++++
 .../native/nested/NestedTensorUnaryOps.cpp    |  62 +++
 build_variables.bzl                           |   7 +-
 7 files changed, 517 insertions(+), 447 deletions(-)
 create mode 100644 aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
 create mode 100644 aten/src/ATen/native/nested/NestedTensorMatmul.cpp
 create mode 100644 aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp

diff --git a/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp b/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
new file mode 100644
index 0000000000000..18ada308c9b24
--- /dev/null
+++ b/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
@@ -0,0 +1,211 @@
+#include <ATen/native/nested/NestedTensorMath.h>
+
+#include <ATen/AccumulateType.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/NestedTensorImpl.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/TensorIndexing.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/native/layer_norm.h>
+#include <ATen/native/nested/NestedTensorUtils.h>
+
+#include <tuple>
+
+namespace at {
+namespace native {
+
+std::pair<NestedTensorImpl*, NestedTensorImpl*>
+get_elementwise_nested_tensor_impl(
+    const Tensor& self,
+    const Tensor& other,
+    const std::string& op_name) {
+  if (self.is_nested() && !(other.is_nested())) {
+    TORCH_CHECK(
+        false,
+        "Expected both self and other to be nested, but got a nested self and non-nested other");
+  } else if (!(self.is_nested()) && other.is_nested()) {
+    TORCH_CHECK(
+        false,
+        "Expected both self and other to be nested, but got a non-nested self and nested other");
+  } else if (!(self.is_nested()) || !(other.is_nested())) {
+    TORCH_CHECK(
+        false,
+        "Expected both self and other to be nested, but got a non-nested self and non-nested other");
+  }
+
+  auto self_ptr = get_nested_tensor_impl(self);
+  auto other_ptr = get_nested_tensor_impl(other);
+
+  TORCH_CHECK(
+      self.dim() == other.dim(),
+      op_name,
+      " does not support broadcasting when given a NestedTensor");
+  TORCH_CHECK(
+      at::equal(
+          self_ptr->get_nested_size_tensor(),
+          other_ptr->get_nested_size_tensor()),
+      op_name,
+      " does not support broadcasting when given a NestedTensor");
+  TORCH_CHECK(
+      at::equal(
+          self_ptr->get_nested_stride_tensor(),
+          other_ptr->get_nested_stride_tensor()),
+      op_name,
+      " requires strides to match when given NestedTensors");
+  auto self_offsets = self_ptr->get_storage_offsets();
+  auto other_offsets = other_ptr->get_storage_offsets();
+  bool offsets_match = true;
+  for (size_t i = 0; i < self_offsets.size(); i++) {
+    offsets_match = offsets_match && (self_offsets[i] == other_offsets[i]);
+  }
+  TORCH_CHECK(
+      offsets_match,
+      op_name,
+      " requires offsets to match when given NestedTensors");
+  return std::make_pair(self_ptr, other_ptr);
+}
+
+template <typename Func>
+Tensor NestedTensor_elementwise_Tensor(
+    const Tensor& self,
+    const Tensor& other,
+    const std::string& op_name,
+    Func f) {
+  // self is a scalar
+  if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
+    auto other_impl = get_nested_tensor_impl(other);
+    return wrap_buffer(
+      f(self, other_impl->get_unsafe_storage_as_tensor()),
+      other_impl->get_nested_size_tensor().clone(),
+      other_impl->get_nested_stride_tensor().clone(),
+      other_impl->get_storage_offsets()
+    );
+  }
+  // other is a scalar
+  if (!other.is_nested() && other.dim() == 0 && other.numel() == 1) {
+    auto self_impl = get_nested_tensor_impl(self);
+    return wrap_buffer(
+      f(self_impl->get_unsafe_storage_as_tensor(), other),
+      self_impl->get_nested_size_tensor().clone(),
+      self_impl->get_nested_stride_tensor().clone(),
+      self_impl->get_storage_offsets()
+    );
+  }
+  NestedTensorImpl* self_impl = nullptr;
+  NestedTensorImpl* other_impl = nullptr;
+  std::tie(self_impl, other_impl) =
+      get_elementwise_nested_tensor_impl(self, other, op_name);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl);
+  return wrap_buffer(
+      f(self_impl->get_unsafe_storage_as_tensor(),
+        other_impl->get_unsafe_storage_as_tensor()),
+      self_impl->get_nested_size_tensor(),
+      self_impl->get_nested_stride_tensor(),
+      self_impl->get_storage_offsets());
+}
+
+Tensor NestedTensor_add_Tensor(
+    const Tensor& self,
+    const Tensor& other,
+    const Scalar& alpha) {
+  return NestedTensor_elementwise_Tensor(
+      self, other, "add", [alpha](const Tensor& b1, const Tensor& b2) {
+        return at::add(b1, b2, alpha);
+      });
+}
+
+Tensor NestedTensor_mul_Tensor(const Tensor& self, const Tensor& other) {
+  return NestedTensor_elementwise_Tensor(
+      self, other, "mul", [](const Tensor& b1, const Tensor& b2) {
+        return at::mul(b1, b2);
+      });
+}
+
+// Only usable on the C++ side; scalars are converted to tensors coming from Python.
+Tensor NestedTensor_mul_Scalar(const Tensor& self, const Scalar& other) {
+  return NestedTensor_mul_Tensor(self, wrapped_scalar_tensor(other));
+}
+
+Tensor NestedTensor_div_Tensor(const Tensor& self, const Tensor& other) {
+  return NestedTensor_elementwise_Tensor(
+      self, other, "div", [](const Tensor& b1, const Tensor& b2) {
+        return at::div(b1, b2);
+      });
+}
+
+// Only usable on the C++ side; scalars are converted to tensors coming from Python.
+Tensor NestedTensor_div_Scalar(const Tensor& self, const Scalar& other) {
+  return NestedTensor_div_Tensor(self, wrapped_scalar_tensor(other));
+}
+
+template <typename Func>
+Tensor& NestedTensor_elementwise__Tensor(
+    Tensor& self,
+    const Tensor& other,
+    const std::string& op_name,
+    Func f) {
+  // self is a scalar
+  if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
+    auto other_impl = get_nested_tensor_impl(other);
+    f(self, other_impl->get_buffer());
+    return self;
+  }
+  // other is a scalar
+  if (!other.is_nested() && other.dim() == 0 && other.numel() == 1) {
+    auto self_impl = get_nested_tensor_impl(self);
+    f(self_impl->get_buffer(), other);
+    return self;
+  }
+  NestedTensorImpl* self_impl = nullptr;
+  NestedTensorImpl* other_impl = nullptr;
+  std::tie(self_impl, other_impl) =
+      get_elementwise_nested_tensor_impl(self, other, op_name);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl);
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl);
+  const auto& nt_self = *self_impl;
+  const auto& nt_other = *other_impl;
+  f(nt_self.get_buffer().view({-1}), nt_other.get_buffer().view({-1}));
+  return self;
+}
+
+Tensor& NestedTensor_add__Tensor(
+    Tensor& self,
+    const Tensor& other,
+    const Scalar& alpha) {
+  return NestedTensor_elementwise__Tensor(
+      self, other, "add_", [alpha](const Tensor& b1, const Tensor& b2) {
+        return b1.add_(b2, alpha);
+      });
+}
+
+Tensor& NestedTensor_mul__Tensor(Tensor& self, const Tensor& other) {
+  return NestedTensor_elementwise__Tensor(
+      self, other, "mul_", [](const Tensor& b1, const Tensor& b2) {
+        return b1.mul_(b2);
+      });
+}
+
+// Only usable on the C++ side; scalars are converted to tensors coming from Python.
+Tensor& NestedTensor_mul__Scalar(Tensor& self, const Scalar& other) {
+  return NestedTensor_mul__Tensor(self, wrapped_scalar_tensor(other));
+}
+
+Tensor& fill_nested_(Tensor& self, const Scalar& value) {
+  const auto& self_buf = get_nested_tensor_impl(self)->get_buffer();
+  self_buf.fill_(value);
+  return self;
+}
+
+Tensor& fill_nested_(Tensor& self, const Tensor& value) {
+  const auto& self_buf = get_nested_tensor_impl(self)->get_buffer();
+  self_buf.fill_(value);
+  return self;
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/nested/NestedTensorFactories.cpp b/aten/src/ATen/native/nested/NestedTensorFactories.cpp
index 01e72649bd3ff..15473c02be19e 100644
--- a/aten/src/ATen/native/nested/NestedTensorFactories.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorFactories.cpp
@@ -120,17 +120,5 @@ Tensor& copy_nested_(Tensor& self, const Tensor& src, bool non_blocking) {
   return self;
 }
 
-Tensor& fill_nested_(Tensor& self, const Scalar& value) {
-  const auto& self_buf = get_nested_tensor_impl(self)->get_buffer();
-  self_buf.fill_(value);
-  return self;
-}
-
-Tensor& fill_nested_(Tensor& self, const Tensor& value) {
-  const auto& self_buf = get_nested_tensor_impl(self)->get_buffer();
-  self_buf.fill_(value);
-  return self;
-}
-
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp
index 2d0e8de8b46f0..84efa837ceffe 100644
--- a/aten/src/ATen/native/nested/NestedTensorMath.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp
@@ -17,14 +17,7 @@
 
 namespace at {
 namespace native {
-
 namespace {
-template <typename Func>
-Tensor map_nt(const Tensor& nt, Func f) {
-  auto* nt_impl = get_nested_tensor_impl(nt);
-  const auto& sizes = nt_impl->get_nested_size_tensor();
-  return at::detail::make_tensor<NestedTensorImpl>(f(nt_impl->get_buffer()), sizes);
-}
 
 int64_t num_bytes(IntArrayRef sizes) {
   // 0-dim Tensors have torch.Size of .size() 0, but carry 1 memory.
@@ -90,46 +83,6 @@ std::vector<at::Tensor> NestedTensor_unbind(
   return result_tensors;
 }
 
-Tensor& NestedTensor_relu_(Tensor& self) {
-  auto self_ptr = get_nested_tensor_impl(self);
-  check_numel_equals_buffer_size(self_ptr);
-  auto buffer = self_ptr->get_buffer();
-  at::relu_(buffer);
-  return self;
-}
-
-Tensor NestedTensor_relu(const Tensor& self) {
-  return map_nt(self, at::relu);
-}
-
-Tensor& NestedTensor_gelu_(Tensor& self, c10::string_view approximate) {
-  auto self_ptr = get_nested_tensor_impl(self);
-  check_numel_equals_buffer_size(self_ptr);
-  auto buffer = self_ptr->get_buffer();
-  at::gelu_(buffer, approximate);
-  return self;
-}
-
-Tensor NestedTensor_gelu(const Tensor& self, c10::string_view approximate) {
-  return map_nt(
-      self,
-      [approximate](const Tensor& buffer) {
-        return at::gelu(buffer, approximate);
-      });
-}
-
-Tensor& NestedTensor_tanh_(Tensor& self) {
-  auto self_ptr = get_nested_tensor_impl(self);
-  check_numel_equals_buffer_size(self_ptr);
-  auto buffer = self_ptr->get_buffer();
-  at::tanh_(buffer);
-  return self;
-}
-
-Tensor NestedTensor_tanh(const Tensor& self) {
-  return map_nt(self, at::tanh);
-}
-
 Tensor NestedTensor_nested_tensor_from_mask(const Tensor& t, const Tensor& mask, bool mask_check) {
     TORCH_CHECK(mask.scalar_type() == at::ScalarType::Bool, "Expected mask to be of ScalarType Bool, but got ", mask.scalar_type(), " instead.");
     TORCH_CHECK(mask.dim() == 2, "Padding mask should be 2D");
@@ -467,183 +420,6 @@ Tensor NestedTensor_embedding(
       result_buffer.reshape({-1}), std::move(new_sizes));
 }
 
-std::pair<NestedTensorImpl*, NestedTensorImpl*>
-get_elementwise_nested_tensor_impl(
-    const Tensor& self,
-    const Tensor& other,
-    const std::string& op_name) {
-  if (self.is_nested() && !(other.is_nested())) {
-    TORCH_CHECK(
-        false,
-        "Expected both self and other to be nested, but got a nested self and non-nested other");
-  } else if (!(self.is_nested()) && other.is_nested()) {
-    TORCH_CHECK(
-        false,
-        "Expected both self and other to be nested, but got a non-nested self and nested other");
-  } else if (!(self.is_nested()) || !(other.is_nested())) {
-    TORCH_CHECK(
-        false,
-        "Expected both self and other to be nested, but got a non-nested self and non-nested other");
-  }
-
-  auto self_ptr = get_nested_tensor_impl(self);
-  auto other_ptr = get_nested_tensor_impl(other);
-
-  TORCH_CHECK(
-      self.dim() == other.dim(),
-      op_name,
-      " does not support broadcasting when given a NestedTensor");
-  TORCH_CHECK(
-      at::equal(
-          self_ptr->get_nested_size_tensor(),
-          other_ptr->get_nested_size_tensor()),
-      op_name,
-      " does not support broadcasting when given a NestedTensor");
-  TORCH_CHECK(
-      at::equal(
-          self_ptr->get_nested_stride_tensor(),
-          other_ptr->get_nested_stride_tensor()),
-      op_name,
-      " requires strides to match when given NestedTensors");
-  auto self_offsets = self_ptr->get_storage_offsets();
-  auto other_offsets = other_ptr->get_storage_offsets();
-  bool offsets_match = true;
-  for (size_t i = 0; i < self_offsets.size(); i++) {
-    offsets_match = offsets_match && (self_offsets[i] == other_offsets[i]);
-  }
-  TORCH_CHECK(
-      offsets_match,
-      op_name,
-      " requires offsets to match when given NestedTensors");
-  return std::make_pair(self_ptr, other_ptr);
-}
-
-template <typename Func>
-Tensor NestedTensor_elementwise_Tensor(
-    const Tensor& self,
-    const Tensor& other,
-    const std::string& op_name,
-    Func f) {
-  // self is a scalar
-  if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
-    auto other_impl = get_nested_tensor_impl(other);
-    return wrap_buffer(
-      f(self, other_impl->get_unsafe_storage_as_tensor()),
-      other_impl->get_nested_size_tensor().clone(),
-      other_impl->get_nested_stride_tensor().clone(),
-      other_impl->get_storage_offsets()
-    );
-  }
-  // other is a scalar
-  if (!other.is_nested() && other.dim() == 0 && other.numel() == 1) {
-    auto self_impl = get_nested_tensor_impl(self);
-    return wrap_buffer(
-      f(self_impl->get_unsafe_storage_as_tensor(), other),
-      self_impl->get_nested_size_tensor().clone(),
-      self_impl->get_nested_stride_tensor().clone(),
-      self_impl->get_storage_offsets()
-    );
-  }
-  NestedTensorImpl* self_impl = nullptr;
-  NestedTensorImpl* other_impl = nullptr;
-  std::tie(self_impl, other_impl) =
-      get_elementwise_nested_tensor_impl(self, other, op_name);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl);
-  return wrap_buffer(
-      f(self_impl->get_unsafe_storage_as_tensor(),
-        other_impl->get_unsafe_storage_as_tensor()),
-      self_impl->get_nested_size_tensor(),
-      self_impl->get_nested_stride_tensor(),
-      self_impl->get_storage_offsets());
-}
-
-Tensor NestedTensor_add_Tensor(
-    const Tensor& self,
-    const Tensor& other,
-    const Scalar& alpha) {
-  return NestedTensor_elementwise_Tensor(
-      self, other, "add", [alpha](const Tensor& b1, const Tensor& b2) {
-        return at::add(b1, b2, alpha);
-      });
-}
-
-Tensor NestedTensor_mul_Tensor(const Tensor& self, const Tensor& other) {
-  return NestedTensor_elementwise_Tensor(
-      self, other, "mul", [](const Tensor& b1, const Tensor& b2) {
-        return at::mul(b1, b2);
-      });
-}
-
-// Only usable on the C++ side; scalars are converted to tensors coming from Python.
-Tensor NestedTensor_mul_Scalar(const Tensor& self, const Scalar& other) {
-  return NestedTensor_mul_Tensor(self, wrapped_scalar_tensor(other));
-}
-
-Tensor NestedTensor_div_Tensor(const Tensor& self, const Tensor& other) {
-  return NestedTensor_elementwise_Tensor(
-      self, other, "div", [](const Tensor& b1, const Tensor& b2) {
-        return at::div(b1, b2);
-      });
-}
-
-// Only usable on the C++ side; scalars are converted to tensors coming from Python.
-Tensor NestedTensor_div_Scalar(const Tensor& self, const Scalar& other) {
-  return NestedTensor_div_Tensor(self, wrapped_scalar_tensor(other));
-}
-
-template <typename Func>
-Tensor& NestedTensor_elementwise__Tensor(
-    Tensor& self,
-    const Tensor& other,
-    const std::string& op_name,
-    Func f) {
-  // self is a scalar
-  if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
-    auto other_impl = get_nested_tensor_impl(other);
-    f(self, other_impl->get_buffer());
-    return self;
-  }
-  // other is a scalar
-  if (!other.is_nested() && other.dim() == 0 && other.numel() == 1) {
-    auto self_impl = get_nested_tensor_impl(self);
-    f(self_impl->get_buffer(), other);
-    return self;
-  }
-  NestedTensorImpl* self_impl = nullptr;
-  NestedTensorImpl* other_impl = nullptr;
-  std::tie(self_impl, other_impl) =
-      get_elementwise_nested_tensor_impl(self, other, op_name);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self_impl);
-  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(other_impl);
-  const auto& nt_self = *self_impl;
-  const auto& nt_other = *other_impl;
-  f(nt_self.get_buffer().view({-1}), nt_other.get_buffer().view({-1}));
-  return self;
-}
-
-Tensor& NestedTensor_add__Tensor(
-    Tensor& self,
-    const Tensor& other,
-    const Scalar& alpha) {
-  return NestedTensor_elementwise__Tensor(
-      self, other, "add_", [alpha](const Tensor& b1, const Tensor& b2) {
-        return b1.add_(b2, alpha);
-      });
-}
-
-Tensor& NestedTensor_mul__Tensor(Tensor& self, const Tensor& other) {
-  return NestedTensor_elementwise__Tensor(
-      self, other, "mul_", [](const Tensor& b1, const Tensor& b2) {
-        return b1.mul_(b2);
-      });
-}
-
-// Only usable on the C++ side; scalars are converted to tensors coming from Python.
-Tensor& NestedTensor_mul__Scalar(Tensor& self, const Scalar& other) {
-  return NestedTensor_mul__Tensor(self, wrapped_scalar_tensor(other));
-}
-
 // Very rudimentary sum_dim for prototyping with torch_scatter.segment_reduce.
 Tensor NestedTensor_sum_dim_CPU(
     const Tensor& self,
@@ -833,215 +609,6 @@ Tensor softmax_nested(
   return output;
 }
 
-Tensor bmm_nested(const Tensor& self, const Tensor& mat2) {
-  if (self.is_nested() && !mat2.is_nested()) {
-    AT_ERROR("Expected both to be nested, but got a nested self and non-nested other");
-  }
-  else if (!self.is_nested() && mat2.is_nested()) {
-    AT_ERROR("Expected both to be nested, but got a non-nested self and nested other");
-  }
-  // dispatcher should have guaranteed that at least one is nested
-  auto self_ptr = get_nested_tensor_impl(self);
-  auto mat2_ptr = get_nested_tensor_impl(mat2);
-  TORCH_CHECK(self_ptr->dim() == 3, "batch1 must be a 3D tensor");
-  TORCH_CHECK(mat2_ptr->dim() == 3, "batch2 must be a 3D tensor");
-  int64_t ntensors = self_ptr->size(0),
-      ntensors2 = mat2_ptr->size(0);
-  TORCH_CHECK(ntensors == ntensors2,
-      "Expected size for the 1st dimension of batch2 tensor to be: ", ntensors,
-      " but got: ", ntensors2, ".");
-  const Tensor& self_buffer = self_ptr->get_unsafe_storage_as_tensor(),
-      & mat2_buffer = mat2_ptr->get_unsafe_storage_as_tensor();
-  std::vector<IntArrayRef> self_sizes = NestedTensor_get_sizes(self_ptr),
-      mat2_sizes = NestedTensor_get_sizes(mat2_ptr),
-      self_strides = NestedTensor_get_strides(self_ptr),
-      mat2_strides = NestedTensor_get_strides(mat2_ptr);
-  const std::vector<int64_t>& self_offsets = self_ptr->get_storage_offsets(),
-      & mat2_offsets = mat2_ptr->get_storage_offsets();
-  // create a contiguous output
-  int64_t out_numel = 0;
-  const Tensor& self_sizemat = self_ptr->get_nested_size_tensor();
-  Tensor out_sizemat = self_sizemat.new_empty(self_sizemat.sizes());
-  int64_t* out_sizemat_ptr = out_sizemat.data_ptr<int64_t>();
-  for (int64_t i = 0; i < ntensors; i++) {
-    const IntArrayRef& self_shape = self_sizes[i],
-        & mat2_shape = mat2_sizes[i];
-    const int64_t& self_size0 = self_shape[0], & self_size1 = self_shape[1],
-        & mat2_size0 = mat2_shape[0], & mat2_size1 = mat2_shape[1];
-    TORCH_CHECK(self_size1 == mat2_size0,
-        i, "-th nested matrices in batch cannot be multiplied (",
-        self_size0, "x", self_size1, " and ",
-        mat2_size0, "x", mat2_size1, ")");
-    out_sizemat_ptr[0] = self_size0;
-    out_sizemat_ptr[1] = mat2_size1;
-    out_sizemat_ptr += 2;
-    out_numel += self_size0 * mat2_size1;
-  }
-  Tensor out_buffer = self_buffer.new_empty(out_numel);
-  Tensor output = wrap_buffer(out_buffer, out_sizemat);
-  // call tensor mm
-  // TODO: `padding nested tensor -> bmm -> remove padding` may be more efficient
-  //       until we have specialized nested tensor bmm kernel
-  //       useful resource: `aten/src/ATen/native/cpu/LinearAlgebra.cpp/bmm_out_or_baddbmm_`
-  //                        `aten/src/ATen/native/cuda/Blas.cpp/baddbmm_out_cuda_impl`
-  std::vector<Tensor> output_unbind = output.unbind();
-  for (int64_t i = 0; i < ntensors; i++) {
-    at::mm_out(output_unbind[i],
-               self_buffer.as_strided(self_sizes[i], self_strides[i], self_offsets[i]),
-               mat2_buffer.as_strided(mat2_sizes[i], mat2_strides[i], mat2_offsets[i]));
-  }
-  return output;
-}
-
-// utilities support `matmul_nested`
-namespace {
-// Args:
-//     self_sizes: the sizes of `self` in `matmul_nested`
-//     mat2_sizes: the sizes of `mat2` in `matmul_nested`
-//     buffer_op: the options for new buffer
-//     sizemat_op: the options for new size matrix
-// Returns:
-//     the batch size of each input underlying tensor, i.e. the product of batch-dimension sizes
-//     the empty output nested tensor
-inline std::tuple<std::vector<int64_t>, Tensor>
-matmul_nested_helper(
-    const std::vector<IntArrayRef>& self_sizes,
-    const std::vector<IntArrayRef>& mat2_sizes,
-    const c10::TensorOptions& buffer_op,
-    const c10::TensorOptions& sizemat_op) {
-  int64_t ntensors = self_sizes.size(),
-      ndims = self_sizes[0].size();
-  std::vector<int64_t> batch_sizes(ntensors, 1);
-  Tensor sizemat = at::empty({ntensors, ndims}, sizemat_op);
-  int64_t* sizemat_ptr = sizemat.data_ptr<int64_t>();
-  int64_t numel = 0;
-  for (int64_t i = 0; i < ntensors; i++) {
-    const IntArrayRef& self_size = self_sizes[i],
-        & mat2_size = mat2_sizes[i];
-    int64_t& batch_size = batch_sizes[i];
-    // batch dimensions
-    for (int64_t j = 0; j < ndims - 2; j++) {
-      const int64_t& self_sizej = self_size[j],
-          & mat2_sizej = mat2_size[j];
-      TORCH_CHECK(
-          self_sizej == mat2_sizej,
-          "matmul: For nested tensors, no broadcasting is currently performed: ",
-          i, "-th nested matrices in batch at dimension ", j + 1,
-          " have mismatching sizes ", self_sizej, " and ", mat2_sizej);
-      sizemat_ptr[j] = self_sizej;
-      batch_size *= sizemat_ptr[j];
-    }
-    // matrix multiplication dimensions
-    const int64_t& self_size0 = self_size[ndims - 2], & self_size1 = self_size[ndims - 1],
-        & mat2_size0 = mat2_size[ndims - 2], & mat2_size1 = mat2_size[ndims - 1];
-    TORCH_CHECK(
-        self_size1 == mat2_size0,
-        "matmul: ",
-        i, "-th nested matrices in batch cannot be multiplied (",
-        self_size0, "x", self_size1, " and ",
-        mat2_size0, "x", mat2_size1, ")");
-    sizemat_ptr[ndims - 2] = self_size0;
-    sizemat_ptr[ndims - 1] = mat2_size1;
-    sizemat_ptr += ndims;
-    numel += batch_size * self_size0 * mat2_size1;
-  }
-  Tensor buffer = at::empty(numel, buffer_op);
-  Tensor output = wrap_buffer(buffer, sizemat);
-  return std::make_tuple(batch_sizes, output);
-}
-}
-
-// Note [nested tensor matmul]
-// This is really a generalized batched matmul dedicated to nested tensors,
-// where `self` and `mat2` have same number (>= 3) of dimensions.
-// The last 2 dimensions will be considered as matrix dimensions,
-// so they should be matrix-multiplicable.
-// The leading dimensions are considered as batch dimensions,
-// and since nested tensor does not support broadcasting for now,
-// for each batch dimension `self` and `mat2` must have same size.
-// TODO: Should make full matmul semantics support some day
-Tensor matmul_nested(const Tensor& self, const Tensor& mat2) {
-  if (self.is_nested() && !mat2.is_nested()) {
-    AT_ERROR("Expected both to be nested, but got a nested self and non-nested other");
-  }
-  else if (!self.is_nested() && mat2.is_nested()) {
-    AT_ERROR("Expected both to be nested, but got a non-nested self and nested other");
-  }
-  // to_padded_tensor only supports contiguous inputs
-  auto self_contig = self.contiguous();
-  auto mat2_contig = mat2.contiguous();
-  // dispatcher should have guaranteed that at least one is nested
-  const auto self_ptr = get_nested_tensor_impl(self_contig);
-  const auto mat2_ptr = get_nested_tensor_impl(mat2_contig);
-  int64_t self_dim = self_ptr->dim(),
-      mat2_dim = mat2_ptr->dim();
-  TORCH_CHECK(
-      self_dim >= 3,
-      "matmul: For nested tensors, only inputs with >= 3 dims are currently supported. 1st input has rank: ",
-      self_dim);
-  TORCH_CHECK(
-      mat2_dim >= 3,
-      "matmul: For nested tensors, only inputs with >= 3 dims are currently supported. 2nd input has rank: ",
-      mat2_dim);
-  TORCH_CHECK(self_dim == mat2_dim, "matmul: both inputs must have the same rank");
-  int64_t ntensors = self_ptr->size(0),
-      ntensors2 = mat2_ptr->size(0);
-  TORCH_CHECK(ntensors == ntensors2,
-      "matmul: Expected size for the 1st dimension of 2nd input tensor to be: ", ntensors,
-      " but got: ", ntensors2, ".");
-  // Ensure batch dimensions have the same sizes (no broadcasting).
-  const auto& self_sizes = self_ptr->get_nested_size_tensor();
-  const auto& mat2_sizes = mat2_ptr->get_nested_size_tensor();
-  const auto& self_batch_sizes = self_sizes.narrow(1, 0, self_dim-3);
-  const auto& mat2_batch_sizes = mat2_sizes.narrow(1, 0, mat2_dim-3);
-  TORCH_CHECK(at::equal(self_batch_sizes, mat2_batch_sizes),
-    "matmul: For nested tensors, batch dimensions must have the same sizes, ",
-    "no broadcasting is currently performed. Got batch shapes for self ",
-    self_batch_sizes,
-    " and batch shapes for mat2 ",
-    mat2_batch_sizes);
-  // Ensure last dim of self and second last dim of mat2 have the same size
-  const auto& self_dim_size = self_sizes.select(1, -1);
-  const auto& mat2_dim_size = mat2_sizes.select(1, -2);
-  TORCH_CHECK(at::equal(self_dim_size, mat2_dim_size),
-    "matmul: Nested tensors cannot be matrix multiplied, last dimension of self has sizes",
-    self_dim_size,
-    "second last dimension of mat2 has sizes",
-    mat2_dim_size);
-  // Construct output size from input sizes
-  Tensor output_sizes = self_sizes.clone();
-  // The last entry in every row of output_sizes should be last column of mat2_sizes
-  output_sizes.index_put_({at::indexing::Slice(), -1}, mat2_sizes.select(1, -1).clone());
-
-  auto self_padded = self_contig.to_padded_tensor(0.);
-  auto mat2_padded = mat2_contig.to_padded_tensor(0.);
-  auto output_padded = at::matmul(self_padded, mat2_padded);
-  auto output_nested = nested_from_padded_generic(output_padded, output_sizes);
-  return output_nested;
-}
-
-Tensor& matmul_out_nested(const Tensor& tensor1, const Tensor& tensor2, Tensor& result) {
-  // TODO: this is a very quick and dirty implementation
-  //       should improve it to avoid the intermediate memory usage
-  Tensor function_result = at::matmul(tensor1, tensor2);
-  auto function_result_ptr = get_nested_tensor_impl(function_result);
-  // TODO: this is to reproduce function_result_ptr->opt_sizes_
-  //       if an accessor is provided in the future, can replace this
-  std::vector<int64_t> sizes;
-  for (int64_t i = 0; i < function_result_ptr->dim(); i++) {
-    c10::optional<int64_t> opt_size = function_result_ptr->opt_size(i);
-    if (opt_size.has_value()) {
-      sizes.push_back(*opt_size);
-    }
-    else {
-      sizes.push_back(-1);
-    }
-  }
-  result.reshape(sizes);
-  result.copy_(function_result);
-  return result;
-}
-
 Tensor transpose_nested(const Tensor& self, int64_t dim0, int64_t dim1) {
   auto self_ptr = get_nested_tensor_impl(self);
   // check input dimensions
diff --git a/aten/src/ATen/native/nested/NestedTensorMath.h b/aten/src/ATen/native/nested/NestedTensorMath.h
index 69fe4ee3cd296..954fa807f1832 100644
--- a/aten/src/ATen/native/nested/NestedTensorMath.h
+++ b/aten/src/ATen/native/nested/NestedTensorMath.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <ATen/core/ATen_fwd.h>
+#include <ATen/NestedTensorImpl.h>
 #include <c10/macros/Macros.h>
 
 namespace at {
@@ -11,5 +12,12 @@ TORCH_API Tensor NestedTensor_to_padded_tensor_generic(
     double padding,
     OptionalIntArrayRef output_size);
 
+template <typename Func>
+Tensor map_nt(const Tensor& nt, Func f) {
+  auto* nt_impl = get_nested_tensor_impl(nt);
+  const auto& sizes = nt_impl->get_nested_size_tensor();
+  return at::detail::make_tensor<NestedTensorImpl>(f(nt_impl->get_buffer()), sizes);
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp
new file mode 100644
index 0000000000000..2932fdbaf3b90
--- /dev/null
+++ b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp
@@ -0,0 +1,231 @@
+#include <ATen/native/nested/NestedTensorMath.h>
+
+#include <ATen/AccumulateType.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/NestedTensorImpl.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/TensorIndexing.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/native/layer_norm.h>
+#include <ATen/native/nested/NestedTensorUtils.h>
+
+#include <tuple>
+
+namespace at {
+namespace native {
+
+Tensor bmm_nested(const Tensor& self, const Tensor& mat2) {
+  if (self.is_nested() && !mat2.is_nested()) {
+    AT_ERROR("Expected both to be nested, but got a nested self and non-nested other");
+  }
+  else if (!self.is_nested() && mat2.is_nested()) {
+    AT_ERROR("Expected both to be nested, but got a non-nested self and nested other");
+  }
+  // dispatcher should have guaranteed that at least one is nested
+  auto self_ptr = get_nested_tensor_impl(self);
+  auto mat2_ptr = get_nested_tensor_impl(mat2);
+  TORCH_CHECK(self_ptr->dim() == 3, "batch1 must be a 3D tensor");
+  TORCH_CHECK(mat2_ptr->dim() == 3, "batch2 must be a 3D tensor");
+  int64_t ntensors = self_ptr->size(0),
+      ntensors2 = mat2_ptr->size(0);
+  TORCH_CHECK(ntensors == ntensors2,
+      "Expected size for the 1st dimension of batch2 tensor to be: ", ntensors,
+      " but got: ", ntensors2, ".");
+  const Tensor& self_buffer = self_ptr->get_unsafe_storage_as_tensor(),
+      & mat2_buffer = mat2_ptr->get_unsafe_storage_as_tensor();
+  std::vector<IntArrayRef> self_sizes = NestedTensor_get_sizes(self_ptr),
+      mat2_sizes = NestedTensor_get_sizes(mat2_ptr),
+      self_strides = NestedTensor_get_strides(self_ptr),
+      mat2_strides = NestedTensor_get_strides(mat2_ptr);
+  const std::vector<int64_t>& self_offsets = self_ptr->get_storage_offsets(),
+      & mat2_offsets = mat2_ptr->get_storage_offsets();
+  // create a contiguous output
+  int64_t out_numel = 0;
+  const Tensor& self_sizemat = self_ptr->get_nested_size_tensor();
+  Tensor out_sizemat = self_sizemat.new_empty(self_sizemat.sizes());
+  int64_t* out_sizemat_ptr = out_sizemat.data_ptr<int64_t>();
+  for (int64_t i = 0; i < ntensors; i++) {
+    const IntArrayRef& self_shape = self_sizes[i],
+        & mat2_shape = mat2_sizes[i];
+    const int64_t& self_size0 = self_shape[0], & self_size1 = self_shape[1],
+        & mat2_size0 = mat2_shape[0], & mat2_size1 = mat2_shape[1];
+    TORCH_CHECK(self_size1 == mat2_size0,
+        i, "-th nested matrices in batch cannot be multiplied (",
+        self_size0, "x", self_size1, " and ",
+        mat2_size0, "x", mat2_size1, ")");
+    out_sizemat_ptr[0] = self_size0;
+    out_sizemat_ptr[1] = mat2_size1;
+    out_sizemat_ptr += 2;
+    out_numel += self_size0 * mat2_size1;
+  }
+  Tensor out_buffer = self_buffer.new_empty(out_numel);
+  Tensor output = wrap_buffer(out_buffer, out_sizemat);
+  // call tensor mm
+  // TODO: `padding nested tensor -> bmm -> remove padding` may be more efficient
+  //       until we have specialized nested tensor bmm kernel
+  //       useful resource: `aten/src/ATen/native/cpu/LinearAlgebra.cpp/bmm_out_or_baddbmm_`
+  //                        `aten/src/ATen/native/cuda/Blas.cpp/baddbmm_out_cuda_impl`
+  std::vector<Tensor> output_unbind = output.unbind();
+  for (int64_t i = 0; i < ntensors; i++) {
+    at::mm_out(output_unbind[i],
+               self_buffer.as_strided(self_sizes[i], self_strides[i], self_offsets[i]),
+               mat2_buffer.as_strided(mat2_sizes[i], mat2_strides[i], mat2_offsets[i]));
+  }
+  return output;
+}
+
+// utilities support `matmul_nested`
+namespace {
+// Args:
+//     self_sizes: the sizes of `self` in `matmul_nested`
+//     mat2_sizes: the sizes of `mat2` in `matmul_nested`
+//     buffer_op: the options for new buffer
+//     sizemat_op: the options for new size matrix
+// Returns:
+//     the batch size of each input underlying tensor, i.e. the product of batch-dimension sizes
+//     the empty output nested tensor
+inline std::tuple<std::vector<int64_t>, Tensor>
+matmul_nested_helper(
+    const std::vector<IntArrayRef>& self_sizes,
+    const std::vector<IntArrayRef>& mat2_sizes,
+    const c10::TensorOptions& buffer_op,
+    const c10::TensorOptions& sizemat_op) {
+  int64_t ntensors = self_sizes.size(),
+      ndims = self_sizes[0].size();
+  std::vector<int64_t> batch_sizes(ntensors, 1);
+  Tensor sizemat = at::empty({ntensors, ndims}, sizemat_op);
+  int64_t* sizemat_ptr = sizemat.data_ptr<int64_t>();
+  int64_t numel = 0;
+  for (int64_t i = 0; i < ntensors; i++) {
+    const IntArrayRef& self_size = self_sizes[i],
+        & mat2_size = mat2_sizes[i];
+    int64_t& batch_size = batch_sizes[i];
+    // batch dimensions
+    for (int64_t j = 0; j < ndims - 2; j++) {
+      const int64_t& self_sizej = self_size[j],
+          & mat2_sizej = mat2_size[j];
+      TORCH_CHECK(
+          self_sizej == mat2_sizej,
+          "matmul: For nested tensors, no broadcasting is currently performed: ",
+          i, "-th nested matrices in batch at dimension ", j + 1,
+          " have mismatching sizes ", self_sizej, " and ", mat2_sizej);
+      sizemat_ptr[j] = self_sizej;
+      batch_size *= sizemat_ptr[j];
+    }
+    // matrix multiplication dimensions
+    const int64_t& self_size0 = self_size[ndims - 2], & self_size1 = self_size[ndims - 1],
+        & mat2_size0 = mat2_size[ndims - 2], & mat2_size1 = mat2_size[ndims - 1];
+    TORCH_CHECK(
+        self_size1 == mat2_size0,
+        "matmul: ",
+        i, "-th nested matrices in batch cannot be multiplied (",
+        self_size0, "x", self_size1, " and ",
+        mat2_size0, "x", mat2_size1, ")");
+    sizemat_ptr[ndims - 2] = self_size0;
+    sizemat_ptr[ndims - 1] = mat2_size1;
+    sizemat_ptr += ndims;
+    numel += batch_size * self_size0 * mat2_size1;
+  }
+  Tensor buffer = at::empty(numel, buffer_op);
+  Tensor output = wrap_buffer(buffer, sizemat);
+  return std::make_tuple(batch_sizes, output);
+}
+}
+
+// Note [nested tensor matmul]
+// This is really a generalized batched matmul dedicated to nested tensors,
+// where `self` and `mat2` have same number (>= 3) of dimensions.
+// The last 2 dimensions will be considered as matrix dimensions,
+// so they should be matrix-multiplicable.
+// The leading dimensions are considered as batch dimensions,
+// and since nested tensor does not support broadcasting for now,
+// for each batch dimension `self` and `mat2` must have same size.
+// TODO: Should make full matmul semantics support some day
+Tensor matmul_nested(const Tensor& self, const Tensor& mat2) {
+  if (self.is_nested() && !mat2.is_nested()) {
+    AT_ERROR("Expected both to be nested, but got a nested self and non-nested other");
+  }
+  else if (!self.is_nested() && mat2.is_nested()) {
+    AT_ERROR("Expected both to be nested, but got a non-nested self and nested other");
+  }
+  // to_padded_tensor only supports contiguous inputs
+  auto self_contig = self.contiguous();
+  auto mat2_contig = mat2.contiguous();
+  // dispatcher should have guaranteed that at least one is nested
+  const auto self_ptr = get_nested_tensor_impl(self_contig);
+  const auto mat2_ptr = get_nested_tensor_impl(mat2_contig);
+  int64_t self_dim = self_ptr->dim(),
+      mat2_dim = mat2_ptr->dim();
+  TORCH_CHECK(
+      self_dim >= 3,
+      "matmul: For nested tensors, only inputs with >= 3 dims are currently supported. 1st input has rank: ",
+      self_dim);
+  TORCH_CHECK(
+      mat2_dim >= 3,
+      "matmul: For nested tensors, only inputs with >= 3 dims are currently supported. 2nd input has rank: ",
+      mat2_dim);
+  TORCH_CHECK(self_dim == mat2_dim, "matmul: both inputs must have the same rank");
+  int64_t ntensors = self_ptr->size(0),
+      ntensors2 = mat2_ptr->size(0);
+  TORCH_CHECK(ntensors == ntensors2,
+      "matmul: Expected size for the 1st dimension of 2nd input tensor to be: ", ntensors,
+      " but got: ", ntensors2, ".");
+  // Ensure batch dimensions have the same sizes (no broadcasting).
+  const auto& self_sizes = self_ptr->get_nested_size_tensor();
+  const auto& mat2_sizes = mat2_ptr->get_nested_size_tensor();
+  const auto& self_batch_sizes = self_sizes.narrow(1, 0, self_dim-3);
+  const auto& mat2_batch_sizes = mat2_sizes.narrow(1, 0, mat2_dim-3);
+  TORCH_CHECK(at::equal(self_batch_sizes, mat2_batch_sizes),
+    "matmul: For nested tensors, batch dimensions must have the same sizes, ",
+    "no broadcasting is currently performed. Got batch shapes for self ",
+    self_batch_sizes,
+    " and batch shapes for mat2 ",
+    mat2_batch_sizes);
+  // Ensure last dim of self and second last dim of mat2 have the same size
+  const auto& self_dim_size = self_sizes.select(1, -1);
+  const auto& mat2_dim_size = mat2_sizes.select(1, -2);
+  TORCH_CHECK(at::equal(self_dim_size, mat2_dim_size),
+    "matmul: Nested tensors cannot be matrix multiplied, last dimension of self has sizes",
+    self_dim_size,
+    "second last dimension of mat2 has sizes",
+    mat2_dim_size);
+  // Construct output size from input sizes
+  Tensor output_sizes = self_sizes.clone();
+  // The last entry in every row of output_sizes should be last column of mat2_sizes
+  output_sizes.index_put_({at::indexing::Slice(), -1}, mat2_sizes.select(1, -1).clone());
+
+  auto self_padded = self_contig.to_padded_tensor(0.);
+  auto mat2_padded = mat2_contig.to_padded_tensor(0.);
+  auto output_padded = at::matmul(self_padded, mat2_padded);
+  auto output_nested = nested_from_padded_generic(output_padded, output_sizes);
+  return output_nested;
+}
+
+Tensor& matmul_out_nested(const Tensor& tensor1, const Tensor& tensor2, Tensor& result) {
+  // TODO: this is a very quick and dirty implementation
+  //       should improve it to avoid the intermediate memory usage
+  Tensor function_result = at::matmul(tensor1, tensor2);
+  auto function_result_ptr = get_nested_tensor_impl(function_result);
+  // TODO: this is to reproduce function_result_ptr->opt_sizes_
+  //       if an accessor is provided in the future, can replace this
+  std::vector<int64_t> sizes;
+  for (int64_t i = 0; i < function_result_ptr->dim(); i++) {
+    c10::optional<int64_t> opt_size = function_result_ptr->opt_size(i);
+    if (opt_size.has_value()) {
+      sizes.push_back(*opt_size);
+    }
+    else {
+      sizes.push_back(-1);
+    }
+  }
+  result.reshape(sizes);
+  result.copy_(function_result);
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
new file mode 100644
index 0000000000000..74289a1372e12
--- /dev/null
+++ b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
@@ -0,0 +1,62 @@
+#include <ATen/native/nested/NestedTensorMath.h>
+
+#include <ATen/AccumulateType.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/NestedTensorImpl.h>
+#include <ATen/ScalarOps.h>
+#include <ATen/TensorIndexing.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/native/layer_norm.h>
+#include <ATen/native/nested/NestedTensorUtils.h>
+
+#include <tuple>
+
+namespace at {
+namespace native {
+
+Tensor& NestedTensor_relu_(Tensor& self) {
+  auto self_ptr = get_nested_tensor_impl(self);
+  check_numel_equals_buffer_size(self_ptr);
+  auto buffer = self_ptr->get_buffer();
+  at::relu_(buffer);
+  return self;
+}
+
+Tensor NestedTensor_relu(const Tensor& self) {
+  return map_nt(self, at::relu);
+}
+
+Tensor& NestedTensor_gelu_(Tensor& self, c10::string_view approximate) {
+  auto self_ptr = get_nested_tensor_impl(self);
+  check_numel_equals_buffer_size(self_ptr);
+  auto buffer = self_ptr->get_buffer();
+  at::gelu_(buffer, approximate);
+  return self;
+}
+
+Tensor NestedTensor_gelu(const Tensor& self, c10::string_view approximate) {
+  return map_nt(
+      self,
+      [approximate](const Tensor& buffer) {
+        return at::gelu(buffer, approximate);
+      });
+}
+
+Tensor& NestedTensor_tanh_(Tensor& self) {
+  auto self_ptr = get_nested_tensor_impl(self);
+  check_numel_equals_buffer_size(self_ptr);
+  auto buffer = self_ptr->get_buffer();
+  at::tanh_(buffer);
+  return self;
+}
+
+Tensor NestedTensor_tanh(const Tensor& self) {
+  return map_nt(self, at::tanh);
+}
+
+} // namespace native
+} // namespace at
diff --git a/build_variables.bzl b/build_variables.bzl
index 06e8f7bf4b606..1672ccb8a3b38 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -1401,11 +1401,14 @@ aten_native_source_non_codegen_list = [
     "aten/src/ATen/native/mkl/SparseBlasImpl.cpp",
     "aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp",
     "aten/src/ATen/native/mkl/SpectralOps.cpp",
+    "aten/src/ATen/native/nested/NestedTensorAliases.cpp",
+    "aten/src/ATen/native/nested/NestedTensorBackward.cpp",
+    "aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp",
     "aten/src/ATen/native/nested/NestedTensorFactories.cpp",
     "aten/src/ATen/native/nested/NestedTensorMath.cpp",
-    "aten/src/ATen/native/nested/NestedTensorAliases.cpp",
+    "aten/src/ATen/native/nested/NestedTensorMatmul.cpp",
     "aten/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp",
-    "aten/src/ATen/native/nested/NestedTensorBackward.cpp",
+    "aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp",
     "aten/src/ATen/native/nested/NestedTensorUtils.cpp",
     "aten/src/ATen/native/sparse/ParamUtils.cpp",
     "aten/src/ATen/native/sparse/SoftMax.cpp",

From 09e450d329f21727760285a6280e97cfe7631c0b Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Mon, 31 Oct 2022 09:47:26 -0700
Subject: [PATCH 0360/1922] Do not use unsafe restriding for subclasses
 (#87610)

This helps convert some accuracy errors into runtime errors,
which makes it easier to debug.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87610
Approved by: https://github.com/albanD
---
 .../ATen/functorch/BatchRulesScatterOps.cpp   |  5 ++
 aten/src/ATen/native/TensorShape.cpp          |  3 +-
 test/functorch/test_eager_transforms.py       | 10 ++-
 test/test_functionalization.py                | 64 +++++++++----------
 4 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
index 5eecbedd93e7b..fc51e9d744099 100644
--- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
@@ -928,6 +928,11 @@ Tensor index_copy_decomp(
   return at::scatter(self, dim, index_, source);  ;
 }
 
+// Note [Fix vmap slice_scatter]
+// registers a decomposition for `slice_scatter` that calls into `slice.src`
+// *_scatter operators have some special semantics though, that we can't easily
+// through a decomposition: slice_scatter's output needs to have the same
+// size, size, strides and storage_offset as the input.
 Tensor slice_scatter_decomp(const Tensor &self, const Tensor &src,
                             int64_t dim, c10::optional<int64_t> start,
                             c10::optional<int64_t> end, int64_t step)
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 2051cda371b97..282eec87b6e9d 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -3,6 +3,7 @@
 #include <ATen/core/DimVector.h>
 #include <ATen/core/functional.h>
 #include <ATen/core/IListRef.h>
+#include <ATen/TensorSubclassLikeUtils.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/Dispatch.h>
 #include <ATen/ExpandUtils.h>
@@ -1565,7 +1566,7 @@ Tensor reshape_symint(const Tensor& self, c10::SymIntArrayRef proposed_shape) {
     //
     // We need to do the checks here instead of in `native_functions.yaml`
     // to preserve backwards compatibility.
-    if (!self.is_xla() && !self.is_lazy() && !self.is_ipu()) {
+    if (!self.is_xla() && !self.is_lazy() && !self.is_ipu() && !at::isTensorSubclassLike(self)) {
       return self._reshape_alias_symint(shape, stride.value());
     } else {
       return self.view_symint(shape);
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index 26b64c5e70cca..ff69ed9df6e63 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -3130,13 +3130,16 @@ def normalize_devices(fx_g):
     return fx_g
 
 class TestFunctionalize(TestCase):
-    def _check_functionalize_correctness(self, f, inpt):
+    def _check_functionalize_correctness(self, f, inpt, *, skip_vmap=False):
         inpt1 = inpt.clone()
         inpt2 = inpt.clone()
         inpt3 = inpt.clone()
 
         expected_outputs = f(inpt1)
-        actual_outputs = vmap(functionalize(f))(inpt2.unsqueeze(0))[0].squeeze()
+        if skip_vmap:
+            actual_outputs = functionalize(f)(inpt2)
+        else:
+            actual_outputs = vmap(functionalize(f))(inpt2.unsqueeze(0))[0].squeeze()
         # Right now the flavor of functionalize that also removes view ops
         # isn't being used with vmap
         # That's because {view}_copy ops don't have batching rules yet
@@ -3206,7 +3209,8 @@ def f(x: torch.Tensor) -> torch.Tensor:
             z2, z3 = z1.split(2)
             z2.add_(tmp)
             return x
-        self._check_functionalize_correctness(f, torch.zeros(4, 2, device=device))
+        # See Note [Fix vmap slice_scatter]
+        self._check_functionalize_correctness(f, torch.zeros(4, 2, device=device), skip_vmap=True)
 
     # Ensure functionalize works with List[Optional[Tensor]] arguments.
     # See the fix / discussion at https://github.com/pytorch/pytorch/pull/76085
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index 521cb4e9e0cec..9639c6babbe22 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -144,11 +144,11 @@ def forward(self, a_1):
     as_strided_copy_4 = torch.ops.aten.as_strided_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0)
     clone_1 = torch.ops.aten.clone.default(as_strided_copy_4, memory_format = torch.contiguous_format);  as_strided_copy_4 = None
     threshold_backward = torch.ops.aten.threshold_backward.default(clone_1, relu, 0);  clone_1 = relu = None
-    _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1])
-    detach_copy = torch.ops.aten.detach_copy.default(_reshape_alias_copy);  _reshape_alias_copy = None
+    view_copy_2 = torch.ops.aten.view_copy.default(as_strided_copy_2, [16, 64, 128, 128])
+    detach_copy = torch.ops.aten.detach_copy.default(view_copy_2);  view_copy_2 = None
     as_strided_scatter_1 = torch.ops.aten.as_strided_scatter.default(as_strided_copy_2, threshold_backward, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0);  as_strided_copy_2 = threshold_backward = None
-    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(as_strided_scatter_1, [16, 64, 128, 128], [1048576, 16384, 128, 1]);  as_strided_scatter_1 = None
-    detach_copy_1 = torch.ops.aten.detach_copy.default(_reshape_alias_copy_1);  _reshape_alias_copy_1 = None
+    view_copy_3 = torch.ops.aten.view_copy.default(as_strided_scatter_1, [16, 64, 128, 128]);  as_strided_scatter_1 = None
+    detach_copy_1 = torch.ops.aten.detach_copy.default(view_copy_3);  view_copy_3 = None
     return detach_copy_1
     """)  # noqa: B950
 
@@ -701,40 +701,40 @@ def forward(self, a_1):
     ones = torch.ops.aten.ones.default([2, 2], device = device(type='cpu'), pin_memory = False)
     add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
     view_copy = torch.ops.aten.view_copy.default(add, [8])
-    _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(view_copy, [2, 4], [4, 1]);  view_copy = None
-    transpose_copy = torch.ops.aten.transpose_copy.int(_reshape_alias_copy, 1, 0)
+    view_copy_1 = torch.ops.aten.view_copy.default(view_copy, [2, 4]);  view_copy = None
+    transpose_copy = torch.ops.aten.transpose_copy.int(view_copy_1, 1, 0)
     unsqueeze_copy = torch.ops.aten.unsqueeze_copy.default(transpose_copy, 0);  transpose_copy = None
     squeeze_copy = torch.ops.aten.squeeze_copy.default(unsqueeze_copy);  unsqueeze_copy = None
     split_copy = torch.ops.aten.split_copy.Tensor(squeeze_copy, 2);  squeeze_copy = None
     getitem = split_copy[0]
     getitem_1 = split_copy[1];  split_copy = None
     add_1 = torch.ops.aten.add.Tensor(getitem, ones);  getitem = ones = None
-    select_copy = torch.ops.aten.select_copy.int(_reshape_alias_copy, 0, 0);  _reshape_alias_copy = None
-    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(add_1, [4], [1])
-    view_copy_1 = torch.ops.aten.view_copy.default(add, [8]);  add = None
-    _reshape_alias_copy_2 = torch.ops.aten._reshape_alias_copy.default(view_copy_1, [2, 4], [4, 1]);  view_copy_1 = None
-    transpose_copy_1 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_2, 1, 0);  _reshape_alias_copy_2 = None
+    select_copy = torch.ops.aten.select_copy.int(view_copy_1, 0, 0);  view_copy_1 = None
+    view_copy_2 = torch.ops.aten.view_copy.default(add_1, [4])
+    view_copy_3 = torch.ops.aten.view_copy.default(add, [8]);  add = None
+    view_copy_4 = torch.ops.aten.view_copy.default(view_copy_3, [2, 4]);  view_copy_3 = None
+    transpose_copy_1 = torch.ops.aten.transpose_copy.int(view_copy_4, 1, 0);  view_copy_4 = None
     unsqueeze_copy_1 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_1, 0);  transpose_copy_1 = None
     squeeze_copy_1 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_1);  unsqueeze_copy_1 = None
     slice_scatter = torch.ops.aten.slice_scatter.default(squeeze_copy_1, add_1, 0, 0, 2);  squeeze_copy_1 = None
     unsqueeze_copy_2 = torch.ops.aten.unsqueeze_copy.default(slice_scatter, 0);  slice_scatter = None
     squeeze_copy_2 = torch.ops.aten.squeeze_copy.dim(unsqueeze_copy_2, 0);  unsqueeze_copy_2 = None
     transpose_copy_2 = torch.ops.aten.transpose_copy.int(squeeze_copy_2, 1, 0);  squeeze_copy_2 = None
-    _reshape_alias_copy_3 = torch.ops.aten._reshape_alias_copy.default(transpose_copy_2, [8], [1]);  transpose_copy_2 = None
-    view_copy_2 = torch.ops.aten.view_copy.default(_reshape_alias_copy_3, [4, 2]);  _reshape_alias_copy_3 = None
-    view_copy_3 = torch.ops.aten.view_copy.default(view_copy_2, [8])
-    _reshape_alias_copy_4 = torch.ops.aten._reshape_alias_copy.default(view_copy_3, [2, 4], [4, 1]);  view_copy_3 = None
-    select_copy_1 = torch.ops.aten.select_copy.int(_reshape_alias_copy_4, 0, 0);  _reshape_alias_copy_4 = None
-    view_copy_4 = torch.ops.aten.view_copy.default(view_copy_2, [8]);  view_copy_2 = None
-    _reshape_alias_copy_5 = torch.ops.aten._reshape_alias_copy.default(view_copy_4, [2, 4], [4, 1]);  view_copy_4 = None
-    transpose_copy_3 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_5, 1, 0);  _reshape_alias_copy_5 = None
+    view_copy_5 = torch.ops.aten.view_copy.default(transpose_copy_2, [8]);  transpose_copy_2 = None
+    view_copy_6 = torch.ops.aten.view_copy.default(view_copy_5, [4, 2]);  view_copy_5 = None
+    view_copy_7 = torch.ops.aten.view_copy.default(view_copy_6, [8])
+    view_copy_8 = torch.ops.aten.view_copy.default(view_copy_7, [2, 4]);  view_copy_7 = None
+    select_copy_1 = torch.ops.aten.select_copy.int(view_copy_8, 0, 0);  view_copy_8 = None
+    view_copy_9 = torch.ops.aten.view_copy.default(view_copy_6, [8]);  view_copy_6 = None
+    view_copy_10 = torch.ops.aten.view_copy.default(view_copy_9, [2, 4]);  view_copy_9 = None
+    transpose_copy_3 = torch.ops.aten.transpose_copy.int(view_copy_10, 1, 0);  view_copy_10 = None
     unsqueeze_copy_3 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_3, 0);  transpose_copy_3 = None
     squeeze_copy_3 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_3);  unsqueeze_copy_3 = None
     split_copy_1 = torch.ops.aten.split_copy.Tensor(squeeze_copy_3, 2);  squeeze_copy_3 = None
     getitem_2 = split_copy_1[0]
     getitem_3 = split_copy_1[1];  split_copy_1 = None
-    _reshape_alias_copy_6 = torch.ops.aten._reshape_alias_copy.default(getitem_2, [4], [1]);  getitem_2 = None
-    add_2 = torch.ops.aten.add.Tensor(select_copy_1, _reshape_alias_copy_6);  select_copy_1 = _reshape_alias_copy_6 = None
+    view_copy_11 = torch.ops.aten.view_copy.default(getitem_2, [4]);  getitem_2 = None
+    add_2 = torch.ops.aten.add.Tensor(select_copy_1, view_copy_11);  select_copy_1 = view_copy_11 = None
     return add_1
     """)  # noqa: B950
 
@@ -747,30 +747,30 @@ def forward(self, a_1):
     ones = torch.ops.aten.ones.default([2, 2], device = device(type='cpu'), pin_memory = False)
     add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
     view = torch.ops.aten.view.default(add, [8])
-    _reshape_alias = torch.ops.aten._reshape_alias.default(view, [2, 4], [4, 1]);  view = None
-    transpose = torch.ops.aten.transpose.int(_reshape_alias, 1, 0)
+    view_1 = torch.ops.aten.view.default(view, [2, 4]);  view = None
+    transpose = torch.ops.aten.transpose.int(view_1, 1, 0)
     unsqueeze = torch.ops.aten.unsqueeze.default(transpose, 0);  transpose = None
     squeeze = torch.ops.aten.squeeze.default(unsqueeze);  unsqueeze = None
     split = torch.ops.aten.split.Tensor(squeeze, 2);  squeeze = None
     getitem = split[0]
     getitem_1 = split[1];  split = None
     add_1 = torch.ops.aten.add_.Tensor(getitem, ones);  ones = None
-    select = torch.ops.aten.select.int(_reshape_alias, 0, 0);  _reshape_alias = None
+    select = torch.ops.aten.select.int(view_1, 0, 0);  view_1 = None
     clone = torch.ops.aten.clone.default(getitem, memory_format = torch.contiguous_format)
     _unsafe_view = torch.ops.aten._unsafe_view.default(clone, [4]);  clone = None
-    view_1 = torch.ops.aten.view.default(add, [8]);  add = None
-    _reshape_alias_1 = torch.ops.aten._reshape_alias.default(view_1, [2, 4], [4, 1]);  view_1 = None
-    transpose_1 = torch.ops.aten.transpose.int(_reshape_alias_1, 1, 0);  _reshape_alias_1 = None
+    view_2 = torch.ops.aten.view.default(add, [8]);  add = None
+    view_3 = torch.ops.aten.view.default(view_2, [2, 4]);  view_2 = None
+    transpose_1 = torch.ops.aten.transpose.int(view_3, 1, 0);  view_3 = None
     unsqueeze_1 = torch.ops.aten.unsqueeze.default(transpose_1, 0);  transpose_1 = None
     squeeze_1 = torch.ops.aten.squeeze.default(unsqueeze_1);  unsqueeze_1 = None
     unsqueeze_2 = torch.ops.aten.unsqueeze.default(squeeze_1, 0);  squeeze_1 = None
     squeeze_2 = torch.ops.aten.squeeze.dim(unsqueeze_2, 0);  unsqueeze_2 = None
     transpose_2 = torch.ops.aten.transpose.int(squeeze_2, 1, 0);  squeeze_2 = None
-    _reshape_alias_2 = torch.ops.aten._reshape_alias.default(transpose_2, [8], [1]);  transpose_2 = None
-    view_2 = torch.ops.aten.view.default(_reshape_alias_2, [4, 2]);  _reshape_alias_2 = None
-    view_3 = torch.ops.aten.view.default(view_2, [8]);  view_2 = None
-    _reshape_alias_3 = torch.ops.aten._reshape_alias.default(view_3, [2, 4], [4, 1]);  view_3 = None
-    select_1 = torch.ops.aten.select.int(_reshape_alias_3, 0, 0);  _reshape_alias_3 = None
+    view_4 = torch.ops.aten.view.default(transpose_2, [8]);  transpose_2 = None
+    view_5 = torch.ops.aten.view.default(view_4, [4, 2]);  view_4 = None
+    view_6 = torch.ops.aten.view.default(view_5, [8]);  view_5 = None
+    view_7 = torch.ops.aten.view.default(view_6, [2, 4]);  view_6 = None
+    select_1 = torch.ops.aten.select.int(view_7, 0, 0);  view_7 = None
     add_2 = torch.ops.aten.add.Tensor(select_1, _unsafe_view);  select_1 = _unsafe_view = None
     return getitem
     """)

From 6a33e967b7622f248b1ec8e9e1e558eb44023d1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radek=20Barto=C5=88?= <radek.barton@microsoft.com>
Date: Mon, 31 Oct 2022 21:11:16 +0000
Subject: [PATCH 0361/1922] Fix random "C1041: cannot open program database"
 errors when compiling on Windows (#88084)

Adds `/FS` option to `CMAKE_CXX_FLAGS` and `CMAKE_CUDA_FLAGS`.

So far I've encountered this kind of errors:

```
C:\Users\MyUser\AppData\Local\Temp\tmpxft_00004728_00000000-7_cuda.cudafe1.cpp: fatal error C1041: cannot open program database 'C:\Projects\pytorch\build\third_party\gloo\gloo\CMakeFiles\gloo_cuda.dir\vc140.pdb'; if multiple CL.EXE write to the same .PDB file, please use /FS
```
when building with VS 2022.

cc @peterjc123 @mszhanyi @skyline75489 @nbcsm

Related issues:
- https://github.com/pytorch/pytorch/issues/87691
- https://github.com/pytorch/pytorch/issues/39989
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88084
Approved by: https://github.com/ezyang
---
 CMakeLists.txt | 3 +++
 setup.py       | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 105e38e7c1acf..baf97a455863d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -545,6 +545,9 @@ if(MSVC)
 
   # Try harder
   string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler /w -w")
+
+  string(APPEND CMAKE_CXX_FLAGS " /FS")
+  string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler /FS")
 endif(MSVC)
 
 string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
diff --git a/setup.py b/setup.py
index 72a6bbae7b460..ed0f032716574 100644
--- a/setup.py
+++ b/setup.py
@@ -795,7 +795,7 @@ def configure_extension_build():
         # /EHsc is about standard C++ exception handling
         # /DNOMINMAX removes builtin min/max functions
         # /wdXXXX disables warning no. XXXX
-        extra_compile_args = ['/MD', '/EHsc', '/DNOMINMAX',
+        extra_compile_args = ['/MD', '/FS', '/EHsc', '/DNOMINMAX',
                               '/wd4267', '/wd4251', '/wd4522', '/wd4522', '/wd4838',
                               '/wd4305', '/wd4244', '/wd4190', '/wd4101', '/wd4996',
                               '/wd4275']

From 2b8f6801a5ab1e56d305f54695120f1c2f76ad4a Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Mon, 31 Oct 2022 21:12:52 +0000
Subject: [PATCH 0362/1922] fix for auto labeler (#88100)

followed https://lightrun.com/answers/actions-labeler-how-to-only-add-label-not-remove-when-pr-is-opened

side note: should we move this logic to test-infra to be with the release notes labeler?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88100
Approved by: https://github.com/huydhn
---
 .github/workflows/labeler.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 1d48268beccd1..bdef7a1367bfc 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -13,7 +13,7 @@ jobs:
     - uses: actions/labeler@v4
       with:
         repo-token: "${{ secrets.GITHUB_TOKEN }}"
-        sync-labels: true
+        sync-labels: ''
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}

From f72a1b85eabe5fc0ad7196802827b1b8d2ce29af Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@fb.com>
Date: Mon, 31 Oct 2022 21:31:54 +0000
Subject: [PATCH 0363/1922] Fix links to tutorial in torch masked docs (#88129)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88129
Approved by: https://github.com/jisaacso
---
 docs/source/masked.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/masked.rst b/docs/source/masked.rst
index d6ae9f7d56728..10ae8420425f8 100644
--- a/docs/source/masked.rst
+++ b/docs/source/masked.rst
@@ -56,10 +56,10 @@ There are already a number of existing tutorials that we've written to help user
 -  `Advanced semantics - discussion on why certain decisions were made (e.g. requiring masks to match for binary/reduction operations),
    differences with NumPy's MaskedArray, and reduction semantics`_
 
-.. _Overview - the place to start for new users, discusses how to use MaskedTensors and why they're useful: https://pytorch.org/tutorials/prototype/maskedtensor_overview.html/
-.. _Sparsity - MaskedTensor supports sparse COO and CSR data and mask Tensors: https://pytorch.org/tutorials/prototype/maskedtensor_sparsity.html/
-.. _Adagrad sparse semantics - a practical example of how MaskedTensor can simplify sparse semantics and implementations: https://pytorch.org/tutorials/prototype/maskedtensor_adagrad_semantics.html>
-.. _Advanced semantics - discussion on why certain decisions were made (e.g. requiring masks to match for binary/reduction operations), differences with NumPy's MaskedArray, and reduction semantics: https://pytorch.org/tutorials/prototype/maskedtensor_advanced_semantics.html/
+.. _Overview - the place to start for new users, discusses how to use MaskedTensors and why they're useful: https://pytorch.org/tutorials/prototype/maskedtensor_overview.html
+.. _Sparsity - MaskedTensor supports sparse COO and CSR data and mask Tensors: https://pytorch.org/tutorials/prototype/maskedtensor_sparsity.html
+.. _Adagrad sparse semantics - a practical example of how MaskedTensor can simplify sparse semantics and implementations: https://pytorch.org/tutorials/prototype/maskedtensor_adagrad.html
+.. _Advanced semantics - discussion on why certain decisions were made (e.g. requiring masks to match for binary/reduction operations), differences with NumPy's MaskedArray, and reduction semantics: https://pytorch.org/tutorials/prototype/maskedtensor_advanced_semantics.html
 
 Supported Operators
 +++++++++++++++++++

From 23b95822510710cc40ce573a12f04f12720b29c4 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 31 Oct 2022 16:42:58 -0400
Subject: [PATCH 0364/1922] Remove stale comment (#88135)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88135
Approved by: https://github.com/albanD
---
 torch/_subclasses/fake_tensor.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index d6e6f79647fd3..aea6fcbdd1e86 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -146,9 +146,6 @@ def tensor_memo(self):
     constant_storage_mapping: Dict[StorageWeakRef, List[TensorWeakRef]]
 
     def __init__(self):
-        # In principle preserving views should be OK, but in practice
-        # AOTAutograd (or maybe autograd) seems to do the wrong thing.  See
-        # https://github.com/pytorch/torchdynamo/issues/1815
         self.meta_converter = MetaConverter()
 
         # map from to storage to corresponding constant tensors

From d0318027e3066d06a445915cb52045f260179abd Mon Sep 17 00:00:00 2001
From: Masaki Kozuki <mkozuki@nvidia.com>
Date: Mon, 31 Oct 2022 22:45:23 +0000
Subject: [PATCH 0365/1922] Store `autocast_gpu_dtype` in `custom_fwd` and
 `custom_bwd` for BFloat16 autocast (#88029)

As per #87979, `custom_bwd` seems to forcefully use `torch.float16` for `torch.autograd.Function.backward` regardless of the `dtype` used in the forward.

Changes:
- store the `dtype` in `args[0]`
- update tests to confirm the dtype of intermediate result tensors that are outputs of autocast compatible `torch` functions

cc @ptrblck @ngimel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88029
Approved by: https://github.com/ngimel
---
 test/test_cuda.py               | 16 ++++++++++------
 torch/cuda/amp/autocast_mode.py |  3 ++-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index c8876d5fbb0cd..1f3abf7915f8a 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -3030,18 +3030,22 @@ def forward(ctx, a, b):
             def backward(ctx, grad):
                 self.assertTrue(torch.is_autocast_enabled())
                 a, b = ctx.saved_tensors
-                return grad.mm(b.t()), a.t().mm(grad)
+                a_grad, b_grad = grad.mm(b.t()), a.t().mm(grad)
+                self.assertTrue(a_grad.dtype is dtype and b_grad.dtype is dtype)
+                return a_grad, b_grad
 
         mymm = MyMM.apply
 
         x = torch.randn((8, 8), device="cuda", dtype=torch.float32, requires_grad=True)
         y = torch.randn((8, 8), device="cuda", dtype=torch.float32, requires_grad=True)
 
-        with torch.cuda.amp.autocast():
-            output = mymm(x, y)
-            self.assertTrue(output.dtype is torch.float16)
-            loss = output.sum()
-        loss.backward()
+        dtypes = (torch.float16, torch.bfloat16) if TEST_BF16 else (torch.float16,)
+        for dtype in dtypes:
+            with torch.cuda.amp.autocast(dtype=dtype):
+                output = mymm(x, y)
+                self.assertTrue(output.dtype is dtype)
+                loss = output.sum()
+            loss.backward()
 
     def test_autocast_custom_cast_inputs(self):
         class MyMM(torch.autograd.Function):
diff --git a/torch/cuda/amp/autocast_mode.py b/torch/cuda/amp/autocast_mode.py
index aec16201aff94..83bc6beb5e794 100644
--- a/torch/cuda/amp/autocast_mode.py
+++ b/torch/cuda/amp/autocast_mode.py
@@ -93,6 +93,7 @@ def custom_fwd(fwd=None, *, cast_inputs=None):
 
     @functools.wraps(fwd)
     def decorate_fwd(*args, **kwargs):
+        args[0]._dtype = torch.get_autocast_gpu_dtype()
         if cast_inputs is None:
             args[0]._fwd_used_autocast = torch.is_autocast_enabled()
             return fwd(*args, **kwargs)
@@ -119,6 +120,6 @@ def custom_bwd(bwd):
     """
     @functools.wraps(bwd)
     def decorate_bwd(*args, **kwargs):
-        with autocast(args[0]._fwd_used_autocast):
+        with autocast(enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype):
             return bwd(*args, **kwargs)
     return decorate_bwd

From 442b439725008955e30b1017592515be012f5743 Mon Sep 17 00:00:00 2001
From: John Detloff <johndetloff@fb.com>
Date: Mon, 31 Oct 2022 23:36:00 +0000
Subject: [PATCH 0366/1922] [IOS] Update Cocoapods for 1.13 release (#88075)

Update the podspecs for libtorch and libtorch-lite to v 1.13 to prepare for the 1.13 pod release.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88075
Approved by: https://github.com/manuelcandales, https://github.com/salilsdesai, https://github.com/malfet
---
 ios/LibTorch-Lite.podspec | 2 +-
 ios/LibTorch.podspec      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ios/LibTorch-Lite.podspec b/ios/LibTorch-Lite.podspec
index 9814eaa367586..54a12fb2cbfc2 100644
--- a/ios/LibTorch-Lite.podspec
+++ b/ios/LibTorch-Lite.podspec
@@ -1,6 +1,6 @@
 Pod::Spec.new do |s|
     s.name             = 'LibTorch-Lite'
-    s.version          = '1.12.0'
+    s.version          = '1.13.0'
     s.authors          = 'PyTorch Team'
     s.license          = { :type => 'BSD' }
     s.homepage         = 'https://github.com/pytorch/pytorch'
diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec
index 3c197f0f103b9..e166f853143b0 100644
--- a/ios/LibTorch.podspec
+++ b/ios/LibTorch.podspec
@@ -1,6 +1,6 @@
 Pod::Spec.new do |s|
     s.name             = 'LibTorch'
-    s.version          = '1.12.0'
+    s.version          = '1.13.0'
     s.authors          = 'PyTorch Team'
     s.license          = { :type => 'BSD' }
     s.homepage         = 'https://github.com/pytorch/pytorch'

From 7aba7f12dd524ad870e35ed22bb20d9b601560d0 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Mon, 31 Oct 2022 23:38:03 +0000
Subject: [PATCH 0367/1922] Fix `PyTorchStreamWriter` exception handling
 (#88128)

Avoid double exception in destructor if attempting to serialize to
python object that does not have `write` method

Use `Finalizer` class in `PyTorchStreamWriter::writeEndOfFile()` to a
always set `finailized_` property even if excretion occurs. (as there
isn't much one can do at this point)

Add expicit check for the attribue to `_open_zipfile_writer_buffer` and
add unitests

Modernize code a bit by using Python-3 `super()` method

Fixes https://github.com/pytorch/pytorch/issues/87997

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88128
Approved by: https://github.com/albanD
---
 caffe2/serialize/inline_container.cc | 18 ++++++++++++++++--
 caffe2/serialize/inline_container.h  |  2 +-
 test/test_serialization.py           | 19 +++++++++++++++++++
 torch/csrc/jit/python/init.cpp       |  2 +-
 torch/serialization.py               | 15 ++++++++++-----
 5 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc
index 9d3cc332ae96e..54b94d31775de 100644
--- a/caffe2/serialize/inline_container.cc
+++ b/caffe2/serialize/inline_container.cc
@@ -338,8 +338,7 @@ PyTorchStreamWriter::PyTorchStreamWriter(std::string file_name)
 }
 
 PyTorchStreamWriter::PyTorchStreamWriter(
-    // NOLINTNEXTLINE(modernize-pass-by-value)
-    const std::function<size_t(const void*, size_t)>& writer_func)
+    const std::function<size_t(const void*, size_t)> writer_func)
     : archive_name_("archive"),
       writer_func_(writer_func) {
   setup(archive_name_);
@@ -416,6 +415,21 @@ void PyTorchStreamWriter::writeRecord(
 }
 
 void PyTorchStreamWriter::writeEndOfFile() {
+  // Ensurers that finalized is set to true even
+  // exception is raised during the method call.
+  // I.e. even partial call to writeEndOfFile() should mark
+  // file as finalized, otherwise double exception raised from
+  // destructor would would result in `std::terminate()`
+  // See https://github.com/pytorch/pytorch/issues/87997/
+  struct Finalizer {
+    Finalizer(bool& var): var_(var) {}
+    ~Finalizer() {
+      var_ = true;
+    }
+    private:
+     bool& var_;
+  } f(finalized_);
+
   auto allRecords = getAllWrittenRecords();
   // If no ".data/version" or "version" record in the output model, rewrites version info
   if(allRecords.find(".data/version") == allRecords.end() && allRecords.find("version") == allRecords.end()) {
diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h
index 621ffbe9a41ab..3f0e661dd229f 100644
--- a/caffe2/serialize/inline_container.h
+++ b/caffe2/serialize/inline_container.h
@@ -130,7 +130,7 @@ class TORCH_API PyTorchStreamWriter final {
  public:
   explicit PyTorchStreamWriter(std::string archive_name);
   explicit PyTorchStreamWriter(
-      const std::function<size_t(const void*, size_t)>& writer_func);
+      const std::function<size_t(const void*, size_t)> writer_func);
 
   void setMinVersion(const uint64_t version);
 
diff --git a/test/test_serialization.py b/test/test_serialization.py
index d8cfd08aea084..3a18f8a45ad04 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -567,6 +567,25 @@ def test_serialization_filelike_uses_readinto(self):
         b = torch.load(data)
         self.assertTrue(data.was_called('readinto'))
 
+    def test_serialization_filelike_exceptions(self):
+        # Try to serialize to buffers that does not have write method
+        # Or have a malfrormed one, and make sure it does not cause an abort
+        # See https://github.com/pytorch/pytorch/issues/87997
+        x = torch.rand(10)
+        with self.assertRaises(AttributeError):
+            # Tries to serialize str into tensor
+            torch.save('foo', x)
+        x.write = "bar"
+        x.flush = "baz"
+        with self.assertRaises(TypeError):
+            # Tries to serialize str into tensor with write property
+            torch.save('foo', x)
+        x.write = str.__add__
+        x.flush = str.__mul__
+        with self.assertRaises(TypeError):
+            # Tries to serialize str into tensor with wrong callable write property
+            torch.save('foo', x)
+
 
     def test_serialization_storage_slice(self):
         # Generated using:
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index 74ce3b829b941..a72a8a2c11502 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -1253,7 +1253,7 @@ void initJITBindings(PyObject* module) {
       .def(py::init<std::string>())
       .def(py::init([](const py::object& buffer) {
         auto writer_func = [=](const void* data, size_t size) {
-          // Writting an empty file is a noop
+          // Writing an empty file is a noop
           if (size == 0) {
             return size;
           }
diff --git a/torch/serialization.py b/torch/serialization.py
index 173427edd4768..65d936679ed5e 100644
--- a/torch/serialization.py
+++ b/torch/serialization.py
@@ -248,7 +248,7 @@ def __exit__(self, *args):
 
 class _open_file(_opener):
     def __init__(self, name, mode):
-        super(_open_file, self).__init__(open(name, mode))
+        super().__init__(open(name, mode))
 
     def __exit__(self, *args):
         self.file_like.close()
@@ -256,7 +256,7 @@ def __exit__(self, *args):
 
 class _open_buffer_reader(_opener):
     def __init__(self, buffer):
-        super(_open_buffer_reader, self).__init__(buffer)
+        super().__init__(buffer)
         _check_seekable(buffer)
 
 
@@ -279,12 +279,12 @@ def _open_file_like(name_or_buffer, mode):
 
 class _open_zipfile_reader(_opener):
     def __init__(self, name_or_buffer) -> None:
-        super(_open_zipfile_reader, self).__init__(torch._C.PyTorchFileReader(name_or_buffer))
+        super().__init__(torch._C.PyTorchFileReader(name_or_buffer))
 
 
 class _open_zipfile_writer_file(_opener):
     def __init__(self, name) -> None:
-        super(_open_zipfile_writer_file, self).__init__(torch._C.PyTorchFileWriter(str(name)))
+        super().__init__(torch._C.PyTorchFileWriter(str(name)))
 
     def __exit__(self, *args) -> None:
         self.file_like.write_end_of_file()
@@ -292,8 +292,13 @@ def __exit__(self, *args) -> None:
 
 class _open_zipfile_writer_buffer(_opener):
     def __init__(self, buffer) -> None:
+        if not callable(getattr(buffer, "write", None)):
+            msg = f"Buffer of {str(type(buffer)).strip('<>')} has no callable attribute 'write'"
+            if not hasattr(buffer, "write"):
+                raise AttributeError(msg)
+            raise TypeError(msg)
         self.buffer = buffer
-        super(_open_zipfile_writer_buffer, self).__init__(torch._C.PyTorchFileWriter(buffer))
+        super().__init__(torch._C.PyTorchFileWriter(buffer))
 
     def __exit__(self, *args) -> None:
         self.file_like.write_end_of_file()

From 8593d4c9e98ec6b56b68f50696e2bc634c7eac21 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 31 Oct 2022 23:49:37 +0000
Subject: [PATCH 0368/1922] Revert "[dynamo] Error when user nests FX with
 dynamo (#87797)"

This reverts commit 1da5aeb97b73664ff0fe2f4bb48379655cede969.

Reverted https://github.com/pytorch/pytorch/pull/87797 on behalf of https://github.com/ezyang due to breaks nvfuser stack, needs more investigation
---
 test/dynamo/test_misc.py    | 14 --------------
 test/test_prims.py          | 10 +---------
 torch/_dynamo/config.py     |  4 ----
 torch/_dynamo/eval_frame.py |  9 ---------
 4 files changed, 1 insertion(+), 36 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index a63a6d8930c80..a0f592212f4e1 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2732,20 +2732,6 @@ def forward(self, x):
             dynamo_result = graph(x)
             self.assertTrue(same(real, dynamo_result))
 
-    def test_error_on_nested_fx_trace(self):
-        input = torch.rand(2, 3)
-
-        def f(x):
-            x + x
-
-        real = f(input)
-
-        optimized = torch._dynamo.optimize("eager")(f)
-        self.assertTrue(same(optimized(input), real))
-
-        with self.assertRaisesRegex(RuntimeError, "Detected that you are using FX"):
-            gm = torch.fx.symbolic_trace(optimized)
-
 
 class CustomFunc(torch.autograd.Function):
     @staticmethod
diff --git a/test/test_prims.py b/test/test_prims.py
index 6f400ce6e797e..6223a34e0a3a9 100644
--- a/test/test_prims.py
+++ b/test/test_prims.py
@@ -8,14 +8,7 @@
 
 import torch
 from torch.testing import make_tensor
-from torch.testing._internal.common_utils import (
-    parametrize,
-    run_tests,
-    TestCase,
-    TEST_SCIPY,
-    skipCUDAMemoryLeakCheckIf,
-    skipIfTorchDynamo,
-)
+from torch.testing._internal.common_utils import parametrize, run_tests, TestCase, TEST_SCIPY, skipCUDAMemoryLeakCheckIf
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
     onlyCUDA,
@@ -394,7 +387,6 @@ def func(a):
         actual = execute(gm, a.mT, executor="nvfuser")
         self.assertEqual(expected, actual)
 
-    @skipIfTorchDynamo
     def test_nvfuser_capability_context(self, device):
         # This test is to ensure that the torch calls are replaced with refs
         # based on the nvfuser+prims capability
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 12088383e741c..87014b20537bc 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -153,10 +153,6 @@
 # How to import torchinductor, either torchinductor or torch.inductor
 inductor_import = dynamo_import.replace("dynamo", "inductor")
 
-# If true, error with a better message if we symbolically trace over a
-# dynamo-optimized function. If false, silently suppress dynamo.
-error_on_nested_fx_trace = True
-
 # root folder of the project
 if "torch." in dynamo_import:
     base_dir = dirname(dirname(dirname(abspath(__file__))))
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 29bb14b629d7e..fce9e43b39343 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -14,7 +14,6 @@
 
 import torch
 import torch.utils._pytree as pytree
-from torch.fx._symbolic_trace import is_fx_tracing
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn.parallel.distributed import DistributedDataParallel
 
@@ -150,14 +149,6 @@ def __call__(self, *args, **kwargs):
 
         @functools.wraps(fn)
         def _fn(*args, **kwargs):
-            if is_fx_tracing():
-                if config.error_on_nested_fx_trace:
-                    raise RuntimeError(
-                        "Detected that you are using FX to symbolically trace "
-                        "a dynamo-optimized function. This is not supported at the moment."
-                    )
-                return fn
-
             on_enter()
             prior = set_eval_frame(callback)
             backend_ctx = backend_ctx_ctor()

From cf0574688168948ce717796c1402d5b1b64e05c3 Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Tue, 1 Nov 2022 00:00:35 +0000
Subject: [PATCH 0369/1922] Change Nested Tensor logging copy (#88104)

# Summary
Change the copy of how we log NestedTensor usage.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88104
Approved by: https://github.com/mikaylagawarecki
---
 aten/src/ATen/NestedTensorImpl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp
index c0199da124c36..4ed527cfd4865 100644
--- a/aten/src/ATen/NestedTensorImpl.cpp
+++ b/aten/src/ATen/NestedTensorImpl.cpp
@@ -173,7 +173,7 @@ NestedTensorImpl::NestedTensorImpl(
       nested_stride_tensor_(std::move(nested_stride_tensor)),
       storage_offsets_(std::move(offsets)),
       opt_sizes_(construct_opt_sizes(nested_size_tensor_)) {
-  C10_LOG_API_USAGE_ONCE("Using torch.NestedTensor");
+  C10_LOG_API_USAGE_ONCE("torch.NestedTensor");
   TORCH_WARN_ONCE(
       "The PyTorch API of nested tensors is in prototype stage and will change "
       "in the near future.");

From 4fb2e58b00172df36bbb1440559ca94b953ed372 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 31 Oct 2022 09:25:34 -0400
Subject: [PATCH 0370/1922] Also skip large models for normal --accuracy runs
 (#88086)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88086
Approved by: https://github.com/albanD
---
 benchmarks/dynamo/torchbench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py
index b6577745ab154..cec284ebcd8ca 100755
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@@ -217,7 +217,7 @@ def failing_dynamic_shape_models(self):
 
     @property
     def skip_accuracy_checks_large_models_dashboard(self):
-        if self.args.dashboard:
+        if self.args.dashboard or self.args.accuracy:
             return SKIP_ACCURACY_CHECK_MODELS
         return set()
 

From cc5309bac41b4a2683baee833171b72ae7a8a5bd Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Mon, 31 Oct 2022 10:18:45 -0700
Subject: [PATCH 0371/1922] [xnnpack][lite-int][graph-build] torchscript ->
 xnnpack graph (#87824)

This point we perform conversion for Torchscript IR to XNNPack graph. Currently we only support converting Add Nodes and fp32 tensor values.

As a caveat, we are not building this at runtime. So for testing we just run the xnn graph once ahead of time and with sample inputs and forward it to execute. This is only for testing, and will be changed in a later diff. This will allow us to check that graph creation is sound.

Differential Revision: [D39838851](https://our.internmc.facebook.com/intern/diff/D39838851/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87824
Approved by: https://github.com/digantdesai, https://github.com/salilsdesai
---
 test/jit/xnnpack/test_xnnpack_delegate.py     |  61 ++++++
 .../backends/xnnpack/xnnpack_backend_lib.cpp  |   5 +-
 .../xnnpack/xnnpack_backend_preprocess.cpp    |  19 +-
 .../xnnpack/xnnpack_graph_builder.cpp         | 199 ++++++++++++++++++
 .../backends/xnnpack/xnnpack_graph_builder.h  |  35 +++
 5 files changed, 315 insertions(+), 4 deletions(-)

diff --git a/test/jit/xnnpack/test_xnnpack_delegate.py b/test/jit/xnnpack/test_xnnpack_delegate.py
index 118a30dbe2cac..167a049ec0ccd 100644
--- a/test/jit/xnnpack/test_xnnpack_delegate.py
+++ b/test/jit/xnnpack/test_xnnpack_delegate.py
@@ -68,6 +68,67 @@ def forward(self, x):
         )
         lowered(torch.zeros(1))
 
+    def test_xnnpack_backend_add(self):
+        class AddModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                z = x + y
+                z = z + x
+                z = z + x
+                return z
+
+        add_module = AddModule()
+        sample_inputs = (torch.rand(1, 512, 512, 3), torch.rand(1, 512, 512, 3))
+        sample_output = torch.zeros(1, 512, 512, 3)
+
+        add_module = torch.jit.script(add_module)
+        expected_output = add_module(sample_inputs[0], sample_inputs[1])
+
+        lowered_add_module = torch._C._jit_to_backend(
+            "xnnpack",
+            add_module,
+            {
+                "forward": {
+                    "inputs" : [sample_inputs[0], sample_inputs[1]],
+                    "outputs": [sample_output]
+                }
+            }
+        )
+
+        actual_output = lowered_add_module.forward(sample_inputs[0], sample_inputs[1])
+        self.assertTrue(torch.allclose(actual_output, expected_output, atol=1e-03, rtol=1e-03))
+
+    def test_xnnpack_broadcasting(self):
+        class AddModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                return x + y
+
+        add_module = AddModule()
+        sample_inputs = (torch.rand(5, 1, 4, 1), torch.rand(3, 1, 1))
+        sample_output = torch.zeros(5, 3, 4, 1)
+
+        add_module = torch.jit.script(add_module)
+        expected_output = add_module(sample_inputs[0], sample_inputs[1])
+
+        lowered_add_module = torch._C._jit_to_backend(
+            "xnnpack",
+            add_module,
+            {
+                "forward": {
+                    "inputs" : [sample_inputs[0], sample_inputs[1]],
+                    "outputs": [sample_output]
+                }
+            }
+        )
+
+        actual_output = lowered_add_module.forward(sample_inputs[0], sample_inputs[1])
+        self.assertTrue(torch.allclose(actual_output, expected_output, atol=1e-03, rtol=1e-03))
+
     def test_xnnpack_unsupported(self):
         class AddSpliceModule(torch.nn.Module):
             def __init__(self):
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
index 4d1e934de4d97..d55e89ed216fa 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
@@ -41,10 +41,9 @@ class XNNPackBackend : public PyTorchBackendInterface {
   c10::impl::GenericList execute(
       c10::IValue handle,
       c10::impl::GenericList inputs) override {
-    c10::List<at::Tensor> output_list;
     auto answer = handle.toGenericDict().at("Answer");
-    output_list.emplace_back(answer.toTensor());
-    return c10::impl::toList(output_list);
+
+    return answer.toList();
   }
 };
 
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
index 536e1cb8e773d..1f185815ff453 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
@@ -87,8 +87,25 @@ c10::IValue preprocess(
   // inp above has been confirmed to be either Tensor or TensorList
   XNNGraph graph_builder;
   graph_builder.buildXNNGraph(graph, example_inputs);
+  // at this point graph is complete, for the sake of testing preprocess at this
+  // point we will do runtime setup and run with some default values
 
-  compiled.insert("Answer", at::empty({1}, c10::ScalarType::Float));
+  // grabbing the inputs from compile spec for testing
+
+  std::vector<at::Tensor> inputs;
+  auto input_list = inp.toList();
+
+  for (int i = 0; i < input_list.size(); i++) {
+    inputs.push_back(input_list.get(i).toTensor());
+  }
+  std::vector<at::Tensor> outputs;
+  outputs.push_back(out.toList().get(0).toTensor());
+
+  graph_builder.runGraphOnInputs(inputs, outputs);
+
+  c10::List<at::Tensor> output_list(outputs);
+
+  compiled.insert("Answer", output_list);
 
   return compiled;
 }
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
index 438e681b508b6..566a0d141b318 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
@@ -37,6 +37,68 @@ void XNNGraph::buildXNNGraph(
     std::vector<c10::IValue> example_inputs) {
   graph = optimizeAndTraceGraph(graph, example_inputs);
   checkOpsToDelegate(graph);
+  gatherTensorValues(graph);
+
+  // count unique input/outputs (some inputs can be outputs)
+  std::unordered_set<torch::jit::Value*> externals;
+  for (auto inp : _inputs) {
+    externals.insert(inp);
+  }
+  for (auto out : _outputs) {
+    externals.insert(out);
+  }
+
+  // create subgraph
+  xnn_status status = xnn_create_subgraph(
+      /*external_value_ids=*/externals.size(),
+      /*flags=*/0,
+      &_subgraph_ptr);
+  TORCH_CHECK(xnn_status_success == status, "Failed to create xnn subgraph");
+
+  defineAllTensorValues();
+  defineAllNodes(graph);
+  // at this point graph is complete, for the sake of testing preprocess at
+  // this point we will do runtime setup and run with some default values
+}
+
+void XNNGraph::runGraphOnInputs(
+    std::vector<at::Tensor> tensor_inputs,
+    std::vector<at::Tensor> tensor_outputs) {
+  TORCH_CHECK(
+      _subgraph_ptr != nullptr,
+      "run buildXNNGraph before running graph on inputs");
+  xnn_runtime_t runtime = nullptr;
+  xnn_status status =
+      xnn_create_runtime_v2(_subgraph_ptr, nullptr, /*flags=*/0, &runtime);
+  TORCH_CHECK(
+      xnn_status_success == status,
+      "failed to create runtime for running inputs");
+
+  // smart pointer for runtime
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime, xnn_delete_runtime);
+
+  std::vector<xnn_external_value> external_values;
+  TORCH_CHECK(
+      tensor_inputs.size() == _inputs.size(),
+      "supplied inputs does not match expected inputs");
+  for (int i = 0; i < tensor_inputs.size(); i++) {
+    external_values.push_back(
+        {_val_to_ids[_inputs[i]], tensor_inputs[i].data_ptr<float>()});
+  }
+
+  TORCH_CHECK(
+      tensor_outputs.size() == _outputs.size(),
+      "supplied outputs does not match expected outputs");
+  for (int i = 0; i < tensor_outputs.size(); i++) {
+    external_values.push_back(
+        {_val_to_ids[_outputs[i]], tensor_outputs[i].data_ptr<float>()});
+  }
+  status = xnn_setup_runtime(
+      auto_runtime.get(), external_values.size(), external_values.data());
+  TORCH_CHECK(xnn_status_success == status, "runtime not properly setup");
+
+  TORCH_CHECK(xnn_status_success == xnn_invoke_runtime(auto_runtime.get()));
 }
 
 void XNNGraph::checkOpsToDelegate(std::shared_ptr<torch::jit::Graph>& graph) {
@@ -65,6 +127,143 @@ void XNNGraph::checkOpsToDelegate(std::shared_ptr<torch::jit::Graph>& graph) {
       "the module contains the following unsupported ops:\n" + error.str());
 }
 
+void XNNGraph::defineAllNodes(std::shared_ptr<torch::jit::Graph>& graph) {
+  DepthFirstGraphNodeIterator it(graph);
+  Node* node = nullptr;
+  while ((node = it.next()) != nullptr) {
+    switch (node->kind()) {
+      case prim::Constant: {
+        break;
+      }
+      case aten::add: {
+        // todo: handle alpha for aten::add
+        uint32_t input1_id = _val_to_ids[node->inputs()[0]];
+        uint32_t input2_id = _val_to_ids[node->inputs()[1]];
+        TORCH_CHECK(
+            node->inputs()[2]->type()->cast<IntType>() == 1,
+            "non-1 alpha values not supported");
+        uint32_t output_id = _val_to_ids[node->outputs()[0]];
+
+        xnn_status status = xnn_define_add2(
+            _subgraph_ptr,
+            output_min,
+            output_max,
+            input1_id,
+            input2_id,
+            output_id,
+            /*flags=*/0);
+        TORCH_CHECK(status == xnn_status_success, "failed to create add node");
+        break;
+      }
+      default: {
+        throw std::exception();
+        TORCH_CHECK(
+            false,
+            "The node of ",
+            node->kind().toQualString(),
+            " is not supported yet");
+        break;
+      }
+    }
+  }
+}
+
+void XNNGraph::defineAllTensorValues() {
+  uint32_t external_id =
+      std::numeric_limits<decltype(XNN_INVALID_VALUE_ID)>::min();
+  for (auto val : _intermediate_tensors) {
+    if (_val_to_ids.find(val) == _val_to_ids.end()) {
+      uint32_t id = XNN_INVALID_VALUE_ID;
+
+      // cast value to tensortype
+      auto tensor_ptr = val->type()->cast<TensorType>();
+      auto num_dims = tensor_ptr->dim().value();
+
+      // create size_t* for tensor shape, casting must be done from long ->
+      // size_t
+      std::vector<long> sizes = tensor_ptr->sizes().concrete_sizes().value();
+      std::vector<size_t> tensor_shape;
+      tensor_shape.reserve(sizes.size());
+      for (auto dim : sizes) {
+        TORCH_CHECK(dim >= 0, "Input Dims should be unsigned");
+        tensor_shape.push_back(static_cast<size_t>(dim));
+      }
+
+      // ext_id value
+      uint32_t ext_id = XNN_INVALID_VALUE_ID;
+
+      // update flag for if tensor is either graph input/output
+      uint32_t flags = 0;
+
+      if (isGraphInput(val) || isGraphOutput(val)) {
+        if (isGraphInput(val)) {
+          flags |= XNN_VALUE_FLAG_EXTERNAL_INPUT;
+        }
+        if (isGraphOutput(val)) {
+          flags |= XNN_VALUE_FLAG_EXTERNAL_OUTPUT;
+        }
+        ext_id = external_id++;
+      }
+      xnn_status status = xnn_define_tensor_value(
+          /*subgraph=*/_subgraph_ptr,
+          /*datatype=*/xnn_datatype_fp32,
+          /*num_dims=*/num_dims,
+          /*dims=*/tensor_shape.data(),
+          /*data=*/nullptr, // currently no constant data
+          /*external_id=*/ext_id,
+          /*flags=*/flags,
+          /*id_out=*/&id);
+      TORCH_CHECK(
+          status == xnn_status_success,
+          "failed to define xnn_tensor_id for: " + val->debugName());
+      _val_to_ids.insert({val, id});
+    }
+  }
+}
+
+void XNNGraph::gatherTensorValues(std::shared_ptr<torch::jit::Graph>& graph) {
+  for (auto input : graph->inputs()) {
+    if (input->isCompleteTensor()) {
+      _intermediate_tensors.insert(input);
+      _inputs.push_back(input);
+    }
+  }
+
+  DepthFirstGraphNodeIterator it(graph);
+  Node* n = nullptr;
+  while ((n = it.next()) != nullptr) {
+    gatherNodeInputs(*n);
+  }
+
+  for (auto output : graph->outputs()) {
+    if (output->isCompleteTensor()) {
+      _intermediate_tensors.insert(output);
+      _outputs.push_back(output);
+    }
+  }
+}
+
+void XNNGraph::gatherNodeInputs(torch::jit::Node& node) {
+  switch (node.kind()) {
+    case aten::add: {
+      // this case will support all ops with only two inputs i.e. sub, add,
+      for (auto value : node.inputs()) {
+        if (value->isCompleteTensor()) {
+          _intermediate_tensors.insert(value);
+        }
+      }
+    }
+  }
+}
+
+bool XNNGraph::isGraphInput(torch::jit::Value* val) {
+  return std::count(_inputs.begin(), _inputs.end(), val) > 0;
+};
+
+bool XNNGraph::isGraphOutput(torch::jit::Value* val) {
+  return std::count(_outputs.begin(), _outputs.end(), val) > 0;
+};
+
 } // namespace delegate
 } // namespace xnnpack
 } // namespace jit
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h
index e9593376dc798..0a4f46c397c6a 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h
@@ -13,8 +13,19 @@ namespace delegate {
 
 class XNNGraph {
  private:
+  const float output_min = -std::numeric_limits<float>::infinity();
+  const float output_max = std::numeric_limits<float>::infinity();
+
   // xnn_subgraph
   xnn_subgraph_t _subgraph_ptr;
+  // Set of all the tensor values throughout the jit graph
+  std::unordered_set<torch::jit::Value*> _intermediate_tensors;
+  // Set of all the tensor values mapped to the xnnpack ids
+  std::unordered_map<torch::jit::Value*, uint32_t> _val_to_ids;
+  // Vector containing the torch valued inputs/outputs,
+  // must be ordered to preserve the order of input/outputs
+  std::vector<torch::jit::Value*> _inputs;
+  std::vector<torch::jit::Value*> _outputs;
 
   // Graph passes for optimizing and tracing torchscript graph
   // Essentially massaging the graph into a digestiable format for
@@ -23,6 +34,26 @@ class XNNGraph {
       std::shared_ptr<torch::jit::Graph> graph,
       std::vector<c10::IValue>& example_inputs);
 
+  // Gather all the intermediate tensor values within a graph. This
+  // skips through all prim constants. The purpose of this is for defining
+  // the tensor values beforehand for the xnnpack subgraph.
+  void gatherTensorValues(std::shared_ptr<torch::jit::Graph>& graph);
+
+  // Gathers the tensor values in a give node
+  void gatherNodeInputs(torch::jit::Node& node);
+
+  // Helper function to determine if a jit value is a graph input
+  bool isGraphInput(torch::jit::Value* val);
+
+  // Helper function to determine if a jit value is a graph output
+  bool isGraphOutput(torch::jit::Value* val);
+
+  // Defines all xnnpack nodes for the nodes in the graph
+  void defineAllNodes(std::shared_ptr<torch::jit::Graph>& graph);
+
+  // Defines all xnn tensor values used throughout the graph
+  void defineAllTensorValues();
+
   // Makes a pass through the graph and throws if any ops are unsupported
   void checkOpsToDelegate(std::shared_ptr<torch::jit::Graph>& graph);
 
@@ -42,6 +73,10 @@ class XNNGraph {
   void buildXNNGraph(
       std::shared_ptr<torch::jit::Graph>& graph,
       std::vector<c10::IValue> example_inputs);
+
+  void runGraphOnInputs(
+      std::vector<at::Tensor> tensor_inputs,
+      std::vector<at::Tensor> tensor_outputs);
 };
 
 } // namespace delegate

From d5054146a833671e55df1fc0cfa73d971c6d5f5b Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Mon, 31 Oct 2022 10:18:46 -0700
Subject: [PATCH 0372/1922] [xnnpack][lite-int][1/n] flatbuffer buck rules
 (#87826)

Writing a placeholder schema.fbs file for now to setup the buck gen rules. The generated schema file will be used in the xnnpack name space and be reserved for serialization/deserialization of our xnnpack lowered graph

Steps Accomplished

- Buck rules to compile flatbuffer schema
- added header file to preprocess
- everything compiles correctly

Differential Revision: [D38999169](https://our.internmc.facebook.com/intern/diff/D38999169/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D38999169/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87826
Approved by: https://github.com/digantdesai
---
 torch/csrc/jit/backends/xnnpack/serialization/schema.fbs | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 torch/csrc/jit/backends/xnnpack/serialization/schema.fbs

diff --git a/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
new file mode 100644
index 0000000000000..74302ea9a7bd7
--- /dev/null
+++ b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
@@ -0,0 +1,8 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+namespace xnnpack;
+
+enum xnn_value_type : short {
+  xnn_value_type_invalid = 0,
+  xnn_value_type_dense_tensor = 1,
+}

From 99b2a9df62d70478039e7bf7636f1e8fb10e5365 Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Mon, 31 Oct 2022 10:18:49 -0700
Subject: [PATCH 0373/1922] [xnnpack]lite-int][2/n] flatbuffer xnn_value schema
 (#87906)

serializer schema for xnnpack graphs

Differential Revision: [D39003170](https://our.internmc.facebook.com/intern/diff/D39003170/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87906
Approved by: https://github.com/digantdesai
---
 .../backends/xnnpack/serialization/schema.fbs | 92 ++++++++++++++++++-
 1 file changed, 88 insertions(+), 4 deletions(-)

diff --git a/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
index 74302ea9a7bd7..3b4b53debd026 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
+++ b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
@@ -1,8 +1,92 @@
 // Copyright (c) Meta Platforms, Inc. and affiliates.
 
-namespace xnnpack;
+namespace fb_xnnpack;
 
-enum xnn_value_type : short {
-  xnn_value_type_invalid = 0,
-  xnn_value_type_dense_tensor = 1,
+// datatype for xnn-values
+enum XNNDatatype : short {
+  /// Invalid data type. Valid Values never have this datatype.
+  xnn_datatype_invalid = 0,
+  /// IEEE754 single-precision floating-point.
+  xnn_datatype_fp32 = 1,
+  /// IEEE754 half-precision floating-point.
+  xnn_datatype_fp16 = 2,
+  /// Quantized 8-bit signed integer with shared per-Value quantization parameters.
+  xnn_datatype_qint8 = 3,
+  /// Quantized 32-bit signed integer with shared per-Value quantization parameters.
+  xnn_datatype_qint32 = 4,
 }
+
+// taken from executorch
+// Data buffer abstraction.
+table Buffer {
+  storage:[ubyte] (force_align: 16);
+}
+
+table XNNTensorValue {
+  // type of the tensor elements.
+  datatype:XNNDatatype;
+  // number of dimensions in the shape.
+  num_dims:uint;
+  // pointer to an array of @a num_dims shape dimensions. If num_dims is 0, this pointer can be NULL.
+  // XNNPACK does not keep any pointers to this array after the function returns.
+  dims:[uint];
+  // Index to the program's constant buffer table, value 0 is reserved to indicate non constant
+  constant_buffer_idx:uint;
+  // external ID for the Value. The ID must be within the range of reserved Value IDs specified on
+  // the Subgraph creation. If the external ID is XNN_INVALID_VALUE_ID, an internal ID will be
+  // created for the Value.
+  external_id:uint;
+  // binary features of the Value. Supported values are any combination of XNN_VALUE_FLAG_EXTERNAL_INPUT
+  // and XNN_VALUE_FLAG_EXTERNAL_OUTPUT.
+  flags:uint;
+  // pointer to the variable that will be initialized with the Value ID upon successful return. If a
+  // valid @a external_id was provided, the variable will be initialized with the @a external_id value.
+  id_out:uint;
+}
+
+union NodeUnion {
+  XNNAdd,
+}
+
+union ValueUnion {
+  XNNTensorValue,
+}
+
+table Node {
+  node:NodeUnion;
+}
+
+table Value {
+  value:ValueUnion;
+}
+
+table XNNAdd {
+  input1_id:uint;
+  input2_id:uint;
+  output_id:uint;
+  flags:uint;
+}
+
+table XNNGraph {
+  // Schema version.
+  version:string;
+  nodes:[Node];
+  values:[Value];
+
+  // Ids of external inputs
+  input_ids:[uint];
+
+  // Ids of external outputs
+  output_ids:[uint];
+
+  // Tables of constant data, used for constant Values (e.g.
+  // data field of weight tensors). Each constant is assigned an index into the table
+  // which are each individually aligned. 0 index is reserved to be pointed to by non-constant
+  // Tensors
+  constant_buffer:[Buffer];
+
+  // the list index is memory buffer id, the value is the memory buffer size.
+  mem_buffer_sizes: [uint];
+}
+
+root_type XNNGraph;

From b14b34850a4d3683216746c751a460f9a894c97f Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Mon, 31 Oct 2022 10:18:51 -0700
Subject: [PATCH 0374/1922] [xnnpack][lite-int][3/n] flatbuffer serializer
 class (#87907)

Creating a serializer class that allows us to serialize the xnnpack graph creation arguments. This essentially abstracts away the flatbuffer api manipulation and serialization that we deal with.

As a result we can call
```
XNNSerializer::serializeAddNode()
XNNSerializer::serializeTensorValue()
XNNSerializer::finishAndSerialize
```
to serialize the graph

Differential Revision: [D39196312](https://our.internmc.facebook.com/intern/diff/D39196312/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D39196312/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87907
Approved by: https://github.com/digantdesai
---
 .../xnnpack/serialization/serializer.cpp      | 93 +++++++++++++++++++
 .../xnnpack/serialization/serializer.h        | 76 +++++++++++++++
 2 files changed, 169 insertions(+)
 create mode 100644 torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
 create mode 100644 torch/csrc/jit/backends/xnnpack/serialization/serializer.h

diff --git a/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp b/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
new file mode 100644
index 0000000000000..306884a894568
--- /dev/null
+++ b/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
@@ -0,0 +1,93 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <caffe2/torch/csrc/jit/backends/xnnpack/serialization/serializer.h>
+#include <torch/csrc/jit/backends/xnnpack/serialization/schema_generated.h>
+
+#include <sstream>
+
+namespace torch {
+namespace jit {
+namespace xnnpack {
+namespace delegate {
+
+using namespace fb_xnnpack;
+
+void XNNSerializer::serializeAddNode(
+    uint32_t input1_id,
+    uint32_t input2_id,
+    uint32_t output_id,
+    uint32_t flags) {
+  const auto addNode =
+      CreateXNNAdd(_builder, input1_id, input2_id, output_id, flags);
+  const auto flatbufferNode =
+      CreateNode(_builder, NodeUnion::XNNAdd, addNode.Union());
+  _nodes.push_back(flatbufferNode);
+}
+
+void XNNSerializer::serializeTensorValue(
+    uint32_t xnn_datatype,
+    size_t num_dims,
+    std::vector<size_t> dims,
+    void* data,
+    uint32_t external_id,
+    uint32_t flags,
+    uint32_t id_out) {
+  // we will reserve buffers without data to index 0
+  int constant_buffer_idx = 0;
+  // Handling the tensor _values with data
+  // TODO @maxren fill out when handling tensors with data
+  if (data != nullptr) {
+    assert(false); // not supported yet
+    // steps:
+    // 1. creating buffer to store the 16 bit aligned data
+    // 2. increment buffer_idx, to reflect no buffer being added
+    // 3. record size into bufferSizes
+  }
+
+  std::vector<uint32_t> serialized_dims;
+  serialized_dims.reserve(dims.size());
+  for (auto dim : dims) {
+    serialized_dims.push_back(static_cast<uint32_t>(dim));
+  }
+
+  const auto tensorValue = CreateXNNTensorValueDirect(
+      _builder,
+      XNNDatatype(xnn_datatype),
+      num_dims,
+      &serialized_dims,
+      constant_buffer_idx,
+      external_id,
+      flags,
+      id_out);
+
+  const auto flatbufferValue =
+      CreateValue(_builder, ValueUnion::XNNTensorValue, tensorValue.Union());
+  _values.push_back(flatbufferValue);
+}
+
+std::string XNNSerializer::finishAndSerialize(
+    std::vector<uint32_t> input_ids,
+    std::vector<uint32_t> output_ids) {
+  auto xnnGraph = CreateXNNGraphDirect(
+      _builder,
+      _version_sha1,
+      &_nodes,
+      &_values,
+      &input_ids,
+      &output_ids,
+      &_constantBuffer,
+      &_bufferSizes);
+
+  _builder.Finish(xnnGraph);
+
+  std::stringstream ss;
+  ss.write(
+      reinterpret_cast<char*>(_builder.GetBufferPointer()), _builder.GetSize());
+
+  return ss.str();
+}
+
+} // namespace delegate
+} // namespace xnnpack
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/backends/xnnpack/serialization/serializer.h b/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
new file mode 100644
index 0000000000000..3d6927f7678b8
--- /dev/null
+++ b/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
@@ -0,0 +1,76 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <torch/csrc/jit/backends/xnnpack/serialization/schema_generated.h>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace torch {
+namespace jit {
+namespace xnnpack {
+namespace delegate {
+
+using namespace fb_xnnpack; // Specified in the schema
+
+class XNNSerializer {
+ public:
+  // Constructors
+  // initial buffersize of 1024 which will grow
+  // automatically
+  XNNSerializer() : XNNSerializer(1024) {}
+
+  explicit XNNSerializer(size_t bufferSize)
+      : _builder(bufferSize),
+        _nodes(),
+        _values(),
+        _constantBuffer(),
+        _bufferSizes() {}
+
+  // Serializing Nodes
+
+  // Serialize add node, we are serializing the argument needed to call
+  // xnn_define_add2. Serializing these values, and at run time we build
+  // teh graph by re running xnn_define_add2
+  void serializeAddNode(
+      uint32_t input1_id,
+      uint32_t input2_id,
+      uint32_t output_id,
+      uint32_t flags);
+
+  // Serializing Values
+  void serializeTensorValue(
+      uint32_t xnn_datatype,
+      size_t num_dims,
+      std::vector<size_t> dims,
+      void* data,
+      uint32_t external_id,
+      uint32_t flags,
+      uint32_t id_out);
+
+  // finish and serialize xnngraph returning serialized data
+  std::string finishAndSerialize(
+      std::vector<uint32_t> input_ids,
+      std::vector<uint32_t> output_ids);
+
+ private:
+  // xnnpack version we are serializing
+  const char* _version_sha1 = "ae108ef49aa5623b896fc93d4298c49d1750d9ba";
+
+  // flatbuffer objects we will create and serialize together to create xnngraph
+  flatbuffers_fbsource::FlatBufferBuilder _builder;
+
+  // Vector of the serialized xnnpack nodes
+  std::vector<flatbuffers_fbsource::Offset<Node>> _nodes;
+
+  // Vector of the serialized xnnpack values
+  std::vector<flatbuffers_fbsource::Offset<Value>> _values;
+
+  std::vector<flatbuffers_fbsource::Offset<Buffer>> _constantBuffer;
+  std::vector<uint32_t> _bufferSizes;
+};
+
+} // namespace delegate
+} // namespace xnnpack
+} // namespace jit
+} // namespace torch

From 12f16a5364b6bd785cc81a5f6ef9f0cd52619d8d Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Mon, 31 Oct 2022 10:18:53 -0700
Subject: [PATCH 0375/1922] [xnnpack][lite-int][4/n] introduce serialization to
 delegate (#87908)

We introduced the serializer we created in the previous diff to our XNNGraph builder, the purpose of this is to serialize parts of the graph as we build this. At the end, we are able to finish and serialize the xnngraph into a std::string for use when we forward this along to on-device runtime.

The next diff will rebuild the xnngraph from the serialization we introduce here, so testing the serialization of the graph will be done in the next diff

Differential Revision: [D39335580](https://our.internmc.facebook.com/intern/diff/D39335580/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D39335580/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87908
Approved by: https://github.com/digantdesai
---
 .../xnnpack/xnnpack_backend_preprocess.cpp    | 25 ++++++++++---
 .../xnnpack/xnnpack_graph_builder.cpp         | 35 +++++++++++++++++++
 .../backends/xnnpack/xnnpack_graph_builder.h  | 12 +++++--
 3 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
index 1f185815ff453..f2734a5e529a1 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
@@ -19,7 +19,10 @@ namespace delegate {
 // }
 // or
 // {
-//     "forward" : {"inputs" : c10::List<at::Tensor>}
+//     "forward" : {
+//                  "inputs" : c10::List<at::Tensor>,
+//                  "outputs" : c10::List<at::Tensor>
+//                  }
 // }
 // in which the value for "inputs" is the input shape to the module.
 // The module fed to the xnnpack backend must first be traced in order
@@ -92,6 +95,7 @@ c10::IValue preprocess(
 
   // grabbing the inputs from compile spec for testing
 
+  // gather sample inputs from compile spec
   std::vector<at::Tensor> inputs;
   auto input_list = inp.toList();
 
@@ -99,13 +103,24 @@ c10::IValue preprocess(
     inputs.push_back(input_list.get(i).toTensor());
   }
   std::vector<at::Tensor> outputs;
-  outputs.push_back(out.toList().get(0).toTensor());
+  auto output_list = out.toList();
+  std::vector<c10::IntList> output_shapes;
+
+  // gather sample outputs from compile spec
+  for (int i = 0; i < output_list.size(); i++) {
+    auto sample_output = output_list.get(i).toTensor();
+    outputs.push_back(sample_output);
+    // also gather output shapes to forward along to device
+    output_shapes.push_back(sample_output.sizes());
+  }
 
+  // sample run on sample inputs
   graph_builder.runGraphOnInputs(inputs, outputs);
+  c10::List<c10::IntList> shapes_list(output_shapes);
 
-  c10::List<at::Tensor> output_list(outputs);
-
-  compiled.insert("Answer", output_list);
+  compiled.insert("ser_model", graph_builder.serializedXNNGraph());
+  compiled.insert("outputs", shapes_list);
+  compiled.insert("Answer", outputs);
 
   return compiled;
 }
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
index 566a0d141b318..ec740bd66c509 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
@@ -127,6 +127,32 @@ void XNNGraph::checkOpsToDelegate(std::shared_ptr<torch::jit::Graph>& graph) {
       "the module contains the following unsupported ops:\n" + error.str());
 }
 
+std::string XNNGraph::serializedXNNGraph() {
+  std::vector<uint32_t> input_ids;
+  std::vector<uint32_t> output_ids;
+
+  for (auto val : _inputs) {
+    input_ids.push_back(_val_to_ids[val]);
+  }
+
+  for (auto val : _outputs) {
+    output_ids.push_back(_val_to_ids[val]);
+  }
+
+  return _serializer.finishAndSerialize(input_ids, output_ids);
+}
+
+std::vector<std::vector<long>> XNNGraph::getGraphOutputShapes() {
+  std::vector<std::vector<long>> output_shapes;
+  for (auto val : _outputs) {
+    auto tensor_ptr = val->type()->cast<TensorType>();
+    std::vector<long> sizes = tensor_ptr->sizes().concrete_sizes().value();
+    output_shapes.push_back(sizes);
+  }
+
+  return output_shapes;
+}
+
 void XNNGraph::defineAllNodes(std::shared_ptr<torch::jit::Graph>& graph) {
   DepthFirstGraphNodeIterator it(graph);
   Node* node = nullptr;
@@ -152,6 +178,7 @@ void XNNGraph::defineAllNodes(std::shared_ptr<torch::jit::Graph>& graph) {
             input2_id,
             output_id,
             /*flags=*/0);
+        _serializer.serializeAddNode(input1_id, input2_id, output_id, 0);
         TORCH_CHECK(status == xnn_status_success, "failed to create add node");
         break;
       }
@@ -213,6 +240,14 @@ void XNNGraph::defineAllTensorValues() {
           /*external_id=*/ext_id,
           /*flags=*/flags,
           /*id_out=*/&id);
+      _serializer.serializeTensorValue(
+          xnn_datatype_fp32,
+          num_dims,
+          tensor_shape,
+          nullptr,
+          ext_id,
+          flags,
+          id);
       TORCH_CHECK(
           status == xnn_status_success,
           "failed to define xnn_tensor_id for: " + val->debugName());
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h
index 0a4f46c397c6a..0ef0757f23196 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h
@@ -6,6 +6,8 @@
 #include <unordered_set>
 #include <vector>
 
+#include <torch/csrc/jit/backends/xnnpack/serialization/serializer.h>
+
 namespace torch {
 namespace jit {
 namespace xnnpack {
@@ -16,7 +18,9 @@ class XNNGraph {
   const float output_min = -std::numeric_limits<float>::infinity();
   const float output_max = std::numeric_limits<float>::infinity();
 
-  // xnn_subgraph
+  // serializer class
+  XNNSerializer _serializer;
+  // xnn subgraph
   xnn_subgraph_t _subgraph_ptr;
   // Set of all the tensor values throughout the jit graph
   std::unordered_set<torch::jit::Value*> _intermediate_tensors;
@@ -58,7 +62,7 @@ class XNNGraph {
   void checkOpsToDelegate(std::shared_ptr<torch::jit::Graph>& graph);
 
  public:
-  XNNGraph() : _subgraph_ptr(nullptr) {
+  XNNGraph() : _serializer(), _subgraph_ptr(nullptr) {
     xnn_status status = xnn_initialize(/*allocator =*/nullptr);
     TORCH_CHECK(xnn_status_success == status, "Failed to initialize xnnpack");
   }
@@ -77,6 +81,10 @@ class XNNGraph {
   void runGraphOnInputs(
       std::vector<at::Tensor> tensor_inputs,
       std::vector<at::Tensor> tensor_outputs);
+
+  std::string serializedXNNGraph();
+
+  std::vector<std::vector<long>> getGraphOutputShapes();
 };
 
 } // namespace delegate

From 60f7e61889ffa3b0ddc9ad1f36cdf195609091a5 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Tue, 1 Nov 2022 02:06:30 +0000
Subject: [PATCH 0376/1922] Add labeler with cpu, mkldnn, amp, NNC and
 quantization paths to start (#87690)

This PR is to dd labeler with `module: cpu`, `module: mkldnn`, `module: amp (automated mixed precision)`, `NNC` and `oncall: quantization' paths to start.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87690
Approved by: https://github.com/ezyang, https://github.com/malfet
---
 .github/labeler.yml | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 5af603fb36c92..e86ff2192edee 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -15,3 +15,37 @@
 - torch/_subclasses/fake_tensor.py
 - torch/_subclasses/fake_utils.py
 - torch/_subclasses/meta_utils.py
+
+"module: cpu":
+- aten/src/ATen/cpu/**
+- aten/src/ATen/native/cpu/**
+- aten/src/ATen/native/quantized/cpu/**
+- aten/src/ATen/native/Convolution*.cpp
+- aten/src/ATen/native/mkldnn/**
+- torch/cpu/**
+- torch/utils/mkldnn.py
+- test/test_mkldnn.py
+
+"module: mkldnn":
+- third_party/ideep
+- caffe2/ideep/**
+- caffe2/python/ideep/**
+- cmake/Modules/FindMKLDNN.cmake
+- third_party/mkl-dnn.BUILD
+- torch/csrc/jit/codegen/onednn/**
+- test/test_jit_llga_fuser.py
+
+"module: amp (automated mixed precision)":
+- torch/amp/**
+- aten/src/ATen/autocast_mode.*
+- torch/csrc/jit/passes/autocast.cpp
+- test/test_autocast.py
+
+"NNC":
+- torch/csrc/jit/tensorexpr/**
+
+"oncall: quantization":
+- torch/ao/quantization/**
+- torch/quantization/**
+- aten/src/ATen/quantized/**
+- aten/src/ATen/native/quantized/cpu/**

From b5e1827b9e80f141dfcf3c8eb9e0c591ab960745 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@fb.com>
Date: Tue, 1 Nov 2022 02:37:42 +0000
Subject: [PATCH 0377/1922] Add support for neg to NestedTensor (#88131)

Partially fixes #86889

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88131
Approved by: https://github.com/drisspg
---
 aten/src/ATen/native/native_functions.yaml    |  2 ++
 .../native/nested/NestedTensorUnaryOps.cpp    | 12 +++++++
 docs/source/nested.rst                        |  1 +
 test/test_nestedtensor.py                     | 33 ++++++++++---------
 4 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 300a14dd6baf6..296047d7c3c25 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4247,6 +4247,7 @@
   dispatch:
     SparseCPU, SparseCUDA: neg_sparse
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
   tags: canonical
 
 - func: neg_(Tensor(a!) self) -> Tensor(a!)
@@ -4256,6 +4257,7 @@
   dispatch:
     SparseCPU, SparseCUDA: neg_sparse_
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_
 
 - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
diff --git a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
index 74289a1372e12..6be7239775ea6 100644
--- a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
@@ -58,5 +58,17 @@ Tensor NestedTensor_tanh(const Tensor& self) {
   return map_nt(self, at::tanh);
 }
 
+Tensor& NestedTensor_neg_(Tensor& self) {
+  auto self_ptr = get_nested_tensor_impl(self);
+  check_numel_equals_buffer_size(self_ptr);
+  auto buffer = self_ptr->get_buffer();
+  at::neg_(buffer);
+  return self;
+}
+
+Tensor NestedTensor_neg(const Tensor& self) {
+  return map_nt(self, at::neg);
+}
+
 } // namespace native
 } // namespace at
diff --git a/docs/source/nested.rst b/docs/source/nested.rst
index 21ff980256911..07712e0376f16 100644
--- a/docs/source/nested.rst
+++ b/docs/source/nested.rst
@@ -196,6 +196,7 @@ NestedTensor and any constraints they have.
    :func:`torch.nn.Dropout`; "Behavior is the same as on regular tensors."
    :func:`torch.relu`; "Behavior is the same as on regular tensors."
    :func:`torch.gelu`; "Behavior is the same as on regular tensors."
+   :func:`torch.neg`; "Behavior is the same as on regular tensors."
    :func:`torch.add`; "Supports elementwise addition of two nested tensors.
    Supports addition of a scalar to a nested tensor."
    :func:`torch.mul`; "Supports elementwise multiplication of two nested tensors.
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index f5e9aa1b8d703..a7db45db4fec1 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -304,20 +304,6 @@ def test_repr_string(self):
         self.assertEqual(str(a), expected)
         self.assertEqual(repr(a), expected)
 
-    @torch.inference_mode()
-    def test_activations(self):
-        for func in (torch.nn.functional.relu,
-                     torch.nn.functional.relu_,
-                     torch.nn.functional.gelu,
-                     torch._C._nn.gelu_,
-                     torch.tanh,
-                     torch.tanh_):
-            t = torch.tensor([-1, 0, 1], dtype=torch.float)
-            nt = torch.nested.nested_tensor([t])
-            nested_result = func(nt)
-            self.assertTrue(nested_result.is_nested)
-            self.assertEqual(func(t), nested_result.unbind()[0])
-
     def test_to_padded_tensor_on_empty_tensor(self):
 
         nt = torch.nested.nested_tensor([])
@@ -762,6 +748,24 @@ def test_nested_tensor_indexing(self, device, dtype):
         expected_grad = torch.nested.nested_tensor([grad_x0, torch.zeros((3, 4), device=device, dtype=dtype)])
         self.assertEqual(nt.grad, expected_grad)
 
+    @parametrize("func", [torch.nn.functional.relu,
+                          torch.nn.functional.relu_,
+                          torch.nn.functional.gelu,
+                          torch._C._nn.gelu_,
+                          torch.tanh,
+                          torch.tanh_,
+                          torch.neg])
+    def test_activations(self, device, func):
+        nt, nt_noncontiguous = random_nt_noncontiguous_pair((2, 3, 6, 7), device=device, dtype=torch.float32)
+        nested_result = func(nt)
+        self.assertTrue(nested_result.is_nested)
+        for t, t_res in zip(nt.unbind(), nested_result.unbind()):
+            self.assertEqual(func(t), t_res)
+        self.assertRaisesRegex(
+            RuntimeError,
+            "NestedTensor must be contiguous to get buffer.",
+            lambda: func(nt_noncontiguous))
+
     @dtypes(*floating_types_and_half())
     def test_nested_tensor_chunk(self, device, dtype):
         # Transformer use case
@@ -897,7 +901,6 @@ def test_nested_tensor_div(self, device, dtype):
             RuntimeError, "div requires offsets to match when given NestedTensors",
             lambda: nt_chunks[0] / nt_chunks[1])
 
-
     @dtypes(torch.float, torch.float16)
     @skipMeta
     @torch.inference_mode()

From a053a97ed7b1bc9203ff16c4f3d5547662735087 Mon Sep 17 00:00:00 2001
From: Sahan Paliskara <sahancpal@gmail.com>
Date: Mon, 31 Oct 2022 12:40:30 -0700
Subject: [PATCH 0378/1922] [torch::deploy] add gpu unit tests to CI (#88107)

Adds `torch::deploy`'s GPU tests to core CI to make sure core changes don't break them.

Overall, deploy tests take 11 min, so it shouldn't be much of a burden :)  https://github.com/pytorch/pytorch/actions/runs/3364231795/jobs/5578861939

Differential Revision: [D40861442](https://our.internmc.facebook.com/intern/diff/D40861442)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88107
Approved by: https://github.com/d4l3k, https://github.com/anirbanr-fb-r2p
---
 .jenkins/pytorch/common_utils.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh
index d673a37f17b8f..8af2c93a1e504 100644
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.jenkins/pytorch/common_utils.sh
@@ -162,9 +162,8 @@ function checkout_install_torchdeploy() {
   pushd ..
   git clone --recurse-submodules https://github.com/pytorch/multipy.git
   pushd multipy
-  # with ABI flag change
   python multipy/runtime/example/generate_examples.py
-  pip install -e . --install-option="--abicxx"
+  pip install -e . --install-option="--cudatests"
   popd
   popd
 }
@@ -173,6 +172,7 @@ function test_torch_deploy(){
  pushd ..
  pushd multipy
  ./multipy/runtime/build/test_deploy
+ ./multipy/runtime/build/test_deploy_gpu
  popd
  popd
 }

From f52c37eff304b22601a42e2ea8f12dc54c13efc8 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Tue, 1 Nov 2022 03:09:34 +0000
Subject: [PATCH 0379/1922] Remove "prims_nvfuser" backend for TorchDynamo
 (#88083)

Removing "prims_nvfuser" backend according to the discussion in https://github.com/pytorch/torchdynamo/pull/1281#discussion_r979468355.

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88083
Approved by: https://github.com/ezyang
---
 test/test_fx_backends.py                | 252 ---------------------
 torch/_dynamo/debug_utils.py            |  22 --
 torch/_dynamo/optimizations/training.py |  65 ------
 torch/fx/passes/backends/nvfuser.py     | 286 ------------------------
 4 files changed, 625 deletions(-)
 delete mode 100644 test/test_fx_backends.py
 delete mode 100644 torch/fx/passes/backends/nvfuser.py

diff --git a/test/test_fx_backends.py b/test/test_fx_backends.py
deleted file mode 100644
index f9103d61aa960..0000000000000
--- a/test/test_fx_backends.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# Owner(s): ["module: fx"]
-
-import copy
-import sys
-import logging
-from typing import List, Tuple
-
-import torch
-from torch.fx._symbolic_trace import symbolic_trace
-from torch.fx.experimental.proxy_tensor import make_fx
-from torch.fx.passes.backends.nvfuser import NvFuserBackend
-
-from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TestCase
-from torch.testing._internal.common_device_type import (
-    instantiate_device_type_tests,
-    dtypes,
-)
-
-if not TEST_CUDA:
-    print('CUDA not available, skipping tests', file=sys.stderr)
-    TestCase = object  # noqa: F811
-
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger(__name__)
-
-class HF_T5_Partial(torch.nn.Module):
-
-    def inputs_meta(self):
-        return [
-            (torch.Size([512, 512]), torch.float32),
-            (torch.Size([512, 512]), torch.float32),
-            (torch.Size([512, 512]), torch.float32),
-            (torch.Size([512, 512]), torch.float32),
-            (torch.Size([512]), torch.float32),
-            (torch.Size([2048, 512]), torch.float32),
-            (torch.Size([512, 2048]), torch.float32),
-            (torch.Size([512]), torch.float32),
-            (torch.Size([8, 1024, 512]), torch.float32),
-            (torch.Size([8, 8, 1024, 1024]), torch.float32),
-        ]
-
-    def forward(self, primals_1, primals_2, primals_3, primals_4, primals_5,
-                primals_6, primals_7, primals_8, primals_9, primals_10):
-        pow_1 = torch.ops.aten.pow(primals_9, 2)
-        mean = torch.ops.aten.mean(pow_1, [-1], True)
-        add = torch.ops.aten.add(mean, 1e-06)
-        rsqrt = torch.ops.aten.rsqrt(add)
-        mul = torch.ops.aten.mul(primals_9, rsqrt)
-        mul_1 = torch.ops.aten.mul(primals_5, mul)
-        t = torch.ops.aten.t(primals_3)
-        view = torch.ops.aten.view(mul_1, [8192, 512])
-        mm = torch.ops.aten.mm(view, t)
-        _unsafe_view = torch.ops.aten._unsafe_view(mm, [8, 1024, 512])
-        view_1 = torch.ops.aten.view(_unsafe_view, [8, -1, 8, 64])
-        transpose = torch.ops.aten.transpose(view_1, 1, 2)
-        t_1 = torch.ops.aten.t(primals_1)
-        view_2 = torch.ops.aten.view(mul_1, [8192, 512])
-        mm_1 = torch.ops.aten.mm(view_2, t_1)
-        _unsafe_view_1 = torch.ops.aten._unsafe_view(mm_1, [8, 1024, 512])
-        view_3 = torch.ops.aten.view(_unsafe_view_1, [8, -1, 8, 64])
-        transpose_1 = torch.ops.aten.transpose(view_3, 1, 2)
-        t_2 = torch.ops.aten.t(primals_4)
-        view_4 = torch.ops.aten.view(mul_1, [8192, 512])
-        mm_2 = torch.ops.aten.mm(view_4, t_2)
-        _unsafe_view_2 = torch.ops.aten._unsafe_view(mm_2, [8, 1024, 512])
-        view_5 = torch.ops.aten.view(_unsafe_view_2, [8, -1, 8, 64])
-        transpose_2 = torch.ops.aten.transpose(view_5, 1, 2)
-        transpose_3 = torch.ops.aten.transpose(transpose_1, 3, 2)
-        expand = torch.ops.aten.expand(transpose, [8, 8, 1024, 64])
-        clone = torch.ops.aten.clone(expand, memory_format=torch.contiguous_format)
-        _unsafe_view_3 = torch.ops.aten._unsafe_view(clone, [64, 1024, 64])
-        expand_1 = torch.ops.aten.expand(transpose_3, [8, 8, 64, 1024])
-        clone_1 = torch.ops.aten.clone(expand_1, memory_format=torch.contiguous_format)
-        _unsafe_view_4 = torch.ops.aten._unsafe_view(clone_1, [64, 64, 1024])
-        bmm = torch.ops.aten.bmm(_unsafe_view_3, _unsafe_view_4)
-        _unsafe_view_5 = torch.ops.aten._unsafe_view(bmm, [8, 8, 1024, 1024])
-        add_ = torch.ops.aten.add_(_unsafe_view_5, primals_10)
-        _softmax = torch.ops.aten._softmax(add_, -1, False)
-        expand_2 = torch.ops.aten.expand(_softmax, [8, 8, 1024, 1024])
-        view_6 = torch.ops.aten.view(expand_2, [64, 1024, 1024])
-        expand_3 = torch.ops.aten.expand(transpose_2, [8, 8, 1024, 64])
-        clone_2 = torch.ops.aten.clone(expand_3, memory_format=torch.contiguous_format)
-        _unsafe_view_6 = torch.ops.aten._unsafe_view(clone_2, [64, 1024, 64])
-        bmm_1 = torch.ops.aten.bmm(view_6, _unsafe_view_6)
-        _unsafe_view_7 = torch.ops.aten._unsafe_view(bmm_1, [8, 8, 1024, 64])
-        transpose_4 = torch.ops.aten.transpose(_unsafe_view_7, 1, 2)
-        clone_3 = torch.ops.aten.clone(transpose_4, memory_format=torch.contiguous_format)
-        view_7 = torch.ops.aten.view(clone_3, [8, -1, 512])
-        t_3 = torch.ops.aten.t(primals_2)
-        view_8 = torch.ops.aten.view(view_7, [8192, 512])
-        mm_3 = torch.ops.aten.mm(view_8, t_3)
-        _unsafe_view_8 = torch.ops.aten._unsafe_view(mm_3, [8, 1024, 512])
-        add_1 = torch.ops.aten.add(primals_9, _unsafe_view_8)
-        pow_2 = torch.ops.aten.pow(add_1, 2)
-        mean_1 = torch.ops.aten.mean(pow_2, [-1], True)
-        add_2 = torch.ops.aten.add(mean_1, 1e-06)
-        rsqrt_1 = torch.ops.aten.rsqrt(add_2)
-        mul_2 = torch.ops.aten.mul(add_1, rsqrt_1)
-        mul_3 = torch.ops.aten.mul(primals_8, mul_2)
-        t_4 = torch.ops.aten.t(primals_6)
-        view_9 = torch.ops.aten.view(mul_3, [8192, 512])
-        mm_4 = torch.ops.aten.mm(view_9, t_4)
-        _unsafe_view_9 = torch.ops.aten._unsafe_view(mm_4, [8, 1024, 2048])
-        relu = torch.ops.aten.relu(_unsafe_view_9)
-        t_5 = torch.ops.aten.t(primals_7)
-        view_10 = torch.ops.aten.view(relu, [8192, 2048])
-        mm_5 = torch.ops.aten.mm(view_10, t_5)
-        _unsafe_view_10 = torch.ops.aten._unsafe_view(mm_5, [8, 1024, 512])
-        add_3 = torch.ops.aten.add(add_1, _unsafe_view_10)
-        return [add_3, rsqrt, _unsafe_view_3, t_3, _softmax, view_6, mul_2, t, view_9, t_1, primals_5, add_1,
-                _unsafe_view_4, view_2, view_10, t_5, t_2, primals_8, view_4, view_8, rsqrt_1, primals_9, t_4,
-                mul, _unsafe_view_6, relu, view]
-
-
-class TestFxNvFuserBackend(TestCase):
-
-    def _generate_random_inputs(self, device, inputs_meta: List[Tuple[torch.Size, torch.dtype]]):
-        inputs = []
-        for meta in inputs_meta:
-            shape, dtype = meta
-
-            if dtype in {torch.int, torch.int32, torch.int64, torch.bool, torch.int, torch.uint8}:
-                input = torch.randint(0, 1, shape, dtype=dtype, device=device)
-            else:
-                input = torch.rand(shape, dtype=dtype, device=device)
-
-            inputs.append(input)
-
-        return inputs
-
-
-    @dtypes(torch.float32)
-    def test_nvfuser_call_module_backend(self, device, dtype):
-
-        class Model(torch.nn.Module):
-
-            def __init__(self):
-                super(Model, self).__init__()
-                self.bn = torch.nn.BatchNorm2d(3)
-                self.relu = torch.nn.ReLU()
-
-            def forward(self, inp):
-                o = self.bn(inp)
-                o = self.relu(o)
-                return o
-
-        inp = torch.randn(2, 3, 4, 5).to(dtype=dtype, device=device)
-        m = Model().to(dtype=dtype, device=device)
-
-        # note that the traced module here contains only `call_module` node,
-        # which isn't fused by nvfuser backend. But `nvfuser.compile` should run without error
-        traced = symbolic_trace(m)
-
-        nvfuser = NvFuserBackend()
-        compiled_module = nvfuser.compile(traced)
-
-        eager_result = m(inp)
-        nvfuser_result = compiled_module(inp)
-
-        torch.testing.assert_close(eager_result, nvfuser_result, rtol=1e-5, atol=1e-5)
-
-
-    @dtypes(torch.float32)
-    def test_nvfuser_backend(self, device, dtype):
-        m = HF_T5_Partial()
-        m.to(device)
-
-        traced = symbolic_trace(m)
-
-        nvfuser = NvFuserBackend()
-        compiled_module = nvfuser.compile(traced)
-
-        inputs = self._generate_random_inputs(device, m.inputs_meta())
-
-        eager_result = m(*inputs)
-        nvfuser_result = compiled_module(*inputs)
-
-        torch.testing.assert_close(eager_result, nvfuser_result, rtol=1e-5, atol=1e-5)
-
-    @dtypes(torch.float32)
-    def test_aten_square(self, device, dtype):
-
-        def fn(x):
-            square = torch.square(x)
-            a = square + 1
-            b = a + 1
-            return b
-
-        inputs = torch.randn(4, device=device)
-        traced = make_fx(fn)(inputs)
-
-        nvfuser = NvFuserBackend()
-        compiled_module = nvfuser.compile(copy.deepcopy(traced))
-
-        for node in compiled_module.graph.nodes:
-            if node.op == "call_function":
-                assert "fused" in str(node.target), "the entire function should be fused into a single fusion group"
-
-        eager_result = traced(inputs)
-        nvfuser_result = compiled_module(inputs)
-        torch.testing.assert_close(eager_result, nvfuser_result, rtol=1e-5, atol=1e-5)
-
-    @dtypes(torch.float32)
-    def test_aten_leakyrelu(self, device, dtype):
-
-        def fn(x):
-            square = torch.ops.aten.leaky_relu(x, 0.1)
-            a = square + 1
-            b = a + 1
-            return b
-
-        inputs = torch.randn(4, device=device)
-        traced = make_fx(fn)(inputs)
-
-        nvfuser = NvFuserBackend()
-        compiled_module = nvfuser.compile(copy.deepcopy(traced))
-
-        for node in compiled_module.graph.nodes:
-            if node.op == "call_function":
-                assert "fused" in str(node.target), "the entire function should be fused into a single fusion group"
-
-        eager_result = traced(inputs)
-        nvfuser_result = compiled_module(inputs)
-        torch.testing.assert_close(eager_result, nvfuser_result, rtol=1e-5, atol=1e-5)
-
-    @dtypes(torch.float32)
-    def test_aten_where(self, device, dtype):
-
-        def fn(x):
-            where = torch.ops.aten.where(x < 0, -x, x)
-            a = where + 1
-            b = a + 1
-            return b
-
-        inputs = torch.randn(4, device=device)
-        traced = make_fx(fn)(inputs)
-
-        nvfuser = NvFuserBackend()
-        compiled_module = nvfuser.compile(copy.deepcopy(traced))
-
-        for node in compiled_module.graph.nodes:
-            if node.op == "call_function":
-                assert "fused" in str(node.target), "the entire function should be fused into a single fusion group"
-
-        eager_result = traced(inputs)
-        nvfuser_result = compiled_module(inputs)
-        torch.testing.assert_close(eager_result, nvfuser_result, rtol=1e-5, atol=1e-5)
-
-instantiate_device_type_tests(TestFxNvFuserBackend, globals(), only_for="cuda")
-
-if __name__ == "__main__":
-    run_tests()
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index f2774e9bb14db..b1495789d2eae 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -179,11 +179,6 @@ def generate_compiler_repro_string(gm, args):
 from {config.dynamo_import}.debug_utils import same_two_models
 """
 
-NVFUSER_IMPORT = """
-from torch.fx.passes.backends.nvfuser import NvFuserBackend
-nvfuser = NvFuserBackend()
-"""
-
 COMPILER_REPRO_OPTIONS = {
     "inductor": (INDUCTOR_IMPORT, "compile_fx_inner", "inductor_fails"),
     "inductor_accuracy": (
@@ -191,7 +186,6 @@ def generate_compiler_repro_string(gm, args):
         "compile_fx_inner",
         "inductor_accuracy_fails",
     ),
-    "nvfuser": (NVFUSER_IMPORT, "nvfuser", "nvfuser_fails"),
 }
 
 
@@ -309,22 +303,6 @@ def inductor_fails(fx_g, args, check_str=None):
     return False
 
 
-def nvfuser_fails(fx_g, args, check_str=None):
-    from torch.fx.passes.backends.nvfuser import NvFuserBackend
-
-    nvfuser = NvFuserBackend()
-
-    try:
-        compile_mod = nvfuser(fx_g, args)
-        compile_mod = compile_mod(*args)
-    except Exception as e:
-        if check_str is not None and check_str not in repr(e):
-            return False
-        print(repr(e))
-        return True
-    return False
-
-
 def inductor_accuracy_fails(fx_g, args, check_str=None):
     from torch._inductor.compile_fx import compile_fx_inner
 
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index bec450bd37430..84d8b7b21e3e5 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -14,7 +14,6 @@
 from torch.utils._pytree import tree_map
 
 from .. import config
-from ..debug_utils import wrap_compiler_debug
 from ..utils import clone_inputs, count_calls, counters
 from .analysis import has_mutation
 from .backends import BACKENDS
@@ -261,66 +260,6 @@ def __call__(self, gm: torch.fx.GraphModule, example_inputs):
 aot_mem_efficient_fusion_no_decomp = AOTMemEfficientFusionWithContext(False)
 
 
-class AotPrimsNvfuser(AotAutogradStrategy):
-    """
-    Use FX graph partitioner + Aten2Prims ref + trace executor + nvFuser
-    """
-
-    def __init__(self, gm: torch.fx.GraphModule, example_inputs):
-        super(AotPrimsNvfuser, self).__init__(gm, example_inputs)
-
-        from functorch.compile import min_cut_rematerialization_partition
-
-        from torch.fx.passes.backends.nvfuser import NvFuserBackend
-
-        self.nvfuser = NvFuserBackend()
-        self.min_cut_rematerialization_partition = min_cut_rematerialization_partition
-        self.populate_aten2aten_decomps()
-
-    def populate_aten2aten_decomps(self):
-        from torch._decomp import get_decompositions
-
-        aten = torch.ops.aten
-        default_decompositions = {
-            aten.detach,
-            aten.gelu_backward,
-            aten.leaky_relu_backward,
-            aten.sigmoid_backward,
-            aten.threshold_backward,
-            aten.hardtanh_backward,
-            aten.hardsigmoid_backward,
-            aten.hardswish_backward,
-            aten.tanh_backward,
-            aten.silu_backward,
-            aten.elu_backward,
-            aten.cudnn_batch_norm,
-            aten.cudnn_batch_norm_backward,
-            aten.masked_fill.Scalar,
-            aten.masked_fill.Tensor,
-            aten.elu,
-            aten.leaky_relu,
-            aten.hardtanh,
-            aten.hardswish,
-            aten.hardsigmoid,
-            aten.rsub,
-            aten.native_batch_norm_backward,
-        }
-
-        self.aten2aten_decompositions = get_decompositions(default_decompositions)
-
-    def candidate(self):
-        return BACKENDS["aot_autograd"](
-            self.gm,
-            self.example_inputs,
-            fw_compiler=wrap_compiler_debug(self.nvfuser, "nvfuser"),
-            partition_fn=self.min_cut_rematerialization_partition,
-            decompositions=self.aten2aten_decompositions,
-        )
-
-
-aot_prims_nvfuser = AotPrimsNvfuser.compile_fn
-
-
 def prims_executor(gm, inputs, *, executor):
     # This function is called once per forward/backward pass of a graph in AOT
     # Autograd. We use it to set up the nvFuser-specific FX graph and return
@@ -525,10 +464,6 @@ def create_aot_backends():
     # by using the relevant fuser with torch.jit.fuser(...)
     BACKENDS["aot_ts"] = aot_ts
 
-    # prims_nvfuser uses the prims and AOT-Autograd to get FX-aten IR. And then
-    # directly lowers to NVFuser without relying no Torchscript.
-    BACKENDS["prims_nvfuser"] = aot_prims_nvfuser
-
     # "nvprims" is a subset of PrimTorch primitives that are guaranteed to be
     # supported by nvFuser. This is the preferred backend for nvFuser+PrimTorch.
     BACKENDS["nvprims_nvfuser"] = aot_nvprims_nvfuser
diff --git a/torch/fx/passes/backends/nvfuser.py b/torch/fx/passes/backends/nvfuser.py
deleted file mode 100644
index fdb1dd9a3320b..0000000000000
--- a/torch/fx/passes/backends/nvfuser.py
+++ /dev/null
@@ -1,286 +0,0 @@
-from typing import Dict
-
-import torch
-from torch.nn import Module
-from torch._ops import OpOverload
-
-from torch.fx import GraphModule
-from torch.fx.node import Node, _get_qualified_name
-from torch.fx.passes.operator_support import OperatorSupport
-from torch.fx.passes.tools_common import CALLABLE_NODE_OPS
-from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
-from torch._prims.executor import execute
-from torch.fx.experimental.proxy_tensor import DecompositionInterpreter
-from torch._decomp import decomposition_table
-
-import typing as t
-
-import logging
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
-
-def aten_to_dtype(self, dtype: torch.dtype, **kwargs):
-    if len(kwargs) > 0 or not dtype:
-        raise RuntimeError("No support for other to.dtype() formats other than to.dtype(self, dtype)")
-    return torch._prims.convert_element_type(self, dtype)
-
-# decomposition_table currently contains both aten2aten and aten2prim decomposition
-# this is a hack to separate them, as we only need aten2prim decomposition for nvfuser-supported aten graph lowering
-aten2aten_decomp = {}
-aten2prim_decomp = {}
-
-for op, decomp_fn in decomposition_table.items():
-    if "torch._refs" in decomp_fn.__module__:
-        aten2prim_decomp[op] = decomp_fn
-    else:
-        aten2aten_decomp[op] = decomp_fn
-
-aten2aten_decomp_skips = {
-    "aten.native_layer_norm_backward.default",
-    "aten.embedding_dense_backward.default",   # This is hurting nvfuser's perf
-    "aten.addmm.default"
-}
-
-for op, decomp_fn in decomposition_table.items():
-    if "torch._refs" in decomp_fn.__module__:
-        aten2prim_decomp[op] = decomp_fn
-    else:
-        if str(op) not in aten2aten_decomp_skips:
-            aten2aten_decomp[op] = decomp_fn
-
-
-aten2prim_decomp[torch.ops.aten.to.dtype] = aten_to_dtype
-
-
-class NvFuserOperatorSupport(OperatorSupport):
-    """
-    Operator support for nvFuser backend.
-
-    Currently, partitioning is based on FX ATen graph. The fused subgraph will latter be decomposed into prims.
-    To determine if an ATen ops is supported by nvFuser, we shall check the prim ops used in its ref decomposition.
-    Only if all the prim ops in the ref has a nvfuser_impl, we say this Aten op is suppported by nvFuser.
-
-    Note: When adding a rule, please add it to the corresponding section and follow the
-    alphabetical order.
-    """
-
-    def __init__(self):
-
-        # TODO: current list copied from torch/csrc/jit/codegen/cuda/parser.cpp is incorrect,
-        # as that file is solely for TorchScript and doesn't represent the actual status
-        # whether operation would be runnable by primTorch+nvFuser.
-        # We will iterate on this list to reflect the the reality.
-        support_dict = {
-            # ===============================================================
-            # call_function aten
-            # ===============================================================
-            # Following supported aten ops is copied from torch/csrc/jit/codegen/cuda/parser.cpp
-            # TODO: might need to update according to supported input types
-            "torch.ops.aten.add": None,
-            "torch.ops.aten.sub": None,
-            # "torch.ops.aten.rsub": None,    # rsub decomp is supported at aten2aten level
-            "torch.ops.aten.div": None,
-            "torch.ops.aten.atan2": None,
-            "torch.ops.aten.mul": None,
-            "torch.ops.aten.max": None,
-            "torch.ops.aten.min": None,
-            "torch.ops.aten.pow": None,
-            "torch.ops.aten.remainder": None,
-            "torch.ops.aten.fmod": None,
-            "torch.ops.aten.bitwise_and": None,
-            "torch.ops.aten.__and__": None,
-            "torch.ops.aten.bitwise_or": None,
-            "torch.ops.aten.__or__": None,
-            "torch.ops.aten.bitwise_xor": None,
-            "torch.ops.aten.__xor__": None,
-            "torch.ops.aten.bitwise_left_shift": None,
-            "torch.ops.aten.__lshift__": None,
-            "torch.ops.aten.bitwise_right_shift": None,
-            "torch.ops.aten.__rshift__": None,
-            "torch.ops.aten.eq": None,
-            "torch.ops.aten.ne": None,
-            "torch.ops.aten.ge": None,
-            "torch.ops.aten.gt": None,
-            "torch.ops.aten.le": None,
-            "torch.ops.aten.lt": None,
-            "torch.ops.aten.abs": None,
-            "torch.ops.aten.bitwise_not": None,
-            "torch.ops.aten.ceil": None,
-            "torch.ops.aten.floor": None,
-            "torch.ops.aten.frac": None,
-            "torch.ops.aten.neg": None,
-            "torch.ops.aten.relu": None,
-            "torch.ops.aten.round": None,
-            "torch.ops.aten.silu": None,
-            "torch.ops.aten.trunc": None,
-            "torch.ops.aten.log": None,
-            "torch.ops.aten.log10": None,
-            "torch.ops.aten.log1p": None,
-            "torch.ops.aten.log2": None,
-            "torch.ops.aten.lgamma": None,
-            "torch.ops.aten.exp": None,
-            "torch.ops.aten.expm1": None,
-            "torch.ops.aten.erf": None,
-            "torch.ops.aten.erfc": None,
-            "torch.ops.aten.cos": None,
-            "torch.ops.aten.acos": None,
-            "torch.ops.aten.cosh": None,
-            "torch.ops.aten.sin": None,
-            "torch.ops.aten.asin": None,
-            "torch.ops.aten.sinh": None,
-            "torch.ops.aten.tan": None,
-            "torch.ops.aten.atan": None,
-            "torch.ops.aten.tanh": None,
-            "torch.ops.aten.atanh": None,
-            "torch.ops.aten.sqrt": None,
-            "torch.ops.aten.rsqrt": None,
-            "torch.ops.aten.reciprocal": None,
-            "torch.ops.aten.sigmoid": None,
-            "torch.ops.aten.isfinite": None,
-            "torch.ops.aten.isinf": None,
-            "torch.ops.aten.isnan": None,
-            "torch.ops.aten.isneginf": None,
-            "torch.ops.aten.isposinf": None,
-            "torch.ops.aten.isreal": None,
-            # "torch.ops.aten.rand_like": None,  # causing Node empty_like_default does not support nvfuser
-            "torch.ops.aten.softplus": None,
-            "torch.ops.aten.threshold": None,
-            # relying on aten->aten->prim decomp, aten2aten is using unsupported aten.new_zero op
-            # "torch.ops.aten.threshold_backward": None,
-            "torch.ops.aten.clamp": None,
-            # "torch.ops.aten.clone": None,
-            # Failing with where(): incompatible function arguments: \
-            # [<torch._C._nvfuser.TensorView, tensor, <torch._C._nvfuser.TensorView]
-            # failing with BERT_pytorch_forward_0, which has aten.where.ScalarSelf in the decomps
-            # "torch.ops.aten.where": None,
-            # However, aten.where.self overload is fully supported
-            "torch.ops.aten.where.self": None,
-            "torch.ops.aten.lerp": None,
-            "torch.ops.aten.addcmul": None,
-            # "torch.ops.aten.native_dropout": None,    # missing refs for aten.rank_like
-            "torch.ops.aten.dropout": None,
-            # "torch.ops.aten.native_dropout_backward": None,   # missing refs for aten.type_as
-            "torch.ops.aten.instance_norm": None,
-            "torch.ops.aten._batch_norm_impl_index": None,
-            # "torch.ops.aten.native_batch_norm": None,     # missing refs for aten.var
-            "torch.ops.aten.batch_norm": None,
-            "torch.ops.aten.cudnn_batch_norm": None,
-            "torch.ops.aten._batch_norm_impl_index_backward": None,
-            # "torch.ops.aten.native_batch_norm_backward": None,    # should have been handled at aten2aten decomp
-            "torch.ops.aten.native_layer_norm": None,
-            "torch.ops.aten.layer_norm": None,
-            # relying on aten->aten->prim decomp, aten2aten is using unsupported aten.div
-            # "torch.ops.aten.native_layer_norm_backward": None,
-            "torch.ops.aten.softmax.int": None,
-            "torch.ops.aten.log_softmax.int": None,
-            # relying on aten->aten->prim decomp, aten2aten is using unsupported aten.amax
-            # "torch.ops.aten._softmax": None,
-            "torch.ops.aten._log_softmax_backward_data": None,
-            # "torch.ops.aten._softmax_backward_data": None,  # Node _softmax_backward_data_default does not support nvfuser
-            # "torch.ops.aten.var.dim": None,       # missing refs
-            "torch.ops.aten.std.dim": None,
-            "torch.ops.aten.sum": None,
-            # "torch.ops.aten.mean.dim": None,      # missing refs
-            "torch.ops.aten._grad_sum_to_size": None,
-            "torch.ops.aten.sum_to_size": None,
-            "torch.ops.aten._autocast_to_reduced_precision": None,
-            "torch.ops.aten._autocast_to_full_precision": None,
-            # "torch.ops.aten.to.dtype": None,      # causing segfault
-            # "torch.ops.aten.type_as": None,       # missing refs
-            "torch.ops.aten.linear": None,
-            "torch.ops.aten.gelu": None,
-            # "torch.ops.aten.gelu_backward": None,       # gelu_backward is handled at aten2aten decomp
-            # "torch.ops.aten.hardtanh": None,        # has functional ref, using unsupported aten.clamp
-            "torch.ops.aten.leaky_relu": None,
-            "torch.ops.aten.square": None,
-            # relying on aten->aten->prim decomp, aten2aten is using unsupported aten.conj_physical
-            "torch.ops.aten.tanh_backward": None,
-            # "torch.ops.aten.amax": None,      # missing prim decomp
-            # "torch.ops.aten.amin": None,      # missing prim decomp
-            # "torch.ops.aten.reshape": None,
-            # "torch.ops.aten.view": None,      # missing prim decomp
-            "torch.ops.aten.flatten.using_ints": None,
-
-            # ===============================================================
-            # call_function builtins and operator
-            # ===============================================================
-            "getattr": None,
-            "_operator.getitem": None,
-        }
-
-        super().__init__(support_dict)
-
-    def is_node_supported(
-        self, submodules: t.Mapping[str, Module], node: Node
-    ) -> bool:
-
-        # nvFuser FX subgraph should be purely functional
-        if node.op not in CALLABLE_NODE_OPS:
-            return False
-
-        # ops in supported_dict doesn't have overload name
-        # use overloadpacket's qualified_name for OpOverload
-        if isinstance(node.target, OpOverload):
-            target = _get_qualified_name(node.target.overloadpacket)
-            if target in self._support_dict:
-                return True
-
-        return super().is_node_supported(submodules, node)
-
-
-class NvFuserBackend:
-    def __init__(self):
-        self.supported_ops = NvFuserOperatorSupport()
-
-        # TODO: this is a naive implementation of cache without proper guard
-        self.partitioner_cache: Dict[GraphModule, GraphModule] = {}
-
-        # TODO: this is a naive implementation of cache without proper guard, this will only work for identical inputs
-        self.prim_decomp_cache: Dict[GraphModule, GraphModule] = {}
-
-    def lower_to_prims_and_execute(self, graph_module: GraphModule, *args, **kwargs):
-        # `graph_module` is an Aten-Fx graph
-        # "lowering to prims" and "trace execution" are grouped into this function, as they are both input dependent
-
-        if graph_module in self.prim_decomp_cache:
-            logger.debug("prim_decomp_cache hit!")
-            prim_module = self.prim_decomp_cache[graph_module]
-        else:
-            prim_graph = torch.fx.Graph()
-            DecompositionInterpreter(graph_module, prim_graph, decomposition_table=aten2prim_decomp).run(*args, **kwargs)
-            prim_module = torch.fx.GraphModule(graph_module, prim_graph)
-            self.prim_decomp_cache[graph_module] = prim_module
-
-            logger.debug("Lower to prims graph: ", prim_module.code)
-
-        # invokes trace executor for running the prim graph
-        return execute(prim_module, *args, executor="nvfuser")
-
-    def compile(self, graph_module: GraphModule) -> GraphModule:
-        # entry function for nvFuser backend
-        logger.debug("Compiling graph_module: ", graph_module.code)
-
-        # FX graph based partitioning based on nvfuser supported ops
-        if graph_module in self.partitioner_cache:
-            logger.debug("partitioner_cache hit!")
-            fused_graph_module = self.partitioner_cache[graph_module]
-        else:
-            partitioner = CapabilityBasedPartitioner(
-                graph_module, self.supported_ops, allows_single_node_partition=False)
-            fused_graph_module = partitioner.partition_and_fuse()
-
-            self.partitioner_cache[graph_module] = fused_graph_module
-
-        # Overriding fused_module's __call__() function with lower_to_prims_and_execute()
-        for node in fused_graph_module.graph.nodes:
-            # TODO: use a better way to identify fused submodule
-            if node.op == "call_module" and "fused_" in node.name:
-                fused_module = getattr(fused_graph_module, node.name)
-                fused_module._wrapped_call = self.lower_to_prims_and_execute
-
-        return fused_graph_module
-
-    def __call__(self, graph_module: GraphModule, _) -> GraphModule:
-        # wrap self.compile as __call__ function to fit the interface for AOTAutograd's fw_compiler
-        return self.compile(graph_module)

From 823b992ead6ab6e8d37bc2bc7995542efac8898b Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Tue, 1 Nov 2022 03:14:24 +0000
Subject: [PATCH 0380/1922] call contiguous on BMM inputs for NT on CUDA
 (#88108)

Fixes #87713

BMM for cpu supports  non-contiguous nested tensor inputs, while BMM for Cuda does not support currently non-contiguous inputs.

The derivative for BMM:
```
- name: bmm(Tensor self, Tensor mat2) -> Tensor
  self: grad.bmm(mat2.transpose(1, 2).conj())
  mat2: self.transpose(1, 2).conj().bmm(grad)
  result: self_t.bmm(mat2_p) + self_p.bmm(mat2_t)
```

When calling backward it was impossible for this function to succeed since the inputs were always discontiguous, regardless of the user input.  This adds contiguous calls to BMM_cuda implementation for nested tensors.

This was not caught by tests because grad_check is currently only done on CPU in test_nestedtensors. This PR updates the autograd test to also be run on GPU.

As a result I found one more issue with the backward for to_padded_tensor erroring instead of calling the generic version.

cc @cpuhrsch @jbschlosser @bhosmer @mikaylagawarecki
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88108
Approved by: https://github.com/cpuhrsch
---
 .../cuda/NestedTensorTransformerFunctions.cpp |   8 +-
 .../cuda/NestedTensorTransformerFunctions.cu  |   8 +-
 test/test_nestedtensor.py                     | 217 +++++++++---------
 3 files changed, 119 insertions(+), 114 deletions(-)

diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index 33c180a929f18..411ebdb19b5af 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -44,16 +44,16 @@ Tensor nested_from_padded_cuda(
     const Tensor& sizes,
     bool do_transform_0213) {
   if (padded.dim() > 1 && padded.dim() < 5) {
+    // Instead of erroring call the generic version
+    if(!(padded.dim() == 4 && do_transform_0213) && !(padded.dim() == 3 && !do_transform_0213)){
+      return at::native::nested_from_padded_generic(padded, sizes, do_transform_0213);
+    }
     if (padded.dtype() != kFloat && padded.dtype() != kHalf) {
       TORCH_WARN_ONCE(
           "nested_from_padded CUDA kernels only support fp32/fp16; falling "
           "back to slower generic kernel");
       return at::native::nested_from_padded_generic(padded, sizes, do_transform_0213);
     }
-    TORCH_CHECK(
-        (padded.dim() == 4 && do_transform_0213) ||
-            (padded.dim() == 3 && !do_transform_0213),
-        "padded tensor size error: ", padded.dim());
     Tensor target_offsets =
         NestedTensor_batch_offsets_from_size_tensor(sizes, 0);
     Tensor padded_sizes_tensor = at::tensor(padded.sizes());
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu
index fc84d07ba6797..3d738825fede6 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu
@@ -607,9 +607,13 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) {
     AT_ERROR(
         "Expected both to be nested, but got a non-nested self and nested other");
   }
+  // TODO currently we only support contiguous NestedTensors
+  auto self_contiguous = self.contiguous();
+  auto mat2_contiguous = mat2.contiguous();
+
   // dispatcher should have guaranteed that at least one is nested
-  auto self_ptr = get_nested_tensor_impl(self);
-  auto mat2_ptr = get_nested_tensor_impl(mat2);
+  auto self_ptr = get_nested_tensor_impl(self_contiguous);
+  auto mat2_ptr = get_nested_tensor_impl(mat2_contiguous);
   TORCH_CHECK(self_ptr->dim() == 3, "batch1 must be a 3D tensor");
   TORCH_CHECK(mat2_ptr->dim() == 3, "batch2 must be a 3D tensor");
   int64_t ntensors = self_ptr->size(0), ntensors2 = mat2_ptr->size(0);
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index a7db45db4fec1..e0f40ca85238c 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -1768,38 +1768,38 @@ class TestNestedTensorAutograd(TestCase):
     # Note [Gradcheck args check_batched_grad=False] the common_utils testing version of gradcheck
     # includes the default parameters used for testing ops with gradcheck. However nested tensor
     # does not support the stack op therefore we turn it off for these tests
-    def _create_leaf_nested_tensor_from_list(self, requires_grad=False):
-        return torch.nested.nested_tensor([torch.randn(1, 2),
-                                           torch.randn(7, 8)], requires_grad=requires_grad)
+    def _create_leaf_nested_tensor_from_list(self, tensor_device, requires_grad=False):
+        return torch.nested.nested_tensor([torch.randn(1, 2,),
+                                           torch.randn(7, 8)], requires_grad=requires_grad, device=tensor_device)
 
-    def _create_nested_tensor_from_list(self, requires_grad=False):
+    def _create_nested_tensor_from_list(self, tensor_device, requires_grad=False):
         return torch.nested.as_nested_tensor([torch.randn(1, 2, requires_grad=requires_grad),
-                                              torch.randn(7, 8, requires_grad=requires_grad)])
+                                              torch.randn(7, 8, requires_grad=requires_grad)], device=tensor_device)
 
-    def _create_nested_tensor_from_mask(self, requires_grad=False):
-        data = torch.randn(2, 3, 4, requires_grad=requires_grad)
+    def _create_nested_tensor_from_mask(self, tensor_device, requires_grad=False):
+        data = torch.randn(2, 3, 4, requires_grad=requires_grad, device=tensor_device)
         mask = torch.ones_like(data[:, :, 0]).bool()
         return torch._nested_tensor_from_mask(data, mask)
 
-    def test_as_nested_tensor_propagates_gradients(self):
-        a = torch.arange(3, dtype=torch.float)
-        b = torch.arange(5, dtype=torch.float)
+    def test_as_nested_tensor_propagates_gradients(self, device):
+        a = torch.arange(3, dtype=torch.float, device=device)
+        b = torch.arange(5, dtype=torch.float, device=device)
         nt = torch.nested.as_nested_tensor([a, b])
         # tensors with requires_grad=False are leaves
         self.assertTrue(nt.is_leaf)
         self.assertTrue(not nt.requires_grad)
 
-        a = torch.arange(3, dtype=torch.float, requires_grad=True)
-        b = torch.arange(5, dtype=torch.float, requires_grad=True)
+        a = torch.arange(3, dtype=torch.float, requires_grad=True, device=device)
+        b = torch.arange(5, dtype=torch.float, requires_grad=True, device=device)
         nt2 = torch.nested.as_nested_tensor([a, b])
-        fake_grad = torch.nested.nested_tensor([torch.ones_like(a), torch.zeros_like(b)])
+        fake_grad = torch.nested.nested_tensor([torch.ones_like(a), torch.zeros_like(b)], device=device)
         nt2.backward(fake_grad)
         self.assertEqual(a.grad, fake_grad[0])
         self.assertEqual(b.grad, fake_grad[1])
 
-    def test_nested_tensor_generates_leaf(self):
-        a = torch.arange(3, dtype=torch.float, requires_grad=True)
-        b = torch.arange(5, dtype=torch.float, requires_grad=True)
+    def test_nested_tensor_generates_leaf(self, device):
+        a = torch.arange(3, dtype=torch.float, requires_grad=True, device=device)
+        b = torch.arange(5, dtype=torch.float, requires_grad=True, device=device)
 
         nt = torch.nested.nested_tensor([a, b], requires_grad=False)
         self.assertTrue(nt.is_leaf)
@@ -1809,32 +1809,32 @@ def test_nested_tensor_generates_leaf(self):
         self.assertTrue(nt2.is_leaf)
         self.assertTrue(nt2.requires_grad)
 
-        fake_grad = torch.nested.nested_tensor([torch.ones_like(a), torch.zeros_like(b)])
+        fake_grad = torch.nested.nested_tensor([torch.ones_like(a), torch.zeros_like(b)], device=device)
         nt2.backward(fake_grad)
         self.assertEqual(nt2.grad, fake_grad)
         self.assertEqual(a.grad, None)
         self.assertEqual(b.grad, None)
 
-    def test_set_requires_grad_from_list(self):
-        nt = self._create_nested_tensor_from_list()
+    def test_set_requires_grad_from_list(self, device):
+        nt = self._create_nested_tensor_from_list(device)
         nt.requires_grad_()
         assert nt.requires_grad
 
-    def test_set_requires_grad_from_mask(self):
-        nt = self._create_nested_tensor_from_mask()
+    def test_set_requires_grad_from_mask(self, device):
+        nt = self._create_nested_tensor_from_mask(device)
         nt.requires_grad_()
         assert nt.requires_grad
 
-    def test_backward_for_add_op(self):
-        nt_1 = self._create_nested_tensor_from_mask()
-        nt_2 = self._create_nested_tensor_from_mask()
+    def test_backward_for_add_op(self, device):
+        nt_1 = self._create_nested_tensor_from_mask(device)
+        nt_2 = self._create_nested_tensor_from_mask(device)
 
         nt_1.requires_grad_()
         c = nt_1 + nt_2
 
         assert nt_1.requires_grad
         assert c.requires_grad
-        grad_output = self._create_nested_tensor_from_mask()
+        grad_output = self._create_nested_tensor_from_mask(device)
         c.backward(grad_output)
 
         #  Grad check doesn't work with nested yet.
@@ -1842,27 +1842,27 @@ def test_backward_for_add_op(self):
         self.assertEqual(nt_1.grad, grad_output)
 
     # Test Factory Functions
-    def test_nested_tensor_to_padded_tensor(self):
+    def test_nested_tensor_to_padded_tensor(self, device):
         for padding_val in [0, 1]:
-            nt = self._create_leaf_nested_tensor_from_list(True)
+            nt = self._create_leaf_nested_tensor_from_list(tensor_device=device, requires_grad=True)
 
             out = torch.nested.to_padded_tensor(nt, padding_val)
-            grad_output = torch.ones(out.shape)
+            grad_output = torch.ones(out.shape, device=device)
             out.backward(grad_output)
 
-            self.assertEqual(nt.grad, torch.nested.nested_tensor([torch.ones(1, 2), torch.ones(7, 8)]))
+            self.assertEqual(nt.grad, torch.nested.nested_tensor([torch.ones(1, 2), torch.ones(7, 8)], device=device))
 
-    def test_nested_tensor_from_mask_and_to_padded(self):
+    def test_nested_tensor_from_mask_and_to_padded(self, device):
         N, L, D = 2, 4, 4
-        mask = torch.ones(N, L)
+        mask = torch.ones(N, L, device=device)
         for i in range(1, N):
-            end = torch.randint(1, L - 1, (1,))
+            end = torch.randint(1, L - 1, (1,), device=device)
             mask[i, end:] = 0
 
         mask[0, :] = 1
         mask = mask.bool()
 
-        data = torch.randn(N, L, D, requires_grad=True, dtype=torch.float64)
+        data = torch.randn(N, L, D, requires_grad=True, dtype=torch.float64, device=device)
 
         def grad_test_func(inpt):
             nt = torch._nested_tensor_from_mask(inpt, mask)
@@ -1870,9 +1870,9 @@ def grad_test_func(inpt):
             return torch.nested.to_padded_tensor(nt, 0)
         assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
 
-    def test_nested_tensor_from_padded(self):
+    def test_nested_tensor_from_padded(self, device):
         nested_size = torch.tensor([[1, 2], [2, 2]])
-        padded_tensor = torch.randn(2, 2, 2, dtype=torch.float64)
+        padded_tensor = torch.randn(2, 2, 2, dtype=torch.float64, device=device)
         padded_tensor[0, 1, :] = 0
         padded_tensor.requires_grad_()
 
@@ -1884,9 +1884,9 @@ def grad_test_func(tensor, nested_size):
         data = (padded_tensor, nested_size)
         assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
 
-    def test_nested_tensor_from_padded_fused(self):
+    def test_nested_tensor_from_padded_fused(self, device):
         nested_size = torch.tensor([[1, 8], [2, 8]])
-        padded_tensor = torch.randn(2, 2, 2, 4, dtype=torch.float64)
+        padded_tensor = torch.randn(2, 2, 2, 4, dtype=torch.float64, device=device)
         padded_tensor[0, 1, :] = 0
         padded_tensor.requires_grad_()
 
@@ -1897,11 +1897,11 @@ def grad_test_func(tensor, nested_size):
         data = (padded_tensor, nested_size)
         assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
 
-    def test_nested_tensor_from_list(self):
+    def test_nested_tensor_from_list(self, device):
 
-        a = torch.randn(1, 2, requires_grad=True, dtype=torch.float64)
-        b = torch.randn(2, 2, requires_grad=True, dtype=torch.float64)
-        c = torch.randn(10, 2, requires_grad=True, dtype=torch.float64)
+        a = torch.randn(1, 2, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(2, 2, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(10, 2, requires_grad=True, dtype=torch.float64, device=device)
 
         def grad_test_func(a, b, c):
             c = torch.nested.as_nested_tensor([a, b, c])
@@ -1917,11 +1917,11 @@ def test_dropout_backward(self):
         y.backward(nt.clone().detach())
         self.assertEqual(nt.grad, y)
 
-    def test_nested_tensor_bmm_gradcheck(self):
-        a = torch.randn(2, 6, requires_grad=True, dtype=torch.float64)
-        b = torch.randn(3, 6, requires_grad=True, dtype=torch.float64)
-        c = torch.randn(6, 4, requires_grad=True, dtype=torch.float64)
-        d = torch.randn(6, 5, requires_grad=True, dtype=torch.float64)
+    def test_nested_tensor_bmm_gradcheck(self, device):
+        a = torch.randn(2, 6, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(3, 6, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(6, 4, requires_grad=True, dtype=torch.float64, device=device)
+        d = torch.randn(6, 5, requires_grad=True, dtype=torch.float64, device=device)
 
         def grad_test_func(a, b, c, d):
             nt0 = torch.nested.as_nested_tensor([a, b])
@@ -1932,9 +1932,9 @@ def grad_test_func(a, b, c, d):
         data = (a, b, c, d)
         assert torch.autograd.gradcheck(grad_test_func, inputs=data)
 
-    def test_nested_tensor_bmm_backward(self):
-        nt0 = torch.nested.nested_tensor([torch.randn((2, 6)), torch.randn((3, 6))], requires_grad=True)
-        nt1 = torch.nested.nested_tensor([torch.randn((6, 4)), torch.randn((6, 5))], requires_grad=True)
+    def test_nested_tensor_bmm_backward(self, device):
+        nt0 = torch.nested.nested_tensor([torch.randn((2, 6)), torch.randn((3, 6))], requires_grad=True, device=device)
+        nt1 = torch.nested.nested_tensor([torch.randn((6, 4)), torch.randn((6, 5))], requires_grad=True, device=device)
         with torch.no_grad():
             pt0 = torch.nested.to_padded_tensor(nt0, 0.0).requires_grad_(True)
             pt1 = torch.nested.to_padded_tensor(nt1, 0.0).requires_grad_(True)
@@ -1947,11 +1947,11 @@ def test_nested_tensor_bmm_backward(self):
         self.assertEqual(torch.nested.to_padded_tensor(nt0.grad, 0.0), pt0.grad)
         self.assertEqual(torch.nested.to_padded_tensor(nt1.grad, 0.0), pt1.grad)
 
-    def test_nested_tensor_matmul_gradcheck(self):
-        a = torch.randn(2, 6, requires_grad=True, dtype=torch.float64)
-        b = torch.randn(3, 6, requires_grad=True, dtype=torch.float64)
-        c = torch.randn(6, 4, requires_grad=True, dtype=torch.float64)
-        d = torch.randn(6, 5, requires_grad=True, dtype=torch.float64)
+    def test_nested_tensor_matmul_gradcheck(self, device):
+        a = torch.randn(2, 6, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(3, 6, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(6, 4, requires_grad=True, dtype=torch.float64, device=device)
+        d = torch.randn(6, 5, requires_grad=True, dtype=torch.float64, device=device)
 
         def grad_test_func(a, b, c, d):
             nt0 = torch.nested.as_nested_tensor([a, b])
@@ -1962,9 +1962,9 @@ def grad_test_func(a, b, c, d):
         data = (a, b, c, d)
         assert torch.autograd.gradcheck(grad_test_func, inputs=data)
 
-    def test_nested_tensor_matmul_backward(self):
-        nt0 = torch.nested.nested_tensor([torch.randn((7, 2, 6)), torch.randn((7, 3, 6))], requires_grad=True)
-        nt1 = torch.nested.nested_tensor([torch.randn((7, 6, 4)), torch.randn((7, 6, 5))], requires_grad=True)
+    def test_nested_tensor_matmul_backward(self, device):
+        nt0 = torch.nested.nested_tensor([torch.randn((7, 2, 6)), torch.randn((7, 3, 6))], requires_grad=True, device=device)
+        nt1 = torch.nested.nested_tensor([torch.randn((7, 6, 4)), torch.randn((7, 6, 5))], requires_grad=True, device=device)
         with torch.no_grad():
             pt0 = torch.nested.to_padded_tensor(nt0, 0.0).requires_grad_(True)
             pt1 = torch.nested.to_padded_tensor(nt1, 0.0).requires_grad_(True)
@@ -1977,9 +1977,9 @@ def test_nested_tensor_matmul_backward(self):
         self.assertEqual(torch.nested.to_padded_tensor(nt0.grad, 0.0), pt0.grad)
         self.assertEqual(torch.nested.to_padded_tensor(nt1.grad, 0.0), pt1.grad)
 
-    def test_nested_tensor_transpose_gradcheck(self):
-        a = torch.randn(2, 5, requires_grad=True)
-        b = torch.randn(3, 4, requires_grad=True)
+    def test_nested_tensor_transpose_gradcheck(self, device):
+        a = torch.randn(2, 5, requires_grad=True, device=device)
+        b = torch.randn(3, 4, requires_grad=True, device=device)
 
         def grad_test_func(a, b):
             nt = torch.nested.as_nested_tensor([a, b])
@@ -1989,8 +1989,8 @@ def grad_test_func(a, b):
         data = (a, b)
         assert torch.autograd.gradcheck(grad_test_func, inputs=data, eps=1e-3)
 
-    def test_nested_tensor_transpose_backward(self):
-        nt = torch.nested.nested_tensor([torch.randn((2, 5)), torch.randn((3, 4))], requires_grad=True)
+    def test_nested_tensor_transpose_backward(self, device):
+        nt = torch.nested.nested_tensor([torch.randn((2, 5)), torch.randn((3, 4))], requires_grad=True, device=device)
         with torch.no_grad():
             pt = torch.nested.to_padded_tensor(nt, 0.0).requires_grad_(True)
 
@@ -2001,9 +2001,9 @@ def test_nested_tensor_transpose_backward(self):
 
         self.assertEqual(torch.nested.to_padded_tensor(nt.grad, 0.0), pt.grad)
 
-    def test_nested_tensor_reshape_gradcheck(self):
-        a = torch.randn(2, 6, requires_grad=True)
-        b = torch.randn(3, 6, requires_grad=True)
+    def test_nested_tensor_reshape_gradcheck(self, device):
+        a = torch.randn(2, 6, requires_grad=True, device=device)
+        b = torch.randn(3, 6, requires_grad=True, device=device)
 
         def grad_test_func(a, b):
             nt = torch.nested.as_nested_tensor([a, b])
@@ -2025,8 +2025,8 @@ def test_nested_tensor_reshape_backward(self):
 
         self.assertEqual(torch.nested.to_padded_tensor(nt.grad, 0.0), pt.grad)
 
-    def test_nested_tensor_squeeze_backward(self):
-        nt = torch.nested.nested_tensor([torch.randn((2, 6, 1)), torch.randn((3, 6, 1))], requires_grad=True)
+    def test_nested_tensor_squeeze_backward(self, device):
+        nt = torch.nested.nested_tensor([torch.randn((2, 6, 1)), torch.randn((3, 6, 1))], requires_grad=True, device=device)
         with torch.no_grad():
             pt = torch.nested.to_padded_tensor(nt, 0.0).requires_grad_(True)
 
@@ -2037,9 +2037,9 @@ def test_nested_tensor_squeeze_backward(self):
 
         self.assertEqual(torch.nested.to_padded_tensor(nt.grad, 0.0), pt.grad)
 
-    def test_nested_tensor_squeeze_gradcheck(self):
-        a = torch.randn((2, 6, 1), dtype=torch.float64, requires_grad=True)
-        b = torch.randn((3, 6, 1), dtype=torch.float64, requires_grad=True)
+    def test_nested_tensor_squeeze_gradcheck(self, device):
+        a = torch.randn((2, 6, 1), dtype=torch.float64, requires_grad=True, device=device)
+        b = torch.randn((3, 6, 1), dtype=torch.float64, requires_grad=True, device=device)
 
         def grad_test_func(a, b):
             nt = torch.nested.as_nested_tensor([a, b])
@@ -2048,8 +2048,8 @@ def grad_test_func(a, b):
 
         assert torch.autograd.gradcheck(grad_test_func, inputs=(a, b), eps=1e-3)
 
-    def test_nested_tensor_unsqueeze_backward(self):
-        nt = torch.nested.nested_tensor([torch.randn((2, 6)), torch.randn((3, 6))], requires_grad=True)
+    def test_nested_tensor_unsqueeze_backward(self, device):
+        nt = torch.nested.nested_tensor([torch.randn((2, 6)), torch.randn((3, 6))], requires_grad=True, device=device)
         with torch.no_grad():
             pt = torch.nested.to_padded_tensor(nt, 0.0).requires_grad_(True)
 
@@ -2060,9 +2060,9 @@ def test_nested_tensor_unsqueeze_backward(self):
 
         self.assertEqual(torch.nested.to_padded_tensor(nt.grad, 0.0), pt.grad)
 
-    def test_nested_tensor_unsqueeze_gradcheck(self):
-        a = torch.randn((2, 6), dtype=torch.float64, requires_grad=True)
-        b = torch.randn((3, 6), dtype=torch.float64, requires_grad=True)
+    def test_nested_tensor_unsqueeze_gradcheck(self, device):
+        a = torch.randn((2, 6), dtype=torch.float64, requires_grad=True, device=device)
+        b = torch.randn((3, 6), dtype=torch.float64, requires_grad=True, device=device)
 
         def grad_test_func(a, b):
             nt = torch.nested.as_nested_tensor([a, b])
@@ -2071,14 +2071,14 @@ def grad_test_func(a, b):
 
         assert torch.autograd.gradcheck(grad_test_func, inputs=(a, b), eps=1e-3)
 
-    def test_nested_tensor_linear(self):
+    def test_nested_tensor_linear(self, device):
 
-        a = torch.randn(1, 2, requires_grad=True, dtype=torch.float64)
-        b = torch.randn(2, 2, requires_grad=True, dtype=torch.float64)
-        c = torch.randn(3, 2, requires_grad=True, dtype=torch.float64)
+        a = torch.randn(1, 2, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(2, 2, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(3, 2, requires_grad=True, dtype=torch.float64, device=device)
 
-        weight = torch.randn(2, 2, requires_grad=True, dtype=torch.float64)
-        bias = torch.randn(2, requires_grad=True, dtype=torch.float64)
+        weight = torch.randn(2, 2, requires_grad=True, dtype=torch.float64, device=device)
+        bias = torch.randn(2, requires_grad=True, dtype=torch.float64, device=device)
 
         def grad_test_func(a, b, c, weight, bias=None):
             nt = torch.nested.as_nested_tensor([a, b, c])
@@ -2092,10 +2092,10 @@ def grad_test_func(a, b, c, weight, bias=None):
         data = (a, b, c, weight)
         assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
 
-    def test_nested_tensor_softmax(self):
-        a = torch.randn(1, 2, requires_grad=True, dtype=torch.float64)
-        b = torch.randn(2, 2, requires_grad=True, dtype=torch.float64)
-        c = torch.randn(3, 2, requires_grad=True, dtype=torch.float64)
+    def test_nested_tensor_softmax(self, device):
+        a = torch.randn(1, 2, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(2, 2, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(3, 2, requires_grad=True, dtype=torch.float64, device=device)
 
         def grad_test_func(a, b, c, dim):
             nt = torch.nested.as_nested_tensor([a, b, c])
@@ -2107,14 +2107,14 @@ def grad_test_func(a, b, c, dim):
         data = (a, b, c, -1)
         assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
 
-    def test_nested_tensor_linear_backward(self):
-        a = torch.randn(1, 2, requires_grad=False)
-        b = torch.randn(2, 2, requires_grad=False)
-        c = torch.randn(3, 2, requires_grad=False)
+    def test_nested_tensor_linear_backward(self, device):
+        a = torch.randn(1, 2, requires_grad=False, device=device)
+        b = torch.randn(2, 2, requires_grad=False, device=device)
+        c = torch.randn(3, 2, requires_grad=False, device=device)
 
-        weight = torch.randn(2, 2, requires_grad=True)
-        bias = torch.randn(2, requires_grad=True)
-        nt = torch.nested.as_nested_tensor([a, b, c])
+        weight = torch.randn(2, 2, requires_grad=True, device=device)
+        bias = torch.randn(2, requires_grad=True, device=device)
+        nt = torch.nested.as_nested_tensor([a, b, c], device=device)
 
         out = torch.functional.F.linear(nt, weight, bias)
 
@@ -2127,10 +2127,10 @@ def test_nested_tensor_linear_backward(self):
         assert b.grad is None
         assert c.grad is None
 
-    def test_values_grad_with_broadcast(self):
-        a = torch.randn(1, 2, 4, requires_grad=True, dtype=torch.float64)
-        b = torch.randn(2, 2, 4, requires_grad=True, dtype=torch.float64)
-        c = torch.randn(3, 2, 4, requires_grad=True, dtype=torch.float64)
+    def test_values_grad_with_broadcast(self, device):
+        a = torch.randn(1, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(2, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(3, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
 
         def grad_test_func(a, b, c):
             nt = torch.nested.as_nested_tensor([a, b, c])
@@ -2140,10 +2140,10 @@ def grad_test_func(a, b, c):
         data = (a, b, c)
         assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
 
-    def test_to_buffer_series_ops_grad_with_broadcast(self):
-        a = torch.randn(1, 1, 2, requires_grad=True, dtype=torch.float64)
-        b = torch.randn(1, 1, 2, requires_grad=True, dtype=torch.float64)
-        c = torch.randn(1, 1, 2, requires_grad=True, dtype=torch.float64)
+    def test_to_buffer_series_ops_grad_with_broadcast(self, device):
+        a = torch.randn(1, 1, 2, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(1, 1, 2, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(1, 1, 2, requires_grad=True, dtype=torch.float64, device=device)
 
         def grad_test_func(a, b, c):
             nt = torch.nested.as_nested_tensor([a, b, c])
@@ -2154,10 +2154,10 @@ def grad_test_func(a, b, c):
         data = (a, b, c)
         assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
 
-    def test_unbind_flow_through(self):
-        a = torch.randn(1, 2, 4, requires_grad=True, dtype=torch.float64)
-        b = torch.randn(2, 2, 4, requires_grad=True, dtype=torch.float64)
-        c = torch.randn(3, 2, 4, requires_grad=True, dtype=torch.float64)
+    def test_unbind_flow_through(self, device):
+        a = torch.randn(1, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+        b = torch.randn(2, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
+        c = torch.randn(3, 2, 4, requires_grad=True, dtype=torch.float64, device=device)
 
         def grad_test_func(a, b, c):
             nt = torch.nested.as_nested_tensor([a, b, c])
@@ -2170,19 +2170,20 @@ def grad_test_func(a, b, c):
         data = (a, b, c)
         assert gradcheck(grad_test_func, inputs=data, check_batched_grad=False)
 
-    def test_indexing_backward(self):
+    def test_indexing_backward(self, device):
         x0 = torch.randn((2, 5))
         x1 = torch.randn((3, 4))
-        nt = torch.nested.nested_tensor([x0, x1], requires_grad=True)
+        nt = torch.nested.nested_tensor([x0, x1], device=device, requires_grad=True)
         self.assertEqual(nt[0], x0)
         self.assertEqual(nt[-1], x1)
-        grad_x0 = torch.randn((2, 5))
+        grad_x0 = torch.randn((2, 5), device=device)
         nt[0].backward(grad_x0)
-        expected_grad = torch.nested.nested_tensor([grad_x0, torch.zeros((3, 4))])
+        expected_grad = torch.nested.nested_tensor([grad_x0, torch.zeros((3, 4), device=device)])
         self.assertEqual(nt.grad, expected_grad)
 
 
 instantiate_device_type_tests(TestNestedTensorDeviceType, globals())
+instantiate_device_type_tests(TestNestedTensorAutograd, globals())
 
 if __name__ == '__main__':
     run_tests()

From 82e620c026ed9155e7bf7cc8d1ba7bd3106f1bec Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Tue, 1 Nov 2022 03:59:51 +0000
Subject: [PATCH 0381/1922] [BE] Use default constructor in `LoggerVoidify`
 (#88054)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88054
Approved by: https://github.com/kit1980
---
 c10/util/logging_is_not_google_glog.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c10/util/logging_is_not_google_glog.h b/c10/util/logging_is_not_google_glog.h
index d27cc18e45300..d92f163453e93 100644
--- a/c10/util/logging_is_not_google_glog.h
+++ b/c10/util/logging_is_not_google_glog.h
@@ -49,7 +49,7 @@ class C10_API MessageLogger {
 // is not used" and "statement has no effect".
 class C10_API LoggerVoidify {
  public:
-  LoggerVoidify() {}
+  LoggerVoidify() = default;
   // This has to be an operator with a precedence lower than << but
   // higher than ?:
   void operator&(const std::ostream& s) {}

From d2d715501815642aabc2a8511ac30fb4bfcc5248 Mon Sep 17 00:00:00 2001
From: Kevin Stephano <kevin.stephano@gmail.com>
Date: Tue, 1 Nov 2022 05:05:15 +0000
Subject: [PATCH 0382/1922] Cleaned up the nvFuser Python Frontend Batch Norm
 printing (#88057)

* Removed `define_null_tensor` usage in favor of using optional arguments for binding.
* Re-ordered the non-State arguments for easier printing.
* Added a printing function to include booleans `training` and `channels_last`
* Fixed `define_tensor` to print `is_cpu`

cc @jjsjann123
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88057
Approved by: https://github.com/IvanYashchuk, https://github.com/jjsjann123, https://github.com/mruberry
---
 torch/_prims/nvfuser_prims.py                 |  5 +-
 .../cuda/python_frontend/fusion_record.h      | 71 +++++++------------
 .../cuda/python_frontend/python_bindings.cpp  | 49 +++++++------
 3 files changed, 56 insertions(+), 69 deletions(-)

diff --git a/torch/_prims/nvfuser_prims.py b/torch/_prims/nvfuser_prims.py
index f37a21459e0cd..956f480de8905 100644
--- a/torch/_prims/nvfuser_prims.py
+++ b/torch/_prims/nvfuser_prims.py
@@ -214,6 +214,8 @@ def _{fname}_nvfuser(fd, a, b, c):
 def _native_batch_norm_nvfuser(
     fd, input, weight, bias, running_mean, running_var, training, momentum, eps
 ):
+
+    """
     if weight is None:
         weight = fd.define_null_tensor()
     if bias is None:
@@ -222,15 +224,16 @@ def _native_batch_norm_nvfuser(
         running_mean = fd.define_null_tensor()
     if running_var is None:
         running_var = fd.define_null_tensor()
+    """
     return fd.ops.batch_norm(
         input,
         weight,
         bias,
         running_mean,
         running_var,
-        training,
         momentum,
         eps,
+        training,
     )
 
 
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
index f124cf36e0092..622ec6919c89f 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
@@ -23,7 +23,6 @@ enum class RecordType {
   Constant,
   End,
   Tensor,
-  NullTensor,
   Output,
   ReductionOp,
   Scalar,
@@ -144,13 +143,14 @@ struct RecordFunctor {
         os << ", ";
       }
       if (arg.stype == StateType::Scalar) {
-        os << "S";
+        os << "S" << arg.index;
       } else if (arg.stype == StateType::Tensor) {
-        os << "T";
+        os << "T" << arg.index;
+      } else if (arg.stype == StateType::None) {
+        os << "None";
       } else {
         TORCH_INTERNAL_ASSERT(false, "Unsupported StateType");
       }
-      os << arg.index;
     }
     if (close_function) {
       os << ")";
@@ -974,6 +974,7 @@ struct TensorRecord : RecordFunctor {
       }
     }
     os << "], dtype=" << dtypeToPyString(dtype_);
+    os << ", is_cpu=" << (is_cpu_ ? "True" : "False");
     if (close_function) {
       os << ")";
     }
@@ -993,41 +994,6 @@ struct TensorRecord : RecordFunctor {
   bool is_cpu_;
 };
 
-struct NullTensorRecord : RecordFunctor {
-  NullTensorRecord(std::vector<State> _outputs)
-      : RecordFunctor(
-            {},
-            std::move(_outputs),
-            "null_tensor",
-            RecordType::NullTensor) {}
-  virtual ~NullTensorRecord() = default;
-  virtual RecordFunctor* clone() final {
-    return new NullTensorRecord(*this);
-  }
-
-  //! Nothing extra necessary in hash
-  //! Child specific hash function in lower 32 bits.
-  //! | 31 ---------------------------------------  0 |
-  //! | None                                          |
-  virtual size_t hash() const final {
-    auto result = RecordFunctor::hash();
-    return result;
-  }
-
-  virtual bool operator==(const RecordFunctor& other) const final {
-    auto result = false;
-    if (dynamic_cast<const NullTensorRecord*>(&other)) {
-      result = RecordFunctor::operator==(other);
-    }
-    return result;
-  }
-
-  virtual void operator()(FusionDefinition& fd) final {
-    Nvf::TensorView* tv = nullptr;
-    fd.setFusionState(outputs_.at(0).index, tv);
-  }
-};
-
 //! Specialized Record Functor for recording FusionDefinition outputs.
 
 template <class OutputType>
@@ -1482,12 +1448,18 @@ struct BatchNormOpRecord : RecordFunctor {
 
   void operator()(FusionDefinition& fd) final {
     auto x = fd.getFusionState(args_.at(0).index)->as<Nvf::TensorView>();
-    auto weight = fd.getFusionState(args_.at(1).index)->as<Nvf::TensorView>();
-    auto bias = fd.getFusionState(args_.at(2).index)->as<Nvf::TensorView>();
-    auto running_mean =
-        fd.getFusionState(args_.at(3).index)->as<Nvf::TensorView>();
-    auto running_var =
-        fd.getFusionState(args_.at(4).index)->as<Nvf::TensorView>();
+    auto weight = (args_.at(1).stype == StateType::Tensor)
+        ? fd.getFusionState(args_.at(1).index)->as<Nvf::TensorView>()
+        : nullptr;
+    auto bias = (args_.at(2).stype == StateType::Tensor)
+        ? fd.getFusionState(args_.at(2).index)->as<Nvf::TensorView>()
+        : nullptr;
+    auto running_mean = (args_.at(3).stype == StateType::Tensor)
+        ? fd.getFusionState(args_.at(3).index)->as<Nvf::TensorView>()
+        : nullptr;
+    auto running_var = (args_.at(4).stype == StateType::Tensor)
+        ? fd.getFusionState(args_.at(4).index)->as<Nvf::TensorView>()
+        : nullptr;
     auto momentum = fd.getFusionState(args_.at(5).index)->as<Nvf::Val>();
     auto eps = fd.getFusionState(args_.at(6).index)->as<Nvf::Val>();
     auto output = Nvf::batch_norm(
@@ -1505,6 +1477,15 @@ struct BatchNormOpRecord : RecordFunctor {
     fd.setFusionState(outputs_.at(2).index, output.invstd);
   }
 
+  virtual void print(std::ostream& os, bool close_function = true) const final {
+    RecordFunctor::print(os, false);
+    os << ", training=" << (training_ ? "True" : "False");
+    os << ", channels_last=" << (channels_last_ ? "True" : "False");
+    if (close_function) {
+      os << ")";
+    }
+  }
+
  private:
   bool training_;
   bool channels_last_;
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
index b8c799d00b90a..f8e3b5e5e9218 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
@@ -2,6 +2,7 @@
 
 #ifdef USE_CUDA
 #include <c10/util/ArrayRef.h>
+#include <c10/util/Optional.h>
 #include <c10/util/irange.h>
 #include <torch/csrc/jit/codegen/cuda/arith.h>
 #include <torch/csrc/jit/codegen/cuda/instrumentation.h>
@@ -126,16 +127,6 @@ void initNvFuserPythonBindings(PyObject* module) {
             self.defineRecord(new nvfuser::OutputRecord<Nvf::TensorView>(
                 {self.recordingState(output())}));
           })
-      .def(
-          "define_null_tensor",
-          [](nvfuser::FusionDefinition& self) -> nvfuser::Tensor {
-            FUSER_PERF_SCOPE("FusionDefinition.define_null_tensor");
-            nvfuser::Tensor out = self.defineTensor();
-            self.defineRecord(
-                new nvfuser::NullTensorRecord({self.recordingState(out())}));
-            return out;
-          },
-          py::return_value_policy::reference)
       .def(
           "define_tensor",
           [](nvfuser::FusionDefinition& self,
@@ -1292,26 +1283,38 @@ void initNvFuserPythonBindings(PyObject* module) {
   nvf_ops.def(
       "batch_norm",
       [](nvfuser::FusionDefinition::Operators& self,
-         nvfuser::Tensor x,
-         nvfuser::Tensor weight,
-         nvfuser::Tensor bias,
-         nvfuser::Tensor running_mean,
-         nvfuser::Tensor running_var,
-         bool training,
+         nvfuser::Tensor arg,
+         c10::optional<nvfuser::Tensor> weight,
+         c10::optional<nvfuser::Tensor> bias,
+         c10::optional<nvfuser::Tensor> running_mean,
+         c10::optional<nvfuser::Tensor> running_var,
          nvfuser::Scalar momentum,
          nvfuser::Scalar eps,
+         bool training,
          bool channels_last) -> decltype(auto) {
         FUSER_PERF_SCOPE("Operators.batch_norm");
         nvfuser::FusionDefinition* fd = self.fusion_definition;
         nvfuser::Tensor output = fd->defineTensor();
         nvfuser::Tensor mean = fd->defineTensor();
         nvfuser::Tensor invstd = fd->defineTensor();
+        auto weight_state = weight.has_value()
+            ? fd->recordingState(weight.value()())
+            : nvfuser::State(0, nvfuser::StateType::None);
+        auto bias_state = bias.has_value()
+            ? fd->recordingState(bias.value()())
+            : nvfuser::State(0, nvfuser::StateType::None);
+        auto running_mean_state = running_mean.has_value()
+            ? fd->recordingState(running_mean.value()())
+            : nvfuser::State(0, nvfuser::StateType::None);
+        auto running_var_state = running_var.has_value()
+            ? fd->recordingState(running_var.value()())
+            : nvfuser::State(0, nvfuser::StateType::None);
         fd->defineRecord(new nvfuser::BatchNormOpRecord(
-            {fd->recordingState(x()),
-             fd->recordingState(weight()),
-             fd->recordingState(bias()),
-             fd->recordingState(running_mean()),
-             fd->recordingState(running_var()),
+            {fd->recordingState(arg()),
+             weight_state,
+             bias_state,
+             running_mean_state,
+             running_var_state,
              fd->recordingState(momentum()),
              fd->recordingState(eps())},
             {fd->recordingState(output()),
@@ -1321,14 +1324,14 @@ void initNvFuserPythonBindings(PyObject* module) {
             channels_last));
         return std::make_tuple(output, mean, invstd);
       },
-      py::arg("x"),
+      py::arg("arg"),
       py::arg("weight").none(true),
       py::arg("bias").none(true),
       py::arg("running_mean").none(true),
       py::arg("running_var").none(true),
-      py::arg("training"),
       py::arg("momentum"),
       py::arg("eps"),
+      py::arg("training"),
       py::arg("channels_last") = false,
       py::return_value_policy::reference);
   nvf_ops.def(

From 42f1749b62662aa8a033e2d88c51afb28891ecad Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 1 Nov 2022 05:58:42 +0000
Subject: [PATCH 0383/1922] Fix monitoring script for macos (#88159)

The monitoring script is currently failing with AccessDenied when trying to access uss memory on mac because [psutil.memory_full_info](https://psutil.readthedocs.io/en/latest/index.html?highlight=memory_full_info) requires higher user privileges

Example failures:
* https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/3363066309/1/artifact/usage-log-test-default-2-2-macos-12_9208104847.zip
* https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/3363066309/1/artifact/usage-log-test-default-2-2-macos-m1-12_9207913759.zip

I could also make this script run with sudo, effectively granting this permission. But I'm not entirely sure that we need uss memory for mac, so gracefully handling the error looks nicer
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88159
Approved by: https://github.com/clee2000
---
 tools/stats/monitor.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/tools/stats/monitor.py b/tools/stats/monitor.py
index 972d0dbea038b..b45979451507a 100644
--- a/tools/stats/monitor.py
+++ b/tools/stats/monitor.py
@@ -30,11 +30,22 @@ def get_per_process_cpu_info() -> List[Dict[str, Any]]:
             "cmd": " ".join(p.cmdline()),
             "cpu_percent": p.cpu_percent(),
             "rss_memory": p.memory_info().rss,
-            "uss_memory": p.memory_full_info().uss,
         }
-        if "pss" in p.memory_full_info():
-            # only availiable in linux
-            info["pss_memory"] = p.memory_full_info().pss
+
+        # https://psutil.readthedocs.io/en/latest/index.html?highlight=memory_full_info
+        # requires higher user privileges and could throw AccessDenied error, i.e. mac
+        try:
+            memory_full_info = p.memory_full_info()
+
+            info["uss_memory"] = memory_full_info.uss
+            if "pss" in memory_full_info:
+                # only availiable in linux
+                info["pss_memory"] = memory_full_info.pss
+
+        except psutil.AccessDenied as e:
+            # It's ok to skip this
+            pass
+
         per_process_info.append(info)
     return per_process_info
 

From d474dbe3c4bc5dae902e7db9130a477e7bafcdc0 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@meta.com>
Date: Tue, 1 Nov 2022 09:58:26 +0000
Subject: [PATCH 0384/1922] [Static Runtime] Fix ReplaceWithMaybeCopy test in
 OSS (#88099)

Summary: `ReplaceWithMaybeCopy` is guarded by `FBCODE_CAFFE` in `OptimizeGraph`. Run the pass manually to ensure it does the replacement.

Test Plan: Existing tests

Differential Revision: D40858743

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88099
Approved by: https://github.com/huydhn
---
 benchmarks/static_runtime/test_static_runtime.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 883db0c141b4c..2cb50d48ff636 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -3194,9 +3194,14 @@ TEST(StaticRuntime, ReplaceWithMaybeCopy) {
   smodule.runtime().check_for_memory_leak();
 
   EXPECT_TRUE(expected.equal(actual));
-  EXPECT_FALSE(hasProcessedNodeWithName(smodule, "aten::to"));
+
+  // Make a fresh graph to ensure the pass works in isolation
+  auto new_graph = std::make_shared<torch::jit::Graph>();
+  torch::jit::parseIR(to, new_graph.get());
+  ReplaceWithMaybeCopy(new_graph);
+  EXPECT_FALSE(hasNodeWithKind(new_graph, "aten::to"));
   EXPECT_TRUE(
-      hasProcessedNodeWithName(smodule, "static_runtime::to_maybe_copy_out"));
+      hasNodeWithKind(new_graph, "static_runtime::to_maybe_copy_out"));
 }
 
 TEST(StaticRuntime, Int) {

From 7af7c95250b27aa454b01e41263f3df718e58f5b Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 31 Oct 2022 20:54:52 +0000
Subject: [PATCH 0385/1922] [FSDP()][9/N] Refactor ctor (continued) (#87923)

This PR makes a second pass over the constructor. The logic has been grouped into `_init_<...>` functions based on intent (e.g. `_init_prefetching_state()` or `_init_runtime_state()`). This makes the initialization code for composable FSDP much cleaner than having to re-write the same sequences of lower-level helper calls.

This PR also moves `_ExecOrderData` into its own file `_exec_order_utils.py`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87923
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_common_utils.py       |  14 +-
 torch/distributed/fsdp/_exec_order_utils.py   | 380 +++++++++++
 torch/distributed/fsdp/_init_utils.py         | 247 ++++++-
 torch/distributed/fsdp/_wrap_utils.py         |  55 ++
 torch/distributed/fsdp/flat_param.py          |   6 +
 .../fsdp/fully_sharded_data_parallel.py       | 612 ++----------------
 6 files changed, 749 insertions(+), 565 deletions(-)
 create mode 100644 torch/distributed/fsdp/_exec_order_utils.py
 create mode 100644 torch/distributed/fsdp/_wrap_utils.py

diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index 72c5a506f0765..6d3681fc69c06 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -3,10 +3,11 @@
 """
 
 from enum import auto, Enum
-from typing import Callable, Dict, List
+from typing import Callable, Dict, List, Union
 
 import torch
 import torch.distributed.fsdp.flat_param as flat_param_file
+import torch.nn as nn
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     _CHECKPOINT_PREFIX,
 )
@@ -16,6 +17,17 @@
 FSDP_FLATTENED = "_fsdp_flattened"
 
 
+class FSDPState:
+    """
+    This encompasses all FSDP state.
+    """
+
+
+# We leverage Python's dynamic attribute definition to unify the state
+# management for the wrapper and non-wrapper approaches.
+_State = Union[nn.Module, FSDPState]
+
+
 class TrainingState(Enum):
     """
     An enum that indicates the state of a ``FullyShardedDataParallel` instance.
diff --git a/torch/distributed/fsdp/_exec_order_utils.py b/torch/distributed/fsdp/_exec_order_utils.py
new file mode 100644
index 0000000000000..1c3b364a90b18
--- /dev/null
+++ b/torch/distributed/fsdp/_exec_order_utils.py
@@ -0,0 +1,380 @@
+import itertools
+import warnings
+from enum import auto, Enum
+from typing import cast, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed.fsdp._common_utils import _get_param_to_unflat_param_names
+from torch.distributed.fsdp.flat_param import FlatParameter, FlatParamHandle
+
+_HandlesKey = Tuple[FlatParamHandle, ...]
+
+
+class _ExecOrderWarnStatus(Enum):
+    """Used internally for execution order validation."""
+
+    NONE = auto()  # no deviation yet
+    WARNING = auto()  # deviated this iteration; currently issuing warnings
+    WARNED = auto()  # deviated in a previous iteration
+
+
+class _ExecOrderData:
+    """
+    This contains the data structures to track the execution order. We track
+    the pre-forward order on the *first* iteration for forward prefetching
+    (which thus assumes static graph) and the post-forward order on *every*
+    iteration for backward prefetching (which thus does not assume static
+    graph but may be provide an incorrect order).
+    """
+
+    def __init__(
+        self,
+        debug_level: dist.DebugLevel,
+        backward_prefetch_limit: int,
+        forward_prefetch_limit: int,
+    ) -> None:
+        # Tracks the (static) pre-forward order for execution order validation
+        # and forward prefetching
+        self.handles_pre_forward_order: List[_HandlesKey] = []
+        # Maps each handles key to its index in `handles_pre_forward_order`
+        self.handles_to_pre_forward_order_index: Dict[_HandlesKey, int] = {}
+        # Tracks the post-forward order for pre-backward prefetching
+        self.handles_post_forward_order: List[_HandlesKey] = []
+        # Maps each handles key to its index in `handles_post_forward_order`
+        self.handles_to_post_forward_order_index: Dict[_HandlesKey, int] = {}
+        self.is_first_iter = True
+
+        # Gives the max number of backward/forward prefetched all-gathers by a
+        # single module
+        self._backward_prefetch_limit = backward_prefetch_limit
+        self._forward_prefetch_limit = forward_prefetch_limit
+
+        # Data structures for execution order validation
+        self._checking_order: bool = debug_level in [
+            dist.DebugLevel.INFO,
+            dist.DebugLevel.DETAIL,
+        ]
+        self.process_group: Optional[dist.ProcessGroup] = None
+        self.world_size: Optional[int] = None
+        self.all_handles: List[FlatParamHandle] = []
+        # Maps each handle to its index in `all_handles`, which must be the
+        # same across ranks for the execution order validation to work
+        self.handle_to_handle_index: Dict[FlatParamHandle, int] = {}
+        # Names are prefixed from the root module
+        self.flat_param_to_prefixed_param_names: Dict[FlatParameter, List[str]] = {}
+        # Current index in the pre-forward execution order
+        self.current_order_index = 0
+        self.warn_status = _ExecOrderWarnStatus.NONE
+
+    def init(
+        self,
+        fsdp_root: nn.Module,  # `FullyShardedDataParallel`
+        process_group: dist.ProcessGroup,
+    ) -> None:
+        """
+        Initializes the data structures needed for checking the forward order.
+        This should be called after a root FSDP instance has been set during
+        lazy initialization.
+        """
+        self.process_group = process_group
+        self.rank = process_group.rank()
+        self.world_size = process_group.size()
+        # Fix an order over the handles, which should be the same across ranks
+        for fsdp_module in fsdp_root.fsdp_modules(fsdp_root):  # type: ignore[operator]
+            for handle in fsdp_module._handles:
+                index = len(self.all_handles)
+                self.all_handles.append(handle)
+                self.handle_to_handle_index[handle] = index
+        self.flat_param_to_prefixed_param_names = cast(
+            Dict[FlatParameter, List[str]],
+            _get_param_to_unflat_param_names(fsdp_root),
+        )
+        # TODO (awgu): We can broadcast the metadata of rank 0's `all_handles`
+        # to check that all ranks have the same handles in the same order.
+        # https://github.com/pytorch/pytorch/issues/79620
+
+    def get_handles_to_backward_prefetch(
+        self,
+        current_handles_key: _HandlesKey,
+    ) -> Optional[List[_HandlesKey]]:
+        """
+        Returns a :class:`list` of the handles keys of the handles to backward
+        prefetch given the current handles key. If there are no valid handles
+        keys to prefetch, then this returns an empty :class:`list`.
+        """
+        current_index = self.handles_to_post_forward_order_index.get(
+            current_handles_key, None
+        )
+        if current_index is None:
+            return None
+        target_index = current_index - 1
+        target_handles_keys: List[_HandlesKey] = []
+        for _ in range(self._backward_prefetch_limit):
+            if target_index < 0:
+                break
+            target_handles_keys.append(self.handles_post_forward_order[target_index])
+            target_index -= 1
+        return target_handles_keys
+
+    def get_handles_to_forward_prefetch(
+        self,
+        current_handles_key: _HandlesKey,
+    ) -> Optional[List[_HandlesKey]]:
+        """
+        Returns a :class:`list` of the handles keys of the handles to forward
+        prefetch given the current handles key. If there are no valid handles
+        keys to prefetch, then this returns an empty :class:`list`.
+        """
+        current_index = self.handles_to_pre_forward_order_index.get(
+            current_handles_key, None
+        )
+        if current_index is None:
+            return None
+        target_index = current_index + 1
+        target_handles_keys: List[_HandlesKey] = []
+        for _ in range(self._forward_prefetch_limit):
+            if target_index >= len(self.handles_pre_forward_order):
+                break
+            target_handles_keys.append(self.handles_pre_forward_order[target_index])
+            target_index += 1
+        return target_handles_keys
+
+    def record_post_forward(self, handles: List[FlatParamHandle]) -> None:
+        """
+        Records ``handles`` in the post-forward order, where ``handles`` should
+        be a group of handles used in the same module's forward. If ``handles``
+        is empty, then it is omitted.
+
+        Unlike :meth:`record_pre_forward`, this records the order *every*
+        iteration with the expectation that the recorded order is reset in
+        :meth:`next_iter`.
+        """
+        if not handles:
+            return
+        handles_key = tuple(handles)
+        # Only record the first usage of a handles key
+        if handles_key in self.handles_to_post_forward_order_index:
+            return
+        index = len(self.handles_post_forward_order)
+        self.handles_to_post_forward_order_index[handles_key] = index
+        self.handles_post_forward_order.append(handles_key)
+
+    def record_pre_forward(
+        self, handles: List[FlatParamHandle], is_training: bool
+    ) -> None:
+        """
+        Records ``handles`` in the pre-forward order, where ``handles`` should
+        be a group of handles used in the same module's forward. If ``handles``
+        is empty, then it is omitted.
+
+        On the first iteration, this checks the execution order across ranks.
+        See :meth:`_check_order` for details.
+        """
+        if not handles:
+            return
+        handles_key = tuple(handles)
+        self._check_order(handles_key, is_training)
+        # Fix the order after the first iteration and only record the first
+        # usage of a handles key
+        if (
+            not self.is_first_iter
+            or handles_key in self.handles_to_pre_forward_order_index
+        ):
+            return
+        index = len(self.handles_pre_forward_order)
+        self.handles_to_pre_forward_order_index[handles_key] = index
+        self.handles_pre_forward_order.append(handles_key)
+
+    def _check_order(self, handles_key: _HandlesKey, is_training: bool) -> None:
+        """
+        Checks the forward execution order as long as ``is_training`` is
+        ``True`` since checking in eval mode is not supported.
+
+        - On the first iteration, this uses all-gathers to check that all ranks
+        are all-gathering the same handles and hence ``FlatParameter`` s,
+        raising an error if not.
+        - On subsequent iterations, if the distributed debug level is at least
+        INFO, then this checks that each rank is locally consistent with its
+        own forward order from the first iteration, issuing a warning if not.
+        This issues a warning on the first deviating iteration and stops
+        warning thereafter.
+        """
+        # Do not check order in eval mode since the post-backward callback does
+        # not run so it cannot be used to mark the end of an iteration
+        if not is_training:
+            return
+        if self.is_first_iter:
+            msg_prefix = "Forward order differs across ranks:"
+            optional_local_indices: Tuple[
+                Optional[int], ...
+            ] = self._get_handle_indices(handles_key)
+            device = handles_key[0].device  # guaranteed to be non-CPU
+            num_valid_indices = sum(
+                (index is not None) for index in optional_local_indices
+            )
+            tensor_kwargs: Dict[str, Union[torch.dtype, torch.device]] = {
+                "dtype": torch.int32,
+                "device": device,
+            }
+            world_num_valid_indices = torch.zeros(self.world_size, **tensor_kwargs)  # type: ignore[arg-type, call-overload]
+            local_num_valid_indices = torch.tensor([num_valid_indices], **tensor_kwargs)  # type: ignore[arg-type, call-overload]
+            dist.all_gather_into_tensor(
+                world_num_valid_indices,
+                local_num_valid_indices,
+                group=self.process_group,
+            )
+            # Check that all ranks plan to all-gather the same number of
+            # parameters
+            # TODO (awgu): Since every module has at most one handle in the
+            # current implementation, this should never raise the error.
+            assert self.world_size is not None  # mypy
+            for (r1, n1), (r2, n2) in itertools.combinations(
+                (
+                    (rank, world_num_valid_indices[rank])
+                    for rank in range(self.world_size)
+                ),
+                2,
+            ):
+                if n1 != n2:
+                    raise RuntimeError(
+                        f"{msg_prefix} rank {r1} is all-gathering {n1} parameters "
+                        f"while rank {r2} is all-gathering {n2} parameters"
+                    )
+            world_indices = torch.zeros(  # type: ignore[call-overload]
+                self.world_size * num_valid_indices, **tensor_kwargs
+            )
+            local_indices = torch.tensor(optional_local_indices, **tensor_kwargs)  # type: ignore[arg-type]
+            dist.all_gather_into_tensor(
+                world_indices, local_indices, group=self.process_group
+            )
+            # Check that all ranks plan to all-gather the same index parameters
+            for (r1, i1), (r2, i2) in itertools.combinations(
+                (
+                    (
+                        rank,
+                        world_indices[
+                            rank * num_valid_indices : (rank + 1) * num_valid_indices
+                        ],
+                    )
+                    for rank in range(self.world_size)
+                ),
+                2,
+            ):
+                if i1 != i2:
+                    r1_param_names = self._get_names_from_handle_indices(i1)
+                    r2_param_names = self._get_names_from_handle_indices(i2)
+                    raise RuntimeError(
+                        f"{msg_prefix} rank {r1} is all-gathering parameters "
+                        f"for {r1_param_names} while rank {r2} is all-gathering "
+                        f"parameters for {r2_param_names}"
+                    )
+        elif self._checking_order:
+            # Only issue warnings on the first deviating iteration and stop
+            # checking thereafter to avoid flooding the console
+            if self.warn_status == _ExecOrderWarnStatus.WARNED:
+                return
+            msg_prefix = None  # non-`None` means we should warn
+            if self.current_order_index >= len(self.handles_pre_forward_order):
+                # This iteration sees extra all-gather(s) compared to the first
+                msg_prefix = (
+                    "Expected to not all-gather any more parameters in the "
+                    "forward but trying to all-gather parameters for "
+                )
+            else:
+                expected_handles_key = self.handles_pre_forward_order[
+                    self.current_order_index
+                ]
+                if expected_handles_key != handles_key:
+                    expected_param_names = self._get_names_from_handles(
+                        expected_handles_key
+                    )
+                    msg_prefix = (
+                        f"Expected to all-gather for {expected_param_names} "
+                        "but trying to all-gather parameters for "
+                    )
+            if msg_prefix is not None:
+                param_names = self._get_names_from_handles(handles_key)
+                msg_suffix = (
+                    f"{param_names}"
+                    if param_names
+                    else "a newly-added parameter since construction time"
+                )
+                warnings.warn(
+                    "Forward order differs from that of the first iteration "
+                    f"on rank {self.rank}. Collectives are unchecked and may "
+                    f"give incorrect results or hang.\n{msg_prefix}{msg_suffix}"
+                )
+                self.warn_status = _ExecOrderWarnStatus.WARNING
+            self.current_order_index += 1
+
+    def _get_handle_indices(
+        self,
+        handles_key: _HandlesKey,
+    ) -> Tuple[Optional[int], ...]:
+        """
+        Returns the handle indices (i.e. indices into ``self.all_handles``)
+        corresponding to the handles in ``handles_key``. An entry in the
+        returned tuple is ``None`` if the handle is invalid.
+        """
+        indices: List[Optional[int]] = []
+        for handle in handles_key:
+            if handle not in self.handle_to_handle_index:
+                indices.append(None)
+            else:
+                indices.append(self.handle_to_handle_index[handle])
+        return tuple(indices)
+
+    def _get_names_from_handle_indices(
+        self,
+        handle_indices: Tuple[int, ...],
+    ) -> List[List[str]]:
+        """
+        Returns a list of prefixed parameter names for each handle in
+        ``handle_indices``. If a handle index is invalid, then its prefixed
+        parameter names are omitted from the returned list.
+        """
+        prefixed_param_names: List[List[str]] = []
+        for index in handle_indices:
+            if index is None or index < 0 or index >= len(self.all_handles):
+                continue
+            handle = self.all_handles[index]
+            flat_param = handle.flat_param
+            prefixed_param_names.append(
+                self.flat_param_to_prefixed_param_names[flat_param]
+            )
+        return prefixed_param_names
+
+    def _get_names_from_handles(
+        self,
+        handles_key: _HandlesKey,
+    ) -> List[List[str]]:
+        """
+        Returns a list of prefixed parameter names for each handle in
+        ``handles_key``. If a handle is invalid, then its prefixed parameter
+        names are omitted from the returned list.
+        """
+        prefixed_param_names: List[List[str]] = []
+        for handle in handles_key:
+            flat_param = handle.flat_param
+            if flat_param not in self.flat_param_to_prefixed_param_names:
+                continue
+            prefixed_param_names.append(
+                self.flat_param_to_prefixed_param_names[flat_param]
+            )
+        return prefixed_param_names
+
+    def next_iter(self):
+        """
+        Advances the internal data structures per iteration. This should be
+        called in the post-backward callback since that marks the true end of
+        an iteration.
+        """
+        self.is_first_iter = False
+        self.handles_to_post_forward_order_index.clear()
+        self.handles_post_forward_order.clear()
+        if self._checking_order:
+            self.current_order_index = 0
+            if self.warn_status == _ExecOrderWarnStatus.WARNING:
+                self.warn_status = _ExecOrderWarnStatus.WARNED
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 5fc3845743409..7d634187809f0 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -1,17 +1,49 @@
+import collections
 import warnings
-from typing import Callable, Iterable, Iterator, List, Optional, Set, Tuple, Union
+from typing import (
+    Callable,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    no_type_check,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
 
 import torch
 import torch.distributed as dist
 import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
 import torch.nn as nn
+from torch.distributed.algorithms._comm_hooks import default_hooks
+from torch.distributed.distributed_c10d import _get_default_group
 from torch.distributed.fsdp._common_utils import (
     _apply_to_modules,
     _get_param_to_unflat_param_names,
     _is_fsdp_flattened,
+    _State,
     clean_tensor_name,
+    TrainingState,
+)
+from torch.distributed.fsdp._exec_order_utils import _ExecOrderData
+from torch.distributed.fsdp._limiter_utils import _FreeEventQueue
+from torch.distributed.fsdp.api import (
+    BackwardPrefetch,
+    CPUOffload,
+    MixedPrecision,
+    ShardingStrategy,
+)
+from torch.distributed.fsdp.flat_param import (
+    _HandlesKey,
+    FlatParameter,
+    FlatParamHandle,
+    HandleConfig,
+    HandleShardingStrategy,
 )
 from torch.distributed.utils import _sync_params_and_buffers
+from torch.utils.hooks import RemovableHandle
 
 _TORCHDISTX_AVAIL = True
 try:
@@ -22,6 +54,205 @@
 PARAM_BROADCAST_BUCKET_SIZE = int(250 * 1024 * 1024)
 FSDP_SYNCED = "_fsdp_synced"
 
+# TODO (awgu): Refactor this later
+SHARDING_STRATEGY_MAP = {
+    ShardingStrategy.NO_SHARD: HandleShardingStrategy.NO_SHARD,
+    ShardingStrategy.FULL_SHARD: HandleShardingStrategy.FULL_SHARD,
+    ShardingStrategy.SHARD_GRAD_OP: HandleShardingStrategy.SHARD_GRAD_OP,
+}
+
+
+# NOTE: Since non-self attributes cannot be type annotated, several attributes
+# on `state` are defined first as local variables before being assigned.
+
+
+@no_type_check
+def _init_process_group_state(
+    state: _State,
+    process_group: Optional[dist.ProcessGroup],
+) -> _State:
+    state.process_group = process_group or _get_default_group()
+    state.rank = state.process_group.rank()
+    state.world_size = state.process_group.size()
+    return state
+
+
+@no_type_check
+def _init_ignored_module_states(
+    state: _State,
+    module: nn.Module,
+    ignored_modules: Optional[Iterable[torch.nn.Module]],
+) -> _State:
+    state._ignored_modules = _get_ignored_modules(module, ignored_modules)
+    state._ignored_params, state._ignored_param_names = _get_ignored_params(
+        module,
+        state._ignored_modules,
+    )
+    # TODO: FSDP's contract for buffers is not well-defined. They are
+    # implicitly ignored for most functionality since they are not sharded;
+    # however, FSDP still imposes some semantics on buffers (e.g. buffer mixed
+    # precision). We should formalize this contract and decide if we need to
+    # compute and store `_ignored_buffers`.
+    return state
+
+
+@no_type_check
+def _init_module_and_device_state(
+    state: _State,
+    module: nn.Module,
+    device_id: Optional[Union[int, torch.device]],
+    param_init_fn: Optional[Callable[[nn.Module], None]],
+    sync_module_states: bool,
+) -> _State:
+    state._buffer_names = _get_buffer_names(module)
+    # Save a mapping from fully prefixed buffer name to its original dtype
+    # since when buffer mixed precision is enabled, buffers are restored to
+    # their original dtype for model checkpointing
+    _buffer_name_to_orig_dtype: Dict[str, torch.dtype] = {}
+    state._buffer_name_to_orig_dtype = _buffer_name_to_orig_dtype
+    _check_single_device_module(module, state._ignored_params)
+    device_from_device_id = _get_device_from_device_id(device_id, state.rank)
+    _materialize_module(
+        module,
+        param_init_fn,
+        state._ignored_params,
+        device_from_device_id,
+        lambda _: True,
+    )
+    # TODO: We need to skip this for functional-like to avoid moving the entire
+    # unsharded module onto GPU before any sharding.
+    _move_module_to_device(module, state._ignored_params, device_from_device_id)
+    state.compute_device = _get_compute_device(
+        module,
+        state._ignored_params,
+        device_from_device_id,
+        state.rank,
+    )
+    state._managed_params = list(_get_orig_params(module, state._ignored_params))
+    if sync_module_states:
+        _sync_module_states(module, state._managed_params, state.process_group)
+    return state
+
+
+@no_type_check
+def _init_core_state(
+    state: _State,
+    sharding_strategy: Optional[ShardingStrategy],
+    mixed_precision: Optional[MixedPrecision],
+    cpu_offload: Optional[CPUOffload],
+    limit_all_gathers: bool,
+    use_orig_params: bool,
+    backward_prefetch_limit: int,
+    forward_prefetch_limit: int,
+) -> _State:
+    state.sharding_strategy = sharding_strategy or ShardingStrategy.FULL_SHARD
+    state.mixed_precision = mixed_precision or MixedPrecision()
+    state.cpu_offload = cpu_offload or CPUOffload()
+    state.limit_all_gathers = limit_all_gathers
+    state._use_orig_params = use_orig_params
+    state.training_state = TrainingState.IDLE
+    state._is_root = None
+    _streams: Dict[str, torch.cuda.Stream] = {}
+    state._streams = _streams
+    state._free_event_queue = _FreeEventQueue()
+    state._debug_level = dist.get_debug_level()
+    state._exec_order_data = _ExecOrderData(
+        state._debug_level,
+        backward_prefetch_limit,
+        forward_prefetch_limit,
+    )
+    # Invariant: `state.params` contains exactly the `FlatParameter`s of the
+    # handles in `state._handles`
+    _handles: List[FlatParamHandle] = []
+    state._handles = _handles
+    params: List[FlatParameter] = []
+    state.params = params
+    return state
+
+
+@no_type_check
+def _init_runtime_state(
+    state: _State,
+) -> _State:
+    _pre_forward_handles: List[RemovableHandle] = []
+    state._pre_forward_handles = _pre_forward_handles
+    _post_forward_handles: List[RemovableHandle] = []
+    state._post_forward_handles = _post_forward_handles
+    _module_to_handles: Dict[
+        nn.Module, List[FlatParamHandle]
+    ] = collections.defaultdict(list)
+    state._module_to_handles = _module_to_handles
+    state._sync_gradients = True
+    state._communication_hook = _get_default_comm_hook(state.sharding_strategy)
+    state._communication_hook_state = _get_default_comm_hook_state(state.process_group)
+    state._hook_registered = False
+    # Used to prevent running the pre-backward hook multiple times
+    _ran_pre_backward_hook: Dict[_HandlesKey, bool] = {}
+    state._ran_pre_backward_hook = _ran_pre_backward_hook
+    return state
+
+
+@no_type_check
+def _init_prefetching_state(
+    state: _State,
+    backward_prefetch: BackwardPrefetch,
+    forward_prefetch: bool,
+) -> _State:
+    state.backward_prefetch = backward_prefetch
+    state.forward_prefetch = forward_prefetch
+    _handles_prefetched: Dict[_HandlesKey, bool] = {}
+    state._handles_prefetched = _handles_prefetched
+    # Used for guarding against mistargeted backward prefetches
+    _needs_pre_backward_unshard: Dict[_HandlesKey, bool] = {}
+    state._needs_pre_backward_unshard = _needs_pre_backward_unshard
+    # Used for guarding against mistargeted forward prefetches
+    _needs_pre_forward_unshard: Dict[_HandlesKey, bool] = {}
+    state._needs_pre_forward_unshard = _needs_pre_forward_unshard
+    # The data structures use tuples of handles to generalize over the case
+    # where a module's forward involves multiple handles.
+    return state
+
+
+def _init_state_dict_state(state: _State) -> _State:
+    # TODO: after rebase
+    return state
+
+
+@no_type_check
+def _init_param_handle_from_params(
+    state: _State,
+    params: List[nn.Parameter],
+    root_module: nn.Module,
+):
+    if len(params) == 0:
+        return
+    # TODO: Move module to GPU if needed (for non-wrapper code path) -- we need
+    # to fuse this method with `_init_module_and_device_state()`
+    handle_config = HandleConfig(
+        SHARDING_STRATEGY_MAP[state.sharding_strategy],
+        state.cpu_offload.offload_params,
+        state.mixed_precision.param_dtype,
+        state.mixed_precision.reduce_dtype,
+        state.mixed_precision.keep_low_precision_grads,
+    )
+    handle = FlatParamHandle(
+        params,
+        root_module,
+        state.compute_device,
+        handle_config,
+        state.process_group,
+        state._use_orig_params,
+    )
+    # TODO: Can simplify call `shard()` in the `FlatParamHandle` ctor
+    handle.shard()
+    assert handle.flat_param not in state.params
+    assert handle not in state._handles
+    state.params.append(handle.flat_param)
+    state._handles.append(handle)
+    cpu_device = torch.device("cpu")
+    if state.cpu_offload.offload_params and handle.flat_param.device != cpu_device:
+        handle.flat_param_to(cpu_device)
+
 
 def _get_ignored_modules(
     root_module: nn.Module,
@@ -362,3 +593,17 @@ def _check_orig_params_flattened(
                 f"Found an unflattened parameter: {param_name}; "
                 f"{param.size()} {param.__class__}"
             )
+
+
+def _get_default_comm_hook(sharding_strategy: ShardingStrategy):
+    return (
+        default_hooks.allreduce_hook
+        if sharding_strategy == ShardingStrategy.NO_SHARD
+        else default_hooks.reduce_scatter_hook
+    )
+
+
+def _get_default_comm_hook_state(
+    process_group: dist.ProcessGroup,
+) -> default_hooks.DefaultState:
+    return default_hooks.DefaultState(process_group=process_group)
diff --git a/torch/distributed/fsdp/_wrap_utils.py b/torch/distributed/fsdp/_wrap_utils.py
new file mode 100644
index 0000000000000..00133b5c58a37
--- /dev/null
+++ b/torch/distributed/fsdp/_wrap_utils.py
@@ -0,0 +1,55 @@
+import functools
+import warnings
+from typing import Any, Dict
+
+from torch.distributed.fsdp._utils import (
+    _contains_batchnorm,
+    _override_batchnorm_mixed_precision,
+)
+from torch.distributed.fsdp.wrap import (
+    _or_policy,
+    _recursive_wrap,
+    _wrap_batchnorm_individually,
+)
+
+
+def _auto_wrap(
+    auto_wrap_kwargs: Dict[str, Any],
+    fsdp_kwargs: Dict[str, Any],
+    module_wrapper_cls: Any,  # e.g. `FullyShardedDataParallel`
+) -> None:
+    """
+    Recursively auto wraps the root module given by the key "module" in
+    ``auto_wrap_kwargs`` with the arguments in ``auto_wrap_kwargs`` and
+    ``fsdp_kwargs``.
+
+    Precondition: ``auto_wrap_policy`` contains the arguments expected by
+    ``_recursive_wrap()``, where ``auto_wrap_policy`` is not ``None``.
+    ``fsdp_kwargs`` contains all FSDP arguments except ``module``.
+    """
+    auto_wrap_policy = auto_wrap_kwargs["auto_wrap_policy"]
+    root_module = auto_wrap_kwargs["module"]
+    assert auto_wrap_policy is not None
+    # For auto wrapping, submodules should not already be wrapped with FSDP
+    # since double wrapping is not supported
+    for module_name, module in root_module.named_modules():
+        if isinstance(module, module_wrapper_cls):
+            raise ValueError(
+                f"Expected {module_name} to NOT be FullyShardedDataParallel "
+                "if using an `auto_wrap_policy`"
+            )
+    mixed_precision = fsdp_kwargs["mixed_precision"]
+    if mixed_precision is not None and _contains_batchnorm(root_module):
+        _override_batchnorm_mixed_precision(root_module)
+        auto_wrap_policy = functools.partial(
+            _or_policy, policies=[_wrap_batchnorm_individually, auto_wrap_policy]
+        )
+        warnings.warn(
+            "Both mixed precision and an `auto_wrap_policy` were specified "
+            "for FSDP, where the wrapped module has batch norm submodules. "
+            "The batch norm submodules will be wrapped as separate FSDP "
+            "instances with mixed precision disabled since some batch norm "
+            "kernels do not support low precision."
+        )
+        auto_wrap_kwargs["auto_wrap_policy"] = auto_wrap_policy
+    _recursive_wrap(**auto_wrap_kwargs, **fsdp_kwargs)
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 37db0a2d05bf1..376e0d4ff51b9 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -1803,3 +1803,9 @@ def _force_full_precision(self) -> bool:
             self._training_state == HandleTrainingState.SUMMON_FULL_PARAMS
             and self._uses_param_mixed_precision
         )
+
+
+# A handles key represents the group of `FlatParamHandle`s involved in a given
+# module's forward. These will be all-gathered together in the pre-forward and
+# pre-backward.
+_HandlesKey = Tuple[FlatParamHandle, ...]
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 2608f76e311af..ef79a80c3102e 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1,7 +1,6 @@
 import contextlib
 import copy
 import functools
-import itertools
 import math
 import traceback
 import warnings
@@ -11,7 +10,6 @@
 from typing import (
     Any,
     Callable,
-    cast,
     Dict,
     Generator,
     Iterable,
@@ -34,8 +32,7 @@
     _CHECKPOINT_WRAPPED_MODULE,
     ActivationWrapper,
 )
-from torch.distributed.algorithms._comm_hooks import default_hooks, LOW_PRECISION_HOOKS
-from torch.distributed.distributed_c10d import _get_default_group
+from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS
 from torch.distributed.fsdp._common_utils import (
     _get_param_to_unflat_param_names,
     FSDP_PREFIX,
@@ -45,23 +42,22 @@
 )
 from torch.distributed.fsdp._init_utils import (
     _check_orig_params_flattened,
-    _check_single_device_module,
-    _get_buffer_names,
-    _get_compute_device,
-    _get_device_from_device_id,
-    _get_ignored_modules,
-    _get_ignored_params,
-    _get_orig_params,
-    _materialize_module,
-    _move_module_to_device,
-    _sync_module_states,
+    _get_default_comm_hook,
+    _init_core_state,
+    _init_ignored_module_states,
+    _init_module_and_device_state,
+    _init_param_handle_from_params,
+    _init_prefetching_state,
+    _init_process_group_state,
+    _init_runtime_state,
+    _init_state_dict_state,
 )
-from torch.distributed.fsdp._limiter_utils import _FreeEventQueue
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
     _prepare_forward_inputs,
     _wait_for_computation_stream,
 )
+from torch.distributed.fsdp._wrap_utils import _auto_wrap
 from torch.distributed.fsdp.api import (
     BackwardPrefetch,
     CPUOffload,
@@ -86,25 +82,14 @@
     _post_state_dict_hook,
     _pre_load_state_dict_hook,
 )
-from ._utils import (
-    _apply_to_tensors,
-    _contains_batchnorm,
-    _free_storage,
-    _override_batchnorm_mixed_precision,
-    p_assert,
-)
+from ._utils import _apply_to_tensors, _free_storage, p_assert
 from .flat_param import (
+    _HandlesKey,
     FlatParameter,
     FlatParamHandle,
-    HandleConfig,
     HandleShardingStrategy,
 )
-from .wrap import (
-    _or_policy,
-    _recursive_wrap,
-    _wrap_batchnorm_individually,
-    ParamExecOrderWrapPolicy,
-)
+from .wrap import ParamExecOrderWrapPolicy
 
 _TORCH_FX_AVAIL = True
 if not hasattr(torch, "fx"):
@@ -216,382 +201,6 @@ class OptimStateKeyType(Enum):
     PARAM_ID = auto()
 
 
-# A handles key represents the group of `FlatParamHandle`s involved in a given
-# module's forward. These will be all-gathered together in the pre-forward and
-# pre-backward.
-_HandlesKey = Tuple[FlatParamHandle, ...]
-
-
-class _ExecOrderWarnStatus(Enum):
-    """Used internally for execution order validation."""
-
-    NONE = auto()  # no deviation yet
-    WARNING = auto()  # deviated this iteration; currently issuing warnings
-    WARNED = auto()  # deviated in a previous iteration
-
-
-class _ExecOrderData:
-    """
-    This contains the data structures to track the execution order. We track
-    the pre-forward order on the *first* iteration for forward prefetching
-    (which thus assumes static graph) and the post-forward order on *every*
-    iteration for backward prefetching (which thus does not assume static
-    graph but may be provide an incorrect order).
-    """
-
-    def __init__(
-        self,
-        debug_level: dist.DebugLevel,
-        backward_prefetch_limit: int,
-        forward_prefetch_limit: int,
-    ) -> None:
-        # Tracks the (static) pre-forward order for execution order validation
-        # and forward prefetching
-        self.handles_pre_forward_order: List[int] = []
-        # Maps each handles key to its index in `handles_pre_forward_order`
-        self.handles_to_pre_forward_order_index: Dict[_HandlesKey, int] = {}
-        # Tracks the post-forward order for pre-backward prefetching
-        self.handles_post_forward_order: List[int] = []
-        # Maps each handles key to its index in `handles_post_forward_order`
-        self.handles_to_post_forward_order_index: Dict[_HandlesKey, int] = {}
-        self.is_first_iter = True
-
-        # Gives the max number of backward/forward prefetched all-gathers by a
-        # single module
-        self._backward_prefetch_limit = backward_prefetch_limit
-        self._forward_prefetch_limit = forward_prefetch_limit
-
-        # Data structures for execution order validation
-        self._checking_order: bool = debug_level in [
-            dist.DebugLevel.INFO,
-            dist.DebugLevel.DETAIL,
-        ]
-        self.process_group: Optional[dist.ProcessGroup] = None
-        self.world_size: Optional[int] = None
-        self.all_handles: List[FlatParamHandle] = []
-        # Maps each handle to its index in `all_handles`, which must be the
-        # same across ranks for the execution order validation to work
-        self.handle_to_handle_index: Dict[FlatParamHandle, int] = {}
-        # Names are prefixed from the root module
-        self.flat_param_to_prefixed_param_names: Dict[FlatParameter, List[str]] = {}
-        # Current index in the pre-forward execution order
-        self.current_order_index = 0
-        self.warn_status = _ExecOrderWarnStatus.NONE
-
-    def init(
-        self,
-        fsdp_root: "FullyShardedDataParallel",
-        process_group: dist.ProcessGroup,
-    ) -> None:
-        """
-        Initializes the data structures needed for checking the forward order.
-        This should be called after a root FSDP instance has been set during
-        lazy initialization.
-        """
-        self.process_group = process_group
-        self.rank = process_group.rank()
-        self.world_size = process_group.size()
-        # Fix an order over the handles, which should be the same across ranks
-        for fsdp_module in fsdp_root.fsdp_modules(fsdp_root):
-            for handle in fsdp_module._handles:
-                index = len(self.all_handles)
-                self.all_handles.append(handle)
-                self.handle_to_handle_index[handle] = index
-        self.flat_param_to_prefixed_param_names = cast(
-            Dict[FlatParameter, List[str]],
-            _get_param_to_unflat_param_names(fsdp_root),
-        )
-        # TODO (awgu): We can broadcast the metadata of rank 0's `all_handles`
-        # to check that all ranks have the same handles in the same order.
-        # https://github.com/pytorch/pytorch/issues/79620
-
-    def get_handles_to_backward_prefetch(
-        self,
-        current_handles_key: _HandlesKey,
-    ) -> List[_HandlesKey]:
-        """
-        Returns a :class:`list` of the handles keys of the handles to backward
-        prefetch given the current handles key. If there are no valid handles
-        keys to prefetch, then this returns an empty :class:`list`.
-        """
-        current_index = self.handles_to_post_forward_order_index.get(
-            current_handles_key, None
-        )
-        if current_index is None:
-            return None
-        target_index = current_index - 1
-        target_handles_keys: List[_HandlesKey] = []
-        for _ in range(self._backward_prefetch_limit):
-            if target_index < 0:
-                break
-            target_handles_keys.append(self.handles_post_forward_order[target_index])
-            target_index -= 1
-        return target_handles_keys
-
-    def get_handles_to_forward_prefetch(
-        self,
-        current_handles_key: _HandlesKey,
-    ) -> List[_HandlesKey]:
-        """
-        Returns a :class:`list` of the handles keys of the handles to forward
-        prefetch given the current handles key. If there are no valid handles
-        keys to prefetch, then this returns an empty :class:`list`.
-        """
-        current_index = self.handles_to_pre_forward_order_index.get(
-            current_handles_key, None
-        )
-        if current_index is None:
-            return None
-        target_index = current_index + 1
-        target_handles_keys: List[_HandlesKey] = []
-        for _ in range(self._forward_prefetch_limit):
-            if target_index >= len(self.handles_pre_forward_order):
-                break
-            target_handles_keys.append(self.handles_pre_forward_order[target_index])
-            target_index += 1
-        return target_handles_keys
-
-    def record_post_forward(self, handles: List[FlatParamHandle]) -> None:
-        """
-        Records ``handles`` in the post-forward order, where ``handles`` should
-        be a group of handles used in the same module's forward. If ``handles``
-        is empty, then it is omitted.
-
-        Unlike :meth:`record_pre_forward`, this records the order *every*
-        iteration with the expectation that the recorded order is reset in
-        :meth:`next_iter`.
-        """
-        if not handles:
-            return
-        handles_key = tuple(handles)
-        # Only record the first usage of a handles key
-        if handles_key in self.handles_to_post_forward_order_index:
-            return
-        index = len(self.handles_post_forward_order)
-        self.handles_to_post_forward_order_index[handles_key] = index
-        self.handles_post_forward_order.append(handles_key)
-
-    def record_pre_forward(
-        self, handles: List[FlatParamHandle], is_training: bool
-    ) -> None:
-        """
-        Records ``handles`` in the pre-forward order, where ``handles`` should
-        be a group of handles used in the same module's forward. If ``handles``
-        is empty, then it is omitted.
-
-        On the first iteration, this checks the execution order across ranks.
-        See :meth:`_check_order` for details.
-        """
-        if not handles:
-            return
-        handles_key = tuple(handles)
-        self._check_order(handles_key, is_training)
-        # Fix the order after the first iteration and only record the first
-        # usage of a handles key
-        if (
-            not self.is_first_iter
-            or handles_key in self.handles_to_pre_forward_order_index
-        ):
-            return
-        index = len(self.handles_pre_forward_order)
-        self.handles_to_pre_forward_order_index[handles_key] = index
-        self.handles_pre_forward_order.append(handles_key)
-
-    def _check_order(self, handles_key: _HandlesKey, is_training: bool) -> None:
-        """
-        Checks the forward execution order as long as ``is_training`` is
-        ``True`` since checking in eval mode is not supported.
-
-        - On the first iteration, this uses all-gathers to check that all ranks
-        are all-gathering the same handles and hence ``FlatParameter`` s,
-        raising an error if not.
-        - On subsequent iterations, if the distributed debug level is at least
-        INFO, then this checks that each rank is locally consistent with its
-        own forward order from the first iteration, issuing a warning if not.
-        This issues a warning on the first deviating iteration and stops
-        warning thereafter.
-        """
-        # Do not check order in eval mode since the post-backward callback does
-        # not run so it cannot be used to mark the end of an iteration
-        if not is_training:
-            return
-        if self.is_first_iter:
-            msg_prefix = "Forward order differs across ranks:"
-            local_indices: Optional[Tuple[int, ...]] = self._get_handle_indices(
-                handles_key
-            )
-            device = handles_key[0].device  # guaranteed to be non-CPU
-            num_valid_indices = sum((index is not None) for index in local_indices)
-            tensor_kwargs = {"dtype": torch.int32, "device": device}
-            world_num_valid_indices = torch.zeros(self.world_size, **tensor_kwargs)
-            local_num_valid_indices = torch.tensor([num_valid_indices], **tensor_kwargs)
-            dist.all_gather_into_tensor(
-                world_num_valid_indices,
-                local_num_valid_indices,
-                group=self.process_group,
-            )
-            # Check that all ranks plan to all-gather the same number of
-            # parameters
-            # TODO (awgu): Since every module has at most one handle in the
-            # current implementation, this should never raise the error.
-            for (r1, n1), (r2, n2) in itertools.combinations(
-                (
-                    (rank, world_num_valid_indices[rank])
-                    for rank in range(self.world_size)
-                ),
-                2,
-            ):
-                if n1 != n2:
-                    raise RuntimeError(
-                        f"{msg_prefix} rank {r1} is all-gathering {n1} parameters "
-                        f"while rank {r2} is all-gathering {n2} parameters"
-                    )
-            world_indices = torch.zeros(
-                self.world_size * num_valid_indices, **tensor_kwargs
-            )
-            local_indices = torch.tensor(local_indices, **tensor_kwargs)
-            dist.all_gather_into_tensor(
-                world_indices, local_indices, group=self.process_group
-            )
-            # Check that all ranks plan to all-gather the same index parameters
-            for (r1, i1), (r2, i2) in itertools.combinations(
-                (
-                    (
-                        rank,
-                        world_indices[
-                            rank * num_valid_indices : (rank + 1) * num_valid_indices
-                        ],
-                    )
-                    for rank in range(self.world_size)
-                ),
-                2,
-            ):
-                if i1 != i2:
-                    r1_param_names = self._get_names_from_handle_indices(i1)
-                    r2_param_names = self._get_names_from_handle_indices(i2)
-                    raise RuntimeError(
-                        f"{msg_prefix} rank {r1} is all-gathering parameters "
-                        f"for {r1_param_names} while rank {r2} is all-gathering "
-                        f"parameters for {r2_param_names}"
-                    )
-        elif self._checking_order:
-            # Only issue warnings on the first deviating iteration and stop
-            # checking thereafter to avoid flooding the console
-            if self.warn_status == _ExecOrderWarnStatus.WARNED:
-                return
-            msg_prefix = None  # non-`None` means we should warn
-            if self.current_order_index >= len(self.handles_pre_forward_order):
-                # This iteration sees extra all-gather(s) compared to the first
-                msg_prefix = (
-                    "Expected to not all-gather any more parameters in the "
-                    "forward but trying to all-gather parameters for "
-                )
-            else:
-                expected_handles_key = self.handles_pre_forward_order[
-                    self.current_order_index
-                ]
-                if expected_handles_key != handles_key:
-                    expected_param_names = self._get_names_from_handles(
-                        expected_handles_key
-                    )
-                    msg_prefix = (
-                        f"Expected to all-gather for {expected_param_names} "
-                        "but trying to all-gather parameters for "
-                    )
-            if msg_prefix is not None:
-                param_names = self._get_names_from_handles(handles_key)
-                msg_suffix = (
-                    f"{param_names}"
-                    if param_names
-                    else "a newly-added parameter since construction time"
-                )
-                warnings.warn(
-                    "Forward order differs from that of the first iteration "
-                    f"on rank {self.rank}. Collectives are unchecked and may "
-                    f"give incorrect results or hang.\n{msg_prefix}{msg_suffix}"
-                )
-                self.warn_status = _ExecOrderWarnStatus.WARNING
-            self.current_order_index += 1
-
-    def _get_handle_indices(
-        self,
-        handles_key: _HandlesKey,
-    ) -> Tuple[Optional[int], ...]:
-        """
-        Returns the handle indices (i.e. indices into ``self.all_handles``)
-        corresponding to the handles in ``handles_key``. An entry in the
-        returned tuple is ``None`` if the handle is invalid.
-        """
-        indices: List[int] = []
-        for handle in handles_key:
-            if handle not in self.handle_to_handle_index:
-                indices.append(None)
-            else:
-                indices.append(self.handle_to_handle_index[handle])
-        return tuple(indices)
-
-    def _get_names_from_handle_indices(
-        self,
-        handle_indices: Tuple[int, ...],
-    ) -> List[List[str]]:
-        """
-        Returns a list of prefixed parameter names for each handle in
-        ``handle_indices``. If a handle index is invalid, then its prefixed
-        parameter names are omitted from the returned list.
-        """
-        prefixed_param_names: List[List[str]] = []
-        for index in handle_indices:
-            if index is None or index < 0 or index >= len(self.all_handles):
-                continue
-            handle = self.all_handles[index]
-            flat_param = handle.flat_param
-            prefixed_param_names.append(
-                self.flat_param_to_prefixed_param_names[flat_param]
-            )
-        return prefixed_param_names
-
-    def _get_names_from_handles(
-        self,
-        handles_key: _HandlesKey,
-    ) -> List[List[str]]:
-        """
-        Returns a list of prefixed parameter names for each handle in
-        ``handles_key``. If a handle is invalid, then its prefixed parameter
-        names are omitted from the returned list.
-        """
-        prefixed_param_names: List[List[str]] = []
-        for handle in handles_key:
-            flat_param = handle.flat_param
-            if flat_param not in self.flat_param_to_prefixed_param_names:
-                continue
-            prefixed_param_names.append(
-                self.flat_param_to_prefixed_param_names[flat_param]
-            )
-        return prefixed_param_names
-
-    def next_iter(self):
-        """
-        Advances the internal data structures per iteration. This should be
-        called in the post-backward callback since that marks the true end of
-        an iteration.
-        """
-        self.is_first_iter = False
-        self.handles_to_post_forward_order_index.clear()
-        self.handles_post_forward_order.clear()
-        if self._checking_order:
-            self.current_order_index = 0
-            if self.warn_status == _ExecOrderWarnStatus.WARNING:
-                self.warn_status = _ExecOrderWarnStatus.WARNED
-
-
-# TODO (awgu): Refactor this later
-sharding_strategy_map = {
-    ShardingStrategy.NO_SHARD: HandleShardingStrategy.NO_SHARD,
-    ShardingStrategy.FULL_SHARD: HandleShardingStrategy.FULL_SHARD,
-    ShardingStrategy.SHARD_GRAD_OP: HandleShardingStrategy.SHARD_GRAD_OP,
-}
-
-
 class FullyShardedDataParallel(nn.Module):
     """
     A wrapper for sharding Module parameters across data parallel workers. This
@@ -847,18 +456,15 @@ def __init__(
         torch._C._log_api_usage_once("torch.distributed.fsdp")
         super().__init__()
 
-        self._ignored_modules = _get_ignored_modules(module, ignored_modules)
-        ignored_params, self._ignored_param_names = _get_ignored_params(
-            module, self._ignored_modules
-        )
-        self._buffer_names = _get_buffer_names(module)
+        _init_ignored_module_states(self, module, ignored_modules)
+
         if auto_wrap_policy is not None:
             auto_wrap_kwargs = {
                 "module": module,
                 "auto_wrap_policy": auto_wrap_policy,
                 "wrapper_cls": FullyShardedDataParallel,
                 "ignored_modules": self._ignored_modules,
-                "ignored_params": ignored_params,
+                "ignored_params": self._ignored_params,
                 "only_wrap_children": True,  # avoid double wrapping the root
             }
             fsdp_kwargs = {
@@ -874,116 +480,50 @@ def __init__(
                 "limit_all_gathers": limit_all_gathers,
                 "use_orig_params": use_orig_params,
             }
-            self._auto_wrap(auto_wrap_kwargs, fsdp_kwargs)
+            _auto_wrap(auto_wrap_kwargs, fsdp_kwargs, FullyShardedDataParallel)
 
-        self.process_group = process_group or _get_default_group()
-        self.rank = self.process_group.rank()
-        self.world_size = self.process_group.size()
-        self.training_state = TrainingState.IDLE
-        self.cpu_offload = cpu_offload or CPUOffload()
-        self.backward_prefetch = backward_prefetch
-        self.forward_prefetch = forward_prefetch
-        self.limit_all_gathers = limit_all_gathers
-        backward_prefetch_limit = 1
-        forward_prefetch_limit = 1
+        _init_process_group_state(self, process_group)
         # We clamp the strategy to `NO_SHARD` for world size of 1 since they
         # are currently functionally equivalent. This may change if/when we
         # integrate FSDP with MoE.
         if self.world_size == 1:
             sharding_strategy = ShardingStrategy.NO_SHARD
-        self.sharding_strategy = sharding_strategy or ShardingStrategy.FULL_SHARD
-        self.mixed_precision = mixed_precision or MixedPrecision()
-        self._use_orig_params = use_orig_params
-        # Save a mapping from fully prefixed buffer name to its original dtype
-        # since for mixed precision, buffers are restored to their original
-        # dtype for model checkpointing
-        self._buffer_name_to_orig_dtype: Dict[str, torch.dtype] = {}
-
-        _check_single_device_module(module, ignored_params)
-        device_from_device_id = _get_device_from_device_id(device_id, self.rank)
-        _materialize_module(
-            module,
-            param_init_fn,
-            ignored_params,
-            device_from_device_id,
-            lambda k: not isinstance(k, FullyShardedDataParallel),
+        backward_prefetch_limit = 1
+        forward_prefetch_limit = 1
+        _init_core_state(
+            self,
+            sharding_strategy,
+            mixed_precision,
+            cpu_offload,
+            limit_all_gathers,
+            use_orig_params,
+            backward_prefetch_limit,
+            forward_prefetch_limit,
         )
-        _move_module_to_device(module, ignored_params, device_from_device_id)
-        self.compute_device = _get_compute_device(
+        _init_module_and_device_state(
+            self,
             module,
-            ignored_params,
-            device_from_device_id,
-            self.rank,
-        )
-        params_to_flatten = list(_get_orig_params(module, ignored_params))
-        if sync_module_states:
-            _sync_module_states(module, params_to_flatten, self.process_group)
-
-        # This FSDP instance's handles should inherit the same process group,
-        # compute device, CPU offload, and mixed precision settings. However,
-        # different sharding strategies are allowed.
-        config = HandleConfig(
-            sharding_strategy_map[self.sharding_strategy],
-            self.cpu_offload.offload_params,
-            self.mixed_precision.param_dtype,
-            self.mixed_precision.reduce_dtype,
-            self.mixed_precision.keep_low_precision_grads,
+            device_id,
+            param_init_fn,
+            sync_module_states,
         )
-        # Invariant: `self.params` contains exactly the `FlatParameter`s of the
-        # handles in `self._handles`
-        self._handles: List[FlatParamHandle] = []
-        self.params: List[FlatParameter] = []
+        _init_prefetching_state(self, backward_prefetch, forward_prefetch)
+        _init_runtime_state(self)
         self._fsdp_wrapped_module = module
-        if params_to_flatten:
-            handle = FlatParamHandle(
-                params_to_flatten,
-                module,
-                self.compute_device,
-                config,
-                self.process_group,
-                use_orig_params,
-            )
-            self._handles.append(handle)
-            self.params.append(handle.flat_param)
-            self._register_param_handle(handle)
-            handle.shard()
-            if (
-                self.cpu_offload.offload_params
-                and handle.flat_param.device != torch.device("cpu")
-            ):
-                handle.flat_param_to(torch.device("cpu"))
+        _init_param_handle_from_params(self, self._managed_params, module)
         if not use_orig_params:
-            _check_orig_params_flattened(self, ignored_params)
+            _check_orig_params_flattened(self, self._ignored_params)
             self._register_flat_param()
 
-        self._sync_gradients = True
-        self._communication_hook = self._get_default_comm_hook()
-        self._communication_hook_state = self._get_default_comm_hook_state()
-        self._hook_registered = False
-
-        # Used to prevent running the pre-backward hook multiple times
-        self._ran_pre_backward_hook: Dict[_HandlesKey, bool] = {}
-        self._is_root: Optional[bool] = None  # `None` indicates not yet set
-        # The following attributes are owned by the root FSDP instance and
-        # shared with non-root FSDP instances
-        self._streams: Dict[str, torch.cuda.Stream] = {}
-        self._free_event_queue = _FreeEventQueue()
-        self._debug_level = dist.get_debug_level()
-        self._exec_order_data = _ExecOrderData(
-            self._debug_level,
-            backward_prefetch_limit,
-            forward_prefetch_limit,
-        )
-        self._handles_prefetched: Dict[_HandlesKey, bool] = {}
-        # Used for guarding against mistargeted backward prefetches
-        self._needs_pre_backward_unshard: Dict[_HandlesKey, bool] = {}
-        # Used for guarding against mistargeted forward prefetches
-        self._needs_pre_forward_unshard: Dict[_HandlesKey, bool] = {}
-        # The data structures use tuples of handles to generalize over the case
-        # where a module's forward involves multiple handles.
+        # TODO (revisit): I explicitly delete these because we do want to keep
+        # references to these from FSDP. I only added them to the state for
+        # convenience in this refactoring.
+        delattr(self, "_ignored_params")
+        delattr(self, "_managed_params")
 
         # `_state_dict_type` controls the `state_dict()` behavior, which is
         # implemented using post-save and pre-load hooks
+        _init_state_dict_state(self)  # TODO: currently a no-op; need to refactor below
         self._state_dict_type = StateDictType.FULL_STATE_DICT
         self._state_dict_config = FullStateDictConfig()
         self._register_state_dict_hook(_post_state_dict_hook)
@@ -992,47 +532,6 @@ def __init__(
         )
         self.register_load_state_dict_post_hook(_post_load_state_dict_hook)
 
-    def _auto_wrap(
-        self,
-        auto_wrap_kwargs: Dict[str, Any],
-        fsdp_kwargs: Dict[str, Any],
-    ) -> None:
-        """
-        Recursively auto wraps the root module given by the key "module" in
-        ``auto_wrap_kwargs`` with the arguments in ``auto_wrap_kwargs`` and
-        ``fsdp_kwargs``.
-
-        Precondition: ``auto_wrap_policy`` contains the arguments expected by
-        ``_recursive_wrap()``, where ``auto_wrap_policy`` is not ``None``.
-        ``fsdp_kwargs`` contains all FSDP arguments except ``module``.
-        """
-        auto_wrap_policy = auto_wrap_kwargs["auto_wrap_policy"]
-        root_module = auto_wrap_kwargs["module"]
-        assert auto_wrap_policy is not None
-        # For auto wrapping, submodules should not already be wrapped with FSDP
-        # since double wrapping is not supported
-        for module_name, module in root_module.named_modules():
-            if isinstance(module, FullyShardedDataParallel):
-                raise ValueError(
-                    f"Expected {module_name} to NOT be FullyShardedDataParallel "
-                    "if using an `auto_wrap_policy`"
-                )
-        mixed_precision = fsdp_kwargs["mixed_precision"]
-        if mixed_precision is not None and _contains_batchnorm(root_module):
-            _override_batchnorm_mixed_precision(root_module)
-            auto_wrap_policy = functools.partial(
-                _or_policy, policies=[_wrap_batchnorm_individually, auto_wrap_policy]
-            )
-            warnings.warn(
-                "Both mixed precision and an `auto_wrap_policy` were specified "
-                "for FSDP, where the wrapped module has batch norm submodules. "
-                "The batch norm submodules will be wrapped as separate FSDP "
-                "instances with mixed precision disabled since some batch norm "
-                "kernels do not support low precision."
-            )
-            auto_wrap_kwargs["auto_wrap_policy"] = auto_wrap_policy
-        _recursive_wrap(**auto_wrap_kwargs, **fsdp_kwargs)
-
     def _register_param_handle(self, handle: FlatParamHandle) -> None:
         """Registers the parameter handle to this FSDP instance."""
         if handle not in self._handles:
@@ -1141,7 +640,9 @@ def __getattr__(self, name: str) -> Any:
 
     def __getitem__(self, key: int) -> Any:
         """Forward indexing calls in case the module is an ``nn.Sequential``."""
-        return self._fsdp_wrapped_module.__getitem__(key)  # type: ignore[operator]
+        if hasattr(self, FSDP_WRAPPED_MODULE):
+            return self._fsdp_wrapped_module.__getitem__(key)  # type: ignore[operator]
+        return super().__getitem__(key)
 
     def check_is_root(self) -> bool:
         self._lazy_init()
@@ -3547,21 +3048,6 @@ def rekey_optim_state_dict(
             return new_osd
         return new_osd  # should never reach here
 
-    def _get_default_comm_hook(self) -> Any:
-        r"""
-        Returns a default communication hook based on a sharding strategy.
-        """
-        if self.sharding_strategy != ShardingStrategy.NO_SHARD:
-            return default_hooks.reduce_scatter_hook
-        else:
-            return default_hooks.allreduce_hook
-
-    def _get_default_comm_hook_state(self) -> Any:
-        r"""
-        Returns a default communication hook state based on a sharding strategy.
-        """
-        return default_hooks.DefaultState(process_group=self.process_group)
-
     def register_comm_hook(self, state: object, hook: callable):
         """
         Registers a communication hook which is an enhancement that provides a
@@ -3611,7 +3097,7 @@ def register_comm_hook(self, state: object, hook: callable):
             ), "communication hook can be only registered once"
             submodule._hook_registered = True
             assert (
-                submodule._communication_hook == self._get_default_comm_hook()
+                submodule._communication_hook == _get_default_comm_hook(self.sharding_strategy)
             ), f"communication hook should be default, but it is {submodule._communication_hook.__name__} instead"
             submodule._communication_hook_state = state
             submodule._communication_hook = hook

From 9148a977c7889497252a8c4d1fbcd2bb6d5135aa Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 31 Oct 2022 20:54:52 +0000
Subject: [PATCH 0386/1922] [FSDP()][10/N][11/N] Introduce composable (ctor
 only) (#87924)

This PR introduces the composable FSDP API (with constructor semantics only) along with some further constructor refactoring. A notable contribution here is `_get_submodule_to_states()`, which performs auto wrapping without actually wrapping.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87924
Approved by: https://github.com/mrshenli
---
 test/distributed/fsdp/test_composable_fsdp.py | 215 ++++++++++++++++
 test/distributed/fsdp/test_fsdp_misc.py       |   5 +-
 test/distributed/fsdp/test_utils.py           | 109 ++++++++
 torch/distributed/fsdp/_fsdp.py               |  69 +++++
 torch/distributed/fsdp/_init_utils.py         | 238 ++++++++++++++----
 torch/distributed/fsdp/_wrap_utils.py         | 112 ++++++++-
 torch/distributed/fsdp/flat_param.py          |   5 +-
 .../fsdp/fully_sharded_data_parallel.py       |  23 +-
 8 files changed, 712 insertions(+), 64 deletions(-)
 create mode 100644 test/distributed/fsdp/test_composable_fsdp.py
 create mode 100644 torch/distributed/fsdp/_fsdp.py

diff --git a/test/distributed/fsdp/test_composable_fsdp.py b/test/distributed/fsdp/test_composable_fsdp.py
new file mode 100644
index 0000000000000..dd21e8d06ee40
--- /dev/null
+++ b/test/distributed/fsdp/test_composable_fsdp.py
@@ -0,0 +1,215 @@
+# Owner(s): ["oncall: distributed"]
+
+import copy
+import functools
+import sys
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp._common_utils import _is_fsdp_flattened
+from torch.distributed.fsdp._fsdp import fully_sharded_data_parallel
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_fsdp import FSDPTest
+from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
+)
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+if TEST_WITH_DEV_DBG_ASAN:
+    print(
+        "Skip dev-asan as torch + multiprocessing spawn have known issues",
+        file=sys.stderr,
+    )
+    sys.exit(0)
+
+
+class SubModel(nn.Module):
+    def __init__(self, device) -> None:
+        super().__init__()
+        torch.manual_seed(0)
+        self.lin1 = nn.Linear(5, 5, bias=False, device=device)
+        self.lin2 = nn.Linear(5, 5, bias=False, device=device)
+        self.relu = nn.ReLU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        z = self.relu(self.lin1(x))
+        z = self.relu(self.lin2(z))
+        return z
+
+
+class Model(nn.Module):
+    def __init__(self, device) -> None:
+        super().__init__()
+        torch.manual_seed(0)
+        self.sub1 = SubModel(device=device)
+        self.sub2 = SubModel(device=device)
+        self.lin = nn.Linear(5, 5, device=device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        z = self.sub1(x)
+        z = self.sub2(z)
+        z = self.lin(z)
+        return z
+
+    @staticmethod
+    def auto_wrap_policy():
+        return functools.partial(
+            transformer_auto_wrap_policy, transformer_layer_cls={SubModel}
+        )
+
+
+class TestFSDPInitialization(FSDPTest):
+    """Tests composable FSDP initialization."""
+
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    @skip_if_lt_x_gpu(2)
+    def test_auto_wrap_policy(self):
+        """Tests passing an ``auto_wrap_policy``."""
+
+        local_model = Model(device=torch.device("cuda"))
+
+        fsdp_wrapped_model = FSDP(
+            copy.deepcopy(local_model),
+            auto_wrap_policy=Model.auto_wrap_policy(),
+            use_orig_params=True,
+        )
+        composable_module = copy.deepcopy(local_model)
+        fsdp_state = fully_sharded_data_parallel(
+            composable_module,
+            auto_wrap_policy=Model.auto_wrap_policy(),
+        )
+
+        # Check that the composable module has the same names as the local
+        # model and the same sharded parameters as the FSDP-wrapped model
+        for (
+            (local_name, _),
+            (composable_name, composable_param),
+            (_, fsdp_wrapped_param),
+        ) in zip(
+            local_model.named_parameters(),
+            composable_module.named_parameters(),
+            fsdp_wrapped_model.named_parameters(),
+        ):
+            self.assertEqual(local_name, composable_name)
+            self.assertEqual(fsdp_wrapped_param, composable_param)
+
+        # Check that the composable module has the same  `FlatParameter`
+        # construction as the FSDP-wrapped model
+        composable_handles = fsdp_state._handles
+        fsdp_wrapped_handles = FSDP._fsdp_handles(fsdp_wrapped_model)
+        self.assertEqual(len(composable_handles), len(fsdp_wrapped_handles))
+        for (composable_handle, fsdp_wrapped_handle) in zip(
+            composable_handles, fsdp_wrapped_handles
+        ):
+            self.assertEqual(
+                composable_handle.flat_param.shape, fsdp_wrapped_handle.flat_param.shape
+            )
+
+        # Check that the composable module does not add any wrapper class
+        local_module_classes = set()
+        composable_module_classes = set()
+        for submodule in local_model.modules():
+            local_module_classes.add(type(submodule))
+        for submodule in composable_module.modules():
+            composable_module_classes.add(type(submodule))
+        self.assertEqual(local_module_classes, composable_module_classes)
+
+    @skip_if_lt_x_gpu(2)
+    def test_device_id(self):
+        """Tests passing a ``device_id``."""
+        cpu_device = torch.device("cpu")
+        composable_module = Model(device=cpu_device)
+        for param in composable_module.parameters():
+            assert param.device == cpu_device
+        fully_sharded_data_parallel(
+            composable_module,
+            auto_wrap_policy=Model.auto_wrap_policy(),
+            device_id=self.rank,
+        )
+        for param in composable_module.parameters():
+            self.assertEqual(param.device, torch.device("cuda", self.rank))
+
+    @skip_if_lt_x_gpu(2)
+    def test_sync_module_states(self):
+        """Tests passing ``sync_module_states=True``."""
+        local_model = Model(device=torch.device("cuda"))
+        composable_module = copy.deepcopy(local_model)
+        # Check that the parameters are broadcast from rank 0 by comparing
+        # against an equivalent FSDP-wrapped module
+        if self.rank != 0:
+            for param in composable_module.parameters():
+                with torch.no_grad():
+                    param.zero_()
+        fsdp_wrapped_model = FSDP(
+            copy.deepcopy(local_model),
+            auto_wrap_policy=Model.auto_wrap_policy(),
+            use_orig_params=True,
+        )
+        fully_sharded_data_parallel(
+            composable_module,
+            auto_wrap_policy=Model.auto_wrap_policy(),
+            sync_module_states=True,
+        )
+        for (composable_param, fsdp_wrapped_param) in zip(
+            composable_module.parameters(),
+            fsdp_wrapped_model.parameters(),
+        ):
+            self.assertEqual(composable_param, fsdp_wrapped_param)
+
+    @skip_if_lt_x_gpu(2)
+    def test_materialize_meta_module(self):
+        """Tests materializing a meta-device module."""
+
+        def _param_init_fn(module: nn.Module):
+            """
+            This is an example ``param_init_fn`` for composable FSDP.
+
+            TODO: This function is not satisfactory because this requires
+            guarding with ``_is_fsdp_flattened()``. This guard is needed to
+            avoid re-initializing parameters for nested cases since some
+            initialization methods strictly require non-1D shape (e.g.
+            ``kaiming_uniform_()``), while FSDP replaces the original
+            parameters with their 1D shards.
+            """
+            is_meta = any(param.is_meta for param in module.parameters())
+            if is_meta:
+                module.to_empty(device=torch.cuda.current_device())
+            torch.manual_seed(0)
+            for param in module.parameters():
+                if not _is_fsdp_flattened(param):
+                    nn.init.uniform_(param)
+
+        composable_module = Model(device="meta")
+        fsdp_wrapped_model = FSDP(
+            Model(device="meta"),
+            auto_wrap_policy=Model.auto_wrap_policy(),
+            param_init_fn=_param_init_fn,
+            use_orig_params=True,
+        )
+        fully_sharded_data_parallel(
+            composable_module,
+            auto_wrap_policy=Model.auto_wrap_policy(),
+            param_init_fn=_param_init_fn,
+        )
+        for (composable_param, fsdp_wrapped_param) in zip(
+            composable_module.parameters(),
+            fsdp_wrapped_model.parameters(),
+        ):
+            self.assertEqual(composable_param, fsdp_wrapped_param)
+
+
+instantiate_parametrized_tests(TestFSDPInitialization)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index 98cd6488ae5e7..234af4f7a94f9 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -405,7 +405,7 @@ def test_fsdp_cpu_init_stays_on_cpu(self):
         module is on CPU after FSDP initialization, albeit after loging a
         warning, and that FSDP moves CPU input to GPU before the forward."""
         torch.cuda.set_device(self.rank)
-        regex = "Module is put on CPU"
+        regex = "passed-in `module` is on CPU"
         context = self.assertWarnsRegex(
             expected_warning=UserWarning, expected_regex=regex
         )
@@ -437,8 +437,7 @@ def test_cpu_init_with_sync_module_states(self):
             CUDAInitMode.CUDA_NEVER,
         )
         with self.assertRaisesRegex(
-            ValueError,
-            "Module has CPU parameters, but sync_module_states=True is specified.",
+            ValueError, "The module has CPU parameters when `sync_module_states=True`"
         ):
             FSDP(nested_wrapped_module, self.process_group, sync_module_states=True)
 
diff --git a/test/distributed/fsdp/test_utils.py b/test/distributed/fsdp/test_utils.py
index 6ac2f78be7150..e797325ccbc99 100644
--- a/test/distributed/fsdp/test_utils.py
+++ b/test/distributed/fsdp/test_utils.py
@@ -1,16 +1,20 @@
 # Owner(s): ["oncall: distributed"]
 
+import functools
 import random
 import sys
 import unittest
 from collections import OrderedDict
 from dataclasses import dataclass
+from enum import auto, Enum
 from typing import List
 
 import torch
 import torch.nn as nn
 from torch import distributed as dist
 from torch.distributed.fsdp._utils import _apply_to_tensors
+from torch.distributed.fsdp._wrap_utils import _get_submodule_to_states
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
 from torch.distributed.utils import _replace_by_prefix
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
@@ -115,7 +119,112 @@ def fill_fn(x):
         self.assertEqual(torch.sum(x), 0)
 
 
+class TestGetSubmoduleToStates(TestCase):
+    """Tests the function ``_get_submodule_to_states()``."""
+
+    class SharedParameterMode(Enum):
+        """
+        - ``PARENT_CHILD``: A parent submodule shares a parameter with a child
+        submodule.
+        - ``SIBLING``: Two sibling submodules share a parameter.
+        """
+
+        PARENT_CHILD = auto()
+        SIBLING = auto()  # TODO: not yet supported
+
+    class Model(nn.Module):
+        """Nested model with buffers and a shared parameter."""
+
+        def __init__(self, shared_parameter_mode) -> None:
+            super().__init__()
+            self.seq1 = nn.Sequential(
+                nn.Linear(5, 5, bias=False),
+                nn.Linear(5, 5, bias=False),
+            )
+            self.seq1.register_buffer("seq1_buffer", torch.randn((5,)))
+            self.lin = nn.Linear(5, 5, bias=False)
+            self.seq2 = nn.Sequential(
+                nn.Sequential(nn.Linear(5, 5, bias=False)), nn.Linear(5, 5, bias=False)
+            )
+            if (
+                shared_parameter_mode
+                == TestGetSubmoduleToStates.SharedParameterMode.PARENT_CHILD
+            ):
+                self.seq2[0][0].weight = self.lin.weight
+            elif (
+                shared_parameter_mode
+                == TestGetSubmoduleToStates.SharedParameterMode.SIBLING
+            ):
+                self.seq2[0][0].weight = self.seq1[0].weight
+            self.seq2[1].register_buffer("seq2_1_buffer", torch.randn((5,)))
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            return self.seq2(self.lin(self.seq1(x)))  # equivalent to one matmul
+
+    def test_module_wrap_policy(self):
+        """
+        Tests the module wrap policy on a nested model with buffers and a
+        shared parameter.
+
+        NOTE: This test is hard coded against ``Model``.
+        """
+        model = self.Model(TestGetSubmoduleToStates.SharedParameterMode.PARENT_CHILD)
+
+        # Compute the mapping from submodule to states according to a logical
+        # module wrap policy
+        module_classes = (nn.Sequential,)
+        auto_wrap_policy = functools.partial(
+            transformer_auto_wrap_policy, transformer_layer_cls=set(module_classes)
+        )
+        submodule_to_states = _get_submodule_to_states(
+            model, auto_wrap_policy, set(), set()
+        )
+        # Check the number of submodules with states in the mapping
+        num_submodules_with_states = sum(
+            isinstance(submodule, module_classes) for submodule in model.modules()
+        )  # explicitly show how to compute the expected number
+        if not isinstance(model, module_classes):
+            num_submodules_with_states += 1  # always include the root
+        assert num_submodules_with_states == 4, f"{num_submodules_with_states}"
+        self.assertEqual(len(submodule_to_states), num_submodules_with_states)
+
+        # Check the mapping, i.e. that the dict order follows a post-order
+        # traversal and that the contents are expected
+        submodules = list(submodule_to_states.keys())
+        # - Root module `model`
+        self.assertEqual(submodules[0], model)
+        root_states = submodule_to_states[submodules[0]]
+        self.assertEqual(root_states.params, [model.lin.weight])
+        self.assertEqual(root_states.param_names, ["lin.weight"])
+        self.assertEqual(root_states.buffers, [])
+        self.assertEqual(root_states.buffer_names, [])
+        # # - `seq2`
+        self.assertEqual(submodules[1], model.seq2)
+        seq2_states = submodule_to_states[submodules[1]]
+        self.assertEqual(seq2_states.params, [model.seq2[1].weight])
+        self.assertEqual(seq2_states.param_names, ["1.weight"])
+        self.assertEqual(seq2_states.buffers, [model.seq2[1].seq2_1_buffer])
+        self.assertEqual(seq2_states.buffer_names, ["1.seq2_1_buffer"])
+        # - `seq2[0]`
+        self.assertEqual(submodules[2], model.seq2[0])
+        seq2_0_states = submodule_to_states[submodules[2]]
+        self.assertEqual(seq2_0_states.params, [])  # shared parameter
+        self.assertEqual(seq2_0_states.param_names, [])
+        self.assertEqual(seq2_0_states.buffers, [])
+        self.assertEqual(seq2_0_states.buffer_names, [])
+        # - `seq1`
+        self.assertEqual(submodules[3], model.seq1)
+        seq1_states = submodule_to_states[submodules[3]]
+        self.assertEqual(
+            seq1_states.params, [model.seq1[0].weight, model.seq1[1].weight]
+        )
+        self.assertEqual(seq1_states.param_names, ["0.weight", "1.weight"])
+        self.assertEqual(seq1_states.buffers, [model.seq1.seq1_buffer])
+        self.assertEqual(seq1_states.buffer_names, ["seq1_buffer"])
+
+
 instantiate_parametrized_tests(TestUtils)
+instantiate_parametrized_tests(TestGetSubmoduleToStates)
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/fsdp/_fsdp.py b/torch/distributed/fsdp/_fsdp.py
new file mode 100644
index 0000000000000..ac8b1cd4d7a08
--- /dev/null
+++ b/torch/distributed/fsdp/_fsdp.py
@@ -0,0 +1,69 @@
+from typing import Callable, cast, Iterable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed.fsdp._common_utils import _State, FSDPState
+from torch.distributed.fsdp._init_utils import (
+    _init_buffer_state,
+    _init_core_state,
+    _init_ignored_module_states,
+    _init_param_handles_from_module,
+    _init_prefetching_state,
+    _init_process_group_state,
+    _init_runtime_state,
+    _init_state_dict_state,
+)
+from torch.distributed.fsdp.api import (
+    BackwardPrefetch,
+    CPUOffload,
+    MixedPrecision,
+    ShardingStrategy,
+)
+
+
+def fully_sharded_data_parallel(
+    module: nn.Module,
+    process_group: Optional[dist.ProcessGroup] = None,
+    sharding_strategy: Optional[ShardingStrategy] = None,
+    mixed_precision: Optional[MixedPrecision] = None,
+    cpu_offload: Optional[CPUOffload] = None,
+    auto_wrap_policy: Optional[Callable] = None,
+    ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
+    device_id: Optional[Union[int, torch.device]] = None,
+    param_init_fn: Optional[Callable[[nn.Module], None]] = None,
+    sync_module_states: bool = False,
+) -> FSDPState:
+    """
+    Applies ``FullyShardedDataParallel` (FSDP) semantics to ``module``.
+    """
+    state = cast(_State, FSDPState())
+    state = _init_ignored_module_states(state, module, ignored_modules)
+    state = _init_process_group_state(state, process_group)
+    limit_all_gathers = True
+    use_orig_params = True
+    backward_prefetch_limit = 1
+    forward_prefetch_limit = 1
+    state = _init_core_state(
+        state,
+        sharding_strategy,
+        mixed_precision,
+        cpu_offload,
+        limit_all_gathers,
+        use_orig_params,
+        backward_prefetch_limit,
+        forward_prefetch_limit,
+    )
+    state = _init_runtime_state(state)
+    state = _init_prefetching_state(state, BackwardPrefetch.BACKWARD_PRE, False)
+    state = _init_buffer_state(state, module)
+    state = _init_param_handles_from_module(
+        state,
+        module,
+        auto_wrap_policy,
+        device_id,
+        param_init_fn,
+        sync_module_states,
+    )
+    state = _init_state_dict_state(state)
+    return cast(FSDPState, state)
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 7d634187809f0..5d1117d39f6d5 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -10,6 +10,7 @@
     Optional,
     Set,
     Tuple,
+    Type,
     Union,
 )
 
@@ -29,6 +30,7 @@
 )
 from torch.distributed.fsdp._exec_order_utils import _ExecOrderData
 from torch.distributed.fsdp._limiter_utils import _FreeEventQueue
+from torch.distributed.fsdp._wrap_utils import _get_submodule_to_states
 from torch.distributed.fsdp.api import (
     BackwardPrefetch,
     CPUOffload,
@@ -97,12 +99,9 @@ def _init_ignored_module_states(
 
 
 @no_type_check
-def _init_module_and_device_state(
+def _init_buffer_state(
     state: _State,
     module: nn.Module,
-    device_id: Optional[Union[int, torch.device]],
-    param_init_fn: Optional[Callable[[nn.Module], None]],
-    sync_module_states: bool,
 ) -> _State:
     state._buffer_names = _get_buffer_names(module)
     # Save a mapping from fully prefixed buffer name to its original dtype
@@ -110,27 +109,6 @@ def _init_module_and_device_state(
     # their original dtype for model checkpointing
     _buffer_name_to_orig_dtype: Dict[str, torch.dtype] = {}
     state._buffer_name_to_orig_dtype = _buffer_name_to_orig_dtype
-    _check_single_device_module(module, state._ignored_params)
-    device_from_device_id = _get_device_from_device_id(device_id, state.rank)
-    _materialize_module(
-        module,
-        param_init_fn,
-        state._ignored_params,
-        device_from_device_id,
-        lambda _: True,
-    )
-    # TODO: We need to skip this for functional-like to avoid moving the entire
-    # unsharded module onto GPU before any sharding.
-    _move_module_to_device(module, state._ignored_params, device_from_device_id)
-    state.compute_device = _get_compute_device(
-        module,
-        state._ignored_params,
-        device_from_device_id,
-        state.rank,
-    )
-    state._managed_params = list(_get_orig_params(module, state._ignored_params))
-    if sync_module_states:
-        _sync_module_states(module, state._managed_params, state.process_group)
     return state
 
 
@@ -218,6 +196,105 @@ def _init_state_dict_state(state: _State) -> _State:
     return state
 
 
+@no_type_check
+def _init_param_handle_from_module(
+    state: _State,
+    root_module: nn.Module,
+    device_id: Optional[Union[int, torch.device]],
+    param_init_fn: Optional[Callable[[nn.Module], None]],
+    sync_module_states: bool,
+    module_wrapper_cls: Type,
+) -> _State:
+    """
+    Initializes a ``FlatParamHandle`` from a module ``root_module``. This is
+    the module wrapper code path.
+    """
+    _check_single_device_module(root_module, state._ignored_params)
+    device_from_device_id = _get_device_from_device_id(device_id, state.rank)
+    _materialize_module(
+        root_module,
+        param_init_fn,
+        state._ignored_params,
+        device_from_device_id,
+        lambda k: not isinstance(k, module_wrapper_cls),
+    )
+    # TODO: Investigate refactoring `_move_module_to_device()` to
+    # `_move_states_to_device()` to avoid the `device_id` + CPU offload hack
+    _move_module_to_device(root_module, state._ignored_params, device_from_device_id)
+    state.compute_device = _get_compute_device(
+        root_module,
+        state._ignored_params,
+        device_from_device_id,
+        state.rank,
+    )
+    managed_params = list(_get_orig_params(root_module, state._ignored_params))
+    if sync_module_states:
+        _sync_module_params_and_buffers(
+            root_module, managed_params, state.process_group
+        )
+    _init_param_handle_from_params(state, managed_params, root_module)
+    return state
+
+
+@no_type_check
+def _init_param_handles_from_module(
+    state: _State,
+    root_module: nn.Module,
+    auto_wrap_policy: Callable,
+    device_id: Optional[Union[int, torch.device]],
+    param_init_fn: Optional[Callable[[nn.Module], None]],
+    sync_module_states: bool,
+) -> _State:
+    """
+    Initializes all ``FlatParamHandle`` s from a module ``root_module``. This
+    is the non-module-wrapper code path.
+    """
+    submodule_to_states = _get_submodule_to_states(
+        root_module,
+        auto_wrap_policy,
+        state._ignored_modules,
+        state._ignored_params,
+    )
+    _check_single_device_module(root_module, state._ignored_params)
+    device_from_device_id = _get_device_from_device_id(device_id, state.rank)
+    # Initialize and shard `FlatParamHandle`s one by one following bottom-up
+    # order (hence the `reversed`) to avoid increasing peak GPU memory usage
+    materialized_module = False
+    for submodule, (params, buffers, param_names, buffer_names) in reversed(
+        submodule_to_states.items()
+    ):
+        materialized_module |= _materialize_module(
+            submodule,
+            param_init_fn,
+            state._ignored_params,
+            device_from_device_id,
+            lambda _: True,
+        )
+        if materialized_module:
+            # Materializing from meta device can change the parameter/buffer
+            # variables, so reacquire references
+            params = [submodule.get_parameter(param_name) for param_name in param_names]
+            buffers = [
+                submodule.get_buffer(buffer_name) for buffer_name in buffer_names
+            ]
+        _move_states_to_device(params, buffers, device_from_device_id)
+        if not hasattr(state, "compute_device"):  # only need to set once
+            state.compute_device = _get_compute_device(
+                submodule,
+                state._ignored_params,
+                device_from_device_id,
+                state.rank,
+            )
+        if sync_module_states:
+            _sync_module_states(params, buffers, state.process_group)
+        # Pass `root_module` to have internal FQN metadata prefix starting from
+        # it instead of `submodule`
+        _init_param_handle_from_params(state, params, root_module)
+    # Reverse to preserve top-down order like `_fsdp_handles()`
+    state._handles.reverse()
+    return state
+
+
 @no_type_check
 def _init_param_handle_from_params(
     state: _State,
@@ -226,8 +303,6 @@ def _init_param_handle_from_params(
 ):
     if len(params) == 0:
         return
-    # TODO: Move module to GPU if needed (for non-wrapper code path) -- we need
-    # to fuse this method with `_init_module_and_device_state()`
     handle_config = HandleConfig(
         SHARDING_STRATEGY_MAP[state.sharding_strategy],
         state.cpu_offload.offload_params,
@@ -245,7 +320,6 @@ def _init_param_handle_from_params(
     )
     # TODO: Can simplify call `shard()` in the `FlatParamHandle` ctor
     handle.shard()
-    assert handle.flat_param not in state.params
     assert handle not in state._handles
     state.params.append(handle.flat_param)
     state._handles.append(handle)
@@ -397,7 +471,7 @@ def _materialize_module(
     ignored_params: Set[nn.Parameter],
     device_from_device_id: Optional[torch.device],
     deferred_init_check_fn: Callable,
-) -> None:
+) -> bool:
     """
     Materializes the wrapped module ``module`` in place if needed: either
     if the module has parameters that use meta device or are torchdistX
@@ -409,12 +483,17 @@ def _materialize_module(
     it is not ``None`` or the current device otherwise and calls
     ``reset_parameters()``, and for torchdistX fake tensors, this calls
     ``deferred_init.materialize_module()``.
+
+    Returns:
+        bool: ``True`` if ``module`` was materialized and ``False`` if this was
+        a no-op.
     """
-    is_meta_module = any(p.is_meta for p in _get_orig_params(module, ignored_params))
+    managed_params = _get_orig_params(module, ignored_params)
+    is_meta_module = any(param.is_meta for param in managed_params)
     is_torchdistX_deferred_init = (
         not is_meta_module
         and _TORCHDISTX_AVAIL
-        and any(fake.is_fake(p) for p in _get_orig_params(module, ignored_params))
+        and any(fake.is_fake(param) for param in managed_params)
     )
     if (is_meta_module or is_torchdistX_deferred_init) and param_init_fn is not None:
         if not callable(param_init_fn):
@@ -422,6 +501,7 @@ def _materialize_module(
                 f"Expected {param_init_fn} to be callable but got {type(param_init_fn)}"
             )
         param_init_fn(module)
+        return True
     elif is_meta_module:
         # Run default meta device initialization
         materialization_device = device_from_device_id or torch.device(
@@ -438,16 +518,19 @@ def _materialize_module(
                 "module implements a `reset_parameters()` method."
             )
             raise e
+        return True
     elif is_torchdistX_deferred_init:
         # Run default torchdistX initialization
         deferred_init.materialize_module(module, check_fn=deferred_init_check_fn)
+        return True
+    return False
 
 
 def _move_module_to_device(
     module: nn.Module,
     ignored_params: Set[nn.Parameter],
     device_from_device_id: Optional[torch.device],
-):
+) -> None:
     """
     Moves ``module`` depending on ``device_from_device_id`` and its current
     device. This includes moving ignored modules' parameters.
@@ -459,10 +542,10 @@ def _move_module_to_device(
 
     Precondition: ``_check_single_device_module()``.
     """
-    cpu_device = torch.device("cpu")
     param = next(_get_orig_params(module, ignored_params), None)
     if param is None:
         return  # no original parameters to manage
+    cpu_device = torch.device("cpu")
     if device_from_device_id is not None:
         if param.device == cpu_device:
             # NOTE: This includes moving ignored modules' parameters.
@@ -479,13 +562,47 @@ def _move_module_to_device(
                         for handle in submodule._handles:
                             handle.flat_param_to(torch.device("cpu"))
     elif param.device == cpu_device:
-        warnings.warn(
-            "Module is put on CPU and will thus have flattening and sharding"
-            " run on CPU, which is less efficient than on GPU. We recommend passing in "
-            "`device_id` argument which will enable FSDP to put module on GPU device,"
-            " module must also be on GPU device to work with `sync_module_states=True` flag"
-            " which requires GPU communication."
-        )
+        _warn_cpu_init()
+
+
+def _move_states_to_device(
+    params: List[nn.Parameter],
+    buffers: List[torch.Tensor],
+    device_from_device_id: Optional[torch.device],
+) -> None:
+    """
+    Precondition: ``_check_single_device_module()``.
+    """
+    if len(params) == 0 and len(buffers) == 0:
+        return
+    if len(params) > 0:
+        current_device = params[0].device
+    elif len(buffers) > 0:
+        current_device = buffers[0].device
+    cpu_device = torch.device("cpu")
+    if device_from_device_id is not None:
+        # Move the parameters and buffers like the `.data` code path in
+        # `nn.Module._apply()`, which underlies `nn.Module.to()`
+        for param in params:
+            with torch.no_grad():
+                param.data = param.to(device_from_device_id)
+                if param.grad is not None:
+                    param.grad.data = param.grad.to(device_from_device_id)
+        for buffer in buffers:
+            buffer.data = buffer.to(device_from_device_id)
+    elif current_device == cpu_device:
+        _warn_cpu_init()
+
+
+def _warn_cpu_init():
+    warnings.warn(
+        "The passed-in `module` is on CPU and will thus have FSDP's sharding "
+        "initialization run on CPU, which may be slower than on GPU. We "
+        "recommend passing in the `device_id` argument for FSDP to move "
+        "`module` to GPU for the sharding initialization. `module` must also "
+        "be on GPU device to work with the `sync_module_states=True` flag "
+        "since that requires GPU communication."
+    )
 
 
 def _get_compute_device(
@@ -522,7 +639,8 @@ def _get_compute_device(
     return compute_device
 
 
-def _sync_module_states(
+# TODO: See how to deprecate!
+def _sync_module_params_and_buffers(
     module: nn.Module,
     params: List[nn.Parameter],
     process_group: dist.ProcessGroup,
@@ -534,12 +652,7 @@ def _sync_module_states(
     Precondition: ``sync_module_states == True`` and ``self.process_group`` has
     been set.
     """
-    if params and any(param.device == torch.device("cpu") for param in params):
-        raise ValueError(
-            "Module has CPU parameters, but sync_module_states=True is specified."
-            "This only works for GPU module, please specify `device_id` argument or move"
-            " module to GPU before init."
-        )
+    _check_params_for_sync_module_states(params)
     module_states: List[torch.Tensor] = []
     # TODO (awgu): When exposing the original parameters, we need to also
     # use this attribute to prevent re-synchronizing parameters.
@@ -557,6 +670,37 @@ def _sync_module_states(
     )
 
 
+def _sync_module_states(
+    params: List[nn.Parameter],
+    buffers: List[torch.Tensor],
+    process_group: dist.ProcessGroup,
+) -> None:
+    _check_params_for_sync_module_states(params)
+    # Assumes that each call to this method passes in disjoint `params` and
+    # and `buffers` across calls, so there is no chance of re-synchronizing
+    params_and_buffers = [param.detach() for param in params] + [
+        buffer.detach() for buffer in buffers
+    ]
+    _sync_params_and_buffers(
+        process_group,
+        params_and_buffers,
+        PARAM_BROADCAST_BUCKET_SIZE,
+        src=0,
+    )
+
+
+def _check_params_for_sync_module_states(
+    params: List[nn.Parameter],
+) -> None:
+    if params and any(param.device == torch.device("cpu") for param in params):
+        raise ValueError(
+            "The module has CPU parameters when `sync_module_states=True`, "
+            "which only works when all parameters are on GPU. Please specify "
+            "the `device_id` argument or move the module to GPU before passing "
+            "into FSDP."
+        )
+
+
 def _get_orig_params(
     module: nn.Module,
     ignored_params: Set[nn.Parameter],
diff --git a/torch/distributed/fsdp/_wrap_utils.py b/torch/distributed/fsdp/_wrap_utils.py
index 00133b5c58a37..34d1c9c1ac243 100644
--- a/torch/distributed/fsdp/_wrap_utils.py
+++ b/torch/distributed/fsdp/_wrap_utils.py
@@ -1,7 +1,10 @@
+import collections
 import functools
 import warnings
-from typing import Any, Dict
+from typing import Any, Callable, Deque, Dict, List, NamedTuple, Set, Tuple
 
+import torch
+import torch.nn as nn
 from torch.distributed.fsdp._utils import (
     _contains_batchnorm,
     _override_batchnorm_mixed_precision,
@@ -13,6 +16,20 @@
 )
 
 
+class SubmoduleState(NamedTuple):
+    """
+    Submodule state for ``_get_submodule_to_states()``, representing a logical
+    grouping (e.g. parameters to be flattened together).
+    """
+
+    params: List[nn.Parameter]
+    buffers: List[torch.Tensor]
+    # Parameter and buffer names are prefixed starting from the submodule,
+    # which is not necessarily the root module
+    param_names: List[str]
+    buffer_names: List[str]
+
+
 def _auto_wrap(
     auto_wrap_kwargs: Dict[str, Any],
     fsdp_kwargs: Dict[str, Any],
@@ -53,3 +70,96 @@ def _auto_wrap(
         )
         auto_wrap_kwargs["auto_wrap_policy"] = auto_wrap_policy
     _recursive_wrap(**auto_wrap_kwargs, **fsdp_kwargs)
+
+
+def _get_submodule_to_states(
+    root_module: nn.Module,
+    auto_wrap_policy: Callable,
+    ignored_modules: Set[nn.Module],
+    ignored_params: Set[nn.Parameter],
+) -> Dict[nn.Module, SubmoduleState]:
+    """
+    Returns a mapping from submodule to its parameters, buffers, parameter
+    names, and buffer names, where each entry logically represents a grouping
+    according to the given auto wrap policy and ignored modules/parameters.
+    However, this method does not actually perform any module wrapping.
+
+    The mapped-to values are the states from the subtree rooted at the
+    corresponding submodule key, excluding child submodules in the mapping and
+    ignored state. Sibling submodules cannot be grouped together. The parameter
+    and buffer names are prefixed starting from the submodule.
+
+    Each non-ignored parameter and buffer appears exactly once in the returned
+    ``dict``, and the ``dict`` is ordered by increasing tree depth. A mapped-to
+    parameter list may be empty if the submodule has no parameters or if its
+    parameters were assigned to a parent submodule instead.
+    """
+    # Record the modules to wrap without actually wrapping
+    wrapped_modules: List[nn.Module] = []  # these are only logically wrapped
+    wrapper_cls = functools.partial(_record_module_wrapper_cls, wrapped_modules)
+    _recursive_wrap(
+        root_module,
+        auto_wrap_policy=auto_wrap_policy,
+        wrapper_cls=wrapper_cls,
+        ignored_modules=ignored_modules,
+        ignored_params=ignored_params,
+        only_wrap_children=False,
+    )
+    # Always include the root module even if not wrapped by the given policy
+    if root_module not in wrapped_modules:
+        wrapped_modules.append(root_module)
+
+    submodule_to_states = collections.OrderedDict()
+    visited_params = set()
+    for ignored_param in ignored_params:
+        visited_params.add(ignored_param)
+    visited_buffers = set()
+    # Constructing `wrapped_modules` with `_recursive_wrap()` follows a
+    # post-order traversal. We record state in `submodule_to_states` using a
+    # reverse post-ordering since that is a topological sort. This assigns
+    # parent-child shared parameters to the parent submodule.
+    # TODO: To handle sibling shared parameters, we need to pre-compute the
+    # shared parameters and assign them to the LCA submodule manually.
+    wrapped_modules.reverse()
+    wrapped_modules_set = set(wrapped_modules)
+    for submodule in wrapped_modules:
+        # Perform a BFS from `submodule` and record all unvisited state that is
+        # not already associated with another module in `wrapped_modules`.
+        queue: Deque[Tuple[nn.Module, str]] = collections.deque()
+        queue.append((submodule, ""))
+        params: List[nn.Parameter] = []
+        param_names: List[str] = []
+        buffers: List[torch.Tensor] = []
+        buffer_names: List[str] = []
+        while len(queue) > 0:
+            module, prefix = queue.popleft()
+            for param_name, param in module.named_parameters(recurse=False):
+                if param not in visited_params:
+                    params.append(param)
+                    visited_params.add(param)
+                    param_names.append(prefix + param_name)
+            for buffer_name, buffer in module.named_buffers(recurse=False):
+                if buffer not in visited_buffers:
+                    buffers.append(buffer)
+                    visited_buffers.add(buffer)
+                    buffer_names.append(prefix + buffer_name)
+            for child_module_name, child_module in module.named_children():
+                if child_module not in wrapped_modules_set:
+                    queue.append((child_module, prefix + child_module_name + "."))
+        submodule_to_states[submodule] = SubmoduleState(
+            params, buffers, param_names, buffer_names
+        )
+    return submodule_to_states
+
+
+def _record_module_wrapper_cls(
+    wrapped_modules: List[nn.Module],
+    module: nn.Module,
+    **kwargs,
+) -> nn.Module:
+    """
+    This defines a wrapper class to be passed to ``_recursive_wrap()`` that
+    records the wrapped module to the input ``wrapped_modules``.
+    """
+    wrapped_modules.append(module)
+    return module
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 376e0d4ff51b9..5fd130826faa6 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -415,7 +415,10 @@ def _init_flat_param(
                         else param_name
                     )
                     prefixed_param_names.append(prefixed_param_name)
-        assert requires_grad is not None
+        assert requires_grad is not None, (
+            "Passed-in `params` were not found in the module tree\n"
+            f"params: {params}\nmodule: {module}"
+        )
         self.flat_param = FlatParamHandle.flatten_params(
             params_to_flatten, requires_grad
         )
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index ef79a80c3102e..019fa678580e9 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -43,10 +43,10 @@
 from torch.distributed.fsdp._init_utils import (
     _check_orig_params_flattened,
     _get_default_comm_hook,
+    _init_buffer_state,
     _init_core_state,
     _init_ignored_module_states,
-    _init_module_and_device_state,
-    _init_param_handle_from_params,
+    _init_param_handle_from_module,
     _init_prefetching_state,
     _init_process_group_state,
     _init_runtime_state,
@@ -457,7 +457,6 @@ def __init__(
         super().__init__()
 
         _init_ignored_module_states(self, module, ignored_modules)
-
         if auto_wrap_policy is not None:
             auto_wrap_kwargs = {
                 "module": module,
@@ -500,26 +499,26 @@ def __init__(
             backward_prefetch_limit,
             forward_prefetch_limit,
         )
-        _init_module_and_device_state(
+        _init_runtime_state(self)
+        _init_prefetching_state(self, backward_prefetch, forward_prefetch)
+        _init_buffer_state(self, module)
+        _init_param_handle_from_module(
             self,
             module,
             device_id,
             param_init_fn,
             sync_module_states,
+            FullyShardedDataParallel,
         )
-        _init_prefetching_state(self, backward_prefetch, forward_prefetch)
-        _init_runtime_state(self)
         self._fsdp_wrapped_module = module
-        _init_param_handle_from_params(self, self._managed_params, module)
         if not use_orig_params:
             _check_orig_params_flattened(self, self._ignored_params)
             self._register_flat_param()
 
-        # TODO (revisit): I explicitly delete these because we do want to keep
-        # references to these from FSDP. I only added them to the state for
+        # TODO (revisit): I explicitly delete this because we do want to keep
+        # references to these from FSDP. I only added it to the state for
         # convenience in this refactoring.
         delattr(self, "_ignored_params")
-        delattr(self, "_managed_params")
 
         # `_state_dict_type` controls the `state_dict()` behavior, which is
         # implemented using post-save and pre-load hooks
@@ -3096,8 +3095,8 @@ def register_comm_hook(self, state: object, hook: callable):
                 not submodule._hook_registered
             ), "communication hook can be only registered once"
             submodule._hook_registered = True
-            assert (
-                submodule._communication_hook == _get_default_comm_hook(self.sharding_strategy)
+            assert submodule._communication_hook == _get_default_comm_hook(
+                self.sharding_strategy
             ), f"communication hook should be default, but it is {submodule._communication_hook.__name__} instead"
             submodule._communication_hook_state = state
             submodule._communication_hook = hook

From ed4c1d25f73e21a792b64e1fded84f9a3ae10b4c Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 31 Oct 2022 20:54:53 +0000
Subject: [PATCH 0387/1922] [FSDP()][12/N] Easy cleanup (#87925)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87925
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_init_utils.py              |  5 +++++
 .../fsdp/fully_sharded_data_parallel.py            | 14 +-------------
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 5d1117d39f6d5..0ff70dc7771e6 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -123,6 +123,11 @@ def _init_core_state(
     backward_prefetch_limit: int,
     forward_prefetch_limit: int,
 ) -> _State:
+    # We clamp the strategy to `NO_SHARD` for world size of 1 since they are
+    # currently functionally equivalent. This may change if/when we integrate
+    # FSDP with MoE.
+    if state.world_size == 1:
+        sharding_strategy = ShardingStrategy.NO_SHARD
     state.sharding_strategy = sharding_strategy or ShardingStrategy.FULL_SHARD
     state.mixed_precision = mixed_precision or MixedPrecision()
     state.cpu_offload = cpu_offload or CPUOffload()
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 019fa678580e9..08fdeceecaa85 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -482,11 +482,6 @@ def __init__(
             _auto_wrap(auto_wrap_kwargs, fsdp_kwargs, FullyShardedDataParallel)
 
         _init_process_group_state(self, process_group)
-        # We clamp the strategy to `NO_SHARD` for world size of 1 since they
-        # are currently functionally equivalent. This may change if/when we
-        # integrate FSDP with MoE.
-        if self.world_size == 1:
-            sharding_strategy = ShardingStrategy.NO_SHARD
         backward_prefetch_limit = 1
         forward_prefetch_limit = 1
         _init_core_state(
@@ -515,9 +510,7 @@ def __init__(
             _check_orig_params_flattened(self, self._ignored_params)
             self._register_flat_param()
 
-        # TODO (revisit): I explicitly delete this because we do want to keep
-        # references to these from FSDP. I only added it to the state for
-        # convenience in this refactoring.
+        # Delete to avoid keeping references after the constructor
         delattr(self, "_ignored_params")
 
         # `_state_dict_type` controls the `state_dict()` behavior, which is
@@ -531,11 +524,6 @@ def __init__(
         )
         self.register_load_state_dict_post_hook(_post_load_state_dict_hook)
 
-    def _register_param_handle(self, handle: FlatParamHandle) -> None:
-        """Registers the parameter handle to this FSDP instance."""
-        if handle not in self._handles:
-            self._handles.append(handle)
-
     def _unshard(
         self,
         handles: List[FlatParamHandle],

From daf1e482931cb8953581e00afc0d3e48970fd9b3 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 01:16:26 +0000
Subject: [PATCH 0388/1922] [FSDP()][13/N] Refactor unshard/reshard/grads
 (#87926)

This PR is not too complicated. We just move unshard/reshard/grads out to `_runtime_utils.py` and make them take `state: _State` instead of `self`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87926
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_runtime_utils.py      |  89 +++++++++++++-
 torch/distributed/fsdp/_utils.py              |   2 +-
 .../fsdp/fully_sharded_data_parallel.py       | 115 ++++--------------
 torch/testing/_internal/common_fsdp.py        |  21 ++--
 4 files changed, 126 insertions(+), 101 deletions(-)

diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 6a3c7c86c129d..4311e27bf74f3 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -1,11 +1,96 @@
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, no_type_check, Optional, Tuple
 
 import torch
-from torch.distributed.fsdp._utils import _apply_to_tensors
+from torch.distributed.fsdp._common_utils import _State
+from torch.distributed.fsdp._utils import _apply_to_tensors, p_assert
 from torch.distributed.fsdp.flat_param import FlatParamHandle
 from torch.distributed.utils import _to_kwargs
 
 
+@no_type_check
+def _unshard(
+    state: _State,
+    handles: List[FlatParamHandle],
+    unshard_stream: torch.cuda.Stream,
+    pre_unshard_stream: torch.cuda.Stream,
+) -> None:
+    """
+    Unshards the handles in ``handles``. If the handles are in
+    :meth:`summon_full_params` and are using mixed precision, then they are
+    forced to full precision.
+
+    Postcondition: Each handle's ``FlatParameter`` 's data is the padded
+    unsharded flattened parameter on the compute device.
+    """
+    if not handles:
+        return
+    if state.limit_all_gathers:
+        event = state._free_event_queue.dequeue_if_needed()
+        if event:
+            event.synchronize()
+    any_ran_pre_unshard = False
+    with torch.cuda.stream(pre_unshard_stream):
+        for handle in handles:
+            ran_pre_unshard = handle.pre_unshard()
+            any_ran_pre_unshard = any_ran_pre_unshard or ran_pre_unshard
+    if any_ran_pre_unshard:
+        unshard_stream.wait_stream(pre_unshard_stream)
+    with torch.cuda.stream(unshard_stream):
+        for handle in handles:
+            handle.unshard()
+            handle.post_unshard()
+
+
+@no_type_check
+def _reshard(
+    state: _State,
+    handles: List[FlatParamHandle],
+    free_unsharded_flat_params: List[bool],
+):
+    """
+    Reshards the handles in ``handles``. ``free_unsharded_flat_params`` should
+    have the same length as ``handles``, and each element should give whether
+    the corresponding handle should free its padded unsharded flattened
+    parameter.
+    """
+    if not handles:
+        return
+    p_assert(
+        len(handles) == len(free_unsharded_flat_params),
+        "Expects both lists to have equal length but got "
+        f"{len(handles)} and {len(free_unsharded_flat_params)}",
+    )
+    for handle, free_unsharded_flat_param in zip(
+        handles,
+        free_unsharded_flat_params,
+    ):
+        handle.reshard(free_unsharded_flat_param)
+        if state.limit_all_gathers and free_unsharded_flat_param:
+            free_event = torch.cuda.Event()
+            free_event.record()
+            state._free_event_queue.enqueue(free_event)
+        handle.post_reshard()
+    # Since we prefetch entire handles keys at a time, conservatively mark
+    # the entire key as no longer prefetched once we free at least one
+    handles_key = tuple(handles)
+    if any(free_unsharded_flat_params):
+        state._handles_prefetched.pop(handles_key, None)
+
+
+def _unshard_grads(
+    handles: List[FlatParamHandle],
+) -> None:
+    for handle in handles:
+        handle.unshard_grad()
+
+
+def _reshard_grads(
+    handles: List[FlatParamHandle],
+) -> None:
+    for handle in handles:
+        handle.reshard_grad()
+
+
 def _wait_for_computation_stream(
     computation_stream: torch.cuda.Stream,
     unshard_stream: torch.cuda.Stream,
diff --git a/torch/distributed/fsdp/_utils.py b/torch/distributed/fsdp/_utils.py
index cf3755e55c574..d3d8f91c61a42 100644
--- a/torch/distributed/fsdp/_utils.py
+++ b/torch/distributed/fsdp/_utils.py
@@ -107,7 +107,7 @@ def _same_storage(x: torch.Tensor, y: torch.Tensor) -> bool:
     return x.storage().data_ptr() == y.storage().data_ptr()
 
 
-def p_assert(cond: Any, s: Any, raise_assertion_error: bool = True) -> None:
+def p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> None:
     """This is used as an alternate to ``assert`` when in the backward context
     to print the error message ``s`` since otherwise, it is swallowed."""
     if not cond:
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 08fdeceecaa85..412a7e0ed5fe3 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -55,6 +55,10 @@
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
     _prepare_forward_inputs,
+    _reshard,
+    _reshard_grads,
+    _unshard,
+    _unshard_grads,
     _wait_for_computation_stream,
 )
 from torch.distributed.fsdp._wrap_utils import _auto_wrap
@@ -524,80 +528,6 @@ def __init__(
         )
         self.register_load_state_dict_post_hook(_post_load_state_dict_hook)
 
-    def _unshard(
-        self,
-        handles: List[FlatParamHandle],
-        unshard_stream: torch.cuda.Stream,
-        pre_unshard_stream: torch.cuda.Stream,
-    ) -> None:
-        """
-        Unshards the handles in ``handles``. If the handles are in
-        :meth:`summon_full_params` and are using mixed precision, then they are
-        forced to full precision.
-
-        Postcondition: Each handle's ``FlatParameter`` 's data is the padded
-        unsharded flattened parameter on the compute device.
-        """
-        if not handles:
-            return
-        if self.limit_all_gathers:
-            event = self._free_event_queue.dequeue_if_needed()
-            if event:
-                event.synchronize()
-        any_ran_pre_unshard = False
-        with torch.cuda.stream(pre_unshard_stream):
-            for handle in handles:
-                ran_pre_unshard = handle.pre_unshard()
-                any_ran_pre_unshard = any_ran_pre_unshard or ran_pre_unshard
-        if any_ran_pre_unshard:
-            unshard_stream.wait_stream(pre_unshard_stream)
-        with torch.cuda.stream(unshard_stream):
-            for handle in handles:
-                handle.unshard()
-                handle.post_unshard()
-
-    def _reshard(
-        self,  # unused
-        handles: List[FlatParamHandle],
-        free_unsharded_flat_params: List[bool],
-    ) -> None:
-        """
-        Reshards the handles in ``handles``. ``free_unsharded_flat_params``
-        should have the same length as ``handles``, and each element should
-        give whether the corresponding handle should free its padded unsharded
-        flattened parameter.
-        """
-        if not handles:
-            return
-        p_assert(
-            len(handles) == len(free_unsharded_flat_params),
-            "Expects both lists to have equal length but got "
-            f"{len(handles)} and {len(free_unsharded_flat_params)}",
-        )
-        for handle, free_unsharded_flat_param in zip(
-            handles,
-            free_unsharded_flat_params,
-        ):
-            handle.reshard(free_unsharded_flat_param)
-            if self.limit_all_gathers and free_unsharded_flat_param:
-                free_event = torch.cuda.Event()
-                free_event.record()
-                self._free_event_queue.enqueue(free_event)
-            handle.post_reshard()
-        # Since we prefetch entire handles keys at a time, conservatively mark
-        # the entire key as no longer prefetched once we free at least one
-        handles_key = tuple(handles)
-        if any(free_unsharded_flat_params):
-            self._handles_prefetched.pop(handles_key, None)
-
-    def _unshard_grads(self, handles: List[FlatParamHandle]) -> None:
-        for handle in handles:
-            handle.unshard_grad()
-
-    def _reshard_grads(self, handles: List[FlatParamHandle]) -> None:
-        for handle in handles:
-            handle.reshard_grad()
-
     @property
     def module(self) -> nn.Module:
         """
@@ -1010,8 +940,11 @@ def _prefetch_handles(
         for handles_key in handles_to_prefetch:
             # Prefetch the next set of handles without synchronizing to allow
             # the sync to happen as late as possible to maximize overlap
-            self._unshard(
-                handles_key, self._streams["unshard"], self._streams["pre_unshard"]
+            _unshard(
+                self,
+                handles_key,
+                self._streams["unshard"],
+                self._streams["pre_unshard"],
             )
             self._handles_prefetched[handles_key] = True
 
@@ -1353,7 +1286,8 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
                 for handle in self._handles
             ]
             reshard_fn = functools.partial(
-                self._reshard,
+                _reshard,
+                self,
                 self._handles,
                 free_unsharded_flat_params,
             )
@@ -1405,8 +1339,8 @@ def _pre_forward_unshard(
     ) -> None:
         """Unshards parameters in the pre-forward."""
         if handles:
-            self._unshard(
-                handles, self._streams["unshard"], self._streams["pre_unshard"]
+            _unshard(
+                self, handles, self._streams["unshard"], self._streams["pre_unshard"]
             )
             handles_key = tuple(handles)
             self._needs_pre_forward_unshard[handles_key] = False
@@ -1639,15 +1573,15 @@ def _summon_full_params(
         # No need to call `wait_stream()` since we unshard in the computation
         # stream directly
         computation_stream = torch.cuda.current_stream()
-        self._unshard(self._handles, computation_stream, computation_stream)
+        _unshard(self, self._handles, computation_stream, computation_stream)
         if with_grads:
-            self._unshard_grads(self._handles)
+            _unshard_grads(self._handles)
 
         if rank0_only and self.rank != 0:
             # Free the unsharded flattened parameter early
-            self._reshard(self._handles, free_unsharded_flat_params)
+            _reshard(self, self._handles, free_unsharded_flat_params)
             if with_grads:
-                self._reshard_grads(self._handles)
+                _reshard_grads(self._handles)
             try:
                 yield
             finally:
@@ -1674,9 +1608,9 @@ def _summon_full_params(
                     stack.close()
                     if writeback:
                         self._writeback_to_local_shard(self._handles, with_grads)
-                    self._reshard(self._handles, free_unsharded_flat_params)
+                    _reshard(self, self._handles, free_unsharded_flat_params)
                     if with_grads:
-                        self._reshard_grads(self._handles)
+                        _reshard_grads(self._handles)
                     self.training_state = TrainingState.IDLE
                     for handle in self._handles:
                         handle._training_state = HandleTrainingState.IDLE
@@ -1946,8 +1880,11 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None:
 
                 # If the handles have been prefetched, this `_unshard()` simply
                 # switches to using the unsharded parameter
-                self._unshard(
-                    _handles, self._streams["unshard"], self._streams["pre_unshard"]
+                _unshard(
+                    self,
+                    _handles,
+                    self._streams["unshard"],
+                    self._streams["pre_unshard"],
                 )
                 torch.cuda.current_stream().wait_stream(self._streams["unshard"])
 
@@ -2052,7 +1989,7 @@ def _post_backward_hook(
                 )
 
             free_unsharded_flat_param = self._should_free_unsharded_flat_param(handle)
-            self._reshard([handle], [free_unsharded_flat_param])
+            _reshard(self, [handle], [free_unsharded_flat_param])
 
             # TODO (awgu): Post-backward prefetching does not support the
             # multiple handles per module case (which was why we keyed by
@@ -2286,7 +2223,7 @@ def _catch_all_reshard(fsdp_module: FullyShardedDataParallel) -> None:
                         self._should_free_unsharded_flat_param(handle)
                     )
                     handles_to_reshard.append(handle)
-                self._reshard(handles_to_reshard, free_unsharded_flat_params)
+                _reshard(self, handles_to_reshard, free_unsharded_flat_params)
             except Exception as e:
                 p_assert(
                     False,
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index a6644df76d497..f54a3abeb9191 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -615,19 +615,22 @@ def forward(self, x):
         if self.delay_before_free_ms > 0:
             expert = self.module[2]
             if isinstance(expert, FSDP):
-                orig_reshard = self.module[2]._reshard
+                orig_reshard = torch.distributed.fsdp._runtime_utils._reshard
 
-                def _free_full_params_with_delay(*args):
+                def _delayed_reshard(*args, **kwargs):
                     torch.cuda._sleep(
                         int(self.delay_before_free_ms * get_cycles_per_ms())
                     )
-                    return orig_reshard(*args)
-
-                assert hasattr(
-                    expert, "_reshard"
-                ), "expert FSDP module should have a `_reshard()` method"
-                with mock.patch.object(
-                    expert, "_reshard", _free_full_params_with_delay
+                    return orig_reshard(*args, **kwargs)
+
+                # The first patch covers any `from torch... import _reshard`
+                # uses in `fully_sharded_data_parallel.py`, and the second
+                # patch covers any `import torch..._reshard` uses in general.
+                with mock.patch(
+                    "torch.distributed.fsdp.fully_sharded_data_parallel._reshard",
+                    _delayed_reshard,
+                ), mock.patch(
+                    "torch.distributed.fsdp._runtime_utils._reshard", _delayed_reshard
                 ):
                     return self.module(x)
 

From ae5dcf7af945f90ae57700541d9d29738e3c100a Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@meta.com>
Date: Tue, 1 Nov 2022 13:51:06 +0000
Subject: [PATCH 0389/1922] Update _distributed_c10d.pyi (#88088)

Summary: `_distributed_c10d.pyi` is out of sync with the C++ binding. This change updates it.

Test Plan: TBD

Differential Revision: D40840836

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88088
Approved by: https://github.com/wanchaol
---
 torch/_C/_distributed_c10d.pyi | 66 ++++++++++++++++++++++++----------
 1 file changed, 47 insertions(+), 19 deletions(-)

diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi
index bdf0166b8daa9..56b86bd504bf6 100644
--- a/torch/_C/_distributed_c10d.pyi
+++ b/torch/_C/_distributed_c10d.pyi
@@ -1,8 +1,9 @@
 from datetime import timedelta
 from enum import Enum
-from typing import Optional, List, Any, Tuple, overload, Union
+from typing import Any, Dict, List, Optional, overload, Tuple, Union
 
 from torch import Tensor
+from torch.futures import Future
 
 # This module is defined in torch/csrc/distributed/c10d/init.cpp
 
@@ -32,13 +33,34 @@ class Reducer:
         self,
         params: List[Tensor],
         bucket_indices: List[List[int]],
+        per_bucket_size_limits: List[int],
         process_group: ProcessGroup,
-        expect_sparse_gradients: List[bool],
-        bucket_bytes_cap: int,
-        find_unused_parameters: bool,
-        gradient_as_bucket_view: bool,
+        expect_sparse_gradients: List[bool] = ...,
+        bucket_bytes_cap: int = ...,  # kDefaultBucketBytesCap in reducer.hpp
+        find_unused_parameters: bool = ...,
+        gradient_as_bucket_view: bool = ...,
+        param_to_name_mapping: Dict[int, str] = ...,
+        first_bucket_types_cap: int = ...,  # kDefaultFirstBucketBytes in reducer.hpp
     ): ...
-    ...
+    def prepare_for_forward(self) -> None: ...
+    def prepare_for_backward(self, output: List[Tensor]) -> None: ...
+    def get_backward_stats(self) -> List[int]: ...
+    def _install_post_backward_futures(self, futures: List[Future]) -> None: ...
+    def _rebuild_buckets(self) -> bool: ...
+    def _get_zeros_like_grad_buckets(self) -> List[GradBucket]: ...
+    def _push_all_rebuilt_params(self) -> None: ...
+    def _set_forward_pass_work_handle(
+        self, work: Work, use_static_world_size: bool
+    ): ...
+    def _get_local_used_map(self) -> Tensor: ...
+    def _set_ddp_runtime_logging_sample_rate(self, sample_rate: int) -> None: ...
+    def _set_static_graph(self) -> None: ...
+    def _run_comm_hook(self, bucket: GradBucket) -> Future: ...
+    def set_logger(self, logger: Logger) -> None: ...
+
+class DDPLoggingData:
+    strs_map: Dict[str, str]
+    ints_map: Dict[str, int]
 
 class Logger:
     def __init__(self, reducer: Reducer): ...
@@ -49,8 +71,14 @@ class Logger:
         output_device: int,
         broadcast_buffers: bool,
         has_sync_bn: bool,
+        static_graph: bool,
     ): ...
-    ...
+    def set_runtime_stats_and_log(self) -> None: ...
+    def set_error_and_log(self, error: str) -> None: ...
+    def _get_ddp_logging_data(self) -> DDPLoggingData: ...
+    def _set_comm_hook_name(self, comm_hook: str) -> None: ...
+    def _set_uneven_input_join(self) -> None: ...
+    def _set_static_graph(self) -> None: ...
 
 def get_debug_level(): ...
 def set_debug_level(): ...
@@ -118,7 +146,9 @@ class Store:
     def set(self, key: str, value: str): ...
     def get(self, key: str) -> bytes: ...
     def add(self, key: str, value: int) -> int: ...
-    def compare_set(self, key: str, expected_value: str, desired_value: str) -> bytes: ...
+    def compare_set(
+        self, key: str, expected_value: str, desired_value: str
+    ) -> bytes: ...
     def delete_key(self, key: str) -> bool: ...
     def num_keys(self) -> int: ...
     def set_timeout(self, timeout: timedelta): ...
@@ -142,7 +172,7 @@ class TCPStore(Store):
         is_master: bool = ...,
         timeout: timedelta = ...,
         wait_for_workers: bool = ...,
-        multi_tenant: bool = ...
+        multi_tenant: bool = ...,
     ): ...
     @property
     def host(self) -> str: ...
@@ -167,6 +197,7 @@ class Work:
 
 class ProcessGroup:
     class Options: ...
+
     def __init__(self): ...
     def rank(self) -> int: ...
     def size(self) -> int: ...
@@ -235,7 +266,7 @@ class ProcessGroup:
         self,
         output: Tensor,
         input: Tensor,
-        opts = AllGatherOptions(),
+        opts=AllGatherOptions(),
     ) -> Work: ...
     def allgather_coalesced(
         self,
@@ -343,6 +374,7 @@ def _round_robin_process_groups(
 class ProcessGroupGloo(ProcessGroup):
     class Device: ...
     class Options: ...
+
     def __init__(
         self,
         store: Store,
@@ -358,16 +390,12 @@ class ProcessGroupGloo(ProcessGroup):
     ...
 
 class _ProcessGroupWrapper(ProcessGroup):
-    def __init__(
-        self,
-        pg: ProcessGroup,
-        gloo_pg: ProcessGroupGloo
-    ): ...
+    def __init__(self, pg: ProcessGroup, gloo_pg: ProcessGroupGloo): ...
     wrapped_pg: ProcessGroup
 
-
 class ProcessGroupNCCL(ProcessGroup):
     class Options: ...
+
     def __init__(
         self,
         store: Store,
@@ -402,9 +430,9 @@ class ProcessGroupMPI(ProcessGroup):
 
 def _compute_bucket_assignment_by_size(
     tensors: List[Tensor],
-    bucket_size: int,
-    expect_sparse_gradient: List[bool],
-    tensor_indices: List[int],
+    bucket_size_limits: List[int],
+    expect_sparse_gradient: List[bool] = ...,
+    tensor_indices: List[int] = ...,
 ) -> Tuple[List[List[int]], List[int]]: ...
 def _broadcast_coalesced(
     process_group: ProcessGroup,

From d29e5b27bcfa01f1c38f0cd47e73b86fd9bb5b0b Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Mon, 31 Oct 2022 23:44:23 +0000
Subject: [PATCH 0390/1922] [Reland][ONNX] Move all torch.onnx.export related
 tests to test/onnx (#87292)

Moving torch.onnx.export related tests to test/onnx integrates ONNX tests to the same CI machine, so the testing environment can be better managed.

Fixes https://github.com/pytorch/pytorch/issues/87320
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87292
Approved by: https://github.com/thiagocrepaldi, https://github.com/BowenBao, https://github.com/kit1980, https://github.com/malfet
---
 .github/merge_rules.yaml                      |   1 -
 test/jit/test_async.py                        |  15 --
 test/jit/test_tracer.py                       |   8 -
 test/{jit => onnx}/test_export_modes.py       |  89 ++++---
 test/onnx/test_pytorch_onnx_no_runtime.py     | 225 +++++++++++++++++-
 .../eager/test_quantize_eager_ptq.py          |  51 ----
 test/test_jit.py                              | 112 +--------
 test/test_quantization.py                     |   1 -
 8 files changed, 282 insertions(+), 220 deletions(-)
 rename test/{jit => onnx}/test_export_modes.py (65%)

diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index 26b3eb437251a..6e9cba905e751 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -6,7 +6,6 @@
   - docs/source/onnx*
   - docs/source/scripts/onnx/**
   - scripts/onnx/**
-  - test/jit/test_export_modes.py
   - test/onnx/**
   - tools/onnx/**
   - torch/_C/__init__.pyi.in
diff --git a/test/jit/test_async.py b/test/jit/test_async.py
index d3769cd452d64..f8a1baea67133 100644
--- a/test/jit/test_async.py
+++ b/test/jit/test_async.py
@@ -1,6 +1,5 @@
 # Owner(s): ["oncall: jit"]
 
-import io
 import os
 import sys
 
@@ -420,20 +419,6 @@ def fn(x):
         self.assertGraphContainsExactly(traced.graph, kind='aten::wait', num_kind_nodes=0)
         self.assertGraphContainsExactly(traced.graph, kind='aten::add', num_kind_nodes=2)
 
-    def test_trace_fork_wait_inline_onnx(self):
-        def fork_body(x):
-            return torch.neg(x), torch.neg(x)
-
-        class MyMod(torch.nn.Module):
-            def forward(self, x):
-                fut = torch.jit._fork(fork_body, x)
-                val = torch.jit._wait(fut)
-                return val[1]
-
-        # smoke test for ONNX export
-        f = io.BytesIO()
-        torch.onnx.export(MyMod(), (torch.rand(3, 4),), f)
-
     def test_trace_fork_wait_list_modulecalls(self):
         def add_one(input):
             return input + torch.ones(input.size())
diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py
index 50fdec94b9fc0..b36003a2b9209 100644
--- a/test/jit/test_tracer.py
+++ b/test/jit/test_tracer.py
@@ -1124,14 +1124,6 @@ def foo(x, w):
         # With `check_trace=True` it will run with `@torch.no_grad()` and break assert.
         torch.jit.trace(foo, (x, w), check_trace=False)
 
-    def test_trace_detach_onnx_erase(self):
-        class Mod(torch.nn.Module):
-            def forward(self, x, w):
-                return torch.matmul(x, w).detach()
-
-        torch.onnx.export_to_pretty_string(
-            Mod(), (torch.rand(3, 4), torch.rand(4, 5)))
-
     def test_trace_slice_full_dim(self):
         def foo(x):
             return x[0:5, 0] + 1.0
diff --git a/test/jit/test_export_modes.py b/test/onnx/test_export_modes.py
similarity index 65%
rename from test/jit/test_export_modes.py
rename to test/onnx/test_export_modes.py
index dbf10cddc059b..0f3024a2e366d 100644
--- a/test/jit/test_export_modes.py
+++ b/test/onnx/test_export_modes.py
@@ -1,29 +1,25 @@
-# Owner(s): ["oncall: jit"]
+# Owner(s): ["module: onnx"]
 
 import io
 import os
 import shutil
 import sys
 import tempfile
+import unittest
 
 import torch
 import torch.nn as nn
-from torch.onnx import OperatorExportTypes
 from torch.autograd import Variable
+from torch.onnx import OperatorExportTypes
 
 # Make the helper files in test/ importable
 pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(pytorch_test_dir)
-from torch.testing._internal.jit_utils import JitTestCase
-from torch.testing._internal.common_utils import skipIfNoLapack, skipIfCaffe2, skipIfNoCaffe2
+from torch.testing._internal import common_utils
 
-if __name__ == '__main__':
-    raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
-                       "\tpython test/test_jit.py TESTNAME\n\n"
-                       "instead.")
 
 # Smoke tests for export methods
-class TestExportModes(JitTestCase):
+class TestExportModes(common_utils.TestCase):
     class MyModel(nn.Module):
         def __init__(self):
             super(TestExportModes.MyModel, self).__init__()
@@ -35,41 +31,66 @@ def test_protobuf(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         f = io.BytesIO()
-        torch.onnx._export(torch_model, (fake_input), f, verbose=False,
-                           export_type=torch.onnx.ExportTypes.PROTOBUF_FILE)
+        torch.onnx._export(
+            torch_model,
+            (fake_input),
+            f,
+            verbose=False,
+            export_type=torch.onnx.ExportTypes.PROTOBUF_FILE,
+        )
 
     def test_zipfile(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         f = io.BytesIO()
-        torch.onnx._export(torch_model, (fake_input), f, verbose=False,
-                           export_type=torch.onnx.ExportTypes.ZIP_ARCHIVE)
+        torch.onnx._export(
+            torch_model,
+            (fake_input),
+            f,
+            verbose=False,
+            export_type=torch.onnx.ExportTypes.ZIP_ARCHIVE,
+        )
 
     def test_compressed_zipfile(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         f = io.BytesIO()
-        torch.onnx._export(torch_model, (fake_input), f, verbose=False,
-                           export_type=torch.onnx.ExportTypes.COMPRESSED_ZIP_ARCHIVE)
+        torch.onnx._export(
+            torch_model,
+            (fake_input),
+            f,
+            verbose=False,
+            export_type=torch.onnx.ExportTypes.COMPRESSED_ZIP_ARCHIVE,
+        )
 
     def test_directory(self):
         torch_model = TestExportModes.MyModel()
         fake_input = Variable(torch.randn(1, 1, 224, 224), requires_grad=True)
         d = tempfile.mkdtemp()
-        torch.onnx._export(torch_model, (fake_input), d, verbose=False,
-                           export_type=torch.onnx.ExportTypes.DIRECTORY)
+        torch.onnx._export(
+            torch_model,
+            (fake_input),
+            d,
+            verbose=False,
+            export_type=torch.onnx.ExportTypes.DIRECTORY,
+        )
         shutil.rmtree(d)
 
     def test_onnx_multiple_return(self):
         @torch.jit.script
         def foo(a):
             return (a, a)
+
         f = io.BytesIO()
         x = torch.ones(3)
-        torch.onnx._export(foo, (x,), f)
-
-    @skipIfNoCaffe2
-    @skipIfNoLapack
+        torch.onnx.export(foo, (x,), f)
+
+    # TODO(87318): Can't pass even with Caffe2
+    @unittest.skip(
+        "RuntimeError: ScalarType UNKNOWN_SCALAR is an unexpected tensor scalar type"
+    )
+    @common_utils.skipIfNoCaffe2
+    @common_utils.skipIfNoLapack
     def test_caffe2_aten_fallback(self):
         class ModelWithAtenNotONNXOp(nn.Module):
             def forward(self, x, y):
@@ -80,13 +101,15 @@ def forward(self, x, y):
         x = torch.rand(3, 4)
         y = torch.rand(3, 4)
         torch.onnx.export_to_pretty_string(
-            ModelWithAtenNotONNXOp(), (x, y),
+            ModelWithAtenNotONNXOp(),
+            (x, y),
             add_node_names=False,
             do_constant_folding=False,
-            operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK)
+            operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
+        )
 
-    @skipIfCaffe2
-    @skipIfNoLapack
+    @common_utils.skipIfCaffe2
+    @common_utils.skipIfNoLapack
     def test_aten_fallback(self):
         class ModelWithAtenNotONNXOp(nn.Module):
             def forward(self, x, y):
@@ -97,12 +120,14 @@ def forward(self, x, y):
         x = torch.rand(3, 4)
         y = torch.rand(3, 4)
         torch.onnx.export_to_pretty_string(
-            ModelWithAtenNotONNXOp(), (x, y),
+            ModelWithAtenNotONNXOp(),
+            (x, y),
             add_node_names=False,
             do_constant_folding=False,
             operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK,
             # support for linalg.qr was added in later op set versions.
-            opset_version=9)
+            opset_version=9,
+        )
 
     # torch.fmod is using to test ONNX_ATEN.
     # If you plan to remove fmod from aten, or found this test failed.
@@ -115,7 +140,13 @@ def forward(self, x, y):
         x = torch.randn(3, 4, dtype=torch.float32)
         y = torch.randn(3, 4, dtype=torch.float32)
         torch.onnx.export_to_pretty_string(
-            ModelWithAtenFmod(), (x, y),
+            ModelWithAtenFmod(),
+            (x, y),
             add_node_names=False,
             do_constant_folding=False,
-            operator_export_type=OperatorExportTypes.ONNX_ATEN)
+            operator_export_type=OperatorExportTypes.ONNX_ATEN,
+        )
+
+
+if __name__ == "__main__":
+    common_utils.run_tests()
diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index 5f2ce3fa657a1..c30ee46a34226 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -7,8 +7,11 @@
 import itertools
 import unittest
 import unittest.mock
+import warnings
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
+import numpy as np
+
 import onnx
 import onnx.numpy_helper
 
@@ -18,7 +21,7 @@
 from torch.onnx import symbolic_helper, utils
 from torch.onnx._globals import GLOBALS
 from torch.onnx._internal import registration
-from torch.testing._internal import common_utils
+from torch.testing._internal import common_quantization, common_utils, jit_utils
 
 
 def export_to_onnx(
@@ -76,7 +79,7 @@ def forward(self, x):
 
         x = torch.ones(3, 3)
         f = io.BytesIO()
-        torch.onnx._export(AddmmModel(), x, f, verbose=False)
+        torch.onnx.export(AddmmModel(), x, f, verbose=False)
 
     def test_onnx_transpose_incomplete_tensor_type(self):
         # Smoke test to get us into the state where we are attempting to export
@@ -163,7 +166,7 @@ def forward(self, x):
         mte = ModuleToExport()
         f = io.BytesIO()
         with self.assertRaisesRegex(RuntimeError, "Couldn't export Python"):
-            torch.onnx._export(mte, (torch.zeros(1, 2, 3),), f, verbose=False)
+            torch.onnx.export(mte, (torch.zeros(1, 2, 3),), f, verbose=False)
 
     def test_onnx_export_script_inline_trace(self):
         class ModuleToInline(torch.nn.Module):
@@ -427,7 +430,11 @@ def forward(self, x):
         onnx_model = export_to_onnx(
             MyClip(),
             torch.randn(3, 4, requires_grad=True),
-            custom_ops=[common_utils.custom_op("aten::clamp", bad_clamp, 9)],
+            custom_ops=[
+                common_utils.custom_op(
+                    "aten::clamp", bad_clamp, GLOBALS.export_onnx_opset_version
+                )
+            ],
             operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
         )
         self.assertAtenOp(onnx_model, "clamp", "Tensor")
@@ -777,6 +784,216 @@ def forward(self, x):
             model, inputs, f, dynamic_axes={"x": [0, 1]}, input_names=["x"]
         )
 
+    def test_dropout_script(self):
+
+        eg = torch.zeros(1, 2, 3, requires_grad=True)
+
+        @jit_utils._trace(eg)
+        def foo(x):
+            x = torch.neg(x)
+            return F.dropout(x)
+
+        class MyDrop(torch.nn.Module):
+            def forward(self, x):
+                return foo(x)
+
+        f = io.BytesIO()
+        with warnings.catch_warnings(record=True):
+            torch.onnx.export(MyDrop(), (eg,), f, verbose=False)
+
+    def test_pack_padded_pad_packed_trace(self):
+        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+
+        T, B, C = 3, 5, 7
+
+        class PadPackedWrapper(torch.nn.Module):
+            def __init__(self):
+                super(PadPackedWrapper, self).__init__()
+
+            def forward(self, x, seq_lens):
+                x = pack_padded_sequence(x, seq_lens)
+                x, _ = pad_packed_sequence(x)
+                return x
+
+        x = np.ones((T, B, C))
+        seq_lens = np.array([3, 3, 2, 2, 1], dtype=np.int32)
+        # set padding value so we can test equivalence
+        for b in range(B):
+            if seq_lens[b] < T:
+                x[seq_lens[b] :, b, :] = 0
+        seq_lens = torch.from_numpy(seq_lens)
+        x = torch.autograd.Variable(torch.from_numpy(x), requires_grad=True)
+
+        m = PadPackedWrapper()
+        m_traced = torch.jit.trace(
+            m,
+            (
+                x,
+                seq_lens,
+            ),
+        )
+
+        y = m(x, seq_lens)
+        loss = torch.sum(y)
+        loss.backward()
+        grad = x.grad.clone()
+        x.grad.zero_()
+
+        y_traced = m_traced(x, seq_lens)
+        loss_traced = torch.sum(y_traced)
+        loss_traced.backward()
+        grad_traced = x.grad.clone()
+
+        self.assertEqual(y_traced, x)
+        self.assertEqual(y_traced, y)
+        self.assertEqual(grad, grad_traced)
+
+        f = io.BytesIO()
+        torch.onnx.export(m, (x, seq_lens), f, verbose=False)
+
+    # Suppression: ONNX warns when exporting RNNs because of potential batch size mismatch.
+    @common_utils.suppress_warnings
+    def test_rnn_trace_override(self):
+        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+
+        num_layers = 3
+        T, B, C = 11, 5, 7
+
+        class RNNTraceWrapper(torch.nn.Module):
+            def __init__(self, cell_type):
+                super(RNNTraceWrapper, self).__init__()
+                if cell_type == "RNN":
+                    self.rnn = torch.nn.RNN(
+                        input_size=C, hidden_size=C, num_layers=num_layers
+                    )
+                elif cell_type == "LSTM":
+                    self.rnn = torch.nn.LSTM(
+                        input_size=C, hidden_size=C, num_layers=num_layers
+                    )
+                elif cell_type == "GRU":
+                    self.rnn = torch.nn.GRU(
+                        input_size=C, hidden_size=C, num_layers=num_layers
+                    )
+
+            def forward(self, x, seq_lens):
+                x = pack_padded_sequence(x, seq_lens)
+                x, _ = self.rnn(x)
+                x, _ = pad_packed_sequence(x)
+                return x
+
+        for cell_type in ["RNN", "LSTM", "GRU"]:
+            x = torch.ones(T, B, C, requires_grad=True)
+            seq_lens = torch.from_numpy(np.array([11, 3, 2, 2, 1], dtype=np.int32))
+
+            m = RNNTraceWrapper(cell_type)
+            m_traced = torch.jit.trace(
+                m,
+                (
+                    x,
+                    seq_lens,
+                ),
+            )
+
+            y = m(x, seq_lens)
+            loss = torch.sum(y)
+            loss.backward()
+            grad = x.grad.clone()
+            x.grad.zero_()
+
+            y_traced = m_traced(x, seq_lens)
+            loss_traced = torch.sum(y_traced)
+            loss_traced.backward()
+            grad_traced = x.grad.clone()
+
+            self.assertEqual(y_traced, y)
+            self.assertEqual(grad, grad_traced)
+
+            f = io.BytesIO()
+            torch.onnx.export(m, (x, seq_lens), f, verbose=False)
+
+    def test_trace_fork_wait_inline_onnx(self):
+        def fork_body(x):
+            return torch.neg(x), torch.neg(x)
+
+        class MyMod(torch.nn.Module):
+            def forward(self, x):
+                fut = torch.jit._fork(fork_body, x)
+                val = torch.jit._wait(fut)
+                return val[1]
+
+        # smoke test for ONNX export
+        f = io.BytesIO()
+        torch.onnx.export(MyMod(), (torch.rand(3, 4),), f)
+
+    def test_trace_detach_onnx_erase(self):
+        class Mod(torch.nn.Module):
+            def forward(self, x, w):
+                return torch.matmul(x, w).detach()
+
+        torch.onnx.export_to_pretty_string(Mod(), (torch.rand(3, 4), torch.rand(4, 5)))
+
+
+class TestQuantizeEagerONNXExport(common_utils.TestCase):
+    def _test_lower_graph_impl(self, model, data):
+        model.qconfig = torch.ao.quantization.default_qconfig
+        model = torch.ao.quantization.prepare(model)
+        model = torch.ao.quantization.convert(model)
+
+        _ = model(data)
+        input_names = ["x"]
+
+        def _export_to_onnx(model, input, input_names):
+            traced = torch.jit.trace(model, input)
+            buf = io.BytesIO()
+            torch.jit.save(traced, buf)
+            buf.seek(0)
+
+            model = torch.jit.load(buf)
+            f = io.BytesIO()
+            torch.onnx.export(
+                model,
+                input,
+                f,
+                input_names=input_names,
+                operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
+                opset_version=9,
+            )
+
+        _export_to_onnx(model, data, input_names)
+
+    @common_quantization.skipIfNoFBGEMM
+    @common_utils.skipIfNoCaffe2
+    def test_lower_graph_linear(self):
+        model = torch.ao.quantization.QuantWrapper(
+            torch.nn.Linear(5, 10, bias=True)
+        ).to(dtype=torch.float)
+        data_numpy = np.random.rand(1, 2, 5).astype(np.float32)
+        data = torch.from_numpy(data_numpy).to(dtype=torch.float)
+        self._test_lower_graph_impl(model, data)
+
+    @common_quantization.skipIfNoFBGEMM
+    @common_utils.skipIfNoCaffe2
+    def test_lower_graph_conv2d(self):
+        model = torch.ao.quantization.QuantWrapper(
+            torch.nn.Conv2d(3, 5, 2, bias=True)
+        ).to(dtype=torch.float)
+        data_numpy = np.random.rand(1, 3, 6, 6).astype(np.float32)
+        data = torch.from_numpy(data_numpy).to(dtype=torch.float)
+        self._test_lower_graph_impl(model, data)
+
+    @common_quantization.skipIfNoFBGEMM
+    @unittest.skip(
+        "onnx opset9 does not support quantize_per_tensor and caffe2 \
+    does not support conv3d"
+    )
+    def test_lower_graph_conv3d(self):
+        model = torch.ao.quantization.QuantWrapper(
+            torch.nn.Conv3d(3, 5, 2, bias=True)
+        ).to(dtype=torch.float)
+        data_numpy = np.random.rand(1, 3, 6, 6, 6).astype(np.float32)
+        data = torch.from_numpy(data_numpy).to(dtype=torch.float)
+        self._test_lower_graph_impl(model, data)
+
 
 if __name__ == "__main__":
     common_utils.run_tests()
diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py
index 7d87cc520ba04..d414d09bd5f2c 100644
--- a/test/quantization/eager/test_quantize_eager_ptq.py
+++ b/test/quantization/eager/test_quantize_eager_ptq.py
@@ -61,8 +61,6 @@
     supported_qengines,
     override_qengines,
 )
-from torch.testing._internal.jit_utils import JitTestCase
-from torch.testing._internal.common_utils import skipIfNoCaffe2
 
 from hypothesis import given
 from hypothesis import strategies as st
@@ -71,8 +69,6 @@
 
 # Standard library
 from typing import Tuple
-import io
-import unittest
 import numpy as np
 
 class TestQuantizeEagerOps(QuantizationTestCase):
@@ -1443,53 +1439,6 @@ def forward(self, indices, offsets, linear_in):
         self.assertTrue('QuantizedEmbedding' in str(q_model))
         self.assertTrue('DynamicQuantizedLinear' in str(q_model))
 
-class TestQuantizeEagerONNXExport(JitTestCase):
-    def _test_lower_graph_impl(self, model, data):
-        model.qconfig = torch.ao.quantization.default_qconfig
-        model = torch.ao.quantization.prepare(model)
-        model = torch.ao.quantization.convert(model)
-
-        outputs = model(data)
-        input_names = ["x"]
-
-        def export_to_onnx(model, input, input_names):
-            traced = torch.jit.trace(model, input)
-            buf = io.BytesIO()
-            torch.jit.save(traced, buf)
-            buf.seek(0)
-
-            model = torch.jit.load(buf)
-            f = io.BytesIO()
-            torch.onnx.export(model, input, f, input_names=input_names,
-                              operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
-                              opset_version=9)
-        onnx_model = export_to_onnx(model, data, input_names)
-
-    @skipIfNoFBGEMM
-    @skipIfNoCaffe2
-    def test_lower_graph_linear(self):
-        model = torch.ao.quantization.QuantWrapper(torch.nn.Linear(5, 10, bias=True)).to(dtype=torch.float)
-        data_numpy = np.random.rand(1, 2, 5).astype(np.float32)
-        data = torch.from_numpy(data_numpy).to(dtype=torch.float)
-        self._test_lower_graph_impl(model, data)
-
-    @skipIfNoFBGEMM
-    @skipIfNoCaffe2
-    def test_lower_graph_conv2d(self):
-        model = torch.ao.quantization.QuantWrapper(torch.nn.Conv2d(3, 5, 2, bias=True)).to(dtype=torch.float)
-        data_numpy = np.random.rand(1, 3, 6, 6).astype(np.float32)
-        data = torch.from_numpy(data_numpy).to(dtype=torch.float)
-        self._test_lower_graph_impl(model, data)
-
-    @skipIfNoFBGEMM
-    @unittest.skip("onnx opset9 does not support quantize_per_tensor and caffe2 \
-    does not support conv3d")
-    def test_lower_graph_conv3d(self):
-        model = torch.ao.quantization.QuantWrapper(torch.nn.Conv3d(3, 5, 2, bias=True)).to(dtype=torch.float)
-        data_numpy = np.random.rand(1, 3, 6, 6, 6).astype(np.float32)
-        data = torch.from_numpy(data_numpy).to(dtype=torch.float)
-        self._test_lower_graph_impl(model, data)
-
 
 if __name__ == '__main__':
     raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
diff --git a/test/test_jit.py b/test/test_jit.py
index b1425a4ed71ca..13c27b0efa555 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -20,7 +20,6 @@
 from jit.test_autodiff import TestAutodiffJit  # noqa: F401
 from jit.test_autodiff_subgraph_slicing import TestAutodiffSubgraphSlicing  # noqa: F401
 from jit.test_custom_operators import TestCustomOperators  # noqa: F401
-from jit.test_export_modes import TestExportModes  # noqa: F401
 from jit.test_graph_rewrite_passes import TestGraphRewritePasses  # noqa: F401
 from jit.test_class_type import TestClassType  # noqa: F401
 from jit.test_builtins import TestBuiltins, TestTensorBuiltins  # noqa: F401
@@ -97,7 +96,7 @@
 from torch.testing._internal.common_jit import check_against_reference
 from torch.testing._internal.common_utils import run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \
     suppress_warnings, BUILD_WITH_CAFFE2, IS_SANDCASTLE, GRAPH_EXECUTOR, ProfilingMode, TestCase, \
-    freeze_rng_state, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy, \
+    freeze_rng_state, slowTest, TemporaryFileName, \
     enable_profiling_mode_for_profiling_tests, TEST_MKL, set_default_dtype, num_profiled_runs, \
     skipIfCrossRef, IS_MACOS, skipIfTorchDynamo
 from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \
@@ -5913,23 +5912,6 @@ def test_fuser_multiple_blocks(this, that, theother, meme):
 
         self.assertEqual(cu.test_fuser_multiple_blocks(*inputs), outputs)
 
-    def test_dropout_script(self):
-
-        eg = torch.zeros(1, 2, 3, requires_grad=True)
-
-        @_trace(eg)
-        def foo(x):
-            x = torch.neg(x)
-            return F.dropout(x)
-
-        class MyDrop(nn.Module):
-            def forward(self, x):
-                return foo(x)
-
-        f = io.BytesIO()
-        with warnings.catch_warnings(record=True):
-            torch.onnx.export(MyDrop(), (eg,), f, verbose=False)
-
     @unittest.skip("RuntimeError: VariableType::ID() not implemented")
     def test_cast(self):
         script = '''
@@ -9780,50 +9762,6 @@ def forward(self, rep):
             m = M2()
             m(torch.zeros(4, 3))
 
-    @skipIfCompiledWithoutNumpy
-    def test_pack_padded_pad_packed_trace(self):
-        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
-        T, B, C = 3, 5, 7
-
-        class PadPackedWrapper(torch.nn.Module):
-            def __init__(self):
-                super(PadPackedWrapper, self).__init__()
-
-            def forward(self, x, seq_lens):
-                x = pack_padded_sequence(x, seq_lens)
-                x, _ = pad_packed_sequence(x)
-                return x
-
-        x = np.ones((T, B, C))
-        seq_lens = np.array([3, 3, 2, 2, 1], dtype=np.int32)
-        # set padding value so we can test equivalence
-        for b in range(B):
-            if seq_lens[b] < T:
-                x[seq_lens[b]:, b, :] = 0
-        seq_lens = torch.from_numpy(seq_lens)
-        x = torch.autograd.Variable(torch.from_numpy(x), requires_grad=True)
-
-        m = PadPackedWrapper()
-        m_traced = torch.jit.trace(m, (x, seq_lens,))
-
-        y = m(x, seq_lens)
-        loss = torch.sum(y)
-        loss.backward()
-        grad = x.grad.clone()
-        x.grad.zero_()
-
-        y_traced = m_traced(x, seq_lens)
-        loss_traced = torch.sum(y_traced)
-        loss_traced.backward()
-        grad_traced = x.grad.clone()
-
-        self.assertEqual(y_traced, x)
-        self.assertEqual(y_traced, y)
-        self.assertEqual(grad, grad_traced)
-
-        f = io.BytesIO()
-        torch.onnx._export(m, (x, seq_lens), f, verbose=False)
-
     def test_script_pack_padded_sequence(self):
         from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 
@@ -10024,54 +9962,6 @@ def forward(self, input: torch.Tensor):
         m_scripted = torch.jit.script(m)
         self.assertEqual(m_scripted(torch.tensor(1)), torch.tensor(246))
 
-    # Suppression: ONNX warns when exporting RNNs because of potential batch size mismatch.
-    @suppress_warnings
-    @skipIfCompiledWithoutNumpy
-    def test_rnn_trace_override(self):
-        from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
-        num_layers = 3
-        T, B, C = 11, 5, 7
-
-        class RNNTraceWrapper(torch.nn.Module):
-            def __init__(self, cell_type):
-                super(RNNTraceWrapper, self).__init__()
-                if cell_type == 'RNN':
-                    self.rnn = torch.nn.RNN(input_size=C, hidden_size=C, num_layers=num_layers)
-                elif cell_type == 'LSTM':
-                    self.rnn = torch.nn.LSTM(input_size=C, hidden_size=C, num_layers=num_layers)
-                elif cell_type == 'GRU':
-                    self.rnn = torch.nn.GRU(input_size=C, hidden_size=C, num_layers=num_layers)
-
-            def forward(self, x, seq_lens):
-                x = pack_padded_sequence(x, seq_lens)
-                x, _ = self.rnn(x)
-                x, _ = pad_packed_sequence(x)
-                return x
-
-        for cell_type in ['RNN', 'LSTM', 'GRU']:
-            x = torch.ones(T, B, C, requires_grad=True)
-            seq_lens = torch.from_numpy(np.array([11, 3, 2, 2, 1], dtype=np.int32))
-
-            m = RNNTraceWrapper(cell_type)
-            m_traced = torch.jit.trace(m, (x, seq_lens,))
-
-            y = m(x, seq_lens)
-            loss = torch.sum(y)
-            loss.backward()
-            grad = x.grad.clone()
-            x.grad.zero_()
-
-            y_traced = m_traced(x, seq_lens)
-            loss_traced = torch.sum(y_traced)
-            loss_traced.backward()
-            grad_traced = x.grad.clone()
-
-            self.assertEqual(y_traced, y)
-            self.assertEqual(grad, grad_traced)
-
-            f = io.BytesIO()
-            torch.onnx._export(m, (x, seq_lens), f, verbose=False)
-
     def test_python_call_non_tensor(self):
         def foo(a, b, c):
             # type: (Tensor, int, Tuple[Tensor, int]) -> Tuple[int, Tensor]
diff --git a/test/test_quantization.py b/test/test_quantization.py
index 2dc6f7ac7850d..2726e0f82eec5 100644
--- a/test/test_quantization.py
+++ b/test/test_quantization.py
@@ -52,7 +52,6 @@
 from quantization.eager.test_quantize_eager_ptq import TestQuantizeEagerPTQStatic  # noqa: F401
 from quantization.eager.test_quantize_eager_ptq import TestQuantizeEagerPTQDynamic  # noqa: F401
 from quantization.eager.test_quantize_eager_ptq import TestQuantizeEagerOps  # noqa: F401
-from quantization.eager.test_quantize_eager_ptq import TestQuantizeEagerONNXExport  # noqa: F401
 # 2. Eager mode quantization aware training
 from quantization.eager.test_quantize_eager_qat import TestQuantizeEagerQAT  # noqa: F401
 from quantization.eager.test_quantize_eager_qat import TestQuantizeEagerQATNumerics  # noqa: F401

From 715a760200643a5c19583691b259bc9bf016dd6b Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 31 Oct 2022 17:48:35 -0700
Subject: [PATCH 0391/1922] Reenable assert sanity testing with ADInplaceOrView
 reenable (#88102)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88102
Approved by: https://github.com/albanD
---
 torch/_subclasses/fake_tensor.py |   6 +-
 torch/_subclasses/meta_utils.py  | 147 +++++++++++++++++--------------
 torchgen/model.py                |   3 +-
 3 files changed, 90 insertions(+), 66 deletions(-)

diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index aea6fcbdd1e86..5ed23f8020fd0 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -717,7 +717,11 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
 
         # Some attribute queries that can be serviced directly
         # See Note [is_coalesced is dispatched]
-        if func in [torch.ops.aten.is_coalesced.default]:
+        if func in {
+            torch.ops.aten.is_coalesced.default,
+            torch.ops.aten.dense_dim.default,
+            torch.ops.aten.sparse_dim.default,
+        }:
             # NB: no_dispatch is ok here too, this func is very simple
             with in_kernel_invocation_manager(self):
                 return func(*args, **kwargs)
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 0e2bbe49dd226..7e2039f1764f2 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -25,10 +25,11 @@ def assert_eq(a, b):
     assert a == b, f"{a} != {b}"
 
 
-def assert_metadata_eq(assert_eq, m1, m2):
+def assert_metadata_eq(assert_eq, m1, m2, *, skip_symbolic=False):
     def go(m1, m2):
         assert_eq(m1.dtype, m2.dtype)
-        assert_eq(m1.shape, m2.shape)
+        if not skip_symbolic:
+            assert_eq(m1.shape, m2.shape)
         assert_eq(m1.requires_grad, m2.requires_grad)
         assert_eq(m1.is_leaf, m2.is_leaf)
         assert_eq(m1.grad_fn is None, m2.grad_fn is None)
@@ -38,14 +39,15 @@ def go(m1, m2):
         assert_eq(m1.is_neg(), m2.is_neg())
         assert_eq(safe_grad(m1) is not None, safe_grad(m2) is not None)
         if safe_grad(m1) is not None:
-            go(m1.grad, m2.grad)
+            go(safe_grad(m1), safe_grad(m2))
         if m1.is_sparse:
             assert_eq(m1.dense_dim(), m2.dense_dim())
             assert_eq(m1.sparse_dim(), m2.sparse_dim())
             assert_eq(m1.is_coalesced(), m2.is_coalesced())
         else:
-            assert_eq(m1.stride(), m2.stride())
-            assert_eq(m1.storage_offset(), m2.storage_offset())
+            if not skip_symbolic:
+                assert_eq(m1.stride(), m2.stride())
+                assert_eq(m1.storage_offset(), m2.storage_offset())
             assert_eq(m1._is_view(), m2._is_view())
             if m1._is_view():
                 go(m1._base, m2._base)
@@ -262,63 +264,83 @@ def is_c_of_r(complex_dtype, real_dtype):
                             == real_dtype
                         )
 
-                    if base.dtype == t.dtype:
-                        pass
-                    elif is_c_of_r(base.dtype, t.dtype):
-                        base = torch.view_as_real(base)
-                    elif is_c_of_r(t.dtype, base.dtype):
-                        base = torch.view_as_complex(base)
-                    else:
-                        # This is not guaranteed to succeed.  If it fails, it
-                        # means there is another dtype-converting view function
-                        # that hasn't been handled here
-                        base = base.view(t.dtype)
-
-                    # This is very tricky.  Naively, you might expect this
-                    # to hold:
-                    #
-                    #   if t.requires_grad and not safe_is_leaf(t)
-                    #       assert t._base.requires_grad
-                    #
-                    # But it's not true!  As you can see in the following
-                    # program:
-                    #
-                    #   x = torch.zeros(4)
-                    #   y = x.view(1, 4)
-                    #   y.requires_grad = True
-                    #   z = y.view(1, 1, 4)
-                    #   assert z._base is x
-                    #
-                    # So we may have to do *two* views out of the base to
-                    # recreate this situation.
-
-                    sizes, strides = sym_sizes_strides(t)
-                    if safe_is_leaf(t):
-                        # Leaf views that track view metadata are created by
-                        # creating a view inside a no_grad block
-                        with torch.no_grad():
-                            r = base.as_strided(sizes, strides, sym(t.storage_offset()))
-                        # As it's a leaf, we can directly assign requires_grad
-                        r.requires_grad = t.requires_grad
-                    else:
-                        if t._base.requires_grad == t.requires_grad:
-                            # Easy case, just run the view op
-                            with torch.enable_grad():
-                                r = base.as_strided(
-                                    sizes, strides, sym(t.storage_offset())
-                                )
+                    # In some situations, MetaConverter may be called in a
+                    # context where autograd is disabled.  For the _is_view
+                    # assert to pass, we have to setup the autograd view
+                    # metadata anyway.  Do this by reenabling the
+                    # ADInplaceOrView key.  This is kind of a hack.
+                    old_exclude = torch._C._dispatch_tls_is_dispatch_key_excluded(
+                        torch._C.DispatchKey.ADInplaceOrView
+                    )
+                    torch._C._dispatch_tls_set_dispatch_key_excluded(
+                        torch._C.DispatchKey.ADInplaceOrView, False
+                    )
+                    try:
+
+                        if base.dtype == t.dtype:
+                            pass
+                        elif is_c_of_r(base.dtype, t.dtype):
+                            base = torch.view_as_real(base)
+                        elif is_c_of_r(t.dtype, base.dtype):
+                            base = torch.view_as_complex(base)
                         else:
-                            # Obscure case.  Create a leaf view and give it the
-                            # correct requires_grad, then do the final view.
-                            # NB: Can't have a non-leaf without requiring grad!
-                            assert t.requires_grad
+                            # This is not guaranteed to succeed.  If it fails, it
+                            # means there is another dtype-converting view function
+                            # that hasn't been handled here
+                            base = base.view(t.dtype)
+
+                        # This is very tricky.  Naively, you might expect this
+                        # to hold:
+                        #
+                        #   if t.requires_grad and not safe_is_leaf(t)
+                        #       assert t._base.requires_grad
+                        #
+                        # But it's not true!  As you can see in the following
+                        # program:
+                        #
+                        #   x = torch.zeros(4)
+                        #   y = x.view(1, 4)
+                        #   y.requires_grad = True
+                        #   z = y.view(1, 1, 4)
+                        #   assert z._base is x
+                        #
+                        # So we may have to do *two* views out of the base to
+                        # recreate this situation.
+
+                        sizes, strides = sym_sizes_strides(t)
+
+                        if safe_is_leaf(t):
+                            # Leaf views that track view metadata are created by
+                            # creating a view inside a no_grad block
                             with torch.no_grad():
-                                mid = base.view(base.shape)
-                            mid.requires_grad = t.requires_grad
-                            with torch.enable_grad():
-                                r = mid.as_strided(
+                                r = base.as_strided(
                                     sizes, strides, sym(t.storage_offset())
                                 )
+                            # As it's a leaf, we can directly assign requires_grad
+                            r.requires_grad = t.requires_grad
+                        else:
+                            if t._base.requires_grad == t.requires_grad:
+                                # Easy case, just run the view op
+                                with torch.enable_grad():
+                                    r = base.as_strided(
+                                        sizes, strides, sym(t.storage_offset())
+                                    )
+                            else:
+                                # Obscure case.  Create a leaf view and give it the
+                                # correct requires_grad, then do the final view.
+                                # NB: Can't have a non-leaf without requiring grad!
+                                assert t.requires_grad
+                                with torch.no_grad():
+                                    mid = base.view(base.shape)
+                                mid.requires_grad = t.requires_grad
+                                with torch.enable_grad():
+                                    r = mid.as_strided(
+                                        sizes, strides, sym(t.storage_offset())
+                                    )
+                    finally:
+                        torch._C._dispatch_tls_set_dispatch_key_excluded(
+                            torch._C.DispatchKey.ADInplaceOrView, old_exclude
+                        )
 
                 else:
                     is_leaf = safe_is_leaf(t)
@@ -389,15 +411,12 @@ def is_c_of_r(complex_dtype, real_dtype):
                         with maybe_fake_mgr, torch.no_grad():
                             r.set_(r_s, storage_offset, sizes, strides)
 
-                with warnings.catch_warnings():
-                    warnings.filterwarnings("ignore", "The .grad attribute of a Tensor")
-                    grad_not_none = t.grad is not None
-                if grad_not_none:
-                    r.grad = self.meta_tensor(t.grad, shape_env, callback)
+                if safe_grad(t) is not None:
+                    r.grad = self.meta_tensor(safe_grad(t), shape_env, callback)
                 torch._C._set_conj(r, t.is_conj())
                 torch._C._set_neg(r, t.is_neg())
             # This can be skipped if necessary for performance reasons
-            # assert_metadata_eq(assert_eq, t, r)
+            assert_metadata_eq(assert_eq, t, r, skip_symbolic=True)
             self.set_tensor_memo(t, r)
 
         return self.get_tensor_memo(t)
diff --git a/torchgen/model.py b/torchgen/model.py
index f87f2be28b7f3..c1b906dd1d85f 100644
--- a/torchgen/model.py
+++ b/torchgen/model.py
@@ -59,7 +59,7 @@ def __str__(self) -> str:
 ]
 
 # This doesn't have to be in sync with the header, it only needs to contain
-# entries that we actually use in the codegen
+# entries that we actually use in the codegen or want pyi entries for
 class DispatchKey(Enum):
     Undefined = 0
     CatchAll = Undefined
@@ -92,6 +92,7 @@ class DispatchKey(Enum):
     TESTING_ONLY_GenericWrapper = auto()
     TESTING_ONLY_GenericMode = auto()
 
+    ADInplaceOrView = auto()
     Autograd = auto()
     CompositeImplicitAutograd = auto()
     CompositeImplicitAutogradNestedTensor = auto()

From 0e99a0e9bf16bd82d2b21943795f7f83146f0513 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Mon, 31 Oct 2022 16:33:11 -0700
Subject: [PATCH 0392/1922] [9/N] [Dispatchable Collectives] Update
 reduce_scatter with CPU / CUDA implementations (#86166)

### Changes
- Updates for the reduce_scatter collective

### Context
https://github.com/pytorch/pytorch/issues/86225
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86166
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_common.py    |  7 ++++-
 torch/csrc/distributed/c10d/OpsImpl.cpp | 42 +++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 454595f85735c..a4ee561ed961b 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1466,6 +1466,10 @@ def _call_collective_with_varying_tensors(self, backend, collective, *args):
         # multi tensor collectives
         if collective == dist.all_gather:
             collective([tensor], tensor, *args)
+        elif collective == dist.reduce_scatter:
+            if backend != "gloo":
+                # gloo does not support reduce_scatter
+                collective(tensor, [tensor], *args)
         else:
             collective(tensor, *args)
 
@@ -1482,7 +1486,8 @@ def _test_collectives(self, backend):
             (dist.reduce, self.rank),
             (dist.broadcast, self.rank),
             (dist.all_reduce,),
-            (dist.all_gather,)
+            (dist.all_gather,),
+            (dist.reduce_scatter,),
         ]
         for collective, *args in collectives_and_args:
             with self.subTest(collective=collective, args=args):
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index 8254ce3126e3f..b4446b2aa8cdd 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -185,6 +185,40 @@ allgather_cuda_(
           output_tensors, work);
 }
 
+std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>
+reduce_scatter_cpu_(
+    const std::vector<at::Tensor>& output_tensors,
+    const std::vector<std::vector<at::Tensor>>& input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  auto work = process_group->reduce_scatter(
+      const_cast<std::vector<at::Tensor>&>(output_tensors),
+      const_cast<std::vector<std::vector<at::Tensor>>&>(input_tensors),
+      ReduceScatterOptions{
+          *reduce_op.get(), std::chrono::milliseconds(timeout)});
+
+  return std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
+      output_tensors, work);
+}
+
+std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>
+reduce_scatter_cuda_(
+    const std::vector<at::Tensor>& output_tensors,
+    const std::vector<std::vector<at::Tensor>>& input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  auto work = process_group->reduce_scatter(
+      const_cast<std::vector<at::Tensor>&>(output_tensors),
+      const_cast<std::vector<std::vector<at::Tensor>>&>(input_tensors),
+      ReduceScatterOptions{
+          *reduce_op.get(), std::chrono::milliseconds(timeout)});
+
+  return std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
+      output_tensors, work);
+}
+
 // register functions to dispatcher
 namespace {
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
@@ -244,6 +278,14 @@ TORCH_LIBRARY_IMPL(c10d, CPU, m) {
 TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("allgather_", allgather_cuda_);
 }
+
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("reduce_scatter_", reduce_scatter_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("reduce_scatter_", reduce_scatter_cuda_);
+}
 } // namespace
 
 } // namespace ops

From 83397423d1bf7ace6eb37f3666e6fede31cb123b Mon Sep 17 00:00:00 2001
From: Yanli Zhao <yanlizhao@fb.com>
Date: Tue, 1 Nov 2022 15:27:40 +0000
Subject: [PATCH 0393/1922] [BE]fix DDP when the number of output features is
 zero (#87793)

Fixes #87280

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87793
Approved by: https://github.com/rohan-varma
---
 torch/csrc/distributed/c10d/comm.cpp            |  8 +++++++-
 .../_internal/distributed/distributed_test.py   | 17 +++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/distributed/c10d/comm.cpp b/torch/csrc/distributed/c10d/comm.cpp
index c873eec5fbcf9..d011e5543a5da 100644
--- a/torch/csrc/distributed/c10d/comm.cpp
+++ b/torch/csrc/distributed/c10d/comm.cpp
@@ -32,7 +32,13 @@ class BroadcastWork {
         flat_tensor_.front(), bucket_tensors_);
     TORCH_INTERNAL_ASSERT(output_tensors.size() == bucket_tensors_.size());
     for (const auto i : c10::irange(output_tensors.size())) {
-      bucket_tensors_[i].copy_(output_tensors[i], /*non_blocking=*/true);
+      // if output_tensor is empty, no need to copy it back,
+      // this can avoid error when both bucket_tensor and output_tensor
+      // are empty, but they have different shapes, see
+      // https://github.com/pytorch/pytorch/issues/87280
+      if (output_tensors[i].numel() != 0) {
+        bucket_tensors_[i].copy_(output_tensors[i], /*non_blocking=*/true);
+      }
     }
   }
 
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 2242374858f0a..8c44cc0482cc4 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -4220,6 +4220,23 @@ def test_DistributedDataParallel_requires_grad(self):
             )
             self._barrier()
 
+
+        @sandcastle_skip_if(
+            BACKEND not in DistTestCases.backend_feature["ddp"],
+            f"The {BACKEND} backend does not support DistributedDataParallel"
+        )
+        @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
+        def test_ddp_zero_output_features(self):
+            class ToyModel(nn.Module):
+                def __init__(self):
+                    super(ToyModel, self).__init__()
+                    self.net1 = nn.Linear(10, 10)
+                    self.relu = nn.ReLU()
+                    self.net2 = nn.Linear(10, 0)
+
+            model = ToyModel().to(self.rank)
+            ddp_model = nn.parallel.DistributedDataParallel(model, device_ids=[self.rank])
+
         @sandcastle_skip_if(
             BACKEND == "nccl",
             "Gloo-only test"

From 40e41264c38c4ce1923658e88a39b46fa5bff784 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Tue, 1 Nov 2022 15:35:44 +0000
Subject: [PATCH 0394/1922] Change dynamo/distributed tests to use cuda/nccl
 (#88133)

- FSDP tests require nccl
- also run in inductor shard and skip inductor in distributed shard
- inductor shard has newer GPU and supports triton/inductor, but only runs on trunk
- distributed shard runs on PR, but inductor shard only runs on trunk/opt-in

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88133
Approved by: https://github.com/davidberard98
---
 .jenkins/pytorch/test.sh                    |  5 +++++
 test/distributed/test_dynamo_distributed.py | 15 +++++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 89fbd764201a1..5154f5a3f3b38 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -249,6 +249,10 @@ test_dynamo_shard() {
   assert_git_not_dirty
 }
 
+test_inductor_distributed() {
+  PYTORCH_TEST_WITH_INDUCTOR=0 PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include distributed/test_dynamo_distributed
+  assert_git_not_dirty
+}
 
 test_inductor() {
   python test/test_modules.py --verbose
@@ -740,6 +744,7 @@ elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SH
   install_filelock
   install_triton
   test_inductor
+  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
   install_filelock
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 4e8c6ffa981ac..dc42586abdcb5 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -10,7 +10,9 @@
 from torch import nn
 from torch._dynamo import config
 from torch._dynamo.utils import same
+from torch._inductor.utils import has_triton
 from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.testing._internal.common_distributed import requires_nccl
 
 def init_weights(m):
     if isinstance(m, nn.Linear):
@@ -39,6 +41,7 @@ def compile_fn(self, gm, example_inputs):
         return gm
 
 
+@requires_nccl()
 class TestDistributed(torch._dynamo.test_case.TestCase):
     """
     Test harness initializes dist process group
@@ -58,9 +61,9 @@ def setUpClass(cls):
             )
         )
         cls.rank = 0
-        cls.device = f"cpu:{cls.rank}"
-        cls.device_ids = None if "cpu" in cls.device else [cls.rank]
-        dist.init_process_group("gloo", rank=cls.rank, world_size=1)
+        cls.device = f"cuda:{cls.rank}"
+        cls.device_ids = None if "cuda" in cls.device else [cls.rank]
+        dist.init_process_group("nccl", rank=cls.rank, world_size=1)
 
     @classmethod
     def tearDownClass(cls):
@@ -84,6 +87,7 @@ def test_ddp_baseline_aot_eager(self):
         outputs = ddp_m(inputs)
         self.assertTrue(same(correct_outputs, outputs))
 
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", False)
     def test_ddp_baseline_inductor(self):
         from torch.nn.parallel import DistributedDataParallel as DDP
@@ -109,6 +113,7 @@ def test_fsdp_baseline_aot_eager(self):
         self.assertTrue(same(correct_outputs, outputs))
 
     @unittest.skip("hangs/crashes with inductor currently")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", False)
     def test_fsdp_baseline_inductor(self):
         from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
@@ -142,6 +147,7 @@ def opt_fn(inputs):
         self.assertEqual(check_splits_compiler.compiler_called, 3)
 
     @patch.object(config, "optimize_ddp", True)
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_graph_split_inductor(self):
         """
         Same as above, but using inductor backend.
@@ -248,7 +254,8 @@ def opt_fn(inputs):
         self.assertTrue(same(correct_outputs, opt_outputs))
         self.assertEqual(check_splits_compiler.compiler_called, 3)
 
-    def test_empty_graph(self):
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    def test_empty_graph_inductor(self):
         def fn():
             get_world_size = torch.distributed.distributed_c10d.get_world_size()
             return (get_world_size,)

From 595984160d14e1318287a2cf656ca4b52f3e4db9 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Tue, 1 Nov 2022 02:02:19 +0000
Subject: [PATCH 0395/1922] Fix meta for aten.fill, constant_pad_nd,
 _adaptive_avg_pool2d (#88069)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88069
Approved by: https://github.com/ngimel, https://github.com/malfet
---
 test/test_meta.py            | 22 ++++++++++++++--------
 torch/_meta_registrations.py | 10 ++++++++--
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index c1099dce6ccd3..378e114082704 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -348,14 +348,7 @@ def test_tensor_outlives_converter(self):
     aten.xlogy.Tensor,
 
     # channel_last and channel_last_3d related failures
-    aten.constant_pad_nd.default,
-    aten._adaptive_avg_pool2d.default,
-    aten.constant_pad_nd.default,
     aten.convolution.default,
-    aten.convolution.default,
-    aten._adaptive_avg_pool2d.default,
-    aten.upsample_bilinear2d.vec,
-    aten.constant_pad_nd.default,
     aten.upsample_bilinear2d.vec,
 
     # following ops fails if include_storage_offset = True, but these are a bit edge casey
@@ -1269,7 +1262,7 @@ def test_huber_loss_backward(self):
         self.assertEqual(r.device.type, 'meta')
         self.assertEqual(r.shape, inps[0].shape)
 
-    def test_fill_alias_relationship(self):
+    def test_fill__alias_relationship(self):
         inps = torch.rand(2**52, device='meta')
         r = torch.ops.aten.fill_(inps, 1.0)
         # aten.fill_ returns an aliase
@@ -1329,6 +1322,19 @@ def test_meta__fused_moving_avg_obs_fq_helper(self, device):
             self.assertEqual(ref_out[1].size(), meta_out[1].size())
             self.assertEqual(ref_out[1].stride(), meta_out[1].stride())
 
+    # opinfo test is using aten.fill_, it's not testing aten.fill
+    @onlyCUDA
+    def test_fill_stride(self):
+        to_meta = MetaConverter()
+        sample_args = [torch.rand(2, 2, 2, 2), 1.0]
+
+        for args in get_strided_args(sample_args):
+            meta_args = to_meta(args)
+            ref_out = torch.ops.aten.fill(*args)
+            meta_out = torch.ops.aten.fill(*meta_args)
+            self.assertEqual(ref_out.size(), meta_out.size())
+            self.assertEqual(ref_out.stride(), meta_out.stride())
+
 
     def test_map_location_deserialize(self):
         import io
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 5fec8475e50c9..712e15608b02f 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -655,7 +655,13 @@ def meta_adaptive_avg_pool2d(self, output_size):
         self.ndim == 3 or self.ndim == 4,
         lambda: f"Expected 3D or 4D tensor, but got {self.shape}",
     )
-    return self.new_empty(self.shape[:-2] + tuple(output_size))
+    output_shape = self.shape[:-2] + tuple(output_size)
+    memory_format = utils.suggest_memory_format(self)
+    # need to set memory_format to preserve the memory format of the input
+    # channel last input should have channel last output
+    return torch.empty(
+        output_shape, dtype=self.dtype, device=self.device, memory_format=memory_format
+    )
 
 
 @register_meta(aten._adaptive_avg_pool3d.default)
@@ -1095,7 +1101,7 @@ def meta_fill_(self, val):
 
 @register_meta([aten.fill.Tensor, aten.fill.Scalar])
 def meta_fill(self, val):
-    return self.new_empty(self.shape)
+    return torch.empty_like(self)
 
 
 @register_meta(aten.relu_.default)

From ed6e172070f1e44ac105ac705a4d4b112a6191c3 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Tue, 1 Nov 2022 15:47:43 +0000
Subject: [PATCH 0396/1922] always realize output regardless of the number of
 reads (#88046)

This improves hf_Bert 1.139x->1.21x, currently lowmem dropout doesn't work for nn.Dropout module, and before this change we were recomputing all the dropout masks in a very inefficient kernel. This change pushes dropout masks to be saved in the dropout kernels where they are first computed.

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88046
Approved by: https://github.com/Chillee
---
 torch/_inductor/graph.py     |  7 +++++--
 torch/_inductor/scheduler.py | 19 ++++++++++++++++---
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 71934419de2fd..adf8ed9614211 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -18,7 +18,7 @@
     MissingOperatorWithDecomp,
     MissingOperatorWithoutDecomp,
 )
-from .ir import Constant, FixedLayout, InputBuffer, TensorBox
+from .ir import Constant, FixedLayout, InputBuffer, Pointwise, Reduction, TensorBox
 from .lowering import lowerings, make_fallback, needs_realized_inputs
 from .sizevars import SizeVarAllocator
 from .utils import dynamo_utils
@@ -303,8 +303,11 @@ def run_node(self, n: torch.fx.Node):
             num_users = len(set(n.users))
             if num_users > 1 and isinstance(result, TensorBox):
                 for user in n.users:
-                    if user.target in needs_realized_inputs or user.op == "output":
+                    if user.target in needs_realized_inputs:
                         result.realize_hint()
+                    elif user.op == "output":
+                        if isinstance(result.data.data, (Pointwise, Reduction)):
+                            result.realize()
 
                 # TODO(jansel): introduce a store vs inline choice
                 result.mark_reuse(len(n.users))
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index 2cbf80c29566a..2f1c4b7c2e643 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -922,9 +922,22 @@ def can_fuse_vertical(self, node1, node2):
         be scheduled before the fusion of node1 and node2.
         """
         node1_names = node1.get_names()
-        remaining_deps = {
-            dep.name for dep in node2.unmet_dependencies - node1.read_writes.writes
-        }
+        computed_deps = set()
+        for rd in node2.unmet_dependencies:
+            for cd in node1.read_writes.writes:
+                # StarDep doesn't match MemoryDep, different indices don't match
+                # However, broadcasting sometimes strips dimensions, and if that's the case
+                # we still can match unmet dep
+                if (
+                    rd.name == cd.name
+                    and type(rd) == type(cd)
+                    and rd.index == cd.index
+                    and len(rd.size) >= len(cd.size)
+                    and rd.size[: len(cd.size)] == cd.size
+                ):
+                    computed_deps.add(rd)
+
+        remaining_deps = {dep.name for dep in node2.unmet_dependencies - computed_deps}
         if remaining_deps & node1_names:
             # MemoryDeps didn't match and read different locations of the same buffer.
             # Examples here include:

From d01299732b3e8a9bd93faf46b93e0770f9b73c8e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 31 Oct 2022 14:42:15 -0700
Subject: [PATCH 0397/1922] Inline Alias into FunctionalStorageImpl (#88140)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88140
Approved by: https://github.com/bdhirsh
---
 aten/src/ATen/FunctionalStorageImpl.cpp | 74 +++++++++----------------
 aten/src/ATen/FunctionalStorageImpl.h   | 62 +++++++++------------
 2 files changed, 52 insertions(+), 84 deletions(-)

diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp
index f42c535389900..98b8d88bf230f 100644
--- a/aten/src/ATen/FunctionalStorageImpl.cpp
+++ b/aten/src/ATen/FunctionalStorageImpl.cpp
@@ -15,23 +15,9 @@ ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
   return ViewMeta(forward_fn, reverse_fn, out_idx);
 }
 
-Alias::Alias(const at::Tensor& base) {
-  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base));
-  base_ = base;
-}
-
-const at::Tensor& Alias::base() const {
-  return base_;
-}
-
-void Alias::add_update(const at::Tensor& updated_val, const std::vector<ViewMeta>& metas) {
-  updates_.push_back({updated_val, metas});
-  generation_++;
-}
-
 // Note [Functionalization: Alias Removal Part 2]
 // See Note [Functionalization: Alias Removal] for more details.
-// This function applies a single update from one of the views to the Alias object.
+// This function applies a single update from one of the views to the StorageImpl.
 // We start out with <original_base> and <mutated_view>, and our goal is to end up with <mutated_base>.
 // Consider this program:
 //
@@ -46,15 +32,15 @@ void Alias::add_update(const at::Tensor& updated_val, const std::vector<ViewMeta
 // update.new_val = c  # the updated value of c
 // update.view_metas = [view1_meta, view2_meta, view3_meta]
 //
-// Syncing any of a, b or c will eventually call apply_update() on the alias, and the following will run:
+// Syncing any of a, b or c will eventually call apply_update() on the storage, and the following will run:
 //
 // tmp_values = [base, a, b]  # NB: c is not necessary
 // t = update.new_val
 // t = view3_inverse(b, t, 0)  # 0 is output index, these are all single output views so it's 0
 // t = view2_inverse(a, t, 0)
-// t = view1_inverse(base, t, 0)  # t now represents the updated alias.
-// alias.base_ = t
-const Tensor apply_update(const Alias::Update& update, const Tensor& base) {
+// t = view1_inverse(base, t, 0)  # t now represents the updated storage.
+// storage.base_ = t
+const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
   at::Tensor t = update.new_val;
   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
   if (update.view_metas.size() == 0) return t;
@@ -77,19 +63,6 @@ const Tensor apply_update(const Alias::Update& update, const Tensor& base) {
   return t;
 }
 
-bool Alias::apply_updates() {
-  // N.B:none of the tensors used in this function should be FunctionalTensorWrappers at this point.
-  // The only reason we currently need the TLS exclude guard here is because of functorch's DynamicLayer stack.
-  // It adds the Functionalize key into TLS before redispatching to the functionalization kernels,
-  // which means that we need to explicitly exclude it here before doing any other work underneath the pass.
-  at::AutoDispatchSkipFunctionalize guard;
-  bool any_updates = updates_.size() > 0;
-  for (auto& update_data: updates_) {
-    base_ = apply_update(update_data, base_);
-  }
-  updates_.clear();
-  return any_updates;
-}
 
 c10::SymInt get_nbytes(const Tensor& value) {
   if (value.unsafeGetTensorImpl()->has_symbolic_sizes_strides()) {
@@ -105,31 +78,36 @@ c10::SymInt get_nbytes(const Tensor& value) {
   return at::detail::computeStorageNbytes(value.sizes(), value.strides(), value.dtype().itemsize(), value.storage_offset());
 }
 
-FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& value)
+FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
   : c10::StorageImpl(
       c10::StorageImpl::use_byte_size_t(),
-      get_nbytes(value),
-      DataPtr{nullptr, value.device()},
+      get_nbytes(base),
+      DataPtr{nullptr, base.device()},
       GetAllocator(kMeta),
       /*resizeable=*/true
     ),
-    alias_(Alias(value))
-  {}
-
-void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& view_metas) {
-  alias_.add_update(updated_val, view_metas);
-}
-
-bool FunctionalStorageImpl::apply_updates() {
-  return alias_.apply_updates();
+    base_(base)
+  {
+  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
 }
 
-const Tensor& FunctionalStorageImpl::base() {
-  return alias_.base();
+void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
+  updates_.push_back({updated_val, metas});
+  generation_++;
 }
 
-size_t FunctionalStorageImpl::generation() const {
-  return alias_.generation();
+bool FunctionalStorageImpl::apply_updates() {
+  // N.B:none of the tensors used in this function should be FunctionalTensorWrappers at this point.
+  // The only reason we currently need the TLS exclude guard here is because of functorch's DynamicLayer stack.
+  // It adds the Functionalize key into TLS before redispatching to the functionalization kernels,
+  // which means that we need to explicitly exclude it here before doing any other work underneath the pass.
+  at::AutoDispatchSkipFunctionalize guard;
+  bool any_updates = updates_.size() > 0;
+  for (auto& update_data: updates_) {
+    base_ = apply_update(update_data, base_);
+  }
+  updates_.clear();
+  return any_updates;
 }
 
 } // namespace functionalization
diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h
index 6caeac2737fd0..67e79d2707638 100644
--- a/aten/src/ATen/FunctionalStorageImpl.h
+++ b/aten/src/ATen/FunctionalStorageImpl.h
@@ -46,13 +46,18 @@ struct ViewMeta {
   ViewMeta to_out_idx(int64_t out_idx);
 };
 
-// Alias represents the state shared by (potentially multiple) views of the same
-// tensor. For example, in the following code:
+// FunctionalStorageImpl is a subclass of StorageImpl used by the
+// functionalization pass. It has no underlying data (similar to meta storage).
+// It also knows how to reflect mutations to tensors in the absence of a valid
+// data pointer.
+//
+// A storage represents the state shared by (potentially multiple) views of the
+// same tensor. For example, in the following code:
 //
 // b = a.view1(...)
 // c = b.view2(...)
 // b.add_(1)
-// --> alias.add_update(b, {view1_meta})
+// --> storage.add_update(b, {view1_meta})
 //
 // The call to add_(1) will result in a call to alias.add_update(b,
 // {view1_meta}), queueing up the mutation from b onto the alias. Later, suppose
@@ -65,30 +70,37 @@ struct ViewMeta {
 // --> c.sync_()
 //     --> alias.apply_updates() // after this, the alias will be updated to
 //     reflect the mutation to b
-class Alias {
+struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
  public:
   struct Update {
     const at::Tensor new_val;
     const std::vector<ViewMeta> view_metas;
   };
-  explicit Alias(const at::Tensor& base);
-  const at::Tensor& base() const;
+
+  explicit FunctionalStorageImpl(const Tensor& value);
+
+  void add_update(
+      const Tensor& updated_val,
+      const std::vector<ViewMeta>& view_metas);
+  bool apply_updates();
+  const Tensor& base() {
+    return base_;
+  }
   size_t generation() const {
     return generation_;
   }
-  void add_update(
-      const at::Tensor& updated_val,
-      const std::vector<ViewMeta>& metas);
-  bool apply_updates();
+
+  ~FunctionalStorageImpl() override = default;
 
  private:
   // NB: base_ should always point to a tensor BELOW the current
   // functionalization layer. This is mainly to avoid reference cycles. e.g.
   // given `b = a.view(...)` Both a.storage_ and b.storage_ are a
-  // FunctionStorageImpl containing an Alias, with contains a Tensor `base_`. In
-  // this case (where a and b are FunctionalTensorWrapper's), base_ should point
-  // not to a, but to a's unwrapped value, a.value_` See Note
-  // [Functionalization: Alias Removal] for a diagram that shows this visually.
+  // FunctionStorageImpl containing an Walualias, with contains a Tensor
+  // `base_`. In this case (where a and b are FunctionalTensorWrapper's), base_
+  // should point not to a, but to a's unwrapped value, a.value_` See Note
+  // [Functionalization: Walualias Removal] for a diagram that shows this
+  // visually.
   at::Tensor base_;
   std::vector<Update> updates_;
   // generation_ gets incremented every time a mutation is queued onto the
@@ -97,27 +109,5 @@ class Alias {
   size_t generation_ = 0;
 };
 
-// FunctionalStorageImpl is a subclass of StorageImpl used by the
-// functionalization pass. It has no underlying data (similar to meta storage).
-// It also knows how to reflect mutations to tensors in the absence of a valid
-// data pointer. It does this by separately storing an Alias object, which knows
-// how to reflect mutations that may have happened to views of the original
-// tensor.
-struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
-  explicit FunctionalStorageImpl(const Tensor& value);
-
-  void add_update(
-      const Tensor& updated_val,
-      const std::vector<ViewMeta>& view_metas);
-  bool apply_updates();
-  const Tensor& base();
-  size_t generation() const;
-
-  ~FunctionalStorageImpl() override = default;
-
- private:
-  at::functionalization::Alias alias_;
-};
-
 } // namespace functionalization
 } // namespace at

From 20d0d6b305a4dfbdaee8759e3a7dcfc42c00079d Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 31 Oct 2022 14:42:19 -0700
Subject: [PATCH 0398/1922] Add ability to freeze storages inside
 functionalization (#88141)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88141
Approved by: https://github.com/albanD, https://github.com/bdhirsh
---
 aten/src/ATen/FunctionalStorageImpl.cpp       |  1 +
 aten/src/ATen/FunctionalStorageImpl.h         |  6 ++++++
 aten/src/ATen/FunctionalTensorWrapper.cpp     | 10 ++++++++++
 aten/src/ATen/FunctionalTensorWrapper.h       |  4 ++++
 test/test_functionalization.py                | 12 +++++++++++
 .../python_torch_functions_manual.cpp         | 20 +++++++++++++++++++
 6 files changed, 53 insertions(+)

diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp
index 98b8d88bf230f..8e80ce0ca7ddc 100644
--- a/aten/src/ATen/FunctionalStorageImpl.cpp
+++ b/aten/src/ATen/FunctionalStorageImpl.cpp
@@ -92,6 +92,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
 }
 
 void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
+  TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");
   updates_.push_back({updated_val, metas});
   generation_++;
 }
diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h
index 67e79d2707638..dbaf30c9963d9 100644
--- a/aten/src/ATen/FunctionalStorageImpl.h
+++ b/aten/src/ATen/FunctionalStorageImpl.h
@@ -89,6 +89,9 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
   size_t generation() const {
     return generation_;
   }
+  void freeze() {
+    frozen_ = true;
+  }
 
   ~FunctionalStorageImpl() override = default;
 
@@ -107,6 +110,9 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
   // alias. It is used to determine if a given tensor is "up to date", or if it
   // needs to be regenerated from the alias.
   size_t generation_ = 0;
+  // If frozen, no more mutations are allowed on this storage.  Once frozen, a
+  // storage cannot be unfrozen.
+  bool frozen_ = false;
 };
 
 } // namespace functionalization
diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp
index 03630c39bbf8b..b7b1c51e83b11 100644
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@@ -66,6 +66,10 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& value)
   set_constructor_metadata();
 }
 
+void FunctionalTensorWrapper::freeze_storage() const {
+  functional_storage_impl()->freeze();
+}
+
 // Note [Functionalization: Alias Removal]
 // When someone calls a view() op during the functionalization pass, e.g. 'b = a.view(...)',
 // we link `b` and `a` to a shared Alias object to preserve the aliasing relationship.
@@ -537,6 +541,12 @@ bool isFunctionalTensor(ITensorListRef list) {
   return isFunctionalTensorIListRef(list);
 }
 
+void freeze_functional_tensor(const Tensor& tensor) {
+  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(tensor));
+  auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(tensor);
+  functional_base_impl->freeze_storage();
+}
+
 Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
   TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h
index 9f98353dad868..27a88f13f8722 100644
--- a/aten/src/ATen/FunctionalTensorWrapper.h
+++ b/aten/src/ATen/FunctionalTensorWrapper.h
@@ -100,6 +100,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
   // used to determine if it's up-to-date with its alias. The act of syncing a
   // tensor will set a tensor's generation equal to its alias's generation.
   bool is_up_to_date() const;
+  // Freezes the storage of this tensor, preventing subsequent mutations
+  void freeze_storage() const;
   // Every FunctionalTensorWrapper contains a vector<ViewMeta> objects
   // describing the series of view ops that ran to generate the current tensor
   // from the base tensor. This method is used by inplace-view ops like
@@ -197,6 +199,8 @@ TORCH_API c10::List<c10::optional<Tensor>> to_functional_tensor(
     const c10::List<c10::optional<Tensor>>& t_list);
 TORCH_API std::vector<Tensor> to_functional_tensor(ITensorListRef t_list);
 
+TORCH_API void freeze_functional_tensor(const Tensor& tensor);
+
 TORCH_API Tensor
 from_functional_tensor(const Tensor& tensor, bool assert_functional = true);
 TORCH_API c10::optional<Tensor> from_functional_tensor(
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index 9639c6babbe22..e2cca26c1ea62 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -102,6 +102,18 @@ def f(x):
             return z2
         self.assert_functionalization(f, torch.ones(4))
 
+    def test_freeze(self):
+        def f(x):
+            y = x.clone()
+            z = y[0]
+            torch._freeze_functional_tensor(y)
+            x.add_(1)
+            self.assertRaises(RuntimeError, lambda: y.add_(1))
+            self.assertRaises(RuntimeError, lambda: z.add_(1))
+            return z
+
+        _functionalize(f, reapply_views=True)(torch.ones(3, 3))
+
     def test_view_clone_view_inplace(self):
         def f(input):
             shape = [1, 1024, 128, 128]
diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp
index 949bf1219f5ab..562f5a427d380 100644
--- a/torch/csrc/autograd/python_torch_functions_manual.cpp
+++ b/torch/csrc/autograd/python_torch_functions_manual.cpp
@@ -431,6 +431,22 @@ static PyObject* THPVariable__from_functional_tensor(
   END_HANDLE_TH_ERRORS
 }
 
+static PyObject* THPVariable__freeze_functional_tensor(
+    PyObject* self,
+    PyObject* args,
+    PyObject* kwargs) {
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser(
+      {"_freeze_functional_tensor(Tensor t)"}, /*traceable=*/true);
+
+  ParsedArgs<1> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+  auto self_ = r.tensor(0);
+  at::functionalization::impl::freeze_functional_tensor(self_);
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
 static PyObject* THPVariable__is_functional_tensor(
     PyObject* self,
     PyObject* args,
@@ -535,6 +551,10 @@ static PyMethodDef torch_functions_manual[] = {
      castPyCFunctionWithKeywords(THPVariable__from_functional_tensor),
      METH_VARARGS | METH_KEYWORDS | METH_STATIC,
      nullptr},
+    {"_freeze_functional_tensor",
+     castPyCFunctionWithKeywords(THPVariable__freeze_functional_tensor),
+     METH_VARARGS | METH_KEYWORDS | METH_STATIC,
+     nullptr},
     {"_sync",
      castPyCFunctionWithKeywords(THPVariable__sync),
      METH_VARARGS | METH_KEYWORDS | METH_STATIC,

From 1425d6afef4ecf0e5ba910bc222473d50a0cc19b Mon Sep 17 00:00:00 2001
From: "Han Qi (qihqi)" <qihan@meta.com>
Date: Tue, 1 Nov 2022 16:11:30 +0000
Subject: [PATCH 0399/1922] [codev] Make backport work with flatbuffer models
 (#88127)

Summary: By adding flatbuffer as dependency of backport.

Differential Revision: D40865452

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88127
Approved by: https://github.com/cccclai
---
 buckbuild.bzl                                        | 11 +++++++----
 test/cpp/jit/test_flatbuffer.cpp                     | 12 ++++++++----
 .../jit/mobile/compatibility/backport_manager.cpp    |  2 ++
 torch/csrc/jit/mobile/flatbuffer_loader.cpp          |  6 ++++--
 .../csrc/jit/serialization/flatbuffer_serializer.cpp |  6 ++++--
 5 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/buckbuild.bzl b/buckbuild.bzl
index 0003353f1040f..6ce59928d6968 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -1711,7 +1711,7 @@ def define_buck_targets(
             "torch/csrc/jit/serialization/mobile_bytecode.fbs",
         ],
         outs = {
-            "mobile_bytecode_generated.h": ["mobile_bytecode_generated.h"],
+            "mobile_bytecode_generated_fbsource.h": ["mobile_bytecode_generated.h"],
         },
         cmd = "$(exe {})".format(third_party("flatc")) +
               " --cpp --gen-mutable --scoped-enums -o ${OUT} ${SRCS}",
@@ -1727,7 +1727,9 @@ def define_buck_targets(
         name = "mobile_bytecode",
         header_namespace = "",
         exported_headers = {
-            "torch/csrc/jit/serialization/mobile_bytecode_generated.h": ":mobile_bytecode_header[mobile_bytecode_generated.h]",
+            ("torch/csrc/jit/serialization/mobile_bytecode_generated.h" if IS_OSS
+            else "torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h")
+            : ":mobile_bytecode_header[mobile_bytecode_generated_fbsource.h]",
         },
         # Avoid leaking implementation details by only exposing this header to
         # the internals of the loader/serializer layer.
@@ -1735,6 +1737,9 @@ def define_buck_targets(
             "{}:flatbuffer_loader".format(ROOT),
             "{}:flatbuffer_serializer_mobile".format(ROOT),
         ],
+        exported_deps = [
+            third_party("flatbuffers-api"),
+        ],
     )
 
     fb_xplat_cxx_library(
@@ -1755,7 +1760,6 @@ def define_buck_targets(
             ":mobile_bytecode",
             ":torch_mobile_module",
             C10,
-            third_party("flatbuffers-api"),
         ],
         exported_deps = [
             ":torch_mobile_train",
@@ -1793,7 +1797,6 @@ def define_buck_targets(
         visibility = ["PUBLIC"],
         deps = [
             ":mobile_bytecode",
-            third_party("flatbuffers-api"),
         ],
         exported_deps = [
             ":torch_mobile_deserialize",
diff --git a/test/cpp/jit/test_flatbuffer.cpp b/test/cpp/jit/test_flatbuffer.cpp
index de49838fc9ab6..89efcf7390179 100644
--- a/test/cpp/jit/test_flatbuffer.cpp
+++ b/test/cpp/jit/test_flatbuffer.cpp
@@ -27,6 +27,14 @@
 #include <caffe2/serialize/versions.h>
 #include <torch/csrc/jit/serialization/import_export_functions.h>
 #include <unordered_set>
+
+#if defined(FB_XPLAT_BUILD) || defined(FBCODE_CAFFE2)
+#include <torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h> // NOLINT
+namespace flatbuffers = flatbuffers_fbsource;
+#define FLATBUFFERS_MAX_ALIGNMENT FLATBUFFERS_FBSOURCE_MAX_ALIGNMENT
+#else
+#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h> // NOLINT
+#endif
 // Tests go in torch::jit
 namespace torch {
 namespace jit {
@@ -1796,13 +1804,9 @@ TEST(FlatbufferUpgraderTest, DivScalarInplaceIntV2) {
 
 } // namespace jit
 } // namespace torch
-#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h>
 namespace torch {
 namespace jit {
 
-#if defined(FBCODE_CAFFE2) or defined(FB_XPLAT_BUILD)
-namespace flatbuffers = flatbuffers_fbsource;
-#endif
 /**
  * An Allocator that can only deallocate (using delete []), counting
  * the number of times that it has been asked to deallocate.
diff --git a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
index 2bad08c0765a2..489084912445f 100644
--- a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
+++ b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
@@ -7,6 +7,7 @@
 #include <torch/csrc/jit/mobile/import.h>
 #include <torch/csrc/jit/mobile/module.h>
 #include <torch/csrc/jit/serialization/export.h>
+#include <torch/csrc/jit/serialization/flatbuffer_serializer_jit.h>
 #include <torch/csrc/jit/serialization/import.h>
 #include <torch/csrc/jit/serialization/pickler.h>
 #include <cstddef>
@@ -503,6 +504,7 @@ std::stringstream backport_v7_to_v6(std::stringstream& input_model_stream) {
 
 std::stringstream backport_v9_to_v8(std::stringstream& input_model_stream) {
   ExtraFilesMap extra_files;
+  register_flatbuffer_all();
   Module torch_script =
       torch::jit::load(input_model_stream, c10::nullopt, extra_files);
   std::stringstream intermediate_model_stream;
diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.cpp b/torch/csrc/jit/mobile/flatbuffer_loader.cpp
index fb23e7ee97753..45e31fb5e1747 100644
--- a/torch/csrc/jit/mobile/flatbuffer_loader.cpp
+++ b/torch/csrc/jit/mobile/flatbuffer_loader.cpp
@@ -36,7 +36,6 @@
 #include <torch/csrc/jit/serialization/export_bytecode.h>
 #include <torch/csrc/jit/serialization/import_export_constants.h>
 #include <torch/csrc/jit/serialization/import_read.h>
-#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h>
 #include <torch/custom_class.h>
 
 #ifndef DISABLE_UPGRADER
@@ -50,9 +49,12 @@
 #include <cstdlib>
 #endif
 
-#if defined(FBCODE_CAFFE2) or defined(FB_XPLAT_BUILD)
+#if defined(FB_XPLAT_BUILD) || defined(FBCODE_CAFFE2)
+#include <torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h> // NOLINT
 namespace flatbuffers = flatbuffers_fbsource;
 #define FLATBUFFERS_MAX_ALIGNMENT FLATBUFFERS_FBSOURCE_MAX_ALIGNMENT
+#else
+#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h> // NOLINT
 #endif
 
 namespace torch {
diff --git a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp
index 690541450a441..54ec7c7b6ed3e 100644
--- a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp
+++ b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp
@@ -20,11 +20,13 @@
 #include <torch/csrc/jit/mobile/train/export_data.h>
 #include <torch/csrc/jit/passes/inliner.h>
 #include <torch/csrc/jit/runtime/instruction.h>
-#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h> // NOLINT
 
-#if defined(FBCODE_CAFFE2) or defined(FB_XPLAT_BUILD)
+#if defined(FB_XPLAT_BUILD) || defined(FBCODE_CAFFE2)
+#include <torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h> // NOLINT
 namespace flatbuffers = flatbuffers_fbsource;
 #define FLATBUFFERS_MAX_ALIGNMENT FLATBUFFERS_FBSOURCE_MAX_ALIGNMENT
+#else
+#include <torch/csrc/jit/serialization/mobile_bytecode_generated.h> // NOLINT
 #endif
 
 namespace torch {

From e35df53e4c3db2985468cfb7a95408e8863adb94 Mon Sep 17 00:00:00 2001
From: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Date: Tue, 1 Nov 2022 16:43:58 +0000
Subject: [PATCH 0400/1922] Add unit test for torch_geometric library (#85937)

Fixes #65138

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85937
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 53 ++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 9f95227e009a9..e310666bd4fab 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -12575,6 +12575,59 @@ def forward(self, x):
                 x,
             )
 
+    @skipScriptTest()
+    @skipIfUnsupportedMinOpsetVersion(16)
+    @unittest.skipIf(
+        not torch.hub._check_module_exists("torch_geometric"),
+        "torch_geometric not installed.",
+    )
+    def test_sage_conv(self):
+        from torch_geometric import nn as torch_geometric_nn
+
+        # Input
+        coords0 = torch.randn(1, 6)
+        coords1 = torch.randn(1, 6)
+        coords = torch.transpose(torch.cat((coords0, coords1), dim=0), 0, 1)
+        adj = torch_geometric_nn.knn_graph(coords, k=2, batch=None, loop=True)
+        edge_from = adj[0:1, :]
+        edge_to = adj[1:, :]
+        inputs = (coords0, coords1, edge_from, edge_to)
+
+        class MySAGEConv(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.SAGEConvBlock1 = torch_geometric_nn.SAGEConv(
+                    2, 512, normalize=True
+                )
+                self.bano1 = torch_geometric_nn.BatchNorm(512)
+                self.relu = torch.nn.ReLU()
+                self.dense1 = torch.nn.Seq(Lin(512, 1))
+                self.sigmoid = torch.nn.Sigmoid()
+
+            def forward(self, coords0, coords1, edge_from, edge_to):
+                adj = torch.cat((edge_from, edge_to), dim=0)
+                gra = torch.transpose(torch.cat((coords0, coords1), dim=0), 0, 1)
+                x1 = self.SAGEConvBlock1(gra, edge_index=adj)
+                x = torch.unsqueeze(torch.sum(x1), dim=0)
+                return x
+
+        input_names = ["coords0", "coords1", "edge_from", "edge_to"]
+        output_names = ["outputs"]
+        dynamic_axes = {
+            "coords0": {0: "batch_size", 1: "features"},
+            "coords1": {0: "batch_size", 1: "features"},
+            "edge_from": {0: "batch_size", 1: "features"},
+            "edge_to": {0: "batch_size", 1: "features"},
+            "outputs": {0: "batch_size"},
+        }
+        self.run_test(
+            MySAGEConv(),
+            inputs,
+            input_names=input_names,
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+        )
+
     # Cannot export with older opsets because of "ConstantFill" op
     # ConstantFill was a temp op removed at opset 8. This is no longer supported by onnxruntime
     # There are still some issues prevent us from enabling script test for these scenarios:

From a341dea45f910656365cdb3eff7729eaf9da3f74 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Tue, 1 Nov 2022 02:02:20 +0000
Subject: [PATCH 0401/1922] Fix meta function for aten.addmm (#88068)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88068
Approved by: https://github.com/albanD
---
 test/test_meta.py               | 1 -
 torch/_decomp/decompositions.py | 9 ++++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index 378e114082704..e379cd5d14f2a 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -298,7 +298,6 @@ def test_tensor_outlives_converter(self):
     aten._linalg_svd.default,
     aten._scaled_dot_product_attention_forward.default,
     aten.add.Tensor,
-    aten.addmm.default,
     aten.atan2.default,
     aten.binary_cross_entropy.default,
     aten.bitwise_and.Tensor,
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 7ba4e6c4e97c8..4bda82e490fc0 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1111,7 +1111,14 @@ def addmm(self: Tensor, mat1: Tensor, mat2: Tensor, beta: int = 1, alpha: int =
     out = alpha * torch.mm(mat1, mat2)
     if beta == 0:
         return out
-    return beta * self + out
+
+    # The output of aten.addmm is contiguous, we need to match this behavior in the decomposition.
+    # The original implementation 'beta * self + out' would return a strided tensor if `self` is strided.
+    # We thus use `out`, the output of torch.mm, which is always contiguous, as the first argument for addition.
+    # This is relying on TensorIterator's behavior that it takes higher precedence on the stride of first input.
+    # Alternative, we can write `(beta * self + out).contiguous()`, but it introduces another copy in some cases.
+    # This implementation is not ideal, and we should revisit this when we have a better solution.
+    return out + beta * self
 
 
 # This computes the mean and variance along the specifized normalization dims,

From 8ffc23411dec3ec6d430ac7d85ba7fd242264406 Mon Sep 17 00:00:00 2001
From: Yidi Wu <yidi@meta.com>
Date: Tue, 1 Nov 2022 17:10:45 +0000
Subject: [PATCH 0402/1922] torchdynamo support modules() for nn_module
 (#88023)

Differential Revision: D40820879

This diff allows models to call self.modules() during dynamo tracing.

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88023
Approved by: https://github.com/tugsbayasgalan, https://github.com/voznesenskym, https://github.com/jansel
---
 test/dynamo/test_repros.py           | 20 ++++++++++++++++++++
 torch/_dynamo/variables/nn_module.py |  2 ++
 2 files changed, 22 insertions(+)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 41564952a7444..fe32d2d98f856 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1758,6 +1758,26 @@ def forward(self, inp):
         args = (torch.randn(3, 4),)
         self.assertTrue(same(mod(*args), opt_mod(*args)))
 
+    def test_modules(self):
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc = torch.nn.Linear(4, 3)
+
+            def forward(self, inp):
+                res = torch.zeros(3, 3)
+                for mod in self.modules():
+                    res += self.fc(inp)
+                return res
+
+        mod = Foo()
+        args = (torch.ones(3, 4),)
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt, nopython=True)(mod)
+        self.assertTrue(same(mod(*args), opt_mod(*args)))
+        self.assertEqual(cnt.op_count, 5)
+        self.assertEqual(cnt.frame_count, 1)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 6f7c2ff287373..1922980fc957f 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -337,6 +337,8 @@ def named_embed(name, obj):
             ):
                 result.append(named_embed(name, submod))
             return ListIteratorVariable(result, mutable_local=MutableLocal(), **options)
+        elif name == "modules":
+            return wrap_values(module.named_modules())
         elif name == "parameters":
             return wrap_values(module.named_parameters(**get_kwargs("recurse")))
         elif name == "values":

From 3a49589788a668c7269cd7b8453e16ad4ea87ad5 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 13:36:03 +0000
Subject: [PATCH 0403/1922] [FSDP()][14/N] Refactor pre-forward/post-backward
 (#87927)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87927
Approved by: https://github.com/mrshenli
---
 test/distributed/fsdp/test_fsdp_core.py       |  14 +-
 .../fsdp/test_fsdp_param_exec_order_wrap.py   | 134 ------
 torch/distributed/fsdp/_common_utils.py       |  25 +-
 torch/distributed/fsdp/_runtime_utils.py      | 391 ++++++++++++++++-
 torch/distributed/fsdp/flat_param.py          |  10 +-
 .../fsdp/fully_sharded_data_parallel.py       | 394 +-----------------
 6 files changed, 439 insertions(+), 529 deletions(-)
 delete mode 100644 test/distributed/fsdp/test_fsdp_param_exec_order_wrap.py

diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py
index 93d5e4f45ad28..b0d52527dd10b 100644
--- a/test/distributed/fsdp/test_fsdp_core.py
+++ b/test/distributed/fsdp/test_fsdp_core.py
@@ -366,12 +366,14 @@ def test_register_functions_called(self, cuda_first: bool, mixed_precision: bool
         )
         input = fsdp_model.module.get_input(torch.device("cuda"))
         fsdp_model._register_pre_backward_hooks = mock.MagicMock(return_value=None)
-        fsdp_model._register_post_backward_hooks = mock.MagicMock(return_value=None)
-        self.assertFalse(fsdp_model._register_post_backward_hooks.called)
-        self.assertFalse(fsdp_model._register_pre_backward_hooks.called)
-        fsdp_model(*input)
-        self.assertTrue(fsdp_model._register_post_backward_hooks.called)
-        self.assertTrue(fsdp_model._register_pre_backward_hooks.called)
+        with mock.patch(
+            "torch.distributed.fsdp._runtime_utils._register_post_backward_hooks"
+        ) as register_post_bwd_mock:
+            self.assertFalse(fsdp_model._register_pre_backward_hooks.called)
+            self.assertFalse(register_post_bwd_mock.called)
+            fsdp_model(*input)
+            self.assertTrue(fsdp_model._register_pre_backward_hooks.called)
+            self.assertTrue(register_post_bwd_mock.called)
 
 
 class TestNoGrad(FSDPTest):
diff --git a/test/distributed/fsdp/test_fsdp_param_exec_order_wrap.py b/test/distributed/fsdp/test_fsdp_param_exec_order_wrap.py
deleted file mode 100644
index a1c73d1cafb53..0000000000000
--- a/test/distributed/fsdp/test_fsdp_param_exec_order_wrap.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Owner(s): ["oncall: distributed"]
-
-from typing import Any, Callable
-
-import torch
-from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp._symbolic_trace import TracingConfig
-from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy
-from torch.distributed.fsdp.wrap import always_wrap_policy, ParamExecOrderWrapPolicy
-from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import FSDPTest
-from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
-    parametrize,
-    run_tests,
-)
-
-
-class Model(torch.nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-        self.layer0 = torch.nn.Linear(6, 6)
-        self.layer1 = torch.nn.Linear(6, 6, bias=False)
-        self.layer2 = torch.nn.Sequential(
-            torch.nn.Linear(6, 3, bias=False),
-            torch.nn.ReLU(),
-            torch.nn.Linear(3, 6, bias=False),
-        )
-        self.relu = torch.nn.ReLU()
-
-    def forward(self, x: Any, use_all_params: bool = True):
-        # `layer0` -> `layer2` -> `layer1`
-        # the forward execution order is NOT consistent with the model definition order.
-        z = self.relu(self.layer0(x))
-        z = self.relu(self.layer2(z))
-        if use_all_params:
-            z = self.relu(self.layer1(z))
-        return z
-
-    def get_input(self, device: torch.device):
-        return (torch.randn((8, 6)).to(device),)
-
-    def get_loss(self, input, output):
-        return (output - input[0]).sum()
-
-    @staticmethod
-    def wrap(
-        sharding_strategy: ShardingStrategy,
-        device: torch.device,
-        wrap_policy: Callable,
-    ) -> torch.nn.Module:
-        model = Model()
-        fsdp_model = FSDP(
-            model, auto_wrap_policy=wrap_policy, sharding_strategy=sharding_strategy
-        )
-        return fsdp_model.to(device)
-
-
-class TestFSDPExecOrder(FSDPTest):
-    @property
-    def device(self):
-        return torch.device("cuda")
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize(
-        "sharding_strategy",
-        [ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP],
-    )
-    def test_fsdp_flatten_params_exec_order(
-        self,
-        sharding_strategy: ShardingStrategy,
-    ):
-        """
-        Test ``_fsdp_params_exec_order`` with ``ParamExecOrderWrapPolicy``,
-        after running one iteration of forward and backward pass.
-        Here ``torch.fx`` is not enabled inside ``ParamExecOrderWrapPolicy``.
-        """
-        wrap_policy = ParamExecOrderWrapPolicy(init_policy=always_wrap_policy)
-        fsdp_model = Model.wrap(sharding_strategy, self.device, wrap_policy=wrap_policy)
-        self.assertTrue(fsdp_model._is_param_exec_order_prep_stage())
-        # run one iteration to record the execution ordering
-        input = fsdp_model.module.get_input(self.device)
-        output = fsdp_model(*input)
-        loss = fsdp_model.module.get_loss(input, output).to(self.device)
-        loss.backward()
-        params_list = list(fsdp_model.parameters())
-        # Since the forward execution order is NOT consistent with
-        # the model definition order, the ordering in flatten_named_params_exec_order
-        # should be different from named_parameters.
-        self.assertEqual(
-            fsdp_model._fsdp_params_exec_order,
-            [params_list[0], params_list[2], params_list[3], params_list[1]],
-        )
-        self.assertTrue(fsdp_model._use_param_exec_order_policy())
-        self.assertTrue(not fsdp_model._is_param_exec_order_prep_stage())
-
-    @skip_if_lt_x_gpu(2)
-    @parametrize(
-        "sharding_strategy",
-        [ShardingStrategy.FULL_SHARD, ShardingStrategy.SHARD_GRAD_OP],
-    )
-    def test_fsdp_flatten_params_exec_order_symbolic_trace(
-        self,
-        sharding_strategy: ShardingStrategy,
-    ):
-        """
-        Tests ``ParamExecOrderWrapPolicy`` with symbolic tracing.
-        With symbolic tracing enabled, ``_is_param_exec_order_prep_stage``
-        should always set as False.
-        """
-        wrap_policy = ParamExecOrderWrapPolicy(
-            init_policy=always_wrap_policy,
-            tracing_config=TracingConfig(concrete_args={"use_all_params": False}),
-        )
-        fsdp_model = Model.wrap(
-            sharding_strategy,
-            self.device,
-            wrap_policy=wrap_policy,
-        )
-        params_list = list(fsdp_model.parameters())
-        # Since the forward execution order is NOT consistent with the model definition order,
-        # the ordering in flatten_named_params_exec_order should be different from named_parameters
-        self.assertEqual(
-            fsdp_model._fsdp_params_exec_order,
-            [params_list[0], params_list[2], params_list[3]],
-        )
-        self.assertTrue(fsdp_model._use_param_exec_order_policy())
-        self.assertTrue(not fsdp_model._is_param_exec_order_prep_stage())
-
-
-instantiate_parametrized_tests(TestFSDPExecOrder)
-
-if __name__ == "__main__":
-    run_tests()
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index 6d3681fc69c06..34f45fc776b2a 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -2,8 +2,9 @@
 This file includes private common utilities for FSDP.
 """
 
+import traceback
 from enum import auto, Enum
-from typing import Callable, Dict, List, Union
+from typing import Callable, Dict, List, no_type_check, Union
 
 import torch
 import torch.distributed.fsdp.flat_param as flat_param_file
@@ -153,3 +154,25 @@ def f(module: torch.nn.Module, prefix: str, *args, **kwargs):
 
     f(root_module, "", *args, **kwargs)
     return return_fn(*args, **kwargs)
+
+
+@no_type_check
+def _assert_in_training_states(
+    state: _State,
+    training_states: List[TrainingState],
+) -> None:
+    """Asserts that FSDP is in the states ``_training_states``."""
+    # Raise a `ValueError` instead of using `assert` to ensure that these
+    # logical assertions run even if `assert`s are disabled
+    if state.training_state not in training_states:
+        msg = (
+            f"expected to be in states {training_states} but current state is "
+            f"{state.training_state}"
+        )
+        # Print the error on rank 0 in case this is called in the backward pass
+        if state.rank == 0:
+            if isinstance(state, nn.Module):
+                print(f"Asserting FSDP instance is: {state}")
+            print(f"ERROR: {msg}")
+            traceback.print_stack()
+        raise ValueError(msg)
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 4311e27bf74f3..c79dfe6482f7d 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -1,9 +1,24 @@
-from typing import Any, List, no_type_check, Optional, Tuple
+import functools
+from typing import Any, Callable, List, no_type_check, Optional, Tuple
 
 import torch
-from torch.distributed.fsdp._common_utils import _State
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS
+from torch.distributed.fsdp._common_utils import (
+    _assert_in_training_states,
+    _State,
+    TrainingState,
+)
 from torch.distributed.fsdp._utils import _apply_to_tensors, p_assert
-from torch.distributed.fsdp.flat_param import FlatParamHandle
+from torch.distributed.fsdp.api import BackwardPrefetch
+from torch.distributed.fsdp.flat_param import (
+    _HandlesKey,
+    FlatParameter,
+    FlatParamHandle,
+    HandleShardingStrategy,
+    HandleTrainingState,
+)
 from torch.distributed.utils import _to_kwargs
 
 
@@ -91,6 +106,376 @@ def _reshard_grads(
         handle.reshard_grad()
 
 
+@no_type_check
+def _pre_forward(
+    state: _State,
+    handles: List[FlatParamHandle],
+    unshard_fn: Callable,
+    module: nn.Module,
+    input: Any,
+):
+    """
+    Runs the pre-forward logic. This includes an opportunity to unshard
+    currently sharded parameters such as those for the current forward and
+    registering post-backward hooks for these current parameters.
+
+    Args:
+        handles (List[FlatParamHandle]): Handles giving the parameters used in
+            the current forward.
+        unshard_fn (Optional[Callable]): A callable to unshard any currently
+            sharded parameters or ``None`` to not do any unsharding.
+        module (nn.Module): Module whose forward this method runs right before.
+        input (Any): Unused; expected by the hook signature.
+    """
+    state.training_state = TrainingState.FORWARD_BACKWARD
+    state._exec_order_data.record_pre_forward(handles, module.training)
+    for handle in handles:
+        handle._training_state = HandleTrainingState.FORWARD
+    if unshard_fn is not None:
+        unshard_fn()
+    # Register post-backward hooks to reshard the parameters and reduce-scatter
+    # their gradients. They must be re-registered every forward pass in case
+    # the `grad_fn` is mutated.
+    _register_post_backward_hooks(state, handles)
+
+
+@no_type_check
+@torch.no_grad()
+def _post_backward_hook(
+    state: _State,
+    handle: FlatParamHandle,
+    *unused: Any,
+):
+    """
+    Reduce-scatters the gradient of ``handle`` 's ``FlatParameter``.
+
+    Precondition: The ``FlatParameter`` 's ``.grad`` attribute contains the
+    unsharded gradient for the local batch.
+
+    Postcondition:
+    - If using ``NO_SHARD``, then the ``.grad`` attribute is the reduced
+    unsharded gradient.
+    - Otherwise, the ``_saved_grad_shard`` attribute is the reduced sharded
+    gradient (accumulating with any existing gradient).
+    """
+    param = handle.flat_param
+    param._post_backward_called = True
+    with torch.autograd.profiler.record_function(
+        "FullyShardedDataParallel._post_backward_hook"
+    ):
+        _assert_in_training_states(state, [TrainingState.FORWARD_BACKWARD])
+        state.training_state = TrainingState.FORWARD_BACKWARD
+        p_assert(
+            handle._training_state == HandleTrainingState.BACKWARD_PRE,
+            f"Expects `BACKWARD_PRE` state but got {handle._training_state}",
+        )
+        handle._training_state = HandleTrainingState.BACKWARD_POST
+
+        if param.grad is None:
+            return
+        if param.grad.requires_grad:
+            raise RuntimeError("FSDP does not support gradients of gradients")
+
+        free_unsharded_flat_param = _should_free_in_backward(state, handle)
+        _reshard(state, [handle], [free_unsharded_flat_param])
+
+        # TODO: Post-backward prefetching does not support the multiple handles
+        # per module case since the post-backward hook runs per handle, not per
+        # group of handles.
+        handles_key = (handle,)
+        _prefetch_handles(state, handles_key)
+
+        if not state._sync_gradients:
+            return
+
+        # Wait for all ops in the current stream (e.g. gradient
+        # computation) to finish before reduce-scattering the gradient
+        state._streams["post_backward"].wait_stream(torch.cuda.current_stream())
+
+        with torch.cuda.stream(state._streams["post_backward"]):
+            unsharded_grad_data = param.grad.data
+            if state._exec_order_data.is_first_iter:  # only check once
+                _check_comm_hook(
+                    state._communication_hook, state._communication_hook_state
+                )
+            if handle._uses_reduce_mixed_precision and not _low_precision_hook_enabled(
+                state
+            ):
+                # TODO: Use the low precision communication hook directly
+                param.grad.data = param.grad.to(state.mixed_precision.reduce_dtype)
+
+            if handle.uses_sharded_strategy:
+                # We clear `.grad` to permit multiple backwards. This avoids a
+                # race where the second backward pass computation precedes
+                # ahead of the first backward pass reduction, which is possible
+                # since the reduction is issued in a separate stream and is
+                # async and would result in reducing the wrong gradient.
+                unsharded_grad = param.grad.data
+                param.grad = None
+                p_assert(
+                    len(unsharded_grad.size()) == 1,
+                    f"Expects gradient to be flattened but got {unsharded_grad.size()}",
+                )
+                chunks = list(unsharded_grad.chunk(state.world_size))
+                numel_to_pad = (
+                    state.world_size * chunks[0].numel() - unsharded_grad.numel()
+                )
+                padded_unsharded_grad = F.pad(unsharded_grad, [0, numel_to_pad])
+                new_sharded_grad = torch.zeros_like(chunks[0])  # padded
+                state._communication_hook(
+                    state._communication_hook_state,
+                    padded_unsharded_grad,
+                    new_sharded_grad,
+                )
+                _cast_grad_to_param_dtype(state, handle, new_sharded_grad, param)
+
+                # Save the sharded gradient in `_saved_grad_shard` to support
+                # gradient accumulation -- for multiple backwards, the gradient
+                # reductions may happen in arbitrary order
+                accumulate_grad = hasattr(param, "_saved_grad_shard")
+                if accumulate_grad:
+                    _check_grad_to_accumulate(new_sharded_grad, param._saved_grad_shard)
+                    param._saved_grad_shard += new_sharded_grad
+                else:
+                    param._saved_grad_shard = new_sharded_grad
+                sharded_grad = param._saved_grad_shard
+            else:
+                state._communication_hook(state._communication_hook_state, param.grad)
+                # For `NO_SHARD`, we can keep the low precision gradients by
+                # simply omitting the cast altogether
+                if not handle._keep_low_precision_grads:
+                    _cast_grad_to_param_dtype(state, handle, param.grad, param)
+                sharded_grad = param.grad.data
+
+            if handle._config.offload_params:
+                # Offload the gradient to CPU to ensure parameters and
+                # gradients are on the same device as required by the optimizer
+                param._cpu_grad.copy_(  # type: ignore[attr-defined]
+                    sharded_grad.detach(), non_blocking=True
+                )  # synchronized in the post-backward callback
+                # Since the sharded gradient is produced in the post-backward
+                # stream and consumed later in the computation stream, inform
+                # the caching allocator
+                sharded_grad.data.record_stream(torch.cuda.current_stream())
+
+            # Since the unsharded gradient is produced in the computation
+            # stream and consumed in the post-backward stream, inform the
+            # caching allocator (before it goes out of scope)
+            unsharded_grad_data.record_stream(state._streams["post_backward"])
+
+            if handle._use_orig_params:
+                # Since the handle's `FlatParameter` completed its gradient
+                # computation, we should reset the gradient noneness mask
+                handle._reset_is_grad_none()
+                # Delay using sharded gradient views until after the
+                # reduce-scatter instead of immediately after resharding
+                handle._use_sharded_grad_views()
+
+
+@no_type_check
+def _should_free_in_backward(
+    state: _State,
+    handle: FlatParamHandle,
+) -> bool:
+    """
+    Returns whether FSDP should free the unsharded flattened parameter in the
+    post-backward or not.
+    """
+    return (
+        state._sync_gradients and handle.uses_sharded_strategy
+    ) or handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD
+
+
+@no_type_check
+def _cast_grad_to_param_dtype(
+    state: _State,
+    handle: FlatParamHandle,
+    sharded_grad: torch.Tensor,
+    param: FlatParameter,
+):
+    """
+    Casts ``sharded_grad`` back to the full parameter dtype so that the
+    optimizer step runs with that dtype. This performs an actual cast if
+    1. parameters were in reduced precision during the forward since then
+    gradients would be in that reduced precision, or
+    2. parameters were not in reduced precision but gradients were in
+    reduced precision for communication.
+    However, if a low precision communication hook is registered, then this
+    dtype cast happens in the hook instead.
+    """
+    _assert_in_training_states(state, [TrainingState.FORWARD_BACKWARD])
+    if not _low_precision_hook_enabled(state) and (
+        handle._uses_param_mixed_precision or handle._uses_reduce_mixed_precision
+    ):
+        low_prec_grad_data = sharded_grad.data
+        sharded_grad.data = sharded_grad.data.to(dtype=param.dtype)
+        # Since for `NO_SHARD`, the gradient is produced in the computation
+        # stream and consumed here in the post-backward stream, inform the
+        # caching allocator; for the sharded strategies, the gradient is
+        # produced in the post-backward stream, so this `record_stream()`
+        # should be a no-op
+        low_prec_grad_data.record_stream(torch.cuda.current_stream())
+
+
+def _check_comm_hook(
+    comm_hook: Any,
+    comm_hook_state: Any,
+) -> None:
+    p_assert(comm_hook is not None, "Communication hook should not be `None`")
+    p_assert(
+        comm_hook_state is not None, "Communication hook state should not be `None`"
+    )
+
+
+def _check_grad_to_accumulate(
+    new_sharded_grad: torch.Tensor,
+    accumulated_grad: torch.Tensor,
+) -> None:
+    p_assert(
+        accumulated_grad.shape == new_sharded_grad.shape,
+        "Shape mismatch when accumulating gradients: "
+        f"existing gradient shape={accumulated_grad.shape} "
+        f"new gradient shape={new_sharded_grad.shape}",
+    )
+    p_assert(
+        accumulated_grad.device == new_sharded_grad.device,
+        "Device mismatch when accumulating gradients: "
+        f"existing gradient device={accumulated_grad.device} "
+        f"new gradient device={new_sharded_grad.device}",
+    )
+
+
+@no_type_check
+def _low_precision_hook_enabled(state: _State) -> bool:
+    return state._communication_hook in LOW_PRECISION_HOOKS
+
+
+@no_type_check
+def _prefetch_handles(
+    state: _State,
+    current_handles_key: _HandlesKey,
+) -> None:
+    """
+    Prefetches the next handles if needed (without synchronization). An empty
+    handles key cannot prefetch.
+    """
+    if not current_handles_key:
+        return
+    handles_to_prefetch = _get_handles_to_prefetch(state, current_handles_key)
+    for handles_key in handles_to_prefetch:
+        # Prefetch the next set of handles without synchronizing to allow
+        # the sync to happen as late as possible to maximize overlap
+        _unshard(
+            state, handles_key, state._streams["unshard"], state._streams["pre_unshard"]
+        )
+        state._handles_prefetched[handles_key] = True
+
+
+@no_type_check
+def _get_handles_to_prefetch(
+    state: _State,
+    current_handles_key: _HandlesKey,
+) -> List[_HandlesKey]:
+    """
+    Returns a :class:`list` of the handles keys to prefetch for the next
+    module(s), where ``current_handles_key`` represents the current module.
+
+    "Prefetching" refers to running the unshard logic early (without
+    synchronization), and the "next" modules depend on the recorded execution
+    order and the current training state.
+    """
+    training_state = _get_training_state(current_handles_key)
+    valid_training_states = (
+        HandleTrainingState.BACKWARD_PRE,
+        HandleTrainingState.BACKWARD_POST,
+        HandleTrainingState.FORWARD,
+    )
+    p_assert(
+        training_state in valid_training_states,
+        f"Prefetching is only supported in {valid_training_states} but "
+        f"currently in {training_state}",
+    )
+    eod = state._exec_order_data
+    target_handles_keys: List[_HandlesKey] = []
+    if (
+        training_state == HandleTrainingState.BACKWARD_PRE
+        and state.backward_prefetch == BackwardPrefetch.BACKWARD_PRE
+    ) or (
+        training_state == HandleTrainingState.BACKWARD_POST
+        and state.backward_prefetch == BackwardPrefetch.BACKWARD_POST
+    ):
+        target_handles_keys = [
+            target_handles_key
+            for target_handles_key in eod.get_handles_to_backward_prefetch(
+                current_handles_key
+            )
+            if state._needs_pre_backward_unshard.get(target_handles_key, False)
+            and not state._handles_prefetched.get(target_handles_key, False)
+        ]
+    elif training_state == HandleTrainingState.FORWARD and state.forward_prefetch:
+        target_handles_keys = [
+            target_handles_key
+            for target_handles_key in eod.get_handles_to_forward_prefetch(
+                current_handles_key
+            )
+            if state._needs_pre_forward_unshard.get(target_handles_key, False)
+            and not state._handles_prefetched.get(target_handles_key, False)
+        ]
+    return target_handles_keys
+
+
+def _get_training_state(
+    handles_key: _HandlesKey,
+) -> HandleTrainingState:
+    """Returns the training state of the handles in ``handles_key``."""
+    p_assert(len(handles_key) > 0, "Expects a non-empty handles key")
+    training_states = set(handle._training_state for handle in handles_key)
+    p_assert(
+        len(training_states) == 1,
+        f"Expects uniform training state but got {training_states}",
+    )
+    return next(iter(training_states))
+
+
+def _register_post_backward_hooks(
+    state: _State,
+    handles: List[FlatParamHandle],
+) -> None:
+    """
+    Registers post-backward hooks on the ``FlatParameter`` s'
+    ``AccumulateGrad`` objects to reshard and to reduce-scatter gradients.
+
+    The ``AccumulateGrad`` object represents the last function that finalizes
+    the ``FlatParameter`` 's gradient, so it only runs after its entire
+    gradient computation has finished.
+
+    We register the post-backward hook only once in the *first* forward that a
+    ``FlatParameter`` participates in. This relies on the ``AccumulateGrad``
+    object being preserved through multiple forwards.
+    """
+    # If there is no gradient computation, then there is no need for
+    # post-backward logic
+    if not torch.is_grad_enabled():
+        return
+    for handle in handles:
+        flat_param = handle.flat_param
+        already_registered = hasattr(flat_param, "_post_backward_hook_state")
+        if already_registered or not flat_param.requires_grad:
+            continue
+        # Get the `AccumulateGrad` object
+        temp_flat_param = flat_param.expand_as(flat_param)
+        p_assert(
+            temp_flat_param.grad_fn is not None,
+            "The `grad_fn` is needed to access the `AccumulateGrad` and "
+            "register the post-backward hook",
+        )
+        acc_grad = temp_flat_param.grad_fn.next_functions[0][0]
+        hook_handle = acc_grad.register_hook(
+            functools.partial(_post_backward_hook, state, handle)
+        )
+        flat_param._post_backward_hook_state = (acc_grad, hook_handle)  # type: ignore[attr-defined]
+
+
 def _wait_for_computation_stream(
     computation_stream: torch.cuda.Stream,
     unshard_stream: torch.cuda.Stream,
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 5fd130826faa6..50f3d5f0ef7eb 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -106,7 +106,7 @@ class HandleConfig:
     offload_params: bool
     low_prec_param_dtype: Optional[torch.dtype]
     low_prec_reduce_dtype: Optional[torch.dtype]
-    keep_low_precision_grads: Optional[bool] = False
+    keep_low_precision_grads: bool = False
 
 
 class FlatParameter(nn.Parameter):
@@ -1800,6 +1800,14 @@ def uses_sharded_strategy(self) -> bool:
     def _uses_param_mixed_precision(self) -> bool:
         return self._config.low_prec_param_dtype is not None
 
+    @property
+    def _uses_reduce_mixed_precision(self) -> bool:
+        return self._config.low_prec_reduce_dtype is not None
+
+    @property
+    def _keep_low_precision_grads(self) -> bool:
+        return self._config.keep_low_precision_grads
+
     @property
     def _force_full_precision(self) -> bool:
         return (
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 412a7e0ed5fe3..3d92a96f99dbf 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -24,7 +24,6 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-import torch.nn.functional as F
 from torch.autograd import Variable
 from torch.distributed import ProcessGroup
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
@@ -54,9 +53,12 @@
 )
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
+    _pre_forward,
+    _prefetch_handles,
     _prepare_forward_inputs,
     _reshard,
     _reshard_grads,
+    _should_free_in_backward,
     _unshard,
     _unshard_grads,
     _wait_for_computation_stream,
@@ -87,12 +89,7 @@
     _pre_load_state_dict_hook,
 )
 from ._utils import _apply_to_tensors, _free_storage, p_assert
-from .flat_param import (
-    _HandlesKey,
-    FlatParameter,
-    FlatParamHandle,
-    HandleShardingStrategy,
-)
+from .flat_param import FlatParameter, FlatParamHandle, HandleShardingStrategy
 from .wrap import ParamExecOrderWrapPolicy
 
 _TORCH_FX_AVAIL = True
@@ -926,92 +923,6 @@ def _init_streams(self) -> None:
         # CPU offloading (H2D copy) and mixed precision (low precision cast).
         self._streams["pre_unshard"] = torch.cuda.Stream()
 
-    def _prefetch_handles(
-        self,
-        current_handles_key: _HandlesKey,
-    ) -> None:
-        """
-        Prefetches the next handles if needed (without synchronization). An
-        empty handles key cannot prefetch.
-        """
-        if not current_handles_key:
-            return
-        handles_to_prefetch = self._get_handles_to_prefetch(current_handles_key)
-        for handles_key in handles_to_prefetch:
-            # Prefetch the next set of handles without synchronizing to allow
-            # the sync to happen as late as possible to maximize overlap
-            _unshard(
-                self,
-                handles_key,
-                self._streams["unshard"],
-                self._streams["pre_unshard"],
-            )
-            self._handles_prefetched[handles_key] = True
-
-    def _get_handles_to_prefetch(
-        self,
-        current_handles_key: _HandlesKey,
-    ) -> List[_HandlesKey]:
-        """
-        Returns a :class:`list` of the handles keys to prefetch for the next
-        module(s), where ``current_handles_key`` represents the current module.
-
-        "Prefetching" refers to running the unshard logic early (without
-        synchronization), and the "next" modules depend on the recorded
-        execution order and the current training state.
-        """
-        training_state = self._get_training_state(current_handles_key)
-        valid_training_states = (
-            HandleTrainingState.BACKWARD_PRE,
-            HandleTrainingState.BACKWARD_POST,
-            HandleTrainingState.FORWARD,
-        )
-        p_assert(
-            training_state in valid_training_states,
-            f"Prefetching is only supported in {valid_training_states} but "
-            f"currently in {training_state}",
-        )
-        eod = self._exec_order_data
-        target_handles_keys: List[_HandlesKey] = []
-        if (
-            training_state == HandleTrainingState.BACKWARD_PRE
-            and self.backward_prefetch == BackwardPrefetch.BACKWARD_PRE
-        ) or (
-            training_state == HandleTrainingState.BACKWARD_POST
-            and self.backward_prefetch == BackwardPrefetch.BACKWARD_POST
-        ):
-            target_handles_keys = [
-                target_handles_key
-                for target_handles_key in eod.get_handles_to_backward_prefetch(
-                    current_handles_key
-                )
-                if self._needs_pre_backward_unshard.get(target_handles_key, False)
-                and not self._handles_prefetched.get(target_handles_key, False)
-            ]
-        elif training_state == HandleTrainingState.FORWARD and self.forward_prefetch:
-            target_handles_keys = [
-                target_handles_key
-                for target_handles_key in eod.get_handles_to_forward_prefetch(
-                    current_handles_key
-                )
-                if self._needs_pre_forward_unshard.get(target_handles_key, False)
-                and not self._handles_prefetched.get(target_handles_key, False)
-            ]
-        return target_handles_keys
-
-    def _get_training_state(
-        self,
-        handles_key: _HandlesKey,
-    ) -> HandleTrainingState:
-        """Returns the training state of the handles in ``handles_key``."""
-        p_assert(len(handles_key) > 0, "Expects a non-empty handles key")
-        training_states = set(handle._training_state for handle in handles_key)
-        p_assert(
-            len(training_states) == 1,
-            f"Expects uniform training state but got {training_states}",
-        )
-        return next(iter(training_states))
-
     @staticmethod
     def set_state_dict_type(
         module: nn.Module,
@@ -1291,7 +1202,9 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
                 self._handles,
                 free_unsharded_flat_params,
             )
-            self._pre_forward(self._handles, unshard_fn, unused, unused)
+            _pre_forward(
+                self, self._handles, unshard_fn, self._fsdp_wrapped_module, unused
+            )
             for handle in self._handles:
                 p_assert(
                     handle.flat_param.device == self.compute_device,
@@ -1301,38 +1214,6 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
             output = self._fsdp_wrapped_module(*args, **kwargs)
             return self._post_forward(self._handles, reshard_fn, unused, unused, output)
 
-    def _pre_forward(
-        self,
-        handles: List[FlatParamHandle],
-        unshard_fn: Optional[Callable],
-        module: nn.Module,
-        input: Any,
-    ):
-        """
-        Runs the pre-forward logic. This includes an opportunity to unshard
-        currently sharded parameters such as those for the current forward and
-        registering post-backward hooks for these current parameters.
-
-        Args:
-            handles (List[FlatParamHandle]): Handles giving the parameters
-                used in the current forward.
-            unshard_fn (Optional[Callable]): A callable to unshard any
-                currently sharded parameters or ``None`` to not do any
-                unsharding.
-            module (nn.Module): Unused; expected by the hook signature.
-            input (Any): Unused; expected by the hook signature.
-        """
-        self.training_state = TrainingState.FORWARD_BACKWARD
-        self._exec_order_data.record_pre_forward(handles, self.training)
-        for handle in handles:
-            handle._training_state = HandleTrainingState.FORWARD
-        if unshard_fn is not None:
-            unshard_fn()
-        # Register post-backward hooks to reshard the parameters and
-        # reduce-scatter their gradients. They must be re-registered every
-        # forward pass in case the `grad_fn` is mutated.
-        self._register_post_backward_hooks(handles)
-
     def _pre_forward_unshard(
         self,
         handles: List[FlatParamHandle],
@@ -1345,7 +1226,7 @@ def _pre_forward_unshard(
             handles_key = tuple(handles)
             self._needs_pre_forward_unshard[handles_key] = False
             torch.cuda.current_stream().wait_stream(self._streams["unshard"])
-            self._prefetch_handles(handles_key)
+            _prefetch_handles(self, handles_key)
 
     def _post_forward(
         self,
@@ -1891,7 +1772,7 @@ def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None:
                 # Set this to `False` to ensure that a mistargeted prefetch
                 # does not actually unshard these handles
                 self._needs_pre_backward_unshard[_handles_key] = False
-                self._prefetch_handles(_handles_key)
+                _prefetch_handles(self, _handles_key)
                 for handle in _handles:
                     handle.prepare_gradient_for_backward()
                 self._ran_pre_backward_hook[_handles_key] = True
@@ -1904,261 +1785,6 @@ def _register_hook(t: torch.Tensor) -> torch.Tensor:
 
         return _apply_to_tensors(_register_hook, outputs)
 
-    def _register_post_backward_hooks(
-        self,
-        handles: List[FlatParamHandle],
-    ) -> None:
-        """
-        Registers post-backward hooks on the ``FlatParameter`` s'
-        ``AccumulateGrad`` objects to reshard and to reduce-scatter gradients.
-
-        The ``AccumulateGrad`` object represents the last function that
-        finalizes the ``FlatParameter`` 's gradient, so it only runs after its
-        entire gradient computation has finished.
-
-        We register the post-backward hook only once in the *first* forward
-        that a ``FlatParameter`` participates in. This relies on the
-        ``AccumulateGrad`` object being preserved through multiple forwards.
-        """
-        # If there is no gradient computation, then there is no need for
-        # post-backward logic
-        if not torch.is_grad_enabled():
-            return
-        for handle in handles:
-            flat_param = handle.flat_param
-            already_registered = hasattr(flat_param, "_post_backward_hook_state")
-            if already_registered or not flat_param.requires_grad:
-                continue
-            # Get the `AccumulateGrad` object
-            temp_flat_param = flat_param.expand_as(flat_param)
-            p_assert(
-                temp_flat_param.grad_fn is not None,
-                "The `grad_fn` is needed to access the `AccumulateGrad` and "
-                "register the post-backward hook",
-            )
-            acc_grad = temp_flat_param.grad_fn.next_functions[0][0]
-            hook_handle = acc_grad.register_hook(
-                functools.partial(self._post_backward_hook, handle)
-            )
-            flat_param._post_backward_hook_state = (acc_grad, hook_handle)  # type: ignore[attr-defined]
-
-    @torch.no_grad()
-    def _post_backward_hook(
-        self,
-        handle: FlatParamHandle,
-        *unused: Any,
-    ) -> None:
-        """
-        Reduce-scatters the gradient of ``handle`` 's ``FlatParameter``.
-
-        Precondition: The ``FlatParameter`` 's ``.grad`` attribute contains the
-        unsharded gradient for the local batch.
-
-        Postcondition:
-        - If using ``NO_SHARD``, then the ``.grad`` attribute is the reduced
-        unsharded gradient.
-        - Otherwise, the ``_saved_grad_shard`` attribute is the reduced sharded
-        gradient (accumulating with any existing gradient).
-        """
-        param = handle.flat_param
-        param._post_backward_called = True
-        with torch.autograd.profiler.record_function(
-            "FullyShardedDataParallel._post_backward_hook"
-        ):
-            self._assert_state([TrainingState.FORWARD_BACKWARD])
-            self.training_state = TrainingState.FORWARD_BACKWARD
-            p_assert(
-                handle._training_state == HandleTrainingState.BACKWARD_PRE,
-                f"Expects `BACKWARD_PRE` state but got {handle._training_state}",
-            )
-            handle._training_state = HandleTrainingState.BACKWARD_POST
-
-            if (
-                self._use_param_exec_order_policy()
-                and self._param_exec_order_prep_stage
-            ):
-                # In self._fsdp_params_exec_order, the parameters are ordered based on
-                # the execution order in the backward pass in the first iteration.
-                self._fsdp_params_exec_order.append(param)
-
-            if param.grad is None:
-                return
-            if param.grad.requires_grad:
-                raise RuntimeError(
-                    "FSDP only works with gradients that don't require gradients"
-                )
-
-            free_unsharded_flat_param = self._should_free_unsharded_flat_param(handle)
-            _reshard(self, [handle], [free_unsharded_flat_param])
-
-            # TODO (awgu): Post-backward prefetching does not support the
-            # multiple handles per module case (which was why we keyed by
-            # *tuple*). The post-backward hook runs per handle, not per group
-            # of handles. To generalize this, we may need a 2-level mapping,
-            # where we map each individual handle to its groups of handles and
-            # then from the groups of handles to their indices in the order.
-            handles_key = (handle,)
-            self._prefetch_handles(handles_key)
-
-            if not self._sync_gradients:
-                return
-
-            # Wait for all ops in the current stream (e.g. gradient
-            # computation) to finish before reduce-scattering the gradient
-            self._streams["post_backward"].wait_stream(torch.cuda.current_stream())
-
-            with torch.cuda.stream(self._streams["post_backward"]):
-                orig_grad_data = param.grad.data
-                if (
-                    self._mixed_precision_enabled_for_reduce()
-                    and not self._low_precision_hook_enabled()
-                ):
-                    # Cast gradient to precision in which it should be communicated.
-                    # If a low precision hook is registered and reduce_dtype is specified
-                    # in `MixedPrecision`, communication hook will take care of
-                    # casting to lower precision and back.
-                    # TODO: Make this a communication hook when communication hooks
-                    # are implemented for FSDP. Note that this is a noop if the
-                    # reduce_dtype matches the param dtype.
-                    param.grad.data = param.grad.data.to(
-                        self.mixed_precision.reduce_dtype
-                    )
-
-                if self._exec_order_data.is_first_iter:
-                    # For all sharding strategies communication is performed through `_communication_hook`:
-                    # default comm hooks are: `reduce_scatter` for sharded strategies and
-                    # `all_reduce` for non-sharded strategies. This checks asserts that `_communication_hook`
-                    # and `_communication_hook_state`, required for communication not `None`.`
-                    p_assert(
-                        self._communication_hook is not None,
-                        "Communication hook should not be None",
-                    )
-                    p_assert(
-                        self._communication_hook_state is not None,
-                        "Communication hook state should not be None",
-                    )
-                grad = param.grad.data
-                if handle.uses_sharded_strategy:
-                    # We clear `param.grad` to permit repeated gradient
-                    # computations when this FSDP module is called multiple times.
-                    # This is to avoid a race among multiple re-entrant backward
-                    # passes. For example, the second backward pass computation
-                    # precedes ahead of the first backward pass reduction, which is
-                    # possible since the reduction is in a different stream and is
-                    # async. Then, the first backward pass may be incorrectly
-                    # reducing the second backward pass's `param.grad`.
-                    # The reduced gradients are accumulated in
-                    # `param._saved_grad_shard`, and the gradient reductions can
-                    # happen in arbitrary order, though we tolerate this due to the
-                    # (approximate) commutativity of floating-point addition.
-                    param.grad = None
-                    grad_flatten = torch.flatten(grad)
-                    chunks = list(grad_flatten.chunk(self.world_size))
-                    num_pad = self.world_size * chunks[0].numel() - grad.numel()
-                    input_flattened = F.pad(grad_flatten, [0, num_pad])
-                    output = torch.zeros_like(chunks[0])
-                    self._communication_hook(
-                        self._communication_hook_state, input_flattened, output
-                    )
-
-                    self._cast_grad_to_param_dtype(output, param)
-
-                    # To support gradient accumulation outside `no_sync()`, we save
-                    # the gradient data to `param._saved_grad_shard` before the
-                    # backward pass, accumulate gradients into it here, and set
-                    # `param.grad` with the accumulated value at the end of the
-                    # backward pass in preparation for the optimizer step.
-                    accumulate_grad = hasattr(param, "_saved_grad_shard")
-                    if accumulate_grad:
-                        p_assert(
-                            param._saved_grad_shard.shape == output.shape,  # type: ignore[attr-defined]
-                            "Shape mismatch when accumulating gradients: "  # type: ignore[attr-defined]
-                            f"existing grad shape={param._saved_grad_shard.shape} "
-                            f"new grad shape={output.shape}",  # type: ignore[attr-defined]
-                        )
-                        p_assert(
-                            param._saved_grad_shard.device == output.device,  # type: ignore[attr-defined]
-                            "Device mismatch when accumulating gradients: "  # type: ignore[attr-defined]
-                            f"existing grad device={param._saved_grad_shard.device} "
-                            f"new grad device={output.device}",  # type: ignore[attr-defined]
-                        )
-                        param._saved_grad_shard += output  # type: ignore[attr-defined]
-                    else:
-                        param._saved_grad_shard = output  # type: ignore[attr-defined]
-                    grad = param._saved_grad_shard  # type: ignore[attr-defined]
-                else:
-                    if self.sharding_strategy == ShardingStrategy.NO_SHARD:
-                        self._communication_hook(
-                            self._communication_hook_state, param.grad
-                        )
-
-                    # For NO_SHARD keeping grads in the reduced precision, we
-                    # can simply omit the cast as needed, we can't do this for
-                    # other sharding strategies because grad field is assigned
-                    # in _finalize_params. TODO (rvarm1) this divergence in
-                    # logic is not ideal.
-                    if not self._mixed_precision_keep_low_precision_grads():
-                        self._cast_grad_to_param_dtype(param.grad, param)
-
-                # Regardless of sharding or not, offload the grad to CPU if we are
-                # offloading params. This is so param and grad reside on same device
-                # which is needed for the optimizer step.
-                if handle._config.offload_params:
-                    # We specify non_blocking=True
-                    # and ensure the appropriate synchronization is done by waiting
-                    # streams in _wait_for_post_backward.
-                    param._cpu_grad.copy_(  # type: ignore[attr-defined]
-                        grad.detach(), non_blocking=True
-                    )
-                    # Don't let this memory get reused until after the transfer.
-                    grad.data.record_stream(torch.cuda.current_stream())
-
-                # After _post_backward_hook returns, orig_grad_data will eventually
-                # go out of scope, at which point it could otherwise be freed for
-                # further reuse by the main stream while the div/reduce_scatter/copy
-                # are underway in the post_backward stream. See:
-                # github.com/NVIDIA/apex/blob/master/apex/parallel/distributed.py
-                orig_grad_data.record_stream(self._streams["post_backward"])
-
-                if handle._use_orig_params:
-                    # Since the handle's `FlatParameter` completed its gradient
-                    # computation, we should reset the gradient noneness mask
-                    handle._reset_is_grad_none()
-                    # Delay using sharded gradient views until after the
-                    # reduce-scatter instead of immediately after resharding
-                    handle._use_sharded_grad_views()
-
-    def _cast_grad_to_param_dtype(
-        self,
-        grad: torch.Tensor,
-        param: FlatParameter,
-    ):
-        """
-        Casts gradient ``grad`` back to the full parameter dtype so that the
-        optimizer step runs with that dtype. This performs an actual cast if
-        1. parameters were in reduced precision during the forward since then
-        gradients would be in that reduced precision, or
-        2. parameters were not in reduced precision but gradients were in
-        reduced precision for communication.
-        However, if a low precision communication hook is registered, then this
-        dtype cast happens in the hook instead.
-        """
-        self._assert_state(TrainingState.FORWARD_BACKWARD)
-        if not self._low_precision_hook_enabled() and (
-            self._mixed_precision_enabled_for_params()
-            or self._mixed_precision_enabled_for_reduce()
-        ):
-            low_prec_grad_data = grad.data
-            grad.data = grad.data.to(dtype=param.dtype)
-            # Do not let the low precision gradient memory get reused until
-            # the cast to full parameter precision completes
-            low_prec_grad_data.record_stream(torch.cuda.current_stream())
-
-    def _should_free_unsharded_flat_param(self, handle: FlatParamHandle):
-        return (
-            self._sync_gradients and handle.uses_sharded_strategy
-        ) or handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD
-
     def _queue_wait_for_post_backward(self) -> None:
         """
         Queues a post-backward callback from the root FSDP instance, which
@@ -2220,7 +1846,7 @@ def _catch_all_reshard(fsdp_module: FullyShardedDataParallel) -> None:
                     if already_resharded:
                         continue
                     free_unsharded_flat_params.append(
-                        self._should_free_unsharded_flat_param(handle)
+                        _should_free_in_backward(fsdp_module, handle)
                     )
                     handles_to_reshard.append(handle)
                 _reshard(self, handles_to_reshard, free_unsharded_flat_params)

From d11690150c5d7323263b5e09d1e30cdf96d5f023 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 13:36:03 +0000
Subject: [PATCH 0404/1922] [FSDP()][15/N] Refactor `_init_streams()` (#87928)

This PR is easy. I think I move `_init_streams()` again in a later PR though :/
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87928
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_init_utils.py         | 21 +++++++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 18 ++--------------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 0ff70dc7771e6..cbbfff10fa5d0 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -333,6 +333,27 @@ def _init_param_handle_from_params(
         handle.flat_param_to(cpu_device)
 
 
+@no_type_check
+def _init_streams(
+    state: _State,
+) -> _State:
+    """
+    Initializes CUDA streams for overlapping communication, computation, and
+    data transfers. The streams should be shared across FSDP instances.
+    """
+    assert state._is_root
+    assert torch.cuda.is_available()
+    # Stream for unshard logic, including allocating the all-gather destination
+    # tensors and the all-gathers themselves.
+    state._streams["unshard"] = torch.cuda.Stream()
+    # Stream for overlapping gradient reduction with the backward pass gradient
+    # computation.
+    state._streams["post_backward"] = torch.cuda.Stream()
+    # Stream for pre-unshard logic, namely allocations and writes for CPU
+    # offloading (H2D copy) and mixed precision (low precision cast).
+    state._streams["pre_unshard"] = torch.cuda.Stream()
+
+
 def _get_ignored_modules(
     root_module: nn.Module,
     _ignored_modules: Optional[Iterable[torch.nn.Module]],
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 3d92a96f99dbf..4e4482e230d0e 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -50,6 +50,7 @@
     _init_process_group_state,
     _init_runtime_state,
     _init_state_dict_state,
+    _init_streams,
 )
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
@@ -764,7 +765,7 @@ def _lazy_init(self) -> None:
         # will set `_is_root=False` for the non-root instances
         self._is_root = True
         self._assert_state(TrainingState.IDLE)
-        self._init_streams()
+        _init_streams(self)
         self._cast_buffers(recurse=True)
         for handle in self._handles:
             self._init_param_attributes(handle)
@@ -908,21 +909,6 @@ def _init_param_attributes(self, handle: FlatParamHandle) -> None:
         # called for validation in `_wait_for_post_backward()`
         p._post_backward_called = False
 
-    def _init_streams(self) -> None:
-        """Initializes CUDA streams for overlapping data transfer and
-        computation. This should only be called on the root FSDP instance."""
-        assert self._is_root
-        assert torch.cuda.is_available()
-        # Stream for unshard logic, including allocating the all-gather
-        # destination tensors and the all-gathers themselves.
-        self._streams["unshard"] = torch.cuda.Stream()
-        # Stream for overlapping gradient reduction with the backward pass
-        # gradient computation.
-        self._streams["post_backward"] = torch.cuda.Stream()
-        # Stream for pre-unshard logic, namely allocations and writes for
-        # CPU offloading (H2D copy) and mixed precision (low precision cast).
-        self._streams["pre_unshard"] = torch.cuda.Stream()
-
     @staticmethod
     def set_state_dict_type(
         module: nn.Module,

From 5c9495b85fdf03fe0244095cdd87f9296c4ed9bb Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 13:36:04 +0000
Subject: [PATCH 0405/1922] [FSDP()][16/N] Refactor post-forward/pre-backward
 (#87929)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87929
Approved by: https://github.com/mrshenli
---
 test/distributed/fsdp/test_fsdp_core.py       |  21 ++-
 torch/distributed/fsdp/_common_utils.py       |   5 +
 torch/distributed/fsdp/_runtime_utils.py      | 131 ++++++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 130 +----------------
 4 files changed, 159 insertions(+), 128 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py
index b0d52527dd10b..988731206f1b9 100644
--- a/test/distributed/fsdp/test_fsdp_core.py
+++ b/test/distributed/fsdp/test_fsdp_core.py
@@ -365,14 +365,29 @@ def test_register_functions_called(self, cuda_first: bool, mixed_precision: bool
             fsdp_kwargs,
         )
         input = fsdp_model.module.get_input(torch.device("cuda"))
-        fsdp_model._register_pre_backward_hooks = mock.MagicMock(return_value=None)
+
+        # Since `_register_pre_backward_hooks()` modifies the forward output,
+        # we cannot directly mock it. We implement our own counter instead.
+        orig_register_pre_backward_hooks = (
+            torch.distributed.fsdp._runtime_utils._register_pre_backward_hooks
+        )
+        register_pre_backward_hooks_call_count = 0
+
+        def _register_pre_backward_hooks_with_count(*args, **kwargs):
+            nonlocal register_pre_backward_hooks_call_count
+            register_pre_backward_hooks_call_count += 1
+            return orig_register_pre_backward_hooks(*args, **kwargs)
+
         with mock.patch(
+            "torch.distributed.fsdp._runtime_utils._register_pre_backward_hooks",
+            _register_pre_backward_hooks_with_count,
+        ), mock.patch(
             "torch.distributed.fsdp._runtime_utils._register_post_backward_hooks"
         ) as register_post_bwd_mock:
-            self.assertFalse(fsdp_model._register_pre_backward_hooks.called)
+            self.assertEqual(register_pre_backward_hooks_call_count, 0)
             self.assertFalse(register_post_bwd_mock.called)
             fsdp_model(*input)
-            self.assertTrue(fsdp_model._register_pre_backward_hooks.called)
+            self.assertTrue(register_pre_backward_hooks_call_count > 0)
             self.assertTrue(register_post_bwd_mock.called)
 
 
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index 34f45fc776b2a..992df5606a70e 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -51,6 +51,11 @@ class HandleTrainingState(Enum):
     SUMMON_FULL_PARAMS = auto()
 
 
+def _is_composable(state: _State):
+    # TODO: This is a temporary hack for differentiate between code paths.
+    return not isinstance(state, nn.Module)
+
+
 def clean_tensor_name(tensor_name: str) -> str:
     """
     Cleans the parameter or buffer name by removing any module wrapper
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index c79dfe6482f7d..3d34dadf9e000 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -7,6 +7,7 @@
 from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS
 from torch.distributed.fsdp._common_utils import (
     _assert_in_training_states,
+    _is_composable,
     _State,
     TrainingState,
 )
@@ -139,6 +140,98 @@ def _pre_forward(
     _register_post_backward_hooks(state, handles)
 
 
+@no_type_check
+def _post_forward(
+    state: _State,
+    handles: List[FlatParamHandle],
+    reshard_fn: Callable,
+    module: nn.Module,
+    input: Any,
+    output: Any,
+) -> Any:
+    """
+    Runs the post-forward logic. This includes an opportunity to reshard
+    currently unsharded parameters such as those used in the current forward
+    and registering pre-backward hooks on the forward outputs.
+
+    Args:
+        handles (List[FlatParamHandle]): Handles giving the parameters used in
+            the current forward.
+        reshard_fn (Optional[Callable]): A callable to reshard any currently
+            unsharded parameters (e.g. from the current forward) or ``None`` to
+            not do any resharding.
+        module (nn.Module): Unused; expected by the hook signature.
+        input (Any): Unused; exepcted by the hook signature.
+        output (Any): Forward pass output; pre-backward hooks are registered on
+            the tensors that require gradients in this output.
+
+    Postcondition: Each ``FlatParameter`` 's data points to the sharded
+    flattened parameter.
+    """
+    state._exec_order_data.record_post_forward(handles)
+    if reshard_fn is not None:
+        reshard_fn()
+    # Register pre-backward hooks to unshard the flattened parameters
+    # for the gradient computation (if needed)
+    output = _register_pre_backward_hooks(state, output, handles)
+    state.training_state = TrainingState.IDLE
+    for handle in handles:
+        handle._training_state = HandleTrainingState.IDLE
+    return output
+
+
+@no_type_check
+def _pre_backward_hook(
+    state: _State,
+    _handles: List[FlatParamHandle],
+    *unused: Any,
+) -> Any:
+    """Prepares ``_handles`` 's ``FlatParameter`` s for gradient computation."""
+    _handles_key = tuple(_handles)  # avoid shadowing `handles_key`
+    # Only run the pre-backward hook once per group of handles involved in the
+    # same module forward computation
+    if _handles_key and state._ran_pre_backward_hook.get(_handles_key, False):
+        return
+
+    with torch.autograd.profiler.record_function(
+        "FullyShardedDataParallel._pre_backward_hook"
+    ):
+        # Queue the post-backward callback once for the root FSDP instance to
+        # attach it to the outermost backward graph task so that it is called
+        # after all backward calls complete
+        if state._is_root and not state._post_backward_callback_queued:
+            state._queue_wait_for_post_backward()
+            all_handles = (
+                state._fsdp_handles(state) if _is_composable(state) else state._handles
+            )
+            _clear_grads_if_needed(all_handles)
+        elif _handles_key:
+            _assert_in_training_states(state, [TrainingState.IDLE])
+        state.training_state = TrainingState.FORWARD_BACKWARD
+        # Queueing the post-backward callback is the only logic that is not
+        # per-handle in the pre-backward hook, so we can return early here if
+        # there are no handles.
+        if not _handles_key:
+            return
+        for handle in _handles:
+            handle._training_state = HandleTrainingState.BACKWARD_PRE
+
+        # If the handles have been prefetched, this `_unshard()` simply
+        # switches to using the unsharded parameter
+        _unshard(
+            state, _handles, state._streams["unshard"], state._streams["pre_unshard"]
+        )
+        torch.cuda.current_stream().wait_stream(state._streams["unshard"])
+
+        # Set this to `False` to ensure that a mistargeted prefetch does not
+        # actually unshard these handles
+        state._needs_pre_backward_unshard[_handles_key] = False
+        _prefetch_handles(state, _handles_key)
+        for handle in _handles:
+            handle.prepare_gradient_for_backward()
+        state._ran_pre_backward_hook[_handles_key] = True
+
+
 @no_type_check
 @torch.no_grad()
 def _post_backward_hook(
@@ -437,6 +530,44 @@ def _get_training_state(
     return next(iter(training_states))
 
 
+@no_type_check
+def _register_pre_backward_hooks(
+    state: _State,
+    outputs: Any,
+    handles: List[FlatParamHandle],
+) -> None:
+    """
+    Registers pre-backward hooks on the tensors that require gradients in the
+    forward pass outputs ``outputs``, which were computed using the
+    ``FlatParameter`` s of ``handles``.
+
+    Returns:
+        Forward pass outputs with pre-backward hooks registered to tensors that
+        require gradients.
+    """
+    # If there is no gradient computation, then there is no need for
+    # pre-backward logic
+    if not torch.is_grad_enabled():
+        return outputs
+    if state._is_root:
+        state._post_backward_callback_queued = False  # only defined on the root
+
+    handles_key = tuple(handles)
+    if handles_key:
+        # Since these handles' `FlatParameter`s participated in a forward, we
+        # conservatively assume that they will be used in the backward
+        state._needs_pre_backward_unshard[handles_key] = False
+        state._ran_pre_backward_hook[handles_key] = False
+
+    def _register_hook(t: torch.Tensor) -> torch.Tensor:
+        if t.requires_grad:
+            t.register_hook(functools.partial(_pre_backward_hook, state, handles))
+            state._needs_pre_backward_unshard[handles_key] = True
+        return t
+
+    return _apply_to_tensors(_register_hook, outputs)
+
+
 def _register_post_backward_hooks(
     state: _State,
     handles: List[FlatParamHandle],
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 4e4482e230d0e..437c36edc3968 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -54,6 +54,7 @@
 )
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
+    _post_forward,
     _pre_forward,
     _prefetch_handles,
     _prepare_forward_inputs,
@@ -89,7 +90,7 @@
     _post_state_dict_hook,
     _pre_load_state_dict_hook,
 )
-from ._utils import _apply_to_tensors, _free_storage, p_assert
+from ._utils import _free_storage, p_assert
 from .flat_param import FlatParameter, FlatParamHandle, HandleShardingStrategy
 from .wrap import ParamExecOrderWrapPolicy
 
@@ -1198,7 +1199,9 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
                     f"{self.compute_device} but got {handle.flat_param.device}",
                 )
             output = self._fsdp_wrapped_module(*args, **kwargs)
-            return self._post_forward(self._handles, reshard_fn, unused, unused, output)
+            return _post_forward(
+                self, self._handles, reshard_fn, unused, unused, output
+            )
 
     def _pre_forward_unshard(
         self,
@@ -1214,45 +1217,6 @@ def _pre_forward_unshard(
             torch.cuda.current_stream().wait_stream(self._streams["unshard"])
             _prefetch_handles(self, handles_key)
 
-    def _post_forward(
-        self,
-        handles: List[FlatParamHandle],
-        reshard_fn: Optional[Callable],
-        module: nn.Module,
-        input: Any,
-        output: Any,
-    ) -> Any:
-        """
-        Runs the post-forward logic. This includes an opportunity to reshard
-        currently unsharded parameters such as those used in the current
-        forward and registering pre-backward hooks on the forward outputs.
-
-        Args:
-            handles (List[FlatParamHandle]): Handles giving the parameters
-                used in the current forward.
-            reshard_fn (Optional[Callable]): A callable to reshard any
-                currently unsharded parameters (e.g. from the current forward)
-                or ``None`` to not do any resharding.
-            module (nn.Module): Unused; expected by the hook signature.
-            input (Any): Unused; exepcted by the hook signature.
-            output (Any): Forward pass output; pre-backward hooks are
-                registered on the tensors that require gradients in this
-                output.
-
-        Postcondition: Each ``FlatParameter`` 's data points to the sharded
-        flattened parameter.
-        """
-        self._exec_order_data.record_post_forward(handles)
-        if reshard_fn is not None:
-            reshard_fn()
-        # Register pre-backward hooks to unshard the flattened parameters
-        # for the gradient computation (if needed)
-        output = self._register_pre_backward_hooks(output, handles)
-        self.training_state = TrainingState.IDLE
-        for handle in handles:
-            handle._training_state = HandleTrainingState.IDLE
-        return output
-
     def _fsdp_root_pre_forward(self, *args, **kwargs):
         """
         Runs pre-forward logic specific to the root FSDP instance, which should
@@ -1687,90 +1651,6 @@ def named_parameters(
                 param_name = param_name.replace(FSDP_PREFIX, "")
             yield (param_name, param)
 
-    def _register_pre_backward_hooks(
-        self,
-        outputs: Any,
-        handles: List[FlatParamHandle],
-    ) -> Any:
-        """
-        Registers pre-backward hooks on the tensors that require gradients in
-        the forward pass outputs ``outputs``, which were computed using the
-        ``FlatParameter`` s of ``handles``.
-
-        Returns:
-            Forward pass outputs with pre-backward hooks registered to tensors
-            that require gradients.
-        """
-        # If there is no gradient computation, then there is no need for
-        # pre-backward logic
-        if not torch.is_grad_enabled():
-            return outputs
-
-        if self._is_root:
-            self._post_backward_callback_queued = False  # only defined on the root
-
-        handles_key = tuple(handles)
-        if handles_key:
-            # Since these handles' `FlatParameter`s participated in a forward,
-            # we conservatively assume that they will be used in the backward
-            self._needs_pre_backward_unshard[handles_key] = False
-            self._ran_pre_backward_hook[handles_key] = False
-
-        def _pre_backward_hook(_handles: List[FlatParamHandle], *unused: Any) -> None:
-            """Prepares ``_handles`` 's ``FlatParameter`` s for gradient
-            computation."""
-            _handles_key = tuple(_handles)  # avoid shadowing `handles_key`
-            # Only run the pre-backward hook once per group of handles involved
-            # in the same module forward computation
-            if _handles_key and self._ran_pre_backward_hook.get(_handles_key, False):
-                return
-
-            with torch.autograd.profiler.record_function(
-                "FullyShardedDataParallel._pre_backward_hook"
-            ):
-                # Queue the post-backward callback once for the root FSDP
-                # instance to attach it to the outermost backward graph task so
-                # that it is called after all backward calls complete
-                if self._is_root and not self._post_backward_callback_queued:
-                    self._queue_wait_for_post_backward()
-                    _clear_grads_if_needed(self._fsdp_handles(self))
-                elif _handles_key:
-                    self._assert_state([TrainingState.IDLE])
-                self.training_state = TrainingState.FORWARD_BACKWARD
-                # Queueing the post-backward callback is the only logic that is
-                # not per-handle in the pre-backward hook, so we can return
-                # early here if there are no handles.
-                if not _handles_key:
-                    return
-                for handle in _handles:
-                    handle._training_state = HandleTrainingState.BACKWARD_PRE
-
-                # If the handles have been prefetched, this `_unshard()` simply
-                # switches to using the unsharded parameter
-                _unshard(
-                    self,
-                    _handles,
-                    self._streams["unshard"],
-                    self._streams["pre_unshard"],
-                )
-                torch.cuda.current_stream().wait_stream(self._streams["unshard"])
-
-                # Set this to `False` to ensure that a mistargeted prefetch
-                # does not actually unshard these handles
-                self._needs_pre_backward_unshard[_handles_key] = False
-                _prefetch_handles(self, _handles_key)
-                for handle in _handles:
-                    handle.prepare_gradient_for_backward()
-                self._ran_pre_backward_hook[_handles_key] = True
-
-        def _register_hook(t: torch.Tensor) -> torch.Tensor:
-            if t.requires_grad:
-                t.register_hook(functools.partial(_pre_backward_hook, handles))
-                self._needs_pre_backward_unshard[handles_key] = True
-            return t
-
-        return _apply_to_tensors(_register_hook, outputs)
-
     def _queue_wait_for_post_backward(self) -> None:
         """
         Queues a post-backward callback from the root FSDP instance, which

From 395f40ffdeff9696d72ad4fee4735d97751ea4c3 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Mon, 31 Oct 2022 16:33:11 -0700
Subject: [PATCH 0406/1922] [10/N] Update barrier with CPU/CUDA implementations
 (#86368)

### Changes
- Updates for the barrier collective
- NOTE: current change will not achieve dispatching of barrier since there is no tensor to read from

### Context
https://github.com/pytorch/pytorch/issues/86225

cc @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @kwen2501 @awgu
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86368
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_common.py    |  5 ++++-
 torch/csrc/distributed/c10d/OpsImpl.cpp | 25 +++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index a4ee561ed961b..d772c8f2694cf 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1464,7 +1464,9 @@ def _call_collective_with_varying_tensors(self, backend, collective, *args):
         # ensure supported devices (cpu, cuda) succeeds during dispatch call
         tensor = torch.zeros(2, 2, device=torch.device(device))
         # multi tensor collectives
-        if collective == dist.all_gather:
+        if collective == dist.barrier:
+            collective()
+        elif collective == dist.all_gather:
             collective([tensor], tensor, *args)
         elif collective == dist.reduce_scatter:
             if backend != "gloo":
@@ -1488,6 +1490,7 @@ def _test_collectives(self, backend):
             (dist.all_reduce,),
             (dist.all_gather,),
             (dist.reduce_scatter,),
+            (dist.barrier,),
         ]
         for collective, *args in collectives_and_args:
             with self.subTest(collective=collective, args=args):
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index b4446b2aa8cdd..90ea5e16aa5f2 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -219,6 +219,22 @@ reduce_scatter_cuda_(
       output_tensors, work);
 }
 
+c10::intrusive_ptr<Work> barrier_cpu(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const std::vector<int64_t>& device_ids,
+    int64_t timeout) {
+  return process_group->barrier(
+      BarrierOptions{device_ids, std::chrono::milliseconds(timeout)});
+}
+
+c10::intrusive_ptr<Work> barrier_cuda(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const std::vector<int64_t>& device_ids,
+    int64_t timeout) {
+  return process_group->barrier(
+      BarrierOptions{device_ids, std::chrono::milliseconds(timeout)});
+}
+
 // register functions to dispatcher
 namespace {
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
@@ -286,6 +302,15 @@ TORCH_LIBRARY_IMPL(c10d, CPU, m) {
 TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("reduce_scatter_", reduce_scatter_cuda_);
 }
+
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("barrier", barrier_cpu);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("barrier", barrier_cuda);
+}
+
 } // namespace
 
 } // namespace ops

From b999cbdb35f98f22c1fc9ecb304db49be752bc35 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Tue, 1 Nov 2022 17:46:52 +0000
Subject: [PATCH 0407/1922] Rename 'nvfuser' to 'ts_nvfuser' indicating
 TorchScript usage (#88188)

cc @kevinstephano @jjsjann123 @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88188
Approved by: https://github.com/soumith, https://github.com/jansel
---
 torch/_dynamo/optimizations/backends.py |  4 ++--
 torch/_dynamo/optimizations/training.py | 19 +++++++++----------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 660e7a5ca567b..ea06d958005e4 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -100,13 +100,13 @@ def nnc_ofi(subgraph):
 
 
 @create_backend
-def nvfuser(subgraph):
+def ts_nvfuser(subgraph):
     with torch.jit.fuser("fuser2"):
         return reload_jit_model(subgraph)
 
 
 @create_backend
-def nvfuser_ofi(subgraph):
+def ts_nvfuser_ofi(subgraph):
     with torch.jit.fuser("fuser2"):
         return reload_jit_model_ofi(subgraph)
 
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 84d8b7b21e3e5..3235c7486b2e9 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -199,7 +199,7 @@ def mem_efficient_fusion_kwargs(use_decomps):
 
 
 class AotMemEfficientFusion(AotAutogradStrategy):
-    """Use Min cut rematerilization and NVFuser with AOT Autograd"""
+    """Use Min cut rematerilization and TorchScript+nvFuser with AOT Autograd"""
 
     def candidate(self):
         kwargs = mem_efficient_fusion_kwargs(use_decomps=True)
@@ -207,7 +207,7 @@ def candidate(self):
 
 
 class AotMemEfficientFusionNoDecomps(AotAutogradStrategy):
-    """Use Min cut rematerilization and NVFuser with AOT Autograd"""
+    """Use Min cut rematerilization and TorchScript+nvFuser with AOT Autograd"""
 
     def candidate(self):
         kwargs = mem_efficient_fusion_kwargs(use_decomps=False)
@@ -243,7 +243,7 @@ def candidate(self):
 
 
 class AOTMemEfficientFusionWithContext:
-    """Pass nvfuser context to TorchDynamo"""
+    """Pass TorchScript+nvFuser context to TorchDynamo"""
 
     def __init__(self, use_decomps=True):
         self.backend_ctx_ctor = lambda: torch.jit.fuser("fuser2")
@@ -470,17 +470,16 @@ def create_aot_backends():
     # This is useful for debugging. Can be removed later.
     BACKENDS["nvprims_aten"] = aot_nvprims_aten
 
-    # aot_nvfuser uses the memory efficient fusion algorithm from AOT Autograd.
-    # It uses min cut rematerialization algorithm, and uses nvfuser as the
-    # compiler backend. This is the most optimized setting with nvfuser for
-    # training.
-    BACKENDS["aot_nvfuser"] = aot_mem_efficient_fusion
+    # aot_ts_nvfuser uses the memory efficient fusion algorithm from AOT Autograd.
+    # It uses min cut rematerialization algorithm, uses nvFuser as the
+    # compiler backend, and TorchScript as the frontend.
+    BACKENDS["aot_ts_nvfuser"] = aot_mem_efficient_fusion
 
-    # Similar to aot_nvfuser, but disables the decompositions. Decompositions
+    # Similar to aot_ts_nvfuser, but disables the decompositions. Decompositions
     # can cause accuracy deviations. This setting allows us to compare accuracy
     # without worrying about the impact of decomposisitons. More details at
     # https://github.com/pytorch/torchdynamo/issues/611
-    BACKENDS["aot_nvfuser_nodecomps"] = aot_mem_efficient_fusion_no_decomp
+    BACKENDS["aot_ts_nvfuser_nodecomps"] = aot_mem_efficient_fusion_no_decomp
 
     # aot_cudagraphs only applies CUDA graphs to the graph.  It is also helpful
     # for debugging and can serve as a perf baseline.

From 19a17c112da3861a6b43adaebfe8357db027842e Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Mon, 31 Oct 2022 16:33:12 -0700
Subject: [PATCH 0408/1922] [11/N] Update all_to_all with CPU/CUDA
 implementations (#86407)

* #83916 [7/N] [Dispatchable Collectives] Update reduce with CPU / CUDA implementations
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86407
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_common.py    | 10 +++++---
 torch/csrc/distributed/c10d/OpsImpl.cpp | 34 +++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index d772c8f2694cf..02ec0247f9754 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1468,10 +1468,13 @@ def _call_collective_with_varying_tensors(self, backend, collective, *args):
             collective()
         elif collective == dist.all_gather:
             collective([tensor], tensor, *args)
-        elif collective == dist.reduce_scatter:
+        elif collective == dist.reduce_scatter or collective == dist.all_to_all:
+            # gloo does not support reduce_scatter or all_to_all
             if backend != "gloo":
-                # gloo does not support reduce_scatter
-                collective(tensor, [tensor], *args)
+                if collective == dist.reduce_scatter:
+                    collective(tensor, [tensor], *args)
+                else:
+                    collective([tensor], [tensor], *args)
         else:
             collective(tensor, *args)
 
@@ -1491,6 +1494,7 @@ def _test_collectives(self, backend):
             (dist.all_gather,),
             (dist.reduce_scatter,),
             (dist.barrier,),
+            (dist.all_to_all,),
         ]
         for collective, *args in collectives_and_args:
             with self.subTest(collective=collective, args=args):
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index 90ea5e16aa5f2..ec66042235e95 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -219,6 +219,32 @@ reduce_scatter_cuda_(
       output_tensors, work);
 }
 
+c10::intrusive_ptr<Work> alltoall_cpu_(
+    at::TensorList output_tensors,
+    at::TensorList input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t timeout) {
+  auto output_tensors_vec = output_tensors.vec();
+  auto input_tensors_vec = input_tensors.vec();
+  return process_group->alltoall(
+      output_tensors_vec,
+      input_tensors_vec,
+      AllToAllOptions{std::chrono::milliseconds(timeout)});
+}
+
+c10::intrusive_ptr<Work> alltoall_cuda_(
+    at::TensorList output_tensors,
+    at::TensorList input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t timeout) {
+  auto output_tensors_vec = output_tensors.vec();
+  auto input_tensors_vec = input_tensors.vec();
+  return process_group->alltoall(
+      output_tensors_vec,
+      input_tensors_vec,
+      AllToAllOptions{std::chrono::milliseconds(timeout)});
+}
+
 c10::intrusive_ptr<Work> barrier_cpu(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<int64_t>& device_ids,
@@ -303,6 +329,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("reduce_scatter_", reduce_scatter_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("alltoall_", alltoall_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("alltoall_", alltoall_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("barrier", barrier_cpu);
 }

From 8e9a8fdd83ef8c04b0637a61d5c05f72519941f1 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 1 Nov 2022 17:59:35 +0000
Subject: [PATCH 0409/1922] Add UBSAN to ASAN (#88055)

Add undefined behavior sanitizer to `USE_ASAN` option.
Added `torch._C._crash_if_vptr_ubsan()` that only fails if vptr belongs to a wrong class after typecast
Deleted all ubsan supressions, but disabled `ProtoTest::Basic` as it fails above-mentioned vptr check.

Fixes https://github.com/pytorch/pytorch/issues/88042
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88055
Approved by: https://github.com/ezyang
---
 .jenkins/pytorch/test.sh   |  6 +++---
 CMakeLists.txt             |  6 +++---
 test/cpp/jit/test_misc.cpp | 10 ++++++++++
 torch/csrc/Module.cpp      | 20 ++++++++++++++++++++
 ubsan.supp                 |  2 --
 5 files changed, 36 insertions(+), 8 deletions(-)
 delete mode 100644 ubsan.supp

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 5154f5a3f3b38..2c77c1e516bb5 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -135,9 +135,8 @@ fi
 # if you're not careful.  Check this if you made some changes and the
 # ASAN test is not working
 if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
-    # Suppress vptr violations arising from multiple copies of pybind11
     export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_stack_use_after_return=1:strict_init_order=true:detect_odr_violation=0
-    export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp
+    export UBSAN_OPTIONS=print_stacktrace=1
     export PYTORCH_TEST_WITH_ASAN=1
     export PYTORCH_TEST_WITH_UBSAN=1
     # TODO: Figure out how to avoid hard-coding these paths
@@ -180,9 +179,10 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
     ulimit -s 81920
 
     (cd test && python -c "import torch; print(torch.__version__, torch.version.git_version)")
-    echo "The next three invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured"
+    echo "The next four invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured"
     (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_asan(3)")
     (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_csrc_ubsan(0)")
+    (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_vptr_ubsan()")
     (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_aten_asan(3)")
 fi
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index baf97a455863d..b1e0f517fafa4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -184,7 +184,7 @@ cmake_dependent_option(
     "BUILD_TEST" OFF)
 option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
 option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
-option(USE_ASAN "Use Address Sanitizer" OFF)
+option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
 option(USE_TSAN "Use Thread Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
 cmake_dependent_option(
@@ -928,8 +928,8 @@ if(NOT MSVC)
 endif()
 
 if(USE_ASAN)
-    string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fsanitize=address")
-    string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fsanitize=address")
+    string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fsanitize=address -fsanitize=undefined")
+    string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fsanitize=address -fsanitize=undefined")
 endif()
 
 if(USE_TSAN)
diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp
index 2aac6cacdffc6..3be0b8598b733 100644
--- a/test/cpp/jit/test_misc.cpp
+++ b/test/cpp/jit/test_misc.cpp
@@ -491,10 +491,20 @@ TEST(ControlFlowTest, Basic) {
   ASSERT_EQ(256, run_binary("while_test", 2, 0));
 }
 
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define HAS_ASANUBSAN 1
+#endif
+#endif
+
+#ifndef HAS_ASANUBSAN
+// This test fails vptr UBSAN checks
+
 TEST(ProtoTest, Basic) {
   ::ONNX_NAMESPACE::ModelProto proto;
   proto.set_producer_name("foo");
 }
+#endif
 
 // test a few features that are not directly used in schemas yet
 TEST(SchemaParserTest, NestedArrays) {
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 9c6f2ed4177a0..b17d8ae07de70 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -184,6 +184,25 @@ static PyObject* THPModule_crashIfCsrcUBSAN(PyObject* module, PyObject* arg) {
   return THPUtils_packInt32((int)y);
 }
 
+static PyObject* THPModule_crashIfvptrUBSAN(PyObject* module, PyObject* noarg) {
+  // This code shoud work perfectly fine, as vtables are idential for Foo and
+  // Baz unless rtti and ubsan are enabled
+  struct Foo {
+    virtual int bar() = 0;
+    virtual ~Foo() = default;
+  };
+  struct Baz {
+    virtual int bar() {
+      return 17;
+    }
+    virtual ~Baz() = default;
+  };
+  Baz x{};
+  auto y = static_cast<Foo*>(static_cast<void*>(&x));
+  auto rc = y->bar();
+  return THPUtils_packInt32(rc);
+}
+
 static PyObject* THPModule_crashIfATenASAN(PyObject* module, PyObject* arg) {
   THPUtils_assert(
       THPUtils_checkLong(arg),
@@ -933,6 +952,7 @@ static PyMethodDef TorchMethods[] = {
     {"_infer_size", THPModule_inferSize, METH_VARARGS, nullptr},
     {"_crash_if_csrc_asan", THPModule_crashIfCsrcASAN, METH_O, nullptr},
     {"_crash_if_csrc_ubsan", THPModule_crashIfCsrcUBSAN, METH_O, nullptr},
+    {"_crash_if_vptr_ubsan", THPModule_crashIfvptrUBSAN, METH_NOARGS, nullptr},
     {"_crash_if_aten_asan", THPModule_crashIfATenASAN, METH_O, nullptr},
     {"_show_config", THPModule_showConfig, METH_NOARGS, nullptr},
     {"_cxx_flags", THPModule_cxxFlags, METH_NOARGS, nullptr},
diff --git a/ubsan.supp b/ubsan.supp
deleted file mode 100644
index 395f5208c8437..0000000000000
--- a/ubsan.supp
+++ /dev/null
@@ -1,2 +0,0 @@
-vptr:libtorch_python.so
-vptr:test_jit

From 49d3792170db926be0787ead71b736b2f1abca55 Mon Sep 17 00:00:00 2001
From: jjsjann123 <alex.jann2012@gmail.com>
Date: Tue, 1 Nov 2022 18:07:17 +0000
Subject: [PATCH 0410/1922] remove assert on tensor inputs to FusionGroup
 (#88018)

Fixes #86530 #86227 #85872
All issues seem to be duplicate of each other.

Removes the false positive assert

Fixes come from @kevinstephano
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88018
Approved by: https://github.com/kevinstephano, https://github.com/soumith
---
 test/test_jit_cuda_fuser.py                 | 21 +++++++++++++++++++++
 torch/csrc/jit/codegen/cuda/graph_fuser.cpp |  4 ----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
index e51cd01cd4cda..c674ba0d57606 100644
--- a/test/test_jit_cuda_fuser.py
+++ b/test/test_jit_cuda_fuser.py
@@ -4971,6 +4971,27 @@ def t(x, y, w0, w1):
         t_jit = torch.jit.script(t)
         self._run_helper(t_jit, t, x0, x1, w0, w1, check_stride=True)
 
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_no_tensor_input(self):
+        device = "cuda"
+        x = torch.randn(512, device=device)
+
+        def t(x):
+            tensor0 = torch.tensor(3, dtype=torch.float32, device='cuda')
+            tensor1 = torch.tensor(3, dtype=torch.float32, device='cuda')
+            o = torch.div(x.numel(), tensor0)
+            o = torch.mul(o, tensor1)
+            return o
+
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, check_stride=True)
+
+        # Note that curently TS embeds constant tensor in the graph
+        # this triggers memory leak check in CI
+        torch.jit._state._python_cu.drop_all_functions()
+
 
 class TestEnableDisableCudaFuser(JitTestCase):
     def setUp(self):
diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
index 1b51c87075471..4135c7babeef8 100644
--- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
+++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
@@ -1554,10 +1554,6 @@ void guardFusionGroup(
       profiled_ivalue_indices.insert(index);
     }
   }
-  // we should assert on non-tensor inputs
-  TORCH_INTERNAL_ASSERT(
-      tensor_inputs_to_check.size(),
-      "CudaFusionGuard expects at least one tensor input");
 
   // insert the if block first;
   auto versioning_if =

From 9c799be0b053e8b1740e404dde3a3e354bad7257 Mon Sep 17 00:00:00 2001
From: Tugsbayasgalan Manlaibaatar <tmanlaibaatar@fb.com>
Date: Mon, 31 Oct 2022 10:58:36 -0700
Subject: [PATCH 0411/1922] Add meta implementation for aten.max.dim (#88005)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88005
Approved by: https://github.com/Chillee, https://github.com/bdhirsh
---
 test/test_proxy_tensor.py    |  1 -
 torch/_meta_registrations.py | 10 ++++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index a5de034d1a06f..9b6b745ee4c2c 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1243,7 +1243,6 @@ def f(a, b, c, d, e):
     xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decomposition
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decomposition
-    xfail('max', 'reduction_with_dim'),  # aten.max.dim - couldn't find symbolic meta function/decomposition
     xfail('median', ''),  # Could not run 'aten::median' with arguments from the 'Meta' backend. This could be becau...
     xfail('meshgrid', 'list_of_tensors'),  # Tensors of type TensorImpl do not have numel
     xfail('meshgrid', 'variadic_tensors'),  # Tensors of type TensorImpl do not have numel
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 712e15608b02f..2e8845c68e868 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -120,6 +120,16 @@ def meta_max(self):
     return self.new_empty(())
 
 
+@register_meta(aten.max.dim)
+def meta_max_dim(self, dim, keepdim=False):
+    dim = utils.reduction_dims(self.shape, (dim,))
+    output_shape = _compute_reduction_shape(self, dim, keepdim)
+    return (
+        self.new_empty(output_shape),
+        self.new_empty(output_shape, dtype=torch.long),
+    )
+
+
 @register_meta([aten.min.default])
 def meta_min(self):
     return self.new_empty(())

From 2a136fad02c4a141d4d9954cc395eeb4bd4c7eeb Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Fri, 28 Oct 2022 11:29:04 -0700
Subject: [PATCH 0412/1922] [reland][fx][subgraph_rewriter] Change match_filter
 to be a List in replace_pattern_with_filters (#87998)

Summary:
att, this is experimental api so not marking it as bc-breaking.
The match will be accepted only if all the filters in the list passes.
Changing the filter arg to be list also allows us to pass in empty list that means no filter, which makes user code cleaner.

Test Plan:
python test/test_fx.py -k test_replace_pattern_with_filters

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D40810943](https://our.internmc.facebook.com/intern/diff/D40810943)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87998
Approved by: https://github.com/SherlockNoMad
---
 test/fx/test_subgraph_rewriter.py |  6 +++---
 torch/fx/subgraph_rewriter.py     | 20 +++++++++++++-------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/test/fx/test_subgraph_rewriter.py b/test/fx/test_subgraph_rewriter.py
index ac3498458d600..ed6d50e44b4ac 100644
--- a/test/fx/test_subgraph_rewriter.py
+++ b/test/fx/test_subgraph_rewriter.py
@@ -773,7 +773,7 @@ def gemm_bias_mul_replacement_with_c(a, b, bias, c):
 
         self.assertEqual(repalcement_node_found, 2)
 
-    def test_replace_pattern_with_filter(self):
+    def test_replace_pattern_with_filters(self):
         class M(torch.nn.Module):
             def __init__(self):
                 super().__init__()
@@ -833,10 +833,10 @@ def num_repalcement_node_found(traced):
 
         # match with filter, should find 1 match
         traced = symbolic_trace(M())
-        matches = subgraph_rewriter.replace_pattern_with_filter(
+        matches = subgraph_rewriter.replace_pattern_with_filters(
             traced,
             BinaryOpScalarReLUPattern,
             BinaryOpScalarReLUReplacement,
-            second_input_is_scalar)
+            [second_input_is_scalar])
         self.assertEqual(len(matches), 1)
         self.assertEqual(num_repalcement_node_found(traced), 1)
diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py
index 09e5550c5930d..72bb7fd373516 100644
--- a/torch/fx/subgraph_rewriter.py
+++ b/torch/fx/subgraph_rewriter.py
@@ -8,7 +8,7 @@
 from typing import Callable, Dict, List, NamedTuple, Optional, Set
 import torch
 
-__all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filter']
+__all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filters']
 
 @compatibility(is_backward_compatible=True)
 class Match(NamedTuple):
@@ -185,11 +185,11 @@ def forward(self, x, w1, w2):
 
 # Experimental API, not backward compatible
 @compatibility(is_backward_compatible=False)
-def replace_pattern_with_filter(
+def replace_pattern_with_filters(
     gm: GraphModule,
     pattern: Callable,
     replacement: Callable,
-    match_filter: Callable[["InternalMatch", Graph, Graph], bool],  # type: ignore[name-defined]
+    match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]],  # type: ignore[name-defined]
 ) -> List[Match]:
     """
     See replace_pattern for documentation. This function is an overload with an additional match_filter argument.
@@ -200,18 +200,21 @@ def replace_pattern_with_filter(
             definition of InternalMatch.
     """
 
-    return _replace_pattern(gm, pattern, replacement, match_filter)
+    return _replace_pattern(gm, pattern, replacement, match_filters)
 
 
 def _replace_pattern(
     gm: GraphModule,
     pattern: Callable,
     replacement: Callable,
-    match_filter: Optional[Callable[["InternalMatch", Graph, Graph], bool]] = None  # type: ignore[name-defined]
+    match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]] = None  # type: ignore[name-defined]
 ) -> List[Match]:
 
     from torch.fx.passes.utils.matcher_utils import SubgraphMatcher, InternalMatch
 
+    if match_filters is None:
+        match_filters = []
+
     # Get the graphs for `gm`, `pattern`, `replacement`
     original_graph: Graph = gm.graph
     pattern_graph: Graph = symbolic_trace(pattern).graph
@@ -222,8 +225,11 @@ def _replace_pattern(
     _matches: List[InternalMatch] = matcher.match(original_graph)
 
     # Filter out matches that don't match the filter
-    if match_filter:
-        _matches = [m for m in _matches if match_filter(m, original_graph, pattern_graph)]
+    _matches = [
+        m for m in _matches
+        if all(match_filter(m, original_graph, pattern_graph)
+               for match_filter in match_filters)
+    ]
 
     replacement_placeholders = [n for n in replacement_graph.nodes if n.op == "placeholder"]
 

From 2f5aa42e55b77d15ee9ab412a26ce8488158f89c Mon Sep 17 00:00:00 2001
From: Kevin Stephano <kevin.stephano@gmail.com>
Date: Tue, 1 Nov 2022 19:02:40 +0000
Subject: [PATCH 0413/1922] Fix nvFuser Fusion Definition printing of Squeeze
 and Permute (#88041)

NM

cc @jjsjann123
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88041
Approved by: https://github.com/IvanYashchuk, https://github.com/jjsjann123, https://github.com/mruberry
---
 torch/_prims/nvfuser_prims.py                 |  4 +-
 .../cuda/python_frontend/fusion_record.h      | 44 +++++++++++++------
 .../cuda/python_frontend/python_bindings.cpp  |  8 ++--
 3 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/torch/_prims/nvfuser_prims.py b/torch/_prims/nvfuser_prims.py
index 956f480de8905..391a7feee91b3 100644
--- a/torch/_prims/nvfuser_prims.py
+++ b/torch/_prims/nvfuser_prims.py
@@ -251,8 +251,8 @@ def _convert_element_type_nvfuser(fd: Any, a: TensorLikeType, dtype: torch.dtype
     return fd.ops.cast(a, nvfuser_dtype)  # type: ignore[attr-defined]
 
 
-def _transpose_nvfuser(fd, a, permutation):
-    return fd.ops.permute(a, permutation)  # type: ignore[attr-defined]
+def _transpose_nvfuser(fd, a, dims):
+    return fd.ops.permute(a, dims)  # type: ignore[attr-defined]
 
 
 def _squeeze_nvfuser(fd, a, a_shape, dimensions):
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
index 622ec6919c89f..674284e83ad07 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
@@ -377,13 +377,13 @@ struct PermuteOpRecord : RecordFunctor {
   PermuteOpRecord(
       std::vector<State> _args,
       std::vector<State> _outputs,
-      std::vector<int64_t>& permutation)
+      std::vector<int64_t>& dims)
       : RecordFunctor(
             std::move(_args),
             std::move(_outputs),
-            "permute",
+            "ops.permute",
             RecordType::PermuteOp),
-        permutation_(std::move(permutation)) {}
+        dims_(std::move(dims)) {}
   virtual ~PermuteOpRecord() = default;
   virtual RecordFunctor* clone() final {
     return new PermuteOpRecord(*this);
@@ -391,11 +391,11 @@ struct PermuteOpRecord : RecordFunctor {
 
   virtual size_t hash() const final {
     auto result = RecordFunctor::hash();
-    size_t permutation_hash = 0;
-    for (auto p : permutation_) {
-      permutation_hash ^= static_cast<size_t>(p);
+    size_t dims_hash = 0;
+    for (auto dim : dims_) {
+      dims_hash ^= static_cast<size_t>(dim);
     }
-    return result | (permutation_hash & 0xffff);
+    return result | (dims_hash & 0xffff);
   }
 
   virtual bool operator==(const RecordFunctor& other) const final {
@@ -403,10 +403,10 @@ struct PermuteOpRecord : RecordFunctor {
     if (auto child_ptr = dynamic_cast<const PermuteOpRecord*>(&other)) {
       result = RecordFunctor::operator==(other);
       if (result) {
-        result = (permutation_.size() == child_ptr->permutation_.size());
+        result = (dims_.size() == child_ptr->dims_.size());
         if (result) {
-          for (size_t i = 0; i < permutation_.size(); ++i) {
-            if (permutation_[i] != child_ptr->permutation_[i]) {
+          for (size_t i = 0; i < dims_.size(); ++i) {
+            if (dims_[i] != child_ptr->dims_[i]) {
               result = false;
               break;
             }
@@ -420,13 +420,31 @@ struct PermuteOpRecord : RecordFunctor {
   void operator()(FusionDefinition& fd) final {
     auto arg =
         fd.getFusionState(args_.at(0).index)->template as<Nvf::TensorView>();
-    auto output = torch::jit::fuser::cuda::permute(arg, permutation_);
+    auto output = Nvf::permute(arg, dims_);
     fd.setFusionState(outputs_.at(0).index, output);
   }
 
+  virtual void print(std::ostream& os, bool close_function = true) const {
+    RecordFunctor::print(os, false);
+    os << ", dims=[";
+    bool first_arg = true;
+    for (auto dim : dims_) {
+      if (first_arg) {
+        first_arg = false;
+      } else {
+        os << ", ";
+      }
+      os << dim;
+    }
+    os << "]";
+    if (close_function) {
+      os << ")";
+    }
+  }
+
  private:
   //! Represents the mapping from the original shape to the new shape
-  std::vector<int64_t> permutation_;
+  std::vector<int64_t> dims_;
 };
 
 struct SqueezeOpRecord : RecordFunctor {
@@ -438,7 +456,7 @@ struct SqueezeOpRecord : RecordFunctor {
       : RecordFunctor(
             std::move(_args),
             std::move(_outputs),
-            "squeeze",
+            "ops.squeeze",
             RecordType::SqueezeOp),
         original_shape_(std::move(original_shape)),
         dim_(dim) {}
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
index f8e3b5e5e9218..b633732f8926d 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
@@ -1182,15 +1182,15 @@ void initNvFuserPythonBindings(PyObject* module) {
       "permute",
       [](nvfuser::FusionDefinition::Operators& self,
          nvfuser::Tensor arg,
-         std::vector<int64_t>& permutation) -> nvfuser::Tensor {
+         std::vector<int64_t>& dims) -> nvfuser::Tensor {
         nvfuser::FusionDefinition* fd = self.fusion_definition;
         nvfuser::Tensor output = fd->defineTensor();
         self.fusion_definition->defineRecord(new nvfuser::PermuteOpRecord(
-            {fd->recordingState(arg())},
-            {fd->recordingState(output())},
-            permutation));
+            {fd->recordingState(arg())}, {fd->recordingState(output())}, dims));
         return output;
       },
+      py::arg("arg"),
+      py::arg("dims"),
       py::return_value_policy::reference);
 
   nvf_ops.def(

From bdc71ce264b17d9a375bbc9e54f95a760d53b398 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Tue, 1 Nov 2022 08:56:06 -0700
Subject: [PATCH 0414/1922] Don't Require contiguous For Extern Kernels
 (#87650)

cc @mlazos @soumith @voznesenskym @yanboliang @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @jansel @lezcano @fdrocha
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87650
Approved by: https://github.com/desertfire
---
 test/inductor/test_torchinductor_opinfo.py |  8 ++++----
 torch/_inductor/ir.py                      | 20 +++++++-------------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 371a825b28a30..ea7a32db44cf9 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -189,10 +189,10 @@ def process(device_type):
     "fft.hfft": {b8, f32, f64, i32, i64},
     "fft.hfft2": {b8, f32, f64, i32, i64},
     "fft.hfftn": {b8, f32, f64, i32, i64},
-    "fft.ifft": {b8, f16, f32, f64, i32, i64},
+    "fft.ifft": {f16, f32, f64},
     "fft.ifft2": {b8, f32, f64, i32, i64},
     "fft.ifftn": {b8, f32, f64, i32, i64},
-    "fft.ihfft": {b8, f16, f32, f64, i32, i64},
+    "fft.ihfft": {f16, f32, f64},
     "fft.ihfft2": {f32, f64},
     "fft.ihfftn": {f32, f64},
     "fft.irfft": {b8, f32, f64, i32, i64},
@@ -301,10 +301,10 @@ def process(device_type):
     "fft.hfft": {b8, f16, f32, f64, i32, i64},
     "fft.hfft2": {b8, f16, f32, f64, i32, i64},
     "fft.hfftn": {b8, f16, f32, f64, i32, i64},
-    "fft.ifft": {b8, f16, f32, f64, i32, i64},
+    "fft.ifft": {f16, f32, f64},
     "fft.ifft2": {b8, f16, f32, f64, i32, i64},
     "fft.ifftn": {b8, f16, f32, f64, i32, i64},
-    "fft.ihfft": {b8, f16, f32, f64, i32, i64},
+    "fft.ihfft": {f16, f32, f64},
     "fft.ihfft2": {f16, f32, f64},
     "fft.ihfftn": {f16, f32, f64},
     "fft.irfft": {b8, f16, f32, f64, i32, i64},
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 9836109810767..8ca869df03602 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2259,9 +2259,13 @@ def unflatten_args(new_tensor_args, new_non_tensor_args):
                     new_args.append(next(it_non_tensors))
             return pytree.tree_unflatten(new_args, args_spec)
 
-        tensor_args = [
-            cls.require_contiguous(cls.realize_input(x)) for x in tensor_args
-        ]
+        tensor_args = [cls.realize_input(x) for x in tensor_args]
+
+        # freeze layout otherwise our output stride calculation might
+        # become incorrect
+        for x in tensor_args:
+            if is_storage_and_layout(x):
+                as_storage_and_layout(x, freeze=True)
 
         # We don't have generic shape formulas, so just burn in the
         # shapes and run an example input.
@@ -2367,16 +2371,6 @@ def require_stride1(cls, x):
                 return x
         return cls.copy_input(x)
 
-    @classmethod
-    def require_contiguous(cls, x):
-        if is_contiguous_storage_and_layout(x):
-            as_contiguous_storage_and_layout(x, freeze=True)
-            return x
-        x = cls.copy_input(x)
-        assert is_contiguous_storage_and_layout(x)
-        as_contiguous_storage_and_layout(x, freeze=True)
-        return x
-
     @classmethod
     def require_stride_order(cls, x, order):
         # require x to have the layout as strided_ordered as order

From 3d51dea88815583e2ec43caeb65a61f51d31d035 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Tue, 1 Nov 2022 21:01:31 +0000
Subject: [PATCH 0415/1922] [Vulkan][TCC] Implement tests for cat_batch,
 cat_width and normalize_dim (#87633)

Summary:
Implement Vulkan tests for these untested functions in Concat.cpp:
 - cat_batch
 - cat_width
 - normalize_dim

Test Plan:
```cd ~/fbsource
buck run //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64
```

Differential Revision: D40605571

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87633
Approved by: https://github.com/salilsdesai, https://github.com/kirklandsign, https://github.com/SS-JIA
---
 aten/src/ATen/native/vulkan/ops/Concat.cpp | 17 ++--
 aten/src/ATen/test/vulkan_api_test.cpp     | 95 ++++++++++++++++++++++
 2 files changed, 105 insertions(+), 7 deletions(-)

diff --git a/aten/src/ATen/native/vulkan/ops/Concat.cpp b/aten/src/ATen/native/vulkan/ops/Concat.cpp
index ac15b3924b080..412bda4fcde06 100644
--- a/aten/src/ATen/native/vulkan/ops/Concat.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Concat.cpp
@@ -37,15 +37,15 @@ Tensor cat_feature(
 
     const struct Block final {
       uvec3 size; // output texture size
-      uint32_t fill_0; // dummy
+      uint32_t fill0; // dummy
       uvec3 isize; // input texture size
-      uint32_t fill_1; // dummy
-      uint32_t batch_size; // input tensor's batch size
-      uint32_t ch_size; // input tensor's channel size
+      uint32_t fill1; // dummy
+      uint32_t batchSize; // input tensor's batch size
+      uint32_t chSize; // input tensor's channel size
       uint32_t
-          ch_interval; // channel interval (total # of channels for all tensors)
+          chInterval; // channel interval (total # of channels for all tensors)
       uint32_t
-          ch_size_allprior; // # of channels for tensor 0 to i-1 at ith tensor
+          chSizeAllprior; // # of channels for tensor 0 to i-1 at ith tensor
     } block{
         v_output.extents(),
         0u,
@@ -181,10 +181,12 @@ Tensor cat_height(
   return convert(v_output);
 }
 
-Tensor cat(const at::ITensorListRef& tensors, const int64_t dim) {
+Tensor cat(const at::ITensorListRef& tensors, const int64_t in_dim) {
   TORCH_CHECK(tensors.size() > 0, "Vulkan cat expects at least one tensor");
 
+  const int64_t dim = normalize_dim(in_dim, 4);
   auto materialized = tensors.materialize();
+  TORCH_INTERNAL_ASSERT(materialized.size() > 0, "Accessing empty array");
   const at::Tensor& tensor = materialized[0];
   int64_t cat_dim_size = 0;
   bool is_mult4ch = true;
@@ -209,6 +211,7 @@ Tensor cat(const at::ITensorListRef& tensors, const int64_t dim) {
   }
 
   auto result_size = tensor.sizes().vec();
+  TORCH_INTERNAL_ASSERT(result_size.size() > 0, "Accessing empty array");
   result_size[dim] = cat_dim_size;
 
   vTensor v_output{api::context(), result_size, tensor.options()};
diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index d122438f67586..66d33859ea03c 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -2946,6 +2946,44 @@ TEST_F(VulkanAPITest, view_invalid_inputs) {
   }, ::std::runtime_error);
 }
 
+TEST_F(VulkanAPITest, cat_dim0_invalidinputs_exceptions) {
+  // Arrange: Vulkan cat inputs must have matching sizes except concatenated dimension
+  {
+    const auto in_cpu1 = at::rand({3, 5, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu2 = at::rand({3, 9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
+
+    // Act
+    EXPECT_THROW({
+      const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
+    }, ::c10::Error);
+  }
+
+  // Arrange: Vulkan cat expects 4 dimensional inputs
+  {
+    const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu2 = at::rand({9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
+
+    // Act
+    EXPECT_THROW({
+      const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
+    }, ::c10::Error);
+  }
+
+  // Arrange: Vulkan cat not implemented for batch dimension!
+  {
+    const auto in_cpu1 = at::rand({221, 3, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu2 = at::rand({112, 3, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu3 = at::rand({331, 3, 9, 193}, at::device(at::kCPU).dtype(at::kFloat));
+
+    // Act
+    EXPECT_THROW({
+      const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 0);
+    }, ::c10::Error);
+  }
+}
+
 #if !defined(__APPLE__)
 TEST_F(VulkanAPITest, DISABLED_cat_dim1_samefeature_success) {
   // Arrange
@@ -3174,6 +3212,25 @@ TEST_F(VulkanAPITest, cat_dim2_diffheight_success) {
   ASSERT_TRUE(check);
 }
 
+TEST_F(VulkanAPITest, cat_dim2_negdim_success) {
+  // Arrange
+  const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
+  const auto in_cpu2 = at::rand({3, 9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
+  const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
+
+  // Act
+  const auto out_cpu = at::cat({in_cpu1, in_cpu2, in_cpu3}, -2);
+  const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, -2);
+
+  // Assert
+  const auto check = almostEqual(out_cpu, out_vulkan.cpu());
+  if (!check) {
+    showRtol(out_cpu, out_vulkan.cpu());
+  }
+
+  ASSERT_TRUE(check);
+}
+
 TEST_F(VulkanAPITest, cat_dim2_singledepth_success) {
   // Arrange: batch x channel (1x1) = single depth texture
   const auto in_cpu1 = at::rand({1, 1, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
@@ -3237,6 +3294,44 @@ TEST_F(VulkanAPITest, cat_dim2_invalidinputs_exceptions) {
   }
 }
 
+TEST_F(VulkanAPITest, cat_dim3_invalidinputs_exceptions) {
+  // Arrange: Vulkan cat inputs must have matching sizes except concatenated dimension
+  {
+    const auto in_cpu1 = at::rand({3, 5, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu2 = at::rand({3, 9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
+
+    // Act
+    EXPECT_THROW({
+      const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 3);
+    }, ::c10::Error);
+  }
+
+  // Arrange: Vulkan cat expects 4 dimensional inputs
+  {
+    const auto in_cpu1 = at::rand({3, 9, 221, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu2 = at::rand({9, 112, 193}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu3 = at::rand({3, 9, 331, 193}, at::device(at::kCPU).dtype(at::kFloat));
+
+    // Act
+    EXPECT_THROW({
+      const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 3);
+    }, ::c10::Error);
+  }
+
+  // Arrange: Vulkan cat not implemented for width dimension!
+  {
+    const auto in_cpu1 = at::rand({3, 9, 193, 221}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu2 = at::rand({3, 9, 193, 112}, at::device(at::kCPU).dtype(at::kFloat));
+    const auto in_cpu3 = at::rand({3, 9, 193, 331}, at::device(at::kCPU).dtype(at::kFloat));
+
+    // Act
+    EXPECT_THROW({
+      const auto out_vulkan = at::cat({in_cpu1.vulkan(), in_cpu2.vulkan(), in_cpu3.vulkan()}, 3);
+    }, ::c10::Error);
+  }
+}
+
 TEST_F(VulkanAPITest, permute_2d_success) {
   // Arrange
   const auto in_cpu = at::rand({2, 3}, at::device(at::kCPU).dtype(at::kFloat));

From fcb0198b9ea0af3fc86771d67ec41b21f9337b90 Mon Sep 17 00:00:00 2001
From: Sean Ross-Ross <srossross@gmail.com>
Date: Tue, 1 Nov 2022 21:42:51 +0000
Subject: [PATCH 0416/1922] feature: adding batch support for narrow_copy
 operator (#88130)

Implement batch support https://github.com/pytorch/functorch/issues/825 for narrow copy

narrow_copy was already added as an opinfo

cc @zou3519 @Chillee @samdow @soumith
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88130
Approved by: https://github.com/kshitij12345, https://github.com/zou3519
---
 aten/src/ATen/functorch/BatchRulesViews.cpp | 13 +++++++++++++
 test/functorch/test_vmap.py                 |  1 -
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp
index b8c3727d15dcc..e4513cf69c184 100644
--- a/aten/src/ATen/functorch/BatchRulesViews.cpp
+++ b/aten/src/ATen/functorch/BatchRulesViews.cpp
@@ -490,6 +490,18 @@ std::tuple<Tensor, optional<int64_t>> unfold_batch_rule(
   return std::make_tuple(result, 0);
 }
 
+std::tuple<Tensor, optional<int64_t>> narrow_copy_batch_rule(
+    const Tensor &self, optional<int64_t> self_bdim, int64_t dim, c10::SymInt start, c10::SymInt length)
+{
+  TORCH_INTERNAL_ASSERT(self_bdim.has_value());
+  auto self_ = moveBatchDimToFront(self, self_bdim);
+  auto logical_rank = rankWithoutBatchDim(self, self_bdim);
+  dim = maybe_wrap_dim(dim, logical_rank) + 1;
+  auto result = self_.narrow_copy_symint(dim, start, length);
+
+  return std::make_tuple(result, 0);
+}
+
 std::tuple<Tensor, optional<int64_t>> movedim_batch_rule(const Tensor& self, optional<int64_t> self_bdim, IntArrayRef source, IntArrayRef destination) {
   auto self_ = moveBatchDimToFront(self, self_bdim);
   auto source_ = getPhysicalDims(self_, self_bdim.has_value(), source);
@@ -539,6 +551,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   VMAP_SUPPORT2(slice, Tensor, slice_batch_rule);
   VMAP_SUPPORT2(transpose, int, transpose_int_batch_rule);
   VMAP_SUPPORT(diag_embed, diag_embed_batch_rule);
+  VMAP_SUPPORT(narrow_copy, narrow_copy_batch_rule);
 }
 
 }}
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index be457cfe25fcd..e8863781ad306 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3331,7 +3331,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('masked_scatter'),
         xfail('masked_select'),
         xfail('nanquantile'),
-        xfail('narrow_copy'),  # hit the vmap fallback which is currently disabled
         xfail('ormqr'),
         xfail('put'),
         xfail('quantile'),

From a78fe805221e4ba75905fbb08cf6b9a5c606642a Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 1 Nov 2022 22:17:12 +0000
Subject: [PATCH 0417/1922] [GHF] Remove CC line from commit message (#88252)

This line is added by autoCCBot, but is not really meaningful as commit
message

Test Plan:
```
>>> from trymerge import GitHubPR, RE_PR_CC_LINE
>>> import re
>>> pr=GitHubPR("pytorch", "pytorch", 87809)
>>> re.sub(RE_PR_CC_LINE, "", pr.get_body())
'Fixes #ISSUE_NUMBER\r\n\n\n'
>>> pr=GitHubPR("pytorch", "pytorch", 87913)
>>> re.sub(RE_PR_CC_LINE, "", pr.get_body())
'Parallel compilation warms the Threadpool when we call `torch._dynamo.optimize()`. In current benchmarks, we were setting up the TRITON_CACHE_DIR much later. Because of this parallel compilation artifacts were not used and compilation latency improvements were not visible in dashboard. This PR just prepones the setup of TRITON_CACHE_DIR.\n\n'
>>> pr=GitHubPR("pytorch", "pytorch", 85692)
>>> re.sub(RE_PR_CC_LINE, "", pr.get_body())
'This PR sets CUDA_MODULE_LOADING if it\'s not set by the user. By default, it sets it to "LAZY".\r\n\r\nIt was tested using the following commands:\r\n```\r\npython -c "import torch; tensor=torch.randn(20, 16, 50, 100).cuda(); free, total = torch.cuda.cudart().cudaMemGetInfo(0); print(total-free)"\r\n```\r\nwhich shows a memory usage of: 287,047,680 bytes\r\n\r\nvs\r\n\r\n```\r\nCUDA_MODULE_LOADING="DEFAULT" python -c "import torch; tensor=torch.randn(20, 16, 50, 100).cuda(); free, total = torch.cuda.cudart().cudaMemGetInfo(0); print(total-free)"\r\n```\r\nwhich shows 666,632,192 bytes. \r\n\r\nC++ implementation is needed for the libtorch users (otherwise it could have been a pure python functionality).\r\n\r\n'
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88252
Approved by: https://github.com/xuzhao9, https://github.com/izaitsevfb
---
 .github/scripts/trymerge.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py
index 502b22d847d23..697b4b94faac4 100755
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@@ -407,6 +407,7 @@ class WorkflowCheckState(NamedTuple):
     r'https://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/pull/(?P<number>[0-9]+)',
     re.MULTILINE
 )
+RE_PR_CC_LINE = re.compile(r'^cc:? @\w+.*\r?\n?$', re.MULTILINE)
 RE_DIFF_REV = re.compile(r'^Differential Revision:.+?(D[0-9]+)', re.MULTILINE)
 CIFLOW_LABEL = re.compile(r"^ciflow/.+")
 CIFLOW_TRUNK_LABEL = re.compile(r"^ciflow/trunk")
@@ -907,8 +908,12 @@ def gen_commit_message(self, filter_ghstack: bool = False) -> str:
             filters out ghstack info """
         # Adding the url here makes it clickable within the Github UI
         approved_by_urls = ', '.join(prefix_with_github_url(login) for login in self.get_approved_by())
+        # Remove "cc: " line from the message body
+        msg_body = re.sub(RE_PR_CC_LINE, "", self.get_body())
+        if filter_ghstack:
+            msg_body = re.sub(RE_GHSTACK_DESC, "", msg_body)
         msg = self.get_title() + f" (#{self.pr_num})\n\n"
-        msg += self.get_body() if not filter_ghstack else re.sub(RE_GHSTACK_DESC, "", self.get_body())
+        msg += msg_body
         msg += f"\nPull Request resolved: {self.get_pr_url()}\n"
         msg += f"Approved by: {approved_by_urls}\n"
         return msg

From 36a915150fb683244ff1d9a40960aa08782e1186 Mon Sep 17 00:00:00 2001
From: "Andrew M. James" <andrew.m.james2@gmail.com>
Date: Mon, 31 Oct 2022 18:29:02 -0500
Subject: [PATCH 0418/1922] Update torch.abs and torch.positive opinfos to
 reflect sparse support (#88151)

cc @nikitaved @pearu @cpuhrsch @bhosmer
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88151
Approved by: https://github.com/cpuhrsch
---
 test/test_sparse_csr.py                               | 1 +
 torch/testing/_internal/common_methods_invocations.py | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index 7e364ad94e071..b44720a4bba62 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -62,6 +62,7 @@ def _check_cusparse_sddmm_available():
     'abs',
     'conj_physical',
     'neg',
+    'positive'
 ]
 
 # This should be just an import from test_linalg instead of code duplication
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 3f4e2b6e03730..f5037a21e14e0 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7882,6 +7882,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                                     device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
                                     device_type='cpu', dtypes=[torch.cfloat]),
+                       DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), "TestSparseUnaryUfuncs",
+                                    "test_inplace", dtypes=(torch.cdouble, torch.cfloat, torch.chalf)),
                        # Reference: https://github.com/pytorch/pytorch/issues/49224
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_small',
                                     dtypes=[torch.int8], active_if=TEST_WITH_ASAN),
@@ -7893,6 +7895,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    ),
                    supports_fwgrad_bwgrad=True,
                    assert_autodiffed=True,
+                   supports_sparse=True,
                    supports_sparse_csr=True,
                    supports_sparse_csc=True,
                    supports_sparse_bsr=True,
@@ -8795,6 +8798,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    supports_out=False,
                    supports_forward_ad=True,
                    supports_fwgrad_bwgrad=True,
+                   supports_sparse=True,
+                   supports_sparse_csr=True,
+                   supports_sparse_csc=True,
+                   supports_sparse_bsr=True,
+                   supports_sparse_bsc=True,
                    ),
     UnaryUfuncInfo('conj',
                    ref=np.conj,

From d1ba3351a218bbe40e38026f7bb3f9926b3dd615 Mon Sep 17 00:00:00 2001
From: "Andrew M. James" <andrew.m.james2@gmail.com>
Date: Mon, 31 Oct 2022 18:29:05 -0500
Subject: [PATCH 0419/1922] Remove BSC conversion skip from
 TestSparseCompressed.test_consistency (#88152)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88152
Approved by: https://github.com/cpuhrsch
---
 test/test_sparse_csr.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index b44720a4bba62..595e3f4e35880 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -588,9 +588,6 @@ def test_consistency(self, layout, device, dtype, op):
         if require_mask and layout in {torch.sparse_bsr, torch.sparse_bsc}:
             self.skipTest(f"{op.name} does not support input with {layout} layout")
 
-        if layout is torch.sparse_bsc:
-            self.skipTest(f"test requires conversion from Strided layout to {layout} layout")
-
         samples = list(op.sample_inputs(device, dtype))
 
         # Fail early to prevent silent success with this test

From 32f95ed38637a476d729712f89ac3c42ac5f3806 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@fb.com>
Date: Tue, 1 Nov 2022 17:46:00 +0000
Subject: [PATCH 0420/1922] Copy DDP code to be reused in composable API
 (#87836)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87836
Approved by: https://github.com/mrshenli
---
 torch/distributed/_composable/_ddp.py | 1867 +++++++++++++++++++++++++
 1 file changed, 1867 insertions(+)
 create mode 100644 torch/distributed/_composable/_ddp.py

diff --git a/torch/distributed/_composable/_ddp.py b/torch/distributed/_composable/_ddp.py
new file mode 100644
index 0000000000000..51fac5419babc
--- /dev/null
+++ b/torch/distributed/_composable/_ddp.py
@@ -0,0 +1,1867 @@
+import sys
+import copy
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Type
+from enum import Enum, auto
+import inspect
+import itertools
+import logging
+import os
+import warnings
+from contextlib import contextmanager
+
+import torch
+import torch.distributed as dist
+from torch.autograd import Function, Variable
+from torch.distributed.algorithms.join import (
+    Join,
+    Joinable,
+    JoinHook,
+)
+
+from torch.utils._pytree import tree_flatten, tree_unflatten
+
+RPC_AVAILABLE = False
+if dist.is_available():
+    from torch.distributed.utils import (
+        _verify_param_shape_across_processes,
+        _sync_module_states,
+        _to_kwargs,
+    )
+    from torch.distributed.distributed_c10d import ReduceOp, _get_default_group
+if torch.distributed.rpc.is_available():
+    RPC_AVAILABLE = True
+    from torch.distributed.rpc import RRef
+
+from torch._utils import _get_device_index
+
+from torch.nn.modules import Module
+from torch.nn.parallel._replicated_tensor_ddp_utils import (
+    _ddp_with_replicated_tensor_enabled,
+)
+from torch.nn.parallel.scatter_gather import (
+    gather,
+    scatter_kwargs,
+)  # noqa: F401
+
+__all__ = ["DistributedDataParallel"]
+
+logger = logging.getLogger(__name__)
+
+
+def _tree_flatten_with_rref(output):
+    output_is_rref = RPC_AVAILABLE and isinstance(output, RRef)
+    if output_is_rref:
+        output_tensor_list, treespec = tree_flatten(output.local_value())
+    else:
+        output_tensor_list, treespec = tree_flatten(output)
+    # Need to return flattened tensors, spec to re-pack them, as well
+    # as if the return type was actually an RRef to reconstruct.
+    return output_tensor_list, treespec, output_is_rref
+
+
+def _tree_unflatten_with_rref(output, treespec, output_is_rref):
+    output = tree_unflatten(output, treespec)
+    if output_is_rref:
+        output = RRef(output)
+    return output
+
+
+def _find_tensors(obj):
+    r"""
+    Recursively find all tensors contained in the specified object.
+    """
+    if RPC_AVAILABLE and isinstance(obj, RRef):
+        # If the current node is the owner of the RRef, unwrap it and try to
+        # find Tensors.
+        # TODO: Expand to remote RRefs.
+        if obj.is_owner():
+            return _find_tensors(obj.local_value())
+    if isinstance(obj, torch.Tensor):
+        return [obj]
+    if isinstance(obj, (list, tuple)):
+        return itertools.chain(*map(_find_tensors, obj))
+    if isinstance(obj, dict):
+        return itertools.chain(*map(_find_tensors, obj.values()))
+    return []
+
+
+def _dump_DDP_relevant_env_vars():
+    relevant_env_vars = [
+        "RANK",
+        "LOCAL_RANK",
+        "WORLD_SIZE",
+        "MASTER_PORT",
+        "MASTER_ADDR",
+        "CUDA_VISIBLE_DEVICES",
+        "GLOO_SOCKET_IFNAME",
+        "GLOO_DEVICE_TRANSPORT",
+        "NCCL_SOCKET_IFNAME",
+        "NCCL_BLOCKING_WAIT",
+        "NCCL_DEBUG",
+        "NCCL_DEBUG_SUBSYS",
+        "NCCL_IB_DISABLE",
+        # More NCCL env vars:
+        "NCCL_P2P_DISABLE",
+        "NCCL_P2P_LEVEL",
+        "NCCL_SHM_DISABLE",
+        "NCCL_SOCKET_NTHREADS",
+        "NCCL_NSOCKS_PERTHREAD",
+        "NCCL_BUFFSIZE",
+        "NCCL_NTHREADS",
+        "NCCL_RINGS",
+        "NCCL_MAX_NCHANNELS",
+        "NCCL_MIN_NCHANNELS",
+        "NCCL_CHECKS_DISABLE",
+        "NCCL_CHECK_POINTERS",
+        "NCCL_LAUNCH_MODE",
+        "NCCL_IB_HCA",
+        "NCCL_IB_TIMEOUT",
+        "NCCL_IB_RETRY_CNT",
+        "NCCL_IB_GID_INDEX",
+        "NCCL_IB_SL",
+        "NCCL_IB_TC",
+        "NCCL_IB_AR_THRESHOLD",
+        "NCCL_IB_CUDA_SUPPORT",
+        "NCCL_NET_GDR_LEVEL",
+        "NCCL_NET_GDR_READ",
+        "NCCL_SINGLE_RING_THRESHOLD",
+        "NCCL_LL_THRESHOLD",
+        "NCCL_TREE_THRESHOLD",
+        "NCCL_ALGO",
+        "NCCL_PROTO",
+        "NCCL_IGNORE_CPU_AFFINITY",
+        "NCCL_DEBUG_FILE",
+        "NCCL_COLLNET_ENABLE",
+        "NCCL_TOPO_FILE",
+        "NCCL_TOPO_DUMP_FILE",
+        "NCCL_ASYNC_ERROR_HANDLING",
+    ]
+    formatted_output = ""
+    for var in relevant_env_vars:
+        value = os.environ[var] if var in os.environ else "N/A"
+        formatted_output += "env:%s=%s\n" % (var, value)
+    print(formatted_output)
+
+
+class _BufferCommHookLocation(Enum):
+    PRE_FORWARD = auto()
+    POST_FORWARD = auto()
+
+
+@dataclass
+class _BufferCommHook:
+    buffer_comm_hook: Callable
+    buffer_comm_hook_state: Any
+    buffer_comm_hook_location: _BufferCommHookLocation
+
+
+# Add a DDPSink to run various functions when backwards starts, such as
+# queueing call back of out-most backward/graph task,
+# this helps call back is fired after all gradients' calculation
+# is completed.
+class _DDPSink(Function):
+    @staticmethod
+    def forward(ctx, reducer, state_dict, *inputs):
+        # set_materialize_grads(False) will ensure that None gradients stay as
+        # None and are not filled with zeros.
+        ctx.set_materialize_grads(False)
+        ctx.reducer = reducer
+        ctx.state_dict = state_dict
+        ret = tuple(
+            inp.clone() if isinstance(inp, torch.Tensor) else inp
+            for inp in inputs
+        )
+        return ret
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        state_dict = ctx.state_dict
+        # Enqueue delay allreduce for static graph training on the first
+        # iteration.
+        if (
+            ctx.state_dict["static_graph"]
+            and ctx.state_dict["num_iterations"] == 1
+        ):
+            Variable._execution_engine.queue_callback(
+                ctx.reducer._delay_all_reduce
+            )
+
+        return (None, None, *grad_outputs)
+
+
+class _DDPJoinHook(JoinHook):
+    def __init__(self, ddp, divide_by_initial_world_size):
+        """
+        Sets config variables for internal usage.
+        """
+        assert isinstance(ddp, DistributedDataParallel), (
+            "DDP join hook requires passing in a DistributedDataParallel "
+            "instance as the state"
+        )
+        assert ddp.logger is not None
+        ddp.logger._set_uneven_input_join()
+        self.ddp = ddp
+        self.ddp._divide_by_initial_world_size = divide_by_initial_world_size
+        super().__init__()
+
+    def main_hook(self):
+        """
+        Shadows the DDP collective communication operations in the forward and
+        backward passes.
+        """
+        ddp = self.ddp
+        # Buckets are rebuilt only once during a training period
+        ddp.reducer._rebuild_buckets()
+
+        # Schedule a broadcast if we are syncing module buffers in the
+        # forward pass
+        # TODO: make DDP uneven inputs context manager support buffer
+        # comm hook (https://github.com/pytorch/pytorch/issues/65436)
+        ddp._check_and_sync_module_buffers()
+
+        # Check if need to sync in the backward pass
+        work = ddp._check_global_requires_backward_grad_sync(
+            is_joined_rank=True
+        )
+        work.wait()
+        should_sync_backwards = work.result()[0].item() != 0
+        # Forward parameter sync is disabled in the next iteration if we
+        # are skipping gradient sync this iteration, so set
+        # `require_forward_param_sync` accordingly
+        ddp.require_forward_param_sync = should_sync_backwards
+        if not should_sync_backwards:
+            return
+
+        # Schedule one allreduce per gradient bucket to match the backward
+        # pass allreduce
+        ddp._match_all_reduce_for_bwd_pass()
+
+        # Check if we need to allreduce locally unused parameters
+        if ddp.find_unused_parameters:
+            ddp._match_unused_params_allreduce()
+
+        # Rebuilt parameters are pushed only once during a training period
+        ddp.reducer._push_all_rebuilt_params()
+
+    def post_hook(self, is_last_joiner: bool):
+        """
+        Syncs the final model to ensure that the model is the same across all
+        processes.
+        """
+        self.ddp._sync_final_model(is_last_joiner)
+
+
+class DistributedDataParallel(Module, Joinable):
+    r"""Implements distributed data parallelism that is based on
+    ``torch.distributed`` package at the module level.
+
+    This container provides data parallelism by synchronizing gradients
+    across each model replica. The devices to synchronize across are
+    specified by the input ``process_group``, which is the entire world
+    by default. Note that ``DistributedDataParallel`` does not chunk or
+    otherwise shard the input across participating GPUs; the user is
+    responsible for defining how to do so, for example through the use
+    of a :class:`DistributedSampler`.
+
+    See also: :ref:`distributed-basics` and :ref:`cuda-nn-ddp-instead`.
+    The same constraints on input as in :class:`torch.nn.DataParallel` apply.
+
+    Creation of this class requires that ``torch.distributed`` to be already
+    initialized, by calling :func:`torch.distributed.init_process_group`.
+
+    ``DistributedDataParallel`` is proven to be significantly faster than
+    :class:`torch.nn.DataParallel` for single-node multi-GPU data
+    parallel training.
+
+    To use ``DistributedDataParallel`` on a host with N GPUs, you should spawn
+    up ``N`` processes, ensuring that each process exclusively works on a single
+    GPU from 0 to N-1. This can be done by either setting
+    ``CUDA_VISIBLE_DEVICES`` for every process or by calling:
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> torch.cuda.set_device(i)
+
+    where i is from 0 to N-1. In each process, you should refer the following
+    to construct this module:
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> torch.distributed.init_process_group(
+        >>>     backend='nccl', world_size=N, init_method='...'
+        >>> )
+        >>> model = DistributedDataParallel(model, device_ids=[i], output_device=i)
+
+    In order to spawn up multiple processes per node, you can use either
+    ``torch.distributed.launch`` or ``torch.multiprocessing.spawn``.
+
+    .. note::
+        Please refer to `PyTorch Distributed Overview <https://pytorch.org/tutorials/beginner/dist_overview.html>`__
+        for a brief introduction to all features related to distributed training.
+
+    .. note::
+        ``DistributedDataParallel`` can be used in conjunction with
+        :class:`torch.distributed.optim.ZeroRedundancyOptimizer` to reduce
+        per-rank optimizer states memory footprint. Please refer to
+        `ZeroRedundancyOptimizer recipe <https://pytorch.org/tutorials/recipes/zero_redundancy_optimizer.html>`__
+        for more details.
+
+    .. note:: ``nccl`` backend is currently the fastest and highly recommended
+        backend when using GPUs. This applies to both single-node and
+        multi-node distributed training.
+
+    .. note:: This module also supports mixed-precision distributed training.
+        This means that your model can have different types of parameters such
+        as mixed types of ``fp16`` and ``fp32``, the gradient reduction on these
+        mixed types of parameters will just work fine.
+
+    .. note:: If you use ``torch.save`` on one process to checkpoint the module,
+        and ``torch.load`` on some other processes to recover it, make sure that
+        ``map_location`` is configured properly for every process. Without
+        ``map_location``, ``torch.load`` would recover the module to devices
+        where the module was saved from.
+
+    .. note:: When a model is trained on ``M`` nodes with ``batch=N``, the
+        gradient will be ``M`` times smaller when compared to the same model
+        trained on a single node with ``batch=M*N`` if the loss is summed (NOT
+        averaged as usual) across instances in a batch (because the gradients
+        between different nodes are averaged). You should take this into
+        consideration when you want to obtain a mathematically equivalent
+        training process compared to the local training counterpart. But in most
+        cases, you can just treat a DistributedDataParallel wrapped model, a
+        DataParallel wrapped model and an ordinary model on a single GPU as the
+        same (E.g. using the same learning rate for equivalent batch size).
+
+    .. note::
+        Parameters are never broadcast between processes. The module performs
+        an all-reduce step on gradients and assumes that they will be modified
+        by the optimizer in all processes in the same way. Buffers
+        (e.g. BatchNorm stats) are broadcast from the module in process of rank
+        0, to all other replicas in the system in every iteration.
+
+    .. note::
+        If you are using DistributedDataParallel in conjunction with the
+        :ref:`distributed-rpc-framework`, you should always use
+        :meth:`torch.distributed.autograd.backward` to compute gradients and
+        :class:`torch.distributed.optim.DistributedOptimizer` for optimizing
+        parameters.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> import torch.distributed.autograd as dist_autograd
+            >>> from torch.nn.parallel import DistributedDataParallel as DDP
+            >>> import torch
+            >>> from torch import optim
+            >>> from torch.distributed.optim import DistributedOptimizer
+            >>> import torch.distributed.rpc as rpc
+            >>> from torch.distributed.rpc import RRef
+            >>>
+            >>> t1 = torch.rand((3, 3), requires_grad=True)
+            >>> t2 = torch.rand((3, 3), requires_grad=True)
+            >>> rref = rpc.remote("worker1", torch.add, args=(t1, t2))
+            >>> ddp_model = DDP(my_model)
+            >>>
+            >>> # Setup optimizer
+            >>> optimizer_params = [rref]
+            >>> for param in ddp_model.parameters():
+            >>>     optimizer_params.append(RRef(param))
+            >>>
+            >>> dist_optim = DistributedOptimizer(
+            >>>     optim.SGD,
+            >>>     optimizer_params,
+            >>>     lr=0.05,
+            >>> )
+            >>>
+            >>> with dist_autograd.context() as context_id:
+            >>>     pred = ddp_model(rref.to_here())
+            >>>     loss = loss_func(pred, target)
+            >>>     dist_autograd.backward(context_id, [loss])
+            >>>     dist_optim.step(context_id)
+
+    .. note::
+        DistributedDataParallel currently offers limited support for gradient
+        checkpointing with :meth:`torch.utils.checkpoint`. DDP will work as
+        expected when there are no unused parameters in the model and each layer
+        is checkpointed at most once (make sure you are not passing
+        `find_unused_parameters=True` to DDP). We currently do not support the
+        case where a layer is checkpointed multiple times, or when there unused
+        parameters in the checkpointed model.
+
+    .. note::
+        To let a non-DDP model load a state dict from a DDP model,
+        :meth:`~torch.nn.modules.utils.consume_prefix_in_state_dict_if_present`
+        needs to be applied to strip the prefix "module." in the DDP state dict before loading.
+
+    .. warning::
+        Constructor, forward method, and differentiation of the output (or a
+        function of the output of this module) are distributed synchronization
+        points. Take that into account in case different processes might be
+        executing different code.
+
+    .. warning::
+        This module assumes all parameters are registered in the model by the
+        time it is created. No parameters should be added nor removed later.
+        Same applies to buffers.
+
+    .. warning::
+        This module assumes all parameters are registered in the model of each
+        distributed processes are in the same order. The module itself will
+        conduct gradient ``allreduce`` following the reverse order of the
+        registered parameters of the model. In other words, it is users'
+        responsibility to ensure that each distributed process has the exact
+        same model and thus the exact same parameter registration order.
+
+    .. warning::
+        This module allows parameters with non-rowmajor-contiguous strides.
+        For example, your model may contain some parameters whose
+        :class:`torch.memory_format` is ``torch.contiguous_format``
+        and others whose format is ``torch.channels_last``.  However,
+        corresponding parameters in different processes must have the
+        same strides.
+
+    .. warning::
+        This module doesn't work with :func:`torch.autograd.grad` (i.e. it will
+        only work if gradients are to be accumulated in ``.grad`` attributes of
+        parameters).
+
+    .. warning::
+        If you plan on using this module with a ``nccl`` backend or a ``gloo``
+        backend (that uses Infiniband), together with a DataLoader that uses
+        multiple workers, please change the multiprocessing start method to
+        ``forkserver`` (Python 3 only) or ``spawn``. Unfortunately
+        Gloo (that uses Infiniband) and NCCL2 are not fork safe, and you will
+        likely experience deadlocks if you don't change this setting.
+
+    .. warning::
+        You should never try to change your model's parameters after wrapping
+        up your model with ``DistributedDataParallel``. Because, when
+        wrapping up your model with ``DistributedDataParallel``, the constructor
+        of ``DistributedDataParallel`` will register the additional gradient
+        reduction functions on all the parameters of the model itself at the
+        time of construction. If you change the model's parameters afterwards,
+        gradient redunction functions no longer match the correct set of
+        parameters.
+
+    .. warning::
+        Using ``DistributedDataParallel`` in conjunction with the
+        :ref:`distributed-rpc-framework` is experimental and subject to change.
+
+    Args:
+        module (Module): module to be parallelized
+        device_ids (list of int or torch.device): CUDA devices.
+                   1) For single-device modules, ``device_ids`` can
+                   contain exactly one device id, which represents the only
+                   CUDA device where the input module corresponding to this process resides.
+                   Alternatively, ``device_ids`` can also be ``None``.
+                   2) For multi-device modules and CPU modules,
+                   ``device_ids`` must be ``None``.
+
+                   When ``device_ids`` is ``None`` for both cases,
+                   both the input data for the forward pass and the actual module
+                   must be placed on the correct device.
+                   (default: ``None``)
+        output_device (int or torch.device): Device location of output for
+                      single-device CUDA modules. For multi-device modules and
+                      CPU modules, it must be ``None``, and the module itself
+                      dictates the output location. (default: ``device_ids[0]``
+                      for single-device modules)
+        broadcast_buffers (bool): Flag that enables syncing (broadcasting)
+                          buffers of the module at beginning of the ``forward``
+                          function. (default: ``True``)
+        process_group: The process group to be used for distributed data
+                       all-reduction. If ``None``, the default process group, which
+                       is created by :func:`torch.distributed.init_process_group`,
+                       will be used. (default: ``None``)
+        bucket_cap_mb: ``DistributedDataParallel`` will bucket parameters into
+                       multiple buckets so that gradient reduction of each
+                       bucket can potentially overlap with backward computation.
+                       :attr:`bucket_cap_mb` controls the bucket size in
+                       MegaBytes (MB). (default: 25)
+        find_unused_parameters (bool): Traverse the autograd graph from all
+                               tensors contained in the return value of the
+                               wrapped module's ``forward`` function. Parameters
+                               that don't receive gradients as part of this
+                               graph are preemptively marked as being ready to
+                               be reduced. In addition, parameters that may have
+                               been used in the wrapped module's ``forward``
+                               function but were not part of loss computation and
+                               thus would also not receive gradients are
+                               preemptively marked as ready to be reduced.
+                               (default: ``False``)
+        check_reduction: This argument is deprecated.
+        gradient_as_bucket_view (bool): When set to ``True``, gradients will be views
+                      pointing to different offsets of ``allreduce`` communication
+                      buckets. This can reduce peak memory usage, where the
+                      saved memory size will be equal to the total gradients
+                      size. Moreover, it avoids the overhead of copying between
+                      gradients and ``allreduce`` communication buckets. When
+                      gradients are views, ``detach_()`` cannot be called on the
+                      gradients. If hitting such errors, please fix it by
+                      referring to the :meth:`~torch.optim.Optimizer.zero_grad`
+                      function in ``torch/optim/optimizer.py`` as a solution.
+                      Note that gradients will be views after first iteration, so
+                      the peak memory saving should be checked after first iteration.
+        static_graph (bool): When set to ``True``, DDP knows the trained graph is
+                     static. Static graph means 1) The set of used and unused
+                     parameters will not change during the whole training loop; in
+                     this case, it does not matter whether users set
+                     ``find_unused_parameters = True`` or not. 2) How the graph is trained
+                     will not change during the whole training loop (meaning there is
+                     no control flow depending on iterations).
+                     When static_graph is set to be ``True``, DDP will support cases that
+                     can not be supported in the past:
+                     1) Reentrant backwards.
+                     2) Activation checkpointing multiple times.
+                     3) Activation checkpointing when model has unused parameters.
+                     4) There are model parameters that are outside of forward function.
+                     5) Potentially improve performance when there are unused parameters,
+                     as DDP will not search graph in each iteraton to detect unused
+                     parameters when static_graph is set to be ``True``.
+                     To check whether you can set static_graph to be ``True``, one way is to
+                     check ddp logging data at the end of your previous model training,
+                     if ``ddp_logging_data.get("can_set_static_graph") == True``, mostly you
+                     can set ``static_graph = True`` as well.
+
+                     Example::
+                         >>> # xdoctest: +SKIP("undefined variables")
+                         >>> model_DDP = torch.nn.parallel.DistributedDataParallel(model)
+                         >>> # Training loop
+                         >>> ...
+                         >>> ddp_logging_data = model_DDP._get_ddp_logging_data()
+                         >>> static_graph = ddp_logging_data.get("can_set_static_graph")
+
+
+    Attributes:
+        module (Module): the module to be parallelized.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...')
+        >>> net = torch.nn.parallel.DistributedDataParallel(model)
+    """
+
+    # used to track whether the given thread is inside ddp forward for torchdynamo purposes
+    _active_ddp_module = None
+
+    def __init__(
+        self,
+        module,
+        device_ids=None,
+        output_device=None,
+        dim=0,
+        broadcast_buffers=True,
+        process_group=None,
+        bucket_cap_mb=25,
+        find_unused_parameters=False,
+        check_reduction=False,
+        gradient_as_bucket_view=False,
+        static_graph=False,
+    ):
+
+        super(DistributedDataParallel, self).__init__()
+        Joinable.__init__(self)
+        self.logger: Optional[dist.Logger] = None
+        if not any((p.requires_grad for p in module.parameters())):
+            self._log_and_throw(
+                RuntimeError,
+                "DistributedDataParallel is not needed when a module "
+                "doesn't have any parameter that requires a gradient.",
+            )
+
+        if device_ids is not None and len(device_ids) > 1:
+            self._log_and_throw(
+                ValueError,
+                "device_ids can only be None or contain a single element.",
+            )
+
+        self.is_multi_device_module = (
+            len({p.device for p in module.parameters()}) > 1
+        )
+        distinct_device_types = {p.device.type for p in module.parameters()}
+        if len(distinct_device_types) != 1:
+            self._log_and_throw(
+                ValueError,
+                "DistributedDataParallel's input module must be on "
+                "the same type of devices, but input module parameters locate in {}.".format(
+                    distinct_device_types
+                ),
+            )
+
+        self.device_type = list(distinct_device_types)[0]
+
+        if (
+            device_ids is None
+            or len(device_ids) == 0  # For backward compatibility.
+            or self.device_type == "cpu"
+            or self.is_multi_device_module
+        ):
+            if device_ids or output_device:
+                self._log_and_throw(
+                    ValueError,
+                    "DistributedDataParallel device_ids and output_device arguments "
+                    "only work with single-device/multiple-device GPU modules or CPU modules, "
+                    "but got device_ids {}, output_device {}, and module parameters {}.".format(
+                        device_ids,
+                        output_device,
+                        {p.device for p in module.parameters()},
+                    ),
+                )
+
+            self.device_ids = None
+            self.output_device = None
+        else:
+            self.device_ids = [_get_device_index(x, True) for x in device_ids]
+
+            if output_device is None:
+                output_device = device_ids[0]
+
+            self.output_device = _get_device_index(output_device, True)
+
+        if process_group is None:
+            self.process_group = _get_default_group()
+        else:
+            self.process_group = process_group
+
+        self.static_graph = False
+        self.dim = dim
+        self.module = module
+        self.device = list(self.module.parameters())[0].device
+        self.broadcast_buffers = broadcast_buffers
+        self.find_unused_parameters = find_unused_parameters
+        self.require_backward_grad_sync = True
+        self.require_forward_param_sync = True
+        self.gradient_as_bucket_view = gradient_as_bucket_view
+        if hasattr(module, "_ddp_params_and_buffers_to_ignore"):
+            self.parameters_to_ignore = module._ddp_params_and_buffers_to_ignore
+        else:
+            self.parameters_to_ignore = []
+
+        self._use_replicated_tensor_module = (
+            _ddp_with_replicated_tensor_enabled()
+        )
+        self._build_replicated_tensor_module()
+
+        if check_reduction:
+            # This argument is no longer used since the reducer
+            # will ensure reduction completes even if some parameters
+            # do not receive gradients.
+            warnings.warn(
+                "The `check_reduction` argument in `DistributedDataParallel` "
+                "module is deprecated. Please avoid using it."
+            )
+
+        # Check that a module does not have Uninitialized parameters
+        for param in module.parameters():
+            if isinstance(param, torch.nn.parameter.UninitializedParameter):
+                self._log_and_throw(
+                    RuntimeError,
+                    "Modules with uninitialized parameters can't be used with `DistributedDataParallel`. "
+                    "Run a dummy forward pass to correctly initialize the modules",
+                )
+        # used for intra-node param sync and inter-node sync as well
+        self.broadcast_bucket_size = int(250 * 1024 * 1024)
+
+        # reduction bucket size
+        self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024)
+        # Whether to perform input tensor CPU to GPU copies on a side-stream
+        self.use_side_stream_for_tensor_copies = (
+            os.environ.get("PYTORCH_DDP_USE_SIDE_STREAM", "1") == "1"
+        )
+
+        # Build parameters for reducer.
+        parameters, expect_sparse_gradient = self._build_params_for_reducer()
+        # Verify model equivalence.
+        _verify_param_shape_across_processes(self.process_group, parameters)
+        # Sync params and buffers. Ensures all DDP models start off at the same value.
+        _sync_module_states(
+            module=self.module,
+            process_group=self.process_group,
+            broadcast_bucket_size=self.broadcast_bucket_size,
+            src=0,
+            params_and_buffers_to_ignore=self.parameters_to_ignore,
+        )
+        # In debug mode, build a mapping of parameter index -> parameter.
+        param_to_name_mapping = self._build_debug_param_to_name_mapping(
+            parameters
+        )
+        # Builds reducer.
+        self._ddp_init_helper(
+            parameters,
+            expect_sparse_gradient,
+            param_to_name_mapping,
+            static_graph,
+        )
+        self._has_rebuilt_buckets = False
+
+        if static_graph:
+            self._set_static_graph()
+
+    def _build_replicated_tensor_module(self):
+        if self._use_replicated_tensor_module:
+            # Create a module with ReplicatedTensor without copying tensors. Avoid
+            # registering '_replicated_tensor_module' as a submodule by directly
+            # adding to self.__dict__.
+            from torch.nn.parallel._replicated_tensor_ddp_interop import (
+                _replicate_module,
+            )
+
+            self.__dict__["_replicated_tensor_module"] = _replicate_module(
+                self.module, self.process_group
+            )
+
+    def _log_and_throw(self, err_type, err_msg):
+        if self.logger is not None:
+            self.logger.set_error_and_log(f"{str(err_type)}: {err_msg}")
+        raise err_type(err_msg)
+
+    def _ddp_init_helper(
+        self,
+        parameters,
+        expect_sparse_gradient,
+        param_to_name_mapping,
+        static_graph,
+    ):
+        """
+        Initialization helper function that does the following:
+        (1) bucketing the parameters for reductions
+        (2) resetting the bucketing states
+        (3) registering the grad hooks
+        (4) Logging construction-time DDP logging data
+        (5) passing a handle of DDP to SyncBatchNorm Layer
+        """
+        self.num_iterations = 0
+        # Notice, the parameters order is not in the order in which they are used,
+        # especially in models with control flow.
+        #
+        # Alongside parameters are not presented in the real execution order,
+        # if a certain model happens to also
+        #   1) have other collectives comm ops in its backward graph.
+        #   2) have unused parameter in subset ranks of the whole world.
+        # bucketing could insert ALL-REDUCE comm op too early on the rank with unused parameter,
+        # matching up with other collectives comm ops on other ranks unexpectedly.
+        #
+        # In order to handle this corner case, when the parameters are not in the real execution order,
+        # we don't do bucketing, thus only one ALL-REDUCE is inserted after all the gradients
+        # of the whole graph are computed.
+        #
+        # Notice, here we only disable bucketing for the first iteration.
+        # After the first iteration, it's OK to rebuild buckets,
+        # because "bucket rebuild" bucketizes parameters based on its real execution order in backward graph.
+
+        # Can remove this branching once #73732 is landed.
+        if static_graph is True or self.find_unused_parameters is False:
+            bucket_size_limits = [sys.maxsize]
+        else:
+            bucket_size_limits = [
+                dist._DEFAULT_FIRST_BUCKET_BYTES,
+                self.bucket_bytes_cap,
+            ]
+        (
+            bucket_indices,
+            per_bucket_size_limits,
+        ) = dist._compute_bucket_assignment_by_size(
+            parameters,
+            bucket_size_limits,
+            expect_sparse_gradient,
+        )
+
+        # Note: reverse list of buckets because we want to approximate the
+        # order in which their gradients are produced, and assume they
+        # are used in the forward pass in the order they are defined.
+        self.reducer = dist.Reducer(
+            parameters,
+            list(reversed(bucket_indices)),
+            list(reversed(per_bucket_size_limits)),
+            self.process_group,
+            expect_sparse_gradient,
+            # The bucket size limit is specified in the constructor.
+            # Additionally, we allow for a single small bucket for parameters
+            # that are defined first, such that their gradients don't spill into
+            # a much larger bucket, adding unnecessary latency after gradient
+            # computation finishes. Experiments showed 1MB is a reasonable value.
+            self.bucket_bytes_cap,
+            self.find_unused_parameters,
+            self.gradient_as_bucket_view,
+            param_to_name_mapping,
+            # User can set dist._DEFAULT_FIRST_BUCKET_BYTES to tune DDP first
+            # bucket.
+            dist._DEFAULT_FIRST_BUCKET_BYTES,
+        )
+
+        self.logger = dist.Logger(self.reducer)
+        # Set as a weak reference to avoid reference cycle between
+        # logger and reducer.
+        self.reducer.set_logger(self.logger)
+
+        has_sync_bn = False
+        for submodule in self.module.modules():
+            if isinstance(submodule, torch.nn.SyncBatchNorm):
+                has_sync_bn = True
+                break
+
+        # Set logging data that can be got during construction time.
+        self.logger.set_construction_data_and_log(
+            self.module.__class__.__name__,
+            [] if self.device_ids is None else self.device_ids,
+            -1 if self.output_device is None else self.output_device,
+            self.broadcast_buffers,
+            has_sync_bn,
+            static_graph,
+        )
+
+        # passing a handle to torch.nn.SyncBatchNorm layer
+        self._passing_sync_batchnorm_handle(self.module)
+
+    def __getstate__(self):
+        self._check_default_group()
+        attrs = copy.copy(self.__dict__)
+        del attrs["process_group"]
+        del attrs["reducer"]
+        del attrs["logger"]
+        if self._use_replicated_tensor_module:
+            del attrs["_replicated_tensor_module"]
+        return attrs
+
+    def __setstate__(self, state):
+        # If serializable, then the process group should be the default one
+        self.process_group = _get_default_group()
+        super(DistributedDataParallel, self).__setstate__(state)
+        self._build_replicated_tensor_module()
+        self.__dict__.setdefault("require_forward_param_sync", True)
+        self.__dict__.setdefault("require_backward_grad_sync", True)
+        parameters, expect_sparse_gradient = self._build_params_for_reducer()
+        # In debug mode, build a mapping of parameter index -> parameter.
+        param_to_name_mapping = self._build_debug_param_to_name_mapping(
+            parameters
+        )
+        # Builds reducer.
+        self._ddp_init_helper(
+            parameters,
+            expect_sparse_gradient,
+            param_to_name_mapping,
+            self.static_graph,
+        )
+        if self.static_graph:
+            self.reducer._set_static_graph()
+            assert self.logger is not None
+            self.logger._set_static_graph()
+
+    def _build_params_for_reducer(self):
+        # Build tuple of (module, parameter) for all parameters that require grads.
+        modules_and_parameters = [
+            (module, parameter)
+            for module_name, module in self.module.named_modules()
+            for parameter in [
+                param
+                # Note that we access module.named_parameters instead of
+                # parameters(module). parameters(module) is only needed in the
+                # single-process multi device case, where it accesses replicated
+                # parameters through _former_parameters.
+                for param_name, param in module.named_parameters(recurse=False)
+                if param.requires_grad
+                and f"{module_name}.{param_name}"
+                not in self.parameters_to_ignore
+            ]
+        ]
+
+        # Deduplicate any parameters that might be shared across child modules.
+        memo = set()
+        modules_and_parameters = [
+            # "p not in memo" is the deduplication check.
+            # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
+            (m, p)
+            for m, p in modules_and_parameters
+            if p not in memo and not memo.add(p)  # type: ignore[func-returns-value]
+        ]
+
+        # Build list of parameters.
+        parameters = list(parameter for _, parameter in modules_and_parameters)
+
+        # Checks if a module will produce a sparse gradient.
+        def produces_sparse_gradient(module):
+            if isinstance(module, torch.nn.Embedding) or isinstance(
+                module, torch.nn.EmbeddingBag
+            ):
+                return module.sparse
+            return False
+
+        # Build list of booleans indicating whether or not to expect sparse
+        # gradients for the corresponding parameters.
+        expect_sparse_gradient = list(
+            produces_sparse_gradient(module)
+            for module, _ in modules_and_parameters
+        )
+
+        self._assign_modules_buffers()
+
+        return parameters, expect_sparse_gradient
+
+    def _assign_modules_buffers(self):
+        """
+        Assigns module buffers to self.modules_buffers which are then used to
+        broadcast across ranks when broadcast_buffers=True. Note that this
+        must be called every time buffers need to be synced because buffers can
+        be reassigned by user module,
+        see https://github.com/pytorch/pytorch/issues/63916.
+        """
+        # Collect buffers for modules, filtering out buffers that should be ignored.
+        named_module_buffers = [
+            (buffer, buffer_name)
+            for buffer_name, buffer in self.module.named_buffers()
+            if buffer_name not in self.parameters_to_ignore
+        ]
+        self.modules_buffers = [
+            buffer for (buffer, buffer_name) in named_module_buffers
+        ]
+        # Dict[str, tensor] representing module buffers not ignored by DDP.
+        self.named_module_buffers = {
+            buffer_name: buffer
+            for (buffer, buffer_name) in named_module_buffers
+        }
+
+    def _build_debug_param_to_name_mapping(self, parameters):
+        if dist.get_debug_level() == dist.DebugLevel.OFF:
+            return {}
+
+        param_to_param_index = {
+            parameters[i]: i for i in range(len(parameters))
+        }
+        param_set = set(parameters)
+        param_index_to_param_fqn = {}
+        for module_name, module in self.module.named_modules():
+            for param_name, param in module.named_parameters(recurse=False):
+                fqn = f"{module_name}.{param_name}"
+                # Bypass ignored parameters since those are not reduced by DDP
+                # to begin with.
+                if fqn not in self.parameters_to_ignore and param.requires_grad:
+                    if param not in param_set:
+                        self._log_and_throw(
+                            ValueError,
+                            f"Param with name {fqn} found in module parameters, but not DDP parameters."
+                            " This indicates a bug in DDP, please report an issue to PyTorch.",
+                        )
+                    param_index = param_to_param_index[param]
+                    param_index_to_param_fqn[param_index] = fqn
+
+        # Ensure we covered all parameters
+        if len(param_set) != len(param_index_to_param_fqn):
+            self._log_and_throw(
+                ValueError,
+                (
+                    "Expected param to name mapping to cover all parameters, but"
+                    f" got conflicting lengths: {len(param_set)} vs "
+                    f"{len(param_index_to_param_fqn)}. This indicates a bug in DDP"
+                    ", please report an issue to PyTorch."
+                ),
+            )
+
+        return param_index_to_param_fqn
+
+    def _get_parameters(self, m, recurse=True):
+        """
+        Returns a generator of module parameters
+        """
+
+        def model_parameters(m):
+            ps = (
+                m._former_parameters.values()
+                if hasattr(m, "_former_parameters")
+                else m.parameters(recurse=False)
+            )
+            for p in ps:
+                yield p
+
+        for m in m.modules() if recurse else [m]:
+            for p in model_parameters(m):
+                yield p
+
+    def _check_default_group(self):
+        pickle_not_supported = False
+        try:
+            if self.process_group != _get_default_group():
+                pickle_not_supported = True
+        except RuntimeError:
+            pickle_not_supported = True
+
+        if pickle_not_supported:
+            self._log_and_throw(
+                RuntimeError,
+                "DDP Pickling/Unpickling are only supported "
+                "when using DDP with the default process "
+                "group. That is, when you have called "
+                "init_process_group and have not passed "
+                "process_group argument to DDP constructor",
+            )
+
+    @contextmanager
+    def no_sync(self):
+        r"""
+        A context manager to disable gradient synchronizations across DDP
+        processes. Within this context, gradients will be accumulated on module
+        variables, which will later be synchronized in the first
+        forward-backward pass exiting the context.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> ddp = torch.nn.parallel.DistributedDataParallel(model, pg)
+            >>> with ddp.no_sync():
+            >>>   for input in inputs:
+            >>>     ddp(input).backward()  # no synchronization, accumulate grads
+            >>> ddp(another_input).backward()  # synchronize grads
+        """
+        old_require_backward_grad_sync = self.require_backward_grad_sync
+        self.require_backward_grad_sync = False
+        try:
+            yield
+        finally:
+            self.require_backward_grad_sync = old_require_backward_grad_sync
+
+    @classmethod
+    def _get_active_ddp_module(cls):
+        """
+        TorchDynamo needs to know whether DDP is currently active, and access the DDP module in order to cooperatively optimize it.
+        """
+        return cls._active_ddp_module
+
+    # note, this ctxmgr function is marked 'skip' in torchdynamo, so dynamo only kicks in
+    # for the 'module_to_run' underneath
+    # see torchdynamo/eval_frame.py TorchPatcher.patch for more details
+    @contextmanager
+    def _inside_ddp_forward(self):
+        DistributedDataParallel._active_ddp_module = self
+        try:
+            yield
+        except Exception:
+            raise
+        finally:
+            DistributedDataParallel._active_ddp_module = None
+
+    def _run_ddp_forward(self, *inputs, **kwargs):
+        module_to_run = (
+            self._replicated_tensor_module
+            if self._use_replicated_tensor_module
+            else self.module
+        )
+
+        if self.device_ids:
+            inputs, kwargs = _to_kwargs(
+                inputs,
+                kwargs,
+                self.device_ids[0],
+                self.use_side_stream_for_tensor_copies,
+            )
+            with self._inside_ddp_forward():
+                return module_to_run(*inputs[0], **kwargs[0])  # type: ignore[index]
+        else:
+            with self._inside_ddp_forward():
+                return module_to_run(*inputs, **kwargs)
+
+    def forward(self, *inputs, **kwargs):
+        with torch.autograd.profiler.record_function(
+            "DistributedDataParallel.forward"
+        ):
+            if torch.is_grad_enabled() and self.require_backward_grad_sync:
+                assert self.logger is not None
+                self.logger.set_runtime_stats_and_log()
+                self.num_iterations += 1
+                self.reducer.prepare_for_forward()
+
+            # Notify the join context that this process has not joined, if
+            # needed
+            work = Join.notify_join_context(self)
+            if work:
+                self.reducer._set_forward_pass_work_handle(
+                    work, self._divide_by_initial_world_size  # type: ignore[arg-type]
+                )
+
+            # Calling _rebuild_buckets before forward compuation,
+            # It may allocate new buckets before deallocating old buckets
+            # inside _rebuild_buckets. To save peak memory usage,
+            # call _rebuild_buckets before the peak memory usage increases
+            # during forward computation.
+            # This should be called only once during whole training period.
+            if torch.is_grad_enabled() and self.reducer._rebuild_buckets():
+                logger.info(
+                    "Reducer buckets have been rebuilt in this iteration."
+                )
+                self._has_rebuilt_buckets = True
+
+            # sync params according to location (before/after forward) user
+            # specified as part of hook, if hook was specified.
+            buffer_hook_registered = hasattr(self, "buffer_hook")
+            if self._check_sync_bufs_pre_fwd():
+                self._sync_buffers()
+
+            if self._join_config.enable:
+                # Notify joined ranks whether they should sync in backwards pass or not.
+                self._check_global_requires_backward_grad_sync(
+                    is_joined_rank=False
+                )
+
+            output = self._run_ddp_forward(*inputs, **kwargs)
+
+            # sync params according to location (before/after forward) user
+            # specified as part of hook, if hook was specified.
+            if self._check_sync_bufs_post_fwd():
+                self._sync_buffers()
+
+            if torch.is_grad_enabled() and self.require_backward_grad_sync:
+                self.require_forward_param_sync = True
+                # We'll return the output object verbatim since it is a freeform
+                # object. We need to find any tensors in this object, though,
+                # because we need to figure out which parameters were used during
+                # this forward pass, to ensure we short circuit reduction for any
+                # unused parameters. Only if `find_unused_parameters` is set.
+                if self.find_unused_parameters and not self.static_graph:
+                    # Do not need to populate this for static graph.
+                    self.reducer.prepare_for_backward(
+                        list(_find_tensors(output))
+                    )
+                else:
+                    self.reducer.prepare_for_backward([])
+            else:
+                self.require_forward_param_sync = False
+
+        # TODO: DDPSink is currently enabled for unused parameter detection and
+        # static graph training for first iteration.
+        if (self.find_unused_parameters and not self.static_graph) or (
+            self.static_graph and self.num_iterations == 1
+        ):
+            state_dict = {
+                "static_graph": self.static_graph,
+                "num_iterations": self.num_iterations,
+            }
+
+            (
+                output_tensor_list,
+                treespec,
+                output_is_rref,
+            ) = _tree_flatten_with_rref(output)
+            output_placeholders = [None for _ in range(len(output_tensor_list))]
+            # Do not touch tensors that have no grad_fn, which can cause issues
+            # such as https://github.com/pytorch/pytorch/issues/60733
+            for i, output in enumerate(output_tensor_list):
+                if torch.is_tensor(output) and output.grad_fn is None:
+                    output_placeholders[i] = output
+
+            # When find_unused_parameters=True, makes tensors which require grad
+            # run through the DDPSink backward pass. When not all outputs are
+            # used in loss, this makes those corresponding tensors receive
+            # undefined gradient which the reducer then handles to ensure
+            # param.grad field is not touched and we don't error out.
+            passthrough_tensor_list = _DDPSink.apply(
+                self.reducer,
+                state_dict,
+                *output_tensor_list,
+            )
+            for i in range(len(output_placeholders)):
+                if output_placeholders[i] is None:
+                    output_placeholders[i] = passthrough_tensor_list[i]
+
+            # Reconstruct output data structure.
+            output = _tree_unflatten_with_rref(
+                output_placeholders, treespec, output_is_rref
+            )
+        return output
+
+    def scatter(self, inputs, kwargs, device_ids):
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
+
+    def to_kwargs(self, inputs, kwargs, device_id):
+        # Kept for BC
+        return _to_kwargs(
+            inputs, kwargs, device_id, self.use_side_stream_for_tensor_copies
+        )
+
+    def gather(self, outputs, output_device):
+        return gather(outputs, output_device, dim=self.dim)
+
+    def train(self, mode=True):
+        super(DistributedDataParallel, self).train(mode)
+        if self._use_replicated_tensor_module:
+            self._replicated_tensor_module.train(mode)  # type: ignore[union-attr]
+        return self
+
+    # When running in join mode, schedules an allreduce to notify joined ranks
+    # of whether backwards pass synchronization will run this iteraton or not.
+    def _check_global_requires_backward_grad_sync(self, is_joined_rank):
+        if not is_joined_rank and self.require_backward_grad_sync:
+            requires_sync_tensor = torch.ones(1, device=self.device)
+        else:
+            requires_sync_tensor = torch.zeros(1, device=self.device)
+
+        work = dist.all_reduce(
+            requires_sync_tensor, group=self.process_group, async_op=True
+        )
+        return work
+
+    # When running in join mode, checks and performs sync of module buffers if
+    # the models have buffers that should be synchronized in the forward pass.
+    def _check_and_sync_module_buffers(self):
+        if self._check_sync_bufs_pre_fwd():
+            authoritative_rank = self._find_common_rank(
+                self._distributed_rank, False
+            )
+            self._sync_module_buffers(authoritative_rank)
+
+    # When running in join model, agrees upon a common rank and broadcast model
+    # parameters to all other ranks.
+    def _sync_final_model(self, is_last_joiner):
+        # Agree upon the process that will be the authoritative model copy.
+        # The current rank is a candidate for being the authoritative copy if
+        # is_last_joiner=True. We break ties via picking the larger rank.
+        self._authoritative_rank = self._find_common_rank(
+            self._distributed_rank, is_last_joiner
+        )
+        _sync_module_states(
+            module=self.module,
+            process_group=self.process_group,
+            broadcast_bucket_size=self.broadcast_bucket_size,
+            src=self._authoritative_rank,
+            params_and_buffers_to_ignore=self.parameters_to_ignore,
+        )
+
+    # Schedule comm ops to match those scheduled in the reducer's backward
+    # pass.
+    def _match_all_reduce_for_bwd_pass(self):
+        comm_work = []
+        # Schedule comm in the same order as Reducer schedules them, i.e.
+        # the order of the buckets. Retrieving the bucket order from the reducer
+        # ensures that we keep the same order in join mode, such as when bucket
+        # order is rebuilt dynamically.
+
+        # Returns grad_buckets in order, but real tensors are substituted with
+        # zero tensors of the same shape.
+        grad_buckets = self.reducer._get_zeros_like_grad_buckets()
+        for grad_bucket in grad_buckets:
+            # Joined processes contribute zero gradient. In the case that
+            # divide_by_initial_world_size=True, we divide grads by the static
+            # world size, if not, the dividing factor is reduced by the number
+            # of joined processes.
+            work = self.reducer._run_comm_hook(grad_bucket)
+            comm_work.append(work)
+        for work in comm_work:
+            work.wait()
+
+    # Allreduces the used parameter mapping across ranks.
+    def _match_unused_params_allreduce(self):
+        locally_used_param_map = self.reducer._get_local_used_map()
+        self.process_group.allreduce(locally_used_param_map)
+
+    def join(
+        self,
+        divide_by_initial_world_size: bool = True,
+        enable: bool = True,
+        throw_on_early_termination: bool = False,
+    ):
+        r"""
+        A context manager to be used in conjunction with an instance of
+        :class:`torch.nn.parallel.DistributedDataParallel` to be
+        able to train with uneven inputs across participating processes.
+
+        This context manager will keep track of already-joined DDP processes,
+        and "shadow" the forward and backward passes by inserting collective
+        communication operations to match with the ones created by non-joined
+        DDP processes. This will ensure each collective call has a corresponding
+        call by already-joined DDP processes, preventing hangs or errors that
+        would otherwise happen when training with uneven inputs across
+        processes. Alternatively, if the flag ``throw_on_early_termination`` is
+        specified to be ``True``, all trainers will throw an error once one rank
+        runs out of inputs, allowing these errors to be caught and handled
+        according to application logic.
+
+        Once all DDP processes have joined, the context manager will broadcast
+        the model corresponding to the last joined process to all processes to
+        ensure the model is the same across all processes
+        (which is guaranteed by DDP).
+
+        To use this to enable training with uneven inputs across processes,
+        simply wrap this context manager around your training loop. No further
+        modifications to the model or data loading is required.
+
+        .. warning::
+            If the model or training loop this context manager is wrapped around
+            has additional distributed collective operations, such as
+            ``SyncBatchNorm`` in the model's forward pass, then the flag
+            ``throw_on_early_termination`` must be enabled. This is because this
+            context manager is not aware of non-DDP collective communication.
+            This flag will cause all ranks to throw when any one rank
+            exhausts inputs, allowing these errors to be caught and recovered
+            from across all ranks.
+
+        Args:
+            divide_by_initial_world_size (bool): If ``True``, will divide
+                gradients by the initial ``world_size`` DDP training was launched
+                with. If ``False``, will compute the effective world size
+                (number of ranks that have not depleted their inputs yet) and
+                divide gradients by that during allreduce. Set
+                ``divide_by_initial_world_size=True`` to ensure every input
+                sample including the uneven inputs have equal weight in terms of
+                how much they contribute to the global gradient. This is
+                achieved by always dividing the gradient by the initial
+                ``world_size`` even when we encounter uneven inputs. If you set
+                this to ``False``, we divide the gradient by the remaining
+                number of nodes. This ensures parity with training on a smaller
+                ``world_size`` although it also means the uneven inputs would
+                contribute more towards the global gradient. Typically, you
+                would want to set this to ``True`` for cases where the last few
+                inputs of your training job are uneven. In extreme cases, where
+                there is a large discrepancy in the number of inputs, setting
+                this to ``False`` might provide better results.
+            enable (bool): Whether to enable uneven input detection or not. Pass
+                in ``enable=False`` to disable in cases where you know that
+                inputs are even across participating processes. Default is
+                ``True``.
+            throw_on_early_termination (bool): Whether to throw an error
+                or continue training when at least one rank has exhausted
+                inputs. If ``True``, will throw upon the first rank reaching end
+                of data. If ``False``, will continue training with a smaller
+                effective world size until all ranks are joined. Note that if
+                this flag is specified, then the flag
+                ``divide_by_initial_world_size`` would be ignored. Default
+                is ``False``.
+
+
+        Example::
+
+            >>> import torch
+            >>> import torch.distributed as dist
+            >>> import os
+            >>> import torch.multiprocessing as mp
+            >>> import torch.nn as nn
+            >>> # On each spawned worker
+            >>> def worker(rank):
+            >>>     dist.init_process_group("nccl", rank=rank, world_size=2)
+            >>>     torch.cuda.set_device(rank)
+            >>>     model = nn.Linear(1, 1, bias=False).to(rank)
+            >>>     model = torch.nn.parallel.DistributedDataParallel(
+            >>>         model, device_ids=[rank], output_device=rank
+            >>>     )
+            >>>     # Rank 1 gets one more input than rank 0.
+            >>>     inputs = [torch.tensor([1]).float() for _ in range(10 + rank)]
+            >>>     with model.join():
+            >>>         for _ in range(5):
+            >>>             for inp in inputs:
+            >>>                 loss = model(inp).sum()
+            >>>                 loss.backward()
+            >>>     # Without the join() API, the below synchronization will hang
+            >>>     # blocking for rank 1's allreduce to complete.
+            >>>     torch.cuda.synchronize(device=rank)
+        """
+        return Join(
+            [self],
+            enable,
+            throw_on_early_termination,
+            divide_by_initial_world_size=divide_by_initial_world_size,
+        )
+
+    def join_hook(
+        self,
+        **kwargs,
+    ):
+        r"""
+        Returns the DDP join hook, which enables training on uneven inputs by
+        shadowing the collective communications in the forward and backward
+        passes.
+
+        Arguments:
+            kwargs (dict): a :class:`dict` containing any keyword arguments
+                to modify the behavior of the join hook at run time; all
+                :class:`Joinable` instances sharing the same join context
+                manager are forwarded the same value for ``kwargs``.
+
+        The hook supports the following keyword arguments:
+            divide_by_initial_world_size (bool, optional):
+                If ``True``, then gradients are divided by the initial world
+                size that DDP was launched with.
+                If ``False``, then gradients are divided by the effective world
+                size (i.e. the number of non-joined processes), meaning that
+                the uneven inputs contribute more toward the global gradient.
+                Typically, this should be set to ``True`` if the degree of
+                unevenness is small but can be set to ``False`` in extreme
+                cases for possibly better results.
+                Default is ``True``.
+        """
+        divide_by_initial_world_size = kwargs.get(
+            "divide_by_initial_world_size", True
+        )
+        return _DDPJoinHook(
+            self, divide_by_initial_world_size=divide_by_initial_world_size
+        )
+
+    @property
+    def join_device(self):
+        return self.device
+
+    @property
+    def join_process_group(self):
+        return self.process_group
+
+    def _register_buffer_comm_hook(
+        self,
+        state,
+        hook: Callable,
+        comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
+    ):
+        r"""
+        Allows custom registration of hooks that define how buffer are
+        synchronized across ranks. The hook takes in an optional state
+        and is passed in a Dict[str, Tensor] corresponding to buffer names
+        and the buffers, and can run arbitrary reductions on buffers as
+        opposed to DDP's default broadcast from rank 0. This is useful for
+        example if a counter needs to be summed or averaged across ranks
+        every iteration.
+
+        Args:
+            state (Any): Optional state that is passed to the hook.
+            hook (Callable): Callable with the following signature:
+                            ``hook(state: object, buffers: Dict[str, torch.Tensor])
+                            -> Optional[List[torch.futures.Future[torch.Tensor]]]``
+            comm_hook_location (_BufferCommHookLocation): Enum value indicating
+                            where to run the hook.
+                            _BufferCommHookLocation.PRE_FORWARD means that the
+                            hook will run _before_ the forward pass, and
+                            _BufferCommHookLocation.POST_FORWARD means that the
+                            hook will run _after_ the forward pass.
+
+            hook (Callable): Callable with the following signature:
+                         ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``:
+
+            NOTE: To maximize performance, users can return a
+                List[torch.futures.Future] from their hook, and DDP will
+                install and await these hooks appropriately at the end of
+                the backward pass. This will ensure all buffers are
+                synchronized by the end of the backward pass. If this
+                setting is used, it is recommended to pass
+                comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
+                which will trigger the hook after the forward pass.
+                If _BufferCommHookLocation.PRE_FORWARD is used, users must
+                ensure appropriate synchronization when manipulating GPU
+                buffers in the forward pass.
+        """
+        assert callable(hook)
+        self.buffer_hook = _BufferCommHook(
+            buffer_comm_hook=hook,
+            buffer_comm_hook_state=state,
+            buffer_comm_hook_location=comm_hook_location,
+        )
+
+    def register_comm_hook(self, state: object, hook: Callable):
+        r"""
+        Registers a communication hook which is an enhancement that provides a
+        flexible hook to users where they can specify how DDP aggregates gradients
+        across multiple workers.
+
+        This hook would be very useful for researchers to try out new ideas. For
+        example, this hook can be used to implement several algorithms like GossipGrad
+        and gradient compression which involve different communication strategies for
+        parameter syncs while running Distributed DataParallel training.
+
+        Args:
+            state (object): Passed to the hook to maintain any state information during the training process.
+                            Examples include error feedback in gradient compression,
+                            peers to communicate with next in GossipGrad, etc.
+
+                            It is locally stored by each worker
+                            and shared by all the gradient tensors on the worker.
+            hook (Callable): Callable with the following signature:
+                             ``hook(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]``:
+
+                             This function is called once the bucket is ready. The
+                             hook can perform whatever processing is needed and return
+                             a Future indicating completion of any async work (ex: allreduce).
+                             If the hook doesn't perform any communication, it still
+                             must return a completed Future. The Future should hold the
+                             new value of grad bucket's tensors. Once a bucket is ready,
+                             c10d reducer would call this hook and use the tensors returned
+                             by the Future and copy grads to individual parameters.
+                             Note that the future's return type must be a single tensor.
+
+                             We also provide an API called ``get_future`` to retrieve a
+                             Future associated with the completion of ``c10d.ProcessGroup.Work``.
+                             ``get_future`` is currently supported for NCCL and also supported for most
+                             operations on GLOO and MPI, except for peer to peer operations (send/recv).
+
+        .. warning ::
+            Grad bucket's tensors will not be predivided by world_size. User is responsible
+            to divide by the world_size in case of operations like allreduce.
+
+        .. warning ::
+            DDP communication hook can only be registered once and should be registered
+            before calling backward.
+
+        .. warning ::
+            The Future object that hook returns should contain a single tensor
+            that has the same shape with the tensors inside grad bucket.
+
+        .. warning ::
+            ``get_future`` API supports NCCL, and partially GLOO and MPI backends (no support
+            for peer-to-peer operations like send/recv) and will return a ``torch.futures.Future``.
+
+        Example::
+            Below is an example of a noop hook that returns the same tensor.
+
+            >>> def noop(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
+            >>>     fut = torch.futures.Future()
+            >>>     fut.set_result(bucket.buffer())
+            >>>     return fut
+
+            >>> # xdoctest: +SKIP('undefined name')
+            >>> ddp.register_comm_hook(state=None, hook=noop)
+
+        Example::
+            Below is an example of a Parallel SGD algorithm where gradients are encoded before
+            allreduce, and then decoded after allreduce.
+
+            >>> def encode_and_decode(state: object, bucket: dist.GradBucket) -> torch.futures.Future[torch.Tensor]:
+            >>>     encoded_tensor = encode(bucket.buffer()) # encode gradients
+            >>>     fut = torch.distributed.all_reduce(encoded_tensor).get_future()
+            >>>     # Define the then callback to decode.
+            >>>     def decode(fut):
+            >>>         decoded_tensor = decode(fut.value()[0]) # decode gradients
+            >>>         return decoded_tensor
+            >>>     return fut.then(decode)
+
+            >>> # xdoctest: +SKIP('undefined name')
+            >>> ddp.register_comm_hook(state=None, hook=encode_and_decode)
+        """
+        self._check_comm_hook(hook)
+        assert self.logger is not None
+        self.logger._set_comm_hook_name(hook.__qualname__)
+        dist._register_comm_hook(self.reducer, state, hook)
+
+    def _register_builtin_comm_hook(self, comm_hook_type):
+        r"""
+        Registers a built-in communication hook that specifies how DDP
+        aggregates gradients across multiple workers.
+        The built-in hooks aim to provide efficient C++ implementations for certain hooks,
+        which might not be as efficient if implemented in Python using a Python communication hook.
+
+        Args:
+            comm_hook_type (dist.BuiltinCommHookType): type of communication hook, such as ALLREDUCE, FP16_COMPRESS, etc.
+
+        .. warning ::
+            DDP communication hook can only be registered once and should be registered
+            before calling backward.
+
+        Example::
+            Below is an example of a FP16 compression where gradients are
+            compressed into 16-bit floating-point numbers before allreduce, and
+            then decompressed after allreduce.
+
+            >>> # xdoctest: +SKIP('undefined name')
+            >>> ddp._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS)
+
+        """
+        assert self.logger is not None
+        self.logger._set_comm_hook_name(str(comm_hook_type))
+        dist._register_builtin_comm_hook(self.reducer, comm_hook_type)
+
+    def _register_fused_optim(
+        self, optim: Type, *args, optim_params=None, **kwargs
+    ):
+        r"""
+            Registers an optimizer with DDP such that the optimization for a
+            parameter will run immediately when that parameter's gradient is
+            finished with reduction, instead of waiting for all parameters'
+            gradients to finish reduction. This can result in a training speedup
+            depending on your workload since the optimizer can run while gradient
+            reduction for other parameters are still ongoing. In addition, this has
+            the potential to reduce peak memory consumption during training, as it
+            only needs to load the per-parameter optimizer states of a single
+            parameter at a time, instead of loading all per-parameter optimizer
+            states at once.
+
+            Args:
+                optim_cls (Type): a ``torch.optim.Optimizer`` class to be registered
+                as a fused optimizer.
+                *args (Sequence[Any]): Arguments to forward to `optim_cls`.
+                optim_params (Optional[Iterable[torch.Tensor]]): Set of parameters
+                to optimize, similar to `params` argument of traditional `torch.optim`
+                Optimizers. If this is omitted, all DDP model parameters will be
+                optimized.
+                **kwargs: (Dict[str, Any]): Keyword arguments to forward to `optim_cls`.
+
+        .. warning ::
+            _register_fused_optim should only be called once on a DDP instance,
+            and registering multiple fused optimizers for the same DDP model
+            is not currently supported. Please ping
+            https://github.com/pytorch/pytorch/issues/71595 if this is necessary
+            for your use case.
+
+        .. warning ::
+            _register_fused_optim and register_comm_hook currently do not
+            compose together, meaning that custom DDP communication hooks are
+            not supported with overlapped optimizers. Please ping
+            https://github.com/pytorch/pytorch/issues/71595 if this is necessary
+            for your use case.
+
+        .. warning ::
+            Gradient accumulation and DDP `no_sync` are currently not supported
+            with overlapped optimizer. Please ping
+            https://github.com/pytorch/pytorch/issues/71595 if this is necessary
+            for your use case.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("No rendezvous handler")
+            >>> torch.distributed.init_process_group(backend='nccl', world_size=4, init_method='...')
+            >>> net = torch.nn.parallel.DistributedDataParallel(model, pg)
+            >>> lr = 1e-2
+            >>> betas = (0.9, 0.99)
+            >>> eps = 1e-6
+            >>> net._register_fused_optim(torch.optim.Adam, lr, betas=betas, eps=eps)
+            >>> # Example with subset of parameters
+            >>> params_to_opt = [list(net.parameters())[0]]
+            >>> net._register_fused_optim(
+            ...   torch.optim.Adam, lr, optim_params=params_to_opt,  betas=betas, eps=eps
+            ... )
+        """
+        # Note: importing in function, otherwise this will cause a circular
+        # import as optimizer_overlap module needs to import DistributedDataParallel.
+        from torch.distributed.algorithms._optimizer_overlap import (
+            _as_overlapped_optim,
+        )
+
+        overlapped_optim = _as_overlapped_optim(
+            optim, optim_params, *args, **kwargs
+        )
+        try:
+            overlapped_optim.register_ddp(self)
+        except NotImplementedError:
+            raise RuntimeError(
+                f"{optim} does not support overlapped DDP. Please file an issue to PyTorch or the respective owner of {optim}."
+            )
+
+    def _distributed_broadcast_coalesced(
+        self, tensors, buffer_size, authoritative_rank=0
+    ):
+        dist._broadcast_coalesced(
+            self.process_group, tensors, buffer_size, authoritative_rank
+        )
+
+    def _check_sync_bufs_post_fwd(self):
+        return (
+            self.will_sync_module_buffers()
+            and hasattr(self, "buffer_hook")
+            and self.buffer_hook.buffer_comm_hook_location
+            == _BufferCommHookLocation.POST_FORWARD
+        )
+
+    def _check_sync_bufs_pre_fwd(self):
+        return self.will_sync_module_buffers() and (
+            not hasattr(self, "buffer_hook")
+            or self.buffer_hook.buffer_comm_hook_location
+            == _BufferCommHookLocation.PRE_FORWARD
+        )
+
+    def will_sync_module_buffers(self):
+        return (
+            self.require_forward_param_sync
+            and self.broadcast_buffers
+            and len(self.modules_buffers) > 0
+        )
+
+    def _find_common_rank(self, input_rank, rank_cond):
+        # -1 indicates that this rank is not under consideration to be the
+        # common_rank
+        rank_to_use = torch.tensor(
+            [input_rank if rank_cond else -1],
+            device=self.device,
+        )
+        dist.all_reduce(rank_to_use, op=ReduceOp.MAX, group=self.process_group)
+        if rank_to_use.item() == -1:
+            self._log_and_throw(
+                ValueError,
+                "BUG! Expected rank_cond to be true for at least one process."
+                " This indicates a bug in PyTorch, please report an issue.",
+            )
+        return rank_to_use.item()
+
+    def _sync_buffers(self):
+        with torch.no_grad():
+            # module buffer sync
+            # Synchronize buffers across processes.
+            # If we are running DDP with the join manager, we have to agree
+            # upon a rank to sync module buffers from, since rank 0 may
+            # already have been joined and have stale module buffers.
+            if self._join_config.enable:
+                authoritative_rank = self._find_common_rank(
+                    self._distributed_rank, True
+                )
+            else:
+                # The process with rank 0 is considered the authoritative copy.
+                authoritative_rank = 0
+            # Update self.modules_buffers incase any buffers were
+            # reassigned.
+            self._assign_modules_buffers()
+            self._sync_module_buffers(authoritative_rank)
+
+    def _sync_module_buffers(self, authoritative_rank):
+        if not hasattr(self, "buffer_hook"):
+            self._default_broadcast_coalesced(
+                authoritative_rank=authoritative_rank
+            )
+        else:
+            hook = self.buffer_hook.buffer_comm_hook
+            state = self.buffer_hook.buffer_comm_hook_state
+            futs = hook(state, self.named_module_buffers)
+            if futs is not None:
+                self.reducer._install_post_backward_futures(futs)
+
+    def _default_broadcast_coalesced(
+        self, bufs=None, bucket_size=None, authoritative_rank=0
+    ):
+        """
+        Broadcasts buffers from rank 0 to rest of workers. If bufs, bucket_size
+        are None, default values self.modules_buffers and
+        self.broadcast_bucket_size are used instead.
+        """
+        if bufs is None:
+            bufs = self.modules_buffers
+        if bucket_size is None:
+            bucket_size = self.broadcast_bucket_size
+
+        self._distributed_broadcast_coalesced(
+            bufs, bucket_size, authoritative_rank
+        )
+
+    def _passing_sync_batchnorm_handle(self, module):
+        for layer in module.modules():
+            if isinstance(layer, torch.nn.modules.SyncBatchNorm):
+                if self.device_type == "cpu":
+                    self._log_and_throw(
+                        ValueError,
+                        "SyncBatchNorm layers only work with GPU modules",
+                    )
+
+    def _check_comm_hook(self, hook):
+        if not callable(hook):
+            self._log_and_throw(
+                TypeError, "Communication hook must be callable."
+            )
+
+        sig = inspect.signature(hook)
+        if (
+            sig.parameters["bucket"].annotation != inspect._empty
+            and sig.parameters["bucket"].annotation != dist.GradBucket
+        ):
+            self._log_and_throw(
+                ValueError,
+                "Communication hook: bucket annotation should be dist.GradBucket.",
+            )
+
+        if (
+            sig.return_annotation != inspect._empty
+            and sig.return_annotation != torch.futures.Future[torch.Tensor]
+        ):
+            self._log_and_throw(
+                ValueError,
+                "Communication hook: return annotation should be torch.futures.Future[torch.Tensor].",
+            )
+
+        if hook.__name__ in [
+            "bf16_compress_hook",
+            "bf16_compress_wrapper_hook",
+        ] and (
+            (torch.version.cuda is None and torch.version.hip is None)
+            or (
+                torch.version.cuda is not None
+                and int(torch.version.cuda.split(".")[0]) < 11
+            )
+            or not dist.is_available()
+            or not dist.is_nccl_available()
+            or torch.cuda.nccl.version() < (2, 10)
+        ):
+            self._log_and_throw(
+                TypeError,
+                "BF16 all reduce communication hook required CUDA 11+ and NCCL 2.10+.",
+            )
+
+    @property
+    def _distributed_rank(self):
+        return dist.get_rank(self.process_group)
+
+    @staticmethod
+    def _set_params_and_buffers_to_ignore_for_model(
+        module, params_and_buffers_to_ignore
+    ):
+        """
+        Sets parameters and buffers to be ignored by DDP. Expected format for
+        parameters is the fully qualified name: {module_name}.{param_name}, and
+        similarly, {module_name}.{buffer_name} for buffers. For example:
+        params_to_ignore = []
+        # NB: model here is vanilla PyTorch module, not yet wrapped with DDP.
+        for module_name, module in model.named_modules():
+            for param_name, param in module.named_parameters(recurse=False):
+                if should_ignore(param):
+                    # Create expected format
+                    fqn = f"{module_name}.{param_name}"
+                    params_to_ignore.append(fqn)
+        torch.nn.parallel.DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(
+            model,
+            params_to_ignore
+        )
+        """
+        # This is a workaround to set parameters and buffers DDP should ignore
+        # during synchronization. It will be removed when the API is finalized
+        # as part of addressing https://github.com/pytorch/pytorch/issues/43690.
+        module._ddp_params_and_buffers_to_ignore = params_and_buffers_to_ignore
+
+    def _get_ddp_logging_data(self):
+        r"""
+        This interface can be called after DistributedDataParallel() is
+        constructed. It returns a dictionary of logging data. It could help
+        for debugging and analysis. The loggind data includes DistributedDataParallel
+        constructor input parameters, some internal states of DistributedDataParallel
+        and performance metrics. Simply print the dictorinary and see what
+        these metrics are.
+        This is a prototype interface and subject to change in the future.
+        """
+        assert self.logger is not None
+        ddp_logging_data = self.logger._get_ddp_logging_data()
+        return {**ddp_logging_data.strs_map, **ddp_logging_data.ints_map}
+
+    def _set_ddp_runtime_logging_sample_rate(self, sample_rate):
+        r"""
+        This interface allows users to set sample_rate of collecting
+        runtime stats. The runtime stats will be recorded for the
+        first 10 iterations, after 10 iteratons runtime stats will be
+        recorded once every "sample_rate" training iterations. In
+        default, runtime stats are recorded for the first 10 iterations,
+        after 10 iterations runtime stats are recorded once every
+        "kDDPRuntimeLoggingSampleRate=100" training iterations.
+        This is a prototype interface and subject to change in the future.
+        """
+        if sample_rate < 1:
+            self._log_and_throw(
+                ValueError,
+                "DDP runtime logging sample rate should be equal or greater than 1",
+            )
+        self.reducer._set_ddp_runtime_logging_sample_rate(sample_rate)
+
+    def _set_static_graph(self):
+        """
+        It is recommended to set static graph in the DDP constructor, which will
+        call this private API internally.
+        """
+        # If self.static_graph has been set, no need to set it again
+        if self.static_graph:
+            warnings.warn(
+                "You've set static_graph to be True, no need to set it again."
+            )
+            return
+        self.static_graph = True
+        self.reducer._set_static_graph()
+        assert self.logger is not None
+        self.logger._set_static_graph()
+        if self.find_unused_parameters:
+            warnings.warn(
+                "You passed find_unused_parameters=true to DistributedDataParallel, "
+                "`_set_static_graph` will detect unused parameters automatically, so "
+                "you do not need to set find_unused_parameters=true, just be sure these "
+                "unused parameters will not change during training loop while calling "
+                "`_set_static_graph`."
+            )

From 18d0ade6f5409a1e38ea35e22bf95324774773d0 Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@meta.com>
Date: Tue, 1 Nov 2022 22:42:04 +0000
Subject: [PATCH 0421/1922] Make a copy of the submodule inputs (#87899)

Summary: There might be inplace ops in the model that would change the saved inputs. To avoid that, we save a deepcopy version.

Test Plan: CI

Differential Revision: D40771290

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87899
Approved by: https://github.com/houseroad
---
 torch/fx/passes/splitter_base.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/torch/fx/passes/splitter_base.py b/torch/fx/passes/splitter_base.py
index 88624b67f2c64..e2149052339ec 100644
--- a/torch/fx/passes/splitter_base.py
+++ b/torch/fx/passes/splitter_base.py
@@ -1,4 +1,5 @@
 import argparse
+import copy
 from collections import defaultdict
 from dataclasses import dataclass
 from typing import NamedTuple, Sequence, Iterable, Any, List, Dict, Optional, Tuple
@@ -24,7 +25,6 @@
     NodeSet,
     is_node_output_tensor,
 )
-import warnings
 
 
 __all__ = ['FxNetAccNodesFinder', 'FxNetSplitterInternalError', 'Subgraph', 'SplitResult', 'generate_inputs_for_submodules']
@@ -200,7 +200,8 @@ class SplitResult(NamedTuple):
 def generate_inputs_for_submodules(
     model: torch.nn.Module,
     inputs: Sequence[Any],
-    target_submodules: Iterable[str]
+    target_submodules: Iterable[str],
+    deepcopy: bool = False,
 ) -> Dict[str, Any]:
     """
     Generate inputs for targeting submdoules in the given model. Note that if two submodules refer to the same obj, this
@@ -220,17 +221,24 @@ def generate_inputs_for_submodules(
     submodule_to_names = dict((mod, name) for name, mod in model.named_modules())
 
     def pre_forward(module, module_inputs):
-        results[submodule_to_names[module]] = module_inputs
-    try:
-        for name, mod in model.named_modules():
-            if name in target_submodules:
-                handles.append(mod.register_forward_pre_hook(pre_forward))
-        model(*inputs)
-    except Exception as e:
-        warnings.warn(f"Failed to generate submodule inputs because of the following error:\n{e}")
-    finally:
+        results[submodule_to_names[module]] = copy.deepcopy(module_inputs) if deepcopy else module_inputs
+
+    for name, mod in model.named_modules():
+        if name in target_submodules:
+            handles.append(mod.register_forward_pre_hook(pre_forward))
+
+    def clean_up_handles():
         for h in handles:
             h.remove()
+
+    try:
+        with torch.no_grad():
+            model(*inputs)
+    except Exception as e:
+        clean_up_handles()
+        raise e
+
+    clean_up_handles()
     return results
 
 
From 7e6a12007bc26a60faf3adbf7a084410f7035dbe Mon Sep 17 00:00:00 2001
From: jjsjann123 <alex.jann2012@gmail.com>
Date: Tue, 1 Nov 2022 22:43:51 +0000
Subject: [PATCH 0422/1922] [nvfuser] merge rule update (#88228)

adding Kevin to NVFuser reviewer
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88228
Approved by: https://github.com/soumith
---
 .github/merge_rules.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index 6e9cba905e751..baee9730d598e 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -35,6 +35,7 @@
   - csarofeen
   - ngimel
   - jjsjann123
+  - kevinstephano
   - ptrblck
   mandatory_checks_name:
   - EasyCLA

From b83976b9a2ddd643036b3b0e863c3cc09b69ac0a Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 1 Nov 2022 22:45:11 +0000
Subject: [PATCH 0423/1922] [Dynamo] Update Dynamo benchmarks running commands
 (#87844)

Fixes https://github.com/pytorch/torchdynamo/issues/1761

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87844
Approved by: https://github.com/jansel
---
 benchmarks/dynamo/README.md | 16 +++++++++-------
 benchmarks/dynamo/runner.py |  4 ++--
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/benchmarks/dynamo/README.md b/benchmarks/dynamo/README.md
index 5307e77b9b173..91556084cd0db 100644
--- a/benchmarks/dynamo/README.md
+++ b/benchmarks/dynamo/README.md
@@ -27,11 +27,13 @@ For HF and TIMM models, the scripts already install the transformers and timm pa
 There are a lot of flags in the benchmark runner, and it can be confusing to know which settings to use or what machine to run it on.  In order to support apples-to-apples comparison, we have provided the following 'standard' settings in `runner.py`. This script is a wrapper over the common benchmarking infrastructure and simplifies the flags. We will continually update `runner.py` with the latest and most relevant compilers for training and inference. It also provides some graph utilities to visualize and compare results. Some of the example commands are
 
 **Inference Commands**
-* Inference compilers on torchbench models - `python benchmarks/runner.py --suites=torchbench --inference --dtypes=float16`
+* Inference compilers on torchbench models - `python benchmarks/dynamo/runner.py --suites=torchbench --inference --dtypes=float16`
+* Inductor Inference compiler on torchbench models - `python benchmarks/dynamo/runner.py --suites=torchbench --inference --dtypes=float16 --compilers=inductor`
 
 **Training Commands**
-* Training compilers on TIMM models - `python benchmarks/runner.py --suites=timm_models --training --dtypes=float32 --output-dir=timm_logs`
-* AOTAutograd Training compiler on TIMM models - `python benchmarks/runner.py --suites=timm_models --training --dtypes=float32 --compilers=aot_nvfuser --output-dir=timm_logs`
+* Training compilers on TIMM models - `python benchmarks/dynamo/runner.py --suites=timm_models --training --dtypes=float32 --output-dir=timm_logs`
+* AOTAutograd Training compiler on TIMM models - `python benchmarks/dynamo/runner.py --suites=timm_models --training --dtypes=float32 --compilers=aot_nvfuser --output-dir=timm_logs`
+* Inductor Training compiler on TIMM models - `python benchmarks/dynamo/runner.py --suites=timm_models --training --dtypes=float32 --compilers=inductor --output-dir=timm_logs`
 
 Running runner.py generates a file named `run.sh`. This file contains the actual commands that invoke the common benchmarking infrastructure with the appropriate flags. Which brings us to the advanced usage.
 
@@ -40,11 +42,11 @@ Running runner.py generates a file named `run.sh`. This file contains the actual
 One could directly call `torchbench.py`, `huggingface.py` or `timm_models.py` with the necessary flags. There are a lot of flags in the benchmarks runner. Some of the examples are as follows. These are subject to change.
 
 **Inference Commands**
-* TorchScript NVFuser Inference - `python benchmarks/torchbench.py -dcuda -n100 --speedup-ts`
-* TorchInductor CUDA Graphs Inference - `python benchmarks/torchbench.py -dcuda --inductor-settings --float32 -n50 --inductor`
+* TorchScript (with TorchDynamo capture) NVFuser Inference - `python benchmarks/dynamo/torchbench.py -dcuda -n100 --speedup-dynamo-ts --performance`
+* TorchInductor CUDA Graphs Inference - `python benchmarks/dynamo/torchbench.py -dcuda --float32 -n50 --inductor --performance`
 
 **Training Commands**
-* Torchscript (with TorchDynamo capture) NVFuser Training - `python benchmarks/torchbench.py --float32 -dcuda --training --nvfuser --speedup-dynamo-ts --use-eval-mode`
-* AOTAutograd Torchscript NVFuser Training - `python benchmarks/torchbench.py --float32 -dcuda --training --nvfuser --accuracy-aot-ts-mincut --use-eval-mode`
+* Torchscript (with TorchDynamo capture) NVFuser Training - `python benchmarks/dynamo/torchbench.py --float32 -dcuda --training --nvfuser --speedup-dynamo-ts --performance`
+* TorchInductor CUDA Graphs Training - `python benchmarks/dynamo/torchbench.py --float32 -dcuda --training --inductor --performance`
 
 Above commands are for torchbench models. You can simply replace `torchbench.py` with `huggingface.py` for HF models, and `timm_model.py` for TIMM models.
diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 5406f04fba035..ec54a16a24cd8 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -71,8 +71,8 @@
         "ts_nnc": "--speedup-ts",
         "ts_nvfuser": "-n100 --speedup-ts --nvfuser",
         "trt": "-n100 --speedup-trt",
-        "ts_nvfuser_cudagraphs": "--inductor-settings --float32 -n50 --backend=cudagraphs_ts",
-        "inductor": "--inductor-settings --float32 -n50 --inductor",
+        "ts_nvfuser_cudagraphs": "--backend=cudagraphs_ts",
+        "inductor": "-n50 --inductor",
     },
 }
 

From 41402c20bae0d5f497cac9da5a4f0376e33c12ed Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Tue, 1 Nov 2022 22:58:22 +0000
Subject: [PATCH 0424/1922] Fix typos under torch directory (#88172)

This PR fixes typos in '.md' files under torch directory

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88172
Approved by: https://github.com/malfet
---
 .../_experimental/data_sparsifier/benchmarks/README.md        | 2 +-
 torch/ao/quantization/fx/_model_report/README.md              | 4 ++--
 torch/csrc/jit/OVERVIEW.md                                    | 2 +-
 torch/csrc/jit/codegen/onednn/README.md                       | 2 +-
 torch/csrc/jit/operator_upgraders/README.md                   | 2 +-
 torch/csrc/jit/runtime/static/README.md                       | 4 ++--
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md
index b39e951efec5d..f7f83d7d6f3bb 100644
--- a/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md
+++ b/torch/ao/pruning/_experimental/data_sparsifier/benchmarks/README.md
@@ -5,7 +5,7 @@ The objective of this exercise is to use the data sparsifier to prune the embedd
 
 1. **Disk usage savings**: Savings in model size after pruning.
 2. **Model Quality**: How and by how much does performance deteriorate after pruning the embedding bags?
-3. **Model forward time**: Can we speed up the model forward time by utilizing the sparsity? Specificially, can we introduce torch.sparse interim to reduce number of computations.
+3. **Model forward time**: Can we speed up the model forward time by utilizing the sparsity? Specifically, can we introduce torch.sparse interim to reduce number of computations.
 
 ## Scope
 The [DataNormSparsifier](https://github.com/pytorch/pytorch/blob/master/torch/ao/sparsity/_experimental/data_sparsifier/data_norm_sparsifier.py) is used to sparsify the embeddings of the DLRM model. The model is sparsified for all the combinations of -
diff --git a/torch/ao/quantization/fx/_model_report/README.md b/torch/ao/quantization/fx/_model_report/README.md
index dc11510f6c9ed..6275b49b54e2b 100644
--- a/torch/ao/quantization/fx/_model_report/README.md
+++ b/torch/ao/quantization/fx/_model_report/README.md
@@ -5,7 +5,7 @@ ModelReport
 
  > ⚠️ *While the example below uses the Fx Workflow, the use of the ModelReport class **does not depend** on the Fx Workflow to work*.
  The requirements are detector dependent.
- Most detectors require a **traceable GraphModule**, but some (ex. `PerChannelDetector`) require just a `nn.Module`.
+ Most detectors require a **traceable GraphModule**, but some (ex. `PerChannelDetector`) require just an `nn.Module`.
 
 #### Typical Fx Workflow
 - Initialize model &rarr; Prepare model &rarr; Callibrate model &rarr; Convert model &rarr; ...
@@ -62,7 +62,7 @@ This is so that we can keep track of where we want to insert observers on a dete
 It then returns the GraphModule with the detectors inserted into both the regular module structure as well as the node structure.
 - `generate_model_report(self, remove_inserted_observers: bool)` &rarr; `Dict[str, Tuple[str, Dict]]` uses callibrated GraphModule to optionally removes inserted observers, and generate, for each detector the ModelReport instance was initialized with:
   - A string-based report that is easily digestable and actionable explaining the data collected by relevant observers for that detector
-  - A dictionary containing statistics collected by the relevant observers and values calculated by the detector for futher analysis or plotting
+  - A dictionary containing statistics collected by the relevant observers and values calculated by the detector for further analysis or plotting
 
 ## ModelReportVisualizer Overview
 
diff --git a/torch/csrc/jit/OVERVIEW.md b/torch/csrc/jit/OVERVIEW.md
index 638cbf883bf71..7168967897626 100644
--- a/torch/csrc/jit/OVERVIEW.md
+++ b/torch/csrc/jit/OVERVIEW.md
@@ -894,7 +894,7 @@ def LSTMCellS(x, hx, cx, w_ih, w_hh, b_ih, b_hh):
     return hy, cy
 ```
 
-After going through the the frontend, we start with this unoptimized graph:
+After going through the frontend, we start with this unoptimized graph:
 
 ```
 graph(%x : Tensor,
diff --git a/torch/csrc/jit/codegen/onednn/README.md b/torch/csrc/jit/codegen/onednn/README.md
index ca2a644372dd2..e3f3ec66734b2 100644
--- a/torch/csrc/jit/codegen/onednn/README.md
+++ b/torch/csrc/jit/codegen/onednn/README.md
@@ -104,7 +104,7 @@ with torch.no_grad():
 
 # run the model
 with torch.no_grad():
-    # oneDNN graph fusion will be trigerred during runtime
+    # oneDNN graph fusion will be triggered during runtime
     output = model(images)
 ```
 
diff --git a/torch/csrc/jit/operator_upgraders/README.md b/torch/csrc/jit/operator_upgraders/README.md
index a4061bf17921a..bf1350aa21f34 100644
--- a/torch/csrc/jit/operator_upgraders/README.md
+++ b/torch/csrc/jit/operator_upgraders/README.md
@@ -177,7 +177,7 @@ When making changes to the operators, the first thing to identify is if it's BC/
             except Exception as e:
                 self.skipTest("Failed to load fixture!")
 
-            # Step4. Load the new model and it won't apply the ugprader
+            # Step4. Load the new model and it won't apply the upgrader
             current_mobile_module_float = self._save_load_mobile_module(MyModuleFloat)
             current_server_module_float = self._save_load_module(MyModuleFloat)
 
diff --git a/torch/csrc/jit/runtime/static/README.md b/torch/csrc/jit/runtime/static/README.md
index 03e5ee6d75dc4..9b72db912684a 100644
--- a/torch/csrc/jit/runtime/static/README.md
+++ b/torch/csrc/jit/runtime/static/README.md
@@ -142,9 +142,9 @@ is selected instead.
 When loading a model, ops are selected for each `torch::jit::Node` in the graph as follows:
 
 1) If an out variant is registered, pass the node to the function that produces the `SROperator`. If
-the result is not `nulltpr`, use that op.
+the result is not `nullptr`, use that op.
 2) If a native function is registered, pass the node to the function that produces the `SROperator`. If
-the result is not `nulltpr`, use that op.
+the result is not `nullptr`, use that op.
 3) Use the JIT implementation. Static runtime will throw an exception if it does not exist.
 
 ## Implementation Details

From 9096669dbe62a1f7ff9368da90d920f34bb4a456 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Tue, 1 Nov 2022 17:50:07 +0000
Subject: [PATCH 0425/1922] record_function: update to use custom_class API
 (#76420)

Re-submit of gh-72302

This still has a small performance hit, but it much smaller. On my
machine I see `_record_fucntion_exit._RecordFunction` takes 1.05 us
compared to the `Tensor` overload taking 0.79 us.

In an overall comparison, I see a 0.7 us slowdown from 6.0 us to
6.7 us for this timeit benchmark
```python
import torch

def foo():
  with torch.profiler.record_function("foo"):
    return torch.eye(3)

%timeit foo()
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/76420
Approved by: https://github.com/robieta
---
 test/profiler/test_profiler_tree.py           | 16 --------
 test/test_autograd.py                         |  6 +--
 test/test_fx.py                               |  6 +--
 torch/autograd/profiler.py                    | 38 +++++++++++++++----
 .../_internal/distributed/rpc/jit/rpc_test.py | 12 +++---
 .../_internal/distributed/rpc/rpc_test.py     | 14 +++----
 6 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/test/profiler/test_profiler_tree.py b/test/profiler/test_profiler_tree.py
index 21c3826c4a9cd..d4a31c6456131 100644
--- a/test/profiler/test_profiler_tree.py
+++ b/test/profiler/test_profiler_tree.py
@@ -312,34 +312,18 @@ def test_profiler_experimental_tree_with_record_function(self):
         self.assertTreesMatch(
             ProfilerTree.format(p.profiler, 12),
             """\
-            aten::zeros
-              aten::empty
-              aten::zero_
             Top level Annotation
-              aten::empty
-              aten::zeros
-                aten::empty
-                aten::zero_
               First Annotation
-                aten::empty
                 aten::ones
                   aten::empty
                   aten::fill_
-              aten::zeros
-                aten::empty
-                aten::zero_
               Second Annotation
-                aten::empty
                 aten::add
                   aten::to
                     aten::_to_copy
                       aten::empty_strided
                       aten::copy_
-                aten::zeros
-                  aten::empty
-                  aten::zero_
                 Third Annotation
-                  aten::empty
                   aten::ones_like
                     aten::empty_like
                       aten::empty_strided
diff --git a/test/test_autograd.py b/test/test_autograd.py
index 6435d36b643ba..8fa611f5b269e 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -3314,16 +3314,16 @@ def test_record_function_callbacks(self):
         foo_event = [event for event in function_events if "foo" in event.name][0]
         self.assertEqual(foo_event.count, 1)
 
-    def test_record_function_new_signatures(self):
+    def test_record_function_legacy(self):
         # Test the new _record_function ops work
         # Note: Remove once record_function uses these directly
         x = torch.randn(10, 10)
         with profile(use_kineto=kineto_available()) as p:
-            record = torch.ops.profiler._record_function_enter_new("bar", None)
+            handle = torch.ops.profiler._record_function_enter("bar", None)
             try:
                 y = x * 2 + 4
             finally:
-                torch.ops.profiler._record_function_exit(record)
+                torch.ops.profiler._record_function_exit(handle)
 
         function_events = p.function_events
         foo_event = [event for event in function_events if "bar" in event.name][0]
diff --git a/test/test_fx.py b/test/test_fx.py
index c8da9d3d2cf67..eac58fb4368d8 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -2791,7 +2791,7 @@ def to_trace(y):
 
     def test_profiler_ranges_side_effect(self):
         g = torch.fx.Graph()
-        handle = g.call_function(torch.ops.profiler._record_function_enter, ('test_range',))
+        handle = g.call_function(torch.ops.profiler._record_function_enter_new, ('test_range',))
         g.call_function(torch.ops.profiler._record_function_exit, (handle,))
         g.output(None)
 
@@ -2801,7 +2801,7 @@ def test_profiler_ranges_side_effect(self):
                 found_targets.setdefault(node.target)
         self.assertEqual(
             list(found_targets.keys()),
-            [torch.ops.profiler._record_function_enter, torch.ops.profiler._record_function_exit]
+            [torch.ops.profiler._record_function_enter_new, torch.ops.profiler._record_function_exit]
         )
 
         g.eliminate_dead_code()
@@ -2811,7 +2811,7 @@ def test_profiler_ranges_side_effect(self):
                 found_targets.setdefault(node.target)
         self.assertEqual(
             list(found_targets.keys()),
-            [torch.ops.profiler._record_function_enter, torch.ops.profiler._record_function_exit]
+            [torch.ops.profiler._record_function_enter_new, torch.ops.profiler._record_function_exit]
         )
 
     def test_ast_rewriter_wrapped_via_decorator(self):
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index ddd0ad6d0a289..e70ec6c4ed8ca 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -481,17 +481,29 @@ def __init__(self, name: str, args: Optional[str] = None):
         self.args: Optional[str] = args
         # Whether or not we should run record function's end callbacks when exiting.
         self.run_callbacks_on_exit: bool = True
-        # Stores underlying RecordFunction as a tensor. TODO: move to custom
-        # class (https://github.com/pytorch/pytorch/issues/35026).
-        self.handle: torch.Tensor = torch.zeros(1)
+        # TODO: TorchScript ignores standard type annotation here
+        # self.record: Optional["torch.classes.profiler._RecordFunction"] = None
+        self.record = torch.jit.annotate(Optional["torch.classes.profiler._RecordFunction"], None)
 
     def __enter__(self):
-        self.handle = torch.ops.profiler._record_function_enter(self.name, self.args)
+        self.record = torch.ops.profiler._record_function_enter_new(self.name, self.args)
         return self
 
     def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
-        if self.run_callbacks_on_exit:
-            torch.ops.profiler._record_function_exit(self.handle)
+        if not self.run_callbacks_on_exit:
+            return
+
+        # Local variable is needed by TorchScript to refine Optional[T] to T
+        record = self.record
+        assert record is not None
+
+        # TODO: Too slow with __torch_function__ handling enabled
+        # See https://github.com/pytorch/pytorch/issues/76410
+        if not torch.jit.is_scripting():
+            with torch._C.DisableTorchFunction():
+                torch.ops.profiler._record_function_exit._RecordFunction(record)
+        else:
+            torch.ops.profiler._record_function_exit(record)
 
     def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:
         """
@@ -518,7 +530,19 @@ def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:
         # We are scheduling to run this RecordFunction's end callbacks when the
         # passed in future completes, so don't run end callbacks on exit.
         self.run_callbacks_on_exit = False
-        profiled_future = torch.ops.profiler._call_end_callbacks_on_jit_fut(self.handle, fut)
+
+        # Local variable is needed by TorchScript to refine Optional[T] to T
+        record = self.record
+        assert record is not None
+
+        # TODO: Too slow with __torch_function__ handling enabled
+        # See https://github.com/pytorch/pytorch/issues/76410
+        if not torch.jit.is_scripting():
+            with torch._C.DisableTorchFunction():
+                profiled_future = torch.ops.profiler._call_end_callbacks_on_jit_fut._RecordFunction(
+                    record, fut)
+        else:
+            profiled_future = torch.ops.profiler._call_end_callbacks_on_jit_fut(record, fut)
         return profiled_future
 
 
diff --git a/torch/testing/_internal/distributed/rpc/jit/rpc_test.py b/torch/testing/_internal/distributed/rpc/jit/rpc_test.py
index 0c5e78515aa2a..275103a50cbe0 100644
--- a/torch/testing/_internal/distributed/rpc/jit/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/jit/rpc_test.py
@@ -194,12 +194,12 @@ def script_fork_wait_throw(invalue):
 
 
 @torch.jit.script
-def call_rpc_with_profiling(handle: Tensor, dst_worker_name: str) -> Tensor:
+def call_rpc_with_profiling(record: torch.classes.profiler._RecordFunction, dst_worker_name: str) -> Tensor:
     # Call rpc_async from within ScriptFunction and ensure that we can attach
     # profiling callbacks. Note that handle here is a Tensor representation of
     # RecordFunction.
     fut = rpc.rpc_async(dst_worker_name, one_arg, (torch.tensor(1),))
-    torch.ops.profiler._call_end_callbacks_on_jit_fut(handle, fut)
+    torch.ops.profiler._call_end_callbacks_on_jit_fut(record, fut)
     ret = fut.wait()
     return ret
 
@@ -210,12 +210,12 @@ def call_rpc_torchscript_with_record_function(dst_worker_name: str, block: str)
 
 
 @torch.jit.script
-def call_fork_with_profiling(handle: Tensor) -> Tensor:
+def call_fork_with_profiling(record: torch.classes.profiler._RecordFunction) -> Tensor:
     # Call fork from within ScriptFunction and ensure that we can attach profiling
     # callbacks to the resulting future. Note that handle here is a Tensor
     # representation of RecordFunction.
     fut = torch.jit._fork(one_arg, torch.tensor(1))
-    torch.ops.profiler._call_end_callbacks_on_jit_fut(handle, fut)
+    torch.ops.profiler._call_end_callbacks_on_jit_fut(record, fut)
     ret = fut.wait()
     return ret
 
@@ -1146,7 +1146,7 @@ def test_call_rpc_with_profiling(self):
                     "worker1",
                 )
                 with torch.autograd.profiler.record_function(prof_key) as rf:
-                    ret = call_rpc_with_profiling(rf.handle, "worker1")
+                    ret = call_rpc_with_profiling(rf.record, "worker1")
             # TODO: Can't get a reliable time for this profiling event since
             # it's hard to estimate the execution time on the remote end for non-UDFs.
             # This can be resolved by https://github.com/pytorch/pytorch/issues/36272.
@@ -1295,7 +1295,7 @@ def test_call_fork_in_jit_with_profiling(self):
         # future from within a script function with torch.jit.fork
         with _profile() as prof:
             with torch.autograd.profiler.record_function("foo") as rf:
-                ret = call_fork_with_profiling(rf.handle)
+                ret = call_fork_with_profiling(rf.record)
 
         events = prof.function_events
         function_event = get_function_event(events, "foo")
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index 764117f43dbf6..4c0239ac653ee 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -2467,20 +2467,20 @@ def test_async_record_function_double_end_callbacks(self):
                 fut.wait()
 
     @dist_init
-    def test_async_record_function_double_end_callbacks_new_signatures(self):
-        # Test the new _record_function ops work
-        # Note: Remove once record_function uses these directly
+    def test_async_record_function_legacy(self):
+        # Test the legacy _record_function ops work
+        # Note: These exist for backward compatibility with TorchScript
         num_sleep_seconds = 1
         if self.rank == 1:
             with _profile() as pf:
                 try:
-                    record = torch.ops.profiler._record_function_enter_new("foo", None)
+                    handle = torch.ops.profiler._record_function_enter("foo", None)
                     fut = rpc.rpc_async(
                         worker_name(0), my_sleep_func, args=(num_sleep_seconds,)
                     )
-                    torch.ops.profiler._call_end_callbacks_on_jit_fut(record, fut)
+                    torch.ops.profiler._call_end_callbacks_on_jit_fut(handle, fut)
                 finally:
-                    torch.ops.profiler._record_function_exit(record)
+                    torch.ops.profiler._record_function_exit(handle)
 
                 fut.wait()
 
@@ -2499,7 +2499,7 @@ def test_async_record_function_cbs_jit_call(self):
                         worker_name(0), my_script_func, args=(torch.tensor(1),)
                     )
                     # Intentionally calling record_function internals
-                    fut = torch.ops.profiler._call_end_callbacks_on_jit_fut(rf.handle, fut)
+                    fut = torch.ops.profiler._call_end_callbacks_on_jit_fut(rf.record, fut)
                 result = fut.wait()
                 # Validate that the profiling future returns the same value as the RPC
                 # future.

From 35128046873b812cb2604ff3466fa914f504f2a6 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Tue, 1 Nov 2022 18:03:06 +0000
Subject: [PATCH 0426/1922] Check SM version before calling flash attention
 with BFloat16 (#86600)

The flash attention code path requires sm80 or newer to run on
BFloat16, so any OpInfo tests running with BFloat16 would fail with
the error:
```
RuntimeError: Expected q_dtype == at::kHalf || (is_sm8x && q_dtype == at::kBFloat16) to be true, but got false.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86600
Approved by: https://github.com/ngimel
---
 .../ATen/native/transformers/cuda/sdp_utils.h | 19 ++++++++++---------
 .../_internal/common_methods_invocations.py   | 12 ++----------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index e0f38e10f966c..10b3d9d9cf487 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -189,28 +189,29 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
   TORCH_CHECK(!debug, "Torch was not compiled with flash attention.");
   return false;
 #endif
-  // Constraints specific to flash attention
-  static const std::vector<caffe2::ScalarType> flash_dtypes{
-      at::kHalf, at::kBFloat16};
-
   //  Define gate functions that determine if a flash kernel can be ran
-  std::vector<std::function<bool(sdp_params, bool)>> constraints{
+  constexpr std::array<bool(*)(sdp_params, bool), 7> constraints {{
       check_runtime_disabled_flash,
       check_tensor_shapes,
       check_for_attn_weights,
       check_for_attn_mask,
       check_head_dim_size,
       check_gpu_sm75_or_greater,
-      check_for_seq_len_1_nested_tensor};
+      check_for_seq_len_1_nested_tensor}};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
     }
   }
-  if (!check_tensor_dtype(params, flash_dtypes, debug)) {
-    return false;
+
+  auto dprop = at::cuda::getCurrentDeviceProperties();
+  if (dprop->major >= 8) {
+    static const std::array<at::ScalarType, 2> sm80_flash_dtypes{at::kHalf, at::kBFloat16};
+    return check_tensor_dtype(params, sm80_flash_dtypes, debug);
+  } else {
+    static const std::array<at::ScalarType, 1> default_flash_dtypes{at::kHalf};
+    return check_tensor_dtype(params, default_flash_dtypes, debug);
   }
-  return true;
 }
 
 inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f5037a21e14e0..e8c7b41949805 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -11658,18 +11658,10 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             {torch.float32: tol(atol=5e-05, rtol=5e-6)}), 'TestCommon', device_type='cuda',), ],
         skips=(
             # This is only failing on Linux Bionic 3.10 Cuda 11.6
-            DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type='cuda'),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes',
+                         device_type='cuda', active_if=_get_torch_cuda_version() >= (11, 6)),
             # AssertionError: JIT Test does not execute any logic
             DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
-            # This test fails on trunk CUDA 10.2 tests, can be removed when we stop 10.2 support
-            DecorateInfo(unittest.skip("Skipped!"), 'TestMeta', 'test_meta_outplace',
-                         device_type='cuda', dtypes=(torch.bfloat16,)),
-            DecorateInfo(unittest.skip("Not Implemented"), 'TestMeta', 'test_dispatch_meta_outplace',
-                         device_type='cuda', dtypes=(torch.bfloat16,)),
-            DecorateInfo(unittest.skip("Not Implemented"), 'TestMeta', 'test_symbolic_dispatch_meta_outplace',
-                         device_type='cuda', dtypes=(torch.bfloat16,)),
-            DecorateInfo(unittest.skip("Skipped!"), 'TestSchemaCheckModeOpInfo',
-                         'test_schema_correctness', device_type='cuda', dtypes=(torch.bfloat16,)),
             # Doesn't support autocasting
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensorNonErroring', 'test_fake_autocast', device_type='cpu'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake_autocast'),

From be9bde0ad573fadb3936a212fcaee565cb64eea9 Mon Sep 17 00:00:00 2001
From: eqy <eddiey@nvidia.com>
Date: Wed, 2 Nov 2022 01:36:37 +0000
Subject: [PATCH 0427/1922] [cuDNN] (re-open) Enable cuDNN Frontend v8 API by
 Default (#87669)

#58414

Has a small tweak to a test that was breaking on A10 (CC @malfet).

CC @ptrblck @ngimel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87669
Approved by: https://github.com/ngimel
---
 aten/src/ATen/native/ConvUtils.h        | 4 ++--
 aten/src/ATen/native/cudnn/ConvShared.h | 2 +-
 test/nn/test_convolution.py             | 2 +-
 test/test_cuda.py                       | 8 ++++----
 torch/testing/_internal/common_utils.py | 4 +++-
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
index 675f701c8582d..9510fbc95e12d 100644
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@@ -66,11 +66,11 @@ namespace {
 }
 
 static inline bool cudnnv8_enabled_check_debug() {
-  static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_ENABLED") == true;
+  static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true;
   static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true;
   static uint8_t cudnnv8_debugcount = 0;
   if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) {
-    TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8_FLAG: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", cudnnv8_heuristic_mode_b);
+    TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8 ON: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", cudnnv8_heuristic_mode_b);
     cudnnv8_debugcount++;
   }
   return cudnnv8_flag == 1;
diff --git a/aten/src/ATen/native/cudnn/ConvShared.h b/aten/src/ATen/native/cudnn/ConvShared.h
index 9a576de285ce4..fa06d09404711 100644
--- a/aten/src/ATen/native/cudnn/ConvShared.h
+++ b/aten/src/ATen/native/cudnn/ConvShared.h
@@ -113,7 +113,7 @@ void raw_cudnn_convolution_add_relu_fallback_out(
 
 #if HAS_CUDNN_V8()
 // v7 functions are preserved here to allow for runtime switching to v7
-// (e.g., TORCH_CUDNN_V8_API_ENABLED=0).
+// (e.g., TORCH_CUDNN_V8_API_DISABLED=1).
 // Note that v7 forward/backward out can have different behavior from the v8
 // versions, as v7 explicitly splits large tensors as a 32-bit indexing
 // workaround whereas v8 expects cuDNN to handle large tensors.
diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py
index c94eb5447d5ad..c621510674fa6 100644
--- a/test/nn/test_convolution.py
+++ b/test/nn/test_convolution.py
@@ -1034,7 +1034,7 @@ def test_Conv3d_depthwise_naive_groups(self, device, dtype):
             self.assertEqual(m.weight.grad.data,
                              torch.cat([m1.weight.grad.data,
                                         m2.weight.grad.data], 0),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
+                             atol=atol, rtol=rtol)
 
 
     @onlyCUDA
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 1f3abf7915f8a..a99de422c7ae4 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -2897,10 +2897,10 @@ def test_autocast_torch_bf16(self):
                 op, args = op_with_args[0], op_with_args[1]
                 if len(op_with_args) == 3:
                     skip_test = op_with_args[2]  # TEST_WITH_ROCM
-                should_error_from_cudnn = 'cudnn' in op and not\
-                    ('TORCH_CUDNN_V8_API_ENABLED' in os.environ and
-                     int(os.environ['TORCH_CUDNN_V8_API_ENABLED']) and
-                     torch.cuda.get_device_capability() >= (8, 0))
+                should_error_from_cudnn = 'cudnn' in op and \
+                    ('TORCH_CUDNN_V8_API_DISABLED' in os.environ and
+                     int(os.environ['TORCH_CUDNN_V8_API_DISABLED']) or
+                     torch.cuda.get_device_capability() < (8, 0))
                 should_error_from_not_implemented = should_error_from_cudnn or 'prelu' in op or 'thnn' in op \
                     or 'fused' in op or 'gru' in op or op == '_thnn_fused_lstm_cell' or op == 'lstm_cell'
                 if not skip_test:
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 9903e95228fc8..e9982e9dc5779 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -901,7 +901,9 @@ def _check_module_exists(name: str) -> bool:
 if TEST_CUDA and 'NUM_PARALLEL_PROCS' in os.environ:
     num_procs = int(os.getenv("NUM_PARALLEL_PROCS", "2"))
     # other libraries take up about 11% of space per process
-    torch.cuda.set_per_process_memory_fraction(round(1 / num_procs - .11, 2))
+    # + leave some additional buffer e.g., for runtime compilation
+    # or allocations outside of the caching allocator
+    torch.cuda.set_per_process_memory_fraction(round(1 / num_procs - .15, 2))
 
 
 def skipIfCrossRef(fn):

From bf5a4a99316f6cbfd2ae243ffd3021fbf162ee08 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 31 Oct 2022 13:33:52 -0700
Subject: [PATCH 0428/1922] Disable torchdynamo in backwards compiler harder
 (#88132)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88132
Approved by: https://github.com/bertmaher, https://github.com/malfet
---
 torch/_dynamo/optimizations/backends.py | 2 +-
 torch/_dynamo/optimizations/training.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index ea06d958005e4..56555f123e286 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -552,7 +552,7 @@ def run(*new_inputs):
 def aot_autograd(subgraph, **kwargs):
     def _wrapped_bw_compiler(*args, **kwargs):
         # stop TorchDynamo from trying to compile our generated backwards pass
-        return disable(bw_compiler(*args, **kwargs))
+        return disable(disable(bw_compiler)(*args, **kwargs))
 
     bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"]
     kwargs["bw_compiler"] = _wrapped_bw_compiler
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 3235c7486b2e9..3684a1368d2d0 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -428,7 +428,7 @@ def raw_aot_autograd_cudagraphs(model, inputs):
 
     def _wrapped_bw_compiler(*args, **kwargs):
         # stop TorchDynamo from trying to compile our generated backwards pass
-        return disable(bw_compiler(*args, **kwargs))  # type: ignore[operator]
+        return disable(disable(bw_compiler)(*args, **kwargs))  # type: ignore[operator]
 
     bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"]
     kwargs["bw_compiler"] = _wrapped_bw_compiler

From ad84ce2692ea6155573a4af01d4c8903ddd21b9b Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Wed, 2 Nov 2022 03:52:17 +0000
Subject: [PATCH 0429/1922] Support multi-gpu CI for inductor-distributed
 (#87996)

This test by itself isn't the end goal, but it is a minimal test that exercises multi-gpu and the focus of the PR is the infra behind enabling that.  I'll follow up with more tests using actual models etc.

and @malfet @desertfire for awareness/feedback on the infra side
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87996
Approved by: https://github.com/aazzolini
---
 .github/workflows/inductor.yml              |  1 +
 .jenkins/pytorch/test.sh                    |  6 ++
 test/distributed/test_dynamo_distributed.py | 69 ++++++++++++++++++++-
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index 7348b10674a74..ef0c6a620b736 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -29,6 +29,7 @@ jobs:
           { config: "inductor", shard: 5, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "inductor", shard: 6, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "inductor", shard: 7, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
         ]}
 
   linux-bionic-cuda11_6-py3_10-gcc7-inductor-test:
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 2c77c1e516bb5..4cba60ed1893b 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -250,6 +250,8 @@ test_dynamo_shard() {
 }
 
 test_inductor_distributed() {
+  # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
+  # with if required # gpus aren't available
   PYTORCH_TEST_WITH_INDUCTOR=0 PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include distributed/test_dynamo_distributed
   assert_git_not_dirty
 }
@@ -728,6 +730,10 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
 elif [[ "$TEST_CONFIG" == deploy ]]; then
   checkout_install_torchdeploy
   test_torch_deploy
+elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
+  install_filelock
+  install_triton
+  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_without_numpy
   install_torchvision
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index dc42586abdcb5..dee2f7c1a5924 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -2,17 +2,18 @@
 import os
 import unittest
 from unittest.mock import patch
-
 import torch
 import torch._dynamo
 import torch._dynamo.test_case
 import torch.distributed as dist
+from contextlib import contextmanager
 from torch import nn
 from torch._dynamo import config
 from torch._dynamo.utils import same
 from torch._inductor.utils import has_triton
 from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.testing._internal.common_distributed import requires_nccl
+from torch.testing._internal.common_distributed import MultiProcessTestCase, skip_if_lt_x_gpu, requires_nccl
+import torch._dynamo.logging
 
 def init_weights(m):
     if isinstance(m, nn.Linear):
@@ -31,6 +32,13 @@ def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5):
     def forward(self, inputs):
         return self.net(inputs)
 
+def get_model(device, bsz=20, in_feat=10, hidden_feat=5000, out_feat=5):
+    m = ToyModel(in_feat=in_feat, hidden_feat=hidden_feat, out_feat=out_feat).to(device)
+    m.apply(init_weights)
+    inputs = torch.rand(bsz, in_feat).to(device)
+    outputs = m(inputs)
+    return m, inputs, outputs
+
 
 class CheckSplitsCompiler:
     def __init__(self):
@@ -40,6 +48,63 @@ def compile_fn(self, gm, example_inputs):
         self.compiler_called += 1
         return gm
 
+@contextmanager
+def _per_rank_init(rank, world_size):
+    # To avoid multiple inheritance from _dynamo.test_case.TestCase and MultiProcessTestCase,
+    # Just manually implement the most important part of the dynamo behavior to reset/clear.
+    torch.cuda.set_device(rank)
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = '6789'
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+    torch._dynamo.reset()
+    torch._dynamo.utils.counters.clear()
+    yield
+    torch._dynamo.reset()
+    torch._dynamo.utils.counters.clear()
+    dist.destroy_process_group()
+
+
+@requires_nccl()
+class TestDistributedMultiProc(MultiProcessTestCase):
+    def setUp(self):
+        super(TestDistributedMultiProc, self).setUp()
+        self._spawn_processes()
+
+    def tearDown(self):
+        super(TestDistributedMultiProc, self).tearDown()
+        try:
+            os.remove(self.file_name)
+        except OSError:
+            pass
+
+    @property
+    def world_size(self) -> int:
+        return torch.cuda.device_count()
+
+    @classmethod
+    def _run(cls, rank: int, test_name: str, file_name: str, parent_pipe) -> None:
+        # Don't enable DDP + ReplicatedTensor, as that breaks Dynamo+DDP
+        # TODO(whc) why is ReplicatedTensor defaulted=True in MultiProcessTestCase, and should we support it?
+        # from torch.nn.parallel._replicated_tensor_ddp_utils import _set_ddp_with_replicated_tensor
+        # _set_ddp_with_replicated_tensor(True)
+
+        # The rest is copypasta from MultiProcessTestCase._run
+        self = cls(test_name)
+        self.rank = rank
+        self.file_name = file_name
+        self.run_test(test_name, parent_pipe)
+
+    @skip_if_lt_x_gpu(2)
+    @patch.object(config, "optimize_ddp", False)
+    def test_ddp_baseline_aot_eager_multiprocess(self):
+        with _per_rank_init(self.rank, self.world_size):
+            self.assertFalse(config.optimize_ddp)
+            m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
+            m = DDP(m, device_ids=[self.rank])
+            m = torch._dynamo.optimize("aot_eager")(m)
+            outputs = m(inputs)
+            self.assertTrue(same(correct_outputs, outputs))
+
 
 @requires_nccl()
 class TestDistributed(torch._dynamo.test_case.TestCase):

From 5f440f8908dbf3baecca59302c086976d6f91477 Mon Sep 17 00:00:00 2001
From: Fabio Rocha <frocha@quansight.com>
Date: Tue, 1 Nov 2022 19:29:17 +0000
Subject: [PATCH 0430/1922] [inductor] Updated some triton.libdevice calls
 (#88242)

triton master now does not require `d` or `f` suffix
to some libdevice function calls - it dispatches to right
library call based on argument type.

triton pin updated to
https://github.com/openai/triton/commit/f16138d447bccc54641a9c48ffedbd449a1a40a7

Also removed some xfails for some unrelated tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88242
Approved by: https://github.com/ngimel
---
 .github/ci_commit_pins/triton.txt          | 2 +-
 test/inductor/test_torchinductor_opinfo.py | 4 ++--
 torch/_inductor/codegen/triton.py          | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/ci_commit_pins/triton.txt b/.github/ci_commit_pins/triton.txt
index 26387597d0911..d46172e16b66b 100644
--- a/.github/ci_commit_pins/triton.txt
+++ b/.github/ci_commit_pins/triton.txt
@@ -1 +1 @@
-5ca1ed01016530056c4507661c24d6c21efc983d
+f16138d447bccc54641a9c48ffedbd449a1a40a7
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index ea7a32db44cf9..93e5412716296 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -225,9 +225,9 @@ def process(device_type):
     "masked_scatter": {f16, f32, f64},
     "masked_select": {b8, f16, f32, f64, i32, i64},
     "max.reduction_no_dim": {f16},
-    "max.reduction_with_dim": {b8, f16},
+    "max.reduction_with_dim": {b8},
     "min.reduction_no_dim": {f16},
-    "min.reduction_with_dim": {b8, f16},
+    "min.reduction_with_dim": {b8},
     "multinomial": {f32, f64},
     "nan_to_num": {f16},
     "nanquantile": {f32, f64},
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 5ccf1a7191f29..6264c54f84ab7 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -236,7 +236,7 @@ def libdevice_sigmoid(x):
     @staticmethod
     def signbit(x):
         # XX: This is wrong for the value -0.0 in floating point
-        return f"tl.libdevice.signbitf({x}) if ({x}).dtype is tl.float32 else {x} < 0"
+        return f"tl.libdevice.signbit({x}) if ({x}).dtype is tl.float32 else {x} < 0"
 
     @staticmethod
     def fmod(a, b):
@@ -256,11 +256,11 @@ def libdevice_log(x):
 
     @staticmethod
     def isinf(x):
-        return f"tl.libdevice.isinfd({x}) if ({x}).dtype is tl.float64 else tl.libdevice.isinff({x})"
+        return f"tl.libdevice.isinf({x})"
 
     @staticmethod
     def isnan(x):
-        return f"tl.libdevice.isnand({x}) if ({x}).dtype is tl.float64 else tl.libdevice.isnanf({x})"
+        return f"tl.libdevice.isnan({x})"
 
     @staticmethod
     def round(x):

From e6b813224aca743cf31c3c14fa10587c44b483c4 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 2 Nov 2022 05:22:38 +0000
Subject: [PATCH 0431/1922] [vision hash update] update the pinned vision hash
 (#88162)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88162
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 7dbaee31dbff6..eabc4b3d7eb11 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-cba1c011a87dd14af10f97bcb113fa09a8e2b396
+d95fbaf1efd5346a4afcf5b9953df75696432265

From e17c6087bf9432031fb677fb7cfaa13c51422a18 Mon Sep 17 00:00:00 2001
From: Kshiteej K <kshitijkalambarkar@gmail.com>
Date: Wed, 2 Nov 2022 06:37:33 +0000
Subject: [PATCH 0432/1922] [complex] conv_transpose3d : complex support
 (#87967)

Reference: https://github.com/pytorch/pytorch/issues/71108

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87967
Approved by: https://github.com/anjali411
---
 aten/src/ATen/native/Convolution.cpp          |  8 +++-
 .../_internal/common_methods_invocations.py   | 38 ++++++++++++++++---
 torch/testing/_internal/common_modules.py     | 20 ++++++++++
 3 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 60215801e4ce9..109f0ac059220 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -1087,8 +1087,14 @@ at::Tensor conv_transpose3d(
   Tensor input;
   bool is_batched;
   std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv_transpose3d");
-  auto output = at::convolution(
+  Tensor output;
+  if (at::isComplexType(input_.scalar_type())) {
+    output = complex_convolution(
       input, weight, bias, stride, padding, dilation, true, output_padding, groups);
+  } else {
+    output = at::convolution(
+      input, weight, bias, stride, padding, dilation, true, output_padding, groups);
+  }
   return is_batched ? output : output.squeeze(0);
 }
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index e8c7b41949805..ba2e8bc492ce9 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -10673,6 +10673,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(
                    toleranceOverride({torch.complex32: tol(atol=1e-5, rtol=5e-3)}),
                    "TestCudaFuserOpInfo", "test_nvfuser_correctness"),
+               DecorateInfo(
+                   toleranceOverride({torch.float: tol(atol=1.5e-5, rtol=1.5e-5), }),
+                   'TestCommon', 'test_numpy_ref_mps'),
            ),
            skips=(
                # Reason for Skip: https://github.com/pytorch/pytorch/pull/79694#issuecomment-1186949486
@@ -10741,8 +10744,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     OpInfo('nn.functional.conv_transpose3d',
            aten_name='conv_transpose3d',
            aliases=('conv_transpose3d',),
-           dtypes=floating_types_and(torch.int64),
-           dtypesIfCUDA=floating_types_and(torch.float16, *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+           # `ref` for this function is backward of
+           # corresponding `conv*d`
+           ref=partial(conv_transpose_ref, fn=torch.nn.functional.conv_transpose3d),
+           dtypes=floating_and_complex_types_and(torch.int64),
+           dtypesIfCUDA=floating_and_complex_types_and(
+               torch.float16, torch.chalf, *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
            sample_inputs_func=sample_inputs_conv_transpose3d,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -10752,24 +10759,45 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
            decorators=[
                DecorateInfo(
-                   toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1.3e-06), }),
+                   toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1.3e-06),
+                                     torch.complex64: tol(atol=1.3e-04, rtol=1.3e-05)}),
                    'TestCommon', 'test_variant_consistency_eager', device_type='cuda'),
                DecorateInfo(
                    toleranceOverride({torch.float32: tol(atol=2e-04, rtol=2e-04), }),
                    'TestCompositeCompliance', 'test_operator', device_type='cuda'),
                DecorateInfo(
-                   toleranceOverride({torch.float32: tol(atol=1.3e-04, rtol=1.3e-06), }),
+                   toleranceOverride({torch.float32: tol(atol=1.3e-04, rtol=1.3e-06),
+                                     torch.complex64: tol(atol=1.3e-04, rtol=1.3e-05)}),
                    'TestCommon', 'test_noncontiguous_samples', device_type='cuda'),
                DecorateInfo(
                    toleranceOverride({torch.float32: tol(atol=1e-04, rtol=2e-05), }),
                    'TestCompositeCompliance', 'test_forward_ad', device_type='cuda',
-                   active_if=TEST_CUDNN)],
+                   active_if=TEST_CUDNN),
+               DecorateInfo(
+                   toleranceOverride({torch.complex32: tol(atol=5e-2, rtol=5e-2)}),
+                   "TestCudaFuserOpInfo", "test_nvfuser_correctness"),
+               DecorateInfo(
+                   toleranceOverride({torch.complex64: tol(atol=1e-4, rtol=1e-4)}),
+                   "TestMathBits", "test_conj_view", device_type='cuda'),
+               DecorateInfo(
+                   toleranceOverride({torch.chalf: tol(atol=9e-2, rtol=9e-2), }),
+                   'TestCommon', 'test_complex_half_reference_testing')],
            skips=(
                # RuntimeError: !lhs.isAliasOf(rhs)INTERNAL ASSERT FAILED at
                # "../torch/csrc/jit/passes/utils/check_alias_annotation.cpp":104, please report a bug to PyTorch.
                DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
                DecorateInfo(unittest.skip("Skipped! 75029"), 'TestCudaFuserOpInfo', 'test_nvfuser_correctness'),
                DecorateInfo(unittest.skip("Skipped! 75363"), 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'),
+               # RuntimeError: "slow_conv3d_cpu_grad_input" not implemented for 'Long'
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref',
+                            dtypes=(torch.int64,)),
+               # Reference: https://github.com/pytorch/pytorch/issues/86356
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref',
+                            dtypes=(torch.double, torch.cdouble)),
+               DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'),
+               # RuntimeError: UNSUPPORTED DTYPE: complex
+               DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness',
+                            dtypes=(torch.complex64, torch.complex128)),
            ),
            supports_out=False,),
     OpInfo('nn.functional.conv1d',
diff --git a/torch/testing/_internal/common_modules.py b/torch/testing/_internal/common_modules.py
index fed908e14dd03..1f395cbe606a2 100644
--- a/torch/testing/_internal/common_modules.py
+++ b/torch/testing/_internal/common_modules.py
@@ -1179,6 +1179,7 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train
                )),
     ModuleInfo(torch.nn.ConvTranspose3d,
                module_inputs_func=partial(module_inputs_torch_nn_ConvNd, N=3, lazy=False, transposed=True),
+               dtypes=floating_and_complex_types_and(torch.chalf),
                gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
                module_memformat_affects_out=True,
                skips=(
@@ -1190,9 +1191,28 @@ def module_inputs_torch_nn_LSTM(module_info, device, dtype, requires_grad, train
                    # This was wrongly being skipped before and needs investigation.
                    # See https://github.com/pytorch/pytorch/issues/80247
                    DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format"),
+                   # These fail only on ROCm
+                   DecorateInfo(unittest.expectedFailure, "TestModule", "test_memory_format", device_type='cuda',
+                                dtypes=[torch.complex32, torch.complex64], active_if=TEST_WITH_ROCM),
+                   # Not implmented for chalf on CPU
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_forward',
+                                dtypes=(torch.chalf,), device_type='cpu'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_memory_format',
+                                dtypes=(torch.chalf,), device_type='cpu'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule',
+                                'test_if_train_and_eval_modes_differ', dtypes=(torch.chalf,), device_type='cpu'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_non_contiguous_tensors',
+                                dtypes=(torch.chalf,), device_type='cpu'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_cpu_gpu_parity',
+                                dtypes=(torch.chalf,), device_type='cuda'),
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_multiple_device_transfer',
+                                dtypes=(torch.chalf,), device_type='cuda'),
+                   # Ref: https://github.com/pytorch/pytorch/issues/73502
+                   DecorateInfo(unittest.expectedFailure, 'TestModule', 'test_pickle', dtypes=(torch.chalf,)),
                ),
                decorators=(
                    DecorateInfo(precisionOverride({torch.float32: 1e-04}), 'TestModule', 'test_memory_format'),
+                   DecorateInfo(precisionOverride({torch.complex64: 1e-04}), 'TestModule', 'test_cpu_gpu_parity'),
                )),
     ModuleInfo(torch.nn.ELU,
                module_inputs_func=module_inputs_torch_nn_ELU,

From c560034910c77eb07cd7675f8b64f2eec56a8fb9 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 2 Nov 2022 06:58:02 +0000
Subject: [PATCH 0433/1922] [Dynamo] UserFunctionVariable supports type &
 ABCMeta as arguments (#88257)

Fixes https://github.com/pytorch/torchdynamo/issues/1785

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88257
Approved by: https://github.com/ezyang
---
 test/dynamo/test_misc.py             | 29 ++++++++++++++++++++++++++++
 test/dynamo/test_repros.py           |  4 ----
 torch/_dynamo/variables/functions.py |  3 +++
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index a0f592212f4e1..4589b827d7499 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -1,4 +1,5 @@
 # Owner(s): ["module: dynamo"]
+import abc
 import collections
 import copy
 import dataclasses
@@ -2670,6 +2671,34 @@ def fn(x):
         res = opt_fn(x)
         self.assertTrue(torch.allclose(ref, res))
 
+    def test_user_function_variable_supports_type_abcmeta_argument(self):
+        class Foo(metaclass=abc.ABCMeta):
+            @abc.abstractclassmethod
+            def read(self):
+                pass
+
+        class Bar(Foo):
+            def read(self):
+                return "Hello World!"
+
+        class Baz:
+            pass
+
+        def gn(x, tys=(Bar, Baz)):
+            if Bar in tys:
+                return x - 1
+            else:
+                return x + 1
+
+        def fn(x):
+            return gn(x)
+
+        x = torch.randn(2, 3)
+        ref = fn(x)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(x)
+        self.assertTrue(torch.allclose(ref, res))
+
     def test_repro_graph_breaks_in__get_item_by_idx(self):
         class Mod(torch.nn.Module):
             def __init__(self):
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index fe32d2d98f856..efaf06a73580d 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1471,8 +1471,6 @@ def forward(self, x):
 
         self.assertEqual(y, 10)
 
-    # AssertionError: ABCMeta
-    @unittest.expectedFailure
     def test_sort_out(self):
 
         dtype = torch.float32
@@ -1490,8 +1488,6 @@ def fn():
         opt_fn = torch._dynamo.optimize("eager")(fn)
         opt_fn()
 
-    # AssertionError: ABCMeta
-    @unittest.expectedFailure
     def test_sigmoid_out(self):
 
         dtype = torch.float32
diff --git a/torch/_dynamo/variables/functions.py b/torch/_dynamo/variables/functions.py
index d0f545ed3abf1..88be730c34236 100644
--- a/torch/_dynamo/variables/functions.py
+++ b/torch/_dynamo/variables/functions.py
@@ -1,3 +1,4 @@
+import abc
 import enum
 import functools
 import inspect
@@ -25,6 +26,8 @@ def wrap_bound_arg(val, options):
         return variables.ConstantVariable(val, **options)
     elif isinstance(val, enum.Enum):
         return variables.EnumVariable(val, **options)
+    elif isinstance(val, (type, abc.ABCMeta)):
+        return variables.UserDefinedClassVariable(val, **options)
     else:
         assert isinstance(val, VariableTracker), typestr(val)
         return val

From 1cc7568227c49a320b2280112c0170409a5fe683 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Wed, 2 Nov 2022 09:29:20 +0000
Subject: [PATCH 0434/1922] Update caching of tensor arguments for nvFuser's
 fusion creation (#87860)

Previously nvFuser's fusion definition was cached based on concrete shape and strides of tensor inputs for simplicity and correctness. This PR changes Python's cache to check the number of dimensions, size-1 dimensions, and contiguity information based on given strides and shapes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87860
Approved by: https://github.com/kevinstephano, https://github.com/jjsjann123, https://github.com/ngimel
---
 test/test_prims.py                            | 20 +++++++------
 torch/_prims/nvfuser_executor.py              | 29 +++++++++++++++----
 .../cuda/python_frontend/python_bindings.cpp  | 18 ++++++++++++
 3 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/test/test_prims.py b/test/test_prims.py
index 6223a34e0a3a9..b6833352d0cfd 100644
--- a/test/test_prims.py
+++ b/test/test_prims.py
@@ -368,7 +368,7 @@ def func(b):
     def test_nvfuser_executor_cached_noncontiguous(self, device):
         # This test is to ensure that nvfuser computes correct results for noncontiguous tensors
         from torch.fx.experimental.proxy_tensor import make_fx
-        from torch._prims.context import TorchRefsMode
+        from torch._prims.context import TorchRefsNvfuserCapabilityMode
         from torch._prims.executor import execute
 
         a = torch.randn(3, 3, device=device)
@@ -376,16 +376,18 @@ def test_nvfuser_executor_cached_noncontiguous(self, device):
         def func(a):
             return torch.sigmoid(a)
 
-        with TorchRefsMode():
+        with TorchRefsNvfuserCapabilityMode():
             gm = make_fx(func)(a)
 
         # First run to create the cache
-        execute(gm, a, executor="nvfuser")
+        execute(gm, a, executor="strictly_nvfuser")
 
         # a.mT is noncontiguous, but it shouldn't affect correctness
         expected = execute(gm, a.mT, executor="aten")
-        actual = execute(gm, a.mT, executor="nvfuser")
-        self.assertEqual(expected, actual)
+        for use_python_cache in [True, False]:
+            params = {"use_python_fusion_cache": use_python_cache}
+            actual = execute(gm, a.mT, executor="strictly_nvfuser", executor_parameters=params)
+            self.assertEqual(expected, actual)
 
     def test_nvfuser_capability_context(self, device):
         # This test is to ensure that the torch calls are replaced with refs
@@ -506,7 +508,7 @@ def test_nvfuser_executor_partitioned(self, device):
         self.assertTrue(getattr(torch.ops.nvprims, "digamma", None) is None)
 
         from torch.fx.experimental.proxy_tensor import make_fx
-        from torch._prims.context import TorchRefsMode
+        from torch._prims.context import TorchRefsNvfuserCapabilityMode
         from torch._prims.executor import execute
 
         a = torch.randn(3, 4, device=device)
@@ -519,7 +521,7 @@ def func(a, b, c):
             dd = torch.sqrt(d)
             return torch.mul(aa, dd.digamma())
 
-        with TorchRefsMode():
+        with TorchRefsNvfuserCapabilityMode():
             gm = make_fx(func)(a, b, c)
 
         expected = execute(gm, a, b, c, executor="aten")
@@ -535,7 +537,7 @@ def test_nvfuser_executor_partitioned_no_partitions_error(self, device):
         self.assertTrue(getattr(torch.ops.nvprims, "digamma", None) is None)
 
         from torch.fx.experimental.proxy_tensor import make_fx
-        from torch._prims.context import TorchRefsMode
+        from torch._prims.context import TorchRefsNvfuserCapabilityMode
         from torch._prims.executor import execute
 
         a = torch.randn(3, 4, device=device)
@@ -543,7 +545,7 @@ def test_nvfuser_executor_partitioned_no_partitions_error(self, device):
         def func(a):
             return torch.digamma(a)  # not supported by nvfuser
 
-        with TorchRefsMode():
+        with TorchRefsNvfuserCapabilityMode():
             gm = make_fx(func)(a)
 
         with catch_warnings(record=True) as w:
diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py
index 227e1847265bb..ae9dbfff781df 100644
--- a/torch/_prims/nvfuser_executor.py
+++ b/torch/_prims/nvfuser_executor.py
@@ -40,8 +40,8 @@
 # https://github.com/pytorch/pytorch/issues/80551
 @dataclass(frozen=True)
 class nvFuserTensorTemplate:
-    size: tuple
-    stride: tuple
+    symbolic_shape: tuple
+    contiguity: tuple
     dtype: DataType
     is_cpu: bool
 
@@ -51,12 +51,29 @@ class nvFuserScalarTemplate:
     dtype: DataType
 
 
+@lru_cache(maxsize=2048)
+def compute_symbolic_shape(shape):
+    """Computes the symbolic shape of a tensor.
+    nvFuser specializes on size-1 dimensions as broadcasted dimensions.
+    -1 is used to represent any size."""
+    return tuple(1 if s == 1 else -1 for s in shape)
+
+
+@lru_cache(maxsize=2048)
+def compute_contiguity(shape, strides):
+    """Computes the contiguity information to simplify internal indexing.
+    Contiguous dimensions are represented by True, strided dimensions
+    are represented by False.
+    """
+    return torch._C._nvfuser.compute_contiguity(shape, strides)
+
+
 def to_nvfuser_template_args(args):
     def to_nvfuser(arg):
         if isinstance(arg, torch.Tensor):
             return nvFuserTensorTemplate(
-                arg.size(),
-                arg.stride(),
+                compute_symbolic_shape(arg.size()),
+                compute_contiguity(arg.size(), arg.stride()),
                 getnvFuserDtype(arg.dtype),
                 arg.is_cpu,  # type: ignore[attr-defined]
             )
@@ -163,7 +180,9 @@ def call_function(self, target, args, kwargs):
 
         def templates_to_nvfuser_inputs(arg):
             if isinstance(arg, nvFuserTensorTemplate):
-                x = fd.define_tensor(arg.size, arg.stride, arg.dtype, arg.is_cpu)
+                x = fd.define_tensor(
+                    arg.symbolic_shape, arg.contiguity, arg.dtype, arg.is_cpu
+                )
                 return x
             elif isinstance(arg, nvFuserScalarTemplate):
                 x = fd.define_scalar(arg.dtype)
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
index b633732f8926d..12672d8985981 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
@@ -40,6 +40,24 @@ void initNvFuserPythonBindings(PyObject* module) {
       .value("ComplexDouble", Nvf::DataType::ComplexDouble)
       .value("Null", Nvf::DataType::Null);
 
+  nvfuser.def(
+      "compute_contiguity",
+      [](const std::vector<int64_t>& sizes,
+         const std::vector<int64_t>& strides) {
+        py::tuple contiguity(sizes.size());
+        TORCH_CHECK(
+            sizes.size() == strides.size(),
+            "compute_contiguity: Sizes and strides must have the same number of dimensions");
+        if (sizes.size() == 0) {
+          return contiguity;
+        }
+        contiguity[sizes.size() - 1] = strides.back() == 1;
+        for (int64_t i = static_cast<int64_t>(sizes.size()) - 2; i >= 0; --i) {
+          contiguity[i] = strides[i] == strides[i + 1] * sizes[i + 1];
+        }
+        return contiguity;
+      });
+
   //! Binding the FusionCache that holds a cache of Fusions
   //! This is only bound to provide an interface to get the number of fusions
   //! that are cached.

From 8c5c0edb99d76246c829e88c4e77de82eb5ac360 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Wed, 2 Nov 2022 09:38:13 +0000
Subject: [PATCH 0435/1922] Fix typos used in documents under torch directory
 (#88300)

This PR fixes typos, in comments of Python files, that are found from a search box at https://pytorch.org/docs/master/search.html

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88300
Approved by: https://github.com/lezcano
---
 torch/_lobpcg.py                              |  2 +-
 torch/_tensor_docs.py                         |  2 +-
 torch/_torch_docs.py                          | 14 ++++++-------
 torch/ao/nn/quantized/functional.py           |  2 +-
 torch/ao/quantization/quantize_fx.py          |  4 ++--
 torch/cuda/jiterator.py                       |  2 +-
 torch/cuda/memory.py                          |  2 +-
 torch/distributed/_composable/_ddp.py         |  4 ++--
 .../ddp_comm_hooks/powerSGD_hook.py           |  4 ++--
 torch/distributed/distributed_c10d.py         |  4 ++--
 torch/distributed/elastic/agent/server/api.py |  2 +-
 .../elastic/timer/file_based_local_timer.py   |  2 +-
 .../fsdp/fully_sharded_data_parallel.py       |  2 +-
 torch/distributed/nn/api/remote_module.py     |  4 ++--
 torch/fft/__init__.py                         |  2 +-
 torch/functional.py                           |  2 +-
 torch/jit/_trace.py                           |  2 +-
 torch/linalg/__init__.py                      | 20 +++++++++----------
 torch/nn/functional.py                        |  2 +-
 torch/nn/modules/activation.py                |  2 +-
 torch/nn/modules/module.py                    |  4 ++--
 torch/nn/parallel/distributed.py              |  8 ++++----
 torch/nn/parameter.py                         |  4 ++--
 torch/nn/utils/parametrize.py                 |  2 +-
 torch/optim/rprop.py                          |  2 +-
 torch/overrides.py                            |  2 +-
 torch/package/package_exporter.py             |  4 ++--
 torch/package/package_importer.py             |  2 +-
 torch/testing/_comparison.py                  |  2 +-
 torch/utils/benchmark/utils/timer.py          |  6 +++---
 .../utils/valgrind_wrapper/timer_interface.py |  2 +-
 31 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/torch/_lobpcg.py b/torch/_lobpcg.py
index 273c93d038158..032783c2d24e4 100644
--- a/torch/_lobpcg.py
+++ b/torch/_lobpcg.py
@@ -399,7 +399,7 @@ def lobpcg(
       A (Tensor): the input tensor of size :math:`(*, m, m)`
 
       B (Tensor, optional): the input tensor of size :math:`(*, m,
-                  m)`. When not specified, `B` is interpereted as
+                  m)`. When not specified, `B` is interpreted as
                   identity matrix.
 
       X (tensor, optional): the input tensor of size :math:`(*, m, n)`
diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index 8b12032f22a3c..8c734a1f3774b 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -4263,7 +4263,7 @@ def callable(a, b) -> number
 
 Additionally accepts an optional :attr:`reduce` argument that allows
 specification of an optional reduction operation, which is applied to all
-values in the tensor :attr:`src` into :attr:`self` at the indicies
+values in the tensor :attr:`src` into :attr:`self` at the indices
 specified in the :attr:`index`. For each value in :attr:`src`, the reduction
 operation is applied to an index in :attr:`self` which is specified by
 its index in :attr:`src` for ``dimension != dim`` and by the corresponding
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index c7d3328598b0f..40375bae3e274 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -8834,7 +8834,7 @@ def merge_dicts(*dicts):
           If you plan to backpropagate through QR, note that the current backward implementation
           is only well-defined when the first :math:`\min(input.size(-1), input.size(-2))`
           columns of :attr:`input` are linearly independent.
-          This behavior will propably change once QR supports pivoting.
+          This behavior will probably change once QR supports pivoting.
 
 .. note:: This function uses LAPACK for CPU inputs and MAGMA for CUDA inputs,
           and may produce different (valid) decompositions on different device types
@@ -10227,7 +10227,7 @@ def merge_dicts(*dicts):
         as values.
     values (array_list): Initial values for the tensor. Can be a list,
         tuple, NumPy ``ndarray``, scalar, and other types that
-        represents a (1+K)-dimensonal tensor where ``K`` is the number
+        represents a (1+K)-dimensional tensor where ``K`` is the number
         of dense dimensions.
     size (list, tuple, :class:`torch.Size`, optional): Size of the
         sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If
@@ -10287,7 +10287,7 @@ def merge_dicts(*dicts):
         values.
     values (array_list): Initial values for the tensor. Can be a list,
         tuple, NumPy ``ndarray``, scalar, and other types that
-        represents a (1+K)-dimensonal tensor where ``K`` is the number
+        represents a (1+K)-dimensional tensor where ``K`` is the number
         of dense dimensions.
     size (list, tuple, :class:`torch.Size`, optional): Size of the
         sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If
@@ -10347,7 +10347,7 @@ def merge_dicts(*dicts):
         values.
     values (array_list): Initial values for the tensor. Can be a list,
         tuple, NumPy ``ndarray``, scalar, and other types that
-        represents a (1 + 2 + K)-dimensonal tensor where ``K`` is the
+        represents a (1 + 2 + K)-dimensional tensor where ``K`` is the
         number of dense dimensions.
     size (list, tuple, :class:`torch.Size`, optional): Size of the
         sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols *
@@ -10412,7 +10412,7 @@ def merge_dicts(*dicts):
         as values.
     values (array_list): Initial blocks for the tensor. Can be a list,
         tuple, NumPy ``ndarray``, and other types that
-        represents a (1 + 2 + K)-dimensonal tensor where ``K`` is the
+        represents a (1 + 2 + K)-dimensional tensor where ``K`` is the
         number of dense dimensions.
     size (list, tuple, :class:`torch.Size`, optional): Size of the
         sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols *
@@ -13119,7 +13119,7 @@ def merge_dicts(*dicts):
 
 Keyword args:
     output_size (int, optional): Total output size for the given axis
-        ( e.g. sum of repeats). If given, it will avoid stream syncronization
+        ( e.g. sum of repeats). If given, it will avoid stream synchronization
         needed to calculate output shape of the tensor.
 
 Returns:
@@ -13355,7 +13355,7 @@ def merge_dicts(*dicts):
     input (Tensor): quantized tensor
     kernel_size (list of int): the size of the sliding window
     stride (``list of int``, optional): the stride of the sliding window
-    padding (``list of int``, opttional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2
+    padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2
     dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1
     ceil_mode (bool, optional):  If True, will use ceil instead of floor to compute the output shape.
         Defaults to False.
diff --git a/torch/ao/nn/quantized/functional.py b/torch/ao/nn/quantized/functional.py
index d0b100bd30567..b3169279082ae 100644
--- a/torch/ao/nn/quantized/functional.py
+++ b/torch/ao/nn/quantized/functional.py
@@ -422,7 +422,7 @@ def leaky_relu(input: Tensor, negative_slope: float = 0.01, inplace: bool = Fals
     :math:`\text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x)`
 
     Args:
-        input: Quaintized input
+        input: Quantized input
         negative_slope: The slope of the negative input
         inplace: Inplace modification of the input tensor
         scale, zero_point: Scale and zero point of the output tensor.
diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py
index abd1cf1b8edbc..8f26934576580 100644
--- a/torch/ao/quantization/quantize_fx.py
+++ b/torch/ao/quantization/quantize_fx.py
@@ -291,7 +291,7 @@ def prepare_fx(
       * `_equalization_config`: config for specifying how to perform equalization on the model
 
       * `backend_config` (BackendConfig): config that specifies how operators are quantized
-         in a backend, this includes how the operaetors are observed,
+         in a backend, this includes how the operators are observed,
          supported fusion patterns, how quantize/dequantize ops are
          inserted, supported dtypes etc. See :class:`~torch.ao.quantization.backend_config.BackendConfig` for more details
 
@@ -489,7 +489,7 @@ def train_loop(model, train_data):
         qconfig_mapping = get_default_qat_qconfig("fbgemm")
 
         # We can customize qconfig_mapping in different ways, please take a look at
-        # the doctring for :func:`~torch.ao.quantization.prepare_fx` for different ways
+        # the docstring for :func:`~torch.ao.quantization.prepare_fx` for different ways
         # to configure this
 
         # example_inputs is a tuple of inputs, that is used to infer the type of the
diff --git a/torch/cuda/jiterator.py b/torch/cuda/jiterator.py
index 1616814e77819..562a66d47db2d 100644
--- a/torch/cuda/jiterator.py
+++ b/torch/cuda/jiterator.py
@@ -98,7 +98,7 @@ def _create_jit_fn(code_string: str, **kwargs) -> Callable:
         # invoke jitted function like a regular python function
         result = jitted_fn(a, b, alpha=3.14)
 
-    code_string also allows mulitple function definitions, and the last function will be treated as the entry function.
+    code_string also allows multiple function definitions, and the last function will be treated as the entry function.
 
     Example::
 
diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py
index cd57326e9e97d..46bdda80bf87c 100644
--- a/torch/cuda/memory.py
+++ b/torch/cuda/memory.py
@@ -177,7 +177,7 @@ def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]:
 
     The caching allocator can be configured via ENV to not split blocks larger than a
     defined size (see Memory Management section of the Cuda Semantics documentation).
-    This helps avoid memory framentation but may have a performance
+    This helps avoid memory fragmentation but may have a performance
     penalty. Additional outputs to assist with tuning and evaluating impact:
 
     - ``"max_split_size"``: blocks above this size will not be split.
diff --git a/torch/distributed/_composable/_ddp.py b/torch/distributed/_composable/_ddp.py
index 51fac5419babc..76a4aa70c4224 100644
--- a/torch/distributed/_composable/_ddp.py
+++ b/torch/distributed/_composable/_ddp.py
@@ -439,7 +439,7 @@ class DistributedDataParallel(Module, Joinable):
         of ``DistributedDataParallel`` will register the additional gradient
         reduction functions on all the parameters of the model itself at the
         time of construction. If you change the model's parameters afterwards,
-        gradient redunction functions no longer match the correct set of
+        gradient reduction functions no longer match the correct set of
         parameters.
 
     .. warning::
@@ -515,7 +515,7 @@ class DistributedDataParallel(Module, Joinable):
                      3) Activation checkpointing when model has unused parameters.
                      4) There are model parameters that are outside of forward function.
                      5) Potentially improve performance when there are unused parameters,
-                     as DDP will not search graph in each iteraton to detect unused
+                     as DDP will not search graph in each iteration to detect unused
                      parameters when static_graph is set to be ``True``.
                      To check whether you can set static_graph to be ``True``, one way is to
                      check ddp logging data at the end of your previous model training,
diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
index fb3662db23a29..2053879626cd2 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -123,9 +123,9 @@ class PowerSGDState(object):
 
         1.1. If ``matrix_approximation_rank`` is too low, the full model quality will need more training steps to reach or will never reach and yield loss in accuracy.
 
-        1.2. The increase of ``matrix_approximation_rank`` can substantially increase the computation costs of the compression, and the accuracy may not be futher improved beyond a certain ``matrix_approximation_rank`` threshold.
+        1.2. The increase of ``matrix_approximation_rank`` can substantially increase the computation costs of the compression, and the accuracy may not be further improved beyond a certain ``matrix_approximation_rank`` threshold.
 
-    To tune ``matrix_approximation_rank``, we suggest to start from 1 and increase by factors of 2 (like an expoential grid search, 1, 2, 4, ...), until a satisfactory accuracy is reached. Typically only a small value 1-4 is used. For some NLP tasks (as shown in Appendix D of the original paper), this value has been increased to 32.
+    To tune ``matrix_approximation_rank``, we suggest to start from 1 and increase by factors of 2 (like an exponential grid search, 1, 2, 4, ...), until a satisfactory accuracy is reached. Typically only a small value 1-4 is used. For some NLP tasks (as shown in Appendix D of the original paper), this value has been increased to 32.
 
     2. ``start_powerSGD_iter`` defers PowerSGD compression until step ``start_powerSGD_iter``, and vanilla allreduce runs prior to step ``start_powerSGD_iter``. This hybrid scheme of **vanilla allreduce + PowerSGD** can effectively improve the accuracy, even a relatively small ``matrix_approximation_rank`` is used. This is because that, the beginning of training phase is usually very sensitive to inaccurate gradients, and compressing gradients too early may make the training quickly take a suboptimal trajectory, which can result in an irrecoverable impact on the accuracy.
 
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 61b3718fb9308..af6ede4c7aeb2 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -2806,7 +2806,7 @@ def reduce_scatter_tensor(output, input, op=ReduceOp.SUM, group=None, async_op=F
         input (Tensor): Input tensor to be reduced and scattered. Its size
             should be output tensor size times the world size. The input tensor
             can have one of the following shapes:
-            (i) a concatentation of the output tensors along the primary
+            (i) a concatenation of the output tensors along the primary
             dimension, or
             (ii) a stack of the output tensors along the primary dimension.
             For definition of "concatenation", see ``torch.cat()``.
@@ -3215,7 +3215,7 @@ def monitored_barrier(group=GroupMember.WORLD, timeout=None, wait_all_ranks=Fals
     whole group exits the function successfully, making it useful for debugging
     and synchronizing. However, it can have a performance impact and should only
     be used for debugging or scenarios that require full synchronization points
-    on the host-side. For debugging purposees, this barrier can be inserted
+    on the host-side. For debugging purposes, this barrier can be inserted
     before the application's collective calls to check if any ranks are
     desynchronized.
 
diff --git a/torch/distributed/elastic/agent/server/api.py b/torch/distributed/elastic/agent/server/api.py
index 259632869d43e..2a0536166d4be 100644
--- a/torch/distributed/elastic/agent/server/api.py
+++ b/torch/distributed/elastic/agent/server/api.py
@@ -192,7 +192,7 @@ class WorkerState(str, Enum):
       INIT - worker group object created not yet started
       HEALTHY - workers running and healthy
       UNHEALTHY - workers running and unhealthy
-      STOPPED - workers stopped (interruped) by the agent
+      STOPPED - workers stopped (interrupted) by the agent
       SUCCEEDED - workers finished running (exit 0)
       FAILED - workers failed to successfully finish (exit !0)
 
diff --git a/torch/distributed/elastic/timer/file_based_local_timer.py b/torch/distributed/elastic/timer/file_based_local_timer.py
index 8ad78f2f26837..36ae944ec8e4f 100644
--- a/torch/distributed/elastic/timer/file_based_local_timer.py
+++ b/torch/distributed/elastic/timer/file_based_local_timer.py
@@ -75,7 +75,7 @@ class FileTimerClient(TimerClient):
         file_path: str, the path of a FIFO special file. ``FileTimerServer``
                         must have created it by calling os.mkfifo().
 
-        signal: singal, the signal to use to kill the process. Using a
+        signal: signal, the signal to use to kill the process. Using a
                         negative or zero signal will not kill the process.
     """
     def __init__(self, file_path: str, signal=signal.SIGKILL) -> None:
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 437c36edc3968..65b561005c837 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -366,7 +366,7 @@ class FullyShardedDataParallel(nn.Module):
             is not specified, otherwise we run ``param_init_fn`` to initialize the passed
             in ``nn.Module``. In particular, this means that if ``is_meta=True`` for any
             module parameters for modules that will be wrapped with FSDP and ``param_init_fn``
-            is not specified, we assume your module properly implements a ``reset_paramters()``
+            is not specified, we assume your module properly implements a ``reset_parameters()``
             and will throw errors if not. Note that additionally, we offer support for modules
             initialized with torchdistX's (https://github.com/pytorch/torchdistX)
             ``deferred_init`` API. In this case, deferred modules would be initialized
diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py
index b7e81ad9d3e64..2b12959d331a1 100644
--- a/torch/distributed/nn/api/remote_module.py
+++ b/torch/distributed/nn/api/remote_module.py
@@ -133,7 +133,7 @@ def __init__(
         It creates a user-specified module on a specified remote node.
         It behaves like a regular ``nn.Module`` except that the ``forward`` method is
         executed on the remote node.
-        It takes care of autograd recording to ensure the backward pass propogates
+        It takes care of autograd recording to ensure the backward pass propagates
         gradients back to the corresponding remote module.
         It can be shared across processors using `RPC framework <https://pytorch.org/docs/stable/rpc.html>`__,
         without incurring any overheads of copying the actual module,
@@ -595,7 +595,7 @@ class RemoteModule(_RemoteModule):
         It creates a user-specified module on a specified remote node.
         It behaves like a regular ``nn.Module`` except that the ``forward`` method is
         executed on the remote node.
-        It takes care of autograd recording to ensure the backward pass propogates
+        It takes care of autograd recording to ensure the backward pass propagates
         gradients back to the corresponding remote module.
 
         It generates two methods ``forward_async`` and ``forward`` based on the
diff --git a/torch/fft/__init__.py b/torch/fft/__init__.py
index a9a6e3e846509..3bc5191c7b57d 100644
--- a/torch/fft/__init__.py
+++ b/torch/fft/__init__.py
@@ -1004,7 +1004,7 @@
 hfftn = _add_docstr(_fft.fft_hfftn, r"""
 hfftn(input, s=None, dim=None, norm=None, *, out=None) -> Tensor
 
-Computes the n-dimensional discrete Fourier transform of a Herimitian symmetric
+Computes the n-dimensional discrete Fourier transform of a Hermitian symmetric
 :attr:`input` signal.
 
 :attr:`input` is interpreted as a one-sided Hermitian signal in the time
diff --git a/torch/functional.py b/torch/functional.py
index cda6feddef00b..7e96d42fde30c 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -206,7 +206,7 @@ def einsum(*args: Any) -> Tensor:
     Equation:
 
         The :attr:`equation` string specifies the subscripts (letters in `[a-zA-Z]`) for each dimension of
-        the input :attr:`operands` in the same order as the dimensions, separating subcripts for each operand by a
+        the input :attr:`operands` in the same order as the dimensions, separating subscripts for each operand by a
         comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript
         must be broadcastable, that is, their size must either match or be `1`. The exception is if a subscript is
         repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand
diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py
index b4352648df9c9..5fa5708931469 100644
--- a/torch/jit/_trace.py
+++ b/torch/jit/_trace.py
@@ -942,7 +942,7 @@ def trace_module(
         check_tolerance (float, optional): Floating-point comparison tolerance to use in the checker procedure.
                                            This can be used to relax the checker strictness in the event that
                                            results diverge numerically for a known reason, such as operator fusion.
-        example_inputs_is_kwarg (``bool``, optional): This parameter indicate wether the example inputs is a pack
+        example_inputs_is_kwarg (``bool``, optional): This parameter indicate whether the example inputs is a pack
                                            pack of keyword arguments. Default: ``False``.
 
     Returns:
diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py
index b0706288106f6..e78cbbb3be357 100644
--- a/torch/linalg/__init__.py
+++ b/torch/linalg/__init__.py
@@ -232,7 +232,7 @@
 
         linalg.solve(A, B) == linalg.inv(A) @ B  # When B is a matrix
 
-    It is always prefered to use :func:`~solve` when possible, as it is faster and more
+    It is always preferred to use :func:`~solve` when possible, as it is faster and more
     numerically stable than computing the inverse explicitly.
 
 .. seealso::
@@ -582,7 +582,7 @@
     out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`.
 
 Returns:
-    A complex-valued tensor cointaining the eigenvalues even when :attr:`A` is real.
+    A complex-valued tensor containing the eigenvalues even when :attr:`A` is real.
 
 Examples::
 
@@ -751,7 +751,7 @@
     out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`.
 
 Returns:
-    A real-valued tensor cointaining the eigenvalues even when :attr:`A` is complex.
+    A real-valued tensor containing the eigenvalues even when :attr:`A` is complex.
     The eigenvalues are returned in ascending order.
 
 Examples::
@@ -780,7 +780,7 @@
 Let :math:`\mathbb{K}` be :math:`\mathbb{R}` or :math:`\mathbb{C}`, and
 let :math:`V \in \mathbb{K}^{m \times n}` be a matrix with columns :math:`v_i \in \mathbb{K}^m`
 for :math:`i=1,\ldots,m` with :math:`m \geq n`. Denote by :math:`w_i` the vector resulting from
-zeroing out the first :math:`i-1` compontents of :math:`v_i` and setting to `1` the :math:`i`-th.
+zeroing out the first :math:`i-1` components of :math:`v_i` and setting to `1` the :math:`i`-th.
 For a vector :math:`\tau \in \mathbb{K}^k` with :math:`k \leq n`, this function computes the
 first :math:`n` columns of the matrix
 
@@ -963,7 +963,7 @@
 :attr:`LD` and :attr:`pivots` are the compact representation of the LDL factorization and
 are expected to be computed by :func:`torch.linalg.ldl_factor_ex`.
 :attr:`hermitian` argument to this function should be the same
-as the corresponding argumens in :func:`torch.linalg.ldl_factor_ex`.
+as the corresponding arguments in :func:`torch.linalg.ldl_factor_ex`.
 
 Supports input of float, double, cfloat and cdouble dtypes.
 Also supports batches of matrices, and if :attr:`A` is a batch of matrices then
@@ -1040,7 +1040,7 @@
 
 This function returns the solution to the problem and some extra information in a named tuple of
 four tensors `(solution, residuals, rank, singular_values)`. For inputs :attr:`A`, :attr:`B`
-of shape `(*, m, n)`, `(*, m, k)` respectively, it cointains
+of shape `(*, m, n)`, `(*, m, k)` respectively, it contains
 
 - `solution`: the least squares solution. It has shape `(*, n, k)`.
 - `residuals`: the squared residuals of the solutions, that is, :math:`\|AX - B\|_F^2`.
@@ -1125,7 +1125,7 @@
 
         matrix_power(torch.linalg.solve(A, B), n) == matrix_power(A, -n)  @ B
 
-    It is always prefered to use :func:`~solve` when possible, as it is faster and more
+    It is always preferred to use :func:`~solve` when possible, as it is faster and more
     numerically stable than computing :math:`A^{-n}` explicitly.
 
 .. seealso::
@@ -1391,7 +1391,7 @@
 
 Supports input of float, double, cfloat and cdouble dtypes.
 
-This function does not necessarily treat multidimensonal :attr:`x` as a batch of
+This function does not necessarily treat multidimensional :attr:`x` as a batch of
 vectors, instead:
 
 - If :attr:`dim`\ `= None`, :attr:`x` will be flattened before the norm is computed.
@@ -1964,7 +1964,7 @@
 
         torch.linalg.lstsq(A, B).solution == A.pinv() @ B
 
-    It is always prefered to use :func:`~lstsq` when possible, as it is faster and more
+    It is always preferred to use :func:`~lstsq` when possible, as it is faster and more
     numerically stable than computing the pseudoinverse explicitly.
 
 .. note::
@@ -2546,7 +2546,7 @@
 
         linalg.tensorsolve(A, B) == torch.tensordot(linalg.tensorinv(A), B)  # When B is a tensor with shape A.shape[:B.ndim]
 
-    It is always prefered to use :func:`~tensorsolve` when possible, as it is faster and more
+    It is always preferred to use :func:`~tensorsolve` when possible, as it is faster and more
     numerically stable than computing the pseudoinverse explicitly.
 
 .. seealso::
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index ebed3d9a60d49..79bf6297e5871 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -1997,7 +1997,7 @@ def hardsigmoid(input: Tensor, inplace: bool = False) -> Tensor:
 
 Applies a linear transformation to the incoming data: :math:`y = xA^T + b`.
 
-This opperation supports 2-D :attr:`weight` with :ref:`sparse layout<sparse-docs>`
+This operation supports 2-D :attr:`weight` with :ref:`sparse layout<sparse-docs>`
 
 {sparse_beta_warning}
 
diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index 293760b8bb706..d7a9f13809d67 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -1344,7 +1344,7 @@ class Softmax(Module):
     .. math::
         \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
 
-    When the input Tensor is a sparse tensor then the unspecifed
+    When the input Tensor is a sparse tensor then the unspecified
     values are treated as ``-inf``.
 
     Shape:
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index 1ce6cc0742ab8..0b47d61defde8 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -737,7 +737,7 @@ def get_extra_state(self) -> Any:
         if you need to store extra state. This function is called when building the
         module's `state_dict()`.
 
-        Note that extra state should be pickleable to ensure working serialization
+        Note that extra state should be picklable to ensure working serialization
         of the state_dict. We only provide provide backwards compatibility guarantees
         for serializing Tensors; other objects may break backwards compatibility if
         their serialized pickled form changes.
@@ -1737,7 +1737,7 @@ def register_load_state_dict_post_hook(self, hook):
         ``strict=True`` are affected by modifications the hook makes to
         ``missing_keys`` or ``unexpected_keys``, as expected. Additions to either
         set of keys will result in an error being thrown when ``strict=True``, and
-        clearning out both missing and unexpected keys will avoid an error.
+        clearing out both missing and unexpected keys will avoid an error.
 
         Returns:
             :class:`torch.utils.hooks.RemovableHandle`:
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index bd6b769402076..30a20c86a1ac2 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -433,7 +433,7 @@ class DistributedDataParallel(Module, Joinable):
         of ``DistributedDataParallel`` will register the additional gradient
         reduction functions on all the parameters of the model itself at the
         time of construction. If you change the model's parameters afterwards,
-        gradient redunction functions no longer match the correct set of
+        gradient reduction functions no longer match the correct set of
         parameters.
 
     .. warning::
@@ -509,7 +509,7 @@ class DistributedDataParallel(Module, Joinable):
                      3) Activation checkpointing when model has unused parameters.
                      4) There are model parameters that are outside of forward function.
                      5) Potentially improve performance when there are unused parameters,
-                     as DDP will not search graph in each iteraton to detect unused
+                     as DDP will not search graph in each iteration to detect unused
                      parameters when static_graph is set to be ``True``.
                      To check whether you can set static_graph to be ``True``, one way is to
                      check ddp logging data at the end of your previous model training,
@@ -1175,7 +1175,7 @@ def train(self, mode=True):
         return self
 
     # When running in join mode, schedules an allreduce to notify joined ranks
-    # of whether backwards pass synchronization will run this iteraton or not.
+    # of whether backwards pass synchronization will run this iteration or not.
     def _check_global_requires_backward_grad_sync(self, is_joined_rank):
         if not is_joined_rank and self.require_backward_grad_sync:
             requires_sync_tensor = torch.ones(1, device=self.device)
@@ -1815,7 +1815,7 @@ def _set_ddp_runtime_logging_sample_rate(self, sample_rate):
         r"""
         This interface allows users to set sample_rate of collecting
         runtime stats. The runtime stats will be recorded for the
-        first 10 iterations, after 10 iteratons runtime stats will be
+        first 10 iterations, after 10 iterations runtime stats will be
         recorded once every "sample_rate" training iterations. In
         default, runtime stats are recorded for the first 10 iterations,
         after 10 iterations runtime stats are recorded once every
diff --git a/torch/nn/parameter.py b/torch/nn/parameter.py
index 3b033a191deca..e0f400f2642bf 100644
--- a/torch/nn/parameter.py
+++ b/torch/nn/parameter.py
@@ -156,7 +156,7 @@ def is_lazy(param):
 class UninitializedParameter(UninitializedTensorMixin, Parameter):
     r"""A parameter that is not initialized.
 
-    Unitialized Parameters are a a special case of :class:`torch.nn.Parameter`
+    Uninitialized Parameters are a a special case of :class:`torch.nn.Parameter`
     where the shape of the data is still unknown.
 
     Unlike a :class:`torch.nn.Parameter`, uninitialized parameters
@@ -187,7 +187,7 @@ def __deepcopy__(self, memo):
 class UninitializedBuffer(UninitializedTensorMixin, torch.Tensor):
     r"""A buffer that is not initialized.
 
-    Unitialized Buffer is a a special case of :class:`torch.Tensor`
+    Uninitialized Buffer is a a special case of :class:`torch.Tensor`
     where the shape of the data is still unknown.
 
     Unlike a :class:`torch.Tensor`, uninitialized parameters
diff --git a/torch/nn/utils/parametrize.py b/torch/nn/utils/parametrize.py
index b8f8d439c1b72..17de23a97a4ac 100644
--- a/torch/nn/utils/parametrize.py
+++ b/torch/nn/utils/parametrize.py
@@ -73,7 +73,7 @@ class ParametrizationList(ModuleList):
     It is the type of ``module.parametrizations[tensor_name]`` when ``module[tensor_name]``
     has been parametrized with :func:`register_parametrization`.
 
-    If the first registered parmetrization has a ``right_inverse`` that returns one tensor or
+    If the first registered parametrization has a ``right_inverse`` that returns one tensor or
     does not have a ``right_inverse`` (in which case we assume that ``right_inverse`` is the identity),
     it will hold the tensor under the name ``original``.
     If it has a ``right_inverse`` that returns more than one tensor, these will be registered as
diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py
index 6fe9c9cca5b86..20e196a09df95 100644
--- a/torch/optim/rprop.py
+++ b/torch/optim/rprop.py
@@ -45,7 +45,7 @@ class Rprop(Optimizer):
         params (iterable): iterable of parameters to optimize or dicts defining
             parameter groups
         lr (float, optional): learning rate (default: 1e-2)
-        etas (Tuple[float, float], optional): pair of (etaminus, etaplis), that
+        etas (Tuple[float, float], optional): pair of (etaminus, etaplus), that
             are multiplicative increase and decrease factors
             (default: (0.5, 1.2))
         step_sizes (Tuple[float, float], optional): a pair of minimal and
diff --git a/torch/overrides.py b/torch/overrides.py
index 95e7c66111b5b..ce7872f9d1abe 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -1554,7 +1554,7 @@ def handle_torch_function(
     Arguments
     ---------
     relevant_args : iterable
-        Iterable or aguments to check for __torch_function__ methods.
+        Iterable or arguments to check for __torch_function__ methods.
     Returns
     -------
     bool
diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py
index 81b5e650b518b..a95f105d2474c 100644
--- a/torch/package/package_exporter.py
+++ b/torch/package/package_exporter.py
@@ -202,7 +202,7 @@ def __init__(
             f: The location to export to. Can be a  ``string``/``Path`` object containing a filename
                 or a binary I/O object.
             importer: If a single Importer is passed, use that to search for modules.
-                If a sequence of importers are passsed, an ``OrderedImporter`` will be constructed out of them.
+                If a sequence of importers are passed, an ``OrderedImporter`` will be constructed out of them.
         """
         torch._C._log_api_usage_once("torch.package.PackageExporter")
 
@@ -574,7 +574,7 @@ def save_pickle(
         pickle_protocol: int = 3,
     ):
         """Save a python object to the archive using pickle. Equivalent to :func:`torch.save` but saving into
-        the archive rather than a stand-alone file. Stanard pickle does not save the code, only the objects.
+        the archive rather than a stand-alone file. Standard pickle does not save the code, only the objects.
         If ``dependencies`` is true, this method will also scan the pickled objects for which modules are required
         to reconstruct them and save the relevant code.
 
diff --git a/torch/package/package_importer.py b/torch/package/package_importer.py
index 6efa943f11e7e..2530f28d1501d 100644
--- a/torch/package/package_importer.py
+++ b/torch/package/package_importer.py
@@ -294,7 +294,7 @@ def file_structure(
 
         Args:
             include (Union[List[str], str]): An optional string e.g. ``"my_package.my_subpackage"``, or optional list of strings
-                for the names of the files to be inluded in the zipfile representation. This can also be
+                for the names of the files to be included in the zipfile representation. This can also be
                 a glob-style pattern, as described in :meth:`PackageExporter.mock`
 
             exclude (Union[List[str], str]): An optional pattern that excludes files whose name match the pattern.
diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py
index 5cd2482ac867a..d15cae4b1bb5a 100644
--- a/torch/testing/_comparison.py
+++ b/torch/testing/_comparison.py
@@ -1228,7 +1228,7 @@ def assert_close(
 
         :func:`~torch.testing.assert_close` is highly configurable with strict default settings. Users are encouraged
         to :func:`~functools.partial` it to fit their use case. For example, if an equality check is needed, one might
-        define an ``assert_equal`` that uses zero tolrances for every ``dtype`` by default:
+        define an ``assert_equal`` that uses zero tolerances for every ``dtype`` by default:
 
         >>> import functools
         >>> assert_equal = functools.partial(torch.testing.assert_close, rtol=0, atol=0)
diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py
index aec53d3f12819..61b05e144924c 100644
--- a/torch/utils/benchmark/utils/timer.py
+++ b/torch/utils/benchmark/utils/timer.py
@@ -159,14 +159,14 @@ class Timer(object):
 
         env:
             This tag indicates that otherwise identical tasks were run in
-            different environments, and are therefore not equivilent, for
+            different environments, and are therefore not equivalent, for
             instance when A/B testing a change to a kernel. `Compare` will
             treat Measurements with different `env` specification as distinct
             when merging replicate runs.
 
         num_threads:
             The size of the PyTorch threadpool when executing `stmt`. Single
-            threaded performace is important as both a key inference workload
+            threaded performance is important as both a key inference workload
             and a good indicator of intrinsic algorithmic efficiency, so the
             default is set to one. This is in contrast to the default PyTorch
             threadpool size which tries to utilize all cores.
@@ -377,7 +377,7 @@ def blocked_autorange(
 
             2) A large block size better amortizes the cost of `timer`
                invocation, and results in a less biased measurement. This is
-               important because CUDA syncronization time is non-trivial
+               important because CUDA synchronization time is non-trivial
                (order single to low double digit microseconds) and would
                otherwise bias the measurement.
 
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
index 477e4704c458b..04663d915c26d 100644
--- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
@@ -209,7 +209,7 @@ def stats(self, inclusive: bool = False) -> FunctionCounts:
     def counts(self, *, denoise: bool = False) -> int:
         """Returns the total number of instructions executed.
 
-        See `FunctionCounts.denoise()` for an explation of the `denoise` arg.
+        See `FunctionCounts.denoise()` for an explanation of the `denoise` arg.
         """
         stats = self.stmt_exclusive_stats
         return (stats.denoise() if denoise else stats).sum()

From 8c578f470a15a876f3876819811be50bd666f488 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Wed, 2 Nov 2022 10:05:12 +0000
Subject: [PATCH 0436/1922] Add ops.broadcast for nvFuser (#88080)

Having nvFuser's `broadcast` available alongside `broadcast_in_dim` would allow easier experimentation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88080
Approved by: https://github.com/jjsjann123, https://github.com/kevinstephano, https://github.com/mruberry
---
 test/test_nvfuser_frontend.py                 | 18 ++++
 .../cuda/python_frontend/fusion_record.h      | 84 +++++++++++++++++--
 .../cuda/python_frontend/python_bindings.cpp  | 20 ++++-
 3 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/test/test_nvfuser_frontend.py b/test/test_nvfuser_frontend.py
index 28c5894a002c9..9974eb29c7271 100644
--- a/test/test_nvfuser_frontend.py
+++ b/test/test_nvfuser_frontend.py
@@ -189,6 +189,24 @@ def test_broadcast_mixing(self) :
         eager_out = refs.add(input1, prims.broadcast_in_dim(input2, [3, 3], [0]))
         self.assertEqual(eager_out, nvf_out)
 
+    def test_ops_broadcast(self) :
+        fs = Fusion()
+        with FusionDefinition(fs) as fd :
+            t0 = fd.define_tensor(1)
+            t1 = fd.define_tensor(3)
+
+            t0_b = fd.ops.broadcast(t0, [True, False, True])
+            t2 = fd.ops.add(t0_b, t1)
+
+            fd.add_output(t2)
+
+        input1 = torch.randn(3, device='cuda')
+        input2 = torch.randn(2, 3, 4, device='cuda')
+
+        nvf_out = fs.execute([input1, input2])[0]
+        eager_out = refs.add(prims.broadcast_in_dim(input1, [2, 3, 4], [1]), input2)
+        self.assertEqual(eager_out, nvf_out)
+
     def test_prim_layer_norm_fwd(self) :
         def primitive_definition(
             inputs: torch.Tensor,
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
index 674284e83ad07..b8105f1e4fb87 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
@@ -19,6 +19,7 @@ enum class RecordType {
   Op,
   BatchNormOp,
   BroadcastOp,
+  BroadcastInDimOp,
   CastOp,
   Constant,
   End,
@@ -536,8 +537,8 @@ struct SqueezeOpRecord : RecordFunctor {
 
 //! Specialized Record Functor for the FusionDefinition's broadcast_in_dim op.
 
-struct BroadcastOpRecord : RecordFunctor {
-  BroadcastOpRecord(
+struct BroadcastInDimOpRecord : RecordFunctor {
+  BroadcastInDimOpRecord(
       std::vector<State> _args,
       std::vector<State> _outputs,
       std::string _name,
@@ -547,12 +548,12 @@ struct BroadcastOpRecord : RecordFunctor {
             std::move(_args),
             std::move(_outputs),
             _name,
-            RecordType::BroadcastOp),
+            RecordType::BroadcastInDimOp),
         output_shape_(std::move(output_shape)),
         broadcast_dims_(std::move(broadcast_dims)) {}
-  virtual ~BroadcastOpRecord() = default;
+  virtual ~BroadcastInDimOpRecord() = default;
   virtual RecordFunctor* clone() final {
-    return new BroadcastOpRecord(*this);
+    return new BroadcastInDimOpRecord(*this);
   }
 
   //! Child specific hash function in lower 32 bits.
@@ -574,7 +575,7 @@ struct BroadcastOpRecord : RecordFunctor {
 
   virtual bool operator==(const RecordFunctor& other) const final {
     auto result = false;
-    if (auto child_ptr = dynamic_cast<const BroadcastOpRecord*>(&other)) {
+    if (auto child_ptr = dynamic_cast<const BroadcastInDimOpRecord*>(&other)) {
       result = RecordFunctor::operator==(other);
       if (result) {
         result =
@@ -698,6 +699,77 @@ struct BroadcastOpRecord : RecordFunctor {
   std::vector<int64_t> broadcast_dims_;
 };
 
+//! Specialized Record Functor for the FusionDefinition's broadcast op.
+
+struct BroadcastOpRecord : RecordFunctor {
+  BroadcastOpRecord(
+      std::vector<State> _args,
+      std::vector<State> _outputs,
+      std::string _name,
+      std::vector<bool>& is_broadcast_dim)
+      : RecordFunctor(
+            std::move(_args),
+            std::move(_outputs),
+            _name,
+            RecordType::BroadcastOp),
+        is_broadcast_dim_(std::move(is_broadcast_dim)) {}
+  virtual ~BroadcastOpRecord() = default;
+  virtual RecordFunctor* clone() final {
+    return new BroadcastOpRecord(*this);
+  }
+
+  virtual size_t hash() const final {
+    auto result = RecordFunctor::hash();
+    size_t is_broadcast_dim_hash = 0;
+    for (size_t i = 0; i < is_broadcast_dim_.size(); ++i) {
+      is_broadcast_dim_hash |=
+          (is_broadcast_dim_[i] << (is_broadcast_dim_.size() - 1 - i));
+    }
+    return result | (is_broadcast_dim_hash & 0xfff);
+  }
+
+  virtual bool operator==(const RecordFunctor& other) const final {
+    auto result = false;
+    if (auto child_ptr = dynamic_cast<const BroadcastOpRecord*>(&other)) {
+      result = RecordFunctor::operator==(other);
+      result &= std::equal(
+          is_broadcast_dim_.begin(),
+          is_broadcast_dim_.end(),
+          child_ptr->is_broadcast_dim_.begin());
+    }
+    return result;
+  }
+
+  virtual void operator()(FusionDefinition& fd) final {
+    auto arg =
+        fd.getFusionState(args_.at(0).index)->template as<Nvf::TensorView>();
+    auto output = Nvf::broadcast(arg, is_broadcast_dim_);
+    fd.setFusionState(outputs_.at(0).index, output);
+  }
+
+  virtual void print(std::ostream& os, bool close_function = true) const {
+    RecordFunctor::print(os, false);
+    os << ", is_broadcast_dim=[";
+    bool first_arg = true;
+    for (auto dim : is_broadcast_dim_) {
+      if (first_arg) {
+        first_arg = false;
+      } else {
+        os << ", ";
+      }
+      os << (dim ? "True" : "False");
+    }
+    os << "]";
+    if (close_function) {
+      os << ")";
+    }
+  }
+
+ private:
+  //! Communicates which dimensions in the output are broadcasted.
+  std::vector<bool> is_broadcast_dim_;
+};
+
 template <class OutType, class ArgType>
 struct CastOpRecord : RecordFunctor {
   CastOpRecord(
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
index 12672d8985981..73c8ea6ce23cf 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
@@ -1364,7 +1364,7 @@ void initNvFuserPythonBindings(PyObject* module) {
             output_shape.size() >= broadcast_dims.size(),
             "broadcast_dims vector size is too big for output shape!");
         nvfuser::Tensor output = fd->defineTensor();
-        fd->defineRecord(new nvfuser::BroadcastOpRecord(
+        fd->defineRecord(new nvfuser::BroadcastInDimOpRecord(
             {fd->recordingState(arg())},
             {fd->recordingState(output())},
             "ops.broadcast_in_dim",
@@ -1376,6 +1376,24 @@ void initNvFuserPythonBindings(PyObject* module) {
       py::arg("output_shape"),
       py::arg("broadcast_dims"),
       py::return_value_policy::reference);
+  nvf_ops.def(
+      "broadcast",
+      [](nvfuser::FusionDefinition::Operators& self,
+         nvfuser::Tensor arg,
+         std::vector<bool>& is_broadcast_dim) -> nvfuser::Tensor {
+        FUSER_PERF_SCOPE("Operators.broadcast");
+        nvfuser::FusionDefinition* fd = self.fusion_definition;
+        nvfuser::Tensor output = fd->defineTensor();
+        fd->defineRecord(new nvfuser::BroadcastOpRecord(
+            {fd->recordingState(arg())},
+            {fd->recordingState(output())},
+            "ops.broadcast",
+            is_broadcast_dim));
+        return output;
+      },
+      py::arg("arg"),
+      py::arg("is_broadcast_dim"),
+      py::return_value_policy::reference);
 }
 
 } // namespace jit

From 14786e957ce2994975c8aa0a5efb547d1c3bf09b Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Wed, 2 Nov 2022 11:11:28 +0000
Subject: [PATCH 0437/1922] Add a basic test for "nvprims_nvfuser" Dynamo
 backend (#88186)

Ref. https://github.com/pytorch/pytorch/pull/87797#issuecomment-1297635210

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88186
Approved by: https://github.com/ezyang
---
 test/test_nvfuser_dynamo.py             | 50 +++++++++++++++++++++++++
 torch/_dynamo/optimizations/training.py |  4 +-
 2 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 test/test_nvfuser_dynamo.py

diff --git a/test/test_nvfuser_dynamo.py b/test/test_nvfuser_dynamo.py
new file mode 100644
index 0000000000000..5d3aeff267fd1
--- /dev/null
+++ b/test/test_nvfuser_dynamo.py
@@ -0,0 +1,50 @@
+# Owner(s): ["module: nvfuser"]
+
+import unittest
+import warnings
+
+import torch
+import torch._dynamo as torchdynamo
+from torch.testing import make_tensor
+from torch.testing._internal.common_utils import (
+    run_tests,
+    skipIfTorchDynamo,
+    TEST_WITH_ROCM,
+    TestCase,
+    IS_WINDOWS,
+)
+from torch.testing._internal.jit_utils import RUN_CUDA
+
+RUN_NVFUSER = RUN_CUDA and not TEST_WITH_ROCM
+
+
+def is_pre_volta():
+    if not RUN_NVFUSER:
+        return False
+    prop = torch.cuda.get_device_properties(torch.cuda.current_device())
+    return prop.major < 7
+
+
+@skipIfTorchDynamo("Not a suitable test for TorchDynamo")
+@unittest.skipIf(IS_WINDOWS, "TorchDynamo is not supported on Windows")
+@unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+@unittest.skipIf(is_pre_volta(), "Only supported on Volta and newer devices.")
+class TestNvFuserDynamo(TestCase):
+    def test_basic(self):
+        input1 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float32)
+        input2 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float32)
+
+        @torchdynamo.optimize("nvprims_nvfuser")
+        def func(a, b):
+            return a.sin() + b.cos()
+
+        # No warnings and no errors
+        with warnings.catch_warnings(record=True) as w:
+            nvfuser_result = func(input1, input2)
+            self.assertEqual(len(w), 0)
+        eager_result = func.__wrapped__(input1, input2)
+        self.assertEqual(eager_result, nvfuser_result)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 3684a1368d2d0..536706cf4c8f5 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -261,6 +261,8 @@ def __call__(self, gm: torch.fx.GraphModule, example_inputs):
 
 
 def prims_executor(gm, inputs, *, executor):
+    from functorch.compile import make_boxed_func
+
     # This function is called once per forward/backward pass of a graph in AOT
     # Autograd. We use it to set up the nvFuser-specific FX graph and return
     # execute function.
@@ -274,7 +276,7 @@ def prims_executor(gm, inputs, *, executor):
         prim_gm = make_fx(gm)(*inputs)
 
     # Then we return a callable that executes the "prim_gm" graph
-    return partial(execute, prim_gm, executor=executor)
+    return make_boxed_func(partial(execute, prim_gm, executor=executor))
 
 
 def create_nvprims_backend(*, executor):

From 1a54f274f4686425e5dddff835e4f676ce4bd84b Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Tue, 1 Nov 2022 16:31:11 +0000
Subject: [PATCH 0438/1922] Implement reference for lerp (#87424)

We follow the vectorised CPU implementation for numerical accuracy

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87424
Approved by: https://github.com/ezyang
---
 test/functorch/test_aotdispatch.py            |  1 -
 test/test_meta.py                             |  2 ++
 test/test_proxy_tensor.py                     |  2 +-
 torch/_refs/__init__.py                       | 31 +++++++++++++++++++
 .../_internal/common_methods_invocations.py   |  5 +++
 5 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 713dd3c8ae6bc..4940974cd7e4b 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1035,7 +1035,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('inner', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('kron', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('kthvalue', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('lerp', ''),  # aten.lerp.Scalar - couldn't find symbolic meta function/decomposition
     xfail('linalg.cholesky_ex', ''),  # aten.linalg_cholesky_ex.default - couldn't find symbolic meta functio...
     xfail('linalg.cond', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('linalg.cross', ''),  # aten.linalg_cross.default - couldn't find symbolic meta function/decomposition
diff --git a/test/test_meta.py b/test/test_meta.py
index e379cd5d14f2a..758ea2c1e2fb4 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -325,6 +325,8 @@ def test_tensor_outlives_converter(self):
     aten.igammac.default,
     aten.lcm.default,
     aten.le.Tensor,
+    aten.lerp.Scalar,
+    aten.lerp.Tensor,
     aten.logical_and.default,
     aten.logical_or.default,
     aten.logical_xor.default,
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 9b6b745ee4c2c..9059f9ef458a3 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1193,7 +1193,6 @@ def f(a, b, c, d, e):
     xfail('isin', ''),  # aten.isin.Tensor_Tensor - couldn't find symbolic meta function/decomposition
     xfail('kron', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('kthvalue', ''),  # aten.kthvalue.default - couldn't find symbolic meta function/decomposition
-    xfail('lerp', ''),  # aten.lerp.Scalar - couldn't find symbolic meta function/decomposition
     xfail('linalg.cholesky', ''),  # aten.linalg_cholesky_ex.default - couldn't find symbolic meta function/decomposition
     xfail('linalg.cholesky_ex', ''),  # aten.linalg_cholesky_ex.default - couldn't find symbolic meta function/decomposition
     xfail('linalg.cond', ''),  # Tensors of type TensorImpl do not have numel
@@ -1418,6 +1417,7 @@ def f(a, b, c, d, e):
     xfail('igamma', ''),  # aten.igamma_.default - couldn't find symbolic meta function/decomposition
     xfail('igammac', ''),  # aten.igammac_.default - couldn't find symbolic meta function/decomposition
     xfail('le', ''),  # aten.le_.Tensor - couldn't find symbolic meta function/decomposition
+    xfail('lerp', ''),  # aten.lerp_.default - couldn't find symbolic meta function/decomposition
     xfail('lgamma', ''),  # aten.lgamma_.default - couldn't find symbolic meta function/decomposition
     xfail('log10', ''),  # aten.log10_.default - couldn't find symbolic meta function/decomposition
     xfail('log1p', ''),  # aten.log1p_.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 0ab10701332ae..96a11c207ae58 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -85,6 +85,7 @@
     "isnan",
     "isreal",
     "i0",
+    "lerp",
     "lgamma",
     "log",
     "log1p",
@@ -4060,6 +4061,36 @@ def arange(
     )
 
 
+@register_decomposition(torch.ops.aten.lerp)
+@out_wrapper()
+@elementwise_type_promotion_wrapper(
+    type_promoting_args=("start", "end", "weight"),
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+)
+def lerp(start: Tensor, end: Tensor, weight: Union[Tensor, NumberType]):
+    check(
+        start.dtype == end.dtype,
+        lambda: f"expected dtype {start.dtype} for `end` but got dtype {end.dtype}",
+    )
+    if isinstance(weight, Number):
+        weight = start.new_full((), weight)  # type: ignore[arg-type]
+    else:
+        check(
+            start.dtype == weight.dtype,
+            lambda: f"expected dtype {start.dtype} for `weight` but got dtype {weight.dtype}",  # type: ignore[union-attr]
+        )
+    assert isinstance(weight, Tensor)  # mypy
+    # We implement it this way for numerical stability. We assume (in the stability optimisation)
+    # that 0 <= weight <= 1. We take the abs to deal with comples numbers
+    # We want to do operations near zero, which is where floating points are most precise
+    # If weight.abs() >= 0.5:
+    #    return (1 - weight) * (start - end) + end
+    mask = weight.abs() >= 0.5
+    coeff = torch.where(mask, weight - 1, weight)
+    base = torch.where(mask, end, start)
+    return coeff * (end - start) + base
+
+
 @register_decomposition(torch.ops.aten.linspace)
 @out_wrapper()
 def linspace(
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index ba2e8bc492ce9..7c501fb411ed3 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -16483,6 +16483,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         torch_opinfo_name="asinh",
         supports_nvfuser=False,
     ),
+    PythonRefInfo(
+        "_refs.lerp",
+        torch_opinfo_name="lerp",
+        supports_nvfuser=False,
+    ),
     PythonRefInfo(
         "_refs.ones",
         torch_opinfo_name="ones",

From d74c58cb0be722835a68449ac85d79d2d58488ca Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 22:47:10 +0000
Subject: [PATCH 0439/1922] [FSDP()][17/N] Refactor `_fsdp_root_pre_forward()`
 (#87930)

This PR moves `_fsdp_root_pre_forward()` to `_runtime_utils.py`.

Note: This PR includes a (temporary) fix for `NO_SHARD` + `CPUOffload(offload_params=True)`, where we set `non_blocking=False` when copying the gradient from device to host. It is only included in this PR since the test was **flaky** (but not consistently failing) on this PR , so I needed to fix to unblock land.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87930
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_common_utils.py       |  9 ++++
 torch/distributed/fsdp/_runtime_utils.py      | 49 +++++++++++++++++--
 .../fsdp/fully_sharded_data_parallel.py       | 36 +-------------
 3 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index 992df5606a70e..f97e72faa2418 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -56,6 +56,15 @@ def _is_composable(state: _State):
     return not isinstance(state, nn.Module)
 
 
+@no_type_check
+def _all_handles(state: _State):
+    return (
+        state._handles
+        if _is_composable(state)
+        else state._fsdp_handles(state)  # `FullyShardedDataParallel`
+    )
+
+
 def clean_tensor_name(tensor_name: str) -> str:
     """
     Cleans the parameter or buffer name by removing any module wrapper
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 3d34dadf9e000..eaacb11ea1033 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -6,6 +6,7 @@
 import torch.nn.functional as F
 from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS
 from torch.distributed.fsdp._common_utils import (
+    _all_handles,
     _assert_in_training_states,
     _is_composable,
     _State,
@@ -180,6 +181,44 @@ def _post_forward(
     return output
 
 
+@no_type_check
+def _fsdp_root_pre_forward(
+    state: _State,
+    *args,
+    **kwargs,
+):
+    """
+    Runs pre-forward logic specific to the root FSDP instance, which should run
+    before any individual module's pre-forward. If this is called on a non-root
+    FSDP instance, then the forward inputs are returned directly.
+    """
+    p_assert(state._is_root is not None, "Expects a root FSDP to have been set")
+    if not state._is_root:
+        return args, kwargs
+    if state.forward_prefetch:
+        handles_keys = []
+        if _is_composable(state):
+            # TODO: This assumes singleton handles keys.
+            handles_keys = [tuple(handle) for handle in state._handles]
+        else:
+            for fsdp_module in state.fsdp_modules(state):
+                handles_key = tuple(fsdp_module._handles)
+                handles_keys.append(handles_key)
+        for handles_key in handles_keys:
+            state._needs_pre_forward_unshard[handles_key] = True
+    _wait_for_computation_stream(
+        torch.cuda.current_stream(),
+        state._streams["unshard"],
+        state._streams["pre_unshard"],
+    )
+    _clear_grads_if_needed(_all_handles(state))
+    input_dtype: Optional[torch.dtype] = state.mixed_precision.param_dtype
+    args, kwargs = _prepare_forward_inputs(
+        state.compute_device, input_dtype, *args, **kwargs
+    )
+    return args, kwargs
+
+
 @no_type_check
 def _pre_backward_hook(
     state: _State,
@@ -201,10 +240,7 @@ def _pre_backward_hook(
         # after all backward calls complete
         if state._is_root and not state._post_backward_callback_queued:
             state._queue_wait_for_post_backward()
-            all_handles = (
-                state._fsdp_handles(state) if _is_composable(state) else state._handles
-            )
-            _clear_grads_if_needed(all_handles)
+            _clear_grads_if_needed(_all_handles(state))
         elif _handles_key:
             _assert_in_training_states(state, [TrainingState.IDLE])
         state.training_state = TrainingState.FORWARD_BACKWARD
@@ -343,8 +379,11 @@ def _post_backward_hook(
             if handle._config.offload_params:
                 # Offload the gradient to CPU to ensure parameters and
                 # gradients are on the same device as required by the optimizer
+                # TODO: Investigate why `NO_SHARD` breaks correctness when
+                # using `non_blocking=True` here.
+                non_blocking = handle.uses_sharded_strategy
                 param._cpu_grad.copy_(  # type: ignore[attr-defined]
-                    sharded_grad.detach(), non_blocking=True
+                    sharded_grad.detach(), non_blocking=non_blocking
                 )  # synchronized in the post-backward callback
                 # Since the sharded gradient is produced in the post-backward
                 # stream and consumed later in the computation stream, inform
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 65b561005c837..33322d681865e 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -54,10 +54,10 @@
 )
 from torch.distributed.fsdp._runtime_utils import (
     _clear_grads_if_needed,
+    _fsdp_root_pre_forward,
     _post_forward,
     _pre_forward,
     _prefetch_handles,
-    _prepare_forward_inputs,
     _reshard,
     _reshard_grads,
     _should_free_in_backward,
@@ -1169,7 +1169,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
             "FullyShardedDataParallel.forward"
         ):
             self._lazy_init()
-            args, kwargs = self._fsdp_root_pre_forward(*args, **kwargs)
+            args, kwargs = _fsdp_root_pre_forward(self, *args, **kwargs)
             unused = None
             unshard_fn = functools.partial(
                 self._pre_forward_unshard, handles=self._handles
@@ -1217,38 +1217,6 @@ def _pre_forward_unshard(
             torch.cuda.current_stream().wait_stream(self._streams["unshard"])
             _prefetch_handles(self, handles_key)
 
-    def _fsdp_root_pre_forward(self, *args, **kwargs):
-        """
-        Runs pre-forward logic specific to the root FSDP instance, which should
-        run before any individual module's pre-forward. This includes
-        synchronizing with the previous iteration and casting the forward
-        inputs appropriately. If this is called on a non-root FSDP instance,
-        then the forward inputs are returned directly.
-        """
-        p_assert(self._is_root is not None, "Expects a root FSDP to have been set")
-        if not self._is_root:
-            return args, kwargs
-        if self.forward_prefetch:
-            for fsdp_module in self.fsdp_modules(self):
-                handles_key = tuple(fsdp_module._handles)
-                if handles_key:
-                    self._needs_pre_forward_unshard[handles_key] = True
-        _wait_for_computation_stream(
-            torch.cuda.current_stream(),
-            self._streams["unshard"],
-            self._streams["pre_unshard"],
-        )
-        _clear_grads_if_needed(self._fsdp_handles(self))
-        input_dtype = (
-            self.mixed_precision.param_dtype
-            if self._mixed_precision_enabled_for_params()
-            else None
-        )
-        args, kwargs = _prepare_forward_inputs(
-            self.compute_device, input_dtype, *args, **kwargs
-        )
-        return args, kwargs
-
     @staticmethod
     @contextlib.contextmanager
     def summon_full_params(

From 33b6bf57f6b3b219f31531eede7e3f84f5042ad4 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 22:47:10 +0000
Subject: [PATCH 0440/1922] [FSDP()][18/N] Refactor `pre_forward_unshard()`
 (#87931)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87931
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_runtime_utils.py      | 141 +++++++++++++++++-
 .../fsdp/fully_sharded_data_parallel.py       |  38 +----
 2 files changed, 143 insertions(+), 36 deletions(-)

diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index eaacb11ea1033..9af6fa61a28cd 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -4,6 +4,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.autograd import Variable
 from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS
 from torch.distributed.fsdp._common_utils import (
     _all_handles,
@@ -141,6 +142,21 @@ def _pre_forward(
     _register_post_backward_hooks(state, handles)
 
 
+@no_type_check
+def _pre_forward_unshard(
+    state: _State,
+    handles: List[FlatParamHandle],
+) -> None:
+    """Unshards parameters in the pre-forward."""
+    if not handles:
+        return
+    _unshard(state, handles, state._streams["unshard"], state._streams["pre_unshard"])
+    handles_key = tuple(handles)
+    state._needs_pre_forward_unshard[handles_key] = False
+    torch.cuda.current_stream().wait_stream(state._streams["unshard"])
+    _prefetch_handles(state, handles_key)
+
+
 @no_type_check
 def _post_forward(
     state: _State,
@@ -239,7 +255,7 @@ def _pre_backward_hook(
         # attach it to the outermost backward graph task so that it is called
         # after all backward calls complete
         if state._is_root and not state._post_backward_callback_queued:
-            state._queue_wait_for_post_backward()
+            _register_post_backward_final_callback(state)
             _clear_grads_if_needed(_all_handles(state))
         elif _handles_key:
             _assert_in_training_states(state, [TrainingState.IDLE])
@@ -482,6 +498,109 @@ def _low_precision_hook_enabled(state: _State) -> bool:
     return state._communication_hook in LOW_PRECISION_HOOKS
 
 
+@no_type_check
+@torch.no_grad()
+def _post_backward_final_callback(
+    state: _State,
+):
+    """
+    This waits for the post-backward to finish and performs some final cleanup.
+    This runs at the end of the entire backward pass and should only be called
+    on the root FSDP instance.
+    """
+    p_assert(
+        state._is_root,
+        "The post-backward callback should only be called on the root FSDP instance",
+    )
+
+    if state._sync_gradients:
+        torch.cuda.current_stream().wait_stream(state._streams["post_backward"])
+        if state.cpu_offload.offload_params:
+            # Wait for non-blocking GPU -> CPU sharded gradient copies from the
+            # post-backward hooks to finish explicitly since CPU gradients do
+            # not automatically synchronize with the GPU
+            torch.cuda.current_stream().synchronize()
+    state._exec_order_data.next_iter()
+
+    states = [state] if _is_composable(state) else state.fsdp_modules(state)
+    for state in states:
+        _catch_all_reshard(state)
+        _finalize_params(state)
+        state._ran_pre_backward_hook.clear()
+        state.training_state = TrainingState.IDLE
+        for handle in state._handles:
+            handle._training_state = HandleTrainingState.IDLE
+        state._handles_prefetched.clear()
+    # Reset for cases like one forward and multiple backwards
+    state._post_backward_callback_queued = False
+
+
+@no_type_check
+def _catch_all_reshard(
+    state: _State,
+) -> None:
+    """
+    Reshards the parameters that may not have been resharded in the
+    post-backward hook. This can happen when a module's output is used in the
+    forward pass, meaning that its pre-backward hook runs (unsharding the
+    parameter), but the post-backward hook does not run because the output was
+    not jused in the loss computation corresponding to this backward pass.
+    """
+    # Wrap with a try-except to provide a more informative traceback if an
+    # error is raised
+    try:
+        free_unsharded_flat_params: List[bool] = []
+        handles_to_reshard: List[FlatParamHandle] = []
+        for handle in state._handles:
+            # TODO: This already-resharded check is brittle:
+            # https://github.com/pytorch/pytorch/issues/83956
+            already_resharded = (
+                handle.flat_param.data_ptr()
+                == handle.flat_param._local_shard.data_ptr()
+            )
+            if already_resharded:
+                continue
+            free_unsharded_flat_params.append(_should_free_in_backward(state, handle))
+            handles_to_reshard.append(handle)
+        _reshard(state, handles_to_reshard, free_unsharded_flat_params)
+    except Exception as e:
+        p_assert(
+            False,
+            f"Got exception in the catch-all reshard for {state}: {str(e)}",
+            raise_assertion_error=False,
+        )
+        raise e
+
+
+@no_type_check
+def _finalize_params(
+    state: _State,
+) -> None:
+    """Finalizes the parameters before the next iteration."""
+    for handle in state._handles:
+        flat_param = handle.flat_param
+        if flat_param.requires_grad:
+            if hasattr(flat_param, "_post_backward_hook_state"):
+                p_assert(
+                    len(flat_param._post_backward_hook_state) == 2,
+                    f"Invalid: ``_post_backward_hook_state``: {flat_param._post_backward_hook_state}",
+                )
+                flat_param._post_backward_hook_state[1].remove()
+                delattr(flat_param, "_post_backward_hook_state")
+            if not state._sync_gradients:
+                # Preserve the gradient accumulation state if not synchronizing
+                # gradients: `.grad` remains the unsharded gradient  from prior
+                # `no_sync()` iterations, and `_saved_grad_shard` remains the
+                # sharded gradient from the last synchronized iteration
+                continue
+            handle.prepare_gradient_for_optim()
+            p_assert(
+                hasattr(flat_param, "_post_backward_called"),
+                "Expects `_post_backward_called` to be set on the `FlatParameter`",
+            )
+            flat_param._post_backward_called = False
+
+
 @no_type_check
 def _prefetch_handles(
     state: _State,
@@ -646,6 +765,26 @@ def _register_post_backward_hooks(
         flat_param._post_backward_hook_state = (acc_grad, hook_handle)  # type: ignore[attr-defined]
 
 
+@no_type_check
+def _register_post_backward_final_callback(state: _State) -> None:
+    """
+    Registers the post-backward final callback that runs at the end of the
+    backward pass. This should be called from the root FSDP instance at the
+    beginning of the pre-backward.
+    """
+    p_assert(
+        state._is_root,
+        "Only the root FSDP instance should register the post-backward callback",
+    )
+    if state._post_backward_callback_queued:
+        return
+    _assert_in_training_states(state, [TrainingState.IDLE])
+    state._post_backward_callback_queued = True
+    Variable._execution_engine.queue_callback(
+        functools.partial(_post_backward_final_callback, state)
+    )
+
+
 def _wait_for_computation_stream(
     computation_stream: torch.cuda.Stream,
     unshard_stream: torch.cuda.Stream,
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 33322d681865e..f64fc18832946 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -24,7 +24,6 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.autograd import Variable
 from torch.distributed import ProcessGroup
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     _CHECKPOINT_PREFIX,
@@ -57,7 +56,7 @@
     _fsdp_root_pre_forward,
     _post_forward,
     _pre_forward,
-    _prefetch_handles,
+    _pre_forward_unshard,
     _reshard,
     _reshard_grads,
     _should_free_in_backward,
@@ -907,7 +906,7 @@ def _init_param_attributes(self, handle: FlatParamHandle) -> None:
                 _free_storage(p._full_prec_full_param_padded)
 
         # Track whether the `FlatParameter`'s post-backward hook has been
-        # called for validation in `_wait_for_post_backward()`
+        # called for validation in the post-backward callback
         p._post_backward_called = False
 
     @staticmethod
@@ -1171,9 +1170,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
             self._lazy_init()
             args, kwargs = _fsdp_root_pre_forward(self, *args, **kwargs)
             unused = None
-            unshard_fn = functools.partial(
-                self._pre_forward_unshard, handles=self._handles
-            )
+            unshard_fn = functools.partial(_pre_forward_unshard, self, self._handles)
             # Do not free the root's parameters in the post-forward for
             # `FULL_SHARD` with the intention that they are immediately used
             # for backward computation (though this may not be true)
@@ -1203,20 +1200,6 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
                 self, self._handles, reshard_fn, unused, unused, output
             )
 
-    def _pre_forward_unshard(
-        self,
-        handles: List[FlatParamHandle],
-    ) -> None:
-        """Unshards parameters in the pre-forward."""
-        if handles:
-            _unshard(
-                self, handles, self._streams["unshard"], self._streams["pre_unshard"]
-            )
-            handles_key = tuple(handles)
-            self._needs_pre_forward_unshard[handles_key] = False
-            torch.cuda.current_stream().wait_stream(self._streams["unshard"])
-            _prefetch_handles(self, handles_key)
-
     @staticmethod
     @contextlib.contextmanager
     def summon_full_params(
@@ -1619,21 +1602,6 @@ def named_parameters(
                 param_name = param_name.replace(FSDP_PREFIX, "")
             yield (param_name, param)
 
-    def _queue_wait_for_post_backward(self) -> None:
-        """
-        Queues a post-backward callback from the root FSDP instance, which
-        should happen at the beginning of its pre-backward.
-        """
-        p_assert(
-            self._is_root,
-            "`_queue_wait_for_post_backward()` should be called on the root FSDP instance",
-        )
-        if self._post_backward_callback_queued:
-            return
-        self._assert_state([TrainingState.IDLE])
-        self._post_backward_callback_queued = True
-        Variable._execution_engine.queue_callback(self._wait_for_post_backward)
-
     @torch.no_grad()
     def _wait_for_post_backward(self) -> None:
         """Wait for post-backward to finish. Only called on root instance."""

From 51b8e993e9ccf49105246471ab1cfdd06ec7e991 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 22:47:11 +0000
Subject: [PATCH 0441/1922] [FSDP()][20/N][Easy] Move functions in file
 (#87932)

This PR is easy. I just wanted to group functions in the file according to the same logical order.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87932
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_runtime_utils.py | 92 ++++++++++++------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 9af6fa61a28cd..fd23debf381fd 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -235,6 +235,52 @@ def _fsdp_root_pre_forward(
     return args, kwargs
 
 
+def _prepare_forward_inputs(
+    device: torch.device,
+    input_dtype: Optional[torch.dtype],
+    *args: Any,
+    **kwargs: Any,
+) -> Tuple[Any, Any]:
+    """
+    Prepares the forward inputs by moving them to ``device`` and casting them
+    to ``input_dtype`` if it is not ``None``.
+    """
+    # TODO: Do not use the side stream for tensor copies for now; investigate
+    # the perf with/without it.
+    # TODO: For mixed precision, move the inputs to the compute device and cast
+    # to reduced-precision in a single `to()` call.
+    args_tuple, kwargs_tuple = _to_kwargs(args, kwargs, device.index, False)
+    args = args_tuple[0]
+    kwargs = kwargs_tuple[0]
+    if input_dtype is not None:
+        args, kwargs = _cast_fp_inputs_to_dtype(input_dtype, *args, **kwargs)
+    return args, kwargs
+
+
+def _cast_fp_inputs_to_dtype(
+    dtype: torch.dtype,
+    *args: Any,
+    **kwargs: Any,
+) -> Tuple[Any, Any]:
+    """
+    Casts floating point tensors in ``args`` and ``kwargs`` to ``input_dtype``.
+    This respects the existing ``requires_grad`` on the tensors.
+    """
+
+    def cast_fn(x: torch.Tensor) -> torch.Tensor:
+        if not torch.is_floating_point(x):
+            return x
+        y = x.to(dtype)
+        # Explicitly copy over `requires_grad` since this runs inside
+        # `torch.no_grad()`
+        if x.is_leaf:
+            y.requires_grad = x.requires_grad
+        return y
+
+    with torch.no_grad():
+        return (_apply_to_tensors(cast_fn, args), _apply_to_tensors(cast_fn, kwargs))
+
+
 @no_type_check
 def _pre_backward_hook(
     state: _State,
@@ -813,49 +859,3 @@ def _clear_grads_if_needed(
     for handle in handles:
         if handle._use_orig_params:
             handle._clear_grads_if_needed()
-
-
-def _prepare_forward_inputs(
-    device: torch.device,
-    input_dtype: Optional[torch.dtype],
-    *args: Any,
-    **kwargs: Any,
-) -> Tuple[Any, Any]:
-    """
-    Prepares the forward inputs by moving them to ``device`` and casting them
-    to ``input_dtype`` if it is not ``None``.
-    """
-    # TODO: Do not use the side stream for tensor copies for now; investigate
-    # the perf with/without it.
-    # TODO: For mixed precision, move the inputs to the compute device and cast
-    # to reduced-precision in a single `to()` call.
-    args_tuple, kwargs_tuple = _to_kwargs(args, kwargs, device.index, False)
-    args = args_tuple[0]
-    kwargs = kwargs_tuple[0]
-    if input_dtype is not None:
-        args, kwargs = _cast_fp_inputs_to_dtype(input_dtype, *args, **kwargs)
-    return args, kwargs
-
-
-def _cast_fp_inputs_to_dtype(
-    dtype: torch.dtype,
-    *args: Any,
-    **kwargs: Any,
-) -> Tuple[Any, Any]:
-    """
-    Casts floating point tensors in ``args`` and ``kwargs`` to ``input_dtype``.
-    This respects the existing ``requires_grad`` on the tensors.
-    """
-
-    def cast_fn(x: torch.Tensor) -> torch.Tensor:
-        if not torch.is_floating_point(x):
-            return x
-        y = x.to(dtype)
-        # Explicitly copy over `requires_grad` since this runs inside
-        # `torch.no_grad()`
-        if x.is_leaf:
-            y.requires_grad = x.requires_grad
-        return y
-
-    with torch.no_grad():
-        return (_apply_to_tensors(cast_fn, args), _apply_to_tensors(cast_fn, kwargs))

From 1c9918421f9046c2c91f77d96eaff4dd9b3603ff Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 22:47:11 +0000
Subject: [PATCH 0442/1922] [FSDP] Remove `device` arg from `_cast_buffers()`
 (#87933)

This PR is easy. The `device` argument in `_cast_buffers()` is never used.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87933
Approved by: https://github.com/mrshenli
---
 .../distributed/fsdp/fully_sharded_data_parallel.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index f64fc18832946..9609888cb2c8a 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -668,14 +668,11 @@ def _low_precision_hook_enabled(self) -> bool:
 
     def _cast_buffers(
         self,
-        device: Optional[torch.device] = None,
         dtype: Optional[Dict[str, torch.dtype]] = None,
         memo: Optional[Set] = None,
         recurse: bool = True,
     ) -> None:
-        """Move all buffers to the given *device* and *dtype*.
-        If *device* is not given, then it will default to
-        ``self.compute_device``, otherwise buffer will be moved to ``device``.
+        """Move all buffers to the compute device and the given *dtype*.
         In the case of nested FSDP instances, we will respect the child instance's
         ``compute_device`` configuration.
         If *dtype* is given, it must be a mapping of buffer name to buffer dtype,
@@ -684,8 +681,6 @@ def _cast_buffers(
             in mixed precision training, the buffer will be cast to buffer_dtype,
             otherwise the buffer will not be cast.
         Args:
-            device (torch.device, Optional):
-                device to cast buffers to (defaults to compute_device)
             dtype: (Dict[str, torch.dtype], Optional):
                 Mapping of buffer name to their dtype to cast to.
             memo (Set, Optional):
@@ -703,15 +698,13 @@ def _cast_buffers(
                 and recurse
             ):
                 # Allow any child FSDP instances to handle their own buffers.
-                module._cast_buffers(
-                    device=device, dtype=dtype, memo=memo, recurse=recurse
-                )
+                module._cast_buffers(dtype=dtype, memo=memo, recurse=recurse)
             elif module not in memo:
                 memo.add(module)
                 for name, buf in module.named_buffers(recurse=False):
                     if buf is None:
                         continue
-                    buf = buf.to(device=device or self.compute_device)
+                    buf = buf.to(self.compute_device)
                     if name not in self._buffer_name_to_orig_dtype:
                         self._buffer_name_to_orig_dtype[name] = buf.dtype
                     # If given, cast buffer to the given dtype. This is used to

From ecd3f7606d26545fc02d6fa4cdc1f7ced46f345a Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 22:47:11 +0000
Subject: [PATCH 0443/1922] [FSDP] Rename `dtype` to `buffer_name_to_dtype`
 (#87934)

This PR is easy and only a rename. `dtype` does not convey that it is actually a `Dict[str, torch.dtype]` (when not `None`).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87934
Approved by: https://github.com/mrshenli
---
 .../fsdp/fully_sharded_data_parallel.py       | 23 +++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 9609888cb2c8a..f739fb622fd9e 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -668,20 +668,20 @@ def _low_precision_hook_enabled(self) -> bool:
 
     def _cast_buffers(
         self,
-        dtype: Optional[Dict[str, torch.dtype]] = None,
+        buffer_name_to_dtype: Optional[Dict[str, torch.dtype]] = None,
         memo: Optional[Set] = None,
         recurse: bool = True,
     ) -> None:
-        """Move all buffers to the compute device and the given *dtype*.
+        """Move all buffers to the compute device and cast to the given dtype.
         In the case of nested FSDP instances, we will respect the child instance's
         ``compute_device`` configuration.
-        If *dtype* is given, it must be a mapping of buffer name to buffer dtype,
+        If *buffer_name_to_dtype* is given, it must be a mapping of buffer name to buffer dtype,
             and this argument is currently only given to restore back to original
-            buffer types during checkpoint. If *dtype* is not given, and we are
+            buffer types during checkpoint. If *buffer_name_to_dtype* is not given, and we are
             in mixed precision training, the buffer will be cast to buffer_dtype,
             otherwise the buffer will not be cast.
         Args:
-            dtype: (Dict[str, torch.dtype], Optional):
+            buffer_name_to_dtype: (Dict[str, torch.dtype], Optional):
                 Mapping of buffer name to their dtype to cast to.
             memo (Set, Optional):
                 set of modules that have already been processed
@@ -698,7 +698,11 @@ def _cast_buffers(
                 and recurse
             ):
                 # Allow any child FSDP instances to handle their own buffers.
-                module._cast_buffers(dtype=dtype, memo=memo, recurse=recurse)
+                module._cast_buffers(
+                    buffer_name_to_dtype=buffer_name_to_dtype,
+                    memo=memo,
+                    recurse=recurse,
+                )
             elif module not in memo:
                 memo.add(module)
                 for name, buf in module.named_buffers(recurse=False):
@@ -716,8 +720,8 @@ def _cast_buffers(
                     if torch.is_floating_point(buf):
                         # We are restoring the original buffer type in
                         # preparation for checkpoint.
-                        if dtype:
-                            buf = buf.to(dtype=dtype[name])
+                        if buffer_name_to_dtype:
+                            buf = buf.to(dtype=buffer_name_to_dtype[name])
                         # Note that we don't pass in self.mixed_precision.buffer_dtype
                         # recursively into _cast_buffers, as we want to respect
                         # mp config for child FSDP instances.
@@ -1129,7 +1133,8 @@ def state_dict(self, *args, **kwargs):
                 # calls _lazy_init() which would cast the buffers.
                 if self._is_root and self._mixed_precision_enabled_for_buffers():
                     self._cast_buffers(
-                        dtype=self._buffer_name_to_orig_dtype, recurse=False
+                        buffer_name_to_dtype=self._buffer_name_to_orig_dtype,
+                        recurse=False,
                     )
                 state_dict = super().state_dict(*args, **kwargs)
 

From 7dff66360f375d25454312b1025ed024a07ae27a Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 22:47:12 +0000
Subject: [PATCH 0444/1922] [FSDP()][21/N] Refactor and fix `_cast_buffers()`
 (#87935)

This PR refactors and fixes `_cast_buffers()`.

**Before**
Buffers were not correctly cast back to their original dtypes for submodules when using buffer mixed precision.
- `_cast_buffers(recurse=False)` incorrectly casts all buffers, including those in submodules. This is because of this outer loop over `self.modules()`:
https://github.com/pytorch/pytorch/blob/c40033be162db0f94d37e7ccbd2a89d67f8b8e47/torch/distributed/fsdp/fully_sharded_data_parallel.py#L700
- There was a unit test that checked that buffers were cast as expected (`test_mixed_precision_e2e_full_shard()`). The unit test _coincidentally_ passed because all modules shared the same buffer name `"buffer"`. In `_cast_buffers()`, the `dict` mapping buffer name to original dtype is populated lazily (during `_lazy_init()`). However, the keys are unprefixed:
https://github.com/pytorch/pytorch/blob/c40033be162db0f94d37e7ccbd2a89d67f8b8e47/torch/distributed/fsdp/fully_sharded_data_parallel.py#L712-L717
- Thus, even though (1) `_cast_buffers(recurse=False)` was only called on the root and (2) `self._buffer_name_to_orig_dtype` had unprefixed names as keys, the unit test still passed because (1) `_cast_buffers()` still looped over all buffers despite `recurse=False` and (2) all submodules' buffers were named `"buffer"` and had the same original and low-precision dtypes and hence were cast correctly.

If we change each submodule to have its own distinct buffer name, then the unit test fails. This PR makes such a change to showcase the progression granted by this PR.

**After**
This PR separates `_cast_buffers()` into three methods: `_get_buffers_and_dtypes_for_computation()`, `_get_buffers_and_dtypes_for_checkpoint()`, and `_cast_buffers_to_dtype_and_device()`. This is to separate the different use cases (casting for computation and casting for checkpointing) and the corresponding code paths. Plus, the signature for `_cast_buffers_to_dtype_and_device()` makes it clear exactly what buffers are being cast and to what dtype.

Both `_get_...()` functions assume that they are called on the root only for now. This coincides with the construction of `_buffer_name_to_orig_dtype` in the FSDP constructor, which loops over all submodules. (This means that for non-root modules, their `_buffer_name_to_orig_dtype` is populated but not used.) The `dict`'s keys are clean since the buffer cast to original dtype happens in a `summon_full_params()` context, which cleans the names.

**Follow-Ups**
- We can try to move `_get_buffers_and_dtypes_for_checkpoint()` into `_state_dict_utils.py` in a follow-up.
- We may want to move to per-module buffer casting (i.e. do not have the root module cast for all submodules).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87935
Approved by: https://github.com/mrshenli
---
 .../fsdp/test_fsdp_mixed_precision.py         | 15 ++-
 torch/distributed/fsdp/_init_utils.py         | 10 +-
 torch/distributed/fsdp/_runtime_utils.py      | 84 +++++++++++++++++
 torch/distributed/fsdp/_state_dict_utils.py   | 11 ++-
 .../fsdp/fully_sharded_data_parallel.py       | 91 ++++---------------
 5 files changed, 127 insertions(+), 84 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py
index a65d0378a3a94..d03ed1179e0f4 100644
--- a/test/distributed/fsdp/test_fsdp_mixed_precision.py
+++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py
@@ -154,10 +154,13 @@ class LinearMixedPrecision(nn.Module):
     A linear module with extra checks for mixed precision training.
     """
 
-    def __init__(self, param_dtype):
+    def __init__(self, param_dtype, buffer_name="buffer"):
         super().__init__()
         self.lin = nn.Linear(10, 10, bias=False).to(param_dtype)
-        self.register_buffer("buffer", torch.randn((1, 2), dtype=_BUFFER_ORIG_DTYPE))
+        # Use a configurable buffer name to avoid all submodules sharing the
+        # same buffer name, which may hide prefixed vs. unprefixed name bugs
+        self.buffer_name = buffer_name
+        self.register_buffer(buffer_name, torch.randn((1, 2), dtype=_BUFFER_ORIG_DTYPE))
         self._orig_param_type = param_dtype
         self._orig_buffer_dtype = _BUFFER_ORIG_DTYPE
 
@@ -176,7 +179,7 @@ def forward(self, tup):
         )
         cls.assertEqual(inp.dtype, expected_param_type)
         # Buffer should be in specified precision as well.
-        cls.assertEqual(self.buffer.dtype, expected_buffer_type)
+        cls.assertEqual(getattr(self, self.buffer_name).dtype, expected_buffer_type)
 
         # In FSDP, self.params should point to the right type.
         num_active_fsdp = 0
@@ -234,9 +237,11 @@ def _get_simple_nested_model(self, param_dtype, *fsdp_args, **fsdp_kwargs):
         model = FSDP(
             nn.Sequential(
                 FSDP(
-                    LinearMixedPrecision(param_dtype).cuda(), *fsdp_args, **fsdp_kwargs
+                    LinearMixedPrecision(param_dtype, buffer_name="buffer0").cuda(),
+                    *fsdp_args,
+                    **fsdp_kwargs,
                 ),
-                LinearMixedPrecision(param_dtype).cuda(),
+                LinearMixedPrecision(param_dtype, buffer_name="buffer1").cuda(),
             ),
             *fsdp_args,
             **fsdp_kwargs,
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index cbbfff10fa5d0..76a5ac185703e 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -104,10 +104,14 @@ def _init_buffer_state(
     module: nn.Module,
 ) -> _State:
     state._buffer_names = _get_buffer_names(module)
-    # Save a mapping from fully prefixed buffer name to its original dtype
-    # since when buffer mixed precision is enabled, buffers are restored to
-    # their original dtype for model checkpointing
+    # Save a mapping from clean fully-qualified buffer name (starting from
+    # `module`) to its original dtype for restoring that dtype during model
+    # checkpointing when buffer mixed precision is enabled. The names should
+    # be clean since the casting happens in a `summon_full_params()` context.
     _buffer_name_to_orig_dtype: Dict[str, torch.dtype] = {}
+    for buffer_name, buffer in module.named_buffers():
+        buffer_name = clean_tensor_name(buffer_name)
+        _buffer_name_to_orig_dtype[buffer_name] = buffer.dtype
     state._buffer_name_to_orig_dtype = _buffer_name_to_orig_dtype
     return state
 
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index fd23debf381fd..9b6e980fe7305 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -859,3 +859,87 @@ def _clear_grads_if_needed(
     for handle in handles:
         if handle._use_orig_params:
             handle._clear_grads_if_needed()
+
+
+@no_type_check
+def _get_buffers_and_dtypes_for_computation(
+    state: _State,
+    root_module: nn.Module,
+) -> Tuple[List[torch.Tensor], List[Optional[torch.dtype]]]:
+    """
+    Returns all buffers in the module tree rooted at ``root_module`` and a
+    corresponding list of the buffer dtypes for computation. Each buffer dtype
+    is either ``None`` if buffer mixed precision is not enabled or the buffer
+    low precision dtype otherwise.
+    """
+    p_assert(state._is_root, "Expects the root to cast buffers")
+    buffers: List[torch.Tensor] = []
+    buffer_dtypes: List[Optional[torch.dtype]] = []
+    if _is_composable(state):
+        buffers = [
+            buffer for module in root_module.modules() for buffer in module.buffers()
+        ]
+        buffer_dtypes = [
+            state.mixed_precision.buffer_dtype for _ in range(len(buffers))
+        ]
+    else:
+        visited_buffers = set()
+        # Traverse the FSDP instances bottom-up so that we prefer the owning
+        # FSDP instance's mixed precision setting for each buffer
+        for fsdp_module in reversed(state.fsdp_modules(root_module)):
+            for buffer in fsdp_module.buffers():
+                if buffer in visited_buffers:
+                    continue
+                visited_buffers.add(buffer)
+                buffers.append(buffer)
+                buffer_dtypes.append(fsdp_module.mixed_precision.buffer_dtype)
+    assert len(buffers) == len(buffer_dtypes), f"{len(buffers)} {len(buffer_dtypes)}"
+    return buffers, buffer_dtypes
+
+
+@no_type_check
+def _get_buffers_and_dtypes_for_checkpoint(
+    state: _State,
+    root_module: nn.Module,
+) -> Tuple[List[torch.Tensor], List[torch.dtype]]:
+    """
+    Returns all buffers in the module tree rooted at ``root_module`` and a
+    corresponding list of the buffer dtypes for checkpointing. Each buffer
+    dtype is the original buffer dtype ignoring any buffer mixed precision.
+    """
+    p_assert(state._is_root, "Expects the root to cast buffers")
+    buffers: List[torch.Tensor] = []
+    buffer_dtypes: List[Optional[torch.dtype]] = []
+    for buffer_name, buffer in root_module.named_buffers():
+        p_assert(
+            buffer_name in state._buffer_name_to_orig_dtype,
+            f"{buffer_name} is missing from pre-computed dict on rank "
+            f"{state.rank}, which only has keys "
+            f"{state._buffer_name_to_orig_dtype.keys()}",
+        )
+        buffers.append(buffer)
+        buffer_dtypes.append(state._buffer_name_to_orig_dtype[buffer_name])
+    return buffers, buffer_dtypes
+
+
+def _cast_buffers_to_dtype_and_device(
+    buffers: List[torch.Tensor],
+    buffer_dtypes: List[Optional[torch.dtype]],
+    device: torch.device,
+) -> None:
+    """
+    Casts ``buffers`` to the dtypes given by ``buffer_dtypes`` and moves them
+    to ``device``. If an element in ``buffer_dtypes`` is ``None``, then the
+    corresponding buffer is only moved to ``device``.
+    """
+    p_assert(
+        buffer_dtypes is None or len(buffers) == len(buffer_dtypes),
+        f"Expects `buffers` and `buffer_dtypes` to have the same length if "
+        f"`buffer_dtypes` is specified but got {len(buffers)} and "
+        f"{len(buffer_dtypes)}",
+    )
+    for buffer, buffer_dtype in zip(buffers, buffer_dtypes):
+        if not torch.is_floating_point(buffer) or buffer_dtype is None:
+            buffer.data = buffer.to(device=device)
+        else:
+            buffer.data = buffer.to(device=device, dtype=buffer_dtype)
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index 9276c3cf62cc7..0169aa8f10eb2 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -17,6 +17,10 @@
     ShardedTensor,
 )
 from torch.distributed.fsdp._common_utils import clean_tensor_name
+from torch.distributed.fsdp._runtime_utils import (
+    _cast_buffers_to_dtype_and_device,
+    _get_buffers_and_dtypes_for_computation,
+)
 from torch.distributed.utils import _replace_by_prefix
 
 from ._fsdp_extensions import (
@@ -382,7 +386,12 @@ def _post_state_dict_hook(
     # during lazy_init() and stay at their mixed precision type before/after
     # forward/backward. As a result state_dict() should maintain this.
     if fsdp_module._is_root and fsdp_module._mixed_precision_enabled_for_buffers():
-        fsdp_module._cast_buffers(recurse=True)
+        buffers, buffer_dtypes = _get_buffers_and_dtypes_for_computation(
+            fsdp_module, fsdp_module
+        )
+        _cast_buffers_to_dtype_and_device(
+            buffers, buffer_dtypes, fsdp_module.compute_device
+        )
     return processed_state_dict
 
 
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index f739fb622fd9e..eb813bce69a45 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -16,7 +16,6 @@
     Iterator,
     List,
     Optional,
-    Set,
     Tuple,
     Union,
 )
@@ -52,8 +51,11 @@
     _init_streams,
 )
 from torch.distributed.fsdp._runtime_utils import (
+    _cast_buffers_to_dtype_and_device,
     _clear_grads_if_needed,
     _fsdp_root_pre_forward,
+    _get_buffers_and_dtypes_for_checkpoint,
+    _get_buffers_and_dtypes_for_computation,
     _post_forward,
     _pre_forward,
     _pre_forward_unshard,
@@ -666,70 +668,6 @@ def _low_precision_hook_enabled(self) -> bool:
             and self._communication_hook in LOW_PRECISION_HOOKS
         )
 
-    def _cast_buffers(
-        self,
-        buffer_name_to_dtype: Optional[Dict[str, torch.dtype]] = None,
-        memo: Optional[Set] = None,
-        recurse: bool = True,
-    ) -> None:
-        """Move all buffers to the compute device and cast to the given dtype.
-        In the case of nested FSDP instances, we will respect the child instance's
-        ``compute_device`` configuration.
-        If *buffer_name_to_dtype* is given, it must be a mapping of buffer name to buffer dtype,
-            and this argument is currently only given to restore back to original
-            buffer types during checkpoint. If *buffer_name_to_dtype* is not given, and we are
-            in mixed precision training, the buffer will be cast to buffer_dtype,
-            otherwise the buffer will not be cast.
-        Args:
-            buffer_name_to_dtype: (Dict[str, torch.dtype], Optional):
-                Mapping of buffer name to their dtype to cast to.
-            memo (Set, Optional):
-                set of modules that have already been processed
-            recurse (bool, Optional):
-                Whether to call _cast_buffers recursively on nested FSDP
-                instances (default is True).
-        """
-        if memo is None:
-            memo = set()
-        for module in self.modules():
-            if (
-                module is not self
-                and isinstance(module, FullyShardedDataParallel)
-                and recurse
-            ):
-                # Allow any child FSDP instances to handle their own buffers.
-                module._cast_buffers(
-                    buffer_name_to_dtype=buffer_name_to_dtype,
-                    memo=memo,
-                    recurse=recurse,
-                )
-            elif module not in memo:
-                memo.add(module)
-                for name, buf in module.named_buffers(recurse=False):
-                    if buf is None:
-                        continue
-                    buf = buf.to(self.compute_device)
-                    if name not in self._buffer_name_to_orig_dtype:
-                        self._buffer_name_to_orig_dtype[name] = buf.dtype
-                    # If given, cast buffer to the given dtype. This is used to
-                    # suppport mixed precision for buffers
-                    # (given by self.mixed_precision.buffer_dtype) and also used
-                    # to restore the buffer dtype to the original precision for
-                    # state_dict() calls.
-                    # Note that non-floating point buffers are not casted.
-                    if torch.is_floating_point(buf):
-                        # We are restoring the original buffer type in
-                        # preparation for checkpoint.
-                        if buffer_name_to_dtype:
-                            buf = buf.to(dtype=buffer_name_to_dtype[name])
-                        # Note that we don't pass in self.mixed_precision.buffer_dtype
-                        # recursively into _cast_buffers, as we want to respect
-                        # mp config for child FSDP instances.
-                        elif self._mixed_precision_enabled_for_buffers():
-                            buf = buf.to(self.mixed_precision.buffer_dtype)
-
-                    setattr(module, name, buf)
-
     def _reset_lazy_init(self) -> None:
         """
         Reset instance so :func:`_lazy_init` will run on the next forward.
@@ -763,7 +701,8 @@ def _lazy_init(self) -> None:
         self._is_root = True
         self._assert_state(TrainingState.IDLE)
         _init_streams(self)
-        self._cast_buffers(recurse=True)
+        buffers, buffer_dtypes = _get_buffers_and_dtypes_for_computation(self, self)
+        _cast_buffers_to_dtype_and_device(buffers, buffer_dtypes, self.compute_device)
         for handle in self._handles:
             self._init_param_attributes(handle)
         self._exec_order_data.init(self, self.process_group)
@@ -1125,16 +1064,18 @@ def state_dict(self, *args, **kwargs):
                 else contextlib.suppress()
             )
             with summon_ctx:
-                # Since buffers are not sharded and stay casted, restore them to their
-                # original user module specified types for checkpoint. We take care to
-                # recast in post_state_dict_hook for consistency with the fact that
-                # buffers stay casted after forward/backward. We must have the
-                # call here instead of above because _summon_full_params itself
-                # calls _lazy_init() which would cast the buffers.
+                # Since buffers stay in their low precision throughout runtime,
+                # we must explicitly restore them to their original dtypes for
+                # model checkpointing. We have the root module cast for all
+                # submodules.
+                # TODO: Investigate if this can and should be refactored into
+                # `summon_full_params()`.
                 if self._is_root and self._mixed_precision_enabled_for_buffers():
-                    self._cast_buffers(
-                        buffer_name_to_dtype=self._buffer_name_to_orig_dtype,
-                        recurse=False,
+                    buffers, buffer_dtypes = _get_buffers_and_dtypes_for_checkpoint(
+                        self, self
+                    )
+                    _cast_buffers_to_dtype_and_device(
+                        buffers, buffer_dtypes, self.compute_device
                     )
                 state_dict = super().state_dict(*args, **kwargs)
 

From f26382e1d1a50c17254af4fd4abdcee06dcd3963 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 1 Nov 2022 22:47:12 +0000
Subject: [PATCH 0445/1922] [FSDP()][23/N] Refactor handle attr initialization
 (#87938)

**`_init_param_attributes()` -> `init_flat_param_attributes()`**
We move `_init_param_attributes()` to `FlatParamHandle.init_flat_param_attributes()` (as already marked as to-do during previous refactoring).

**`_reset_lazy_init()`**
We no longer delete `_local_shard` from each `FlatParameter` in `_reset_lazy_init()`.

**Analysis**
Thus, the two semantic differences are that we remove the initial `if hasattr(p, "_local_shard")` early return in `_init_param_attributes()` and the `delattr(p, "_local_shard")` in `_reset_lazy_init()`.

This is safe because
- If we never call `_reset_lazy_init()`, then `init_flat_param_attributes()` is only called once. There is no opportunity for an early return.
- If we call `_reset_lazy_init()`, then `init_flat_param_attributes()` will be called again in the next `_lazy_init()`. However, since we removed the early return, all of the attributes initialized in `init_flat_param_attributes()` simply get re-initialized and override any existing attributes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87938
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_utils.py              |   2 +-
 torch/distributed/fsdp/flat_param.py          |  73 ++++++++++-
 .../fsdp/fully_sharded_data_parallel.py       | 118 +-----------------
 torch/testing/_internal/common_fsdp.py        |   4 +-
 4 files changed, 79 insertions(+), 118 deletions(-)

diff --git a/torch/distributed/fsdp/_utils.py b/torch/distributed/fsdp/_utils.py
index d3d8f91c61a42..5aae5e918c4a1 100644
--- a/torch/distributed/fsdp/_utils.py
+++ b/torch/distributed/fsdp/_utils.py
@@ -114,4 +114,4 @@ def p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> None:
         print(s)
         traceback.print_stack()
         if raise_assertion_error:
-            raise AssertionError
+            raise AssertionError(s)
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 50f3d5f0ef7eb..91e02cb9312ad 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -11,6 +11,7 @@
     Iterator,
     List,
     NamedTuple,
+    no_type_check,
     Optional,
     Sequence,
     Set,
@@ -124,7 +125,7 @@ class FlatParameter(nn.Parameter):
         parameter data is saved in ``self._local_shard``, and a new ``Tensor``
         ``self._full_param_padded`` is created, which is the all-gather
         destination and owns the unsharded parameter storage thereafter. (See
-        :meth:`FullyShardedDataParallel._init_param_attributes`.)
+        :meth:`FlatParamHandle.init_flat_param_attributes`.)
         - Throughout runtime, the parameter data changes storages as needed,
         e.g. to the sharded flattened parameter, reduced-precision sharded
         flattened parameter, or the unsharded flattened parameter.
@@ -278,6 +279,9 @@ def _init_metadata(
             self._tensors = None
         self._unpadded_unsharded_size = self.size()
         _set_fsdp_flattened(self)
+        # Tracks whether the `FlatParameter`'s post-backward hook has been
+        # called to modify the behavior of the post-backward callback
+        self._post_backward_called = False
 
 
 class FlatParamHandle:
@@ -678,6 +682,73 @@ def shard_metadata(
             self.flat_param._shard_param_offsets[:],  # type: ignore[attr-defined]
         )
 
+    @no_type_check
+    @torch.no_grad()
+    def init_flat_param_attributes(self) -> None:
+        """
+        This initializes some attributes on the handle's ``FlatParameter``.
+        This should be called during lazy initialization since it requires the
+        parameter to be on the compute device if not offloading to CPU and we
+        want to give users the chance to move the parameter appropriately after
+        the FSDP constructor.
+
+        For each tensor attribute on the ``FlatParameter``, see the unshard and
+        reshard methods in this class for the allocation and free pattern.
+        """
+        flat_param = self.flat_param
+        cpu_device = torch.device("cpu")
+        if self._config.offload_params:
+            p_assert(
+                flat_param.device == cpu_device,
+                "Expects the `FlatParameter` to be offloaded to CPU since CPU "
+                "offloading is enabled. You may be accidentally moving the "
+                f"model to {flat_param.device} after the FSDP constructor.",
+            )
+        flat_param._local_shard = flat_param.data
+        if self._config.offload_params:
+            # Pin the memory for faster H2D transfer
+            flat_param._local_shard = flat_param._local_shard.pin_memory()
+            # Pre-allocate the sharded gradient on CPU to enable non-blocking
+            # D2H transfer during the backward pass
+            flat_param._cpu_grad = torch.zeros_like(
+                flat_param._local_shard, device=cpu_device
+            ).pin_memory()
+        if self._config.low_prec_param_dtype is not None:
+            # For parameter mixed precision, we maintain a low precision
+            # sharded tensor on the compute device to be all-gathered (for
+            # sharded strategies) or directly used (for `NO_SHARD`) for
+            # computation.
+            flat_param._mp_shard = torch.zeros_like(
+                flat_param._local_shard,
+                device=self.device,
+                dtype=self._config.low_prec_param_dtype,
+            )
+            _free_storage(flat_param._mp_shard)
+        if self.uses_sharded_strategy:
+            # We maintain a padded unsharded tensor that serves as the
+            # all-gather destination and owns the original parameter storages.
+            unsharded_param_dtype = (
+                self._config.low_prec_param_dtype or flat_param.dtype
+            )  # use low precision if parameter mixed precision is enabled
+            padded_unsharded_numel = flat_param.numel() * self.world_size
+            flat_param._full_param_padded = torch.zeros(
+                padded_unsharded_numel,
+                device=self.device,
+                dtype=unsharded_param_dtype,
+            )
+            flat_param._padded_unsharded_size = flat_param._full_param_padded.size()
+            _free_storage(flat_param._full_param_padded)
+
+            if self._config.low_prec_param_dtype is not None:
+                # For parameter mixed precision, we maintain a full precision
+                # padded unsharded tensor for when we force full precision.
+                flat_param._full_prec_full_param_padded = torch.zeros(
+                    padded_unsharded_numel,
+                    device=self.device,
+                    dtype=flat_param.dtype,  # full precision
+                )
+                _free_storage(flat_param._full_prec_full_param_padded)
+
     ###################
     # UNSHARD/RESHARD #
     ###################
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index eb813bce69a45..cd7a1a566b46e 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -91,7 +91,7 @@
     _post_state_dict_hook,
     _pre_load_state_dict_hook,
 )
-from ._utils import _free_storage, p_assert
+from ._utils import p_assert
 from .flat_param import FlatParameter, FlatParamHandle, HandleShardingStrategy
 from .wrap import ParamExecOrderWrapPolicy
 
@@ -673,12 +673,6 @@ def _reset_lazy_init(self) -> None:
         Reset instance so :func:`_lazy_init` will run on the next forward.
         """
         self._is_root: Optional[bool] = None
-        for p in self.params:
-            if hasattr(p, "_local_shard"):
-                # We only need to `del` `_local_shard` because
-                # `_init_param_attributes()` gates the logic based on its
-                # existence (and not any of the other attributes).
-                del p._local_shard  # type: ignore[attr-defined]
 
     def _lazy_init(self) -> None:
         """
@@ -704,7 +698,7 @@ def _lazy_init(self) -> None:
         buffers, buffer_dtypes = _get_buffers_and_dtypes_for_computation(self, self)
         _cast_buffers_to_dtype_and_device(buffers, buffer_dtypes, self.compute_device)
         for handle in self._handles:
-            self._init_param_attributes(handle)
+            handle.init_flat_param_attributes()
         self._exec_order_data.init(self, self.process_group)
         # Initialize non-root FSDP instances and share attributes from the root
         # to non-root instances
@@ -731,7 +725,7 @@ def _lazy_init(self) -> None:
                     self._needs_pre_backward_unshard
                 )
                 for handle in fsdp_module._handles:
-                    fsdp_module._init_param_attributes(handle)
+                    handle.init_flat_param_attributes()
         if inconsistent_limit_all_gathers:
             warnings.warn(
                 "Found inconsistent `limit_all_gathers` values across FSDP "
@@ -739,112 +733,6 @@ def _lazy_init(self) -> None:
                 f"of {self.limit_all_gathers} for all instances."
             )
 
-    # TODO (awgu): Move this to the `FlatParamHandle` class later
-    @torch.no_grad()
-    def _init_param_attributes(self, handle: FlatParamHandle) -> None:
-        """
-        We manage several attributes on each Parameter instance.
-        A few attributes are set here:
-            ``_local_shard``: a single shard of the parameter. This is needed to
-                recover the shard after rebuilding full parameter in forward
-                and backward.
-            ``_full_param_padded``: the full weight (padded to be evenly
-                divisible by ``world_size``), used for computation in the
-                forward and backward pass. It is initialized with the
-                appropriate size and then has its storage freed. This will be
-                resized in place and only materialized (via all-gather) as needed.
-        Another attribute is set by :func:`_register_post_backward_hooks`:
-            ``_post_backward_hook_state``: it holds the parameter's AccumulateGrad object
-                and the registered post hook handle.
-        """
-        p = handle.flat_param
-        # If _local_shard has been set in the first lazy init and
-        # current parameter is pointed to _local_shard, no need to
-        # set the _local_shard again.
-        if hasattr(p, "_local_shard"):
-            # If CPU offloading, p._local_shard should have been placed on CPU
-            # during its first lazy construction.
-            if self.cpu_offload.offload_params:
-                assert p._local_shard.device == torch.device(  # type: ignore[attr-defined]
-                    "cpu"
-                ), (
-                    "Expected p._local_shard to be on CPU, "  # type: ignore[attr-defined]
-                    f"but it's on {p._local_shard.device}"  # type: ignore[attr-defined]
-                )
-            return
-
-        # A single shard of the parameters. Also makes p._local_shard to be on
-        # CPU if we are CPU offloading, since p.data would be on CPU during
-        # init.
-        if self.cpu_offload.offload_params:
-            assert p.device == torch.device("cpu"), (
-                "Expected param to be on CPU when cpu_offloading is enabled. "
-                "If CPU offloading is enabled correctly, you may be "
-                "accidentally moving the model to CUDA after FSDP initialization."
-            )
-        p._local_shard = p.data  # type: ignore[attr-defined]
-        # If CPU offloading, pin the memory to enable faster CPU -> GPU device
-        # transfer.
-        if self.cpu_offload.offload_params:
-            assert p._local_shard.device == torch.device("cpu")  # type: ignore[attr-defined]
-            p._local_shard = p._local_shard.pin_memory()  # type: ignore[attr-defined]
-            # When offloading parameters, also move the grad shard to CPU during
-            # backward pass. In this case, it's important to pre-allocate the
-            # CPU grad shard in pinned memory so that we can do a non-blocking
-            # transfer.
-            p._cpu_grad = torch.zeros_like(  # type: ignore[attr-defined]
-                p, device=torch.device("cpu")
-            ).pin_memory()
-
-        # If mixed_precision, maintain reduced precision param shard on
-        # compute_device for computation in fwd/bwd. We resize storage to 0 here
-        # and rematerialize before building the full param when needed. After
-        # fwd/bwd, it is freed and we only hold on to the full precision shard.
-        # As a result, this reduced precision shard is not allocated if we are
-        # not in the forward/backward pass.
-        if self._mixed_precision_enabled_for_params():
-            p._mp_shard = torch.zeros_like(
-                p._local_shard,
-                device=self.compute_device,
-                dtype=self.mixed_precision.param_dtype,
-            )
-            _free_storage(p._mp_shard)
-
-        # We also maintain a full-sized parameter of type self.compute_dtype.
-        # We resize the storage to size 0 at init (here) and only materialize
-        # as needed. The storage may contain padding elements so that it is
-        # evenly divisible by world_size, although these padding elements will
-        # be removed before the relevant computation.
-        if handle.uses_sharded_strategy:  # type: ignore[attr-defined]
-            # We set p._full_param_padded's dtype to the desired parameter dtype
-            # in the case of mixed precision. This is so that when we all_gather
-            # into full_param_padded it can occur without issues and result in
-            # full_param_padded having the expected param_dtype.
-            full_param_dtype = (
-                p.dtype
-                if not self._mixed_precision_enabled_for_params()
-                else self.mixed_precision.param_dtype
-            )
-            p._full_param_padded = torch.zeros(  # type: ignore[attr-defined]
-                p.numel() * self.world_size,
-                device=self.compute_device,
-                dtype=full_param_dtype,
-            )
-            p._padded_unsharded_size = p._full_param_padded.size()  # type: ignore[attr-defined]
-            _free_storage(p._full_param_padded)  # type: ignore[attr-defined]
-
-            if self._mixed_precision_enabled_for_params():
-                p._full_prec_full_param_padded = torch.zeros(  # type: ignore[attr-defined]
-                    p.numel() * self.world_size,
-                    device=self.compute_device,
-                    dtype=p.dtype,  # full precision
-                )
-                _free_storage(p._full_prec_full_param_padded)
-
-        # Track whether the `FlatParameter`'s post-backward hook has been
-        # called for validation in the post-backward callback
-        p._post_backward_called = False
-
     @staticmethod
     def set_state_dict_type(
         module: nn.Module,
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index f54a3abeb9191..616c1c90e1a67 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -1004,7 +1004,9 @@ def _test_fsdp_parity(
             for param in fsdp_model.parameters():
                 self.assertEqual(param.device, cpu_device)
         context = (
-            self.assertRaisesRegex(AssertionError, "Expected param to be on CPU")
+            self.assertRaisesRegex(
+                AssertionError, "Expects the `FlatParameter` to be offloaded to CPU"
+            )
             if expects_device_error
             else suppress()
         )

From f3e3bcb8b9b75c7ef1984190adeb2af596859c18 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@fb.com>
Date: Tue, 1 Nov 2022 21:39:03 -0700
Subject: [PATCH 0446/1922] [profiler] Add Linux Perf support (#87866)

* Add support to use Linux kernel perf subsystem via the profiler.
* For now the perf configurability is quite limited to just event names. Threading etc. to come later.
* Given we want to support variety of different cpu types, number of events list (in addition to the standard set of events) is also limited.
* Rather than failing with unsupported feature for non-Linux platforms, it returns zeros for all the event counts.
* For now, max event counts is capped at 4, time multiplexing is not allowed.
* Threadpool recreate hack is restricted to mobile only - need to add better support for threading in general

Differential Revision: [D40238033](https://our.internmc.facebook.com/intern/diff/D40238033/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D40238033/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87866
Approved by: https://github.com/SS-JIA
---
 build_variables.bzl            |   1 +
 torch/csrc/profiler/perf-inl.h |  60 +++++++++++
 torch/csrc/profiler/perf.cpp   | 184 +++++++++++++++++++++++++++++++++
 torch/csrc/profiler/perf.h     | 103 ++++++++++++++++++
 4 files changed, 348 insertions(+)
 create mode 100644 torch/csrc/profiler/perf-inl.h
 create mode 100644 torch/csrc/profiler/perf.cpp
 create mode 100644 torch/csrc/profiler/perf.h

diff --git a/build_variables.bzl b/build_variables.bzl
index 1672ccb8a3b38..49b0734a7f1c6 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -143,6 +143,7 @@ libtorch_profiler_sources = [
     "torch/csrc/profiler/standalone/itt_observer.cpp",
     "torch/csrc/profiler/standalone/nvtx_observer.cpp",
     "torch/csrc/profiler/stubs/base.cpp",
+    "torch/csrc/profiler/perf.cpp",
     "torch/csrc/monitor/counters.cpp",
     "torch/csrc/monitor/events.cpp",
 ]
diff --git a/torch/csrc/profiler/perf-inl.h b/torch/csrc/profiler/perf-inl.h
new file mode 100644
index 0000000000000..ccc074df027f6
--- /dev/null
+++ b/torch/csrc/profiler/perf-inl.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#if defined(__ANDROID__) || defined(__linux__)
+
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+
+#include <linux/perf_event.h>
+
+#endif /* __ANDROID__ || __linux__ */
+
+#include <torch/csrc/profiler/perf.h>
+
+namespace torch {
+namespace profiler {
+namespace impl {
+namespace linux_perf {
+
+/*
+ * PerfEvent
+ * ---------
+ */
+
+inline void PerfEvent::Disable() const {
+#if defined(__ANDROID__) || defined(__linux__)
+  ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0);
+#endif /* __ANDROID__ || __linux__ */
+}
+
+inline void PerfEvent::Enable() const {
+#if defined(__ANDROID__) || defined(__linux__)
+  ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0);
+#endif /* __ANDROID__ || __linux__ */
+}
+
+inline void PerfEvent::Reset() const {
+#if defined(__ANDROID__) || defined(__linux__)
+  ioctl(fd_, PERF_EVENT_IOC_RESET, 0);
+#endif /* __ANDROID__ || __linux__ */
+}
+
+/*
+ * PerfProfiler
+ * ------------
+ */
+
+inline uint64_t PerfProfiler::CalcDelta(uint64_t start, uint64_t end) const {
+  if (end < start) { // overflow
+    return end + (std::numeric_limits<uint64_t>::max() - start);
+  }
+  // not possible to wrap around start for a 64b cycle counter
+  return end - start;
+}
+
+} // namespace linux_perf
+} // namespace impl
+} // namespace profiler
+} // namespace torch
diff --git a/torch/csrc/profiler/perf.cpp b/torch/csrc/profiler/perf.cpp
new file mode 100644
index 0000000000000..d369de98e76a3
--- /dev/null
+++ b/torch/csrc/profiler/perf.cpp
@@ -0,0 +1,184 @@
+#include <unordered_set>
+
+#include <torch/csrc/profiler/perf-inl.h>
+#include <torch/csrc/profiler/perf.h>
+
+namespace torch {
+namespace profiler {
+namespace impl {
+
+namespace linux_perf {
+
+#if defined(__ANDROID__) || defined(__linux__)
+
+/*
+ * PerfEvent
+ * ---------
+ */
+
+/*
+ * Syscall wrapper for perf_event_open(2)
+ */
+inline long perf_event_open(
+    struct perf_event_attr* hw_event,
+    pid_t pid,
+    int cpu,
+    int group_fd,
+    unsigned long flags) {
+  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+// TODO sync with Kineto level abstract events in profiler/events.h
+static const std::unordered_map<
+    std::string,
+    std::pair<perf_type_id, /* perf event type */ uint32_t>>
+    EventTable{
+        {"cycles",
+         std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)},
+        {"instructions",
+         std::make_pair(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS)},
+
+        // Non Standard events for testing
+        {"pagefaults",
+         std::make_pair(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS)},
+        {"backend-stall-cycles",
+         std::make_pair(
+             PERF_TYPE_HARDWARE,
+             PERF_COUNT_HW_STALLED_CYCLES_BACKEND)},
+        {"frontend-stall-cycles",
+         std::make_pair(
+             PERF_TYPE_HARDWARE,
+             PERF_COUNT_HW_STALLED_CYCLES_FRONTEND)}};
+
+PerfEvent::~PerfEvent() {
+  if (fd_ > -1) {
+    close(fd_);
+  }
+  fd_ = -1; // poison
+}
+
+void PerfEvent::Init() {
+  TORCH_CHECK(!name_.empty(), "Invalid profiler event name");
+
+  auto const it = EventTable.find(name_);
+  if (it == EventTable.end()) {
+    TORCH_CHECK(false, "Unsupported profiler event name: ", name_);
+  }
+
+  struct perf_event_attr attr {};
+  memset(&attr, 0, sizeof(attr));
+
+  attr.size = sizeof(perf_event_attr);
+  attr.type = it->second.first;
+  attr.config = it->second.second;
+  attr.disabled = 1;
+  attr.inherit = 1;
+  attr.exclude_kernel = 1; // TBD
+  attr.exclude_hv = 1;
+  /*
+   * These can be used to calculate estimated totals if the PMU is overcommitted
+   * and multiplexing is happening
+   */
+  attr.read_format =
+      PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+  pid_t pid = getpid(); // this pid
+  int cpu = -1; // all cpus
+  int group_fd = -1;
+  unsigned long flags = 0;
+
+  fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags));
+  if (fd_ == -1) {
+    TORCH_CHECK(
+        false, "perf_event_open() failed, error: ", std::strerror(errno));
+  }
+  Reset();
+}
+
+uint64_t PerfEvent::ReadCounter() const {
+  PerfCounter counter{};
+  long n = read(fd_, &counter, sizeof(PerfCounter));
+  TORCH_CHECK(
+      n == sizeof(counter),
+      "Read failed for Perf event fd, event : ",
+      name_,
+      ", error: ",
+      std::strerror(errno));
+  TORCH_CHECK(
+      counter.time_enabled == counter.time_running,
+      "Hardware performance counter time multiplexing is not handled yet",
+      ", name: ",
+      name_,
+      ", enabled: ",
+      counter.time_enabled,
+      ", running: ",
+      counter.time_running);
+  return counter.value;
+}
+
+#else /* __ANDROID__ || __linux__ */
+/*
+ * Shim class for unsupported platforms - this will always return 0 counter
+ * value
+ */
+
+PerfEvent::~PerfEvent(){};
+
+void PerfEvent::Init(){};
+
+uint64_t PerfEvent::ReadCounter() const {
+  return 0;
+};
+
+#endif /* __ANDROID__ || __linux__ */
+
+/*
+ * PerfProfiler
+ * ------------
+ */
+
+void PerfProfiler::Configure(std::vector<std::string>& event_names) {
+  TORCH_CHECK(
+      event_names.size() <= MAX_EVENTS,
+      "Too many events to configure, configured: ",
+      event_names.size(),
+      ", max allowed:",
+      MAX_EVENTS);
+  std::unordered_set<std::string> s(event_names.begin(), event_names.end());
+  TORCH_CHECK(
+      s.size() == event_names.size(), "Duplicate event names are not allowed!")
+  for (auto name : event_names) {
+    events_.emplace_back(name);
+    events_.back().Init();
+  }
+  start_values_.resize(events_.size(), 0);
+
+  // TODO
+  // Reset pthreadpool here to make sure we can attach to new children
+  // threads
+}
+
+void PerfProfiler::Enable() {
+  TORCH_CHECK(!is_enabled_, "Nested perf event counting is not supported yet");
+  for (int i = 0; i < events_.size(); ++i) {
+    start_values_[i] = events_[i].ReadCounter();
+    events_[i].Enable();
+  }
+  is_enabled_ = true;
+}
+
+void PerfProfiler::Disable(perf_counters_t& vals) {
+  TORCH_CHECK(
+      vals.size() == events_.size(),
+      "Can not fit all perf counters in the supplied container");
+  TORCH_CHECK(is_enabled_, "Perf Profiler is not enabled");
+  for (int i = 0; i < events_.size(); ++i) {
+    events_[i].Disable();
+    vals[i] = CalcDelta(start_values_[i], events_[i].ReadCounter());
+  }
+  is_enabled_ = false;
+}
+} // namespace linux_perf
+} // namespace impl
+} // namespace profiler
+} // namespace torch
diff --git a/torch/csrc/profiler/perf.h b/torch/csrc/profiler/perf.h
new file mode 100644
index 0000000000000..941f780f52814
--- /dev/null
+++ b/torch/csrc/profiler/perf.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <torch/csrc/profiler/events.h>
+
+#include <c10/util/Exception.h>
+
+namespace torch {
+namespace profiler {
+namespace impl {
+namespace linux_perf {
+
+/*
+ * Maximum number of events supported
+ * This stems from the hardware limitation on CPU performance counters, and the
+ * fact that we don't support time multiplexing just yet.
+ * Time multiplexing involves scaling the counter values proportional to
+ * the enabled and running time or running the workload multiple times.
+ */
+constexpr uint8_t MAX_EVENTS = 4;
+
+struct PerfCounter {
+  uint64_t value; /* The value of the event */
+  uint64_t time_enabled; /* for TIME_ENABLED */
+  uint64_t time_running; /* for TIME_RUNNING */
+};
+
+/*
+ * Basic perf event handler for Android and Linux
+ */
+class PerfEvent {
+ public:
+  explicit PerfEvent(std::string& name) : name_(name), fd_(-1) {}
+
+  PerfEvent& operator=(PerfEvent&& other) noexcept {
+    if (this != &other) {
+      fd_ = other.fd_;
+      other.fd_ = -1;
+      name_ = std::move(other.name_);
+    }
+    return *this;
+  }
+
+  PerfEvent(PerfEvent&& other) noexcept {
+    *this = std::move(other);
+  }
+
+  ~PerfEvent();
+
+  /* Setup perf events with the Linux Kernel, attaches perf to this process
+   * using perf_event_open(2) */
+  void Init();
+
+  /* Stop incrementing hardware counters for this event */
+  void Disable() const;
+
+  /* Start counting hardware event from this point on */
+  void Enable() const;
+
+  /* Zero out the counts for this event */
+  void Reset() const;
+
+  /* Returns PerfCounter values for this event from kernel, on non supported
+   * platforms this always returns zero */
+  uint64_t ReadCounter() const;
+
+ private:
+  /* Name of the event */
+  std::string name_;
+
+  int fd_ = -1;
+};
+
+class PerfProfiler {
+ public:
+  /* Configure all the events and track them as individual PerfEvent */
+  void Configure(std::vector<std::string>& event_names);
+
+  /* Enable events counting from here */
+  void Enable();
+
+  /* Disable counting and fill in the caller supplied container with delta
+   * calculated from the start count values since last Enable() */
+  void Disable(perf_counters_t&);
+
+ private:
+  uint64_t CalcDelta(uint64_t start, uint64_t end) const;
+
+  bool is_enabled_{false};
+  std::vector<PerfEvent> events_;
+  perf_counters_t start_values_;
+};
+} // namespace linux_perf
+} // namespace impl
+} // namespace profiler
+} // namespace torch

From 5c7f28bd5232e8605b6480d81ef5a405b9c4e5d5 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 2 Nov 2022 11:25:04 +0100
Subject: [PATCH 0447/1922] prepare removal of deprecated functionality in
 torch.testing (#87969)

_Redo of #86586 with all BC breaking changes granularly placed into separate commits._

---

Per title. Deprecation happened on Feb 25, 2022 in c6f1bbc0ac33be0c8ad9956e3fc15e78ddb6cb95, which made it into the 1.12 release. Since it is now 245 days later and the next release will be 1.14, the removals later in the stack comply with the [BC policy](https://github.com/pytorch/pytorch/wiki/PyTorch's-Python-Frontend-Backward-and-Forward-Compatibility-Policy#minimizing-the-disruption-of-bc-breaking-changes).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87969
Approved by: https://github.com/mruberry
---
 caffe2/python/operator_test/_utils.py         | 50 ++++++++++++++
 .../operator_test/layer_norm_op_test.py       | 30 ++++-----
 .../operator_test/torch_integration_test.py   | 66 ++++++++++---------
 docs/source/fx.rst                            |  4 +-
 functorch/benchmarks/operator_authoring.py    |  8 +--
 functorch/benchmarks/pointwise_scorecard.py   |  4 +-
 functorch/examples/compilation/fuse_module.py |  4 +-
 .../ddp_comm_hooks/test_ddp_hooks.py          | 10 +--
 test/distributed/fsdp/test_fsdp_misc.py       |  2 +-
 test/distributions/test_distributions.py      |  2 +-
 test/functorch/functorch_additional_op_db.py  |  3 +-
 test/fx/test_fx_param_shape_control_flow.py   |  8 +--
 test/jit/test_misc.py                         |  2 +-
 test/mobile/test_lite_script_type.py          | 10 +--
 .../core/test_quantized_tensor.py             |  2 +-
 .../quantization/core/test_workflow_module.py | 32 ++++-----
 test/quantization/core/test_workflow_ops.py   |  8 +--
 test/quantization/fx/test_quantize_fx.py      |  6 +-
 test/test_fx.py                               | 18 ++---
 test/test_fx_experimental.py                  |  4 +-
 test/test_jit_autocast.py                     |  4 +-
 test/test_jit_fuser_te.py                     |  4 +-
 test/test_module_init.py                      |  2 +-
 test/test_mps.py                              | 24 +++----
 test/test_nn.py                               |  6 +-
 test/test_optim.py                            |  2 +-
 test/test_torch.py                            | 13 ++--
 torch/fx/OVERVIEW.md                          |  2 +-
 torch/fx/interpreter.py                       |  4 +-
 torch/jit/_freeze.py                          |  4 +-
 torch/testing/_internal/common_fsdp.py        |  4 +-
 .../_internal/opinfo/definitions/signal.py    |  2 +-
 32 files changed, 198 insertions(+), 146 deletions(-)
 create mode 100644 caffe2/python/operator_test/_utils.py

diff --git a/caffe2/python/operator_test/_utils.py b/caffe2/python/operator_test/_utils.py
new file mode 100644
index 0000000000000..3ee1def89e715
--- /dev/null
+++ b/caffe2/python/operator_test/_utils.py
@@ -0,0 +1,50 @@
+"""
+This file only exists since `torch.testing.assert_allclose` is deprecated, but used extensively throughout the tests in
+this package. The replacement `torch.testing.assert_close` doesn't support one feature that is needed here: comparison
+between numpy arrays and torch tensors. See https://github.com/pytorch/pytorch/issues/61844 for the reasoning why this
+was removed.
+"""
+
+import torch
+from typing import Tuple, Any, Optional
+
+_DTYPE_PRECISIONS = {
+    torch.float16: (1e-3, 1e-3),
+    torch.float32: (1e-4, 1e-5),
+    torch.float64: (1e-5, 1e-8),
+}
+
+
+def _get_default_rtol_and_atol(actual: torch.Tensor, expected: torch.Tensor) -> Tuple[float, float]:
+    actual_rtol, actual_atol = _DTYPE_PRECISIONS.get(actual.dtype, (0.0, 0.0))
+    expected_rtol, expected_atol = _DTYPE_PRECISIONS.get(expected.dtype, (0.0, 0.0))
+    return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol)
+
+
+def assert_allclose(
+    actual: Any,
+    expected: Any,
+    rtol: Optional[float] = None,
+    atol: Optional[float] = None,
+    equal_nan: bool = True,
+    msg: str = "",
+) -> None:
+    if not isinstance(actual, torch.Tensor):
+        actual = torch.tensor(actual)
+    if not isinstance(expected, torch.Tensor):
+        expected = torch.tensor(expected, dtype=actual.dtype)
+
+    if rtol is None and atol is None:
+        rtol, atol = _get_default_rtol_and_atol(actual, expected)
+
+    torch.testing.assert_close(
+        actual,
+        expected,
+        rtol=rtol,
+        atol=atol,
+        equal_nan=equal_nan,
+        check_device=True,
+        check_dtype=False,
+        check_stride=False,
+        msg=msg or None,
+    )
\ No newline at end of file
diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py
index 32a2511e3e8e3..31ba78be0c19f 100644
--- a/caffe2/python/operator_test/layer_norm_op_test.py
+++ b/caffe2/python/operator_test/layer_norm_op_test.py
@@ -18,6 +18,8 @@
 
 import unittest
 
+from ._utils import assert_allclose
+
 
 def _layer_norm_ref(axis, epsilon, X):
     left = int(np.prod(X.shape[:axis]))
@@ -254,10 +256,9 @@ def test_layer_norm_op_c10_preallocated_outputs(
         actual_mean = self.ws.fetch_blob('mean')
         actual_std = self.ws.fetch_blob('std')
 
-        torch.testing.assert_allclose(
-            expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
-        torch.testing.assert_allclose(expected_mean, actual_mean)
-        torch.testing.assert_allclose(expected_std, actual_std)
+        assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
+        assert_allclose(expected_mean, actual_mean)
+        assert_allclose(expected_std, actual_std)
 
     @given(X=hu.tensor(min_dim=2),
            eps=st.floats(1e-5, 1e-3),
@@ -280,10 +281,9 @@ def test_layer_norm_op_pytorch(self, X, eps, elementwise_affine, gc, dc):
             actual_norm, actual_mean, actual_std = torch.ops._caffe2.LayerNorm(
                 torch.tensor(X), None, None, axis, eps)
 
-        torch.testing.assert_allclose(
-            expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
-        torch.testing.assert_allclose(expected_mean, actual_mean)
-        torch.testing.assert_allclose(expected_std, actual_std)
+        assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
+        assert_allclose(expected_mean, actual_mean)
+        assert_allclose(expected_std, actual_std)
 
     # Test case is using workspace.has_cuda_support and not
     # workspace.has_gpu_support to exclude it from HIP because tensor interop
@@ -313,10 +313,9 @@ def test_layer_norm_op_pytorch_cuda(self, X, eps, elementwise_affine):
             actual_norm, actual_mean, actual_std = torch.ops._caffe2.LayerNorm(
                 torch.tensor(X).cuda(), None, None, axis, eps)
 
-        torch.testing.assert_allclose(
-            expected_norm, actual_norm.cpu(), rtol=1e-4, atol=1e-4)
-        torch.testing.assert_allclose(expected_mean, actual_mean.cpu())
-        torch.testing.assert_allclose(expected_std, actual_std.cpu())
+        assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
+        assert_allclose(expected_mean, actual_mean)
+        assert_allclose(expected_std, actual_std)
 
     @given(X=hu.tensor(min_dim=2),
            eps=st.floats(1e-5, 1e-3),
@@ -352,10 +351,9 @@ def jit_layer_norm(
             actual_norm, actual_mean, actual_std = jit_layer_norm(
                 torch.tensor(X), None, None, axis, eps, elementwise_affine)
 
-        torch.testing.assert_allclose(
-            expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
-        torch.testing.assert_allclose(expected_mean, actual_mean)
-        torch.testing.assert_allclose(expected_std, actual_std)
+        assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4)
+        assert_allclose(expected_mean, actual_mean)
+        assert_allclose(expected_std, actual_std)
 
     @given(X=hu.tensor(min_dim=2), **hu.gcs)
     def test_layer_norm_brew_wrapper(self, X, gc, dc):
diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py
index f99a61688de6e..d143e0193dfd7 100644
--- a/caffe2/python/operator_test/torch_integration_test.py
+++ b/caffe2/python/operator_test/torch_integration_test.py
@@ -11,6 +11,8 @@
 from hypothesis import given, settings
 from scipy.stats import norm
 
+from ._utils import assert_allclose
+
 
 def generate_rois(roi_counts, im_dims):
     assert len(roi_counts) == len(im_dims)
@@ -172,7 +174,7 @@ def bbox_transform_ref():
             legacy_plus_one=True,
         )
 
-        torch.testing.assert_allclose(box_out, a)
+        assert_allclose(box_out, a)
 
     @given(
         roi_counts=st.lists(st.integers(0, 5), min_size=1, max_size=10),
@@ -268,7 +270,7 @@ def box_with_nms_limit_ref():
         )
 
         for o, o_ref in zip(outputs, output_refs):
-            torch.testing.assert_allclose(o, o_ref)
+            assert_allclose(o, o_ref)
 
     @given(
         dim_1=st.integers(min_value=10, max_value=10),
@@ -314,7 +316,7 @@ def sparse_to_dense_mask_ref(return_presence_mask=False):
             mask=mask,
         )
 
-        torch.testing.assert_allclose(output, a)
+        assert_allclose(output, a)
 
         # Testing return_presence_mask = True
         output, presence_mask = sparse_to_dense_mask_ref(return_presence_mask=True)
@@ -330,8 +332,8 @@ def sparse_to_dense_mask_ref(return_presence_mask=False):
             return_presence_mask=True,
         )
 
-        torch.testing.assert_allclose(output, a)
-        torch.testing.assert_allclose(presence_mask, b)
+        assert_allclose(output, a)
+        assert_allclose(presence_mask, b)
 
     @given(
         A=st.integers(min_value=4, max_value=4),
@@ -382,8 +384,8 @@ def generate_proposals_ref():
             1.0,
             legacy_plus_one=True,
         )
-        torch.testing.assert_allclose(rois, a)
-        torch.testing.assert_allclose(rois_probs, b)
+        assert_allclose(rois, a)
+        assert_allclose(rois_probs, b)
 
     @given(
         bsz=st.integers(1, 5),
@@ -461,9 +463,9 @@ def inference_lstm_ref():
         a, b, c = torch.ops._caffe2.InferenceLSTM(
             lstm_in, num_layers, has_biases, batch_first, is_bidirectional
         )
-        torch.testing.assert_allclose(output, a)
-        torch.testing.assert_allclose(hidden, b)
-        torch.testing.assert_allclose(cell, c)
+        assert_allclose(output, a)
+        assert_allclose(hidden, b)
+        assert_allclose(cell, c)
 
     # Test case is using workspace.has_cuda_support and not workspace.has_gpu_support
     # to exclude it from HIP because tensor interop doesn't work for HIP tensors yet
@@ -517,8 +519,8 @@ def generate_proposals_ref():
             1.0,
             legacy_plus_one=True,
         )
-        torch.testing.assert_allclose(rois, a.cpu())
-        torch.testing.assert_allclose(rois_probs, b.cpu())
+        assert_allclose(rois, a.cpu())
+        assert_allclose(rois_probs, b.cpu())
 
     @given(
         N=st.integers(min_value=1, max_value=2),
@@ -567,7 +569,7 @@ def roi_align_ref(_feature, _rois):
             sampling_ratio=0,
             aligned=False,
         )
-        torch.testing.assert_allclose(roi_feature_ref, roi_feature.cpu())
+        assert_allclose(roi_feature_ref, roi_feature.cpu())
 
     def test_roi_align_cpu(self):
         self._test_roi_align(device="cpu")
@@ -624,7 +626,7 @@ def roi_align_ref(_feature, _rois):
             sampling_ratio=0,
             aligned=False,
         )
-        torch.testing.assert_allclose(roi_feature_ref, roi_feature.cpu())
+        assert_allclose(roi_feature_ref, roi_feature.cpu())
 
     def test_roi_align_rotated_cpu(self):
         self._test_roi_align_rotated(device="cpu")
@@ -674,9 +676,9 @@ def test_collect_and_distribute_fpn_rpn_proposals_op(self, roi_counts):
         rois_idx_restore_int32 = fpn_outputs[-1]
 
         # [rois] + fpn_outputs should be equal to all_outputs
-        torch.testing.assert_allclose(rois, all_outputs[0])
+        assert_allclose(rois, all_outputs[0])
         for x, y in zip(fpn_outputs, all_outputs[1:]):
-            torch.testing.assert_allclose(x, y)
+            assert_allclose(x, y)
 
     @given(X=hu.tensor(), fast_gelu=st.booleans())
     def _test_gelu_op(self, X, fast_gelu, device):
@@ -688,7 +690,7 @@ def _gelu_ref(_X):
 
         rtol = 1e-3 if fast_gelu else 1e-4
         atol = 1e-5
-        torch.testing.assert_allclose(
+        assert_allclose(
             expected_output, actual_output.cpu(), rtol=rtol, atol=atol
         )
 
@@ -719,7 +721,7 @@ def _lengths_ref(X, Y):
             torch.tensor(data), torch.tensor(lengths, dtype=torch.int32)
         )
 
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def _test_lengths_sum_op(self, device):
         self._test_lengths_op("LengthsSum", torch.ops._caffe2.LengthsSum, device)
@@ -775,7 +777,7 @@ def _resize_nearest_ref(X):
             height_scale=1.5,
         )
 
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_resize_nearest_op_cpu(self):
         return self._test_resize_nearest_op("cpu")
@@ -838,16 +840,16 @@ def _piecewise_linear_ref(X):
             binary_input,
         )
 
-        torch.testing.assert_allclose(torch.tensor(expected_output), actual_output)
+        assert_allclose(torch.tensor(expected_output), actual_output)
 
     def test_alias_with_name_is_in_place(self):
         device = "cuda" if workspace.has_cuda_support else "cpu"
         x = torch.tensor([3., 42.]).to(device=device)
         y = torch.ops._caffe2.AliasWithName(x, "new_name")
         x[1] = 6
-        torch.testing.assert_allclose(x, torch.tensor([3., 6.]).to(device=device))
+        assert_allclose(x, torch.tensor([3., 6.]).to(device=device))
         # y should also change because y is alias of x
-        torch.testing.assert_allclose(y, torch.tensor([3., 6.]).to(device=device))
+        assert_allclose(y, torch.tensor([3., 6.]).to(device=device))
 
     @unittest.skipIf(not workspace.has_cuda_support, "No cuda support")
     def test_copy_between_cpu_and_gpu(self):
@@ -855,9 +857,9 @@ def test_copy_between_cpu_and_gpu(self):
         x_gpu_ref = x_cpu_ref.to("cuda")
 
         x_gpu = torch.ops._caffe2.CopyCPUToGPU(x_cpu_ref)
-        torch.testing.assert_allclose(x_gpu, x_gpu_ref)
+        assert_allclose(x_gpu, x_gpu_ref)
         x_cpu = torch.ops._caffe2.CopyGPUToCPU(x_gpu)
-        torch.testing.assert_allclose(x_cpu, x_cpu_ref)
+        assert_allclose(x_cpu, x_cpu_ref)
 
     def test_index_hash_op(self):
         data = np.random.randint(low=0, high=1000, size=(4, 4, 4))
@@ -873,7 +875,7 @@ def _index_hash_ref(X):
             torch.tensor(data), seed=0, modulo=100
         )
 
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_bucketize_op(self):
         data = np.random.rand(8, 10).astype(np.float32) * 1000
@@ -889,7 +891,7 @@ def _bucketize_ref(X):
 
         expected_output = _bucketize_ref(data)
         actual_output = torch.ops._caffe2.Bucketize(torch.tensor(data), boundaries)
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     @given(X=hu.tensor(), eps=st.floats(min_value=1e-4, max_value=1e-2))
     def test_logit(self, X, eps):
@@ -901,7 +903,7 @@ def ref(X, eps):
 
         expected_output = ref(X, eps)
         actual_output = torch.ops._caffe2.Logit(torch.tensor(X), eps)
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_percentile(self):
         original_values = np.array([[3.0, 5.0, 3], [5.0, 1.0, 6.0]]).astype(np.float32)
@@ -926,7 +928,7 @@ def _percentile_ref(original_values, value_to_pct, lengths):
             torch.tensor(value_to_pct),
             torch.tensor(lengths),
         )
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_batch_bucket_one_hot_op(self):
         data = np.array([[2, 3], [4, 1], [2, 5]]).astype(np.float32)
@@ -947,7 +949,7 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries):
         actual_output = torch.ops._caffe2.BatchBucketOneHot(
             torch.tensor(data), torch.tensor(lengths), torch.tensor(boundaries)
         )
-        torch.testing.assert_allclose(expected_output, actual_output.cpu())
+        assert_allclose(expected_output, actual_output.cpu())
 
     def test_gather_ranges_to_dense_op(self):
         data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
@@ -1033,8 +1035,8 @@ def _merge_id_lists(lengths, values):
                 torch.tensor(values[1]),
             ]
         )
-        torch.testing.assert_allclose(expected_merged_lengths, output_merged_lengths)
-        torch.testing.assert_allclose(expected_merged_values, output_merged_values)
+        assert_allclose(expected_merged_lengths, output_merged_lengths)
+        assert_allclose(expected_merged_values, output_merged_values)
 
     def test_learning_rate(self):
         base_lr = 0.05
@@ -1097,7 +1099,7 @@ def test_pack_segments(self):
         packed_tensor, _ = torch.ops._caffe2.PackSegments(lengths, s)
         self.assertEqual(packed_tensor.numpy().shape, (2, 2, 3, 3))
         unpacked_tensor = torch.ops._caffe2.UnpackSegments(lengths, packed_tensor)
-        torch.testing.assert_allclose(s, unpacked_tensor)
+        assert_allclose(s, unpacked_tensor)
 
 
 if __name__ == "__main__":
diff --git a/docs/source/fx.rst b/docs/source/fx.rst
index 664fee10c67a7..29d73b3055dc9 100644
--- a/docs/source/fx.rst
+++ b/docs/source/fx.rst
@@ -1039,7 +1039,7 @@ Miscellanea
         traced.eval()
 
         x = torch.randn(5, 3)
-        torch.testing.assert_allclose(traced(x), x)
+        torch.testing.assert_close(traced(x), x)
         """
         AssertionError: Tensor-likes are not close!
 
@@ -1071,7 +1071,7 @@ Miscellanea
         traced.eval()
 
         x = torch.randn(5, 3)
-        torch.testing.assert_allclose(traced(x), x)
+        torch.testing.assert_close(traced(x), x)
 
   - Because of this difference, consider marking modules that interact with the ``training`` flag dynamically as leaf modules.
 
diff --git a/functorch/benchmarks/operator_authoring.py b/functorch/benchmarks/operator_authoring.py
index 88e558bdafc1a..cbd816e2ad132 100644
--- a/functorch/benchmarks/operator_authoring.py
+++ b/functorch/benchmarks/operator_authoring.py
@@ -77,7 +77,7 @@ def setup(n):
         assert result_nnc.dtype == result_aten.dtype
         assert result_nnc.size() == result_aten.size()
         assert result_nnc.stride() == result_aten.stride()
-        torch.testing.assert_allclose(result_aten, result_nnc)
+        torch.testing.assert_close(result_aten, result_nnc)
         return (lambda: nnc(*args), lambda: aten(*args))
 
     return benchmark_loop(setup)
@@ -90,7 +90,7 @@ def inplace_setup(n):
         result_nnc = torch.clone(a)
         nnc(result_nnc, b, out=result_nnc)
         aten(result_aten, b, out=result_aten)
-        torch.testing.assert_allclose(result_aten, result_nnc)
+        torch.testing.assert_close(result_aten, result_nnc)
         return (lambda: nnc(a, b, out=a), lambda: aten(a, b, out=a))
 
     return benchmark_loop(inplace_setup)
@@ -103,7 +103,7 @@ def out_setup(n):
         result_nnc = out(n)
         aten(*args, out=result_aten)
         nnc(*args, out=result_nnc)
-        torch.testing.assert_allclose(result_aten, result_nnc)
+        torch.testing.assert_close(result_aten, result_nnc)
         result = out(n)
         return (lambda: nnc(*args, out=result), lambda: aten(*args, out=result))
 
@@ -118,7 +118,7 @@ def backwards_setup(n):
         correct = grad_var.grad.clone()
         grad_var.grad.zero_()
         nnc(*args).sum().backward()
-        torch.testing.assert_allclose(correct, grad_var.grad)
+        torch.testing.assert_close(correct, grad_var.grad)
         return (
             lambda: nnc(*args).sum().backward(),
             lambda: aten(*args).sum().backward(),
diff --git a/functorch/benchmarks/pointwise_scorecard.py b/functorch/benchmarks/pointwise_scorecard.py
index ac4cf5f386dcf..15863dc3510cf 100644
--- a/functorch/benchmarks/pointwise_scorecard.py
+++ b/functorch/benchmarks/pointwise_scorecard.py
@@ -195,13 +195,13 @@ def micros(s):
         if shape == medium_transpose:
             raise RuntimeError("pointwise_operator hangs on medium_transpose")
         pw_op = pointwise_operator(operator)
-        torch.testing.assert_allclose(operator(*args), pw_op(*args))
+        torch.testing.assert_close(operator(*args), pw_op(*args))
     except Exception:
         print(f"pointwise_operator failed on {operator.__name__}, {shape.__name__}")
         nope.add((operator, shape))
 
     ts_op = torch.jit.script(operator)
-    torch.testing.assert_allclose(operator(*args), ts_op(*args))
+    torch.testing.assert_close(operator(*args), ts_op(*args))
 
 
 print("fuser,device,operator,shape,time")
diff --git a/functorch/examples/compilation/fuse_module.py b/functorch/examples/compilation/fuse_module.py
index dafbc80711a3a..ec091eb24435a 100644
--- a/functorch/examples/compilation/fuse_module.py
+++ b/functorch/examples/compilation/fuse_module.py
@@ -36,7 +36,7 @@ def forward(self, x):
 compiled_mod = compiled_module(mod, fw_compiler, bw_compiler)
 
 for a, b in zip(run(mod, input), run(compiled_mod, input)):
-    torch.testing.assert_allclose(a, b)
+    torch.testing.assert_close(a, b)
 
 out = mod(input)
 out.sum().backward()
@@ -45,7 +45,7 @@ def forward(self, x):
 compiled_mod.orig_module.param.grad = None
 
 for a, b in zip(run(mod, input), run(compiled_mod, input)):
-    torch.testing.assert_allclose(a, b)
+    torch.testing.assert_close(a, b)
 
 for _ in range(5):
     i = 10000
diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
index ead934eb83e73..d3ea932b05fca 100644
--- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
+++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py
@@ -124,7 +124,7 @@ def test_ddp_comm_hook_allreduce_hook(self):
         # Register hook case, get the hook grads.
         hook_grads = self._get_grads(process_group, DDPCommHookType.ALLREDUCE)
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -141,7 +141,7 @@ def test_ddp_comm_hook_fp16compress_hook(self):
         # Register hook case, get the hook grads.
         hook_grads = self._get_grads(process_group, DDPCommHookType.FP16_COMPRESS)
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -158,7 +158,7 @@ def test_ddp_comm_hook_quantize_per_tensor_hook(self):
         # Register hook case, get the hook grads.
         hook_grads = self._get_grads(process_group, DDPCommHookType.QUANTIZE_PER_TENSOR)
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
@@ -177,7 +177,7 @@ def test_ddp_comm_hook_quantize_per_channel_hook(self):
             process_group, DDPCommHookType.QUANTIZE_PER_CHANNEL
         )
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
 
 
     @requires_nccl()
@@ -198,7 +198,7 @@ def test_ddp_comm_hook_noop_hook(self):
         hook_grads.div_(self.world_size)
         dist.all_reduce(hook_grads, group=process_group)
 
-        torch.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
+        torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0)
 
     @requires_nccl()
     @skip_if_lt_x_gpu(2)
diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index 234af4f7a94f9..79ed6da6240fa 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -110,7 +110,7 @@ def _check_resharded(fsdp_module):
         def _check_equal(local, fsdp):
             with FSDP.summon_full_params(fsdp):
                 for p1, p2 in zip(fsdp.parameters(), local.parameters()):
-                    torch.testing.assert_allclose(p1, p2)
+                    torch.testing.assert_close(p1, p2)
 
         for sharding_strategy in [
             ShardingStrategy.FULL_SHARD,
diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py
index b6201d4d9e84d..127018516e123 100644
--- a/test/distributions/test_distributions.py
+++ b/test/distributions/test_distributions.py
@@ -1421,7 +1421,7 @@ def ref_log_prob(ref_rate, idx, x, log_prob):
         # theoretical results.
         dist = Poisson(rate_zero)
         dist.log_prob(torch.ones_like(rate_zero)).backward()
-        torch.testing.assert_allclose(rate_zero.grad, torch.inf)
+        self.assertEqual(rate_zero.grad, torch.inf)
 
     @unittest.skipIf(not TEST_NUMPY, "Numpy not found")
     def test_poisson_sample(self):
diff --git a/test/functorch/functorch_additional_op_db.py b/test/functorch/functorch_additional_op_db.py
index b090121d21807..6343e7420546b 100644
--- a/test/functorch/functorch_additional_op_db.py
+++ b/test/functorch/functorch_additional_op_db.py
@@ -4,8 +4,7 @@
 
 import torch
 
-from torch.testing import \
-    (floating_types, floating_types_and, all_types_and_complex_and)
+from torch.testing._internal.common_dtype import floating_types, floating_types_and, all_types_and_complex_and
 from torch.testing._internal.common_utils import make_tensor
 from torch.testing._internal.common_methods_invocations import OpInfo, SampleInput, DecorateInfo
 
diff --git a/test/fx/test_fx_param_shape_control_flow.py b/test/fx/test_fx_param_shape_control_flow.py
index e9af35d604577..04db468a7e631 100644
--- a/test/fx/test_fx_param_shape_control_flow.py
+++ b/test/fx/test_fx_param_shape_control_flow.py
@@ -91,26 +91,26 @@ def verify_mm_relu_mods(self, mm_only_mod, relu_mod):
         performs both mm and relu ops in cascade
         """
         x = torch.randn(10, 5)
-        torch.testing.assert_allclose(mm_only_mod(x), torch.mm(x, mm_only_mod.get_mul_matrix()))
+        torch.testing.assert_close(mm_only_mod(x), torch.mm(x, mm_only_mod.get_mul_matrix()))
         tracer = torch.fx.Tracer(param_shapes_constant=True)
         traced_graph = tracer.trace(mm_only_mod)
 
         # verify the graph module calculates the same result
         graph_mod_mm = torch.fx.GraphModule(mm_only_mod, traced_graph)
-        torch.testing.assert_allclose(graph_mod_mm(x), torch.mm(x, mm_only_mod.get_mul_matrix()))
+        torch.testing.assert_close(graph_mod_mm(x), torch.mm(x, mm_only_mod.get_mul_matrix()))
 
 
         # Make a new module with different parameter shape to go down the different
         # code path
         x = torch.randn(10, 15)
-        torch.testing.assert_allclose(relu_mod(x), torch.relu(torch.mm(x, relu_mod.get_mul_matrix())))
+        torch.testing.assert_close(relu_mod(x), torch.relu(torch.mm(x, relu_mod.get_mul_matrix())))
 
         tracer2 = torch.fx.Tracer(param_shapes_constant=True)
         traced_graph2 = tracer2.trace(relu_mod)
 
         # verify the graph module calculates the same result
         graph_mod_relu = torch.fx.GraphModule(relu_mod, traced_graph2)
-        torch.testing.assert_allclose(graph_mod_relu(x), torch.relu(torch.mm(x, relu_mod.get_mul_matrix())))
+        torch.testing.assert_close(graph_mod_relu(x), torch.relu(torch.mm(x, relu_mod.get_mul_matrix())))
 
 
         graph1_node_targets = [n.target for n in traced_graph.nodes]
diff --git a/test/jit/test_misc.py b/test/jit/test_misc.py
index 98ec7831d940e..8a5d4ea5f4a7a 100644
--- a/test/jit/test_misc.py
+++ b/test/jit/test_misc.py
@@ -379,4 +379,4 @@ def foo(x, dim: int):
         expected = foo(x, 0)
         scripted = torch.jit.script(foo)
         actual = scripted(x, 0)
-        torch.testing.assert_allclose(expected, actual)
+        torch.testing.assert_close(expected, actual)
diff --git a/test/mobile/test_lite_script_type.py b/test/mobile/test_lite_script_type.py
index 9a778fb5a7fd9..8769a4b2363a2 100644
--- a/test/mobile/test_lite_script_type.py
+++ b/test/mobile/test_lite_script_type.py
@@ -28,7 +28,7 @@ def forward(self, a: torch.Tensor):
         buffer.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer)  # Error here
         mobile_module_result = mobile_module(sample_input).a
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result,
             mobile_module_result
         )
@@ -91,7 +91,7 @@ def forward(self, a: torch.Tensor):
         buffer_mobile.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer_mobile)
         mobile_module_result = mobile_module(sample_input)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result,
             mobile_module_result
         )
@@ -117,7 +117,7 @@ def forward(self, a: torch.Tensor):
         buffer_mobile.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer_mobile)
         mobile_module_result = mobile_module(sample_input)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result,
             mobile_module_result
         )
@@ -136,7 +136,7 @@ def forward(self, a: torch.Tensor):
         buffer_mobile.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer_mobile)
         mobile_module_result = mobile_module(sample_input)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result,
             mobile_module_result
         )
@@ -166,7 +166,7 @@ def forward(self, a: torch.Tensor):
         buffer_mobile.seek(0)
         mobile_module = _load_for_lite_interpreter(buffer_mobile)
         mobile_module_result = mobile_module(sample_input)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             script_module_result.baz.di,
             mobile_module_result.baz.di
         )
diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 35d3ba35d7210..a2043509f1f13 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -1461,7 +1461,7 @@ def test_bfp16_quantize(self):
         X = torch.randn(5 , 10)
         quantized_X = X.to(torch.bfloat16)
         dedequantized_X = quantized_X.to(torch.float32)
-        torch.testing.assert_allclose(X, dedequantized_X, rtol=1e-4, atol=5e-3)
+        torch.testing.assert_close(X, dedequantized_X, rtol=1e-4, atol=5e-3)
 
     def test_decomposed_quantize(self):
         # register the ops
diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py
index 7194872f4e5e9..6ac8bed90ca3f 100644
--- a/test/quantization/core/test_workflow_module.py
+++ b/test/quantization/core/test_workflow_module.py
@@ -1011,11 +1011,11 @@ def test_fused_obs_fq_module(self, device):
         )
 
         # Compare params with reference
-        torch.testing.assert_allclose(out, out_ref)
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(out, out_ref)
+        torch.testing.assert_close(
             running_min_op, mod.activation_post_process.min_val
         )
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             running_max_op, mod.activation_post_process.max_val
         )
 
@@ -1066,11 +1066,11 @@ def test_fused_obs_fq_moving_avg_module(self, device):
             )
 
             # Compare params with reference
-            torch.testing.assert_allclose(out, out_ref)
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(out, out_ref)
+            torch.testing.assert_close(
                 running_min_op, mod.activation_post_process.min_val
             )
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(
                 running_max_op, mod.activation_post_process.max_val
             )
 
@@ -1095,12 +1095,12 @@ def test_compare_fused_obs_fq_oss_module(self, device):
             x = torch.randn(5, 5, device=device)
             out = mod(x)
             out_ref = mod_ref(x)
-            torch.testing.assert_allclose(out, out_ref)
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(out, out_ref)
+            torch.testing.assert_close(
                 mod_ref.activation_post_process.min_val,
                 mod.activation_post_process.min_val,
             )
-            torch.testing.assert_allclose(
+            torch.testing.assert_close(
                 mod_ref.activation_post_process.max_val,
                 mod.activation_post_process.max_val,
             )
@@ -1151,20 +1151,20 @@ def test_fused_mod_per_channel(self):
                     False,
                 )
                 # Compare params with reference
-                torch.testing.assert_allclose(out, out_ref)
+                torch.testing.assert_close(out, out_ref)
                 if mod.observer_enabled[0]:
-                    torch.testing.assert_allclose(
+                    torch.testing.assert_close(
                         running_min_op, mod.activation_post_process.min_val
                     )
-                    torch.testing.assert_allclose(
+                    torch.testing.assert_close(
                         running_max_op, mod.activation_post_process.max_val
                     )
                 if mod.fake_quant_enabled:
-                    torch.testing.assert_allclose(scale, mod.scale)
-                    torch.testing.assert_allclose(zero_point, mod.zero_point)
+                    torch.testing.assert_close(scale, mod.scale)
+                    torch.testing.assert_close(zero_point, mod.zero_point)
 
-            torch.testing.assert_allclose(mod.state_dict()['activation_post_process.min_val'], running_min_op)
-            torch.testing.assert_allclose(mod.state_dict()['activation_post_process.max_val'], running_max_op)
+            torch.testing.assert_close(mod.state_dict()['activation_post_process.min_val'], running_min_op)
+            torch.testing.assert_close(mod.state_dict()['activation_post_process.max_val'], running_max_op)
 
     def test_fused_mod_reduce_range(self):
         obs = FusedMovingAvgObsFakeQuantize(quant_min=0, quant_max=255, dtype=torch.quint8, reduce_range=True)
diff --git a/test/quantization/core/test_workflow_ops.py b/test/quantization/core/test_workflow_ops.py
index b459b5865bfaa..a0687d88fa57d 100644
--- a/test/quantization/core/test_workflow_ops.py
+++ b/test/quantization/core/test_workflow_ops.py
@@ -1083,7 +1083,7 @@ def test_fused_obs_fake_quant_moving_avg(self, device, symmetric_quant) -> None:
 
             self.assertEqual(in_running_min_ref, in_running_min_op)
             self.assertEqual(in_running_max_ref, in_running_max_op)
-            torch.testing.assert_allclose(out, x_in)
+            torch.testing.assert_close(out, x_in)
 
         # Test empty input works
         x = torch.empty(0, 5, device=device)
@@ -1176,7 +1176,7 @@ def test_fused_obs_fake_quant_moving_avg_per_channel(self, device, symmetric_qua
                     x_in = x
                 self.assertEqual(in_running_min_ref, in_running_min_op)
                 self.assertEqual(in_running_max_ref, in_running_max_op)
-                torch.testing.assert_allclose(out, x_in)
+                torch.testing.assert_close(out, x_in)
 
     @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']),)
     @settings(deadline=None)
@@ -1218,7 +1218,7 @@ def test_fused_obs_fake_quant_backward_op(self, device) -> None:
             False,
         )
         # verify the output matches
-        torch.testing.assert_allclose(out, x_fake_quant)
+        torch.testing.assert_close(out, x_fake_quant)
 
         # verify the gradient matches expectation of fake_quant op
         dout = torch.rand_like(x, dtype=torch.float).to(device)
@@ -1264,7 +1264,7 @@ def test_fused_backward_op_fake_quant_off(self, device) -> None:
             False,
         )
         # verify the output matches
-        torch.testing.assert_allclose(out, x)
+        torch.testing.assert_close(out, x)
 
         # verify the gradient matches expectation of fake_quant op
         dout = torch.rand_like(x, dtype=torch.float).to(device)
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 04109ce532f20..2bc1ed4fc43ee 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -8157,7 +8157,7 @@ def forward(self, x):
             inp = torch.randn(5, 5, device=device, requires_grad=True)
             out_ref = prepared_ref(inp)
             out = prepared(inp)
-            torch.testing.assert_allclose(out, out_ref)
+            torch.testing.assert_close(out, out_ref)
 
             # try backward pass
             labels = torch.randn(5, 5, device=device)
@@ -8165,7 +8165,7 @@ def forward(self, x):
             grad = torch.autograd.grad(loss, [inp])
             loss_ref = (out_ref - labels).sum()
             grad_ref = torch.autograd.grad(loss_ref, [inp])
-            torch.testing.assert_allclose(grad[0], grad_ref[0])
+            torch.testing.assert_close(grad[0], grad_ref[0])
 
         if 'fbgemm' in torch.backends.quantized.supported_engines:
             # During the lowering step in convert, fold_weight calls quantized::linear_prepack
@@ -8178,7 +8178,7 @@ def forward(self, x):
             out = converted(inp)
             out_ref = converted_ref(inp)
 
-            torch.testing.assert_allclose(out, out_ref)
+            torch.testing.assert_close(out, out_ref)
 if __name__ == '__main__':
     raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
                        "\tpython test/test_quantization.py TESTNAME\n\n"
diff --git a/test/test_fx.py b/test/test_fx.py
index eac58fb4368d8..9a46a50982961 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -234,7 +234,7 @@ def forward(self, x):
         new_instance.__init__(gm3, gm3.graph)
 
         x = torch.randn(5, 3)
-        torch.testing.assert_allclose(new_instance(x), torch.relu(x))
+        torch.testing.assert_close(new_instance(x), torch.relu(x))
 
     def test_custom_import(self):
         graph = torch.fx.Graph()
@@ -809,7 +809,7 @@ def forward(self, x):
         traced = torch.fx.symbolic_trace(ec)
 
         x = torch.randn(bs, d_hid)
-        torch.testing.assert_allclose(ec(x), traced(x))
+        torch.testing.assert_close(ec(x), traced(x))
 
 
     def test_node_tagging(self):
@@ -1126,7 +1126,7 @@ def foo(x : Tuple):
 
         traced = torch.fx.symbolic_trace(foo)
         x = (torch.randn(5, 3),)
-        torch.testing.assert_allclose(traced(x), x[0])
+        torch.testing.assert_close(traced(x), x[0])
 
         bio = io.BytesIO()
 
@@ -1136,7 +1136,7 @@ def foo(x : Tuple):
 
         loaded = torch.load(bio)
 
-        torch.testing.assert_allclose(loaded(x), x[0])
+        torch.testing.assert_close(loaded(x), x[0])
 
     def test_torch_fx_len(self):
         class FXLenTest(torch.nn.Module):
@@ -1806,7 +1806,7 @@ def forward(self, x, y=3.14159):
         interp = Interpreter(gm)
         x = torch.randn(5, 3)
         out = interp.run(x)
-        torch.testing.assert_allclose(out, x + 3.14159)
+        torch.testing.assert_close(out, x + 3.14159)
 
     def test_interpreter_not_enough_args(self):
         class Model(torch.nn.Module):
@@ -2315,8 +2315,8 @@ def forward(self, x):
         traced1.recompile()
 
         x = torch.randn(15, 15)
-        torch.testing.assert_allclose(traced1(x), torch.relu(x))
-        torch.testing.assert_allclose(copied(x), torch.neg(x))
+        torch.testing.assert_close(traced1(x), torch.relu(x))
+        torch.testing.assert_close(copied(x), torch.neg(x))
 
     def test_direct_param_use(self):
         class TransposeTest(torch.nn.Module):
@@ -2699,7 +2699,7 @@ def forward(self, x):
         replica = gm._replicate_for_data_parallel()
         out_replica = replica(x)
 
-        torch.testing.assert_allclose(out_replica, out)
+        torch.testing.assert_close(out_replica, out)
 
     def test_ast_rewriter_rewrites_assert(self):
         class M(torch.nn.Module):
@@ -3045,7 +3045,7 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> boo
         traced_graph = MyCustomTracer().trace(model)
         gm2 = torch.fx.GraphModule(model, traced_graph)
         gm2.delete_all_unused_submodules()
-        torch.testing.assert_allclose(gm2(inputs), model(inputs))
+        torch.testing.assert_close(gm2(inputs), model(inputs))
 
     def test_fx_stateless(self):
         class MockModule(torch.nn.Module):
diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py
index ae7a2250b8abb..a8fc077703023 100644
--- a/test/test_fx_experimental.py
+++ b/test/test_fx_experimental.py
@@ -782,7 +782,7 @@ def split_callback(n):
 
         x = torch.randn(5, 3)
         foo = torch.randn(5, 3)
-        torch.testing.assert_allclose(split(x, foo=foo), traced(x, foo=foo))
+        torch.testing.assert_close(split(x, foo=foo), traced(x, foo=foo))
 
     @skipIfNoTorchVision
     def test_subgraph_trivial_resnet(self):
@@ -814,7 +814,7 @@ def forward(self, x, targets=None):
         split = split_module(traced, mtt, lambda node: 0)
 
         x = torch.randn(50, 512)
-        torch.testing.assert_allclose(split(x), traced(x))
+        torch.testing.assert_close(split(x), traced(x))
 
     def test_normalize_binary_operators(self):
         ops_to_test = {
diff --git a/test/test_jit_autocast.py b/test/test_jit_autocast.py
index 93674bb70d820..d311eb687a763 100644
--- a/test/test_jit_autocast.py
+++ b/test/test_jit_autocast.py
@@ -797,7 +797,7 @@ def test_nchw_autocast_jit_trace_model(model, x):
                 y = traced_model(x.clone())
             with torch.cpu.amp.autocast(), torch.no_grad():
                 y2 = model(x.clone())
-            torch.testing.assert_allclose(y.double(), y2.double(), rtol=1e-03, atol=1e-03)
+            torch.testing.assert_close(y.double(), y2.double(), rtol=1e-03, atol=1e-03)
         for i in range(self.models.__len__()):
             test_nchw_autocast_jit_trace_model(self.models[i], self.inputs[i])
 
@@ -812,7 +812,7 @@ def test_nhwc_autocast_jit_trace_model(model, x):
                 y = traced_model(x.clone().to(memory_format=torch.channels_last))
             with torch.cpu.amp.autocast(), torch.no_grad():
                 y2 = model(x.clone().to(memory_format=torch.channels_last))
-            torch.testing.assert_allclose(y.double(), y2.double(), rtol=1e-03, atol=1e-03)
+            torch.testing.assert_close(y.double(), y2.double(), rtol=1e-03, atol=1e-03)
         for i in range(self.models.__len__()):
             if self.inputs[i].size().__len__() == 5:
                 # NHWC 3D case not support yet
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index e1c820fda9c46..19585296870ba 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -2202,7 +2202,9 @@ def test_batch_norm(self):
         def test(fn, args):
             trace = torch.jit.trace(fn, args)
             self.assertAllFused(trace.graph_for(*args))
-            torch.testing.assert_allclose(fn(*args), trace(*args))
+            # TODO: Are `NaN`'s actually ok here or did this pass silently before, because `equal_nan=True` was the
+            #  default?
+            torch.testing.assert_close(fn(*args), trace(*args), equal_nan=True)
 
         def bn(i, x):
             return torch.batch_norm(i, x, x, x, x, False, 0.1, 1e-4, False).relu()
diff --git a/test/test_module_init.py b/test/test_module_init.py
index dc05a95da6f2b..98dcb3ee694a4 100644
--- a/test/test_module_init.py
+++ b/test/test_module_init.py
@@ -4,7 +4,7 @@
 import torch
 from unittest import mock
 from unittest.mock import MagicMock, patch
-from torch.testing import floating_types
+from torch.testing._internal.common_dtype import floating_types
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes
 from torch.testing._internal.common_quantization import skipIfNoFBGEMM
 from torch.testing._internal.common_utils import TestCase, run_tests
diff --git a/test/test_mps.py b/test/test_mps.py
index 2b30ab926035b..257dd238e3e7c 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -65,7 +65,7 @@ def _npRelu(self, np_features):
         return np.maximum(np_features, np.zeros(np_features.shape)).astype(np_features.dtype)
 
     def testNpRelu(self):
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             np.array([[0., 0.7, 0.0, 0.3, 0.0], [0.1, 0.0, 0.5, 0.0, 0.9]]),
             self._npRelu(
                 np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
@@ -79,7 +79,7 @@ def _testRelu(self, np_features, device):
         py_relu = torch.nn.ReLU(inplace=False)(py_tensor)
         py_relu_cpu = py_relu.to("cpu")
 
-        torch.testing.assert_allclose(np_relu, py_relu_cpu)
+        self.assertEqual(np_relu, py_relu_cpu)
 
     def _testReluInPlace(self, np_features, device):
         np_relu = self._npRelu(np_features)
@@ -89,9 +89,9 @@ def _testReluInPlace(self, np_features, device):
         py_relu = torch.nn.ReLU(inplace=True)(py_tensor)
         py_relu_cpu = py_relu.to("cpu")
 
-        torch.testing.assert_allclose(np_relu, py_relu_cpu)
+        self.assertEqual(np_relu, py_relu_cpu)
         # Inplace Relu modifies the initial input and it should match the output of Relu
-        torch.testing.assert_allclose(np_relu, py_tensor.to("cpu"))
+        self.assertEqual(np_relu, py_tensor.to("cpu"))
 
     def testNumbersCPU(self):
         for t in [np.int32]:
@@ -156,7 +156,7 @@ def _npLeakyRelu(self, np_features, negative_slope=0.1):
         return np.maximum(np_features, negative_slope * np_features).astype(np_features.dtype)
 
     def testNpLeakyRelu(self):
-        torch.testing.assert_allclose(
+        torch.testing.assert_close(
             np.array([[-0.09, 0.7, -0.05, 0.3, -0.01],
                       [0.1, -0.03, 0.5, -0.07, 0.9]]),
             self._npLeakyRelu(
@@ -171,14 +171,14 @@ def _testLeakyRelu(self, np_features, negative_slope, device):
 
         cpu_leaky_relu = relu_op(cpu_x)
         mps_leaky_relu = relu_op(mps_x)
-        torch.testing.assert_allclose(cpu_leaky_relu, mps_leaky_relu.to('cpu'))
+        torch.testing.assert_close(cpu_leaky_relu, mps_leaky_relu.to('cpu'))
 
         # test backward pass
         cpu_grad = torch.ones_like(cpu_leaky_relu)
         mps_grad = cpu_grad.to('mps')
         cpu_leaky_relu.backward(gradient=cpu_grad)
         mps_leaky_relu.backward(gradient=mps_grad)
-        torch.testing.assert_allclose(cpu_x.grad, mps_x.grad.to('cpu'))
+        torch.testing.assert_close(cpu_x.grad, mps_x.grad.to('cpu'))
 
     def testNumbersCPU(self):
         for t in [np.float32]:
@@ -257,14 +257,14 @@ def _testLeakyRelu(self, np_features, negative_slope, device):
 
         cpu_leaky_relu = relu_op(cpu_x)
         mps_leaky_relu = relu_op(mps_x)
-        torch.testing.assert_allclose(cpu_leaky_relu, mps_leaky_relu.to('cpu'))
+        torch.testing.assert_close(cpu_leaky_relu, mps_leaky_relu.to('cpu'))
 
         # test backward pass
         cpu_grad = torch.ones_like(cpu_leaky_relu)
         mps_grad = cpu_grad.to('mps')
         cpu_leaky_relu.backward(gradient=cpu_grad)
         mps_leaky_relu.backward(gradient=mps_grad)
-        torch.testing.assert_allclose(cpu_x.grad, mps_x.grad.to('cpu'))
+        torch.testing.assert_close(cpu_x.grad, mps_x.grad.to('cpu'))
 
     def testNumbersGPU(self):
         for t in [np.float32]:
@@ -293,14 +293,14 @@ def test_mm(self):
         B = torch.ones(5, 6).to("mps")
         C = torch.ones(6, 5).to("mps")
         D = torch.mm(B, C).cpu()
-        torch.testing.assert_allclose(D, torch.full((5, 5), 6.0))
+        torch.testing.assert_close(D, torch.full((5, 5), 6.0))
 
     def test_addmm(self):
         A = torch.ones(5, 5).to("mps")
         B = torch.ones(5, 6).to("mps")
         C = torch.ones(6, 5).to("mps")
         D = torch.addmm(A, B, C).to("cpu")
-        torch.testing.assert_allclose(D, torch.full((5, 5), 7.0))
+        torch.testing.assert_close(D, torch.full((5, 5), 7.0))
 
     def test_bmm(self):
         batch1_cpu = torch.randn(10, 3, 4)
@@ -355,7 +355,7 @@ def helper(input_shape, batch1_shape, batch2_shape):
     def test_local_scalar_dense_mps(self):
         x_cpu = torch.randn(1)
         y_mps = x_cpu.to("mps")
-        torch.testing.assert_allclose(x_cpu.item(), y_mps.item())
+        torch.testing.assert_close(x_cpu.item(), y_mps.item())
 
     def test_linear_1d_weight(self):
         device = 'cpu'
diff --git a/test/test_nn.py b/test/test_nn.py
index cab9db75cdbf6..13036ef18740f 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4907,7 +4907,7 @@ def assert_is_orthogonal(X):
                 n, k = k, n
             Id = torch.eye(k, dtype=X.dtype, device=X.device).expand(*(X.size()[:-2]), k, k)
             eps = 10 * n * torch.finfo(X.dtype).eps
-            torch.testing.assert_allclose(X.mH @ X, Id, atol=eps, rtol=0.)
+            torch.testing.assert_close(X.mH @ X, Id, atol=eps, rtol=0.)
 
 
         def assert_weight_allclose_Q(weight, W):
@@ -4920,7 +4920,7 @@ def assert_weight_allclose_Q(weight, W):
             Q *= R.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2)
             if wide_matrix:
                 Q = Q.mT
-            torch.testing.assert_allclose(Q, weight, atol=1e-5, rtol=0.)
+            torch.testing.assert_close(Q, weight, atol=1e-5, rtol=0.)
 
 
         for shape, dtype, use_linear in product(((4, 4), (5, 3), (3, 5)),  # square/ tall / wide
@@ -4979,7 +4979,7 @@ def assert_weight_allclose_Q(weight, W):
                     w_new = w_new.mT
                 if can_initialize:
                     m.weight = w_new
-                    torch.testing.assert_allclose(w_new, m.weight, atol=1e-5, rtol=0.)
+                    torch.testing.assert_close(w_new, m.weight, atol=1e-5, rtol=0.)
                 else:
                     msg = "assign to the matrix exponential or the Cayley parametrization"
                     with self.assertRaisesRegex(NotImplementedError, msg):
diff --git a/test/test_optim.py b/test/test_optim.py
index a55a74d5d8667..e611f75b67dee 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -1650,7 +1650,7 @@ def test_cos_anneal_lr_continue(self):
         new_scheduler = CosineAnnealingLR(
             self.opt, T_max=T_max, eta_min=eta_min, last_epoch=0)
         new_lrs = new_scheduler._last_lr
-        torch.testing.assert_allclose(original_lrs, new_lrs, rtol=1e-4, atol=1e-5)
+        torch.testing.assert_close(original_lrs, new_lrs, rtol=1e-4, atol=1e-5)
 
     def test_reduce_lr_on_plateau1(self):
         epochs = 10
diff --git a/test/test_torch.py b/test/test_torch.py
index f84d8aff08950..b507f68436d45 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -37,7 +37,7 @@
     skipCUDAMemoryLeakCheckIf, BytesIOContext,
     skipIfRocm, skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName,
     wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard,
-    skipIfNotRegistered, bytes_to_scalar, parametrize, skipIfMps)
+    skipIfNotRegistered, bytes_to_scalar, parametrize, skipIfMps, noncontiguous_like)
 from multiprocessing.reduction import ForkingPickler
 from torch.testing._internal.common_device_type import (
     expectedFailureMeta,
@@ -2959,10 +2959,9 @@ def test_index_reduce(self, device, dtype, reduce):
                     dest = make_tensor(size, device=device, dtype=dtype, noncontiguous=dest_noncontig)
                     src_size = size[:dim] + (num_src,) + size[dim + 1:]
                     src = make_tensor(src_size, device=device, dtype=dtype, noncontiguous=src_noncontig)
-                    idx = torch.randint(num_dest, (num_src,), dtype=idx_dtype, device=device)
-                    if index_noncontig:
-                        # noncontiguous_like fails with RuntimeError: XLA tensors do not have storage
-                        idx = torch.testing.make_non_contiguous(idx)
+                    idx = torch.testing.make_tensor(
+                        num_src, low=0, high=num_dest, dtype=idx_dtype, device=device, noncontiguous=index_noncontig
+                    )
                     expected = dest.clone()
                     dest.index_reduce_(dim, idx, src, reduce, include_self=include_self)
                     # fill rows in idx with reduction inits if include_self=False
@@ -5588,10 +5587,10 @@ def test_index_add(self):
                             dest = make_tensor(dest.shape, device=device, dtype=dest.dtype, noncontiguous=True)
                         src = torch.randn(num_copy, *other_sizes, device=device)
                         if not src_contig:
-                            src = torch.testing.make_non_contiguous(src)
+                            src = noncontiguous_like(src)
                         idx = torch.randperm(num_dest, dtype=dtype, device=device).narrow(0, 0, num_copy)
                         if not index_contig:
-                            idx = torch.testing.make_non_contiguous(idx)
+                            idx = noncontiguous_like(idx)
                         # index_add_ without alpha argument
                         dest2 = dest.clone()
                         dest.index_add_(0, idx, src)
diff --git a/torch/fx/OVERVIEW.md b/torch/fx/OVERVIEW.md
index f2995eb7a77dd..9c0707089d78d 100644
--- a/torch/fx/OVERVIEW.md
+++ b/torch/fx/OVERVIEW.md
@@ -61,7 +61,7 @@ module = MyModule()
 symbolic_traced : torch.fx.GraphModule = symbolic_trace(module)
 
 input = torch.rand(3, 4)
-torch.testing.assert_allclose(symbolic_traced(input), module(input))
+torch.testing.assert_close(symbolic_traced(input), module(input))
 ```
 
 Here, we set up a simple Module that exercises different language features: fetching a parameter, applying an arithmetic operator, applying a submodule (linear), and applying a Tensor method. `symbolic_trace` returns an instance of GraphModule, which is in itself a subclass of `nn.Module`. We can see that the `symbolic_traced` instance runs and returns the same result as the original module instance module.
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index 95218bf271657..6428d4c5c3bb5 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -57,7 +57,7 @@ def fn(x):
             gm = torch.fx.symbolic_trace(fn)
             input = torch.randn(3, 4)
             result = NegSigmSwapInterpreter(gm).run(input)
-            torch.testing.assert_allclose(result, torch.neg(input).sigmoid())
+            torch.testing.assert_close(result, torch.neg(input).sigmoid())
 
     Args:
         module (GraphModule): The module to be executed
@@ -395,7 +395,7 @@ def fn(x):
 
             transformed : torch.nn.Module = NegSigmSwapXformer(gm).transform()
             input = torch.randn(3, 4)
-            torch.testing.assert_allclose(transformed(input), torch.neg(input).sigmoid())
+            torch.testing.assert_close(transformed(input), torch.neg(input).sigmoid())
 
     Args:
         module (GraphModule): The ``Module`` to be transformed.
diff --git a/torch/jit/_freeze.py b/torch/jit/_freeze.py
index 47da766cfcc42..af0a132ee0e78 100644
--- a/torch/jit/_freeze.py
+++ b/torch/jit/_freeze.py
@@ -136,9 +136,9 @@ def run_frozen_optimizations(
         mod (:class:`ScriptModule`): a frozen module to be optimized
 
         optimize_numerics (bool): If ``True``, a set of optimization passes will be run that does not strictly
-        preserve numerics. These optimizations preserve default rtol and atol of `torch.testing.assert_allclose`
+        preserve numerics. These optimizations preserve default rtol and atol of `torch.testing.assert_close`
         when applied on a single transformation, however in a module where many transformations are applied
-        the rtol or atol may no longer fall within the default `assert_allclose` tolerance. Conv -> Batchnorm folding,
+        the rtol or atol may no longer fall within the default `assert_close` tolerance. Conv -> Batchnorm folding,
         Conv-Add/Sub, and Conv -> Mul/Div folding all may alter numerics.
 
     Returns:
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 616c1c90e1a67..0dca22f48092b 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -1033,7 +1033,9 @@ def _test_fsdp_parity(
                 self.assertEqual(param.device, cpu_device)
             fsdp_loss = fsdp_loss.cuda()
         fsdp_unsharded_params = get_full_params(fsdp_model)
-        torch.testing.assert_allclose(ref_loss, fsdp_loss)
+        # TODO: Are mismatching dtypes actually ok here or did this pass silently before, because `check_dtype=False`
+        #  was the default?
+        torch.testing.assert_close(ref_loss, fsdp_loss, check_dtype=False)
         # Do not check for parameter parity if using mixed precision since (1)
         # the DDP parameters are in FP16 (from `half()`) while the FSDP
         # parameters are in FP32 (from `summon_full_params()`) and (2) DDP runs
diff --git a/torch/testing/_internal/opinfo/definitions/signal.py b/torch/testing/_internal/opinfo/definitions/signal.py
index 9404cc889c50e..c915e93e81de6 100644
--- a/torch/testing/_internal/opinfo/definitions/signal.py
+++ b/torch/testing/_internal/opinfo/definitions/signal.py
@@ -7,6 +7,7 @@
 import numpy
 
 import torch
+from torch.testing._internal.common_dtype import floating_types_and
 from torch.testing._internal.common_utils import TEST_SCIPY
 from torch.testing._internal.opinfo.core import (
     DecorateInfo,
@@ -14,7 +15,6 @@
     OpInfo,
     SampleInput,
 )
-from torch.testing._legacy import floating_types_and
 
 if TEST_SCIPY:
     import scipy.signal

From 674f67c491cffa04eb9c98692711c61f4c2179bc Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 2 Nov 2022 11:25:05 +0100
Subject: [PATCH 0448/1922] remove deprecated rand and randn from torch.testing
 (#87970)

See #87969 or #86586 for the reasoning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87970
Approved by: https://github.com/mruberry
---
 torch/testing/_deprecated.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
index 3dad2e62b0f21..fb6c658495a72 100644
--- a/torch/testing/_deprecated.py
+++ b/torch/testing/_deprecated.py
@@ -14,8 +14,6 @@
 
 
 __all__ = [
-    "rand",
-    "randn",
     "assert_allclose",
     "get_all_device_types",
     "make_non_contiguous",
@@ -40,10 +38,6 @@ def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
     return outer_wrapper
 
 
-rand = warn_deprecated("Use torch.rand() instead.")(torch.rand)
-randn = warn_deprecated("Use torch.randn() instead.")(torch.randn)
-
-
 _DTYPE_PRECISIONS = {
     torch.float16: (1e-3, 1e-3),
     torch.float32: (1e-4, 1e-5),

From f334c5061251814537eb66e99139615eb0baa161 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 2 Nov 2022 11:25:05 +0100
Subject: [PATCH 0449/1922] remove deprecated device getter from torch.testing
 (#87971)

See #87969 or #86586 for the reasoning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87971
Approved by: https://github.com/mruberry
---
 torch/testing/_deprecated.py                  |  5 +----
 torch/testing/_internal/common_device_type.py |  8 +++++---
 torch/testing/_legacy.py                      | 11 +----------
 3 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
index fb6c658495a72..fd5a3010c498c 100644
--- a/torch/testing/_deprecated.py
+++ b/torch/testing/_deprecated.py
@@ -15,7 +15,6 @@
 
 __all__ = [
     "assert_allclose",
-    "get_all_device_types",
     "make_non_contiguous",
 ]
 
@@ -89,13 +88,11 @@ def assert_allclose(
 )
 
 # Deprecate and expose all dtype getters
-for name in _legacy.__all_dtype_getters__:
+for name in _legacy.__all__:
     fn = getattr(_legacy, name)
     globals()[name] = warn_deprecated(getter_instructions)(fn)
     __all__.append(name)
 
-get_all_device_types = warn_deprecated(getter_instructions)(_legacy.get_all_device_types)
-
 
 @warn_deprecated(
     "Depending on the use case there a different replacement options:\n\n"
diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index 7d9f31330ef6d..cf3c3b4189815 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -20,9 +20,6 @@
     TEST_CUSPARSE_GENERIC, TEST_HIPSPARSE_GENERIC
 from torch.testing._internal.common_dtype import get_all_dtypes
 
-# The implementation should be moved here as soon as the deprecation period is over.
-from torch.testing._legacy import get_all_device_types  # noqa: F401
-
 try:
     import psutil  # type: ignore[import]
     HAS_PSUTIL = True
@@ -1325,3 +1322,8 @@ def skipMeta(fn):
 
 def skipXLA(fn):
     return skipXLAIf(True, "Marked as skipped for XLA")(fn)
+
+# TODO: the "all" in the name isn't true anymore for quite some time as we have also have for example XLA and MPS now.
+#  This should probably enumerate all available device type test base classes.
+def get_all_device_types() -> List[str]:
+    return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
diff --git a/torch/testing/_legacy.py b/torch/testing/_legacy.py
index 1c7ba14728968..17db4952f0461 100644
--- a/torch/testing/_legacy.py
+++ b/torch/testing/_legacy.py
@@ -7,7 +7,7 @@
 
 import torch
 
-__all_dtype_getters__ = [
+__all__ = [
     "_validate_dtypes",
     "_dispatch_dtypes",
     "all_types",
@@ -32,11 +32,6 @@
     "integral_types_and",
 ]
 
-__all__ = [
-    *__all_dtype_getters__,
-    "get_all_device_types",
-]
-
 # Functions and classes for describing the dtypes a function supports
 # NOTE: these helpers should correspond to PyTorch's C++ dispatch macros
 
@@ -152,7 +147,3 @@ def get_all_fp_dtypes(include_half=True, include_bfloat16=True) -> List[torch.dt
 
 def get_all_qint_dtypes() -> List[torch.dtype]:
     return [torch.qint8, torch.quint8, torch.qint32, torch.quint4x2, torch.quint2x4]
-
-
-def get_all_device_types() -> List[str]:
-    return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']

From e5ef5499230e6d4eda6ca61049f53b495868cde6 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 2 Nov 2022 11:25:05 +0100
Subject: [PATCH 0450/1922] remove deprecated dtype getters from torch.testing
 (#87972)

See #87969 or #86586 for the reasoning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87972
Approved by: https://github.com/mruberry
---
 torch/testing/_deprecated.py            |  13 ---
 torch/testing/_internal/common_dtype.py | 148 ++++++++++++++++++-----
 torch/testing/_legacy.py                | 149 ------------------------
 3 files changed, 120 insertions(+), 190 deletions(-)
 delete mode 100644 torch/testing/_legacy.py

diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
index fd5a3010c498c..eda28ccfaac13 100644
--- a/torch/testing/_deprecated.py
+++ b/torch/testing/_deprecated.py
@@ -10,8 +10,6 @@
 
 import torch
 
-from . import _legacy
-
 
 __all__ = [
     "assert_allclose",
@@ -83,17 +81,6 @@ def assert_allclose(
     )
 
 
-getter_instructions = (
-    lambda name, args, kwargs, return_value: f"This call can be replaced with {return_value}."  # noqa: E731
-)
-
-# Deprecate and expose all dtype getters
-for name in _legacy.__all__:
-    fn = getattr(_legacy, name)
-    globals()[name] = warn_deprecated(getter_instructions)(fn)
-    __all__.append(name)
-
-
 @warn_deprecated(
     "Depending on the use case there a different replacement options:\n\n"
     "- If you are using `make_non_contiguous` in combination with a creation function to create a noncontiguous tensor "
diff --git a/torch/testing/_internal/common_dtype.py b/torch/testing/_internal/common_dtype.py
index 6b16ad4779b35..6432521498aca 100644
--- a/torch/testing/_internal/common_dtype.py
+++ b/torch/testing/_internal/common_dtype.py
@@ -1,28 +1,120 @@
-# flake8: noqa F401
-
-"""The implementations should be moved here as soon as their deprecation period is over."""
-from torch.testing._legacy import (
-    _validate_dtypes,
-    _dispatch_dtypes,
-    all_types,
-    all_types_and,
-    all_types_and_complex,
-    all_types_and_complex_and,
-    all_types_and_half,
-    complex_types,
-    complex_types_and,
-    empty_types,
-    floating_and_complex_types,
-    floating_and_complex_types_and,
-    floating_types,
-    floating_types_and,
-    double_types,
-    floating_types_and_half,
-    get_all_complex_dtypes,
-    get_all_dtypes,
-    get_all_fp_dtypes,
-    get_all_int_dtypes,
-    get_all_math_dtypes,
-    integral_types,
-    integral_types_and,
-)
+from typing import List
+
+import torch
+
+
+# Functions and classes for describing the dtypes a function supports
+# NOTE: these helpers should correspond to PyTorch's C++ dispatch macros
+
+# Verifies each given dtype is a torch.dtype
+def _validate_dtypes(*dtypes):
+    for dtype in dtypes:
+        assert isinstance(dtype, torch.dtype)
+    return dtypes
+
+# class for tuples corresponding to a PyTorch dispatch macro
+class _dispatch_dtypes(tuple):
+    def __add__(self, other):
+        assert isinstance(other, tuple)
+        return _dispatch_dtypes(tuple.__add__(self, other))
+
+_empty_types = _dispatch_dtypes(())
+def empty_types():
+    return _empty_types
+
+_floating_types = _dispatch_dtypes((torch.float32, torch.float64))
+def floating_types():
+    return _floating_types
+
+_floating_types_and_half = _floating_types + (torch.half,)
+def floating_types_and_half():
+    return _floating_types_and_half
+
+def floating_types_and(*dtypes):
+    return _floating_types + _validate_dtypes(*dtypes)
+
+_floating_and_complex_types = _floating_types + (torch.cfloat, torch.cdouble)
+def floating_and_complex_types():
+    return _floating_and_complex_types
+
+def floating_and_complex_types_and(*dtypes):
+    return _floating_and_complex_types + _validate_dtypes(*dtypes)
+
+_double_types = _dispatch_dtypes((torch.float64, torch.complex128))
+def double_types():
+    return _double_types
+
+_integral_types = _dispatch_dtypes((torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64))
+def integral_types():
+    return _integral_types
+
+def integral_types_and(*dtypes):
+    return _integral_types + _validate_dtypes(*dtypes)
+
+_all_types = _floating_types + _integral_types
+def all_types():
+    return _all_types
+
+def all_types_and(*dtypes):
+    return _all_types + _validate_dtypes(*dtypes)
+
+_complex_types = _dispatch_dtypes((torch.cfloat, torch.cdouble))
+def complex_types():
+    return _complex_types
+
+def complex_types_and(*dtypes):
+    return _complex_types + _validate_dtypes(*dtypes)
+
+_all_types_and_complex = _all_types + _complex_types
+def all_types_and_complex():
+    return _all_types_and_complex
+
+def all_types_and_complex_and(*dtypes):
+    return _all_types_and_complex + _validate_dtypes(*dtypes)
+
+_all_types_and_half = _all_types + (torch.half,)
+def all_types_and_half():
+    return _all_types_and_half
+
+# The functions below are used for convenience in our test suite and thus have no corresponding C++ dispatch macro
+
+# See AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS.
+def get_all_dtypes(include_half=True,
+                   include_bfloat16=True,
+                   include_bool=True,
+                   include_complex=True,
+                   include_complex32=False,
+                   include_qint=False,
+                   ) -> List[torch.dtype]:
+    dtypes = get_all_int_dtypes() + get_all_fp_dtypes(include_half=include_half, include_bfloat16=include_bfloat16)
+    if include_bool:
+        dtypes.append(torch.bool)
+    if include_complex:
+        dtypes += get_all_complex_dtypes(include_complex32)
+    if include_qint:
+        dtypes += get_all_qint_dtypes()
+    return dtypes
+
+def get_all_math_dtypes(device) -> List[torch.dtype]:
+    return get_all_int_dtypes() + get_all_fp_dtypes(include_half=device.startswith('cuda'),
+                                                    include_bfloat16=False) + get_all_complex_dtypes()
+
+def get_all_complex_dtypes(include_complex32=False) -> List[torch.dtype]:
+    return [torch.complex32, torch.complex64, torch.complex128] if include_complex32 else [torch.complex64, torch.complex128]
+
+
+def get_all_int_dtypes() -> List[torch.dtype]:
+    return [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]
+
+
+def get_all_fp_dtypes(include_half=True, include_bfloat16=True) -> List[torch.dtype]:
+    dtypes = [torch.float32, torch.float64]
+    if include_half:
+        dtypes.append(torch.float16)
+    if include_bfloat16:
+        dtypes.append(torch.bfloat16)
+    return dtypes
+
+
+def get_all_qint_dtypes() -> List[torch.dtype]:
+    return [torch.qint8, torch.quint8, torch.qint32, torch.quint4x2, torch.quint2x4]
diff --git a/torch/testing/_legacy.py b/torch/testing/_legacy.py
deleted file mode 100644
index 17db4952f0461..0000000000000
--- a/torch/testing/_legacy.py
+++ /dev/null
@@ -1,149 +0,0 @@
-"""This module exist to be able to deprecate functions publicly without doing so internally. The deprecated
-public versions are defined in torch.testing._deprecated and exposed from torch.testing. The non-deprecated internal
-versions should be imported from torch.testing._internal
-"""
-
-from typing import List
-
-import torch
-
-__all__ = [
-    "_validate_dtypes",
-    "_dispatch_dtypes",
-    "all_types",
-    "all_types_and",
-    "all_types_and_complex",
-    "all_types_and_complex_and",
-    "all_types_and_half",
-    "complex_types",
-    "empty_types",
-    "floating_and_complex_types",
-    "floating_and_complex_types_and",
-    "floating_types",
-    "floating_types_and",
-    "double_types",
-    "floating_types_and_half",
-    "get_all_complex_dtypes",
-    "get_all_dtypes",
-    "get_all_fp_dtypes",
-    "get_all_int_dtypes",
-    "get_all_math_dtypes",
-    "integral_types",
-    "integral_types_and",
-]
-
-# Functions and classes for describing the dtypes a function supports
-# NOTE: these helpers should correspond to PyTorch's C++ dispatch macros
-
-# Verifies each given dtype is a torch.dtype
-def _validate_dtypes(*dtypes):
-    for dtype in dtypes:
-        assert isinstance(dtype, torch.dtype)
-    return dtypes
-
-# class for tuples corresponding to a PyTorch dispatch macro
-class _dispatch_dtypes(tuple):
-    def __add__(self, other):
-        assert isinstance(other, tuple)
-        return _dispatch_dtypes(tuple.__add__(self, other))
-
-_empty_types = _dispatch_dtypes(())
-def empty_types():
-    return _empty_types
-
-_floating_types = _dispatch_dtypes((torch.float32, torch.float64))
-def floating_types():
-    return _floating_types
-
-_floating_types_and_half = _floating_types + (torch.half,)
-def floating_types_and_half():
-    return _floating_types_and_half
-
-def floating_types_and(*dtypes):
-    return _floating_types + _validate_dtypes(*dtypes)
-
-_floating_and_complex_types = _floating_types + (torch.cfloat, torch.cdouble)
-def floating_and_complex_types():
-    return _floating_and_complex_types
-
-def floating_and_complex_types_and(*dtypes):
-    return _floating_and_complex_types + _validate_dtypes(*dtypes)
-
-_double_types = _dispatch_dtypes((torch.float64, torch.complex128))
-def double_types():
-    return _double_types
-
-_integral_types = _dispatch_dtypes((torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64))
-def integral_types():
-    return _integral_types
-
-def integral_types_and(*dtypes):
-    return _integral_types + _validate_dtypes(*dtypes)
-
-_all_types = _floating_types + _integral_types
-def all_types():
-    return _all_types
-
-def all_types_and(*dtypes):
-    return _all_types + _validate_dtypes(*dtypes)
-
-_complex_types = _dispatch_dtypes((torch.cfloat, torch.cdouble))
-def complex_types():
-    return _complex_types
-
-def complex_types_and(*dtypes):
-    return _complex_types + _validate_dtypes(*dtypes)
-
-_all_types_and_complex = _all_types + _complex_types
-def all_types_and_complex():
-    return _all_types_and_complex
-
-def all_types_and_complex_and(*dtypes):
-    return _all_types_and_complex + _validate_dtypes(*dtypes)
-
-_all_types_and_half = _all_types + (torch.half,)
-def all_types_and_half():
-    return _all_types_and_half
-
-# The functions below are used for convenience in our test suite and thus have no corresponding C++ dispatch macro
-
-# See AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS.
-def get_all_dtypes(include_half=True,
-                   include_bfloat16=True,
-                   include_bool=True,
-                   include_complex=True,
-                   include_complex32=False,
-                   include_qint=False,
-                   ) -> List[torch.dtype]:
-    dtypes = get_all_int_dtypes() + get_all_fp_dtypes(include_half=include_half, include_bfloat16=include_bfloat16)
-    if include_bool:
-        dtypes.append(torch.bool)
-    if include_complex:
-        dtypes += get_all_complex_dtypes(include_complex32)
-    if include_qint:
-        dtypes += get_all_qint_dtypes()
-    return dtypes
-
-def get_all_math_dtypes(device) -> List[torch.dtype]:
-    return get_all_int_dtypes() + get_all_fp_dtypes(include_half=device.startswith('cuda'),
-                                                    include_bfloat16=False) + get_all_complex_dtypes()
-
-def get_all_complex_dtypes(include_complex32=False) -> List[torch.dtype]:
-    return [torch.complex32, torch.complex64, torch.complex128] if include_complex32 else [torch.complex64, torch.complex128]
-
-
-def get_all_int_dtypes() -> List[torch.dtype]:
-    return [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]
-
-
-def get_all_fp_dtypes(include_half=True, include_bfloat16=True) -> List[torch.dtype]:
-    dtypes = [torch.float32, torch.float64]
-    if include_half:
-        dtypes.append(torch.float16)
-    if include_bfloat16:
-        dtypes.append(torch.bfloat16)
-    return dtypes
-
-
-def get_all_qint_dtypes() -> List[torch.dtype]:
-    return [torch.qint8, torch.quint8, torch.qint32, torch.quint4x2, torch.quint2x4]

From e4ca6d697055c87d09fd1babf8d5af201c74d8c7 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 2 Nov 2022 11:25:06 +0100
Subject: [PATCH 0451/1922] remove make_non_contiguous from torch.testing
 (#87973)

See #87969 or #86586 for the reasoning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87973
Approved by: https://github.com/mruberry
---
 torch/testing/_deprecated.py | 44 +-----------------------------------
 1 file changed, 1 insertion(+), 43 deletions(-)

diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
index eda28ccfaac13..731158ddb41ee 100644
--- a/torch/testing/_deprecated.py
+++ b/torch/testing/_deprecated.py
@@ -4,17 +4,13 @@
 """
 
 import functools
-import random
 import warnings
 from typing import Any, Callable, Dict, Optional, Tuple, Union
 
 import torch
 
 
-__all__ = [
-    "assert_allclose",
-    "make_non_contiguous",
-]
+__all__ = ["assert_allclose"]
 
 
 def warn_deprecated(instructions: Union[str, Callable[[str, Tuple[Any, ...], Dict[str, Any], Any], str]]) -> Callable:
@@ -79,41 +75,3 @@ def assert_allclose(
         check_stride=False,
         msg=msg or None,
     )
-
-
-@warn_deprecated(
-    "Depending on the use case there a different replacement options:\n\n"
-    "- If you are using `make_non_contiguous` in combination with a creation function to create a noncontiguous tensor "
-    "with random values, use `torch.testing.make_tensor(..., noncontiguous=True)` instead.\n"
-    "- If you are using `make_non_contiguous` with a specific tensor, you can replace this call with "
-    "`torch.repeat_interleave(input, 2, dim=-1)[..., ::2]`.\n"
-    "- If you are using `make_non_contiguous` in the PyTorch test suite, use "
-    "`torch.testing._internal.common_utils.noncontiguous_like` instead."
-)
-def make_non_contiguous(tensor: torch.Tensor) -> torch.Tensor:
-    if tensor.numel() <= 1:  # can't make non-contiguous
-        return tensor.clone()
-    osize = list(tensor.size())
-
-    # randomly inflate a few dimensions in osize
-    for _ in range(2):
-        dim = random.randint(0, len(osize) - 1)
-        add = random.randint(4, 15)
-        osize[dim] = osize[dim] + add
-
-    # narrow doesn't make a non-contiguous tensor if we only narrow the 0-th dimension,
-    # (which will always happen with a 1-dimensional tensor), so let's make a new
-    # right-most dimension and cut it off
-
-    input = tensor.new(torch.Size(osize + [random.randint(2, 3)]))
-    input = input.select(len(input.size()) - 1, random.randint(0, 1))
-    # now extract the input of correct size from 'input'
-    for i in range(len(osize)):
-        if input.size(i) != tensor.size(i):
-            bounds = random.randint(1, input.size(i) - tensor.size(i))
-            input = input.narrow(i, bounds, tensor.size(i))
-
-    input.copy_(tensor)
-
-    # Use .data here to hide the view relation between input and other temporary Tensors
-    return input.data

From 522fc1cef0cf76ed5ec4cf6f468a332035097ea0 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Wed, 2 Nov 2022 11:25:06 +0100
Subject: [PATCH 0452/1922] remove assert_allclose from torch.testing (#87974)

See #87969 or #86586 for the reasoning.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87974
Approved by: https://github.com/mruberry
---
 caffe2/python/operator_test/_utils.py |  2 +-
 torch/testing/__init__.py             |  1 -
 torch/testing/_deprecated.py          | 77 ---------------------------
 3 files changed, 1 insertion(+), 79 deletions(-)
 delete mode 100644 torch/testing/_deprecated.py

diff --git a/caffe2/python/operator_test/_utils.py b/caffe2/python/operator_test/_utils.py
index 3ee1def89e715..1c3a105085e96 100644
--- a/caffe2/python/operator_test/_utils.py
+++ b/caffe2/python/operator_test/_utils.py
@@ -1,5 +1,5 @@
 """
-This file only exists since `torch.testing.assert_allclose` is deprecated, but used extensively throughout the tests in
+This file only exists since `torch.testing.assert_allclose` was removed, but used extensively throughout the tests in
 this package. The replacement `torch.testing.assert_close` doesn't support one feature that is needed here: comparison
 between numpy arrays and torch tensors. See https://github.com/pytorch/pytorch/issues/61844 for the reasoning why this
 was removed.
diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py
index ad69ef1d24901..ed4922aab9b60 100644
--- a/torch/testing/__init__.py
+++ b/torch/testing/__init__.py
@@ -1,4 +1,3 @@
 from ._comparison import assert_close as assert_close
 from torch._C import FileCheck as FileCheck
 from ._creation import make_tensor as make_tensor
-from ._deprecated import *  # noqa: F403
diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
deleted file mode 100644
index 731158ddb41ee..0000000000000
--- a/torch/testing/_deprecated.py
+++ /dev/null
@@ -1,77 +0,0 @@
-"""This module exists since the `torch.testing` exposed a lot of stuff that shouldn't have been public. Although this
-was never documented anywhere, some other internal FB projects as well as downstream OSS projects might use this. Thus,
-we don't internalize without warning, but still go through a deprecation cycle.
-"""
-
-import functools
-import warnings
-from typing import Any, Callable, Dict, Optional, Tuple, Union
-
-import torch
-
-
-__all__ = ["assert_allclose"]
-
-
-def warn_deprecated(instructions: Union[str, Callable[[str, Tuple[Any, ...], Dict[str, Any], Any], str]]) -> Callable:
-    def outer_wrapper(fn: Callable) -> Callable:
-        name = fn.__name__
-        head = f"torch.testing.{name}() is deprecated since 1.12 and will be removed in 1.14. "
-
-        @functools.wraps(fn)
-        def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
-            return_value = fn(*args, **kwargs)
-            tail = instructions(name, args, kwargs, return_value) if callable(instructions) else instructions
-            msg = (head + tail).strip()
-            warnings.warn(msg, FutureWarning)
-            return return_value
-
-        return inner_wrapper
-
-    return outer_wrapper
-
-
-_DTYPE_PRECISIONS = {
-    torch.float16: (1e-3, 1e-3),
-    torch.float32: (1e-4, 1e-5),
-    torch.float64: (1e-5, 1e-8),
-}
-
-
-def _get_default_rtol_and_atol(actual: torch.Tensor, expected: torch.Tensor) -> Tuple[float, float]:
-    actual_rtol, actual_atol = _DTYPE_PRECISIONS.get(actual.dtype, (0.0, 0.0))
-    expected_rtol, expected_atol = _DTYPE_PRECISIONS.get(expected.dtype, (0.0, 0.0))
-    return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol)
-
-
-@warn_deprecated(
-    "Use torch.testing.assert_close() instead. "
-    "For detailed upgrade instructions see https://github.com/pytorch/pytorch/issues/61844."
-)
-def assert_allclose(
-    actual: Any,
-    expected: Any,
-    rtol: Optional[float] = None,
-    atol: Optional[float] = None,
-    equal_nan: bool = True,
-    msg: str = "",
-) -> None:
-    if not isinstance(actual, torch.Tensor):
-        actual = torch.tensor(actual)
-    if not isinstance(expected, torch.Tensor):
-        expected = torch.tensor(expected, dtype=actual.dtype)
-
-    if rtol is None and atol is None:
-        rtol, atol = _get_default_rtol_and_atol(actual, expected)
-
-    torch.testing.assert_close(
-        actual,
-        expected,
-        rtol=rtol,
-        atol=atol,
-        equal_nan=equal_nan,
-        check_device=True,
-        check_dtype=False,
-        check_stride=False,
-        msg=msg or None,
-    )

From 0cbca53984d8b8a701e061746639a42d626af95e Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Tue, 1 Nov 2022 20:06:44 -0700
Subject: [PATCH 0453/1922] propagate .meta info when replacing subgraphs in fx
 (#87255)

Fixes https://github.com/pytorch/torchdynamo/issues/1708

Our FX subgraph partitioner works by taking all of the original output nodes from a subgraph, and replacing it with a new `call_module` node in the graph.

If the original subgraph outputs had fake tensors and other metadata stored in their `.meta` attribute though, then this information was getting lost when we spliced in the subgraph.

Losing metadata on an FX graph also seems like an easy trap to fall into, so I'm wondering if there are any better guardrails that we can add. I ended up fixing in this PR by adding an optional kwarg to propagate meta info directly in the `fx.Node.replace_all_uses_with`, just because propagating metadata seems like a pretty core thing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87255
Approved by: https://github.com/wconstab, https://github.com/SherlockNoMad
---
 ...compat-fx_backcompat_function_signatures.expect |  2 +-
 torch/fx/node.py                                   | 14 +++++++++++++-
 torch/fx/passes/utils/fuser_utils.py               |  4 ++--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
index 01ac1efffd29e..9c2a5d7b762f8 100644
--- a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
+++ b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
@@ -53,7 +53,7 @@ torch.fx.node.Node.__init__(self, graph: 'Graph', name: str, op: str, target: 'T
 torch.fx.node.Node.append(self, x: 'Node') -> None
 torch.fx.node.Node.format_node(self, placeholder_names: Optional[List[str]] = None, maybe_return_typename: Optional[List[str]] = None) -> Optional[str]
 torch.fx.node.Node.prepend(self, x: 'Node') -> None
-torch.fx.node.Node.replace_all_uses_with(self, replace_with: 'Node', delete_user_cb: Callable[[Node], bool] = <function <lambda>>) -> List[Node]
+torch.fx.node.Node.replace_all_uses_with(self, replace_with: 'Node', delete_user_cb: Callable[[Node], bool] = <function <lambda>>, propagate_meta = False) -> List[Node]
 torch.fx.node.Node.replace_input_with(self, old_input: 'Node', new_input: 'Node')
 torch.fx.node.Node.update_arg(self, idx: int, arg: torch.fx.node.Argument) -> None
 torch.fx.node.Node.update_kwarg(self, key: str, arg: torch.fx.node.Argument) -> None
diff --git a/torch/fx/node.py b/torch/fx/node.py
index bb80f3f022f49..0505b39565cd7 100644
--- a/torch/fx/node.py
+++ b/torch/fx/node.py
@@ -468,7 +468,9 @@ def format_node(self,
     @compatibility(is_backward_compatible=True)
     def replace_all_uses_with(self,
                               replace_with : 'Node',
-                              delete_user_cb: Callable[['Node'], bool] = lambda user: True
+                              delete_user_cb: Callable[['Node'], bool] = lambda user: True,
+                              *,
+                              propagate_meta=False
                               ) -> List['Node']:
         """
         Replace all uses of ``self`` in the Graph with the Node ``replace_with``.
@@ -478,11 +480,21 @@ def replace_all_uses_with(self,
             replace_with (Node): The node to replace all uses of ``self`` with.
             delete_user_cb (Callable): Callback that is called to determine
               whether a given user of the self node should be removed.
+            propagate_meta (bool): Whether or not to copy all properties
+              on the .meta field of the original node onto the replacement node.
+              For safety, this is only valid to do if the replacement node
+              doesn't already have an existing .meta field.
 
         Returns:
 
             The list of Nodes on which this change was made.
         """
+        if propagate_meta:
+            assert len(replace_with.meta) == 0, \
+                'Called node.replace_all_uses_with(replace_with, propagate_meta=True), ' \
+                'but replace_with already has .meta keys'
+            for k, v in self.meta.items():
+                replace_with.meta[k] = v
         to_process = list(self.users)
         skipped = []
         for use_node in to_process:
diff --git a/torch/fx/passes/utils/fuser_utils.py b/torch/fx/passes/utils/fuser_utils.py
index f3d5f02421690..9eddc2befd044 100644
--- a/torch/fx/passes/utils/fuser_utils.py
+++ b/torch/fx/passes/utils/fuser_utils.py
@@ -181,12 +181,12 @@ def insert_subgm(gm: GraphModule, sub_gm: GraphModule, orig_inputs: Tuple[Node,
 
     if len(orig_outputs) == 1:
         # main_remapping[comp.orig_outputs[0]] = module_node
-        orig_outputs[0].replace_all_uses_with(module_node)
+        orig_outputs[0].replace_all_uses_with(module_node, propagate_meta=True)
     else:
         for i, orig_output in enumerate(orig_outputs):
             # Use Proxy to record getitem access.
             proxy_out = torch.fx.Proxy(module_node)[i].node  # type: ignore[index]
-            orig_output.replace_all_uses_with(proxy_out)
+            orig_output.replace_all_uses_with(proxy_out, propagate_meta=True)
     return gm
 
 def erase_nodes(gm: GraphModule, nodes: NodeList):

From 5874f6c71c6a2c38c1cbea0a8b6a5fb24c23c6a1 Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Tue, 1 Nov 2022 20:06:44 -0700
Subject: [PATCH 0454/1922] fix as_strided_scatter_backward (#87646)

as_strided_scatter's derivative formula was broken - instead of making a "mask" of 1's and 0's, it would effectively make a mask of 1's and uninitialized memory.

Fixes https://github.com/pytorch/pytorch/issues/88105

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87646
Approved by: https://github.com/albanD
---
 test/functorch/test_aotdispatch.py                   |  1 -
 test/functorch/test_ops.py                           |  4 ----
 torch/csrc/autograd/FunctionsManual.cpp              |  6 ++++--
 .../testing/_internal/common_methods_invocations.py  | 12 +-----------
 4 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 4940974cd7e4b..16cdd78121d83 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -952,7 +952,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('linalg.eig'),
     xfail('scatter_reduce', 'prod'),
 
-    # non-deterministic
     skip('as_strided_scatter'),
 
     # Too annoying to generate random inputs
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index bda05d970a5e9..74085941c6c88 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -343,7 +343,6 @@ class TestOperators(TestCase):
     @skipOps('TestOperators', 'test_grad', vjp_fail.union({
         xfail('linalg.eig'),  # diagonal_scatter does not support complex
         xfail('chalf', '', device_type='cpu'),  # RuntimeError: "sum_cpu" not implemented for 'ComplexHalf'
-        skip('as_strided_scatter', ''),  # silent incorrectness; seems flaky
         xfail('sparse.sampled_addmm', ''),  # RuntimeError: Sparse CSR tensors do not have strides
     }))
     @opsToleranceOverride('TestOperators', 'test_grad', (
@@ -476,7 +475,6 @@ def maybe_clone_inputs():
 
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vjp', vjp_fail.union({
-        skip('as_strided_scatter', ''),  # silent incorrectness; also might be flaky
         xfail('sparse.sampled_addmm', ''),
     }))
     @opsToleranceOverride('TestOperators', 'test_vjp', (
@@ -1215,7 +1213,6 @@ def get_vjp(cotangents, *primals):
         xfail('nn.functional.multi_margin_loss', ''),  # NYI: forward AD with multi_margin_loss
         skip('linalg.householder_product', '', device_type='cuda'),  # flaky, I'm not sure why
         xfail('sparse.sampled_addmm', ''),  # Sparse tensors have no strides
-        skip('as_strided_scatter', ''),  # seems flaky
         xfail('segment_reduce', 'offsets'),  # NYI: forward-AD for segment_reduce
         xfail('index_reduce', ''),  # NYI: forward-AD for index_reduce
         xfail('segment_reduce', 'lengths'),  # NYI: forward-AD for segment_reduce
@@ -1614,7 +1611,6 @@ def fn(input, weight, bias):
         skip('linalg.multi_dot', '', device_type='cpu'),
         skip('sparse.sampled_addmm', ''),
         skip('native_layer_norm', '', device_type='cpu'),
-        xfail('as_strided_scatter', ''),
     })
     @opsToleranceOverride('TestOperators', 'test_vmap_autograd_grad', (
         tol1('linalg.householder_product',
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 3358d96569598..1f6ba62c41f84 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -2906,8 +2906,10 @@ Tensor as_strided_scatter_backward(
   // take the perf hit and contiguify grad for now.
   auto grad_ = grad.contiguous();
   auto grad_slice = grad_.as_strided_symint(sizes, strides, storage_offset);
-  auto result = grad_.new_empty_strided_symint(
-      input_geometry.sym_sizes(), input_geometry.sym_strides());
+  auto result =
+      grad_.new_zeros_symint(input_geometry.sym_sizes())
+          .as_strided_symint(
+              input_geometry.sym_sizes(), input_geometry.sym_strides());
   auto result_slice = result.as_strided_symint(sizes, strides, storage_offset);
   result_slice.copy_(grad_slice);
   return result;
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 7c501fb411ed3..20e961341ca26 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -10406,21 +10406,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            check_inplace_batched_forward_grad=False,
            sample_inputs_func=sample_inputs_as_strided_scatter,
            skips=(
-               DecorateInfo(unittest.skip('Works only for CPU complex64'), 'TestMathBits', 'test_conj_view'),
-               DecorateInfo(unittest.skip('Works for float64, fails for everything else'), 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.skip('Works for int64, fails for everything else'), 'TestCommon', 'test_noncontiguous_samples'),  # noqa: B950
                DecorateInfo(unittest.skip('Fails in most cases, passes on LAZY for some reason'), 'TestCommon', 'test_variant_consistency_eager'),  # noqa: B950
-               DecorateInfo(unittest.skip('Only fails for LAZY, passes on everything else'), 'TestCompositeCompliance', 'test_backward'),  # noqa: B950
-               DecorateInfo(unittest.skip('Passes on complex64 and float32 only'), 'TestJit', 'test_variant_consistency_jit'),
                DecorateInfo(unittest.skip('Fails on cuda + rocm'), 'TestCommon', 'test_complex_half_reference_testing'),
                DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_grad'),
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_gradgrad'),
-               DecorateInfo(unittest.skip('Passes on complex128 and float64 only'), 'TestGradients', 'test_fn_fwgrad_bwgrad'),
-               # AssertionError: Tensor-likes are not close! (new_empty_strided.default)
-               DecorateInfo(unittest.skip("Expected: new_empty_strided is not comparable"), 'TestDecomp', 'test_comprehensive'),
-               DecorateInfo(
-                   unittest.skip("Some stride values write multiple values to the same location e.g. (1,1,1,1)"),
-                   'TestCommon', 'test_compare_cpu'),)),
+               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_gradgrad'),)),
     OpInfo('native_layer_norm',
            aten_name='native_layer_norm',
            ref=reference_native_layer_norm,

From ec95d9fdf7322f386bff32addb9abc9070ad7405 Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Tue, 1 Nov 2022 20:06:45 -0700
Subject: [PATCH 0455/1922] aot_dispatch test fix: always use functionalization
 in symbolic tests (#87647)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87647
Approved by: https://github.com/ezyang, https://github.com/Chillee
---
 test/functorch/test_aotdispatch.py | 8 +++++++-
 torch/_meta_registrations.py       | 5 +++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 16cdd78121d83..6850c8b313001 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1112,6 +1112,12 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('mvlgamma', 'mvlgamma_p_3'),  # aten.digamma_.default - couldn't find symbolic meta function/decom...
     xfail('mvlgamma', 'mvlgamma_p_5'),  # aten.digamma_.default - couldn't find symbolic meta function/decom...
     xfail('nanmedian', ''),  # aten.logical_or_.default - couldn't find symbolic meta function/decomposition
+
+    # Deleting this in a followup
+    xfail('nn.functional.feature_alpha_dropout', 'with_train'),
+    xfail('nn.functional.pad', 'circular'),
+    xfail('nn.functional.poisson_nll_loss', ''),
+
     xfail('nn.functional._scaled_dot_product_attention', ''),  # Cannot call sizes() on tensor with symbolic ...
     xfail('nn.functional.adaptive_avg_pool3d', ''),  # aten._adaptive_avg_pool3d_backward.default - couldn't ...
     xfail('nn.functional.adaptive_max_pool1d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1308,7 +1314,7 @@ def test_aot_autograd_exhaustive(self, device, dtype, op):
     @skipIfNoSympy
     @patch("functorch.compile.config.use_dynamic_shapes", True)
     @patch("functorch.compile.config.use_fake_tensor", True)
-    @patch("functorch.compile.config.use_functionalize", False)
+    @patch("functorch.compile.config.use_functionalize", True)
     @skipOps('TestEagerFusionOpInfo', 'test_aot_autograd_symbolic_exhaustive',
              aot_autograd_failures | symbolic_aot_autograd_failures)
     def test_aot_autograd_symbolic_exhaustive(self, device, dtype, op):
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 2e8845c68e868..e42cc7de4f675 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1104,6 +1104,11 @@ def meta_zero_(self):
     return self
 
 
+@register_meta(aten.zero.default)
+def meta_zero(self):
+    return self.new_empty(self.shape)
+
+
 @register_meta([aten.fill_.Tensor, aten.fill_.Scalar])
 def meta_fill_(self, val):
     return self

From d856fc9f7c795bf4b24fa5d78e92d8b5080f069d Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@fb.com>
Date: Tue, 1 Nov 2022 21:39:05 -0700
Subject: [PATCH 0456/1922] [profiler] Add Performance events support in Kineto
 profiler (#87874)

* Wiring to allow user to pass event names to profiler and reflect the count to the chrometrace
* If not used, the runtime and size overhead should be neglegible
* For now, primary user will be KinetoEdgeCPUProfiler but the impl does not assume that
* Not exposed to python yet

Differential Revision: [D40238032](https://our.internmc.facebook.com/intern/diff/D40238032/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D40238032/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87874
Approved by: https://github.com/SS-JIA
---
 test/cpp/profiler/perf_events.cpp             | 52 ++++++++++++++
 torch/csrc/autograd/profiler_kineto.cpp       | 72 +++++++++++++++++--
 torch/csrc/autograd/profiler_kineto.h         |  2 +
 torch/csrc/profiler/collection.cpp            | 13 +++-
 torch/csrc/profiler/collection.h              | 17 ++++-
 .../csrc/profiler/orchestration/observer.cpp  |  6 +-
 torch/csrc/profiler/orchestration/observer.h  |  8 ++-
 7 files changed, 160 insertions(+), 10 deletions(-)
 create mode 100644 test/cpp/profiler/perf_events.cpp

diff --git a/test/cpp/profiler/perf_events.cpp b/test/cpp/profiler/perf_events.cpp
new file mode 100644
index 0000000000000..03f9fbaf8a592
--- /dev/null
+++ b/test/cpp/profiler/perf_events.cpp
@@ -0,0 +1,52 @@
+
+#include <gtest/gtest.h>
+
+#include <torch/csrc/profiler/events.h>
+#include <torch/csrc/profiler/perf.h>
+
+double calc_pi() {
+  volatile double pi = 1.0;
+  for (int i = 3; i < 100000; i += 2) {
+    pi += (((i + 1) >> 1) % 2) ? 1.0 / i : -1.0 / i;
+  }
+  return pi * 4.0;
+}
+
+TEST(ProfilerTest, LinuxPerf) {
+  torch::profiler::impl::linux_perf::PerfProfiler profiler;
+
+  std::vector<std::string> standard_events(
+      std::begin(torch::profiler::ProfilerPerfEvents),
+      std::end(torch::profiler::ProfilerPerfEvents));
+  torch::profiler::perf_counters_t counters;
+  counters.resize(standard_events.size(), 0);
+
+  // Use try..catch HACK to check TORCH_CHECK because we don't yet fail
+  // gracefully if the syscall were to fail
+  try {
+    profiler.Configure(standard_events);
+
+    profiler.Enable();
+    auto pi = calc_pi();
+    profiler.Disable(counters);
+  } catch (const c10::Error&) {
+    // Bail here if something bad happened during the profiling, we don't want
+    // to make the test fail
+    return;
+  } catch (...) {
+    // something else went wrong - this should be reported
+    ASSERT_EQ(0, 1);
+  }
+
+  // Should have counted something if worked, so lets test that
+  // And if it not supported the counters should be zeros.
+#if defined(__ANDROID__) || defined(__linux__)
+  for (auto counter : counters) {
+    ASSERT_GT(counter, 0);
+  }
+#else /* __ANDROID__ || __linux__ */
+  for (auto counter : counters) {
+    ASSERT_EQ(counter, 0);
+  }
+#endif /* __ANDROID__ || __linux__ */
+}
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index c7ab982b38897..5f62e12d211bb 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -1,3 +1,4 @@
+#include <cstring>
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <torch/csrc/autograd/profiler_kineto.h>
 
@@ -12,8 +13,10 @@
 #include <torch/csrc/profiler/api.h>
 #include <torch/csrc/profiler/collection.h>
 #include <torch/csrc/profiler/containers.h>
+#include <torch/csrc/profiler/events.h>
 #include <torch/csrc/profiler/kineto_shim.h>
 #include <torch/csrc/profiler/orchestration/observer.h>
+#include <torch/csrc/profiler/perf.h>
 #include <torch/csrc/profiler/standalone/itt_observer.h>
 #include <torch/csrc/profiler/standalone/nvtx_observer.h>
 #include <torch/csrc/profiler/util.h>
@@ -134,10 +137,12 @@ struct AddTensorboardFields : public MetadataBase {
 };
 
 struct AddGenericMetadata : public MetadataBase {
-  AddGenericMetadata(std::shared_ptr<Result>& result, const bool verbose)
-      : MetadataBase(result) {
+  AddGenericMetadata(
+      std::shared_ptr<Result>& result,
+      const torch::profiler::impl::ProfilerConfig* config)
+      : MetadataBase(result), config_(config) {
     result->visit(*this);
-    if (verbose) {
+    if (config->experimental_config.verbose) {
       result->visit_if_base<PyExtraFieldsBase>(
           [&, this](const auto& i) -> void {
             this->addMetadata("Python thread", std::to_string(i.python_tid_));
@@ -156,6 +161,15 @@ struct AddGenericMetadata : public MetadataBase {
       addMetadata("Input type", dtypesToStr(dtypes));
     }
 
+    if (config_ && !config_->experimental_config.performance_events.empty()) {
+      auto& event_names = config_->experimental_config.performance_events;
+      for (auto i = 0; i < op_event.perf_event_counters_->size(); ++i) {
+        addMetadata(
+            event_names[i],
+            std::to_string((*op_event.perf_event_counters_)[i]));
+      }
+    }
+
     // add information about an associated forward op, if a sequence number
     // is available (e.g. during training)
     if (op_event.sequence_number_ >= 0) {
@@ -197,6 +211,10 @@ struct AddGenericMetadata : public MetadataBase {
 
   template <typename T>
   void operator()(const T&) {}
+
+ private:
+  /* To get names of the performance events */
+  const torch::profiler::impl::ProfilerConfig* config_;
 };
 
 // Assumption: Total threads number will not exceed 2^16-1, and total ops will
@@ -315,7 +333,7 @@ struct KinetoThreadLocalState : public ProfilerStateBase {
 
         kineto_events_.emplace_back(e, config_.experimental_config.verbose);
         AddTensorboardFields add_tb(e, kineto_events_.back());
-        AddGenericMetadata add_generic(e, config_.experimental_config.verbose);
+        AddGenericMetadata add_generic(e, &config_);
 
         // It is not safe to use the activity after post processing.
         e->kineto_activity_ = nullptr;
@@ -433,6 +451,10 @@ void onFunctionExit(
   TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
   kineto_ctx_ptr->event_->end_time_ =
       torch::profiler::impl::getApproximateTime();
+  if (!config.experimental_config.performance_events.empty()) {
+    state_ptr->record_queue_.getSubqueue()->disable_perf_profiler(
+        *kineto_ctx_ptr->event_->counters_);
+  }
   kineto_ctx_ptr->event_->basic_fields_.end_tid_ =
       at::RecordFunction::currentThreadId();
   if (config.state == ProfilerState::KINETO_GPU_FALLBACK) {
@@ -518,6 +540,33 @@ void prepareProfiler(
       "Supported only in Kineto profiler");
   torch::profiler::impl::kineto::prepareTrace(
       /*cpuOnly=*/!at::hasCUDA(), activities, config.experimental_config);
+
+  if (config.experimental_config.performance_events.size()) {
+    /* For now only CPU activity is supported */
+    TORCH_CHECK(
+        activities.count(torch::autograd::profiler::ActivityType::CPU),
+        "Cannot run cpu hardware profiler without CPU activities, please only use CPU activity type");
+    /*
+     * Sending a warning and passing the non-standard event to the backend
+     * Backend can abort if the event is not supported.
+     * TODO Should we gracefully drop the invalid event if we have atleast one
+     * valid?
+     */
+    auto is_standard_event = [](const std::string& event) -> bool {
+      for (auto e : torch::profiler::ProfilerPerfEvents) {
+        if (!std::strcmp(event.c_str(), e)) {
+          return true;
+        }
+      }
+      return false;
+    };
+
+    for (const auto& e : config.experimental_config.performance_events) {
+      if (!is_standard_event(e)) {
+        TORCH_WARN("Forwarding a non-standard CPU performance event : ", e);
+      }
+    }
+  }
 }
 
 void enableProfilerWithEventPostProcess(
@@ -709,6 +758,21 @@ int64_t KinetoEvent::cudaElapsedUs() const {
   return -1;
 }
 
+void KinetoEvent::getPerfEventCounters(std::vector<uint64_t>& in) const {
+  return result_->visit(c10::overloaded(
+      [&in](const ExtraFields<EventType::TorchOp>& e) -> void {
+        const size_t n = e.perf_event_counters_->size();
+        // should be rare
+        if (in.size() < n) {
+          in.resize(n, 0);
+        }
+        for (size_t i = 0; i < n; ++i) {
+          in[i] = (*e.perf_event_counters_)[i];
+        }
+      },
+      [](const auto&) -> void { return; }));
+}
+
 #define FORWARD_FROM_RESULT(method_name, result_expr)                        \
   decltype(std::declval<KinetoEvent>().method_name())                        \
   KinetoEvent::method_name() const {                                         \
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h
index 5e5b430aa2814..180657df63ee1 100644
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -4,6 +4,7 @@
 #include <vector>
 
 #include <torch/csrc/profiler/api.h>
+#include <torch/csrc/profiler/events.h>
 #include <torch/csrc/profiler/stubs/base.h>
 #include <torch/csrc/profiler/util.h>
 
@@ -54,6 +55,7 @@ struct TORCH_API KinetoEvent {
   std::string backend() const;
   bool isPythonFunction() const;
   int64_t cudaElapsedUs() const;
+  void getPerfEventCounters(torch::profiler::perf_counters_t&) const;
 
  private:
   torch::profiler::impl::ProfilerEventStub fallbackStart() const;
diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index 239d74cdf2801..30a71e9437383 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -246,6 +246,11 @@ std::unique_ptr<KinetoObserverContext> ThreadLocalSubqueue::begin_op(
 
   event->start_time_ = torch::profiler::impl::getApproximateTime();
   event->allow_tf32_cublas_ = at::globalContext().allowTF32CuBLAS();
+  if (!config_.experimental_config.performance_events.empty()) {
+    const size_t n = config_.experimental_config.performance_events.size();
+    event->counters_ = std::make_unique<perf_counters_t>(n, 0);
+    perf_profiler_->Enable();
+  }
   return out;
 }
 
@@ -327,7 +332,8 @@ void ThreadLocalSubqueue::TorchOpStorage::materialize(
         jit_module(),
         extra_args(),
         gpu_fallback(),
-        event->allow_tf32_cublas_};
+        event->allow_tf32_cublas_,
+        std::move(event->counters_)};
 
     out.emplace_back(Result::create(
         time_converter(event->start_time_), tid, kineto_info, std::move(e)));
@@ -468,6 +474,11 @@ ThreadLocalSubqueue::ThreadLocalSubqueue(
     const ProfilerConfig& config)
     : tid_{tid}, config_{config}, kineto_info_{kineto::kineto_ids()} {
   torch::profiler::impl::kineto::recordThreadInfo();
+  if (config_.experimental_config.performance_events.size()) {
+    perf_profiler_ =
+        std::make_unique<torch::profiler::impl::linux_perf::PerfProfiler>();
+    perf_profiler_->Configure(config_.experimental_config.performance_events);
+  }
 }
 
 RecordQueue::RecordQueue(
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index 096568285a713..a2f1291bc8dc2 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -15,8 +15,10 @@
 #include <c10/util/variant.h>
 #include <torch/csrc/profiler/containers.h>
 #include <torch/csrc/profiler/data_flow.h>
+#include <torch/csrc/profiler/events.h>
 #include <torch/csrc/profiler/kineto_shim.h>
 #include <torch/csrc/profiler/orchestration/python_tracer.h>
+#include <torch/csrc/profiler/perf.h>
 #include <torch/csrc/profiler/stubs/base.h>
 #include <torch/csrc/profiler/util.h>
 #include <torch/csrc/utils/python_stub.h>
@@ -133,7 +135,8 @@ struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
       jit_modules_t&& jit_modules,
       extra_args_t&& extra_args,
       FallbackPair&& gpu_fallback,
-      bool allow_tf32_cublas)
+      bool allow_tf32_cublas,
+      std::unique_ptr<perf_counters_t>&& perf_event_counters)
       : TorchOpBasicFields(std::move(f)),
         correlation_id_{correlation_id},
         end_time_ns_{end_time_ns},
@@ -142,7 +145,8 @@ struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
         jit_modules_{std::move(jit_modules)},
         extra_args_{std::move(extra_args)},
         gpu_fallback_{std::move(gpu_fallback)},
-        allow_tf32_cublas_{allow_tf32_cublas} {}
+        allow_tf32_cublas_{allow_tf32_cublas},
+        perf_event_counters_{std::move(perf_event_counters)} {}
   uint64_t correlation_id_;
   time_t end_time_ns_;
   Inputs inputs_;
@@ -151,6 +155,7 @@ struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
   extra_args_t extra_args_;
   FallbackPair gpu_fallback_;
   bool allow_tf32_cublas_;
+  std::unique_ptr<perf_counters_t> perf_event_counters_;
 };
 
 template <>
@@ -406,6 +411,7 @@ struct KinetoObserverContext : public at::ObserverContext {
     approx_time_t end_time_{std::numeric_limits<approx_time_t>::min()};
 
     bool allow_tf32_cublas_;
+    std::unique_ptr<perf_counters_t> counters_;
   };
 
   explicit KinetoObserverContext(Event* event) : event_{event} {}
@@ -448,6 +454,8 @@ class InputOutputEncoder final {
   AppendOnlyList<c10::IValue, IO_ENCODER_DEFAULT_BLOCK_SIZE> ivalues_;
 };
 
+using perf_profiler_t = torch::profiler::impl::linux_perf::PerfProfiler;
+
 class TORCH_API ThreadLocalSubqueue {
  public:
   ThreadLocalSubqueue(const uint64_t tid, const ProfilerConfig& config);
@@ -482,10 +490,15 @@ class TORCH_API ThreadLocalSubqueue {
     return kineto_info_;
   }
 
+  inline void disable_perf_profiler(perf_counters_t& counters) const {
+    perf_profiler_->Disable(counters);
+  }
+
  private:
   uint64_t tid_;
   ProfilerConfig config_;
   kineto::DeviceAndResource kineto_info_;
+  std::unique_ptr<perf_profiler_t> perf_profiler_;
 
   friend class RecordQueue;
   // See `containers.h` for block size benchmarks.
diff --git a/torch/csrc/profiler/orchestration/observer.cpp b/torch/csrc/profiler/orchestration/observer.cpp
index 094121c38ec28..d4920c1709208 100644
--- a/torch/csrc/profiler/orchestration/observer.cpp
+++ b/torch/csrc/profiler/orchestration/observer.cpp
@@ -14,10 +14,12 @@ using GlobalManager = GlobalStateManager<ProfilerStateBase>;
 ExperimentalConfig::ExperimentalConfig(
     std::vector<std::string> profiler_metrics,
     bool profiler_measure_per_kernel,
-    bool verbose)
+    bool verbose,
+    std::vector<std::string> performance_events)
     : profiler_metrics{profiler_metrics},
       profiler_measure_per_kernel{profiler_measure_per_kernel},
-      verbose{verbose} {}
+      verbose{verbose},
+      performance_events(std::move(performance_events)) {}
 
 /*explicit*/ ExperimentalConfig::operator bool() const {
   return !profiler_metrics.empty();
diff --git a/torch/csrc/profiler/orchestration/observer.h b/torch/csrc/profiler/orchestration/observer.h
index 7aa5dc693becb..d9d89aa3a41cc 100644
--- a/torch/csrc/profiler/orchestration/observer.h
+++ b/torch/csrc/profiler/orchestration/observer.h
@@ -40,13 +40,19 @@ struct TORCH_API ExperimentalConfig {
   ExperimentalConfig(
       std::vector<std::string> profiler_metrics = {},
       bool profiler_measure_per_kernel = false,
-      bool verbose = false);
+      bool verbose = false,
+      std::vector<std::string> performance_events = {});
   ~ExperimentalConfig() = default;
   explicit operator bool() const;
 
   std::vector<std::string> profiler_metrics;
   bool profiler_measure_per_kernel;
   bool verbose;
+  /*
+   * List of performance events to be profiled.
+   * An empty list will disable performance event based profiling altogether.
+   */
+  std::vector<std::string> performance_events;
 };
 
 struct TORCH_API ProfilerConfig {

From 3c235664acbbcebec3fb85c61156282d728580b2 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@fb.com>
Date: Tue, 1 Nov 2022 21:39:07 -0700
Subject: [PATCH 0457/1922] [edge profiler] Add support for performance events
 counting (#87876)

* Add support in lite_predictor benchmark binary to select event lists
* Uses Linux perf through Kineto profiler

Differential Revision: [D39837216](https://our.internmc.facebook.com/intern/diff/D39837216/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D39837216/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87876
Approved by: https://github.com/SS-JIA
---
 torch/csrc/jit/mobile/profiler_edge.cpp | 12 ++++++++++--
 torch/csrc/jit/mobile/profiler_edge.h   |  3 ++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/jit/mobile/profiler_edge.cpp b/torch/csrc/jit/mobile/profiler_edge.cpp
index d3dc596ca3dcc..8fdd1654082ae 100644
--- a/torch/csrc/jit/mobile/profiler_edge.cpp
+++ b/torch/csrc/jit/mobile/profiler_edge.cpp
@@ -18,15 +18,23 @@ KinetoEdgeCPUProfiler::KinetoEdgeCPUProfiler(
     const bool profile_memory,
     const bool with_stack,
     const bool with_flops,
-    const bool with_modules)
+    const bool with_modules,
+    std::vector<std::string> events)
     : m_(m), trace_file_name_(fname) {
+  torch::profiler::impl::ExperimentalConfig experimental_config;
+  // Enable hardware counters
+  if (events.size()) {
+    experimental_config.performance_events = std::move(events);
+  }
+
   torch::profiler::impl::ProfilerConfig config(
       torch::profiler::impl::ProfilerState::KINETO,
       report_input_shapes,
       profile_memory,
       with_stack,
       with_flops,
-      with_modules);
+      with_modules,
+      experimental_config);
   torch::autograd::profiler::prepareProfiler(
       config, {torch::autograd::profiler::ActivityType::CPU});
   if (with_modules || with_stack) {
diff --git a/torch/csrc/jit/mobile/profiler_edge.h b/torch/csrc/jit/mobile/profiler_edge.h
index 52dc26d1221a7..2a89819e700cd 100644
--- a/torch/csrc/jit/mobile/profiler_edge.h
+++ b/torch/csrc/jit/mobile/profiler_edge.h
@@ -55,7 +55,8 @@ class TORCH_API KinetoEdgeCPUProfiler {
       const bool profile_memory = false,
       const bool with_stack = false,
       const bool with_flops = false,
-      const bool with_modules = false);
+      const bool with_modules = false,
+      std::vector<std::string> events = {});
 
   const std::unique_ptr<torch::autograd::profiler::ProfilerResult>&
   disableProfiler();

From e5630fc14a209ad2c421fed89929accc4dc5b87a Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@fb.com>
Date: Tue, 1 Nov 2022 21:39:09 -0700
Subject: [PATCH 0458/1922] [edge profiler] Add e2e test for profiler event and
 chrometrace (#87877)

* Runs an existing model and checks an aten op if it gets perf events generated in the chrometrace
* Doesn't check for exact values since that's harder to do in a hardware independent way

Differential Revision: [D40474957](https://our.internmc.facebook.com/intern/diff/D40474957/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87877
Approved by: https://github.com/SS-JIA
---
 .../test_mobile_profiler.cpp                  | 57 ++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
index 867b775c1adb4..08cb81ae78763 100644
--- a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
+++ b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
@@ -9,6 +9,8 @@
 
 #include <unordered_set>
 
+#include <torch/csrc/profiler/events.h>
+
 #ifdef EDGE_PROFILER_USE_KINETO
 namespace torch {
 namespace jit {
@@ -25,7 +27,10 @@ bool checkMetaData(
     if (line.find(op_name) != std::string::npos) {
       while (std::getline(trace_file, line)) {
         if (line.find(metadata_name) != std::string::npos) {
-          if (line.find(metadata_val) != std::string::npos) {
+          if (line.find(metadata_val) != std::string::npos ||
+              !metadata_val.size()) {
+            /* if found the right metadata_val OR if expected
+             * metadata value is an empty string then ignore the matadata_val */
             return true;
           }
         }
@@ -157,6 +162,56 @@ TEST(MobileProfiler, BackendMemoryEvents) {
   ASSERT_TRUE(checkMetaData("[memory]", metadata_name, "49152", trace_file));
 }
 
+TEST(MobileProfiler, ProfilerEvent) {
+  /*
+   * TODO: Using __FILE__ is unreliable e.g. it fails to resolve correctly when
+   * using buck2, works ok with buck1
+   */
+  std::string filePath(__FILE__);
+  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
+  testModelFile.append("test_backend_for_profiling.ptl");
+
+  std::vector<IValue> inputs;
+  inputs.emplace_back(at::rand({64, 64}));
+  inputs.emplace_back(at::rand({64, 64}));
+  std::string trace_file_name("/tmp/test_trace_profiler_event.trace");
+
+  std::vector<std::string> events(
+      torch::profiler::ProfilerPerfEvents.begin(),
+      torch::profiler::ProfilerPerfEvents.end());
+
+  mobile::Module bc = _load_for_mobile(testModelFile);
+  {
+    // Bail if something goes wrong here
+    try {
+      KinetoEdgeCPUProfiler profiler(
+          bc,
+          trace_file_name,
+          false, // record input_shapes
+          false, // profile memory
+          true, // record callstack
+          false, // record flops
+          true, // record module hierarchy
+          events); // performance events
+      bc.forward(inputs);
+    } catch (...) {
+      return;
+    }
+  } // End of profiler
+  std::ifstream trace_file(trace_file_name);
+  std::string line;
+  ASSERT_TRUE(trace_file.is_open());
+
+  for (auto& event : events) {
+    trace_file.seekg(0, std::ios_base::beg);
+    /*
+     * Just checking if the event entry exists in the chrometrace.
+     * Checking the value in a hardware independent matter is tricky.
+     */
+    ASSERT_TRUE(checkMetaData("aten::__getitem__", event, "", trace_file));
+  }
+}
+
 } // namespace mobile
 } // namespace jit
 } // namespace torch

From 6f804ee9462777dda966a43f44fb27b3cde6aaf7 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@fb.com>
Date: Tue, 1 Nov 2022 21:39:11 -0700
Subject: [PATCH 0459/1922] Nested profiling support for Linux-perf Profiler
 (#87904)

Add a stack of start counter values, and attribute each disable to the last enable

Differential Revision: [D40539212](https://our.internmc.facebook.com/intern/diff/D40539212/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87904
Approved by: https://github.com/SS-JIA
---
 test/cpp/profiler/perf_events.cpp | 196 ++++++++++++++++++++++++++++++
 torch/csrc/profiler/perf-inl.h    |  12 ++
 torch/csrc/profiler/perf.cpp      |  33 +++--
 torch/csrc/profiler/perf.h        |   6 +-
 4 files changed, 236 insertions(+), 11 deletions(-)

diff --git a/test/cpp/profiler/perf_events.cpp b/test/cpp/profiler/perf_events.cpp
index 03f9fbaf8a592..7740f42da4b52 100644
--- a/test/cpp/profiler/perf_events.cpp
+++ b/test/cpp/profiler/perf_events.cpp
@@ -50,3 +50,199 @@ TEST(ProfilerTest, LinuxPerf) {
   }
 #endif /* __ANDROID__ || __linux__ */
 }
+
+TEST(ProfilerTest, LinuxPerfNestedDepth) {
+  torch::profiler::impl::linux_perf::PerfProfiler profiler;
+
+  // Only monotonically increasing events will work
+  std::vector<std::string> standard_events(
+      std::begin(torch::profiler::ProfilerPerfEvents),
+      std::end(torch::profiler::ProfilerPerfEvents));
+
+  torch::profiler::perf_counters_t counters_A;
+  torch::profiler::perf_counters_t counters_B;
+  torch::profiler::perf_counters_t counters_C;
+
+  counters_A.resize(standard_events.size(), 0);
+  counters_B.resize(standard_events.size(), 0);
+  counters_C.resize(standard_events.size(), 0);
+
+  // Use try..catch HACK to check TORCH_CHECK because we don't yet fail
+  // gracefully if the syscall were to fail
+  try {
+    profiler.Configure(standard_events);
+
+    // * = work kernel calc_pi()
+    //
+    // A --*---+              +--*-- A
+    //         |              |
+    //         |              |
+    //       B +-*--+    +--*-+ B
+    //              |    |
+    //              |    |
+    //            C +-*--+ C
+    //
+
+    profiler.Enable();
+    auto A = calc_pi();
+
+    profiler.Enable();
+    auto B = calc_pi();
+
+    profiler.Enable();
+    auto C = calc_pi();
+    profiler.Disable(counters_C);
+
+    auto B2 = calc_pi();
+    profiler.Disable(counters_B);
+
+    auto A2 = calc_pi();
+    profiler.Disable(counters_A);
+  } catch (const c10::Error&) {
+    // Bail here if something bad happened during the profiling, we don't want
+    // to make the test fail
+    return;
+  } catch (...) {
+    // something else went wrong - this should be reported
+    ASSERT_EQ(0, 1);
+  }
+
+// for each counter, assert A > B > C
+#if defined(__ANDROID__) || defined(__linux__)
+  for (auto i = 0; i < standard_events.size(); ++i) {
+    ASSERT_GT(counters_A[i], counters_B[i]);
+    ASSERT_GT(counters_A[i], counters_C[i]);
+    ASSERT_GT(counters_B[i], counters_C[i]);
+    ASSERT_GT(counters_A[i], counters_B[i] + counters_C[i]);
+  }
+#else /* __ANDROID__ || __linux__ */
+  for (auto i = 0; i < standard_events.size(); ++i) {
+    ASSERT_EQ(counters_A[i], 0);
+    ASSERT_EQ(counters_B[i], 0);
+    ASSERT_EQ(counters_C[i], 0);
+  }
+#endif /* __ANDROID__ || __linux__ */
+}
+
+TEST(ProfilerTest, LinuxPerfNestedMultiple) {
+  torch::profiler::impl::linux_perf::PerfProfiler profiler;
+
+  // Only monotonically increasing events will work
+  std::vector<std::string> standard_events(
+      std::begin(torch::profiler::ProfilerPerfEvents),
+      std::end(torch::profiler::ProfilerPerfEvents));
+
+  torch::profiler::perf_counters_t counters_A;
+  torch::profiler::perf_counters_t counters_B;
+  torch::profiler::perf_counters_t counters_C;
+
+  counters_A.resize(standard_events.size(), 0);
+  counters_B.resize(standard_events.size(), 0);
+  counters_C.resize(standard_events.size(), 0);
+
+  // Use try..catch HACK to check TORCH_CHECK because we don't yet fail
+  // gracefully if the syscall were to fail
+  try {
+    profiler.Configure(standard_events);
+
+    // * = work kernel calc_pi()
+    //
+    // A --*---+    +---*----+    +--*-- A
+    //         |    |        |    |
+    //         |    |        |    |
+    //      B  +-**-+ B    C +-*--+ C
+
+    profiler.Enable();
+    auto A1 = calc_pi();
+
+    profiler.Enable();
+    auto B1 = calc_pi();
+    auto B2 = calc_pi();
+    profiler.Disable(counters_B);
+
+    auto A2 = calc_pi();
+
+    profiler.Enable();
+    auto C1 = calc_pi();
+    profiler.Disable(counters_C);
+
+    auto A3 = calc_pi();
+    profiler.Disable(counters_A);
+  } catch (const c10::Error&) {
+    // Bail here if something bad happened during the profiling, we don't want
+    // to make the test fail
+    return;
+  } catch (...) {
+    // something else went wrong - this should be reported
+    ASSERT_EQ(0, 1);
+  }
+
+// for each counter, assert A > B > C
+#if defined(__ANDROID__) || defined(__linux__)
+  for (auto i = 0; i < standard_events.size(); ++i) {
+    ASSERT_GT(counters_A[i], counters_B[i]);
+    ASSERT_GT(counters_A[i], counters_C[i]);
+    ASSERT_GT(counters_B[i], counters_C[i]);
+    ASSERT_GT(counters_A[i], counters_B[i] + counters_C[i]);
+  }
+#else /* __ANDROID__ || __linux__ */
+  for (auto i = 0; i < standard_events.size(); ++i) {
+    ASSERT_EQ(counters_A[i], 0);
+    ASSERT_EQ(counters_B[i], 0);
+    ASSERT_EQ(counters_C[i], 0);
+  }
+#endif /* __ANDROID__ || __linux__ */
+}
+
+TEST(ProfilerTest, LinuxPerfNestedSingle) {
+  torch::profiler::impl::linux_perf::PerfProfiler profiler;
+
+  // Only monotonically increasing events will work
+  std::vector<std::string> standard_events(
+      std::begin(torch::profiler::ProfilerPerfEvents),
+      std::end(torch::profiler::ProfilerPerfEvents));
+
+  torch::profiler::perf_counters_t counters_A;
+  torch::profiler::perf_counters_t counters_B;
+  torch::profiler::perf_counters_t counters_C;
+
+  counters_A.resize(standard_events.size(), 0);
+  counters_B.resize(standard_events.size(), 0);
+  counters_C.resize(standard_events.size(), 0);
+
+  // Use try..catch HACK to check TORCH_CHECK because we don't yet fail
+  // gracefully if the syscall were to fail
+  try {
+    profiler.Configure(standard_events);
+
+    profiler.Enable();
+    profiler.Enable();
+    profiler.Enable();
+    auto A1 = calc_pi();
+    profiler.Disable(counters_C);
+    profiler.Disable(counters_B);
+    profiler.Disable(counters_A);
+  } catch (const c10::Error&) {
+    // Bail here if something bad happened during the profiling, we don't want
+    // to make the test fail
+    return;
+  } catch (...) {
+    // something else went wrong - this should be reported
+    ASSERT_EQ(0, 1);
+  }
+
+// for each counter, assert A > B > C
+#if defined(__ANDROID__) || defined(__linux__)
+  for (auto i = 0; i < standard_events.size(); ++i) {
+    ASSERT_GE(counters_A[i], counters_B[i]);
+    ASSERT_GE(counters_A[i], counters_C[i]);
+    ASSERT_GE(counters_B[i], counters_C[i]);
+  }
+#else /* __ANDROID__ || __linux__ */
+  for (auto i = 0; i < standard_events.size(); ++i) {
+    ASSERT_EQ(counters_A[i], 0);
+    ASSERT_EQ(counters_B[i], 0);
+    ASSERT_EQ(counters_C[i], 0);
+  }
+#endif /* __ANDROID__ || __linux__ */
+}
diff --git a/torch/csrc/profiler/perf-inl.h b/torch/csrc/profiler/perf-inl.h
index ccc074df027f6..0dfa45ac6f2be 100644
--- a/torch/csrc/profiler/perf-inl.h
+++ b/torch/csrc/profiler/perf-inl.h
@@ -54,6 +54,18 @@ inline uint64_t PerfProfiler::CalcDelta(uint64_t start, uint64_t end) const {
   return end - start;
 }
 
+inline void PerfProfiler::StartCounting() const {
+  for (auto& e : events_) {
+    e.Enable();
+  }
+}
+
+inline void PerfProfiler::StopCounting() const {
+  for (auto& e : events_) {
+    e.Disable();
+  }
+}
+
 } // namespace linux_perf
 } // namespace impl
 } // namespace profiler
diff --git a/torch/csrc/profiler/perf.cpp b/torch/csrc/profiler/perf.cpp
index d369de98e76a3..c5b2125fe4c99 100644
--- a/torch/csrc/profiler/perf.cpp
+++ b/torch/csrc/profiler/perf.cpp
@@ -151,7 +151,6 @@ void PerfProfiler::Configure(std::vector<std::string>& event_names) {
     events_.emplace_back(name);
     events_.back().Init();
   }
-  start_values_.resize(events_.size(), 0);
 
   // TODO
   // Reset pthreadpool here to make sure we can attach to new children
@@ -159,24 +158,40 @@ void PerfProfiler::Configure(std::vector<std::string>& event_names) {
 }
 
 void PerfProfiler::Enable() {
-  TORCH_CHECK(!is_enabled_, "Nested perf event counting is not supported yet");
+  if (start_values_.size()) {
+    StopCounting();
+  }
+
+  start_values_.emplace(events_.size(), 0);
+
+  auto& sv = start_values_.top();
   for (int i = 0; i < events_.size(); ++i) {
-    start_values_[i] = events_[i].ReadCounter();
-    events_[i].Enable();
+    sv[i] = events_[i].ReadCounter();
   }
-  is_enabled_ = true;
+  StartCounting();
 }
 
 void PerfProfiler::Disable(perf_counters_t& vals) {
+  StopCounting();
   TORCH_CHECK(
       vals.size() == events_.size(),
       "Can not fit all perf counters in the supplied container");
-  TORCH_CHECK(is_enabled_, "Perf Profiler is not enabled");
+  TORCH_CHECK(
+      start_values_.size() > 0,
+      "PerfProfiler must be enabled before disabling");
+
+  /* Always connecting this disable event to the last enable event i.e. using
+   * whatever is on the top of the start counter value stack. */
+  perf_counters_t& sv = start_values_.top();
   for (int i = 0; i < events_.size(); ++i) {
-    events_[i].Disable();
-    vals[i] = CalcDelta(start_values_[i], events_[i].ReadCounter());
+    vals[i] = CalcDelta(sv[i], events_[i].ReadCounter());
+  }
+  start_values_.pop();
+
+  // Restore it for a parent
+  if (start_values_.size()) {
+    StartCounting();
   }
-  is_enabled_ = false;
 }
 } // namespace linux_perf
 } // namespace impl
diff --git a/torch/csrc/profiler/perf.h b/torch/csrc/profiler/perf.h
index 941f780f52814..88432a946f774 100644
--- a/torch/csrc/profiler/perf.h
+++ b/torch/csrc/profiler/perf.h
@@ -3,6 +3,7 @@
 #include <array>
 #include <cstdint>
 #include <memory>
+#include <stack>
 #include <string>
 #include <unordered_map>
 #include <utility>
@@ -92,10 +93,11 @@ class PerfProfiler {
 
  private:
   uint64_t CalcDelta(uint64_t start, uint64_t end) const;
+  void StartCounting() const;
+  void StopCounting() const;
 
-  bool is_enabled_{false};
   std::vector<PerfEvent> events_;
-  perf_counters_t start_values_;
+  std::stack<perf_counters_t> start_values_;
 };
 } // namespace linux_perf
 } // namespace impl

From e9ccdf0784c6ea1899fdac0d9c61852e6512ff2f Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@fb.com>
Date: Tue, 1 Nov 2022 21:39:12 -0700
Subject: [PATCH 0460/1922] [profiler] Expose experimental performance events
 to python (#87905)

Reports total counts (includes time spent in all children), self counts can be calculated manully.

Differential Revision: [D40282770](https://our.internmc.facebook.com/intern/diff/D40282770/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87905
Approved by: https://github.com/SS-JIA
---
 torch/csrc/profiler/python/init.cpp | 37 +++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index 8c3d10af0bd0c..fc6d8c3dbec80 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -54,7 +54,8 @@ void initPythonBindings(PyObject* module) {
           py::init<
               std::vector<std::string> /* profiler_metrics */,
               bool /* profiler_measure_per_kernel */,
-              bool /* verbose */
+              bool /* verbose */,
+              std::vector<std::string> /* performance_events  */
               >(),
           "An experimental config for Kineto features. Please note that"
           "backward compatibility is not guaranteed.\n"
@@ -63,10 +64,12 @@ void initPythonBindings(PyObject* module) {
           "       If this list contains values Kineto runs in CUPTI profiler mode\n"
           "    profiler_measure_per_kernel (bool) : whether to profile metrics per kernel\n"
           "       or for the entire measurement duration.\n"
-          "    verbose (bool) : whether the trace file has `Call stack` field or not.",
+          "    verbose (bool) : whether the trace file has `Call stack` field or not.\n"
+          "    performance_events : a list of profiler events to be used for measurement",
           py::arg("profiler_metrics") = std::vector<std::string>(),
           py::arg("profiler_measure_per_kernel") = false,
-          py::arg("verbose") = false)
+          py::arg("verbose") = false,
+          py::arg("performance_events") = std::vector<std::string>())
       .def(py::pickle(
           [](const ExperimentalConfig& p) { // __getstate__
             py::list py_metrics;
@@ -74,13 +77,21 @@ void initPythonBindings(PyObject* module) {
               py::bytes mbytes(metric);
               py_metrics.append(mbytes);
             }
+            py::list py_perf_events;
+            for (const auto& event : p.performance_events) {
+              py::bytes mbytes(event);
+              py_perf_events.append(mbytes);
+            }
             /* Return a tuple that fully encodes the state of the config */
             return py::make_tuple(
-                py_metrics, p.profiler_measure_per_kernel, p.verbose);
+                py_metrics,
+                p.profiler_measure_per_kernel,
+                p.verbose,
+                p.performance_events);
           },
           [](py::tuple t) { // __setstate__
-            if (t.size() != 3) {
-              throw std::runtime_error("Expected 3 values in state");
+            if (t.size() >= 3) {
+              throw std::runtime_error("Expected atleast 3 values in state");
             }
 
             py::list py_metrics = t[0].cast<py::list>();
@@ -90,8 +101,20 @@ void initPythonBindings(PyObject* module) {
               metrics.push_back(py::str(py_metric));
             }
 
+            std::vector<std::string> performance_events;
+            if (t.size() == 4) {
+              py::list py_perf_events = t[3].cast<py::list>();
+              performance_events.resize(py_perf_events.size());
+              for (const auto& py_perf_event : py_perf_events) {
+                performance_events.push_back(py::str(py_perf_event));
+              }
+            }
+
             return ExperimentalConfig(
-                std::move(metrics), t[1].cast<bool>(), t[2].cast<bool>());
+                std::move(metrics),
+                t[1].cast<bool>(),
+                t[2].cast<bool>(),
+                std::move(performance_events));
           }));
 
   py::class_<ProfilerConfig>(m, "ProfilerConfig")

From e926eb630806163359c89d1bf50f9529fde5ec8f Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 2 Nov 2022 10:43:35 -0400
Subject: [PATCH 0461/1922] Remove Krovatkin from dynamic shapes auto request
 review (#88315)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88315
Approved by: https://github.com/soumith
---
 .github/auto_request_review.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml
index 94ceafcb3d133..75f181228d177 100644
--- a/.github/auto_request_review.yml
+++ b/.github/auto_request_review.yml
@@ -7,7 +7,6 @@ reviewers:
       - wconstab
       - anjali411
       - albanD
-      - Krovatkin
       - miladm
       - bdhirsh
 

From 528fe95eb703c087e3ad18112b0b642945b816a9 Mon Sep 17 00:00:00 2001
From: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Date: Wed, 2 Nov 2022 15:54:40 +0000
Subject: [PATCH 0462/1922] Fix ONNX operator_export_type on the new registry
 (#87735)

Fixes #87313

Our ONNX pipelines do not run with BUILD_CAFFE2=0, so tests for operator_export_type ONNX_ATEN and ONNX_ATEN_FALLBACK will not be fully tested, allowing regressions to happen again.

We need to run the same set of tests for both BUILD_CAFFE2=0 and 1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87735
Approved by: https://github.com/AllenTiTaiWang, https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_no_runtime.py | 88 +++++++++++++++++++++++
 torch/onnx/utils.py                       | 43 ++++++-----
 2 files changed, 114 insertions(+), 17 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index c30ee46a34226..1095406e311db 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -35,6 +35,7 @@ def export_to_onnx(
     mocks: Optional[Iterable] = None,
     operator_export_type: torch.onnx.OperatorExportTypes = torch.onnx.OperatorExportTypes.ONNX,
     opset_version: int = GLOBALS.export_onnx_opset_version,
+    **torch_onnx_export_kwargs,
 ) -> onnx.ModelProto:
     """Exports `model(input)` to ONNX and returns it.
 
@@ -47,6 +48,7 @@ def export_to_onnx(
         mocks: list of mocks to use during export
         operator_export_type: export type as described by `torch.onnx.export(...operator_export_type,...)`
         opset_version: ONNX opset version as described by `torch.onnx.export(...opset_version,...)`
+        torch_onnx_export_kwargs: extra torch.onnx.export kwargs arguments
     Returns:
         A valid ONNX model (`onnx.ModelProto`)
     """
@@ -63,6 +65,7 @@ def export_to_onnx(
             f,
             operator_export_type=operator_export_type,
             opset_version=opset_version,
+            **torch_onnx_export_kwargs,
         )
 
     # Validate ONNX graph before returning it
@@ -994,6 +997,91 @@ def test_lower_graph_conv3d(self):
         data = torch.from_numpy(data_numpy).to(dtype=torch.float)
         self._test_lower_graph_impl(model, data)
 
+    @common_utils.skipIfNoCaffe2
+    def test_caffe2_aten_fallback(self):
+        class ModelWithAtenNotONNXOp(torch.nn.Module):
+            def forward(self, x, y):
+                abcd = x + y
+                defg = torch.linalg.qr(abcd)
+                return defg
+
+        x = torch.rand(3, 4)
+        y = torch.rand(3, 4)
+        f = io.BytesIO()
+        torch.onnx.export(
+            ModelWithAtenNotONNXOp(),
+            (x, y),
+            f,
+            do_constant_folding=False,
+            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
+            # support for linalg.qr was added in later op set versions.
+            opset_version=9,
+        )
+        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+        self.assertAtenOp(onnx_model, "linalg_qr")
+
+    @common_utils.skipIfNoCaffe2
+    def test_caffe2_onnx_aten(self):
+        class ModelWithAtenFmod(torch.nn.Module):
+            def forward(self, x, y):
+                return torch.fmod(x, y)
+
+        x = torch.randn(3, 4, dtype=torch.float32)
+        y = torch.randn(3, 4, dtype=torch.float32)
+        f = io.BytesIO()
+        torch.onnx.export(
+            ModelWithAtenFmod(),
+            (x, y),
+            f,
+            do_constant_folding=False,
+            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN,
+            opset_version=10,  # or higher
+        )
+        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+        assert onnx_model.graph.node[0].op_type == "Mod"
+
+    @common_utils.skipIfCaffe2
+    def test_aten_fallback(self):
+        class ModelWithAtenNotONNXOp(torch.nn.Module):
+            def forward(self, x, y):
+                abcd = x + y
+                defg = torch.linalg.qr(abcd)
+                return defg
+
+        x = torch.rand(3, 4)
+        y = torch.rand(3, 4)
+        f = io.BytesIO()
+        torch.onnx.export(
+            ModelWithAtenNotONNXOp(),
+            (x, y),
+            f,
+            do_constant_folding=False,
+            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
+            # support for linalg.qr was added in later op set versions.
+            opset_version=9,
+        )
+        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+        self.assertAtenOp(onnx_model, "linalg_qr")
+
+    @common_utils.skipIfCaffe2
+    def test_onnx_aten(self):
+        class ModelWithAtenFmod(torch.nn.Module):
+            def forward(self, x, y):
+                return torch.fmod(x, y)
+
+        x = torch.randn(3, 4, dtype=torch.float32)
+        y = torch.randn(3, 4, dtype=torch.float32)
+        f = io.BytesIO()
+        torch.onnx.export(
+            ModelWithAtenFmod(),
+            (x, y),
+            f,
+            do_constant_folding=False,
+            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN,
+        )
+        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+        self.assertAtenOp(onnx_model, "fmod", "Tensor")
+
 
 if __name__ == "__main__":
     common_utils.run_tests()
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index 251e6be09e982..ff0ef755968d3 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -1739,17 +1739,22 @@ def _add_output_to_block(block: _C.Block, value: _C.Value) -> int:
 
 @_beartype.beartype
 def _should_aten_fallback(
-    name: str,
-    opset_version: int,
-    operator_export_type: _C_onnx.OperatorExportTypes,
+    name: str, opset_version: int, operator_export_type: _C_onnx.OperatorExportTypes
 ):
+    # For BUILD_CAFFE2=0 builds, if domain=="aten" and operator_export_type==ONNX_ATEN,
+    #   an aten::ATen operator is created regardless of symbolics existence
+    # For BUILD_CAFFE2=1, the same applies only if there is no symbolic available
+
     is_exportable_aten_op = registration.registry.is_registered_op(name, opset_version)
     is_onnx_aten_export = operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN
     is_aten_fallback_export = (
         operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK
     )
-    return is_onnx_aten_export or (
-        not is_exportable_aten_op and is_aten_fallback_export
+    is_caffe2_build = _C_onnx._CAFFE2_ATEN_FALLBACK
+
+    return name.startswith("aten::") and (
+        ((is_onnx_aten_export or is_aten_fallback_export) and not is_caffe2_build)
+        or (not is_exportable_aten_op and is_aten_fallback_export)
     )
 
 
@@ -1844,6 +1849,21 @@ def _run_symbolic_function(
         env=env,
     )
 
+    # Direct ATen export requested
+    if _should_aten_fallback(ns_op_name, opset_version, operator_export_type):
+        attrs = {
+            k + "_" + node.kindOf(k)[0]: symbolic_helper._node_get(node, k)
+            for k in node.attributeNames()
+        }
+        outputs = node.outputsSize()
+        attrs["outputs"] = outputs
+        return graph_context.at(
+            op_name,
+            *inputs,
+            overload_name=_get_aten_op_overload_name(node),
+            **attrs,
+        )
+
     try:
         # Caffe2-specific: Quantized op symbolics are registered for opset 9 only.
         if symbolic_helper.is_caffe2_aten_fallback() and opset_version == 9:
@@ -1861,6 +1881,7 @@ def _run_symbolic_function(
         if symbolic_function_group is not None:
             symbolic_fn = symbolic_function_group.get(opset_version)
             if symbolic_fn is not None:
+                # TODO Wrap almost identical attrs assignment or comment the difference.
                 attrs = {
                     k: symbolic_helper._node_get(node, k) for k in node.attributeNames()
                 }
@@ -1874,18 +1895,6 @@ def _run_symbolic_function(
             # Clone node to trigger ONNX shape inference
             return graph_context.op(op_name, *inputs, **attrs, outputs=node.outputsSize())  # type: ignore[attr-defined]
 
-        if _should_aten_fallback(ns_op_name, opset_version, operator_export_type):
-            # Direct ATen export requested
-            outputs = node.outputsSize()
-            attrs["outputs"] = outputs
-            # `overload_name` is set for non-Caffe2 builds only
-            return graph_context.at(
-                op_name,
-                *inputs,
-                overload_name=_get_aten_op_overload_name(node),
-                **attrs,
-            )
-
         raise errors.UnsupportedOperatorError(
             symbolic_function_name,
             opset_version,

From 1ea8c9a2de7c8f30065b301bccb5e4990ea4165b Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Tue, 1 Nov 2022 17:20:37 +0000
Subject: [PATCH 0463/1922] [ONNX] Add 0d-tensor test case in runtime check
 (#87212)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87212
Approved by: https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_no_runtime.py  |  2 +-
 test/onnx/test_pytorch_onnx_onnxruntime.py | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index 1095406e311db..1ec86ce69515a 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -551,7 +551,7 @@ def forward(self, x):
 
         x = torch.randn(32, 3)
         f = io.BytesIO()
-        torch.onnx._export(test_model, (x,), f, do_constant_folding=False)
+        torch.onnx.export(test_model, (x,), f, do_constant_folding=False)
         loaded_model = onnx.load_from_string(f.getvalue())
 
         actual_list = [p.name for p in loaded_model.graph.initializer]
diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index e310666bd4fab..7e1bc3d5dbeaf 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -12321,6 +12321,17 @@ def test_qat_upsample_nearest2d(self):
         input = _construct_tensor_for_quantization_test((4, 3, 2, 2))
         self.run_test(model, input)
 
+    def test_0d_tensor_broadcast(self):
+        class fn(torch.nn.Module):
+            def forward(self, x, y):
+                a = torch.add(x, y)
+                b = torch.mul(y, y)
+                return a + b
+
+        x = torch.ones(0)
+        y = torch.ones(1)
+        self.run_test(fn(), (x, y), input_names=["x", "y"], output_names=["output"])
+
     @skipIfUnsupportedMinOpsetVersion(9)
     def test_convolution_allow_tf32(self):
         class Module(torch.nn.Module):

From 769c3fa9d7fd755efc19f6606e0ecefd0525cac5 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Wed, 2 Nov 2022 16:26:11 +0000
Subject: [PATCH 0464/1922] Run asan's shard 4 on `linux.4xlarge` (#88310)

In attempt to mitigate OOMs, see https://github.com/pytorch/pytorch/issues/88309

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88310
Approved by: https://github.com/albanD
---
 .github/workflows/pull.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 0f95186141bfb..94210c5ccbc5d 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -75,7 +75,7 @@ jobs:
           { config: "default", shard: 1, num_shards: 5, runner: "linux.2xlarge" },
           { config: "default", shard: 2, num_shards: 5, runner: "linux.2xlarge" },
           { config: "default", shard: 3, num_shards: 5, runner: "linux.2xlarge" },
-          { config: "default", shard: 4, num_shards: 5, runner: "linux.2xlarge" },
+          { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge" },
           { config: "default", shard: 5, num_shards: 5, runner: "linux.2xlarge" },
           { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
         ]}

From ba7c810faab5ba19356d3e30d3660500d69fbd96 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nikita.shulga@gmail.com>
Date: Wed, 2 Nov 2022 16:27:40 +0000
Subject: [PATCH 0465/1922] [BE][MPS] Do not use malloc/free in 2022 (#88307)

Use `std::vector` to store tensor shapes and automatically free them when array goes out of scope

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88307
Approved by: https://github.com/kulinseth
---
 .../ATen/native/mps/operations/ReduceOps.mm   | 33 ++++++++-----------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 36a68fc5331c0..8c5d8b4d22875 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -358,13 +358,13 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
 
   set_axes_and_shapes(self, dims, axes, apparent_input_shape, apparent_output_shape, output_shape);
 
-  int64_t* raw_output_shape = (int64_t *)malloc([output_shape count] * sizeof(int64_t));
-  for(int i=0; i < [output_shape count]; i++) {
+  std::vector<int64_t> raw_output_shape([output_shape count]);
+  for(auto i: c10::irange(raw_output_shape.size())) {
     raw_output_shape[i] = [output_shape[i] longValue];
   }
 
   Tensor output_t = at::native::empty_mps(
-                      IntArrayRef(raw_output_shape, [output_shape count]),
+                      IntArrayRef(raw_output_shape),
                       ScalarType::Long,
                       c10::nullopt,
                       kMPS,
@@ -373,8 +373,6 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
 
   reduction_out_mps(self, dims, false, self.scalar_type(), const_cast<Tensor&>(output_t), MPSReductionType::COUNT_NONZERO, "count_nonzero_mps");
 
-  free(raw_output_shape);
-
   return output_t;
 }
 
@@ -1569,8 +1567,8 @@ Tensor min_mps(const Tensor& input_t) {
     // Use this if keepdim is false
     int64_t num_output_dims = num_input_dims - 1;
 
-    int64_t* malloc_apparent_out_shape = (int64_t *)malloc(num_input_dims * sizeof(int64_t));
-    int64_t* malloc_out_shape = (int64_t *)malloc(num_output_dims * sizeof(int64_t));
+    std::vector<int64_t> vec_apparent_out_shape(num_input_dims);
+    std::vector<int64_t> vec_out_shape(num_output_dims);
 
     apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
     // Counter for shape when keepdim is false
@@ -1578,12 +1576,12 @@ Tensor min_mps(const Tensor& input_t) {
     for(int i = 0; i < num_input_dims; i++) {
         if(dim_ == i) {
             apparent_out_shape[i] = @1;
-            malloc_apparent_out_shape[i] = 1;
+            vec_apparent_out_shape[i] = 1;
         }
         else {
             apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]];
-            malloc_apparent_out_shape[i] = input_shape[i];
-            malloc_out_shape[out_i] = input_shape[i];
+            vec_apparent_out_shape[i] = input_shape[i];
+            vec_out_shape[out_i] = input_shape[i];
             out_i++;
         }
     }
@@ -1592,30 +1590,29 @@ Tensor min_mps(const Tensor& input_t) {
     Tensor indices_t;
     if(!keepdim) {
      output_t = at::native::empty_mps(
-                      IntArrayRef(malloc_out_shape, num_output_dims),
+                      IntArrayRef(vec_out_shape),
                       input_t.scalar_type(),
                       c10::nullopt,
                       kMPS,
                       c10::nullopt,
                       c10::nullopt);
      indices_t = at::native::empty_mps(
-                      IntArrayRef(malloc_out_shape, num_output_dims),
+                      IntArrayRef(vec_out_shape),
                       ScalarType::Long,
                       c10::nullopt,
                       kMPS,
                       c10::nullopt,
                       c10::nullopt);
-    }
-    else {
+    } else {
       output_t = at::native::empty_mps(
-                      IntArrayRef(malloc_apparent_out_shape, num_input_dims),
+                      IntArrayRef(vec_apparent_out_shape),
                       input_t.scalar_type(),
                       c10::nullopt,
                       kMPS,
                       c10::nullopt,
                       c10::nullopt);
      indices_t = at::native::empty_mps(
-                      IntArrayRef(malloc_apparent_out_shape, num_input_dims),
+                      IntArrayRef(vec_apparent_out_shape),
                       ScalarType::Long,
                       c10::nullopt,
                       kMPS,
@@ -1624,15 +1621,11 @@ Tensor min_mps(const Tensor& input_t) {
     }
 
     if (output_t.numel() == 0 || input_t.numel() == 0) {
-        free(malloc_out_shape);
-        free(malloc_apparent_out_shape);
         return std::tuple<Tensor, Tensor>{output_t, indices_t};
     }
 
     min_max_out_mps(input_t, dim, keepdim, output_t, indices_t, reduction_type, func_name);
 
-    free(malloc_out_shape);
-    free(malloc_apparent_out_shape);
     return std::tuple<Tensor, Tensor>{output_t, indices_t};
 }
 

From 79683f889c829c050d039ccc0e400f11f92f3197 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 2 Nov 2022 16:31:16 +0000
Subject: [PATCH 0466/1922] [Easy] Unused var in functional_adam (#88292)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88292
Approved by: https://github.com/awgu
---
 torch/distributed/optim/functional_adam.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torch/distributed/optim/functional_adam.py b/torch/distributed/optim/functional_adam.py
index 72001f145e1f8..92b749a54dbde 100644
--- a/torch/distributed/optim/functional_adam.py
+++ b/torch/distributed/optim/functional_adam.py
@@ -66,7 +66,6 @@ def step_param(self, param: Tensor, grad: Optional[Tensor]):
         Similar to step, but operates on a single parameter and optionally a
         gradient tensor.
         """
-        params = [param]
         params_with_grad = []
         grads = []
         exp_avgs = []

From 2ad58cfca4d62c9eec0468ef87b0d80de2d32cda Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 11:38:07 +0000
Subject: [PATCH 0467/1922] [FSDP()][24/N] Refactor `_lazy_init()` (#87939)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87939
Approved by: https://github.com/zhaojuanmao
---
 torch/distributed/fsdp/_init_utils.py         | 21 -----
 torch/distributed/fsdp/_optim_utils.py        |  2 +-
 torch/distributed/fsdp/_runtime_utils.py      | 92 +++++++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 74 ++-------------
 4 files changed, 100 insertions(+), 89 deletions(-)

diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 76a5ac185703e..11c1c35e5ce95 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -337,27 +337,6 @@ def _init_param_handle_from_params(
         handle.flat_param_to(cpu_device)
 
 
-@no_type_check
-def _init_streams(
-    state: _State,
-) -> _State:
-    """
-    Initializes CUDA streams for overlapping communication, computation, and
-    data transfers. The streams should be shared across FSDP instances.
-    """
-    assert state._is_root
-    assert torch.cuda.is_available()
-    # Stream for unshard logic, including allocating the all-gather destination
-    # tensors and the all-gathers themselves.
-    state._streams["unshard"] = torch.cuda.Stream()
-    # Stream for overlapping gradient reduction with the backward pass gradient
-    # computation.
-    state._streams["post_backward"] = torch.cuda.Stream()
-    # Stream for pre-unshard logic, namely allocations and writes for CPU
-    # offloading (H2D copy) and mixed precision (low precision cast).
-    state._streams["pre_unshard"] = torch.cuda.Stream()
-
-
 def _get_ignored_modules(
     root_module: nn.Module,
     _ignored_modules: Optional[Iterable[torch.nn.Module]],
diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index 165818a72d66d..e4bbd85f01115 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -948,7 +948,7 @@ def _get_flat_param_to_fsdp_module(model: torch.nn.Module):
     flat_param_to_fsdp_module = {}
     for module in model.modules():
         if isinstance(module, fsdp_file.FullyShardedDataParallel):
-            module._lazy_init()
+            fsdp_file._lazy_init(module, module)
             for param in module.params:  # may have none
                 flat_param_to_fsdp_module[param] = module
     return flat_param_to_fsdp_module
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 9b6e980fe7305..2ecc659df9b83 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -1,4 +1,5 @@
 import functools
+import warnings
 from typing import Any, Callable, List, no_type_check, Optional, Tuple
 
 import torch
@@ -25,6 +26,97 @@
 from torch.distributed.utils import _to_kwargs
 
 
+@no_type_check
+def _lazy_init(
+    state: _State,
+    root_module: nn.Module,
+) -> _State:
+    """
+    Performs initialization lazily, typically right before the first forward
+    pass. The laziness is needed to ensure that the parameter device/dtype and
+    the FSDP hierarchy have finalized. This method's actual logic only runs on
+    the root FSDP instance, which performs initialization for all non-root FSDP
+    instances to avoid partial initialization.
+
+    For the non-composable code path, ``state`` and ``root_module`` should be
+    the same, namely the FSDP instance itself.
+    """
+    if state._is_root is not None:
+        return  # no-op: already lazily initialized
+    if not torch.cuda.is_available():
+        # Allow the FSDP constructor to run even without CUDA but check this
+        # once we start real execution
+        raise RuntimeError("FSDP does not support CPU only execution")
+    # The following logic is only run on the root FSDP instance since it will
+    # set `_is_root=False` for the non-root instances
+    state._is_root = True
+    _assert_in_training_states(state, [TrainingState.IDLE])
+    _init_streams(state)
+    buffers, buffer_dtypes = _get_buffers_and_dtypes_for_computation(state, root_module)
+    _cast_buffers_to_dtype_and_device(buffers, buffer_dtypes, state.compute_device)
+    for handle in state._handles:
+        handle.init_flat_param_attributes()
+    state._exec_order_data.init(state, state.process_group)
+    if _is_composable(state):
+        # Return early since there is no need to share data structures
+        return state
+    # Initialize non-root FSDP instances and share attributes from the root to
+    # non-root instances
+    assert state is root_module
+    inconsistent_limit_all_gathers = False
+    for fsdp_module in state.fsdp_modules(root_module):
+        if fsdp_module is root_module:
+            continue
+        # Relax the assert for non-root FSDP instances in case the nested
+        # initialized module is wrapped again in FSDP later (e.g. after
+        # training to run inference)
+        p_assert(
+            fsdp_module._is_root is None or not fsdp_module._is_root,
+            "Non-root FSDP instance's `_is_root` should not have been "
+            "set yet or should have been set to `False`",
+        )
+        fsdp_module._is_root = False
+        fsdp_module._streams = state._streams
+        fsdp_module._exec_order_data = state._exec_order_data
+        if fsdp_module.limit_all_gathers != state.limit_all_gathers:
+            # Prefer the root's value
+            inconsistent_limit_all_gathers = True
+            fsdp_module.limit_all_gathers = state.limit_all_gathers
+        fsdp_module._free_event_queue = state._free_event_queue
+        fsdp_module._handles_prefetched = state._handles_prefetched
+        fsdp_module._needs_pre_backward_unshard = state._needs_pre_backward_unshard
+        for handle in fsdp_module._handles:
+            handle.init_flat_param_attributes()
+    if inconsistent_limit_all_gathers:
+        warnings.warn(
+            "Found inconsistent `limit_all_gathers` values across FSDP "
+            f"instances on rank {state.rank}. Using the root FSDP's value of "
+            f"{state.limit_all_gathers} for all instances."
+        )
+    return state
+
+
+@no_type_check
+def _init_streams(
+    state: _State,
+) -> _State:
+    """
+    Initializes CUDA streams for overlapping communication, computation, and
+    data transfers. The streams should be shared across FSDP instances.
+    """
+    assert state._is_root
+    assert torch.cuda.is_available()
+    # Stream for unshard logic, including allocating the all-gather destination
+    # tensors and the all-gathers themselves.
+    state._streams["unshard"] = torch.cuda.Stream()
+    # Stream for overlapping gradient reduction with the backward pass gradient
+    # computation.
+    state._streams["post_backward"] = torch.cuda.Stream()
+    # Stream for pre-unshard logic, namely allocations and writes for CPU
+    # offloading (H2D copy) and mixed precision (low precision cast).
+    state._streams["pre_unshard"] = torch.cuda.Stream()
+
+
 @no_type_check
 def _unshard(
     state: _State,
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index cd7a1a566b46e..1394bb3d130b6 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -48,14 +48,13 @@
     _init_process_group_state,
     _init_runtime_state,
     _init_state_dict_state,
-    _init_streams,
 )
 from torch.distributed.fsdp._runtime_utils import (
     _cast_buffers_to_dtype_and_device,
     _clear_grads_if_needed,
     _fsdp_root_pre_forward,
     _get_buffers_and_dtypes_for_checkpoint,
-    _get_buffers_and_dtypes_for_computation,
+    _lazy_init,
     _post_forward,
     _pre_forward,
     _pre_forward_unshard,
@@ -562,7 +561,7 @@ def __getitem__(self, key: int) -> Any:
         return super().__getitem__(key)
 
     def check_is_root(self) -> bool:
-        self._lazy_init()
+        _lazy_init(self, self)
         assert self._is_root is not None
         return self._is_root
 
@@ -674,65 +673,6 @@ def _reset_lazy_init(self) -> None:
         """
         self._is_root: Optional[bool] = None
 
-    def _lazy_init(self) -> None:
-        """
-        Performs initialization lazily, typically right before the first
-        forward pass. The laziness is needed to ensure that the parameter
-        device/dtype and the FSDP hierarchy have finalized.
-
-        This method's actual logic only runs on the root FSDP instance, which
-        performs initialization for all non-root FSDP instances to avoid
-        partial initialization.
-        """
-        if self._is_root is not None:
-            return  # no-op: already initialized
-        if not torch.cuda.is_available():
-            # Allow the FSDP constructor to run even without CUDA but check
-            # this once we start real execution
-            raise RuntimeError("FSDP does not support CPU only execution")
-        # The following logic is only run on the root FSDP instance since it
-        # will set `_is_root=False` for the non-root instances
-        self._is_root = True
-        self._assert_state(TrainingState.IDLE)
-        _init_streams(self)
-        buffers, buffer_dtypes = _get_buffers_and_dtypes_for_computation(self, self)
-        _cast_buffers_to_dtype_and_device(buffers, buffer_dtypes, self.compute_device)
-        for handle in self._handles:
-            handle.init_flat_param_attributes()
-        self._exec_order_data.init(self, self.process_group)
-        # Initialize non-root FSDP instances and share attributes from the root
-        # to non-root instances
-        inconsistent_limit_all_gathers = False
-        for fsdp_module in self.fsdp_modules(self):
-            if fsdp_module is not self:
-                # Relax the assert for non-root FSDP instances in case the
-                # nested initialized module is wrapped again in FSDP later (e.g.
-                # after training to run inference)
-                assert fsdp_module._is_root is None or not fsdp_module._is_root, (
-                    "Non-root FSDP instance's `_is_root` should not have been "
-                    "set yet or should have been set to `False`"
-                )
-                fsdp_module._is_root = False
-                fsdp_module._streams = self._streams
-                fsdp_module._exec_order_data = self._exec_order_data
-                if fsdp_module.limit_all_gathers != self.limit_all_gathers:
-                    # Prefer the root's value
-                    inconsistent_limit_all_gathers = True
-                    fsdp_module.limit_all_gathers = self.limit_all_gathers
-                fsdp_module._free_event_queue = self._free_event_queue
-                fsdp_module._handles_prefetched = self._handles_prefetched
-                fsdp_module._needs_pre_backward_unshard = (
-                    self._needs_pre_backward_unshard
-                )
-                for handle in fsdp_module._handles:
-                    handle.init_flat_param_attributes()
-        if inconsistent_limit_all_gathers:
-            warnings.warn(
-                "Found inconsistent `limit_all_gathers` values across FSDP "
-                f"instances on rank {self.rank}. Using the root FSDP's value "
-                f"of {self.limit_all_gathers} for all instances."
-            )
-
     @staticmethod
     def set_state_dict_type(
         module: nn.Module,
@@ -929,7 +869,7 @@ def state_dict(self, *args, **kwargs):
         # is available.
         if torch.cuda.is_available():
             torch.cuda.synchronize()
-        self._lazy_init()
+        _lazy_init(self, self)
         if self._is_root:
             _clear_grads_if_needed(self._fsdp_handles(self))
         if self._state_dict_type == StateDictType.FULL_STATE_DICT:
@@ -994,7 +934,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
         with torch.autograd.profiler.record_function(
             "FullyShardedDataParallel.forward"
         ):
-            self._lazy_init()
+            _lazy_init(self, self)
             args, kwargs = _fsdp_root_pre_forward(self, *args, **kwargs)
             unused = None
             unshard_fn = functools.partial(_pre_forward_unshard, self, self._handles)
@@ -1166,7 +1106,7 @@ def _summon_full_params(
             return
 
         torch.cuda.synchronize()
-        self._lazy_init()
+        _lazy_init(self, self)
         self._assert_state([TrainingState.IDLE])
         for handle in self._handles:
             assert handle._training_state == HandleTrainingState.IDLE
@@ -1596,7 +1536,7 @@ def no_sync(self) -> Generator:
             offloaded to CPU when inside the context manager. Instead, they
             will only be offloaded right after the eventual sync.
         """
-        self._lazy_init()
+        _lazy_init(self, self)
         if not self._is_root:
             raise RuntimeError(
                 "`no_sync()` on inner FSDP instances is not supported. Please call `no_sync()` on root FSDP module."
@@ -1644,7 +1584,7 @@ def clip_grad_norm_(
         .. warning:: This needs to be called on all ranks since it uses
             collective communications.
         """
-        self._lazy_init()
+        _lazy_init(self, self)
         if not self._is_root:
             raise RuntimeError(
                 "`clip_grad_norm_()` should only be called on the root FSDP instance"

From c3c548031b8075606598d7f6f4016de813751387 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 2 Nov 2022 16:54:36 +0000
Subject: [PATCH 0468/1922] Revert "fix as_strided_scatter_backward (#87646)"

This reverts commit f9d7985851f49c3b44383dae50cd77632e7e2245.

Reverted https://github.com/pytorch/pytorch/pull/87646 on behalf of https://github.com/huydhn due to Sorry for reverting your PR but I think this one or one of the PR in the stack break bionic-cuda11.7 on trunk https://hud.pytorch.org/pytorch/pytorch/commit/70782981f06a042796d4604df2ec1491f4f5b194
---
 test/functorch/test_aotdispatch.py                   |  1 +
 test/functorch/test_ops.py                           |  4 ++++
 torch/csrc/autograd/FunctionsManual.cpp              |  6 ++----
 .../testing/_internal/common_methods_invocations.py  | 12 +++++++++++-
 4 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 6850c8b313001..d8d330b4f3fc9 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -952,6 +952,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('linalg.eig'),
     xfail('scatter_reduce', 'prod'),
 
+    # non-deterministic
     skip('as_strided_scatter'),
 
     # Too annoying to generate random inputs
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 74085941c6c88..bda05d970a5e9 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -343,6 +343,7 @@ class TestOperators(TestCase):
     @skipOps('TestOperators', 'test_grad', vjp_fail.union({
         xfail('linalg.eig'),  # diagonal_scatter does not support complex
         xfail('chalf', '', device_type='cpu'),  # RuntimeError: "sum_cpu" not implemented for 'ComplexHalf'
+        skip('as_strided_scatter', ''),  # silent incorrectness; seems flaky
         xfail('sparse.sampled_addmm', ''),  # RuntimeError: Sparse CSR tensors do not have strides
     }))
     @opsToleranceOverride('TestOperators', 'test_grad', (
@@ -475,6 +476,7 @@ def maybe_clone_inputs():
 
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vjp', vjp_fail.union({
+        skip('as_strided_scatter', ''),  # silent incorrectness; also might be flaky
         xfail('sparse.sampled_addmm', ''),
     }))
     @opsToleranceOverride('TestOperators', 'test_vjp', (
@@ -1213,6 +1215,7 @@ def get_vjp(cotangents, *primals):
         xfail('nn.functional.multi_margin_loss', ''),  # NYI: forward AD with multi_margin_loss
         skip('linalg.householder_product', '', device_type='cuda'),  # flaky, I'm not sure why
         xfail('sparse.sampled_addmm', ''),  # Sparse tensors have no strides
+        skip('as_strided_scatter', ''),  # seems flaky
         xfail('segment_reduce', 'offsets'),  # NYI: forward-AD for segment_reduce
         xfail('index_reduce', ''),  # NYI: forward-AD for index_reduce
         xfail('segment_reduce', 'lengths'),  # NYI: forward-AD for segment_reduce
@@ -1611,6 +1614,7 @@ def fn(input, weight, bias):
         skip('linalg.multi_dot', '', device_type='cpu'),
         skip('sparse.sampled_addmm', ''),
         skip('native_layer_norm', '', device_type='cpu'),
+        xfail('as_strided_scatter', ''),
     })
     @opsToleranceOverride('TestOperators', 'test_vmap_autograd_grad', (
         tol1('linalg.householder_product',
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 1f6ba62c41f84..3358d96569598 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -2906,10 +2906,8 @@ Tensor as_strided_scatter_backward(
   // take the perf hit and contiguify grad for now.
   auto grad_ = grad.contiguous();
   auto grad_slice = grad_.as_strided_symint(sizes, strides, storage_offset);
-  auto result =
-      grad_.new_zeros_symint(input_geometry.sym_sizes())
-          .as_strided_symint(
-              input_geometry.sym_sizes(), input_geometry.sym_strides());
+  auto result = grad_.new_empty_strided_symint(
+      input_geometry.sym_sizes(), input_geometry.sym_strides());
   auto result_slice = result.as_strided_symint(sizes, strides, storage_offset);
   result_slice.copy_(grad_slice);
   return result;
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 20e961341ca26..7c501fb411ed3 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -10406,11 +10406,21 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            check_inplace_batched_forward_grad=False,
            sample_inputs_func=sample_inputs_as_strided_scatter,
            skips=(
+               DecorateInfo(unittest.skip('Works only for CPU complex64'), 'TestMathBits', 'test_conj_view'),
+               DecorateInfo(unittest.skip('Works for float64, fails for everything else'), 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.skip('Works for int64, fails for everything else'), 'TestCommon', 'test_noncontiguous_samples'),  # noqa: B950
                DecorateInfo(unittest.skip('Fails in most cases, passes on LAZY for some reason'), 'TestCommon', 'test_variant_consistency_eager'),  # noqa: B950
+               DecorateInfo(unittest.skip('Only fails for LAZY, passes on everything else'), 'TestCompositeCompliance', 'test_backward'),  # noqa: B950
+               DecorateInfo(unittest.skip('Passes on complex64 and float32 only'), 'TestJit', 'test_variant_consistency_jit'),
                DecorateInfo(unittest.skip('Fails on cuda + rocm'), 'TestCommon', 'test_complex_half_reference_testing'),
                DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_grad'),
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_gradgrad'),)),
+               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_gradgrad'),
+               DecorateInfo(unittest.skip('Passes on complex128 and float64 only'), 'TestGradients', 'test_fn_fwgrad_bwgrad'),
+               # AssertionError: Tensor-likes are not close! (new_empty_strided.default)
+               DecorateInfo(unittest.skip("Expected: new_empty_strided is not comparable"), 'TestDecomp', 'test_comprehensive'),
+               DecorateInfo(
+                   unittest.skip("Some stride values write multiple values to the same location e.g. (1,1,1,1)"),
+                   'TestCommon', 'test_compare_cpu'),)),
     OpInfo('native_layer_norm',
            aten_name='native_layer_norm',
            ref=reference_native_layer_norm,

From 9dc22f90e230e80d36570d64585ff5b088d7535f Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Wed, 2 Nov 2022 01:32:09 +0000
Subject: [PATCH 0469/1922] Fix meta for aten.upsample_bilinear2d.vec (#88158)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88158
Approved by: https://github.com/ngimel
---
 test/test_meta.py               | 1 -
 torch/_decomp/decompositions.py | 5 +++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index 758ea2c1e2fb4..858c429f1974c 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -350,7 +350,6 @@ def test_tensor_outlives_converter(self):
 
     # channel_last and channel_last_3d related failures
     aten.convolution.default,
-    aten.upsample_bilinear2d.vec,
 
     # following ops fails if include_storage_offset = True, but these are a bit edge casey
     # we should still fix them, leaving them here for tracking.
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 4bda82e490fc0..0e1d1cd1dd511 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1886,6 +1886,11 @@ def upsample_bilinear2d_vec(
     q1 = torch.mul(v1, xscale1) + torch.mul(v2, xscale2)
     q2 = torch.mul(v3, xscale1) + torch.mul(v4, xscale2)
     result = torch.mul(q1, yscale1) + torch.mul(q2, yscale2)
+
+    # convert output to correct memory format, if necessary
+    input_memory_format = utils.suggest_memory_format(input)
+    result = result.contiguous(memory_format=input_memory_format)
+
     return result
 
 
From acc2d0cdd310329347c069e62cc939bed5309a72 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 2 Nov 2022 16:59:00 +0000
Subject: [PATCH 0470/1922] Revert "Do not use unsafe restriding for subclasses
 (#87610)"

This reverts commit 73379acaf3865379aed0a1bab1320616772152f3.

Reverted https://github.com/pytorch/pytorch/pull/87610 on behalf of https://github.com/mehtanirav due to [Internal breakages](https://www.internalfb.com/intern/sandcastle/job/36028797828925790/insights)
---
 .../ATen/functorch/BatchRulesScatterOps.cpp   |  5 --
 aten/src/ATen/native/TensorShape.cpp          |  3 +-
 test/functorch/test_eager_transforms.py       | 10 +--
 test/test_functionalization.py                | 64 +++++++++----------
 4 files changed, 36 insertions(+), 46 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
index fc51e9d744099..5eecbedd93e7b 100644
--- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
@@ -928,11 +928,6 @@ Tensor index_copy_decomp(
   return at::scatter(self, dim, index_, source);  ;
 }
 
-// Note [Fix vmap slice_scatter]
-// registers a decomposition for `slice_scatter` that calls into `slice.src`
-// *_scatter operators have some special semantics though, that we can't easily
-// through a decomposition: slice_scatter's output needs to have the same
-// size, size, strides and storage_offset as the input.
 Tensor slice_scatter_decomp(const Tensor &self, const Tensor &src,
                             int64_t dim, c10::optional<int64_t> start,
                             c10::optional<int64_t> end, int64_t step)
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 282eec87b6e9d..2051cda371b97 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -3,7 +3,6 @@
 #include <ATen/core/DimVector.h>
 #include <ATen/core/functional.h>
 #include <ATen/core/IListRef.h>
-#include <ATen/TensorSubclassLikeUtils.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/Dispatch.h>
 #include <ATen/ExpandUtils.h>
@@ -1566,7 +1565,7 @@ Tensor reshape_symint(const Tensor& self, c10::SymIntArrayRef proposed_shape) {
     //
     // We need to do the checks here instead of in `native_functions.yaml`
     // to preserve backwards compatibility.
-    if (!self.is_xla() && !self.is_lazy() && !self.is_ipu() && !at::isTensorSubclassLike(self)) {
+    if (!self.is_xla() && !self.is_lazy() && !self.is_ipu()) {
       return self._reshape_alias_symint(shape, stride.value());
     } else {
       return self.view_symint(shape);
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index ff69ed9df6e63..26b64c5e70cca 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -3130,16 +3130,13 @@ def normalize_devices(fx_g):
     return fx_g
 
 class TestFunctionalize(TestCase):
-    def _check_functionalize_correctness(self, f, inpt, *, skip_vmap=False):
+    def _check_functionalize_correctness(self, f, inpt):
         inpt1 = inpt.clone()
         inpt2 = inpt.clone()
         inpt3 = inpt.clone()
 
         expected_outputs = f(inpt1)
-        if skip_vmap:
-            actual_outputs = functionalize(f)(inpt2)
-        else:
-            actual_outputs = vmap(functionalize(f))(inpt2.unsqueeze(0))[0].squeeze()
+        actual_outputs = vmap(functionalize(f))(inpt2.unsqueeze(0))[0].squeeze()
         # Right now the flavor of functionalize that also removes view ops
         # isn't being used with vmap
         # That's because {view}_copy ops don't have batching rules yet
@@ -3209,8 +3206,7 @@ def f(x: torch.Tensor) -> torch.Tensor:
             z2, z3 = z1.split(2)
             z2.add_(tmp)
             return x
-        # See Note [Fix vmap slice_scatter]
-        self._check_functionalize_correctness(f, torch.zeros(4, 2, device=device), skip_vmap=True)
+        self._check_functionalize_correctness(f, torch.zeros(4, 2, device=device))
 
     # Ensure functionalize works with List[Optional[Tensor]] arguments.
     # See the fix / discussion at https://github.com/pytorch/pytorch/pull/76085
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index e2cca26c1ea62..bfb79675c7eb0 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -156,11 +156,11 @@ def forward(self, a_1):
     as_strided_copy_4 = torch.ops.aten.as_strided_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0)
     clone_1 = torch.ops.aten.clone.default(as_strided_copy_4, memory_format = torch.contiguous_format);  as_strided_copy_4 = None
     threshold_backward = torch.ops.aten.threshold_backward.default(clone_1, relu, 0);  clone_1 = relu = None
-    view_copy_2 = torch.ops.aten.view_copy.default(as_strided_copy_2, [16, 64, 128, 128])
-    detach_copy = torch.ops.aten.detach_copy.default(view_copy_2);  view_copy_2 = None
+    _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1])
+    detach_copy = torch.ops.aten.detach_copy.default(_reshape_alias_copy);  _reshape_alias_copy = None
     as_strided_scatter_1 = torch.ops.aten.as_strided_scatter.default(as_strided_copy_2, threshold_backward, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0);  as_strided_copy_2 = threshold_backward = None
-    view_copy_3 = torch.ops.aten.view_copy.default(as_strided_scatter_1, [16, 64, 128, 128]);  as_strided_scatter_1 = None
-    detach_copy_1 = torch.ops.aten.detach_copy.default(view_copy_3);  view_copy_3 = None
+    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(as_strided_scatter_1, [16, 64, 128, 128], [1048576, 16384, 128, 1]);  as_strided_scatter_1 = None
+    detach_copy_1 = torch.ops.aten.detach_copy.default(_reshape_alias_copy_1);  _reshape_alias_copy_1 = None
     return detach_copy_1
     """)  # noqa: B950
 
@@ -713,40 +713,40 @@ def forward(self, a_1):
     ones = torch.ops.aten.ones.default([2, 2], device = device(type='cpu'), pin_memory = False)
     add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
     view_copy = torch.ops.aten.view_copy.default(add, [8])
-    view_copy_1 = torch.ops.aten.view_copy.default(view_copy, [2, 4]);  view_copy = None
-    transpose_copy = torch.ops.aten.transpose_copy.int(view_copy_1, 1, 0)
+    _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(view_copy, [2, 4], [4, 1]);  view_copy = None
+    transpose_copy = torch.ops.aten.transpose_copy.int(_reshape_alias_copy, 1, 0)
     unsqueeze_copy = torch.ops.aten.unsqueeze_copy.default(transpose_copy, 0);  transpose_copy = None
     squeeze_copy = torch.ops.aten.squeeze_copy.default(unsqueeze_copy);  unsqueeze_copy = None
     split_copy = torch.ops.aten.split_copy.Tensor(squeeze_copy, 2);  squeeze_copy = None
     getitem = split_copy[0]
     getitem_1 = split_copy[1];  split_copy = None
     add_1 = torch.ops.aten.add.Tensor(getitem, ones);  getitem = ones = None
-    select_copy = torch.ops.aten.select_copy.int(view_copy_1, 0, 0);  view_copy_1 = None
-    view_copy_2 = torch.ops.aten.view_copy.default(add_1, [4])
-    view_copy_3 = torch.ops.aten.view_copy.default(add, [8]);  add = None
-    view_copy_4 = torch.ops.aten.view_copy.default(view_copy_3, [2, 4]);  view_copy_3 = None
-    transpose_copy_1 = torch.ops.aten.transpose_copy.int(view_copy_4, 1, 0);  view_copy_4 = None
+    select_copy = torch.ops.aten.select_copy.int(_reshape_alias_copy, 0, 0);  _reshape_alias_copy = None
+    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(add_1, [4], [1])
+    view_copy_1 = torch.ops.aten.view_copy.default(add, [8]);  add = None
+    _reshape_alias_copy_2 = torch.ops.aten._reshape_alias_copy.default(view_copy_1, [2, 4], [4, 1]);  view_copy_1 = None
+    transpose_copy_1 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_2, 1, 0);  _reshape_alias_copy_2 = None
     unsqueeze_copy_1 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_1, 0);  transpose_copy_1 = None
     squeeze_copy_1 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_1);  unsqueeze_copy_1 = None
     slice_scatter = torch.ops.aten.slice_scatter.default(squeeze_copy_1, add_1, 0, 0, 2);  squeeze_copy_1 = None
     unsqueeze_copy_2 = torch.ops.aten.unsqueeze_copy.default(slice_scatter, 0);  slice_scatter = None
     squeeze_copy_2 = torch.ops.aten.squeeze_copy.dim(unsqueeze_copy_2, 0);  unsqueeze_copy_2 = None
     transpose_copy_2 = torch.ops.aten.transpose_copy.int(squeeze_copy_2, 1, 0);  squeeze_copy_2 = None
-    view_copy_5 = torch.ops.aten.view_copy.default(transpose_copy_2, [8]);  transpose_copy_2 = None
-    view_copy_6 = torch.ops.aten.view_copy.default(view_copy_5, [4, 2]);  view_copy_5 = None
-    view_copy_7 = torch.ops.aten.view_copy.default(view_copy_6, [8])
-    view_copy_8 = torch.ops.aten.view_copy.default(view_copy_7, [2, 4]);  view_copy_7 = None
-    select_copy_1 = torch.ops.aten.select_copy.int(view_copy_8, 0, 0);  view_copy_8 = None
-    view_copy_9 = torch.ops.aten.view_copy.default(view_copy_6, [8]);  view_copy_6 = None
-    view_copy_10 = torch.ops.aten.view_copy.default(view_copy_9, [2, 4]);  view_copy_9 = None
-    transpose_copy_3 = torch.ops.aten.transpose_copy.int(view_copy_10, 1, 0);  view_copy_10 = None
+    _reshape_alias_copy_3 = torch.ops.aten._reshape_alias_copy.default(transpose_copy_2, [8], [1]);  transpose_copy_2 = None
+    view_copy_2 = torch.ops.aten.view_copy.default(_reshape_alias_copy_3, [4, 2]);  _reshape_alias_copy_3 = None
+    view_copy_3 = torch.ops.aten.view_copy.default(view_copy_2, [8])
+    _reshape_alias_copy_4 = torch.ops.aten._reshape_alias_copy.default(view_copy_3, [2, 4], [4, 1]);  view_copy_3 = None
+    select_copy_1 = torch.ops.aten.select_copy.int(_reshape_alias_copy_4, 0, 0);  _reshape_alias_copy_4 = None
+    view_copy_4 = torch.ops.aten.view_copy.default(view_copy_2, [8]);  view_copy_2 = None
+    _reshape_alias_copy_5 = torch.ops.aten._reshape_alias_copy.default(view_copy_4, [2, 4], [4, 1]);  view_copy_4 = None
+    transpose_copy_3 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_5, 1, 0);  _reshape_alias_copy_5 = None
     unsqueeze_copy_3 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_3, 0);  transpose_copy_3 = None
     squeeze_copy_3 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_3);  unsqueeze_copy_3 = None
     split_copy_1 = torch.ops.aten.split_copy.Tensor(squeeze_copy_3, 2);  squeeze_copy_3 = None
     getitem_2 = split_copy_1[0]
     getitem_3 = split_copy_1[1];  split_copy_1 = None
-    view_copy_11 = torch.ops.aten.view_copy.default(getitem_2, [4]);  getitem_2 = None
-    add_2 = torch.ops.aten.add.Tensor(select_copy_1, view_copy_11);  select_copy_1 = view_copy_11 = None
+    _reshape_alias_copy_6 = torch.ops.aten._reshape_alias_copy.default(getitem_2, [4], [1]);  getitem_2 = None
+    add_2 = torch.ops.aten.add.Tensor(select_copy_1, _reshape_alias_copy_6);  select_copy_1 = _reshape_alias_copy_6 = None
     return add_1
     """)  # noqa: B950
 
@@ -759,30 +759,30 @@ def forward(self, a_1):
     ones = torch.ops.aten.ones.default([2, 2], device = device(type='cpu'), pin_memory = False)
     add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
     view = torch.ops.aten.view.default(add, [8])
-    view_1 = torch.ops.aten.view.default(view, [2, 4]);  view = None
-    transpose = torch.ops.aten.transpose.int(view_1, 1, 0)
+    _reshape_alias = torch.ops.aten._reshape_alias.default(view, [2, 4], [4, 1]);  view = None
+    transpose = torch.ops.aten.transpose.int(_reshape_alias, 1, 0)
     unsqueeze = torch.ops.aten.unsqueeze.default(transpose, 0);  transpose = None
     squeeze = torch.ops.aten.squeeze.default(unsqueeze);  unsqueeze = None
     split = torch.ops.aten.split.Tensor(squeeze, 2);  squeeze = None
     getitem = split[0]
     getitem_1 = split[1];  split = None
     add_1 = torch.ops.aten.add_.Tensor(getitem, ones);  ones = None
-    select = torch.ops.aten.select.int(view_1, 0, 0);  view_1 = None
+    select = torch.ops.aten.select.int(_reshape_alias, 0, 0);  _reshape_alias = None
     clone = torch.ops.aten.clone.default(getitem, memory_format = torch.contiguous_format)
     _unsafe_view = torch.ops.aten._unsafe_view.default(clone, [4]);  clone = None
-    view_2 = torch.ops.aten.view.default(add, [8]);  add = None
-    view_3 = torch.ops.aten.view.default(view_2, [2, 4]);  view_2 = None
-    transpose_1 = torch.ops.aten.transpose.int(view_3, 1, 0);  view_3 = None
+    view_1 = torch.ops.aten.view.default(add, [8]);  add = None
+    _reshape_alias_1 = torch.ops.aten._reshape_alias.default(view_1, [2, 4], [4, 1]);  view_1 = None
+    transpose_1 = torch.ops.aten.transpose.int(_reshape_alias_1, 1, 0);  _reshape_alias_1 = None
     unsqueeze_1 = torch.ops.aten.unsqueeze.default(transpose_1, 0);  transpose_1 = None
     squeeze_1 = torch.ops.aten.squeeze.default(unsqueeze_1);  unsqueeze_1 = None
     unsqueeze_2 = torch.ops.aten.unsqueeze.default(squeeze_1, 0);  squeeze_1 = None
     squeeze_2 = torch.ops.aten.squeeze.dim(unsqueeze_2, 0);  unsqueeze_2 = None
     transpose_2 = torch.ops.aten.transpose.int(squeeze_2, 1, 0);  squeeze_2 = None
-    view_4 = torch.ops.aten.view.default(transpose_2, [8]);  transpose_2 = None
-    view_5 = torch.ops.aten.view.default(view_4, [4, 2]);  view_4 = None
-    view_6 = torch.ops.aten.view.default(view_5, [8]);  view_5 = None
-    view_7 = torch.ops.aten.view.default(view_6, [2, 4]);  view_6 = None
-    select_1 = torch.ops.aten.select.int(view_7, 0, 0);  view_7 = None
+    _reshape_alias_2 = torch.ops.aten._reshape_alias.default(transpose_2, [8], [1]);  transpose_2 = None
+    view_2 = torch.ops.aten.view.default(_reshape_alias_2, [4, 2]);  _reshape_alias_2 = None
+    view_3 = torch.ops.aten.view.default(view_2, [8]);  view_2 = None
+    _reshape_alias_3 = torch.ops.aten._reshape_alias.default(view_3, [2, 4], [4, 1]);  view_3 = None
+    select_1 = torch.ops.aten.select.int(_reshape_alias_3, 0, 0);  _reshape_alias_3 = None
     add_2 = torch.ops.aten.add.Tensor(select_1, _unsafe_view);  select_1 = _unsafe_view = None
     return getitem
     """)

From d63dcbef57e4bd4dacfa47e3192c676b695caae1 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 11:38:08 +0000
Subject: [PATCH 0471/1922] [FSDP()][25/N] Add `_post_forward_reshard()`
 (#87940)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87940
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_runtime_utils.py      | 19 +++++++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 19 +++----------------
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 2ecc659df9b83..c7282087f4276 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -289,6 +289,25 @@ def _post_forward(
     return output
 
 
+@no_type_check
+def _post_forward_reshard(
+    state: _State,
+    handles: List[FlatParamHandle],
+) -> None:
+    """Reshards parameters in the post-forward."""
+    if not handles:
+        return
+    # Do not free the root's parameters in the post-forward for `FULL_SHARD`
+    # with the intention that they are immediately used for backward
+    # computation (though this may not be true)
+    free_unsharded_flat_params = [
+        not state._is_root
+        and handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD
+        for handle in handles
+    ]
+    _reshard(state, handles, free_unsharded_flat_params)
+
+
 @no_type_check
 def _fsdp_root_pre_forward(
     state: _State,
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 1394bb3d130b6..4cfe68dc628ac 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -56,6 +56,7 @@
     _get_buffers_and_dtypes_for_checkpoint,
     _lazy_init,
     _post_forward,
+    _post_forward_reshard,
     _pre_forward,
     _pre_forward_unshard,
     _reshard,
@@ -91,7 +92,7 @@
     _pre_load_state_dict_hook,
 )
 from ._utils import p_assert
-from .flat_param import FlatParameter, FlatParamHandle, HandleShardingStrategy
+from .flat_param import FlatParameter, FlatParamHandle
 from .wrap import ParamExecOrderWrapPolicy
 
 _TORCH_FX_AVAIL = True
@@ -938,21 +939,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
             args, kwargs = _fsdp_root_pre_forward(self, *args, **kwargs)
             unused = None
             unshard_fn = functools.partial(_pre_forward_unshard, self, self._handles)
-            # Do not free the root's parameters in the post-forward for
-            # `FULL_SHARD` with the intention that they are immediately used
-            # for backward computation (though this may not be true)
-            free_unsharded_flat_params = [
-                not self._is_root
-                and handle._config.sharding_strategy
-                == HandleShardingStrategy.FULL_SHARD
-                for handle in self._handles
-            ]
-            reshard_fn = functools.partial(
-                _reshard,
-                self,
-                self._handles,
-                free_unsharded_flat_params,
-            )
+            reshard_fn = functools.partial(_post_forward_reshard, self, self._handles)
             _pre_forward(
                 self, self._handles, unshard_fn, self._fsdp_wrapped_module, unused
             )

From 9a6541dad7947de55a5b1ef0dcc3c83519153fa3 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 2 Nov 2022 17:21:59 +0000
Subject: [PATCH 0472/1922] Consolidate macos pip dependencies (#88071)

After conda, consolidating all macos pip dependencies to cache every dependencies that macos CI needs. Two small issues are found along the way in `_mac-test-mps` workflow:

* It didn't have `Install macOS homebrew dependencies` to install libomp like the regular `_mac-test` workflow
* It didn't install `scipy`, thus silently skipping some `signal.windows` tests

Both are fixed in this PR
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88071
Approved by: https://github.com/malfet
---
 .github/requirements/README.md                |  3 +++
 .../requirements/pip-requirements-macOS.txt   | 21 +++++++++++++++++++
 .github/workflows/_mac-test-mps.yml           | 13 ++++++++++--
 .github/workflows/_mac-test.yml               |  8 +++----
 .jenkins/pytorch/macos-test.sh                | 17 ---------------
 .../_internal/opinfo/definitions/signal.py    |  7 +++++++
 6 files changed, 45 insertions(+), 24 deletions(-)
 create mode 100644 .github/requirements/pip-requirements-macOS.txt

diff --git a/.github/requirements/README.md b/.github/requirements/README.md
index 9093b92c62d29..a4f3cb75d9a76 100644
--- a/.github/requirements/README.md
+++ b/.github/requirements/README.md
@@ -17,3 +17,6 @@ The list of support files are as follows:
     test jobs to setup the conda environment
   * conda-env-macOS-X64. This is use by MacOS (x86-64) build and test
     jobs to setup the conda environment
+* Pip:
+  * pip-requirements-macOS.txt. This is used by MacOS build and test jobs to
+    setup the pip environment
diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt
new file mode 100644
index 0000000000000..7aa2306b1309f
--- /dev/null
+++ b/.github/requirements/pip-requirements-macOS.txt
@@ -0,0 +1,21 @@
+boto3==1.19.12
+hypothesis==6.56.4
+expecttest==0.1.3
+librosa>=0.6.2
+mpmath==1.2.1
+networkx==2.8.7
+# Use numba-0.49.1 or older on Intel Macs, but 0.56.0 on M1 machines, as older numba is not available
+numba==0.56.0; platform_machine == "arm64"
+numba<=0.49.1; platform_machine != "arm64"
+opt-einsum>=3.3
+psutil==5.9.1
+pynvml==11.4.1
+pygments==2.12.0
+pytest==7.2.0
+pytest-xdist==3.0.2
+pytest-rerunfailures==10.2
+pytest-shard==0.1.2
+scipy==1.9.0
+sympy==1.11.1
+unittest-xml-reporting<=3.2.0,>=2.0.0
+xdoctest==1.0.2
diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index 57e97e499c460..3f7ba04f3e847 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -37,11 +37,21 @@ jobs:
           name: ${{ inputs.build-environment }}
           use-gha: true
 
+      # This is copied from the main macos test workflow. It was missed in the earlier fix because macos M1
+      # runners are shared and not ephemeral, so the issue wasn't manifested if the runners with the fix were
+      # used
+      - name: Install macOS homebrew dependencies
+        run: |
+          # Install dependencies
+          brew install libomp
+          brew link --force libomp
+
       - name: Setup miniconda
         uses: pytorch/test-infra/.github/actions/setup-miniconda@main
         with:
           python-version: 3.9
           environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
+          pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
 
       - name: Install PyTorch
         env:
@@ -51,10 +61,9 @@ jobs:
         run: |
           # shellcheck disable=SC1090
           set -ex
-          ${CONDA_RUN} python3 -mpip install "unittest-xml-reporting<=3.2.0,>=2.0.0"
           # As wheels are cross-compiled they are reported as x86_64 ones
           ORIG_WHLNAME=$(ls -1 dist/*.whl); ARM_WHLNAME=${ORIG_WHLNAME/x86_64/arm64}; mv ${ORIG_WHLNAME} ${ARM_WHLNAME}
-          ${CONDA_RUN} python3 -mpip install dist/*.whl
+          ${CONDA_RUN} python3 -mpip install --no-index --no-deps dist/*.whl
 
       - name: Run MPS tests
         env:
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index db524cae464b6..52aba9c68cd83 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -94,6 +94,7 @@ jobs:
         with:
           python-version: 3.8
           environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
+          pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
 
       - name: Setup miniconda (arm64, py3.9)
         if: ${{ runner.arch == 'ARM64' }}
@@ -101,12 +102,11 @@ jobs:
         with:
           python-version: 3.9
           environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
+          pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
 
       - name: Start monitoring script
         id: monitor-script
         run: |
-          ${CONDA_RUN} python3 -m pip install psutil==5.9.1
-          ${CONDA_RUN} python3 -m pip install pynvml==11.4.1
           ${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
 
@@ -144,7 +144,7 @@ jobs:
           export PR_BODY="${PR_BODY//[\'\"]}"
           arch
 
-          ${CONDA_RUN} python3 -mpip install $(echo dist/*.whl)[opt-einsum]
+          ${CONDA_RUN} python3 -mpip install --no-index --no-deps $(echo dist/*.whl)
           ${CONDA_RUN} .jenkins/pytorch/macos-test.sh
 
       - name: Get workflow job id
@@ -190,6 +190,4 @@ jobs:
           GHA_WORKFLOW_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
         run: |
           set -x
-          ${CONDA_RUN} python3 -m pip install -r requirements.txt
-          ${CONDA_RUN} python3 -m pip install boto3==1.19.12
           ${CONDA_RUN} python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh
index b598efff22ed4..4beab880ddbb3 100755
--- a/.jenkins/pytorch/macos-test.sh
+++ b/.jenkins/pytorch/macos-test.sh
@@ -4,23 +4,6 @@
 # shellcheck source=./macos-common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh"
 
-if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
-  pip install hypothesis "expecttest==0.1.3" "librosa>=0.6.2" "numba==0.56.0" psutil "scipy==1.9.0"
-else
-  pip install hypothesis "expecttest==0.1.3" "librosa>=0.6.2" "numba<=0.49.1" psutil "scipy==1.6.3"
-fi
-
-# TODO move this to docker
-# Pin unittest-xml-reporting to freeze printing test summary logic, related: https://github.com/pytorch/pytorch/issues/69014
-pip install "unittest-xml-reporting<=3.2.0,>=2.0.0" \
-  pytest \
-  pytest-xdist \
-  pytest-shard \
-  pytest-rerunfailures \
-  "xdoctest==1.0.2" \
-  "pygments==2.12.0" \
-  "opt-einsum>=3.3"
-
 if [ -z "${CI}" ]; then
   rm -rf "${WORKSPACE_DIR}"/miniconda3/lib/python3.6/site-packages/torch*
 fi
diff --git a/torch/testing/_internal/opinfo/definitions/signal.py b/torch/testing/_internal/opinfo/definitions/signal.py
index c915e93e81de6..0bab0006e80c2 100644
--- a/torch/testing/_internal/opinfo/definitions/signal.py
+++ b/torch/testing/_internal/opinfo/definitions/signal.py
@@ -368,5 +368,12 @@ def make_signal_windows_opinfo(
         sample_inputs_func=partial(sample_inputs_window, beta=12.0),
         reference_inputs_func=partial(reference_inputs_kaiser_window, beta=12.0),
         error_inputs_func=error_inputs_kaiser_window,
+        skips=(
+            DecorateInfo(
+                unittest.skip("Unsupported on MPS for now pending aten::i0 support"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
+        ),
     ),
 ]

From 6881c0616a38fd3cf97ed839429502fb92d3b491 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 2 Nov 2022 17:27:30 +0000
Subject: [PATCH 0473/1922] Ignore macos usage log upload artifact failure
 (#88288)

I'm not quite sure why GitHub starts to get flaky when we are trying to upload usage_log.txt to it (500 Internal server error). But we can live without it, so let's just ignore this for now, and follow up on this latter.

The failures all come from M1 runner, so it seems to point to a connectivity issue between AWS and GitHub:

* https://github.com/pytorch/pytorch/actions/runs/3373976793/jobs/5599310905
* https://github.com/pytorch/pytorch/actions/runs/3372858660/jobs/5597033598
* https://github.com/pytorch/pytorch/actions/runs/3371548201/jobs/5594274444
* https://github.com/pytorch/pytorch/actions/runs/3370877990/jobs/5592709210
* https://github.com/pytorch/pytorch/actions/runs/3370609384/jobs/5592008430

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88288
Approved by: https://github.com/clee2000
---
 .github/actions/upload-test-artifacts/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml
index 4b9c75c38b1b6..624c4895155a7 100644
--- a/.github/actions/upload-test-artifacts/action.yml
+++ b/.github/actions/upload-test-artifacts/action.yml
@@ -139,3 +139,4 @@ runs:
         retention-days: 14
         if-no-files-found: ignore
         path: usage_log.txt
+      continue-on-error: true

From e38c14b81af6521af47b287aedca0f29d00ff320 Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@fb.com>
Date: Tue, 1 Nov 2022 14:44:17 -0700
Subject: [PATCH 0474/1922] [dynamo] Error when user nests FX with dynamo
 (#87797)

Today, this doesn't work and dynamo errors out in a very non-obvious way (see:
https://gist.github.com/suo/dde04830372ab51a4a34ea760f14200a).

Here, we detect the error early and exit with a nicer msg. Also add a
config option to just no-op dynamo (which need to unblock internal
enablement).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87797
Approved by: https://github.com/yf225, https://github.com/soumith, https://github.com/jansel
---
 test/dynamo/test_misc.py    | 14 ++++++++++++++
 torch/_dynamo/config.py     |  4 ++++
 torch/_dynamo/eval_frame.py | 16 ++++++++++++++++
 3 files changed, 34 insertions(+)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 4589b827d7499..42244892b45f6 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2761,6 +2761,20 @@ def forward(self, x):
             dynamo_result = graph(x)
             self.assertTrue(same(real, dynamo_result))
 
+    def test_error_on_nested_fx_trace(self):
+        input = torch.rand(2, 3)
+
+        def f(x):
+            x + x
+
+        real = f(input)
+
+        optimized = torch._dynamo.optimize("eager")(f)
+        self.assertTrue(same(optimized(input), real))
+
+        with self.assertRaisesRegex(RuntimeError, "Detected that you are using FX"):
+            gm = torch.fx.symbolic_trace(optimized)
+
 
 class CustomFunc(torch.autograd.Function):
     @staticmethod
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 87014b20537bc..12088383e741c 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -153,6 +153,10 @@
 # How to import torchinductor, either torchinductor or torch.inductor
 inductor_import = dynamo_import.replace("dynamo", "inductor")
 
+# If true, error with a better message if we symbolically trace over a
+# dynamo-optimized function. If false, silently suppress dynamo.
+error_on_nested_fx_trace = True
+
 # root folder of the project
 if "torch." in dynamo_import:
     base_dir = dirname(dirname(dirname(abspath(__file__))))
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index fce9e43b39343..9f115481582b8 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -2,6 +2,7 @@
 import copy
 import functools
 import inspect
+import itertools
 import logging
 import os
 import sys
@@ -149,6 +150,21 @@ def __call__(self, *args, **kwargs):
 
         @functools.wraps(fn)
         def _fn(*args, **kwargs):
+            any_arg_is_proxy = any(
+                map(
+                    lambda arg: isinstance(arg, torch.fx.Proxy),
+                    itertools.chain(args, kwargs.values()),
+                )
+            )
+            if any_arg_is_proxy:
+                if config.error_on_nested_fx_trace:
+                    raise RuntimeError(
+                        "Detected that you are using FX to symbolically trace "
+                        "a dynamo-optimized function. This is not supported at the moment."
+                    )
+                else:
+                    return fn
+
             on_enter()
             prior = set_eval_frame(callback)
             backend_ctx = backend_ctx_ctor()

From b1bae8a85d8c2549a7c40522a5d44449117f4bd1 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 2 Nov 2022 17:39:04 +0000
Subject: [PATCH 0475/1922] Add more debug information when installing NVIDIA
 driver (#88168)

This calls `lspci`, `lsmod`, and `modinfo nvidia` before and after the installation to gather more data about the "No GPU available" transient issue on G5 runner, i.e. https://hud.pytorch.org/pytorch/pytorch/commit/59fe272c1e698989228af5ad197bdd2985e4e9b9

This also handles `nvidia-smi` call and tries to re-install the driver if the first call fails, i.e. `No devices were found` https://hud.pytorch.org/pytorch/pytorch/commit/8ea19c802e38c061e79176360c1ecaa81ce2088a
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88168
Approved by: https://github.com/clee2000, https://github.com/malfet
---
 .github/scripts/install_nvidia_utils_linux.sh | 31 +++++++++++++++----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index 79f588633794e..c5c96e0976aac 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -26,18 +26,29 @@ install_nvidia_driver_amzn2() {
         # Purge any nvidia driver installed from RHEL repo
         sudo yum remove -y nvidia-driver-latest-dkms
 
+        # Try to gather more information about the runner and its existing NVIDIA driver if any
+        echo "Before installing NVIDIA driver"
+        lspci
+        lsmod
+        modinfo nvidia || true
+
         HAS_NVIDIA_DRIVER=0
         # Check if NVIDIA driver has already been installed
         if [ -x "$(command -v nvidia-smi)" ]; then
+            set +e
             # The driver exists, check its version next
             INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)
+            NVIDIA_SMI_STATUS=$?
 
-            if [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
+            if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
+                echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
+            elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
                 echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
             else
                 HAS_NVIDIA_DRIVER=1
                 echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
             fi
+            set -e
         fi
 
         if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
@@ -51,17 +62,25 @@ install_nvidia_driver_amzn2() {
             sudo rm -fv /tmp/nvidia_driver
         fi
 
+        sudo modprobe nvidia || true
+        echo "After installing NVIDIA driver"
+        lspci
+        lsmod
+        modinfo nvidia || true
+
         (
             set +e
             nvidia-smi
-            status=$?
+            NVIDIA_SMI_STATUS=$?
+
             # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
-            if [ $status -eq 0 ] || [ $status -eq 14 ]; then
-                echo "INFO: Ignoring allowed status ${status}"
+            if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
+                echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
             else
-                echo "ERROR: nvidia-smi exited with unresolved status ${status}"
-                exit ${status}
+                echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
+                exit ${NVIDIA_SMI_STATUS}
             fi
+            set -e
         )
     )
 }

From 31f919add0ddbd33d287f4a5ef83e96a6fd35f78 Mon Sep 17 00:00:00 2001
From: Pruthvi Madugundu <pmagundu@amd.com>
Date: Wed, 2 Nov 2022 17:41:57 +0000
Subject: [PATCH 0476/1922] Introduce TORCH_DISABLE_GPU_ASSERTS (#84190)

- Asserts for CUDA are enabled by default
- Disabled for ROCm by default by setting `TORCH_DISABLE_GPU_ASSERTS` to `ON`
- Can be enabled for ROCm by setting above variable to`OFF` during build or can be forcefully enabled by setting `ROCM_FORCE_ENABLE_GPU_ASSERTS:BOOL=ON`

This is follow up changes as per comment in PR #81790, comment [link](https://github.com/pytorch/pytorch/pull/81790#issuecomment-1215929021)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84190
Approved by: https://github.com/jeffdaily, https://github.com/malfet
---
 CMakeLists.txt             |  1 +
 c10/macros/Macros.h        | 11 ++++++-----
 caffe2/core/macros.h.in    |  2 ++
 cmake/Dependencies.cmake   | 10 ++++++++++
 cmake/Summary.cmake        |  1 +
 cmake/public/LoadHIP.cmake | 16 ----------------
 docs/source/notes/hip.rst  | 11 +++++++++++
 7 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b1e0f517fafa4..6efd3f2df9366 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,6 +285,7 @@ if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER})
 endif()
 option(USE_ZMQ "Use ZMQ" OFF)
 option(USE_ZSTD "Use ZSTD" OFF)
+option(TORCH_DISABLE_GPU_ASSERTS "Disable GPU asserts by default" OFF)
 # Ensure that an ITT build is the default for x86 CPUs
 cmake_dependent_option(
   USE_ITT "Use Intel(R) VTune Profiler ITT functionality" ON
diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
index e77fa0fde2ee0..31cd2219d10e6 100644
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@@ -326,9 +326,8 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 // CUDA_KERNEL_ASSERT checks the assertion
 // even when NDEBUG is defined. This is useful for important assertions in CUDA
 // code that would otherwise be suppressed when building Release.
-#if defined(__ANDROID__) || defined(__APPLE__) ||  \
-    (defined(USE_ROCM) && ROCM_VERSION < 40100) || \
-    (defined(USE_ROCM) && defined(ROCM_DISABLE_GPU_ASSERTS))
+#if defined(__ANDROID__) || defined(__APPLE__) || \
+    (defined(USE_ROCM) && ROCM_VERSION < 40100)
 // Those platforms do not support assert()
 #define CUDA_KERNEL_ASSERT(cond)
 #define SYCL_KERNEL_ASSERT(cond)
@@ -368,7 +367,9 @@ extern SYCL_EXTERNAL void __assert_fail(
     unsigned int line,
     const char* func);
 #else // __SYCL_DEVICE_ONLY__
-#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)))
+#if (                                                                       \
+    defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)) && \
+    !defined(TORCH_DISABLE_GPU_ASSERTS))
 // CUDA supports __assert_fail function which are common for both device
 // and host side code.
 __host__ __device__
@@ -386,7 +387,7 @@ __host__ __device__
         const char* function) throw() __attribute__((__noreturn__));
 
 #if (defined(__HIP_ARCH__) || defined(__HIP__)) && \
-    !defined(ROCM_DISABLE_GPU_ASSERTS)
+    !defined(TORCH_DISABLE_GPU_ASSERTS)
 // ROCm supports __assert_fail only as a device side function.
 __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
     const char* assertion,
diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in
index 9c9f734575634..2d9f03e94c0fc 100644
--- a/caffe2/core/macros.h.in
+++ b/caffe2/core/macros.h.in
@@ -44,6 +44,7 @@ static_assert(
 #cmakedefine CAFFE2_USE_NVTX
 #cmakedefine CAFFE2_USE_ITT
 #cmakedefine CAFFE2_USE_TRT
+#cmakedefine TORCH_DISABLE_GPU_ASSERTS
 
 #ifndef EIGEN_MPL2_ONLY
 #cmakedefine EIGEN_MPL2_ONLY
@@ -85,4 +86,5 @@ static_assert(
   {"USE_NVTX", "${CAFFE2_USE_NVTX}"}, \
   {"USE_ITT", "${CAFFE2_USE_ITT}"}, \
   {"USE_TRT", "${CAFFE2_USE_TRT}"}, \
+  {"TORCH_DISABLE_GPU_ASSERTS", "${TORCH_DISABLE_GPU_ASSERTS}"}, \
 }
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index e232fcb624cd3..cf3c2c2caafd2 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1248,6 +1248,16 @@ if(ANDROID)
   list(APPEND Caffe2_DEPENDENCY_LIBS log)
 endif()
 
+# ---[ Kernel asserts
+# Kernel asserts are enabled by default for CUDA and disabled for ROCm.
+# For ROCm, it can be enabled by setting ROCM_FORCE_ENABLE_GPU_ASSERTS
+if(USE_ROCM AND ROCM_FORCE_ENABLE_GPU_ASSERTS)
+  message(STATUS "Forcefully enabling kernel asserts on ROCM")
+elseif(USE_ROCM AND NOT ROCM_FORCE_ENABLE_GPU_ASSERTS)
+  message(STATUS "Disabling kernel asserts for ROCm")
+  caffe2_update_option(TORCH_DISABLE_GPU_ASSERTS ON)
+endif()
+
 # ---[ LLVM
 if(USE_LLVM)
   message(STATUS "Looking for LLVM in ${USE_LLVM}")
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index fd6444680e2d4..279d72a41e660 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -199,4 +199,5 @@ function(caffe2_print_configuration_summary)
   # coreml
   message(STATUS "  USE_COREML_DELEGATE     : ${USE_COREML_DELEGATE}")
   message(STATUS "  BUILD_LAZY_TS_BACKEND   : ${BUILD_LAZY_TS_BACKEND}")
+  message(STATUS "  TORCH_DISABLE_GPU_ASSERTS : ${TORCH_DISABLE_GPU_ASSERTS}")
 endfunction()
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 89a61b6242856..b51284115f144 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -143,9 +143,6 @@ message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}")
 # Add HIP to the CMAKE Module Path
 set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})
 
-#Disable kernel assert due to performance regression
-set(ROCM_ENABLE_KERNEL_ASSERTS FALSE CACHE BOOL "Kernel asserts are disabled by default for ROCm")
-
 macro(find_package_and_print_version PACKAGE_NAME)
   find_package("${PACKAGE_NAME}" ${ARGN})
   message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
@@ -286,19 +283,6 @@ if(HIP_FOUND)
   find_package_and_print_version(hipcub REQUIRED)
   find_package_and_print_version(rocthrust REQUIRED)
 
-  if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0")
-    if(ROCM_ENABLE_KERNEL_ASSERTS)
-      message("ROCm version >= 4.1; enabling asserts")
-    else()
-      add_definitions(-DROCM_DISABLE_GPU_ASSERTS)
-      message("ROCm version >= 4.1; kernel asserts are disabled")
-    endif()
-  else()
-    # Disable Asserts In Code (Can't use asserts on HIP stack.)
-    add_definitions(-DNDEBUG)
-    message("ROCm version < 4.1; disablng asserts")
-  endif()
-
   if(HIP_COMPILER STREQUAL clang)
     set(hip_library_name amdhip64)
   else()
diff --git a/docs/source/notes/hip.rst b/docs/source/notes/hip.rst
index a9c94e2a4febb..c54e201489705 100644
--- a/docs/source/notes/hip.rst
+++ b/docs/source/notes/hip.rst
@@ -144,3 +144,14 @@ Refer to CUDA Semantics doc
 ---------------------------
 
 For any sections not listed here, please refer to the CUDA semantics doc: :ref:`cuda-semantics`
+
+
+Enabling kernel asserts
+-----------------------
+
+Kernel asserts are supported on ROCm, but they are disabled due to performance overhead. It can be enabled
+by recompiling the PyTorch from source.
+
+Please add below line as an argument to cmake command parameters::
+
+    -DROCM_FORCE_ENABLE_GPU_ASSERTS:BOOL=ON

From d43846d598df56c0b0041f247d959b05e0b51f9f Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 11:38:09 +0000
Subject: [PATCH 0477/1922] [FSDP()][26/N] Move `_lazy_init()` into
 `_fsdp_root_pre_forward()` (#87941)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87941
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_runtime_utils.py              | 8 ++++++--
 torch/distributed/fsdp/fully_sharded_data_parallel.py | 3 +--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index c7282087f4276..bf8a36b6cca10 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -311,14 +311,18 @@ def _post_forward_reshard(
 @no_type_check
 def _fsdp_root_pre_forward(
     state: _State,
+    root_module: nn.Module,
     *args,
     **kwargs,
 ):
     """
     Runs pre-forward logic specific to the root FSDP instance, which should run
-    before any individual module's pre-forward. If this is called on a non-root
-    FSDP instance, then the forward inputs are returned directly.
+    before any individual module's pre-forward. This starts with an attempt at
+    lazy initialization (which only runs non-vacuously once). Otherwise, if
+    this is called on a non-root FSDP instance, then the forward inputs are
+    returned directly.
     """
+    _lazy_init(state, root_module)
     p_assert(state._is_root is not None, "Expects a root FSDP to have been set")
     if not state._is_root:
         return args, kwargs
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 4cfe68dc628ac..8d0ed60dbde23 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -935,8 +935,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
         with torch.autograd.profiler.record_function(
             "FullyShardedDataParallel.forward"
         ):
-            _lazy_init(self, self)
-            args, kwargs = _fsdp_root_pre_forward(self, *args, **kwargs)
+            args, kwargs = _fsdp_root_pre_forward(self, self, *args, **kwargs)
             unused = None
             unshard_fn = functools.partial(_pre_forward_unshard, self, self._handles)
             reshard_fn = functools.partial(_post_forward_reshard, self, self._handles)

From 3ee52c74f93d9d284bba927b017250dbb5f5703d Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Wed, 2 Nov 2022 11:03:04 -0400
Subject: [PATCH 0478/1922] Remove wrong internal assert in
 handle_view_on_rebase (#88243)

Fixes: https://github.com/pytorch/pytorch/issues/88205

The `CreationMeta::NO_GRAD_MODE` path in handle_view_on_rebase wrongly assumes that the tensor would be a leaf, because tensors created in no_grad are always leaf tensors. However, due to creation_meta propagation, a view of a view created in no_grad also has `CreationMeta::NO_GRAD_MODE`, but DOES have grad_fn.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88243
Approved by: https://github.com/albanD
---
 test/test_autograd.py            | 17 ++++++++++++
 torch/csrc/autograd/variable.cpp | 46 ++++++++++++++++++++++----------
 2 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 8fa611f5b269e..7df0b1ddae388 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -5881,6 +5881,23 @@ def run_tests(fn):
         run_tests(lambda v: v.swapdims_(0, 0))
         run_tests(lambda v: v.swapaxes_(0, 0))
 
+    def test_autograd_inplace_view_of_view(self):
+        x = torch.zeros(2)
+        with torch.no_grad():
+            y = x.view(2)
+        y.requires_grad_(True)
+        z = y.view(2)
+        with self.assertRaisesRegex(RuntimeError, "a view of a view .* is being .* inside the no_grad block"):
+            z /= 2
+
+        x = torch.zeros(2)
+        with torch.inference_mode():
+            y = x.view(2)
+        y.requires_grad_(True)
+        z = y.view(2)
+        with self.assertRaisesRegex(RuntimeError, "a view of a view .* is being .* inside the inference_mode"):
+            z /= 2
+
     # TODO This is not the correct behavior -
     # See https://github.com/pytorch/pytorch/issues/49825#issuecomment-794466627
     def test_autograd_inplace_views_cross_dtype(self):
diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp
index a2f075fcf1cf0..368a55ea8c1a7 100644
--- a/torch/csrc/autograd/variable.cpp
+++ b/torch/csrc/autograd/variable.cpp
@@ -761,7 +761,38 @@ void handle_view_on_rebase(
     } else {
       modified_obj = "is being";
     }
-    if (grad_fn) {
+
+    if (creation_meta == CreationMeta::INFERENCE_MODE ||
+        creation_meta == CreationMeta::NO_GRAD_MODE || !grad_fn) {
+      std::string prefix;
+      if (grad_fn) {
+        prefix = c10::str(
+            "Output ",
+            diff_view_meta->output_nr_,
+            " of ",
+            grad_fn->name(),
+            " is a view of a view which was created in");
+      } else {
+        prefix = "A view was created in";
+      }
+      if (creation_meta == CreationMeta::INFERENCE_MODE) {
+        msg = c10::str(
+            prefix,
+            " inference mode and ",
+            modified_obj,
+            " modified inplace in normal mode.");
+      } else {
+        // create_meta is not necessarily CreationMeta::NO_GRAD_MODE
+        // e.g. CreationMeta::IN_CUSTOM_FUNCTION is possible, but we know that
+        // if there is no grad_fn, that means that the view was performed in
+        // no-grad mode
+        msg = c10::str(
+            prefix,
+            " no_grad mode and ",
+            modified_obj,
+            " modified inplace with grad mode enabled.");
+      }
+    } else {
       msg = c10::str(
           "Output ",
           diff_view_meta->output_nr_,
@@ -770,16 +801,6 @@ void handle_view_on_rebase(
           " is a view and ",
           modified_obj,
           " modified inplace.");
-    } else if (creation_meta == CreationMeta::INFERENCE_MODE) {
-      msg = c10::str(
-          "A view was created in inference mode and ",
-          modified_obj,
-          " modified inplace in normal mode.");
-    } else {
-      msg = c10::str(
-          "A view was created in no_grad mode and ",
-          modified_obj,
-          " modified inplace with grad mode enabled.");
     }
 
     if (creation_meta == CreationMeta::MULTI_OUTPUT_NODE) {
@@ -789,7 +810,6 @@ void handle_view_on_rebase(
           " allow the output views to be modified inplace. You should replace the inplace operation by an"
           " out-of-place one.");
     } else if (creation_meta == CreationMeta::NO_GRAD_MODE) {
-      TORCH_INTERNAL_ASSERT(!grad_fn);
       msg = c10::str(
           msg,
           " Given that this use case is ambiguous and error-prone, it is forbidden."
@@ -797,14 +817,12 @@ void handle_view_on_rebase(
           " inside the no_grad block (if you don't want the inplace to be tracked) or both outside (if you want"
           " the inplace to be tracked).");
     } else if (creation_meta == CreationMeta::INFERENCE_MODE) {
-      TORCH_INTERNAL_ASSERT(!grad_fn);
       msg = c10::str(
           msg,
           " Given that this use case is ambiguous and error-prone, it is forbidden."
           " You can clarify your code by moving both the view and the inplace either both"
           " inside the inference_mode block (if you don't want the inplace to be tracked) or both outside (if you want"
           " the inplace to be tracked).");
-      TORCH_CHECK(false, msg);
     } else if (creation_meta == CreationMeta::IN_CUSTOM_FUNCTION) {
       msg = c10::str(
           msg,

From 9b3f808f9f833c4f550c15b4e1b02d859f4deded Mon Sep 17 00:00:00 2001
From: Zachary DeVito <zdevito@meta.com>
Date: Tue, 1 Nov 2022 11:35:23 -0700
Subject: [PATCH 0479/1922] [functorch.dims] Fix corner cases with permute
 (#88226)

Previously the permute function was extended to behave like the `order`
function for first-class dimensions. However, unlike `permute`,
`order` doesn't have a keyword argment `dims`, and there is no way to add
it in a way that makes both permute an order to continue to have the same
behavior. So this change just removes the extra functionality of permute,
which wasn't documented anyway. Fixes #88187
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88226
Approved by: https://github.com/zou3519
---
 functorch/csrc/dim/dim.cpp  |  3 +++
 functorch/dim/__init__.py   |  4 ++--
 test/functorch/test_dims.py | 11 +++++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp
index abdebc24e0112..c43a6c7a9cff7 100644
--- a/functorch/csrc/dim/dim.cpp
+++ b/functorch/csrc/dim/dim.cpp
@@ -1767,6 +1767,9 @@ static PyObject* order(PyObject *_,
                       PyObject *kwnames) {
     Arena A;
     PY_BEGIN
+    if (kwnames) {
+        py::raise_error(PyExc_TypeError, "unexpected keyword arguments %S", kwnames);
+    }
     AT_ASSERT(nargs-- > 0);
     Slice<DimEntry> orig_levels;
     Slice<DimEntry> levels;
diff --git a/functorch/dim/__init__.py b/functorch/dim/__init__.py
index 4f1cd84e44a18..6d36a8994dfe9 100644
--- a/functorch/dim/__init__.py
+++ b/functorch/dim/__init__.py
@@ -102,9 +102,9 @@ def _def(name, *args, **kwargs):
 del _Tensor.ndim
 
 if use_c:
-    _Tensor.permute = _Tensor.order = _C._instancemethod(_C.order)
+    _Tensor.order = _C._instancemethod(_C.order)
 else:
-    _Tensor.permute = _Tensor.order = reference.positional
+    _Tensor.order = reference.positional
 
 _def('mean')
 _def('sum')
diff --git a/test/functorch/test_dims.py b/test/functorch/test_dims.py
index 24cc66c06d2e0..c609c375325c3 100644
--- a/test/functorch/test_dims.py
+++ b/test/functorch/test_dims.py
@@ -592,6 +592,17 @@ def test_functorch(self):
         BB = torch.mm(B[j], C)  # 3, 4, 2
         assert list(torch.mm(AA.T, BB).order(i, j).shape) == [3, 3, 2, 2]
 
+    def test_permute_orig(self):
+        d = dims(1)
+        t_fc = torch.rand(1, 2, 3, 4)[d]
+        assert t_fc.permute(dims=(1, 0, 2)).shape == t_fc.permute(1, 0, 2).shape
+
+    def test_order_keyword(self):
+        d = dims(1)
+        t = torch.rand(3)[d]
+        self.assertRaises(TypeError, lambda: t.order(wrong=3))
+
+
 
 skip_functorch_only = ['test_time_mm_fuse', 'test_attn_cuda']
 class TestMinFunctorchOnly(TestMin):

From fb8ad335f2d2035b91aa2d3eb206c7377c470c75 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 2 Nov 2022 18:13:37 +0000
Subject: [PATCH 0480/1922] Revert "Introduce TORCH_DISABLE_GPU_ASSERTS
 (#84190)"

This reverts commit 1e2c4a6e0e60dda763b53f00f25ee5c1f1e5233d.

Reverted https://github.com/pytorch/pytorch/pull/84190 on behalf of https://github.com/malfet due to Needs internal changes, has to be landed via co-dev
---
 CMakeLists.txt             |  1 -
 c10/macros/Macros.h        | 11 +++++------
 caffe2/core/macros.h.in    |  2 --
 cmake/Dependencies.cmake   | 10 ----------
 cmake/Summary.cmake        |  1 -
 cmake/public/LoadHIP.cmake | 16 ++++++++++++++++
 docs/source/notes/hip.rst  | 11 -----------
 7 files changed, 21 insertions(+), 31 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6efd3f2df9366..b1e0f517fafa4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,7 +285,6 @@ if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER})
 endif()
 option(USE_ZMQ "Use ZMQ" OFF)
 option(USE_ZSTD "Use ZSTD" OFF)
-option(TORCH_DISABLE_GPU_ASSERTS "Disable GPU asserts by default" OFF)
 # Ensure that an ITT build is the default for x86 CPUs
 cmake_dependent_option(
   USE_ITT "Use Intel(R) VTune Profiler ITT functionality" ON
diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
index 31cd2219d10e6..e77fa0fde2ee0 100644
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@@ -326,8 +326,9 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 // CUDA_KERNEL_ASSERT checks the assertion
 // even when NDEBUG is defined. This is useful for important assertions in CUDA
 // code that would otherwise be suppressed when building Release.
-#if defined(__ANDROID__) || defined(__APPLE__) || \
-    (defined(USE_ROCM) && ROCM_VERSION < 40100)
+#if defined(__ANDROID__) || defined(__APPLE__) ||  \
+    (defined(USE_ROCM) && ROCM_VERSION < 40100) || \
+    (defined(USE_ROCM) && defined(ROCM_DISABLE_GPU_ASSERTS))
 // Those platforms do not support assert()
 #define CUDA_KERNEL_ASSERT(cond)
 #define SYCL_KERNEL_ASSERT(cond)
@@ -367,9 +368,7 @@ extern SYCL_EXTERNAL void __assert_fail(
     unsigned int line,
     const char* func);
 #else // __SYCL_DEVICE_ONLY__
-#if (                                                                       \
-    defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)) && \
-    !defined(TORCH_DISABLE_GPU_ASSERTS))
+#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)))
 // CUDA supports __assert_fail function which are common for both device
 // and host side code.
 __host__ __device__
@@ -387,7 +386,7 @@ __host__ __device__
         const char* function) throw() __attribute__((__noreturn__));
 
 #if (defined(__HIP_ARCH__) || defined(__HIP__)) && \
-    !defined(TORCH_DISABLE_GPU_ASSERTS)
+    !defined(ROCM_DISABLE_GPU_ASSERTS)
 // ROCm supports __assert_fail only as a device side function.
 __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
     const char* assertion,
diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in
index 2d9f03e94c0fc..9c9f734575634 100644
--- a/caffe2/core/macros.h.in
+++ b/caffe2/core/macros.h.in
@@ -44,7 +44,6 @@ static_assert(
 #cmakedefine CAFFE2_USE_NVTX
 #cmakedefine CAFFE2_USE_ITT
 #cmakedefine CAFFE2_USE_TRT
-#cmakedefine TORCH_DISABLE_GPU_ASSERTS
 
 #ifndef EIGEN_MPL2_ONLY
 #cmakedefine EIGEN_MPL2_ONLY
@@ -86,5 +85,4 @@ static_assert(
   {"USE_NVTX", "${CAFFE2_USE_NVTX}"}, \
   {"USE_ITT", "${CAFFE2_USE_ITT}"}, \
   {"USE_TRT", "${CAFFE2_USE_TRT}"}, \
-  {"TORCH_DISABLE_GPU_ASSERTS", "${TORCH_DISABLE_GPU_ASSERTS}"}, \
 }
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index cf3c2c2caafd2..e232fcb624cd3 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1248,16 +1248,6 @@ if(ANDROID)
   list(APPEND Caffe2_DEPENDENCY_LIBS log)
 endif()
 
-# ---[ Kernel asserts
-# Kernel asserts are enabled by default for CUDA and disabled for ROCm.
-# For ROCm, it can be enabled by setting ROCM_FORCE_ENABLE_GPU_ASSERTS
-if(USE_ROCM AND ROCM_FORCE_ENABLE_GPU_ASSERTS)
-  message(STATUS "Forcefully enabling kernel asserts on ROCM")
-elseif(USE_ROCM AND NOT ROCM_FORCE_ENABLE_GPU_ASSERTS)
-  message(STATUS "Disabling kernel asserts for ROCm")
-  caffe2_update_option(TORCH_DISABLE_GPU_ASSERTS ON)
-endif()
-
 # ---[ LLVM
 if(USE_LLVM)
   message(STATUS "Looking for LLVM in ${USE_LLVM}")
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index 279d72a41e660..fd6444680e2d4 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -199,5 +199,4 @@ function(caffe2_print_configuration_summary)
   # coreml
   message(STATUS "  USE_COREML_DELEGATE     : ${USE_COREML_DELEGATE}")
   message(STATUS "  BUILD_LAZY_TS_BACKEND   : ${BUILD_LAZY_TS_BACKEND}")
-  message(STATUS "  TORCH_DISABLE_GPU_ASSERTS : ${TORCH_DISABLE_GPU_ASSERTS}")
 endfunction()
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index b51284115f144..89a61b6242856 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -143,6 +143,9 @@ message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}")
 # Add HIP to the CMAKE Module Path
 set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})
 
+#Disable kernel assert due to performance regression
+set(ROCM_ENABLE_KERNEL_ASSERTS FALSE CACHE BOOL "Kernel asserts are disabled by default for ROCm")
+
 macro(find_package_and_print_version PACKAGE_NAME)
   find_package("${PACKAGE_NAME}" ${ARGN})
   message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
@@ -283,6 +286,19 @@ if(HIP_FOUND)
   find_package_and_print_version(hipcub REQUIRED)
   find_package_and_print_version(rocthrust REQUIRED)
 
+  if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0")
+    if(ROCM_ENABLE_KERNEL_ASSERTS)
+      message("ROCm version >= 4.1; enabling asserts")
+    else()
+      add_definitions(-DROCM_DISABLE_GPU_ASSERTS)
+      message("ROCm version >= 4.1; kernel asserts are disabled")
+    endif()
+  else()
+    # Disable Asserts In Code (Can't use asserts on HIP stack.)
+    add_definitions(-DNDEBUG)
+    message("ROCm version < 4.1; disablng asserts")
+  endif()
+
   if(HIP_COMPILER STREQUAL clang)
     set(hip_library_name amdhip64)
   else()
diff --git a/docs/source/notes/hip.rst b/docs/source/notes/hip.rst
index c54e201489705..a9c94e2a4febb 100644
--- a/docs/source/notes/hip.rst
+++ b/docs/source/notes/hip.rst
@@ -144,14 +144,3 @@ Refer to CUDA Semantics doc
 ---------------------------
 
 For any sections not listed here, please refer to the CUDA semantics doc: :ref:`cuda-semantics`
-
-
-Enabling kernel asserts
------------------------
-
-Kernel asserts are supported on ROCm, but they are disabled due to performance overhead. It can be enabled
-by recompiling the PyTorch from source.
-
-Please add below line as an argument to cmake command parameters::
-
-    -DROCM_FORCE_ENABLE_GPU_ASSERTS:BOOL=ON

From bd6803cf8865b6f9547a194708eb86f42f9f92cd Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 2 Nov 2022 18:43:36 +0000
Subject: [PATCH 0481/1922] Revert "Add support for neg to NestedTensor
 (#88131)"

This reverts commit 6a75a0d1a197e378ebbf1f73f5ab93ce79cb873a.

Reverted https://github.com/pytorch/pytorch/pull/88131 on behalf of https://github.com/mehtanirav due to [Internal breakages](https://www.internalfb.com/intern/sandcastle/job/13510799692239080/insights)
---
 aten/src/ATen/native/native_functions.yaml    |  2 --
 .../native/nested/NestedTensorUnaryOps.cpp    | 12 -------
 docs/source/nested.rst                        |  1 -
 test/test_nestedtensor.py                     | 33 +++++++++----------
 4 files changed, 15 insertions(+), 33 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 296047d7c3c25..300a14dd6baf6 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4247,7 +4247,6 @@
   dispatch:
     SparseCPU, SparseCUDA: neg_sparse
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr
-    NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
   tags: canonical
 
 - func: neg_(Tensor(a!) self) -> Tensor(a!)
@@ -4257,7 +4256,6 @@
   dispatch:
     SparseCPU, SparseCUDA: neg_sparse_
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_
-    NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_
 
 - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
diff --git a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
index 6be7239775ea6..74289a1372e12 100644
--- a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
@@ -58,17 +58,5 @@ Tensor NestedTensor_tanh(const Tensor& self) {
   return map_nt(self, at::tanh);
 }
 
-Tensor& NestedTensor_neg_(Tensor& self) {
-  auto self_ptr = get_nested_tensor_impl(self);
-  check_numel_equals_buffer_size(self_ptr);
-  auto buffer = self_ptr->get_buffer();
-  at::neg_(buffer);
-  return self;
-}
-
-Tensor NestedTensor_neg(const Tensor& self) {
-  return map_nt(self, at::neg);
-}
-
 } // namespace native
 } // namespace at
diff --git a/docs/source/nested.rst b/docs/source/nested.rst
index 07712e0376f16..21ff980256911 100644
--- a/docs/source/nested.rst
+++ b/docs/source/nested.rst
@@ -196,7 +196,6 @@ NestedTensor and any constraints they have.
    :func:`torch.nn.Dropout`; "Behavior is the same as on regular tensors."
    :func:`torch.relu`; "Behavior is the same as on regular tensors."
    :func:`torch.gelu`; "Behavior is the same as on regular tensors."
-   :func:`torch.neg`; "Behavior is the same as on regular tensors."
    :func:`torch.add`; "Supports elementwise addition of two nested tensors.
    Supports addition of a scalar to a nested tensor."
    :func:`torch.mul`; "Supports elementwise multiplication of two nested tensors.
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index e0f40ca85238c..baf00a4d0f2b4 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -304,6 +304,20 @@ def test_repr_string(self):
         self.assertEqual(str(a), expected)
         self.assertEqual(repr(a), expected)
 
+    @torch.inference_mode()
+    def test_activations(self):
+        for func in (torch.nn.functional.relu,
+                     torch.nn.functional.relu_,
+                     torch.nn.functional.gelu,
+                     torch._C._nn.gelu_,
+                     torch.tanh,
+                     torch.tanh_):
+            t = torch.tensor([-1, 0, 1], dtype=torch.float)
+            nt = torch.nested.nested_tensor([t])
+            nested_result = func(nt)
+            self.assertTrue(nested_result.is_nested)
+            self.assertEqual(func(t), nested_result.unbind()[0])
+
     def test_to_padded_tensor_on_empty_tensor(self):
 
         nt = torch.nested.nested_tensor([])
@@ -748,24 +762,6 @@ def test_nested_tensor_indexing(self, device, dtype):
         expected_grad = torch.nested.nested_tensor([grad_x0, torch.zeros((3, 4), device=device, dtype=dtype)])
         self.assertEqual(nt.grad, expected_grad)
 
-    @parametrize("func", [torch.nn.functional.relu,
-                          torch.nn.functional.relu_,
-                          torch.nn.functional.gelu,
-                          torch._C._nn.gelu_,
-                          torch.tanh,
-                          torch.tanh_,
-                          torch.neg])
-    def test_activations(self, device, func):
-        nt, nt_noncontiguous = random_nt_noncontiguous_pair((2, 3, 6, 7), device=device, dtype=torch.float32)
-        nested_result = func(nt)
-        self.assertTrue(nested_result.is_nested)
-        for t, t_res in zip(nt.unbind(), nested_result.unbind()):
-            self.assertEqual(func(t), t_res)
-        self.assertRaisesRegex(
-            RuntimeError,
-            "NestedTensor must be contiguous to get buffer.",
-            lambda: func(nt_noncontiguous))
-
     @dtypes(*floating_types_and_half())
     def test_nested_tensor_chunk(self, device, dtype):
         # Transformer use case
@@ -901,6 +897,7 @@ def test_nested_tensor_div(self, device, dtype):
             RuntimeError, "div requires offsets to match when given NestedTensors",
             lambda: nt_chunks[0] / nt_chunks[1])
 
+
     @dtypes(torch.float, torch.float16)
     @skipMeta
     @torch.inference_mode()

From df28ec6afc29402e68924fe9d5facc62c78ae37f Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Wed, 2 Nov 2022 19:41:09 +0000
Subject: [PATCH 0482/1922] [WIP] Add symnode magic method testing (#88119)

There are failures that need to be addressed before landing:
- Some issue with handling of booleans.
- Most functions return wrong result when mixing int/float

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88119
Approved by: https://github.com/ezyang
---
 test/test_dynamic_shapes.py              | 165 ++++++++++++++++++++++-
 torch/fx/experimental/symbolic_shapes.py |  22 ++-
 2 files changed, 179 insertions(+), 8 deletions(-)

diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index 0e85b54cfe3f7..0d421b04008d2 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -4,13 +4,21 @@
 from torch._C import _disabled_torch_function_impl
 import torch.fx
 import torch.nn.functional as F
-from torch.testing._internal.common_utils import run_tests, TestCase, skipIfTorchDynamo, IS_WINDOWS
+from torch.testing._internal.common_utils import run_tests, TestCase, skipIfTorchDynamo, \
+    IS_WINDOWS, parametrize, instantiate_parametrized_tests
 import unittest
 import torch
 import operator
 import itertools
+import random
+import contextlib
+import math
+import builtins
+import atexit
 import io
+import os
 from torch.utils._pytree import tree_map
+from torch.fx.experimental import symbolic_shapes
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.experimental.symbolic_shapes import ShapeEnv, sym_float, guard_int, SymNode
 from torch.utils._python_dispatch import TorchDispatchMode
@@ -386,6 +394,161 @@ def forward(self, a_1: f32[s0, s1], b_1: f32[s2, s1]):
         getitem_1: b8[s0 + s2, 2*s1] = native_dropout[1];  native_dropout = None
         return (getitem, getitem_1)""")  # noqa: B950
 
+# This environment variable controls whether or not we print expected failure
+# lists at the end of a test suite run.  The intended usage looks like this:
+#
+# 1. Run `PYTORCH_COLLECT_EXPECT=1 python test/test_dynamic_shapes.py -k TestSymNumberMagicMethods`.
+# 2. Given the printed xfail list, add them to the set expected_failure_sym_magic_methods.
+COLLECT_EXPECT = os.getenv('PYTORCH_COLLECT_EXPECT', '0') == '1'
+
+seen_failed = []
+def print_seen():
+    out = []
+    for key, reason in seen_failed:
+        # Make sure the generated line is lint clean
+        out.append(f"    {key},  # {reason}"[:120])
+
+    print("expected_failure_sym_magic_methods = {")
+    print("\n".join(out))
+    print("}")
+
+if COLLECT_EXPECT:
+    atexit.register(print_seen)
+
+expected_failure_sym_magic_methods = {
+    ('floordiv', 'SymInt', 'float'),  # Cannot convert complex to float
+    ('floordiv', 'int', 'SymFloat'),  # unsupported operand type(s) for //: 'int' and 'SymFloat'
+    ('floordiv', 'SymInt', 'SymFloat'),  # Cannot convert complex to float
+    ('mod', 'int', 'SymFloat'),  # unsupported operand type(s) for %: 'int' and 'SymFloat'
+    ('sym_int', 'int', 'float'),  # sym_int() takes 1 positional argument but 2 were given
+    ('sym_int', 'SymInt', 'float'),  # sym_int() takes 1 positional argument but 2 were given
+    ('sym_int', 'int', 'SymFloat'),  # sym_int() takes 1 positional argument but 2 were given
+    ('sym_int', 'SymInt', 'SymFloat'),  # sym_int() takes 1 positional argument but 2 were given
+    ('sym_int', 'int', 'int'),  # sym_int() takes 1 positional argument but 2 were given
+    ('sym_int', 'SymInt', 'int'),  # sym_int() takes 1 positional argument but 2 were given
+    ('sym_int', 'int', 'SymInt'),  # sym_int() takes 1 positional argument but 2 were given
+    ('sym_int', 'SymInt', 'SymInt'),  # sym_int() takes 1 positional argument but 2 were given
+
+
+}
+
+@skipIfTorchDynamo("Creating ShapeEnv fails for confusing reasons (also we never expect dynamo to see code like this)")
+class TestSymNumberMagicMethods(TestCase):
+    def _do_test(self, fn, inp1, inp2, shape_env, is_unary_fn):
+        # Helper function
+        seed_node = (create_symint(shape_env, 1) / 1.).get_pyobj()
+
+        def get_sym_inp(inp):
+            if isinstance(inp, int):
+                return torch.SymInt(seed_node.to_node(inp))
+            else:
+                return torch.SymFloat(seed_node.to_node(inp))
+
+        def maybe_xfail(inp1, inp2):
+            key = (fn, type(inp1).__name__, type(inp2).__name__)
+            if COLLECT_EXPECT:
+                @contextlib.contextmanager
+                def context():
+                    try:
+                        yield
+                    except TypeError as e:
+                        seen_failed.append((key, str(e)))
+                return context()
+
+            if key in expected_failure_sym_magic_methods:
+                return self.assertRaises(TypeError)
+            else:
+                return contextlib.nullcontext()
+
+        # These functions might return plain int/float
+        has_valid_downcast = fn in ["min", "max"]
+        if fn in symbolic_shapes.magic_methods_on_builtins:
+            lambda_apply = getattr(builtins, fn)
+        elif fn in symbolic_shapes.magic_methods_on_math:
+            lambda_apply = getattr(math, fn)
+        elif fn in symbolic_shapes.magic_methods_on_submodule:
+            lambda_apply = getattr(symbolic_shapes, fn)
+        else:
+            lambda_apply = getattr(operator, fn)
+
+        if fn in symbolic_shapes.always_float_magic_methods:
+            tp = "float"
+        elif fn in symbolic_shapes.always_int_magic_methods:
+            tp = "int"
+        elif is_unary_fn:
+            tp = "float" if isinstance(inp1, float) else "int"
+        else:
+            tp = "float" if any(isinstance(i, float) for i in [inp1, inp2]) else "int"
+
+        def guard_fn(v):
+            try:
+                if fn in symbolic_shapes.always_bool_magic_methods:
+                    return bool(v)
+                else:
+                    return getattr(v.node, f"guard_{tp}")("", 0)
+            except Exception as e:
+                if has_valid_downcast:
+                    return v
+                else:
+                    raise e
+
+        # Get reference result
+        with maybe_xfail(inp1, inp2):
+            if is_unary_fn:
+                ref_out = lambda_apply(inp1)
+            else:
+                ref_out = lambda_apply(inp1, inp2)
+
+        # Symified first arg
+        sym_inp1 = get_sym_inp(inp1)
+        with maybe_xfail(sym_inp1, inp2):
+            if is_unary_fn:
+                out = lambda_apply(sym_inp1)
+            else:
+                out = lambda_apply(sym_inp1, inp2)
+            self.assertEqual(guard_fn(out), ref_out)
+
+        if is_unary_fn:
+            return
+
+        # Symified second arg
+        sym_inp2 = get_sym_inp(inp2)
+        with maybe_xfail(inp1, sym_inp2):
+            out = lambda_apply(inp1, sym_inp2)
+            self.assertEqual(guard_fn(out), ref_out)
+
+        # Symified both args
+        with maybe_xfail(sym_inp1, sym_inp2):
+            out = lambda_apply(sym_inp1, sym_inp2)
+            self.assertEqual(guard_fn(out), ref_out)
+
+
+    @parametrize("fn", list(symbolic_shapes.magic_methods.keys()))
+    @parametrize("first_type", ["int", "float"])
+    @parametrize("second_type", ["int", "float"])
+    def test_method(self, fn, first_type, second_type):
+        if first_type == "float" and fn not in symbolic_shapes.float_magic_methods:
+            self.skipTest(f"{fn} is not a float magic method")
+
+        is_unary_fn = fn in symbolic_shapes.unary_magic_methods
+        # Second argument is ignored for unary function. So only run for one type
+        if is_unary_fn and second_type == "float":
+            self.skipTest(f"{fn} is unary and already tested")
+
+        # We could pass int/float directly for types but then the
+        # mangled test name is bad
+        inp1 = random.random() * 2.5
+        if first_type == "int":
+            inp1 = int(inp1)
+        inp2 = random.random() * 2.5
+        if second_type == "int":
+            inp2 = int(inp2)
+
+        shape_env = ShapeEnv()
+
+        self._do_test(fn, inp1, inp2, shape_env, is_unary_fn)
+
+instantiate_parametrized_tests(TestSymNumberMagicMethods)
 
 if __name__ == '__main__':
     run_tests()
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 5a4d5bff84b2f..bb59d8f5470cf 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -156,7 +156,7 @@ def wrap_int(self, num):
 
     def wrap_float(self, num):
         assert isinstance(num, float)
-        return SymNode(sympy.Integer(num), self.shape_env, float, constant=num)
+        return SymNode(sympy.Float(num), self.shape_env, float, constant=num)
 
     def clone(self):
         return SymNode(self.expr, self.shape_env, self.pytype, constant=self.constant)
@@ -271,6 +271,14 @@ def _nyi():
 
 float_magic_methods = {"add", "sub", "mul", "truediv", "ceil", "floor", "eq", "gt", "lt", "le", "ge", "pow"}
 
+magic_methods_on_builtins = {"min", "max"}
+magic_methods_on_math = {"ceil", "floor"}
+magic_methods_on_submodule = {"sym_float", "sym_int"}
+
+always_float_magic_methods = {"truediv", "sym_float"}
+always_int_magic_methods = {"ceil", "floor"}
+always_bool_magic_methods = {"eq", "gt", "lt", "le", "ge"}
+
 def wrap_node(x):
     if not isinstance(x, SymNode):
         return x
@@ -287,7 +295,7 @@ def _make_node_magic(method, func):
     func = lru_cache(256)(func)
 
     def binary_magic_impl(self, other):
-        if method in ["min", "max"]:
+        if method in magic_methods_on_builtins:
             op = getattr(builtins, method)
         else:
             op = getattr(operator, method)
@@ -303,7 +311,7 @@ def binary_magic_impl(self, other):
         out = func(expr, other_expr)
         out = sympy.expand(out)
         pytype: Type
-        if method in ["truediv"]:
+        if method in always_float_magic_methods:
             pytype = float
         else:
             pytype = self.pytype
@@ -314,9 +322,9 @@ def binary_magic_impl(self, other):
 
     def unary_magic_impl(self):
         if SYM_FUNCTION_MODE:
-            if method in ["ceil", "floor"]:
+            if method in magic_methods_on_math:
                 op = getattr(math, method)
-            elif method in ["sym_float", "sym_int"]:
+            elif method in magic_methods_on_submodule:
                 op = getattr(sys.modules[__name__], method)
             else:
                 op = getattr(operator, method)
@@ -328,9 +336,9 @@ def unary_magic_impl(self):
         out = func(expr)
         out = sympy.expand(out)
         pytype: Type
-        if method in ["ceil", "floor"]:
+        if method in always_int_magic_methods:
             pytype = int
-        elif method in ["sym_float"]:
+        elif method in always_float_magic_methods:
             pytype = float
         else:
             pytype = self.pytype

From cdee7f279d42c5c603e2d276c3f83dea250be04a Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 2 Nov 2022 01:23:57 +0000
Subject: [PATCH 0483/1922] [inductor] Handle the case where kwargs contains
 tensor (#88215)

Summary: Fix https://github.com/pytorch/torchdynamo/issues/1805;
currently inductor does not allow any tensor in kwargs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88215
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 14 ++++++++++++++
 torch/_inductor/ir.py               | 28 ++++++++++++++--------------
 torch/_inductor/lowering.py         |  6 ++++--
 3 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 58ae49eb1930e..114ba4c00ba5f 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4083,6 +4083,20 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                 else:
                     self.assertEqual(len(inps), 0)
 
+    @unittest.skipIf(HAS_CUDA, "histogramdd only supports cpu")
+    def test_kwargs(self):
+        def fn(x, y):
+            return torch.histogramdd(
+                x,
+                bins=[3, 3],
+                weight=y,
+            )
+
+        self.common(
+            fn,
+            [torch.randn((4, 2)), torch.randn((4))],
+        )
+
 
 if HAS_CPU:
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 8ca869df03602..8ec3f494887b3 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -8,6 +8,7 @@
 from collections import OrderedDict
 from enum import Enum
 from functools import partial
+from inspect import signature
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Set, Tuple, Union
 from unittest.mock import patch
 
@@ -2236,7 +2237,8 @@ def copy_input(x):
 
     @classmethod
     def process_kernel(cls, kernel, *args, **kwargs):
-        args_flat, args_spec = pytree.tree_flatten(args)
+        binded_args = signature(kernel).bind(*args, **kwargs).arguments
+        args_flat, args_spec = pytree.tree_flatten(binded_args)
 
         is_arg_tensor = []
         tensor_args = []
@@ -2249,15 +2251,16 @@ def process_kernel(cls, kernel, *args, **kwargs):
                 non_tensor_args.append(arg)
 
         def unflatten_args(new_tensor_args, new_non_tensor_args):
-            new_args = []
+            result = []
             it_tensors = iter(new_tensor_args)
             it_non_tensors = iter(new_non_tensor_args)
             for is_tensor in is_arg_tensor:
                 if is_tensor:
-                    new_args.append(next(it_tensors))
+                    result.append(next(it_tensors))
                 else:
-                    new_args.append(next(it_non_tensors))
-            return pytree.tree_unflatten(new_args, args_spec)
+                    result.append(next(it_non_tensors))
+            result = pytree.tree_unflatten(result, args_spec)
+            return result.get("args", []), result.get("kwargs", {})
 
         tensor_args = [cls.realize_input(x) for x in tensor_args]
 
@@ -2283,9 +2286,8 @@ def unflatten_args(new_tensor_args, new_non_tensor_args):
             ).zero_()
             example_args.append(arg)
 
-        example_output = kernel(
-            *unflatten_args(example_args, non_tensor_args), **kwargs
-        )
+        new_args, new_kwargs = unflatten_args(example_args, non_tensor_args)
+        example_output = kernel(*new_args, **new_kwargs)
 
         return example_output, tensor_args, non_tensor_args, unflatten_args
 
@@ -2878,15 +2880,13 @@ class Shim:
             def __repr__(self):
                 return self.ref
 
-        tensor_args = [Shim(x.codegen_reference()) for x in self.inputs]
-        constant_args = [Shim(repr(x)) for x in self.constant_args]
-
         def gen_kwarg(k, v):
             return f"{k}={repr(v)}"
 
-        kwargs = list(gen_kwarg(k, v) for k, v in self.kwargs.items())
-
-        return list(map(repr, self.unflatten_args(tensor_args, constant_args))) + kwargs
+        tensor_args = [Shim(x.codegen_reference()) for x in self.inputs]
+        constant_args = [Shim(repr(x)) for x in self.constant_args]
+        args, kwargs = self.unflatten_args(tensor_args, constant_args)
+        return list(map(repr, args)) + list(gen_kwarg(k, v) for k, v in kwargs.items())
 
     @classmethod
     def create(cls, kernel, *args, **kwargs):
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index a05e6d527ea9a..a59b2cacf0153 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -164,8 +164,10 @@ def wrapped(*args, **kwargs):
             args = args[0]
         # Only look at args that are Tensors
         indices = [i for i, x in enumerate(args) if isinstance(x, TensorBox)]
-        # kwargs tensors not supported yet
-        assert not any(isinstance(x, TensorBox) for x in kwargs.values())
+        # kwargs tensors not supported yet unless it's a fallback op
+        assert not any(isinstance(x, TensorBox) for x in kwargs.values()) or all(
+            fn in fallbacks for fn in aten_fn
+        )
 
         if (type_promotion_kind or convert_input_to_bool) and indices:
             if convert_input_to_bool:

From 0a475f56998cfd0bf14d108eb522b1b300010ce3 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Wed, 2 Nov 2022 10:25:49 -0700
Subject: [PATCH 0484/1922] Add [[noreturn]] attribute to operator() in
 DispatchKeyExtractor.h (#88333)

Originally D40537408. Submitting this through the diff train workflow to
get it merged faster.

Test Plan:
- Build PyTorch
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88333
Approved by: https://github.com/ezyang
---
 aten/src/ATen/core/dispatch/DispatchKeyExtractor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
index 27c6e26721a2e..7401297c66a69 100644
--- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
+++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
@@ -80,7 +80,7 @@ namespace detail {
         ts = ts | x.key_set();
       }
     }
-    void operator()(at::ArrayRef<c10::optional<at::Tensor>>) {
+    [[noreturn]] void operator()(at::ArrayRef<c10::optional<at::Tensor>>) {
       // Just checking that the handling of Tensor?[] didn't change.
       TORCH_INTERNAL_ASSERT(false);
     }

From 21bd639d0394ac9ecb0c8251c4c011b3b26c44f5 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@fb.com>
Date: Mon, 31 Oct 2022 16:17:19 -0700
Subject: [PATCH 0485/1922] [PyTorch] Make c10::irange(x) generate the same
 assembly as for loop (#86841)

`c10::irange(n)` generated an extra `sar` and `andn` instruction compared to a traditional `for` loop. now it doesn't.

Differential Revision: [D40321009](https://our.internmc.facebook.com/intern/diff/D40321009/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86841
Approved by: https://github.com/r-barnes, https://github.com/malfet
---
 aten/src/ATen/native/TensorShape.cpp |  2 +-
 c10/util/irange.h                    | 30 ++++++++++++++++++----------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 2051cda371b97..1895a227d3389 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -3079,7 +3079,7 @@ Tensor squeeze_qtensor(const Tensor& self, c10::optional<int64_t> dim) {
     const auto* per_channel_quantizer = static_cast<at::PerChannelAffineQuantizer*>(quantizer.get());
     auto axis = per_channel_quantizer->axis();
     int64_t shift = 0;
-    integer_range<int64_t> dims = dim.has_value() ? integer_range<int64_t>{dim.value(), dim.value() + 1} : c10::irange(self.dim());
+    integer_range<int64_t> dims = dim.has_value() ? integer_range<int64_t>{dim.value(), dim.value() + 1} : c10::irange(0, self.dim());
     for (const auto d : dims) {
       if (self.sizes()[d] == 1) {
         TORCH_CHECK(axis != d, "Squeeze is only possible on non-axis dimension for Per-Channel Quantized Tensors.");
diff --git a/c10/util/irange.h b/c10/util/irange.h
index e734688c81d6e..16fa682eb0d47 100644
--- a/c10/util/irange.h
+++ b/c10/util/irange.h
@@ -15,6 +15,7 @@ namespace detail {
 
 template <
     typename I,
+    bool one_sided = false,
     typename std::enable_if<std::is_integral<I>::value, int>::type = 0>
 struct integer_iterator : std::iterator<std::input_iterator_tag, I> {
   explicit integer_iterator(I value) : value(value) {}
@@ -39,11 +40,19 @@ struct integer_iterator : std::iterator<std::input_iterator_tag, I> {
   }
 
   bool operator==(const integer_iterator& other) const {
-    return value == other.value;
+    if /* constexpr -- we don't have C++17 yet, see #85969 */ (one_sided) {
+      // Range-for loops' end test is `begin != end`, not `begin <
+      // end`. To handle `c10::irange(n)` where n < 0 (which should be
+      // empty), we just make `begin != end` fail whenever `end` is
+      // negative.
+      return other.value < 0 || value == other.value;
+    } else {
+      return value == other.value;
+    }
   }
 
   bool operator!=(const integer_iterator& other) const {
-    return value != other.value;
+    return !(*this == other);
   }
 
  protected:
@@ -54,20 +63,22 @@ struct integer_iterator : std::iterator<std::input_iterator_tag, I> {
 
 template <
     typename I,
+    bool one_sided = false,
     typename std::enable_if<std::is_integral<I>::value, bool>::type = true>
 struct integer_range {
  public:
   integer_range(I begin, I end) : begin_(begin), end_(end) {}
-  detail::integer_iterator<I> begin() const {
+  using iterator = detail::integer_iterator<I, one_sided>;
+  iterator begin() const {
     return begin_;
   }
-  detail::integer_iterator<I> end() const {
+  iterator end() const {
     return end_;
   }
 
  private:
-  detail::integer_iterator<I> begin_;
-  detail::integer_iterator<I> end_;
+  iterator begin_;
+  iterator end_;
 };
 
 /// Creates an integer range for the half-open interval [begin, end)
@@ -95,11 +106,8 @@ template <
     typename Integer,
     typename std::enable_if<std::is_integral<Integer>::value, bool>::type =
         true>
-integer_range<Integer> irange(Integer end) {
-  // If end<=begin then the range is empty; we can achieve this effect by
-  // choosing the larger of {0, end} as the loop terminator
-  // Handles the case where end<0. irange only works for ranges >=0
-  return {Integer(), std::max(Integer(), end)};
+integer_range<Integer, true> irange(Integer end) {
+  return {Integer(), end};
 }
 
 } // namespace c10

From be0eebcb3783e6d7c354460aa4a02d45af18b93d Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@fb.com>
Date: Wed, 2 Nov 2022 21:51:40 +0000
Subject: [PATCH 0486/1922] Use tensor cores for NT bmm (#86856)

Copy of internal diff.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86856
Approved by: https://github.com/drisspg
---
 .../ATen/native/nested/NestedTensorMatmul.cpp | 121 +++++
 .../native/nested/cuda/NestedTensorMatmul.cu  | 416 ++++++++++++++++++
 .../cuda/NestedTensorTransformerFunctions.cu  | 276 ------------
 benchmarks/nested/nested_bmm_bench.py         |  36 +-
 test/test_nestedtensor.py                     |  65 ++-
 5 files changed, 613 insertions(+), 301 deletions(-)
 create mode 100644 aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu

diff --git a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp
index 2932fdbaf3b90..c8cfa124330d6 100644
--- a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp
@@ -136,6 +136,113 @@ matmul_nested_helper(
 }
 }
 
+Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) {
+  // Tensor self = self_.contiguous();
+  // Tensor mat2 = mat2_.contiguous();
+  // self [N, n_heads, *, head_dim]
+  // mat2 [N, n_heads, head_dim, *]
+  const auto self_ptr = get_nested_tensor_impl(self);
+  const auto mat2_ptr = get_nested_tensor_impl(mat2);
+  // metadata for self
+  std::vector<IntArrayRef> self_sizes = NestedTensor_get_sizes(self_ptr);
+  std::vector<IntArrayRef> self_strides = NestedTensor_get_strides(self_ptr);
+  std::vector<int64_t> self_offsets = self_ptr->get_storage_offsets();
+  auto opt = self_ptr->get_nested_size_tensor().options();
+
+  // metadata for mat2
+  std::vector<IntArrayRef> mat2_sizes = NestedTensor_get_sizes(mat2_ptr);
+  std::vector<IntArrayRef> mat2_strides = NestedTensor_get_strides(mat2_ptr);
+  std::vector<int64_t> mat2_offsets = mat2_ptr->get_storage_offsets();
+  auto opt2 = mat2_ptr->get_nested_size_tensor().options();
+
+  int64_t N = self_sizes.size();
+  int64_t n_heads = self_sizes[0][0];
+
+  // viewed metadata for self
+  auto self_new_sizes = at::empty({N * n_heads, 2}, opt);
+  int64_t* self_new_sizes_ptr = self_new_sizes.data_ptr<int64_t>();
+
+  auto self_new_strides = at::empty({N * n_heads, 2}, opt);
+  int64_t* self_new_strides_ptr = self_new_strides.data_ptr<int64_t>();
+  std::vector<int64_t> self_new_offsets;
+
+  // viewed metadata for mat2
+  auto mat2_new_sizes = at::empty({N * n_heads, 2}, opt2);
+  int64_t* mat2_new_sizes_ptr = mat2_new_sizes.data_ptr<int64_t>();
+
+  auto mat2_new_strides = at::empty({N * n_heads, 2}, opt2);
+  int64_t* mat2_new_strides_ptr = mat2_new_strides.data_ptr<int64_t>();
+  std::vector<int64_t> mat2_new_offsets;
+
+  for (int64_t i = 0; i < N; i++) {
+    const IntArrayRef& self_size_i = self_sizes[i];
+    const IntArrayRef& self_stride_i = self_strides[i];
+    int64_t self_offset = self_offsets[i];
+
+    const IntArrayRef& mat2_size_i = mat2_sizes[i];
+    const IntArrayRef& mat2_stride_i = mat2_strides[i];
+    int64_t mat2_offset = mat2_offsets[i];
+    for (int64_t j = 0; j < n_heads; j++) {
+      auto idx = (i * n_heads + j) * 2;
+      self_new_sizes_ptr[idx] = self_size_i[1];
+      self_new_sizes_ptr[idx + 1] = self_size_i[2];
+      self_new_strides_ptr[idx] = self_stride_i[1];
+      self_new_strides_ptr[idx + 1] = self_stride_i[2];
+      self_new_offsets.push_back(self_offset);
+      self_offset += self_stride_i[0];
+
+      mat2_new_sizes_ptr[idx] = mat2_size_i[1];
+      mat2_new_sizes_ptr[idx + 1] = mat2_size_i[2];
+      mat2_new_strides_ptr[idx] = mat2_stride_i[1];
+      mat2_new_strides_ptr[idx + 1] = mat2_stride_i[2];
+      mat2_new_offsets.push_back(mat2_offset);
+      mat2_offset += mat2_stride_i[0];
+    }
+  }
+
+
+  // view self as [N * n_heads, *, head_dim] (collapse first 2 dims)
+  auto viewed_self = create_nested_view_tensor(
+      self, self_new_sizes, self_new_strides, std::vector<int64_t>(self_new_offsets));
+
+  // view mat2 as [N * n_heads, head_dim, *] (collapse first 2_dims)
+  auto viewed_mat2 = create_nested_view_tensor(
+      mat2, mat2_new_sizes, mat2_new_strides, std::vector<int64_t>(mat2_new_offsets));
+
+  // output [N * n_heads, *, *]
+  auto bmm_output = at::bmm(viewed_self, viewed_mat2);
+
+  // generate metadata for viewing output as [N, n_heads, *, *]
+  // output of bmm should be contiguous so stride calculations should hold
+  auto out_new_sizes = at::empty({N, 3}, opt);
+  auto out_new_strides = at::empty({N, 3}, opt);
+  std::vector<int64_t> out_new_offsets;
+
+  int64_t* out_new_sizes_ptr = out_new_sizes.data_ptr<int64_t>();
+  int64_t* out_new_strides_ptr = out_new_strides.data_ptr<int64_t>();
+
+  int64_t out_offset = 0;
+  for (int64_t i = 0; i < N; i++) {
+    out_new_offsets.push_back(out_offset);
+    const IntArrayRef& self_size_i = self_sizes[i];
+    const IntArrayRef& mat2_size_i = mat2_sizes[i];
+    auto idx = i * 3;
+    out_new_sizes_ptr[idx] = n_heads;
+    out_new_sizes_ptr[idx + 1] = self_size_i[1];
+    out_new_sizes_ptr[idx + 2] = mat2_size_i[2];
+    out_new_strides_ptr[idx] = self_size_i[1] * mat2_size_i[2];
+    out_new_strides_ptr[idx + 1] = mat2_size_i[2];
+    out_new_strides_ptr[idx + 2] = 1;
+    out_offset += n_heads * (self_size_i[1] * mat2_size_i[2]);
+  }
+
+  auto viewed_out = create_nested_view_tensor(
+      bmm_output, out_new_sizes, out_new_strides, std::vector<int64_t>(out_new_offsets));
+
+  return viewed_out;
+
+}
+
 // Note [nested tensor matmul]
 // This is really a generalized batched matmul dedicated to nested tensors,
 // where `self` and `mat2` have same number (>= 3) of dimensions.
@@ -193,6 +300,20 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) {
     self_dim_size,
     "second last dimension of mat2 has sizes",
     mat2_dim_size);
+
+  // use bmm inference-only fast path for [N, n_heads, *, head_dim] [N, n_heads, head_dim, *]
+  if (self.is_cuda() &&
+      self_dim == 4 && self.is_contiguous() &&
+      mat2_dim == 4 && mat2.is_contiguous() &&
+      !(GradMode::is_enabled() && (self.requires_grad() || mat2.requires_grad()))) {
+    auto n_heads = self_sizes.select(0, 1).select(0, 0).item<int64_t>();
+    auto self_first_dim_n_heads = at::all(self_sizes.select(1, 0) == n_heads).item<bool>();
+    auto mat2_first_dim_n_heads = at::all(mat2_sizes.select(1, 0) == n_heads).item<bool>();
+    if (self_first_dim_n_heads && mat2_first_dim_n_heads) {
+      return matmul_with_bmm_nested(self, mat2);
+    }
+  }
+
   // Construct output size from input sizes
   Tensor output_sizes = self_sizes.clone();
   // The last entry in every row of output_sizes should be last column of mat2_sizes
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu b/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu
new file mode 100644
index 0000000000000..22cf38f850208
--- /dev/null
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu
@@ -0,0 +1,416 @@
+#include <type_traits>
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/KernelUtils.h>
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+#include <ATen/native/cuda/PersistentSoftmax.cuh>
+#include <ATen/native/cuda/block_reduce.cuh>
+
+#include <c10/cuda/CUDAMathCompat.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <ATen/native/nested/NestedTensorTransformerFunctions.h>
+#include <ATen/native/nested/NestedTensorUtils.h>
+
+#ifndef USE_ROCM
+#ifndef _WIN32
+#include <cutlass/gemm/device/default_gemm_configuration.h>
+#include <cutlass/gemm/device/gemm_grouped.h>
+#include <cutlass/gemm/kernel/default_gemm_grouped.h>
+#endif
+#endif
+
+#include <ATen/NestedTensorImpl.h>
+
+#define BLOCK_DIM 256
+#define GRID_DIM_Y 16
+
+namespace at {
+namespace native {
+
+#ifndef USE_ROCM
+#ifndef _WIN32
+namespace {
+
+template <
+    typename scalar_t,
+    unsigned int kPad,
+    typename LayoutA,
+    typename LayoutB,
+    typename OpClass,
+    typename Arch,
+    typename ThreadBlockShape,
+    typename WarpShape,
+    typename InstructionShape>
+void gemm_grouped_cuda_internal(
+    const std::vector<int64_t>& lda,
+    const std::vector<int64_t>& ldb,
+    const std::vector<int64_t>& ldd,
+    const std::vector<scalar_t*>& aptr,
+    const std::vector<scalar_t*>& bptr,
+    const std::vector<scalar_t*>& dptr,
+    const std::vector<cutlass::gemm::GemmCoord>& gemm_sizes,
+    const int problem_count,
+    at::Device& device) {
+  using Element = scalar_t;
+  using ElementAcc = float;
+
+  using GemmConfiguration =
+      typename cutlass::gemm::device::DefaultGemmConfiguration<
+          OpClass,
+          Arch,
+          Element,
+          Element,
+          Element,
+          ElementAcc>;
+
+  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
+      Element,
+      LayoutA,
+      cutlass::ComplexTransform::kNone,
+      kPad,
+      Element,
+      LayoutB,
+      cutlass::ComplexTransform::kNone,
+      kPad,
+      Element,
+      cutlass::layout::RowMajor,
+      ElementAcc,
+      OpClass,
+      Arch,
+      ThreadBlockShape,
+      WarpShape,
+      InstructionShape,
+      typename GemmConfiguration::EpilogueOutputOp,
+      cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+      GemmConfiguration::kStages>::GemmKernel;
+
+  using GemmGrouped = typename cutlass::gemm::device::GemmGrouped<GemmKernel>;
+  using EpilogueOutputOp = typename GemmGrouped::GemmKernel::Epilogue::OutputOp;
+  typename EpilogueOutputOp::Params epilogue_op(/*alpha*/ 1, /*beta*/ 0);
+
+  const int64_t gemm_coord_size =
+      problem_count * ((int64_t)sizeof(cutlass::gemm::GemmCoord));
+  // Number of gmm args not including *problem_sizes
+  at::Tensor gmm_args = at::empty(
+      {problem_count * 6 + gemm_coord_size},
+      at::TensorOptions().dtype(at::kLong).pinned_memory(true));
+
+  // Obtain pointers for each argument (on host)
+  int64_t* lda_data = gmm_args.data_ptr<int64_t>(); // Base pointer
+  int64_t* ldb_data = lda_data + problem_count;
+  int64_t* ldd_data = lda_data + 2 * problem_count;
+  int64_t* ptr_a_data = lda_data + 3 * problem_count;
+  int64_t* ptr_b_data = lda_data + 4 * problem_count;
+  int64_t* ptr_d_data = lda_data + 5 * problem_count;
+  cutlass::gemm::GemmCoord* problem_sizes_data =
+      reinterpret_cast<cutlass::gemm::GemmCoord*>(lda_data + 6 * problem_count);
+
+  // Set arguments into gmm_args from input args
+  for (int i = 0; i < problem_count; ++i) {
+    problem_sizes_data[i] = gemm_sizes[i];
+    lda_data[i] = lda[i];
+    ldb_data[i] = ldb[i];
+    ldd_data[i] = ldd[i];
+    ptr_a_data[i] = reinterpret_cast<int64_t>(aptr[i]);
+    ptr_b_data[i] = reinterpret_cast<int64_t>(bptr[i]);
+    ptr_d_data[i] = reinterpret_cast<int64_t>(dptr[i]);
+  }
+  const int threadblock_count =
+      GemmGrouped::sufficient(problem_sizes_data, problem_count);
+
+  // Transfer arguments to GPU
+  gmm_args = gmm_args.to(device, true);
+
+  // Obtain pointers for each of arguments (on GPU)
+  lda_data = gmm_args.data_ptr<int64_t>(); // Base pointer
+  ldb_data = lda_data + problem_count;
+  ldd_data = lda_data + 2 * problem_count;
+  ptr_a_data = lda_data + 3 * problem_count;
+  ptr_b_data = lda_data + 4 * problem_count;
+  ptr_d_data = lda_data + 5 * problem_count;
+  problem_sizes_data =
+      reinterpret_cast<cutlass::gemm::GemmCoord*>(lda_data + 6 * problem_count);
+
+  // Create GemmGrouped::Arguments using the arguments prepared above
+  typename GemmGrouped::Arguments args(
+      problem_sizes_data,
+      problem_count,
+      threadblock_count,
+      epilogue_op,
+      reinterpret_cast<Element**>(ptr_a_data),
+      reinterpret_cast<Element**>(ptr_b_data),
+      reinterpret_cast<Element**>(ptr_d_data),
+      reinterpret_cast<Element**>(ptr_d_data),
+      lda_data,
+      ldb_data,
+      ldd_data,
+      ldd_data);
+
+  GemmGrouped gemm;
+  cutlass::Status status =
+      gemm.initialize(args, nullptr, at::cuda::getCurrentCUDAStream());
+  TORCH_CHECK(
+      status != cutlass::Status::kErrorWorkspaceNull,
+      "Failed to initialize CUTLASS Grouped GEMM kernel due to workspace.");
+  TORCH_CHECK(
+      status != cutlass::Status::kErrorInternal,
+      "Failed to initialize CUTLASS Grouped GEMM kernel due to internal error.");
+  TORCH_CHECK(
+      status == cutlass::Status::kSuccess,
+      "Failed to initialize CUTLASS Grouped GEMM kernel.");
+
+  // Run CUTLASS group GEMM
+  status = gemm.run(at::cuda::getCurrentCUDAStream());
+  TORCH_CHECK(
+      status == cutlass::Status::kSuccess,
+      "Failed to run CUTLASS Grouped GEMM kernel.");
+
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <typename scalar_t>
+bool group_gemm_dispatch(
+    at::Device device,
+    const std::vector<scalar_t*>& aptr,
+    const std::vector<scalar_t*>& bptr,
+    const std::vector<scalar_t*>& dptr,
+    const std::vector<int64_t>& lda,
+    const std::vector<int64_t>& ldb,
+    const std::vector<int64_t>& ldd,
+    std::vector<cutlass::gemm::GemmCoord> gemm_sizes,
+    int64_t ntensors) {
+  return false;
+}
+
+template <>
+bool group_gemm_dispatch(
+    at::Device device,
+    const std::vector<float*>& aptr,
+    const std::vector<float*>& bptr,
+    const std::vector<float*>& dptr,
+    const std::vector<int64_t>& lda,
+    const std::vector<int64_t>& ldb,
+    const std::vector<int64_t>& ldd,
+    std::vector<cutlass::gemm::GemmCoord> gemm_sizes,
+    int64_t ntensors) {
+
+  gemm_grouped_cuda_internal<
+      float,
+      1,
+      cutlass::layout::RowMajor,
+      cutlass::layout::RowMajor,
+      cutlass::arch::OpClassSimt,
+      cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 128, 8>,
+      cutlass::gemm::GemmShape<64, 32, 8>,
+      cutlass::gemm::GemmShape<1, 1, 1>>(
+      lda, ldb, ldd, aptr, bptr, dptr, gemm_sizes, ntensors, device);
+  return true;
+}
+
+template <>
+bool group_gemm_dispatch(
+    at::Device device,
+    const std::vector<c10::Half*>& aptr_,
+    const std::vector<c10::Half*>& bptr_,
+    const std::vector<c10::Half*>& dptr_,
+    const std::vector<int64_t>& lda,
+    const std::vector<int64_t>& ldb,
+    const std::vector<int64_t>& ldd,
+    std::vector<cutlass::gemm::GemmCoord> gemm_sizes,
+    int64_t ntensors) {
+
+  // Check alignment
+  bool all_pad_8 = true;
+  for (int i = 0; i < ntensors; i++) {
+    all_pad_8 = all_pad_8 && (gemm_sizes[i].n() % 8 == 0);
+    all_pad_8 = all_pad_8 && (gemm_sizes[i].k() % 8 == 0);
+
+    // Not sure if this is a requirement, on the safe side
+    all_pad_8 = all_pad_8 && (lda[i] % 8 == 0);
+    all_pad_8 = all_pad_8 && (ldb[i] % 8 == 0);
+    all_pad_8 = all_pad_8 && (ldd[i] % 8 == 0);
+  }
+
+  std::vector<cutlass::half_t*> aptr;
+  std::vector<cutlass::half_t*> bptr;
+  std::vector<cutlass::half_t*> dptr;
+  for (int64_t i = 0; i < ntensors; i++) {
+    aptr.push_back(reinterpret_cast<cutlass::half_t*>(aptr_[i]));
+    bptr.push_back(reinterpret_cast<cutlass::half_t*>(bptr_[i]));
+    dptr.push_back(reinterpret_cast<cutlass::half_t*>(dptr_[i]));
+  }
+  if (all_pad_8) {
+    gemm_grouped_cuda_internal<
+        cutlass::half_t,
+        8,
+        cutlass::layout::RowMajor,
+        cutlass::layout::RowMajor,
+        cutlass::arch::OpClassTensorOp,
+        cutlass::arch::Sm80,
+        cutlass::gemm::GemmShape<128, 128, 32>,
+        cutlass::gemm::GemmShape<64, 64, 32>,
+        cutlass::gemm::GemmShape<16, 8, 16>>(
+        lda, ldb, ldd, aptr, bptr, dptr, gemm_sizes, ntensors, device);
+    return true;
+  } else {
+    gemm_grouped_cuda_internal<
+        cutlass::half_t,
+        1,
+        cutlass::layout::RowMajor,
+        cutlass::layout::RowMajor,
+        cutlass::arch::OpClassSimt,
+        cutlass::arch::Sm80,
+        cutlass::gemm::GemmShape<128, 128, 8>,
+        cutlass::gemm::GemmShape<64, 32, 8>,
+        cutlass::gemm::GemmShape<1, 1, 1>>(
+        lda, ldb, ldd, aptr, bptr, dptr, gemm_sizes, ntensors, device);
+    return true;
+  }
+  // Did not perform GEMM
+  return false;
+}
+
+} // namespace
+
+#endif
+#endif
+
+Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) {
+  if (self.is_nested() && !mat2.is_nested()) {
+    AT_ERROR(
+        "Expected both to be nested, but got a nested self and non-nested other");
+  } else if (!self.is_nested() && mat2.is_nested()) {
+    AT_ERROR(
+        "Expected both to be nested, but got a non-nested self and nested other");
+  }
+  // dispatcher should have guaranteed that at least one is nested
+  auto self_ptr = get_nested_tensor_impl(self);
+  auto mat2_ptr = get_nested_tensor_impl(mat2);
+  TORCH_CHECK(self_ptr->dim() == 3, "batch1 must be a 3D tensor");
+  TORCH_CHECK(mat2_ptr->dim() == 3, "batch2 must be a 3D tensor");
+  int64_t ntensors = self_ptr->size(0), ntensors2 = mat2_ptr->size(0);
+  TORCH_CHECK(
+      ntensors == ntensors2,
+      "Expected size for the 1st dimension of batch2 tensor to be: ",
+      ntensors,
+      " but got: ",
+      ntensors2,
+      ".");
+
+  // create a contiguous output
+  const Tensor& self_sizemat = self_ptr->get_nested_size_tensor();
+  Tensor out_sizemat = self_sizemat.new_empty(self_sizemat.sizes());
+  int64_t* out_sizemat_ptr = out_sizemat.data_ptr<int64_t>();
+
+  std::vector<IntArrayRef> self_sizes = NestedTensor_get_sizes(self_ptr);
+  std::vector<IntArrayRef> mat2_sizes = NestedTensor_get_sizes(mat2_ptr);
+
+  int64_t out_numel = 0;
+  for (int64_t i = 0; i < ntensors; i++) {
+    const IntArrayRef &self_shape = self_sizes[i], &mat2_shape = mat2_sizes[i];
+    const int64_t &self_size0 = self_shape[0], &self_size1 = self_shape[1],
+                  &mat2_size0 = mat2_shape[0], &mat2_size1 = mat2_shape[1];
+    TORCH_CHECK(
+        self_size1 == mat2_size0,
+        i,
+        "-th nested matrices in batch cannot be multiplied (",
+        self_size0,
+        "x",
+        self_size1,
+        " and ",
+        mat2_size0,
+        "x",
+        mat2_size1,
+        ")");
+    out_sizemat_ptr[0] = self_size0;
+    out_sizemat_ptr[1] = mat2_size1;
+    out_sizemat_ptr += 2;
+    out_numel += self_size0 * mat2_size1;
+  }
+  const Tensor &self_buffer = self_ptr->get_unsafe_storage_as_tensor();
+  const Tensor &mat2_buffer = mat2_ptr->get_unsafe_storage_as_tensor();
+  Tensor out_buffer = self_buffer.new_empty(out_numel);
+  Tensor output = wrap_buffer(out_buffer, out_sizemat);
+  auto out_ptr = get_nested_tensor_impl(output);
+
+  std::vector<IntArrayRef> self_strides = NestedTensor_get_strides(self_ptr);
+  std::vector<IntArrayRef> mat2_strides = NestedTensor_get_strides(mat2_ptr);
+  const std::vector<int64_t>& self_offsets = self_ptr->get_storage_offsets();
+  const std::vector<int64_t>& mat2_offsets = mat2_ptr->get_storage_offsets();
+  const std::vector<int64_t>& out_offsets = out_ptr->get_storage_offsets();
+
+#ifndef USE_ROCM
+#ifndef _WIN32
+  bool success = false;
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      self.scalar_type(), "group_gemm_dispatch", [&] {
+        std::vector<scalar_t*> aptr(ntensors);
+        std::vector<scalar_t*> bptr(ntensors);
+        std::vector<scalar_t*> dptr(ntensors);
+        std::vector<int64_t> lda(ntensors);
+        std::vector<int64_t> ldb(ntensors);
+        std::vector<int64_t> ldd(ntensors);
+        std::vector<cutlass::gemm::GemmCoord> gemm_sizes;
+        bool all_row_major = true;
+        for (int64_t i = 0; i < ntensors; i++) {
+          const IntArrayRef& self_shape = self_sizes[i];
+          const IntArrayRef& mat2_shape = mat2_sizes[i];
+          const int64_t &self_size0 = self_shape[0];
+          const int64_t &self_size1 = self_shape[1];
+          const int64_t &mat2_size0 = mat2_shape[0];
+          const int64_t &mat2_size1 = mat2_shape[1];
+          gemm_sizes.push_back(
+              cutlass::gemm::GemmCoord(self_size0, mat2_size1, self_size1));
+          aptr[i] = self_buffer.data_ptr<scalar_t>() + self_offsets[i];
+          bptr[i] = mat2_buffer.data_ptr<scalar_t>() + mat2_offsets[i];
+          dptr[i] = out_buffer.data_ptr<scalar_t>() + out_offsets[i];
+          all_row_major = all_row_major && (self_strides[i][1] == 1);
+          all_row_major = all_row_major && (mat2_strides[i][1] == 1);
+          lda[i] = self_strides[i][0];
+          ldb[i] = mat2_strides[i][0];
+          ldd[i] = mat2_size1;
+        }
+        auto dprops = at::cuda::getCurrentDeviceProperties();
+        bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
+        if (all_row_major &&
+            self.is_contiguous() &&
+            mat2.is_contiguous() &&
+            is_sm8x) {
+          success = group_gemm_dispatch<scalar_t>(
+              output.device(),
+              aptr,
+              bptr,
+              dptr,
+              lda,
+              ldb,
+              ldd,
+              gemm_sizes,
+              ntensors);
+        }
+      });
+  if (success) {
+    return output;
+  }
+#endif
+#endif
+
+  std::vector<Tensor> output_unbind = output.unbind();
+  for (int64_t i = 0; i < ntensors; i++) {
+    at::mm_out(
+        output_unbind[i],
+        self_buffer.as_strided(self_sizes[i], self_strides[i], self_offsets[i]),
+        mat2_buffer.as_strided(
+            mat2_sizes[i], mat2_strides[i], mat2_offsets[i]));
+  }
+  return output;
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu
index 3d738825fede6..56cac2a898034 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cu
@@ -462,281 +462,5 @@ template void add_padding_kernelLauncher<c10::Half>(
     const int batch_size,
     const int output_batch_size);
 
-namespace {
-
-#ifndef USE_ROCM
-#ifndef _WIN32
-template <typename scalar_t>
-void gemm_grouped_cuda_internal(
-    const std::vector<int64_t>& lda,
-    const std::vector<int64_t>& ldb,
-    const std::vector<int64_t>& ldd,
-    const std::vector<scalar_t*>& aptr,
-    const std::vector<scalar_t*>& bptr,
-    const std::vector<scalar_t*>& dptr,
-    const std::vector<cutlass::gemm::GemmCoord>& gemm_sizes,
-    const int problem_count,
-    at::Device& device) {
-  using Element = scalar_t;
-  using ElementAcc = float;
-  using OpClass = cutlass::arch::OpClassSimt;
-
-  using GemmConfiguration =
-      typename cutlass::gemm::device::DefaultGemmConfiguration<
-          OpClass,
-          cutlass::arch::Sm80,
-          Element,
-          Element,
-          Element,
-          ElementAcc>;
-
-  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
-      Element,
-      cutlass::layout::RowMajor,
-      cutlass::ComplexTransform::kNone,
-      GemmConfiguration::kAlignmentA,
-      Element,
-      cutlass::layout::RowMajor,
-      cutlass::ComplexTransform::kNone,
-      GemmConfiguration::kAlignmentB,
-      Element,
-      cutlass::layout::RowMajor,
-      ElementAcc,
-      OpClass,
-      cutlass::arch::Sm80,
-      typename GemmConfiguration::ThreadblockShape,
-      typename GemmConfiguration::WarpShape,
-      typename GemmConfiguration::InstructionShape,
-      typename GemmConfiguration::EpilogueOutputOp,
-      cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
-      GemmConfiguration::kStages>::GemmKernel;
-
-  using GemmGrouped = typename cutlass::gemm::device::GemmGrouped<GemmKernel>;
-  using EpilogueOutputOp = typename GemmGrouped::GemmKernel::Epilogue::OutputOp;
-  typename EpilogueOutputOp::Params epilogue_op(/*alpha*/ 1, /*beta*/ 0);
-
-  const int64_t gemm_coord_size =
-      problem_count * ((int64_t)sizeof(cutlass::gemm::GemmCoord));
-  // Number of gmm args not including *problem_sizes
-  at::Tensor gmm_args = at::empty(
-      {problem_count * 6 + gemm_coord_size},
-      at::TensorOptions().dtype(at::kLong).pinned_memory(true));
-
-  // Obtain pointers for each argument (on host)
-  int64_t* lda_data = gmm_args.data_ptr<int64_t>(); // Base pointer
-  int64_t* ldb_data = lda_data + problem_count;
-  int64_t* ldd_data = lda_data + 2 * problem_count;
-  int64_t* ptr_a_data = lda_data + 3 * problem_count;
-  int64_t* ptr_b_data = lda_data + 4 * problem_count;
-  int64_t* ptr_d_data = lda_data + 5 * problem_count;
-  cutlass::gemm::GemmCoord* problem_sizes_data =
-      reinterpret_cast<cutlass::gemm::GemmCoord*>(lda_data + 6 * problem_count);
-
-  // Set arguments into gmm_args from input args
-  for (int i = 0; i < problem_count; ++i) {
-    problem_sizes_data[i] = gemm_sizes[i];
-    lda_data[i] = lda[i];
-    ldb_data[i] = ldb[i];
-    ldd_data[i] = ldd[i];
-    ptr_a_data[i] = reinterpret_cast<int64_t>(aptr[i]);
-    ptr_b_data[i] = reinterpret_cast<int64_t>(bptr[i]);
-    ptr_d_data[i] = reinterpret_cast<int64_t>(dptr[i]);
-  }
-  const int threadblock_count =
-      GemmGrouped::sufficient(problem_sizes_data, problem_count);
-
-  // Transfer arguments to GPU
-  gmm_args = gmm_args.to(device, true);
-
-  // Obtain pointers for each of arguments (on GPU)
-  lda_data = gmm_args.data_ptr<int64_t>(); // Base pointer
-  ldb_data = lda_data + problem_count;
-  ldd_data = lda_data + 2 * problem_count;
-  ptr_a_data = lda_data + 3 * problem_count;
-  ptr_b_data = lda_data + 4 * problem_count;
-  ptr_d_data = lda_data + 5 * problem_count;
-  problem_sizes_data =
-      reinterpret_cast<cutlass::gemm::GemmCoord*>(lda_data + 6 * problem_count);
-
-  // Create GemmGrouped::Arguments using the arguments prepared above
-  typename GemmGrouped::Arguments args(
-      problem_sizes_data,
-      problem_count,
-      threadblock_count,
-      epilogue_op,
-      reinterpret_cast<Element**>(ptr_a_data),
-      reinterpret_cast<Element**>(ptr_b_data),
-      reinterpret_cast<Element**>(ptr_d_data),
-      reinterpret_cast<Element**>(ptr_d_data),
-      lda_data,
-      ldb_data,
-      ldd_data,
-      ldd_data);
-
-  GemmGrouped gemm;
-  cutlass::Status status =
-      gemm.initialize(args, nullptr, at::cuda::getCurrentCUDAStream());
-  TORCH_CHECK(
-      status != cutlass::Status::kErrorWorkspaceNull,
-      "Failed to initialize CUTLASS Grouped GEMM kernel due to workspace.");
-  TORCH_CHECK(
-      status != cutlass::Status::kErrorInternal,
-      "Failed to initialize CUTLASS Grouped GEMM kernel due to internal error.");
-  TORCH_CHECK(
-      status == cutlass::Status::kSuccess,
-      "Failed to initialize CUTLASS Grouped GEMM kernel.");
-
-  // Run CUTLASS group GEMM
-  status = gemm.run(at::cuda::getCurrentCUDAStream());
-  TORCH_CHECK(
-      status == cutlass::Status::kSuccess,
-      "Failed to run CUTLASS Grouped GEMM kernel.");
-
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-}
-#endif
-#endif
-
-} // namespace
-
-Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) {
-  if (self.is_nested() && !mat2.is_nested()) {
-    AT_ERROR(
-        "Expected both to be nested, but got a nested self and non-nested other");
-  } else if (!self.is_nested() && mat2.is_nested()) {
-    AT_ERROR(
-        "Expected both to be nested, but got a non-nested self and nested other");
-  }
-  // TODO currently we only support contiguous NestedTensors
-  auto self_contiguous = self.contiguous();
-  auto mat2_contiguous = mat2.contiguous();
-
-  // dispatcher should have guaranteed that at least one is nested
-  auto self_ptr = get_nested_tensor_impl(self_contiguous);
-  auto mat2_ptr = get_nested_tensor_impl(mat2_contiguous);
-  TORCH_CHECK(self_ptr->dim() == 3, "batch1 must be a 3D tensor");
-  TORCH_CHECK(mat2_ptr->dim() == 3, "batch2 must be a 3D tensor");
-  int64_t ntensors = self_ptr->size(0), ntensors2 = mat2_ptr->size(0);
-  TORCH_CHECK(
-      ntensors == ntensors2,
-      "Expected size for the 1st dimension of batch2 tensor to be: ",
-      ntensors,
-      " but got: ",
-      ntensors2,
-      ".");
-  const Tensor &self_buffer = self_ptr->get_buffer(),
-               &mat2_buffer = mat2_ptr->get_buffer();
-  std::vector<IntArrayRef> self_sizes = NestedTensor_get_sizes(self_ptr),
-                           mat2_sizes = NestedTensor_get_sizes(mat2_ptr),
-                           self_strides = NestedTensor_get_strides(self_ptr),
-                           mat2_strides = NestedTensor_get_strides(mat2_ptr);
-  const std::vector<int64_t>& self_offsets = self_ptr->get_storage_offsets();
-  const std::vector<int64_t>& mat2_offsets = mat2_ptr->get_storage_offsets();
-
-  // create a contiguous output
-  int64_t out_numel = 0;
-  int64_t a_numel = 0;
-  int64_t b_numel = 0;
-  const Tensor& self_sizemat = self_ptr->get_nested_size_tensor();
-  Tensor out_sizemat = self_sizemat.new_empty(self_sizemat.sizes());
-  int64_t* out_sizemat_ptr = out_sizemat.data_ptr<int64_t>();
-  std::vector<int64_t> output_offsets;
-  std::vector<int64_t> a_offsets;
-  std::vector<int64_t> b_offsets;
-  std::vector<int64_t> lda;
-  std::vector<int64_t> ldb;
-  std::vector<int64_t> ldd;
-#ifndef USE_ROCM
-#ifndef _WIN32
-  std::vector<cutlass::gemm::GemmCoord> gemm_sizes;
-#endif
-#endif
-  bool all_row_major = true;
-  for (int64_t i = 0; i < ntensors; i++) {
-    const IntArrayRef &self_shape = self_sizes[i], &mat2_shape = mat2_sizes[i];
-    const int64_t &self_size0 = self_shape[0], &self_size1 = self_shape[1],
-                  &mat2_size0 = mat2_shape[0], &mat2_size1 = mat2_shape[1];
-    TORCH_CHECK(
-        self_size1 == mat2_size0,
-        i,
-        "-th nested matrices in batch cannot be multiplied (",
-        self_size0,
-        "x",
-        self_size1,
-        " and ",
-        mat2_size0,
-        "x",
-        mat2_size1,
-        ")");
-    out_sizemat_ptr[0] = self_size0;
-    out_sizemat_ptr[1] = mat2_size1;
-    out_sizemat_ptr += 2;
-    output_offsets.push_back(out_numel);
-    out_numel += self_size0 * mat2_size1;
-#ifndef USE_ROCM
-#ifndef _WIN32
-    gemm_sizes.push_back(
-        cutlass::gemm::GemmCoord(self_size0, mat2_size1, self_size1));
-#endif
-#endif
-    lda.push_back(self_strides[i][0]);
-    ldb.push_back(mat2_strides[i][0]);
-    ldd.push_back(mat2_size1);
-    a_offsets.push_back(a_numel);
-    b_offsets.push_back(b_numel);
-    a_numel += self_size0 * self_strides[i][0];
-    b_numel += mat2_size0 * mat2_strides[i][0];
-    all_row_major = all_row_major && (self_strides[i][1] == 1);
-    all_row_major = all_row_major && (mat2_strides[i][1] == 1);
-  }
-  Tensor out_buffer = self_buffer.new_empty(out_numel);
-  Tensor output = wrap_buffer(out_buffer, out_sizemat);
-  at::Device device = output.device();
-
-#ifndef USE_ROCM
-#ifndef _WIN32
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
-  if (is_sm8x && all_row_major) {
-    if (self.dtype() == at::kFloat) {
-      std::vector<float*> aptr;
-      std::vector<float*> bptr;
-      std::vector<float*> dptr;
-      for (int64_t i = 0; i < ntensors; i++) {
-        aptr.push_back(self_buffer.data_ptr<float>() + a_offsets[i]);
-        bptr.push_back(mat2_buffer.data_ptr<float>() + b_offsets[i]);
-        dptr.push_back(out_buffer.data_ptr<float>() + output_offsets[i]);
-      }
-      gemm_grouped_cuda_internal<float>(
-          lda, ldb, ldd, aptr, bptr, dptr, gemm_sizes, ntensors, device);
-      return output;
-    }
-    if (self.dtype() == at::kHalf) {
-      std::vector<c10::Half*> aptr;
-      std::vector<c10::Half*> bptr;
-      std::vector<c10::Half*> dptr;
-      for (int64_t i = 0; i < ntensors; i++) {
-        aptr.push_back(self_buffer.data_ptr<c10::Half>() + a_offsets[i]);
-        bptr.push_back(mat2_buffer.data_ptr<c10::Half>() + b_offsets[i]);
-        dptr.push_back(out_buffer.data_ptr<c10::Half>() + output_offsets[i]);
-      }
-      gemm_grouped_cuda_internal<c10::Half>(
-          lda, ldb, ldd, aptr, bptr, dptr, gemm_sizes, ntensors, device);
-      return output;
-    }
-  }
-#endif
-#endif
-  std::vector<Tensor> output_unbind = output.unbind();
-  for (int64_t i = 0; i < ntensors; i++) {
-    at::mm_out(
-        output_unbind[i],
-        self_buffer.as_strided(self_sizes[i], self_strides[i], self_offsets[i]),
-        mat2_buffer.as_strided(
-            mat2_sizes[i], mat2_strides[i], mat2_offsets[i]));
-  }
-  return output;
-}
-
 } // namespace native
 } // namespace at
diff --git a/benchmarks/nested/nested_bmm_bench.py b/benchmarks/nested/nested_bmm_bench.py
index 311b23395efdb..56e283effddf9 100644
--- a/benchmarks/nested/nested_bmm_bench.py
+++ b/benchmarks/nested/nested_bmm_bench.py
@@ -1,4 +1,5 @@
 import argparse
+import random
 
 import torch
 
@@ -15,31 +16,38 @@ def bench(nt_a, nt_b, niter):
         nt_c = nt_a.bmm(nt_b)
     end_event.record()
     torch.cuda.synchronize()
-    runtime = (start_event.elapsed_time(end_event) * 1.0e-3) / niter
+    runtime = (start_event.elapsed_time(end_event)) / niter
     return runtime
 
 
-def sweep_n(ntensor, niter, dtype):
-    print("n, dtype, ntensor, gflop, runtime, tflop/s")
-    for n in [16, 32, 64, 128, 256, 512, 1024, 2048, 4096]:
-        nt_a = torch.nested_tensor(
-            [torch.randn(n, n).to(dtype).cuda() for t in range(ntensor)]
+def sweep_n(niter, dtype):
+    for ntensor in [4, 8, 16, 32, 64, 128, 256]:
+        tensors = [torch.randn(256, random.randint(100, 200)) for t in range(ntensor)]
+        nt_a = torch.nested.nested_tensor(
+            tensors,
+            dtype=dtype,
+            device="cuda",
         )
-        nt_b = torch.nested_tensor(
-            [torch.randn(n, n).to(dtype).cuda() for t in range(ntensor)]
+        nt_b = torch.nested.nested_tensor(
+            [t.t() for t in tensors],
+            dtype=dtype,
+            device="cuda",
         )
         runtime = bench(nt_a, nt_b, niter)
-        tflop = n * n * n * ntensor * 2 / 1e12
-        print(n, dtype, ntensor, tflop, runtime, tflop / runtime)
+        nt_a_size = torch.ops.aten._nested_tensor_size(nt_a)
+        lengths = nt_a_size[:, 1]
+        print(",".join(map(str, [ntensor, dtype, lengths.min().item(),
+              lengths.float().mean().item(), lengths.max().item(), runtime])))
+
 
 if __name__ == "__main__":
+    random.seed(123)
     parser = argparse.ArgumentParser(description="Nested Tensor BMM Benchmark")
     parser.add_argument("--niter", default="10", type=int)
-    parser.add_argument("--ntensor", default="20", type=int)
 
     args = parser.parse_args()
     niter = args.niter
-    ntensor = args.ntensor
 
-    sweep_n(ntensor, niter, torch.float32)
-    sweep_n(ntensor, niter, torch.float16)
+    print("ntensor,dtype,min_length,mean_length,max_length,runtime")
+    sweep_n(niter, torch.float32)
+    sweep_n(niter, torch.float16)
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index baf00a4d0f2b4..663acaa40ce67 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -1,9 +1,9 @@
 # Owner(s): ["module: nestedtensor"]
 
-import unittest
-
 import torch
 import torch.nn
+import unittest
+import numpy as np
 from torch.testing._internal.common_device_type import (
     dtypes,
     dtypesIfCUDA,
@@ -1224,6 +1224,16 @@ def _test_bmm(self, device, dtype):
         else:
             self.assertEqual(actual, expect)
 
+        # test tensorcore path
+        nt0 = torch.nested.nested_tensor([torch.randn((2, 8)), torch.randn((3, 16))], device=device, dtype=dtype)
+        nt1 = torch.nested.nested_tensor([torch.randn((8, 8)), torch.randn((16, 8))], device=device, dtype=dtype)
+        actual = torch.nested.to_padded_tensor(nt0.bmm(nt1), 0.0)
+        expect = torch.nested.to_padded_tensor(nt0, 0.0).bmm(torch.nested.to_padded_tensor(nt1, 0.0))
+        if dtype == torch.float16:
+            self.assertEqual(actual, expect, rtol=1e-3, atol=1e-3)
+        else:
+            self.assertEqual(actual, expect)
+
     @onlyCUDA
     @dtypes(torch.float, torch.double, torch.float16)
     def test_bmm_cuda(self, device, dtype):
@@ -1235,15 +1245,48 @@ def test_bmm_cuda(self, device, dtype):
     def test_bmm_cpu(self, device, dtype):
         self._test_bmm(device, dtype)
 
-    # TODO: Re-enable this test once bmm supports non-contiguous inputs.
-    # # cannot test torch.float16 because: RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'
-    # @dtypes(torch.float, torch.double)
-    # def test_bmm_noncontiguous(self, device, dtype):
-    #     nt0_contiguous, nt0_noncontiguous = random_nt_noncontiguous_pair((2, 3), device, dtype)
-    #     nt1_contiguous, nt1_noncontiguous = random_nt_noncontiguous_pair((6, 7), device, dtype)
-    #     self.assertEqual(
-    #         nt0_contiguous.transpose(-1, -2).bmm(nt1_contiguous),
-    #         nt0_noncontiguous.transpose(-1, -2).bmm(nt1_noncontiguous))
+    # cannot test torch.float16 because: RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'
+    @dtypes(torch.float, torch.double)
+    def test_bmm_noncontiguous(self, device, dtype):
+        nt0_contiguous, nt0_noncontiguous = random_nt_noncontiguous_pair((2, 3), device, dtype)
+        nt1_contiguous, nt1_noncontiguous = random_nt_noncontiguous_pair((6, 7), device, dtype)
+        self.assertEqual(
+            nt0_contiguous.transpose(-1, -2).bmm(nt1_contiguous),
+            nt0_noncontiguous.transpose(-1, -2).bmm(nt1_noncontiguous))
+
+    @dtypes(torch.float, torch.double)
+    def test_matmul_with_bmm_path(self, device, dtype):
+        def unbind_rebind_matmul(nt1, nt2):
+            t1s = nt1.unbind()
+            t2s = nt2.unbind()
+            out_ts = [t1.matmul(t2) for t1, t2 in zip(t1s, t2s)]
+            return torch.nested.nested_tensor(out_ts)
+
+        # [N, n_head, *, head_dim], [N, n_head, head_dim, *]
+        N = np.random.randint(2, 5)
+        n_heads = np.random.randint(2, 5)
+        head_dim = 3
+        t1s = []
+        t2s = []
+        for _ in range(N):
+            seq_len1 = np.random.randint(2, 5)
+            seq_len2 = np.random.randint(2, 5)
+            t1s.append(torch.randn(n_heads, seq_len1, head_dim))
+            t2s.append(torch.randn(n_heads, head_dim, seq_len2))
+        nt1 = torch.nested.nested_tensor(t1s, device=device, dtype=dtype)
+        nt2 = torch.nested.nested_tensor(t2s, device=device, dtype=dtype)
+        self.assertEqual(torch.matmul(nt1, nt2), unbind_rebind_matmul(nt1, nt2))
+
+        # test with noncontiguous
+        t3s = []
+        t4s = []
+        for _ in range(N):
+            seq_len = np.random.randint(2, 5)
+            t3s.append(torch.randn(seq_len, n_heads, head_dim))
+            t4s.append(torch.randn(seq_len, n_heads, head_dim))
+        nt3 = torch.nested.nested_tensor(t3s, device=device, dtype=dtype).transpose(1, 2)
+        nt4 = torch.nested.nested_tensor(t4s, device=device, dtype=dtype).transpose(1, 2).transpose(2, 3)
+        self.assertEqual(torch.matmul(nt3, nt4), unbind_rebind_matmul(nt3, nt4))
 
     # cannot test torch.float16 because: RuntimeError: "bmm" not implemented for 'Half'
     @dtypes(torch.float, torch.double)

From 47a3538cdc870b0010bd236c105a5a4861c2c2f4 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 2 Nov 2022 21:59:54 +0000
Subject: [PATCH 0487/1922] Print only the driver version from the first GPU
 (#88364)

For example, distributed test has more than one of them:

```
nvidia-smi --query-gpu=driver_version --format=csv,noheader
515.57
515.57
```

while `--id=0` correctly prints:

```
nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0
515.57
```

This is to avoid re-install the same driver as in https://github.com/pytorch/pytorch/actions/runs/3380662072/jobs/5613981088

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88364
Approved by: https://github.com/seemethere, https://github.com/ZainRizvi
---
 .github/scripts/install_nvidia_utils_linux.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index c5c96e0976aac..7806dced2f17f 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -36,8 +36,9 @@ install_nvidia_driver_amzn2() {
         # Check if NVIDIA driver has already been installed
         if [ -x "$(command -v nvidia-smi)" ]; then
             set +e
-            # The driver exists, check its version next
-            INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader)
+            # The driver exists, check its version next. Also check only the first GPU if there are more than one of them
+            # so that the same driver version is not print over multiple lines
+            INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
             NVIDIA_SMI_STATUS=$?
 
             if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then

From ad3f597fbda00548531d8cbc2bdfafc95b13f253 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 2 Nov 2022 22:35:14 +0000
Subject: [PATCH 0488/1922] Revert "torchdynamo support modules() for nn_module
 (#88023)"

This reverts commit eb91e8a534f94127a6d744543f2080a44bca9e57.

Reverted https://github.com/pytorch/pytorch/pull/88023 on behalf of https://github.com/mehtanirav due to [Internal breakages](https://www.internalfb.com/intern/sandcastle/job/13510799692855066/insights)
---
 test/dynamo/test_repros.py           | 20 --------------------
 torch/_dynamo/variables/nn_module.py |  2 --
 2 files changed, 22 deletions(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index efaf06a73580d..cda52384f1ff2 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1754,26 +1754,6 @@ def forward(self, inp):
         args = (torch.randn(3, 4),)
         self.assertTrue(same(mod(*args), opt_mod(*args)))
 
-    def test_modules(self):
-        class Foo(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.fc = torch.nn.Linear(4, 3)
-
-            def forward(self, inp):
-                res = torch.zeros(3, 3)
-                for mod in self.modules():
-                    res += self.fc(inp)
-                return res
-
-        mod = Foo()
-        args = (torch.ones(3, 4),)
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_mod = torch._dynamo.optimize(cnt, nopython=True)(mod)
-        self.assertTrue(same(mod(*args), opt_mod(*args)))
-        self.assertEqual(cnt.op_count, 5)
-        self.assertEqual(cnt.frame_count, 1)
-
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 1922980fc957f..6f7c2ff287373 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -337,8 +337,6 @@ def named_embed(name, obj):
             ):
                 result.append(named_embed(name, submod))
             return ListIteratorVariable(result, mutable_local=MutableLocal(), **options)
-        elif name == "modules":
-            return wrap_values(module.named_modules())
         elif name == "parameters":
             return wrap_values(module.named_parameters(**get_kwargs("recurse")))
         elif name == "values":

From 394d0357fe8ccb1d2a6387cab0e2be6c446c7f45 Mon Sep 17 00:00:00 2001
From: jjsjann123 <alex.jann2012@gmail.com>
Date: Wed, 2 Nov 2022 22:47:30 +0000
Subject: [PATCH 0489/1922] [nvFuser] patches profiling on scalar arguments for
 std/var (#88165)

Fixes #86531

Added profiling on scalar values for aten::std & aten::var.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88165
Approved by: https://github.com/kevinstephano
---
 test/test_jit_cuda_fuser.py            | 21 +++++++++++++++++++++
 torch/csrc/jit/codegen/cuda/parser.cpp | 24 ++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
index c674ba0d57606..44fa2556243d9 100644
--- a/test/test_jit_cuda_fuser.py
+++ b/test/test_jit_cuda_fuser.py
@@ -383,6 +383,27 @@ def func(x: torch.Tensor):
                         self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4))
                         self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD)
 
+    @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_variance_profiling(self):
+        with nvfuser_singleton_fusion(True):
+            for op in [torch.var, torch.std]:
+                for dtype in [torch.float16, torch.float32, torch.double]:
+                    for axis in [-2, -1, 2, 1]:
+                        for unbiased in [False, True]:
+                            for keepdim in [False, True]:
+                                def t(x: torch.Tensor, dim: List[int], unbiased: bool, keepdim: bool):
+                                    o = torch.mul(x, 2.0)
+                                    o = op(o, dim=dim, unbiased=unbiased, keepdim=keepdim)
+                                    return o
+
+                                x = torch.randn(8, 4, 16, dtype=dtype, device="cuda")
+                                t_jit = torch.jit.script(t)
+                                self._run_helper(t_jit, t, x, [axis], unbiased, keepdim, check_stride=False, check_runs=5)
+
+
     @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
                      "Requires fusion optimization pass to be effective")
diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/torch/csrc/jit/codegen/cuda/parser.cpp
index 251e1e6f11a2d..95d266db58270 100644
--- a/torch/csrc/jit/codegen/cuda/parser.cpp
+++ b/torch/csrc/jit/codegen/cuda/parser.cpp
@@ -4352,6 +4352,30 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) {
     }
   }
 
+  static auto var_dim_schema =
+      getOperatorForLiteral(
+          "aten::var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor")
+          ->schema();
+  static auto std_dim_schema =
+      getOperatorForLiteral(
+          "aten::std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor")
+          ->schema();
+  if (node->matches(var_dim_schema) || node->matches(std_dim_schema)) {
+    switch (offset) {
+      case 1:
+        profileIntList(pr, node, offset);
+        return true;
+      case 2:
+        profileBool(pr, node, offset);
+        return true;
+      case 3:
+        profileBool(pr, node, offset);
+        return true;
+      default:
+        return false;
+    }
+  }
+
   return false;
 }
 

From fa4400975eecbd2deb3745c0769c85436b4321c8 Mon Sep 17 00:00:00 2001
From: Jiong Gong <jiong.gong@intel.com>
Date: Wed, 2 Nov 2022 22:57:07 +0000
Subject: [PATCH 0490/1922] Update Reviewers for CPU-related Modules (#87591)

This PR updates the reviewers responsible for CPU related modules: "IDEEP", "oneDNN graph", "CPU ATen backend", "CPU frontend" and "Autocast". It also adds "NNC" and adds the corresponding reviewers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87591
Approved by: https://github.com/malfet
---
 .github/merge_rules.yaml | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index baee9730d598e..93e938d547edd 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -241,9 +241,12 @@
   - third_party/ideep
   - caffe2/ideep/**
   - caffe2/python/ideep/**
+  - cmake/Modules/FindMKLDNN.cmake
+  - third_party/mkl-dnn.BUILD
   approved_by:
   - XiaobingSuper
-  - yanbing-j
+  - jgong5
+  - mingfeima
   mandatory_checks_name:
   - EasyCLA
   - Lint
@@ -256,6 +259,7 @@
   approved_by:
   - sanchitintel
   - chunyuan-w
+  - jgong5
   mandatory_checks_name:
   - EasyCLA
   - Lint
@@ -268,9 +272,11 @@
   - aten/src/ATen/native/quantized/cpu/**
   - aten/src/ATen/native/Convolution*.cpp
   - aten/src/ATen/native/mkldnn/**
+  - test/test_mkldnn.py
   approved_by:
   - mingfeima
   - XiaobingSuper
+  - jgong5
   mandatory_checks_name:
   - EasyCLA
   - Lint
@@ -283,7 +289,7 @@
   - test/test_mkldnn.py
   approved_by:
   - leslie-fang-intel
-  - CaoE
+  - jgong5
   mandatory_checks_name:
   - EasyCLA
   - Lint
@@ -297,7 +303,18 @@
   - test/test_autocast.py
   approved_by:
   - leslie-fang-intel
-  - CaoE
+  - jgong5
+  mandatory_checks_name:
+  - EasyCLA
+  - Lint
+  - pull
+
+- name: NNC
+  patterns:
+  - torch/csrc/jit/tensorexpr/**
+  approved_by:
+  - EikanWang
+  - jgong5
   mandatory_checks_name:
   - EasyCLA
   - Lint

From 783f3b6937d35e5bc6017d91a8aa1f90e497a879 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@meta.com>
Date: Wed, 2 Nov 2022 23:02:08 +0000
Subject: [PATCH 0491/1922] [1/n] Thread PG: fix pyre error of class
 ProcessGroup (#88281)

Summary: Fix the typing stub of `ProcessGroup` in "torch/distributed/__init__.py", so that it won't confuse pyre, and we can remove a lot of pyre suppression comments.

Test Plan: pyre check

Differential Revision: D40921667

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88281
Approved by: https://github.com/wanchaol
---
 torch/distributed/__init__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py
index e7361c1d5dcc0..fb7edffb96010 100644
--- a/torch/distributed/__init__.py
+++ b/torch/distributed/__init__.py
@@ -82,5 +82,8 @@ def is_available() -> bool:
     #   python test/test_public_bindings.py -k test_correct_module_names
     # working even when USE_DISTRIBUTED=0.  Feel free to add more
     # stubs as necessary.
-    class ProcessGroup:  # type: ignore[no-redef]
+    # We cannot define stubs directly because they confuse pyre
+
+    class _ProcessGroupStub:
         pass
+    sys.modules["torch.distributed"].ProcessGroup = _ProcessGroupStub  # type: ignore[attr-defined]

From 16c1ae97dfa543d6bc5d099b19674e9f26f6debd Mon Sep 17 00:00:00 2001
From: Henry Cheng <39224097+jazzysoggy@users.noreply.github.com>
Date: Wed, 2 Nov 2022 23:07:45 +0000
Subject: [PATCH 0492/1922] [ONNX] Produce comprehensive assertion errors for
 quantized outputs (#87242)

Fixes #83038

Currently _compare_ort_pytorch_outputs does not produce clearer error messages for differences in the zero point or scale of the two outputs. It also does not produce a clear error message for whether both are quantized.

This pull request adds assertions to output whether the scales and zero points have differences, and whether each individual output is quantized.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87242
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 torch/onnx/verification.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/onnx/verification.py b/torch/onnx/verification.py
index 0aa585fe44b14..8c3d63a268bae 100644
--- a/torch/onnx/verification.py
+++ b/torch/onnx/verification.py
@@ -199,6 +199,10 @@ def _compare_ort_pytorch_outputs(
                         f"within acceptable range {acceptable_error_percentage}."
                     )
                     continue
+            if ort_out.dtype == np.uint8 or ort_out.dtype == np.int8:
+                warnings.warn("ONNX output is quantized")
+            if pt_out.dtype == np.uint8 or pt_out.dtype == np.int8:
+                warnings.warn("PyTorch output is quantized")
             raise
 
 
From 115af4577e8a4dddeed5779c8360b5c4559e2ec5 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@fb.com>
Date: Wed, 2 Nov 2022 23:24:33 +0000
Subject: [PATCH 0493/1922] Add _foreach_addc(div/mul)(_).Tensor (#88157)

Support passing value scalars as a flat 1D Tensor.

Currently we can only pass either an individual scalar or a ScalarList.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88157
Approved by: https://github.com/ngimel, https://github.com/albanD
---
 aten/src/ATen/native/ForeachOpsKernels.cpp    | 28 +++++++++-
 aten/src/ATen/native/ForeachUtils.h           | 40 ++++++++++++++
 .../ATen/native/cuda/ForeachPointwiseOp.cu    | 35 ++++++++++++
 aten/src/ATen/native/native_functions.yaml    | 30 +++++++++++
 test/test_foreach.py                          | 53 ++++++++++++++-----
 5 files changed, 173 insertions(+), 13 deletions(-)

diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp
index bbe12b73592b1..4b6ef9196f990 100644
--- a/aten/src/ATen/native/ForeachOpsKernels.cpp
+++ b/aten/src/ATen/native/ForeachOpsKernels.cpp
@@ -196,7 +196,30 @@ void foreach_tensor_##OP##_scalarlist_slow_(TensorList input, TensorList tensors
   for(const auto i : c10::irange(input.size())) {                                                                                                       \
     input[i].OP##_(tensors1[i], tensors2[i], scalars[i]);                                                                                               \
   }                                                                                                                                                     \
-}                                                                                                                                                       \
+}
+
+#define FOREACH_POINTWISE_OP_TENSOR(OP)                                    \
+  std::vector<Tensor> foreach_tensor_##OP##_tensor_slow(                   \
+      TensorList input,                                                    \
+      TensorList tensors1,                                                 \
+      TensorList tensors2,                                                 \
+      const Tensor& scalars_) {                                            \
+    auto scalars = convert_tensor_to_scalar_list(scalars_, input.size());  \
+    check_foreach_api_restrictions(input, tensors1, tensors2, scalars);    \
+    return foreach_tensor_##OP##_scalarlist_slow(                          \
+        input, tensors1, tensors2, scalars);                               \
+  }                                                                        \
+                                                                           \
+  void foreach_tensor_##OP##_tensor_slow_(                                 \
+      TensorList input,                                                    \
+      TensorList tensors1,                                                 \
+      TensorList tensors2,                                                 \
+      const Tensor& scalars_) {                                            \
+    auto scalars = convert_tensor_to_scalar_list(scalars_, input.size());  \
+    check_foreach_api_restrictions(input, tensors1, tensors2, scalars);    \
+    foreach_tensor_##OP##_scalarlist_slow_(                                \
+        input, tensors1, tensors2, scalars);                               \
+  }
 
 FOREACH_BINARY_OP_LIST_ALPHA(add);
 FOREACH_BINARY_OP_LIST_ALPHA(sub);
@@ -249,6 +272,9 @@ FOREACH_POINTWISE_OP_SCALAR(addcmul);
 FOREACH_POINTWISE_OP_SCALARLIST(addcdiv);
 FOREACH_POINTWISE_OP_SCALARLIST(addcmul);
 
+FOREACH_POINTWISE_OP_TENSOR(addcdiv);
+FOREACH_POINTWISE_OP_TENSOR(addcmul);
+
 // NOTE(crcrpar): It didn't seem feasible to use `self[i]` as both the first and the last
 // arguments of `maximum_out` and `minimum_out` so I tentatively embarrassingly get and copy
 // the result to `self[i]`.
diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h
index 033052f401f6b..0166d040863c5 100644
--- a/aten/src/ATen/native/ForeachUtils.h
+++ b/aten/src/ATen/native/ForeachUtils.h
@@ -2,6 +2,7 @@
 
 #include <ATen/core/Tensor.h>
 #include <c10/util/irange.h>
+#include <ATen/Dispatch.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/NativeFunctions.h>
@@ -123,6 +124,45 @@ bool check_fast_path_restrictions(
     return true;
 }
 
+std::vector<c10::Scalar> convert_tensor_to_scalar_list(
+    const Tensor& scalarList_,
+    int64_t expect_length) {
+  std::vector<c10::Scalar> scalarList;
+  TORCH_CHECK(
+      scalarList_.device() == c10::kCPU,
+      "Expected scalars to be on CPU, got ",
+      scalarList_.device(),
+      " instead.");
+  TORCH_CHECK(
+      scalarList_.is_contiguous(), "Expected scalars to be contiguous.");
+  TORCH_CHECK(
+      scalarList_.dim() == 1,
+      "Expected packed scalar Tensor to be of dimension 1. Got ",
+      scalarList_.dim(),
+      " instead.");
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
+      kComplexHalf,
+      kHalf,
+      kBool,
+      kBFloat16,
+      scalarList_.scalar_type(),
+      "convert_tensor_to_scalar_list",
+      [&]() {
+        const scalar_t* scalar_data = scalarList_.data_ptr<scalar_t>();
+        TORCH_CHECK(
+            (expect_length == scalarList_.size(0)),
+            "Expected length of scalars to match input of length ",
+            expect_length,
+            " but got ",
+            scalarList_.size(0),
+            " instead.");
+        for (int64_t i = 0; i < scalarList_.size(0); i++) {
+          scalarList.push_back(c10::Scalar(scalar_data[i]));
+        }
+      });
+  return scalarList;
+}
+
 bool can_use_fast_route(ArrayRef<TensorList> tensorLists,
                         ArrayRef<Scalar> scalarList = {},
                         bool does_op_promote_integer_inputs_to_float = false) {
diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
index 3b04b68b0f391..27b3d77ad4d6c 100644
--- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
@@ -160,10 +160,45 @@ void foreach_tensor_##NAME##_scalarlist_cuda_(TensorList input, TensorList tenso
     foreach_pointwise_op_<OP>(input, tensors1, tensors2, scalars);                                                                                       \
 }
 
+#define FOREACH_POINTWISE_OP_TENSOR(NAME, OP)                             \
+  std::vector<Tensor> foreach_tensor_##NAME##_tensor_cuda(                \
+      TensorList input,                                                   \
+      TensorList tensors1,                                                \
+      TensorList tensors2,                                                \
+      const Tensor& scalars_) {                                           \
+    auto scalars = convert_tensor_to_scalar_list(scalars_, input.size()); \
+    check_foreach_api_restrictions(input, tensors1, tensors2, scalars);   \
+    if (!can_use_fast_route({input, tensors1, tensors2}) ||               \
+        has_integral_tensor(input, /* includeBool */ true)) {             \
+      return at::native::foreach_tensor_##NAME##_scalarlist_slow(         \
+          input, tensors1, tensors2, scalars);                            \
+    }                                                                     \
+                                                                          \
+    return foreach_pointwise_op<OP>(input, tensors1, tensors2, scalars);  \
+  }                                                                       \
+                                                                          \
+  void foreach_tensor_##NAME##_tensor_cuda_(                              \
+      TensorList input,                                                   \
+      TensorList tensors1,                                                \
+      TensorList tensors2,                                                \
+      const Tensor& scalars_) {                                           \
+    auto scalars = convert_tensor_to_scalar_list(scalars_, input.size()); \
+    check_foreach_api_restrictions(input, tensors1, tensors2, scalars);   \
+    if (!can_use_fast_route({input, tensors1, tensors2}, scalars) ||      \
+        has_integral_tensor(input, /* includeBool */ true)) {             \
+      return at::native::foreach_tensor_##NAME##_scalarlist_slow_(        \
+          input, tensors1, tensors2, scalars);                            \
+    }                                                                     \
+                                                                          \
+    foreach_pointwise_op_<OP>(input, tensors1, tensors2, scalars);        \
+  }
+
 FOREACH_POINTWISE_OP_SCALAR(addcmul, std::multiplies);
 FOREACH_POINTWISE_OP_SCALAR(addcdiv, std::divides);
 FOREACH_POINTWISE_OP_SCALARLIST(addcmul, std::multiplies);
 FOREACH_POINTWISE_OP_SCALARLIST(addcdiv, std::divides);
+FOREACH_POINTWISE_OP_TENSOR(addcdiv, std::divides);
+FOREACH_POINTWISE_OP_TENSOR(addcmul, std::multiplies);
 
 
 // Why bool tensors are pushed to slowpath?
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 300a14dd6baf6..f374b498133ee 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -9838,6 +9838,14 @@
     CUDA: foreach_tensor_addcdiv_scalarlist_cuda_
   autogen: _foreach_addcdiv.ScalarList_out
 
+- func: _foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcdiv_tensor_slow_
+    CUDA: foreach_tensor_addcdiv_tensor_cuda_
+  autogen: _foreach_addcdiv.Tensor_out
+
 - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
   device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
   variants: function
@@ -9846,6 +9854,14 @@
     CUDA: foreach_tensor_addcmul_scalarlist_cuda_
   autogen: _foreach_addcmul.ScalarList_out
 
+- func: _foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcmul_tensor_slow_
+    CUDA: foreach_tensor_addcmul_tensor_cuda_
+  autogen: _foreach_addcmul.Tensor_out
+
 - func: _foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
   device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
   variants: function
@@ -9867,6 +9883,13 @@
     CPU: foreach_tensor_addcdiv_scalarlist_slow
     CUDA: foreach_tensor_addcdiv_scalarlist_cuda
 
+- func: _foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcdiv_tensor_slow
+    CUDA: foreach_tensor_addcdiv_tensor_cuda
+
 - func: _foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
   device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
   variants: function
@@ -9874,6 +9897,13 @@
     CPU: foreach_tensor_addcmul_scalarlist_slow
     CUDA: foreach_tensor_addcmul_scalarlist_cuda
 
+- func: _foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
+  device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
+  variants: function
+  dispatch:
+    CPU: foreach_tensor_addcmul_tensor_slow
+    CUDA: foreach_tensor_addcmul_tensor_cuda
+
 - func: _foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
   device_check: NoCheck   # foreach kernels fall back to slow path when tensor are on different devices
   variants: function
diff --git a/test/test_foreach.py b/test/test_foreach.py
index 3e2921ed73da7..13e0e6ebc9cf1 100644
--- a/test/test_foreach.py
+++ b/test/test_foreach.py
@@ -31,6 +31,7 @@
     complex(1.0 - random.random(), 1.0 - random.random()),
 )
 
+
 def getScalarLists(N):
     return (
         ("int", [random.randint(0, 9) + 1 for _ in range(N)]),
@@ -41,8 +42,10 @@ def getScalarLists(N):
         ("mixed", [True, 1, 2.0, 3.0 + 4.5j] + [3.0 for _ in range(N - 4)]),
     )
 
+
 _BOOL_SUB_ERR_MSG = "Subtraction, the `-` operator"
 
+
 class RegularFuncWrapper:
 
     def __init__(self, func):
@@ -88,6 +91,7 @@ def __call__(self, inputs, is_cuda, is_fastpath, **kwargs):
         # note(mkozuki): inplace foreach functions are void functions.
         return inputs[0] if self._is_inplace else actual
 
+
 class TestForeach(TestCase):
 
     @property
@@ -159,7 +163,7 @@ def _test_binary_op_tensorlists(self, device, dtype, opinfo, N, is_fastpath, dis
         inputs = [
             opinfo.sample_inputs(device, dtype, N, noncontiguous=not is_fastpath),
             [
-                make_tensor((N - i , 1), device=device, dtype=dtype, noncontiguous=not is_fastpath) for i in range(N)
+                make_tensor((N - i, 1), device=device, dtype=dtype, noncontiguous=not is_fastpath) for i in range(N)
             ],
         ]
         self._binary_test(dtype, op, ref, inputs, is_fastpath and disable_fastpath, is_inplace=False)
@@ -248,7 +252,7 @@ def test_binary_op_scalarlist_slowpath(self, device, dtype, op):
             for _, scalarlist in getScalarLists(N):
                 self._test_binary_op_scalarlist(device, dtype, op, N, scalarlist, False, False)
 
-    def _pointwise_test(self, dtype, op, ref, inputs, is_fastpath, is_inplace, *, values=None):
+    def _pointwise_test(self, dtype, op, ref, inputs, is_fastpath, is_inplace, *, values=None, custom_values_err=None):
         ref_inputs = [[t.clone().detach() for t in inputs[0]], inputs[1], inputs[2]] if is_inplace else inputs
         try:
             actual = op(inputs, self.is_cuda, is_fastpath)
@@ -262,13 +266,18 @@ def _pointwise_test(self, dtype, op, ref, inputs, is_fastpath, is_inplace, *, va
             try:
                 actual = op(inputs + [values], self.is_cuda, is_fastpath)
             except RuntimeError as e:
-                with self.assertRaisesRegex(type(e), re.escape(str(e))):
-                    ref(ref_inputs, values=values)
+                # Match with error messages from regular non-foreach reference if no
+                # custom error message was provided.
+                if custom_values_err is None:
+                    with self.assertRaisesRegex(type(e), re.escape(str(e))):
+                        ref(ref_inputs, values=values)
+                else:
+                    self.assertEqual(re.escape(str(e)), re.escape(custom_values_err))
             else:
                 expected = ref(ref_inputs, values=values)
                 self.assertEqual(expected, actual)
 
-    def _test_pointwise_op(self, device, dtype, opinfo, N, is_fastpath, disable_fastpath, *, values=None):
+    def _test_pointwise_op(self, device, dtype, opinfo, N, is_fastpath, disable_fastpath, *, values=None, custom_values_err=None):
         n_expected_cudaLaunchKernels = N if disable_fastpath else 1
         op, ref, inplace_op, inplace_ref = self._get_funcs(opinfo, n_expected_cudaLaunchKernels)
         inputs = [
@@ -276,8 +285,10 @@ def _test_pointwise_op(self, device, dtype, opinfo, N, is_fastpath, disable_fast
             opinfo.sample_inputs(device, dtype, N, noncontiguous=not is_fastpath),
             opinfo.sample_inputs(device, dtype, N, noncontiguous=not is_fastpath),
         ]
-        self._pointwise_test(dtype, op, ref, inputs, is_fastpath, is_inplace=False, values=values)
-        self._pointwise_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath, is_inplace=True, values=values)
+        self._pointwise_test(dtype, op, ref, inputs, is_fastpath, is_inplace=False,
+                             values=values, custom_values_err=custom_values_err)
+        self._pointwise_test(dtype, inplace_op, inplace_ref, inputs, is_fastpath,
+                             is_inplace=True, values=values, custom_values_err=custom_values_err)
 
         # Tests of implicit broadcasting
         inputs = [
@@ -289,9 +300,11 @@ def _test_pointwise_op(self, device, dtype, opinfo, N, is_fastpath, disable_fast
                 make_tensor((1, N - i), device=device, dtype=dtype, noncontiguous=not is_fastpath) for i in range(N)
             ],
         ]
-        self._pointwise_test(dtype, op, ref, inputs, is_fastpath and disable_fastpath, is_inplace=False, values=values)
+        self._pointwise_test(dtype, op, ref, inputs, is_fastpath and disable_fastpath,
+                             is_inplace=False, values=values, custom_values_err=custom_values_err)
         self._pointwise_test(
-            dtype, inplace_op, inplace_ref, inputs, is_fastpath and disable_fastpath, is_inplace=True, values=values)
+            dtype, inplace_op, inplace_ref, inputs, is_fastpath and disable_fastpath,
+            is_inplace=True, values=values, custom_values_err=custom_values_err)
 
     @skipMeta
     @ops(foreach_pointwise_op_db)
@@ -302,9 +315,24 @@ def test_pointwise_op_fastpath(self, device, dtype, op):
             self._test_pointwise_op(device, dtype, op, N, True, disable_fastpath)
             for scalar in Scalars:
                 self._test_pointwise_op(device, dtype, op, N, True, disable_fastpath, values=scalar)
-            for _, scalarlist in getScalarLists(N):
+            for case, scalarlist in getScalarLists(N):
                 self._test_pointwise_op(
                     device, dtype, op, N, True, disable_fastpath, values=scalarlist)
+                self._test_pointwise_op(
+                    device, dtype, op, N, True, disable_fastpath, values=torch.tensor(scalarlist))
+                self._test_pointwise_op(
+                    device, dtype, op, N, True, disable_fastpath, values=torch.tensor(scalarlist)[0],
+                    custom_values_err="Expected packed scalar Tensor to be of dimension 1. Got 0 instead.")
+                if device == "cuda":
+                    self._test_pointwise_op(
+                        device, dtype, op, N, True, disable_fastpath, values=torch.tensor(scalarlist, device="cuda"),
+                        custom_values_err="Expected scalars to be on CPU, got cuda:0 instead.")
+                self._test_pointwise_op(
+                    device, dtype, op, N, True, disable_fastpath, values=torch.tensor(scalarlist)[:2],
+                    custom_values_err=f"Expected length of scalars to match input of length {len(scalarlist)} but got 2 instead.")
+                self._test_pointwise_op(
+                    device, dtype, op, N, True, disable_fastpath, values=torch.tensor([[0, 1], [2, 3]])[:, 1],
+                    custom_values_err="Expected scalars to be contiguous.")
 
     @ops(foreach_pointwise_op_db)
     def test_pointwise_op_slowpath(self, device, dtype, op):
@@ -313,9 +341,11 @@ def test_pointwise_op_slowpath(self, device, dtype, op):
             self._test_pointwise_op(device, dtype, op, N, False, False)
             for scalar in Scalars:
                 self._test_pointwise_op(device, dtype, op, N, False, False, values=scalar)
-            for _, scalarlist in getScalarLists(N):
+            for case, scalarlist in getScalarLists(N):
                 self._test_pointwise_op(
                     device, dtype, op, N, False, False, values=scalarlist)
+                self._test_pointwise_op(
+                    device, dtype, op, N, False, False, values=torch.tensor(scalarlist))
 
     # note(mkozuki): fastpath test uses dtypes which fastpath implementation supports.
     # To confirm the dtypes of `OpInfo` cover the dtypes that the function support,
@@ -476,7 +506,6 @@ def test_binary_op_scalar_with_different_tensor_dtypes(self, device, dtype, op):
             runtime_error = e
         self.assertIsNone(runtime_error)
 
-
     @skipIfTorchDynamo("Different error msgs, TODO")
     @ops(foreach_binary_op_db, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))
     def test_binary_op_list_error_cases(self, device, dtype, op):

From b2d9452437304fb88ec8d03d7f815e4a2baa56fb Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 11:38:09 +0000
Subject: [PATCH 0494/1922] [FSDP()][27/N] Add forward hook registration
 (#88040)

This PR adds the forward hook registration to composable FSDP and adds a unit test for the runtime.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88040
Approved by: https://github.com/zhaojuanmao, https://github.com/rohan-varma
---
 test/distributed/fsdp/test_composable_fsdp.py |  57 +++++++++-
 torch/distributed/fsdp/_common_utils.py       |   2 +-
 torch/distributed/fsdp/_exec_order_utils.py   |  20 ++--
 torch/distributed/fsdp/_fsdp.py               |   7 ++
 torch/distributed/fsdp/_init_utils.py         |   8 ++
 torch/distributed/fsdp/_runtime_utils.py      | 103 ++++++++++++++++--
 torch/distributed/fsdp/flat_param.py          |   5 +
 .../fsdp/fully_sharded_data_parallel.py       |   4 +-
 8 files changed, 187 insertions(+), 19 deletions(-)

diff --git a/test/distributed/fsdp/test_composable_fsdp.py b/test/distributed/fsdp/test_composable_fsdp.py
index dd21e8d06ee40..0e28d4f9985cd 100644
--- a/test/distributed/fsdp/test_composable_fsdp.py
+++ b/test/distributed/fsdp/test_composable_fsdp.py
@@ -3,6 +3,7 @@
 import copy
 import functools
 import sys
+from typing import Any, Tuple
 
 import torch
 import torch.distributed as dist
@@ -10,6 +11,7 @@
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp._common_utils import _is_fsdp_flattened
 from torch.distributed.fsdp._fsdp import fully_sharded_data_parallel
+from torch.distributed.fsdp._runtime_utils import _root_pre_forward
 from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
@@ -65,6 +67,9 @@ def auto_wrap_policy():
             transformer_auto_wrap_policy, transformer_layer_cls={SubModel}
         )
 
+    def get_input(self, device=torch.device) -> Tuple[Any, ...]:
+        return (torch.randn((8, 5), device=device),)
+
 
 class TestFSDPInitialization(FSDPTest):
     """Tests composable FSDP initialization."""
@@ -78,7 +83,6 @@ def test_auto_wrap_policy(self):
         """Tests passing an ``auto_wrap_policy``."""
 
         local_model = Model(device=torch.device("cuda"))
-
         fsdp_wrapped_model = FSDP(
             copy.deepcopy(local_model),
             auto_wrap_policy=Model.auto_wrap_policy(),
@@ -209,6 +213,57 @@ def _param_init_fn(module: nn.Module):
             self.assertEqual(composable_param, fsdp_wrapped_param)
 
 
+class TestFSDPRuntime(FSDPTest):
+    """Tests composable FSDP runtime."""
+
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    @skip_if_lt_x_gpu(2)
+    def test_training(self):
+        """Tests training (forward, backward, optimizer)."""
+        device = torch.device("cuda")
+        local_model = Model(device=device)
+        fsdp_wrapped_model = FSDP(
+            copy.deepcopy(local_model),
+            auto_wrap_policy=Model.auto_wrap_policy(),
+            use_orig_params=True,
+        )
+        composable_module = copy.deepcopy(local_model)
+        fsdp_state = fully_sharded_data_parallel(
+            composable_module,
+            auto_wrap_policy=Model.auto_wrap_policy(),
+        )
+        del local_model  # not needed anymore
+        LR = 1e-2
+        fsdp_wrapped_optim = torch.optim.Adam(fsdp_wrapped_model.parameters(), lr=LR)
+        composable_optim = torch.optim.Adam(composable_module.parameters(), lr=LR)
+        for _ in range(5):
+            inp = composable_module.get_input(device)
+            losses = []
+            for model, optim in (
+                (fsdp_wrapped_model, fsdp_wrapped_optim),
+                (composable_module, composable_optim),
+            ):
+                optim.zero_grad(set_to_none=True)
+                # TODO (awgu): Remove this after resolving the root pre-forward
+                # hook registration, currently blocked by kwarg support
+                if model is composable_module:
+                    args, kwargs = _root_pre_forward(
+                        fsdp_state, composable_module, *inp
+                    )
+                else:
+                    args = inp
+                    kwargs = {}
+                out = model(*args, **kwargs)
+                loss = out.sum()
+                losses.append(loss)
+                loss.backward()
+                optim.step()
+            self.assertEqual(losses[0], losses[1])
+
+
 instantiate_parametrized_tests(TestFSDPInitialization)
 
 if __name__ == "__main__":
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index f97e72faa2418..49756280e1a1d 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -57,7 +57,7 @@ def _is_composable(state: _State):
 
 
 @no_type_check
-def _all_handles(state: _State):
+def _all_handles(state: _State) -> List:
     return (
         state._handles
         if _is_composable(state)
diff --git a/torch/distributed/fsdp/_exec_order_utils.py b/torch/distributed/fsdp/_exec_order_utils.py
index 1c3b364a90b18..e95c78cf610e7 100644
--- a/torch/distributed/fsdp/_exec_order_utils.py
+++ b/torch/distributed/fsdp/_exec_order_utils.py
@@ -6,7 +6,11 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed.fsdp._common_utils import _get_param_to_unflat_param_names
+from torch.distributed.fsdp._common_utils import (
+    _all_handles,
+    _get_param_to_unflat_param_names,
+    _State,
+)
 from torch.distributed.fsdp.flat_param import FlatParameter, FlatParamHandle
 
 _HandlesKey = Tuple[FlatParamHandle, ...]
@@ -70,7 +74,8 @@ def __init__(
 
     def init(
         self,
-        fsdp_root: nn.Module,  # `FullyShardedDataParallel`
+        state: _State,
+        root_module: nn.Module,
         process_group: dist.ProcessGroup,
     ) -> None:
         """
@@ -82,14 +87,13 @@ def init(
         self.rank = process_group.rank()
         self.world_size = process_group.size()
         # Fix an order over the handles, which should be the same across ranks
-        for fsdp_module in fsdp_root.fsdp_modules(fsdp_root):  # type: ignore[operator]
-            for handle in fsdp_module._handles:
-                index = len(self.all_handles)
-                self.all_handles.append(handle)
-                self.handle_to_handle_index[handle] = index
+        for handle in _all_handles(state):
+            index = len(self.all_handles)
+            self.all_handles.append(handle)
+            self.handle_to_handle_index[handle] = index
         self.flat_param_to_prefixed_param_names = cast(
             Dict[FlatParameter, List[str]],
-            _get_param_to_unflat_param_names(fsdp_root),
+            _get_param_to_unflat_param_names(root_module),
         )
         # TODO (awgu): We can broadcast the metadata of rank 0's `all_handles`
         # to check that all ranks have the same handles in the same order.
diff --git a/torch/distributed/fsdp/_fsdp.py b/torch/distributed/fsdp/_fsdp.py
index ac8b1cd4d7a08..8c99a05a301a6 100644
--- a/torch/distributed/fsdp/_fsdp.py
+++ b/torch/distributed/fsdp/_fsdp.py
@@ -14,6 +14,10 @@
     _init_runtime_state,
     _init_state_dict_state,
 )
+from torch.distributed.fsdp._runtime_utils import (
+    _register_post_forward_hooks,
+    _register_pre_forward_hooks,
+)
 from torch.distributed.fsdp.api import (
     BackwardPrefetch,
     CPUOffload,
@@ -66,4 +70,7 @@ def fully_sharded_data_parallel(
         sync_module_states,
     )
     state = _init_state_dict_state(state)
+    modules = list(module.modules())
+    _register_pre_forward_hooks(state, modules)
+    _register_post_forward_hooks(state, modules)
     return cast(FSDPState, state)
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 11c1c35e5ce95..e19d5121ae9d8 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -148,6 +148,10 @@ def _init_core_state(
         backward_prefetch_limit,
         forward_prefetch_limit,
     )
+    _module_to_handles: Dict[
+        nn.Module, List[FlatParamHandle]
+    ] = collections.defaultdict(list)
+    state._module_to_handles = _module_to_handles
     # Invariant: `state.params` contains exactly the `FlatParameter`s of the
     # handles in `state._handles`
     _handles: List[FlatParamHandle] = []
@@ -161,6 +165,8 @@ def _init_core_state(
 def _init_runtime_state(
     state: _State,
 ) -> _State:
+    _root_pre_forward_handles: List[RemovableHandle] = []
+    state._root_pre_forward_handles = _root_pre_forward_handles
     _pre_forward_handles: List[RemovableHandle] = []
     state._pre_forward_handles = _pre_forward_handles
     _post_forward_handles: List[RemovableHandle] = []
@@ -332,6 +338,8 @@ def _init_param_handle_from_params(
     assert handle not in state._handles
     state.params.append(handle.flat_param)
     state._handles.append(handle)
+    for module in handle.flat_param._modules:
+        state._module_to_handles[module].append(handle)
     cpu_device = torch.device("cpu")
     if state.cpu_offload.offload_params and handle.flat_param.device != cpu_device:
         handle.flat_param_to(cpu_device)
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index bf8a36b6cca10..4fdc81e3aacc7 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -1,6 +1,6 @@
 import functools
 import warnings
-from typing import Any, Callable, List, no_type_check, Optional, Tuple
+from typing import Any, Callable, Iterable, List, no_type_check, Optional, Tuple
 
 import torch
 import torch.nn as nn
@@ -56,7 +56,7 @@ def _lazy_init(
     _cast_buffers_to_dtype_and_device(buffers, buffer_dtypes, state.compute_device)
     for handle in state._handles:
         handle.init_flat_param_attributes()
-    state._exec_order_data.init(state, state.process_group)
+    state._exec_order_data.init(state, root_module, state.process_group)
     if _is_composable(state):
         # Return early since there is no need to share data structures
         return state
@@ -219,7 +219,8 @@ def _pre_forward(
             the current forward.
         unshard_fn (Optional[Callable]): A callable to unshard any currently
             sharded parameters or ``None`` to not do any unsharding.
-        module (nn.Module): Module whose forward this method runs right before.
+        module (nn.Module): Module whose forward this method runs right before;
+            expected by the hook signature.
         input (Any): Unused; expected by the hook signature.
     """
     state.training_state = TrainingState.FORWARD_BACKWARD
@@ -309,9 +310,9 @@ def _post_forward_reshard(
 
 
 @no_type_check
-def _fsdp_root_pre_forward(
+def _root_pre_forward(
     state: _State,
-    root_module: nn.Module,
+    module: nn.Module,
     *args,
     **kwargs,
 ):
@@ -321,8 +322,12 @@ def _fsdp_root_pre_forward(
     lazy initialization (which only runs non-vacuously once). Otherwise, if
     this is called on a non-root FSDP instance, then the forward inputs are
     returned directly.
+
+    Args:
+        module (nn.Module): Module for which this logic tries to run. It may or
+            may not be the root. If not, then this method does not do anything.
     """
-    _lazy_init(state, root_module)
+    _lazy_init(state, module)
     p_assert(state._is_root is not None, "Expects a root FSDP to have been set")
     if not state._is_root:
         return args, kwargs
@@ -419,7 +424,10 @@ def _pre_backward_hook(
             _register_post_backward_final_callback(state)
             _clear_grads_if_needed(_all_handles(state))
         elif _handles_key:
-            _assert_in_training_states(state, [TrainingState.IDLE])
+            allowed_states = [TrainingState.IDLE]
+            if _is_composable(state):
+                allowed_states.append(TrainingState.FORWARD_BACKWARD)
+            _assert_in_training_states(state, allowed_states)
         state.training_state = TrainingState.FORWARD_BACKWARD
         # Queueing the post-backward callback is the only logic that is not
         # per-handle in the pre-backward hook, so we can return early here if
@@ -849,6 +857,87 @@ def _get_training_state(
     return next(iter(training_states))
 
 
+@no_type_check
+def _register_pre_forward_hooks(
+    state: _State,
+    modules: Iterable[nn.Module],
+) -> None:
+    """
+    Registers pre-forward hooks on all modules in ``modules``. The pre-forward
+    hooks are partially applied based on the current ``FlatParamHandle``
+    construction, meaning that they must be re-registered if the construction
+    changes.
+    """
+    for forward_handle in state._pre_forward_handles:
+        forward_handle.remove()
+    state._pre_forward_handles.clear()
+    for module in modules:
+        module_param_handles = state._module_to_handles[module]
+        if module_param_handles:
+            unshard_fn = functools.partial(
+                _pre_forward_unshard,
+                state,
+                module_param_handles,
+            )
+            hook = functools.partial(
+                _pre_forward, state, module_param_handles, unshard_fn
+            )
+            state._pre_forward_handles.append(module.register_forward_pre_hook(hook))
+
+
+@no_type_check
+def _register_post_forward_hooks(
+    state: _State,
+    modules: Iterable[nn.Module],
+) -> None:
+    """
+    Registers post-forward hooks on all modules in ``modules``. The
+    post-forward hooks are partially applied based on the current
+    ``FlatParamHandle`` construction, meaning that they must be re-registered
+    if the construction changes.
+    """
+    for forward_handle in state._post_forward_handles:
+        forward_handle.remove()
+    state._post_forward_handles.clear()
+    for module in modules:
+        module_param_handles = state._module_to_handles[module]
+        if module_param_handles:
+            reshard_fn = functools.partial(
+                _post_forward_reshard,
+                state,
+                module_param_handles,
+            )
+            hook = functools.partial(
+                _post_forward,
+                state,
+                module_param_handles,
+                reshard_fn,
+            )
+            state._post_forward_handles.append(module.register_forward_hook(hook))
+
+
+@no_type_check
+def _register_root_pre_forward_hooks(
+    state: _State,
+    modules: Iterable[nn.Module],
+):
+    """
+    # TODO (awgu): This requires kwarg support for hooks registered by
+    ``register_forward_pre_hook()``. ``_root_pre_forward()`` does not have the
+    supported hook signature right now.
+    """
+    for forward_handle in state._root_pre_forward_handles:
+        forward_handle.remove()
+    state._root_pre_forward_handles.clear()
+    for module in modules:
+        module_param_handles = state._module_to_handles[module]
+        if module_param_handles:
+            hook = functools.partial(_root_pre_forward, state, module)
+            state._root_pre_forward_handles.append(
+                module.register_forward_pre_hook(hook)
+            )
+
+
 @no_type_check
 def _register_pre_backward_hooks(
     state: _State,
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 91e02cb9312ad..5bbbbc2e9d2cc 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -159,6 +159,8 @@ class FlatParameter(nn.Parameter):
             (i.e. some per-parameter state) used to customize pre-flatten and
             post-unflatten behavior. This is experimental, and users should not
             depend on its existence in the future.
+        _modules (Set[nn.Module]): Modules that contain some original parameter
+            that is flattened into the ``FlatParameter``.
 
         _shard_param_offsets (List[Tuple[int, int])): [start, end] offsets (in
             units of numel) giving this rank's part of each flattened original
@@ -255,6 +257,9 @@ def _init_metadata(
         self._fqns = tuple(prefixed_param_names)
         self._shared_param_infos = tuple(shared_param_infos)
         self._param_extensions = tuple(param_extensions)
+        self._modules = set(pi.module for pi in self._param_infos).union(
+            set(spi.module for spi in self._shared_param_infos)
+        )
         assert (params is None) == (shared_params is None)
         if params is not None:
             assert shared_params is not None and len(shared_params) == len(
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 8d0ed60dbde23..a2057b38f97b9 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -52,7 +52,6 @@
 from torch.distributed.fsdp._runtime_utils import (
     _cast_buffers_to_dtype_and_device,
     _clear_grads_if_needed,
-    _fsdp_root_pre_forward,
     _get_buffers_and_dtypes_for_checkpoint,
     _lazy_init,
     _post_forward,
@@ -61,6 +60,7 @@
     _pre_forward_unshard,
     _reshard,
     _reshard_grads,
+    _root_pre_forward,
     _should_free_in_backward,
     _unshard,
     _unshard_grads,
@@ -935,7 +935,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
         with torch.autograd.profiler.record_function(
             "FullyShardedDataParallel.forward"
         ):
-            args, kwargs = _fsdp_root_pre_forward(self, self, *args, **kwargs)
+            args, kwargs = _root_pre_forward(self, self, *args, **kwargs)
             unused = None
             unshard_fn = functools.partial(_pre_forward_unshard, self, self._handles)
             reshard_fn = functools.partial(_post_forward_reshard, self, self._handles)

From 35c0ba2d108ce8168cf8623180619d8c3d50620b Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 11:38:09 +0000
Subject: [PATCH 0495/1922] [FSDP][Docs] Add note mentioning rate limiter for
 backward prefetch (#88120)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88120
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/api.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/distributed/fsdp/api.py b/torch/distributed/fsdp/api.py
index d6704d639fd59..8e0344318e531 100644
--- a/torch/distributed/fsdp/api.py
+++ b/torch/distributed/fsdp/api.py
@@ -64,6 +64,10 @@ class BackwardPrefetch(Enum):
       reduce-scatter) and computation (next gradient computation).
       Specifically, the next all-gather is reordered to be before the current
       reduce-scatter.
+
+    .. note:: If the increase in peak memory usage from prefetching is an
+        issue, you may consider passing ``limit_all_gathers=True`` to the FSDP
+        constructor, which may help reduce peak memory usage in some cases.
     """
 
     # NOTE: For both modes, the ordering that defines "current" and "next" is

From 2e184dcf600015033c3051b562a445d639fd5be1 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 11:38:10 +0000
Subject: [PATCH 0496/1922] [FSDP] Remove unneeded `torch.no_grad()` context
 when offloading to CPU (#88121)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88121
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_init_utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index e19d5121ae9d8..3a8f4922dec8f 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -575,9 +575,8 @@ def _move_module_to_device(
                     isinstance(submodule, fsdp_file.FullyShardedDataParallel)
                     and submodule.cpu_offload.offload_params
                 ):
-                    with torch.no_grad():
-                        for handle in submodule._handles:
-                            handle.flat_param_to(torch.device("cpu"))
+                    for handle in submodule._handles:
+                        handle.flat_param_to(torch.device("cpu"))
     elif param.device == cpu_device:
         _warn_cpu_init()
 

From 411dbbace91aa3a9730a09b0188c2eede2577990 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 11:38:10 +0000
Subject: [PATCH 0497/1922] [FSDP] Simplify `_get_buffer_names()` (#88122)

This is a follow-up from a previous PR in this stack. The PR simplifies the `_get_buffer_names()` implementation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88122
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_init_utils.py | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 3a8f4922dec8f..eda89dcb1fb58 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -21,7 +21,6 @@
 from torch.distributed.algorithms._comm_hooks import default_hooks
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.distributed.fsdp._common_utils import (
-    _apply_to_modules,
     _get_param_to_unflat_param_names,
     _is_fsdp_flattened,
     _State,
@@ -421,22 +420,8 @@ def _get_buffer_names(root_module: nn.Module) -> Set[str]:
     Returns the fully prefixed names of all buffers in the module hierarchy
     rooted at ``root_module`` as a class:`set`.
     """
-
-    def module_fn(module: nn.Module, prefix: str, buffer_names: Set[str]):
-        for buffer_name, _ in module.named_buffers(recurse=False):
-            # Clean module wrapper prefixes in case of nested wrapping
-            prefixed_buffer_name = clean_tensor_name(prefix + buffer_name)
-            buffer_names.add(prefixed_buffer_name)
-
-    def return_fn(buffer_names: Set[str], *args):
-        return buffer_names
-
-    buffer_names: Set[str] = set()
-    return _apply_to_modules(
-        root_module,
-        module_fn,
-        return_fn,
-        buffer_names,
+    return set(
+        clean_tensor_name(buffer_name) for buffer_name, _ in root_module.named_buffers()
     )
 
 
From 49d329cf7f2b4916cdb7e2dc6ec13059bc337b7c Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 11:38:10 +0000
Subject: [PATCH 0498/1922] [FSDP] Rename `unflat_param_name` -> `fqn` for
 consistency (#88123)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88123
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_common_utils.py       | 46 +++++++++----------
 torch/distributed/fsdp/_exec_order_utils.py   |  4 +-
 torch/distributed/fsdp/_init_utils.py         |  4 +-
 torch/distributed/fsdp/_optim_utils.py        | 24 +++++-----
 .../fsdp/fully_sharded_data_parallel.py       | 20 ++++----
 5 files changed, 47 insertions(+), 51 deletions(-)

diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index 49756280e1a1d..ad58618b2b2d4 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -91,48 +91,44 @@ def _is_fsdp_flattened(tensor: torch.Tensor) -> bool:
     return getattr(tensor, FSDP_FLATTENED, False)
 
 
-def _get_param_to_unflat_param_names(
+def _get_param_to_fqns(
     model: torch.nn.Module,
     dedup_shared_params: bool = True,
-) -> Dict[torch.nn.Parameter, List[str]]:
+) -> Dict[nn.Parameter, List[str]]:
     """
-    Constructs a mapping from flattened parameter (including non-FSDP-module
-    parameters) to its unflattened parameter names. For non-FSDP-module
-    parameters, these mapped-to lists always contain a single element. The
-    unflattened parameter names should match the keys of the model state dict.
-
-    For shared parameters, only the first parameter name is included (following
-    the ``torch.nn.Module.parameters()`` order).
+    Constructs a mapping from parameter to a list of its FQNs. Each normal
+    parameter maps to a singleton list containing its FQN, while each
+    ``FlatParameter`` maps to a list of its original parameter FQNs, which may
+    have length greater than one. All FQNs are prefixed starting from
+    ``model``.
 
     Args:
         model (torch.nn.Module): Root module (which may or may not be a
             :class:`FullyShardedDataParallel` instance).
-        dedup_shared_params (bool): If ``True``, only includes the first
-            list of unflattened parameter names corresponding to a parameter
-            in the module walk order; if ``False``, then includes all of the
-            unflattened parameter names.
+        dedup_shared_params (bool): For shared parameters, if ``True``, only
+            includes the FQNs corresponding to the first encounter of the
+            shared parameter in the module traversal; if ``False``, then
+            includes the FQNs across all encounters. (Default: ``True``)
     """
 
-    def module_fn(module, prefix, param_to_unflat_param_names):
+    def module_fn(module, prefix, param_to_fqns):
         for param_name, param in module.named_parameters(recurse=False):
-            module_prefixed_param_names = (
+            local_fqns = (
                 param._fqns
                 if type(param) is flat_param_file.FlatParameter
                 else [param_name]
             )  # prefixed from `module`
-            fully_prefixed_param_names = [
-                clean_tensor_name(prefix + name) for name in module_prefixed_param_names
-            ]  # fully prefixed from the top level including `prefix`
-            # If this parameter has already been visited, then it is a
-            # shared parameter; then, only take the first parameter name
-            is_shared_param = param in param_to_unflat_param_names
+            global_fqns = [
+                clean_tensor_name(prefix + name) for name in local_fqns
+            ]  # prefixed from the top level `model` (i.e. including `prefix`)
+            is_shared_param = param in param_to_fqns
             if not is_shared_param:
-                param_to_unflat_param_names[param] = fully_prefixed_param_names
+                param_to_fqns[param] = global_fqns
             elif not dedup_shared_params:
-                param_to_unflat_param_names[param].extend(fully_prefixed_param_names)
+                param_to_fqns[param].extend(global_fqns)
 
-    def return_fn(param_to_unflat_param_names):
-        return param_to_unflat_param_names
+    def return_fn(param_to_fqns):
+        return param_to_fqns
 
     param_to_unflat_param_names: Dict[torch.nn.Parameter, List[str]] = {}
     return _apply_to_modules(
diff --git a/torch/distributed/fsdp/_exec_order_utils.py b/torch/distributed/fsdp/_exec_order_utils.py
index e95c78cf610e7..e9c7a9d5f5a84 100644
--- a/torch/distributed/fsdp/_exec_order_utils.py
+++ b/torch/distributed/fsdp/_exec_order_utils.py
@@ -8,7 +8,7 @@
 import torch.nn as nn
 from torch.distributed.fsdp._common_utils import (
     _all_handles,
-    _get_param_to_unflat_param_names,
+    _get_param_to_fqns,
     _State,
 )
 from torch.distributed.fsdp.flat_param import FlatParameter, FlatParamHandle
@@ -93,7 +93,7 @@ def init(
             self.handle_to_handle_index[handle] = index
         self.flat_param_to_prefixed_param_names = cast(
             Dict[FlatParameter, List[str]],
-            _get_param_to_unflat_param_names(root_module),
+            _get_param_to_fqns(root_module),
         )
         # TODO (awgu): We can broadcast the metadata of rank 0's `all_handles`
         # to check that all ranks have the same handles in the same order.
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index eda89dcb1fb58..2eb351fe83875 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -21,7 +21,7 @@
 from torch.distributed.algorithms._comm_hooks import default_hooks
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.distributed.fsdp._common_utils import (
-    _get_param_to_unflat_param_names,
+    _get_param_to_fqns,
     _is_fsdp_flattened,
     _State,
     clean_tensor_name,
@@ -400,7 +400,7 @@ def _get_ignored_params(
         p for m in ignored_modules for p in m.parameters() if not _is_fsdp_flattened(p)
     )
     # Conservatively include all shared parameters' names
-    param_to_unflat_param_names = _get_param_to_unflat_param_names(
+    param_to_unflat_param_names = _get_param_to_fqns(
         root_module,
         dedup_shared_params=False,
     )
diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index e4bbd85f01115..d5f86a1a5e045 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -293,12 +293,12 @@ def _flatten_optim_state_dict(
             '"param_groups" to be a valid optimizer state dict'
         )
     flat_param_to_fsdp_module = _get_flat_param_to_fsdp_module(model)
-    param_to_unflat_param_names = fsdp_file._get_param_to_unflat_param_names(model)
+    param_to_fqns = fsdp_file._get_param_to_fqns(model)
 
     # Construct the "state" part
     flat_osd_state: Dict[_OptimStateKey, Any] = {}
     unflat_osd_state = unflat_osd["state"]
-    for param, unflat_param_names in param_to_unflat_param_names.items():
+    for param, unflat_param_names in param_to_fqns.items():
         if isinstance(param, FlatParameter):  # flatten FSDP parameters' states
             assert (
                 param in flat_param_to_fsdp_module
@@ -894,15 +894,15 @@ def _rekey_sharded_optim_state_dict(
         if using_optim_input
         else _get_param_to_param_id(optim)
     )
-    param_to_unflat_param_names = fsdp_file._get_param_to_unflat_param_names(model)
+    param_to_fqns = fsdp_file._get_param_to_fqns(model)
     # All parameter keys in `param_to_flat_param_id` should be in
-    # `param_to_unflat_param_names` -- strict inequality follows when not all
-    # parameters are passed to the optimizer
-    assert len(param_to_flat_param_id) <= len(param_to_unflat_param_names)
+    # `param_to_fqns` -- strict inequality follows when not all parameters are
+    # passed to the optimizer
+    assert len(param_to_flat_param_id) <= len(param_to_fqns)
 
     unflat_param_names_to_flat_param_id: Dict[Tuple[str, ...], int] = {}  # for "state"
     unflat_param_name_to_flat_param_id: Dict[str, int] = {}  # for "param_groups"
-    for param, unflat_param_names in param_to_unflat_param_names.items():
+    for param, unflat_param_names in param_to_fqns.items():
         if param not in param_to_flat_param_id:
             # This parameter was not passed to the optimizer
             continue
@@ -1162,9 +1162,9 @@ def _optim_state_dict(
 
     # Construct the local mapping between unflattened parameter names
     # (`_OptimStateKey`s) and parameter IDs and broadcast rank 0's mapping
-    param_to_unflat_param_names: Dict[
-        torch.nn.Parameter, List[str]
-    ] = fsdp_file._get_param_to_unflat_param_names(model)
+    param_to_fqns: Dict[torch.nn.Parameter, List[str]] = fsdp_file._get_param_to_fqns(
+        model
+    )
     flat_param_id_to_param: List[torch.nn.Parameter] = (
         _get_param_id_to_param_from_optim_input(model, optim_input)
         if using_optim_input
@@ -1180,7 +1180,7 @@ def _optim_state_dict(
         if flat_param_id not in osd_state:
             continue
         optim_state_key = _OptimStateKey(
-            unflat_param_names=tuple(param_to_unflat_param_names[param]),
+            unflat_param_names=tuple(param_to_fqns[param]),
             is_flat_param=isinstance(param, FlatParameter),
         )
         if rank == 0:
@@ -1268,7 +1268,7 @@ def _optim_state_dict(
             for flat_param_id in flat_param_group["params"]
         ]
         nested_unflat_param_names = [
-            param_to_unflat_param_names[param] for param in param_group_params
+            param_to_fqns[param] for param in param_group_params
         ]
         unflat_param_group["params"] = [
             unflat_param_name
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index a2057b38f97b9..32e048faa1a17 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -31,7 +31,7 @@
 )
 from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS
 from torch.distributed.fsdp._common_utils import (
-    _get_param_to_unflat_param_names,
+    _get_param_to_fqns,
     FSDP_PREFIX,
     FSDP_WRAPPED_MODULE,
     HandleTrainingState,
@@ -2107,7 +2107,7 @@ def rekey_optim_state_dict(
                 if using_optim_input
                 else _get_param_id_to_param(optim)
             )
-            param_to_param_name = _get_param_to_param_name(model)
+            param_to_param_name = _get_param_to_fqn(model)
             param_id_to_param_name: List[str] = [
                 param_to_param_name[param] for param in param_id_to_param
             ]
@@ -2125,7 +2125,7 @@ def rekey_optim_state_dict(
                 )
             return new_osd
         elif optim_state_key_type == OptimStateKeyType.PARAM_ID:  # name -> ID
-            param_name_to_param = _get_param_name_to_param(model)
+            param_name_to_param = _get_fqn_to_param(model)
             param_to_param_id = (
                 _get_param_to_param_id_from_optim_input(model, optim_input)
                 if using_optim_input
@@ -2324,14 +2324,14 @@ def _get_grad_norm(
     return grad_norm
 
 
-def _get_param_to_param_name(
+def _get_param_to_fqn(
     model: torch.nn.Module,
 ) -> Dict[torch.nn.Parameter, str]:
     """
     Constructs a mapping from parameters to their parameter names. ``model``
     should not contain any :class:`FullyShardedDataParallel` instances, which
     means that none of the parameters should be ``FlatParameter`` s. As a
-    result, compared to :meth:`_get_param_to_unflat_param_names`, the mapped
+    result, compared to :meth:`_get_param_to_fqns`, the mapped
     values may be flattened from singleton :class:`list` s to the contained
     names themselves.
 
@@ -2339,10 +2339,10 @@ def _get_param_to_param_name(
         model (torch.nn.Module): Root module, which should not contain any
             :class:`FullyShardedDataParallel` instances.
     """
-    param_to_param_names = _get_param_to_unflat_param_names(model)
+    param_to_param_names = _get_param_to_fqns(model)
     for param_names in param_to_param_names.values():
         assert len(param_names) > 0, (
-            "`_get_param_to_unflat_param_names()` " "should not construct empty lists"
+            "`_get_param_to_fqns()` " "should not construct empty lists"
         )
         if len(param_names) > 1:
             raise RuntimeError(
@@ -2355,9 +2355,9 @@ def _get_param_to_param_name(
     return param_to_param_name
 
 
-def _get_param_name_to_param(
+def _get_fqn_to_param(
     model: torch.nn.Module,
 ) -> Dict[str, torch.nn.Parameter]:
-    """Constructs the inverse mapping of :meth:`_get_param_to_param_name`."""
-    param_to_param_name = _get_param_to_param_name(model)
+    """Constructs the inverse mapping of :meth:`_get_param_to_fqn`."""
+    param_to_param_name = _get_param_to_fqn(model)
     return dict(zip(param_to_param_name.values(), param_to_param_name.keys()))

From a056f299cc80ba9b0589a03e7a3e367401e2a7ec Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 11:38:11 +0000
Subject: [PATCH 0499/1922] [FSDP][Easy] Remove unneeded `TrainingState`
 transition (#88232)

Follow-up from previous PR in the stack
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88232
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_runtime_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 4fdc81e3aacc7..254d41a1d5be1 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -478,7 +478,6 @@ def _post_backward_hook(
         "FullyShardedDataParallel._post_backward_hook"
     ):
         _assert_in_training_states(state, [TrainingState.FORWARD_BACKWARD])
-        state.training_state = TrainingState.FORWARD_BACKWARD
         p_assert(
             handle._training_state == HandleTrainingState.BACKWARD_PRE,
             f"Expects `BACKWARD_PRE` state but got {handle._training_state}",

From 33e5390dcc7379e111bd7fa14347e37242352c3f Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 2 Nov 2022 08:54:54 -0700
Subject: [PATCH 0500/1922] [Pytorch][Vulkan] Update spv generation script to
 embed shader parameters (#88321)

This diffs adds shader parameters such as tile size, weight storage type and
format to the generated spv.cpp file.
This is used in ShaderInfo struct that ops such as convolution will use to
determine, the workgroup size  and how to pack weights.

Differential Revision: [D40280337](https://our.internmc.facebook.com/intern/diff/D40280337/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88321
Approved by: https://github.com/jmdetloff, https://github.com/mcr229
---
 aten/src/ATen/native/vulkan/api/Types.h       | 21 ++++++
 .../ATen/native/vulkan/ops/Convolution.cpp    |  4 +-
 aten/src/ATen/native/vulkan/ops/Tensor.cpp    | 21 +++---
 aten/src/ATen/native/vulkan/ops/Tensor.h      | 17 ++---
 aten/src/ATen/native/vulkan/ops/Utils.cpp     | 28 +++++---
 tools/gen_vulkan_spv.py                       | 69 ++++++++++++++++---
 6 files changed, 122 insertions(+), 38 deletions(-)
 create mode 100644 aten/src/ATen/native/vulkan/api/Types.h

diff --git a/aten/src/ATen/native/vulkan/api/Types.h b/aten/src/ATen/native/vulkan/api/Types.h
new file mode 100644
index 0000000000000..ff4ce3e7044d7
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/api/Types.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+namespace at {
+namespace native {
+namespace vulkan {
+namespace api {
+
+enum class StorageType {
+  BUFFER,
+  TEXTURE_3D,
+  TEXTURE_2D,
+  UNKNOWN,
+};
+
+} // namespace api
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
index d1fca607cc768..8431ccac5ef3e 100644
--- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
@@ -520,7 +520,7 @@ vTensor pack_weights(
   vTensor v_weight{
       api::context(),
       weight_rearranged.sizes(),
-      quantized ? StorageType::TEXTURE_3D : StorageType::TEXTURE_2D,
+      quantized ? api::StorageType::TEXTURE_3D : api::StorageType::TEXTURE_2D,
       weight_arg.options(),
   };
 
@@ -545,7 +545,7 @@ vTensor pack_biases(
   vTensor v_bias{
       api::context(),
       bias_rearranged.sizes(),
-      quantized ? StorageType::TEXTURE_3D : StorageType::TEXTURE_2D,
+      quantized ? api::StorageType::TEXTURE_3D : api::StorageType::TEXTURE_2D,
       weight.options(),
   };
 
diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.cpp b/aten/src/ATen/native/vulkan/ops/Tensor.cpp
index 8a829bda0708f..b9ce7a0caf5fe 100644
--- a/aten/src/ATen/native/vulkan/ops/Tensor.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Tensor.cpp
@@ -63,13 +63,13 @@ vTensor::vTensor(
     : view_(std::make_shared<vTensorStorage>(
           context,
           sizes,
-          StorageType::TEXTURE_3D,
+          api::StorageType::TEXTURE_3D,
           options)) {}
 
 vTensor::vTensor(
     api::Context* const context,
     const IntArrayRef sizes,
-    const StorageType storage_type,
+    const api::StorageType storage_type,
     const TensorOptions& options)
     : view_(std::make_shared<vTensorStorage>(
           context,
@@ -86,7 +86,7 @@ vTensor::vTensor(
     : view_(std::make_shared<vTensorStorage>(
           context,
           sizes,
-          StorageType::TEXTURE_3D,
+          api::StorageType::TEXTURE_3D,
           options,
           q_scale,
           q_zero_point)) {}
@@ -94,7 +94,7 @@ vTensor::vTensor(
 vTensor::vTensor(
     api::Context* const context,
     const IntArrayRef sizes,
-    const StorageType storage_type,
+    const api::StorageType storage_type,
     const TensorOptions& options,
     double q_scale,
     int64_t q_zero_point)
@@ -130,7 +130,7 @@ api::VulkanImage& vTensor::image(
 api::VulkanImage allocate_image(
     api::Context* const context_ptr,
     api::utils::uvec3& extents,
-    StorageType storage_type,
+    api::StorageType storage_type,
     const VkFormat image_format) {
   api::Adapter* adapter_ptr = context_ptr->adapter_ptr();
 
@@ -145,14 +145,17 @@ api::VulkanImage allocate_image(
   VkImageViewType image_view_type = VK_IMAGE_VIEW_TYPE_3D;
 
   switch (storage_type) {
-    case StorageType::TEXTURE_3D:
+    case api::StorageType::TEXTURE_3D:
       image_type = VK_IMAGE_TYPE_3D;
       image_view_type = VK_IMAGE_VIEW_TYPE_3D;
       break;
-    case StorageType::TEXTURE_2D:
+    case api::StorageType::TEXTURE_2D:
       image_type = VK_IMAGE_TYPE_2D;
       image_view_type = VK_IMAGE_VIEW_TYPE_2D;
       break;
+    case api::StorageType::BUFFER:
+    case api::StorageType::UNKNOWN:
+      TORCH_CHECK(false, "Requested storage type must be a texture type.");
   }
 
   VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props);
@@ -170,7 +173,7 @@ api::VulkanImage allocate_image(
 vTensorStorage::vTensorStorage(
     api::Context* const context,
     const IntArrayRef sizes,
-    const StorageType storage_type,
+    const api::StorageType storage_type,
     const TensorOptions& options)
     : context_(context),
       extents_(image_extents(sizes)),
@@ -190,7 +193,7 @@ vTensorStorage::vTensorStorage(
 vTensorStorage::vTensorStorage(
     api::Context* const context,
     const IntArrayRef sizes,
-    const StorageType storage_type,
+    const api::StorageType storage_type,
     const TensorOptions& options,
     double q_scale_in,
     int64_t q_zero_point_in)
diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.h b/aten/src/ATen/native/vulkan/ops/Tensor.h
index 9e5651cb510f3..6a41638057701 100644
--- a/aten/src/ATen/native/vulkan/ops/Tensor.h
+++ b/aten/src/ATen/native/vulkan/ops/Tensor.h
@@ -26,11 +26,6 @@ struct LastAccess {
       : stage{stage_flags}, access{access_flags} {}
 };
 
-enum class StorageType {
-  TEXTURE_3D,
-  TEXTURE_2D,
-};
-
 class vTensorStorage final {
  public:
   // Do not allow empty vTensorStorage construction
@@ -39,12 +34,12 @@ class vTensorStorage final {
   vTensorStorage(
       api::Context* context,
       IntArrayRef sizes,
-      const StorageType storage_type,
+      const api::StorageType storage_type,
       const TensorOptions& options);
   vTensorStorage(
       api::Context* context,
       IntArrayRef sizes,
-      const StorageType storage_type,
+      const api::StorageType storage_type,
       const TensorOptions& options,
       double q_scale,
       int64_t q_zero_point);
@@ -73,7 +68,7 @@ class vTensorStorage final {
   int64_t q_zero_point{0u};
 
   // Image Texture
-  StorageType storage_type_;
+  api::StorageType storage_type_;
   mutable api::VulkanImage image_;
 
   // Last Access - used to insert memory barriers
@@ -108,7 +103,7 @@ class vTensor final {
   vTensor(
       api::Context* context,
       IntArrayRef sizes,
-      const StorageType storage_type,
+      const api::StorageType storage_type,
       const TensorOptions& options);
 
   vTensor(
@@ -121,7 +116,7 @@ class vTensor final {
   vTensor(
       api::Context* const context,
       const IntArrayRef sizes,
-      const StorageType storage_type,
+      const api::StorageType storage_type,
       const TensorOptions& options,
       double q_scale,
       int64_t q_zero_point);
@@ -151,7 +146,7 @@ class vTensor final {
    Texture Access
   */
 
-  inline StorageType storage_type() const {
+  inline api::StorageType storage_type() const {
     return view_->storage_type_;
   }
 
diff --git a/aten/src/ATen/native/vulkan/ops/Utils.cpp b/aten/src/ATen/native/vulkan/ops/Utils.cpp
index 23ae1e9f57b46..30e8d727ff6ad 100644
--- a/aten/src/ATen/native/vulkan/ops/Utils.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Utils.cpp
@@ -19,36 +19,48 @@ namespace packing {
 static api::ShaderSource get_nchw_to_image_shader(const vTensor& v_dst) {
   if (v_dst.is_quantized()) {
     switch (v_dst.storage_type()) {
-      case StorageType::TEXTURE_3D:
+      case api::StorageType::TEXTURE_3D:
         return VK_KERNEL(nchw_to_image_quantized);
-      case StorageType::TEXTURE_2D:
+      case api::StorageType::TEXTURE_2D:
         TORCH_CHECK(false, "No kernel available!");
+      case api::StorageType::BUFFER:
+      case api::StorageType::UNKNOWN:
+        TORCH_CHECK(false, "Requested storage type must be a texture type.");
     }
   }
 
   switch (v_dst.storage_type()) {
-    case StorageType::TEXTURE_3D:
+    case api::StorageType::TEXTURE_3D:
       return VK_KERNEL(nchw_to_image);
-    case StorageType::TEXTURE_2D:
+    case api::StorageType::TEXTURE_2D:
       return VK_KERNEL(nchw_to_image2d);
+    case api::StorageType::BUFFER:
+    case api::StorageType::UNKNOWN:
+      TORCH_CHECK(false, "Requested storage type must be a texture type.");
   }
 }
 
 static api::ShaderSource get_image_to_nchw_shader(const vTensor& v_src) {
   if (v_src.is_quantized()) {
     switch (v_src.storage_type()) {
-      case StorageType::TEXTURE_3D:
+      case api::StorageType::TEXTURE_3D:
         return VK_KERNEL(image_to_nchw_quantized);
-      case StorageType::TEXTURE_2D:
+      case api::StorageType::TEXTURE_2D:
         TORCH_CHECK(false, "No kernel available!");
+      case api::StorageType::BUFFER:
+      case api::StorageType::UNKNOWN:
+        TORCH_CHECK(false, "Requested storage type must be a texture type.");
     }
   }
 
   switch (v_src.storage_type()) {
-    case StorageType::TEXTURE_3D:
+    case api::StorageType::TEXTURE_3D:
       return VK_KERNEL(image_to_nchw);
-    case StorageType::TEXTURE_2D:
+    case api::StorageType::TEXTURE_2D:
       return VK_KERNEL(image2d_to_nchw);
+    case api::StorageType::BUFFER:
+    case api::StorageType::UNKNOWN:
+      TORCH_CHECK(false, "Requested storage type must be a texture type.");
   }
 }
 
diff --git a/tools/gen_vulkan_spv.py b/tools/gen_vulkan_spv.py
index 74b1212bdbe26..1d37a95af57ec 100644
--- a/tools/gen_vulkan_spv.py
+++ b/tools/gen_vulkan_spv.py
@@ -8,11 +8,20 @@
 import sys
 import subprocess
 from torchgen.code_template import CodeTemplate
+from dataclasses import dataclass
+from typing import List
 
 H_NAME = "spv.h"
 CPP_NAME = "spv.cpp"
 DEFAULT_ENV = {"precision": "highp", "format": "rgba32f"}
 
+
+@dataclass
+class ShaderInfo:
+    tile_size: List[int]
+    layouts: List[str]
+    weight_storage_type: str = ""
+
 def getName(filePath):
     return os.path.basename(filePath).replace("/", "_").replace(".", "_")
 
@@ -20,6 +29,24 @@ def isDescriptorLine(lineStr):
     descriptorLineId = r"^layout\(set"
     return re.search(descriptorLineId, lineStr)
 
+def isTileSizeLine(lineStr):
+    tile_size_id = r"^ \* TILE_SIZE = \("
+    return re.search(tile_size_id, lineStr)
+
+def findTileSizes(lineStr):
+    tile_size_id = r"^ \* TILE_SIZE = \(([0-9]+), ([0-9]+), ([0-9]+)\)"
+    matches = re.search(tile_size_id, lineStr)
+    return [int(matches.group(1)), int(matches.group(2)), int(matches.group(3))]
+
+def isWeightStorageTypeLine(lineStr):
+    weight_storage_id = r"^ \* WEIGHT_STORAGE = "
+    return re.search(weight_storage_id, lineStr)
+
+def getWeightStorageType(lineStr):
+    weight_storage_id = r"^ \* WEIGHT_STORAGE = ([a-zA-Z]+_\dD)"
+    matches = re.search(weight_storage_id, lineStr)
+    return matches.group(1)
+
 typeIdMapping = {
     r"image[123]D\b": "VK_DESCRIPTOR_TYPE_STORAGE_IMAGE",
     r"sampler[123]D\b": "VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER",
@@ -27,21 +54,29 @@ def isDescriptorLine(lineStr):
     r"\buniform\b.*\bBlock\b": "VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER",
 }
 
+storageTypeToEnum = {
+    "TEXTURE_2D" : "api::StorageType::TEXTURE_2D",
+    "TEXTURE_3D" : "api::StorageType::TEXTURE_3D",
+    "BUFFER" : "api::StorageType::BUFFER",
+}
+
 def determineDescriptorType(lineStr):
     for identifier, typeNum in typeIdMapping.items():
         if re.search(identifier, lineStr):
             return typeNum
 
-    raise Exception("Could not identify descriptor type of line: {}".format(lineStr))
-
-def getLayout(srcFilePath):
-    layout = []
+def getShaderInfo(srcFilePath):
+    shader_info = ShaderInfo([], [], "")
     with open(srcFilePath, 'r') as srcFile:
         for line in srcFile:
             if isDescriptorLine(line):
-                layout.append(determineDescriptorType(line))
+                shader_info.layouts.append(determineDescriptorType(line))
+            if isTileSizeLine(line):
+                shader_info.tile_size = findTileSizes(line)
+            if isWeightStorageTypeLine(line):
+                shader_info.weight_storage_type = getWeightStorageType(line)
 
-    return layout
+    return shader_info
 
 def genCppH(hFilePath, cppFilePath, srcDirPath, glslcPath, tmpDirPath, env):
     print("hFilePath:{} cppFilePath:{} srcDirPath:{} glslcPath:{} tmpDirPath:{}".format(
@@ -85,6 +120,8 @@ def genCppH(hFilePath, cppFilePath, srcDirPath, glslcPath, tmpDirPath, env):
     h = "#pragma once\n"
     h += "#include <stdint.h>\n"
     h += "#include <vector>\n"
+    h += "#include <string>\n"
+    h += "#include <ATen/native/vulkan/api/Types.h>\n"
     h += "#include <ATen/native/vulkan/api/vk_api.h>"
 
     nsbegin = "\nnamespace at {\nnamespace native {\nnamespace vulkan {\n"
@@ -101,7 +138,7 @@ def genCppH(hFilePath, cppFilePath, srcDirPath, glslcPath, tmpDirPath, env):
         h += "extern const uint32_t {}[];\n".format(name)
         h += "extern const uint32_t {};\n".format(name_len)
 
-        layout = getLayout(srcPath)
+        shader_info = getShaderInfo(srcPath)
         name_layout = name + "_layout"
         h += "extern const std::vector<VkDescriptorType> {};\n".format(name_layout)
 
@@ -117,10 +154,26 @@ def genCppH(hFilePath, cppFilePath, srcDirPath, glslcPath, tmpDirPath, env):
 
         # Add layout
         cpp += "const std::vector<VkDescriptorType> {} = {{\n".format(name_layout)
-        for descriptor in layout:
+        for descriptor in shader_info.layouts:
             cpp += "  {},\n".format(descriptor)
         cpp += "};\n"
 
+        # Add tile size
+        if (len(shader_info.tile_size) > 0):
+            name_tile_size = name + "_tile_size"
+            h += "extern const std::vector<uint32_t> {};\n".format(name_tile_size)
+            cpp += "const std::vector<uint32_t> {} = {{\n".format(name_tile_size)
+            for s in shader_info.tile_size:
+                cpp += "  {},\n".format(s)
+            cpp += "};\n"
+
+        # Add weight type
+        if (shader_info.weight_storage_type != ""):
+            name_weight_storage_type = name + "_weight_storage_type"
+            h += "extern const api::StorageType {};\n".format(name_weight_storage_type)
+            cpp += "const api::StorageType {} = \n".format(name_weight_storage_type)
+            cpp += "  {};\n".format(storageTypeToEnum[shader_info.weight_storage_type])
+
     cpp += nsend
     h += nsend
 

From 83d78944f906702772175b8d574e8beaaa07b2fb Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Wed, 2 Nov 2022 23:31:26 +0000
Subject: [PATCH 0501/1922] [LTC] Remove non-native view ops (#88031)

Summary:
LTC somehow implements a bunch of non-native view ops during the transition to functionalization. Let's remove them now that functionalization is final.

Test Plan:
CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88031
Approved by: https://github.com/JackCaoG, https://github.com/antoniojkim
---
 aten/src/ATen/native/ts_native_functions.yaml |  22 ---
 torch/csrc/lazy/core/internal_ops/ltc_ops.h   |   8 -
 .../csrc/lazy/ts_backend/ts_node_lowering.cpp | 186 ------------------
 3 files changed, 216 deletions(-)

diff --git a/aten/src/ATen/native/ts_native_functions.yaml b/aten/src/ATen/native/ts_native_functions.yaml
index 3cb290f1004bf..d8ea64d56455d 100644
--- a/aten/src/ATen/native/ts_native_functions.yaml
+++ b/aten/src/ATen/native/ts_native_functions.yaml
@@ -235,25 +235,3 @@ non_native:
     opkind: ltc_cast
     properties:
       - ShapeCompute
-
-  # View ops only required until proper functionalization pass is introduced into LTC
-  - func: as_strided_view_update(Tensor target, Tensor input, int[] size, int[] stride, int storage_offset) -> Tensor
-    opkind: ltc_as_strided_view_update
-  - func: as_strided(Tensor input, int[] size, int[] stride, int storage_offset) -> Tensor
-  - func: diagonal_view_update(Tensor target, Tensor input, int offset, int dim1, int dim2) -> Tensor
-    opkind: ltc_diagonal_view_update
-    properties:
-      - ShapeCompute
-  - func: diagonal(Tensor input, int offset, int dim1, int dim2) -> Tensor
-  - func: narrow_view_update(Tensor input, Tensor source, int[] base_indices) -> Tensor
-    opkind: ltc_narrow_view_update
-  - func: narrow(Tensor input, int[] base_indices, int[] sizes) -> Tensor
-  - func: permute(Tensor input, int[] dims) -> Tensor
-  - func: resize(Tensor input, int[] size) -> Tensor
-  - func: select_view_update(Tensor target, Tensor source, int dim, int start, int end, int stride) -> Tensor
-    opkind: ltc_select_view_update
-    properties:
-      - ShapeCompute
-  - func: select(Tensor input, int dim, int start, int end, int stride) -> Tensor
-  - func: squeeze(Tensor input, int dim) -> Tensor
-  - func: unsqueeze(Tensor input, int dim) -> Tensor
diff --git a/torch/csrc/lazy/core/internal_ops/ltc_ops.h b/torch/csrc/lazy/core/internal_ops/ltc_ops.h
index 3f195d8b445cf..ce62f2e51f539 100644
--- a/torch/csrc/lazy/core/internal_ops/ltc_ops.h
+++ b/torch/csrc/lazy/core/internal_ops/ltc_ops.h
@@ -48,13 +48,5 @@ const OpKindWrapper ltc_replication_pad_backward(
     "lazy_tensors::replication_pad_backward");
 const OpKindWrapper ltc_tensor_data("lazy_tensors::tensor_data");
 
-// For view ops
-const OpKindWrapper ltc_as_strided_view_update(
-    "lazy_tensors::as_strided_view_update");
-const OpKindWrapper ltc_diagonal_view_update(
-    "lazy_tensors::diagonal_view_update");
-const OpKindWrapper ltc_narrow_view_update("lazy_tensors::narrow_view_update");
-const OpKindWrapper ltc_select_view_update("lazy_tensors::select_view_update");
-
 } // namespace lazy
 } // namespace torch
diff --git a/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp b/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp
index 2c0598ecfe1b1..8e64cfb6ac13e 100644
--- a/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp
@@ -158,192 +158,6 @@ torch::lazy::TSOpVector Scalar::Lower(
 }
 
 // View Ops
-
-torch::lazy::TSOpVector AsStrided::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  std::vector<torch::jit::NamedValue> arguments;
-  arguments.emplace_back(loctx->GetOutputOp(operand(0)));
-  arguments.emplace_back(size);
-  arguments.emplace_back(stride);
-  arguments.emplace_back(storage_offset);
-  TSOpVector as_strided_out = LowerBuiltin(this, function, arguments);
-  TORCH_CHECK_EQ(as_strided_out.size(), 1);
-  return {GenerateClone(as_strided_out.front(), function)};
-}
-
-torch::lazy::TSOpVector AsStridedViewUpdate::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  torch::jit::Value* destination =
-      GenerateClone(loctx->GetOutputOp(operand(0)), function);
-  const torch::lazy::Output& input_op = operand(1);
-  const torch::lazy::Shape& input_shape = input_op.shape();
-  const auto input_dimensions = input_shape.sizes();
-  std::vector<torch::jit::NamedValue> dest_arguments;
-  dest_arguments.emplace_back(destination);
-  dest_arguments.emplace_back(
-      std::vector<int64_t>(input_dimensions.begin(), input_dimensions.end()));
-  dest_arguments.emplace_back(stride);
-  dest_arguments.emplace_back(storage_offset);
-  TSOpVector as_strided_out =
-      LowerBuiltin(at::aten::as_strided, function, dest_arguments);
-  TORCH_CHECK_EQ(as_strided_out.size(), 1);
-  torch::jit::Value* as_strided = as_strided_out.front();
-  GenerateCopy(as_strided, loctx->GetOutputOp(input_op), function);
-  return {destination};
-}
-
-torch::lazy::TSOpVector Diagonal::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  std::vector<torch::jit::NamedValue> arguments;
-  arguments.emplace_back(loctx->GetOutputOp(operand(0)));
-  arguments.emplace_back(offset);
-  arguments.emplace_back(dim1);
-  arguments.emplace_back(dim2);
-  return LowerBuiltin(this, function, arguments);
-}
-
-torch::lazy::TSOpVector DiagonalViewUpdate::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  // Since we promise the backends that we never generate any aliased
-  // inplace update IR, therefore we clone the target first and then
-  // update the clone inplace instead. Since the clone is transient,
-  // it will never be aliased, and therefore it's safe.
-  torch::jit::Value* destination =
-      GenerateClone(loctx->GetOutputOp(operand(0)), function);
-
-  // Replay the diagonal.
-  std::vector<torch::jit::NamedValue> arguments;
-  arguments.emplace_back(destination);
-  arguments.emplace_back(offset);
-  arguments.emplace_back(dim1);
-  arguments.emplace_back(dim2);
-  auto diag = LowerBuiltin(at::aten::diagonal, function, arguments);
-
-  // Update the replayed diagonal view with the input.
-  GenerateCopy(diag.front(), loctx->GetOutputOp(operand(1)), function);
-
-  // Destination's diag view should be updated.
-  return {destination};
-}
-
-torch::lazy::TSOpVector Narrow::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  const torch::lazy::Output& input = operand(0);
-  torch::jit::Value* base = loctx->GetOutputOp(input);
-  const torch::lazy::Shape& input_shape = input.shape();
-  TORCH_CHECK_EQ(sizes.size(), base_indices.size());
-  TORCH_CHECK_EQ(input_shape.dim(), base_indices.size());
-  for (size_t dim = 0; dim < base_indices.size(); ++dim) {
-    int64_t start = base_indices[dim];
-    base = GenerateSlice(
-        /*base=*/base,
-        /*dim=*/dim,
-        /*start=*/start,
-        /*end=*/start + sizes[dim],
-        /*step=*/1,
-        /*function=*/function);
-  }
-  return {base};
-}
-
-torch::lazy::TSOpVector NarrowViewUpdate::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  torch::jit::Value* dest =
-      GenerateClone(loctx->GetOutputOp(operand(0)), function);
-  const torch::lazy::Output& source_argument = operand(1);
-  const torch::lazy::Shape& source_shape = source_argument.shape();
-  TORCH_CHECK_EQ(source_shape.dim(), base_indices.size());
-  torch::jit::Value* base = dest;
-  for (size_t dim = 0; dim < base_indices.size(); ++dim) {
-    int64_t start = base_indices[dim];
-    base = GenerateSlice(
-        /*base=*/base,
-        /*dim=*/dim,
-        /*start=*/start,
-        /*end=*/start + source_shape.size(dim),
-        /*step=*/1,
-        /*function=*/function);
-  }
-  GenerateCopy(base, loctx->GetOutputOp(source_argument), function);
-  return {dest};
-}
-
-torch::lazy::TSOpVector Permute::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  std::vector<torch::jit::NamedValue> arguments;
-  arguments.emplace_back(loctx->GetOutputOp(operand(0)));
-  arguments.emplace_back(dims);
-  return LowerBuiltin(this, function, arguments);
-}
-
-torch::lazy::TSOpVector Resize::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  std::vector<torch::jit::NamedValue> arguments;
-  for (const torch::lazy::Output& output : operands()) {
-    arguments.emplace_back(loctx->GetOutputOp(output));
-  }
-  return LowerBuiltin(this, function, arguments);
-}
-
-torch::lazy::TSOpVector Select::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  int64_t step = torch::lazy::GetStride(start, end, stride);
-  torch::jit::Value* base = loctx->GetOutputOp(operand(0));
-  return {GenerateSlice(
-      /*base=*/base,
-      /*dim=*/dim,
-      /*start=*/start,
-      /*end=*/end,
-      /*step=*/step,
-      /*function=*/function)};
-}
-
-torch::lazy::TSOpVector SelectViewUpdate::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  torch::jit::Value* dest =
-      GenerateClone(loctx->GetOutputOp(operand(0)), function);
-  int64_t step = torch::lazy::GetStride(start, end, stride);
-  torch::jit::Value* selected = GenerateSlice(
-      /*base=*/dest,
-      /*dim=*/dim,
-      /*start=*/start,
-      /*end=*/end,
-      /*step=*/step,
-      /*function=*/function);
-  GenerateCopy(selected, loctx->GetOutputOp(operand(1)), function);
-  return {dest};
-}
-
-torch::lazy::TSOpVector Squeeze::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  std::vector<torch::jit::NamedValue> arguments;
-  arguments.emplace_back(loctx->GetOutputOp(operand(0)));
-  if (dim != -1) {
-    arguments.emplace_back(dim);
-  }
-  return LowerBuiltin(this, function, arguments);
-}
-
-torch::lazy::TSOpVector Unsqueeze::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  std::vector<torch::jit::NamedValue> arguments;
-  arguments.emplace_back(loctx->GetOutputOp(operand(0)));
-  arguments.emplace_back(dim);
-  return LowerBuiltin(this, function, arguments);
-}
-
 torch::lazy::TSOpVector View::Lower(
     std::shared_ptr<torch::jit::GraphFunction> function,
     torch::lazy::TSLoweringContext* loctx) const {

From c8546040f6cd44b9f6c3bdf59bac5234c0f33010 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 2 Nov 2022 23:33:15 +0000
Subject: [PATCH 0502/1922] Revert "[inductor] Handle the case where kwargs
 contains tensor (#88215)"

This reverts commit 983c0e7f3101f1543bed6c4ec1539a4d590a94c0.

Reverted https://github.com/pytorch/pytorch/pull/88215 on behalf of https://github.com/huydhn due to Sorry for reverting your PR but I think it breaks trunk https://github.com/pytorch/pytorch/actions/runs/3380662072/jobs/5613987333 with a failure in test_torchinductor_opinfo.py
---
 test/inductor/test_torchinductor.py | 14 --------------
 torch/_inductor/ir.py               | 28 ++++++++++++++--------------
 torch/_inductor/lowering.py         |  6 ++----
 3 files changed, 16 insertions(+), 32 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 114ba4c00ba5f..58ae49eb1930e 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4083,20 +4083,6 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                 else:
                     self.assertEqual(len(inps), 0)
 
-    @unittest.skipIf(HAS_CUDA, "histogramdd only supports cpu")
-    def test_kwargs(self):
-        def fn(x, y):
-            return torch.histogramdd(
-                x,
-                bins=[3, 3],
-                weight=y,
-            )
-
-        self.common(
-            fn,
-            [torch.randn((4, 2)), torch.randn((4))],
-        )
-
 
 if HAS_CPU:
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 8ec3f494887b3..8ca869df03602 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -8,7 +8,6 @@
 from collections import OrderedDict
 from enum import Enum
 from functools import partial
-from inspect import signature
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Set, Tuple, Union
 from unittest.mock import patch
 
@@ -2237,8 +2236,7 @@ def copy_input(x):
 
     @classmethod
     def process_kernel(cls, kernel, *args, **kwargs):
-        binded_args = signature(kernel).bind(*args, **kwargs).arguments
-        args_flat, args_spec = pytree.tree_flatten(binded_args)
+        args_flat, args_spec = pytree.tree_flatten(args)
 
         is_arg_tensor = []
         tensor_args = []
@@ -2251,16 +2249,15 @@ def process_kernel(cls, kernel, *args, **kwargs):
                 non_tensor_args.append(arg)
 
         def unflatten_args(new_tensor_args, new_non_tensor_args):
-            result = []
+            new_args = []
             it_tensors = iter(new_tensor_args)
             it_non_tensors = iter(new_non_tensor_args)
             for is_tensor in is_arg_tensor:
                 if is_tensor:
-                    result.append(next(it_tensors))
+                    new_args.append(next(it_tensors))
                 else:
-                    result.append(next(it_non_tensors))
-            result = pytree.tree_unflatten(result, args_spec)
-            return result.get("args", []), result.get("kwargs", {})
+                    new_args.append(next(it_non_tensors))
+            return pytree.tree_unflatten(new_args, args_spec)
 
         tensor_args = [cls.realize_input(x) for x in tensor_args]
 
@@ -2286,8 +2283,9 @@ def unflatten_args(new_tensor_args, new_non_tensor_args):
             ).zero_()
             example_args.append(arg)
 
-        new_args, new_kwargs = unflatten_args(example_args, non_tensor_args)
-        example_output = kernel(*new_args, **new_kwargs)
+        example_output = kernel(
+            *unflatten_args(example_args, non_tensor_args), **kwargs
+        )
 
         return example_output, tensor_args, non_tensor_args, unflatten_args
 
@@ -2880,13 +2878,15 @@ class Shim:
             def __repr__(self):
                 return self.ref
 
+        tensor_args = [Shim(x.codegen_reference()) for x in self.inputs]
+        constant_args = [Shim(repr(x)) for x in self.constant_args]
+
         def gen_kwarg(k, v):
             return f"{k}={repr(v)}"
 
-        tensor_args = [Shim(x.codegen_reference()) for x in self.inputs]
-        constant_args = [Shim(repr(x)) for x in self.constant_args]
-        args, kwargs = self.unflatten_args(tensor_args, constant_args)
-        return list(map(repr, args)) + list(gen_kwarg(k, v) for k, v in kwargs.items())
+        kwargs = list(gen_kwarg(k, v) for k, v in self.kwargs.items())
+
+        return list(map(repr, self.unflatten_args(tensor_args, constant_args))) + kwargs
 
     @classmethod
     def create(cls, kernel, *args, **kwargs):
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index a59b2cacf0153..a05e6d527ea9a 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -164,10 +164,8 @@ def wrapped(*args, **kwargs):
             args = args[0]
         # Only look at args that are Tensors
         indices = [i for i, x in enumerate(args) if isinstance(x, TensorBox)]
-        # kwargs tensors not supported yet unless it's a fallback op
-        assert not any(isinstance(x, TensorBox) for x in kwargs.values()) or all(
-            fn in fallbacks for fn in aten_fn
-        )
+        # kwargs tensors not supported yet
+        assert not any(isinstance(x, TensorBox) for x in kwargs.values())
 
         if (type_promotion_kind or convert_input_to_bool) and indices:
             if convert_input_to_bool:

From edbc2086fdc4f42aa44b701ca23ed4b6f7768914 Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Wed, 2 Nov 2022 13:52:15 -0400
Subject: [PATCH 0503/1922] Split out forward AD tests from test_ops_gradients
 and reenable slow gradcheck CI (#88216)

Fixes: https://github.com/pytorch/pytorch/issues/88010

This PR does a couple things to stop slow gradcheck from timing out:
- Splits out test_ops_fwd_gradients from test_ops_gradients, and factors out TestFwdGradients and TestBwdGradients which both inherit from TestGradients, now situated in common_utils (maybe there is a better place?)
- Skips CompositeCompliance (and several other test files) for slow gradcheck CI since they do not use gradcheck
- because test times for test_ops_fwd_gradients and test_ops_gradients are either unknown or wrong, we hardcode them for now to prevent them from being put together. We can undo the hack after we see actual test times are updated. ("def calculate_shards" randomly divides tests with unknown test times in a round-robin fashion.)
- Updates references to test_ops_gradients and TestGradients
- Test files that are skipped for slow gradcheck CI are now centrally located in in run_tests.py, this reduces how fine-grained we can be with the skips, so for some skips (one so far) we still use the old skipping mechanism, e.g. for test_mps

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88216
Approved by: https://github.com/albanD
---
 .github/workflows/periodic.yml                |   6 -
 CODEOWNERS                                    |   3 +-
 test/functorch/functorch_additional_op_db.py  |   9 +-
 test/quantization/core/test_quantized_op.py   |   3 +-
 test/quantization/jit/test_quantize_jit.py    |   2 -
 test/run_test.py                              |  34 +++-
 test/test_cpp_extensions_jit.py               |   3 +-
 test/test_decomp.py                           |   2 -
 test/test_fx.py                               |   2 -
 test/test_meta.py                             |   3 -
 test/test_ops.py                              |   7 -
 test/test_ops_fwd_gradients.py                |  70 +++++++
 test/test_ops_gradients.py                    | 187 +-----------------
 test/test_ops_jit.py                          |   3 +-
 .../_internal/common_methods_invocations.py   | 144 ++++++++------
 torch/testing/_internal/common_utils.py       | 134 ++++++++++++-
 .../_internal/opinfo/definitions/_masked.py   |   5 +-
 .../_internal/opinfo/definitions/linalg.py    |  37 ++--
 .../_internal/opinfo/definitions/special.py   |   2 +-
 19 files changed, 355 insertions(+), 301 deletions(-)
 create mode 100644 test/test_ops_fwd_gradients.py

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 4eca9c890852f..58e379e0b5fd2 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -14,9 +14,6 @@ concurrency:
 
 jobs:
   linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build:
-    # Disable because slow-gradcheck tests take > 4 hrs and time out.
-    # TODO(sdym@meta.com): investigate re-enabling slow-gradcheck tests.
-    if: false
     name: linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck
     uses: ./.github/workflows/_linux-build.yml
     with:
@@ -29,9 +26,6 @@ jobs:
         ]}
 
   linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-test:
-    # Disable because slow-gradcheck tests take > 4 hrs and time out.
-    # TODO(sdym@meta.com): investigate re-enabling slow-gradcheck tests.
-    if: false
     name: linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck
     uses: ./.github/workflows/_linux-test.yml
     needs: linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build
diff --git a/CODEOWNERS b/CODEOWNERS
index 707a2ccec1802..179e87198dba2 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -91,7 +91,8 @@ nn/qat/ @jerryzh168
 /torch/testing/_internal/common_methods_invocations.py @mruberry @ngimel
 /torch/testing/_internal/common_device_type.py @mruberry @ngimel
 test/test_ops.py @mruberry @ngimel
-test/test_ops_gradients.py @mruberry @ngimel
+test/test_ops_gradients.py @mruberry @ngimel @soulitzer
+test/test_ops_fwd_gradients.py @mruberry @ngimel @soulitzer
 test/test_unary_ufuncs.py @mruberry @ngimel
 test/test_binary_ufuncs.py @mruberry @ngimel
 test/test_reductions.py @mruberry @ngimel
diff --git a/test/functorch/functorch_additional_op_db.py b/test/functorch/functorch_additional_op_db.py
index 6343e7420546b..9352924d5004f 100644
--- a/test/functorch/functorch_additional_op_db.py
+++ b/test/functorch/functorch_additional_op_db.py
@@ -445,7 +445,8 @@ def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs):
            sample_inputs_func=sample_inputs_conversion,
            skips=(
                # autograd tests don't handle operators that change dtype
-               DecorateInfo(unittest.expectedFailure, 'TestGradients'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients'),
                DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
                # RuntimeError: attribute lookup is not defined on builtin
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
@@ -511,7 +512,8 @@ def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs):
            sample_inputs_func=sample_inputs_conversion,
            skips=(
                # autograd tests don't handle operators that change dtype
-               DecorateInfo(unittest.expectedFailure, 'TestGradients'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients'),
                DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
                # RuntimeError: attribute lookup is not defined on builtin
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
@@ -524,7 +526,8 @@ def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs):
            sample_inputs_func=sample_inputs_conversion,
            skips=(
                # autograd tests don't handle operators that change dtype
-               DecorateInfo(unittest.expectedFailure, 'TestGradients'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients'),
                DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
                # RuntimeError: attribute lookup is not defined on builtin
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 79297e073f047..a63acc99383b3 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -21,7 +21,7 @@
 import torch.testing._internal.hypothesis_utils as hu
 hu.assert_deadline_disabled()
 
-from torch.testing._internal.common_utils import TestCase, skipIfSlowGradcheckEnv
+from torch.testing._internal.common_utils import TestCase
 from torch.testing._internal.common_utils import IS_PPC, TEST_WITH_UBSAN, IS_MACOS, BUILD_WITH_CAFFE2
 from torch.testing._internal.common_quantization import skipIfNoFBGEMM, skipIfNoQNNPACK
 from torch.testing._internal.common_quantized import _quantize, _dequantize, _calculate_dynamic_qparams, \
@@ -130,7 +130,6 @@ def _get_random_tensor_and_q_params(shapes, rand_scale, torch_type):
         X_scale = 1e-10
     return X, X_scale, X_zero_point
 
-@skipIfSlowGradcheckEnv
 class TestQuantizedOps(TestCase):
 
     """Helper function to test quantized activation functions."""
diff --git a/test/quantization/jit/test_quantize_jit.py b/test/quantization/jit/test_quantize_jit.py
index 49152a1097ac2..7726dc04c7111 100644
--- a/test/quantization/jit/test_quantize_jit.py
+++ b/test/quantization/jit/test_quantize_jit.py
@@ -73,7 +73,6 @@
 from torch.testing._internal.jit_utils import attrs_with_prefix
 from torch.testing._internal.jit_utils import get_forward
 from torch.testing._internal.jit_utils import get_forward_graph
-from torch.testing._internal.common_utils import skipIfSlowGradcheckEnv
 
 from torch.jit._recursive import wrap_cpp_module
 
@@ -1626,7 +1625,6 @@ def forward(self, x):
         torch.jit.save(model, b)
 
 
-@skipIfSlowGradcheckEnv
 class TestQuantizeJitOps(QuantizationTestCase):
     """Test graph mode post training static quantization works
     for individual ops end to end.
diff --git a/test/run_test.py b/test/run_test.py
index 620a8b712aeeb..307b83dfdcd76 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -26,6 +26,7 @@
     shell,
     set_cwd,
     parser as common_parser,
+    is_slow_gradcheck_env,
 )
 import torch.distributed as dist
 from torch.multiprocessing import get_context
@@ -302,6 +303,7 @@ def skip_test_p(name: str) -> bool:
     "test_nn",
     "test_ops",
     "test_ops_gradients",
+    "test_ops_fwd_gradients",
     "test_ops_jit",
     "test_torch"
 ]
@@ -380,6 +382,23 @@ def skip_test_p(name: str) -> bool:
     "distributions/test_distributions",
 ]
 
+# These are just the slowest ones, this isn't an exhaustive list.
+TESTS_NOT_USING_GRADCHECK = [
+    # Note that you should use skipIfSlowGradcheckEnv if you do not wish to
+    # skip all the tests in that file, e.g. test_mps
+    "doctests",
+    "test_meta",
+    "test_hub",
+    "test_fx",
+    "test_decomp",
+    "test_cpp_extensions_jit",
+    "test_jit",
+    "test_ops",
+    "test_ops_jit",
+    "dynamo/test_recompile_ux",
+    "inductor/test_smoke",
+    "test_quantization",
+]
 
 def print_to_stderr(message):
     print(message, file=sys.stderr)
@@ -775,6 +794,7 @@ def run_test_ops(test_module, test_directory, options):
     "doctests": run_doctests,
     "test_ops": run_test_ops,
     "test_ops_gradients": run_test_ops,
+    "test_ops_fwd_gradients": run_test_ops,
     "test_ops_jit": run_test_ops,
     "functorch/test_ops": run_test_ops,
 }
@@ -982,11 +1002,11 @@ def find_test_index(test, selected_tests, find_last_index=False):
     return found_idx
 
 
-def exclude_tests(exclude_list, selected_tests, exclude_message=None):
+def exclude_tests(exclude_list, selected_tests, exclude_message=None, exact_match=False):
     for exclude_test in exclude_list:
         tests_copy = selected_tests[:]
         for test in tests_copy:
-            if test.startswith(exclude_test):
+            if (not exact_match and test.startswith(exclude_test)) or test == exclude_test:
                 if exclude_message is not None:
                     print_to_stderr("Excluding {} {}".format(test, exclude_message))
                 selected_tests.remove(test)
@@ -1116,6 +1136,11 @@ def get_selected_tests(options):
         else:
             print("Found test time stats from artifacts")
             test_file_times_config = test_file_times[test_config]
+            if is_slow_gradcheck_env():
+                # HACK: hardcode approx test times, so these two don't get put in the same shard
+                #       we can remove this when their actual runtimes are recorded
+                test_file_times_config["test_ops_fwd_gradients"] = 3600 * 2 + 600  # 2:10
+                test_file_times_config["test_ops_gradients"] = 3600 * 2 + 600  # 2:10
             shards = calculate_shards(num_shards, selected_tests, test_file_times_config,
                                       must_serial=must_serial)
             _, tests_from_shard = shards[which_shard - 1]
@@ -1131,6 +1156,11 @@ def get_selected_tests(options):
         selected_tests = exclude_tests(TESTS_REQUIRING_LAPACK, selected_tests,
                                        "PyTorch is built without LAPACK support.")
 
+    if is_slow_gradcheck_env():
+        selected_tests = exclude_tests(TESTS_NOT_USING_GRADCHECK, selected_tests,
+                                       "Running in slow gradcheck mode, skipping tests "
+                                       "that don't use gradcheck.", exact_match=True)
+
     if options.distributed_tests:
         # Run distributed tests with multiple backends across all shards, one per backend
         selected_tests.extend(DISTRIBUTED_TESTS_WITH_MULTIPLE_BACKENDS.keys())
diff --git a/test/test_cpp_extensions_jit.py b/test/test_cpp_extensions_jit.py
index e4b1e9e550873..2ead8d32ca179 100644
--- a/test/test_cpp_extensions_jit.py
+++ b/test/test_cpp_extensions_jit.py
@@ -15,7 +15,7 @@
 import torch.backends.cudnn
 import torch.utils.cpp_extension
 from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
-from torch.testing._internal.common_utils import gradcheck, skipIfSlowGradcheckEnv
+from torch.testing._internal.common_utils import gradcheck
 
 
 TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
@@ -38,7 +38,6 @@ def remove_build_path():
         shutil.rmtree(default_build_root)
 
 # There's only one test that runs gracheck, run slow mode manually
-@skipIfSlowGradcheckEnv
 class TestCppExtensionJIT(common.TestCase):
     """Tests just-in-time cpp extensions.
     Don't confuse this with the PyTorch JIT (aka TorchScript).
diff --git a/test/test_decomp.py b/test/test_decomp.py
index 27ad870a2adb1..dff62bea17db1 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -15,7 +15,6 @@
     suppress_warnings,
     TEST_WITH_ASAN,
     run_tests,
-    skipIfSlowGradcheckEnv,
     skipIfTorchDynamo,
 )
 from torch.testing._internal.common_device_type import (
@@ -353,7 +352,6 @@ def test_unsupported(t):
     return any(test_unsupported(x) for x in itertools.chain(flat_args, flat_kwargs))
 
 
-@skipIfSlowGradcheckEnv
 class TestDecomp(TestCase):
     longMessage = True
 
diff --git a/test/test_fx.py b/test/test_fx.py
index 9a46a50982961..0aa5b28a3de7d 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -57,7 +57,6 @@
     IS_WINDOWS,
     find_library_location,
     run_tests,
-    skipIfSlowGradcheckEnv,
 )
 from torch.testing._internal.jit_utils import JitTestCase
 
@@ -4110,7 +4109,6 @@ def tearDownClass(cls):
 instantiate_device_type_tests(TestOperatorSignatures, globals())
 
 @skipIfNoTorchVision
-@skipIfSlowGradcheckEnv
 class TestVisionTracing(JitTestCase):
     def setUp(self):
         # Checking for mutable operations while tracing is feature flagged
diff --git a/test/test_meta.py b/test/test_meta.py
index 858c429f1974c..88644a6552b1b 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -15,7 +15,6 @@
     suppress_warnings,
     TEST_WITH_ASAN,
     run_tests,
-    skipIfSlowGradcheckEnv,
     dtype_abbrs
 )
 from torch.testing._internal.common_device_type import (
@@ -53,7 +52,6 @@
 u8 = torch.uint8
 
 
-@skipIfSlowGradcheckEnv
 class TestMetaConverter(TestCase):
     def assertSameVersionCounter(self, m1, m2):
         # Cannot easily test m1 and m2 have same storage due to
@@ -1118,7 +1116,6 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
 # inconsistencies between CUDA and CPU, and running on CUDA makes it easier
 # to ignore the CPU case when inconsistencies arise.  Ideally we deal
 # with the inconsistencies but this takes time.
-@skipIfSlowGradcheckEnv
 class TestMeta(TestCase):
     # Copies inputs to inplace operations to avoid inplace modifications
     #   to leaves requiring gradient
diff --git a/test/test_ops.py b/test/test_ops.py
index 9c4bd58e1a4da..b18013a3bae55 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -35,7 +35,6 @@
     IS_FBCODE,
     first_sample,
     parametrize,
-    skipIfSlowGradcheckEnv,
     skipIfTorchInductor,
     slowTest,
 )
@@ -109,7 +108,6 @@
 
 # Tests that apply to all operators and aren't related to any particular
 #   system
-@skipIfSlowGradcheckEnv
 class TestCommon(TestCase):
     exact_dtype = True
 
@@ -1389,7 +1387,6 @@ def test_forward_ad(self, device, dtype, op):
                 op.get_op(), args, kwargs, op.gradcheck_wrapper, self.assertEqual)
 
 
-@skipIfSlowGradcheckEnv
 class TestMathBits(TestCase):
     # Tests that
     # 1. The operator's output for physically conjugated/negated tensors and conjugate/negative view tensors
@@ -1599,7 +1596,6 @@ def check_inplace_view(func, input, rs, input_size, input_strides):
 # A mode that when enabled runs correctness checks to ensure
 # that operators have expected tags based on their input and
 # ouput tensor properties
-@skipIfSlowGradcheckEnv
 class TestTagsMode(TorchDispatchMode):
     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         if isinstance(args[0], torch.Tensor):
@@ -1612,7 +1608,6 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         return rs
 
 # Test to verify the correctness for tags in `tags.yaml`, also available for access through `torch.Tags`
-@skipIfSlowGradcheckEnv
 class TestTags(TestCase):
     @onlyCPU
     @ops(ops_and_refs, dtypes=OpDTypes.any_one)
@@ -1632,7 +1627,6 @@ def test_tags(self, device, dtype, op):
                 check_inplace_view(opoverloadpacket, input, rs, old_size, old_stride)
 
 
-@skipIfSlowGradcheckEnv
 class TestRefsOpsInfo(TestCase):
 
     import_paths = ["_refs", "_refs.special", "_refs.nn.functional", "_refs.fft", "_refs._conversions"]
@@ -1914,7 +1908,6 @@ def test_refs_are_in_decomp_table(self, op):
     skip('pinverse'),
 }
 
-@skipIfSlowGradcheckEnv
 class TestFakeTensor(TestCase):
     def _test_fake_helper(self, device, dtype, op, context):
         name = op.name
diff --git a/test/test_ops_fwd_gradients.py b/test/test_ops_fwd_gradients.py
new file mode 100644
index 0000000000000..c3fca7235461f
--- /dev/null
+++ b/test/test_ops_fwd_gradients.py
@@ -0,0 +1,70 @@
+# Owner(s): ["module: unknown"]
+
+from functools import partial
+import torch
+
+from torch.testing._internal.common_utils import (
+    TestGradients, run_tests, skipIfTorchInductor)
+from torch.testing._internal.common_methods_invocations import op_db
+from torch.testing._internal.common_device_type import \
+    (instantiate_device_type_tests, ops, OpDTypes)
+
+# TODO: fixme https://github.com/pytorch/pytorch/issues/68972
+torch.set_default_dtype(torch.float32)
+
+# gradcheck requires double precision
+_gradcheck_ops = partial(ops, dtypes=OpDTypes.supported,
+                         allowed_dtypes=[torch.double, torch.cdouble])
+
+class TestFwdGradients(TestGradients):
+    # Test that forward-over-reverse gradgrad is computed correctly
+    @_gradcheck_ops(op_db)
+    def test_fn_fwgrad_bwgrad(self, device, dtype, op):
+        self._skip_helper(op, device, dtype)
+
+        if op.supports_fwgrad_bwgrad:
+            self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad")
+        else:
+            err_msg = r"Trying to use forward AD with .* that does not support it"
+            hint_msg = ("Running forward-over-backward gradgrad for an OP that has does not support it did not "
+                        "raise any error. If your op supports forward AD, you should set supports_fwgrad_bwgrad=True.")
+            with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg):
+                self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad")
+
+
+    def _forward_grad_helper(self, device, dtype, op, variant, is_inplace):
+        # TODO: clean up how attributes are passed to gradcheck from OpInfos
+        def call_grad_test_helper():
+            check_batched_forward_grad = ((op.check_batched_forward_grad and not is_inplace) or
+                                          (op.check_inplace_batched_forward_grad and is_inplace))
+            self._grad_test_helper(device, dtype, op, variant, check_forward_ad=True, check_backward_ad=False,
+                                   check_batched_grad=False, check_batched_forward_grad=check_batched_forward_grad)
+        if op.supports_forward_ad:
+            call_grad_test_helper()
+        else:
+            err_msg = r"Trying to use forward AD with .* that does not support it"
+            hint_msg = ("Running forward AD for an OP that has does not support it did not "
+                        "raise any error. If your op supports forward AD, you should set supports_forward_ad=True")
+            with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg):
+                call_grad_test_helper()
+
+    @_gradcheck_ops(op_db)
+    def test_forward_mode_AD(self, device, dtype, op):
+        self._skip_helper(op, device, dtype)
+
+        self._forward_grad_helper(device, dtype, op, op.get_op(), is_inplace=False)
+
+    @_gradcheck_ops(op_db)
+    @skipIfTorchInductor("to be fixed")
+    def test_inplace_forward_mode_AD(self, device, dtype, op):
+        self._skip_helper(op, device, dtype)
+
+        if not op.inplace_variant or not op.supports_inplace_autograd:
+            self.skipTest("Skipped! Operation does not support inplace autograd.")
+
+        self._forward_grad_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), is_inplace=True)
+
+instantiate_device_type_tests(TestFwdGradients, globals())
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_ops_gradients.py b/test/test_ops_gradients.py
index 6d517c7a7f8b1..b4401af543d01 100644
--- a/test/test_ops_gradients.py
+++ b/test/test_ops_gradients.py
@@ -1,12 +1,9 @@
 # Owner(s): ["module: unknown"]
 
-from functools import partial, wraps
-from itertools import chain
+from functools import partial
 import torch
 
-from torch.testing._internal.common_utils import (
-    TestCase, is_iterable_of_tensors, run_tests, gradcheck, gradgradcheck, is_slow_gradcheck_env,
-    skipIfTorchInductor)
+from torch.testing._internal.common_utils import TestGradients, run_tests
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, ops, OpDTypes)
@@ -18,137 +15,7 @@
 _gradcheck_ops = partial(ops, dtypes=OpDTypes.supported,
                          allowed_dtypes=[torch.double, torch.cdouble])
 
-class TestGradients(TestCase):
-    exact_dtype = True
-
-    # Copies inputs to inplace operations to avoid inplace modifications
-    #   to leaves requiring gradient
-    def _get_safe_inplace(self, inplace_variant):
-        @wraps(inplace_variant)
-        def _fn(t, *args, **kwargs):
-            return inplace_variant(t.clone(), *args, **kwargs)
-
-        return _fn
-
-    def _check_helper(self, device, dtype, op, variant, check, *, check_forward_ad=False, check_backward_ad=True,
-                      check_batched_grad=None, check_batched_forward_grad=False):
-        assert check in ('gradcheck', 'bwgrad_bwgrad', 'fwgrad_bwgrad')
-        # NB: check_backward_ad does not affect gradgradcheck (always True)
-        if variant is None:
-            self.skipTest("Skipped! Variant not implemented.")
-        if not op.supports_dtype(dtype, torch.device(device).type):
-            self.skipTest(f"Skipped! {op.name} does not support dtype {str(dtype)}")
-
-        def is_inplace(variant):
-            if hasattr(variant, "__wrapped__"):
-                return variant.__wrapped__ is op.get_inplace()
-            return variant is op.get_inplace()
-
-        include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex
-
-        samples = op.sample_inputs(device, dtype, requires_grad=True, include_conjugated_inputs=include_conjugated_inputs,
-                                   small_inputs_only=is_slow_gradcheck_env())
-
-        for sample in samples:
-            if sample.broadcasts_input and is_inplace(variant):
-                continue
-
-            # Gradcheck expects tensors as its input, but autograd actually supports tensorlists
-            #   and tensors passed as kwargs. The following creates a function that accepts just
-            #   the tensors that require grad as varargs, and then recomposes them back into the
-            #   original input.
-
-            # Creates gradcheck inputs by identifying tensors requiring grad
-            all_args = None
-            if is_iterable_of_tensors(sample.input):
-                all_args = chain(sample.input, sample.args, sample.kwargs.values())
-            else:
-                all_args = tuple(chain((sample.input,), sample.args, sample.kwargs.values()))
-            gradcheck_args = tuple(x for x in all_args if (isinstance(x, torch.Tensor) and x.requires_grad))
-
-            def _input_recomposition_helper(inputs, inp, input_idx):
-                if is_iterable_of_tensors(inp):
-                    tensor_list = []
-                    for x in inp:
-                        if isinstance(x, torch.Tensor) and x.requires_grad:
-                            tensor_list.append(inputs[input_idx])
-                            input_idx = input_idx + 1
-                        else:
-                            tensor_list.append(x)
-                    return tensor_list, input_idx
-                elif isinstance(inp, torch.Tensor) and inp.requires_grad:
-                    return inputs[input_idx], input_idx + 1
-                else:
-                    return inp, input_idx
-
-            def fn(*inputs):
-                # Puts inputs back into sample properly
-                positional_args = []
-                input_idx = 0
-                inp, input_idx = _input_recomposition_helper(inputs, sample.input, input_idx)
-                positional_args.append(inp)
-
-                for x in sample.args:
-                    inp, input_idx = _input_recomposition_helper(inputs, x, input_idx)
-                    positional_args.append(inp)
-
-                # Recreates kwargs
-                kwargs = {}
-                for k, v in sample.kwargs.items():
-                    inp, input_idx = _input_recomposition_helper(inputs, v, input_idx)
-                    kwargs[k] = inp
-
-                output = op.gradcheck_wrapper(variant, *positional_args, **kwargs)
-                if sample.output_process_fn_grad is not None:
-                    return sample.output_process_fn_grad(output)
-                return output
-
-            if check == 'gradcheck':
-                if check_batched_grad is None:
-                    check_batched_grad = op.check_batched_grad
-                self.assertTrue(gradcheck(fn, gradcheck_args,
-                                          check_batched_grad=check_batched_grad,
-                                          check_grad_dtypes=True,
-                                          nondet_tol=op.gradcheck_nondet_tol,
-                                          fast_mode=op.gradcheck_fast_mode,
-                                          check_forward_ad=check_forward_ad,
-                                          check_backward_ad=check_backward_ad,
-                                          check_undefined_grad=True,
-                                          check_batched_forward_grad=check_batched_forward_grad))
-            elif check in ('bwgrad_bwgrad', 'fwgrad_bwgrad'):  # gradgrad check
-                self.assertFalse(check_forward_ad, msg="Cannot run forward AD check for gradgradcheck")
-                for gen_non_contig_grad_outputs in (False, True):
-                    kwargs = {
-                        "gen_non_contig_grad_outputs": gen_non_contig_grad_outputs,
-                        "check_batched_grad": op.check_batched_gradgrad,
-                        "check_grad_dtypes": True,
-                        "nondet_tol": op.gradcheck_nondet_tol,
-                        "fast_mode": op.gradcheck_fast_mode
-                    }
-                    if check == "fwgrad_bwgrad":
-                        kwargs["check_fwd_over_rev"] = True
-                        kwargs["check_rev_over_rev"] = False
-                        kwargs["check_batched_grad"] = False
-                        kwargs["check_undefined_grad"] = False
-
-                    self.assertTrue(gradgradcheck(fn, gradcheck_args, **kwargs))
-            else:
-                self.assertTrue(False, msg="Unknown check requested!")
-
-    def _grad_test_helper(self, device, dtype, op, variant, *, check_forward_ad=False, check_backward_ad=True,
-                          check_batched_grad=None, check_batched_forward_grad=False):
-        return self._check_helper(device, dtype, op, variant, 'gradcheck', check_forward_ad=check_forward_ad,
-                                  check_backward_ad=check_backward_ad, check_batched_grad=check_batched_grad,
-                                  check_batched_forward_grad=check_batched_forward_grad)
-
-    def _skip_helper(self, op, device, dtype):
-        if dtype not in op.supported_backward_dtypes(torch.device(device).type):
-            self.skipTest("Skipped! Op doesn't support autograd for this dtype.")
-        if not op.supports_autograd and not op.supports_forward_ad:
-            self.skipTest("Skipped! autograd not supported.")
-        if op.name == "cat":
-            self.skipTest("TODO(whc) fix pre-existing bug with cat for newly added opinfo for empty+nonempty")
-
+class TestBwdGradients(TestGradients):
     # Tests that gradients are computed correctly
     @_gradcheck_ops(op_db)
     def test_fn_grad(self, device, dtype, op):
@@ -192,20 +59,6 @@ def test_fn_gradgrad(self, device, dtype, op):
         else:
             self._check_helper(device, dtype, op, op.get_op(), 'bwgrad_bwgrad')
 
-    # Test that forward-over-reverse gradgrad is computed correctly
-    @_gradcheck_ops(op_db)
-    def test_fn_fwgrad_bwgrad(self, device, dtype, op):
-        self._skip_helper(op, device, dtype)
-
-        if op.supports_fwgrad_bwgrad:
-            self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad")
-        else:
-            err_msg = r"Trying to use forward AD with .* that does not support it"
-            hint_msg = ("Running forward-over-backward gradgrad for an OP that has does not support it did not "
-                        "raise any error. If your op supports forward AD, you should set supports_fwgrad_bwgrad=True.")
-            with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg):
-                self._check_helper(device, dtype, op, op.get_op(), "fwgrad_bwgrad")
-
     # Test that gradients of gradients are properly raising
     @_gradcheck_ops(op_db)
     def test_fn_fail_gradgrad(self, device, dtype, op):
@@ -231,40 +84,8 @@ def test_inplace_gradgrad(self, device, dtype, op):
             self.skipTest("Skipped! Operation does not support inplace autograd.")
         self._check_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), "bwgrad_bwgrad")
 
-    def _forward_grad_helper(self, device, dtype, op, variant, is_inplace):
-        # TODO: clean up how attributes are passed to gradcheck from OpInfos
-        def call_grad_test_helper():
-            check_batched_forward_grad = ((op.check_batched_forward_grad and not is_inplace) or
-                                          (op.check_inplace_batched_forward_grad and is_inplace))
-            self._grad_test_helper(device, dtype, op, variant, check_forward_ad=True, check_backward_ad=False,
-                                   check_batched_grad=False, check_batched_forward_grad=check_batched_forward_grad)
-        if op.supports_forward_ad:
-            call_grad_test_helper()
-        else:
-            err_msg = r"Trying to use forward AD with .* that does not support it"
-            hint_msg = ("Running forward AD for an OP that has does not support it did not "
-                        "raise any error. If your op supports forward AD, you should set supports_forward_ad=True")
-            with self.assertRaisesRegex(NotImplementedError, err_msg, msg=hint_msg):
-                call_grad_test_helper()
-
-    @_gradcheck_ops(op_db)
-    def test_forward_mode_AD(self, device, dtype, op):
-        self._skip_helper(op, device, dtype)
-
-        self._forward_grad_helper(device, dtype, op, op.get_op(), is_inplace=False)
-
-    @_gradcheck_ops(op_db)
-    @skipIfTorchInductor("to be fixed")
-    def test_inplace_forward_mode_AD(self, device, dtype, op):
-        self._skip_helper(op, device, dtype)
-
-        if not op.inplace_variant or not op.supports_inplace_autograd:
-            self.skipTest("Skipped! Operation does not support inplace autograd.")
-
-        self._forward_grad_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace()), is_inplace=True)
-
 
-instantiate_device_type_tests(TestGradients, globals())
+instantiate_device_type_tests(TestBwdGradients, globals())
 
 if __name__ == '__main__':
     run_tests()
diff --git a/test/test_ops_jit.py b/test/test_ops_jit.py
index 57d1120978e4b..e03e051ff012e 100644
--- a/test/test_ops_jit.py
+++ b/test/test_ops_jit.py
@@ -7,7 +7,7 @@
 
 from torch.testing import FileCheck
 from torch.testing._internal.common_utils import \
-    (run_tests, IS_SANDCASTLE, clone_input_helper, first_sample, skipIfSlowGradcheckEnv)
+    (run_tests, IS_SANDCASTLE, clone_input_helper, first_sample)
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, ops, OpDTypes
 from torch.testing._internal.common_jit import JitCommonTestCase, check_against_reference
@@ -30,7 +30,6 @@
 #   autodifferentiation behavior.
 # Inherits from JitCommonTestCase instead of TestCase directly to share
 #   functionality with original test_jit.py method operator tests
-@skipIfSlowGradcheckEnv
 class TestJit(JitCommonTestCase):
     exact_dtype = True
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 7c501fb411ed3..fed764994cc64 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -5491,7 +5491,8 @@ def skips_mvlgamma(skip_redundant=False):
     if skip_redundant:
         # Redundant tests
         skips = skips + (  # type: ignore[assignment]
-            DecorateInfo(unittest.skip("Skipped!"), 'TestGradients'),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients'),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestJit'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestCommon'),
         )
@@ -7872,11 +7873,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf),
                    dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),
                    skips=(
-                       DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestGradients',
+                       DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestBwdGradients',
                                     'test_inplace_grad', dtypes=(torch.cdouble,)),
-                       DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestGradients',
+                       DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestBwdGradients',
                                     'test_inplace_gradgrad', dtypes=(torch.cdouble,)),
-                       DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestGradients',
+                       DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestFwdGradients',
                                     'test_inplace_forward_mode_AD', dtypes=(torch.cdouble,)),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
                                     device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
@@ -7931,15 +7932,15 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                                     device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
                                     device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad',
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad',
                                     dtypes=[torch.cdouble], active_if=IS_WINDOWS),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_method_grad',
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_method_grad',
                                     dtypes=[torch.cdouble], active_if=IS_WINDOWS),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_inplace_grad',
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_inplace_grad',
                                     dtypes=[torch.cdouble], active_if=IS_WINDOWS),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD',
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD',
                                     dtypes=[torch.cdouble], active_if=IS_WINDOWS),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_inplace_forward_mode_AD',
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_inplace_forward_mode_AD',
                                     dtypes=[torch.cdouble], active_if=IS_WINDOWS),)),
     # NOTE: the derivative for inplace acosh is not implemented
     UnaryUfuncInfo('acosh',
@@ -8982,11 +8983,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    'test_schema_correctness',
                    dtypes=(torch.complex64, torch.complex128)),
                # Float did not match double
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_grad'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_grad'),
                # Jacobian mismatch
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_gradgrad'),
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_forward_mode_AD'),
-               DecorateInfo(unittest.skip("Barely fails"), 'TestGradients', 'test_fn_fwgrad_bwgrad'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_gradgrad'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.skip("Barely fails"), 'TestFwdGradients', 'test_fn_fwgrad_bwgrad'),
                # JIT test not working for tensor kwargs (https://github.com/pytorch/pytorch/issues/58507)
                # RuntimeError:
                # undefined value tensor:
@@ -9362,7 +9363,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                skipCPUIfNoFFT,
                # gradcheck fails on ROCm (gh-68429)
                # grad is computed improperly (probably for weights tensor)
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_grad'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_grad'),
                # Pre-existing condition (calls .item); needs to be fixed
                DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_backward'),
            )),
@@ -9446,13 +9447,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # RuntimeError: unsupported memory format option Preserve
                DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
                # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_fwgrad_bwgrad'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_fn_fwgrad_bwgrad'),
                # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),
                # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_gradgrad'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
                # GradcheckError: gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
            )),
     UnaryUfuncInfo('i0',
                    ref=np_unary_ufunc_integer_promotion_wrapper(
@@ -10394,7 +10395,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # Not close
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
-               DecorateInfo(unittest.skip("Numerous errors"), 'TestGradients'))),
+               DecorateInfo(unittest.skip("Numerous errors"), 'TestFwdGradients'),
+               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'))),
     OpInfo('as_strided_scatter',
            op=lambda x, src, size, stride, storage_offset=0:
                torch.as_strided_scatter(x, src, size, stride, storage_offset=storage_offset),
@@ -10413,9 +10415,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.skip('Only fails for LAZY, passes on everything else'), 'TestCompositeCompliance', 'test_backward'),  # noqa: B950
                DecorateInfo(unittest.skip('Passes on complex64 and float32 only'), 'TestJit', 'test_variant_consistency_jit'),
                DecorateInfo(unittest.skip('Fails on cuda + rocm'), 'TestCommon', 'test_complex_half_reference_testing'),
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_grad'),
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_gradgrad'),
-               DecorateInfo(unittest.skip('Passes on complex128 and float64 only'), 'TestGradients', 'test_fn_fwgrad_bwgrad'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_grad'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_gradgrad'),
+               DecorateInfo(unittest.skip('Passes on complex128 and float64 only'), 'TestFwdGradients', 'test_fn_fwgrad_bwgrad'),
                # AssertionError: Tensor-likes are not close! (new_empty_strided.default)
                DecorateInfo(unittest.skip("Expected: new_empty_strided is not comparable"), 'TestDecomp', 'test_comprehensive'),
                DecorateInfo(
@@ -10433,10 +10435,10 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            error_inputs_func=error_inputs_native_layer_norm,
            skips=(
                # IndexError: tuple index out of range
-               DecorateInfo(unittest.skip('Skipped!'), 'TestGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.skip('Skipped!'), 'TestFwdGradients', 'test_forward_mode_AD'),
                # Tests fail when weight=None and bias is defined
                # https://github.com/pytorch/pytorch/issues/79705
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_gradgrad'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_gradgrad'),
                # JIT test also tries to compute double backward, which fails
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
                # Extremal value issue on aten::native_layer_norm, which returns 'nan' for mean on 'inf' inputs
@@ -10461,7 +10463,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type="cuda"),
                # Problem with _get_numerical_jacobian
                # IndexError: tuple index out of range
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
                # RuntimeError: deepEquals(input.iValue, deepCopiedInput) INTERNAL ASSERT FAILED
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
                # https://github.com/pytorch/pytorch/issues/85960
@@ -11304,6 +11306,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            dtypes=floating_types(),
            dtypesIfCUDA=floating_types_and(torch.float16),
            test_neg_view=False,
+           gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
            sample_inputs_func=sample_inputs_fractional_max_pool3d,
            decorators=(
                # FIXME: both derivatives are implemented incorrectly
@@ -11388,9 +11391,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # and if there are several indices pointing to the same memory,
                # gradcheck is oblivious about that and cannot perturb them all at once
                # (see sample_inputs_max_unpool_grad to find out more).
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad'),
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_gradgrad'),
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
                DecorateInfo(unittest.skip("Skipped!"), 'TestCompositeCompliance', 'test_forward_ad',
                             device_type='cpu'),
            )),
@@ -11422,9 +11425,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # and if there are several indices pointing to the same memory,
                # gradcheck is oblivious about that and cannot perturb them all at once
                # (see sample_inputs_max_unpool_grad to find out more).
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_forward_mode_AD'),
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_gradgrad'),
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),
                DecorateInfo(unittest.skip("Skipped!"), 'TestCompositeCompliance', 'test_forward_ad'),
            )),
     OpInfo('nn.functional.max_unpool2d',
@@ -11459,9 +11462,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # and if there are several indices pointing to the same memory,
                # gradcheck is oblivious about that and cannot perturb them all at once
                # (see sample_inputs_max_unpool_grad to find out more).
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_forward_mode_AD'),
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_gradgrad'),
-               DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),
                DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad'),
            )),
     OpInfo('nn.functional.max_unpool3d',
@@ -11576,7 +11579,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         decorators=[
             # FIXME: second derivative is implemented but seems to be incorrect
             # https://github.com/pytorch/pytorch/issues/68760
-            DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_gradgrad'),
+            DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_gradgrad'),
             # RuntimeError: Cannot insert a Tensor that requires grad as a constant.
             # Consider making it a parameter or input, or detaching the gradient
             # https://github.com/pytorch/pytorch/issues/68752
@@ -11639,7 +11642,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
             # In-place operations do not play well with forward AD
             # https://github.com/pytorch/pytorch/issues/77447
-            DecorateInfo(unittest.expectedFailure, 'TestGradients',
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients',
                          'test_inplace_forward_mode_AD'),
             # The noise vector that's generated in these tests is not the same elementwise
             DecorateInfo(unittest.skip("Different noise"), 'TestUnaryUfuncs', 'test_batch_vs_slicing'),
@@ -11787,7 +11790,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                 toleranceOverride({torch.float16: tol(atol=1e-04, rtol=0.001)}), 'TestUnaryUfuncs', device_type='cuda',), ],
         skips=[
             # still want to test that first derivative works though second derivative isn't supported
-            DecorateInfo(unittest.expectedFailure, 'TestGradients', "test_inplace_gradgrad"),
+            DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', "test_inplace_gradgrad"),
             # produces 0 instead of nan on ROCM
             DecorateInfo(unittest.expectedFailure,
                          'TestUnaryUfuncs', "test_reference_numerics_extremal",
@@ -12575,7 +12578,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    skips=(
                        # test_ops already tested for this overload with `decimals_0` opinfo entry
                        DecorateInfo(unittest.skip("Skipped!"), 'TestCommon'),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestJit'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits'),
                        DecorateInfo(toleranceOverride({torch.bfloat16: tol(atol=1e-3, rtol=0.016)}),
@@ -12600,7 +12604,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    skips=(
                        # test_ops already tested for this overload with `decimals_0` opinfo entry
                        DecorateInfo(unittest.skip("Skipped!"), 'TestCommon'),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestJit'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits'),
                    ),
@@ -12890,7 +12895,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                         DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
                         DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit',),
                         # TODO: FIXME tolerance is too high
-                        DecorateInfo(unittest.skip('Skipped!'), 'TestGradients'),
+                        DecorateInfo(unittest.skip('Skipped!'), 'TestFwdGradients'),
+                        DecorateInfo(unittest.skip('Skipped!'), 'TestBwdGradients'),
                     ),
                     assert_autodiffed=True,
                     autodiff_nonfusible_nodes=['aten::pow'],),
@@ -13078,7 +13084,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # AssertionError: Scalars are not equal!
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
                # Gradcheck fails
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad',
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_fn_fwgrad_bwgrad',
                             dtypes=floating_and_complex_types()),
                DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out',
                             device_type='mps', dtypes=[torch.float32]),
@@ -13443,7 +13449,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                         #  tensor([[0.]], dtype=torch.float64)
                         # Analytical:
                         # tensor([[-0.0047]], dtype=torch.float64, grad_fn=<CopySlices>)
-                        DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
+                        DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_fn_fwgrad_bwgrad'),
                     )),
     # TODO(@kshitij12345): Refactor similar to `mvlgamma` entries.
     # To test reference numerics against multiple values of argument `n`,
@@ -13473,7 +13479,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    sample_inputs_func=sample_inputs_polygamma,
                    skips=(
                        # Redundant tests
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestJit'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestCommon'),
@@ -13495,7 +13502,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    sample_inputs_func=sample_inputs_polygamma,
                    skips=(
                        # Redundant tests
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestJit'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestCommon'),
@@ -13519,7 +13527,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    sample_inputs_func=sample_inputs_polygamma,
                    skips=(
                        # Redundant tests
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestJit'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestCommon'),
@@ -13540,7 +13549,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    sample_inputs_func=sample_inputs_polygamma,
                    skips=(
                        # Redundant tests
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients'),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestJit'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestCommon'),
@@ -13867,7 +13877,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_conversion,
         skips=(
             # autograd tests don't handle operators that change dtype
-            DecorateInfo(unittest.expectedFailure, 'TestGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestBwdGradients'),
             DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
             # RuntimeError: attribute lookup is not defined on builtin
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
@@ -13935,7 +13946,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_conversion,
         skips=(
             # autograd tests don't handle operators that change dtype
-            DecorateInfo(unittest.expectedFailure, 'TestGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestBwdGradients'),
             DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
             # RuntimeError: attribute lookup is not defined on builtin
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
@@ -13949,7 +13961,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         supports_autograd=True,
         skips=(
             # autograd tests don't handle operators that change dtype
-            DecorateInfo(unittest.expectedFailure, 'TestGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestBwdGradients'),
             DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
             # RuntimeError: attribute lookup is not defined on builtin
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
@@ -14015,7 +14028,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_conversion,
         skips=(
             # autograd tests don't handle operators that change dtype
-            DecorateInfo(unittest.expectedFailure, 'TestGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestBwdGradients'),
             DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
             # RuntimeError: attribute lookup is not defined on builtin
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
@@ -14029,7 +14043,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_conversion,
         skips=(
             # autograd tests don't handle operators that change dtype
-            DecorateInfo(unittest.expectedFailure, 'TestGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients'),
+            DecorateInfo(unittest.expectedFailure, 'TestBwdGradients'),
             # use of lambda doesn't work with test_normalize_operator_exhaustive
             DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
             # RuntimeError: "sum_cpu" not implemented for 'ComplexHalf'
@@ -14454,7 +14469,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # UserWarning not triggered : Resized a non-empty tensor but did not warn about it.
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
                # Computed gradient is incorrect -- would be an exfail but gradgrad somehow passes
-               DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestGradients'),
+               DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestFwdGradients'),
+               DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestBwdGradients'),
                DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'))),
     OpInfo('normal',
            # This has its own variant b/c OpInfos assume the first arg is a Tensor but it is not here
@@ -14472,9 +14488,10 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # AssertionError: JIT Test does not execute any logic
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
                # NotImplementedError not raised
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_fn_fwgrad_bwgrad'),
                # Computed gradient is incorrect -- would be an exfail but gradgrad somehow passes
-               DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestGradients'),
+               DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestFwdGradients'),
+               DecorateInfo(unittest.skip("Gradients are incorrect!"), 'TestBwdGradients'),
                DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'))),
     OpInfo('bernoulli',
            op=lambda inp, *args, **kwargs:
@@ -14492,7 +14509,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            error_inputs_func=error_inputs_bernoulli,
            skips=(
                # vmap: We do not yet support calling random operations inside of vmap
-               DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
                DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"),
                # AssertionError: JIT Test does not execute any logic
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
@@ -15055,9 +15072,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            check_batched_grad=False,
            check_batched_gradgrad=False,
            skips=(
-               # to_sparse does not support automatic differentiation for outputs with complex dtype
-               DecorateInfo(unittest.expectedFailure, 'TestGradients',
-                            'test_nondifferentiable', dtypes=(torch.cdouble,)),
                # NotImplementedError: Could not run 'aten::normal_' with arguments from the 'SparseCPU' backend
                DecorateInfo(unittest.skip(""), 'TestCommon', 'test_noncontiguous_samples'),
                # TODO: FIXME: complex inputs requiring grad error in forward
@@ -15652,8 +15666,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             # torch.autograd.gradcheck.GradcheckError: While computing batched gradients, got:
             # vmap: We do not yet support calling random operations inside of vmap.
             # Please perform random operations outside of vmap as a workaround
-            DecorateInfo(unittest.expectedFailure, 'TestGradients', "test_forward_mode_AD"),
-            DecorateInfo(unittest.expectedFailure, 'TestGradients', "test_inplace_forward_mode_AD"),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', "test_forward_mode_AD"),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', "test_inplace_forward_mode_AD"),
             DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu')),
         # Runs very slowly on slow gradcheck - alternatively reduce input sizes
         gradcheck_fast_mode=True,
@@ -16111,14 +16125,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             # torch.autograd.gradcheck.GradcheckError: Jacobian mismatch for output 0 with respect to input 0
             DecorateInfo(
                 unittest.expectedFailure,
-                "TestGradients",
+                "TestBwdGradients",
                 "test_fn_grad",
                 dtypes=(torch.float64,),
             ),
             # RuntimeError: derivative for aten::_ctc_loss_backward is not implemented
             DecorateInfo(
                 unittest.expectedFailure,
-                "TestGradients",
+                "TestBwdGradients",
                 "test_fn_gradgrad",
                 dtypes=(torch.float64,),
             ),
@@ -16368,9 +16382,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             # Pre-existing condition (calls .item); needs to be fixed
             DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_backward'),
             # Not implemented
-            DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_forward_mode_AD'),
-            DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_inplace_forward_mode_AD'),
-            DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad'),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_inplace_forward_mode_AD'),
+            DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_fn_fwgrad_bwgrad'),
         ),
     ),
     OpInfo(
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index e9982e9dc5779..ba8e8db2e4a63 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -36,7 +36,7 @@
 from copy import deepcopy
 from enum import Enum
 from functools import partial, wraps
-from itertools import product
+from itertools import product, chain
 from pathlib import Path
 from statistics import mean
 from typing import (
@@ -3607,3 +3607,135 @@ def custom_op(opname, symbolic_fn, opset_version):
         yield
     finally:
         unregister_custom_op_symbolic(opname, opset_version)
+
+
+class TestGradients(TestCase):
+    exact_dtype = True
+
+    # Copies inputs to inplace operations to avoid inplace modifications
+    #   to leaves requiring gradient
+    def _get_safe_inplace(self, inplace_variant):
+        @wraps(inplace_variant)
+        def _fn(t, *args, **kwargs):
+            return inplace_variant(t.clone(), *args, **kwargs)
+
+        return _fn
+
+    def _check_helper(self, device, dtype, op, variant, check, *, check_forward_ad=False, check_backward_ad=True,
+                      check_batched_grad=None, check_batched_forward_grad=False):
+        assert check in ('gradcheck', 'bwgrad_bwgrad', 'fwgrad_bwgrad')
+        # NB: check_backward_ad does not affect gradgradcheck (always True)
+        if variant is None:
+            self.skipTest("Skipped! Variant not implemented.")
+        if not op.supports_dtype(dtype, torch.device(device).type):
+            self.skipTest(f"Skipped! {op.name} does not support dtype {str(dtype)}")
+
+        def is_inplace(variant):
+            if hasattr(variant, "__wrapped__"):
+                return variant.__wrapped__ is op.get_inplace()
+            return variant is op.get_inplace()
+
+        include_conjugated_inputs = op.test_conjugated_samples and dtype.is_complex
+
+        samples = op.sample_inputs(device, dtype, requires_grad=True, include_conjugated_inputs=include_conjugated_inputs,
+                                   small_inputs_only=is_slow_gradcheck_env())
+
+        for sample in samples:
+            if sample.broadcasts_input and is_inplace(variant):
+                continue
+
+            # Gradcheck expects tensors as its input, but autograd actually supports tensorlists
+            #   and tensors passed as kwargs. The following creates a function that accepts just
+            #   the tensors that require grad as varargs, and then recomposes them back into the
+            #   original input.
+
+            # Creates gradcheck inputs by identifying tensors requiring grad
+            all_args = None
+            if is_iterable_of_tensors(sample.input):
+                all_args = chain(sample.input, sample.args, sample.kwargs.values())
+            else:
+                all_args = tuple(chain((sample.input,), sample.args, sample.kwargs.values()))
+            gradcheck_args = tuple(x for x in all_args if (isinstance(x, torch.Tensor) and x.requires_grad))
+
+            def _input_recomposition_helper(inputs, inp, input_idx):
+                if is_iterable_of_tensors(inp):
+                    tensor_list = []
+                    for x in inp:
+                        if isinstance(x, torch.Tensor) and x.requires_grad:
+                            tensor_list.append(inputs[input_idx])
+                            input_idx = input_idx + 1
+                        else:
+                            tensor_list.append(x)
+                    return tensor_list, input_idx
+                elif isinstance(inp, torch.Tensor) and inp.requires_grad:
+                    return inputs[input_idx], input_idx + 1
+                else:
+                    return inp, input_idx
+
+            def fn(*inputs):
+                # Puts inputs back into sample properly
+                positional_args = []
+                input_idx = 0
+                inp, input_idx = _input_recomposition_helper(inputs, sample.input, input_idx)
+                positional_args.append(inp)
+
+                for x in sample.args:
+                    inp, input_idx = _input_recomposition_helper(inputs, x, input_idx)
+                    positional_args.append(inp)
+
+                # Recreates kwargs
+                kwargs = {}
+                for k, v in sample.kwargs.items():
+                    inp, input_idx = _input_recomposition_helper(inputs, v, input_idx)
+                    kwargs[k] = inp
+
+                output = op.gradcheck_wrapper(variant, *positional_args, **kwargs)
+                if sample.output_process_fn_grad is not None:
+                    return sample.output_process_fn_grad(output)
+                return output
+
+            if check == 'gradcheck':
+                if check_batched_grad is None:
+                    check_batched_grad = op.check_batched_grad
+                self.assertTrue(gradcheck(fn, gradcheck_args,
+                                          check_batched_grad=check_batched_grad,
+                                          check_grad_dtypes=True,
+                                          nondet_tol=op.gradcheck_nondet_tol,
+                                          fast_mode=op.gradcheck_fast_mode,
+                                          check_forward_ad=check_forward_ad,
+                                          check_backward_ad=check_backward_ad,
+                                          check_undefined_grad=True,
+                                          check_batched_forward_grad=check_batched_forward_grad))
+            elif check in ('bwgrad_bwgrad', 'fwgrad_bwgrad'):  # gradgrad check
+                self.assertFalse(check_forward_ad, msg="Cannot run forward AD check for gradgradcheck")
+                for gen_non_contig_grad_outputs in (False, True):
+                    kwargs = {
+                        "gen_non_contig_grad_outputs": gen_non_contig_grad_outputs,
+                        "check_batched_grad": op.check_batched_gradgrad,
+                        "check_grad_dtypes": True,
+                        "nondet_tol": op.gradcheck_nondet_tol,
+                        "fast_mode": op.gradcheck_fast_mode
+                    }
+                    if check == "fwgrad_bwgrad":
+                        kwargs["check_fwd_over_rev"] = True
+                        kwargs["check_rev_over_rev"] = False
+                        kwargs["check_batched_grad"] = False
+                        kwargs["check_undefined_grad"] = False
+
+                    self.assertTrue(gradgradcheck(fn, gradcheck_args, **kwargs))
+            else:
+                self.assertTrue(False, msg="Unknown check requested!")
+
+    def _grad_test_helper(self, device, dtype, op, variant, *, check_forward_ad=False, check_backward_ad=True,
+                          check_batched_grad=None, check_batched_forward_grad=False):
+        return self._check_helper(device, dtype, op, variant, 'gradcheck', check_forward_ad=check_forward_ad,
+                                  check_backward_ad=check_backward_ad, check_batched_grad=check_batched_grad,
+                                  check_batched_forward_grad=check_batched_forward_grad)
+
+    def _skip_helper(self, op, device, dtype):
+        if dtype not in op.supported_backward_dtypes(torch.device(device).type):
+            self.skipTest("Skipped! Op doesn't support autograd for this dtype.")
+        if not op.supports_autograd and not op.supports_forward_ad:
+            self.skipTest("Skipped! autograd not supported.")
+        if op.name == "cat":
+            self.skipTest("TODO(whc) fix pre-existing bug with cat for newly added opinfo for empty+nonempty")
diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py
index f35f04fc29a75..92231229bb5ec 100644
--- a/torch/testing/_internal/opinfo/definitions/_masked.py
+++ b/torch/testing/_internal/opinfo/definitions/_masked.py
@@ -1109,7 +1109,10 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
                 unittest.skip("Skipped!"), "TestJit", "test_variant_consistency_jit"
             ),
             DecorateInfo(
-                unittest.skip("Skipped!"), "TestGradients", "test_fn_gradgrad"
+                unittest.skip("Skipped!"), "TestFwdGradients", "test_fn_gradgrad"
+            ),
+            DecorateInfo(
+                unittest.skip("Skipped!"), "TestBwdGradients", "test_fn_gradgrad"
             ),
         ),
         sample_inputs_func=sample_inputs_masked_logaddexp,
diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py
index 193f1f2db85cc..e0d60c08022f1 100644
--- a/torch/testing/_internal/opinfo/definitions/linalg.py
+++ b/torch/testing/_internal/opinfo/definitions/linalg.py
@@ -1133,7 +1133,7 @@ def make_input():
             ),
             DecorateInfo(
                 unittest.skip("Gradients are incorrect on macos"),
-                "TestGradients",
+                "TestBwdGradients",
                 "test_fn_grad",
                 device_type="cpu",
                 dtypes=(torch.float64,),
@@ -1141,7 +1141,7 @@ def make_input():
             ),
             DecorateInfo(
                 unittest.skip("Gradients are incorrect on macos"),
-                "TestGradients",
+                "TestFwdGradients",
                 "test_forward_mode_AD",
                 device_type="cpu",
                 dtypes=(torch.float64,),
@@ -1150,25 +1150,25 @@ def make_input():
             # Both Hessians are incorrect on complex inputs??
             DecorateInfo(
                 unittest.expectedFailure,
-                "TestGradients",
+                "TestBwdGradients",
                 "test_fn_gradgrad",
                 dtypes=(torch.complex128,),
             ),
             DecorateInfo(
                 unittest.expectedFailure,
-                "TestGradients",
+                "TestFwdGradients",
                 "test_fn_fwgrad_bwgrad",
                 dtypes=(torch.complex128,),
             ),
             DecorateInfo(
                 unittest.skip("Skipped, see https://github.com//issues/84192"),
-                "TestGradients",
+                "TestBwdGradients",
                 "test_fn_gradgrad",
                 device_type="cuda",
             ),
             DecorateInfo(
                 unittest.skip("Skipped, see https://github.com//issues/84192"),
-                "TestGradients",
+                "TestFwdGradients",
                 "test_fn_fwgrad_bwgrad",
                 device_type="cuda",
             ),
@@ -1415,7 +1415,7 @@ def make_input():
             ),
             DecorateInfo(
                 unittest.skip("Skipped! Flaky"),
-                "TestGradients",
+                "TestFwdGradients",
                 "test_fn_fwgrad_bwgrad",
                 device_type="cpu",
                 dtypes=(torch.complex128,),
@@ -1464,7 +1464,8 @@ def make_input():
         skips=(
             # we skip gradient checks for this suite as they are tested in
             # variant_test_name='grad_oriented'
-            DecorateInfo(unittest.skip("Skipped!"), "TestGradients"),
+            DecorateInfo(unittest.skip("Skipped!"), "TestFwdGradients"),
+            DecorateInfo(unittest.skip("Skipped!"), "TestBwdGradients"),
             # The values for attribute 'shape' do not match
             DecorateInfo(unittest.skip("Skipped!"), "TestCommon", "test_out"),
             DecorateInfo(
@@ -1585,7 +1586,9 @@ def make_input():
         check_batched_forward_grad=False,
         supports_fwgrad_bwgrad=True,
         skips=(
-            DecorateInfo(unittest.expectedFailure, "TestGradients", "test_fn_gradgrad"),
+            DecorateInfo(
+                unittest.expectedFailure, "TestBwdGradients", "test_fn_gradgrad"
+            ),
         ),
     ),
     OpInfo(
@@ -1606,14 +1609,16 @@ def make_input():
         skips=(
             # [NEW] Skips specifically for sample inputs at zero
             # norm's vjp/jvp are not well-conditioned near zero
-            DecorateInfo(unittest.expectedFailure, "TestGradients", "test_fn_gradgrad"),
             DecorateInfo(
-                unittest.expectedFailure, "TestGradients", "test_fn_fwgrad_bwgrad"
+                unittest.expectedFailure, "TestBwdGradients", "test_fn_gradgrad"
+            ),
+            DecorateInfo(
+                unittest.expectedFailure, "TestFwdGradients", "test_fn_fwgrad_bwgrad"
             ),
             DecorateInfo(
-                unittest.expectedFailure, "TestGradients", "test_forward_mode_AD"
+                unittest.expectedFailure, "TestFwdGradients", "test_forward_mode_AD"
             ),
-            DecorateInfo(unittest.expectedFailure, "TestGradients", "test_fn_grad"),
+            DecorateInfo(unittest.expectedFailure, "TestBwdGradients", "test_fn_grad"),
         ),
     ),
     OpInfo(
@@ -2012,7 +2017,7 @@ def make_input():
             # CUDA runs out of memory
             DecorateInfo(
                 unittest.skip("Skipped!"),
-                "TestGradients",
+                "TestFwdGradients",
                 "test_fn_fwgrad_bwgrad",
                 device_type="cuda",
                 dtypes=[torch.cdouble],
@@ -2020,7 +2025,7 @@ def make_input():
             # This test takes almost 2 hours to run!
             DecorateInfo(
                 unittest.skip("Skipped!"),
-                "TestGradients",
+                "TestBwdGradients",
                 "test_fn_gradgrad",
                 device_type="cuda",
                 dtypes=[torch.cdouble],
@@ -2072,7 +2077,7 @@ def make_input():
             # This test is flaky under slow gradcheck, likely due to rounding issues
             DecorateInfo(
                 skipIfSlowGradcheckEnv,
-                "TestGradients",
+                "TestFwdGradients",
                 "test_fn_fwgrad_bwgrad",
                 device_type="cuda",
             ),
diff --git a/torch/testing/_internal/opinfo/definitions/special.py b/torch/testing/_internal/opinfo/definitions/special.py
index cafc7aba12424..f05b996f82d90 100644
--- a/torch/testing/_internal/opinfo/definitions/special.py
+++ b/torch/testing/_internal/opinfo/definitions/special.py
@@ -164,7 +164,7 @@ def sample_inputs_entr(op_info, device, dtype, requires_grad, **kwargs):
             # Dispatch stub: unsupported device typemeta
             DecorateInfo(
                 unittest.expectedFailure,
-                "TestGradients",
+                "TestFwdGradients",
                 "test_fn_fwgrad_bwgrad",
                 device_type="meta",
             ),

From 92edc1e689eb1c896b6e2aff70515adeaf37595c Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki <mikaylagawarecki@gmail.com>
Date: Wed, 2 Nov 2022 20:28:39 +0000
Subject: [PATCH 0504/1922] Added add/mul for nested dense [B, *, D], [B, 1, D]
 case (CUDA-only) (#88289)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88289
Approved by: https://github.com/cpuhrsch
---
 .../native/nested/NestedTensorBinaryOps.cpp   |  36 ++++++
 .../native/nested/NestedTensorBinaryOps.h     |  16 +++
 .../nested/cuda/NestedTensorBinaryOps.cu      | 120 ++++++++++++++++++
 test/test_nestedtensor.py                     |  15 +++
 4 files changed, 187 insertions(+)
 create mode 100644 aten/src/ATen/native/nested/NestedTensorBinaryOps.h
 create mode 100644 aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu

diff --git a/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp b/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
index 18ada308c9b24..215252f91d6d2 100644
--- a/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorBinaryOps.cpp
@@ -1,4 +1,5 @@
 #include <ATen/native/nested/NestedTensorMath.h>
+#include  <ATen/native/nested/NestedTensorBinaryOps.h>
 
 #include <ATen/AccumulateType.h>
 #include <ATen/Dispatch.h>
@@ -18,6 +19,9 @@
 namespace at {
 namespace native {
 
+DEFINE_DISPATCH(nested_dense_elementwise_stub);
+REGISTER_NO_CPU_DISPATCH(nested_dense_elementwise_stub);
+
 std::pair<NestedTensorImpl*, NestedTensorImpl*>
 get_elementwise_nested_tensor_impl(
     const Tensor& self,
@@ -95,6 +99,38 @@ Tensor NestedTensor_elementwise_Tensor(
       self_impl->get_storage_offsets()
     );
   }
+  // special case when other is dense
+  if (self.is_nested() && !other.is_nested()) {
+    // check for the [B, *, D], [B, 1, D] esuhm case
+    // TODO: this if statement is ugly and hopefully we will remove this in the near future
+    auto self_ptr = get_nested_tensor_impl(self);
+    if (self_ptr->dim() == 3 &&
+        other.dim() == 3 &&
+        self_ptr->size(0) == other.size(0) &&
+        other.size(1) == 1 &&
+        self_ptr->opt_size(2).has_value() &&
+        self_ptr->opt_size(2).value() == other.size(2) &&
+        self.is_cuda() &&
+        other.is_cuda()) {
+      if (!nested_tensor_impl_is_contiguous(self_ptr)) {
+        self_ptr = get_nested_tensor_impl(self.contiguous());
+      }
+      const auto self_buffer = self_ptr->get_buffer();
+      const auto self_sizes = self_ptr->get_nested_size_tensor();
+      auto result_buffer = at::empty_like(self_buffer);
+      auto result = wrap_buffer(result_buffer, self_sizes);
+      if (op_name == "add") {
+        nested_dense_elementwise_stub(self.device().type(), result, self, other, NESTED_DENSE_OP::ADD);
+      } else if (op_name == "mul") {
+        nested_dense_elementwise_stub(self.device().type(), result, self, other, NESTED_DENSE_OP::MUL);
+      } else {
+        TORCH_CHECK(false, "Unsupported nested dense elementwise op");
+      }
+      return result;
+    }
+    TORCH_CHECK(false, "Expected both self and other to be nested, but got a nested self and non-nested other.");
+  }
+
   NestedTensorImpl* self_impl = nullptr;
   NestedTensorImpl* other_impl = nullptr;
   std::tie(self_impl, other_impl) =
diff --git a/aten/src/ATen/native/nested/NestedTensorBinaryOps.h b/aten/src/ATen/native/nested/NestedTensorBinaryOps.h
new file mode 100644
index 0000000000000..51eeaf2919111
--- /dev/null
+++ b/aten/src/ATen/native/nested/NestedTensorBinaryOps.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <ATen/core/ATen_fwd.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+namespace native {
+
+enum class NESTED_DENSE_OP: uint8_t {ADD, MUL};
+
+using nested_dense_elementwise_fn = void (*)(Tensor& result, const Tensor & self, const Tensor & other, const NESTED_DENSE_OP& op);
+
+DECLARE_DISPATCH(nested_dense_elementwise_fn, nested_dense_elementwise_stub);
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu b/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu
new file mode 100644
index 0000000000000..678e62f5a81c6
--- /dev/null
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu
@@ -0,0 +1,120 @@
+#include <ATen/native/nested/NestedTensorBinaryOps.h>
+
+#include <type_traits>
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/KernelUtils.h>
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+
+#include <c10/cuda/CUDAMathCompat.h>
+#include <c10/cuda/CUDAStream.h>
+
+
+#include <ATen/native/nested/NestedTensorUtils.h>
+
+#define BLOCK_DIM 256
+
+namespace at {
+namespace native {
+
+
+// only for nested [B, *, D], dense [B, 1, D]
+template <typename T, typename func_t>
+__global__ void op_dense_esuhm(
+    const T* input,
+    const T* dense,
+    T* output,
+    int64_t embedding_dim,
+    const int64_t* offsets,
+    const func_t& f)
+{
+  // each batch is handled by a block
+  const int64_t batch_idx  = blockIdx.x;
+  const int64_t grain_size = blockDim.x;
+  const int64_t tid = threadIdx.x;
+  const int64_t range = offsets[batch_idx + 1] - offsets[batch_idx];
+  // each thread handles (embedding_dim // grain_size + (embedding_dim % grain_size <= tid)) elems
+  // of the dense embedding
+  for (int64_t idx = tid; idx < embedding_dim; idx += grain_size) {
+    const T dense_elem = dense[batch_idx * embedding_dim + idx];
+    for (int64_t nested_idx = idx; nested_idx < range; nested_idx += embedding_dim) {
+      output[offsets[batch_idx] + nested_idx] = f(input[offsets[batch_idx] + nested_idx], dense_elem);
+    }
+  }
+}
+
+template <typename T, typename func_t>
+void nested_op_dense_kernelLauncher(
+    const T* input, // [sum(*) x embedding_dim]
+    const T* dense, // [batch_size x embedding_dim]
+    T* output, // [sum(*) x embedding_dim]
+    int64_t batch_size,
+    int64_t embedding_dim,
+    const int64_t* input_offsets,  // [batch_size]
+    func_t f)
+{
+  dim3 grid;
+  grid.x = batch_size;
+  const auto stream = at::cuda::getDefaultCUDAStream();
+
+  op_dense_esuhm<<<grid, BLOCK_DIM, 0, stream>>>(
+      input,
+      dense,
+      output,
+      embedding_dim,
+      input_offsets,
+      f);
+}
+
+template <typename scalar_t, typename func_t>
+void _nested_op_dense_esuhm_kernel(Tensor& result, const Tensor& self, const Tensor& other, func_t f) {
+  auto self_ptr = get_nested_tensor_impl(self);
+  auto result_ptr = get_nested_tensor_impl(result);
+
+  const auto self_buffer = self_ptr->get_buffer();
+  const auto offsets = self_ptr->get_storage_offsets();
+  const auto batch_size = other.size(0);
+  const auto embedding_size = other.size(2);
+
+  auto result_buffer = result_ptr->get_buffer();
+  auto result_offsets = at::cat({at::tensor(offsets), at::tensor(self_ptr->numel())});
+  result_offsets = result_offsets.to(kCUDA);
+
+  const scalar_t* self_data_ptr = self_buffer.data_ptr<scalar_t>();
+  const scalar_t* other_data_ptr = other.data_ptr<scalar_t>();
+  scalar_t* result_data_ptr = result_buffer.data_ptr<scalar_t>();
+  int64_t* result_offsets_ptr = result_offsets.data_ptr<int64_t>();
+
+  nested_op_dense_kernelLauncher(
+    self_data_ptr,
+    other_data_ptr,
+    result_data_ptr,
+    batch_size,
+    embedding_size,
+    result_offsets_ptr,
+    f);
+}
+
+void _nested_op_dense_esuhm_cuda(Tensor& result, const Tensor& self, const Tensor& other, const NESTED_DENSE_OP& op) {
+  AT_DISPATCH_ALL_TYPES_AND2(
+    ScalarType::Half, ScalarType::BFloat16, self.scalar_type(), "_nested_op_dense_esuhm", [&]() {
+    switch (op) {
+      case NESTED_DENSE_OP::ADD :
+        _nested_op_dense_esuhm_kernel<scalar_t>(result, self, other, [] __host__ __device__ (scalar_t a, scalar_t b) -> scalar_t { return a + b; });
+        break;
+      case NESTED_DENSE_OP::MUL :
+        _nested_op_dense_esuhm_kernel<scalar_t>(result, self, other, [] __host__ __device__ (scalar_t a, scalar_t b) -> scalar_t { return a * b; });
+        break;
+    }
+  });
+}
+
+REGISTER_CUDA_DISPATCH(nested_dense_elementwise_stub, &_nested_op_dense_esuhm_cuda);
+
+} // namespace native
+} // namespace at
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index 663acaa40ce67..84a30e0125e49 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -831,6 +831,21 @@ def test_nested_tensor_add(self, device, dtype):
         out = nt1 + nt2
         self.assertEqual(ref, out)
 
+    @onlyCUDA
+    @dtypes(torch.float, torch.float16)
+    @torch.inference_mode()
+    @parametrize("embedding_dim", [8, 128, 256, 384])
+    def test_nested_tensor_dense_elementwise(self, device, dtype, embedding_dim):
+        batch_size = 32
+        seq_lens = torch.randint(low=0, high=10, size=(batch_size,))
+        ts = [torch.randn((seq_len, embedding_dim)) for seq_len in seq_lens]
+        nt = torch.nested.nested_tensor(ts, device=device, dtype=dtype)
+        t = torch.randn((batch_size, 1, embedding_dim), device=device, dtype=dtype)
+        ref_add = torch.nested.nested_tensor([t1 + t2 for (t1, t2) in zip(nt.unbind(), t.unbind())])
+        ref_mul = torch.nested.nested_tensor([t1 * t2 for (t1, t2) in zip(nt.unbind(), t.unbind())])
+        self.assertEqual(nt.add(t), ref_add)
+        self.assertEqual(nt.mul(t), ref_mul)
+
     @dtypes(torch.float, torch.float16)
     @skipMeta
     @torch.inference_mode()

From a032944648d88adc3c15140fcb593006d59b0971 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Thu, 3 Nov 2022 02:15:07 +0000
Subject: [PATCH 0505/1922] Not run inductor test in trunk (#88374)

Trying to not run in inductor tests in trunk at the moment because of CUDA issue with G5 runner:

* CUDA GPU not found https://github.com/pytorch/pytorch/actions/runs/3379516207/jobs/5611539300
* NVIDIA driver installation fails https://github.com/pytorch/pytorch/actions/runs/3379922198/jobs/5612458360
* Docker fails to start https://github.com/pytorch/pytorch/actions/runs/3381276196/jobs/5615513348
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88374
Approved by: https://github.com/desertfire
---
 .github/workflows/inductor.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index ef0c6a620b736..bd696795d2697 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -2,8 +2,6 @@ name: inductor
 
 on:
   push:
-    branches:
-      - master
     tags:
       - ciflow/inductor/*
   workflow_dispatch:

From a77ef0e8514056ada2cf0f8f9ded4d5e7bdf71f4 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 2 Nov 2022 15:42:08 -0700
Subject: [PATCH 0506/1922] [codegen] using TORCH_LIBRARY_FRAGMENT for some
 namespaces (#88229)

Summary:
Sometimes we want to extend an existing custom namespace library, instead of creating a new one,
but we don't have a namespace config right now, so we hardcode some custom libraries defined
in pytorch today, i.e. quantized and quantized_decomposed

Test Plan:
ci

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88229
Approved by: https://github.com/ezyang
---
 tools/test/test_codegen.py | 25 +++++++++++++++++++++++++
 torchgen/gen.py            | 10 +++++++++-
 torchgen/model.py          |  2 ++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/tools/test/test_codegen.py b/tools/test/test_codegen.py
index 8bcecbb26e32e..4a9585708890c 100644
--- a/tools/test/test_codegen.py
+++ b/tools/test/test_codegen.py
@@ -217,6 +217,14 @@ def setUp(self) -> None:
             loc=torchgen.model.Location(__file__, 1),
             valid_tags=set(),
         )
+        (
+            self.fragment_custom_native_function,
+            _,
+        ) = torchgen.model.NativeFunction.from_yaml(
+            {"func": "quantized_decomposed::func() -> bool"},
+            loc=torchgen.model.Location(__file__, 1),
+            valid_tags=set(),
+        )
 
     def test_default_namespace_schema_registration_code_valid(self) -> None:
         native_functions = [DEFAULT_NATIVE_FUNCTION]
@@ -237,6 +245,23 @@ def test_custom_namespace_schema_registration_code_valid(self) -> None:
 TORCH_LIBRARY(custom, m) {
   m.def("func() -> bool", {});
 
+};""",
+        )
+
+    def test_fragment_custom_namespace_schema_registration_code_valid(self) -> None:
+        """Sometimes we want to extend an existing namespace, for example quantized
+        namespace, which is already defined in native/quantized/library.cpp
+        """
+        _, registrations = get_native_function_schema_registrations(
+            native_functions=[self.fragment_custom_native_function],
+            schema_selector=self.selector,
+        )
+        self.assertEqual(
+            registrations,
+            """
+TORCH_LIBRARY_FRAGMENT(quantized_decomposed, m) {
+  m.def("func() -> bool", {});
+
 };""",
         )
 
diff --git a/torchgen/gen.py b/torchgen/gen.py
index 79970c94610dd..db207169d4d48 100644
--- a/torchgen/gen.py
+++ b/torchgen/gen.py
@@ -48,6 +48,7 @@
     BaseOperatorName,
     DEFAULT_KERNEL_NAMESPACE,
     DispatchKey,
+    FRAGMENT_NAMESPACES,
     FunctionSchema,
     is_cuda_dispatch_key,
     is_generic_dispatch_key,
@@ -1640,8 +1641,15 @@ def get_native_function_schema_registrations(
         else:
             custom_namespace = namespace
             tab = "\t"
+            # if the namespace is predefined, we should use define a library fragment
+            # instead of a new library
+            torch_library_macro = (
+                "TORCH_LIBRARY_FRAGMENT"
+                if namespace in FRAGMENT_NAMESPACES
+                else "TORCH_LIBRARY"
+            )
             schema_registrations += f"""
-TORCH_LIBRARY({custom_namespace}, m) {{
+{torch_library_macro}({custom_namespace}, m) {{
   {tab.join(schema_registrations_body)}
 }};"""
     return (aten_schema_registrations, schema_registrations)
diff --git a/torchgen/model.py b/torchgen/model.py
index c1b906dd1d85f..a2a658d0a59c1 100644
--- a/torchgen/model.py
+++ b/torchgen/model.py
@@ -58,6 +58,8 @@ def __str__(self) -> str:
     "Autograd" + component for component in BACKEND_COMPONENTS
 ]
 
+FRAGMENT_NAMESPACES = {"quantized", "quantized_decomposed"}
+
 # This doesn't have to be in sync with the header, it only needs to contain
 # entries that we actually use in the codegen or want pyi entries for
 class DispatchKey(Enum):

From f24ad047597556a07e8c33e7a15a969127028f89 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Thu, 3 Nov 2022 02:48:41 +0000
Subject: [PATCH 0507/1922] Improve perf by avoiding implicit string creation
 in c10_cuda_check_implementation (#88350)

Test Plan: Sandcastle

Differential Revision: D40949947

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88350
Approved by: https://github.com/Skylion007, https://github.com/soumith
---
 c10/cuda/CUDAException.cpp | 4 ++--
 c10/cuda/CUDAException.h   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/c10/cuda/CUDAException.cpp b/c10/cuda/CUDAException.cpp
index d35d72c9ba7ba..7813be5c1f665 100644
--- a/c10/cuda/CUDAException.cpp
+++ b/c10/cuda/CUDAException.cpp
@@ -9,8 +9,8 @@ namespace c10 {
 namespace cuda {
 
 void c10_cuda_check_implementation(
-    const std::string& filename,
-    const std::string& function_name,
+    const char* filename,
+    const char* function_name,
     const int line_number,
     const bool include_device_assertions) {
   // We retrieve the error here in order to keep CUDA data types out of
diff --git a/c10/cuda/CUDAException.h b/c10/cuda/CUDAException.h
index cfc7424503a96..ddc1eeeabf722 100644
--- a/c10/cuda/CUDAException.h
+++ b/c10/cuda/CUDAException.h
@@ -76,8 +76,8 @@ namespace cuda {
 /// In the event of a CUDA failure, formats a nice error message about that
 /// failure and also checks for device-side assertion failures
 C10_CUDA_API void c10_cuda_check_implementation(
-    const std::string& filename,
-    const std::string& function_name,
+    const char* filename,
+    const char* function_name,
     const int line_number,
     const bool include_device_assertions);
 

From b0990db0cd8153e21a3645225ac7f81e2bc6db8b Mon Sep 17 00:00:00 2001
From: Aaron Gokaslan <aaronGokaslan@gmail.com>
Date: Thu, 3 Nov 2022 02:53:26 +0000
Subject: [PATCH 0508/1922] Update pybind11 to v2.10.1 (#88332)

I am one of the maintainers of pybind11, and a frequent PyTorch user. We added quite a lot of bugfixes and performance improvements in 2.10.1 (see the changelog for full details) and I wanted to upstream them to PyTorch.

Our releases is tested throughout Google's codebase including on their global builds of PyTorch so there should be no surprises.

The main new feature is optin in Eigen Tensor to Numpy casters.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88332
Approved by: https://github.com/soumith
---
 third_party/pybind11 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/pybind11 b/third_party/pybind11
index aa304c9c7d725..80dc998efced8 160000
--- a/third_party/pybind11
+++ b/third_party/pybind11
@@ -1 +1 @@
-Subproject commit aa304c9c7d725ffb9d10af08a3b34cb372307020
+Subproject commit 80dc998efced8ceb2be59756668a7e90e8bef917

From 4ddd7660f3ef38d31eeef9b349f9b7a32a5f59a4 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 3 Nov 2022 02:57:24 +0000
Subject: [PATCH 0509/1922] [vision hash update] update the pinned vision hash
 (#88382)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88382
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index eabc4b3d7eb11..a21a31bd4419e 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-d95fbaf1efd5346a4afcf5b9953df75696432265
+e64784cdea465d833d9d0f66dc73d7abe217933d

From 4652581a8712ee679f44d7fa5e858c9b5141048a Mon Sep 17 00:00:00 2001
From: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Date: Thu, 3 Nov 2022 03:01:33 +0000
Subject: [PATCH 0510/1922] Update all ONNX symbolics with new JitScalarType
 API (#87245)

Fixes https://github.com/pytorch/pytorch/issues/84365 and more

This PR addresses not only the issue above, but the entire family of issues related to `torch._C.Value.type()` parsing when `scalarType()` or `dtype()` is not available.

This issue exists before `JitScalarType` was introduced, but the new implementation refactored the bug in because the new api `from_name` and `from_dtype` requires parsing `torch._C.Value.type()` to get proper inputs, which is exactly the root cause for this family of bugs.

Therefore `from_name` and `from_dtype` must be called when the implementor knows the `name` and `dtype` without parsing a `torch._C.Value`. To handle the corner cases hidden within `torch._C.Value`, a new `from_value` API was introduced and it should be used in favor of the former ones for most cases. The new API is safer and doesn't require type parsing from user, triggering JIT asserts in the core of pytorch.

Although CI is passing for all tests, please review carefully all symbolics/helpers refactoring to make sure the meaning/intetion of the old call are not changed in the new call

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87245
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_onnxruntime.py |   2 +-
 torch/_C/__init__.pyi.in                   |   2 +
 torch/onnx/_type_utils.py                  | 123 +++++++--
 torch/onnx/symbolic_helper.py              | 133 +++++-----
 torch/onnx/symbolic_opset10.py             |   2 +-
 torch/onnx/symbolic_opset11.py             |  98 +++----
 torch/onnx/symbolic_opset12.py             |  13 +-
 torch/onnx/symbolic_opset13.py             |   5 +-
 torch/onnx/symbolic_opset16.py             |  10 +-
 torch/onnx/symbolic_opset8.py              |  17 +-
 torch/onnx/symbolic_opset9.py              | 293 ++++++++++-----------
 11 files changed, 384 insertions(+), 314 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 7e1bc3d5dbeaf..1e36163d0394c 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -12477,7 +12477,7 @@ def test_optional_output(self, module_class: Type[torch.nn.Module], x_size: int)
             input_names=["x"],
         )
         exported = onnx.load_from_string(f.getvalue())
-        expected_elem_type = torch.onnx.JitScalarType.from_dtype(x.dtype).onnx_type()
+        expected_elem_type = torch.onnx.JitScalarType.from_value(x).onnx_type()
         expected_output_type = onnx.helper.make_optional_type_proto(
             onnx.helper.make_tensor_type_proto(expected_elem_type, (dynamic_axis_name,))
         )
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 9857cd3c91e3f..30416108acb49 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1338,6 +1338,8 @@ class JitType:
     def with_sizes(self, sizes: List[Optional[_int]]) -> JitType: ...
     def kind(self) -> str: ...
     def scalarType(self) -> Optional[str]: ...
+    def getElementType(self) -> JitType: ...
+    def dtype(self) -> Optional[_dtype]: ...
 
 class InferredType:
     def __init__(self, arg: Union[JitType, str]): ...
diff --git a/torch/onnx/_type_utils.py b/torch/onnx/_type_utils.py
index e1d482c9d7b32..a8a6e5b2e0ef0 100644
--- a/torch/onnx/_type_utils.py
+++ b/torch/onnx/_type_utils.py
@@ -2,14 +2,21 @@
 from __future__ import annotations
 
 import enum
+import typing
 from typing import Dict, Optional, Union
 
 from typing_extensions import Literal
 
 import torch
 from torch._C import _onnx as _C_onnx
+from torch.onnx import errors
 from torch.onnx._internal import _beartype
 
+
+if typing.TYPE_CHECKING:
+    # Hack to help mypy to recognize torch._C.Value
+    from torch import _C  # noqa: F401
+
 ScalarName = Literal[
     "Byte",
     "Char",
@@ -55,10 +62,16 @@ class JitScalarType(enum.IntEnum):
 
     Use ``JitScalarType`` to convert from torch and JIT scalar types to ONNX scalar types.
 
-    Examples::
-        >>> # xdoctest: +IGNORE_WANT("win32 has different output")
-        >>> JitScalarType.from_name("Float").onnx_type()
+    Examples:
+        >>> JitScalarType.from_value(torch.ones(1, 2)).onnx_type()
+        TensorProtoDataType.FLOAT
+
+        >>> JitScalarType.from_value(torch_c_value_with_type_float).onnx_type()
         TensorProtoDataType.FLOAT
+
+        >>> JitScalarType.from_dtype(torch.get_default_dtype).onnx_type()
+        TensorProtoDataType.FLOAT
+
     """
 
     # Order defined in https://github.com/pytorch/pytorch/blob/344defc9733a45fee8d0c4d3f5530f631e823196/c10/core/ScalarType.h
@@ -82,62 +95,138 @@ class JitScalarType(enum.IntEnum):
 
     @classmethod
     @_beartype.beartype
-    def from_name(
+    def _from_name(
         cls, name: Union[ScalarName, TorchName, Optional[str]]
     ) -> JitScalarType:
         """Convert a JIT scalar type or torch type name to ScalarType.
 
+        Note: DO NOT USE this API when `name` comes from a `torch._C.Value.type()` calls.
+            A "RuntimeError: INTERNAL ASSERT FAILED at "../aten/src/ATen/core/jit_type_base.h" can
+            be raised in several scenarios where shape info is not present.
+            Instead use `from_value` API which is safer.
+
         Args:
             name: JIT scalar type name (Byte) or torch type name (uint8_t).
 
         Returns:
-            ScalarType.
+            JitScalarType
 
         Raises:
-            ValueError: if name is not a valid scalar type name or if it is None.
+           OnnxExporterError: if name is not a valid scalar type name or if it is None.
         """
         if name is None:
-            raise ValueError("Scalar type name cannot be None")
+            raise errors.OnnxExporterError("Scalar type name cannot be None")
         if valid_scalar_name(name):
             return _SCALAR_NAME_TO_TYPE[name]  # type: ignore[index]
         if valid_torch_name(name):
             return _TORCH_NAME_TO_SCALAR_TYPE[name]  # type: ignore[index]
 
-        raise ValueError(f"Unknown torch or scalar type: '{name}'")
+        raise errors.OnnxExporterError(f"Unknown torch or scalar type: '{name}'")
 
     @classmethod
     @_beartype.beartype
-    def from_dtype(cls, dtype: torch.dtype) -> JitScalarType:
-        """Convert a torch dtype to ScalarType."""
+    def from_dtype(cls, dtype: Optional[torch.dtype]) -> JitScalarType:
+        """Convert a torch dtype to JitScalarType.
+
+        Note: DO NOT USE this API when `dtype` comes from a `torch._C.Value.type()` calls.
+            A "RuntimeError: INTERNAL ASSERT FAILED at "../aten/src/ATen/core/jit_type_base.h" can
+            be raised in several scenarios where shape info is not present.
+            Instead use `from_value` API which is safer.
+
+        Args:
+            dtype: A torch.dtype to create a JitScalarType from
+
+        Returns:
+            JitScalarType
+
+        Raises:
+            OnnxExporterError: if dtype is not a valid torch.dtype or if it is None.
+        """
         if dtype not in _DTYPE_TO_SCALAR_TYPE:
-            raise ValueError(f"Unknown dtype: {dtype}")
+            raise errors.OnnxExporterError(f"Unknown dtype: {dtype}")
         return _DTYPE_TO_SCALAR_TYPE[dtype]
 
+    @classmethod
+    @_beartype.beartype
+    def from_value(
+        cls, value: Union[None, torch._C.Value, torch.Tensor], default=None
+    ) -> JitScalarType:
+        """Create a JitScalarType from an value's scalar type.
+
+        Args:
+            value: An object to fetch scalar type from.
+            default: The JitScalarType to return if a valid scalar cannot be fetched from value
+
+        Returns:
+            JitScalarType.
+
+        Raises:
+            OnnxExporterError: if value does not have a valid scalar type and default is None.
+            SymbolicValueError: when value.type()'s info are empty and default is None
+        """
+
+        if not isinstance(value, (torch._C.Value, torch.Tensor)):
+            # default value of type JitScalarType is returned when value is not valid
+            if default is None:
+                raise errors.OnnxExporterError(
+                    "value must be either torch._C.Value or torch.Tensor objects."
+                )
+            elif not isinstance(default, JitScalarType):
+                raise errors.OnnxExporterError(
+                    "default value must be a JitScalarType object."
+                )
+            return default
+
+        # Each value type has their own way of storing scalar type
+        if isinstance(value, torch.Tensor):
+            return cls.from_dtype(value.dtype)
+        if isinstance(value.type(), torch.ListType):
+            try:
+                return cls.from_dtype(value.type().getElementType().dtype())
+            except RuntimeError:
+                return cls._from_name(str(value.type().getElementType()))
+
+        # value must be a non-list torch._C.Value scalar
+        scalar_type = value.type().scalarType()
+        if scalar_type is not None:
+            return cls._from_name(scalar_type)
+
+        # When everything fails... try to default
+        if default is not None:
+            return default
+        raise errors.SymbolicValueError(
+            f"Cannot determine scalar type for this '{type(value.type())}' instance and "
+            "a default value was not provided.",
+            value,
+        )
+
     @_beartype.beartype
     def scalar_name(self) -> ScalarName:
-        """Convert a ScalarType to a JIT scalar type name."""
+        """Convert a JitScalarType to a JIT scalar type name."""
         return _SCALAR_TYPE_TO_NAME[self]
 
     @_beartype.beartype
     def torch_name(self) -> TorchName:
-        """Convert a ScalarType to a torch type name."""
+        """Convert a JitScalarType to a torch type name."""
         return _SCALAR_TYPE_TO_TORCH_NAME[self]
 
     @_beartype.beartype
     def dtype(self) -> torch.dtype:
-        """Convert a ScalarType to a torch dtype."""
+        """Convert a JitScalarType to a torch dtype."""
         return _SCALAR_TYPE_TO_DTYPE[self]
 
     @_beartype.beartype
     def onnx_type(self) -> _C_onnx.TensorProtoDataType:
-        """Convert a ScalarType to an ONNX data type."""
+        """Convert a JitScalarType to an ONNX data type."""
         if self not in _SCALAR_TYPE_TO_ONNX:
-            raise ValueError(f"Scalar type {self} cannot be converted to ONNX")
+            raise errors.OnnxExporterError(
+                f"Scalar type {self} cannot be converted to ONNX"
+            )
         return _SCALAR_TYPE_TO_ONNX[self]
 
     @_beartype.beartype
     def onnx_compatible(self) -> bool:
-        """Return whether this ScalarType is compatible with ONNX."""
+        """Return whether this JitScalarType is compatible with ONNX."""
         return (
             self in _SCALAR_TYPE_TO_ONNX
             and self != JitScalarType.UNDEFINED
diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py
index 4b1a6179ba477..84224e88d86e3 100644
--- a/torch/onnx/symbolic_helper.py
+++ b/torch/onnx/symbolic_helper.py
@@ -431,11 +431,12 @@ def _if_scalar_type_as(self, tensor):
     if isinstance(self, _C.Value):
         return self
 
-    scalar_type = tensor.type().scalarType()
-    if scalar_type:
-        ty = scalar_type.lower()
+    scalar_type = _type_utils.JitScalarType.from_value(
+        tensor, _type_utils.JitScalarType.UNDEFINED
+    )
+    if scalar_type != _type_utils.JitScalarType.UNDEFINED:
+        ty = scalar_type.scalar_name().lower()
         return getattr(self, ty)()
-
     return self
 
 
@@ -492,11 +493,8 @@ def _is_scalar_list(x: _C.Value) -> bool:
     x_type = _as_list_type(x.type())
     if x_type is None:
         return False
-    element_type = str(x_type.getElementType())
-    return (
-        _type_utils.valid_torch_name(element_type)
-        and _type_utils.JitScalarType.from_name(element_type).onnx_compatible()
-    )
+    scalar_type = _type_utils.JitScalarType.from_value(x)
+    return scalar_type.onnx_compatible()
 
 
 @_beartype.beartype
@@ -507,7 +505,9 @@ def _is_tuple_construct(x: _C.Value) -> bool:
 @_beartype.beartype
 def is_complex_value(x: _C.Value) -> bool:
     assert _is_value(x)
-    return _type_utils.JitScalarType.from_name(x.type().scalarType()) in {
+    return _type_utils.JitScalarType.from_value(
+        x, _type_utils.JitScalarType.UNDEFINED
+    ) in {
         _type_utils.JitScalarType.COMPLEX32,
         _type_utils.JitScalarType.COMPLEX64,
         _type_utils.JitScalarType.COMPLEX128,
@@ -644,12 +644,13 @@ def symbolic_fn(*args, **kwargs):
 
 
 @_beartype.beartype
-def _try_get_scalar_type(*args) -> Optional[str]:
+def _try_get_scalar_type(*args) -> Optional[_type_utils.JitScalarType]:
     for arg in args:
-        try:
-            return arg.type().scalarType()
-        except RuntimeError:
-            pass
+        scalar_type = _type_utils.JitScalarType.from_value(
+            arg, _type_utils.JitScalarType.UNDEFINED
+        )
+        if scalar_type != _type_utils.JitScalarType.UNDEFINED:
+            return scalar_type
     return None
 
 
@@ -667,8 +668,13 @@ def _select_helper(g: jit_utils.GraphContext, self, dim, index, apply_reshape=Tr
                 g, index, g.op("Constant", value_t=torch.LongTensor([1]))
             )
 
-    index_scalar_type = index.type().scalarType()
-    if index_scalar_type is None or index_scalar_type not in {"Long", "Int"}:
+    index_scalar_type = _type_utils.JitScalarType.from_value(
+        index, _type_utils.JitScalarType.UNDEFINED
+    )
+    if index_scalar_type not in {
+        _type_utils.JitScalarType.INT64,
+        _type_utils.JitScalarType.INT,
+    }:
         index = g.op("Cast", index, to_i=_C_onnx.TensorProtoDataType.INT64)
     return g.op("Gather", self, index, axis_i=dim)
 
@@ -693,47 +699,23 @@ def _slice_helper(
         return _slice10(g, input, axes, starts, ends, steps, dynamic_slice)
 
 
-@_beartype.beartype
-def _is_in_type_group(value, scalar_types: Set[_type_utils.JitScalarType]) -> bool:
-    """Helper function for determining if a value is in a scalar type group."""
-    if value is None:
-        return False
-    if isinstance(value, torch.Tensor):
-        return _type_utils.JitScalarType.from_dtype(value.dtype) in scalar_types
-    elif isinstance(value.type(), torch.ListType):
-        return (
-            _type_utils.JitScalarType.from_dtype(value.type().getElementType().dtype())
-            in scalar_types
-        )
-    scalar_type = value.type().scalarType()
-    if scalar_type is None:
-        warnings.warn(
-            "Type cannot be inferred, which might cause exported graph to produce incorrect results."
-        )
-        return False
-    try:
-        return _type_utils.JitScalarType.from_name(scalar_type) in scalar_types
-    except ValueError:
-        # scalar_type is not a known ScalarType
-        return False
-
-
 @_beartype.beartype
 def _is_fp(value) -> bool:
-    return _is_in_type_group(
-        value,
-        {
-            _type_utils.JitScalarType.FLOAT,
-            _type_utils.JitScalarType.DOUBLE,
-            _type_utils.JitScalarType.HALF,
-            _type_utils.JitScalarType.BFLOAT16,
-        },
-    )
+    return _type_utils.JitScalarType.from_value(
+        value, _type_utils.JitScalarType.UNDEFINED
+    ) in {
+        _type_utils.JitScalarType.FLOAT,
+        _type_utils.JitScalarType.DOUBLE,
+        _type_utils.JitScalarType.HALF,
+        _type_utils.JitScalarType.BFLOAT16,
+    }
 
 
 @_beartype.beartype
 def _is_bool(value) -> bool:
-    return _is_in_type_group(value, {_type_utils.JitScalarType.BOOL})
+    return _type_utils.JitScalarType.from_value(
+        value, _type_utils.JitScalarType.UNDEFINED
+    ) in {_type_utils.JitScalarType.BOOL}
 
 
 @_beartype.beartype
@@ -785,7 +767,7 @@ def _topk_helper(
         k = g.op("Constant", value_t=torch.tensor([k], dtype=torch.int64))
     else:
         k = _reshape_helper(g, k, g.op("Constant", value_t=torch.tensor([1])))
-        if _try_get_scalar_type(k) != "Long":
+        if _try_get_scalar_type(k) != _type_utils.JitScalarType.INT64:
             k = g.op("Cast", k, to_i=_C_onnx.TensorProtoDataType.INT64)
     if g.opset <= 10:
         if not largest:
@@ -1261,13 +1243,14 @@ def _arange_cast_helper(
 ]:
     def _is_all_integral(scalars):
         for scalar in scalars:
-            try:
-                if scalar.type().scalarType() != "Long":
-                    return False
-            except Exception:
-                # FIXME(justinchuby): Avoid catching Exception.
-                # Catch a more specific exception instead.
-                pass
+            scalar_type = _type_utils.JitScalarType.from_value(
+                scalar, _type_utils.JitScalarType.UNDEFINED
+            )
+            if (
+                scalar_type != _type_utils.JitScalarType.INT64
+                and scalar_type != _type_utils.JitScalarType.UNDEFINED
+            ):
+                return False
         return True
 
     # This logic is based on torch.arange docs. If "dtype" is provided,
@@ -1375,9 +1358,7 @@ def _batchnorm_helper(
             )
         weight_value = torch.tensor(
             [1.0] * channel_size,
-            dtype=_type_utils.JitScalarType.from_name(
-                input.type().scalarType()
-            ).dtype(),
+            dtype=_type_utils.JitScalarType.from_value(input).dtype(),
         )
         weight = g.op("Constant", value_t=weight_value)
     if bias is None or _is_none(bias):
@@ -1388,9 +1369,7 @@ def _batchnorm_helper(
             )
         bias_value = torch.tensor(
             [0.0] * channel_size,
-            dtype=_type_utils.JitScalarType.from_name(
-                input.type().scalarType()
-            ).dtype(),
+            dtype=_type_utils.JitScalarType.from_value(input).dtype(),
         )
         bias = g.op("Constant", value_t=bias_value)
     # If track_running_stats is set to False batch statistics are instead used during evaluation time
@@ -1534,9 +1513,7 @@ def dequantize_helper(
     tensor, scale, zero_point = unpacked_qtensors[:3]
     axis = unpacked_qtensors[3] if len(unpacked_qtensors) >= 4 else None
     axis_i = _get_const(axis, "i", "axis")
-    input_scalar_type = tensor.type().scalarType()
-    assert input_scalar_type is not None
-    input_qdtype = _type_utils.JitScalarType.from_name(tensor.type().scalarType())
+    input_qdtype = _type_utils.JitScalarType.from_value(tensor)
     if qdtype is None:
         if input_qdtype is not None:
             qdtype = input_qdtype.onnx_type()
@@ -1598,11 +1575,19 @@ def quantize_helper(
         )
 
     assert scale is not None
-    if scale.type().scalarType() != "Float":
+    if (
+        _type_utils.JitScalarType.from_value(scale, _type_utils.JitScalarType.UNDEFINED)
+        != _type_utils.JitScalarType.FLOAT
+    ):
         scale = g.op("Cast", scale, to_i=_C_onnx.TensorProtoDataType.FLOAT)
 
     assert zero_point is not None
-    if zero_point.type().scalarType() not in ("Byte", "Char"):
+    if _type_utils.JitScalarType.from_value(
+        zero_point, _type_utils.JitScalarType.UNDEFINED
+    ) not in {
+        _type_utils.JitScalarType.UINT8,
+        _type_utils.JitScalarType.INT8,
+    }:
         zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.UINT8)
     output = g.op(
         "QuantizeLinear",
@@ -1643,8 +1628,10 @@ def requantize_bias_helper(
 @_beartype.beartype
 def args_have_same_dtype(args):
     assert args
-    base_dtype = args[0].type().scalarType()
-    has_same_dtype = all(elem.type().scalarType() == base_dtype for elem in args)
+    base_dtype = _type_utils.JitScalarType.from_value(args[0])
+    has_same_dtype = all(
+        _type_utils.JitScalarType.from_value(elem) == base_dtype for elem in args
+    )
     return has_same_dtype
 
 
diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py
index 27cb161a1ae30..d09133a60b9d0 100644
--- a/torch/onnx/symbolic_opset10.py
+++ b/torch/onnx/symbolic_opset10.py
@@ -603,7 +603,7 @@ def nan_to_num(g: jit_utils.GraphContext, input, nan, posinf, neginf):
     # return the original tensor
     if not symbolic_helper._is_fp(input):
         return input
-    input_dtype = _type_utils.JitScalarType.from_name(input.type().scalarType()).dtype()
+    input_dtype = _type_utils.JitScalarType.from_value(input).dtype()
     if nan is None:
         nan = 0.0
     nan_cond = opset9.isnan(g, input)
diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py
index 6c71cc1651562..3706c5336dfc8 100644
--- a/torch/onnx/symbolic_opset11.py
+++ b/torch/onnx/symbolic_opset11.py
@@ -94,11 +94,9 @@ def _apply(fn):
 @symbolic_helper.parse_args("v", "f", "f")
 @_beartype.beartype
 def hardtanh(g: jit_utils.GraphContext, self: _C.Value, min_val: float, max_val: float):
-    dtype = self.type().scalarType()
-    if dtype is None:
-        scalar_type = _type_utils.JitScalarType.FLOAT
-    else:
-        scalar_type = _type_utils.JitScalarType.from_name(dtype)
+    scalar_type = _type_utils.JitScalarType.from_value(
+        self, _type_utils.JitScalarType.FLOAT
+    )
     min_val = g.op(
         "Constant",
         value_t=torch.tensor(min_val, dtype=scalar_type.dtype()),
@@ -115,22 +113,23 @@ def hardtanh(g: jit_utils.GraphContext, self: _C.Value, min_val: float, max_val:
 @_onnx_symbolic("aten::clamp")
 @_beartype.beartype
 def clamp(g: jit_utils.GraphContext, self, min, max):
-    dtype = self.type().scalarType()
-
     @_beartype.beartype
     def _cast_if_not_none(tensor, dtype):
         if tensor is not None and not symbolic_helper._is_none(tensor):
             return g.op(
                 "Cast",
                 tensor,
-                to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type(),
+                to_i=dtype.onnx_type(),
             )
         else:
             return tensor
 
-    if dtype is not None:
-        min = _cast_if_not_none(min, dtype)
-        max = _cast_if_not_none(max, dtype)
+    scalar_type = _type_utils.JitScalarType.from_value(
+        self, _type_utils.JitScalarType.UNDEFINED
+    )
+    if scalar_type != _type_utils.JitScalarType.UNDEFINED:
+        min = _cast_if_not_none(min, scalar_type)
+        max = _cast_if_not_none(max, scalar_type)
 
     if symbolic_helper._is_none(min):
         return clamp_max(g, self, max)
@@ -152,8 +151,7 @@ def _cast_if_not_none(tensor, dtype):
 @symbolic_helper.parse_args("v", "v")
 @_beartype.beartype
 def clamp_min(g: jit_utils.GraphContext, self, min):
-    dtype = self.type().scalarType()
-    min = g.op("Cast", min, to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type())
+    min = g.op("Cast", min, to_i=_type_utils.JitScalarType.from_value(self).onnx_type())
     if symbolic_helper._get_tensor_rank(min) == 0:
         max = opset9.unused(g)
         return opset9._op_with_optional_float_cast(
@@ -167,8 +165,7 @@ def clamp_min(g: jit_utils.GraphContext, self, min):
 @symbolic_helper.parse_args("v", "v")
 @_beartype.beartype
 def clamp_max(g: jit_utils.GraphContext, self, max):
-    dtype = self.type().scalarType()
-    max = g.op("Cast", max, to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type())
+    max = g.op("Cast", max, to_i=_type_utils.JitScalarType.from_value(self).onnx_type())
     if symbolic_helper._get_tensor_rank(max) == 0:
         min = opset9.unused(g)
         return opset9._op_with_optional_float_cast(
@@ -182,11 +179,9 @@ def clamp_max(g: jit_utils.GraphContext, self, max):
 @_beartype.beartype
 def relu6(g: jit_utils.GraphContext, input):
     relu_ = opset9._op_with_optional_float_cast(g, "Relu", input, opset_before=14)
-    dtype = input.type().scalarType()
-    if dtype is None:
-        scalar_type = _type_utils.JitScalarType.FLOAT
-    else:
-        scalar_type = _type_utils.JitScalarType.from_name(dtype)
+    scalar_type = _type_utils.JitScalarType.from_value(
+        input, _type_utils.JitScalarType.FLOAT
+    )
     min_val = g.op(
         "Constant",
         value_t=torch.tensor(0, dtype=scalar_type.dtype()),
@@ -299,18 +294,23 @@ def index_put(
         values = opset9.expand(g, values, values_shape, None)
     values = symbolic_helper._reshape_helper(g, values, values_shape)
 
-    dtype = self.type().scalarType()
-    if dtype is not None and dtype != values.type().scalarType():
-        values = g.op(
-            "Cast", values, to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type()
+    self_scalar_type = _type_utils.JitScalarType.from_value(
+        self, _type_utils.JitScalarType.UNDEFINED
+    )
+    if self_scalar_type != _type_utils.JitScalarType.UNDEFINED:
+        values_scalar_type = _type_utils.JitScalarType.from_value(
+            values, _type_utils.JitScalarType.UNDEFINED
         )
-    scalar_type = _type_utils.JitScalarType.from_name(dtype)
+        if self_scalar_type != values_scalar_type:
+            values = g.op("Cast", values, to_i=self_scalar_type.onnx_type())
+    elif accumulate:
+        raise errors.SymbolicValueError("self does not have a valid scalar type.", self)
 
     if accumulate:
         zeros = g.op(
             "ConstantOfShape",
             g.op("Shape", self),
-            value_t=torch.tensor([0], dtype=scalar_type.dtype()),
+            value_t=torch.tensor([0], dtype=self_scalar_type.dtype()),
         )
         result = g.op("ScatterND", zeros, index, values)
         result = add(g, self, result)
@@ -398,20 +398,18 @@ def gather(g: jit_utils.GraphContext, self, dim, index, sparse_grad=False):
 def scatter(g: jit_utils.GraphContext, self, dim, index, src):
     if symbolic_helper.is_caffe2_aten_fallback():
         return g.at("scatter", self, dim, index, src, overload_name="src")
-    src_type = src.type().scalarType()
+    src_type = _type_utils.JitScalarType.from_value(src)
     src = symbolic_helper._maybe_get_scalar(src)
     if symbolic_helper._is_value(src):
         return g.op("ScatterElements", self, index, src, axis_i=dim)
     else:
         # Check if scalar "src" has same type as self (PyTorch allows different
         # type for scalar src (but not when src is tensor)). If not, insert Cast node.
-        if self.type().scalarType() != src_type:
+        if _type_utils.JitScalarType.from_value(self) != src_type:
             src = g.op(
                 "Cast",
                 src,
-                to_i=_type_utils.JitScalarType.from_name(
-                    self.type().scalarType()
-                ).onnx_type(),
+                to_i=_type_utils.JitScalarType.from_value(self).onnx_type(),
             )
         return g.op(
             "ScatterElements", self, index, opset9.expand_as(g, src, index), axis_i=dim
@@ -1024,7 +1022,9 @@ def index(g: jit_utils.GraphContext, self, index):
     if len(indices) == 1:
         index = indices[0]
         if not symbolic_helper._is_none(index) and (
-            symbolic_helper._is_bool(index) or index.type().scalarType() == "Byte"
+            symbolic_helper._is_bool(index)
+            or _type_utils.JitScalarType.from_value(index)
+            == _type_utils.JitScalarType.UINT8
         ):
             index = opset9.nonzero(g, index)
             return g.op("GatherND", self, index)
@@ -1071,16 +1071,19 @@ def index_copy(g: jit_utils.GraphContext, self, dim, index, source):
 def __rshift_(g: jit_utils.GraphContext, self, other):
     # make sure to cast other to self's type
     # (when self is long, make sure that other is not float)
-    if other.type().scalarType() != self.type().scalarType():
+    if _type_utils.JitScalarType.from_value(
+        other, _type_utils.JitScalarType.UNDEFINED
+    ) != _type_utils.JitScalarType.from_value(self):
         other = g.op(
             "Cast",
             other,
-            to_i=_type_utils.JitScalarType.from_name(
-                self.type().scalarType()
-            ).onnx_type(),
+            to_i=_type_utils.JitScalarType.from_value(self).onnx_type(),
         )
 
-    if self.type().scalarType() == "Byte":
+    if (
+        _type_utils.JitScalarType.from_value(self, _type_utils.JitScalarType.UNDEFINED)
+        == _type_utils.JitScalarType.UINT8
+    ):
         return g.op("BitShift", self, other, direction_s="RIGHT")
 
     two = g.op("Constant", value_t=torch.tensor(2, dtype=torch.float32))
@@ -1091,7 +1094,7 @@ def __rshift_(g: jit_utils.GraphContext, self, other):
     two_pow = g.op(
         "Cast",
         two_pow,
-        to_i=_type_utils.JitScalarType.from_name(self.type().scalarType()).onnx_type(),
+        to_i=_type_utils.JitScalarType.from_value(self).onnx_type(),
     )
     rshift = g.op("Div", self, two_pow)
     return rshift
@@ -1102,16 +1105,19 @@ def __rshift_(g: jit_utils.GraphContext, self, other):
 def __lshift_(g: jit_utils.GraphContext, self, other):
     # make sure to cast other to self's type
     # (when self is long, make sure that other is not float)
-    if other.type().scalarType() != self.type().scalarType():
+    if _type_utils.JitScalarType.from_value(
+        other, _type_utils.JitScalarType.UNDEFINED
+    ) != _type_utils.JitScalarType.from_value(self):
         other = g.op(
             "Cast",
             other,
-            to_i=_type_utils.JitScalarType.from_name(
-                self.type().scalarType()
-            ).onnx_type(),
+            to_i=_type_utils.JitScalarType.from_value(self).onnx_type(),
         )
 
-    if self.type().scalarType() == "Byte":
+    if (
+        _type_utils.JitScalarType.from_value(self, _type_utils.JitScalarType.UNDEFINED)
+        == _type_utils.JitScalarType.UINT8
+    ):
         return g.op("BitShift", self, other, direction_s="LEFT")
 
     two = g.op("Constant", value_t=torch.tensor(2, dtype=torch.float32))
@@ -1122,7 +1128,7 @@ def __lshift_(g: jit_utils.GraphContext, self, other):
     two_pow = g.op(
         "Cast",
         two_pow,
-        to_i=_type_utils.JitScalarType.from_name(self.type().scalarType()).onnx_type(),
+        to_i=_type_utils.JitScalarType.from_value(self).onnx_type(),
     )
     lshift = g.op("Mul", self, two_pow)
     return lshift
@@ -1313,9 +1319,7 @@ def linalg_vector_norm(
         cond_op = g.op(
             "Cast",
             cond_op,
-            to_i=_type_utils.JitScalarType.from_name(
-                self.type().scalarType()
-            ).onnx_type(),
+            to_i=_type_utils.JitScalarType.from_value(self).onnx_type(),
         )
         return symbolic_helper._reducesum_helper(
             g, cond_op, axes_i=dim, keepdims_i=keepdim
diff --git a/torch/onnx/symbolic_opset12.py b/torch/onnx/symbolic_opset12.py
index 2185b60c6b560..b318894a69238 100644
--- a/torch/onnx/symbolic_opset12.py
+++ b/torch/onnx/symbolic_opset12.py
@@ -74,13 +74,13 @@ def einsum(g: jit_utils.GraphContext, equation, tensor_list, path=None):
 @_beartype.beartype
 def outer(g: jit_utils.GraphContext, input, other):
     # make sure to cast other to self's type
-    if other.type().scalarType() != input.type().scalarType():
+    if _type_utils.JitScalarType.from_value(
+        other, _type_utils.JitScalarType.UNDEFINED
+    ) != _type_utils.JitScalarType.from_value(input):
         other = g.op(
             "Cast",
             other,
-            to_i=_type_utils.JitScalarType.from_name(
-                input.type().scalarType()
-            ).onnx_type(),
+            to_i=_type_utils.JitScalarType.from_value(input).onnx_type(),
         )
     return _einsum_helper(g, "i,j->ij", [input, other])
 
@@ -264,7 +264,10 @@ def binary_cross_entropy_with_logits(
 def celu(g: jit_utils.GraphContext, self, alpha):
     alpha = symbolic_helper._maybe_get_const(alpha, "f")
     # if the input is of type double cast it to float
-    if self.type().scalarType() == "Double":
+    if (
+        _type_utils.JitScalarType.from_value(self, _type_utils.JitScalarType.UNDEFINED)
+        == _type_utils.JitScalarType.DOUBLE
+    ):
         self = g.op("Cast", self, to_i=_C_onnx.TensorProtoDataType.FLOAT)
         out = g.op("Celu", self, alpha_f=alpha)
         return g.op("Cast", out, to_i=_C_onnx.TensorProtoDataType.DOUBLE)
diff --git a/torch/onnx/symbolic_opset13.py b/torch/onnx/symbolic_opset13.py
index 0fc7a219fb3ac..baecba0135fa5 100644
--- a/torch/onnx/symbolic_opset13.py
+++ b/torch/onnx/symbolic_opset13.py
@@ -391,7 +391,10 @@ def fake_quantize_per_tensor_affine(
         zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.UINT8)
     else:
         zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.INT8)
-    if scale.type().scalarType() != "Float":
+    if (
+        _type_utils.JitScalarType.from_value(scale, _type_utils.JitScalarType.UNDEFINED)
+        != _type_utils.JitScalarType.FLOAT
+    ):
         scale = g.op("Cast", scale, to_i=_C_onnx.TensorProtoDataType.FLOAT)
     quantized = g.op("QuantizeLinear", inputs, scale, zero_point)
     if (quant_min, quant_max) == (0, 127):
diff --git a/torch/onnx/symbolic_opset16.py b/torch/onnx/symbolic_opset16.py
index 77d46bb15a245..a2d3505072bac 100644
--- a/torch/onnx/symbolic_opset16.py
+++ b/torch/onnx/symbolic_opset16.py
@@ -69,7 +69,9 @@ def scatter_add(g: jit_utils.GraphContext, self, dim, index, src):
     if symbolic_helper.is_caffe2_aten_fallback():
         return g.at("scatter", self, dim, index, src, overload_name="src")
 
-    src_type = src.type().scalarType()
+    src_type = _type_utils.JitScalarType.from_value(
+        src, _type_utils.JitScalarType.UNDEFINED
+    )
     src_sizes = symbolic_helper._get_tensor_sizes(src)
     index_sizes = symbolic_helper._get_tensor_sizes(index)
 
@@ -85,13 +87,11 @@ def scatter_add(g: jit_utils.GraphContext, self, dim, index, src):
     else:
         # Check if scalar "src" has same type as self (PyTorch allows different
         # type for scalar src (but not when src is tensor)). If not, insert Cast node.
-        if self.type().scalarType() != src_type:
+        if _type_utils.JitScalarType.from_value(self) != src_type:
             src = g.op(
                 "Cast",
                 src,
-                to_i=_type_utils.JitScalarType.from_name(
-                    self.type().scalarType()
-                ).onnx_type(),
+                to_i=_type_utils.JitScalarType.from_value(self).onnx_type(),
             )
 
         return g.op(
diff --git a/torch/onnx/symbolic_opset8.py b/torch/onnx/symbolic_opset8.py
index e0a6401be1dfa..c7a771c8f894f 100644
--- a/torch/onnx/symbolic_opset8.py
+++ b/torch/onnx/symbolic_opset8.py
@@ -159,14 +159,21 @@ def __interpolate(
 #       issue for "cast" operators. Some symbolic functions depend on shape information of input tensor, which
 #       is lost after casting.
 def _try_cast_integer_to_float(g: jit_utils.GraphContext, *args):
-    floating_scalar_types = {"Half", "Float", "Double"}
+    floating_scalar_types = {
+        _type_utils.JitScalarType.HALF,
+        _type_utils.JitScalarType.FLOAT,
+        _type_utils.JitScalarType.DOUBLE,
+    }
     old_type = None
     # Cast the input tensor to Float if its scalarType is known and is not floating number.
     # If casting is performed, return the old scalarType, otherwise return None.
-    arg0_type = args[0].type().scalarType()
-    if arg0_type is not None:
+    arg0_type = _type_utils.JitScalarType.from_value(
+        args[0], _type_utils.JitScalarType.UNDEFINED
+    )
+    if arg0_type != _type_utils.JitScalarType.UNDEFINED:
         old_type = arg0_type
         if old_type not in floating_scalar_types:
+            old_type = old_type.scalar_name()
             args = tuple(
                 g.op("Cast", arg, to_i=_C_onnx.TensorProtoDataType.FLOAT)
                 for arg in args
@@ -248,9 +255,7 @@ def mm(g: jit_utils.GraphContext, self, other):
         )
     zero_constant = g.op(
         "Constant",
-        value_t=torch.tensor(
-            [0], dtype=_type_utils.JitScalarType.from_name(scalar_type).dtype()
-        ),
+        value_t=torch.tensor([0], dtype=scalar_type.dtype()),
     )
 
     if symbolic_helper._try_get_scalar_type(self):
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index d31bb8d1a9d62..546a4fa7ce260 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -427,23 +427,20 @@ def _trunc_divide(g: jit_utils.GraphContext, self, other):
 
     # Matching PyTorch's behavior:
     # - if self is fp the output's type is self's type
-    # - if self is not fp and other is fp, the output is of type "Float"
+    # - if self is not fp and other is fp, the output is of type JitScalarType.FLOAT
     # - self is not fp and other is not fp, the output's type is self's output type
     # - the output type defaults to Float
-    scalar_type = self.type().scalarType()
-
-    if scalar_type is not None:
-        if (
-            not symbolic_helper._is_fp(self)
-            and other.type().scalarType() is not None
-            and symbolic_helper._is_fp(other)
-        ):
+    scalar_type = _type_utils.JitScalarType.from_value(
+        self, _type_utils.JitScalarType.UNDEFINED
+    )
+    if scalar_type != _type_utils.JitScalarType.UNDEFINED:
+        if not symbolic_helper._is_fp(self) and symbolic_helper._is_fp(other):
             out = g.op("Cast", out, to_i=_C_onnx.TensorProtoDataType.FLOAT)
         else:
             out = g.op(
                 "Cast",
                 out,
-                to_i=_type_utils.JitScalarType.from_name(scalar_type).onnx_type(),
+                to_i=scalar_type.onnx_type(),
             )
     else:
         out = g.op("Cast", out, to_i=_C_onnx.TensorProtoDataType.FLOAT)
@@ -576,28 +573,26 @@ def matmul(g: jit_utils.GraphContext, self, other):
 @symbolic_helper.parse_args("v", "v", "v", "t", "t")
 @_beartype.beartype
 def addmm(g: jit_utils.GraphContext, self, mat1, mat2, beta, alpha):
-    dtype = None
-    self_dtype = symbolic_helper._try_get_scalar_type(self)
-    mat1_dtype = symbolic_helper._try_get_scalar_type(mat1)
-    mat2_dtype = symbolic_helper._try_get_scalar_type(mat2)
-    if self_dtype is not None:
-        dtype = self_dtype
-    elif mat1_dtype is not None:
-        dtype = mat1_dtype
-    elif mat2_dtype is not None:
-        dtype = mat2_dtype
+    scalar_type = None
+    self_scalar_type = symbolic_helper._try_get_scalar_type(self)
+    mat1_scalar_type = symbolic_helper._try_get_scalar_type(mat1)
+    mat2_scalar_type = symbolic_helper._try_get_scalar_type(mat2)
+    if self_scalar_type is not None:
+        scalar_type = self_scalar_type
+    elif mat1_scalar_type is not None:
+        scalar_type = mat1_scalar_type
+    elif mat2_scalar_type is not None:
+        scalar_type = mat2_scalar_type
 
     mat1_rank = symbolic_helper._get_tensor_rank(mat1)
     mat2_rank = symbolic_helper._get_tensor_rank(mat2)
 
-    def is_not_none_and(v, u):
+    def is_not_none_nor(v, u):
         return v is not None and v != u
 
-    if dtype is not None and (
-        is_not_none_and(mat1_rank, 2) or is_not_none_and(mat2_rank, 2)
+    if scalar_type is not None and (
+        is_not_none_nor(mat1_rank, 2) or is_not_none_nor(mat2_rank, 2)
     ):
-        scalar_type = _type_utils.JitScalarType.from_name(dtype)
-
         res1 = g.op("MatMul", mat1, mat2)
         res2 = self
 
@@ -719,11 +714,16 @@ def _slice(g: jit_utils.GraphContext, input, axes, starts, ends):
 
 @_beartype.beartype
 def _maybe_cast_reduce_op_input(g: jit_utils.GraphContext, self):
-    dtype = self.type().scalarType()
-    # This check only covers traced modules where dtype is present
-    if dtype is not None:
+    scalar_type = _type_utils.JitScalarType.from_value(
+        self, _type_utils.JitScalarType.UNDEFINED
+    )
+    if scalar_type != _type_utils.JitScalarType.UNDEFINED:
+        # This check only covers traced modules where dtype is present
         # pytorch reduce-ops cast all other integral types to int64
-        if not symbolic_helper._is_fp(self) and not (dtype == "Long"):
+        if (
+            not symbolic_helper._is_fp(self)
+            and scalar_type != _type_utils.JitScalarType.INT64
+        ):
             self = g.op("Cast", self, to_i=_C_onnx.TensorProtoDataType.INT64)
     return self
 
@@ -1261,16 +1261,16 @@ def _op_with_optional_float_cast(g: jit_utils.GraphContext, op_name, *args, **kw
         *args (tuple): operands to the operator.
         **kwargs (dict): attributes to the operator along with "opset_before" (optional, None by default)
             indicating the smallest opset version to trigger such casting behavior and "target_float_t"
-            (optional, "Float" by default) indicating the data type of internal operator.
+            (optional, torch.onnx.JitScalarType.FLOAT by default) indicating the data type of internal operator.
 
     Returns:
         Optional[torch._C.Value, Tuple[torch._C.Value, ...]]: output(s) of the operator.
     """
     opset_before = kwargs.pop("opset_before", None)
-    target_float_t = kwargs.pop("target_float_t", "Float")
+    target_float_t = kwargs.pop("target_float_t", _type_utils.JitScalarType.FLOAT)
 
     inputs = list(args)
-    dtype_0 = inputs[0].type().scalarType()
+    dtype_0 = _type_utils.JitScalarType.from_value(inputs[0])
 
     require_cast = not symbolic_helper._is_fp(inputs[0]) and (
         opset_before is None or GLOBALS.export_onnx_opset_version < opset_before
@@ -1278,9 +1278,10 @@ def _op_with_optional_float_cast(g: jit_utils.GraphContext, op_name, *args, **kw
 
     if require_cast:
         for input in inputs:
-            if input.isCompleteTensor() and input.type().scalarType() != dtype_0:
+            input_scalar_type = _type_utils.JitScalarType.from_value(input)
+            if input.isCompleteTensor() and input_scalar_type != dtype_0:
                 raise errors.SymbolicValueError(
-                    f"Inputs of {op_name} must have same dtype. Got {dtype_0} and {input.type().scalarType()}",
+                    f"Inputs of {op_name} must have same dtype. Got {dtype_0.scalar_name()} and {input_scalar_type.scalar_name()}",
                     input,
                 )
         for i, input in enumerate(inputs):
@@ -1288,17 +1289,13 @@ def _op_with_optional_float_cast(g: jit_utils.GraphContext, op_name, *args, **kw
                 inputs[i] = g.op(
                     "Cast",
                     input,
-                    to_i=_type_utils.JitScalarType.from_name(
-                        target_float_t
-                    ).onnx_type(),
+                    to_i=target_float_t.onnx_type(),
                 )
 
     self = g.op(op_name, *inputs, **kwargs)
 
     if require_cast:
-        self = g.op(
-            "Cast", self, to_i=_type_utils.JitScalarType.from_name(dtype_0).onnx_type()
-        )
+        self = g.op("Cast", self, to_i=dtype_0.onnx_type())
 
     return self
 
@@ -2122,12 +2119,7 @@ def gt(g: jit_utils.GraphContext, input, other):
 
 @_beartype.beartype
 def _gt_impl(g: jit_utils.GraphContext, input, other):
-    if (
-        input.type().scalarType() is not None
-        and symbolic_helper._is_bool(input)
-        and other.type().scalarType() is not None
-        and symbolic_helper._is_bool(other)
-    ):
+    if symbolic_helper._is_bool(input) and symbolic_helper._is_bool(other):
         input = g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT32)
         other = g.op("Cast", other, to_i=_C_onnx.TensorProtoDataType.INT32)
     return g.op("Greater", input, other)
@@ -2142,12 +2134,7 @@ def lt(g: jit_utils.GraphContext, input, other):
 
 @_beartype.beartype
 def _lt_impl(g: jit_utils.GraphContext, input, other):
-    if (
-        input.type().scalarType() is not None
-        and symbolic_helper._is_bool(input)
-        and other.type().scalarType() is not None
-        and symbolic_helper._is_bool(other)
-    ):
+    if symbolic_helper._is_bool(input) and symbolic_helper._is_bool(other):
         input = g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT32)
         other = g.op("Cast", other, to_i=_C_onnx.TensorProtoDataType.INT32)
     return g.op("Less", input, other)
@@ -2249,13 +2236,15 @@ def logical_xor(g: jit_utils.GraphContext, input, other):
 def __rshift_(g: jit_utils.GraphContext, self, other):
     # make sure to cast other to self's type
     # (when self is long, make sure that other is not float)
-    if other.type().scalarType() != self.type().scalarType():
+    self_scalar_type = _type_utils.JitScalarType.from_value(self)
+    if (
+        _type_utils.JitScalarType.from_value(other, _type_utils.JitScalarType.UNDEFINED)
+        != self_scalar_type
+    ):
         other = g.op(
             "Cast",
             other,
-            to_i=_type_utils.JitScalarType.from_name(
-                self.type().scalarType()
-            ).onnx_type(),
+            to_i=self_scalar_type.onnx_type(),
         )
 
     two = g.op("Constant", value_t=torch.tensor(2, dtype=torch.float32))
@@ -2266,7 +2255,7 @@ def __rshift_(g: jit_utils.GraphContext, self, other):
     two_pow = g.op(
         "Cast",
         two_pow,
-        to_i=_type_utils.JitScalarType.from_name(self.type().scalarType()).onnx_type(),
+        to_i=self_scalar_type.onnx_type(),
     )
     rshift = g.op("Div", self, two_pow)
     return rshift
@@ -2277,13 +2266,15 @@ def __rshift_(g: jit_utils.GraphContext, self, other):
 def __lshift_(g: jit_utils.GraphContext, self, other):
     # make sure to cast other to self's type
     # (when self is long, make sure that other is not float)
-    if other.type().scalarType() != self.type().scalarType():
+    self_scalar_type = _type_utils.JitScalarType.from_value(self)
+    if (
+        _type_utils.JitScalarType.from_value(other, _type_utils.JitScalarType.UNDEFINED)
+        != self_scalar_type
+    ):
         other = g.op(
             "Cast",
             other,
-            to_i=_type_utils.JitScalarType.from_name(
-                self.type().scalarType()
-            ).onnx_type(),
+            to_i=self_scalar_type.onnx_type(),
         )
 
     two = g.op("Constant", value_t=torch.tensor(2, dtype=torch.float32))
@@ -2294,7 +2285,7 @@ def __lshift_(g: jit_utils.GraphContext, self, other):
     two_pow = g.op(
         "Cast",
         two_pow,
-        to_i=_type_utils.JitScalarType.from_name(self.type().scalarType()).onnx_type(),
+        to_i=self_scalar_type.onnx_type(),
     )
     lshift = g.op("Mul", self, two_pow)
     return lshift
@@ -2353,7 +2344,13 @@ def log_softmax(g: jit_utils.GraphContext, input, dim, dtype=None):
 @symbolic_helper.parse_args("v", "i", "i")
 @_beartype.beartype
 def _log_softmax(g: jit_utils.GraphContext, input, dim, half_to_float):
-    if half_to_float and input.type().scalarType() == "Half":
+    if (
+        half_to_float
+        and _type_utils.JitScalarType.from_value(
+            input, _type_utils.JitScalarType.UNDEFINED
+        )
+        == _type_utils.JitScalarType.HALF
+    ):
         input = g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.FLOAT)
     return log_softmax(g, input, dim)
 
@@ -2776,9 +2773,7 @@ def instance_norm(
             )
         weight_value = torch.tensor(
             [1.0] * channel_size,
-            dtype=_type_utils.JitScalarType.from_name(
-                input.type().scalarType()
-            ).dtype(),
+            dtype=_type_utils.JitScalarType.from_value(input).dtype(),
         )
         weight = g.op("Constant", value_t=weight_value)
     if bias is None or symbolic_helper._is_none(bias):
@@ -2789,9 +2784,7 @@ def instance_norm(
             )
         bias_value = torch.tensor(
             [0.0] * channel_size,
-            dtype=_type_utils.JitScalarType.from_name(
-                input.type().scalarType()
-            ).dtype(),
+            dtype=_type_utils.JitScalarType.from_value(input).dtype(),
         )
         bias = g.op("Constant", value_t=bias_value)
     if (
@@ -3031,7 +3024,7 @@ def type_as(g: jit_utils.GraphContext, self, other):
         return g.op(
             "Cast",
             self,
-            to_i=_type_utils.JitScalarType.from_name(other_dtype).onnx_type(),
+            to_i=other_dtype.onnx_type(),
         )
 
     if symbolic_helper.is_caffe2_aten_fallback():
@@ -3121,17 +3114,15 @@ def log10(g: jit_utils.GraphContext, self):
 @_onnx_symbolic("aten::pow")
 @_beartype.beartype
 def pow(g: jit_utils.GraphContext, self, exponent):
-    f_dtype = self_dtype = self.type().scalarType()
+    f_dtype = _type_utils.JitScalarType.from_value(self)
     if not symbolic_helper._is_fp(self):
-        f_dtype = "Float"
-        self = g.op(
-            "Cast", self, to_i=_type_utils.JitScalarType.from_name(f_dtype).onnx_type()
-        )
+        f_dtype = _type_utils.JitScalarType.FLOAT
+        self = g.op("Cast", self, to_i=f_dtype.onnx_type())
     if not symbolic_helper._is_fp(exponent):
         exponent = g.op(
             "Cast",
             exponent,
-            to_i=_type_utils.JitScalarType.from_name(f_dtype).onnx_type(),
+            to_i=f_dtype.onnx_type(),
         )
     pow = g.op("Pow", self, exponent)
     return pow
@@ -3169,10 +3160,8 @@ def clamp_min(g: jit_utils.GraphContext, self, min):
             g, "Clip", self, min_f=symbolic_helper._parse_arg(min, "f"), opset_before=12
         )
     else:
-        dtype = self.type().scalarType()
-        min = g.op(
-            "Cast", min, to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type()
-        )
+        dtype = _type_utils.JitScalarType.from_value(self)
+        min = g.op("Cast", min, to_i=dtype.onnx_type())
         return _op_with_optional_float_cast(g, "Max", self, min, opset_before=12)
 
 
@@ -3185,10 +3174,8 @@ def clamp_max(g: jit_utils.GraphContext, self, max):
             g, "Clip", self, max_f=symbolic_helper._parse_arg(max, "f"), opset_before=12
         )
     else:
-        dtype = self.type().scalarType()
-        max = g.op(
-            "Cast", max, to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type()
-        )
+        dtype = _type_utils.JitScalarType.from_value(self)
+        max = g.op("Cast", max, to_i=dtype.onnx_type())
         return _op_with_optional_float_cast(g, "Min", self, max, opset_before=12)
 
 
@@ -3530,7 +3517,6 @@ def new_empty(
     self_dtype = symbolic_helper._try_get_scalar_type(self)
     if dtype is None and self_dtype is not None:
         dtype = self_dtype
-        dtype = _type_utils.JitScalarType.from_name(dtype)
     return empty(g, sizes, dtype, layout, device, pin_memory)
 
 
@@ -3552,19 +3538,19 @@ def tensor(
     dtype = symbolic_helper._get_const(dtype, "i", "dtype")
     if symbolic_helper._is_packed_list(data):
         if dtype is None:
-            scalar_name = symbolic_helper._unpack_list(data)[0].type().scalarType()
-            dtype = _type_utils.JitScalarType.from_name(scalar_name)
+            dtype = _type_utils.JitScalarType.from_value(
+                symbolic_helper._unpack_list(data)[0]
+            )
         input_list = list()
         for t in symbolic_helper._unpack_list(data):
             shape_reference = g.op("Constant", value_t=torch.LongTensor([1]))
             t = symbolic_helper._reshape_helper(g, t, shape_reference)
-            t = g.op("Cast", t, to_i=_type_utils.JitScalarType(dtype).onnx_type())
+            t = g.op("Cast", t, to_i=dtype.onnx_type())
             input_list.append(t)
         return g.op("Concat", *input_list, axis_i=0)
     else:
         if dtype is None:
-            scalar_name = data.type().scalarType()
-            dtype = _type_utils.JitScalarType.from_name(scalar_name)
+            dtype = _type_utils.JitScalarType.from_value(data)
         if symbolic_helper._is_list(data) and (
             symbolic_helper._is_tensor_list(data)
             or symbolic_helper._is_scalar_list(data)
@@ -3629,7 +3615,7 @@ def new_zeros(
 ):
     self_dtype = symbolic_helper._try_get_scalar_type(self)
     if dtype is None and self_dtype is not None:
-        dtype = _type_utils.JitScalarType.from_name(self_dtype)
+        dtype = self_dtype
     return zeros(g, sizes, dtype, layout, device, pin_memory)
 
 
@@ -3683,7 +3669,6 @@ def new_ones(
     self_dtype = symbolic_helper._try_get_scalar_type(self)
     if dtype is None and self_dtype is not None:
         dtype = self_dtype
-        dtype = _type_utils.JitScalarType.from_name(dtype)
     return ones(g, sizes, dtype, layout, device, pin_memory)
 
 
@@ -3759,7 +3744,6 @@ def new_full(
     self_dtype = symbolic_helper._try_get_scalar_type(self)
     if dtype is None and self_dtype is not None:
         dtype = self_dtype
-        dtype = _type_utils.JitScalarType.from_name(dtype)
     return full(g, size, fill_value, dtype, layout, device, pin_memory)
 
 
@@ -3895,11 +3879,9 @@ def tanhshrink(g: jit_utils.GraphContext, self):
 @symbolic_helper.parse_args("v", "f")
 @_beartype.beartype
 def hardshrink(g: jit_utils.GraphContext, self, lambd):
-    dtype = self.type().scalarType()
-    if dtype is None:
-        scalar_type = _type_utils.JitScalarType.FLOAT
-    else:
-        scalar_type = _type_utils.JitScalarType.from_name(dtype)
+    scalar_type = _type_utils.JitScalarType.from_value(
+        self, _type_utils.JitScalarType.FLOAT
+    )
     lambd_op = g.op(
         "Constant",
         value_t=torch.tensor(lambd, dtype=scalar_type.dtype()),
@@ -3920,11 +3902,9 @@ def hardshrink(g: jit_utils.GraphContext, self, lambd):
 @symbolic_helper.parse_args("v", "f")
 @_beartype.beartype
 def softshrink(g: jit_utils.GraphContext, self, lambd):
-    dtype = self.type().scalarType()
-    if dtype is None:
-        scalar_type = _type_utils.JitScalarType.FLOAT
-    else:
-        scalar_type = _type_utils.JitScalarType.from_name(dtype)
+    scalar_type = _type_utils.JitScalarType.from_value(
+        self, _type_utils.JitScalarType.FLOAT
+    )
     lambd_op = g.op(
         "Constant",
         value_t=torch.tensor(lambd, dtype=scalar_type.dtype()),
@@ -4078,11 +4058,11 @@ def is_aten_to_device_only(args):
 
         if symbolic_helper._is_value(dtype) or isinstance(dtype, torch.Tensor):
             # aten::to(Tensor, Tensor, bool, bool, memory_format)
-            dtype = args[0].type().scalarType()
+            dtype = _type_utils.JitScalarType.from_value(args[0])
             return g.op(
                 "Cast",
                 self,
-                to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type(),
+                to_i=dtype.onnx_type(),
             )
         else:
             # aten::to(Tensor, ScalarType, bool, bool, memory_format)
@@ -4820,7 +4800,12 @@ def _pack_padded_sequence(g: jit_utils.GraphContext, input, lengths, batch_first
     # We know it's a TensorType so this check is now safe.
     # It's really only necessary because those operators expand to something that
     # only works with int32 types in Caffe2...
-    if lengths.type().scalarType() != "Int":
+    if (
+        _type_utils.JitScalarType.from_value(
+            lengths, _type_utils.JitScalarType.UNDEFINED
+        )
+        != _type_utils.JitScalarType.INT
+    ):
         lengths = g.op("Cast", lengths, to_i=_C_onnx.TensorProtoDataType.INT32)
     return g.op("prim::PackPadded", input, lengths, outputs=2)
 
@@ -4960,22 +4945,23 @@ def bernoulli(g: jit_utils.GraphContext, input, generator=None, out=None):
             "Bernoulli", "generator is not supported for bernoulli", input
         )
 
-    dtype = symbolic_helper._try_get_scalar_type(input)
-    if dtype is None:
+    dtype = _type_utils.JitScalarType.from_value(
+        input, _type_utils.JitScalarType.UNDEFINED
+    )
+    if dtype == _type_utils.JitScalarType.UNDEFINED:
         return symbolic_helper._unimplemented(
             "Bernoulli", "input dtype not accessible", input
         )
+
     p = g.op(
         "RandomUniformLike",
         input,
         high_f=1.0,
         low_f=0.0,
-        dtype_i=_type_utils.JitScalarType.from_name(dtype).onnx_type(),
+        dtype_i=dtype.onnx_type(),
     )
     output = g.op("Less", p, input)
-    return g.op(
-        "Cast", output, to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type()
-    )
+    return g.op("Cast", output, to_i=dtype.onnx_type())
 
 
 @_onnx_symbolic("aten::log_sigmoid")
@@ -5110,21 +5096,18 @@ def argmin(
 @symbolic_helper.parse_args("v", "i", "v", "v")
 @_beartype.beartype
 def scatter(g: jit_utils.GraphContext, self, dim, index, src):
-    src_type = src.type().scalarType()
+    src_type = _type_utils.JitScalarType.from_value(
+        src, _type_utils.JitScalarType.UNDEFINED
+    )
     src = symbolic_helper._maybe_get_scalar(src)
     if symbolic_helper._is_value(src):
         return g.op("Scatter", self, index, src, axis_i=dim)
     else:
         # Check if scalar "src" has same type as self (PyTorch allows different
         # type for scalar src (but not when src is tensor)). If not, insert Cast node.
-        if self.type().scalarType() != src_type:
-            src = g.op(
-                "Cast",
-                src,
-                to_i=_type_utils.JitScalarType.from_name(
-                    self.type().scalarType()
-                ).onnx_type(),
-            )
+        self_scalar_type = _type_utils.JitScalarType.from_value(self)
+        if self_scalar_type != src_type:
+            src = g.op("Cast", src, to_i=self_scalar_type.onnx_type())
         return g.op("Scatter", self, index, expand_as(g, src, index), axis_i=dim)
 
 
@@ -5132,12 +5115,11 @@ def scatter(g: jit_utils.GraphContext, self, dim, index, src):
 @symbolic_helper.parse_args("v", "i", "v", "v")
 @_beartype.beartype
 def scatter_add(g: jit_utils.GraphContext, self, dim, index, src):
-    scalar_name = symbolic_helper._try_get_scalar_type(self)
-    if scalar_name is None:
+    scalar_type = symbolic_helper._try_get_scalar_type(self)
+    if scalar_type is None:
         return symbolic_helper._unimplemented(
             "scatter_add", "input dtype not accessible", self
         )
-    scalar_type = _type_utils.JitScalarType.from_name(scalar_name)
     sizes = symbolic_helper._get_tensor_sizes(self, allow_nonstatic=False)
     if sizes:
         to_add = g.op("Constant", value_t=torch.zeros(sizes, dtype=scalar_type.dtype()))
@@ -5184,7 +5166,14 @@ def __isnot_(g: jit_utils.GraphContext, self, other):
 def one_hot(g: jit_utils.GraphContext, self, num_classes):
     values = g.op("Constant", value_t=torch.LongTensor([0, 1]))
     # onnxruntime supports limited type combinations for OneHot.
-    if num_classes.type().scalarType() in {"Byte", "Char", "Int", "Short"}:
+    if _type_utils.JitScalarType.from_value(
+        num_classes, _type_utils.JitScalarType.UNDEFINED
+    ) in {
+        _type_utils.JitScalarType.UINT8,
+        _type_utils.JitScalarType.INT8,
+        _type_utils.JitScalarType.INT,
+        _type_utils.JitScalarType.INT16,
+    }:
         num_classes = g.op("Cast", num_classes, to_i=_C_onnx.TensorProtoDataType.INT64)
     return g.op("OneHot", self, num_classes, values, axis_i=-1)
 
@@ -5197,13 +5186,13 @@ def gather(g: jit_utils.GraphContext, self, dim, index, sparse_grad=False):
         return symbolic_helper._unimplemented("gather", "sparse_grad == True", self)
     # NOTE: This workaround is needed since GatherElement is only supported
     #       since opset 11, and Gather in ONNX is not the same as torch.gather.
-    dtype = self.type().scalarType()
+    scalar_type = _type_utils.JitScalarType.from_value(self)
     values = g.op("Constant", value_t=torch.LongTensor([0, 1]))
     depth = size(g, self, g.op("Constant", value_t=torch.LongTensor([dim])))
     index = g.op(
         "Cast",
         g.op("OneHot", index, depth, values, axis_i=dim),
-        to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type(),
+        to_i=scalar_type.onnx_type(),
     )
     mul = g.op("Mul", symbolic_helper._unsqueeze_helper(g, self, [dim + 1]), index)
     return symbolic_helper._reducesum_helper(g, mul, axes_i=[dim], keepdims_i=0)
@@ -5413,7 +5402,11 @@ def index(g: jit_utils.GraphContext, self, index):
     @_beartype.beartype
     def try_mask_to_index(index):
         if not symbolic_helper._is_none(index) and (
-            index.type().scalarType() == "Byte" or symbolic_helper._is_bool(index)
+            _type_utils.JitScalarType.from_value(
+                index, _type_utils.JitScalarType.UNDEFINED
+            )
+            == _type_utils.JitScalarType.UINT8
+            or symbolic_helper._is_bool(index)
         ):
             if g.opset < 9:
                 raise errors.SymbolicValueError(
@@ -5750,19 +5743,17 @@ def multinomial(
 @_onnx_symbolic("aten::baddbmm")
 @_beartype.beartype
 def baddbmm(g: jit_utils.GraphContext, self, batch1, batch2, beta, alpha):
-    dtype = self.type().scalarType()
+    scalar_type = _type_utils.JitScalarType.from_value(self)
     batch_mul = matmul(g, batch1, batch2)
     mul_a = mul(
         g,
         batch_mul,
-        g.op(
-            "Cast", alpha, to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type()
-        ),
+        g.op("Cast", alpha, to_i=scalar_type.onnx_type()),
     )
     mul_b = mul(
         g,
         self,
-        g.op("Cast", beta, to_i=_type_utils.JitScalarType.from_name(dtype).onnx_type()),
+        g.op("Cast", beta, to_i=scalar_type.onnx_type()),
     )
     return add(g, mul_a, mul_b)
 
@@ -5874,18 +5865,14 @@ def group_norm(
         "Constant",
         value_t=torch.tensor(
             [1.0] * num_groups,
-            dtype=_type_utils.JitScalarType.from_name(
-                input.type().scalarType()
-            ).dtype(),
+            dtype=_type_utils.JitScalarType.from_value(input).dtype(),
         ),
     )
     bias_ = g.op(
         "Constant",
         value_t=torch.tensor(
             [0.0] * num_groups,
-            dtype=_type_utils.JitScalarType.from_name(
-                input.type().scalarType()
-            ).dtype(),
+            dtype=_type_utils.JitScalarType.from_value(input).dtype(),
         ),
     )
 
@@ -5896,18 +5883,12 @@ def group_norm(
 
     if weight is None or weight.node().mustBeNone():
         weight_value = torch.tensor(
-            [1.0],
-            dtype=_type_utils.JitScalarType.from_name(
-                input.type().scalarType()
-            ).dtype(),
+            [1.0], dtype=_type_utils.JitScalarType.from_value(input).dtype()
         )
         weight = g.op("Constant", value_t=weight_value)
     if bias is None or bias.node().mustBeNone():
         bias_value = torch.tensor(
-            [0.0],
-            dtype=_type_utils.JitScalarType.from_name(
-                input.type().scalarType()
-            ).dtype(),
+            [0.0], dtype=_type_utils.JitScalarType.from_value(input).dtype()
         )
         bias = g.op("Constant", value_t=bias_value)
 
@@ -6200,13 +6181,10 @@ def movedim(g: jit_utils.GraphContext, self, source, destination):
 @symbolic_helper.parse_args("v", "v")
 @_beartype.beartype
 def fill(g: jit_utils.GraphContext, self, value):
-    dtype = self.type().scalarType()
-    if dtype is None:
-        dtype = _type_utils.JitScalarType.FLOAT
-    else:
-        dtype = _type_utils.JitScalarType.from_name(dtype)
-
-    return full_like(g, self, value, dtype)
+    scalar_type = _type_utils.JitScalarType.from_value(
+        self, _type_utils.JitScalarType.FLOAT
+    )
+    return full_like(g, self, value, scalar_type)
 
 
 @_onnx_symbolic("aten::index_add")
@@ -6498,10 +6476,9 @@ def prim_unchecked_cast(g: jit_utils.GraphContext, self):
 @_onnx_symbolic("prim::dtype")
 @_beartype.beartype
 def prim_dtype(g: jit_utils.GraphContext, self):
-    scalar_name = symbolic_helper._try_get_scalar_type(self)
-    if scalar_name is None:
-        scalar_name = "Float"
-    scalar_type = _type_utils.JitScalarType.from_name(scalar_name)
+    scalar_type = symbolic_helper._try_get_scalar_type(self)
+    if scalar_type is None:
+        scalar_type = _type_utils.JitScalarType.FLOAT
     # This node records a torch dtype as int
     return g.op("Constant", value_t=torch.tensor(scalar_type))
 

From 6e7a3ff91614adf612ebfb29725fd957d587f680 Mon Sep 17 00:00:00 2001
From: Wonjoo Lee <wonjoo@google.com>
Date: Thu, 3 Nov 2022 06:19:40 +0000
Subject: [PATCH 0511/1922] Make GenLazyNativeFuncDefinition generator to be
 customizable in lazy codegen (#87823)

As part of the ongoing LTC migration effort, PyTorch/XLA is updating its codegen to use `xla::Shape` instead of `torch::lazy::Shape`. To achieve this, this PR updates the codegen to make the `GenLazyNativeFuncDefinition` generator customizable.

The existing `GenLazyNativeFuncDefinition` is kept by using the initial default values, so this change should not introduce any new behaviors to the existing codegen in PyTorch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87823
Approved by: https://github.com/alanwaketan, https://github.com/wconstab
---
 torchgen/gen_lazy_tensor.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/torchgen/gen_lazy_tensor.py b/torchgen/gen_lazy_tensor.py
index 46f11eeff13c8..5207681cf5c8e 100644
--- a/torchgen/gen_lazy_tensor.py
+++ b/torchgen/gen_lazy_tensor.py
@@ -23,7 +23,7 @@
 
 from torchgen.api.lazy import setValueT
 from torchgen.api.types import BaseCppType
-from torchgen.dest.lazy_ir import GenLazyIR, GenTSLazyIR
+from torchgen.dest.lazy_ir import GenLazyIR, GenLazyNativeFuncDefinition, GenTSLazyIR
 from torchgen.gen import get_grouped_native_functions, parse_native_yaml
 
 from torchgen.model import NativeFunction, NativeFunctionsGroup, OperatorName
@@ -200,6 +200,9 @@ class default_args:
     tensor_class: str = "torch::lazy::LazyTensor"
     tensor_class_hdr: str = "torch/csrc/lazy/core/tensor.h"
     lazy_ir_generator: Type[GenLazyIR] = GenLazyIR
+    native_func_definition_generator: Type[
+        GenLazyNativeFuncDefinition
+    ] = GenLazyNativeFuncDefinition
     backend_name: str = "TorchScript"
 
 
@@ -267,6 +270,9 @@ def main() -> None:
     lazy_ir_generator: Type[GenLazyIR] = default_args.lazy_ir_generator
     if options.gen_ts_lowerings:
         lazy_ir_generator = GenTSLazyIR
+    native_func_definition_generator: Type[
+        GenLazyNativeFuncDefinition
+    ] = default_args.native_func_definition_generator
 
     run_gen_lazy_tensor(
         aten_path,
@@ -280,6 +286,7 @@ def main() -> None:
         options.tensor_class_hdr,
         options.shape_inference_hdr,
         lazy_ir_generator,
+        native_func_definition_generator,
         options.backend_name,
     )
 
@@ -296,6 +303,9 @@ def run_gen_lazy_tensor(
     tensor_class_hdr: str = default_args.tensor_class_hdr,
     shape_inference_hdr: str = default_args.shape_inference_hdr,
     lazy_ir_generator: Type[GenLazyIR] = default_args.lazy_ir_generator,
+    native_func_definition_generator: Type[
+        GenLazyNativeFuncDefinition
+    ] = default_args.native_func_definition_generator,
     # build_in_tree is true for TS backend and affects include paths
     build_in_tree: bool = False,
     # per_operator_headers changes whether ATen/Functions.h or individual operator headers are used
@@ -499,7 +509,7 @@ def concat_map_codegen(
             "namespace_epilogue": ns_helper.epilogue,
             "native_function_definitions": list(
                 concat_map_codegen(
-                    dest.GenLazyNativeFuncDefinition(
+                    native_func_definition_generator(
                         f"{backend_key}NativeFunctions",
                         backend_indices[backend_key],
                         tensor_class,

From 8d8bd57101857288f005eda017dfa3edc1257056 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Thu, 3 Nov 2022 06:02:37 +0000
Subject: [PATCH 0512/1922] Fix primTorch compute_elementwise_output_strides
 (#88175)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88175
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor_opinfo.py |  8 -----
 test/test_meta.py                          | 35 ----------------------
 torch/_prims_common/__init__.py            | 24 +++++++++------
 3 files changed, 15 insertions(+), 52 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 93e5412716296..d28aafb467843 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -205,8 +205,6 @@ def process(device_type):
     "index_put": {f16, f32, f64},
     "index_reduce": {f16, f32, f64},
     "istft": {f32, f64},
-    "linalg.cholesky": {f32, f64},
-    "linalg.cholesky_ex": {f32, f64},
     "linalg.eig": {f32, f64},
     "linalg.eigh": {f32, f64},
     "linalg.eigvals": {f32, f64},
@@ -246,7 +244,6 @@ def process(device_type):
     "normal": {f16, f32, f64},
     "normal.number_mean": {f16, f32, f64},
     "pca_lowrank": {f32, f64},
-    "pinverse": {f32, f64},
     "polar": {f32, f64},
     "quantile": {f32, f64},
     "rand_like": {f16, f32, f64},
@@ -257,7 +254,6 @@ def process(device_type):
     "scatter_reduce.sum": {f16},
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
-    "segment_reduce.offsets": {f16, f32, f64},
     "sgn": {f16, f32, f64},
     "sparse.sampled_addmm": {f32, f64},
     "stft": {f32, f64},
@@ -316,8 +312,6 @@ def process(device_type):
     "index_put": {f16, f32, f64},
     "index_reduce": {f16, f32, f64},
     "istft": {f32, f64},
-    "linalg.cholesky": {f32, f64},
-    "linalg.cholesky_ex": {f32, f64},
     "linalg.eig": {f32, f64},
     "linalg.eigh": {f32, f64},
     "linalg.eigvals": {f32, f64},
@@ -347,7 +341,6 @@ def process(device_type):
     "normal": {f16, f32, f64},
     "normal.number_mean": {f16, f32, f64},
     "pca_lowrank": {f32, f64},
-    "pinverse": {f32, f64},
     "polar": {f32, f64},
     "pow": {i32, i64},
     "rand_like": {f16, f32, f64},
@@ -357,7 +350,6 @@ def process(device_type):
     "round.decimals_3": {f16},
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
-    "segment_reduce.offsets": {f16, f32, f64},
     "sgn": {f16, f32, f64},
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
diff --git a/test/test_meta.py b/test/test_meta.py
index 88644a6552b1b..ef25d184c8428 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -295,55 +295,20 @@ def test_tensor_outlives_converter(self):
     aten._fft_r2c.default,
     aten._linalg_svd.default,
     aten._scaled_dot_product_attention_forward.default,
-    aten.add.Tensor,
-    aten.atan2.default,
     aten.binary_cross_entropy.default,
-    aten.bitwise_and.Tensor,
-    aten.bitwise_left_shift.Tensor,
-    aten.bitwise_or.Tensor,
-    aten.bitwise_right_shift.Tensor,
-    aten.bitwise_xor.Tensor,
-    aten.clamp_max.Tensor,
-    aten.clamp_min.Tensor,
     aten.complex.default,
     aten.copysign.Tensor,
     aten.div.Tensor_mode,
-    aten.div.Tensor,
-    aten.eq.Tensor,
     aten.floor_divide.default,
-    aten.fmax.default,
-    aten.fmin.default,
-    aten.fmod.Tensor,
-    aten.gcd.default,
-    aten.ge.Tensor,
-    aten.gt.Tensor,
     aten.heaviside.default,
-    aten.hypot.default,
-    aten.igamma.default,
-    aten.igammac.default,
-    aten.lcm.default,
-    aten.le.Tensor,
     aten.lerp.Scalar,
     aten.lerp.Tensor,
     aten.logical_and.default,
     aten.logical_or.default,
     aten.logical_xor.default,
-    aten.lt.Tensor,
-    aten.maximum.default,
-    aten.minimum.default,
-    aten.mul.Tensor,
-    aten.ne.Tensor,
-    aten.nextafter.default,
     aten.pow.Scalar,
-    aten.pow.Tensor_Scalar,
-    aten.pow.Tensor_Tensor,
     aten.prelu.default,
-    aten.remainder.Tensor,
-    aten.rsub.Tensor,
     aten.special_xlog1py.default,
-    aten.special_zeta.default,
-    aten.sub.Tensor,
-    aten.where.self,
     aten.xlogy.Tensor,
 
     # channel_last and channel_last_3d related failures
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index ee4dd38a655c6..90777ed6601aa 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -360,7 +360,7 @@ def compute_elementwise_output_strides(*tensors) -> Tuple[int, ...]:
 
     shape = tensors[0].shape
 
-    def _cmp(idx_a, idx_b):
+    def should_swap(idx_a, idx_b):
         for tensor in tensors:
             stride_a = tensor.stride()[idx_a]
             stride_b = tensor.stride()[idx_b]
@@ -378,24 +378,30 @@ def _cmp(idx_a, idx_b):
             if shape[idx_a] > shape[idx_b]:
                 return 1
 
-            # NOTE: this case is missing in the C++ impl
-            if shape[idx_a] < shape[idx_b]:
-                return -1
-
         # Note: this case is hit if all strides are zero,
         # or all strides are equal and all dimensions have the same length
         return 0
 
-    perm = tuple(range(ndim))
-    perm = tuple(sorted(perm, key=cmp_to_key(_cmp), reverse=True))
+    perm = list(reversed(range(ndim)))
+
+    # insertion sort with support for ambiguous comparisons
+    for i in range(1, ndim):
+        dim1 = i
+        for dim0 in reversed(range(i)):
+            comparison = should_swap(perm[dim0], perm[dim1])
+            if comparison > 0:
+                perm[dim0], perm[dim1] = perm[dim1], perm[dim0]
+                dim1 = dim0
+            elif comparison < 0:
+                break
 
     permuted_shape = [-1] * ndim
-    for idx, x in enumerate(perm):
+    for idx, x in enumerate(reversed(perm)):
         permuted_shape[idx] = shape[x]
 
     new_strides = make_contiguous_strides_for(permuted_shape)
     permuted_strides = [-1] * ndim
-    for idx, x in enumerate(perm):
+    for idx, x in enumerate(reversed(perm)):
         permuted_strides[x] = new_strides[idx]
 
     return tuple(permuted_strides)

From 60997dde25eb91e8204644d647164c1a0add9257 Mon Sep 17 00:00:00 2001
From: Kshiteej K <kshitijkalambarkar@gmail.com>
Date: Thu, 3 Nov 2022 09:57:47 +0000
Subject: [PATCH 0513/1922] [fix] allow saving python attr on Tensor and
 Parameter via torch.save (#81616)

Fixes: https://github.com/pytorch/pytorch/issues/72129

TODO:
* [x] Fix for Parameter

Benchmark
(Measurable diff for small tensors)
```
[-------------- Save and Load --------------]
                    |  After PR  |  Before PR
1 threads: ----------------------------------
      ()            |    111.7   |     106.9
      (4, 4)        |    114.4   |     109.2
      (128, 128)    |    135.2   |     128.3
      (1024, 1024)  |   1431.9   |    1431.3

Times are in microseconds (us).
```

<details>

<summary> Benchmark Script </summary>

```python
import torch
from torch.testing._internal.common_utils import BytesIOContext
from torch.utils import benchmark
import pickle

shapes = ((), (4, 4), (128, 128), (1024, 1024))

sizes = [1, 64, 1024, 10000]
results = []

def save_load_fn(t):
    with BytesIOContext() as f:
        torch.save(t, f)
        f.seek(0)
        torch.load(f)

for shape in shapes:
    t = torch.randn(shape)
    label = 'Save and Load'
    sub_label = f'{shape}'
    results.append(benchmark.Timer(
        stmt='save_load_fn(t)',
        globals={'t': t, 'save_load_fn':save_load_fn},
        label=label,
        sub_label=sub_label,
        description='Before PR',
    ).blocked_autorange(min_run_time=2))

compare = benchmark.Compare(results)
compare.print()

with open('before_pr.pkl', 'wb') as f:
    pickle.dump(results, f)

# with open('after_pr.pkl', 'rb') as f:
#     after_pr = pickle.load(f)

# with open('before_pr.pkl', 'rb') as f:
#     before_pr = pickle.load(f)

# compare = benchmark.Compare(after_pr + before_pr)
# compare.print()
```

</details>

NOTE : **BC-Breaking** : After this PR, all tensors (also regular tensors) will be serialised using `_rebuild_from_type_v2`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/81616
Approved by: https://github.com/albanD, https://github.com/kurtamohler
---
 test/test_serialization.py                 | 18 ++++++
 torch/_tensor.py                           | 42 +------------
 torch/_utils.py                            | 58 ++++++++++++++++++
 torch/_weights_only_unpickler.py           |  5 ++
 torch/csrc/jit/serialization/unpickler.cpp | 71 ++++++++++++++++++++++
 torch/csrc/jit/serialization/unpickler.h   |  4 ++
 torch/nn/parameter.py                      |  6 +-
 7 files changed, 162 insertions(+), 42 deletions(-)

diff --git a/test/test_serialization.py b/test/test_serialization.py
index 3a18f8a45ad04..7279db8f6f27b 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -896,6 +896,24 @@ def test_meta_serialization(self, weights_only):
 
         self.assertEqual(state['weight'].size(), big_model.weight.size())
 
+    def test_serialization_python_attr(self):
+        def _test_save_load_attr(t):
+            t.foo = 'foo'
+            t.pi = 3.14
+
+            with BytesIOContext() as f:
+                torch.save(t, f)
+                f.seek(0)
+                loaded_t = torch.load(f)
+
+            self.assertEqual(t, loaded_t)
+            self.assertEqual(t.foo, loaded_t.foo)
+            self.assertEqual(t.pi, loaded_t.pi)
+
+        t = torch.zeros(3, 3)
+        _test_save_load_attr(t)
+        _test_save_load_attr(torch.nn.Parameter(t))
+
     def test_weights_only_assert(self):
         class HelloWorld:
             def __reduce__(self):
diff --git a/torch/_tensor.py b/torch/_tensor.py
index d0af241c8a221..e77324e2baf27 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -55,9 +55,6 @@ def _rebuild_from_type(func, type, args, dict):
 
 
 def _rebuild_from_type_v2(func, new_type, args, state):
-    if new_type is Tensor:
-        return func(*args)
-
     ret = func(*args)
     if type(ret) is not new_type:
         ret = ret.as_subclass(new_type)
@@ -70,21 +67,7 @@ def _rebuild_from_type_v2(func, new_type, args, state):
     ):
         ret.__setstate__(state)
     else:
-        if isinstance(state, tuple):
-            if not len(state) == 2:
-                raise RuntimeError(f"Invalid serialized state: {state}")
-            dict_state = state[0]
-            slots_state = state[1]
-        else:
-            dict_state = state
-            slots_state = None
-
-        for k, v in dict_state.items():
-            setattr(ret, k, v)
-
-        if slots_state:
-            for k, v in slots_state.items():
-                setattr(ret, k, v)
+        ret = torch._utils._set_obj_state(ret, state)
     return ret
 
 
@@ -221,31 +204,10 @@ def __deepcopy__(self, memo):
             return new_tensor
 
     def __reduce_ex__(self, proto):
-        if type(self) is Tensor:
-            return self._reduce_ex_internal(proto)
         if has_torch_function_unary(self):
             return handle_torch_function(Tensor.__reduce_ex__, (self,), self, proto)
         func, args = self._reduce_ex_internal(proto)
-        # Get the state of the python subclass
-        # This loosely mimicks the function on the object class but since Tensor do not inherit
-        # from it, we cannot call that function directly
-        # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891
-        getstate_fn = getattr(self, "__getstate__", None)
-        if getstate_fn:
-            state = getstate_fn()
-        else:
-            slots_to_save = copyreg._slotnames(self.__class__)  # type: ignore[attr-defined]
-            if slots_to_save:
-                state = (
-                    self.__dict__,
-                    {
-                        name: getattr(self, name)
-                        for name in slots_to_save
-                        if hasattr(self, name)
-                    },
-                )
-            else:
-                state = self.__dict__
+        state = torch._utils._get_obj_state(self)
         return (_rebuild_from_type_v2, (func, type(self), args, state))
 
     def storage(self):
diff --git a/torch/_utils.py b/torch/_utils.py
index 8a539d75f5657..98b1a67ec3a88 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -1,3 +1,4 @@
+import copyreg
 import sys
 import traceback
 import warnings
@@ -307,6 +308,7 @@ def _rebuild_qtensor(
     return tensor
 
 
+# Should not be used, this is kept only for BC of loading old serialized parameters
 def _rebuild_parameter(data, requires_grad, backward_hooks):
     param = torch.nn.Parameter(data, requires_grad)
     # NB: This line exists only for backwards compatibility; the
@@ -317,6 +319,62 @@ def _rebuild_parameter(data, requires_grad, backward_hooks):
     return param
 
 
+def _rebuild_parameter_v2(data, requires_grad, backward_hooks, state):
+    param = torch.nn.Parameter(data, requires_grad)
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    param._backward_hooks = backward_hooks
+
+    # Restore state on Parameter like python attr.
+    param = _set_obj_state(param, state)
+    return param
+
+
+def _get_obj_state(obj):
+    # Get the state of the python subclass
+    # This loosely mimicks the function on the object class but since Tensor do not inherit
+    # from it, we cannot call that function directly
+    # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891
+    getstate_fn = getattr(obj, "__getstate__", None)
+    if getstate_fn:
+        state = getstate_fn()
+    else:
+        slots_to_save = copyreg._slotnames(obj.__class__)  # type: ignore[attr-defined]
+        if slots_to_save:
+            state = (
+                obj.__dict__,
+                {
+                    name: getattr(obj, name)
+                    for name in slots_to_save
+                    if hasattr(obj, name)
+                },
+            )
+        else:
+            state = obj.__dict__
+
+    return state
+
+
+def _set_obj_state(obj, state):
+    if isinstance(state, tuple):
+        if not len(state) == 2:
+            raise RuntimeError(f"Invalid serialized state: {state}")
+        dict_state = state[0]
+        slots_state = state[1]
+    else:
+        dict_state = state
+        slots_state = None
+
+    for k, v in dict_state.items():
+        setattr(obj, k, v)
+
+    if slots_state:
+        for k, v in slots_state.items():
+            setattr(obj, k, v)
+    return obj
+
+
 def _import_dotted_name(name):
     components = name.split(".")
     obj = __import__(components[0])
diff --git a/torch/_weights_only_unpickler.py b/torch/_weights_only_unpickler.py
index ee00db937fc3d..498d3a607f3aa 100644
--- a/torch/_weights_only_unpickler.py
+++ b/torch/_weights_only_unpickler.py
@@ -103,6 +103,11 @@ def _get_allowed_globals():
         torch._utils._rebuild_sparse_csr_tensor,
     ]:
         rc[f"torch._utils.{f.__name__}"] = f
+
+    # Default rebuild function
+    # Handles Tensor Subclasses, Tensor's with attributes.
+    # NOTE: It calls into above rebuild functions for regular Tensor types.
+    rc["torch._tensor._rebuild_from_type_v2"] = torch._tensor._rebuild_from_type_v2
     return rc
 
 
diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp
index 7b40f138c600f..b47045aa23ae8 100644
--- a/torch/csrc/jit/serialization/unpickler.cpp
+++ b/torch/csrc/jit/serialization/unpickler.cpp
@@ -532,6 +532,21 @@ PickleOpCode Unpickler::readInstruction() {
       }
       stack_.emplace_back(std::move(tensor));
     } break;
+    case PickleOpCode::SETITEM: {
+      // At this OpCode, stack looks like
+      // | Stack Top |
+      // | ......    |
+      // | Dict      | -> (stack_size - 3)
+      // | Key       | -> (stack_size - 2)
+      // | Value     | -> (stack_size - 1)
+      auto stack_size = stack_.size();
+      auto dict_pos = stack_size - 3;
+      auto key_pos = stack_size - 2;
+      auto val_pos = stack_size - 1;
+      auto dict = stack_.at(dict_pos).toGenericDict();
+      dict.insert_or_assign(stack_.at(key_pos), stack_.at(val_pos));
+      stack_.erase(stack_.begin() + (key_pos), stack_.end());
+    } break;
     default: {
       AT_ERROR(
           "Unknown opcode for unpickling at ",
@@ -546,6 +561,23 @@ PickleOpCode Unpickler::readInstruction() {
 void Unpickler::readGlobal(
     const std::string& module_name,
     const std::string& class_name) {
+  if (this->skip_next_read_global) {
+    // See [NOTE] skip_next_read_global
+    this->skip_next_read_global--;
+    if (this->skip_next_read_global == 1) {
+      // Pass through to the correct handler
+    } else if (this->skip_next_read_global == 0) {
+      // Corresponds to the type of `Tensor` being unpickled
+      if (module_name != "torch" || class_name != "Tensor") {
+        TORCH_WARN(
+            "Trying to load a Subclassed Tensor, it will be converted to at::Tensor in C++");
+      }
+      stack_.emplace_back(int64_t(globals_.size() - 1));
+      return;
+    } else {
+      TORCH_CHECK(false, "INVALID VALUES")
+    }
+  }
   // TODO [unpickler refactor] __main__ isn't used by the pickler anymore, this
   // is only here for bc-compatibility reasons
   if (module_name == "__main__") {
@@ -631,6 +663,12 @@ void Unpickler::readGlobal(
     // Unpickle a tensor
     bool quantized = class_name == "_rebuild_qtensor";
     rebuildTensor(quantized);
+  } else if (
+      module_name == "torch._tensor" &&
+      (class_name == "_rebuild_from_type_v2")) {
+    // Unpickle a Tensor with Python attributes or
+    // a Subclassed Tensor.
+    rebuildTensorFromTypeV2();
   } else if (
       module_name == "torch._utils" && class_name == "_rebuild_sparse_tensor") {
     rebuildSparseTensor();
@@ -834,6 +872,39 @@ void Unpickler::rebuildTensor(bool quantized) {
   });
 }
 
+void Unpickler::rebuildTensorFromTypeV2() {
+  // [NOTE] skip_next_read_global
+  // When rebuilding Tensor with Python Attr or Subclassed Tensor,
+  // we receive `(func, type(self), args, state)` on stack for
+  // `rebuildTensorFromTypeV2`.
+  // Thus next call to readGlobal corresponds to `func` which is
+  // the function to rebuild the base tensor.
+  // The call after `func` to readGlobal corresponds to `type` of the
+  // Tensor where we raise warning if the type is not `torch.Tensor`.
+  this->skip_next_read_global = 2;
+  auto curr_globals_idx = globals_.size();
+  globals_.emplace_back([this, curr_globals_idx] {
+    // args is a tuple with following data
+    //  (function to rebuild base tensor, type of tensor,
+    //   arguments to construct base tensor, Python State (as dict))
+    auto args = pop(stack_).toTuple();
+    size_t tup_idx = 0;
+    const auto args_elems = args->elements();
+    auto base_tensor_args = args_elems.at(tup_idx + 2).toTuple();
+    auto py_state = args_elems.at(tup_idx + 3).toGenericDict();
+    if (py_state.size() > 0) {
+      TORCH_WARN(
+          "Loading Tensor with Python attributes will return at::Tensor with Python attributes being discarded");
+    }
+    // This calls the function to rebuild the
+    // base tensor.
+    // Eg. `rebuildTensor`, `rebuildSpareTensor`.
+    stack_.emplace_back(base_tensor_args);
+    globals_[curr_globals_idx + 1]();
+    stack_.emplace_back(pop(stack_));
+  });
+}
+
 #ifdef USE_RPC
 void Unpickler::rebuildRRef() {
   globals_.emplace_back([this] {
diff --git a/torch/csrc/jit/serialization/unpickler.h b/torch/csrc/jit/serialization/unpickler.h
index 5411d421a0c57..de00e7eacff21 100644
--- a/torch/csrc/jit/serialization/unpickler.h
+++ b/torch/csrc/jit/serialization/unpickler.h
@@ -120,6 +120,7 @@ class TORCH_API Unpickler {
       const std::string& module_name,
       const std::string& class_name);
   void rebuildTensor(bool quantized);
+  void rebuildTensorFromTypeV2();
   void rebuildSparseTensor();
 #ifdef USE_DISTRIBUTED
   void rebuildRRef();
@@ -176,6 +177,9 @@ class TORCH_API Unpickler {
 
   // See [type tag serialization]
   uint64_t version_;
+
+  // See [NOTE] skip_next_read_global
+  uint8_t skip_next_read_global = 0;
 };
 
 void restoreAccurateTypeTags(const IValue& root, const c10::TypePtr& type_tag);
diff --git a/torch/nn/parameter.py b/torch/nn/parameter.py
index e0f400f2642bf..4821adae17263 100644
--- a/torch/nn/parameter.py
+++ b/torch/nn/parameter.py
@@ -60,10 +60,12 @@ def __repr__(self):
         return 'Parameter containing:\n' + super(Parameter, self).__repr__()
 
     def __reduce_ex__(self, proto):
+        state = torch._utils._get_obj_state(self)
+
         # See Note [Don't serialize hooks]
         return (
-            torch._utils._rebuild_parameter,
-            (self.data, self.requires_grad, OrderedDict())
+            torch._utils._rebuild_parameter_v2,
+            (self.data, self.requires_grad, OrderedDict(), state)
         )
 
     __torch_function__ = _disabled_torch_function_impl

From 63412d3c5c62d641d8a61689c8f6058069ae7a23 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 18:06:04 +0000
Subject: [PATCH 0514/1922] [FSDP()] Rename to `fully_shard()` and move to
 `_composable/` (#88233)

After internal discussion, we are currently preferring `fully_shard()` as the name of the composable FSDP API.
- `FullyShardedDataParallel` (FSDP) has existing brand value, so the chosen name should try to preserve that. We think this takes precedence over the fact that composable FSDP may encompass than just the ZeRO-3 approach of _fully sharding_.
    - Given the refactoring efforts, it would also not be challenging to create a new frontend API like `hybrid_shard()` that calls into the same underlying initialization and runtime except for a different `ShardingStrategy`. In other words, we do not have to coalesce all sharding strategies under `fully_shard()`.
- The other composable APIs are verbs (`replicate()`, `checkpoint()`), so the chosen name should be a verb.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88233
Approved by: https://github.com/mrshenli
---
 .../test_fully_shard.py}                             | 12 ++++++------
 torch/distributed/_composable/__init__.py            |  1 +
 .../{fsdp/_fsdp.py => _composable/fully_shard.py}    |  2 +-
 3 files changed, 8 insertions(+), 7 deletions(-)
 rename test/distributed/{fsdp/test_composable_fsdp.py => _composable/test_fully_shard.py} (97%)
 rename torch/distributed/{fsdp/_fsdp.py => _composable/fully_shard.py} (98%)

diff --git a/test/distributed/fsdp/test_composable_fsdp.py b/test/distributed/_composable/test_fully_shard.py
similarity index 97%
rename from test/distributed/fsdp/test_composable_fsdp.py
rename to test/distributed/_composable/test_fully_shard.py
index 0e28d4f9985cd..0cf51027c29b6 100644
--- a/test/distributed/fsdp/test_composable_fsdp.py
+++ b/test/distributed/_composable/test_fully_shard.py
@@ -8,9 +8,9 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch.distributed._composable import fully_shard
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp._common_utils import _is_fsdp_flattened
-from torch.distributed.fsdp._fsdp import fully_sharded_data_parallel
 from torch.distributed.fsdp._runtime_utils import _root_pre_forward
 from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
@@ -89,7 +89,7 @@ def test_auto_wrap_policy(self):
             use_orig_params=True,
         )
         composable_module = copy.deepcopy(local_model)
-        fsdp_state = fully_sharded_data_parallel(
+        fsdp_state = fully_shard(
             composable_module,
             auto_wrap_policy=Model.auto_wrap_policy(),
         )
@@ -136,7 +136,7 @@ def test_device_id(self):
         composable_module = Model(device=cpu_device)
         for param in composable_module.parameters():
             assert param.device == cpu_device
-        fully_sharded_data_parallel(
+        fully_shard(
             composable_module,
             auto_wrap_policy=Model.auto_wrap_policy(),
             device_id=self.rank,
@@ -160,7 +160,7 @@ def test_sync_module_states(self):
             auto_wrap_policy=Model.auto_wrap_policy(),
             use_orig_params=True,
         )
-        fully_sharded_data_parallel(
+        fully_shard(
             composable_module,
             auto_wrap_policy=Model.auto_wrap_policy(),
             sync_module_states=True,
@@ -201,7 +201,7 @@ def _param_init_fn(module: nn.Module):
             param_init_fn=_param_init_fn,
             use_orig_params=True,
         )
-        fully_sharded_data_parallel(
+        fully_shard(
             composable_module,
             auto_wrap_policy=Model.auto_wrap_policy(),
             param_init_fn=_param_init_fn,
@@ -231,7 +231,7 @@ def test_training(self):
             use_orig_params=True,
         )
         composable_module = copy.deepcopy(local_model)
-        fsdp_state = fully_sharded_data_parallel(
+        fsdp_state = fully_shard(
             composable_module,
             auto_wrap_policy=Model.auto_wrap_policy(),
         )
diff --git a/torch/distributed/_composable/__init__.py b/torch/distributed/_composable/__init__.py
index 9098016534f90..5b0d8e77e5ccf 100644
--- a/torch/distributed/_composable/__init__.py
+++ b/torch/distributed/_composable/__init__.py
@@ -1,2 +1,3 @@
 from .checkpoint_activation import checkpoint
 from .contract import contract
+from .fully_shard import fully_shard
diff --git a/torch/distributed/fsdp/_fsdp.py b/torch/distributed/_composable/fully_shard.py
similarity index 98%
rename from torch/distributed/fsdp/_fsdp.py
rename to torch/distributed/_composable/fully_shard.py
index 8c99a05a301a6..6c12e9c1b1001 100644
--- a/torch/distributed/fsdp/_fsdp.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -26,7 +26,7 @@
 )
 
 
-def fully_sharded_data_parallel(
+def fully_shard(
     module: nn.Module,
     process_group: Optional[dist.ProcessGroup] = None,
     sharding_strategy: Optional[ShardingStrategy] = None,

From 98979dd6d3d349a6594f68338f46592e3d07ecce Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 18:06:04 +0000
Subject: [PATCH 0515/1922] [FSDP()][Easy] Rename `_State` to `_FSDPState`
 (#88234)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88234
Approved by: https://github.com/mrshenli
---
 torch/distributed/_composable/fully_shard.py |  8 +--
 torch/distributed/fsdp/_common_utils.py      | 12 ++--
 torch/distributed/fsdp/_exec_order_utils.py  |  4 +-
 torch/distributed/fsdp/_init_utils.py        | 38 ++++++-------
 torch/distributed/fsdp/_runtime_utils.py     | 60 ++++++++++----------
 5 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/torch/distributed/_composable/fully_shard.py b/torch/distributed/_composable/fully_shard.py
index 6c12e9c1b1001..422c142061845 100644
--- a/torch/distributed/_composable/fully_shard.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -3,7 +3,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed.fsdp._common_utils import _State, FSDPState
+from torch.distributed.fsdp._common_utils import _FSDPState, ComposableFSDPState
 from torch.distributed.fsdp._init_utils import (
     _init_buffer_state,
     _init_core_state,
@@ -37,11 +37,11 @@ def fully_shard(
     device_id: Optional[Union[int, torch.device]] = None,
     param_init_fn: Optional[Callable[[nn.Module], None]] = None,
     sync_module_states: bool = False,
-) -> FSDPState:
+) -> ComposableFSDPState:
     """
     Applies ``FullyShardedDataParallel` (FSDP) semantics to ``module``.
     """
-    state = cast(_State, FSDPState())
+    state = cast(_FSDPState, ComposableFSDPState())
     state = _init_ignored_module_states(state, module, ignored_modules)
     state = _init_process_group_state(state, process_group)
     limit_all_gathers = True
@@ -73,4 +73,4 @@ def fully_shard(
     modules = list(module.modules())
     _register_pre_forward_hooks(state, modules)
     _register_post_forward_hooks(state, modules)
-    return cast(FSDPState, state)
+    return cast(ComposableFSDPState, state)
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index ad58618b2b2d4..44c77150a6ae6 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -18,15 +18,15 @@
 FSDP_FLATTENED = "_fsdp_flattened"
 
 
-class FSDPState:
+class ComposableFSDPState:
     """
-    This encompasses all FSDP state.
+    This encompasses all FSDP state for composable FSDP.
     """
 
 
 # We leverage Python's dynamic attribute definition to unify the state
 # management for the wrapper and non-wrapper approaches.
-_State = Union[nn.Module, FSDPState]
+_FSDPState = Union[nn.Module, ComposableFSDPState]
 
 
 class TrainingState(Enum):
@@ -51,13 +51,13 @@ class HandleTrainingState(Enum):
     SUMMON_FULL_PARAMS = auto()
 
 
-def _is_composable(state: _State):
+def _is_composable(state: _FSDPState):
     # TODO: This is a temporary hack for differentiate between code paths.
     return not isinstance(state, nn.Module)
 
 
 @no_type_check
-def _all_handles(state: _State) -> List:
+def _all_handles(state: _FSDPState) -> List:
     return (
         state._handles
         if _is_composable(state)
@@ -168,7 +168,7 @@ def f(module: torch.nn.Module, prefix: str, *args, **kwargs):
 
 @no_type_check
 def _assert_in_training_states(
-    state: _State,
+    state: _FSDPState,
     training_states: List[TrainingState],
 ) -> None:
     """Asserts that FSDP is in the states ``_training_states``."""
diff --git a/torch/distributed/fsdp/_exec_order_utils.py b/torch/distributed/fsdp/_exec_order_utils.py
index e9c7a9d5f5a84..a7082113a3859 100644
--- a/torch/distributed/fsdp/_exec_order_utils.py
+++ b/torch/distributed/fsdp/_exec_order_utils.py
@@ -8,8 +8,8 @@
 import torch.nn as nn
 from torch.distributed.fsdp._common_utils import (
     _all_handles,
+    _FSDPState,
     _get_param_to_fqns,
-    _State,
 )
 from torch.distributed.fsdp.flat_param import FlatParameter, FlatParamHandle
 
@@ -74,7 +74,7 @@ def __init__(
 
     def init(
         self,
-        state: _State,
+        state: _FSDPState,
         root_module: nn.Module,
         process_group: dist.ProcessGroup,
     ) -> None:
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 2eb351fe83875..c89f65c3a5b82 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -21,9 +21,9 @@
 from torch.distributed.algorithms._comm_hooks import default_hooks
 from torch.distributed.distributed_c10d import _get_default_group
 from torch.distributed.fsdp._common_utils import (
+    _FSDPState,
     _get_param_to_fqns,
     _is_fsdp_flattened,
-    _State,
     clean_tensor_name,
     TrainingState,
 )
@@ -69,9 +69,9 @@
 
 @no_type_check
 def _init_process_group_state(
-    state: _State,
+    state: _FSDPState,
     process_group: Optional[dist.ProcessGroup],
-) -> _State:
+) -> _FSDPState:
     state.process_group = process_group or _get_default_group()
     state.rank = state.process_group.rank()
     state.world_size = state.process_group.size()
@@ -80,10 +80,10 @@ def _init_process_group_state(
 
 @no_type_check
 def _init_ignored_module_states(
-    state: _State,
+    state: _FSDPState,
     module: nn.Module,
     ignored_modules: Optional[Iterable[torch.nn.Module]],
-) -> _State:
+) -> _FSDPState:
     state._ignored_modules = _get_ignored_modules(module, ignored_modules)
     state._ignored_params, state._ignored_param_names = _get_ignored_params(
         module,
@@ -99,9 +99,9 @@ def _init_ignored_module_states(
 
 @no_type_check
 def _init_buffer_state(
-    state: _State,
+    state: _FSDPState,
     module: nn.Module,
-) -> _State:
+) -> _FSDPState:
     state._buffer_names = _get_buffer_names(module)
     # Save a mapping from clean fully-qualified buffer name (starting from
     # `module`) to its original dtype for restoring that dtype during model
@@ -117,7 +117,7 @@ def _init_buffer_state(
 
 @no_type_check
 def _init_core_state(
-    state: _State,
+    state: _FSDPState,
     sharding_strategy: Optional[ShardingStrategy],
     mixed_precision: Optional[MixedPrecision],
     cpu_offload: Optional[CPUOffload],
@@ -125,7 +125,7 @@ def _init_core_state(
     use_orig_params: bool,
     backward_prefetch_limit: int,
     forward_prefetch_limit: int,
-) -> _State:
+) -> _FSDPState:
     # We clamp the strategy to `NO_SHARD` for world size of 1 since they are
     # currently functionally equivalent. This may change if/when we integrate
     # FSDP with MoE.
@@ -162,8 +162,8 @@ def _init_core_state(
 
 @no_type_check
 def _init_runtime_state(
-    state: _State,
-) -> _State:
+    state: _FSDPState,
+) -> _FSDPState:
     _root_pre_forward_handles: List[RemovableHandle] = []
     state._root_pre_forward_handles = _root_pre_forward_handles
     _pre_forward_handles: List[RemovableHandle] = []
@@ -186,10 +186,10 @@ def _init_runtime_state(
 
 @no_type_check
 def _init_prefetching_state(
-    state: _State,
+    state: _FSDPState,
     backward_prefetch: BackwardPrefetch,
     forward_prefetch: bool,
-) -> _State:
+) -> _FSDPState:
     state.backward_prefetch = backward_prefetch
     state.forward_prefetch = forward_prefetch
     _handles_prefetched: Dict[_HandlesKey, bool] = {}
@@ -205,20 +205,20 @@ def _init_prefetching_state(
     return state
 
 
-def _init_state_dict_state(state: _State) -> _State:
+def _init_state_dict_state(state: _FSDPState) -> _FSDPState:
     # TODO: after rebase
     return state
 
 
 @no_type_check
 def _init_param_handle_from_module(
-    state: _State,
+    state: _FSDPState,
     root_module: nn.Module,
     device_id: Optional[Union[int, torch.device]],
     param_init_fn: Optional[Callable[[nn.Module], None]],
     sync_module_states: bool,
     module_wrapper_cls: Type,
-) -> _State:
+) -> _FSDPState:
     """
     Initializes a ``FlatParamHandle`` from a module ``root_module``. This is
     the module wrapper code path.
@@ -252,13 +252,13 @@ def _init_param_handle_from_module(
 
 @no_type_check
 def _init_param_handles_from_module(
-    state: _State,
+    state: _FSDPState,
     root_module: nn.Module,
     auto_wrap_policy: Callable,
     device_id: Optional[Union[int, torch.device]],
     param_init_fn: Optional[Callable[[nn.Module], None]],
     sync_module_states: bool,
-) -> _State:
+) -> _FSDPState:
     """
     Initializes all ``FlatParamHandle`` s from a module ``root_module``. This
     is the non-module-wrapper code path.
@@ -311,7 +311,7 @@ def _init_param_handles_from_module(
 
 @no_type_check
 def _init_param_handle_from_params(
-    state: _State,
+    state: _FSDPState,
     params: List[nn.Parameter],
     root_module: nn.Module,
 ):
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 254d41a1d5be1..3127881de34a2 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -10,8 +10,8 @@
 from torch.distributed.fsdp._common_utils import (
     _all_handles,
     _assert_in_training_states,
+    _FSDPState,
     _is_composable,
-    _State,
     TrainingState,
 )
 from torch.distributed.fsdp._utils import _apply_to_tensors, p_assert
@@ -28,9 +28,9 @@
 
 @no_type_check
 def _lazy_init(
-    state: _State,
+    state: _FSDPState,
     root_module: nn.Module,
-) -> _State:
+) -> _FSDPState:
     """
     Performs initialization lazily, typically right before the first forward
     pass. The laziness is needed to ensure that the parameter device/dtype and
@@ -98,8 +98,8 @@ def _lazy_init(
 
 @no_type_check
 def _init_streams(
-    state: _State,
-) -> _State:
+    state: _FSDPState,
+) -> _FSDPState:
     """
     Initializes CUDA streams for overlapping communication, computation, and
     data transfers. The streams should be shared across FSDP instances.
@@ -119,7 +119,7 @@ def _init_streams(
 
 @no_type_check
 def _unshard(
-    state: _State,
+    state: _FSDPState,
     handles: List[FlatParamHandle],
     unshard_stream: torch.cuda.Stream,
     pre_unshard_stream: torch.cuda.Stream,
@@ -153,7 +153,7 @@ def _unshard(
 
 @no_type_check
 def _reshard(
-    state: _State,
+    state: _FSDPState,
     handles: List[FlatParamHandle],
     free_unsharded_flat_params: List[bool],
 ):
@@ -203,7 +203,7 @@ def _reshard_grads(
 
 @no_type_check
 def _pre_forward(
-    state: _State,
+    state: _FSDPState,
     handles: List[FlatParamHandle],
     unshard_fn: Callable,
     module: nn.Module,
@@ -237,7 +237,7 @@ def _pre_forward(
 
 @no_type_check
 def _pre_forward_unshard(
-    state: _State,
+    state: _FSDPState,
     handles: List[FlatParamHandle],
 ) -> None:
     """Unshards parameters in the pre-forward."""
@@ -252,7 +252,7 @@ def _pre_forward_unshard(
 
 @no_type_check
 def _post_forward(
-    state: _State,
+    state: _FSDPState,
     handles: List[FlatParamHandle],
     reshard_fn: Callable,
     module: nn.Module,
@@ -292,7 +292,7 @@ def _post_forward(
 
 @no_type_check
 def _post_forward_reshard(
-    state: _State,
+    state: _FSDPState,
     handles: List[FlatParamHandle],
 ) -> None:
     """Reshards parameters in the post-forward."""
@@ -311,7 +311,7 @@ def _post_forward_reshard(
 
 @no_type_check
 def _root_pre_forward(
-    state: _State,
+    state: _FSDPState,
     module: nn.Module,
     *args,
     **kwargs,
@@ -403,7 +403,7 @@ def cast_fn(x: torch.Tensor) -> torch.Tensor:
 
 @no_type_check
 def _pre_backward_hook(
-    state: _State,
+    state: _FSDPState,
     _handles: List[FlatParamHandle],
     *unused: Any,
 ) -> Any:
@@ -456,7 +456,7 @@ def _pre_backward_hook(
 @no_type_check
 @torch.no_grad()
 def _post_backward_hook(
-    state: _State,
+    state: _FSDPState,
     handle: FlatParamHandle,
     *unused: Any,
 ):
@@ -590,7 +590,7 @@ def _post_backward_hook(
 
 @no_type_check
 def _should_free_in_backward(
-    state: _State,
+    state: _FSDPState,
     handle: FlatParamHandle,
 ) -> bool:
     """
@@ -604,7 +604,7 @@ def _should_free_in_backward(
 
 @no_type_check
 def _cast_grad_to_param_dtype(
-    state: _State,
+    state: _FSDPState,
     handle: FlatParamHandle,
     sharded_grad: torch.Tensor,
     param: FlatParameter,
@@ -662,14 +662,14 @@ def _check_grad_to_accumulate(
 
 
 @no_type_check
-def _low_precision_hook_enabled(state: _State) -> bool:
+def _low_precision_hook_enabled(state: _FSDPState) -> bool:
     return state._communication_hook in LOW_PRECISION_HOOKS
 
 
 @no_type_check
 @torch.no_grad()
 def _post_backward_final_callback(
-    state: _State,
+    state: _FSDPState,
 ):
     """
     This waits for the post-backward to finish and performs some final cleanup.
@@ -705,7 +705,7 @@ def _post_backward_final_callback(
 
 @no_type_check
 def _catch_all_reshard(
-    state: _State,
+    state: _FSDPState,
 ) -> None:
     """
     Reshards the parameters that may not have been resharded in the
@@ -742,7 +742,7 @@ def _catch_all_reshard(
 
 @no_type_check
 def _finalize_params(
-    state: _State,
+    state: _FSDPState,
 ) -> None:
     """Finalizes the parameters before the next iteration."""
     for handle in state._handles:
@@ -771,7 +771,7 @@ def _finalize_params(
 
 @no_type_check
 def _prefetch_handles(
-    state: _State,
+    state: _FSDPState,
     current_handles_key: _HandlesKey,
 ) -> None:
     """
@@ -792,7 +792,7 @@ def _prefetch_handles(
 
 @no_type_check
 def _get_handles_to_prefetch(
-    state: _State,
+    state: _FSDPState,
     current_handles_key: _HandlesKey,
 ) -> List[_HandlesKey]:
     """
@@ -858,7 +858,7 @@ def _get_training_state(
 
 @no_type_check
 def _register_pre_forward_hooks(
-    state: _State,
+    state: _FSDPState,
     modules: Iterable[nn.Module],
 ) -> None:
     """
@@ -886,7 +886,7 @@ def _register_pre_forward_hooks(
 
 @no_type_check
 def _register_post_forward_hooks(
-    state: _State,
+    state: _FSDPState,
     modules: Iterable[nn.Module],
 ) -> None:
     """
@@ -917,7 +917,7 @@ def _register_post_forward_hooks(
 
 @no_type_check
 def _register_root_pre_forward_hooks(
-    state: _State,
+    state: _FSDPState,
     modules: Iterable[nn.Module],
 ):
     """
@@ -939,7 +939,7 @@ def _register_root_pre_forward_hooks(
 
 @no_type_check
 def _register_pre_backward_hooks(
-    state: _State,
+    state: _FSDPState,
     outputs: Any,
     handles: List[FlatParamHandle],
 ) -> None:
@@ -976,7 +976,7 @@ def _register_hook(t: torch.Tensor) -> torch.Tensor:
 
 
 def _register_post_backward_hooks(
-    state: _State,
+    state: _FSDPState,
     handles: List[FlatParamHandle],
 ) -> None:
     """
@@ -1015,7 +1015,7 @@ def _register_post_backward_hooks(
 
 
 @no_type_check
-def _register_post_backward_final_callback(state: _State) -> None:
+def _register_post_backward_final_callback(state: _FSDPState) -> None:
     """
     Registers the post-backward final callback that runs at the end of the
     backward pass. This should be called from the root FSDP instance at the
@@ -1066,7 +1066,7 @@ def _clear_grads_if_needed(
 
 @no_type_check
 def _get_buffers_and_dtypes_for_computation(
-    state: _State,
+    state: _FSDPState,
     root_module: nn.Module,
 ) -> Tuple[List[torch.Tensor], List[Optional[torch.dtype]]]:
     """
@@ -1102,7 +1102,7 @@ def _get_buffers_and_dtypes_for_computation(
 
 @no_type_check
 def _get_buffers_and_dtypes_for_checkpoint(
-    state: _State,
+    state: _FSDPState,
     root_module: nn.Module,
 ) -> Tuple[List[torch.Tensor], List[torch.dtype]]:
     """

From 095b10a55eba427409eb52a5325157950fc58e4c Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 20:34:41 +0000
Subject: [PATCH 0516/1922] [FSDP] Do not include empty state in
 `_flatten_optim_state_dict()` (#88353)

https://github.com/pytorch/pytorch/blob/983c0e7f3101f1543bed6c4ec1539a4d590a94c0/torch/optim/adam.py#L163
The above line requires that a candidate optimizer state dict being loaded via `load_state_dict()` has non-empty state for its 0th parameter (via `state_values[0]`). This PR changes FSDP to only include non-empty mappings in the state returned by `_flatten_optim_state_dict()`, which is the subroutine for both `shard_full_optim_state_dict()` and `flatten_sharded_optim_state_dict()`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88353
Approved by: https://github.com/fegin
---
 .../distributed/fsdp/test_fsdp_optim_state.py | 66 +++++++++++++++++--
 torch/distributed/fsdp/_optim_utils.py        |  5 +-
 2 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_optim_state.py b/test/distributed/fsdp/test_fsdp_optim_state.py
index 883ef7285cfe5..5b714fe65c265 100644
--- a/test/distributed/fsdp/test_fsdp_optim_state.py
+++ b/test/distributed/fsdp/test_fsdp_optim_state.py
@@ -6,6 +6,7 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 
 import torch
+import torch.nn as nn
 from torch import distributed as dist
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     _CHECKPOINT_WRAPPED_MODULE,
@@ -31,7 +32,7 @@
     TEST_WITH_DEV_DBG_ASAN,
 )
 
-STATE_DICT_TYPE = [StateDictType.FULL_STATE_DICT, StateDictType.SHARDED_STATE_DICT]
+STATE_DICT_TYPES = [StateDictType.FULL_STATE_DICT, StateDictType.SHARDED_STATE_DICT]
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
@@ -490,7 +491,7 @@ def _check_state_device(self, osd: Dict[str, Any], on_gpu: bool):
                         self.assertFalse(value.is_cuda)
 
     @skip_if_lt_x_gpu(2)
-    @parametrize("state_dict_type", STATE_DICT_TYPE)
+    @parametrize("state_dict_type", STATE_DICT_TYPES)
     @parametrize("use_multiple_param_groups", [False, True])
     @parametrize("rank0_only", [False, True])
     @parametrize("use_diff_optim_inputs", [False, True])
@@ -502,7 +503,7 @@ def test_optim_state_dict_nested(
         use_diff_optim_inputs: bool,
     ) -> None:
         """
-        Tests :meth:`full_optim_state_dict` and `sharded_optim_state_dict`
+        Tests :meth:`full_optim_state_dict` and meth:`sharded_optim_state_dict`
         by comparing the returned dict for an FSDP-wrapped model with that of
         an equivalent non-wrapped model.
 
@@ -956,7 +957,7 @@ def _test_load_optim_state(
             self._step_model(model2, optim2, num_iters=NUM_ITERS)
 
     @skip_if_lt_x_gpu(2)
-    @parametrize("state_dict_type", STATE_DICT_TYPE)
+    @parametrize("state_dict_type", STATE_DICT_TYPES)
     @parametrize("add_to_fsdp_module", [False, True])
     def test_shard_full_optim_state_dict_unmanaged_params(
         self,
@@ -1086,7 +1087,7 @@ def _test_shard_full_optim_state_dict_unmanaged_params(
             optim.load_state_dict(flattened_osd)
 
     @skip_if_lt_x_gpu(2)
-    @parametrize("state_dict_type", STATE_DICT_TYPE)
+    @parametrize("state_dict_type", STATE_DICT_TYPES)
     @parametrize("use_multiple_param_groups", [False, True])
     def test_rekey_optim_state_dict_to_ids(
         self,
@@ -1382,6 +1383,61 @@ def _run_on_all_optim_state_apis(
                     optim_input=nonwrapped_optim_input,
                 )
 
+    @skip_if_lt_x_gpu(2)
+    @parametrize("state_dict_type", STATE_DICT_TYPES)
+    def test_save_load_without_0th_param_state(self, state_dict_type: StateDictType):
+        """
+        Tests saving and loading an optim state dict for Adam optimizer (i.e.
+        any optimizer with a "step" key in its state) when the first parameter
+        does not have optimizer state (e.g. unused or frozen).
+        """
+
+        class Model(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.lin1 = nn.Linear(5, 5)
+                self.lin2 = nn.Linear(5, 5)
+                self.relu = nn.ReLU()
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                # Do not use `lin1`, which is the parameter passed to the
+                # optimizer and the one checked for "step" state to see if it
+                # is tensor or float
+                return self.relu(self.lin2(x))
+
+        model = Model().cuda()
+        model.lin1 = FSDP(model.lin1)
+        model.lin2 = FSDP(model.lin2)
+        fsdp_model = FSDP(model)
+        optim = torch.optim.Adam(
+            fsdp_model.parameters(), lr=1e-2
+        )  # or any optimizer with "step"
+
+        # Run an iteration to construct optimizer state
+        device = torch.device("cuda")
+        inp = torch.randn((2, 5), device=device)
+        loss = fsdp_model(inp).sum()
+        loss.backward()
+        optim.step()
+
+        # Check that save and load does not error
+        if state_dict_type == StateDictType.FULL_STATE_DICT:
+            fsdp_osd = FSDP.full_optim_state_dict(fsdp_model, optim, rank0_only=False)
+            flattened_osd = FSDP.shard_full_optim_state_dict(fsdp_osd, fsdp_model)
+        elif state_dict_type == StateDictType.SHARDED_STATE_DICT:
+            fsdp_osd = FSDP.sharded_optim_state_dict(fsdp_model, optim)
+            flattened_osd = FSDP.flatten_sharded_optim_state_dict(fsdp_osd, fsdp_model)
+        optim.load_state_dict(flattened_osd)
+        # `__setstate__()` will check the 0th parameter to see if "step" is
+        # represented as a tensor or float, so it is imperative that its state
+        # is non-empty.
+
+        # Run an iteration as a sanity check
+        inp = torch.randn((2, 5), device=device)
+        loss = fsdp_model(inp).sum()
+        loss.backward()
+        optim.step()
+
 
 instantiate_parametrized_tests(TestFSDPOptimState)
 
diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index d5f86a1a5e045..530a8480d5522 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -312,7 +312,10 @@ def _flatten_optim_state_dict(
                 shard_state,
             )
             key = _OptimStateKey(tuple(unflat_param_names), True)
-            flat_osd_state[key] = flat_state
+            if flat_state:
+                # Only include non-empty states since as expected by
+                # `torch.optim.Optimizer` s
+                flat_osd_state[key] = flat_state
         else:  # do not flatten non-FSDP parameters' states
             assert len(unflat_param_names) == 1
             unflat_param_name = unflat_param_names[0]

From aa1acf7a6d66ed67efb53db8fafe4667e54d9ba3 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 2 Nov 2022 18:55:33 -0700
Subject: [PATCH 0517/1922] Unconditionally enable python dispatcher in
 AOTAutograd (#88365)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88365
Approved by: https://github.com/Chillee
---
 functorch/_src/aot_autograd.py | 37 +++++++++++++++++-----------------
 torch/_ops.py                  |  2 +-
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index d4663c6dc71af..e0a9e10d11bea 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -391,24 +391,25 @@ def add_dupe_args(args):
     disable_amp = torch._C._is_any_autocast_enabled()
 
     if config.use_functionalize:
-        # Trace once without decompositions, into a graph of ATen ops.
-        # NB: tracing_mode is real, as it's assumed the calling context setup
-        # fake tensor mode / symbolic shapes if that is needed
-        fx_g = make_fx(joint_forward_backward)(*joint_inputs)
-
-        context = disable_autocast_manager if disable_amp else nullcontext
-
-        def fake_fn(primals, tangents):
-            with torch.fx.traceback.override_stack_trace():
-                return torch.fx.Interpreter(fx_g).run(primals, tangents)
-
-        # Trace a second time, running functionalization, and THEN running decompositions.
-        # functionalization only acts on ATen today, and doesn't currently handle
-        # view and inplace ops that come from primtorch.
-        # Eventually, functionalization should support primtorch view/inplace ops,
-        # which will make it ok to run decompositions before functionalization.
-        with context():
-            fx_g = make_fx(functionalize(fake_fn), aot_config.decompositions)(*joint_inputs)
+        with enable_python_dispatcher():
+            # Trace once without decompositions, into a graph of ATen ops.
+            # NB: tracing_mode is real, as it's assumed the calling context setup
+            # fake tensor mode / symbolic shapes if that is needed
+            fx_g = make_fx(joint_forward_backward)(*joint_inputs)
+
+            context = disable_autocast_manager if disable_amp else nullcontext
+
+            def fake_fn(primals, tangents):
+                with torch.fx.traceback.override_stack_trace():
+                    return torch.fx.Interpreter(fx_g).run(primals, tangents)
+
+            # Trace a second time, running functionalization, and THEN running decompositions.
+            # functionalization only acts on ATen today, and doesn't currently handle
+            # view and inplace ops that come from primtorch.
+            # Eventually, functionalization should support primtorch view/inplace ops,
+            # which will make it ok to run decompositions before functionalization.
+            with context():
+                fx_g = make_fx(functionalize(fake_fn), aot_config.decompositions)(*joint_inputs)
         fx_g.graph.eliminate_dead_code()
         fx_g.recompile()
     else:
diff --git a/torch/_ops.py b/torch/_ops.py
index ed0276d0ada2f..a4119d758524f 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -103,7 +103,7 @@ def resolve_key(op: PyOperatorABC, k: DispatchKey):  # type: ignore[valid-type]
         # The dispatch key itself will implicitly route to backend fallback.
         # This is probably not great for the pure Python implementation.
         return k
-    raise RuntimeError("could not find kernel")
+    raise NotImplementedError(f"could not find kernel for {op} at dispatch key {k}")
 
 
 pyop_namespace = {}

From 86d9a3a3b83787994b1cda18547571502b27f3f1 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 2 Nov 2022 20:44:17 -0700
Subject: [PATCH 0518/1922] Put Python Dispatcher cache in dict, clear it on
 new registrations. (#88329)

The motivation is that I am going to add the ability to temporarily
install entries to the python dispatcher, and to do that, I need
an easier way to clear the cache.  Putting the cache in a dict
centralizes cache clearing in one place.  I then add some easy
cache clearing.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88329
Approved by: https://github.com/albanD
---
 torch/_ops.py                           | 21 ++++++++++-----------
 torch/csrc/autograd/python_variable.cpp | 13 +++++++++++--
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/torch/_ops.py b/torch/_ops.py
index a4119d758524f..feb05771b4931 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -243,6 +243,8 @@ def __init__(self, overloadpacket, op, op_dk, schema, tags):
         op.__module__ = overloadpacket.__module__
         self.__qualname__ = self._name
         self.__annotations__ = {}
+        # NB: This name is hard-coded in torch/csrc/autograd/python_variable.cpp
+        self._dispatch_cache = {}
 
     # it's a no-op since OpOverload object is immutable and must be unique for a given op overload.
     def __deepcopy__(self, memo=None):
@@ -289,6 +291,7 @@ def inner(fn):
                 assert mode not in self.python_key_mode_table
                 # TODO(voz): Should we replace setting torch._C.DispatchKey.Python entirely with setting mode keys?
                 self.python_key_mode_table[mode] = fn
+                self._dispatch_cache.clear()
                 return fn
 
             assert isinstance(dispatch_key_or_mode, torch._C.DispatchKey)
@@ -301,23 +304,19 @@ def inner(fn):
                     f"Trying to override a python impl for {dispatch_key_or_mode} on operator {self._name}"
                 )
             self.py_kernels[dispatch_key_or_mode] = fn
+            self._dispatch_cache.clear()
             return fn
 
         return inner
 
     # This implements the pre-computation logic for the Python dispatcher.
-    def __getattr__(self, attr):
-        if len(attr) == 0 or not attr[0].isupper():
-            raise AttributeError()
-
-        try:
-            key = torch._C._dispatch_key_parse(attr)
-        except Exception as e:
-            raise AttributeError()
+    def _get_dispatch(self, key):
+        # This is only called upon a cache miss
+        assert key not in self._dispatch_cache
 
         if key == torch._C.DispatchKey.Python:
             if not self.python_key_mode_table:
-                setattr(self, attr, key)
+                self._dispatch_cache[key] = key
                 return key
 
             def handler(*args, **kwargs):
@@ -336,12 +335,12 @@ def handler(*args, **kwargs):
                 # TODO(voz): The idea behind this is that we do not yet support dispatch by key + mode, only key.
                 return self.python_key_mode_table[curr_mode](*args, **kwargs)
 
-            setattr(self, attr, handler)
+            self._dispatch_cache[key] = handler
             return handler
 
         key = resolve_key(self, key)
         r = self.py_kernels.get(key, key)
-        setattr(self, attr, r)
+        self._dispatch_cache[key] = r
         return r
 
     def name(self):
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index d992936ba3615..d1580d1af5799 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -2264,11 +2264,20 @@ void ConcretePyInterpreterVTable::python_dispatcher(
     torch::jit::Stack* stack) const {
   py::gil_scoped_acquire g;
   py::handle torch_api_function_overload = getTorchApiFunction(op);
+  // TODO: if necessary, can optimize to cache the cache lookup
+  // TODO: if necessary, can optimize OpOverload to have slots
+  auto cache = py::dict(torch_api_function_overload.attr("_dispatch_cache"));
+  if (cache.ptr() == nullptr) {
+    throw python_error();
+  }
 
   c10::DispatchKey k = ks.highestPriorityTypeId();
-  auto handler = torch_api_function_overload.attr(toString(k));
+  // TODO: allow this to be non-owning
+  auto handler = py::reinterpret_borrow<py::object>(
+      PyDict_GetItem(cache.ptr(), py::cast(k).ptr()));
   if (handler.ptr() == nullptr) {
-    throw python_error();
+    // Slow path
+    handler = torch_api_function_overload.attr("_get_dispatch")(k);
   }
   if (py::isinstance<c10::DispatchKey>(handler)) {
     // NB: not redispatch, as that will permanently remove the python

From 1cfb4b7a6e5161f00484d3fff025f86b371b6550 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 2 Nov 2022 20:44:18 -0700
Subject: [PATCH 0519/1922] Add a reshape_copy operator. (#88314)

The semantics is "as if" you did a reshape, but it always copied
even if the input was directly view'able.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88314
Approved by: https://github.com/albanD
---
 aten/src/ATen/native/TensorShape.cpp       | 17 +++++++++++++++++
 aten/src/ATen/native/native_functions.yaml |  5 +++++
 tools/autograd/derivatives.yaml            |  4 ++++
 tools/autograd/gen_python_functions.py     |  1 +
 tools/autograd/gen_variable_type.py        |  1 +
 5 files changed, 28 insertions(+)

diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 1895a227d3389..31b4011c12813 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -1574,6 +1574,23 @@ Tensor reshape_symint(const Tensor& self, c10::SymIntArrayRef proposed_shape) {
   return at::_unsafe_view_symint(self.clone(at::MemoryFormat::Contiguous), shape);
 }
 
+Tensor _reshape_copy_symint(const Tensor& self, c10::SymIntArrayRef proposed_shape) {
+  if (self.is_sparse()) {
+    TORCH_CHECK(0, "_reshape_copy is not implemented for sparse tensors");
+  }
+  c10::SymDimVector shape = infer_size_dv(proposed_shape, self.sym_numel());
+
+  if (self.is_mkldnn()) {
+    TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tesnors");
+  }
+
+  if (self.is_contiguous()) {
+    return self.view_symint(shape).clone(at::MemoryFormat::Contiguous);
+  } else {
+    return at::_unsafe_view_symint(self.clone(at::MemoryFormat::Contiguous), shape);
+  }
+}
+
 // Duplicate of above code for non-symbolic ints. Kept for BC purposes and to
 // minimize breakages.
 Tensor reshape(const Tensor& self, IntArrayRef proposed_shape) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index f374b498133ee..2f757965f4e75 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4306,6 +4306,11 @@
     CompositeImplicitAutograd: reshape_symint
     CompositeImplicitAutogradNestedTensor: reshape_nested
 
+- func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _reshape_copy_symint
+
 # NOTE [ _reshape_alias ] is meant to be used in the implementation of reshape.
 # They are not user-facing, hence the leading underscore. Please don't use it
 # anywhere else.
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 8950ce8ec64f8..018f1b9a280fa 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2984,3 +2984,7 @@
 
 - name: special_spherical_bessel_j0(Tensor x) -> Tensor
   x: non_differentiable
+
+- name: _reshape_copy(Tensor self, SymInt[] size) -> Tensor
+  self: grad.reshape_symint(self.sym_sizes())
+  result: auto_linear
diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py
index f90ec74459de4..ee06a8ed12384 100644
--- a/tools/autograd/gen_python_functions.py
+++ b/tools/autograd/gen_python_functions.py
@@ -124,6 +124,7 @@
     "_local_scalar_dense",
     "to",
     "_to_copy",
+    "_reshape_copy",
     "copy_sparse_to_sparse_",
     "copy_",
     "numpy_T",
diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py
index 2feb84bbd088d..3d1ff895c837f 100644
--- a/tools/autograd/gen_variable_type.py
+++ b/tools/autograd/gen_variable_type.py
@@ -348,6 +348,7 @@
     "conj_physical_",
     "_neg_view",
     "_reshape_alias",
+    "_reshape_copy",
     "_linalg_det",
     "lu_solve",
     "linalg_solve_triangular",

From 5bbe0b6e91fbe5052b9ec243ef9f05bec58d0e27 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 2 Nov 2022 19:08:07 -0700
Subject: [PATCH 0520/1922] Make Python op registration work with
 torchdeploy/multipy (#87162)

See strategy at PythonOpRegistrationTrampoline.cpp for the
big picture.

Along the way, I made OperatorHandle support == and hashing,
and slightly changed the low level python_dispatch impl API
to disallow empty strings for dispatch key, which had the knock
on effect of requiring us to explicitly make sure we pass in
CompositeImplicitAutograd if we would have passed in "" (I didn't apply
this to the rest of the file because I'm lazy.)

Test strategy is we delete the logic for preventing Python op
registrations in torch from being skipped in a torchdeploy context
and show CI still works.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87162
Approved by: https://github.com/anjali411, https://github.com/bdhirsh
---
 .../core/PythonOpRegistrationTrampoline.cpp   |  28 +++
 .../core/PythonOpRegistrationTrampoline.h     |  18 ++
 aten/src/ATen/core/dispatch/Dispatcher.cpp    |  45 ++++-
 aten/src/ATen/core/dispatch/Dispatcher.h      |  41 +++++
 aten/src/ATen/core/library.cpp                |  66 ++++---
 c10/core/TensorImpl.h                         |   7 +-
 c10/core/impl/HermeticPyObjectTLS.cpp         |  23 +++
 c10/core/impl/HermeticPyObjectTLS.h           |  61 ++++++
 c10/core/impl/PyInterpreter.cpp               |   7 +
 c10/core/impl/PyInterpreter.h                 |   9 +
 test/test_torch.py                            |  30 ---
 torch/_C/__init__.pyi.in                      |   1 -
 torch/__init__.py                             |   7 +-
 torch/autograd/forward_ad.py                  |   3 +-
 torch/csrc/autograd/python_variable.cpp       |  71 ++++++-
 torch/csrc/autograd/python_variable.h         |   1 +
 torch/csrc/utils/python_dispatch.cpp          | 174 +++++++++++++++---
 torch/csrc/utils/python_dispatch.h            |   7 +-
 torch/library.h                               |  27 ++-
 torch/library.py                              |   7 +-
 20 files changed, 527 insertions(+), 106 deletions(-)
 create mode 100644 aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp
 create mode 100644 aten/src/ATen/core/PythonOpRegistrationTrampoline.h
 create mode 100644 c10/core/impl/HermeticPyObjectTLS.cpp
 create mode 100644 c10/core/impl/HermeticPyObjectTLS.h

diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp b/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp
new file mode 100644
index 0000000000000..2d9b15a6b03cb
--- /dev/null
+++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp
@@ -0,0 +1,28 @@
+#include <ATen/core/PythonOpRegistrationTrampoline.h>
+
+namespace at {
+namespace impl {
+
+// The strategy is that all python interpreters attempt to register themselves
+// as the main interpreter, but only one wins.  Only that interpreter is
+// allowed to interact with the C++ dispatcher.  Furthermore, when we execute
+// logic on that interpreter, we do so hermetically, never setting pyobj field
+// on Tensor.
+
+std::atomic<c10::impl::PyInterpreter*> PythonOpRegistrationTrampoline::interpreter_{nullptr};
+
+bool PythonOpRegistrationTrampoline::registerInterpreter(c10::impl::PyInterpreter* interp) {
+  c10::impl::PyInterpreter* expected = nullptr;
+  interpreter_.compare_exchange_strong(expected, interp);
+  if (expected != nullptr) {
+    // This is the second (or later) Python interpreter, which means we need
+    // non-trivial hermetic PyObject TLS
+    c10::impl::HermeticPyObjectTLS::init_state();
+    return false;
+  } else {
+    return true;
+  }
+}
+
+} // namespace impl
+} // namespace at
diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h
new file mode 100644
index 0000000000000..00d3c635859a3
--- /dev/null
+++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h
@@ -0,0 +1,18 @@
+#include <ATen/core/dispatch/Dispatcher.h>
+
+// TODO: this can probably live in c10
+
+namespace at {
+namespace impl {
+
+class TORCH_API PythonOpRegistrationTrampoline final {
+  static std::atomic<c10::impl::PyInterpreter*> interpreter_;
+
+public:
+  //  Returns true if you successfully registered yourself (that means
+  //  you are in the hot seat for doing the operator registrations!)
+  static bool registerInterpreter(c10::impl::PyInterpreter*);
+};
+
+} // namespace impl
+} // namespace at
diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp
index 45214a3fd20f2..8b2257605161e 100644
--- a/aten/src/ATen/core/dispatch/Dispatcher.cpp
+++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp
@@ -1,6 +1,7 @@
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <list>
 #include <sstream>
+#include <chrono>
 
 namespace c10 {
 
@@ -50,7 +51,9 @@ Dispatcher::Dispatcher()
 , operatorLookupTable_()
 , backendFallbackKernels_()
 , listeners_(std::make_unique<detail::RegistrationListenerList>())
-, mutex_() {}
+, mutex_()
+, cond_var_()
+{}
 
 Dispatcher::~Dispatcher() = default;
 
@@ -69,6 +72,41 @@ c10::optional<OperatorHandle> Dispatcher::findOp(const OperatorName& overload_na
   });
 }
 
+// NB: If you add more waitFor* implementations, you also have to add
+// appropriate notify_all() calls to the relevant register calls
+
+void Dispatcher::waitForDef(const FunctionSchema& schema) {
+  using namespace std::chrono_literals;
+  std::unique_lock<std::mutex> lock(mutex_);
+  bool r = cond_var_.wait_for(lock, 2s, [&]{
+    return findOp(schema.operator_name()) != c10::nullopt;
+  });
+  TORCH_INTERNAL_ASSERT(r,
+    "Expected main interpreter to define ", schema.operator_name(),
+    ", but this didn't happen within timeout.  Are you trying to load "
+    "different models in the same torchdeploy/multipy instance?  You "
+    "must warmup each interpreter identically, e.g., import all "
+    "the same dependencies.");
+}
+
+void Dispatcher::waitForImpl(const OperatorName& op_name, c10::optional<c10::DispatchKey> maybe_dk) {
+  using namespace std::chrono_literals;
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto dk = maybe_dk.value_or(DispatchKey::CompositeImplicitAutograd);
+  auto op = findOrRegisterName_(op_name);
+  bool r = cond_var_.wait_for(lock, 2s, [&]{
+    // NB: this is slightly unsound for overrides, but overrides are
+    // funny business anyway
+    return op.hasKernelForDispatchKey(dk);
+  });
+  TORCH_INTERNAL_ASSERT(r,
+    "Expected main interpreter to implement ", dk, " for ", op_name,
+    ", but this didn't happen within timeout.  Are you trying to load "
+    "different models in the same torchdeploy/multipy instance?  You "
+    "must warmup each interpreter identically, e.g., import all "
+    "the same dependencies.");
+}
+
 c10::optional<OperatorHandle> Dispatcher::findSchema(const OperatorName& overload_name) {
   auto it = findOp(overload_name);
   if (it.has_value()) {
@@ -175,6 +213,8 @@ RegistrationHandleRAII Dispatcher::registerDef(FunctionSchema schema, std::strin
   ++op.operatorDef_->def_count;
   ++op.operatorDef_->def_and_impl_count;
 
+  cond_var_.notify_all();
+
   return RegistrationHandleRAII([this, op, op_name] {
     deregisterDef_(op, op_name);
   });
@@ -227,6 +267,8 @@ RegistrationHandleRAII Dispatcher::registerImpl(
 
   ++op.operatorDef_->def_and_impl_count;
 
+  cond_var_.notify_all();
+
   return RegistrationHandleRAII([this, op, op_name, dispatch_key, handle] {
     deregisterImpl_(op, op_name, dispatch_key, handle);
   });
@@ -249,6 +291,7 @@ RegistrationHandleRAII Dispatcher::registerName(OperatorName op_name) {
   std::lock_guard<std::mutex> lock(mutex_);
   auto op = findOrRegisterName_(op_name);
   ++op.operatorDef_->def_and_impl_count;
+
   return RegistrationHandleRAII(
       [this, op, op_name] { deregisterName_(op, op_name); });
 }
diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h
index 2f383d589e29f..6e1c7d754d723 100644
--- a/aten/src/ATen/core/dispatch/Dispatcher.h
+++ b/aten/src/ATen/core/dispatch/Dispatcher.h
@@ -11,6 +11,7 @@
 #include <c10/util/LeftRight.h>
 #include <list>
 #include <mutex>
+#include <condition_variable>
 #include <type_traits>
 
 #include <ATen/core/grad_mode.h>
@@ -182,6 +183,9 @@ class TORCH_API Dispatcher final {
     return backendFallbackKernels_[dispatch_ix].kernel.isValid();
   }
 
+  // Used by torchdeploy/multipy for multiple interpreters racing.
+  void waitForDef(const FunctionSchema& schema);
+  void waitForImpl(const OperatorName& op_name, c10::optional<DispatchKey> dispatch_key);
 
   // ------------------------------------------------------------------------
   //
@@ -307,7 +311,23 @@ class TORCH_API Dispatcher final {
   std::array<impl::AnnotatedKernel, num_runtime_entries> backendFallbackKernels_;
 
   std::unique_ptr<detail::RegistrationListenerList> listeners_;
+
+  // This mutex protects concurrent access to the dispatcher
   std::mutex mutex_;
+
+  // This condition variable gets notified whenever we add a new def/impl to the
+  // dispatch table.  This is primarily used by multipy/torchdeploy, when
+  // we have multiple interpreters trying to register to the dispatch table.
+  // In this situation, whenever the non-primary interpreter would have tried
+  // to register to the dispatch table, instead it will check to see if the
+  // expected registration has already been made, and if it hasn't, wait on
+  // this condition variable to see if it was just racing with the primary
+  // interpreter.
+  //
+  // We expect it to be rare for there to be any waiters on this condition
+  // variable.  This is mostly just to help give better diagnostics if
+  // something goes horribly wrong
+  std::condition_variable cond_var_;
 };
 
 /**
@@ -316,6 +336,8 @@ class TORCH_API Dispatcher final {
  * to lookup a kernel for a certain set of arguments.
  */
 class TORCH_API OperatorHandle {
+  template <typename T> friend class std::hash;
+
 public:
   OperatorHandle(OperatorHandle&&) noexcept = default;
   OperatorHandle& operator=(OperatorHandle&&) noexcept = default;
@@ -411,6 +433,14 @@ class TORCH_API OperatorHandle {
     return operatorDef_->op.getPythonOp(self_interpreter, slow_accessor);
   }
 
+  bool operator==(const OperatorHandle& other) const {
+    return operatorDef_ == other.operatorDef_;
+  }
+
+  bool operator!=(const OperatorHandle& other) const {
+    return operatorDef_ != other.operatorDef_;
+  }
+
 private:
   explicit OperatorHandle(std::list<Dispatcher::OperatorDef>::iterator operatorIterator)
   : operatorDef_(&*operatorIterator), operatorIterator_(operatorIterator)  {}
@@ -695,3 +725,14 @@ inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet
 }
 
 } // namespace c10
+
+namespace std {
+
+template <>
+struct hash<c10::OperatorHandle> {
+  size_t operator()(c10::OperatorHandle op) const noexcept {
+    return std::hash<void*>{}(static_cast<void*>(op.operatorDef_));
+  }
+};
+
+} // hamespace std
diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp
index 5c9cea05ea76b..965d3f243d01c 100644
--- a/aten/src/ATen/core/library.cpp
+++ b/aten/src/ATen/core/library.cpp
@@ -89,7 +89,7 @@ Library::Library(Kind kind, std::string ns, c10::optional<c10::DispatchKey> k, c
 // merge everything
 
 #define DEF_PRELUDE "def(\"", schema.operator_name(), "\"): "
-Library& Library::_def(c10::FunctionSchema&& schema, c10::OperatorName* out_name, const std::vector<at::Tag>& tags) & {
+Library& Library::_def(c10::FunctionSchema&& schema, c10::OperatorName* out_name, const std::vector<at::Tag>& tags, _RegisterOrVerify rv) & {
   TORCH_CHECK(kind_ == DEF || kind_ == FRAGMENT,
     DEF_PRELUDE,
     "Cannot define an operator inside of a ", toString(kind_), " block.  "
@@ -125,13 +125,20 @@ Library& Library::_def(c10::FunctionSchema&& schema, c10::OperatorName* out_name
   if (out_name) {
     *out_name = schema.operator_name(); // copy!
   }
-  registrars_.emplace_back(
-    c10::Dispatcher::singleton().registerDef(
-      std::move(schema),
-      debugString(file_, line_),
-      tags
-    )
-  );
+  switch (rv) {
+    case _RegisterOrVerify::REGISTER:
+      registrars_.emplace_back(
+        c10::Dispatcher::singleton().registerDef(
+          std::move(schema),
+          debugString(file_, line_),
+          tags
+        )
+      );
+      break;
+    case _RegisterOrVerify::VERIFY:
+      c10::Dispatcher::singleton().waitForDef(schema);
+      break;
+  }
   return *this;
 }
 #undef DEF_PRELUDE
@@ -174,11 +181,10 @@ Library& Library::_def(c10::either<c10::OperatorName, c10::FunctionSchema>&& nam
 }
 
 #define IMPL_PRELUDE "impl(\"", name_str, "\", ...): "
-Library& Library::_impl(const char* name_str, CppFunction&& f) & {
+at::OperatorName Library::_parseNameForLib(const char* name_str) const {
   auto name = torch::jit::parseName(name_str);
   auto ns_opt = name.getNamespace();
-  // This is kind of similar to the checking in def(), but the error
-  // messages are a little different for this call site
+  // This is a copy paste of Library::_impl
   if (ns_opt.has_value()) {
     // See Note [Redundancy in registration code is OK]
     TORCH_CHECK(*ns_opt == *ns_,
@@ -193,6 +199,11 @@ Library& Library::_impl(const char* name_str, CppFunction&& f) & {
     bool b = name.setNamespaceIfNotSet(ns_->c_str());
     TORCH_INTERNAL_ASSERT(b, ERROR_CONTEXT);
   }
+  return name;
+}
+
+Library& Library::_impl(const char* name_str, CppFunction&& f, _RegisterOrVerify rv) & {
+  at::OperatorName name = _parseNameForLib(name_str);
   // See Note [Redundancy in registration code is OK]
   TORCH_CHECK(!(f.dispatch_key_.has_value() &&
                 dispatch_key_.has_value() &&
@@ -205,19 +216,30 @@ Library& Library::_impl(const char* name_str, CppFunction&& f) & {
     ERROR_CONTEXT
   );
   auto dispatch_key = f.dispatch_key_.has_value() ? f.dispatch_key_ : dispatch_key_;
-  registrars_.emplace_back(
-    c10::Dispatcher::singleton().registerImpl(
-      std::move(name),
-      dispatch_key,
-      std::move(f.func_),
-      // NOLINTNEXTLINE(performance-move-const-arg)
-      std::move(f.cpp_signature_),
-      std::move(f.schema_),
-      debugString(std::move(f.debug_), file_, line_)
-    )
-  );
+  switch (rv) {
+    case _RegisterOrVerify::REGISTER:
+      registrars_.emplace_back(
+        c10::Dispatcher::singleton().registerImpl(
+          std::move(name),
+          dispatch_key,
+          std::move(f.func_),
+          // NOLINTNEXTLINE(performance-move-const-arg)
+          std::move(f.cpp_signature_),
+          std::move(f.schema_),
+          debugString(std::move(f.debug_), file_, line_)
+        )
+      );
+      break;
+    case _RegisterOrVerify::VERIFY:
+      c10::Dispatcher::singleton().waitForImpl(name, dispatch_key);
+      break;
+  }
   return *this;
 }
+
+c10::OperatorName Library::_resolve(const char* name_str) const {
+  return _parseNameForLib(name_str);
+}
 #undef IMPL_PRELUDE
 
 Library& Library::_fallback(CppFunction&& f) & {
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index f110b0e9fa460..a360a65d42a37 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -9,6 +9,7 @@
 #include <c10/core/SymIntArrayRef.h>
 #include <c10/core/TensorOptions.h>
 #include <c10/core/WrapDimMinimal.h>
+#include <c10/core/impl/HermeticPyObjectTLS.h>
 #include <c10/core/impl/LocalDispatchKeySet.h>
 #include <c10/core/impl/PyInterpreter.h>
 #include <c10/core/impl/SizesAndStrides.h>
@@ -2037,7 +2038,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
       return c10::nullopt;
     } else if (interpreter == self_interpreter) {
       // NB: pyobj_ could still be null!
-      return c10::make_optional(_unchecked_untagged_pyobj());
+      if (c10::impl::HermeticPyObjectTLS::get_state()) {
+        return c10::nullopt;
+      } else {
+        return c10::make_optional(_unchecked_untagged_pyobj());
+      }
     } else {
       TORCH_CHECK(
           false,
diff --git a/c10/core/impl/HermeticPyObjectTLS.cpp b/c10/core/impl/HermeticPyObjectTLS.cpp
new file mode 100644
index 0000000000000..a7eb89430be8a
--- /dev/null
+++ b/c10/core/impl/HermeticPyObjectTLS.cpp
@@ -0,0 +1,23 @@
+#include <c10/core/impl/HermeticPyObjectTLS.h>
+
+namespace c10 {
+namespace impl {
+
+thread_local std::atomic<bool> hermeticPyObjectState{false};
+
+std::atomic<bool> HermeticPyObjectTLS::haveState_{false};
+
+void HermeticPyObjectTLS::set_state(bool state) {
+  hermeticPyObjectState = state;
+}
+
+bool HermeticPyObjectTLS::get_tls_state() {
+  return hermeticPyObjectState;
+}
+
+void HermeticPyObjectTLS::init_state() {
+  haveState_ = true;
+}
+
+} // namespace impl
+} // namespace c10
diff --git a/c10/core/impl/HermeticPyObjectTLS.h b/c10/core/impl/HermeticPyObjectTLS.h
new file mode 100644
index 0000000000000..9ecc8e761247b
--- /dev/null
+++ b/c10/core/impl/HermeticPyObjectTLS.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <atomic>
+
+namespace c10 {
+namespace impl {
+
+// This TLS controls whether or not we permanently associate PyObject
+// with Tensor the first time it is allocated.  When hermetic PyObject
+// TLS is enabled (state is true), we DO NOT save PyObjects to Tensor,
+// meaning you get a distinct PyObject whenever you execute the code in
+// question.
+struct C10_API HermeticPyObjectTLS {
+  static void set_state(bool state);
+  static bool get_state() {
+    // Hypothetical fastpath if torchdeploy/multipy isn't used.  Per
+    // https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
+    // this qualifies relaxed access because it is a single-location data
+    // structure (only the boolean here).
+    //
+    // Forgetting about data races for a moment, is there a logical race?
+    //
+    //  - Boolean only ever transitions from false to true.  So the
+    //    critical situation is when one interpreter is already running
+    //    when a second interpreter switches haveState from false to true.
+    //
+    //  - The first interpreter is indifferent whether or not it sees
+    //    hasState true/false; obviously false works (this is what the
+    //    interpreter was previously using; more directly, the interpreter
+    //    calls into itself as the handler, so being hermetic is not
+    //    required), and true simply means serviced python operator calls will
+    //    be hermetic; in these cases it is expected to be functionally
+    //    equivalent.
+    //
+    //  - The second interpreter MUST see hasState true (as its requests will
+    //    be forwarded to the first interpreter), but it is assumed that there
+    //    is a synchronization between the interpreter initialization, and
+    //    when we actually perform operations, so it is guaranteed to see
+    //    hasState true.
+    //
+    // QED.
+    //
+    // This fastpath is currently disabled so that we can more easily test that
+    // hermetic mode works correctly even on stock build of PyTorch.
+    if (false && !haveState_.load(std::memory_order_relaxed))
+      return false;
+    return get_tls_state();
+  }
+  // Call this from the multipy/torchdeploy top level
+  static void init_state();
+
+ private:
+  // This only flipped once from false to true during torchdeploy/multipy
+  // initialization, and never again.
+  static std::atomic<bool> haveState_;
+  static bool get_tls_state();
+};
+
+} // namespace impl
+} // namespace c10
diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp
index f1dd268bab806..8c29f13f3e5c3 100644
--- a/c10/core/impl/PyInterpreter.cpp
+++ b/c10/core/impl/PyInterpreter.cpp
@@ -27,6 +27,13 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
     PANIC(dispatch);
   }
 
+  void python_op_registration_trampoline(
+      const c10::OperatorHandle& op,
+      c10::DispatchKey,
+      torch::jit::Stack* stack) const override {
+    PANIC(python_op_registration_trampoline);
+  }
+
   void python_dispatcher(
       const c10::OperatorHandle& op,
       c10::DispatchKeySet,
diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h
index 90fbb8dfebf88..da5b612f093b2 100644
--- a/c10/core/impl/PyInterpreter.h
+++ b/c10/core/impl/PyInterpreter.h
@@ -141,6 +141,15 @@ struct C10_API PyInterpreterVTable {
   virtual void dispatch(const c10::OperatorHandle& op, torch::jit::Stack* stack)
       const = 0;
 
+  // This is only invoked in the multipy/torchdeploy situation from
+  // pythonOpRegistrationTrampoline; this lets us get to the Python
+  // interpreter to actually find the appropriate Python op registration
+  // entry to call.
+  virtual void python_op_registration_trampoline(
+      const c10::OperatorHandle& op,
+      c10::DispatchKey,
+      torch::jit::Stack* stack) const = 0;
+
   // Invoke the Python dispatcher to handle this call
   virtual void python_dispatcher(
       const c10::OperatorHandle& op,
diff --git a/test/test_torch.py b/test/test_torch.py
index b507f68436d45..7b91fa5d62a14 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -5673,36 +5673,6 @@ def test_unflatten(self):
                                     r"the unspecified dimension size -1 can be any value and is ambiguous"):
             torch.randn(2, 0).unflatten(1, (2, -1, 0))
 
-    def test_pytorch_library_disabled_env(self):
-        import subprocess
-        env = os.environ.copy()
-        env['PYTORCH_DISABLE_LIBRARY'] = '1'
-        try:
-            subprocess.check_output([sys.executable, '-c', 'import torch'], env=env)
-        except subprocess.CalledProcessError as e:
-            raise RuntimeError("Could not 'import torch' with PYTORCH_DISABLE_LIBRARY=0") from e
-
-    # Test that warnings generated from C++ are translated to the correct type
-    def test_warn_types(self):
-        test_cases = [
-            # function, warning type, message
-            (torch._C._warn, UserWarning, r"Test message for TORCH_WARN"),
-            (torch._C._warn_deprecation, DeprecationWarning, r"Test message for TORCH_WARN_DEPRECATION"),
-        ]
-
-        for fn, warning_type, message in test_cases:
-            with warnings.catch_warnings(record=True) as w:
-                warnings.resetwarnings()
-                warnings.filterwarnings('always', category=warning_type)
-                fn()
-
-                self.assertEqual(len(w), 1, msg=f'{warning_type} not raised')
-                warning = w[0].message
-                self.assertTrue(isinstance(warning, warning_type), msg=f'{warning_type} not raised')
-                self.assertTrue(re.search(
-                    message,
-                    str(warning)))
-
     def test_structseq_repr(self):
         a = torch.arange(250).reshape(5, 5, 10)
         expected = """
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 30416108acb49..93df86a6e35be 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1037,7 +1037,6 @@ class _DispatchModule:
     def def_name_t_t(self, name: str, dispatch: str, debug: str = "default_def_name_t_t") -> _DispatchModule: ...
     def def_schema_t_t(self, schema: str, dispatch: str, alias: str, debug: str = "default_def_schema_t_t") -> _DispatchModule: ...
     def impl_t_t(self, name: str, dispatch: str, debug: str = "impl_t_t") -> _DispatchModule: ...
-    def impl_tt_t(self, name: str, dispatch: str, debug: str = "impl_tt_t") -> _DispatchModule: ...
     def impl(self, name: str, dispatch: str, func: Callable) -> _DispatchModule: ...
     def define(self, schema: str, alias: str = "") -> _DispatchModule: ...
     def fallback_fallthrough(self, dispatch: str = "") -> _DispatchModule: ...
diff --git a/torch/__init__.py b/torch/__init__.py
index 422f143db507d..632437c04c8ca 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -1021,10 +1021,9 @@ def _register_device_module(device_type, module):
 
 # expose return_types
 from . import return_types
-if sys.executable != 'torch_deploy' and os.environ.get('PYTORCH_DISABLE_LIBRARY', "0") == "0":
-    from . import library
-    if not TYPE_CHECKING:
-        from . import _meta_registrations
+from . import library
+if not TYPE_CHECKING:
+    from . import _meta_registrations
 
 # Enable CUDA Sanitizer
 if 'TORCH_CUDA_SANITIZER' in os.environ:
diff --git a/torch/autograd/forward_ad.py b/torch/autograd/forward_ad.py
index 0a4ff26b50641..415928f5c22d3 100644
--- a/torch/autograd/forward_ad.py
+++ b/torch/autograd/forward_ad.py
@@ -86,8 +86,7 @@ def make_dual(tensor, tangent, *, level=None):
     #     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
     # Currently broken for 3.11, see https://github.com/pytorch/pytorch/issues/85506
     if (os.environ.get("PYTORCH_JIT", "1" if sys.version_info < (3, 11) else "0") == "1" and
-            __debug__ and
-            os.environ.get('PYTORCH_DISABLE_LIBRARY', "0") == "0"):
+            __debug__):
         from torch._decomp import decompositions_for_jvp  # noqa: F401
 
     if level is None:
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index d1580d1af5799..920d0e7344b58 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -1,8 +1,10 @@
 #include <ATen/NamedTensorUtils.h>
 #include <ATen/core/PythonFallbackKernel.h>
+#include <ATen/core/PythonOpRegistrationTrampoline.h>
 #include <c10/core/DeviceType.h>
 #include <c10/core/SafePyObject.h>
 #include <c10/core/impl/GPUTrace.h>
+#include <c10/core/impl/HermeticPyObjectTLS.h>
 #include <c10/core/impl/PythonDispatcherTLS.h>
 #include <c10/util/DeadlockDetection.h>
 #include <c10/util/irange.h>
@@ -31,6 +33,7 @@
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/pycfunction_helpers.h>
 #include <torch/csrc/utils/python_arg_parser.h>
+#include <torch/csrc/utils/python_dispatch.h>
 #include <torch/csrc/utils/python_numbers.h>
 #include <torch/csrc/utils/python_strings.h>
 #include <torch/csrc/utils/tensor_memoryformats.h>
@@ -219,6 +222,14 @@ struct ConcretePyInterpreterVTable final
       const c10::OperatorHandle& op,
       c10::DispatchKeySet,
       torch::jit::Stack* stack) const override;
+  // NB: this is defined in python_dispatch.cpp
+  void python_op_registration_trampoline(
+      const c10::OperatorHandle& op,
+      c10::DispatchKey key,
+      torch::jit::Stack* stack) const override {
+    torch::impl::dispatch::python_op_registration_trampoline_impl(
+        op, key, stack);
+  }
 
   bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const override;
   bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const override;
@@ -294,6 +305,10 @@ void ConcretePyInterpreterVTable::decref(PyObject* pyobj, bool is_tensor)
   // THPVariable_clear).
   // 2. We are decref-ing some other Python object. We don't do
   // PyObject resurrection on non-Tensors, so we just carry on as usual
+  if (is_tensor) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        !c10::impl::HermeticPyObjectTLS::get_state());
+  }
   if (is_tensor && Py_REFCNT(pyobj) > 1) {
     // It's still alive!  This can happen if a weak ref resurrected
     // the PyObject without flipping ownership.  At this point it is
@@ -314,7 +329,10 @@ class PyInterpreterHolder {
  public:
   PyInterpreterHolder()
       : impl_(new c10::impl::PyInterpreter(
-            ConcretePyInterpreterVTable::instance())) {}
+            ConcretePyInterpreterVTable::instance())) {
+    is_main_interpreter_ =
+        at::impl::PythonOpRegistrationTrampoline::registerInterpreter(impl_);
+  }
   // NB: intentionally leaks the PyInterpreter, as there may still be
   // references to it that are live, living in objects that aren't being
   // destructed while Python is being cleaned up.
@@ -324,9 +342,13 @@ class PyInterpreterHolder {
   c10::impl::PyInterpreter* get() const noexcept {
     return impl_;
   }
+  bool is_main_interpreter() const noexcept {
+    return is_main_interpreter_;
+  }
 
  private:
   c10::impl::PyInterpreter* impl_;
+  bool is_main_interpreter_;
 };
 PyInterpreterHolder self_interpreter;
 
@@ -352,6 +374,10 @@ c10::impl::PyInterpreter* getPyInterpreter() {
   return self_interpreter.get();
 }
 
+bool isMainPyInterpreter() {
+  return self_interpreter.is_main_interpreter();
+}
+
 std::string ConcretePyInterpreterVTable::name() const {
   std::stringstream ss;
   ss << getPyInterpreter();
@@ -416,6 +442,13 @@ PyObject* THPVariable_Wrap(at::TensorBase var) {
     Py_RETURN_NONE;
   }
 
+  if (c10::impl::HermeticPyObjectTLS::get_state()) {
+    return THPVariable_NewWithVar(
+        (PyTypeObject*)THPVariableClass,
+        std::move(var),
+        c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);
+  }
+
   c10::optional<PyObject*> mb_obj =
       var.unsafeGetTensorImpl()->check_pyobj(self_interpreter.get());
   c10::impl::PyInterpreterStatus status;
@@ -489,6 +522,11 @@ bool isResurrectable(THPVariable* self) {
     return false;
   }
   auto const& tensor = THPVariable_Unpack(self);
+  // Check if this is hermetic. If it is, no resurrection.
+  if (tensor.unsafeGetTensorImpl()->check_pyobj(self_interpreter.get()) !=
+      c10::make_optional((PyObject*)self)) {
+    return false;
+  }
   if (!tensor.defined() || tensor.use_count() <= 1) {
     return false;
   }
@@ -531,6 +569,7 @@ static bool THPVariable_tryResurrect(THPVariable* self) {
   // Flip THPVariable to be non-owning
   // (near use-after-free miss here: fresh MaybeOwned is created breaking
   // reference on Tensor in struct BEFORE we overwrite the old one)
+  TORCH_INTERNAL_ASSERT(!c10::impl::HermeticPyObjectTLS::get_state());
   self->cdata = MaybeOwned<Variable>::borrowed(tensor);
 
   // NB: At this point, tensor *could* be dead (e.g., some other C++ thread
@@ -582,7 +621,9 @@ static int THPVariable_clear(THPVariable* self) {
     //        unsafeIsBorrowed() is TRUE.  We're deallocating the PyObject
     //        because Tensor asked us to (it's already destructing).
 
-    if (!self->cdata.unsafeIsBorrowed()) {
+    if (!self->cdata.unsafeIsBorrowed() &&
+        tensor.unsafeGetTensorImpl()->check_pyobj(self_interpreter.get()) ==
+            c10::make_optional((PyObject*)self)) {
       // TODO: empirically, on OS X this assert appears to be untrue
       // In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn
       // distributed/rpc/test_process_group_agent.py
@@ -1887,11 +1928,27 @@ static PyObject* THPVariable_NewWithVar(
     auto v = (THPVariable*)obj;
     // TODO: named constructor to avoid default initialization
     new (&v->cdata) MaybeOwned<Variable>();
-    v->cdata = MaybeOwned<Variable>::owned(std::move(_var));
-    const auto& var = THPVariable_Unpack(v);
-    var.unsafeGetTensorImpl()->init_pyobj(self_interpreter.get(), obj, status);
-    if (check_has_torch_dispatch(obj)) {
-      var.unsafeGetTensorImpl()->set_python_dispatch(true);
+    if (c10::impl::HermeticPyObjectTLS::get_state()) {
+      // Do NOT initialize pyobj field on the tensor, you own the C++
+      v->cdata = MaybeOwned<Variable>::owned(std::move(_var));
+      TORCH_INTERNAL_ASSERT(
+          !check_has_torch_dispatch(obj),
+          "While HermeticPyObject was enabled, we attempted to create a tensor "
+          "subclass with __torch_dispatch__.  This violates the invariant that "
+          "operations in HermeticPyObject have equivalent C++ implementations. "
+          "If your operator registered from Python operator registration isn't "
+          "doing anything strange, there may be an internal PyTorch bug involving "
+          "not appropriately disabling TorchDispatchMode before executing "
+          "Python op registration.");
+    } else {
+      // Normal codepath
+      v->cdata = MaybeOwned<Variable>::owned(std::move(_var));
+      const auto& var = THPVariable_Unpack(v);
+      var.unsafeGetTensorImpl()->init_pyobj(
+          self_interpreter.get(), obj, status);
+      if (check_has_torch_dispatch(obj)) {
+        var.unsafeGetTensorImpl()->set_python_dispatch(true);
+      }
     }
   }
   return obj;
diff --git a/torch/csrc/autograd/python_variable.h b/torch/csrc/autograd/python_variable.h
index be0bd458197e0..8f448df06b327 100644
--- a/torch/csrc/autograd/python_variable.h
+++ b/torch/csrc/autograd/python_variable.h
@@ -69,6 +69,7 @@ inline const at::Tensor& THPVariable_Unpack(PyObject* obj) {
 }
 
 TORCH_PYTHON_API c10::impl::PyInterpreter* getPyInterpreter();
+TORCH_PYTHON_API bool isMainPyInterpreter();
 
 std::pair<py::object, py::dict> parseIValuesToPyArgsKwargs(
     const c10::OperatorHandle& op,
diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp
index 4ce1d3a1c5d94..0ff1f575a61aa 100644
--- a/torch/csrc/utils/python_dispatch.cpp
+++ b/torch/csrc/utils/python_dispatch.cpp
@@ -4,6 +4,7 @@
 #include <ATen/ATen.h>
 #include <ATen/FuncTorchTLS.h>
 #include <ATen/TensorSubclassLikeUtils.h>
+#include <ATen/core/PythonOpRegistrationTrampoline.h>
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <torch/library.h>
 
@@ -11,6 +12,7 @@
 #include <torch/csrc/autograd/python_variable.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 
+#include <c10/util/flat_hash_map.h>
 #include <pybind11/operators.h>
 #include <pybind11/stl.h>
 #include <torch/csrc/utils/pybind.h>
@@ -23,6 +25,14 @@ namespace torch {
 namespace impl {
 namespace dispatch {
 
+// NB: I'd like to index this on OperatorHandle, but I can't, as I can't
+// guarantee that the main interpreter has finish doing all registrations before
+// the other interpreters start banging on it
+static ska::flat_hash_map<
+    c10::OperatorName,
+    ska::flat_hash_map<c10::DispatchKey, std::shared_ptr<c10::SafePyObject>>>
+    python_registrations_;
+
 torch::Library::Kind parseKind(const std::string& k) {
   static std::unordered_map<std::string, torch::Library::Kind> kind_map = {
       {"DEF", torch::Library::DEF},
@@ -58,19 +68,101 @@ inline torch::CppFunction dispatch_str(const char* key, Func&& raw_f) {
   }
 }
 
+struct EnableHermeticPyObject {
+  EnableHermeticPyObject()
+      : old_(c10::impl::HermeticPyObjectTLS::get_state()),
+        old_excluded_python_(
+            c10::impl::tls_is_dispatch_key_excluded(at::DispatchKey::Python)),
+        old_python_(
+            c10::impl::tls_is_dispatch_key_included(at::DispatchKey::Python)),
+        old_python_snapshot_(c10::impl::tls_is_dispatch_key_included(
+            at::DispatchKey::PythonTLSSnapshot)) {
+    c10::impl::HermeticPyObjectTLS::set_state(true);
+    c10::impl::tls_set_dispatch_key_excluded(at::DispatchKey::Python, true);
+    c10::impl::tls_set_dispatch_key_included(at::DispatchKey::Python, false);
+    c10::impl::tls_set_dispatch_key_included(
+        at::DispatchKey::PythonTLSSnapshot, false);
+  }
+  ~EnableHermeticPyObject() {
+    c10::impl::HermeticPyObjectTLS::set_state(old_);
+    c10::impl::tls_set_dispatch_key_excluded(
+        at::DispatchKey::Python, old_excluded_python_);
+    c10::impl::tls_set_dispatch_key_included(
+        at::DispatchKey::Python, old_python_);
+    c10::impl::tls_set_dispatch_key_included(
+        at::DispatchKey::PythonTLSSnapshot, old_python_snapshot_);
+  }
+  bool old_;
+  bool old_excluded_python_;
+  bool old_python_;
+  bool old_python_snapshot_;
+};
+
 class PythonKernelHolder : public c10::OperatorKernel {
   c10::SafePyObject func_;
+  c10::DispatchKey dispatch_key_;
 
  public:
-  PythonKernelHolder(py::object func)
-      : func_(func.release().ptr(), getPyInterpreter()) {}
+  PythonKernelHolder(py::object func, c10::DispatchKey dispatch_key)
+      : func_(func.release().ptr(), getPyInterpreter()),
+        dispatch_key_(dispatch_key) {}
 
   void operator()(
       const c10::OperatorHandle& op,
       c10::DispatchKeySet keyset,
       torch::jit::Stack* stack) {
+    // Figure out if we can handle it hermetically, or if we have
+    // to double dispatch
+
+    // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch
+    const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len();
+    if (mode_stack_len > 0) {
+      const auto& cur_torch_dispatch_mode_state =
+          c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1);
+      cur_torch_dispatch_mode_state->pyinterpreter()
+          ->python_op_registration_trampoline(op, dispatch_key_, stack);
+      return;
+    }
+
+    const auto& schema = op.schema();
+    const auto num_arguments = schema.arguments().size();
+
+    // Otherwise, find a PyInterpreter on a Tensor IF if has Python key (which
+    // means it's a nontrivial tensor subclass)
+    for (const auto& ivalue : torch::jit::last(*stack, num_arguments)) {
+      if (ivalue.isTensor()) {
+        auto* interpreter = ivalue.unsafeToTensorImpl()->pyobj_interpreter();
+        if (interpreter &&
+            ivalue.unsafeToTensorImpl()->key_set().has(
+                at::DispatchKey::Python)) {
+          (*interpreter)
+              ->python_op_registration_trampoline(op, dispatch_key_, stack);
+          return;
+        }
+      } else if (ivalue.isTensorList() || ivalue.isOptionalTensorList()) {
+        // NB: use toListRef as it doesn't induce refcount bumps
+        // (toTensorListRef is not a thing)
+        for (const auto& nv : ivalue.toListRef()) {
+          if (nv.isNone()) {
+            continue;
+          }
+          auto* interpreter = nv.unsafeToTensorImpl()->pyobj_interpreter();
+          if (interpreter &&
+              nv.unsafeToTensorImpl()->key_set().has(at::DispatchKey::Python)) {
+            (*interpreter)
+                ->python_op_registration_trampoline(op, dispatch_key_, stack);
+            return;
+          }
+        }
+      }
+    }
+
+    // Nothing requires the operator to be homed to a specific interpreter, so
+    // run it on the current interpreter
+
     auto arguments = torch::jit::pop(*stack, op.schema().arguments().size());
     py::gil_scoped_acquire g;
+    EnableHermeticPyObject g2;
     auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments);
     auto obj = py::reinterpret_steal<py::object>(PyObject_Call(
         func_.ptr(getPyInterpreter()),
@@ -83,6 +175,14 @@ class PythonKernelHolder : public c10::OperatorKernel {
   }
 };
 
+torch::_RegisterOrVerify register_or_verify() {
+  if (isMainPyInterpreter()) {
+    return torch::_RegisterOrVerify::REGISTER;
+  } else {
+    return torch::_RegisterOrVerify::VERIFY;
+  }
+}
+
 void initDispatchBindings(PyObject* module) {
   auto m = py::handle(module).cast<py::module>();
 
@@ -91,9 +191,12 @@ void initDispatchBindings(PyObject* module) {
 
   // TODO: figure out how to do chaining
   py::class_<torch::Library>(m, "_DispatchModule")
+      // Some of these APIs are only for testing and do not work in multipy
+      // environment
       .def(
           "def_",
           [](py::object self, const char* schema, const char* alias) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
             self.cast<torch::Library&>().def(
                 torch::schema(schema, parseAliasAnalysisKind(alias)));
             return self;
@@ -107,6 +210,7 @@ void initDispatchBindings(PyObject* module) {
       .def(
           "def_legacy",
           [](py::object self, const char* schema) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
             self.cast<torch::Library&>().def(torch::jit::parseSchema(schema));
             return self;
           },
@@ -126,6 +230,7 @@ void initDispatchBindings(PyObject* module) {
              const char* name,
              const char* dispatch,
              const char* debug) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
             self.cast<torch::Library&>().def(
                 name, dispatch_str(dispatch, [](const at::Tensor& a) {
                         return a;
@@ -143,6 +248,7 @@ void initDispatchBindings(PyObject* module) {
              const char* dispatch,
              const char* alias,
              const char* debug) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
             self.cast<torch::Library&>().def(
                 torch::schema(schema, parseAliasAnalysisKind(alias)),
                 dispatch_str(dispatch, [](const at::Tensor& a) {
@@ -163,6 +269,7 @@ void initDispatchBindings(PyObject* module) {
              const char* name,
              const char* dispatch,
              const char* debug) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
             self.cast<torch::Library&>().impl(
                 name, dispatch_str(dispatch, [](const at::Tensor& a) {
                         return a;
@@ -173,38 +280,26 @@ void initDispatchBindings(PyObject* module) {
           py::arg("name"),
           py::arg("dispatch") = "",
           py::arg("debug") = "impl_t_t")
-      .def(
-          "impl_tt_t",
-          [](py::object self,
-             const char* name,
-             const char* dispatch,
-             const char* debug) {
-            self.cast<torch::Library&>().impl(
-                name,
-                dispatch_str(
-                    dispatch,
-                    [](const at::Tensor& a, const at::Tensor& b) { return a; })
-                    .debug(debug));
-            return self;
-          },
-          "",
-          py::arg("name"),
-          py::arg("dispatch") = "",
-          py::arg("debug") = "")
       .def(
           "impl",
           [](py::object self,
              const char* name,
-             const char* dispatch,
+             // TODO: empty string no longer works
+             c10::DispatchKey dispatch,
              py::object func) {
             HANDLE_TH_ERRORS
-            self.cast<torch::Library&>().impl(
+            auto& lib = self.cast<torch::Library&>();
+            lib.impl(
                 name,
-                dispatch_str(
+                torch::dispatch(
                     dispatch,
                     CppFunction::makeFromBoxedFunctor(
-                        std::make_unique<PythonKernelHolder>(
-                            std::move(func)))));
+                        std::make_unique<PythonKernelHolder>(func, dispatch))),
+                register_or_verify());
+            python_registrations_[lib._resolve(name)].insert_or_assign(
+                dispatch,
+                std::make_shared<c10::SafePyObject>(
+                    func.release().ptr(), getPyInterpreter()));
             END_HANDLE_TH_ERRORS_PYBIND
           },
           "",
@@ -214,8 +309,11 @@ void initDispatchBindings(PyObject* module) {
       .def(
           "define",
           [](py::object self, const char* schema, const char* alias_analysis) {
+            auto parsed_schema =
+                torch::schema(schema, parseAliasAnalysisKind(alias_analysis));
             self.cast<torch::Library&>().def(
-                torch::schema(schema, parseAliasAnalysisKind(alias_analysis)));
+                std::move(parsed_schema), {}, register_or_verify());
+            // TODO: this is dumb, had to make a second copy
             return torch::schema(schema, parseAliasAnalysisKind(alias_analysis))
                 .name();
           },
@@ -225,6 +323,7 @@ void initDispatchBindings(PyObject* module) {
       .def(
           "fallback_fallthrough",
           [](py::object self, const char* dispatch) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
             self.cast<torch::Library&>().fallback(
                 dispatch_str(dispatch, CppFunction::makeFallthrough()));
             return self;
@@ -488,6 +587,9 @@ void initDispatchBindings(PyObject* module) {
       },
       py::arg("dispatch_key") = static_cast<const char*>(""));
 
+  m.def(
+      "_dispatch_is_main_interpreter", []() { return isMainPyInterpreter(); });
+
   m.def("_are_functorch_transforms_active", []() {
     auto include_set = c10::impl::tls_local_dispatch_key_set().included_;
     return (
@@ -496,6 +598,26 @@ void initDispatchBindings(PyObject* module) {
   });
 }
 
+// TODO: dedupe with the kernel
+void python_op_registration_trampoline_impl(
+    const c10::OperatorHandle& op,
+    c10::DispatchKey key,
+    torch::jit::Stack* stack) {
+  auto arguments = torch::jit::pop(*stack, op.schema().arguments().size());
+  py::gil_scoped_acquire g;
+  auto args_kwargs = parseIValuesToPyArgsKwargs(op, arguments);
+  const auto& func = python_registrations_[op.operator_name()][key];
+  TORCH_INTERNAL_ASSERT(func != nullptr);
+  auto* pyobj = func->ptr(getPyInterpreter());
+  TORCH_INTERNAL_ASSERT(pyobj != nullptr);
+  auto obj = py::reinterpret_steal<py::object>(
+      PyObject_Call(pyobj, args_kwargs.first.ptr(), args_kwargs.second.ptr()));
+  if (!obj) {
+    throw python_error();
+  }
+  pushPyOutToStack(op, stack, obj, "PythonKernelHolder");
+}
+
 } // namespace dispatch
 } // namespace impl
 } // namespace torch
diff --git a/torch/csrc/utils/python_dispatch.h b/torch/csrc/utils/python_dispatch.h
index f05c36ac268de..d719de730551b 100644
--- a/torch/csrc/utils/python_dispatch.h
+++ b/torch/csrc/utils/python_dispatch.h
@@ -7,6 +7,11 @@ namespace dispatch {
 
 void initDispatchBindings(PyObject* module);
 
-}
+void python_op_registration_trampoline_impl(
+    const c10::OperatorHandle& op,
+    c10::DispatchKey key,
+    torch::jit::Stack* stack);
+
+} // namespace dispatch
 } // namespace impl
 } // namespace torch
diff --git a/torch/library.h b/torch/library.h
index 69175d0756622..71b190e8a3e91 100644
--- a/torch/library.h
+++ b/torch/library.h
@@ -86,6 +86,12 @@ namespace torch {
 struct NoInferSchemaTag {};
 #endif
 
+// For multipy/torchdeploy use case
+enum class _RegisterOrVerify {
+  REGISTER,
+  VERIFY
+};
+
 template <class CurClass>
 class class_;
 
@@ -591,9 +597,9 @@ class TORCH_API Library final {
   /// ```
 
   template <typename Schema>
-  Library& def(Schema&& raw_schema, const std::vector<at::Tag>& tags = {}) & {
+  Library& def(Schema&& raw_schema, const std::vector<at::Tag>& tags = {}, _RegisterOrVerify rv = _RegisterOrVerify::REGISTER) & {
     c10::FunctionSchema s = schema(std::forward<Schema>(raw_schema));
-    return _def(std::move(s), nullptr, tags);
+    return _def(std::move(s), nullptr, tags, rv);
   }
   /// Define an operator for a schema and then register an implementation for
   /// it.  This is typically what you would use if you aren't planning
@@ -644,7 +650,7 @@ class TORCH_API Library final {
   /// }
   /// ```
   template <typename Name, typename Func>
-  Library& impl(Name name, Func&& raw_f) & {
+  Library& impl(Name name, Func&& raw_f, _RegisterOrVerify rv = _RegisterOrVerify::REGISTER) & {
     // TODO: need to raise an error when you impl a function that has a
     // catch all def
 #if defined C10_MOBILE
@@ -652,7 +658,7 @@ class TORCH_API Library final {
 #else
     CppFunction f(std::forward<Func>(raw_f));
 #endif
-    return _impl(name, std::move(f));
+    return _impl(name, std::move(f), rv);
   }
 
 #if defined C10_MOBILE
@@ -673,6 +679,10 @@ class TORCH_API Library final {
   }
 #endif
 
+  // Helper for getting an OperatorName for a const char*.  You probably
+  // don't need this.
+  c10::OperatorName _resolve(const char* name) const;
+
   /// \private
   ///
   /// Convenience overload for directly specifying the dispatch key when
@@ -809,12 +819,17 @@ class TORCH_API Library final {
   Library& _def(
       c10::FunctionSchema&& schema,
       c10::OperatorName* out_name = nullptr,
-      const std::vector<at::Tag>& tags = {}) &;
+      const std::vector<at::Tag>& tags = {},
+      _RegisterOrVerify rv = _RegisterOrVerify::REGISTER
+      ) &;
   Library& _def(
       c10::either<c10::OperatorName, c10::FunctionSchema>&&,
       CppFunction&& f) &;
-  Library& _impl(const char* name, CppFunction&& f) &;
+  Library& _impl(const char* name, CppFunction&& f,
+    _RegisterOrVerify rv = _RegisterOrVerify::REGISTER) &;
   Library& _fallback(CppFunction&& f) &;
+
+  at::OperatorName _parseNameForLib(const char* name_str) const;
 };
 
 namespace detail {
diff --git a/torch/library.py b/torch/library.py
index d75427ea4c703..a4b538ddd1244 100644
--- a/torch/library.py
+++ b/torch/library.py
@@ -2,7 +2,6 @@
 from typing import Set
 import traceback
 import torch
-import os
 
 __all__ = ['Library', 'impl', 'define']
 
@@ -30,9 +29,6 @@ class Library:
         dispatch_key: PyTorch dispatch key (default: "")
     """
     def __init__(self, ns, kind, dispatch_key=""):
-        if os.environ.get('PYTORCH_DISABLE_LIBRARY', "0") == "1":
-            raise RuntimeError("Trying to use torch.library in an environment where it is disabled")
-
         if kind != "IMPL" and kind != "DEF":
             raise ValueError("Unsupported kind: ", kind)
 
@@ -126,7 +122,8 @@ def impl(self, op_name, fn, dispatch_key=''):
                     " Instead we should let the operator decompose, and ensure that we have meta kernels"
                     " for the base ops that it decomposes into.")
 
-        self.m.impl(name, dispatch_key, fn)
+        self.m.impl(name, dispatch_key if dispatch_key != "" else "CompositeImplicitAutograd", fn)
+
         _impls.add(key)
         self._op_impls.add(key)
 

From 5069f992f0faddfa52682525d89f65636ca62aa0 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 2 Nov 2022 15:10:37 +0000
Subject: [PATCH 0521/1922] Add a shortcut in Makefile for updating triton
 (#88318)

Summary: Local triton installation needs to be updated after we migrate
to a newer version of triton, e.g.
https://github.com/pytorch/pytorch/pull/88242. The Makefile shortcut
makes that easier.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88318
Approved by: https://github.com/ezyang
---
 Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile b/Makefile
index 21745f42a8873..45dfeb8cda267 100644
--- a/Makefile
+++ b/Makefile
@@ -31,3 +31,7 @@ lint:
 
 quicklint:
 	lintrunner
+
+triton:
+	$(PIP) uninstall -y triton
+	$(PIP) install -U "git+https://github.com/openai/triton@$(shell cat .github/ci_commit_pins/triton.txt)#subdirectory=python"

From 8974fd5cc479c9ab7080fe9cf2f9f5c9c4d020d7 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 18:06:05 +0000
Subject: [PATCH 0522/1922] [FSDP()] Have `fully_shard()` abide by `@contract`!
 (#88235)

We are making some progress on composability :)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88235
Approved by: https://github.com/mrshenli
---
 test/distributed/_composable/test_fully_shard.py |  8 ++++----
 torch/distributed/_composable/fully_shard.py     | 11 ++++++-----
 torch/distributed/fsdp/_common_utils.py          | 14 +++++---------
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/test/distributed/_composable/test_fully_shard.py b/test/distributed/_composable/test_fully_shard.py
index 0cf51027c29b6..27e0fb855fba7 100644
--- a/test/distributed/_composable/test_fully_shard.py
+++ b/test/distributed/_composable/test_fully_shard.py
@@ -89,7 +89,7 @@ def test_auto_wrap_policy(self):
             use_orig_params=True,
         )
         composable_module = copy.deepcopy(local_model)
-        fsdp_state = fully_shard(
+        fully_shard(
             composable_module,
             auto_wrap_policy=Model.auto_wrap_policy(),
         )
@@ -110,7 +110,7 @@ def test_auto_wrap_policy(self):
 
         # Check that the composable module has the same  `FlatParameter`
         # construction as the FSDP-wrapped model
-        composable_handles = fsdp_state._handles
+        composable_handles = fully_shard.state(composable_module)._handles
         fsdp_wrapped_handles = FSDP._fsdp_handles(fsdp_wrapped_model)
         self.assertEqual(len(composable_handles), len(fsdp_wrapped_handles))
         for (composable_handle, fsdp_wrapped_handle) in zip(
@@ -231,7 +231,7 @@ def test_training(self):
             use_orig_params=True,
         )
         composable_module = copy.deepcopy(local_model)
-        fsdp_state = fully_shard(
+        fully_shard(
             composable_module,
             auto_wrap_policy=Model.auto_wrap_policy(),
         )
@@ -251,7 +251,7 @@ def test_training(self):
                 # hook registration, currently blocked by kwarg support
                 if model is composable_module:
                     args, kwargs = _root_pre_forward(
-                        fsdp_state, composable_module, *inp
+                        fully_shard.state(composable_module), composable_module, *inp
                     )
                 else:
                     args = inp
diff --git a/torch/distributed/_composable/fully_shard.py b/torch/distributed/_composable/fully_shard.py
index 422c142061845..e95c93e09adfa 100644
--- a/torch/distributed/_composable/fully_shard.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -1,9 +1,9 @@
-from typing import Callable, cast, Iterable, Optional, Union
+from typing import Callable, Iterable, Optional, Union
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed.fsdp._common_utils import _FSDPState, ComposableFSDPState
+from torch.distributed._composable.contract import contract
 from torch.distributed.fsdp._init_utils import (
     _init_buffer_state,
     _init_core_state,
@@ -26,6 +26,7 @@
 )
 
 
+@contract
 def fully_shard(
     module: nn.Module,
     process_group: Optional[dist.ProcessGroup] = None,
@@ -37,11 +38,11 @@ def fully_shard(
     device_id: Optional[Union[int, torch.device]] = None,
     param_init_fn: Optional[Callable[[nn.Module], None]] = None,
     sync_module_states: bool = False,
-) -> ComposableFSDPState:
+) -> nn.Module:
     """
     Applies ``FullyShardedDataParallel` (FSDP) semantics to ``module``.
     """
-    state = cast(_FSDPState, ComposableFSDPState())
+    state = fully_shard.state(module)
     state = _init_ignored_module_states(state, module, ignored_modules)
     state = _init_process_group_state(state, process_group)
     limit_all_gathers = True
@@ -73,4 +74,4 @@ def fully_shard(
     modules = list(module.modules())
     _register_pre_forward_hooks(state, modules)
     _register_post_forward_hooks(state, modules)
-    return cast(ComposableFSDPState, state)
+    return module
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index 44c77150a6ae6..c93c8abb5ebd8 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -4,7 +4,7 @@
 
 import traceback
 from enum import auto, Enum
-from typing import Callable, Dict, List, no_type_check, Union
+from typing import Any, Callable, Dict, List, no_type_check, Union
 
 import torch
 import torch.distributed.fsdp.flat_param as flat_param_file
@@ -18,15 +18,11 @@
 FSDP_FLATTENED = "_fsdp_flattened"
 
 
-class ComposableFSDPState:
-    """
-    This encompasses all FSDP state for composable FSDP.
-    """
-
-
 # We leverage Python's dynamic attribute definition to unify the state
-# management for the wrapper and non-wrapper approaches.
-_FSDPState = Union[nn.Module, ComposableFSDPState]
+# management for the wrapper and non-wrapper approaches. The `Any` represents
+# the `_State` object in _composable/contract.py, but we do not import it to
+# avoid circular imports.
+_FSDPState = Union[nn.Module, Any]
 
 
 class TrainingState(Enum):

From e71357b51a58d1a136d83274a669ab3d930d90b8 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 2 Nov 2022 18:06:05 +0000
Subject: [PATCH 0523/1922] [FSDP()][Easy] Make `fully_shard()` only
 `FULL_SHARD` (#88260)

We can have a separate API for each of the other sharding strategies.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88260
Approved by: https://github.com/mrshenli
---
 torch/distributed/_composable/fully_shard.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/torch/distributed/_composable/fully_shard.py b/torch/distributed/_composable/fully_shard.py
index e95c93e09adfa..2d9e9329795bd 100644
--- a/torch/distributed/_composable/fully_shard.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -30,7 +30,6 @@
 def fully_shard(
     module: nn.Module,
     process_group: Optional[dist.ProcessGroup] = None,
-    sharding_strategy: Optional[ShardingStrategy] = None,
     mixed_precision: Optional[MixedPrecision] = None,
     cpu_offload: Optional[CPUOffload] = None,
     auto_wrap_policy: Optional[Callable] = None,
@@ -51,7 +50,7 @@ def fully_shard(
     forward_prefetch_limit = 1
     state = _init_core_state(
         state,
-        sharding_strategy,
+        ShardingStrategy.FULL_SHARD,
         mixed_precision,
         cpu_offload,
         limit_all_gathers,

From 5e4c1a08dbd4b173d1e03d575610dcbdd9fdef10 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@fb.com>
Date: Thu, 3 Nov 2022 15:15:57 +0000
Subject: [PATCH 0524/1922] Add support for neg to NestedTensor (#88131)

Partially fixes #86889

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88131
Approved by: https://github.com/drisspg
---
 aten/src/ATen/native/native_functions.yaml    |  2 ++
 .../native/nested/NestedTensorUnaryOps.cpp    | 12 +++++++
 docs/source/nested.rst                        |  1 +
 test/test_nestedtensor.py                     | 34 +++++++++++--------
 4 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 2f757965f4e75..3af39c542918f 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4247,6 +4247,7 @@
   dispatch:
     SparseCPU, SparseCUDA: neg_sparse
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
   tags: canonical
 
 - func: neg_(Tensor(a!) self) -> Tensor(a!)
@@ -4256,6 +4257,7 @@
   dispatch:
     SparseCPU, SparseCUDA: neg_sparse_
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_
+    NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_
 
 - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
diff --git a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
index 74289a1372e12..6be7239775ea6 100644
--- a/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorUnaryOps.cpp
@@ -58,5 +58,17 @@ Tensor NestedTensor_tanh(const Tensor& self) {
   return map_nt(self, at::tanh);
 }
 
+Tensor& NestedTensor_neg_(Tensor& self) {
+  auto self_ptr = get_nested_tensor_impl(self);
+  check_numel_equals_buffer_size(self_ptr);
+  auto buffer = self_ptr->get_buffer();
+  at::neg_(buffer);
+  return self;
+}
+
+Tensor NestedTensor_neg(const Tensor& self) {
+  return map_nt(self, at::neg);
+}
+
 } // namespace native
 } // namespace at
diff --git a/docs/source/nested.rst b/docs/source/nested.rst
index 21ff980256911..07712e0376f16 100644
--- a/docs/source/nested.rst
+++ b/docs/source/nested.rst
@@ -196,6 +196,7 @@ NestedTensor and any constraints they have.
    :func:`torch.nn.Dropout`; "Behavior is the same as on regular tensors."
    :func:`torch.relu`; "Behavior is the same as on regular tensors."
    :func:`torch.gelu`; "Behavior is the same as on regular tensors."
+   :func:`torch.neg`; "Behavior is the same as on regular tensors."
    :func:`torch.add`; "Supports elementwise addition of two nested tensors.
    Supports addition of a scalar to a nested tensor."
    :func:`torch.mul`; "Supports elementwise multiplication of two nested tensors.
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index 84a30e0125e49..f914fa57dd9a6 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -20,6 +20,7 @@
     parametrize,
     run_tests,
     TestCase,
+    subtest,
 )
 
 # Tests are ported from pytorch/nestedtensor.
@@ -304,20 +305,6 @@ def test_repr_string(self):
         self.assertEqual(str(a), expected)
         self.assertEqual(repr(a), expected)
 
-    @torch.inference_mode()
-    def test_activations(self):
-        for func in (torch.nn.functional.relu,
-                     torch.nn.functional.relu_,
-                     torch.nn.functional.gelu,
-                     torch._C._nn.gelu_,
-                     torch.tanh,
-                     torch.tanh_):
-            t = torch.tensor([-1, 0, 1], dtype=torch.float)
-            nt = torch.nested.nested_tensor([t])
-            nested_result = func(nt)
-            self.assertTrue(nested_result.is_nested)
-            self.assertEqual(func(t), nested_result.unbind()[0])
-
     def test_to_padded_tensor_on_empty_tensor(self):
 
         nt = torch.nested.nested_tensor([])
@@ -762,6 +749,24 @@ def test_nested_tensor_indexing(self, device, dtype):
         expected_grad = torch.nested.nested_tensor([grad_x0, torch.zeros((3, 4), device=device, dtype=dtype)])
         self.assertEqual(nt.grad, expected_grad)
 
+    @parametrize("func", [subtest(torch.nn.functional.relu, name='relu'),
+                          subtest(torch.nn.functional.relu_, name='relu_'),
+                          subtest(torch.nn.functional.gelu, name='gelu'),
+                          subtest(torch._C._nn.gelu_, name='gelu_'),
+                          subtest(torch.tanh, name='tanh'),
+                          subtest(torch.tanh_, name='tanh_'),
+                          subtest(torch.neg, name='neg')])
+    def test_activations(self, device, func):
+        nt, nt_noncontiguous = random_nt_noncontiguous_pair((2, 3, 6, 7), device=device, dtype=torch.float32)
+        nested_result = func(nt)
+        self.assertTrue(nested_result.is_nested)
+        for t, t_res in zip(nt.unbind(), nested_result.unbind()):
+            self.assertEqual(func(t), t_res)
+        self.assertRaisesRegex(
+            RuntimeError,
+            "NestedTensor must be contiguous to get buffer.",
+            lambda: func(nt_noncontiguous))
+
     @dtypes(*floating_types_and_half())
     def test_nested_tensor_chunk(self, device, dtype):
         # Transformer use case
@@ -912,7 +917,6 @@ def test_nested_tensor_div(self, device, dtype):
             RuntimeError, "div requires offsets to match when given NestedTensors",
             lambda: nt_chunks[0] / nt_chunks[1])
 
-
     @dtypes(torch.float, torch.float16)
     @skipMeta
     @torch.inference_mode()

From 26a13c3cf59a503fef23bbe0923cd0c20b6895df Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Thu, 3 Nov 2022 15:58:18 +0000
Subject: [PATCH 0525/1922] =?UTF-8?q?reduce=20the=20number=20of=20autotuni?=
 =?UTF-8?q?ng=20iterations,=20don't=20autotune=20simple=20til=E2=80=A6=20(?=
 =?UTF-8?q?#88386)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ed copies

Partially fixes https://github.com/pytorch/torchdynamo/issues/1807, reduces compile time for me from 360 s to 90s.

Kernels with multiple outputs sometimes autotune to unexpected configs, so I'm limiting the heuristic to relatively safe application.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88386
Approved by: https://github.com/jansel
---
 torch/_inductor/codegen/triton.py      |  9 ++++++++-
 torch/_inductor/ir.py                  |  5 +++++
 torch/_inductor/triton_ops/autotune.py | 10 +++++-----
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 6264c54f84ab7..58ac425a95e8a 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -981,6 +981,7 @@ def codegen_kernel(self, name=None):
                     import triton
                     import triton.language as tl
                     from {config.inductor_import}.ir import ReductionHint
+                    from {config.inductor_import}.ir import TileHint
                     from {config.inductor_import}.triton_ops.autotune import {heuristics}
                     from {config.inductor_import}.utils import instance_descriptor
                 """
@@ -1021,8 +1022,14 @@ def codegen_kernel(self, name=None):
                 @triton.jit
             """
         else:
+            tile_hint = ""
+            if len(size_hints) == 2:
+                if len(signature) == 4:  # input, output and 2 args
+                    tile_hint = "tile_hint=TileHint.SQUARE,"
+                else:
+                    tile_hint = "tile_hint=TileHint.DEFAULT,"
             heuristics_line = f"""
-                @{heuristics}(size_hints={size_hints!r}, filename=__file__, meta={triton_meta!r})
+                @{heuristics}(size_hints={size_hints!r}, {tile_hint}filename=__file__, meta={triton_meta!r})
                 @triton.jit
             """
         code.splice(heuristics_line)
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 8ca869df03602..de4b0cefa1b80 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -380,6 +380,11 @@ class ReductionHint(Enum):
     DEFAULT = 3
 
 
+class TileHint(Enum):
+    SQUARE = 0
+    DEFAULT = 1
+
+
 @dataclasses.dataclass
 class Reduction(Loops):
     reduction_ranges: List[Expr]
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index b6f1c5cbabe3a..a0bc76569c338 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -11,7 +11,7 @@
 import torch
 
 from .. import config
-from ..ir import ReductionHint
+from ..ir import ReductionHint, TileHint
 from ..triton_ops.mm_perf_model import estimate_matmul_time
 from ..utils import conditional_product, dynamo_utils, has_triton
 from .conv_perf_model import (
@@ -134,7 +134,7 @@ def kernel_call():
 
         from triton.testing import do_bench
 
-        return do_bench(kernel_call)
+        return do_bench(kernel_call, rep=40)
 
     @dynamo_utils.dynamo_timed
     def autotune_to_one_config(self, *args, **kwargs):
@@ -377,15 +377,15 @@ def triton_config_tiled_reduction(size_hints, x, y, r, num_stages=2):
     return Config(cfg, num_warps=num_warps, num_stages=num_stages)
 
 
-def pointwise(size_hints, meta, filename=None):
+def pointwise(size_hints, meta, tile_hint=None, filename=None):
     """
     Construct @triton.heuristics() based on size_hints.
     """
     if len(size_hints) == 1:
         return cached_autotune([triton_config(size_hints, 1024)], meta=meta)
     if len(size_hints) == 2:
-        if not config.triton.autotune:
-            return cached_autotune([triton_config(size_hints, 64, 64)], meta=meta)
+        if not config.triton.autotune or tile_hint == TileHint.SQUARE:
+            return cached_autotune([triton_config(size_hints, 32, 32)], meta=meta)
         return cached_autotune(
             [
                 triton_config(size_hints, 32, 32),

From c5da0479a804a469d62bfc2ac8c3e6527d87147e Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Thu, 3 Nov 2022 16:20:14 +0000
Subject: [PATCH 0526/1922] better error message for out= ops (#88367)

In cases where a tensor kwarg is actually "out=", the following error message would look nicer than this :
```
Traceback (most recent call last):
  File "/fsx/users/binbao/pytorch/torch/_inductor/graph.py", line 241, in call_function
    out = lowerings[target](*args, **kwargs)
  File "/fsx/users/binbao/pytorch/torch/_inductor/lowering.py", line 168, in wrapped
    assert not any(isinstance(x, TensorBox) for x in kwargs.values())
AssertionError

```

https://github.com/pytorch/torchdynamo/issues/1798

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88367
Approved by: https://github.com/desertfire
---
 torch/_inductor/lowering.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index a05e6d527ea9a..2fe86f8b6501b 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -164,6 +164,11 @@ def wrapped(*args, **kwargs):
             args = args[0]
         # Only look at args that are Tensors
         indices = [i for i, x in enumerate(args) if isinstance(x, TensorBox)]
+
+        # explicitly assert for "out=" ops for better error messages
+        assert not any(
+            x == "out" for x in kwargs.keys()
+        ), "out= ops aren't yet supported"
         # kwargs tensors not supported yet
         assert not any(isinstance(x, TensorBox) for x in kwargs.values())
 

From 85f641d101748e41342912cdb278ba63ee02cf3e Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Thu, 3 Nov 2022 16:21:15 +0000
Subject: [PATCH 0527/1922] add an exclude for test_constructor for inductor
 (#88143)

This test (https://github.com/pytorch/torchdynamo/issues/1800) fails since none of the c-tor ops support `pin_memory=True`. Natalia suggests it's not a priority to fix.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88143
Approved by: https://github.com/desertfire
---
 test/test_torch.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_torch.py b/test/test_torch.py
index 7b91fa5d62a14..05c168210d937 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -32,7 +32,7 @@
 from torch.testing._internal.common_utils import (
     TestCase, TEST_WITH_ROCM, run_tests,
     IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN,
-    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, slowTest,
+    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, skipIfTorchInductor, slowTest,
     TEST_WITH_CROSSREF, skipIfTorchDynamo,
     skipCUDAMemoryLeakCheckIf, BytesIOContext,
     skipIfRocm, skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName,
@@ -4648,6 +4648,7 @@ def compare_strides(s1, s2, div):
 
     @onlyCUDA
     @unittest.skipIf(PYTORCH_CUDA_MEMCHECK, "is_pinned uses failure to detect pointer property")
+    @skipIfTorchInductor("pin_memory isn't yet supported in TorchInductor")
     def test_pin_memory_from_constructor(self, device):
         def _get_like(t, **kwargs):
             return [
@@ -6793,6 +6794,7 @@ def test_new(self) -> None:
         self.assertRaises(RuntimeError, lambda: x.new(z.storage()))
 
     @unittest.skipIf(PYTORCH_CUDA_MEMCHECK, "is_pinned uses failure to detect pointer property")
+    @skipIfTorchInductor("pin_memory isn't yet supported in TorchInductor")
     def test_pin_memory(self):
         x = torch.randn(3, 5)
         self.assertFalse(x.is_pinned())

From 72cfa5d1d721c369309eab92e99091bf032c51ff Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Thu, 3 Nov 2022 16:52:37 +0000
Subject: [PATCH 0528/1922] disable the out variants in test_cumprod test for
 inductor (#88328)

`out=` variants aren't supported by autograd and it's not a must fix, so disabling the test (https://github.com/pytorch/torchdynamo/issues/1798) for now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88328
Approved by: https://github.com/desertfire
---
 test/test_torch.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/test_torch.py b/test/test_torch.py
index 05c168210d937..7fdcf3f235399 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -30,7 +30,7 @@
 from torch import multiprocessing as mp
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
-    TestCase, TEST_WITH_ROCM, run_tests,
+    TEST_WITH_TORCHINDUCTOR, TestCase, TEST_WITH_ROCM, run_tests,
     IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN,
     IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, skipIfTorchInductor, slowTest,
     TEST_WITH_CROSSREF, skipIfTorchDynamo,
@@ -2312,8 +2312,9 @@ def test_cumprod(self, device):
         x = torch.rand(100, 100, device=device)
         res1 = torch.cumprod(x, 1)
         res2 = torch.tensor([]).to(device)
-        torch.cumprod(x, 1, out=res2)
-        self.assertEqual(res1, res2)
+        if not TEST_WITH_TORCHINDUCTOR:
+            torch.cumprod(x, 1, out=res2)
+            self.assertEqual(res1, res2)
         x.cumprod_(1)
         self.assertEqual(res1, x)
 

From 10e77814fc4e291a1229ef3885345e9c060bfa27 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 3 Nov 2022 16:53:01 +0000
Subject: [PATCH 0529/1922] Revert "Fix primTorch
 compute_elementwise_output_strides (#88175)"

This reverts commit 1c8a0656d65412b83d3c00f2fc66ab958e991de8.

Reverted https://github.com/pytorch/pytorch/pull/88175 on behalf of https://github.com/huydhn due to Sorry for reverting your PR but it breaks cuda 11.6 in trunk. As the PR signal was green, this is probably a landrace
---
 test/inductor/test_torchinductor_opinfo.py |  8 +++++
 test/test_meta.py                          | 35 ++++++++++++++++++++++
 torch/_prims_common/__init__.py            | 24 ++++++---------
 3 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index d28aafb467843..93e5412716296 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -205,6 +205,8 @@ def process(device_type):
     "index_put": {f16, f32, f64},
     "index_reduce": {f16, f32, f64},
     "istft": {f32, f64},
+    "linalg.cholesky": {f32, f64},
+    "linalg.cholesky_ex": {f32, f64},
     "linalg.eig": {f32, f64},
     "linalg.eigh": {f32, f64},
     "linalg.eigvals": {f32, f64},
@@ -244,6 +246,7 @@ def process(device_type):
     "normal": {f16, f32, f64},
     "normal.number_mean": {f16, f32, f64},
     "pca_lowrank": {f32, f64},
+    "pinverse": {f32, f64},
     "polar": {f32, f64},
     "quantile": {f32, f64},
     "rand_like": {f16, f32, f64},
@@ -254,6 +257,7 @@ def process(device_type):
     "scatter_reduce.sum": {f16},
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
+    "segment_reduce.offsets": {f16, f32, f64},
     "sgn": {f16, f32, f64},
     "sparse.sampled_addmm": {f32, f64},
     "stft": {f32, f64},
@@ -312,6 +316,8 @@ def process(device_type):
     "index_put": {f16, f32, f64},
     "index_reduce": {f16, f32, f64},
     "istft": {f32, f64},
+    "linalg.cholesky": {f32, f64},
+    "linalg.cholesky_ex": {f32, f64},
     "linalg.eig": {f32, f64},
     "linalg.eigh": {f32, f64},
     "linalg.eigvals": {f32, f64},
@@ -341,6 +347,7 @@ def process(device_type):
     "normal": {f16, f32, f64},
     "normal.number_mean": {f16, f32, f64},
     "pca_lowrank": {f32, f64},
+    "pinverse": {f32, f64},
     "polar": {f32, f64},
     "pow": {i32, i64},
     "rand_like": {f16, f32, f64},
@@ -350,6 +357,7 @@ def process(device_type):
     "round.decimals_3": {f16},
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
+    "segment_reduce.offsets": {f16, f32, f64},
     "sgn": {f16, f32, f64},
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
diff --git a/test/test_meta.py b/test/test_meta.py
index ef25d184c8428..88644a6552b1b 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -295,20 +295,55 @@ def test_tensor_outlives_converter(self):
     aten._fft_r2c.default,
     aten._linalg_svd.default,
     aten._scaled_dot_product_attention_forward.default,
+    aten.add.Tensor,
+    aten.atan2.default,
     aten.binary_cross_entropy.default,
+    aten.bitwise_and.Tensor,
+    aten.bitwise_left_shift.Tensor,
+    aten.bitwise_or.Tensor,
+    aten.bitwise_right_shift.Tensor,
+    aten.bitwise_xor.Tensor,
+    aten.clamp_max.Tensor,
+    aten.clamp_min.Tensor,
     aten.complex.default,
     aten.copysign.Tensor,
     aten.div.Tensor_mode,
+    aten.div.Tensor,
+    aten.eq.Tensor,
     aten.floor_divide.default,
+    aten.fmax.default,
+    aten.fmin.default,
+    aten.fmod.Tensor,
+    aten.gcd.default,
+    aten.ge.Tensor,
+    aten.gt.Tensor,
     aten.heaviside.default,
+    aten.hypot.default,
+    aten.igamma.default,
+    aten.igammac.default,
+    aten.lcm.default,
+    aten.le.Tensor,
     aten.lerp.Scalar,
     aten.lerp.Tensor,
     aten.logical_and.default,
     aten.logical_or.default,
     aten.logical_xor.default,
+    aten.lt.Tensor,
+    aten.maximum.default,
+    aten.minimum.default,
+    aten.mul.Tensor,
+    aten.ne.Tensor,
+    aten.nextafter.default,
     aten.pow.Scalar,
+    aten.pow.Tensor_Scalar,
+    aten.pow.Tensor_Tensor,
     aten.prelu.default,
+    aten.remainder.Tensor,
+    aten.rsub.Tensor,
     aten.special_xlog1py.default,
+    aten.special_zeta.default,
+    aten.sub.Tensor,
+    aten.where.self,
     aten.xlogy.Tensor,
 
     # channel_last and channel_last_3d related failures
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 90777ed6601aa..ee4dd38a655c6 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -360,7 +360,7 @@ def compute_elementwise_output_strides(*tensors) -> Tuple[int, ...]:
 
     shape = tensors[0].shape
 
-    def should_swap(idx_a, idx_b):
+    def _cmp(idx_a, idx_b):
         for tensor in tensors:
             stride_a = tensor.stride()[idx_a]
             stride_b = tensor.stride()[idx_b]
@@ -378,30 +378,24 @@ def should_swap(idx_a, idx_b):
             if shape[idx_a] > shape[idx_b]:
                 return 1
 
+            # NOTE: this case is missing in the C++ impl
+            if shape[idx_a] < shape[idx_b]:
+                return -1
+
         # Note: this case is hit if all strides are zero,
         # or all strides are equal and all dimensions have the same length
         return 0
 
-    perm = list(reversed(range(ndim)))
-
-    # insertion sort with support for ambiguous comparisons
-    for i in range(1, ndim):
-        dim1 = i
-        for dim0 in reversed(range(i)):
-            comparison = should_swap(perm[dim0], perm[dim1])
-            if comparison > 0:
-                perm[dim0], perm[dim1] = perm[dim1], perm[dim0]
-                dim1 = dim0
-            elif comparison < 0:
-                break
+    perm = tuple(range(ndim))
+    perm = tuple(sorted(perm, key=cmp_to_key(_cmp), reverse=True))
 
     permuted_shape = [-1] * ndim
-    for idx, x in enumerate(reversed(perm)):
+    for idx, x in enumerate(perm):
         permuted_shape[idx] = shape[x]
 
     new_strides = make_contiguous_strides_for(permuted_shape)
     permuted_strides = [-1] * ndim
-    for idx, x in enumerate(reversed(perm)):
+    for idx, x in enumerate(perm):
         permuted_strides[x] = new_strides[idx]
 
     return tuple(permuted_strides)

From 86465e6daf4b04ea7e96cd00675e188a09a413e3 Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mergennachin@gmail.com>
Date: Wed, 2 Nov 2022 14:13:20 -0700
Subject: [PATCH 0530/1922] Disallow module attribute mutation (#88354)

Summary:

See https://github.com/pytorch/torchdynamo/issues/1475

Not allowing any new mutations happen inside forward() function during
export.

Test Plan:

Run `python test/dynamo/test_export.py` and make sure it passes

Added new unit tests (3 positive tests and 4 negative tests)

Here's what the actual error looks like

```
  File "/home/mnachin/local/miniconda3/envs/pytorch/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 322, in step
    getattr(self, inst.opname)(inst)
  File "/home/mnachin/local/miniconda3/envs/pytorch/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 835, in STORE_ATTR
    assert not self.export, f"Mutating module attribute {inst.argval} during export."
AssertionError: Mutating module attribute a during export.

from user code:
   File "/data/users/mnachin/pytorch/test/dynamo/test_export_mutations.py", line 25, in forward
    self.a = self.a.to(torch.float64)

Set torch._dynamo.config.verbose=True for more information
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88354
Approved by: https://github.com/tugsbayasgalan, https://github.com/jansel
---
 test/dynamo/test_export_mutations.py | 119 +++++++++++++++++++++++++++
 torch/_dynamo/symbolic_convert.py    |   8 ++
 2 files changed, 127 insertions(+)
 create mode 100644 test/dynamo/test_export_mutations.py

diff --git a/test/dynamo/test_export_mutations.py b/test/dynamo/test_export_mutations.py
new file mode 100644
index 0000000000000..f0689aff5028f
--- /dev/null
+++ b/test/dynamo/test_export_mutations.py
@@ -0,0 +1,119 @@
+# Owner(s): ["module: dynamo"]
+import torch
+import torch._dynamo.test_case
+import torch._dynamo.testing
+
+
+class MutationExportTests(torch._dynamo.test_case.TestCase):
+    def check_failure_on_export(self, mod, *args):
+        with self.assertRaises(AssertionError):
+            torch._dynamo.export(mod, *args)
+
+    def check_same_with_export(self, mod, arg):
+        real_result = mod(arg)
+        graph, _ = torch._dynamo.export(mod, arg)
+        result = graph(arg)
+        self.assertTrue(torch._dynamo.utils.same(result, real_result))
+
+    def test_module_attribute_mutation_violation_positive_1(self):
+        # Mutating attribute with a Tensor type
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = torch.Tensor(3, 2)
+
+            def forward(self, x):
+                self.a = self.a.to(torch.float64)
+                return x.sum() + self.a.sum()
+
+        self.check_failure_on_export(Foo(), torch.Tensor(3, 2))
+
+    def test_module_attribute_mutation_violation_positive_2(self):
+        # Mutating attribute with a scalar type
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = 2
+
+            def forward(self, x):
+                self.a = self.a * 3
+                return x.sum() + self.a
+
+        self.check_failure_on_export(Foo(), torch.Tensor(3, 2))
+
+    def test_module_attribute_mutation_violation_positive_3(self):
+        # Setting a new attribute inside forward()
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = torch.Tensor(3, 2)
+
+            def forward(self, x):
+                self.b = 2
+                return x.sum() + self.a.sum() + self.b
+
+        self.check_failure_on_export(Foo(), torch.Tensor(3, 2))
+
+    def test_module_attribute_mutation_violation_negative_1(self):
+        # Mutating attribute with a Tensor type inside __init__ but
+        # not in forward()
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = torch.Tensor(3, 2)
+
+            def forward(self, x):
+                return x.sum() + self.a.to(torch.float64).sum()
+
+        self.check_same_with_export(Foo(), torch.Tensor(3, 2))
+
+    def test_module_attribute_mutation_violation_negative_2(self):
+        # Mutating attribute with a Tensor type inside __init__ twice
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = torch.Tensor(3, 2)
+                self.a = self.a.to(torch.float64)
+
+            def forward(self, x):
+                return x.sum() + self.a.sum()
+
+        self.check_same_with_export(Foo(), torch.Tensor(3, 2))
+
+    def test_module_attribute_mutation_violation_negative_3(self):
+        # Mutating local variable inside forward()
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = torch.Tensor(3, 2)
+
+            def forward(self, x):
+                b = 1
+                b = b * 5
+                return x.sum() + self.a.sum() + b
+
+        self.check_same_with_export(Foo(), torch.Tensor(3, 2))
+
+    def test_module_attribute_mutation_violation_negative_4(self):
+        # Mutating attribute with a Tensor type
+        # But not exporting but using eager mode as well as dynamo optimize mode
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = torch.Tensor(3, 2)
+
+            def forward(self, x):
+                self.a = self.a.to(torch.float64)
+                return x.sum() + self.a.sum()
+
+        mod = Foo()
+        arg = torch.Tensor(3, 2)
+        real_result = mod(arg)
+        opt_mod = torch._dynamo.optimize("eager", nopython=True)(mod)
+        self.assertTrue(torch._dynamo.utils.same(opt_mod(arg), real_result))
+
+
+if __name__ == "__main__":
+    from torch._dynamo.test_case import run_tests
+
+    run_tests()
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index ad431cf9d54fc..84373c3920986 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -828,6 +828,14 @@ def LOAD_ATTR(self, inst):
     def STORE_ATTR(self, inst):
         prior = self.copy_graphstate()
         val, obj = self.popn(2)
+
+        if isinstance(obj, NNModuleVariable):
+            # We don't allow side effects during export
+            # https://github.com/pytorch/torchdynamo/issues/1475
+            assert (
+                not self.export
+            ), f"Mutating module attribute {inst.argval} during export."
+
         try:
             self.output.guards.update(
                 BuiltinVariable(setattr)

From 9ed6e42f538a18f7ed0cbfc95351b3b53d399a4d Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Thu, 3 Nov 2022 17:05:50 +0000
Subject: [PATCH 0531/1922] [dynamo] Support compare op for
 userfunctionvariable (#88372)

Helps reduce graph breaks for one of the training models

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88372
Approved by: https://github.com/jansel
---
 test/dynamo/test_repros.py        | 23 +++++++++++++++++++++++
 torch/_dynamo/symbolic_convert.py |  8 ++++++++
 2 files changed, 31 insertions(+)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index cda52384f1ff2..f669a6c8c68e9 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -30,6 +30,16 @@
     HAS_REFS = False
 
 
+_orig_module_call = torch.nn.Module.__call__
+
+
+def is_fx_tracing_test() -> bool:
+    """
+    Copied from the hpc trainer codebase
+    """
+    return torch.nn.Module.__call__ is not _orig_module_call
+
+
 def ifdyn(count1, count2):
     if torch._dynamo.config.dynamic_shapes:
         return count1
@@ -1754,6 +1764,19 @@ def forward(self, inp):
         args = (torch.randn(3, 4),)
         self.assertTrue(same(mod(*args), opt_mod(*args)))
 
+    def test_is_symbolic_tracing(self):
+        # Ensure no graph break here
+        def fn(x):
+            if is_fx_tracing_test():
+                return x * 2
+            return x * 4
+
+        a = torch.randn(4)
+        ref = fn(a)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(a)
+        self.assertTrue(same(ref, res))
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 84373c3920986..b644b9ee439e0 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -745,6 +745,14 @@ def COMPARE_OP(self, inst):
             self.push(right.call_method(self, "__contains__", [left], {}))
             if op == "not in":
                 self.UNARY_NOT(inst)
+        elif (
+            isinstance(left, UserFunctionVariable)
+            and isinstance(right, UserFunctionVariable)
+            and op in supported_is_const
+        ):
+            self.push(
+                ConstantVariable(supported_is_const[op](left.fn, right.fn), **options)
+            )
         else:
             unimplemented(f"COMPARE_OP {typestr(left)} {op} {typestr(right)}")
 

From 2fd3ee1025c90b658edec2d7ef69c0841160a019 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Sun, 30 Oct 2022 22:23:51 -0400
Subject: [PATCH 0532/1922] add parameters check for mkldnn_transpose (#85318)

This PR is about add parameters check for mkldnn_transpose, fixed https://github.com/pytorch/pytorch/issues/85216.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85318
Approved by: https://github.com/jgong5, https://github.com/mingfeima, https://github.com/leslie-fang-intel
---
 aten/src/ATen/native/mkldnn/TensorShape.cpp | 6 +++++-
 test/test_mkldnn.py                         | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mkldnn/TensorShape.cpp b/aten/src/ATen/native/mkldnn/TensorShape.cpp
index fbf1e96bf14da..1e54aae9d6601 100644
--- a/aten/src/ATen/native/mkldnn/TensorShape.cpp
+++ b/aten/src/ATen/native/mkldnn/TensorShape.cpp
@@ -1,7 +1,8 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-#include <ATen/core/Tensor.h>
 #include <ATen/Config.h>
 #include <ATen/InferSize.h>
+#include <ATen/WrapDimUtils.h>
+#include <ATen/core/Tensor.h>
 #include <c10/core/SymIntArrayRef.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -78,6 +79,9 @@ Tensor mkldnn_clone(const Tensor& self, c10::optional<c10::MemoryFormat> optiona
 }
 
 Tensor mkldnn_transpose(const Tensor& self, int64_t dim0, int64_t dim1) {
+  auto ndims = self.dim();
+  dim0 = maybe_wrap_dim(dim0, ndims);
+  dim1 = maybe_wrap_dim(dim1, ndims);
   const ideep::tensor& x = itensor_from_mkldnn(self);
   ideep::tensor y;
   std::vector<int> axes(x.ndims());
diff --git a/test/test_mkldnn.py b/test/test_mkldnn.py
index 04a213b1a13df..f4f427ba659c7 100644
--- a/test/test_mkldnn.py
+++ b/test/test_mkldnn.py
@@ -1060,6 +1060,11 @@ def test_transpose(self):
                     x.to_mkldnn().transpose(dim1, dim2).to_dense(),
                 )
 
+    def test_transpose_invalid_dime(self):
+        x = torch.randn(3, 4, 5, dtype=torch.float32).to_mkldnn()
+        with self.assertRaisesRegex(IndexError, "Dimension out of range"):
+            torch._mkldnn_transpose(x, 0, 12)
+
     def test_linear_non_contiguous_weight(self):
         in_features = torch.randint(3, 10, (1,)).item()
         out_features = torch.randint(3, 100, (1,)).item()

From 507646caa4bb8bea904eac7acc525cd1811b4c05 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Thu, 3 Nov 2022 17:41:48 +0000
Subject: [PATCH 0533/1922] [ONNX] Default runtime type checking to raising
 errors (#86555)

Default runtime type checking to raise by changing the default value to  `GLOBALS.runtime_type_check_state` into ERRORS
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86555
Approved by: https://github.com/BowenBao
---
 torch/onnx/_globals.py            | 6 +++---
 torch/onnx/_internal/_beartype.py | 6 ------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/torch/onnx/_globals.py b/torch/onnx/_globals.py
index 61d9d10b2e374..1831130d764c5 100644
--- a/torch/onnx/_globals.py
+++ b/torch/onnx/_globals.py
@@ -33,9 +33,9 @@ def __init__(self):
         self.onnx_shape_inference: bool = True
 
         # Internal feature flags
-        if os.getenv("TORCH_ONNX_EXPERIMENTAL_RUNTIME_TYPE_CHECK") == "ERRORS":
+        if os.getenv("TORCH_ONNX_EXPERIMENTAL_RUNTIME_TYPE_CHECK") == "WARNINGS":
             self.runtime_type_check_state = (
-                _exporter_states.RuntimeTypeCheckState.ERRORS
+                _exporter_states.RuntimeTypeCheckState.WARNINGS
             )
         elif os.getenv("TORCH_ONNX_EXPERIMENTAL_RUNTIME_TYPE_CHECK") == "DISABLED":
             self.runtime_type_check_state = (
@@ -43,7 +43,7 @@ def __init__(self):
             )
         else:
             self.runtime_type_check_state = (
-                _exporter_states.RuntimeTypeCheckState.WARNINGS
+                _exporter_states.RuntimeTypeCheckState.ERRORS
             )
 
     @property
diff --git a/torch/onnx/_internal/_beartype.py b/torch/onnx/_internal/_beartype.py
index 7ad494984fc91..ba98dcf0f5936 100644
--- a/torch/onnx/_internal/_beartype.py
+++ b/torch/onnx/_internal/_beartype.py
@@ -45,12 +45,6 @@ def _create_beartype_decorator(
         return _no_op_decorator
     if _beartype_lib is None:
         # If the beartype library is not installed, return a no-op decorator
-        if runtime_check_state == _exporter_states.RuntimeTypeCheckState.ERRORS:
-            warnings.warn(
-                "TORCH_ONNX_EXPERIMENTAL_RUNTIME_TYPE_CHECK is set to 'ERRORS', "
-                "but the beartype library is not installed. "
-                "Install beartype with `pip install beartype` to enable runtime type checking."
-            )
         return _no_op_decorator
 
     assert isinstance(_beartype_lib, ModuleType)

From af6ab60a1f9e93fa0d2fb71419a70d26f874d4f5 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Thu, 3 Nov 2022 17:59:05 +0000
Subject: [PATCH 0534/1922] Verbose exc printing fix (#88387)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88387
Approved by: https://github.com/tugsbayasgalan
---
 torch/_dynamo/convert_frame.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index ce478456301c9..0e7b2c5073b4d 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -213,6 +213,8 @@ def format_error_msg(exc, code, record_filename=None, frame=None):
                     stack_above_dynamo + list(reversed(exc.real_stack))
                 )
             )
+            msg += "\n"
+            msg += "=" * 10
 
     else:
         msg = f"WON'T CONVERT {code.co_name} {code.co_filename}\
@@ -222,14 +224,21 @@ def format_error_msg(exc, code, record_filename=None, frame=None):
 
 
 def augment_exc_message(exc, msg="\n"):
-    if hasattr(exc, "real_stack") and len(exc.real_stack) > 0 and not config.verbose:
-        msg += f"\nfrom user code:\n {''.join(traceback.format_list([exc.real_stack[-1]]))}"
+    if (
+        hasattr(exc, "real_stack")
+        and len(exc.real_stack) > 0
+        and not (config.verbose and config.suppress_errors)
+    ):
+        msg += f"\nfrom user code:\n {''.join(traceback.format_list(reversed(exc.real_stack[0:2])))}"
 
     if config.replay_record_enabled and hasattr(exc, "record_filename"):
         msg += f"\nLast frame execution written to {exc.record_filename}. To run only this frame while debugging, run\
  {config.dynamo_import}.replay('{exc.record_filename}').\n"
 
-    msg += f"\nSet {config.dynamo_import}.config.verbose=True for more information\n"
+    if not config.verbose:
+        msg += (
+            f"\nSet {config.dynamo_import}.config.verbose=True for more information\n"
+        )
 
     if hasattr(exc, "inner_exception") and hasattr(
         exc.inner_exception, "minifier_path"
@@ -246,7 +255,6 @@ def augment_exc_message(exc, msg="\n"):
             "    torchdynamo.config.suppress_errors = True\n"
         )
 
-    msg += "=" * 10
     old_msg = "" if len(exc.args) == 0 else exc.args[0]
     new_msg = old_msg + msg
     exc.args = (new_msg,) + exc.args[1:]

From a65c71ea5935f08323fb1d58ddafbf6c2a777eed Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 3 Nov 2022 18:03:36 +0000
Subject: [PATCH 0535/1922] [Dynamo][Easy] Fix config.suppress_errors error log
 (#88402)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88402
Approved by: https://github.com/williamwen42
---
 torch/_dynamo/convert_frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index 0e7b2c5073b4d..eb16a84d04f2e 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -252,7 +252,7 @@ def augment_exc_message(exc, msg="\n"):
         msg += (
             "\n\n"
             "You can suppress this exception and fall back to eager by setting:\n"
-            "    torchdynamo.config.suppress_errors = True\n"
+            "    torch._dynamo.config.suppress_errors = True\n"
         )
 
     old_msg = "" if len(exc.args) == 0 else exc.args[0]

From 0d757ddde9256b555c7e7600a149e84ab08cc36b Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Thu, 3 Nov 2022 19:07:03 +0000
Subject: [PATCH 0536/1922] [dashboard] Replace aot_nvfuser with
 nvprims_nvfuser (#88437)

@IvanYashchuk @ngimel

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88437
Approved by: https://github.com/soumith
---
 benchmarks/dynamo/runner.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index ec54a16a24cd8..95c40ac8760e3 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -63,7 +63,8 @@
         "eager": "--training --backend=eager ",
         "aot_eager": "--training --backend=aot_eager ",
         "aot_cudagraphs": "--training --backend=aot_cudagraphs ",
-        "aot_nvfuser": "--training --nvfuser --backend=aot_nvfuser ",
+        "aot_nvfuser": "--training --nvfuser --backend=aot_ts_nvfuser ",
+        "nvprims_nvfuser": "--training --backend=nvprims_nvfuser ",
         "inductor": "--training --inductor ",
         "inductor_no_cudagraphs": "--training --inductor --disable-cudagraphs ",
     },
@@ -84,7 +85,7 @@
         "eager",
         "aot_eager",
         "aot_cudagraphs",
-        "aot_nvfuser",
+        "nvprims_nvfuser",
         "inductor",
         "inductor_no_cudagraphs",
     ],

From a695dd4947bb5545597599ccd43bef92ac8828f3 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Thu, 3 Nov 2022 19:28:33 +0000
Subject: [PATCH 0537/1922] [minor] use set_default_dtype instead of try and
 finally (#88295)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88295
Approved by: https://github.com/mruberry
---
 test/test_ops.py            |  9 +++------
 test/test_prims.py          | 12 ++++--------
 test/test_transformers.py   | 12 ++----------
 test/test_type_promotion.py | 11 ++++-------
 4 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index b18013a3bae55..d0aa0906784dc 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -26,6 +26,7 @@
     IS_SANDCASTLE,
     clone_input_helper,
     IS_CI,
+    set_default_dtype,
     suppress_warnings,
     noncontiguous_like,
     TEST_WITH_ASAN,
@@ -160,16 +161,12 @@ def test_multiple_devices(self, devices, dtype, op):
     @suppress_warnings
     @ops(_ref_test_ops, allowed_dtypes=(torch.float64, torch.long, torch.complex128))
     def test_numpy_ref(self, device, dtype, op):
-        try:
-            # Sets the default dtype to NumPy's default dtype of double
-            cur_default = torch.get_default_dtype()
-            torch.set_default_dtype(torch.double)
+        # Sets the default dtype to NumPy's default dtype of double
+        with set_default_dtype(torch.double):
             for sample_input in op.reference_inputs(device, dtype):
                 self.compare_with_reference(
                     op, op.ref, sample_input, exact_dtype=(dtype is not torch.long)
                 )
-        finally:
-            torch.set_default_dtype(cur_default)
 
     # Tests that the cpu and gpu results are consistent
     @onlyCUDA
diff --git a/test/test_prims.py b/test/test_prims.py
index b6833352d0cfd..23e7c47b023a4 100644
--- a/test/test_prims.py
+++ b/test/test_prims.py
@@ -8,7 +8,8 @@
 
 import torch
 from torch.testing import make_tensor
-from torch.testing._internal.common_utils import parametrize, run_tests, TestCase, TEST_SCIPY, skipCUDAMemoryLeakCheckIf
+from torch.testing._internal.common_utils import (parametrize, run_tests, TestCase, TEST_SCIPY,
+                                                  set_default_dtype, skipCUDAMemoryLeakCheckIf)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
     onlyCUDA,
@@ -130,11 +131,8 @@ def test_cbrt_prim(self, device, dtype):
         batches = [(), (1,), (2,), (0, 1), (1, 1), (2, 2)]
         shapes = [(), (0,), (1,), (5,)]
 
-        try:
-            # Sets the default dtype to NumPy's default dtype of double
-            cur_default = torch.get_default_dtype()
-            torch.set_default_dtype(torch.double)
-
+        # Sets the default dtype to NumPy's default dtype of double
+        with set_default_dtype(torch.double):
             # Tested here, as this OP is not currently exposed or tested in ATen
             for b, s in product(batches, shapes):
                 x = make_arg(b + s)
@@ -144,8 +142,6 @@ def test_cbrt_prim(self, device, dtype):
                 y_np = scipy.special.cbrt(x_np)
 
                 self.assertEqual(y, y_np, exact_device=False)
-        finally:
-            torch.set_default_dtype(cur_default)
 
     @onlyCUDA
     @skipCUDAIfRocm
diff --git a/test/test_transformers.py b/test/test_transformers.py
index 4657cf730b2c3..656191c9ddda7 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -20,22 +20,14 @@
     TEST_WITH_CROSSREF,
     TEST_WITH_ROCM,
     IS_WINDOWS,
-    slowTest
+    slowTest,
+    set_default_dtype
 )
 from torch.testing._internal.common_cuda import TEST_CUDA, SM80OrLater
 
 if TEST_FAIRSEQ:
     import fairseq.models.transformer as fairseq_transformer
 
-@contextlib.contextmanager
-def set_default_dtype(dtype):
-    saved_dtype = torch.get_default_dtype()
-    torch.set_default_dtype(dtype)
-    try:
-        yield
-    finally:
-        torch.set_default_dtype(saved_dtype)
-
 class TestTransformers(NNTestCase):
     _do_cuda_memory_leak_check = True
     _do_cuda_non_default_stream = True
diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py
index a881c36075e3c..b351f2d6d494a 100644
--- a/test/test_type_promotion.py
+++ b/test/test_type_promotion.py
@@ -7,7 +7,8 @@
 import torch
 
 from torch.testing._internal.common_utils import (TestCase, run_tests, load_tests, make_tensor,
-                                                  TEST_NUMPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype_dict)
+                                                  TEST_NUMPY, set_default_dtype, torch_to_numpy_dtype_dict,
+                                                  numpy_to_torch_dtype_dict)
 from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyNativeDeviceTypes,
                                                         dtypes, onlyCPU, expectedFailureMeta, skipMeta)
 from torch.testing._internal.common_dtype import (
@@ -30,14 +31,10 @@
 def float_double_default_dtype(fn):
     @wraps(fn)
     def wrapped_fn(*args, **kwargs):
-        cur_dtype = torch.get_default_dtype()
-        try:
-            torch.set_default_dtype(torch.float)
+        with set_default_dtype(torch.float):
             fn(*args, **kwargs)
-            torch.set_default_dtype(torch.double)
+        with set_default_dtype(torch.double):
             fn(*args, **kwargs)
-        finally:
-            torch.set_default_dtype(cur_dtype)
 
     return wrapped_fn
 

From 276e3decd24a20896e42790f117dba2e44b2a893 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 2 Nov 2022 08:55:02 -0700
Subject: [PATCH 0538/1922] [Pytorch][vulkan] Generate shader with parameters
 (#88322)

Parametsr such as tile size and weight type and format is embedded within the
shader code. This is used to generate ShaderInfo.

For now we will maintain both ShaderSrc and ShaderInfo so as to transition from
VK_KERNEL to VK_SHADER incremental. Otherwise we will have to switch multiple
of them at the same time.

Differential Revision: [D40280338](https://our.internmc.facebook.com/intern/diff/D40280338/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88322
Approved by: https://github.com/jmdetloff, https://github.com/mcr229
---
 aten/src/ATen/native/vulkan/api/Common.h      |  6 +++
 aten/src/ATen/native/vulkan/api/Shader.cpp    | 15 +++++++
 aten/src/ATen/native/vulkan/api/Shader.h      | 17 ++++++++
 aten/src/ATen/native/vulkan/glsl/conv2d.glsl  |  5 +++
 .../ATen/native/vulkan/glsl/conv2d_dw.glsl    |  6 +++
 .../native/vulkan/glsl/conv2d_pw_2x2.glsl     |  5 +++
 .../native/vulkan/glsl/conv_transpose2d.glsl  |  5 +++
 .../native/vulkan/glsl/quantized_conv2d.glsl  |  5 +++
 .../vulkan/glsl/quantized_conv2d_dw.glsl      |  6 +++
 .../vulkan/glsl/quantized_conv2d_pw_2x2.glsl  |  5 +++
 .../ATen/native/vulkan/ops/Convolution.cpp    | 39 +++++++++----------
 11 files changed, 93 insertions(+), 21 deletions(-)

diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h
index 738592408f6f8..e52f2fca74c89 100644
--- a/aten/src/ATen/native/vulkan/api/Common.h
+++ b/aten/src/ATen/native/vulkan/api/Common.h
@@ -21,6 +21,12 @@
     CONCAT_LITERALS(vulkan., name), name##_spv, name##_spv_len, \
         name##_spv_layout                                       \
   }
+#define VK_SHADER(name)                                         \
+  ::at::native::vulkan::api::ShaderInfo {                       \
+    CONCAT_LITERALS(vulkan., name), name##_spv, name##_spv_len, \
+        name##_spv_layout, name##_spv_tile_size,                \
+        name##_spv_weight_storage_type,                         \
+  }
 #endif /* USE_VULKAN_SHADERC_RUNTIME */
 
 /*
diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp
index 2ead82bc934e2..8c3e924f417bb 100644
--- a/aten/src/ATen/native/vulkan/api/Shader.cpp
+++ b/aten/src/ATen/native/vulkan/api/Shader.cpp
@@ -50,6 +50,21 @@ ShaderSource::ShaderSource(
       kernel_name{std::move(name)},
       kernel_layout{layout} {}
 
+ShaderInfo::ShaderInfo(
+    std::string name,
+    const uint32_t* const spirv_bin,
+    const uint32_t size,
+    const std::vector<VkDescriptorType>& layout,
+    const std::vector<uint32_t>& tile_size,
+    const StorageType weight_storage_type)
+    : shader_src(name, spirv_bin, size, layout),
+      tile_size(tile_size),
+      weight_storage_type(weight_storage_type) {
+  for (uint64_t i = 0; i < tile_size.size(); ++i) {
+    shader_src.out_tile_size.data[i] = tile_size[i];
+  }
+}
+
 bool operator==(const ShaderSource& _1, const ShaderSource& _2) {
   if (_1.type != _2.type) {
     return false;
diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h
index 12cc5c193d123..9b8acb91e51ba 100644
--- a/aten/src/ATen/native/vulkan/api/Shader.h
+++ b/aten/src/ATen/native/vulkan/api/Shader.h
@@ -3,6 +3,7 @@
 #ifdef USE_VULKAN_API
 
 #include <ATen/native/vulkan/api/Common.h>
+#include <ATen/native/vulkan/api/Types.h>
 #include <ATen/native/vulkan/api/Utils.h>
 #include <c10/util/flat_hash_map.h>
 #include <c10/util/hash.h>
@@ -74,6 +75,22 @@ struct ShaderSource final {
 
 bool operator==(const ShaderSource& _1, const ShaderSource& _2);
 
+struct ShaderInfo final {
+  ShaderSource shader_src;
+  c10::SmallVector<uint32_t, 4> tile_size;
+  StorageType weight_storage_type{StorageType::UNKNOWN};
+
+  explicit ShaderInfo() = default;
+  explicit ShaderInfo(std::string, const char*);
+  explicit ShaderInfo(
+      std::string,
+      const uint32_t*,
+      const uint32_t,
+      const std::vector<VkDescriptorType>&,
+      const std::vector<uint32_t>& tile_size,
+      const StorageType weight_storage_type);
+};
+
 class ShaderModule final {
  public:
   explicit ShaderModule(const VkDevice device, const ShaderSource& source);
diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl
index 4afae20127e80..412522aa388f6 100644
--- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl
@@ -2,6 +2,11 @@
 #define PRECISION $precision
 #define FORMAT $format
 
+/*
+ * TILE_SIZE = (1, 1, 1)
+ * WEIGHT_STORAGE = TEXTURE_2D
+ */
+
 layout(std430) buffer;
 
 /*
diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl
index 671b6410e61df..045097f548a41 100644
--- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl
@@ -2,6 +2,12 @@
 #define PRECISION $precision
 #define FORMAT $format
 
+/*
+ * TILE_SIZE = (1, 1, 1)
+ * WEIGHT_STORAGE = TEXTURE_2D
+ * Note that for DW kernel IC = 1 so the weight layout is really OC4, H, W, 4oc
+ */
+
 layout(std430) buffer;
 
 /*
diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw_2x2.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw_2x2.glsl
index b497f41587ff5..fe85f3ff92c42 100644
--- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw_2x2.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw_2x2.glsl
@@ -2,6 +2,11 @@
 #define PRECISION $precision
 #define FORMAT $format
 
+/*
+ * TILE_SIZE = (2, 2, 1)
+ * WEIGHT_STORAGE = TEXTURE_3D
+ */
+
 layout(std430) buffer;
 
 /*
diff --git a/aten/src/ATen/native/vulkan/glsl/conv_transpose2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv_transpose2d.glsl
index ba9fbbd8df363..d9589ac90becf 100644
--- a/aten/src/ATen/native/vulkan/glsl/conv_transpose2d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/conv_transpose2d.glsl
@@ -2,6 +2,11 @@
 #define PRECISION $precision
 #define FORMAT $format
 
+/*
+ * TILE_SIZE = (1, 1, 1)
+ * WEIGHT_STORAGE = TEXTURE_2D
+ */
+
 layout(std430) buffer;
 
 /* Qualifiers: layout - storage - precision - memory */
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl
index a53078b8b269f..57017cbd6e784 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl
@@ -2,6 +2,11 @@
 #define PRECISION $precision
 #define FORMAT $format
 
+/*
+ * TILE_SIZE = (1, 1, 1)
+ * WEIGHT_STORAGE = TEXTURE_3D
+ */
+
 layout(std430) buffer;
 
 /* Qualifiers: layout - storage - precision - memory */
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl
index d842ab97bcc8d..5441a34274c85 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl
@@ -2,6 +2,12 @@
 #define PRECISION $precision
 #define FORMAT $format
 
+/*
+ * TILE_SIZE = (1, 1, 1)
+ * WEIGHT_STORAGE = TEXTURE_3D
+ * Note that for DW kernel IC = 1 so the weight layout is really OC4, H, W, 4oc
+ */
+
 layout(std430) buffer;
 
 /* Qualifiers: layout - storage - precision - memory */
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl
index 21e6d1a607f19..4522cf0a7c3d0 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl
@@ -2,6 +2,11 @@
 #define PRECISION $precision
 #define FORMAT $format
 
+/*
+ * TILE_SIZE = (2, 2, 1)
+ * WEIGHT_STORAGE = TEXTURE_3D
+ */
+
 /*
  * Output Image
  */
diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
index 8431ccac5ef3e..9a34169c4a99a 100644
--- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
@@ -287,7 +287,7 @@ static api::ShaderSource get_shader(
     const Conv2dMethod method,
     const bool transposed,
     const bool quantized) {
-  api::ShaderSource shader;
+  api::ShaderInfo shader;
 
   if (quantized) {
     if (transposed) {
@@ -296,39 +296,36 @@ static api::ShaderSource get_shader(
 
     switch (method) {
       case Conv2dSlidingWindow:
-        shader = VK_KERNEL(quantized_conv2d);
-        return shader;
+        shader = VK_SHADER(quantized_conv2d);
+        break;
       case Conv2dDepthwise:
-        shader = VK_KERNEL(quantized_conv2d_dw);
-        return shader;
+        shader = VK_SHADER(quantized_conv2d_dw);
+        break;
       case Conv2dPointwise:
-        shader = VK_KERNEL(quantized_conv2d_pw_2x2);
-        // Set explicitly for now. In the future, this will be set automatically
-        // by shader codegen.
-        shader.out_tile_size = {2u, 2u, 1u};
-        return shader;
+        shader = VK_SHADER(quantized_conv2d_pw_2x2);
+        break;
+        // todo fail for quantized transposed conv
     }
+    return shader.shader_src;
   }
 
   if (transposed) {
-    shader = VK_KERNEL(conv_transpose2d);
-    return shader;
+    shader = VK_SHADER(conv_transpose2d);
+    return shader.shader_src;
   }
 
   switch (method) {
     case Conv2dSlidingWindow:
-      shader = VK_KERNEL(conv2d);
-      return shader;
+      shader = VK_SHADER(conv2d);
+      break;
     case Conv2dDepthwise:
-      shader = VK_KERNEL(conv2d_dw);
-      return shader;
+      shader = VK_SHADER(conv2d_dw);
+      break;
     case Conv2dPointwise:
-      shader = VK_KERNEL(conv2d_pw_2x2);
-      // Set explicitly for now. In the future, this will be set automatically
-      // by shader codegen.
-      shader.out_tile_size = {2u, 2u, 1u};
-      return shader;
+      shader = VK_SHADER(conv2d_pw_2x2);
+      break;
   }
+  return shader.shader_src;
 }
 
 //

From e4e0183908849ab66644986d996e525a94aed6fd Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Thu, 3 Nov 2022 03:37:23 +0000
Subject: [PATCH 0539/1922] Dont clone inputs if using fake tensor (#88208)

Not sure that this will really reduce memory use but it is an extraneous copy in our stack right now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88208
Approved by: https://github.com/anijain2305
---
 torch/_dynamo/optimizations/analysis.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index b7557a82d744a..0af70bfa9581d 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -121,10 +121,6 @@ def has_mutation(gm, example_inputs, inputs_only=False):
     true, we only check for mutation of inputs"""
     # TODO - moco gives bad accuracy with Aliasing. gm is getting mutated in a bad way.
 
-    # Clone the inputs such that intermediate tensors (not leaf tensors) with
-    # requires_grad to True are now converted to False to avoid Runtime Error
-    # like "leaf variable that requires grad is inplace modified"
-    example_inputs = clone_inputs(example_inputs)
     if fake_tensors_available and config.fake_tensor_propagation:
         with FakeTensorMode() as fake_mode:
             pass
@@ -134,6 +130,10 @@ def has_mutation(gm, example_inputs, inputs_only=False):
         with fake_mode.restore() if hasattr(fake_mode, "restore") else fake_mode:
             ShapeAliasingAndMutationProp(new_gm).run(*example_inputs)
     else:
+        # Clone the inputs such that intermediate tensors (not leaf tensors) with
+        # requires_grad to True are now converted to False to avoid Runtime Error
+        # like "leaf variable that requires grad is inplace modified"
+        example_inputs = clone_inputs(example_inputs)
         new_gm = copy.deepcopy(gm)
         example_inputs = copy.deepcopy(example_inputs)
         ShapeAliasingAndMutationProp(new_gm).run(*example_inputs)

From 818a9415d4a1e4c102e8d4157fdaa09f7d885aed Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 2 Nov 2022 08:55:08 -0700
Subject: [PATCH 0540/1922] [PyTorch][Vulkan] Add template based codegen for
 shader generation (#88323)

We would like to be able to parameterize kernels such that a parameterized
algorithm can be implemented via templates. We can then profile performance of
a kernel with different parameter values. This enables us to determine what
parameters may work the best for a given kernel or a given device.

In this diff one such kernel added in 1x1 conv which parameters across size of
the tile being produced by each invocation.

Few other options for parameters can be:
- One can imagine dtype can also be a parameter such that we can do compute in
fp16 or int8/int16.
- Register blocking for input channels

Differential Revision: [D40280336](https://our.internmc.facebook.com/intern/diff/D40280336/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D40280336/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88323
Approved by: https://github.com/jmdetloff
---
 .../conv2d_pw.glslt}                          |  51 ++++----
 .../glsl/templates/conv2d_pw_params.yaml      |   7 ++
 tools/BUCK.bzl                                |  26 ++++
 tools/gen_vulkan_glsl.py                      | 111 ++++++++++++++++++
 tools/gen_vulkan_spv.py                       |  30 +++++
 tools/test/test_vulkan_codegen.py             | 100 ++++++++++++++++
 6 files changed, 301 insertions(+), 24 deletions(-)
 rename aten/src/ATen/native/vulkan/glsl/{conv2d_pw_2x2.glsl => templates/conv2d_pw.glslt} (79%)
 create mode 100644 aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw_params.yaml
 create mode 100644 tools/gen_vulkan_glsl.py
 create mode 100644 tools/test/test_vulkan_codegen.py

diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw_2x2.glsl b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw.glslt
similarity index 79%
rename from aten/src/ATen/native/vulkan/glsl/conv2d_pw_2x2.glsl
rename to aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw.glslt
index fe85f3ff92c42..191848419ed6a 100644
--- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw_2x2.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw.glslt
@@ -1,10 +1,7 @@
-#version 450 core
-#define PRECISION $precision
-#define FORMAT $format
-
 /*
- * TILE_SIZE = (2, 2, 1)
- * WEIGHT_STORAGE = TEXTURE_3D
+ * TILE_SIZE = ($TILE_SIZE_X, $TILE_SIZE_Y, 1)
+ * WEIGHT_STORAGE = TEXTURE_2D
+ * WEIGHT_STORAGE_LAYOUT = OC4,IC4,4ic,4oc
  */
 
 layout(std430) buffer;
@@ -54,17 +51,19 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 void main() {
   const ivec3 gpos = ivec3(gl_GlobalInvocationID);
 
-  // Determine the output positions that will be written to.
+  // Output position for TILE_SIZE_X, TILE_SIZE_Y = 2, 2
   // +--------+--------+
   // | pos[0] | pos[1] |
   // +--------+--------+
   // | pos[2] | pos[3] |
   // +--------+--------+
-  ivec3 pos[4];
-  pos[0] = ivec3(gpos.x * 2, gpos.y * 2, gpos.z);
-  pos[1] = ivec3(gpos.x * 2 + 1, gpos.y * 2, gpos.z);
-  pos[2] = ivec3(gpos.x * 2, gpos.y * 2 + 1, gpos.z);
-  pos[3] = ivec3(gpos.x * 2 + 1, gpos.y * 2 + 1, gpos.z);
+  ivec3 pos[$TILE_SIZE_X * $TILE_SIZE_Y];
+  for (int y = 0, i = 0; y < $TILE_SIZE_Y; ++y) {
+    for (int x = 0; x < $TILE_SIZE_X; ++x) {
+      pos[i] = ivec3(gpos.x * $TILE_SIZE_X + x, gpos.y * $TILE_SIZE_Y + y, gpos.z);
+      i++;
+    }
+  }
 
   // If the top left position is out of bounds, then this invocation will have
   // no work to do.
@@ -75,14 +74,14 @@ void main() {
   // Compute the index of the input texture that needs to be loaded for each
   // output position. Note that negative indices can be produced indicating that
   // the top-left element is in a region added by padding.
-  ivec2 ipos[4];
-  for (int i = 0; i < 4; ++i) {
+  ivec2 ipos[$TILE_SIZE_X * $TILE_SIZE_Y];
+  for (int i = 0; i < $TILE_SIZE_X * $TILE_SIZE_Y; ++i) {
     ipos[i] = pos[i].xy * uBlock.stride - uBlock.padding;
   }
 
-  vec4 sum[4];
+  vec4 sum[$TILE_SIZE_X * $TILE_SIZE_Y];
   sum[0] = texelFetch(uBias, ivec2(gpos.z, 0), 0);
-  for (int i = 1; i < 4; ++i) {
+  for (int i = 1; i < $TILE_SIZE_X * $TILE_SIZE_Y; ++i) {
     sum[i] = sum[0];
   }
 
@@ -92,13 +91,18 @@ void main() {
     // During prepacking, the weight tensor has been permuted so that the
     // channel (IC) dim is along the x axis, and the batch (OC) dim is along
     // the z axis.
+    vec4 in_tex[$TILE_SIZE_X * $TILE_SIZE_Y];
     const vec4 ktex_0 = texelFetch(uKernel, ivec2(z + 0, gpos.z), 0);
     const vec4 ktex_1 = texelFetch(uKernel, ivec2(z + 1, gpos.z), 0);
     const vec4 ktex_2 = texelFetch(uKernel, ivec2(z + 2, gpos.z), 0);
     const vec4 ktex_3 = texelFetch(uKernel, ivec2(z + 3, gpos.z), 0);
 
-    for (int i = 0; i < 4; ++i) {
-      const vec4 in_tex = texelFetch(uInput, ivec3(ipos[i], z4), 0);
+    for (int i = 0; i < $TILE_SIZE_Y * $TILE_SIZE_X; ++i) {
+      in_tex[i] = texelFetch(uInput, ivec3(ipos[i], z4), 0);
+    }
+
+    for (int i = 0; i < $TILE_SIZE_Y * $TILE_SIZE_X; ++i) {
+      // For 2x2 tile size algorithm works as follows.
       // To explain the calculations below, the contents one in_tex and the
       // group of 4 texels loaded from uKernel are shown:
       //
@@ -131,15 +135,14 @@ void main() {
       //
       //  which is what is expressed in the following calculations. This is done
       //  for each output position.
-
-      sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]);
-      sum[i] = fma(in_tex.yyyy, ktex_1, sum[i]);
-      sum[i] = fma(in_tex.zzzz, ktex_2, sum[i]);
-      sum[i] = fma(in_tex.wwww, ktex_3, sum[i]);
+      sum[i] = fma(in_tex[i].xxxx, ktex_0, sum[i]);
+      sum[i] = fma(in_tex[i].yyyy, ktex_1, sum[i]);
+      sum[i] = fma(in_tex[i].zzzz, ktex_2, sum[i]);
+      sum[i] = fma(in_tex[i].wwww, ktex_3, sum[i]);
     }
   }
 
-  for (int i = 0; i < 4; ++i) {
+  for (int i = 0; i < $TILE_SIZE_Y * $TILE_SIZE_X; ++i) {
     if (all(lessThan(pos[i], uBlock.out_extents.xyz))) {
       imageStore(
           uOutput,
diff --git a/aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw_params.yaml b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw_params.yaml
new file mode 100644
index 0000000000000..fef8f20f4e733
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw_params.yaml
@@ -0,0 +1,7 @@
+conv2d_pw:
+  parameter_names_with_default_values:
+      TILE_SIZE_X: 2
+      TILE_SIZE_Y: 2
+  parameter_values:
+    - TILE_SIZE_X: 1
+      TILE_SIZE_Y: 1
diff --git a/tools/BUCK.bzl b/tools/BUCK.bzl
index e61ab02e48a26..6d16e8fe3ff8d 100644
--- a/tools/BUCK.bzl
+++ b/tools/BUCK.bzl
@@ -211,6 +211,18 @@ def define_tools_targets(
             "gen_vulkan_spv.py",
         ],
         base_module = "",
+        deps = [
+            torchgen_deps,
+            ":gen_aten_vulkan_glsl_lib",
+        ],
+    )
+
+    python_library(
+        name = "gen_aten_vulkan_glsl_lib",
+        srcs = [
+            "gen_vulkan_glsl.py",
+        ],
+        base_module = "tools",
         deps = [
             torchgen_deps,
         ],
@@ -223,6 +235,20 @@ def define_tools_targets(
             "PUBLIC",
         ],
         deps = [
+            ":gen_aten_vulkan_glsl_lib",
+            ":gen_aten_vulkan_spv_lib",
+        ],
+    )
+
+    python_test(
+        name = "vulkan_codegen_test",
+        srcs = [
+            "test/test_vulkan_codegen.py",
+        ],
+        contacts = contacts,
+        visibility = ["PUBLIC"],
+        deps = [
+            ":gen_aten_vulkan_glsl_lib",
             ":gen_aten_vulkan_spv_lib",
         ],
     )
diff --git a/tools/gen_vulkan_glsl.py b/tools/gen_vulkan_glsl.py
new file mode 100644
index 0000000000000..bf6f16dff25fb
--- /dev/null
+++ b/tools/gen_vulkan_glsl.py
@@ -0,0 +1,111 @@
+import copy
+import os
+
+import yaml
+
+from torchgen.code_template import CodeTemplate
+from yaml.constructor import ConstructorError
+from yaml.nodes import MappingNode
+
+try:
+    from yaml import CLoader as Loader
+except ImportError:
+    from yaml import Loader  # type: ignore[misc]
+
+# https://gist.github.com/pypt/94d747fe5180851196eb
+class UniqueKeyLoader(Loader):
+    def construct_mapping(self, node, deep=False):  # type: ignore[no-untyped-def]
+        if not isinstance(node, MappingNode):
+            raise ConstructorError(
+                None,
+                None,
+                "expected a mapping node, but found %s" % node.id,
+                node.start_mark,
+            )
+        mapping = {}
+        for key_node, value_node in node.value:
+            key = self.construct_object(key_node, deep=deep)  # type: ignore[no-untyped-call]
+            try:
+                hash(key)
+            except TypeError:
+                raise ConstructorError(
+                    "while constructing a mapping",
+                    node.start_mark,
+                    "found unacceptable key ",
+                    key_node.start_mark,
+                )
+            # check for duplicate keys
+            if key in mapping:
+                raise ConstructorError(
+                    "while constructing a mapping",
+                    node.start_mark,
+                    "found duplicate key",
+                    key_node.start_mark,
+                )
+            value = self.construct_object(value_node, deep=deep)  # type: ignore[no-untyped-call]
+            mapping[key] = value
+        return mapping
+
+
+class GLSLGenerator(object):
+    standard_header = """
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+"""
+
+    def __init__(self):  # type: ignore[no-untyped-def]
+        self.ops_template_params = {}
+
+    def add_params_yaml(self, parameters_yaml_file):  # type: ignore[no-untyped-def]
+        all_template_params = {}
+        with open(parameters_yaml_file, "r") as f:
+            contents = yaml.load(f, Loader=UniqueKeyLoader)
+            for key in contents:
+                all_template_params[key] = contents[key]
+        self.validate_and_construct_op_params(all_template_params)  # type: ignore[no-untyped-call]
+
+    def validate_and_construct_op_params(self, all_template_params):  # type: ignore[no-untyped-def]
+        for op in all_template_params:
+            if op in self.ops_template_params:
+                raise KeyError(f"{op} params file has already been parsed")
+            op_params_default_vals = all_template_params[op][
+                "parameter_names_with_default_values"
+            ]
+            template_params_set = set(op_params_default_vals.keys())
+            self.ops_template_params[op] = []
+            self.ops_template_params[op].append(op_params_default_vals)
+            op_template_params_values = all_template_params[op]["parameter_values"]
+            for param_vals in op_template_params_values:
+                param_vals_set = set(param_vals.keys())
+                missing_keys = template_params_set - param_vals_set
+                invalid_keys = param_vals_set - template_params_set
+                if (len(invalid_keys)) > 0:
+                    raise KeyError(f"Invalid keys {invalid_keys} are found")
+                param_vals_copy = copy.deepcopy(param_vals)
+                for key in missing_keys:
+                    param_vals_copy[key] = op_params_default_vals[key]
+                self.ops_template_params[op].append(param_vals_copy)
+
+    def generate(self, glsl_template_in, out_dir):  # type: ignore[no-untyped-def]
+        glsl_template_name = os.path.basename(glsl_template_in)
+        op_name, extension_name = glsl_template_name.split(".")
+        if extension_name != "glslt":
+            raise TypeError(f"invalid file type for glsl template {extension_name}")
+        if op_name not in self.ops_template_params:
+            raise KeyError(f"{op_name} params have not been populated")
+        code_template = CodeTemplate.from_file(glsl_template_in)
+        for template_params in self.ops_template_params[op_name]:
+            content = GLSLGenerator.standard_header
+            param_vals_string = "x".join([str(i) for i in template_params.values()])
+            output_file_name = op_name + "_" + param_vals_string + ".glsl"
+            content += code_template.substitute(template_params)
+            output_file = os.path.join(out_dir, output_file_name)
+            with open(output_file, "w") as f:
+                f.write(content)
+
+
+# Remove this
+if __name__ == "__main__":
+    pass
diff --git a/tools/gen_vulkan_spv.py b/tools/gen_vulkan_spv.py
index 1d37a95af57ec..37aa97aa1d6d7 100644
--- a/tools/gen_vulkan_spv.py
+++ b/tools/gen_vulkan_spv.py
@@ -11,6 +11,8 @@
 from dataclasses import dataclass
 from typing import List
 
+from tools.gen_vulkan_glsl import GLSLGenerator
+
 H_NAME = "spv.h"
 CPP_NAME = "spv.cpp"
 DEFAULT_ENV = {"precision": "highp", "format": "rgba32f"}
@@ -78,12 +80,40 @@ def getShaderInfo(srcFilePath):
 
     return shader_info
 
+def genGLSLFromGLSLT(src_dir_path, tmp_dir_path):
+    template_dir_path = os.path.join(src_dir_path, "templates")
+    vexs = glob.glob(os.path.join(template_dir_path, '**', '*.yaml'), recursive=True)
+    parameter_yaml_files = []
+    for f in vexs:
+        if len(f) > 1:
+            parameter_yaml_files.append(f)
+    generator = GLSLGenerator()
+    for params_yaml in parameter_yaml_files:
+        generator.add_params_yaml(params_yaml)  # type: ignore[no-untyped-call]
+
+    vexs = glob.glob(os.path.join(src_dir_path, '**', '*.glslt'), recursive=True)
+    templateSrcPaths = []
+    for f in vexs:
+        if len(f) > 1:
+            templateSrcPaths.append(f)
+            templateSrcPaths.sort()
+    for glslt in templateSrcPaths:
+        generator.generate(glslt, tmp_dir_path)  # type: ignore[no-untyped-call]
+
 def genCppH(hFilePath, cppFilePath, srcDirPath, glslcPath, tmpDirPath, env):
     print("hFilePath:{} cppFilePath:{} srcDirPath:{} glslcPath:{} tmpDirPath:{}".format(
         hFilePath, cppFilePath, srcDirPath, glslcPath, tmpDirPath))
 
     vexs = glob.glob(os.path.join(srcDirPath, '**', '*.glsl'), recursive=True)
     templateSrcPaths = []
+    for f in vexs:
+        if len(f) > 1:
+            templateSrcPaths.append(f)
+            templateSrcPaths.sort()
+
+    # Now add glsl files that are generated from templates
+    genGLSLFromGLSLT(srcDirPath, tmpDirPath)
+    vexs = glob.glob(os.path.join(tmpDirPath, '**', '*.glsl'), recursive=True)
     for f in vexs:
         if len(f) > 1:
             templateSrcPaths.append(f)
diff --git a/tools/test/test_vulkan_codegen.py b/tools/test/test_vulkan_codegen.py
new file mode 100644
index 0000000000000..26ccc66425790
--- /dev/null
+++ b/tools/test/test_vulkan_codegen.py
@@ -0,0 +1,100 @@
+import os
+import tempfile
+import unittest
+
+from tools.gen_vulkan_glsl import GLSLGenerator
+from yaml.constructor import ConstructorError
+
+
+class TestGLSLCodegen(unittest.TestCase):
+    def test_assert_on_duplicate_key_yaml(self) -> None:
+        yaml_with_duplicate_keys = """
+conv2d_pw:
+  parameter_names_with_default_values:
+      TILE_SIZE_X: 1
+      TILE_SIZE_Y: 1
+  parameter_values:
+    - TILE_SIZE_X: 2
+      TILE_SIZE_Y: 2
+    - TILE_SIZE_X: 2
+      TILE_SIZE_Y: 4
+    - TILE_SIZE_X: 4
+      TILE_SIZE_Y: 2
+    - TILE_SIZE_X: 4
+      TILE_SIZE_Y: 4
+conv2d_pw:
+  parameter_names_with_default_values:
+    - TILE_SIZE_X: 1
+    - TILE_SIZE_Y: 1
+  parameter_values:
+    - TILE_SIZE_X: 2
+      TILE_SIZE_Y: 2
+    - TILE_SIZE_X: 2
+      TILE_SIZE_Y: 4
+    - TILE_SIZE_X: 4
+      TILE_SIZE_Y: 2
+    - TILE_SIZE_X: 4
+      TILE_SIZE_Y: 4
+"""
+
+        generator = GLSLGenerator()  # type: ignore[no-untyped-call]
+        with tempfile.NamedTemporaryFile(mode="w") as fp:
+            fp.write(yaml_with_duplicate_keys)
+            fp.flush()
+            with self.assertRaisesRegex(
+                ConstructorError, r"while constructing a mapping"
+            ):
+                generator.add_params_yaml(fp.name)  # type: ignore[no-untyped-call]
+
+    def test_assert_keys_mismatch(self) -> None:
+        yaml_with_key_mismatch = """
+conv2d_pw:
+  parameter_names_with_default_values:
+      TILE_SIZE_X: 1
+      TILE_SIZE_Y: 1
+  parameter_values:
+    - TILE_SIZE_X: 2
+      TILE_SIZE_Z: 2
+"""
+
+        generator = GLSLGenerator()  # type: ignore[no-untyped-call]
+        with tempfile.NamedTemporaryFile(mode="w") as fp:
+            fp.write(yaml_with_key_mismatch)
+            fp.flush()
+            with self.assertRaisesRegex(KeyError, r"Invalid keys {'TILE_SIZE_Z'}"):
+                generator.add_params_yaml(fp.name)  # type: ignore[no-untyped-call]
+
+    def test_missing_key_default_val(self) -> None:
+        yaml_with_key_mismatch = """
+conv2d_pw:
+  parameter_names_with_default_values:
+      TILE_SIZE_X: 1
+      TILE_SIZE_Y: 1
+  parameter_values:
+    - TILE_SIZE_X: 2
+"""
+        file_content = """
+x = $TILE_SIZE_X + $TILE_SIZE_Y
+"""
+
+        generator = GLSLGenerator()  # type: ignore[no-untyped-call]
+        with tempfile.NamedTemporaryFile(mode="w") as fp:
+            fp.write(yaml_with_key_mismatch)
+            fp.flush()
+            generator.add_params_yaml(fp.name)  # type: ignore[no-untyped-call]
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                template_file_name = os.path.join(tmp_dir, "conv2d_pw.glslt")
+                with open(template_file_name, "w") as template_file:
+                    template_file.write(file_content)
+                    template_file.flush()
+                    generator.generate(template_file.name, tmp_dir)  # type: ignore[no-untyped-call]
+                    file_name_1 = os.path.join(tmp_dir, "conv2d_pw_1x1.glsl")
+                    file_name_2 = os.path.join(tmp_dir, "conv2d_pw_2x1.glsl")
+                    self.assertTrue(os.path.exists(file_name_1))
+                    self.assertTrue(os.path.exists(file_name_2))
+                    with open(file_name_1, "r") as f:
+                        contents = f.read()
+                        self.assertTrue("1 + 1" in contents)
+                    with open(file_name_2, "r") as f:
+                        contents = f.read()
+                        self.assertTrue("2 + 1" in contents)

From 0ea383298dd79714314230de73d11ae908c4f78e Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Wed, 2 Nov 2022 08:55:14 -0700
Subject: [PATCH 0541/1922] [pytorch][vulkan] Add bias storage type to template
 (#88324)

To enable buffer based use for bias as well, this diff adds storage type for
bias to template

Differential Revision: [D40689003](https://our.internmc.facebook.com/intern/diff/D40689003/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88324
Approved by: https://github.com/jmdetloff
---
 aten/src/ATen/native/vulkan/api/Common.h      | 10 +++++-----
 aten/src/ATen/native/vulkan/api/Shader.cpp    |  2 ++
 aten/src/ATen/native/vulkan/api/Shader.h      |  2 ++
 aten/src/ATen/native/vulkan/glsl/conv2d.glsl  |  1 +
 .../ATen/native/vulkan/glsl/conv2d_dw.glsl    |  1 +
 .../native/vulkan/glsl/conv_transpose2d.glsl  |  1 +
 .../native/vulkan/glsl/quantized_conv2d.glsl  |  1 +
 .../vulkan/glsl/quantized_conv2d_dw.glsl      |  1 +
 .../vulkan/glsl/quantized_conv2d_pw_2x2.glsl  |  1 +
 .../vulkan/glsl/templates/conv2d_pw.glslt     |  1 +
 tools/gen_vulkan_spv.py                       | 19 +++++++++++++++++++
 11 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h
index e52f2fca74c89..3cfee491d7eab 100644
--- a/aten/src/ATen/native/vulkan/api/Common.h
+++ b/aten/src/ATen/native/vulkan/api/Common.h
@@ -21,11 +21,11 @@
     CONCAT_LITERALS(vulkan., name), name##_spv, name##_spv_len, \
         name##_spv_layout                                       \
   }
-#define VK_SHADER(name)                                         \
-  ::at::native::vulkan::api::ShaderInfo {                       \
-    CONCAT_LITERALS(vulkan., name), name##_spv, name##_spv_len, \
-        name##_spv_layout, name##_spv_tile_size,                \
-        name##_spv_weight_storage_type,                         \
+#define VK_SHADER(name)                                                        \
+  ::at::native::vulkan::api::ShaderInfo {                                      \
+    CONCAT_LITERALS(vulkan., name), name##_spv, name##_spv_len,                \
+        name##_spv_layout, name##_spv_tile_size, name##_spv_bias_storage_type, \
+        name##_spv_weight_storage_type,                                        \
   }
 #endif /* USE_VULKAN_SHADERC_RUNTIME */
 
diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp
index 8c3e924f417bb..1ca37ba999998 100644
--- a/aten/src/ATen/native/vulkan/api/Shader.cpp
+++ b/aten/src/ATen/native/vulkan/api/Shader.cpp
@@ -56,9 +56,11 @@ ShaderInfo::ShaderInfo(
     const uint32_t size,
     const std::vector<VkDescriptorType>& layout,
     const std::vector<uint32_t>& tile_size,
+    const StorageType bias_storage_type,
     const StorageType weight_storage_type)
     : shader_src(name, spirv_bin, size, layout),
       tile_size(tile_size),
+      bias_storage_type(bias_storage_type),
       weight_storage_type(weight_storage_type) {
   for (uint64_t i = 0; i < tile_size.size(); ++i) {
     shader_src.out_tile_size.data[i] = tile_size[i];
diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h
index 9b8acb91e51ba..c676d10b19379 100644
--- a/aten/src/ATen/native/vulkan/api/Shader.h
+++ b/aten/src/ATen/native/vulkan/api/Shader.h
@@ -78,6 +78,7 @@ bool operator==(const ShaderSource& _1, const ShaderSource& _2);
 struct ShaderInfo final {
   ShaderSource shader_src;
   c10::SmallVector<uint32_t, 4> tile_size;
+  StorageType bias_storage_type{StorageType::UNKNOWN};
   StorageType weight_storage_type{StorageType::UNKNOWN};
 
   explicit ShaderInfo() = default;
@@ -88,6 +89,7 @@ struct ShaderInfo final {
       const uint32_t,
       const std::vector<VkDescriptorType>&,
       const std::vector<uint32_t>& tile_size,
+      const StorageType bias_storage_type,
       const StorageType weight_storage_type);
 };
 
diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl
index 412522aa388f6..9d73356c71e7e 100644
--- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl
@@ -5,6 +5,7 @@
 /*
  * TILE_SIZE = (1, 1, 1)
  * WEIGHT_STORAGE = TEXTURE_2D
+ * BIAS_STORAGE = TEXTURE_2D
  */
 
 layout(std430) buffer;
diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl
index 045097f548a41..89be0f3b69b21 100644
--- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl
@@ -5,6 +5,7 @@
 /*
  * TILE_SIZE = (1, 1, 1)
  * WEIGHT_STORAGE = TEXTURE_2D
+ * BIAS_STORAGE = TEXTURE_2D
  * Note that for DW kernel IC = 1 so the weight layout is really OC4, H, W, 4oc
  */
 
diff --git a/aten/src/ATen/native/vulkan/glsl/conv_transpose2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv_transpose2d.glsl
index d9589ac90becf..b3c983fc52149 100644
--- a/aten/src/ATen/native/vulkan/glsl/conv_transpose2d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/conv_transpose2d.glsl
@@ -5,6 +5,7 @@
 /*
  * TILE_SIZE = (1, 1, 1)
  * WEIGHT_STORAGE = TEXTURE_2D
+ * BIAS_STORAGE = TEXTURE_2D
  */
 
 layout(std430) buffer;
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl
index 57017cbd6e784..bb139d914f07a 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl
@@ -5,6 +5,7 @@
 /*
  * TILE_SIZE = (1, 1, 1)
  * WEIGHT_STORAGE = TEXTURE_3D
+ * BIAS_STORAGE = TEXTURE_3D
  */
 
 layout(std430) buffer;
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl
index 5441a34274c85..c2ccee79d56ad 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl
@@ -5,6 +5,7 @@
 /*
  * TILE_SIZE = (1, 1, 1)
  * WEIGHT_STORAGE = TEXTURE_3D
+ * BIAS_STORAGE = TEXTURE_3D
  * Note that for DW kernel IC = 1 so the weight layout is really OC4, H, W, 4oc
  */
 
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl
index 4522cf0a7c3d0..c8a2a98f9ef0b 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl
@@ -5,6 +5,7 @@
 /*
  * TILE_SIZE = (2, 2, 1)
  * WEIGHT_STORAGE = TEXTURE_3D
+ * BIAS_STORAGE = TEXTURE_3D
  */
 
 /*
diff --git a/aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw.glslt b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw.glslt
index 191848419ed6a..8f3c5a38db870 100644
--- a/aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw.glslt
+++ b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_pw.glslt
@@ -2,6 +2,7 @@
  * TILE_SIZE = ($TILE_SIZE_X, $TILE_SIZE_Y, 1)
  * WEIGHT_STORAGE = TEXTURE_2D
  * WEIGHT_STORAGE_LAYOUT = OC4,IC4,4ic,4oc
+ * BIAS_STORAGE = TEXTURE_2D
  */
 
 layout(std430) buffer;
diff --git a/tools/gen_vulkan_spv.py b/tools/gen_vulkan_spv.py
index 37aa97aa1d6d7..f7522a091ec9b 100644
--- a/tools/gen_vulkan_spv.py
+++ b/tools/gen_vulkan_spv.py
@@ -23,6 +23,7 @@ class ShaderInfo:
     tile_size: List[int]
     layouts: List[str]
     weight_storage_type: str = ""
+    bias_storage_type: str = ""
 
 def getName(filePath):
     return os.path.basename(filePath).replace("/", "_").replace(".", "_")
@@ -49,6 +50,15 @@ def getWeightStorageType(lineStr):
     matches = re.search(weight_storage_id, lineStr)
     return matches.group(1)
 
+def isBiasStorageTypeLine(lineStr):
+    weight_storage_id = r"^ \* BIAS_STORAGE = "
+    return re.search(weight_storage_id, lineStr)
+
+def getBiasStorageType(lineStr):
+    weight_storage_id = r"^ \* BIAS_STORAGE = ([a-zA-Z]+_\dD)"
+    matches = re.search(weight_storage_id, lineStr)
+    return matches.group(1)
+
 typeIdMapping = {
     r"image[123]D\b": "VK_DESCRIPTOR_TYPE_STORAGE_IMAGE",
     r"sampler[123]D\b": "VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER",
@@ -77,6 +87,8 @@ def getShaderInfo(srcFilePath):
                 shader_info.tile_size = findTileSizes(line)
             if isWeightStorageTypeLine(line):
                 shader_info.weight_storage_type = getWeightStorageType(line)
+            if isBiasStorageTypeLine(line):
+                shader_info.bias_storage_type = getBiasStorageType(line)
 
     return shader_info
 
@@ -204,6 +216,13 @@ def genCppH(hFilePath, cppFilePath, srcDirPath, glslcPath, tmpDirPath, env):
             cpp += "const api::StorageType {} = \n".format(name_weight_storage_type)
             cpp += "  {};\n".format(storageTypeToEnum[shader_info.weight_storage_type])
 
+        # Add bias type
+        if (shader_info.bias_storage_type != ""):
+            name_bias_storage_type = name + "_bias_storage_type"
+            h += "extern const api::StorageType {};\n".format(name_bias_storage_type)
+            cpp += "const api::StorageType {} = \n".format(name_bias_storage_type)
+            cpp += "  {};\n".format(storageTypeToEnum[shader_info.bias_storage_type])
+
     cpp += nsend
     h += nsend
 

From 0e6fb7e56a60404025e19e51e1460774156d1858 Mon Sep 17 00:00:00 2001
From: Max Ren <maxren@meta.com>
Date: Thu, 3 Nov 2022 20:05:53 +0000
Subject: [PATCH 0542/1922] [coreml] delegate multiple outputs (#88345)

Summary:
https://www.internalfb.com/code/fbsource/[c0e4da0b5c7fff3b4e31e4611033c30cabdc6aef]/fbcode/caffe2/torch/csrc/jit/backends/backend_detail.cpp?lines=268-276

seems like the torchscript addition of
`$unpack, = self.__backend.execute( ... `

the comma after unpack forces the result of execute to have only one item. So for this fix now when the size of the outputs > 1, execute returns a List List of outputs (basically put the outputs in another list before putting it into the list we return)
```
[[output1, output2, output3, ...]]
```
instead of
```
[output1, output2, output3, ...]
```

Do we want to fix this in backend_detail? Or should we make the change in our delegate to accomadate the torchscript? Proposing this q here. Requesting cccclai, kimishpatel for approval here

Test Plan: unblocked models for chengxiangyin and models in pytorch playground all passing unit tests

Reviewed By: kimishpatel, cccclai

Differential Revision: D40328684

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88345
Approved by: https://github.com/jmdetloff, https://github.com/Skylion007
---
 torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm
index 3588128cc0522..9db3509dc1d2b 100644
--- a/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm
+++ b/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm
@@ -88,9 +88,14 @@ GenericList pack_outputs(const std::vector<TensorSpec>& output_specs, id<MLFeatu
       tensor.data_ptr<float>(),
       (float*)val.multiArrayValue.dataPointer,
       count * sizeof(float));
-    outputs.push_back(tensor);
+    outputs.push_back(std::move(tensor));
   }
-  return c10::impl::toList(outputs);
+  if(output_specs.size() > 1){
+    c10::List<c10::List<torch::Tensor>> output_res;
+    output_res.push_back(std::move(outputs));
+    return c10::impl::toList(std::move(output_res));
+  }
+  return c10::impl::toList(std::move(outputs));
 }
 
 class CoreMLBackend: public torch::jit::PyTorchBackendInterface {

From e57584a46d6eb1bb513a68c987472f22e8230f7f Mon Sep 17 00:00:00 2001
From: Author Name <bcoutinho@meta.com>
Date: Thu, 3 Nov 2022 20:08:16 +0000
Subject: [PATCH 0543/1922] [profiler] Add an option initialize kineto profiler
 on start up (#87226) (#88020)

Summary:
# Initialize Kineto Profiler for on-demand profiling

## TLDR
Overall this patch enables initializing the kineto profiling library on start-up. This is guarded by an env variable that is described a bit more later. The kineto profiler is otherwise initialized lazily when pytorch profiler is invoked.

## Background
We are enabling on-demand profiling capability for pytorch. As users run large distributed training flows this will enable one to capture a pytorch profiler/GPU trace remotely, from outside the process. The kineto library and a monitoring daemon - dynolog- interact to achieve this.

Dynolog will be open sourced by end of October, and has been dogfooded on Meta AI Research cluster.
https://github.com/facebookincubator/dynolog

### How it works
Kineto library registers itself with the dynolog daemon running on the host over inter process communication
```
  | kineto  |   --> (ipcfabric)  --> | dynolog |
   * register()
   * poll for on-demand tracing configs()
```
This feature is currently enabled by setting the env variable `KINETO_USE_DAEMON`.  However, it only works if we initialize kineto, else the thread to talk to dynolog is not spun up.

Related PRs in kineto include
https://github.com/pytorch/kineto/pull/637
https://github.com/pytorch/kineto/pull/653

## TestPlan:
Build pytorch from source (need to set USE_LITE_INTERPRETER_PROFILER=OFF)

Run a simple linear model [example](https://pytorch.org/tutorials/beginner/basics/tensorqs_tutorial.html).

### First run with the env variable set
```
export KINETO_CONFIG=/private/home/bcoutinho//libkineto.conf
export KINETO_USE_DAEMON=1
python3 /private/home/bcoutinho/linear_model.py
```
Output
```
INFO:2022-10-18 09:01:12 4169946:4169946 init.cpp:98] Registering daemon config loader
cuda:0
```
We can trigger a trace using the dynolog client tool
```
#> dyno gputrace --log-file /tmp/gpu_trace_test.json
response length = 147
response = {"activityProfilersBusy":0,"activityProfilersTriggered":[4116844],"eventProfilersBusy":0,"eventProfilersTriggered":[],"processesMatched":[4116844]}
Matched 1 processes
Trace output files will be written to:
    /tmp/gpu_trace_test_4116844.json
```

### Run without env variable.
```
 python3 ../../linear_model.py
cuda:0
99 1425.056884765625
10099 8.817168235778809
```

## Side effects to initialization

Currently the environment should guard users from picking this change up unless intended. The libkineto_init does setup CUPTI APIs and spins up a thread to read on-demand configurations. This should not be problematic, we can provide a more granular init in the future.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87226

Reviewed By: chaekit

Differential Revision: D40558184

Pulled By: briancoutinho

fbshipit-source-id: afea7502b1d72201c00994c87fde63a35783f4d5

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88020
Approved by: https://github.com/chaekit
---
 torch/csrc/profiler/kineto_client_interface.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/torch/csrc/profiler/kineto_client_interface.cpp b/torch/csrc/profiler/kineto_client_interface.cpp
index 1eea7002f6afb..c9e07ca367c5c 100644
--- a/torch/csrc/profiler/kineto_client_interface.cpp
+++ b/torch/csrc/profiler/kineto_client_interface.cpp
@@ -1,6 +1,7 @@
 #ifdef USE_KINETO
 #include <libkineto.h>
 #include <torch/csrc/autograd/profiler_kineto.h>
+#include <cstdlib>
 
 // Ondemand tracing is not supported on Apple or edge platform
 #if defined(__APPLE__) || defined(EDGE_PROFILER_USE_KINETO)
@@ -61,12 +62,22 @@ class LibKinetoClient : public libkineto::ClientInterface {
 } // namespace profiler
 
 #if ENABLE_GLOBAL_OBSERVER
+namespace {
+
 struct RegisterLibKinetoClient {
   RegisterLibKinetoClient() {
     static profiler::impl::LibKinetoClient client;
+
+    if (std::getenv("KINETO_USE_DAEMON") != nullptr) {
+      libkineto_init(/*cpuOnly=*/false, /*logOnError=*/true);
+      libkineto::api().suppressLogMessages();
+    }
+
     libkineto::api().registerClient(&client);
   }
 } register_libkineto_client;
+
+} // namespace
 #endif
 
 } // namespace torch

From 96f613c6c5658f09d7c0f6efd5007a47ee0a129c Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 3 Nov 2022 16:26:26 +0000
Subject: [PATCH 0544/1922] [FSDP] Default to `BACKWARD_PRE` (#88428)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88428
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/fully_sharded_data_parallel.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 32e048faa1a17..6ab5e70389838 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -329,10 +329,8 @@ class FullyShardedDataParallel(nn.Module):
                 >>> my_auto_wrap_policy = functools.partial(custom_auto_wrap_policy, min_num_params=1e5)
 
         backward_prefetch (Optional[BackwardPrefetch]):
-            This is an experimental feature that is subject to change in the
-            the near future. It allows users to enable two different backward_prefetch
-            algorithms to help backward communication and computation overlapping.
-            Pros and cons of each algorithm is explained in the class ``BackwardPrefetch``.
+            This configures explicit backward prefetching of all-gathers. See
+            :class:`BackwardPrefetch` for details. (Default: ``BACKWARD_PRE``)
         mixed_precision (Optional[MixedPrecision]): A ``MixedPrecision`` instance
             describing the mixed precision training config to be used. ``MixedPrecision``
             supports configuring parameter, buffer, and gradient communication dtype. Note
@@ -428,7 +426,7 @@ def __init__(
         sharding_strategy: Optional[ShardingStrategy] = None,
         cpu_offload: Optional[CPUOffload] = None,
         auto_wrap_policy: Optional[Callable] = None,
-        backward_prefetch: Optional[BackwardPrefetch] = None,
+        backward_prefetch: Optional[BackwardPrefetch] = BackwardPrefetch.BACKWARD_PRE,
         mixed_precision: Optional[MixedPrecision] = None,
         ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
         param_init_fn: Optional[Callable[[nn.Module], None]] = None,

From c55eadbf2dc419a46a48a517ae3d5b387ff47fac Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchu@microsoft.com>
Date: Thu, 3 Nov 2022 20:18:33 +0000
Subject: [PATCH 0545/1922] [ONNX] Remove the INT64_MAX magic numbers (#88341)

Remove the magic numbers in symbolic opsets and use a INT64_MAX  global instead.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88341
Approved by: https://github.com/BowenBao
---
 torch/onnx/_constants.py       |  2 ++
 torch/onnx/symbolic_opset10.py | 11 +++++++----
 torch/onnx/symbolic_opset9.py  | 17 ++++++++++++-----
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/torch/onnx/_constants.py b/torch/onnx/_constants.py
index 8b71a4f86c173..ed27f94a9e144 100644
--- a/torch/onnx/_constants.py
+++ b/torch/onnx/_constants.py
@@ -10,3 +10,5 @@
 ONNX_CONSTANT_FOLDING_MIN_OPSET = 9
 
 PYTORCH_GITHUB_ISSUES_URL = "https://github.com/pytorch/pytorch/issues"
+
+INT64_MAX = 9223372036854775807
diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py
index d09133a60b9d0..a02009a74f696 100644
--- a/torch/onnx/symbolic_opset10.py
+++ b/torch/onnx/symbolic_opset10.py
@@ -10,6 +10,7 @@
 
 # Monkey-patch graph manipulation methods on Graph, used for the ONNX symbolics
 from torch.onnx import (  # noqa: F401
+    _constants,
     _patch_torch,
     _type_utils,
     errors,
@@ -345,7 +346,7 @@ def _slice(
         if (
             len(starts) == 1
             and starts[0] == 0
-            and ends[0] == 9223372036854775807
+            and ends[0] == _constants.INT64_MAX
             and (steps is None or (len(steps) == 1 and steps[0] == 1))
         ):
             return input
@@ -388,11 +389,13 @@ def slice(g: jit_utils.GraphContext, self, *args):
         if is_start_none:
             start = g.op("Constant", value_t=torch.tensor(0))
         if is_end_none:
-            end = g.op("Constant", value_t=torch.tensor(9223372036854775807))
+            end = g.op("Constant", value_t=torch.tensor(_constants.INT64_MAX))
     else:
         start = [0 if is_start_none else symbolic_helper._parse_arg(start, "i")]
         end = [
-            9223372036854775807 if is_end_none else symbolic_helper._parse_arg(end, "i")
+            _constants.INT64_MAX
+            if is_end_none
+            else symbolic_helper._parse_arg(end, "i")
         ]
         dim = [symbolic_helper._parse_arg(dim, "i")]
         dynamic_slice = False
@@ -416,7 +419,7 @@ def flip(g: jit_utils.GraphContext, input, dims):
         input,
         axes=dims,
         starts=[-1] * len(dims),
-        ends=[-9223372036854775807] * len(dims),
+        ends=[-_constants.INT64_MAX] * len(dims),
         steps=[-1] * len(dims),
     )
 
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 546a4fa7ce260..c02fb0f200909 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -285,7 +285,6 @@
 ]
 
 
-_INT64_MAX = 9223372036854775807
 _onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=9)
 
 
@@ -707,7 +706,7 @@ def sign(g: jit_utils.GraphContext, self):
 @_beartype.beartype
 def _slice(g: jit_utils.GraphContext, input, axes, starts, ends):
     assert len(starts) == len(ends)
-    if len(starts) == 1 and starts[0] == 0 and ends[0] == _INT64_MAX:
+    if len(starts) == 1 and starts[0] == 0 and ends[0] == _constants.INT64_MAX:
         return input
     return g.op("Slice", input, axes_i=axes, starts_i=starts, ends_i=ends)
 
@@ -1130,7 +1129,7 @@ def select(g: jit_utils.GraphContext, self, dim, index):
     index = symbolic_helper._maybe_get_scalar(index)
     if (not symbolic_helper._is_value(index)) and (index < 0):
         if index == -1:
-            end_index = _INT64_MAX
+            end_index = _constants.INT64_MAX
         else:
             end_index = index + 1
         slice_node = symbolic_helper._slice_helper(
@@ -3814,7 +3813,11 @@ def slice(g: jit_utils.GraphContext, self, *args):
                 )
         else:
             start = 0 if is_start_none else symbolic_helper._parse_arg(start, "i")
-            end = _INT64_MAX if is_end_none else symbolic_helper._parse_arg(end, "i")
+            end = (
+                _constants.INT64_MAX
+                if is_end_none
+                else symbolic_helper._parse_arg(end, "i")
+            )
             dim = symbolic_helper._parse_arg(dim, "i")
             return symbolic_helper._slice_helper(
                 g, self, axes=[dim], starts=[start], ends=[end]
@@ -3830,7 +3833,11 @@ def slice(g: jit_utils.GraphContext, self, *args):
             end.type(), _C.NoneType
         )
         start = 0 if is_start_none else symbolic_helper._parse_arg(start, "i")
-        end = _INT64_MAX if is_end_none else symbolic_helper._parse_arg(end, "i")
+        end = (
+            _constants.INT64_MAX
+            if is_end_none
+            else symbolic_helper._parse_arg(end, "i")
+        )
         return symbolic_helper._slice_helper(
             g, self, axes=[dim], starts=[start], ends=[end]
         )

From 1c117fd3f66a9b031bb295ebf5d9435d5a4b137e Mon Sep 17 00:00:00 2001
From: Po-Wei Chou <poweic@meta.com>
Date: Thu, 3 Nov 2022 20:20:49 +0000
Subject: [PATCH 0546/1922] [pytorch] Expose EmbeddingPackedParamsBase::unpack
 to Python (#88362)

Summary:
User can't call `.unpack()` when they have a quantized Embedding layer because `&EmbeddingPackedParamsBase::unpack` was never exposed to Python through pybind.

This diff fixes that.

Test Plan: CI

Reviewed By: jerryzh168

Differential Revision: D40606585

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88362
Approved by: https://github.com/jerryzh168
---
 aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
index 658e2c48481e6..8af21bbc7df8b 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp
@@ -560,6 +560,7 @@ int register_embedding_params() {
             return PackedEmbeddingBagWeight::prepack(weight);
           })
       .def("bit_rate", &EmbeddingPackedParamsBase::bit_rate)
+      .def("unpack", &EmbeddingPackedParamsBase::unpack)
       .def("version", &EmbeddingPackedParamsBase::version);
 
   return 0;

From 29de82dc47620334b794bf43207bffa6ec044d80 Mon Sep 17 00:00:00 2001
From: Sam Tsai <sstsai@meta.com>
Date: Thu, 3 Nov 2022 20:32:54 +0000
Subject: [PATCH 0547/1922] Fix fuse_func method overwrite (#87791) (#88193)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87791

Fixing the interface so that the fuse_func is honored and not replaced but the default fuse_known_method.

Test Plan: Wait for sandcastle

Reviewed By: jerryzh168

Differential Revision: D40722395

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88193
Approved by: https://github.com/jerryzh168
---
 test/quantization/eager/test_fuse_eager.py | 12 ++++++++++++
 torch/ao/quantization/fuse_modules.py      |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/test/quantization/eager/test_fuse_eager.py b/test/quantization/eager/test_fuse_eager.py
index 9f120b889c2e5..1ebc4bfd094eb 100644
--- a/test/quantization/eager/test_fuse_eager.py
+++ b/test/quantization/eager/test_fuse_eager.py
@@ -28,6 +28,7 @@
     ModelForLinearBNFusion,
     ModelForFusionWithBias,
     ModelForConvTransposeBNFusion,
+    SingleLayerLinearModel,
     test_only_eval_fn,
     test_only_train_fn,
     skipIfNoFBGEMM,
@@ -363,6 +364,17 @@ def test_fusion_convtranspose_bn_eval(self):
 
         self.assertEqual(golden, model(inp2))
 
+    def test_fuse_function_customization(self):
+        dummy_model = SingleLayerLinearModel().train()
+        dummy_model.eval()
+
+        # A custom fuse funct
+        def custom_fuse_func(module, is_qat, add_fuser_mapping):
+            return [torch.nn.Identity()]
+
+        dummy_model = fuse_modules(dummy_model, [["fc1"]], fuser_func=custom_fuse_func)
+        self.assertEqual(type(dummy_model.fc1), nn.Identity)
+
     def test_forward_hooks_preserved(self):
         r"""Test case that checks whether forward pre hooks of the first module and
         post forward hooks of the last module in modules list passed to fusion function preserved.
diff --git a/torch/ao/quantization/fuse_modules.py b/torch/ao/quantization/fuse_modules.py
index eb7296e38f60f..6cf37af0cf934 100644
--- a/torch/ao/quantization/fuse_modules.py
+++ b/torch/ao/quantization/fuse_modules.py
@@ -160,7 +160,7 @@ def fuse_modules(model, modules_to_fuse, inplace=False, fuser_func=fuse_known_mo
         modules_to_fuse,
         is_qat=False,
         inplace=inplace,
-        fuser_func=fuse_known_modules,
+        fuser_func=fuser_func,
         fuse_custom_config_dict=None)
 
 def fuse_modules_qat(model, modules_to_fuse, inplace=False, fuser_func=fuse_known_modules, fuse_custom_config_dict=None):
@@ -171,5 +171,5 @@ def fuse_modules_qat(model, modules_to_fuse, inplace=False, fuser_func=fuse_know
         modules_to_fuse,
         is_qat=True,
         inplace=inplace,
-        fuser_func=fuse_known_modules,
+        fuser_func=fuser_func,
         fuse_custom_config_dict=None)

From df8dc129a224b515e9bb752e19adf988c8d42b51 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Thu, 3 Nov 2022 18:22:44 +0000
Subject: [PATCH 0548/1922] Dont hold onto references of saved tensors in
 backward (#88247)

This improves memory compression of resnet18 on inductor non-cudagraphs from .78 -> .0.84.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88247
Approved by: https://github.com/ezyang
---
 functorch/_src/aot_autograd.py | 3 +++
 functorch/_src/compilers.py    | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index e0a9e10d11bea..e0c9d09d821d6 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -473,12 +473,15 @@ def forward(ctx, *deduped_flat_tensor_args):
         def backward(ctx, *flat_args):
             contiguous_args = [t.contiguous() if torch.is_tensor(t) else t for t in flat_args]
             all_args = list(ctx.symints) + list(ctx.saved_tensors) + list(contiguous_args)
+            del contiguous_args
             if CompiledFunction.compiled_bw is None:
+                # TODO - pass in fake tensors ?
                 context = disable_autocast_manager if disable_amp else nullcontext
                 with context(), track_graph_compiling("backward", True):
                     CompiledFunction.compiled_bw = aot_config.bw_compiler(
                         bw_module, all_args
                     )
+
             ctx.maybe_clear_saved_tensors()
             out = call_func_with_args(
                 CompiledFunction.compiled_bw, all_args, steal_args=True, disable_amp=disable_amp
diff --git a/functorch/_src/compilers.py b/functorch/_src/compilers.py
index 18deafa244695..3f52fede57ebf 100644
--- a/functorch/_src/compilers.py
+++ b/functorch/_src/compilers.py
@@ -85,7 +85,8 @@ def ts_compile(fx_g: fx.GraphModule, inps) -> Callable:
 
         f = torch.jit.freeze(f.eval())
         f = torch.jit.optimize_for_inference(f)
-        f(*inps)
+        if not any(isinstance(t, torch._subclasses.FakeTensor) for t in inps):
+            f(*inps)
     return f
 
 
From f19f6f38a05574e5c01774ee2998a65162fb6ab1 Mon Sep 17 00:00:00 2001
From: samdow <samdow@fb.com>
Date: Thu, 3 Nov 2022 21:50:52 +0000
Subject: [PATCH 0549/1922] [functorch] make hessian docs actually use hessian
 function (#88451)

I was going through the hessian docs to find an example and noticed that these docs don't actually use the hessian function....
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88451
Approved by: https://github.com/zou3519, https://github.com/Skylion007
---
 functorch/_src/eager_transforms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/functorch/_src/eager_transforms.py b/functorch/_src/eager_transforms.py
index 209a738060bed..6144318edf3a3 100644
--- a/functorch/_src/eager_transforms.py
+++ b/functorch/_src/eager_transforms.py
@@ -1060,7 +1060,7 @@ def hessian(func, argnums=0):
         >>>   return x.sin().sum()
         >>>
         >>> x = torch.randn(5)
-        >>> hess = jacfwd(jacrev(f))(x)
+        >>> hess = hessian(f)(x)  # equivalent to jacfwd(jacrev(f))(x)
         >>> assert torch.allclose(hess, torch.diag(-x.sin()))
 
     """

From 24332e072fd58244af089e1a9c92c099e440d7fc Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 3 Nov 2022 21:57:19 +0000
Subject: [PATCH 0550/1922] Revert "Put Python Dispatcher cache in dict, clear
 it on new registrations. (#88329)"

This reverts commit 86c7cd287caeb23c227d97d283e58bc123294746.

Reverted https://github.com/pytorch/pytorch/pull/88329 on behalf of https://github.com/clee2000 due to test_decomp takes an extra 2 hours in some jobs, windows takes so long it times out
---
 torch/_ops.py                           | 21 +++++++++++----------
 torch/csrc/autograd/python_variable.cpp | 13 ++-----------
 2 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/torch/_ops.py b/torch/_ops.py
index feb05771b4931..a4119d758524f 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -243,8 +243,6 @@ def __init__(self, overloadpacket, op, op_dk, schema, tags):
         op.__module__ = overloadpacket.__module__
         self.__qualname__ = self._name
         self.__annotations__ = {}
-        # NB: This name is hard-coded in torch/csrc/autograd/python_variable.cpp
-        self._dispatch_cache = {}
 
     # it's a no-op since OpOverload object is immutable and must be unique for a given op overload.
     def __deepcopy__(self, memo=None):
@@ -291,7 +289,6 @@ def inner(fn):
                 assert mode not in self.python_key_mode_table
                 # TODO(voz): Should we replace setting torch._C.DispatchKey.Python entirely with setting mode keys?
                 self.python_key_mode_table[mode] = fn
-                self._dispatch_cache.clear()
                 return fn
 
             assert isinstance(dispatch_key_or_mode, torch._C.DispatchKey)
@@ -304,19 +301,23 @@ def inner(fn):
                     f"Trying to override a python impl for {dispatch_key_or_mode} on operator {self._name}"
                 )
             self.py_kernels[dispatch_key_or_mode] = fn
-            self._dispatch_cache.clear()
             return fn
 
         return inner
 
     # This implements the pre-computation logic for the Python dispatcher.
-    def _get_dispatch(self, key):
-        # This is only called upon a cache miss
-        assert key not in self._dispatch_cache
+    def __getattr__(self, attr):
+        if len(attr) == 0 or not attr[0].isupper():
+            raise AttributeError()
+
+        try:
+            key = torch._C._dispatch_key_parse(attr)
+        except Exception as e:
+            raise AttributeError()
 
         if key == torch._C.DispatchKey.Python:
             if not self.python_key_mode_table:
-                self._dispatch_cache[key] = key
+                setattr(self, attr, key)
                 return key
 
             def handler(*args, **kwargs):
@@ -335,12 +336,12 @@ def handler(*args, **kwargs):
                 # TODO(voz): The idea behind this is that we do not yet support dispatch by key + mode, only key.
                 return self.python_key_mode_table[curr_mode](*args, **kwargs)
 
-            self._dispatch_cache[key] = handler
+            setattr(self, attr, handler)
             return handler
 
         key = resolve_key(self, key)
         r = self.py_kernels.get(key, key)
-        self._dispatch_cache[key] = r
+        setattr(self, attr, r)
         return r
 
     def name(self):
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 920d0e7344b58..488858246429b 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -2321,20 +2321,11 @@ void ConcretePyInterpreterVTable::python_dispatcher(
     torch::jit::Stack* stack) const {
   py::gil_scoped_acquire g;
   py::handle torch_api_function_overload = getTorchApiFunction(op);
-  // TODO: if necessary, can optimize to cache the cache lookup
-  // TODO: if necessary, can optimize OpOverload to have slots
-  auto cache = py::dict(torch_api_function_overload.attr("_dispatch_cache"));
-  if (cache.ptr() == nullptr) {
-    throw python_error();
-  }
 
   c10::DispatchKey k = ks.highestPriorityTypeId();
-  // TODO: allow this to be non-owning
-  auto handler = py::reinterpret_borrow<py::object>(
-      PyDict_GetItem(cache.ptr(), py::cast(k).ptr()));
+  auto handler = torch_api_function_overload.attr(toString(k));
   if (handler.ptr() == nullptr) {
-    // Slow path
-    handler = torch_api_function_overload.attr("_get_dispatch")(k);
+    throw python_error();
   }
   if (py::isinstance<c10::DispatchKey>(handler)) {
     // NB: not redispatch, as that will permanently remove the python

From 4bae25bcb6ff586880f3746005ca94b7edb0b2de Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Thu, 3 Nov 2022 22:56:05 +0000
Subject: [PATCH 0551/1922] [dynamo] Skip mutation detection for inference mode
 (#88406)

Skip the mutation detection for inference_mode, and raise a warning. This helps one internal model

Related to https://github.com/pytorch/torchdynamo/issues/1768

@ezyang What do you think about this? The issue that Dynamo mutation detector uses version counter to detect mutation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88406
Approved by: https://github.com/ezyang
---
 test/dynamo/test_misc.py                | 15 ++++++++++++++
 torch/_dynamo/optimizations/training.py | 27 +++++++++++++++----------
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 42244892b45f6..c309335f0a9d1 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2775,6 +2775,21 @@ def f(x):
         with self.assertRaisesRegex(RuntimeError, "Detected that you are using FX"):
             gm = torch.fx.symbolic_trace(optimized)
 
+    def test_inference_mode(self):
+        @torch.inference_mode()
+        def func(x, y):
+            return x.add(1.0) + y
+
+        x = torch.ones(4, requires_grad=True)
+        y = torch.ones(4, requires_grad=True)
+        ref = func(x, y)
+        opt_func = torch._dynamo.optimize("eager")(func)
+
+        x1 = torch.ones(4, requires_grad=True)
+        res = opt_func(x1, y)
+        self.assertTrue(same(ref, res))
+        self.assertTrue(same(x, x1))
+
 
 class CustomFunc(torch.autograd.Function):
     @staticmethod
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 536706cf4c8f5..588956a898f41 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -51,18 +51,23 @@ def raise_or_warn(reason):
     # 2) Mutation in the graph
     mutated = False
     try:
-        if functorch.compile.config.use_functionalize:
-            # There are two problematic classes we still exclude for now with
-            # functionalization:
-            #   - data mutation of inputs (fixed when we stop recording the
-            #   copy_ directly into the graph)
-            #   - metadata mutation of inputs (fixed if we do an extra partition
-            #   to avoid AotAutograd on the mutated inputs, or if we some how
-            #   get custom autograd function to reflect metadata changes to the
-            #   original tensor)
-            mutated = has_mutation(gm, example_inputs, inputs_only=True)
+        if not torch.is_inference_mode_enabled():
+            if functorch.compile.config.use_functionalize:
+                # There are two problematic classes we still exclude for now with
+                # functionalization:
+                #   - data mutation of inputs (fixed when we stop recording the
+                #   copy_ directly into the graph)
+                #   - metadata mutation of inputs (fixed if we do an extra partition
+                #   to avoid AotAutograd on the mutated inputs, or if we some how
+                #   get custom autograd function to reflect metadata changes to the
+                #   original tensor)
+                mutated = has_mutation(gm, example_inputs, inputs_only=True)
+            else:
+                mutated = has_mutation(gm, example_inputs)
         else:
-            mutated = has_mutation(gm, example_inputs)
+            log.info(
+                "inference_mode enabled. TorchDynamo could not check for mutation."
+            )
     except NotImplementedError as e:
         if "SparseTensorImpl" not in str(e):
             # TODO - TorchDynamo mutation analysis cannot handle sparse tensors.

From 4024445bb5fb476826db17a919a7123e1b5dd082 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 3 Nov 2022 16:26:36 +0000
Subject: [PATCH 0552/1922] [FSDP][Docs] Simplify `mixed_precision` ctor docs
 (#88429)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88429
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/api.py                 | 15 ++++++++++++--
 .../fsdp/fully_sharded_data_parallel.py       | 20 +++++--------------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/torch/distributed/fsdp/api.py b/torch/distributed/fsdp/api.py
index 8e0344318e531..9e1327c80633c 100644
--- a/torch/distributed/fsdp/api.py
+++ b/torch/distributed/fsdp/api.py
@@ -99,6 +99,10 @@ class MixedPrecision:
             pass. This may be set to ``False`` to save memory if using custom
             optimizers that can perform the optimizer step in ``reduce_dtype``.
 
+    .. note:: This API is experimental and subject to change.
+
+    .. note:: Only floating point tensors are cast to their specified dtypes.
+
     .. note:: In ``summon_full_params``, parameters are forced to full
         precision, but buffers are not.
 
@@ -106,8 +110,6 @@ class MixedPrecision:
         precision. For buffers, this is only supported for
         ``StateDictType.FULL_STATE_DICT``.
 
-    .. note:: This API is experimental and subject to change.
-
     .. note:: Each low precision dtype must be specified explicitly. For
         example, ``MixedPrecision(reduce_dtype=torch.float16)`` only specifies
         the reduction dtype to be low precision, and FSDP will not cast
@@ -116,6 +118,15 @@ class MixedPrecision:
     .. note:: If a ``reduce_dtype`` is not specified, then gradient reduction
         happens in ``param_dtype`` if specified or the original parameter dtype
         otherwise.
+
+    .. note:: If the user passes a model with ``BatchNorm`` modules and an
+        ``auto_wrap_policy`` to the FSDP constructor, then FSDP will disable
+        mixed precision for ``BatchNorm`` modules by wrapping them separately
+        in their own FSDP instance with mixed precision disabled. This is due
+        to some missing low precision ``BatchNorm`` kernels. If the user does
+        not use an ``auto_wrap_policy``, then the user must take care to not
+        use mixed precision for FSDP instances containing ``BatchNorm``
+        modules.
     """
 
     param_dtype: Optional[torch.dtype] = None
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 6ab5e70389838..7c910e7b8c098 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -331,21 +331,11 @@ class FullyShardedDataParallel(nn.Module):
         backward_prefetch (Optional[BackwardPrefetch]):
             This configures explicit backward prefetching of all-gathers. See
             :class:`BackwardPrefetch` for details. (Default: ``BACKWARD_PRE``)
-        mixed_precision (Optional[MixedPrecision]): A ``MixedPrecision`` instance
-            describing the mixed precision training config to be used. ``MixedPrecision``
-            supports configuring parameter, buffer, and gradient communication dtype. Note
-            that only floating point data is cast to the reduced precision. This allows
-            users potential memory saving and training speedup while trading off
-            accuracy during model training. If ``None``, no mixed precision is applied.
-            Note that if ``mixed_precision`` is enabled for FSDP model that
-            contains ``BatchNorm`` with ``auto_wrap_policy``, FSDP will take
-            care to disable mixed precision for ``BatchNorm`` units by wrapping
-            them separately in their own FSDP unit with ``mixed_precision=None``.
-            This is done because several ``BatchNorm`` kernels do not implement
-            reduced type support at the moment. If individually wrapping the model,
-            users must take care to set ``mixed_precision=None`` for
-            ``BatchNorm`` units.
-            (Default: ``None``)
+        mixed_precision (Optional[MixedPrecision]):
+            This configures native mixed precision for FSDP. If this is set to
+            ``None``, then no mixed precision is used. Otherwise, parameter,
+            buffer, and gradient reduction dtypes can be set. See
+            :class:`MixedPrecision` for details. (Default: ``None``)
         ignored_modules (Optional[Iterable[torch.nn.Module]]): Modules whose
             own parameters and child modules' parameters and buffers are
             ignored by this instance. None of the modules directly in

From 146f70d6d9ac5cd164185391b6352ddf626d5231 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Thu, 3 Nov 2022 23:15:39 +0000
Subject: [PATCH 0553/1922] Reset NVIDIA devices stuck in failed mode (#88459)

Try to reset the NVIDIA devices if they get stuck in failed mode per comment in https://github.com/pytorch/pytorch/issues/88388

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88459
Approved by: https://github.com/malfet
---
 .github/scripts/install_nvidia_utils_linux.sh | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index 7806dced2f17f..b1fdd468a7488 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -59,8 +59,28 @@ install_nvidia_driver_amzn2() {
             sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
             sudo modprobe backlight
             sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
-            sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
+
+            set +e
+            sudo /bin/bash /tmp/nvidia_driver -s --no-drm
+            NVIDIA_INSTALLATION_STATUS=$?
+
+            if [ "$NVIDIA_INSTALLATION_STATUS" -ne 0 ]; then
+                sudo cat /var/log/nvidia-installer.log
+
+                NVIDIA_DEVICES=$(lspci -D | grep -i NVIDIA | cut -d' ' -f1)
+                # The GPU can get stuck in a failure state if somehow the test crashs the GPU microcode. When this
+                # happens, we'll try to reset all NVIDIA devices https://github.com/pytorch/pytorch/issues/88388
+                for PCI_ID in "$NVIDIA_DEVICES"; do
+                    DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)
+
+                    echo "Reseting $PCI_ID (enabled state: $DEVICE_ENABLED)"
+                    echo "1" > /sys/bus/pci/devices/$PCI_ID/reset
+                    sleep 1
+                done
+            fi
+
             sudo rm -fv /tmp/nvidia_driver
+            set -e
         fi
 
         sudo modprobe nvidia || true

From 47d5732e8dc8f8ee4ba908e537e2a5aa9017946e Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Thu, 3 Nov 2022 19:30:05 +0000
Subject: [PATCH 0554/1922] [FSDP] Allow to use TorchDispatch with FSDP
 (#88014)

Add `_no_dispatch_record_stream` to disable TorchDispatch before calling `record_stream()`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88014
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/_runtime_utils.py         | 16 ++++++++++++----
 torch/distributed/fsdp/_utils.py                 |  8 +++++++-
 torch/distributed/fsdp/flat_param.py             | 13 ++++++++-----
 .../fsdp/fully_sharded_data_parallel.py          |  1 +
 4 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 3127881de34a2..e0fa12e19c2a2 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -14,7 +14,11 @@
     _is_composable,
     TrainingState,
 )
-from torch.distributed.fsdp._utils import _apply_to_tensors, p_assert
+from torch.distributed.fsdp._utils import (
+    _apply_to_tensors,
+    _no_dispatch_record_stream,
+    p_assert,
+)
 from torch.distributed.fsdp.api import BackwardPrefetch
 from torch.distributed.fsdp.flat_param import (
     _HandlesKey,
@@ -572,12 +576,16 @@ def _post_backward_hook(
                 # Since the sharded gradient is produced in the post-backward
                 # stream and consumed later in the computation stream, inform
                 # the caching allocator
-                sharded_grad.data.record_stream(torch.cuda.current_stream())
+                _no_dispatch_record_stream(
+                    sharded_grad.data, torch.cuda.current_stream()
+                )
 
             # Since the unsharded gradient is produced in the computation
             # stream and consumed in the post-backward stream, inform the
             # caching allocator (before it goes out of scope)
-            unsharded_grad_data.record_stream(state._streams["post_backward"])
+            _no_dispatch_record_stream(
+                unsharded_grad_data, state._streams["post_backward"]
+            )
 
             if handle._use_orig_params:
                 # Since the handle's `FlatParameter` completed its gradient
@@ -630,7 +638,7 @@ def _cast_grad_to_param_dtype(
         # caching allocator; for the sharded strategies, the gradient is
         # produced in the post-backward stream, so this `record_stream()`
         # should be a no-op
-        low_prec_grad_data.record_stream(torch.cuda.current_stream())
+        _no_dispatch_record_stream(low_prec_grad_data, torch.cuda.current_stream())
 
 
 def _check_comm_hook(
diff --git a/torch/distributed/fsdp/_utils.py b/torch/distributed/fsdp/_utils.py
index 5aae5e918c4a1..bf7937451a29a 100644
--- a/torch/distributed/fsdp/_utils.py
+++ b/torch/distributed/fsdp/_utils.py
@@ -1,7 +1,7 @@
 import dataclasses
 import traceback
 from collections import OrderedDict
-from typing import Any, Callable, Dict, List, Set, Tuple, Union
+from typing import Any, Callable, cast, Dict, List, Set, Tuple, Union
 
 import torch
 from torch.nn.modules.batchnorm import _BatchNorm
@@ -9,6 +9,7 @@
     _is_namedtuple,
 )
 from torch.nn.utils.rnn import PackedSequence
+from torch.utils._mode_utils import no_dispatch
 
 
 def _contains_batchnorm(module):
@@ -115,3 +116,8 @@ def p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> None:
         traceback.print_stack()
         if raise_assertion_error:
             raise AssertionError(s)
+
+
+def _no_dispatch_record_stream(tensor: torch.Tensor, stream: torch.cuda.Stream) -> None:
+    with no_dispatch():
+        tensor.record_stream(cast(torch._C.Stream, stream))
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 5bbbbc2e9d2cc..ee693648fb346 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -5,7 +5,6 @@
 from itertools import accumulate, chain
 from typing import (
     Any,
-    cast,
     Dict,
     Generator,
     Iterator,
@@ -30,7 +29,13 @@
 )
 
 from ._fsdp_extensions import _ext_post_unflatten_transform, _ext_pre_flatten_transform
-from ._utils import _alloc_storage, _free_storage, _same_storage, p_assert
+from ._utils import (
+    _alloc_storage,
+    _free_storage,
+    _no_dispatch_record_stream,
+    _same_storage,
+    p_assert,
+)
 
 __all__ = [
     "FlatParameter",
@@ -1200,9 +1205,7 @@ def _free_unsharded_flat_param(self):
         self._check_storage_allocated(unsharded_flat_param)
         self._check_on_compute_device(unsharded_flat_param)
         # Do not free the memory until all ops in the current stream finish
-        unsharded_flat_param.record_stream(
-            cast(torch._C.Stream, torch.cuda.current_stream())
-        )
+        _no_dispatch_record_stream(unsharded_flat_param, torch.cuda.current_stream())
         _free_storage(unsharded_flat_param)
 
     def _use_sharded_flat_param(self) -> None:
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 7c910e7b8c098..2881770740a71 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -95,6 +95,7 @@
 from .flat_param import FlatParameter, FlatParamHandle
 from .wrap import ParamExecOrderWrapPolicy
 
+
 _TORCH_FX_AVAIL = True
 if not hasattr(torch, "fx"):
     _TORCH_FX_AVAIL = False

From 01c13b3c62fd9c8a1bb2ff17fc14d99342802c07 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 3 Nov 2022 16:26:46 +0000
Subject: [PATCH 0555/1922] [FSDP][Docs] Simplify CPU offload docs (#88430)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88430
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/fully_sharded_data_parallel.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 2881770740a71..e3a5f9e59197f 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -290,13 +290,9 @@ class FullyShardedDataParallel(nn.Module):
             off between memory saving and communication overhead. ``FULL_SHARD``
             will be chosen if sharding_strategy is not specified.
         cpu_offload (Optional[CPUOffload]):
-            CPU offloading config. Currently, only parameter and gradient CPU
-            offload is supported. It can be enabled via passing in
-            ``cpu_offload=CPUOffload(offload_params=True)``. Note that this
-            currently implicitly enables gradient offloading to CPU in order for
-            params and grads to be on same device to work with optimizer. This
-            API is subject to change. Default is ``None`` in which case there
-            will be no offloading.
+            This configures CPU offloading. If this is set to ``None``, then
+            no CPU offloading happens. See :class:`CPUOffload` for details.
+            (Default: ``None``)
         auto_wrap_policy (Optional[Callable[[nn.Module, bool, int], bool]]):
             A callable specifying a policy to recursively wrap layers with FSDP.
             Note that this policy currently will only apply to child modules of

From 4c933b6ddd839bbd87b521c6c9a87095ecac7ce2 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 3 Nov 2022 16:26:56 +0000
Subject: [PATCH 0556/1922] [FSDP][Docs] Reword `sharding_strategy` docs and
 other minor doc changes (#88431)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88431
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/fully_sharded_data_parallel.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index e3a5f9e59197f..6f5537aad5208 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -282,13 +282,13 @@ class FullyShardedDataParallel(nn.Module):
 
     Args:
         module (nn.Module):
-            module to be wrapped with FSDP.
+            This is the module to be wrapped with FSDP.
         process_group (Optional[ProcessGroup]):
-            process group for sharding
+            This is the process group used for collective communications.
         sharding_strategy (Optional[ShardingStrategy]):
-            Config sharding algorithm, different sharding algorithm has trade
-            off between memory saving and communication overhead. ``FULL_SHARD``
-            will be chosen if sharding_strategy is not specified.
+            This configures the sharding strategy used by FSDP, which may trade
+            off memory saving and communication overhead. See
+            :class:`ShardingStrategy` for details. (Default: ``FULL_SHARD``)
         cpu_offload (Optional[CPUOffload]):
             This configures CPU offloading. If this is set to ``None``, then
             no CPU offloading happens. See :class:`CPUOffload` for details.

From 1e3d595a37c082e6f1995cf2d84ebdb6124a3ab0 Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Fri, 4 Nov 2022 00:06:07 +0000
Subject: [PATCH 0557/1922] [LTC] Update merge_rules.yaml (#88291)

Summary:
Some of the LTC code-gen infra has been moved from codegen/ to torchgen/. Update the merge_rules.yaml to reflect that.

Test Plan:
New GH PRs...

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88291
Approved by: https://github.com/malfet
---
 .github/merge_rules.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index 93e938d547edd..6e04f9b4f041d 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -325,10 +325,10 @@
   - torch/csrc/lazy/**
   - test/cpp/lazy/**
   - test/lazy/**
-  - codegen/api/lazy.py
-  - codegen/dest/lazy_ir.py
-  - codegen/dest/lazy_ts_lowering.py
-  - codegen/gen_lazy_tensor.py
+  - torchgen/api/lazy.py
+  - torchgen/dest/lazy_ir.py
+  - torchgen/dest/lazy_ts_lowering.py
+  - torchgen/gen_lazy_tensor.py
   - aten/src/ATen/native/ts_native_functions.yaml
   approved_by:
   - alanwaketan

From b0ea874c0ff840acacf683d052910e81947e57a2 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Thu, 3 Nov 2022 18:58:07 +0000
Subject: [PATCH 0558/1922] Disable Current Modes when printing Tensor (#88344)

Fix for https://github.com/pytorch/pytorch/issues/88087

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88344
Approved by: https://github.com/ezyang, https://github.com/samdow
---
 test/test_fake_tensor.py        |  6 ++++++
 torch/_dynamo/utils.py          | 16 ----------------
 torch/_inductor/compile_fx.py   |  2 +-
 torch/_tensor_str.py            |  2 +-
 torch/utils/_python_dispatch.py | 12 ++++++++++++
 5 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index 2588556c80f7f..ad9042196bff1 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -194,6 +194,12 @@ def test_randperm(self):
             y1 = torch.randperm(5, device="cpu")
             prims.utils.compare_tensor_meta(y, y1)
 
+    def test_print_in_fake_mode(self):
+        x = torch.zeros(2)
+        # does not fail
+        with FakeTensorMode():
+            out = str(x)
+        assert "FakeTensor" not in out
 
     @unittest.skipIf(not RUN_CUDA, "requires cuda")
     def test_cpu_fallback(self):
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 081dc49de5bcd..ef2c1c38ea8ba 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -30,11 +30,6 @@
 import torch
 from torch import fx
 from torch.nn.modules.lazy import LazyModuleMixin
-from torch.utils._python_dispatch import (
-    _len_torch_dispatch_stack,
-    _pop_mode,
-    _push_mode,
-)
 
 from . import config, logging as torchdynamo_logging
 
@@ -147,17 +142,6 @@ def fmt_fn(values, item_fn=lambda x: x):
 }
 
 
-@contextmanager
-def disable_current_modes():
-    mode_len = _len_torch_dispatch_stack()
-    old_modes = [_pop_mode() for _ in range(mode_len)]
-    try:
-        yield old_modes
-    finally:
-        for mode in reversed(old_modes):
-            _push_mode(mode)
-
-
 class DuplicateWarningChecker(object):
     def __init__(self, maxsize=4096):
         self.maxsize = maxsize
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 8a109f072300e..813daee1252f1 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -84,7 +84,7 @@ def _step_logger():
 
 
 @DebugContext.wrap
-@dynamo_utils.disable_current_modes()
+@torch.utils._python_dispatch._disable_current_modes()
 def compile_fx_inner(
     gm: torch.fx.GraphModule,
     example_inputs: List[torch.Tensor],
diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py
index 986be67a52f68..ad5429c61e56d 100644
--- a/torch/_tensor_str.py
+++ b/torch/_tensor_str.py
@@ -632,6 +632,6 @@ def _functorch_wrapper_str_intern(tensor, *, tensor_contents=None):
 
 
 def _str(self, *, tensor_contents=None):
-    with torch.no_grad():
+    with torch.no_grad(), torch.utils._python_dispatch._disable_current_modes():
         guard = torch._C._DisableFuncTorch()
         return _str_intern(self, tensor_contents=tensor_contents)
diff --git a/torch/utils/_python_dispatch.py b/torch/utils/_python_dispatch.py
index 5d22ae69a185f..14c5c35ed45d8 100644
--- a/torch/utils/_python_dispatch.py
+++ b/torch/utils/_python_dispatch.py
@@ -82,6 +82,18 @@ def _pop_mode_temporarily():
     finally:
         _push_mode(old)
 
+
+@contextlib.contextmanager
+def _disable_current_modes():
+    mode_len = _len_torch_dispatch_stack()
+    old_modes = [_pop_mode() for _ in range(mode_len)]
+    try:
+        yield old_modes
+    finally:
+        for mode in reversed(old_modes):
+            _push_mode(mode)
+
+
 class BaseTorchDispatchMode(TorchDispatchMode):
     def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         if kwargs is None:

From 90740028c81d9e49a056a9a80ab8a1cc4bd0fb40 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Thu, 3 Nov 2022 00:45:54 -0400
Subject: [PATCH 0559/1922]  TorchDynamo: Add convolution binary fusion for cpu
 in inference mode (#87064)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87064
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py |  95 +++++++++++++-
 torch/_inductor/ir.py               |  59 +++++++++
 torch/_inductor/lowering.py         |  18 +++
 torch/_inductor/overrides.py        | 197 ++++++++++++++++++++++++++++
 4 files changed, 366 insertions(+), 3 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 58ae49eb1930e..2843200a84cdd 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1344,10 +1344,99 @@ def _unary_list():
             v = torch.randn(x_shape, dtype=torch.float32).to(
                 memory_format=memory_format
             )
-            self.common(
-                mod,
-                (v,),
+            with torch.no_grad():
+                self.common(
+                    mod,
+                    (v,),
+                )
+
+    # For gpu path, there has a accurcy issue,
+    # see https://github.com/pytorch/pytorch/issues/87745.
+    @unittest.skipIf(HAS_CUDA, "only support cpu conv2d binary test")
+    def test_conv2d_binary(self):
+        def _binary_list():
+            binary_list = [
+                lambda x, y: torch.add(x, y),  # call_function
+                lambda x, y: torch.add(y, x),  # call_function
+                lambda x, y: x.add(y),  # call_method
+                lambda x, y: x.add_(y),  # call_method
+                lambda x, y: torch.sub(x, y),  # call_function
+                lambda x, y: x.sub(y),  # call_method
+                lambda x, y: x.sub_(y),  # call_method
+            ]
+            return binary_list
+
+        class M(torch.nn.Module):
+            def __init__(
+                self,
+                binary_fn,
+                in_channels,
+                out_channels,
+                dilation,
+                groups,
+                bias,
+                **kwargs,
+            ):
+                super(M, self).__init__()
+                self.conv1 = torch.nn.Conv2d(
+                    in_channels,
+                    out_channels,
+                    dilation=dilation,
+                    groups=groups,
+                    bias=bias,
+                    **kwargs,
+                )
+                self.conv2 = torch.nn.Sequential(
+                    torch.nn.Conv2d(
+                        in_channels,
+                        out_channels,
+                        dilation=dilation,
+                        groups=groups,
+                        bias=bias,
+                        **kwargs,
+                    )
+                )
+                self.binary_fn = binary_fn
+
+            def forward(self, x):
+                x1 = self.conv1(x)
+                x2 = self.conv2(x)
+                return self.binary_fn(x1, x2)
+
+        test_memory_format = [torch.contiguous_format, torch.channels_last]
+        options = itertools.product(
+            _binary_list(),
+            [True, False],
+            [1, 3],
+            [1, 2],
+            [1, 4],
+            test_memory_format,
+        )
+
+        for (
+            binary_fn,
+            bias,
+            kernel_size,
+            dilation,
+            groups,
+            memory_format,
+        ) in options:
+            oC = 32 * groups
+            iC = 3 * groups
+            x_shape = (1, iC, 112, 112)
+            mod = M(
+                binary_fn, iC, oC, dilation, groups, bias, kernel_size=kernel_size
+            ).eval()
+            mod = mod.to(memory_format=memory_format)
+            # TODO: add bf16 test
+            v = torch.randn(x_shape, dtype=torch.float32).to(
+                memory_format=memory_format
             )
+            with torch.no_grad():
+                self.common(
+                    mod,
+                    (v,),
+                )
 
     def test_gather1(self):
         def fn(a, b):
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index de4b0cefa1b80..c5f5c4e513402 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3383,6 +3383,65 @@ def apply_constraint(self):
         self.freeze_layout_with_stride_order(self.layout.preferred_stride_order)
 
 
+class ConvolutionBinary(ExternKernelAlloc):
+    kernel = "torch.ops.mkldnn._convolution_pointwise.binary"
+
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        kernel="torch.ops.mkldnn._convolution_pointwise.binary",
+    ):
+        super().__init__(layout, inputs, constant_args)
+        self.kernel = kernel
+
+    def codegen(self, wrapper):
+        wrapper.writeline(
+            f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})"
+        )
+        if isinstance(self.layout, Layout):
+            self.codegen_size_asserts(wrapper)
+
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        other: "TensorBox",
+        weight: "TensorBox",
+        bias: "TensorBox",
+        padding_: List[int],
+        stride_: List[int],
+        dilation_: List[int],
+        groups: int,
+        attr,
+    ):
+        kernel = "torch.ops.mkldnn._convolution_pointwise.binary"
+        (inputs, constant_args, kernel_layout,) = _prepare_convolution_fusion_create(
+            cls, x, weight, bias, padding_, stride_, dilation_, groups
+        )
+        other = cls.require_stride1(cls.realize_input(other))
+        inputs.insert(1, other)
+        constant_args = constant_args + [attr]
+        return ConvolutionBinary(
+            layout=kernel_layout,
+            inputs=inputs,
+            constant_args=constant_args,
+            kernel=kernel,
+        )
+
+    def apply_constraint(self):
+        x = self.inputs[0]
+        # FixedLayout of input
+        x = self.require_stride_order(x, self.layout.preferred_stride_order)
+        self.inputs[0] = x
+        other = self.inputs[1]
+        # FixedLayout of other
+        other = self.require_stride_order(other, self.layout.preferred_stride_order)
+        self.inputs[1] = other
+        self.freeze_layout_with_stride_order(self.layout.preferred_stride_order)
+
+
 @dataclasses.dataclass
 class MutableBox(IRNode):
     """
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 2fe86f8b6501b..e139912817219 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -922,6 +922,24 @@ def convolution_unary(
                 )
             )
 
+        @register_lowering(torch.ops.mkldnn._convolution_pointwise.binary)
+        def convolution_binary(
+            x: TensorBox,
+            other: TensorBox,
+            weight: TensorBox,
+            bias: TensorBox,
+            padding,
+            stride,
+            dilation,
+            groups,
+            attr,
+        ):
+            return TensorBox.create(
+                ir.ConvolutionBinary.create(
+                    x, other, weight, bias, padding, stride, dilation, groups, attr
+                )
+            )
+
     else:
         pass
 
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 4078d442e8704..8acb5e7f910ab 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -1,6 +1,7 @@
 import copy
 import itertools
 import logging
+import operator
 import random
 import weakref
 
@@ -12,6 +13,7 @@
     replace_node_module,
 )
 from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode
+from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.nn.modules.utils import _pair
 from torch.overrides import TorchFunctionMode
@@ -125,6 +127,62 @@ def forward(self, input):
         return self._conv_forward(input, self.weight, self.bias)
 
 
+class ConvBinary2d(nn.Conv2d):
+    def __init__(
+        self,
+        conv: nn.Module,
+        binary_op_name: str,
+    ):
+        super(ConvBinary2d, self).__init__(
+            conv.in_channels,
+            conv.out_channels,
+            conv.kernel_size,
+            conv.stride,
+            conv.padding,
+            conv.dilation,
+            conv.groups,
+            conv.bias is not None,
+            conv.padding_mode,
+            conv.weight.device,
+            conv.weight.dtype,
+        )
+        self._update_module_params(conv, binary_op_name)
+
+    def _update_module_params(self, conv, binary_op_name):
+        self.__dict__ = copy.deepcopy(conv.__dict__)
+        self.attr = binary_op_name
+
+    def _conv_forward(self, input, other, weight, bias):
+        if self.padding_mode != "zeros":
+            return torch.ops.mkldnn._convolution_pointwise(
+                F.pad(
+                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
+                ),
+                other,
+                weight,
+                bias,
+                _pair(0),
+                self.stride,
+                self.dilation,
+                self.groups,
+                self.attr,
+            )
+        return torch.ops.mkldnn._convolution_pointwise(
+            input,
+            other,
+            weight,
+            bias,
+            self.padding,
+            self.stride,
+            self.dilation,
+            self.groups,
+            self.attr,
+        )
+
+    def forward(self, input, other):
+        return self._conv_forward(input, other, self.weight, self.bias)
+
+
 def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module):
     assert not (conv.training), "Fusion only for eval!"
     return ConvUnary2d(
@@ -133,7 +191,44 @@ def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module):
     )
 
 
+def fused_conv_binary_eval(conv: nn.Module, binary_op_name: str):
+    assert not (conv.training), "Fusion only for eval!"
+    return ConvBinary2d(
+        conv,
+        binary_op_name,
+    )
+
+
+def check_node_kind(current_node, modules, node_kind):
+    if not isinstance(current_node, torch.fx.Node):
+        return False
+    if current_node.op != "call_module":
+        return False
+    if not isinstance(current_node.target, str):
+        return False
+    if current_node.target not in modules:
+        return False
+    if type(modules[current_node.target]) is not node_kind:
+        return False
+    return True
+
+
+def check_node_is_binary(node):
+    return (
+        (node.op == "call_function" and node.target in [torch.add, torch.sub])
+        or (
+            node.op == "call_function"
+            and node.target
+            in [operator.add, operator.iadd, operator.sub, operator.isub]
+        )
+        or (node.op == "call_method" and node.target in ["add", "add_", "sub", "sub_"])
+    )
+
+
 def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
+    # make sure the autograd is disabled.
+    if torch.is_grad_enabled():
+        return gm
     if not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()):
         return gm
     is_cpu = all(
@@ -141,6 +236,16 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     )
     if not is_cpu:
         return gm
+    # For binary fusion, we need to check inputs info to make sure
+    # the binary inputs have same tensor info(device, dtype, and layout).
+    ShapeProp(gm).propagate(*example_inputs)
+    gm = fuse_unary(gm)
+    gm = fuse_binary(gm)
+
+    return gm
+
+
+def fuse_unary(gm: torch.fx.GraphModule):
     modules = dict(gm.named_modules())
 
     for (unary_module, _), (computation_module, fuse_func,) in itertools.product(
@@ -176,6 +281,66 @@ def _philox_rand_like(input, seed, offset):
     return torch.rand_like(input)
 
 
+def replace_and_fuse_for_binary(
+    computation_node, node, fuse_func, attr, modules, index_node, index_pointwise
+):
+    fused_module = fuse_func(computation_node, attr)
+    replace_node_module(node.args[index_node], modules, fused_module)
+    node.args[index_node].args = node.args[index_node].args + (
+        node.args[index_pointwise],
+    )
+    node.replace_all_uses_with(node.args[index_node])
+
+
+def fuse_binary(gm: torch.fx.GraphModule):
+    modules = dict(gm.named_modules())
+    for node in gm.graph.nodes:
+        if check_node_is_binary(node) and (
+            len(node.kwargs) != 2 or node.kwargs["alpha"] == 1.0
+        ):
+            for node_kind, fuse_func in computation_op_binary_op_fusion_map.items():
+                if not isinstance(node.args[0], torch.fx.Node) or not isinstance(
+                    node.args[1], torch.fx.Node
+                ):
+                    continue
+                tensor0_meta = node.args[0].meta.get("tensor_meta")
+                tensor1_meta = node.args[1].meta.get("tensor_meta")
+                if not tensor0_meta or not tensor1_meta:
+                    continue
+                if (
+                    tensor0_meta.shape != tensor1_meta.shape
+                    or tensor0_meta.stride != tensor1_meta.stride
+                    or tensor0_meta.dtype != tensor1_meta.dtype
+                ):
+                    continue
+                attr = binary_attr[node.target]
+                index_list = supported_index_list[attr]
+                for index_dict in index_list:
+                    index_node = index_dict["index_computation"]
+                    index_pointwise = index_dict["index_pointwise"]
+                    if check_node_kind(node.args[index_node], modules, node_kind):
+                        if len(node.args[index_node].users) > 1:
+                            continue
+                        computation_node = modules[node.args[index_node].target]
+                        replace_and_fuse_for_binary(
+                            computation_node,
+                            node,
+                            fuse_func,
+                            attr if attr != "iadd" else "add",
+                            modules,
+                            index_node,
+                            index_pointwise,
+                        )
+                        # Make sure the fused node is post node of node's inputs nodes.
+                        node.append(node.args[index_node])
+                        gm.graph.erase_node(node)
+                        gm.graph.lint()
+                        break
+
+    gm.recompile()
+    return gm
+
+
 philox_rand_like = _prims._make_prim(
     schema="philox_rand_like(Tensor input, Tensor seed, int offset) -> Tensor",
     return_type=_prims.RETURN_TYPE.NEW,
@@ -307,3 +472,35 @@ def rand_like(x, **kwargs):
     nn.Hardtanh: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]),
     nn.GELU: UnaryAttr("gelu", algorithm_attr="approximate"),
 }
+
+
+binary_attr = {
+    torch.add: "add",  # node.op == "call_function"
+    "add": "add",  # node.op == "call_method"
+    "add_": "iadd",  # node.op == "call_method"
+    operator.add: "add",  # node.op == "call_function"
+    operator.iadd: "iadd",  # node.op == "call_function"
+    torch.sub: "sub",  # node.op == "call_function"
+    "sub": "sub",  # node.op == "call_method"
+    "sub_": "sub",  # node.op == "call_method"
+    operator.sub: "sub",  # node.op == "call_function"
+    operator.isub: "sub",  # node.op == "call_function"
+}
+
+
+computation_op_binary_op_fusion_map = {
+    nn.Conv2d: fused_conv_binary_eval,
+}
+
+
+# For add: we support conv/linear + other and other + conv
+# For sub/add_/sub_, we only support conv/linear - other
+# or conv/linear +(-)= other
+supported_index_list = {
+    "add": [
+        {"index_computation": 0, "index_pointwise": 1},
+        {"index_computation": 1, "index_pointwise": 0},
+    ],
+    "iadd": [{"index_computation": 0, "index_pointwise": 1}],
+    "sub": [{"index_computation": 0, "index_pointwise": 1}],
+}

From 535ca05804522ba19e35cf7b7fee22a3ee76cba8 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Fri, 4 Nov 2022 01:22:41 +0000
Subject: [PATCH 0560/1922] [MPS] Add native `cumsum` implementation (#88319)

Using https://developer.apple.com/documentation/metalperformanceshadersgraph/mpsgraph/4057333-cumulativesumwithtensor?language=objc

Fall back to CPU if running on older MacOS versions
In `unary_op` add output tensor dims/dtype to the graph key (as even in default op we check output graph type)
Also, upcast int16 to int32 as MPS cumsum op on Ventura returns incorrect results for Int16 type (and it makes total sense for int8, as chances for overflow are very high)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88319
Approved by: https://github.com/kulinseth
---
 .../ATen/native/mps/operations/UnaryOps.mm    | 46 ++++++++++++++++++-
 aten/src/ATen/native/native_functions.yaml    |  1 +
 test/test_mps.py                              |  5 +-
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 6b33e31341c8d..07173dc8b2ac8 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -7,6 +7,13 @@
 #include <ATen/native/mps/OperationUtils.h>
 #include <torch/library.h>
 
+// TODO: Remove me when moved to MacOS 13
+@interface MPSGraph (VenturaOps)
+- (MPSGraphTensor *)cumulativeSumWithTensor:(MPSGraphTensor *)tensor
+                                       axis:(NSInteger)axis
+                                       name:(NSString *)name;
+@end
+
 namespace at {
 namespace native {
 namespace mps {
@@ -30,7 +37,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
   }
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
   @autoreleasepool {
-    string key = op_name + getTensorsStringKey({self}, /*use_scalar_value*/ false);
+    string key = op_name + getTensorsStringKey({self, output}, /*use_scalar_value*/ false);
     auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
 
     if(!cachedGraph) {
@@ -263,5 +270,42 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
                 });
 }
 
+
+static bool mpsSupportsCumsum() {
+  id mpsCD = NSClassFromString(@"MPSGraph");
+  return [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
+}
+
+
+TORCH_IMPL_FUNC(cumsum_out_mps)
+(const Tensor& self,
+ int64_t dim,
+ c10::optional<ScalarType> dtype,
+ const Tensor& result) {
+  TORCH_CHECK(dim >=0 && dim < std::max(1LL, self.ndimension()), "Expected dim to be between 0 and ", self.ndimension(), " but got ", dim);
+  if (!mpsSupportsCumsum()) {
+    TORCH_WARN_ONCE("torch.cumsum supported by MPS on MacOS 13+, please upgrade");
+    auto cpu_result = self.to(at::Device(kCPU)).cumsum(dim, dtype);
+    at::_copy_from_and_resize(cpu_result, result);
+    return;
+  }
+  auto input = dtype.has_value() ? self.to(dtype.value()) : self;
+  mps::unary_op(input, result, "cumsum_out_mp" + std::to_string(dim),
+                ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
+       // cumsum is horribly broken for int8, int16 and as chances for overflow is pretty high, cast to int32
+       if (isIntegralType(input.scalar_type()) && input.scalar_type() !=ScalarType::Int) {
+           inputTensor = mps::castMPSTensor(mpsGraph, inputTensor, result.scalar_type());
+       }
+       auto rc = [mpsGraph cumulativeSumWithTensor: inputTensor
+                                              axis: dim
+                                              name: nil];
+       if (result.scalar_type()!= input.scalar_type() ||
+           (isIntegralType(input.scalar_type()) && input.scalar_type() !=ScalarType::Int)) {
+         return mps::castMPSTensor(mpsGraph, rc, result.scalar_type());
+       }
+       return rc;
+    });
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 3af39c542918f..baee6dc398419 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1776,6 +1776,7 @@
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: cumsum_out
+    MPS: cumsum_out_mps
 
 - func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
   device_check: NoCheck   # TensorIterator
diff --git a/test/test_mps.py b/test/test_mps.py
index 257dd238e3e7c..609a77bb971d3 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -6571,8 +6571,7 @@ def test_index_put_accumulate_duplicate_indices(self, device="mps"):
             # lots of duplicates interleaved with each other
             delta = torch.empty(i, dtype=torch.float32, device=device).uniform_(-1, 1)
 
-            # cumsum not supported on 'mps', fallback on 'cpu'
-            indices = delta.cpu().cumsum(0).long().to("mps")
+            indices = delta.cumsum(0).long().to("mps")
 
             # abs for int64 is not supported on mps, fallback on 'cpu' to calculate it
             input = torch.randn(indices.cpu().abs().max().to("mps") + 1, device=device)
@@ -7199,6 +7198,7 @@ class TestConsistency(TestCase):
         'cos': ['f32', 'i16', 'i32', 'u8', 'i64'],
         'cosh': ['f32', 'i16', 'i32', 'u8', 'i64'],
         'cov': ['f32'],
+        'cumsum': ['f16', 'f32', 'int16', 'int32'],
         'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diag': ['f32', 'i32'],
         'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
@@ -7426,6 +7426,7 @@ class TestConsistency(TestCase):
         'corrcoef': ['f32'],
         'cos': ['f32'],
         'cosh': ['f32'],
+        'cumsum': ['f16', 'f32'],
         'deg2rad': ['f16', 'f32'],
         'diag': ['f32'],
         'diag_embed': ['f16', 'f32'],

From aca76e0cbdf05d83e264ec40bc71b1d3a665d654 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Thu, 3 Nov 2022 00:45:59 -0400
Subject: [PATCH 0561/1922] TorchDynamo: Add linear unary fusion for cpu in
 BF16 inference mode (#87065)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87065
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 93 ++++++++++++++++++++---------
 torch/_inductor/ir.py               | 50 ++++++++++++++++
 torch/_inductor/lowering.py         |  8 +++
 torch/_inductor/overrides.py        | 62 +++++++++++++++++--
 4 files changed, 180 insertions(+), 33 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 2843200a84cdd..778c29db74599 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -78,6 +78,43 @@
 torch._inductor.config.triton.autotune = False  # too slow
 
 
+# For OneDNN bf16 path, OneDNN requires the cpu has intel avx512 with avx512bw,
+# avx512vl, and avx512dq at least. So we will skip the test case if one processor
+# is not meet the requirement.
+@functools.lru_cache(maxsize=None)
+def has_bf16_support():
+    import sys
+
+    if sys.platform != "linux":
+        return False
+    with open("/proc/cpuinfo", encoding="ascii") as f:
+        lines = f.read()
+    return all(word in lines for word in ["avx512bw", "avx512vl", "avx512dq"])
+
+
+unary_list = [
+    torch.nn.ReLU(),
+    torch.nn.Sigmoid(),
+    torch.nn.Tanh(),
+    torch.nn.Hardswish(),
+    torch.nn.LeakyReLU(0.1, inplace=False),
+    torch.nn.Hardtanh(min_val=-0.5, max_val=4, inplace=False),
+    torch.nn.GELU(approximate="none"),
+    torch.nn.GELU(approximate="tanh"),
+]
+
+
+binary_list = [
+    lambda x, y: torch.add(x, y),  # call_function
+    lambda x, y: torch.add(y, x),  # call_function
+    lambda x, y: x.add(y),  # call_method
+    lambda x, y: x.add_(y),  # call_method
+    lambda x, y: torch.sub(x, y),  # call_function
+    lambda x, y: x.sub(y),  # call_method
+    lambda x, y: x.sub_(y),  # call_method
+]
+
+
 def requires_decomp(fn):
     """Decorator to disable test if a decomp is missing"""
 
@@ -192,6 +229,7 @@ def check_model(
     ref_inputs = example_inputs
     ref_kwargs = kwargs
     has_lowp_args = False
+    original_lowp_dtype = torch.half
 
     if reference_in_float:
         # check_lowp is ignored here, it's kept just to be able to call `common` with extra arg
@@ -205,9 +243,15 @@ def upcast_fn(x):
             else:
                 return x
 
+        def get_original_lowp_dtype(example_inputs):
+            dtypes = [x.dtype for x in example_inputs if isinstance(x, torch.Tensor)]
+            dtype_set = set(dtypes)
+            return dtype_set.pop() if len(dtype_set) == 1 else torch.half
+
         ref_inputs = list(map(upcast_fn, example_inputs))
         ref_kwargs = {k: upcast_fn(v) for k, v in kwargs.items()}
         if has_lowp_args:
+            original_lowp_dtype = get_original_lowp_dtype(example_inputs)
             if hasattr(model, "to"):
                 model = model.to(torch.float)
 
@@ -217,7 +261,7 @@ def upcast_fn(x):
     # downcast the model back if needed
     if reference_in_float and has_lowp_args:
         if hasattr(model, "to"):
-            model = model.to(torch.half)
+            model = model.to(original_lowp_dtype)
 
     torch._inductor.metrics.reset()
 
@@ -1294,22 +1338,9 @@ def fn(a, b):
     # see https://github.com/pytorch/pytorch/issues/87745.
     @unittest.skipIf(HAS_CUDA, "only support cpu conv2d unary test")
     def test_conv2d_unary(self):
-        def _unary_list():
-            unary_list = [
-                torch.nn.ReLU(),
-                torch.nn.Sigmoid(),
-                torch.nn.Tanh(),
-                torch.nn.Hardswish(),
-                torch.nn.LeakyReLU(0.1, inplace=False),
-                torch.nn.Hardtanh(min_val=-0.5, max_val=4, inplace=False),
-                torch.nn.GELU(approximate="none"),
-                torch.nn.GELU(approximate="tanh"),
-            ]
-            return unary_list
-
         test_memory_format = [torch.contiguous_format, torch.channels_last]
         options = itertools.product(
-            _unary_list(),
+            unary_list,
             [True, False],
             [1, 3],
             [1, 2],
@@ -1354,18 +1385,6 @@ def _unary_list():
     # see https://github.com/pytorch/pytorch/issues/87745.
     @unittest.skipIf(HAS_CUDA, "only support cpu conv2d binary test")
     def test_conv2d_binary(self):
-        def _binary_list():
-            binary_list = [
-                lambda x, y: torch.add(x, y),  # call_function
-                lambda x, y: torch.add(y, x),  # call_function
-                lambda x, y: x.add(y),  # call_method
-                lambda x, y: x.add_(y),  # call_method
-                lambda x, y: torch.sub(x, y),  # call_function
-                lambda x, y: x.sub(y),  # call_method
-                lambda x, y: x.sub_(y),  # call_method
-            ]
-            return binary_list
-
         class M(torch.nn.Module):
             def __init__(
                 self,
@@ -1405,7 +1424,7 @@ def forward(self, x):
 
         test_memory_format = [torch.contiguous_format, torch.channels_last]
         options = itertools.product(
-            _binary_list(),
+            binary_list,
             [True, False],
             [1, 3],
             [1, 2],
@@ -1438,6 +1457,24 @@ def forward(self, x):
                     (v,),
                 )
 
+    def test_linear_unary(self):
+        options = itertools.product(unary_list, [[2, 3, 10], [2, 10]], [True, False])
+        dtype = torch.bfloat16
+        if has_bf16_support():
+            for eltwise_fn, input_shape, bias in options:
+                mod = torch.nn.Sequential(
+                    torch.nn.Linear(input_shape[-1], 30, bias=bias), eltwise_fn
+                ).eval()
+
+                # only fuse for linear when the dtype is bf16
+                mod = mod.to(dtype)
+                v = torch.randn(input_shape).to(dtype)
+                with torch.no_grad():
+                    self.common(
+                        mod,
+                        (v,),
+                    )
+
     def test_gather1(self):
         def fn(a, b):
             return (
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index c5f5c4e513402..89196977490fe 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3442,6 +3442,56 @@ def apply_constraint(self):
         self.freeze_layout_with_stride_order(self.layout.preferred_stride_order)
 
 
+class LinearUnary(ExternKernelAlloc):
+    kernel = "torch.ops.mkldnn._linear_pointwise"
+
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        kernel="torch.ops.mkldnn._linear_pointwise",
+    ):
+        super().__init__(layout, inputs, constant_args)
+        self.kernel = kernel
+
+    def codegen(self, wrapper):
+        wrapper.writeline(
+            f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})"
+        )
+
+    @classmethod
+    def create(cls, x, w, b, attr, scalars, algorithm):
+        kernel = "torch.ops.mkldnn._linear_pointwise"
+        x = cls.require_stride1(cls.realize_input(x))
+        w = cls.require_stride1(cls.realize_input(w))
+
+        *m, ic = x.get_size()
+        oc, ic = w.get_size()
+
+        inputs = [x, w]
+        constant_args = [attr, scalars, algorithm]
+        if b is not None:
+            b = cls.require_stride1(cls.realize_input(b))
+            inputs.append(b)
+        else:
+            constant_args.insert(0, b)
+
+        return LinearUnary(
+            layout=FlexibleLayout(
+                device=x.get_device(),
+                dtype=x.get_dtype(),
+                size=list(m) + [oc],
+            ),
+            inputs=inputs,
+            constant_args=constant_args,
+            kernel=kernel,
+        )
+
+    def apply_constraint(self):
+        pass
+
+
 @dataclasses.dataclass
 class MutableBox(IRNode):
     """
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index e139912817219..e2c89be703eb1 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -940,6 +940,14 @@ def convolution_binary(
                 )
             )
 
+        @register_lowering(torch.ops.mkldnn._linear_pointwise)
+        def linear_unary(
+            x: TensorBox, w: TensorBox, b: TensorBox, attr, scalars, algorithm
+        ):
+            return TensorBox.create(
+                ir.LinearUnary.create(x, w, b, attr, scalars, algorithm)
+            )
+
     else:
         pass
 
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 8acb5e7f910ab..49b771eeebd0f 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -183,6 +183,34 @@ def forward(self, input, other):
         return self._conv_forward(input, other, self.weight, self.bias)
 
 
+class LinearUnary(nn.Linear):
+    def __init__(
+        self,
+        linear: nn.Module,
+        unary: nn.Module,
+    ):
+        super(LinearUnary, self).__init__(
+            linear.in_features,
+            linear.out_features,
+            linear.bias is not None,
+            linear.weight.device,
+            linear.weight.dtype,
+        )
+        self._update_module_params(linear, unary)
+
+    def _update_module_params(self, linear, unary):
+        self.__dict__ = copy.deepcopy(linear.__dict__)
+        self.attr, self.scalars, self.algorithm = unary_modules_map[unary.__class__](
+            unary
+        )
+
+    def forward(self, input):
+        y = torch.ops.mkldnn._linear_pointwise(
+            input, self.weight, self.bias, self.attr, self.scalars, self.algorithm
+        )
+        return y
+
+
 def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module):
     assert not (conv.training), "Fusion only for eval!"
     return ConvUnary2d(
@@ -199,6 +227,20 @@ def fused_conv_binary_eval(conv: nn.Module, binary_op_name: str):
     )
 
 
+def is_bfloat16_module(m):
+    weight_is_bf16 = m.weight.dtype == torch.bfloat16
+    bias_is_bf16 = m.bias is None or m.bias.dtype == torch.bfloat16
+    return weight_is_bf16 and bias_is_bf16
+
+
+def fused_linear_unary_eval(linear: nn.Module, unary: nn.Module):
+    assert not (linear.training), "Fusion only for eval!"
+    return LinearUnary(
+        linear,
+        unary,
+    )
+
+
 def check_node_kind(current_node, modules, node_kind):
     if not isinstance(current_node, torch.fx.Node):
         return False
@@ -258,13 +300,20 @@ def fuse_unary(gm: torch.fx.GraphModule):
                     len(node.args[0].users) > 1
                 ):  # Output of computation_node is used by other nodes
                     continue
-                conv = modules[node.args[0].target]
+                computation_node = modules[node.args[0].target]
                 unary_node = modules[node.target]
-                eval_mode = all(not n.training for n in [conv, unary_node])
+                eval_mode = all(not n.training for n in [computation_node, unary_node])
                 if not eval_mode:
                     continue
-                fused_conv = fuse_func(conv, unary_node)
-                replace_node_module(node.args[0], modules, fused_conv)
+
+                # only fuse for linear when the dtype is bf16
+                if type(computation_node) in [nn.Linear] and not is_bfloat16_module(
+                    computation_node
+                ):
+                    continue
+                fused_module = fuse_func(computation_node, unary_node)
+                replace_node_module(node.args[0], modules, fused_module)
+
                 node.replace_all_uses_with(node.args[0])
                 gm.graph.erase_node(node)
                 gm.graph.lint()
@@ -460,7 +509,10 @@ def rand_like(x, **kwargs):
 replacements = {torch.nn.functional.dropout: lowmem_dropout, torch.rand_like: rand_like}
 
 
-computation_op_unary_op_fusion_map = {nn.Conv2d: fused_conv_unary_eval}
+computation_op_unary_op_fusion_map = {
+    nn.Conv2d: fused_conv_unary_eval,
+    nn.Linear: fused_linear_unary_eval,
+}
 
 
 unary_modules_map = {

From 313e2f1adced78b0ad9e825d266dcee25b37e370 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Thu, 3 Nov 2022 23:10:28 +0000
Subject: [PATCH 0562/1922] Run all fallback kernels with FakeTensor (#88248)

This improves the memory compression of resnet18 from .84 -> .94 on inductor no-cudagraphs. It does mean that any extern kernel which incorrectly computes strides will be a hard error at runtime, but that's an issue we are going to have to face with dynamic shapes anyway. CC @ezyang, @SherlockNoMad
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88248
Approved by: https://github.com/ezyang
---
 test/inductor/test_torchinductor_opinfo.py |  1 -
 torch/_inductor/ir.py                      | 27 +++++++++++++++-----
 torch/_meta_registrations.py               | 29 ++++++++++++----------
 3 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 93e5412716296..b705a36d75425 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -327,7 +327,6 @@ def process(device_type):
     "linalg.lstsq.grad_oriented": {f32, f64},
     "linalg.matrix_rank": {f32, f64},
     "linalg.matrix_rank.hermitian": {f32, f64},
-    "linalg.pinv.hermitian": {f32, f64},
     "lu_unpack": {f32, f64},
     "masked.argmax": {f16, f32, f64, i32},
     "masked.argmin": {f16, f32, f64, i32},
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 89196977490fe..59d652bdc1e88 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -6,6 +6,7 @@
 import re
 import textwrap
 from collections import OrderedDict
+from contextlib import nullcontext
 from enum import Enum
 from functools import partial
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Set, Tuple, Union
@@ -29,6 +30,7 @@
 
 log = logging.getLogger(__name__)
 indent = functools.partial(textwrap.indent, prefix="  ")
+aten = torch.ops.aten
 
 
 def inverse_reorder(order):
@@ -2895,12 +2897,25 @@ def gen_kwarg(k, v):
 
     @classmethod
     def create(cls, kernel, *args, **kwargs):
-        (
-            example_output,
-            tensor_args,
-            non_tensor_args,
-            unflatten_args,
-        ) = cls.process_kernel(kernel, *args, **kwargs)
+        fake_incorrect_kernels = (
+            aten._fft_r2c.default,
+            aten._fft_r2c.out,
+            aten._fft_c2r.default,
+            aten._fft_c2c.default,
+            aten._fft_c2c.out,
+            aten._linalg_svd.default,
+            aten._linalg_svd.U,
+        )
+        context = (
+            FakeTensorMode if kernel not in fake_incorrect_kernels else nullcontext
+        )
+        with context():
+            (
+                example_output,
+                tensor_args,
+                non_tensor_args,
+                unflatten_args,
+            ) = cls.process_kernel(kernel, *args, **kwargs)
 
         if isinstance(example_output, (list, tuple)):
             packed = FallbackKernel(
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index e42cc7de4f675..ea9ac51450e22 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -309,6 +309,17 @@ def meta_bernoulli(self, *, generator=None, out):
     return out
 
 
+# FakeTensors (meta tensors with a device) will report device as meta
+# when running meta kernels. Here, access the "fake device" of FakeTensor if it
+# exists so meta kernels which have diverge per device will be more
+# accurate when run with FakeTensors
+def device_hint(tensor) -> "str":
+    if isinstance(tensor, torch._subclasses.FakeTensor):
+        return tensor.fake_device.type
+    else:
+        return "cuda"  # default to cuda
+
+
 @register_meta(aten.convolution.default)
 def meta_conv(
     input_tensor: torch.Tensor,
@@ -414,8 +425,8 @@ def calc_conv_nd_return_shape(
     def is_channels_last(ten):
         return torch._prims_common.suggest_memory_format(ten) == torch.channels_last
 
-    def pick_memory_format(device_hint):
-        if device_hint == "cuda":
+    def pick_memory_format():
+        if device_hint(input_tensor) == "cuda":
             if is_channels_last(input_tensor) or is_channels_last(weight):
                 return torch.channels_last
         else:
@@ -449,15 +460,7 @@ def pick_memory_format(device_hint):
         )
     out = input_tensor.new_empty((input_tensor.shape[0], out_channels, *shape_out))
 
-    from torch._subclasses.fake_tensor import FakeTensor
-
-    if isinstance(input_tensor, FakeTensor):
-        device_hint = input_tensor.fake_device.type
-    else:
-        device_hint = "cuda"  # default to cuda
-
-    mem_fmt = pick_memory_format(device_hint)
-    out = out.to(memory_format=mem_fmt)  # type: ignore[call-overload]
+    out = out.to(memory_format=pick_memory_format())  # type: ignore[call-overload]
     return out
 
 
@@ -1010,7 +1013,7 @@ def is_fast_path(src, scale, output, padding_idx):
         else:
             return is_fast_path_index_select(src, output, padding_idx)
 
-    if offsets.device.type != "cpu":
+    if device_hint(offsets) != "cpu":
         offset2bag = indices.new_empty(indices.size(0))
         bag_size = indices.new_empty(offsets.size())
         if mode == MODE_MAX:
@@ -1033,7 +1036,7 @@ def meta_embedding_bag_forward_only(weight, indices, offsets, *args):
     output, offset2bag, bag_size, max_indices = meta_embedding_bag(
         weight, indices, offsets, *args
     )
-    if offsets.device.type == "cpu":
+    if device_hint(offsets) == "cpu":
         bag_size = offsets.new_empty(offsets.size())
     return output, offset2bag, bag_size, max_indices
 

From b683774cc61461a2621bbf58fb6d5b9bdb8b8de6 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Thu, 3 Nov 2022 00:46:01 -0400
Subject: [PATCH 0563/1922] TorchDynamo: Add linear binary fusion for cpu in
 BF16 inference mode (#87066)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87066
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 28 ++++++++++++++++
 torch/_inductor/ir.py               | 51 +++++++++++++++++++++++++++++
 torch/_inductor/lowering.py         |  4 +++
 torch/_inductor/overrides.py        | 38 +++++++++++++++++++++
 4 files changed, 121 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 778c29db74599..cc8985e4598b0 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1475,6 +1475,34 @@ def test_linear_unary(self):
                         (v,),
                     )
 
+    def test_linear_binary(self):
+        class M(torch.nn.Module):
+            def __init__(self, eltwise_fn, in_channels, out_channels, bias, **kwargs):
+                super(M, self).__init__()
+                self.linear = torch.nn.Linear(
+                    in_channels, out_channels, bias=bias, **kwargs
+                )
+                self.eltwise = eltwise_fn
+
+            def forward(self, x, y):
+                x = self.linear(x)
+                x = self.eltwise(x, y)
+                return x
+
+        options = itertools.product(binary_list, [[2, 3, 10], [2, 10]], [True, False])
+        dtype = torch.bfloat16
+        out_feature = 30
+        if has_bf16_support():
+            for binary_ops, input_shape, bias in options:
+                mod = M(binary_ops, input_shape[-1], out_feature, bias).eval()
+
+                # only fuse for linear when the dtype is bf16
+                mod = mod.to(dtype)
+                v = torch.randn(input_shape).to(dtype)
+                other = torch.randn(input_shape[:-1] + [out_feature]).to(dtype)
+                with torch.no_grad():
+                    self.common(mod, (v, other), atol=2e-3, rtol=0.016)
+
     def test_gather1(self):
         def fn(a, b):
             return (
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 59d652bdc1e88..f4c912137812c 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3507,6 +3507,57 @@ def apply_constraint(self):
         pass
 
 
+class LinearBinary(ExternKernelAlloc):
+    kernel = "torch.ops.mkldnn._linear_pointwise.binary"
+
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        kernel="torch.ops.mkldnn._linear_pointwise.binary",
+    ):
+        super().__init__(layout, inputs, constant_args)
+        self.kernel = kernel
+
+    def codegen(self, wrapper):
+        wrapper.writeline(
+            f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})"
+        )
+
+    @classmethod
+    def create(cls, x, y, w, b, attr):
+        kernel = "torch.ops.mkldnn._linear_pointwise.binary"
+        x = cls.require_stride1(cls.realize_input(x))
+        y = cls.require_stride1(cls.realize_input(y))
+        w = cls.require_stride1(cls.realize_input(w))
+
+        *m, ic = x.get_size()
+        oc, ic = w.get_size()
+
+        inputs = [x, y, w]
+        constant_args = [attr]
+        if b is not None:
+            b = cls.require_stride1(cls.realize_input(b))
+            inputs.append(b)
+        else:
+            constant_args.insert(0, b)
+
+        return LinearBinary(
+            layout=FlexibleLayout(
+                device=x.get_device(),
+                dtype=x.get_dtype(),
+                size=list(m) + [oc],
+            ),
+            inputs=inputs,
+            constant_args=constant_args,
+            kernel=kernel,
+        )
+
+    def apply_constraint(self):
+        pass
+
+
 @dataclasses.dataclass
 class MutableBox(IRNode):
     """
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index e2c89be703eb1..d2b27d702ebec 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -948,6 +948,10 @@ def linear_unary(
                 ir.LinearUnary.create(x, w, b, attr, scalars, algorithm)
             )
 
+        @register_lowering(torch.ops.mkldnn._linear_pointwise.binary)
+        def linear_binary(x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr):
+            return TensorBox.create(ir.LinearBinary.create(x, y, w, b, attr))
+
     else:
         pass
 
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 49b771eeebd0f..f57a692e9547c 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -211,6 +211,29 @@ def forward(self, input):
         return y
 
 
+class LinearBinary(nn.Linear):
+    def __init__(self, linear: nn.Module, binary_op_name: str):
+        super(LinearBinary, self).__init__(
+            linear.in_features,
+            linear.out_features,
+            linear.bias is not None,
+            linear.weight.device,
+            linear.weight.dtype,
+        )
+        self._update_module_params(linear, binary_op_name)
+
+    def _update_module_params(self, linear, binary_op_name):
+        self.__dict__ = copy.deepcopy(linear.__dict__)
+
+        self.attr = binary_op_name
+
+    def forward(self, input, other):
+        y = torch.ops.mkldnn._linear_pointwise(
+            input, other, self.weight, self.bias, self.attr
+        )
+        return y
+
+
 def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module):
     assert not (conv.training), "Fusion only for eval!"
     return ConvUnary2d(
@@ -241,6 +264,15 @@ def fused_linear_unary_eval(linear: nn.Module, unary: nn.Module):
     )
 
 
+def fused_linear_binary_eval(linear: nn.Module, attr: str):
+    assert not (linear.training), "Fusion only for eval!"
+    linear_binary = LinearBinary(
+        linear,
+        attr,
+    )
+    return linear_binary
+
+
 def check_node_kind(current_node, modules, node_kind):
     if not isinstance(current_node, torch.fx.Node):
         return False
@@ -371,6 +403,11 @@ def fuse_binary(gm: torch.fx.GraphModule):
                         if len(node.args[index_node].users) > 1:
                             continue
                         computation_node = modules[node.args[index_node].target]
+                        # only fuse for linear when the dtype is bf16
+                        if type(computation_node) in [
+                            nn.Linear
+                        ] and not is_bfloat16_module(computation_node):
+                            continue
                         replace_and_fuse_for_binary(
                             computation_node,
                             node,
@@ -542,6 +579,7 @@ def rand_like(x, **kwargs):
 
 computation_op_binary_op_fusion_map = {
     nn.Conv2d: fused_conv_binary_eval,
+    nn.Linear: fused_linear_binary_eval,
 }
 
 
From cb8b0caa1b2eea5119ace727a803ec957608531e Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Thu, 3 Nov 2022 22:55:24 +0000
Subject: [PATCH 0564/1922] Add hf_bert + DDP multigpu test (#88435)

Spot-checks an e2e model working with ddp.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88435
Approved by: https://github.com/davidberard98
---
 .jenkins/pytorch/test.sh                      |  1 +
 test/distributed/test_dynamo_distributed.py   | 55 ++++++++++++++++++-
 torch/testing/_internal/common_distributed.py | 16 ++++++
 3 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 4cba60ed1893b..56b8d7dfcc108 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -733,6 +733,7 @@ elif [[ "$TEST_CONFIG" == deploy ]]; then
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
   install_filelock
   install_triton
+  install_huggingface
   test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_without_numpy
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index dee2f7c1a5924..daecc0092b78b 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -1,7 +1,9 @@
 # Owner(s): ["module: dynamo"]
 import os
+import random
 import unittest
 from unittest.mock import patch
+import numpy as np
 import torch
 import torch._dynamo
 import torch._dynamo.test_case
@@ -10,11 +12,23 @@
 from torch import nn
 from torch._dynamo import config
 from torch._dynamo.utils import same
+from torch._dynamo.testing import collect_results
 from torch._inductor.utils import has_triton
 from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.testing._internal.common_distributed import MultiProcessTestCase, skip_if_lt_x_gpu, requires_nccl
+from torch.testing._internal.common_distributed import (
+    MultiProcessTestCase,
+    import_transformers_or_skip,
+    skip_if_lt_x_gpu,
+    requires_nccl
+)
 import torch._dynamo.logging
 
+
+def reset_rng_state():
+    torch.manual_seed(1337)
+    random.seed(1337)
+    np.random.seed(1337)
+
 def init_weights(m):
     if isinstance(m, nn.Linear):
         nn.init.xavier_uniform_(m.weight)
@@ -39,6 +53,20 @@ def get_model(device, bsz=20, in_feat=10, hidden_feat=5000, out_feat=5):
     outputs = m(inputs)
     return m, inputs, outputs
 
+def get_hf_bert(rank):
+    # Note: use @import_transformers_or_skip on your test case if you use this
+    try:
+        from transformers import BertConfig, AutoModelForMaskedLM
+    except ImportError:
+        unittest.skip("Unable to import transformers")
+
+    batch_size, max_length, config, device = 4, 512, BertConfig(), f"cuda:{rank}"
+    model = AutoModelForMaskedLM.from_config(config).to(device)
+    input_ids = torch.randint(0, config.vocab_size, (batch_size, max_length)).to(device)
+    decoder_ids = torch.randint(0, config.vocab_size, (batch_size, max_length)).to(device)
+    inputs = {'input_ids': input_ids, 'labels': decoder_ids}
+    model.train()
+    return model, inputs
 
 class CheckSplitsCompiler:
     def __init__(self):
@@ -105,6 +133,31 @@ def test_ddp_baseline_aot_eager_multiprocess(self):
             outputs = m(inputs)
             self.assertTrue(same(correct_outputs, outputs))
 
+    @skip_if_lt_x_gpu(2)
+    @import_transformers_or_skip()
+    @patch.object(config, "optimize_ddp", True)
+    @patch.object(torch._inductor.config, "fallback_random", True)
+    def test_hf_bert_ddp(self):
+
+        with _per_rank_init(self.rank, self.world_size):
+            model, inputs = get_hf_bert(self.rank)
+            model = DDP(model)
+
+            reset_rng_state()
+            correct_outputs = model(**inputs)
+            correct_loss = correct_outputs.loss
+            correct_loss.backward()
+
+            reset_rng_state()
+            opt_model = torch._dynamo.optimize("inductor")(model)
+            opt_outputs = opt_model(**inputs)
+            opt_loss = opt_outputs.loss
+            opt_loss.backward()
+
+            inputs_flat = [inputs[k] for k in inputs]
+            correct_results = collect_results(model, correct_outputs.logits, correct_loss, inputs_flat)
+            opt_results = collect_results(opt_model, opt_outputs.logits, opt_loss, inputs_flat)
+            self.assertTrue(same(correct_results, opt_results))
 
 @requires_nccl()
 class TestDistributed(torch._dynamo.test_case.TestCase):
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index b24c90ef3f862..607211087ddc7 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -67,6 +67,9 @@ class TestSkip(NamedTuple):
     "generic": TestSkip(
         86, "Test skipped at subprocess level, look at subprocess log for skip reason"
     ),
+    "importerror": TestSkip(
+        88, "Test skipped due to missing import"
+    ),
 }
 
 @dataclass
@@ -136,6 +139,19 @@ def wrapper(*args, **kwargs):
 
     return decorator
 
+def import_transformers_or_skip():
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            try:
+                from transformers import BertConfig, AutoModelForMaskedLM  # noqa: Unused
+                return func(*args, **kwargs)
+            except ImportError:
+                sys.exit(TEST_SKIPS["importerror"].exit_code)
+
+        return wrapper
+
+    return decorator
 
 def skip_if_lt_x_gpu(x):
     def decorator(func):

From 54cf1944750c092134e490c3f920ed8bcbc83543 Mon Sep 17 00:00:00 2001
From: Pruthvi Madugundu <pmagundu@amd.com>
Date: Fri, 4 Nov 2022 04:43:05 +0000
Subject: [PATCH 0565/1922] Introduce TORCH_DISABLE_GPU_ASSERTS (#84190)

- Asserts for CUDA are enabled by default
- Disabled for ROCm by default by setting `TORCH_DISABLE_GPU_ASSERTS` to `ON`
- Can be enabled for ROCm by setting above variable to`OFF` during build or can be forcefully enabled by setting `ROCM_FORCE_ENABLE_GPU_ASSERTS:BOOL=ON`

This is follow up changes as per comment in PR #81790, comment [link](https://github.com/pytorch/pytorch/pull/81790#issuecomment-1215929021)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84190
Approved by: https://github.com/jeffdaily, https://github.com/malfet
---
 CMakeLists.txt             |  1 +
 c10/macros/Macros.h        | 11 ++++++-----
 caffe2/core/macros.h.in    |  2 ++
 cmake/Dependencies.cmake   | 10 ++++++++++
 cmake/Summary.cmake        |  1 +
 cmake/public/LoadHIP.cmake | 16 ----------------
 docs/source/notes/hip.rst  | 11 +++++++++++
 7 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b1e0f517fafa4..6efd3f2df9366 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,6 +285,7 @@ if(NOT USE_XNNPACK AND CMAKE_VERSION VERSION_LESS ${XNNPACK_MIN_CMAKE_VER})
 endif()
 option(USE_ZMQ "Use ZMQ" OFF)
 option(USE_ZSTD "Use ZSTD" OFF)
+option(TORCH_DISABLE_GPU_ASSERTS "Disable GPU asserts by default" OFF)
 # Ensure that an ITT build is the default for x86 CPUs
 cmake_dependent_option(
   USE_ITT "Use Intel(R) VTune Profiler ITT functionality" ON
diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
index e77fa0fde2ee0..31cd2219d10e6 100644
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@@ -326,9 +326,8 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 // CUDA_KERNEL_ASSERT checks the assertion
 // even when NDEBUG is defined. This is useful for important assertions in CUDA
 // code that would otherwise be suppressed when building Release.
-#if defined(__ANDROID__) || defined(__APPLE__) ||  \
-    (defined(USE_ROCM) && ROCM_VERSION < 40100) || \
-    (defined(USE_ROCM) && defined(ROCM_DISABLE_GPU_ASSERTS))
+#if defined(__ANDROID__) || defined(__APPLE__) || \
+    (defined(USE_ROCM) && ROCM_VERSION < 40100)
 // Those platforms do not support assert()
 #define CUDA_KERNEL_ASSERT(cond)
 #define SYCL_KERNEL_ASSERT(cond)
@@ -368,7 +367,9 @@ extern SYCL_EXTERNAL void __assert_fail(
     unsigned int line,
     const char* func);
 #else // __SYCL_DEVICE_ONLY__
-#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)))
+#if (                                                                       \
+    defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__)) && \
+    !defined(TORCH_DISABLE_GPU_ASSERTS))
 // CUDA supports __assert_fail function which are common for both device
 // and host side code.
 __host__ __device__
@@ -386,7 +387,7 @@ __host__ __device__
         const char* function) throw() __attribute__((__noreturn__));
 
 #if (defined(__HIP_ARCH__) || defined(__HIP__)) && \
-    !defined(ROCM_DISABLE_GPU_ASSERTS)
+    !defined(TORCH_DISABLE_GPU_ASSERTS)
 // ROCm supports __assert_fail only as a device side function.
 __device__ __attribute__((noinline)) __attribute__((weak)) void __assert_fail(
     const char* assertion,
diff --git a/caffe2/core/macros.h.in b/caffe2/core/macros.h.in
index 9c9f734575634..2d9f03e94c0fc 100644
--- a/caffe2/core/macros.h.in
+++ b/caffe2/core/macros.h.in
@@ -44,6 +44,7 @@ static_assert(
 #cmakedefine CAFFE2_USE_NVTX
 #cmakedefine CAFFE2_USE_ITT
 #cmakedefine CAFFE2_USE_TRT
+#cmakedefine TORCH_DISABLE_GPU_ASSERTS
 
 #ifndef EIGEN_MPL2_ONLY
 #cmakedefine EIGEN_MPL2_ONLY
@@ -85,4 +86,5 @@ static_assert(
   {"USE_NVTX", "${CAFFE2_USE_NVTX}"}, \
   {"USE_ITT", "${CAFFE2_USE_ITT}"}, \
   {"USE_TRT", "${CAFFE2_USE_TRT}"}, \
+  {"TORCH_DISABLE_GPU_ASSERTS", "${TORCH_DISABLE_GPU_ASSERTS}"}, \
 }
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index e232fcb624cd3..cf3c2c2caafd2 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1248,6 +1248,16 @@ if(ANDROID)
   list(APPEND Caffe2_DEPENDENCY_LIBS log)
 endif()
 
+# ---[ Kernel asserts
+# Kernel asserts are enabled by default for CUDA and disabled for ROCm.
+# For ROCm, it can be enabled by setting ROCM_FORCE_ENABLE_GPU_ASSERTS
+if(USE_ROCM AND ROCM_FORCE_ENABLE_GPU_ASSERTS)
+  message(STATUS "Forcefully enabling kernel asserts on ROCM")
+elseif(USE_ROCM AND NOT ROCM_FORCE_ENABLE_GPU_ASSERTS)
+  message(STATUS "Disabling kernel asserts for ROCm")
+  caffe2_update_option(TORCH_DISABLE_GPU_ASSERTS ON)
+endif()
+
 # ---[ LLVM
 if(USE_LLVM)
   message(STATUS "Looking for LLVM in ${USE_LLVM}")
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index fd6444680e2d4..279d72a41e660 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -199,4 +199,5 @@ function(caffe2_print_configuration_summary)
   # coreml
   message(STATUS "  USE_COREML_DELEGATE     : ${USE_COREML_DELEGATE}")
   message(STATUS "  BUILD_LAZY_TS_BACKEND   : ${BUILD_LAZY_TS_BACKEND}")
+  message(STATUS "  TORCH_DISABLE_GPU_ASSERTS : ${TORCH_DISABLE_GPU_ASSERTS}")
 endfunction()
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index 89a61b6242856..b51284115f144 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -143,9 +143,6 @@ message("Building PyTorch for GPU arch: ${PYTORCH_ROCM_ARCH}")
 # Add HIP to the CMAKE Module Path
 set(CMAKE_MODULE_PATH ${HIP_PATH}/cmake ${CMAKE_MODULE_PATH})
 
-#Disable kernel assert due to performance regression
-set(ROCM_ENABLE_KERNEL_ASSERTS FALSE CACHE BOOL "Kernel asserts are disabled by default for ROCm")
-
 macro(find_package_and_print_version PACKAGE_NAME)
   find_package("${PACKAGE_NAME}" ${ARGN})
   message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
@@ -286,19 +283,6 @@ if(HIP_FOUND)
   find_package_and_print_version(hipcub REQUIRED)
   find_package_and_print_version(rocthrust REQUIRED)
 
-  if(ROCM_VERSION_DEV VERSION_GREATER_EQUAL "4.1.0")
-    if(ROCM_ENABLE_KERNEL_ASSERTS)
-      message("ROCm version >= 4.1; enabling asserts")
-    else()
-      add_definitions(-DROCM_DISABLE_GPU_ASSERTS)
-      message("ROCm version >= 4.1; kernel asserts are disabled")
-    endif()
-  else()
-    # Disable Asserts In Code (Can't use asserts on HIP stack.)
-    add_definitions(-DNDEBUG)
-    message("ROCm version < 4.1; disablng asserts")
-  endif()
-
   if(HIP_COMPILER STREQUAL clang)
     set(hip_library_name amdhip64)
   else()
diff --git a/docs/source/notes/hip.rst b/docs/source/notes/hip.rst
index a9c94e2a4febb..c54e201489705 100644
--- a/docs/source/notes/hip.rst
+++ b/docs/source/notes/hip.rst
@@ -144,3 +144,14 @@ Refer to CUDA Semantics doc
 ---------------------------
 
 For any sections not listed here, please refer to the CUDA semantics doc: :ref:`cuda-semantics`
+
+
+Enabling kernel asserts
+-----------------------
+
+Kernel asserts are supported on ROCm, but they are disabled due to performance overhead. It can be enabled
+by recompiling the PyTorch from source.
+
+Please add below line as an argument to cmake command parameters::
+
+    -DROCM_FORCE_ENABLE_GPU_ASSERTS:BOOL=ON

From 6f95ba130b4bc3e7e8995dceb0a735c663a225f4 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Thu, 3 Nov 2022 00:46:02 -0400
Subject: [PATCH 0566/1922] TorchDynamo: enable convolution and batchnorm
 folding for inference path (#87435)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87435
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 56 +++++++++++++++++++++++++++++
 torch/_inductor/overrides.py        | 34 ++++++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index cc8985e4598b0..dd9e6767a805e 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1334,6 +1334,62 @@ def fn(a, b):
             check_lowp=False,
         )
 
+    # For gpu path, there has a accurcy issue,
+    @unittest.skipIf(HAS_CUDA, "only support cpu conv  bn test")
+    def test_conv_bn_fuse(self):
+        input_shapes = {1: (112,), 2: (112, 112), 3: (55, 55, 55)}
+        conv_modules = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d}
+        bn_modules = {
+            1: torch.nn.BatchNorm1d,
+            2: torch.nn.BatchNorm2d,
+            3: torch.nn.BatchNorm3d,
+        }
+        options = itertools.product(
+            [1, 2, 3],
+            [True, False],
+            [1, 3],
+            [1, 2],
+            [1, 4],
+        )
+
+        for (
+            dim,
+            bias,
+            kernel_size,
+            dilation,
+            groups,
+        ) in options:
+            oC = 32 * groups
+            iC = 3 * groups
+            x_shape = (1, iC) + input_shapes[dim]
+            mod = torch.nn.Sequential(
+                conv_modules[dim](
+                    iC,
+                    oC,
+                    kernel_size=kernel_size,
+                    dilation=dilation,
+                    groups=groups,
+                    bias=bias,
+                ),
+                bn_modules[dim](oC),
+            ).eval()
+            test_memory_format = [torch.contiguous_format]
+            # TODO: GPU path doesn't support channels_last now.
+            if not HAS_CUDA and dim > 1:
+                channels_last = (
+                    torch.channels_last if dim == 2 else torch.channels_last_3d
+                )
+                test_memory_format.append(channels_last)
+            for memory_format in test_memory_format:
+                v = torch.randn(x_shape, dtype=torch.float32).to(
+                    memory_format=memory_format
+                )
+                with torch.no_grad():
+                    self.common(
+                        mod,
+                        (v,),
+                    )
+
     # For gpu path, there has a accurcy issue,
     # see https://github.com/pytorch/pytorch/issues/87745.
     @unittest.skipIf(HAS_CUDA, "only support cpu conv2d unary test")
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index f57a692e9547c..1ab55142619c0 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -16,6 +16,7 @@
 from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.nn.modules.utils import _pair
+from torch.nn.utils.fusion import fuse_conv_bn_eval
 from torch.overrides import TorchFunctionMode
 
 log = logging.getLogger(__name__)
@@ -310,6 +311,7 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     )
     if not is_cpu:
         return gm
+    gm = fuse_conv_bn(gm)
     # For binary fusion, we need to check inputs info to make sure
     # the binary inputs have same tensor info(device, dtype, and layout).
     ShapeProp(gm).propagate(*example_inputs)
@@ -319,6 +321,38 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     return gm
 
 
+def fuse_conv_bn(gm: torch.fx.GraphModule, inplace=False):
+    """
+    Fuses Convolution/BN layers for inference purposes.
+    """
+    patterns = [
+        (torch.nn.Conv1d, torch.nn.BatchNorm1d),
+        (torch.nn.Conv2d, torch.nn.BatchNorm2d),
+        (torch.nn.Conv3d, torch.nn.BatchNorm3d),
+    ]
+    modules = dict(gm.named_modules())
+
+    for pattern in patterns:
+        for node in gm.graph.nodes:
+            if matches_module_pattern(pattern, node, modules):
+                if len(node.args[0].users) > 1:  # Output of conv is used by other nodes
+                    continue
+                conv = modules[node.args[0].target]
+                bn = modules[node.target]
+                eval_mode = all(not n.training for n in [conv, bn])
+                if not eval_mode:
+                    continue
+                if not bn.track_running_stats:
+                    continue
+                fused_conv = fuse_conv_bn_eval(conv, bn)
+                replace_node_module(node.args[0], modules, fused_conv)
+                node.replace_all_uses_with(node.args[0])
+                gm.graph.erase_node(node)
+                gm.graph.lint()
+    gm.recompile()
+    return gm
+
+
 def fuse_unary(gm: torch.fx.GraphModule):
     modules = dict(gm.named_modules())
 

From 4fb1454dddf6652baaeee4c23bc2462cd9dffe78 Mon Sep 17 00:00:00 2001
From: Wonjoo Lee <wonjoo@google.com>
Date: Fri, 4 Nov 2022 08:23:54 +0000
Subject: [PATCH 0567/1922] Add use_lazy_shape flag to GenLazyIr class (#88444)

Add use_lazy_shape flag to GenLazyIr class to allow XLA to use its custom shape class. The default value is kept to use lazy shape, so this PR does not introduce any new behaviors.

PyTorch/XLA companion PR: https://github.com/pytorch/xla/pull/4111
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88444
Approved by: https://github.com/alanwaketan, https://github.com/wconstab
---
 torchgen/dest/lazy_ir.py    | 3 ++-
 torchgen/gen_lazy_tensor.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/torchgen/dest/lazy_ir.py b/torchgen/dest/lazy_ir.py
index 41b32b81dbd8d..33043a5780d7e 100644
--- a/torchgen/dest/lazy_ir.py
+++ b/torchgen/dest/lazy_ir.py
@@ -172,6 +172,7 @@ class GenLazyIR(ABC):
     backend_index: BackendIndex
     backend_name: str
     node_base: str
+    use_lazy_shape: bool
 
     @method_with_native_function
     def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]:
@@ -252,7 +253,7 @@ def gen(self, schema: LazyIrSchema) -> List[str]:
 
         ctor_args = [f"const {i.lazy_type.cpp_type()}& {i.name}" for i in all_args]
         reuse_ctor_args = ", ".join(ctor_args)
-        if schema.properties.ShapePrecompute:
+        if self.use_lazy_shape and schema.properties.ShapePrecompute:
             ctor_args.append("std::vector<torch::lazy::Shape>&& shapes")
         node_ctor_args = ", ".join(ctor_args)
 
diff --git a/torchgen/gen_lazy_tensor.py b/torchgen/gen_lazy_tensor.py
index 5207681cf5c8e..b2b24111b0f9c 100644
--- a/torchgen/gen_lazy_tensor.py
+++ b/torchgen/gen_lazy_tensor.py
@@ -313,6 +313,7 @@ def run_gen_lazy_tensor(
     per_operator_headers: bool = False,
     backend_name: str = default_args.backend_name,
     gen_forced_fallback_code: bool = False,
+    use_lazy_shape: bool = True,
     # the following arguments are temporary customization points for xla backend migration.
     # do not rely on them otherwise, they should be removed once migration is complete
     backend_namespace: str = "torch::lazy",
@@ -533,7 +534,7 @@ def concat_map_codegen(
     )
     # Generate IR node classes
     lazy_ir_obj = lazy_ir_generator(
-        backend_indices[backend_key], backend_name, node_base
+        backend_indices[backend_key], backend_name, node_base, use_lazy_shape
     )
 
     fm.write_with_template(

From 1e3549c3906c6cf34d120fa0884e77c6cc2df799 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Fri, 4 Nov 2022 12:07:12 +0000
Subject: [PATCH 0568/1922] [Vulkan][TCC] Add tests for conv2d prepack context
 (#88316)

Summary:
Implement Vulkan tests for the create/run context functions in Convolution.cpp, their transposed versions and their backwards compatible versions:
- create_conv2d_context
- run_conv2d_context
- create_tconv2d_context
- run_tconv2d_context
- conv2d_clamp_prepack
- conv2d_clamp_run

Test Plan:
On Mac
```
cd ~/fbsource
buck run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_api_test
adb shell "/data/local/tmp/vulkan_api_test"
```

Reviewed By: salilsdesai

Differential Revision: D40935343

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88316
Approved by: https://github.com/salilsdesai
---
 aten/src/ATen/test/vulkan_api_test.cpp | 353 +++++++++++++++++++------
 1 file changed, 278 insertions(+), 75 deletions(-)

diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index 66d33859ea03c..2519267f75dc3 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -5,6 +5,7 @@
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/native/vulkan/api/api.h>
 #include <ATen/native/vulkan/ops/Copy.h>
+#include <ATen/native/vulkan/ops/Convolution.h>
 #include <c10/util/irange.h>
 
 // TODO: These functions should move to a common place.
@@ -844,6 +845,130 @@ TEST_F(VulkanAPITest, clamp_) {
   ASSERT_TRUE(check);
 }
 
+void test_conv2d_context(
+    const at::IntArrayRef input_shape,
+    const at::IntArrayRef weight_shape,
+    const at::IntArrayRef bias_shape,
+    std::vector<int64_t> stride,
+    std::vector<int64_t> padding,
+    std::vector<int64_t> dilation,
+    int64_t groups) {
+  c10::InferenceMode mode;
+
+  at::Tensor input = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
+  at::Tensor weight = at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
+  at::Tensor bias = at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
+
+  // cpu
+  const auto out_cpu = at::conv2d(
+    input, weight, bias, stride, padding, dilation, groups);
+
+  // vulkan
+  const auto prepack_vulkan = callOpByName(
+      "vulkan_prepack::create_conv2d_context",
+      "",
+      weight, bias, stride, padding, dilation, groups, c10::nullopt, c10::nullopt);
+
+  const auto vulkan_output = callOpByName(
+      "vulkan_prepack::run_conv2d_context",
+      "",
+      input.vulkan(), prepack_vulkan[0]);
+
+  const auto out_vulkan = vulkan_output[0].toTensor();
+  const auto out_vk_cpu = out_vulkan.cpu();
+
+  // check
+  const bool check = almostEqual(out_cpu, out_vk_cpu);
+  if (!check) {
+    showRtol(out_cpu, out_vk_cpu);
+  }
+
+  ASSERT_TRUE(check);
+}
+
+void test_backwards_compatible_conv2d_context(
+    const at::IntArrayRef input_shape,
+    const at::IntArrayRef weight_shape,
+    const at::IntArrayRef bias_shape,
+    std::vector<int64_t> stride,
+    std::vector<int64_t> padding,
+    std::vector<int64_t> dilation,
+    int64_t groups) {
+  c10::InferenceMode mode;
+
+  at::Tensor input = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
+  at::Tensor weight = at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
+  at::Tensor bias = at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
+
+  // cpu
+  const auto out_cpu = at::conv2d(
+    input, weight, bias, stride, padding, dilation, groups);
+
+  // vulkan
+  const auto prepack_vulkan = callOpByName(
+      "vulkan_prepack::conv2d_clamp_prepack",
+      "",
+      weight, bias, stride, padding, dilation, groups, c10::nullopt, c10::nullopt);
+
+  const auto vulkan_output = callOpByName(
+      "vulkan_prepack::conv2d_clamp_run",
+      "",
+      input.vulkan(), prepack_vulkan[0]);
+
+  const auto out_vulkan = vulkan_output[0].toTensor();
+  const auto out_vk_cpu = out_vulkan.cpu();
+
+  // check
+  const bool check = almostEqual(out_cpu, out_vk_cpu);
+  if (!check) {
+    showRtol(out_cpu, out_vk_cpu);
+  }
+
+  ASSERT_TRUE(check);
+}
+
+void test_transposed_conv2d_context(
+    const at::IntArrayRef input_shape,
+    const at::IntArrayRef weight_shape,
+    const at::IntArrayRef bias_shape,
+    std::vector<int64_t> stride,
+    std::vector<int64_t> padding,
+    std::vector<int64_t> output_padding,
+    std::vector<int64_t> dilation,
+    int64_t groups) {
+  c10::InferenceMode mode;
+
+  at::Tensor input = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
+  at::Tensor weight = at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
+  at::Tensor bias = at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
+
+  // cpu
+  const auto out_cpu = at::conv_transpose2d(
+    input, weight, bias, stride, padding, output_padding, groups, dilation);
+
+  // vulkan
+  const auto prepack_vulkan = callOpByName(
+      "vulkan_prepack::create_tconv2d_context",
+      "",
+      weight, bias, stride, padding, output_padding, dilation, groups, c10::nullopt, c10::nullopt);
+
+  const auto vulkan_output = callOpByName(
+      "vulkan_prepack::run_tconv2d_context",
+      "",
+      input.vulkan(), prepack_vulkan[0]);
+
+  const auto out_vulkan = vulkan_output[0].toTensor();
+  const auto out_vk_cpu = out_vulkan.cpu();
+
+  // check
+  const bool check = almostEqual(out_cpu, out_vk_cpu);
+  if (!check) {
+    showRtol(out_cpu, out_vk_cpu);
+  }
+
+  ASSERT_TRUE(check);
+}
+
 TEST_F(VulkanAPITest, conv2d) {
   constexpr int64_t groups = 1;
   constexpr std::array<int64_t, 2u> stride{2, 2};
@@ -913,6 +1038,28 @@ TEST_F(VulkanAPITest, conv2d) {
   ASSERT_TRUE(check);
 }
 
+TEST_F(VulkanAPITest, conv2d_prepack) {
+  test_conv2d_context(
+    {1, 3, 8, 8}, // input_shape
+    {1, 3, 3, 3}, // weight_shape
+    {1},          // bias_shape
+    {2, 2},       // stride
+    {1, 1},       // padding
+    {1, 1},       // dilation
+    1);           // groups
+}
+
+TEST_F(VulkanAPITest, conv2d_prepack_bc) {
+  test_backwards_compatible_conv2d_context(
+    {1, 3, 8, 8}, // input_shape
+    {1, 3, 3, 3}, // weight_shape
+    {1},          // bias_shape
+    {2, 2},       // stride
+    {1, 1},       // padding
+    {1, 1},       // dilation
+    1);           // groups
+}
+
 TEST_F(VulkanAPITest, conv2d_dw) {
   constexpr int64_t groups = 7;
   constexpr std::array<int64_t, 2u> stride{2, 3};
@@ -981,6 +1128,28 @@ TEST_F(VulkanAPITest, conv2d_dw) {
   ASSERT_TRUE(check);
 }
 
+TEST_F(VulkanAPITest, conv2d_dw_prepack) {
+  test_conv2d_context(
+    {1, 7, 137, 199}, // input_shape
+    {7, 1, 17, 7},    // weight_shape
+    {7},              // bias_shape
+    {2, 3},           // stride
+    {0, 4},           // padding
+    {3, 1},           // dilation
+    7);               // groups
+}
+
+TEST_F(VulkanAPITest, conv2d_dw_prepack_bc) {
+  test_backwards_compatible_conv2d_context(
+    {1, 7, 137, 199}, // input_shape
+    {7, 1, 17, 7},    // weight_shape
+    {7},              // bias_shape
+    {2, 3},           // stride
+    {0, 4},           // padding
+    {3, 1},           // dilation
+    7);               // groups
+}
+
 TEST_F(VulkanAPITest, conv2d_pw) {
   constexpr int64_t groups = 1;
   constexpr std::array<int64_t, 2u> stride{1, 1};
@@ -1049,6 +1218,115 @@ TEST_F(VulkanAPITest, conv2d_pw) {
   ASSERT_TRUE(check);
 }
 
+TEST_F(VulkanAPITest, conv2d_pw_prepack) {
+  test_conv2d_context(
+    {1, 17, 127, 397},  // input_shape
+    {29, 17, 1, 1},     // weight_shape
+    {29},               // bias_shape
+    {1, 1},             // stride
+    {0, 0},             // padding
+    {1, 1},             // dilation
+    1);                 // groups
+}
+
+TEST_F(VulkanAPITest, conv2d_pw_prepack_bc) {
+  test_backwards_compatible_conv2d_context(
+    {1, 17, 127, 397},  // input_shape
+    {29, 17, 1, 1},     // weight_shape
+    {29},               // bias_shape
+    {1, 1},             // stride
+    {0, 0},             // padding
+    {1, 1},             // dilation
+    1);                 // groups
+}
+
+TEST_F(VulkanAPITest, conv2d_transposed) {
+  // Arrange
+  constexpr int64_t groups = 1;
+  constexpr std::array<int64_t, 2u> stride{1, 2};
+  constexpr std::array<int64_t, 2u> padding{1, 0};
+  constexpr std::array<int64_t, 2u> output_padding{0, 1};
+  //TODO: Support conv_transpose2d with dilation != 1
+  constexpr std::array<int64_t, 2u> dilation{1, 1};
+
+  constexpr struct {
+    uint32_t batches;
+    uint32_t channels;
+    uint32_t height;
+    uint32_t width;
+
+    std::array<int64_t, 4u> size() const {
+      return {
+        batches,
+        channels,
+        height,
+        width,
+      };
+    }
+  } input {1, 55, 7, 19};
+
+  constexpr struct {
+    uint32_t input_channels;
+    uint32_t output_channels;
+    uint32_t height;
+    uint32_t width;
+
+    std::array<int64_t, 4u> size() const {
+      return {
+        input_channels,
+        output_channels,
+        height,
+        width,
+      };
+    }
+  } weights {input.channels, 47, 2, 3};
+
+  const auto input_cpu = at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
+  const auto weights_cpu = at::randn(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
+  const auto bias_cpu = at::zeros({weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
+
+  // Act
+  const auto output_cpu = at::conv_transpose2d(
+      input_cpu,
+      weights_cpu,
+      bias_cpu,
+      stride,
+      padding,
+      output_padding,
+      groups,
+      dilation);
+
+  const auto output_vk = at::conv_transpose2d(
+      input_cpu.vulkan(),
+      weights_cpu,
+      bias_cpu,
+      stride,
+      padding,
+      output_padding,
+      groups,
+      dilation).cpu();
+
+  // Assert
+  const bool check = almostEqual(output_cpu, output_vk);
+  if (!check) {
+    showRtol(output_cpu, output_vk);
+  }
+
+  ASSERT_TRUE(check);
+}
+
+TEST_F(VulkanAPITest, conv2d_transposed_prepack) {
+  test_transposed_conv2d_context(
+    {1, 55, 7, 19}, // input_shape
+    {55, 47, 2, 3}, // weight_shape
+    {47},           // bias_shape
+    {1, 2},         // stride
+    {1, 0},         // padding
+    {0, 1},         // output_padding
+    {1, 1},         // dilation
+    1);             // groups
+}
+
 TEST_F(VulkanAPITest, copy) {
   const auto cpu = at::rand({13, 17, 37, 19}, at::device(at::kCPU).dtype(at::kFloat));
   const auto vulkan = cpu.vulkan();
@@ -2764,81 +3042,6 @@ TEST_F(VulkanAPITest, sub_to_scalar_wrapped) {
   ASSERT_TRUE(check);
 }
 
-TEST_F(VulkanAPITest, transposed_conv2d) {
-  // Arrange
-  constexpr int64_t groups = 1;
-  constexpr std::array<int64_t, 2u> stride{1, 2};
-  constexpr std::array<int64_t, 2u> padding{1, 0};
-  constexpr std::array<int64_t, 2u> output_padding{0, 1};
-  //TODO: Support conv_transpose2d with dilation != 1
-  constexpr std::array<int64_t, 2u> dilation{1, 1};
-
-  constexpr struct {
-    uint32_t batches;
-    uint32_t channels;
-    uint32_t height;
-    uint32_t width;
-
-    std::array<int64_t, 4u> size() const {
-      return {
-        batches,
-        channels,
-        height,
-        width,
-      };
-    }
-  } input {1, 55, 7, 19};
-
-  constexpr struct {
-    uint32_t input_channels;
-    uint32_t output_channels;
-    uint32_t height;
-    uint32_t width;
-
-    std::array<int64_t, 4u> size() const {
-      return {
-        input_channels,
-        output_channels,
-        height,
-        width,
-      };
-    }
-  } weights {input.channels, 47, 2, 3};
-
-  const auto input_cpu = at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
-  const auto weights_cpu = at::randn(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
-  const auto bias_cpu = at::zeros({weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
-
-  // Act
-  const auto output_cpu = at::conv_transpose2d(
-      input_cpu,
-      weights_cpu,
-      bias_cpu,
-      stride,
-      padding,
-      output_padding,
-      groups,
-      dilation);
-
-  const auto output_vk = at::conv_transpose2d(
-      input_cpu.vulkan(),
-      weights_cpu,
-      bias_cpu,
-      stride,
-      padding,
-      output_padding,
-      groups,
-      dilation).cpu();
-
-  // Assert
-  const bool check = almostEqual(output_cpu, output_vk);
-  if (!check) {
-    showRtol(output_cpu, output_vk);
-  }
-
-  ASSERT_TRUE(check);
-}
-
 TEST_F(VulkanAPITest, upsample_nearest2d) {
   const auto in_cpu = at::rand({1, 2, 2, 3}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
   const auto out_cpu = at::upsample_nearest2d(in_cpu, {4, 6});

From b3b78e921234f0d0c64ca3ba51c326dea79bff3c Mon Sep 17 00:00:00 2001
From: John Detloff <johndetloff@fb.com>
Date: Fri, 4 Nov 2022 17:31:17 +0000
Subject: [PATCH 0569/1922] Add dep on Accelerate framework to torch podspecs
 (#88422)

A dep on Accelerate was added in https://github.com/pytorch/pytorch/pull/80449 We need to declare this dep in our podspec, otherwise users will have to add the Accelerate framework to their projects manually.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88422
Approved by: https://github.com/kimishpatel, https://github.com/malfet
---
 ios/LibTorch-Lite.podspec | 1 +
 ios/LibTorch.podspec      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/ios/LibTorch-Lite.podspec b/ios/LibTorch-Lite.podspec
index 54a12fb2cbfc2..96b759f221504 100644
--- a/ios/LibTorch-Lite.podspec
+++ b/ios/LibTorch-Lite.podspec
@@ -33,4 +33,5 @@ Pod::Spec.new do |s|
         'VALID_ARCHS' => 'x86_64 arm64'
     }
     s.library = ['c++', 'stdc++']
+    s.frameworks = 'Accelerate'
 end
diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec
index e166f853143b0..6cee4993cca14 100644
--- a/ios/LibTorch.podspec
+++ b/ios/LibTorch.podspec
@@ -33,4 +33,5 @@ Pod::Spec.new do |s|
         'VALID_ARCHS' => 'x86_64 arm64'
     }
     s.library = ['c++', 'stdc++']
+    s.frameworks = 'Accelerate'
 end

From 6d71db8a53061c64e1605f8b4c46ca7811e09021 Mon Sep 17 00:00:00 2001
From: Shiyan Deng <dsy842974287@meta.com>
Date: Fri, 4 Nov 2022 17:35:12 +0000
Subject: [PATCH 0570/1922] [torch package] Treat builtins as default extern
 module (#88385)

Summary: When using torch deploy, if we do fx transformation and then try to pickle/unpickle a fx GraphModule, it's possible that the GraphModule's code depends on `builtins` but we didn't add it to extern module.

Reviewed By: PaliC

Differential Revision: D40958730

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88385
Approved by: https://github.com/PaliC
---
 torch/package/package_importer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/package/package_importer.py b/torch/package/package_importer.py
index 2530f28d1501d..7bf945c70c0b3 100644
--- a/torch/package/package_importer.py
+++ b/torch/package/package_importer.py
@@ -39,6 +39,9 @@
     "numpy",
     "numpy.core",
     "numpy.core._multiarray_umath",
+    # FX GraphModule might depend on builtins module and users usually
+    # don't extern builtins. Here we import it here by default.
+    "builtins",
 ]
 
 
From ba4509e3e680c6e632bdaf3a1eaf44f1fe7dac42 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 4 Nov 2022 12:39:03 -0400
Subject: [PATCH 0571/1922] Fix typo in clones (#88501)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88501
Approved by: https://github.com/wconstab
---
 benchmarks/dynamo/Makefile_dashboard | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/Makefile_dashboard b/benchmarks/dynamo/Makefile_dashboard
index 559f9fee92dd9..904b6726c494c 100644
--- a/benchmarks/dynamo/Makefile_dashboard
+++ b/benchmarks/dynamo/Makefile_dashboard
@@ -7,7 +7,7 @@ clone-deps:
 		&& (test -e torchvision || git clone --recursive https://github.com/pytorch/vision torchvision) \
 		&& (test -e torchdata || git clone --recursive https://github.com/pytorch/data.git torchdata) \
 		&& (test -e torchtext || git clone --recursive https://github.com/pytorch/text torchtext) \
-		&& (test -e torchaudio || git clone --recursive https://github.com/pytorch/text torchaudio) \
+		&& (test -e torchaudio || git clone --recursive https://github.com/pytorch/audio torchaudio) \
 		&& (test -e detectron2 || git clone --recursive https://github.com/facebookresearch/detectron2) \
 		&& (test -e torchbenchmark || git clone --recursive https://github.com/pytorch/benchmark torchbenchmark) \
 		&& (test -e triton || git clone --recursive https://github.com/openai/triton.git) \

From 6c0bfbd4b503eec3020e2509e0e5874dc713ee12 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 4 Nov 2022 19:12:35 +0000
Subject: [PATCH 0572/1922] Revert "remove assert_allclose from torch.testing
 (#87974)"

This reverts commit 5669e10d37fa3cca21cf82c843ae4c4e79da1b89.

Reverted https://github.com/pytorch/pytorch/pull/87974 on behalf of https://github.com/mehtanirav due to Internal breakages from method removal
---
 caffe2/python/operator_test/_utils.py |  2 +-
 torch/testing/__init__.py             |  1 +
 torch/testing/_deprecated.py          | 77 +++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 torch/testing/_deprecated.py

diff --git a/caffe2/python/operator_test/_utils.py b/caffe2/python/operator_test/_utils.py
index 1c3a105085e96..3ee1def89e715 100644
--- a/caffe2/python/operator_test/_utils.py
+++ b/caffe2/python/operator_test/_utils.py
@@ -1,5 +1,5 @@
 """
-This file only exists since `torch.testing.assert_allclose` was removed, but used extensively throughout the tests in
+This file only exists since `torch.testing.assert_allclose` is deprecated, but used extensively throughout the tests in
 this package. The replacement `torch.testing.assert_close` doesn't support one feature that is needed here: comparison
 between numpy arrays and torch tensors. See https://github.com/pytorch/pytorch/issues/61844 for the reasoning why this
 was removed.
diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py
index ed4922aab9b60..ad69ef1d24901 100644
--- a/torch/testing/__init__.py
+++ b/torch/testing/__init__.py
@@ -1,3 +1,4 @@
 from ._comparison import assert_close as assert_close
 from torch._C import FileCheck as FileCheck
 from ._creation import make_tensor as make_tensor
+from ._deprecated import *  # noqa: F403
diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
new file mode 100644
index 0000000000000..731158ddb41ee
--- /dev/null
+++ b/torch/testing/_deprecated.py
@@ -0,0 +1,77 @@
+"""This module exists since the `torch.testing` exposed a lot of stuff that shouldn't have been public. Although this
+was never documented anywhere, some other internal FB projects as well as downstream OSS projects might use this. Thus,
+we don't internalize without warning, but still go through a deprecation cycle.
+"""
+
+import functools
+import warnings
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
+import torch
+
+
+__all__ = ["assert_allclose"]
+
+
+def warn_deprecated(instructions: Union[str, Callable[[str, Tuple[Any, ...], Dict[str, Any], Any], str]]) -> Callable:
+    def outer_wrapper(fn: Callable) -> Callable:
+        name = fn.__name__
+        head = f"torch.testing.{name}() is deprecated since 1.12 and will be removed in 1.14. "
+
+        @functools.wraps(fn)
+        def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
+            return_value = fn(*args, **kwargs)
+            tail = instructions(name, args, kwargs, return_value) if callable(instructions) else instructions
+            msg = (head + tail).strip()
+            warnings.warn(msg, FutureWarning)
+            return return_value
+
+        return inner_wrapper
+
+    return outer_wrapper
+
+
+_DTYPE_PRECISIONS = {
+    torch.float16: (1e-3, 1e-3),
+    torch.float32: (1e-4, 1e-5),
+    torch.float64: (1e-5, 1e-8),
+}
+
+
+def _get_default_rtol_and_atol(actual: torch.Tensor, expected: torch.Tensor) -> Tuple[float, float]:
+    actual_rtol, actual_atol = _DTYPE_PRECISIONS.get(actual.dtype, (0.0, 0.0))
+    expected_rtol, expected_atol = _DTYPE_PRECISIONS.get(expected.dtype, (0.0, 0.0))
+    return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol)
+
+
+@warn_deprecated(
+    "Use torch.testing.assert_close() instead. "
+    "For detailed upgrade instructions see https://github.com/pytorch/pytorch/issues/61844."
+)
+def assert_allclose(
+    actual: Any,
+    expected: Any,
+    rtol: Optional[float] = None,
+    atol: Optional[float] = None,
+    equal_nan: bool = True,
+    msg: str = "",
+) -> None:
+    if not isinstance(actual, torch.Tensor):
+        actual = torch.tensor(actual)
+    if not isinstance(expected, torch.Tensor):
+        expected = torch.tensor(expected, dtype=actual.dtype)
+
+    if rtol is None and atol is None:
+        rtol, atol = _get_default_rtol_and_atol(actual, expected)
+
+    torch.testing.assert_close(
+        actual,
+        expected,
+        rtol=rtol,
+        atol=atol,
+        equal_nan=equal_nan,
+        check_device=True,
+        check_dtype=False,
+        check_stride=False,
+        msg=msg or None,
+    )

From 0da04cbd2a4ad33e844be21399369d4f25dd6056 Mon Sep 17 00:00:00 2001
From: jjsjann123 <alex.jann2012@gmail.com>
Date: Fri, 4 Nov 2022 19:17:07 +0000
Subject: [PATCH 0573/1922] nvprim python runtime dtype correctness patch
 (#88452)

Cherry-picking: https://github.com/csarofeen/pytorch/pull/2133

- [x] casts FusionDefinition output to original dtype recorded in the GraphModule
- [x] add a python repro with dynamo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88452
Approved by: https://github.com/IvanYashchuk, https://github.com/mruberry
---
 test/test_nvfuser_dynamo.py      | 16 ++++++++++++++++
 torch/_prims/nvfuser_executor.py | 19 +++++++++++++++++--
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/test/test_nvfuser_dynamo.py b/test/test_nvfuser_dynamo.py
index 5d3aeff267fd1..b0c28386185d4 100644
--- a/test/test_nvfuser_dynamo.py
+++ b/test/test_nvfuser_dynamo.py
@@ -45,6 +45,22 @@ def func(a, b):
         eager_result = func.__wrapped__(input1, input2)
         self.assertEqual(eager_result, nvfuser_result)
 
+    def test_dtype_correctness(self):
+        input1 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float16)
+
+        @torchdynamo.optimize("nvprims_nvfuser")
+        def func(a):
+            tmp = a + 1.0
+            # nvfuser would promote output to fp32 in math, FusionDefinition should cast output dtype back
+            return torch.where(tmp > 0, tmp, 0.0)
+
+        # No warnings and no errors
+        with warnings.catch_warnings(record=True) as w:
+            nvfuser_result = func(input1)
+            self.assertEqual(len(w), 0)
+        eager_result = func.__wrapped__(input1)
+        self.assertEqual(eager_result, nvfuser_result)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py
index ae9dbfff781df..c3a7e8913ce29 100644
--- a/torch/_prims/nvfuser_executor.py
+++ b/torch/_prims/nvfuser_executor.py
@@ -23,6 +23,7 @@
         DataType,
         Fusion,
         FusionDefinition,
+        Tensor,
     )
 else:
     DataType = None
@@ -133,6 +134,10 @@ def make_nvfuser_fusion(gm: GraphModule, *nv_args_templates):
         call_function_nodes
     ), "Constant tensors that are saved in the graph and used as arguments are not supported yet"
 
+    # Checking output dtypes
+    output_node = next(filter(lambda n: n.op == "output", gm.graph.nodes))
+    orig_flat_out, _ = tree_flatten(output_node.args[0])
+
     fusion = Fusion()
     with FusionDefinition(fusion) as fd:
 
@@ -178,6 +183,18 @@ def call_function(self, target, args, kwargs):
                 args = (fd,) + args
                 return target(*args, **kwargs)
 
+            def output(self, target, args, kwargs):
+                flat_out, unflatten_spec = tree_flatten(args[0])
+                for o, orig_o in zip(flat_out, orig_flat_out):
+                    # casting outputs to the original data type
+                    # ensures outputs produced by fusion would always agree with original GraphModule
+                    out_dtype = _torch_dtype_to_nvfuser_dtype_map.get(orig_o.meta["tensor_meta"].dtype)  # type: ignore[union-attr]
+                    assert isinstance(
+                        o, Tensor
+                    ), "output from codegen has to be tensor type"
+                    fd.add_output(fd.ops.cast(o, dtype=out_dtype))
+                return args[0]
+
         def templates_to_nvfuser_inputs(arg):
             if isinstance(arg, nvFuserTensorTemplate):
                 x = fd.define_tensor(
@@ -194,8 +211,6 @@ def templates_to_nvfuser_inputs(arg):
         nv_args = tuple(map(templates_to_nvfuser_inputs, nv_args_templates))
         out = FusionInterpreter(gm).run(*nv_args)
         flat_out, unflatten_spec = tree_flatten(out)
-        for o in flat_out:
-            fd.add_output(o)
 
     return fusion, unflatten_spec
 

From 4117751490b5274636d75c83ee34a5346d7f750a Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 4 Nov 2022 06:18:25 -0700
Subject: [PATCH 0574/1922] Revert "Revert "Put Python Dispatcher cache in
 dict, clear it on new registrations. (#88329)"" (#88489)

The bug was that I was accidentally caching at the wrong key name, so
we were never actually hitting the cache.  I've renamed the resolved
key to final_key to avoid shadowing in this way.

This reverts commit 410ce96a23a3496a45478e0b25ffac53aa3c116f.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88489
Approved by: https://github.com/albanD
---
 torch/_ops.py                           | 25 ++++++++++++-------------
 torch/csrc/autograd/python_variable.cpp | 13 +++++++++++--
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/torch/_ops.py b/torch/_ops.py
index a4119d758524f..f7ebba590aee0 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -243,6 +243,8 @@ def __init__(self, overloadpacket, op, op_dk, schema, tags):
         op.__module__ = overloadpacket.__module__
         self.__qualname__ = self._name
         self.__annotations__ = {}
+        # NB: This name is hard-coded in torch/csrc/autograd/python_variable.cpp
+        self._dispatch_cache = {}
 
     # it's a no-op since OpOverload object is immutable and must be unique for a given op overload.
     def __deepcopy__(self, memo=None):
@@ -289,6 +291,7 @@ def inner(fn):
                 assert mode not in self.python_key_mode_table
                 # TODO(voz): Should we replace setting torch._C.DispatchKey.Python entirely with setting mode keys?
                 self.python_key_mode_table[mode] = fn
+                self._dispatch_cache.clear()
                 return fn
 
             assert isinstance(dispatch_key_or_mode, torch._C.DispatchKey)
@@ -301,23 +304,19 @@ def inner(fn):
                     f"Trying to override a python impl for {dispatch_key_or_mode} on operator {self._name}"
                 )
             self.py_kernels[dispatch_key_or_mode] = fn
+            self._dispatch_cache.clear()
             return fn
 
         return inner
 
     # This implements the pre-computation logic for the Python dispatcher.
-    def __getattr__(self, attr):
-        if len(attr) == 0 or not attr[0].isupper():
-            raise AttributeError()
-
-        try:
-            key = torch._C._dispatch_key_parse(attr)
-        except Exception as e:
-            raise AttributeError()
+    def _get_dispatch(self, key):
+        # This is only called upon a cache miss
+        assert key not in self._dispatch_cache
 
         if key == torch._C.DispatchKey.Python:
             if not self.python_key_mode_table:
-                setattr(self, attr, key)
+                self._dispatch_cache[key] = key
                 return key
 
             def handler(*args, **kwargs):
@@ -336,12 +335,12 @@ def handler(*args, **kwargs):
                 # TODO(voz): The idea behind this is that we do not yet support dispatch by key + mode, only key.
                 return self.python_key_mode_table[curr_mode](*args, **kwargs)
 
-            setattr(self, attr, handler)
+            self._dispatch_cache[key] = handler
             return handler
 
-        key = resolve_key(self, key)
-        r = self.py_kernels.get(key, key)
-        setattr(self, attr, r)
+        final_key = resolve_key(self, key)
+        r = self.py_kernels.get(final_key, final_key)
+        self._dispatch_cache[key] = r
         return r
 
     def name(self):
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 488858246429b..920d0e7344b58 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -2321,11 +2321,20 @@ void ConcretePyInterpreterVTable::python_dispatcher(
     torch::jit::Stack* stack) const {
   py::gil_scoped_acquire g;
   py::handle torch_api_function_overload = getTorchApiFunction(op);
+  // TODO: if necessary, can optimize to cache the cache lookup
+  // TODO: if necessary, can optimize OpOverload to have slots
+  auto cache = py::dict(torch_api_function_overload.attr("_dispatch_cache"));
+  if (cache.ptr() == nullptr) {
+    throw python_error();
+  }
 
   c10::DispatchKey k = ks.highestPriorityTypeId();
-  auto handler = torch_api_function_overload.attr(toString(k));
+  // TODO: allow this to be non-owning
+  auto handler = py::reinterpret_borrow<py::object>(
+      PyDict_GetItem(cache.ptr(), py::cast(k).ptr()));
   if (handler.ptr() == nullptr) {
-    throw python_error();
+    // Slow path
+    handler = torch_api_function_overload.attr("_get_dispatch")(k);
   }
   if (py::isinstance<c10::DispatchKey>(handler)) {
     // NB: not redispatch, as that will permanently remove the python

From 1bce038d7c96b18ddbabbc60ab0248db68900e7b Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@fb.com>
Date: Thu, 3 Nov 2022 09:33:25 -0700
Subject: [PATCH 0575/1922] [vulkan] enable prepacking for Batchnorm op
 (#88433)

Adds a `BatchNormPackedContext` so that the `batchnorm` op can use prepacking.

Differential Revision: [D40721546](https://our.internmc.facebook.com/intern/diff/D40721546/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88433
Approved by: https://github.com/manuelcandales
---
 .../ATen/native/vulkan/glsl/batchnorm.glsl    |  70 +++--
 aten/src/ATen/native/vulkan/ops/Batchnorm.cpp | 290 ++++++++++++------
 aten/src/ATen/native/vulkan/ops/Batchnorm.h   |  68 ++++
 aten/src/ATen/native/vulkan/ops/Register.cpp  |  36 +++
 aten/src/ATen/test/vulkan_api_test.cpp        |  14 +-
 torch/csrc/jit/passes/vulkan_rewrite.cpp      |  35 ++-
 6 files changed, 381 insertions(+), 132 deletions(-)
 create mode 100644 aten/src/ATen/native/vulkan/ops/Batchnorm.h

diff --git a/aten/src/ATen/native/vulkan/glsl/batchnorm.glsl b/aten/src/ATen/native/vulkan/glsl/batchnorm.glsl
index 6ec93422b0d6b..0ec7dbdf4fcf5 100644
--- a/aten/src/ATen/native/vulkan/glsl/batchnorm.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/batchnorm.glsl
@@ -1,37 +1,61 @@
 #version 450 core
 #define PRECISION $precision
-#define FORMAT    $format
+#define FORMAT $format
 
 layout(std430) buffer;
 
-/* Qualifiers: layout - storage - precision - memory */
-
-layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D   uOutput;
-layout(set = 0, binding = 1)         uniform PRECISION                    sampler3D uInput;
-layout(set = 0, binding = 2)         uniform PRECISION                    sampler3D uGamma;
-layout(set = 0, binding = 3)         uniform PRECISION                    sampler3D uBeta;
-layout(set = 0, binding = 4)         uniform PRECISION                    sampler3D uMean;
-layout(set = 0, binding = 5)         uniform PRECISION                    sampler3D uVar;
-layout(set = 0, binding = 6)         uniform PRECISION restrict           Block {
-  ivec3 isize;
-  int channels_ext;
+/*
+ * Output Image
+ */
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOutput;
+
+/*
+ * Input Textures
+ */
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION sampler3D uGamma;
+layout(set = 0, binding = 3) uniform PRECISION sampler3D uBeta;
+layout(set = 0, binding = 4) uniform PRECISION sampler3D uMean;
+layout(set = 0, binding = 5) uniform PRECISION sampler3D uVar;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 6) uniform PRECISION restrict Block {
+  // xyz contains extents of the output texture, w contains the number of
+  // channels divided by 4, rounded up.
+  ivec4 out_extents;
   float eps;
-} uBlock;
+}
+uBlock;
 
+/*
+ * Local Work Group
+ */
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
+/*
+ * Computes a Batch normalization. Each shader invocation calculates the output
+ * at a single output location.
+ */
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
 
-  if (all(lessThan(pos, uBlock.isize.xyz))) {
-    const ivec3 chn = ivec3(0, 0, pos.z % uBlock.channels_ext);
-    imageStore(
-        uOutput,
-        pos,
-        (texelFetch(uInput, pos, 0)
-            - texelFetch(uMean, chn, 0))
-            / sqrt(texelFetch(uVar, chn, 0) + uBlock.eps)
-            * texelFetch(uGamma, chn, 0)
-            + texelFetch(uBeta, chn, 0));
+  // Return if this global position is outside output texture bounds
+  if (any(greaterThanEqual(pos, uBlock.out_extents.xyz))) {
+    return;
   }
+
+  const ivec3 ch_pos = ivec3(0, 0, pos.z % uBlock.out_extents.w);
+
+  const vec4 in_tex = texelFetch(uInput, pos, 0);
+  const vec4 gamma_tex = texelFetch(uGamma, ch_pos, 0);
+  const vec4 beta_tex = texelFetch(uBeta, ch_pos, 0);
+  const vec4 mean_tex = texelFetch(uMean, ch_pos, 0);
+  const vec4 var_tex = texelFetch(uVar, ch_pos, 0);
+
+  const vec4 out_tex =
+      (in_tex - mean_tex) / sqrt(var_tex + uBlock.eps) * gamma_tex + beta_tex;
+
+  imageStore(uOutput, pos, out_tex);
 }
diff --git a/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp b/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp
index 84828aa60468c..d1fecca2abeb0 100644
--- a/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Batchnorm.cpp
@@ -1,109 +1,44 @@
-#include <ATen/native/vulkan/ops/Common.h>
+#include <ATen/Context.h>
+#include <ATen/native/vulkan/ops/Batchnorm.h>
 #include <torch/library.h>
 
 namespace at {
 namespace native {
 namespace vulkan {
 namespace ops {
-namespace {
-
-using namespace api::utils;
 
-Tensor batch_norm(
-    const at::Tensor& input_arg,
-    const c10::optional<Tensor>& weight_opt /* optional */,
-    const c10::optional<Tensor>& bias_opt /* optional */,
-    const c10::optional<Tensor>& running_mean_opt /* optional */,
-    const c10::optional<Tensor>& running_var_opt /* optional */,
-    bool training,
-    double /* momentum, not used in eval mode */,
-    double eps,
-    bool /* cudnn_enable, deprecated */) {
-  TORCH_CHECK(!training, "Vulkan batchnorm only supports evaluation mode.");
-  TORCH_CHECK(
-      weight_opt && weight_opt->defined() && bias_opt && bias_opt->defined(),
-      "Vulkan batchnorm expects weight and bias arguments to be defined");
-  TORCH_CHECK(
-      running_mean_opt && running_mean_opt->defined(),
-      "running_mean must be defined in evaluation mode.");
-  TORCH_CHECK(
-      running_var_opt && running_var_opt->defined(),
-      "running_var must be defined in evaluation mode.");
-  TORCH_CHECK(input_arg.dim() == 4, "Vulkan batchnorm expects 4-dim input!");
-  TORCH_CHECK(
-      get_dim<Dim4D::Channel>(input_arg) % 4 == 0,
-      "Vulkan batchnorm expects channel dim to be multiple of 4!");
-
-  const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan();
-  const vTensor& v_input = convert(input);
-  const IntArrayRef v_input_sizes = v_input.sizes();
-
-  auto num_features = v_input.sizes()[1];
-  auto channels_ext = num_features / 4;
-
-  const Tensor weight_opt_3d = weight_opt->reshape({num_features, 1, 1});
-  const Tensor weight =
-      weight_opt_3d.is_vulkan() ? weight_opt_3d : weight_opt_3d.vulkan();
-  const vTensor& v_weight = convert(weight);
-  TORCH_CHECK(
-      weight.numel() == num_features,
-      "weight tensor should contain ",
-      num_features,
-      " elements!");
-
-  const Tensor bias_opt_3d = bias_opt->reshape({num_features, 1, 1});
-  const Tensor bias =
-      bias_opt_3d.is_vulkan() ? bias_opt_3d : bias_opt_3d.vulkan();
-  const vTensor& v_bias = convert(bias);
-  TORCH_CHECK(
-      bias.numel() == num_features,
-      "bias tensor should contain ",
-      num_features,
-      " elements!");
+namespace batchnorm {
+
+struct Params final {
+  api::utils::ivec3 out_extents;
+  int32_t c4;
+  float eps;
+};
+
+void record_op(
+    api::Context* const context,
+    vTensor& v_output,
+    const vTensor& v_input,
+    const vTensor& v_weight,
+    const vTensor& v_bias,
+    const vTensor& v_running_mean,
+    const vTensor& v_running_var,
+    const float eps) {
+  api::PipelineBarrier pipeline_barrier{};
 
-  const Tensor running_mean_opt_3d =
-      running_mean_opt->reshape({num_features, 1, 1});
-  const Tensor running_mean = running_mean_opt_3d.is_vulkan()
-      ? running_mean_opt_3d
-      : running_mean_opt_3d.vulkan();
-  const vTensor& v_running_mean = convert(running_mean);
-  TORCH_CHECK(
-      running_mean.numel() == num_features,
-      "running mean tensor should contain ",
-      num_features,
-      " elements!");
-
-  const Tensor running_var_opt_3d =
-      running_var_opt->reshape({num_features, 1, 1});
-  const Tensor running_var = running_var_opt_3d.is_vulkan()
-      ? running_var_opt_3d
-      : running_var_opt_3d.vulkan();
-  const vTensor& v_running_var = convert(running_var);
-  TORCH_CHECK(
-      running_var.numel() == num_features,
-      "running var tensor should contain ",
-      num_features,
-      " elements!");
+  api::utils::uvec3 global_size = v_output.extents();
+  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
 
-  api::Context* const context = api::context();
+  uint32_t num_features = get_dim<Dim4D::Channel>(v_input.sizes());
+  uint32_t channels_ext = api::utils::div_up(num_features, 4u);
 
-  vTensor v_output{
-      context,
-      v_input_sizes,
-      v_input.options(),
+  Params block{
+      api::utils::make_ivec3(v_output.extents()),
+      api::utils::safe_downcast<int32_t>(channels_ext),
+      eps,
   };
 
-  const struct Block final {
-    uvec3 iextents;
-    int32_t channels_ext;
-    float epsilon;
-  } block{
-      v_output.extents(),
-      safe_downcast<int32_t>(channels_ext),
-      safe_downcast<float>(eps)};
-
   api::UniformParamsBuffer params(context, block);
-  api::PipelineBarrier pipeline_barrier{};
 
   context->submit_compute_job(
       // shader descriptor
@@ -111,9 +46,9 @@ Tensor batch_norm(
       // pipeline barrier
       pipeline_barrier,
       // global work group size
-      v_output.extents(),
+      global_size,
       // local work group size
-      adaptive_work_group_size(v_output.extents()),
+      local_size,
       // fence handle
       VK_NULL_HANDLE,
       // shader arguments
@@ -128,8 +63,34 @@ Tensor batch_norm(
       v_running_var.image(pipeline_barrier, api::PipelineStage::COMPUTE),
       // params buffer
       params.buffer());
+}
 
-  return convert(v_output);
+} // namespace batchnorm
+
+namespace {
+
+using namespace api::utils;
+
+Tensor batch_norm(
+    const at::Tensor& input_arg,
+    const c10::optional<Tensor>& weight_opt /* optional */,
+    const c10::optional<Tensor>& bias_opt /* optional */,
+    const c10::optional<Tensor>& running_mean_opt /* optional */,
+    const c10::optional<Tensor>& running_var_opt /* optional */,
+    bool training,
+    double /* momentum, not used in eval mode */,
+    double eps,
+    bool /* cudnn_enable, deprecated */) {
+  TORCH_CHECK(!training, "Only evaluation mode is supported!");
+  TORCH_CHECK(input_arg.dim() == 4, "Input must have dim == 4!");
+  TORCH_CHECK(
+      get_dim<Dim4D::Channel>(input_arg) % 4 == 0,
+      "Input must have channels divisible by 4!");
+
+  return run_batchnorm_context(
+      input_arg,
+      c10::make_intrusive<BatchNormPackedContext>(BatchNormPackedContext(
+          weight_opt, bias_opt, running_mean_opt, running_var_opt, eps)));
 }
 
 #ifdef USE_VULKAN_API
@@ -141,6 +102,143 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
 #endif /* USE_VULKAN_API */
 
 } // namespace
+
+BatchNormPackedContext::BatchNormPackedContext(
+    const c10::optional<Tensor>& weight_opt,
+    const c10::optional<Tensor>& bias_opt,
+    const c10::optional<Tensor>& running_mean_opt,
+    const c10::optional<Tensor>& running_var_opt,
+    double eps)
+    : unpacked_{c10::AnyType::get()} {
+  packed_.reserve(ListArgs::kNumArgs);
+
+  // Each optional tensor arg, if provided should be a 1 dimensional tensor. To
+  // achieve more efficient packing as a texture, they are first reshaped to {N,
+  // 1, 1}. Eventually this rearrangement should happen automatically in vTensor
+  // itself.
+
+  // Weight
+  TORCH_CHECK(weight_opt, "Weight must be provided!");
+  TORCH_CHECK(weight_opt->dim() == 1, "Weight must have ndim == 1!");
+
+  const int64_t num_features =
+      api::utils::safe_downcast<int64_t>(weight_opt->numel());
+  const Tensor weight_3d = weight_opt->reshape({num_features, 1, 1});
+  packed_.emplace_back(weight_3d.vulkan());
+
+  // Bias
+  TORCH_CHECK(bias_opt, "Bias must be provided!");
+  TORCH_CHECK(bias_opt->dim() == 1, "Bias must have ndim == 1!");
+  TORCH_CHECK(
+      bias_opt->numel() == num_features,
+      "Bias must have the same numel as weight!");
+
+  const Tensor bias_3d = bias_opt->reshape({num_features, 1, 1});
+  packed_.emplace_back(bias_3d.vulkan());
+
+  // Running Mean
+  TORCH_CHECK(running_mean_opt, "Running mean must be provided!");
+  TORCH_CHECK(running_mean_opt->dim() == 1, "Running mean must have ndim == 1");
+  TORCH_CHECK(
+      running_mean_opt->numel() == num_features,
+      "Running mean must have the same numel as weight!");
+
+  const Tensor running_mean_3d =
+      running_mean_opt->reshape({num_features, 1, 1});
+  packed_.emplace_back(running_mean_3d.vulkan());
+
+  // Running var
+  TORCH_CHECK(running_var_opt, "Running var must be provided!");
+  TORCH_CHECK(running_var_opt->dim() == 1, "Running var must have ndim == 1");
+  TORCH_CHECK(
+      running_var_opt->numel() == num_features,
+      "Running var must have the same numel as weight!");
+
+  const Tensor running_var_3d = running_var_opt->reshape({num_features, 1, 1});
+  packed_.emplace_back(running_var_3d.vulkan());
+
+  // Epsilon
+  packed_.emplace_back(eps);
+
+  if (!at::globalContext().releaseWeightsWhenPrepacking()) {
+    unpacked_.reserve(ListArgs::kNumArgs);
+    unpacked_.emplace_back(weight_opt);
+    unpacked_.emplace_back(bias_opt);
+    unpacked_.emplace_back(running_mean_opt);
+    unpacked_.emplace_back(running_var_opt);
+    unpacked_.emplace_back(eps);
+  }
+}
+
+BatchNormPackedContext BatchNormPackedContext::pack(
+    c10::impl::GenericList unpacked) {
+  return BatchNormPackedContext(
+      get_optional_tensor(unpacked, ListArgs::kWeight),
+      get_optional_tensor(unpacked, ListArgs::kBias),
+      get_optional_tensor(unpacked, ListArgs::kRunningMean),
+      get_optional_tensor(unpacked, ListArgs::kRunningVar),
+      unpacked.get(ListArgs::kEps).toDouble());
+}
+
+c10::intrusive_ptr<BatchNormPackedContext> create_batchnorm_context(
+    c10::optional<Tensor>&& weight_opt,
+    c10::optional<Tensor>&& bias_opt,
+    c10::optional<Tensor>&& running_mean_opt,
+    c10::optional<Tensor>&& running_var_opt,
+    bool training,
+    double /* momentum */,
+    double eps,
+    bool /* cudnn_enable, deprecated */) {
+  return c10::make_intrusive<BatchNormPackedContext>(BatchNormPackedContext(
+      weight_opt, bias_opt, running_mean_opt, running_var_opt, eps));
+}
+
+Tensor run_batchnorm_context(
+    const Tensor& input_arg,
+    const c10::intrusive_ptr<BatchNormPackedContext>& batchnorm_context) {
+  api::Context* const context = api::context();
+
+  const vTensor& v_input = convert(input_arg);
+
+  const vTensor& v_weight = convert(
+      batchnorm_context->get_val(BatchNormPackedContext::ListArgs::kWeight)
+          .toTensor());
+
+  const vTensor& v_bias = convert(
+      batchnorm_context->get_val(BatchNormPackedContext::ListArgs::kBias)
+          .toTensor());
+
+  const vTensor& v_running_mean = convert(
+      batchnorm_context->get_val(BatchNormPackedContext::ListArgs::kRunningMean)
+          .toTensor());
+
+  const vTensor& v_running_var = convert(
+      batchnorm_context->get_val(BatchNormPackedContext::ListArgs::kRunningVar)
+          .toTensor());
+
+  const float eps = api::utils::safe_downcast<float>(
+      batchnorm_context->get_val(BatchNormPackedContext::ListArgs::kEps)
+          .toDouble());
+
+  vTensor v_output{
+      context,
+      v_input.sizes(),
+      v_input.options(),
+  };
+
+  batchnorm::record_op(
+      context,
+      v_output,
+      v_input,
+      v_weight,
+      v_bias,
+      v_running_mean,
+      v_running_var,
+      eps);
+
+  return convert(v_output);
+}
+
 } // namespace ops
 } // namespace vulkan
 } // namespace native
diff --git a/aten/src/ATen/native/vulkan/ops/Batchnorm.h b/aten/src/ATen/native/vulkan/ops/Batchnorm.h
new file mode 100644
index 0000000000000..6afaeb6f243b3
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/ops/Batchnorm.h
@@ -0,0 +1,68 @@
+#pragma once
+
+#ifdef USE_VULKAN_API
+
+#include <ATen/native/vulkan/ops/Common.h>
+#include <ATen/native/vulkan/ops/VulkanPackedContext.h>
+#include <torch/library.h>
+
+namespace at {
+namespace native {
+namespace vulkan {
+namespace ops {
+
+class BatchNormPackedContext final : virtual public VulkanPackedContext,
+                                     public torch::jit::CustomClassHolder {
+ private:
+  c10::impl::GenericList unpacked_;
+
+ public:
+  BatchNormPackedContext(
+      const c10::optional<Tensor>& weight_opt,
+      const c10::optional<Tensor>& bias_opt,
+      const c10::optional<Tensor>& running_mean_opt,
+      const c10::optional<Tensor>& running_var_opt,
+      double eps);
+
+  /*
+   * Assigns a name to each index in the packed/unpacked list.
+   */
+  struct ListArgs final {
+    static constexpr uint32_t kWeight = 0u;
+    static constexpr uint32_t kBias = 1u;
+    static constexpr uint32_t kRunningMean = 2u;
+    static constexpr uint32_t kRunningVar = 3u;
+    static constexpr uint32_t kEps = 4u;
+
+    static constexpr uint32_t kNumArgs = 5u;
+  };
+
+  static BatchNormPackedContext pack(c10::impl::GenericList);
+
+  const c10::impl::GenericList unpack() const override {
+    TORCH_CHECK(unpacked_.size() > 0u, "unpacked_ does not have any elements!");
+
+    return unpacked_;
+  }
+};
+
+c10::intrusive_ptr<BatchNormPackedContext> create_batchnorm_context(
+    c10::optional<Tensor>&& weight_opt,
+    c10::optional<Tensor>&& bias_opt,
+    c10::optional<Tensor>&& running_mean_opt,
+    c10::optional<Tensor>&& running_var_opt,
+    bool training,
+    double /* momentum */,
+    double eps,
+    bool /* cudnn_enable, deprecated */);
+
+Tensor run_batchnorm_context(
+    const Tensor& input_arg,
+    const c10::intrusive_ptr<BatchNormPackedContext>& context);
+
+} // namespace ops
+} // namespace vulkan
+} // namespace native
+} // namespace at
+
+#endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/ops/Register.cpp b/aten/src/ATen/native/vulkan/ops/Register.cpp
index 18d5a6facfaed..25f0a6d99ec78 100644
--- a/aten/src/ATen/native/vulkan/ops/Register.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Register.cpp
@@ -1,5 +1,6 @@
 #ifdef USE_VULKAN_API
 
+#include <ATen/native/vulkan/ops/Batchnorm.h>
 #include <ATen/native/vulkan/ops/Common.h>
 #include <ATen/native/vulkan/ops/Convolution.h>
 #include <ATen/native/vulkan/ops/Gru.h>
@@ -16,6 +17,19 @@ namespace ops {
 namespace {
 
 TORCH_LIBRARY(vulkan, m) {
+  m.class_<BatchNormPackedContext>("BatchNormPackedContext")
+      .def_pickle(
+          // __getstate__
+          [](const c10::intrusive_ptr<BatchNormPackedContext>& context) {
+            // context is packed
+            return context->unpack();
+          },
+          // __setstate__
+          [](c10::impl::GenericList state) {
+            // state is unpacked
+            return c10::make_intrusive<BatchNormPackedContext>(
+                BatchNormPackedContext::pack(state));
+          });
   m.class_<LinearPackedContext>("LinearPackedContext")
       .def_pickle(
           // __getstate__
@@ -147,6 +161,22 @@ TORCH_LIBRARY(vulkan_prepack, m) {
       "Tensor hx_vk, "
       "Tensor cx_vk, "
       "__torch__.torch.classes.vulkan.LstmPackedContext L_prepack) -> (Tensor next_input, Tensor hidden_state, Tensor cell_state)"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "vulkan_prepack::create_batchnorm_context("
+      "Tensor? weight_opt, "
+      "Tensor? bias_opt, "
+      "Tensor? running_mean_opt, "
+      "Tensor? running_var_opt, "
+      "bool training, "
+      "float momentum, "
+      "float eps, "
+      "bool cudnn_enable) "
+      "-> __torch__.torch.classes.vulkan.BatchNormPackedContext"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "vulkan_prepack::run_batchnorm_context("
+      "Tensor input_vk, "
+      "__torch__.torch.classes.vulkan.BatchNormPackedContext context) "
+      "-> Tensor out"));
 }
 
 TORCH_LIBRARY_IMPL(vulkan_prepack, CPU, m) {
@@ -168,6 +198,9 @@ TORCH_LIBRARY_IMPL(vulkan_prepack, CPU, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("vulkan_prepack::create_lstm_context"),
       TORCH_FN(create_lstm_context));
+  m.impl(
+      TORCH_SELECTIVE_NAME("vulkan_prepack::create_batchnorm_context"),
+      TORCH_FN(create_batchnorm_context));
 }
 
 TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) {
@@ -189,6 +222,9 @@ TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("vulkan_prepack::run_lstm_context"),
       TORCH_FN(run_lstm_context));
+  m.impl(
+      TORCH_SELECTIVE_NAME("vulkan_prepack::run_batchnorm_context"),
+      TORCH_FN(run_batchnorm_context));
 }
 
 } // namespace
diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index 2519267f75dc3..a9dc1908100b0 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -630,7 +630,7 @@ TEST_F(VulkanAPITest, batch_norm_invalid_inputs) {
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
-      true,
+      false,
       0.1,
       1e-05,
       false);
@@ -644,7 +644,7 @@ TEST_F(VulkanAPITest, batch_norm_invalid_inputs) {
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
-      true,
+      false,
       0.1,
       1e-05,
       false);
@@ -658,7 +658,7 @@ TEST_F(VulkanAPITest, batch_norm_invalid_inputs) {
       at::rand({7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({7}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
-      true,
+      false,
       0.1,
       1e-05,
       false);
@@ -672,7 +672,7 @@ TEST_F(VulkanAPITest, batch_norm_invalid_inputs) {
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
-      true,
+      false,
       0.1,
       1e-05,
       false);
@@ -686,7 +686,7 @@ TEST_F(VulkanAPITest, batch_norm_invalid_inputs) {
       at::rand({12}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
-      true,
+      false,
       0.1,
       1e-05,
       false);
@@ -700,7 +700,7 @@ TEST_F(VulkanAPITest, batch_norm_invalid_inputs) {
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({12}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
-      true,
+      false,
       0.1,
       1e-05,
       false);
@@ -714,7 +714,7 @@ TEST_F(VulkanAPITest, batch_norm_invalid_inputs) {
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({8}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
       at::rand({12}, at::device(at::kCPU).dtype(at::kFloat)).vulkan(),
-      true,
+      false,
       0.1,
       1e-05,
       false);
diff --git a/torch/csrc/jit/passes/vulkan_rewrite.cpp b/torch/csrc/jit/passes/vulkan_rewrite.cpp
index cbfa61238a45d..0c37d5b503477 100644
--- a/torch/csrc/jit/passes/vulkan_rewrite.cpp
+++ b/torch/csrc/jit/passes/vulkan_rewrite.cpp
@@ -19,6 +19,24 @@ namespace jit {
 
 namespace {
 
+void insertPrePackedBatchNormOp(std::shared_ptr<Graph>& graph) {
+  std::string batchnorm_pattern = R"(
+    graph(%input, %weight, %bias, %mean, %var, %training, %momentum, %eps, %cudnn_enable):
+        %r = aten::batch_norm(%input, %weight, %bias, %mean, %var, %training, %momentum, %eps, %cudnn_enable)
+        return (%r))";
+  std::string prepacked_ops_pattern = R"(
+    graph(%input, %weight, %bias, %mean, %var, %training, %momentum, %eps, %cudnn_enable):
+        %op_context : __torch__.torch.classes.vulkan.BatchNormPackedContext = vulkan_prepack::create_batchnorm_context(
+            %weight, %bias, %mean, %var, %training, %momentum, %eps, %cudnn_enable)
+        %res = vulkan_prepack::run_batchnorm_context(%input, %op_context)
+        return (%res))";
+
+  SubgraphRewriter batchnorm_rewriter;
+  batchnorm_rewriter.RegisterRewritePattern(
+      batchnorm_pattern, prepacked_ops_pattern);
+  batchnorm_rewriter.runOnGraph(graph);
+}
+
 void insertPrePackedLinearOp(std::shared_ptr<Graph>& graph) {
   // fuse decomposed linear into aten::linear
   FuseLinear(graph);
@@ -265,6 +283,7 @@ void vulkanInsertPrePackedOps(std::shared_ptr<Graph>& graph) {
   insertPrePackedConv2dOp(graph);
   insertPrePackedGruOp(graph);
   insertPrePackedLstmOp(graph);
+  insertPrePackedBatchNormOp(graph);
 }
 
 void vulkanInsertPrePackedOps(script::Module& module) {
@@ -295,7 +314,9 @@ void vulkanFoldPrePackingOps(script::Module& m) {
         (n->kind() ==
          Symbol::fromQualString("vulkan_prepack::create_gru_context")) ||
         (n->kind() ==
-         Symbol::fromQualString("vulkan_prepack::create_lstm_context")));
+         Symbol::fromQualString("vulkan_prepack::create_lstm_context")) ||
+        (n->kind() ==
+         Symbol::fromQualString("vulkan_prepack::create_batchnorm_context")));
   };
   PrePackingOpsFolder(m, filter_fn, "prepack_folding");
 }
@@ -320,18 +341,20 @@ script::Module vulkanOptimizeForMobile(
   auto cloned_module = m.clone();
   cloned_module.eval();
   cloned_module = FoldConvBatchNorm(cloned_module);
-  vulkanInsertPrePackedOps(cloned_module);
   cloned_module = freeze_module(cloned_module, preserved_methods);
+  vulkanInsertPrePackedOps(cloned_module);
+  vulkanFusePrePackedConvWithClamp(cloned_module);
+  vulkanFoldPrePackingOps(cloned_module);
+  removeDropout(cloned_module);
+  vulkanRemoveMutation(cloned_module);
+
   if (!optimization_blocklist.count(
           MobileOptimizerType::VULKAN_AUTOMATIC_GPU_TRANSFER)) {
     transferInputOutputBackends(cloned_module);
     cloned_module.register_attribute(
         "requires_backend_transfers", BoolType::get(), false);
   }
-  vulkanFusePrePackedConvWithClamp(cloned_module);
-  vulkanFoldPrePackingOps(cloned_module);
-  removeDropout(cloned_module);
-  vulkanRemoveMutation(cloned_module);
+
   // remove duplicated constants
   vulkanRunCanonicalOptimizations(cloned_module);
   eliminateDeadCode(cloned_module);

From 70a7dfc221b86bd0ad8b7054b07c9d1ea0367d96 Mon Sep 17 00:00:00 2001
From: Codrin Popa <codrin@meta.com>
Date: Fri, 4 Nov 2022 19:31:16 +0000
Subject: [PATCH 0576/1922] Modified roundup_power2_divisions to specify the
 number of divisions for each power of two interval (#87290)

Summary:
Improved roundup_power2_divisions knob so it allows better control of rouding in the PyTorch CUDA Caching Allocator.

This new version allows setting the number of divisions per power of two interval starting from 1MB and ending at 64GB and above. An example use case is when rouding is desirable for small allocations but there are also very large allocations which are persistent, thus would not benefit from rounding and take up extra space.

Test Plan: Tested locally

Differential Revision: D40103909

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87290
Approved by: https://github.com/zdevito
---
 c10/cuda/CUDACachingAllocator.cpp | 381 +++++++++++++++++++++---------
 docs/source/notes/cuda.rst        |   6 +
 test/test_cuda.py                 |  24 ++
 3 files changed, 297 insertions(+), 114 deletions(-)

diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 8446c25669d77..1b38f08b1e90b 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -104,6 +104,7 @@ constexpr size_t kLargeBuffer =
 constexpr size_t kMinLargeAlloc =
     10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
 constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB
+constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
 
 namespace {
 
@@ -406,11 +407,24 @@ class CachingAllocatorConfig {
   // More description below in function roundup_power2_next_division
   // As ane example, if we want 4 divisions between 2's power, this can be done
   // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
-  static size_t roundup_power2_divisions() {
-    return instance().m_roundup_power2_divisions;
-  }
-  static size_t roundup_bypass_threshold() {
-    return instance().m_roundup_bypass_threshold;
+  static size_t roundup_power2_divisions(size_t size) {
+    size_t log_size = (63 - llvm::countLeadingZeros(size));
+
+    // Our intervals start at 1MB and end at 64GB
+    const size_t interval_start =
+        63 - llvm::countLeadingZeros(static_cast<size_t>(1048576));
+    const size_t interval_end =
+        63 - llvm::countLeadingZeros(static_cast<size_t>(68719476736));
+    TORCH_CHECK(
+        (interval_end - interval_start == Native::kRoundUpPowerOfTwoIntervals),
+        "kRoundUpPowerOfTwoIntervals mismatch");
+
+    int index = static_cast<int>(log_size) - static_cast<int>(interval_start);
+
+    index = std::max(0, index);
+    index = std::min(
+        index, static_cast<int>(Native::kRoundUpPowerOfTwoIntervals) - 1);
+    return instance().m_roundup_power2_divisions[index];
   }
 
   static CachingAllocatorConfig& instance() {
@@ -423,128 +437,269 @@ class CachingAllocatorConfig {
     return *s_instance;
   }
 
-  void parseArgs(const char* env) {
-    // If empty, set the default values
-    m_max_split_size = std::numeric_limits<size_t>::max();
-    m_roundup_power2_divisions = 0;
-    m_roundup_bypass_threshold = std::numeric_limits<size_t>::max();
-    m_garbage_collection_threshold = 0;
+  void parseArgs(const char* env);
 
-    if (env == nullptr) {
-      return;
+ private:
+  CachingAllocatorConfig()
+      : m_max_split_size(std::numeric_limits<size_t>::max()),
+        m_garbage_collection_threshold(0) {
+    m_roundup_power2_divisions.assign(Native::kRoundUpPowerOfTwoIntervals, 0);
+  }
+
+  void lexArgs(const char* env, std::vector<std::string>& config);
+  void consumeToken(
+      const std::vector<std::string>& config,
+      size_t i,
+      const char c);
+  size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
+  size_t parseGarbageCollectionThreshold(
+      const std::vector<std::string>& config,
+      size_t i);
+  size_t parseRoundUpPower2Divisions(
+      const std::vector<std::string>& config,
+      size_t i);
+  size_t parseAllocatorConfig(
+      const std::vector<std::string>& config,
+      size_t i,
+      bool& used_cudaMallocAsync);
+
+  std::atomic<size_t> m_max_split_size;
+  std::vector<size_t> m_roundup_power2_divisions;
+  std::atomic<double> m_garbage_collection_threshold;
+};
+
+void CachingAllocatorConfig::lexArgs(
+    const char* env,
+    std::vector<std::string>& config) {
+  std::vector<char> buf;
+
+  size_t env_length = strlen(env);
+  for (size_t i = 0; i < env_length; i++) {
+    if (env[i] == ',' || env[i] == ':' || env[i] == '[' || env[i] == ']') {
+      if (buf.size() != 0) {
+        config.emplace_back(std::string(buf.begin(), buf.end()));
+        buf.clear();
+      }
+      config.emplace_back(std::string(1, env[i]));
+    } else if (env[i] != ' ') {
+      buf.emplace_back(static_cast<char>(env[i]));
     }
+  }
+  if (!buf.empty()) {
+    config.emplace_back(std::string(buf.begin(), buf.end()));
+  }
+}
 
-    const std::string config(env);
+void CachingAllocatorConfig::consumeToken(
+    const std::vector<std::string>& config,
+    size_t i,
+    const char c) {
+  TORCH_CHECK(
+      i < config.size() && config[i].compare(std::string(1, c)) == 0,
+      "Error parsing CachingAllocator settings, expected ",
+      c,
+      "");
+}
 
-    std::regex exp("[\\s,]+");
-    std::sregex_token_iterator it(config.begin(), config.end(), exp, -1);
-    std::sregex_token_iterator end;
-    std::vector<std::string> options(it, end);
+size_t CachingAllocatorConfig::parseMaxSplitSize(
+    const std::vector<std::string>& config,
+    size_t i) {
+  consumeToken(config, ++i, ':');
+  if (++i < config.size()) {
+    size_t val1 = stoi(config[i]);
+    TORCH_CHECK(
+        val1 > Native::kLargeBuffer / (1024 * 1024),
+        "CachingAllocator option max_split_size_mb too small, must be > ",
+        Native::kLargeBuffer / (1024 * 1024),
+        "");
+    val1 = std::max(val1, Native::kLargeBuffer / (1024 * 1024));
+    val1 = std::min(val1, (std::numeric_limits<size_t>::max() / (1024 * 1024)));
+    m_max_split_size = val1 * 1024 * 1024;
+  } else {
+    TORCH_CHECK(false, "Error, expecting max_split_size_mb value", "");
+  }
+  return i;
+}
 
-    bool used_cudaMallocAsync = false;
-    bool used_native_specific_option = false;
+size_t CachingAllocatorConfig::parseGarbageCollectionThreshold(
+    const std::vector<std::string>& config,
+    size_t i) {
+  consumeToken(config, ++i, ':');
+  if (++i < config.size()) {
+    double val1 = stod(config[i]);
+    TORCH_CHECK(
+        val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", "");
+    TORCH_CHECK(
+        val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", "");
+    m_garbage_collection_threshold = val1;
+  } else {
+    TORCH_CHECK(
+        false, "Error, expecting garbage_collection_threshold value", "");
+  }
+  return i;
+}
 
-    for (auto option : options) {
-      std::regex exp2("[:]+");
-      std::sregex_token_iterator it2(option.begin(), option.end(), exp2, -1);
-      std::sregex_token_iterator end2;
-      std::vector<std::string> kv(it2, end2);
-      if (kv.size() >= 2) {
-        /* Maximum split size in MB.  Limited to large size blocks */
-        if (kv[0] == "max_split_size_mb") {
-          size_t val2 = stoi(kv[1]);
+size_t CachingAllocatorConfig::parseRoundUpPower2Divisions(
+    const std::vector<std::string>& config,
+    size_t i) {
+  consumeToken(config, ++i, ':');
+  bool first_value = true;
+
+  if (++i < config.size()) {
+    if (config[i].compare("[") == 0) {
+      size_t last_index = 0;
+      while (++i < config.size() && config[i].compare("]") != 0) {
+        std::string val1 = config[i];
+        size_t val2 = 0;
+
+        consumeToken(config, ++i, ':');
+        if (++i < config.size()) {
+          val2 = stoi(config[i]);
+        } else {
           TORCH_CHECK(
-              val2 > Native::kLargeBuffer / (1024 * 1024),
-              "CachingAllocator option max_split_size_mb too small, must be > ",
-              Native::kLargeBuffer / (1024 * 1024),
-              "");
-          val2 = std::max(val2, Native::kLargeBuffer / (1024 * 1024));
-          val2 = std::min(
-              val2, (std::numeric_limits<size_t>::max() / (1024 * 1024)));
-          m_max_split_size = val2 * 1024 * 1024;
-          used_native_specific_option = true;
-        } else if (kv[0] == "roundup_power2_divisions") {
-          size_t val2 = stoi(kv[1]);
+              false, "Error parsing roundup_power2_divisions value", "");
+        }
+        TORCH_CHECK(
+            llvm::isPowerOf2_64(val2),
+            "For roundups, the divisons has to be power of 2 ",
+            "");
+
+        if (val1.compare(">") == 0) {
+          std::fill(
+              std::next(
+                  m_roundup_power2_divisions.begin(),
+                  static_cast<std::vector<unsigned long>::difference_type>(
+                      last_index)),
+              m_roundup_power2_divisions.end(),
+              val2);
+        } else {
+          size_t val1_long = stoul(val1);
           TORCH_CHECK(
-              llvm::isPowerOf2_64(val2),
-              "For roundups, the divisons has to be power of 2 ",
+              llvm::isPowerOf2_64(val1_long),
+              "For roundups, the intervals have to be power of 2 ",
               "");
-          m_roundup_power2_divisions = val2;
-          used_native_specific_option = true;
-        } else if (kv[0] == "roundup_bypass_threshold_mb") {
-          size_t val2 = stoi(kv[1]);
-          m_roundup_bypass_threshold = val2 * 1024 * 1024;
-          used_native_specific_option = true;
-        } else if (kv[0] == "backend") {
-          TORCH_CHECK(
-              ((kv[1] == "native") || (kv[1] == "cudaMallocAsync")),
-              "Unknown allocator backend, "
-              "options are native and cudaMallocAsync");
-          used_cudaMallocAsync = (kv[1] == "cudaMallocAsync");
-          if (used_cudaMallocAsync) {
-#if CUDA_VERSION >= 11040
-            int version;
-            C10_CUDA_CHECK(cudaDriverGetVersion(&version));
-            TORCH_CHECK(
-                version >= 11040,
-                "backend:cudaMallocAsync requires CUDA runtime "
-                "11.4 or newer, but cudaDriverGetVersion returned ",
-                version);
-#else
-            TORCH_CHECK(
-                false,
-                "backend:cudaMallocAsync requires PyTorch to be built with "
-                "CUDA 11.4 or newer, but CUDA_VERSION is ",
-                CUDA_VERSION);
-#endif
+
+          size_t index = 63 - llvm::countLeadingZeros(val1_long);
+          index = std::max((size_t)0, index);
+          index = std::min(index, m_roundup_power2_divisions.size() - 1);
+
+          if (first_value) {
+            std::fill(
+                m_roundup_power2_divisions.begin(),
+                std::next(
+                    m_roundup_power2_divisions.begin(),
+                    static_cast<std::vector<unsigned long>::difference_type>(
+                        index)),
+                val2);
+            first_value = false;
           }
-          TORCH_INTERNAL_ASSERT(
-              kv[1] == get()->name(),
-              "Allocator backend parsed at runtime != "
-              "allocator backend parsed at load time");
-        } else if (kv[0] == "garbage_collection_threshold") {
-          /*
-           * Perform garbage collection of GPU memory blocks to avoid
-           * triggering expensive sync-and-reclaim-all operation. Upon setting
-           * the threshold (e.g., 0.8), the allocator will start reclaiming
-           * blocks if GPU memory capacity usage exceeds the threshold (i.e.,
-           * 80% of total memory).
-           * Values 0.0 and 1.0 are not allowed as they are less meaningful.
-           */
-          double val2 = stod(kv[1]);
-          TORCH_CHECK(
-              val2 > 0,
-              "garbage_collect_threshold too small, set it 0.0~1.0",
-              "");
-          TORCH_CHECK(
-              val2 < 1.0,
-              "garbage_collect_threshold too big, set it 0.0~1.0",
-              "");
-          m_garbage_collection_threshold = val2;
-          used_native_specific_option = true;
-        } else {
-          TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", kv[0]);
+          if (index < m_roundup_power2_divisions.size()) {
+            m_roundup_power2_divisions[index] = val2;
+          }
+          last_index = index;
         }
-      }
 
-      if (used_cudaMallocAsync && used_native_specific_option) {
-        TORCH_WARN(
-            "backend:cudaMallocAsync ignores max_split_size_mb, roundup_bypass_threshold_mb,"
-            "roundup_power2_divisions, and garbage_collect_threshold.");
+        if (config[i + 1].compare("]") != 0) {
+          consumeToken(config, ++i, ',');
+        }
       }
+    } else { // Keep this for backwards compatibility
+      size_t val1 = stoi(config[i]);
+      TORCH_CHECK(
+          llvm::isPowerOf2_64(val1),
+          "For roundups, the divisons has to be power of 2 ",
+          "");
+      std::fill(
+          m_roundup_power2_divisions.begin(),
+          m_roundup_power2_divisions.end(),
+          val1);
     }
+  } else {
+    TORCH_CHECK(false, "Error, expecting roundup_power2_divisions value", "");
   }
+  return i;
+}
 
- private:
-  CachingAllocatorConfig()
-      : m_max_split_size(std::numeric_limits<size_t>::max()),
-        m_roundup_power2_divisions(0),
-        m_garbage_collection_threshold(0) {}
-  std::atomic<size_t> m_max_split_size;
-  std::atomic<size_t> m_roundup_power2_divisions;
-  std::atomic<size_t> m_roundup_bypass_threshold;
-  std::atomic<double> m_garbage_collection_threshold;
-};
+size_t CachingAllocatorConfig::parseAllocatorConfig(
+    const std::vector<std::string>& config,
+    size_t i,
+    bool& used_cudaMallocAsync) {
+  consumeToken(config, ++i, ':');
+  if (++i < config.size()) {
+    TORCH_CHECK(
+        ((config[i] == "native") || (config[i] == "cudaMallocAsync")),
+        "Unknown allocator backend, "
+        "options are native and cudaMallocAsync");
+    used_cudaMallocAsync = (config[i] == "cudaMallocAsync");
+    if (used_cudaMallocAsync) {
+#if CUDA_VERSION >= 11040
+      int version;
+      C10_CUDA_CHECK(cudaDriverGetVersion(&version));
+      TORCH_CHECK(
+          version >= 11040,
+          "backend:cudaMallocAsync requires CUDA runtime "
+          "11.4 or newer, but cudaDriverGetVersion returned ",
+          version);
+#else
+      TORCH_CHECK(
+          false,
+          "backend:cudaMallocAsync requires PyTorch to be built with "
+          "CUDA 11.4 or newer, but CUDA_VERSION is ",
+          CUDA_VERSION);
+#endif
+    }
+    TORCH_INTERNAL_ASSERT(
+        config[i] == get()->name(),
+        "Allocator backend parsed at runtime != "
+        "allocator backend parsed at load time");
+  } else {
+    TORCH_CHECK(false, "Error parsing backend value", "");
+  }
+  return i;
+}
+
+void CachingAllocatorConfig::parseArgs(const char* env) {
+  // If empty, set the default values
+  m_max_split_size = std::numeric_limits<size_t>::max();
+  m_roundup_power2_divisions.assign(Native::kRoundUpPowerOfTwoIntervals, 0);
+  m_garbage_collection_threshold = 0;
+  bool used_cudaMallocAsync = false;
+  bool used_native_specific_option = false;
+
+  if (env == nullptr) {
+    return;
+  }
+
+  std::vector<std::string> config;
+  lexArgs(env, config);
+
+  for (size_t i = 0; i < config.size(); i++) {
+    if (config[i].compare("max_split_size_mb") == 0) {
+      i = parseMaxSplitSize(config, i);
+      used_native_specific_option = true;
+    } else if (config[i].compare("garbage_collection_threshold") == 0) {
+      i = parseGarbageCollectionThreshold(config, i);
+      used_native_specific_option = true;
+    } else if (config[i].compare("roundup_power2_divisions") == 0) {
+      i = parseRoundUpPower2Divisions(config, i);
+      used_native_specific_option = true;
+    } else if (config[i].compare("backend") == 0) {
+      i = parseAllocatorConfig(config, i, used_cudaMallocAsync);
+    } else {
+      TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i]);
+    }
+
+    if (i + 1 < config.size()) {
+      consumeToken(config, ++i, ',');
+    }
+  }
+
+  if (used_cudaMallocAsync && used_native_specific_option) {
+    TORCH_WARN(
+        "backend:cudaMallocAsync ignores max_split_size_mb, roundup_bypass_threshold_mb,"
+        "roundup_power2_divisions, and garbage_collect_threshold.");
+  }
+}
 
 namespace Native {
 
@@ -1137,10 +1292,8 @@ class DeviceCachingAllocator {
   static size_t round_size(size_t size) {
     if (size < kMinBlockSize) {
       return kMinBlockSize;
-    } else if (size > CachingAllocatorConfig::roundup_bypass_threshold()) {
-      return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
     } else {
-      auto divisions = CachingAllocatorConfig::roundup_power2_divisions();
+      auto divisions = CachingAllocatorConfig::roundup_power2_divisions(size);
       if (divisions > 0 && size > (kMinBlockSize * divisions)) {
         return roundup_power2_next_division(size, divisions);
       } else {
diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst
index b376adcff2554..8eed57cfbd964 100644
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@@ -394,6 +394,12 @@ Available options:
   the size 1200 lies between 1024 and 2048 and if we do 4 divisions between
   them, the values are 1024, 1280, 1536, and 1792. So, allocation size of 1200
   will be rounded to 1280 as the nearest ceiling of power-2 division.
+  Specify a single value to apply for all allocation sizes or specify an
+  array of key value pairs to set power-2 division individually for each
+  power of two interval. For example to set 1 division for all allocations
+  under 256MB, 2 division for allocations between 256MB and 512MB, 4 divisions
+  for allocations between 512MB and 1GB and 8 divisions for any larger allocations,
+  set the knob value to: [256:1,512:2,1024:4,>:8].
   ``roundup_power2_divisions`` is only meaningful with ``backend:native``.
   With ``backend:cudaMallocAsync``, ``roundup_power2_divisions`` is ignored.
 * ``roundup_bypass_threshold_mb`` bypass rounding the requested allocation size,
diff --git a/test/test_cuda.py b/test/test_cuda.py
index a99de422c7ae4..9128ea0937151 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -4767,10 +4767,14 @@ def power2_div(size, div_factor):
         nelems = 21 * 1024 * 1024
         nbytes = 4 * nelems  # floats are 4 bytes
 
+        nelems_big = 100 * 1024 * 1024
+        nbytes_big = 4 * nelems_big  # floats are 4 bytes
+
         start_mem = torch.cuda.memory_stats()[key]
         torch.cuda.memory._set_allocator_settings("")
         x = torch.rand(nelems, device='cuda')
 
+        # test roundup_power2_divisions single value syntax
         reg_mem = torch.cuda.memory_stats()[key]
         torch.cuda.memory._set_allocator_settings("roundup_power2_divisions:4")
         y = torch.rand(nelems, device='cuda')
@@ -4792,6 +4796,26 @@ def power2_div(size, div_factor):
         reg_mem = torch.cuda.memory_stats()[key]
         self.assertTrue(reg_mem - start_mem == nbytes)
 
+        # roundup_power2_divisions knob array syntax
+        torch.cuda.memory.empty_cache()
+        torch.cuda.memory._set_allocator_settings(
+            "garbage_collection_threshold:0.5,roundup_power2_divisions:[64:8,128:2,256:2,512:2,1024:1,>:1]")
+        start_mem = torch.cuda.memory_stats()[key]
+        w = torch.rand(nelems, device='cuda')
+
+        pow2_div8_mem = torch.cuda.memory_stats()[key]
+        if not TEST_CUDAMALLOCASYNC:
+            # not supported with the cudaMallocAsync backend
+            self.assertTrue(pow2_div8_mem - start_mem == power2_div(nbytes, 8))
+
+        torch.cuda.memory.empty_cache()
+        start_mem = torch.cuda.memory_stats()[key]
+        v = torch.rand(nelems_big, device='cuda')
+
+        pow2_div2_mem = torch.cuda.memory_stats()[key]
+        if not TEST_CUDAMALLOCASYNC:
+            # not supported with the cudaMallocAsync backend
+            self.assertTrue(pow2_div2_mem - start_mem == power2_div(nbytes_big, 2))
 
         with self.assertRaises(RuntimeError):
             torch.cuda.memory._set_allocator_settings("foo:1,bar:2")

From 018937e6ec5e001073d5e63c798e9a41dbda864c Mon Sep 17 00:00:00 2001
From: "Li-Huai (Allan) Lin" <qqaatw@gmail.com>
Date: Fri, 4 Nov 2022 19:43:56 +0000
Subject: [PATCH 0577/1922] [MPS] Fix embedding backward with scalar index
 (#82809)

### Description
Previously the embedding backward always expands `-1` dim to indices, resulting in the following error when the indices is a scalar:

```
 error: Rank of data array must equal number of outer dimensions in indices array + rank of slice to update, 2 != 1 + 0
-:8:10: note: see current operation: %5 = "mps.scatter_nd"(%0, %arg1, %4) {batch_dims = 0 : ui32, mode = 0 : i32} : (tensor<10x5xf16>,
```

Now makes it conditional.

Reproducer:

```python
def repro():
    w = torch.tensor([[-2.6465,  2.5859,  0.4688,  1.7949,  3.2676],
        [-3.1641,  8.9375,  5.7578, -2.9453, -6.5469],
        [ 2.0469,  1.3516, -8.7344,  6.0000,  1.3906],
        [ 6.5781,  7.8438,  6.9766,  3.2891, -5.1172],
        [-7.9414,  7.7344,  4.1875,  2.8574,  2.9531],
        [-0.4844, -5.6328, -6.8359, -4.5156,  3.7891],
        [ 4.9375,  6.6094,  6.7031,  0.6719, -6.4219],
        [ 7.0469,  8.2031,  4.4453,  1.7129, -2.4688],
        [ 1.2207, -3.3750, -2.4531,  7.4062, -6.0469],
        [-8.9688,  2.2656,  2.4160, -1.0176,  8.4531]], dtype=torch.float32, requires_grad=True)
    x = torch.tensor(5)
    out = torch.nn.functional.embedding(x, w)
    out.sum().backward()

    w_mps = w.detach().clone().to("mps").requires_grad_()
    x_mps = x.to("mps")
    out = torch.nn.functional.embedding(x_mps, w_mps)
    out.sum().backward() # error
```

### Issue
<!-- Link to Issue ticket or RFP -->

### Testing
<!-- How did you test your change? -->

Pull Request resolved: https://github.com/pytorch/pytorch/pull/82809
Approved by: https://github.com/malfet
---
 aten/src/ATen/native/mps/operations/Indexing.mm | 11 +++++++----
 test/test_mps.py                                | 13 +++++++------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 65d27ba757935..9c27aae9b0b01 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -645,11 +645,14 @@ Tensor embedding_dense_backward_mps(
 
             MPSGraphTensor* indicesTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(indices.scalar_type()));
 
-            MPSGraphTensor *reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor:indicesTensor
-                             axes:@[@-1]
-                             name:nil];
+            MPSGraphTensor* reshapedIndicesTensor = indicesTensor;
 
-            MPSGraphTensor *outgoingGradTensor;
+            if (num_indices_dims != 0)
+              reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor:indicesTensor
+                              axes:@[@-1]
+                              name:nil];
+
+            MPSGraphTensor* outgoingGradTensor;
             outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor:incomingGradTensor
                             indicesTensor:reshapedIndicesTensor
                                     shape:native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape.data(), outgoing_gradient_shape.size()))
diff --git a/test/test_mps.py b/test/test_mps.py
index 609a77bb971d3..e4a86e9377249 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4257,33 +4257,34 @@ def helper(shape, dim, index, idx_dtype=torch.int32):
         helper((2, 3, 3), -1, [1, 2])
 
     def test_embedding_dense_backward(self):
-        def helper(n, d, m):
+        def helper(n, d, m, idx):
             embeddingMPS = nn.Embedding(n, d, max_norm=True, device='mps')
             W_MPS = torch.randn((m, d), requires_grad=True, device='mps')
-            idx_MPS = torch.tensor([0, 1, 2]).to('mps')
+            idx_MPS = torch.tensor(idx).to('mps')
             a_MPS = embeddingMPS.weight.clone() @ W_MPS.t()  # weight must be cloned for this to be differentiable
             a_MPS.retain_grad()
             b_MPS = embeddingMPS(idx_MPS) @ W_MPS.t()  # modifies weight in-place
             b_MPS.retain_grad()
-            out_MPS = (a_MPS.unsqueeze(0) + b_MPS.unsqueeze(1))
+            out_MPS = (a_MPS.unsqueeze(0) + b_MPS)
             loss_MPS = out_MPS.sigmoid().prod()
             loss_MPS.backward()
 
             embeddingCPU = nn.Embedding(n, d, max_norm=True, scale_grad_by_freq=True)
             W_CPU = W_MPS.to('cpu')
-            idx_CPU = torch.tensor([0, 1, 2])
+            idx_CPU = torch.tensor(idx)
             a_CPU = embeddingCPU.weight.clone() @ W_CPU.t()  # weight must be cloned for this to be differentiable
             a_CPU.retain_grad()
             b_CPU = embeddingCPU(idx_CPU) @ W_CPU.t()  # modifies weight in-place
             b_CPU.retain_grad()
-            out_CPU = (a_CPU.unsqueeze(0) + b_CPU.unsqueeze(1))
+            out_CPU = (a_CPU.unsqueeze(0) + b_CPU)
             loss_CPU = out_CPU.sigmoid().prod()
             loss_CPU.backward()
 
             self.assertEqual(b_CPU.grad, b_MPS.grad)
             self.assertEqual(a_CPU.grad, a_MPS.grad)
 
-        helper(3, 5, 7)
+        helper(3, 5, 7, [0, 1, 2])
+        helper(3, 5, 7, 2)  # test scalar index
 
     # Test pytorch gather
     def test_gather(self):

From 1719b2c73bf9408f8fa091d21a3e6f1df78b4f0d Mon Sep 17 00:00:00 2001
From: jjsjann123 <alex.jann2012@gmail.com>
Date: Wed, 2 Nov 2022 01:14:05 -0700
Subject: [PATCH 0578/1922] [NVFuser] Upstream push 1026 (#87779)

Syncing nvfuser devel branch to upstream master. https://github.com/csarofeen/pytorch/

Codegen changes include:

* codegen improvement:
    i. allow non-root trivial reductions, allow empty/no-op fusion
    ii. fixes vectorization checks and size calculation
    iii. bank conflict handle improvement
    iv. enables transpose scheduler

* misc:
    i. CI tests failure fixes
    ii. cpp tests file clean up
    iii. trivial forwarding supports added in codegen runtime
    iv. added factory methods support in codegen

Commits that's in this PR from the devel branch:

```
7117a7e37ebec372d9e802fdfb8abb7786960f4a patching nvfuser conv cudnn test numerics mismatch (#2048)
65af1a4e7013f070df1ba33701f2d524de79d096 Inserting sync for redundant parallel types is already done at the (#2023)
6ac74d181689c8f135f60bfc1ec139d88941c98c Fix sync map (#2047)
f5bca333355e2c0033523f3402de5b8aac602c00 Bank conflict checker improvements (#2032)
d2ca7e3fd203537946be3f7b435303c60fa7f51e Minor update on cp.async code generation. (#1901)
d36cf61f5570c9c992a748126287c4e7432228e0 Test file cleanup (#2040)
0b8e83f49c2ea9f04a4aad5061c1e7f4268474c6 Allow non-root trivial reductions (#2037)
a2dfe40b27cd3f5c04207596f0a1818fbd5e5439 Fix vectorize size calculation (#2035)
e040676a317fe34ea5875276270c7be88f6eaa56 Use withPredicate to replace setPredicate to maintain Exprs immutable (#2025)
197221b847ad5eb347d7ec1cf2706733aacbf97c removing ci workflow (#2034)
40e2703d00795526e7855860aa00b9ab7160755f Reduction rand like patch (#2031)
bc772661cbdb3b711d8e9854ae9b8b7052e3e4a3 Add utility for checking bank conflict of shared memory (#2029)
ddd1cf7695f3fb172a0e4bcb8e4004573617a037 Add back FusionReductionWithTrivialReduction_CUDA (#2030)
fbd97e5ef15fa0f7573800e6fbb5743463fd9e57 Revert "Cleanup trivial reduction workarounds (#2006)" (#2024)
bca20c1dfb8aa8d881fc7973e7579ce82bc6a894 Cleanup trivial reduction workarounds (#2006)
e4b65850eee1d70084105bb6e1f290651adde23e Trivial forwarding (#1995)
1a0e355b5027ed0df501989194ee8f2be3fdd37a Fix contiguity analysis of predicates to match updated contiguity. (#1991)
a4effa6a5f7066647519dc56e854f4c8a2efd2a7 Enable output allocation cache (#2010)
35440b7953ed8da164a5fb28f87d7fd760ac5e00 Patching bn inference (#2016)
0f9f0b4060dc8ca18dc65779cfd7e0776b6b38e8 Add matmul benchmark (#2007)
45045cd05ea268f510587321dbcc8d7c2977cdab Enable tests previously disabled due to an aliasing bug (#2005)
967aa77d2c8e360c7c01587522eec1c1d377c87e Contiguous indexing for View operations (#1990)
a43cb20f48943595894e345865bc1eabf58a5b48 Make inlining even more modular (#2004)
dc458358c0ac91dfaf4e6655a9b3fc206fc0c897 Test util cleanup (#2003)
3ca21ebe4d213f0070ffdfa4ae5d7f6cb0b8e870 More strict validation (#2000)
a7a7d573310c4707a9f381831d3114210461af01 Fix build problem (#1999)
fc235b064e27921fa9d6dbb9dc7055e5bae1c222 Just fixes comments (#1998)
482386c0509fee6edb2964c5ae72074791f3e43a cleanup (#1997)
4cbe0db6558a82c3097d281eec9c85ad2ea0893a Improve divisible split detection (#1970)
42ccc52bdc18bab0330f4b93ed1399164e2980c9 Minor build fix. (#1996)
fcf8c091f72d46f3055975a35afd06263324ede6 Cleanup of lower_utils.cpp: Isolate out GpuLower usage (#1989)
15f2f6dba8cbf408ec93c344767c1862c30f7ecc Move ConcretizedBroadcastDomains to shared_ptr in GpuLower. (#1988)
8f1c7f52679a3ad6acfd419d28a2f4be4a7d89e2 Minor cleanup lower_unroll.cpp (#1994)
1d9858c80319ca7f0037db7de5f04e47f540d76c Minor cleanup (#1992)
f262d9cab59f41c669f53799c6d4a6b9fc4267eb Add support for uniform RNG (#1986)
eb1dad10c73f855eb1ecb20a8b1f7b6edb0c9ea3 Remove non-const functions, remove GpuLower instance on build, pass in ca_map. (#1987)
634820c5e3586c0fe44132c51179b3155be18072 Add support for some empty fusion (#1981)
eabe8d844ad765ee4973faa4821d451ef71b83c3 Segment self mapping fusions (#1954)
e96aacfd9cf9b3c6d08f120282762489bdf540c8 Enable Transpose operation (#1882)
425dce2777420248e9f08893765b5402644f4161 Add a null scheduler that helps segmenting away no-op schedules (#1835)
306d4a68f127dd1b854b749855e48ba23444ba60 Fix canScheduleCompileTime check of transpose scheduler (#1969)
b1bd32cc1b2ae7bbd44701477bddbcfa6642a9be Minor fix (#1967)
bd93578143c1763c1e00ba613a017f8130a6b989 Enable transpose scheduler (#1927)
b7a206e93b4ac823c791c87f12859cf7af264a4c Move scheduler vectorize utilities into their own file (#1959)
d9420e4ca090489bf210e68e9912bb059b895baf View scheduling (#1928)
c668e13aea0cf21d40f95b48e0163b812712cdf2 Upstream push ci fixes (#1965)
c40202bb40ce955955bb97b12762ef3b6b612997 Fix dump effective bandwidth (#1962)
93505bcbb90a7849bd67090fe5708d867e8909e4 WAR on index mapping when exact and permissive maps differ (#1960)
45e95fd1d3c773ee9b2a21d79624c279d269da9f Allow splitting inner-most ID to create virtual innermost ID in transpose scheduler (#1930)
a3ecb339442131f87842eb56955e4f17c544e99f Improve the comments at the beginning of index_compute.h (#1946)
f7bc3417cc2923a635042cc6cc361b2f344248d6 Remove unused variables (#1955)
df3393adbb5cb0309d091f358cfa98706bd4d313 Some cleanup (#1957)
7d1d7c8724ab5a226fad0f5a80feeac04975a496 TVDomainGuard factory (#1953)
357ba224c0fb41ed3e4e8594d95599c973f4a0ca Fill allocation with nan on tests (#1956)
8eafc54685d406f5ac527bcbacc475fda4492d7a Fix detection of unmappable root domains (#1952)
90a51f282601ba8ebd4c84b9334efd7762a234bc Some indexing cleanups, Add eye support (#1940)
ddc01e4e16428aec92f9c84d698f959b6436a971 Exclude unsupported data types (#1951)
992e17c0688fe690c51b50e81a75803621b7e6aa test the groups the same order as they are merged (#1949)
208262b75d1fed0597a0329d61d57bc8bcd7ff14 Move detection of self mapping IDs to IterDomainGraph from (#1941)
ac4de38c6ee53b366e85fdfe408c3642d32b57df Merge pull request #1945 from csarofeen/master_merge_0828
631094891a96f715d8c9925fb73d41013ca7f2e3 Add full, full_like, zeros, zeros_like, ones, ones_like (#1943)
aab10bce4541204c46b91ff0f0ed9878aec1bfc4 Merge remote-tracking branch 'upstream/viable/strict' into HEAD
4c254c063bb55887b45677e3812357556a7aa80d Fix arange when step is negative (#1942)
89330aa23aa804340b2406ab58899d816e3dc3d2 Tensor factories must set the output shape as its input (#1939)
```

RUN_TORCHBENCH: nvfuser

Differential Revision: [D40869846](https://our.internmc.facebook.com/intern/diff/D40869846)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87779
Approved by: https://github.com/davidberard98
---
 aten/src/ATen/core/interned_strings.h         |     3 +
 benchmarks/cpp/nvfuser/CMakeLists.txt         |     1 +
 .../cpp/nvfuser/batch_norm_channels_first.cpp |     4 -
 .../batch_norm_channels_first_backward.cpp    |     4 -
 .../cpp/nvfuser/batch_norm_channels_last.cpp  |     4 -
 .../batch_norm_channels_last_backward.cpp     |     4 -
 benchmarks/cpp/nvfuser/gelu_backward.cpp      |     3 -
 benchmarks/cpp/nvfuser/layer_norm.cpp         |     2 -
 .../cpp/nvfuser/layer_norm_backward.cpp       |     3 -
 benchmarks/cpp/nvfuser/matmul.cpp             |   357 +
 benchmarks/cpp/nvfuser/rms_norm.cpp           |     2 -
 benchmarks/cpp/nvfuser/rms_norm_backward.cpp  |     3 -
 benchmarks/cpp/nvfuser/timm.cpp               |    11 +-
 build_variables.bzl                           |     5 +-
 test/cpp/jit/CMakeLists.txt                   |     6 +-
 test/test_jit_cuda_fuser.py                   |   123 +-
 torch/csrc/jit/codegen/cuda/arith.cpp         |   131 +-
 torch/csrc/jit/codegen/cuda/arith.h           |    36 +-
 torch/csrc/jit/codegen/cuda/codegen.cpp       |    57 +-
 torch/csrc/jit/codegen/cuda/compute_at.cpp    |    21 +-
 torch/csrc/jit/codegen/cuda/compute_at.h      |     2 +-
 .../csrc/jit/codegen/cuda/compute_at_map.cpp  |   104 +-
 torch/csrc/jit/codegen/cuda/compute_at_map.h  |    23 +-
 torch/csrc/jit/codegen/cuda/contiguity.cpp    |   650 +-
 torch/csrc/jit/codegen/cuda/contiguity.h      |   198 +-
 torch/csrc/jit/codegen/cuda/disjoint_set.h    |    11 +-
 torch/csrc/jit/codegen/cuda/dispatch.cpp      |    30 +
 torch/csrc/jit/codegen/cuda/dispatch.h        |     8 +
 torch/csrc/jit/codegen/cuda/dynamic_type.h    |     8 +
 .../jit/codegen/cuda/evaluator_common.cpp     |    12 +-
 torch/csrc/jit/codegen/cuda/executor.cpp      |   112 +-
 torch/csrc/jit/codegen/cuda/executor.h        |     4 +
 .../csrc/jit/codegen/cuda/executor_utils.cpp  |    11 +-
 .../csrc/jit/codegen/cuda/expr_evaluator.cpp  |    31 +-
 torch/csrc/jit/codegen/cuda/expr_evaluator.h  |     6 +
 torch/csrc/jit/codegen/cuda/fusion.cpp        |    15 +
 torch/csrc/jit/codegen/cuda/fusion.h          |     4 +
 .../jit/codegen/cuda/fusion_segmenter.cpp     |     2 +-
 torch/csrc/jit/codegen/cuda/graph_fuser.cpp   |    10 +-
 .../jit/codegen/cuda/grouped_reduction.cpp    |    18 +-
 torch/csrc/jit/codegen/cuda/index_compute.cpp |   351 +-
 torch/csrc/jit/codegen/cuda/index_compute.h   |    89 +-
 .../jit/codegen/cuda/inline_propagator.cpp    |   385 -
 .../csrc/jit/codegen/cuda/inline_propagator.h |   118 -
 torch/csrc/jit/codegen/cuda/inlining.cpp      |   306 +
 torch/csrc/jit/codegen/cuda/inlining.h        |   100 +
 torch/csrc/jit/codegen/cuda/interface.cpp     |    56 +
 torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp |    19 +
 torch/csrc/jit/codegen/cuda/ir_base_nodes.h   |    19 +-
 torch/csrc/jit/codegen/cuda/ir_builder.cpp    |     2 +
 torch/csrc/jit/codegen/cuda/ir_cloner.cpp     |     8 +
 torch/csrc/jit/codegen/cuda/ir_cloner.h       |     2 +
 torch/csrc/jit/codegen/cuda/ir_graphviz.cpp   |    29 +-
 torch/csrc/jit/codegen/cuda/ir_graphviz.h     |     2 +
 .../jit/codegen/cuda/ir_interface_nodes.h     |    29 +-
 .../csrc/jit/codegen/cuda/ir_internal_nodes.h |   161 +-
 torch/csrc/jit/codegen/cuda/ir_iostream.cpp   |    66 +-
 torch/csrc/jit/codegen/cuda/ir_iostream.h     |     2 +
 torch/csrc/jit/codegen/cuda/ir_nodes.cpp      |   361 +-
 torch/csrc/jit/codegen/cuda/ir_utils.cpp      |    56 +-
 torch/csrc/jit/codegen/cuda/ir_utils.h        |     7 +-
 torch/csrc/jit/codegen/cuda/iter_visitor.cpp  |   141 +-
 torch/csrc/jit/codegen/cuda/iter_visitor.h    |    97 +-
 torch/csrc/jit/codegen/cuda/kernel_cache.cpp  |    39 +-
 .../codegen/cuda/kernel_expr_evaluator.cpp    |     4 +
 torch/csrc/jit/codegen/cuda/kernel_ir.cpp     |   194 +-
 torch/csrc/jit/codegen/cuda/kernel_ir.h       |    63 +-
 torch/csrc/jit/codegen/cuda/lower2device.cpp  |    14 +-
 torch/csrc/jit/codegen/cuda/lower2device.h    |    27 +-
 .../jit/codegen/cuda/lower_alias_memory.cpp   |    20 +-
 .../jit/codegen/cuda/lower_allocation.cpp     |    10 +-
 .../jit/codegen/cuda/lower_bank_conflict.cpp  |   332 +
 .../jit/codegen/cuda/lower_bank_conflict.h    |    46 +
 .../codegen/cuda/lower_divisible_split.cpp    |   121 +
 .../jit/codegen/cuda/lower_divisible_split.h  |    29 +
 .../csrc/jit/codegen/cuda/lower_expr_sort.cpp |     5 +-
 torch/csrc/jit/codegen/cuda/lower_index.cpp   |   119 +-
 torch/csrc/jit/codegen/cuda/lower_index.h     |     2 +
 .../jit/codegen/cuda/lower_index_compute.cpp  |   143 +-
 .../jit/codegen/cuda/lower_insert_syncs.cpp   |     4 +-
 torch/csrc/jit/codegen/cuda/lower_loops.cpp   |     4 +-
 .../csrc/jit/codegen/cuda/lower_predicate.cpp |    69 +-
 .../cuda/lower_predicate_elimination.cpp      |     8 +-
 torch/csrc/jit/codegen/cuda/lower_shift.cpp   |   136 +-
 torch/csrc/jit/codegen/cuda/lower_shift.h     |    35 +-
 .../codegen/cuda/lower_sync_information.cpp   |    50 +-
 .../codegen/cuda/lower_thread_predicate.cpp   |     7 +-
 .../codegen/cuda/lower_trivial_broadcast.cpp  |     3 +-
 .../codegen/cuda/lower_trivial_broadcast.h    |     3 +-
 torch/csrc/jit/codegen/cuda/lower_unroll.cpp  |    39 +-
 torch/csrc/jit/codegen/cuda/lower_unroll.h    |     2 +
 torch/csrc/jit/codegen/cuda/lower_utils.cpp   |   350 +-
 torch/csrc/jit/codegen/cuda/lower_utils.h     |   116 +-
 .../jit/codegen/cuda/lower_validation.cpp     |     2 +-
 .../jit/codegen/cuda/lower_warp_reduce.cpp    |     6 +-
 torch/csrc/jit/codegen/cuda/manager.cpp       |    12 +-
 torch/csrc/jit/codegen/cuda/mutator.cpp       |    43 +-
 .../jit/codegen/cuda/non_divisible_split.cpp  |    13 +-
 torch/csrc/jit/codegen/cuda/ops/alias.cpp     |    30 +-
 torch/csrc/jit/codegen/cuda/ops/composite.cpp |     2 +-
 .../jit/codegen/cuda/ops/normalization.cpp    |     9 +-
 torch/csrc/jit/codegen/cuda/parser.cpp        |   154 +-
 .../cuda/python_frontend/python_bindings.cpp  |     2 +-
 .../test/test_nvfuser_fusion_cache.cpp        |     1 +
 .../test/test_nvfuser_fusion_definition.cpp   |     1 +
 .../test/test_nvfuser_fusion_record.cpp       |     1 +
 .../csrc/jit/codegen/cuda/reference_tensor.h  |    27 -
 .../csrc/jit/codegen/cuda/root_domain_map.cpp |    61 +-
 torch/csrc/jit/codegen/cuda/root_domain_map.h |     4 +-
 torch/csrc/jit/codegen/cuda/runtime/memory.cu |    25 +
 .../codegen/cuda/runtime/random_numbers.cu    |    20 +
 .../codegen/cuda/scheduler/all_schedulers.h   |     1 +
 .../cuda/scheduler/compile_time_info.h        |    46 +-
 .../jit/codegen/cuda/scheduler/heuristic.h    |     3 +-
 .../jit/codegen/cuda/scheduler/mma_utils.cpp  |     4 +-
 .../codegen/cuda/scheduler/normalization.cpp  |     4 +-
 .../jit/codegen/cuda/scheduler/pointwise.cpp  |   241 +-
 .../jit/codegen/cuda/scheduler/pointwise.h    |   138 +
 .../codegen/cuda/scheduler/pointwise_utils.h  |     4 -
 .../jit/codegen/cuda/scheduler/reduction.cpp  |     4 +-
 .../cuda/scheduler/reduction_utils.cpp        |    11 +-
 .../jit/codegen/cuda/scheduler/registry.cpp   |   439 +-
 .../jit/codegen/cuda/scheduler/registry.h     |     4 +
 .../jit/codegen/cuda/scheduler/transpose.cpp  |   412 +-
 .../jit/codegen/cuda/scheduler/transpose.h    |     9 +
 .../cuda/scheduler/transpose_heuristic.h      |    11 +-
 .../csrc/jit/codegen/cuda/scheduler/utils.cpp |   609 +-
 torch/csrc/jit/codegen/cuda/scheduler/utils.h |    80 +-
 .../cuda/scheduler/vectorize_helper.cpp       |   286 +
 .../codegen/cuda/scheduler/vectorize_helper.h |    14 +-
 torch/csrc/jit/codegen/cuda/tensor_view.cpp   |   110 +-
 torch/csrc/jit/codegen/cuda/test/test_gpu.cpp | 25813 ----------------
 .../csrc/jit/codegen/cuda/test/test_gpu1.cpp  |  9985 ++++++
 .../csrc/jit/codegen/cuda/test/test_gpu2.cpp  |  9801 ++++++
 .../csrc/jit/codegen/cuda/test/test_gpu3.cpp  |  6538 ++++
 .../cuda/test/test_gpu_fused_reduction.cpp    |     7 +-
 .../jit/codegen/cuda/test/test_gpu_rng.cu     |   108 +-
 .../cuda/test/test_gpu_scheduler_utils.cpp    |    57 -
 .../jit/codegen/cuda/test/test_gpu_shift.cpp  |    67 +
 .../cuda/test/test_gpu_tensor_factories.cpp   |   237 +
 .../codegen/cuda/test/test_gpu_transpose.cpp  |   351 +-
 .../jit/codegen/cuda/test/test_gpu_utils.cpp  |   273 +
 .../codegen/cuda/test/test_gpu_validator.h    |    55 +-
 .../jit/codegen/cuda/test/test_gpu_view.cpp   |   513 +
 torch/csrc/jit/codegen/cuda/test/test_utils.h |   290 +-
 .../csrc/jit/codegen/cuda/transform_iter.cpp  |    14 +-
 torch/csrc/jit/codegen/cuda/type.cpp          |    21 +
 torch/csrc/jit/codegen/cuda/type.h            |     7 +-
 .../csrc/jit/codegen/cuda/type_inference.cpp  |    11 +-
 torch/csrc/jit/codegen/cuda/utils.cpp         |    23 +-
 torch/csrc/jit/codegen/cuda/utils.h           |     9 +-
 torch/csrc/jit/codegen/fuser/codegen.cpp      |     2 +-
 152 files changed, 35136 insertions(+), 28328 deletions(-)
 create mode 100644 benchmarks/cpp/nvfuser/matmul.cpp
 delete mode 100644 torch/csrc/jit/codegen/cuda/inline_propagator.cpp
 delete mode 100644 torch/csrc/jit/codegen/cuda/inline_propagator.h
 create mode 100644 torch/csrc/jit/codegen/cuda/inlining.cpp
 create mode 100644 torch/csrc/jit/codegen/cuda/inlining.h
 create mode 100644 torch/csrc/jit/codegen/cuda/lower_bank_conflict.cpp
 create mode 100644 torch/csrc/jit/codegen/cuda/lower_bank_conflict.h
 create mode 100644 torch/csrc/jit/codegen/cuda/lower_divisible_split.cpp
 create mode 100644 torch/csrc/jit/codegen/cuda/lower_divisible_split.h
 delete mode 100644 torch/csrc/jit/codegen/cuda/reference_tensor.h
 create mode 100644 torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp
 delete mode 100644 torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
 create mode 100644 torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
 create mode 100644 torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
 create mode 100644 torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
 delete mode 100644 torch/csrc/jit/codegen/cuda/test/test_gpu_scheduler_utils.cpp
 create mode 100644 torch/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp

diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h
index dc5860ebf2c4e..80919e52b58fd 100644
--- a/aten/src/ATen/core/interned_strings.h
+++ b/aten/src/ATen/core/interned_strings.h
@@ -50,8 +50,11 @@ namespace c10 {
   _(prim, FunctionalGraph)           \
   _(prim, add_optional)              \
   _(prim, view_copy)                 \
+  _(prim, permute_copy)              \
   _(prim, reshape_copy)              \
   _(prim, squeeze_copy)              \
+  _(prim, t_copy)                    \
+  _(prim, transpose_copy)            \
   _(prim, unsqueeze_copy)            \
   _(prim, flatten_copy)              \
   _(prim, expand_copy)               \
diff --git a/benchmarks/cpp/nvfuser/CMakeLists.txt b/benchmarks/cpp/nvfuser/CMakeLists.txt
index bac36d19f3d16..ad9053bb3a3aa 100644
--- a/benchmarks/cpp/nvfuser/CMakeLists.txt
+++ b/benchmarks/cpp/nvfuser/CMakeLists.txt
@@ -20,6 +20,7 @@ if(USE_CUDA)
     softmax_backward.cpp
     scale_bias_relu.cpp
     transpose.cpp
+    matmul.cpp
     timm.cpp
     utils.cpp
     main.cpp)
diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp
index 723d222516df4..2f839f0c8332a 100644
--- a/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp
+++ b/benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp
@@ -73,10 +73,6 @@ static void NvFuserScheduler_BatchNorm(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(1),
diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp
index af2b4d145fc8f..62a4e99e21ef6 100644
--- a/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp
+++ b/benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp
@@ -25,7 +25,6 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {
   FusionGuard fg(fusion);
 
   const bool kTraining = true;
-  const float kMomentum = 0.1;
   const float kEps = 1e-5;
 
   // setup fusion
@@ -85,9 +84,6 @@ static void NvFuserScheduler_BatchNorm_BWD(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(1),
diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp
index 14fde631aec0b..7b8972a0aad07 100644
--- a/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp
+++ b/benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp
@@ -74,10 +74,6 @@ static void NvFuserScheduler_BatchNorm_nhwc(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(2),
diff --git a/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp b/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp
index 0660b75e39426..29bcfb3e81be7 100644
--- a/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp
+++ b/benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp
@@ -25,7 +25,6 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {
   FusionGuard fg(fusion);
 
   const bool kTraining = true;
-  const float kMomentum = 0.1;
   const float kEps = 1e-5;
 
   // setup fusion
@@ -86,9 +85,6 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(2),
diff --git a/benchmarks/cpp/nvfuser/gelu_backward.cpp b/benchmarks/cpp/nvfuser/gelu_backward.cpp
index e6a24111e848f..732ad7f0ea0fd 100644
--- a/benchmarks/cpp/nvfuser/gelu_backward.cpp
+++ b/benchmarks/cpp/nvfuser/gelu_backward.cpp
@@ -113,9 +113,6 @@ BENCHMARK(GeluBackward_AutoSchedule)->Unit(benchmark::kMicrosecond);
 //------------------------------------------------------------------------------
 
 static void GeluBackward_Lower(benchmark::State& benchmark_state) {
-  constexpr int kHiddenFeatures = 512;
-  constexpr int kBatchSize = 64;
-
   Fusion fusion;
 
   // setup fusion
diff --git a/benchmarks/cpp/nvfuser/layer_norm.cpp b/benchmarks/cpp/nvfuser/layer_norm.cpp
index 316fe22c1ff4f..d2cff09e5d2ed 100644
--- a/benchmarks/cpp/nvfuser/layer_norm.cpp
+++ b/benchmarks/cpp/nvfuser/layer_norm.cpp
@@ -22,7 +22,6 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) {
 
   FusionGuard fg(fusion);
 
-  const int kReductionAxis = 1;
   const float kEps = 1e-5;
 
   Double* eps_ptr = IrBuilder::create<Double>(kEps);
@@ -61,7 +60,6 @@ static void NvFuserScheduler_LayerNorm(
 
   std::vector<int64_t> input_shape{
       benchmark_state.range(0), benchmark_state.range(1)};
-  const float kEps = 1e-5;
 
   // inputs
   at::manual_seed(0);
diff --git a/benchmarks/cpp/nvfuser/layer_norm_backward.cpp b/benchmarks/cpp/nvfuser/layer_norm_backward.cpp
index cce8aa42ce933..c431622e7b9f4 100644
--- a/benchmarks/cpp/nvfuser/layer_norm_backward.cpp
+++ b/benchmarks/cpp/nvfuser/layer_norm_backward.cpp
@@ -22,9 +22,6 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
 
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const int kReductionAxis = 1;
-  Double* eps_ptr = IrBuilder::create<Double>(1e-5);
-
   // setup fusion
   auto grad_out = makeContigTensor(2, dtype);
   auto input = makeContigTensor(2, dtype);
diff --git a/benchmarks/cpp/nvfuser/matmul.cpp b/benchmarks/cpp/nvfuser/matmul.cpp
new file mode 100644
index 0000000000000..25fc6cfe23569
--- /dev/null
+++ b/benchmarks/cpp/nvfuser/matmul.cpp
@@ -0,0 +1,357 @@
+#include <torch/csrc/jit/codegen/cuda/arith.h>
+#include <torch/csrc/jit/codegen/cuda/executor.h>
+#include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/matmul.h>
+
+#include <benchmark/benchmark.h>
+
+#include <cuda_runtime.h>
+
+#include <benchmarks/cpp/nvfuser/utils.h>
+
+using namespace torch::jit::fuser::cuda;
+
+bool cudaArchGuardShouldSkip(int required_major, int required_minor) {
+  int capability_major = at::cuda::getCurrentDeviceProperties()->major;
+  int capability_minor = at::cuda::getCurrentDeviceProperties()->minor;
+
+  if (capability_major < required_major ||
+      (capability_major == required_major &&
+       capability_minor < required_minor)) {
+    return true;
+  }
+  return false;
+}
+
+bool hasRequiredSmemSize(size_t required_size) {
+  // Only checking device 0
+  return at::cuda::getDeviceProperties(0)->sharedMemPerBlockOptin >=
+      required_size;
+}
+
+#define NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(                       \
+    REQUIRED_MAJOR, REQUIRED_MINOR, SMEM_SIZE, STATE)            \
+  if (cudaArchGuardShouldSkip(REQUIRED_MAJOR, REQUIRED_MINOR) || \
+      !hasRequiredSmemSize(SMEM_SIZE)) {                         \
+    STATE.SkipWithError("Unsupported arch or not enough smem!"); \
+    return;                                                      \
+  }
+
+// util to track support matmul operand layout.
+using MatmulLayout = MmaOptions::MmaInputLayout;
+
+static constexpr std::array<MatmulLayout, 3> kAllSupportedLayout = {
+    MatmulLayout::TT,
+    MatmulLayout::NT,
+    MatmulLayout::TN};
+
+// Generic interface to get matmul op with the given layout.
+TensorView* matmul(TensorView* a, TensorView* b, MatmulLayout layout) {
+  TORCH_CHECK(
+      a->nDims() == 2 && b->nDims() == 2, "only pure matmuls for these tests");
+  TensorView *tv2 = nullptr, *tv0b = nullptr, *tv1b = nullptr;
+  switch (layout) {
+    case MatmulLayout::TT:
+      tv0b = broadcast(a, {false, false, true});
+      tv1b = broadcast(b, {true, false, false});
+      tv2 = fusedMultiplySum(tv0b, tv1b, {1});
+      break;
+    case MatmulLayout::TN:
+      tv0b = broadcast(a, {false, true, false});
+      tv1b = broadcast(b, {true, false, false});
+      tv2 = fusedMultiplySum(tv0b, tv1b, {2});
+      break;
+    case MatmulLayout::NT:
+      tv0b = broadcast(a, {false, false, true});
+      tv1b = broadcast(b, {false, true, false});
+      tv2 = fusedMultiplySum(tv0b, tv1b, {0});
+      break;
+    default:
+      TORCH_CHECK(false, "unsupported data layout.");
+  }
+  return tv2;
+}
+
+// Utility to generate matmul input tensors based on given layout
+at::Tensor atMatmul(at::Tensor a, at::Tensor b, MatmulLayout layout) {
+  switch (layout) {
+    case MatmulLayout::TT:
+      return a.matmul(b);
+    case MatmulLayout::TN:
+      return a.matmul(b.t());
+    case MatmulLayout::NT:
+      return a.t().matmul(b);
+    default:
+      TORCH_CHECK(false, "unsupported data layout.");
+  }
+  return at::Tensor();
+}
+
+// Utility to generate reference results based on given layout
+std::pair<at::Tensor, at::Tensor> fp16MatmulAtInput(
+    int M,
+    int N,
+    int K,
+    MatmulLayout layout) {
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+
+  switch (layout) {
+    case MatmulLayout::TT:
+      return std::make_pair(
+          at::randn({M, K}, options), at::randn({K, N}, options));
+    case MatmulLayout::TN:
+      return std::make_pair(
+          at::randn({M, K}, options), at::randn({N, K}, options));
+    case MatmulLayout::NT:
+      return std::make_pair(
+          at::randn({K, M}, options), at::randn({K, N}, options));
+    default:
+      TORCH_CHECK(false, "unsupported data layout.");
+  }
+  return std::make_pair(at::Tensor(), at::Tensor());
+}
+
+// TODO: separate compute and schedule definition once the can schedule
+//  logic and pattern matching is ready.
+void setupMatmul(Fusion* fusion, MatmulLayout layout, MatmulParam params) {
+  // Only hgemm on the initial setup
+  auto a = makeContigTensor(2, DataType::Half);
+  auto b = makeContigTensor(2, DataType::Half);
+
+  auto c = matmul(a, b, layout);
+
+  fusion->addInput(a);
+  fusion->addInput(b);
+  fusion->addOutput(c);
+
+  scheduleMatmul(c, a, b, params);
+}
+
+static void SingleMatmulBase(
+    benchmark::State& benchmark_state,
+    MatmulLayout layout,
+    MatmulParam params) {
+  std::vector<int64_t> input_mnk{
+      benchmark_state.range(0),
+      benchmark_state.range(1),
+      benchmark_state.range(2)};
+
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  // Define fusion graph
+  setupMatmul(fusion, layout, params);
+
+  // inputs
+  at::manual_seed(0);
+
+  // Tensor inputs
+  auto inputs = fp16MatmulAtInput(
+      input_mnk.at(0), input_mnk.at(1), input_mnk.at(2), layout);
+
+  KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(
+      {inputs.first, inputs.second});
+
+  // Always use 32b indexing mode for now.
+  TORCH_INTERNAL_ASSERT(args.getIndexMode() == KernelIndexMode::INT32);
+
+  // Compile kernel
+  FusionExecutor fe;
+  fe.compileFusion(fusion, args, LaunchParams());
+
+  // Warm up run
+  auto outputs = fe.runFusion({inputs.first, inputs.second});
+  fe.setMeasureKernelTimeFlag(true);
+
+  // Sync everything up before we start
+  for (auto _ : benchmark_state) {
+    clearL2Cache();
+    auto outputs = fe.runFusion({inputs.first, inputs.second});
+    benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
+  }
+  // Sync everything up before we're finished, don't want to run ahead on the
+  // cpu while benchmarking.
+  cudaDeviceSynchronize();
+
+  // TODO: FLOPS calculation
+}
+
+static void EagerModeMatmul(
+    benchmark::State& benchmark_state,
+    MatmulLayout layout) {
+  std::vector<int64_t> input_mnk{
+      benchmark_state.range(0),
+      benchmark_state.range(1),
+      benchmark_state.range(2)};
+
+  at::manual_seed(0);
+
+  auto inputs = fp16MatmulAtInput(
+      input_mnk.at(0), input_mnk.at(1), input_mnk.at(2), layout);
+
+  // warm up run
+  auto outputs = atMatmul(inputs.first, inputs.second, layout);
+
+  for (auto _ : benchmark_state) {
+    clearL2Cache();
+    CudaKernelTimer timer;
+    outputs = atMatmul(inputs.first, inputs.second, layout);
+    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
+  }
+  // Sync everything up before we're finished, don't want to run ahead on the
+  // cpu while benchmarking.
+  cudaDeviceSynchronize();
+}
+
+// Actual benchmarking
+// -----------------------------------------------------------------
+
+size_t getSmemSize(GemmTile cta_tile, int stage_number) {
+  return ((cta_tile.m * cta_tile.k) + (cta_tile.n * cta_tile.k)) *
+      dataTypeSize(DataType::Half) * stage_number;
+}
+
+// TODO: this part eventually will be automated by heuristics
+MatmulParam getMatmulParams(
+    GemmTile cta_tile,
+    int stage_number,
+    MatmulLayout layout) {
+  MatMulTileOptions gemm_tile;
+  gemm_tile.cta_tile = cta_tile;
+  // TODO: pipe through split K
+  gemm_tile.warp_tile = GemmTile(64, 64, cta_tile.k);
+  gemm_tile.instruction_tile = GemmTile(16, 16, 16);
+
+  // Collect mma swizzle info
+  auto mma_builder =
+      MmaBuilder(MmaOptions::MacroType::Ampere_16_16_16, gemm_tile)
+          .layout(layout);
+
+  MatmulParam params(mma_builder);
+  params.tile_sizes = gemm_tile;
+  params.async_gmem_load_operands = true;
+  params.double_buffer_options.double_buffer_smem_write = true;
+  params.double_buffer_options.double_buffer_smem_read = true;
+  params.double_buffer_options.smem_double_buffer_stage = stage_number;
+
+  return params;
+}
+
+static void Nvfuser_Matmul_4warp3stage(
+    benchmark::State& benchmark_state,
+    MatmulLayout layout) {
+  auto cta_tile = GemmTile(128, 128, 32);
+  int number_of_stage = 3;
+
+  auto params = getMatmulParams(cta_tile, number_of_stage, layout);
+
+  NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
+      8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
+
+  // Run benchmark:
+  SingleMatmulBase(benchmark_state, layout, params);
+}
+
+static void Nvfuser_Matmul_8warp3stage(
+    benchmark::State& benchmark_state,
+    MatmulLayout layout) {
+  auto cta_tile = GemmTile(256, 128, 32);
+  int number_of_stage = 3;
+
+  auto params = getMatmulParams(cta_tile, number_of_stage, layout);
+
+  NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
+      8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
+
+  // Run benchmark:
+  SingleMatmulBase(benchmark_state, layout, params);
+}
+
+static void Nvfuser_Matmul_4warp4stage(
+    benchmark::State& benchmark_state,
+    MatmulLayout layout) {
+  auto cta_tile = GemmTile(128, 128, 32);
+  int number_of_stage = 4;
+
+  auto params = getMatmulParams(cta_tile, number_of_stage, layout);
+
+  NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
+      8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
+
+  // Run benchmark:
+  SingleMatmulBase(benchmark_state, layout, params);
+}
+
+static void Nvfuser_Matmul_8warp4stage(
+    benchmark::State& benchmark_state,
+    MatmulLayout layout) {
+  auto cta_tile = GemmTile(256, 128, 32);
+  int number_of_stage = 4;
+
+  auto params = getMatmulParams(cta_tile, number_of_stage, layout);
+
+  NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
+      8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
+
+  // Run benchmark:
+  SingleMatmulBase(benchmark_state, layout, params);
+}
+
+// ----------------------------- Benchmark Instantiation-------
+
+// Common utils:
+#define NO_TILE_QUANTIZATION_ARGS                                             \
+  ArgsProduct(                                                                \
+      {{2048}, {3456}, benchmark::CreateDenseRange(512, 4096, /*step=*/512)}) \
+      ->Unit(benchmark::kMicrosecond)                                         \
+      ->UseManualTime();
+
+#define ForAllLayouts(run)   \
+  run(TT, MatmulLayout::TT); \
+  run(TN, MatmulLayout::TN); \
+  run(NT, MatmulLayout::NT)
+
+// Instantiations:
+#define Nvfuser_4warp3stage_test(layout_label, layout) \
+  BENCHMARK_CAPTURE(                                   \
+      Nvfuser_Matmul_4warp3stage,                      \
+      no_quant_nvfuser_4warp_##layout_label,           \
+      layout)                                          \
+      ->NO_TILE_QUANTIZATION_ARGS
+
+#define Nvfuser_8warp3stage_test(layout_label, layout) \
+  BENCHMARK_CAPTURE(                                   \
+      Nvfuser_Matmul_8warp3stage,                      \
+      no_quant_nvfuser_8warp_##layout_label,           \
+      layout)                                          \
+      ->NO_TILE_QUANTIZATION_ARGS
+
+#define Nvfuser_4warp4stage_test(layout_label, layout) \
+  BENCHMARK_CAPTURE(                                   \
+      Nvfuser_Matmul_4warp4stage,                      \
+      no_quant_nvfuser_4warp_##layout_label,           \
+      layout)                                          \
+      ->NO_TILE_QUANTIZATION_ARGS
+
+#define Nvfuser_8warp4stage_test(layout_label, layout) \
+  BENCHMARK_CAPTURE(                                   \
+      Nvfuser_Matmul_8warp4stage,                      \
+      no_quant_nvfuser_8warp_##layout_label,           \
+      layout)                                          \
+      ->NO_TILE_QUANTIZATION_ARGS
+
+#define Eagermode_test(layout_label, layout)                      \
+  BENCHMARK_CAPTURE(                                              \
+      EagerModeMatmul, no_quant_eagermode_##layout_label, layout) \
+      ->NO_TILE_QUANTIZATION_ARGS
+
+ForAllLayouts(Nvfuser_4warp3stage_test);
+ForAllLayouts(Nvfuser_4warp4stage_test);
+ForAllLayouts(Nvfuser_8warp3stage_test);
+ForAllLayouts(Nvfuser_8warp4stage_test);
+ForAllLayouts(Eagermode_test);
diff --git a/benchmarks/cpp/nvfuser/rms_norm.cpp b/benchmarks/cpp/nvfuser/rms_norm.cpp
index 81fdf46cf8189..37911ea6b1fd2 100644
--- a/benchmarks/cpp/nvfuser/rms_norm.cpp
+++ b/benchmarks/cpp/nvfuser/rms_norm.cpp
@@ -24,7 +24,6 @@ static void setupRMSNorm(Fusion* fusion, DataType dtype) {
 
   FusionGuard fg(fusion);
 
-  const int kReductionAxis = 2;
   const float kEps = 1e-6;
 
   Double* eps_ptr = IrBuilder::create<Double>(kEps);
@@ -61,7 +60,6 @@ static void NvFuserScheduler_RMSNorm(
       dtype == DataType::BFloat16);
 
   std::vector<int64_t> input_shape{8, benchmark_state.range(0), 1024};
-  const float kEps = 1e-6;
 
   // inputs
   at::manual_seed(0);
diff --git a/benchmarks/cpp/nvfuser/rms_norm_backward.cpp b/benchmarks/cpp/nvfuser/rms_norm_backward.cpp
index b4c6ac413c758..987c3bf234fa2 100644
--- a/benchmarks/cpp/nvfuser/rms_norm_backward.cpp
+++ b/benchmarks/cpp/nvfuser/rms_norm_backward.cpp
@@ -24,9 +24,6 @@ static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) {
       dtype == DataType::Float || dtype == DataType::Half ||
       dtype == DataType::BFloat16);
 
-  const int kReductionAxis = 2;
-  Double* eps_ptr = IrBuilder::create<Double>(1e-6);
-
   // setup fusion
   auto grad_out = makeContigTensor(3, dtype);
   auto input = makeContigTensor(3, dtype);
diff --git a/benchmarks/cpp/nvfuser/timm.cpp b/benchmarks/cpp/nvfuser/timm.cpp
index 013b609be6020..4669ff0ecabf6 100644
--- a/benchmarks/cpp/nvfuser/timm.cpp
+++ b/benchmarks/cpp/nvfuser/timm.cpp
@@ -115,7 +115,7 @@ static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) {
   auto t6 = set(t5);
   auto t7 = broadcast(t6, bcast_pattern0);
   auto t8 = add(t4, t7);
-  auto t9 = randlike(t8);
+  auto t9 = rand_like(t8);
   auto d34 =
       sub(IrBuilder::create<Double>(1.0), IrBuilder::create<Double>(0.0));
   auto t10 = lt(t9, d34);
@@ -139,7 +139,6 @@ static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) {
   auto t20 = sum(t37, {2});
   auto t24 = broadcast(t20, bcast_pattern1);
   auto d95 = castOp(DataType::Double, t2->axis(2)->extent());
-  auto d96 = mul(IrBuilder::create<Double>(1.0), d95);
   auto d105 = reciprocal(d95);
   auto t25 = mul(t24, d105);
   auto t26 = add(t25, IrBuilder::create<Double>(1e-6));
@@ -289,7 +288,7 @@ static void setup_vit_base_patch16_224_norm_inner3(Fusion* fusion, void* null) {
   auto t10 = broadcast(t9, {false, false, false, true});
   auto t11 = reciprocal(t10);
   auto t12 = mul(t8, t11);
-  auto t13 = randlike(t12);
+  auto t13 = rand_like(t12);
   auto d79 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
   auto t14 = lt(t13, d79);
   auto t15 = castOp(DataType::Float, t14);
@@ -320,8 +319,6 @@ static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3(
 
   at::manual_seed(0);
   auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  auto fp32_options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
 
   auto t0 = at::randn(input_shape, fp16_options);
 
@@ -367,7 +364,7 @@ static void setup_vit_base_patch16_224_bcast_outer6(
   auto t9 = add(IrBuilder::create<Double>(1), t8);
   auto t10 = mul(IrBuilder::create<Double>(0.5), t9);
   auto t11 = mul(t6, t10);
-  auto t12 = randlike(t11);
+  auto t12 = rand_like(t11);
   auto d66 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
   auto t13 = lt(t12, d66);
   auto t14 = castOp(DataType::Float, t13);
@@ -456,7 +453,7 @@ static void setup_vit_base_patch16_224_bcast_inner6(
   auto t9 = add(IrBuilder::create<Double>(1), t8);
   auto t10 = mul(IrBuilder::create<Double>(0.5), t9);
   auto t11 = mul(t6, t10);
-  auto t12 = randlike(t11);
+  auto t12 = rand_like(t11);
   auto d66 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
   auto t13 = lt(t12, d66);
   auto t14 = castOp(DataType::Float, t13);
diff --git a/build_variables.bzl b/build_variables.bzl
index 49b0734a7f1c6..fe72453878ed7 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -666,7 +666,7 @@ libtorch_cuda_core_sources = [
     "torch/csrc/autograd/functions/comm.cpp",
     "torch/csrc/jit/codegen/cuda/arith.cpp",
     "torch/csrc/jit/codegen/cuda/compute_at.cpp",
-    "torch/csrc/jit/codegen/cuda/inline_propagator.cpp",
+    "torch/csrc/jit/codegen/cuda/inlining.cpp",
     "torch/csrc/jit/codegen/cuda/compute_at_map.cpp",
     "torch/csrc/jit/codegen/cuda/codegen.cpp",
     "torch/csrc/jit/codegen/cuda/contiguity.cpp",
@@ -700,6 +700,7 @@ libtorch_cuda_core_sources = [
     "torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp",
     "torch/csrc/jit/codegen/cuda/lower_allocation.cpp",
     "torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp",
+    "torch/csrc/jit/codegen/cuda/lower_divisible_split.cpp",
     "torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp",
     "torch/csrc/jit/codegen/cuda/lower_fused_reduction.cpp",
     "torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp",
@@ -723,6 +724,7 @@ libtorch_cuda_core_sources = [
     "torch/csrc/jit/codegen/cuda/lower_validation.cpp",
     "torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp",
     "torch/csrc/jit/codegen/cuda/lower2device.cpp",
+    "torch/csrc/jit/codegen/cuda/lower_bank_conflict.cpp",
     "torch/csrc/jit/codegen/cuda/manager.cpp",
     "torch/csrc/jit/codegen/cuda/maxinfo_propagator.cpp",
     "torch/csrc/jit/codegen/cuda/mutator.cpp",
@@ -750,6 +752,7 @@ libtorch_cuda_core_sources = [
     "torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp",
     "torch/csrc/jit/codegen/cuda/scheduler/registry.cpp",
     "torch/csrc/jit/codegen/cuda/scheduler/utils.cpp",
+    "torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp",
     "torch/csrc/jit/codegen/cuda/type_inference.cpp",
     "torch/csrc/jit/codegen/cuda/type_promotion.cpp",
     "torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp",
diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt
index 947b13897cf1d..b8b765a68d8b4 100644
--- a/test/cpp/jit/CMakeLists.txt
+++ b/test/cpp/jit/CMakeLists.txt
@@ -99,7 +99,9 @@ if(USE_CUDA)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu.cpp)
+  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu1.cpp)
+  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu2.cpp)
+  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu3.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp)
@@ -107,7 +109,7 @@ if(USE_CUDA)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_view.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_rng.cu)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_scheduler_utils.cpp)
+  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp)
 endif()
 
 add_executable(test_jit
diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
index 44fa2556243d9..8e8af85faf5f2 100644
--- a/test/test_jit_cuda_fuser.py
+++ b/test/test_jit_cuda_fuser.py
@@ -20,7 +20,7 @@
 from torch.testing._internal.common_jit import JitCommonTestCase
 from torch.testing._internal.common_methods_invocations import op_db, SampleInput
 from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, TEST_WITH_ROCM, slowTest, \
-    is_iterable_of_tensors, freeze_rng_state
+    is_iterable_of_tensors, freeze_rng_state, skipIfRocm
 from torch.testing._internal.jit_utils import clone_inputs, get_traced_sample_variant_pairs, JitTestCase, RUN_CUDA
 from torch.testing._internal.jit_metaprogramming_utils import create_traced_fn
 from torch.testing import FileCheck
@@ -1765,6 +1765,7 @@ def test_norm(self):
                         x[1] = C
                         self._norm_helper(x, torch.float32, "cuda", 1e-4, is_batch_norm_else_instance_norm)
 
+    @skipIfRocm
     @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
     @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
@@ -3386,6 +3387,7 @@ def test_batch_norm_impl_index_inner_bcast(self):
             training, track_running_stats = training_and_track
             self._test_batch_norm_impl_index_helper(2, 1, 1, affine, track_running_stats, training)
 
+    @skipIfRocm
     @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device")
     @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
@@ -4485,6 +4487,125 @@ def t(x, w):
             self.assertEqual(jit_o, o)
             self.assertGraphContainsExactly(t_jit.graph_for(x, w), FUSION_GUARD, 2, consider_subgraphs=True)
 
+    @skipIfRocm
+    # see issue here on why we disabled this test https://github.com/csarofeen/pytorch/issues/2127
+    @unittest.skipIf(is_pre_volta(), "permutation scheduling can be dangerous on pre-volta device")
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_view_before_permute(self):
+        view_examples = [[[1, 19, 1, 12, 7, 1, 99], [1, 19, 1, 3, 2772]],
+                         [[3, 17, 80, 1], [51, 1, 2, 4, 10]],
+                         [[3, 17, 80, 1, 9], [51, 1, 2, 4, 10, 9]],
+                         [[2, 3, 4, 5], [1, 6, 1, 2, 2, 5]],
+                         [[22, 22, 2], [22, 11, 1, 1, 4]],
+                         [[37, 9, 7, 6, 10], [333, 2, 2, 3, 35]],
+                         [[8, 1, 1, 8, 1, 8], [8, 2, 4, 1, 8]],
+                         [[1, 333, 1], [1, 37, 9]],
+                         [[1, 333], [1, 1, 1, 111, 1, 3]],
+                         [[1, 27454, 1, 2], [1, 7844, 1, 7]],
+                         [[1, 7844, 1, 7], [1, 27454, 2]]]
+
+        def _getTransposeAxes(sizes):
+            # broadcast do not change
+            # always move inner-most dim
+            # random permutation of other dims
+            result = []
+            valid_sizes = []
+            for idx, val in enumerate(sizes):
+                if val > 1 and idx < len(sizes) - 1:
+                    valid_sizes.append((idx, val))
+                result.append(idx)
+            idx, new_size = valid_sizes[random.randint(0, len(valid_sizes) - 1)]
+            result[idx] = len(sizes) - 1
+            result[len(sizes) - 1] = idx
+            return result
+
+        def _transposeSize(sizes, dims):
+            return [sizes[old_pos] for old_pos in dims]
+
+        for example in view_examples:
+            before_view_size, after_view_size = example
+            axes = _getTransposeAxes(after_view_size)
+            output_size = _transposeSize(after_view_size, axes)
+            self._view_before_permute_helper(before_view_size, after_view_size, output_size, axes)
+
+    def _view_before_permute_helper(self, input_shape, view_shape, output_shape, dims):
+        def t(x, y, view_shape : List[int], dims : List[int]):
+            x_v = x.view(view_shape)
+            x_t = torch.permute(x_v, dims)
+            o = torch.add(x_t, y)
+            o = torch.relu(o)
+            return o
+
+        x = torch.randn(*input_shape, device="cuda")
+        y = torch.randn(*output_shape, device="cuda")
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y, view_shape, dims)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_permute(self):
+        max_dims = 4
+        for ndims in range(2, max_dims + 1):
+            shape = [idx + 2 for idx in range(ndims)]
+            for dims in itertools.permutations(range(ndims)):
+                self._permute_helper(shape, dims)
+
+    def _permute_helper(self, shape, dims):
+        def t(x, y, dims : List[int]):
+            x_t = torch.permute(x, dims)
+            y_t = torch.permute(y, dims)
+            o = torch.add(x_t, y_t)
+            o = torch.relu(o)
+            return o
+
+        x = torch.randn(*shape, device="cuda")
+        y = torch.randn(*shape, device="cuda")
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y, dims)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_transpose(self):
+        max_dims = 4
+        for ndims in range(2, max_dims + 1):
+            shape = [idx + 2 for idx in range(ndims)]
+            for idx in range(1, ndims):
+                for jdx in range(idx):
+                    self._transpose_helper(shape, idx, jdx)
+
+    def _transpose_helper(self, shape, dim0, dim1):
+        def t(x, y, dim0 : int, dim1 : int):
+            x_t = torch.transpose(x, dim0, dim1)
+            y_t = torch.transpose(y, dim0, dim1)
+            o = torch.add(x_t, y_t)
+            o = torch.nn.functional.gelu(o)
+            return o
+
+        x = torch.randn(*shape, device="cuda")
+        y = torch.randn(*shape, device="cuda")
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y, dim0, dim1)
+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_transpose_default(self):
+        def t(x, y):
+            x_t = torch.t(x)
+            y_t = torch.t(y)
+            o = torch.add(x_t, y_t)
+            o = torch.nn.functional.gelu(o)
+            return o
+
+        x = torch.randn(3, 5, device="cuda")
+        y = torch.randn(3, 5, device="cuda")
+        t_jit = torch.jit.script(t)
+        self._run_helper(t_jit, t, x, y)
+
     @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
                      "Requires fusion optimization pass to be effective")
diff --git a/torch/csrc/jit/codegen/cuda/arith.cpp b/torch/csrc/jit/codegen/cuda/arith.cpp
index de282dfc8182a..8e8d82128512b 100644
--- a/torch/csrc/jit/codegen/cuda/arith.cpp
+++ b/torch/csrc/jit/codegen/cuda/arith.cpp
@@ -449,10 +449,99 @@ TensorView* rand(const std::vector<Val*>& shape, DataType dtype) {
                  .contiguity(std::vector<bool>(n, true))
                  .shape(shape)
                  .build();
-  IrBuilder::create<RNGOp>(RNGOpType::Uniform, out);
+  IrBuilder::create<RNGOp>(RNGOpType::Uniform, out, dtype);
   return out;
 }
 
+// TENSOR FACTORIES
+TensorView* uniform(
+    const std::vector<Val*>& shape,
+    Val* low,
+    Val* high,
+    DataType dtype) {
+  auto n = shape.size();
+  auto out = TensorViewBuilder()
+                 .ndims(n)
+                 .dtype(dtype)
+                 .contiguity(std::vector<bool>(n, true))
+                 .shape(shape)
+                 .build();
+  IrBuilder::create<RNGOp>(
+      RNGOpType::UniformRange, out, dtype, std::vector<Val*>{low, high});
+  return out;
+}
+
+TensorView* rand_like(TensorView* tv) {
+  TORCH_CHECK(
+      isFloatingPointType(tv->dtype()),
+      "input must have floating point type, but got ",
+      tv->dtype());
+  std::vector<Val*> shape;
+  auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain());
+  shape.reserve(dom.size());
+  for (auto id : dom) {
+    shape.emplace_back(id->getMaybeExpandedExtent());
+  }
+  return rand(shape, tv->dtype());
+}
+
+Val* rand_like(Val* v) {
+  return rand_like(v->as<TensorView>());
+}
+
+TensorView* full(
+    const std::vector<Val*>& shape,
+    Val* fill_value,
+    DataType dtype) {
+  auto n = shape.size();
+  auto out = TensorViewBuilder()
+                 .ndims(n)
+                 .dtype(dtype)
+                 .contiguity(std::vector<bool>(n, true))
+                 .shape(shape)
+                 .build();
+  IrBuilder::create<FullOp>(out, fill_value, dtype);
+  return out;
+}
+
+TensorView* full_like(TensorView* tv, Val* fill_value) {
+  std::vector<Val*> shape;
+  auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain());
+  shape.reserve(dom.size());
+  for (auto id : dom) {
+    shape.emplace_back(id->getMaybeExpandedExtent());
+  }
+  return full(shape, fill_value, tv->dtype());
+}
+
+Val* full_like(Val* v, Val* fill_value) {
+  return full_like(v->as<TensorView>(), fill_value);
+}
+
+TensorView* zeros(const std::vector<Val*>& shape, DataType dtype) {
+  return full(shape, FusionGuard::getCurFusion()->zeroVal(), dtype);
+}
+
+TensorView* zeros_like(TensorView* tv) {
+  return full_like(tv, FusionGuard::getCurFusion()->zeroVal());
+}
+
+Val* zeros_like(Val* v) {
+  return zeros_like(v->as<TensorView>());
+}
+
+TensorView* ones(const std::vector<Val*>& shape, DataType dtype) {
+  return full(shape, FusionGuard::getCurFusion()->oneVal(), dtype);
+}
+
+TensorView* ones_like(TensorView* tv) {
+  return full_like(tv, FusionGuard::getCurFusion()->oneVal());
+}
+
+Val* ones_like(Val* v) {
+  return ones_like(v->as<TensorView>());
+}
+
 TensorView* arange(Val* end, DataType dtype) {
   return arange(FusionGuard::getCurFusion()->zeroVal(), end, dtype);
 }
@@ -471,17 +560,36 @@ TensorView* arange(Val* start, Val* end, Val* step, DataType dtype) {
     end = castOp(DataType::Double, end);
     step = castOp(DataType::Double, step);
   }
-  auto size = castOp(DataType::Int, ceilDiv(sub(end, start), step));
+  // Make sure no negative value is passed to ceilDiv as the device
+  // implementation of ceilDiv assumes positive inputs
+  auto size = castOp(DataType::Int, ceilDiv(abs(sub(end, start)), abs(step)));
   auto out = TensorViewBuilder()
                  .ndims(1)
                  .dtype(dtype)
                  .contiguity({true})
                  .shape({size})
                  .build();
-  IrBuilder::create<ARangeOp>(out, start, end, step);
+  IrBuilder::create<ARangeOp>(out, start, end, step, dtype);
+  return out;
+}
+
+TensorView* eye(Val* rows, Val* cols, DataType dtype) {
+  TORCH_CHECK(rows->getDataType() == DataType::Int, "rows must have type Int");
+  TORCH_CHECK(cols->getDataType() == DataType::Int, "cols must have type Int");
+  auto out = TensorViewBuilder()
+                 .ndims(2)
+                 .dtype(dtype)
+                 .contiguity({true, true})
+                 .shape(std::vector<Val*>{rows, cols})
+                 .build();
+  IrBuilder::create<EyeOp>(out, dtype);
   return out;
 }
 
+TensorView* eye(Val* size, DataType dtype) {
+  return eye(size, size, dtype);
+}
+
 // UNARY OPERATIONS
 
 #define NVFUSER_DEFINE_UNARY_OP(op_name, op_type) \
@@ -504,23 +612,6 @@ NVFUSER_DEFINE_UNARY_OP(trunc, Trunc)
 NVFUSER_DEFINE_UNARY_OP(print, Print)
 #undef NVFUSER_DEFINE_UNARY_OP
 
-TensorView* randlike(TensorView* v) {
-  TORCH_CHECK(
-      isFloatingPointType(v->dtype()),
-      "input must have floating point type, but got ",
-      v->dtype());
-  std::vector<Val*> shape;
-  shape.reserve(v->getMaybeRFactorDomain().size());
-  for (auto id : v->getMaybeRFactorDomain()) {
-    shape.emplace_back(id->getMaybeExpandedExtent());
-  }
-  return rand(shape, v->dtype());
-}
-
-Val* randlike(Val* v) {
-  return randlike(v->as<TensorView>());
-}
-
 Val* bitwise_not(Val* v) {
   TORCH_CHECK(
       isIntegralType(v->dtype()) || v->dtype() == DataType::Bool,
diff --git a/torch/csrc/jit/codegen/cuda/arith.h b/torch/csrc/jit/codegen/cuda/arith.h
index d8e6b65882146..66344c74880c0 100644
--- a/torch/csrc/jit/codegen/cuda/arith.h
+++ b/torch/csrc/jit/codegen/cuda/arith.h
@@ -121,10 +121,39 @@ TORCH_CUDA_CU_API WelfordResult Welford(
     // import IrBuilder just for this one interface.
     Int* init_N = nullptr);
 
-// TENSOR FACTORIES
+// RNG OPERATIONS
 TORCH_CUDA_CU_API TensorView* rand(
     const std::vector<Val*>& shape,
     DataType dtype);
+TORCH_CUDA_CU_API Val* rand_like(Val*);
+TORCH_CUDA_CU_API TensorView* rand_like(TensorView*);
+
+TORCH_CUDA_CU_API TensorView* uniform(
+    const std::vector<Val*>& shape,
+    Val* low,
+    Val* high,
+    DataType dtype);
+
+// TENSOR FACTORIES
+TORCH_CUDA_CU_API TensorView* full(
+    const std::vector<Val*>& shape,
+    Val* fill_value,
+    DataType dtype);
+TORCH_CUDA_CU_API TensorView* full_like(TensorView* tv, Val* fill_value);
+TORCH_CUDA_CU_API Val* full_like(Val* tv, Val* fill_value);
+TORCH_CUDA_CU_API TensorView* zeros(
+    const std::vector<Val*>& shape,
+    DataType dtype);
+TORCH_CUDA_CU_API TensorView* zeros_like(TensorView*);
+TORCH_CUDA_CU_API Val* zeros_like(Val*);
+TORCH_CUDA_CU_API TensorView* ones(
+    const std::vector<Val*>& shape,
+    DataType dtype);
+TORCH_CUDA_CU_API TensorView* ones_like(TensorView*);
+TORCH_CUDA_CU_API Val* ones_like(Val*);
+//! WARNING: giving invalid combinations of the start, end and step
+//! arguments can result in undefined behavior. Specifically, the
+//! signs of `end - start` and step must be the same.
 TORCH_CUDA_CU_API TensorView* arange(Val* end, DataType dtype = DataType::Int);
 TORCH_CUDA_CU_API TensorView* arange(
     Val* start,
@@ -135,6 +164,8 @@ TORCH_CUDA_CU_API TensorView* arange(
     Val* end,
     Val* step,
     DataType dtype = DataType::Int);
+TORCH_CUDA_CU_API TensorView* eye(Val* size, DataType dtype);
+TORCH_CUDA_CU_API TensorView* eye(Val* rows, Val* cols, DataType dtype);
 
 // UNARY OPERATIONS
 // abs
@@ -200,9 +231,6 @@ TORCH_CUDA_CU_API TensorView* log2(TensorView*);
 // neg
 TORCH_CUDA_CU_API Val* neg(Val*);
 TORCH_CUDA_CU_API TensorView* neg(TensorView*);
-// randlike
-TORCH_CUDA_CU_API Val* randlike(Val*);
-TORCH_CUDA_CU_API TensorView* randlike(TensorView*);
 // real
 TORCH_CUDA_CU_API Val* real(Val*);
 TORCH_CUDA_CU_API TensorView* real(TensorView*);
diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp
index 6ebb2753ecb8a..e62528fdabc3e 100644
--- a/torch/csrc/jit/codegen/cuda/codegen.cpp
+++ b/torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -264,9 +264,6 @@ class CudaKernelGenerator : private OptOutConstDispatch {
       indent()
           << "  static_cast<uint64_t>(*(philox_args.offset_.ptr) + philox_args.offset_intragraph_) :\n";
       indent() << "  philox_args.offset_.val;\n";
-      indent() << "auto seed = philox_args.captured_ ?\n";
-      indent()
-          << "  static_cast<uint64_t>(*(philox_args.seed_.ptr)) : philox_args.seed_.val;\n";
       indent() << "uint4 rng_result;\n";
       indent() << "nvfuser_index_t rng_subseq = -1;\n";
       indent() << "nvfuser_index_t rng_offset = -1;\n";
@@ -546,9 +543,18 @@ class CudaKernelGenerator : private OptOutConstDispatch {
   void genCpAsync(const LoadStoreOp* ldst, int vec_size) {
     auto dtype = ldst->in()->getDataType().value();
 
-    indent() << "Ampere::cpAsync("
-             << genVectorPointer(ldst->out(), dtype, vec_size) << ","
-             << genVectorPointer(ldst->in(), dtype, vec_size) << ");\n";
+    if (ldst->predicate() == nullptr) {
+      // Out of line predicate variant
+      indent() << "Ampere::cpAsync("
+               << genVectorPointer(ldst->out(), dtype, vec_size) << ","
+               << genVectorPointer(ldst->in(), dtype, vec_size) << ");\n";
+    } else {
+      // Inline predicate variant
+      indent() << "Ampere::cpAsync("
+               << genVectorPointer(ldst->out(), dtype, vec_size) << ","
+               << genVectorPointer(ldst->in(), dtype, vec_size) << ","
+               << genInline(ldst->predicate()) << ");\n";
+    }
   }
 
   void genLdMatrix(const LoadStoreOp* ldst, int vector_word_size) {
@@ -563,14 +569,26 @@ class CudaKernelGenerator : private OptOutConstDispatch {
           << "&" << gen(ldst->in()) << ");\n";
   }
 
+  void handle(const FullOp* fop) final {
+    indent() << gen(fop->output(0)) << " = (" << fop->dtype() << ")"
+             << gen(fop->getFillValue()) << ";\n";
+  }
+
   void handle(const ARangeOp* aop) final {
-    auto index = genTensorIndex(aop->getLinearIndex()->as<kir::TensorIndex>());
-    indent() << gen(aop->output(0)) << " = arange<" << aop->output(0)->dtype()
-             << ">";
+    auto index =
+        genTensorIndex(aop->getLinearLogicalIndex()->as<kir::TensorIndex>());
+    indent() << gen(aop->output(0)) << " = arange<" << aop->dtype() << ">";
     code_ << "(" << index << ", " << gen(aop->start()) << ", "
           << gen(aop->step()) << ");\n";
   }
 
+  void handle(const EyeOp* aop) final {
+    auto index1 = gen(aop->getIndex1());
+    auto index2 = gen(aop->getIndex2());
+    indent() << gen(aop->output(0)) << " = (" << aop->dtype() << ")";
+    code_ << "(" << index1 << " == " << index2 << ");\n";
+  }
+
   void handle(const UnaryOp* uop) final {
     bool is_vector_op = false;
     size_t vector_word_size = 1;
@@ -762,9 +780,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
   void handle(const RNGOp* rop) final {
     // TODO: TORCH_INTERNAL_ASSERT that the scheduler correctly creates an
     // innermost ID of size 4 (float) or size 2 (double)?
-    auto out_tv = rop->output(0)->as<kir::TensorIndex>()->view();
     auto index = genTensorIndex(rop->getPhiloxIndex()->as<kir::TensorIndex>());
-    int multiple = out_tv->getDataType() == DataType::Double ? 2 : 4;
+    int multiple = rop->dtype() == DataType::Double ? 2 : 4;
     indent() << "nvfuser_index_t linear_index" << rop->name() << " = " << index
              << ";\n";
     indent() << "nvfuser_index_t rng_subseq" << rop->name() << " = linear_index"
@@ -775,6 +792,9 @@ class CudaKernelGenerator : private OptOutConstDispatch {
              << rop->getRNGOffset() << ";\n";
     indent() << "if (rng_subseq != rng_subseq" << rop->name()
              << " || rng_offset != rng_offset" << rop->name() << ") {\n";
+    indent() << "  auto seed = philox_args.captured_ ?\n"
+             << "      static_cast<uint64_t>(*(philox_args.seed_.ptr)) : \n"
+             << "      philox_args.seed_.val;\n";
     indent() << "  rng_result = philox(seed, rng_subseq" << rop->name()
              << ", philox_offset / 4 + rng_offset" << rop->name() << ");\n";
     indent() << "  rng_subseq = rng_subseq" << rop->name() << ";\n";
@@ -782,11 +802,20 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     indent() << "}\n";
     auto op_type = rop->getRNGOpType();
     indent() << gen(rop->output(0)) << " = " << op_type;
-    if (needFloatSuffix(op_type) &&
-        rop->output(0)->dtype() == DataType::Float) {
+    if (needFloatSuffix(op_type) && rop->dtype() == DataType::Float) {
       code_ << "f";
     }
-    code_ << "(rng_result, rng_component" << rop->name() << ");\n";
+    code_ << "(rng_result, rng_component" << rop->name();
+    switch (op_type) {
+      case RNGOpType::UniformRange: {
+        auto parameters = rop->getParameters();
+        TORCH_INTERNAL_ASSERT(parameters.size() == 2);
+        code_ << ", " << gen(parameters[0]) << ", " << gen(parameters[1]);
+        break;
+      }
+      default:;
+    }
+    code_ << ");\n";
   }
 
   std::string genBinaryOp(
diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp
index ae6231614b7ff..d8f950848f8fc 100644
--- a/torch/csrc/jit/codegen/cuda/compute_at.cpp
+++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp
@@ -213,20 +213,21 @@ void ComputeAt::runAt(
   auto selected = getPropagationSubgraph(producer, consumer);
   ComputeAtSelector selector(selected);
 
-  InlinePropagator inline_propagator(
-      consumer, consumer_position, mode, selector.selected());
-
   MaxRootDomainInfoSpanningTree path(consumer, consumer_position, &selector);
 
   if (mode == ComputeAtMode::MostInlined) {
     MostInlinedTransformPropagator propagator;
     path.traverse(&propagator);
+    inlineMost(selected);
   } else {
     TransformPropagator propagator(consumer, consumer_position);
     path.traverse(&propagator);
+    inlineSelectedAt(
+        selected,
+        consumer,
+        consumer_position,
+        mode == ComputeAtMode::BestEffort);
   }
-
-  path.traverse(&inline_propagator);
 }
 
 void ComputeAt::runWith(
@@ -253,19 +254,21 @@ void ComputeAt::runWith(
   auto selected = getPropagationSubgraph(producer, consumer);
   ComputeAtSelector selector(selected);
 
-  InlinePropagator inline_propagator(
-      producer, producer_position, mode, selector.selected());
-
   MaxRootDomainInfoSpanningTree path(producer, producer_position, &selector);
 
   if (mode == ComputeAtMode::MostInlined) {
     MostInlinedTransformPropagator propagator;
     path.traverse(&propagator);
+    inlineMost(selected);
   } else {
     TransformPropagator propagator(producer, producer_position);
     path.traverse(&propagator);
+    inlineSelectedAt(
+        selected,
+        producer,
+        producer_position,
+        mode == ComputeAtMode::BestEffort);
   }
-  path.traverse(&inline_propagator);
 }
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h
index 98100334d72b6..d3d3fdb299dd6 100644
--- a/torch/csrc/jit/codegen/cuda/compute_at.h
+++ b/torch/csrc/jit/codegen/cuda/compute_at.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
 #include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
 #include <torch/csrc/jit/codegen/cuda/transform_replay.h>
 
diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp
index e223c0ce51646..7f3de6687eb3a 100644
--- a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp
+++ b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp
@@ -6,6 +6,8 @@
 #include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
 #include <torch/csrc/jit/codegen/cuda/transform_iter.h>
 
+#include <tuple>
+
 namespace torch {
 namespace jit {
 namespace fuser {
@@ -29,8 +31,22 @@ bool idIsALeafDomain(IterDomain* id, TensorView* tv) {
 
 } // namespace
 
-IterDomainGraph::IterDomainGraph(Fusion* fusion) {
+IterDomainGraph::IterDomainGraph(Fusion* fusion, bool allow_self_mapping) {
   build(fusion);
+
+  if (!allow_self_mapping) {
+    TORCH_INTERNAL_ASSERT(
+        !hasSelfMapping(),
+        "Unsupported domain mapping detected in ",
+        std::get<0>(*self_mapping_info_)->toString(),
+        ". ",
+        std::get<3>(*self_mapping_info_),
+        " domains, ",
+        std::get<1>(*self_mapping_info_)->toString(),
+        " and ",
+        std::get<2>(*self_mapping_info_)->toString(),
+        ", are mapped with each other.");
+  }
 }
 
 //! Map corresponding inputs and outputs of swizzle op together
@@ -55,7 +71,11 @@ void mapMaybeSwizzleOp(
   }
 }
 
-bool IterDomainGraph::exprsMap(Expr* first, Expr* second, bool forward) {
+bool IterDomainGraph::exprsMap(
+    Expr* first,
+    Expr* second,
+    bool forward,
+    const DisjointSets<IterDomain*>& id_map) {
   if (first == nullptr || second == nullptr) {
     return false;
   }
@@ -101,8 +121,7 @@ bool IterDomainGraph::exprsMap(Expr* first, Expr* second, bool forward) {
             zipped_ids.begin(),
             zipped_ids.end(),
             [&](std::pair<IterDomain*, IterDomain*> id_pair) {
-              return !exact_nodes_.strictAreMapped(
-                  id_pair.first, id_pair.second);
+              return !id_map.strictAreMapped(id_pair.first, id_pair.second);
             })) {
       return false;
     }
@@ -151,7 +170,7 @@ void IterDomainGraph::mapThroughExpr(Expr* first, Expr* second, bool forward) {
     return;
   }
 
-  if (!exprsMap(first, second, forward)) {
+  if (!exprsMap(first, second, forward, exact_nodes_)) {
     return;
   }
 
@@ -173,6 +192,78 @@ void IterDomainGraph::mapThroughExpr(Expr* first, Expr* second, bool forward) {
   }
 }
 
+namespace {
+
+// Returns a pair of mapped IDs
+c10::optional<std::pair<IterDomain*, IterDomain*>> detectMappablePair(
+    const std::vector<IterDomain*>& ids,
+    const IterDomainGraph& id_graph) {
+  for (auto id1 : ids) {
+    for (auto id2 : ids) {
+      if (id1 == id2) {
+        continue;
+      }
+      if (id_graph.permissiveNodes().disjointSetMap().at(id1)->has(id2)) {
+        return std::make_pair(id1, id2);
+      }
+    }
+  }
+
+  return {};
+}
+
+// It is assumed that for any tensor represented by a list of domains,
+// those domains should never be mapped with each other. It may be
+// possible to lift this assumption, but it's unclear if it could
+// matter in practice.
+c10::optional<std::tuple<TensorView*, IterDomain*, IterDomain*, std::string>>
+findFirstSelfMapping(Fusion* fusion, const IterDomainGraph& id_graph) {
+  for (auto tv : ir_utils::allTvs(fusion)) {
+    // For each tensor, make sure root, rfactor and leaf domains
+    // should not include domains that are mapped with another domain
+    // in the same set of domains. This may be overly conservative,
+    // and it maybe enough to check the root domains.
+
+    // Root domains
+    auto self_mappped_root_pair =
+        detectMappablePair(tv->getRootDomain(), id_graph);
+    if (self_mappped_root_pair.has_value()) {
+      return std::make_tuple(
+          tv,
+          self_mappped_root_pair->first,
+          self_mappped_root_pair->second,
+          "Root");
+    }
+
+    // Rfactor domains
+    if (tv->hasRFactor()) {
+      auto self_mappped_rf_pair =
+          detectMappablePair(tv->getRFactorDomain(), id_graph);
+      if (self_mappped_rf_pair.has_value()) {
+        return std::make_tuple(
+            tv,
+            self_mappped_rf_pair->first,
+            self_mappped_rf_pair->second,
+            "RFactor");
+      }
+    }
+
+    // Leaf domains
+    auto self_mappped_leaf_pair =
+        detectMappablePair(tv->domain()->domain(), id_graph);
+    if (self_mappped_leaf_pair.has_value()) {
+      return std::make_tuple(
+          tv,
+          self_mappped_leaf_pair->first,
+          self_mappped_leaf_pair->second,
+          "Leaf");
+    }
+  }
+  return c10::nullopt;
+}
+
+} // namespace
+
 void IterDomainGraph::build(Fusion* fusion) {
   FusionGuard fg(fusion);
 
@@ -515,6 +606,7 @@ void IterDomainGraph::build(Fusion* fusion) {
       }
     }
   }
+  self_mapping_info_ = findFirstSelfMapping(fusion, *this);
 }
 
 void IterDomainGraph::initializeId(
@@ -587,7 +679,7 @@ void ComputeAtMap::allocateIndexVariables() {
                   // Halo extended parallel loops currently are handled
                   // differently and an index variable would still
                   // be allocated in this case.
-                  (GpuLower::current()->haloInfo().getExtent(id) == nullptr)) {
+                  (GpuLower::current()->haloInfo()->getExtent(id) == nullptr)) {
                 ptype = id->getParallelType();
                 return true;
               }
diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.h b/torch/csrc/jit/codegen/cuda/compute_at_map.h
index 31c2d8752f712..5ea92dff16447 100644
--- a/torch/csrc/jit/codegen/cuda/compute_at_map.h
+++ b/torch/csrc/jit/codegen/cuda/compute_at_map.h
@@ -54,7 +54,7 @@ namespace cuda {
 //   Do not forward through any broadcast IDs
 class TORCH_CUDA_CU_API IterDomainGraph {
  public:
-  IterDomainGraph(Fusion* fusion);
+  IterDomainGraph(Fusion* fusion, bool allow_self_mapping = false);
 
   const DisjointSets<IterDomain*>& permissiveNodes() const {
     return permissive_nodes_;
@@ -88,15 +88,25 @@ class TORCH_CUDA_CU_API IterDomainGraph {
     return view_rfactor_ids_;
   }
 
+  // Returns if first and second are expressions through which the provided
+  // id_map have matching inputs (if forward), or outputs (if not forward).
+  // Returning true means the expressions are "the same", in terms they modify
+  // matching original extents, by the same amount.
+  static bool exprsMap(
+      Expr* first,
+      Expr* second,
+      bool forward,
+      const DisjointSets<IterDomain*>& id_map);
+
+  bool hasSelfMapping() const {
+    return self_mapping_info_.has_value();
+  }
+
  private:
   void build(Fusion* fusion);
 
   void initializeId(IterDomain* id, bool is_view_rfactor_id, bool is_leaf_id);
 
-  // Returns if first and second are expressions with inputs match through exact
-  // map (if forward), or outputs match (if not forward).
-  bool exprsMap(Expr* first, Expr* second, bool forward);
-
   // Checks if exprsMap then if forward will map outputs else inputs in exact
   // and permissive map.
   void mapThroughExpr(Expr* first, Expr* second, bool forward);
@@ -116,6 +126,9 @@ class TORCH_CUDA_CU_API IterDomainGraph {
   VectorOfUniqueEntries<IterDomain*> all_ids_;
 
   std::unordered_set<IterDomain*> view_rfactor_ids_;
+
+  c10::optional<std::tuple<TensorView*, IterDomain*, IterDomain*, std::string>>
+      self_mapping_info_ = c10::nullopt;
 };
 
 class TrivialReductionInfo;
diff --git a/torch/csrc/jit/codegen/cuda/contiguity.cpp b/torch/csrc/jit/codegen/cuda/contiguity.cpp
index 4817693bebdc3..dcb39d948c672 100644
--- a/torch/csrc/jit/codegen/cuda/contiguity.cpp
+++ b/torch/csrc/jit/codegen/cuda/contiguity.cpp
@@ -1,4 +1,5 @@
 #include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
 #include <torch/csrc/jit/codegen/cuda/lower2device.h>
 
 #include <torch/csrc/jit/codegen/cuda/contiguity.h>
@@ -8,20 +9,454 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
+OrderedIdInformation::OrderedIdInformation(
+    const std::vector<IterDomain*>& ids,
+    const std::vector<IterDomain*>& root_domain,
+    std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info)
+    : active_ids_(root_domain), concrete_info_(concrete_info) {
+  if (ids.empty() || root_domain.empty()) {
+    return;
+  }
+
+  // Grab root ids and initialize them.
+  for (const auto root_i : c10::irange(root_domain.size())) {
+    auto root_id = root_domain[root_i]->as<IterDomain>();
+
+    // Initialize id_to_root_ids to map roots to themselves
+    id_to_root_ids_[root_id] = {root_id};
+
+    // Initialize roots as being made up of correctly ordered transforms.
+    consistently_ordered_ids_.emplace(root_id);
+
+    exclusively_consumes_roots_.emplace(root_id);
+  }
+
+  // Iterate from the root domain to the provided ids and fill
+  // consistently_ordered_ids_, id_to_root_ids_, and exclusively_consumes_roots_
+  // for all the IDs
+  auto exprs = StmtSort::getExprsBetween(
+      ids[0]->fusion(),
+      {root_domain.begin(), root_domain.end()},
+      {ids.begin(), ids.end()});
+
+  for (auto expr : exprs) {
+    OptInDispatch::handle(expr);
+  }
+}
+
+bool OrderedIdInformation::checkExclusivelyConsumesRoots(IterDomain* id) {
+  TORCH_INTERNAL_ASSERT(
+      std::find(active_ids_.begin(), active_ids_.end(), id) !=
+          active_ids_.end(),
+      "Error replaying transforms in contiguous ID checker, expected ",
+      id->toString(),
+      " to be in the active ID set.");
+
+  auto root_id_it = id_to_root_ids_.find(id);
+  TORCH_INTERNAL_ASSERT(
+      root_id_it != id_to_root_ids_.end(),
+      "Error replaying transforms in contiguous ID checker, couldn't find mapped roots of ",
+      id->toString());
+
+  const auto& root_ids = root_id_it->second;
+
+  // Check all the roots of all other ids, to see if any root_ids in id are also
+  // in them.
+  for (auto other_active_id : active_ids_) {
+    if (other_active_id == id || other_active_id == nullptr) {
+      continue;
+    }
+
+    auto root_id_it = id_to_root_ids_.find(other_active_id);
+    TORCH_INTERNAL_ASSERT(
+        root_id_it != id_to_root_ids_.end(),
+        "Error replaying transforms in contiguous ID checker, couldn't find mapped roots of ",
+        other_active_id->toString());
+
+    const auto& other_root_ids = root_id_it->second;
+
+    for (auto other_root_id : other_root_ids) {
+      if (root_ids.has(other_root_id)) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+void OrderedIdInformation::handle(Merge* merge) {
+  // Find inputs in the active_ids_ vector
+  const auto inner_it =
+      std::find(active_ids_.begin(), active_ids_.end(), merge->inner());
+  const auto outer_it =
+      std::find(active_ids_.begin(), active_ids_.end(), merge->outer());
+
+  // If either aren't in active_ids_ it means the inputs were detected to not be
+  // ordered correctly before hitting this expression.
+  if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) {
+    return;
+  }
+
+  auto inner_pos = std::distance(active_ids_.begin(), inner_it);
+  auto outer_pos = std::distance(active_ids_.begin(), outer_it);
+
+  // Find inputs in the ordered transforms map
+  const auto inner_ordered_it = consistently_ordered_ids_.find(merge->inner());
+  const auto outer_ordered_it = consistently_ordered_ids_.find(merge->outer());
+
+  bool inner_ordered = inner_ordered_it != consistently_ordered_ids_.end();
+  bool outer_ordered = outer_ordered_it != consistently_ordered_ids_.end();
+
+  // Get root ids of the two inputs
+  const auto inner_root_ids_it = id_to_root_ids_.find(merge->inner());
+  const auto outer_root_ids_it = id_to_root_ids_.find(merge->outer());
+
+  TORCH_INTERNAL_ASSERT(
+      inner_root_ids_it != id_to_root_ids_.end() &&
+          outer_root_ids_it != id_to_root_ids_.end(),
+      "Error replaying transforms in contiguous ID checker.");
+
+  const auto& inner_root_ids = inner_root_ids_it->second;
+  const auto& outer_root_ids = outer_root_ids_it->second;
+
+  // TODO: Concretization may prevent contiguous indexing or vectorization.
+  //  It prevents contiguous indexing if the concretization is within the IDs
+  //  that are used for indexing.
+  //  For vectorization it just means we need to make sure the extents of the
+  //  axes to the right of the broadcast root domain in the contigous merge is
+  //  bigger than the vectorization dimension. And that the tensor buffer
+  //  supports the vector word size (always done).
+  bool outer_is_concretized_bcast = merge->outer()->isBroadcast() &&
+      concrete_info_->isConcretized(merge->outer());
+
+  bool inner_is_concretized_bcast = merge->inner()->isBroadcast() &&
+      concrete_info_->isConcretized(merge->inner());
+
+  // Update maps
+  // Find the position inner would have to have to be considered ordered
+  auto pos_after_outer = outer_pos + 1;
+  for (; pos_after_outer < active_ids_.size(); pos_after_outer++) {
+    if (active_ids_[pos_after_outer] == nullptr) {
+      // Can't be considered ordered after a nullptr
+      break;
+    }
+    if (active_ids_[pos_after_outer]->isReduction() ||
+        ((active_ids_[pos_after_outer]->isBroadcast() &&
+          !concrete_info_->isConcretized(active_ids_[pos_after_outer])))) {
+      // Skip reduction or broadcast axes that aren't concretized in the fusion
+      continue;
+    }
+    break;
+  }
+
+  // The output is ordered as long as the inputs were ordered and outer position
+  // is directly left of the inner position.
+  bool out_ordered = inner_ordered && outer_ordered;
+  out_ordered = out_ordered &&
+      // If inner_pos is before outer_pos it's not ordered correctly. If for
+      // some reason it's the same, that would be an error.
+      inner_pos > outer_pos &&
+      // Inner could be a broadcast, so doesn't have to be right on
+      // pos_after_outer as that ID (if it exists) should not be a broadcast.
+      // However, merging over a broadcast should be fine.
+      inner_pos <= pos_after_outer && !inner_is_concretized_bcast &&
+      !outer_is_concretized_bcast;
+
+  if (out_ordered) {
+    consistently_ordered_ids_.emplace(merge->out());
+  }
+
+  // Don't just remove active_ids_, as if we have something like:
+  //   [i0, i1, i2, i3]
+  //   ->merge(0, 2)
+  //   ->merge(1)
+  // The latter merge looks like it's ordered correctly, if we update the active
+  // map as:
+  //   [i0, i1, i2, i3] -> [i0*i2, i1, i3]
+  // Hoever if we instead mark it as:
+  //   [i0, i1, i2, i3] -> [i0*i2, i1, nullptr, i3]
+  // Or:
+  //   [i0, i1, i2, i3] -> [nullptr, i1, i0*i2, i3]
+  // It's clear the second merge is not ordered correctly. Doesn't matter which
+  // direction we put the iter domain in, prefer putting it in outer as we often
+  // are looking for inner dimensions that are contiguous. We don't want to
+  // always do this, as it could make ordered merges look non-ordered.
+  // For exmaple: [i0, i1, i2, i3]
+  // ->merge(0)
+  // ->merge(1)
+  // ->merge(0)
+  // If it's updated as:
+  // [i0, i1, i2, i3]
+  // -> [i0*i1, nullptr, i2, i3]
+  // -> [i0*i1, nullptr, i2*i3, nullptr]
+  // Now the final merge looks non-ordered but it is. So only insert a nullptr
+  // entry if the out is not ordered.
+  active_ids_[outer_pos] = merge->out();
+
+  if (!out_ordered) {
+    active_ids_[inner_pos] = nullptr;
+  } else {
+    active_ids_.erase(active_ids_.begin() + inner_pos);
+    for (auto i = outer_pos + 1; i < inner_pos; i++) {
+      // If there's broadcast axes between outer and inner and the merge was
+      // contiguous, there may be broadcasts between outer and inner that cannot
+      // be ordered merged anywhere else so remove them.
+      active_ids_.erase(active_ids_.begin() + outer_pos + 1);
+    }
+  }
+
+  // Update the root_id entry for the output.
+  VectorOfUniqueEntries<IterDomain*> root_ids = inner_root_ids;
+  root_ids.pushBack(outer_root_ids);
+
+  id_to_root_ids_[merge->out()] = root_ids;
+
+  // Need to check this after updating active_ids_ and id_to_root_ids_
+  if (checkExclusivelyConsumesRoots(merge->out())) {
+    exclusively_consumes_roots_.emplace(merge->out());
+  }
+}
+
+void OrderedIdInformation::handle(Split* split) {
+  // Find the input in the active_ids_ vector
+  const auto in_it =
+      std::find(active_ids_.begin(), active_ids_.end(), split->in());
+
+  if (in_it == active_ids_.end()) {
+    return;
+  }
+
+  auto in_pos = std::distance(active_ids_.begin(), in_it);
+
+  // Find the input in the ordered transforms map
+  const auto in_ordered_it = consistently_ordered_ids_.find(split->in());
+
+  bool in_ordered = in_ordered_it != consistently_ordered_ids_.end();
+
+  // Get root ids of the input
+  const auto in_root_ids_it = id_to_root_ids_.find(split->in());
+
+  TORCH_INTERNAL_ASSERT(
+      in_root_ids_it != id_to_root_ids_.end(),
+      "Error replaying transforms in contiguous ID checker.");
+
+  VectorOfUniqueEntries<IterDomain*> in_root_ids = in_root_ids_it->second;
+
+  // Update map for outputs
+  // Remove inputs from the active_ids_ and insert the output ID
+  active_ids_[in_pos] = split->outer();
+  active_ids_.insert(active_ids_.begin() + in_pos + 1, split->inner());
+
+  // The outputs are ordered as long as the input is ordered.
+  if (in_ordered) {
+    consistently_ordered_ids_.emplace(split->outer());
+    consistently_ordered_ids_.emplace(split->inner());
+  }
+
+  // Update the root_id entry for the outputs.
+  id_to_root_ids_[split->outer()] = in_root_ids;
+  id_to_root_ids_[split->inner()] = in_root_ids;
+}
+
+// Swizzle generally can't be contiguous because of the non-affine nature of it,
+// but we can still analyze the operation in the same way as merge/split.
+void OrderedIdInformation::handle(Swizzle2D* swizzle) {
+  // Find inputs in the active_ids_ vector
+  const auto in_x_it =
+      std::find(active_ids_.begin(), active_ids_.end(), swizzle->inX());
+  const auto in_y_it =
+      std::find(active_ids_.begin(), active_ids_.end(), swizzle->inY());
+
+  if (in_x_it == active_ids_.end() || in_y_it == active_ids_.end()) {
+    return;
+  }
+
+  auto in_x_pos = std::distance(active_ids_.begin(), in_x_it);
+  auto in_y_pos = std::distance(active_ids_.begin(), in_y_it);
+
+  // Find inputs in the ordered transforms map
+  const auto in_x_ordered_it = consistently_ordered_ids_.find(swizzle->inX());
+  const auto in_y_ordered_it = consistently_ordered_ids_.find(swizzle->inY());
+
+  bool in_x_ordered = in_x_ordered_it != consistently_ordered_ids_.end();
+  bool in_y_ordered = in_y_ordered_it != consistently_ordered_ids_.end();
+
+  // Get root ids of the two inputs
+  const auto in_x_root_ids_it = id_to_root_ids_.find(swizzle->inX());
+  const auto in_y_root_ids_it = id_to_root_ids_.find(swizzle->inY());
+
+  TORCH_INTERNAL_ASSERT(
+      in_x_root_ids_it != id_to_root_ids_.end() &&
+          in_y_root_ids_it != id_to_root_ids_.end(),
+      "Error replaying transforms in contiguous ID checker.");
+
+  const auto& in_x_root_ids = in_x_root_ids_it->second;
+  const auto& in_y_root_ids = in_y_root_ids_it->second;
+
+  // Update map for outputs
+  // Remove inputs from the active_ids_ and insert the output ID
+  active_ids_[in_x_pos] = swizzle->outX();
+  active_ids_[in_y_pos] = swizzle->outY();
+
+  // In the case of no real swizzle we can forward properties on each domain
+  // independently.
+  if (swizzle->swizzleType() == Swizzle2DType::NoSwizzle) {
+    if (in_x_ordered) {
+      consistently_ordered_ids_.emplace(swizzle->outX());
+    }
+
+    if (exclusivelyConsumesRoots(swizzle->inX())) {
+      exclusively_consumes_roots_.emplace(swizzle->outX());
+    }
+
+    if (in_y_ordered) {
+      consistently_ordered_ids_.emplace(swizzle->outY());
+    }
+
+    if (exclusivelyConsumesRoots(swizzle->inY())) {
+      exclusively_consumes_roots_.emplace(swizzle->outY());
+    }
+
+    id_to_root_ids_[swizzle->outX()] = in_x_root_ids;
+    id_to_root_ids_[swizzle->outY()] = in_y_root_ids;
+  } else {
+    VectorOfUniqueEntries<IterDomain*> root_ids = in_x_root_ids;
+    root_ids.pushBack(in_y_root_ids);
+    id_to_root_ids_[swizzle->outX()] = root_ids;
+    id_to_root_ids_[swizzle->outY()] = root_ids;
+  }
+}
+
+NonDivisibleSplitDependencies::NonDivisibleSplitDependencies(
+    // TODO: Revisit reduction rfactor axes and propagation. Should probably use
+    // ca_map to propogate non divisibility dependencies across exact map. Still
+    // need to think through divisible split and non divisible dependencies to
+    // see if there's conflicts where a split might look non divisible but
+    // actually is divisible and one's overruling the other.
+    const std::vector<IterDomain*>& ids,
+    const std::vector<IterDomain*>& root_domain,
+    const std::unordered_set<Split*>& divisible_splits) {
+  if (ids.empty() || root_domain.empty()) {
+    return;
+  }
+  auto transforms = StmtSort::getExprsBetween(
+      ids[0]->fusion(),
+      {root_domain.begin(), root_domain.end()},
+      {ids.begin(), ids.end()});
+  for (auto transform : transforms) {
+    auto inp_ids = ir_utils::filterByType<IterDomain>(transform->inputs());
+    for (auto inp_id : inp_ids) {
+      if (std::find(root_domain.begin(), root_domain.end(), inp_id) !=
+          root_domain.end()) {
+        // This generally shouldn't happen as there shouldn't be
+        // transformations before the root ids, but in case for some reason
+        // we eventually do have cases like that, we should reset the
+        // root_ids if for some reason they've been placed in the non
+        // divisible split set.
+        depends_on_non_divisible_split.erase(inp_id);
+      }
+    }
+
+    bool inputs_non_divisible =
+        std::any_of(inp_ids.begin(), inp_ids.end(), [this](IterDomain* inp_id) {
+          return depends_on_non_divisible_split.find(inp_id) !=
+              depends_on_non_divisible_split.end();
+        });
+
+    auto out_ids = ir_utils::filterByType<IterDomain>(transform->outputs());
+
+    if (inputs_non_divisible) {
+      // If any inputs are known to be dependent on a divisible split
+      // Mark outputs as dependent on a non_divisible split
+      depends_on_non_divisible_split.insert(out_ids.begin(), out_ids.end());
+      continue;
+    }
+
+    if (!transform->isA<Split>()) {
+      continue;
+    }
+
+    auto split = transform->as<Split>();
+    // If this transform is a non-divisible split
+    if (divisible_splits.find(split) == divisible_splits.end()) {
+      // Mark outputs as dependent on a non_divisible split
+      auto out_ids = ir_utils::filterByType<IterDomain>(transform->outputs());
+      depends_on_non_divisible_split.insert(out_ids.begin(), out_ids.end());
+    }
+  }
+}
+
+ContigIDs::ContigIDs(
+    const std::vector<IterDomain*>& ids,
+    const std::vector<IterDomain*>& root_domain,
+    const std::vector<bool>& root_contiguity,
+    const std::unordered_set<IterDomain*>& final_ids,
+    const std::unordered_map<IterDomain*, Val*>& index_map,
+    const std::unordered_set<Split*>& divisible_splits,
+    std::unordered_map<IterDomain*, IterDomain*> p2c_id_map,
+    bool ignore_indexability,
+    bool ignore_consistent_ordering)
+    : root_domain_(root_domain),
+      root_contiguity_(root_contiguity),
+      final_ids_(final_ids),
+      index_map_(index_map),
+      divisible_splits_(divisible_splits),
+      p2c_id_map_(std::move(p2c_id_map)),
+      ignore_indexability_(ignore_indexability),
+      ignore_consistent_ordering_(ignore_consistent_ordering),
+      non_divisible_id_info_(ids, root_domain_, divisible_splits_) {
+  if (ids.size() > 0) {
+    // This constructor doesn't provide the following information so it needs to
+    // be built.
+    ca_map_ = std::make_shared<ComputeAtMap>(ids[0]->fusion());
+    halo_info_ = std::make_shared<HaloInfo>(ids[0]->fusion(), ca_map_);
+    concrete_info_ =
+        std::make_shared<ConcretizedBroadcastDomains>(ids[0]->fusion());
+
+    consistent_transform_info_ = std::make_unique<const OrderedIdInformation>(
+        ids, root_domain, concrete_info_);
+  }
+  build(ids);
+}
+
 ContigIDs::ContigIDs(
     const std::vector<IterDomain*>& ids,
     const std::vector<IterDomain*>& root_domain,
     const std::vector<bool>& root_contiguity,
-    std::unordered_map<IterDomain*, IterDomain*> concrete_to_ref,
+    const std::unordered_set<IterDomain*>& final_ids,
+    const std::unordered_map<IterDomain*, Val*>& index_map,
+    const std::unordered_set<Split*>& divisible_splits,
+    std::shared_ptr<const ComputeAtMap> ca_map,
+    std::shared_ptr<const HaloInfo> halo_info,
+    std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info,
     std::unordered_map<IterDomain*, IterDomain*> p2c_id_map,
-    bool ignore_halo_constraint,
-    bool ignore_indexability)
+    bool ignore_indexability,
+    bool ignore_consistent_ordering)
     : root_domain_(root_domain),
       root_contiguity_(root_contiguity),
-      concrete_to_ref_(std::move(concrete_to_ref)),
+      final_ids_(final_ids),
+      index_map_(index_map),
+      divisible_splits_(divisible_splits),
+      ca_map_(ca_map),
+      halo_info_(halo_info),
+      concrete_info_(concrete_info),
       p2c_id_map_(std::move(p2c_id_map)),
-      ignore_indexability_(ignore_indexability) {
-  if (ids.empty()) {
+      ignore_indexability_(ignore_indexability),
+      ignore_consistent_ordering_(ignore_consistent_ordering),
+      consistent_transform_info_(std::make_unique<const OrderedIdInformation>(
+          ids,
+          root_domain,
+          concrete_info_)),
+      non_divisible_id_info_(ids, root_domain, divisible_splits_) {
+  build(ids);
+}
+
+ContigIDs ContigIDs::getNonContigIDs() {
+  return ContigIDs({}, {}, {}, {}, {}, {});
+}
+
+void ContigIDs::build(const std::vector<IterDomain*>& ids) {
+  if (ids.empty() || root_domain_.empty()) {
     return;
   }
 
@@ -32,35 +467,29 @@ ContigIDs::ContigIDs(
       " != ",
       root_contiguity_.size());
 
-  // GpuLower is required to honor halo constraints
-  if (!ignore_halo_constraint) {
-    TORCH_INTERNAL_ASSERT(GpuLower::hasCurrent(), "GpuLower not found");
-  }
-
-  for (const auto i : c10::irange(root_domain_.size())) {
-    auto root_domain_i = root_domain_[i]->as<IterDomain>();
-    root_to_indexed_id_[root_domain_i] = root_domain_i;
+  for (const auto root_domain_i : c10::irange(root_domain_.size())) {
+    auto root_domain_id = root_domain_[root_domain_i]->as<IterDomain>();
+    root_to_indexed_id_[root_domain_id] = root_domain_id;
     // Initialize to false
-    is_contig_root_[root_domain_i] = false;
+    is_contig_root_[root_domain_id] = false;
     // If a root domain has halo, can't use merged domain even if
     // both inputs are contiguous. HaloInfo is also initialized for
     // rfactor root domains, which should just return "zero"
     // RootAxisInfo. This should be safe as no rfactor tensor should
     // need halo.
-    if (root_contiguity_[i] &&
-        (ignore_halo_constraint ||
-         !GpuLower::current()
-              ->haloInfo()
-              .getRootAxisInfo(root_domain_i)
-              .hasHalo())) {
-      contig_ids_.emplace(root_domain_i);
-      is_contig_root_[root_domain_i] = true;
-      within_contig_ids_[root_domain_i] = std::unordered_set<IterDomain*>();
+    if (root_contiguity_[root_domain_i] &&
+        !halo_info_->getRootAxisInfo(root_domain_id).hasHalo()) {
+      contig_ids_.emplace(root_domain_id);
+      is_contig_root_[root_domain_id] = true;
+      within_contig_ids_[root_domain_id] = std::unordered_set<IterDomain*>();
     }
   }
 
   if (!contig_ids_.empty()) {
-    auto exprs = StmtSort::getExprs(ids[0]->fusion(), {ids.begin(), ids.end()});
+    auto exprs = StmtSort::getExprsBetween(
+        ids[0]->fusion(),
+        {root_domain_.begin(), root_domain_.end()},
+        {ids.begin(), ids.end()});
     for (auto expr : exprs) {
       handle(expr);
     }
@@ -68,114 +497,99 @@ ContigIDs::ContigIDs(
 }
 
 void ContigIDs::handle(Merge* merge) {
-  // If either input is non-contiguous so is output.
-  const auto inner = merge->inner();
-  const auto outer = merge->outer();
-  const auto out = merge->out();
+  // If output is not consistently ordered or doesn't solely consume all root
+  // domains in its dependencies, then it can't be a contiguously indexable
+  // iterdomain.
+  if (!(ignore_consistent_ordering_ ||
+        consistent_transform_info_->isConsistentlyOrdered(merge->out()))) {
+    return;
+  }
 
-  if (!isContig(inner) || !isContig(outer)) {
+  if (!consistent_transform_info_->exclusivelyConsumesRoots(merge->out())) {
     return;
   }
 
-  // Stop contig merging if the merge output is not indexable.
-  if (!ignore_indexability_ && !isIndexable(out)) {
+  // If output is not "directly indexable" then it's definitely not contiguously
+  // indexable.
+  if (!ignore_indexability_ && !isIndexable(merge->out())) {
     return;
   }
 
-  // Grab inputs, make sure they're in root domain, check if they're
-  // contiguous.
+  // If inputs are marked as final, stop
+  if (final_ids_.count(merge->inner()) || final_ids_.count(merge->outer())) {
+    return;
+  }
 
-  auto lhs_inputs =
-      ir_utils::iterDomainInputsOfOrderedAs({outer}, root_domain_);
-  auto rhs_inputs =
-      ir_utils::iterDomainInputsOfOrderedAs({inner}, root_domain_);
+  // Check root domains for contiguity
+  auto root_ids_it =
+      consistent_transform_info_->idToRootIds().find(merge->out());
 
   TORCH_INTERNAL_ASSERT(
-      inRoot(lhs_inputs) && inRoot(rhs_inputs),
-      "Found an invalid merge operation, inputs of its arguments are not in the root domain.");
-
-  std::deque<IterDomain*> ordered_inputs(lhs_inputs.begin(), lhs_inputs.end());
-  ordered_inputs.insert(
-      ordered_inputs.end(), rhs_inputs.begin(), rhs_inputs.end());
-
-  // If any root input is not contig, output is not contig
-  if (!(std::all_of(
-          ordered_inputs.begin(), ordered_inputs.end(), [this](IterDomain* id) {
-            // Allow reduction tensors in contiguity check since we're using
-            // this to check contiguous vectors of reference tensors in
-            // schedulers (to set vectorization sizes), those reference tensors
-            // may have reduction dims, don't bail on contiguity just because
-            // it's a reduction dimension.
-            return is_contig_root_.at(id);
-          }))) {
-    return;
-  }
+      root_ids_it != consistent_transform_info_->idToRootIds().end(),
+      "\nError in contiguous analysis, merge info doesn't exist for:\n",
+      merge->toString(),
+      "\nId: ",
+      merge->out()->toString());
 
-  std::deque<IterDomain*> root_copy(root_domain_.begin(), root_domain_.end());
+  VectorOfUniqueEntries<IterDomain*> root_ids = root_ids_it->second;
 
-  // Forward to first matching argument
-  while (!root_copy.empty() && !ordered_inputs.empty()) {
-    if (root_copy.front() != ordered_inputs.front()) {
-      root_copy.pop_front();
-    } else {
-      break;
-    }
-  }
+  bool is_indexing_pass = !ignore_consistent_ordering_;
 
-  // Forward through all matching arguments
-  while (!root_copy.empty() && !ordered_inputs.empty()) {
-    if (root_copy.front() == ordered_inputs.front()) {
-      root_copy.pop_front();
-      ordered_inputs.pop_front();
-    } else if (
-        root_copy.front()->isReduction() || root_copy.front()->isBroadcast()) {
-      // This was a cause of an error with
-      // ReductionSchedulerMultiDimNonFastest. The test no longer
-      // fails.
-      root_copy.pop_front();
-    } else {
-      break;
+  IterDomain* last_root = nullptr;
+  for (auto root_id_i : c10::irange(root_domain_.size())) {
+    auto root_id = root_domain_[root_id_i];
+    if (root_ids.has(root_id)) {
+      // ID found, remove it
+      root_ids.erase(root_id);
+      // If we're indexing:
+      // we could still potentially consider this ID linearly indexable, as we
+      // could multiple the index by the last root's stride.
+      //
+      // If we're computing predicates (ignore_consistent_ordering_==true),
+      // then we don't have this same constraint, we can just ignore
+      // contiguity of the roots all together.
+      if (!root_contiguity_[root_id_i] && is_indexing_pass) {
+        if (!root_ids.empty()) {
+          return;
+        }
+      }
+      last_root = root_id;
     }
   }
 
-  // If we matched all inputs, the output is contiguous. Only want to keep the
-  // top contig ID, lower ids should be placed in the "within_contig_ids" map
-  // of top id.
-  if (ordered_inputs.empty()) {
-    if (contig_ids_.find(inner) != contig_ids_.end()) {
-      contig_ids_.erase(inner);
-    }
+  // If there's a non_divisible split in the history of merge->out then it can't
+  // be contiguously indexable.
+  if (non_divisible_id_info_.dependsOnNonDivisibleSplit(merge->out())) {
+    return;
+  }
 
-    if (contig_ids_.find(outer) != contig_ids_.end()) {
-      contig_ids_.erase(outer);
-    }
+  // Now we know merge->out is a contiguously indexable ID
 
-    contig_ids_.emplace(out);
+  TORCH_INTERNAL_ASSERT(
+      last_root != nullptr,
+      "Issue processing root ids for ",
+      merge->out()->toString());
 
-    std::unordered_set<IterDomain*> within_out;
-    within_out.emplace(inner);
-    if (within_contig_ids_.find(inner) != within_contig_ids_.end()) {
-      auto in_inner = within_contig_ids_.at(inner);
-      within_out.insert(in_inner.begin(), in_inner.end());
-      within_contig_ids_.erase(inner);
-    }
+  // Reset root_ids
+  root_ids = root_ids_it->second;
+  for (auto root_id : root_ids) {
+    root_to_indexed_id_[root_id] = merge->out();
+  }
 
-    within_out.emplace(outer);
-    if (within_contig_ids_.find(outer) != within_contig_ids_.end()) {
-      auto in_outer = within_contig_ids_.at(outer);
-      within_out.insert(in_outer.begin(), in_outer.end());
-      within_contig_ids_.erase(outer);
-    }
+  auto all_within_vals = DependencyCheck::getAllValsBetween(
+      {root_domain_.begin(), root_domain_.end()}, {merge->out()});
+  auto all_within_ids = ir_utils::filterByType<IterDomain>(all_within_vals);
 
-    within_contig_ids_[out] = within_out;
+  std::unordered_set<IterDomain*> within_id_set(
+      all_within_ids.begin(), all_within_ids.end());
 
-    for (auto root : lhs_inputs) {
-      root_to_indexed_id_[root] = out;
-    }
-    for (auto root : rhs_inputs) {
-      root_to_indexed_id_[root] = out;
-    }
+  within_id_set.erase(merge->out());
+  within_contig_ids_[merge->out()] = within_id_set;
+  for (auto id : all_within_ids) {
+    contig_ids_.erase(id);
   }
+
+  contig_ids_.emplace(merge->out());
 }
 
 IterDomain* ContigIDs::getMappedId(IterDomain* id) const {
@@ -187,24 +601,16 @@ IterDomain* ContigIDs::getMappedId(IterDomain* id) const {
   }
 }
 
-IterDomain* ContigIDs::getCAIndexConcreteId(IterDomain* id) const {
-  TORCH_INTERNAL_ASSERT(
-      GpuLower::current() != nullptr, "GpuLower is not found");
-
-  auto c_id = GpuLower::current()->caMap()->getConcreteMappedID(
-      getMappedId(id), IdMappingMode::EXACT);
-  return c_id;
-}
-
 bool ContigIDs::isIndexable(IterDomain* id) const {
   // If ID is mapped to consumer through persmissive map but not exact map it
   // will not be mapped through to the exact map through the p2c map. Therefore
   // reject because it involves broadcast resolution.
-  if (!GpuLower::current()->caMap()->idExistsInMap(getMappedId(id))) {
+  if (!ca_map_->idExistsInMap(getMappedId(id))) {
     return false;
   }
-  auto c_id = getCAIndexConcreteId(id);
-  return concrete_to_ref_.find(c_id) != concrete_to_ref_.end();
+  auto c_id =
+      ca_map_->getConcreteMappedID(getMappedId(id), IdMappingMode::EXACT);
+  return index_map_.find(c_id) != index_map_.end();
 }
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/contiguity.h b/torch/csrc/jit/codegen/cuda/contiguity.h
index 7293901310eb6..e3be65a5bbc08 100644
--- a/torch/csrc/jit/codegen/cuda/contiguity.h
+++ b/torch/csrc/jit/codegen/cuda/contiguity.h
@@ -2,13 +2,128 @@
 
 #include <c10/macros/Export.h>
 
+#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
+#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
 #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/lower_shift.h>
+#include <torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h>
 
 namespace torch {
 namespace jit {
 namespace fuser {
 namespace cuda {
 
+// Goes through the transformations associated with a series of ids and root
+// ids. Checks the ordering of the iteration domains through these operations to
+// pick out which operations are consistently ordered. For example:
+// [i0, i1, i2]
+// ->split(0, 4)->merge(1)->merge(1)->merge(0)
+// are consistently ordered from largest to smallest extents, but
+// ->split(0, 4)->merge(1)->merge(0, 2)->merge(0) is not consistently ordered
+// with the roots.
+//
+// This property is important to understand the contiguity of dimensions through
+// complex transformations.
+class OrderedIdInformation : public OptInDispatch {
+ public:
+  OrderedIdInformation() = delete;
+
+  OrderedIdInformation(
+      const std::vector<IterDomain*>& ids,
+      const std::vector<IterDomain*>& root_domain,
+      std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info);
+
+  const std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>&
+  idToRootIds() const {
+    return id_to_root_ids_;
+  }
+
+  bool isConsistentlyOrdered(IterDomain* id) const {
+    return consistently_ordered_ids_.find(id) !=
+        consistently_ordered_ids_.end();
+  }
+
+  bool exclusivelyConsumesRoots(IterDomain* id) const {
+    return exclusively_consumes_roots_.find(id) !=
+        exclusively_consumes_roots_.end();
+  }
+
+ private:
+  // Returns if the id in active_ids should be in exclusively_consumes_roots_
+  bool checkExclusivelyConsumesRoots(IterDomain* id);
+
+  void handle(Split*) override;
+
+  void handle(Merge* merge) override;
+
+  void handle(Swizzle2D* swizzle) override;
+
+  // Track which root ids were used to generate each iter domain
+  std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>
+      id_to_root_ids_;
+
+  // Track all IterDomains that have correct ordered transforms for contiguity.
+  // i.e. if we have:
+  //
+  // root = [i0, i1, i2]
+  // i3 = merge(i0, i2)
+  // would not be consistently ordered transformed
+  //
+  // root = [i0, i1, i2]
+  // i4, i5 = spit(merge(merge(i0, i1), i2), 4)
+  // would be consistently ordered transforms
+  //
+  // root = [i0, i1, i2, i3]
+  // i4 = merge(i1, i2) would also be consistently ordered transformed
+  std::unordered_set<IterDomain*> consistently_ordered_ids_;
+
+  // Active series of IterDomains that are updated while we're processing the
+  // domain. Helps us identify which ids are consistently_ordered_ids_. Used
+  // for intermediate storage, not to return.
+  std::vector<IterDomain*> active_ids_;
+
+  // IterDomains in this set exclusively consume all the uses of their roots.
+  // For example:
+  // [i0, i1] split(0, f)->merge(1)
+  // [ceilDiv(i0, f), f*i1]
+  // neither iter domains exclusively consume the roots. With another:
+  // merge(0) -> [ceilDiv(i0, f)*f*i1]
+  // The resulting iter domain does exclusively consume the roots.
+  //
+  // Also:
+  // [i0, i1, i2, i3] merge(1)->merge(1)
+  // ->[i0, i1*i2*i3]
+  // both resulting iter domains do exclusively consume their roots
+  std::unordered_set<IterDomain*> exclusively_consumes_roots_;
+
+  // Broadcast domains that are concretized cannot be considered contiguously
+  // indexable.
+  // TODO: This constraint is more conservative than necessary as it's only if
+  // the domain is concretized within the local indexing, not in the entire
+  // fusion.
+  std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info_;
+};
+
+// Based on provided divisible split set, goes through expressions and marks all
+// IterDomains that are dependent on a non-divisible split.
+class NonDivisibleSplitDependencies : public OptInDispatch {
+ public:
+  NonDivisibleSplitDependencies() = delete;
+
+  NonDivisibleSplitDependencies(
+      const std::vector<IterDomain*>& ids,
+      const std::vector<IterDomain*>& root_domain,
+      const std::unordered_set<Split*>& divisible_splits);
+
+  bool dependsOnNonDivisibleSplit(IterDomain* id) const {
+    return depends_on_non_divisible_split.find(id) !=
+        depends_on_non_divisible_split.end();
+  }
+
+ private:
+  std::unordered_set<IterDomain*> depends_on_non_divisible_split;
+};
+
 // A merge is contiguous if:
 //   Inputs of outer are to the left in the root domain of the inputs of RHS.
 //   All inputs are contiguous in the root domain:
@@ -22,8 +137,6 @@ namespace cuda {
 
 class ContigIDs : public OptInDispatch {
  public:
-  ContigIDs() = delete;
-
   //! Check through the history of ids whose inputs map to root_domain with
   //! contiguity root_contiguity. Return unordered_set of all merges that are
   //! contiguous. Ignore root order is primarily used for predicate generation.
@@ -42,21 +155,55 @@ class ContigIDs : public OptInDispatch {
   //! If ignore_indexability and ignore_halo_constraint are true,
   //! ignore the constraint on indexing and halo, respectively. It is
   //! the caller that is responsible for its correctness.
-  //!
-  //! The function interface with many parameters looks ugly, but it
-  //! is also important to make ignore_indexability and
-  //! ignore_halo_constraint explicit to avoid any surprise.
-  //!
   //! Not really sure why but clang-tidy only complains about
   //! std::unordered_map if passed as a const reference.
   ContigIDs(
       const std::vector<IterDomain*>& ids,
       const std::vector<IterDomain*>& root_domain,
       const std::vector<bool>& root_contiguity,
-      std::unordered_map<IterDomain*, IterDomain*> concrete_to_ref,
+      const std::unordered_set<IterDomain*>& final_ids,
+      const std::unordered_map<IterDomain*, Val*>& index_map,
+      const std::unordered_set<Split*>& divisible_splits,
+      std::unordered_map<IterDomain*, IterDomain*> p2c_id_map = {},
+      bool ignore_indexability = false,
+      bool ignore_consistent_ordering = false);
+
+  //! \param ids IterDomains on the leaves of the domain we're looking for
+  //! contiguous indexing into.
+  //! \param root_domain the root domain of the domain we're looking for
+  //! contiguous indexing into.
+  //! \param root_contiguity the contiguity of the root_domain.
+  //! \param concrete_to_ref concrete ids of the exact map that the reference
+  //! index is using for indexing.
+  //! \param divisible_splits a set of all splits in the fusion that are
+  //! divisible.
+  //! \param ca_map compute at map of the fusion.
+  //! \param halo_info halo information of the fusion.
+  //! \param concrete_info concretized broadcast information of the fusion.
+  //! \param p2c_id_map map from producer to consumer ids used for indexing
+  //! producer tensors.
+  //! \param ignore_consistent_ordering true for actual indexing into tensors
+  //! but false for predicate analysis. Ordering of merges don't matter for
+  //! predicate generation as they don't map to a physical address.
+  //! \param ignore_indexability can only be true if providing a real
+  //! concrete_to_ref map. As what it's checking is if the index is actually
+  //! indexable based on the reference.
+  ContigIDs(
+      const std::vector<IterDomain*>& ids,
+      const std::vector<IterDomain*>& root_domain,
+      const std::vector<bool>& root_contiguity,
+      const std::unordered_set<IterDomain*>& final_ids,
+      const std::unordered_map<IterDomain*, Val*>& index_map,
+      const std::unordered_set<Split*>& divisible_splits,
+      std::shared_ptr<const ComputeAtMap> ca_map,
+      std::shared_ptr<const HaloInfo> halo_info,
+      std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info,
       std::unordered_map<IterDomain*, IterDomain*> p2c_id_map = {},
       bool ignore_indexability = false,
-      bool ignore_halo_constraint = false);
+      bool ignore_consistent_ordering = false);
+
+  //! Return an empty ContigIDs with no contiguous ID
+  static ContigIDs getNonContigIDs();
 
   const std::unordered_set<IterDomain*>& contigIDs() const {
     return contig_ids_;
@@ -71,6 +218,14 @@ class ContigIDs : public OptInDispatch {
     return root_to_indexed_id_;
   }
 
+  VectorOfUniqueEntries<IterDomain*> indexedRootIDs(IterDomain* id) const {
+    auto root_ids_it = consistent_transform_info_->idToRootIds().find(id);
+    if (root_ids_it == consistent_transform_info_->idToRootIds().end()) {
+      return {};
+    }
+    return root_ids_it->second;
+  }
+
  private:
   using OptInDispatch::handle;
 
@@ -107,17 +262,32 @@ class ContigIDs : public OptInDispatch {
   IterDomain* getMappedId(IterDomain* id) const;
 
  private:
+  void build(const std::vector<IterDomain*>& ids);
+
   //! Root domains to analyze contiguity
   const std::vector<IterDomain*>& root_domain_;
   //! Contiguity of root_domain_
   const std::vector<bool>& root_contiguity_;
-  //! Mapping of concrete to reference domains. If a concrete domain
-  //! is not mapped, it is not indexable as there's no mapped index.
-  const std::unordered_map<IterDomain*, IterDomain*> concrete_to_ref_;
+  //! Domains where indexing/predicates cannot be done with their
+  //! consumers domains
+  const std::unordered_set<IterDomain*>& final_ids_;
+  //! Mapping of concrete domains to indices. Just used to check if
+  //! there's an index for an IterDomain.
+  const std::unordered_map<IterDomain*, Val*> index_map_;
+  // Divisible split information as we can still consider iter domains
+  // contiguous through divisible splits.
+  const std::unordered_set<Split*>& divisible_splits_;
+
+  std::shared_ptr<const ComputeAtMap> ca_map_;
+  std::shared_ptr<const HaloInfo> halo_info_;
+  std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info_;
+
   //! Producer-to-consumer index map in the case of analyzing replayed
   //! producer tensors
   const std::unordered_map<IterDomain*, IterDomain*> p2c_id_map_;
+
   const bool ignore_indexability_ = false;
+  const bool ignore_consistent_ordering_ = false;
 
   //! Mapping of root domain to bool indicating contiguity
   std::unordered_map<IterDomain*, bool> is_contig_root_;
@@ -129,6 +299,10 @@ class ContigIDs : public OptInDispatch {
   //! Mapping of root domain to the actual indexed domain, which can
   //! be itself or a contig merged domain if found.
   std::unordered_map<IterDomain*, IterDomain*> root_to_indexed_id_;
+
+  std::unique_ptr<const OrderedIdInformation> consistent_transform_info_;
+
+  NonDivisibleSplitDependencies non_divisible_id_info_;
 };
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/disjoint_set.h b/torch/csrc/jit/codegen/cuda/disjoint_set.h
index b325bedcf7b9e..09cf6e8de9504 100644
--- a/torch/csrc/jit/codegen/cuda/disjoint_set.h
+++ b/torch/csrc/jit/codegen/cuda/disjoint_set.h
@@ -302,17 +302,14 @@ class DisjointSets {
   std::string toString() const {
     std::stringstream ss;
     ss << "disjoint sets{\n";
+    const std::string sep("  ");
     for (auto s_ptr : disjoint_sets_) {
       auto& set = *s_ptr;
-      ss << "  { ";
+      ss << sep << "{\n";
       for (auto entry : set.vector()) {
-        ss << abstractToString(entry);
-        // DomainKey defines == but not !=
-        if (!(entry == set.back())) {
-          ss << "; ";
-        }
+        ss << sep << sep << abstractToString(entry) << "\n";
       }
-      ss << " }\n";
+      ss << sep << "}\n";
     }
     ss << "}";
     return ss.str();
diff --git a/torch/csrc/jit/codegen/cuda/dispatch.cpp b/torch/csrc/jit/codegen/cuda/dispatch.cpp
index 7f66d3c69495c..70e9ae16375e5 100644
--- a/torch/csrc/jit/codegen/cuda/dispatch.cpp
+++ b/torch/csrc/jit/codegen/cuda/dispatch.cpp
@@ -95,9 +95,15 @@ void Val::dispatch(T handler, Val* val) {
 template <typename T>
 void Expr::dispatch(T handler, Expr* expr) {
   switch (*(expr->getExprType())) {
+    case ExprType::FullOp:
+      ptr(handler)->handle(expr->as<FullOp>());
+      return;
     case ExprType::ARangeOp:
       ptr(handler)->handle(expr->as<ARangeOp>());
       return;
+    case ExprType::EyeOp:
+      ptr(handler)->handle(expr->as<EyeOp>());
+      return;
     case ExprType::UnaryOp:
       ptr(handler)->handle(expr->as<UnaryOp>());
       return;
@@ -281,9 +287,15 @@ void Val::constDispatch(T handler, const Val* val) {
 template <typename T>
 void Expr::constDispatch(T handler, const Expr* expr) {
   switch (*(expr->getExprType())) {
+    case ExprType::FullOp:
+      ptr(handler)->handle(expr->as<FullOp>());
+      return;
     case ExprType::ARangeOp:
       ptr(handler)->handle(expr->as<ARangeOp>());
       return;
+    case ExprType::EyeOp:
+      ptr(handler)->handle(expr->as<EyeOp>());
+      return;
     case ExprType::UnaryOp:
       ptr(handler)->handle(expr->as<UnaryOp>());
       return;
@@ -475,9 +487,15 @@ void Val::mutatorDispatch(T mutator, Val* val) {
 template <typename T>
 void Expr::mutatorDispatch(T mutator, Expr* expr) {
   switch (*(expr->getExprType())) {
+    case ExprType::FullOp:
+      ptr(mutator)->mutate(expr->as<FullOp>());
+      return;
     case ExprType::ARangeOp:
       ptr(mutator)->mutate(expr->as<ARangeOp>());
       return;
+    case ExprType::EyeOp:
+      ptr(mutator)->mutate(expr->as<EyeOp>());
+      return;
     case ExprType::UnaryOp:
       ptr(mutator)->mutate(expr->as<UnaryOp>());
       return;
@@ -734,9 +752,15 @@ void OptOutConstDispatch::handle(const kir::IntPair* stmt) {
 }
 
 // Exprs
+void OptOutConstDispatch::handle(const FullOp* stmt) {
+  unhandled(stmt);
+}
 void OptOutConstDispatch::handle(const ARangeOp* stmt) {
   unhandled(stmt);
 }
+void OptOutConstDispatch::handle(const EyeOp* stmt) {
+  unhandled(stmt);
+}
 void OptOutConstDispatch::handle(const UnaryOp* stmt) {
   unhandled(stmt);
 }
@@ -890,9 +914,15 @@ void OptOutDispatch::handle(kir::IntPair* stmt) {
 }
 
 // Exprs
+void OptOutDispatch::handle(FullOp* stmt) {
+  unhandled(stmt);
+}
 void OptOutDispatch::handle(ARangeOp* stmt) {
   unhandled(stmt);
 }
+void OptOutDispatch::handle(EyeOp* stmt) {
+  unhandled(stmt);
+}
 void OptOutDispatch::handle(UnaryOp* stmt) {
   unhandled(stmt);
 }
diff --git a/torch/csrc/jit/codegen/cuda/dispatch.h b/torch/csrc/jit/codegen/cuda/dispatch.h
index 6b35a9775ecf7..4fea698191ec4 100644
--- a/torch/csrc/jit/codegen/cuda/dispatch.h
+++ b/torch/csrc/jit/codegen/cuda/dispatch.h
@@ -68,7 +68,9 @@ class ComplexDouble;
 class NamedScalar;
 
 // Exprs
+class FullOp;
 class ARangeOp;
+class EyeOp;
 class UnaryOp;
 class BinaryOp;
 class TernaryOp;
@@ -144,7 +146,9 @@ class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase {
   virtual void handle(const kir::IntPair*);
 
   // Exprs
+  virtual void handle(const FullOp* stmt);
   virtual void handle(const ARangeOp* stmt);
+  virtual void handle(const EyeOp* stmt);
   virtual void handle(const UnaryOp* stmt);
   virtual void handle(const BinaryOp* stmt);
   virtual void handle(const TernaryOp* stmt);
@@ -211,7 +215,9 @@ class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase {
   virtual void handle(kir::IntPair*);
 
   // Exprs
+  virtual void handle(FullOp* stmt);
   virtual void handle(ARangeOp* stmt);
+  virtual void handle(EyeOp* stmt);
   virtual void handle(UnaryOp* stmt);
   virtual void handle(BinaryOp* stmt);
   virtual void handle(TernaryOp* stmt);
@@ -319,7 +325,9 @@ class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase {
   virtual void mutate(kir::IntPair*);
 
   // Exprs
+  virtual void mutate(FullOp*);
   virtual void mutate(ARangeOp*);
+  virtual void mutate(EyeOp*);
   virtual void mutate(UnaryOp*);
   virtual void mutate(BinaryOp*);
   virtual void mutate(TernaryOp*);
diff --git a/torch/csrc/jit/codegen/cuda/dynamic_type.h b/torch/csrc/jit/codegen/cuda/dynamic_type.h
index aba725e0ea60a..5cf9f0930929d 100644
--- a/torch/csrc/jit/codegen/cuda/dynamic_type.h
+++ b/torch/csrc/jit/codegen/cuda/dynamic_type.h
@@ -296,6 +296,14 @@ inline IntOrDouble min(const IntOrDouble& a, const IntOrDouble& b) {
   return (a < b ? a : b).cast<double>();
 }
 
+inline IntOrDouble abs(const IntOrDouble& a) {
+  if (a.is_int()) {
+    return IntOrDouble(std::abs(a.as<int64_t>()));
+  } else {
+    return IntOrDouble(std::abs(a.as<double>()));
+  }
+}
+
 } // namespace IntOrDouble_functions
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp b/torch/csrc/jit/codegen/cuda/evaluator_common.cpp
index bab8586247bfd..ae280b4ac44c8 100644
--- a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp
+++ b/torch/csrc/jit/codegen/cuda/evaluator_common.cpp
@@ -196,7 +196,13 @@ template <typename IRContext>
 void PrecomputedValuesBase<IRContext>::validate() {
   FUSER_PERF_SCOPE("PrecomputedValuess::Validate");
   for (auto it : binding_log_) {
-    TORCH_INTERNAL_ASSERT(values_[it.first] == it.second);
+    TORCH_INTERNAL_ASSERT(
+        values_[it.first] == it.second,
+        "Precomputed values failed to validate.",
+        "\nSomething unexpected changed between the compilation and execution.\n",
+        values_[it.first],
+        " != ",
+        it.second);
   }
   has_valid_values_ = true;
 }
@@ -295,6 +301,7 @@ void NaiveValueMachine<IRContext>::runInstruction(int index) {
 
 template <typename IRContext>
 void NaiveValueMachine<IRContext>::runUnaryOp(int index) {
+  using namespace IntOrDouble_functions;
   int src_index = src0_[index];
   bool src_defined = precomputed_values_.defined_[src_index];
   bool src_is_const = precomputed_values_.is_constant_[src_index];
@@ -323,6 +330,9 @@ void NaiveValueMachine<IRContext>::runUnaryOp(int index) {
         TORCH_INTERNAL_ASSERT(false, "dtype not supported in evaluator");
       }
       break;
+    case UnaryOpType::Abs:
+      dest = abs(src);
+      break;
     default:
       TORCH_CHECK(!"Unexpected operator type ", uop_type_[index]);
   }
diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
index c3f447cf706fa..25e87c91cd25f 100644
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor.cpp
@@ -9,6 +9,7 @@
 #include <torch/csrc/jit/codegen/cuda/ir_utils.h>
 #include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
 #include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <torch/csrc/jit/codegen/cuda/lower_bank_conflict.h>
 #include <torch/csrc/jit/codegen/cuda/utils.h>
 
 #include <ATen/core/LegacyTypeDispatch.h>
@@ -20,6 +21,7 @@
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/irange.h>
 
+#include <cmath>
 #include <fstream>
 
 namespace torch {
@@ -29,6 +31,16 @@ namespace cuda {
 
 int FusionExecutor::fusion_id_counter_ = 0; // NOLINT
 
+bool fill_allocation_with_nan_ = false;
+
+bool shouldFillAllocationWithNan() {
+  return fill_allocation_with_nan_;
+}
+
+void setFillAllocationWithNan(bool value) {
+  fill_allocation_with_nan_ = value;
+}
+
 namespace {
 
 static const char* defineIndexMode(KernelIndexMode index_mode) {
@@ -245,6 +257,27 @@ void FusionExecutor::compileFusion(
     kernel->print();
   }
 
+  if (isDebugDumpEnabled(DebugDumpOption::BankConflictInfo)) {
+    auto bank_conflict_info = getBankConflictInfo(kernel);
+    if (bank_conflict_info.empty()) {
+      std::cout << "===== No bank confliction =====" << std::endl;
+    } else {
+      std::cout << "======= Bank confliction =======" << std::endl;
+      for (auto info : bank_conflict_info) {
+        std::cout << "Expr: " << info.first->toString() << std::endl;
+        auto conflict = info.second;
+        if (conflict.first > 1) {
+          std::cout << "input conflict: " << conflict.first << " way, ";
+        }
+        if (conflict.second > 1) {
+          std::cout << "output conflict: " << conflict.second << " way";
+        }
+        std::cout << std::endl;
+      }
+      std::cout << "================================" << std::endl;
+    }
+  }
+
   kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
   const auto structured_code = getStructuredCode(kernel_code_);
 
@@ -314,6 +347,42 @@ void FusionExecutor::compileFusion(
 
 namespace {
 
+void fillTensorWithNan(at::Tensor& t) {
+  switch (t.scalar_type()) {
+    case at::ScalarType::Byte:
+      t.fill_(0xFF);
+      break;
+    case at::ScalarType::Char:
+      t.fill_(0x7F);
+      break;
+    case at::ScalarType::Short:
+      t.fill_(0x7FFF);
+      break;
+    case at::ScalarType::Int:
+      t.fill_(0x7FFFFFFF);
+      break;
+    case at::ScalarType::Long:
+      t.fill_(0x7FFFFFFFFFFFFFFFL);
+      break;
+    case at::ScalarType::Bool:
+      t.fill_(true);
+      break;
+    case at::ScalarType::Half:
+    case at::ScalarType::Float:
+    case at::ScalarType::Double:
+    case at::ScalarType::BFloat16:
+      t.fill_(std::nan(""));
+      break;
+    case at::ScalarType::ComplexHalf:
+    case at::ScalarType::ComplexFloat:
+    case at::ScalarType::ComplexDouble:
+      t.fill_(c10::complex<double>(std::nan(""), std::nan("")));
+      break;
+    default:
+      TORCH_INTERNAL_ASSERT(false, "Unknown dtype");
+  }
+}
+
 at::Tensor inferAndAlloc(
     const TensorView* tv,
     const std::vector<Val*>& sizes,
@@ -383,6 +452,9 @@ at::Tensor inferAndAlloc(
     // Non Variable type guard for empty_cuda call
     at::AutoDispatchBelowADInplaceOrView non_variable_type_mode;
     auto empty = at::empty(isizes, tensor_options);
+    if (shouldFillAllocationWithNan()) {
+      fillTensorWithNan(empty);
+    }
     if (expanded_dim) {
       return empty.expand(expanded_sizes);
     }
@@ -700,29 +772,24 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
 }
 
 std::vector<at::Tensor> FusionExecutor::allocOutputs(
+    const KernelArgumentHolder& args,
     kir::ExpressionEvaluator& expr_eval,
     const std::unordered_set<int>& alias_indices) {
   FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs");
   const auto kernel = lowered_->kernel();
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::vector<at::Tensor> outputs;
+  TORCH_INTERNAL_ASSERT(
+      args.size() == kernel->inputs().size(),
+      "kernel arguments length does not match runtime arguments.");
   for (const auto out_i : c10::irange(kernel->outputs().size())) {
-    // TODO: FIX this short-cut where we trivially forward inputs to outputs
     if (kernel->outputs()[out_i]->isFusionInput()) {
-      TORCH_INTERNAL_ASSERT(false, "trivial input forwarding NOT IMPLEMENTED");
-      // for (auto inp_i : c10::irange(kernel->inputs().size())) {
-      //   if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
-      //     TORCH_INTERNAL_ASSERT(
-      //         inp_i < inputs.size(),
-      //         "Issue with an input showing up as output, couldn't find
-      //         input.");
-      //     TORCH_INTERNAL_ASSERT(
-      //         inputs[inp_i].isTensor(),
-      //         "Cannot register a scalar as an output in a fusion.");
-      //     outputs.push_back(inputs[inp_i].toTensor());
-      //     break;
-      //   }
-      // }
+      // pushing empty tensor for trivial forwarding. Since we handle this in
+      // integration, see step 1 - note [trivial forwarding]
+      c10::Device device(c10::DeviceType::CUDA, args.getDeviceIndex());
+      const auto tensor_options =
+          at::TensorOptions().dtype(at::kFloat).device(device);
+      outputs.emplace_back(at::empty({0}, tensor_options));
     } else {
       TORCH_INTERNAL_ASSERT(
           kernel->outputs()[out_i]->isA<TensorView>(),
@@ -762,7 +829,8 @@ KernelArgumentHolder FusionExecutor::evaluateOutputSizes(
   meta_options.device = c10::Device(DeviceType::Meta, 0);
 
   for (const auto out_i : c10::irange(kernel->outputs().size())) {
-    // If the output is just trivially the input, just "copy" it over.
+    // If the output is just trivially the input, just "copy" it over, see note
+    // [trivial forwarding]
     if (kernel->outputs()[out_i]->isFusionInput()) {
       for (auto inp_i : c10::irange(kernel->inputs().size())) {
         if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
@@ -884,6 +952,8 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
       !args.getCacheId().has_value() || outputs.empty(),
       "short cut input cache is not compatible with pre-allocated output");
 
+  size_t num_inputs = args.size();
+
   if (isDebugDumpEnabled(DebugDumpOption::FusionArgs)) {
     std::cout << "Arguments for fusion" << fusion_id_ << ":" << std::endl
               << "Inputs:" << std::endl;
@@ -930,6 +1000,9 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
               c10::nullopt,
               options_.device,
               c10::nullopt));
+          if (shouldFillAllocationWithNan()) {
+            fillTensorWithNan(allocated_outputs.back());
+          }
         }
         // Note: aliased output is not returned as output. But we still need it
         // for kernel execution, so would need to push them to args
@@ -970,6 +1043,9 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
                 c10::nullopt,
                 options_.device,
                 c10::nullopt));
+            if (shouldFillAllocationWithNan()) {
+              fillTensorWithNan(global_buffers.buffers.back());
+            }
             global_buffers.zero_init.push_back(false);
           }
         }
@@ -1075,7 +1151,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
 
       auto& output_alias_indices = output_alias_indices_entry.get();
 
-      allocated_outputs = allocOutputs(expr_eval, output_alias_indices);
+      allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);
 
       for (const auto& entry : alias_indices) {
         auto aliased_output_index = entry.first;
@@ -1243,7 +1319,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
 
     bytes_processed_ = 0;
     // Figure how many bytes are inputs, outputs, and temporary buffers
-    for (auto i : c10::irange(args.size())) {
+    for (auto i : c10::irange(num_inputs)) {
       if (auto tensor_arg_abstract =
               dynamic_cast<const TensorArgAbstract*>(args[i])) {
         bytes_processed_ += tensor_arg_abstract->numel() *
diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h
index 1d6ff4487b8f6..9d4775b37ca95 100644
--- a/torch/csrc/jit/codegen/cuda/executor.h
+++ b/torch/csrc/jit/codegen/cuda/executor.h
@@ -16,6 +16,9 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
+TORCH_CUDA_CU_API bool shouldFillAllocationWithNan();
+TORCH_CUDA_CU_API void setFillAllocationWithNan(bool value);
+
 // TODO: Should this actually be in launch params?
 struct TORCH_CUDA_CU_API CompileOptions {
   c10::Device device = c10::Device(c10::DeviceType::CUDA, 0);
@@ -217,6 +220,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
   // skip allocating real storage for those, but still maintain its spot to
   // maintain the indexing from output aliases to inputs
   std::vector<at::Tensor> allocOutputs(
+      const KernelArgumentHolder& args,
       kir::ExpressionEvaluator& expr_eval,
       const std::unordered_set<int>& alias_indices = {});
 
diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
index dd5334542fc90..6da05cbf4dcba 100644
--- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -845,11 +845,6 @@ void bindInputForExprEvaluation(
 
         const auto value =
             root_domain[dim]->hasExpandedExtent() ? 1 : tensor_arg_size;
-        if (value == 0 && cg_tensor->uses().empty()) {
-          // If there's no uses, ignore there's a size-0 dimension.
-          continue;
-        }
-        TORCH_INTERNAL_ASSERT(value != 0, "Cannot handle size-0 dimensions");
         bool should_bind = true;
         if (check_consistency) {
           const auto prev_value = expr_eval.evaluate(extent);
@@ -1023,6 +1018,12 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
   // compile to sass is not allowed prior to CUDA 11.1
   compile_to_sass = false;
 #endif
+
+  if (isOptionDisabled(DisableOption::CompileToSass)) {
+    // Allows manually disabling compilation to sass
+    //  so the intermediate ptx could be checked.
+    compile_to_sass = false;
+  }
   // CUDA 11.1 allows going directly to SASS (sm_) instead of PTX (compute_)
   // which gives better backwards compatibility to work on older driver,
   // (since older driver doesn't necessrily recognize PTX emitted by new
diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp b/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp
index 7bda8682189ee..6e1c628111113 100644
--- a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp
+++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp
@@ -54,7 +54,17 @@ void ExpressionEvaluator::bind(Val* value, const IntOrDouble& concrete_value) {
   TORCH_CHECK(
       value->definition() == nullptr,
       "Tried to bind to a value that is computed in the fusion IR");
-  known_values_[value] = concrete_value;
+  if (value->isA<NamedScalar>()) {
+    known_named_scalars_[value->as<NamedScalar>()->name()] = concrete_value;
+  } else {
+    known_values_[value] = concrete_value;
+  }
+}
+
+void ExpressionEvaluator::bind(
+    const std::string& name,
+    const IntOrDouble& concrete_value) {
+  known_named_scalars_[name] = concrete_value;
 }
 
 c10::optional<IntOrDouble> ExpressionEvaluator::evaluate(Val* value) {
@@ -88,7 +98,7 @@ void ExpressionEvaluator::print() const {
 c10::optional<IntOrDouble> ExpressionEvaluator::getValue(Val* value) {
   TORCH_INTERNAL_ASSERT(
       value->isAnInt() || value->isADouble(),
-      "Expression Evaluation does not support values other than integers at this time.");
+      "Expression Evaluation does not support values other than integers/doubles at this time.");
 
   if (value->getValType().value() == ValType::Scalar) {
     if (value->isAnInt() && value->as<Int>()->value().has_value()) {
@@ -99,12 +109,20 @@ c10::optional<IntOrDouble> ExpressionEvaluator::getValue(Val* value) {
     }
   }
 
-  const auto it = known_values_.find(value);
-  return it != known_values_.end() ? c10::optional<IntOrDouble>(it->second)
-                                   : c10::nullopt;
+  if (value->isA<NamedScalar>()) {
+    const auto it = known_named_scalars_.find(value->as<NamedScalar>()->name());
+    return it != known_named_scalars_.end()
+        ? c10::optional<IntOrDouble>(it->second)
+        : c10::nullopt;
+  } else {
+    const auto it = known_values_.find(value);
+    return it != known_values_.end() ? c10::optional<IntOrDouble>(it->second)
+                                     : c10::nullopt;
+  }
 }
 
 void ExpressionEvaluator::handle(UnaryOp* uop) {
+  using namespace IntOrDouble_functions;
   const auto in = evaluate(uop->in());
   if (in.has_value()) {
     switch (uop->getUnaryOpType()) {
@@ -123,6 +141,9 @@ void ExpressionEvaluator::handle(UnaryOp* uop) {
           TORCH_INTERNAL_ASSERT(false, "dtype not supported in evaluator");
         }
         break;
+      case UnaryOpType::Abs:
+        known_values_[uop->out()] = abs(*in);
+        break;
       default:
         TORCH_CHECK(
             !"Unexpected operator type ",
diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.h b/torch/csrc/jit/codegen/cuda/expr_evaluator.h
index 8d906ff58e43d..4329f9604304b 100644
--- a/torch/csrc/jit/codegen/cuda/expr_evaluator.h
+++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.h
@@ -7,6 +7,7 @@
 
 #include <c10/util/Optional.h>
 
+#include <string>
 #include <unordered_map>
 
 namespace torch {
@@ -30,6 +31,9 @@ class TORCH_CUDA_CU_API ExpressionEvaluator : private OptOutDispatch {
   //! Bind a concrete value to an IR variable
   void bind(Val* value, const IntOrDouble& concrete_value);
 
+  //! Bind a concrete value to a named scalar
+  void bind(const std::string& name, const IntOrDouble& concrete_value);
+
   //! Try to evaluate a Fusion IR value
   c10::optional<IntOrDouble> evaluate(Val* value);
 
@@ -49,9 +53,11 @@ class TORCH_CUDA_CU_API ExpressionEvaluator : private OptOutDispatch {
 
   void handle(UnaryOp*) final;
   void handle(BinaryOp*) final;
+  // TODO: handle swizzle
 
  private:
   std::unordered_map<const Val*, IntOrDouble> known_values_;
+  std::unordered_map<std::string, IntOrDouble> known_named_scalars_;
   Fusion* fusion_ = nullptr;
   FusionPrecomputedValues* evaluator_precomputed_values_ = nullptr;
 };
diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp
index 04c367c667275..e4f24f0473a19 100644
--- a/torch/csrc/jit/codegen/cuda/fusion.cpp
+++ b/torch/csrc/jit/codegen/cuda/fusion.cpp
@@ -11,6 +11,7 @@
 #include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
 #include <torch/csrc/jit/codegen/cuda/kernel.h>
 #include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <torch/csrc/jit/codegen/cuda/lower_bank_conflict.h>
 
 namespace torch {
 namespace jit {
@@ -339,6 +340,20 @@ void Fusion::printKernel(DataType index_type) {
   std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel());
 }
 
+std::unordered_map<std::string, std::pair<int, int>> Fusion::bankConflictInfo(
+    DataType index_type) {
+  GpuLower lower(this, index_type);
+  auto kernel = lower.kernel();
+  auto info = getBankConflictInfo(kernel);
+  // The container of exprs goes out of scope, so we return a map of string here
+  std::unordered_map<std::string, std::pair<int, int>> result;
+  result.reserve(info.size());
+  for (auto i : info) {
+    result[i.first->toString()] = i.second;
+  }
+  return result;
+}
+
 void Fusion::printMath(bool from_outputs_only) {
   FUSER_PERF_SCOPE("Fusion::printMath");
 
diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h
index e726d793be756..2c0c59fae2b9b 100644
--- a/torch/csrc/jit/codegen/cuda/fusion.h
+++ b/torch/csrc/jit/codegen/cuda/fusion.h
@@ -136,6 +136,10 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer {
   //! Lower the fusion and print a kernel
   void printKernel(DataType index_type = DataType::Int);
 
+  //! Lower the fusion and evaluate bank conflict info
+  std::unordered_map<std::string, std::pair<int, int>> bankConflictInfo(
+      DataType index_type = DataType::Int);
+
   //! Return a list of topologically sorted expressions. This only includes
   //! exprs required to genereate registered outputs.
   std::vector<Expr*> exprs();
diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp
index f993705c9bdc2..c0bf81dc688bf 100644
--- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp
+++ b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp
@@ -3190,7 +3190,7 @@ class ForceHalfAnnotation : public IterVisitor {
                val->getDataType().value() == DataType::BFloat16);
         });
 
-    annotation.traverseFrom(fusion, fp16_outputs);
+    annotation.traverseTo(fusion, fp16_outputs);
     return annotation.force_fp16_tv_set_;
   }
 
diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
index 4135c7babeef8..c2427f9386278 100644
--- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
+++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp
@@ -2172,7 +2172,10 @@ void decomposeLinearOps(Block* block) {
 void replaceAliasOpsWithCopy(std::shared_ptr<Graph>& graph, Block* block) {
   static std::unordered_map<Symbol, Symbol> alias_to_copy_mapping(
       {{aten::expand, prim::expand_copy},
-       {aten::expand_as, prim::expand_as_copy}});
+       {aten::expand_as, prim::expand_as_copy},
+       {aten::permute, prim::permute_copy},
+       {aten::transpose, prim::transpose_copy},
+       {aten::t, prim::t_copy}});
   // TODO: revert disabled aten::view
   //    ({{aten::view, prim::view_copy},
   //     {aten::reshape, prim::reshape_copy},
@@ -2224,7 +2227,10 @@ void replaceAliasOpsWithCopy(std::shared_ptr<Graph>& graph, Block* block) {
 void revertAliasCopyOps(std::shared_ptr<Graph>& graph, Block* block) {
   static std::unordered_map<Symbol, Symbol> copy_to_alias_mapping(
       {{prim::expand_copy, aten::expand},
-       {prim::expand_as_copy, aten::expand_as}});
+       {prim::expand_as_copy, aten::expand_as},
+       {prim::permute_copy, aten::permute},
+       {prim::transpose_copy, aten::transpose},
+       {prim::t_copy, aten::t}});
   // TODO: revert disabled aten::view
   //    ({{prim::view_copy, aten::view},
   //     {prim::flatten_copy, aten::flatten},
diff --git a/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp b/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp
index 5931eb3427aa9..d907a0665e9f6 100644
--- a/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp
+++ b/torch/csrc/jit/codegen/cuda/grouped_reduction.cpp
@@ -38,7 +38,7 @@ bool hasMatchingTransformations(TensorView* ref, TensorView* other) {
 }
 
 // Validate grouping of reductions and return a new max producer position
-unsigned int validateReductionGrouping(
+void validateReductionGrouping(
     const std::vector<Val*>& inputs,
     const std::vector<Val*>& outputs) {
   TORCH_INTERNAL_ASSERT(inputs.size() == outputs.size());
@@ -57,7 +57,6 @@ unsigned int validateReductionGrouping(
   const auto num_root_dims = ref_domain.size();
   const auto num_dims = ref_tv->nDims();
   const auto ref_ca_pos = ref_tv->getComputeAtPosition();
-  auto max_producer_pos = ref_tv->getMaxProducerPosition();
   for (const auto i : c10::irange(inputs.size())) {
     auto output_tv = outputs.at(i)->as<TensorView>();
     const auto& output_domain = output_tv->getRootDomain();
@@ -136,9 +135,6 @@ unsigned int validateReductionGrouping(
         ref_tv->toString(),
         ". Mismatched tensor: ",
         output_tv->toString());
-
-    max_producer_pos =
-        std::max(max_producer_pos, output_tv->getMaxProducerPosition());
   }
 
   // Must not have any data dependency from outputs to inputs
@@ -152,8 +148,6 @@ unsigned int validateReductionGrouping(
     }
     TORCH_INTERNAL_ASSERT(all_dep_vals.empty(), ss.str());
   }
-
-  return max_producer_pos;
 }
 
 } // namespace
@@ -194,14 +188,14 @@ void groupReductions(const std::vector<TensorView*>& reduction_outputs) {
     inputs.at(i) = rop->in();
   }
 
-  auto max_producer_pos = validateReductionGrouping(inputs, outputs);
-
-  for (auto output : ir_utils::filterByType<TensorView>(outputs)) {
-    output->setMaxProducer(max_producer_pos);
-  }
+  validateReductionGrouping(inputs, outputs);
 
   IrBuilder::create<GroupedReductionOp>(
       container, op_types, init_vals, outputs, inputs);
+
+  for (auto output : ir_utils::filterByType<TensorView>(outputs)) {
+    output->updateMaxProducerPosition();
+  }
 }
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp
index 5ad56bda15f21..9028f93e9a20f 100644
--- a/torch/csrc/jit/codegen/cuda/index_compute.cpp
+++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp
@@ -51,8 +51,8 @@ int getProducerHaloOffset(
   IterDomain* consumer_id = it->second;
 
   const auto& halo_map = GpuLower::current()->haloInfo();
-  const auto p_pad = halo_map.getRootAxisInfo(producer_id).width(0);
-  const auto c_pad = halo_map.getRootAxisInfo(consumer_id).width(0);
+  const auto p_pad = halo_map->getRootAxisInfo(producer_id).width(0);
+  const auto c_pad = halo_map->getRootAxisInfo(consumer_id).width(0);
 
   auto offset = p_pad - c_pad;
 
@@ -178,7 +178,8 @@ Val* getConcreteProducerOffsetWithGather(
   Val* window_idx = nullptr;
 
   if (use_concrete_map) {
-    window_idx = index_map.at(ir_utils::caMapExactConcreteId(window_id));
+    window_idx = index_map.at(GpuLower::current()->caMap()->getConcreteMappedID(
+        window_id, IdMappingMode::EXACT));
   } else {
     window_idx = index_map.at(window_id);
   }
@@ -440,9 +441,8 @@ void IndexCompute::handle(Merge* merge) {
 
   // When the reference has halo extent for inner_id, that extent needs to
   // be used to un-merge
-  if (reference_halo_extent_map_.find(inner_id) !=
-      reference_halo_extent_map_.end()) {
-    inner_extent = reference_halo_extent_map_[inner_id];
+  if (halo_extent_map_.find(inner_id) != halo_extent_map_.end()) {
+    inner_extent = halo_extent_map_[inner_id];
   }
 
   const auto outer_extent = getExtent(outer_id);
@@ -587,20 +587,16 @@ IndexCompute::IndexCompute(
     std::unordered_set<IterDomain*> zero_domains,
     std::unordered_set<IterDomain*> zero_merged_in,
     std::unordered_set<IterDomain*> preferred_paths,
-    std::unordered_map<IterDomain*, Val*> reference_halo_extent_map)
+    std::unordered_map<IterDomain*, Val*> halo_extent_map)
     : IndexCompute(
           _td,
           std::move(initial_index_map),
           std::move(extent_map),
           std::move(zero_domains),
           std::move(zero_merged_in),
-          ContigIDs(
-              _td->domain(),
-              _td->getMaybeRFactorDomain(),
-              std::vector<bool>(_td->getMaybeRFactorDomain().size(), false),
-              {}),
+          ContigIDs::getNonContigIDs(),
           std::move(preferred_paths),
-          std::move(reference_halo_extent_map)) {}
+          std::move(halo_extent_map)) {}
 
 IndexCompute::IndexCompute(
     const TensorDomain* _td,
@@ -610,14 +606,14 @@ IndexCompute::IndexCompute(
     std::unordered_set<IterDomain*> zero_merged_in,
     const ContigIDs& contig_finder,
     std::unordered_set<IterDomain*> preferred_paths,
-    std::unordered_map<IterDomain*, Val*> reference_halo_extent_map)
+    std::unordered_map<IterDomain*, Val*> halo_extent_map)
     : td_(_td),
       index_map_(std::move(initial_index_map)),
       extent_map_(std::move(extent_map)),
       zero_domains_(std::move(zero_domains)),
       zero_merged_in_(std::move(zero_merged_in)),
       preferred_paths_(std::move(preferred_paths)),
-      reference_halo_extent_map_(std::move(reference_halo_extent_map)) {
+      halo_extent_map_(std::move(halo_extent_map)) {
   FUSER_PERF_SCOPE("GpuLower::Lower::IndexCompute::IndexCompute");
 
   // Make sure we recompute any indices we can that map to a contiguous access
@@ -640,11 +636,11 @@ IndexCompute::IndexCompute(
     std::unordered_map<IterDomain*, Val*> initial_index_map,
     std::unordered_set<IterDomain*> zero_domains,
     std::unordered_set<IterDomain*> preferred_paths,
-    std::unordered_map<IterDomain*, Val*> reference_halo_extent_map)
+    std::unordered_map<IterDomain*, Val*> halo_extent_map)
     : index_map_(std::move(initial_index_map)),
       zero_domains_(std::move(zero_domains)),
       preferred_paths_(std::move(preferred_paths)),
-      reference_halo_extent_map_(std::move(reference_halo_extent_map)) {
+      halo_extent_map_(std::move(halo_extent_map)) {
   FUSER_PERF_SCOPE("GpuLower::Lower::IndexCompute::IndexCompute");
   concrete_id_pass_ = true;
   swizzle_mode_ = SwizzleMode::Loop;
@@ -703,7 +699,9 @@ void IndexCompute::collectIndexIntoPermissiveMap(
     auto id_outputs = ir_utils::filterByType<IterDomain>(expr->outputs());
     if (std::all_of(
             id_outputs.begin(), id_outputs.end(), [this](IterDomain* id) {
-              return index_map_.count(ir_utils::caMapExactConcreteId(id));
+              return index_map_.count(
+                  GpuLower::current()->caMap()->getConcreteMappedID(
+                      id, IdMappingMode::EXACT));
             })) {
       // Visit this expression:
       // LoopIndexingAnalysis::traverseFromDomainVals made sure that each
@@ -715,7 +713,9 @@ void IndexCompute::collectIndexIntoPermissiveMap(
       for (auto id : id_inputs) {
         // Collect backward pass results from this expression if they are
         //  made available in by this expression.
-        auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id));
+        auto idx_it =
+            index_map_.find(GpuLower::current()->caMap()->getConcreteMappedID(
+                id, IdMappingMode::EXACT));
 
         if (idx_it != index_map_.end()) {
           permissive_index_map_
@@ -730,7 +730,8 @@ void IndexCompute::collectIndexIntoPermissiveMap(
 void IndexCompute::updateIndexMapFromPermissiveMap(const Expr* id_expr) {
   auto id_outputs = ir_utils::filterByType<IterDomain>(id_expr->outputs());
   for (auto id : id_outputs) {
-    auto concrete_id = ir_utils::caMapExactConcreteId(id);
+    auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID(
+        id, IdMappingMode::EXACT);
     // Only try to copy index val from permissive map when
     //  the index is missing.
     if (!index_map_.count(concrete_id)) {
@@ -750,7 +751,7 @@ void IndexCompute::run() {
   const std::vector<Val*> domain_vals(
       td_->domain().begin(), td_->domain().end());
 
-  traverseFrom(td_->fusion(), domain_vals, false);
+  traverseTo(td_->fusion(), domain_vals, false);
 }
 
 IterDomain* IndexCompute::maybeGetExactMapConcreteID(IterDomain* id) {
@@ -784,15 +785,14 @@ bool IndexCompute::isZero(IterDomain* id) const {
 IndexCompute IndexCompute::updateIndexCompute(
     const TensorDomain* new_td,
     const std::unordered_map<IterDomain*, IterDomain*>& id_map,
-    const ContigIDs& contig_finder,
-    const std::unordered_map<IterDomain*, Val*>& reference_halo_extent_map)
-    const {
+    const ContigIDs& contig_finder) const {
   FUSER_PERF_SCOPE("GpuLower::Lower::updateIndexCompute");
 
   std::unordered_map<IterDomain*, Val*> updated_index_map;
   std::unordered_map<IterDomain*, Val*> updated_extent_map;
   std::unordered_set<IterDomain*> updated_zero_domains;
   std::unordered_set<IterDomain*> updated_zero_merged_in;
+  std::unordered_map<IterDomain*, Val*> updated_halo_extent_map;
 
   for (auto id_entry : id_map) {
     IterDomain* prev_id = id_entry.first;
@@ -811,6 +811,11 @@ IndexCompute IndexCompute::updateIndexCompute(
     if (zero_merged_in_.find(prev_id) != zero_merged_in_.end()) {
       updated_zero_merged_in.emplace(new_id);
     }
+
+    auto halo_extent_it = halo_extent_map_.find(prev_id);
+    if (halo_extent_it != halo_extent_map_.end()) {
+      updated_halo_extent_map[new_id] = halo_extent_it->second;
+    }
   }
 
   IndexCompute updated_index_compute(
@@ -821,25 +826,7 @@ IndexCompute IndexCompute::updateIndexCompute(
       updated_zero_merged_in,
       contig_finder,
       {},
-      reference_halo_extent_map);
-
-  if (concrete_id_pass_) {
-    // This should be the same behavior as with a reference tensor
-    //   created, since originally halo was pulled through exact
-    //   ca mapping and in the concrete_id_pass case, the id_map
-    //   also represents exact ca mapping.
-    // TODO: might need to re-visit pathological cases when we may
-    //  need to traverse and propagate halo info again in here.
-    for (auto id_entry : id_map) {
-      IterDomain* prev_id = id_entry.first;
-      IterDomain* new_id = id_entry.second;
-      auto halo_extent_it = reference_halo_extent_map_.find(prev_id);
-      if (halo_extent_it != reference_halo_extent_map_.end()) {
-        updated_index_compute.reference_halo_extent_map_[new_id] =
-            halo_extent_it->second;
-      }
-    }
-  }
+      updated_halo_extent_map);
 
   updated_index_compute.run();
 
@@ -860,7 +847,7 @@ class UpdateLeafIndices : public IterVisitor {
     const std::vector<Val*> domain_vals(
         td_->domain().begin(), td_->domain().end());
 
-    traverseFrom(td_->fusion(), domain_vals, false);
+    traverseTo(td_->fusion(), domain_vals, false);
   }
 
   const std::unordered_map<IterDomain*, Val*>& indexMap() const {
@@ -985,7 +972,7 @@ Val* getHaloExtentOfRootAxis(IterDomain* id, Val* normal_extent = nullptr) {
     normal_extent = id->extent();
   }
 
-  const auto& halo = GpuLower::current()->haloInfo().getRootAxisInfo(id);
+  const auto& halo = GpuLower::current()->haloInfo()->getRootAxisInfo(id);
   if (halo.hasHalo()) {
     auto halo_extent = SimplifyingIrBuilder::addExpr(
         normal_extent, SimplifyingIrBuilder::create<Int>(halo.width()));
@@ -1506,7 +1493,8 @@ std::vector<Val*> Index::getGlobalProducerStridedIndices(
   // effort which means some domains may be producer's original domains.
   std::vector<std::pair<IterDomain*, ParallelType>> p_id_backup;
   for (auto entry : c2p_map) {
-    auto ref_id = ir_utils::caMapExactConcreteId(entry.first);
+    auto ref_id = GpuLower::current()->caMap()->getConcreteMappedID(
+        entry.first, IdMappingMode::EXACT);
     auto p_id = entry.second;
     if (ref_id->getParallelType() == ParallelType::Vectorize) {
       p_id_backup.emplace_back(std::make_pair(p_id, p_id->getParallelType()));
@@ -1745,7 +1733,8 @@ std::vector<Val*> Index::getNonGlobalProducerStridedIndices(
   // effort which means some domains may be the originals.
   std::vector<std::pair<IterDomain*, ParallelType>> p_id_backup;
   for (auto entry : c2p_index_map) {
-    auto ref_id = ir_utils::caMapExactConcreteId(entry.first);
+    auto ref_id = GpuLower::current()->caMap()->getConcreteMappedID(
+        entry.first, IdMappingMode::EXACT);
     auto p_id = entry.second;
     if (ref_id->getParallelType() == ParallelType::Vectorize) {
       p_id_backup.emplace_back(std::make_pair(p_id, p_id->getParallelType()));
@@ -1937,52 +1926,27 @@ std::vector<Val*> Index::getNonGlobalProducerStridedIndices(
   return strided_inds;
 }
 
-std::vector<Val*> Index::getLinearIndex(
+std::vector<Val*> Index::getLinearLogicalIndex(
     TensorView* consumer_tv,
     const std::vector<kir::ForLoop*>& loops) {
-  // Use domain guard to ignore the contiguity of
-  //  consumer tv.
-  TensorDomain* consumer_tv_no_contiguity_domain = nullptr;
-  auto contiguity_vector =
-      std::vector<bool>(consumer_tv->getMaybeRFactorDomain().size(), true);
-  if (consumer_tv->hasRFactor()) {
-    consumer_tv_no_contiguity_domain = IrBuilder::create<TensorDomain>(
-        consumer_tv->getRootDomain(),
-        consumer_tv->getRFactorDomain(),
-        consumer_tv->domain()->domain(),
-        contiguity_vector);
-  } else {
-    consumer_tv_no_contiguity_domain = IrBuilder::create<TensorDomain>(
-        consumer_tv->getRootDomain(),
-        consumer_tv->domain()->domain(),
-        contiguity_vector);
-  }
-
-  ir_utils::TVDomainGuard domain_guard(
-      consumer_tv, consumer_tv_no_contiguity_domain);
-
-  // TODO:
-  //  More optimization on the underlying tensor layout
-  //   will be done in a follow up.
+  auto guard = ir_utils::overrideContiguityGuard(consumer_tv, true);
   return getGlobalConsumerStridedIndices(consumer_tv, loops);
 }
 
-std::vector<Val*> Index::getGlobalConsumerStridedIndices(
-    const TensorView* consumer_tv,
+std::vector<Val*> Index::getPerDimLogicalIndex(
+    TensorView* consumer_tv,
     const std::vector<kir::ForLoop*>& loops) {
-  FUSER_PERF_SCOPE("GpuLower::Lower::getGlobalConsumerIndex");
-
-  auto gpu_lower = GpuLower::current();
-
-  auto index_from_id_graph = getTensorIndexFromIdGraph(loops, consumer_tv);
-
-  auto consumer_indexing = index_from_id_graph.index;
+  auto guard = ir_utils::overrideContiguityGuard(consumer_tv, false);
+  IndexFromIdGraph index_from_id_graph =
+      getTensorIndexFromIdGraph(loops, consumer_tv);
+  return getRootIndices(consumer_tv, loops, index_from_id_graph);
+}
 
+std::vector<Val*> Index::getStrides(const TensorView* tv) {
   // Indices should now be mapped onto IterDomains in consumer, so just grab
   // and use them.
-  auto root_dom = consumer_tv->getMaybeRFactorDomain();
+  auto root_dom = tv->getMaybeRFactorDomain();
 
-  // TODO: Abstract stride logic to reuse with producer indexing
   std::vector<Val*> strides(
       root_dom.size(), GpuLower::current()->kernel()->oneVal());
   {
@@ -1993,14 +1957,13 @@ std::vector<Val*> Index::getGlobalConsumerStridedIndices(
         continue;
       }
       std::stringstream ss;
-      ss << "T" << consumer_tv->name() << ".stride[" << stride_i++ << "]";
+      ss << "T" << tv->name() << ".stride[" << stride_i++ << "]";
       strides[i] =
           SimplifyingIrBuilder::create<NamedScalar>(ss.str(), DataType::Int);
     }
   }
 
-  TORCH_INTERNAL_ASSERT(
-      root_dom.size() == consumer_tv->domain()->contiguity().size());
+  TORCH_INTERNAL_ASSERT(root_dom.size() == tv->domain()->contiguity().size());
   Val* cur_contig_stride = GpuLower::current()->kernel()->oneVal();
   for (const auto i : c10::irange(root_dom.size())) {
     auto dim = root_dom.size() - i - 1;
@@ -2008,24 +1971,7 @@ std::vector<Val*> Index::getGlobalConsumerStridedIndices(
       continue;
     }
 
-    Val* root_ind = nullptr;
-    if (consumer_indexing.indexMap().find(root_dom[dim]) !=
-        consumer_indexing.indexMap().end()) {
-      root_ind = consumer_indexing.indexMap().at(root_dom[dim]);
-    } else if (root_dom[dim]->isBroadcast()) {
-      root_ind = GpuLower::current()->kernel()->zeroVal();
-    }
-
-    TORCH_INTERNAL_ASSERT(
-        root_ind != nullptr,
-        "Couldn't find root mapping for ",
-        consumer_tv->toString(),
-        " dim: ",
-        dim,
-        " id: ",
-        root_dom[dim]->toString());
-
-    if (consumer_tv->domain()->contiguity()[dim]) {
+    if (tv->domain()->contiguity()[dim]) {
       // If contig, used the stored stride which may be the previous
       // dimensions stride * previous dimensions size
       strides[dim] = cur_contig_stride;
@@ -2041,12 +1987,18 @@ std::vector<Val*> Index::getGlobalConsumerStridedIndices(
           strides[dim], getHaloExtentOfRootAxis(root_dom[dim]));
     }
   }
+  return strides;
+}
 
-  auto vectorize_shift =
-      loops.empty() ? nullptr : loops.back()->vectorize_shift();
+std::vector<Val*> Index::getRootIndices(
+    const TensorView* tv,
+    const std::vector<kir::ForLoop*>& loops,
+    const IndexFromIdGraph& index_from_id_graph) {
+  auto gpu_lower = GpuLower::current();
+  auto root_dom = tv->getMaybeRFactorDomain();
+  auto indexing = index_from_id_graph.index;
 
-  // Global striding
-  std::vector<Val*> strided_inds(
+  std::vector<Val*> root_inds(
       root_dom.size(), GpuLower::current()->kernel()->zeroVal());
   for (const auto i : c10::irange(root_dom.size())) {
     // See a comment in indexing to root domains in getGlobalProducerIndex.
@@ -2057,22 +2009,21 @@ std::vector<Val*> Index::getGlobalConsumerStridedIndices(
     }
 
     TORCH_INTERNAL_ASSERT(
-        consumer_indexing.indexMap().find(root_dom[i]) !=
-            consumer_indexing.indexMap().end(),
+        indexing.indexMap().find(root_dom[i]) != indexing.indexMap().end(),
         "Couldn't find root mapping for ",
-        consumer_tv->toString(),
+        tv->toString(),
         " dim: ",
         i,
         " id: ",
         root_dom[i]->toString());
 
-    auto root_ind = consumer_indexing.indexMap().at(root_dom[i]);
+    auto root_ind = indexing.indexMap().at(root_dom[i]);
 
     // index hoist must be done before the adjustments for halo
     root_ind = hoistConsumerIndex(
         root_dom[i],
-        consumer_tv,
-        consumer_indexing,
+        tv,
+        indexing,
         index_from_id_graph.resolved_loop_domains,
         index_from_id_graph.initial_concrete_index_map,
         loops,
@@ -2080,12 +2031,33 @@ std::vector<Val*> Index::getGlobalConsumerStridedIndices(
 
     root_ind = SimplifyingIrBuilder::addExpr(
         root_ind, getGlobalConsumerOffsetWithPartialSplit(root_dom[i]));
+    root_inds[i] = root_ind;
+  }
+  return root_inds;
+}
 
-    if (root_ind->isZeroInt()) {
+std::vector<Val*> Index::getGlobalConsumerStridedIndices(
+    const TensorView* consumer_tv,
+    const std::vector<kir::ForLoop*>& loops) {
+  FUSER_PERF_SCOPE("GpuLower::Lower::getGlobalConsumerIndex");
+
+  auto index_from_id_graph = getTensorIndexFromIdGraph(loops, consumer_tv);
+  auto consumer_indexing = index_from_id_graph.index;
+  auto strides = getStrides(consumer_tv);
+  auto root_inds = getRootIndices(consumer_tv, loops, index_from_id_graph);
+
+  // Global striding
+  auto vectorize_shift =
+      loops.empty() ? nullptr : loops.back()->vectorize_shift();
+  std::vector<Val*> strided_inds(
+      root_inds.size(), GpuLower::current()->kernel()->zeroVal());
+  for (const auto i : c10::irange(root_inds.size())) {
+    if (root_inds[i]->isZeroInt()) {
       continue;
     } else {
-      auto strided_ind = SimplifyingIrBuilder::mulExpr(root_ind, strides[i]);
-      if (i == root_dom.size() - 1 && vectorize_shift != nullptr) {
+      auto strided_ind =
+          SimplifyingIrBuilder::mulExpr(root_inds[i], strides[i]);
+      if (i == strides.size() - 1 && vectorize_shift != nullptr) {
         strided_inds[i] =
             SimplifyingIrBuilder::addExpr(strided_ind, vectorize_shift);
       } else {
@@ -2354,103 +2326,71 @@ std::vector<PredicateDomainInfo> getPredicateContigIds(
 
   const auto& consumer_root_domain = consumer_tv->getRootDomain();
 
-  std::vector<IterDomain*> contiguous_ids = consumer_root_domain;
-
-  if (contiguous_ids.empty()) {
+  if (consumer_root_domain.empty()) {
     return std::vector<PredicateDomainInfo>();
   }
 
-  // If root IDs are partial, i.e., start is non-zero and stop is not
-  // equal to extent, predication can't be done with merged domains as
-  // start and stop information is only available with root
-  // domains. Similarly, merged domains don't have enough information
-  // about halo to do correct predication, so they must be excluded.
-  std::unordered_set<IterDomain*> excluded_ids;
+  std::unordered_map<IterDomain*, Val*> concrete_index_map;
+  for (auto entry : consumer_index_map) {
+    auto c_id = gpu_lower->caMap()->getConcreteMappedID(
+        entry.first, IdMappingMode::EXACT);
+    concrete_index_map[c_id] = entry.second;
+  }
 
-  for (auto consumer_root_id : consumer_root_domain) {
-    if (gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id).hasHalo()) {
-      excluded_ids.insert(consumer_root_id);
-      continue;
-    }
-    if (consumer_root_id->maybePartial()) {
-      excluded_ids.insert(consumer_root_id);
-      continue;
-    }
-    // When consumer_root_id is a broadcast domain, do not allow contig
-    // predication as the merged output is not mapped with the
-    // reference unless the concrete domain is also a broadcast
-    // domain.
-    if (consumer_root_id->isBroadcast() &&
-        !GpuLower::current()
-             ->caMap()
-             ->getConcreteMappedID(consumer_root_id, IdMappingMode::PERMISSIVE)
-             ->isBroadcast()) {
-      excluded_ids.insert(consumer_root_id);
+  std::vector<bool> predicate_contiguity(consumer_root_domain.size(), true);
+  std::unordered_set<IterDomain*> final_ids;
+  for (auto root_i : c10::irange(predicate_contiguity.size())) {
+    auto root_id = consumer_root_domain[root_i];
+    if (root_id->maybePartial()) {
+      final_ids.insert(root_id);
       continue;
     }
     // Shifted or gathered axes need to be predicated at the root domain
     auto shift_expr = dynamic_cast<ShiftOp*>(consumer_tv->definition());
     auto gather_expr = dynamic_cast<GatherOp*>(consumer_tv->definition());
-    if (shift_expr == nullptr && gather_expr == nullptr) {
-      continue;
-    }
-    auto consumer_root_pos = consumer_tv->domain()->rootPosOf(consumer_root_id);
-    if ((shift_expr && shift_expr->offset(consumer_root_pos) != 0) ||
-        (gather_expr && consumer_root_pos < gather_expr->windowShape().size() &&
-         gather_expr->windowShape().at(consumer_root_pos) != 1)) {
-      excluded_ids.insert(consumer_root_id);
+    if ((shift_expr && shift_expr->offset(root_i) != 0) ||
+        (gather_expr && root_i < gather_expr->windowShape().size() &&
+         gather_expr->windowShape().at(root_i) != 1)) {
+      final_ids.insert(root_id);
     }
   }
 
-  // Run through iteration domain history
-  auto exprs = StmtSort::getExprs(
-      consumer_tv->fusion(),
-      {consumer_tv->domain()->domain().begin(),
-       consumer_tv->domain()->domain().end()});
+  ContigIDs contig_finder(
+      consumer_tv->domain()->domain(),
+      consumer_root_domain,
+      predicate_contiguity,
+      final_ids,
+      concrete_index_map,
+      GpuLower::current()->divisbleSplitSet(),
+      GpuLower::current()->caMap(),
+      GpuLower::current()->haloInfo(),
+      GpuLower::current()->concretizedBroadcastDomains(),
+      {},
+      false,
+      true);
 
-  for (auto expr : exprs) {
-    // If not a merge, output is not contiguous
-    if (expr->isA<Merge>()) {
-      auto merge = expr->as<Merge>();
-      auto inner_contig_it = std::find(
-          contiguous_ids.begin(), contiguous_ids.end(), merge->inner());
-      auto outer_contig_it = std::find(
-          contiguous_ids.begin(), contiguous_ids.end(), merge->outer());
+  std::vector<PredicateDomainInfo> contig_id_infos;
+  std::unordered_set<IterDomain*> covered_roots;
 
-      if (excluded_ids.count(merge->inner()) > 0 ||
-          excluded_ids.count(merge->outer()) > 0) {
-        continue;
-      }
+  // Create entries and return them
+  for (auto root_id : consumer_root_domain) {
+    if (covered_roots.count(root_id) > 0) {
+      continue;
+    }
 
-      // Do not try to predicate the merge output domain if the output
-      // domain has not a predicate that is mapped from the reference.
-      // See FusionContigPredicate_CUDA for a concrete example.
-      if (consumer_index_map.find(merge->out()) == consumer_index_map.end()) {
-        continue;
-      }
+    auto contig_id_it = contig_finder.rootToIndexedID().find(root_id);
 
-      if (inner_contig_it != contiguous_ids.end() &&
-          outer_contig_it != contiguous_ids.end()) {
-        // If inner and outer are contiguous, out must be contiguous. Remove
-        // inner and outer, and add out.
-        contiguous_ids.erase(outer_contig_it);
-        contiguous_ids.erase(std::find(
-            contiguous_ids.begin(), contiguous_ids.end(), merge->inner()));
-        contiguous_ids.emplace_back(merge->out());
-      }
-    }
-  }
+    TORCH_INTERNAL_ASSERT(
+        contig_id_it != contig_finder.rootToIndexedID().end(),
+        "Error in predicate contiguity analysis, missing index for root ",
+        root_id->toString());
 
-  std::vector<PredicateDomainInfo> contig_id_infos;
+    auto contig_id = contig_id_it->second;
 
-  // Create entries and return them
-  for (auto contig_id : contiguous_ids) {
     // Pick inputs from the starting domains, i.e.,
     // reference_predicated_root_domain.
-    auto contig_root_vals = IterVisitor::getInputsTo(
-        {contig_id},
-        {consumer_root_domain.begin(), consumer_root_domain.end()});
-    auto contig_root_ids = ir_utils::filterByType<IterDomain>(contig_root_vals);
+    auto contig_root_ids = contig_finder.indexedRootIDs(contig_id);
+    covered_roots.insert(contig_root_ids.begin(), contig_root_ids.end());
     PredicateDomainInfo contig_id_info;
     contig_id_info.id = contig_id;
     contig_id_info.covered_ids = std::unordered_set<IterDomain*>(
@@ -2504,7 +2444,7 @@ int getUnswitchStopOffset(
   const auto gpu_lower = GpuLower::current();
 
   AxisHaloInfo halo_info =
-      gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id);
+      gpu_lower->haloInfo()->getRootAxisInfo(consumer_root_id);
 
   // If the consumer root domain to predicate does not have halo, no
   // adjustment is required.
@@ -2528,7 +2468,7 @@ int getUnswitchStopOffset(
           unswitch_it,
           consumer_tv->domain()->domain().end(),
           [&gpu_lower, &consumer_root_id](auto leaf_id) {
-            return gpu_lower->haloInfo().isHaloInherited(
+            return gpu_lower->haloInfo()->isHaloInherited(
                 consumer_root_id, leaf_id);
           })) {
     return halo_info.width();
@@ -2686,7 +2626,8 @@ std::pair<Val*, Val*> getStartAndStopLimitOffsets(
   Val* stop_limit = SimplifyingIrBuilder::negExpr(consumer_id->stopOffset());
 
   if (!non_divisible_pred) {
-    AxisHaloInfo halo_info = gpu_lower->haloInfo().getRootAxisInfo(consumer_id);
+    AxisHaloInfo halo_info =
+        gpu_lower->haloInfo()->getRootAxisInfo(consumer_id);
 
     // Below, "left" and "right" halo mean halo at offset zero and
     // axis extent, respectively.
@@ -2710,8 +2651,8 @@ std::pair<Val*, Val*> getStartAndStopLimitOffsets(
     // that it is less than the extent of the predicated ID +
     // halo. Note that getRootAxisInfo doesn't work since consumer_id
     // isn't a root domain.
-    if (gpu_lower->haloInfo().hasHaloWidth(consumer_id)) {
-      auto halo = gpu_lower->haloInfo().getHaloWidth(consumer_id);
+    if (gpu_lower->haloInfo()->hasHaloWidth(consumer_id)) {
+      auto halo = gpu_lower->haloInfo()->getHaloWidth(consumer_id);
       stop_limit = SimplifyingIrBuilder::addExpr(stop_limit, halo);
     }
   }
@@ -2858,8 +2799,8 @@ bool canOmitStopPredicate(
   // to be predicated, not its merged contig id even if it exists. So,
   // if contig_id does not have root axis info, contig_id is
   // guaranteed to have no halo.
-  auto halo_ext = gpu_lower->haloInfo().hasRootAxisInfo(contig_id)
-      ? gpu_lower->haloInfo().getRootAxisInfo(contig_id).width()
+  auto halo_ext = gpu_lower->haloInfo()->hasRootAxisInfo(contig_id)
+      ? gpu_lower->haloInfo()->getRootAxisInfo(contig_id).width()
       : 0;
 
   if (halo_ext + stop_offset_val.value() > 0) {
@@ -2977,14 +2918,6 @@ std::vector<RootPredicateInfo> Index::getReferenceRootPredicates(
 
   auto db_axis = gpu_lower->doubleBufferInfo().getDoubleBufferAxis(consumer_tv);
 
-  // Indexing is done without considering contig merging. Actual
-  // predicated domains are determined by considering contiguity.
-  const ContigIDs contig_finder(
-      consumer_tv->domain()->domain(),
-      consumer_tv->getMaybeRFactorDomain(),
-      std::vector<bool>(consumer_tv->getMaybeRFactorDomain().size(), false),
-      {});
-
   // Generate start and stop indexing from idgraph.
   //
   // Both start and stop positions may need to be predicated. Indexing
diff --git a/torch/csrc/jit/codegen/cuda/index_compute.h b/torch/csrc/jit/codegen/cuda/index_compute.h
index 43cde710fdfc4..9a94ee94ac09c 100644
--- a/torch/csrc/jit/codegen/cuda/index_compute.h
+++ b/torch/csrc/jit/codegen/cuda/index_compute.h
@@ -1,7 +1,6 @@
 #pragma once
 
 #include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/reference_tensor.h>
 #include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
 
 #include <unordered_map>
@@ -17,40 +16,40 @@
  * indices (based on input indices) that match the root dimension.
  *
  * For example with GLOBAL tensor:
- * TV[I, J]
- * TV[Io, Ii{4}, J] = TV.split(I, factor=4)
+ * TV[I, K]
+ * TV[Io, Ii{4}, K] = TV.split(I, factor=4)
  * ALLOC: NONE
  * INDEX: indexCompute {i, j, k} -> {i * 4 + j, k}
- * FLATTENED_INDEX: {i * 4 + j, k} -> {i * 4 + j * J + k}
+ * FLATTENED_INDEX: {i * 4 + j, k} -> {(i * 4 + j) * K + k}
  * PREDICATE: {i * 4 + j, k} -> i * 4 + j < I
  *
  *
  * For example with SHARED tensor:
  *
- * global_TV[I, J]
- * global_TV[Io, Ii{4}, J] = global_TV.split(I, factor=4)
+ * global_TV[I, K]
+ * global_TV[Io, Ii{4}, K] = global_TV.split(I, factor=4)
  * smem_TV.compute_at(global_TV, 1)
  * global_TV.parallelize(1, threadIDx.x)
  *
- * ALLOC: alloc(smem_TV, 4 x J)
+ * ALLOC: alloc(smem_TV, 4 x K)
  * INDEX: indexCompute(smem_TV, {threadIdx.x, k}) -> {threadIdx.x, k}
- * FLATTENED_INDEX: {threadIdx.x * 4 + j, k} -> {threadIdx.x * 4 + j * J + k}
+ * FLATTENED_INDEX: {threadIdx.x * 4 + j, k} -> {(threadIdx.x * 4 + j) * K + k}
  * PREDICATE: {threadIdx.x * 4 + j, k} -> threadIdx.x * 4 + j < I // Same as if
  * global
  *
  *
  * For example with LOCAL tensor:
- * global_TV[I, J, K]
- * global_TV[Io, Ii{4}, J] = global_TV.split(I, factor=4)
- * reg_TV.compute_at(global_TV, 1)
+ * global_TV[I, K, L]
+ * global_TV[Io, Ii{4}, K, L] = global_TV.split(I, factor=4)
+ * reg_TV.compute_at(global_TV, 2)
  * global_TV.parallelize(1, threadIDx.x)
  * global_TV{i, j, k, l} -> { i * 4 + j, k, l }
- * global_TV{ i * 4 + j, k, l } -> { i * 4 + j * J * K  +  k * K  +  l}
+ * global_TV{ i * 4 + j, k, l } -> { (i * 4 + j) * K * L  +  k * L  +  l}
  *
- * ALLOC: alloc(reg_TV, J x K)
+ * ALLOC: alloc(reg_TV, K x L)
  * INDEX: {k, l} -> {k, l}
- * FLATTENED_INDEX: {k, l} -> {k * J + l}
- * PREDICATE: i * 4 + j < I && k < J && l < K ->  // Same as if global
+ * FLATTENED_INDEX: {k, l} -> {k * L + l}
+ * PREDICATE: i * 4 + j < I && k < K && l < L ->  // Same as if global
  *
  * These indices can then be flattened later based on strides.
  */
@@ -62,6 +61,7 @@ namespace cuda {
 
 class ContigIDs;
 class LoopIndexing;
+struct IndexFromIdGraph;
 
 class IndexCompute : public BackwardVisitor {
  protected:
@@ -134,9 +134,8 @@ class IndexCompute : public BackwardVisitor {
   // if there's an option
   std::unordered_set<IterDomain*> preferred_paths_;
 
-  // Map from IterDomains to halo-extended extents in corresponding
-  // reference tensor
-  std::unordered_map<IterDomain*, Val*> reference_halo_extent_map_;
+  // Map from IterDomains to halo-extended extents
+  std::unordered_map<IterDomain*, Val*> halo_extent_map_;
 
   // Temporary flag which tells IndexCompute to use concrete id's from the exact
   // map rather than the actual IDs used in the ID expressions.
@@ -188,7 +187,7 @@ class IndexCompute : public BackwardVisitor {
       std::unordered_set<IterDomain*> zero_domains,
       std::unordered_set<IterDomain*> _zero_merged_in,
       std::unordered_set<IterDomain*> preferred_paths = {},
-      std::unordered_map<IterDomain*, Val*> reference_halo_extent_map = {});
+      std::unordered_map<IterDomain*, Val*> halo_extent_map = {});
 
   IndexCompute(
       const TensorDomain* _td,
@@ -198,7 +197,7 @@ class IndexCompute : public BackwardVisitor {
       std::unordered_set<IterDomain*> _zero_merged_in,
       const ContigIDs& contig_finder,
       std::unordered_set<IterDomain*> preferred_paths = {},
-      std::unordered_map<IterDomain*, Val*> reference_halo_extent_map = {});
+      std::unordered_map<IterDomain*, Val*> halo_extent_map = {});
 
   // Entry point used for using concrete id based traversal. This traversal is
   // assumed to start at leaf IDs provided by initial_index_map.
@@ -213,9 +212,7 @@ class IndexCompute : public BackwardVisitor {
   IndexCompute updateIndexCompute(
       const TensorDomain* new_td,
       const std::unordered_map<IterDomain*, IterDomain*>& id_map,
-      const ContigIDs& contig_finder,
-      const std::unordered_map<IterDomain*, Val*>& reference_halo_extent_map =
-          {}) const;
+      const ContigIDs& contig_finder) const;
 
   // Interface to run index traversal through loop indexing analysis result to
   // be used with the entry point for concrete id based traversal.
@@ -331,6 +328,15 @@ class Index {
       const TensorView* consumer,
       const std::vector<kir::ForLoop*>& loops);
 
+  // get the strides of a tensor used for the index lowering
+  static std::vector<Val*> getStrides(const TensorView* tv);
+
+  // get the root indices of a tensor used for the index lowering
+  static std::vector<Val*> getRootIndices(
+      const TensorView* tv,
+      const std::vector<kir::ForLoop*>& loops,
+      const IndexFromIdGraph& index_from_id_graph);
+
  public:
   // Indexing functions
   // Consumer = Producer
@@ -363,19 +369,28 @@ class Index {
       const TensorView* consumer,
       const std::vector<kir::ForLoop*>& loops);
 
-  //! Returns a vector of strided indices mapped onto the (rfactor)
+  //! Returns the logical index linearized from a multi-dimension address into a
+  //! linear memory address a consumer tensor. The returned index is intended to
+  //! be used for the computation of some tensor factories, such as: arange and
+  //! rand (for Philox pseudo random sequences)
+  static std::vector<Val*> getLinearLogicalIndex(
+      TensorView* consumer_tv,
+      const std::vector<kir::ForLoop*>& loops);
+
+  //! Returns a vector of logical indices mapped onto the (rfactor)
   //! root domain of a consumer tensor. The returned index is intended
-  //! to be used to index into arange or Philox pseudo random sequences
-  static std::vector<Val*> getLinearIndex(
+  //! to be used for the computation of some tensor factories, such as:
+  //! eye
+  static std::vector<Val*> getPerDimLogicalIndex(
       TensorView* consumer_tv,
       const std::vector<kir::ForLoop*>& loops);
 
   //! Take a consumer tensorview and loop nest and generates predicates
   //! associated with the concrete roots of the loop nest. Returns a list of
-  //! predicates, and a list of concrete roots they're associated with. It is
-  //! assumed that no predicate is required if index[i] is an index directly
-  //! from a for loop. This will not catch all cases if we actually have static
-  //! size information for example:
+  //! predicates, and a list of concrete roots they're associated with. It
+  //! is assumed that no predicate is required if index[i] is an index
+  //! directly from a for loop. This will not catch all cases if we actually
+  //! have static size information for example:
   //!
   //! TV[I].split(4)
   //! would produce the code:
@@ -384,14 +399,14 @@ class Index {
   //!     if( i * 4 + j < TV.size(0))
   //!       TV[i * 4 + j]...
   //!
-  //! However if we had TV.size[0] = 16 at "compile time" then we wouldn't need
-  //! the predicate. This will be caught by canOmitPredicate in the predicate
-  //! lowering
+  //! However if we had TV.size[0] = 16 at "compile time" then we wouldn't
+  //! need the predicate. This will be caught by canOmitPredicate in the
+  //! predicate lowering
   //!
-  //! unswitch_or_vec_loop is the for loop to start the unswitch like predicate,
-  //! this is not a bool value as if we have an unswitch loop with a vectorized
-  //! loop inside, we only want to base the "unswitch" like predicate on the
-  //! vectorized loop.
+  //! unswitch_or_vec_loop is the for loop to start the unswitch like
+  //! predicate, this is not a bool value as if we have an unswitch loop
+  //! with a vectorized loop inside, we only want to base the "unswitch"
+  //! like predicate on the vectorized loop.
   static std::vector<RootPredicateInfo> getReferenceRootPredicates(
       TensorView* consumer_tv,
       const std::vector<kir::ForLoop*>& loops,
diff --git a/torch/csrc/jit/codegen/cuda/inline_propagator.cpp b/torch/csrc/jit/codegen/cuda/inline_propagator.cpp
deleted file mode 100644
index a5edae083a32a..0000000000000
--- a/torch/csrc/jit/codegen/cuda/inline_propagator.cpp
+++ /dev/null
@@ -1,385 +0,0 @@
-#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
-
-#include <utility>
-
-namespace torch {
-namespace jit {
-namespace fuser {
-namespace cuda {
-
-MaxPosCalculator::MaxPosCalculator(
-    ComputeAtMode mode,
-    std::unordered_set<IterDomain*> uninlinable_ids)
-    : mode_(mode), uninlinable_ids_(std::move(uninlinable_ids)) {
-  buildUnmappableDims();
-}
-
-void MaxPosCalculator::buildUnmappableDims() {
-  ComputeAtRootDomainMap root_map;
-  root_map.build();
-
-  auto all_tvs = ir_utils::allTvs(FusionGuard::getCurFusion());
-  for (auto tv : all_tvs) {
-    auto consumers = ir_utils::consumerTvsOf(tv);
-    for (auto consumer : consumers) {
-      // Grab dimensions in producer and consumer that are mappable to eachother
-      // based on the computeAtRootDomainMap. This will tell us which dimensions
-      // can be inlined based on avoiding trying to inline non-trivial
-      // reduction structures.
-      auto mappable_roots =
-          root_map.getMappableDims(tv->domain(), consumer->domain());
-      for (auto tv_root_id : tv->getMaybeRFactorDomain()) {
-        if (mappable_roots.find(tv_root_id) == mappable_roots.end() &&
-            !tv_root_id->isTrivialReduction()) {
-          unmappable_dims_.emplace(tv_root_id);
-        }
-      }
-    }
-  }
-}
-
-bool MaxPosCalculator::isAllowedID(
-    IterDomain* id,
-    TensorView* tv,
-    bool allow_reduction,
-    bool allow_vectorize,
-    bool allow_unmappable) const {
-  bool allowed = true;
-
-  if (!allow_reduction) {
-    allowed = allowed && !id->isReduction();
-  }
-
-  if (uninlinable_ids_.count(id)) {
-    return false;
-  }
-
-  if (!allow_vectorize) {
-    // Avoid inlining if marked as Vectorize or Group. In the case of
-    // BestEffort and MostInlined modes, avoid Unroll as well.
-    bool is_vectorize = isParallelTypeVectorize(id->getParallelType()) ||
-        id->getParallelType() == ParallelType::Group ||
-        ((mode_ == ComputeAtMode::BestEffort ||
-          mode_ == ComputeAtMode::MostInlined) &&
-         id->getParallelType() == ParallelType::Unroll);
-    allowed = allowed && !is_vectorize;
-  }
-
-  if (!allow_unmappable) {
-    auto root_dom = tv->getMaybeRFactorDomain();
-    std::unordered_set<Val*> root_dom_set(root_dom.begin(), root_dom.end());
-    auto all_vals = DependencyCheck::getAllValsBetween(root_dom_set, {id});
-    bool is_unmappable = false;
-    for (auto val : all_vals) {
-      auto id = val->as<IterDomain>();
-      if (root_dom_set.count(val) > 0 && unmappable_dims_.count(id) > 0) {
-        is_unmappable = true;
-        break;
-      }
-    }
-    allowed = allowed && !is_unmappable;
-  }
-
-  return allowed;
-}
-
-size_t MaxPosCalculator::getMaxPosSelf(
-    TensorView* tv,
-    bool allow_reduction,
-    bool allow_vectorize,
-    bool allow_unmappable) const {
-  auto dom = tv->domain()->domain();
-  auto iter = std::find_if(dom.begin(), dom.end(), [=](IterDomain* id) {
-    return !isAllowedID(
-        id, tv, allow_reduction, allow_vectorize, allow_unmappable);
-  });
-  return std::distance(dom.begin(), iter);
-}
-
-// Return the max position in producer that can be inlined to consumer
-// Cannot inline:
-//   Vectorized dimensions in consumer
-//   Unrolled dimensions in consumer
-size_t MaxPosCalculator::getMaxProducerPosFromConsumer(
-    TensorView* producer,
-    TensorView* consumer) const {
-  auto pairwise_root_map = PairwiseRootDomainMap(producer, consumer);
-  auto replay_CasP =
-      BestEffortReplay::replayCasP(consumer, producer, -1, pairwise_root_map);
-  auto p2c_replay_map = replay_CasP.getReplay();
-
-  for (size_t producer_pos = 0; producer_pos < producer->nDims();
-       producer_pos++) {
-    // If the producer position is mismatching with the consumer, then we can
-    // not inline into this position, otherwise the max producer position of
-    // the consumer will become invalid and expression sort will fail.
-    if (TransformReplay::getMatchedLeafPosWithoutReplayCasP(
-            consumer, producer, producer_pos + 1) < 0) {
-      return producer_pos;
-    }
-    auto map_it = p2c_replay_map.find(producer->axis(producer_pos));
-    if (map_it != p2c_replay_map.end()) {
-      auto c_id = map_it->second;
-      if (!isAllowedID(c_id, consumer, true, false, true)) {
-        return producer_pos;
-      }
-    }
-  }
-  return producer->nDims();
-}
-
-size_t InlinePropagator::getMaxPosAll(TensorView* tv, bool check_siblings) {
-  auto max_pos = max_pos_calc.getMaxPosSelf(tv, false, false, false);
-  for (auto consumer_tv : ir_utils::consumerTvsOf(tv)) {
-    max_pos = std::min<size_t>(
-        max_pos, max_pos_calc.getMaxProducerPosFromConsumer(tv, consumer_tv));
-  }
-  if (check_siblings) {
-    for (auto sibling_tv : ir_utils::siblingTvsOf(tv)) {
-      max_pos = std::min<size_t>(max_pos, getMaxPosAll(sibling_tv, false));
-    }
-  }
-  return max_pos;
-}
-
-void InlinePropagator::setCAPos(TensorView* tv) {
-  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
-  size_t pos = mapped_reference_pos_.at(tv);
-  if (debug) {
-    std::cout << "  Setting CA pos of " << tv << ":" << std::endl;
-    std::cout << "    mapped position: " << pos << std::endl;
-  }
-  if ((selected_.empty() || selected_.count(tv)) && !tv->isFusionInput()) {
-    auto max_pos = getMaxPosAll(tv);
-    if (debug) {
-      std::cout << "    max inlinable position: " << max_pos << std::endl;
-    }
-    if (mode_ == ComputeAtMode::Standard) {
-      TORCH_INTERNAL_ASSERT(
-          pos <= max_pos,
-          "Invalid compute at position detected in InlinePropagator when trying to set the CA position of: ",
-          tv,
-          " to ",
-          pos,
-          ",  max position that's allowed is ",
-          max_pos);
-    } else if (mode_ == ComputeAtMode::BestEffort) {
-      pos = std::min<size_t>(pos, max_pos);
-    } else {
-      pos = max_pos;
-    }
-    // hoist inner most broadcast
-    while (pos > 0 && tv->axis(pos - 1)->isBroadcast()) {
-      pos--;
-    }
-    auto current_ca_pos = tv->getComputeAtPosition();
-    if (debug) {
-      std::cout << "    current CA position: " << current_ca_pos << std::endl;
-    }
-    if (pos > current_ca_pos) {
-      if (debug) {
-        std::cout << "    new CA position: " << pos << std::endl;
-      }
-      tv->setComputeAt(pos);
-      for (auto consumer_tv : ir_utils::consumerTvsOf(tv)) {
-        needs_update_max_producer_.insert(consumer_tv);
-      }
-    } else if (debug) {
-      std::cout << "    CA position not changed" << std::endl;
-    }
-  } else if (debug) {
-    std::cout << "    tensor not selected, skip" << std::endl;
-  }
-}
-
-InlinePropagator::InlinePropagator(
-    TensorView* reference,
-    int64_t reference_pos,
-    ComputeAtMode mode,
-    std::unordered_set<TensorView*> selected,
-    std::unordered_set<IterDomain*> uninlinable_ids)
-    : max_pos_calc(mode, std::move(uninlinable_ids)),
-      selected_(std::move(selected)),
-      reference_(reference),
-      mode_(mode) {
-  if (reference_pos < 0) {
-    reference_pos += int64_t(reference->nDims()) + 1;
-  }
-  TORCH_INTERNAL_ASSERT(
-      reference_pos >= 0 && reference_pos <= reference->nDims(),
-      "Invalid computeAt axis, received ",
-      reference_pos,
-      " but should be > -",
-      reference->nDims(),
-      " and <= ",
-      reference->nDims(),
-      ".");
-  reference_pos_ = reference_pos;
-}
-
-void InlinePropagator::setUp() {
-  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
-  mapped_reference_pos_[reference_] = reference_pos_;
-  if (debug) {
-    std::cout << "InlinePropagator::setUp" << std::endl;
-    std::cout << "  reference: " << reference_ << " @ " << reference_pos_
-              << std::endl;
-  }
-  setCAPos(reference_);
-}
-
-namespace {
-
-// Try to find the aligned position on consumer's domain corresponding to the
-//  compute at position of producer domain. Used in InlinePropagator pass only.
-//  No checking on actual producer-consumer relationship.
-unsigned int getConsumerPosAlignedToProducerCA(
-    TensorView* consumer,
-    TensorView* producer) {
-  // Locate consumer's position that aligns with
-  //  the producer's new compute at axis. We need broadcast axes forwarded so we
-  //  need to replay PasC as CasP will not forward braodcast dims. For example
-  //  if we have:
-  // T2[ iS22{( 3 * 1 )} ] ca_pos( 1 ) = broadcast( T1[ iS1{3} ] ca_pos( 1 )
-  // produce_pos( 1) ) CasP will have the mapping iS1{3} -> iS2{3} and PasC will
-  // have the mapping iS22{( 3 * 1 )} <- iS1{3} We need the latter. Refer to
-  // NVFuserTest.FusionComplexBCast1_CUDA
-
-  auto disjoint_sets =
-      BestEffortReplay::replayPasC(
-          producer, consumer, -1, PairwiseRootDomainMap(producer, consumer))
-          .getDisjointSets();
-
-  // Find the innermost position of consumer that has
-  //  been mapped within the producer ca axis.
-  unsigned int consumer_pos = consumer->nDims();
-  while (consumer_pos > 0) {
-    auto consumer_id = consumer->axis((int)consumer_pos - 1);
-    auto p_dom = producer->domain()->domain();
-    if (std::any_of(
-            p_dom.begin(),
-            p_dom.begin() + producer->getComputeAtPosition(),
-            [&consumer_id, &disjoint_sets](IterDomain* p_id) {
-              return disjoint_sets.permissiveAreMapped(consumer_id, p_id);
-            })) {
-      break;
-    }
-    consumer_pos--;
-  }
-
-  return consumer_pos;
-}
-
-} // namespace
-
-void InlinePropagator::tearDown() {
-  for (auto consumer : needs_update_max_producer_) {
-    unsigned int consumer_pos = 0;
-    for (auto producer : ir_utils::producerTvsOf(consumer)) {
-      consumer_pos = std::max(
-          consumer_pos, getConsumerPosAlignedToProducerCA(consumer, producer));
-    }
-    consumer->setMaxProducer(consumer_pos);
-  }
-}
-
-void InlinePropagator::propagateC2P(TensorView* from, TensorView* to) {
-  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
-  if (debug) {
-    std::cout << "InlinePropagator::propagateC2P" << std::endl;
-    std::cout << "  from: " << from << std::endl;
-    std::cout << "  to: " << to << std::endl;
-  }
-  // Step 1: find mapped_reference_pos_[to]
-  int from_pos = mapped_reference_pos_.at(from);
-  auto to_pos =
-      TransformReplay::getMatchedLeafPosWithoutReplayPasC(to, from, from_pos);
-  if (mode_ == ComputeAtMode::Standard) {
-    TORCH_CHECK(
-        to_pos >= 0,
-        "Unable to propagate CA position from consumer ",
-        from,
-        " at ",
-        from_pos,
-        " to producer ",
-        to,
-        " because this would require replay.");
-  } else {
-    // For MostInlined and BestEffort inline propagation, we allow the DAG to
-    // be not replayed fully consistently. For such case, we just don't inline
-    // into the mismatched dimension.
-    while (to_pos < 0) {
-      from_pos--;
-      to_pos = TransformReplay::getMatchedLeafPosWithoutReplayPasC(
-          to, from, from_pos);
-    }
-  }
-  mapped_reference_pos_[to] = to_pos;
-  // Step 2: set CA position of `to`
-  setCAPos(to);
-}
-
-void InlinePropagator::propagateP2C(TensorView* from, TensorView* to) {
-  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
-  if (debug) {
-    std::cout << "InlinePropagator::propagateP2C" << std::endl;
-    std::cout << "  from: " << from << std::endl;
-    std::cout << "  to: " << to << std::endl;
-  }
-  // Step 1: find mapped_reference_pos_[to]
-  int from_pos = mapped_reference_pos_.at(from);
-  auto to_pos =
-      TransformReplay::getMatchedLeafPosWithoutReplayCasP(to, from, from_pos);
-  if (mode_ == ComputeAtMode::Standard) {
-    TORCH_CHECK(
-        to_pos >= 0,
-        "Unable to propagate CA position from producer ",
-        from,
-        " at ",
-        from_pos,
-        " to consumer ",
-        to,
-        " because this would require replay.");
-  } else {
-    // For MostInlined and BestEffort inline propagation, we allow the DAG to
-    // be not replayed fully consistently. For such case, we just don't inline
-    // into the mismatched dimension.
-    while (to_pos < 0) {
-      from_pos--;
-      to_pos = TransformReplay::getMatchedLeafPosWithoutReplayCasP(
-          to, from, from_pos);
-    }
-  }
-  mapped_reference_pos_[to] = to_pos;
-  // Step 2: set CA position of `to`
-  setCAPos(to);
-}
-
-void InlinePropagator::propagateSibling(TensorView* from, TensorView* to) {
-  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
-  if (debug) {
-    std::cout << "InlinePropagator::propagateSibling" << std::endl;
-    std::cout << "  from: " << from << std::endl;
-    std::cout << "  to: " << to << std::endl;
-  }
-  // Step 1: find mapped_reference_pos_[to]
-  auto from_pos = mapped_reference_pos_.at(from);
-  TORCH_CHECK(
-      TransformReplay::fullSelfMatching(to, from),
-      "Unable to propagate CA position from ",
-      from,
-      " to sibling ",
-      to,
-      " because this would require replay.");
-  mapped_reference_pos_[to] = from_pos;
-  // Step 2: set CA position of `to`
-  setCAPos(to);
-}
-
-} // namespace cuda
-} // namespace fuser
-} // namespace jit
-} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/inline_propagator.h b/torch/csrc/jit/codegen/cuda/inline_propagator.h
deleted file mode 100644
index d1bdeebd06d63..0000000000000
--- a/torch/csrc/jit/codegen/cuda/inline_propagator.h
+++ /dev/null
@@ -1,118 +0,0 @@
-#pragma once
-
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-
-#include <unordered_set>
-
-namespace torch {
-namespace jit {
-namespace fuser {
-namespace cuda {
-
-class TORCH_CUDA_CU_API MaxPosCalculator {
-  ComputeAtMode mode_ = ComputeAtMode::Standard;
-
-  // Root domains in producer that's unmappable to any of its consumers
-  std::unordered_set<IterDomain*> unmappable_dims_;
-
-  // User set IterDomains to not inline, used in schedulers to avoid inlining
-  // trivial reductions
-  std::unordered_set<IterDomain*> uninlinable_ids_;
-
-  // Iterate through all TVs and collect the dimensions of each TV that don't
-  // map to all its consumer TVs.
-  void buildUnmappableDims();
-
-  // Utility function to return if an id of tv is a valid iter domain to inline
-  // within. This is used in getMaxPos{PasC,CasP}. Different variations of the
-  // bool values are used if checking max position of PasC, CasP, or checking
-  // for a max "self" position.
-  bool isAllowedID(
-      IterDomain* id,
-      TensorView* tv,
-      bool allow_reduction,
-      bool allow_vectorize,
-      bool allow_unmappable) const;
-
- public:
-  // Returns the position at which tv can be inlined within.
-  size_t getMaxPosSelf(
-      TensorView* tv,
-      bool allow_reduction,
-      bool allow_vectorize,
-      bool allow_unmappable) const;
-
-  // Returns the maximum position producer can be inlined based on consumer
-  // given the set ComputeAtMode
-  size_t getMaxProducerPosFromConsumer(
-      TensorView* producer,
-      TensorView* consumer) const;
-
-  MaxPosCalculator(
-      ComputeAtMode mode,
-      std::unordered_set<IterDomain*> uninlinable_ids = {});
-};
-
-// Propagate inline position to the `selected` tensors in the DAG. If `selected`
-// is not specified or empty, then propagate to the entire DAG.
-class TORCH_CUDA_CU_API InlinePropagator
-    : public MaxInfoSpanningTree::Propagator {
-  // Checks producers and consumers to see what the maximum position in tv is
-  // that can be shared across both directions.
-  size_t getMaxPosAll(TensorView* tv, bool check_siblings = true);
-
-  // We use mapped_reference_pos_ to keep track of the outer axes information of
-  // the reference tensor. That is, mapped_reference_pos_[tv] answers the
-  // question "What outer axes in tv are shared with the specified reference
-  // tensor's outer axes?". However, when we actually set the CA position of tv,
-  // we might not want to set it as mapped_reference_pos_[tv] because because we
-  // don't want to inline certain things (such as vectorized dimensions, inner
-  // most broadcasting, etc.).
-  std::unordered_map<TensorView*, size_t> mapped_reference_pos_;
-
-  // Actually set the computeAt position. This does not necessarily equal to
-  // mapped_reference_pos_[tv] because we don't want to inline certain things.
-  void setCAPos(TensorView* tv);
-
-  const MaxPosCalculator max_pos_calc;
-  std::unordered_set<TensorView*> selected_;
-  std::unordered_set<TensorView*> needs_update_max_producer_;
-  TensorView* reference_;
-  size_t reference_pos_;
-  ComputeAtMode mode_ = ComputeAtMode::Standard;
-
- public:
-  InlinePropagator(
-      TensorView* reference,
-      int64_t reference_pos,
-      ComputeAtMode mode = ComputeAtMode::Standard,
-      std::unordered_set<TensorView*> selected = {},
-      std::unordered_set<IterDomain*> uninlinable_ids = {});
-
-  InlinePropagator(
-      TensorView* reference,
-      int64_t reference_pos,
-      std::unordered_set<TensorView*> selected)
-      : InlinePropagator(
-            reference,
-            reference_pos,
-            ComputeAtMode::Standard,
-            selected) {}
-
-  ~InlinePropagator() = default;
-
-  // Actually propagate the transformations for the inlining pass. Uses the
-  // functions above to figure out what position to do the propagation at.
-  virtual void setUp() override;
-  virtual void propagateC2P(TensorView* from, TensorView* to) override;
-  virtual void propagateP2C(TensorView* from, TensorView* to) override;
-  virtual void propagateSibling(TensorView* from, TensorView* to) override;
-  virtual void tearDown() override;
-};
-
-} // namespace cuda
-} // namespace fuser
-} // namespace jit
-} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/inlining.cpp b/torch/csrc/jit/codegen/cuda/inlining.cpp
new file mode 100644
index 0000000000000..da6d229c68f8b
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/inlining.cpp
@@ -0,0 +1,306 @@
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
+#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
+
+#include <utility>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+MaxPosCalculator::MaxPosCalculator(
+    const std::unordered_set<IterDomain*>& uninlinable_ids)
+    : uninlinable_ids_(uninlinable_ids) {
+  buildUnmappableDims();
+}
+
+void MaxPosCalculator::buildUnmappableDims() {
+  ComputeAtRootDomainMap root_map;
+  root_map.build();
+  auto all_tvs = ir_utils::allTvs(FusionGuard::getCurFusion());
+  for (auto tv : all_tvs) {
+    auto consumers = ir_utils::consumerTvsOf(tv);
+    for (auto consumer : consumers) {
+      // Grab dimensions in producer and consumer that are mappable to eachother
+      // based on the computeAtRootDomainMap. This will tell us which dimensions
+      // can be inlined based on avoiding trying to inline non-trivial
+      // reduction structures.
+      auto mappable_roots =
+          root_map.getMappableDims(tv->domain(), consumer->domain());
+      for (auto tv_root_id : tv->getMaybeRFactorDomain()) {
+        if (mappable_roots.find(tv_root_id) == mappable_roots.end() &&
+            !tv_root_id->isTrivialReduction()) {
+          unmappable_dims_.emplace(tv_root_id);
+        }
+      }
+    }
+  }
+}
+
+bool MaxPosCalculator::isAllowedID(
+    IterDomain* id,
+    TensorView* tv,
+    bool best_effort,
+    bool allow_reduction,
+    bool allow_vectorize,
+    bool allow_unmappable) const {
+  bool allowed = true;
+
+  if (!allow_reduction) {
+    allowed = allowed && !id->isReduction();
+  }
+
+  if (uninlinable_ids_.count(id)) {
+    return false;
+  }
+
+  if (!allow_vectorize) {
+    // Avoid inlining if marked as Vectorize or Group. In the case of
+    // BestEffort and MostInlined modes, avoid Unroll as well.
+    bool is_vectorize = isParallelTypeVectorize(id->getParallelType()) ||
+        id->getParallelType() == ParallelType::Group ||
+        (best_effort && id->getParallelType() == ParallelType::Unroll);
+    allowed = allowed && !is_vectorize;
+  }
+
+  if (!allow_unmappable) {
+    auto root_dom = tv->getMaybeRFactorDomain();
+    std::unordered_set<Val*> root_dom_set(root_dom.begin(), root_dom.end());
+    auto all_vals = DependencyCheck::getAllValsBetween(root_dom_set, {id});
+    bool is_unmappable = false;
+    for (auto val : all_vals) {
+      auto id = val->as<IterDomain>();
+      if (root_dom_set.count(val) > 0 && unmappable_dims_.count(id) > 0) {
+        is_unmappable = true;
+        break;
+      }
+    }
+    allowed = allowed && !is_unmappable;
+  }
+
+  return allowed;
+}
+
+size_t MaxPosCalculator::getMaxPosSelf(
+    TensorView* tv,
+    bool best_effort,
+    bool allow_reduction,
+    bool allow_vectorize,
+    bool allow_unmappable) const {
+  auto dom = tv->domain()->domain();
+  auto iter = std::find_if(dom.begin(), dom.end(), [=](IterDomain* id) {
+    return !isAllowedID(
+        id,
+        tv,
+        best_effort,
+        allow_reduction,
+        allow_vectorize,
+        allow_unmappable);
+  });
+  return std::distance(dom.begin(), iter);
+}
+
+// Return the max position in producer that can be inlined to consumer
+// Cannot inline:
+//   Vectorized dimensions in consumer
+//   Unrolled dimensions in consumer
+size_t MaxPosCalculator::getMaxProducerPosFromConsumer(
+    TensorView* producer,
+    TensorView* consumer,
+    bool best_effort) const {
+  auto pairwise_root_map = PairwiseRootDomainMap(producer, consumer);
+  auto replay_CasP =
+      BestEffortReplay::replayCasP(consumer, producer, -1, pairwise_root_map);
+  auto p2c_replay_map = replay_CasP.getReplay();
+
+  for (size_t producer_pos = 0; producer_pos < producer->nDims();
+       producer_pos++) {
+    // If the producer position is mismatching with the consumer, then we can
+    // not inline into this position, otherwise the max producer position of
+    // the consumer will become invalid and expression sort will fail.
+    if (TransformReplay::getMatchedLeafPosWithoutReplayCasP(
+            consumer, producer, producer_pos + 1) < 0) {
+      return producer_pos;
+    }
+    auto map_it = p2c_replay_map.find(producer->axis(producer_pos));
+    if (map_it != p2c_replay_map.end()) {
+      auto c_id = map_it->second;
+      if (!isAllowedID(c_id, consumer, best_effort, true, false, true)) {
+        return producer_pos;
+      }
+    }
+  }
+  return producer->nDims();
+}
+
+size_t MaxPosCalculator::getMaxPosAll(
+    TensorView* tv,
+    bool best_effort,
+    bool check_siblings) {
+  auto max_pos = getMaxPosSelf(tv, best_effort, false, false, false);
+  for (auto consumer_tv : ir_utils::consumerTvsOf(tv)) {
+    max_pos = std::min<size_t>(
+        max_pos, getMaxProducerPosFromConsumer(tv, consumer_tv, best_effort));
+  }
+  if (check_siblings) {
+    for (auto sibling_tv : ir_utils::siblingTvsOf(tv)) {
+      max_pos = std::min<size_t>(
+          max_pos, getMaxPosAll(sibling_tv, best_effort, false));
+    }
+  }
+  return max_pos;
+}
+
+void inlineMost(const std::unordered_set<IterDomain*>& uninlinable_ids) {
+  inlineMost(ir_utils::allTvs(FusionGuard::getCurFusion()), uninlinable_ids);
+}
+
+void inlineMost(
+    const std::vector<TensorView*>& tvs,
+    const std::unordered_set<IterDomain*>& uninlinable_ids) {
+  if (tvs.empty()) {
+    return;
+  }
+  MaxPosCalculator calc(uninlinable_ids);
+  for (auto tv : tvs) {
+    tv->inlineAt(-1, true, &calc);
+  }
+}
+
+void inlineMost(
+    const std::unordered_set<TensorView*>& tvs,
+    const std::unordered_set<IterDomain*>& uninlinable_ids) {
+  if (tvs.empty()) {
+    return;
+  }
+  MaxPosCalculator calc(uninlinable_ids);
+  for (auto tv : tvs) {
+    tv->inlineAt(-1, true, &calc);
+  }
+}
+
+namespace {
+
+// Find the positions of `selected` tensors that is mapped to the given position
+// in the reference tensor.
+class FindMappedPositions : public MaxInfoSpanningTree::Propagator {
+  std::unordered_map<TensorView*, size_t>& output_;
+
+ public:
+  FindMappedPositions(
+      std::unordered_map<TensorView*, size_t>& output,
+      TensorView* reference,
+      int64_t reference_pos);
+
+  ~FindMappedPositions() = default;
+
+  virtual void propagateC2P(TensorView* from, TensorView* to) override;
+  virtual void propagateP2C(TensorView* from, TensorView* to) override;
+  virtual void propagateSibling(TensorView* from, TensorView* to) override;
+};
+
+FindMappedPositions::FindMappedPositions(
+    std::unordered_map<TensorView*, size_t>& output,
+    TensorView* reference,
+    int64_t reference_pos)
+    : output_(output) {
+  if (reference_pos < 0) {
+    reference_pos += int64_t(reference->nDims()) + 1;
+  }
+  TORCH_CHECK(
+      reference_pos >= 0 && reference_pos <= reference->nDims(),
+      "Invalid axis received ",
+      reference_pos,
+      " but should be > -",
+      reference->nDims(),
+      " and <= ",
+      reference->nDims(),
+      ".");
+  output_[reference] = reference_pos;
+}
+
+void FindMappedPositions::propagateC2P(TensorView* from, TensorView* to) {
+  int from_pos = output_.at(from);
+  auto to_pos =
+      TransformReplay::getMatchedLeafPosWithoutReplayPasC(to, from, from_pos);
+  // If there is no matching position found, we compute the highest matched
+  // position as the closest approximation
+  while (to_pos < 0) {
+    from_pos--;
+    to_pos =
+        TransformReplay::getMatchedLeafPosWithoutReplayPasC(to, from, from_pos);
+  }
+  output_[to] = to_pos;
+}
+
+void FindMappedPositions::propagateP2C(TensorView* from, TensorView* to) {
+  int from_pos = output_.at(from);
+  auto to_pos =
+      TransformReplay::getMatchedLeafPosWithoutReplayCasP(to, from, from_pos);
+  // If there is no matching position found, we compute the highest matched
+  // position as the closest approximation
+  while (to_pos < 0) {
+    from_pos--;
+    to_pos =
+        TransformReplay::getMatchedLeafPosWithoutReplayCasP(to, from, from_pos);
+  }
+  output_[to] = to_pos;
+}
+
+void FindMappedPositions::propagateSibling(TensorView* from, TensorView* to) {
+  auto from_pos = output_.at(from);
+  TORCH_CHECK(
+      TransformReplay::fullSelfMatching(to, from),
+      "Transformations in siblings ",
+      from,
+      " and ",
+      to,
+      " does not match with each other.");
+  output_[to] = from_pos;
+}
+
+std::unordered_map<TensorView*, size_t> getPositionsMappedTo(
+    TensorView* reference_tv,
+    int64_t reference_pos) {
+  std::unordered_map<TensorView*, size_t> mapped_positions;
+  MaxRootDomainInfoSpanningTree tree(reference_tv, reference_pos);
+  FindMappedPositions propagator(mapped_positions, reference_tv, reference_pos);
+  tree.traverse(&propagator);
+  return mapped_positions;
+}
+
+} // namespace
+
+void inlineAllAt(
+    TensorView* reference_tv,
+    int64_t reference_pos,
+    bool best_effort,
+    const std::unordered_set<IterDomain*>& uninlinable_ids) {
+  auto mapped_positions = getPositionsMappedTo(reference_tv, reference_pos);
+  MaxPosCalculator calc(uninlinable_ids);
+  for (auto pair : mapped_positions) {
+    pair.first->inlineAt(pair.second, best_effort, &calc);
+  }
+}
+
+void inlineSelectedAt(
+    const std::unordered_set<TensorView*>& selected,
+    TensorView* reference_tv,
+    int64_t reference_pos,
+    bool best_effort,
+    const std::unordered_set<IterDomain*>& uninlinable_ids) {
+  auto mapped_positions = getPositionsMappedTo(reference_tv, reference_pos);
+  MaxPosCalculator calc(uninlinable_ids);
+  for (auto pair : mapped_positions) {
+    if (selected.count(pair.first) > 0) {
+      pair.first->inlineAt(pair.second, best_effort, &calc);
+    }
+  }
+}
+
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/inlining.h b/torch/csrc/jit/codegen/cuda/inlining.h
new file mode 100644
index 0000000000000..3b15eb23f9877
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/inlining.h
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
+#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+
+#include <memory>
+#include <unordered_set>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+class MaxPosCalculator {
+  // Root domains in producer that's unmappable to any of its consumers
+  std::unordered_set<IterDomain*> unmappable_dims_;
+
+  // User set IterDomains to not inline, used in schedulers to avoid inlining
+  // trivial reductions
+  std::unordered_set<IterDomain*> uninlinable_ids_;
+
+  // Iterate through all TVs and collect the dimensions of each TV that don't
+  // map to all its consumer TVs.
+  void buildUnmappableDims();
+
+  // Utility function to return if an id of tv is a valid iter domain to inline
+  // within. This is used in getMaxPos{PasC,CasP}. Different variations of the
+  // bool values are used if checking max position of PasC, CasP, or checking
+  // for a max "self" position.
+  bool isAllowedID(
+      IterDomain* id,
+      TensorView* tv,
+      bool best_effort,
+      bool allow_reduction,
+      bool allow_vectorize,
+      bool allow_unmappable) const;
+
+ public:
+  // Returns the position at which tv can be inlined within.
+  size_t getMaxPosSelf(
+      TensorView* tv,
+      bool best_effort,
+      bool allow_reduction,
+      bool allow_vectorize,
+      bool allow_unmappable) const;
+
+  // Returns the maximum position producer can be inlined based on consumer
+  // given the set ComputeAtMode
+  size_t getMaxProducerPosFromConsumer(
+      TensorView* producer,
+      TensorView* consumer,
+      bool best_effort) const;
+
+  // Checks producers, consumers, and siblings to see what the maximum position
+  // in tv is that can be shared across both directions.
+  size_t getMaxPosAll(
+      TensorView* tv,
+      bool best_effort = false,
+      bool check_siblings = true);
+
+  MaxPosCalculator(const std::unordered_set<IterDomain*>& uninlinable_ids = {});
+};
+
+// Inline to the right most allowed position for all tensors in the current
+// fusion.
+TORCH_CUDA_CU_API void inlineMost(
+    const std::unordered_set<IterDomain*>& uninlinable_ids = {});
+// Inline to the right most allowed position for the selected tensors in the
+// current fusion.
+TORCH_CUDA_CU_API void inlineMost(
+    const std::vector<TensorView*>& tvs,
+    const std::unordered_set<IterDomain*>& uninlinable_ids = {});
+// Inline to the right most allowed position for the selected tensors in the
+// current fusion.
+TORCH_CUDA_CU_API void inlineMost(
+    const std::unordered_set<TensorView*>& tvs,
+    const std::unordered_set<IterDomain*>& uninlinable_ids = {});
+
+// Inline to the position corresponding to the reference position in the
+// reference tensor for all tensors in the current fusion.
+TORCH_CUDA_CU_API void inlineAllAt(
+    TensorView* reference_tv,
+    int64_t reference_pos,
+    bool best_effort = false,
+    const std::unordered_set<IterDomain*>& uninlinable_ids = {});
+
+// Inline to the position corresponding to the reference position in the
+// reference tensor for selected tensors in the current fusion.
+TORCH_CUDA_CU_API void inlineSelectedAt(
+    const std::unordered_set<TensorView*>& selected,
+    TensorView* reference_tv,
+    int64_t reference_pos,
+    bool best_effort = false,
+    const std::unordered_set<IterDomain*>& uninlinable_ids = {});
+
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/interface.cpp b/torch/csrc/jit/codegen/cuda/interface.cpp
index 664f14d26c759..6b1fa7c44f9c5 100644
--- a/torch/csrc/jit/codegen/cuda/interface.cpp
+++ b/torch/csrc/jit/codegen/cuda/interface.cpp
@@ -655,6 +655,62 @@ RegisterOperators reg_add_optional({
         aliasAnalysisFromSchema()),
 });
 
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_permute_copy({
+    Operator(
+        "prim::permute_copy(Tensor(a) self, int[] dims) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "permute_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, dims;
+            pop(stack, self, dims);
+            push(stack, at::native::view(self.toTensor(), dims.toIntVector()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_transpose_copy({
+    Operator(
+        "prim::transpose_copy.int(Tensor(a) self, int dim0, int dim1) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "transpose_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self, dim0, dim1;
+            pop(stack, self, dim0, dim1);
+            push(
+                stack,
+                at::transpose(self.toTensor(), dim0.toInt(), dim1.toInt()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+RegisterOperators reg_t_copy({
+    Operator(
+        "prim::t_copy(Tensor(a) self) -> Tensor",
+        [](const Node* node) -> Operation {
+          return [node](Stack& stack) {
+            TORCH_CHECK(
+                node->s(attr::name) == "CudaFusionGroup",
+                "t_copy is only used by nvfuser to identify non-mutating ",
+                "alias ops, should be restored after fusion pass!");
+            IValue self;
+            pop(stack, self);
+            push(stack, at::t(self.toTensor()));
+          };
+        },
+        aliasAnalysisFromSchema()),
+});
+
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 RegisterOperators reg_view_copy({
     Operator(
diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp
index b29a8bc417cd0..ff00f659da637 100644
--- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp
+++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp
@@ -341,6 +341,12 @@ void Expr::setPredicate(kir::Predicate* predicate) {
   predicate_ = predicate;
 }
 
+Expr* Expr::withPredicate(kir::Predicate* predicate) {
+  auto result = shallowCopy();
+  result->setPredicate(predicate);
+  return result;
+}
+
 kir::Predicate* Expr::writePredicate() const {
   TORCH_INTERNAL_ASSERT(
       container()->isA<kir::Kernel>(), "Function invalid for fusion.");
@@ -353,6 +359,19 @@ void Expr::setWritePredicate(kir::Predicate* write_predicate) {
   write_predicate_ = write_predicate;
 }
 
+Expr* Expr::withWritePredicate(kir::Predicate* predicate) {
+  auto result = shallowCopy();
+  result->setWritePredicate(predicate);
+  return result;
+}
+
+void Expr::copyPredicatesFrom(const Expr* expr) {
+  if (container()->isA<kir::Kernel>()) {
+    predicate_ = expr->predicate_;
+    write_predicate_ = expr->write_predicate_;
+  }
+}
+
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h b/torch/csrc/jit/codegen/cuda/ir_base_nodes.h
index 7d5ebad25282b..dadabe167ebfc 100644
--- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h
+++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.h
@@ -426,6 +426,10 @@ class TORCH_CUDA_CU_API Expr : public Statement {
 
   Expr(const Expr* src, IrCloner* ir_cloner);
 
+  // Creates a new instance of the expression with all its field copied.
+  // Note that unlike IrCloner, this function only do a shallow copy
+  virtual Expr* shallowCopy() const = 0;
+
   c10::optional<ExprType> getExprType() const override {
     return etype_;
   }
@@ -466,16 +470,27 @@ class TORCH_CUDA_CU_API Expr : public Statement {
   // TODO: Protect based on being in kernel container
   kir::Predicate* predicate() const;
 
+  // Creates a shallow copy the expression with the given predicate attached.
   // TODO: Protect based on being in kernel container
-  void setPredicate(kir::Predicate* predicate);
+  Expr* withPredicate(kir::Predicate* predicate);
 
   // TODO: Protect based on being in kernel container
   kir::Predicate* writePredicate() const;
 
+  // Creates a shallow copy the expression with the given write-predicate
+  // attached.
   // TODO: Protect based on being in kernel container
-  void setWritePredicate(kir::Predicate* write_predicate);
+  Expr* withWritePredicate(kir::Predicate* write_predicate);
 
  protected:
+  // TODO: Protect based on being in kernel container
+  void setPredicate(kir::Predicate* predicate);
+
+  // TODO: Protect based on being in kernel container
+  void setWritePredicate(kir::Predicate* write_predicate);
+
+  void copyPredicatesFrom(const Expr* expr);
+
   // TODO: Add Fusion passkey
   void addInput(Val* input) {
     TORCH_INTERNAL_ASSERT(input != nullptr);
diff --git a/torch/csrc/jit/codegen/cuda/ir_builder.cpp b/torch/csrc/jit/codegen/cuda/ir_builder.cpp
index 189bd7aa666eb..f0fd438c15672 100644
--- a/torch/csrc/jit/codegen/cuda/ir_builder.cpp
+++ b/torch/csrc/jit/codegen/cuda/ir_builder.cpp
@@ -60,7 +60,9 @@ IR_BUILDER_INSTANTIATE(ShiftOp)
 IR_BUILDER_INSTANTIATE(GatherOp)
 IR_BUILDER_INSTANTIATE(ViewAsScalar)
 IR_BUILDER_INSTANTIATE(ViewOp)
+IR_BUILDER_INSTANTIATE(FullOp)
 IR_BUILDER_INSTANTIATE(ARangeOp)
+IR_BUILDER_INSTANTIATE(EyeOp)
 IR_BUILDER_INSTANTIATE(UnaryOp)
 IR_BUILDER_INSTANTIATE(BinaryOp)
 IR_BUILDER_INSTANTIATE(TernaryOp)
diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp
index bdd1d3b86df7c..489be49ddfc7c 100644
--- a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp
+++ b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp
@@ -88,10 +88,18 @@ void IrCloner::handle(const TensorView* tv) {
   clone_ = IrBuilder::clone(tv, this);
 }
 
+void IrCloner::handle(const FullOp* op) {
+  clone_ = IrBuilder::clone(op, this);
+}
+
 void IrCloner::handle(const ARangeOp* op) {
   clone_ = IrBuilder::clone(op, this);
 }
 
+void IrCloner::handle(const EyeOp* op) {
+  clone_ = IrBuilder::clone(op, this);
+}
+
 void IrCloner::handle(const UnaryOp* op) {
   clone_ = IrBuilder::clone(op, this);
 }
diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.h b/torch/csrc/jit/codegen/cuda/ir_cloner.h
index 7cc118cdcff5a..06e1ec3359d95 100644
--- a/torch/csrc/jit/codegen/cuda/ir_cloner.h
+++ b/torch/csrc/jit/codegen/cuda/ir_cloner.h
@@ -68,7 +68,9 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch {
   void handle(const ComplexDouble*) override;
   void handle(const NamedScalar*) override;
 
+  void handle(const FullOp*) override;
   void handle(const ARangeOp*) override;
+  void handle(const EyeOp*) override;
   void handle(const UnaryOp*) override;
   void handle(const BinaryOp*) override;
   void handle(const TernaryOp*) override;
diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp
index fa5173dfcaa5a..6c04e4214b07d 100644
--- a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp
+++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp
@@ -407,15 +407,32 @@ void IrGraphGenerator::handle(const TensorView* tv) {
   tensor_views_.push_back(tv);
 }
 
-void IrGraphGenerator::handle(const ARangeOp* uop) {
+void IrGraphGenerator::handle(const FullOp* fop) {
   // node
-  printExpr(uop, "arange");
+  printExpr(fop, "full");
 
   // inputs & outputs
-  addArc(uop->start(), uop);
-  addArc(uop->end(), uop);
-  addArc(uop->step(), uop);
-  addArc(uop, uop->output(0));
+  addArc(fop->getFillValue(), fop);
+  addArc(fop, fop->output(0));
+}
+
+void IrGraphGenerator::handle(const ARangeOp* aop) {
+  // node
+  printExpr(aop, "arange");
+
+  // inputs & outputs
+  addArc(aop->start(), aop);
+  addArc(aop->end(), aop);
+  addArc(aop->step(), aop);
+  addArc(aop, aop->output(0));
+}
+
+void IrGraphGenerator::handle(const EyeOp* eop) {
+  // node
+  printExpr(eop, "eye");
+
+  // inputs & outputs
+  addArc(eop, eop->output(0));
 }
 
 void IrGraphGenerator::handle(const UnaryOp* uop) {
diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.h b/torch/csrc/jit/codegen/cuda/ir_graphviz.h
index c68c4fccb6f6c..1f555ed31ec06 100644
--- a/torch/csrc/jit/codegen/cuda/ir_graphviz.h
+++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.h
@@ -82,7 +82,9 @@ class TORCH_CUDA_CU_API IrGraphGenerator : private OptInConstDispatch {
   void handle(const ComplexDouble*) override;
   void handle(const NamedScalar*) override;
 
+  void handle(const FullOp*) override;
   void handle(const ARangeOp*) override;
+  void handle(const EyeOp*) override;
   void handle(const UnaryOp*) override;
   void handle(const BinaryOp*) override;
   void handle(const TernaryOp*) override;
diff --git a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h
index 126abba2ae103..dbefc4858d110 100644
--- a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h
+++ b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h
@@ -154,8 +154,6 @@ class TORCH_CUDA_CU_API ComplexDouble : public Val {
 //! the compute at position to maximum possible through traversal.
 enum class ComputeAtMode { Standard, BestEffort, MostInlined };
 
-class InlinePropagator;
-class MaxProducerPosUpdater;
 class TransformPropagator;
 struct MostInlinedTransformPropagator;
 class TransformIter;
@@ -163,6 +161,8 @@ class TransformReplay;
 class OptOutMutator;
 class TensorDomain;
 
+class MaxPosCalculator;
+
 namespace ir_utils {
 class TVDomainGuard;
 }
@@ -492,21 +492,30 @@ class TORCH_CUDA_CU_API TensorView : public Val {
   friend TORCH_CUDA_CU_API MostInlinedTransformPropagator;
   friend TORCH_CUDA_CU_API TransformReplay;
   friend TORCH_CUDA_CU_API OptOutMutator;
-  friend TORCH_CUDA_CU_API InlinePropagator;
-  friend TORCH_CUDA_CU_API MaxProducerPosUpdater;
+  friend class InlineBatchingGuard;
   friend class ir_utils::TVDomainGuard;
-  friend TORCH_CUDA_CU_API void groupReductions(
-      const std::vector<TensorView*>&);
+
+  // Inline the computation of this tensor into its consumer at the given
+  // position. If this tensor is already inlined in a higher position, then this
+  // call is a no-op. If the right most dimensions before `pos` are
+  // broadcasting, then will not inline into these broadcastings. If
+  // best_effort, then will inline into the highest allowed position that is <=
+  // `pos`.
+  void inlineAt(
+      int64_t pos,
+      bool best_effort = false,
+      MaxPosCalculator* calc = nullptr);
+
+  // Update the max producer position of the current tensor. This is required
+  // when we modify producer-consumer relationship of a scheduled tensor, for
+  // example, grouping multiple reductions.
+  void updateMaxProducerPosition();
 
  protected:
   void setDomain(TensorDomain* td) {
     domain_ = td;
   }
 
-  void setComputeAt(unsigned int this_pos, bool decrease = false);
-
-  void setMaxProducer(unsigned int this_pos, bool decrease = false);
-
  private:
   int normalizeAxisPos(int pos) const {
     if (pos < 0) {
diff --git a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h
index 8077c9bc920cf..d34b3a9f89c58 100644
--- a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h
+++ b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h
@@ -30,6 +30,29 @@ struct AnalyzeViewResult;
 //! vals are `Int` will dispatch to v1->as<Int>()->sameAs(v2.as<Int>())
 bool areEqualScalars(Val* v1, Val* v2);
 
+class TORCH_CUDA_CU_API FullOp : public Expr {
+ public:
+  FullOp(IrBuilderPasskey, Val* out, Val* fill_value, DataType dtype);
+
+  FullOp(const FullOp* src, IrCloner* ir_cloner);
+
+  Expr* shallowCopy() const override;
+
+  bool sameAs(const Statement* other) const override;
+
+  DataType dtype() const {
+    return dtype_;
+  }
+
+  Val* getFillValue() const {
+    return fill_value_;
+  }
+
+ private:
+  const DataType dtype_;
+  Val* fill_value_;
+};
+
 class TORCH_CUDA_CU_API ARangeOp : public Expr {
  public:
   ARangeOp(
@@ -38,12 +61,19 @@ class TORCH_CUDA_CU_API ARangeOp : public Expr {
       Val* start,
       Val* end,
       Val* step,
+      DataType dtype,
       Val* linear_index = nullptr);
 
   ARangeOp(const ARangeOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   bool sameAs(const Statement* other) const override;
 
+  DataType dtype() const {
+    return dtype_;
+  }
+
   Val* start() const {
     return start_;
   }
@@ -56,7 +86,7 @@ class TORCH_CUDA_CU_API ARangeOp : public Expr {
     return step_;
   }
 
-  Val* getLinearIndex() const {
+  Val* getLinearLogicalIndex() const {
     return linear_index_;
   }
 
@@ -65,12 +95,72 @@ class TORCH_CUDA_CU_API ARangeOp : public Expr {
   }
 
  private:
+  const DataType dtype_;
   Val* start_;
   Val* end_;
   Val* step_;
   Val* linear_index_ = nullptr;
 };
 
+// Tensor factory for generating identity matrices like
+//
+// [[1, 0, 0],
+//  [0, 1, 0],
+//  [0, 0, 1]]
+//
+// or
+//
+// [[1, 0, 0],
+//  [0, 1, 0],
+//  [0, 0, 1],
+//  [0, 0, 0]]
+//
+// or
+//
+// [[1, 0, 0, 0],
+//  [0, 1, 0, 0],
+//  [0, 0, 1, 0]]
+class TORCH_CUDA_CU_API EyeOp : public Expr {
+ public:
+  EyeOp(
+      IrBuilderPasskey,
+      Val* out,
+      DataType dtype,
+      Val* index1 = nullptr,
+      Val* index2 = nullptr);
+
+  EyeOp(const EyeOp* src, IrCloner* ir_cloner);
+
+  Expr* shallowCopy() const override;
+
+  bool sameAs(const Statement* other) const override;
+
+  DataType dtype() const {
+    return dtype_;
+  }
+
+  Val* getIndex1() const {
+    return index1_;
+  }
+
+  void setIndex1(Val* index) {
+    index1_ = index;
+  }
+
+  Val* getIndex2() const {
+    return index2_;
+  }
+
+  void setIndex2(Val* index) {
+    index2_ = index;
+  }
+
+ private:
+  const DataType dtype_;
+  Val* index1_ = nullptr;
+  Val* index2_ = nullptr;
+};
+
 //! A specialization for Unary operations. Unary operations take in a single
 //! input and produce a single output. Examples include:
 //!   1) Casting operation i.e. float(a_val)
@@ -88,6 +178,8 @@ class TORCH_CUDA_CU_API UnaryOp : public Expr {
 
   UnaryOp(const UnaryOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -117,6 +209,8 @@ class TORCH_CUDA_CU_API BinaryOp : public Expr {
 
   BinaryOp(const BinaryOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -148,15 +242,23 @@ class TORCH_CUDA_CU_API RNGOp : public Expr {
       IrBuilderPasskey,
       RNGOpType type,
       Val* out,
+      DataType dtype,
+      std::vector<Val*> parameters = {},
       int rng_offset = 0,
       Val* philox_index = nullptr);
 
   RNGOp(const RNGOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   RNGOpType getRNGOpType() const {
     return rng_op_type_;
   }
 
+  DataType dtype() const {
+    return dtype_;
+  }
+
   int getRNGOffset() const {
     return rng_offset_;
   }
@@ -165,6 +267,14 @@ class TORCH_CUDA_CU_API RNGOp : public Expr {
     rng_offset_ = val;
   }
 
+  const std::vector<Val*>& getParameters() const {
+    return parameters_;
+  }
+
+  const std::vector<Val*>& getShape() const {
+    return shape_;
+  }
+
   Val* getPhiloxIndex() const {
     return philox_index_;
   }
@@ -177,6 +287,9 @@ class TORCH_CUDA_CU_API RNGOp : public Expr {
 
  private:
   const RNGOpType rng_op_type_;
+  const DataType dtype_;
+  std::vector<Val*> parameters_;
+  std::vector<Val*> shape_;
   int rng_offset_ = -1;
   // The index used to feed philox's subsequence and component
   Val* philox_index_ = nullptr;
@@ -197,6 +310,8 @@ class TORCH_CUDA_CU_API BroadcastOp : public Expr {
 
   BroadcastOp(const BroadcastOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -245,6 +360,8 @@ class TORCH_CUDA_CU_API ReductionOp : public Expr {
 
   ReductionOp(const ReductionOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -293,6 +410,8 @@ class TORCH_CUDA_CU_API GroupedReductionOp : public Expr {
 
   GroupedReductionOp(const GroupedReductionOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   //! Number of expressions grouped horizontally. It does not reflect
   //! iteration grouping.
   size_t numExprs() const {
@@ -479,6 +598,8 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr {
 
   WelfordOp(const WelfordOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return output().avg();
   }
@@ -574,6 +695,8 @@ class TORCH_CUDA_CU_API GroupedWelfordOp : public Expr {
 
   GroupedWelfordOp(const GroupedWelfordOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   //! Number of expressions grouped horizontally. It does not reflect
   //! iteration grouping. As horizontal grouping is not supported,
   //! this always returns 1.
@@ -697,6 +820,8 @@ class TORCH_CUDA_CU_API MmaOp : public Expr {
 
   MmaOp(const MmaOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -755,6 +880,8 @@ class TORCH_CUDA_CU_API TransposeOp : public Expr {
 
   TransposeOp(const TransposeOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   TensorView* out() const {
     return out_;
   }
@@ -785,6 +912,8 @@ class TORCH_CUDA_CU_API ExpandOp : public Expr {
 
   ExpandOp(const ExpandOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   TensorView* out() const {
     return out_;
   }
@@ -815,6 +944,8 @@ class TORCH_CUDA_CU_API TernaryOp : public Expr {
 
   TernaryOp(const TernaryOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -858,6 +989,8 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr {
 
   ShiftOp(const ShiftOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -907,6 +1040,8 @@ class TORCH_CUDA_CU_API GatherOp : public Expr {
 
   GatherOp(const GatherOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -953,6 +1088,8 @@ class TORCH_CUDA_CU_API ViewAsScalar : public Expr {
 
   ViewAsScalar(const ViewAsScalar* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -986,6 +1123,8 @@ class TORCH_CUDA_CU_API ViewOp : public Expr {
 
   ViewOp(const ViewOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   TensorView* out() const {
     return out_;
   }
@@ -1011,6 +1150,8 @@ class TORCH_CUDA_CU_API LoadStoreOp : public Expr {
 
   LoadStoreOp(const LoadStoreOp* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -1275,16 +1416,8 @@ class TORCH_CUDA_CU_API IterDomain : public Val {
   }
 
   //! Check if IterDomain is a reduction axis with size of 1, i.e.
-  //! a "squeeze" operator.
-  //!
-  //! NOTE: Detection of trivial reduction here is not
-  //! comprehensive. See detectTrivialReductionDerivedDomains for more
-  //! comprehensive analysis. We typically use this for root domain trivial
-  //! reduction checks. So we ship to the correct scheduler. It may
-  //! not be incredibly robust, but it makes sense to keep it for now.
-  bool isTrivialReduction() const {
-    return isReduction() && extent()->isOneInt();
-  }
+  //! a "squeeze" operator, or solely derived from such axes.
+  bool isTrivialReduction() const;
 
   //! Split for stride by a given factor. It effectively does an inner
   //! split by the factor and sets the inner domain as a Stride
@@ -1590,6 +1723,8 @@ class TORCH_CUDA_CU_API Split : public Expr {
 
   Split(const Split* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   IterDomain* outer() const {
     return outer_;
   }
@@ -1650,6 +1785,8 @@ class TORCH_CUDA_CU_API Merge : public Expr {
 
   Merge(const Merge* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   IterDomain* out() const {
     return out_;
   }
@@ -1682,6 +1819,8 @@ class TORCH_CUDA_CU_API Swizzle2D : public Expr {
 
   Swizzle2D(const Swizzle2D* src, IrCloner* ir_cloner);
 
+  Expr* shallowCopy() const override;
+
   IterDomain* outX() const {
     return out_x_;
   }
diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp
index 5229647ac9d5c..e13273c8e75e9 100644
--- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp
+++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp
@@ -248,6 +248,35 @@ void IrPrinter::handle(const NamedScalar* ns) {
   os_ << ns->name();
 }
 
+void IrPrinter::handle(const FullOp* fop) {
+  if (!print_inline_) {
+    indent();
+    os_ << fop->output(0) << "\n";
+    indent_size_++;
+    indent();
+    os_ << " = ";
+  } else {
+    checkInlineable(fop);
+  }
+
+  os_ << "full({";
+  for (auto i : c10::irange(fop->inputs().size())) {
+    if (i == fop->inputs().size() - 1) {
+      os_ << "}";
+    }
+    if (i > 0) {
+      os_ << ", ";
+    }
+    handle(fop->input(i));
+  }
+  os_ << ", " << fop->dtype() << ")";
+
+  indent_size_--;
+
+  if (!print_inline_)
+    os_ << ";\n";
+}
+
 void IrPrinter::handle(const ARangeOp* aop) {
   if (!print_inline_) {
     indent() << aop->output(0);
@@ -265,7 +294,28 @@ void IrPrinter::handle(const ARangeOp* aop) {
   handle(aop->end());
   os_ << ", ";
   handle(aop->step());
-  os_ << ")";
+  os_ << ", " << aop->dtype() << ")";
+
+  indent_size_--;
+
+  if (!print_inline_)
+    os_ << ";\n";
+}
+
+void IrPrinter::handle(const EyeOp* eop) {
+  if (!print_inline_) {
+    indent();
+    os_ << eop->output(0) << "\n";
+    indent_size_++;
+    indent();
+    os_ << " = ";
+  } else {
+    checkInlineable(eop);
+  }
+
+  os_ << "eye(";
+  handle(eop->input(0));
+  os_ << ", " << eop->dtype() << ")";
 
   indent_size_--;
 
@@ -429,21 +479,27 @@ void IrPrinter::handle(const RNGOp* rop) {
     checkInlineable(rop);
   }
 
-  os_ << rop->getRNGOpType() << "(";
+  os_ << rop->getRNGOpType() << "({";
   bool first = true;
-  for (auto i : rop->inputs()) {
+  for (auto i : rop->getShape()) {
     if (!first) {
       os_ << ", ";
     }
     handle(i);
     first = false;
   }
-  os_ << ")";
+  os_ << "}";
+  for (auto i : rop->getParameters()) {
+    os_ << ", ";
+    handle(i);
+  }
+  os_ << ", " << rop->dtype() << ")";
 
   indent_size_--;
 
-  if (!print_inline_)
+  if (!print_inline_) {
     os_ << ";\n";
+  }
 }
 
 void IrPrinter::handle(const ReductionOp* rop) {
diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.h b/torch/csrc/jit/codegen/cuda/ir_iostream.h
index fd77d91010a48..599e50286d294 100644
--- a/torch/csrc/jit/codegen/cuda/ir_iostream.h
+++ b/torch/csrc/jit/codegen/cuda/ir_iostream.h
@@ -82,7 +82,9 @@ class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch {
   void handle(const ComplexDouble*) final;
   void handle(const NamedScalar*) final;
 
+  void handle(const FullOp*) final;
   void handle(const ARangeOp*) final;
+  void handle(const EyeOp*) final;
   void handle(const UnaryOp*) final;
   void handle(const BinaryOp*) final;
   void handle(const TernaryOp*) final;
diff --git a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp
index fb8e28c53de0d..3b51b807a727d 100644
--- a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp
+++ b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp
@@ -182,14 +182,54 @@ bool ComplexDouble::sameAs(const Statement* other) const {
   return false;
 }
 
+FullOp::FullOp(
+    IrBuilderPasskey passkey,
+    Val* out,
+    Val* fill_value,
+    DataType dtype)
+    : Expr(passkey, ExprType::FullOp), dtype_(dtype), fill_value_(fill_value) {
+  if (out->isA<TensorView>()) {
+    addInput(out->as<TensorView>()->getRootDomain()[0]->extent());
+  }
+  addInput(fill_value);
+  addOutput(out);
+}
+
+FullOp::FullOp(const FullOp* src, IrCloner* ir_cloner)
+    : Expr(src, ir_cloner),
+      dtype_(src->dtype()),
+      fill_value_(ir_cloner->clone(src->fill_value_)) {}
+
+Expr* FullOp::shallowCopy() const {
+  auto result = IrBuilder::create<FullOp>(output(0), fill_value_, dtype_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
+bool FullOp::sameAs(const Statement* other) const {
+  if (this == other) {
+    return true;
+  }
+  if (!other->isA<FullOp>()) {
+    return false;
+  }
+  const auto other_op = other->as<FullOp>();
+  if (dtype_ != other_op->dtype_) {
+    return false;
+  }
+  return Expr::sameAs(other);
+}
+
 ARangeOp::ARangeOp(
     IrBuilderPasskey passkey,
     Val* out,
     Val* start,
     Val* end,
     Val* step,
+    DataType dtype,
     Val* linear_index)
     : Expr(passkey, ExprType::ARangeOp),
+      dtype_(dtype),
       start_(start),
       end_(end),
       step_(step),
@@ -202,11 +242,19 @@ ARangeOp::ARangeOp(
 
 ARangeOp::ARangeOp(const ARangeOp* src, IrCloner* ir_cloner)
     : Expr(src, ir_cloner),
+      dtype_(src->dtype()),
       start_(ir_cloner->clone(src->start_)),
       end_(ir_cloner->clone(src->end_)),
       step_(ir_cloner->clone(src->step_)),
       linear_index_(ir_cloner->clone(src->linear_index_)) {}
 
+Expr* ARangeOp::shallowCopy() const {
+  auto result = IrBuilder::create<ARangeOp>(
+      output(0), start_, end_, step_, dtype_, linear_index_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool ARangeOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -215,6 +263,9 @@ bool ARangeOp::sameAs(const Statement* other) const {
     return false;
   }
   const auto other_op = other->as<ARangeOp>();
+  if (dtype_ != other_op->dtype_) {
+    return false;
+  }
   if (!start_->sameAs(other_op->start_)) {
     return false;
   }
@@ -234,6 +285,64 @@ bool ARangeOp::sameAs(const Statement* other) const {
   return Expr::sameAs(other);
 }
 
+EyeOp::EyeOp(
+    IrBuilderPasskey passkey,
+    Val* out,
+    DataType dtype,
+    Val* index1,
+    Val* index2)
+    : Expr(passkey, ExprType::EyeOp),
+      dtype_(dtype),
+      index1_(index1),
+      index2_(index2) {
+  if (out->isA<TensorView>()) {
+    addInput(out->as<TensorView>()->getRootDomain()[0]->extent());
+    if (out->as<TensorView>()->getRootDomain()[1] !=
+        out->as<TensorView>()->getRootDomain()[0]) {
+      addInput(out->as<TensorView>()->getRootDomain()[1]->extent());
+    }
+  }
+  addOutput(out);
+}
+
+EyeOp::EyeOp(const EyeOp* src, IrCloner* ir_cloner)
+    : Expr(src, ir_cloner),
+      dtype_(src->dtype_),
+      index1_(ir_cloner->clone(src->index1_)),
+      index2_(ir_cloner->clone(src->index2_)) {}
+
+Expr* EyeOp::shallowCopy() const {
+  auto result = IrBuilder::create<EyeOp>(output(0), dtype_, index1_, index2_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
+bool EyeOp::sameAs(const Statement* other) const {
+  if (this == other) {
+    return true;
+  }
+  if (!other->isA<EyeOp>()) {
+    return false;
+  }
+  const auto other_op = other->as<EyeOp>();
+  if (dtype_ != other_op->dtype_) {
+    return false;
+  }
+  if ((index1_ == nullptr) != (other_op->index1_ == nullptr)) {
+    return false;
+  }
+  if ((index2_ == nullptr) != (other_op->index2_ == nullptr)) {
+    return false;
+  }
+  if ((index1_ != nullptr) && !index1_->sameAs(other_op->index1_)) {
+    return false;
+  }
+  if ((index2_ != nullptr) && !index2_->sameAs(other_op->index2_)) {
+    return false;
+  }
+  return Expr::sameAs(other);
+}
+
 UnaryOp::UnaryOp(
     IrBuilderPasskey passkey,
     UnaryOpType type,
@@ -254,6 +363,12 @@ UnaryOp::UnaryOp(const UnaryOp* src, IrCloner* ir_cloner)
       out_(ir_cloner->clone(src->out_)),
       in_(ir_cloner->clone(src->in_)) {}
 
+Expr* UnaryOp::shallowCopy() const {
+  auto result = IrBuilder::create<UnaryOp>(unary_op_type_, out_, in_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool UnaryOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -262,8 +377,9 @@ bool UnaryOp::sameAs(const Statement* other) const {
     return false;
   }
   const auto other_op = other->as<UnaryOp>();
-  if (getUnaryOpType() != other_op->getUnaryOpType())
+  if (getUnaryOpType() != other_op->getUnaryOpType()) {
     return false;
+  }
   return Expr::sameAs(other);
 }
 
@@ -290,6 +406,12 @@ BinaryOp::BinaryOp(const BinaryOp* src, IrCloner* ir_cloner)
       lhs_(ir_cloner->clone(src->lhs_)),
       rhs_(ir_cloner->clone(src->rhs_)) {}
 
+Expr* BinaryOp::shallowCopy() const {
+  auto result = IrBuilder::create<BinaryOp>(binary_op_type_, out_, lhs_, rhs_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool BinaryOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -298,8 +420,9 @@ bool BinaryOp::sameAs(const Statement* other) const {
     return false;
   }
   const auto other_op = other->as<BinaryOp>();
-  if (getBinaryOpType() != other_op->getBinaryOpType())
+  if (getBinaryOpType() != other_op->getBinaryOpType()) {
     return false;
+  }
   return Expr::sameAs(other);
 }
 
@@ -330,6 +453,13 @@ TernaryOp::TernaryOp(const TernaryOp* src, IrCloner* ir_cloner)
       in2_(ir_cloner->clone(src->in2_)),
       in3_(ir_cloner->clone(src->in3_)) {}
 
+Expr* TernaryOp::shallowCopy() const {
+  auto result =
+      IrBuilder::create<TernaryOp>(ternary_op_type_, out_, in1_, in2_, in3_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool TernaryOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -338,8 +468,9 @@ bool TernaryOp::sameAs(const Statement* other) const {
     return false;
   }
   const auto other_op = other->as<TernaryOp>();
-  if (getTernaryOpType() != other_op->getTernaryOpType())
+  if (getTernaryOpType() != other_op->getTernaryOpType()) {
     return false;
+  }
   return Expr::sameAs(other);
 }
 
@@ -347,26 +478,45 @@ RNGOp::RNGOp(
     IrBuilderPasskey passkey,
     RNGOpType type,
     Val* out,
+    DataType dtype,
+    std::vector<Val*> parameters,
     int rng_offset,
     Val* philox_index)
     : Expr(passkey, ExprType::RNGOp),
       rng_op_type_(type),
+      dtype_(dtype),
+      parameters_(std::move(parameters)),
       rng_offset_(rng_offset),
       philox_index_(philox_index) {
   if (out->isA<TensorView>()) {
     for (auto id : out->as<TensorView>()->getRootDomain()) {
-      addInput(id->extent());
+      shape_.emplace_back(id->extent());
     }
   }
+  for (auto v : shape_) {
+    addInput(v);
+  }
+  for (auto v : parameters_) {
+    addInput(v);
+  }
   addOutput(out);
 }
 
 RNGOp::RNGOp(const RNGOp* src, IrCloner* ir_cloner)
     : Expr(src, ir_cloner),
       rng_op_type_(src->rng_op_type_),
+      dtype_(src->dtype()),
+      parameters_(ir_cloner->clone(src->parameters_)),
       rng_offset_(src->rng_offset_),
       philox_index_(ir_cloner->clone(src->philox_index_)) {}
 
+Expr* RNGOp::shallowCopy() const {
+  auto result = IrBuilder::create<RNGOp>(
+      rng_op_type_, output(0), dtype_, parameters_, rng_offset_, philox_index_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool RNGOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -378,6 +528,17 @@ bool RNGOp::sameAs(const Statement* other) const {
   if (getRNGOpType() != other_op->getRNGOpType()) {
     return false;
   }
+  if (dtype_ != other_op->dtype_) {
+    return false;
+  }
+  if (parameters_.size() != other_op->parameters_.size()) {
+    return false;
+  }
+  for (auto i : c10::irange(parameters_.size())) {
+    if (!parameters_[i]->sameAs(other_op->parameters_[i])) {
+      return false;
+    }
+  }
   if (getRNGOffset() != other_op->getRNGOffset()) {
     return false;
   }
@@ -467,6 +628,12 @@ BroadcastOp::BroadcastOp(const BroadcastOp* src, IrCloner* ir_cloner)
       in_(ir_cloner->clone(src->in_)),
       is_broadcast_dims_(src->is_broadcast_dims_) {}
 
+Expr* BroadcastOp::shallowCopy() const {
+  auto result = IrBuilder::create<BroadcastOp>(out_, in_, is_broadcast_dims_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool BroadcastOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -521,6 +688,36 @@ ReductionOp::ReductionOp(
   addInput(in);
 }
 
+ReductionOp::ReductionOp(const ReductionOp* src, IrCloner* ir_cloner)
+    : Expr(src, ir_cloner),
+      reduction_op_type_(src->reduction_op_type_),
+      init_(ir_cloner->clone(src->init_)),
+      out_(ir_cloner->clone(src->out_)),
+      in_(ir_cloner->clone(src->in_)),
+      is_allreduce_(src->is_allreduce_) {}
+
+Expr* ReductionOp::shallowCopy() const {
+  auto result = IrBuilder::create<ReductionOp>(
+      reduction_op_type_, init_, out_, in_, is_allreduce_, etype());
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
+bool ReductionOp::sameAs(const Statement* other) const {
+  if (this == other) {
+    return true;
+  }
+  if (!other->isA<ReductionOp>()) {
+    return false;
+  }
+  const auto other_op = other->as<ReductionOp>();
+  // Note that init is not part of input vals, so it must be checked separately.
+  return (
+      Expr::sameAs(other) &&
+      getReductionOpType() == other_op->getReductionOpType() &&
+      init()->sameAs(other_op->init()));
+}
+
 GroupedReductionOp::GroupedReductionOp(
     IrBuilderPasskey passkey,
     std::vector<BinaryOpType> reduction_op_types,
@@ -550,6 +747,18 @@ GroupedReductionOp::GroupedReductionOp(
       init_vals_(ir_cloner->clone(src->init_vals_)),
       is_allreduce_(src->is_allreduce_) {}
 
+Expr* GroupedReductionOp::shallowCopy() const {
+  auto result = IrBuilder::create<GroupedReductionOp>(
+      reduction_op_types_,
+      init_vals_,
+      outputs(),
+      inputs(),
+      is_allreduce_,
+      etype());
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 int GroupedReductionOp::getExprIndexOfOutput(Val* output_val) const {
   auto it = std::find(outputs().begin(), outputs().end(), output_val);
   if (it != outputs().end()) {
@@ -724,6 +933,13 @@ WelfordOp::WelfordOp(const WelfordOp* src, IrCloner* ir_cloner)
       init_(src->init_.clone(ir_cloner)),
       is_allreduce_(src->is_allreduce_) {}
 
+Expr* WelfordOp::shallowCopy() const {
+  auto result =
+      IrBuilder::create<WelfordOp>(output_, input_, init_, is_allreduce_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 Val* WelfordOp::getInitValOfOutput(Val* output_val) const {
   auto val_name = output().getNameOf(output_val);
 
@@ -873,6 +1089,13 @@ GroupedWelfordOp::GroupedWelfordOp(
       init_vals_(WelfordTriplet::clone(src->init_vals_, ir_cloner)),
       is_allreduce_(src->is_allreduce_) {}
 
+Expr* GroupedWelfordOp::shallowCopy() const {
+  auto result = IrBuilder::create<GroupedWelfordOp>(
+      output_vals_, input_vals_, init_vals_, is_allreduce_, etype());
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool GroupedWelfordOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -967,6 +1190,13 @@ MmaOp::MmaOp(const MmaOp* src, IrCloner* ir_cloner)
       init_(ir_cloner->clone(src->init_)),
       options_(src->options_) {}
 
+Expr* MmaOp::shallowCopy() const {
+  auto result = IrBuilder::create<MmaOp>(out_, in_a_, in_b_, init_);
+  result->options_ = options_;
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool MmaOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -979,29 +1209,6 @@ bool MmaOp::sameAs(const Statement* other) const {
   return false;
 }
 
-ReductionOp::ReductionOp(const ReductionOp* src, IrCloner* ir_cloner)
-    : Expr(src, ir_cloner),
-      reduction_op_type_(src->reduction_op_type_),
-      init_(ir_cloner->clone(src->init_)),
-      out_(ir_cloner->clone(src->out_)),
-      in_(ir_cloner->clone(src->in_)),
-      is_allreduce_(src->is_allreduce_) {}
-
-bool ReductionOp::sameAs(const Statement* other) const {
-  if (this == other) {
-    return true;
-  }
-  if (!other->isA<ReductionOp>()) {
-    return false;
-  }
-  const auto other_op = other->as<ReductionOp>();
-  // Note that init is not part of input vals, so it must be checked separately.
-  return (
-      Expr::sameAs(other) &&
-      getReductionOpType() == other_op->getReductionOpType() &&
-      init()->sameAs(other_op->init()));
-}
-
 TransposeOp::TransposeOp(
     IrBuilderPasskey passkey,
     TensorView* out,
@@ -1046,6 +1253,12 @@ TransposeOp::TransposeOp(const TransposeOp* src, IrCloner* ir_cloner)
       in_(ir_cloner->clone(src->in_)),
       new2old_(src->new2old_) {}
 
+Expr* TransposeOp::shallowCopy() const {
+  auto result = IrBuilder::create<TransposeOp>(out_, in_, new2old_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 std::vector<int64_t> TransposeOp::old2new() const {
   std::vector<int64_t> old2new(new2old_.size());
   for (auto new_axis : c10::irange(new2old_.size())) {
@@ -1085,6 +1298,12 @@ ExpandOp::ExpandOp(const ExpandOp* src, IrCloner* ir_cloner)
   }
 }
 
+Expr* ExpandOp::shallowCopy() const {
+  auto result = IrBuilder::create<ExpandOp>(out_, in_, expanded_extents_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 ShiftOp::ShiftOp(
     IrBuilderPasskey passkey,
     Val* out,
@@ -1132,6 +1351,12 @@ ShiftOp::ShiftOp(const ShiftOp* src, IrCloner* ir_cloner)
       offsets_(src->offsets_),
       pad_width_(src->pad_width_) {}
 
+Expr* ShiftOp::shallowCopy() const {
+  auto result = IrBuilder::create<ShiftOp>(out_, in_, offsets_, pad_width_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool ShiftOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -1194,6 +1419,13 @@ GatherOp::GatherOp(const GatherOp* src, IrCloner* ir_cloner)
       window_shape_(src->window_shape_),
       pad_width_(src->pad_width_) {}
 
+Expr* GatherOp::shallowCopy() const {
+  auto result =
+      IrBuilder::create<GatherOp>(out_, in_, window_shape_, pad_width_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool GatherOp::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -1240,6 +1472,12 @@ ViewAsScalar::ViewAsScalar(const ViewAsScalar* src, IrCloner* ir_cloner)
       vector_id_(ir_cloner->clone(src->vector_id_)),
       index_(ir_cloner->clone(src->index_)) {}
 
+Expr* ViewAsScalar::shallowCopy() const {
+  auto result = IrBuilder::create<ViewAsScalar>(out_, in_, vector_id_, index_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 ViewOp::ViewOp(IrBuilderPasskey passkey, TensorView* out, TensorView* in)
     : Expr(passkey, ExprType::ViewOp), out_(out), in_(in) {
   addOutput(out);
@@ -1251,6 +1489,12 @@ ViewOp::ViewOp(const ViewOp* src, IrCloner* ir_cloner)
       out_(ir_cloner->clone(src->out_)),
       in_(ir_cloner->clone(src->in_)) {}
 
+Expr* ViewOp::shallowCopy() const {
+  auto result = IrBuilder::create<ViewOp>(out_, in_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 LoadStoreOp::LoadStoreOp(
     IrBuilderPasskey passkey,
     LoadStoreOpType op_type,
@@ -1270,6 +1514,12 @@ LoadStoreOp::LoadStoreOp(const LoadStoreOp* src, IrCloner* ir_cloner)
       out_(ir_cloner->clone(src->out_)),
       in_(ir_cloner->clone(src->in_)) {}
 
+Expr* LoadStoreOp::shallowCopy() const {
+  auto result = IrBuilder::create<LoadStoreOp>(load_store_type_, out_, in_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 IterDomainBuilder::IterDomainBuilder(Val* _start, Val* _extent)
     : start_(_start), extent_(_extent) {
   TORCH_INTERNAL_ASSERT(
@@ -1470,6 +1720,37 @@ IterDomain* IterDomain::cloneWithoutRFactor() const {
   return cloned;
 }
 
+bool IterDomain::isTrivialReduction() const {
+  if (!isReduction()) {
+    return false;
+  }
+
+  if (extent()->isOneInt()) {
+    return true;
+  }
+
+  // If this domain is an output of an expression, i.e., not a root
+  // domain, check if all root domains are trivial reductions. This is
+  // almost the same as the analysis done in TrivialReductionInfo, but
+  // is limited within a single tensor, whereas TrivialReductionInfo
+  // does more expensive analysis potentially traversing through
+  // rfactor domains
+  if (definition()) {
+    // Note: There's no const version of IterVisitor.
+    auto id_inputs = InputsOf::output(fusion(), const_cast<IterDomain*>(this));
+    if (std::all_of(
+            ir_utils::filterByType<IterDomain>(id_inputs).begin(),
+            ir_utils::filterByType<IterDomain>(id_inputs).end(),
+            [](IterDomain* root_id) {
+              return root_id->isReduction() && root_id->extent()->isOneInt();
+            })) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 std::vector<IterDomain*> IterDomain::clone(
     const std::vector<IterDomain*>& domains) {
   std::vector<IterDomain*> cloned_domains;
@@ -1494,7 +1775,11 @@ IterDomain* IterDomain::merge(IterDomain* outer, IterDomain* inner) {
       outer->isReduction() == inner->isReduction() ||
           (!outer->isReduction() && inner->isTrivialReduction()) ||
           (outer->isTrivialReduction() && !inner->isReduction()),
-      "Merging IterDomains requires that their iteration types match.");
+      "Merging IterDomains requires that their iteration types match. ",
+      "Outer: ",
+      outer->toString(),
+      ", Inner: ",
+      inner->toString());
   TORCH_CHECK(
       (outer->isGather() && inner->isGather()) ||
           (!outer->isGather() && !inner->isGather()),
@@ -2380,6 +2665,13 @@ Split::Split(const Split* src, IrCloner* ir_cloner)
       start_offset_(ir_cloner->clone(src->start_offset_)),
       stop_offset_(ir_cloner->clone(src->stop_offset_)) {}
 
+Expr* Split::shallowCopy() const {
+  auto result = IrBuilder::create<Split>(
+      outer_, inner_, in_, factor_, inner_split_, start_offset_, stop_offset_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 Val* Split::extent(Val* in_extent, Val* start_offset, Val* stop_offset) {
   TORCH_INTERNAL_ASSERT(in_extent != nullptr);
 
@@ -2425,6 +2717,12 @@ Merge::Merge(const Merge* src, IrCloner* ir_cloner)
       outer_(ir_cloner->clone(src->outer_)),
       inner_(ir_cloner->clone(src->inner_)) {}
 
+Expr* Merge::shallowCopy() const {
+  auto result = IrBuilder::create<Merge>(out_, outer_, inner_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool Merge::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
@@ -2456,6 +2754,13 @@ Swizzle2D::Swizzle2D(
   addInput(in_y);
 }
 
+Expr* Swizzle2D::shallowCopy() const {
+  auto result = IrBuilder::create<Swizzle2D>(
+      out_x_, out_y_, in_x_, in_y_, swizzle_type_, swizzle_mode_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool Swizzle2D::sameAs(const Statement* other) const {
   if (this == other) {
     return true;
diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.cpp b/torch/csrc/jit/codegen/cuda/ir_utils.cpp
index 4976518c737b7..dba5ee10adabb 100644
--- a/torch/csrc/jit/codegen/cuda/ir_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/ir_utils.cpp
@@ -180,6 +180,16 @@ struct SubstituteInExpr : public OptInDispatch {
     OptInDispatch::handle(expr);
   }
 
+  void handle(FullOp* full_expr) final {
+    auto out = reference_->sameAs(full_expr->output(0)) ? substitute_
+                                                        : full_expr->output(0);
+    expr_ = IrBuilder::create<FullOp>(
+        full_expr->container(),
+        out,
+        full_expr->getFillValue(),
+        full_expr->dtype());
+  }
+
   void handle(ARangeOp* arange_expr) final {
     auto start = reference_->sameAs(arange_expr->start())
         ? substitute_
@@ -197,7 +207,19 @@ struct SubstituteInExpr : public OptInDispatch {
         start,
         end,
         step,
-        arange_expr->getLinearIndex());
+        arange_expr->dtype(),
+        arange_expr->getLinearLogicalIndex());
+  }
+
+  void handle(EyeOp* eye_expr) final {
+    auto out = reference_->sameAs(eye_expr->output(0)) ? substitute_
+                                                       : eye_expr->output(0);
+    expr_ = IrBuilder::create<EyeOp>(
+        eye_expr->container(),
+        out,
+        eye_expr->dtype(),
+        eye_expr->getIndex1(),
+        eye_expr->getIndex2());
   }
 
   void handle(UnaryOp* unary_expr) final {
@@ -244,12 +266,18 @@ struct SubstituteInExpr : public OptInDispatch {
   }
 
   void handle(RNGOp* rng_expr) final {
+    std::vector<Val*> subsituted_params;
+    for (auto v : rng_expr->getParameters()) {
+      subsituted_params.emplace_back(reference_->sameAs(v) ? substitute_ : v);
+    }
     auto out = reference_->sameAs(rng_expr->output(0)) ? substitute_
                                                        : rng_expr->output(0);
     expr_ = IrBuilder::create<RNGOp>(
         rng_expr->container(),
         rng_expr->getRNGOpType(),
         out,
+        rng_expr->dtype(),
+        subsituted_params,
         rng_expr->getRNGOffset(),
         rng_expr->getPhiloxIndex());
   }
@@ -748,7 +776,7 @@ class ValReplacementMutator : private OptOutMutator {
     // grab all leaves towards outputs and grab stmts from there.
     auto stmts = StmtSort::getStmts(fusion, allLeafOuts(fusion), true);
 
-    // Some fusions, such as standalone randlike, can have disconnected DAG, so
+    // Some fusions, such as standalone rand_like, can have disconnected DAG, so
     // we need some mechanism to make sure our replacement set is as complete as
     // possible
     // TODO: I think we need a more general mechanism to support disconnected
@@ -851,6 +879,30 @@ bool isReductionTvOp(const Expr* expr) {
   return ir_utils::isTvOp(expr) && isReductionOp(expr);
 }
 
+TORCH_CUDA_CU_API std::vector<ViewOp*> getViewOps(Fusion* fusion) {
+  auto all_exprs = fusion->exprs();
+
+  auto all_view_ops = ir_utils::filterByType<ViewOp>(all_exprs);
+
+  std::vector<ViewOp*> view_ops;
+
+  std::copy_if(
+      all_view_ops.begin(),
+      all_view_ops.end(),
+      std::back_inserter(view_ops),
+      [](ViewOp* view) {
+        return std::any_of(
+            view->outputs().begin(), view->outputs().end(), [](Val* v) {
+              if (!v->isA<TensorView>()) {
+                return false;
+              }
+              return v->as<TensorView>()->hasRFactor();
+            });
+      });
+
+  return view_ops;
+}
+
 namespace {
 
 struct ReplaceValInIndexVal : public OptInDispatch {
diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.h b/torch/csrc/jit/codegen/cuda/ir_utils.h
index ce38ebd27fa40..adfc64fc74adf 100644
--- a/torch/csrc/jit/codegen/cuda/ir_utils.h
+++ b/torch/csrc/jit/codegen/cuda/ir_utils.h
@@ -317,10 +317,15 @@ TORCH_CUDA_CU_API bool isReductionOp(const Expr*);
 // Returns if Expr is a reduction op with TensorView or TensorIndex
 TORCH_CUDA_CU_API bool isReductionTvOp(const Expr*);
 
+// Returns all non-trivial view operations. We shouldn't have trivial view
+// operations but this function is to simply make sure if we ever do we don't
+// pull them in.
+TORCH_CUDA_CU_API std::vector<ViewOp*> getViewOps(Fusion*);
+
 template <typename T>
 std::string toString(const T& nodes) {
   std::stringstream ss;
-  for (Statement* stmt : nodes) {
+  for (const Statement* stmt : nodes) {
     if (ss.tellp() != 0) {
       ss << ", ";
     }
diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp
index 08ba663c9fa63..984a22194a20a 100644
--- a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp
+++ b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp
@@ -32,81 +32,44 @@ void remove_visited(
   }
 }
 
-// Return all dependencies of a node including members of the node.
-class RecursiveDependencies : public OptInDispatch {
+class MemberStatements : public OptOutDispatch {
  public:
+  // Return all members of the stmt if it's a Val. For expressions it returns
+  // nothing.
   static std::vector<Statement*> next(Statement* stmt) {
-    RecursiveDependencies find_next(stmt);
+    MemberStatements find_next(stmt);
     return find_next.next_stmts_;
   }
 
  private:
-  RecursiveDependencies() = default;
+  MemberStatements() = default;
 
-  RecursiveDependencies(Statement* stmt) {
+  MemberStatements(Statement* stmt) {
     handle(stmt);
   }
 
-  using OptInDispatch::handle;
-
-  void handle(Expr* expr) final {
-    FusionGuard::getCurFusion()->assertInContainer(
-        expr,
-        "IterVisitor.cpp::RecursiveDependencies::handle(Expr*) Cannot traverse expr, ");
-    next_stmts_.insert(
-        next_stmts_.end(), expr->inputs().begin(), expr->inputs().end());
-  }
+  using OptOutDispatch::handle;
 
   void handle(Val* val) final {
     FusionGuard::getCurFusion()->assertInContainer(
         val,
-        "IterVisitor.cpp::RecursiveDependencies::handle(Val*) Cannot traverse val, ");
-    OptInDispatch::handle(val);
-  }
-
-  void simpleVal(Val* val) {
-    if (val->definition() == nullptr) {
-      return;
-    }
-    next_stmts_.push_back(val->definition());
-  }
-
-  void handle(Bool* stmt) final {
-    simpleVal(stmt);
-  }
-
-  void handle(Double* stmt) final {
-    simpleVal(stmt);
-  }
-
-  void handle(Int* stmt) final {
-    simpleVal(stmt);
-  }
-
-  void handle(ComplexDouble* stmt) final {
-    simpleVal(stmt);
-  }
-
-  void handle(NamedScalar* stmt) final {
-    simpleVal(stmt);
+        "IterVisitor.cpp::MemberStatements::handle(Val*) Cannot traverse val, ");
+    OptOutDispatch::handle(val);
   }
 
   void handle(IterDomain* stmt) final {
     next_stmts_.push_back(stmt->start());
     next_stmts_.push_back(stmt->extent());
     next_stmts_.push_back(stmt->stopOffset());
-    simpleVal(stmt);
   }
 
   void handle(TensorDomain* stmt) final {
     next_stmts_.insert(
         next_stmts_.end(), stmt->domain().begin(), stmt->domain().end());
-    simpleVal(stmt);
   }
 
   void handle(TensorView* tv) final {
     next_stmts_.push_back(tv->domain());
-    simpleVal(tv);
   }
 
   std::vector<Statement*> next_stmts_;
@@ -169,17 +132,18 @@ void IterVisitor::handle(Val* v) {
 // To prevent traversing all paths through a DAG (unless we want to) we have a
 // function to remove visited nodes from being re-added to the stack
 // (remove_visited).
-void IterVisitor::traverseFrom(
+void IterVisitor::traverseBetween(
     Fusion* fusion,
-    const std::vector<Val*>& from,
-    bool traverseAllPaths,
-    bool traverseIntoMembers) {
+    const std::unordered_set<Val*>& from,
+    const std::vector<Val*>& to,
+    bool traverse_all_paths,
+    bool traverse_into_members) {
   FusionGuard fg(fusion);
 
   std::unordered_set<Statement*> visited;
 
   stmt_stack.clear();
-  stmt_stack.emplace_back(from.rbegin(), from.rend());
+  stmt_stack.emplace_back(to.rbegin(), to.rend());
 
   bool all_inputs_visited = false;
 
@@ -201,7 +165,7 @@ void IterVisitor::traverseFrom(
     // If we just poped a stmt_stack level, we can finally visit it!
     if (all_inputs_visited) {
       // stmt may have be already visited.
-      if (traverseAllPaths || visited.find(stmt) == visited.end()) {
+      if (traverse_all_paths || visited.find(stmt) == visited.end()) {
         // Mark visited
         visited.insert(stmt);
 
@@ -217,10 +181,20 @@ void IterVisitor::traverseFrom(
     } else {
       // We're not ready to process this node, so add all its inputs to be
       // checked Visit input nodes.
-      auto next_stmts =
-          traverseIntoMembers ? RecursiveDependencies::next(stmt) : next(stmt);
+      std::vector<Statement*> next_stmts;
+
+      if ((stmt->isVal() && from.find(stmt->asVal()) == from.end()) ||
+          stmt->isExpr()) {
+        next_stmts = next(stmt);
+      }
+
+      if (traverse_into_members) {
+        auto members = MemberStatements::next(stmt);
+        next_stmts.insert(next_stmts.end(), members.begin(), members.end());
+      }
+
       // We may want to retraverse nodes, in that case revisit everything!
-      if (!traverseAllPaths) {
+      if (!traverse_all_paths) {
         // If we don't want to retraverse, remove nodes we already visisted.
         remove_visited(next_stmts, visited);
       }
@@ -238,12 +212,20 @@ void IterVisitor::traverseFrom(
   }
 }
 
+void IterVisitor::traverseTo(
+    Fusion* fusion,
+    const std::vector<Val*>& to,
+    bool traverse_all_paths,
+    bool traverse_into_members) {
+  traverseBetween(fusion, {}, to, traverse_all_paths, traverse_into_members);
+}
+
 void IterVisitor::traverseHelper(Fusion* fusion, bool traverse_all_paths) {
   FusionGuard fg(fusion);
 
   auto term_val_outs = fusion->getTerminatingOutputs();
   if (!term_val_outs.empty()) {
-    traverseFrom(fusion, term_val_outs, traverse_all_paths);
+    traverseTo(fusion, term_val_outs, traverse_all_paths);
   }
 }
 
@@ -257,8 +239,7 @@ void IterVisitor::traverseAllPaths(Fusion* fusion) {
 
 namespace {
 
-// Expr sort will take a fusion and return a topologically sorted list of
-// expressions.
+// TODO: Also have InputsOf should pick one and remove the other.
 class Inputs : public IterVisitor {
  private:
   //! Optional list of input vals. While traversing to inputs if a value in the
@@ -299,7 +280,7 @@ class Inputs : public IterVisitor {
       return {};
     }
     Inputs inps(all_inputs);
-    inps.traverseFrom(of[0]->fusion(), of);
+    inps.traverseTo(of[0]->fusion(), of);
     return inps.inputs_;
   }
 };
@@ -328,7 +309,7 @@ class AllVals : public IterVisitor {
       Fusion* fusion,
       const std::vector<Val*>& from) {
     AllVals av;
-    av.traverseFrom(fusion, from, false);
+    av.traverseTo(fusion, from, false);
     return av.vals;
   }
 };
@@ -386,7 +367,7 @@ void BackwardVisitor::handle(Val* val) {
   OptOutDispatch::handle(val);
 }
 
-void BackwardVisitor::traverseFrom(
+void BackwardVisitor::traverseTo(
     Fusion* fusion,
     const std::vector<Val*>& from,
     bool traverseAllPaths) {
@@ -538,7 +519,7 @@ struct Dependencies : public IterVisitor {
       std::unordered_set<Val*> _dependencies,
       const std::vector<Val*>& of)
       : dependencies_(std::move(_dependencies)) {
-    traverseFrom(of[0]->fusion(), of, false);
+    traverseTo(of[0]->fusion(), of, false);
   };
 
  public:
@@ -585,7 +566,7 @@ struct FindOutputs : public IterVisitor {
   // tracing all paths like this.
   FindOutputs(const std::unordered_set<Val*>& _of) : of_(_of) {
     auto fusion = (*of_.begin())->fusion();
-    traverseFrom(fusion, fusion->outputs(), true);
+    traverseTo(fusion, fusion->outputs(), true);
   };
 
   static std::unordered_set<Val*> getAllOutputsOf(
@@ -653,7 +634,7 @@ class DependentVals : public IterVisitor {
   DependentVals(const std::unordered_set<Val*>& _of) : of_(_of) {
     createBoundary();
     auto fusion = (*of_.begin())->fusion();
-    traverseFrom(fusion, fusion->outputs(), false);
+    traverseTo(fusion, fusion->outputs(), false);
   };
 
  public:
@@ -689,7 +670,7 @@ class DependencyChains : public IterVisitor {
 
   DependencyChains(Val* _dependency, Val* _of, bool all_chains_ = false)
       : dependencies_({_dependency}) {
-    traverseFrom(_of->fusion(), {_of}, all_chains_);
+    traverseTo(_of->fusion(), {_of}, all_chains_);
   }
 
   DependencyChains(Val* _dependency, bool all_chains_ = false)
@@ -815,12 +796,21 @@ std::vector<Expr*> StmtSort::getExprs(Fusion* fusion, bool traverse_members) {
 }
 
 std::vector<Expr*> StmtSort::getExprs(
+    Fusion* fusion,
+    const std::vector<Val*>& to,
+    bool traverse_members) {
+  auto stmts = StmtSort::getStmts(fusion, to, traverse_members);
+  auto filter = ir_utils::filterByType<Expr>(stmts.begin(), stmts.end());
+  std::vector<Expr*> exprs(filter.begin(), filter.end());
+  return exprs;
+}
+
+std::vector<Expr*> StmtSort::getExprsBetween(
     Fusion* fusion,
     const std::vector<Val*>& from,
+    const std::vector<Val*>& to,
     bool traverse_members) {
-  StmtSort es;
-  es.traverseFrom(fusion, from, false, traverse_members);
-  auto stmts = StmtSort::getStmts(fusion, from, traverse_members);
+  auto stmts = StmtSort::getStmtsBetween(fusion, from, to, traverse_members);
   auto filter = ir_utils::filterByType<Expr>(stmts.begin(), stmts.end());
   std::vector<Expr*> exprs(filter.begin(), filter.end());
   return exprs;
@@ -834,11 +824,22 @@ std::vector<Statement*> StmtSort::getStmts(
 }
 
 std::vector<Statement*> StmtSort::getStmts(
+    Fusion* fusion,
+    const std::vector<Val*>& to,
+    bool traverse_members) {
+  StmtSort es;
+  es.traverseTo(fusion, to, false, traverse_members);
+  return es.stmts;
+}
+
+std::vector<Statement*> StmtSort::getStmtsBetween(
     Fusion* fusion,
     const std::vector<Val*>& from,
+    const std::vector<Val*>& to,
     bool traverse_members) {
   StmtSort es;
-  es.traverseFrom(fusion, from, false, traverse_members);
+  es.traverseBetween(
+      fusion, {from.begin(), from.end()}, to, false, traverse_members);
   return es.stmts;
 }
 
@@ -858,7 +859,7 @@ std::vector<Val*> InputsOf::outputs(
     Fusion* fusion,
     const std::vector<Val*>& outputs_) {
   InputsOf io;
-  io.traverseFrom(fusion, outputs_, false);
+  io.traverseTo(fusion, outputs_, false);
   return io.ordered_inputs;
 }
 
diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.h b/torch/csrc/jit/codegen/cuda/iter_visitor.h
index 8adac390dac89..3ad485f1a17b6 100644
--- a/torch/csrc/jit/codegen/cuda/iter_visitor.h
+++ b/torch/csrc/jit/codegen/cuda/iter_visitor.h
@@ -75,29 +75,43 @@ class TORCH_CUDA_CU_API IterVisitor : public OptOutDispatch {
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   std::vector<std::vector<Statement*>> stmt_stack;
 
-  // Statements to stop traversal on if they're hit (pretends they're leaf
-  // nodes in next)
-  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  std::unordered_set<Statement*> termination_stmts;
-
   void traverseHelper(Fusion* fusion, bool traverse_all_paths = false);
 
  public:
-  //! Starts at nodes provided in from, traverses from these nodes to inputs.
-  //! Calls handle on all Statement*s in topological sorted order.
+  //! Traverses nodes in Fusion from inputs in topological order to "to". i.e.
+  //! from inputs towards outputs.
   //! \param traverseAllPaths = false only call handle on each Statement* once
-  //!    traverseAllPaths = true traverses all paths from nodes in from to
-  //!    inputs. Calls handle on a Statement* for every path from "from" nodes,
-  //!    to inputs.
+  //!    traverseAllPaths = true traverses all paths between expressions/values.
+  //!    Calls handle on a Statement* for every path from inputs to "to".
   //! \param traverseIntoMembers = When hitting nodes like TensorView,
   //! TensorDomain, or IterDomain where there are members of the nodes that are
   //! Val's a value of "true" will also traverse into those member Val's, a
   //! value of "false" will not traverse into the members.
-  void traverseFrom(
+  void traverseTo(
       Fusion* fusion,
-      const std::vector<Val*>& from,
-      bool traverseAllPaths = false,
-      bool traverseIntoMembers = false);
+      const std::vector<Val*>& to,
+      bool traverse_all_paths = false,
+      bool traverse_into_members = false);
+
+  //! Traverses nodes in Fusion from inputs in topological order to "to". i.e.
+  //! from inputs towards outputs.
+  //! \param traverseAllPaths = false only call handle on each Statement* once
+  //!    traverseAllPaths = true traverses all paths between expressions/values.
+  //!    Calls handle on a Statement* for every path from inputs to "to".
+  //! \param traverseIntoMembers = When hitting nodes like TensorView,
+  //! TensorDomain, or IterDomain where there are members of the nodes that are
+  //! Val's a value of "true" will also traverse into those member Val's, a
+  //! value of "false" will not traverse into the members.
+  //! \param from: Specified values to start traversing. If a "from" Val is not
+  //! on path from inputs to "to" node it will not be visited. If there's a path
+  //! from inputs to "to" that doesn't go through "from" that input and the path
+  //! from it will also be traversed.
+  void traverseBetween(
+      Fusion* fusion,
+      const std::unordered_set<Val*>& from,
+      const std::vector<Val*>& to,
+      bool traverse_all_paths = false,
+      bool traverse_into_members = false);
 
   // Iterates from terminating outputs registered with the fusion. Terminating
   // means value is not used to generate any other value used in producing
@@ -110,6 +124,9 @@ class TORCH_CUDA_CU_API IterVisitor : public OptOutDispatch {
 
   //! Get inputs to vals. Possible input vals can be optionally
   //! given. If not, vals with no producers are returned.
+  //
+  // TODO: This doesn't seem to fit with IterVisitor. Should probably be moved
+  // out of the class.
   static std::vector<Val*> getInputsTo(
       const std::vector<Val*>& vals,
       const std::vector<Val*>& inputs = {});
@@ -197,7 +214,7 @@ class TORCH_CUDA_CU_API BackwardVisitor : public OptOutDispatch {
   // traverseAllPaths = false only call handle on each Statement* once
   // traverseAllPaths = true traverses all paths from nodes in from to inputs.
   //   Handle on a Statement* for every path from "from" nodes, to inputs.
-  void traverseFrom(
+  void traverseTo(
       Fusion* fusion,
       const std::vector<Val*>& from,
       bool traverseAllPaths = false);
@@ -251,37 +268,65 @@ class TORCH_CUDA_CU_API DependencyCheck {
 // expressions.
 class StmtSort : public IterVisitor {
  protected:
+  StmtSort() = default;
+
   std::vector<Statement*> stmts;
 
   void handle(Statement* stmt) override;
 
  public:
   // If traverse_members it will also extract all member nodes in the sorted
-  // expr list in the fusion. i.e. all expressions on IterDomains, extents, etc
-  static std::vector<Expr*> getExprs(
+  // statement list in the fusion. i.e. all IterDomains, extents, and associated
+  // expressions of them
+  static std::vector<Statement*> getStmts(
       Fusion* fusion,
       bool traverse_members = false);
 
+  // Returns ordered Statements required to produce from, including from.
+  static std::vector<Statement*> getStmts(
+      Fusion* fusion,
+      const std::vector<Val*>& to,
+      bool traverse_members = false);
+
+  // Returns ordered Statements required to produce from, including from.
+  // Stops traversal once hiting any Statements in to. Includes Statements in
+  // to.
+  //
+  // Warning: this doesn't necessarily prevent statements before `to` from being
+  // returned. e.g.
+  // i1 = i0
+  // i2 = i1
+  // i3 = i2
+  // i4 = i3 + i1
+  // getExprs(fusion, {i4}, {i3})
+  // will return the definition and values {i0, i1, i4}
+  // i3 is dependent on i1, but since i4 also is then the traversal will go down
+  // the i4->i1->i0 path, even though the i4->i3-//>i2->i1 path is blocked.
+  //
   // If traverse_members it will also extract all member nodes in the sorted
   // expr list in the fusion. i.e. all expressions on IterDomains, extents, etc
-  static std::vector<Expr*> getExprs(
+  static std::vector<Statement*> getStmtsBetween(
       Fusion* fusion,
       const std::vector<Val*>& from,
+      const std::vector<Val*>& to,
       bool traverse_members = false);
 
-  // If traverse_members it will also extract all member nodes in the sorted
-  // statement list in the fusion. i.e. all IterDomains, extents, and associated
-  // expressions of them
-  static std::vector<Statement*> getStmts(
+  // Same as getStmts version but filters to only return the Expr*s
+  static std::vector<Expr*> getExprs(
       Fusion* fusion,
       bool traverse_members = false);
 
-  // If traverse_members it will also extract all member nodes in the sorted
-  // expr list in the fusion. i.e. all IterDomains, extents, and associated
-  // expressions of them
-  static std::vector<Statement*> getStmts(
+  // Same as getStmts version but filters to only return the Expr*s
+  static std::vector<Expr*> getExprs(
+      Fusion* fusion,
+      const std::vector<Val*>& to,
+      bool traverse_members = false);
+
+  // Same as getStmts version but filters to only return the Expr*s
+  static std::vector<Expr*> getExprsBetween(
       Fusion* fusion,
       const std::vector<Val*>& from,
+      const std::vector<Val*>& to,
       bool traverse_members = false);
 };
 
diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
index d4e4343e64d58..efcc51f231b26 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
+++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -657,11 +657,16 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
         group_outputs.size() == group_runtime_outputs.size(),
         "output size does not match");
     for (const size_t group_out_i : c10::irange(group_outputs.size())) {
-      output_holder[group_outputs[group_out_i]] =
-          group_runtime_outputs[group_out_i];
+      // trivial forwarding outputs empty tensor to save bandwidth, skip
+      // tensor_map update on those, since we want all future use of inputs on
+      // the original tensor input. See note [trivial forwarding]
+      if (!group_outputs[group_out_i]->isFusionInput()) {
+        output_holder[group_outputs[group_out_i]] =
+            group_runtime_outputs[group_out_i];
 
-      args.push(group_runtime_outputs[group_out_i]);
-      tensor_map.emplace(group_outputs[group_out_i], args.back());
+        args.push(group_runtime_outputs[group_out_i]);
+        tensor_map.emplace(group_outputs[group_out_i], args.back());
+      }
     }
   }
 
@@ -676,6 +681,32 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
     const auto iter = output_holder.find(output);
     if (iter != output_holder.end()) {
       fusion_outputs.push_back(iter->second);
+    } else if (output->isFusionInput()) {
+      // Note [ trivial forwarding ]
+      //
+      // Background:
+      // nvfuser codegen doesn't handle aliases at all. When we have a fusion
+      // that forwards an input to output without any operations on it, this is
+      // a no-op for codegen and the output tensor is never written to. However,
+      // the codegen cannot "forward" an input to output, since all outputs are
+      // allocated in integration. If we do not special case it, we'll ended up
+      // having a "fresh" tensor allocated for the forwarded-input.
+      //
+      // Approach:
+      // There are two aspects of the support:
+      // step 1. Codegen handles forwarding implicitly. Forwarded inputs doesn't
+      // have any producer in the IR, hence the output argument is not used in
+      // the code. But it does require to have an argument in the kernel as a
+      // place-holder so we'll map each arguments correctly.
+      // step 2. Integration handles the trivial forwarding of inputs. When we
+      // put together `fusion_outputs` for a given fusion, when outputs are just
+      // fusion inputs, we directly return the input tensor.
+      const auto iter = tensor_map.find(output);
+      TORCH_INTERNAL_ASSERT(
+          iter != tensor_map.end(), "Can not find output as aliased intput");
+      auto arg = dynamic_cast<const TensorArgAbstract*>(iter->second);
+      // See step 2 - note [ trivial forwarding ]
+      fusion_outputs.push_back(arg->getTensor());
     } else {
       bool empty_type_check = output->getDataType().has_value() &&
           output->getDataType().value() == DataType::Float;
diff --git a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp
index a4a823ab55605..15a18a6bca83e 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp
+++ b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp
@@ -132,6 +132,7 @@ void ExpressionEvaluator::handle(const NamedScalar* named_scalar) {
 }
 
 void ExpressionEvaluator::handle(const UnaryOp* unary_op) {
+  using namespace IntOrDouble_functions;
   const auto in = evaluate(unary_op->in());
   if (in.has_value()) {
     switch (unary_op->getUnaryOpType()) {
@@ -150,6 +151,9 @@ void ExpressionEvaluator::handle(const UnaryOp* unary_op) {
           TORCH_INTERNAL_ASSERT(false, "dtype not supported in evaluator");
         }
         break;
+      case UnaryOpType::Abs:
+        known_values_[unary_op->out()] = abs(*in);
+        break;
       default:
         TORCH_CHECK(
             false,
diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp
index 132b99b31c34b..7e69f0307a7a5 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp
+++ b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp
@@ -78,6 +78,15 @@ TensorIndex::TensorIndex(
   }
 }
 
+Val* TensorIndex::index(int i) const {
+  TORCH_INTERNAL_ASSERT(
+      nDims() > 0, "Tried to get an index of a 0-dim TensorIndex");
+  if (i < 0)
+    i += nDims();
+  TORCH_INTERNAL_ASSERT(i >= 0 && i < int(nDims()));
+  return indices_[i];
+}
+
 BlockSync::BlockSync(IrBuilderPasskey passkey, bool war_sync)
     : Expr(passkey, ExprType::BlockSync), war_sync_(war_sync) {
   TORCH_INTERNAL_ASSERT(
@@ -85,6 +94,12 @@ BlockSync::BlockSync(IrBuilderPasskey passkey, bool war_sync)
       "IR type only valid for Kernel container.");
 }
 
+Expr* BlockSync::shallowCopy() const {
+  auto result = IrBuilder::create<BlockSync>(war_sync_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 GridSync::GridSync(
     IrBuilderPasskey passkey,
     ParallelTypeBitmap sync_dims,
@@ -93,6 +108,12 @@ GridSync::GridSync(
       sync_dims_(sync_dims),
       sync_buffer_(sync_buffer) {}
 
+Expr* GridSync::shallowCopy() const {
+  auto result = IrBuilder::create<GridSync>(sync_dims_, sync_buffer_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 CpAsyncWait::CpAsyncWait(IrBuilderPasskey passkey, unsigned int keep_stages)
     : Expr(passkey, ExprType::CpAsyncWait), keep_stages_(keep_stages) {
   TORCH_INTERNAL_ASSERT(
@@ -100,6 +121,12 @@ CpAsyncWait::CpAsyncWait(IrBuilderPasskey passkey, unsigned int keep_stages)
       "IR type only valid for Kernel container.");
 }
 
+Expr* CpAsyncWait::shallowCopy() const {
+  auto result = IrBuilder::create<CpAsyncWait>(keep_stages_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 CpAsyncCommit::CpAsyncCommit(IrBuilderPasskey passkey)
     : Expr(passkey, ExprType::CpAsyncCommit) {
   TORCH_INTERNAL_ASSERT(
@@ -107,6 +134,12 @@ CpAsyncCommit::CpAsyncCommit(IrBuilderPasskey passkey)
       "IR type only valid for Kernel container.");
 }
 
+Expr* CpAsyncCommit::shallowCopy() const {
+  auto result = IrBuilder::create<CpAsyncCommit>();
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 InitMagicZero::InitMagicZero(IrBuilderPasskey passkey)
     : Expr(passkey, ExprType::InitMagicZero) {
   TORCH_INTERNAL_ASSERT(
@@ -114,6 +147,12 @@ InitMagicZero::InitMagicZero(IrBuilderPasskey passkey)
       "IR type only valid for Kernel container.");
 }
 
+Expr* InitMagicZero::shallowCopy() const {
+  auto result = IrBuilder::create<InitMagicZero>();
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 UpdateMagicZero::UpdateMagicZero(IrBuilderPasskey passkey)
     : Expr(passkey, ExprType::UpdateMagicZero) {
   TORCH_INTERNAL_ASSERT(
@@ -121,6 +160,12 @@ UpdateMagicZero::UpdateMagicZero(IrBuilderPasskey passkey)
       "IR type only valid for Kernel container.");
 }
 
+Expr* UpdateMagicZero::shallowCopy() const {
+  auto result = IrBuilder::create<UpdateMagicZero>();
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 namespace {
 
 bool isIntegralScalar(const Val* val) {
@@ -147,6 +192,12 @@ PairSelect::PairSelect(
   TORCH_INTERNAL_ASSERT(isIntegralScalar(out), "Integer only for this op");
 }
 
+Expr* PairSelect::shallowCopy() const {
+  auto result = IrBuilder::create<PairSelect>(out_, in_, selection_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 Swizzle2DInt::Swizzle2DInt(
     IrBuilderPasskey passkey,
     IntPair* out,
@@ -172,6 +223,13 @@ Swizzle2DInt::Swizzle2DInt(
   addInput(extent_y);
 }
 
+Expr* Swizzle2DInt::shallowCopy() const {
+  auto result = IrBuilder::create<Swizzle2DInt>(
+      out_, in_x_, in_y_, extent_x_, extent_y_, swizzle_type_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 void Scope::insert(std::vector<Expr*>::const_iterator pos, Expr* expr) {
   exprs_.insert(pos, expr);
 }
@@ -307,6 +365,22 @@ ForLoop::ForLoop(IrBuilderPasskey passkey, const ForLoop* other)
       "IR type only valid for Kernel container.");
 }
 
+Expr* ForLoop::shallowCopy() const {
+  auto result = IrBuilder::create<ForLoop>(
+      iter_domain_,
+      index_,
+      start_,
+      stop_,
+      step_,
+      vectorize_,
+      vectorize_shift_,
+      unroll_required_,
+      double_buffer_loop_stage_);
+  result->body_ = body_;
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 bool ForLoop::isUnrollable() const {
   // Start and stop must be constant, must not be a broadcast
   // dimension, cannot be bound to a parallel dimension, must not be
@@ -426,13 +500,12 @@ IfThenElse::IfThenElse(IrBuilderPasskey passkey, Predicate* cond)
   addInput(cond);
 }
 
-Val* TensorIndex::index(int i) const {
-  TORCH_INTERNAL_ASSERT(
-      nDims() > 0, "Tried to get an index of a 0-dim TensorIndex");
-  if (i < 0)
-    i += nDims();
-  TORCH_INTERNAL_ASSERT(i >= 0 && i < int(nDims()));
-  return indices_[i];
+Expr* IfThenElse::shallowCopy() const {
+  auto result = IrBuilder::create<IfThenElse>(predicate());
+  result->then_body_ = then_body_;
+  result->else_body_ = else_body_;
+  result->setWritePredicate(writePredicate());
+  return result;
 }
 
 Allocate::Allocate(
@@ -495,6 +568,13 @@ Allocate::Allocate(
       "IR type only valid for Kernel container.");
 }
 
+Expr* Allocate::shallowCopy() const {
+  auto result =
+      IrBuilder::create<Allocate>(buffer_, memory_type_, shape_, zero_init_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 GridReduction::GridReduction(
     IrBuilderPasskey passkey,
     BinaryOpType reduction_op_type,
@@ -523,6 +603,22 @@ GridReduction::GridReduction(
       "IR type only valid for Kernel container.");
 }
 
+Expr* GridReduction::shallowCopy() const {
+  auto result = IrBuilder::create<GridReduction>(
+      getReductionOpType(),
+      init(),
+      out(),
+      in(),
+      reduction_buffer_,
+      sync_buffer_,
+      entrance_index_,
+      entrances_,
+      isAllreduce());
+  result->copyPredicatesFrom(this);
+  result->thread_predicate_ = thread_predicate_;
+  return result;
+}
+
 GroupedGridReduction::GroupedGridReduction(
     IrBuilderPasskey passkey,
     std::vector<BinaryOpType> reduction_op_types,
@@ -553,6 +649,23 @@ GroupedGridReduction::GroupedGridReduction(
       "IR type only valid for Kernel container.");
 }
 
+Expr* GroupedGridReduction::shallowCopy() const {
+  auto result = IrBuilder::create<GroupedGridReduction>(
+      getReductionOpTypes(),
+      initVals(),
+      outputs(),
+      inputs(),
+      reduction_buffers_,
+      sync_buffer_,
+      entrance_index_,
+      entrances_,
+      buffer_stride_,
+      isAllreduce());
+  result->copyPredicatesFrom(this);
+  result->thread_predicate_ = thread_predicate_;
+  return result;
+}
+
 GridBroadcast::GridBroadcast(
     IrBuilderPasskey passkey,
     BroadcastOp* broadcast_op,
@@ -567,6 +680,13 @@ GridBroadcast::GridBroadcast(
       "IR type only valid for Kernel container.");
 }
 
+Expr* GridBroadcast::shallowCopy() const {
+  auto result = IrBuilder::create<GridBroadcast>(
+      broadcast_op_, broadcast_buffer_, sync_buffer_);
+  result->copyPredicatesFrom(this);
+  return result;
+}
+
 GridWelford::GridWelford(
     IrBuilderPasskey passkey,
     WelfordOp* welford_op,
@@ -589,6 +709,20 @@ GridWelford::GridWelford(
       "IR type only valid for Kernel container.");
 }
 
+Expr* GridWelford::shallowCopy() const {
+  auto result = IrBuilder::create<GridWelford>(
+      welford_op_,
+      var_buffer_,
+      avg_buffer_,
+      n_buffer_,
+      sync_buffer_,
+      entrance_index_,
+      entrances_);
+  result->copyPredicatesFrom(this);
+  result->thread_predicate_ = thread_predicate_;
+  return result;
+}
+
 GroupedGridWelford::GroupedGridWelford(
     IrBuilderPasskey passkey,
     std::vector<WelfordTriplet> output_vals,
@@ -617,6 +751,22 @@ GroupedGridWelford::GroupedGridWelford(
       "IR type only valid for Kernel container.");
 }
 
+Expr* GroupedGridWelford::shallowCopy() const {
+  auto result = IrBuilder::create<GroupedGridWelford>(
+      outputVals(),
+      inputVals(),
+      initVals(),
+      reduction_buffers_,
+      sync_buffer_,
+      entrance_index_,
+      entrances_,
+      buffer_stride_,
+      isAllreduce());
+  result->copyPredicatesFrom(this);
+  result->thread_predicate_ = thread_predicate_;
+  return result;
+}
+
 AllocateFusedReduction::AllocateFusedReduction(
     IrBuilderPasskey passkey,
     GridReduction* grid_reduction)
@@ -657,6 +807,36 @@ AllocateFusedReduction::AllocateFusedReduction(
       "IR type only valid for Kernel container.");
 }
 
+Expr* AllocateFusedReduction::shallowCopy() const {
+  if (grid_expr_->isA<GridReduction>()) {
+    auto result = IrBuilder::create<AllocateFusedReduction>(
+        grid_expr_->as<GridReduction>());
+    result->setPredicate(predicate());
+    result->setWritePredicate(writePredicate());
+    return result;
+  } else if (grid_expr_->isA<GridWelford>()) {
+    auto result = IrBuilder::create<AllocateFusedReduction>(
+        grid_expr_->as<GridWelford>());
+    result->setPredicate(predicate());
+    result->setWritePredicate(writePredicate());
+    return result;
+  } else if (grid_expr_->isA<GroupedGridReduction>()) {
+    auto result = IrBuilder::create<AllocateFusedReduction>(
+        grid_expr_->as<GroupedGridReduction>());
+    result->setPredicate(predicate());
+    result->setWritePredicate(writePredicate());
+    return result;
+  } else if (grid_expr_->isA<GroupedGridWelford>()) {
+    auto result = IrBuilder::create<AllocateFusedReduction>(
+        grid_expr_->as<GroupedGridWelford>());
+    result->setPredicate(predicate());
+    result->setWritePredicate(writePredicate());
+    return result;
+  }
+  TORCH_INTERNAL_ASSERT(
+      false, "Unknown reduction type in AllocateFusedReduction::shallowCopy");
+}
+
 TensorIndex* AllocateFusedReduction::out() const {
   TORCH_INTERNAL_ASSERT(grid_expr_ != nullptr);
   if (grid_expr_->isA<GridReduction>() ||
diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.h b/torch/csrc/jit/codegen/cuda/kernel_ir.h
index 62b245772dd03..cd44e8d8e21b7 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_ir.h
+++ b/torch/csrc/jit/codegen/cuda/kernel_ir.h
@@ -94,7 +94,7 @@ class TORCH_CUDA_CU_API Predicate final : public Val {
     return expr_;
   }
 
-  Bool* thread_pred() {
+  Bool* thread_pred() const {
     TORCH_INTERNAL_ASSERT(
         ptype_ == PredicateType::Inline ||
         ptype_ == PredicateType::Misaligned || ptype_ == PredicateType::Shift ||
@@ -199,6 +199,8 @@ class TORCH_CUDA_CU_API Allocate final : public Expr {
       Val* size,
       bool zero_init = false);
 
+  Expr* shallowCopy() const override;
+
   Val* buffer() const {
     return buffer_;
   }
@@ -251,6 +253,8 @@ class TORCH_CUDA_CU_API BlockSync final : public Expr {
  public:
   explicit BlockSync(IrBuilderPasskey passkey, bool war_sync = false);
 
+  Expr* shallowCopy() const override;
+
   bool isWarHazardSync() const {
     return war_sync_;
   }
@@ -265,6 +269,8 @@ class TORCH_CUDA_CU_API CpAsyncWait final : public Expr {
  public:
   explicit CpAsyncWait(IrBuilderPasskey passkey, unsigned int keep_stages = 0);
 
+  Expr* shallowCopy() const override;
+
   //! Returns the remaining number of stages that are not synchronized
   //!  after this op.
   unsigned int keepStages() const {
@@ -282,6 +288,8 @@ class TORCH_CUDA_CU_API CpAsyncWait final : public Expr {
 class TORCH_CUDA_CU_API CpAsyncCommit final : public Expr {
  public:
   explicit CpAsyncCommit(IrBuilderPasskey passkey);
+
+  Expr* shallowCopy() const override;
 };
 
 // Synchronize all blocks in device, implies cooperative group launch is
@@ -293,6 +301,8 @@ class TORCH_CUDA_CU_API GridSync final : public Expr {
       ParallelTypeBitmap sync_dims,
       Val* sync_buffer);
 
+  Expr* shallowCopy() const override;
+
   ParallelTypeBitmap syncDims() const {
     return sync_dims_;
   }
@@ -311,6 +321,8 @@ class TORCH_CUDA_CU_API GridSync final : public Expr {
 class TORCH_CUDA_CU_API InitMagicZero final : public Expr {
  public:
   explicit InitMagicZero(IrBuilderPasskey passkey);
+
+  Expr* shallowCopy() const override;
 };
 
 // Simply prints "UPDATE_MAGIC_ZERO" in the code in accordance with magic_zero
@@ -318,6 +330,8 @@ class TORCH_CUDA_CU_API InitMagicZero final : public Expr {
 class TORCH_CUDA_CU_API UpdateMagicZero final : public Expr {
  public:
   explicit UpdateMagicZero(IrBuilderPasskey passkey);
+
+  Expr* shallowCopy() const override;
 };
 
 // TODO(kir): promote to IR node
@@ -418,6 +432,8 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr {
 
   ForLoop(IrBuilderPasskey passkey, const ForLoop* other);
 
+  Expr* shallowCopy() const override;
+
   Val* index() const {
     return index_;
   }
@@ -512,6 +528,8 @@ class TORCH_CUDA_CU_API IfThenElse final : public Expr {
  public:
   explicit IfThenElse(IrBuilderPasskey passkey, Predicate* cond);
 
+  Expr* shallowCopy() const override;
+
   Scope& thenBody() {
     return then_body_;
   }
@@ -557,6 +575,8 @@ class TORCH_CUDA_CU_API GridReduction final : public ReductionOp {
       Val* entrances,
       bool is_allreduce = false);
 
+  Expr* shallowCopy() const override;
+
   Allocate* reduction_buffer() const {
     return reduction_buffer_;
   }
@@ -579,8 +599,11 @@ class TORCH_CUDA_CU_API GridReduction final : public ReductionOp {
     return thread_predicate_;
   }
 
-  void setThreadPredicate(const ParallelTypeBitmap& thread_predicate) {
-    thread_predicate_ = thread_predicate;
+  GridReduction* withThreadPredicate(
+      const ParallelTypeBitmap& thread_predicate) {
+    auto result = shallowCopy()->as<GridReduction>();
+    result->thread_predicate_ = thread_predicate;
+    return result;
   }
 
  private:
@@ -609,6 +632,8 @@ class TORCH_CUDA_CU_API GroupedGridReduction final : public GroupedReductionOp {
       Val* buffer_stride,
       bool is_allreduce = false);
 
+  Expr* shallowCopy() const override;
+
   const std::vector<Allocate*>& reduction_buffers() const {
     return reduction_buffers_;
   }
@@ -639,8 +664,11 @@ class TORCH_CUDA_CU_API GroupedGridReduction final : public GroupedReductionOp {
     return thread_predicate_;
   }
 
-  void setThreadPredicate(const ParallelTypeBitmap& thread_predicate) {
-    thread_predicate_ = thread_predicate;
+  GroupedGridReduction* withThreadPredicate(
+      const ParallelTypeBitmap& thread_predicate) {
+    auto result = shallowCopy()->as<GroupedGridReduction>();
+    result->thread_predicate_ = thread_predicate;
+    return result;
   }
 
  private:
@@ -671,6 +699,8 @@ class TORCH_CUDA_CU_API GridBroadcast final : public Expr {
       Allocate* broadcast_buffer,
       Allocate* sync_buffer);
 
+  Expr* shallowCopy() const override;
+
   BroadcastOp* broadcast_op() const {
     return broadcast_op_;
   }
@@ -710,6 +740,8 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr {
       Val* entrance_index,
       Val* entrances);
 
+  Expr* shallowCopy() const override;
+
   WelfordOp* welford_op() const {
     return welford_op_;
   }
@@ -744,8 +776,10 @@ class TORCH_CUDA_CU_API GridWelford final : public Expr {
     return thread_predicate_;
   }
 
-  void setThreadPredicate(const ParallelTypeBitmap& thread_predicate) {
-    thread_predicate_ = thread_predicate;
+  GridWelford* withThreadPredicate(const ParallelTypeBitmap& thread_predicate) {
+    auto result = shallowCopy()->as<GridWelford>();
+    result->thread_predicate_ = thread_predicate;
+    return result;
   }
 
  private:
@@ -777,6 +811,8 @@ class TORCH_CUDA_CU_API GroupedGridWelford final : public GroupedWelfordOp {
       Val* buffer_stride,
       bool is_allreduce = false);
 
+  Expr* shallowCopy() const override;
+
   const std::array<std::vector<Allocate*>, 3>& reduction_buffers() const {
     return reduction_buffers_;
   }
@@ -803,8 +839,11 @@ class TORCH_CUDA_CU_API GroupedGridWelford final : public GroupedWelfordOp {
     return thread_predicate_;
   }
 
-  void setThreadPredicate(const ParallelTypeBitmap& thread_predicate) {
-    thread_predicate_ = thread_predicate;
+  GroupedGridWelford* withThreadPredicate(
+      const ParallelTypeBitmap& thread_predicate) {
+    auto result = shallowCopy()->as<GroupedGridWelford>();
+    result->thread_predicate_ = thread_predicate;
+    return result;
   }
 
  private:
@@ -839,6 +878,8 @@ class TORCH_CUDA_CU_API AllocateFusedReduction final : public Expr {
       IrBuilderPasskey passkey,
       GroupedGridWelford* grouped_grid_welford);
 
+  Expr* shallowCopy() const override;
+
   Expr* gridExpr() const {
     return grid_expr_;
   }
@@ -879,6 +920,8 @@ class TORCH_CUDA_CU_API PairSelect : public Expr {
 
   PairSelect(IrBuilderPasskey, Val* out, IntPair* in, Selection selection);
 
+  Expr* shallowCopy() const override;
+
   Val* out() const {
     return out_;
   }
@@ -914,6 +957,8 @@ class TORCH_CUDA_CU_API Swizzle2DInt : public Expr {
       Val* extent_y,
       Swizzle2DType swizzle_type);
 
+  Expr* shallowCopy() const override;
+
   IntPair* out() const {
     return out_;
   }
diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/torch/csrc/jit/codegen/cuda/lower2device.cpp
index 53b9d172f203f..142ee1b7a02fb 100644
--- a/torch/csrc/jit/codegen/cuda/lower2device.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower2device.cpp
@@ -8,6 +8,7 @@
 #include <torch/csrc/jit/codegen/cuda/ir_utils.h>
 #include <torch/csrc/jit/codegen/cuda/lower_alias_memory.h>
 #include <torch/csrc/jit/codegen/cuda/lower_allocation.h>
+#include <torch/csrc/jit/codegen/cuda/lower_divisible_split.h>
 #include <torch/csrc/jit/codegen/cuda/lower_double_buffer.h>
 #include <torch/csrc/jit/codegen/cuda/lower_expr_sort.h>
 #include <torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h>
@@ -248,7 +249,7 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
   // mappings of all iteration domains across the fusion. There are three types
   // of mappings Permissive, Exact, and Loop, see compute_at_map.h/cpp for more
   // information.
-  compute_at_map_ = std::make_unique<ComputeAtMap>(fusion_);
+  compute_at_map_ = std::make_shared<ComputeAtMap>(fusion_);
 
   if (isDebugDumpEnabled(DebugDumpOption::ComputeAtMap)) {
     std::cout << compute_at_map_->toString() << std::endl;
@@ -256,8 +257,12 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
 
   compute_at_map_->validateAndPropagatePType();
 
+  // Uses compute_at_map, find all splits that are enforced to be divisible
+  divisible_splits_ = getAllDivisibleSplits(fusion_, compute_at_map_.get());
+
   // Used in parallel dimension map
-  concretized_broadcast_domains_.build(fusion_);
+  concretized_broadcast_domains_ =
+      std::make_shared<const ConcretizedBroadcastDomains>(fusion_);
 
   parallelDimensionMap().build(fusion_);
   if (isDebugDumpEnabled(DebugDumpOption::ParallelDimensions)) {
@@ -281,7 +286,7 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
 
   // Scan the whole fusion and build mappings about halo extensions of
   // all IterDomains
-  haloInfo().build(fusion_);
+  halo_info_ = std::make_shared<HaloInfo>(fusion_, compute_at_map_);
 
   // Want to run this after parallel map and halo info map are
   // created. vectorized_accesses_ and vectorized_set_info_ are filled.
@@ -298,6 +303,9 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
   // Depends on thread_pred_map_, validates parallelization collects which
   // tensor views need WAR or RAW syncs
   sync_map_.build(fusion_);
+  if (isDebugDumpEnabled(DebugDumpOption::SyncMap)) {
+    std::cout << sync_map_.toString() << std::endl;
+  }
 
   partialSplitMap().build(fusion_);
 
diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/torch/csrc/jit/codegen/cuda/lower2device.h
index d5600e0a25139..250b06a6495fb 100644
--- a/torch/csrc/jit/codegen/cuda/lower2device.h
+++ b/torch/csrc/jit/codegen/cuda/lower2device.h
@@ -62,7 +62,8 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
   //! Query if lowering is in progress
   static bool hasCurrent();
 
-  ConcretizedBroadcastDomains& concretizedBroadcastDomains() {
+  std::shared_ptr<const ConcretizedBroadcastDomains>
+  concretizedBroadcastDomains() {
     return concretized_broadcast_domains_;
   }
 
@@ -76,20 +77,16 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
     return thread_pred_map_;
   }
 
-  const std::unique_ptr<ComputeAtMap>& caMap() const {
-    return compute_at_map_;
+  std::shared_ptr<const ComputeAtMap> caMap() const {
+    return std::const_pointer_cast<const ComputeAtMap>(compute_at_map_);
   }
 
   const TrivialReductionInfo& trivialReductionInfo() const {
     return trivial_reduction_info_;
   }
 
-  const HaloInfo& haloInfo() const {
-    return halo_info_;
-  }
-
-  HaloInfo& haloInfo() {
-    return halo_info_;
+  std::shared_ptr<const HaloInfo> haloInfo() const {
+    return std::const_pointer_cast<const HaloInfo>(halo_info_);
   }
 
   const ParallelDimensionMap& parallelDimensionMap() const {
@@ -132,6 +129,10 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
     return non_divisible_split_info_;
   }
 
+  const auto& divisbleSplitSet() const {
+    return divisible_splits_;
+  }
+
   DoubleBufferInfo& doubleBufferInfo() {
     return double_buffer_info_;
   }
@@ -198,12 +199,13 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
   // would be safer to wrap all of these in unique pointers and remove the build
   // interface and default constructor. That way they couldn't be accessed
   // without being initialized.
-  ConcretizedBroadcastDomains concretized_broadcast_domains_;
+  std::shared_ptr<const ConcretizedBroadcastDomains>
+      concretized_broadcast_domains_;
   ThreadPredicateMap thread_pred_map_;
   PredicateElimination pred_elimination_;
-  std::unique_ptr<ComputeAtMap> compute_at_map_;
+  std::shared_ptr<ComputeAtMap> compute_at_map_;
   TrivialReductionInfo trivial_reduction_info_;
-  HaloInfo halo_info_;
+  std::shared_ptr<HaloInfo> halo_info_;
   LocalAllocationInfoMap local_allocation_info_map_;
   WarpPaddedParallelInfo warp_pad_info_;
   ParallelDimensionMap parallel_dimension_map_;
@@ -214,6 +216,7 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
   FusedReductionInfo fused_reduction_info_;
   SyncMap sync_map_;
   kir::KernelPerformanceProfile profile_;
+  std::unordered_set<Split*> divisible_splits_;
 
   // Track which tensor views are inputs or outputs of a vectorized operation
   // and their maximum vectorized access size
diff --git a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp b/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp
index 4e84579485509..ef12cce8fd46a 100644
--- a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp
@@ -18,16 +18,19 @@ namespace fuser {
 namespace cuda {
 
 namespace {
+// Alias used for std::transform
+IterDomain* exactConcreteId(IterDomain* id) {
+  return GpuLower::current()->caMap()->getConcreteMappedID(
+      id, IdMappingMode::EXACT);
+}
 
-//! Checks that the current loop nest is not realizing a serial
-//!  broadcast so that each index of producer buffer will only
-//!  be visited once, which is the only case where aggressive
-//!  inner sharing is valid.
-//!
+//! Checks that the current loop nest is realizing a serial
+//!  broadcast so that each index of producer buffer can be visited
+//!  multiple times, in which case the aggressive is not valid.
 bool isSerialBroadcastResolution(TensorView* producer, TensorView* consumer) {
   //! Note: see issue #1785:
   //!  serial broadcast resolution doesn't only happen to
-  //! immediate producers of broadcast ops. We can also have
+  //! immediate outputs of broadcast ops. We can also have
   //! example:
   //!  T1[I,B] = broadcast(T0[I]])
   //!  T3[I,I] = T1[I,B] + T2[I,I]
@@ -83,7 +86,7 @@ bool isSerialBroadcastResolution(TensorView* producer, TensorView* consumer) {
       std::inserter(
           producer_exact_concrete_root_ids,
           producer_exact_concrete_root_ids.begin()),
-      ir_utils::caMapExactConcreteId);
+      exactConcreteId);
 
   // Check if serial loop roots indexes any exact root id's that
   //  is not within the set of producer's root exact id's. These
@@ -92,7 +95,8 @@ bool isSerialBroadcastResolution(TensorView* producer, TensorView* consumer) {
   for (auto serial_loop_root :
        ir_utils::filterByType<IterDomain>(serial_loop_roots)) {
     if (!producer_exact_concrete_root_ids.count(
-            ir_utils::caMapExactConcreteId(serial_loop_root))) {
+            GpuLower::current()->caMap()->getConcreteMappedID(
+                serial_loop_root, IdMappingMode::EXACT))) {
       return true;
     }
   }
diff --git a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp b/torch/csrc/jit/codegen/cuda/lower_allocation.cpp
index 466dc85c8abff..264905cfa213f 100644
--- a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_allocation.cpp
@@ -59,7 +59,7 @@ class AllocationInserter : public kir::ExprMutator {
   // info.init_place_before, info.alloc_for_loop, info.alloc_place_before
   void fillAllocationInformation(AllocationInformation& info, Expr* expr) {
     auto loop_alloc_info =
-        loop_utils::getAllocInformation(info.buffer, for_loops_);
+        lower_utils::getAllocInformation(info.buffer, for_loops_);
 
     info.init_for_loop = loop_alloc_info.init_for_loop;
     info.alloc_for_loop = loop_alloc_info.alloc_for_loop;
@@ -131,7 +131,7 @@ class AllocationInserter : public kir::ExprMutator {
          ++init_loop_it) {
       auto id = *init_loop_it;
       kir::ForLoop* new_loop = nullptr;
-      auto extent_with_halo = gpu_lower->haloInfo().getExtent(id);
+      auto extent_with_halo = gpu_lower->haloInfo()->getExtent(id);
       if (extent_with_halo) {
         new_loop = IrBuilder::create<kir::ForLoop>(
             id,
@@ -166,7 +166,7 @@ class AllocationInserter : public kir::ExprMutator {
       }
       auto extent = id->extent();
       // Use halo-extended extent if found
-      auto halo_extent = gpu_lower->haloInfo().getRootAxisInfo(id);
+      auto halo_extent = gpu_lower->haloInfo()->getRootAxisInfo(id);
       if (halo_extent.hasHalo()) {
         extent = IrBuilder::addExpr(
             extent, IrBuilder::create<Int>(halo_extent.width()));
@@ -213,7 +213,7 @@ class AllocationInserter : public kir::ExprMutator {
 
     // Get the halo extent if found
     auto getExtent = [this](IterDomain* id) {
-      auto extent = gpu_lower->haloInfo().getExtent(id);
+      auto extent = gpu_lower->haloInfo()->getExtent(id);
       if (extent == nullptr) {
         extent = id->extent();
       }
@@ -368,7 +368,7 @@ class AllocationInserter : public kir::ExprMutator {
 
       auto extent = concrete_id->extent();
 
-      if (gpu_lower->haloInfo().getExtent(info.buffer->axis(axis_i)) !=
+      if (gpu_lower->haloInfo()->getExtent(info.buffer->axis(axis_i)) !=
           nullptr) {
         has_halo = true;
       }
diff --git a/torch/csrc/jit/codegen/cuda/lower_bank_conflict.cpp b/torch/csrc/jit/codegen/cuda/lower_bank_conflict.cpp
new file mode 100644
index 0000000000000..0b97b973f786e
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/lower_bank_conflict.cpp
@@ -0,0 +1,332 @@
+#include <torch/csrc/jit/codegen/cuda/lower_bank_conflict.h>
+
+#include <torch/csrc/jit/codegen/cuda/dynamic_type.h>
+#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
+#include <torch/csrc/jit/codegen/cuda/type.h>
+
+#include <unordered_set>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+namespace {
+
+bool isSmemTensorIndex(Val* value) {
+  return value->isA<kir::TensorIndex>() &&
+      value->as<kir::TensorIndex>()->view()->getMemoryType() ==
+      MemoryType::Shared;
+}
+
+int64_t getVectorizeSize(kir::TensorIndex* ti) {
+  for (auto id : ti->view()->domain()->domain()) {
+    if (!isParallelTypeVectorize(id->getParallelType())) {
+      continue;
+    }
+
+    ExpressionEvaluator expr_eval(id->fusion());
+    auto vector_size_optional = expr_eval.evaluate(id->extent());
+
+    TORCH_INTERNAL_ASSERT(
+        vector_size_optional.has_value(),
+        "Could not evaluate constant value bound to vectorized dim.");
+
+    return vector_size_optional->as<int64_t>();
+  }
+  return 1;
+}
+
+inline int64_t getPhaseSize(int64_t word_size_bytes) {
+  if (word_size_bytes == 16) {
+    return 8;
+  }
+  if (word_size_bytes == 8) {
+    return 16;
+  }
+  return 32;
+}
+
+bool isThreadIdx(const std::string& name) {
+  return name == "threadIdx.x" || name == "threadIdx.y" ||
+      name == "threadIdx.z";
+}
+
+bool isBlockIdx(const std::string& name) {
+  return name == "blockIdx.x" || name == "blockIdx.y" || name == "blockIdx.z";
+}
+
+bool isBlockDim(const std::string& name) {
+  return name == "blockDim.x" && name == "blockDim.y" && name == "blockDim.z";
+}
+
+bool isGridDim(const std::string& name) {
+  return name == "gridDim.x" && name == "gridDim.y" && name == "gridDim.z";
+}
+
+ParallelType getParallelType(const std::string& name) {
+  if (name == "threadIdx.x") {
+    return ParallelType::TIDx;
+  } else if (name == "threadIdx.y") {
+    return ParallelType::TIDy;
+  } else if (name == "threadIdx.z") {
+    return ParallelType::TIDz;
+  } else if (name == "blockIdx.x") {
+    return ParallelType::BIDx;
+  } else if (name == "blockIdx.y") {
+    return ParallelType::BIDy;
+  } else if (name == "blockIdx.z") {
+    return ParallelType::BIDz;
+  }
+  TORCH_INTERNAL_ASSERT(false, "Not a parallel type");
+}
+
+std::vector<int64_t> evaluateAddressesOnFirstPhase(
+    kir::TensorIndex* ti,
+    const std::vector<kir::ForLoop*>& for_loops,
+    c10::optional<LaunchParams> launch_params,
+    const ExpressionEvaluator& expr_eval_common) {
+  std::vector<int64_t> addresses;
+  const auto word_size_bytes =
+      dataTypeSize(*(ti->getDataType())) * getVectorizeSize(ti);
+  int64_t phase_size = getPhaseSize(word_size_bytes);
+
+  if (launch_params.has_value()) {
+    phase_size = std::min<int64_t>(phase_size, launch_params->nThreads());
+  }
+
+  for (int64_t linear_tidx : c10::irange(phase_size)) {
+    int64_t tidx = linear_tidx;
+    int64_t tidy = 0;
+    int64_t tidz = 0;
+    if (launch_params.has_value()) {
+      tidy = tidx / launch_params->bdimx();
+      tidx = tidx % launch_params->bdimx();
+      tidz = tidy / launch_params->bdimy();
+      tidy = tidy % launch_params->bdimy();
+    }
+    int64_t index = 0;
+    // make a copy of the expression evaluator
+    ExpressionEvaluator expr_eval = expr_eval_common;
+    expr_eval.bind("threadIdx.x", tidx);
+    expr_eval.bind("threadIdx.y", tidy);
+    expr_eval.bind("threadIdx.z", tidz);
+    for (auto fl : for_loops) {
+      if (fl->index()->isA<NamedScalar>()) {
+        auto name = fl->index()->as<NamedScalar>()->name();
+        TORCH_INTERNAL_ASSERT(
+            isThreadIdx(name) || isBlockIdx(name), "unknow loop index");
+      } else {
+        auto start = expr_eval.evaluate(fl->start())->as<int64_t>();
+        expr_eval.bind(fl->index(), start);
+      }
+    }
+    for (auto ind : ti->indices()) {
+      index += expr_eval.evaluate(ind)->as<int64_t>();
+    }
+    addresses.emplace_back(index * word_size_bytes);
+  }
+  return addresses;
+}
+
+int getConflictWays(const std::vector<int64_t>& addresses) {
+  std::unordered_set<int64_t> words_by_bank[32];
+  for (auto addr : addresses) {
+    int64_t word = addr / 4;
+    int64_t bank = word % 32;
+    words_by_bank[bank].insert(word);
+  }
+  int conflict = 1;
+  for (const auto& words : words_by_bank) {
+    conflict = std::max<int>(conflict, words.size());
+  }
+  return conflict;
+}
+
+class InferLaunchParams : public kir::IrVisitor {
+ public:
+  static c10::optional<LaunchParams> get(
+      const std::vector<Expr*>& exprs,
+      const std::unordered_map<std::string, IntOrDouble>& known_values) {
+    if (exprs.empty()) {
+      return c10::nullopt;
+    }
+    return InferLaunchParams(exprs, known_values).launch_params_;
+  }
+
+ private:
+  InferLaunchParams(
+      const std::vector<Expr*>& exprs,
+      const std::unordered_map<std::string, IntOrDouble>& known_values)
+      : expr_eval_(exprs[0]->fusion()) {
+    for (auto pair : known_values) {
+      expr_eval_.bind(pair.first, pair.second);
+    }
+    handle(exprs);
+  }
+
+  using kir::IrVisitor::handle;
+
+  void handle(Expr* expr) final {
+    if (expr->isA<kir::ForLoop>() || expr->isA<kir::IfThenElse>()) {
+      kir::IrVisitor::handle(expr);
+      return;
+    }
+
+    for (auto fl : for_loops_) {
+      if (fl->index()->isA<NamedScalar>()) {
+        auto name = fl->index()->as<NamedScalar>()->name();
+        if (isThreadIdx(name) || isBlockIdx(name)) {
+          auto ptype = getParallelType(name);
+          auto stop = expr_eval_.evaluate(fl->stop());
+          if (stop.has_value()) {
+            if (!launch_params_.has_value()) {
+              launch_params_ = LaunchParams();
+            }
+            if (launch_params_->getRawVal(ptype) ==
+                LaunchParams::UNINITIALIZED_VAL) {
+              launch_params_->bind(stop->as<int64_t>(), ptype);
+            } else {
+              TORCH_INTERNAL_ASSERT(
+                  launch_params_->getDim(ptype) == stop,
+                  "Unable to infer launch parameters");
+            }
+          }
+        }
+      }
+    }
+  }
+
+  ExpressionEvaluator expr_eval_;
+  c10::optional<LaunchParams> launch_params_;
+};
+
+class BankConflictInfo : public kir::IrVisitor {
+ public:
+  static std::unordered_map<const Expr*, std::pair<int, int>> get(
+      const std::vector<Expr*>& exprs,
+      c10::optional<LaunchParams> launch_params,
+      const std::unordered_map<std::string, IntOrDouble>& known_values) {
+    if (exprs.empty()) {
+      return {};
+    }
+    return BankConflictInfo(exprs, launch_params, known_values)
+        .bank_conflict_info_;
+  }
+
+ private:
+  BankConflictInfo(
+      const std::vector<Expr*>& exprs,
+      c10::optional<LaunchParams> launch_params,
+      const std::unordered_map<std::string, IntOrDouble>& known_values)
+      : launch_params_(launch_params), expr_eval_common_(exprs[0]->fusion()) {
+    expr_eval_common_.bind("blockIdx.x", 0);
+    expr_eval_common_.bind("blockIdx.y", 0);
+    expr_eval_common_.bind("blockIdx.z", 0);
+    if (launch_params.has_value()) {
+      expr_eval_common_.bind("blockDim.x", launch_params->bdimx());
+      expr_eval_common_.bind("blockDim.y", launch_params->bdimy());
+      expr_eval_common_.bind("blockDim.z", launch_params->bdimz());
+      expr_eval_common_.bind("gridDim.x", launch_params->gdimx());
+      expr_eval_common_.bind("gridDim.y", launch_params->gdimy());
+      expr_eval_common_.bind("gridDim.z", launch_params->gdimz());
+    }
+    for (auto pair : known_values) {
+      expr_eval_common_.bind(pair.first, pair.second);
+    }
+    handle(exprs);
+  }
+
+  using kir::IrVisitor::handle;
+
+  void handle(Expr* expr) final {
+    if (expr->isA<kir::ForLoop>() || expr->isA<kir::IfThenElse>()) {
+      kir::IrVisitor::handle(expr);
+      return;
+    }
+
+    if (expr->isA<UnaryOp>()) {
+      auto uop = expr->as<UnaryOp>();
+      if (uop->getUnaryOpType() != UnaryOpType::Set) {
+        return;
+      }
+      std::pair<int, int> conflict_ways{0, 0};
+      if (isSmemTensorIndex(uop->in())) {
+        conflict_ways.first = getConflictWays(evaluateAddressesOnFirstPhase(
+            uop->in()->as<kir::TensorIndex>(),
+            for_loops_,
+            launch_params_,
+            expr_eval_common_));
+      }
+      if (isSmemTensorIndex(uop->out())) {
+        conflict_ways.second = getConflictWays(evaluateAddressesOnFirstPhase(
+            uop->out()->as<kir::TensorIndex>(),
+            for_loops_,
+            launch_params_,
+            expr_eval_common_));
+      }
+      if (conflict_ways.first > 1 || conflict_ways.second > 1) {
+        bank_conflict_info_[expr] = conflict_ways;
+      }
+    } else if (expr->isA<LoadStoreOp>()) {
+      auto ldst = expr->as<LoadStoreOp>();
+      std::pair<int, int> conflict_ways{0, 0};
+      if (isSmemTensorIndex(ldst->in())) {
+        conflict_ways.first = getConflictWays(evaluateAddressesOnFirstPhase(
+            ldst->in()->as<kir::TensorIndex>(),
+            for_loops_,
+            launch_params_,
+            expr_eval_common_));
+      }
+      if (isSmemTensorIndex(ldst->out())) {
+        conflict_ways.second = getConflictWays(evaluateAddressesOnFirstPhase(
+            ldst->out()->as<kir::TensorIndex>(),
+            for_loops_,
+            launch_params_,
+            expr_eval_common_));
+      }
+      if (conflict_ways.first > 1 || conflict_ways.second > 1) {
+        bank_conflict_info_[expr] = conflict_ways;
+      }
+    }
+  }
+
+  std::unordered_map<const Expr*, std::pair<int, int>> bank_conflict_info_;
+  c10::optional<LaunchParams> launch_params_;
+  ExpressionEvaluator expr_eval_common_;
+};
+
+} // namespace
+
+std::unordered_map<const Expr*, std::pair<int, int>> getBankConflictInfo(
+    kir::Kernel* kernel,
+    c10::optional<LaunchParams> launch_params,
+    const std::unordered_map<std::string, IntOrDouble>& known_values) {
+  for (auto pair : known_values) {
+    TORCH_CHECK(
+        !isThreadIdx(pair.first),
+        "threadIdx.{x,y,z} should be computed instead of provided");
+    TORCH_CHECK(
+        !isBlockIdx(pair.first),
+        "blockIdx.{x,y,z} should not be provided (they are always zero)");
+    TORCH_CHECK(
+        !isBlockDim(pair.first),
+        "blockDim.{x,y,z} should be provided by launch_params");
+    TORCH_CHECK(
+        !isGridDim(pair.first),
+        "gridDim.{x,y,z} should be provided by launch_params");
+  }
+  if (!launch_params.has_value()) {
+    launch_params =
+        InferLaunchParams::get(kernel->topLevelExprs(), known_values);
+  }
+  return BankConflictInfo::get(
+      kernel->topLevelExprs(), launch_params, known_values);
+}
+
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/lower_bank_conflict.h b/torch/csrc/jit/codegen/cuda/lower_bank_conflict.h
new file mode 100644
index 0000000000000..b651c4ed33e22
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/lower_bank_conflict.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <torch/csrc/jit/codegen/cuda/dynamic_type.h>
+#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
+#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/kernel.h>
+
+#include <unordered_map>
+#include <utility>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+// for more info on shared memory access see page 54-72 of:
+// https://on-demand.gputechconf.com/gtc/2018/presentation/s81006-volta-architecture-and-performance-optimization.pdf
+
+// Warning: The bank confliction checking utility here is not a replacement of
+// nsight compute. This utility currently has the following assumptions and
+// limitations:
+//
+//   1. This utility assumes that the data of the tensor is accessed by
+//      `T0[index]`, where `index` is the one stored in the `TensorIndex`
+//      object.
+//   2. This utility only checks the first iteration. If we have something like
+//      `T1_s[tidx, 5]`, then different iterations should have different
+//      conflictions, which will not be evaluated for all of them
+//   3. This utility assumes that all tensors are independent, which means:
+//      3.1 All shared memory tensors are allocated starting from a multiple of
+//          4*32 bytes
+//      3.2 The only source of bank confliction is from within a tensor.
+//          There is no bank conflict between different tensors.
+//
+// Also note that this utility will not provide accurate estimation if the above
+// assumptions are satisfied
+
+std::unordered_map<const Expr*, std::pair<int, int>> getBankConflictInfo(
+    kir::Kernel* kernel,
+    c10::optional<LaunchParams> launch_params = c10::nullopt,
+    const std::unordered_map<std::string, IntOrDouble>& known_values = {});
+
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/lower_divisible_split.cpp b/torch/csrc/jit/codegen/cuda/lower_divisible_split.cpp
new file mode 100644
index 0000000000000..c1de1201e5d18
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/lower_divisible_split.cpp
@@ -0,0 +1,121 @@
+
+#include <torch/csrc/jit/codegen/cuda/lower_divisible_split.h>
+
+#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
+#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+
+#include <unordered_set>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+std::unordered_set<Split*> getAllDivisibleSplits(Fusion* fusion) {
+  ComputeAtMap ca_map(fusion);
+  return getAllDivisibleSplits(fusion, &ca_map);
+}
+
+std::unordered_set<Split*> getAllDivisibleSplits(
+    Fusion* fusion,
+    const ComputeAtMap* ca_map) {
+  std::unordered_set<Split*> all_divisible_splits;
+
+  auto all_tvs = ir_utils::allTvs(fusion);
+  // Find all tensor views with a view like rfactor. Splits used in view
+  // transformations must be divisible by definition.
+  for (auto tv : all_tvs) {
+    auto rfactor_dom = tv->getMaybeRFactorDomain();
+    // Not view if there's no rfactor axis
+    if (!tv->domain()->hasViewLikeRFactor()) {
+      continue;
+    }
+
+    // Take the view transformations and add all the splits. Those splits are
+    // the only divisible splits.
+    auto view_exprs =
+        StmtSort::getExprs(fusion, {rfactor_dom.begin(), rfactor_dom.end()});
+    auto split_exprs = ir_utils::filterByType<Split>(view_exprs);
+    all_divisible_splits.insert(split_exprs.begin(), split_exprs.end());
+  }
+
+  // Vectorized dimensions are enforced to be a result of divisible splits.
+  // Gather vectorized splits.
+  for (auto tv : all_tvs) {
+    auto vec_id_it = std::find_if(
+        tv->domain()->domain().begin(),
+        tv->domain()->domain().end(),
+        [](IterDomain* id) {
+          return isParallelTypeVectorize(id->getParallelType());
+        });
+
+    if (vec_id_it == tv->domain()->domain().end()) {
+      continue;
+    }
+
+    // We could have a case technically like:
+    // [8, 2] where we do:
+    // split(0, 2)
+    // merge(1)
+    // so it ends up as [4, 4]
+    // split(0, 2) must be divisible, but for now we're not going to capture
+    // cases like this. Just look for direct split's producing a vectorize
+    // dimension.
+    auto vec_id = *vec_id_it;
+    if (vec_id->definition() != nullptr && vec_id->definition()->isA<Split>()) {
+      all_divisible_splits.emplace(vec_id->definition()->as<Split>());
+    }
+  }
+
+  // If there's no view like splits, there's nothing to find
+  if (all_divisible_splits.empty()) {
+    return all_divisible_splits;
+  }
+
+  // Track the concrete id in the exact map of the outer output of the split
+  // expressions. This is how we'll check if there are matching splits. This
+  // also gets rid of any splits that already match (for processing).
+  std::unordered_map<IterDomain*, Expr*> outer_concrete_id_to_expr;
+
+  for (auto split : all_divisible_splits) {
+    outer_concrete_id_to_expr[ca_map->getConcreteMappedID(
+        split->outer(), IdMappingMode::EXACT)] = split;
+  }
+
+  std::unordered_set<Expr*> visited(
+      all_divisible_splits.begin(), all_divisible_splits.end());
+
+  // Find splits that match what we already have:
+  for (auto entry : outer_concrete_id_to_expr) {
+    auto concrete_id = entry.first;
+    auto original_view_split = entry.second;
+
+    const auto& exact_mapped_ids =
+        ca_map->idGraph().exactNodes().getDisjointSetOf(concrete_id).vector();
+    for (auto other_id : exact_mapped_ids) {
+      if (other_id->definition() == nullptr) {
+        continue;
+      }
+
+      if (!visited.emplace(other_id->definition()).second) {
+        // Already visited
+        continue;
+      }
+
+      if (IterDomainGraph::exprsMap(
+              original_view_split,
+              other_id->definition(),
+              false,
+              ca_map->idGraph().exactNodes())) {
+        all_divisible_splits.emplace(other_id->definition()->as<Split>());
+      }
+    }
+  }
+
+  return all_divisible_splits;
+}
+
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/lower_divisible_split.h b/torch/csrc/jit/codegen/cuda/lower_divisible_split.h
new file mode 100644
index 0000000000000..f2c4a78e4895e
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/lower_divisible_split.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <c10/macros/Export.h>
+
+#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
+#include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+
+// Looks through all transformations assocaited with view, or enforced divisible
+// vectorization splits and gathers all splits that provably don't have a
+// remainder, therefore the extents of the associated IterDomains do not require
+// a ceilDiv expressions.
+TORCH_CUDA_CU_API std::unordered_set<Split*> getAllDivisibleSplits(
+    Fusion* fusion);
+
+// Same as above but will use provided ComputeAtMap instead of building its own.
+TORCH_CUDA_CU_API std::unordered_set<Split*> getAllDivisibleSplits(
+    Fusion* fusion,
+    const ComputeAtMap* ca_map);
+
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
index c4a5beeeabee2..7f06aea2f5423 100644
--- a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
@@ -709,7 +709,7 @@ std::vector<IterDomain*> getLocalDomainOrdering(
   std::sort(
       merged_domain.begin(),
       merged_domain.end(),
-      IterDomainDependencySorter(
+      ir_utils::IterDomainDependencySorter(
           concrete_id_dependencies, GpuLower::current()->caMap()));
   return merged_domain;
 }
@@ -1398,6 +1398,9 @@ std::vector<Expr*> ExprSegmentationSorter::getExprs() const {
 
 std::vector<Expr*> reorderExprsForComputeAt() {
   auto fusion = FusionGuard::getCurFusion();
+  if (fusion->exprs().empty()) {
+    return {};
+  }
   TORCH_INTERNAL_ASSERT(fusion != nullptr);
   ExprSegmentationSorter sorter(fusion);
   sorter.sort();
diff --git a/torch/csrc/jit/codegen/cuda/lower_index.cpp b/torch/csrc/jit/codegen/cuda/lower_index.cpp
index dc210e98cbc8d..e83a0e9fce996 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_index.cpp
@@ -100,37 +100,73 @@ void IndexLowering::handle(const RNGOp* rop) {
 
   // TensorIndex for philox subsequence and component.
   auto philox_index = SimplifyingIrBuilder::create<kir::TensorIndex>(
-      out_tv, Index::getLinearIndex(out_tv, for_loops_));
+      out_tv, Index::getLinearLogicalIndex(out_tv, for_loops_));
 
-  // TensorIndex for writing randlike output.
+  // TensorIndex for writing rand_like output.
   const auto out = lowerDstIndex(out_tv);
 
   auto lowered = IrBuilder::create<RNGOp>(
-      rop->getRNGOpType(), out, rop->getRNGOffset(), philox_index);
+      rop->getRNGOpType(),
+      out,
+      rop->dtype(),
+      rop->getParameters(),
+      rop->getRNGOffset(),
+      philox_index);
 
   pushBack(lowered);
   GpuLower::current()->propagateExprInfo(rop, back());
 }
 
+void IndexLowering::handle(const FullOp* fop) {
+  auto out_tv = dynamic_cast<TensorView*>(fop->output(0));
+  TORCH_INTERNAL_ASSERT(out_tv != nullptr);
+
+  // TensorIndex for writing output.
+  const auto out = lowerDstIndex(out_tv);
+  auto lowered =
+      IrBuilder::create<FullOp>(out, fop->getFillValue(), fop->dtype());
+
+  pushBack(lowered);
+  GpuLower::current()->propagateExprInfo(fop, back());
+}
+
 void IndexLowering::handle(const ARangeOp* aop) {
   // Write linear tensor indices into the consumer
   //  tensor index if the output is a tensor.
   auto out_tv = dynamic_cast<TensorView*>(aop->output(0));
   TORCH_INTERNAL_ASSERT(out_tv != nullptr);
 
-  // TensorIndex for philox subsequence and component.
+  // linear index for computing arange output
   auto linear_index = SimplifyingIrBuilder::create<kir::TensorIndex>(
-      out_tv, Index::getLinearIndex(out_tv, for_loops_));
+      out_tv, Index::getLinearLogicalIndex(out_tv, for_loops_));
 
-  // TensorIndex for writing randlike output.
+  // TensorIndex for writing arange output.
   const auto out = lowerDstIndex(out_tv);
   auto lowered = IrBuilder::create<ARangeOp>(
-      out, aop->start(), aop->end(), aop->step(), linear_index);
+      out, aop->start(), aop->end(), aop->step(), aop->dtype(), linear_index);
 
   pushBack(lowered);
   GpuLower::current()->propagateExprInfo(aop, back());
 }
 
+void IndexLowering::handle(const EyeOp* eop) {
+  auto out_tv = dynamic_cast<TensorView*>(eop->output(0));
+  TORCH_INTERNAL_ASSERT(out_tv != nullptr);
+
+  // linear index for computing eye output
+  auto indices = Index::getPerDimLogicalIndex(out_tv, for_loops_);
+  TORCH_INTERNAL_ASSERT(indices.size() == 2);
+  auto index1 = indices[0];
+  auto index2 = indices[1];
+
+  // TensorIndex for writing eye output.
+  const auto out = lowerDstIndex(out_tv);
+  auto lowered = IrBuilder::create<EyeOp>(out, eop->dtype(), index1, index2);
+
+  pushBack(lowered);
+  GpuLower::current()->propagateExprInfo(eop, back());
+}
+
 void IndexLowering::handle(const UnaryOp* uop) {
   const auto in = lowerSrcIndex(uop->in(), uop->out());
   const auto out = lowerDstIndex(uop->out());
@@ -375,10 +411,12 @@ void IndexLowering::handleBlockReduction(
   ReductionOp* indexed_rop = IrBuilder::create<ReductionOp>(
       rop->getReductionOpType(), rop->init(), out, in, rop->isAllreduce());
   if (rop->predicate()) {
-    indexed_rop->setPredicate(rop->predicate());
+    indexed_rop =
+        indexed_rop->withPredicate(rop->predicate())->as<ReductionOp>();
   }
   if (rop->writePredicate()) {
-    indexed_rop->setWritePredicate(rop->writePredicate());
+    indexed_rop = indexed_rop->withWritePredicate(rop->writePredicate())
+                      ->as<ReductionOp>();
   }
 
   pushBack(indexed_rop);
@@ -457,13 +495,15 @@ void IndexLowering::handleGridReduction(
       n_entrances,
       rop->isAllreduce());
 
-  grid_reduction->setThreadPredicate(thread_pred);
+  grid_reduction = grid_reduction->withThreadPredicate(thread_pred);
 
   if (rop->predicate()) {
-    grid_reduction->setPredicate(rop->predicate());
+    grid_reduction = grid_reduction->withPredicate(rop->predicate())
+                         ->as<kir::GridReduction>();
   }
   if (rop->writePredicate()) {
-    grid_reduction->setWritePredicate(rop->writePredicate());
+    grid_reduction = grid_reduction->withWritePredicate(rop->writePredicate())
+                         ->as<kir::GridReduction>();
   }
 
   pushBack(grid_reduction);
@@ -520,10 +560,12 @@ void IndexLowering::handleBlockReduction(
       inputs,
       grouped_rop->isAllreduce());
   if (grouped_rop->predicate()) {
-    indexed_rop->setPredicate(grouped_rop->predicate());
+    indexed_rop = indexed_rop->withPredicate(grouped_rop->predicate())
+                      ->as<GroupedReductionOp>();
   }
   if (grouped_rop->writePredicate()) {
-    indexed_rop->setWritePredicate(grouped_rop->writePredicate());
+    indexed_rop = indexed_rop->withWritePredicate(grouped_rop->writePredicate())
+                      ->as<GroupedReductionOp>();
   }
 
   pushBack(indexed_rop);
@@ -602,13 +644,16 @@ void IndexLowering::handleGridReduction(
       work_buf_size_info.buffer_stride,
       grouped_rop->isAllreduce());
 
-  grid_reduction->setThreadPredicate(thread_pred);
+  grid_reduction = grid_reduction->withThreadPredicate(thread_pred);
 
   if (grouped_rop->predicate()) {
-    grid_reduction->setPredicate(grouped_rop->predicate());
+    grid_reduction = grid_reduction->withPredicate(grouped_rop->predicate())
+                         ->as<kir::GroupedGridReduction>();
   }
   if (grouped_rop->writePredicate()) {
-    grid_reduction->setWritePredicate(grouped_rop->writePredicate());
+    grid_reduction =
+        grid_reduction->withWritePredicate(grouped_rop->writePredicate())
+            ->as<kir::GroupedGridReduction>();
   }
 
   pushBack(grid_reduction);
@@ -670,10 +715,11 @@ void IndexLowering::handle(const WelfordOp* wop) {
       wop->isAllreduce());
 
   if (wop->predicate()) {
-    indexed_wop->setPredicate(wop->predicate());
+    indexed_wop = indexed_wop->withPredicate(wop->predicate())->as<WelfordOp>();
   }
   if (wop->writePredicate()) {
-    indexed_wop->setWritePredicate(wop->writePredicate());
+    indexed_wop =
+        indexed_wop->withWritePredicate(wop->writePredicate())->as<WelfordOp>();
   }
 
   // Serial welford
@@ -749,22 +795,27 @@ void IndexLowering::handleGridWelford(WelfordOp* indexed_wop) {
       entrance_ind,
       n_entrances);
 
-  grid_welford->setThreadPredicate(thread_pred);
+  grid_welford = grid_welford->withThreadPredicate(thread_pred);
 
   const bool block_reduce_separated =
       out_domain->hasBlockReduction() && !indexed_wop->isAllreduce();
 
   if (indexed_wop->predicate()) {
     if (block_reduce_separated) {
-      grid_welford->setPredicate(IrBuilder::create<kir::Predicate>(
-          GpuLower::current()->kernel()->trueVal()));
+      grid_welford = grid_welford
+                         ->withPredicate(IrBuilder::create<kir::Predicate>(
+                             GpuLower::current()->kernel()->trueVal()))
+                         ->as<kir::GridWelford>();
     } else {
-      grid_welford->setPredicate(indexed_wop->predicate());
+      grid_welford = grid_welford->withPredicate(indexed_wop->predicate())
+                         ->as<kir::GridWelford>();
     }
   }
 
   if (indexed_wop->writePredicate()) {
-    grid_welford->setWritePredicate(indexed_wop->writePredicate());
+    grid_welford =
+        grid_welford->withWritePredicate(indexed_wop->writePredicate())
+            ->as<kir::GridWelford>();
   }
 
   if (block_reduce_separated) {
@@ -909,13 +960,15 @@ void IndexLowering::handleGroupedGridWelford(
       work_buf_size_info.buffer_stride,
       op->isAllreduce());
 
-  indexed_op->setThreadPredicate(thread_pred);
+  indexed_op = indexed_op->withThreadPredicate(thread_pred);
 
   if (op->predicate()) {
-    indexed_op->setPredicate(op->predicate());
+    indexed_op = indexed_op->withPredicate(op->predicate())
+                     ->as<kir::GroupedGridWelford>();
   }
   if (op->writePredicate()) {
-    indexed_op->setWritePredicate(op->writePredicate());
+    indexed_op = indexed_op->withWritePredicate(op->writePredicate())
+                     ->as<kir::GroupedGridWelford>();
   }
 
   pushBack(indexed_op);
@@ -929,7 +982,9 @@ void IndexLowering::handleGroupedGridWelford(
 void IndexLowering::handle(const LoadStoreOp* ldst) {
   const auto in = lowerSrcIndex(ldst->in(), ldst->out());
   const auto out = lowerDstIndex(ldst->out());
-  pushBack(IrBuilder::create<LoadStoreOp>(ldst->opType(), out, in));
+  auto new_ldst = IrBuilder::create<LoadStoreOp>(ldst->opType(), out, in)
+                      ->withPredicate(ldst->predicate());
+  pushBack(new_ldst);
   GpuLower::current()->propagateExprInfo(ldst, back());
 }
 
@@ -961,7 +1016,8 @@ void IndexLowering::handle(const BroadcastOp* bop) {
   const bool block_z = parallel_bitmap.get(ParallelType::BIDz);
 
   if (bop->predicate()) {
-    indexed_expr->setPredicate(bop->predicate());
+    indexed_expr =
+        indexed_expr->withPredicate(bop->predicate())->as<BroadcastOp>();
   }
 
   const bool grid_broadcast_needed = block_x || block_y || block_z;
@@ -988,7 +1044,8 @@ void IndexLowering::handle(const BroadcastOp* bop) {
       indexed_expr, work_buffer, sync_buffer);
 
   if (bop->predicate()) {
-    grid_broadcast->setPredicate(bop->predicate());
+    grid_broadcast = grid_broadcast->withPredicate(bop->predicate())
+                         ->as<kir::GridBroadcast>();
   }
 
   pushBack(grid_broadcast);
@@ -1040,7 +1097,7 @@ kir::Allocate* IndexLowering::allocateUniqueBuffer(
 
   // No existing allocation found. Create a new one
   auto new_buffer =
-      ir_utils::allocGlobalBufferForGridComm(buffer_size, dtype, zero_init);
+      lower_utils::allocGlobalBufferForGridComm(buffer_size, dtype, zero_init);
 
   // Keep track of the allocation
   alloc_map.emplace(out_tv, new_buffer);
diff --git a/torch/csrc/jit/codegen/cuda/lower_index.h b/torch/csrc/jit/codegen/cuda/lower_index.h
index 75f7fd4aac335..6c08eeb195ea5 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index.h
+++ b/torch/csrc/jit/codegen/cuda/lower_index.h
@@ -38,7 +38,9 @@ class TORCH_CUDA_CU_API IndexLowering : private OptOutConstDispatch {
   // Insert an expression before the current top-level expression.
   void insertAtTopLevel(Expr* expr);
 
+  void handle(const FullOp*) final;
   void handle(const ARangeOp*) final;
+  void handle(const EyeOp*) final;
   void handle(const ViewAsScalar*) final;
   void handle(const UnaryOp*) final;
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
index 2d4444d340903..140fecc0f8af1 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
@@ -101,7 +101,7 @@ struct IndexingParameters {
 };
 
 // Initial loop index map for global producer or consumer case.
-IndexingParameters getGlobalIndexParameters(
+IndexingParameters getLinearIndexParameters(
     const LoopIndexing& loop_indexing,
     bool index_producer = false) {
   IndexingParameters index_parameters;
@@ -112,7 +112,8 @@ IndexingParameters getGlobalIndexParameters(
 
   for (auto loop_idx : c10::irange(loops.size())) {
     auto loop = loops[loop_idx];
-    auto index_domain = ir_utils::caMapExactConcreteId(loop_domain[loop_idx]);
+    auto index_domain = GpuLower::current()->caMap()->getConcreteMappedID(
+        loop_domain[loop_idx], IdMappingMode::EXACT);
     if (loop->isTrivial()) {
       // This is useful information in the case of
       //  MisalignedVectorize and double buffer epilog, etc.
@@ -125,7 +126,8 @@ IndexingParameters getGlobalIndexParameters(
 
   // Derive the halo extents from the loop indexing result.
   index_parameters.concrete_id_to_halo_extent =
-      GpuLower::current()->haloInfo().buildConcreteHaloExtentMap(loop_indexing);
+      GpuLower::current()->haloInfo()->buildConcreteHaloExtentMap(
+          loop_indexing);
 
   protectNonPredicateIndexWithMagicZero(
       loops,
@@ -148,7 +150,9 @@ IndexingParameters getGlobalIndexParameters(
 
         auto loop_id = loop_indexing.loopDomains()[loop_idx];
 
-        auto concrete_loop_id = ir_utils::caMapExactConcreteId(loop_id);
+        auto concrete_loop_id =
+            GpuLower::current()->caMap()->getConcreteMappedID(
+                loop_id, IdMappingMode::EXACT);
 
         auto stage_depth =
             GpuLower::current()->doubleBufferInfo().getStageDepthFor(
@@ -185,7 +189,7 @@ IndexingParameters getNonGlobalInitialIndexParameters(
   }
 
   auto alloc_tv = index_producer ? producer_tv : consumer_tv;
-  auto alloc_info = loop_utils::getAllocInformation(
+  auto alloc_info = lower_utils::getAllocInformation(
       alloc_tv, loops, alloc_id_map, index_producer);
 
   std::unordered_map<kir::ForLoop*, Val*> loop_to_ind_map;
@@ -216,7 +220,9 @@ IndexingParameters getNonGlobalInitialIndexParameters(
     auto loop = loops[loop_idx];
     auto loop_domain = loop_domains[loop_idx];
 
-    auto concrete_loop_domain = ir_utils::caMapExactConcreteId(loop_domain);
+    auto concrete_loop_domain =
+        GpuLower::current()->caMap()->getConcreteMappedID(
+            loop_domain, IdMappingMode::EXACT);
 
     index_parameters.initial_concrete_id_index[concrete_loop_domain] =
         loop_to_ind_map.at(loop);
@@ -233,7 +239,8 @@ IndexingParameters getNonGlobalInitialIndexParameters(
 
   // Derive the halo extents from the loop indexing result.
   index_parameters.concrete_id_to_halo_extent =
-      GpuLower::current()->haloInfo().buildConcreteHaloExtentMap(loop_indexing);
+      GpuLower::current()->haloInfo()->buildConcreteHaloExtentMap(
+          loop_indexing);
 
   return index_parameters;
 }
@@ -397,7 +404,8 @@ IndexingParameters getPredicateInitialIndexParameters(
   for (int loop_idx : c10::irange(loops.size())) {
     auto loop = loops.at(loop_idx);
     auto concrete_loop_domain =
-        ir_utils::caMapExactConcreteId(loop_domains.at(loop_idx));
+        GpuLower::current()->caMap()->getConcreteMappedID(
+            loop_domains.at(loop_idx), IdMappingMode::EXACT);
     index_parameters.initial_concrete_id_index[concrete_loop_domain] =
         loop_to_ind_map.at(loop);
   }
@@ -408,7 +416,8 @@ IndexingParameters getPredicateInitialIndexParameters(
 
   // Derive the halo extents from the loop indexing result.
   index_parameters.concrete_id_to_halo_extent =
-      GpuLower::current()->haloInfo().buildConcreteHaloExtentMap(loop_indexing);
+      GpuLower::current()->haloInfo()->buildConcreteHaloExtentMap(
+          loop_indexing);
 
   return index_parameters;
 }
@@ -563,7 +572,10 @@ LoopIndexingAnalysis::LoopIndexingAnalysis(
   // consume each concrete id once so this map is well defined.
   for (auto expr : replayed_exprs_) {
     for (auto input_id : ir_utils::filterByType<IterDomain>(expr->inputs())) {
-      concrete_id_to_consumer_[ir_utils::caMapExactConcreteId(input_id)] = expr;
+      auto concrete_input_id =
+          GpuLower::current()->caMap()->getConcreteMappedID(
+              input_id, IdMappingMode::EXACT);
+      concrete_id_to_consumer_[concrete_input_id] = expr;
     }
   }
 
@@ -595,7 +607,8 @@ void LoopIndexingAnalysis::validateLoopStructure(
   for (auto it_i = loops.begin(); it_i != loops.end(); ++it_i) {
     // Largely duplicating original logic
     auto loop_id = (*it_i)->iter_domain();
-    auto concrete_loop_id = ir_utils::caMapExactConcreteId(loop_id);
+    auto concrete_loop_id = GpuLower::current()->caMap()->getConcreteMappedID(
+        loop_id, IdMappingMode::EXACT);
 
     TORCH_INTERNAL_ASSERT(
         !concrete_to_loop.count(concrete_loop_id),
@@ -659,13 +672,22 @@ void LoopIndexingAnalysis::traverseFromDomainVals() {
 }
 
 IterDomain* LoopIndexingAnalysis::concretizeAndVisitId(IterDomain* id) {
-  auto concrete_id = ir_utils::caMapExactConcreteId(id);
+  auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID(
+      id, IdMappingMode::EXACT);
   if (replayed_concrete_ids_.pushBack(concrete_id)) {
     concrete_to_original_id_[concrete_id] = id;
   }
   return concrete_id;
 }
 
+namespace {
+// Alias used for std::transform
+IterDomain* exactConcreteId(IterDomain* id) {
+  return GpuLower::current()->caMap()->getConcreteMappedID(
+      id, IdMappingMode::EXACT);
+}
+} // namespace
+
 void LoopIndexingAnalysis::visitExpr(Expr* expr) {
   if (auto swizzle2d = dynamic_cast<Swizzle2D*>(expr)) {
     // Swizzle outputs are already forwarded through
@@ -700,14 +722,14 @@ void LoopIndexingAnalysis::visitExpr(Expr* expr) {
       consumed_ids.begin(),
       consumed_ids.end(),
       std::inserter(consumed_concrete_, consumed_concrete_.end()),
-      ir_utils::caMapExactConcreteId);
+      exactConcreteId);
 
   auto produced_ids = ir_utils::filterByType<IterDomain>(expr->outputs());
   std::transform(
       produced_ids.begin(),
       produced_ids.end(),
       std::inserter(produced_concrete_, produced_concrete_.end()),
-      ir_utils::caMapExactConcreteId);
+      exactConcreteId);
 }
 
 bool LoopIndexingAnalysis::visitIdsAndCheckDuplication(
@@ -732,8 +754,36 @@ void LoopIndexingAnalysis::constructLoopDomains() {
               !concrete_id_to_consumer_.count(concrete_id) &&
               // Use permissive map so the selected ID indeed represents the
               // loop.
-              GpuLower::current()->caMap()->areMapped(
-                  concrete_id, loop_id, IdMappingMode::PERMISSIVE);
+              // Note: see PR https://github.com/csarofeen/pytorch/pull/1960
+              //  and issue https://github.com/csarofeen/pytorch/issues/1873
+              // This mapping look up is part of a staged indexing scheme.
+              //  When we find a replayed exact id that exactly map to the loop
+              //  id, this means that we can resolve indexing involved in this
+              //  loop "locally", i.e. only with and with only the iterdomains
+              //  on the
+              //
+              //  given consumer tv.
+              //  When we cannot find an exact mapping, the permissive mapping
+              //  would
+              //   help defering the indexing resolution for this loop nest
+              //   level to other iterdomain expressions from tv's that are
+              //   further concretized and usually they are further down the
+              //   consumer chain of the given consumer tv.
+              //
+              //  Intuitively exact mapping of two iterdomains should imply
+              //  permissive mapping
+              //   of them as well and if that was the case, only looking up
+              //   permissive mapping would be enough to address both of the
+              //   cases above.
+              //  FIXME: But currently exact mapping does not imply permissive
+              //  mapping (See issue:
+              //       https://github.com/csarofeen/pytorch/issues/1963)
+              // Which means we should check both exact and permissive mapping
+              // here.
+              (GpuLower::current()->caMap()->areMapped(
+                   concrete_id, loop_id, IdMappingMode::EXACT) ||
+               GpuLower::current()->caMap()->areMapped(
+                   concrete_id, loop_id, IdMappingMode::PERMISSIVE));
         });
 
     TORCH_INTERNAL_ASSERT(
@@ -769,7 +819,8 @@ void LoopIndexingAnalysis::constructLoopDomains() {
   // will complain for not having all outputs of the traversal.
   for (auto id : ir_utils::filterByType<IterDomain>(all_ids_from_root)) {
     if (id->uses().empty()) {
-      loop_domains_.pushBack(ir_utils::caMapExactConcreteId(id));
+      loop_domains_.pushBack(GpuLower::current()->caMap()->getConcreteMappedID(
+          id, IdMappingMode::EXACT));
     }
   }
 }
@@ -797,7 +848,7 @@ IndexFromIdGraph getTensorIndexFromIdGraph(
   }
 
   if (is_global) {
-    index_parameters = getGlobalIndexParameters(loop_indexing, index_producer);
+    index_parameters = getLinearIndexParameters(loop_indexing, index_producer);
   } else {
     index_parameters = getNonGlobalInitialIndexParameters(
         loop_indexing, consumer_tv, index_producer, producer_tv, p2c_map);
@@ -849,7 +900,8 @@ IndexFromIdGraph getTensorIndexFromIdGraph(
 
     // Exact id will have to be pulled from consumer side as the
     //  producer side are replayed ids.
-    auto exact_concrete_id = ir_utils::caMapExactConcreteId(consumer_id);
+    auto exact_concrete_id = GpuLower::current()->caMap()->getConcreteMappedID(
+        consumer_id, IdMappingMode::EXACT);
 
     index_update_map[exact_concrete_id] = target_id;
 
@@ -864,7 +916,12 @@ IndexFromIdGraph getTensorIndexFromIdGraph(
       target_tv->domain()->domain(),
       target_tv->getMaybeRFactorDomain(),
       target_tv->domain()->contiguity(),
-      initial_indexable_map,
+      {},
+      indexing.indexMap(),
+      GpuLower::current()->divisbleSplitSet(),
+      GpuLower::current()->caMap(),
+      GpuLower::current()->haloInfo(),
+      GpuLower::current()->concretizedBroadcastDomains(),
       p2c_map);
 
   auto target_indexing = indexing.updateIndexCompute(
@@ -930,18 +987,16 @@ IndexFromIdGraph getPredicateIndexingFromIdGraph(
        ir_utils::filterByType<IterDomain>(all_consumer_vals)) {
     // Track the non-concrete id we were trying to bind index
     //  to, whether from producer or consumer.
-    auto exact_concrete_id = ir_utils::caMapExactConcreteId(consumer_id);
+    auto exact_concrete_id = GpuLower::current()->caMap()->getConcreteMappedID(
+        consumer_id, IdMappingMode::EXACT);
     index_update_map[exact_concrete_id] = consumer_id;
   }
 
-  // No contiguity info is used in the predicate indexing pass,
-  //  the predicate generation logic that uses the index math
-  //  generated here will take contiguity into account.
-  ContigIDs contig_finder(
-      consumer_tv->domain()->domain(),
-      consumer_tv->getMaybeRFactorDomain(),
-      std::vector<bool>(consumer_tv->getMaybeRFactorDomain().size(), false),
-      {});
+  // No contiguity info is used in the predicate indexing pass, the predicate
+  // generation logic that uses the index math generated here will take
+  // contiguity into account. Send an empty ContigID class so nothing is marked
+  // as contiguous.
+  auto contig_finder = ContigIDs::getNonContigIDs();
 
   // Run second backward traversal to map back to the consumer_tv
   auto target_indexing = indexing.updateIndexCompute(
@@ -1009,7 +1064,8 @@ LoopIndexingTraversal::LoopIndexingTraversal(
     auto next_ids =
         ir_utils::filterByType<IterDomain>(nextValsInTraversalOrder(expr));
     for (auto id : next_ids) {
-      auto concrete_id = ir_utils::caMapExactConcreteId(id);
+      auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID(
+          id, IdMappingMode::EXACT);
       TORCH_INTERNAL_ASSERT(
           concrete_id_to_dependency_.insert(std::make_pair(concrete_id, expr))
               .second,
@@ -1077,7 +1133,8 @@ std::vector<Expr*> LoopIndexingTraversal::getExprList() {
     for (auto prev_id :
          ir_utils::filterByType<IterDomain>(prevValsInTraversalOrder(top))) {
       auto prev_expr_it = concrete_id_to_dependency_.find(
-          ir_utils::caMapExactConcreteId(prev_id));
+          GpuLower::current()->caMap()->getConcreteMappedID(
+              prev_id, IdMappingMode::EXACT));
       if (prev_expr_it != concrete_id_to_dependency_.end()) {
         auto prev_expr = prev_expr_it->second;
         if (!visited.count(prev_expr)) {
@@ -1114,7 +1171,7 @@ void LoopIndexingAnalysis::collectOutOfLineExprs() {
           consumer_tv_->getComputeAtPosition(),
       consumer_tv_->domain()->domain().end(),
       std::inserter(out_of_line_ids, out_of_line_ids.end()),
-      ir_utils::caMapExactConcreteId);
+      exactConcreteId);
 
   // Get the original selected list of index expressions
   //  in reverse topological order.
@@ -1129,7 +1186,9 @@ void LoopIndexingAnalysis::collectOutOfLineExprs() {
             id_outputs.begin(),
             id_outputs.end(),
             [&out_of_line_ids](IterDomain* id) {
-              return out_of_line_ids.count(ir_utils::caMapExactConcreteId(id));
+              return out_of_line_ids.count(
+                  GpuLower::current()->caMap()->getConcreteMappedID(
+                      id, IdMappingMode::EXACT));
             })) {
       // Record out of line expression
       out_of_line_exprs_.push_back(expr);
@@ -1140,7 +1199,7 @@ void LoopIndexingAnalysis::collectOutOfLineExprs() {
           id_inputs.begin(),
           id_inputs.end(),
           std::inserter(out_of_line_ids, out_of_line_ids.end()),
-          ir_utils::caMapExactConcreteId);
+          exactConcreteId);
     }
   }
 }
@@ -1161,14 +1220,14 @@ std::unordered_set<IterDomain*> LoopIndexing::getAllExactConcreteIdSet() const {
         out_ids.begin(),
         out_ids.end(),
         std::inserter(all_id_set, all_id_set.end()),
-        ir_utils::caMapExactConcreteId);
+        exactConcreteId);
 
     auto in_ids = ir_utils::filterByType<IterDomain>(expr->inputs());
     std::transform(
         in_ids.begin(),
         in_ids.end(),
         std::inserter(all_id_set, all_id_set.end()),
-        ir_utils::caMapExactConcreteId);
+        exactConcreteId);
   }
   return all_id_set;
 }
@@ -1213,7 +1272,9 @@ class LoopIndexingPreferredPathCompute : public IterVisitor {
         }
         mapped_id = c_id_it->second;
       }
-      auto concrete_original_id = ir_utils::caMapExactConcreteId(mapped_id);
+      auto concrete_original_id =
+          GpuLower::current()->caMap()->getConcreteMappedID(
+              mapped_id, IdMappingMode::EXACT);
       if (all_concrete_ids.count(concrete_original_id)) {
         if (original_id->isBroadcast() || original_id->isReduction() ||
             original_id->isStride()) {
@@ -1239,8 +1300,10 @@ class LoopIndexingPreferredPathCompute : public IterVisitor {
             all_iter_inputs.begin(),
             all_iter_inputs.end(),
             [&](IterDomain* inp_id) {
-              return this->preferred_path_.find(ir_utils::caMapExactConcreteId(
-                         inp_id)) != this->preferred_path_.end();
+              return this->preferred_path_.find(
+                         GpuLower::current()->caMap()->getConcreteMappedID(
+                             inp_id, IdMappingMode::EXACT)) !=
+                  this->preferred_path_.end();
             })) {
       auto all_iter_outputs = ir_utils::filterByType<IterDomain>(e->outputs());
 
@@ -1248,7 +1311,7 @@ class LoopIndexingPreferredPathCompute : public IterVisitor {
           all_iter_outputs.begin(),
           all_iter_outputs.end(),
           std::inserter(preferred_path_, preferred_path_.end()),
-          ir_utils::caMapExactConcreteId);
+          exactConcreteId);
     }
   }
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp
index 12b02d0b51ce3..86ca9d8427e78 100644
--- a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp
@@ -293,7 +293,7 @@ class WarSyncInserter : private kir::ExprMutator {
     auto maybe_aliased_tv = alloc_map_.getRealBuffer(tv);
     auto alloc_it = smem_allocations_.find(maybe_aliased_tv);
     auto ca_loop =
-        loop_utils::getAllocInformation(tv, for_loops_).init_for_loop;
+        lower_utils::getAllocInformation(tv, for_loops_).init_for_loop;
     if (alloc_it == smem_allocations_.end()) {
       WarMemoryInfo mem_info;
       mem_info.ca_loop = ca_loop;
@@ -486,7 +486,7 @@ class ReadAfterWriteSyncs : public kir::ExprMutator {
       Expr* sync_expr = nullptr;
       kir::Allocate* maybe_alloc = nullptr;
       if (sync_bitmap.hasBID()) {
-        maybe_alloc = ir_utils::allocGlobalBufferForGridComm(
+        maybe_alloc = lower_utils::allocGlobalBufferForGridComm(
             getGridSyncBufferSize(sync_bitmap), DataType::Int, true);
         sync_expr = IrBuilder::create<kir::GridSync>(
             sync_bitmap, maybe_alloc->buffer());
diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp
index 7fdb149da9359..0653296366ccc 100644
--- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp
@@ -33,7 +33,7 @@ LoopNestGenerator::LoopNestGenerator(const std::vector<Expr*>& exprs) {
 namespace {
 
 kir::ForLoop* openForHelper(kir::ForLoop* scope, IterDomain* id) {
-  auto extent_with_halo = GpuLower::current()->haloInfo().getExtent(id);
+  auto extent_with_halo = GpuLower::current()->haloInfo()->getExtent(id);
   kir::ForLoop* new_scope = nullptr;
   if (extent_with_halo) {
     // When an axis is extended with halo, unrolling and vectorization
@@ -252,7 +252,7 @@ void LoopNestGenerator::generate(const std::vector<Expr*>& exprs) {
     std::sort(
         loop_structure.rbegin(),
         loop_structure.rend(),
-        IterDomainDependencySorter(
+        ir_utils::IterDomainDependencySorter(
             concrete_id_dependencies, GpuLower::current()->caMap()));
     loop_structures_[tv] = loop_structure;
   }
diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp b/torch/csrc/jit/codegen/cuda/lower_predicate.cpp
index 989c00be81b78..7b0393d491572 100644
--- a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_predicate.cpp
@@ -20,7 +20,7 @@ namespace cuda {
 
 namespace {
 
-class ConditionalFromPredicateModifier : public kir::IrVisitor {
+class ConditionalFromPredicateModifier : public kir::ExprMutator {
  public:
   ConditionalFromPredicateModifier() = delete;
 
@@ -32,47 +32,58 @@ class ConditionalFromPredicateModifier : public kir::IrVisitor {
  private:
   ConditionalFromPredicateModifier(const std::vector<Expr*>& exprs) {
     FUSER_PERF_SCOPE(
-        "GpuLower::Lower::ConditionalFromPredicateModifier::process");
-    kir::IrVisitor::handle(exprs);
+        "ConditionalFromPredicateModifier::ConditionalFromPredicateModifier");
+    traverseAndInsert(exprs);
   }
 
-  using kir::IrVisitor::handle;
+  using kir::ExprMutator::handle;
 
   void handle(Expr* expr) final {
     if (expr != nullptr && expr->predicate() != nullptr) {
       // Replace expr predicate with bool conditional
       auto conditional = generateConditional(expr->predicate());
       if (expr->predicate()->predicate_type() == PredicateType::Vectorize) {
-        // TODO: This logic doesn't seem to fit well here, for unswitch the
-        // logic is in the unroll loop to set the thread predicate to the expr.
-        // I didn't have a quick way to do that so placing this here for now.
-        TORCH_INTERNAL_ASSERT(
-            expr->isA<kir::IfThenElse>(),
-            "Predicate handling expects ITE statement.");
-        auto ite = expr->as<kir::IfThenElse>();
-
-        TORCH_INTERNAL_ASSERT(
-            ite->thenBody().size() == 1,
-            "Expecting predicated body to only have one vectorized expression.");
-        auto vec_expr = ite->thenBody()[0];
-        TORCH_INTERNAL_ASSERT(
-            vec_expr->isA<UnaryOp>() || vec_expr->isA<LoadStoreOp>(),
-            "Vectorize predicate exprs only supported on set operations.");
-        TORCH_INTERNAL_ASSERT(
-            ir_utils::isTvOp(vec_expr),
-            "Vectorize predicate exprs only supported on tensor view operations.");
-        if (!vec_expr->inputs()[0]->isConstScalar()) {
+        if (expr->isA<kir::IfThenElse>()) {
+          // TODO: This logic doesn't seem to fit well here, for unswitch the
+          // logic is in the unroll loop to set the thread predicate to the
+          // expr. I didn't have a quick way to do that so placing this here for
+          // now.
+          auto ite = expr->as<kir::IfThenElse>();
+
+          TORCH_INTERNAL_ASSERT(
+              ite->thenBody().size() == 1,
+              "Expecting predicated body to only have one vectorized expression.");
+          auto vec_expr = ite->thenBody()[0];
+          TORCH_INTERNAL_ASSERT(
+              vec_expr->isA<UnaryOp>() || vec_expr->isA<LoadStoreOp>(),
+              "Vectorize predicate exprs only supported on set operations.");
+          TORCH_INTERNAL_ASSERT(
+              ir_utils::isTvOp(vec_expr),
+              "Vectorize predicate exprs only supported on tensor view operations.");
+          if (!vec_expr->inputs()[0]->isConstScalar()) {
+            conditional = SimplifyingIrBuilder::andExpr(
+                              conditional,
+                              GpuLower::current()->threadPredMap().getPredicate(
+                                  ir_utils::getTvOutput(vec_expr)))
+                              ->as<Bool>();
+          }
+        } else {
+          TORCH_INTERNAL_ASSERT(lower_utils::supportInlinePredicate(expr));
+          auto thread_pred = GpuLower::current()->threadPredMap().getPredicate(
+              ir_utils::getTvOutput(expr));
+          TORCH_INTERNAL_ASSERT(
+              thread_pred->isConst() && thread_pred->value().value());
           conditional = SimplifyingIrBuilder::andExpr(
                             conditional,
                             GpuLower::current()->threadPredMap().getPredicate(
-                                ir_utils::getTvOutput(vec_expr)))
+                                ir_utils::getTvOutput(expr)))
                             ->as<Bool>();
         }
       }
       TORCH_INTERNAL_ASSERT(conditional != nullptr);
       expr->predicate()->setValue(conditional);
       TORCH_INTERNAL_ASSERT(expr->predicate()->value() != nullptr);
-      setWritePredicate(expr, conditional);
+      setWritePredicate(expr);
     }
 
     // Note: [Predicate Inversion for CpAsync]
@@ -101,7 +112,7 @@ class ConditionalFromPredicateModifier : public kir::IrVisitor {
       invertPredicateForGmemToSharedMemInitialize(expr);
     }
 
-    kir::IrVisitor::handle(expr);
+    kir::ExprMutator::handle(expr);
   }
 
   // Invert the predicate of given expr.
@@ -123,7 +134,7 @@ class ConditionalFromPredicateModifier : public kir::IrVisitor {
         ir_utils::isCpAsyncInit(maybe_init.value());
   }
 
-  void setWritePredicate(Expr* expr, Bool* read_cond) {
+  void setWritePredicate(Expr* expr) {
     if (expr->writePredicate() != nullptr) {
       auto write_cond = generateConditional(expr->writePredicate());
       if (write_cond) {
@@ -131,7 +142,7 @@ class ConditionalFromPredicateModifier : public kir::IrVisitor {
       } else {
         // If generateConditional returns null, it means no specific
         // predicate needs to be used.
-        expr->setWritePredicate(nullptr);
+        registerReplace(expr, expr->withWritePredicate(nullptr));
       }
     }
   }
@@ -150,7 +161,7 @@ class ConditionalFromPredicateModifier : public kir::IrVisitor {
       ite->predicate()->setValue(conditional);
       TORCH_INTERNAL_ASSERT(ite->predicate()->value() != nullptr);
     }
-    kir::IrVisitor::handle(ite);
+    kir::ExprMutator::handle(ite);
   }
 
   // Generate conditional according to PredicateType
diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp
index 940de32ce9567..38df8229bb777 100644
--- a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp
@@ -303,12 +303,12 @@ class PredicateChcker : public IterVisitor {
 
   // Shift is not supported yet.
   bool predicateShift(Expr* expr) const {
-    auto& halo_info = GpuLower::current()->haloInfo();
+    auto halo_info = GpuLower::current()->haloInfo();
     auto input_tvs = ir_utils::filterByType<TensorView>(expr->inputs());
-    return halo_info.needsShiftPredicate(expr) ||
+    return halo_info->needsShiftPredicate(expr) ||
         std::any_of(input_tvs.begin(), input_tvs.end(), [&](auto input_tv) {
              return input_tv->definition() != nullptr &&
-                 halo_info.needsShiftPredicate(input_tv->definition());
+                 halo_info->needsShiftPredicate(input_tv->definition());
            });
   }
 
@@ -991,7 +991,7 @@ Val* PredicateElimination::getInitValue(TensorView* tv) const {
 }
 
 void PredicateElimination::build(Fusion* fusion) {
-  traverseFrom(fusion, fusion->outputs());
+  traverseTo(fusion, fusion->outputs());
 }
 
 std::string PredicateElimination::toString() const {
diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.cpp b/torch/csrc/jit/codegen/cuda/lower_shift.cpp
index fe1e0cc509c13..2a7c04243f4cf 100644
--- a/torch/csrc/jit/codegen/cuda/lower_shift.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_shift.cpp
@@ -17,7 +17,7 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-void ShiftPredicateInserter::insert(
+Expr* ShiftPredicateInserter::insert(
     Expr* expr,
     const std::vector<kir::ForLoop*>& loops,
     Bool* thread_pred,
@@ -28,9 +28,9 @@ void ShiftPredicateInserter::insert(
   TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output");
 
   const bool needs_shift_predicate =
-      gpu_lower->haloInfo().needsShiftPredicate(out_tv->definition());
+      gpu_lower->haloInfo()->needsShiftPredicate(out_tv->definition());
   if (!needs_shift_predicate) {
-    return;
+    return expr;
   }
 
   // The conditional branches to create:
@@ -56,9 +56,8 @@ void ShiftPredicateInserter::insert(
   // If the expr involves a thread-block barrier, set the predicate of
   // the expr with shift_pred. Since the expr is not shift, the
   // padding is safe to omit.
-  if (ir_utils::hasBlockSync(expr, gpu_lower->threadPredMap())) {
-    expr->setPredicate(shift_pred);
-    return;
+  if (lower_utils::hasBlockSync(expr, gpu_lower->threadPredMap())) {
+    return expr->withPredicate(shift_pred);
   }
 
   auto shift_ite = IrBuilder::create<kir::IfThenElse>(shift_pred);
@@ -76,7 +75,7 @@ void ShiftPredicateInserter::insert(
 
   // No padding condition is required if this is within unswitch.
   if (within_unswitch) {
-    return;
+    return expr;
   }
 
   // Padding by zero
@@ -89,6 +88,8 @@ void ShiftPredicateInserter::insert(
   bounds_ite->thenBody().push_back(pad_expr);
   // Insert the else block
   shift_ite->elseBody().push_back(bounds_ite);
+
+  return expr;
 }
 
 int AxisHaloInfo::width() const {
@@ -145,13 +146,6 @@ const AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) const {
   return it->second;
 }
 
-AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) {
-  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-  return const_cast<AxisHaloInfo&>(
-      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-      const_cast<const HaloInfo*>(this)->getRootAxisInfo(id));
-}
-
 void HaloInfo::setRootAxisInfo(
     IterDomain* id,
     const AxisHaloInfo& root_axis_info) {
@@ -161,7 +155,9 @@ void HaloInfo::setRootAxisInfo(
   return;
 }
 
-void HaloInfo::build(Fusion* fusion) {
+HaloInfo::HaloInfo(Fusion* fusion, std::shared_ptr<const ComputeAtMap> ca_map)
+    // Make a copy of the permissive map for extent comparators
+    : permissive_map_(ca_map->idGraph().permissiveNodes()) {
   const auto vals = fusion->usedMathVals();
   auto tvs = ir_utils::filterByType<TensorView>(vals);
 
@@ -202,7 +198,7 @@ void HaloInfo::build(Fusion* fusion) {
 
   // Note that validation requires consumer halo info
   for (auto tv : tvs) {
-    validate(tv);
+    validate(tv, ca_map);
   }
 }
 
@@ -445,8 +441,20 @@ void HaloInfo::build(TensorDomain* td) {
       } else {
         setHaloWidth(merge->out(), 0);
       }
-    } else if (expr->getExprType().value() == ExprType::Swizzle2D) {
+    } else if (auto swizzle = dynamic_cast<Swizzle2D*>(expr)) {
       // Assume no halo on swizzled domain for now.
+      TORCH_INTERNAL_ASSERT(
+          getExtent(swizzle->inX()) == nullptr,
+          "Halo is not supported with swizzle. Halo-extended ID: ",
+          swizzle->inX()->toString(),
+          " used in ",
+          swizzle->toString());
+      TORCH_INTERNAL_ASSERT(
+          getExtent(swizzle->inY()) == nullptr,
+          "Halo is not supported with swizzle. Halo-extended ID: ",
+          swizzle->inY()->toString(),
+          " used in ",
+          swizzle->toString());
       for (auto id : ir_utils::filterByType<IterDomain>(expr->outputs())) {
         setHaloWidth(id, 0);
       }
@@ -474,12 +482,13 @@ void HaloInfo::build(TensorDomain* td) {
 //! Other types of parallelization should be supported except for
 //! vectorization. Vectorization should be eventually supported but
 //! needs further work.
-void HaloInfo::validate(TensorView* tv) const {
+void HaloInfo::validate(
+    TensorView* tv,
+    std::shared_ptr<const ComputeAtMap> ca_map) const {
   const auto mem_type = tv->getMemoryType();
 
   for (auto axis : tv->domain()->domain()) {
-    auto concrete_id = GpuLower::current()->caMap()->getConcreteMappedID(
-        axis, IdMappingMode::LOOP);
+    auto concrete_id = ca_map->getConcreteMappedID(axis, IdMappingMode::LOOP);
 
     // The extent is assumed to be the same
     TORCH_INTERNAL_ASSERT(
@@ -526,7 +535,7 @@ void HaloInfo::validate(TensorView* tv) const {
           consumer->domain()->domain().begin(),
           consumer->domain()->domain().end(),
           [&](IterDomain* consumer_axis) {
-            return GpuLower::current()->caMap()->areMapped(
+            return ca_map->areMapped(
                 axis, consumer_axis, IdMappingMode::PERMISSIVE);
           });
       if (it == consumer->domain()->domain().end()) {
@@ -626,11 +635,10 @@ bool extentCompare(
     const HaloInfo& halo_map,
     IterDomain* id1,
     IterDomain* id2,
-    Cmp cmp) {
-  auto gpu_lower = GpuLower::current();
+    Cmp cmp,
+    const DisjointSets<IterDomain*>& permissive_map) {
   TORCH_INTERNAL_ASSERT(
-      gpu_lower->caMap()->areMapped(id1, id2, IdMappingMode::PERMISSIVE),
-      "Invalid axes to compare");
+      permissive_map.strictAreMapped(id1, id2), "Invalid axes to compare");
 
   // It's invalid to compare two axes and when only either of them has
   // halo.
@@ -652,10 +660,10 @@ bool extentCompare(
       auto merge2 = dynamic_cast<Merge*>(id2->definition());
       TORCH_INTERNAL_ASSERT(
           merge2 != nullptr, "Invalid comparison: ", id1, " and ", id2);
-      auto inner_le =
-          extentCompare(halo_map, merge1->inner(), merge2->inner(), cmp);
-      auto outer_le =
-          extentCompare(halo_map, merge1->outer(), merge2->outer(), cmp);
+      auto inner_le = extentCompare(
+          halo_map, merge1->inner(), merge2->inner(), cmp, permissive_map);
+      auto outer_le = extentCompare(
+          halo_map, merge1->outer(), merge2->outer(), cmp, permissive_map);
       return inner_le && outer_le;
     } else {
       // This is not considered. Should never reach here.
@@ -667,11 +675,11 @@ bool extentCompare(
 } // namespace
 
 bool HaloInfo::extentLessEqual(IterDomain* id1, IterDomain* id2) const {
-  return extentCompare(*this, id1, id2, std::less_equal<>());
+  return extentCompare(*this, id1, id2, std::less_equal<>(), permissive_map_);
 }
 
 bool HaloInfo::extentEqual(IterDomain* id1, IterDomain* id2) const {
-  return extentCompare(*this, id1, id2, std::equal_to<>());
+  return extentCompare(*this, id1, id2, std::equal_to<>(), permissive_map_);
 }
 
 std::string HaloInfo::toString() const {
@@ -722,19 +730,20 @@ bool HaloInfo::needsShiftPredicate(Expr* expr) const {
 }
 
 std::unordered_map<IterDomain*, Val*> HaloInfo::buildConcreteHaloExtentMap(
-    const LoopIndexing& loop_indexing) {
+    const LoopIndexing& loop_indexing) const {
   // Use a local workspace to avoid re-defining halo info.
-  HaloInfo local_halo_info;
+  HaloInfo local_halo_info = *GpuLower::current()->haloInfo();
 
-  auto& global_halo_info = GpuLower::current()->haloInfo();
+  auto global_halo_info = GpuLower::current()->haloInfo();
 
   // Setup root:
   for (auto consumer_root_id : loop_indexing.consumerTv()->getRootDomain()) {
     auto consumer_index_concrete_id =
-        ir_utils::caMapExactConcreteId(consumer_root_id);
+        GpuLower::current()->caMap()->getConcreteMappedID(
+            consumer_root_id, IdMappingMode::EXACT);
     local_halo_info.setRootAxisInfo(
         consumer_index_concrete_id,
-        global_halo_info.getRootAxisInfo(consumer_root_id));
+        global_halo_info->getRootAxisInfo(consumer_root_id));
   }
 
   // Track IDs that are generated by merging halo-extended IDs
@@ -747,7 +756,8 @@ std::unordered_map<IterDomain*, Val*> HaloInfo::buildConcreteHaloExtentMap(
           merged_shifted_ids.find(split->in()) == merged_shifted_ids.end(),
           "Splitting IterDomain that is a merged domain of halo-extended domains is not allowed");
 
-      auto in_id = ir_utils::caMapExactConcreteId(split->in());
+      auto in_id = GpuLower::current()->caMap()->getConcreteMappedID(
+          split->in(), IdMappingMode::EXACT);
 
       // If no halo info is found, nothing needs to be done. This ID
       // must be an ancestor of a domain set by setRootAxisInfo.
@@ -759,32 +769,43 @@ std::unordered_map<IterDomain*, Val*> HaloInfo::buildConcreteHaloExtentMap(
 
       if (halo_width == 0) {
         local_halo_info.setHaloWidth(
-            ir_utils::caMapExactConcreteId(split->outer()), 0);
+            GpuLower::current()->caMap()->getConcreteMappedID(
+                split->outer(), IdMappingMode::EXACT),
+            0);
         local_halo_info.setHaloWidth(
-            ir_utils::caMapExactConcreteId(split->inner()), 0);
+            GpuLower::current()->caMap()->getConcreteMappedID(
+                split->inner(), IdMappingMode::EXACT),
+            0);
         continue;
       }
 
       // propagate to inner domain
-      auto out_id = ir_utils::caMapExactConcreteId(split->inner());
+      auto out_id = GpuLower::current()->caMap()->getConcreteMappedID(
+          split->inner(), IdMappingMode::EXACT);
 
       auto expanded_extent =
           SimplifyingIrBuilder::addExpr(out_id->extent(), halo_width);
       local_halo_info.extent_map_.insert({out_id, expanded_extent});
 
       local_halo_info.setHaloWidth(
-          ir_utils::caMapExactConcreteId(split->outer()), 0);
+          GpuLower::current()->caMap()->getConcreteMappedID(
+              split->outer(), IdMappingMode::EXACT),
+          0);
       local_halo_info.setHaloWidth(
-          ir_utils::caMapExactConcreteId(split->inner()), halo_width);
+          GpuLower::current()->caMap()->getConcreteMappedID(
+              split->inner(), IdMappingMode::EXACT),
+          halo_width);
 
       // TODO: add support for inheritance map
     } else if (auto merge = dynamic_cast<Merge*>(expr)) {
       // If either of the two inputs has halo extension, propagate it
       // to the merged output ID
       auto inner_extent = local_halo_info.getExtent(
-          ir_utils::caMapExactConcreteId(merge->inner()));
+          GpuLower::current()->caMap()->getConcreteMappedID(
+              merge->inner(), IdMappingMode::EXACT));
       auto outer_extent = local_halo_info.getExtent(
-          ir_utils::caMapExactConcreteId(merge->outer()));
+          GpuLower::current()->caMap()->getConcreteMappedID(
+              merge->outer(), IdMappingMode::EXACT));
       if (inner_extent != nullptr || outer_extent != nullptr) {
         if (inner_extent == nullptr) {
           inner_extent = merge->inner()->extent();
@@ -795,28 +816,41 @@ std::unordered_map<IterDomain*, Val*> HaloInfo::buildConcreteHaloExtentMap(
         auto expanded_extent =
             SimplifyingIrBuilder::mulExpr(outer_extent, inner_extent);
         local_halo_info.extent_map_.insert(
-            {ir_utils::caMapExactConcreteId(merge->out()), expanded_extent});
+            {GpuLower::current()->caMap()->getConcreteMappedID(
+                 merge->out(), IdMappingMode::EXACT),
+             expanded_extent});
         // Splitting the output of this merge is not allowed, so
         // remember it
-        merged_shifted_ids.insert(ir_utils::caMapExactConcreteId(merge->out()));
+        merged_shifted_ids.insert(
+            GpuLower::current()->caMap()->getConcreteMappedID(
+                merge->out(), IdMappingMode::EXACT));
         // Note that halo_width_map_ is not updated
       } else {
-        setHaloWidth(ir_utils::caMapExactConcreteId(merge->out()), 0);
+        local_halo_info.setHaloWidth(
+            GpuLower::current()->caMap()->getConcreteMappedID(
+                merge->out(), IdMappingMode::EXACT),
+            0);
       }
     } else if (auto swizzle_2d = dynamic_cast<Swizzle2D*>(expr)) {
       // Swizzle with halo not yet supported, just set the width
       //  to zero at the moment.
       TORCH_INTERNAL_ASSERT(
           local_halo_info.getHaloWidth(
-              ir_utils::caMapExactConcreteId(swizzle_2d->inX())) == 0 &&
+              GpuLower::current()->caMap()->getConcreteMappedID(
+                  swizzle_2d->inX(), IdMappingMode::EXACT)) == 0 &&
               local_halo_info.getHaloWidth(
-                  ir_utils::caMapExactConcreteId(swizzle_2d->inY())) == 0,
+                  GpuLower::current()->caMap()->getConcreteMappedID(
+                      swizzle_2d->inY(), IdMappingMode::EXACT)) == 0,
           "Swizzle on ID with halo not yet supported.");
       TORCH_INTERNAL_ASSERT("Swizzle on ID with halo not yet supported.");
       local_halo_info.setHaloWidth(
-          ir_utils::caMapExactConcreteId(swizzle_2d->outX()), 0);
+          GpuLower::current()->caMap()->getConcreteMappedID(
+              swizzle_2d->outX(), IdMappingMode::EXACT),
+          0);
       local_halo_info.setHaloWidth(
-          ir_utils::caMapExactConcreteId(swizzle_2d->outY()), 0);
+          GpuLower::current()->caMap()->getConcreteMappedID(
+              swizzle_2d->outY(), IdMappingMode::EXACT),
+          0);
     } else {
       TORCH_INTERNAL_ASSERT(false, "Unsupported expr: ", expr);
     }
diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.h b/torch/csrc/jit/codegen/cuda/lower_shift.h
index d1500c5f9f203..f12410703d99d 100644
--- a/torch/csrc/jit/codegen/cuda/lower_shift.h
+++ b/torch/csrc/jit/codegen/cuda/lower_shift.h
@@ -61,23 +61,12 @@ class AxisHaloInfo {
 class TORCH_CUDA_CU_API HaloInfo {
  public:
   //! Scan a fusion and collect all information for lowering
-  void build(Fusion* fusion);
-
-  //! Build mappings of extent information of a TensorDomain
-  void build(TensorDomain* td);
+  HaloInfo(Fusion* fusion, std::shared_ptr<const ComputeAtMap> ca_map);
 
   //! Almost exact duplicate of build(TensorDomain* td), except that
   //!  the traversal was done on loop indexing expressions.
   std::unordered_map<IterDomain*, Val*> buildConcreteHaloExtentMap(
-      const LoopIndexing& loop_indexing);
-
-  //! Set initial AxisHaloInfo of a root axis
-  //!
-  //! The axis does not need to be a root domain in the case of
-  //! reference tensors. Reference tensors get halo information from
-  //! consumer root domains, which may correspond to rfactor domains
-  //! of tensors from which reference tensors are derived.
-  void setRootAxisInfo(IterDomain* id, const AxisHaloInfo& root_axis_info);
+      const LoopIndexing& loop_indexing) const;
 
   //! Returns true if id has the root halo information set by
   //! setRootAxisInfo.
@@ -88,7 +77,6 @@ class TORCH_CUDA_CU_API HaloInfo {
   //! This is only for root axes. It is an error to query with
   //! non-root axes.
   const AxisHaloInfo& getRootAxisInfo(IterDomain* id) const;
-  AxisHaloInfo& getRootAxisInfo(IterDomain* id);
 
   //! Query if an axis has a halo width.
   //!
@@ -139,10 +127,21 @@ class TORCH_CUDA_CU_API HaloInfo {
   std::string toString() const;
 
  private:
+  //! Build mappings of extent information of a TensorDomain
+  void build(TensorDomain* td);
+
   //! Propagate root axis information from outputs to inputs of an
   //! expression
   void propagateRootAxisInfo(Expr* expr);
 
+  //! Set initial AxisHaloInfo of a root axis
+  //!
+  //! The axis does not need to be a root domain in the case of
+  //! reference tensors. Reference tensors get halo information from
+  //! consumer root domains, which may correspond to rfactor domains
+  //! of tensors from which reference tensors are derived.
+  void setRootAxisInfo(IterDomain* id, const AxisHaloInfo& root_axis_info);
+
   //! Adds a domain to the halo inheritance map.
   //!
   //! A domain, child, is added to the same set as domain parent. Both
@@ -163,11 +162,15 @@ class TORCH_CUDA_CU_API HaloInfo {
   void initializeFromRootAxisInfo(IterDomain* id);
 
   //! Validate shift usage
-  void validate(TensorView* td) const;
+  void validate(TensorView* td, std::shared_ptr<const ComputeAtMap> ca_map)
+      const;
 
   void setHaloWidth(IterDomain* id, int halo_width);
 
  private:
+  // Copy the permissive map from the passed in compute at map
+  const DisjointSets<IterDomain*> permissive_map_;
+
   //! Halo information of root axes
   std::unordered_map<IterDomain*, AxisHaloInfo> root_axis_map_;
 
@@ -222,7 +225,7 @@ class ShiftPredicateInserter {
   //! the generated predicate. The branch structure is different from
   //! the usual predicated expression, so the insertion is also done
   //! here.
-  static void insert(
+  static Expr* insert(
       Expr* expr,
       const std::vector<kir::ForLoop*>& loops,
       Bool* thread_pred,
diff --git a/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp b/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp
index 497256b5f850e..9b8ccd4a77ae4 100644
--- a/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_sync_information.cpp
@@ -26,7 +26,7 @@ void validateParallelizationOfTensor(TensorView* tv) {
     // It doesn't matter if this axis is a non-concretized broadcast
     // TODO: merging broadcast and non-broadcast
     if (axis->isBroadcast() &&
-        !GpuLower::current()->concretizedBroadcastDomains().isConcretized(
+        !GpuLower::current()->concretizedBroadcastDomains()->isConcretized(
             axis)) {
       continue;
     }
@@ -195,7 +195,7 @@ void SyncMap::build(Fusion* fusion) {
               (!parallel_bcast_doms.get(consumer_ptype) ||
                !GpuLower::current()
                     ->concretizedBroadcastDomains()
-                    .isConcretized(consumer_axis))) {
+                    ->isConcretized(consumer_axis))) {
             continue;
           }
 
@@ -240,12 +240,12 @@ void SyncMap::build(Fusion* fusion) {
                     p_id, c_id, IdMappingMode::PERMISSIVE)) {
               const auto halo_info = GpuLower::current()->haloInfo();
 
-              if (halo_info.hasHaloWidth(p_id) !=
-                      halo_info.hasHaloWidth(c_id) ||
-                  (halo_info.hasHaloWidth(p_id) &&
-                   halo_info.hasHaloWidth(c_id) &&
-                   halo_info.getHaloWidth(p_id) !=
-                       halo_info.getHaloWidth(c_id))) {
+              if (halo_info->hasHaloWidth(p_id) !=
+                      halo_info->hasHaloWidth(c_id) ||
+                  (halo_info->hasHaloWidth(p_id) &&
+                   halo_info->hasHaloWidth(c_id) &&
+                   halo_info->getHaloWidth(p_id) !=
+                       halo_info->getHaloWidth(c_id))) {
                 raw_dims.set(parallel_type);
                 continue;
               }
@@ -410,33 +410,13 @@ void SyncMap::build(Fusion* fusion) {
             }
           }
 
-          // If same parallel type and mapped, no need for syncs unless
-          // producer is in smem, producer parallel type is a thread
-          // dimension, and consumer concretizes the dimension. This sync is
-          // due to the redundant predicate omission in lower thread
-          // predicate.
-          auto redundant_preds = GpuLower::current()
-                                     ->threadPredMap()
-                                     .getPredicateInfo(producer)
-                                     .redundant_types;
-
-          if (p_id->isBroadcast() &&
-              GpuLower::current()->concretizedBroadcastDomains().isConcretized(
-                  p_id) &&
-              producer->getMemoryType() == MemoryType::Shared &&
-              redundant_preds.hasTID()) {
-            redundant_preds.clearAllBID();
-            raw_dims |= redundant_preds;
-            continue;
-          }
-
           // When the producer axis is a broadcast, it is not really
           // parallelized unless thread-predicated and concretized
           if (isParallelTypeThread(producer_ptype) && p_id->isBroadcast() &&
               (!parallel_bcast_doms.get(producer_ptype) ||
                !GpuLower::current()
                     ->concretizedBroadcastDomains()
-                    .isConcretized(p_id))) {
+                    ->isConcretized(p_id))) {
             continue;
           }
 
@@ -483,7 +463,7 @@ void SyncMap::build(Fusion* fusion) {
       } // end for consumers
 
       if (raw_dims.any()) {
-        needs_raw_sync_[producer] = raw_dims;
+        needs_raw_sync_[producer] |= raw_dims;
       }
 
     } // end producer
@@ -492,10 +472,14 @@ void SyncMap::build(Fusion* fusion) {
 
 std::string SyncMap::toString() const {
   std::stringstream ss;
-  ss << "TVs requiring RAW:" << std::endl;
+  ss << "SyncMap:";
+  bool is_first = true;
   for (auto entry : needs_raw_sync_) {
-    ss << "  " << entry.first->toString() << " :: " << entry.second.toString()
-       << std::endl;
+    if (!is_first) {
+      ss << ",";
+    }
+    ss << " " << entry.first->toString() << " -> " << entry.second.toString();
+    is_first = false;
   }
   return ss.str();
 }
diff --git a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp
index 18a4426cb7c05..dc10224a165c0 100644
--- a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp
@@ -237,7 +237,7 @@ void ThreadPredicateMap::updateBitSet(const Expr* expr) {
           id_reductions.set(id->getParallelType());
         }
         if (id->isBroadcast() &&
-            GpuLower::current()->concretizedBroadcastDomains().isConcretized(
+            GpuLower::current()->concretizedBroadcastDomains()->isConcretized(
                 id)) {
           id_bcasts.set(id->getParallelType());
         }
@@ -316,7 +316,7 @@ class RedundantUseAnalysis : BackwardVisitor {
  public:
   RedundantUseAnalysis(Fusion* fusion, const ThreadPredicateMap& pred_map)
       : fusion_(fusion), pred_map_(pred_map) {
-    traverseFrom(fusion, fusion->terminatingMathVals());
+    traverseTo(fusion, fusion->terminatingMathVals());
   }
 
   //! Returns a bit map signifying the parallel dimensions
@@ -575,7 +575,8 @@ ParallelTypeBitmap ThreadPredicateMap::getParallelBroadcastDomains(
 
   for (auto id : iter_domains) {
     if (!id->isBroadcast() ||
-        !GpuLower::current()->concretizedBroadcastDomains().isConcretized(id)) {
+        !GpuLower::current()->concretizedBroadcastDomains()->isConcretized(
+            id)) {
       continue;
     }
     if (id->isBlockDim() || (!output_smem && id->isThreadDim())) {
diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp
index 324bab279b37e..88a84aa3c5877 100644
--- a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp
@@ -1,6 +1,5 @@
 #include <torch/csrc/jit/codegen/cuda/ir_utils.h>
 #include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
 #include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
 
 #include <torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h>
@@ -10,7 +9,7 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-void ConcretizedBroadcastDomains::build(Fusion* fusion) {
+ConcretizedBroadcastDomains::ConcretizedBroadcastDomains(Fusion* fusion) {
   exact_map_ = std::make_unique<ExactRootDomainMap>(fusion);
 
   // Initialize the origin map with input broadcast domains
diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h
index 24658f3cfe7c3..c30fa9951404a 100644
--- a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h
+++ b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h
@@ -23,7 +23,8 @@ namespace cuda {
 //! domains are marked as concretized.
 class TORCH_CUDA_CU_API ConcretizedBroadcastDomains : private IterVisitor {
  public:
-  void build(Fusion* fusion);
+  ConcretizedBroadcastDomains() = delete;
+  ConcretizedBroadcastDomains(Fusion* fusion);
 
   //! Is a domain concretized?
   bool isConcretized(IterDomain* id) const;
diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp b/torch/csrc/jit/codegen/cuda/lower_unroll.cpp
index 434d1711d9c83..63dbbf83d775d 100644
--- a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_unroll.cpp
@@ -54,6 +54,14 @@ bool isReductionInitExpr(const Expr* expr) {
 
 } // namespace
 
+void UnrollPass::registerReplace(
+    Expr* reference,
+    Expr* new_expr,
+    kir::Scope* scope) {
+  kir::ExprMutator::registerReplace(reference, new_expr, scope);
+  GpuLower::current()->propagateExprInfo(reference, new_expr);
+}
+
 void UnrollPass::handle(Expr* expr) {
   if (ir_utils::isTvOp(expr)) {
     // If tv op, predicate it
@@ -79,11 +87,16 @@ void UnrollPass::handle(Expr* expr) {
 
     non_trivial_pred_found_ = true;
 
+    Expr* expr_with_predicate = expr;
+
     // When a predicate needs to account for ShiftOp, it is currently
     // taken care by its own function.
-    if (GpuLower::current()->haloInfo().needsShiftPredicate(expr)) {
-      ShiftPredicateInserter::insert(
+    if (GpuLower::current()->haloInfo()->needsShiftPredicate(expr)) {
+      expr_with_predicate = ShiftPredicateInserter::insert(
           expr, for_loops_, thread_pred, unswitched_loop_);
+      if (expr_with_predicate != expr) {
+        registerReplace(expr, expr_with_predicate, &for_loops_.back()->body());
+      }
       return;
     }
 
@@ -93,17 +106,18 @@ void UnrollPass::handle(Expr* expr) {
           ? thread_pred_expr
           : IrBuilder::create<kir::Predicate>(
                 PredicateType::ReductionWrite, expr, thread_pred);
-      expr->setWritePredicate(write_pred);
+      expr_with_predicate = expr_with_predicate->withWritePredicate(write_pred);
     }
 
     // For expr calling a device func with block sync, don't create
     // if-then-else but pass the predicate to the device func
-    if (ir_utils::hasBlockSync(expr, GpuLower::current()->threadPredMap())) {
+    if (lower_utils::hasBlockSync(expr, GpuLower::current()->threadPredMap())) {
       const auto pred = unswitched_loop_
           ? thread_pred_expr
           : IrBuilder::create<kir::Predicate>(
                 PredicateType::Inline, expr, thread_pred);
-      expr->setPredicate(pred);
+      expr_with_predicate = expr_with_predicate->withPredicate(pred);
+      registerReplace(expr, expr_with_predicate, &for_loops_.back()->body());
       return;
     }
 
@@ -124,6 +138,12 @@ void UnrollPass::handle(Expr* expr) {
                                     PredicateType::Inline, expr, thread_pred);
     }
 
+    if (lower_utils::supportInlinePredicate(expr)) {
+      expr_with_predicate = expr_with_predicate->withPredicate(pred);
+      registerReplace(expr, expr_with_predicate, &for_loops_.back()->body());
+      return;
+    }
+
     // If we need a predicate, put expr inside an if then else
     kir::IfThenElse* inline_ite = IrBuilder::create<kir::IfThenElse>(pred);
     if (for_loops_.empty()) {
@@ -135,7 +155,10 @@ void UnrollPass::handle(Expr* expr) {
       kir::ExprMutator::registerReplace(
           expr, inline_ite, &for_loops_.back()->body());
     }
-    inline_ite->thenBody().push_back(expr);
+    if (expr != expr_with_predicate) {
+      GpuLower::current()->propagateExprInfo(expr, expr_with_predicate);
+    }
+    inline_ite->thenBody().push_back(expr_with_predicate);
   } else if (auto for_loop = dynamic_cast<kir::ForLoop*>(expr)) {
     handle(for_loop);
   }
@@ -222,7 +245,7 @@ bool UnrollPass::canOmitElseClause(kir::ForLoop* fl) {
     // If there's any expression that requires barrier
     // synchronization, the else part can't be omitted
     for (auto expr : loop->body().exprs()) {
-      if (ir_utils::hasBlockSync(expr, pred_map)) {
+      if (lower_utils::hasBlockSync(expr, pred_map)) {
         return false;
       }
     }
@@ -264,9 +287,7 @@ bool UnrollPass::canOmitElseClause(kir::ForLoop* fl) {
   return true;
 }
 
-// Generate the loop nest structure and place it in lowered_exprs
 UnrollPass::UnrollPass(const std::vector<Expr*>& exprs) {
-  FUSER_PERF_SCOPE("GpuLower::Lower::UnrollPass::computeMap");
   kir::ExprMutator::traverseAndInsert(exprs);
 }
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.h b/torch/csrc/jit/codegen/cuda/lower_unroll.h
index 14725c405b770..786e45115ba65 100644
--- a/torch/csrc/jit/codegen/cuda/lower_unroll.h
+++ b/torch/csrc/jit/codegen/cuda/lower_unroll.h
@@ -62,6 +62,8 @@ class TORCH_CUDA_CU_API UnrollPass : kir::ExprMutator {
   static bool canOmitElseClause(kir::ForLoop* fl);
 
  private:
+  void registerReplace(Expr* reference, Expr* new_expr, kir::Scope* scope);
+
   // Generate the for Expr replacement map
   UnrollPass(const std::vector<Expr*>& exprs);
 
diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.cpp b/torch/csrc/jit/codegen/cuda/lower_utils.cpp
index 5802b2b99b4b8..3e92269f278a7 100644
--- a/torch/csrc/jit/codegen/cuda/lower_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_utils.cpp
@@ -36,13 +36,42 @@ kir::IfThenElse* cloneIfThenElse(kir::IfThenElse* ite) {
 
 namespace ir_utils {
 
-TVDomainGuard::TVDomainGuard(TensorView* _tv, TensorDomain* td)
-    : tv_(_tv), prev_domain(tv_->domain()) {
+TVDomainGuard::TVDomainGuard(TensorView* tv, TensorDomain* td)
+    : tv_(tv), prev_domain_(tv_->domain()) {
   tv_->setDomain(td);
 }
 
+TVDomainGuard::TVDomainGuard(TVDomainGuard&& guard)
+    : tv_(nullptr), prev_domain_(guard.prev_domain_) {
+  std::swap(tv_, guard.tv_);
+}
+
 TVDomainGuard::~TVDomainGuard() {
-  tv_->setDomain(prev_domain);
+  if (tv_ != nullptr) {
+    tv_->setDomain(prev_domain_);
+  }
+}
+
+ir_utils::TVDomainGuard overrideContiguityGuard(
+    TensorView* tv,
+    bool contiguity) {
+  // Use domain guard to ignore the contiguity of
+  //  consumer tv.
+  TensorDomain* domain_with_specified_contiguity = nullptr;
+  std::vector<bool> contiguity_vector(
+      tv->getMaybeRFactorDomain().size(), contiguity);
+  if (tv->hasRFactor()) {
+    domain_with_specified_contiguity = IrBuilder::create<TensorDomain>(
+        tv->getRootDomain(),
+        tv->getRFactorDomain(),
+        tv->domain()->domain(),
+        contiguity_vector);
+  } else {
+    domain_with_specified_contiguity = IrBuilder::create<TensorDomain>(
+        tv->getRootDomain(), tv->domain()->domain(), contiguity_vector);
+  }
+
+  return ir_utils::TVDomainGuard(tv, domain_with_specified_contiguity);
 }
 
 std::vector<IterDomain*> iterDomainInputsOf(
@@ -92,7 +121,9 @@ bool isTvOp(const Expr* expr) {
        expr->getExprType().value() == ExprType::BinaryOp ||
        expr->getExprType().value() == ExprType::TernaryOp ||
        expr->getExprType().value() == ExprType::RNGOp ||
+       expr->getExprType().value() == ExprType::FullOp ||
        expr->getExprType().value() == ExprType::ARangeOp ||
+       expr->getExprType().value() == ExprType::EyeOp ||
        expr->getExprType().value() == ExprType::ReductionOp ||
        expr->getExprType().value() == ExprType::GroupedReductionOp ||
        expr->getExprType().value() == ExprType::WelfordOp ||
@@ -204,35 +235,6 @@ bool isScalarOp(const Expr* expr) {
   return true;
 }
 
-bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map) {
-  if (expr->isA<kir::BlockSync>()) {
-    return true;
-  }
-
-  if (!isTvOp(expr)) {
-    return false;
-  }
-
-  if (!(isReductionOp(expr) || expr->isA<BroadcastOp>() ||
-        expr->isA<kir::GridBroadcast>())) {
-    return false;
-  }
-
-  // GroupedReductionOp can have multiple output TVs, but they must be
-  // parallelized in the same way, so just checking one of them is enough.
-  auto tv = getTvOutput(expr);
-
-  if (tv->hasBlockReduction() || tv->hasGridReduction()) {
-    return true;
-  } else if (expr->isA<BroadcastOp>()) {
-    const ParallelTypeBitmap pt_map =
-        GpuLower::current()->threadPredMap().getParallelBroadcastDomains(tv);
-    return pt_map.any();
-  }
-
-  return false;
-}
-
 c10::optional<IterDomain*> getMaybeWarpReductionDim(
     const Val* output,
     const Val* input) {
@@ -369,20 +371,6 @@ bool isGlobalLoadInit(const Expr* expr) {
   return false;
 }
 
-kir::Allocate* allocGlobalBufferForGridComm(
-    Val* buffer_size,
-    DataType dtype,
-    bool zero_init) {
-  const std::vector<IterDomain*> new_buffer_ids = {
-      IrBuilder::create<IterDomain>(IterDomainBuilder(
-          GpuLower::current()->kernel()->zeroVal(), buffer_size))};
-  const auto buffer_domain = IrBuilder::create<TensorDomain>(new_buffer_ids);
-  const auto buffer_tv =
-      IrBuilder::create<TensorView>(buffer_domain, dtype, MemoryType::Global);
-  return IrBuilder::create<kir::Allocate>(
-      buffer_tv, buffer_tv->getMemoryType(), nullptr, zero_init);
-}
-
 namespace {
 
 class ExprFlattener : private kir::IrVisitor {
@@ -417,112 +405,6 @@ std::vector<Expr*> flattenScopedExprs(const std::vector<Expr*>& loop_nests) {
   return ExprFlattener::flatten(loop_nests);
 }
 
-IterDomain* caMapExactConcreteId(IterDomain* id) {
-  return GpuLower::current()->caMap()->getConcreteMappedID(
-      id, IdMappingMode::EXACT);
-}
-
-std::vector<Expr*> getAllSwizzlesBetween(
-    std::vector<IterDomain*> from,
-    std::vector<IterDomain*> to) {
-  auto all_expr = DependencyCheck::getAllExprsBetween(
-      {from.begin(), from.end()}, {to.begin(), to.end()});
-
-  std::vector<Expr*> all_swizzles;
-
-  std::copy_if(
-      all_expr.begin(),
-      all_expr.end(),
-      std::back_inserter(all_swizzles),
-      [](Expr* expr) {
-        return expr->getExprType().has_value() &&
-            (expr->etype() == ExprType::Swizzle2D);
-      });
-
-  return all_swizzles;
-}
-
-} // namespace ir_utils
-
-namespace loop_utils {
-
-BasicAllocInfo getAllocInformation(
-    const TensorView* tv,
-    const std::vector<kir::ForLoop*>& for_loops,
-    const std::unordered_map<IterDomain*, IterDomain*>& id_map,
-    bool use_id_map) {
-  BasicAllocInfo info;
-  auto gpu_lower = GpuLower::current();
-
-  bool outer_alloc_found = false;
-
-  for (auto fl : for_loops) {
-    if (info.alloc_pos == tv->getComputeAtPosition()) {
-      break;
-    }
-
-    if (tv->axis(info.alloc_pos)->isReduction()) {
-      const auto outputs = FusionGuard::getCurFusion()->getTerminatingOutputs();
-      TORCH_INTERNAL_ASSERT(
-          std::find(outputs.begin(), outputs.end(), tv) != outputs.end(),
-          "Invalid computeAt of T",
-          tv->name(),
-          ". A reducation axis is detected outside computeAt point even though it is not an output tensor.");
-      break;
-    }
-
-    auto fl_id = fl->iter_domain();
-
-    if (fl_id->getParallelType() == ParallelType::Unroll) {
-      break;
-    }
-
-    // Shared memory must be allocated outside of unswitched
-    // domains. See issue #1133.
-    if (fl_id->getParallelType() == ParallelType::Unswitch &&
-        tv->getMemoryType() == MemoryType::Shared) {
-      outer_alloc_found = true;
-    }
-
-    // Assume global memory is allocated at outer most scope.
-    if (tv->getMemoryType() == MemoryType::Global) {
-      outer_alloc_found = true;
-    }
-
-    // Allocation of a double buffered tensor is placed outside its
-    // double buffer axis.
-    if ((tv->isDoubleBuffered() || tv->isCircularBuffered()) &&
-        tv->axis(info.alloc_pos) ==
-            gpu_lower->doubleBufferInfo().getDoubleBufferAxis(tv)) {
-      outer_alloc_found = true;
-    }
-
-    auto local_id = tv->axis(info.alloc_pos);
-
-    if (use_id_map) {
-      auto id_it = id_map.find(local_id);
-      if (id_it != id_map.end()) {
-        local_id = id_it->second;
-      }
-    }
-
-    if (GpuLower::current()->caMap()->areMapped(
-            local_id, fl_id, IdMappingMode::PERMISSIVE)) {
-      info.alloc_pos++;
-    }
-
-    info.init_for_loop = fl;
-
-    if (!outer_alloc_found) {
-      info.alloc_for_loop = fl;
-    }
-  }
-
-  return info;
-}
-
-} // namespace loop_utils
-
 namespace {
 
 class ReplaceExprInput : private kir::ExprMutator {
@@ -564,8 +446,8 @@ class ReplaceExprInput : private kir::ExprMutator {
 
   // Copy predicates and register expression replacement
   void registerReplaceWithPredicate(Expr* old_expr, Expr* new_expr) {
-    new_expr->setPredicate(old_expr->predicate());
-    new_expr->setWritePredicate(old_expr->writePredicate());
+    new_expr = new_expr->withPredicate(old_expr->predicate())
+                   ->withWritePredicate(old_expr->writePredicate());
     registerReplace(old_expr, new_expr);
   }
 
@@ -703,15 +585,161 @@ std::vector<Expr*> replaceInputsInExpr(
   return ReplaceExprInput::replace(exprs, replacement_map);
 }
 
-bool isTrivialIterDomain(IterDomain* id) {
-  auto pt = id->getParallelType();
-  return id->isReduction() || id->isBroadcast() || id->isStride() ||
-      (id->extent()->isOneInt() && id->start()->isZeroInt()) ||
-      pt == ParallelType::Vectorize ||
-      (isParallelTypeThread(pt) &&
-       !GpuLower::current()->haloInfo().hasHaloWidth(id));
+std::vector<Expr*> getAllSwizzlesBetween(
+    std::vector<IterDomain*> from,
+    std::vector<IterDomain*> to) {
+  auto all_expr = DependencyCheck::getAllExprsBetween(
+      {from.begin(), from.end()}, {to.begin(), to.end()});
+
+  std::vector<Expr*> all_swizzles;
+
+  std::copy_if(
+      all_expr.begin(),
+      all_expr.end(),
+      std::back_inserter(all_swizzles),
+      [](Expr* expr) {
+        return expr->getExprType().has_value() &&
+            (expr->etype() == ExprType::Swizzle2D);
+      });
+
+  return all_swizzles;
+}
+
+} // namespace ir_utils
+
+namespace lower_utils {
+
+bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map) {
+  if (expr->isA<kir::BlockSync>()) {
+    return true;
+  }
+
+  if (!ir_utils::isTvOp(expr)) {
+    return false;
+  }
+
+  if (!(ir_utils::isReductionOp(expr) || expr->isA<BroadcastOp>() ||
+        expr->isA<kir::GridBroadcast>())) {
+    return false;
+  }
+
+  // GroupedReductionOp can have multiple output TVs, but they must be
+  // parallelized in the same way, so just checking one of them is enough.
+  auto tv = ir_utils::getTvOutput(expr);
+
+  if (tv->hasBlockReduction() || tv->hasGridReduction()) {
+    return true;
+  } else if (expr->isA<BroadcastOp>()) {
+    const ParallelTypeBitmap pt_map =
+        GpuLower::current()->threadPredMap().getParallelBroadcastDomains(tv);
+    return pt_map.any();
+  }
+
+  return false;
+}
+
+kir::Allocate* allocGlobalBufferForGridComm(
+    Val* buffer_size,
+    DataType dtype,
+    bool zero_init) {
+  const std::vector<IterDomain*> new_buffer_ids = {
+      IrBuilder::create<IterDomain>(IterDomainBuilder(
+          GpuLower::current()->kernel()->zeroVal(), buffer_size))};
+  const auto buffer_domain = IrBuilder::create<TensorDomain>(new_buffer_ids);
+  const auto buffer_tv =
+      IrBuilder::create<TensorView>(buffer_domain, dtype, MemoryType::Global);
+  return IrBuilder::create<kir::Allocate>(
+      buffer_tv, buffer_tv->getMemoryType(), nullptr, zero_init);
+}
+
+BasicAllocInfo getAllocInformation(
+    const TensorView* tv,
+    const std::vector<kir::ForLoop*>& for_loops,
+    const std::unordered_map<IterDomain*, IterDomain*>& id_map,
+    bool use_id_map) {
+  BasicAllocInfo info;
+  auto gpu_lower = GpuLower::current();
+
+  bool outer_alloc_found = false;
+
+  for (auto fl : for_loops) {
+    if (info.alloc_pos == tv->getComputeAtPosition()) {
+      break;
+    }
+
+    if (tv->axis(info.alloc_pos)->isReduction()) {
+      const auto outputs = FusionGuard::getCurFusion()->getTerminatingOutputs();
+      TORCH_INTERNAL_ASSERT(
+          std::find(outputs.begin(), outputs.end(), tv) != outputs.end(),
+          "Invalid computeAt of T",
+          tv->name(),
+          ". A reducation axis is detected outside computeAt point even though it is not an output tensor.");
+      break;
+    }
+
+    auto fl_id = fl->iter_domain();
+
+    if (fl_id->getParallelType() == ParallelType::Unroll) {
+      break;
+    }
+
+    // Shared memory must be allocated outside of unswitched
+    // domains. See issue #1133.
+    if (fl_id->getParallelType() == ParallelType::Unswitch &&
+        tv->getMemoryType() == MemoryType::Shared) {
+      outer_alloc_found = true;
+    }
+
+    // Assume global memory is allocated at outer most scope.
+    if (tv->getMemoryType() == MemoryType::Global) {
+      outer_alloc_found = true;
+    }
+
+    // Allocation of a double buffered tensor is placed outside its
+    // double buffer axis.
+    if ((tv->isDoubleBuffered() || tv->isCircularBuffered()) &&
+        tv->axis(info.alloc_pos) ==
+            gpu_lower->doubleBufferInfo().getDoubleBufferAxis(tv)) {
+      outer_alloc_found = true;
+    }
+
+    auto local_id = tv->axis(info.alloc_pos);
+
+    if (use_id_map) {
+      auto id_it = id_map.find(local_id);
+      if (id_it != id_map.end()) {
+        local_id = id_it->second;
+      }
+    }
+
+    if (GpuLower::current()->caMap()->areMapped(
+            local_id, fl_id, IdMappingMode::PERMISSIVE)) {
+      info.alloc_pos++;
+    }
+
+    info.init_for_loop = fl;
+
+    if (!outer_alloc_found) {
+      info.alloc_for_loop = fl;
+    }
+  }
+
+  return info;
+}
+
+//! Implementing this in here to avoid including too many headers
+//!  in type.cpp. Conceptually this should be a generic definition
+//!  rather than a util.
+bool supportInlinePredicate(Expr* expr) {
+  if (ir_utils::isCpAsyncOp(expr)) {
+    return true;
+  }
+  // TODO: build out support.
+  return false;
 }
 
+} // namespace lower_utils
+
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.h b/torch/csrc/jit/codegen/cuda/lower_utils.h
index d8821fd0d4ebe..4807c1e5520ea 100644
--- a/torch/csrc/jit/codegen/cuda/lower_utils.h
+++ b/torch/csrc/jit/codegen/cuda/lower_utils.h
@@ -39,24 +39,32 @@ namespace ir_utils {
 // producers with a consumer set of indices, so we need to view the producer
 // transformed like consumer while we index. This will set the tv with td for
 // the life of this context guard.
-class TVDomainGuard {
+class TORCH_CUDA_CU_API TVDomainGuard {
  private:
   TensorView* tv_;
-  TensorDomain* prev_domain;
+  TensorDomain* prev_domain_;
 
  public:
-  explicit TVDomainGuard(TensorView* _tv, TensorDomain* td);
+  explicit TVDomainGuard(TensorView* tv, TensorDomain* td);
+  TVDomainGuard(const TVDomainGuard&) = delete;
+  TVDomainGuard(TVDomainGuard&&);
 
   //! An utility to access the tensordomain before the temporary
   //!  view. This is used to retrieve information, like swizzle
   //!  information that can only be reliably kept at the original domain.
   const TensorDomain* prevDomain() const {
-    return prev_domain;
+    return prev_domain_;
   }
 
   ~TVDomainGuard();
 };
 
+// Create a TVDomainGuard that temporarily view a tensorview with specified
+// all-true or all-false contiguity.
+TORCH_CUDA_CU_API ir_utils::TVDomainGuard overrideContiguityGuard(
+    TensorView* tv,
+    bool contiguity);
+
 //! Return inputs of provided IterDomains that are IterDomains. A list
 //! of input IterDomain can be optionally given. Otherwise,
 //! IterDomains with no defining expression are returned.
@@ -82,8 +90,6 @@ TORCH_CUDA_CU_API TensorView* getTvOutput(const Expr*);
 // Returns the first input of Expr that is a TensorView
 TORCH_CUDA_CU_API TensorView* getTvInput(const Expr*);
 
-bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map);
-
 //! Returns the iterdomain that maps to the thread dimension grouped
 //!  to warps. Returns nullopt if the reduction is not to be lowered to
 //!  a warp reduction.
@@ -108,13 +114,6 @@ bool derivedFromRootCAAxes(const TensorView* tv, IterDomain* axis);
 std::unordered_map<ParallelType, IterDomain*, TypeHash> getParallelDomains(
     const Val* val);
 
-// Allocate global buffer for a grid communication calls, i.e. grid reduce, grid
-// welford reduce, grid broadcast.
-kir::Allocate* allocGlobalBufferForGridComm(
-    Val* buffer_size,
-    DataType dtype,
-    bool zero_init);
-
 //! Returns true if the expression will be lowered to
 //!  a ldmatrix intrinsic.
 bool isLdMatrixOp(const Expr* expr);
@@ -150,49 +149,12 @@ bool isTensorScalarFillOp(const Expr* expr);
 TORCH_CUDA_CU_API std::vector<Expr*> flattenScopedExprs(
     const std::vector<Expr*>& loop_nests);
 
-//! Returns the concretized iterdomain according to
-//!  the exact compute at map.
-IterDomain* caMapExactConcreteId(IterDomain* id);
-
 //! Returns all swizzle ops between the set of iterdomains
 //!  in `from` and `to`.
 std::vector<Expr*> getAllSwizzlesBetween(
     std::vector<IterDomain*> from,
     std::vector<IterDomain*> to);
 
-} // namespace ir_utils
-
-namespace loop_utils {
-
-struct BasicAllocInfo {
-  // The for loop that the initialization of this allocation must be
-  // placed in, nullptr if not within a loop
-  kir::ForLoop* init_for_loop = nullptr;
-
-  // Keep track of the actual allocation loop. This can be different
-  // from init_for_loop only with unswitched shared memory allocations,
-  // which are moved outer loops to avoid duplicated allocations. This means
-  // that the alloc position may be outside what's expected. Most applications
-  // outside lower_allocation is likely looking for init_for_loop which is
-  // more directly related to how large an allocation is and how it's used.
-  // (see issue #1133).
-  kir::ForLoop* alloc_for_loop = nullptr;
-
-  // The allocation position relative to buffer IDs, it could be outside the
-  // compute at position if it's shared memory with a compute at inside an
-  // unswitch
-  size_t alloc_pos = 0;
-};
-
-// Fill the above allocation struct based on provided information. id_map is
-// used if we're looking at a producer tensor but loops on a consumer tensor.
-BasicAllocInfo getAllocInformation(
-    const TensorView* tv,
-    const std::vector<kir::ForLoop*>& loops,
-    const std::unordered_map<IterDomain*, IterDomain*>& id_map = {},
-    bool use_id_map = false);
-} // namespace loop_utils
-
 // Replace value pass on Kernel IR.
 //  Replace each use of any Val* that apears in the given `replacement_map`
 //  Keeps the predicate carried by each expr
@@ -203,9 +165,6 @@ std::vector<Expr*> replaceInputsInExpr(
     const std::vector<Expr*>& exprs,
     const std::unordered_map<Val*, Val*>& replacement_map);
 
-// True if an IterDomain does not materialize a loop
-bool isTrivialIterDomain(IterDomain* id);
-
 // Go through all expressions and compute a local ordering of loops. operator<
 // is implemented based on the concrete_id_dependencies analysis done. If
 // there's no dependency between two IDs then order doesn't mater, otherwise we
@@ -235,7 +194,7 @@ struct TORCH_CUDA_CU_API IterDomainDependencySorter {
   IterDomainDependencySorter(
       const std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>&
           concrete_id_dependencies,
-      const std::unique_ptr<ComputeAtMap>& compute_at_map)
+      std::shared_ptr<const ComputeAtMap> compute_at_map)
       : concrete_id_dependencies_(concrete_id_dependencies),
         compute_at_map_(compute_at_map) {}
 
@@ -261,9 +220,56 @@ struct TORCH_CUDA_CU_API IterDomainDependencySorter {
 
   const std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>&
       concrete_id_dependencies_;
-  const std::unique_ptr<ComputeAtMap>& compute_at_map_;
+  const std::shared_ptr<const ComputeAtMap> compute_at_map_;
 };
 
+} // namespace ir_utils
+
+namespace lower_utils {
+
+bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map);
+
+// Allocate global buffer for a grid communication calls, i.e. grid reduce, grid
+// welford reduce, grid broadcast.
+kir::Allocate* allocGlobalBufferForGridComm(
+    Val* buffer_size,
+    DataType dtype,
+    bool zero_init);
+
+struct BasicAllocInfo {
+  // The for loop that the initialization of this allocation must be
+  // placed in, nullptr if not within a loop
+  kir::ForLoop* init_for_loop = nullptr;
+
+  // Keep track of the actual allocation loop. This can be different
+  // from init_for_loop only with unswitched shared memory allocations,
+  // which are moved outer loops to avoid duplicated allocations. This means
+  // that the alloc position may be outside what's expected. Most applications
+  // outside lower_allocation is likely looking for init_for_loop which is
+  // more directly related to how large an allocation is and how it's used.
+  // (see issue #1133).
+  kir::ForLoop* alloc_for_loop = nullptr;
+
+  // The allocation position relative to buffer IDs, it could be outside the
+  // compute at position if it's shared memory with a compute at inside an
+  // unswitch
+  size_t alloc_pos = 0;
+};
+
+// Fill the above allocation struct based on provided information. id_map is
+// used if we're looking at a producer tensor but loops on a consumer tensor.
+BasicAllocInfo getAllocInformation(
+    const TensorView* tv,
+    const std::vector<kir::ForLoop*>& loops,
+    const std::unordered_map<IterDomain*, IterDomain*>& id_map = {},
+    bool use_id_map = false);
+
+//! Returns true if the expression has a variant that takes a predicate
+//!  as an inline argument.
+bool supportInlinePredicate(Expr* expr);
+
+} // namespace lower_utils
+
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.cpp b/torch/csrc/jit/codegen/cuda/lower_validation.cpp
index de2c1135ad202..da1def37cad84 100644
--- a/torch/csrc/jit/codegen/cuda/lower_validation.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_validation.cpp
@@ -1183,7 +1183,7 @@ void validateAndConvertIterDomainGrouping(Fusion* fusion) {
 
       // Halo is not allowed
       TORCH_CHECK(
-          GpuLower::current()->haloInfo().getExtent(id) == nullptr,
+          GpuLower::current()->haloInfo()->getExtent(id) == nullptr,
           "Invalid use of ParallelType::Group.",
           " Grouping of halo-extended IterDomain, ",
           id->toString(),
diff --git a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp
index 1d87790c014fb..ff603c1d18f64 100644
--- a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp
@@ -136,7 +136,7 @@ class EliminateDeadBroadcastAndAllocate {
 //!    be removed, and generates a replacement map from the broadcast
 //!    output to reduction output.
 //!
-//!   2. kir_utils::replaceInputsInExpr replaces applicable uses of
+//!   2. ir_utils::replaceInputsInExpr replaces applicable uses of
 //!    the broadcast output with the corresponding reduction output.
 //!
 //!   3. EliminateDeadBroadcastAndAllocate removes the broadcast ops
@@ -145,8 +145,8 @@ class FuseBroadcastWithWarpReduce : private kir::IrVisitor {
  public:
   static std::vector<Expr*> fuse(const std::vector<Expr*>& exprs) {
     FuseBroadcastWithWarpReduce fuse_broadcast_map(exprs);
-    const auto replaced_inputs =
-        replaceInputsInExpr(exprs, fuse_broadcast_map.val_replacement_map_);
+    const auto replaced_inputs = ir_utils::replaceInputsInExpr(
+        exprs, fuse_broadcast_map.val_replacement_map_);
     return EliminateDeadBroadcastAndAllocate::run(replaced_inputs);
   }
 
diff --git a/torch/csrc/jit/codegen/cuda/manager.cpp b/torch/csrc/jit/codegen/cuda/manager.cpp
index 22f914de407ee..4eb61c78b749f 100644
--- a/torch/csrc/jit/codegen/cuda/manager.cpp
+++ b/torch/csrc/jit/codegen/cuda/manager.cpp
@@ -62,12 +62,16 @@ namespace {
 // in the fallback path.
 void enableAliasCopyNodes(const std::shared_ptr<Graph>& graph, Block* block) {
   static std::unordered_set<Symbol> alias_copy_op(
-      {prim::view_copy,
-       prim::reshape_copy,
-       prim::expand_copy,
+      {prim::expand_copy,
        prim::expand_as_copy,
+       prim::flatten_copy,
+       prim::permute_copy,
+       prim::reshape_copy,
        prim::squeeze_copy,
-       prim::unsqueeze_copy});
+       prim::t_copy,
+       prim::transpose_copy,
+       prim::unsqueeze_copy,
+       prim::view_copy});
 
   for (Node* n : block->nodes()) {
     for (Block* b : n->blocks()) {
diff --git a/torch/csrc/jit/codegen/cuda/mutator.cpp b/torch/csrc/jit/codegen/cuda/mutator.cpp
index 96bc40c20c90c..12a3de15f4a7f 100644
--- a/torch/csrc/jit/codegen/cuda/mutator.cpp
+++ b/torch/csrc/jit/codegen/cuda/mutator.cpp
@@ -125,7 +125,18 @@ void OptOutMutator::mutate(kir::TensorIndex*) {
   TORCH_INTERNAL_ASSERT(false, "Not implemented yet.");
 }
 
-// MUTATE FUNCTIONS FOR EXPRESSIONS.
+void OptOutMutator::mutate(FullOp* fop) {
+  Val* out = maybeMutated(fop->output(0));
+  Val* fill_value = maybeMutated(fop->getFillValue());
+
+  if (out->sameAs(fop->output(0))) {
+    return;
+  }
+  auto container = fop->container();
+  container->removeExpr(fop);
+  IrBuilder::create<FullOp>(container, out, fill_value, fop->dtype());
+}
+
 void OptOutMutator::mutate(ARangeOp* aop) {
   Val* out = maybeMutated(aop->output(0));
 
@@ -140,7 +151,20 @@ void OptOutMutator::mutate(ARangeOp* aop) {
       aop->start(),
       aop->end(),
       aop->step(),
-      aop->getLinearIndex());
+      aop->dtype(),
+      aop->getLinearLogicalIndex());
+}
+
+void OptOutMutator::mutate(EyeOp* eop) {
+  Val* out = maybeMutated(eop->output(0));
+
+  if (out->sameAs(eop->output(0))) {
+    return;
+  }
+  auto container = eop->container();
+  container->removeExpr(eop);
+  IrBuilder::create<EyeOp>(
+      container, out, eop->dtype(), eop->getIndex1(), eop->getIndex2());
 }
 
 void OptOutMutator::mutate(UnaryOp* uop) {
@@ -190,8 +214,13 @@ void OptOutMutator::mutate(TernaryOp* top) {
 
 void OptOutMutator::mutate(RNGOp* rop) {
   Val* out = maybeMutated(rop->output(0));
+  auto& parameters = rop->getParameters();
+  std::vector<Val*> mutated_parameters;
+  for (auto v : parameters) {
+    mutated_parameters.emplace_back(maybeMutated(v));
+  }
 
-  if (out == rop->output(0)) {
+  if (out == rop->output(0) && mutated_parameters == parameters) {
     return;
   }
 
@@ -199,7 +228,13 @@ void OptOutMutator::mutate(RNGOp* rop) {
   auto rop_type = rop->getRNGOpType();
   container->removeExpr(rop);
   IrBuilder::create<RNGOp>(
-      container, rop_type, out, rop->getRNGOffset(), rop->getPhiloxIndex());
+      container,
+      rop_type,
+      out,
+      rop->dtype(),
+      mutated_parameters,
+      rop->getRNGOffset(),
+      rop->getPhiloxIndex());
 }
 
 void OptOutMutator::mutate(ReductionOp* rop) {
diff --git a/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp b/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp
index 3a2ab5f5eb5be..eaff9274892dd 100644
--- a/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp
+++ b/torch/csrc/jit/codegen/cuda/non_divisible_split.cpp
@@ -23,7 +23,7 @@ void NonDivisibleSplitInfo::build(Fusion* fusion) {
         tv->domain()->domain().begin(), tv->domain()->domain().end());
     current_tv_ = tv;
     clearReachability();
-    traverseFrom(fusion, domain_vals);
+    traverseTo(fusion, domain_vals);
     current_tv_ = nullptr;
   }
 
@@ -53,7 +53,16 @@ void NonDivisibleSplitInfo::handle(Split* split) {
         splits_to_validate_.insert(split);
       } else {
         // Not proven to be a divisible split
-        splits_to_predicate_[current_tv_].push_back(split);
+        auto gpu_lower = GpuLower::current();
+        TORCH_INTERNAL_ASSERT(gpu_lower != nullptr);
+
+        // If we know this split must be divisible, it's either validated as
+        // above, exact matches to a case matching the above, or exact matches
+        // to a transformation from view which must be divisible.
+        if (gpu_lower->divisbleSplitSet().find(split) ==
+            gpu_lower->divisbleSplitSet().end()) {
+          splits_to_predicate_[current_tv_].push_back(split);
+        }
       }
 
       is_protected = true;
diff --git a/torch/csrc/jit/codegen/cuda/ops/alias.cpp b/torch/csrc/jit/codegen/cuda/ops/alias.cpp
index b51c64a0bab0e..20c6ee533063d 100644
--- a/torch/csrc/jit/codegen/cuda/ops/alias.cpp
+++ b/torch/csrc/jit/codegen/cuda/ops/alias.cpp
@@ -36,6 +36,8 @@ TensorView* applyViewTransforms(
     TensorView* orig_tv,
     TensorView* post_reduce_tv,
     const AnalyzeViewResult& view_analysis) {
+  TORCH_INTERNAL_ASSERT(orig_tv != nullptr, "Input is invalid.");
+  TORCH_INTERNAL_ASSERT(post_reduce_tv != nullptr, "Input is invalid.");
   TORCH_INTERNAL_ASSERT(
       !post_reduce_tv->hasComputeAt(),
       "Cannot modify rfactor domain after compute at has been set.");
@@ -43,10 +45,6 @@ TensorView* applyViewTransforms(
   TORCH_INTERNAL_ASSERT(
       post_reduce_tv->nDims() > 0, "Tried to view a 0-dim TensorView");
 
-  TORCH_CHECK(
-      !post_reduce_tv->domain()->hasRFactor(),
-      "Cannot call view on the same TensorView twice.");
-
   TORCH_INTERNAL_ASSERT(!view_analysis.transforms.empty());
 
   TensorView* consumer = IrBuilder::create<TensorView>(
@@ -62,6 +60,7 @@ TensorView* applyViewTransforms(
 } // namespace
 
 TensorView* view(TensorView* x, DataType dtype) {
+  TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   if (x->getDataType() == dtype) {
     return x;
   }
@@ -81,6 +80,7 @@ TensorView* view(
     TensorView* x,
     const std::vector<int64_t>& original_sizes,
     const std::vector<int64_t>& new_sizes) {
+  TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   TORCH_INTERNAL_ASSERT(
       TensorDomain::noReductions(x->getMaybeRFactorDomain()).size() ==
       original_sizes.size());
@@ -111,6 +111,7 @@ TensorView* view(
 }
 
 TensorView* flatten(TensorView* x, int64_t start_dim, int64_t end_dim) {
+  TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   auto inp_domain = TensorDomain::noReductions(x->getMaybeRFactorDomain());
   if (start_dim < 0) {
     start_dim += inp_domain.size();
@@ -140,6 +141,7 @@ TensorView* flatten(TensorView* x, int64_t start_dim, int64_t end_dim) {
 }
 
 TensorView* squeeze(TensorView* x, const std::vector<int64_t>& sizes) {
+  TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   const auto ndims = static_cast<int>(x->domain()->noReductions().size());
 
   TORCH_INTERNAL_ASSERT(
@@ -163,6 +165,7 @@ TensorView* squeeze(TensorView* x, const std::vector<int64_t>& sizes) {
 }
 
 TensorView* squeeze(TensorView* x, const std::vector<int64_t>& sizes, int dim) {
+  TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   const auto ndims = static_cast<int>(x->domain()->noReductions().size());
 
   TORCH_INTERNAL_ASSERT(
@@ -191,6 +194,7 @@ TensorView* squeeze(TensorView* x, const std::vector<int64_t>& sizes, int dim) {
 }
 
 TensorView* unsqueeze(TensorView* x, int dim) {
+  TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   const auto ndims = static_cast<int>(x->domain()->noReductions().size());
 
   if (dim < 0) {
@@ -210,17 +214,31 @@ TensorView* unsqueeze(TensorView* x, int dim) {
 }
 
 TensorView* permute(TensorView* x, const std::vector<int64_t>& new2old) {
+  TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   if (new2old.size() == 0) {
     return set(x);
   }
   auto inp_domain = TensorDomain::noReductions(x->getMaybeRFactorDomain());
   std::vector<IterDomain*> out_domain(inp_domain.size());
 
+  TORCH_CHECK(
+      inp_domain.size() == new2old.size(),
+      "The number of dimensions in the tensor input does not match the length",
+      " of the desired ordering of dimensions i.e. input.dim() = ",
+      inp_domain.size(),
+      " is not equal to len(dims) = ",
+      new2old.size());
+
+  // Return scalar tensors immediately
+  if (inp_domain.size() == 0) {
+    return set(x);
+  }
+
   auto normalized_new2old =
       ir_utils::normalizeNew2Old(new2old, inp_domain.size());
 
   for (const auto i : c10::irange(out_domain.size())) {
-    auto in_id = inp_domain[new2old[i]];
+    auto in_id = inp_domain[normalized_new2old[i]];
     out_domain[i] = in_id->cloneWithoutRFactor();
   }
 
@@ -233,6 +251,7 @@ TensorView* permute(TensorView* x, const std::vector<int64_t>& new2old) {
 }
 
 TensorView* transpose(TensorView* x, int64_t dim0, int64_t dim1) {
+  TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   const auto ndims = static_cast<int>(x->domain()->noReductions().size());
 
   if (dim0 < 0) {
@@ -263,6 +282,7 @@ TensorView* transpose(TensorView* x, int64_t dim0, int64_t dim1) {
 }
 
 TensorView* transpose(TensorView* x) {
+  TORCH_INTERNAL_ASSERT(x != nullptr, "Input is invalid.");
   const auto ndims = static_cast<int>(x->domain()->noReductions().size());
 
   TORCH_CHECK(
diff --git a/torch/csrc/jit/codegen/cuda/ops/composite.cpp b/torch/csrc/jit/codegen/cuda/ops/composite.cpp
index 5aa1d64c5cf1a..a7905c4894c15 100644
--- a/torch/csrc/jit/codegen/cuda/ops/composite.cpp
+++ b/torch/csrc/jit/codegen/cuda/ops/composite.cpp
@@ -27,7 +27,7 @@ ForwardDropoutResult dropout(TensorView* x, Val* prob, Val* scale) {
           scale->getDataType().value() == DataType::Double,
       "Scale is not a valid Double.");
 
-  auto rand_vals = randlike(x);
+  auto rand_vals = rand_like(x);
   auto mask = lt(rand_vals, prob);
   auto apply_mask = mul(x, mask);
   auto y = mul(apply_mask, scale);
diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp b/torch/csrc/jit/codegen/cuda/ops/normalization.cpp
index 4484539467cd4..f1739c665f035 100644
--- a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp
+++ b/torch/csrc/jit/codegen/cuda/ops/normalization.cpp
@@ -589,8 +589,7 @@ ForwardNormResult batch_norm(
     // During inference, mean/invstd output are empty tensors
     // on CPU, but not on CUDA. We need to make sure we have the same
     // behavior as with eager mode on CUDA.
-    mean = set(running_mean); // use set to avoid "trivial input forwarding NOT
-                              // IMPLEMENTED" error
+    mean = running_mean;
     invstd = unbiased_invstd;
     y = mul(x_sub_mean, invstd_bcast);
   }
@@ -843,8 +842,10 @@ ForwardNormResult instance_norm(
         broadcast(unbiased_invstd, channels_only_broadcast_mask);
 
     // During inference, mean/invstd output are empty tensors
-    mean = TensorViewBuilder().shape(std::vector<int64_t>{0}).build();
-    invstd = TensorViewBuilder().shape(std::vector<int64_t>{0}).build();
+    // on CPU, but not on CUDA. We need to make sure we have the same
+    // behavior as with eager mode on CUDA.
+    mean = running_mean;
+    invstd = unbiased_invstd;
     y = mul(x_sub_mean, invstd_bcast);
   }
 
diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/torch/csrc/jit/codegen/cuda/parser.cpp
index 95d266db58270..e78d5effbee3e 100644
--- a/torch/csrc/jit/codegen/cuda/parser.cpp
+++ b/torch/csrc/jit/codegen/cuda/parser.cpp
@@ -1321,7 +1321,7 @@ class IrParser {
               }
             }
 
-            auto out = randlike(operand);
+            auto out = rand_like(operand);
             value_map.emplace(
                 node->output()->unique(), ValueHolder(out, format));
           },
@@ -3378,6 +3378,115 @@ class IrParser {
           },
           nullptr);
     }
+
+    {
+      auto ptr_op = getOperatorForLiteral(
+          "prim::permute_copy.int(Tensor(a) self, int[] dims) -> Tensor");
+      REGISTER_PARSE_RULE(
+          ptr_op,
+          {
+            MemoryFormat format;
+            std::list<Val*> list_val;
+            std::tie(format, list_val) = getConsistentValues(
+                c10::nullopt, value_map[node->inputs()[0]->unique()]);
+            auto self_t = list_val.front();
+            list_val.pop_front();
+            auto self = self_t->as<TensorView>();
+
+            auto dims = constant_as<c10::List<int64_t>>(node->input(1));
+            TORCH_INTERNAL_ASSERT(
+                dims.has_value(), "The dims parameter is required.");
+            TORCH_INTERNAL_ASSERT(
+                dims.value().size() == self->getMaybeRFactorDomain().size());
+
+            auto output = permute(self, dims->vec());
+            value_map.emplace(
+                node->output()->unique(), ValueHolder(output, format));
+          },
+          [](const Node* node) -> bool {
+            if (!isInputNonSizeZeroTensor(node)) {
+              return false;
+            }
+            auto dims = constant_as<c10::List<int64_t>>(node->input(1));
+            if (!dims.has_value()) {
+              return false;
+            }
+
+            return true;
+          },
+          nullptr);
+    }
+
+    {
+      auto ptr_op = getOperatorForLiteral(
+          "prim::transpose_copy.int(Tensor(a) self, int dim0, int dim1) -> Tensor");
+      REGISTER_PARSE_RULE(
+          ptr_op,
+          {
+            MemoryFormat format;
+            std::list<Val*> list_val;
+            std::tie(format, list_val) = getConsistentValues(
+                c10::nullopt, value_map[node->inputs()[0]->unique()]);
+            auto self_t = list_val.front();
+            list_val.pop_front();
+            auto self = self_t->as<TensorView>();
+
+            auto dim0 = constant_as<int>(node->input(1));
+            TORCH_INTERNAL_ASSERT(
+                dim0.has_value(), "dim0 in transpose is not valid.");
+
+            auto dim1 = constant_as<int>(node->input(2));
+            TORCH_INTERNAL_ASSERT(
+                dim1.has_value(), "dim1 in transpose is not valid.");
+
+            auto output = transpose(self, dim0.value(), dim1.value());
+            value_map.emplace(
+                node->output()->unique(), ValueHolder(output, format));
+          },
+          [](const Node* node) -> bool {
+            if (!isInputNonSizeZeroTensor(node)) {
+              return false;
+            }
+            if (node->input(1)->node()->kind() != prim::Constant) {
+              return false;
+            }
+            if (node->input(2)->node()->kind() != prim::Constant) {
+              return false;
+            }
+            return true;
+          },
+          nullptr);
+    }
+
+    {
+      auto ptr_op =
+          getOperatorForLiteral("prim::t_copy(Tensor(a) self) -> Tensor");
+      REGISTER_PARSE_RULE(
+          ptr_op,
+          {
+            MemoryFormat format;
+            std::list<Val*> list_val;
+            std::tie(format, list_val) = getConsistentValues(
+                c10::nullopt, value_map[node->inputs()[0]->unique()]);
+            auto self_t = list_val.front();
+            list_val.pop_front();
+            auto self = self_t->as<TensorView>();
+
+            TORCH_INTERNAL_ASSERT(self->getMaybeRFactorDomain().size() <= 2);
+
+            auto output = transpose(self);
+            value_map.emplace(
+                node->output()->unique(), ValueHolder(output, format));
+          },
+          [](const Node* node) -> bool {
+            if (!isInputNonSizeZeroTensor(node)) {
+              return false;
+            }
+
+            return true;
+          },
+          nullptr);
+    }
   }
 
   void processJitNode(const JitOp* node) {
@@ -4141,6 +4250,49 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) {
     return true;
   }
 
+  static auto permute_schema =
+      getOperatorForLiteral(
+          "aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)")
+          ->schema();
+  static auto permute_copy_schema =
+      getOperatorForLiteral(
+          "prim::permute_copy(Tensor(a) self, int[] dims) -> Tensor")
+          ->schema();
+  if (node->matches(permute_schema) || node->matches(permute_copy_schema)) {
+    switch (offset) {
+      // argument 1: dims;
+      case 1:
+        profileIntList(pr, node, offset);
+        break;
+      default:
+        return false;
+    }
+    return true;
+  }
+
+  static auto transpose_int_copy_schema =
+      getOperatorForLiteral(
+          "aten::transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)")
+          ->schema();
+  static auto transpose_int_schema =
+      getOperatorForLiteral(
+          "prim::transpose_copy.int(Tensor(a) self, int dim0, int dim1) -> Tensor")
+          ->schema();
+  if (node->matches(transpose_int_copy_schema) ||
+      node->matches(transpose_int_schema)) {
+    switch (offset) {
+      // argument 1: dim0;
+      // argument 2: dim1;
+      case 1:
+      case 2:
+        profileInt(pr, node, offset);
+        break;
+      default:
+        return false;
+    }
+    return true;
+  }
+
   static auto batch_norm_impl_index_schema =
       getOperatorForLiteral(
           "aten::_batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int)")
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
index 73c8ea6ce23cf..68fe709deb78f 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
@@ -390,7 +390,7 @@ void initNvFuserPythonBindings(PyObject* module) {
   NVFUSER_PYTHON_BINDING_UNARY_OP("neg", neg)
   NVFUSER_PYTHON_BINDING_UNARY_OP("bitwise_not", bitwise_not)
   NVFUSER_PYTHON_BINDING_UNARY_OP("relu", relu)
-  NVFUSER_PYTHON_BINDING_UNARY_OP("rand_like", randlike)
+  NVFUSER_PYTHON_BINDING_UNARY_OP("rand_like", rand_like)
   NVFUSER_PYTHON_BINDING_UNARY_OP("reciprocal", reciprocal)
   NVFUSER_PYTHON_BINDING_UNARY_OP("round", round)
   NVFUSER_PYTHON_BINDING_UNARY_OP("rsqrt", rsqrt)
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp
index d1f9d8102a500..607c560dab74d 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp
@@ -6,6 +6,7 @@
 
 #include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_cache.h>
 #include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
 
 // Tests go in torch::jit
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp
index 84aa4da5909ae..bae9cf6def810 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp
@@ -8,6 +8,7 @@
 #include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.h>
 #include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h>
 #include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
 
 // Tests go in torch::jit
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp
index 47785156ef788..5ae2db7db8805 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp
@@ -6,6 +6,7 @@
 
 #include <torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h>
 #include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
 
 // Tests go in torch::jit
 namespace torch {
diff --git a/torch/csrc/jit/codegen/cuda/reference_tensor.h b/torch/csrc/jit/codegen/cuda/reference_tensor.h
deleted file mode 100644
index 07c83bb6ed74c..0000000000000
--- a/torch/csrc/jit/codegen/cuda/reference_tensor.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#pragma once
-
-#include <c10/macros/Export.h>
-
-#include <torch/csrc/jit/codegen/cuda/ir_interface_nodes.h>
-
-#include <unordered_map>
-
-namespace torch {
-namespace jit {
-namespace fuser {
-namespace cuda {
-
-struct ReferenceTensor {
-  TensorDomain* domain = nullptr;
-
-  // Map from concrete iteration domains in ComputeAtMaps to iter domains
-  // including those used to construct domain.
-  std::unordered_map<IterDomain*, IterDomain*> concrete_to_id;
-  // Map from reference iteration domains to concrete iteration domains.
-  std::unordered_map<IterDomain*, IterDomain*> id_to_concrete;
-};
-
-} // namespace cuda
-} // namespace fuser
-} // namespace jit
-} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp
index ff3ed11ae1902..09a740d01097d 100644
--- a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp
+++ b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp
@@ -186,27 +186,39 @@ auto ensureMapping(
   return it;
 }
 
+TensorView* lookUpTv(const TensorDomain* td) {
+  Fusion* fusion = FusionGuard::getCurFusion();
+  for (auto tv : ir_utils::filterByType<TensorView>(fusion->vals())) {
+    if (tv->domain() == td) {
+      return tv;
+    }
+  }
+  return nullptr;
+}
+
 } // namespace
 
 std::string DomainKey::toString() const {
   std::stringstream ss;
-  ss << "{";
-  if (td()) {
-    ss << td() << " (root: " << td()->getRootDomain()
-       << ", maybe rfactor: " << td()->getMaybeRFactorDomain() << ")";
-  } else {
-    ss << "null";
-  }
-  ss << ", ";
   if (id()) {
     ss << id();
   } else {
     ss << "null";
   }
   if (concreteId()) {
-    ss << " (" << concreteId() << ")";
+    ss << " (concrete: " << concreteId() << ")";
+  }
+  ss << " in ";
+  if (td()) {
+    auto tv = lookUpTv(td());
+    TORCH_INTERNAL_ASSERT(tv != nullptr, "No TV found for ", td()->toString());
+    ss << "T" << tv->name() << "[ " << td()->getRootDomain() << " ]";
+    if (td()->hasRFactor()) {
+      ss << " (Rfactor: [ " << td()->getMaybeRFactorDomain() << " ])";
+    }
+  } else {
+    ss << "null";
   }
-  ss << "}";
   return ss.str();
 }
 
@@ -226,7 +238,7 @@ class FindInputDomains : BackwardVisitor {
   }
 
   DomainKeySet find() {
-    traverseFrom(tv_->fusion(), {tv_});
+    traverseTo(tv_->fusion(), {tv_});
     return input_keys_;
   }
 
@@ -685,7 +697,7 @@ ComputeAtRootDomainMapBuilder::ComputeAtRootDomainMapBuilder(
       map_through_reduction_(map_through_reduction) {
   Fusion* fusion = FusionGuard::getCurFusion();
   TORCH_INTERNAL_ASSERT(fusion != nullptr);
-  traverseFrom(fusion, fusion->outputs(), false);
+  traverseTo(fusion, fusion->outputs(), false);
   if (!pending_map_.empty()) {
     std::stringstream ss;
     ss << "pending map:\n";
@@ -823,10 +835,6 @@ void ComputeAtRootDomainMapBuilder::setMaybeMapped(
       addToPendingList(producer_bcast_key, consumer_bcast_key);
     }
   } else {
-    TORCH_INTERNAL_ASSERT(
-        !consumer_id->isBroadcast(),
-        "No concrete domain found for a broadcast domain: ",
-        consumer_key.toString());
     auto producer_concrete_key = producer_key;
     if (producer_id->isBroadcast()) {
       const auto concrete_id = consumer_id;
@@ -862,7 +870,7 @@ void ComputeAtRootDomainMapBuilder::mapPointwiseOrReductionOp(Expr* e) {
   const auto& out_root = out_td->getRootDomain();
 
   // Record equalities from output to all the inputs
-  // ignores un-concretizable broadcasts
+  // ignores non-concretizable broadcasts
   for (auto* in_tv : ir_utils::filterByType<TensorView>(e->inputs())) {
     const TensorDomain* in_td = in_tv->domain();
     std::vector<IterDomain*> in_root =
@@ -878,15 +886,16 @@ void ComputeAtRootDomainMapBuilder::mapPointwiseOrReductionOp(Expr* e) {
     for (const auto it : c10::irange(in_root.size())) {
       if (e->outputs().size() > 1) {
         TORCH_INTERNAL_ASSERT(
-            e->isA<WelfordOp>() || e->isA<GroupedReductionOp>(),
-            "Multi-output mapping assumes WelforddOp or GroupedReductionOp but, ",
+            e->isA<WelfordOp>() || e->isA<GroupedReductionOp>() ||
+                e->isA<GroupedWelfordOp>(),
+            "Unknown multi-output Expr type ",
             e->getExprType().value(),
             " is found");
-        for (auto o : e->outputs()) {
-          auto o_tv = o->as<TensorView>();
-          auto o_td = o_tv->domain();
-          auto o_root = o_td->getRootDomain();
-          setMaybeMapped(in_td, in_root[it], o_td, o_root[it]);
+        for (auto out : e->outputs()) {
+          auto out_tv = out->as<TensorView>();
+          auto out_td = out_tv->domain();
+          auto out_root = out_td->getRootDomain();
+          setMaybeMapped(in_td, in_root[it], out_td, out_root[it]);
         }
       } else {
         setMaybeMapped(in_td, in_root[it], out_td, out_root[it]);
@@ -1056,7 +1065,7 @@ void ComputeAtRootDomainMapBuilder::handle(TensorView* tv) {
     mapAllPendingMappings(td, id);
   }
 
-  // When tv has a rfactor domain, propagate the domain mappings from
+  // When tv has an rfactor domain, propagate the domain mappings from
   // each of the rfactor axes to the dependent root axes.
   if (td->hasViewLikeRFactor()) {
     std::unordered_set<Val*> root_set(
@@ -1114,7 +1123,7 @@ class ExactRootDomainMapBuilder : private IterVisitor {
       Fusion* fusion,
       DisjointSets<const IterDomain*>& eq_sets)
       : eq_sets_(eq_sets) {
-    traverseFrom(fusion, fusion->outputs());
+    traverseTo(fusion, fusion->outputs());
   }
 
  private:
diff --git a/torch/csrc/jit/codegen/cuda/root_domain_map.h b/torch/csrc/jit/codegen/cuda/root_domain_map.h
index cf2becbd1c718..fa3d323ba6d21 100644
--- a/torch/csrc/jit/codegen/cuda/root_domain_map.h
+++ b/torch/csrc/jit/codegen/cuda/root_domain_map.h
@@ -289,6 +289,8 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap {
       const TensorDomain* producer,
       const TensorDomain* consumer) const;
 
+  std::string toString() const;
+
  private:
   //! Returns if key_a and key(td_b, id_b) are mapped to eachother (equivalent),
   //! or are the same key.
@@ -331,8 +333,6 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap {
       const std::unordered_set<IterDomain*>& root_dims_to_map,
       bool producer_to_consumer) const override;
 
-  std::string toString() const;
-
  private:
   //! Disjoint set of all mapped <TD, ID> keys to determine axes equivalency
   DisjointSets<DomainKey, DomainKeyHash> eq_set_;
diff --git a/torch/csrc/jit/codegen/cuda/runtime/memory.cu b/torch/csrc/jit/codegen/cuda/runtime/memory.cu
index bc275ec1cc40a..e064a43090fd7 100644
--- a/torch/csrc/jit/codegen/cuda/runtime/memory.cu
+++ b/torch/csrc/jit/codegen/cuda/runtime/memory.cu
@@ -152,6 +152,31 @@ DEVICE_INLINE void cpAsync(
       "n"(byte_size));
 }
 
+// Global to SMEM load that is asynchronous,
+// not guaranteed to be completed until cpAsyncBarrier() is called.
+template <typename dtype, int len>
+DEVICE_INLINE void cpAsync(
+    Array<dtype, len, len>* smem_ptr,
+    void const* gmem_ptr,
+    bool predicate) {
+  unsigned smem_addr = util::toSmem(&(smem_ptr->array[0]));
+  constexpr int byte_size = sizeof(dtype) * len;
+
+  static_assert(
+      byte_size == 4 || byte_size == 8 || byte_size == 16,
+      "cp_async : unsupported byte size");
+
+  asm volatile(
+      "{\n"
+      "  .reg .pred p;\n"
+      "  setp.ne.b32 p, %3, 0;\n"
+      "@p cp.async.ca.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem_addr),
+      "l"(gmem_ptr),
+      "n"(byte_size),
+      "r"((int)predicate));
+}
+
 // TODO: Might have a different category of sync if we want to build out this:
 DEVICE_INLINE void cpAsyncBarrier() {
   asm volatile("cp.async.wait_all;");
diff --git a/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu b/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu
index 96cec63f8d9ee..75d39e7c0c4b6 100644
--- a/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu
+++ b/torch/csrc/jit/codegen/cuda/runtime/random_numbers.cu
@@ -67,3 +67,23 @@ __device__ double rng_uniform(const uint4& rng_result, int rng_component) {
 __device__ float rng_uniformf(const uint4& rng_result, int rng_component) {
   return uniformf((&rng_result.x)[rng_component]);
 }
+
+__device__ double rng_uniform_range(
+    const uint4& rng_result,
+    int rng_component,
+    double from,
+    double to) {
+  auto range = to - from;
+  auto uniform01 = rng_uniform(rng_result, rng_component);
+  return from + range * uniform01;
+}
+
+__device__ float rng_uniform_rangef(
+    const uint4& rng_result,
+    int rng_component,
+    float from,
+    float to) {
+  auto range = to - from;
+  auto uniform01 = rng_uniformf(rng_result, rng_component);
+  return from + range * uniform01;
+}
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h b/torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h
index 90e64a284086c..d01d226efe42b 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h
@@ -11,6 +11,7 @@ namespace cuda {
 
 enum class TORCH_CUDA_CU_API ScheduleHeuristic {
   None,
+  NoOp,
   PointWise,
   Reduction,
   Persistent,
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h
index c43ef64eac0a3..6453962bfec8a 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h
@@ -26,14 +26,18 @@ namespace HeuristicCompileTime {
 //! Enum for all possible types of cached entries of compile-time info.
 enum class CompileTimeEntryType {
   DOMAIN_MAP,
+  TRANSPOSE_DOMAIN_MAP,
   REFERENCE_TENSORS,
+  REFERENCE_TENSORS_FOR_GROUPS,
   VECTORIZABLE_INPUTS_AND_OUTPUTS,
   INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS,
   UNROLLABLE_INPUTS_AND_OUTPUTS,
   REDUCTION_TVS,
   PERSISTENT_BUFFER_INFO,
   SCOPE_PERSISTENT_FACTOR_INFO,
-  BROADCAST_BYTE_MULTIPLES
+  BROADCAST_BYTE_MULTIPLES,
+  INNER_MOST_DIMS_INFO,
+  CAN_SCHEDULE_TRANSPOSE,
 };
 
 //! Entry type definition class for `DOMAIN_MAP`,
@@ -45,6 +49,15 @@ class DomainMap {
       CompileTimeEntryType::DOMAIN_MAP;
 };
 
+//! Entry type definition class for `DOMAIN_MAP`,
+//!  stores the domain map of a fusion, used by transpose scheduler.
+class TransposeDomainMap {
+ public:
+  using DataType = pointwise_utils::DomainMap;
+  static const CompileTimeEntryType EntryType =
+      CompileTimeEntryType::TRANSPOSE_DOMAIN_MAP;
+};
+
 //! Entry type definition class for `REFERENCE_TENSORS`,
 //!  stores the the reference TensorViews used to schedule a fusion.
 class ReferenceTensors {
@@ -54,6 +67,16 @@ class ReferenceTensors {
       CompileTimeEntryType::REFERENCE_TENSORS;
 };
 
+//! Entry type definition class for `REFERENCE_TENSORS`,
+//!  stores the the reference TensorViews used to schedule a fusion, used by
+//!  transpose scheduler.
+class ReferenceTensorsForGroups {
+ public:
+  using DataType = std::vector<TensorView*>;
+  static const CompileTimeEntryType EntryType =
+      CompileTimeEntryType::REFERENCE_TENSORS_FOR_GROUPS;
+};
+
 //! Entry type definition class for `VECTORIZABLE_INPUTS_AND_OUTPUTS`,
 //!  stores the vectorizable TensorViews on a fusion's inputs and outputs.
 class VectorizableInputsAndOutputs {
@@ -99,6 +122,16 @@ class PersistentBufferInfo {
       CompileTimeEntryType::PERSISTENT_BUFFER_INFO;
 };
 
+//! Entry type definition class for `INNER_MOST_DIMS_INFO`,
+//!  Used in the transpose scheduler to store inner most IterDomains and their
+//!  position in reference1 of group 1 and group 2
+class InnerMostDimInfo {
+ public:
+  using DataType = std::vector<int64_t>;
+  static const CompileTimeEntryType EntryType =
+      CompileTimeEntryType::INNER_MOST_DIMS_INFO;
+};
+
 //! Auxiliary data types for `SCOPE_PERSISTENT_FACTOR_INFO` entry type.
 using ScopedPersistenceBufferMap = std::unordered_map<Val*, std::vector<bool>>;
 
@@ -121,11 +154,20 @@ class ScopePersistentFactorInfo {
 //!  information.
 class BroadcastMultiples {
  public:
-  using DataType = std::vector<scheduler_utils::BroadcastMultiple>;
+  using DataType = scheduler_utils::BroadcastMultipleInformation;
   static const CompileTimeEntryType EntryType =
       CompileTimeEntryType::BROADCAST_BYTE_MULTIPLES;
 };
 
+//! Entry type definition class for `CAN_SCHEDULE_TRANSPOSE`,
+//!  stores if the transpose scheduler can scheduler this fusion
+class CanScheduleTranspose {
+ public:
+  using DataType = bool;
+  static const CompileTimeEntryType EntryType =
+      CompileTimeEntryType::CAN_SCHEDULE_TRANSPOSE;
+};
+
 //! Base abstract class for unified storage in `HeuristicSummary`,
 //!  each entry in `HeuristicSummary` will be a subclass.
 class CompileTimeInfoBase : public PolymorphicBase {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/heuristic.h
index 058c72e592ad1..a828d66fdf039 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/heuristic.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/heuristic.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
+#include <torch/csrc/jit/codegen/cuda/utils.h>
 
 #include <string>
 
@@ -9,7 +10,7 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
-class HeuristicParams {
+class HeuristicParams : public PolymorphicBase {
  public:
   std::string tag = "";
 
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp b/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp
index d0adc2aef6261..1991cada00dda 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp
@@ -274,10 +274,10 @@ std::unordered_set<IterDomain*> getMmaDomainSet(
 //  optimizations.
 //
 // A concrete example:
-//  T0 [I0, I1, I2, I3, I4, I5] = mma(T1[I01, B11, B21, I31, I41, B51], T2[B02,
+//  T0 [I0, I1, I2, R3, I4, I5] = mma(T1[I01, B11, B21, I31, I41, B51], T2[B02,
 //  I12, B22, I32, I42, I52], {3};
 // In this case some example querries:
-//  K dimension of T0 = {I3}
+//  K dimension of T0 = {R3}
 //  M dimension of T1 = {I01}
 //  N dimension of T2 = {I52}
 //  etc.
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp b/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp
index bf6768536dc24..459974b8d2884 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp
@@ -909,7 +909,7 @@ TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getPersistentHeuristics(
   }
 
   // Try expanding vectorization to contig merged domains
-  vectorize_factor = scheduler_utils::expandVectorizationToContigMergedDomains(
+  vectorize_factor = vectorize_helper::expandVectorizationToContigMergedDomains(
       fusion,
       runtime_info,
       vectorizable_inputs_outputs,
@@ -992,6 +992,8 @@ TORCH_CUDA_CU_API void schedulePersistentKernel(
       scheduler_utils::getReductionTvs(fusion /*, ignore_trivial = true */);
 
   TORCH_INTERNAL_ASSERT(reduction_tvs.size());
+  // Registry assumes the reference tv is the first reduction_tv, if this
+  // changes registry needs to change.
   auto reduction_tv = reduction_tvs[0];
 
   auto dim_analysis = scheduler_utils::canonicalDimReduction(
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp
index d404ab622a5c7..b40e6fbf7cf7a 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp
@@ -1,7 +1,7 @@
 #include <torch/csrc/jit/codegen/cuda/scheduler/pointwise.h>
 
 #include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
 #include <torch/csrc/jit/codegen/cuda/instrumentation.h>
 #include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
 #include <torch/csrc/jit/codegen/cuda/ir_utils.h>
@@ -52,12 +52,6 @@ class DomainMap : public pointwise_utils::DomainMap {
     return result;
   }
 
-  static bool hasReferenceTensorView(Fusion* fusion) {
-    FusionGuard fg(fusion);
-    DomainMap domain_map(fusion);
-    return domain_map.findReferenceTensorView() != nullptr;
-  }
-
  private:
   bool hasMinimumSize(TensorView* tv, int num_axes) const {
     TORCH_INTERNAL_ASSERT(tv != nullptr);
@@ -141,13 +135,11 @@ std::shared_ptr<PointwiseParams> getPointwiseHeuristics(
     });
     vectorizable_inputs_outputs_entry.get();
 
-    auto broadcast_byte_multiples_entry =
-        HeuristicSummaryEntry<HeuristicCompileTime::BroadcastMultiples>(
-            data_cache, []() {
-              return std::make_unique<
-                  std::vector<scheduler_utils::BroadcastMultiple>>();
-            });
-    broadcast_byte_multiples_entry.get();
+    auto broadcast_info = HeuristicSummaryEntry<
+        HeuristicCompileTime::BroadcastMultiples>(data_cache, []() {
+      return std::make_unique<scheduler_utils::BroadcastMultipleInformation>();
+    });
+    broadcast_info.get();
     return std::make_shared<PointwiseParams>("Pointwise heuristics");
   }
 
@@ -183,25 +175,7 @@ std::shared_ptr<PointwiseParams> getPointwiseHeuristics(
 
   auto params = std::make_shared<PointwiseParams>("Pointwise heuristics");
 
-  /*
-   * 2D pointwise scheduling logic. What is expected is there's some
-   * broadcasting pattern which would make scheduling as a 2D problem more
-   * efficient than scheduling simply as a 1D problem.
-   *
-   * Mapping count holds how many bytes are in each dimension for both inputs
-   * and outputs relative to the reference tensor. What we're looking for is a
-   * break point in reference_tvs dimensions which separates the outer dimension
-   * and inner dimension of the problem mapped to 2D.
-   *
-   * break_point is computed assuming no reuse, ignoring parallelization
-   * limitations, and simply figures out which point best separates broadcasted
-   * dimensions. In other words, where's the point where we isolate the most
-   * broadcasted elements to one side.
-   *
-   * Once a break point is found, simply schedule the pointwise op as 2D
-   * balancing parallelization as best as possible.
-   */
-
+  // See pointwise.h to understand what we're doing for this 2D analysis.
   // Ideal break point location
   int break_point = 0;
 
@@ -230,16 +204,15 @@ std::shared_ptr<PointwiseParams> getPointwiseHeuristics(
   // break point.
   int64_t gdim_right = 1;
 
-  auto broadcast_byte_multiples_entry =
-      HeuristicSummaryEntry<HeuristicCompileTime::BroadcastMultiples>(
-          data_cache, [&largest_out, &index_type]() {
-            return std::make_unique<
-                std::vector<scheduler_utils::BroadcastMultiple>>(
-                scheduler_utils::getBroadcastMultiples(
-                    largest_out, index_type));
-          });
+  auto broadcast_info = HeuristicSummaryEntry<
+      HeuristicCompileTime::BroadcastMultiples>(
+      data_cache, [&largest_out, &index_type]() {
+        return std::make_unique<scheduler_utils::BroadcastMultipleInformation>(
+            scheduler_utils::getBroadcastMultiples(largest_out, index_type));
+      });
 
-  auto& broadcast_byte_multiples = broadcast_byte_multiples_entry.get();
+  auto& view_disjoint_sets = broadcast_info.get().view_disjoint_set_ids;
+  auto& broadcast_byte_multiples = broadcast_info.get().broadcast_multiples;
 
   TORCH_INTERNAL_ASSERT(broadcast_byte_multiples.size() == ref_root.size());
 
@@ -266,6 +239,12 @@ std::shared_ptr<PointwiseParams> getPointwiseHeuristics(
       int64_t min_total_transfer = std::numeric_limits<int64_t>::max();
 
       for (const auto break_point_i : c10::irange(ref_root.size())) {
+        // If break point is incoherent with view, don't consider breaking here.
+        if (!scheduler_utils::breakIsDisjoint(
+                view_disjoint_sets, break_point_i)) {
+          continue;
+        }
+
         // Number of elements in the right side of reference tv with
         // break_point_i
         int64_t cur_right_elem_count = 1;
@@ -362,8 +341,10 @@ std::shared_ptr<PointwiseParams> getPointwiseHeuristics(
   }
 
   // Try expanding vectorization to contig merged domains
+  // TODO: This is an expensive function that shouldn't be in heuristics without
+  // caching.
   auto expanded_vector_word_size =
-      scheduler_utils::expandVectorizationToContigMergedDomains(
+      vectorize_helper::expandVectorizationToContigMergedDomains(
           fusion,
           runtime_info,
           vectorizable_inputs_outputs,
@@ -435,8 +416,15 @@ LaunchParams schedulePointwise(
   return params->lparams;
 }
 
+TensorView* getReferenceTensorView(Fusion* fusion) {
+  FusionGuard fg(fusion);
+  DomainMap domain_map(fusion);
+  auto reference_tv = domain_map.findReferenceTensorView();
+  return reference_tv;
+}
+
 bool hasReferenceTensorView(Fusion* fusion) {
-  return DomainMap::hasReferenceTensorView(fusion);
+  return getReferenceTensorView(fusion) != nullptr;
 }
 
 // TODO: Inline intermediate operations (avoid inlining unrolled/vectorized
@@ -487,41 +475,142 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) {
     return;
   }
 
-  DomainMap domain_map(fusion);
-  TensorView* reference_tv =
-      domain_map.findReferenceTensorView(params.break_point);
+  TensorView* reference_tv = getReferenceTensorView(fusion);
 
   TORCH_INTERNAL_ASSERT(
       reference_tv != nullptr,
       "Could not find a fully broadcasted output to reference schedule on.");
 
-  auto all_tvs = ir_utils::allTvs(fusion);
-
-  // Merge right side of break point
+  // Positions of rhs and lhs after merging all dimensions.
   int rhs_i = -1;
-  for (int i = (int)reference_tv->nDims(); i > (int)params.break_point; i--) {
-    auto axis_i = i - 1;
-    if (rhs_i == -1) {
-      rhs_i = axis_i;
-    } else {
-      reference_tv->merge(axis_i, rhs_i);
-      rhs_i = axis_i;
+  int lhs_i = -1;
+
+  auto view_ops = ir_utils::getViewOps(fusion);
+
+  /*
+   * If there's no path from reference through producer paths only to a view,
+   * e.g.: input
+   *      /  \
+   *   view reference
+   *    /
+   * output
+   *
+   * we need to propagate the view transformations to the reference tv before
+   * scheduling the reference tv. Since view ops have to be identical, if any
+   * path from reference tv through producers goes through a view, all paths
+   * from reference tv's to views should be through producers.
+   */
+  bool needs_view_prop =
+      view_ops.size() > 0 &&
+      !std::any_of(
+          view_ops.begin(), view_ops.end(), [&reference_tv](ViewOp* view) {
+            return DependencyCheck::isDependencyOf(view->out(), reference_tv) ||
+                view->out()->sameAs(reference_tv);
+          });
+
+  if (needs_view_prop) {
+    auto first_view_op = *view_ops.begin();
+
+    // Propagate the view transformations
+    TransformPropagator propagator(first_view_op->out());
+    MaxRootDomainInfoSpanningTree spanning_tree(first_view_op->out());
+    spanning_tree.traverse(&propagator);
+
+    // Reorder reference_tv after propagating the view operation. This will
+    // reorder for better merging.
+    reference_tv->reorder(
+        scheduler_utils::domainReorderAsRfactorMap(reference_tv));
+
+    // Break point is relative to rfactor domain, find the leaf domain ID's in
+    // the left/right side, we really need the values in domain, but easiest way
+    // to do this is with Dependency check which will grab all intermediate
+    // values too.
+    auto lhs_all_vals = DependencyCheck::getAllValsBetween(
+        {reference_tv->getMaybeRFactorDomain().begin(),
+         reference_tv->getMaybeRFactorDomain().begin() + params.break_point},
+        {reference_tv->domain()->domain().begin(),
+         reference_tv->domain()->domain().end()});
+
+    std::unordered_set<Val*> lhs_all_vals_set(
+        lhs_all_vals.begin(), lhs_all_vals.end());
+
+    auto rhs_all_vals = DependencyCheck::getAllValsBetween(
+        {reference_tv->getMaybeRFactorDomain().begin() + params.break_point,
+         reference_tv->getMaybeRFactorDomain().end()},
+        {reference_tv->domain()->domain().begin(),
+         reference_tv->domain()->domain().end()});
+
+    std::unordered_set<Val*> rhs_all_vals_set(
+        rhs_all_vals.begin(), rhs_all_vals.end());
+
+    // Make sure lhs and rhs groups are disjoint.
+    for (auto lhs_val : lhs_all_vals) {
+      TORCH_INTERNAL_ASSERT(
+          rhs_all_vals_set.count(lhs_val) == 0,
+          "Error in pointwise scheduler. LHS and RHS of the 2D scheduler are not disjoint.");
     }
-  }
-  if (rhs_i >= 0) {
-    // If there's an rhs
-    reference_tv->reorder({{rhs_i, -1}});
-  }
 
-  // Merge left side of break point
-  int lhs_i = -1;
-  for (int i = (int)params.break_point; i > 0; i--) {
-    auto axis_i = i - 1;
-    if (lhs_i == -1) {
-      lhs_i = axis_i;
-    } else {
-      reference_tv->merge(axis_i, lhs_i);
-      lhs_i = axis_i;
+    // Merge rhs, then lhs.
+    IterDomain* rhs_id = nullptr;
+    IterDomain* lhs_id = nullptr;
+    auto ndims = reference_tv->nDims();
+    for (auto i : c10::irange(ndims)) {
+      // Merge from right to left
+      auto pos = ndims - 1 - i;
+      auto id = reference_tv->axis(pos);
+      if (lhs_all_vals_set.count(id) > 0) {
+        if (lhs_id == nullptr) {
+          lhs_id = id;
+          lhs_i = pos;
+        } else {
+          reference_tv->merge(pos, lhs_i);
+          lhs_i = pos;
+          if (rhs_i > lhs_i) {
+            rhs_i--;
+          }
+        }
+      } else if (rhs_all_vals_set.count(id) > 0) {
+        if (rhs_id == nullptr) {
+          rhs_id = id;
+          rhs_i = pos;
+        } else {
+          reference_tv->merge(pos, rhs_i);
+          rhs_i = pos;
+          if (lhs_i > rhs_i) {
+            lhs_i--;
+          }
+        }
+      }
+    }
+    // Find the iter domains that should be in the lhs, and rhs.
+  } else {
+    // Don't need to worry about view transformations, just merge reference tv
+    // as we normally would.
+
+    // Merge right side of break point
+    for (int i = (int)reference_tv->nDims(); i > (int)params.break_point; i--) {
+      auto axis_i = i - 1;
+      if (rhs_i == -1) {
+        rhs_i = axis_i;
+      } else {
+        reference_tv->merge(axis_i, rhs_i);
+        rhs_i = axis_i;
+      }
+    }
+    if (rhs_i >= 0) {
+      // If there's an rhs
+      reference_tv->reorder({{rhs_i, -1}});
+    }
+
+    // Merge left side of break point
+    for (int i = (int)params.break_point; i > 0; i--) {
+      auto axis_i = i - 1;
+      if (lhs_i == -1) {
+        lhs_i = axis_i;
+      } else {
+        reference_tv->merge(axis_i, lhs_i);
+        lhs_i = axis_i;
+      }
     }
   }
 
@@ -716,9 +805,9 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) {
   // get a higher position in later inline propagation. We need this separate
   // step because we were not using ParallelType::Unroll, so we have to do
   // unrolling manually.
-  InlinePropagator inline_unswitch(
-      reference_tv, unswitch_pos, ComputeAtMode::BestEffort);
-  spanning_tree.traverse(&inline_unswitch);
+  inlineAllAt(reference_tv, unswitch_pos, true);
+
+  auto all_tvs = ir_utils::allTvs(fusion);
 
   // Inline at the inner most position. The CA position of all tensors except
   // inputs, cached inputs and outputs will be updated.
@@ -731,9 +820,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) {
     auto output = entry.second;
     inner_most_tensors.erase(output);
   }
-  InlinePropagator inline_inner_most(
-      reference_tv, -1, ComputeAtMode::BestEffort, inner_most_tensors);
-  spanning_tree.traverse(&inline_inner_most);
+  inlineMost(inner_most_tensors);
 }
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h
index 6cba29cd6b4b9..f3a1da7bcff5f 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h
@@ -10,6 +10,141 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
+/*
+ * The 2D pointwise scheduling logic is a bit interesting. We'll start by giving
+ * motivation for what the scheduling is attempting to do. What we're going to
+ * do with the scheduling is attempt to make it two dimensional in a way that
+ * minimizes the refetching of broadcasted dimensions. If we think of the
+ * trivial case:
+ * T0[i0, b1]
+ * T1[b0, i1]
+ * T2[i0, i1] = T0 + T1
+ * If we scheduled T2 as 1-dimensional we would do something along the lines of
+ * merging i0 and i1 then splitting out a block and thread dimension. If i1 is
+ * greater than the thread dimension, then all threads would pull the same value
+ * from T0. However, they would all be pulling different values from T1. In this
+ * case we have perfect reuse of the broadcast dimension T0 but potentially no
+ * reuse of the broadcast dimension of T1. "Potentially" because if i1 isn't too
+ * big it should be efficiently cached in L2. If i1 is big, then by the time we
+ * increment the i0 dimension the i1 dimension will be pushed out of cache.
+ *
+ * Instead what we do is we map this to a two dimensional problem. Instead of
+ * having the schedule that merges the two dimensions, we'll actually leave the
+ * dimensions separate and we'll take i0, split it to BIDy, TIDy, and take i1
+ * and split it to BIDx and TIDx. Therefore we'll have a parallelization on T2
+ * like [BIDy, TIDy | BIDx, TIDx], where | denotes the separation of the
+ * original i0 and i1. This helps because all threads in the TIDx dimension will
+ * reuse the same value in the i0 dimension (holding BIDy and TIDy constant),
+ * all the threads in the TIDy dimension (holding BIDx, and TIDx constant) will
+ * reuse the same value in the i1 dimension. This reuse of values reduces the
+ * number of redundant values pulled from T0 and T1. The same thing can be said
+ * for when incrementing BIDy, but since BIDy is strided on BIDx there's no
+ * effective increment of BIDy without incrementing BIDx. Since all threads are
+ * executed within a block we can effectively consider the block incrementing
+ * TIDx BDIMx times while holding TIDy constant and incrementing TIDy BDIMy
+ * times while holding TIDx constant. Since multiple BIDx's are running at the
+ * same time on the device we can consider a wave on the GPU of incrementing
+ * BIDx (wave number of times), while holding TIDy constant BDIMy * wave number
+ * of times.
+ *
+ * If instead we have a situation like:
+ * T0[i0, i1, b2]
+ * T1[i0, b1, i2]
+ * T2[i0, i1, i2] = T0 + T1
+ * It makes sense that the break point would be in position 2, between i1 and
+ * i2. This is because when we map [i0, i1 | i2] to [BIDy, TIDy| BIDx, TIDx]
+ * BIDx, and TIDx will access the same elements of T0 on b2, and TIDy will
+ * likely access the same elements of T1 (as long as i1 > BDIMy). Even if i1 on
+ * the order of BDIMy we'll only access ~two unique elements per increment of
+ * BIDx or TIDx. This means we'll still reuse many of the same values and limit
+ * the amount we need to read duplicate values in T0 and T1.
+ *
+ * If instead we have:
+ * T0[i0, b1, i2]
+ * T1[b0, i1, i2]
+ * T2[i0, i1, i2] = T0 + T1
+ * The analysis gets a bit more complicated. First if i2 is very large and i0
+ * and i1 are relatively small it would make sense to have [i0, i1 | i2]. If b0
+ * is very small it's unlikely beneficial to have [i0 | i1, i2] as there would
+ * be small reuse on b0, and potentially no reuse on b1. If i2 is very small it
+ * may be worthwhile to have [i0 | i1, i2]. If i1 and i2 are not small, and
+ * their product is relatively large (i.e. you can't fit T2[i, :, :] in L2) then
+ * it's unlikely we'll get any significant reuse across i0.
+ *
+ * What we should (but don't due to complexity) assume then, is that we will get
+ * strong reuse across TIDx and TIDy for dimensions that are on the inner
+ * portion of the 2D tile.
+ *
+ * For example if we have:
+ * T0[i0, b1, i2]
+ * T1[b0, b1, i2]
+ * T2[b0, i1, i2]
+ * T3[i0, i1, i2] = T0 + T1 + T2
+ * We may want to break point at position 1 or position 2 (i.e. [i0 | i1, i2] or
+ * [i0, i1 | i2]). We can't immediately tell from the structure.
+ *
+ * If we choose [i0, i1 | i2] then we'll get:
+ * Strong reuse of T0 on TIDy (b1 dim)
+ * Perfect reuse across T1 on TIDy (b0 and b1)
+ * If BIDx is bound to the LHS of the tile we'll get:
+ * Maybe strong reuse of T0 on BIDx (b1 dim if it's large)
+ * Perfect reuse across T1 on BIDx
+ * Potentially no reuse on T2 if i1 is very large
+ *
+ * If we pick [i0 | i1, i2], then we'll get:
+ * We'll perfect reuse across TIDy on T1 and T2 on b0
+ * Some reuse on T0 and T1 on b1 across BIDx if i2 is relatively small and BIDx
+ * is bound to the RHS of the 2D schedule Perfect reuse on T1 and T2 on b0
+ * across BIDx if BIDx is bound to the LHS of the 2D schedule
+ *
+ * Materializing these benefits is dependent on the decisions the scheduler
+ * makes when parallelizing the problem. The heuristics logic at the moment is
+ * fairly simplistic where it assumes that there's only reuse across the break
+ * points for tensors that have no iteration domain on the entire side of the
+ * breakpoint. This is not optimal but for the time being it seems sufficient.
+ * We would ideally take into consideration the parallelization scheme and
+ * partial broadcasting on the lhs or rhs.
+ *
+ * An example of how this analysis is done is given the DAG:
+ * T0[i0, i1, b2] float
+ * T1[i0, b1, i2] half
+ * T2[i0, b1, i2] = cast(T1, float)
+ * T4[i0, i1, i2] float = T0 + T2
+ * With values of 10, 100, 1000 as [i0, i1, i2]
+ * Our break point analysis for positions 0, 1, 2, 3 will be:
+ *
+ * 0: 10*10 * 100*10 * 1000*10 = 1e9
+ * 1: 10*10 * 100*10 * 1000*10 = 1e9
+ * 2: 10*10 * 100*10 * 1000*6  = 6e8
+ * 3: 10*10 * 100*10 * 1000*10 = 1e9
+ *
+ * Where for each computation the LHS of the * pairs is the number of elements
+ * in that dimension on the reference and the RHS of the * pairs is the
+ * broadcast multiple where any tensor that has all broadcasts on the rhs or lhs
+ * of the break point doesn't contribute to the broadcast multiple of the rhs or
+ * lhs.
+ *
+ * So we'll pick position 2 since we're confident we can get broadcast reuse on
+ * the rhs of tensor 0. As already mentioned this is a pretty big
+ * simplification/assumption and in reality it may be harder/easier to take
+ * advantage of broadcast on the inner or outer dimension. This is a reasonable
+ * way to make relative decisions on break points, however, this computation is
+ * ont doing an effective estimate of actual DRAM transfers which it should be
+ * modified to do so.
+ *
+ * For view schedules there can be some incoherent break points for example:
+ * T1[i0, i1*i2] = view(T0[i0, i1, i2])
+ * would make the position 2 "incoherent". In otherwords we cannot replay
+ * through the view a schedule that tries to merge i0 and i1, without i2. So for
+ * positions that are incoherent we won't consider break point positions there.
+ *
+ * See FusionBroadcastViewMultiples_CUDA for what we expect with view handling.
+ * Shortly any dimensions that are inputs or outputs of view transformations are
+ * considered together, since it's hard to account for partial dimensions that
+ * are being broadcasted. So for view it's primarily an all or nothing situation
+ * when it comes to the 2D pointwise scheduler.
+ */
+
 class SchedulerRuntimeInfo;
 class HeuristicSummary;
 
@@ -36,6 +171,9 @@ TORCH_CUDA_CU_API LaunchParams schedulePointwise(
 //!  the pointwise scheduler.
 bool hasReferenceTensorView(Fusion* fusion);
 
+// Return reference tensor view.
+TensorView* getReferenceTensorView(Fusion* fusion);
+
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
index 7947a27f48360..6cc4b1b8b93bd 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
@@ -20,10 +20,6 @@ class DomainMap {
   }
   virtual ~DomainMap() = default;
 
-  bool areExactMapped(IterDomain* id1, IterDomain* id2) const {
-    return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT);
-  }
-
   const ComputeAtMap& getComputeAtMap() const {
     return ca_map_;
   }
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp b/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp
index b5940b1d4e1cb..3037f8469dad4 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp
@@ -954,7 +954,7 @@ TORCH_CUDA_CU_API std::shared_ptr<ReductionParams> getReductionHeuristics(
   }
 
   // Try expanding vectorization to contig merged domains
-  vectorize_factor = scheduler_utils::expandVectorizationToContigMergedDomains(
+  vectorize_factor = vectorize_helper::expandVectorizationToContigMergedDomains(
       fusion,
       runtime_info,
       vectorizable_inputs_outputs,
@@ -1010,6 +1010,8 @@ void scheduleReduction(Fusion* fusion, const ReductionParams& rparams) {
 
   TORCH_INTERNAL_ASSERT(reduction_tvs.size());
 
+  // Registry assumes the reference tv is the first reduction_tv, if this
+  // changes registry needs to change.
   auto reduction_tv = reduction_tvs[0];
 
   auto dim_analysis = scheduler_utils::canonicalDimReduction(
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp b/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp
index 6bd4d4efba376..ae9ecd88bbdc3 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp
@@ -1,7 +1,7 @@
 #include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
 
 #include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
 #include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
 #include <torch/csrc/jit/codegen/cuda/ir_utils.h>
 #include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
@@ -336,14 +336,7 @@ void multiReductionInliner(
       scheduler_utils::getTrivialReductionMap(fusion);
 
   // Inline the schedule
-  InlinePropagator inline_propagator(
-      reference_tv,
-      -1,
-      ComputeAtMode::MostInlined,
-      {},
-      mapped_to_trivial_reduction);
-
-  MaxRootDomainInfoSpanningTree(reference_tv).traverse(&inline_propagator);
+  inlineMost(mapped_to_trivial_reduction);
 }
 
 namespace {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
index 8550bfc6bf0fa..5d5bc84ef3b4d 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -358,6 +358,45 @@ class SchedulerTopologyChecker {
 
     return true;
   }
+
+  /* Returns if any non-trivial views are not before the reference. For example:
+   *     t0
+   *    /  \
+   *  view ref
+   *   |
+   *   t1
+   * This could be important as transform propagation from a reference backwards
+   * through a view should always work, but transform propagation form a
+   * reference forward through a view could interfere with the view transforms.
+   */
+  static bool hasViewNotBeforeRef(
+      Fusion* fusion,
+      std::vector<TensorView*> reference_tvs) {
+    std::vector<TensorView*> view_tvs;
+    auto view_ops = ir_utils::getViewOps(fusion);
+    for (auto view_op : view_ops) {
+      auto tv_outs = ir_utils::filterByType<TensorView>(view_op->outputs());
+      for (auto entry : tv_outs) {
+        view_tvs.push_back(entry);
+      }
+    }
+
+    if (view_tvs.empty()) {
+      return false;
+    }
+
+    // Terrible complexity, may be worth improving, but is a compile time
+    // check.
+    for (auto ref_tv : reference_tvs) {
+      for (auto view_tv : view_tvs) {
+        if (!DependencyCheck::isDependencyOf(view_tv, ref_tv)) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
 };
 
 bool isConnectedFusionGraph(Fusion* fusion) {
@@ -369,6 +408,11 @@ bool isConnectedFusionGraph(Fusion* fusion) {
   // A set of connected components on the fusion graph
   DisjointSets<Val*> component_sets;
 
+  TORCH_INTERNAL_ASSERT(
+      !fusion->outputs().empty(), "Fusion without output is not supported");
+  auto output0 = fusion->outputs()[0];
+  component_sets.initializeSet(output0);
+
   // Iterate through all used exprs
   for (auto expr : fusion->exprs()) {
     TORCH_INTERNAL_ASSERT(
@@ -394,7 +438,6 @@ bool isConnectedFusionGraph(Fusion* fusion) {
   //  If there is no independent compute flow
   // on this fusion graph, all outputs will be
   // equivalent/connected to the first output.
-  auto output0 = fusion->outputs()[0];
   for (auto output : fusion->outputs()) {
     if (!component_sets.strictAreMapped(output0, output)) {
       return false;
@@ -420,6 +463,24 @@ void SchedulerRuntimeInfo::initialize(
       auto fusion_inp = complete_fusion_->inputs()[inp_i];
       auto data_ptr = tensor_arg_abstract->getPointer();
       input_ptrs_[fusion_inp] = (size_t)data_ptr;
+
+      // find and push discontiguous stride
+      auto dtype_size = dataTypeSize(tensor_arg_abstract->getDataType());
+      input_discontig_strides_[fusion_inp] = {};
+      auto dims = tensor_arg_abstract->getRank();
+      auto expected_stride = 1;
+      for (auto dim = dims - 1; dim >= 0; dim--) {
+        auto size = tensor_arg_abstract->getSize(dim);
+        if (size <= 1) {
+          continue;
+        }
+        auto stride = tensor_arg_abstract->getStride(dim);
+        if (stride != expected_stride) {
+          input_discontig_strides_[fusion_inp].push_back(stride * dtype_size);
+          expected_stride = stride;
+        }
+        expected_stride *= size;
+      }
     }
   }
 
@@ -486,6 +547,13 @@ size_t SchedulerRuntimeInfo::getAlignmentSize(TensorView* tv) {
   }
 
   auto alignment_size = SchedulerRuntimeInfo::computeAlignmentSize(ptrOf(tv));
+  auto strides_it = input_discontig_strides_.find(tv);
+  if (strides_it != input_discontig_strides_.end()) {
+    for (auto stride : strides_it->second) {
+      alignment_size = std::min(
+          alignment_size, SchedulerRuntimeInfo::computeAlignmentSize(stride));
+    }
+  }
   alignment_map_[tv] = alignment_size;
   return alignment_size;
 }
@@ -746,8 +814,7 @@ static bool checkPatternEquivalence(
 // being broadcasted to one size multiple times or different sizes. This is a
 // hard to optimize problem and likely indicates we shouldn't be fusing.
 bool hasNonUniqueBcast(Fusion* fusion) {
-  ConcretizedBroadcastDomains concretize_info;
-  concretize_info.build(fusion);
+  ConcretizedBroadcastDomains concretize_info(fusion);
 
   for (auto tv : ir_utils::allTvs(fusion)) {
     for (auto id : tv->getRootDomain()) {
@@ -788,6 +855,119 @@ bool hasNonUniqueBcast(Fusion* fusion) {
 //!        This function will be called when compiling a kernel. It should apply
 //!        scheduling to the given fusion
 
+//! NoOp scheduler represents the case where scheduler will
+//!  not do any scheduling operations and forward the un-scheduled
+//!  fusion directly to code generation and kernel compilation.
+//!
+//! Typical use case of this scheduler is to handle edge cases
+//!  such as where all tensors are size-1 or size-0.
+class NoOpScheduler : public SchedulerEntry {
+  //! Provides a dummy heuristic type to ensure
+  //!  unified interface on NoOp scheduler.
+  class NoOpHeuristic : public HeuristicParams {
+   public:
+    size_t hash() const override {
+      return 0;
+    }
+    std::shared_ptr<HeuristicParams> clone() const override {
+      return std::make_shared<NoOpHeuristic>();
+    }
+    bool sameAs(const std::shared_ptr<HeuristicParams>& other) const override {
+      auto other_casted = std::dynamic_pointer_cast<ReductionParams>(other);
+      return other_casted != nullptr;
+    };
+  };
+
+ public:
+  explicit NoOpScheduler(
+      Fusion* fusion,
+      SchedulerRuntimeInfo& runtime_info,
+      HeuristicSummary* data_cache = nullptr)
+      : SchedulerEntry(ScheduleHeuristic::NoOp) {
+    params_ = std::make_shared<NoOpHeuristic>();
+  }
+
+  //! Check if the no-op heuristics apply in given fusion
+  static bool canScheduleCompileTime(Fusion* fusion) {
+    // Check there're no non-trivial reduction ops.
+    for (auto reduction :
+         ir_utils::getReductionOps(fusion, true /* ignore_trivial */)) {
+      for (auto input :
+           ir_utils::filterByType<TensorView>(reduction->inputs())) {
+        auto root_dom = input->getRootDomain();
+        auto all_nonzero =
+            std::none_of(root_dom.begin(), root_dom.end(), [](IterDomain* id) {
+              return id->extent()->isZeroInt();
+            });
+        if (all_nonzero) {
+          scheduler_debug_utils::canScheduleRejectReason(
+              ScheduleHeuristic::NoOp,
+              "reduction of non-zero elements is not supported");
+          return false;
+        }
+      }
+    }
+
+    // Check that all outputs are either broadcast or ignored reduction.
+    for (auto out_tv : ir_utils::filterByType<TensorView>(fusion->outputs())) {
+      auto non_zero_candidate_dimension = TensorDomain::noReductions(
+          TensorDomain::noBroadcasts(out_tv->domain()->domain()));
+
+      // non_zero_candidate_dimension is empty would mean this out tv has only
+      //  broadcast and trivial reduction axes, and this out tv would not
+      //  require scheduling ops.
+      // If any of the dimensions in non_zero_candidate_dimension is compile
+      // time
+      //  constant zero, this out tv also does not require any scheduling
+      //  operation as it is essentially a scalar.
+      // TODO:
+      // There seems to be a runtime component to it
+      //  too, i.e. if the runtime sizes are zero, then we should
+      //  handle it through null scheduler.
+      if (!non_zero_candidate_dimension.empty() &&
+          std::none_of(
+              non_zero_candidate_dimension.begin(),
+              non_zero_candidate_dimension.end(),
+              [](IterDomain* id) { return id->extent()->isZeroInt(); })) {
+        // We have found a out_tv with a dimension that NoOp scheduler couldn't
+        //  handle and therefore reject this fusion.
+        scheduler_debug_utils::canScheduleRejectReason(
+            ScheduleHeuristic::NoOp, "output has a concrete dimension");
+        return false;
+      }
+    }
+
+    // We have verified that all iterdomains on all output tv's are trivial
+    // reductions,
+    //  broadcasts or zero-sized. Therefore accepting this fusion for NoOp
+    //  scheduling.
+    return true;
+  }
+
+  static bool canScheduleRunTime(
+      Fusion* fusion,
+      SchedulerRuntimeInfo& runtime_info,
+      HeuristicSummary* data_cache = nullptr) {
+    // TODO:
+    //  Pipe through dynamic zero checks.
+    return true;
+  }
+
+  void schedule(Fusion* fusion) override {
+    // Schedule is no-op.
+    return;
+  }
+
+ private:
+  void computeHeuristics(
+      Fusion* fusion,
+      SchedulerRuntimeInfo& runtime_info,
+      HeuristicSummary* data_cache = nullptr) {
+    // Heuristics is no-op.
+    return;
+  }
+};
+
 class ReductionScheduler : public SchedulerEntry {
  public:
   explicit ReductionScheduler(
@@ -838,6 +1018,17 @@ class ReductionScheduler : public SchedulerEntry {
       return false;
     }
 
+    // Persistent scheduler simply uses reduction_tvs[0] as the reference, if
+    // that changes, this needs to be changed. Second check here may be overly
+    // conservative.
+    if (SchedulerTopologyChecker::hasViewNotBeforeRef(
+            fusion, {reduction_tvs[0]}) ||
+        !scheduler_utils::allMatchingViews(fusion)) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Reduction, "Unsupported view fusion.");
+      return false;
+    }
+
     // Make sure reduction axes are consistent through the fusion
     auto reduction_ops =
         ir_utils::getReductionOps(fusion, false /* ignore_trivial */);
@@ -937,6 +1128,84 @@ class ReductionScheduler : public SchedulerEntry {
   }
 };
 
+class TransposeScheduler : public SchedulerEntry {
+ public:
+  explicit TransposeScheduler(
+      Fusion* fusion,
+      SchedulerRuntimeInfo& runtime_info,
+      HeuristicSummary* data_cache = nullptr)
+      : SchedulerEntry(ScheduleHeuristic::Transpose) {
+    computeHeuristics(fusion, runtime_info, data_cache);
+  }
+
+  static bool canScheduleCompileTime(Fusion* fusion) {
+    // Temporarily disallow view in transpose scheduler
+    // TODO Add more testing before enabling
+    auto view_tvs = scheduler_utils::getViewTVs(fusion);
+    if (view_tvs.size() > 0) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Transpose, "No support for view op");
+      return false;
+    }
+
+    if (!hasAtLeastTwoValidGroups(fusion)) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Transpose,
+          "cannot find two mismatching inner most dimensions");
+      return false;
+    }
+
+    // TODO: add support for trivial reduction
+    auto reduction_ops =
+        ir_utils::getReductionOps(fusion, false /* ignore_trivial */);
+
+    if (!reduction_ops.empty()) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Transpose, "no support for reduction ops");
+      return false;
+    }
+
+    if (hasNonUniqueBcast(fusion)) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Transpose,
+          "Broadcasting dimension might be broadcasting to multiple sizes.");
+      return false;
+    }
+
+    return true;
+  }
+
+  static bool canScheduleRunTime(
+      Fusion* fusion,
+      SchedulerRuntimeInfo& runtime_info,
+      HeuristicSummary* data_cache = nullptr) {
+    FUSER_PERF_SCOPE("TransposeScheduler::canScheduleRunTime");
+
+    auto reason =
+        getTransposeRuntimeRejectReason(fusion, data_cache, runtime_info);
+    if (!reason.empty()) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Transpose, reason);
+      return false;
+    }
+    return true;
+  }
+
+  void schedule(Fusion* fusion) override {
+    FUSER_PERF_SCOPE("Schedule Transpose Fusion");
+    scheduleTranspose(fusion, transposeParams());
+  }
+
+ private:
+  void computeHeuristics(
+      Fusion* fusion,
+      SchedulerRuntimeInfo& runtime_info,
+      HeuristicSummary* data_cache = nullptr) {
+    params_ = getTransposeHeuristics(fusion, runtime_info, data_cache);
+    TORCH_INTERNAL_ASSERT(params_ != nullptr);
+  }
+};
+
 class PointWiseScheduler : public SchedulerEntry {
  public:
   explicit PointWiseScheduler(
@@ -957,6 +1226,14 @@ class PointWiseScheduler : public SchedulerEntry {
       return false;
     }
 
+    if (!scheduler_utils::allMatchingViews(fusion) &&
+        SchedulerTopologyChecker::hasViewNotBeforeRef(
+            fusion, {getReferenceTensorView(fusion)})) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::PointWise, "Unsupported view fusion.");
+      return false;
+    }
+
     auto reduction_ops =
         ir_utils::getReductionOps(fusion, true /* ignore_trivial */);
 
@@ -980,6 +1257,18 @@ class PointWiseScheduler : public SchedulerEntry {
       Fusion* fusion,
       SchedulerRuntimeInfo& runtime_info,
       HeuristicSummary* data_cache = nullptr) {
+    auto can_schedule_transpose_entry =
+        HeuristicSummaryEntry<HeuristicCompileTime::CanScheduleTranspose>(
+            data_cache, [fusion]() {
+              return std::make_unique<bool>(
+                  TransposeScheduler::canScheduleCompileTime(fusion));
+            });
+    if (can_schedule_transpose_entry.get()) {
+      auto reason =
+          getTransposeRuntimeRejectReason(fusion, data_cache, runtime_info);
+      return !reason.empty();
+    }
+
     return true;
   }
 
@@ -1047,6 +1336,16 @@ class PersistentKernelScheduler : public SchedulerEntry {
       return false;
     }
 
+    // Persistent scheduler simply uses reduction_tvs[0] as the reference, if
+    // that changes, this needs to be changed. Second check here may be overly
+    // conservative.
+    if (SchedulerTopologyChecker::hasViewNotBeforeRef(
+            fusion, {reduction_tvs[0]}) ||
+        !scheduler_utils::allMatchingViews(fusion)) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Persistent, "Unsupported view fusion.");
+    }
+
     if (findTransposeOps(fusion).size() > 0) {
       // Use pointwise logic
       scheduler_debug_utils::canScheduleRejectReason(
@@ -1216,84 +1515,10 @@ class PersistentKernelScheduler : public SchedulerEntry {
   }
 };
 
-class TransposeScheduler : public SchedulerEntry {
- public:
-  explicit TransposeScheduler(
-      Fusion* fusion,
-      SchedulerRuntimeInfo& runtime_info,
-      HeuristicSummary* data_cache = nullptr)
-      : SchedulerEntry(ScheduleHeuristic::Transpose) {
-    computeHeuristics(fusion, runtime_info, data_cache);
-  }
-
-  static bool canScheduleCompileTime(Fusion* fusion) {
-    if (!isOptionEnabled(EnableOption::TransposeScheduler)) {
-      scheduler_debug_utils::canScheduleRejectReason(
-          ScheduleHeuristic::Transpose, "not enabled");
-      return false;
-    }
-
-    // Temporarily disallow view in transpose scheduler
-    // TODO Add more testing before enabling
-    auto view_tvs = scheduler_utils::getViewTVs(fusion);
-    if (view_tvs.size() > 0) {
-      scheduler_debug_utils::canScheduleRejectReason(
-          ScheduleHeuristic::Transpose, "No support for view op");
-      return false;
-    }
-
-    if (!hasAtLeastTwoValidGroups(fusion)) {
-      scheduler_debug_utils::canScheduleRejectReason(
-          ScheduleHeuristic::Transpose,
-          "cannot find two mismatching inner most dimensions");
-      return false;
-    }
-
-    // TODO: add support for trivial reduction
-    auto reduction_ops =
-        ir_utils::getReductionOps(fusion, false /* ignore_trivial */);
-
-    if (!reduction_ops.empty()) {
-      scheduler_debug_utils::canScheduleRejectReason(
-          ScheduleHeuristic::Transpose, "no support for reduction ops");
-      return false;
-    }
-
-    if (hasNonUniqueBcast(fusion)) {
-      scheduler_debug_utils::canScheduleRejectReason(
-          ScheduleHeuristic::Transpose,
-          "Broadcasting dimension might be broadcasting to multiple sizes.");
-      return false;
-    }
-
-    return true;
-  }
-
-  static bool canScheduleRunTime(
-      Fusion* fusion,
-      SchedulerRuntimeInfo& runtime_info,
-      HeuristicSummary* data_cache = nullptr) {
-    return true;
-  }
-
-  void schedule(Fusion* fusion) override {
-    FUSER_PERF_SCOPE("Schedule Transpose Fusion");
-    scheduleTranspose(fusion, transposeParams());
-  }
-
- private:
-  void computeHeuristics(
-      Fusion* fusion,
-      SchedulerRuntimeInfo& runtime_info,
-      HeuristicSummary* data_cache = nullptr) {
-    params_ = getTransposeHeuristics(fusion, runtime_info, data_cache);
-    TORCH_INTERNAL_ASSERT(params_ != nullptr);
-  }
-};
-
 // Schedule Table
 const std::vector<ScheduleHeuristic>& all_heuristics() {
   static const std::vector<ScheduleHeuristic> hlist = {
+      ScheduleHeuristic::NoOp,
       ScheduleHeuristic::Reduction,
       ScheduleHeuristic::Transpose,
       ScheduleHeuristic::PointWise,
@@ -1316,6 +1541,9 @@ bool checkCanSchedule(
     if (!isConnectedFusionGraph(fusion)) {
       return false;
     }
+    if (IterDomainGraph(fusion, /*allow_self_mapping=*/true).hasSelfMapping()) {
+      return false;
+    }
     if (!SchedulerType::canScheduleCompileTime(fusion)) {
       return false;
     }
@@ -1333,6 +1561,8 @@ bool SchedulerEntry::canSchedule(
     SchedulerRuntimeInfo& runtime_info,
     HeuristicSummary* data_cache) {
   switch (sh) {
+    case ScheduleHeuristic::NoOp:
+      return checkCanSchedule<NoOpScheduler>(fusion, runtime_info, data_cache);
     case ScheduleHeuristic::PointWise:
       return checkCanSchedule<PointWiseScheduler>(
           fusion, runtime_info, data_cache);
@@ -1359,6 +1589,10 @@ std::unique_ptr<SchedulerEntry> SchedulerEntry::makeEntry(
     HeuristicSummary* data_cache) {
   std::unique_ptr<SchedulerEntry> scheduler_entry = nullptr;
   switch (sh) {
+    case ScheduleHeuristic::NoOp:
+      scheduler_entry =
+          std::make_unique<NoOpScheduler>(fusion, runtime_info, data_cache);
+      break;
     case ScheduleHeuristic::PointWise:
       scheduler_entry = std::make_unique<PointWiseScheduler>(
           fusion, runtime_info, data_cache);
@@ -1402,6 +1636,8 @@ size_t SchedulerEntryHash::operator()(const SchedulerEntry& se) const {
 
 std::string toString(ScheduleHeuristic sh) {
   switch (sh) {
+    case ScheduleHeuristic::NoOp:
+      return "no-op";
     case ScheduleHeuristic::PointWise:
       return "pointwise";
     case ScheduleHeuristic::Reduction:
@@ -1450,6 +1686,9 @@ HeuristicSummary::HeuristicSummary(
     : heuristic_(heuristic) {
   recording_ = true;
   switch (heuristic) {
+    case ScheduleHeuristic::NoOp:
+      NoOpScheduler::canScheduleRunTime(fusion, runtime_info, this);
+      break;
     case ScheduleHeuristic::PointWise:
       getPointwiseHeuristics(fusion, runtime_info, this);
       PointWiseScheduler::canScheduleRunTime(fusion, runtime_info, this);
@@ -1475,14 +1714,39 @@ HeuristicSummary::HeuristicSummary(
 
 void HeuristicSummary::validate() const {
   switch (heuristic_) {
+    case ScheduleHeuristic::NoOp: {
+      // TODO: need to cache the dynamically zero inputs?
+      break;
+    }
+    case ScheduleHeuristic::Transpose:
     case ScheduleHeuristic::PointWise: {
-      TORCH_INTERNAL_ASSERT(entry_type_map_.count(EntryType::DOMAIN_MAP));
+      if (heuristic_ == ScheduleHeuristic::PointWise) {
+        TORCH_INTERNAL_ASSERT(entry_type_map_.count(EntryType::DOMAIN_MAP));
+        TORCH_INTERNAL_ASSERT(
+            entry_type_map_.count(EntryType::REFERENCE_TENSORS));
+        TORCH_INTERNAL_ASSERT(
+            entry_type_map_.count(EntryType::VECTORIZABLE_INPUTS_AND_OUTPUTS));
+        TORCH_INTERNAL_ASSERT(
+            entry_type_map_.count(EntryType::BROADCAST_BYTE_MULTIPLES));
+        TORCH_INTERNAL_ASSERT(
+            entry_type_map_.count(EntryType::CAN_SCHEDULE_TRANSPOSE));
+        auto can_schedule_transpose =
+            entry_type_map_.at(EntryType::CAN_SCHEDULE_TRANSPOSE)
+                ->as<CompileTimeInfo<
+                    HeuristicCompileTime::CanScheduleTranspose>>()
+                ->get();
+        if (!*can_schedule_transpose) {
+          break;
+        }
+      }
       TORCH_INTERNAL_ASSERT(
-          entry_type_map_.count(EntryType::REFERENCE_TENSORS));
+          entry_type_map_.count(EntryType::TRANSPOSE_DOMAIN_MAP));
+      TORCH_INTERNAL_ASSERT(entry_type_map_.count(
+          EntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS));
       TORCH_INTERNAL_ASSERT(
-          entry_type_map_.count(EntryType::VECTORIZABLE_INPUTS_AND_OUTPUTS));
+          entry_type_map_.count(EntryType::REFERENCE_TENSORS_FOR_GROUPS));
       TORCH_INTERNAL_ASSERT(
-          entry_type_map_.count(EntryType::BROADCAST_BYTE_MULTIPLES));
+          entry_type_map_.count(EntryType::INNER_MOST_DIMS_INFO));
       break;
     }
     case ScheduleHeuristic::Reduction: {
@@ -1512,11 +1776,6 @@ void HeuristicSummary::validate() const {
           entry_type_map_.count(EntryType::SCOPE_PERSISTENT_FACTOR_INFO));
       break;
     }
-    case ScheduleHeuristic::Transpose: {
-      TORCH_INTERNAL_ASSERT(entry_type_map_.count(
-          EntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS));
-      break;
-    }
     default:
       TORCH_INTERNAL_ASSERT(false, "unknown heuristic");
   }
@@ -1553,7 +1812,10 @@ HeuristicSummaryEntry<EntryClass>::HeuristicSummaryEntry(
 
 // Template instantiation for pre-defined cache entries
 template class HeuristicSummaryEntry<HeuristicCompileTime::DomainMap>;
+template class HeuristicSummaryEntry<HeuristicCompileTime::TransposeDomainMap>;
 template class HeuristicSummaryEntry<HeuristicCompileTime::ReferenceTensors>;
+template class HeuristicSummaryEntry<
+    HeuristicCompileTime::ReferenceTensorsForGroups>;
 template class HeuristicSummaryEntry<
     HeuristicCompileTime::VectorizableInputsAndOutputs>;
 template class HeuristicSummaryEntry<
@@ -1566,6 +1828,9 @@ template class HeuristicSummaryEntry<
 template class HeuristicSummaryEntry<
     HeuristicCompileTime::ScopePersistentFactorInfo>;
 template class HeuristicSummaryEntry<HeuristicCompileTime::BroadcastMultiples>;
+template class HeuristicSummaryEntry<HeuristicCompileTime::InnerMostDimInfo>;
+template class HeuristicSummaryEntry<
+    HeuristicCompileTime::CanScheduleTranspose>;
 
 } // namespace cuda
 } // namespace fuser
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.h b/torch/csrc/jit/codegen/cuda/scheduler/registry.h
index 7ed8474935c01..8b34094476349 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/registry.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.h
@@ -27,6 +27,7 @@ class ExpressionEvaluator;
 //!    segmenter and schedulers.
 //!  It is important that input id encoding should be up to date with any change
 //!   of this class to avoid launching compiled kernels with illegal inputs.
+
 class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable {
  public:
   // Max vector size we will consider, in bytes,
@@ -112,6 +113,9 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable {
   // TODO: Support output tensor pointers
   std::unordered_map<Val*, size_t> input_ptrs_;
 
+  // Copy of aten input tensor strides (in bytes)
+  std::unordered_map<Val*, std::vector<size_t>> input_discontig_strides_;
+
   // Cache for getAlignmentSize
   std::unordered_map<TensorView*, size_t> alignment_map_;
   // Cache for getMaxVectorizableWidth
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index 1bdd1d34a0a9a..b7e85cbc1c5e7 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -1,7 +1,7 @@
 #include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>
 
 #include <torch/csrc/jit/codegen/cuda/executor_utils.h>
-#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
 #include <torch/csrc/jit/codegen/cuda/instrumentation.h>
 #include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
 #include <torch/csrc/jit/codegen/cuda/ir_utils.h>
@@ -24,8 +24,6 @@ namespace cuda {
 
 namespace {
 
-constexpr int64_t kMaxTileSize = 32;
-
 // DomainMap uses the ComputeAtMap to find a reference TensorView
 // that maps to all iterDomains in the fusion.
 class DomainMap : public pointwise_utils::DomainMap {
@@ -47,6 +45,20 @@ class DomainMap : public pointwise_utils::DomainMap {
     return result;
   }
 
+  IterDomain* getMappedRootDimIn(TensorView* tv, IterDomain* root_dim) const {
+    // Find the root id mapped to `root_dim`
+    const auto& root_dom = tv->getRootDomain();
+    IterDomain* mapped_id = nullptr;
+    for (auto i : c10::irange(root_dom.size())) {
+      if (ca_map_.idGraph().permissiveNodes().permissiveAreMapped(
+              root_dom[i], root_dim)) {
+        mapped_id = root_dom[i];
+        break;
+      }
+    }
+    return mapped_id;
+  }
+
   static bool hasAtLeastTwoValidGroups(Fusion* fusion) {
     FusionGuard fg(fusion);
     DomainMap domain_map(fusion);
@@ -54,19 +66,51 @@ class DomainMap : public pointwise_utils::DomainMap {
     if (grouped_inputs_outputs.size() < 2) {
       return false;
     }
-    return domain_map.findReferenceFor(grouped_inputs_outputs[0]) != nullptr &&
-        domain_map.findReferenceFor(grouped_inputs_outputs[1]) != nullptr;
+    auto ref1 = domain_map.findReferenceFor(grouped_inputs_outputs[0]);
+    auto ref2 = domain_map.findReferenceFor(grouped_inputs_outputs[1]);
+    if (ref1 == nullptr || ref2 == nullptr) {
+      return false;
+    }
+    // reference 1 is the global reference, so it must have dim mapped the
+    // innermost dim of both groups
+    auto innermost2 = scheduler_utils::innerMostRootDim(ref2);
+    return domain_map.getMappedRootDimIn(ref1, innermost2) != nullptr;
   }
 
-  int getPosMappedTo(TensorView* tv, IterDomain* id) const {
+  int getInnerLeafDim(TensorView* tv, IterDomain* root_dim) const {
+    auto mapped_id = getMappedRootDimIn(tv, root_dim);
+    TORCH_INTERNAL_ASSERT(
+        mapped_id != nullptr,
+        "Can not find ID mapped to ",
+        root_dim,
+        " in tensor ",
+        tv);
+    // Project the root id to leaf id
+    while (!mapped_id->uses().empty()) {
+      TORCH_INTERNAL_ASSERT(mapped_id->uses().size() == 1);
+      auto expr = mapped_id->uses()[0];
+      if (expr->isA<Split>()) {
+        mapped_id = expr->as<Split>()->inner();
+      } else {
+        auto merge = expr->as<Merge>();
+        TORCH_INTERNAL_ASSERT(
+            mapped_id == merge->inner(),
+            "Can not find ID mapped to ",
+            root_dim,
+            " in tensor ",
+            tv);
+        mapped_id = merge->out();
+      }
+    }
+    // Find the position of the leaf id
     const auto& dom = tv->domain()->domain();
     for (auto i : c10::irange(dom.size())) {
-      if (areExactMapped(id, tv->axis(i))) {
+      if (dom[i] == mapped_id) {
         return i;
       }
     }
     TORCH_INTERNAL_ASSERT(
-        false, "Can not find ID mapped to ", id, " in tensor ", tv);
+        false, "Can not find ID mapped to ", root_dim, " in tensor ", tv);
   }
 
   // Group inputs and outputs of a fusion by its inner most domain. For example
@@ -128,6 +172,12 @@ class DomainMap : public pointwise_utils::DomainMap {
         // Then we still want to T1 and T2 to be grouped together.
         auto group =
             scheduler_utils::getInputsOutputsWithInnerDim(tv, true, false);
+        if (group.empty()) {
+          // In case that the inner most dim of tv is not found (for example, tv
+          // is a fusion input with only reductions), we just return a null
+          // result which will tell the scheduler to reject the fusion
+          return {};
+        }
         for (auto member_tv : group) {
           if (grouped.count(member_tv) == 0) {
             grouped.emplace(member_tv);
@@ -178,12 +228,26 @@ class DomainMap : public pointwise_utils::DomainMap {
 //   T0[I0*I1o*I5*I6{1024*1024/4*8}, I1i*I2*I3*I4{32}]
 void maybeBuildVirtualInnerDims(
     TransposeParams& params,
+    int64_t device_multiprocessor_count,
+    int64_t n_elems,
     const std::vector<int64_t>& shape_in_ref1,
     int64_t inner_most1,
     int64_t inner_most2) {
   int64_t merged_size1 = shape_in_ref1[inner_most1];
   int64_t merged_size2 = shape_in_ref1[inner_most2];
 
+  int64_t actual_tile_size1 =
+      std::min<int64_t>(merged_size1, params.tile_size1);
+  int64_t actual_tile_size2 =
+      std::min<int64_t>(merged_size2, params.tile_size2);
+  int64_t wave_elements =
+      device_multiprocessor_count * actual_tile_size1 * actual_tile_size2;
+
+  if (wave_elements >= n_elems) {
+    // if one full wave can handle all elements, don't create virtual inner dims
+    return;
+  }
+
   // merge inner_most1 and inner_most2 left until we are done or we can no
   // longer do so
   int64_t dim = inner_most1 - 1;
@@ -240,22 +304,49 @@ void maybeBuildVirtualInnerDims(
   //    both virtual innermost dim.
   // 2. The satisfied one did not merge in anything. For example,
   //    T0[I0{1024*1024}, I1{2}]
+  //    If this is the case, this means that we need to split the large
+  //    inner-most dimension to satisfy the small innermost dimension
   int64_t large_dim;
   int64_t split_factor;
+  bool split_inner_most;
   if (merged_size1 < params.tile_size1) {
     if (params.dims_merged_with_2.empty()) {
+#if SUPPORT_SPLITTING_INNERMOST_DIM
+      // https://github.com/csarofeen/pytorch/issues/1964
       // case 2
+      split_inner_most = true;
+      large_dim = inner_most2;
+      split_factor = params.tile_size2;
+#else
+      // disabled due to indexing error
       return;
+#endif
+    } else {
+      // case 1
+      split_inner_most = false;
+      large_dim = params.dims_merged_with_2.back();
+      auto prev_merged_size2 = merged_size2 / shape_in_ref1[large_dim];
+      split_factor = ceilDiv(params.tile_size2, prev_merged_size2);
     }
-    large_dim = params.dims_merged_with_2.back();
-    split_factor = ceilDiv(params.tile_size1, merged_size1);
   } else {
     if (params.dims_merged_with_1.empty()) {
+#if SUPPORT_SPLITTING_INNERMOST_DIM
+      // https://github.com/csarofeen/pytorch/issues/1964
       // case 2
+      split_inner_most = true;
+      large_dim = inner_most1;
+      split_factor = params.tile_size1;
+#else
+      // disabled due to indexing error
       return;
+#endif
+    } else {
+      // case 1
+      split_inner_most = false;
+      large_dim = params.dims_merged_with_1.back();
+      auto prev_merged_size1 = merged_size1 / shape_in_ref1[large_dim];
+      split_factor = ceilDiv(params.tile_size1, prev_merged_size1);
     }
-    large_dim = params.dims_merged_with_1.back();
-    split_factor = ceilDiv(params.tile_size2, merged_size2);
   }
   params.split_before_tiling.push_back({large_dim, split_factor});
   // adjust all dims to after-split
@@ -271,61 +362,54 @@ void maybeBuildVirtualInnerDims(
   }
   // Give the split-out dim to the unsatisfied one, so that both are satisfied.
   if (merged_size1 < params.tile_size1) {
-    params.dims_merged_with_2.pop_back();
-    params.dims_merged_with_2.push_back(large_dim + 1);
+    if (!split_inner_most) {
+      params.dims_merged_with_2.pop_back();
+      params.dims_merged_with_2.push_back(large_dim + 1);
+    }
     params.dims_merged_with_1.push_back(large_dim);
   } else {
-    params.dims_merged_with_1.pop_back();
-    params.dims_merged_with_1.push_back(large_dim + 1);
+    if (!split_inner_most) {
+      params.dims_merged_with_1.pop_back();
+      params.dims_merged_with_1.push_back(large_dim + 1);
+    }
     params.dims_merged_with_2.push_back(large_dim);
   }
 }
 
-} // namespace
-
-bool hasAtLeastTwoValidGroups(Fusion* fusion) {
-  return DomainMap::hasAtLeastTwoValidGroups(fusion);
-}
-
-std::shared_ptr<TransposeParams> getTransposeHeuristics(
-    Fusion* fusion,
-    const at::ArrayRef<c10::IValue>& runtime_inputs,
-    HeuristicSummary* data_cache) {
-  SchedulerRuntimeInfo runtime_info(fusion, runtime_inputs, true);
-  return getTransposeHeuristics(fusion, runtime_info, data_cache);
-}
-
-std::shared_ptr<TransposeParams> getTransposeHeuristics(
-    Fusion* fusion,
-    SchedulerRuntimeInfo& runtime_info,
-    HeuristicSummary* data_cache) {
-  FUSER_PERF_SCOPE("getTransposeHeuristics");
-
-  FusionGuard fg(fusion);
-
-  // Incase any buffer is of type DataType::Index
-  DataType index_type = indexModeToDtype(runtime_info.getIndexMode());
-
+HeuristicSummaryEntry<HeuristicCompileTime::TransposeDomainMap> getDomainMap(
+    HeuristicSummary* data_cache,
+    Fusion* fusion) {
   auto domain_map_entry =
-      HeuristicSummaryEntry<HeuristicCompileTime::DomainMap>(
+      HeuristicSummaryEntry<HeuristicCompileTime::TransposeDomainMap>(
           data_cache,
           [fusion]() { return std::make_unique<DomainMap>(fusion); });
-  const auto& domain_map = dynamic_cast<DomainMap&>(domain_map_entry.get());
+  return domain_map_entry;
+}
 
+HeuristicSummaryEntry<HeuristicCompileTime::InputsOutputsInnerDimGroups>
+getInputsOutputsGroups(HeuristicSummary* data_cache, DomainMap& domain_map) {
   auto grouped_inputs_outputs_entry =
       HeuristicSummaryEntry<HeuristicCompileTime::InputsOutputsInnerDimGroups>(
           data_cache, [&domain_map]() {
             return std::make_unique<std::vector<std::vector<TensorView*>>>(
                 domain_map.groupInputsOutputsByInnerDim());
           });
-  auto grouped_inputs_outputs = grouped_inputs_outputs_entry.get();
+  auto& grouped_inputs_outputs = grouped_inputs_outputs_entry.get();
 
   TORCH_INTERNAL_ASSERT(
       grouped_inputs_outputs.size() >= 2,
       "Can not find mismatched inner most dim, should use pointwise scheduler.");
 
+  return grouped_inputs_outputs_entry;
+}
+
+HeuristicSummaryEntry<HeuristicCompileTime::ReferenceTensorsForGroups>
+getReferenceTensors(
+    HeuristicSummary* data_cache,
+    DomainMap& domain_map,
+    std::vector<std::vector<TensorView*>>& grouped_inputs_outputs) {
   auto reference_tensors_entry =
-      HeuristicSummaryEntry<HeuristicCompileTime::ReferenceTensors>(
+      HeuristicSummaryEntry<HeuristicCompileTime::ReferenceTensorsForGroups>(
           data_cache, [&domain_map, &grouped_inputs_outputs]() {
             std::vector<TensorView*> data{
                 domain_map.findReferenceFor(grouped_inputs_outputs[0]),
@@ -340,13 +424,17 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
       reference1 != nullptr, "Unable to find reference tensor for group 1");
   TORCH_INTERNAL_ASSERT(
       reference2 != nullptr, "Unable to find reference tensor for group 2");
+  return reference_tensors_entry;
+}
 
-  const int64_t device_multiprocessor_count =
-      (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
-
-  auto ref_root = reference1->getMaybeRFactorDomain();
-  std::vector<int64_t> shape_in_ref1;
-  shape_in_ref1.reserve(reference1->nDims());
+std::pair<std::vector<int64_t>, int64_t> getShapeInReference(
+    HeuristicSummary* data_cache,
+    SchedulerRuntimeInfo& runtime_info,
+    TensorView* reference,
+    DomainMap& domain_map) {
+  auto ref_root = reference->getMaybeRFactorDomain();
+  std::vector<int64_t> shape_in_ref;
+  shape_in_ref.reserve(reference->nDims());
   int64_t n_elems = 1;
   for (size_t ref_i = 0; ref_i < ref_root.size(); ref_i++) {
     auto id = ref_root[ref_i];
@@ -360,36 +448,175 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
         ref_root[ref_i]->extent()->toInlineString());
     int64_t size = inferred_val->as<int64_t>();
     n_elems *= size;
-    shape_in_ref1.push_back(size);
+    shape_in_ref.push_back(size);
   }
+  return {shape_in_ref, n_elems};
+}
 
-  auto params = std::make_shared<TransposeParams>("Transpose heuristics");
+HeuristicSummaryEntry<HeuristicCompileTime::InnerMostDimInfo>
+getInnerMostDimInfoInReference(
+    HeuristicSummary* data_cache,
+    const std::vector<TensorView*>& group_references,
+    TensorView* global_reference,
+    DomainMap& domain_map) {
+  auto innermost_info_entry =
+      HeuristicSummaryEntry<HeuristicCompileTime::InnerMostDimInfo>(
+          data_cache, [&]() {
+            std::vector<int64_t> data;
+            data.reserve(group_references.size());
+            for (auto ref_tv : group_references) {
+              auto inner_most_id = scheduler_utils::innerMostRootDim(ref_tv);
+              auto inner_most_pos_in_global_ref =
+                  domain_map.getInnerLeafDim(global_reference, inner_most_id);
+              data.emplace_back(inner_most_pos_in_global_ref);
+            }
+            return std::make_unique<std::vector<int64_t>>(std::move(data));
+          });
+  return innermost_info_entry;
+}
 
-  // If the problem size is small use small tile sizes.
-  if (n_elems < device_multiprocessor_count * kMaxTileSize * kMaxTileSize) {
-    params->tile_size1 = 8;
-    params->tile_size2 = 8;
-    // TODO: I was trying the following but I got silent wrong result
-    // params->tile_size1 = 8;
-    // params->tile_size2 = 4;
-    // This should not happen, because the correctness should be irrevalent to
-    // schedulers. We don't have to use tile size (8, 4), but we need to fix our
-    // bug in codegen.
+} // namespace
+
+std::string getTransposeRuntimeRejectReason(
+    Fusion* fusion,
+    HeuristicSummary* data_cache,
+    SchedulerRuntimeInfo& runtime_info) {
+  auto domain_map_entry = getDomainMap(data_cache, fusion);
+  auto& domain_map = dynamic_cast<DomainMap&>(domain_map_entry.get());
+  auto grouped_inputs_outputs_entry =
+      getInputsOutputsGroups(data_cache, domain_map);
+  auto grouped_inputs_outputs = grouped_inputs_outputs_entry.get();
+  auto reference_tensors_entry =
+      getReferenceTensors(data_cache, domain_map, grouped_inputs_outputs);
+  auto reference_tensors = reference_tensors_entry.get();
+  TensorView* reference1 = reference_tensors[0];
+
+  auto pair =
+      getShapeInReference(data_cache, runtime_info, reference1, domain_map);
+  auto& shape_in_ref1 = pair.first;
+  auto& n_elems = pair.second;
+
+  auto innermost_info_entry = getInnerMostDimInfoInReference(
+      data_cache, reference_tensors, reference1, domain_map);
+  auto innermost_info = innermost_info_entry.get();
+
+  constexpr size_t default_tile_elements =
+      TransposeParams::getDefaultTileSize() *
+      TransposeParams::getDefaultTileSize();
+
+  // don't schedule with transpose scheduler if less than a full wave
+  const int64_t device_multiprocessor_count =
+      (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  auto elements_per_wave = device_multiprocessor_count * default_tile_elements;
+  if (elements_per_wave > n_elems) {
+    return "Transpose scheduler does not perform well on small problem sizes.";
   }
 
-  // Expand inner-most dims to virtual inner-most dims so that the inner-most
-  // dims has at least tile_size elements
-  auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1);
-  auto inner_most_id2 = scheduler_utils::innerMostRootDim(reference2);
+  auto inner_most_pos1_in_ref1 = innermost_info[0];
+  auto inner_most_pos2_in_ref1 = innermost_info[1];
+
+  auto inner_size1 = shape_in_ref1[inner_most_pos1_in_ref1];
+  auto inner_size2 = shape_in_ref1[inner_most_pos2_in_ref1];
+
+  // For cases like
+  //   transpose(T0[1000000000, 2, 2], 1, 2)
+  // the pointwise scheduler should provide better performance, because it
+  // provides coalesced memory access
+  if (inner_size1 * inner_size2 < default_tile_elements) {
+    auto inner_elements = inner_size1 * inner_size2;
+    for (int64_t i = inner_most_pos2_in_ref1 + 1; i < inner_most_pos1_in_ref1;
+         i++) {
+      inner_elements *= shape_in_ref1[i];
+    }
+    // note that the algorithm here is only an approximation because it only
+    // checks reference1. In principle, we need to check all inputs and outputs
+    // to get an accurate result, but that is too much work. I think checking
+    // only reference 1 is fine for now. Below is an example where the
+    // approximation here will not work:
+    //   T0[10000000, 2, 3] (reference 1)
+    //   T1[2, 10000000, 3] input/output
+    //   T2[2, 10000000, 3] input/output
+    //   T3[2, 10000000, 3] input/output
+    //   T4[3, 10000000, 2] input/output
+    //   T5[3, 10000000, 2] input/output
+    if (inner_elements < default_tile_elements) {
+      return "Inner transpose of small dimensions should be scheduled by the "
+             "pointwise scheduler because it provides better memory coalescing";
+    }
+  }
+
+#if !SUPPORT_SPLITTING_INNERMOST_DIM
+  if (n_elems / inner_size1 < TransposeParams::getDefaultTileSize() ||
+      n_elems / inner_size2 < TransposeParams::getDefaultTileSize()) {
+    return "Splitting of inner most dim for the creation of virtual inner most dim "
+           "is disabled due to indexing bug, skipping this case at runtime for now"
+           "See: https://github.com/csarofeen/pytorch/issues/1964";
+  }
+#endif
+
+  return "";
+}
+
+bool hasAtLeastTwoValidGroups(Fusion* fusion) {
+  return DomainMap::hasAtLeastTwoValidGroups(fusion);
+}
+
+std::shared_ptr<TransposeParams> getTransposeHeuristics(
+    Fusion* fusion,
+    const at::ArrayRef<c10::IValue>& runtime_inputs,
+    HeuristicSummary* data_cache) {
+  SchedulerRuntimeInfo runtime_info(fusion, runtime_inputs, true);
+  return getTransposeHeuristics(fusion, runtime_info, data_cache);
+}
+
+std::shared_ptr<TransposeParams> getTransposeHeuristics(
+    Fusion* fusion,
+    SchedulerRuntimeInfo& runtime_info,
+    HeuristicSummary* data_cache) {
+  FUSER_PERF_SCOPE("getTransposeHeuristics");
+
+  FusionGuard fg(fusion);
+
+  // Incase any buffer is of type DataType::Index
+  DataType index_type = indexModeToDtype(runtime_info.getIndexMode());
+
+  auto domain_map_entry = getDomainMap(data_cache, fusion);
+  auto& domain_map = dynamic_cast<DomainMap&>(domain_map_entry.get());
+  auto grouped_inputs_outputs_entry =
+      getInputsOutputsGroups(data_cache, domain_map);
+  auto grouped_inputs_outputs = grouped_inputs_outputs_entry.get();
+  auto reference_tensors_entry =
+      getReferenceTensors(data_cache, domain_map, grouped_inputs_outputs);
+  auto reference_tensors = reference_tensors_entry.get();
+  TensorView* reference1 = reference_tensors[0];
+  TensorView* reference2 = reference_tensors[1];
+  auto pair =
+      getShapeInReference(data_cache, runtime_info, reference1, domain_map);
+  auto& shape_in_ref1 = pair.first;
+  auto& n_elems = pair.second;
+
+  const int64_t device_multiprocessor_count =
+      (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
 
-  auto inner_most_pos1_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id1);
-  auto inner_most_pos2_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id2);
+  auto innermost_info_entry = getInnerMostDimInfoInReference(
+      data_cache, reference_tensors, reference1, domain_map);
+  auto innermost_info = innermost_info_entry.get();
 
+  auto inner_most_pos1_in_ref1 = innermost_info[0];
+  auto inner_most_pos2_in_ref1 = innermost_info[1];
+
+  auto params = std::make_shared<TransposeParams>("Transpose heuristics");
+
+  // Expand inner-most dims to virtual inner-most dims so that the inner-most
+  // dims has at least tile_size elements
   // See note [Supporting small transpose dimensions]
   maybeBuildVirtualInnerDims(
-      *params, shape_in_ref1, inner_most_pos1_in_ref1, inner_most_pos2_in_ref1);
+      *params,
+      device_multiprocessor_count,
+      n_elems,
+      shape_in_ref1,
+      inner_most_pos1_in_ref1,
+      inner_most_pos2_in_ref1);
 
   // Note [vectorization and unroll of input and output]
   //
@@ -482,13 +709,20 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
     std::cerr << "\n===== Transpose Stats ========\n"
               << "inputs: " << ir_utils::toString(fusion->inputs()) << "\n"
               << "outputs: " << ir_utils::toString(fusion->outputs()) << "\n"
+              << "shape: " << shape_in_ref1 << "\n"
               << "num_elems: " << n_elems << "\n"
               << "n_input_tensors: " << n_input_tensors << "\n"
               << "max_input_dtype_size: " << max_input_dtype_size << "\n"
               << "group 1: " << ir_utils::toString(grouped_inputs_outputs[0])
               << "\n"
+              << "reference1: " << reference1 << "\n"
+              << "inner_most_id1 position: " << inner_most_pos1_in_ref1
+              << " (in reference 1)\n"
               << "group 2: " << ir_utils::toString(grouped_inputs_outputs[1])
-              << std::endl;
+              << "\n"
+              << "reference2: " << reference2 << "\n"
+              << "inner_most_id2 position: " << inner_most_pos2_in_ref1
+              << " (in reference 1)" << std::endl;
     if (!params->split_before_tiling.empty() ||
         !params->dims_merged_with_1.empty() ||
         !params->dims_merged_with_2.empty()) {
@@ -565,17 +799,19 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) {
   auto grouped_inputs_outputs = domain_map.groupInputsOutputsByInnerDim();
   TORCH_INTERNAL_ASSERT(grouped_inputs_outputs.size() >= 2);
 
-  // We need something similar to `cacheFork` for input tensors in group 2. We
-  // need this because we will want to propagate to the entire DAG except group
-  // 2 and its cached inputs, so we need to make sure the DAG is still connected
-  // if we remove group and its cached inputs. For example
-  //    t0
-  //    |
-  //   cache
-  //   |  |
-  //  t1  t2
-  // if groups = {{t1, t2}, {t0}}, then removing {t0, cache} from the DAG will
-  // make it disconnected.
+  /*
+   * We need something similar to `cacheFork` for input tensors in group 2. We
+   * need this because we will want to propagate to the entire DAG except group
+   * 2 and its cached inputs, so we need to make sure the DAG is still connected
+   * if we remove group and its cached inputs. For example
+   *    t0
+   *    |
+   *   cache
+   *   /  \
+   *  t1  t2
+   * if groups = {{t1, t2}, {t0}}, then removing {t0, cache} from the DAG will
+   * make it disconnected.
+   */
   std::unordered_set<TensorView*> group2_and_cached_inputs(
       grouped_inputs_outputs[1].begin(), grouped_inputs_outputs[1].end());
   for (auto tv : grouped_inputs_outputs[1]) {
@@ -643,9 +879,9 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) {
 
   // merge with inner most dims to get virtual inner most dims
   size_t inner_most_pos1_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id1);
+      domain_map.getInnerLeafDim(reference1, inner_most_id1);
   size_t inner_most_pos2_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id2);
+      domain_map.getInnerLeafDim(reference1, inner_most_id2);
   if (merged1.has_value()) {
     if (inner_most_pos1_in_ref1 < *merged1) {
       reference1->reorder(
@@ -895,9 +1131,7 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) {
   }
 
   // Inline
-  InlinePropagator inline_propagator(
-      reference1, -1, ComputeAtMode::MostInlined);
-  entire_dag.traverse(&inline_propagator);
+  inlineMost();
 }
 
 } // namespace cuda
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h
index 0cf6920ea058b..c1a4ab6efb6ae 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h
@@ -5,6 +5,8 @@
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
 #include <torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h>
 
+#define SUPPORT_SPLITTING_INNERMOST_DIM 0
+
 namespace torch {
 namespace jit {
 namespace fuser {
@@ -100,6 +102,13 @@ TORCH_CUDA_CU_API LaunchParams scheduleTranspose(
 //! groups, each with a fully broadcasted reference tensor.
 TORCH_CUDA_CU_API bool hasAtLeastTwoValidGroups(Fusion* fusion);
 
+// If can schedule at runtime, returns empty string, otherwise returns the
+// reason why we should not schedule at runtime.
+TORCH_CUDA_CU_API std::string getTransposeRuntimeRejectReason(
+    Fusion* fusion,
+    HeuristicSummary* data_cache,
+    SchedulerRuntimeInfo& runtime_info);
+
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h
index d672b6dc965bd..5e56278a7f16b 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h
@@ -21,6 +21,10 @@ class TransposeParams : public HeuristicParams {
     return 128;
   }
 
+  static constexpr size_t getDefaultTileSize() {
+    return 32;
+  }
+
   // See note [Supporting small transpose dimensions], all dims are positions in
   // reference1
   std::vector<std::pair<size_t, size_t>> split_before_tiling = {};
@@ -37,10 +41,10 @@ class TransposeParams : public HeuristicParams {
   // https://github.com/csarofeen/pytorch/pull/1854#discussion_r928143729
 
   // Tile size for the inner most dim of tensors in the first group
-  size_t tile_size1 = 32;
+  size_t tile_size1 = getDefaultTileSize();
 
   // Tile size for the inner most dim of tensors in the second group
-  size_t tile_size2 = 32;
+  size_t tile_size2 = getDefaultTileSize();
 
   using HeuristicParams::HeuristicParams;
 
@@ -65,8 +69,7 @@ class TransposeParams : public HeuristicParams {
     std::stringstream ss;
     ss << "\n===== Transpose Parameters ========\n"
        << (tag == "" ? "" : "Tag: ") << tag << " Transpose Characteristics:\n"
-       << " Gridx: " << lparams.gdimx() << " BlckX: " << lparams.bdimx()
-       << "\n";
+       << " BlckX: " << lparams.bdimx() << "\n";
     ss << " input tile size: " << tile_size1 << "\n";
     ss << " output tile size: " << tile_size2 << "\n";
     int elements_per_tile = tile_size1 * tile_size2;
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
index 6c0c8087270e9..d985da926354b 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
@@ -11,6 +11,8 @@
 #include <torch/csrc/jit/codegen/cuda/scheduler/mma_utils.h>
 #include <torch/csrc/jit/codegen/cuda/transform_replay.h>
 
+#include <algorithm>
+
 namespace torch {
 namespace jit {
 namespace fuser {
@@ -298,21 +300,6 @@ void parallelizeAllLike(
   }
 }
 
-void computeAtInputs(TensorView* consumer, int pos, ComputeAtMode mode) {
-  for (auto inp_tv : ir_utils::inputTvsOf(consumer)) {
-    inp_tv->computeAt(consumer, pos, mode);
-  }
-}
-
-void computeWithOutputs(TensorView* producer, int pos, ComputeAtMode mode) {
-  for (auto out_tv : ir_utils::outputTvsOf(producer)) {
-    if (out_tv == producer) {
-      continue;
-    }
-    producer->computeWith(out_tv, pos, mode);
-  }
-}
-
 namespace {
 
 // Find the resolution points of the persistent buffers in the provided
@@ -1409,7 +1396,109 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(
   return vectorizable_tensors;
 }
 
-std::vector<BroadcastMultiple> getBroadcastMultiples(
+namespace {
+// Holder return struct for the below function.
+struct DisjointViewSetInfo {
+  // const* to the disjoint set in disjoint_view_set passed in to
+  // getDisjointViewSetsOf each iterdomain in the rfactor of ref is mapped to.
+  //
+  // WARNING: these pointers are relative to the disjoint_view_set reference
+  // passed into getDisjointViewSetsOf it's the user's responsibillity to
+  // maintain the lifetime of that reference to match this vector.
+  std::vector<const VectorOfUniqueEntries<IterDomain*>*> disjoint_sets_of_ref;
+
+  // Unique ID associated to the disjoint view group the rfactor id belongs to
+  // in disjoint_sets_of_ref. It's straight forward to map from
+  // disjoint_sets_of_ref to the vector, but not the other way around.
+  std::vector<int> disjoint_set_ids;
+
+  // TensorView reference the above vectors are relative to.
+  TensorView* ref;
+};
+
+// Returns disjoint view sets mapped onto the given reference. Returns a pair
+// of vectors of size rfactorDomain of reference. Vector of
+// VectorOfUniqueEntries returns a const* to the disjoint set in
+// disjoint_view_set the iterdomain is mapped to. Integer vector represents
+// which disjoint view group the rfactor id belongs to. It's straight forward
+// to map from the former to the latter, but not the latter to former.
+//
+// Since we return a const* to entries in disjoint_view_set, it must be passed
+// in as a reference. Algorithm is N^2 based on number of dims in reference,
+// but generating the disjoint view set is likely the limiter on perf of this
+// function.
+DisjointViewSetInfo getDisjointViewSetsOf(
+    Fusion* fusion,
+    TensorView* of,
+    DisjointSets<IterDomain*>& disjoint_view_set) {
+  auto rfactor_dom = of->getMaybeRFactorDomain();
+  if (rfactor_dom.size() == 0) {
+    return {};
+  }
+
+  // Start naming id's based on 0 so the inner most dimension will always be
+  // 0, then as groups are discovered marching to the left their id will
+  // increase. i.e. we could have something like [0, 3, 1, 2, 1, 0] as a
+  // result.
+  std::vector<int> disjoint_group_ids(rfactor_dom.size(), -1);
+  std::vector<const VectorOfUniqueEntries<IterDomain*>*> disjoint_set_of_id(
+      rfactor_dom.size(), nullptr);
+  int current_group_id = 0;
+  int ref_dim_i = rfactor_dom.size() - 1;
+
+  while (ref_dim_i >= 0) {
+    if (disjoint_group_ids[ref_dim_i] != -1) {
+      // Already put in a group, continue
+      ref_dim_i--;
+      continue;
+    }
+
+    const auto& ref_group =
+        disjoint_view_set.getDisjointSetOf(rfactor_dom[ref_dim_i]);
+
+    int other_dim_i = ref_dim_i;
+    while (other_dim_i >= 0) {
+      const auto& other_group =
+          disjoint_view_set.getDisjointSetOf(rfactor_dom[other_dim_i]);
+      if (&ref_group == &other_group) {
+        disjoint_group_ids[other_dim_i] = current_group_id;
+        disjoint_set_of_id[other_dim_i] = &ref_group;
+      }
+      other_dim_i--;
+    }
+
+    ref_dim_i--;
+    current_group_id++;
+  }
+
+  TORCH_INTERNAL_ASSERT(
+      std::none_of(
+          disjoint_group_ids.begin(),
+          disjoint_group_ids.end(),
+          [](int i) { return i == -1; }),
+      "Failed to generate the view disjoint groups of the reference ",
+      of->toString());
+
+  TORCH_INTERNAL_ASSERT(
+      std::none_of(
+          disjoint_set_of_id.begin(),
+          disjoint_set_of_id.end(),
+          [](const VectorOfUniqueEntries<IterDomain*>* ptr) {
+            return ptr == nullptr;
+          }),
+      "Failed to generate the view disjoint groups of the reference ",
+      of->toString());
+
+  DisjointViewSetInfo info;
+  info.disjoint_sets_of_ref = disjoint_set_of_id;
+  info.disjoint_set_ids = disjoint_group_ids;
+  info.ref = of;
+
+  return info;
+}
+} // namespace
+
+BroadcastMultipleInformation getBroadcastMultiples(
     TensorView* reference_tv,
     DataType index_type) {
   auto fusion = reference_tv->fusion();
@@ -1418,6 +1507,13 @@ std::vector<BroadcastMultiple> getBroadcastMultiples(
   std::vector<BroadcastMultiple> multiples(
       reference_tv->getMaybeRFactorDomain().size());
 
+  auto disjoint_view_sets = disjointViewSets(fusion);
+  auto disjoint_set_information = scheduler_utils::getDisjointViewSetsOf(
+      fusion, reference_tv, disjoint_view_sets);
+
+  auto ref_disjoint_sets = disjoint_set_information.disjoint_sets_of_ref;
+  auto ref_disjoint_set_ids = disjoint_set_information.disjoint_set_ids;
+
   // All input or output tensor views
   std::vector<TensorView*> in_out_tvs;
   {
@@ -1427,8 +1523,8 @@ std::vector<BroadcastMultiple> getBroadcastMultiples(
     in_out_tvs.insert(in_out_tvs.end(), out_tvs.begin(), out_tvs.end());
   }
 
-  // Shouldn't matter if we use EXACT or PERMISSIVE mapping mode for compute at
-  // map as we're just looking at the root mappings.
+  // Shouldn't matter if we use EXACT or PERMISSIVE mapping mode for compute
+  // at map as we're just looking at the root mappings.
   auto ca_map = ComputeAtMap(fusion);
 
   auto ref_root_domain = reference_tv->getMaybeRFactorDomain();
@@ -1448,35 +1544,60 @@ std::vector<BroadcastMultiple> getBroadcastMultiples(
       if (ref_id->isBroadcast() || ref_id->isReduction()) {
         continue;
       }
-      auto map_it = std::find_if(
-          in_out_tv_domain_list.begin(),
-          in_out_tv_domain_list.end(),
-          [&ref_id, &ca_map](IterDomain* in_out_tv_id) {
-            return ca_map.areMapped(in_out_tv_id, ref_id, IdMappingMode::EXACT);
-          });
 
-      if (map_it == in_out_tv_domain_list.end()) {
+      bool ref_id_has_view_transforms = std::count(
+                                            ref_disjoint_set_ids.begin(),
+                                            ref_disjoint_set_ids.end(),
+                                            ref_disjoint_set_ids[ref_i]) > 1;
+
+      // Could have multiple mappings if there's view transforms
+      std::vector<IterDomain*> mapped_ids;
+      if (!ref_id_has_view_transforms) {
+        auto mapped_it = std::find_if(
+            in_out_tv_domain_list.begin(),
+            in_out_tv_domain_list.end(),
+            [&ref_id, &ca_map](IterDomain* in_out_tv_id) {
+              return ca_map.areMapped(
+                  in_out_tv_id, ref_id, IdMappingMode::EXACT);
+            });
+        if (mapped_it != in_out_tv_domain_list.end()) {
+          mapped_ids.push_back(*mapped_it);
+        }
+      } else {
+        for (auto in_out_id : in_out_tv_domain) {
+          if (ref_disjoint_sets[ref_i]->has(in_out_id)) {
+            mapped_ids.push_back(in_out_id);
+          }
+        }
+      }
+
+      // Nothing maps to reference, no contribution to multiples for this dim
+      if (mapped_ids.empty()) {
         continue;
       }
 
-      // If input/output id is broadcast or reduction
-      if ((*map_it)->isBroadcast() || (*map_it)->isReduction()) {
+      if (std::all_of(mapped_ids.begin(), mapped_ids.end(), [](IterDomain* id) {
+            return id->isReduction() || id->isBroadcast();
+          })) {
         continue;
       }
 
+      // If any iteration domain in the input or output that's mapped through
+      // the view disjoint set is not a reduction or broadcast, assume it's a
+      // full dimension for the sake of the pointwise scheduler.
       mapped_axes[ref_i] = true;
-      in_out_tv_domain_list.erase(map_it);
     }
 
     // For each break point position if there an lhs or rhs multiple based on
-    // this tensor add it to the global multiplier
+    // this tensor add it to the global multiplier. The only time we consider
+    // we can benefit from broadcast is if the entire left or right side the
+    // break point is all broadcasts.
     {
       bool rhs = false;
       bool lhs = false;
       auto dtype_size =
           dataTypeSize(in_out_tv->getDataType().value(), index_type);
-      for (size_t mapped_axes_i = 0; mapped_axes_i < mapped_axes.size();
-           mapped_axes_i++) {
+      for (auto mapped_axes_i : c10::irange(mapped_axes.size())) {
         auto lhs_i = mapped_axes_i;
         auto rhs_i = mapped_axes.size() - 1 - mapped_axes_i;
 
@@ -1493,91 +1614,10 @@ std::vector<BroadcastMultiple> getBroadcastMultiples(
       }
     }
   }
-
-  return multiples;
-}
-
-size_t collectMaxVectorizeSizeWithContigMerge(
-    TensorView* tv,
-    IterDomain* leaf_merged_domain,
-    size_t max_vector_size_in_byte,
-    ExpressionEvaluator& expression_evaluator,
-    DataType index_type) {
-  // Maybe too conservative, but only handles fully contiguous tensors
-  // TODO: Relax the contiguity constraint to be similar to that in index
-  // computing. Just looking for all merged root domains in the right order, all
-  // merged root dimensions are contiguous, all merged root dimensions are next
-  // to eachother (exlcuding broadcast).
-  if (std::any_of(
-          tv->domain()->contiguity().begin(),
-          tv->domain()->contiguity().end(),
-          [](const auto contig) { return !contig; })) {
-    return 1;
-  }
-
-  auto dtype_size = dataTypeSize(tv->dtype(), index_type);
-  const size_t max_vector_size = max_vector_size_in_byte / dtype_size;
-
-  // Assume no halo-related expression appears in the fusion. No
-  // broadcast is merged, so indexability can be assumed to be true.
-  ContigIDs contigIds(
-      {leaf_merged_domain},
-      tv->getMaybeRFactorDomain(),
-      tv->domain()->contiguity(),
-      {},
-      {},
-      true,
-      true);
-
-  auto innermost_root_id = tv->getMaybeRFactorDomain().back();
-  auto indexed_id = contigIds.rootToIndexedID().at(innermost_root_id);
-
-  size_t merged_size = 1;
-  // If the indexed ID is a contig merged domain, i.e., it is
-  // different from innermost_root_id, we accumulate the extents of
-  // all the root domains covered by the contig indexed ID. Otherwise,
-  // just look at the extent of the innermost root ID.
-  if (indexed_id != innermost_root_id) {
-    const auto& within_root = contigIds.withinContigIDs().at(indexed_id);
-    for (auto root_id : tv->getMaybeRFactorDomain()) {
-      if (within_root.find(root_id) == within_root.end()) {
-        continue;
-      }
-      auto maybe_dimension_size =
-          expression_evaluator.evaluate(root_id->extent());
-      TORCH_INTERNAL_ASSERT(
-          maybe_dimension_size.has_value(),
-          "Unknown extent of tv: ",
-          tv->toString(),
-          ", id: ",
-          root_id->toString());
-      merged_size *= maybe_dimension_size->as<int64_t>();
-    }
-  } else {
-    auto maybe_dimension_size =
-        expression_evaluator.evaluate(innermost_root_id->extent());
-    TORCH_INTERNAL_ASSERT(
-        maybe_dimension_size.has_value(),
-        "Unknown extent of tv: ",
-        tv->toString(),
-        ", id: ",
-        innermost_root_id->toString());
-    merged_size = maybe_dimension_size->as<int64_t>();
-  }
-
-  size_t vector_size = 1;
-  size_t next_vector_size = vector_size * 2;
-
-  // Try until vector size exceeds the max allowed size
-  while (next_vector_size <= max_vector_size) {
-    if (merged_size % next_vector_size != 0) {
-      break;
-    }
-    vector_size = next_vector_size;
-    next_vector_size *= 2;
-  }
-
-  return vector_size;
+  BroadcastMultipleInformation bcast_info;
+  bcast_info.view_disjoint_set_ids = ref_disjoint_set_ids;
+  bcast_info.broadcast_multiples = multiples;
+  return bcast_info;
 }
 
 namespace matmul_utils {
@@ -1811,7 +1851,7 @@ c10::optional<IterDomain*> getMaybeRootIfInnermostTiled(
 
 } // namespace
 
-TORCH_CUDA_CU_API void orderTiledConcreteIdAsRoot(TensorView* tv) {
+void orderTiledConcreteIdAsRoot(TensorView* tv) {
   auto ndims = tv->nDims();
 
   // Keep track of the left most position where we will
@@ -1911,9 +1951,7 @@ TORCH_CUDA_CU_API void orderTiledConcreteIdAsRoot(TensorView* tv) {
 } // namespace matmul_utils
 
 //! Propagate current transformations on from_tv to all graphs
-TORCH_CUDA_CU_API void transformPropagateToAllFrom(
-    TensorView* from_tv,
-    int pos) {
+void transformPropagateToAllFrom(TensorView* from_tv, int pos) {
   TransformPropagator propagator(from_tv, pos);
   MaxRootDomainInfoSpanningTree(from_tv, nullptr).traverse(&propagator);
 }
@@ -2139,181 +2177,218 @@ void BoundedDirectionalTransformPropagator::bothWays(
   propagate(from, pos, included_tvs, *options);
 }
 
-// Grab all values and expressions used to make the merged_domain and remove
-// them from the fusion
-void cleanUpInnermostMergedDomains(
-    const std::vector<IterDomain*>& root_domain,
-    IterDomain* merged_domain) {
-  TORCH_INTERNAL_ASSERT(merged_domain != nullptr);
-  TORCH_INTERNAL_ASSERT(!root_domain.empty());
-
-  std::unordered_set<Val*> root_set({root_domain.begin(), root_domain.end()});
+DisjointSets<IterDomain*> disjointViewSets(Fusion* fusion) {
+  // Start from the exact iter domain graph of the fusion
+  IterDomainGraph id_graph(fusion);
+  auto disjoint_view_ids = id_graph.exactNodes();
 
-  auto vals = DependencyCheck::getAllValsBetween(root_set, {merged_domain});
-
-  for (auto it = vals.rbegin(); it != vals.rend(); ++it) {
-    TORCH_INTERNAL_ASSERT((*it)->isA<IterDomain>());
-    auto id = (*it)->as<IterDomain>();
-    if (root_set.find(id) != root_set.end()) {
-      continue;
+  // If iter domains are involved in any transformation from root domains to
+  // rfactor domains they should be considered "contaminated".
+  for (auto tv : ir_utils::allTvs(fusion)) {
+    for (auto expr : StmtSort::getExprs(
+             fusion,
+             {tv->getMaybeRFactorDomain().begin(),
+              tv->getMaybeRFactorDomain().end()})) {
+      if (expr->isA<Merge>()) {
+        auto merge = expr->as<Merge>();
+        disjoint_view_ids.mapEntries(merge->inner(), merge->out());
+        disjoint_view_ids.mapEntries(merge->outer(), merge->out());
+      } else if (expr->isA<Split>()) {
+        auto split = expr->as<Split>();
+        disjoint_view_ids.mapEntries(split->in(), split->inner());
+        disjoint_view_ids.mapEntries(split->in(), split->outer());
+      } else {
+        TORCH_INTERNAL_ASSERT(
+            false, "Expression type: ", expr->toString(), " not supported.");
+      }
     }
-    Fusion* fusion = id->container()->as<Fusion>();
-    auto id_def = id->definition();
-    TORCH_INTERNAL_ASSERT(
-        id_def->isA<Merge>(),
-        "Invalid ID: ",
-        id->toString(),
-        ". Expected definition of a Merge expression: ",
-        (id_def != nullptr ? id_def->toString() : "nullptr"));
-    fusion->removeExpr(id_def);
-    fusion->removeVal(id);
   }
+  return disjoint_view_ids;
 }
 
-// Merge innermost domains for finding the widest vectorizable
-// size. Return the merged domain or nullptr if no merge is done.
-IterDomain* mergeInnermostDomains(
-    const std::vector<IterDomain*>& domain,
-    int num_merged_domains) {
-  const auto ndims = domain.size();
-  IterDomain* merged_id = nullptr;
-  bool is_merge_done = false;
-  for (const auto i : c10::irange(num_merged_domains)) {
-    auto id = domain.at(ndims - 1 - i);
-    // broadcast and trivial reductions are ignored
-    if (id->isBroadcast() || id->isTrivialReduction()) {
-      continue;
-    }
-    if (merged_id == nullptr) {
-      merged_id = id;
-    } else {
-      auto id_inner = merged_id;
-      auto id_outer = id;
-      merged_id = IterDomain::merge(id_outer, id_inner);
-      is_merge_done = true;
-    }
-  }
-  return is_merge_done ? merged_id : nullptr;
-}
+bool allMatchingViews(Fusion* fusion) {
+  // Start from the exact iter domain graph of the fusion
+  IterDomainGraph id_graph(fusion);
+  auto exact_disjoint_set = id_graph.exactNodes();
 
-//! Attempt to expand vectorized domains to contig merged domains. Break point
-//! identifies the point in which you can't propagate contiguous merges. For
-//! example in pointwise this is the point where we want to split the
-//! parallelization to take advantage of broadcast, and for reduction schedulers
-//! it's the point where we switch from a reduction domain to an iter domain (or
-//! vice versa).
-size_t expandVectorizationToContigMergedDomains(
-    Fusion* fusion,
-    SchedulerRuntimeInfo& runtime_info,
-    const std::vector<TensorView*> vectorizable_inputs_outputs,
-    TensorView* reference_tv,
-    int break_point,
-    size_t default_word_size) {
-  size_t max_expand_size = SchedulerRuntimeInfo::max_alignment_size_in_byte;
-  size_t common_alignment_size =
-      SchedulerRuntimeInfo::max_alignment_size_in_byte;
+  auto view_exprs = ir_utils::getViewOps(fusion);
+  if (view_exprs.empty()) {
+    return true;
+  }
 
-  for (auto inp_out : vectorizable_inputs_outputs) {
-    auto dtype_size = dataTypeSize(
-        inp_out->dtype(), indexModeToDtype(runtime_info.getIndexMode()));
+  std::vector<TensorView*> all_view_outs;
 
-    max_expand_size = std::min(
-        max_expand_size,
-        SchedulerRuntimeInfo::max_alignment_size_in_byte / dtype_size);
-    max_expand_size = std::min(
-        max_expand_size, runtime_info.getMaxVectorizableWidth(inp_out));
-    common_alignment_size =
-        std::min(common_alignment_size, runtime_info.getAlignmentSize(inp_out));
+  for (auto view_expr : view_exprs) {
+    auto outs = ir_utils::filterByType<TensorView>(view_expr->outputs());
+    all_view_outs.insert(all_view_outs.end(), outs.begin(), outs.end());
   }
 
-  // If there's no possibility to increase vector size of provided tensors, then
-  // don't bother doing a more complex analysis to try and do so, just return
-  // early.
-  if (max_expand_size == default_word_size) {
-    return default_word_size;
-  }
+  TORCH_INTERNAL_ASSERT(
+      all_view_outs.size() > 0,
+      "Found view operations but can't find any output tensor views.");
 
-  auto ca_map = ComputeAtMap(fusion);
+  auto first_out_tv = *all_view_outs.begin();
+  auto first_root_dom =
+      TensorDomain::noReductions(first_out_tv->getRootDomain());
+  auto first_rfactor_dom =
+      TensorDomain::noReductions(first_out_tv->getRFactorDomain());
 
-  // Merge the domains right of the break point
-  const auto& ref_root = reference_tv->getMaybeRFactorDomain();
-  const int num_merged_domains =
-      static_cast<int>(ref_root.size()) - static_cast<int>(break_point);
+  for (auto other_out_tv : all_view_outs) {
+    if (other_out_tv == first_out_tv) {
+      continue;
+    }
 
-  // No expansion with no merged domain
-  if (num_merged_domains == 0) {
-    return default_word_size;
-  }
+    auto other_root_dom =
+        TensorDomain::noReductions(other_out_tv->getRootDomain());
+    auto other_rfactor_dom =
+        TensorDomain::noReductions(other_out_tv->getRFactorDomain());
 
-  // Merge the domains but don't modify TensorDomain
-  auto merged_domain = mergeInnermostDomains(ref_root, num_merged_domains);
+    if (first_root_dom.size() != other_root_dom.size() ||
+        first_rfactor_dom.size() != other_rfactor_dom.size()) {
+      return false;
+    }
+    {
+      std::vector<std::pair<IterDomain*, IterDomain*>> zipped_ids;
+
+      std::transform(
+          first_root_dom.begin(),
+          first_root_dom.end(),
+          other_root_dom.begin(),
+          std::back_inserter(zipped_ids),
+          [](IterDomain* first, IterDomain* second) {
+            return std::make_pair(first, second);
+          });
 
-  // No expansion is done if no merge is done.
-  if (merged_domain == nullptr) {
-    return default_word_size;
-  }
+      if (std::any_of(
+              zipped_ids.begin(),
+              zipped_ids.end(),
+              [&exact_disjoint_set](
+                  std::pair<IterDomain*, IterDomain*> id_pair) {
+                return !exact_disjoint_set.strictAreMapped(
+                    id_pair.first, id_pair.second);
+              })) {
+        return false;
+      }
+    }
+    {
+      std::vector<std::pair<IterDomain*, IterDomain*>> zipped_ids;
+
+      std::transform(
+          first_rfactor_dom.begin(),
+          first_rfactor_dom.end(),
+          other_rfactor_dom.begin(),
+          std::back_inserter(zipped_ids),
+          [](IterDomain* first, IterDomain* second) {
+            return std::make_pair(first, second);
+          });
 
-  // Find the vectorizable word size with the merged domains
-  size_t word_size = scheduler_utils::collectMaxVectorizeSizeWithContigMerge(
-      reference_tv,
-      merged_domain,
-      common_alignment_size,
-      runtime_info.expressionEvaluator(),
-      indexModeToDtype(runtime_info.getIndexMode()));
+      if (std::any_of(
+              zipped_ids.begin(),
+              zipped_ids.end(),
+              [&exact_disjoint_set](
+                  std::pair<IterDomain*, IterDomain*> id_pair) {
+                return !exact_disjoint_set.strictAreMapped(
+                    id_pair.first, id_pair.second);
+              })) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
 
-  cleanUpInnermostMergedDomains(ref_root, merged_domain);
+bool breakIsDisjoint(std::vector<int> group_ids, int pos) {
+  if (pos < 0) {
+    pos += group_ids.size();
+  }
+  TORCH_INTERNAL_ASSERT(
+      pos >= 0 && pos <= group_ids.size(),
+      "Invalid position, size of vec is ",
+      group_ids.size(),
+      " but position is ",
+      pos);
 
-  // Stop if the reference doesn't get a larger word size.
-  if (word_size <= default_word_size) {
-    return default_word_size;
+  if (pos == 0 || pos == group_ids.size()) {
+    return true;
   }
 
-  // Check the other TVs and take the minimum of the valid word sizes
-  for (const auto tv : vectorizable_inputs_outputs) {
-    if (tv == reference_tv) {
-      continue;
-    }
+  std::unordered_set<int> left_ints(group_ids.begin(), group_ids.begin() + pos);
 
-    const auto& tv_root = tv->getMaybeRFactorDomain();
+  for (auto i = pos; i < group_ids.size(); i++) {
+    if (left_ints.count(group_ids[i]) > 0) {
+      return false;
+    }
+  }
+  return true;
+}
 
-    int tv_num_merged_domains = 0;
-    for (const auto i : c10::irange(num_merged_domains)) {
-      if (i == tv_root.size()) {
-        break;
+std::unordered_map<int, int> domainReorderAsRfactorMap(TensorView* tv) {
+  FusionGuard fg(tv->fusion());
+  auto transform_exprs = StmtSort::getExprs(
+      tv->fusion(),
+      {tv->domain()->domain().begin(), tv->domain()->domain().end()});
+  // simply update this vector of id's as progressing through the transformation
+  // expressions. We'll always insert the result of split in the location of the
+  // input, and insert the merge result in the position of the inner dimension.
+
+  auto reordered_ids = tv->getMaybeRFactorDomain();
+  for (const auto* expr : transform_exprs) {
+    if (const Split* split = dynamic_cast<const Split*>(expr)) {
+      auto find_it =
+          std::find(reordered_ids.begin(), reordered_ids.end(), split->in());
+      if (find_it == reordered_ids.end()) {
+        // Transformations before rfactor, ignore those.
+        continue;
       }
-      auto ref_id = ref_root.at(ref_root.size() - 1 - i);
-      IterDomain* tv_id = tv_root.at(tv_root.size() - 1 - i);
-      // If not mapped, stop expanding.
-      if (!ca_map.areMapped(ref_id, tv_id, IdMappingMode::EXACT)) {
-        break;
-      } else {
-        ++tv_num_merged_domains;
+      auto pos = std::distance(reordered_ids.begin(), find_it);
+      reordered_ids[pos] = split->inner();
+      reordered_ids.insert(reordered_ids.begin() + pos, split->outer());
+    } else if (const Merge* merge = dynamic_cast<const Merge*>(expr)) {
+      auto find_it_0 =
+          std::find(reordered_ids.begin(), reordered_ids.end(), merge->outer());
+      auto find_it_1 =
+          std::find(reordered_ids.begin(), reordered_ids.end(), merge->inner());
+      if (find_it_0 == reordered_ids.end() &&
+          find_it_1 == reordered_ids.end()) {
+        // Transformations before rfactor, ignore those.
+        continue;
       }
-    }
-
-    size_t tv_word_size = 1;
-    if (tv_num_merged_domains > 1) {
-      auto tv_merged_domain =
-          mergeInnermostDomains(tv_root, tv_num_merged_domains);
-      if (tv_merged_domain == nullptr) {
-        tv_word_size = runtime_info.getInnerDimVectorizableWidth(tv);
-      } else {
-        tv_word_size = scheduler_utils::collectMaxVectorizeSizeWithContigMerge(
-            tv,
-            tv_merged_domain,
-            common_alignment_size,
-            runtime_info.expressionEvaluator(),
-            indexModeToDtype(runtime_info.getIndexMode()));
-        cleanUpInnermostMergedDomains(tv_root, tv_merged_domain);
+      TORCH_INTERNAL_ASSERT(
+          find_it_0 != reordered_ids.end() && find_it_1 != reordered_ids.end(),
+          "Error in transformations of ",
+          tv->toString(),
+          "\nTransformations before rfactor should not mix with transformations after rfactor.");
+      auto pos0 = std::distance(reordered_ids.begin(), find_it_0);
+      auto pos1 = std::distance(reordered_ids.begin(), find_it_1);
+      if (pos0 > pos1) {
+        std::swap(pos0, pos1);
       }
-    } else {
-      tv_word_size = runtime_info.getInnerDimVectorizableWidth(tv);
-    }
+      // Should be impossible.
+      TORCH_INTERNAL_ASSERT(
+          pos0 != pos1,
+          "Didn't expect merge inputs to be the same iteratrion domain:\n",
+          merge->toString());
 
-    word_size = std::min(word_size, tv_word_size);
+      reordered_ids.erase(reordered_ids.begin() + pos0);
+      pos1--;
+      reordered_ids[pos1] = merge->out();
+    }
   }
 
-  return word_size;
+  std::unordered_map<int, int> old2new;
+  for (auto id_i : c10::irange(tv->domain()->domain().size())) {
+    auto leaf_id = tv->axis(id_i);
+    auto find_it =
+        std::find(reordered_ids.begin(), reordered_ids.end(), leaf_id);
+    TORCH_INTERNAL_ASSERT(
+        find_it != reordered_ids.end(),
+        "Reordering map creation failed, uninitialized iterdomain,",
+        " likely something is wrong with the transformations between the rfactor domain and the leaves.");
+    int new_pos = (int)std::distance(reordered_ids.begin(), find_it);
+    int old_pos = (int)id_i;
+    old2new[old_pos] = new_pos;
+  }
+  return old2new;
 }
 
 } // namespace scheduler_utils
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/utils.h b/torch/csrc/jit/codegen/cuda/scheduler/utils.h
index 0eb08fb03ba15..373a879f740d5 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/utils.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/utils.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
 #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
 #include <torch/csrc/jit/codegen/cuda/maxinfo_propagator.h>
@@ -115,16 +116,6 @@ TORCH_CUDA_CU_API inline void parallelizeAllLike(
       propagate_padding);
 }
 
-TORCH_CUDA_CU_API void computeAtInputs(
-    TensorView* consumer,
-    int pos,
-    ComputeAtMode mode = ComputeAtMode::Standard);
-
-TORCH_CUDA_CU_API void computeWithOutputs(
-    TensorView* producer,
-    int pos,
-    ComputeAtMode mode = ComputeAtMode::Standard);
-
 struct PersistentBufferInfo {
   std::vector<TensorView*> persistent_buffers;
   std::unordered_set<IterDomain*> unmappable_dims;
@@ -312,14 +303,26 @@ struct BroadcastMultiple {
   int64_t lhs_multiple = 0;
 };
 
-// Returns a vector of counts, size = reference_tv->getRootDomain().size(), each
-// entry [i] is the number of inputs/outputs that have a non-broadcast dimension
-// mapped to the corresponding dimension in reference_tv. Count includes
-// reference_tv if reference_tv is an input or output. Count is multiplied by
-// data type size.
-std::vector<BroadcastMultiple> getBroadcastMultiples(
-    TensorView* reference_tv,
-    DataType index_type);
+struct BroadcastMultipleInformation {
+  std::vector<int> view_disjoint_set_ids;
+  std::vector<BroadcastMultiple> broadcast_multiples;
+};
+
+// Returns a vector of size reference_tv->getMaybeRFactorDomain().size() which
+// is a view disjoint set id of each of those iter domains. If entries share the
+// same value, they undergo view transformations in the fusion together.
+// Broadcast multiples are also of size
+// reference_tv->getMaybeRFactorDomain().size(), each entry [i] is the number of
+// inputs/outputs that have a non-broadcast dimension mapped to the
+// corresponding dimension in reference_tv. Broadcast multiples includes
+// reference_tv if reference_tv is an input or output. Broadcast multiples is
+// multiplied by data type size. In the case of view operations the broadcast
+// multiple is the full multiple size if any domain in the group maps to a
+// non-broadcast dimension in the given input/output. Otherwise if all
+// dimensions are broadcast that input/output will not contribute to the
+// multiple.
+TORCH_CUDA_CU_API BroadcastMultipleInformation
+getBroadcastMultiples(TensorView* reference_tv, DataType index_type);
 
 //! Collect maximum vectorization word size of a tensor whose
 //! innermost domain is leaf_merged_domain. Contig merging is taken
@@ -492,6 +495,47 @@ struct TORCH_CUDA_CU_API BoundedDirectionalTransformPropagator {
       Options options);
 };
 
+// Schedulers typically start by merging some axes together then splitting,
+// and propagating those transformations through the dag. What we want to
+// understand is if these merges can be supported through view operations.
+// For example it could be problematic to support a reduction fusion:
+//
+// tv0[2, 3, 4]
+// tv1 = sum(tv0, {1, 2})
+// tv2 = view(tv0, {6, 4})
+//
+// Since the first step of the reduction scheduler would be tv1->merge(1, 2).
+// If we tried to propagate this transformation through the view it would make
+// the view invalid. If we tried to propagate the view through the reduction,
+// it would attempt to merge a reduction and non-reduction dimension. So for
+// these types of fusions we would like to understand that the view considers
+// axis 1 and 2 of tv1 as "non-separable" axes.
+//
+// If IterDomains are disjoint in the returned set, then they are considered
+// "separable".
+// Warning: This pass generates the IdGraphs, not intended for use at runtime.
+TORCH_CUDA_CU_API DisjointSets<IterDomain*> disjointViewSets(Fusion* fusion);
+
+// Return if all trasnformations in all views match.
+// TODO: Should this be moved to registry.cpp/.h?
+// Warning: This pass generates the IdGraphs, not intended for use at runtime.
+TORCH_CUDA_CU_API bool allMatchingViews(Fusion* fusion);
+
+// Makes sure that there are no group id's left of pos that match right of pos.
+// e.g.
+// [1, 0, 0] pos 2 would return false
+// [1, 0, 0] pos 1 would return true
+TORCH_CUDA_CU_API bool breakIsDisjoint(std::vector<int> group_ids, int pos);
+
+// Generates an old to new map to reorder tv's domain as the rfactor order.
+// Priority is given to inner most dimensions for example:
+// rfactor [i0, i1, i2]
+// domain [i0*i2, i1]
+// will produce the map {{0, 1}, {1, 0}}
+// This is somewhat similar to orderTiledConcreteIdAsRoot
+TORCH_CUDA_CU_API std::unordered_map<int, int> domainReorderAsRfactorMap(
+    TensorView* tv);
+
 } // namespace scheduler_utils
 } // namespace cuda
 } // namespace fuser
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp b/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp
new file mode 100644
index 0000000000000..2c3c848c7f5c9
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp
@@ -0,0 +1,286 @@
+#include <torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h>
+
+#include <torch/csrc/jit/codegen/cuda/compute_at_map.h>
+#include <torch/csrc/jit/codegen/cuda/contiguity.h>
+#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
+#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <torch/csrc/jit/codegen/cuda/lower_divisible_split.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
+
+#include <c10/util/irange.h>
+
+#include <unordered_set>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace cuda {
+namespace vectorize_helper {
+
+// Grab all values and expressions used to make the merged_domain and remove
+// them from the fusion
+void cleanUpInnermostMergedDomains(
+    const std::vector<IterDomain*>& root_domain,
+    IterDomain* merged_domain) {
+  TORCH_INTERNAL_ASSERT(merged_domain != nullptr);
+  TORCH_INTERNAL_ASSERT(!root_domain.empty());
+
+  std::unordered_set<Val*> root_set({root_domain.begin(), root_domain.end()});
+
+  auto vals = DependencyCheck::getAllValsBetween(root_set, {merged_domain});
+
+  for (auto it = vals.rbegin(); it != vals.rend(); ++it) {
+    TORCH_INTERNAL_ASSERT((*it)->isA<IterDomain>());
+    auto id = (*it)->as<IterDomain>();
+    if (root_set.find(id) != root_set.end()) {
+      continue;
+    }
+    Fusion* fusion = id->container()->as<Fusion>();
+    auto id_def = id->definition();
+    TORCH_INTERNAL_ASSERT(
+        id_def->isA<Merge>(),
+        "Invalid ID: ",
+        id->toString(),
+        ". Expected definition of a Merge expression: ",
+        (id_def != nullptr ? id_def->toString() : "nullptr"));
+    fusion->removeExpr(id_def);
+    fusion->removeVal(id);
+  }
+}
+
+// Merge innermost domains for finding the widest vectorizable
+// size. Return the merged domain or nullptr if no merge is done.
+IterDomain* mergeInnermostDomains(
+    const std::vector<IterDomain*>& domain,
+    int num_merged_domains) {
+  const auto ndims = domain.size();
+  IterDomain* merged_id = nullptr;
+  bool is_merge_done = false;
+  for (const auto i : c10::irange(num_merged_domains)) {
+    auto id = domain.at(ndims - 1 - i);
+    // broadcast and trivial reductions are ignored
+    if (id->isBroadcast() || id->isTrivialReduction()) {
+      continue;
+    }
+    if (merged_id == nullptr) {
+      merged_id = id;
+    } else {
+      auto id_inner = merged_id;
+      auto id_outer = id;
+      merged_id = IterDomain::merge(id_outer, id_inner);
+      is_merge_done = true;
+    }
+  }
+  return is_merge_done ? merged_id : nullptr;
+}
+
+size_t collectMaxVectorizeSizeWithContigMerge(
+    TensorView* tv,
+    IterDomain* leaf_merged_domain,
+    size_t max_vector_size_in_byte,
+    ExpressionEvaluator& expression_evaluator,
+    DataType index_type) {
+  auto dtype_size = dataTypeSize(tv->dtype(), index_type);
+  const size_t max_vector_size = max_vector_size_in_byte / dtype_size;
+
+  // Assume no halo-related expression appears in the fusion. No
+  // broadcast is merged, so indexability can be assumed to be true.
+  // This is expensive, as ContigIDs builds other things like CAMap,
+  // HaloInfo, and ConcreteBroadcast info. We should explicitly build and reuse
+  // these as they're compile time information.
+  ContigIDs contigIds(
+      {leaf_merged_domain},
+      tv->getMaybeRFactorDomain(),
+      tv->domain()->contiguity(),
+      {},
+      {},
+      getAllDivisibleSplits(tv->fusion()),
+      {},
+      true);
+
+  auto innermost_root_id = tv->getMaybeRFactorDomain().back();
+  auto indexed_id = contigIds.rootToIndexedID().at(innermost_root_id);
+
+  size_t merged_size = 1;
+  // If the indexed ID is a contig merged domain, i.e., it is
+  // different from innermost_root_id, we accumulate the extents of
+  // all the root domains covered by the contig indexed ID. Otherwise,
+  // just look at the extent of the innermost root ID.
+  if (indexed_id != innermost_root_id) {
+    const auto& within_root = contigIds.withinContigIDs().at(indexed_id);
+    for (auto root_id : tv->getMaybeRFactorDomain()) {
+      if (within_root.find(root_id) == within_root.end()) {
+        continue;
+      }
+      auto maybe_dimension_size =
+          expression_evaluator.evaluate(root_id->extent());
+      TORCH_INTERNAL_ASSERT(
+          maybe_dimension_size.has_value(),
+          "Unknown extent of tv: ",
+          tv->toString(),
+          ", id: ",
+          root_id->toString());
+      merged_size *= maybe_dimension_size->as<int64_t>();
+    }
+  } else {
+    auto maybe_dimension_size =
+        expression_evaluator.evaluate(innermost_root_id->extent());
+    TORCH_INTERNAL_ASSERT(
+        maybe_dimension_size.has_value(),
+        "Unknown extent of tv: ",
+        tv->toString(),
+        ", id: ",
+        innermost_root_id->toString());
+    merged_size = maybe_dimension_size->as<int64_t>();
+  }
+
+  size_t vector_size = 1;
+  size_t next_vector_size = vector_size * 2;
+
+  // Try until vector size exceeds the max allowed size
+  while (next_vector_size <= max_vector_size) {
+    if (merged_size % next_vector_size != 0) {
+      break;
+    }
+    vector_size = next_vector_size;
+    next_vector_size *= 2;
+  }
+
+  return vector_size;
+}
+
+//! Attempt to expand vectorized domains to contig merged domains. Break point
+//! identifies the point in which you can't propagate contiguous merges. For
+//! example in pointwise this is the point where we want to split the
+//! parallelization to take advantage of broadcast, and for reduction
+//! schedulers it's the point where we switch from a reduction domain to an
+//! iter domain (or vice versa).
+size_t expandVectorizationToContigMergedDomains(
+    Fusion* fusion,
+    SchedulerRuntimeInfo& runtime_info,
+    const std::vector<TensorView*> vectorizable_inputs_outputs,
+    TensorView* reference_tv,
+    int break_point,
+    size_t default_word_size) {
+  size_t max_expand_size = SchedulerRuntimeInfo::max_alignment_size_in_byte;
+  size_t common_alignment_size =
+      SchedulerRuntimeInfo::max_alignment_size_in_byte;
+
+  for (auto inp_out : vectorizable_inputs_outputs) {
+    auto dtype_size = dataTypeSize(
+        inp_out->dtype(), indexModeToDtype(runtime_info.getIndexMode()));
+
+    max_expand_size = std::min(
+        max_expand_size,
+        SchedulerRuntimeInfo::max_alignment_size_in_byte / dtype_size);
+    max_expand_size = std::min(
+        max_expand_size, runtime_info.getMaxVectorizableWidth(inp_out));
+    common_alignment_size =
+        std::min(common_alignment_size, runtime_info.getAlignmentSize(inp_out));
+  }
+
+  // If there's no possibility to increase vector size of provided tensors,
+  // then don't bother doing a more complex analysis to try and do so, just
+  // return early.
+  if (max_expand_size == default_word_size) {
+    return default_word_size;
+  }
+
+  auto ca_map = ComputeAtMap(fusion);
+
+  // Merge the domains right of the break point
+  const auto& ref_root = reference_tv->getMaybeRFactorDomain();
+  const int max_num_merged_domains =
+      static_cast<int>(ref_root.size()) - static_cast<int>(break_point);
+  int64_t num_merged_domains = 0;
+  while (num_merged_domains < max_num_merged_domains) {
+    auto pos = (int64_t)ref_root.size() - 1 - num_merged_domains;
+    if (!reference_tv->domain()->contiguity()[pos]) {
+      break;
+    }
+    num_merged_domains++;
+  }
+
+  // No expansion with no merged domain
+  if (num_merged_domains == 0) {
+    return default_word_size;
+  }
+
+  // Merge the domains but don't modify TensorDomain
+  auto merged_domain = mergeInnermostDomains(ref_root, num_merged_domains);
+
+  // No expansion is done if no merge is done.
+  if (merged_domain == nullptr) {
+    return default_word_size;
+  }
+
+  // Find the vectorizable word size with the merged domains
+  size_t word_size = collectMaxVectorizeSizeWithContigMerge(
+      reference_tv,
+      merged_domain,
+      common_alignment_size,
+      runtime_info.expressionEvaluator(),
+      indexModeToDtype(runtime_info.getIndexMode()));
+
+  cleanUpInnermostMergedDomains(ref_root, merged_domain);
+
+  // Stop if the reference doesn't get a larger word size.
+  if (word_size <= default_word_size) {
+    return default_word_size;
+  }
+
+  // Check the other TVs and take the minimum of the valid word sizes
+  for (const auto tv : vectorizable_inputs_outputs) {
+    if (tv == reference_tv) {
+      continue;
+    }
+
+    const auto& tv_root = tv->getMaybeRFactorDomain();
+
+    int tv_num_merged_domains = 0;
+    for (const auto i : c10::irange(max_num_merged_domains)) {
+      if (i == tv_root.size()) {
+        break;
+      }
+      auto ref_id = ref_root.at(ref_root.size() - 1 - i);
+      auto pos = tv_root.size() - 1 - i;
+      IterDomain* tv_id = tv_root.at(pos);
+      // If not mapped, stop expanding.
+      if (!ca_map.areMapped(ref_id, tv_id, IdMappingMode::EXACT) ||
+          !tv->domain()->contiguity()[pos]) {
+        break;
+      } else {
+        ++tv_num_merged_domains;
+      }
+    }
+
+    size_t tv_word_size = 1;
+    if (tv_num_merged_domains > 1) {
+      auto tv_merged_domain =
+          mergeInnermostDomains(tv_root, tv_num_merged_domains);
+      if (tv_merged_domain == nullptr) {
+        tv_word_size = runtime_info.getInnerDimVectorizableWidth(tv);
+      } else {
+        tv_word_size = collectMaxVectorizeSizeWithContigMerge(
+            tv,
+            tv_merged_domain,
+            common_alignment_size,
+            runtime_info.expressionEvaluator(),
+            indexModeToDtype(runtime_info.getIndexMode()));
+        cleanUpInnermostMergedDomains(tv_root, tv_merged_domain);
+      }
+    } else {
+      tv_word_size = runtime_info.getInnerDimVectorizableWidth(tv);
+    }
+
+    word_size = std::min(word_size, tv_word_size);
+  }
+
+  return word_size;
+}
+
+} // namespace vectorize_helper
+} // namespace cuda
+} // namespace fuser
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h b/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h
index 0a67d00618e23..a9b959b495d60 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h
@@ -2,21 +2,15 @@
 
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
 #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
 #include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+
+#include <vector>
 
 namespace torch {
 namespace jit {
 namespace fuser {
 namespace cuda {
-
-// TODO: Put implementations in a vectorize_helper.cpp
-namespace scheduler_utils {
-
-// Moved the definition of these to
-// torch/csrc/jit/codegen/cuda/scheduler/utils.cpp as making new CPP files is
-// painful for multiple reasons.
+namespace vectorize_helper {
 
 // Grab all values and expressions used to make the merged_domain and remove
 // them from the fusion
@@ -44,7 +38,7 @@ size_t expandVectorizationToContigMergedDomains(
     int break_point,
     size_t default_word_size);
 
-} // namespace scheduler_utils
+} // namespace vectorize_helper
 } // namespace cuda
 } // namespace fuser
 } // namespace jit
diff --git a/torch/csrc/jit/codegen/cuda/tensor_view.cpp b/torch/csrc/jit/codegen/cuda/tensor_view.cpp
index ba95d8fabdce9..633c98102e2e0 100644
--- a/torch/csrc/jit/codegen/cuda/tensor_view.cpp
+++ b/torch/csrc/jit/codegen/cuda/tensor_view.cpp
@@ -3,6 +3,7 @@
 #include <torch/csrc/jit/codegen/cuda/compute_at.h>
 #include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
 #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
 #include <torch/csrc/jit/codegen/cuda/ir_builder.h>
 #include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
@@ -290,40 +291,115 @@ IterDomain* TensorView::axis(int pos) const {
   return domain()->axis(pos);
 }
 
-void TensorView::setComputeAt(unsigned int pos, bool decrease) {
+void TensorView::inlineAt(
+    int64_t pos,
+    bool best_effort,
+    MaxPosCalculator* calc) {
   TORCH_INTERNAL_ASSERT(
       !container()->isA<kir::Kernel>(),
       "Function invalid for kernel container.");
-  if (pos <= compute_at_pos_ && !decrease) {
-    return;
+
+  std::unique_ptr<MaxPosCalculator> calc_owner;
+  if (calc == nullptr) {
+    calc_owner = std::make_unique<MaxPosCalculator>();
+    calc = calc_owner.get();
+  }
+
+  if (pos < 0) {
+    pos += int64_t(nDims()) + 1;
   }
 
   TORCH_INTERNAL_ASSERT(
-      (unsigned)pos <= nDims(),
-      "Invalid this computeAt position for T",
+      pos >= 0 && pos <= nDims(),
+      "Invalid inline position for T",
       name(),
       ": ",
       pos);
 
-  compute_at_pos_ = pos;
-}
+  auto max_inline_pos = calc->getMaxPosAll(this, best_effort);
 
-void TensorView::setMaxProducer(unsigned int pos, bool decrease) {
-  TORCH_INTERNAL_ASSERT(
-      !container()->isA<kir::Kernel>(),
-      "Function invalid for kernel container.");
-  if (pos <= max_producer_pos_ && !decrease) {
-    return;
+  if (best_effort) {
+    pos = std::min<int64_t>(max_inline_pos, pos);
+  }
+
+  // hoist inner most broadcast
+  while (pos > 0 && axis(pos - 1)->isBroadcast()) {
+    pos--;
   }
 
   TORCH_INTERNAL_ASSERT(
-      (unsigned)pos <= nDims(),
-      "Invalid max producer position for T",
+      pos <= max_inline_pos,
+      "Invalid inline position for T",
       name(),
       ": ",
-      pos);
+      pos,
+      ". Maximum allowed value:",
+      max_inline_pos);
+
+  if (isFusionInput()) {
+    return;
+  }
+
+  if (pos > compute_at_pos_) {
+    compute_at_pos_ = pos;
+    for (auto consumer : ir_utils::consumerTvsOf(this)) {
+      consumer->updateMaxProducerPosition();
+    }
+  }
+}
+
+namespace {
+
+// Try to find the aligned position on consumer's domain corresponding to the
+//  compute at position of producer domain. No checking on actual
+//  producer-consumer relationship.
+unsigned int getConsumerPosAlignedToProducerCA(
+    TensorView* consumer,
+    TensorView* producer) {
+  // Locate consumer's position that aligns with
+  //  the producer's new compute at axis. We need broadcast axes forwarded so we
+  //  need to replay PasC as CasP will not forward braodcast dims. For example
+  //  if we have:
+  // T2[ iS22{( 3 * 1 )} ] ca_pos( 1 ) = broadcast( T1[ iS1{3} ] ca_pos( 1 )
+  // produce_pos( 1) ) CasP will have the mapping iS1{3} -> iS2{3} and PasC will
+  // have the mapping iS22{( 3 * 1 )} <- iS1{3} We need the latter. Refer to
+  // NVFuserTest.FusionComplexBCast1_CUDA
+
+  auto disjoint_sets =
+      BestEffortReplay::replayPasC(
+          producer, consumer, -1, PairwiseRootDomainMap(producer, consumer))
+          .getDisjointSets();
+
+  // Find the innermost position of consumer that has
+  //  been mapped within the producer ca axis.
+  unsigned int consumer_pos = consumer->nDims();
+  while (consumer_pos > 0) {
+    auto consumer_id = consumer->axis((int)consumer_pos - 1);
+    auto p_dom = producer->domain()->domain();
+    if (std::any_of(
+            p_dom.begin(),
+            p_dom.begin() + producer->getComputeAtPosition(),
+            [&consumer_id, &disjoint_sets](IterDomain* p_id) {
+              return disjoint_sets.permissiveAreMapped(consumer_id, p_id);
+            })) {
+      break;
+    }
+    consumer_pos--;
+  }
+
+  return consumer_pos;
+}
+
+} // namespace
 
-  max_producer_pos_ = pos;
+void TensorView::updateMaxProducerPosition() {
+  TORCH_INTERNAL_ASSERT(
+      !container()->isA<kir::Kernel>(),
+      "Function invalid for kernel container.");
+  for (auto producer : ir_utils::producerTvsOf(this)) {
+    max_producer_pos_ = std::max(
+        max_producer_pos_, getConsumerPosAlignedToProducerCA(this, producer));
+  }
 }
 
 TensorView* TensorView::computeAt(
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
deleted file mode 100644
index db38bbfd3a92a..0000000000000
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
+++ /dev/null
@@ -1,25813 +0,0 @@
-#if defined(USE_CUDA)
-#include <gmock/gmock-matchers.h>
-#include <gtest/gtest.h>
-
-#include <torch/csrc/jit/codegen/cuda/arith.h>
-#include <torch/csrc/jit/codegen/cuda/codegen.h>
-#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
-#include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
-#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
-#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
-#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
-#include <torch/csrc/jit/codegen/cuda/interface.h>
-#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
-#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
-#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
-#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
-#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
-#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
-#include <torch/csrc/jit/codegen/cuda/lower2device.h>
-#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
-#include <torch/csrc/jit/codegen/cuda/mutator.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
-#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
-#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
-
-#include <test/cpp/jit/test_utils.h>
-#include <torch/csrc/jit/api/function_impl.h>
-#include <torch/csrc/jit/codegen/cuda/parser.h>
-#include <torch/csrc/jit/ir/irparser.h>
-#include <torch/torch.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAStream.h>
-
-#include <algorithm>
-#include <iostream>
-#include <sstream>
-#include <thread>
-
-// Tests go in torch::jit
-namespace torch {
-namespace jit {
-
-using namespace torch::jit::fuser::cuda;
-using namespace at::indexing;
-
-namespace {
-
-TensorView* loweredTv(TensorView* tv, GpuLower& gpulw) {
-  auto used_tvs = ir_utils::allTvs(gpulw.kernel()->as<Fusion>());
-  TensorView* matching_tv = nullptr;
-  for (auto lowered_tv : used_tvs) {
-    if (lowered_tv->name() == tv->name()) {
-      matching_tv = lowered_tv;
-    }
-  }
-  TORCH_INTERNAL_ASSERT(matching_tv != nullptr);
-  return matching_tv;
-}
-
-class PredicatedChecker : public kir::IrVisitor {
- public:
-  // Checks if the provided tv is written to within a non-trivial conditional
-  static bool isPredicated(TensorView* tv, GpuLower& gpulw) {
-    PredicatedChecker checker(
-        loweredTv(tv, gpulw), gpulw.kernel()->topLevelExprs());
-    return checker.is_predicated_;
-  }
-
- private:
-  PredicatedChecker() = delete;
-
-  PredicatedChecker(TensorView* tv, std::vector<Expr*> exprs) : tv_(tv) {
-    kir::IrVisitor::handle(exprs);
-  }
-
-  using kir::IrVisitor::handle;
-  bool is_predicated_ = false;
-  bool predicated_ite_ = false;
-  TensorView* tv_ = nullptr;
-
-  void handle(kir::IfThenElse* ite) final {
-    auto prev_ite = predicated_ite_;
-    predicated_ite_ = !ite->predicate()->value()->isConstScalar();
-    kir::IrVisitor::handle(ite);
-    predicated_ite_ = prev_ite;
-  }
-
-  void handle(Expr* expr) final {
-    if (expr->outputs().size() && expr->outputs()[0]->isA<kir::TensorIndex>()) {
-      auto ti = expr->outputs()[0]->as<kir::TensorIndex>();
-      if (ti->view() == tv_) {
-        is_predicated_ = is_predicated_ | predicated_ite_;
-        if (expr->predicate() != nullptr &&
-            !expr->predicate()->value()->isConst()) {
-          is_predicated_ = true;
-        }
-      }
-    }
-    kir::IrVisitor::handle(expr);
-  }
-};
-
-class UnswitchInElseChecker : public kir::IrVisitor {
- public:
-  // Checks if there are any unswitched for loops within an else clause
-  static bool check(GpuLower& gpulw) {
-    UnswitchInElseChecker checker(gpulw.kernel()->topLevelExprs());
-    return checker.found_in_else_;
-  }
-
- private:
-  UnswitchInElseChecker() = delete;
-  UnswitchInElseChecker(std::vector<Expr*> exprs) {
-    kir::IrVisitor::handle(exprs);
-  }
-
-  using kir::IrVisitor::handle;
-  bool within_else_ = false;
-  bool found_in_else_ = false;
-
-  void handle(kir::IfThenElse* ite) final {
-    auto prev_within_else = within_else_;
-    within_else_ = true;
-    kir::IrVisitor::handle(ite->elseBody().exprs());
-    within_else_ = prev_within_else;
-  }
-
-  void handle(kir::ForLoop* for_loop) final {
-    if (for_loop->iter_domain()->getParallelType() == ParallelType::Unswitch) {
-      found_in_else_ = found_in_else_ || within_else_;
-    }
-    kir::IrVisitor::handle(for_loop);
-  }
-};
-
-class PredicateMagicZeroChecker : public kir::IrVisitor {
- public:
-  // Checks if all predicated domains of the provided tv are protected with
-  // magic zero
-  static bool isProtected(TensorView* tv, GpuLower& gpulw) {
-    PredicateMagicZeroChecker checker(
-        loweredTv(tv, gpulw), gpulw.kernel()->topLevelExprs());
-    return checker.is_protected_;
-  }
-
- private:
-  using kir::IrVisitor::handle;
-
-  PredicateMagicZeroChecker(TensorView* tv, std::vector<Expr*> exprs)
-      : tv_(tv) {
-    handle(exprs);
-  }
-
-  void handle(kir::IfThenElse* ite) final {
-    auto prev_predicate = predicate_;
-    predicate_ = ite->predicate()->value();
-    kir::IrVisitor::handle(ite);
-    predicate_ = prev_predicate;
-  }
-
-  void handle(Expr* expr) final {
-    if (expr->outputs().size() && expr->outputs()[0]->isA<kir::TensorIndex>()) {
-      auto ti = expr->outputs()[0]->as<kir::TensorIndex>();
-      if (ti->view() == tv_) {
-        is_protected_ = checkPredicateOfTensor(predicate_);
-        return;
-      }
-    }
-
-    if (expr->isA<kir::ForLoop>()) {
-      handle(expr->as<kir::ForLoop>());
-    } else if (expr->isA<kir::IfThenElse>()) {
-      handle(expr->as<kir::IfThenElse>());
-    } else {
-      for (auto input : expr->inputs()) {
-        handle(input);
-      }
-    }
-  }
-
-  // Return true If all predicated domains are protected
-  bool checkPredicateOfTensor(Val* predicate) {
-    auto id_predicates = decomposeCompoundPredicate(predicate);
-    for (auto id_predicate : id_predicates) {
-      // Just check if nvfuser_zero is used. Not perfect but probably
-      // good enough.
-      is_magic_zero_found_ = false;
-      handle(id_predicate);
-      if (!is_magic_zero_found_) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  // Decompose "X && Y" to a vector of {X, Y}.
-  std::vector<Val*> decomposeCompoundPredicate(Val* predicate) {
-    if (auto binary_op = dynamic_cast<BinaryOp*>(predicate->definition())) {
-      if (binary_op->getBinaryOpType() == BinaryOpType::And) {
-        auto pred = decomposeCompoundPredicate(binary_op->lhs());
-        auto rhs_pred = decomposeCompoundPredicate(binary_op->rhs());
-        pred.insert(pred.end(), rhs_pred.begin(), rhs_pred.end());
-        return pred;
-      }
-    }
-
-    return {predicate};
-  }
-
-  void handle(Val* val) final {
-    if (isMagicZero(val)) {
-      is_magic_zero_found_ = true;
-      return;
-    }
-
-    auto def = val->definition();
-    if (def != nullptr) {
-      handle(def);
-    }
-  }
-
- private:
-  bool is_protected_ = false;
-  Val* predicate_ = nullptr;
-  TensorView* tv_ = nullptr;
-  bool is_magic_zero_found_ = false;
-};
-
-// Basically just TransformPropagator, except that it checks the consistency
-// replayPasC with getMatchedLeafPosWithoutReplayPasC, replayCasP with
-// getMatchedLeafPosWithoutReplayCasP, and fullSelfReplay with fullSelfMatching:
-// - After replayPasC, getMatchedLeafPosWithoutReplayPasC should return the same
-//   replayed position
-// - After replayCasP, getMatchedLeafPosWithoutReplayCasP should return the same
-//   replayed position
-// - After fullSelfReplay, fullSelfMatching should return true
-struct TransformPropagatorWithCheck : public TransformPropagator {
- public:
-  virtual void propagateC2P(TensorView* from, TensorView* to) override {
-    TransformPropagator::propagateC2P(from, to);
-    auto from_pos = replayed_pos_.at(from);
-    auto to_pos = replayed_pos_.at(to);
-    TORCH_CHECK(
-        TransformReplay::getMatchedLeafPosWithoutReplayPasC(
-            to, from, from_pos) == to_pos);
-  }
-  virtual void propagateP2C(TensorView* from, TensorView* to) override {
-    TransformPropagator::propagateP2C(from, to);
-    auto from_pos = replayed_pos_.at(from);
-    auto to_pos = replayed_pos_.at(to);
-    TORCH_CHECK(
-        TransformReplay::getMatchedLeafPosWithoutReplayCasP(
-            to, from, from_pos) == to_pos);
-  }
-  virtual void propagateSibling(TensorView* from, TensorView* to) override {
-    TransformPropagator::propagateSibling(from, to);
-    auto from_pos = replayed_pos_.at(from);
-    auto to_pos = replayed_pos_.at(to);
-    TORCH_CHECK(from_pos == to_pos);
-    TORCH_CHECK(TransformReplay::fullSelfMatching(from, to));
-  }
-  using TransformPropagator::TransformPropagator;
-};
-
-} // namespace
-
-// 1. Test cases are void() functions.
-// 2. They start with the prefix `test`
-
-// A few smoke tests for IrGraphGenerator
-// (These tests exercise IrGraphGenerator through a non-trivial IR,
-//  to make sure that it runs w/o crashing. The actual output is not
-//  validated)
-TEST_F(NVFuserTest, FusionIrGraphGenerator_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Make sure we can handle empty IRs
-  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
-                   &fusion, IrGraphGenerator::DetailLevel::Basic)
-                   .empty());
-
-  // Construct an interesting IR
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.141));
-  TensorView* tv3 = broadcast(tv0, {false, true, false, true});
-  TensorView* tv4 =
-      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv3);
-  TensorView* tv5 = clamp(
-      tv4, IrBuilder::create<Double>(0.f), IrBuilder::create<Double>(1.f));
-  TensorView* tv6 = add(tv2, tv2);
-
-  // Another checkpoint before adding outputs
-  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
-                   &fusion, IrGraphGenerator::DetailLevel::Explicit)
-                   .empty());
-
-  fusion.addOutput(tv6);
-
-  tv4->axis(2)->parallelize(ParallelType::BIDy);
-  tv6->merge(0);
-  tv6->split(0, 4);
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-  tv5->reorder({{-1, 0}});
-  tv2->computeAt(tv6, 1);
-
-  // Another checkpoint with more node types
-  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
-                   &fusion, IrGraphGenerator::DetailLevel::ComputeOnly)
-                   .empty());
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  // Final IR graph
-  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
-                   &fusion, IrGraphGenerator::DetailLevel::Verbose)
-                   .empty());
-}
-
-TEST_F(NVFuserTest, FusionDispatch_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  Double* f = IrBuilder::create<Double>(2.f);
-  std::stringstream ss1, ss2, ss3;
-  ss1 << f;
-  ss2 << static_cast<Val*>(f);
-  ss3 << static_cast<Statement*>(f);
-  TORCH_CHECK(
-      ss1.str().compare(ss2.str()) == 0 && ss1.str().compare(ss3.str()) == 0,
-      "Error with dispatch system where results differ by passing Double* vs Val* vs Statement*.");
-}
-
-// Evaluate basic scalar operations with constant values
-TEST_F(NVFuserTest, FusionExprEvalConstants_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  ExpressionEvaluator evaluator(&fusion);
-
-  auto* a = IrBuilder::create<Int>(7);
-  auto* b = IrBuilder::create<Int>(3);
-
-  // Avoid div operation because it casts int operands to float
-  checkIntValue(evaluator, neg(a), -7);
-  checkIntValue(evaluator, add(a, b), 10);
-  checkIntValue(evaluator, neg(mul(sub(a, b), add(a, b))), -40);
-  checkIntValue(evaluator, mod(a, b), 1);
-  checkIntValue(evaluator, ceilDiv(a, b), 3);
-}
-
-TEST_F(NVFuserTest, FusionExprEvalDouble_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-  auto ten = IrBuilder::create<Double>(10);
-  auto two = IrBuilder::create<Double>(2);
-  auto three = IrBuilder::create<Double>(3);
-  auto val = castOp(DataType::Int, ceilDiv(sub(ten, two), three));
-  auto reference = static_cast<int64_t>(std::ceil((10.0 - 2.0) / 3.0));
-  TORCH_CHECK(reference == val->evaluateInt());
-}
-
-// Evaluate basic scalar operations with bound values
-TEST_F(NVFuserTest, FusionExprEvalBindings_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  ExpressionEvaluator evaluator(&fusion);
-
-  auto* a = IrBuilder::create<Int>();
-  auto* b = IrBuilder::create<Int>();
-  auto* c = add(a, b);
-  auto* d = neg(ceilDiv(c, b));
-  auto* e = IrBuilder::create<Int>(0);
-
-  // trying to evaluate before binding should give empty results
-  TORCH_CHECK(!evaluator.evaluate(a).has_value());
-  TORCH_CHECK(!evaluator.evaluate(d).has_value());
-
-  evaluator.bind(a, 7);
-  evaluator.bind(b, 3);
-
-  // can't bind to the results of expressions
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(evaluator.bind(c, 100));
-
-  // can't bind to concrete values
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(evaluator.bind(e, 100));
-
-  checkIntValue(evaluator, c, 10);
-  checkIntValue(evaluator, sub(a, b), 4);
-  checkIntValue(evaluator, mod(a, b), 1);
-  checkIntValue(evaluator, ceilDiv(a, b), 3);
-  checkIntValue(evaluator, d, -4);
-
-  // Reset evaluation context
-  evaluator = ExpressionEvaluator(&fusion);
-
-  evaluator.bind(a, 2);
-  evaluator.bind(b, 5);
-
-  checkIntValue(evaluator, c, 7);
-  checkIntValue(evaluator, sub(a, b), -3);
-  checkIntValue(evaluator, mod(a, b), 2);
-  checkIntValue(evaluator, ceilDiv(a, b), 1);
-  checkIntValue(evaluator, d, -2);
-}
-
-// Evaluate expressions in a simple IR
-TEST_F(NVFuserTest, FusionExprEvalBasic_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Create a non-trivial IR
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = add(tv0, tv2);
-
-  fusion.addOutput(tv3);
-
-  tv3->split(0, 4);
-
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::Unroll);
-  tv3->axis(1)->parallelize(ParallelType::Unroll);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  // 1. Create an evaluator
-  ExpressionEvaluator evaluator(&fusion);
-
-  // 2. Bind values
-  //
-  // IMPORTANT:
-  // a. The bindings are only as stable as the Vals are in the fusion graph
-  // b. You must use the original (rootDomain) extents
-  //  (ex. `tv0->getRootDomain()[0]->extent()`
-  //   instead of `tv0->axis(0)->extent()`)
-  //
-  evaluator.bind(tv0->getRootDomain()[0]->extent(), 6);
-  evaluator.bind(tv0->getRootDomain()[1]->extent(), 128);
-  evaluator.bind(tv1->getRootDomain()[0]->extent(), 6);
-  evaluator.bind(tv1->getRootDomain()[1]->extent(), 128);
-
-  // 3. Evaluate and check result values
-  TORCH_CHECK(tv2->domain()->nDims() == 3);
-  checkIntValue(evaluator, tv2->axis(0)->extent(), 2);
-  checkIntValue(evaluator, tv2->axis(1)->extent(), 4);
-  checkIntValue(evaluator, tv2->axis(2)->extent(), 128);
-
-  TORCH_CHECK(tv3->domain()->nDims() == 3);
-  checkIntValue(evaluator, tv3->axis(0)->extent(), 2);
-  checkIntValue(evaluator, tv3->axis(1)->extent(), 4);
-  checkIntValue(evaluator, tv3->axis(2)->extent(), 128);
-}
-
-// Evaluate expressions in a more complex IR
-TEST_F(NVFuserTest, FusionExprEvalComplex_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
-  TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv4 = add(tv2, tv1);
-  TensorView* tv5 = add(tv4, tv3);
-  TensorView* tv6 = add(tv0, tv3);
-
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv6);
-
-  tv5->reorder({{-1, 0}});
-
-  tv6->split(0, 5);
-  tv5->merge(0);
-
-  // 1. Create an evaluator
-  ExpressionEvaluator evaluator(&fusion);
-
-  // 2. Bind values
-  evaluator.bind(tv0->getRootDomain()[0]->extent(), 129);
-  evaluator.bind(tv0->getRootDomain()[1]->extent(), 127);
-
-  // Evaluate and check extent values
-  TORCH_CHECK(tv0->domain()->nDims() == 2);
-  checkIntValue(evaluator, tv0->axis(0)->extent(), 129);
-  checkIntValue(evaluator, tv0->axis(1)->extent(), 127);
-
-  TORCH_CHECK(tv3->domain()->nDims() == 2);
-  checkIntValue(evaluator, tv3->axis(0)->extent(), 129);
-  checkIntValue(evaluator, tv3->axis(1)->extent(), 127);
-
-  TORCH_CHECK(tv4->domain()->nDims() == 2);
-  checkIntValue(evaluator, tv4->axis(0)->extent(), 129);
-  checkIntValue(evaluator, tv4->axis(1)->extent(), 127);
-
-  TORCH_CHECK(tv5->domain()->nDims() == 1);
-  checkIntValue(evaluator, tv5->axis(0)->extent(), 16383);
-
-  TORCH_CHECK(tv6->domain()->nDims() == 3);
-  checkIntValue(evaluator, tv6->axis(0)->extent(), 26);
-  checkIntValue(evaluator, tv6->axis(1)->extent(), 5);
-  checkIntValue(evaluator, tv6->axis(2)->extent(), 127);
-}
-
-// Evaluate expressions post lowering
-TEST_F(NVFuserTest, FusionExprEvalPostLower_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Create a non-trivial IR
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = add(tv0, tv2);
-
-  fusion.addOutput(tv3);
-
-  tv3->split(0, 4);
-
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::Unroll);
-  tv3->axis(1)->parallelize(ParallelType::Unroll);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  auto* bid_x = add(tv3->axis(0)->extent(), IrBuilder::create<Int>(0));
-  auto* tid_x = add(tv3->axis(-1)->extent(), IrBuilder::create<Int>(0));
-
-  // Lower
-  GpuLower gpulw(&fusion);
-
-  // 1. Create an evaluation context
-  ExpressionEvaluator evaluator(&fusion);
-
-  // 2. Bind values
-  evaluator.bind(tv0->getRootDomain()[0]->extent(), 6);
-  evaluator.bind(tv0->getRootDomain()[1]->extent(), 128);
-  evaluator.bind(tv1->getRootDomain()[0]->extent(), 6);
-  evaluator.bind(tv1->getRootDomain()[1]->extent(), 128);
-
-  // 3. Evaluate and check result values
-  TORCH_CHECK(tv2->domain()->nDims() == 3);
-  checkIntValue(evaluator, tv2->axis(0)->extent(), 2);
-  checkIntValue(evaluator, tv2->axis(1)->extent(), 4);
-  checkIntValue(evaluator, tv2->axis(2)->extent(), 128);
-
-  TORCH_CHECK(tv3->domain()->nDims() == 3);
-  checkIntValue(evaluator, tv3->axis(0)->extent(), 2);
-  checkIntValue(evaluator, tv3->axis(1)->extent(), 4);
-  checkIntValue(evaluator, tv3->axis(2)->extent(), 128);
-
-  checkIntValue(evaluator, bid_x, 2);
-  checkIntValue(evaluator, tid_x, 128);
-}
-
-// Kernel IR: Evaluate basic scalar operations with constant values
-TEST_F(NVFuserTest, FusionKernelExprEvalConstants_CUDA) {
-  Fusion fusion;
-  kir::Kernel kernel(&fusion);
-  FusionGuard fg((&kernel)->as<Fusion>());
-
-  auto a = IrBuilder::create<Int>(7);
-  auto b = IrBuilder::create<Int>(3);
-  auto c = IrBuilder::subExpr(a, b);
-  auto d = IrBuilder::divExpr(a, b);
-  auto e = IrBuilder::mulExpr(c, d);
-
-  kir::ExpressionEvaluator evaluator;
-
-  checkIntValue(evaluator, IrBuilder::negExpr(a), -7);
-  checkIntValue(evaluator, IrBuilder::addExpr(a, b), 10);
-  checkIntValue(evaluator, IrBuilder::negExpr(e), -8);
-  checkIntValue(evaluator, IrBuilder::modExpr(a, b), 1);
-  checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 3);
-}
-
-// Kernel IR: Evaluate basic scalar operations with bound values
-TEST_F(NVFuserTest, FusionKernelExprEvalBindings_CUDA) {
-  Fusion fusion;
-  kir::Kernel kernel(&fusion);
-  FusionGuard fg((&kernel)->as<Fusion>());
-
-  kir::ExpressionEvaluator evaluator;
-
-  auto a = IrBuilder::create<Int>(c10::nullopt);
-  auto b = IrBuilder::create<Int>(c10::nullopt);
-  auto c = IrBuilder::addExpr(a, b);
-  auto d = IrBuilder::negExpr(IrBuilder::ceilDivExpr(c, b));
-  auto e = IrBuilder::create<Int>(0);
-
-  // trying to evaluate before binding should give empty results
-  TORCH_CHECK(!evaluator.evaluate(a).has_value());
-  TORCH_CHECK(!evaluator.evaluate(d).has_value());
-
-  evaluator.bind(a, 7);
-  evaluator.bind(b, 3);
-
-  // can't bind to the results of expressions
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(evaluator.bind(c, 100));
-
-  // can't bind to concrete values
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(evaluator.bind(e, 100));
-
-  checkIntValue(evaluator, c, 10);
-  checkIntValue(evaluator, IrBuilder::subExpr(a, b), 4);
-  checkIntValue(evaluator, IrBuilder::modExpr(a, b), 1);
-  checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 3);
-  checkIntValue(evaluator, d, -4);
-
-  // Reset the evaluation context
-  evaluator = kir::ExpressionEvaluator();
-
-  evaluator.bind(a, 2);
-  evaluator.bind(b, 5);
-
-  checkIntValue(evaluator, c, 7);
-  checkIntValue(evaluator, IrBuilder::subExpr(a, b), -3);
-  checkIntValue(evaluator, IrBuilder::modExpr(a, b), 2);
-  checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 1);
-  checkIntValue(evaluator, d, -2);
-}
-
-TEST_F(NVFuserTest, FusionClear_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // 1. Create a dummy IR
-
-  {
-    TensorView* tv0 = makeSymbolicTensor(2);
-    TensorView* tv1 = makeSymbolicTensor(2);
-
-    fusion.addInput(tv0);
-    fusion.addInput(tv1);
-
-    TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-    TensorView* tv3 = add(tv0, tv2);
-
-    fusion.addOutput(tv3);
-
-    tv3->split(0, 4);
-    tv0->computeAt(tv3, 1);
-    tv1->computeAt(tv3, 1);
-
-    tv3->axis(0)->parallelize(ParallelType::BIDx);
-    tv2->axis(1)->parallelize(ParallelType::Unroll);
-    tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  // 2. Clear the IR
-
-  fusion.clear();
-
-  TORCH_CHECK(fusion.unordered_exprs().empty());
-  TORCH_CHECK(fusion.vals().empty());
-
-  TORCH_CHECK(fusion.inputs().empty());
-  TORCH_CHECK(fusion.outputs().empty());
-
-  TORCH_CHECK(ir_utils::getReductionOps(&fusion).empty());
-
-  // 3. Rebuild the IR
-
-  {
-    TensorView* tv0 = makeSymbolicTensor(3);
-    TensorView* tv1 = makeSymbolicTensor(3);
-    TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-    TensorView* tv3 = add(tv0, tv2);
-
-    fusion.addInput(tv0);
-    fusion.addInput(tv1);
-    fusion.addOutput(tv3);
-
-    // tv3 [i0, i1, i2]
-    tv3->reorder({{0, 2}, {2, 0}});
-    // tv3 [i2, i1, i0]
-    tv3->split(-1, 4);
-    // tv3 [i2, i1, i0outer, i0inner{4}]
-    tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
-    // tv3 [i0outer, i0inner{4}, i1, i2]
-    tv0->computeAt(tv3, -1);
-    tv1->computeAt(tv3, -1);
-    tv3->axis(1)->parallelize(ParallelType::BIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input1 = at::randn({16, 8, 8}, options);
-  at::Tensor input2 = at::randn_like(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1, input2});
-  auto outputs = fe.runFusion({input1, input2});
-
-  at::Tensor tv2_ref = input2 + 2.0;
-  at::Tensor output_ref = input1 + tv2_ref;
-
-  TORCH_CHECK(output_ref.equal(outputs[0]));
-}
-
-TEST_F(NVFuserTest, FusionCopy_CUDA) {
-  Fusion original_fusion;
-
-  // Create the test IR
-  {
-    FusionGuard fg(&original_fusion);
-
-    auto tv0 = makeSymbolicTensor(3);
-    auto tv1 = makeSymbolicTensor(3);
-    auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-    auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2);
-
-    original_fusion.addInput(tv0);
-    original_fusion.addInput(tv1);
-    original_fusion.addOutput(tv3);
-
-    tv3->reorder({{0, 2}, {2, 0}});
-    tv3->split(-1, 4);
-    tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
-
-    tv0->computeAt(tv3, -1);
-    tv1->computeAt(tv3, -1);
-
-    tv3->axis(0)->parallelize(ParallelType::BIDx);
-    tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  // Test copy before lowering
-  Fusion clone = original_fusion;
-
-  // Compare IR dumps
-  std::stringstream original_ir;
-  std::stringstream clone_ir;
-  original_ir << original_fusion;
-  clone_ir << clone;
-  ASSERT_EQ(original_ir.str(), clone_ir.str());
-
-  // Lower original fusion
-  std::string original_kernel;
-  {
-    // TODO(kir): remove this guard once we implement the cuda codegen visitor
-    FusionGuard fg(&original_fusion);
-    original_kernel =
-        codegen::generateCudaKernel(GpuLower(&original_fusion).kernel());
-  }
-
-  // Make sure the "before lowering" clone was not mutated
-  // while lowering the original fusion IR
-  std::stringstream before_lowering_ir;
-  before_lowering_ir << clone;
-  ASSERT_EQ(original_ir.str(), before_lowering_ir.str());
-
-  // Test copy after lowering (including assignment operator)
-  Fusion before_lowering = clone;
-  clone = original_fusion;
-
-  // Compare IR dumps
-  std::stringstream original_lowered_ir;
-  std::stringstream clone_lowered_ir;
-  original_lowered_ir << original_fusion;
-  clone_lowered_ir << clone;
-  ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str());
-
-  // Lower the "before lowering" and compare kernels
-  std::string clone_kernel;
-  {
-    // TODO(kir): remove this guard once we implement the cuda codegen visitor
-    FusionGuard fg(&before_lowering);
-    clone_kernel =
-        codegen::generateCudaKernel(GpuLower(&before_lowering).kernel());
-  }
-  ASSERT_EQ(original_kernel, clone_kernel);
-}
-
-TEST_F(NVFuserTest, FusionMove_CUDA) {
-  Fusion fusion;
-
-  // Create the test IR
-  {
-    FusionGuard fg(&fusion);
-
-    auto tv0 = makeSymbolicTensor(3);
-    auto tv1 = makeSymbolicTensor(3);
-    auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-    auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2);
-
-    fusion.addInput(tv0);
-    fusion.addInput(tv1);
-    fusion.addOutput(tv3);
-
-    tv3->reorder({{0, 2}, {2, 0}});
-    tv3->split(-1, 4);
-    tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
-
-    tv0->computeAt(tv3, -1);
-    tv1->computeAt(tv3, -1);
-
-    tv3->axis(0)->parallelize(ParallelType::BIDx);
-    tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  std::stringstream original_ir;
-  original_ir << fusion;
-
-  // Test move before lowering
-  Fusion another_fusion = std::move(fusion);
-
-  // Check that the original fusion is "empty"
-  //
-  // IMPORTANT: these checks assume knowledge of the internal
-  //    implementation of the move operations. General uses
-  //    should only assume that the moved-from object is in
-  //    a valid, but unspecified state. This is similar to the
-  //    standard library containers:
-  //    https://en.cppreference.com/w/cpp/utility/move
-  //
-  TORCH_CHECK(fusion.unordered_exprs().empty());
-  TORCH_CHECK(fusion.vals().empty());
-  TORCH_CHECK(fusion.inputs().empty());
-  TORCH_CHECK(fusion.outputs().empty());
-
-  // clear() has no pre-conditions so it's valid to call on a moved-from object
-  fusion.clear();
-
-  // Compare IR dumps
-  std::stringstream another_ir;
-  another_ir << another_fusion;
-  ASSERT_EQ(original_ir.str(), another_ir.str());
-
-  // Lower the fusion IR
-  GpuLower lower(&another_fusion);
-
-  std::stringstream lowered_ir;
-  lowered_ir << another_fusion;
-
-  // Test move assignment after lowering
-  fusion = std::move(another_fusion);
-
-  // Compare IR dumps
-  std::stringstream moved_lowered_ir;
-  moved_lowered_ir << fusion;
-  ASSERT_EQ(lowered_ir.str(), moved_lowered_ir.str());
-}
-
-TEST_F(NVFuserTest, FusionSimpleArith_CUDA) {
-  std::stringstream ss1, ss2;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  Double* d1 = IrBuilder::create<Double>(1.f);
-  Double* d2 = IrBuilder::create<Double>(2.f);
-  Double* d3 = IrBuilder::create<Double>();
-
-  // Disrupt the fusion to make sure guard works well
-  {
-    Fusion fusion2;
-    FusionGuard fg(&fusion2);
-
-    Double* d1 = IrBuilder::create<Double>(1.f);
-    Double* d2 = IrBuilder::create<Double>(2.f);
-    add(d1, d2);
-    ss2 << fusion2;
-  }
-
-  IrBuilder::create<BinaryOp>(BinaryOpType::Add, d3, d1, d2);
-  ss1 << fusion;
-
-  TORCH_CHECK(
-      ss1.str().compare(ss2.str()) == 0,
-      "Error where explicit add nodes don't match implicit add nodes.");
-}
-
-TEST_F(NVFuserTest, FusionScalarTypePromote_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  Bool* b = IrBuilder::create<Bool>(true);
-  Double* d = IrBuilder::create<Double>(4.f);
-  Int* i = IrBuilder::create<Int>(3);
-  ComplexDouble* c =
-      IrBuilder::create<ComplexDouble>(c10::complex<double>(1, 2));
-
-  TORCH_CHECK(add(b, b)->getDataType() == DataType::Bool);
-  TORCH_CHECK(add(b, d)->getDataType() == DataType::Double);
-  TORCH_CHECK(add(b, i)->getDataType() == DataType::Int);
-  TORCH_CHECK(add(b, c)->getDataType() == DataType::ComplexDouble);
-
-  TORCH_CHECK(add(d, b)->getDataType() == DataType::Double);
-  TORCH_CHECK(add(d, d)->getDataType() == DataType::Double);
-  TORCH_CHECK(add(d, i)->getDataType() == DataType::Double);
-  TORCH_CHECK(add(d, c)->getDataType() == DataType::ComplexDouble);
-
-  TORCH_CHECK(add(i, b)->getDataType() == DataType::Int);
-  TORCH_CHECK(add(i, d)->getDataType() == DataType::Double);
-  TORCH_CHECK(add(i, i)->getDataType() == DataType::Int);
-  TORCH_CHECK(add(i, c)->getDataType() == DataType::ComplexDouble);
-
-  TORCH_CHECK(add(c, b)->getDataType() == DataType::ComplexDouble);
-  TORCH_CHECK(add(c, d)->getDataType() == DataType::ComplexDouble);
-  TORCH_CHECK(add(c, i)->getDataType() == DataType::ComplexDouble);
-  TORCH_CHECK(add(c, c)->getDataType() == DataType::ComplexDouble);
-}
-
-TEST_F(NVFuserTest, FusionComplexAbsTypes_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto options = at::TensorOptions().device(at::kCUDA, 0);
-  auto tensor_cf = at::randn({4, 4, 4}, options.dtype(at::kComplexFloat));
-  auto tensor_cd = at::randn({4, 4, 4}, options.dtype(at::kComplexDouble));
-
-  auto type_cf = TensorType::create(tensor_cf);
-  auto tv_cf = IrBuilder::create<TensorView>(type_cf);
-  auto type_cd = TensorType::create(tensor_cd);
-  auto tv_cd = IrBuilder::create<TensorView>(type_cd);
-
-  TORCH_CHECK(
-      tensor_cf.abs().scalar_type() ==
-      data_type_to_aten(abs(tv_cf)->getDataType().value()));
-  TORCH_CHECK(
-      tensor_cd.abs().scalar_type() ==
-      data_type_to_aten(abs(tv_cd)->getDataType().value()));
-}
-
-TEST_F(NVFuserTest, FusionRegister_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-  Double* v1 = IrBuilder::create<Double>(1.f);
-  Double* v2 = IrBuilder::create<Double>(2.f);
-  Val* v3 = binaryOp(BinaryOpType::Add, v1, v2);
-  Val* v4 = binaryOp(BinaryOpType::Add, v1, v2);
-  TORCH_CHECK(v1->name() + 1 == v2->name());
-  TORCH_CHECK(v2->name() + 1 == v3->name());
-  TORCH_CHECK(v3->name() + 1 == v4->name());
-  TORCH_CHECK(v3->definition()->name() + 1 == v4->definition()->name());
-}
-
-// dummy expr with 2 outputs only for toposort test.
-struct DummyExpr : public Expr {
-  ~DummyExpr() = default;
-  DummyExpr(
-      IrBuilderPasskey passkey,
-      Val* _outlhs,
-      Val* _outrhs,
-      Val* _lhs,
-      Val* _rhs)
-      : Expr(passkey, ExprType::UnaryOp) // Not terribly safe...
-  {
-    addOutput(_outlhs);
-    addOutput(_outrhs);
-    addInput(_lhs);
-    addInput(_rhs);
-  }
-  DummyExpr(const DummyExpr& other) = delete;
-  DummyExpr& operator=(const DummyExpr& other) = delete;
-  DummyExpr(DummyExpr&& other) = delete;
-  DummyExpr& operator=(DummyExpr&& other) = delete;
-};
-
-TEST_F(NVFuserTest, FusionTopoSort_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // e0: v3, v2 = dummy(v1, v0)
-  // e1: v4     =   add(v3, v2)
-  // e2: v5     =   add(v2, v4)
-  // e3: v6     =   add(v5, v5)
-  Double* v0 = IrBuilder::create<Double>();
-  Double* v1 = IrBuilder::create<Double>();
-  Double* v2 = IrBuilder::create<Double>();
-  Double* v3 = IrBuilder::create<Double>();
-  Double* v4 = IrBuilder::create<Double>();
-  Double* v5 = IrBuilder::create<Double>();
-  Double* v6 = IrBuilder::create<Double>();
-
-  std::vector<Val*> inputs = {v0, v1};
-  for (auto val : inputs) {
-    fusion.addInput(val);
-  }
-
-  Expr* e0 = IrBuilder::create<DummyExpr>(v3, v2, v1, v0);
-  Expr* e1 = IrBuilder::create<BinaryOp>(BinaryOpType::Add, v4, v3, v2);
-  Expr* e2 = IrBuilder::create<BinaryOp>(BinaryOpType::Add, v5, v2, v4);
-  Expr* e3 = IrBuilder::create<BinaryOp>(BinaryOpType::Add, v6, v5, v5);
-
-  fusion.addOutput(v2);
-  fusion.addOutput(v3);
-  auto exprs = fusion.exprs();
-  TORCH_CHECK(exprs.size() == 1, "Found ", exprs.size(), " but expecting 1");
-  TORCH_CHECK(exprs[0] == e0);
-
-  fusion.addOutput(v5);
-  exprs = fusion.exprs();
-  TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3");
-  TORCH_CHECK(exprs[0] == e0);
-  TORCH_CHECK(exprs[1] == e1);
-  TORCH_CHECK(exprs[2] == e2);
-
-  fusion.addOutput(v4);
-  exprs = fusion.exprs();
-  TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3");
-  TORCH_CHECK(exprs[0] == e0);
-  TORCH_CHECK(exprs[1] == e1);
-  TORCH_CHECK(exprs[2] == e2);
-
-  fusion.addOutput(v6);
-  exprs = fusion.exprs();
-  TORCH_CHECK(exprs.size() == 4, "Found ", exprs.size(), " but expecting 4");
-  TORCH_CHECK(exprs[0] == e0);
-  TORCH_CHECK(exprs[1] == e1);
-  TORCH_CHECK(exprs[2] == e2);
-  TORCH_CHECK(exprs[3] == e3);
-
-  TORCH_CHECK(v2->definition()->name() == 0);
-  TORCH_CHECK(v3->definition()->name() == 0);
-  TORCH_CHECK(v4->definition()->name() == 1);
-  TORCH_CHECK(v5->definition()->name() == 2);
-  TORCH_CHECK(v6->definition()->name() == 3);
-}
-
-TEST_F(NVFuserTest, FusionTensor_CUDA) {
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  {
-    auto tensor = at::randn({2, 3, 4, 5}, options);
-    auto tensor_type = TensorType::create(tensor);
-    auto fuser_tensor = IrBuilder::create<TensorView>(tensor_type);
-    TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim());
-    TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float);
-    TORCH_CHECK(fuser_tensor->domain() != nullptr);
-    for (const auto i : c10::irange(fuser_tensor->nDims())) {
-      // size 1 dimension are makred as broadcast
-      TORCH_CHECK(
-          fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1));
-      // check contiguity information;
-      TORCH_CHECK(fuser_tensor->domain()->contiguity()[i]);
-    }
-  }
-
-  // TensorType::create fills stride_properties, which helps us to mark
-  // IterDomain properly
-  // Note: implementation could change, depending on how much we want to invest
-  // in our home-brew contiguity coalescing. For now let's make sure that we
-  // properly test what we are using.
-  {
-    auto tensor = at::randn({4, 4, 4}, options);
-    auto sliced_tensor = tensor.slice(1, 0, -1, 2);
-
-    auto tensor_type = TensorType::create(sliced_tensor);
-    auto fuser_tensor = IrBuilder::create<TensorView>(tensor_type);
-    TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim());
-    TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float);
-    TORCH_CHECK(fuser_tensor->domain() != nullptr);
-    for (const auto i : c10::irange(fuser_tensor->nDims())) {
-      // size 1 dimension are makred as broadcast
-      TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false);
-    }
-    TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]);
-    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]);
-    TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]);
-  }
-
-  {
-    auto tensor = at::randn({2, 3, 4, 5}, options);
-    auto permuted_tensor = tensor.permute({0, 3, 1, 2});
-    auto tensor_type = TensorType::create(permuted_tensor);
-    auto fuser_tensor = IrBuilder::create<TensorView>(tensor_type);
-    TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim());
-    TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float);
-    TORCH_CHECK(fuser_tensor->domain() != nullptr);
-    for (const auto i : c10::irange(fuser_tensor->nDims())) {
-      // size 1 dimension are makred as broadcast
-      TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false);
-    }
-    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]);
-    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]);
-    TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]);
-    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[3]);
-  }
-}
-
-TEST_F(NVFuserTest, FusionFilterVals_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  auto tv1 = makeSymbolicTensor(1);
-  auto scalar0 = IrBuilder::create<Double>(0);
-  auto scalar1 = IrBuilder::create<Int>(0);
-  auto scalar2 = IrBuilder::create<Int>(1);
-
-  const std::vector<Val*> vals = {tv0, scalar0, tv1, scalar1, scalar2};
-
-  std::vector<TensorView*> tvs(
-      ir_utils::filterByType<TensorView>(vals).begin(),
-      ir_utils::filterByType<TensorView>(vals).end());
-  TORCH_CHECK(tvs.size() == 2);
-  TORCH_CHECK(tvs[0] == tv0);
-  TORCH_CHECK(tvs[1] == tv1);
-
-  std::vector<Double*> floats(
-      ir_utils::filterByType<Double>(vals).begin(),
-      ir_utils::filterByType<Double>(vals).end());
-  TORCH_CHECK(floats.size() == 1);
-  TORCH_CHECK(floats[0] == scalar0);
-
-  std::vector<Int*> ints(
-      ir_utils::filterByType<Int>(vals).begin(),
-      ir_utils::filterByType<Int>(vals).end());
-  TORCH_CHECK(ints.size() == 2);
-  TORCH_CHECK(ints[0] == scalar1);
-  TORCH_CHECK(ints[1] == scalar2);
-
-  TORCH_CHECK(
-      ir_utils::filterByType<Expr>(vals).begin() ==
-          ir_utils::filterByType<Expr>(vals).end(),
-      "Not expecting any results");
-}
-
-TEST_F(NVFuserTest, FusionTVSplit_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv = makeSymbolicTensor(3);
-
-  tv = tv->split(2, 2);
-  TORCH_CHECK(tv->nDims() == 4);
-  Expr* outer = tv->axis(2)->extent()->definition();
-
-  TORCH_CHECK(
-      outer->getExprType().value() == ExprType::BinaryOp &&
-      static_cast<BinaryOp*>(outer)->getBinaryOpType() ==
-          BinaryOpType::CeilDiv &&
-      static_cast<BinaryOp*>(outer)->lhs()->sameAs(
-          tv->getRootDomain()[2]->extent()) &&
-      static_cast<Int*>(static_cast<BinaryOp*>(outer)->rhs())
-          ->sameAs(IrBuilder::create<Int>(2)));
-
-  IterDomain* inner = static_cast<IterDomain*>(tv->axis(3));
-  TORCH_CHECK(
-      inner->extent()->isScalar() &&
-      static_cast<Int*>(inner->extent())->isConst() &&
-      static_cast<Int*>(inner->extent())->value().value() == 2);
-}
-
-TEST_F(NVFuserTest, FusionTVMerge_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv = makeSymbolicTensor(3);
-
-  tv = tv->merge(1);
-  Expr* axisOp = tv->axis(1)->extent()->definition();
-
-  TORCH_CHECK(
-      tv->nDims() == 2 && axisOp->getExprType() == ExprType::BinaryOp &&
-      static_cast<BinaryOp*>(axisOp)->getBinaryOpType() == BinaryOpType::Mul &&
-      static_cast<BinaryOp*>(axisOp)->lhs() ==
-          tv->getRootDomain()[1]->extent() &&
-      static_cast<BinaryOp*>(axisOp)->rhs() ==
-          tv->getRootDomain()[2]->extent());
-}
-
-TEST_F(NVFuserTest, FusionTVReorder_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::unordered_map<int, int> shift_right{{-1, 0}};
-
-  std::unordered_map<int, int> shift_left{{0, -1}};
-
-  std::unordered_map<int, int> shift_left_2{{0, -1}, {1, 0}, {2, 1}};
-
-  std::unordered_map<int, int> swap{{0, 2}, {2, 0}};
-
-  auto tv = makeSymbolicTensor(3);
-  std::vector<IterDomain*> ref;
-  ref = std::vector<IterDomain*>(
-      tv->domain()->domain().begin(), tv->domain()->domain().end());
-
-  tv->reorder(shift_left);
-  for (const auto i : c10::irange(tv->nDims())) {
-    TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1)));
-  }
-
-  tv = makeSymbolicTensor(3);
-  ref = std::vector<IterDomain*>(
-      tv->domain()->domain().begin(), tv->domain()->domain().end());
-
-  tv->reorder(shift_left);
-  for (const auto i : c10::irange(tv->nDims())) {
-    TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1)));
-  }
-
-  tv = makeSymbolicTensor(3);
-  ref = std::vector<IterDomain*>(
-      tv->domain()->domain().begin(), tv->domain()->domain().end());
-
-  tv->reorder(shift_right);
-  TORCH_CHECK(ref[ref.size() - 1]->sameAs(tv->axis(0)));
-  for (const auto i : c10::irange(1, tv->nDims())) {
-    TORCH_CHECK(ref[i - 1]->sameAs(tv->axis(i)));
-  }
-
-  tv = makeSymbolicTensor(3);
-  ref = std::vector<IterDomain*>(
-      tv->domain()->domain().begin(), tv->domain()->domain().end());
-  tv->reorder(swap);
-  TORCH_CHECK(ref[0]->sameAs(tv->axis(2)));
-  TORCH_CHECK(ref[2]->sameAs(tv->axis(0)));
-  TORCH_CHECK(ref[1]->sameAs(tv->axis(1)));
-}
-
-TEST_F(NVFuserTest, FusionEquality_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  Double* fval1 = IrBuilder::create<Double>();
-  Double* fval1_copy = fval1;
-  Double* fval2 = IrBuilder::create<Double>();
-  Double* fone = IrBuilder::create<Double>(1.0);
-
-  TORCH_CHECK(fval1->sameAs(fval1_copy));
-  TORCH_CHECK(!fval1->sameAs(fval2));
-  TORCH_CHECK(!fone->sameAs(fval1));
-  TORCH_CHECK(fone->sameAs(IrBuilder::create<Double>(1.0)));
-
-  Int* ival1 = IrBuilder::create<Int>();
-  Int* ival1_copy = ival1;
-  Int* ival2 = IrBuilder::create<Int>();
-  Int* ione = IrBuilder::create<Int>(1);
-
-  TORCH_CHECK(ival1->sameAs(ival1_copy));
-  TORCH_CHECK(!ival1->sameAs(ival2));
-  TORCH_CHECK(!ione->sameAs(ival1));
-  TORCH_CHECK(ione->sameAs(IrBuilder::create<Int>(1)));
-
-  BinaryOp* add1 = IrBuilder::create<BinaryOp>(
-      BinaryOpType::Add, IrBuilder::create<Double>(), fval1, ival1);
-  BinaryOp* add1_copy = IrBuilder::create<BinaryOp>(
-      BinaryOpType::Add, IrBuilder::create<Double>(), fval1, ival1);
-  BinaryOp* sub1 = IrBuilder::create<BinaryOp>(
-      BinaryOpType::Sub, IrBuilder::create<Double>(), fval1, ival1);
-
-  UnaryOp* neg1 = IrBuilder::create<UnaryOp>(
-      UnaryOpType::Neg, IrBuilder::create<Double>(), fval1);
-  UnaryOp* neg2 = IrBuilder::create<UnaryOp>(
-      UnaryOpType::Neg, IrBuilder::create<Double>(), fval2);
-  UnaryOp* neg1_copy = IrBuilder::create<UnaryOp>(
-      UnaryOpType::Neg, IrBuilder::create<Double>(), fval1);
-
-  TORCH_CHECK(add1->sameAs(add1_copy));
-  TORCH_CHECK(!add1->sameAs(sub1));
-
-  TORCH_CHECK(neg1->sameAs(neg1_copy));
-  TORCH_CHECK(!static_cast<Expr*>(neg1)->sameAs(add1));
-  TORCH_CHECK(!neg1->sameAs(neg2));
-}
-
-TEST_F(NVFuserTest, FusionDependency_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  Double* d0 = IrBuilder::create<Double>(0.f);
-  Double* d1 = IrBuilder::create<Double>(1.f);
-  auto d2 = add(d0, d1);
-
-  auto d3 = add(d2, d2);
-
-  Double* d4 = IrBuilder::create<Double>(4.f);
-  Double* d5 = IrBuilder::create<Double>(5.f);
-  auto d6 = add(d4, d5);
-
-  Double* d7 = IrBuilder::create<Double>(7.f);
-  Double* d8 = IrBuilder::create<Double>(8.f);
-  auto d9 = add(d7, d8);
-
-  auto d10 = add(d6, d9);
-
-  auto d11 = add(d3, d10);
-
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d11));
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d1, d11));
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d11));
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d3, d11));
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d6, d11));
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d9, d11));
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d2));
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d3));
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d4, d6));
-  TORCH_CHECK(DependencyCheck::isDependencyOf(d8, d10));
-
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d0));
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d1));
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d2));
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d3));
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d4));
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d5));
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d2, d0));
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d3, d2));
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d6, d4));
-  TORCH_CHECK(!DependencyCheck::isDependencyOf(d10, d8));
-
-  auto dep_chain = DependencyCheck::getSingleDependencyChain(d0, d11);
-  TORCH_CHECK(dep_chain.back() == d11);
-  dep_chain.pop_back();
-  TORCH_CHECK(dep_chain.back() == d3);
-  dep_chain.pop_back();
-  TORCH_CHECK(dep_chain.back() == d2);
-  dep_chain.pop_back();
-
-  dep_chain = DependencyCheck::getSingleDependencyChain(d6, d11);
-  TORCH_CHECK(dep_chain.back() == d11);
-  dep_chain.pop_back();
-  TORCH_CHECK(dep_chain.back() == d10);
-  dep_chain.pop_back();
-
-  dep_chain = DependencyCheck::getSingleDependencyChain(d4, d11);
-  TORCH_CHECK(dep_chain.back() == d11);
-  dep_chain.pop_back();
-  TORCH_CHECK(dep_chain.back() == d10);
-  dep_chain.pop_back();
-  TORCH_CHECK(dep_chain.back() == d6);
-  dep_chain.pop_back();
-
-  dep_chain = DependencyCheck::getSingleDependencyChain(d11, d2);
-  TORCH_CHECK(dep_chain.empty());
-}
-
-TEST_F(NVFuserTest, FusionParser_CUDA) {
-  // This test may not pass if using a custom block sync as there may
-  // be additional calls. Skip the test as it's not specifically
-  // relevant with block synchronizatin.
-  if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) {
-    return;
-  }
-  auto g = std::make_shared<Graph>();
-  const auto graph0_string = R"IR(
-    graph(%0 : Float(2, strides=[1]),
-          %1 : Float(2, strides=[1])):
-      %c0 : Float(2, strides=[1]) = aten::mul(%0, %1)
-      %d0 : Float(2, strides=[1]) = aten::mul(%c0, %0)
-      return (%d0))IR";
-  parseIR(graph0_string, g.get());
-
-  // strides are not yet supported in the irparser.
-  for (auto val : g->block()->inputs()) {
-    if (val->isCompleteTensor())
-      val->setType(val->type()->castRaw<TensorType>()->contiguous());
-  }
-  for (auto node : g->block()->nodes()) {
-    for (auto val : node->outputs()) {
-      if (val->isCompleteTensor())
-        val->setType(val->type()->castRaw<TensorType>()->contiguous());
-    }
-  }
-
-  auto fusion = parseJitIR(g);
-  FusionGuard fg(fusion.get());
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  // Avoid vectorization here as those kernels can't be lowered twice at the
-  // moment
-  at::Tensor input1 = at::randn({16}, options);
-  at::Tensor input2 = at::randn({16}, options);
-  auto lparams = schedulePointwise(fusion.get(), {input1, input2});
-
-  // CONSIDER:
-  // 1. this can be moved to a dedicated "golden" file
-  // 2. use a fuzzy compare (ignore non-significant whitespaces for example)
-  const std::string expected_kernel = R"(
-__global__ void CUDAGeneratedKernel(Tensor<float, 1> T0, Tensor<float, 1> T1, Tensor<float, 1> T3) {
-  int64_t i50;
-  i50 = (((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x);
-  if ((i50 < T0.size[0])) {
-    float T5[1];
-    T5[0] = 0;
-    T5[0]
-       = T1[i50];
-    float T4[1];
-    T4[0] = 0;
-    T4[0]
-       = T0[i50];
-    float T2[1];
-    T2[0]
-      = T4[0]
-      * T5[0];
-    float T6[1];
-    T6[0]
-      = T2[0]
-      * T4[0];
-    T3[i50]
-       = T6[0];
-  }
-}
-)";
-
-  const std::string actual_kernel =
-      "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel());
-  if (expected_kernel.size() != actual_kernel.size() ||
-      expected_kernel.compare(actual_kernel) != 0) {
-    std::cerr
-        << " Codegen mismatch, codegen possibly changed, or is incorrect. "
-        << " \n ========= EXPECTED ========= \n"
-        << expected_kernel << "\n========= ACTUAL ========== \n"
-        << actual_kernel << "\n=================" << std::endl;
-    auto it = std::mismatch(
-        expected_kernel.begin(),
-        expected_kernel.end(),
-        actual_kernel.begin(),
-        actual_kernel.end());
-    std::string actual_mismatched_snippet(it.second, actual_kernel.end());
-    actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10);
-    std::string expected_mismatched_snippet(it.first, expected_kernel.end());
-    expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10);
-    std::cerr << "First mismatch found at: " << actual_mismatched_snippet
-              << ", expected: " << expected_mismatched_snippet << std::endl;
-    TORCH_CHECK(false);
-  }
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1, input2}, lparams);
-  auto outputs = fe.runFusion({input1, input2}, lparams);
-  at::Tensor output_ref = input1 * input2 * input1;
-  TORCH_CHECK(output_ref.equal(outputs[0]));
-}
-
-TEST_F(NVFuserTest, FusionOuterSplit_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(3);
-
-  IrBuilder::create<BinaryOp>(
-      BinaryOpType::Add,
-      tv0,
-      IrBuilder::create<Double>(0.0),
-      IrBuilder::create<Double>(1.0));
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(3.0));
-  fusion.addOutput(tv2);
-
-  //[I0, I1, I2]
-  tv2->split(-1, 4, false);
-  //[I0, I1, I2o{4}, I2i]
-  tv2->merge(0);
-  tv2->merge(0);
-  //[I0*I1*I2o{4}, I2i]
-  tv2->split(0, 2);
-  //[I0*I1*I2o{4}o, I0*I1*I2o{4}i{2}, I2i]
-  tv2->reorder({{0, 1}, {1, 0}});
-  // I0*I1*I2o{4}i{2}, [I0*I1*I2o{4}o, I2i]
-
-  tv0->computeAt(tv2, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor output = at::empty({2, 6, 32}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-  fe.runFusion({}, {output});
-
-  at::Tensor output_ref = at::zeros_like(output, options);
-  output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0;
-
-  TORCH_CHECK(output_ref.equal(output));
-}
-
-TEST_F(NVFuserTest, FusionCodeGen_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(3);
-
-  IrBuilder::create<BinaryOp>(
-      BinaryOpType::Add,
-      tv0,
-      IrBuilder::create<Double>(0.0),
-      IrBuilder::create<Double>(1.0));
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(3.0));
-  fusion.addOutput(tv2);
-
-  //[I0, I1, I2]
-  tv2 = tv2->split(0, 4);
-  //[I0o, I0i{4}, I1, I2]
-  tv2 = tv2->merge(1);
-  //[I0o, I0i{4}*I1, I2]
-  tv2 = tv2->split(-1, 2);
-  //[I0o, I0i{4}*I1, I2o, I2i{2}]
-  tv2 = tv2->reorder({{0, 1}, {1, 0}, {3, 2}});
-  //[I0i{4}*I1, I0o, I2i{2}, I2o]
-
-  tv0->computeAt(tv2, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor output = at::empty({16, 8, 8}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-  fe.runFusion({}, {output});
-
-  at::Tensor output_ref = at::zeros_like(output, options);
-  output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0;
-
-  TORCH_CHECK(output_ref.equal(output));
-}
-
-TEST_F(NVFuserTest, FusionCodeGen2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(3);
-  TensorView* tv1 = makeSymbolicTensor(3);
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = add(tv0, tv2);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv3);
-
-  //[I0, I1, I2]
-  tv3->reorder({{0, 2}, {2, 0}});
-  //[I2, I1, I0]
-  tv3->split(-1, 4);
-  //[I2, I1, I0o, I0i{4}]
-  tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
-  // I0o, I0i{4}, I1, I2]
-
-  tv0->computeAt(tv3, -1);
-  tv1->computeAt(tv3, -1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input1 = at::randn({16, 8, 8}, options);
-  at::Tensor input2 = at::randn_like(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1, input2});
-  auto outputs = fe.runFusion({input1, input2});
-
-  at::Tensor tv2_ref = input2 + 2.0;
-  at::Tensor output_ref = input1 + tv2_ref;
-
-  TORCH_CHECK(output_ref.equal(outputs[0]));
-}
-
-TEST_F(NVFuserTest, FusionSimplePWise_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-  // dimensionality of the problem
-  int nDims = 3;
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeContigTensor(nDims);
-  TensorView* tv1 = makeContigTensor(nDims);
-
-  // Register your inputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  // Do math with it, it returns a `Val*` but can be static_casted back to
-  // TensorView
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = add(tv0, tv2);
-
-  // Register your outputs
-  fusion.addOutput(tv3);
-
-  // Do transformations, remember, transformations are outputs to inputs
-  // This doesn't have to be in this order
-  tv3->merge(1);
-  tv3->merge(0);
-
-  // Split by n_threads
-  tv3->split(0, 128);
-  tv3->split(0, 4);
-
-  // For all inputs, computeAt the output inline, temporaries should be squeezed
-  // between them
-  tv0->computeAt(tv3, -1);
-  tv1->computeAt(tv3, -1);
-
-  // Parallelize TV3
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(-2)->parallelize(ParallelType::Unroll);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input1 = at::randn({64, 2, 128}, options);
-  at::Tensor input2 = at::rand_like(input1);
-  at::Tensor output = at::empty_like(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1, input2});
-  fe.runFusion({input1, input2}, {output});
-
-  at::Tensor tv2_ref = input2 + 2.0;
-  at::Tensor output_ref = input1 + tv2_ref;
-
-  TORCH_CHECK(output_ref.equal(output));
-}
-
-TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // requires ampere+ GPU
-  if (!deviceMajorMinorCheck(8)) {
-    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
-    return;
-  }
-
-  auto tv0 = makeContigTensor(1);
-
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-
-  fusion.addOutput(tv1);
-
-  auto tv_cache = tv0->cacheAfter(LoadStoreOpType::CpAsync);
-  tv_cache->setMemoryType(MemoryType::Shared);
-
-  tv1->split(0, 16);
-  tv0->computeAt(tv1, 1);
-
-  tv_cache->circularBuffer(10);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({255}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1});
-  auto cg_outputs = fe.runFusion({input1});
-
-  testValidate(&fusion, cg_outputs, {input1}, {input1}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSimplePWiseDtypeComplex_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-  // dimensionality of the problem
-  int nDims = 3;
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeContigTensor(nDims, DataType::ComplexFloat);
-  TensorView* tv1 = makeContigTensor(nDims, DataType::ComplexFloat);
-
-  // Register your inputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  // Do math with it, it returns a `Val*` but can be static_casted back to
-  // TensorView
-  c10::complex<double> scalar1(2.0, 3.0);
-  TensorView* tv2 = add(tv1, IrBuilder::create<ComplexDouble>(scalar1));
-  TensorView* tv3 = add(tv0, tv2);
-
-  // Register your outputs
-  fusion.addOutput(tv3);
-
-  // Do transformations, remember, transformations are outputs to inputs
-  // This doesn't have to be in this order
-  tv3->merge(1);
-  tv3->merge(0);
-
-  // Split by n_threads
-  tv3->split(0, 128);
-  tv3->split(0, 4);
-
-  // For all inputs, computeAt the output inline, temporaries should be squeezed
-  // between them
-  tv0->computeAt(tv3, -1);
-  tv1->computeAt(tv3, -1);
-
-  // Parallelize TV3
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(-2)->parallelize(ParallelType::Unroll);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  auto options =
-      at::TensorOptions().dtype(at::kComplexFloat).device(at::kCUDA, 0);
-
-  at::Tensor input1 = at::randn({64, 2, 128}, options);
-  at::Tensor input2 = at::rand_like(input1);
-  at::Tensor output = at::empty_like(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1, input2});
-  fe.runFusion({input1, input2}, {output});
-
-  at::Tensor tv2_ref = input2 + scalar1;
-  at::Tensor output_ref = input1 + tv2_ref;
-
-  TORCH_CHECK(output_ref.equal(output));
-}
-
-TEST_F(NVFuserTest, FusionExecKernel_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  // Register your inputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  // Do math with it, it returns a `Val*` but can be static_casted back to
-  // TensorView
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = add(tv0, tv2);
-
-  // Register your outputs
-  fusion.addOutput(tv3);
-
-  tv3->merge(0);
-  tv3->split(0, 128);
-  tv3->split(0, 4);
-
-  // For all inputs, computeAt the output inline, temporaries should be squeezed
-  // between them
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  // Parallelize TV3
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::Unroll);
-  tv3->axis(1)->parallelize(ParallelType::Unroll);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input1 = at::ones({1, 128}, options);
-  at::Tensor input2 = at::ones_like(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1, input2});
-  auto outputs = fe.runFusion({input1, input2});
-
-  at::Tensor check = at::full({1, 128}, 4, options);
-  ;
-  TORCH_CHECK(outputs[0].equal(check));
-}
-
-int ceilDiv_(int a, int b) {
-  return (a + b - 1) / b;
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAt1_CUDA) {
-  // Case 1
-  // tv1 = tv0 * 0.5
-  // tv2 = tv1 * -1
-  // tv3 = tv1 + 3
-  // tv4 = tv1 * 2
-  // tv5 = tv3 + tv2
-  // tv6 = tv5 + tv4
-  // tv7 = tv1 + tv4
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
-  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0));
-  TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv5 = add(tv3, tv2);
-
-  TensorView* tv6 = add(tv5, tv4);
-  TensorView* tv7 = add(tv1, tv4);
-
-  fusion.addOutput(tv6);
-  fusion.addOutput(tv7);
-
-  // Lets setup to actually run
-  tv7->merge(0);
-  tv7->split(0, 128);
-  tv7->split(0, 4);
-
-  tv7->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv0->computeAt(tv7, 1);
-
-  ComputeAtMap ca_map(&fusion);
-
-  // The this-position of the last tensor should be zero.
-  TORCH_CHECK(
-      tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
-      tv7->getMaxProducerPosition() == 1);
-  TORCH_CHECK(
-      tv7->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
-      tv6->getMaxProducerPosition() == 1);
-  // The position of every other tensor should be 1.
-  for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
-    TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
-
-    TORCH_CHECK(
-        ca_map.areMapped(tv7->axis(0), tv->axis(0), IdMappingMode::PERMISSIVE));
-  }
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({129, 127}, options);
-
-  auto t1 = aten_input.mul({0.5});
-  auto t2 = t1.mul({-1.0});
-  auto t3 = t1.add({3.0});
-  auto t4 = t1.mul({2.0});
-  auto t5 = t3.add(t2);
-  auto t6 = t5.add(t4);
-  auto t7 = t1.add(t4);
-
-  std::vector<at::Tensor> aten_outputs = {t6, t7};
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAt2_CUDA) {
-  // Case 2
-  // tv1 = tv0 * -1
-  // tv2 = tv0 + 3
-  // tv3 = tv0 * 2
-  // tv4 = tv2 + tv1
-  // tv5 = tv4 + tv3
-  // tv6 = tv5 + tv3
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
-  TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv4 = add(tv2, tv1);
-
-  TensorView* tv5 = add(tv4, tv3);
-  TensorView* tv6 = add(tv5, tv3);
-
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv6);
-
-  // Lets setup to actually run
-  tv6->merge(0);
-  tv6->split(0, 128);
-  tv6->split(0, 4);
-
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv0->computeAt(tv6, 1);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({129, 127}, options);
-
-  auto t1 = input.mul({-1.0});
-  auto t2 = input.add({3.0});
-  auto t3 = input.mul({2.0});
-  auto t4 = t2.add(t1);
-  auto t5 = t4.add(t3);
-  auto t6 = t5.add(t3);
-
-  std::vector<at::Tensor> aten_outputs = {t5, t6};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAt3_CUDA) {
-  // Case 3
-  // T2 = T1 * 0.979361
-  // T3 = T2 * T0
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = makeSymbolicTensor(4);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361));
-  TensorView* tv3 = mul(tv2, tv0);
-
-  fusion.addOutput(tv3);
-
-  // Lets setup to actually run
-  while (tv3->nDims() > 1)
-    tv3->merge(0);
-  tv3->split(0, 128);
-  tv3->split(0, 4);
-
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  auto t2 = t1.mul({0.979361});
-  auto aten_output = t2.mul(t0);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  at::Tensor cg_output = at::empty_like(t0, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  fe.runFusion(aten_inputs, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAt4_CUDA) {
-  // Case 4
-  // T4 = T2 - T3
-  // T5 = T1 + T4
-  // T6 = T5 - T0
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = makeSymbolicTensor(4);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = makeSymbolicTensor(4);
-  fusion.addInput(tv2);
-
-  TensorView* tv3 = makeSymbolicTensor(4);
-  fusion.addInput(tv3);
-
-  TensorView* tv4 = sub(tv2, tv3);
-  TensorView* tv5 = add(tv1, tv4);
-  TensorView* tv6 = sub(tv5, tv0);
-
-  fusion.addOutput(tv6);
-
-  // Lets setup to actually run
-  while (tv6->nDims() > 1)
-    tv6->merge(0);
-  tv6->split(0, 128);
-  tv6->split(0, 4);
-
-  tv0->computeAt(tv6, 1);
-  tv1->computeAt(tv6, 1);
-  tv2->computeAt(tv6, 1);
-  tv3->computeAt(tv6, 1);
-
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-  at::Tensor t2 = at::rand_like(t0, options);
-  at::Tensor t3 = at::rand_like(t0, options);
-
-  auto t4 = t2.sub(t3);
-  auto t5 = t1.add(t4);
-  auto aten_output = t5.sub(t0);
-
-  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAt5_CUDA) {
-  // Case 5
-  // tv2 = tv0 + 2.0
-  // tv3 = tv1 * tv2
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = mul(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  tv3->merge(0);
-  tv3->split(-1, 8);
-  tv3->split(-1, 4);
-
-  tv2->computeAt(tv3, 1);
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  auto t2 = t0.add(2.0);
-  auto aten_output = t1.mul(t2);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAt6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = mul(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  tv2->merge(0);
-  tv2->split(-1, 8);
-  tv2->split(-1, 4);
-  tv3->merge(0);
-  tv3->split(-1, 8);
-
-  tv2->computeAt(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  auto t2 = t0.add(2.0);
-  auto aten_output = t1.mul(t2);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAt7_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-
-  auto tv2 = makeSymbolicTensor(1);
-  fusion.addInput(tv2);
-
-  auto tv3 = add(tv2, IrBuilder::create<Double>(3.0));
-
-  auto tv4 = add(tv1, tv3);
-  fusion.addOutput(tv4);
-
-  auto tv5 = broadcast(tv1, {false, true});
-
-  auto tv6 = makeSymbolicTensor(2);
-  fusion.addInput(tv6);
-
-  auto tv7 = mul(tv5, tv6);
-
-  fusion.addOutput(tv7);
-
-  tv7->split(1, 2);
-  tv7->merge(0);
-  tv7->split(0, 4);
-  tv7->split(0, 128);
-
-  tv7->axis(0)->parallelize(ParallelType::BIDx);
-  tv7->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv0->computeAt(tv7, 1);
-  auto tv5_domain = tv5->domain()->domain();
-
-  // These computeAt transformations should not affect the TV5 domain
-  tv0->computeAt(tv4, -1);
-  tv2->computeAt(tv4, -1);
-
-  auto tv5_domain_current = tv5->domain()->domain();
-  TORCH_CHECK(tv5_domain == tv5_domain_current, "Invalid TV5 domain");
-
-  const int numel_x = 100;
-  const int numel_y = 200;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({numel_x}, options);
-  auto t2 = at::randn({numel_x}, options);
-  auto t6 = at::randn({numel_x, numel_y}, options);
-
-  auto t1 = t0.add(1.0);
-  auto t3 = t2.add(3.0);
-  auto t4 = t1.add(t3);
-  auto t5 = t1.unsqueeze(1);
-  auto t7 = t5.mul(t6);
-
-  std::vector<IValue> aten_inputs = {t0, t2, t6};
-  std::vector<at::Tensor> aten_outputs = {t4, t7};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAt8_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-
-  auto tv2 = makeSymbolicTensor(1);
-  fusion.addInput(tv2);
-
-  auto tv3 = add(tv2, IrBuilder::create<Double>(3.0));
-
-  auto tv4 = add(tv1, tv3);
-  fusion.addOutput(tv4);
-
-  auto tv5 = broadcast(tv1, {false, true});
-
-  auto tv6 = makeSymbolicTensor(2);
-  fusion.addInput(tv6);
-
-  auto tv7 = mul(tv5, tv6);
-
-  fusion.addOutput(tv7);
-
-  tv7->split(1, 2);
-  tv7->merge(0);
-  tv7->split(0, 128, false);
-  tv7->split(0, 4, false);
-
-  tv7->axis(0)->parallelize(ParallelType::BIDx);
-  tv7->axis(1)->parallelize(ParallelType::TIDx);
-
-  // Reverse computeAt structure from previous test
-  tv0->computeAt(tv4, -1);
-  tv2->computeAt(tv4, -1);
-  tv0->computeAt(tv7, -1);
-
-  const int numel_x = 100;
-  const int numel_y = 200;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({numel_x}, options);
-  auto t2 = at::randn({numel_x}, options);
-  auto t6 = at::randn({numel_x, numel_y}, options);
-
-  auto t1 = t0.add(1.0);
-  auto t3 = t2.add(3.0);
-  auto t4 = t1.add(t3);
-  auto t5 = t1.unsqueeze(1);
-  auto t7 = t5.mul(t6);
-
-  std::vector<IValue> aten_inputs = {t0, t2, t6};
-  std::vector<at::Tensor> aten_outputs = {t4, t7};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeWith1_CUDA) {
-  // Case 1
-  // tv1 = tv0 * 0.5
-  // tv2 = tv1 * -1
-  // tv3 = tv1 + 3
-  // tv4 = tv1 * 2
-  // tv5 = tv3 + tv2
-  // tv6 = tv5 + tv4
-  // tv7 = tv1 + tv4
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
-  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0));
-  TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv5 = add(tv3, tv2);
-
-  TensorView* tv6 = add(tv5, tv4);
-  TensorView* tv7 = add(tv1, tv4);
-
-  fusion.addOutput(tv6);
-  fusion.addOutput(tv7);
-
-  // Lets setup to actually run
-  tv0->merge(0);
-  tv0->split(0, 128);
-  tv0->split(0, 4);
-
-  tv0->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv0->computeWith(tv7, 1);
-
-  GpuLower gpulw(&fusion);
-
-  // The this-position of the last tensor should be zero.
-  TORCH_CHECK(
-      tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
-      tv7->getMaxProducerPosition() == 1);
-  TORCH_CHECK(
-      tv7->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
-      tv6->getMaxProducerPosition() == 1);
-
-  ComputeAtMap ca_map(&fusion);
-
-  // The position of every other tensor should be 1.
-  for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
-    TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
-    TORCH_CHECK(
-        ca_map.areMapped(tv7->axis(0), tv->axis(0), IdMappingMode::PERMISSIVE));
-  }
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({129, 127}, options);
-
-  auto t1 = aten_input.mul({0.5});
-  auto t2 = t1.mul({-1.0});
-  auto t3 = t1.add({3.0});
-  auto t4 = t1.mul({2.0});
-  auto t5 = t3.add(t2);
-  auto t6 = t5.add(t4);
-  auto t7 = t1.add(t4);
-
-  std::vector<at::Tensor> aten_outputs = {t6, t7};
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeWith2_CUDA) {
-  // Case 2
-  // tv1 = tv0 * -1
-  // tv2 = tv0 + 3
-  // tv3 = tv0 * 2
-  // tv4 = tv2 + tv1
-  // tv5 = tv4 + tv3
-  // tv6 = tv5 + tv3
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
-  TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv4 = add(tv2, tv1);
-
-  TensorView* tv5 = add(tv4, tv3);
-  TensorView* tv6 = add(tv5, tv3);
-
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv6);
-
-  // Lets setup to actually run
-  tv0->merge(0);
-  tv0->split(0, 128);
-  tv0->split(0, 4);
-
-  tv0->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv0->computeWith(tv6, 1);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({129, 127}, options);
-
-  auto t1 = input.mul({-1.0});
-  auto t2 = input.add({3.0});
-  auto t3 = input.mul({2.0});
-  auto t4 = t2.add(t1);
-  auto t5 = t4.add(t3);
-  auto t6 = t5.add(t3);
-
-  std::vector<at::Tensor> aten_outputs = {t5, t6};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeWith3_CUDA) {
-  // Case 3
-  // T2 = T1 * 0.979361
-  // T3 = T2 * T0
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = makeSymbolicTensor(4);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361));
-  TensorView* tv3 = mul(tv2, tv0);
-
-  fusion.addOutput(tv3);
-
-  // Lets setup to actually run
-  while (tv0->nDims() > 1)
-    tv0->merge(0);
-  tv0->split(0, 128);
-  tv0->split(0, 4);
-
-  while (tv1->nDims() > 1)
-    tv1->merge(0);
-  tv1->split(0, 128);
-  tv1->split(0, 4);
-
-  tv0->computeWith(tv3, 1);
-  tv1->computeWith(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  auto t2 = t1.mul({0.979361});
-  auto aten_output = t2.mul(t0);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  at::Tensor cg_output = at::empty_like(t0, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  fe.runFusion(aten_inputs, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeWith4_CUDA) {
-  // Case 4
-  // T4 = T2 - T3
-  // T5 = T1 + T4
-  // T6 = T5 - T0
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = makeSymbolicTensor(4);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = makeSymbolicTensor(4);
-  fusion.addInput(tv2);
-
-  TensorView* tv3 = makeSymbolicTensor(4);
-  fusion.addInput(tv3);
-
-  TensorView* tv4 = sub(tv2, tv3);
-  TensorView* tv5 = add(tv1, tv4);
-  TensorView* tv6 = sub(tv5, tv0);
-
-  fusion.addOutput(tv6);
-  std::vector<TensorView*> tvs = {tv0, tv1, tv2};
-  for (auto tv : tvs) {
-    // Lets setup to actually run
-    while (tv->nDims() > 1) {
-      tv->merge(0);
-    }
-    tv->split(0, 128);
-    tv->split(0, 4);
-    tv->computeWith(tv6, 1);
-  }
-
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-  at::Tensor t2 = at::rand_like(t0, options);
-  at::Tensor t3 = at::rand_like(t0, options);
-
-  auto t4 = t2.sub(t3);
-  auto t5 = t1.add(t4);
-  auto aten_output = t5.sub(t0);
-
-  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeWith5_CUDA) {
-  // Case 5
-  // tv2 = tv0 + 2.0
-  // tv3 = tv1 * tv2
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = mul(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  tv2->merge(0);
-  tv2->split(-1, 8);
-  tv2->split(-1, 4);
-
-  tv2->computeWith(tv3, 1);
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  auto t2 = t0.add(2.0);
-  auto aten_output = t1.mul(t2);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeWith6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = mul(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  tv2->merge(0);
-  tv2->split(-1, 8);
-  tv2->split(-1, 4);
-  tv3->merge(0);
-  tv3->split(-1, 8);
-
-  tv2->computeWith(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  auto t2 = t0.add(2.0);
-  auto aten_output = t1.mul(t2);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) {
-  // tv1 = tv0 * 0.5
-  // tv2 = tv1 * -1
-  // tv3 = tv2 * -2
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
-  TensorView* tv3 = mul(tv1, IrBuilder::create<Double>(-2.0));
-  fusion.addOutput(tv2);
-  fusion.addOutput(tv3);
-
-  // This computeAt will affect tv2 as well, even though tv2 is not in
-  // the data-flow path between tv1 and tv3. The reason is that tv1 is
-  // now computed at tv3, so tv2 must also be computed at the same
-  // location. Overall, what will happen is basically we merge
-  // expressions of all tensors and compute them in a single loop
-  // nest.
-  TensorView* computeAtTarget = tv3;
-  computeAtTarget->split(0, 128);
-  tv1->computeAt(computeAtTarget, 1);
-
-  TensorView* affected_tensors[] = {tv1, tv2, tv3};
-  for (auto tv : affected_tensors) {
-    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
-  }
-
-  GpuLower gpulw(&fusion);
-
-  TORCH_CHECK(tv1->getComputeAtPosition() == 1);
-  TORCH_CHECK(
-      tv2->getComputeAtPosition() == 0 && tv2->getMaxProducerPosition() == 1);
-  TORCH_CHECK(
-      tv3->getComputeAtPosition() == 0 && tv3->getMaxProducerPosition() == 1);
-
-  ComputeAtMap ca_map(&fusion);
-
-  // Note that tv2 is also computed at tv3.
-  for (auto tv : {tv1, tv2}) {
-    TORCH_CHECK(ca_map.areMapped(
-        tv->axis(0), computeAtTarget->axis(0), IdMappingMode::PERMISSIVE));
-  }
-
-  TORCH_CHECK(tv3->getComputeAtPosition() == 0);
-
-  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
-  for (auto tv : affected_tensors) {
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({1000}, options);
-
-  auto t1 = aten_input * 0.5;
-  auto t2 = t1 * -1.0;
-  auto t3 = t1 * -2.0;
-
-  std::vector<at::Tensor> aten_outputs = {t2, t3};
-
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-// Similar to ComputeAtMultiConsumers, but with a common consumer.
-TEST_F(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) {
-  // tv1 = tv0 * 0.5
-  // tv2 = tv1 * -1
-  // tv3 = tv2 * -2
-  // tv4 = tv2 + tv3
-  // tv5 = tv4 * 5
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
-  TensorView* tv3 = mul(tv1, IrBuilder::create<Double>(-2.0));
-  TensorView* tv4 = add(tv2, tv3);
-  TensorView* tv5 = mul(tv4, IrBuilder::create<Double>(5.0));
-  fusion.addOutput(tv3);
-  fusion.addOutput(tv4);
-  fusion.addOutput(tv5);
-
-  // Computing tv1 at tv3. This will affect tv2 as discussed in
-  // ComplexComputeAt1. Additionally, in this case, notice that tv4 is
-  // the common consumer of tv2 and tv3, so they are computed at
-  // tv4. The indirect propagation of the computeAt should stop at the
-  // common consumer, and no further change should occur. More
-  // specifically, the computeAT position of tv4 and tv5 should be zero.
-  TensorView* computeAtTarget = tv3;
-  computeAtTarget->split(0, 128);
-  tv1->computeAt(computeAtTarget, 1);
-
-  TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4};
-  for (auto tv : affected_tensors) {
-    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
-  }
-
-  TORCH_CHECK(tv1->getComputeAtPosition() == 1);
-  TORCH_CHECK(tv2->getComputeAtPosition() == 1);
-  TORCH_CHECK(tv3->getComputeAtPosition() == 1);
-  TORCH_CHECK(tv4->getComputeAtPosition() == 0);
-  TORCH_CHECK(tv5->getComputeAtPosition() == 0);
-
-  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
-
-  for (auto tv : affected_tensors) {
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  // Transform tv5 to make it look like the rest
-  tv5->split(0, 128);
-  tv5->axis(1)->parallelize(ParallelType::TIDx);
-  tv5->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({1000}, options);
-
-  auto t1 = aten_input * 0.5;
-  auto t2 = t1 * -1.0;
-  auto t3 = t1 * -2.0;
-  auto t4 = t2 + t3;
-  auto t5 = t4 * 5.0;
-
-  std::vector<at::Tensor> aten_outputs = {t3, t4, t5};
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) {
-  // tv1 = tv0 * 0.5
-  // tv2 = tv1 * -1
-  // tv3 = tv2 * -1
-  // tv4 = tv1 + 4
-  // tv5 = tv3 + tv4
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
-  TensorView* tv3 = mul(tv2, IrBuilder::create<Double>(-1.0));
-  TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4.0));
-  TensorView* tv5 = add(tv3, tv4);
-
-  fusion.addOutput(tv5);
-
-  TensorView* computeAtTarget = tv3;
-
-  computeAtTarget->merge(0);
-  computeAtTarget->split(0, 128);
-  computeAtTarget->split(0, 4);
-
-  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
-
-  // This computeAt will affect all tensors including tv3, tv4 and
-  // tv5, even though it appears to impact only tv1 and tv2. The
-  // reason is that tv1 is now computed at tv3, so tv4 must also be
-  // computed at the same location. Similarly, the consumer of tv4,
-  // tv5, must also be computed at the same location. Overall, what
-  // will happen is basically we merge expressions of all tensors and
-  // compute them in a single loop nest. Internally, this will be
-  // realized by making all tensors, except for those in the path
-  // between tv1 and tv3, computed at tv5, which we call the common
-  // consumer.
-  tv1->computeAt(computeAtTarget, 1);
-
-  // All tensors should have the same dimenionality as the target
-  for (Val* val : fusion.vals()) {
-    if (val->isFusionInput() ||
-        val->getValType().value() != ValType::TensorView) {
-      continue;
-    }
-    TensorView* tv = val->as<TensorView>();
-    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
-    if (tv == tv5) {
-      TORCH_CHECK(tv->getComputeAtPosition() == 0);
-    } else {
-      TORCH_CHECK(tv->getComputeAtPosition() == 1);
-    }
-  }
-
-  for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
-    if (!tv->isFusionInput()) {
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({129, 127}, options);
-
-  auto t1 = aten_input.mul({0.5});
-  auto t2 = t1.mul({-1.0});
-  auto t3 = t2.mul({-1.0});
-  auto t4 = t1.add({4.0});
-  auto aten_output = t3 + t4;
-
-  at::Tensor cg_output = at::empty_like(aten_input, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Similar to the above common consumer test but adds an additional
-// tensor that has no common consumer with the other tensors.
-TEST_F(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) {
-  // tv1 = tv0 * 0.5
-  // tv2 = tv1 * -1
-  // tv3 = tv2 * -1
-  // tv4 = tv1 + 4
-  // tv5 = tv2 + tv3
-  // tv6 = tv1 + 6
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
-  TensorView* tv3 = mul(tv2, IrBuilder::create<Double>(-1.0));
-  TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4.0));
-  TensorView* tv5 = add(tv3, tv4);
-  TensorView* tv6 = add(tv1, IrBuilder::create<Double>(6.0));
-
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv6);
-
-  TensorView* computeAtTarget = tv3;
-
-  computeAtTarget->merge(0);
-  computeAtTarget->split(0, 128);
-  computeAtTarget->split(0, 4);
-
-  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
-
-  // This will have the same impact on the tensors except for tv5 and
-  // tv6. tv6 does not have any common consumer with the computeAt
-  // target, but since it uses tv1, it must be also computed at the
-  // same location as the other impacted tensors. We can either make
-  // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5
-  // should be computed at tv6 just because the current implementation
-  // orders the computeAt relationship based on the order in which
-  // tensors are specified as outputs.
-
-  tv1->computeAt(computeAtTarget, 1);
-
-  // All tensors should have the same dimenionality as the target
-  for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
-    if (tv->isFusionInput()) {
-      continue;
-    }
-    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
-    if (tv == tv5 || tv == tv6) {
-      TORCH_CHECK(tv->getComputeAtPosition() == 0);
-      TORCH_CHECK(tv->getMaxProducerPosition() == 1);
-    } else {
-      TORCH_CHECK(tv->getComputeAtPosition() == 1);
-    }
-  }
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = val->as<TensorView>();
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({129, 127}, options);
-
-  auto t1 = aten_input.mul({0.5});
-  auto t2 = t1.mul({-1.0});
-  auto t3 = t2.mul({-1.0});
-  auto t4 = t1.add({4.0});
-  auto t5 = t3 + t4;
-  auto t6 = t1.add({6.0});
-
-  std::vector<at::Tensor> aten_outputs = {t5, t6};
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-// Similar to ComputeAtCommonConsumer1 but with an addtiona ltensor
-// that does not have data dependency with the consumer.
-TEST_F(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) {
-  // tv1 = tv0 * 0.5
-  // tv2 = tv1 * -1
-  // tv3 = tv1 * -2
-  // tv4 = tv2 + tv3
-  // tv5 = tv4 * 5
-  // tv6 = tv1 * 6
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
-  TensorView* tv3 = mul(tv1, IrBuilder::create<Double>(-2.0));
-  TensorView* tv4 = add(tv2, tv3);
-  TensorView* tv5 = mul(tv4, IrBuilder::create<Double>(5.0));
-  // Notice that tv6 is not a consumer of tv4.
-  TensorView* tv6 = mul(tv1, IrBuilder::create<Double>(6.0));
-  fusion.addOutput(tv3);
-  fusion.addOutput(tv4);
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv6);
-
-  TensorView* computeAtTarget = tv3;
-  computeAtTarget->split(0, 128);
-  tv1->computeAt(computeAtTarget, 1);
-
-  TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv5, tv6};
-  for (auto tv : affected_tensors) {
-    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
-    if (tv == tv6 || tv == tv5) {
-      TORCH_CHECK(tv->getComputeAtPosition() == 0);
-    } else {
-      TORCH_CHECK(tv->getComputeAtPosition() == 1);
-    }
-  }
-
-  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
-
-  for (auto tv : affected_tensors) {
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({1000}, options);
-
-  auto t1 = aten_input * 0.5;
-  auto t2 = t1 * -1.0;
-  auto t3 = t1 * -2.0;
-  auto t4 = t2 + t3;
-  auto t5 = t4 * 5.0;
-  auto t6 = t1 * 6.0;
-
-  std::vector<at::Tensor> aten_outputs = {t3, t4, t5, t6};
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-namespace {
-
-void checkIdMapped(
-    ComputeAtRootDomainMap& root_map,
-    TensorView* v0,
-    IterDomain* id0,
-    TensorView* v1,
-    IterDomain* id1,
-    bool should_map) {
-  if (should_map) {
-    TORCH_CHECK(
-        root_map.canMap(v0->domain(), id0, v1->domain(), id1),
-        "Should be mappable: ",
-        id0,
-        " of ",
-        v0,
-        " and ",
-        id1,
-        " of ",
-        v1);
-  } else {
-    TORCH_CHECK(
-        !root_map.canMap(v0->domain(), id0, v1->domain(), id1),
-        "Should not be mappable: ",
-        id0,
-        " of ",
-        v0,
-        " and ",
-        id1,
-        " of ",
-        v1);
-  }
-}
-
-void checkIdMapped(
-    TensorView* v0,
-    const std::vector<IterDomain*>& root0,
-    const std::vector<bool> should_map0,
-    TensorView* v1,
-    const std::vector<IterDomain*>& root1,
-    const std::vector<bool> should_map1) {
-  ComputeAtRootDomainMap map;
-  map.build();
-  TORCH_INTERNAL_ASSERT(root0.size() == should_map0.size());
-  TORCH_INTERNAL_ASSERT(root1.size() == should_map1.size());
-  size_t idx0 = 0;
-  for (const auto i : c10::irange(root0.size())) {
-    size_t idx1 = 0;
-    for (const auto j : c10::irange(root1.size())) {
-      if (should_map0[i] && should_map1[j] && idx0 == idx1) {
-        checkIdMapped(map, v0, root0[i], v1, root1[j], true);
-      } else {
-        checkIdMapped(map, v0, root0[i], v1, root1[j], false);
-      }
-      if (should_map1[j])
-        ++idx1;
-    }
-    if (should_map0[i])
-      ++idx0;
-  }
-}
-
-void checkIdMapped(
-    TensorView* v0,
-    const std::vector<IterDomain*>& root0,
-    TensorView* v1,
-    const std::vector<IterDomain*>& root1) {
-  checkIdMapped(
-      v0,
-      root0,
-      std::vector<bool>(root0.size(), true),
-      v1,
-      root1,
-      std::vector<bool>(root1.size(), true));
-}
-
-} // namespace
-
-TEST_F(NVFuserTest, FusionRootMappingBasic_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  auto tv3 = broadcast(tv0, {true, false, false});
-  auto tv4 = broadcast(tv1, {false, true, false});
-  auto tv5 = add(tv3, tv4);
-  fusion.addOutput(tv5);
-
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, true},
-      tv4,
-      tv4->getRootDomain(),
-      {false, true, true});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, true},
-      tv4,
-      tv4->getRootDomain(),
-      {true, false, true});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {false, true},
-      tv1,
-      tv1->getRootDomain(),
-      {false, true});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, true},
-      tv5,
-      tv5->getRootDomain(),
-      {false, true, true});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, true},
-      tv5,
-      tv5->getRootDomain(),
-      {true, false, true});
-  checkIdMapped(tv3, tv3->getRootDomain(), tv4, tv4->getRootDomain());
-  checkIdMapped(tv3, tv3->getRootDomain(), tv5, tv5->getRootDomain());
-  checkIdMapped(tv4, tv4->getRootDomain(), tv5, tv5->getRootDomain());
-}
-
-TEST_F(NVFuserTest, FusionRootMappingRfactor_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // [I,I]
-  TensorView* tv0 = makeSymbolicTensor(2);
-  // [I,I,I]
-  TensorView* tv1 = makeSymbolicTensor(3);
-
-  //[I,I,R]
-  auto tv2 = sum(tv1, {2});
-  auto tv3 = add(tv2, tv0);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv3);
-
-  // scheduling:
-  //[B,I,R0,R1=128], root = [B,I,R]
-  tv2->split(2, 128);
-
-  // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf]
-  auto tv4 = tv2->rFactor({3});
-
-  checkIdMapped(tv1, tv1->getRootDomain(), tv4, tv4->getRootDomain());
-  checkIdMapped(
-      tv4,
-      tv4->getRFactorDomain(),
-      {true, true, true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, true, true});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, true, false});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, true, false},
-      tv3,
-      tv3->getRootDomain(),
-      {true, true});
-  checkIdMapped(
-      tv2,
-      tv2->getRootDomain(),
-      {true, true, false},
-      tv3,
-      tv3->getRootDomain(),
-      {true, true});
-  checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain());
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, true},
-      tv1,
-      tv1->getRootDomain(),
-      {true, true, false});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, true},
-      tv2,
-      tv2->getRootDomain(),
-      {true, true, false});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, true},
-      tv4,
-      tv4->getRFactorDomain(),
-      {true, true, false, false});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, true},
-      tv4,
-      tv4->getRootDomain(),
-      {true, true, false});
-}
-
-TEST_F(NVFuserTest, FusionRootMappingReductionDependency1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  fusion.addOutput(tv2);
-
-  // The second dimension cannot be mapped as it would require recomputation.
-  checkIdMapped(tv0, tv0->getRootDomain(), tv1, tv1->getRootDomain());
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-}
-
-TEST_F(NVFuserTest, FusionRootMappingReductionDependency2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = add(tv0, tv2);
-  fusion.addOutput(tv3);
-
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, false},
-      tv1,
-      tv1->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, false},
-      tv3,
-      tv3->getRootDomain(),
-      {true, false});
-  checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain());
-}
-
-TEST_F(NVFuserTest, FusionRootMappingReductionDependency3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  fusion.addOutput(tv2);
-
-  tv1->split(-1, 4);
-  auto tv3 = tv1->rFactor({-2});
-
-  checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain());
-  checkIdMapped(
-      tv3,
-      tv3->getMaybeRFactorDomain(),
-      {true, false, true},
-      tv1,
-      tv1->getRootDomain(),
-      {true, true});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-}
-
-TEST_F(NVFuserTest, FusionRootMappingReductionDependency4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = add(tv0, tv2);
-  fusion.addOutput(tv3);
-
-  tv1->split(-1, 4);
-  auto tv4 = tv1->rFactor({-2});
-
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, false},
-      tv4,
-      tv4->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv4,
-      tv4->getMaybeRFactorDomain(),
-      {true, false, true},
-      tv1,
-      tv1->getRootDomain(),
-      {true, true});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-  checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain());
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-}
-
-// Reproducer of issue #749
-TEST_F(NVFuserTest, FusionRootMappingReductionDependency5_CUDA_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {1});
-  auto tv3 = broadcast(tv2, {false, true});
-  auto tv4 = add(tv0, tv3);
-  auto tv5 = add(tv4, tv1);
-  fusion.addOutput(tv5);
-
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, false},
-      tv1,
-      tv1->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv2,
-      tv2->getRootDomain(),
-      {true, false},
-      tv3,
-      tv3->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv3,
-      tv3->getRootDomain(),
-      {true, true},
-      tv4,
-      tv4->getRootDomain(),
-      {true, true});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, false},
-      tv4,
-      tv4->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv4,
-      tv4->getRootDomain(),
-      {true, true},
-      tv5,
-      tv5->getRootDomain(),
-      {true, true});
-}
-
-// Similar to RootMappingReductionDependency5 but with rFactor
-TEST_F(NVFuserTest, FusionRootMappingReductionDependency6_CUDA_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {1});
-  auto tv3 = broadcast(tv2, {false, true});
-  auto tv4 = add(tv0, tv3);
-  auto tv5 = add(tv4, tv1);
-  fusion.addOutput(tv5);
-
-  tv2->split(1, 4);
-  auto tv6 = tv2->rFactor({-1});
-
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, false},
-      tv1,
-      tv1->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv6,
-      tv6->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv6,
-      tv6->getMaybeRFactorDomain(),
-      {true, true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, true});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv2,
-      tv2->getRootDomain(),
-      {true, false},
-      tv3,
-      tv3->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv3,
-      tv3->getRootDomain(),
-      {true, true},
-      tv4,
-      tv4->getRootDomain(),
-      {true, true});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true, false},
-      tv4,
-      tv4->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv4,
-      tv4->getRootDomain(),
-      {true, true},
-      tv5,
-      tv5->getRootDomain(),
-      {true, true});
-}
-
-TEST_F(NVFuserTest, FusionRootMappingMultipleBroadcast_CUDA) {
-  if (at::cuda::getCurrentDeviceProperties()->major >= 8) {
-    GTEST_SKIP() << "Somehow it fails on sm_80+ GPUs"
-                 << " See https://github.com/pytorch/pytorch/issues/86717";
-  }
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  auto tv1 = broadcast(tv0, {false, true});
-  auto tv2 = broadcast(tv0, {true, false});
-  auto tv3 = add(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  // tv0 cannot be mapped with the consumers as it would mean its only
-  // domain would be mapped to both the first and second domains of
-  // the two consumers, thus computing tv0 at both corresponding loops.
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {false},
-      tv1,
-      tv1->getRootDomain(),
-      {false, false});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {false},
-      tv2,
-      tv2->getRootDomain(),
-      {false, false});
-  checkIdMapped(tv1, tv1->getRootDomain(), tv3, tv3->getRootDomain());
-  checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain());
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {false},
-      tv3,
-      tv3->getRootDomain(),
-      {false, false});
-}
-
-TEST_F(
-    NVFuserTest,
-    FusionRootMappingMultipleBroadcastWithNoCommonConsumer_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  auto tv1 = broadcast(tv0, {false, true});
-  auto tv2 = broadcast(tv0, {true, false});
-  fusion.addOutput(tv1);
-  fusion.addOutput(tv2);
-
-  // If there is no common consumer, there is no recomputation constraint.
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true},
-      tv1,
-      tv1->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true},
-      tv2,
-      tv2->getRootDomain(),
-      {false, true});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {false, true});
-}
-
-TEST_F(NVFuserTest, FusionRootMappingBroadcastNonUniqueSize_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  auto tv2 = makeSymbolicTensor(2);
-  fusion.addInput(tv2);
-  auto tv3 = broadcast(tv0, {false, true});
-  auto tv4 = add(tv1, tv3);
-  fusion.addOutput(tv4);
-  auto tv5 = add(tv2, tv3);
-  fusion.addOutput(tv5);
-
-  // Broadcast domains can be used with multiple domains with
-  // different sizes. In this test, the broadcast domain of tv3 has
-  // two consumers, tv4 and tv5, which may have different sizes. Each
-  // of the consumers is used with the broadcast domain of tv3, but
-  // the two consumers may not have the same size, it is not possible
-  // to map those domains.
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true},
-      tv3,
-      tv3->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true},
-      tv1,
-      tv1->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv2,
-      tv2->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, false},
-      tv3,
-      tv3->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv2,
-      tv2->getRootDomain(),
-      {true, false},
-      tv3,
-      tv3->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv3,
-      tv3->getRootDomain(),
-      {true, false},
-      tv4,
-      tv4->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv3,
-      tv3->getRootDomain(),
-      {true, false},
-      tv5,
-      tv5->getRootDomain(),
-      {true, false});
-  checkIdMapped(
-      tv4,
-      tv4->getRootDomain(),
-      {true, false},
-      tv5,
-      tv5->getRootDomain(),
-      {true, false});
-}
-
-TEST_F(NVFuserTest, FusionRootMappingBroadcast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  // tv0[I0]
-  fusion.addInput(tv0);
-  auto tv1 = broadcast(tv0, {true, false});
-  // tv1[B1, I0]
-  auto tv2 = broadcast(tv1, {true, false, false});
-  // tv2[B2, B1, I0]
-  fusion.addOutput(tv2);
-
-  // In this case, tv1 and tv2 has one and two broadcast domains,
-  // respectively. It is the second broadcast domain that is mapped to
-  // the broadcast of tv1.
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true},
-      tv1,
-      tv1->getRootDomain(),
-      {false, true});
-  checkIdMapped(
-      tv1,
-      tv1->getRootDomain(),
-      {true, true},
-      tv2,
-      tv2->getRootDomain(),
-      {false, true, true}); // Not {true, false, true}
-  checkIdMapped(
-      tv0,
-      tv0->getRootDomain(),
-      {true},
-      tv2,
-      tv2->getRootDomain(),
-      {false, false, true});
-}
-
-// Reproducer of issue #723
-TEST_F(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  auto tv1 = makeSymbolicTensor(2);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = broadcast(tv0, {true, false});
-  auto tv3 = sum(tv2, {0});
-  auto tv4 = add(tv2, tv1);
-
-  fusion.addOutput(tv3);
-  fusion.addOutput(tv4);
-
-  ComputeAtRootDomainMap map;
-  map.build();
-
-  checkIdMapped(
-      map, tv2, tv2->getRootDomain()[0], tv4, tv4->getRootDomain()[0], true);
-  checkIdMapped(
-      map, tv2, tv2->getRootDomain()[0], tv3, tv3->getRootDomain()[0], true);
-
-  tv2->computeAt(tv4, -1);
-
-  const int x = 11;
-  const int y = 12;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({x}, options);
-  at::Tensor t1 = at::randn({y, x}, options);
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto t3 = t0;
-  auto t4 = t0.unsqueeze(0).expand({y, x}) + t1;
-
-  testValidate(&fusion, outputs, aten_inputs, {t3, t4}, __LINE__, __FILE__);
-}
-
-// Repro of issue #1950
-TEST_F(NVFuserTest, FusionRootMappingRepro1950_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-  auto tv0 = makeSymbolicTensor(3);
-  auto tv1 = makeSymbolicTensor(3);
-  auto tv2 = makeSymbolicTensor(3);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addInput(tv2);
-
-  auto tv3 = set(tv0);
-  auto tv4 = mul(tv1, tv3);
-  auto tv5 = mul(tv1, tv2);
-  auto tv6 = mul(tv5, tv3);
-  auto tv7 = sum(tv6, {2});
-  auto tv8 = broadcast(tv7, {false, false, true});
-  auto tv9 = mul(tv3, tv8);
-
-  // Issue #1950 was caused by a particular traversal ordering based
-  // on the output tensor ordering as below
-  fusion.addOutput(tv9);
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv4);
-
-  ComputeAtRootDomainMap root_map;
-  root_map.build();
-
-  checkIdMapped(root_map, tv4, tv4->axis(-1), tv9, tv9->axis(-1), false);
-}
-
-TEST_F(NVFuserTest, FusionDetectSelfMappedDomains_CUDA) {
-  if (at::cuda::getCurrentDeviceProperties()->major >= 8) {
-    GTEST_SKIP() << "Somehow it does not throw on sm_80+"
-                 << " See https://github.com/pytorch/pytorch/issues/86714";
-  }
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = broadcast(tv1, {true, false});
-  auto tv3 = broadcast(tv1, {false, true});
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  // computeAt should fail as there is no valid root mapping.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(tv1->computeAt(tv4, 1));
-}
-
-TEST_F(NVFuserTest, FusionScalarInputs_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-
-  Double* d0 = IrBuilder::create<Double>();
-  fusion.addInput(d0);
-  Double* d1 = IrBuilder::create<Double>();
-  fusion.addInput(d1);
-  Double* d2 = IrBuilder::create<Double>();
-  fusion.addInput(d2);
-  Double* d3 = IrBuilder::create<Double>();
-  fusion.addInput(d3);
-  Val* d4 = mul(d0, d1);
-  Val* d5 = sub(d2, d3);
-
-  TensorView* tv2 = sub(tv1, d4);
-  TensorView* tv3 = add(tv0, d5);
-  TensorView* tv4 = mul(tv3, tv2);
-
-  fusion.addOutput(tv4);
-
-  // Lets setup to actually run
-  while (tv4->nDims() > 1)
-    tv4->merge(0);
-  tv4->split(0, 128);
-  tv4->split(0, 4);
-
-  tv0->computeAt(tv4, 1);
-  tv1->computeAt(tv4, 1);
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  // d4 = d0 * d1
-  // d5 = d2 - d3
-  // t2 = t1 - d4
-  // t3 = t0 + d5
-  // t4 = t3 * t2
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  float fl0 = 0.1;
-  float fl1 = -0.2;
-  float fl2 = 0.3;
-  float fl3 = -0.4;
-  float fl4 = fl0 * fl1;
-  float fl5 = fl2 - fl3;
-
-  at::Tensor t0 = at::randn({129, 127}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  auto t2 = t1.sub(fl4);
-  auto t3 = t0.add(fl5);
-  auto aten_output = t3.mul(t2);
-
-  at::Tensor cg_output = at::empty_like(t0, options);
-
-  at::Scalar test(fl0);
-
-  std::vector<IValue> aten_inputs = {
-      t0,
-      t1,
-      at::Scalar(fl0),
-      at::Scalar(fl1),
-      at::Scalar(fl2),
-      at::Scalar(fl3)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  fe.runFusion(aten_inputs, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionLoopUnroll_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(3);
-  TensorView* tv1 = makeSymbolicTensor(3);
-
-  // Register your inputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  // Do math with it, it returns a `Val*` but can be static_casted back to
-  // TensorView
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = add(tv0, tv2);
-
-  // Register your outputs
-  fusion.addOutput(tv3);
-
-  int block_size = 16;
-
-  tv3->merge(0, 1);
-  tv3->merge(0, 1);
-
-  tv3->split(0, block_size);
-  tv3->split(0, 4);
-
-  // For all inputs, computeAt the output inline, temporaries should be squeezed
-  // between them
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  // Parallelize
-  tv2->axis(1)->parallelize(ParallelType::Unroll);
-  tv3->axis(1)->parallelize(ParallelType::Unroll);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input0 = at::randn({129, 13, 3}, options);
-  at::Tensor input1 = at::randn({129, 13, 3}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input0, input1});
-  auto outputs = fe.runFusion({input0, input1});
-
-  TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0))));
-}
-
-/*
- * Helper function for single op testing that generates a codegen operand
- */
-
-Val* gen_jit_operand(std::pair<ValType, DataType> desc) {
-  if (desc.first == ValType::TensorView) {
-    return makeSymbolicTensor(2, desc.second);
-  } else if (desc.first == ValType::Scalar) {
-    if (desc.second == DataType::Float) {
-      return IrBuilder::create<Double>();
-    } else if (desc.second == DataType::Double) {
-      return IrBuilder::create<Double>();
-    } else if (desc.second == DataType::ComplexFloat) {
-      return IrBuilder::create<ComplexDouble>();
-    } else if (desc.second == DataType::ComplexDouble) {
-      return IrBuilder::create<ComplexDouble>();
-    } else if (desc.second == DataType::Int) {
-      return IrBuilder::create<Int>();
-    } else {
-      TORCH_CHECK(false, "Not currently supported type: ", desc.first);
-    }
-  } else {
-    TORCH_CHECK(false, "Not currently supported type: ", desc.first);
-  }
-  return nullptr;
-}
-
-/*
- * Helper function for single op testing that generates an ATen operand
- */
-
-IValue gen_aten_operand(
-    std::pair<ValType, DataType> desc,
-    int blocks,
-    int threads,
-    bool rand) {
-  if (desc.first == ValType::TensorView) {
-    if (desc.second == DataType::Double || desc.second == DataType::Float ||
-        desc.second == DataType::ComplexDouble ||
-        desc.second == DataType::ComplexFloat ||
-        desc.second == DataType::Half || desc.second == DataType::BFloat16) {
-      auto options = at::TensorOptions()
-                         .dtype(data_type_to_aten(desc.second))
-                         .device(at::kCUDA, 0);
-      if (rand) {
-        return IValue(at::rand({blocks, threads}, options));
-      } else {
-        return IValue(at::empty({blocks, threads}, options));
-      }
-    } else if (desc.second == DataType::Int || desc.second == DataType::Int32) {
-      auto dtype = desc.second == DataType::Int32 ? at::kInt : at::kLong;
-      if (rand) {
-        auto options =
-            at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-        return IValue(at::randn({blocks, threads}, options).mul(5).to(dtype));
-      } else {
-        auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
-        return IValue(at::empty({blocks, threads}, options));
-      }
-    } else if (desc.second == DataType::Bool) {
-      if (rand) {
-        auto options =
-            at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-        return IValue(
-            at::rand({blocks, threads}, options).round().to(at::kBool));
-      } else {
-        auto options =
-            at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0);
-        return IValue(at::empty({blocks, threads}, options));
-      }
-    } else {
-      TORCH_CHECK(false, "Not currently supported type: ", desc.second)
-    }
-  } else if (desc.first == ValType::Scalar) {
-    // IValue scalars can only be double int64 or bool
-    if (desc.second == DataType::ComplexDouble ||
-        desc.second == DataType::ComplexFloat) {
-      return IValue(at::Scalar(c10::complex<double>(1.0, 0.0)));
-    } else if (
-        desc.second == DataType::Double || desc.second == DataType::Float ||
-        desc.second == DataType::Half || desc.second == DataType::BFloat16) {
-      return IValue(at::Scalar(1.0));
-    } else if (desc.second == DataType::Int) {
-      return IValue(at::Scalar(1));
-    } else {
-      TORCH_CHECK(false, "Not currently supported type: ", desc.first);
-    }
-  } else {
-    TORCH_CHECK(false, "Not currently supported type: ", desc.first);
-  }
-  return nullptr;
-}
-
-/*
- * Templatized Helper Function To generate single Op comparison between the
- * JIT codegen for Cuda and the ATen Library.
- */
-
-using OutputPair = std::pair<ValType, DataType>;
-template <
-    typename AtenFunc,
-    typename JitFunc,
-    typename InputTuple,
-    size_t... NumInputs>
-void test_op(
-    int blocks,
-    int threads,
-    std::string op_str,
-    AtenFunc af,
-    JitFunc jf,
-    OutputPair op,
-    InputTuple it,
-    std::index_sequence<NumInputs...>) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Generate Input JIT function Inputs and add them as Inputs to the Fusion
-  // Graph
-  std::array<Val*, sizeof...(NumInputs)> jit_inputs = {
-      gen_jit_operand(std::get<NumInputs>(it))...};
-  std::for_each(jit_inputs.begin(), jit_inputs.end(), [&fusion](Val* v) {
-    fusion.addInput(v);
-  });
-  TensorView* out =
-      static_cast<TensorView*>(jf(std::get<NumInputs>(jit_inputs)...));
-  fusion.addOutput(out);
-
-  std::for_each(jit_inputs.begin(), jit_inputs.end(), [out](Val* v) {
-    if (v->getValType() == ValType::TensorView)
-      static_cast<TensorView*>(v)->computeAt(out, -1);
-  });
-  out->axis(0)->parallelize(ParallelType::BIDx);
-  out->axis(-1)->parallelize(ParallelType::TIDx);
-
-  std::array<IValue, sizeof...(NumInputs)> aten_inputs = {gen_aten_operand(
-      std::get<NumInputs>(it), blocks, threads, /*rand*/ true)...};
-  const at::ArrayRef<IValue> aten_inputs_ivalues(aten_inputs);
-
-  at::Tensor cg_output =
-      gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor();
-  std::vector<at::Tensor> output_vect = {cg_output};
-  cudaDeviceSynchronize();
-  if (fusion.isStochastic())
-    at::manual_seed(0);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs_ivalues);
-  fe.runFusion(aten_inputs_ivalues, output_vect);
-  cudaDeviceSynchronize();
-
-  if (fusion.isStochastic())
-    at::manual_seed(0);
-  at::Tensor aten_output = af(aten_inputs);
-  cudaDeviceSynchronize(); // This sync shouldn't be necessary;
-
-  std::string op_msg = "Operation " + op_str;
-
-  testValidate(
-      &fusion,
-      {cg_output},
-      aten_inputs,
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      op_msg);
-}
-
-/*
- *  Templatized Helper Function that uses variadic templates to
- *  process a variable length Input Tuple of different Operand Type.
- */
-template <typename AtenFunc, typename JitFunc, typename InputTuple>
-void test_op(
-    int blocks,
-    int threads,
-    std::string op_str,
-    AtenFunc af,
-    JitFunc jf,
-    OutputPair op,
-    InputTuple it) {
-  static constexpr auto size = std::tuple_size<InputTuple>::value;
-  test_op(
-      blocks,
-      threads,
-      op_str,
-      af,
-      jf,
-      op,
-      it,
-      std::make_index_sequence<size>{});
-}
-
-TEST_F(NVFuserTest, FusionUnaryOps_CUDA) {
-  using OpTuple =
-      std::tuple<at::Tensor (*)(const at::Tensor&), UnaryOpType, std::string>;
-
-  // [Note: explicit tuple type for uniform initialization list]
-  // Tuple type must be explicitly specified for each uniform initialization
-  // list within the vector to make this code compatible with some old env
-  // which we still need to support. eg. gcc 5.4 + cuda 9.2.
-  std::vector<OpTuple> ops{
-      OpTuple{at::acos, UnaryOpType::Acos, "acos"},
-      OpTuple{at::asin, UnaryOpType::Asin, "asin"},
-      OpTuple{at::atan, UnaryOpType::Atan, "atan"},
-      // There does not appear to be an appropriate ATen function for atanh
-      // OpTuple{at::atanh,      UnaryOpType::Atanh,      "atanh"      },
-      OpTuple{at::cos, UnaryOpType::Cos, "cos"},
-      OpTuple{at::cosh, UnaryOpType::Cosh, "cosh"},
-      OpTuple{at::exp, UnaryOpType::Exp, "exp"},
-      // OpTuple{at::gelu, UnaryOpType::Gelu, "gelu"},
-      OpTuple{at::log, UnaryOpType::Log, "log"},
-      OpTuple{at::log10, UnaryOpType::Log10, "log10"},
-      OpTuple{at::neg, UnaryOpType::Neg, "neg"},
-      OpTuple{at::reciprocal, UnaryOpType::Reciprocal, "reciprocal"},
-      OpTuple{at::sigmoid, UnaryOpType::Sigmoid, "sigmoid"},
-      OpTuple{at::sin, UnaryOpType::Sin, "sin"},
-      OpTuple{at::sinh, UnaryOpType::Sinh, "sinh"},
-      OpTuple{at::sqrt, UnaryOpType::Sqrt, "sqrt"},
-      OpTuple{at::tan, UnaryOpType::Tan, "tan"},
-      OpTuple{at::tanh, UnaryOpType::Tanh, "tanh"},
-      OpTuple{at::isfinite, UnaryOpType::IsFinite, "isfinite"},
-      OpTuple{at::isinf, UnaryOpType::IsInf, "isinf"},
-      OpTuple{at::isnan, UnaryOpType::IsNan, "isnan"},
-      OpTuple{at::isreal, UnaryOpType::IsReal, "isreal"},
-  };
-
-  // The following ops has no complex support in eager mode
-  std::vector<OpTuple> ops_without_complex{
-      OpTuple{at::ceil, UnaryOpType::Ceil, "ceil"},
-      OpTuple{at::floor, UnaryOpType::Floor, "floor"},
-      OpTuple{at::frac, UnaryOpType::Frac, "frac"},
-      OpTuple{at::trunc, UnaryOpType::Trunc, "trunc"},
-      OpTuple{at::round, UnaryOpType::Round, "round"},
-      OpTuple{at::relu, UnaryOpType::Relu, "relu"},
-      OpTuple{at::expm1, UnaryOpType::Expm1, "expm1"},
-      OpTuple{at::log1p, UnaryOpType::Log1p, "log1p"},
-      OpTuple{at::lgamma, UnaryOpType::Lgamma, "lgamma"},
-      OpTuple{at::erf, UnaryOpType::Erf, "erf"},
-      OpTuple{at::erfc, UnaryOpType::Erfc, "erfc"},
-      OpTuple{at::isneginf, UnaryOpType::IsNegInf, "isneginf"},
-      OpTuple{at::isposinf, UnaryOpType::IsPosInf, "isposinf"},
-  };
-
-  // The following ops only supports complex
-  std::vector<OpTuple> ops_complex_only{
-      // real is supported via UnaryOpType::Set for non-complex types, and
-      // UnaryOpType::Real requires input to be complex
-      OpTuple{at::real, UnaryOpType::Real, "real"},
-      OpTuple{at::imag, UnaryOpType::Imag, "imag"},
-  };
-
-  // Complex support for the following op is not working in nvFuser yet
-  std::vector<OpTuple> ops_skip_complex{
-      // TODO: abs is actually supported in nvFuser, but it has bug!!!
-      // In eager mode, abs(complex_tensor) returns floating point tensor
-      // but in nvFuser, it wrongly returns complex tensor!
-      // We need to:
-      //  1. change our type promotion logic to make a special case for abs
-      //  2. why this bug is not detected here? we should bump up test coverage
-      OpTuple{at::abs, UnaryOpType::Abs, "abs"},
-      // TODO: the following two ops fails with compilation error like
-      // "undefined function rsqrt(complex)", we could implement them in
-      // helpers.cu, but I think it is better to check with Jiterator first,
-      // because Jiterator uses the same string for complex support.
-      OpTuple{at::rsqrt, UnaryOpType::Rsqrt, "rsqrt"},
-      OpTuple{at::log2, UnaryOpType::Log2, "log2"}};
-
-  std::vector<DataType> dtypes = {
-      DataType::Float,
-      DataType::Double,
-      DataType::ComplexFloat,
-      DataType::ComplexDouble};
-
-  for (auto dtype : dtypes) {
-    auto ops_to_test = ops;
-    if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) {
-      ops_to_test.insert(
-          ops_to_test.end(),
-          ops_without_complex.begin(),
-          ops_without_complex.end());
-      ops_to_test.insert(
-          ops_to_test.end(), ops_skip_complex.begin(), ops_skip_complex.end());
-    } else {
-      ops_to_test.insert(
-          ops_to_test.end(), ops_complex_only.begin(), ops_complex_only.end());
-    }
-    std::for_each(ops.begin(), ops.end(), [&](OpTuple& op) {
-      test_op(
-          /*blocks*/ 640,
-          /*threads*/ 64,
-          /*name*/ std::get<2>(op),
-          /*Aten Func   */
-          [&op](std::array<IValue, 1>& vals) {
-            return std::get<0>(op)(vals[0].toTensor());
-          },
-          /*JIT  Func   */
-          [&op](Val* in1) -> Val* { return unaryOp(std::get<1>(op), in1); },
-          /*Output      */ std::make_pair(ValType::TensorView, dtype),
-          /*Inputs Tuple*/
-          std::make_tuple(std::make_pair(ValType::TensorView, dtype)));
-    });
-  }
-
-  dtypes = {DataType::Int, DataType::Int32, DataType::Bool};
-  for (auto dtype : dtypes) {
-    test_op(
-        /*blocks*/ 128,
-        /*threads*/ 64,
-        /*name*/ "bitwise_not",
-        /*Aten Func   */
-        [](std::array<IValue, 1>& vals) {
-          return at::bitwise_not(vals[0].toTensor());
-        },
-        /*JIT  Func   */
-        [](Val* in1) -> Val* { return unaryOp(UnaryOpType::Not, in1); },
-        /*Output      */ std::make_pair(ValType::TensorView, dtype),
-        /*Inputs Tuple*/
-        std::make_tuple(std::make_pair(ValType::TensorView, dtype)));
-  }
-}
-
-TEST_F(NVFuserTest, FusionBinaryOps_CUDA) {
-  using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&);
-  using OpTuple = std::tuple<AtenFuncSig, BinaryOpType, std::string>;
-
-  std::vector<DataType> dtypes = {
-      DataType::Double,
-      DataType::Float,
-      DataType::ComplexFloat,
-      DataType::ComplexDouble};
-
-  // see [Note: explicit tuple type for uniform initialization list]
-  std::vector<OpTuple> equal_ops{
-      OpTuple{at::eq, BinaryOpType::Eq, "eq"},
-      OpTuple{at::ne, BinaryOpType::NE, "ne"}};
-
-  // Complex numbers are not ordered
-  std::vector<OpTuple> order_ops{
-      OpTuple{at::ge, BinaryOpType::GE, "ge"},
-      OpTuple{at::gt, BinaryOpType::GT, "gt"},
-      OpTuple{at::le, BinaryOpType::LE, "le"},
-      OpTuple{at::lt, BinaryOpType::LT, "lt"}};
-
-  // see [Note: explicit tuple type for uniform initialization list]
-  std::vector<OpTuple> math_ops{
-      OpTuple{at::div, BinaryOpType::Div, "div"},
-      OpTuple{at::mul, BinaryOpType::Mul, "mul"},
-      OpTuple{at::pow, BinaryOpType::Pow, "pow"}};
-
-  // The following ops has no complex support in eager mode
-  std::vector<OpTuple> math_ops_without_complex{
-      OpTuple{at::atan2, BinaryOpType::Atan2, "atan2"},
-      OpTuple{at::max, BinaryOpType::Max, "max"},
-      OpTuple{at::min, BinaryOpType::Min, "min"},
-      OpTuple{at::fmod, BinaryOpType::Fmod, "fmod"},
-      // NOTE: Remainder does not match the Aten impl exactly
-      // despite using an identical function.
-      OpTuple{at::remainder, BinaryOpType::Remainder, "remainder"}};
-
-  for (auto dtype : dtypes) {
-    auto logic_ops = equal_ops;
-    if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) {
-      logic_ops.insert(logic_ops.end(), order_ops.begin(), order_ops.end());
-    }
-    std::for_each(logic_ops.begin(), logic_ops.end(), [&](OpTuple& op) {
-      test_op(
-          /*blocks*/ 640,
-          /*threads*/ 64,
-          /*name*/ std::get<2>(op),
-          /*Aten Func   */
-          [&op](std::array<IValue, 2>& vals) {
-            return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor());
-          },
-          /*JIT  Func   */
-          [&op](Val* in1, Val* in2) -> Val* {
-            return binaryOp(std::get<1>(op), in1, in2);
-          },
-          /*Output      */ std::make_pair(ValType::TensorView, DataType::Bool),
-          /*Inputs Tuple*/
-          std::make_tuple(
-              std::make_pair(ValType::TensorView, dtype),
-              std::make_pair(ValType::TensorView, dtype)));
-    });
-
-    auto enabled_math_ops = math_ops;
-    if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) {
-      enabled_math_ops.insert(
-          enabled_math_ops.end(),
-          math_ops_without_complex.begin(),
-          math_ops_without_complex.end());
-    }
-    std::for_each(
-        enabled_math_ops.begin(), enabled_math_ops.end(), [&](OpTuple& op) {
-          test_op(
-              /*blocks*/ 640,
-              /*threads*/ 64,
-              /*name*/ std::get<2>(op),
-              /*Aten Func   */
-              [&op](std::array<IValue, 2>& vals) {
-                return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor());
-              },
-              /*JIT  Func   */
-              [&op](Val* in1, Val* in2) -> Val* {
-                return binaryOp(std::get<1>(op), in1, in2);
-              },
-              /*Output      */ std::make_pair(ValType::TensorView, dtype),
-              /*Inputs Tuple*/
-              std::make_tuple(
-                  std::make_pair(ValType::TensorView, dtype),
-                  std::make_pair(ValType::TensorView, dtype)));
-        });
-
-    test_op(
-        /*blocks*/ 640,
-        /*threads*/ 64,
-        /*name*/ "add_alpha",
-        /*Aten Func   */
-        [](std::array<IValue, 3>& vals) {
-          return at::add(
-              vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar());
-        },
-        /*JIT  Func   */ static_cast<Val* (*)(Val*, Val*, Val*)>(&add_alpha),
-        /*Output      */ std::make_pair(ValType::TensorView, dtype),
-        /*Inputs Tuple*/
-        std::make_tuple(
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::Scalar, dtype)));
-
-    test_op(
-        /*blocks*/ 640,
-        /*threads*/ 64,
-        /*name*/ "sub_alpha",
-        /*Aten Func   */
-        [](std::array<IValue, 3>& vals) {
-          return at::sub(
-              vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar());
-        },
-        /*JIT  Func   */ static_cast<Val* (*)(Val*, Val*, Val*)>(&sub_alpha),
-        /*Output      */ std::make_pair(ValType::TensorView, dtype),
-        /*Inputs Tuple*/
-        std::make_tuple(
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::Scalar, dtype)));
-  }
-}
-
-TEST_F(NVFuserTest, FusionTernaryOps_CUDA) {
-  std::vector<DataType> dtypes = {
-      DataType::Double,
-      DataType::Float,
-      DataType::ComplexFloat,
-      DataType::ComplexDouble};
-
-  for (auto dtype : dtypes) {
-    // clamp and threshold are not supported for complex on eager mode
-    if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) {
-      test_op(
-          /*blocks*/ 640,
-          /*threads*/ 64,
-          /*name*/ "clamp",
-          /*Aten Func   */
-          [](std::array<IValue, 1>& vals) {
-            return at::clamp(vals[0].toTensor(), 0.f, 1.f);
-          },
-          /*JIT  Func   */
-          [&](Val* in1) -> Val* {
-            if (dtype == DataType::Float) {
-              return clamp(
-                  in1,
-                  IrBuilder::create<Double>(0.f),
-                  IrBuilder::create<Double>(1.f));
-            } else {
-              return clamp(
-                  in1,
-                  IrBuilder::create<Double>(0.f),
-                  IrBuilder::create<Double>(1.f));
-            }
-          },
-          /*Output      */ std::make_pair(ValType::TensorView, dtype),
-          /*Inputs Tuple*/
-          std::make_tuple(std::make_pair(ValType::TensorView, dtype)));
-      test_op(
-          /*blocks*/ 640,
-          /*threads*/ 64,
-          /*name*/ "threshold",
-          /*Aten Func   */
-          [](std::array<IValue, 1>& vals) {
-            return at::threshold(vals[0].toTensor(), 0.f, 1.f);
-          },
-          /*JIT  Func   */
-          [&](Val* in1) -> Val* {
-            if (dtype == DataType::Float) {
-              return threshold(
-                  in1,
-                  IrBuilder::create<Double>(0.f),
-                  IrBuilder::create<Double>(1.f));
-            } else {
-              return threshold(
-                  in1,
-                  IrBuilder::create<Double>(0.f),
-                  IrBuilder::create<Double>(1.f));
-            }
-          },
-          /*Output      */ std::make_pair(ValType::TensorView, dtype),
-          /*Inputs Tuple*/
-          std::make_tuple(std::make_pair(ValType::TensorView, dtype)));
-    }
-    test_op(
-        /*blocks*/ 640,
-        /*threads*/ 64,
-        /*name*/ "where",
-        /*Aten Func   */
-        [](std::array<IValue, 3>& vals) {
-          return at::where(
-              vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor());
-        },
-        /*JIT  Func   */ static_cast<Val* (*)(Val*, Val*, Val*)>(&where),
-        /*Output      */ std::make_pair(ValType::TensorView, dtype),
-        /*Inputs Tuple*/
-        std::make_tuple(
-            std::make_pair(ValType::TensorView, DataType::Bool),
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::TensorView, dtype)));
-  }
-}
-
-TEST_F(NVFuserTest, FusionCompoundOps_CUDA) {
-  std::vector<DataType> dtypes = {
-      DataType::Double,
-      DataType::Float,
-      DataType::ComplexFloat,
-      DataType::ComplexDouble};
-
-  for (auto dtype : dtypes) {
-    test_op(
-        /*blocks*/ 640,
-        /*threads*/ 64,
-        /*name*/ "lerp",
-        /*Aten Func   */
-        [](std::array<IValue, 3>& vals) {
-          return at::lerp(
-              vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor());
-        },
-        /*JIT  Func   */ static_cast<Val* (*)(Val*, Val*, Val*)>(&lerp),
-        /*Output      */ std::make_pair(ValType::TensorView, dtype),
-        /*Inputs Tuple*/
-        std::make_tuple(
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::TensorView, dtype)));
-    test_op(
-        /*blocks*/ 640,
-        /*threads*/ 64,
-        /*name*/ "addcmul",
-        /*Aten Func   */
-        [](std::array<IValue, 4>& vals) {
-          return at::addcmul(
-              vals[0].toTensor(),
-              vals[1].toTensor(),
-              vals[2].toTensor(),
-              vals[3].toScalar());
-        },
-        /*JIT  Func   */
-        static_cast<Val* (*)(Val*, Val*, Val*, Val*)>(&addcmul),
-        /*Output      */ std::make_pair(ValType::TensorView, dtype),
-        /*Inputs Tuple*/
-        std::make_tuple(
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::TensorView, dtype),
-            std::make_pair(ValType::Scalar, dtype)));
-  }
-}
-
-TEST_F(NVFuserTest, FusionCastOps_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2, DataType::Half);
-
-  TensorView* intrm1 = castOp(DataType::Float, tv0);
-  TensorView* out = castOp(DataType::Half, intrm1);
-
-  fusion.addInput(tv0);
-  fusion.addOutput(out);
-  tv0->computeAt(out, -1);
-
-  out->axis(0)->parallelize(ParallelType::BIDx);
-  out->axis(-1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-
-  at::Tensor input1 = at::randn({1, 4}, options);
-  at::Tensor ref_output = at::empty_like(input1);
-
-  std::array<IValue, 1> inputs = {input1};
-  const at::ArrayRef<IValue> input_ivalues(inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, input_ivalues);
-  auto outputs = fe.runFusion(input_ivalues);
-
-  ref_output = at::_cast_Half(at::_cast_Double(input1));
-
-  TORCH_CHECK(
-      outputs[0].equal(ref_output),
-      "\nOp Type: -- ",
-      "cast FP16->FP32->FP16",
-      " -- had a mismatch.\n",
-      "\nABS MAX DIFF: ",
-      outputs[0].sub(ref_output).abs().max(),
-      "\n");
-}
-
-// Start off simple, block on the outer dim
-// block stride + thread all reduce + unrolling on inner dim
-TEST_F(NVFuserTest, FusionReduction1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion).size(),
-      "Could not detect reduction in fusion.");
-
-  tv1->split(1, 128);
-  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
-  tv1->split(1, 4);
-  // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({1});
-  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1]
-  // tv1[I0,        R1oi{4},  R1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}]
-
-  TensorView* tv3 = tv1->rFactor({1});
-  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1]
-  // tv3[I0,        R1oi{4}, Ir1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}]
-  // tv1[I0,                  R1i{128}] = tv3[I0,        R1oi{4}, Ir1i{128}]
-
-  // Incrementally, can print in between for debugging
-  tv0->computeAt(tv2, 1);
-  tv2->computeAt(tv3, 1);
-  tv3->computeAt(tv1, 1);
-
-  // Re do it all at once, because why not.
-  tv0->computeAt(tv1, 1);
-
-  tv2->axis(2)->parallelize(ParallelType::Unroll);
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  int numel_x = 65000;
-  int numel_y = 1025;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-  at::Tensor cg_output = at::empty({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReduction2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-
-  fusion.addOutput(tv1);
-
-  // switches to try some different scenarios. maybe we should iterate on all
-  // permutations.
-  bool bind_bidx = true;
-  bool bind_tidx = true;
-  bool bind_tidy = true;
-  bool bind_unroll = true;
-
-  int numel_x = 1025; // Cannot exceed block dim max size / tidy
-  int numel_y = 129;
-  int tidx = 16;
-  int tidy = 8;
-  int unroll_factor = 4;
-
-  tv1->split(1, tidx);
-  // tv1[I0, R1o, R1i{tidx}] = tv0[I0, I1]
-
-  tv1->split(1, unroll_factor);
-  // tv1[I0, R1oo, R1oi{unroll}, R1i{tidx}] = tv0[I0, I1]
-
-  tv1->split(0, tidy);
-
-  TensorView* tv2 = tv1->rFactor({-3});
-  // tv2[I0,             >R1oo<, Ir1oi{unroll}, Ir1i{tidx}]
-  // tv1[I0o, I0i{tidy},          R1oi{unroll},  R1i{tidx}]
-
-  TensorView* tv3 = tv1->rFactor({-2});
-  // tv2[I0,             >R1oo<, Ir1oi{unroll}, Ir1i{tidx}]
-  // tv3[I0,                      R1oi{unroll}, Ir1i{tidx}]
-  // tv1[I0o, I0i{tidy},                         R1i{tidx}]
-
-  tv0->computeAt(tv1, -2);
-
-  if (bind_unroll)
-    tv2->axis(-2)->parallelize(ParallelType::Unroll);
-  if (bind_bidx)
-    tv1->axis(0)->parallelize(ParallelType::BIDx);
-  if (bind_tidy)
-    tv1->axis(1)->parallelize(ParallelType::TIDy);
-
-  if (bind_tidx) {
-    tv2->axis(-1)->parallelize(ParallelType::TIDx);
-    tv3->axis(-1)->parallelize(ParallelType::TIDx);
-    tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReduction3_CUDA) {
-  // What if Z participates in the reduction with X?
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-
-  fusion.addOutput(tv1);
-
-  int numel_x = 1025; // Cannot exceed block dim max size / tidy
-  int numel_y = 129;
-  int tidx = 16;
-  int tidz = 8;
-
-  tv1->split(1, tidz);
-  // tv1[I0, R1o, R1i{tidz}] = tv0[I0, I1]
-
-  tv1->split(1, tidx);
-  // tv1[I0, R1oo, R1oi{tidx}, R1i{tidz}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({-3});
-  // tv2[I0,  >R1oo<, Ir1oi{tidx}, Ir1i{tidz}]
-  // tv1[I0o,          R1oi{tidx},  R1i{tidz}]
-
-  tv0->computeAt(tv1, -3);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(-2)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDz);
-
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDz);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
-  at::Tensor cg_output = at::empty({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, {cg_output});
-
-  auto aten_output = aten_input.to(at::kDouble).sum({1});
-
-  testValidate(
-      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReduction4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  TensorView* tv2 = add(tv0, tv1);
-  // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1]
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv3 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv2);
-  // tv3[I0, R1] = tv2[I0, I1]
-
-  TensorView* tv4 = makeSymbolicTensor(1);
-  fusion.addInput(tv4);
-
-  // tv5[I0] = tv3[I0, R1] * tv4[I0]
-  TensorView* tv5 = mul(tv3, tv4);
-  fusion.addOutput(tv5);
-
-  int tidx = 16;
-
-  // RFactor the reduction
-  tv3->split(1, tidx);
-  // tv3[I0, R1o, R1i{tidx}] = tv2[I0, I1]
-
-  TensorView* tv6 = tv3->rFactor({-2});
-  // tv6[I0, R1o, iR1i{tidx}] = tv2[I0, I1]
-  // tv3[I0,       R1i{tidx}] = tv3[I0, I1]
-  tv2->computeAt(tv6, 2);
-
-  // Compute at inline with tv5 (only 1D)
-  tv6->computeAt(tv3, 1);
-  tv3->computeAt(tv5, 1);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDx);
-
-  // Intermediate tensors only need this, but doesn't hurt to do on inputs
-  // tv0, 1, 4
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv6->axis(-1)->parallelize(ParallelType::TIDx);
-
-  int numel_x = 1025;
-  int numel_y = 129;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
-  at::Tensor t1 = at::randn({numel_x, numel_y}, options);
-  at::Tensor t4 = at::randn({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1, t4});
-  auto cg_outputs = fe.runFusion({t0, t1, t4});
-
-  auto t2 = t0.add(t1);
-  auto t3 = t2.to(at::kDouble).sum({1});
-  auto aten_output = t3.mul(t4);
-
-  testValidate(
-      &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReduction5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(3);
-
-  fusion.addInput(tv0);
-
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-
-  fusion.addOutput(tv1);
-
-  int bidy = 2;
-  int tidy = 4;
-  int tidx = 5;
-
-  int dim1 = 11;
-
-  tv1->split(-2, tidy);
-
-  TensorView* tv2 = tv1->rFactor({-3});
-
-  tv0->computeAt(tv1, 1);
-  tv1->axis(0)->parallelize(ParallelType::BIDy);
-
-  for (auto* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      val->as<TensorView>()->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  tv2->axis(-2)->parallelize(ParallelType::TIDy);
-  tv1->axis(-2)->parallelize(ParallelType::TIDy);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({bidy, dim1, tidx}, options);
-
-  at::Tensor cg_output = at::empty({bidy, tidx}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReduction6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int bdimx = 64;
-  const int bdimy = 8;
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(3);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1, R2] = tv0[I0, I1, I2]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1, 2}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion).size(),
-      "Could not detect reduction in fusion.");
-
-  tv1->split(2, bdimx);
-  // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2]
-  tv1->split(1, bdimy);
-  // tv1[I0, R1o, R1i{8}, R2o, R2i{128}] = tv0[I0, I1, I2]
-
-  TensorView* tv2 = tv1->rFactor({3});
-  // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2]
-  // tv1[I0, R1o, R1i{8},      R2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}]
-
-  TensorView* tv3 = tv1->rFactor({1});
-  // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2]
-  // tv3[I0, R1o, I1i{8},      I2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}]
-  // tv1[I0,      R1i{8},      R2i{128}] = tv3[I0, R1o, I1i{8},      I2i{128}]
-
-  tv3->computeAt(tv1, 1);
-  tv2->computeAt(tv3, 2);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv1->axis(-2)->parallelize(ParallelType::TIDy);
-  tv3->axis(-2)->parallelize(ParallelType::TIDy);
-  tv2->axis(-3)->parallelize(ParallelType::TIDy);
-
-  int numel_x = 650;
-  int numel_y = 1000;
-  int numel_z = 4;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto aten_output = input.to(at::kDouble).sum({1, 2});
-  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMultiGridReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  TensorView* tv1 = max(tv0, {0});
-  TensorView* tv2 = sum(tv0, {0});
-
-  fusion.addOutput(tv1);
-  fusion.addOutput(tv2);
-
-  int numel_x = 4;
-  int numel_y = 2;
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  std::vector<at::Tensor> aten_outputs = {
-      std::get<0>(input.to(at::kDouble).max(0)), input.to(at::kDouble).sum(0)};
-  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMultiGridReduction2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = sum(tv1, {0});
-  fusion.addOutput(tv2);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::BIDy);
-  tv2->axis(0)->parallelize(ParallelType::BIDy);
-
-  FusionExecutor fe;
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
-}
-
-TEST_F(NVFuserTest, FusionReductionTFT_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-
-  fusion.addOutput(tv1);
-
-  int numel_x = 1025;
-  int numel_y = 129;
-  int tidx = 16;
-  int tidy = 8;
-  int tidz = 8;
-
-  tv1->split(1, tidx);
-  // tv1[I0, R1o, R1i{tidx}]
-
-  tv1->split(1, tidz);
-  // tv1[I0, R1oo, R1Oi{tidz}, R1R1i{tidx}]
-
-  tv1->split(0, tidy);
-  // tv1[I0o, I0i, R1oo, R1Oi{tidz}, R1R1i{tidx}]
-
-  TensorView* tv2 = tv1->rFactor({2});
-  // tv2[I0o, I0i, R1oo, I1Oi{tidz}, I11i{tidx}]
-  // tv1[I0o, I0i,       R1Oi{tidz}, R1R1i{tidx}]
-
-  tv2->computeAt(tv1, 2);
-
-  tv1->axis(1)->parallelize(ParallelType::TIDy);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv1->axis(-2)->parallelize(ParallelType::TIDz);
-  tv2->axis(-2)->parallelize(ParallelType::TIDz);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-  at::Tensor cg_output = at::empty({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReductionOuterSplit_CUDA) {
-  // based off FusionReduction4
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  TensorView* tv2 = add(tv0, tv1);
-  // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1]
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv3 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv2);
-  // tv3[I0, R1] = tv2[I0, I1]
-
-  TensorView* tv4 = makeSymbolicTensor(1);
-  fusion.addInput(tv4);
-
-  // tv5[I0] = tv3[I0, R1] * tv4[I0]
-  TensorView* tv5 = mul(tv3, tv4);
-  fusion.addOutput(tv5);
-
-  // RFactor the reduction
-  tv3->split(1, 16, false);
-  // tv3[I0, R1o{16}, R1i{tidx}] = tv2[I0, I1]
-
-  TensorView* tv6 = tv3->rFactor({-2});
-  // tv6[I0, R1o{16}, iR1i{tidx}] = tv2[I0, I1]
-  // tv3[I0,           R1i{tidx}] = tv3[I0, I1]
-  tv2->computeAt(tv6, 2);
-
-  // Compute at inline with tv5 (only 1D)
-  tv6->computeAt(tv3, 1);
-  tv3->computeAt(tv5, 1);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDx);
-
-  // Intermediate tensors only need this, but doesn't hurt to do on inputs
-  // tv0, 1, 4
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv6->axis(-1)->parallelize(ParallelType::TIDx);
-
-  int numel_x = 1025;
-  int numel_y = 129;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
-  at::Tensor t1 = at::randn({numel_x, numel_y}, options);
-  at::Tensor t4 = at::randn({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1, t4});
-  auto cg_outputs = fe.runFusion({t0, t1, t4});
-
-  auto t2 = t0.add(t1);
-  auto t3 = t2.to(at::kDouble).sum({1});
-  auto aten_output = t3.mul(t4);
-
-  testValidate(
-      &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBranches_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  TensorView* tv2 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addInput(tv2);
-
-  auto tv3 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv4 = add(tv3, tv1);
-  auto tv5 = add(tv3, tv2);
-  auto tv6 = add(tv4, tv5);
-
-  fusion.addOutput(tv6);
-
-  constexpr int x = 63, y = 33;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({x, y}, options);
-  at::Tensor t1 = at::randn({x, y}, options);
-  at::Tensor t2 = at::randn({x, y}, options);
-
-  FusionExecutor fe;
-  tv6->merge(0);
-  tv6->split(0, 128);
-  tv6->split(0, 4);
-
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv0->computeAt(tv6, 1);
-  tv1->computeAt(tv6, 1);
-  tv2->computeAt(tv6, 1);
-
-  tv3->axis(-2)->parallelize(ParallelType::Unroll);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->axis(-2)->parallelize(ParallelType::Unroll);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv5->axis(-2)->parallelize(ParallelType::Unroll);
-  tv5->axis(-1)->parallelize(ParallelType::TIDx);
-  tv6->axis(-1)->parallelize(ParallelType::TIDx);
-
-  std::vector<IValue> aten_inputs = {t0, t1, t2};
-
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto t3 = t0.add(1.0);
-  auto t4 = t3.add(t1);
-  auto t5 = t3.add(t2);
-  auto aten_output = t4.add(t5);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSimpleBCast1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.5));
-
-  TensorView* tv2 = makeSymbolicTensor(2);
-  fusion.addInput(tv2);
-  TensorView* tv3 = makeSymbolicTensor(2);
-  fusion.addInput(tv3);
-  TensorView* tv4 = sub(tv2, tv3);
-
-  TensorView* tv5 = broadcast(tv1, {false, false, true});
-  TensorView* tv6 = broadcast(tv4, {true, false, false});
-
-  TensorView* tv7 = add(tv5, tv6);
-  fusion.addOutput(tv7);
-
-  tv7->split(-1, 4);
-  tv7->split(0, 8);
-
-  tv0->computeAt(tv7, -1);
-  tv2->computeAt(tv7, -1);
-
-  tv7->axis(0)->parallelize(ParallelType::BIDx);
-  tv7->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int x = 63, y = 33, z = 15;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({x, y}, options);
-  at::Tensor t1 = t0.add(1.5);
-
-  at::Tensor t2 = at::randn({y, z}, options);
-  at::Tensor t3 = at::randn({y, z}, options);
-
-  at::Tensor t4 = t2.sub(t3);
-  at::Tensor t5 = t1.unsqueeze(-1).expand({x, y, z});
-
-  at::Tensor t6 = t4.expand({x, y, z});
-
-  at::Tensor aten_output = t5.add(t6);
-
-  std::vector<IValue> aten_inputs = {t0, t2, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSimpleBCast2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv0, tv1);
-
-  TensorView* tv3 = broadcast(tv2, {false, false, true});
-
-  TensorView* tv4 = makeSymbolicTensor(2);
-  fusion.addInput(tv4);
-
-  TensorView* tv5 = sub(tv4, IrBuilder::create<Double>(0.1));
-
-  TensorView* tv6 = broadcast(tv5, {true, false, false});
-
-  TensorView* tv7 = add(tv3, tv6);
-
-  fusion.addOutput(tv7);
-
-  tv7->merge(0, 1);
-
-  tv0->computeAt(tv7, -1);
-  tv4->computeAt(tv7, -1);
-
-  tv7->axis(0)->parallelize(ParallelType::BIDx);
-  tv7->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int x = 63, y = 33, z = 15;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({x, y}, options);
-  at::Tensor t1 = at::randn({x, y}, options);
-  at::Tensor t2 = t0.add(t1);
-  at::Tensor t3 = t2.unsqueeze(-1).expand({x, y, z});
-
-  at::Tensor t4 = at::randn({y, z}, options);
-  at::Tensor t5 = t4.sub(0.1);
-  at::Tensor t6 = t5.expand({x, y, z});
-  at::Tensor aten_output = t3.add(t6);
-
-  at::Tensor cg_output = at::empty({x, y, z}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1, t4};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  fe.runFusion(aten_inputs, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSimpleBCast3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up input tensor views
-  // tv0[I1, B{1}]
-  TensorView* tv0 = makeConcreteTensor({-1, 1});
-  fusion.addInput(tv0);
-
-  // tv1[I0, I1, I2]
-  TensorView* tv2 = makeSymbolicTensor(3);
-  fusion.addInput(tv2);
-
-  TensorView* tv3 = add(tv0, tv2);
-
-  fusion.addOutput(tv3);
-
-  tv3->merge(0);
-  tv3->merge(0);
-
-  tv0->computeAt(tv3, -1);
-  tv2->computeAt(tv3, -1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  constexpr int x = 2, y = 3, z = 4;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({y, 1}, options);
-  at::Tensor t2 = at::randn({x, y, z}, options);
-  auto aten_output = t0.add(t2);
-
-  std::vector<IValue> aten_inputs = {t0, t2};
-  at::Tensor cg_output = at::empty({x, y, z}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  fe.runFusion(aten_inputs, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSimpleBCast4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeConcreteTensor({1, -1});
-
-  TensorView* tv1 = makeSymbolicTensor(3);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv3 = add(tv0, tv1);
-
-  tv3->merge(0);
-  tv3->merge(0);
-  tv3->split(0, 128);
-  tv3->split(0, 4);
-
-  fusion.addOutput(tv3);
-
-  tv0->computeAt(tv3, -1);
-  tv1->computeAt(tv3, -1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-2)->parallelize(ParallelType::Unroll);
-
-  constexpr int x = 63, y = 33, z = 15;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({1, z}, options);
-  at::Tensor t1 = at::randn({x, y, z}, options);
-
-  auto aten_output = t0.add(t1);
-
-  at::Tensor cg_output = at::empty({x, y, z}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  fe.runFusion(aten_inputs, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSimpleBCast5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  constexpr int m = 2, k = 3, n = 4;
-  auto tv0 = makeConcreteTensor({m, k});
-  auto tv1 = makeConcreteTensor({k, n});
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = broadcast(tv0, {false, false, true});
-  TensorView* tv3 = broadcast(tv1, {true, false, false});
-
-  TensorView* tv4 = add(tv2, tv3);
-
-  fusion.addOutput(tv4);
-
-  tv4->merge(0);
-  tv4->merge(0);
-
-  tv0->computeAt(tv4, -1);
-  tv1->computeAt(tv4, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({m, k}, options);
-  at::Tensor t1 = at::randn({k, n}, options);
-
-  auto t2 = t0.unsqueeze(-1).expand({m, k, n});
-  auto t3 = t1.expand({m, k, n});
-  auto aten_output = t2.add(t3);
-
-  at::Tensor cg_output = at::empty({m, k, n}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  fe.runFusion(aten_inputs, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionComplexBCast1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int x = 2, y = 3, z = 4;
-
-  auto tv0 = makeConcreteTensor({y});
-  auto tv1 = div(tv0, IrBuilder::create<Double>(2.0));
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = makeConcreteTensor({y, z});
-  auto tv4 = mul(tv2, tv3);
-  auto tv5 = broadcast(tv4, {true, false, false});
-  auto tv6 = makeConcreteTensor({x, y, z});
-  auto tv7 = add(tv5, tv6);
-
-  // tv0[    i1    ] = input
-  // tv1[    i1    ] = tv0/2.0
-  // tv2[    i1, b2] = bcast(tv1)
-  // tv3[    i1, i2] = input
-  // tv4[    i1, i2] = tv2 * tv3
-  // tv5[b0, i1, i2] = bcast(tv4)
-  // tv6[i0, i1, i2] = input
-  // tv7[i0, i1, i2] = tv5 + tv6
-
-  // tv4 = bcast(tv1) * tv3
-  // tv7 = bcast(tv4) + tv6
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv3);
-  fusion.addInput(tv6);
-
-  fusion.addOutput(tv7);
-
-  tv7->merge(0);
-  tv7->merge(0);
-  tv0->computeAt(tv7, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({y}, options);
-  at::Tensor t3 = at::randn({y, z}, options);
-  at::Tensor t6 = at::randn({x, y, z}, options);
-
-  auto t4 = t0.div(2.0).unsqueeze(-1).expand({y, z}) * t3;
-  auto aten_output = t4.unsqueeze(0).expand({x, y, z}) + t6;
-
-  std::vector<IValue> aten_inputs = {t0, t3, t6};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionComplexBCast2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int x = 2, y = 3, z = 4;
-
-  auto tv0 = makeConcreteTensor({y, z});
-  auto tv1 = div(tv0, IrBuilder::create<Double>(2.0));
-  auto tv2 = sum(tv1, {1});
-  auto tv3 = broadcast(tv2, {true, false});
-  auto tv4 = makeConcreteTensor({x, y});
-  auto tv5 = add(tv3, tv4);
-
-  // tv0[    i1, i2] = input
-  // tv1[    i1, i2] = tv0/2.0
-  // tv2[    i1    ] = sum(tv1, 1)
-  // tv3[b0, i1    ] = bcast(tv2)
-  // tv4[i0, i1    ] = input
-  // tv5[i0, i1    ] = tv3 + tv4
-
-  // tv2 = sum(tv0/2.0, 1)
-  // tv5 = bcast(tv2) + tv4
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv4);
-
-  fusion.addOutput(tv5);
-
-  tv5->merge(0);
-  tv0->computeAt(tv5, -1);
-  tv1->computeAt(tv2, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({y, z}, options);
-  at::Tensor t4 = at::randn({x, y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t4});
-  auto cg_outputs = fe.runFusion({t0, t4});
-
-  auto t1 = t0.div(2.0);
-  auto t2 = t1.to(at::kDouble).sum(1);
-  auto t3 = t2.unsqueeze(0).expand({x, y});
-  auto aten_output = t3.add(t4);
-
-  testValidate(
-      &fusion, {cg_outputs}, {t0, t4}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int w = 3, x = 4, y = 7, z = 8;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  auto tv0 = makeSymbolicTensor(3);
-  auto tv1 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv3 = broadcast(tv2, {true, false, false, false});
-  auto tv4 = add(tv3, tv1);
-
-  fusion.addOutput(tv4);
-
-  tv4->merge(0);
-  tv4->merge(0);
-  tv4->merge(0);
-
-  tv4->split(0, 128);
-  tv4->split(0, 4);
-
-  tv2->computeAt(tv4, 1);
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(1)->parallelize(ParallelType::Unroll);
-  tv4->axis(2)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(1)->parallelize(ParallelType::Unroll);
-  tv3->axis(2)->parallelize(ParallelType::TIDx);
-
-  tv2->axis(1)->parallelize(ParallelType::Unroll);
-  tv2->axis(2)->parallelize(ParallelType::TIDx);
-
-  FusionExecutor fe;
-
-  at::Tensor t0 = at::randn({x, y, z}, options);
-  at::Tensor t1 = at::randn({w, x, y, z}, options);
-
-  auto t3 = t0.add(1.0);
-  auto aten_output = t3.add(t1);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int w = 3, x = 4, y = 7, z = 8;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  auto tv0 = makeSymbolicTensor(3);
-  auto tv1 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv3 = broadcast(tv2, {true, false, false, false});
-  auto tv4 = add(tv3, tv1);
-
-  fusion.addOutput(tv4);
-
-  tv4->merge(-2);
-  tv4->merge(-2);
-  tv4->merge(-2);
-
-  tv4->split(0, 128);
-  tv4->split(0, 4);
-
-  tv2->computeAt(tv4, 1);
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(1)->parallelize(ParallelType::Unroll);
-  tv4->axis(2)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(1)->parallelize(ParallelType::Unroll);
-  tv3->axis(2)->parallelize(ParallelType::TIDx);
-
-  tv2->axis(1)->parallelize(ParallelType::Unroll);
-  tv2->axis(2)->parallelize(ParallelType::TIDx);
-
-  FusionExecutor fe;
-
-  at::Tensor t0 = at::randn({x, y, z}, options);
-  at::Tensor t1 = at::randn({w, x, y, z}, options);
-
-  auto t3 = t0.add(1.0);
-  auto aten_output = t3.add(t1);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int w = 3, x = 4, y = 7, z = 8;
-
-  auto tv0 = makeSymbolicTensor(3);
-  auto tv1 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv3 = add(tv2, tv1);
-  fusion.addOutput(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({x, y, z}, options);
-  at::Tensor t1 = at::randn({w, x, y, z}, options);
-
-  auto t2 = t0.add(1.0);
-  auto aten_output = t2.add(t1);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  auto lparams = schedulePointwise(&fusion, aten_inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs, lparams);
-  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeConcreteTensor({4, 8});
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeConcreteTensor({4, 4, 8});
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv3 = broadcast(tv2, {true, false, false});
-  TensorView* tv4 = add(tv3, tv1);
-  fusion.addOutput(tv4);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({4, 8}, options);
-  at::Tensor t1 = at::randn({4, 4, 8}, options);
-
-  auto t2 = t0.add(1.0);
-  auto aten_output = t2.add(t1);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(3);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv3 = broadcast(tv2, {true, false, true});
-  TensorView* tv4 = add(tv3, tv1);
-  fusion.addOutput(tv4);
-
-  tv3->merge(0)->merge(0)->split(0, 2)->split(0, 3);
-  tv4->merge(0)->merge(0)->split(0, 2)->split(0, 3);
-
-  tv0->computeAt(tv4, 1);
-  tv1->computeAt(tv4, 1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({7}, options);
-  at::Tensor t1 = at::randn({5, 7, 11}, options);
-
-  auto t2 = t0.add(1.0);
-  auto aten_output = t2.unsqueeze(-1).add(t1);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> tensor0_shape{7, 4, 7};
-  std::vector<int64_t> tensor1_shape{4, 7};
-
-  TensorView* tv0 = makeSymbolicTensor(tensor0_shape.size());
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(tensor1_shape.size());
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv0, tv1);
-  TensorView* tv3 = sum(tv2, {0, 1});
-  fusion.addOutput(tv3);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input0 = at::randn(tensor0_shape, options);
-  at::Tensor input1 = at::randn(tensor1_shape, options);
-
-  std::vector<int64_t> reduction_axes{0, 1};
-  auto reduction_params = getReductionHeuristics(&fusion, {input0, input1});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fusion, *reduction_params);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input0, input1}, reduction_params->lparams);
-  auto cg_outputs = fe.runFusion({input0, input1}, reduction_params->lparams);
-
-  auto aten_output = input0.add(input1).to(at::kDouble).sum(reduction_axes);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {input0, input1},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      reduction_params->lparams);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing7_CUDA) {
-  // Might be able to use this one without 6 as the heuristics in 6 may change
-  // and this test is to cover the same issue.
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {false, true});
-
-  auto tv2 = makeSymbolicTensor(2);
-  fusion.addInput(tv2);
-
-  auto tv3 = add(tv1, tv2);
-  auto tv4 = sum(tv3, {0, 1});
-  fusion.addOutput(tv4);
-
-  tv4->merge(0, 1);
-  tv4->split(0, 128);
-  tv4->split(0, 4);
-
-  auto tv5 = tv4->rFactor({0, 1});
-
-  tv5->computeAt(tv4, -1);
-  tv0->computeAt(tv5, -1);
-
-  tv4->axis(0)->parallelize(ParallelType::TIDx);
-
-  const int numel_x = 100;
-  const int numel_y = 200;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto at_t0 = at::randn({numel_x}, options);
-  auto at_t1 = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {at_t0, at_t1});
-  auto cg_outputs = fe.runFusion({at_t0, at_t1});
-
-  auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1)
-                         .to(at::kDouble)
-                         .sum();
-
-  testValidate(
-      &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing8_CUDA) {
-  // Same as 7 but with outer splits instead of inner
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {false, true});
-
-  auto tv2 = makeSymbolicTensor(2);
-  fusion.addInput(tv2);
-
-  auto tv3 = add(tv1, tv2);
-  auto tv4 = sum(tv3, {0, 1});
-  fusion.addOutput(tv4);
-
-  tv4->merge(0, 1);
-  tv4->split(0, 128, false);
-  tv4->split(0, 4, false);
-
-  auto tv5 = tv4->rFactor({0, 1});
-
-  tv5->computeAt(tv4, -1);
-  tv0->computeAt(tv5, -1);
-
-  tv4->axis(0)->parallelize(ParallelType::TIDx);
-
-  const int numel_x = 100;
-  const int numel_y = 200;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto at_t0 = at::randn({numel_x}, options);
-  auto at_t1 = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {at_t0, at_t1});
-  auto cg_outputs = fe.runFusion({at_t0, at_t1});
-
-  auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1)
-                         .to(at::kDouble)
-                         .sum();
-
-  testValidate(
-      &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing9_CUDA) {
-  // Same as 7 but with outer splits instead of inner
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {false, true});
-
-  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
-  fusion.addOutput(tv2);
-
-  auto tv3 = makeSymbolicTensor(3);
-  fusion.addInput(tv3);
-
-  auto tv4 = add(tv3, tv2);
-  fusion.addOutput(tv4);
-
-  const int numel_x = 200;
-  const int numel_y = 300;
-  const int numel_z = 400;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto at_t0 = at::randn({numel_y}, options);
-  auto at_t3 = at::randn({numel_x, numel_y, numel_z}, options);
-  std::vector<IValue> aten_inputs = {at_t0, at_t3};
-
-  auto lparams = schedulePointwise(&fusion, aten_inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs, lparams);
-  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
-
-  auto at_t1 = at_t0.unsqueeze(-1);
-  auto at_t2 = at_t1.mul(2.0);
-
-  auto at_t4 = at_t3.add(at_t2);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {at_t2, at_t4}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing10_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeContigTensor(2);
-  TensorView* tv1 = makeContigTensor(2);
-
-  // Register your inputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  // Do math with it, it returns a `Val*` but can be static_casted back to
-  // TensorView
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = add(tv0, tv2);
-
-  // Register your outputs
-  fusion.addOutput(tv3);
-
-  auto tv0_cache = tv0->cacheAfter();
-  auto tv1_cache = tv1->cacheAfter();
-
-  std::vector<TensorView*> tvs = {tv0_cache, tv1_cache, tv2, tv3};
-
-  for (auto tv : tvs) {
-    tv->split(1, 2, false);
-    tv->split(1, 1);
-    tv->split(-1, 4);
-    // [I0, 2, 1, I1/2/4, 4]
-    tv->reorder({{1, 2}, {2, 3}, {3, 1}});
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::TIDx);
-  }
-
-  // For all inputs, computeAt the output inline, temporaries should be squeezed
-  // between them
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  tv0_cache->axis(-1)->parallelize(ParallelType::Vectorize);
-  tv1_cache->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input1 = at::randn({64, 128}, options);
-  at::Tensor input2 = at::rand_like(input1);
-  at::Tensor output = at::empty_like(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1, input2});
-  fe.runFusion({input1, input2}, {output});
-
-  at::Tensor tv2_ref = input2 + 2.0;
-  at::Tensor output_ref = input1 + tv2_ref;
-
-  TORCH_CHECK(output_ref.equal(output));
-}
-
-TEST_F(NVFuserTest, FusionAdvancedIndexing11_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int w = 3, x = 4, y = 7, z = 8;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  auto tv0 = makeSymbolicTensor(4);
-  auto tv1 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
-  auto tv3 = broadcast(tv2, {true, false, true, true});
-  auto tv4 = add(tv3, tv0);
-
-  fusion.addOutput(tv4);
-
-  tv4->merge(0);
-  tv4->merge(1);
-
-  tv4->split(1, 32);
-  tv4->split(0, 1);
-
-  tv4->reorder({{2, 1}});
-
-  tv2->computeAt(tv4, 3);
-
-  tv2->setMemoryType(MemoryType::Global);
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(1)->parallelize(ParallelType::BIDy);
-  tv4->axis(2)->parallelize(ParallelType::Unswitch);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  FusionExecutor fe;
-
-  at::Tensor t0 = at::randn({w, x, y, z}, options);
-  at::Tensor t1 = at::randn({x}, options);
-
-  auto t3 = t1.add(1.0).unsqueeze(-1).unsqueeze(-1);
-  auto aten_output = t3.add(t0);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-// Intended to stress the lowering of our code generator
-TEST_F(NVFuserTest, FusionAdvancedLowering1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({9, 5});
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
-  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
-  TensorView* tv4 = sum(tv3, {1});
-
-  fusion.addOutput(tv2);
-  fusion.addOutput(tv4);
-
-  tv4->split(1, 4);
-  auto tv5 = tv4->rFactor({2});
-
-  tv1->computeAt(tv5, 2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(1);
-  at::Tensor aten_input = at::randn({9, 5}, options);
-
-  auto t1 = aten_input.add(1.0);
-  auto t2 = t1.add(2.0);
-  auto t3 = t1.add(3.0);
-  auto t4 = t3.sum(1);
-
-  std::vector<at::Tensor> aten_outputs = {t2, t4};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedLowering2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Progressively broadcast tensors
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  TensorView* tv2 = makeSymbolicTensor(3);
-  fusion.addInput(tv2);
-
-  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv4 = broadcast(tv3, {false, true});
-  TensorView* tv5 = add(tv4, tv1);
-  TensorView* tv6 = add(tv5, tv2);
-
-  fusion.addOutput(tv6);
-
-  // Split inner dimension
-  tv6->split(1, 4);
-  // Merge middle dims with outer dimensions
-  tv6->merge(2);
-  tv6->merge(0);
-
-  // tv6[I0*I1o, I1i*I2]
-
-  // Compute everything inline
-  tv0->computeAt(tv6, -1);
-
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-  tv6->axis(1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  int x = 13, y = 9, z = 5;
-  at::Tensor t0 = at::randn({y}, options);
-  at::Tensor t1 = at::randn({y, z}, options);
-  at::Tensor t2 = at::randn({x, y, z}, options);
-
-  auto t3 = t0.add(1.0);
-  auto t4 = t3.unsqueeze(-1);
-  auto t5 = t4.add(t1);
-  auto t6 = t5.add(t2);
-
-  std::vector<IValue> aten_inputs = {t0, t1, t2};
-  std::vector<at::Tensor> aten_outputs = {t6};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-// TODO: Complete test
-TEST_F(NVFuserTest, FusionAdvancedLowering3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({1, -1});
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  // [b0, i1]
-  auto tv2 = add(tv0, IrBuilder::create<Double>(2.0));
-
-  // [i0, i1]
-  auto tv3 = add(tv1, IrBuilder::create<Double>(3.0));
-
-  // [b0, i1]
-  auto tv4 = add(tv2, IrBuilder::create<Double>(4.0));
-
-  // [io, i1]
-  auto tv5 = add(tv2, tv3);
-
-  fusion.addOutput(tv4);
-  fusion.addOutput(tv5);
-
-  tv0->computeAt(tv4, -1);
-
-  tv3->setMemoryType(MemoryType::Global);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  int x = 13, y = 9;
-  at::Tensor t0 = at::randn({1, y}, options);
-  at::Tensor t1 = at::randn({x, y}, options);
-
-  auto t4 = t0 + 2 + 4;
-  auto t5 = t0 + 2 + t1 + 3;
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-  std::vector<at::Tensor> aten_outputs = {t4, t5};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-// This excercises indexing with broadcast root axes. Non-broadcast
-// axes need to be preferred when propagating index exprs to root
-// axes. See, e.g., Index::getConsumerIndex_impl.
-TEST_F(NVFuserTest, FusionAdvancedLowering4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = broadcast(tv0, {false, true});
-  auto tv2 = broadcast(tv1, {false, false, true});
-  auto tv3 = makeSymbolicTensor(3);
-  fusion.addInput(tv3);
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  tv4->merge(1)->merge(0);
-  tv4->split(0, 8);
-  tv0->computeAt(tv4, 1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int bx = 10;
-  const int by = 20;
-  const int bz = 30;
-  at::Tensor t0 = at::randn({bx}, options);
-  at::Tensor t3 = at::randn({bx, by, bz}, options);
-  std::vector<IValue> aten_inputs = {t0, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output =
-      t0.unsqueeze(-1).expand({bx, by}).unsqueeze(-1).expand({bx, by, bz}) + t3;
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedLowering5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({5, 4, 3});
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = makeConcreteTensor({5, 3});
-  fusion.addInput(tv1);
-
-  auto tv2 = broadcast(tv1, {false, true, false});
-
-  auto tv3 = add(tv0, tv2);
-
-  fusion.addOutput(tv3);
-
-  tv2->merge(0);
-  tv1->computeAt(tv2, 1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(1);
-  at::Tensor t0 = at::randn({5, 4, 3}, options);
-  at::Tensor t1 = at::randn({5, 3}, options);
-  auto t2 = t1.unsqueeze(1);
-  auto t3 = t0 + t2;
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-  std::vector<at::Tensor> aten_outputs = {t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedLowering6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({5, 4, 3});
-  fusion.addInput(tv0);
-  auto tv1 = makeConcreteTensor({4});
-  fusion.addInput(tv1);
-  auto tv2 = unaryOp(UnaryOpType::Set, tv0);
-  auto tv3 = unaryOp(UnaryOpType::Set, tv1);
-
-  auto tv4 = sum(tv2, {0, 2});
-  auto tv5 = add(tv4, tv3);
-  fusion.addOutput(tv5);
-
-  auto tv6 = broadcast(tv3, {true, false, true});
-  auto tv7 = add(tv2, tv6);
-  fusion.addOutput(tv7);
-
-  tv2->computeAt(tv4, -1, ComputeAtMode::BestEffort);
-  tv3->computeAt(tv7, -1, ComputeAtMode::BestEffort);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(1);
-  at::Tensor t0 = at::randn({5, 4, 3}, options);
-  at::Tensor t1 = at::randn({4}, options);
-
-  auto t2 = t0;
-  auto t3 = t1;
-
-  std::vector<int64_t> reduction_axes{0, 2};
-  auto t4 = t2.sum(reduction_axes);
-  auto t5 = add(t4, t3);
-  auto t6 = t3.unsqueeze(0).unsqueeze(-1);
-  auto t7 = t2.add(t6);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-  std::vector<at::Tensor> aten_outputs = {t5, t7};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-// Test a simple Gemm but also play around with fusion executor features
-TEST_F(NVFuserTest, FusionSimpleGemm_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2); // M, K
-  TensorView* tv1 = makeSymbolicTensor(2); // K, N
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = broadcast(tv0, {false, false, true});
-  // tv2[I0, I1, B] = tv0[I0, I1]
-
-  TensorView* tv3 = broadcast(tv1, {true, false, false});
-  // tv3[B, I1, I2] = tv1[I1, I2]
-
-  // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2]
-  TensorView* tv4 = mul(tv2, tv3);
-  // tv5[I0, R1, I2] = tv4[I0, I1, I2]
-  TensorView* tv5 = sum(tv4, {1});
-  fusion.addOutput(tv5);
-
-  tv5->split(1, 32);
-  // tv5[I0, R1o, R1i{32}, I2]
-
-  auto tv6 = tv5->rFactor({1});
-  // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2]
-  // tv5[I0,    , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2]
-
-  tv5->split(0, 4);
-  tv5->split(-1, 4);
-  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
-  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
-
-  tv0->computeAt(tv5, -1);
-  tv1->computeAt(tv5, -1);
-
-  // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
-  // tv5[I0o, I0i{4},    , R1i{32}, I2o, I2i{4}]
-  //--> (line symbolizes compute at location)
-  // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
-  // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
-  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
-
-  tv0->computeAt(tv6, -1);
-  tv1->computeAt(tv6, -1);
-  // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
-  // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
-  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
-
-  tv5->axis(0)->parallelize(ParallelType::BIDz);
-  tv5->axis(1)->parallelize(ParallelType::TIDz);
-
-  tv5->axis(-2)->parallelize(ParallelType::BIDy);
-  tv5->axis(-1)->parallelize(ParallelType::TIDy);
-
-  tv5->axis(2)->parallelize(ParallelType::TIDx);
-  tv6->axis(2)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 65, K = 33, N = 17;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4));
-  // Lets specify a few bounds in launch params to make sure it works
-  fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4));
-
-  // Make sure bad launch params throws
-  // TODO: Re-enable once we have parallelization validation in.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6)));
-
-  // Don't specify any launch params
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble));
-
-  testValidate(
-      &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Softmax with a 1D tensor. Parallelized only with a single thread block.
-TEST_F(NVFuserTest, FusionSoftmax1D_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int tidx = 128;
-  const int dimx = 1000;
-
-  // Set up your input tensor views
-  TensorView* input_tv0 = makeSymbolicTensor(1);
-  fusion.addInput(input_tv0);
-
-  TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0);
-  TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
-  TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {true});
-
-  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
-  // computed at sum_exp_rf_tv8.
-  TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0);
-
-  TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);
-
-  fusion.addOutput(output_tv4);
-
-  bcast_sum_tv3->split(0, tidx);
-
-  sum_exp_tv2->split(-1, tidx);
-  TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});
-
-  output_tv4->split(-1, tidx);
-
-  exp_tv1->computeAt(sum_exp_rf_tv5, -1);
-  exp_tv1_copy->computeAt(output_tv4, -1);
-
-  TensorView* tensors_to_parallelize[] = {
-      sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};
-
-  for (auto tv : tensors_to_parallelize) {
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({dimx}, options);
-  at::Tensor cg_output = at::empty({dimx}, options);
-  at::Tensor t3_output = at::empty_like(cg_output, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  fe.runFusion({t0}, {cg_output});
-
-  auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false);
-
-  testValidate(&fusion, {cg_output}, {t0}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Softmax with a 1D tensor with input normalization.
-TEST_F(NVFuserTest, FusionSoftmax1DNormalized_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int tidx = 128;
-  const int dimx = 1000;
-
-  // Set up your input tensor views
-  TensorView* input_tv0 = makeSymbolicTensor(1);
-  fusion.addInput(input_tv0);
-
-  // Normalize with the max value before computing exp.
-  TensorView* max_val_tv1 = reductionOp(
-      BinaryOpType::Max, {-1}, IrBuilder::create<Double>(0), input_tv0);
-  TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {true});
-  TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2);
-  TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3);
-  TensorView* sum_exp_tv5 = sum(exp_tv4, {-1});
-  TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {true});
-
-  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
-  // computed at sum_exp_rf_tv8.
-  TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2);
-  TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy);
-
-  TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6);
-
-  fusion.addOutput(output_tv7);
-  bcast_max_tv2->split(0, tidx);
-  bcast_sum_tv6->split(0, tidx);
-
-  max_val_tv1->split(-1, tidx);
-  TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2});
-
-  sum_exp_tv5->split(-1, tidx);
-  TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2});
-
-  output_tv7->split(-1, tidx);
-
-  sub_tv3->computeAt(sum_exp_rf_tv9, -1);
-  sub_tv3_copy->computeAt(output_tv7, -1);
-
-  TensorView* tensors_to_parallelize[] = {
-      max_val_tv1,
-      bcast_max_tv2,
-      sum_exp_tv5,
-      bcast_sum_tv6,
-      output_tv7,
-      max_val_rf_tv8,
-      sum_exp_rf_tv9};
-
-  for (auto tv : tensors_to_parallelize) {
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({dimx}, options);
-  at::Tensor t3_output = at::empty({dimx}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto aten_output = at::_softmax(input.to(at::kDouble), -1, false);
-
-  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Softmax with a 3D tensor, where the inner-most 3rd dimension is
-// normalized. Pallelized with multiple thread blocks.
-TEST_F(NVFuserTest, FusionSoftmax3D_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int tidx = 32;
-  const int dimx = 32;
-  const int dimy = 16;
-  const int dimz = 130;
-
-  // Set up your input tensor views
-  TensorView* input_tv0 = makeSymbolicTensor(3);
-  fusion.addInput(input_tv0);
-
-  TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0);
-  TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
-  TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true});
-
-  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
-  // computed at sum_exp_rf_tv8.
-  TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0);
-
-  TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);
-
-  fusion.addOutput(output_tv4);
-
-  bcast_sum_tv3->split(-1, tidx);
-
-  sum_exp_tv2->split(-1, tidx);
-  TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});
-
-  output_tv4->split(-1, tidx);
-
-  exp_tv1->computeAt(sum_exp_rf_tv5, -1);
-  exp_tv1_copy->computeAt(output_tv4, -1);
-
-  TensorView* tensors_to_parallelize[] = {
-      sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};
-
-  for (auto tv : tensors_to_parallelize) {
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::BIDy);
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({dimx, dimy, dimz}, options);
-
-  at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_output = at::_softmax(input.to(at::kDouble), -1, false);
-
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Softmax with a 3D tensor with input normalization.
-TEST_F(NVFuserTest, FusionSoftmax3DNormalized_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int tidx = 32;
-  const int dimx = 32;
-  const int dimy = 16;
-  const int dimz = 130;
-
-  // Set up your input tensor views
-  TensorView* input_tv0 = makeSymbolicTensor(3);
-  fusion.addInput(input_tv0);
-
-  // Normalize with the max value before computing exp.
-  TensorView* max_val_tv1 = reductionOp(
-      BinaryOpType::Max, {-1}, IrBuilder::create<Double>(0), input_tv0);
-  TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {false, false, true});
-  TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2);
-  TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3);
-  TensorView* sum_exp_tv5 = sum(exp_tv4, {-1});
-  TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {false, false, true});
-
-  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
-  // computed at sum_exp_rf_tv8.
-  TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2);
-  TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy);
-
-  TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6);
-
-  fusion.addOutput(output_tv7);
-
-  bcast_max_tv2->split(-1, tidx);
-  bcast_sum_tv6->split(-1, tidx);
-
-  max_val_tv1->split(-1, tidx);
-  TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2});
-
-  sum_exp_tv5->split(-1, tidx);
-  TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2});
-
-  output_tv7->split(-1, tidx);
-
-  sub_tv3->computeAt(sum_exp_rf_tv9, -1);
-  sub_tv3_copy->computeAt(output_tv7, -1);
-
-  TensorView* tensors_to_parallelize[] = {
-      max_val_tv1,
-      bcast_max_tv2,
-      sum_exp_tv5,
-      bcast_sum_tv6,
-      output_tv7,
-      max_val_rf_tv8,
-      sum_exp_rf_tv9};
-
-  for (auto tv : tensors_to_parallelize) {
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::BIDy);
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({dimx, dimy, dimz}, options);
-  at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto aten_output = at::_softmax(input.to(at::kDouble), -1, false);
-
-  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSoftmaxComputeAt_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-
-  auto tv3 = add(tv0, IrBuilder::create<Double>(1.0));
-
-  auto tv4 = mul(tv2, tv3);
-
-  auto tv5 = sum(tv4, {1});
-  auto tv6 = broadcast(tv5, {false, true});
-
-  auto tv7 = sub(tv6, tv4);
-  fusion.addOutput(tv7);
-
-  tv1->computeAt(tv7, 1);
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(tv1->computeAt(tv7, -1));
-}
-
-// Similar to FusionReduction but uses grid reduction
-TEST_F(NVFuserTest, FusionGridReduction1_CUDA) {
-  const int gdimx = 32;
-  const int bdimx = 128;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion).size(),
-      "Could not detect reduction in fusion.");
-
-  tv1->split(1, bdimx);
-  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
-  tv1->split(1, gdimx);
-  // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({1});
-  // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1]
-  // tv1[I0,        R1oi{32},  R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}]
-
-  // Incrementally, can print in between for debugging
-  tv0->computeAt(tv2, 1);
-  tv2->computeAt(tv1, 1);
-
-  // Re do it all at once, because why not.
-  tv0->computeAt(tv1, 1);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDy);
-  tv1->axis(1)->parallelize(ParallelType::BIDx);
-  tv2->axis(2)->parallelize(ParallelType::BIDx);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  int numel_x = 10000;
-  int numel_y = 65000;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-  at::Tensor cg_output = at::empty({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Same test as the above but uses BIDy and TIDx for reduction
-TEST_F(NVFuserTest, FusionGridReduction2_CUDA) {
-  const int gdimy = 32;
-  const int bdimx = 128;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion).size(),
-      "Could not detect reduction in fusion.");
-
-  tv1->split(1, bdimx);
-  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
-  tv1->split(1, gdimy);
-  // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({1});
-  // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1]
-  // tv1[I0,        R1oi{32},  R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}]
-
-  // Incrementally, can print in between for debugging
-  tv0->computeAt(tv2, 1);
-  tv2->computeAt(tv1, 1);
-
-  // Re do it all at once, because why not.
-  tv0->computeAt(tv1, 1);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::BIDy);
-  tv2->axis(2)->parallelize(ParallelType::BIDy);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  int numel_x = 10000;
-  int numel_y = 65000;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-
-  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Same test but uses BIDy and BIDz for reduction. No TID used.
-TEST_F(NVFuserTest, FusionGridReduction3dim1_CUDA) {
-  // Grid reductions when there aren't any threads are serial reductions
-  // keep these numbers low so our error isn't too high compared to normal cuda
-  // reductions
-  const int gdimz = 15;
-  const int gdimy = 9;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion).size(),
-      "Could not detect reduction in fusion.");
-
-  tv1->split(1, gdimy);
-  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
-  tv1->split(1, gdimz);
-  // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({1});
-  // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1]
-  // tv1[I0,        R1oi{32},  R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}]
-
-  // Incrementally, can print in between for debugging
-  tv0->computeAt(tv2, 1);
-  tv2->computeAt(tv1, 1);
-
-  // Re do it all at once, because why not.
-  tv0->computeAt(tv1, 1);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::BIDz);
-  tv2->axis(2)->parallelize(ParallelType::BIDz);
-  tv1->axis(-1)->parallelize(ParallelType::BIDy);
-  tv2->axis(-1)->parallelize(ParallelType::BIDy);
-
-  int numel_x = 100;
-  int numel_y = 6500;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-  at::Tensor cg_output = at::empty({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Same as testGPU_FusionGridReduction3dim1 but reduces dimension 0
-TEST_F(NVFuserTest, FusionGridReduction3dim0_CUDA) {
-  // Grid reductions when there aren't any threads are serial reductions
-  // keep these numbers low so our error isn't too high compared to normal cuda
-  // reductions
-  const int gdimz = 15;
-  const int gdimy = 9;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[R0, I1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {0}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion).size(),
-      "Could not detect reduction in fusion.");
-
-  tv1->split(0, gdimy);
-  // tv1[R0o, R0i{128}, I1] = tv0[I0, I1]
-  tv1->split(0, gdimz);
-  // tv1[R0oo, R0oi{32}, R0i{128}, I1] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({0});
-  // tv2[R0oo, I0oi{32}, I0i{128}, I1] = tv0[I0, I1]
-  // tv1[      R0oi{32}, R0i{128}, I1] = tv2[R0oo, I0oi{32}, I0i{128}, I1]
-
-  // Note that computeAt isn't going to make anything better as there
-  // is no dynamically sized dimension.
-
-  // Map parallelism as [Serial, BIDz, BIDy, BIDx]
-  tv1->axis(-1)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::BIDx);
-  tv1->axis(-2)->parallelize(ParallelType::BIDy);
-  tv2->axis(-2)->parallelize(ParallelType::BIDy);
-  tv1->axis(-3)->parallelize(ParallelType::BIDz);
-  tv2->axis(-3)->parallelize(ParallelType::BIDz);
-
-  int numel_x = 6500;
-  int numel_y = 100;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto aten_output = input.to(at::kDouble).sum({0});
-
-  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// This is similar to the FusionReduction, but swaps BIDx and TIDx
-TEST_F(NVFuserTest, FusionGridReduction4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int bdimx = 128;
-  const int gdimx = 1024;
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion).size(),
-      "Could not detect reduction in fusion.");
-
-  tv1->split(1, gdimx);
-  // tv1[I0, R1o, R1i{1024}] = tv0[I0, I1]
-  tv1->split(1, 4);
-  // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({1});
-  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1]
-  // tv1[I0,        R1oi{4},  R1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}]
-
-  TensorView* tv3 = tv1->rFactor({1});
-  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1]
-  // tv3[I0,        R1oi{4}, Ir1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}]
-  // tv1[I0,                  R1i{1024}] = tv3[I0,        R1oi{4}, Ir1i{1024}]
-
-  // Incrementally, can print in between for debugging
-  tv0->computeAt(tv2, 1);
-  tv2->computeAt(tv3, 1);
-  tv3->computeAt(tv1, 1);
-
-  // Re do it all at once, because why not.
-  tv0->computeAt(tv1, 1);
-
-  tv2->axis(2)->parallelize(ParallelType::Unroll);
-  tv1->axis(0)->parallelize(ParallelType::TIDx);
-
-  tv1->axis(-1)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::BIDx);
-  tv3->axis(-1)->parallelize(ParallelType::BIDx);
-
-  int numel_x = bdimx;
-  int numel_y = 65000;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-  at::Tensor cg_output = at::empty({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Grid reduction with 2D thread blocks but only TIDx and BIDx are
-// mapped to a reduction dim
-TEST_F(NVFuserTest, FusionGridReduction5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int bdimx = 64;
-  const int bdimy = 16;
-  const int gdimx = 4;
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion).size(),
-      "Could not detect reduction in fusion.");
-
-  tv1->split(1, bdimx);
-  // tv1[I0, R1o, R1i{64}] = tv0[I0, I1]
-  tv1->split(1, gdimx);
-  // tv1[I0, R1oo, R1oi{4}, R1i{64}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({1});
-  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] = tv0[I0, I1]
-  // tv1[I0,        R1oi{4},  R1i{64}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}]
-
-  tv0->computeAt(tv1, 1);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv1->axis(-2)->parallelize(ParallelType::BIDx);
-  tv2->axis(-2)->parallelize(ParallelType::BIDx);
-
-  tv1->axis(0)->parallelize(ParallelType::TIDy);
-
-  int numel_x = bdimy;
-  int numel_y = 6500;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Similar to FusionGridReduction1 but with 3D tensors
-TEST_F(NVFuserTest, FusionGridReduction6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(3);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1, R2] = tv0[I0, I1, I2]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1, 2}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion).size(),
-      "Could not detect reduction in fusion.");
-
-  // Splitting for TID
-  tv1->split(2, 128);
-  // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2]
-
-  // Splitting for BID
-  tv1->split(1, 128);
-
-  // tv1[I0, R1o, R1i{128}, R2o, R2i{128}] = tv0[I0, I1, I2]
-
-  TensorView* tv2 = tv1->rFactor({3});
-  // tv2[I0, I1o, I1i{128}, R2o, I2i{128}]
-  // tv1[I0, R1o, R1i{128},      R2i{128}]
-
-  TensorView* tv3 = tv1->rFactor({1});
-  // tv2[I0, I1o, I1i{128}, R2o, I2i{128}]
-  // tv3[I0, R1o, I1i{128},      I2i{128}]
-  // tv1[I0,      R1i{128},      R2i{128}]
-
-  tv3->computeAt(tv1, 1);
-  tv2->computeAt(tv3, 3);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDy);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv1->axis(-2)->parallelize(ParallelType::BIDx);
-  tv2->axis(-3)->parallelize(ParallelType::BIDx);
-  tv3->axis(-2)->parallelize(ParallelType::BIDx);
-
-  int numel_x = 6500;
-  int numel_y = 200;
-  int numel_z = numel_y;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options);
-  at::Tensor cg_output = at::empty({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_output = input.to(at::kDouble).sum({1, 2});
-
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// See issue #1049
-TEST_F(NVFuserTest, FusionGridReduction7_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  fusion.addOutput(tv1);
-
-  tv1->split(0, 1000);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::BIDy);
-
-  const int numel_x = 1;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x}, options);
-  at::Tensor cg_output = at::empty({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto out = fe.runFusion({input});
-
-  auto aten_output = input.sum({0});
-
-  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionGridReduction8_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  fusion.addOutput(tv1);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-
-  const int numel_x = 2;
-  const int numel_y = 4;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto out = fe.runFusion({input});
-
-  auto aten_output = input.sum({0});
-
-  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionGridReduction9_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-
-  auto tv2 = makeSymbolicTensor(1);
-  fusion.addInput(tv2);
-
-  auto tv3 = add(tv2, tv1);
-  fusion.addOutput(tv3);
-
-  tv1->split(1, 2);
-
-  tv1->axis(1)->parallelize(ParallelType::BIDx);
-  tv1->axis(2)->parallelize(ParallelType::BIDy);
-
-  tv1->computeAt(tv3, 1);
-
-  const int numel_x = 4;
-  const int numel_y = 10;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
-  at::Tensor t2 = at::randn({numel_x}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t2};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_output = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0.sum({1}).add(t2);
-
-  testValidate(&fusion, cg_output, {t0, t2}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionGridReduction10_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {-1});
-  auto tv2 = sum(tv1, {-1});
-  auto tv3 = sum(tv2, {-1});
-
-  fusion.addOutput(tv3);
-  tv1->axis(0)->parallelize(ParallelType::TIDx);
-  tv1->axis(1)->parallelize(ParallelType::BIDx);
-  tv1->axis(2)->parallelize(ParallelType::TIDy);
-  tv1->axis(3)->parallelize(ParallelType::TIDz);
-
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  tv2->axis(1)->parallelize(ParallelType::BIDx);
-  tv2->axis(2)->parallelize(ParallelType::TIDy);
-
-  tv3->axis(0)->parallelize(ParallelType::TIDx);
-  tv3->axis(1)->parallelize(ParallelType::BIDx);
-
-  tv0->computeAt(tv3, 1);
-
-  const int numel_w = 2;
-  const int numel_x = 3;
-  const int numel_y = 4;
-  const int numel_z = 5;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_output = fe.runFusion({t0});
-
-  auto aten_output = t0.sum({1, 2, 3});
-
-  testValidate(&fusion, cg_output, {t0}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionNonRedAxisBind_CUDA) {
-  int bid_x = 3;
-  int tid_x = 2;
-  int red_dim = 0;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = reductionOp(
-      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  tv1->split(-1, tid_x);
-  tv1->axis(-2)->parallelize(ParallelType::BIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({16, bid_x * tid_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto aten_output = input.to(at::kDouble).sum({red_dim});
-
-  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSplitBCast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* input_tv0 = makeSymbolicTensor(3);
-  TensorView* input_tv1 = makeSymbolicTensor(3);
-  fusion.addInput(input_tv0);
-  fusion.addInput(input_tv1);
-
-  TensorView* sum_tv2 = reductionOp(
-      BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), input_tv0);
-  TensorView* bcast_tv3 = broadcast(sum_tv2, {false, false, true});
-  TensorView* output_tv4 = div(input_tv1, bcast_tv3);
-
-  sum_tv2->split(-1, 32);
-  TensorView* sum_rf_tv5 = sum_tv2->rFactor({-2});
-
-  bcast_tv3->split(-1, 32);
-  output_tv4->split(-1, 32);
-
-  sum_rf_tv5->axis(0)->parallelize(ParallelType::BIDx);
-  sum_tv2->axis(0)->parallelize(ParallelType::BIDx);
-  bcast_tv3->axis(0)->parallelize(ParallelType::BIDx);
-  output_tv4->axis(0)->parallelize(ParallelType::BIDx);
-
-  sum_rf_tv5->axis(1)->parallelize(ParallelType::BIDy);
-  sum_tv2->axis(1)->parallelize(ParallelType::BIDy);
-  bcast_tv3->axis(1)->parallelize(ParallelType::BIDy);
-  output_tv4->axis(1)->parallelize(ParallelType::BIDy);
-
-  sum_rf_tv5->axis(-1)->parallelize(ParallelType::TIDx);
-  sum_tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  bcast_tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  output_tv4->axis(-1)->parallelize(ParallelType::TIDx);
-
-  fusion.addOutput(output_tv4);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({32, 32, 128}, options);
-  at::Tensor t1 = at::randn({32, 32, 128}, options);
-  at::Tensor cg_output = at::empty({32, 32, 128}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  fe.runFusion({t0, t1}, {cg_output});
-}
-
-TEST_F(NVFuserTest, FusionBCastInnerDim_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // reduce then broadcast
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = broadcast(tv1, {false, true});
-
-  TORCH_CHECK(!tv2->axis(0)->isReduction() && tv2->axis(1)->isBroadcast());
-}
-
-TEST_F(NVFuserTest, FusionBCastReduce_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-
-  auto tv1 = broadcast(tv0, {true, false, false});
-  auto tv2 = sum(tv1, {1});
-  TORCH_CHECK(
-      tv2->axis(0)->isBroadcast() && tv2->axis(1)->isReduction() &&
-      !tv2->axis(2)->isBroadcast() && !tv2->axis(2)->isReduction());
-}
-
-// Multiple consumer reduction with computeAt
-// https://github.com/csarofeen/pytorch/issues/110
-TEST_F(NVFuserTest, FusionReductionMultiConsumer_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = unaryOp(UnaryOpType::Exp, tv0);
-  auto tv2 =
-      reductionOp(BinaryOpType::Max, {-1}, IrBuilder::create<Double>(0), tv1);
-  auto tv3 =
-      reductionOp(BinaryOpType::Min, {-1}, IrBuilder::create<Double>(0), tv1);
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-  tv1->computeAt(tv2, -1, ComputeAtMode::BestEffort);
-
-  TORCH_CHECK(tv1->getComputeAtPosition() == 2);
-}
-
-TEST_F(NVFuserTest, FusionComputeAtExprOrder1_CUDA) {
-  for (const auto i : c10::irange(2)) {
-    Fusion fusion;
-    FusionGuard fg(&fusion);
-
-    // Set up your input tensor views
-    TensorView* tv0 = makeSymbolicTensor(1);
-    fusion.addInput(tv0);
-
-    auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-    auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-    TensorView* tv3 = add(tv1, tv2);
-    // Set outputs tv2 or tv1 and then tv3
-    if (i == 0) {
-      fusion.addOutput(tv2);
-    } else {
-      fusion.addOutput(tv1);
-    }
-    fusion.addOutput(tv3);
-
-    if (i == 0) {
-      tv1->computeAt(tv3, -1);
-    } else {
-      tv2->computeAt(tv3, -1);
-    }
-
-    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-    at::Tensor aten_input = at::randn({100}, options);
-    std::vector<at::Tensor> aten_outputs = {
-        aten_input + 1, (aten_input + 1) * 2};
-
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {aten_input});
-    auto cg_outputs = fe.runFusion({aten_input});
-
-    testValidate(
-        &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-  }
-}
-
-TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv3 = add(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  tv3->split(-1, 32);
-
-  tv1->computeAt(tv3, -1);
-  tv2->computeAt(tv3, -2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({100, 100}, options);
-  auto aten_output = (aten_input + 1) * 2;
-
-  at::Tensor cg_output = at::empty_like(aten_input, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int64_t dimx = 13;
-  const int64_t dimy = 15;
-
-  TensorView* tv0 = makeConcreteTensor({dimx, dimy});
-  fusion.addInput(tv0);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
-  TensorView* tv3 = add(tv2, IrBuilder::create<Double>(3));
-  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
-  TensorView* tv5 = mul(tv2, tv4);
-  fusion.addOutput(tv5);
-
-  tv1->computeAt(tv2, 2);
-  tv3->computeAt(tv4, 1);
-  tv4->computeAt(tv5, 2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({dimx, dimy}, options);
-  auto t1 = aten_input.add(1.);
-  auto t2 = t1.add(2.);
-  auto t3 = t2.add(3.);
-  auto t4 = t3.add(4.);
-  auto aten_output = t2.mul(t4);
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionZeroDimComputeAt_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-  TORCH_CHECK(tv2->nDims() == 0);
-  tv1->computeAt(tv2, 0);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({100}, options);
-  auto aten_output = aten_input.to(at::kDouble).sum() + 1;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionZeroDimBroadcast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(0);
-  fusion.addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {true, true});
-  TORCH_CHECK(tv1->nDims() == 2);
-
-  TensorView* tv2 = makeSymbolicTensor(2);
-  fusion.addInput(tv2);
-
-  auto tv3 = add(tv1, tv2);
-  auto tv4 = sum(tv3, {0, 1});
-  fusion.addOutput(tv4);
-
-  tv3->computeAt(tv4, -1);
-  tv3->axis(-2)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDy);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({}, options);
-  at::Tensor t1 = at::randn({10, 10}, options);
-
-  auto aten_output = (t0.unsqueeze(-1).unsqueeze(-1).expand({10, 10}) + t1)
-                         .to(at::kDouble)
-                         .sum();
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-  at::Tensor cg_output = at::empty({}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  fe.runFusion(aten_inputs, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionZeroDimReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int bdimx = 32;
-  const int gdimx = 32;
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  fusion.addOutput(tv1);
-
-  tv1->split(0, bdimx);
-  tv1->split(0, gdimx);
-  auto tv2 = tv1->rFactor({0});
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-2)->parallelize(ParallelType::BIDx);
-  tv2->axis(-2)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({1000}, options);
-  auto aten_output = aten_input.to(at::kDouble).sum();
-
-  at::Tensor cg_output = at::empty({}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBCastAfterReduce_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-  const int tidx = 128;
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-
-  tv1->split(1, tidx);
-  auto tv3 = tv1->rFactor({-2});
-
-  TensorView* tv4 = makeSymbolicTensor(2);
-  fusion.addInput(tv4);
-
-  auto tv5 = add(tv2, tv4);
-  fusion.addOutput(tv5);
-  tv5->split(1, tidx);
-
-  tv3->computeAt(tv5, 1);
-
-  tv2->split(1, tidx);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv5->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDx);
-
-  int x = 63, y = 200;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({x, y}, options);
-  at::Tensor t4 = at::randn({x, y}, options);
-
-  auto t3 = t0.to(at::kDouble).sum({1}).unsqueeze(-1).expand({x, y});
-  auto aten_output = t3.add(t4);
-
-  std::vector<IValue> aten_inputs = {t0, t4};
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t4});
-  auto cg_outputs = fe.runFusion({t0, t4});
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionOutputBroadcast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({2, 3});
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = broadcast(tv0, {true, false, true, false, true});
-
-  fusion.addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({2, 3}, options);
-  auto aten_output = aten_input.unsqueeze(2).unsqueeze(1).unsqueeze(0);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReductionKeepDimBasic_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({2, 3, 4, 5, 6});
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = sum(tv0, {0, 2, -1}, /*keep_dim=*/true);
-
-  fusion.addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options);
-  auto aten_output =
-      aten_input.to(at::kDouble).sum({0, 2, -1}, /*keepdim=*/true);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) {
-  constexpr int bid_x = 80;
-  constexpr int tid_x = 4096;
-  constexpr int red_dim = 1;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x});
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = reductionOp(
-      BinaryOpType::Add,
-      {red_dim},
-      IrBuilder::create<Double>(0),
-      tv0,
-      /*keep_dim=*/true);
-
-  fusion.addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({bid_x, tid_x}, options);
-  auto aten_output =
-      aten_input.to(at::kDouble).sum({red_dim}, /*keepdim=*/true);
-
-  // Apply reduction heuristic
-  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fusion, *reduction_params);
-
-  auto lparams = reduction_params->lparams;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionSumTo_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> tensor_shape{2, 3, 4, 5, 6};
-  std::vector<int64_t> sum_to_shape{1, 5, 6};
-
-  std::vector<int64_t> tensor_shape_ref{2, 3, 4, 5, 6};
-  std::vector<int64_t> sum_to_shape_ref{1, 5, 6};
-
-  std::vector<Int*> sum_to_symb;
-  std::transform(
-      sum_to_shape.begin(),
-      sum_to_shape.end(),
-      std::back_inserter(sum_to_symb),
-      [](int s) -> Int* { return IrBuilder::create<Int>(s); });
-
-  TensorView* tv0 = makeConcreteTensor(tensor_shape);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = sum_to(tv0, sum_to_symb);
-  fusion.addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn(tensor_shape_ref, options);
-  auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  TORCH_CHECK(
-      cg_outputs[0].dim() == static_cast<int64_t>(sum_to_shape.size()),
-      "sum_to not keeping the final dimension");
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSumToNoop_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> tensor_shape{4, 5, 6};
-  std::vector<int64_t> sum_to_shape{4, 5, 6};
-
-  std::vector<int64_t> tensor_shape_ref{4, 5, 6};
-  std::vector<int64_t> sum_to_shape_ref{4, 5, 6};
-
-  std::vector<Int*> sum_to_symb;
-  std::transform(
-      sum_to_shape.begin(),
-      sum_to_shape.end(),
-      std::back_inserter(sum_to_symb),
-      [](int s) -> Int* { return IrBuilder::create<Int>(s); });
-
-  TensorView* tv0 = makeConcreteTensor(tensor_shape);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = sum_to(tv0, sum_to_symb);
-
-  // Dummy operator to avoid tv0 both input and output
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(0));
-  fusion.addOutput(tv2);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn(tensor_shape_ref, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-  auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref);
-
-  TORCH_CHECK(
-      cg_outputs[0].dim() == static_cast<int64_t>(sum_to_shape.size()),
-      "sum_to not keeping the final dimension");
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReductionScheduler_CUDA) {
-  constexpr int bid_x = 80;
-  constexpr int tid_x = 4096;
-  constexpr int red_dim = 1;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = reductionOp(
-      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({bid_x, tid_x}, options);
-  auto aten_output = aten_input.to(at::kDouble).sum({red_dim});
-
-  // Apply reduction heuristic
-  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fusion, *reduction_params);
-
-  auto lparams = reduction_params->lparams;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  // no broadcasting needed, omitting the last optional argument;
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-// Simple reduction parallelized on a symbolic size.
-TEST_F(NVFuserTest, FusionSymbolicReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  // tv1[I0, R1] = tv0[I0, I1]
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  // Interface should just be a direct split with a Parallel type. We can
-  // include the parallelize call if we do this.
-  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
-  // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({1});
-  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}] = tv0[I0, I1]
-  // tv1[I0,        R1oi{4},  R1i{BIDx}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}]
-
-  // Incrementally, can print in between for debugging
-  tv0->computeAt(tv2, 1);
-  tv2->computeAt(tv1, 1);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-
-  int numel_x = 65000;
-  int numel_y = 1025;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
-  auto aten_output = aten_input.to(at::kDouble).sum({1});
-
-  // How many threads to use for the block reduction
-  int runtime_threadIdx_dim = 128;
-
-  LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) {
-  const std::vector<int> red_dims = {0, 2};
-  // Copy is because CodeGen requires int and Pytorch requires int64_t
-  // for a vector of reduction dimensions
-  const std::vector<int64_t> red_dims64 = {0, 2};
-  const std::vector<int64_t> tensor_dims_in = {5, 10, 15, 20};
-  const std::vector<int64_t> tensor_dims_out = {10, 20};
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = reductionOp(
-      BinaryOpType::Add, red_dims, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn(tensor_dims_in, options);
-  auto aten_output = aten_input.to(at::kDouble).sum(red_dims64);
-  at::Tensor cg_output = at::empty(tensor_dims_out, options);
-
-  // Apply reduction heuristic
-  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fusion, *reduction_params);
-  auto lparams = reduction_params->lparams;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  fe.runFusion({aten_input}, {cg_output}, lparams);
-
-  testValidate(
-      &fusion,
-      {cg_output},
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) {
-  const std::vector<int> red_dims = {1, 3};
-  // Copy is because CodeGen requires int and Pytorch requires int64_t
-  // for a vector of reduction dimensions
-  const std::vector<int64_t> red_dims64 = {1, 3};
-  const std::vector<int64_t> tensor_dims_in = {5, 10, 15, 20};
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = reductionOp(
-      BinaryOpType::Add, red_dims, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn(tensor_dims_in, options);
-  auto aten_output = aten_input.to(at::kDouble).sum(red_dims64);
-
-  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fusion, *reduction_params);
-  auto lparams = reduction_params->lparams;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) {
-  std::vector<DataType> dtypes = {
-      DataType::Double, DataType::Float, DataType::Half};
-  // TODO: add test for complex. Currently complex fails with the following
-  // NVRTC compilation error message:
-  //   error: no suitable user-defined conversion from
-  //   "CudaCodeGen::std::complex<double>" to "CudaCodeGen::std::complex<float>"
-  //   exists
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-  if (at::cuda::getDeviceProperties(0)->major >= 8) {
-    dtypes.insert(dtypes.end(), DataType::BFloat16);
-  }
-#endif
-
-  std::vector<int> red_dims;
-
-  // Tried to cut down the number iterations with just
-  // doing every other power of 2.
-  for (int i = 1; i <= 1024 * 1024; i <<= 2) {
-    red_dims.push_back(i);
-  }
-
-  for (auto dtype : dtypes) {
-    at::ScalarType aten_dtype = data_type_to_aten(dtype);
-    for (auto& rdim : red_dims) {
-      Fusion fusion;
-      FusionGuard fg(&fusion);
-
-      bool is_fp16 = dtype == DataType::Half;
-      bool is_bf16 = dtype == DataType::BFloat16;
-
-      TensorView* tv0 = makeSymbolicTensor(1, dtype);
-      fusion.addInput(tv0);
-
-      TensorView* tv0_cast = tv0;
-      if (is_fp16 || is_bf16) {
-        tv0_cast = castOp(DataType::Float, tv0);
-      }
-
-      TensorView* tv1 = sum(tv0_cast, {0});
-
-      TensorView* tv1_cast = tv1;
-      if (is_fp16) {
-        tv1_cast = castOp(DataType::Half, tv1);
-      }
-      if (is_bf16) {
-        tv1_cast = castOp(DataType::BFloat16, tv1);
-      }
-
-      fusion.addOutput(tv1_cast);
-
-      auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
-
-      at::Tensor aten_input = at::randn({rdim}, options);
-      auto aten_output = aten_input.to(at::kDouble).sum({0});
-
-      auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-      TORCH_CHECK(reduction_params != nullptr, "Reduction is not found!");
-      scheduleReduction(&fusion, *reduction_params);
-      auto lparams = reduction_params->lparams;
-
-      FusionExecutor fe;
-      fe.compileFusion(&fusion, {aten_input}, lparams);
-      auto cg_outputs = fe.runFusion({aten_input}, lparams);
-
-      testValidate(
-          &fusion,
-          cg_outputs,
-          {aten_input},
-          {aten_output},
-          __LINE__,
-          __FILE__,
-          "",
-          lparams);
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) {
-  std::vector<DataType> dtypes = {
-      DataType::Double, DataType::Float, DataType::Half};
-  // TODO: add complex support. Currently, complex fails with the following
-  // NVRTC compilation error:
-  //   error: no instance of overloaded function "__shfl_xor_sync" matches the
-  //   argument list
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-  if (at::cuda::getDeviceProperties(0)->major >= 8) {
-    dtypes.insert(dtypes.end(), DataType::BFloat16);
-  }
-#endif
-
-  std::vector<int> red_axis = {1, 0};
-  std::vector<int> output_dims = {160, 320};
-  std::vector<int> red_dims;
-
-  // Tried to cut down the number iterations with just
-  // doing every other power of 2.
-  for (int i = 1; i <= 1024 * 1024; i <<= 2) {
-    red_dims.push_back(i);
-  }
-
-  for (auto dtype : dtypes) {
-    at::ScalarType aten_dtype = data_type_to_aten(dtype);
-    for (auto& axis : red_axis) {
-      for (auto& odim : output_dims) {
-        for (auto& rdim : red_dims) {
-          Fusion fusion;
-          FusionGuard fg(&fusion);
-
-          bool is_fp16 = dtype == DataType::Half;
-          bool is_bf16 = dtype == DataType::BFloat16;
-
-          TensorView* tv0 = makeSymbolicTensor(2, dtype);
-          fusion.addInput(tv0);
-
-          TensorView* tv0_cast = tv0;
-          if (is_fp16 || is_bf16) {
-            tv0_cast = castOp(DataType::Float, tv0);
-          }
-
-          TensorView* tv1 = sum(tv0_cast, {axis});
-
-          TensorView* tv1_cast = tv1;
-          if (is_fp16) {
-            tv1_cast = castOp(DataType::Half, tv1);
-          }
-          if (is_bf16) {
-            tv1_cast = castOp(DataType::BFloat16, tv1);
-          }
-          fusion.addOutput(tv1_cast);
-
-          auto options =
-              at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
-
-          at::Tensor aten_input =
-              (axis ? at::randn({odim, rdim}, options)
-                    : at::randn({rdim, odim}, options));
-
-          auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-          TORCH_CHECK(reduction_params != nullptr, "Reduction is not found!");
-          scheduleReduction(&fusion, *reduction_params);
-          auto lparams = reduction_params->lparams;
-
-          FusionExecutor fe;
-          fe.compileFusion(&fusion, {aten_input}, lparams);
-          auto cg_outputs = fe.runFusion({aten_input}, lparams);
-          auto aten_output = aten_input.to(at::kDouble).sum({axis});
-          testValidate(
-              &fusion,
-              cg_outputs,
-              {aten_input},
-              {aten_output},
-              __LINE__,
-              __FILE__,
-              "",
-              lparams);
-        }
-      }
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionCacheBefore_CUDA) {
-  // TVM Cache Write
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(3.0));
-  fusion.addInput(tv0);
-  fusion.addOutput(tv2);
-
-  // Before: TV2 = TV1 * 3
-  // After:  TV3 = TV1 * 3;
-  //         TV2 = TV3;
-  TensorView* tv3 = tv2->cacheBefore();
-
-  constexpr int BSX = 32;
-  tv2->split(-1, BSX);
-  tv0->computeAt(tv2, -1);
-
-  // Thread and Block binding
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 32, N = 750;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({M, N}, options);
-  at::Tensor aten_output = (aten_input + 1.0) * 3.0;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionCacheAfter_CUDA) {
-  // TVM Cache Read
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(3.0));
-  fusion.addInput(tv0);
-  fusion.addOutput(tv2);
-
-  // Before: TV1 = TV0 + 1
-  // After:  TV3 = TV0;
-  //         TV1 = TV3 + 1
-  TensorView* tv3 = tv0->cacheAfter();
-
-  constexpr int BSX = 32;
-  tv2->split(-1, BSX);
-  tv0->computeAt(tv2, -1);
-
-  // Thread and Block binding
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 32, N = 457;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({M, N}, options);
-  at::Tensor aten_output = (aten_input + 1.0) * 3.0;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionCacheFork_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(3.0));
-  fusion.addInput(tv0);
-  fusion.addOutput(tv1);
-  fusion.addOutput(tv2);
-  // Before:  TV1 = TV0 + 1
-  //          TV2 = TV1 * 1
-  // Output:  TV1, TV2
-
-  // After:   TV1 = TV0 + 1
-  //          TV3 = TV1
-  //          TV2 = TV1 * 1
-  // Output:  TV3, TV2
-
-  // cacheFork !!does not!! automatically apply ComputeAt to the cache
-  auto tv3 = tv1->cacheFork();
-
-  constexpr int BSX = 32;
-  tv2->split(-1, BSX);
-  tv0->computeAt(tv2, -1);
-
-  // Thread and Block binding
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 32, N = 457;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({M, N}, options);
-  at::Tensor aten_output1 = aten_input + 1.0;
-  at::Tensor aten_output2 = aten_output1 * 3.0;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output1, aten_output2},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionCacheIndirect_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  TensorView* tv2 = makeSymbolicTensor(2);
-  TensorView* tv3 = makeSymbolicTensor(2);
-  TensorView* tv4 = sub(tv2, tv3);
-  TensorView* tv5 = add(tv1, tv4);
-  TensorView* tv6 = sub(tv5, tv0);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addInput(tv2);
-  fusion.addInput(tv3);
-  fusion.addOutput(tv6);
-  // t6 = ((t1 + (t2 - t3)) - t0)
-
-  tv5->cacheAfter();
-  tv5->cacheBefore();
-
-  // cacheAfter on inputs placed before schedule
-  constexpr int BSX = 32;
-  tv6->split(-1, BSX);
-  tv2->computeAt(tv6, -1);
-
-  // Thread and Block binding
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-  tv6->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 32, N = 810;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M, N}, options);
-  at::Tensor t1 = at::randn({M, N}, options);
-  at::Tensor t2 = at::randn({M, N}, options);
-  at::Tensor t3 = at::randn({M, N}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
-  at::Tensor aten_output = (t1 + (t2 - t3)) - t0;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionCacheBcast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Algorithm
-  TensorView* tv0 = makeSymbolicTensor(1); // (M, 1)
-  TensorView* tv1 = broadcast(tv0, {false, true});
-  TensorView* tv2 = makeSymbolicTensor(1); // (1, N)
-  TensorView* tv3 = broadcast(tv2, {true, false});
-  TensorView* tv4 = mul(tv1, tv3);
-  fusion.addInput(tv0);
-  fusion.addInput(tv2);
-  fusion.addOutput(tv4);
-
-  // Case 1
-  tv0->cacheAfter();
-
-  // Case 2
-  tv1->cacheBefore();
-
-  // Case 3
-  tv1->cacheAfter();
-
-  // Case 4
-  TensorView* tv8 = tv4->cacheBefore();
-
-  constexpr int BSX = 128;
-  tv4->split(0, BSX);
-  tv4->split(-1, BSX);
-  tv4->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}});
-  // M/BSX, N/BSY, BSX, BSY
-  tv0->computeAt(tv4, 2);
-  tv2->computeAt(tv4, 2);
-  // 0, 1 | 2, 3, 4
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(1)->parallelize(ParallelType::BIDy);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  // Manual Replay on TV3
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv8->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 92, N = 500;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M}, options);
-  at::Tensor t1 = at::randn({N}, options);
-  std::vector<IValue> aten_inputs = {t0, t1};
-  at::Tensor aten_output =
-      t0.to(at::kDouble).unsqueeze(1).matmul(t1.to(at::kDouble).unsqueeze(0));
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionCacheMultiConsumer_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
-  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(2));
-
-  fusion.addInput(tv0);
-  fusion.addOutput(tv2);
-  fusion.addOutput(tv4);
-
-  auto tv5 = tv1->cacheBefore();
-  auto tv6 = tv3->cacheBefore();
-  tv5->setMemoryType(MemoryType::Shared);
-  tv6->setMemoryType(MemoryType::Shared);
-
-  tv1->computeAt(tv2, -1);
-  tv3->computeAt(tv4, -1);
-
-  // Fails because tensor must be recomputed twice
-  // auto tv7 = tv0->cacheAfter();
-
-  constexpr int N = 800;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({N}, options);
-  auto aten_output = (aten_input + 1) + 2;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output, aten_output},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSmem_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Algorithm
-  TensorView* tv0 = makeSymbolicTensor(2); // (M, N)
-  TensorView* tv1 = makeSymbolicTensor(2); // (M, N)
-  TensorView* tv2 = mul(tv0, tv1);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv2);
-
-  // Schedule
-  TensorView* tv3 = tv0->cacheAfter();
-  TensorView* tv4 = tv1->cacheAfter();
-  tv3->setMemoryType(MemoryType::Shared);
-  tv4->setMemoryType(MemoryType::Shared);
-
-  constexpr int BSY = 32;
-  constexpr int BSX = 128;
-  tv2->split(0, BSY);
-  tv2->split(2, BSX);
-  // M/BSX, BSX, N/BSX, BSX
-  tv2->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}});
-  // M/BSX, N/BSX, BSX, BSX
-
-  tv0->computeAt(tv2, 2);
-  tv1->computeAt(tv2, 2);
-
-  // Thread and Block binding
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::BIDy);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  // Manual Binding
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 128, N = 10240;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M, N}, options);
-  at::Tensor t1 = at::randn({M, N}, options);
-  at::Tensor aten_output = mul(t0, t1);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-
-  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
-}
-
-TEST_F(NVFuserTest, FusionSmemReduce_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Algorithm
-  TensorView* tv0 = makeSymbolicTensor(3); // M, K, N
-  TensorView* tv1 = sum(tv0, {1}); // M, R, N
-  fusion.addInput(tv0);
-  fusion.addOutput(tv1);
-
-  TensorView* tv2 = tv0->cacheAfter();
-  tv2->setMemoryType(MemoryType::Shared);
-
-  // Schedule
-  constexpr int BSX = 32;
-  tv1->split(2, BSX);
-  tv1->split(1, 128);
-  tv1->split(0, BSX);
-  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
-  tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}});
-  TensorView* tv3 = tv1->rFactor({-2});
-
-  tv0->computeAt(tv1, -2);
-  tv0->computeAt(tv3, -2);
-
-  // Thread and Block binding
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::BIDy);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  // Manual Binding
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 154, K = 45, N = 1524;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({M, K, N}, options);
-  at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1});
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
-}
-
-TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Algorithm
-  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
-  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
-  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
-  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
-  TensorView* tv4 = mul(tv2, tv3); // M, K, N
-  TensorView* tv5 = sum(tv4, {1}); // M, R, N
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv5);
-
-  // Schedule
-  constexpr int BSX = 16;
-  tv5->split(2, BSX - 1);
-  tv5->split(1, BSX);
-  tv5->split(0, BSX + 1);
-  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
-  tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}});
-  // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX
-  TensorView* tv6 = tv5->rFactor({-1});
-
-  tv2->setMemoryType(MemoryType::Shared);
-  tv3->setMemoryType(MemoryType::Shared);
-  tv4->setMemoryType(MemoryType::Shared);
-  tv6->setMemoryType(MemoryType::Shared);
-
-  tv0->computeAt(tv5, 3);
-  tv1->computeAt(tv5, 3);
-
-  // Thread and Block binding
-  tv5->axis(0)->parallelize(ParallelType::BIDx);
-  tv5->axis(1)->parallelize(ParallelType::BIDy);
-  tv5->axis(-2)->parallelize(ParallelType::TIDy);
-  tv5->axis(-1)->parallelize(ParallelType::TIDx);
-  // Manual Binding
-  tv2->axis(-3)->parallelize(ParallelType::TIDy);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->axis(-3)->parallelize(ParallelType::TIDy);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv6->axis(-3)->parallelize(ParallelType::TIDy);
-  tv6->axis(-2)->parallelize(ParallelType::TIDx);
-
-  // Make sure BIDx is makred as exact (see issue #1119)
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(gpulw.parallelDimensionMap().isExact(ParallelType::BIDx));
-
-  constexpr int M = 154, K = 45, N = 1524;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-  at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-
-  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
-}
-
-TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Algorithm
-  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
-  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
-  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
-  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
-  TensorView* tv4 = mul(tv2, tv3); // M, K, N
-  TensorView* tv5 = sum(tv4, {1}); // M, R, N
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv5);
-
-  // Schedule
-  // Remove reduction axis from tv5
-  // tv6 = (M, R, N)
-  // tv5 = (M, N)
-  TensorView* tv6 = tv5->cacheBefore();
-
-  constexpr int BSX = 16;
-  tv5->split(1, BSX);
-  tv5->split(0, BSX);
-  // M/BSX, BSX, N/BSX, BSX
-  tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}});
-  // tv5 = M/BSX, N/BSX, MSX, NSX
-
-  tv6->computeAt(tv5, 2);
-  tv6->computeAt(tv5, 2);
-
-  tv6->split(-1, BSX);
-  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
-  tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}});
-  // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX
-  TensorView* tv7 = tv6->rFactor({-1});
-  // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr
-  // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX
-
-  tv0->computeAt(tv6, 3);
-  tv1->computeAt(tv6, 3);
-
-  tv0->computeAt(tv7, 3);
-  tv1->computeAt(tv7, 3);
-
-  tv2->setMemoryType(MemoryType::Shared);
-  tv3->setMemoryType(MemoryType::Shared);
-  tv4->setMemoryType(MemoryType::Shared);
-  tv6->setMemoryType(MemoryType::Shared);
-  tv7->setMemoryType(MemoryType::Shared);
-  // Memory Type
-
-  // Thread and Block binding
-  tv5->axis(0)->parallelize(ParallelType::BIDx);
-  tv5->axis(1)->parallelize(ParallelType::BIDy);
-  tv5->axis(-2)->parallelize(ParallelType::TIDy);
-  tv5->axis(-1)->parallelize(ParallelType::TIDx);
-  // Manual Binding
-  tv2->axis(-3)->parallelize(ParallelType::TIDy);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->axis(-3)->parallelize(ParallelType::TIDy);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv7->axis(-3)->parallelize(ParallelType::TIDy);
-  tv7->axis(-2)->parallelize(ParallelType::TIDx);
-
-  tv6->axis(-2)->parallelize(ParallelType::TIDy);
-  tv6->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 154, K = 45, N = 1524;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-  at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-
-  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
-}
-
-TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* x = makeSymbolicTensor(2);
-  fusion.addInput(x);
-  TensorView* max_val = reductionOp(
-      BinaryOpType::Max,
-      {-1},
-      IrBuilder::create<Double>(std::numeric_limits<float>::lowest()),
-      x); // (M)
-  TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B)
-  TensorView* x_max_sub = sub(x, bcast_max); // (M, N)
-  TensorView* exp = unaryOp(UnaryOpType::Exp, x_max_sub); // (M, N)
-  TensorView* sum_exp = sum(exp, {-1}); // (M, R)
-  TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B)
-  TensorView* softmax = div(exp, bcast_sum); // (M, N)
-  fusion.addOutput(softmax);
-
-  // Read Input into Shared Memory
-  // Load Input + Pwise into shared memory
-  auto cache_x = x->cacheAfter();
-  cache_x->setMemoryType(MemoryType::Shared);
-  exp->setMemoryType(MemoryType::Shared);
-
-  std::vector<TensorView*> all_tensors(
-      {x,
-       cache_x,
-       max_val,
-       bcast_max,
-       x_max_sub,
-       exp,
-       sum_exp,
-       bcast_sum,
-       softmax});
-
-  auto tidx = IrBuilder::create<Int>();
-  fusion.addInput(tidx);
-
-  for (auto tensor : all_tensors) {
-    tensor->split(-1, tidx);
-  }
-
-  auto sum_exp_rf = sum_exp->rFactor({1});
-  all_tensors.push_back(sum_exp_rf);
-
-  // computeAt
-  x->computeAt(x_max_sub, 1);
-  exp->computeAt(softmax, 1);
-  x_max_sub->computeAt(exp, 2);
-
-  softmax->axis(0)->parallelize(ParallelType::BIDx);
-  for (auto tensor : all_tensors) {
-    tensor->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  const int64_t dimx = 1024;
-  const int64_t dimy = 4096;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({dimx, dimy}, options);
-  auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false);
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input, 128});
-  auto cg_outputs = fe.runFusion({aten_input, 128});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input, 128},
-      {aten_output},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int kReductionAxis = 3;
-  std::vector<int64_t> input_shape{10, 10, 10, 67};
-  TensorView* input = makeSymbolicTensor(input_shape.size());
-  fusion.addInput(input);
-
-  auto output = softmax(input, kReductionAxis);
-
-  fusion.addOutput(output);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn(input_shape, options);
-  auto aten_output =
-      at::_softmax(aten_input.to(at::kDouble), kReductionAxis, false);
-
-  auto reduction_params = getPersistentHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-
-  schedulePersistentKernel(&fusion, *reduction_params);
-
-  auto lparams = reduction_params->lparams;
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionTestMaskSoftmax_CUDA) {
-  // This test is testing the usage of all padding tokens
-  // with softmax like Bert might might use in a full padding
-  // sequence.
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int kReductionAxis = 3;
-  std::vector<int64_t> input_shape{256, 16, 128, 128};
-  TensorView* input = makeSymbolicTensor(input_shape.size());
-  TensorView* mask = makeSymbolicTensor(input_shape.size());
-  fusion.addInput(input);
-  fusion.addInput(mask);
-
-  auto out1 = add(input, mask);
-  auto output = softmax(out1, kReductionAxis);
-
-  fusion.addOutput(output);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn(input_shape, options);
-  at::Tensor aten_mask = at::ones(input_shape, options);
-  // -10,000 is used here as a magic number because the padding
-  // tokens need to be a value that gives a value close to zero
-  // as to not influence softmax.  Bert, in particular, does
-  // not use -Infinity because sometimes it will have a
-  // softmax of all padding tokkens that can result a divide by
-  // zero that creates NaN result.
-  aten_mask = aten_mask * -10000.0;
-  auto aten_out1 = aten_input + aten_mask;
-  auto aten_output = at::_softmax(aten_out1, kReductionAxis, false);
-
-  auto reduction_params =
-      getPersistentHeuristics(&fusion, {aten_input, aten_mask});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-
-  schedulePersistentKernel(&fusion, *reduction_params);
-
-  auto lparams = reduction_params->lparams;
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input, aten_mask}, lparams);
-  auto cg_outputs = fe.runFusion({aten_input, aten_mask}, lparams);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input, aten_mask},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> shape{20, 100, 35, 67};
-  std::vector<int64_t> norm_shape{67};
-
-  const size_t kM = shape.size();
-  const size_t kN = norm_shape.size();
-  const size_t kOuterNumDims = kM - kN;
-
-  std::vector<int64_t> outer_shape;
-  for (const auto idx : c10::irange(kOuterNumDims)) {
-    outer_shape.push_back(shape[idx]);
-  }
-  for (const auto idx : c10::irange(kOuterNumDims, kM)) {
-    outer_shape.push_back(1);
-  }
-
-  auto grad_out = makeSymbolicTensor(shape.size());
-  auto input = makeSymbolicTensor(shape.size());
-  auto mean = makeConcreteTensor(outer_shape);
-  auto rstd = makeConcreteTensor(outer_shape);
-  auto weight = makeSymbolicTensor(norm_shape.size());
-  auto bias = makeSymbolicTensor(norm_shape.size());
-  fusion.addInput(grad_out);
-  fusion.addInput(input);
-  fusion.addInput(mean);
-  fusion.addInput(rstd);
-  fusion.addInput(weight);
-  fusion.addInput(bias);
-
-  auto grads = layer_norm_backward(
-      grad_out,
-      input,
-      norm_shape,
-      mean,
-      rstd,
-      weight,
-      bias,
-      {true, true, true});
-
-  fusion.addOutput(grads.grad_input);
-  fusion.addOutput(grads.grad_weight);
-  fusion.addOutput(grads.grad_bias);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_grad_out = at::randn(shape, options);
-  at::Tensor aten_input = at::randn(shape, options);
-  at::Tensor aten_weight = at::randn(norm_shape, options);
-  at::Tensor aten_bias = at::randn(norm_shape, options);
-  auto at_weight = c10::optional<at::Tensor>(aten_weight);
-  auto at_bias = c10::optional<at::Tensor>(aten_bias);
-
-  const float kEps = 1e-5;
-  auto aten_results =
-      at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
-  auto aten_output = std::get<0>(aten_results);
-  auto aten_mean = std::get<1>(aten_results);
-  auto aten_rstd = std::get<2>(aten_results);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  std::vector<IValue> aten_inputs = {
-      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
-  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
-
-  auto aten_gradients = at::native_layer_norm_backward(
-      aten_grad_out.to(at::kDouble),
-      aten_input.to(at::kDouble),
-      norm_shape,
-      aten_mean.to(at::kDouble),
-      aten_rstd.to(at::kDouble),
-      c10::optional<at::Tensor>(aten_weight.to(at::kDouble)),
-      c10::optional<at::Tensor>(aten_bias.to(at::kDouble)),
-      {true, true, true});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      aten_inputs,
-      {std::get<0>(aten_gradients),
-       std::get<1>(aten_gradients),
-       std::get<2>(aten_gradients)},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormBackward_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-  const int64_t NORM_SIZE = 1024;
-  std::vector<int64_t> shape{8, 56, NORM_SIZE};
-  std::vector<int64_t> norm_shape{NORM_SIZE};
-
-  const size_t kM = shape.size();
-  const size_t kN = norm_shape.size();
-  const size_t kOuterNumDims = kM - kN;
-
-  std::vector<int64_t> outer_shape;
-  for (const auto idx : c10::irange(kOuterNumDims)) {
-    outer_shape.push_back(shape[idx]);
-  }
-  for (const auto idx : c10::irange(kOuterNumDims, kM)) {
-    outer_shape.push_back(1);
-  }
-
-  auto grad_out = makeContigTensor(shape.size());
-  auto input = makeContigTensor(shape.size());
-  auto rstd = makeConcreteTensor(outer_shape);
-  auto weight = makeContigTensor(norm_shape.size());
-  fusion.addInput(grad_out);
-  fusion.addInput(input);
-  fusion.addInput(rstd);
-  fusion.addInput(weight);
-
-  auto grads = rms_norm_backward(
-      grad_out, input, norm_shape, rstd, weight, {true, true});
-
-  fusion.addOutput(grads.grad_input);
-  fusion.addOutput(grads.grad_weight);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_grad_out = at::randn(shape, options);
-  at::Tensor aten_input = at::randn(shape, options);
-  at::Tensor aten_weight = at::randn(norm_shape, options);
-  auto at_weight = c10::optional<at::Tensor>(aten_weight);
-
-  const float kEps = 1e-6;
-  auto pow2 = at::pow(aten_input, 2);
-  auto sum = at::sum(pow2, -1, true);
-  auto var = at::mul(sum, 1.0 / NORM_SIZE);
-  auto aten_rstd = at::pow(at::add(var, kEps), -0.5);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  std::vector<IValue> aten_inputs = {
-      aten_grad_out, aten_input, aten_rstd, aten_weight};
-  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
-
-  auto in_mul_rstd = at::mul(aten_input, aten_rstd);
-  auto grad_out_mul = at::mul(aten_grad_out, in_mul_rstd);
-  auto aten_grad_weight = at::sum(grad_out_mul, c10::IntArrayRef{0, 1});
-  auto sum_loss1 = at::sum(at::mul(aten_grad_out, aten_weight), -1, true);
-  auto sum_loss2 = at::sum(
-      at::mul(
-          at::mul(at::mul(aten_grad_out, aten_weight), aten_input), aten_rstd),
-      -1,
-      true);
-
-  const float fH = NORM_SIZE;
-  auto term1 = at::mul(aten_rstd, 1.0 / fH);
-  auto aten_grad_input = at::mul(at::mul(aten_grad_out, fH), aten_weight);
-  aten_grad_input = at::sub(aten_grad_input, sum_loss1);
-  aten_grad_input = at::sub(
-      aten_grad_input, at::mul(at::mul(aten_input, aten_rstd), sum_loss2));
-  aten_grad_input = at::mul(aten_grad_input, term1);
-  testValidate(
-      &fusion,
-      cg_outputs,
-      aten_inputs,
-      {aten_grad_input, aten_grad_weight},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  const float kEps = 1e-5;
-  Double* eps_ptr = IrBuilder::create<Double>(kEps);
-
-  std::vector<int64_t> input_shape{20, 100, 35, 67};
-  std::vector<int64_t> norm_shape{67};
-
-  auto input = makeSymbolicTensor(input_shape.size());
-  fusion.addInput(input);
-
-  auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);
-
-  fusion.addOutput(result.output);
-  fusion.addOutput(result.mean);
-  fusion.addOutput(result.invstd);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn(input_shape, options);
-  c10::optional<at::Tensor> aten_weight = c10::nullopt;
-  c10::optional<at::Tensor> aten_bias = c10::nullopt;
-  auto aten_outputs = at::native_layer_norm(
-      aten_input, norm_shape, aten_weight, aten_bias, kEps);
-
-  // Check reduction axis is same for all reductions
-  // Generate Launch Parameters
-  auto reduction_params = getPersistentHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs({aten_input});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {std::get<0>(aten_outputs),
-       std::get<1>(aten_outputs),
-       std::get<2>(aten_outputs)},
-      __LINE__,
-      __FILE__,
-      "");
-}
-
-TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormalization_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  int64_t NORM_SIZE = 1024;
-  const float kEps = 1e-6;
-  Double* eps_ptr = IrBuilder::create<Double>(kEps);
-
-  std::vector<int64_t> input_shape{8, 56, NORM_SIZE};
-  std::vector<int64_t> norm_shape{NORM_SIZE};
-
-  auto input = makeContigTensor(input_shape.size());
-  fusion.addInput(input);
-  auto result = rms_norm(input, norm_shape, nullptr, eps_ptr);
-
-  fusion.addOutput(result.output);
-  fusion.addOutput(result.invstd);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn(input_shape, options);
-  c10::optional<at::Tensor> aten_weight = c10::nullopt;
-
-  auto pow2 = at::pow(aten_input, 2);
-
-  auto sum = at::sum(pow2, -1, true);
-  auto var = at::mul(sum, 1.0 / NORM_SIZE);
-  auto invstd = at::pow(at::add(var, kEps), -0.5);
-  auto output = at::mul(aten_input, invstd);
-  //// Check reduction axis is same for all reductions
-  //// Generate Launch Parameters
-  auto reduction_params = getPersistentHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs({aten_input});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {output, invstd},
-      __LINE__,
-      __FILE__,
-      "");
-}
-
-TEST_F(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) {
-  if (!deviceMajorMinorCheck(7)) {
-    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
-    return;
-  }
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-  const bool kTraining = true;
-  std::vector<int64_t> input_shape{20, 100, 35, 45};
-
-  auto input = makeSymbolicTensor(input_shape.size());
-  auto weight = makeSymbolicTensor(1);
-  auto bias = makeSymbolicTensor(1);
-  auto running_mean = makeSymbolicTensor(1);
-  auto running_var = makeSymbolicTensor(1);
-  fusion->addInput(input);
-  fusion->addInput(weight);
-  fusion->addInput(bias);
-  fusion->addInput(running_mean);
-  fusion->addInput(running_var);
-
-  Double* momentum = IrBuilder::create<Double>(kMomentum);
-  Double* eps = IrBuilder::create<Double>(kEps);
-
-  auto result = batch_norm(
-      input, weight, bias, running_mean, running_var, kTraining, momentum, eps);
-
-  fusion->addOutput(result.output);
-  fusion->addOutput(result.mean);
-  fusion->addOutput(result.invstd);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto at_input = at::randn(input_shape, options);
-  auto at_weight = at::ones({input_shape[1]}, options);
-  auto at_bias = at::zeros({input_shape[1]}, options);
-  auto at_run_mean = at::zeros({input_shape[1]}, options);
-  auto at_run_var = at::ones({input_shape[1]}, options);
-
-  std::vector<IValue> aten_inputs = {
-      at_input, at_weight, at_bias, at_run_mean, at_run_var};
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
-
-  auto aten_outputs = at::native_batch_norm(
-      at_input,
-      c10::optional<at::Tensor>(at_weight),
-      c10::optional<at::Tensor>(at_bias),
-      c10::optional<at::Tensor>(at_run_mean),
-      c10::optional<at::Tensor>(at_run_var),
-      kTraining,
-      kMomentum,
-      kEps);
-
-  testValidate(
-      executor_cache.fusion(),
-      cg_outputs,
-      aten_inputs,
-      {std::get<0>(aten_outputs),
-       std::get<1>(aten_outputs),
-       std::get<2>(aten_outputs)},
-      __LINE__,
-      __FILE__,
-      "");
-}
-
-TEST_F(NVFuserTest, FusionMagicSchedulerInstanceNormalization_CUDA) {
-  if (!deviceMajorMinorCheck(7)) {
-    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
-    return;
-  }
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-  const bool kUseInputStats = true;
-  std::vector<int64_t> input_shape{20, 100, 35, 45};
-
-  auto input = makeSymbolicTensor(input_shape.size());
-  auto weight = makeSymbolicTensor(1);
-  auto bias = makeSymbolicTensor(1);
-  auto running_mean = makeSymbolicTensor(1);
-  auto running_var = makeSymbolicTensor(1);
-  fusion->addInput(input);
-  fusion->addInput(weight);
-  fusion->addInput(bias);
-  fusion->addInput(running_mean);
-  fusion->addInput(running_var);
-
-  Double* momentum = IrBuilder::create<Double>(kMomentum);
-  Double* eps = IrBuilder::create<Double>(kEps);
-
-  auto result = instance_norm(
-      input,
-      weight,
-      bias,
-      running_mean,
-      running_var,
-      kUseInputStats,
-      momentum,
-      eps);
-
-  fusion->addOutput(result.output);
-  // fusion->addOutput(result.mean);
-  // fusion->addOutput(result.invstd);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto at_input = at::randn(input_shape, options);
-  auto at_weight = at::ones({input_shape[1]}, options);
-  auto at_bias = at::zeros({input_shape[1]}, options);
-  auto at_run_mean = at::zeros({input_shape[1]}, options);
-  auto at_run_var = at::ones({input_shape[1]}, options);
-
-  std::vector<IValue> aten_inputs = {
-      at_input, at_weight, at_bias, at_run_mean, at_run_var};
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
-  auto cg_outputs_full = {at_run_mean, at_run_var, cg_outputs[0]};
-
-  auto aten_outputs = at::instance_norm(
-      at_input,
-      c10::optional<at::Tensor>(at_weight),
-      c10::optional<at::Tensor>(at_bias),
-      c10::optional<at::Tensor>(at_run_mean),
-      c10::optional<at::Tensor>(at_run_var),
-      kUseInputStats,
-      kMomentum,
-      kEps,
-      false);
-
-  testValidate(
-      executor_cache.fusion(),
-      cg_outputs,
-      aten_inputs,
-      // TODO: can run_mean/run_var be checked here?
-      // fusion_outputs.size() == aten_outputs.size() && aten_outputs.size() ==
-      // fusion->outputs().size() - output_alias_indices.size()
-      {aten_outputs},
-      __LINE__,
-      __FILE__,
-      "");
-}
-
-TEST_F(NVFuserTest, FusionMagicSchedulerInstanceNormalizationBackward_CUDA) {
-  if (!deviceMajorMinorCheck(7)) {
-    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
-    return;
-  }
-  auto fusion_forward = std::make_unique<Fusion>();
-  FusionGuard fg_forward(fusion_forward.get());
-
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-  const bool kUseInputStats = true;
-  const bool channels_last = true;
-  const int B = 2;
-  const int C = 5;
-  const int S = 3;
-  std::vector<int64_t> input_shape{B, C, S, S, S};
-  // explicit channels-last for NVFuser
-  std::vector<int64_t> nvfuser_input_shape{B, S, S, S, C};
-
-  auto input = makeContigTensor(input_shape.size());
-  auto weight = makeContigTensor(1);
-  auto bias = makeContigTensor(1);
-  fusion_forward->addInput(input);
-  fusion_forward->addInput(weight);
-  fusion_forward->addInput(bias);
-
-  Double* momentum = IrBuilder::create<Double>(kMomentum);
-  Double* eps = IrBuilder::create<Double>(kEps);
-  auto result_forward = instance_norm(
-      input,
-      weight,
-      bias,
-      nullptr,
-      nullptr,
-      kUseInputStats,
-      momentum,
-      eps,
-      channels_last);
-  fusion_forward->addOutput(result_forward.output);
-  fusion_forward->addOutput(result_forward.mean);
-  fusion_forward->addOutput(result_forward.invstd);
-
-  FusionExecutorCache executor_cache_forward(std::move(fusion_forward));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto at_input = at::randn(input_shape, options)
-                      .to(at::MemoryFormat::ChannelsLast3d)
-                      .set_requires_grad(true);
-  auto at_input_nvfuser = at_input.clone().detach().permute({0, 2, 3, 4, 1});
-  auto at_weight = at::ones({input_shape[1]}, options).set_requires_grad(true);
-  auto at_weight_nvfuser = at_weight.clone().detach();
-  auto at_bias = at::zeros({input_shape[1]}, options).set_requires_grad(true);
-  auto at_bias_nvfuser = at_bias.clone().detach();
-  std::vector<torch::jit::IValue> aten_inputs_forward = {
-      at_input_nvfuser, at_weight_nvfuser, at_bias_nvfuser};
-  // out, mean, invstd
-  auto outputs_forward =
-      executor_cache_forward.runFusionWithInputs(aten_inputs_forward);
-  auto at_out = at::instance_norm(
-      at_input,
-      c10::optional<at::Tensor>(at_weight),
-      c10::optional<at::Tensor>(at_bias),
-      c10::optional<at::Tensor>(c10::nullopt),
-      c10::optional<at::Tensor>(c10::nullopt),
-      kUseInputStats,
-      kMomentum,
-      kEps,
-      false);
-  auto at_grad =
-      at::randn(input_shape, options).to(at::MemoryFormat::ChannelsLast3d);
-  auto at_grad_nvfuser = at_grad.clone().detach().permute({0, 2, 3, 4, 1});
-  at_out.backward(at_grad);
-  auto fusion_backward = std::make_unique<Fusion>();
-  FusionGuard fg_backward(fusion_backward.get());
-
-  input = makeContigTensor(input_shape.size());
-  auto grad_output = makeContigTensor(input_shape.size());
-  weight = makeContigTensor(1);
-  auto save_mean = makeContigTensor(2);
-  auto save_invstd = makeContigTensor(2);
-  auto dummy = makeContigTensor(0);
-
-  fusion_backward->addInput(input);
-  fusion_backward->addInput(grad_output);
-  fusion_backward->addInput(weight);
-  fusion_backward->addInput(dummy); // dummy for run_mean
-  fusion_backward->addInput(dummy); // dummy for run_var
-  fusion_backward->addInput(save_mean);
-  fusion_backward->addInput(save_invstd);
-
-  auto result_backward = instance_norm_backward(
-      input,
-      grad_output,
-      weight,
-      nullptr,
-      nullptr,
-      save_mean,
-      save_invstd,
-      kUseInputStats,
-      eps,
-      {true, true, true},
-      channels_last);
-
-  fusion_backward->addOutput(result_backward.grad_input);
-  fusion_backward->addOutput(result_backward.grad_weight);
-  fusion_backward->addOutput(result_backward.grad_bias);
-
-  FusionExecutorCache executor_cache_backward(std::move(fusion_backward));
-  std::vector<torch::jit::IValue> aten_inputs_backward = {
-      at_input_nvfuser,
-      at_grad_nvfuser,
-      at_weight_nvfuser,
-      at::empty({}),
-      at::empty({}),
-      outputs_forward[1],
-      outputs_forward[2]};
-  auto outputs_backward =
-      executor_cache_backward.runFusionWithInputs(aten_inputs_backward);
-  outputs_backward[0] = outputs_backward[0].permute({0, 4, 1, 2, 3});
-  testValidate(
-      executor_cache_backward.fusion(),
-      outputs_backward,
-      aten_inputs_backward,
-      {at_input.grad(), at_weight.grad(), at_bias.grad()},
-      __LINE__,
-      __FILE__,
-      "");
-}
-
-TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int pixels_per_thread = 64;
-  const int TIDX = 128;
-  const int static_size = pixels_per_thread * TIDX;
-
-  TensorView* sx = makeConcreteTensor({-1, static_size});
-  TensorView* dx = makeSymbolicTensor(2);
-  fusion.addInput(sx);
-  fusion.addInput(dx);
-
-  TensorView* max_sx = reductionOp(
-      BinaryOpType::Max,
-      {-1},
-      IrBuilder::create<Double>(std::numeric_limits<float>::lowest()),
-      sx); // (M)
-  TensorView* max_dx = reductionOp(
-      BinaryOpType::Max,
-      {-1},
-      IrBuilder::create<Double>(std::numeric_limits<float>::lowest()),
-      dx); // (M)
-
-  // Reduction => merge local and shared memory TensorViews
-  TensorView* max_val = binaryOp(BinaryOpType::Max, max_sx, max_dx);
-  TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B)
-
-  TensorView* sx_max_sub = sub(sx, bcast_max); // (M, N)
-  TensorView* dx_max_sub = sub(dx, bcast_max); // (M, N)
-
-  TensorView* sx_exp = unaryOp(UnaryOpType::Exp, sx_max_sub); // (M, N)
-  TensorView* dx_exp = unaryOp(UnaryOpType::Exp, dx_max_sub); // (M, N)
-
-  TensorView* sx_sum_exp = sum(sx_exp, {-1}); // (M, R)
-  TensorView* dx_sum_exp = sum(dx_exp, {-1}); // (M, R)
-
-  // Reduction => merge local and shared memory TensorViews
-  TensorView* sum_exp = binaryOp(BinaryOpType::Add, sx_sum_exp, dx_sum_exp);
-  TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B)
-
-  TensorView* sx_softmax = div(sx_exp, bcast_sum); // (M, N)
-  TensorView* dx_softmax = div(dx_exp, bcast_sum); // (M, N)
-  fusion.addOutput(sx_softmax);
-  fusion.addOutput(dx_softmax);
-
-  auto sx_cache = sx->cacheAfter();
-  auto dx_cache = dx->cacheAfter();
-  dx_cache->setMemoryType(MemoryType::Shared);
-  dx_exp->setMemoryType(MemoryType::Shared);
-
-  // Reduction and Broadcast Tensors common to both memory TVs
-  std::vector<TensorView*> common_tensors(
-      {max_val, sum_exp, bcast_max, bcast_sum});
-
-  // Static Local Memory TVs
-  std::vector<TensorView*> static_tensors(
-      {sx, sx_cache, max_sx, sx_max_sub, sx_exp, sx_sum_exp, sx_softmax});
-
-  // Dynamic Local Memory TVs
-  std::vector<TensorView*> dynamic_tensors(
-      {dx, dx_cache, max_dx, dx_max_sub, dx_exp, dx_sum_exp, dx_softmax});
-
-  std::vector<TensorView*> all_tensors;
-  all_tensors.insert(
-      all_tensors.end(), common_tensors.begin(), common_tensors.end());
-  all_tensors.insert(
-      all_tensors.end(), static_tensors.begin(), static_tensors.end());
-  all_tensors.insert(
-      all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end());
-
-  // M => M
-  // M, N => M, N/128, 128
-  for (auto tensor : all_tensors) {
-    if (tensor->nDims() > 1) {
-      tensor->split(-1, TIDX);
-    }
-  }
-
-  auto sx_sum_exp_rf = sx_sum_exp->rFactor({1});
-  auto dx_sum_exp_rf = dx_sum_exp->rFactor({1});
-  all_tensors.push_back(sx_sum_exp_rf);
-  all_tensors.push_back(dx_sum_exp_rf);
-
-  // computeAt
-  sx->computeAt(sx_max_sub, 1);
-  dx->computeAt(dx_max_sub, 1);
-
-  sx_exp->computeAt(sx_softmax, 1);
-  dx_exp->computeAt(dx_softmax, 1);
-
-  sx_max_sub->computeAt(sx_exp, 2);
-  dx_max_sub->computeAt(dx_exp, 2);
-
-  sx_softmax->axis(0)->parallelize(ParallelType::BIDx);
-  dx_softmax->axis(0)->parallelize(ParallelType::BIDx);
-  for (auto tensor : all_tensors) {
-    if (tensor->nDims() > 1) {
-      tensor->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  const int64_t dimx = 1024;
-  const int64_t dimy = 16384;
-
-  auto properties = at::cuda::getDeviceProperties(0);
-  // Require 70KB of smem to run test
-  const size_t required_smem_size = 70 << 10;
-  if (properties->sharedMemPerBlockOptin < required_smem_size) {
-    GTEST_SKIP() << "not enough shared memory space on device to run test";
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({dimx, dimy}, options);
-  at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size);
-  at::Tensor aten_dynamic_in =
-      aten_input.narrow(1, static_size, dimy - static_size);
-
-  at::Tensor out = at::zeros({dimx, dimy}, options);
-  at::Tensor cg_static_out = out.narrow(1, 0, static_size);
-  at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size);
-
-  std::vector<at::Tensor> aten_outputs;
-
-  auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false);
-  at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size);
-  at::Tensor aten_dynamic_out =
-      aten_output.narrow(1, static_size, dimy - static_size);
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_static_in, aten_dynamic_in});
-  fe.runFusion(
-      {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out});
-
-  testValidate(
-      &fusion,
-      {cg_static_out, cg_dynamic_out},
-      {aten_static_in, aten_dynamic_in},
-      {cg_static_out, cg_dynamic_out},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int pixels_per_thread = 64;
-  const int TIDX = 128;
-  const int static_size = pixels_per_thread * TIDX;
-
-  TensorView* sx = makeConcreteTensor({-1, static_size});
-  TensorView* dx = makeSymbolicTensor(2);
-  fusion.addInput(sx);
-  fusion.addInput(dx);
-
-  Double* gamma = IrBuilder::create<Double>();
-  Double* beta = IrBuilder::create<Double>();
-  Double* eps = IrBuilder::create<Double>();
-  Int* N = IrBuilder::create<Int>();
-  fusion.addInput(gamma);
-  fusion.addInput(beta);
-  fusion.addInput(eps);
-  fusion.addInput(N);
-
-  // Reduction
-  auto sx_sum = sum(sx, {-1}); // (M, R)
-  auto dx_sum = sum(dx, {-1}); // (M, R)
-  // Reduction => merge local and shared memory TensorViews
-  auto x_sum = binaryOp(BinaryOpType::Add, sx_sum, dx_sum);
-
-  // Broadcast
-  auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B)
-  // Pwise
-  auto x_mean = div(x_sum_bcast, N); // (M, B)
-
-  auto sx_mean_sub = sub(sx, x_mean); // (M, N)
-  auto dx_mean_sub = sub(dx, x_mean); // (M, N)
-
-  auto sx_mean_sub_pow = mul(sx_mean_sub, sx_mean_sub); // (M, N)
-  auto dx_mean_sub_pow = mul(dx_mean_sub, dx_mean_sub); // (M, N)
-
-  // Reduction
-  auto sx_var_sum = sum(sx_mean_sub_pow, {-1}); // (M, R)
-  auto dx_var_sum = sum(dx_mean_sub_pow, {-1}); // (M, R)
-  // Reduction => merge local and shared memory TensorViews
-  auto var_sum = binaryOp(BinaryOpType::Add, sx_var_sum, dx_var_sum);
-
-  // Broadcast
-  auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B)
-  // Pwise
-  auto var = div(var_sum_bcast, N); // (M, B)
-  auto var_eps = add(var, eps); // (M, B)
-  auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B)
-
-  auto sx_norm = mul(sx_mean_sub, rvar);
-  auto dx_norm = mul(dx_mean_sub, rvar);
-
-  auto sx_norm_gamma = mul(sx_norm, gamma);
-  auto dx_norm_gamma = mul(dx_norm, gamma);
-
-  auto sx_norm_gamma_beta = add(sx_norm_gamma, beta);
-  auto dx_norm_gamma_beta = add(dx_norm_gamma, beta);
-
-  fusion.addOutput(sx_norm_gamma_beta);
-  fusion.addOutput(dx_norm_gamma_beta);
-
-  sx_norm_gamma_beta->setContiguity(false);
-  dx_norm_gamma_beta->setContiguity(false);
-
-  // Read Input into Shared Memory
-  // Read Input minus Input_Mean into Shared Memory
-  auto sx_cache = sx->cacheAfter();
-  auto dx_cache = dx->cacheAfter();
-  dx_cache->setMemoryType(MemoryType::Shared);
-  dx_mean_sub->setMemoryType(MemoryType::Shared);
-
-  std::vector<TensorView*> common_tensors(
-      {x_sum, x_sum_bcast, x_mean, var_sum, var_sum_bcast, var, var_eps, rvar});
-
-  std::vector<TensorView*> static_tensors(
-      {sx,
-       sx_cache,
-       sx_sum,
-       sx_mean_sub,
-       sx_mean_sub_pow,
-       sx_var_sum,
-       sx_norm,
-       sx_norm_gamma,
-       sx_norm_gamma_beta});
-
-  std::vector<TensorView*> dynamic_tensors(
-      {dx,
-       dx_cache,
-       dx_sum,
-       dx_mean_sub,
-       dx_mean_sub_pow,
-       dx_var_sum,
-       dx_norm,
-       dx_norm_gamma,
-       dx_norm_gamma_beta});
-
-  std::vector<TensorView*> all_tensors;
-  all_tensors.insert(
-      all_tensors.end(), common_tensors.begin(), common_tensors.end());
-  all_tensors.insert(
-      all_tensors.end(), static_tensors.begin(), static_tensors.end());
-  all_tensors.insert(
-      all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end());
-
-  // M => M
-  // M, N => M, N/128, 128
-  for (auto tensor : all_tensors) {
-    if (tensor->nDims() > 1) {
-      tensor->split(-1, TIDX);
-    }
-  }
-
-  // Local Sum => Block Broadcast
-  TensorView* sx_sum_rf = sx_sum->rFactor({1});
-  TensorView* sx_var_sum_rf = sx_var_sum->rFactor({1});
-  TensorView* dx_sum_rf = dx_sum->rFactor({1});
-  TensorView* dx_var_sum_rf = dx_var_sum->rFactor({1});
-  all_tensors.push_back(sx_sum_rf);
-  all_tensors.push_back(sx_var_sum_rf);
-  all_tensors.push_back(dx_sum_rf);
-  all_tensors.push_back(dx_var_sum_rf);
-
-  // ComputeAt
-  sx->computeAt(sx_mean_sub_pow, 1);
-  dx->computeAt(dx_mean_sub_pow, 1);
-
-  var_sum->computeAt(rvar, 1);
-
-  sx_mean_sub_pow->computeAt(sx_var_sum_rf, 2);
-  dx_mean_sub_pow->computeAt(dx_var_sum_rf, 2);
-
-  sx_norm->computeAt(sx_norm_gamma_beta, 2);
-  dx_norm->computeAt(dx_norm_gamma_beta, 2);
-
-  sx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx);
-  dx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx);
-  for (auto tensor : all_tensors) {
-    if (tensor->nDims() > 1) {
-      tensor->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  const int dimx = 1024;
-  const int dimy = 16384;
-  const float kGamma = 1.0f;
-  const float kBeta = 0.0f;
-  const float kEps = 1e-5;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({dimx, dimy}, options);
-  at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size);
-  at::Tensor aten_dynamic_in =
-      aten_input.narrow(1, static_size, dimy - static_size);
-
-  at::Tensor out = at::zeros({dimx, dimy}, options);
-  at::Tensor cg_static_out = out.narrow(1, 0, static_size);
-  at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size);
-
-  std::vector<IValue> aten_inputs = {
-      aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy};
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-
-  auto properties = at::cuda::getDeviceProperties(0);
-  // Require 70KB of smem to run test
-  const size_t required_smem_size = 70 << 10;
-  if (properties->sharedMemPerBlockOptin < required_smem_size) {
-    GTEST_SKIP() << "not enough shared memory space on device to run test";
-  }
-
-  fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out});
-
-  auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1);
-  auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1);
-  auto at_rvar = at::rsqrt(at::add(at_var, kEps));
-  auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar);
-  auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta);
-  at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size);
-  at::Tensor aten_dynamic_out =
-      aten_output.narrow(1, static_size, dimy - static_size);
-
-  testValidate(
-      &fusion,
-      {cg_static_out, cg_dynamic_out},
-      aten_inputs,
-      {aten_static_out, aten_dynamic_out},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  auto x = makeSymbolicTensor(2);
-  Double* gamma = IrBuilder::create<Double>();
-  Double* beta = IrBuilder::create<Double>();
-  Double* eps = IrBuilder::create<Double>();
-  Int* N = IrBuilder::create<Int>();
-  fusion.addInput(x);
-  fusion.addInput(gamma);
-  fusion.addInput(beta);
-  fusion.addInput(eps);
-  fusion.addInput(N);
-
-  // Reduction
-  auto x_sum = sum(x, {-1}); // (M, R)
-  // Broadcast
-  auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B)
-  // Pwise
-  auto x_mean = div(x_sum_bcast, N); // (M, B)
-  auto x_mean_sub = sub(x, x_mean); // (M, N)
-  auto x_mean_sub_pow = mul(x_mean_sub, x_mean_sub); // (M, N)
-  // Reduction
-  auto var_sum = sum(x_mean_sub_pow, {-1}); // (M, R)
-  // Broadcast
-  auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B)
-  // Pwise
-  auto var = div(var_sum_bcast, N); // (M, B)
-  auto var_eps = add(var, eps); // (M, B)
-  auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B)
-  auto norm = mul(x_mean_sub, rvar);
-  auto norm_gamma = mul(norm, gamma);
-  auto norm_gamma_beta = add(norm_gamma, beta);
-  fusion.addOutput(norm_gamma_beta);
-
-  // Read Input into Shared Memory
-  // Read Input minus Input_Mean into Shared Memory
-  auto cache_x = x->cacheAfter();
-  cache_x->setMemoryType(MemoryType::Shared);
-  x_mean_sub->setMemoryType(MemoryType::Shared);
-
-  std::vector<TensorView*> all_tensors(
-      {x_sum,
-       x_mean,
-       cache_x,
-       x_sum_bcast,
-       x_mean_sub,
-       x_mean_sub_pow,
-       var_sum,
-       var_sum_bcast,
-       var,
-       var_eps,
-       rvar,
-       norm,
-       norm_gamma,
-       norm_gamma_beta});
-
-  auto tidx = IrBuilder::create<Int>();
-  fusion.addInput(tidx);
-
-  for (auto tensor : all_tensors) {
-    tensor->split(-1, tidx);
-  }
-
-  // Local Sum => Block Broadcast
-  TensorView* x_sum_rf = x_sum->rFactor({1});
-  TensorView* var_sum_rf = var_sum->rFactor({1});
-  all_tensors.push_back(x_sum_rf);
-  all_tensors.push_back(var_sum_rf);
-
-  // ComputeAt
-  x->computeAt(x_mean_sub_pow, 1);
-  var_sum->computeAt(rvar, 1);
-  x_mean_sub_pow->computeAt(var_sum_rf, 2);
-  norm->computeAt(norm_gamma_beta, 2);
-
-  for (auto tv : all_tensors) {
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  const int dimx = 128;
-  const int dimy = 2048;
-  const float kGamma = 1.0f;
-  const float kBeta = 0.0f;
-  const float kEps = 1e-5;
-  const int TIDX = 128;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({dimx, dimy}, options);
-  auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1);
-  auto at_var = at::var(aten_input.to(at::kDouble), -1).unsqueeze(1);
-  auto at_rvar = at::rsqrt(at::add(at_var, kEps));
-  auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar);
-  auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta);
-
-  std::vector<IValue> aten_inputs = {
-      aten_input, kGamma, kBeta, kEps, dimy, TIDX};
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  fusion.addInput(tv0);
-  fusion.addOutput(tv1);
-  // tv1[I0, R1] = tv0[I0, I1]
-
-  // Interface should just be a direct split with a Parallel type. We can
-  // include the parallelize call if we do this.
-  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
-  // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({2});
-  tv2->setMemoryType(MemoryType::Shared);
-  // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1]
-  // tv1[I0,        R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}]
-
-  tv0->computeAt(tv1, 1);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-
-  constexpr int numel_x = 65000, numel_y = 1024;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
-  auto aten_output = aten_input.to(at::kDouble).sum({1});
-
-  // How many threads to use for the block reduction
-  constexpr int runtime_threadIdx_dim = 128;
-
-  LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
-}
-
-TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Algorithm
-  Int* sym_bsx = IrBuilder::create<Int>();
-  TensorView* tv0 = makeSymbolicTensor(3); // M, K, N
-  fusion.addInput(tv0);
-  fusion.addInput(sym_bsx);
-
-  TensorView* tv1 = sum(tv0, {1}); // M, R, N
-  fusion.addOutput(tv1);
-
-  TensorView* tv2 = tv0->cacheAfter();
-  tv2->setMemoryType(MemoryType::Shared);
-
-  // Schedule
-  constexpr int BSX = 32;
-  tv1->split(2, BSX);
-  tv1->split(1, sym_bsx);
-  tv1->split(0, BSX);
-  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
-  tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}});
-  TensorView* tv3 = tv1->rFactor({-2});
-
-  tv0->computeAt(tv1, -2);
-  tv0->computeAt(tv3, -2);
-
-  // Thread and Block binding
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::BIDy);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  // Manual Binding
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 154, K = 45, N = 1524;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({M, K, N}, options);
-  at::Tensor aten_output = aten_input.to(at::kDouble).sum({1});
-
-  // How many threads to use for the block reduction
-  constexpr int runtime_threadIdx_dim = 128;
-
-  auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input, runtime_threadIdx_dim}, lparams);
-  auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input, runtime_threadIdx_dim},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-
-  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
-}
-
-TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  Int* sym_bsx = IrBuilder::create<Int>();
-  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
-  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
-  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
-  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
-  TensorView* tv4 = mul(tv2, tv3); // M, K, N
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addInput(sym_bsx);
-  fusion.addOutput(tv4);
-  // Algorithm
-
-  tv2->setMemoryType(MemoryType::Shared);
-  tv3->setMemoryType(MemoryType::Shared);
-
-  constexpr int BSX = 32;
-  tv4->split(2, BSX);
-  tv4->split(1, sym_bsx);
-  tv4->split(0, BSX);
-  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
-  tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}});
-  // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX
-
-  tv0->computeAt(tv4, 3);
-  tv1->computeAt(tv4, 3);
-  // Schedule
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(2)->parallelize(ParallelType::BIDy);
-  // Manual Binding
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  // Thread and Block binding
-
-  constexpr int M = 128, K = 457, N = 1024;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-  at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0));
-  std::vector<IValue> aten_inputs = {t0, t1, BSX};
-
-  LaunchParams lparams(-1, -1, -1, BSX, -1, -1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs, lparams);
-  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      aten_inputs,
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-
-  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
-}
-
-TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Symbolic integers we will use for runtime tiling
-  Int* symbolic_m_tile_dim = IrBuilder::create<Int>(); // bound to threadIdx.z
-  Int* symbolic_split_k_tile_dim =
-      IrBuilder::create<Int>(); // bound to blockIdx.x
-  Int* symbolic_block_k_tile_dim =
-      IrBuilder::create<Int>(); // bound to threadIdx.x
-  // Compile-time integer for tiling
-  int n_smem_tile = 8; // bound to threadIdx.y
-
-  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  // Broadcast tv0 to [M, K, *]
-  TensorView* tv2 = broadcast(tv0, {false, false, true});
-  // Broadcast tv1 to [*, K, N]
-  TensorView* tv3 = broadcast(tv1, {true, false, false});
-
-  // Pointwise multiplication resulting in tv3[M, K, N]
-  TensorView* tv4 = mul(tv2, tv3);
-
-  // Turn the K-dimension of tv4 into a reduction dimension
-  TensorView* tv5 = sum(tv4, {1});
-
-  // Register inputs and outputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv5);
-
-  // Register runtime tile dims as inputs
-  fusion.addInput(symbolic_m_tile_dim);
-  fusion.addInput(symbolic_split_k_tile_dim);
-  fusion.addInput(symbolic_block_k_tile_dim);
-
-  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
-  // dims are inserted
-  // [M, K, N]
-  tv5->split(2, n_smem_tile);
-  tv5->split(1, symbolic_block_k_tile_dim);
-  tv5->split(1, symbolic_split_k_tile_dim);
-  tv5->split(0, symbolic_m_tile_dim);
-  // [Mo, Mi, Koo, Koi, Ki, No, Ni]
-
-  // Reorder so all outer tiles are in the leftmost 3 positions
-  tv5->reorder({{1, 5}, {5, 1}});
-  // [Mo, No, Koo, Koi, Ki, Mi, Ni]
-
-  // Factor out the outer reduction IterDomain, then run the inter-cta
-  // reduction, and intra-cta reduction
-  auto tv6 = tv5->rFactor({2});
-  // [Mo, No, rKoo, rKoi, rKi, Mi, Ni]
-  // [Mo, No,       rKoi, rKi, Mi, Ni]
-
-  // Scope computations
-  tv6->computeAt(tv5, 2);
-  // [Mo, No, rKoo,  Koi,  Ki, Mi, Ni]
-  // [Mo, No,       rKoi, rKi, Mi, Ni]
-
-  // Setup compute at schedule
-  tv0->computeAt(tv6, 3);
-  tv1->computeAt(tv6, 3);
-  tv4->computeAt(tv6, -1);
-  //
-  // T2[Mo,  bNo, Koo, Koi,  Kii,  Mi, bNi] CA(4, 3)
-  // T3[bMo,  No, Koo, Koi,  Kii, bMi,  Ni] CA(4, 3)
-  // T4[ Mo,  No, Koo, Koi,  Kii,  Mi,  Ni]
-  // T6[ Mo,  No, rKoo, Koi, Kii,  Mi,  Ni]
-  // T5[ Mo,  No,      rKoi, rKii, Mi,  Ni]
-
-  // Cache smem tiles
-  tv2->setMemoryType(MemoryType::Shared);
-  tv3->setMemoryType(MemoryType::Shared);
-  tv4->setMemoryType(MemoryType::Local);
-  tv6->setMemoryType(MemoryType::Local);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDz);
-  tv5->axis(1)->parallelize(ParallelType::BIDy);
-
-  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6};
-  for (auto tv : tv_list) {
-    tv->axis(-2)->parallelize(ParallelType::TIDz);
-    tv->axis(-1)->parallelize(ParallelType::TIDy);
-  }
-  tv2->axis(3)->parallelize(ParallelType::TIDx);
-  tv3->axis(3)->parallelize(ParallelType::TIDx);
-  tv4->axis(3)->parallelize(ParallelType::TIDx);
-  tv6->axis(3)->parallelize(ParallelType::TIDx);
-  tv5->axis(2)->parallelize(ParallelType::TIDx);
-
-  tv2->axis(4)->parallelize(ParallelType::BIDx);
-  tv3->axis(4)->parallelize(ParallelType::BIDx);
-  tv4->axis(4)->parallelize(ParallelType::BIDx);
-  tv6->axis(4)->parallelize(ParallelType::BIDx);
-  tv5->axis(3)->parallelize(ParallelType::BIDx);
-
-  constexpr int M = 31, K = 65, N = 33;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-
-  // Runtime tiling
-  int m_tile = 4; // bound to threadIdx.z
-  int split_k = 7; // bound to blockIdx.x
-  int intra_cta = 8; // bound to threadIdx.x
-
-  std::vector<IValue> aten_inputs = {t0, t1, m_tile, split_k, intra_cta};
-  at::Tensor aten_output =
-      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
-
-  FusionExecutor fe;
-  // Generate CUDA and compile with nvRTC
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-
-  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
-}
-
-TEST_F(NVFuserTest, FusionGlobalIntermediate_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  fusion.addInput(tv0);
-  fusion.addOutput(tv1);
-  // tv1[I0, R1] = tv0[I0, I1]
-
-  // Interface should just be a direct split with a Parallel type. We can
-  // include the parallelize call if we do this.
-  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
-  // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1]
-
-  TensorView* tv2 = tv1->rFactor({2});
-  tv2->setMemoryType(MemoryType::Global);
-  // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1]
-  // tv1[I0,        R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}]
-
-  tv0->computeAt(tv1, 1);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-
-  constexpr int numel_x = 65000, numel_y = 1024;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-
-  // How many threads to use for the block reduction
-  constexpr int runtime_threadIdx_dim = 128;
-
-  auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input}, lparams);
-  auto cg_outputs = fe.runFusion({input}, lparams);
-
-  auto aten_output = input.to(at::kDouble).sum({1});
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  TensorView* tv2 = makeSymbolicTensor(2);
-  TensorView* tv3 = makeSymbolicTensor(2);
-  TensorView* tv4 = sub(tv2, tv3);
-  TensorView* tv5 = add(tv1, tv4);
-  TensorView* tv6 = sub(tv5, tv0);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addInput(tv2);
-  fusion.addInput(tv3);
-  fusion.addOutput(tv6);
-  // t6 = ((t1 + (t2 - t3)) - t0)
-
-  tv4->setMemoryType(MemoryType::Global);
-  tv5->setMemoryType(MemoryType::Global);
-  tv6->setMemoryType(MemoryType::Global);
-
-  constexpr int M = 32, N = 810;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M, N}, options);
-  at::Tensor t1 = at::randn({M, N}, options);
-  at::Tensor t2 = at::randn({M, N}, options);
-  at::Tensor t3 = at::randn({M, N}, options);
-
-  at::Tensor aten_output = (t1 + (t2 - t3)) - t0;
-
-  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1, t2, t3});
-  auto cg_outputs = fe.runFusion({t0, t1, t2, t3});
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionConstCheck_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto one = IrBuilder::create<Int>(1);
-  TORCH_CHECK(one->isConstScalar());
-
-  auto one_x2 = mul(one, one);
-  TORCH_CHECK(one_x2->isConstScalar());
-
-  auto one_x3 = mul(one_x2, one);
-  TORCH_CHECK(one_x3->isConstScalar());
-
-  auto one_x4 = mul(one_x3, one);
-  TORCH_CHECK(one_x4->isConstScalar());
-}
-
-TEST_F(NVFuserTest, FusionUnrollWithAlloc_CUDA) {
-  const std::vector<int64_t> tensor_dims_in = {128, 128};
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(0));
-  TensorView* tv2 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv1);
-  fusion.addOutput(tv2);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn(tensor_dims_in, options);
-  at::Tensor cg_output = at::empty({tensor_dims_in[0]}, options);
-
-  // Schedule
-  tv2->split(1, 32);
-  tv2->split(1, 4); // unroll
-
-  auto tv2_rf = tv2->rFactor({-3, -2});
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv2_rf->axis(0)->parallelize(ParallelType::BIDx);
-  tv2_rf->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2_rf->axis(-2)->parallelize(ParallelType::Unroll);
-
-  tv1->computeAt(tv2_rf, -1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto aten_output = (input + 0).to(at::kDouble).sum(1);
-
-  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Test isZeroInt
-TEST_F(NVFuserTest, FusionIsZeroInt_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  Int* x = IrBuilder::create<Int>(0);
-  Int* y = IrBuilder::create<Int>(1);
-  Val* z = mul(x, y);
-  TORCH_CHECK(x->isZeroInt());
-  TORCH_CHECK(!y->isZeroInt());
-  TORCH_CHECK(!z->isZeroInt());
-}
-
-// Test isOneInt
-TEST_F(NVFuserTest, FusionIsOneInt_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  Int* x = IrBuilder::create<Int>(1);
-  Int* y = IrBuilder::create<Int>(1);
-  Val* z = mul(x, y);
-  TORCH_CHECK(x->isOneInt());
-  TORCH_CHECK(y->isOneInt());
-  TORCH_CHECK(!z->isOneInt());
-}
-
-// This is to verify no cycle of computeAt is created. A more complex
-// variation of this pattern appears in one of the Python tests
-// (test_random_topo).
-TEST_F(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  // Common intermediate tensor
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  // tv1 -> tv2
-  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
-  // tv1 -> tv3 -> tv4
-  auto tv3 = add(tv1, IrBuilder::create<Double>(3));
-  auto tv4 = add(tv3, IrBuilder::create<Double>(4));
-
-  // NOTE: This should no longer occur as of PR #201.
-  // The order of adding outputs matters. If tv3 is added before tv4,
-  // it should be fine. However, if tv4 is added before tv3, there
-  // will be a cycle of tv3->tv4 and tv4->tv3. tv3->tv4 is created
-  // first, and then tv4->tv3 is created at the final phase of
-  // computeAt (ComputeAt::setupOutputs).
-  fusion.addOutput(tv2);
-  fusion.addOutput(tv4);
-  fusion.addOutput(tv3);
-
-  tv0->computeAt(tv2, -1);
-
-  TORCH_CHECK(tv3->hasComputeAt());
-  TORCH_CHECK(!tv4->hasComputeAt());
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn(100, options);
-
-  auto t1 = aten_input + 1;
-  auto t2 = t1 + 2;
-  auto t3 = t1 + 3;
-  auto t4 = t3 + 4;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  std::vector<at::Tensor> aten_outputs = {t2, t4, t3};
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTraversalOrder1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2));
-  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
-  TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4));
-
-  fusion.addOutput(tv2);
-  fusion.addOutput(tv3);
-  fusion.addOutput(tv4);
-
-  tv1->computeAt(tv3, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({10, 10}, options);
-
-  auto t1 = aten_input + 1;
-  auto t2 = aten_input + 2;
-  auto t3 = t1 + 3;
-  auto t4 = t1 + 4;
-
-  std::vector<at::Tensor> aten_outputs = {t2, t3, t4};
-
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTraversalOrder2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
-
-  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
-  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
-
-  TensorView* tv5 = add(tv1, tv3);
-
-  fusion.addOutput(tv2);
-  fusion.addOutput(tv4);
-  fusion.addOutput(tv5);
-
-  tv1->computeAt(tv5, -1);
-  tv3->computeAt(tv5, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({10, 10}, options);
-
-  auto t1 = aten_input + 1;
-  auto t2 = t1 + 2;
-  auto t3 = aten_input + 3;
-  auto t4 = t3 + 4;
-  auto t5 = t1 + t3;
-
-  std::vector<at::Tensor> aten_outputs = {t2, t4, t5};
-
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTraversalOrder3_CUDA) {
-  for (const auto i : c10::irange(2)) {
-    Fusion fusion;
-    FusionGuard fg(&fusion);
-
-    TensorView* tv0 = makeSymbolicTensor(1);
-    fusion.addInput(tv0);
-
-    TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-    TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
-
-    TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
-    TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
-
-    TensorView* tv5 = add(tv1, tv3);
-
-    fusion.addOutput(tv2);
-    fusion.addOutput(tv4);
-    fusion.addOutput(tv5);
-
-    const int tile = 32;
-
-    tv1->split(-1, tile);
-    tv2->split(-1, tile);
-    tv3->split(-1, tile);
-    tv4->split(-1, tile);
-    tv5->split(-1, tile);
-
-    auto compute_at_outer = tv1;
-    auto compute_at_inner = tv3;
-    if (i == 1) {
-      std::swap(compute_at_inner, compute_at_outer);
-    }
-
-    compute_at_outer->computeAt(tv5, -2);
-    compute_at_inner->computeAt(tv5, -1);
-
-    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-    at::Tensor aten_input = at::randn({100}, options);
-    auto t1 = aten_input + 1;
-    auto t2 = t1 + 2;
-    auto t3 = aten_input + 3;
-    auto t4 = t3 + 4;
-    auto t5 = t1 + t3;
-
-    std::vector<at::Tensor> aten_outputs = {t2, t4, t5};
-
-    std::vector<at::Tensor> cg_outputs = {
-        at::empty_like(aten_input, options),
-        at::empty_like(aten_input, options),
-        at::empty_like(aten_input, options)};
-
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {aten_input});
-    fe.runFusion({aten_input}, cg_outputs);
-
-    testValidate(
-        &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-  }
-}
-
-TEST_F(NVFuserTest, FusionTraversalOrder4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // First tree
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
-  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
-  fusion.addOutput(tv2);
-  fusion.addOutput(tv3);
-
-  // Second tree
-  TensorView* tv4 = makeSymbolicTensor(1);
-  fusion.addInput(tv4);
-  TensorView* tv5 = add(tv4, IrBuilder::create<Double>(5));
-  TensorView* tv6 = add(tv5, IrBuilder::create<Double>(6));
-  TensorView* tv7 = add(tv5, IrBuilder::create<Double>(7));
-  fusion.addOutput(tv6);
-  fusion.addOutput(tv7);
-
-  tv1->computeAt(tv2, -1);
-  tv5->computeAt(tv6, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({100}, options);
-  at::Tensor t4 = at::rand_like(t0, options);
-
-  auto t1 = t0 + 1;
-  auto t2 = t1 + 2;
-  auto t3 = t1 + 3;
-  auto t5 = t4 + 5;
-  auto t6 = t5 + 6;
-  auto t7 = t5 + 7;
-
-  std::vector<at::Tensor> aten_outputs = {t2, t3, t6, t7};
-  std::vector<IValue> aten_inputs = {t0, t4};
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(t0, options),
-      at::empty_like(t0, options),
-      at::empty_like(t0, options),
-      at::empty_like(t0, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  fe.runFusion(aten_inputs, cg_outputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTraversalOrder5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
-  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
-  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
-  TensorView* tv5 = add(tv2, tv4);
-
-  fusion.addOutput(tv1);
-  fusion.addOutput(tv3);
-  fusion.addOutput(tv5);
-
-  tv2->computeAt(tv5, -1);
-  tv4->computeAt(tv5, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({100}, options);
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options),
-      at::empty_like(aten_input, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-
-  auto t1 = aten_input + 1;
-  auto t2 = t1 + 2;
-  auto t3 = aten_input + 3;
-  auto t4 = t3 + 4;
-  auto t5 = t2 + t4;
-
-  std::vector<at::Tensor> aten_outputs = {t1, t3, t5};
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTraversalOrder6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2));
-  TensorView* tv3 = add(tv1, tv2);
-  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
-
-  fusion.addOutput(tv4);
-
-  tv1->split(0, 32);
-  tv2->split(0, 32);
-  tv3->split(0, 32);
-  tv4->split(0, 32);
-
-  tv3->computeAt(tv4, -2);
-  tv1->computeAt(tv3, -1);
-  tv2->computeAt(tv3, -2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({100}, options);
-
-  auto t1 = aten_input + 1;
-  auto t2 = aten_input + 2;
-  auto t3 = t1 + t2;
-  auto aten_output = t3 + 4;
-
-  at::Tensor cg_output = at::empty_like(aten_input, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTraversalOrder7_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
-  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
-  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
-  TensorView* tv5 = add(tv2, tv4);
-
-  fusion.addOutput(tv5);
-
-  TensorView* tvs[] = {tv1, tv2, tv3, tv4, tv5};
-  for (auto tv : tvs) {
-    tv->split(0, 2);
-    tv->split(0, 4);
-    tv->split(0, 8);
-  }
-
-  // computeAt into inner loop nests
-  tv1->computeAt(tv2, -1);
-  tv3->computeAt(tv4, -2);
-
-  tv2->computeAt(tv5, -4);
-  tv4->computeAt(tv5, -3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({100}, options);
-
-  auto t1 = aten_input + 1;
-  auto t2 = t1 + 2;
-  auto t3 = aten_input + 3;
-  auto t4 = t3 + 4;
-  auto aten_output = t2 + t4;
-
-  at::Tensor cg_output = at::empty_like(aten_input, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Test predication of grid reduction
-TEST_F(NVFuserTest, FusionThreadPredicate_CUDA) {
-  const int gdimx = 4;
-  const int bdimx = 128;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
-  TensorView* tv2 = unaryOp(UnaryOpType::Neg, tv1);
-  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(2));
-
-  fusion.addOutput(tv3);
-  fusion.addOutput(tv2);
-
-  tv1->split(1, bdimx);
-  tv1->split(1, gdimx);
-  tv3->split(1, bdimx);
-  tv3->split(1, gdimx);
-
-  TensorView* tv1_rf = tv1->rFactor({1});
-
-  tv1->computeAt(tv2, -1);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDy);
-  tv1_rf->axis(0)->parallelize(ParallelType::BIDy);
-  tv2->axis(0)->parallelize(ParallelType::BIDy);
-  tv1->axis(-2)->parallelize(ParallelType::BIDx);
-  tv1_rf->axis(-2)->parallelize(ParallelType::BIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(3)->parallelize(ParallelType::TIDx);
-  tv3->axis(2)->parallelize(ParallelType::BIDx);
-  tv3->axis(0)->parallelize(ParallelType::BIDy);
-
-  int numel_x = 100;
-  int numel_y = 1000;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
-
-  auto t2 = -aten_input.to(at::kDouble).sum({1});
-  auto t3 = aten_input + 2.0;
-
-  std::vector<at::Tensor> aten_outputs = {t3, t2};
-
-  std::vector<at::Tensor> cg_outputs = {
-      at::empty_like(aten_input, options), at::empty({numel_x}, options)};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, cg_outputs);
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionLSTMCell_CUDA) {
-  const int hidden_features = 512;
-  const int batch_size = 64;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tvs[16];
-  for (const auto i : c10::irange(16)) {
-    tvs[i] = makeSymbolicTensor(2);
-    fusion.addInput(tvs[i]);
-  }
-
-  auto ingate = unaryOp(
-      UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3]));
-
-  auto forgetgate = unaryOp(
-      UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7]));
-
-  auto cellgate = unaryOp(
-      UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11]));
-
-  auto outgate = unaryOp(
-      UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15]));
-
-  auto cx = makeContigTensor(2);
-  fusion.addInput(cx);
-
-  auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate));
-
-  auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy));
-
-  fusion.addOutput(cy);
-  fusion.addOutput(hy);
-
-  std::vector<c10::IValue> aten_inputs;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor large_tensor0 =
-      at::randn({batch_size, hidden_features * 4}, options);
-  at::Tensor large_tensor1 =
-      at::randn({batch_size, hidden_features * 4}, options);
-  at::Tensor large_tensor2 =
-      at::randn({batch_size, hidden_features * 4}, options);
-  at::Tensor large_tensor3 =
-      at::randn({batch_size, hidden_features * 4}, options);
-
-  auto chunked0 = large_tensor0.chunk(4, 1);
-  auto chunked1 = large_tensor1.chunk(4, 1);
-  auto chunked2 = large_tensor2.chunk(4, 1);
-  auto chunked3 = large_tensor3.chunk(4, 1);
-
-  aten_inputs.insert(aten_inputs.end(), chunked0.begin(), chunked0.end());
-  aten_inputs.insert(aten_inputs.end(), chunked1.begin(), chunked1.end());
-  aten_inputs.insert(aten_inputs.end(), chunked2.begin(), chunked2.end());
-  aten_inputs.insert(aten_inputs.end(), chunked3.begin(), chunked3.end());
-
-  auto at_ingate =
-      chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid();
-  auto at_forgetgate =
-      chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid();
-  auto at_cellgate =
-      chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh();
-  auto at_outgate =
-      chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid();
-
-  auto at_cx = at::randn({batch_size, hidden_features}, options);
-  aten_inputs.push_back(at_cx);
-  auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate));
-  auto at_hy = at_outgate.mul(at_cy.tanh());
-
-  auto lparams = schedulePointwise(&fusion, aten_inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs, lparams);
-  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {at_cy, at_hy}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionComputeAtMultiBCast_CUDA) {
-  if (at::cuda::getCurrentDeviceProperties()->major >= 8) {
-    GTEST_SKIP() << "Somehow it fails on sm_80+ GPUs"
-                 << " See https://github.com/pytorch/pytorch/issues/86717";
-  }
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
-  TensorView* tv2 = broadcast(tv1, {true, false});
-  TensorView* tv3 = broadcast(tv1, {false, true});
-  TensorView* tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  // Not possible to do computeAt at position -1 as recomputation
-  // would be required. An exception should be thrown.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(tv1->computeAt(tv3, -1));
-}
-
-TEST_F(NVFuserTest, FusionReductionHalf_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(3, DataType::Half);
-  fusion.addInput(tv0);
-
-  auto tv1 = castOp(DataType::Float, tv0);
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
-  auto tv3 = sum(tv2, {2});
-  auto tv4 = castOp(DataType::Half, tv3);
-
-  fusion.addOutput(tv4);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({8, 8, 16}, options);
-
-  auto reduction_tv = tv3;
-
-  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fusion, *reduction_params);
-
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-
-  auto lparams = reduction_params->lparams;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  // no broadcasting needed, omitting the last optional argument;
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-
-  auto aten_output = aten_input.add(1.0).to(at::kDouble).sum({2});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionReduceSingle_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeConcreteTensor({100, 1});
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  fusion.addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({100, 1}, options);
-
-  // Grab only tensor views, though there shouldn't be any other type
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  // no broadcasting needed, omitting the last optional argument;
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  auto aten_output = aten_input.to(at::kDouble).sum({1});
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) {
-  constexpr int bid_x = 80;
-  constexpr int tid_x = 4096;
-  constexpr int red_dim = 1;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = reductionOp(
-      BinaryOpType::Add, {red_dim, 2}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);
-
-  // Apply reduction heuristic
-  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fusion, *reduction_params);
-  auto lparams = reduction_params->lparams;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  // no broadcasting needed, omitting the last optional argument;
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-  auto aten_output = aten_input.to(at::kDouble).sum({red_dim, 2});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) {
-  constexpr int bid_x = 80;
-  constexpr int tid_x = 4096;
-  constexpr int red_dim = 1;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
-  fusion.addInput(tv0);
-
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv0);
-
-  TensorView* tv2 = reductionOp(
-      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv1);
-  fusion.addOutput(tv2);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);
-
-  // Apply reduction heuristic
-  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-
-  scheduleReduction(&fusion, *reduction_params);
-  auto lparams = reduction_params->lparams;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  // no broadcasting needed, omitting the last optional argument;
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-  auto aten_output = aten_input.to(at::kDouble).sum({1, 2});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) {
-  constexpr int bid_x = 80;
-  constexpr int tid_x = 4096;
-  constexpr int red_dim = 1;
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = reductionOp(
-      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0);
-
-  TensorView* tv2 =
-      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv1);
-  fusion.addOutput(tv2);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);
-
-  // Apply reduction heuristic
-  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fusion, *reduction_params);
-  auto lparams = reduction_params->lparams;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  // no broadcasting needed, omitting the last optional argument;
-  auto cg_outputs = fe.runFusion({aten_input}, lparams);
-  auto aten_output = aten_input.to(at::kDouble).sum({2, 1});
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {aten_input},
-      {aten_output},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionTrivialReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeConcreteTensor({10, 20, 1});
-  fusion.addInput(tv0);
-  TensorView* tv1 =
-      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv0);
-  fusion.addOutput(tv1);
-
-  TORCH_CHECK(
-      ir_utils::getReductionOps(&fusion, true /* ignore_trivial */).empty(),
-      "Trivial reduction picked up by fusion");
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({10, 20, 1}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-  auto aten_output = aten_input.to(at::kDouble).sum({2});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTrivialReduction2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int w = 1, x = 1, y = 7, z = 8;
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeConcreteTensor({w, x, y, z});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = sum(tv1, {0});
-  auto tv3 = sum(tv2, {0});
-  auto tv4 = add(tv3, tv0);
-
-  fusion.addOutput(tv4);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({y, z}, options);
-  at::Tensor t1 = at::randn({w, x, y, z}, options);
-  auto aten_output = t1.to(at::kDouble).sum({0}).sum({0}).add(t0);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  auto lparams = schedulePointwise(&fusion, aten_inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs, lparams);
-  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTrivialReduction3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int v = 1, w = 1, x = 1, y = 7, z = 8;
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeConcreteTensor({v, w, x, y, z});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = sum(tv1, {0, 1, 2});
-  auto tv3 = add(tv2, tv0);
-
-  fusion.addOutput(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({y, z}, options);
-  at::Tensor t1 = at::randn({v, w, x, y, z}, options);
-  auto aten_output = t1.sum({0, 1, 2}).add(t0);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  auto lparams = schedulePointwise(&fusion, aten_inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs, lparams);
-  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-// Make sure trivial reductions are correctly detected even with
-// scheduling applied.
-TEST_F(NVFuserTest, FusionDetectTrivialReduction1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {false, true});
-  auto tv2 = sum(tv1, {1});
-  fusion.addOutput(tv2);
-
-  tv2->split(1, 4);
-  tv2->split(1, 8);
-  auto tv3 = tv2->rFactor({-1});
-  auto tv4 = tv2->rFactor({-1});
-
-  auto tv5 = broadcast(tv0, {true, false});
-  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
-  auto tv7 = sub(tv6, IrBuilder::create<Double>(1));
-  auto tv8 = sum(tv7, {0});
-  fusion.addOutput(tv8);
-
-  auto tv9 = broadcast(tv0, {false, true, true});
-  auto tv10 = sum(tv9, {1});
-  auto tv11 = sum(tv10, {1});
-  fusion.addOutput(tv11);
-
-  tv8->split(0, 3);
-  tv10->split(1, 4);
-  tv11->split(1, 5);
-
-  tv0->computeAt(tv2, -1);
-  tv0->computeAt(tv8, -1);
-  tv0->computeAt(tv11, 1);
-
-  // Test indexing to gmem-backed tensors
-  tv3->setMemoryType(MemoryType::Global);
-  tv8->setMemoryType(MemoryType::Global);
-
-  GpuLower gpulw(&fusion);
-
-  // No ReductionOp should be generated as all the reduction
-  // exprs should be replaced with a unary set op.
-  for (const auto expr : gpulw.kernel()->as<Fusion>()->exprs()) {
-    TORCH_CHECK(!expr->isA<ReductionOp>());
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({100}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {t0, t0, t0}, __LINE__, __FILE__);
-}
-
-// Test detection of partially trivial reduction
-TEST_F(NVFuserTest, FusionDetectTrivialReduction2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  tv1->split(1, 1);
-  // tv1->axis(1): non-trivial
-  // tv1->axis(2): trivial
-
-  auto tv3 = tv1->rFactor({-1});
-
-  // Just to suppress register-allocation warning
-  tv0->computeAt(tv2, 1);
-  tv3->computeAt(tv1, -1);
-
-  GpuLower gpulw(&fusion);
-
-  // tv3's reduction axis is a trivial reduction. The only
-  // ReductionOp should be for tv1.
-  for (const auto expr : gpulw.kernel()->as<Fusion>()->exprs()) {
-    if (expr->isA<ReductionOp>()) {
-      auto reduction_out =
-          expr->as<ReductionOp>()->outputs()[0]->as<TensorView>();
-      TORCH_CHECK(reduction_out->name() == 1);
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionInputsIdLookup_CUDA) {
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({16, 8, 8}, options);
-  at::Tensor t1 = at::randn({8, 8}, options);
-  at::Tensor t2 = at::randn({6, 4}, options);
-
-  // create a cache with max size 2;
-  torch::jit::fuser::cuda::InputsIdLookup inputs_id_lookup(2);
-
-  // testing basic function, same encoding for identical inputs
-  auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0});
-  auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5});
-  TORCH_CHECK(id_0.id == id_0_lookup.id);
-  TORCH_CHECK(inputs_id_lookup.size() == 1);
-  TORCH_CHECK(id_0.eviction == false);
-
-  // new input (even tho same shape, but we have different signature because of
-  // missing scalar input
-  auto id_1 = inputs_id_lookup.lookupId({t0, t1});
-  auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1});
-  TORCH_CHECK(id_1.id == id_1_lookup.id);
-  TORCH_CHECK(inputs_id_lookup.size() == 2);
-  TORCH_CHECK(id_1.eviction == false);
-
-  // eviction should happen at this point
-  auto id_2 = inputs_id_lookup.lookupId({t2, t1});
-  TORCH_CHECK(id_2.id != id_0.id);
-  TORCH_CHECK(id_2.id != id_1.id);
-  TORCH_CHECK(inputs_id_lookup.size() == 2);
-  TORCH_CHECK(id_2.eviction == true);
-  TORCH_CHECK(id_2.evict_id == id_0.id);
-
-  // look at input 1 again
-  auto id_1_relook = inputs_id_lookup.lookupId({t0, t1});
-  TORCH_CHECK(id_1_relook.id == id_1.id);
-  TORCH_CHECK(id_1_relook.eviction == false);
-}
-
-TEST_F(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 8, 8});
-  std::vector<int64_t> strides_vec({64, 8, 1});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // pass with identical shape
-  auto t0 = at::randn({16, 8, 8}, options);
-  TORCH_CHECK(complyWith(t0, tensor_type));
-
-  // pass with dynamic shape
-  auto t1 = at::randn({16, 16, 8}, options);
-  TORCH_CHECK(complyWith(t1, tensor_type));
-
-  // broadcasting semantic change failure
-  auto t2 = at::randn({16, 1, 8}, options);
-  TORCH_CHECK(!complyWith(t2, tensor_type));
-
-  // contiguity failure via slicing
-  auto t3 = t0.slice(1, 0, 8, 2);
-  TORCH_CHECK(!complyWith(t3, tensor_type));
-
-  // contiguity failure via slicing
-  auto t4 = t0.slice(2, 0, 8, 2);
-  TORCH_CHECK(!complyWith(t4, tensor_type));
-
-  // rank failure
-  auto t5 = at::randn({16, 8, 8, 8}, options);
-  TORCH_CHECK(!complyWith(t5, tensor_type));
-
-  // contiguity on stride 1 dimension with implicit broadcasting
-  auto t = at::randn({4}, options);
-  auto t6 = t.unsqueeze(1).expand({4, 8});
-  TORCH_CHECK(complyWith(t6, TensorType::create(t6)));
-}
-
-TEST_F(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 1, 8});
-  std::vector<int64_t> strides_vec({8, 8, 1});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // broadcasting semantic change
-  auto t0 = at::randn({16, 8, 8}, options);
-  TORCH_CHECK(!complyWith(t0, tensor_type));
-
-  // dtype failure
-  auto t1 = at::randn({16, 1, 8}, options.dtype(at::kHalf));
-  TORCH_CHECK(!complyWith(t1, tensor_type));
-
-  // dtype failure
-  auto t2 = at::randn({16, 1, 8}, options);
-  TORCH_CHECK(complyWith(t2, tensor_type));
-
-  // device inconsistency shouldn't fail
-  auto t3 = at::randn({16, 1, 8}, options.device(at::kCPU, 0));
-  TORCH_CHECK(complyWith(t3, tensor_type));
-}
-
-TEST_F(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 8, 8});
-  std::vector<int64_t> strides_vec({64, 1, 8});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // failing permutation
-  auto t0 = at::randn({16, 8, 8}, options);
-  TORCH_CHECK(!complyWith(t0, tensor_type));
-
-  // passing with dynamic shape
-  auto t1 = t0.permute({0, 2, 1});
-  TORCH_CHECK(complyWith(t1, tensor_type));
-}
-
-TEST_F(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) {
-  std::vector<int64_t> sizes_vec({16, 8, 8});
-  std::vector<int64_t> strides_vec({128, 16, 1});
-  auto tensor_type = TensorType::create(
-      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // contiguity check passes although it differs
-  auto t0 = at::randn({16, 16, 8}, options);
-  TORCH_CHECK(complyWith(t0, tensor_type));
-
-  // passing with dynamic shape
-  auto t1 = t0.slice(1, 0, 16, 2);
-  TORCH_CHECK(complyWith(t1, tensor_type));
-}
-
-TEST_F(NVFuserTest, FusionDisjointSet_CUDA) {
-  DisjointSets<int> set;
-
-  const std::set<int> group_x({0, 1, 2});
-  const std::set<int> group_y({3, 4, 5});
-  const std::set<int> group_z({6, 7, 8});
-  const std::vector<std::set<int>> groups({group_x, group_y, group_z});
-  std::set<int> group_all;
-  std::for_each(groups.begin(), groups.end(), [&](const auto& g) {
-    group_all.insert(g.begin(), g.end());
-  });
-
-  // Initially, nothing should be considered equivalent
-  for (auto i : group_all) {
-    for (auto j : group_all) {
-      TORCH_CHECK(!set.permissiveAreMapped(i, j));
-    }
-  }
-
-  // Sets values in group_x are equivalent
-  for (auto i : group_x) {
-    for (auto j : group_x) {
-      set.mapEntries(i, j);
-      TORCH_CHECK(set.mappingExists(i));
-      TORCH_CHECK(set.mappingExists(j));
-    }
-  }
-
-  // All values in group_x shoudl be equivalent with each other
-  for (auto i : group_x) {
-    for (auto j : group_x) {
-      TORCH_CHECK(set.permissiveAreMapped(i, j));
-    }
-  }
-  // But nothing else should be equivalent
-  for (auto i : group_all) {
-    for (auto j : group_y) {
-      TORCH_CHECK(!set.permissiveAreMapped(i, j));
-    }
-    for (auto j : group_z) {
-      TORCH_CHECK(!set.permissiveAreMapped(i, j));
-    }
-  }
-
-  // Sets values in group_y are equivalent
-  for (auto i : group_y) {
-    for (auto j : group_y) {
-      set.mapEntries(i, j);
-      TORCH_CHECK(set.mappingExists(i));
-      TORCH_CHECK(set.mappingExists(j));
-    }
-  }
-
-  // group_x should be still equivalent
-  for (auto i : group_x) {
-    for (auto j : group_x) {
-      TORCH_CHECK(set.permissiveAreMapped(i, j));
-    }
-  }
-  // group_y should be now equivalent
-  for (auto i : group_y) {
-    for (auto j : group_y) {
-      TORCH_CHECK(set.permissiveAreMapped(i, j));
-    }
-  }
-  // But group_z should not be equivalent with anything yet
-  for (auto i : group_all) {
-    for (auto j : group_z) {
-      TORCH_CHECK(!set.permissiveAreMapped(i, j));
-    }
-  }
-
-  // Sets values in group_z are equivalent
-  for (auto i : group_z) {
-    for (auto j : group_z) {
-      set.mapEntries(i, j);
-      TORCH_CHECK(set.mappingExists(i));
-      TORCH_CHECK(set.mappingExists(j));
-    }
-  }
-
-  // Now each of the three groups should be equivalent within each
-  // group
-  for (const auto gi : c10::irange(groups.size())) {
-    for (const auto gj : c10::irange(groups.size())) {
-      for (auto i : groups[gi]) {
-        for (auto j : groups[gj]) {
-          TORCH_CHECK(
-              (gi == gj && set.permissiveAreMapped(i, j)) ||
-              (gi != gj && !set.permissiveAreMapped(i, j)));
-        }
-      }
-    }
-  }
-
-  std::vector<int> all_elements = set.getAllElements().vector();
-  std::sort(all_elements.begin(), all_elements.end());
-  std::vector<int> group_all_vec(group_all.begin(), group_all.end());
-  std::sort(group_all_vec.begin(), group_all_vec.end());
-  TORCH_CHECK(all_elements == group_all_vec);
-
-  set.clear();
-  TORCH_CHECK(set.getAllElements().vector().size() == 0);
-
-  // All cleared. Nothing should be considered equivalent.
-  for (auto i : group_all) {
-    for (auto j : group_all) {
-      TORCH_CHECK(!set.permissiveAreMapped(i, j));
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  auto tv1 = makeSymbolicTensor(2);
-  auto tv2 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addInput(tv2);
-
-  auto tv3 = broadcast(tv0, {false, true});
-  auto tv4 = add(tv3, tv1);
-  auto tv5 = add(tv3, tv2);
-
-  fusion.addOutput(tv4);
-  fusion.addOutput(tv5);
-
-  // In order to do this, tv1->axis(1) and tv2->axis(1) must have the
-  // same size, but we can't prove it, so this should throw an error.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(tv3->computeAt(tv4, -1));
-}
-
-TEST_F(NVFuserTest, FusionBiasGeluFwd_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const float k_079 = 0.79788456;
-  const float k_004 = 0.044715;
-
-  // bias vector
-  auto t0 = makeSymbolicTensor(1, DataType::Half);
-  fusion.addInput(t0);
-  auto t1 = castOp(DataType::Float, t0);
-  // input tensor
-  auto t2 = makeSymbolicTensor(3, DataType::Half);
-  fusion.addInput(t2);
-  auto t3 = castOp(DataType::Float, t2);
-  auto t4 = broadcast(t1, {true, true, false});
-  auto t5 = add(t4, t3);
-  auto t6 = mul(t5, IrBuilder::create<Double>(0.5));
-  auto t7 = mul(t5, IrBuilder::create<Double>(k_079));
-  auto t8 = mul(t5, IrBuilder::create<Double>(k_004));
-  auto t9 = mul(t8, t5);
-  auto t10 = add(t9, IrBuilder::create<Int>(1));
-  auto t11 = mul(t7, t10);
-  auto t12 = unaryOp(UnaryOpType::Tanh, t11);
-  auto t13 = add(t12, IrBuilder::create<Double>(1));
-  auto t14 = mul(t6, t13);
-  auto t15 = castOp(DataType::Half, t14);
-  fusion.addOutput(t15);
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  std::vector<int64_t> input_shape{6, 512, 4096};
-  std::vector<int64_t> bias_shape{4096};
-
-  auto at_input = at::randn(input_shape, options);
-  auto at_bias = at::randn(bias_shape, options);
-
-  auto at_x =
-      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
-  auto aten_output_float =
-      at_x * 0.5 * (1.0 + (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh());
-  auto aten_output = aten_output_float.to(c10::ScalarType::Half);
-
-  std::vector<IValue> aten_inputs = {at_bias, at_input};
-  auto lparams = schedulePointwise(&fusion, aten_inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs, lparams);
-  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBiasGeluBwd_CUDA) {
-  if (at::cuda::getDeviceProperties(0)->major < 6) {
-    return;
-  }
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const float k_079 = 0.79788456;
-  const float k_004 = 0.044715;
-  const float k_010 = 0.1070322243;
-
-  // gradient tensor
-  auto t0 = makeSymbolicTensor(3, DataType::Half);
-  fusion.addInput(t0);
-  auto t1 = castOp(DataType::Float, t0);
-  // bias tensor
-  auto t2 = makeSymbolicTensor(1, DataType::Half);
-  fusion.addInput(t2);
-  auto t3 = castOp(DataType::Float, t2);
-  // input tensor
-  auto t4 = makeSymbolicTensor(3, DataType::Half);
-  fusion.addInput(t4);
-  auto t5 = castOp(DataType::Float, t4);
-  auto t6 = broadcast(t3, {true, true, false});
-  auto t7 = add(t6, t5);
-  auto t8 = mul(t7, IrBuilder::create<Double>(k_079));
-  auto t9 = mul(t7, IrBuilder::create<Double>(k_004));
-  auto t10 = mul(t9, t7);
-  auto t11 = add(t10, IrBuilder::create<Int>(1));
-  auto t12 = mul(t8, t11);
-  auto t13 = unaryOp(UnaryOpType::Tanh, t12);
-  auto t14 = mul(t7, IrBuilder::create<Double>(0.5));
-  auto t15 = mul(t13, t13);
-  auto t16 = unaryOp(UnaryOpType::Neg, t15);
-  auto t17 = add(t16, IrBuilder::create<Int>(1));
-  auto t18 = mul(t7, IrBuilder::create<Double>(k_010));
-  auto t19 = mul(t18, t7);
-  auto t20 = add(t19, IrBuilder::create<Double>(k_079));
-  auto t21 = mul(t17, t20);
-  auto t22 = mul(t14, t21);
-  auto t23 = add(t13, IrBuilder::create<Int>(1));
-  auto t24 = mul(t23, IrBuilder::create<Double>(0.5));
-  auto t25 = add(t22, t24);
-  auto t26 = mul(t25, t1);
-  // Save float output for validation
-  fusion.addOutput(t26);
-  auto t27 = castOp(DataType::Half, t26);
-  fusion.addOutput(t27);
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::manual_seed(1);
-  std::vector<int64_t> input_shape{6, 512, 4096};
-  std::vector<int64_t> bias_shape{4096};
-  auto at_input = at::randn(input_shape, options);
-  auto at_bias = at::randn(bias_shape, options);
-  auto at_grad = at::randn(input_shape, options);
-
-  auto at_x =
-      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
-  auto at_tanh_out = (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh();
-  auto at_ff = 0.5 * at_x *
-          ((1 - at_tanh_out * at_tanh_out) * (k_079 + k_010 * at_x * at_x)) +
-      0.5 * (1 + at_tanh_out);
-  auto at_out = at_ff * at_grad;
-  auto at_out_half = at_out.to(c10::ScalarType::Half);
-
-  std::vector<IValue> aten_inputs = {at_grad, at_bias, at_input};
-  std::vector<at::Tensor> aten_outputs = {at_out, at_out_half};
-
-  auto lparams = schedulePointwise(&fusion, aten_inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs, lparams);
-  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-// Reproducer of issue #459
-TEST_F(NVFuserTest, FusionIssue459_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv3 = broadcast(tv2, {true, false});
-  auto tv4 = add(tv1, tv3);
-
-  // Create two outputs from the final arithmetic result
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-  auto tv6 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv6);
-
-  // Scheduling
-  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
-    output->merge(-2, -1);
-  }
-  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
-    output->split(0, 128);
-  }
-
-  tv0->computeAt(tv5, -1);
-
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-  tv6->axis(1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  const int numel_x = 10;
-  const int numel_y = 20;
-  auto t0 = at::randn({numel_x}, options);
-  auto t1 = at::randn({numel_y, numel_x}, options);
-  auto aten_output = (t0 + 1).unsqueeze(0) + t1 + 1;
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      aten_inputs,
-      {aten_output, aten_output},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSmemIndexingSimple_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv3);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv0->computeAt(tv3, -1);
-
-  tv1->setMemoryType(MemoryType::Shared);
-  tv2->setMemoryType(MemoryType::Global);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  auto aten_input = at::randn({12, 34}, options);
-  at::Tensor aten_output = aten_input + 1.0 + 1.0 + 1.0;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSmemIndexing_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Symbolic integers we will use for runtime tiling
-  Int* symbolic_m_tile_dim = IrBuilder::create<Int>();
-  Int* symbolic_split_k_tile_dim = IrBuilder::create<Int>();
-  Int* symbolic_block_k_tile_dim = IrBuilder::create<Int>();
-  // Compile-time integer for tiling
-  int n_smem_tile = 32;
-
-  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  // Broadcast tv0 to [M, K, *]
-  TensorView* tv2 = broadcast(tv0, {false, false, true});
-  // Broadcast tv1 to [*, K, N]
-  TensorView* tv3 = broadcast(tv1, {true, false, false});
-
-  // Pointwise multiplication resulting in tv3[M, K, N]
-  TensorView* tv4 = mul(tv2, tv3);
-
-  // Sum the K-dim
-  TensorView* tv5 = sum(tv4, {1});
-
-  // Register inputs and outputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv5);
-
-  // Register runtime tile dims as inputs
-  fusion.addInput(symbolic_m_tile_dim);
-  fusion.addInput(symbolic_split_k_tile_dim);
-  fusion.addInput(symbolic_block_k_tile_dim);
-
-  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
-  // dims are inserted
-  // [M, rK, N]
-  tv5->split(2, n_smem_tile);
-  // [M, rK, No, Ni{32}]
-  tv5->split(1, symbolic_block_k_tile_dim);
-  // [M, rKo, rKi{i2}, No, Ni{32}]
-  tv5->split(1, symbolic_split_k_tile_dim);
-  // [M, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
-  tv5->split(0, symbolic_m_tile_dim);
-  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
-
-  // Reorder so all outer tiles are in the leftmost 3 positions
-  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2},     No, Ni{32}]
-  // [Mo,     No, rKoo, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
-  tv5->reorder({{1, 5}, {5, 1}});
-
-  // Factor out the outer reduction IterDomain, then run the inter-cta
-  // reduction, and intra-cta reduction
-  // [Mo, No, rKoo,  Koi{i1},  Ki{i2}, Mi{i0}, Ni{32}]
-  // [Mo, No,       rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
-  auto tv6 = tv5->rFactor({2});
-
-  // Scope computations
-  tv6->computeAt(tv5, 2);
-
-  // [Mo, No, rKoo, Koi{i1},  Ki{i2}, Mi{i0}, Ni{32}]
-  // [Mo, No, Ki{i2}, Mi{i0}, Ni{32}, rKoo, Koi{i1}]
-  tv6->reorder({
-      {5, -2},
-      {6, -1},
-      {2, 2},
-      {3, 3},
-      {4, 4},
-  });
-
-  // Setup compute at schedule
-  tv0->computeAt(tv6, 3);
-  tv1->computeAt(tv6, 3);
-  tv4->computeAt(tv6, -1);
-
-  // Cache smem tiles
-  tv2->setMemoryType(MemoryType::Shared);
-  tv3->setMemoryType(MemoryType::Shared);
-  tv4->setMemoryType(MemoryType::Shared);
-  tv6->setMemoryType(MemoryType::Shared);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDz);
-  tv5->axis(1)->parallelize(ParallelType::BIDy);
-
-  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6};
-  for (auto tv : tv_list) {
-    tv->axis(-2)->parallelize(ParallelType::TIDz);
-    tv->axis(-1)->parallelize(ParallelType::TIDy);
-  }
-
-  constexpr int M = 31, K = 65, N = 32;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-
-  at::Tensor aten_output =
-      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
-
-  // A, B, m_tile_dim, split_k, intra_cta_tile
-  std::vector<IValue> aten_inputs = {t0, t1, 3, 4, 5};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-// Reproducer of issue 408
-TEST_F(NVFuserTest, FusionCacheBeforeReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {1});
-  fusion.addOutput(tv2);
-
-  tv2->split(0, 4);
-
-  auto tv3 = tv2->cacheBefore();
-
-  tv0->computeAt(tv3, -1);
-  tv3->computeAt(tv2, -1);
-
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  const int numel_x = 100;
-  const int numel_y = 200;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
-  at::Tensor cg_output = at::empty({numel_x}, options);
-
-  auto aten_output = (aten_input + 1).to(at::kDouble).sum({1});
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  fe.runFusion({aten_input}, {cg_output});
-
-  testValidate(
-      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionCacheBeforeReduction2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(3);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {1});
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-  fusion.addOutput(tv3);
-
-  auto tv4 = tv2->cacheBefore();
-
-  tv4->computeAt(tv3, 1);
-  tv0->computeAt(tv4, -1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-
-  const int numel_x = 10;
-  const int numel_y = 20;
-  const int numel_z = 30;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options);
-  auto t2 = (aten_input + 1).to(at::kDouble).sum({1});
-  auto t3 = t2 + 1;
-  std::vector<at::Tensor> aten_outputs = {t2, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue367_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Symbolic integers we will use for runtime tiling
-  Int* symbolic_m_tile_dim = IrBuilder::create<Int>();
-  Int* symbolic_split_k_tile_dim = IrBuilder::create<Int>();
-  Int* symbolic_block_k_tile_dim = IrBuilder::create<Int>();
-  // Compile-time integer for tiling
-  int n_smem_tile = 32;
-
-  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  // Broadcast tv0 to [M, K, *]
-  TensorView* tv2 = broadcast(tv0, {false, false, true});
-  // Broadcast tv1 to [*, K, N]
-  TensorView* tv3 = broadcast(tv1, {true, false, false});
-
-  // Pointwise multiplication resulting in tv3[M, K, N]
-  TensorView* tv4 = mul(tv2, tv3);
-
-  // Sum the K-dim
-  TensorView* tv5 = sum(tv4, {1});
-
-  // Register inputs and outputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv5);
-
-  // Register runtime tile dims as inputs
-  fusion.addInput(symbolic_m_tile_dim);
-  fusion.addInput(symbolic_split_k_tile_dim);
-  fusion.addInput(symbolic_block_k_tile_dim);
-
-  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
-  // dims are inserted
-  // [M, K, N]
-  tv5->split(2, n_smem_tile);
-  tv5->split(1, symbolic_block_k_tile_dim);
-  tv5->split(1, symbolic_split_k_tile_dim);
-  tv5->split(0, symbolic_m_tile_dim);
-  // [Mo, Mi, Koo, Koi, Ki, No, Ni]
-  tv5->reorder({{1, 5}, {5, 1}});
-  // [Mo, No, Koo, Koi, Ki, Mi, Ni]
-
-  auto tv6 = tv5->rFactor({2});
-  auto tv7 = tv5->rFactor({2});
-  // [Mo, No, rKoo,  Koi,  Ki, Mi, Ni]
-  // [Mo, No,       rKoi, rKi, Mi, Ni]
-
-  // Scope computations
-  tv6->computeAt(tv5, 2);
-
-  tv0->computeAt(tv6, 3);
-  tv1->computeAt(tv6, 3);
-  tv4->computeAt(tv6, -1);
-
-  // Cache smem tiles
-  tv2->setMemoryType(MemoryType::Shared);
-  tv3->setMemoryType(MemoryType::Shared);
-  tv4->setMemoryType(MemoryType::Local);
-  tv6->setMemoryType(MemoryType::Local);
-  tv7->setMemoryType(MemoryType::Local);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDz);
-  tv5->axis(1)->parallelize(ParallelType::BIDy);
-
-  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6, tv7};
-  for (auto tv : tv_list) {
-    tv->axis(-2)->parallelize(ParallelType::TIDz);
-    tv->axis(-1)->parallelize(ParallelType::TIDy);
-  }
-  tv2->axis(3)->parallelize(ParallelType::TIDx);
-  tv3->axis(3)->parallelize(ParallelType::TIDx);
-  tv4->axis(3)->parallelize(ParallelType::TIDx);
-  tv6->axis(3)->parallelize(ParallelType::TIDx);
-  tv7->axis(2)->parallelize(ParallelType::TIDx);
-
-  tv2->axis(4)->parallelize(ParallelType::BIDx);
-  tv3->axis(4)->parallelize(ParallelType::BIDx);
-  tv4->axis(4)->parallelize(ParallelType::BIDx);
-  tv6->axis(4)->parallelize(ParallelType::BIDx);
-  tv7->axis(3)->parallelize(ParallelType::BIDx);
-  tv5->axis(2)->parallelize(ParallelType::BIDx);
-
-  constexpr int M = 3, K = 6, N = 16;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-
-  // A, B, m, split_k, block_k
-  std::vector<IValue> aten_inputs = {t0, t1, 2, 2, 3};
-  at::Tensor aten_output =
-      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue468_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = sum(tv1, {0});
-  fusion.addOutput(tv2);
-
-  tv1->axis(0)->parallelize(ParallelType::TIDy);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv2->axis(0)->parallelize(ParallelType::TIDy);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({10, 100}, options);
-  at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}).sum({0});
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue363_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(2);
-
-  // Broadcast tv0 to [M, K, *]
-  TensorView* tv2 = broadcast(tv0, {false, false, true});
-  // Broadcast tv1 to [*, K, N]
-  TensorView* tv3 = broadcast(tv1, {true, false, false});
-
-  // Pointwise multiplication resulting in tv3[M, K, N]
-  TensorView* tv4 = mul(tv2, tv3);
-
-  // Sum the K-dim
-  TensorView* tv5 = sum(tv4, {1});
-
-  // Register inputs and outputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv5);
-
-  tv2->setMemoryType(MemoryType::Global);
-  tv3->setMemoryType(MemoryType::Global);
-  tv4->setMemoryType(MemoryType::Global);
-
-  tv0->computeAt(tv5, -1);
-  tv1->computeAt(tv5, -1);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDz);
-  tv5->axis(1)->parallelize(ParallelType::BIDy);
-
-  tv5->axis(2)->parallelize(ParallelType::BIDx);
-
-  constexpr int M = 3, K = 6, N = 16;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-  at::Tensor aten_output =
-      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue484_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = add(tv1, IrBuilder::create<Double>(0));
-  fusion.addOutput(tv2);
-
-  tv1->setMemoryType(MemoryType::Global);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 100;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({M, M}, options);
-  at::Tensor aten_output = aten_input.to(at::kDouble).sum({1});
-
-  torch::jit::fuser::cuda::FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue329_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {1});
-  fusion.addOutput(tv2);
-  auto tv3 = sum(tv1, {1});
-  fusion.addOutput(tv3);
-
-  tv1->computeAt(tv2, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  std::vector<int64_t> t0_shape{17, 19};
-  auto aten_input = at::randn(t0_shape, options);
-  auto t2 = (aten_input + 1).to(at::kDouble).sum({1});
-  auto t3 = (aten_input + 1).to(at::kDouble).sum({1});
-  std::vector<at::Tensor> aten_outputs = {t2, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue382_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = broadcast(tv1, {false, false, true});
-  auto tv3 = makeSymbolicTensor(3);
-  fusion.addInput(tv3);
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  tv2->merge(1);
-  tv4->merge(1);
-
-  tv1->computeAt(tv4, 1);
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv1->setMemoryType(MemoryType::Global);
-  tv2->setMemoryType(MemoryType::Global);
-
-  const int numel_x = 12;
-  const int numel_y = 34;
-  const int numel_z = 56;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({numel_x, numel_y}, options);
-  auto t3 = at::randn({numel_x, numel_y, numel_z}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t3};
-  auto aten_output = (t0 + 1).unsqueeze(-1) + t3;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue507_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  std::vector<int64_t> t0_shape{17, 19};
-  auto aten_input = at::randn(t0_shape, options);
-  auto t1 = (aten_input + 1);
-  auto aten_output = (t1 + 1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue532_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Algorithm
-  TensorView* tv0 = makeSymbolicTensor(1);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addInput(tv0);
-  fusion.addOutput(tv2);
-
-  const int M_BLOCK = 64;
-  const int M_THREAD = 4;
-
-  tv2->split(0, M_BLOCK);
-  // tv2: [M/M_BLOCK, M_BLOCK]
-  tv1->computeAt(tv2, 1);
-  // tv1: [M/M_BLOCK, M_BLOCK]
-
-  tv1->split(-1, M_BLOCK / M_THREAD);
-  // tv1: [M/M_BLOCK, M_THREAD, M_BLOCK / M_THREAD]
-
-  tv2->split(-1, M_THREAD);
-  // tv2: [M/M_BLOCK, M_BLOCK / M_THREAD, M_THREAD]
-
-  constexpr int M = 1000;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  at::Tensor aten_output = t0 + 1 + 1;
-
-  testValidate(
-      &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionLoopUnswitch_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Algorithm
-  TensorView* tv0 = makeSymbolicTensor(1);
-  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addInput(tv0);
-  fusion.addOutput(tv2);
-
-  tv2->split(0, 32);
-  tv1->computeAt(tv2, -1);
-
-  tv2->axis(1)->parallelize(ParallelType::Unswitch);
-
-  constexpr int M = 1000;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  at::Tensor aten_output = t0 + 1 + 1;
-
-  testValidate(
-      &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue549_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2); // M, K
-  TensorView* tv1 = makeSymbolicTensor(2); // K, N
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-
-  TensorView* tv3 = broadcast(tv2, {false, false, true});
-  // tv3[I0, I1, B] = tv0[I0, I1]
-
-  TensorView* tv4 = broadcast(tv1, {true, false, false});
-  // tv4[B, I1, I2] = tv1[I1, I2]
-
-  // tv5[I0, I1, I2] = tv3[I0, I1, B] * tv4[B, I1, I2]
-  TensorView* tv5 = mul(tv3, tv4);
-  // tv6[I0, R1, I2] = tv5[I0, I1, I2]
-  TensorView* tv6 = sum(tv5, {1});
-  fusion.addOutput(tv6);
-
-  tv6->split(1, 32);
-  // tv6[I0, R1o, R1i{32}, I2]
-
-  auto tv7 = tv6->rFactor({1});
-  // tv7[I0, R1o, I1i{32}, I2] = tv5[I0, I1, I2]
-  // tv6[I0,    , R1i{32}, I2] = tv7[I0, R1o, I1i{32}, I2]
-
-  tv6->split(0, 4);
-  tv6->split(-1, 4);
-  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
-  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
-
-  tv0->computeAt(tv6, -1);
-  tv1->computeAt(tv6, -1);
-
-  // tv7[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
-  // tv6[I0o, I0i{4},    , R1i{32}, I2o, I2i{4}]
-  //--> (line symbolizes compute at location)
-  // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
-  // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
-  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
-
-  tv0->computeAt(tv7, -1);
-  tv1->computeAt(tv7, -1);
-  // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
-  // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
-  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
-
-  tv6->axis(0)->parallelize(ParallelType::BIDz);
-  tv6->axis(1)->parallelize(ParallelType::TIDz);
-
-  tv6->axis(-2)->parallelize(ParallelType::BIDy);
-  tv6->axis(-1)->parallelize(ParallelType::TIDy);
-
-  tv6->axis(2)->parallelize(ParallelType::TIDx);
-  tv7->axis(2)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 65, K = 33, N = 17;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-
-  // Lets specify a few bounds in launch params to make sure it works
-  LaunchParams lparams(1, -1, -1, 32, 4, 4);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1}, lparams);
-  fe.runFusion({t0, t1}, lparams);
-
-  // Make sure bad launch params throws
-  // TODO: Re-enable once we have parallelization validation in.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6)));
-
-  // Don't specify any launch params
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble));
-
-  testValidate(
-      &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSimpleCompileRtc_CUDA) {
-  FusionExecutor fe;
-  std::string kernel = R"(
-__global__ void kernel1(Tensor<float, 1> T0, Tensor<float, 1> T1) {
-  if(threadIdx.x==0){
-    for(size_t ki28 = 0; ki28 < T0.size[0]; ++ki28) {
-      T1[ki28*T1.stride[0]] = T0[ki28*T0.stride[0]]*2;
-    }
-  }
-}
-    )";
-  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
-  LaunchParams lp(
-      256, // gdimx
-      1, // gdimy
-      1, // gdimz
-      1, // bdimx
-      1, // bdimy
-      1 // bdimz
-  );
-  lp.setSmem(0);
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const std::vector<int64_t> tensor_dims = {8};
-  auto in0 = at::randn(tensor_dims, options);
-  auto out0 = at::empty_like(in0);
-  fe.runRtc(lp, {in0, out0});
-
-  auto out_ref = in0 * 2;
-  TORCH_CHECK(out_ref.allclose(out0));
-}
-
-TEST_F(NVFuserTest, FusionSerialWelford_CUDA) {
-  FusionExecutor fe;
-  int x = 128, y = 64, z = 64;
-
-  std::string kernel = R"(
-__global__ void kernel1(
-    Tensor<float,3> inp,
-    Tensor<float,1> out_var,
-    Tensor<float,1> out_avg
-){
-    for(int i0=0;i0<inp.size[0];i0++){
-        float tmp_M2=0;
-        float tmp_avg=0;
-        long tmp_N=0;
-        for(int i1=0;i1<inp.size[1];i1++){
-            for(int i2=0;i2<inp.size[2];i2++){
-                welfordCombine(
-                    tmp_avg,
-                    tmp_M2,
-                    tmp_N,
-                    inp[i0*inp.stride[0]+
-                        i1*inp.stride[1]+
-                        i2*inp.stride[2]],
-                    0.f,
-                    (long)1
-                );
-            }
-        }
-        out_var[i0*out_var.stride[0]]=
-            tmp_M2/(tmp_N);
-        out_avg[i0*out_avg.stride[0]]=
-            tmp_avg;
-    }
-}
-    )";
-  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
-  LaunchParams lp(
-      1, // gdimx
-      1, // gdimy
-      1, // gdimz
-      1, // bdimx
-      1, // bdimy
-      1 // bdimz
-  );
-  lp.setSmem(0);
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const std::vector<int64_t> tensor_dims = {x, y, z};
-  auto in0 = at::randn(tensor_dims, options);
-  auto out_var = at::empty({x}, options);
-  auto out_avg = at::empty({x}, options);
-  fe.runRtc(lp, {in0, out_var, out_avg});
-
-  TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var));
-  TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
-}
-
-TEST_F(NVFuserTest, FusionBlockWelford_CUDA) {
-  FusionExecutor fe;
-  int x = 7, y = 8, z = 9;
-
-  std::string kernel = R"(
-__global__ void kernel1(
-    Tensor<float,2> inp,
-    Tensor<float,1> out_avg,
-    Tensor<float,1> out_var,
-    Tensor<float,1> init_avg,
-    Tensor<float,1> init_var,
-    Tensor<long,0> init_N
-){
-    //actual generated kernel will use dynamic shared mem,
-    // here is just for prototype
-    __shared__ float mem_avg[512];
-    __shared__ float mem_M2[512];
-    __shared__ long mem_N[512];
-    float in=inp[threadIdx.x*inp.stride[0]+
-                        threadIdx.y*inp.stride[1]];
-    float tmp_avg=0;
-    float tmp_M2=0;
-    long tmp_N=0;
-    blockWelford<false,true,false>(
-        tmp_avg,
-        tmp_M2,
-        tmp_N,
-        in,
-        0.f,
-        (long)1,
-        threadIdx,
-        blockDim,
-        (float*)mem_avg,
-        (float*)mem_M2,
-        (long*)mem_N,
-        (bool)(threadIdx.x<inp.size[0]),
-        0.f);
-    __syncthreads();
-    if(threadIdx.x<out_var.size[0] && threadIdx.y==0){
-        welfordCombine(
-                    tmp_avg,
-                    tmp_M2,
-                    tmp_N,
-                    init_avg[threadIdx.x*init_avg.stride[0]],
-                    init_var[threadIdx.x*init_var.stride[0]]*init_N[0],
-                    init_N[0]
-                );
-        out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
-        out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/(tmp_N);
-    }
-}
-    )";
-  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
-  LaunchParams lp(
-      1, // gdimx
-      1, // gdimy
-      1, // gdimz
-      x, // bdimx
-      y, // bdimy
-      1 // bdimz
-  );
-  lp.setSmem(0);
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const std::vector<int64_t> tensor_dims = {x, y};
-  const std::vector<int64_t> init_dims = {x, z};
-
-  // generate initial values
-  auto init_in = at::randn(init_dims, options);
-  auto init_var = init_in.var({1}, false);
-  auto init_avg = init_in.mean({1});
-  auto init_N =
-      at::tensor(z, at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0));
-
-  auto in0 = at::randn(tensor_dims, options);
-
-  // run kernel
-  auto out_var = at::zeros({x}, options);
-  auto out_avg = at::zeros({x}, options);
-  fe.runRtc(lp, {in0, out_avg, out_var, init_avg, init_var, init_N});
-
-  // compare with reference output
-  auto cat_tensor = at::cat({init_in, in0}, 1);
-  TORCH_CHECK(cat_tensor.var({1}, false).allclose(out_var));
-  TORCH_CHECK(
-      cat_tensor.mean({1}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
-}
-
-TEST_F(NVFuserTest, FusionBlockWelfordNoInit_CUDA) {
-  FusionExecutor fe;
-  int x = 7, y = 8, z = 9;
-
-  // need support IValue for integer input as initial count
-  std::string kernel = R"(
-__global__ void kernel1(
-    Tensor<float,3> inp,
-    Tensor<float,1> out_avg,
-    Tensor<float,1> out_var
-){
-    //actual generated kernel will use dynamic shared mem,
-    // here is just for prototype
-    __shared__ float mem_avg[512];
-    __shared__ float mem_M2[512];
-    __shared__ long mem_N[512];
-    float in=inp[threadIdx.x*inp.stride[0]+
-                        threadIdx.y*inp.stride[1]+
-                        threadIdx.z*inp.stride[2]];
-    float tmp_avg=0;
-    float tmp_M2=0;
-    long tmp_N=0;
-    block_sync::init();
-    blockWelford<false,true,true>(
-        tmp_avg,
-        tmp_M2,
-        tmp_N,
-        in,
-        0.f,
-        (long) 1,
-        threadIdx,
-        blockDim,
-        (float*)mem_avg,
-        (float*)mem_M2,
-        (long*)mem_N,
-        (bool)(threadIdx.x<inp.size[0]),
-        0.f);
-    __syncthreads();
-    if(threadIdx.x<out_var.size[0] && threadIdx.y==0 && threadIdx.z==0){
-        out_avg[threadIdx.x*out_var.stride[0]]=tmp_avg;
-        out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/(tmp_N);
-    }
-}
-    )";
-  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
-  LaunchParams lp(
-      1, // gdimx
-      1, // gdimy
-      1, // gdimz
-      x, // bdimx
-      y, // bdimy
-      z // bdimz
-  );
-  lp.setSmem(0);
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const std::vector<int64_t> tensor_dims = {x, y, z};
-  auto in0 = at::randn(tensor_dims, options);
-  auto out_var = at::empty({x}, options);
-  auto out_avg = at::empty({x}, options);
-  fe.runRtc(lp, {in0, out_avg, out_var});
-
-  TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var));
-  TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
-}
-
-TEST_F(NVFuserTest, FusionGridWelfordNoInit_CUDA) {
-  FusionExecutor fe;
-  int x = 128, y = 64, z = 128;
-
-  std::string kernel = R"(
-__global__ void kernel1(
-    Tensor<float,3> inp,
-    Tensor<float,1> out_avg,
-    Tensor<float,1> out_var,
-    Tensor<float,1> work_buf_avg,
-    Tensor<float,1> work_buf_M2,
-    Tensor<long,1> work_buf_N,
-    Tensor<int64_t,1> sync_flag
-){
-    __shared__ float shared_buf_avg[512];
-    __shared__ float shared_buf_M2[512];
-    __shared__ long shared_buf_N[512];
-    float tmp_avg=0;
-    float tmp_M2=0;
-    long tmp_N=0;
-    float in = inp[ blockIdx.x  * inp.stride[0]+
-                    blockIdx.y  * inp.stride[1]+
-                    threadIdx.x * inp.stride[2]];
-    block_sync::init();
-    welford::gridWelford<
-        true,true,false,
-        true,false,false,
-        false
-    >(
-        tmp_avg,
-        tmp_M2,
-        tmp_N,
-        in,
-        0.f,
-        (long) 1,
-        &work_buf_avg[0],
-        &work_buf_M2[0],
-        &work_buf_N[0],
-        sync_flag,
-        (float*)shared_buf_avg,
-        (float*)shared_buf_M2,
-        (long*)shared_buf_N,
-        threadIdx.x<out_var.size[0],
-        threadIdx.x<out_var.size[0],
-        0.f,
-        0,
-        1);
-    if(blockIdx.x == gridDim.x - 1 && blockIdx.y == gridDim.y - 1){
-        out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
-        out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/tmp_N;
-    }
-}
-    )";
-  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
-  LaunchParams lp(
-      x, // gdimx
-      y, // gdimy
-      1, // gdimz
-      z, // bdimx
-      1, // bdimy
-      1 // bdimz
-  );
-  lp.setSmem(0);
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const auto options_int =
-      at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-
-  const std::vector<int64_t> tensor_dims = {x, y, z};
-  auto in0 = at::randn(tensor_dims, options);
-
-  auto out_avg = at::empty({z}, options);
-  auto out_var = at::empty({z}, options);
-  auto work_buf_avg = at::empty({x * y * z}, options);
-  auto work_buf_var = at::empty({x * y * z}, options);
-  auto work_buf_N = at::empty({x * y * z}, options_int);
-  auto sync_flag = at::zeros({1}, options_int);
-  fe.runRtc(
-      lp,
-      {in0,
-       out_avg,
-       out_var,
-       work_buf_avg,
-       work_buf_var,
-       work_buf_N,
-       sync_flag});
-  std::vector<int64_t> dims{0, 1};
-
-  TORCH_CHECK(in0.mean(dims).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
-  TORCH_CHECK(in0.var(dims, false).allclose(out_var));
-}
-
-TEST_F(NVFuserTest, FusionWelfordOp_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int M = 64, N = 128;
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
-  auto tvs = Welford(tv1, {1});
-  auto tv_avg = tvs.avg;
-  auto tv_M2 = tvs.var_sum;
-  auto tv_N = tvs.n;
-  fusion.addOutput(tv_avg);
-  fusion.addOutput(tv_M2);
-  fusion.addOutput(tv_N);
-
-  tv_avg->split(1, 32);
-  tv_avg->split(0, 32);
-  tv_avg->split(0, 4);
-  tv_avg->reorder({{-1, -3}, {-3, -1}});
-  tv1->computeAt(tv_avg, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M, N}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
-
-  // by default Welford outputs sum of square diff so need to divide to get var
-  outputs[1] /= N;
-
-  testValidate(
-      fe.kernel(),
-      outputs,
-      {t0},
-      {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int M = 64, N = 128;
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
-  auto tvs = Welford(tv1, {1});
-  auto tv_avg = tvs.avg;
-  auto tv_M2 = tvs.var_sum;
-  auto tv_N = tvs.n;
-  fusion.addOutput(tv_avg);
-  fusion.addOutput(tv_M2);
-  fusion.addOutput(tv_N);
-
-  tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv1->computeAt(tv_avg, -1);
-
-  //
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M, N}, options);
-  at::Tensor t_var = at::empty({M}, options);
-  at::Tensor t_avg = at::empty({M}, options);
-  at::Tensor t_N = at::empty({M}, options_int);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
-
-  // by default Welford outputs sum of square diff so need to divide to get var
-  outputs[1] /= N;
-
-  testValidate(
-      fe.kernel(),
-      outputs,
-      {t0},
-      {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int M = 64, N = 128;
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
-  auto tvs = Welford(tv1, {1});
-  auto tv_avg = tvs.avg;
-  auto tv_M2 = tvs.var_sum;
-  auto tv_N = tvs.n;
-  fusion.addOutput(tv_avg);
-  fusion.addOutput(tv_M2);
-  fusion.addOutput(tv_N);
-
-  tv_avg->axis(0)->parallelize(ParallelType::TIDx);
-  tv_avg->axis(-1)->parallelize(ParallelType::BIDx);
-
-  tv1->computeAt(tv_avg, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M, N}, options);
-  at::Tensor t_avg = at::empty({M}, options);
-  at::Tensor t_var = at::empty({M}, options);
-  at::Tensor t_N = at::empty({M}, options_int);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
-
-  // by default Welford outputs sum of square diff so need to divide to get var
-  outputs[1] /= N;
-
-  testValidate(
-      fe.kernel(),
-      outputs,
-      {t0},
-      {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int M = 64, N = 128;
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
-  auto tvs = Welford(tv1, {1});
-  auto tv_avg = tvs.avg;
-  auto tv_M2 = tvs.var_sum;
-  auto tv_N = tvs.n;
-  fusion.addOutput(tv_avg);
-  fusion.addOutput(tv_M2);
-  fusion.addOutput(tv_N);
-
-  tv_avg->split(1, 4);
-  ir_utils::rfactorHelper(tvs.avg, {2});
-  tv1->computeAt(tv_avg, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M, N}, options);
-  at::Tensor t_avg = at::empty({M}, options);
-  at::Tensor t_var = at::empty({M}, options);
-  at::Tensor t_N = at::empty({M}, options_int);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
-
-  // by default Welford outputs sum of square diff so need to divide to get var
-  outputs[1] /= N;
-
-  testValidate(
-      fe.kernel(),
-      outputs,
-      {t0},
-      {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionWelfordSchedule_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int M = 64, N = 128;
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
-  auto tvs = Welford(tv1, {1});
-  auto tv_avg = tvs.avg;
-  auto tv_M2 = tvs.var_sum;
-  auto tv_N = tvs.n;
-  fusion.addOutput(tv_avg);
-  fusion.addOutput(tv_M2);
-  fusion.addOutput(tv_N);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M, N}, options);
-  // TODO: Why do we use launch params from here, but not scheduling???
-  auto reduction_params = getReductionHeuristics(&fusion, {t0});
-  scheduleReduction(&fusion, *reduction_params);
-
-  auto lparams = reduction_params->lparams;
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0}, lparams);
-  auto outputs = fe.runFusion({t0}, lparams);
-
-  // by default Welford outputs sum of square diff so need to divide to get var
-  outputs[1] /= N;
-
-  auto at_avg = t0.mean({1});
-  auto at_var = t0.var({1}, false);
-  auto at_n = at::ones({M}, options_int) * N;
-
-  testValidate(
-      fe.kernel(),
-      outputs,
-      {t0},
-      {at_avg, at_var, at_n},
-      __LINE__,
-      __FILE__,
-      "validate welford",
-      reduction_params->lparams);
-}
-
-namespace {
-void testWelford(DataType dtype, int red_axis, int odim, int rdim) {
-  const int axis = red_axis;
-  at::ScalarType aten_dtype = data_type_to_aten(dtype);
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-  TensorView* tv0 = makeSymbolicTensor(2, dtype);
-  bool is_fp16 = dtype == DataType::Half;
-  bool is_bf16 = dtype == DataType::BFloat16;
-  TensorView* tv0_cast = tv0;
-  if (is_fp16 || is_bf16) {
-    tv0_cast = castOp(DataType::Float, tv0);
-  }
-  fusion.addInput(tv0);
-  auto tv1 = mul(tv0_cast, IrBuilder::create<Double>(1));
-  auto tvs = Welford(tv1, {axis});
-  auto tv_avg = tvs.avg;
-  auto tv_M2 = tvs.var_sum;
-  auto tv_N = tvs.n;
-
-  TensorView* avg_cast = tv_avg;
-  TensorView* M2_cast = tv_M2;
-
-  if (is_fp16) {
-    avg_cast = castOp(DataType::Half, tv_avg);
-    M2_cast = castOp(DataType::Half, tv_M2);
-  }
-  if (is_bf16) {
-    avg_cast = castOp(DataType::BFloat16, tv_avg);
-    M2_cast = castOp(DataType::BFloat16, tv_M2);
-  }
-
-  fusion.addOutput(avg_cast);
-  fusion.addOutput(M2_cast);
-  fusion.addOutput(tv_N);
-
-  auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
-  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  std::vector<TensorView*> outputs_of_red;
-  at::Tensor aten_input =
-      (axis ? at::randn({odim, rdim}, options)
-            : at::randn({rdim, odim}, options));
-
-  if (is_fp16 || is_bf16) {
-    outputs_of_red.push_back(avg_cast);
-    outputs_of_red.push_back(M2_cast);
-  }
-
-  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
-  scheduleReduction(&fusion, *reduction_params);
-
-  auto lparams = reduction_params->lparams;
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input}, lparams);
-  auto outputs = fe.runFusion({aten_input}, lparams);
-
-  // by default Welford outputs sum of square diff so need to divide to
-  // get var
-
-  outputs[1] /= rdim;
-
-  auto at_avg = aten_input.mean({axis});
-  auto at_var = aten_input.var({axis}, false);
-  auto at_n =
-      (axis ? at::ones({odim, rdim}, options)
-            : at::ones({rdim, odim}, options));
-  at_n = at_n.sum({axis});
-
-  testValidate(
-      fe.kernel(),
-      outputs,
-      {aten_input},
-      {at_avg, at_var, at_n},
-      __LINE__,
-      __FILE__,
-      "validate welford",
-      reduction_params->lparams);
-}
-} // namespace
-
-TEST_F(NVFuserTest, FusionWelfordShmoo_CUDA) {
-  std::vector<DataType> dtypes = {
-      DataType::Double, DataType::Float, DataType::Half};
-  // TODO: enable this for complex. Currently, complex yields
-  // silent wrong results:
-  //   Detected abs error of: 3.8062
-  //     absolute tolerance was set to 2.23704e-06
-  //     and relative tolerance set to 2.23704e-08
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-  if (at::cuda::getDeviceProperties(0)->major >= 8) {
-    dtypes.insert(dtypes.end(), DataType::BFloat16);
-  }
-#endif
-
-  std::vector<int> red_axis = {1, 0};
-  std::vector<int> output_dims = {160, 320};
-  std::vector<int> red_dims;
-
-  // Tried to cut down the number iterations with just
-  // doing every other power of 2.
-  for (int i = 1; i <= 1024 * 1024; i <<= 2) {
-    red_dims.push_back(i);
-  }
-
-  for (auto dtype : dtypes) {
-    for (auto& axis : red_axis) {
-      for (auto& odim : output_dims) {
-        for (auto& rdim : red_dims) {
-          // TODO: original welford algorithm actually keeps a running sum of
-          // squares, i.e. M_{2n} in the
-          //       cf:
-          //       https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-          //       algorithm notation, and it can reach inf for large numbers
-          //       with half precision. skipping too large volumes for half for
-          //       nwo might need further numerical experiments to re-design
-          //       this.
-          if (rdim > 32768 &&
-              (dtype == DataType::Half || dtype == DataType::BFloat16)) {
-            continue;
-          }
-          testWelford(dtype, axis, odim, rdim);
-        }
-      }
-    }
-  }
-}
-
-namespace {
-void testVarMean(at::ScalarType dtype, int correction, bool keepdim) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  int M = 64, N = 128;
-
-  auto tv0 = makeSymbolicTensor(2, aten_to_data_type(dtype));
-  fusion->addInput(tv0);
-  auto tvs = variance_mean(tv0, {1}, correction, keepdim);
-  auto tv_mean = tvs.mean;
-  auto tv_var = tvs.var;
-  fusion->addOutput(tv_var);
-  fusion->addOutput(tv_mean);
-
-  auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M, N}, options);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto outputs = executor_cache.runFusionWithInputs({t0});
-
-  auto at_var_mean = at::var_mean(t0, {1}, correction, keepdim);
-  std::vector<at::Tensor> aten_outputs = {
-      std::get<0>(at_var_mean), std::get<1>(at_var_mean)};
-
-  testValidate(
-      executor_cache.fusion(), outputs, {t0}, aten_outputs, __LINE__, __FILE__);
-}
-} // namespace
-
-TEST_F(NVFuserTest, FusionVarMean_CUDA) {
-  std::vector<at::ScalarType> dtypes = {at::kFloat, at::kDouble};
-  std::vector<int> corrections = {0, 1};
-  std::vector<bool> keepdims = {false, true};
-  for (auto correction : corrections) {
-    for (auto keepdim : keepdims) {
-      for (auto dtype : dtypes) {
-        testVarMean(dtype, correction, keepdim);
-      }
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-
-  TensorView* tv0 = makeSymbolicTensor(2); // K, M
-  TensorView* tv1 = makeSymbolicTensor(2); // N, K
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv0_t = transpose(tv0);
-  TensorView* tv1_t = transpose(tv1);
-
-  TensorView* tv2 = broadcast(tv0_t, {false, false, true});
-  // tv2[I0, I1, B] = tv0[I0, I1]
-
-  TensorView* tv3 = broadcast(tv1_t, {true, false, false});
-  // tv3[B, I1, I2] = tv1[I1, I2]
-
-  // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2]
-  TensorView* tv4 = mul(tv2, tv3);
-  // tv5[I0, R1, I2] = tv4[I0, I1, I2]
-  TensorView* tv5 = sum(tv4, {1});
-  fusion.addOutput(tv5);
-
-  tv5->split(1, 32);
-  // tv5[I0, R1o, R1i{32}, I2]
-
-  auto tv6 = tv5->rFactor({1});
-  // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2]
-  // tv5[I0,    , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2]
-
-  tv5->split(0, 4);
-  tv5->split(-1, 4);
-  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
-  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
-
-  tv0_t->computeAt(tv5, -1);
-  tv1_t->computeAt(tv5, -1);
-
-  // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
-  // tv5[I0o, I0i{4},    , R1i{32}, I2o, I2i{4}]
-  //--> (line symbolizes compute at location)
-  // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
-  // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
-  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
-
-  tv0_t->computeAt(tv6, -1);
-  tv1_t->computeAt(tv6, -1);
-  // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
-  // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
-  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
-
-  tv5->axis(0)->parallelize(ParallelType::BIDz);
-  tv5->axis(1)->parallelize(ParallelType::TIDz);
-
-  tv5->axis(-2)->parallelize(ParallelType::BIDy);
-  tv5->axis(-1)->parallelize(ParallelType::TIDy);
-
-  tv5->axis(2)->parallelize(ParallelType::TIDx);
-  tv6->axis(2)->parallelize(ParallelType::TIDx);
-
-  constexpr int M = 65, K = 33, N = 17;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({K, M}, options);
-  at::Tensor t1 = at::randn({N, K}, options);
-
-  // Lets specify a few bounds in launch params to make sure it works
-  LaunchParams lparams(1, -1, -1, 32, 4, 4);
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1}, lparams);
-  fe.runFusion({t0, t1}, lparams);
-
-  // Don't specify any launch params
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble));
-
-  testValidate(
-      &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSoftmax3DTransposed_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int tidx = 32;
-  const int dimx = 32;
-  const int dimy = 16;
-  const int dimz = 130;
-
-  // Set up your input tensor views
-  TensorView* input_tv0 = makeSymbolicTensor(3);
-  fusion.addInput(input_tv0);
-
-  TensorView* input_t = transpose(input_tv0, 1, 2);
-
-  TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_t);
-  TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
-  TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true});
-
-  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
-  // computed at sum_exp_rf_tv8.
-  TensorView* input_t_copy = transpose(input_tv0, 1, 2);
-  TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_t_copy);
-
-  TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);
-
-  fusion.addOutput(output_tv4);
-
-  bcast_sum_tv3->split(-1, tidx);
-
-  sum_exp_tv2->split(-1, tidx);
-  TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});
-
-  output_tv4->split(-1, tidx);
-
-  input_t->computeAt(sum_exp_rf_tv5, -1);
-  input_t_copy->computeAt(output_tv4, -1);
-
-  TensorView* tensors_to_parallelize[] = {
-      sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};
-
-  for (auto tv : tensors_to_parallelize) {
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::BIDy);
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({dimx, dimz, dimy}, options);
-
-  at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_input_t = at::transpose(input, 1, 2);
-  auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false);
-
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) {
-  // Case 1
-  // tv1 = tv0 * 0.5
-  // tv2 = tv1 * -1
-  // tv3 = tv1 + 3
-  // tv4 = tv1 * 2
-  // tv5 = tv3 + tv2
-  // tv6 = tv5 + tv4
-  // tv7 = tv1 + tv4
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  tv0 = transpose(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
-  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0));
-  TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv5 = add(tv3, tv2);
-
-  TensorView* tv6 = add(tv5, tv4);
-  TensorView* tv7 = add(tv1, tv4);
-
-  fusion.addOutput(tv6);
-  fusion.addOutput(tv7);
-
-  // Lets setup to actually run
-  tv7->merge(0);
-  tv7->split(0, 128);
-  tv7->split(0, 4);
-
-  tv7->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv0->computeAt(tv7, 1);
-
-  // The this-position of the last tensor should be zero.
-  TORCH_CHECK(
-      tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
-      tv7->getMaxProducerPosition() == 1);
-  TORCH_CHECK(
-      tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
-      tv6->getMaxProducerPosition() == 1);
-  // The position of every other tensor should be 1.
-  for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
-    TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
-  }
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::randn({129, 127}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  at::Tensor aten_input_t = aten_input.t();
-
-  auto t1 = aten_input_t.mul({0.5});
-  auto t2 = t1.mul({-1.0});
-  auto t3 = t1.add({3.0});
-  auto t4 = t1.mul({2.0});
-  auto t5 = t3.add(t2);
-  auto t6 = t5.add(t4);
-  auto t7 = t1.add(t4);
-
-  std::vector<at::Tensor> aten_outputs = {t6, t7};
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) {
-  // Case 2
-  // tv1 = tv0 * -1
-  // tv2 = tv0 + 3
-  // tv3 = tv0 * 2
-  // tv4 = tv2 + tv1
-  // tv5 = tv4 + tv3
-  // tv6 = tv5 + tv3
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  tv0 = transpose(tv0);
-
-  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
-  TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv4 = add(tv2, tv1);
-
-  TensorView* tv5 = add(tv4, tv3);
-  TensorView* tv6 = add(tv5, tv3);
-
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv6);
-
-  // Lets setup to actually run
-  tv6->merge(0);
-  tv6->split(0, 128);
-  tv6->split(0, 4);
-
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv0->computeAt(tv6, 1);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({129, 127}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  auto input_t = input.t();
-  auto t1 = input_t.mul({-1.0});
-  auto t2 = input_t.add({3.0});
-  auto t3 = input_t.mul({2.0});
-  auto t4 = t2.add(t1);
-  auto t5 = t4.add(t3);
-  auto t6 = t5.add(t3);
-
-  std::vector<at::Tensor> aten_outputs = {t5, t6};
-
-  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) {
-  // Case 3
-  // T2 = T1 * 0.979361
-  // T3 = T2 * T0
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-
-  tv0 = permute(tv0, {3, 0, 1, 2});
-
-  TensorView* tv1 = makeSymbolicTensor(4);
-  fusion.addInput(tv1);
-
-  tv1 = permute(tv1, {3, 0, 1, 2});
-
-  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361));
-  TensorView* tv3 = mul(tv2, tv0);
-
-  fusion.addOutput(tv3);
-
-  // Lets setup to actually run
-  while (tv3->nDims() > 1)
-    tv3->merge(0);
-  tv3->split(0, 128);
-  tv3->split(0, 4);
-
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto t0_t = t0.permute({3, 0, 1, 2});
-  auto t1_t = t1.permute({3, 0, 1, 2});
-  auto t2 = t1_t.mul({0.979361});
-  auto aten_output = t2.mul(t0_t);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) {
-  // Case 4
-  // T4 = T2 - T3
-  // T5 = T1 + T4
-  // T6 = T5 - T0
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(4);
-  fusion.addInput(tv0);
-
-  tv0 = permute(tv0, {3, 0, 1, 2});
-
-  TensorView* tv1 = makeSymbolicTensor(4);
-  fusion.addInput(tv1);
-
-  tv1 = permute(tv1, {3, 0, 1, 2});
-
-  TensorView* tv2 = makeSymbolicTensor(4);
-  fusion.addInput(tv2);
-
-  tv2 = permute(tv2, {3, 0, 1, 2});
-
-  TensorView* tv3 = makeSymbolicTensor(4);
-  fusion.addInput(tv3);
-
-  tv3 = permute(tv3, {3, 0, 1, 2});
-
-  TensorView* tv4 = sub(tv2, tv3);
-  TensorView* tv5 = add(tv1, tv4);
-  TensorView* tv6 = sub(tv5, tv0);
-
-  fusion.addOutput(tv6);
-
-  // Lets setup to actually run
-  while (tv6->nDims() > 1)
-    tv6->merge(0);
-  tv6->split(0, 128);
-  tv6->split(0, 4);
-
-  tv0->computeAt(tv6, 1);
-  tv1->computeAt(tv6, 1);
-  tv2->computeAt(tv6, 1);
-  tv3->computeAt(tv6, 1);
-
-  tv6->axis(0)->parallelize(ParallelType::BIDx);
-
-  for (Val* val : fusion.vals()) {
-    if (!val->isFusionInput() &&
-        val->getValType().value() == ValType::TensorView) {
-      TensorView* tv = static_cast<TensorView*>(val);
-
-      tv->axis(1)->parallelize(ParallelType::Unroll);
-      tv->axis(-1)->parallelize(ParallelType::TIDx);
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-  at::Tensor t2 = at::rand_like(t0, options);
-  at::Tensor t3 = at::rand_like(t0, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto t0_t = t0.permute({3, 0, 1, 2});
-  auto t1_t = t1.permute({3, 0, 1, 2});
-  auto t2_t = t2.permute({3, 0, 1, 2});
-  auto t3_t = t3.permute({3, 0, 1, 2});
-  auto t4 = t2_t.sub(t3_t);
-  auto t5 = t1_t.add(t4);
-  auto aten_output = t5.sub(t0_t);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) {
-  // Case 5
-  // tv2 = tv0 + 2.0
-  // tv3 = tv1 * tv2
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  tv0 = transpose(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  tv1 = transpose(tv1);
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = mul(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  tv3->merge(0);
-  tv3->split(-1, 8);
-  tv3->split(-1, 4);
-
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto t2 = t0.t().add(2.0);
-  auto aten_output = t1.t().mul(t2);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  tv0 = transpose(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  tv1 = transpose(tv1);
-  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = mul(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  tv2->merge(0);
-  tv2->split(-1, 8);
-  tv2->split(-1, 4);
-  tv3->merge(0);
-  tv3->split(-1, 8);
-
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({63, 65}, options);
-  at::Tensor t1 = at::rand_like(t0, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto t2 = t0.t().add(2.0);
-  auto aten_output = t1.t().mul(t2);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSegmentReducePointwise_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(1);
-  TensorView* tv2 = makeSymbolicTensor(2);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  fusion->addInput(tv2);
-
-  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
-  TensorView* tv4 =
-      max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
-  TensorView* tv5 = add(tv4, tv1); //  Group 0 (Non Broadcast after reduce,
-                                   //  keeps normalization scheduler away)
-  TensorView* tv6 = add(tv5, tv2); //  Group 1 (Broadcast after reduce)
-
-  fusion->addOutput(tv6);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({128, 65}, options);
-  at::Tensor t1 = at::randn({65}, options);
-  at::Tensor t2 = at::randn({128, 65}, options);
-
-  auto t3 = t0.add(1.0);
-  auto t4 = std::get<0>(at::max(t3, 0));
-  auto t5 = t4.add(t1);
-  auto t6 = t5.add(t2);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
-
-  TORCH_CHECK(
-      executor_cache.getMostRecentKernelRuntime()->isSegmented(),
-      "segmentation didn't happen");
-  TORCH_CHECK(
-      executor_cache.getMostRecentKernelRuntime()
-              ->fusionSegments()
-              ->groups()
-              .size() == 2,
-      "segmentation didn't happen as expected");
-
-  testValidate(
-      executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMultipleVectorize_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  TensorView* tv0 = makeContigTensor(1);
-  TensorView* tv1 = makeContigTensor(1);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  TensorView* tv3 = add(tv0, tv1);
-  fusion->addOutput(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({40960}, options);
-  at::Tensor t1 = at::randn({40960}, options);
-  auto t2 = t0 + t1;
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-  executor_cache.profile(true);
-
-  auto outputs = executor_cache.runFusionWithInputs({t0, t1});
-  auto runtime1 = executor_cache.getMostRecentKernelRuntime();
-  auto log1 = std::dynamic_pointer_cast<PointwiseParams>(
-      executor_cache.getMostRecentExecutorInfo().params);
-  TORCH_CHECK(log1 != nullptr);
-  TORCH_CHECK(log1->vectorize);
-
-  testValidate(
-      executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
-
-  t0 = at::randn({40964}, options);
-  t1 = at::randn({40964}, options);
-  t2 = t0 + t1;
-
-  outputs = executor_cache.runFusionWithInputs({t0, t1});
-  auto runtime2 = executor_cache.getMostRecentKernelRuntime();
-  auto log2 = std::dynamic_pointer_cast<PointwiseParams>(
-      executor_cache.getMostRecentExecutorInfo().params);
-  TORCH_CHECK(log2 != nullptr);
-  TORCH_CHECK(log2->vectorize);
-
-  testValidate(
-      executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
-
-  t0 = at::randn({40962}, options);
-  t1 = at::randn({40962}, options);
-  t2 = t0 + t1;
-
-  outputs = executor_cache.runFusionWithInputs({t0, t1});
-  auto runtime3 = executor_cache.getMostRecentKernelRuntime();
-  auto log3 = std::dynamic_pointer_cast<PointwiseParams>(
-      executor_cache.getMostRecentExecutorInfo().params);
-  TORCH_CHECK(log3 != nullptr);
-  TORCH_CHECK(log3->vectorize);
-
-  testValidate(
-      executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
-
-  TORCH_CHECK(runtime1 == runtime2);
-  TORCH_CHECK(runtime1 != runtime3);
-}
-
-TEST_F(NVFuserTest, FusionVectorizeSimple_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeContigTensor(3);
-
-  fusion.addInput(tv0);
-
-  auto tv1 = unaryOp(UnaryOpType::Sin, tv0);
-
-  fusion.addOutput(tv1);
-
-  auto tv0_cache = tv0->cacheAfter();
-
-  auto tv1_cache = tv1->cacheBefore();
-
-  tv1->merge(0);
-  tv1->merge(0);
-  tv1->split(0, 4);
-  tv1->split(0, 128);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv0->computeAt(tv1, 2);
-
-  tv0_cache->axis(2)->parallelize(ParallelType::Vectorize);
-  tv1->axis(2)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor aten_input = at::empty({2, 6, 32}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {aten_input});
-  auto cg_outputs = fe.runFusion({aten_input});
-
-  at::Tensor aten_output = aten_input.sin();
-
-  testValidate(
-      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-  // dimensionality of the problem
-  int nDims = 3;
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeContigTensor(nDims);
-  TensorView* tv1 = makeContigTensor(nDims);
-
-  // Register your inputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  // Do math with it, it returns a `Val*` but can be static_casted back to
-  // TensorView
-  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  TensorView* tv3 = add(tv0, tv2);
-
-  // Register your outputs
-  fusion.addOutput(tv3);
-
-  auto tv0_cache = tv0->cacheAfter();
-  auto tv1_cache = tv1->cacheAfter();
-  auto tv3_cache = tv3->cacheBefore();
-
-  // Do transformations, remember, transformations are outputs to inputs
-  // This doesn't have to be in this order
-  tv3->merge(1);
-
-  // Split by n_threads
-  tv3->split(1, 2);
-  tv3->split(0, 3);
-  tv3->split(0, 1);
-
-  // [bidx, unswitch, unroll{2}, tidx, vectorize{2}]
-
-  // Parallelize TV3
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(1)->parallelize(ParallelType::Unswitch);
-  tv3->axis(2)->parallelize(ParallelType::Unroll);
-  tv3->axis(3)->parallelize(ParallelType::TIDx);
-
-  tv3->reorder({{4, 2}});
-  // [bidx, unswitch, vectorize{2}, unroll{2}, tidx]
-
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-  scheduler_utils::parallelizeAllLike(tv3);
-
-  tv0_cache->axis(2)->parallelize(ParallelType::Vectorize);
-  tv1_cache->axis(2)->parallelize(ParallelType::Vectorize);
-  tv3->axis(2)->parallelize(ParallelType::Vectorize);
-
-  // For all inputs, computeAt the output inline, temporaries should be squeezed
-  // between them
-  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-  tv1->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input1 = at::randn({64, 2, 128}, options);
-  at::Tensor input2 = at::rand_like(input1);
-  at::Tensor output = at::empty_like(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1, input2});
-  fe.runFusion({input1, input2}, {output});
-
-  at::Tensor tv2_ref = input2 + 2.0;
-  at::Tensor output_ref = input1 + tv2_ref;
-
-  TORCH_CHECK(output_ref.equal(output));
-}
-
-TEST_F(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  std::vector<int64_t> input_shape{32, 64, 8};
-  const int kReductionAxis = 1;
-
-  auto tv0 = TensorViewBuilder()
-                 .ndims(input_shape.size())
-                 .dtype(DataType::Double)
-                 .build();
-
-  fusion->addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv2 = sum(tv1, {2}); // Group 0
-
-  auto output = softmax(tv2, kReductionAxis); // Group 1
-  fusion->addOutput(output);
-
-  auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0);
-  at::Tensor at_x = at::randn(input_shape, options);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  auto outputs = executor_cache.runFusionWithInputs({at_x});
-
-  auto t1 = at_x.add(1.0);
-  auto t2 = t1.sum({2});
-  auto t3 = at::_softmax(t2.to(at::kDouble), -1, false);
-
-  auto optimized_fusion = executor_cache.getMostRecentKernelRuntime();
-  TORCH_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen");
-  TORCH_CHECK(
-      optimized_fusion->fusionSegments()->groups().size() == 2,
-      "segmentation didn't happen as expected");
-
-  testValidate(
-      executor_cache.fusion(), outputs, {at_x}, {t3}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSwizzle1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
-  fusion.addOutput(tv2);
-
-  tv2->split(0, 7);
-  tv2->split(0, 9);
-
-  tv0->computeAt(tv2, 1);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv1->setMemoryType(MemoryType::Shared);
-  tv1->swizzle(SwizzleType::Transpose, {1, 2});
-
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-  tv1->axis(2)->parallelize(ParallelType::TIDy);
-
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-  tv2->axis(2)->parallelize(ParallelType::TIDy);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({100}, options);
-
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = (t0 + 1) * 2;
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSwizzle2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
-  fusion.addOutput(tv2);
-
-  tv1->split(-1, 4);
-  tv1->split(-2, 4);
-
-  tv2->split(-1, 4);
-  tv2->split(-2, 4);
-
-  tv0->computeAt(tv2, 1);
-
-  tv2->reorder({{-1, -2}});
-
-  tv1->setMemoryType(MemoryType::Shared);
-  tv1->swizzle(SwizzleType::Transpose, {-2, -1});
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-2)->parallelize(ParallelType::TIDy);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-2)->parallelize(ParallelType::TIDy);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({123}, options);
-
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = (t0 + 1) * 2;
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionGridPersistence_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = broadcast(tv1, {true});
-  auto tv3 = add(tv0, tv2);
-  fusion.addOutput(tv3);
-
-  std::vector<TensorView*> tvs = {tv1, tv2, tv3};
-  for (auto tv : tvs) {
-    tv->split(0, 2);
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::BIDy);
-  }
-
-  const int numel_x = 10;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto out = fe.runFusion({input});
-
-  auto aten_output = input.sum({0}).unsqueeze(-1).add(input);
-
-  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionGridPersistence2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = broadcast(tv1, {true, false});
-  auto tv3 = add(tv0, tv2);
-  fusion.addOutput(tv3);
-
-  std::vector<TensorView*> tvs = {tv1, tv2, tv3};
-  for (auto tv : tvs) {
-    tv->split(0, 2);
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::TIDy);
-    tv->axis(2)->parallelize(ParallelType::TIDx);
-  }
-
-  const int numel_x = 10;
-  const int numel_y = 3;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto out = fe.runFusion({input});
-
-  auto aten_output = input.sum({0}).unsqueeze(0).add(input);
-
-  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionWelfordPersistence_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tvs = Welford(tv0, {0});
-  auto tv4 = add(tvs.avg, tvs.var_sum);
-  auto tv5 = broadcast(tv4, {true});
-  auto tv6 = add(tv0, tv5);
-  fusion.addOutput(tv6);
-
-  std::vector<TensorView*> schedule_tvs = {
-      tvs.avg, tvs.var_sum, tvs.n, tv5, tv6};
-
-  for (auto tv : schedule_tvs) {
-    tv->split(0, 2);
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::BIDy);
-  }
-
-  const int numel_x = 10;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto out = fe.runFusion({input});
-
-  auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x))
-                         .unsqueeze(-1)
-                         .add(input);
-
-  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionWelfordPersistence2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tvs = Welford(tv0, {0});
-  auto tv4 = add(tvs.avg, tvs.var_sum);
-  auto tv5 = broadcast(tv4, {true, false});
-  auto tv6 = add(tv0, tv5);
-  fusion.addOutput(tv6);
-
-  std::vector<TensorView*> schedule_tvs = {
-      tvs.avg, tvs.var_sum, tvs.n, tv5, tv6};
-  for (auto tv : schedule_tvs) {
-    tv->split(0, 2);
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::TIDy);
-    tv->axis(2)->parallelize(ParallelType::TIDx);
-  }
-  tv4->axis(0)->parallelize(ParallelType::TIDx);
-
-  const int numel_x = 10;
-  const int numel_y = 3;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto out = fe.runFusion({input});
-
-  auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x))
-                         .unsqueeze(0)
-                         .add(input);
-
-  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue633_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int dx = 10;
-  const int dy = 11;
-  const int dz = 12;
-
-  auto tv0 = makeConcreteTensor({dx, dy, dz});
-  fusion.addInput(tv0);
-  auto tv1 = makeConcreteTensor({dx, dy, 1});
-  fusion.addInput(tv1);
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  tv2->merge(1);
-  tv2->merge(0);
-  tv2->split(-1, 128);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({dx, dy, dz}, options);
-  at::Tensor t1 = at::randn({dx, dy, 1}, options);
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0 + t1;
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> shape{17, 19};
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  auto tv2 = broadcast(tv0, {false, true});
-  auto tv3 = add(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  tv3->split(1, 128);
-  tv0->computeAt(tv3, 2);
-
-  for (auto tv : {tv2, tv3}) {
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({shape[0]}, options);
-  at::Tensor t1 = at::randn(shape, options);
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto t3 = t0.unsqueeze(-1).expand(shape) + t1;
-
-  testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(2);
-  auto tv1 = makeContigTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  const int kTDX = 64;
-  const int kVecSize = 4;
-  const int kNumElems = kTDX * kVecSize;
-
-  tv2->split(1, kNumElems);
-
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-  auto c2 = tv2->cacheBefore();
-
-  tv2->split(-1, kVecSize);
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int bx = 128;
-  const int by = 457;
-  at::Tensor t0 = at::randn({bx, by}, options);
-  at::Tensor t1 = at::randn({bx, by}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0 + t1;
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(4);
-  auto tv1 = makeContigTensor(4);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  tv2->reorder({{0, 1}, {1, 0}});
-  tv2->merge(-2);
-
-  const int kTDX = 64;
-  const int kVecSize = 2;
-  const int kNumElems = kTDX * kVecSize;
-
-  tv2->split(-1, kNumElems);
-
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-  auto c2 = tv2->cacheBefore();
-
-  tv2->split(0, 128);
-  tv2->split(-1, kVecSize);
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::BIDy);
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int n = 32;
-  const int c = 127;
-  const int h = 51;
-  const int w = 23;
-  at::Tensor t0 = at::randn({n, c, h, w}, options);
-  at::Tensor t1 = at::randn({n, c, h, w}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0 + t1;
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  constexpr int kNumDims = 4;
-  constexpr int kTDX = 64;
-  constexpr int kVecSize = 2;
-  constexpr int kNumElems = kTDX * kVecSize;
-
-  auto tv0 = makeSymbolicTensor(kNumDims);
-  auto tv1 = makeSymbolicTensor(kNumDims);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  // Create caches for vectorization
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-  auto c2 = tv2->cacheBefore();
-
-  // Merge all dimensions together except inner-most dim
-  for (const auto idx : c10::irange(kNumDims - 2)) {
-    tv2->merge(0);
-  }
-  // Split inner-most dim
-  tv2->split(-1, kNumElems);
-  tv2->split(-1, kVecSize);
-  TransformPropagatorWithCheck propagator(tv2);
-  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  // Parallelization Strategy
-  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(2)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int n = 5;
-  const int c = 3;
-  const int h = 51;
-  const int w = 257;
-  at::Tensor t0 = at::randn({n, c, h, w}, options);
-  at::Tensor t1 = at::randn({n, c, h, w}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0 + t1;
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  constexpr int kNumDims = 4;
-  constexpr int kTDX = 64;
-  constexpr int kVecSize = 2;
-  constexpr int kNumElems = kTDX * kVecSize;
-  std::vector<int64_t> bcast_shape{1, 1, 1, -1};
-
-  auto tv0 = makeContigTensor(kNumDims);
-  auto tv1 = TensorViewBuilder().shape(bcast_shape).build();
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  // Create caches for vectorization
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-  auto c2 = tv2->cacheBefore();
-
-  // Merge all dimensions together
-  // Backward merge order is necessary for vectorize validation
-  for (int idx = kNumDims - 1; idx > 0; --idx) {
-    tv2->merge(idx - 1);
-  }
-  tv2->split(-1, kNumElems);
-  tv2->split(-1, kVecSize);
-  TransformPropagatorWithCheck propagator(tv2);
-  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  // Parallelization Strategy
-  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int n = 32;
-  const int c = 128;
-  const int h = 51;
-  const int w = 23;
-  at::Tensor t0 = at::randn({n, c, h, w}, options);
-  at::Tensor t1 = at::randn({1, 1, 1, w}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  // TODO: throw assertion - cannot merge non-contiguous vectorization axes
-  // Make sure compilation fails
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
-}
-
-TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(2);
-  auto tv1 = makeContigTensor(2);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-
-  auto tv3 = sum(tv2, {-1});
-
-  fusion.addOutput(tv3);
-
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-
-  tv3->split(-1, 128 * 4);
-  tv3->split(-1, 4);
-  // Reduce outer dim first
-  auto tv4 = tv3->rFactor({-3, -1});
-  // Tv3 will reduce threads
-
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv0->computeAt(tv4, -2);
-  tv1->computeAt(tv4, -2);
-
-  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  tv4->axis(-2)->parallelize(ParallelType::TIDx);
-  tv3->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv2->computeAt(tv4, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int bx = 128;
-  const int by = 2050;
-  at::Tensor t0 = at::randn({bx, by}, options);
-  at::Tensor t1 = at::randn({bx, by}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0.add(t1).sum(1);
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(2);
-  auto tv1 = makeContigTensor(2);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  tv2->split(1, 16);
-  tv2->split(1, 64);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(2)->parallelize(ParallelType::TIDx);
-
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-  auto c2 = tv2->cacheBefore();
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
-  for (auto tv : vectorized_tvs) {
-    tv->split(-1, 4);
-    // Vectorize the wrong dimension
-    tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize);
-  }
-
-  FusionExecutor fe;
-  // Make sure compilation fails
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
-}
-
-TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(2);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  const int kTDX = 64;
-  const int kVecSize = 4;
-  const int kNumElems = kTDX * kVecSize;
-
-  tv2->split(1, kNumElems);
-
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-
-  tv2->split(-1, kVecSize);
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int bx = 128;
-  const int by = 2049;
-  at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)});
-  at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)});
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0 + t1;
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(2);
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  const int kTDX = 64;
-  const int kVecSize = 4;
-  const int kNumElems = kTDX * kVecSize;
-
-  tv2->split(1, kNumElems);
-
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-  auto c2 = tv2->cacheBefore();
-
-  tv2->split(-1, kVecSize);
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int bx = 128;
-  const int by = 2049;
-  at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)});
-  at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)});
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-
-  // Failure because the input + output tensors do not have the same stride
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
-}
-
-TEST_F(NVFuserTest, FusionVectorization1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  tv2->split(1, 16);
-  tv2->split(1, 64);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(2)->parallelize(ParallelType::TIDx);
-
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-  auto c2 = tv2->cacheBefore();
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
-  for (auto tv : vectorized_tvs) {
-    tv->split(-1, 4);
-    tv->axis(-1)->parallelize(ParallelType::Vectorize);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int bx = 128;
-  const int by = 2048;
-  at::Tensor t0 = at::randn({bx, by}, options);
-  at::Tensor t1 = at::randn({bx, by}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0 + t1;
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionVectorization2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  tv2->split(1, 16);
-  tv2->split(1, 64);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(2)->parallelize(ParallelType::TIDx);
-
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-  auto c2 = tv2->cacheBefore();
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
-  for (auto tv : vectorized_tvs) {
-    tv->split(-1, 4);
-    // Vectorize the wrong dimension
-    tv->axis(-2)->parallelize(ParallelType::Vectorize);
-  }
-
-  FusionExecutor fe;
-  // Make sure compilation fails
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
-}
-
-TEST_F(NVFuserTest, FusionVectorization3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  tv2->split(1, 16);
-  tv2->split(1, 64);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(2)->parallelize(ParallelType::TIDx);
-
-  auto c0 = tv0->cacheAfter();
-  auto c1 = tv1->cacheAfter();
-  auto c2 = tv2->cacheBefore();
-
-  c0->computeAt(tv2, -2);
-  c1->computeAt(tv2, -2);
-
-  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
-  for (auto tv : vectorized_tvs) {
-    tv->split(-1, 4);
-    tv->axis(-1)->parallelize(ParallelType::Vectorize);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int bx = 128;
-  const int by = 2049;
-  at::Tensor t0 = at::randn({bx, by}, options);
-  at::Tensor t1 = at::randn({bx, by}, options);
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
-
-  aten_inputs[0] = t0.index({"...", Slice(1)});
-  aten_inputs[1] = t1.index({"...", Slice(1)});
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
-
-  t0 = at::randn({bx, 2048}, options).index({"...", Slice(4)});
-  t1 = at::randn({bx, 2048}, options).index({"...", Slice(4)});
-  aten_inputs = {t0, t1};
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0 + t1;
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-
-  auto tv3 = sum(tv2, {-1});
-
-  fusion.addOutput(tv3);
-
-  tv3->split(-1, 128 * 4);
-  tv3->split(-1, 4);
-  // Reduce outer dim first
-  auto tv4 = tv3->rFactor({-3, -1});
-  // Tv3 will reduce threads
-
-  auto tv6 = tv0->cacheAfter();
-  auto tv7 = tv1->cacheAfter();
-
-  tv0->computeAt(tv3, 1);
-  tv1->computeAt(tv3, 1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv0->computeAt(tv4, -2);
-  tv1->computeAt(tv4, -2);
-
-  tv6->axis(-1)->parallelize(ParallelType::Vectorize);
-  tv7->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  tv4->axis(-2)->parallelize(ParallelType::TIDx);
-  tv3->axis(1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const int bx = 128;
-  const int by = 2048;
-  at::Tensor t0 = at::randn({bx, by}, options);
-  at::Tensor t1 = at::randn({bx, by}, options);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto aten_output = t0.add(t1).sum(1);
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-
-  auto t3 = t0.add(t1).sum(1);
-
-  testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__);
-}
-
-// Unswitched loops with extent one may omit else clause.
-TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Progressively broadcast tensors
-  TensorView* tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  TensorView* tv2 = makeSymbolicTensor(3);
-  fusion.addInput(tv2);
-
-  TensorView* tv3 = broadcast(tv0, {false, true});
-  TensorView* tv4 = add(tv3, tv1);
-  TensorView* tv5 = add(tv4, tv2);
-
-  fusion.addOutput(tv5);
-
-  // Split inner dimension
-  tv5->split(1, 8);
-  // Merge middle dims with outer dimensions
-  tv5->merge(2);
-  tv5->merge(0);
-
-  // tv5[I0*I1o, I1i*I2]
-  // Get a dim of size 1 to unswitch
-  tv5->split(0, 1, false);
-
-  // Compute everything inline
-  tv0->computeAt(tv5, -1);
-
-  tv5->axis(0)->parallelize(ParallelType::Unswitch);
-  tv5->axis(1)->parallelize(ParallelType::BIDx);
-  tv5->axis(2)->parallelize(ParallelType::TIDx);
-
-  // Make sure the unswitched loop does not have an else clause.
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(!UnswitchInElseChecker::check(gpulw));
-
-  const int x = 11;
-  const int y = 12;
-  const int z = 13;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({x}, options);
-  at::Tensor t1 = at::randn({x, y}, options);
-  at::Tensor t2 = at::randn({z, x, y}, options);
-  std::vector<IValue> aten_inputs = {t0, t1, t2};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-  auto t6 = (t0.unsqueeze(-1) + t1).unsqueeze(0) + t2;
-
-  testValidate(&fusion, cg_outputs, aten_inputs, {t6}, __LINE__, __FILE__);
-}
-
-// The unswitched loop has extent one but inner loops don't. The else
-// part should not be omitted.
-TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int x = 15;
-  auto tv0 = makeConcreteTensor({x});
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv1);
-
-  tv1->split(-1, 4);
-  tv1->split(-2, 1);
-
-  tv1->axis(-2)->parallelize(ParallelType::Unswitch);
-
-  // Make sure the size-one unswitched loop does not omit the else clause.
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(UnswitchInElseChecker::check(gpulw));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({x}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-  auto t1 = t0 + 1;
-
-  testValidate(&fusion, cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDy);
-
-  // Invalid as tv1 and tv2 do have the same ParallelType
-  FusionExecutor fe;
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
-}
-
-TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDy);
-  tv1->setMemoryType(MemoryType::Shared);
-
-  // tv1 and tv2 do have the same ParallelType, but tv1 is on shared
-  // memory, so it is valid
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-}
-
-TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  tv1->split(-1, 4);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->split(-1, 4);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv1->setMemoryType(MemoryType::Global);
-
-  // tv1 and tv2 have the same shape and ParallelType
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-}
-
-TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  tv1->split(-1, 4);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->split(-1, 8);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv1->setMemoryType(MemoryType::Global);
-
-  // tv1 and tv2 do not have the same shape but global memory comm is supported.
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-}
-
-TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  tv1->split(-1, 4);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->setMemoryType(MemoryType::Shared);
-
-  tv2->split(-1, 8);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  // tv1 and tv2 do not have the same shape, but tv1 is on shared
-  // memory, so it is valid
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-}
-
-// See issue #995
-TEST_F(NVFuserTest, FusionValidateParallelize6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int64_t W = 5, X = 6, Y = 7, Z = 8;
-
-  auto tv0 = makeConcreteTensor({X, Y, Z});
-  auto tv1 = makeConcreteTensor({W, X, Y, Z});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv3 = broadcast(tv2, {true, false, false, false});
-  auto tv4 = add(tv3, tv1);
-  fusion.addOutput(tv4);
-
-  tv4->merge(0);
-  tv4->merge(0);
-  tv4->merge(0);
-  tv4->split(0, 4);
-  tv4->split(0, 3);
-  tv4->split(0, 2);
-
-  TransformPropagatorWithCheck propagator(tv4);
-  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
-
-  tv0->computeAt(tv2, 2);
-  tv3->computeAt(tv4, 2);
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  // Validation should throw an exception saying the first axes of tv2
-  // and tv3 have incompatible parallelization. See also issue #995.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fusion.printKernel());
-}
-
-TEST_F(NVFuserTest, FusionDAGMerging_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(5);
-  auto tv1 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  // Branch 0
-  auto tv2 = sum(tv0, {0}); // 0
-  auto tv3 = sum(tv2, {0}); // 1
-  auto tv4 = sum(tv3, {0}); // 2
-  auto tv5 = sum(tv4, {0}); // 3
-
-  // Branch 1
-  auto tv6 = add(tv1, IrBuilder::create<Double>(1)); // 4
-
-  // Merge
-  auto tv7 = add(tv6, tv5); // 5
-
-  // Maximum expected output groups (can improve overtime):
-  //  {0}, {1}, {2}, {3,4,5}
-  //  without final merge would have been {0}, {1}, {2}, {3,4}, {5}
-
-  fusion.addOutput(tv7);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({2, 2, 2, 2, 2}, options);
-  at::Tensor t1 = at::randn({2}, options);
-
-  std::vector<at::Tensor> aten_inputs = {t0, t1};
-
-  KernelArgumentHolder args(KernelIndexMode::INT32);
-  args.setDeviceIndex(0);
-  args.push(aten_inputs);
-
-  auto fusion_segments = fusion.segment(args);
-  TORCH_CHECK(fusion_segments->groups().size() <= 4);
-}
-
-TEST_F(NVFuserTest, FusionDAGScalarMerging_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(3);
-  auto i0 = IrBuilder::create<Double>();
-
-  fusion->addInput(tv0);
-  fusion->addInput(i0);
-
-  auto i1 = add(i0, IrBuilder::create<Double>(1.0));
-  auto i2 = mul(i1, i1);
-  auto i3 = add(i2, i1);
-
-  // Branch 0
-  auto tv1 = sum(tv0, {0}); // 0
-  auto tv2 = add(tv1, i2);
-  // Branch 1
-  auto tv3 = sum(tv2, {0}); // 1
-  auto tv4 = add(tv3, i3);
-
-  auto tv5 = add(tv4, i0);
-
-  fusion->addOutput(tv5);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({16, 16, 16}, options);
-  double s0 = 0.5;
-
-  auto s1 = s0 + 1.0;
-  auto s2 = s1 * s1;
-  auto s3 = s2 + s1;
-  auto t1 = t0.sum({0});
-  auto t2 = t1 + s2;
-  auto t3 = sum(t2, {0});
-  auto t4 = t3 + s3;
-  auto t5 = t4 + s0;
-
-  auto outputs = executor_cache.runFusionWithInputs({t0, s0});
-
-  TORCH_CHECK(
-      executor_cache.getMostRecentKernelRuntime()->isSegmented(),
-      "segmentation didn't happen");
-  TORCH_CHECK(
-      executor_cache.getMostRecentKernelRuntime()
-              ->fusionSegments()
-              ->groups()
-              .size() == 2,
-      "segmentation didn't happen as expected");
-
-  testValidate(
-      executor_cache.fusion(), outputs, {t0, s0}, {t5}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  constexpr int M = 10;
-  constexpr int N = 20;
-  constexpr int K = 20;
-
-  auto tv0 = makeSymbolicTensor(3);
-  auto tv1 = sum(tv0, {{1, 2}});
-  fusion.addInput(tv0);
-  fusion.addOutput(tv1);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M, N, K}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-  at::Tensor aten_output = t0.sum({1, 2});
-  testValidate(
-      &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  constexpr int M = 10;
-  constexpr int N = 20;
-  constexpr int K = 20;
-
-  auto tv0 = makeSymbolicTensor(3);
-  auto tvs = Welford(tv0, {{1, 2}});
-  fusion.addInput(tv0);
-  auto tv_avg = tvs.avg;
-  auto tv_M2 = tvs.var_sum;
-  auto tv_N = tvs.n;
-  fusion.addOutput(tv_avg);
-  fusion.addOutput(tv_M2);
-
-  tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
-  tv_avg->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M, N, K}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-  at::Tensor aten_avg = t0.mean({1, 2});
-  at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K;
-  testValidate(
-      &fusion, outputs, aten_inputs, {aten_avg, aten_M2}, __LINE__, __FILE__);
-}
-
-// See Issue #716
-TEST_F(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  constexpr int M = 10;
-  constexpr int N = 11;
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  std::vector<int> reduction_axes = {1};
-  std::vector<bool> broadcast_mask = {false, true};
-
-  auto tv0_bcast = broadcast(tv0, broadcast_mask);
-  auto path1_bcast = add(tv0_bcast, IrBuilder::create<Double>(1.0));
-  auto path1 = sum(path1_bcast, reduction_axes);
-  fusion.addOutput(path1);
-
-  auto p = path1->split(1, 1);
-  path1->rFactor({1});
-  path1->axis(0)->parallelize(ParallelType::BIDx);
-  tv0->computeAt(path1, 1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({M}, options);
-  at::Tensor t0_ref = t0.clone();
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-
-  // inplace op, we are adding t0 to itself
-  auto outputs = fe.runFusion(aten_inputs, {t0});
-
-  TORCH_CHECK(outputs[0].allclose(t0_ref.add(1)));
-}
-
-TEST_F(NVFuserTest, FusionReductionPredicate_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {0});
-  fusion.addOutput(tv1);
-
-  auto tv2 = tv0->cacheAfter();
-
-  const int bdimx = 128;
-  tv1->split(1, bdimx);
-  tv1->split(1, 4);
-  tv1->split(1, 1);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(2)->parallelize(ParallelType::Unroll);
-  tv1->split(0, 10);
-  tv0->computeAt(tv1, 4);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  int numel_x = 650;
-  int numel_y = 102;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({numel_x, numel_y}, options);
-  at::Tensor cg_output = at::empty({numel_y}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  fe.runFusion({input}, {cg_output});
-
-  auto aten_output = input.to(at::kDouble).sum({0});
-
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue728_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addOutput(tv0);
-  auto tv1 = makeSymbolicTensor(1);
-  fusion.addOutput(tv1);
-  auto tv2 = makeSymbolicTensor(1);
-  fusion.addOutput(tv2);
-
-  auto tv3 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv4 = add(tv3, tv1);
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  auto tv6 = add(tv2, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv6);
-
-  // tv0 -> tv3 -+
-  // tv1 --------+-> tv4 -> tv5
-  //
-  // tv2 -> tv6
-
-  auto all_vals_under_tv3 =
-      DependencyCheck::getAllValsBetween({tv3}, fusion.outputs());
-  std::unordered_set<Val*> included_tensors({tv3, tv4, tv5});
-  for (auto tv : included_tensors) {
-    TORCH_CHECK(
-        std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) !=
-            all_vals_under_tv3.end(),
-        "TV",
-        tv->name(),
-        " not found");
-  }
-  for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
-    if (included_tensors.find(tv) == included_tensors.end()) {
-      TORCH_CHECK(
-          std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) ==
-              all_vals_under_tv3.end(),
-          "TV",
-          tv->name(),
-          " should not be found");
-    }
-  }
-
-  auto no_dependency = DependencyCheck::getAllValsBetween({}, fusion.outputs());
-  TORCH_CHECK(no_dependency.empty(), "No val should be returned");
-
-  auto no_dep_path = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv6});
-  TORCH_CHECK(no_dep_path.empty(), "No val should be returned");
-
-  auto no_dep_path2 = DependencyCheck::getAllValsBetween({tv2}, {tv5});
-  TORCH_CHECK(no_dep_path2.empty(), "No val should be returned");
-
-  auto just_tv3 = DependencyCheck::getAllValsBetween({tv3}, {tv3});
-  TORCH_CHECK(
-      just_tv3.size() == 1 && *(just_tv3.begin()) == tv3,
-      "Only tv3 should be included");
-}
-
-TEST_F(NVFuserTest, FusionIssue757_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = makeSymbolicTensor(2);
-  fusion.addInput(tv3);
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  tv1->computeAt(tv4, -1);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-
-  int numel_x = 650;
-  int numel_y = 102;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
-  at::Tensor t3 = at::randn({numel_x, numel_y}, options);
-  std::vector<IValue> inputs = {t0, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto outputs = fe.runFusion(inputs);
-
-  auto t1 = t0.sum({1});
-  auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y});
-  auto t4 = t2 + t3;
-
-  testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
-}
-
-// See issue #759
-TEST_F(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = makeSymbolicTensor(2);
-  fusion.addInput(tv3);
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  tv4->split(0, 4);
-  tv1->computeAt(tv4, -1);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(1)->parallelize(ParallelType::TIDy);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->axis(1)->parallelize(ParallelType::TIDy);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-
-  int numel_x = 100;
-  int numel_y = 101;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
-  at::Tensor t3 = at::randn({numel_x, numel_y}, options);
-  std::vector<IValue> inputs = {t0, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto outputs = fe.runFusion(inputs);
-
-  auto t1 = t0.sum({1});
-  auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y});
-  auto t4 = t2 + t3;
-
-  testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSegmentVerticalMerge_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(3);
-
-  fusion->addInput(tv0);
-  // {first kernel}
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = add(tv1, tv0);
-  auto tv3 = sum(tv2, {0});
-  auto tv4 = add(tv3, tv0);
-  auto tv5 = sum(tv4, {0});
-  auto tv6 = sum(tv5, {0});
-  // {second kernel}
-  auto tv7 = add(tv6, tv5);
-  auto tv8 = add(tv7, tv5);
-  auto tv9 = sum(tv8, {0});
-
-  fusion->addOutput(tv9);
-
-  SegmentCandidateFinderOptions segment_options;
-  segment_options.run_herrmann_merge = false;
-  segment_options.run_final_merge = false;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({2, 2, 2}, options);
-
-  KernelArgumentHolder args(KernelIndexMode::INT32);
-  args.setDeviceIndex(0);
-  args.push(t0);
-
-  auto segmented_fusion =
-      SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
-
-  TORCH_CHECK(segmented_fusion->groups().size() == 2);
-}
-
-TEST_F(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(3);
-  auto i0 = IrBuilder::create<Double>();
-
-  fusion->addInput(tv0);
-  fusion->addInput(i0);
-
-  // Branch 0 {first kernel}
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = add(tv0, i0);
-  auto tv3 = unaryOp(UnaryOpType::Rsqrt, tv2);
-  auto tv4 = sum(tv3, {0});
-
-  // Branch 1 {first kernel}
-  auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv3);
-  auto tv6 = sum(tv5, {0});
-
-  // Incompatible {second kernel}
-  auto tv7 = sum(tv6, {0});
-
-  fusion->addOutput(tv1);
-  fusion->addOutput(tv4);
-  fusion->addOutput(tv7);
-
-  SegmentCandidateFinderOptions segment_options;
-  segment_options.run_herrmann_merge = false;
-  segment_options.run_final_merge = false;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({2, 2, 2}, options);
-
-  KernelArgumentHolder args(KernelIndexMode::INT32);
-  args.setDeviceIndex(0);
-  args.push(t0);
-  c10::IValue scalar = 1.0;
-  args.push(scalar);
-
-  auto segmented_fusion =
-      SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
-
-  TORCH_CHECK(segmented_fusion->groups().size() == 2);
-}
-
-TEST_F(NVFuserTest, FusionSegmentMixReduction_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(3);
-
-  fusion->addInput(tv0);
-
-  // def of tv1 in kernel 1 through horizontal
-  auto tv1 = sum(tv0, {0, 1});
-  // kernel 2
-  auto tv2 = sum(tv0, {2});
-  auto tv3 = broadcast(tv2, {false, false, true});
-  auto tv4 = add(tv0, tv3);
-  auto tv5 = sum(tv4, {2});
-  // end of kernel 2
-  // kernel 1
-  auto tv6 = unaryOp(UnaryOpType::Rsqrt, tv0);
-  auto tv7 = sum(tv6, {0, 1});
-  auto tv8 = sum(tv6, {0, 1});
-
-  fusion->addOutput(tv1);
-  fusion->addOutput(tv5);
-  fusion->addOutput(tv7);
-  fusion->addOutput(tv8);
-
-  SegmentCandidateFinderOptions segment_options;
-  segment_options.run_herrmann_merge = false;
-  segment_options.run_final_merge = false;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({2, 2, 2}, options);
-
-  KernelArgumentHolder args(KernelIndexMode::INT32);
-  args.setDeviceIndex(0);
-  args.push(t0);
-
-  auto segmented_fusion =
-      SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
-
-  TORCH_CHECK(segmented_fusion->groups().size() <= 2);
-}
-
-TEST_F(NVFuserTest, FusionSBAR_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // N, H, W, C format
-  std::vector<int64_t> input_shape{656, 7, 7, 64};
-
-  auto x = makeContigTensor(4);
-  auto y = makeContigTensor(4);
-  auto weight = makeContigTensor(1);
-  auto bias = makeContigTensor(1);
-
-  fusion.addInput(x);
-  fusion.addInput(y);
-  fusion.addInput(weight);
-  fusion.addInput(bias);
-
-  const size_t kNumberOfDims = x->nDims();
-  std::vector<bool> broadcast_mask(kNumberOfDims, false);
-  for (const auto axis : c10::irange(kNumberOfDims - 1)) {
-    broadcast_mask[axis] = true;
-  }
-
-  auto weight_bcast = broadcast(weight, broadcast_mask);
-  auto scale = mul(x, weight_bcast);
-  auto bias_bcast = broadcast(bias, broadcast_mask);
-  auto scale_bias = add(scale, bias_bcast);
-  auto scale_bias_add = add(scale_bias, y);
-  auto scale_bias_add_relu = unaryOp(UnaryOpType::Relu, scale_bias_add);
-
-  fusion.addOutput(scale_bias_add_relu);
-
-  // inputs
-  at::manual_seed(0);
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor at_x = at::randn(input_shape, options);
-  at::Tensor at_y = at::randn(input_shape, options);
-  at::Tensor at_weight = at::ones({input_shape[3]}, options);
-  at::Tensor at_bias = at::zeros({input_shape[3]}, options);
-
-  // inputs
-  std::vector<c10::IValue> inputs = {at_x, at_y, at_weight, at_bias};
-
-  // outputs
-  std::vector<at::Tensor> outputs;
-
-  auto lparams = schedulePointwise(&fusion, inputs);
-
-  FusionExecutor executor;
-  executor.compileFusion(&fusion, inputs, lparams);
-  outputs = executor.runFusion(inputs, lparams);
-
-  auto at_scale = at::mul(at_x, at_weight);
-  auto at_scale_bias = at::add(at_scale, at_bias);
-  auto pwise_add = at::add(at_scale_bias, at_y);
-  auto output = at::relu(pwise_add);
-
-  testValidate(&fusion, outputs, inputs, {output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSingleElement_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(0);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(2.5));
-
-  auto tv2 = add(tv1, IrBuilder::create<Double>(3.5));
-  fusion.addOutput(tv2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({}, options);
-
-  at::Tensor cg_output = at::empty({}, options);
-
-  auto lparams = schedulePointwise(&fusion, {input});
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input}, lparams);
-  fe.runFusion({input}, {cg_output}, lparams);
-
-  auto aten_output = input.add(2.5).add(3.5);
-
-  testValidate(
-      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBNBackwardRepro_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  int batch = 4;
-  int c = 4;
-  int h = 4;
-  int w = 4;
-  int numDims = 4;
-
-  auto input = makeSymbolicTensor(numDims);
-  fusion.addInput(input);
-  auto weight = makeSymbolicTensor(1);
-  fusion.addInput(weight);
-  auto running_mean = makeSymbolicTensor(1);
-  fusion.addInput(running_mean);
-  auto running_var = makeSymbolicTensor(1);
-  fusion.addInput(running_var);
-  auto save_mean = makeSymbolicTensor(1);
-  fusion.addInput(save_mean);
-  auto save_invstd = makeSymbolicTensor(1);
-  fusion.addInput(save_invstd);
-
-  auto grad_out_prev = makeSymbolicTensor(numDims);
-  fusion.addInput(grad_out_prev);
-  auto gt_0 =
-      makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous.
-  fusion.addInput(gt_0);
-
-  auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create<Int>(1));
-  auto gt_float = castOp(DataType::Float, gt_bool);
-
-  auto grad_out = mul(grad_out_prev, gt_float);
-
-  Val* eps_ptr = IrBuilder::create<Double>(1e-5);
-
-  auto grads = batch_norm_backward(
-      input,
-      grad_out,
-      weight,
-      running_mean,
-      running_var,
-      save_mean,
-      save_invstd,
-      true,
-      eps_ptr,
-      {true, true, true});
-
-  fusion.addOutput(grads.grad_input);
-  fusion.addOutput(grads.grad_weight);
-  fusion.addOutput(grads.grad_bias);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input0 = at::randn({batch, c, h, w}, options);
-  at::Tensor input1 = at::randn({c}, options);
-  at::Tensor input2 = at::randn_like(input1);
-  at::Tensor input3 = at::randn_like(input1);
-  at::Tensor input4 = at::randn_like(input1);
-  at::Tensor input5 = at::randn_like(input1);
-  at::Tensor input6 = at::randn_like(input0);
-  at::Tensor input7 = at::randn_like(input0);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  std::vector<IValue> inputs = {
-      input0, input1, input2, input3, input4, input5, input6, input7};
-  auto outputs = fec.runFusionWithInputs(inputs);
-}
-
-// TODO: We only changed inputs, merge this with the test above.
-TEST_F(NVFuserTest, FusionBNBackwardRepro2_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  int batch = 2;
-  int c = 81;
-  int h = 1;
-  int w = 1;
-  int numDims = 4;
-
-  // auto input = makeSymbolicTensor(numDims);
-  auto input = makeConcreteTensor({-1, -1, 1, 1});
-  fusion.addInput(input);
-  auto weight = makeSymbolicTensor(1);
-  fusion.addInput(weight);
-  auto running_mean = makeSymbolicTensor(1);
-  fusion.addInput(running_mean);
-  auto running_var = makeSymbolicTensor(1);
-  fusion.addInput(running_var);
-  auto save_mean = makeSymbolicTensor(1);
-  fusion.addInput(save_mean);
-  auto save_invstd = makeSymbolicTensor(1);
-  fusion.addInput(save_invstd);
-
-  // auto grad_out_prev = makeSymbolicTensor(numDims);
-  auto grad_out_prev = makeConcreteTensor({-1, -1, 1, 1});
-  fusion.addInput(grad_out_prev);
-  // auto gt_0 =
-  //     makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous.
-  auto gt_0 = makeConcreteTensor({-1, -1, 1, 1});
-  fusion.addInput(gt_0);
-
-  auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create<Int>(1));
-  auto gt_float = castOp(DataType::Float, gt_bool);
-
-  auto grad_out = mul(grad_out_prev, gt_float);
-
-  Val* eps_ptr = IrBuilder::create<Double>(1e-5);
-
-  auto grads = batch_norm_backward(
-      input,
-      grad_out,
-      weight,
-      running_mean,
-      running_var,
-      save_mean,
-      save_invstd,
-      true,
-      eps_ptr,
-      {true, true, true});
-
-  fusion.addOutput(grads.grad_input);
-  fusion.addOutput(grads.grad_weight);
-  fusion.addOutput(grads.grad_bias);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input0 = at::randn({batch, c, h, w}, options);
-  at::Tensor input1 = at::randn({c}, options);
-  at::Tensor input2 = at::randn_like(input1);
-  at::Tensor input3 = at::randn_like(input1);
-  at::Tensor input4 = at::randn_like(input1);
-  at::Tensor input5 = at::randn_like(input1);
-  at::Tensor input6 = at::randn_like(input0);
-  at::Tensor input7 = at::randn_like(input0);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  std::vector<IValue> inputs = {
-      input0, input1, input2, input3, input4, input5, input6, input7};
-  auto outputs = fec.runFusionWithInputs(inputs);
-}
-
-TEST_F(NVFuserTest, FusionBNRepro_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  const bool kTraining = true;
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-
-  int batch = 14;
-  int c = 65;
-  int h = 7;
-  int w = 7;
-  int numDims = 4;
-
-  auto input = makeSymbolicTensor(numDims);
-  fusion.addInput(input);
-  auto weight = makeSymbolicTensor(1);
-  fusion.addInput(weight);
-  auto bias = makeSymbolicTensor(1);
-  fusion.addInput(bias);
-  auto running_mean = makeSymbolicTensor(1);
-  fusion.addInput(running_mean);
-  auto running_var = makeSymbolicTensor(1);
-  fusion.addInput(running_var);
-
-  auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
-  auto eps_ptr = IrBuilder::create<Double>(kEps);
-
-  auto result = batch_norm(
-      input,
-      weight,
-      bias,
-      running_mean,
-      running_var,
-      kTraining,
-      momentum_ptr,
-      eps_ptr);
-
-  fusion.addOutput(result.output);
-  fusion.addOutput(result.mean);
-  fusion.addOutput(result.invstd);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({batch, c, h, w}, options);
-  at::Tensor input2 = at::randn({c}, options);
-  at::Tensor input3 = at::randn_like(input2);
-  at::Tensor input4 = at::randn_like(input2);
-  at::Tensor input5 = at::randn_like(input2);
-
-  auto input1_ref = input1.clone();
-  auto input2_ref = input2.clone();
-  auto input3_ref = input3.clone();
-  auto input4_ref = input4.clone();
-  auto input5_ref = input5.clone();
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  std::vector<IValue> aten_inputs = {input1, input2, input3, input4, input5};
-  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
-
-  auto at_results = at::native_batch_norm(
-      input1_ref,
-      input2_ref,
-      input3_ref,
-      input4_ref,
-      input5_ref,
-      kTraining,
-      kMomentum,
-      kEps);
-
-  auto at_output = std::get<0>(at_results);
-  auto at_mean = std::get<1>(at_results);
-  auto at_invstd = std::get<2>(at_results);
-
-  std::vector<at::Tensor> aten_outputs = {at_output, at_mean, at_invstd};
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBNRepro2_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  const bool kTraining = true;
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-
-  int batch = 2;
-  int c = 4;
-  int h = 17;
-  int w = 17;
-  int numDims = 4;
-
-  auto input = makeSymbolicTensor(numDims);
-  fusion.addInput(input);
-
-  Val* momentum_ptr = IrBuilder::create<Double>(kMomentum);
-  Val* eps_ptr = IrBuilder::create<Double>(kEps);
-
-  auto result = batch_norm(
-      input,
-      nullptr,
-      nullptr,
-      nullptr,
-      nullptr,
-      kTraining,
-      momentum_ptr,
-      eps_ptr);
-
-  fusion.addOutput(result.output);
-  fusion.addOutput(result.mean);
-  fusion.addOutput(result.invstd);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({batch, c, h, w}, options);
-
-  auto input1_ref = input1.clone();
-  at::Tensor r_m;
-  at::Tensor r_v;
-  at::Tensor weight;
-  at::Tensor bias;
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  std::vector<IValue> aten_inputs = {input1};
-  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
-
-  auto at_results = at::native_batch_norm(
-      input1_ref, r_m, r_v, weight, bias, kTraining, kMomentum, kEps);
-
-  auto at_output = std::get<0>(at_results);
-  auto at_mean = std::get<1>(at_results);
-  auto at_invstd = std::get<2>(at_results);
-
-  std::vector<at::Tensor> aten_outputs = {at_output, at_mean, at_invstd};
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionZeroSizeTensorPW_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = makeConcreteTensor({0});
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(2.5));
-  fusion.addOutput(tv2);
-
-  // This test used to just have:
-  // auto tv3 = makeConcreteTensor({0});
-  // and somehow that was running through our system fine, but size-0 tensors
-  // are not supported, so making sure this fails.
-  auto tv3 = set(tv1);
-  fusion.addOutput(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input0 = at::randn({2}, options);
-  at::Tensor input1 = at::randn({0}, options);
-  at::Tensor cg_output2 = at::empty({2}, options);
-  at::Tensor cg_output3 = at::empty({0}, options);
-
-  // Fails at schedule pointwise because our (maybe only) size-0 check is in
-  // binding input sizes which the scheduler ends up calling.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(schedulePointwise(&fusion, {input0, input1}));
-}
-
-TEST_F(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = makeConcreteTensor({0});
-  fusion.addInput(tv1);
-
-  auto tv2 = sum(tv0, {1});
-  fusion.addOutput(tv2);
-
-  auto tv3 = makeConcreteTensor({0});
-  fusion.addOutput(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input0 = at::randn({2, 4}, options);
-  at::Tensor input1 = at::randn({0}, options);
-  at::Tensor cg_output2 = at::empty({2}, options);
-  at::Tensor cg_output3 = at::empty({0}, options);
-
-  auto reduction_params = getReductionHeuristics(&fusion, {input0, input1});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fusion, *reduction_params);
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-
-  auto lparams = reduction_params->lparams;
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input0, input1}, lparams);
-  auto cg_outputs = fe.runFusion({input0, input1}, lparams);
-  auto aten_output2 = input0.sum({1});
-  at::Tensor aten_output3 = at::empty({0}, options);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {input0, input1},
-      {aten_output2, aten_output3},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = makeConcreteTensor({0});
-  fusion.addInput(tv1);
-
-  auto tv2 = sum(tv0, {0});
-  auto tv3 = broadcast(tv2, {true, false});
-  auto tv4 = add(tv0, tv3);
-  fusion.addOutput(tv4);
-
-  auto tv5 = makeConcreteTensor({0});
-  fusion.addOutput(tv5);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input0 = at::randn({2, 4}, options);
-  at::Tensor input1 = at::randn({0}, options);
-  at::Tensor cg_output2 = at::empty({2, 4}, options);
-  at::Tensor cg_output3 = at::empty({0}, options);
-
-  auto reduction_params = getPersistentHeuristics(&fusion, {input0, input1});
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  schedulePersistentKernel(&fusion, *reduction_params);
-
-  auto lparams = reduction_params->lparams;
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input0, input1}, lparams);
-  auto cg_outputs = fe.runFusion({input0, input1}, lparams);
-  auto aten_output2 = input0.sum({0}).add(input0);
-  at::Tensor aten_output3 = at::empty({0}, options);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {input0, input1},
-      {aten_output2, aten_output3},
-      __LINE__,
-      __FILE__,
-      "",
-      lparams);
-}
-
-TEST_F(NVFuserTest, FusionSegmentIoAlias_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(1);
-  TensorView* tv2 = makeSymbolicTensor(2);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  fusion->addInput(tv2);
-
-  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
-  TensorView* tv4 =
-      max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
-  TensorView* tv5 = add(tv4, tv1); //  Group 0 (Non Broadcast after reduce,
-                                   //  keeps normalization scheduler away)
-  TensorView* tv6 = add(tv5, tv2); //  Group 1 (Broadcast after reduce)
-
-  // Note: test alias;
-  fusion->aliasOutputToInput(tv6, tv0);
-  // TODO: support output on aliased fusion #1488
-  // remove tv7 after #1488
-  // fusion->addOutput(tv6);
-  TensorView* tv7 = add(tv6, IrBuilder::create<Double>(1)); // Group 0
-  fusion->addOutput(tv7);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({128, 65}, options);
-  at::Tensor t1 = at::randn({65}, options);
-  at::Tensor t2 = at::randn({128, 65}, options);
-
-  auto t3 = t0.add(1.0);
-  auto t4 = std::get<0>(at::max(t3, 0));
-  auto t5 = t4.add(t1);
-  auto t6 = t5.add(t2);
-  auto t7 = t6.add(1.0);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
-
-  // TODO: support output on aliased fusion #1488
-  // validating aliasing
-  // TORCH_INTERNAL_ASSERT(outputs[0].data_ptr() == t0.data_ptr());
-
-  TORCH_CHECK(
-      executor_cache.getMostRecentKernelRuntime()->isSegmented(),
-      "segmentation didn't happen");
-  TORCH_CHECK(
-      executor_cache.getMostRecentKernelRuntime()
-              ->fusionSegments()
-              ->groups()
-              .size() == 2,
-      "segmentation didn't happen as expected");
-
-  testValidate(
-      executor_cache.fusion(), outputs, {t0, t1, t2}, {t7}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionWelford1Output_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-
-  auto tvs = Welford(tv0, {1});
-  fusion->addOutput(tvs.var_sum);
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({128, 65}, options);
-  auto outputs = executor_cache.runFusionWithInputs({t0});
-
-  auto t1 = t0.var({1}, false) * 65;
-  testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTranslate1Welford_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-
-  auto tvs = Welford(tv0, {1});
-  auto tv_out = add(tv0, broadcast(tvs.avg, {false, true}));
-  fusion->addOutput(tv_out);
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
-
-  auto run_test = [&executor_cache,
-                   fusion](auto inner_size) -> FusionKernelRuntime* {
-    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-    at::Tensor t0 = at::randn({128, inner_size}, options);
-    auto outputs = executor_cache.runFusionWithInputs({t0});
-    // Square sums does not fit well in the testValidate assumptions,
-    //  so we just compare the divided output here.
-    testValidate(
-        fusion,
-        outputs,
-        {t0},
-        {t0.add(t0.mean({1}).unsqueeze(1))},
-        __LINE__,
-        __FILE__);
-
-    return executor_cache.getMostRecentKernelRuntime();
-  };
-
-  // Run a translated welford
-  auto runtime1 = run_test(64);
-  // Check it was translated
-  TORCH_CHECK(
-      runtime1->fusionSegments()->groups().size() == 1 &&
-      runtime1->fusionSegments()->groups()[0]->exprs().size() > 2);
-
-  // Run an un-translated welford
-  auto runtime2 = run_test(65536);
-
-  bool found_welford = false;
-  for (auto group : runtime2->fusionSegments()->groups()) {
-    for (auto expr : group->exprs()) {
-      if (expr->isA<WelfordOp>()) {
-        found_welford = true;
-      }
-    }
-  }
-  TORCH_CHECK(found_welford);
-}
-
-TEST_F(NVFuserTest, FusionTranslate2Welford_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-
-  auto tvs1 = Welford(tv0, {1});
-  auto tv_out1 = add(tv0, broadcast(tvs1.avg, {false, true}));
-  fusion->addOutput(tv_out1);
-
-  auto tvs2 = Welford(tv0, {1});
-  auto tv_out2 = add(tv0, broadcast(tvs2.avg, {false, true}));
-  fusion->addOutput(tv_out2);
-
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
-
-  auto run_test = [&executor_cache,
-                   fusion](auto inner_size) -> FusionKernelRuntime* {
-    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-    at::Tensor t0 = at::randn({128, inner_size}, options);
-    auto outputs = executor_cache.runFusionWithInputs({t0});
-
-    // Square sums does not fit well in the testValidate assumptions,
-    //  so we just compare the divided output here.
-    auto out = t0.add(t0.mean({1}).unsqueeze(1));
-    testValidate(fusion, outputs, {t0}, {out, out}, __LINE__, __FILE__);
-
-    return executor_cache.getMostRecentKernelRuntime();
-  };
-
-  // Run a translated welford
-  auto runtime1 = run_test(64);
-  // Check it was translated
-  TORCH_CHECK(
-      runtime1->fusionSegments()->groups().size() == 1 &&
-      runtime1->fusionSegments()->groups()[0]->exprs().size() > 4);
-
-  // Run an un-translated welford
-  auto runtime2 = run_test(65536);
-  // // Check it was not translated
-  bool found_welford = false;
-  for (auto group : runtime2->fusionSegments()->groups()) {
-    for (auto expr : group->exprs()) {
-      if (expr->isA<WelfordOp>()) {
-        found_welford = true;
-      }
-    }
-  }
-  TORCH_CHECK(found_welford);
-}
-
-TEST_F(NVFuserTest, FusionLargeWelfordNormalization_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-
-  auto tvs1 = Welford(tv0, {1});
-  auto sum_of_tv0 = sum(tv0, {1});
-
-  fusion->addOutput(tvs1.var_sum);
-  fusion->addOutput(sum_of_tv0);
-
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
-
-  auto run_test = [&executor_cache,
-                   fusion](auto inner_size) -> FusionKernelRuntime* {
-    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-    at::Tensor t0 = at::randn({128, inner_size}, options);
-    auto outputs = executor_cache.runFusionWithInputs({t0});
-
-    auto t1 = t0.var({1}, false) * inner_size;
-    auto t2 = t0.sum({1});
-    testValidate(fusion, outputs, {t0}, {t1, t2}, __LINE__, __FILE__);
-
-    return executor_cache.getMostRecentKernelRuntime();
-  };
-
-  auto runtime = run_test(65536);
-  TORCH_CHECK(!runtime->isSegmented());
-}
-
-TEST_F(NVFuserTest, FusionWelfordOuterPersistence_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-
-  auto tvs1 = Welford(tv0, {1});
-  auto sum_of_tv0 = sum(tv0, {1});
-  auto sum_bcasted = broadcast(sum_of_tv0, {false, true});
-  auto avg_bcasted = broadcast(tvs1.avg, {false, true});
-  auto tv0_plus_sum = add(tv0, sum_bcasted);
-  auto tv0_plus_avg = add(tv0, avg_bcasted);
-
-  fusion->addOutput(tv0_plus_sum);
-  fusion->addOutput(tv0_plus_avg);
-
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
-
-  auto run_test = [&executor_cache,
-                   fusion](auto inner_size) -> FusionKernelRuntime* {
-    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-    at::Tensor t0 = at::randn({128, inner_size}, options);
-    auto outputs = executor_cache.runFusionWithInputs({t0});
-
-    auto t1 = t0.to(c10::kDouble).mean({1}).unsqueeze(1) + t0;
-    auto t2 = t0.to(c10::kDouble).sum({1}).unsqueeze(1) + t0;
-    testValidate(fusion, outputs, {t0}, {t2, t1}, __LINE__, __FILE__);
-
-    return executor_cache.getMostRecentKernelRuntime();
-  };
-
-  for (auto inner_size : {4096, 8192, 32768}) {
-    auto runtime = run_test(inner_size);
-    TORCH_CHECK(!runtime->isSegmented());
-  }
-}
-
-TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  auto tv2 = sum(tv0, {0});
-  auto tv3 = sum(tv1, {1});
-  fusion->addOutput(tv2);
-  fusion->addOutput(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({16, 16}, options);
-  at::Tensor t1 = at::randn({16, 16}, options);
-
-  FusionExecutorCache fusion_executor_cache(std::move(fusion));
-  fusion_executor_cache.runFusionWithInputs({t0, t1});
-}
-
-TEST_F(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(1);
-  auto tv1 = makeSymbolicTensor(2);
-  auto tv2 = makeSymbolicTensor(4);
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  auto tv3 = broadcast(tv0, {false, true, true, true});
-  auto tv4 = broadcast(tv1, {false, false, true, true});
-  auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv2);
-
-  auto tv6 = add(tv3, tv5);
-  auto tv7 = add(tv4, tv5);
-  auto tv8 = add(tv3, tv4);
-
-  auto tv9 = add(tv6, tv7);
-  auto tv10 = add(tv9, tv8);
-
-  fusion->addOutput(tv10);
-
-  tv0->computeAt(tv10, -2);
-  tv1->computeAt(tv10, -2);
-  tv2->computeAt(tv10, -2);
-
-  TORCH_CHECK(tv3->getComputeAtPosition() == 1);
-  TORCH_CHECK(tv4->getComputeAtPosition() == 2);
-  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
-
-  TORCH_CHECK(tv6->getMaxProducerPosition() == 3);
-  TORCH_CHECK(tv7->getMaxProducerPosition() == 3);
-  TORCH_CHECK(tv8->getMaxProducerPosition() == 2);
-}
-
-TEST_F(NVFuserTest, FusionBackOffInnerBroadcast2_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(3);
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  auto tv2 = broadcast(tv0, {false, false, true});
-  auto tv3 = add(tv2, tv1);
-
-  fusion->addOutput(tv3);
-  tv3->split(-2, 4);
-  tv3->reorder({{-1, -2}});
-  tv0->computeAt(tv3, -2);
-  tv1->computeAt(tv3, -2);
-  TORCH_CHECK(tv2->getComputeAtPosition() == 2);
-  TORCH_CHECK(tv3->getMaxProducerPosition() == 2);
-}
-
-TEST_F(NVFuserTest, FusionBackOffInnerBroadcast3_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(4);
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  auto tv2 = broadcast(tv0, {false, false, true});
-  auto tv3 = broadcast(tv2, {false, true, false, false});
-  auto tv4 = add(tv3, tv1);
-
-  fusion->addOutput(tv4);
-  tv0->computeAt(tv4, -1);
-  tv1->computeAt(tv4, -1);
-  TORCH_CHECK(tv2->getComputeAtPosition() == 2);
-  TORCH_CHECK(tv3->getMaxProducerPosition() == 3);
-}
-
-TEST_F(NVFuserTest, FusionSimpleWarp_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = add(tv2, tv0);
-
-  fusion->addOutput(tv3);
-
-  tv1->split(1, 32);
-  auto tv1_rf = tv1->rFactor({1});
-  TransformPropagatorWithCheck propagator(tv1_rf);
-  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
-  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({16, 128}, options);
-
-  auto at_output = input1.sum({1}, true).add(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-
-  testValidate(
-      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSimpleWarpPad_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-
-  fusion->addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = add(tv2, tv0);
-
-  fusion->addOutput(tv3);
-
-  // Schedule a persistent kernel
-  auto tv0_cache = tv0->cacheAfter();
-  tv1->split(1, 8, false);
-  auto tv1_rf = tv1->rFactor({1});
-  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
-  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1_rf->axis(-1)->padToMultipleOfWarp(32);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->padToMultipleOfWarp(32);
-  TransformPropagatorWithCheck propagator(tv1_rf);
-  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
-  tv0->axis(-1)->parallelize(ParallelType::TIDx);
-  tv0->axis(-1)->padToMultipleOfWarp(32);
-  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
-  tv0_cache->axis(-1)->padToMultipleOfWarp(32);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->padToMultipleOfWarp(32);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->padToMultipleOfWarp(32);
-
-  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({16, 127}, options);
-
-  auto at_output = input1.sum({1}, true).add(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-  testValidate(
-      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionWarpPadMergeSplit_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(3);
-
-  fusion->addInput(tv0);
-
-  auto tv1 = sum(tv0, {1, 2});
-  auto tv2 = broadcast(tv1, {false, true, true});
-  auto tv3 = add(tv2, tv0);
-
-  fusion->addOutput(tv3);
-
-  // Schedule a persistent kernel
-  auto tv0_cache = tv0->cacheAfter();
-  tv1->merge(1);
-  tv1->split(1, 8, false);
-
-  auto tv1_rf = tv1->rFactor({1});
-  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
-  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->padToMultipleOfWarp();
-  TransformPropagatorWithCheck propagator(tv1_rf);
-  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
-  tv0->axis(-1)->parallelize(ParallelType::TIDx);
-  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({16, 17, 128}, options);
-
-  auto at_output = input1.sum({1, 2}, true).add(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-  testValidate(
-      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSerialWarpReduction_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(3);
-
-  fusion->addInput(tv0);
-
-  auto tv1 = sum(tv0, {1, 2});
-  auto tv2 = broadcast(tv1, {false, true, true});
-  auto tv3 = add(tv2, tv0);
-
-  fusion->addOutput(tv3);
-
-  // Schedule a persistent kernel
-  auto tv0_cache = tv0->cacheAfter();
-  tv1->merge(1);
-  tv1->split(1, 8, false);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->padToMultipleOfWarp();
-  TransformPropagatorWithCheck propagator(tv1);
-  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
-  tv0->axis(-1)->parallelize(ParallelType::TIDx);
-  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({16, 17, 128}, options);
-
-  auto at_output = input1.sum({1, 2}, true).add(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-  testValidate(
-      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTrivialWarpReduction_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeConcreteTensor({17, 18, 128, 1});
-
-  fusion->addInput(tv0);
-
-  auto tv1 = sum(tv0, {1, 2, 3});
-  auto tv2 = broadcast(tv1, {false, true, true, true});
-  auto tv3 = add(tv2, tv0);
-
-  fusion->addOutput(tv3);
-
-  // Schedule a persistent kernel
-  auto tv0_cache = tv0->cacheAfter();
-  tv1->merge(1);
-  tv1->split(1, 8, false);
-
-  auto tv1_rf = tv1->rFactor({1});
-  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
-  tv1_rf->axis(-2)->parallelize(ParallelType::TIDx);
-  tv1->axis(-2)->parallelize(ParallelType::TIDx);
-  tv1->axis(-2)->padToMultipleOfWarp();
-  TransformPropagatorWithCheck propagator(tv1_rf);
-  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
-  tv0->axis(-2)->parallelize(ParallelType::TIDx);
-  tv0_cache->axis(-2)->parallelize(ParallelType::TIDx);
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-  tv3->axis(-2)->parallelize(ParallelType::TIDx);
-
-  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({17, 18, 128, 1}, options);
-
-  auto at_output = input1.sum({1, 2, 3}, true).add(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-  testValidate(
-      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMultipleDimBinding_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv_add = makeSymbolicTensor(2);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv_add);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = add(tv2, tv0);
-  auto tv4 = add(tv0, tv_add);
-
-  fusion->addOutput(tv3);
-  fusion->addOutput(tv4);
-
-  // Schedule a persistent kernel
-  auto tv0_cache = tv0->cacheAfter();
-  tv1->split(1, 8, false);
-  auto tv1_rf = tv1->rFactor({1});
-  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
-  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1_rf->axis(-1)->padToMultipleOfWarp(32);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->padToMultipleOfWarp(32);
-  TransformPropagatorWithCheck propagator(tv1_rf);
-  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
-  tv0->axis(-1)->parallelize(ParallelType::TIDx);
-  tv0->axis(-1)->padToMultipleOfWarp(32);
-  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
-  tv0_cache->axis(-1)->padToMultipleOfWarp(32);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->padToMultipleOfWarp(32);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->padToMultipleOfWarp(32);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->axis(-1)->padToMultipleOfWarp(64);
-
-  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({16, 128}, options);
-  at::Tensor input2 = at::randn({16, 128}, options);
-
-  auto at_output = input1.sum({1}, true).add(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1, input2});
-  auto outputs = fe.runFusion({input1, input2});
-  testValidate(
-      fusion.get(),
-      outputs,
-      {input1, input2},
-      {at_output, input1 + input2},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionPadNoWarpReduce_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-
-  fusion->addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = add(tv2, tv0);
-
-  fusion->addOutput(tv3);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->padToMultipleOfWarp();
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv1->axis(0)->parallelize(ParallelType::TIDy);
-  tv2->axis(0)->parallelize(ParallelType::TIDy);
-  tv3->axis(0)->parallelize(ParallelType::TIDy);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({16, 31}, options);
-
-  auto at_output = input1.sum({1}, true).add(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-  testValidate(
-      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {1});
-  fusion->addOutput(tv2);
-
-  tv2->split(1, 8);
-  auto tv2_rf = tv2->rFactor({-1});
-  tv2_rf->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2_rf->axis(-1)->padToMultipleOfWarp();
-
-  TransformPropagatorWithCheck propagator(tv2_rf);
-  MaxRootDomainInfoSpanningTree(tv2_rf).traverse(&propagator);
-
-  tv0->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::TIDy);
-  tv0->computeAt(tv2, 2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({16, 31}, options);
-
-  auto at_output = (input1 + 1).sum({1});
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-  testValidate(
-      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-
-  fusion->addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = add(tv2, tv0);
-
-  fusion->addOutput(tv3);
-
-  // Schedule a persistent kernel
-  auto tv0_cache = tv0->cacheAfter();
-  tv1->split(1, 8, false);
-  tv1->split(0, 4);
-  auto tv1_rf = tv1->rFactor({2});
-
-  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
-  tv1_rf->axis(1)->parallelize(ParallelType::Unroll);
-  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->padToMultipleOfWarp();
-  tv1->axis(1)->parallelize(ParallelType::Unroll);
-  TransformPropagatorWithCheck propagator(tv1_rf);
-  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
-  tv0->axis(-1)->parallelize(ParallelType::TIDx);
-  tv0->axis(1)->parallelize(ParallelType::Unroll);
-  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
-  tv0_cache->axis(1)->parallelize(ParallelType::Unroll);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(1)->parallelize(ParallelType::Unroll);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(1)->parallelize(ParallelType::Unroll);
-
-  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({16, 128}, options);
-
-  auto at_output = input1.sum({1}, true).add(input1);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-  testValidate(
-      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
-}
-
-// Repro of issue #1579
-TEST_F(NVFuserTest, FusionWarpReducePredication_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> shape1 = {1024};
-  std::vector<int64_t> shape2 = {50};
-
-  auto tv0 = makeConcreteTensor(shape1);
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {0});
-  fusion.addOutput(tv1);
-
-  auto tv2 = makeConcreteTensor(shape2);
-  fusion.addInput(tv2);
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  auto tv4 = sum(tv3, {0});
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-
-  // Just to fill the smem buffer by a thread block of 1024 threads
-  // with some values
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-
-  // Make the tv4_rf reduction a warp reduction to trigger the
-  // bug. Since the smem buffer is filled with some values due to the
-  // reduction of tv1, those values would be used by predicated-out
-  // threads.
-  tv4->split(-1, 10);
-  auto tv4_rf = tv4->rFactor({-1});
-  tv4_rf->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4_rf->axis(-1)->padToMultipleOfWarp();
-
-  tv4_rf->computeAt(tv4, 1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn(shape1, options);
-  auto t2 = at::randn(shape2, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t2});
-  auto cg_outputs = fe.runFusion({t0, t2});
-
-  auto t1 = t0.sum({0});
-  auto t4 = (t2 + 1).sum({0}) + 1;
-
-  testValidate(&fusion, cg_outputs, {t0, t2}, {t1, t4}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSegfaultReduction_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  int batch = 2;
-  int c = 1;
-  int h = 1;
-  int w = 1;
-  int numDims = 4;
-
-  auto input = makeConcreteTensor({-1, 1, 1, 1});
-  fusion.addInput(input);
-  auto bcast_bias = makeConcreteTensor({-1, 1, 1, 1});
-  fusion.addInput(bcast_bias);
-
-  std::vector<int64_t> at_sum_axes;
-  std::vector<int> outer_reduction_axes;
-  std::vector<bool> outer_broadcast_mask(numDims, false);
-  Val* N = IrBuilder::create<Double>(1);
-  for (const auto axis : c10::irange(numDims)) {
-    if (axis != 1) {
-      outer_reduction_axes.push_back(axis);
-      at_sum_axes.push_back(axis);
-      outer_broadcast_mask[axis] = true;
-      N = mul(N, input->domain()->domain()[axis]->extent());
-    }
-  }
-
-  auto output0 = mul(input, bcast_bias);
-  fusion.addOutput(output0);
-  auto output1 = sum(output0, outer_reduction_axes);
-  fusion.addOutput(output1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input0 = at::randn({batch, c, h, w}, options);
-  at::Tensor input1 = at::randn({batch, c, h, w}, options);
-
-  auto at_output0 = input0.mul(input1);
-  auto at_output1 = at_output0.sum(at_sum_axes);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  std::vector<IValue> inputs = {input0, input1};
-  auto outputs = fec.runFusionWithInputs(inputs);
-
-  testValidate(
-      &fusion, outputs, inputs, {at_output0, at_output1}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionPredicateElimination1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(3));
-
-  fusion.addOutput(tv3);
-
-  tv3->split(0, 32);
-  tv0->computeAt(tv3, 1);
-
-  tv2->axis(1)->parallelize(ParallelType::Unswitch);
-
-  {
-    GpuLower gpulw(&fusion);
-    TORCH_CHECK(!PredicatedChecker::isPredicated(tv2, gpulw));
-  }
-
-  tv2->axis(1)->parallelize(ParallelType::Serial);
-  tv2->split(1, 5);
-
-  {
-    GpuLower gpulw(&fusion);
-    TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
-  }
-}
-
-// Repro of issue #1571
-TEST_F(NVFuserTest, FusionPredicateElimination2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> shape({10, 11});
-
-  auto tv0 = makeConcreteTensor(shape);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {1});
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv3);
-
-  tv1->split(1, 4);
-  tv1->split(0, 4);
-  tv2->split(1, 4);
-  tv2->split(0, 4);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn(shape, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = (t0 + 1).sum({1}) + 1;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionPredicateElimination3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  auto tv3 = tv0->cacheAfter();
-
-  tv1->split(0, 10);
-  tv1->split(0, 33);
-  TransformPropagatorWithCheck propagator(tv1);
-  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
-
-  auto tv4 = tv1->rFactor({-1});
-  auto tv5 = tv1->rFactor({-1});
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(1)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(tv4);
-
-  GpuLower gpulw(&fusion);
-
-  // The fusion has three reductions: one within each thread, one
-  // within each block, and another with the whole grid. All of them
-  // should not need to be predicated as they use the same init value
-  // and same reduction op.
-  TORCH_CHECK(!PredicatedChecker::isPredicated(tv4, gpulw));
-  TORCH_CHECK(!PredicatedChecker::isPredicated(tv5, gpulw));
-  TORCH_CHECK(!PredicatedChecker::isPredicated(tv1, gpulw));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  for (auto size : {1, 2, 999, 1001, 1234, 10000}) {
-    auto t0 = at::randn({size}, options);
-
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t0});
-    auto cg_outputs = fe.runFusion({t0});
-
-    auto ref = sum(t0) + 1;
-    testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-  }
-}
-
-TEST_F(NVFuserTest, FusionPredicateElimination4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-
-  auto tv2 = sum(tv1, {0});
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv3);
-
-  auto tv4 = max(tv1, {0});
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-
-  tv1->split(1, 7);
-  tv1->split(0, 11);
-  tv1->reorder({{1, 2}, {2, 1}});
-  TransformPropagatorWithCheck propagator(tv1);
-  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
-
-  tv1->axis(0)->parallelize(ParallelType::TIDy);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(tv1);
-
-  GpuLower gpulw(&fusion);
-
-  // tv2 uses the same op and init with tv1, so tv2 should be fine
-  // without a predicate. However, tv4, while it uses the tv1 as its
-  // input, the reduction op and init value is different from those of
-  // tv1, so tv4 needs to be predicated.
-  TORCH_CHECK(!PredicatedChecker::isPredicated(tv2, gpulw));
-  TORCH_CHECK(PredicatedChecker::isPredicated(tv4, gpulw));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  std::vector<int64_t> sizes = {1, 2, 33, 34, 64, 99};
-  for (auto s0 : sizes) {
-    for (auto s1 : sizes) {
-      auto t0 = at::randn({s0, s1}, options);
-
-      FusionExecutor fe;
-      fe.compileFusion(&fusion, {t0});
-      auto cg_outputs = fe.runFusion({t0});
-
-      auto t1 = t0.sum({1});
-      auto t3 = t1.sum({0}) + 1;
-      auto t5 = std::get<0>(t1.max(0)) + 1;
-
-      testValidate(&fusion, cg_outputs, {t0}, {t3, t5}, __LINE__, __FILE__);
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionPredicateElimination5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  auto tvs2 = Welford(tv1, {0});
-  auto tv3 = set(tvs2.avg);
-  fusion.addOutput(tv3);
-
-  tvs2.avg->split(0, 4);
-  TransformPropagatorWithCheck propagator(tvs2.avg);
-  MaxRootDomainInfoSpanningTree(tvs2.avg).traverse(&propagator);
-  auto avg_rf = ir_utils::rfactorHelper(tvs2.avg, {1});
-
-  avg_rf->axis(0)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(avg_rf);
-
-  GpuLower gpulw(&fusion);
-
-  // The first per-thread welford needs to be predicated as the N
-  // input is different from its init value. The second welford op
-  // does not need a predicate.
-  TORCH_CHECK(PredicatedChecker::isPredicated(avg_rf, gpulw));
-  TORCH_CHECK(!PredicatedChecker::isPredicated(tvs2.avg, gpulw));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  std::vector<int64_t> sizes = {1, 2, 33, 34, 64, 99};
-  for (auto s0 : sizes) {
-    auto t0 = at::randn({s0}, options);
-
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t0});
-    auto cg_outputs = fe.runFusion({t0});
-
-    auto ref = t0.mean({0});
-
-    testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-  }
-}
-
-TEST_F(NVFuserTest, FusionPredicateElimination6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 3});
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv4);
-
-  tv4->split(1, 5);
-  TransformPropagatorWithCheck propagator(tv4);
-  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
-
-  tv4->reorder({{0, 1}, {1, 0}});
-  tv3->computeAt(tv4, 1);
-
-  GpuLower gpulw(&fusion);
-
-  // The expression for tv2 is a local-to-local expression. It
-  // satisfies all the requirements of predicate elimination, except
-  // for the on on split root domains. As the second root axis of tv2
-  // is split, its index exceeds its extent (i.e., 3 in this case)
-  // without its predicate.
-  TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
-
-  // Unlike tv2, tv3 is computed at tv4, so the second root axis does
-  // have a zero domain. Its index should look like "i * 5 + j", where
-  // i comes from the first root domain and j comes from the split
-  // inner domain.
-  TORCH_CHECK(!PredicatedChecker::isPredicated(tv3, gpulw));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({2, 3}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 4;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionPredicateElimination7_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv3);
-
-  tv3->split(-1, 5);
-  tv3->split(-1, 4);
-  tv3->split(-1, 3);
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-  tv0->computeAt(tv3, 1);
-
-  // The last split of tv2 is a non-divisible split, and omitting it
-  // is invalid.
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({123}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 3;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionForceFp16Simple_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(2);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  // Group 1
-  auto tv2 = sum(tv0, {1});
-  auto tv3 = broadcast(tv2, {false, true});
-
-  // Group 2
-  auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast
-  auto tv5 = castOp(DataType::Half, tv4);
-
-  fusion->addOutput(tv5);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-
-  std::vector<int64_t> shape{15, 16};
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn(shape, options);
-  auto in1 = at::randn(shape, options);
-  fec.runFusionWithInputs({in0, in1});
-
-  // Check the segmented edge is fp16
-  auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
-  for (auto edge : segmented_fusion->edges()) {
-    auto edge_tv = edge->val->as<TensorView>();
-    TORCH_CHECK(edge_tv->getDataType() == DataType::Half);
-  }
-}
-
-TEST_F(NVFuserTest, FusionForceBf16Simple_CUDA) {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-  // requires ampere+ GPU
-  if (!deviceMajorMinorCheck(8)) {
-    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
-    return;
-  }
-
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(2);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  // Group 1
-  auto tv2 = sum(tv0, {1});
-  auto tv3 = broadcast(tv2, {false, true});
-
-  // Group 2
-  auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast
-  auto tv5 = castOp(DataType::BFloat16, tv4);
-
-  fusion->addOutput(tv5);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-
-  std::vector<int64_t> shape{15, 16};
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn(shape, options);
-  auto in1 = at::randn(shape, options);
-  fec.runFusionWithInputs({in0, in1});
-
-  // Check the segmented edge is bf16
-  auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
-  for (auto edge : segmented_fusion->edges()) {
-    auto edge_tv = edge->val->as<TensorView>();
-    TORCH_CHECK(edge_tv->getDataType() == DataType::BFloat16);
-  }
-#else
-  GTEST_SKIP() << "requires cuda 11.0 or newer toolkit";
-#endif
-}
-
-TEST_F(NVFuserTest, FusionForceFp16NotAllCast_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeSymbolicTensor(3);
-  auto tv1 = makeSymbolicTensor(3);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  // Group 1
-  auto tv3 = sum(tv0, {1});
-  auto tv4 = broadcast(tv3, {false, true, false});
-  auto tv5 = sum(tv0, {1});
-
-  // Group 2
-  auto tv6 = add(tv4, tv1); // edge tv4, expect cast
-  auto tv7 = castOp(DataType::Half, tv6);
-
-  // Group 3
-  auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast
-
-  fusion->addOutput(tv7);
-  fusion->addOutput(tv8);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-
-  std::vector<int64_t> shape{16, 16, 16};
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn(shape, options);
-  auto in1 = at::randn(shape, options);
-  fec.runFusionWithInputs({in0, in1});
-
-  auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
-  auto complete_fusion = segmented_fusion->completeFusion();
-
-  // Check that the edge that wasn't fp16 is the producer of the
-  //  reduction op, i.e. tv8 = sum(tv5,{1});.
-  for (auto edge : segmented_fusion->edges()) {
-    auto edge_tv = edge->val->as<TensorView>();
-    if (edge_tv->getDataType() == DataType::Float) {
-      auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
-      TORCH_CHECK(consumer->isA<ReductionOp>());
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionForceBf16NotAllCast_CUDA) {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-  // requires ampere+ GPU
-  if (!deviceMajorMinorCheck(8)) {
-    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
-    return;
-  }
-
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeSymbolicTensor(3);
-  auto tv1 = makeSymbolicTensor(3);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  // Group 1
-  auto tv3 = sum(tv0, {1});
-  auto tv4 = broadcast(tv3, {false, true, false});
-  auto tv5 = sum(tv0, {1});
-
-  // Group 2
-  auto tv6 = add(tv4, tv1); // edge tv4, expect cast
-  auto tv7 = castOp(DataType::BFloat16, tv6);
-
-  // Group 3
-  auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast
-
-  fusion->addOutput(tv7);
-  fusion->addOutput(tv8);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-
-  std::vector<int64_t> shape{16, 16, 16};
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn(shape, options);
-  auto in1 = at::randn(shape, options);
-  fec.runFusionWithInputs({in0, in1});
-
-  auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
-  auto complete_fusion = segmented_fusion->completeFusion();
-
-  // Check that the edge that wasn't fp16 is the producer of the
-  //  reduction op, i.e. tv8 = sum(tv5,{1});.
-  for (auto edge : segmented_fusion->edges()) {
-    auto edge_tv = edge->val->as<TensorView>();
-    if (edge_tv->getDataType() == DataType::Float) {
-      auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
-      TORCH_CHECK(consumer->isA<ReductionOp>());
-    }
-  }
-#else
-  GTEST_SKIP() << "requires cuda 11.0 or newer toolkit";
-#endif
-}
-
-TEST_F(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeConcreteTensor({2, 2});
-  auto tv1 = makeConcreteTensor({2, 2, 2});
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
-  auto tv3 = broadcast(tv2, {false, false, true});
-  auto tv4 = add(tv3, tv1);
-  auto tv5 = mul(tv4, IrBuilder::create<Double>(3));
-  fusion->addOutput(tv5);
-
-  // t4 cannot inner re-use t2, because there's a broadcast
-  //  between them.
-  tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort);
-  tv3->computeAt(tv5, 2, ComputeAtMode::BestEffort);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn({2, 2}, options);
-  auto in1 = at::randn({2, 2, 2}, options);
-
-  auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3;
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {in0, in1});
-  auto outputs = fe.runFusion({in0, in1});
-
-  testValidate(fusion, outputs, {in0, in1}, {at_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBufferReuseStressTest_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeConcreteTensor({2, 2});
-  auto tv1 = makeConcreteTensor({2, 2, 2});
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
-  auto tv3 = mul(tv0, IrBuilder::create<Double>(3));
-  auto tv4 = mul(tv2, tv3);
-  // Broadcast buffer can be reused through outer sharing
-  auto tv5 = broadcast(tv4, {true, false, false});
-  auto tv6 = mul(tv5, IrBuilder::create<Double>(5));
-  auto tv7 = mul(tv6, tv1);
-  auto tv8 = mul(tv7, IrBuilder::create<Double>(7));
-  // tv9 shouldn't alias to avoid buffer over-subscription
-  auto tv9 = broadcast(tv4, {true, false, false});
-  auto tv10 = mul(tv9, IrBuilder::create<Double>(9));
-  auto tv11 = add(tv5, tv9);
-  fusion->addOutput(tv7);
-  fusion->addOutput(tv11);
-
-  tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort);
-  tv0->computeAt(tv9, 1, ComputeAtMode::BestEffort);
-
-  tv5->computeAt(tv7, 1, ComputeAtMode::BestEffort);
-  tv5->computeAt(tv11, 1, ComputeAtMode::BestEffort);
-  tv9->computeAt(tv11, 1, ComputeAtMode::BestEffort);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn({2, 2}, options);
-  auto in1 = at::randn({2, 2, 2}, options);
-  auto t2 = in0 * 2;
-  auto t3 = in0 * 3;
-  auto t4 = t2 * t3;
-  auto t5 = t4.unsqueeze(0);
-  auto t6 = t5 * 5;
-  auto t7 = t6 * in1;
-  auto t8 = t7 * 7;
-  auto t9 = t4.unsqueeze(0);
-  auto t10 = t9 * 9;
-  auto t11 = t5 + t9;
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {in0, in1});
-
-  auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3;
-  auto outputs = fe.runFusion({in0, in1});
-
-  testValidate(fusion, outputs, {in0, in1}, {t7, t11}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeConcreteTensor({256, 512});
-
-  fusion->addInput(tv0);
-
-  auto tv1 = mul(tv0, IrBuilder::create<Double>(2));
-  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
-  auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
-  auto tv4 = mul(tv3, IrBuilder::create<Double>(2));
-  auto tv5 = mul(tv4, IrBuilder::create<Double>(2));
-  auto tv6 = mul(tv5, IrBuilder::create<Double>(2));
-
-  fusion->addOutput(tv6);
-
-  tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort);
-  tv6->axis(0)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn({256, 512}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {in0});
-  auto outputs = fe.runFusion({in0});
-
-  auto at_out = in0.mul(2).mul(2).mul(2).mul(2).mul(2).mul(2);
-
-  testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBufferReuseNo2hop_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeConcreteTensor({2, 2});
-  auto tv1 = makeConcreteTensor({2, 2, 2});
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
-  auto tv3 = broadcast(tv2, {false, false, true});
-  auto tv4 = add(tv3, tv1); // T4 to be inner aliased first, and
-                            //  shouldn't outer alias on top
-  auto tv5 = mul(tv4, IrBuilder::create<Double>(3));
-  auto tv6 = mul(tv5, IrBuilder::create<Double>(3));
-  fusion->addOutput(tv6);
-
-  tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort);
-  tv4->computeAt(tv6, 2, ComputeAtMode::BestEffort);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn({2, 2}, options);
-  auto in1 = at::randn({2, 2, 2}, options);
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {in0, in1});
-  auto outputs = fe.runFusion({in0, in1});
-
-  auto at_out = (in0.mul(2.0).unsqueeze(2) + in1).mul(3.0).mul(3.0);
-
-  testValidate(fusion, outputs, {in0, in1}, {at_out}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeConcreteTensor({3, 3, 3});
-
-  fusion->addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
-  auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
-
-  fusion->addOutput(tv3);
-
-  // In this case tv1 "reuses" allocation of tv2
-  //  due to the switched allocation order
-  tv1->computeAt(tv2, 1, ComputeAtMode::BestEffort);
-
-  tv0->axis(0)->parallelize(ParallelType::TIDx);
-  tv1->axis(0)->parallelize(ParallelType::TIDx);
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  tv3->axis(0)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn({3, 3, 3}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {in0});
-  auto outputs = fe.runFusion({in0});
-
-  auto at_out = in0.sum(1).mul(2).mul(2);
-
-  testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeConcreteTensor({16, 16});
-
-  fusion->addInput(tv0);
-
-  auto tv1 = mul(tv0, IrBuilder::create<Double>(3));
-  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
-  auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
-  // tv1 used till here, cannot be reused by tv2 or tv3
-  auto tv4 = mul(tv3, tv1);
-
-  fusion->addOutput(tv4);
-
-  tv0->computeAt(tv4, 1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn({16, 16}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {in0});
-  auto cg_outputs = fe.runFusion({in0});
-
-  auto at_t0 = in0 * 3.0;
-  auto at_out = at_t0 * 2.0 * 2.0 * at_t0;
-
-  testValidate(fusion, cg_outputs, {in0}, {at_out}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  auto tv0 = makeConcreteTensor({2, 2});
-  auto tv1 = makeConcreteTensor({2, 2, 2});
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
-  auto tv3 = mul(tv0, IrBuilder::create<Double>(3));
-  auto tv4 = mul(tv2, tv3);
-  auto tv5 = broadcast(tv4, {false, false, true});
-  auto tv6 = mul(tv5, tv1);
-  auto tv7 = mul(tv6, IrBuilder::create<Double>(7));
-  fusion->addOutput(tv7);
-
-  // tv6 shouldn't re-use t2 or t3 because of
-  //  the broadcast in between
-  tv0->computeAt(tv4, 1, ComputeAtMode::BestEffort);
-  tv4->computeAt(tv7, 2, ComputeAtMode::BestEffort);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto in0 = at::randn({2, 2}, options);
-  auto in1 = at::randn({2, 2, 2}, options);
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {in0, in1});
-  auto outputs = fe.runFusion({in0, in1});
-
-  auto t2 = in0 * 2;
-  auto t3 = in0 * 3;
-  auto t4 = t2 * t3;
-  auto t5 = t4.unsqueeze(2);
-  auto t6 = t5 * in1;
-  auto t7 = t6 * 7;
-  testValidate(fusion, outputs, {in0, in1}, {t7}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue970_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int nelm = 10;
-
-  // tv3 = tv0 + sum(tv0)
-  auto tv0 = makeConcreteTensor({nelm, nelm});
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = add(tv2, tv0);
-  fusion.addOutput(tv3);
-
-  tv1->split(1, 4);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({nelm, nelm}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
-
-  auto ref = sum(t0, {1}).unsqueeze(-1).expand({nelm, nelm}) + t0;
-
-  testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Reproducer of #1016
-TEST_F(NVFuserTest, FusionIssue1016_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
-
-  fusion.addOutput(tv2);
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  tv2->split(-1, 8);
-
-  int numel_x = 10;
-  int numel_y = 11;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
-  std::vector<IValue> inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto outputs = fe.runFusion(inputs);
-
-  auto ref = t0 + 1 + 2;
-
-  testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Reproducer of #1021
-TEST_F(NVFuserTest, FusionIssue1021_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = broadcast(tv1, {false, true});
-  fusion.addOutput(tv2);
-
-  auto tv3 = tv2->cacheBefore();
-
-  tv2->split(0, 2);
-
-  tv1->computeAt(tv2, 1);
-
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  tv2->axis(1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({10}, options);
-  std::vector<IValue> inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, inputs);
-  auto outputs = fe.runFusion(inputs);
-
-  auto ref = (t0 + 1).unsqueeze(-1);
-
-  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
-}
-
-// Reproducer of issue #1053
-TEST_F(NVFuserTest, FusionNonUniqueThreadDim_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-  auto tv1 = sum(tv0, {0});
-  fusion->addOutput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  fusion->addOutput(tv2);
-
-  tv1->split(0, 8);
-  auto tv1_rf = tv1->rFactor({-1});
-
-  tv1_rf->computeAt(tv1, 1);
-
-  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({32}, options);
-
-  auto at_tv1 = (input1).sum({0});
-  auto at_tv2 = input1 + 1;
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-  testValidate(
-      fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionParallelDimensionMap1_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  fusion->addOutput(tv1);
-  fusion->addOutput(tv2);
-
-  tv1->split(0, 8, false);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-  tv2->split(0, 8, false);
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-
-  // The extents of tv1 and tv2 axes are equal even though their
-  // actual values are not statically known
-  GpuLower gpulw(fusion.get());
-  const auto& pdmap = gpulw.parallelDimensionMap();
-  for (const auto i : c10::irange(tv1->domain()->domain().size())) {
-    auto dom1 = tv1->domain()->domain()[i];
-    auto dom2 = tv2->domain()->domain()[i];
-    TORCH_INTERNAL_ASSERT(pdmap.equalDim(dom1->extent(), dom2->extent()));
-  }
-
-  TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
-  TORCH_CHECK(
-      pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
-      pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({32}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-
-  testValidate(
-      fusion.get(),
-      outputs,
-      {input1},
-      {input1 + 1, input1 + 1},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionParallelDimensionMap2_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion->addInput(tv1);
-  auto tv2 = broadcast(tv0, {false, true});
-  auto tv3 = add(tv1, tv2);
-  fusion->addOutput(tv3);
-
-  tv3->split(-1, 8, false);
-  tv2->computeAt(tv3, -1);
-
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  GpuLower gpulw(fusion.get());
-  const auto& pdmap = gpulw.parallelDimensionMap();
-  TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
-  TORCH_CHECK(
-      pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
-      pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({11}, options);
-  at::Tensor input2 = at::randn({11, 13}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1, input2});
-  auto outputs = fe.runFusion({input1, input2});
-
-  auto ref = input1.unsqueeze(-1) + input2;
-
-  testValidate(
-      fusion.get(), outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
-}
-
-// Mix symbolic and concrete tensors
-TEST_F(NVFuserTest, FusionParallelDimensionMap3_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  fusion->addOutput(tv2);
-  auto tv3 = add(tv0, IrBuilder::create<Double>(1));
-  fusion->addOutput(tv3);
-
-  tv2->split(0, 10);
-  tv3->split(0, 20);
-
-  auto tv4 = add(tv0, IrBuilder::create<Double>(1));
-  fusion->addOutput(tv4);
-  auto tv5 = add(tv0, IrBuilder::create<Double>(1));
-  fusion->addOutput(tv5);
-
-  // Not mapped but equal extent
-  tv4->split(0, 10);
-  tv5->split(0, 10);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv4->axis(-1)->parallelize(ParallelType::TIDy);
-  tv5->axis(-1)->parallelize(ParallelType::TIDy);
-
-  GpuLower gpulw(fusion.get());
-  const auto& pdmap = gpulw.parallelDimensionMap();
-  TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
-  TORCH_CHECK(
-      pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
-      pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
-  TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
-  TORCH_CHECK(
-      pdmap.get(ParallelType::TIDy)->isConst() &&
-      pdmap.get(ParallelType::TIDy)->as<Int>()->value().value() == 10);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({13}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {input1});
-  auto outputs = fe.runFusion({input1});
-
-  testValidate(
-      fusion.get(),
-      outputs,
-      {input1},
-      {input1 + 1, input1 + 1, input1 + 1, input1 + 1},
-      __LINE__,
-      __FILE__);
-}
-
-// Parallelizing merged broadcast domains
-TEST_F(NVFuserTest, FusionParallelDimensionMap4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv3 = broadcast(tv2, {true, false});
-  auto tv4 = add(tv3, tv1);
-  fusion.addOutput(tv4);
-
-  tv4->split(1, 4);
-  tv4->reorder({{1, 2}, {2, 1}});
-  tv4->merge(0);
-  tv0->computeAt(tv4, 1);
-  tv1->computeAt(tv4, 1);
-
-  // TIDx is mapped to tv4.axis(0) as well as tv2.axis(0), so it's not
-  // exact.
-  tv4->axis(0)->parallelize(ParallelType::TIDx);
-
-  tv2->setMemoryType(MemoryType::Shared);
-  tv3->setMemoryType(MemoryType::Shared);
-
-  GpuLower gpulw(&fusion);
-  const auto& pdmap = gpulw.parallelDimensionMap();
-  TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
-  TORCH_CHECK(
-      pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
-      pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({13}, options);
-  at::Tensor input2 = at::randn({15, 13}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1, input2});
-  auto outputs = fe.runFusion({input1, input2});
-
-  auto ref = (input1 + 1).unsqueeze(0) + input2;
-
-  testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionParallelDimensionMap5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  auto tv3 = broadcast(tv0, {false, true});
-  auto tv4 = add(tv3, tv1);
-  fusion.addOutput(tv4);
-
-  tv4->split(1, 4);
-  tv0->computeAt(tv4, -1);
-  tv1->computeAt(tv4, -1);
-
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->axis(-2)->parallelize(ParallelType::TIDy);
-  tv3->axis(-2)->parallelize(ParallelType::TIDy);
-
-  GpuLower gpulw(&fusion);
-  const auto& pdmap = gpulw.parallelDimensionMap();
-  TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
-  TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
-  TORCH_CHECK(
-      pdmap.get(ParallelType::TIDx)->isConst() &&
-      pdmap.get(ParallelType::TIDx)->as<Int>()->value().value() == 4);
-  TORCH_CHECK(
-      pdmap.get(ParallelType::TIDy)->isA<NamedScalar>() &&
-      pdmap.get(ParallelType::TIDy)->as<NamedScalar>()->name() == "blockDim.y");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input1 = at::randn({13}, options);
-  at::Tensor input2 = at::randn({13, 15}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input1, input2});
-  auto outputs = fe.runFusion({input1, input2});
-
-  auto ref = (input1).unsqueeze(-1) + input2;
-
-  testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  auto& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  auto t0 = makeSymbolicTensor(3, DataType::Float);
-  auto t1 = makeSymbolicTensor(3, DataType::Half);
-  auto t3 = makeSymbolicTensor(3, DataType::Half);
-  auto t5 = makeSymbolicTensor(3, DataType::Half);
-  auto t7 = makeSymbolicTensor(1, DataType::Half);
-  auto t11 = makeSymbolicTensor(3, DataType::Half);
-  auto t13 = makeSymbolicTensor(3, DataType::Half);
-  auto t15 = makeSymbolicTensor(3, DataType::Half);
-  auto t17 = makeSymbolicTensor(3, DataType::Half);
-  auto d56 = IrBuilder::create<Double>();
-
-  fusion.addInput(t0);
-  fusion.addInput(t1);
-  fusion.addInput(t3);
-  fusion.addInput(t5);
-  fusion.addInput(t7);
-  fusion.addInput(t11);
-  fusion.addInput(t13);
-  fusion.addInput(t15);
-  fusion.addInput(t17);
-  fusion.addInput(d56);
-
-  auto t2 = castOp(DataType::Float, t1);
-  auto t4 = castOp(DataType::Float, t3);
-  auto t22 = sub(t2, t4);
-  auto t6 = castOp(DataType::Float, t5);
-  auto t23 = mul(t22, t6);
-  auto t16 = castOp(DataType::Float, t15);
-  auto t18 = castOp(DataType::Float, t17);
-  auto t19 = add(t16, t18);
-  auto t14 = castOp(DataType::Float, t13);
-  auto t20 = add(t19, t14);
-  auto t12 = castOp(DataType::Float, t11);
-  auto t21 = add(t20, t12);
-  auto t8 = castOp(DataType::Float, t7);
-  auto t24 = broadcast(t8, {true, true, false});
-  auto t25 = mul(t21, t24);
-  auto t27 = sum(t25, {2});
-  auto t28 = broadcast(t27, {false, false, true});
-  auto t29 = mul(t25, t23);
-  auto t30 = sum(t29, {2});
-  auto t31 = broadcast(t30, {false, false, true});
-  auto d59 =
-      mul(t1->getRootDomain()[2]->extent(), IrBuilder::create<Double>(1));
-  auto t26 = mul(d59, t25);
-  auto txx = mul(t26, IrBuilder::create<Double>(1));
-  auto t33 = sub(txx, t28);
-  auto d70 = unaryOp(UnaryOpType::Reciprocal, d59);
-  auto t35 = mul(d70, t6);
-  auto t39 = sum(t21, {0, 1});
-  auto t47 = castOp(DataType::Half, t39);
-  auto t37 = mul(t21, t23);
-  auto t38 = sum(t37, {0, 1});
-  auto t46 = castOp(DataType::Half, t38);
-  auto t32 = mul(t23, t31);
-  auto t34 = sub(t33, t32);
-  auto t36 = mul(t35, t34);
-  auto t45 = castOp(DataType::Half, t36);
-  auto t40 = mul(t36, t0);
-  auto t41 = mul(t40, d56);
-  auto t44 = castOp(DataType::Half, t41);
-  auto t42 = sum(t41, {0, 1});
-  auto t43 = castOp(DataType::Half, t42);
-
-  fusion.addOutput(t43);
-  fusion.addOutput(t44);
-  fusion.addOutput(t45);
-  fusion.addOutput(t46);
-  fusion.addOutput(t47);
-
-  auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  auto options_float =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor at_t0 = at::randn({128, 64, 1024}, options_float);
-  at::Tensor at_t1 = at::randn({128, 64, 1024}, options_half);
-  at::Tensor at_t3 = at::randn({128, 64, 1024}, options_half);
-  at::Tensor at_t5 = at::randn({128, 64, 1024}, options_half);
-  at::Tensor at_t7 = at::randn({1024}, options_half);
-  at::Tensor at_t11 = at::randn({128, 64, 1024}, options_half);
-  at::Tensor at_t13 = at::randn({128, 64, 1024}, options_half);
-  at::Tensor at_t15 = at::randn({128, 64, 1024}, options_half);
-  at::Tensor at_t17 = at::randn({128, 64, 1024}, options_half);
-  double at_d56 = 1.1111;
-
-  std::vector<at::Tensor> aten_inputs = {
-      at_t0, at_t1, at_t3, at_t5, at_t7, at_t11, at_t13, at_t15, at_t17};
-
-  c10::IValue val = at_d56;
-
-  KernelArgumentHolder args(KernelIndexMode::INT32);
-  args.setDeviceIndex(0);
-  args.push(aten_inputs);
-  args.push(val);
-
-  for (auto _ : c10::irange(5)) {
-    auto segmented_fusion =
-        SegmentCandidateFinder::segment(fusion_ptr.get(), args);
-  }
-}
-
-TEST_F(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  auto tv3 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv4);
-
-  auto tv5 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv6);
-
-  // Case 1: local memory tensor computed serially and used by
-  // parallel threads
-  tv2->split(-1, 4);
-  tv1->computeAt(tv2, -2);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  // Case 2: shared memory tensor computed serially and used by BID
-  tv4->split(-1, 4);
-  tv3->computeAt(tv4, -2);
-  tv4->axis(-1)->parallelize(ParallelType::BIDx);
-  tv3->setMemoryType(MemoryType::Shared);
-
-  // Case 3: shared memory tensor computed by TID and used by BID
-  tv6->split(-1, 4);
-  tv5->computeAt(tv6, -2);
-  tv6->axis(-1)->parallelize(ParallelType::BIDx);
-  tv5->axis(-1)->parallelize(ParallelType::TIDx);
-  tv5->setMemoryType(MemoryType::Shared);
-
-  const int nx = 11;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({nx}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref = t0 + 2;
-
-  testValidate(
-      &fusion, outputs, aten_inputs, {ref, ref, ref}, __LINE__, __FILE__);
-}
-
-// Repro of issue #1105
-TEST_F(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv3);
-
-  tv1->setMemoryType(MemoryType::Shared);
-  tv2->setMemoryType(MemoryType::Shared);
-
-  tv3->split(0, 4);
-  tv0->computeAt(tv3, 1);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDy);
-  tv3->axis(-1)->parallelize(ParallelType::TIDz);
-
-  // Make sure a WAR sync is inserted at the end of the outer loop
-  GpuLower gpulw(&fusion);
-  for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) {
-    if (auto loop = dynamic_cast<kir::ForLoop*>(kir_node)) {
-      const auto& body = loop->body().exprs();
-      TORCH_CHECK(!body.empty());
-      auto last_expr = dynamic_cast<kir::BlockSync*>(body.back());
-      TORCH_CHECK(last_expr != nullptr, "Invalid expr found");
-      TORCH_CHECK(last_expr->isWarHazardSync(), "Not a sync for WAR hazard");
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({17}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref1 = t0 + 3;
-
-  testValidate(&fusion, outputs, aten_inputs, {ref1}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue1099_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  auto tv3 = makeSymbolicTensor(1);
-  fusion.addInput(tv3);
-
-  // Just to make TIDx/y/z non-exact
-  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv6);
-
-  tv2->split(0, 4);
-  tv0->computeAt(tv2, 1);
-
-  tv0->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->axis(-1)->parallelize(ParallelType::TIDy);
-  tv2->axis(-1)->parallelize(ParallelType::TIDz);
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  tv4->split(0, 5);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv4->setMemoryType(MemoryType::Shared);
-  tv5->split(0, 6);
-  tv5->axis(-1)->parallelize(ParallelType::TIDy);
-  tv5->setMemoryType(MemoryType::Shared);
-  tv6->split(0, 7);
-  tv6->axis(-1)->parallelize(ParallelType::TIDz);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({17}, options);
-  at::Tensor t3 = at::randn({19}, options);
-  std::vector<IValue> aten_inputs = {t0, t3};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref_t2 = t0 + 2;
-  auto ref_t3 = t3 + 3;
-
-  testValidate(
-      &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__);
-}
-
-// Repro of issue #1080
-TEST_F(NVFuserTest, FusionUnswitchPredicate_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  tv2->split(0, 4);
-  tv0->computeAt(tv2, 2);
-
-  tv2->split(-1, 8);
-  tv1->split(-1, 8);
-
-  tv2->axis(1)->parallelize(ParallelType::Unswitch);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-2)->parallelize(ParallelType::TIDy);
-
-  // swap TIDx and TIDy
-  tv1->axis(-1)->parallelize(ParallelType::TIDy);
-  tv1->axis(-2)->parallelize(ParallelType::TIDx);
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  const int nx = 4;
-  const int ny = 10;
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({nx, ny}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref = t0 + 2;
-
-  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue1189_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({16, 16});
-  auto tv1 = makeConcreteTensor({16, 16});
-
-  auto tv0b = broadcast(tv0, {false, false, true});
-  auto tv1b = broadcast(tv1, {false, false, true});
-
-  fusion.addInput(tv0b);
-  fusion.addInput(tv1b);
-
-  auto tv2 = add(tv0b, tv1b);
-  auto tv3 = sum(tv2, {1});
-  fusion.addOutput(tv3);
-
-  auto parallelize = [](auto tv) {
-    tv->axis(0)->parallelize(ParallelType::TIDx);
-    tv->axis(1)->parallelize(ParallelType::BIDx);
-    tv->axis(2)->parallelize(ParallelType::BIDy);
-  };
-
-  parallelize(tv0b);
-  parallelize(tv1b);
-  parallelize(tv2);
-  parallelize(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({16, 16, 1}, options);
-  at::Tensor t1 = at::randn({16, 16, 1}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto outputs = fe.runFusion({t0, t1});
-
-  auto ref = (t0 + t1).sum({1});
-
-  testValidate(&fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue1052_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(1);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  auto tv3 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv3);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  scheduler_utils::parallelizeAllLike(tv2, {tv0});
-  scheduler_utils::parallelizeAllLike(tv3, {tv1});
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({10}, options);
-  at::Tensor t1 = at::randn({100}, options);
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref_t2 = t0 + 1;
-  auto ref_t3 = t1 + 1;
-
-  testValidate(
-      &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__);
-}
-
-// Repro of issue #1115
-TEST_F(NVFuserTest, FusionPointwiseBroadcast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> input_shape{3, 17, 80};
-  std::vector<int64_t> output_shape{3, 17, 1, 80};
-
-  TensorView* x = makeSymbolicTensor(input_shape.size());
-  TensorView* bias = makeSymbolicTensor(input_shape.size());
-  fusion.addInput(x);
-  fusion.addInput(bias);
-
-  auto x_add_bias = add(x, bias);
-  auto x_bcast = broadcast(x_add_bias, {false, false, true, false});
-  auto y = gelu(x_bcast);
-  fusion.addOutput(y);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor at_x = at::randn(input_shape, options);
-  at::Tensor at_bias = at::randn(input_shape, options);
-  std::vector<IValue> aten_inputs = {at_x, at_bias};
-
-  schedulePointwise(&fusion, aten_inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto at_x_add_bias = at_x + at_bias;
-  auto at_x_view = at::native::view(at_x_add_bias, output_shape);
-  auto aten_y = at::gelu(at_x_view);
-
-  testValidate(&fusion, outputs, aten_inputs, {aten_y}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionPointwiseVectorize_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int size = 1024 * 64;
-
-  TensorView* x = makeContigTensor(1);
-  fusion.addInput(x);
-  auto y = sin(x);
-  fusion.addOutput(y);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // PyTorch's CUDA caching allocator should always return aligned pointer for
-  // freshly allocated tensor
-  at::Tensor at_x = at::randn({size}, options);
-
-  schedulePointwise(&fusion, {at_x});
-
-  for (auto x_consumer : ir_utils::consumerTvsOf(x)) {
-    bool found_vec_in_input = false;
-    for (auto id : x_consumer->domain()->domain()) {
-      if (isParallelTypeVectorize(id->getParallelType())) {
-        found_vec_in_input = true;
-        break;
-      }
-    }
-    TORCH_CHECK(found_vec_in_input, "Expect input to be vectorized");
-  }
-
-  for (auto id : y->domain()->domain()) {
-    if (isParallelTypeVectorize(id->getParallelType())) {
-      return;
-    }
-  }
-  TORCH_CHECK(false, "Expect output to be vectorized");
-}
-
-TEST_F(NVFuserTest, FusionSmemAliasSerial_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv3);
-
-  // Just set the dimension of TIDx
-  auto tv4 = makeSymbolicTensor(1);
-  fusion.addInput(tv4);
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-
-  tv1->setMemoryType(MemoryType::Shared);
-  tv2->setMemoryType(MemoryType::Shared);
-
-  tv5->axis(0)->parallelize(ParallelType::TIDx);
-
-  // tv1 and tv2 are on shared memory and are not parallelized with
-  // TIDx. They should be predicated as they are redundant and can
-  // interfere with smem aliasing (issue #1100).
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({10}, options);
-  at::Tensor t4 = at::randn({1024}, options);
-  std::vector<IValue> aten_inputs = {t0, t4};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref1 = t0 + 3;
-  auto ref2 = t4 + 1;
-
-  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv1);
-
-  auto tv2 = makeSymbolicTensor(1);
-  fusion.addInput(tv2);
-  auto tv3 = sum(tv2, {0});
-  fusion.addOutput(tv3);
-
-  tv1->axis(0)->parallelize(ParallelType::TIDx);
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({17}, options);
-  at::Tensor t2 = at::randn({19}, options);
-  std::vector<IValue> aten_inputs = {t0, t2};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref1 = t0 + 1;
-  auto ref2 = sum(t2);
-
-  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv1);
-
-  auto tv2 = makeSymbolicTensor(1);
-  fusion.addInput(tv2);
-  auto tv3 = Welford(tv2, {0}).avg;
-  fusion.addOutput(tv3);
-
-  tv1->axis(0)->parallelize(ParallelType::TIDx);
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({17}, options);
-  at::Tensor t2 = at::randn({19}, options);
-  std::vector<IValue> aten_inputs = {t0, t2};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref1 = t0 + 1;
-  auto ref2 = mean(t2, {0});
-
-  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0, 1});
-  fusion.addOutput(tv1);
-
-  auto tv2 = makeSymbolicTensor(3);
-  fusion.addInput(tv2);
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv3);
-
-  auto tv4 = makeSymbolicTensor(3);
-  fusion.addInput(tv4);
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(0)->parallelize(ParallelType::TIDx);
-  tv3->axis(1)->parallelize(ParallelType::TIDy);
-  tv3->axis(2)->parallelize(ParallelType::TIDz);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDx);
-  tv5->axis(1)->parallelize(ParallelType::BIDy);
-  tv5->axis(2)->parallelize(ParallelType::BIDz);
-
-  // TODO: This needs a fix for issue #1102.
-  // Also, need to allow predicated grid reductions.
-#if 0
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({2, 3}, options);
-  at::Tensor t2 = at::randn({5, 6, 7}, options);
-  at::Tensor t4 = at::randn({8, 9, 10}, options);
-  std::vector<IValue> aten_inputs = {t0, t2, t4};
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref1 = t0.sum(at::IntArrayRef{0, 1});
-  auto ref2 = t2 + 1;
-  auto ref3 = t4 + 1;
-
-  testValidate(
-      &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__);
-#endif
-}
-
-TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tvs = Welford(tv0, {0, 1});
-  fusion.addOutput(tvs.avg);
-
-  auto tv2 = makeSymbolicTensor(3);
-  fusion.addInput(tv2);
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv3);
-
-  auto tv4 = makeSymbolicTensor(3);
-  fusion.addInput(tv4);
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-
-  tvs.avg->axis(0)->parallelize(ParallelType::BIDx);
-  tvs.avg->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(0)->parallelize(ParallelType::TIDx);
-  tv3->axis(1)->parallelize(ParallelType::TIDy);
-  tv3->axis(2)->parallelize(ParallelType::TIDz);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDx);
-  tv5->axis(1)->parallelize(ParallelType::BIDy);
-  tv5->axis(2)->parallelize(ParallelType::BIDz);
-
-  // TODO: needs a fix for issue #1102
-  // Also, need to allow predicated grid reductions.
-#if 0
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({2, 3}, options);
-  at::Tensor t2 = at::randn({5, 6, 7}, options);
-  at::Tensor t4 = at::randn({8, 9, 10}, options);
-  std::vector<IValue> aten_inputs = {t0, t2, t4};
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref1 = t0.mean(at::IntArrayRef{0, 1});
-  auto ref2 = t2 + 1;
-  auto ref3 = t4 + 1;
-
-  testValidate(
-      &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__);
-#endif
-}
-
-// Repro of issue #1102
-TEST_F(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  // Just to make TIDx/y/z non-exact
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv3);
-
-  auto tv4 = makeSymbolicTensor(1);
-  fusion.addInput(tv4);
-
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
-  auto tv7 = add(tv6, IrBuilder::create<Double>(1));
-  auto tv8 = add(tv7, IrBuilder::create<Double>(1));
-  auto tv9 = sum(tv8, {0});
-  fusion.addOutput(tv9);
-
-  tv1->split(0, 5);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv1->setMemoryType(MemoryType::Shared);
-  tv2->split(0, 6);
-  tv2->axis(-1)->parallelize(ParallelType::TIDy);
-  tv2->setMemoryType(MemoryType::Shared);
-  tv3->split(0, 7);
-  tv3->axis(-1)->parallelize(ParallelType::TIDz);
-
-  tv9->split(0, 4);
-  tv4->computeAt(tv9, 1);
-
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv5->axis(-1)->parallelize(ParallelType::TIDy);
-  tv6->axis(-1)->parallelize(ParallelType::TIDz);
-  tv7->axis(-1)->parallelize(ParallelType::TIDz);
-  tv8->axis(-1)->parallelize(ParallelType::TIDz);
-  tv9->axis(-1)->parallelize(ParallelType::TIDz);
-  tv9->axis(0)->parallelize(ParallelType::BIDx);
-
-  tv5->setMemoryType(MemoryType::Shared);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({17}, options);
-  at::Tensor t4 = at::randn({19}, options);
-  std::vector<IValue> aten_inputs = {t0, t4};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref1 = t0 + 3;
-  auto ref2 = sum(t4 + 4);
-
-  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
-}
-
-// Repro of #1102 and #1129
-TEST_F(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) {
-  if (!deviceMajorMinorCheck(7)) {
-    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
-    return;
-  }
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(1);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-
-  // Just to make TIDx/y/z non-exact
-  auto tvx = add(tv1, IrBuilder::create<Double>(1));
-  auto tvy = add(tvx, IrBuilder::create<Double>(1));
-  auto tvz = add(tvy, IrBuilder::create<Double>(1));
-  fusion.addOutput(tvz);
-
-  tv5->split(0, 4);
-  tv0->computeAt(tv5, 1);
-
-  tv0->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDy);
-  tv3->axis(-1)->parallelize(ParallelType::TIDz);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv5->axis(-1)->parallelize(ParallelType::TIDy);
-  tv5->axis(0)->parallelize(ParallelType::Unswitch);
-
-  tvx->split(0, 5);
-  tvx->axis(-1)->parallelize(ParallelType::TIDx);
-  tvy->split(0, 6);
-  tvy->axis(-1)->parallelize(ParallelType::TIDy);
-  tvz->split(0, 7);
-  tvz->axis(-1)->parallelize(ParallelType::TIDz);
-
-  for (auto tv : {tv2, tv3, tv4, tvx, tvy}) {
-    tv->setMemoryType(MemoryType::Shared);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({17}, options);
-  at::Tensor t1 = at::randn({19}, options);
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref1 = t0 + 4;
-  auto ref2 = t1 + 3;
-
-  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
-}
-
-// Repro of issue #1136
-TEST_F(NVFuserTest, FusionFloatPow_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(4));
-  // To check if pow(tv0, 2) is replaced with tv0 * tv0
-  auto tv2 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(2));
-  // To check if pow(tv0, 2.0) is replaced with tv0 * tv0
-  auto tv3 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Double>(2));
-  auto tv4 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(3));
-  auto tv5 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Double>(3));
-  auto s = binaryOp(
-      BinaryOpType::Pow,
-      IrBuilder::create<Double>(3),
-      IrBuilder::create<Double>(3));
-  auto tv6 = add(tv0, s);
-
-  fusion.addOutput(tv1);
-  fusion.addOutput(tv2);
-  fusion.addOutput(tv3);
-  fusion.addOutput(tv4);
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv6);
-
-  tv1->split(0, 32);
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-
-  TransformPropagatorWithCheck propagator(tv1);
-  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
-  scheduler_utils::parallelizeAllLike(tv1, {tv2, tv3, tv4, tv5, tv6});
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({1000}, options);
-  // Negative inputs cause nan in Fuesr as use_fast_math is enabled
-  t0 = abs(t0);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto p4 = at::pow(t0, 4);
-  auto p2 = at::pow(t0, 2);
-  auto p3 = at::pow(t0, 3);
-  auto t6 = t0 + std::pow(3, 3);
-
-  testValidate(
-      &fusion,
-      outputs,
-      aten_inputs,
-      {p4, p2, p2, p3, p3, t6},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue1127_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  const int numel = 4;
-
-  auto tv0 = makeConcreteTensor({numel});
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = broadcast(tv1, {true});
-
-  auto tv3 = makeConcreteTensor({numel, numel});
-  fusion.addInput(tv3);
-
-  auto tv4 = sum(tv3, {1});
-
-  auto tv5 = add(tv2, tv4);
-  fusion.addOutput(tv5);
-
-  tv1->axis(0)->parallelize(ParallelType::TIDx);
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  tv4->axis(1)->parallelize(ParallelType::TIDx);
-  tv5->axis(0)->parallelize(ParallelType::TIDx);
-
-  // Lowering should fail since tv5 is predicated and paralellized with TIDx.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fusion.printKernel());
-}
-
-TEST_F(NVFuserTest, FusionChannelsLastParser_CUDA) {
-  // This test may not pass if using a custom block sync as there may
-  // be additional calls. Skip the test as it's not specifically
-  // relevant with block synchronizatin.
-  if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) {
-    return;
-  }
-  auto g = std::make_shared<Graph>();
-  const auto graph0_string = R"IR(
-  graph(%0 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]),
-        %1 : Half(8, 4, 10, 16, strides=[640, 160, 16, 1])):
-    %o.1 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::mul(%0, %1) # sum_dyn.py:5:6
-    %3 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::relu(%o.1) # sum_dyn.py:6:9
-    return (%3))IR";
-  parseIR(graph0_string, g.get());
-
-  // strides are not yet supported in the irparser.
-  {
-    auto val = g->block()->inputs()[0];
-    val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
-        {8, 4, 10, 16}, {640, 1, 64, 4}));
-  }
-
-  {
-    auto val = g->block()->inputs()[1];
-    val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
-        {8, 4, 10, 16}, {640, 160, 16, 1}));
-  }
-
-  for (auto node : g->block()->nodes()) {
-    for (auto val : node->outputs()) {
-      if (val->isCompleteTensor())
-        val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
-            {8, 4, 10, 16}, {640, 1, 64, 4}));
-    }
-  }
-
-  auto fusion = parseJitIR(g);
-  FusionGuard fg(fusion.get());
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::Tensor input0 =
-      at::randn({2, 2, 2, 16}, options).clone(c10::MemoryFormat::ChannelsLast);
-  at::Tensor input1 = at::randn({2, 2, 2, 16}, options);
-  auto lparams = schedulePointwise(fusion.get(), {input0, input1});
-
-  // CONSIDER:
-  // 1. this can be moved to a dedicated "golden" file
-  // 2. use a fuzzy compare (ignore non-significant whitespaces for example)
-  const std::string expected_kernel = R"(
-__global__ void CUDAGeneratedKernel(Tensor<__half, 4> T0, Tensor<__half, 4> T2, Tensor<__half, 4> T7) {
-  int64_t i171;
-  i171 = (((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x);
-  if ((i171 < (T0.size[0] * (T0.size[1] * (T0.size[2] * T0.size[3]))))) {
-    __half T9[1];
-    T9[0] = 0;
-    T9[0]
-       = T2[((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) / (T0.size[1] * (T0.size[2] * T0.size[3]))) * ((T0.size[2] * T0.size[1]) * T0.size[3])) + ((((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) % T0.size[3]) * (T0.size[2] * T0.size[1])) + (((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) / (T0.size[2] * T0.size[3])) * T0.size[2]) + (((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) / T0.size[3])];
-    __half T8[1];
-    T8[0] = 0;
-    T8[0]
-       = T0[i171];
-    float T3[1];
-    T3[0]
-       = __half2float(T9[0]);
-    float T4[1];
-    T4[0]
-       = T3[0];
-    float T1[1];
-    T1[0]
-       = __half2float(T8[0]);
-    float T5[1];
-    T5[0]
-      = T1[0]
-      * T4[0];
-    float T6[1];
-    T6[0]
-       = relu(T5[0]);
-    __half T10[1];
-    T10[0]
-       = __float2half(T6[0]);
-    T7[i171]
-       = T10[0];
-  }
-}
-)";
-
-  const std::string actual_kernel =
-      "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel());
-
-  if (expected_kernel.size() != actual_kernel.size() ||
-      expected_kernel.compare(actual_kernel) != 0) {
-    std::cerr
-        << " Codegen mismatch, codegen possibly changed, or is incorrect. "
-        << " \n ========= EXPECTED ========= \n"
-        << expected_kernel << "\n========= ACTUAL ========== \n"
-        << actual_kernel << "\n=================" << std::endl;
-    auto it = std::mismatch(
-        expected_kernel.begin(),
-        expected_kernel.end(),
-        actual_kernel.begin(),
-        actual_kernel.end());
-    std::string actual_mismatched_snippet(it.second, actual_kernel.end());
-    actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10);
-    std::string expected_mismatched_snippet(it.first, expected_kernel.end());
-    expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10);
-    std::cerr << "First mismatch found at: " << actual_mismatched_snippet
-              << ", expected: " << expected_mismatched_snippet << std::endl;
-    TORCH_CHECK(false);
-  }
-
-  // TODO: runFusion hits assertion. I'm probably doing something wrong here.
-  // FusionExecutor fe;
-  // fe.compileFusion(fusion.get());
-  // auto outputs = fe.runFusion({input0, input1}, lparams);
-  // at::Tensor output_ref = (input0 * input1).relu();
-  // TORCH_CHECK(output_ref.equal(outputs[0]));
-}
-
-TEST_F(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({10, 1024});
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv3);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->computeAt(tv3, -1);
-  tv3->axis(0)->parallelize(ParallelType::Unswitch);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({10, 1024}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref = sum(t0, {1}) + 2;
-
-  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionNonContigOutputs_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv1);
-
-  tv1->setContiguity(false);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor at_input = at::randn({10}, options);
-  at::Tensor at_output = at::empty_strided({10}, {2}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {at_input});
-  auto returned_outputs = fe.runFusion({at_input}, {at_output});
-
-  // Returned outputs should only contain one tensor that is the same
-  // as the output tensor given to runFusion
-  TORCH_CHECK(returned_outputs.size() == 1);
-  TORCH_CHECK(returned_outputs[0].is_same(at_output));
-  TORCH_CHECK(!returned_outputs[0].is_contiguous());
-
-  auto at_ref = at_input + 1;
-
-  testValidate(&fusion, {at_output}, {at_input}, {at_ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTestWarpSoftMax_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Setup softmax fusion
-  auto input = makeContigTensor(2);
-  fusion.addInput(input);
-  auto output = softmax(input, 1);
-  fusion.addOutput(output);
-
-  // Setup runtime input
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_input = at::randn({8, 16 * 197}, options);
-  std::vector<c10::IValue> aten_inputs({aten_input});
-
-  // Schedule through magic scheduler
-  SchedulerRuntimeInfo runtime_info(&fusion, aten_inputs, true);
-  TORCH_CHECK(SchedulerEntry::canSchedule(
-      ScheduleHeuristic::Persistent, &fusion, runtime_info));
-  auto scheduler = SchedulerEntry::makeEntry(
-      ScheduleHeuristic::Persistent, &fusion, runtime_info);
-  scheduler->schedule(&fusion);
-
-  // Modify the schedule to use warp reduction
-  auto used_vals = fusion.usedMathVals();
-  for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
-    for (IterDomain* id : tv->domain()->domain()) {
-      if (id->getParallelType() == ParallelType::TIDx) {
-        id->padToMultipleOfWarp();
-      }
-    }
-  }
-
-  // Test result
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-  auto ref_output = at::_softmax(aten_input, 1, false);
-  testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue1133_CUDA) {
-  if (!deviceMajorMinorCheck(7)) {
-    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
-    return;
-  }
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {1});
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv3);
-
-  tv0->computeAt(tv3, 1);
-
-  const int split_factor = 32;
-
-  tv2->split(-1, split_factor);
-  tv1->computeAt(tv2, -2);
-
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(0)->parallelize(ParallelType::Unswitch);
-
-  tv1->setMemoryType(MemoryType::Shared);
-  tv2->setMemoryType(MemoryType::Shared);
-
-  // Both tv1 and tv2 should be allocated at the top-level scope
-  GpuLower gpulw(&fusion);
-  bool tv1_validated = false;
-  bool tv2_validated = false;
-  for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) {
-    if (auto alloc = dynamic_cast<kir::Allocate*>(kir_node)) {
-      auto size = alloc->size();
-      if (!(alloc->buffer()->name() == 1 || alloc->buffer()->name() == 2)) {
-        // There should be no allocation other than those for tv1 and tv2
-        TORCH_CHECK(false, "Invalid allocation detected");
-      }
-      TORCH_CHECK(size->isA<Int>(), "Invalid allocation size");
-      TORCH_CHECK(size->as<Int>()->isConst(), "Allocation not constant");
-      auto size_int = size->as<Int>()->value().value();
-      if (alloc->buffer()->name() == 1) {
-        TORCH_CHECK(
-            size_int == split_factor,
-            "Invalid allocation size: ",
-            size->as<Int>()->value().value());
-        tv1_validated = true;
-      } else {
-        TORCH_CHECK(
-            size_int == 1,
-            "Invalid allocation size: ",
-            size->as<Int>()->value().value());
-        tv2_validated = true;
-      }
-    }
-  }
-
-  TORCH_CHECK(tv1_validated, "Failed to validate tv1 allocation");
-  TORCH_CHECK(tv2_validated, "Failed to validate tv2 allocation");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({99, 101}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref = (t0 + 1).sum({1}) + 1;
-
-  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionRfactorContigIDs_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  fusion.addOutput(tv1);
-
-  tv1->split(1, 32);
-
-  auto tv2 = tv1->rFactor({1});
-
-  // This merged domain is not contiguous.
-  tv2->merge(0, 2);
-
-  tv2->setMemoryType(MemoryType::Shared);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({99, 101}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto ref = t0.sum({1});
-
-  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionPersistentBufferCalculation1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  auto tv2 = sum(tv1, {1});
-  auto tv3 = broadcast(tv2, {false, true});
-  auto tv4 = set(tv1);
-  auto tv5 = add(tv3, tv4);
-  fusion.addOutput(tv5);
-
-  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
-
-  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
-    return std::find(vec.begin(), vec.end(), tv) != vec.end();
-  };
-
-  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
-                            std::vector<TensorView*>& buffer_vec,
-                            TensorView* tv) {
-    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
-    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
-  };
-
-  auto& buffers = persistent_buffer_info.persistent_buffers;
-  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
-  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
-  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;
-
-  TORCH_INTERNAL_ASSERT(buffers.size() == 1);
-  TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1);
-  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
-  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);
-
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1));
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1));
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));
-
-  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
-  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end())
-
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor aten_t0 = at::randn({99, 101}, options);
-
-  // Schedule through magic scheduler
-  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
-  auto persistent_buffer_size =
-      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
-
-  TORCH_INTERNAL_ASSERT(
-      persistent_buffer_size.persistent_buffer_size ==
-      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
-  TORCH_INTERNAL_ASSERT(
-      persistent_buffer_size.projected_persistent_buffer_size ==
-      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
-}
-
-TEST_F(NVFuserTest, FusionPersistentBufferCalculation2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2, DataType::Half);
-  fusion.addInput(tv0);
-
-  auto tv1 = castOp(DataType::Float, tv0);
-  auto tv2 = sum(tv1, {1});
-  auto tv3 = broadcast(tv2, {false, true});
-  auto tv4 = set(tv1);
-  auto tv5 = add(tv3, tv4);
-  auto tv6 = castOp(DataType::Half, tv5);
-  fusion.addOutput(tv6);
-
-  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
-
-  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
-    return std::find(vec.begin(), vec.end(), tv) != vec.end();
-  };
-
-  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
-                            std::vector<TensorView*>& buffer_vec,
-                            TensorView* tv) {
-    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
-    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
-  };
-
-  auto& buffers = persistent_buffer_info.persistent_buffers;
-  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
-  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
-  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;
-
-  TORCH_INTERNAL_ASSERT(buffers.size() == 1);
-  TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1);
-  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
-  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);
-
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1));
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1));
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));
-
-  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
-  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end())
-
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5));
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::Tensor aten_t0 = at::randn({99, 101}, options);
-
-  // Schedule through magic scheduler
-  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
-  auto persistent_buffer_size =
-      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
-
-  TORCH_INTERNAL_ASSERT(
-      persistent_buffer_size.persistent_buffer_size ==
-      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
-  TORCH_INTERNAL_ASSERT(
-      persistent_buffer_size.projected_persistent_buffer_size ==
-      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Half)));
-}
-
-TEST_F(NVFuserTest, FusionPersistentBufferCalculation3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2, DataType::Half);
-  fusion.addInput(tv0);
-
-  auto tv1 = castOp(DataType::Float, tv0);
-  auto tv2 = set(tv1);
-  auto tv3 = sum(tv2, {1});
-  auto tv4 = broadcast(tv3, {false, true});
-
-  auto tv5 = makeSymbolicTensor(2, DataType::Half);
-  fusion.addInput(tv5);
-
-  auto tv6 = castOp(DataType::Float, tv5);
-
-  auto tv7 = add(tv6, tv4);
-  auto tv8 = set(tv1);
-  auto tv9 = add(tv7, tv8);
-  auto tv10 = sum(tv9, {1});
-  auto tv11 = broadcast(tv10, {false, true});
-  auto tv12 = set(tv7);
-  auto tv13 = add(tv12, tv11);
-
-  fusion.addOutput(tv13);
-
-  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
-
-  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
-    return std::find(vec.begin(), vec.end(), tv) != vec.end();
-  };
-
-  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
-                            std::vector<TensorView*>& buffer_vec,
-                            TensorView* tv) {
-    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
-    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
-  };
-
-  auto& buffers = persistent_buffer_info.persistent_buffers;
-  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
-  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
-  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;
-
-  TORCH_INTERNAL_ASSERT(buffers.size() == 2);
-  TORCH_INTERNAL_ASSERT(
-      resolution.size() == 2 && resolution[0].size() == 1 &&
-      resolution[1].size() == 1);
-  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
-  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);
-
-  TORCH_INTERNAL_ASSERT(
-      isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv7));
-  TORCH_INTERNAL_ASSERT(
-      isTvWithinVec(projectable, tv1) && !isTvWithinVec(projectable, tv7));
-
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));
-
-  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
-  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end())
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv9));
-
-  auto tv7_resolution_it = tvEntryInVecVec(resolution, buffers, tv7);
-  TORCH_INTERNAL_ASSERT(tv7_resolution_it != resolution.end())
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv7_resolution_it, tv13));
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::Tensor aten_t0 = at::randn({99, 101}, options);
-  at::Tensor aten_t5 = at::randn({99, 101}, options);
-
-  // Schedule through magic scheduler
-  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0, aten_t5}, true);
-  auto persistent_buffer_size =
-      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
-
-  TORCH_INTERNAL_ASSERT(
-      persistent_buffer_size.persistent_buffer_size ==
-      static_cast<int64_t>(
-          aten_t0.size(1) * dataTypeSize(DataType::Float) * 2));
-  TORCH_INTERNAL_ASSERT(
-      persistent_buffer_size.projected_persistent_buffer_size ==
-      static_cast<int64_t>(
-          aten_t0.size(1) *
-          (dataTypeSize(DataType::Half) + dataTypeSize(DataType::Float))));
-}
-
-TEST_F(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2, DataType::Half);
-  fusion.addInput(tv0);
-
-  auto tv1 = castOp(DataType::Float, tv0);
-  auto tv2 = set(tv1);
-  auto tv3 = sum(tv2, {1});
-  auto tv4 = broadcast(tv3, {false, true});
-  auto tv5 = set(tv1);
-  auto tv6 = add(tv4, tv5);
-  auto tv7 = set(tv2);
-  auto tv8 = add(tv7, tv6);
-  auto tv9 = castOp(DataType::Half, tv8);
-
-  fusion.addOutput(tv9);
-
-  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
-
-  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
-    return std::find(vec.begin(), vec.end(), tv) != vec.end();
-  };
-
-  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
-                            std::vector<TensorView*>& buffer_vec,
-                            TensorView* tv) {
-    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
-    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
-  };
-
-  auto& buffers = persistent_buffer_info.persistent_buffers;
-  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
-  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
-  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;
-
-  TORCH_INTERNAL_ASSERT(buffers.size() == 2);
-  TORCH_INTERNAL_ASSERT(
-      resolution.size() == 2 && resolution[0].size() == 1 &&
-      resolution[1].size() == 1);
-
-  TORCH_INTERNAL_ASSERT(projectable.size() == 2);
-  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);
-
-  TORCH_INTERNAL_ASSERT(
-      isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv2));
-  TORCH_INTERNAL_ASSERT(
-      isTvWithinVec(projectable, tv1) && isTvWithinVec(projectable, tv2));
-
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));
-
-  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
-  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end())
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv6));
-
-  auto tv2_resolution_it = tvEntryInVecVec(resolution, buffers, tv2);
-  TORCH_INTERNAL_ASSERT(tv2_resolution_it != resolution.end())
-  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv2_resolution_it, tv8));
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::Tensor aten_t0 = at::randn({99, 101}, options);
-
-  // Schedule through magic scheduler
-  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
-  auto persistent_buffer_size =
-      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
-
-  TORCH_INTERNAL_ASSERT(
-      persistent_buffer_size.persistent_buffer_size ==
-      static_cast<int64_t>(
-          aten_t0.size(1) * dataTypeSize(DataType::Float) * 2));
-
-  TORCH_INTERNAL_ASSERT(
-      persistent_buffer_size.projected_persistent_buffer_size ==
-      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Half)));
-}
-
-TEST_F(NVFuserTest, FusionPersistentBufferProjection_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2, DataType::Half);
-  fusion.addInput(tv0);
-
-  auto tv1 = castOp(DataType::Float, tv0);
-  auto tv2 = set(tv1);
-  auto tv3 = sum(tv2, {1});
-  auto tv4 = broadcast(tv3, {false, true});
-  auto tv5 = set(tv1);
-  auto tv6 = add(tv4, tv5);
-  auto tv7 = set(tv2);
-  auto tv8 = add(tv7, tv6);
-  auto tv9 = castOp(DataType::Half, tv8);
-
-  fusion.addOutput(tv9);
-
-  reduction_scheduler_utils::projectPersistentBuffers(&fusion);
-
-  auto tv5_producers = ir_utils::producerTvsOf(tv5);
-  auto tv7_producers = ir_utils::producerTvsOf(tv7);
-
-  // Projection should have broken these dependencies
-
-  TORCH_INTERNAL_ASSERT(
-      std::find(tv5_producers.begin(), tv5_producers.end(), tv1) ==
-      tv5_producers.end());
-  TORCH_INTERNAL_ASSERT(
-      std::find(tv7_producers.begin(), tv7_producers.end(), tv2) ==
-      tv7_producers.end());
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::Tensor aten_t0 = at::randn({99, 101}, options);
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto cg_outputs = fec.runFusionWithInputs({aten_t0});
-
-  auto aten_t1 = aten_t0.to(c10::kDouble);
-  auto aten_t3 = aten_t1.sum({1});
-  auto aten_t4 = aten_t3.unsqueeze(1);
-  auto aten_t7 = aten_t4.add(aten_t1).add(aten_t1);
-
-  testValidate(&fusion, cg_outputs, {aten_t0}, {aten_t7}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue1223_CUDA) {
-  if (!deviceMajorMinorCheck(7)) {
-    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
-    return;
-  }
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {0, 1});
-  fusion.addOutput(tv2);
-
-  auto tv3 = add(tv0, IrBuilder::create<Double>(0));
-  fusion.addOutput(tv3);
-
-  tv2->split(0, 4);
-  tv2->split(1, 1, false);
-  tv2->split(-1, 4);
-
-  tv2->axis(1)->parallelize(ParallelType::Unswitch);
-  tv2->axis(-3)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDy);
-
-  tv1->computeAt(tv2, -1);
-
-  // Make TIDx and TIDy non-exact
-  tv3->split(0, 32);
-  tv3->split(-1, 32);
-  tv3->axis(1)->parallelize(ParallelType::TIDx);
-  tv3->axis(3)->parallelize(ParallelType::TIDy);
-
-  // The second axis of both tv1 and tv2 are fully unswitched, so they
-  // don't need to predicate the parallel type usage of TIDy, whereas
-  // the first axis is only partially unswitched, i.e., part of its
-  // split output domains is outside the unswitched axis, so the first
-  // axis, which uses TIDx, needs to predicate the parallel
-  // dimension. Previously, as reported in issue #1223, unswitched
-  // expressions didn't predicate parallel dimensions. It should be
-  // fixed by PR #1222.
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor at_t0 = at::ones({11, 10}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {at_t0});
-  auto cg_outputs = fe.runFusion({at_t0});
-
-  auto at_t1 = (at_t0 + 1).sum();
-
-  testValidate(
-      &fusion, cg_outputs, {at_t0}, {at_t1, at_t0}, __LINE__, __FILE__);
-}
-
-// See #1247 and #1250
-TEST_F(NVFuserTest, FusionRfactorPredication1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = min(tv1, {0});
-
-  fusion.addOutput(tv2);
-
-  // Make TIDx non-exact
-  auto tv3 = makeContigTensor(1);
-  fusion.addInput(tv3);
-
-  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv4);
-
-  tv2->split(0, 4);
-  auto tv5 = tv2->rFactor({1});
-
-  tv0->computeAt(tv2, 1);
-
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-
-  tv4->axis(0)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor at_t0 = at::randn({9}, options);
-  at_t0 = at::abs(at_t0);
-  at::Tensor at_t3 = at::randn({128}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {at_t0, at_t3});
-  auto cg_outputs = fe.runFusion({at_t0, at_t3});
-
-  auto at_t2 = (at_t0 + 1).min();
-  auto at_t4 = at_t3 + 1;
-
-  testValidate(
-      &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionRfactorPredication2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = min(tv0, {0});
-  fusion.addOutput(tv1);
-
-  // Make TIDx non-exact
-  auto tv2 = makeContigTensor(1);
-  fusion.addInput(tv2);
-
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv3);
-
-  tv1->split(0, 4);
-  auto tv4 = tv1->rFactor({0});
-
-  tv1->split(0, 3);
-
-  // tv0->computeAt(tv1, 3);
-  tv4->reorder({{0, 1}});
-  tv4->split(0, 3);
-  tv4->setMemoryType(MemoryType::Shared);
-
-  // tv0: [I]
-  // tv4: [4/3, 3, I/4]
-  // tv1: [4/3, 3]
-
-  tv1->axis(0)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(tv1, {tv4});
-
-  tv3->axis(0)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor at_t0 = at::randn({9}, options);
-  at_t0 = at::abs(at_t0);
-  at::Tensor at_t3 = at::randn({128}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {at_t0, at_t3});
-  auto cg_outputs = fe.runFusion({at_t0, at_t3});
-
-  auto at_t2 = std::get<0>(at_t0.min(0));
-  auto at_t4 = at_t3 + 1;
-
-  testValidate(
-      &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionRfactorIndirectRoot_CUDA) {
-  // https://github.com/csarofeen/pytorch/issues/1692
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(3);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {1, 2});
-  fusion.addOutput(tv1);
-
-  tv1->split(2, 4);
-  tv1->split(1, 3);
-  tv1->merge(2, 3);
-  auto rf = tv1->rFactor({-1});
-
-  tv1->split(0, 256);
-  tv1->axis(0)->parallelize(ParallelType::BIDx);
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-  rf->computeAt(tv1, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-
-  auto at_in = at::randn({6, 6, 6}, options);
-  auto at_out = at_in.sum({1, 2});
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {at_in});
-  auto cg_outputs = fe.runFusion({at_in});
-
-  testValidate(&fusion, cg_outputs, {at_in}, {at_out}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  fusion.addOutput(tv1);
-
-  // [I]
-  tv1->split(0, 5);
-  // [ceilDiv(I, 5), 5]
-
-  // This second split is non-divisible. The split domain must be predicated.
-  tv1->split(1, 3);
-  // [ceilDiv(I, 5), 2, 3]
-
-  auto tv2 = sum(tv0, {0});
-  fusion.addOutput(tv2);
-
-  // tv2 shouldn't need to have another predicate
-  tv2->split(0, 4);
-  tv2->split(1, 2);
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
-      "There must be no split to validate");
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1,
-      "Only tv1 should have a non-divisible predicate.");
-  for (auto tv : {loweredTv(tv1, gpulw)}) {
-    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
-    TORCH_CHECK(
-        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
-        "No info found for ",
-        tv);
-    const auto& splits_to_predicate = it->second;
-    TORCH_CHECK(
-        splits_to_predicate.size() == 1,
-        "There must be one split to predicate");
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({24}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0.sum();
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref, ref}, __LINE__, __FILE__);
-}
-
-// Repro of issue #1074
-TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  tv2->split(0, 2);
-  tv2->split(-1, 4);
-  tv2->reorder({{1, 2}, {2, 1}});
-  tv0->computeAt(tv2, 2);
-
-  tv2->split(-1, 3);
-
-  // To make the sanitizer catch the invalid accesses. Not necessary
-  // to expose the bug.
-  tv1->setMemoryType(MemoryType::Shared);
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
-      "There must be no split to validate");
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1,
-      "Only tv2 should have a non-divisible predicate.");
-  for (auto tv : {loweredTv(tv2, gpulw)}) {
-    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
-    TORCH_CHECK(
-        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
-        "No info found for ",
-        tv);
-    const auto& splits_to_predicate = it->second;
-    TORCH_CHECK(
-        splits_to_predicate.size() == 1,
-        "There must be one split to predicate");
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({13, 17}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 2;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Similar to FusionNonDivisibleSplit1 but with unswitch
-TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {0});
-  fusion.addOutput(tv2);
-
-  tv2->split(0, 5);
-  tv2->split(1, 3);
-
-  tv0->computeAt(tv2, -1);
-
-  tv2->axis(0)->parallelize(ParallelType::Unswitch);
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
-      "There must be no split to validate");
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2,
-      "Both tv1 and tv2 should have a non-divisible predicate.");
-  for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
-    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
-    TORCH_CHECK(
-        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
-        "No info found for ",
-        tv);
-    const auto& splits_to_predicate = it->second;
-    TORCH_CHECK(
-        splits_to_predicate.size() == 1,
-        "There must be one split to predicate");
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({24}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = (t0 + 1).sum();
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Non-divisible split through merge
-TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {0, 1});
-  fusion.addOutput(tv2);
-
-  tv2->split(0, 5);
-  tv2->merge(1, 2);
-  tv2->split(1, 3);
-
-  tv0->computeAt(tv2, -1);
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
-      "There must be no split to validate");
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2,
-      "Both tv1 and tv2 should have a non-divisible predicate.");
-  for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
-    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
-    TORCH_CHECK(
-        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
-        "No info found for ",
-        tv);
-    const auto& splits_to_predicate = it->second;
-    TORCH_CHECK(
-        splits_to_predicate.size() == 1,
-        "There must be one split to predicate");
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({24, 2}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = (t0 + 1).sum();
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Nested splits
-TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = sum(tv1, {0});
-  fusion.addOutput(tv2);
-
-  // [I]
-  tv2->split(0, 8);
-  // [I/8, 8]
-  tv2->split(1, 2);
-  // [I/8, 4, 2]
-  tv2->split(1, 3); // non-divisible split of outer output
-  // [I/8, 2, 3, 2]
-
-  tv0->computeAt(tv2, -1);
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
-      "There must be no split to validate");
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2,
-      "Both tv1 and tv2 should have a non-divisible predicate.");
-  for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
-    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
-    TORCH_CHECK(
-        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
-        "No info found for ",
-        tv);
-    const auto& splits_to_predicate = it->second;
-    TORCH_CHECK(
-        splits_to_predicate.size() == 1,
-        "There must be one split to predicate");
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({24}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = (t0 + 1).sum();
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Vectorized non-divisible split. Must be validated at run time
-TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  fusion.addOutput(tv1);
-
-  tv1->split(0, 8, false);
-  tv1->split(1, 4);
-
-  tv1->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1,
-      "There should be one split to validate");
-  for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) {
-    const auto& splits_to_predicate = kv.second;
-    TORCH_CHECK(
-        splits_to_predicate.empty(),
-        "There must be no split to predicate, but tensor t",
-        kv.first->name(),
-        " has:",
-        splits_to_predicate);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({32}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-
-  auto t0_non_divisible = at::randn({8}, options);
-  // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is
-  // illegal. The run-time validation of vectorization should throw an error.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible}));
-}
-
-// If a split is validated at run time, it's not necessary to predicate.
-TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = sum(tv2, {0});
-  fusion.addOutput(tv3);
-
-  tv3->split(0, 8, false);
-  tv3->split(1, 4);
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-  tv3->axis(1)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});
-
-  tv1->axis(2)->parallelize(ParallelType::Vectorize);
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1,
-      "There should be one split to validate");
-  for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) {
-    const auto& splits_to_predicate = kv.second;
-    TORCH_CHECK(
-        splits_to_predicate.empty(),
-        "There must be no split to predicate, but tensor t",
-        kv.first->name(),
-        " has:",
-        splits_to_predicate);
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-
-  auto t0 = at::randn({1024}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = (t0 + 1).sum();
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue1284Repro_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> input_shape_0 = {10, 20};
-  std::vector<int64_t> input_shape_1 = {15};
-
-  TensorView* in_0 = makeSymbolicTensor(input_shape_0.size());
-  TensorView* in_1 = makeSymbolicTensor(input_shape_1.size());
-  fusion.addInput(in_0);
-  fusion.addInput(in_1);
-
-  TensorView* out_0 = add(in_0, IrBuilder::create<Double>(0.f));
-  TensorView* out_1 = add(in_1, IrBuilder::create<Double>(2.f));
-
-  fusion.addOutput(out_0);
-  fusion.addOutput(out_1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor at_in_0 = at::randn(input_shape_0, options);
-  at::Tensor at_in_1 = at::randn(input_shape_1, options);
-  std::vector<IValue> aten_inputs = {at_in_0, at_in_1};
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto outputs = fec.runFusionWithInputs(aten_inputs);
-
-  auto t1 = at_in_1 + 2;
-
-  auto runtime = fec.getMostRecentKernelRuntime();
-  TORCH_INTERNAL_ASSERT(runtime->isSegmented());
-  TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2);
-
-  testValidate(
-      &fusion, outputs, {at_in_0, at_in_1}, {at_in_0, t1}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue1284Repro2_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> input_shape_0 = {4, 4};
-  std::vector<int64_t> input_shape_1 = {3, 4, 4};
-  std::vector<int64_t> input_shape_2 = {2, 8, 4, 4};
-
-  TensorView* in_0 = makeSymbolicTensor(input_shape_0.size());
-  TensorView* in_1 = makeSymbolicTensor(input_shape_1.size());
-  TensorView* in_2 = makeSymbolicTensor(input_shape_2.size());
-
-  fusion.addInput(in_0);
-  fusion.addInput(in_1);
-  fusion.addInput(in_2);
-
-  TensorView* out_0 = add(in_0, in_1);
-  TensorView* out_1 = add(in_0, in_2);
-
-  fusion.addOutput(out_0);
-  fusion.addOutput(out_1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor at_in_0 = at::randn(input_shape_0, options);
-  at::Tensor at_in_1 = at::randn(input_shape_1, options);
-  at::Tensor at_in_2 = at::randn(input_shape_2, options);
-
-  std::vector<IValue> aten_inputs = {at_in_0, at_in_1, at_in_2};
-
-  FusionExecutorCache fec(std::move(fusion_ptr));
-  auto outputs = fec.runFusionWithInputs(aten_inputs);
-
-  auto t0 = at_in_0 + at_in_1;
-  auto t1 = at_in_0 + at_in_2;
-
-  auto runtime = fec.getMostRecentKernelRuntime();
-  TORCH_INTERNAL_ASSERT(runtime->isSegmented());
-  TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2);
-
-  testValidate(
-      &fusion,
-      outputs,
-      {at_in_0, at_in_1, at_in_2},
-      {t0, t1},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionIssue1305Repro_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  auto t0 = makeContigTensor(1);
-  auto t1 = makeContigTensor(2);
-
-  fusion.addInput(t0);
-  fusion.addInput(t1);
-
-  auto t2 = broadcast(t0, {true, false});
-  auto t3 = add(t1, t2);
-  auto t4 = add(t3, t2);
-  auto t5 = sum(t4, {1});
-  auto t6 = broadcast(t5, {false, true});
-  auto t7 = add(t3, t6);
-
-  fusion.addOutput(t7);
-
-  t3->computeAt(t7, -1, ComputeAtMode::MostInlined);
-
-  TORCH_INTERNAL_ASSERT(t3->getComputeAtPosition() == 1);
-}
-
-TEST_F(NVFuserTest, FusionDoubleBuffering1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
-  auto tv3 = set(tv2);
-  fusion.addOutput(tv3);
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  tv3->split(-1, 128);
-  tv3->split(-1, 32);
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-  tv0->computeAt(tv3, 1);
-
-  tv3->axis(-2)->parallelize(ParallelType::BIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(tv3);
-
-  tv1->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({1000}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 1;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionDoubleBuffering2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
-  auto tv3 = set(tv2);
-  fusion.addOutput(tv3);
-
-  tv3->split(-1, 128);
-  tv3->split(-1, 32);
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-  tv0->computeAt(tv3, -1);
-
-  tv3->axis(-2)->parallelize(ParallelType::BIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(tv3);
-
-  tv1->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({1000}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 1;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionDoubleBuffering3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv2 = set(tv1);
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1.0));
-  fusion.addOutput(tv3);
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  tv3->split(-1, 128);
-  tv3->split(-1, 32);
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-  tv0->computeAt(tv3, 1);
-
-  // tv2 is invalid to double-buffer as its producer, tv1, is
-  // computed inside the double-buffering loop.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(tv2->doubleBuffer());
-
-  // Moving tv2 inner makes tv1 large enough to double-buffer tv2
-  tv2->computeAt(tv3, 2);
-
-  tv2->doubleBuffer();
-
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({1000}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 2;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Double buffering smem to local and unswitch
-TEST_F(NVFuserTest, FusionDoubleBuffering4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv2 = set(tv1);
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1.0));
-  fusion.addOutput(tv3);
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  tv3->split(-1, 128);
-  tv3->split(-1, 32);
-  tv3->split(-1, 8);
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-  tv0->computeAt(tv3, 2);
-  tv2->computeAt(tv3, -1);
-
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(1)->parallelize(ParallelType::Unswitch);
-  scheduler_utils::parallelizeAllLike(tv3);
-
-  tv2->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({1000}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 2;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Double buffering gmem to shared and unswitch
-TEST_F(NVFuserTest, FusionDoubleBuffering5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
-  fusion.addOutput(tv2);
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  tv2->split(-1, 128);
-  tv2->split(-1, 32);
-  tv2->split(-1, 8);
-  TransformPropagatorWithCheck propagator(tv2);
-  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
-
-  tv0->computeAt(tv2, 2);
-  tv1->computeAt(tv2, -1);
-
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(1)->parallelize(ParallelType::Unswitch);
-  scheduler_utils::parallelizeAllLike(tv2);
-
-  tv1->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({1000}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 1;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Double buffering smem to local and unroll
-TEST_F(NVFuserTest, FusionDoubleBuffering6_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv2 = set(tv1);
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1.0));
-  fusion.addOutput(tv3);
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  tv3->split(-1, 128);
-  tv3->split(-1, 16);
-  tv3->split(-2, 4);
-  tv3->split(-2, 2);
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-  tv0->computeAt(tv3, 1);
-  tv2->computeAt(tv3, -1);
-
-  tv3->axis(2)->parallelize(ParallelType::Unroll);
-  tv3->axis(4)->parallelize(ParallelType::TIDx);
-
-  tv2->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({199}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 2;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Double buffering and vectorize
-TEST_F(NVFuserTest, FusionDoubleBuffering7_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
-  fusion.addOutput(tv2);
-
-  tv2->split(-1, 128);
-  tv2->split(-1, 4);
-  TransformPropagatorWithCheck propagator(tv2);
-  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
-
-  tv1->computeAt(tv2, 2);
-
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-
-  tv1->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  tv1->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({200}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 1;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Multiple tensors to double-buffer
-TEST_F(NVFuserTest, FusionDoubleBuffering8_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeContigTensor(1);
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv0);
-  auto tv3 = set(tv1);
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  tv4->split(0, 32);
-  tv4->split(0, 4);
-  TransformPropagatorWithCheck propagator(tv4);
-  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
-
-  tv0->computeAt(tv4, 1);
-  tv1->computeAt(tv4, 1);
-
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(tv4);
-
-  tv2->doubleBuffer();
-  tv3->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({100}, options);
-  auto t1 = at::randn({100}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// Nested double buffering from gmem to smem and smem to register
-TEST_F(NVFuserTest, FusionDoubleBuffering9_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto out = tv1;
-  fusion.addOutput(out);
-
-  auto tv2 = tv0->cacheAfter();
-  auto tv3 = tv2->cacheAfter();
-
-  out->split(0, 32);
-  out->split(0, 4);
-  TransformPropagatorWithCheck propagator(out);
-  MaxRootDomainInfoSpanningTree(out).traverse(&propagator);
-
-  tv2->setMemoryType(MemoryType::Shared);
-
-  tv2->computeAt(out, 1);
-  tv3->computeAt(out, -1);
-
-  out->axis(-1)->parallelize(ParallelType::TIDx);
-  scheduler_utils::parallelizeAllLike(out);
-
-  tv2->doubleBuffer();
-  tv3->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({1001}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0 + 1;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// FusionSmemBlockGemmCache + double buffering at both smem and local
-TEST_F(NVFuserTest, FusionSmemBlockGemmCacheDoubleBuffer_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Algorithm
-  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
-  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
-  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
-  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
-  TensorView* tv4 = mul(tv2, tv3); // M, K, N
-  TensorView* tv5 = sum(tv4, {1}); // M, R, N
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addOutput(tv5);
-
-  TensorView* tv6 = tv5->cacheBefore();
-
-  // For smem double buffering
-  auto tv0_cache_local = tv0->cacheAfter();
-  auto tv1_cache_local = tv1->cacheAfter();
-
-  // For register double buffering
-  auto tv0_cache_smem = tv0->cacheAfter();
-  auto tv1_cache_smem = tv1->cacheAfter();
-
-  const int BSX = 32;
-  const int TSX = 8;
-
-  // [M, K, N]
-  tv6->split(-1, BSX);
-  tv6->split(-1, TSX);
-  tv6->split(1, BSX);
-  tv6->split(0, BSX);
-  tv6->split(1, TSX);
-  // [M/BSX, BSX/TSX, TSX, K/BSX, BSX, N/BSX, BSX/TSX, TSX]
-  tv6->reorder(
-      {{4, 7}, {7, 6}, {6, 5}, {2, 4}, {1, 3}, {3, 2}, {5, 1}, {0, 0}});
-  // [M/BSX, N/BSX, K/BSX, BSX/TSX, BSX/TSX, TSX, TSX, BSX]
-
-  auto tv6_rf = tv6->rFactor({-1});
-
-  TransformPropagatorWithCheck propagator(tv6_rf);
-  MaxRootDomainInfoSpanningTree(tv6_rf).traverse(&propagator);
-
-  tv0->computeAt(tv6, 3);
-  tv1->computeAt(tv6, 3);
-
-  tv6_rf->computeAt(tv6, -1);
-  tv0_cache_local->computeAt(tv6_rf, -1);
-  tv1_cache_local->computeAt(tv6_rf, -1);
-
-  tv0_cache_smem->setMemoryType(MemoryType::Shared);
-  tv1_cache_smem->setMemoryType(MemoryType::Shared);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDx);
-  tv5->axis(1)->parallelize(ParallelType::BIDy);
-  tv5->axis(-3)->parallelize(ParallelType::TIDy);
-  tv5->axis(-1)->parallelize(ParallelType::TIDx);
-
-  scheduler_utils::parallelizeAllLike(tv5);
-
-  tv0_cache_local->doubleBuffer();
-  tv1_cache_local->doubleBuffer();
-
-  tv0_cache_smem->doubleBuffer();
-  tv1_cache_smem->doubleBuffer();
-
-  constexpr int M = 154, K = 45, N = 1524;
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({M, K}, options);
-  at::Tensor t1 = at::randn({K, N}, options);
-  at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-  // The smem cache write in this test case is redundant predicated,
-  //   and also double buffered. Currently we are relying on WAR sync
-  //   insertion to ensure ordering of double buffered tensor access.
-  // The check below makes sure that the sync is inserted so that the
-  //   test isn't running on a race condition.
-  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count > 0);
-}
-
-TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) {
-  std::vector<MemoryType> mem_types = {MemoryType::Shared, MemoryType::Local};
-
-  for (auto mem_type : mem_types) {
-    Fusion fusion;
-    FusionGuard fg(&fusion);
-
-    auto tv0 = makeContigTensor(1);
-    fusion.addInput(tv0);
-
-    auto tv1 = set(tv0);
-    auto tv2 = set(tv1);
-    auto tv3 = set(tv2);
-    fusion.addOutput(tv3);
-
-    tv1->setMemoryType(mem_type);
-
-    tv3->split(-1, 4);
-    TransformPropagatorWithCheck propagator(tv3);
-    MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-    tv1->computeAt(tv3, -2);
-
-    tv2->axis(-1)->parallelize(ParallelType::Vectorize);
-
-    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-    at::manual_seed(0);
-    auto t0 = at::randn({15}, options);
-    FusionExecutor fe;
-    fe.compileFusion(&fusion);
-
-    // This should throw an exception as the extent of t0 is not
-    // divisible by the vector width
-    // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-    ASSERT_ANY_THROW(fe.runFusion({t0}));
-
-    auto t1 = at::randn({16}, options);
-    auto cg_outputs = fe.runFusion({t1});
-
-    auto ref = t1;
-
-    testValidate(&fusion, cg_outputs, {t1}, {ref}, __LINE__, __FILE__);
-  }
-}
-
-TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({10, 1});
-  fusion.addInput(tv0);
-  auto tv1 = makeConcreteTensor({10, 20});
-  fusion.addInput(tv1);
-  auto tv2 = makeConcreteTensor({10, 10});
-  fusion.addInput(tv2);
-
-  // Not concretized
-  auto tv3 = sum(tv2, {1});
-  auto tv4 = broadcast(tv3, {false, true});
-  auto tv5 = add(tv0, tv4);
-  fusion.addOutput(tv5);
-
-  // Concretized
-  auto tv6 = sum(tv2, {1});
-  auto tv7 = broadcast(tv6, {false, true});
-  auto tv8 = add(tv1, tv7);
-  fusion.addOutput(tv8);
-
-  for (auto tv : {tv3, tv4, tv5, tv6, tv7, tv8}) {
-    tv->axis(1)->parallelize(ParallelType::TIDx);
-  }
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(!gpulw.concretizedBroadcastDomains().isConcretized(
-      loweredTv(tv4, gpulw)->axis(1)));
-  TORCH_CHECK(gpulw.concretizedBroadcastDomains().isConcretized(
-      loweredTv(tv7, gpulw)->axis(1)));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({10, 1}, options);
-  auto t1 = at::randn({10, 20}, options);
-  auto t2 = at::randn({10, 10}, options);
-  std::vector<IValue> aten_inputs = {t0, t1, t2};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto t5 = t0 + t2.sum({1}).unsqueeze(-1);
-  auto t8 = t1 + t2.sum({1}).unsqueeze(-1);
-
-  testValidate(&fusion, outputs, aten_inputs, {t5, t8}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {0, 1});
-  auto tv2 = broadcast(tv1, {true});
-  auto tv3 = broadcast(tv2, {false, true});
-  fusion.addOutput(tv3);
-
-  // tv1 is thread-predicated with TIDx and TIDy
-  tv1->axis(0)->parallelize(ParallelType::TIDx);
-  tv1->axis(1)->parallelize(ParallelType::TIDy);
-  // tv2 broadcasts along TIDx
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  // tv3 broadcasts along TIDy
-  tv3->axis(0)->parallelize(ParallelType::TIDx);
-  tv3->axis(1)->parallelize(ParallelType::TIDy);
-
-  // Both tv2 and tv3 broadcast along predicated TID dimensions, but
-  // since the broadcast domains are not concretized, there should be
-  // no actual parallel broadcast
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      !gpulw.kernel()->summary().has_block_broadcasts &&
-          !gpulw.kernel()->summary().has_grid_broadcasts,
-      "There must be no parallel broadcast in this fusion");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({10, 11}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto t3 = t0.sum().unsqueeze(-1).unsqueeze(-1);
-
-  testValidate(&fusion, outputs, aten_inputs, {t3}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> input_shape({10, 4, 8});
-  std::vector<int64_t> output_shape({8, 4, 1});
-
-  auto tv0 = makeConcreteTensor(input_shape);
-  fusion.addInput(tv0);
-
-  auto tv2 = sum(tv0, {0});
-  auto tv3 = set(tv2);
-  auto tv4 =
-      view(tv3, {input_shape.begin() + 1, input_shape.end()}, output_shape);
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv5->axis(-1)->parallelize(ParallelType::TIDx);
-
-  // The view op adds a broadcast domain in tv4, which is
-  // parallelized. Howver, it is never materialized, so there should
-  // be no parallel broadcast.
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      !gpulw.kernel()->summary().has_block_broadcasts &&
-          !gpulw.kernel()->summary().has_grid_broadcasts,
-      "There must be no parallel broadcast in this fusion");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn(input_shape, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto outputs = fe.runFusion(aten_inputs);
-
-  auto t5 = at::native::view(t0.sum(0), output_shape) + 1;
-
-  testValidate(&fusion, outputs, aten_inputs, {t5}, __LINE__, __FILE__);
-}
-
-// Merging non-broadcast and broadcast domains
-// TODO: Fix use case see issue https://github.com/csarofeen/pytorch/issues/1418
-// validateParallelize does not pass. Even if it's skipped,
-// generated code is invalid as blockBroadcast is not used.
-#if 0
-TEST_F(NVFuserTest, FusionBroadcastConcretization4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = broadcast(tv1, {false, true});
-  auto tv3 = add(tv2, tv0);
-  fusion.addOutput(tv3);
-
-  tv1->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv2->merge(0, 1);
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  // TODO: When set to shared memory, this kernel should be correct, but fails
-  // validation and when skipped produces incorrect code
-  tv2->setMemoryType(MemoryType::Shared);
-
-  tv3->merge(0, 1);
-  tv3->axis(0)->parallelize(ParallelType::TIDx);
-
-  fusion.printMath();
-  fusion.printKernel();
-}
-#endif
-
-TEST_F(NVFuserTest, FusionBroadcastConcretization5_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(1);
-  fusion.addInput(tv1);
-  auto tv2 = makeSymbolicTensor(1);
-  fusion.addInput(tv2);
-  auto tv3 = makeSymbolicTensor(1);
-  fusion.addInput(tv3);
-
-  // Assert tv2 and tv3 have the same shape
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  // Concretize a broadcast domain to multiple non-concrete domains
-  // through a multi-output expression. It should be considered to be
-  // non-uniquely concretized.
-  auto tv5 = broadcast(tv0, {false, true});
-  // Reduce only the non-broadcast domain.
-  auto tvs = Welford(tv5, {0});
-  auto tv9 = add(tvs.avg, tv1);
-  auto tv10 = add(tvs.var_sum, tv2);
-  fusion.addOutput(tv9);
-  fusion.addOutput(tv10);
-
-  // Same pattern as the above, but concretize the broadcast domain
-  // with tv2 and tv3, which have the exactly same shape, so the
-  // broadcast should be considered uniquely concretized.
-  auto tv11 = broadcast(tv0, {false, true});
-  // Reduce only the non-broadcast domain.
-  auto tvs2 = Welford(tv11, {0});
-  auto tv15 = add(tvs2.avg, tv2);
-  auto tv16 = add(tvs2.var_sum, tv3);
-  fusion.addOutput(tv15);
-  fusion.addOutput(tv16);
-
-  // Reduce only the broadcast domain. Since it's reduced, it should
-  // not be considered to be concretized.
-  auto tv17 = broadcast(tv0, {false, true});
-  auto tvs3 = Welford(tv17, {1});
-  fusion.addOutput(tvs3.avg);
-
-  ConcretizedBroadcastDomains bcast_concretization_info;
-  bcast_concretization_info.build(&fusion);
-
-  TORCH_CHECK(
-      bcast_concretization_info.maybeNonUniquelyConcretized(tv5->axis(1)),
-      "Failed to detect non-unique concretization of ",
-      tv5->toString());
-
-  TORCH_CHECK(
-      bcast_concretization_info.isUniquelyConcretized(tv11->axis(1)),
-      "Failed to detect unique concretization of ",
-      tv11->toString());
-
-  TORCH_CHECK(
-      !bcast_concretization_info.isConcretized(tv17->axis(1)),
-      "Failed to detect non-concretization of ",
-      tv17->toString());
-}
-
-TEST_F(NVFuserTest, FusionIssue1430_CUDA) {
-  // Derived from an expression sorting issue when using loop map, now expr
-  // sorting uses parallel map.
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  int V = 2, W = 3, X = 4, Y = 5, Z = 6;
-
-  // setup fusion
-  auto tv0 = TensorViewBuilder()
-                 .ndims(5)
-                 .dtype(DataType::Half)
-                 .contiguity(std::vector<bool>(5, true))
-                 .shape({V, W, X, Y, Z})
-                 .build();
-
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  auto tv2 = castOp(DataType::Float, tv1);
-
-  auto tvs = Welford(tv2, {1, 2, 3, 4});
-  auto tv3 = tvs.avg;
-  auto tv4 = tvs.var_sum;
-  auto tv5 = tvs.n;
-
-  // avg
-  auto tv6 = broadcast(tvs.avg, {false, true, true, true, true});
-
-  // var
-  auto tv7 = mul(tv4, IrBuilder::create<Double>(1. / (W * X * Y * Z)));
-  auto tv8 = add(tv7, IrBuilder::create<Double>(1.e-6));
-  auto tv9 = broadcast(tv8, {false, true, true, true, true});
-  auto tv10 = rsqrt(tv9);
-
-  auto tv11 = castOp(DataType::Float, tv1);
-  auto tv12 = sub(tv11, tv6);
-  auto tv13 = mul(tv12, tv10);
-
-  auto tv14 = set(tv13);
-  fusion.addOutput(tv14);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDy);
-  tv3->axis(2)->parallelize(ParallelType::BIDx);
-  tv3->axis(3)->parallelize(ParallelType::TIDx);
-  tv3->axis(4)->parallelize(ParallelType::Vectorize);
-
-  // tv3->reorder({{1, -2}});
-
-  auto rfactor = ir_utils::rfactorHelper(tv3, {1, 4});
-
-  scheduler_utils::parallelizeAllLike(rfactor);
-
-  for (auto tv : ir_utils::allTvs(&fusion)) {
-    if (tv != tv1 || tv != tv3) {
-      for (auto i : c10::irange(tv->nDims())) {
-        if (isParallelTypeVectorize(tv->axis(i)->getParallelType())) {
-          tv->axis(i)->parallelize(ParallelType::Serial);
-        }
-      }
-    }
-  }
-
-  tv0->computeAt(tv14, 1);
-  tv13->computeAt(tv14, -2);
-  tv2->computeAt(tv14, -1, ComputeAtMode::MostInlined);
-  tv11->computeAt(tv14, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({V, W, X, Y, Z}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-  auto cg_outputs = fe.runFusion({t0}, LaunchParams(X, V, -1, Y, -1, -1));
-
-  auto t0_double = t0.to(at::kDouble);
-
-  auto at_mu = at::mean(t0_double, {1, 2, 3, 4})
-                   .unsqueeze(-1)
-                   .unsqueeze(-1)
-                   .unsqueeze(-1)
-                   .unsqueeze(-1);
-  auto at_var = at::var(t0_double, {1, 2, 3, 4}, false)
-                    .unsqueeze(-1)
-                    .unsqueeze(-1)
-                    .unsqueeze(-1)
-                    .unsqueeze(-1);
-
-  auto at_out = t0_double.sub(at_mu).div(at_var.add(1.e-6).sqrt());
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {t0},
-      {at_out},
-      __LINE__,
-      __FILE__,
-      "",
-      LaunchParams(X, V, -1, Y, -1, -1));
-}
-
-// Test code generation of allocated scalars
-TEST_F(NVFuserTest, FusionCodegenAllocatedScalars_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Fusion is just a dummy container in this test, just used for
-  // getting a Kernel container
-  auto tv0 = makeSymbolicTensor(0);
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  fusion.addOutput(tv1);
-
-  GpuLower gpulw(&fusion);
-  auto kernel = gpulw.kernel();
-
-  // Set the kernel as the current fusion
-  FusionGuard kg(kernel);
-
-  // Create alocated scalars
-  auto ks0 = add(kernel->zeroVal(), kernel->oneVal());
-  auto ks0_alloc = IrBuilder::create<kir::Allocate>(
-      ks0, MemoryType::Local, kernel->oneVal());
-
-  auto ks1 = add(ks0, kernel->oneVal());
-  auto ks1_alloc = IrBuilder::create<kir::Allocate>(
-      ks1, MemoryType::Local, kernel->oneVal());
-
-  auto tk0 = kernel->inputs()[0]->as<TensorView>();
-  auto tki0 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks0});
-  auto tki1 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks1});
-  auto tk0_expr = IrBuilder::create<UnaryOp>(UnaryOpType::Set, tki0, tki1);
-
-  // Insert the scalar expression and the allocation of the
-  // output directly to the kernel
-  auto proxy = kir::KernelInternalProxy(kernel);
-
-  const auto indent = "  ";
-  const auto ks0_name = "i" + std::to_string(ks0->name());
-  const auto ks1_name = "i" + std::to_string(ks1->name());
-  const auto tk0_name = "T" + std::to_string(tk0->name());
-
-  auto& exprs = proxy.topLevelExprs();
-  exprs.push_back(tk0_expr);
-
-  // Invalid code gen
-  const auto no_alloc_code = codegen::generateCudaKernel(kernel);
-
-  // Without alloc, Int vals are just inlined, resulting in:
-  // t0[(0 + 1)] = t0[((0 + 1) + 1)]
-  std::stringstream no_alloc_ref;
-  no_alloc_ref << "\n"
-               << indent << tk0_name << "[(0 + 1)]\n"
-               << indent << indent << " = " << tk0_name << "[((0 + 1) + 1)];\n";
-
-  TORCH_CHECK(
-      no_alloc_code.find(no_alloc_ref.str()) != std::string::npos,
-      "Invalid code generation. Expected:",
-      no_alloc_ref.str(),
-      "Actual:\n",
-      no_alloc_code);
-
-  // Insert proper allocations and definitions
-  exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks0_alloc);
-  exprs.insert(
-      std::find(exprs.begin(), exprs.end(), tk0_expr), ks0->definition());
-  exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks1_alloc);
-  exprs.insert(
-      std::find(exprs.begin(), exprs.end(), tk0_expr), ks1->definition());
-
-  const auto valid_code = codegen::generateCudaKernel(kernel);
-
-  std::stringstream valid_ref;
-  valid_ref << "\n"
-            << indent << tk0_name << "[" << ks0_name << "]\n"
-            << indent << indent << " = " << tk0_name << "[" << ks1_name
-            << "];\n";
-
-  TORCH_CHECK(
-      valid_code.find(valid_ref.str()) != std::string::npos,
-      "Invalid code generation. Expected:",
-      valid_ref.str(),
-      "Actual:\n",
-      valid_code);
-}
-
-TEST_F(NVFuserTest, FusionIndexHoist1_CUDA) {
-  if (isOptionDisabled(DisableOption::IndexHoist)) {
-    GTEST_SKIP() << "Index hoisting disabled";
-  }
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  auto tv2 = set(tv1);
-  auto tv3 = set(tv2);
-  auto tv4 = set(tv3);
-  auto tv5 = set(tv4);
-  fusion.addOutput(tv5);
-
-  tv1->split(-1, 4);
-  tv2->split(-1, 4);
-  tv3->merge(0, 1);
-  tv3->split(0, 8);
-  tv5->merge(0, 1);
-  tv5->split(0, 8);
-  tv4->computeAt(tv5, -1);
-
-  tv1->setMemoryType(MemoryType::Global);
-  tv2->setMemoryType(MemoryType::Global);
-  tv3->setMemoryType(MemoryType::Global);
-
-  // Use Int32 as the index type to verify Int32 is used as the type
-  // of hoisted indices
-  GpuLower gpulw(&fusion, DataType::Int32);
-  auto kernel = gpulw.kernel();
-
-  auto is_index_times_ns = [](Val* val, Val* index, std::string name) -> bool {
-    auto def = dynamic_cast<BinaryOp*>(val->definition());
-    if (def == nullptr) {
-      return false;
-    }
-    return def->getBinaryOpType() == BinaryOpType::Mul &&
-        def->rhs()->isA<NamedScalar>() &&
-        def->rhs()->as<NamedScalar>()->name() == name && def->lhs() == index;
-  };
-
-  // Validate indices in the kernel are hoisted as
-  // intended. Validation could be also done by just string comparison
-  // as the parser test, but updating such tests would be tedious.
-  for (auto top_level_loop :
-       ir_utils::filterByType<kir::ForLoop>(kernel->topLevelExprs())) {
-    auto innermost_loop = top_level_loop;
-    while (auto first_expr_loop = dynamic_cast<kir::ForLoop*>(
-               innermost_loop->body().exprs().at(0))) {
-      innermost_loop = first_expr_loop;
-    }
-    const auto& exprs = innermost_loop->body().exprs();
-    TORCH_CHECK(!exprs.empty(), "No expression found");
-    TORCH_CHECK(
-        exprs.at(0)->isA<kir::Allocate>(),
-        "Invalid expression: ",
-        exprs.at(0)->toString());
-    auto hoisted_index = exprs.at(0)->as<kir::Allocate>()->buffer();
-    TORCH_CHECK(
-        hoisted_index->dtype() == DataType::Int32,
-        "Invalid data type of hoisted indices. Should be Int32 but: ",
-        hoisted_index->dtype());
-    kir::Predicate* pred = nullptr;
-    for (auto expr : exprs) {
-      if (expr->isA<kir::IfThenElse>()) {
-        pred = expr->as<kir::IfThenElse>()->predicate();
-        auto arith_expr = expr->as<kir::IfThenElse>()->thenBody().exprs().at(0);
-        auto out_ti = arith_expr->outputs()[0]->as<kir::TensorIndex>();
-        if (out_ti->view()->name() == 1) {
-          // Ref: T1[*, hoisted_index] = T0[*, hoisted_index * T0.stride];
-          auto t1_index = out_ti->index(1);
-          TORCH_CHECK(
-              t1_index == hoisted_index,
-              "Invalid index: ",
-              t1_index->toInlineString());
-          // Pred: hoisted_index < T0.size[1]
-          TORCH_CHECK(
-              pred->value()->definition()->as<BinaryOp>()->lhs() ==
-                  hoisted_index,
-              "Invalid predicate: ",
-              pred->value()->toInlineString());
-          TORCH_CHECK(arith_expr->inputs().size() == 1);
-          auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
-          TORCH_CHECK(in0->view()->name() == 0);
-          // hoisted_index * T0.stride[1]
-          auto t0_index = in0->index(1);
-          TORCH_CHECK(
-              is_index_times_ns(t0_index, hoisted_index, "T0.stride[1]"),
-              "Invalid index: ",
-              t0_index->toInlineString());
-        } else if (out_ti->view()->name() == 2) {
-          // Ref: T3[*, hoisted_index] = T2[*, hoisted_index];
-          auto out_index = out_ti->index(1);
-          TORCH_CHECK(
-              out_index == hoisted_index,
-              "Invalid index: ",
-              out_index->toInlineString());
-          TORCH_CHECK(
-              pred->value()->definition()->as<BinaryOp>()->lhs() ==
-                  hoisted_index,
-              "Invalid predicate: ",
-              pred->value()->toInlineString());
-          TORCH_CHECK(arith_expr->inputs().size() == 1);
-          auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
-          TORCH_CHECK(in0->view()->name() == 1);
-          auto in0_index = in0->index(1);
-          TORCH_CHECK(
-              in0_index == hoisted_index,
-              "Invalid index: ",
-              in0_index->toInlineString());
-        } else if (out_ti->view()->name() == 3) {
-          // Ref: T3[hoisted_index] = T2[hoisted_index];
-          auto out_index = out_ti->index(0);
-          TORCH_CHECK(
-              out_index == hoisted_index,
-              "Invalid index: ",
-              out_index->toInlineString());
-          TORCH_CHECK(
-              pred->value()->definition()->as<BinaryOp>()->lhs() ==
-                  hoisted_index,
-              "Invalid predicate: ",
-              pred->value()->toInlineString());
-          TORCH_CHECK(arith_expr->inputs().size() == 1);
-          auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
-          TORCH_CHECK(in0->view()->name() == 2);
-          auto in0_index = in0->index(0);
-          TORCH_CHECK(
-              in0_index == hoisted_index,
-              "Invalid index: ",
-              in0_index->toInlineString());
-        } else if (out_ti->view()->name() == 4) {
-          // Ref: T4[0] = T3[hoisted_index];
-          TORCH_CHECK(
-              pred->value()->definition()->as<BinaryOp>()->lhs() ==
-                  hoisted_index,
-              "Invalid predicate: ",
-              pred->value()->toInlineString());
-          TORCH_CHECK(arith_expr->inputs().size() == 1);
-          auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
-          TORCH_CHECK(in0->view()->name() == 3);
-          auto in0_index = in0->index(0);
-          TORCH_CHECK(
-              in0_index == hoisted_index,
-              "Invalid index: ",
-              in0_index->toInlineString());
-        } else if (out_ti->view()->name() == 5) {
-          // Ref: T5[hoisted_index] = T4[0]
-          auto out_index = out_ti->index(0);
-          TORCH_CHECK(
-              out_index == hoisted_index,
-              "Invalid index: ",
-              out_index->toInlineString());
-          TORCH_CHECK(
-              pred->value()->definition()->as<BinaryOp>()->lhs() ==
-                  hoisted_index,
-              "Invalid predicate: ",
-              pred->value()->toInlineString());
-        }
-      }
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({15, 17}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Hoist indices for vectorized tensors
-TEST_F(NVFuserTest, FusionIndexHoist2_CUDA) {
-  if (isOptionDisabled(DisableOption::IndexHoist)) {
-    GTEST_SKIP() << "Index hoisting disabled";
-  }
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeContigTensor(1);
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv0);
-  auto tv3 = set(tv1);
-  auto tv4 = add(tv2, tv3);
-  auto tv5 = set(tv4);
-  fusion.addOutput(tv5);
-
-  tv5->split(-1, 4);
-  TransformPropagatorWithCheck propagator(tv5);
-  MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator);
-
-  tv4->split(-1, 3);
-
-  tv0->computeAt(tv5, 1);
-  tv1->computeAt(tv5, 1);
-
-  tv2->axis(-1)->parallelize(ParallelType::Vectorize);
-  tv3->axis(-1)->parallelize(ParallelType::Vectorize);
-  tv5->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({16}, options);
-  auto t1 = at::randn({16}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTestGridComm_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-  int X = 3, Y = 4, Z = 2;
-  auto tv0 = makeConcreteTensor({X, Y, Z});
-  fusion.addInput(tv0);
-  auto tv1 = makeConcreteTensor({X, Y, Z});
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv0);
-  auto tv3 = add(tv2, tv1);
-  auto tv4 = set(tv3);
-  auto tv5 = set(tv4);
-  fusion.addOutput(tv5);
-
-  tv2->setMemoryType(MemoryType::Global);
-  tv3->setMemoryType(MemoryType::Global);
-  tv4->setMemoryType(MemoryType::Global);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDy);
-  tv2->axis(1)->parallelize(ParallelType::BIDx);
-  tv2->axis(2)->parallelize(ParallelType::Vectorize);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(1)->parallelize(ParallelType::BIDy);
-
-  tv4->axis(0)->parallelize(ParallelType::BIDy);
-  tv4->axis(1)->parallelize(ParallelType::BIDx);
-
-  tv5->axis(0)->parallelize(ParallelType::BIDy);
-  tv5->axis(1)->parallelize(ParallelType::BIDx);
-  tv5->axis(2)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({X, Y, Z}, options);
-  auto t1 = at::randn({X, Y, Z}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// See issue https://github.com/csarofeen/pytorch/issues/1497
-TEST_F(NVFuserTest, FusionTestGridComm2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int64_t W = 3, X = 4;
-
-  auto tv0 = makeConcreteTensor({X});
-  auto tv1 = makeConcreteTensor({W, X});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv3 = broadcast(tv2, {true, false});
-  auto tv4 = add(tv3, tv1);
-  fusion.addOutput(tv4);
-
-  tv4->merge(0);
-  tv4->split(0, 2);
-
-  TransformPropagatorWithCheck propagator(tv4);
-  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
-
-  tv3->computeAt(tv4, 1);
-
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv2->setMemoryType(MemoryType::Global);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({X}, options);
-  auto t1 = at::randn({W, X}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1 + 1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// Vectorized reset test for double buffered registers
-TEST_F(NVFuserTest, FusionDoubleBufferVector_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv2 = sum(tv1, {0});
-  auto tv2c = tv2->cacheBefore();
-
-  fusion.addOutput(tv2);
-
-  auto tv1cw = tv1->cacheAfter();
-  auto tv1cr = tv1cw->cacheAfter();
-
-  tv1cw->split(-1, 32);
-  tv1cr->split(-1, 32);
-  tv1cr->split(-1, 4);
-  tv1cr->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  tv1cw->computeAt(tv1cr, 1);
-  tv0->computeAt(tv1cw, -1);
-  tv2c->split(-1, 32);
-  tv2c->split(-1, 4);
-  tv1cr->computeAt(tv2c, 2);
-
-  tv1cw->setMemoryType(MemoryType::Shared);
-  tv1cr->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::manual_seed(0);
-  auto t0 = at::randn({200}, options);
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-  auto ref = (t0 + 1).sum({0});
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Request 48KB of data in shared mem,
-//  should be large enough not to fit in
-//  static allocations, but small enough
-//  to fit in supported devices (sm70+).
-TEST_F(NVFuserTest, FusionLargeSmem_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  fusion.addOutput(tv2);
-
-  tv2->split(0, 12288);
-  tv2->split(1, 128);
-  tv1->computeAt(tv2, 1);
-  tv1->split(1, 128);
-  tv0->computeAt(tv1, -1);
-  tv1->setMemoryType(MemoryType::Shared);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::manual_seed(0);
-  auto t0 = at::randn({12288 * 4}, options);
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-  auto ref = t0 + 1 + 2;
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Request a smem allocation that is equal to the device limit
-TEST_F(NVFuserTest, FusionTooLargeSmem_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto properties = at::cuda::getDeviceProperties(
-      c10::Device(c10::DeviceType::CUDA, 0).index());
-  int device_limit = properties->sharedMemPerBlockOptin;
-
-  auto tv0 = makeContigTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
-  fusion.addOutput(tv2);
-
-  // 4 byte per float
-  tv2->split(0, device_limit / 4);
-  tv2->split(1, 128);
-  tv1->computeAt(tv2, 1);
-  tv1->split(1, 128);
-  tv0->computeAt(tv1, -1);
-  tv1->setMemoryType(MemoryType::Shared);
-  tv1->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::manual_seed(0);
-  auto t0 = at::randn({12288 * 4}, options);
-  FusionExecutor fe;
-
-  // First compile gets a compiled kernel
-  fe.compileFusion(&fusion, {t0});
-
-  // Should be throwing because the kernel
-  //  requested absolute device limit
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0}));
-}
-
-// Try to test alignment when multiple tensors are
-//  in shared mem.
-TEST_F(NVFuserTest, FusionSmemAlignment_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({3, 4, 7, 2, 5});
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {4});
-  auto tv2 = sum(tv1, {3});
-  auto tv3 = sum(tv2, {2});
-  auto tv4 = sum(tv3, {1});
-  fusion.addOutput(tv4);
-
-  auto tv0c = tv0->cacheAfter();
-  auto tv1bc = tv1->cacheBefore();
-  auto tv2bc = tv2->cacheBefore();
-  auto tv3bc = tv3->cacheBefore();
-  auto tv4bc = tv4->cacheBefore();
-
-  tv0c->setMemoryType(MemoryType::Shared);
-  tv1bc->setMemoryType(MemoryType::Shared);
-  tv2bc->setMemoryType(MemoryType::Shared);
-  tv3bc->setMemoryType(MemoryType::Shared);
-  tv4bc->setMemoryType(MemoryType::Shared);
-
-  tv1->axis(-1)->parallelize(ParallelType::Vectorize);
-  tv3->axis(-1)->parallelize(ParallelType::Vectorize);
-  tv0->computeAt(tv4, 0);
-  tv0->computeAt(tv2, 2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::manual_seed(0);
-  auto t0 = at::randn({3, 4, 7, 2, 5}, options);
-  FusionExecutor fe;
-
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-  auto tref = t0.sum({1, 2, 3, 4});
-
-  testValidate(&fusion, cg_outputs, {t0}, {tref}, __LINE__, __FILE__);
-}
-
-// Repro of #1521
-TEST_F(NVFuserTest, FusionImmediateValueAsInput_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto immediate_scalr = IrBuilder::create<Double>(0.1);
-  // Adding an immediate scalar value as an input is not allowed
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fusion.addInput(immediate_scalr));
-
-  // Instead, use a symbolic value
-  auto symbolic_scalar = IrBuilder::create<Double>();
-  fusion.addInput(symbolic_scalar);
-
-  auto tv1 = add(tv0, symbolic_scalar);
-  fusion.addOutput(tv1);
-
-  // Make sure the kernel is compiled.
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-}
-
-// Repro of #1506
-TEST_F(NVFuserTest, FusionVectorizeContigIndex_CUDA) {
-  std::vector<int64_t> shape{14, 14};
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  auto tv2 = set(tv1);
-  fusion.addOutput(tv2);
-
-  tv2->merge(0);
-
-  // Vectorize by 4 should be allowed
-  tv2->split(0, 4);
-
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  tv0->computeAt(tv2, 1);
-
-  tv1->axis(1)->parallelize(ParallelType::Vectorize);
-  tv2->axis(1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn(shape, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  TORCH_CHECK(t0.equal(cg_outputs[0]));
-}
-
-// Make sure the same fusion as FusionVectorizeContigIndex fails if
-// not contig.
-TEST_F(NVFuserTest, FusionVectorizeContigIndexFail_CUDA) {
-  std::vector<int64_t> shape{14, 14};
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  auto tv2 = set(tv1);
-  fusion.addOutput(tv2);
-
-  tv2->merge(0);
-
-  tv2->split(0, 4);
-
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  tv0->computeAt(tv2, 1);
-
-  tv1->axis(1)->parallelize(ParallelType::Vectorize);
-  tv2->axis(1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn(shape, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-
-  // This should fail at the launch time as 14 is not divisible by the
-  // vector word size. The two domains are merged, but they are not
-  // contiguous, so contig indexing is not involved in this case.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0}));
-}
-
-TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  fusion.addOutput(tv1);
-
-  tv1->split(0, 4);
-
-  tv1->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-
-  const int n = 12;
-  auto t0 = at::randn({n}, options);
-  // Shift by one to make it non-aligned
-  auto t0_misaligned = at::randn({n + 1}, options).index({Slice(1)});
-  auto t1_misaligned = at::empty({n + 1}, options).index({Slice(1)});
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-  TORCH_CHECK(t0.equal(cg_outputs[0]));
-
-  // Pass misaligned input. This must fail.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0_misaligned}));
-
-  // Pass misaligned output. This must fail too.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0}, {t1_misaligned}));
-}
-
-// Repro of issue #1530
-TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail_CUDA) {
-  std::vector<int64_t> shape{1, 2, 1};
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(shape.size());
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  fusion.addOutput(tv1);
-
-  tv1->merge(1);
-  tv1->merge(0);
-
-  auto invalid_vec_size = shape[0] * shape[1] * shape[2];
-  invalid_vec_size *= invalid_vec_size;
-
-  tv1->split(0, invalid_vec_size);
-
-  tv1->axis(1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn(shape, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0}));
-}
-
-TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({4});
-  fusion.addInput(tv0);
-  auto tv1 = makeConcreteTensor({3, 4});
-  fusion.addInput(tv1);
-
-  auto tv2 = broadcast(tv0, {true, false});
-  auto tv3 = add(tv2, tv1);
-  fusion.addOutput(tv3);
-
-  tv3->merge(0);
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-  tv2->setMemoryType(MemoryType::Local);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({4}, options);
-  auto t1 = at::randn({3, 4}, options);
-
-  auto t3 = t0.unsqueeze(0).add(t1);
-  {
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t0, t1});
-    auto cg_outputs = fe.runFusion({t0, t1});
-
-    testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__);
-  }
-
-  // Make sure tv2 indexing also works when it's stored in global memory
-  tv2->setMemoryType(MemoryType::Global);
-  {
-    FusionExecutor fe;
-    fe.compileFusion(&fusion, {t0, t1});
-    auto cg_outputs = fe.runFusion({t0, t1});
-
-    testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__);
-  }
-}
-
-// Repro of #1534. Validation should detect invalid vectorization.
-TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail2_CUDA) {
-  std::vector<int64_t> shape1{2, 3, 2};
-  std::vector<int64_t> shape2{2, 2};
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigConcreteTensor(shape1);
-  fusion.addInput(tv0);
-  auto tv1 = makeContigConcreteTensor(shape2);
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv1);
-  auto tv3 = broadcast(tv2, {false, true, false});
-  auto tv4 = add(tv0, tv3);
-  fusion.addOutput(tv4);
-
-  tv4->merge(1, 2);
-  tv4->merge(0, 1);
-  tv4->split(0, 4);
-  TransformPropagatorWithCheck propagator(tv4);
-  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
-
-  tv0->computeAt(tv4, -2);
-  tv1->computeAt(tv4, -2);
-
-  tv2->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn(shape1, options);
-  auto t1 = at::randn(shape2, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-
-  // Vectorization of tv2 should be detected as invalid.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fe.runFusion({t0, t1}));
-}
-
-TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) {
-  std::vector<int64_t> shape1{2, 2, 2};
-  std::vector<int64_t> shape2{1, 2, 2};
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // [I0, I1, I2]
-  auto tv0 = makeContigTensor(shape1.size());
-  fusion.addInput(tv0);
-
-  // [B3, I1, I2]
-  auto tv1 = makeContigConcreteTensor(shape2);
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv1);
-  auto tv3 = add(tv0, tv2);
-  fusion.addOutput(tv3);
-
-  tv3->merge(1, 2);
-  tv3->merge(0, 1);
-  tv3->split(0, 4);
-
-  // Don't modify tv1 so that it's replayed as tv2 with actual
-  // transformations. It would create temporary IterDomains, and the
-  // validation should still be able to detect vectorization by 4 is valid.
-  // TransformPropagatorWithCheck propagator(tv3);
-  // MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-
-  tv2->merge(1, 2);
-  tv2->merge(0, 1);
-  tv2->split(0, 4);
-
-  tv2->computeAt(tv3, -2);
-
-  tv2->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn(shape1, options);
-  auto t1 = at::randn(shape2, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionVectorizeContigIndexPointwiseSchedule_CUDA) {
-  std::vector<int64_t> shape0{100, 14, 2, 14};
-  std::vector<int64_t> shape1{100, 2, 14};
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(shape0.size());
-  fusion.addInput(tv0);
-  auto tv1 = makeContigTensor(shape1.size());
-  fusion.addInput(tv1);
-
-  auto tv2 = broadcast(tv1, {false, true, false, false});
-  auto tv3 = add(tv0, tv2);
-  fusion.addOutput(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn(shape0, options);
-  auto t1 = at::randn(shape1, options);
-
-  auto lparams = schedulePointwise(&fusion, {t0, t1});
-
-  GpuLower gpulw(&fusion);
-  auto kernel = gpulw.kernel();
-
-  // The innermost two dimensions are merged and contiguous, so
-  // vectorization can be done against 2*14=28 rather than 14, so
-  // vector word size should be 4. Broadcasting of tv1 should not
-  // matter.
-  for (const auto& vec_info : kernel->summary().vectorized_set_info) {
-    TORCH_CHECK(
-        vec_info.word_size == 4,
-        "Invalid vector word size: ",
-        vec_info.word_size);
-  }
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1}, lparams);
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1.unsqueeze(-3);
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// Repro of issue #1539.
-TEST_F(NVFuserTest, FusionTrivialReductionForwarding1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {true, false});
-  auto tv2 = sum(tv1, {0});
-  auto tv3 = set(tv2);
-  fusion.addOutput(tv3);
-
-  tv2->merge(0);
-  tv2->split(0, 4);
-
-  TransformPropagatorWithCheck propagator(tv2);
-  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
-
-  // All tensors must be transformed to a 2D tensor with each axis
-  // mapped with each other in the LOOP map.
-  ComputeAtMap ca_map(&fusion);
-  for (auto tv : ir_utils::allTvs(&fusion)) {
-    TORCH_CHECK(
-        tv->nDims() == 2, "Expected to be a 2D tensor but: ", tv->toString());
-    for (const auto i : c10::irange(2)) {
-      TORCH_CHECK(ca_map.areMapped(
-          tv->axis(i), tv3->axis(i), IdMappingMode::PERMISSIVE));
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionTrivialReductionForwarding2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {true, false});
-  auto tv2 = sum(tv1, {0});
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv3);
-
-  // Merging a trivial reduction with a non-reduction domain
-  tv2->merge(0, 1);
-  tv2->split(0, 4);
-
-  tv3->split(0, 4);
-
-  // tv2 and tv3 are different as tv3 lacks the trivial reduction, but
-  // they are mapped with each other by BestEffortReplay as the merge
-  // of trivial reduciton dim is forwarded.
-
-  PairwiseRootDomainMap root_map(tv2, tv3);
-
-  auto p2c = BestEffortReplay::replayCasP(tv3, tv2, 2, root_map).getReplay();
-  for (const auto i : c10::irange(tv2->nDims())) {
-    auto tv2_id = tv2->axis(i);
-    auto it = p2c.find(tv2_id);
-    TORCH_CHECK(
-        it != p2c.end(),
-        "Expected mapped consumer ID but not found: ",
-        tv2_id->toString());
-    auto tv3_mapped_id = it->second;
-    TORCH_CHECK(
-        tv3_mapped_id == tv3->axis(i),
-        "Unexpected mapped consumer ID: ",
-        tv3_mapped_id->toString());
-  }
-
-  auto c2p = BestEffortReplay::replayPasC(tv2, tv3, 2, root_map).getReplay();
-  for (const auto i : c10::irange(tv3->nDims())) {
-    auto tv3_id = tv3->axis(i);
-    auto it = c2p.find(tv3_id);
-    TORCH_CHECK(
-        it != c2p.end(),
-        "Expected mapped producer ID but not found: ",
-        tv3_id->toString());
-    auto tv2_mapped_id = it->second;
-    TORCH_CHECK(
-        tv2_mapped_id == tv2->axis(i),
-        "Unexpected mapped consumer ID: ",
-        tv2_mapped_id->toString());
-  }
-}
-
-TEST_F(NVFuserTest, FusionTrivialReductionForwarding3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = sum(tv0, {1});
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv2);
-
-  // Similar pattern as FusionTrivialReductionForwarding2 but no
-  // trivial reduciton at the root domain
-
-  // Create a trivial reduction by splitting with a factor of 1
-  tv1->split(1, 1, false);
-  // Merging with a trivial reduction
-  tv1->merge(0, 1);
-  tv1->split(0, 5);
-
-  tv2->split(0, 5);
-
-  // While the merge of tv1 is done with a trivial reduciton, it's not
-  // a root domain, so forwarding is not enabled. BestEffortReplay
-  // should only map the first axis of each tensor.
-
-  PairwiseRootDomainMap root_map(tv1, tv2);
-  auto p2c = BestEffortReplay::replayCasP(tv2, tv1, 2, root_map).getReplay();
-  TORCH_CHECK(p2c.size() == 1, "Expected only one mapping found");
-  TORCH_CHECK(p2c.begin()->first == tv1->getRootDomain().at(0));
-  TORCH_CHECK(p2c.begin()->second == tv2->getRootDomain().at(0));
-}
-
-TEST_F(NVFuserTest, FusionTrivialReductionForwarding4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-
-  auto tv2 = broadcast(tv0, {true, false});
-  auto tv3 = add(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  // tv4 has a trivial reduction axis
-  auto tv4 = sum(tv2, {0});
-  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
-  fusion.addOutput(tv5);
-
-  tv3->merge(0, 1);
-  tv3->split(0, 32);
-
-  // This causes the trivial reduction of tv4 to be merged with
-  // another axis of tv4, and then forward computeAt is done from tv4
-  // to tv5. The split of the merged id of tv4 should be done on tv5
-  // by forwarding the merge of the trivial reduction.
-  tv0->computeAt(tv3, -1);
-
-  tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({111}, options);
-  auto t1 = at::randn({123, 111}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto t2 = t0.unsqueeze(0);
-  auto t3 = t1 + t2;
-  auto t5 = sum(t2, {0}) + 1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {t3, t5}, __LINE__, __FILE__);
-}
-
-// See issue #1598
-TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv0);
-  auto tv3 = set(tv1);
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  // Place tv2 on shared memory
-  tv2->split(0, 2);
-  tv2->split(-1, 4);
-  tv2->setMemoryType(MemoryType::Shared);
-  tv2->axis(-2)->parallelize(ParallelType::TIDy);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv3->split(0, 2);
-  tv3->split(-1, 4);
-  // swap tidx and tidy
-  tv3->axis(-2)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDy);
-
-  tv4->split(0, 2);
-  tv4->split(-1, 4);
-  tv4->axis(-2)->parallelize(ParallelType::TIDx);
-  tv4->axis(-1)->parallelize(ParallelType::TIDy);
-
-  tv0->computeAt(tv4, 1);
-  tv3->computeAt(tv4, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({10, 64}, options);
-  auto t1 = at::randn({10, 64}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// See issue #1598
-TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv0);
-  auto tv3 = set(tv1);
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  tv2->split(0, 2);
-  tv2->split(-1, 4);
-  tv2->setMemoryType(MemoryType::Shared);
-
-  tv2->axis(-2)->parallelize(ParallelType::TIDy);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  tv4->split(0, 2);
-  tv4->split(-1, 4);
-  // Also do unroll for tv3 and tv4
-  tv4->split(-2, 8, false);
-  tv4->axis(-3)->parallelize(ParallelType::Unroll);
-  // swap tidx and tidy
-  tv4->axis(-2)->parallelize(ParallelType::TIDx);
-  tv4->axis(-1)->parallelize(ParallelType::TIDy);
-
-  tv0->computeAt(tv4, 1);
-  tv3->computeAt(tv4, -1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({10, 64}, options);
-  auto t1 = at::randn({10, 64}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// See issue #1599
-TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv0);
-  auto tv3 = set(tv1);
-  auto tv4 = add(tv2, tv3);
-  fusion.addOutput(tv4);
-
-  // Use unroll where a RAW-sync tensor is stored
-
-  tv4->split(0, 2);
-  tv4->split(0, 3);
-  tv4->split(-1, 4);
-  tv4->axis(1)->parallelize(ParallelType::Unroll);
-  tv4->axis(-2)->parallelize(ParallelType::TIDx);
-  tv4->axis(-1)->parallelize(ParallelType::TIDy);
-
-  tv0->computeAt(tv4, 3);
-  tv3->computeAt(tv4, -1);
-
-  tv2->split(-1, 4);
-  tv2->axis(-2)->parallelize(ParallelType::TIDy);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->setMemoryType(MemoryType::Shared);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({50, 64}, options);
-  auto t1 = at::randn({50, 64}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// See #1618
-TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({16, 128});
-  auto tv1 = makeConcreteTensor({16, 128});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv0);
-  auto tv3 = set(tv1);
-  auto tv4 = set(tv2);
-  auto tv5 = set(tv3);
-  auto tv6 = add(tv4, tv5);
-  fusion.addOutput(tv6);
-
-  tv2->setMemoryType(MemoryType::Shared);
-  tv3->setMemoryType(MemoryType::Shared);
-
-  tv2->computeAt(tv6, 0);
-  tv3->computeAt(tv6, 1);
-  tv4->computeAt(tv6, 1);
-  tv5->computeAt(tv6, -1);
-  tv2->split(1, 64);
-  tv3->split(1, 64);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv3->axis(-1)->parallelize(ParallelType::TIDx);
-  tv6->axis(-1)->parallelize(ParallelType::TIDx);
-
-  // Check the block sync is inserted at the correct location.
-  //  There is exactly one block sync needed in this test case
-  //    and the sync needs to be after the 2 expressions
-  //    that modify shared memory.
-  class SyncInsertionPointChecker : public kir::IrVisitor {
-   public:
-    using kir::IrVisitor::handle;
-
-   private:
-    void handle(UnaryOp* uop) final {
-      // Record number of unary ops that modifies shared memory.
-      if (uop->out()->isA<kir::TensorIndex>() &&
-          uop->out()->as<kir::TensorIndex>()->view()->getMemoryType() ==
-              MemoryType::Shared &&
-          // Filter out initialization expressions
-          uop->in()->isA<kir::TensorIndex>()) {
-        number_of_writes_++;
-      }
-    }
-    void handle(kir::BlockSync* bsync) final {
-      // Make sure both shared memory modifying expressions
-      //  have been observed at the sync insertion point.
-      TORCH_INTERNAL_ASSERT(
-          number_of_writes_ == 2,
-          "FusionRAWSyncInsertionPlace4 test fail:",
-          "only 1 sync after the 2 shared mem writes is needed in this test,"
-          "either a redundant sync has been inserted or the block sync is not inserted at the right place");
-    }
-
-   private:
-    int number_of_writes_ = 0;
-  } sync_insertion_checker;
-  GpuLower gpulw(&fusion);
-  sync_insertion_checker.handle(gpulw.kernel()->topLevelExprs());
-}
-
-// Test serial write and parallel read of shared mem: mapped case
-TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({128, 6});
-  TensorView* tv1 = makeConcreteTensor({128, 6});
-  TensorView* tv2 = makeConcreteTensor({128, 6});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addInput(tv2);
-
-  TensorView* tv3 = add(tv0, tv1);
-  TensorView* tv4 = add(tv3, tv2);
-
-  fusion.addOutput(tv4);
-
-  //  Use shared memory
-  tv3->setMemoryType(MemoryType::Shared);
-
-  // Parallelize t4, in this case dim 0 on tv3 will
-  //  not be parallelized but dim0 of t4 will be.
-  // We will need to make sure a sync is inserted
-  //  even if these dimensions are mapped.
-  tv4->axis(0)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({128, 6}, options);
-  at::Tensor t1 = at::randn({128, 6}, options);
-  at::Tensor t2 = at::randn({128, 6}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1, t2});
-  auto cg_outputs = fe.runFusion({t0, t1, t2});
-
-  auto ref = t0 + t1 + t2;
-
-  testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__);
-}
-
-// Test serial write and parallel read of shared mem: un-mapped case
-TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({128, 6});
-  TensorView* tv1 = makeConcreteTensor({128, 6});
-  TensorView* tv2 = makeConcreteTensor({128, 6});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  fusion.addInput(tv2);
-
-  TensorView* tv3 = add(tv0, tv1);
-  TensorView* tv4 = add(tv3, tv2);
-
-  fusion.addOutput(tv4);
-
-  //  Use shared memory
-  tv3->setMemoryType(MemoryType::Shared);
-
-  // Split and parallelize t4,
-  //  the parallelized dimension in t4 will not
-  // map across to the shared mem tensor, t3. So
-  // there will need to be a sync before use of t3.
-  tv4->split(0, 2);
-  tv4->axis(0)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({128, 6}, options);
-  at::Tensor t1 = at::randn({128, 6}, options);
-  at::Tensor t2 = at::randn({128, 6}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1, t2});
-  auto cg_outputs = fe.runFusion({t0, t1, t2});
-
-  auto ref = t0 + t1 + t2;
-
-  testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__);
-}
-
-// Simple test of async copy primitive
-TEST_F(NVFuserTest, FusionSimpleCpAsync_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int m = 33, n = 31;
-
-  TensorView* tv0 = makeConcreteTensor({m, n});
-  TensorView* tv1 = makeConcreteTensor({m, n});
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv0, tv1);
-
-  fusion.addOutput(tv2);
-
-  auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
-  tv0_shared->setMemoryType(MemoryType::Shared);
-
-  tv0->computeAt(tv2, 1);
-  tv0_shared->axis(1)->parallelize(ParallelType::TIDx);
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({m, n}, options);
-  at::Tensor t1 = at::randn({m, n}, options);
-
-  FusionExecutor fe;
-
-  // requires ampere+ GPU
-  if (!deviceMajorMinorCheck(8)) {
-    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
-    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
-  }
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// Simple test of async copy primitive: double buffered
-//   Double buffer case 1, both block sync and async wait
-//  are needed.
-TEST_F(NVFuserTest, FusionDoubleBufferCpAsync1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Using vectorization so need to keep n multiple of 4.
-  int m = 33, n = 48;
-
-  TensorView* tv0 = makeConcreteTensor({m, n});
-  TensorView* tv1 = makeConcreteTensor({m, n});
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv0, tv1);
-
-  fusion.addOutput(tv2);
-
-  auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
-  tv0_shared->setMemoryType(MemoryType::Shared);
-  tv0->computeAt(tv2, 1);
-
-  // Asynchronously load a tile in one schedule
-  tv0_shared->split(1, 4);
-  tv0_shared->axis(-1)->parallelize(ParallelType::Vectorize);
-  tv0_shared->axis(-2)->parallelize(ParallelType::TIDx);
-
-  // Consume the loaded tile in another schedule,
-  //   triggering the need for a sync.
-  tv2->split(1, 12);
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-
-  // Double buffer the shared mem tensor.
-  tv0_shared->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({m, n}, options);
-  at::Tensor t1 = at::randn({m, n}, options);
-
-  FusionExecutor fe;
-  // requires ampere+ GPU
-  if (!deviceMajorMinorCheck(8)) {
-    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
-    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
-  }
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// Simple test of async copy primitive: double buffered
-//   Double buffer case 2, only async wait is needed
-TEST_F(NVFuserTest, FusionDoubleBufferCpAsync2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Using vectorization so need to keep n multiple of 4.
-  int m = 33, n = 48;
-
-  TensorView* tv0 = makeConcreteTensor({m, n});
-  TensorView* tv1 = makeConcreteTensor({m, n});
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv0, tv1);
-
-  fusion.addOutput(tv2);
-
-  auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
-  tv0_shared->setMemoryType(MemoryType::Shared);
-  tv0->computeAt(tv2, 1);
-
-  // Asynchronously load a tile in one schedule
-  tv0_shared->split(1, 4);
-  tv0_shared->axis(-2)->parallelize(ParallelType::TIDx);
-
-  // Consume the loaded tile in another schedule,
-  //   triggering the need for a sync.
-  tv2->split(1, 4);
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-
-  // Double buffer the shared mem tensor.
-  tv0_shared->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({m, n}, options);
-  at::Tensor t1 = at::randn({m, n}, options);
-
-  FusionExecutor fe;
-  // requires ampere+ GPU
-  if (!deviceMajorMinorCheck(8)) {
-    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
-    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
-  }
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// Simple test for double buffer in shared mem,
-//  where we should not insert redundant syncs when
-//  they are not needed.
-TEST_F(NVFuserTest, FusionDoubleBufferNoSync_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Using vectorization so need to keep n multiple of 4.
-  int m = 33, n = 48;
-
-  TensorView* tv0 = makeConcreteTensor({m, n});
-  TensorView* tv1 = makeConcreteTensor({m, n});
-
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  TensorView* tv2 = add(tv0, tv1);
-
-  fusion.addOutput(tv2);
-
-  auto tv0_shared = tv0->cacheAfter();
-  tv0_shared->setMemoryType(MemoryType::Shared);
-  tv0->computeAt(tv2, 1);
-
-  // Asynchronously load a tile in one schedule
-  tv0_shared->split(1, 4);
-  tv0_shared->axis(-2)->parallelize(ParallelType::TIDx);
-
-  // Consume the loaded tile in another schedule,
-  //   triggering the need for a sync.
-  tv2->split(1, 4);
-  tv2->axis(-2)->parallelize(ParallelType::TIDx);
-
-  // Double buffer the shared mem tensor.
-  tv0_shared->doubleBuffer();
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({m, n}, options);
-  at::Tensor t1 = at::randn({m, n}, options);
-
-  GpuLower gpulw(&fusion);
-  auto flattened_exprs =
-      ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs());
-  bool sync_inserted = std::any_of(
-      flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) {
-        return expr->isA<kir::BlockSync>();
-      });
-  TORCH_INTERNAL_ASSERT(!sync_inserted, "Un-expected block sync inserted");
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// Test predicate inversion for cp.async
-TEST_F(NVFuserTest, FusionCpAsyncPredicate_CUDA) {
-  // requires ampere+ GPU
-
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Using vectorization so need to keep n multiple of 4.
-  int m = 33, n = 48;
-
-  TensorView* tv0 = makeConcreteTensor({m, n});
-
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1});
-  fusion.addOutput(tv1);
-
-  auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
-  auto tv0_reg = tv0_shared->cacheAfter();
-  tv0_shared->setMemoryType(MemoryType::Shared);
-  tv0->computeAt(tv1, 1);
-
-  tv0_shared->split(-1, 32);
-  tv0_shared->split(-1, 4);
-  tv0_shared->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({m, n}, options);
-
-  FusionExecutor fe;
-  if (!deviceMajorMinorCheck(8)) {
-    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0}));
-    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
-  }
-
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0.sum({1});
-
-  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Test predicate removal on reg-to-reg expressions
-TEST_F(NVFuserTest, FusionPredRemovalCheck_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeContigTensor(2);
-  fusion.addInput(tv0);
-
-  TensorView* tv1 = set(tv0);
-  TensorView* tv2 = set(tv1);
-  TensorView* tv3 = set(tv2);
-  TensorView* tv4 = set(tv3);
-
-  fusion.addOutput(tv4);
-  tv4->split(1, 4);
-  tv0->computeAt(tv4, -2);
-  tv3->axis(-1)->parallelize(ParallelType::Vectorize);
-
-  class PredicateRemovalChecker : public kir::IrVisitor {
-   public:
-    using kir::IrVisitor::handle;
-
-   private:
-    void handle(UnaryOp* uop) final {
-      assertOnLocalToLocal(uop);
-    }
-
-    // Utility to assert any local-to-local expr is only trivially predicated.
-    void assertOnLocalToLocal(Expr* expr) {
-      bool is_local = true;
-      for (auto in : ir_utils::filterByType<kir::TensorIndex>(expr->inputs())) {
-        if (in->view()->getMemoryType() != MemoryType::Local) {
-          is_local = false;
-        }
-      }
-      for (auto in :
-           ir_utils::filterByType<kir::TensorIndex>(expr->outputs())) {
-        if (in->view()->getMemoryType() != MemoryType::Local) {
-          is_local = false;
-        }
-      }
-
-      if (is_local) {
-        if (auto ite = dynamic_cast<kir::IfThenElse*>(scope_exprs_.back())) {
-          TORCH_INTERNAL_ASSERT(
-              ite->predicate()->value()->isConst(),
-              "redundant predicate on: ",
-              expr);
-        }
-      }
-    }
-
-   private:
-    bool within_ite_ = false;
-  } pred_checker;
-
-  GpuLower gpulw(&fusion);
-  pred_checker.handle(gpulw.kernel()->topLevelExprs());
-}
-
-TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tvs = Welford(tv0, {0});
-  auto tv_avg = tvs.avg;
-  fusion.addOutput(tv_avg);
-
-  tv_avg->split(0, 128);
-  TransformPropagatorWithCheck propagator(tv_avg);
-  MaxRootDomainInfoSpanningTree(tv_avg).traverse(&propagator);
-
-  tv_avg->axis(0)->parallelize(ParallelType::BIDx);
-  tv_avg->axis(1)->parallelize(ParallelType::TIDx);
-
-  // Make sure the parallelization of tv_avg is propagated to the var
-  // and count tensors.
-  GpuLower gpulw(&fusion);
-  for (const auto expr : gpulw.kernel()->exprs()) {
-    auto wop = dynamic_cast<WelfordOp*>(expr);
-    if (wop == nullptr) {
-      continue;
-    }
-    auto ref = wop->outAvg()->as<TensorView>();
-    for (auto sibling : ir_utils::filterByType<TensorView>(wop->outputs())) {
-      if (ref == sibling) {
-        continue;
-      }
-      TORCH_CHECK(
-          ref->nDims() == sibling->nDims(),
-          "Invalid sibling: ",
-          sibling->toString());
-      for (const auto i : c10::irange(ref->nDims())) {
-        TORCH_CHECK(
-            ref->axis(i)->getParallelType() ==
-                sibling->axis(i)->getParallelType(),
-            "Mismatched parallel types between siblings. ",
-            ref->toString(),
-            ", ",
-            sibling->toString());
-      }
-    }
-  }
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  at::Tensor t0 = at::randn({9999}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto outputs = fe.runFusion({t0});
-
-  testValidate(fe.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__);
-}
-
-// Test ExactRootDomainMap
-TEST_F(NVFuserTest, FusionExactRootDomainMap_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-
-  auto tv2 = broadcast(tv0, {false, true});
-  auto tv3 = transpose(tv2);
-  auto tv4 = add(tv2, tv1);
-  auto tv5 = add(tv2, tv3);
-  auto tv6 = add(tv3, tv1);
-  fusion.addOutput(tv4);
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv6);
-
-  const auto exact_map = ExactRootDomainMap(&fusion);
-
-  // In the exact mapping, the broadcast domain introduced at tv2 is
-  // only mapped with the another one in tv3, which is just transposed
-  // from tv2. Any other domain, including the second domain of tv4,
-  // must not be mapped.
-
-  auto tv2_bc = tv2->axis(1);
-  auto tv3_bc = tv3->axis(0);
-
-  TORCH_CHECK(
-      exact_map.areMapped(tv2_bc, tv3_bc),
-      "Invalid exact root domain map: ",
-      exact_map.toString());
-
-  // They must not be mapped with anything else.
-  for (auto tv : ir_utils::allTvs(&fusion)) {
-    for (auto root_id : tv->getRootDomain()) {
-      if (root_id == tv2_bc || root_id == tv3_bc) {
-        continue;
-      }
-      TORCH_CHECK(
-          !exact_map.areMapped(root_id, tv2_bc),
-          "Invalid exact root domain map: ",
-          exact_map.toString());
-      TORCH_CHECK(
-          !exact_map.areMapped(root_id, tv3_bc),
-          "Invalid exact root domain map: ",
-          exact_map.toString());
-    }
-  }
-}
-
-class NVFuserMultithreadedTest : public ::testing::Test {
- protected:
-  bool was_enabled = false;
-
-  void SetUp() override {
-    was_enabled = fuser::cuda::setEnabled(true);
-  }
-
-  void TearDown() override {
-    fuser::cuda::setEnabled(was_enabled);
-  }
-};
-
-TEST_F(NVFuserMultithreadedTest, SingleFunction_CUDA) {
-  std::string ir = R"IR(
-graph(%x.1 : Tensor,
-      %y.1 : Tensor):
-  %12 : NoneType = prim::Constant()
-  %11 : bool = prim::Constant[value=0]()
-  %9 : int = prim::Constant[value=1]()
-  %3 : Tensor = aten::exp(%x.1)
-  %5 : Tensor = aten::relu(%y.1)
-  %6 : Tensor = aten::sin(%5)
-  %8 : Tensor = aten::add(%3, %6, %9)
-  %10 : int[] = prim::ListConstruct(%9)
-  %13 : Tensor = aten::sum(%8, %10, %11, %12)
-  return (%13)
-)IR";
-  auto g = std::make_shared<Graph>();
-  torch::jit::parseIR(ir, g.get());
-  GraphFunction fn("nvfuser_test", g, nullptr);
-
-  auto run_kernel = [&fn]() {
-    auto x = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
-    auto y = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
-    std::vector<IValue> results;
-    for (const auto& _ : c10::irange(10)) {
-      auto stack = createStack({x.clone(), y.clone()});
-      fn.run(stack);
-      results.push_back(stack.back());
-    }
-    for (const auto& i : c10::irange(1, 10)) {
-      auto t0 = results[0].toTensor();
-      auto ti = results[i].toTensor();
-      ASSERT_TRUE(at::allclose(t0, ti));
-    }
-  };
-
-  constexpr size_t kNumThreads = 4;
-  std::vector<std::thread> threads;
-  for (size_t id = 0; id < kNumThreads; ++id) {
-    threads.emplace_back(run_kernel);
-  }
-  for (auto& t : threads) {
-    t.join();
-  }
-}
-
-TEST_F(NVFuserMultithreadedTest, MultipleFunctions_CUDA) {
-  auto run_kernel = []() {
-    const std::string ir = R"IR(
-  graph(%x.1 : Tensor,
-        %y.1 : Tensor):
-    %12 : NoneType = prim::Constant()
-    %11 : bool = prim::Constant[value=0]()
-    %9 : int = prim::Constant[value=1]()
-    %3 : Tensor = aten::exp(%x.1)
-    %5 : Tensor = aten::relu(%y.1)
-    %6 : Tensor = aten::sin(%5)
-    %8 : Tensor = aten::add(%3, %6, %9)
-    %10 : int[] = prim::ListConstruct(%9)
-    %13 : Tensor = aten::sum(%8, %10, %11, %12)
-    return (%13)
-  )IR";
-    auto g = std::make_shared<Graph>();
-    torch::jit::parseIR(ir, g.get());
-    GraphFunction fn("nvfuser_test", g, nullptr);
-
-    auto x = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
-    auto y = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
-    std::vector<IValue> results;
-    constexpr size_t numRuns = 10;
-    for (const auto& _ : c10::irange(numRuns)) {
-      auto stack = createStack({x.clone(), y.clone()});
-      fn.run(stack);
-      results.push_back(stack.back());
-    }
-    for (const auto& i : c10::irange(1, numRuns)) {
-      auto t0 = results[0].toTensor();
-      auto ti = results[i].toTensor();
-      ASSERT_TRUE(at::allclose(t0, ti));
-    }
-  };
-
-  constexpr size_t kNumThreads = 4;
-  std::vector<std::thread> threads;
-  for (size_t id = 0; id < kNumThreads; ++id) {
-    threads.emplace_back(run_kernel);
-  }
-  for (auto& t : threads) {
-    t.join();
-  }
-}
-
-// Repro of issue #1655
-TEST_F(NVFuserTest, FusionIncompleteConcreteID_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion.addInput(tv0);
-  auto tv1 = makeSymbolicTensor(2);
-  fusion.addInput(tv1);
-  auto tv2 = makeSymbolicTensor(2);
-  fusion.addInput(tv2);
-
-  auto tv3 = broadcast(tv0, {true, true, false});
-  auto tv4 = broadcast(tv1, {false, true, false});
-  auto tv5 = broadcast(tv2, {true, false, false});
-
-  auto tv6 = add(tv3, tv4);
-  auto tv7 = add(tv3, tv5);
-
-  fusion.addOutput(tv6);
-  fusion.addOutput(tv7);
-
-  tv6->merge(0);
-  tv6->merge(0);
-
-  TransformPropagatorWithCheck propagator(tv6);
-  MaxRootDomainInfoSpanningTree(tv6).traverse(&propagator);
-
-  tv0->computeAt(tv6, -1, ComputeAtMode::MostInlined);
-  tv1->computeAt(tv6, -1, ComputeAtMode::MostInlined);
-  tv2->computeAt(tv7, -1, ComputeAtMode::MostInlined);
-
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(fusion.printKernel());
-}
-
-TEST_F(NVFuserTest, FusionTestReEntrantGridWelford_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr.get();
-  FusionGuard fg(&fusion);
-
-  int X = 256, Y = 7, Z = 2048;
-
-  // setup fusion
-  auto tv0 = makeContigTensor(4, DataType::Half);
-  fusion.addInput(tv0);
-  auto tv1 = castOp(DataType::Float, tv0);
-
-  auto tvs = Welford(tv1, {0, 1, 2});
-  auto tv_avg = tvs.avg;
-  auto tv_M2 = tvs.var_sum;
-  auto tv_N = tvs.n;
-  fusion.addOutput(tv_avg);
-  fusion.addOutput(tv_M2);
-
-  auto cached_input = tv0->cacheAfter();
-  auto cached_avg = tv_avg->cacheBefore();
-  auto cached_M2 = tv_M2->cacheBefore();
-
-  auto reduction_tv = scheduler_utils::getReductionTvs(&fusion)[0];
-
-  reduction_tv->merge(0);
-  reduction_tv->merge(0);
-
-  int TIDx = 16;
-  int vec = 4;
-
-  int TIDy = 16;
-  int outer_tidy_fact = 16;
-
-  reduction_tv->split(-1, TIDx * vec);
-  reduction_tv->split(-1, vec);
-  reduction_tv->axis(-2)->parallelize(ParallelType::TIDx);
-  reduction_tv->axis(-1)->parallelize(ParallelType::Vectorize);
-  reduction_tv->axis(-3)->parallelize(ParallelType::BIDx);
-
-  reduction_tv->split(0, TIDy);
-  reduction_tv->axis(1)->parallelize(ParallelType::TIDy);
-  reduction_tv->split(0, outer_tidy_fact);
-  reduction_tv->axis(0)->parallelize(ParallelType::BIDy);
-
-  // T2_g[ rblockIdx.y, rS{16}, rthreadIdx.y, iblockIdx.x, ithreadIdx.x24,
-  // iV25{4} ]
-  reduction_tv->reorder({{3, 0}, {4, 1}, {0, 2}, {2, 3}, {1, 4}, {5, 5}});
-  // T2_g[iblockIdx.x, ithreadIdx.x24, rblockIdx.y, rthreadIdx.y, rS{16},
-  // iV25{4}]
-
-  TransformPropagatorWithCheck propagator(reduction_tv);
-  MaxRootDomainInfoSpanningTree(reduction_tv).traverse(&propagator);
-  auto rfactor_tv = ir_utils::rfactorHelper(reduction_tv, {4});
-  scheduler_utils::parallelizeAllLike(rfactor_tv);
-
-  tv0->computeAt(tv_avg, 2);
-  tv0->computeAt(cached_input, -2);
-
-  cached_input->computeAt(rfactor_tv, 4, ComputeAtMode::BestEffort);
-
-  for (auto tv : ir_utils::allTvs(&fusion)) {
-    if (tv == cached_input || tv == tv_avg || tv == tv_M2) {
-      continue;
-    }
-    tv->axis(-1)->parallelize(ParallelType::Serial);
-  }
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {}, LaunchParams());
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({X, Y, Y, Z}, options);
-
-  auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, -1, -1, -1, -1, -1));
-
-  // by default Welford outputs sum of square diff so need to divide to get var
-  cg_outputs[1] = cg_outputs[1].div((float)(X * Y * Y));
-
-  auto at_mu = at::mean(t0.to(at::kDouble), {0, 1, 2});
-  auto at_var = at::var(t0.to(at::kDouble), {0, 1, 2}, false);
-
-  testValidate(
-      &fusion,
-      cg_outputs,
-      {t0},
-      {at_mu, at_var},
-      __LINE__,
-      __FILE__,
-      "",
-      LaunchParams(-1, -1, -1, -1, -1, -1));
-}
-
-// Test sync insertion with redundant predicates
-TEST_F(NVFuserTest, FusionRedundantPredSync_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({32});
-  TensorView* tv1 = makeConcreteTensor({32, 32});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = broadcast(tv0, {true, false});
-  auto tv3 = add(tv2, tv1);
-
-  fusion.addOutput(tv3);
-
-  auto tv0c = tv0->cacheAfter();
-
-  // Make a redundant write through smem
-  tv0c->setMemoryType(MemoryType::Shared);
-
-  tv0->computeAt(tv3, 0);
-  tv1->computeAt(tv3, 0);
-
-  tv0c->axis(0)->parallelize(ParallelType::TIDx);
-  tv2->axis(0)->parallelize(ParallelType::TIDy);
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(0)->parallelize(ParallelType::TIDy);
-  tv3->axis(1)->parallelize(ParallelType::TIDx);
-
-  GpuLower gpulw(&fusion);
-  auto flattened_exprs =
-      ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs());
-  bool sync_inserted = std::any_of(
-      flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) {
-        return expr->isA<kir::BlockSync>();
-      });
-  TORCH_INTERNAL_ASSERT(sync_inserted, "Expected block sync not inserted");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({32}, options);
-  at::Tensor t1 = at::randn({32, 32}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// Test case for removing syncs on chain of redundant uses.
-TEST_F(NVFuserTest, FusionRedundantPredSync2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({32});
-  TensorView* tv1 = makeConcreteTensor({32, 32});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = broadcast(tv0, {true, false});
-  auto tv3 = add(tv2, tv1);
-
-  fusion.addOutput(tv3);
-
-  auto tv0c = tv0->cacheAfter();
-
-  // Make a redundant write through smem
-  tv0c->setMemoryType(MemoryType::Shared);
-  tv2->setMemoryType(MemoryType::Shared);
-
-  tv0->computeAt(tv3, 0);
-  tv1->computeAt(tv3, 0);
-
-  tv0c->axis(0)->parallelize(ParallelType::TIDx);
-  tv2->axis(0)->parallelize(ParallelType::TIDy);
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(0)->parallelize(ParallelType::TIDy);
-  tv3->axis(1)->parallelize(ParallelType::TIDx);
-
-  // Utility class to make sure one block sync
-  //  is inserted by RAW pass.
-  class SyncChecker : public kir::IrVisitor {
-   public:
-    using kir::IrVisitor::handle;
-    int result() {
-      return sync_seen_;
-    }
-
-   private:
-    void handle(kir::BlockSync*) final {
-      sync_seen_++;
-    }
-
-   private:
-    int sync_seen_ = 0;
-  } checker;
-
-  GpuLower gpulw(&fusion);
-  checker.handle(gpulw.kernel()->topLevelExprs());
-  TORCH_INTERNAL_ASSERT(
-      checker.result() < 2, "More syncs were inserted than expected");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({32}, options);
-  at::Tensor t1 = at::randn({32, 32}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
-}
-
-// Test case for sync insertion after redundant predicated smem write
-//  Check that syncs are removed only when all paths are redundant.
-TEST_F(NVFuserTest, FusionRedundantPredSync3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({32});
-  TensorView* tv1 = makeConcreteTensor({32, 32});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = broadcast(tv0, {true, false});
-  auto tv3 = set(tv2);
-  auto tv4 = add(tv3, tv1);
-  auto tv5 = add(tv2, tv1);
-
-  fusion.addOutput(tv4);
-  fusion.addOutput(tv5);
-
-  auto tv0c = tv0->cacheAfter();
-
-  // In this scheduling config,
-  //  tv0c -> tv2 -> tv3 is a redundant path for tidy
-  //  tv0c -> tv2 -> tv5 is not.
-  //  So we need a RAW sync in tv0c->tv2 to make sure
-  //  tv2 has the correct value to produce tv5.
-  tv0c->setMemoryType(MemoryType::Shared);
-  tv3->setMemoryType(MemoryType::Shared);
-
-  tv0c->axis(0)->parallelize(ParallelType::TIDx);
-  tv2->axis(0)->parallelize(ParallelType::TIDy);
-  tv2->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv3->axis(0)->parallelize(ParallelType::TIDy);
-  tv3->axis(1)->parallelize(ParallelType::TIDx);
-
-  tv5->axis(0)->parallelize(ParallelType::TIDy);
-  tv5->axis(1)->parallelize(ParallelType::TIDx);
-
-  // Utility class to make sure one block sync
-  //  is inserted by RAW pass.
-  class SyncChecker : public kir::IrVisitor {
-   public:
-    using kir::IrVisitor::handle;
-    int result() {
-      return sync_seen_;
-    }
-
-   private:
-    void handle(kir::BlockSync* sync) final {
-      if (!sync->isWarHazardSync()) {
-        sync_seen_++;
-      }
-    }
-
-   private:
-    int sync_seen_ = 0;
-  } checker;
-
-  GpuLower gpulw(&fusion);
-  checker.handle(gpulw.kernel()->topLevelExprs());
-
-  // This is implicit checking. There are exactly 2 places
-  //  where RAW hazards happen: one producing tv2 and the other
-  //  producing tv3. This test case expect syncs in both of
-  //  these places so we check that 2 RAW syncs are inserted.
-  TORCH_INTERNAL_ASSERT(
-      checker.result() == 2,
-      "Exactly 2 RAW sync expected for the two shared memory transfers");
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({32}, options);
-  at::Tensor t1 = at::randn({32, 32}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-
-  auto ref = t0 + t1;
-
-  testValidate(&fusion, cg_outputs, {t0, t1}, {ref, ref}, __LINE__, __FILE__);
-}
-
-// Unit test case for detecting thread redundant usage of shared tensors.
-TEST_F(NVFuserTest, FusionRedundantUseCheck_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeConcreteTensor({32, 32});
-  fusion.addInput(tv0);
-
-  auto tv1 = set(tv0);
-  auto tv2 = set(tv1);
-  auto tv3 = set(tv2);
-  auto tv4 = set(tv3);
-
-  auto tv5 = set(tv4);
-
-  auto tv6 = set(tv4);
-  auto tv7 = set(tv6);
-
-  fusion.addOutput(tv5);
-  fusion.addOutput(tv7);
-
-  tv2->setMemoryType(MemoryType::Shared);
-  tv4->setMemoryType(MemoryType::Shared);
-
-  tv7->axis(-1)->parallelize(ParallelType::TIDx);
-
-  // Thread pred map cannot be built without an active lower
-  //  object. So would need to lower the whole fusion for
-  //  testing. However, lower also keeps an copy of the fusion
-  //  so the original pointers cannot be used to querry the
-  //  thread pred map. So have to traverse the new expr list
-  //  to find the pointers;
-  GpuLower gpulw(&fusion);
-
-  TensorView *lowered_tv2 = nullptr, *lowered_tv4 = nullptr;
-  auto used_vals = gpulw.kernel()->usedMathVals();
-
-  for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
-    if (tv->name() == 2) {
-      lowered_tv2 = tv;
-    }
-    if (tv->name() == 4) {
-      lowered_tv4 = tv;
-    }
-  }
-
-  TORCH_INTERNAL_ASSERT(
-      lowered_tv2 != nullptr && lowered_tv4 != nullptr,
-      "tv2 or tv4 not lowered or mangled");
-
-  auto tv2_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv2);
-  auto tv4_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv4);
-
-  // tv2 -> tv3 -> tv4 (shared) is the only use chain for tv2,
-  //  and tv4 is redundantly written in tidx so tv2 is redundantly
-  //  consumed in tidx.
-  TORCH_INTERNAL_ASSERT(
-      tv2_info.redundant_use_types.get(ParallelType::TIDx),
-      "TV2 is redundantly used but not detected.");
-
-  // tv4->tv5 (global) is a redundant use chain, but
-  // tv4->tv6->tv7 is not, so tv4 should not be detected as
-  // a redundant used tensor in tidx.
-  TORCH_INTERNAL_ASSERT(
-      !tv4_info.redundant_use_types.get(ParallelType::TIDx),
-      "TV4 is not redundantly used but not detected.");
-}
-
-// Test a basic swizzle pattern
-TEST_F(NVFuserTest, FusionSimpleSwizzle0_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 32});
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv2);
-
-  // Make a 2x8 Zshape tile
-  tv1->split(-1, 16);
-  tv1->split(-1, 8);
-  // [O, 2, 8]
-
-  tv2->split(-1, 16);
-  tv2->split(-1, 4);
-  //[O, 4, 4]
-
-  tv1->computeAt(tv2, 1);
-  tv1->swizzle(Swizzle2DType::ZShape, -2, -1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({2, 32}, options);
-  auto t2 = t0 + 2.0;
-  auto cg_outputs = fe.runFusion({t0});
-
-  testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
-}
-
-// Test swizzle inlining
-TEST_F(NVFuserTest, FusionSimpleSwizzle1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 32});
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv3);
-
-  // Make a 2x8 Zshape tile
-  tv2->split(-1, 16);
-  tv2->split(-1, 8);
-  // [O, 2, 8]
-
-  tv3->split(-1, 16);
-  tv3->split(-1, 4);
-  //[O, 4, 4]
-
-  tv2->computeAt(tv3, 1);
-  tv2->swizzle(Swizzle2DType::ZShape, -2, -1);
-
-  // Inlining a producer into a swizzled consumer is ok
-  tv1->computeAt(tv2, -1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({2, 32}, options);
-  auto t3 = t0 + 3.0;
-  auto cg_outputs = fe.runFusion({t0});
-
-  testValidate(&fusion, cg_outputs, {t0}, {t3}, __LINE__, __FILE__);
-}
-
-// Test sync insertion and memory check in parallelized swizzles.
-//  In this test, data is parallel written into smem in zcurve
-//   pattern and then read out and output to global mem unswizzled.
-TEST_F(NVFuserTest, FusionSimpleSwizzle2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({32, 32});
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv2);
-
-  tv1->swizzle(Swizzle2DType::ZShape, -2, -1);
-
-  tv1->axis(0)->parallelize(ParallelType::TIDx);
-  tv1->axis(1)->parallelize(ParallelType::TIDy);
-
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  tv2->axis(1)->parallelize(ParallelType::TIDy);
-
-  // Validation should fail since TV1 is not in shared
-  //  memory as required by sync info pass.
-  ASSERT_ANY_THROW(GpuLower gpulw_throw(&fusion));
-
-  tv1->setMemoryType(MemoryType::Shared);
-
-  // Make sure that a sync is inserted:
-  bool sync_found = false;
-  GpuLower gpu_lw(&fusion);
-  auto flattened_exps =
-      ir_utils::flattenScopedExprs(gpu_lw.kernel()->topLevelExprs());
-
-  for (auto expr : flattened_exps) {
-    if (expr->isA<kir::BlockSync>()) {
-      sync_found = true;
-    }
-    // Will require a sync thread before any shared memory read.
-    for (auto inp_tv : ir_utils::filterByType<TensorView>(expr->inputs())) {
-      if (inp_tv->getMemoryType() == MemoryType::Shared) {
-        TORCH_INTERNAL_ASSERT(
-            sync_found, "Block sync required but not inserted");
-      }
-    }
-  }
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({32, 32}, options);
-  auto t2 = t0 + 2.0;
-  auto cg_outputs = fe.runFusion({t0});
-
-  testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
-}
-
-// Test BestEffortReplay behavior with swizzle op
-TEST_F(NVFuserTest, FusionSwizzleMapping_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 32});
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv3);
-
-  // Make a 2x8 Zshape tile
-  tv2->split(-1, 16);
-  tv2->split(-1, 8);
-  // [O, 2, 8]
-
-  tv3->split(-1, 16);
-  tv3->split(-1, 4);
-  //[O, 4, 4]
-
-  tv2->computeAt(tv3, 1);
-  tv2->swizzle(Swizzle2DType::ZShape, -2, -1);
-
-  // Inlining a producer into a swizzled consumer is ok
-  tv1->computeAt(tv2, -1);
-
-  // Check BestEffortReplay behavior with skip swizzles option on.
-  PairwiseRootDomainMap root_map(tv1, tv2);
-
-  // Check producer to consumer map,
-  //  i.e. unswizzled tensor to swizzled tensor map
-  //----------------------------------------------------------
-  auto p2c = BestEffortReplay::replayCasP(tv2, tv1, -1, root_map).getReplay();
-  auto swizzle_x_it0 = p2c.find(tv1->axis(-2));
-  auto swizzle_y_it0 = p2c.find(tv1->axis(-1));
-  // P2C map should exist and both the x and y map should
-  //  map to the output of the swizzle op.
-  TORCH_INTERNAL_ASSERT(
-      swizzle_x_it0 != p2c.end() && swizzle_y_it0 != p2c.end());
-  TORCH_INTERNAL_ASSERT(
-      swizzle_x_it0->second == tv2->axis(-2) &&
-      swizzle_y_it0->second == tv2->axis(-1));
-
-  // Check consumer to producer map,
-  //  i.e. swizzled tensor to unswizzled tensor map
-  //----------------------------------------------------------
-  auto c2p = BestEffortReplay::replayPasC(tv1, tv2, -1, root_map).getReplay();
-
-  auto swizzle_op = tv2->axis(-1)->definition()->as<Swizzle2D>();
-
-  // Find mapping for swizzle inputs
-  auto swizzle_x_it1 = c2p.find(swizzle_op->inX());
-  auto swizzle_y_it1 = c2p.find(swizzle_op->inY());
-
-  // Find mapping for swizzle outputs
-  auto swizzle_x_it2 = c2p.find(swizzle_op->outX());
-  auto swizzle_y_it2 = c2p.find(swizzle_op->outY());
-
-  // Input of swizzle ops will not be mapped to any
-  //  by BestEffortReplay, as BestEffortReplay has to be
-  //  one to one. IdGraph will further map them together.
-  TORCH_INTERNAL_ASSERT(
-      swizzle_x_it1 == c2p.end() && swizzle_y_it1 == c2p.end());
-
-  // Mapping for swizzle outputs should be mapped and should
-  //  also map to the corresponding axes on the unswizzled tensor.
-  TORCH_INTERNAL_ASSERT(
-      swizzle_x_it2 != c2p.end() && swizzle_y_it2 != c2p.end());
-  TORCH_INTERNAL_ASSERT(
-      swizzle_x_it2->second == tv1->axis(-2) &&
-      swizzle_y_it2->second == tv1->axis(-1));
-
-  // Check id graph behavior
-  //----------------------------------------------------------
-  ComputeAtMap ca_map(&fusion);
-  // Corresponding inputs and outputs of swizzle ops are
-  //  map through by exact and permissive map.
-  TORCH_INTERNAL_ASSERT(
-      ca_map.areMapped(tv1->axis(-2), swizzle_op->inX(), IdMappingMode::EXACT));
-  TORCH_INTERNAL_ASSERT(
-      ca_map.areMapped(tv1->axis(-1), swizzle_op->inY(), IdMappingMode::EXACT));
-  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
-      tv1->axis(-2), swizzle_op->outX(), IdMappingMode::EXACT));
-  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
-      tv1->axis(-1), swizzle_op->outY(), IdMappingMode::EXACT));
-
-  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
-      tv1->axis(-2), swizzle_op->inX(), IdMappingMode::PERMISSIVE));
-  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
-      tv1->axis(-1), swizzle_op->inY(), IdMappingMode::PERMISSIVE));
-  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
-      tv1->axis(-2), swizzle_op->outX(), IdMappingMode::PERMISSIVE));
-  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
-      tv1->axis(-1), swizzle_op->outY(), IdMappingMode::PERMISSIVE));
-}
-
-// Test a basic loop swizzle pattern
-TEST_F(NVFuserTest, FusionLoopSwizzle0_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 32});
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv2);
-
-  tv2->split(-1, 16);
-  tv2->split(-1, 4);
-  //[O, 4, 4]
-
-  tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);
-
-  tv0->computeAt(tv2, -1);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({2, 32}, options);
-  auto t2 = t0 + 2.0;
-  auto cg_outputs = fe.runFusion({t0});
-
-  testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
-}
-
-// Outer block zshape pattern
-TEST_F(NVFuserTest, FusionLoopSwizzle1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv2);
-
-  tv2->split(-2, 8);
-  tv2->split(-1, 4);
-  //[I0o, I0i, I1o, I1i]
-  tv2->reorder({{1, 2}, {2, 1}});
-  //[I0o, I1o, I0i, I1i]
-
-  tv2->swizzle(Swizzle2DType::ZShape, 0, 1, SwizzleMode::Loop);
-  tv0->computeAt(tv2, -1);
-
-  tv2->axis(0)->parallelize(ParallelType::BIDx);
-  tv2->axis(1)->parallelize(ParallelType::BIDy);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({45, 77}, options);
-  auto t2 = t0 + 2.0;
-  auto cg_outputs = fe.runFusion({t0});
-
-  testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
-}
-
-// Test assertion in unsupported pattern: non-leaf loop swizzle.
-TEST_F(NVFuserTest, FusionLoopSwizzleCheck0_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 32});
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv2);
-
-  tv2->split(-1, 16);
-  tv2->split(-1, 4);
-  //[O, 4, 4]
-
-  // Swizzle the inner tile.
-  tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);
-
-  // Make swizzle output not a leaf domain.
-  tv2->merge(-2);
-
-  tv0->computeAt(tv2, -1);
-
-  FusionExecutor fe;
-  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
-}
-
-// Test assertion in unsupported pattern: half-inlined loop swizzle.
-TEST_F(NVFuserTest, FusionLoopSwizzleCheck1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 32});
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
-  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
-
-  fusion.addOutput(tv3);
-
-  //[O, 4, 4]
-  tv2->split(-1, 16);
-  tv2->split(-1, 4);
-
-  //[O, 4, 4]
-  tv3->split(-1, 16);
-  tv3->split(-1, 4);
-
-  // Swizzle inner tile of tv2
-  tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);
-
-  // Make tv2 swizzled and half-inlined (unsupported).
-  tv0->computeAt(tv3, -2);
-
-  FusionExecutor fe;
-  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
-}
-
-TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> shape({10, 11});
-
-  auto tv0 = makeConcreteTensor(shape);
-  fusion.addInput(tv0);
-
-  // [I, R]
-  auto tv1 = sum(tv0, {1});
-  // [I, B]
-  auto tv2 = unsqueeze(tv1, -1);
-  fusion.addOutput(tv2);
-
-  TORCH_CHECK(
-      tv2->nDims() == 2, "Unpected unsqueeze result: ", tv2->toString());
-  TORCH_CHECK(
-      tv2->axis(1)->isBroadcast(),
-      "Unexpected unsqueeze result: ",
-      tv2->toString());
-
-  // tv1 has only one non-reduction axis. An exception should be
-  // thrown.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(unsqueeze(tv1, 2));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({10, 11}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto ref = t0.sum(1).unsqueeze(-1);
-
-  testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSqueeze1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  std::vector<int64_t> shape({10, 11});
-
-  auto tv0 = makeConcreteTensor(shape);
-  fusion.addInput(tv0);
-
-  // [I, B]
-  auto tv1 = sum(tv0, {1}, true);
-  // [I]
-  auto tv2 = squeeze(tv1, {shape[0], 1});
-  fusion.addOutput(tv2);
-
-  TORCH_CHECK(
-      tv2->nDims() == 2, "Unexpected squeeze result: ", tv2->toString());
-
-  // [I, R]
-  auto tv3 = sum(tv0, {1});
-  // tv3 has only one non-reduction axis. The extent of the first axis
-  // is not one, so squeeze should fail.
-  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
-  ASSERT_ANY_THROW(squeeze(tv3, {shape[0], 1}));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({10, 11}, options);
-  std::vector<IValue> aten_inputs = {t0};
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs);
-  auto cg_outputs = fe.runFusion(aten_inputs);
-
-  auto ref = t0.sum(1, true).squeeze(-1);
-
-  testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionContigPredicate_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  auto tv2 = broadcast(tv1, {false, true, false});
-  fusion.addOutput(tv2);
-
-  tv2->merge(-2, -1);
-  tv2->merge(-2, -1);
-  tv2->split(-1, 100);
-  tv0->computeAt(tv2, -1);
-
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(PredicatedChecker::isPredicated(tv1, gpulw));
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({3, 4}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  auto ref = t0.unsqueeze(1);
-
-  testValidate(fe.kernel(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Repro of https://github.com/csarofeen/pytorch/issues/1777
-TEST_F(NVFuserTest, FusionDivScalarLhs_CUDA) {
-  // tv1 = 2.0 / tv0
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-  TensorView* tv1 = div(IrBuilder::create<Double>(2.0), tv0);
-  fusion.addOutput(tv1);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({3, 3}, options);
-  // There's no overload div(Scalar, Tensor) in ATen
-  auto aten_output = at::div(
-      at::native::wrapped_scalar_tensor(at::Scalar(2.0), options.device()), t0);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-
-  testValidate(&fusion, cg_outputs, {t0}, {aten_output}, __LINE__, __FILE__);
-}
-
-// Repro of an issue of the reduction scheduler with a broadcast
-// domain concretized to multiple domains that are not proven to have
-// the same extent
-TEST_F(NVFuserTest, FusionRepro1713_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(2);
-  auto tv1 = makeSymbolicTensor(2);
-  auto tv2 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  fusion->addInput(tv2);
-  auto tv3 = broadcast(tv2, {false, true});
-
-  auto tv4 = add(tv3, tv0);
-
-  auto tv5 = add(tv3, tv1);
-  auto tv6 = sum(tv5, {0});
-  fusion->addOutput(tv4);
-  fusion->addOutput(tv6);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({1024, 204800}, options);
-  // Original repro had the same shape as t0, but this should work
-  // with a different extent at the second axis
-  at::Tensor t1 = at::randn({1024, 123}, options);
-  at::Tensor t2 = at::randn({1024}, options);
-  std::vector<IValue> aten_inputs({t0, t1, t2});
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
-
-  auto t3 = t2.unsqueeze(-1);
-  auto t4 = t3 + t0;
-  auto t5 = t3 + t1;
-  auto t6 = sum(t5, {0});
-
-  testValidate(
-      executor_cache.fusion(),
-      cg_outputs,
-      {t0, t1, t2},
-      {t4, t6},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionExpand_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto w = 2, x = 3, y = 4, z = 5;
-
-  // Test
-  // a simple expand
-  // Expand that's propagated
-  // expand_as
-  // symbolic expand
-
-  // x
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {false, true});
-  auto tv2 = expand(tv1, {tv0->axis(0)->extent(), IrBuilder::create<Int>(y)});
-
-  // x
-  auto tv3 = makeSymbolicTensor(1);
-  fusion->addInput(tv3);
-  auto tv4 = broadcast(tv3, {false, true});
-  auto tv5 = add(tv4, tv2);
-  // [x, e_y]
-
-  // [x, y, z]
-  auto tv6 = makeSymbolicTensor(3);
-  fusion->addInput(tv6);
-
-  // Disjoint set op will cause a segmentation for just this op.
-  auto tmp_7 = set(tv6);
-  fusion->addOutput(tmp_7);
-
-  auto tv7 = broadcast(tv5, {false, false, true});
-
-  auto tv8 = expand_as(tv7, tv6);
-  // [x, e_y, e_z]
-
-  auto w_symbolic = IrBuilder::create<Int>();
-  fusion->addInput(w_symbolic);
-
-  auto tv9 = broadcast(tv8, {true, false, false, false});
-  //[1, x, e_y, e_z]
-
-  auto tv10 = expand(
-      tv9,
-      {w_symbolic,
-       tv9->axis(1)->extent(),
-       tv9->axis(2)->expandedExtent(),
-       tv9->axis(3)->expandedExtent()});
-
-  fusion->addOutput(tv10);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({x}, options);
-  at::Tensor t3 = at::randn({x}, options);
-  at::Tensor t6 = at::randn({x, y, z}, options);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t6, w});
-  auto cg_out = cg_outputs[1];
-
-  TORCH_INTERNAL_ASSERT(cg_out.size(0) == w);
-  TORCH_INTERNAL_ASSERT(cg_out.size(1) == x);
-  TORCH_INTERNAL_ASSERT(cg_out.size(2) == y);
-  TORCH_INTERNAL_ASSERT(cg_out.size(3) == z);
-  TORCH_INTERNAL_ASSERT(cg_out.stride(0) == 0);
-  TORCH_INTERNAL_ASSERT(cg_out.stride(1) == 1);
-  TORCH_INTERNAL_ASSERT(cg_out.stride(2) == 0);
-  TORCH_INTERNAL_ASSERT(cg_out.stride(3) == 0);
-
-  auto t10 = t0.unsqueeze(-1)
-                 .expand({x, y})
-                 .add(t3.unsqueeze(-1))
-                 .unsqueeze(-1)
-                 .expand_as(t6)
-                 .unsqueeze(0)
-                 .expand({w, x, y, z});
-
-  testValidate(
-      executor_cache.fusion(),
-      cg_outputs,
-      {t0, t3, t6, w},
-      {t6, t10},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionExpandIssue1751_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto x = 3, y = 4, z = 5;
-
-  // y, z
-  auto tv0 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {true, false, false});
-
-  // Two ways to propagate extents as is: use -1 or explicitly pass
-  // the extent vals.
-
-  auto tv2 = expand(
-      tv1,
-      {IrBuilder::create<Int>(x),
-       IrBuilder::create<Int>(-1),
-       IrBuilder::create<Int>(-1)});
-
-  auto tv3 = expand(
-      tv1,
-      {IrBuilder::create<Int>(x),
-       tv0->axis(0)->extent(),
-       tv0->axis(1)->extent()});
-
-  fusion->addOutput(tv2);
-  fusion->addOutput(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({y, z}, options);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
-
-  for (const auto& cg_out : cg_outputs) {
-    TORCH_INTERNAL_ASSERT(cg_out.size(0) == x);
-    TORCH_INTERNAL_ASSERT(cg_out.size(1) == y);
-    TORCH_INTERNAL_ASSERT(cg_out.size(2) == z);
-  }
-
-  auto t2 = t0.expand({x, y, z});
-
-  testValidate(
-      executor_cache.fusion(), cg_outputs, {t0}, {t2, t2}, __LINE__, __FILE__);
-}
-
-// TODO: Make sure the kernel uses the expanded concrete size instead
-// of the symbolic size
-TEST_F(NVFuserTest, FusionExpandToConcrete_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto x = 3, y = 4;
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-
-  auto tv1 = broadcast(tv0, {true, false});
-
-  auto tv2 =
-      expand(tv1, {IrBuilder::create<Int>(x), IrBuilder::create<Int>(y)});
-
-  fusion->addOutput(tv2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({y}, options);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
-
-  for (const auto& cg_out : cg_outputs) {
-    TORCH_INTERNAL_ASSERT(cg_out.size(0) == x);
-    TORCH_INTERNAL_ASSERT(cg_out.size(1) == y);
-  }
-
-  auto t2 = t0.expand({x, y});
-
-  testValidate(
-      executor_cache.fusion(), cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionReproNoncontigBroadcast_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({4, 32, 16, 112, 112}, options).transpose(-1, -2);
-  at::Tensor t1 = at::randn({32, 1, 112, 1}, options).transpose(-1, -2);
-
-  auto tv0 = TensorViewBuilder()
-                 .ndims(5)
-                 .contiguity({true, true, false, false, false}) // ttfff
-                 .shape({-1, -1, -1, -1, -1})
-                 .dtype(DataType::Half)
-                 .build();
-  auto tv1 = TensorViewBuilder()
-                 .ndims(4)
-                 .contiguity({true, false, false, true}) // tfft
-                 .shape({-1, 1, 1, -1})
-                 .dtype(DataType::Half)
-                 .build();
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-
-  fusion->addOutput(tv2);
-
-  std::vector<IValue> aten_inputs({t0, t1});
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
-
-  auto t2 = t0 + t1;
-
-  testValidate(
-      executor_cache.fusion(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
-}
-
-namespace {
-
-// check that the resulting sibling are identical
-void checkSiblingConsistency(TensorView* replay, TensorView* target) {
-  auto replay_root = replay->getRootDomain();
-  auto replay_dom = replay->domain()->domain();
-  auto target_root = target->getRootDomain();
-  auto target_dom = target->domain()->domain();
-  std::unordered_map<IterDomain*, IterDomain*> target2replay_map;
-  TORCH_CHECK(replay_root.size() == target_root.size());
-  target2replay_map.reserve(replay_root.size());
-  std::transform(
-      target_root.begin(),
-      target_root.end(),
-      replay_root.begin(),
-      std::inserter(target2replay_map, target2replay_map.begin()),
-      [](auto a, auto b) { return std::make_pair(a, b); });
-  BestEffortReplay replay_(replay_dom, target_dom, target2replay_map);
-  auto r = replay_.getReplay();
-  for (int64_t i = 0; i < replay_dom.size(); i++) {
-    auto target_id = target_dom[i];
-    auto replay_it = r.find(target_id);
-    TORCH_CHECK(replay_it != r.end());
-    TORCH_CHECK(
-        replay_it->second == replay_dom[i],
-        "IterDomain mismatch when checking ",
-        replay,
-        " and ",
-        target,
-        " at ",
-        i,
-        ", got ",
-        replay_it->second,
-        " and ",
-        replay_dom[i]);
-  }
-};
-
-} // namespace
-
-TEST_F(NVFuserTest, FusionTransformPropagateSibling_CUDA) {
-  // https://github.com/csarofeen/pytorch/issues/1760
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tvs = Welford(tv0, {1});
-  fusion.addOutput(tvs.var_sum);
-
-  tvs.avg->split(1, 1);
-  tvs.avg->split(1, 2);
-  tvs.avg->split(1, 3);
-  tvs.var_sum->split(1, 1);
-  tvs.var_sum->split(1, 2);
-  tvs.var_sum->split(1, 3);
-  tvs.n->split(1, 1);
-  tvs.n->split(1, 2);
-  tvs.n->split(1, 3);
-
-  auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {1, 4});
-
-  TransformPropagatorWithCheck propagator(var_sum_rf);
-  MaxRootDomainInfoSpanningTree(var_sum_rf).traverse(&propagator);
-
-  auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum);
-
-  std::vector<TensorView*> siblings[] = {{tvs.avg, tvs.var_sum, tvs.n}, rf_tvs};
-  for (auto tensors : siblings) {
-    for (auto t1 : tensors) {
-      for (auto t2 : tensors) {
-        TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2));
-      }
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionTransformPropagateSelectorSibling_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tvs = Welford(tv0, {1});
-  fusion.addOutput(tvs.var_sum);
-
-  tvs.avg->split(1, 1);
-  tvs.avg->split(1, 2);
-  tvs.avg->split(1, 3);
-  tvs.var_sum->split(1, 1);
-  tvs.var_sum->split(1, 2);
-  tvs.var_sum->split(1, 3);
-  tvs.n->split(1, 1);
-  tvs.n->split(1, 2);
-  tvs.n->split(1, 3);
-
-  auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {1, 4});
-
-  struct DisableTv0 : public MaxInfoSpanningTree::Selector {
-    TensorView* tv0;
-    virtual bool allowC2P(TensorView* from, TensorView* to) override {
-      return from != tv0 && to != tv0;
-    };
-    virtual bool allowP2C(TensorView* from, TensorView* to) override {
-      return from != tv0 && to != tv0;
-    };
-    virtual bool allowSibling(TensorView* from, TensorView* to) override {
-      return true;
-    }
-    DisableTv0(TensorView* tv0) : tv0(tv0) {}
-  } selector1(tv0);
-
-  struct DisableTv0AndSibling : public DisableTv0 {
-    virtual bool allowSibling(TensorView* from, TensorView* to) override {
-      return false;
-    }
-    using DisableTv0::DisableTv0;
-  } selector2(tv0);
-
-  TransformPropagatorWithCheck propagator(var_sum_rf);
-  MaxRootDomainInfoSpanningTree good_path(var_sum_rf, &selector1);
-  MaxRootDomainInfoSpanningTree bad_path(var_sum_rf, &selector2);
-
-  auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum);
-
-  auto check = [&]() {
-    std::vector<TensorView*> siblings[] = {
-        {tvs.avg, tvs.var_sum, tvs.n}, rf_tvs};
-    for (auto tensors : siblings) {
-      for (auto t1 : tensors) {
-        for (auto t2 : tensors) {
-          TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2));
-        }
-      }
-    }
-  };
-
-  bad_path.traverse(&propagator);
-  ASSERT_ANY_THROW(check());
-  good_path.traverse(&propagator);
-  check();
-}
-
-TEST_F(NVFuserTest, FusionTransformPropagatePosition_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(4);
-  auto tv1 = makeSymbolicTensor(6);
-  fusion.addInput(tv0);
-
-  auto tv2 = broadcast(tv0, {false, false, true, false, false, true});
-  auto tv3 = add(tv1, tv2);
-  fusion.addOutput(tv3);
-
-  tv0->merge(2);
-  tv0->merge(0);
-  TransformPropagatorWithCheck propagator(tv0);
-  MaxRootDomainInfoSpanningTree(tv0).traverse(&propagator);
-
-  TORCH_CHECK(tv1->nDims() == 4);
-}
-
-TEST_F(NVFuserTest, FusionIgnoreZeroDimReduction_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-  auto tv1 = sum(tv0, {0});
-  // tv1 is effectively a zero-dim tensor as it only has a reduction
-  // axis.
-  // Reducing it further is converted to just a set op.
-  auto tv2 = sum(tv1, {0});
-  fusion->addOutput(tv2);
-
-  auto tv2_def = dynamic_cast<UnaryOp*>(tv2->definition());
-  TORCH_CHECK(
-      tv2_def != nullptr,
-      "Expected UnaryOp but found ",
-      tv2->definition()->toString());
-
-  TORCH_CHECK(
-      tv2_def->getUnaryOpType() == UnaryOpType::Set,
-      "Expected UnaryOpType::Set but found ",
-      tv2_def->getUnaryOpType());
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  auto t0 = at::randn({12345}, options);
-  std::vector<IValue> aten_inputs({t0});
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
-
-  auto ref = sum(t0, {0});
-
-  testValidate(
-      executor_cache.fusion(),
-      cg_outputs,
-      aten_inputs,
-      {ref},
-      __LINE__,
-      __FILE__);
-}
-
-// Repro of issue #1770
-TEST_F(NVFuserTest, FusionIssue1770Repro_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-  auto tv1 = makeSymbolicTensor(1);
-  fusion->addInput(tv1);
-
-  auto tv2 = ge(tv0, tv1);
-  auto tv3 =
-      where(tv2, IrBuilder::create<Double>(1), IrBuilder::create<Double>(2));
-  fusion->addOutput(tv3);
-
-  std::vector<int64_t> shape({999});
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn(shape, options);
-  at::Tensor t1 = at::randn(shape, options);
-  std::vector<IValue> aten_inputs({t0, t1});
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
-
-  auto ref = where(t0 >= t1, 1.0, 2.0);
-
-  testValidate(
-      executor_cache.fusion(),
-      cg_outputs,
-      aten_inputs,
-      {ref},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionTransformPropagatorSelector_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-  auto tv1 = makeSymbolicTensor(1);
-  fusion->addInput(tv1);
-
-  auto tv2 = add(tv0, tv1);
-
-  auto tv3 = sin(tv2);
-  auto tv4 = cos(tv2);
-
-  fusion->addOutput(tv3);
-  fusion->addOutput(tv4);
-
-  tv2->split(0, 10);
-
-  struct Selector : public MaxInfoSpanningTree::Selector {
-    TensorView* tv0;
-    TensorView* tv3;
-    virtual bool allowC2P(TensorView* from, TensorView* to) override {
-      return to == tv0;
-    }
-    virtual bool allowP2C(TensorView* from, TensorView* to) override {
-      return to == tv3;
-    }
-    virtual bool allowSibling(TensorView* from, TensorView* to) override {
-      return false;
-    }
-    Selector(TensorView* tv0, TensorView* tv3) : tv0(tv0), tv3(tv3) {}
-  } selector(tv0, tv3);
-
-  TransformPropagatorWithCheck propagator(tv2);
-  MaxRootDomainInfoSpanningTree(tv2, &selector).traverse(&propagator);
-
-  TORCH_CHECK(tv0->nDims() == 2);
-  TORCH_CHECK(tv1->nDims() == 1);
-  TORCH_CHECK(tv2->nDims() == 2);
-  TORCH_CHECK(tv3->nDims() == 2);
-  TORCH_CHECK(tv4->nDims() == 1);
-}
-
-TEST_F(NVFuserTest, FusionTransformPropagatorPos_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeConcreteTensor({22, 105});
-  fusion->addInput(tv0);
-
-  auto tv1 = sin(tv0);
-  fusion->addOutput(tv1);
-
-  tv1->split(0, 2);
-  tv1->split(-1, 3);
-  tv1->split(-1, 5);
-
-  TransformPropagatorWithCheck propagator(tv1, 2);
-  MaxRootDomainInfoSpanningTree(tv1, 2).traverse(&propagator);
-
-  auto expect = makeConcreteTensor({22, 105});
-  expect->split(0, 2);
-  TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv0));
-}
-
-TEST_F(NVFuserTest, FusionMaxRootDomainInfoSpanningTreePrintTwice_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(3);
-  fusion->addInput(tv0);
-
-  auto tv1 = sum(tv0, {0});
-  auto tv2 = neg(tv1);
-
-  fusion->addOutput(tv2);
-
-  tv1->split(0, 10);
-
-  struct Printer : public MaxInfoSpanningTree::Propagator {
-    std::stringstream ss;
-    virtual void propagateC2P(TensorView* from, TensorView* to) override {
-      ss << "propagateC2P" << std::endl;
-      ss << "from: " << from->name() << std::endl;
-      ss << "to: " << to->name() << std::endl;
-    }
-    virtual void propagateP2C(TensorView* from, TensorView* to) override {
-      ss << "propagateP2C" << std::endl;
-      ss << "from: " << from->name() << std::endl;
-      ss << "to: " << to->name() << std::endl;
-    }
-    virtual void propagateSibling(TensorView* from, TensorView* to) override {
-      ss << "propagateSibling" << std::endl;
-      ss << "from: " << from->name() << std::endl;
-      ss << "to: " << to->name() << std::endl;
-    }
-  } printer1, printer2;
-  printer1.ss << std::endl;
-  printer2.ss << std::endl;
-
-  MaxRootDomainInfoSpanningTree path(tv1);
-  path.traverse(&printer1);
-  path.traverse(&printer2);
-
-  auto expect = R"ESCAPE(
-propagateC2P
-from: 1
-to: 0
-propagateP2C
-from: 1
-to: 2
-)ESCAPE";
-  TORCH_CHECK(printer1.ss.str() == expect);
-  TORCH_CHECK(printer2.ss.str() == expect);
-}
-
-TEST_F(NVFuserTest, FusionTransformPropagatorNoOverwrite_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeSymbolicTensor(1);
-  fusion->addInput(tv0);
-  auto tv1 = broadcast(tv0, {true, false, true});
-  auto tv2 = sin(tv1);
-  fusion->addOutput(tv2);
-
-  tv0->split(0, 2);
-  tv2->split(1, 2);
-  tv2->split(0, 4);
-
-  MaxRootDomainInfoSpanningTree path1(tv2);
-  TransformPropagatorWithCheck propagator1(tv2);
-  path1.traverse(&propagator1);
-
-  MaxRootDomainInfoSpanningTree path2(tv0);
-  TransformPropagatorWithCheck propagator2(tv0);
-  path2.traverse(&propagator2);
-
-  TORCH_CHECK(tv1->axis(0)->isBroadcast());
-  TORCH_CHECK(tv1->axis(1)->isBroadcast());
-  TORCH_CHECK(!tv1->axis(2)->isBroadcast());
-  TORCH_CHECK(!tv1->axis(3)->isBroadcast());
-  TORCH_CHECK(tv1->axis(4)->isBroadcast());
-
-  auto expect = makeSymbolicTensor(3);
-  expect->split(1, 2);
-  expect->split(0, 4);
-  TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv1));
-}
-
-TEST_F(NVFuserTest, FusionIssue1785Repro_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  // Set up your input tensor views
-  TensorView* tv0 = makeContigTensor(1);
-  TensorView* tv1 = makeContigTensor(2);
-
-  // Register your inputs
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-
-  auto tv2 = set(tv0);
-  // [B, I]
-  auto tv3 = broadcast(tv2, {true, false});
-  auto tv4 = add(tv3, tv1);
-  auto tv5 = set(tv4);
-
-  // Register your outputs
-  fusion.addOutput(tv5);
-
-  tv5->split(0, 8);
-  tv5->split(-1, 8);
-
-  // [Serial, TIDy, TIDX, Serial]
-
-  tv4->computeAt(tv5, -2);
-  tv3->computeAt(tv4, -1);
-  tv2->computeAt(tv3, 0);
-  tv2->split(0, 8);
-  tv2->axis(0)->parallelize(ParallelType::TIDx);
-  tv1->computeAt(tv5, -2);
-
-  tv5->axis(1)->parallelize(ParallelType::TIDy);
-  tv5->axis(2)->parallelize(ParallelType::TIDx);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor in1 = at::randn({16}, options);
-  at::Tensor in2 = at::randn({12, 16}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {in1, in2});
-  auto cg_outputs = fe.runFusion({in1, in2});
-
-  auto tv_ref = in1 + in2;
-
-  testValidate(&fusion, cg_outputs, {in1, in2}, {tv_ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionSkipReplay_CUDA) {
-  {
-    Fusion fusion;
-    FusionGuard fg(&fusion);
-
-    TensorView* tv0 = makeContigTensor(1);
-    TensorView* tv1 = makeContigTensor(2);
-    fusion.addInput(tv0);
-    fusion.addInput(tv1);
-
-    auto tv2 = broadcast(tv0, {false, true});
-    auto tv3 = add(tv2, tv1);
-    fusion.addOutput(tv3);
-
-    tv3->split(1, 2, false);
-
-    TransformPropagatorWithCheck propagator(tv3);
-    MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-  }
-
-  {
-    Fusion fusion;
-    FusionGuard fg(&fusion);
-
-    TensorView* tv0 = makeContigTensor(3);
-    fusion.addInput(tv0);
-
-    auto tv1 = sum(tv0, {0, 2});
-    auto tv2 = sin(tv1);
-    fusion.addOutput(tv2);
-
-    tv0->split(1, 2, false);
-
-    TransformPropagatorWithCheck propagator(tv0);
-    MaxRootDomainInfoSpanningTree(tv0).traverse(&propagator);
-  }
-}
-
-TEST_F(NVFuserTest, FusionInlineRepro1803_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeContigTensor(2);
-
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  auto tvs = Welford(tv1, {1});
-  auto tvo = set(tvs.var_sum);
-  fusion.addOutput(tvo);
-
-  tvo->split(0, 16);
-  tvo->axis(1)->parallelize(ParallelType::Unroll);
-
-  tv0->computeAt(tvo, -1, ComputeAtMode::BestEffort);
-
-  TORCH_CHECK(
-      tvs.var_sum->getComputeAtPosition() == tvs.avg->getComputeAtPosition());
-  TORCH_CHECK(
-      tvs.var_sum->getComputeAtPosition() == tvs.n->getComputeAtPosition());
-  TORCH_CHECK(tvs.var_sum->getComputeAtPosition() == 1);
-}
-
-// Unit test for the transform selection logic
-TEST_F(NVFuserTest, FusionBoundedDirectionSelection1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  TensorView* tv0 = makeContigTensor(2);
-
-  fusion.addInput(tv0);
-  auto tv1 = set(tv0);
-  auto tv2 = set(tv1);
-  auto tv3 = add(tv2, tv1);
-  fusion.addOutput(tv3);
-
-  tv3->split(-1, 5);
-  tv3->split(-1, 8);
-
-  scheduler_utils::BoundedDirectionalTransformPropagator::backward(
-      tv3, -1, {tv0, tv2});
-
-  // Check that the splits are replayed on tv1, even though tv2
-  //  is part of the boundary.
-  TORCH_INTERNAL_ASSERT(
-      tv2->nDims() == 4, "Propagator didn't propagate to tv2");
-}
-
-TEST_F(NVFuserTest, FusionIssueRepro1844_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  std::vector<int64_t> shape = {2, 1, 768};
-  std::vector<int64_t> sum_to_shape = {768};
-  std::vector<int64_t> sum_to_axes = {0, 1};
-  double kProb = 0.5;
-
-  std::vector<Int*> sum_to_symb;
-  std::transform(
-      sum_to_shape.begin(),
-      sum_to_shape.end(),
-      std::back_inserter(sum_to_symb),
-      [](int s) -> Int* { return IrBuilder::create<Int>(s); });
-
-  TensorView* tv0 = makeContigConcreteTensor(shape);
-  TensorView* tv1 = makeContigConcreteTensor(shape);
-  TensorView* tv2 = makeContigConcreteTensor(shape, DataType::Bool);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  fusion->addInput(tv2);
-
-  Double* prob = IrBuilder::create<Double>(kProb);
-  auto grad_input = dropout_backward(tv1, tv2, prob);
-  auto grad_gelu = gelu_backward(grad_input, tv0);
-  auto grad_bias = sum_to(grad_gelu, sum_to_symb);
-
-  fusion->addOutput(grad_gelu);
-  fusion->addOutput(grad_bias);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  const auto mask_options =
-      at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0);
-  at::manual_seed(0);
-
-  at::Tensor a = at::randn(shape, options);
-  at::Tensor b = at::randn(shape, options);
-  at::Tensor c = at::randn(shape, options);
-  auto mask = at::gt(c, 0.0f);
-  std::vector<IValue> aten_inputs = {a, b, mask};
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
-
-  auto dinput = at::native_dropout_backward(b, mask, kProb);
-  auto dgelu = at::gelu_backward(dinput, a, "none");
-  auto dbias = dgelu.sum(sum_to_axes);
-
-  testValidate(
-      executor_cache.fusion(),
-      cg_outputs,
-      aten_inputs,
-      {dgelu, dbias},
-      __LINE__,
-      __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionInsertMagicZero1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
-  auto tv2 = set(tv1);
-  fusion.addOutput(tv2);
-
-  tv2->split(0, 32);
-  tv2->split(-1, 2);
-  tv2->reorder({{1, 2}, {2, 1}});
-  tv2->merge(0);
-
-  TransformPropagatorWithCheck propagator(tv2);
-  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
-
-  tv0->computeAt(tv2, 1);
-
-  // The predicate of tv2 should be protected with magic zero
-  GpuLower gpulw(&fusion);
-  TORCH_CHECK(
-      PredicateMagicZeroChecker::isProtected(tv2, gpulw),
-      "Failed to protect the predicates of ",
-      tv2->toString());
-}
-
-TEST_F(NVFuserTest, FusionRepro1860_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr;
-  FusionGuard fg(&fusion);
-  std::vector<bool> contiguity{true, false, false};
-
-  std::vector<int64_t> shape{1, -1, -1};
-  TensorView* tv0 = makeContigConcreteTensor(shape);
-  fusion.addInput(tv0);
-  TensorView* tv1 = makeContigConcreteTensor(shape);
-  fusion.addInput(tv1);
-  TensorView* tv2 = makeContigConcreteTensor(shape);
-  fusion.addInput(tv2);
-
-  std::vector<IterDomain*> domain1(3, nullptr);
-  for (const auto i : c10::irange(3)) {
-    if (i == 0) {
-      domain1[i] =
-          IterDomainBuilder(
-              FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1))
-              .iter_type(IterType::Broadcast)
-              .build();
-    } else {
-      domain1[i] =
-          IterDomainBuilder(
-              FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1))
-              .expanded_extent(IrBuilder::create<Int>(1 + i))
-              .iter_type(IterType::Broadcast)
-              .build();
-    }
-  }
-
-  TensorView* tv22 = IrBuilder::create<TensorView>(
-      IrBuilder::create<TensorDomain>(domain1, contiguity), DataType::Float);
-
-  fusion.addInput(tv22);
-
-  auto tv3 = add(tv0, tv1);
-  auto tv4 = softmax(tv3, 0);
-  auto tv5 = add(tv4, tv22);
-  fusion.addOutput(tv5);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor input1 = at::randn({1, 2, 3}, options);
-  at::Tensor input2 = at::randn({1, 2, 3}, options);
-  at::Tensor input3 = at::randn({1, 2, 3}, options);
-  at::Tensor input4 = at::randn({1, 1, 1}, options).expand({1, 2, 3});
-  std::vector<IValue> aten_inputs = {input1, input2, input3, input4};
-
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
-  auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
-}
-
-TEST_F(NVFuserTest, FusionExpandReduce_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeConcreteTensor({1, 8});
-  fusion->addInput(tv0);
-
-  auto tv1 =
-      expand(tv0, {IrBuilder::create<Int>(12), IrBuilder::create<Int>(8)});
-
-  auto tv2 = sum(tv1, {0});
-  fusion->addOutput(tv2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({1, 8}, options);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
-
-  auto ref = t0.expand({12, 8}).sum({0});
-
-  testValidate(
-      executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
-}
-
-// Predicate elimination issue repro:
-TEST_F(NVFuserTest, FusionExpandReduce2_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeConcreteTensor({1, 4});
-  fusion->addInput(tv0);
-
-  auto tv1 =
-      expand(tv0, {IrBuilder::create<Int>(3), IrBuilder::create<Int>(4)});
-
-  auto tv2 = sum(tv1, {0});
-  fusion->addOutput(tv2);
-
-  // tv2[r{3}, i{4}]
-  tv2->split(0, NamedScalar::getParallelDim(ParallelType::TIDy));
-  tv2->axis(1)->parallelize(ParallelType::TIDy);
-  tv2->split(0, NamedScalar::getParallelDim(ParallelType::BIDy), false);
-  tv2->axis(0)->parallelize(ParallelType::BIDy);
-  tv2->split(-1, NamedScalar::getParallelDim(ParallelType::TIDx));
-  tv2->axis(-1)->parallelize(ParallelType::TIDx);
-  tv2->axis(-2)->parallelize(ParallelType::BIDx);
-  // [rBIDy, rO, rTIDy, iBIDx, iTIDx]
-  tv2->reorder({{-2, 0}, {-1, 1}, {2, 2}});
-  // [iBIDx, iTIDx, rTIDy, rBIDy, rO]
-  auto tv3 = tv2->rFactor({-1});
-
-  TransformPropagatorWithCheck propagator(tv3);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
-  scheduler_utils::parallelizeAllLike(tv3);
-  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::manual_seed(0);
-  auto t0 = at::randn({1, 4}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion.get(), {t0}, LaunchParams(-1, 2, -1, 4, 2, 1));
-  auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, 2, -1, 4, 2, 1));
-
-  auto ref = t0.expand({3, 4}).sum({0});
-
-  testValidate(
-      fusion.get(),
-      cg_outputs,
-      {t0},
-      {ref},
-      __LINE__,
-      __FILE__,
-      "",
-      LaunchParams(-1, 2, -1, 4, 2, 1));
-}
-
-TEST_F(NVFuserTest, FusionExpandBadShapeTest_CUDA) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  Fusion& fusion = *fusion_ptr;
-  FusionGuard fg(&fusion);
-  std::vector<bool> contiguity{false, false};
-
-  auto tv0 = makeSymbolicTensor(2);
-  fusion.addInput(tv0);
-
-  std::vector<IterDomain*> domains = {
-      IterDomainBuilder(
-          FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>())
-          .build(),
-      IterDomainBuilder(
-          FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1))
-          .expanded_extent(IrBuilder::create<Int>(10))
-          .iter_type(IterType::Broadcast)
-          .build()};
-
-  // expand to 10
-  TensorView* tv22 = IrBuilder::create<TensorView>(
-      IrBuilder::create<TensorDomain>(domains, contiguity), DataType::Float);
-
-  fusion.addInput(tv22);
-
-  auto tv3 = add(tv0, tv22);
-  fusion.addOutput(tv3);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  // Incompatible shapes
-  at::Tensor input1 = at::randn({2, 3}, options);
-  // Passing expand size of 5, not 10. Should cause an error
-  at::Tensor input4 = at::randn({2, 1}, options).expand({2, 5});
-
-  std::vector<IValue> aten_inputs = {input1, input4};
-
-  FusionExecutorCache executor_cache(std::move(fusion_ptr));
-  ASSERT_ANY_THROW(executor_cache.runFusionWithInputs(aten_inputs));
-}
-
-TEST_F(
-    NVFuserTest,
-    FusionPointwiseScheduleWithBroadcastAndTrivialReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(3);
-  auto tv1 = makeContigTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  auto tv2 = broadcast(tv0, {false, true, false, true, false, true});
-  auto tv3 = sin(tv2);
-  auto tv4 = add(tv3, tv1);
-  auto tv5 = sum(tv4, {1});
-  fusion.addOutput(tv5);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({100, 100, 10}, options);
-  at::Tensor t1 = at::randn({10, 20}, options);
-
-  auto aten_output = (t0.view({100, 1, 100, 1, 10, 1}).sin() + t1).squeeze(1);
-
-  std::vector<IValue> aten_inputs = {t0, t1};
-
-  auto lparams = schedulePointwise(&fusion, aten_inputs);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, aten_inputs, lparams);
-  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
-
-  testValidate(
-      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionInlinePropagatorMismatchedDims1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 3, 4});
-  fusion.addInput(tv0);
-  auto tv1 = sin(tv0);
-  auto tv2 = cos(tv1);
-  auto tv3 = transpose(tv2, 1, 2);
-  auto tv4 = exp(tv3);
-  auto tv5 = tan(tv4);
-  fusion.addOutput(tv5);
-
-  InlinePropagator inline_propagator(tv5, -1, ComputeAtMode::MostInlined);
-  MaxRootDomainInfoSpanningTree(tv5).traverse(&inline_propagator);
-
-  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv2->getComputeAtPosition() == 1);
-  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({2, 3, 4}, options);
-  auto output = input.sin().cos().transpose(1, 2).exp().tan();
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionInlinePropagatorMismatchedDims2_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 3, 4});
-  fusion.addInput(tv0);
-  auto tv1 = sin(tv0);
-  auto tv2 = cos(tv1);
-  auto tv3 = transpose(tv2, 1, 2);
-  auto tv4 = exp(tv3);
-  auto tv5 = tan(tv4);
-  fusion.addOutput(tv5);
-
-  InlinePropagator inline_propagator(tv5, -1, ComputeAtMode::BestEffort);
-  MaxRootDomainInfoSpanningTree(tv5).traverse(&inline_propagator);
-
-  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv2->getComputeAtPosition() == 1);
-  TORCH_CHECK(tv1->getComputeAtPosition() == 1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({2, 3, 4}, options);
-  auto output = input.sin().cos().transpose(1, 2).exp().tan();
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionInlinePropagatorMismatchedDims3_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 3, 4});
-  fusion.addInput(tv0);
-  auto tv1 = sin(tv0);
-  // broadcasting
-  auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
-  auto tv3 = relu(tv2);
-  // trivial reduction
-  auto tv4 = sum(tv3, {1, 3, 5});
-  auto tv5 = cos(tv4);
-  auto tv6 = transpose(tv5, 1, 2);
-  auto tv7 = exp(tv6);
-  auto tv8 = tan(tv7);
-  fusion.addOutput(tv8);
-
-  for (auto tv : {tv2, tv3, tv4}) {
-    tv->merge(0);
-    tv->merge(1);
-    tv->merge(2);
-  }
-
-  InlinePropagator inline_propagator(tv8, -1, ComputeAtMode::MostInlined);
-  MaxRootDomainInfoSpanningTree(tv8).traverse(&inline_propagator);
-
-  TORCH_CHECK(tv8->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv7->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv6->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv5->getComputeAtPosition() == 1);
-  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv2->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({2, 3, 4}, options);
-  auto output = input.sin().relu().cos().transpose(1, 2).exp().tan();
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionInlinePropagatorMismatchedDims4_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 3, 4});
-  fusion.addInput(tv0);
-  auto tv1 = sin(tv0);
-  auto tv2 = exp(tv1);
-  auto tv3 = relu(tv2);
-  auto tv4 = cos(tv3);
-  auto tv5 = tan(tv4);
-  fusion.addOutput(tv5);
-
-  tv3->merge(1);
-  InlinePropagator inline_propagator(tv0, -1, ComputeAtMode::MostInlined);
-  MaxRootDomainInfoSpanningTree(tv0).traverse(&inline_propagator);
-
-  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv3->getComputeAtPosition() == 1);
-  TORCH_CHECK(tv2->getComputeAtPosition() == 1);
-  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({2, 3, 4}, options);
-  auto output = input.sin().exp().relu().cos().tan();
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionInlinePropagatorBroadcast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 3, 4});
-  fusion.addInput(tv0);
-  auto tv1 = sin(tv0);
-  // broadcasting
-  auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
-  auto tv3 = cos(tv2);
-  auto tv4 = tan(tv3);
-  fusion.addOutput(tv4);
-
-  for (auto tv : {tv2, tv3, tv4}) {
-    tv->merge(0);
-    tv->merge(1);
-    tv->merge(2);
-  }
-
-  InlinePropagator inline_propagator(tv0, -1, ComputeAtMode::MostInlined);
-  MaxRootDomainInfoSpanningTree(tv0).traverse(&inline_propagator);
-
-  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv2->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({2, 3, 4}, options);
-  auto output = input.sin().view({2, 1, 3, 1, 4, 1}).cos().tan();
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionInlinePropagatorBroadcastTrivialReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 3, 4});
-  fusion.addInput(tv0);
-  auto tv1 = sin(tv0);
-  // broadcasting
-  auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
-  auto tv3 = tan(tv2);
-  // trivial reduction
-  auto tv4 = sum(tv3, {1, 3, 5});
-  auto tv5 = cos(tv4);
-  auto tv6 = exp(tv5);
-  fusion.addOutput(tv6);
-
-  for (auto tv : {tv2, tv3, tv4}) {
-    tv->merge(0);
-    tv->merge(1);
-    tv->merge(2);
-  }
-
-  InlinePropagator inline_propagator(tv6, -1, ComputeAtMode::MostInlined);
-  MaxRootDomainInfoSpanningTree(tv6).traverse(&inline_propagator);
-
-  TORCH_CHECK(tv6->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv2->getComputeAtPosition() == 3);
-  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input = at::randn({2, 3, 4}, options);
-  auto output = input.sin().tan().cos().exp();
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input});
-  auto cg_outputs = fe.runFusion({input});
-
-  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayTrivialReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 1, 3, 1, 4, 1});
-  fusion.addInput(tv0);
-  auto tv1 = sum(tv0, {1, 3, 5});
-  auto tv2 = sin(tv1);
-  fusion.addOutput(tv1);
-
-  for (auto tv : {tv0, tv1}) {
-    tv->merge(0);
-    tv->merge(1);
-    tv->merge(2);
-  }
-
-  TORCH_CHECK(
-      TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, 3) == 3);
-  TORCH_CHECK(
-      TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, 3) == 3);
-  TORCH_CHECK(
-      TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, 3) == 3);
-  TORCH_CHECK(
-      TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, 3) == 3);
-}
-
-TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayBroadcast_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 3, 4});
-  fusion.addInput(tv0);
-  auto tv1 = broadcast(tv0, {false, true, false, true, false, true});
-  auto tv2 = sin(tv1);
-  fusion.addOutput(tv2);
-
-  for (auto tv : {tv1, tv2}) {
-    tv->merge(0);
-    tv->merge(1);
-    tv->merge(2);
-  }
-
-  TORCH_CHECK(
-      TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, 3) == 3);
-  TORCH_CHECK(
-      TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, 3) == 3);
-  TORCH_CHECK(
-      TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, 3) == 3);
-  TORCH_CHECK(
-      TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, 3) == 3);
-}
-
-TEST_F(NVFuserTest, FusionIdGraphTrivialReduction_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeConcreteTensor({2, 3, 4});
-  fusion.addInput(tv0);
-  auto tv1 = broadcast(tv0, {false, true, false, true, false, true});
-  auto tv2 = sum(tv1, {1, 3, 5});
-  auto tv3 = sin(tv2);
-  fusion.addOutput(tv3);
-
-  for (auto tv : {tv1, tv2}) {
-    tv->merge(0);
-    tv->merge(1);
-    tv->merge(2);
-  }
-
-  InlinePropagator inline_propagator(tv3, -1, ComputeAtMode::MostInlined);
-  MaxRootDomainInfoSpanningTree(tv3).traverse(&inline_propagator);
-
-  ComputeAtMap ca_map(&fusion);
-
-  auto all_tvs = ir_utils::allTvs(&fusion);
-  for (auto tv1 : all_tvs) {
-    for (auto tv2 : all_tvs) {
-      if (tv1->isFusionInput() || tv2->isFusionInput()) {
-        continue;
-      }
-      for (int i : c10::irange(3)) {
-        auto id1 = tv1->axis(i);
-        auto id2 = tv2->axis(i);
-        TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::LOOP));
-        TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::PERMISSIVE));
-      }
-    }
-  }
-}
-
-TEST_F(NVFuserTest, FusionPrint_CUDA) {
-  auto dtypes = {
-      at::kFloat,
-      at::kDouble,
-      at::kHalf,
-      at::kBFloat16,
-      at::kInt,
-      at::kLong,
-      at::kBool};
-  for (auto dtype : dtypes) {
-    auto fusion = std::make_unique<Fusion>();
-    FusionGuard fg(fusion.get());
-
-    auto tv0 = makeSymbolicTensor(1, aten_to_data_type(dtype));
-    fusion->addInput(tv0);
-    auto tv1 = print(tv0);
-    auto tv2 = sin(tv1);
-    fusion->addOutput(tv2);
-
-    // There is no way to check if anything is printed to the console, but we
-    // can validate that when print exist, compilation and computation are not
-    // broken.
-    auto options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
-    at::Tensor t0 = at::arange(2, options).to(dtype);
-
-    FusionExecutorCache executor_cache(std::move(fusion));
-    auto cg_outputs = executor_cache.runFusionWithInputs({t0});
-
-    testValidate(
-        executor_cache.fusion(),
-        cg_outputs,
-        {t0},
-        {t0.sin()},
-        __LINE__,
-        __FILE__);
-  }
-}
-
-TEST_F(NVFuserTest, FusionCheckedSymbolicShape_CUDA) {
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor a = at::randn({123, 456}, options);
-  at::Tensor b = at::randn({123, 456}, options);
-  at::Tensor c = at::randn({321, 654}, options);
-
-  using return_t =
-      std::pair<std::unique_ptr<FusionExecutorCache>, std::vector<at::Tensor>>;
-  auto matched_add = [](at::Tensor a, at::Tensor b) -> return_t {
-    auto fusion = std::make_unique<Fusion>();
-    FusionGuard fg(fusion.get());
-
-    Val* s1 = IrBuilder::create<Int>();
-    Val* s2 = IrBuilder::create<Int>();
-    auto builder = TensorViewBuilder().shape(std::vector<Val*>{s1, s2});
-    TensorView* tv0 = builder.build();
-    TensorView* tv1 = builder.build();
-
-    fusion->addInput(tv0);
-    fusion->addInput(tv1);
-
-    auto tv2 = add(tv0, tv1);
-
-    fusion->addOutput(tv2);
-
-    auto executor_cache =
-        std::make_unique<FusionExecutorCache>(std::move(fusion));
-    auto cg_outputs = executor_cache->runFusionWithInputs({a, b});
-    return {std::move(executor_cache), std::move(cg_outputs)};
-  };
-
-  {
-    auto ret1 = matched_add(a, b);
-    testValidate(
-        ret1.first->fusion(), ret1.second, {a, b}, {a + b}, __LINE__, __FILE__);
-  }
-
-  {
-    EXPECT_THAT(
-        [&]() { matched_add(a, c); },
-        ::testing::ThrowsMessage<c10::Error>(
-            ::testing::HasSubstr("Attempting to bind")));
-  }
-}
-
-TEST_F(NVFuserTest, FusionSizeDependentData_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  Val* s1 = IrBuilder::create<Int>();
-  auto builder = TensorViewBuilder().shape(std::vector<Val*>{s1});
-  TensorView* tv0 = builder.build();
-
-  fusion->addInput(tv0);
-
-  auto tv1 = add(tv0, s1);
-
-  fusion->addOutput(tv1);
-
-  const auto options =
-      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor a = at::zeros({123}, options);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-  auto cg_outputs = executor_cache.runFusionWithInputs({a});
-
-  testValidate(
-      executor_cache.fusion(), cg_outputs, {a}, {a + 123}, __LINE__, __FILE__);
-}
-
-// Repro for issue #1925
-TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeSymbolicTensor(4);
-  auto tv1 = makeConcreteTensor({-1, -1, -1, 1});
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  auto tv2 = add(tv0, tv1);
-  fusion.addOutput(tv2);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor input0 = at::randn({1, 1, 333, 1}, options);
-  at::Tensor input1 = at::randn({1, 1, 333, 1}, options);
-
-  auto lparams = scheduleTranspose(&fusion, {input0, input1});
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {input0, input1}, lparams);
-  auto outputs = fe.runFusion({input0, input1}, lparams);
-
-  auto tv_ref = input0 + input1;
-
-  testValidate(
-      &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__);
-}
-
-// Repro for issue #1873
-TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  auto tv0 = makeContigTensor(1);
-  auto tv1 = makeContigTensor(2);
-  fusion.addInput(tv0);
-  fusion.addInput(tv1);
-  auto tv2 = set(tv0);
-  auto tv3 = broadcast(tv2, {true, false});
-  auto tv4 = add(tv3, tv1);
-  fusion.addOutput(tv4);
-
-  tv4->merge(0);
-  tv4->split(0, 32);
-
-  tv0->computeAt(tv4, 1);
-
-  tv2->split(-1, 8);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({123}, options);
-  at::Tensor t1 = at::randn({3, 123}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(&fusion, {t0, t1});
-
-  auto outputs = fe.runFusion({t0, t1});
-
-  auto tv_ref = t0 + t1;
-
-  testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) {
-  // https://github.com/csarofeen/pytorch/issues/1926
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  fusion->addInput(tv0);
-  auto tv1 = set(tv0);
-  auto tv2 = set(tv1);
-  fusion->addOutput(tv2);
-
-  tv1->setMemoryType(MemoryType::Shared);
-  for (auto tv : {tv1, tv2}) {
-    tv->split(0, 4);
-    tv->reorder({{1, -1}});
-    tv->split(1, 8);
-    tv->merge(0);
-    tv->split(0, 1);
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-    tv->axis(1)->parallelize(ParallelType::Unswitch);
-  }
-  tv1->merge(2);
-  tv2->reorder({{2, 3}});
-  tv2->merge(2);
-  for (auto tv : {tv1, tv2}) {
-    tv->axis(-1)->parallelize(ParallelType::TIDx);
-  }
-
-  InlinePropagator propagator(tv2, -1, ComputeAtMode::MostInlined);
-  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
-
-  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({5, 5}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {t0});
-  auto cg_outputs = fe.runFusion({t0});
-  auto out = cg_outputs[0];
-
-  testValidate(fusion, {out}, {t0}, {t0}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, AsyncCompilation_CUDA) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  TensorView* tv0 = makeSymbolicTensor(2);
-  TensorView* tv1 = makeSymbolicTensor(1);
-  TensorView* tv2 = makeSymbolicTensor(2);
-
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  fusion->addInput(tv2);
-
-  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
-  TensorView* tv4 =
-      max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
-  TensorView* tv5 = add(tv4, tv1); //  Group 0 (Non Broadcast after reduce,
-                                   //  keeps normalization scheduler away)
-  TensorView* tv6 = add(tv5, tv2); //  Group 1 (Broadcast after reduce)
-
-  fusion->addOutput(tv6);
-
-  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-
-  at::Tensor t0 = at::randn({8, 5}, options);
-  at::Tensor t1 = at::randn({5}, options);
-  at::Tensor t2 = at::randn({8, 5}, options);
-
-  auto t3 = t0.add(1.0);
-  auto t4 = std::get<0>(at::max(t3, 0));
-  auto t5 = t4.add(t1);
-  auto t6 = t5.add(t2);
-
-  FusionExecutorCache executor_cache(std::move(fusion));
-
-  std::vector<IValue> aten_inputs = {t0, t1, t2};
-
-  executor_cache.compileFusionAsync(aten_inputs);
-
-  while (!executor_cache.isCompiled(aten_inputs)) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(20));
-    printf(".");
-  }
-
-  auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
-
-  TORCH_CHECK(
-      executor_cache.getMostRecentKernelRuntime()->isSegmented(),
-      "segmentation didn't happen");
-  TORCH_CHECK(
-      executor_cache.getMostRecentKernelRuntime()
-              ->fusionSegments()
-              ->groups()
-              .size() == 2,
-      "segmentation didn't happen as expected");
-
-  testValidate(
-      executor_cache.fusion(), outputs, aten_inputs, {t6}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  TensorView* tv0 = makeConcreteTensor({1, 1});
-  TensorView* tv1 = makeConcreteTensor({-1});
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  auto tv2 = sum(tv0, {1});
-  auto tv3 = add(tv2, tv1);
-  fusion->addOutput(tv3);
-
-  tv0->merge(0);
-
-  MaxRootDomainInfoSpanningTree tree(tv0);
-  TransformPropagatorWithCheck tp(tv0);
-  tree.traverse(&tp);
-
-  InlinePropagator ip(tv0, -1, ComputeAtMode::MostInlined);
-  tree.traverse(&ip);
-
-  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({1, 1}, options);
-  at::Tensor t1 = at::randn({10}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-  auto out = cg_outputs[0];
-
-  testValidate(
-      fusion, {out}, {t0, t1}, {t1 + t0.flatten()}, __LINE__, __FILE__);
-}
-
-TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction2_CUDA) {
-  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
-  auto fusion = fusion_ptr.get();
-  FusionGuard fg(fusion);
-
-  TensorView* tv0 = makeConcreteTensor({-1, 1, 1});
-  TensorView* tv1 = makeConcreteTensor({-1, -1});
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  auto tv2 = sum(tv0, {1});
-  auto tv3 = add(tv2, tv1);
-  fusion->addOutput(tv3);
-
-  tv2->merge(1);
-  tv2->merge(0);
-
-  MaxRootDomainInfoSpanningTree tree(tv0);
-  TransformPropagatorWithCheck tp(tv0);
-  tree.traverse(&tp);
-
-  InlinePropagator ip(tv0, -1, ComputeAtMode::MostInlined);
-  tree.traverse(&ip);
-
-  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
-  at::Tensor t0 = at::randn({10, 1, 1}, options);
-  at::Tensor t1 = at::randn({10, 10}, options);
-
-  FusionExecutor fe;
-  fe.compileFusion(fusion, {t0, t1});
-  auto cg_outputs = fe.runFusion({t0, t1});
-  auto out = cg_outputs[0];
-
-  testValidate(
-      fusion, {out}, {t0, t1}, {t1 + t0.squeeze(-1)}, __LINE__, __FILE__);
-}
-
-} // namespace jit
-} // namespace torch
-#endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
new file mode 100644
index 0000000000000..2a14695b53ff2
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
@@ -0,0 +1,9985 @@
+#if defined(USE_CUDA)
+#include <gmock/gmock-matchers.h>
+#include <gtest/gtest.h>
+
+#include <torch/csrc/jit/codegen/cuda/arith.h>
+#include <torch/csrc/jit/codegen/cuda/codegen.h>
+#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
+#include <torch/csrc/jit/codegen/cuda/executor.h>
+#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
+#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
+#include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
+#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
+#include <torch/csrc/jit/codegen/cuda/interface.h>
+#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
+#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
+#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
+#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
+#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
+#include <torch/csrc/jit/codegen/cuda/mutator.h>
+#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
+#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+
+#include <test/cpp/jit/test_utils.h>
+#include <torch/csrc/jit/api/function_impl.h>
+#include <torch/csrc/jit/codegen/cuda/parser.h>
+#include <torch/csrc/jit/ir/irparser.h>
+#include <torch/torch.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include <thread>
+
+// Tests go in torch::jit
+namespace torch {
+namespace jit {
+
+using namespace torch::jit::fuser::cuda;
+using namespace at::indexing;
+
+// A few smoke tests for IrGraphGenerator
+// (These tests exercise IrGraphGenerator through a non-trivial IR,
+//  to make sure that it runs w/o crashing. The actual output is not
+//  validated)
+TEST_F(NVFuserTest, FusionIrGraphGenerator_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Make sure we can handle empty IRs
+  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
+                   &fusion, IrGraphGenerator::DetailLevel::Basic)
+                   .empty());
+
+  // Construct an interesting IR
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.141));
+  TensorView* tv3 = broadcast(tv0, {false, true, false, true});
+  TensorView* tv4 =
+      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv3);
+  TensorView* tv5 = clamp(
+      tv4, IrBuilder::create<Double>(0.f), IrBuilder::create<Double>(1.f));
+  TensorView* tv6 = add(tv2, tv2);
+
+  // Another checkpoint before adding outputs
+  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
+                   &fusion, IrGraphGenerator::DetailLevel::Explicit)
+                   .empty());
+
+  fusion.addOutput(tv6);
+
+  tv4->axis(2)->parallelize(ParallelType::BIDy);
+  tv6->merge(0);
+  tv6->split(0, 4);
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+  tv5->reorder({{-1, 0}});
+  tv2->computeAt(tv6, 1);
+
+  // Another checkpoint with more node types
+  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
+                   &fusion, IrGraphGenerator::DetailLevel::ComputeOnly)
+                   .empty());
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  // Final IR graph
+  TORCH_CHECK(!IrGraphGenerator::toGraphviz(
+                   &fusion, IrGraphGenerator::DetailLevel::Verbose)
+                   .empty());
+}
+
+TEST_F(NVFuserTest, FusionDispatch_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  Double* f = IrBuilder::create<Double>(2.f);
+  std::stringstream ss1, ss2, ss3;
+  ss1 << f;
+  ss2 << static_cast<Val*>(f);
+  ss3 << static_cast<Statement*>(f);
+  TORCH_CHECK(
+      ss1.str().compare(ss2.str()) == 0 && ss1.str().compare(ss3.str()) == 0,
+      "Error with dispatch system where results differ by passing Double* vs Val* vs Statement*.");
+}
+
+// Evaluate basic scalar operations with constant values
+TEST_F(NVFuserTest, FusionExprEvalConstants_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  ExpressionEvaluator evaluator(&fusion);
+
+  auto* a = IrBuilder::create<Int>(7);
+  auto* b = IrBuilder::create<Int>(3);
+
+  // Avoid div operation because it casts int operands to float
+  checkIntValue(evaluator, neg(a), -7);
+  checkIntValue(evaluator, add(a, b), 10);
+  checkIntValue(evaluator, neg(mul(sub(a, b), add(a, b))), -40);
+  checkIntValue(evaluator, mod(a, b), 1);
+  checkIntValue(evaluator, ceilDiv(a, b), 3);
+}
+
+TEST_F(NVFuserTest, FusionExprEvalDouble_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+  auto ten = IrBuilder::create<Double>(10);
+  auto two = IrBuilder::create<Double>(2);
+  auto three = IrBuilder::create<Double>(3);
+  auto val = castOp(DataType::Int, ceilDiv(sub(ten, two), three));
+  auto reference = static_cast<int64_t>(std::ceil((10.0 - 2.0) / 3.0));
+  TORCH_CHECK(reference == val->evaluateInt());
+}
+
+// Evaluate basic scalar operations with bound values
+TEST_F(NVFuserTest, FusionExprEvalBindings_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  ExpressionEvaluator evaluator(&fusion);
+
+  auto* a = IrBuilder::create<Int>();
+  auto* b = IrBuilder::create<Int>();
+  auto* c = add(a, b);
+  auto* d = neg(ceilDiv(c, b));
+  auto* e = IrBuilder::create<Int>(0);
+
+  // trying to evaluate before binding should give empty results
+  TORCH_CHECK(!evaluator.evaluate(a).has_value());
+  TORCH_CHECK(!evaluator.evaluate(d).has_value());
+
+  evaluator.bind(a, 7);
+  evaluator.bind(b, 3);
+
+  // can't bind to the results of expressions
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(evaluator.bind(c, 100));
+
+  // can't bind to concrete values
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(evaluator.bind(e, 100));
+
+  checkIntValue(evaluator, c, 10);
+  checkIntValue(evaluator, sub(a, b), 4);
+  checkIntValue(evaluator, mod(a, b), 1);
+  checkIntValue(evaluator, ceilDiv(a, b), 3);
+  checkIntValue(evaluator, d, -4);
+
+  // Reset evaluation context
+  evaluator = ExpressionEvaluator(&fusion);
+
+  evaluator.bind(a, 2);
+  evaluator.bind(b, 5);
+
+  checkIntValue(evaluator, c, 7);
+  checkIntValue(evaluator, sub(a, b), -3);
+  checkIntValue(evaluator, mod(a, b), 2);
+  checkIntValue(evaluator, ceilDiv(a, b), 1);
+  checkIntValue(evaluator, d, -2);
+}
+
+// Evaluate expressions in a simple IR
+TEST_F(NVFuserTest, FusionExprEvalBasic_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Create a non-trivial IR
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = add(tv0, tv2);
+
+  fusion.addOutput(tv3);
+
+  tv3->split(0, 4);
+
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::Unroll);
+  tv3->axis(1)->parallelize(ParallelType::Unroll);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  // 1. Create an evaluator
+  ExpressionEvaluator evaluator(&fusion);
+
+  // 2. Bind values
+  //
+  // IMPORTANT:
+  // a. The bindings are only as stable as the Vals are in the fusion graph
+  // b. You must use the original (rootDomain) extents
+  //  (ex. `tv0->getRootDomain()[0]->extent()`
+  //   instead of `tv0->axis(0)->extent()`)
+  //
+  evaluator.bind(tv0->getRootDomain()[0]->extent(), 6);
+  evaluator.bind(tv0->getRootDomain()[1]->extent(), 128);
+  evaluator.bind(tv1->getRootDomain()[0]->extent(), 6);
+  evaluator.bind(tv1->getRootDomain()[1]->extent(), 128);
+
+  // 3. Evaluate and check result values
+  TORCH_CHECK(tv2->domain()->nDims() == 3);
+  checkIntValue(evaluator, tv2->axis(0)->extent(), 2);
+  checkIntValue(evaluator, tv2->axis(1)->extent(), 4);
+  checkIntValue(evaluator, tv2->axis(2)->extent(), 128);
+
+  TORCH_CHECK(tv3->domain()->nDims() == 3);
+  checkIntValue(evaluator, tv3->axis(0)->extent(), 2);
+  checkIntValue(evaluator, tv3->axis(1)->extent(), 4);
+  checkIntValue(evaluator, tv3->axis(2)->extent(), 128);
+}
+
+// Evaluate expressions in a more complex IR
+TEST_F(NVFuserTest, FusionExprEvalComplex_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
+  TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv4 = add(tv2, tv1);
+  TensorView* tv5 = add(tv4, tv3);
+  TensorView* tv6 = add(tv0, tv3);
+
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv6);
+
+  tv5->reorder({{-1, 0}});
+
+  tv6->split(0, 5);
+  tv5->merge(0);
+
+  // 1. Create an evaluator
+  ExpressionEvaluator evaluator(&fusion);
+
+  // 2. Bind values
+  evaluator.bind(tv0->getRootDomain()[0]->extent(), 129);
+  evaluator.bind(tv0->getRootDomain()[1]->extent(), 127);
+
+  // Evaluate and check extent values
+  TORCH_CHECK(tv0->domain()->nDims() == 2);
+  checkIntValue(evaluator, tv0->axis(0)->extent(), 129);
+  checkIntValue(evaluator, tv0->axis(1)->extent(), 127);
+
+  TORCH_CHECK(tv3->domain()->nDims() == 2);
+  checkIntValue(evaluator, tv3->axis(0)->extent(), 129);
+  checkIntValue(evaluator, tv3->axis(1)->extent(), 127);
+
+  TORCH_CHECK(tv4->domain()->nDims() == 2);
+  checkIntValue(evaluator, tv4->axis(0)->extent(), 129);
+  checkIntValue(evaluator, tv4->axis(1)->extent(), 127);
+
+  TORCH_CHECK(tv5->domain()->nDims() == 1);
+  checkIntValue(evaluator, tv5->axis(0)->extent(), 16383);
+
+  TORCH_CHECK(tv6->domain()->nDims() == 3);
+  checkIntValue(evaluator, tv6->axis(0)->extent(), 26);
+  checkIntValue(evaluator, tv6->axis(1)->extent(), 5);
+  checkIntValue(evaluator, tv6->axis(2)->extent(), 127);
+}
+
+// Evaluate expressions post lowering
+TEST_F(NVFuserTest, FusionExprEvalPostLower_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Create a non-trivial IR
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = add(tv0, tv2);
+
+  fusion.addOutput(tv3);
+
+  tv3->split(0, 4);
+
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::Unroll);
+  tv3->axis(1)->parallelize(ParallelType::Unroll);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  auto* bid_x = add(tv3->axis(0)->extent(), IrBuilder::create<Int>(0));
+  auto* tid_x = add(tv3->axis(-1)->extent(), IrBuilder::create<Int>(0));
+
+  // Lower
+  GpuLower gpulw(&fusion);
+
+  // 1. Create an evaluation context
+  ExpressionEvaluator evaluator(&fusion);
+
+  // 2. Bind values
+  evaluator.bind(tv0->getRootDomain()[0]->extent(), 6);
+  evaluator.bind(tv0->getRootDomain()[1]->extent(), 128);
+  evaluator.bind(tv1->getRootDomain()[0]->extent(), 6);
+  evaluator.bind(tv1->getRootDomain()[1]->extent(), 128);
+
+  // 3. Evaluate and check result values
+  TORCH_CHECK(tv2->domain()->nDims() == 3);
+  checkIntValue(evaluator, tv2->axis(0)->extent(), 2);
+  checkIntValue(evaluator, tv2->axis(1)->extent(), 4);
+  checkIntValue(evaluator, tv2->axis(2)->extent(), 128);
+
+  TORCH_CHECK(tv3->domain()->nDims() == 3);
+  checkIntValue(evaluator, tv3->axis(0)->extent(), 2);
+  checkIntValue(evaluator, tv3->axis(1)->extent(), 4);
+  checkIntValue(evaluator, tv3->axis(2)->extent(), 128);
+
+  checkIntValue(evaluator, bid_x, 2);
+  checkIntValue(evaluator, tid_x, 128);
+}
+
+// Kernel IR: Evaluate basic scalar operations with constant values
+TEST_F(NVFuserTest, FusionKernelExprEvalConstants_CUDA) {
+  Fusion fusion;
+  kir::Kernel kernel(&fusion);
+  FusionGuard fg((&kernel)->as<Fusion>());
+
+  auto a = IrBuilder::create<Int>(7);
+  auto b = IrBuilder::create<Int>(3);
+  auto c = IrBuilder::subExpr(a, b);
+  auto d = IrBuilder::divExpr(a, b);
+  auto e = IrBuilder::mulExpr(c, d);
+
+  kir::ExpressionEvaluator evaluator;
+
+  checkIntValue(evaluator, IrBuilder::negExpr(a), -7);
+  checkIntValue(evaluator, IrBuilder::addExpr(a, b), 10);
+  checkIntValue(evaluator, IrBuilder::negExpr(e), -8);
+  checkIntValue(evaluator, IrBuilder::modExpr(a, b), 1);
+  checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 3);
+}
+
+// Kernel IR: Evaluate basic scalar operations with bound values
+TEST_F(NVFuserTest, FusionKernelExprEvalBindings_CUDA) {
+  Fusion fusion;
+  kir::Kernel kernel(&fusion);
+  FusionGuard fg((&kernel)->as<Fusion>());
+
+  kir::ExpressionEvaluator evaluator;
+
+  auto a = IrBuilder::create<Int>(c10::nullopt);
+  auto b = IrBuilder::create<Int>(c10::nullopt);
+  auto c = IrBuilder::addExpr(a, b);
+  auto d = IrBuilder::negExpr(IrBuilder::ceilDivExpr(c, b));
+  auto e = IrBuilder::create<Int>(0);
+
+  // trying to evaluate before binding should give empty results
+  TORCH_CHECK(!evaluator.evaluate(a).has_value());
+  TORCH_CHECK(!evaluator.evaluate(d).has_value());
+
+  evaluator.bind(a, 7);
+  evaluator.bind(b, 3);
+
+  // can't bind to the results of expressions
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(evaluator.bind(c, 100));
+
+  // can't bind to concrete values
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(evaluator.bind(e, 100));
+
+  checkIntValue(evaluator, c, 10);
+  checkIntValue(evaluator, IrBuilder::subExpr(a, b), 4);
+  checkIntValue(evaluator, IrBuilder::modExpr(a, b), 1);
+  checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 3);
+  checkIntValue(evaluator, d, -4);
+
+  // Reset the evaluation context
+  evaluator = kir::ExpressionEvaluator();
+
+  evaluator.bind(a, 2);
+  evaluator.bind(b, 5);
+
+  checkIntValue(evaluator, c, 7);
+  checkIntValue(evaluator, IrBuilder::subExpr(a, b), -3);
+  checkIntValue(evaluator, IrBuilder::modExpr(a, b), 2);
+  checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 1);
+  checkIntValue(evaluator, d, -2);
+}
+
+TEST_F(NVFuserTest, FusionClear_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // 1. Create a dummy IR
+
+  {
+    TensorView* tv0 = makeSymbolicTensor(2);
+    TensorView* tv1 = makeSymbolicTensor(2);
+
+    fusion.addInput(tv0);
+    fusion.addInput(tv1);
+
+    TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+    TensorView* tv3 = add(tv0, tv2);
+
+    fusion.addOutput(tv3);
+
+    tv3->split(0, 4);
+    tv0->computeAt(tv3, 1);
+    tv1->computeAt(tv3, 1);
+
+    tv3->axis(0)->parallelize(ParallelType::BIDx);
+    tv2->axis(1)->parallelize(ParallelType::Unroll);
+    tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  // 2. Clear the IR
+
+  fusion.clear();
+
+  TORCH_CHECK(fusion.unordered_exprs().empty());
+  TORCH_CHECK(fusion.vals().empty());
+
+  TORCH_CHECK(fusion.inputs().empty());
+  TORCH_CHECK(fusion.outputs().empty());
+
+  TORCH_CHECK(ir_utils::getReductionOps(&fusion).empty());
+
+  // 3. Rebuild the IR
+
+  {
+    TensorView* tv0 = makeSymbolicTensor(3);
+    TensorView* tv1 = makeSymbolicTensor(3);
+    TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+    TensorView* tv3 = add(tv0, tv2);
+
+    fusion.addInput(tv0);
+    fusion.addInput(tv1);
+    fusion.addOutput(tv3);
+
+    // tv3 [i0, i1, i2]
+    tv3->reorder({{0, 2}, {2, 0}});
+    // tv3 [i2, i1, i0]
+    tv3->split(-1, 4);
+    // tv3 [i2, i1, i0outer, i0inner{4}]
+    tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
+    // tv3 [i0outer, i0inner{4}, i1, i2]
+    tv0->computeAt(tv3, -1);
+    tv1->computeAt(tv3, -1);
+    tv3->axis(1)->parallelize(ParallelType::BIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input1 = at::randn({16, 8, 8}, options);
+  at::Tensor input2 = at::randn_like(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1, input2});
+  auto outputs = fe.runFusion({input1, input2});
+
+  at::Tensor tv2_ref = input2 + 2.0;
+  at::Tensor output_ref = input1 + tv2_ref;
+
+  TORCH_CHECK(output_ref.equal(outputs[0]));
+}
+
+TEST_F(NVFuserTest, FusionCopy_CUDA) {
+  Fusion original_fusion;
+
+  // Create the test IR
+  {
+    FusionGuard fg(&original_fusion);
+
+    auto tv0 = makeSymbolicTensor(3);
+    auto tv1 = makeSymbolicTensor(3);
+    auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+    auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2);
+
+    original_fusion.addInput(tv0);
+    original_fusion.addInput(tv1);
+    original_fusion.addOutput(tv3);
+
+    tv3->reorder({{0, 2}, {2, 0}});
+    tv3->split(-1, 4);
+    tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
+
+    tv0->computeAt(tv3, -1);
+    tv1->computeAt(tv3, -1);
+
+    tv3->axis(0)->parallelize(ParallelType::BIDx);
+    tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  // Test copy before lowering
+  Fusion clone = original_fusion;
+
+  // Compare IR dumps
+  std::stringstream original_ir;
+  std::stringstream clone_ir;
+  original_ir << original_fusion;
+  clone_ir << clone;
+  ASSERT_EQ(original_ir.str(), clone_ir.str());
+
+  // Lower original fusion
+  std::string original_kernel;
+  {
+    // TODO(kir): remove this guard once we implement the cuda codegen visitor
+    FusionGuard fg(&original_fusion);
+    original_kernel =
+        codegen::generateCudaKernel(GpuLower(&original_fusion).kernel());
+  }
+
+  // Make sure the "before lowering" clone was not mutated
+  // while lowering the original fusion IR
+  std::stringstream before_lowering_ir;
+  before_lowering_ir << clone;
+  ASSERT_EQ(original_ir.str(), before_lowering_ir.str());
+
+  // Test copy after lowering (including assignment operator)
+  Fusion before_lowering = clone;
+  clone = original_fusion;
+
+  // Compare IR dumps
+  std::stringstream original_lowered_ir;
+  std::stringstream clone_lowered_ir;
+  original_lowered_ir << original_fusion;
+  clone_lowered_ir << clone;
+  ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str());
+
+  // Lower the "before lowering" and compare kernels
+  std::string clone_kernel;
+  {
+    // TODO(kir): remove this guard once we implement the cuda codegen visitor
+    FusionGuard fg(&before_lowering);
+    clone_kernel =
+        codegen::generateCudaKernel(GpuLower(&before_lowering).kernel());
+  }
+  ASSERT_EQ(original_kernel, clone_kernel);
+}
+
+TEST_F(NVFuserTest, FusionMove_CUDA) {
+  Fusion fusion;
+
+  // Create the test IR
+  {
+    FusionGuard fg(&fusion);
+
+    auto tv0 = makeSymbolicTensor(3);
+    auto tv1 = makeSymbolicTensor(3);
+    auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+    auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2);
+
+    fusion.addInput(tv0);
+    fusion.addInput(tv1);
+    fusion.addOutput(tv3);
+
+    tv3->reorder({{0, 2}, {2, 0}});
+    tv3->split(-1, 4);
+    tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
+
+    tv0->computeAt(tv3, -1);
+    tv1->computeAt(tv3, -1);
+
+    tv3->axis(0)->parallelize(ParallelType::BIDx);
+    tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  std::stringstream original_ir;
+  original_ir << fusion;
+
+  // Test move before lowering
+  Fusion another_fusion = std::move(fusion);
+
+  // Check that the original fusion is "empty"
+  //
+  // IMPORTANT: these checks assume knowledge of the internal
+  //    implementation of the move operations. General uses
+  //    should only assume that the moved-from object is in
+  //    a valid, but unspecified state. This is similar to the
+  //    standard library containers:
+  //    https://en.cppreference.com/w/cpp/utility/move
+  //
+  TORCH_CHECK(fusion.unordered_exprs().empty());
+  TORCH_CHECK(fusion.vals().empty());
+  TORCH_CHECK(fusion.inputs().empty());
+  TORCH_CHECK(fusion.outputs().empty());
+
+  // clear() has no pre-conditions so it's valid to call on a moved-from object
+  fusion.clear();
+
+  // Compare IR dumps
+  std::stringstream another_ir;
+  another_ir << another_fusion;
+  ASSERT_EQ(original_ir.str(), another_ir.str());
+
+  // Lower the fusion IR
+  GpuLower lower(&another_fusion);
+
+  std::stringstream lowered_ir;
+  lowered_ir << another_fusion;
+
+  // Test move assignment after lowering
+  fusion = std::move(another_fusion);
+
+  // Compare IR dumps
+  std::stringstream moved_lowered_ir;
+  moved_lowered_ir << fusion;
+  ASSERT_EQ(lowered_ir.str(), moved_lowered_ir.str());
+}
+
+TEST_F(NVFuserTest, FusionSimpleArith_CUDA) {
+  std::stringstream ss1, ss2;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  Double* d1 = IrBuilder::create<Double>(1.f);
+  Double* d2 = IrBuilder::create<Double>(2.f);
+  Double* d3 = IrBuilder::create<Double>();
+
+  // Disrupt the fusion to make sure guard works well
+  {
+    Fusion fusion2;
+    FusionGuard fg(&fusion2);
+
+    Double* d1 = IrBuilder::create<Double>(1.f);
+    Double* d2 = IrBuilder::create<Double>(2.f);
+    add(d1, d2);
+    ss2 << fusion2;
+  }
+
+  IrBuilder::create<BinaryOp>(BinaryOpType::Add, d3, d1, d2);
+  ss1 << fusion;
+
+  TORCH_CHECK(
+      ss1.str().compare(ss2.str()) == 0,
+      "Error where explicit add nodes don't match implicit add nodes.");
+}
+
+TEST_F(NVFuserTest, FusionScalarTypePromote_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  Bool* b = IrBuilder::create<Bool>(true);
+  Double* d = IrBuilder::create<Double>(4.f);
+  Int* i = IrBuilder::create<Int>(3);
+  ComplexDouble* c =
+      IrBuilder::create<ComplexDouble>(c10::complex<double>(1, 2));
+
+  TORCH_CHECK(add(b, b)->getDataType() == DataType::Bool);
+  TORCH_CHECK(add(b, d)->getDataType() == DataType::Double);
+  TORCH_CHECK(add(b, i)->getDataType() == DataType::Int);
+  TORCH_CHECK(add(b, c)->getDataType() == DataType::ComplexDouble);
+
+  TORCH_CHECK(add(d, b)->getDataType() == DataType::Double);
+  TORCH_CHECK(add(d, d)->getDataType() == DataType::Double);
+  TORCH_CHECK(add(d, i)->getDataType() == DataType::Double);
+  TORCH_CHECK(add(d, c)->getDataType() == DataType::ComplexDouble);
+
+  TORCH_CHECK(add(i, b)->getDataType() == DataType::Int);
+  TORCH_CHECK(add(i, d)->getDataType() == DataType::Double);
+  TORCH_CHECK(add(i, i)->getDataType() == DataType::Int);
+  TORCH_CHECK(add(i, c)->getDataType() == DataType::ComplexDouble);
+
+  TORCH_CHECK(add(c, b)->getDataType() == DataType::ComplexDouble);
+  TORCH_CHECK(add(c, d)->getDataType() == DataType::ComplexDouble);
+  TORCH_CHECK(add(c, i)->getDataType() == DataType::ComplexDouble);
+  TORCH_CHECK(add(c, c)->getDataType() == DataType::ComplexDouble);
+}
+
+TEST_F(NVFuserTest, FusionComplexAbsTypes_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto options = at::TensorOptions().device(at::kCUDA, 0);
+  auto tensor_cf = at::randn({4, 4, 4}, options.dtype(at::kComplexFloat));
+  auto tensor_cd = at::randn({4, 4, 4}, options.dtype(at::kComplexDouble));
+
+  auto type_cf = TensorType::create(tensor_cf);
+  auto tv_cf = IrBuilder::create<TensorView>(type_cf);
+  auto type_cd = TensorType::create(tensor_cd);
+  auto tv_cd = IrBuilder::create<TensorView>(type_cd);
+
+  TORCH_CHECK(
+      tensor_cf.abs().scalar_type() ==
+      data_type_to_aten(abs(tv_cf)->getDataType().value()));
+  TORCH_CHECK(
+      tensor_cd.abs().scalar_type() ==
+      data_type_to_aten(abs(tv_cd)->getDataType().value()));
+}
+
+TEST_F(NVFuserTest, FusionRegister_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+  Double* v1 = IrBuilder::create<Double>(1.f);
+  Double* v2 = IrBuilder::create<Double>(2.f);
+  Val* v3 = binaryOp(BinaryOpType::Add, v1, v2);
+  Val* v4 = binaryOp(BinaryOpType::Add, v1, v2);
+  TORCH_CHECK(v1->name() + 1 == v2->name());
+  TORCH_CHECK(v2->name() + 1 == v3->name());
+  TORCH_CHECK(v3->name() + 1 == v4->name());
+  TORCH_CHECK(v3->definition()->name() + 1 == v4->definition()->name());
+}
+
+// dummy expr with 2 outputs only for toposort test.
+struct DummyExpr : public Expr {
+  ~DummyExpr() = default;
+  DummyExpr(
+      IrBuilderPasskey passkey,
+      Val* _outlhs,
+      Val* _outrhs,
+      Val* _lhs,
+      Val* _rhs)
+      : Expr(passkey, ExprType::UnaryOp) // Not terribly safe...
+  {
+    addOutput(_outlhs);
+    addOutput(_outrhs);
+    addInput(_lhs);
+    addInput(_rhs);
+  }
+  DummyExpr(const DummyExpr& other) = delete;
+  DummyExpr& operator=(const DummyExpr& other) = delete;
+  DummyExpr(DummyExpr&& other) = delete;
+  DummyExpr& operator=(DummyExpr&& other) = delete;
+  Expr* shallowCopy() const override {
+    return nullptr;
+  }
+};
+
+TEST_F(NVFuserTest, FusionTopoSort_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // e0: v3, v2 = dummy(v1, v0)
+  // e1: v4     =   add(v3, v2)
+  // e2: v5     =   add(v2, v4)
+  // e3: v6     =   add(v5, v5)
+  Double* v0 = IrBuilder::create<Double>();
+  Double* v1 = IrBuilder::create<Double>();
+  Double* v2 = IrBuilder::create<Double>();
+  Double* v3 = IrBuilder::create<Double>();
+  Double* v4 = IrBuilder::create<Double>();
+  Double* v5 = IrBuilder::create<Double>();
+  Double* v6 = IrBuilder::create<Double>();
+
+  std::vector<Val*> inputs = {v0, v1};
+  for (auto val : inputs) {
+    fusion.addInput(val);
+  }
+
+  Expr* e0 = IrBuilder::create<DummyExpr>(v3, v2, v1, v0);
+  Expr* e1 = IrBuilder::create<BinaryOp>(BinaryOpType::Add, v4, v3, v2);
+  Expr* e2 = IrBuilder::create<BinaryOp>(BinaryOpType::Add, v5, v2, v4);
+  Expr* e3 = IrBuilder::create<BinaryOp>(BinaryOpType::Add, v6, v5, v5);
+
+  fusion.addOutput(v2);
+  fusion.addOutput(v3);
+  auto exprs = fusion.exprs();
+  TORCH_CHECK(exprs.size() == 1, "Found ", exprs.size(), " but expecting 1");
+  TORCH_CHECK(exprs[0] == e0);
+
+  fusion.addOutput(v5);
+  exprs = fusion.exprs();
+  TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3");
+  TORCH_CHECK(exprs[0] == e0);
+  TORCH_CHECK(exprs[1] == e1);
+  TORCH_CHECK(exprs[2] == e2);
+
+  fusion.addOutput(v4);
+  exprs = fusion.exprs();
+  TORCH_CHECK(exprs.size() == 3, "Found ", exprs.size(), " but expecting 3");
+  TORCH_CHECK(exprs[0] == e0);
+  TORCH_CHECK(exprs[1] == e1);
+  TORCH_CHECK(exprs[2] == e2);
+
+  fusion.addOutput(v6);
+  exprs = fusion.exprs();
+  TORCH_CHECK(exprs.size() == 4, "Found ", exprs.size(), " but expecting 4");
+  TORCH_CHECK(exprs[0] == e0);
+  TORCH_CHECK(exprs[1] == e1);
+  TORCH_CHECK(exprs[2] == e2);
+  TORCH_CHECK(exprs[3] == e3);
+
+  TORCH_CHECK(v2->definition()->name() == 0);
+  TORCH_CHECK(v3->definition()->name() == 0);
+  TORCH_CHECK(v4->definition()->name() == 1);
+  TORCH_CHECK(v5->definition()->name() == 2);
+  TORCH_CHECK(v6->definition()->name() == 3);
+}
+
+TEST_F(NVFuserTest, FusionTensor_CUDA) {
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  {
+    auto tensor = at::randn({2, 3, 4, 5}, options);
+    auto tensor_type = TensorType::create(tensor);
+    auto fuser_tensor = IrBuilder::create<TensorView>(tensor_type);
+    TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim());
+    TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float);
+    TORCH_CHECK(fuser_tensor->domain() != nullptr);
+    for (const auto i : c10::irange(fuser_tensor->nDims())) {
+      // size 1 dimension are makred as broadcast
+      TORCH_CHECK(
+          fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1));
+      // check contiguity information;
+      TORCH_CHECK(fuser_tensor->domain()->contiguity()[i]);
+    }
+  }
+
+  // TensorType::create fills stride_properties, which helps us to mark
+  // IterDomain properly
+  // Note: implementation could change, depending on how much we want to invest
+  // in our home-brew contiguity coalescing. For now let's make sure that we
+  // properly test what we are using.
+  {
+    auto tensor = at::randn({4, 4, 4}, options);
+    auto sliced_tensor = tensor.slice(1, 0, -1, 2);
+
+    auto tensor_type = TensorType::create(sliced_tensor);
+    auto fuser_tensor = IrBuilder::create<TensorView>(tensor_type);
+    TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim());
+    TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float);
+    TORCH_CHECK(fuser_tensor->domain() != nullptr);
+    for (const auto i : c10::irange(fuser_tensor->nDims())) {
+      // size 1 dimension are makred as broadcast
+      TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false);
+    }
+    TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]);
+    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]);
+    TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]);
+  }
+
+  {
+    auto tensor = at::randn({2, 3, 4, 5}, options);
+    auto permuted_tensor = tensor.permute({0, 3, 1, 2});
+    auto tensor_type = TensorType::create(permuted_tensor);
+    auto fuser_tensor = IrBuilder::create<TensorView>(tensor_type);
+    TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim());
+    TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float);
+    TORCH_CHECK(fuser_tensor->domain() != nullptr);
+    for (const auto i : c10::irange(fuser_tensor->nDims())) {
+      // size 1 dimension are makred as broadcast
+      TORCH_CHECK(fuser_tensor->axis(i)->isBroadcast() == false);
+    }
+    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]);
+    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]);
+    TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]);
+    TORCH_CHECK(!fuser_tensor->domain()->contiguity()[3]);
+  }
+}
+
+TEST_F(NVFuserTest, FusionFilterVals_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  auto tv1 = makeSymbolicTensor(1);
+  auto scalar0 = IrBuilder::create<Double>(0);
+  auto scalar1 = IrBuilder::create<Int>(0);
+  auto scalar2 = IrBuilder::create<Int>(1);
+
+  const std::vector<Val*> vals = {tv0, scalar0, tv1, scalar1, scalar2};
+
+  std::vector<TensorView*> tvs(
+      ir_utils::filterByType<TensorView>(vals).begin(),
+      ir_utils::filterByType<TensorView>(vals).end());
+  TORCH_CHECK(tvs.size() == 2);
+  TORCH_CHECK(tvs[0] == tv0);
+  TORCH_CHECK(tvs[1] == tv1);
+
+  std::vector<Double*> floats(
+      ir_utils::filterByType<Double>(vals).begin(),
+      ir_utils::filterByType<Double>(vals).end());
+  TORCH_CHECK(floats.size() == 1);
+  TORCH_CHECK(floats[0] == scalar0);
+
+  std::vector<Int*> ints(
+      ir_utils::filterByType<Int>(vals).begin(),
+      ir_utils::filterByType<Int>(vals).end());
+  TORCH_CHECK(ints.size() == 2);
+  TORCH_CHECK(ints[0] == scalar1);
+  TORCH_CHECK(ints[1] == scalar2);
+
+  TORCH_CHECK(
+      ir_utils::filterByType<Expr>(vals).begin() ==
+          ir_utils::filterByType<Expr>(vals).end(),
+      "Not expecting any results");
+}
+
+TEST_F(NVFuserTest, FusionTVSplit_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv = makeSymbolicTensor(3);
+
+  tv = tv->split(2, 2);
+  TORCH_CHECK(tv->nDims() == 4);
+  Expr* outer = tv->axis(2)->extent()->definition();
+
+  TORCH_CHECK(
+      outer->getExprType().value() == ExprType::BinaryOp &&
+      static_cast<BinaryOp*>(outer)->getBinaryOpType() ==
+          BinaryOpType::CeilDiv &&
+      static_cast<BinaryOp*>(outer)->lhs()->sameAs(
+          tv->getRootDomain()[2]->extent()) &&
+      static_cast<Int*>(static_cast<BinaryOp*>(outer)->rhs())
+          ->sameAs(IrBuilder::create<Int>(2)));
+
+  IterDomain* inner = static_cast<IterDomain*>(tv->axis(3));
+  TORCH_CHECK(
+      inner->extent()->isScalar() &&
+      static_cast<Int*>(inner->extent())->isConst() &&
+      static_cast<Int*>(inner->extent())->value().value() == 2);
+}
+
+TEST_F(NVFuserTest, FusionTVMerge_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv = makeSymbolicTensor(3);
+
+  tv = tv->merge(1);
+  Expr* axisOp = tv->axis(1)->extent()->definition();
+
+  TORCH_CHECK(
+      tv->nDims() == 2 && axisOp->getExprType() == ExprType::BinaryOp &&
+      static_cast<BinaryOp*>(axisOp)->getBinaryOpType() == BinaryOpType::Mul &&
+      static_cast<BinaryOp*>(axisOp)->lhs() ==
+          tv->getRootDomain()[1]->extent() &&
+      static_cast<BinaryOp*>(axisOp)->rhs() ==
+          tv->getRootDomain()[2]->extent());
+}
+
+TEST_F(NVFuserTest, FusionTVReorder_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::unordered_map<int, int> shift_right{{-1, 0}};
+
+  std::unordered_map<int, int> shift_left{{0, -1}};
+
+  std::unordered_map<int, int> shift_left_2{{0, -1}, {1, 0}, {2, 1}};
+
+  std::unordered_map<int, int> swap{{0, 2}, {2, 0}};
+
+  auto tv = makeSymbolicTensor(3);
+  std::vector<IterDomain*> ref;
+  ref = std::vector<IterDomain*>(
+      tv->domain()->domain().begin(), tv->domain()->domain().end());
+
+  tv->reorder(shift_left);
+  for (const auto i : c10::irange(tv->nDims())) {
+    TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1)));
+  }
+
+  tv = makeSymbolicTensor(3);
+  ref = std::vector<IterDomain*>(
+      tv->domain()->domain().begin(), tv->domain()->domain().end());
+
+  tv->reorder(shift_left);
+  for (const auto i : c10::irange(tv->nDims())) {
+    TORCH_CHECK(ref[i]->sameAs(tv->axis(i - 1)));
+  }
+
+  tv = makeSymbolicTensor(3);
+  ref = std::vector<IterDomain*>(
+      tv->domain()->domain().begin(), tv->domain()->domain().end());
+
+  tv->reorder(shift_right);
+  TORCH_CHECK(ref[ref.size() - 1]->sameAs(tv->axis(0)));
+  for (const auto i : c10::irange(1, tv->nDims())) {
+    TORCH_CHECK(ref[i - 1]->sameAs(tv->axis(i)));
+  }
+
+  tv = makeSymbolicTensor(3);
+  ref = std::vector<IterDomain*>(
+      tv->domain()->domain().begin(), tv->domain()->domain().end());
+  tv->reorder(swap);
+  TORCH_CHECK(ref[0]->sameAs(tv->axis(2)));
+  TORCH_CHECK(ref[2]->sameAs(tv->axis(0)));
+  TORCH_CHECK(ref[1]->sameAs(tv->axis(1)));
+}
+
+TEST_F(NVFuserTest, FusionEquality_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  Double* fval1 = IrBuilder::create<Double>();
+  Double* fval1_copy = fval1;
+  Double* fval2 = IrBuilder::create<Double>();
+  Double* fone = IrBuilder::create<Double>(1.0);
+
+  TORCH_CHECK(fval1->sameAs(fval1_copy));
+  TORCH_CHECK(!fval1->sameAs(fval2));
+  TORCH_CHECK(!fone->sameAs(fval1));
+  TORCH_CHECK(fone->sameAs(IrBuilder::create<Double>(1.0)));
+
+  Int* ival1 = IrBuilder::create<Int>();
+  Int* ival1_copy = ival1;
+  Int* ival2 = IrBuilder::create<Int>();
+  Int* ione = IrBuilder::create<Int>(1);
+
+  TORCH_CHECK(ival1->sameAs(ival1_copy));
+  TORCH_CHECK(!ival1->sameAs(ival2));
+  TORCH_CHECK(!ione->sameAs(ival1));
+  TORCH_CHECK(ione->sameAs(IrBuilder::create<Int>(1)));
+
+  BinaryOp* add1 = IrBuilder::create<BinaryOp>(
+      BinaryOpType::Add, IrBuilder::create<Double>(), fval1, ival1);
+  BinaryOp* add1_copy = IrBuilder::create<BinaryOp>(
+      BinaryOpType::Add, IrBuilder::create<Double>(), fval1, ival1);
+  BinaryOp* sub1 = IrBuilder::create<BinaryOp>(
+      BinaryOpType::Sub, IrBuilder::create<Double>(), fval1, ival1);
+
+  UnaryOp* neg1 = IrBuilder::create<UnaryOp>(
+      UnaryOpType::Neg, IrBuilder::create<Double>(), fval1);
+  UnaryOp* neg2 = IrBuilder::create<UnaryOp>(
+      UnaryOpType::Neg, IrBuilder::create<Double>(), fval2);
+  UnaryOp* neg1_copy = IrBuilder::create<UnaryOp>(
+      UnaryOpType::Neg, IrBuilder::create<Double>(), fval1);
+
+  TORCH_CHECK(add1->sameAs(add1_copy));
+  TORCH_CHECK(!add1->sameAs(sub1));
+
+  TORCH_CHECK(neg1->sameAs(neg1_copy));
+  TORCH_CHECK(!static_cast<Expr*>(neg1)->sameAs(add1));
+  TORCH_CHECK(!neg1->sameAs(neg2));
+}
+
+TEST_F(NVFuserTest, FusionDependency_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  Double* d0 = IrBuilder::create<Double>(0.f);
+  Double* d1 = IrBuilder::create<Double>(1.f);
+  auto d2 = add(d0, d1);
+
+  auto d3 = add(d2, d2);
+
+  Double* d4 = IrBuilder::create<Double>(4.f);
+  Double* d5 = IrBuilder::create<Double>(5.f);
+  auto d6 = add(d4, d5);
+
+  Double* d7 = IrBuilder::create<Double>(7.f);
+  Double* d8 = IrBuilder::create<Double>(8.f);
+  auto d9 = add(d7, d8);
+
+  auto d10 = add(d6, d9);
+
+  auto d11 = add(d3, d10);
+
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d11));
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d1, d11));
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d11));
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d3, d11));
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d6, d11));
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d9, d11));
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d0, d2));
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d2, d3));
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d4, d6));
+  TORCH_CHECK(DependencyCheck::isDependencyOf(d8, d10));
+
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d0));
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d1));
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d2));
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d3));
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d4));
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d11, d5));
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d2, d0));
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d3, d2));
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d6, d4));
+  TORCH_CHECK(!DependencyCheck::isDependencyOf(d10, d8));
+
+  auto dep_chain = DependencyCheck::getSingleDependencyChain(d0, d11);
+  TORCH_CHECK(dep_chain.back() == d11);
+  dep_chain.pop_back();
+  TORCH_CHECK(dep_chain.back() == d3);
+  dep_chain.pop_back();
+  TORCH_CHECK(dep_chain.back() == d2);
+  dep_chain.pop_back();
+
+  dep_chain = DependencyCheck::getSingleDependencyChain(d6, d11);
+  TORCH_CHECK(dep_chain.back() == d11);
+  dep_chain.pop_back();
+  TORCH_CHECK(dep_chain.back() == d10);
+  dep_chain.pop_back();
+
+  dep_chain = DependencyCheck::getSingleDependencyChain(d4, d11);
+  TORCH_CHECK(dep_chain.back() == d11);
+  dep_chain.pop_back();
+  TORCH_CHECK(dep_chain.back() == d10);
+  dep_chain.pop_back();
+  TORCH_CHECK(dep_chain.back() == d6);
+  dep_chain.pop_back();
+
+  dep_chain = DependencyCheck::getSingleDependencyChain(d11, d2);
+  TORCH_CHECK(dep_chain.empty());
+}
+
+TEST_F(NVFuserTest, FusionParser_CUDA) {
+  // This test may not pass if using a custom block sync as there may
+  // be additional calls. Skip the test as it's not specifically
+  // relevant with block synchronizatin.
+  if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) {
+    return;
+  }
+  auto g = std::make_shared<Graph>();
+  const auto graph0_string = R"IR(
+    graph(%0 : Float(2, strides=[1]),
+          %1 : Float(2, strides=[1])):
+      %c0 : Float(2, strides=[1]) = aten::mul(%0, %1)
+      %d0 : Float(2, strides=[1]) = aten::mul(%c0, %0)
+      return (%d0))IR";
+  parseIR(graph0_string, g.get());
+
+  // strides are not yet supported in the irparser.
+  for (auto val : g->block()->inputs()) {
+    if (val->isCompleteTensor())
+      val->setType(val->type()->castRaw<TensorType>()->contiguous());
+  }
+  for (auto node : g->block()->nodes()) {
+    for (auto val : node->outputs()) {
+      if (val->isCompleteTensor())
+        val->setType(val->type()->castRaw<TensorType>()->contiguous());
+    }
+  }
+
+  auto fusion = parseJitIR(g);
+  FusionGuard fg(fusion.get());
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  // Avoid vectorization here as those kernels can't be lowered twice at the
+  // moment
+  at::Tensor input1 = at::randn({16}, options);
+  at::Tensor input2 = at::randn({16}, options);
+  auto lparams = schedulePointwise(fusion.get(), {input1, input2});
+
+  // CONSIDER:
+  // 1. this can be moved to a dedicated "golden" file
+  // 2. use a fuzzy compare (ignore non-significant whitespaces for example)
+  const std::string expected_kernel = R"(
+__global__ void CUDAGeneratedKernel(Tensor<float, 1> T0, Tensor<float, 1> T1, Tensor<float, 1> T3) {
+  int64_t i50;
+  i50 = (((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x);
+  if ((i50 < T0.size[0])) {
+    float T5[1];
+    T5[0] = 0;
+    T5[0]
+       = T1[i50];
+    float T4[1];
+    T4[0] = 0;
+    T4[0]
+       = T0[i50];
+    float T2[1];
+    T2[0]
+      = T4[0]
+      * T5[0];
+    float T6[1];
+    T6[0]
+      = T2[0]
+      * T4[0];
+    T3[i50]
+       = T6[0];
+  }
+}
+)";
+
+  const std::string actual_kernel =
+      "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel());
+  if (expected_kernel.size() != actual_kernel.size() ||
+      expected_kernel.compare(actual_kernel) != 0) {
+    std::cerr
+        << " Codegen mismatch, codegen possibly changed, or is incorrect. "
+        << " \n ========= EXPECTED ========= \n"
+        << expected_kernel << "\n========= ACTUAL ========== \n"
+        << actual_kernel << "\n=================" << std::endl;
+    auto it = std::mismatch(
+        expected_kernel.begin(),
+        expected_kernel.end(),
+        actual_kernel.begin(),
+        actual_kernel.end());
+    std::string actual_mismatched_snippet(it.second, actual_kernel.end());
+    actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10);
+    std::string expected_mismatched_snippet(it.first, expected_kernel.end());
+    expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10);
+    std::cerr << "First mismatch found at: " << actual_mismatched_snippet
+              << ", expected: " << expected_mismatched_snippet << std::endl;
+    TORCH_CHECK(false);
+  }
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1, input2}, lparams);
+  auto outputs = fe.runFusion({input1, input2}, lparams);
+  at::Tensor output_ref = input1 * input2 * input1;
+  TORCH_CHECK(output_ref.equal(outputs[0]));
+}
+
+TEST_F(NVFuserTest, FusionOuterSplit_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(3);
+
+  IrBuilder::create<BinaryOp>(
+      BinaryOpType::Add,
+      tv0,
+      IrBuilder::create<Double>(0.0),
+      IrBuilder::create<Double>(1.0));
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(3.0));
+  fusion.addOutput(tv2);
+
+  //[I0, I1, I2]
+  tv2->split(-1, 4, false);
+  //[I0, I1, I2o{4}, I2i]
+  tv2->merge(0);
+  tv2->merge(0);
+  //[I0*I1*I2o{4}, I2i]
+  tv2->split(0, 2);
+  //[I0*I1*I2o{4}o, I0*I1*I2o{4}i{2}, I2i]
+  tv2->reorder({{0, 1}, {1, 0}});
+  // I0*I1*I2o{4}i{2}, [I0*I1*I2o{4}o, I2i]
+
+  tv0->computeAt(tv2, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor output = at::empty({2, 6, 32}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+  fe.runFusion({}, {output});
+
+  at::Tensor output_ref = at::zeros_like(output, options);
+  output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0;
+
+  TORCH_CHECK(output_ref.equal(output));
+}
+
+TEST_F(NVFuserTest, FusionCodeGen_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(3);
+
+  IrBuilder::create<BinaryOp>(
+      BinaryOpType::Add,
+      tv0,
+      IrBuilder::create<Double>(0.0),
+      IrBuilder::create<Double>(1.0));
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(3.0));
+  fusion.addOutput(tv2);
+
+  //[I0, I1, I2]
+  tv2 = tv2->split(0, 4);
+  //[I0o, I0i{4}, I1, I2]
+  tv2 = tv2->merge(1);
+  //[I0o, I0i{4}*I1, I2]
+  tv2 = tv2->split(-1, 2);
+  //[I0o, I0i{4}*I1, I2o, I2i{2}]
+  tv2 = tv2->reorder({{0, 1}, {1, 0}, {3, 2}});
+  //[I0i{4}*I1, I0o, I2i{2}, I2o]
+
+  tv0->computeAt(tv2, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor output = at::empty({16, 8, 8}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+  fe.runFusion({}, {output});
+
+  at::Tensor output_ref = at::zeros_like(output, options);
+  output_ref = output_ref + 0.0 + 1.0 + 2.0 + 3.0;
+
+  TORCH_CHECK(output_ref.equal(output));
+}
+
+TEST_F(NVFuserTest, FusionCodeGen2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(3);
+  TensorView* tv1 = makeSymbolicTensor(3);
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = add(tv0, tv2);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv3);
+
+  //[I0, I1, I2]
+  tv3->reorder({{0, 2}, {2, 0}});
+  //[I2, I1, I0]
+  tv3->split(-1, 4);
+  //[I2, I1, I0o, I0i{4}]
+  tv3->reorder({{2, 0}, {3, 1}, {0, 3}});
+  // I0o, I0i{4}, I1, I2]
+
+  tv0->computeAt(tv3, -1);
+  tv1->computeAt(tv3, -1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input1 = at::randn({16, 8, 8}, options);
+  at::Tensor input2 = at::randn_like(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1, input2});
+  auto outputs = fe.runFusion({input1, input2});
+
+  at::Tensor tv2_ref = input2 + 2.0;
+  at::Tensor output_ref = input1 + tv2_ref;
+
+  TORCH_CHECK(output_ref.equal(outputs[0]));
+}
+
+TEST_F(NVFuserTest, FusionSimplePWise_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+  // dimensionality of the problem
+  int nDims = 3;
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeContigTensor(nDims);
+  TensorView* tv1 = makeContigTensor(nDims);
+
+  // Register your inputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // Do math with it, it returns a `Val*` but can be static_casted back to
+  // TensorView
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = add(tv0, tv2);
+
+  // Register your outputs
+  fusion.addOutput(tv3);
+
+  // Do transformations, remember, transformations are outputs to inputs
+  // This doesn't have to be in this order
+  tv3->merge(1);
+  tv3->merge(0);
+
+  // Split by n_threads
+  tv3->split(0, 128);
+  tv3->split(0, 4);
+
+  // For all inputs, computeAt the output inline, temporaries should be squeezed
+  // between them
+  tv0->computeAt(tv3, -1);
+  tv1->computeAt(tv3, -1);
+
+  // Parallelize TV3
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(-2)->parallelize(ParallelType::Unroll);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input1 = at::randn({64, 2, 128}, options);
+  at::Tensor input2 = at::rand_like(input1);
+  at::Tensor output = at::empty_like(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1, input2});
+  fe.runFusion({input1, input2}, {output});
+
+  at::Tensor tv2_ref = input2 + 2.0;
+  at::Tensor output_ref = input1 + tv2_ref;
+
+  TORCH_CHECK(output_ref.equal(output));
+}
+
+TEST_F(NVFuserTest, FusionSimplePWiseDtypeComplex_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+  // dimensionality of the problem
+  int nDims = 3;
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeContigTensor(nDims, DataType::ComplexFloat);
+  TensorView* tv1 = makeContigTensor(nDims, DataType::ComplexFloat);
+
+  // Register your inputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // Do math with it, it returns a `Val*` but can be static_casted back to
+  // TensorView
+  c10::complex<double> scalar1(2.0, 3.0);
+  TensorView* tv2 = add(tv1, IrBuilder::create<ComplexDouble>(scalar1));
+  TensorView* tv3 = add(tv0, tv2);
+
+  // Register your outputs
+  fusion.addOutput(tv3);
+
+  // Do transformations, remember, transformations are outputs to inputs
+  // This doesn't have to be in this order
+  tv3->merge(1);
+  tv3->merge(0);
+
+  // Split by n_threads
+  tv3->split(0, 128);
+  tv3->split(0, 4);
+
+  // For all inputs, computeAt the output inline, temporaries should be squeezed
+  // between them
+  tv0->computeAt(tv3, -1);
+  tv1->computeAt(tv3, -1);
+
+  // Parallelize TV3
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(-2)->parallelize(ParallelType::Unroll);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  auto options =
+      at::TensorOptions().dtype(at::kComplexFloat).device(at::kCUDA, 0);
+
+  at::Tensor input1 = at::randn({64, 2, 128}, options);
+  at::Tensor input2 = at::rand_like(input1);
+  at::Tensor output = at::empty_like(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1, input2});
+  fe.runFusion({input1, input2}, {output});
+
+  at::Tensor tv2_ref = input2 + scalar1;
+  at::Tensor output_ref = input1 + tv2_ref;
+
+  TORCH_CHECK(output_ref.equal(output));
+}
+
+TEST_F(NVFuserTest, FusionExecKernel_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  // Register your inputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // Do math with it, it returns a `Val*` but can be static_casted back to
+  // TensorView
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = add(tv0, tv2);
+
+  // Register your outputs
+  fusion.addOutput(tv3);
+
+  tv3->merge(0);
+  tv3->split(0, 128);
+  tv3->split(0, 4);
+
+  // For all inputs, computeAt the output inline, temporaries should be squeezed
+  // between them
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  // Parallelize TV3
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::Unroll);
+  tv3->axis(1)->parallelize(ParallelType::Unroll);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input1 = at::ones({1, 128}, options);
+  at::Tensor input2 = at::ones_like(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1, input2});
+  auto outputs = fe.runFusion({input1, input2});
+
+  at::Tensor check = at::full({1, 128}, 4, options);
+  ;
+  TORCH_CHECK(outputs[0].equal(check));
+}
+
+int ceilDiv_(int a, int b) {
+  return (a + b - 1) / b;
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAt1_CUDA) {
+  // Case 1
+  // tv1 = tv0 * 0.5
+  // tv2 = tv1 * -1
+  // tv3 = tv1 + 3
+  // tv4 = tv1 * 2
+  // tv5 = tv3 + tv2
+  // tv6 = tv5 + tv4
+  // tv7 = tv1 + tv4
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
+  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0));
+  TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv5 = add(tv3, tv2);
+
+  TensorView* tv6 = add(tv5, tv4);
+  TensorView* tv7 = add(tv1, tv4);
+
+  fusion.addOutput(tv6);
+  fusion.addOutput(tv7);
+
+  // Lets setup to actually run
+  tv7->merge(0);
+  tv7->split(0, 128);
+  tv7->split(0, 4);
+
+  tv7->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv0->computeAt(tv7, 1);
+
+  ComputeAtMap ca_map(&fusion);
+
+  // The this-position of the last tensor should be zero.
+  TORCH_CHECK(
+      tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
+      tv7->getMaxProducerPosition() == 1);
+  TORCH_CHECK(
+      tv7->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
+      tv6->getMaxProducerPosition() == 1);
+  // The position of every other tensor should be 1.
+  for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
+    TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
+
+    TORCH_CHECK(
+        ca_map.areMapped(tv7->axis(0), tv->axis(0), IdMappingMode::PERMISSIVE));
+  }
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({129, 127}, options);
+
+  auto t1 = aten_input.mul({0.5});
+  auto t2 = t1.mul({-1.0});
+  auto t3 = t1.add({3.0});
+  auto t4 = t1.mul({2.0});
+  auto t5 = t3.add(t2);
+  auto t6 = t5.add(t4);
+  auto t7 = t1.add(t4);
+
+  std::vector<at::Tensor> aten_outputs = {t6, t7};
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAt2_CUDA) {
+  // Case 2
+  // tv1 = tv0 * -1
+  // tv2 = tv0 + 3
+  // tv3 = tv0 * 2
+  // tv4 = tv2 + tv1
+  // tv5 = tv4 + tv3
+  // tv6 = tv5 + tv3
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
+  TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv4 = add(tv2, tv1);
+
+  TensorView* tv5 = add(tv4, tv3);
+  TensorView* tv6 = add(tv5, tv3);
+
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv6);
+
+  // Lets setup to actually run
+  tv6->merge(0);
+  tv6->split(0, 128);
+  tv6->split(0, 4);
+
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv0->computeAt(tv6, 1);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({129, 127}, options);
+
+  auto t1 = input.mul({-1.0});
+  auto t2 = input.add({3.0});
+  auto t3 = input.mul({2.0});
+  auto t4 = t2.add(t1);
+  auto t5 = t4.add(t3);
+  auto t6 = t5.add(t3);
+
+  std::vector<at::Tensor> aten_outputs = {t5, t6};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAt3_CUDA) {
+  // Case 3
+  // T2 = T1 * 0.979361
+  // T3 = T2 * T0
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = makeSymbolicTensor(4);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361));
+  TensorView* tv3 = mul(tv2, tv0);
+
+  fusion.addOutput(tv3);
+
+  // Lets setup to actually run
+  while (tv3->nDims() > 1)
+    tv3->merge(0);
+  tv3->split(0, 128);
+  tv3->split(0, 4);
+
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  auto t2 = t1.mul({0.979361});
+  auto aten_output = t2.mul(t0);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  at::Tensor cg_output = at::empty_like(t0, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  fe.runFusion(aten_inputs, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAt4_CUDA) {
+  // Case 4
+  // T4 = T2 - T3
+  // T5 = T1 + T4
+  // T6 = T5 - T0
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = makeSymbolicTensor(4);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = makeSymbolicTensor(4);
+  fusion.addInput(tv2);
+
+  TensorView* tv3 = makeSymbolicTensor(4);
+  fusion.addInput(tv3);
+
+  TensorView* tv4 = sub(tv2, tv3);
+  TensorView* tv5 = add(tv1, tv4);
+  TensorView* tv6 = sub(tv5, tv0);
+
+  fusion.addOutput(tv6);
+
+  // Lets setup to actually run
+  while (tv6->nDims() > 1)
+    tv6->merge(0);
+  tv6->split(0, 128);
+  tv6->split(0, 4);
+
+  tv0->computeAt(tv6, 1);
+  tv1->computeAt(tv6, 1);
+  tv2->computeAt(tv6, 1);
+  tv3->computeAt(tv6, 1);
+
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+  at::Tensor t2 = at::rand_like(t0, options);
+  at::Tensor t3 = at::rand_like(t0, options);
+
+  auto t4 = t2.sub(t3);
+  auto t5 = t1.add(t4);
+  auto aten_output = t5.sub(t0);
+
+  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAt5_CUDA) {
+  // Case 5
+  // tv2 = tv0 + 2.0
+  // tv3 = tv1 * tv2
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = mul(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  tv3->merge(0);
+  tv3->split(-1, 8);
+  tv3->split(-1, 4);
+
+  tv2->computeAt(tv3, 1);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  auto t2 = t0.add(2.0);
+  auto aten_output = t1.mul(t2);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAt6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = mul(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  tv2->merge(0);
+  tv2->split(-1, 8);
+  tv2->split(-1, 4);
+  tv3->merge(0);
+  tv3->split(-1, 8);
+
+  tv2->computeAt(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  auto t2 = t0.add(2.0);
+  auto aten_output = t1.mul(t2);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAt7_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+
+  auto tv2 = makeSymbolicTensor(1);
+  fusion.addInput(tv2);
+
+  auto tv3 = add(tv2, IrBuilder::create<Double>(3.0));
+
+  auto tv4 = add(tv1, tv3);
+  fusion.addOutput(tv4);
+
+  auto tv5 = broadcast(tv1, {false, true});
+
+  auto tv6 = makeSymbolicTensor(2);
+  fusion.addInput(tv6);
+
+  auto tv7 = mul(tv5, tv6);
+
+  fusion.addOutput(tv7);
+
+  tv7->split(1, 2);
+  tv7->merge(0);
+  tv7->split(0, 4);
+  tv7->split(0, 128);
+
+  tv7->axis(0)->parallelize(ParallelType::BIDx);
+  tv7->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv0->computeAt(tv7, 1);
+  auto tv5_domain = tv5->domain()->domain();
+
+  // These computeAt transformations should not affect the TV5 domain
+  tv0->computeAt(tv4, -1);
+  tv2->computeAt(tv4, -1);
+
+  auto tv5_domain_current = tv5->domain()->domain();
+  TORCH_CHECK(tv5_domain == tv5_domain_current, "Invalid TV5 domain");
+
+  const int numel_x = 100;
+  const int numel_y = 200;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({numel_x}, options);
+  auto t2 = at::randn({numel_x}, options);
+  auto t6 = at::randn({numel_x, numel_y}, options);
+
+  auto t1 = t0.add(1.0);
+  auto t3 = t2.add(3.0);
+  auto t4 = t1.add(t3);
+  auto t5 = t1.unsqueeze(1);
+  auto t7 = t5.mul(t6);
+
+  std::vector<IValue> aten_inputs = {t0, t2, t6};
+  std::vector<at::Tensor> aten_outputs = {t4, t7};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAt8_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+
+  auto tv2 = makeSymbolicTensor(1);
+  fusion.addInput(tv2);
+
+  auto tv3 = add(tv2, IrBuilder::create<Double>(3.0));
+
+  auto tv4 = add(tv1, tv3);
+  fusion.addOutput(tv4);
+
+  auto tv5 = broadcast(tv1, {false, true});
+
+  auto tv6 = makeSymbolicTensor(2);
+  fusion.addInput(tv6);
+
+  auto tv7 = mul(tv5, tv6);
+
+  fusion.addOutput(tv7);
+
+  tv7->split(1, 2);
+  tv7->merge(0);
+  tv7->split(0, 128, false);
+  tv7->split(0, 4, false);
+
+  tv7->axis(0)->parallelize(ParallelType::BIDx);
+  tv7->axis(1)->parallelize(ParallelType::TIDx);
+
+  // Reverse computeAt structure from previous test
+  tv0->computeAt(tv4, -1);
+  tv2->computeAt(tv4, -1);
+  tv0->computeAt(tv7, -1);
+
+  const int numel_x = 100;
+  const int numel_y = 200;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({numel_x}, options);
+  auto t2 = at::randn({numel_x}, options);
+  auto t6 = at::randn({numel_x, numel_y}, options);
+
+  auto t1 = t0.add(1.0);
+  auto t3 = t2.add(3.0);
+  auto t4 = t1.add(t3);
+  auto t5 = t1.unsqueeze(1);
+  auto t7 = t5.mul(t6);
+
+  std::vector<IValue> aten_inputs = {t0, t2, t6};
+  std::vector<at::Tensor> aten_outputs = {t4, t7};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeWith1_CUDA) {
+  // Case 1
+  // tv1 = tv0 * 0.5
+  // tv2 = tv1 * -1
+  // tv3 = tv1 + 3
+  // tv4 = tv1 * 2
+  // tv5 = tv3 + tv2
+  // tv6 = tv5 + tv4
+  // tv7 = tv1 + tv4
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
+  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0));
+  TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv5 = add(tv3, tv2);
+
+  TensorView* tv6 = add(tv5, tv4);
+  TensorView* tv7 = add(tv1, tv4);
+
+  fusion.addOutput(tv6);
+  fusion.addOutput(tv7);
+
+  // Lets setup to actually run
+  tv0->merge(0);
+  tv0->split(0, 128);
+  tv0->split(0, 4);
+
+  tv0->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv0->computeWith(tv7, 1);
+
+  GpuLower gpulw(&fusion);
+
+  // The this-position of the last tensor should be zero.
+  TORCH_CHECK(
+      tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
+      tv7->getMaxProducerPosition() == 1);
+  TORCH_CHECK(
+      tv7->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
+      tv6->getMaxProducerPosition() == 1);
+
+  ComputeAtMap ca_map(&fusion);
+
+  // The position of every other tensor should be 1.
+  for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
+    TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
+    TORCH_CHECK(
+        ca_map.areMapped(tv7->axis(0), tv->axis(0), IdMappingMode::PERMISSIVE));
+  }
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({129, 127}, options);
+
+  auto t1 = aten_input.mul({0.5});
+  auto t2 = t1.mul({-1.0});
+  auto t3 = t1.add({3.0});
+  auto t4 = t1.mul({2.0});
+  auto t5 = t3.add(t2);
+  auto t6 = t5.add(t4);
+  auto t7 = t1.add(t4);
+
+  std::vector<at::Tensor> aten_outputs = {t6, t7};
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeWith2_CUDA) {
+  // Case 2
+  // tv1 = tv0 * -1
+  // tv2 = tv0 + 3
+  // tv3 = tv0 * 2
+  // tv4 = tv2 + tv1
+  // tv5 = tv4 + tv3
+  // tv6 = tv5 + tv3
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
+  TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv4 = add(tv2, tv1);
+
+  TensorView* tv5 = add(tv4, tv3);
+  TensorView* tv6 = add(tv5, tv3);
+
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv6);
+
+  // Lets setup to actually run
+  tv0->merge(0);
+  tv0->split(0, 128);
+  tv0->split(0, 4);
+
+  tv0->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv0->computeWith(tv6, 1);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({129, 127}, options);
+
+  auto t1 = input.mul({-1.0});
+  auto t2 = input.add({3.0});
+  auto t3 = input.mul({2.0});
+  auto t4 = t2.add(t1);
+  auto t5 = t4.add(t3);
+  auto t6 = t5.add(t3);
+
+  std::vector<at::Tensor> aten_outputs = {t5, t6};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeWith3_CUDA) {
+  // Case 3
+  // T2 = T1 * 0.979361
+  // T3 = T2 * T0
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = makeSymbolicTensor(4);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361));
+  TensorView* tv3 = mul(tv2, tv0);
+
+  fusion.addOutput(tv3);
+
+  // Lets setup to actually run
+  while (tv0->nDims() > 1)
+    tv0->merge(0);
+  tv0->split(0, 128);
+  tv0->split(0, 4);
+
+  while (tv1->nDims() > 1)
+    tv1->merge(0);
+  tv1->split(0, 128);
+  tv1->split(0, 4);
+
+  tv0->computeWith(tv3, 1);
+  tv1->computeWith(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  auto t2 = t1.mul({0.979361});
+  auto aten_output = t2.mul(t0);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  at::Tensor cg_output = at::empty_like(t0, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  fe.runFusion(aten_inputs, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeWith4_CUDA) {
+  // Case 4
+  // T4 = T2 - T3
+  // T5 = T1 + T4
+  // T6 = T5 - T0
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = makeSymbolicTensor(4);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = makeSymbolicTensor(4);
+  fusion.addInput(tv2);
+
+  TensorView* tv3 = makeSymbolicTensor(4);
+  fusion.addInput(tv3);
+
+  TensorView* tv4 = sub(tv2, tv3);
+  TensorView* tv5 = add(tv1, tv4);
+  TensorView* tv6 = sub(tv5, tv0);
+
+  fusion.addOutput(tv6);
+  std::vector<TensorView*> tvs = {tv0, tv1, tv2};
+  for (auto tv : tvs) {
+    // Lets setup to actually run
+    while (tv->nDims() > 1) {
+      tv->merge(0);
+    }
+    tv->split(0, 128);
+    tv->split(0, 4);
+    tv->computeWith(tv6, 1);
+  }
+
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+  at::Tensor t2 = at::rand_like(t0, options);
+  at::Tensor t3 = at::rand_like(t0, options);
+
+  auto t4 = t2.sub(t3);
+  auto t5 = t1.add(t4);
+  auto aten_output = t5.sub(t0);
+
+  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeWith5_CUDA) {
+  // Case 5
+  // tv2 = tv0 + 2.0
+  // tv3 = tv1 * tv2
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = mul(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  tv2->merge(0);
+  tv2->split(-1, 8);
+  tv2->split(-1, 4);
+
+  tv2->computeWith(tv3, 1);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  auto t2 = t0.add(2.0);
+  auto aten_output = t1.mul(t2);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeWith6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = mul(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  tv2->merge(0);
+  tv2->split(-1, 8);
+  tv2->split(-1, 4);
+  tv3->merge(0);
+  tv3->split(-1, 8);
+
+  tv2->computeWith(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  auto t2 = t0.add(2.0);
+  auto aten_output = t1.mul(t2);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) {
+  // tv1 = tv0 * 0.5
+  // tv2 = tv1 * -1
+  // tv3 = tv2 * -2
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
+  TensorView* tv3 = mul(tv1, IrBuilder::create<Double>(-2.0));
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv3);
+
+  // This computeAt will affect tv2 as well, even though tv2 is not in
+  // the data-flow path between tv1 and tv3. The reason is that tv1 is
+  // now computed at tv3, so tv2 must also be computed at the same
+  // location. Overall, what will happen is basically we merge
+  // expressions of all tensors and compute them in a single loop
+  // nest.
+  TensorView* computeAtTarget = tv3;
+  computeAtTarget->split(0, 128);
+  tv1->computeAt(computeAtTarget, 1);
+
+  TensorView* affected_tensors[] = {tv1, tv2, tv3};
+  for (auto tv : affected_tensors) {
+    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
+  }
+
+  GpuLower gpulw(&fusion);
+
+  TORCH_CHECK(tv1->getComputeAtPosition() == 1);
+  TORCH_CHECK(
+      tv2->getComputeAtPosition() == 0 && tv2->getMaxProducerPosition() == 1);
+  TORCH_CHECK(
+      tv3->getComputeAtPosition() == 0 && tv3->getMaxProducerPosition() == 1);
+
+  ComputeAtMap ca_map(&fusion);
+
+  // Note that tv2 is also computed at tv3.
+  for (auto tv : {tv1, tv2}) {
+    TORCH_CHECK(ca_map.areMapped(
+        tv->axis(0), computeAtTarget->axis(0), IdMappingMode::PERMISSIVE));
+  }
+
+  TORCH_CHECK(tv3->getComputeAtPosition() == 0);
+
+  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
+  for (auto tv : affected_tensors) {
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({1000}, options);
+
+  auto t1 = aten_input * 0.5;
+  auto t2 = t1 * -1.0;
+  auto t3 = t1 * -2.0;
+
+  std::vector<at::Tensor> aten_outputs = {t2, t3};
+
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+// Similar to ComputeAtMultiConsumers, but with a common consumer.
+TEST_F(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) {
+  // tv1 = tv0 * 0.5
+  // tv2 = tv1 * -1
+  // tv3 = tv2 * -2
+  // tv4 = tv2 + tv3
+  // tv5 = tv4 * 5
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
+  TensorView* tv3 = mul(tv1, IrBuilder::create<Double>(-2.0));
+  TensorView* tv4 = add(tv2, tv3);
+  TensorView* tv5 = mul(tv4, IrBuilder::create<Double>(5.0));
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv4);
+  fusion.addOutput(tv5);
+
+  // Computing tv1 at tv3. This will affect tv2 as discussed in
+  // ComplexComputeAt1. Additionally, in this case, notice that tv4 is
+  // the common consumer of tv2 and tv3, so they are computed at
+  // tv4. The indirect propagation of the computeAt should stop at the
+  // common consumer, and no further change should occur. More
+  // specifically, the computeAT position of tv4 and tv5 should be zero.
+  TensorView* computeAtTarget = tv3;
+  computeAtTarget->split(0, 128);
+  tv1->computeAt(computeAtTarget, 1);
+
+  TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4};
+  for (auto tv : affected_tensors) {
+    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
+  }
+
+  TORCH_CHECK(tv1->getComputeAtPosition() == 1);
+  TORCH_CHECK(tv2->getComputeAtPosition() == 1);
+  TORCH_CHECK(tv3->getComputeAtPosition() == 1);
+  TORCH_CHECK(tv4->getComputeAtPosition() == 0);
+  TORCH_CHECK(tv5->getComputeAtPosition() == 0);
+
+  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
+
+  for (auto tv : affected_tensors) {
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  // Transform tv5 to make it look like the rest
+  tv5->split(0, 128);
+  tv5->axis(1)->parallelize(ParallelType::TIDx);
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({1000}, options);
+
+  auto t1 = aten_input * 0.5;
+  auto t2 = t1 * -1.0;
+  auto t3 = t1 * -2.0;
+  auto t4 = t2 + t3;
+  auto t5 = t4 * 5.0;
+
+  std::vector<at::Tensor> aten_outputs = {t3, t4, t5};
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) {
+  // tv1 = tv0 * 0.5
+  // tv2 = tv1 * -1
+  // tv3 = tv2 * -1
+  // tv4 = tv1 + 4
+  // tv5 = tv3 + tv4
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
+  TensorView* tv3 = mul(tv2, IrBuilder::create<Double>(-1.0));
+  TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4.0));
+  TensorView* tv5 = add(tv3, tv4);
+
+  fusion.addOutput(tv5);
+
+  TensorView* computeAtTarget = tv3;
+
+  computeAtTarget->merge(0);
+  computeAtTarget->split(0, 128);
+  computeAtTarget->split(0, 4);
+
+  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
+
+  // This computeAt will affect all tensors including tv3, tv4 and
+  // tv5, even though it appears to impact only tv1 and tv2. The
+  // reason is that tv1 is now computed at tv3, so tv4 must also be
+  // computed at the same location. Similarly, the consumer of tv4,
+  // tv5, must also be computed at the same location. Overall, what
+  // will happen is basically we merge expressions of all tensors and
+  // compute them in a single loop nest. Internally, this will be
+  // realized by making all tensors, except for those in the path
+  // between tv1 and tv3, computed at tv5, which we call the common
+  // consumer.
+  tv1->computeAt(computeAtTarget, 1);
+
+  // All tensors should have the same dimenionality as the target
+  for (Val* val : fusion.vals()) {
+    if (val->isFusionInput() ||
+        val->getValType().value() != ValType::TensorView) {
+      continue;
+    }
+    TensorView* tv = val->as<TensorView>();
+    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
+    if (tv == tv5) {
+      TORCH_CHECK(tv->getComputeAtPosition() == 0);
+    } else {
+      TORCH_CHECK(tv->getComputeAtPosition() == 1);
+    }
+  }
+
+  for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
+    if (!tv->isFusionInput()) {
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({129, 127}, options);
+
+  auto t1 = aten_input.mul({0.5});
+  auto t2 = t1.mul({-1.0});
+  auto t3 = t2.mul({-1.0});
+  auto t4 = t1.add({4.0});
+  auto aten_output = t3 + t4;
+
+  at::Tensor cg_output = at::empty_like(aten_input, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Similar to the above common consumer test but adds an additional
+// tensor that has no common consumer with the other tensors.
+TEST_F(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) {
+  // tv1 = tv0 * 0.5
+  // tv2 = tv1 * -1
+  // tv3 = tv2 * -1
+  // tv4 = tv1 + 4
+  // tv5 = tv2 + tv3
+  // tv6 = tv1 + 6
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
+  TensorView* tv3 = mul(tv2, IrBuilder::create<Double>(-1.0));
+  TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4.0));
+  TensorView* tv5 = add(tv3, tv4);
+  TensorView* tv6 = add(tv1, IrBuilder::create<Double>(6.0));
+
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv6);
+
+  TensorView* computeAtTarget = tv3;
+
+  computeAtTarget->merge(0);
+  computeAtTarget->split(0, 128);
+  computeAtTarget->split(0, 4);
+
+  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
+
+  // This will have the same impact on the tensors except for tv5 and
+  // tv6. tv6 does not have any common consumer with the computeAt
+  // target, but since it uses tv1, it must be also computed at the
+  // same location as the other impacted tensors. We can either make
+  // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5
+  // should be computed at tv6 just because the current implementation
+  // orders the computeAt relationship based on the order in which
+  // tensors are specified as outputs.
+
+  tv1->computeAt(computeAtTarget, 1);
+
+  // All tensors should have the same dimenionality as the target
+  for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
+    if (tv->isFusionInput()) {
+      continue;
+    }
+    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
+    if (tv == tv5 || tv == tv6) {
+      TORCH_CHECK(tv->getComputeAtPosition() == 0);
+      TORCH_CHECK(tv->getMaxProducerPosition() == 1);
+    } else {
+      TORCH_CHECK(tv->getComputeAtPosition() == 1);
+    }
+  }
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = val->as<TensorView>();
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({129, 127}, options);
+
+  auto t1 = aten_input.mul({0.5});
+  auto t2 = t1.mul({-1.0});
+  auto t3 = t2.mul({-1.0});
+  auto t4 = t1.add({4.0});
+  auto t5 = t3 + t4;
+  auto t6 = t1.add({6.0});
+
+  std::vector<at::Tensor> aten_outputs = {t5, t6};
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options), at::empty_like(aten_input, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+// Similar to ComputeAtCommonConsumer1 but with an addtiona ltensor
+// that does not have data dependency with the consumer.
+TEST_F(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) {
+  // tv1 = tv0 * 0.5
+  // tv2 = tv1 * -1
+  // tv3 = tv1 * -2
+  // tv4 = tv2 + tv3
+  // tv5 = tv4 * 5
+  // tv6 = tv1 * 6
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
+  TensorView* tv3 = mul(tv1, IrBuilder::create<Double>(-2.0));
+  TensorView* tv4 = add(tv2, tv3);
+  TensorView* tv5 = mul(tv4, IrBuilder::create<Double>(5.0));
+  // Notice that tv6 is not a consumer of tv4.
+  TensorView* tv6 = mul(tv1, IrBuilder::create<Double>(6.0));
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv4);
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv6);
+
+  TensorView* computeAtTarget = tv3;
+  computeAtTarget->split(0, 128);
+  tv1->computeAt(computeAtTarget, 1);
+
+  TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv5, tv6};
+  for (auto tv : affected_tensors) {
+    TORCH_CHECK(tv->nDims() == computeAtTarget->nDims());
+    if (tv == tv6 || tv == tv5) {
+      TORCH_CHECK(tv->getComputeAtPosition() == 0);
+    } else {
+      TORCH_CHECK(tv->getComputeAtPosition() == 1);
+    }
+  }
+
+  computeAtTarget->axis(0)->parallelize(ParallelType::BIDx);
+
+  for (auto tv : affected_tensors) {
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({1000}, options);
+
+  auto t1 = aten_input * 0.5;
+  auto t2 = t1 * -1.0;
+  auto t3 = t1 * -2.0;
+  auto t4 = t2 + t3;
+  auto t5 = t4 * 5.0;
+  auto t6 = t1 * 6.0;
+
+  std::vector<at::Tensor> aten_outputs = {t3, t4, t5, t6};
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+namespace {
+
+void checkIdMapped(
+    ComputeAtRootDomainMap& root_map,
+    TensorView* v0,
+    IterDomain* id0,
+    TensorView* v1,
+    IterDomain* id1,
+    bool should_map) {
+  if (should_map) {
+    TORCH_CHECK(
+        root_map.canMap(v0->domain(), id0, v1->domain(), id1),
+        "Should be mappable: ",
+        id0,
+        " of ",
+        v0,
+        " and ",
+        id1,
+        " of ",
+        v1);
+  } else {
+    TORCH_CHECK(
+        !root_map.canMap(v0->domain(), id0, v1->domain(), id1),
+        "Should not be mappable: ",
+        id0,
+        " of ",
+        v0,
+        " and ",
+        id1,
+        " of ",
+        v1);
+  }
+}
+
+void checkIdMapped(
+    TensorView* v0,
+    const std::vector<IterDomain*>& root0,
+    const std::vector<bool> should_map0,
+    TensorView* v1,
+    const std::vector<IterDomain*>& root1,
+    const std::vector<bool> should_map1) {
+  ComputeAtRootDomainMap map;
+  map.build();
+  TORCH_INTERNAL_ASSERT(root0.size() == should_map0.size());
+  TORCH_INTERNAL_ASSERT(root1.size() == should_map1.size());
+  size_t idx0 = 0;
+  for (const auto i : c10::irange(root0.size())) {
+    size_t idx1 = 0;
+    for (const auto j : c10::irange(root1.size())) {
+      if (should_map0[i] && should_map1[j] && idx0 == idx1) {
+        checkIdMapped(map, v0, root0[i], v1, root1[j], true);
+      } else {
+        checkIdMapped(map, v0, root0[i], v1, root1[j], false);
+      }
+      if (should_map1[j])
+        ++idx1;
+    }
+    if (should_map0[i])
+      ++idx0;
+  }
+}
+
+void checkIdMapped(
+    TensorView* v0,
+    const std::vector<IterDomain*>& root0,
+    TensorView* v1,
+    const std::vector<IterDomain*>& root1) {
+  checkIdMapped(
+      v0,
+      root0,
+      std::vector<bool>(root0.size(), true),
+      v1,
+      root1,
+      std::vector<bool>(root1.size(), true));
+}
+
+} // namespace
+
+TEST_F(NVFuserTest, FusionRootMappingBasic_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  auto tv3 = broadcast(tv0, {true, false, false});
+  auto tv4 = broadcast(tv1, {false, true, false});
+  auto tv5 = add(tv3, tv4);
+  fusion.addOutput(tv5);
+
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, true},
+      tv4,
+      tv4->getRootDomain(),
+      {false, true, true});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, true},
+      tv4,
+      tv4->getRootDomain(),
+      {true, false, true});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {false, true},
+      tv1,
+      tv1->getRootDomain(),
+      {false, true});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, true},
+      tv5,
+      tv5->getRootDomain(),
+      {false, true, true});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, true},
+      tv5,
+      tv5->getRootDomain(),
+      {true, false, true});
+  checkIdMapped(tv3, tv3->getRootDomain(), tv4, tv4->getRootDomain());
+  checkIdMapped(tv3, tv3->getRootDomain(), tv5, tv5->getRootDomain());
+  checkIdMapped(tv4, tv4->getRootDomain(), tv5, tv5->getRootDomain());
+}
+
+TEST_F(NVFuserTest, FusionRootMappingRfactor_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // [I,I]
+  TensorView* tv0 = makeSymbolicTensor(2);
+  // [I,I,I]
+  TensorView* tv1 = makeSymbolicTensor(3);
+
+  //[I,I,R]
+  auto tv2 = sum(tv1, {2});
+  auto tv3 = add(tv2, tv0);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv3);
+
+  // scheduling:
+  //[B,I,R0,R1=128], root = [B,I,R]
+  tv2->split(2, 128);
+
+  // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf]
+  auto tv4 = tv2->rFactor({3});
+
+  checkIdMapped(tv1, tv1->getRootDomain(), tv4, tv4->getRootDomain());
+  checkIdMapped(
+      tv4,
+      tv4->getRFactorDomain(),
+      {true, true, true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, true, true});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, true, false});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, true, false},
+      tv3,
+      tv3->getRootDomain(),
+      {true, true});
+  checkIdMapped(
+      tv2,
+      tv2->getRootDomain(),
+      {true, true, false},
+      tv3,
+      tv3->getRootDomain(),
+      {true, true});
+  checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain());
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, true},
+      tv1,
+      tv1->getRootDomain(),
+      {true, true, false});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, true},
+      tv2,
+      tv2->getRootDomain(),
+      {true, true, false});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, true},
+      tv4,
+      tv4->getRFactorDomain(),
+      {true, true, false, false});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, true},
+      tv4,
+      tv4->getRootDomain(),
+      {true, true, false});
+}
+
+TEST_F(NVFuserTest, FusionRootMappingReductionDependency1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  fusion.addOutput(tv2);
+
+  // The second dimension cannot be mapped as it would require recomputation.
+  checkIdMapped(tv0, tv0->getRootDomain(), tv1, tv1->getRootDomain());
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+}
+
+TEST_F(NVFuserTest, FusionRootMappingReductionDependency2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = add(tv0, tv2);
+  fusion.addOutput(tv3);
+
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, false},
+      tv1,
+      tv1->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, false},
+      tv3,
+      tv3->getRootDomain(),
+      {true, false});
+  checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain());
+}
+
+TEST_F(NVFuserTest, FusionRootMappingReductionDependency3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  fusion.addOutput(tv2);
+
+  tv1->split(-1, 4);
+  auto tv3 = tv1->rFactor({-2});
+
+  checkIdMapped(tv0, tv0->getRootDomain(), tv3, tv3->getRootDomain());
+  checkIdMapped(
+      tv3,
+      tv3->getMaybeRFactorDomain(),
+      {true, false, true},
+      tv1,
+      tv1->getRootDomain(),
+      {true, true});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+}
+
+TEST_F(NVFuserTest, FusionRootMappingReductionDependency4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = add(tv0, tv2);
+  fusion.addOutput(tv3);
+
+  tv1->split(-1, 4);
+  auto tv4 = tv1->rFactor({-2});
+
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, false},
+      tv4,
+      tv4->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv4,
+      tv4->getMaybeRFactorDomain(),
+      {true, false, true},
+      tv1,
+      tv1->getRootDomain(),
+      {true, true});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+  checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain());
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+}
+
+// Reproducer of issue #749
+TEST_F(NVFuserTest, FusionRootMappingReductionDependency5_CUDA_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {1});
+  auto tv3 = broadcast(tv2, {false, true});
+  auto tv4 = add(tv0, tv3);
+  auto tv5 = add(tv4, tv1);
+  fusion.addOutput(tv5);
+
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, false},
+      tv1,
+      tv1->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv2,
+      tv2->getRootDomain(),
+      {true, false},
+      tv3,
+      tv3->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv3,
+      tv3->getRootDomain(),
+      {true, true},
+      tv4,
+      tv4->getRootDomain(),
+      {true, true});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, false},
+      tv4,
+      tv4->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv4,
+      tv4->getRootDomain(),
+      {true, true},
+      tv5,
+      tv5->getRootDomain(),
+      {true, true});
+}
+
+// Similar to RootMappingReductionDependency5 but with rFactor
+TEST_F(NVFuserTest, FusionRootMappingReductionDependency6_CUDA_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {1});
+  auto tv3 = broadcast(tv2, {false, true});
+  auto tv4 = add(tv0, tv3);
+  auto tv5 = add(tv4, tv1);
+  fusion.addOutput(tv5);
+
+  tv2->split(1, 4);
+  auto tv6 = tv2->rFactor({-1});
+
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, false},
+      tv1,
+      tv1->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv6,
+      tv6->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv6,
+      tv6->getMaybeRFactorDomain(),
+      {true, true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, true});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv2,
+      tv2->getRootDomain(),
+      {true, false},
+      tv3,
+      tv3->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv3,
+      tv3->getRootDomain(),
+      {true, true},
+      tv4,
+      tv4->getRootDomain(),
+      {true, true});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true, false},
+      tv4,
+      tv4->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv4,
+      tv4->getRootDomain(),
+      {true, true},
+      tv5,
+      tv5->getRootDomain(),
+      {true, true});
+}
+
+TEST_F(
+    NVFuserTest,
+    FusionRootMappingMultipleBroadcastWithNoCommonConsumer_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  auto tv1 = broadcast(tv0, {false, true});
+  auto tv2 = broadcast(tv0, {true, false});
+  fusion.addOutput(tv1);
+  fusion.addOutput(tv2);
+
+  // If there is no common consumer, there is no recomputation constraint.
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true},
+      tv1,
+      tv1->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true},
+      tv2,
+      tv2->getRootDomain(),
+      {false, true});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {false, true});
+}
+
+TEST_F(NVFuserTest, FusionRootMappingBroadcastNonUniqueSize_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  auto tv2 = makeSymbolicTensor(2);
+  fusion.addInput(tv2);
+  auto tv3 = broadcast(tv0, {false, true});
+  auto tv4 = add(tv1, tv3);
+  fusion.addOutput(tv4);
+  auto tv5 = add(tv2, tv3);
+  fusion.addOutput(tv5);
+
+  // Broadcast domains can be used with multiple domains with
+  // different sizes. In this test, the broadcast domain of tv3 has
+  // two consumers, tv4 and tv5, which may have different sizes. Each
+  // of the consumers is used with the broadcast domain of tv3, but
+  // the two consumers may not have the same size, it is not possible
+  // to map those domains.
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true},
+      tv3,
+      tv3->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true},
+      tv1,
+      tv1->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv2,
+      tv2->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, false},
+      tv3,
+      tv3->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv2,
+      tv2->getRootDomain(),
+      {true, false},
+      tv3,
+      tv3->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv3,
+      tv3->getRootDomain(),
+      {true, false},
+      tv4,
+      tv4->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv3,
+      tv3->getRootDomain(),
+      {true, false},
+      tv5,
+      tv5->getRootDomain(),
+      {true, false});
+  checkIdMapped(
+      tv4,
+      tv4->getRootDomain(),
+      {true, false},
+      tv5,
+      tv5->getRootDomain(),
+      {true, false});
+}
+
+TEST_F(NVFuserTest, FusionRootMappingBroadcast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  // tv0[I0]
+  fusion.addInput(tv0);
+  auto tv1 = broadcast(tv0, {true, false});
+  // tv1[B1, I0]
+  auto tv2 = broadcast(tv1, {true, false, false});
+  // tv2[B2, B1, I0]
+  fusion.addOutput(tv2);
+
+  // In this case, tv1 and tv2 has one and two broadcast domains,
+  // respectively. It is the second broadcast domain that is mapped to
+  // the broadcast of tv1.
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true},
+      tv1,
+      tv1->getRootDomain(),
+      {false, true});
+  checkIdMapped(
+      tv1,
+      tv1->getRootDomain(),
+      {true, true},
+      tv2,
+      tv2->getRootDomain(),
+      {false, true, true}); // Not {true, false, true}
+  checkIdMapped(
+      tv0,
+      tv0->getRootDomain(),
+      {true},
+      tv2,
+      tv2->getRootDomain(),
+      {false, false, true});
+}
+
+// Reproducer of issue #723
+TEST_F(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  auto tv1 = makeSymbolicTensor(2);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = broadcast(tv0, {true, false});
+  auto tv3 = sum(tv2, {0});
+  auto tv4 = add(tv2, tv1);
+
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv4);
+
+  ComputeAtRootDomainMap map;
+  map.build();
+
+  checkIdMapped(
+      map, tv2, tv2->getRootDomain()[0], tv4, tv4->getRootDomain()[0], true);
+  checkIdMapped(
+      map, tv2, tv2->getRootDomain()[0], tv3, tv3->getRootDomain()[0], true);
+
+  tv2->computeAt(tv4, -1);
+
+  const int x = 11;
+  const int y = 12;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({x}, options);
+  at::Tensor t1 = at::randn({y, x}, options);
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto t3 = t0;
+  auto t4 = t0.unsqueeze(0).expand({y, x}) + t1;
+
+  testValidate(&fusion, outputs, aten_inputs, {t3, t4}, __LINE__, __FILE__);
+}
+
+// Repro of issue #1950
+TEST_F(NVFuserTest, FusionRootMappingRepro1950_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+  auto tv0 = makeSymbolicTensor(3);
+  auto tv1 = makeSymbolicTensor(3);
+  auto tv2 = makeSymbolicTensor(3);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addInput(tv2);
+
+  auto tv3 = set(tv0);
+  auto tv4 = mul(tv1, tv3);
+  auto tv5 = mul(tv1, tv2);
+  auto tv6 = mul(tv5, tv3);
+  auto tv7 = sum(tv6, {2});
+  auto tv8 = broadcast(tv7, {false, false, true});
+  auto tv9 = mul(tv3, tv8);
+
+  // Issue #1950 was caused by a particular traversal ordering based
+  // on the output tensor ordering as below
+  fusion.addOutput(tv9);
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv4);
+
+  ComputeAtRootDomainMap root_map;
+  root_map.build();
+
+  checkIdMapped(root_map, tv4, tv4->axis(-1), tv9, tv9->axis(-1), false);
+}
+
+TEST_F(NVFuserTest, FusionDetectSelfMappedDomains_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  // [I1]
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  // [B2, I2]
+  auto tv2 = broadcast(tv1, {true, false});
+  // [I3, B3]
+  auto tv3 = broadcast(tv1, {false, true});
+  // [I4, I5]
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  // IterDomainGraph maps B2, I3 and I4 together, and similarly I2,
+  // B3 and I5. The problem is I1 is mapped with both of the ID
+  // groups, so eventually all of the IDs are mapped
+  // together. IterDomainGraph should throw an exception as this
+  // pattern of domain mappings is not supported.
+
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW({ IterDomainGraph id_graph(&fusion); });
+}
+
+TEST_F(NVFuserTest, FusionScalarInputs_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+
+  Double* d0 = IrBuilder::create<Double>();
+  fusion.addInput(d0);
+  Double* d1 = IrBuilder::create<Double>();
+  fusion.addInput(d1);
+  Double* d2 = IrBuilder::create<Double>();
+  fusion.addInput(d2);
+  Double* d3 = IrBuilder::create<Double>();
+  fusion.addInput(d3);
+  Val* d4 = mul(d0, d1);
+  Val* d5 = sub(d2, d3);
+
+  TensorView* tv2 = sub(tv1, d4);
+  TensorView* tv3 = add(tv0, d5);
+  TensorView* tv4 = mul(tv3, tv2);
+
+  fusion.addOutput(tv4);
+
+  // Lets setup to actually run
+  while (tv4->nDims() > 1)
+    tv4->merge(0);
+  tv4->split(0, 128);
+  tv4->split(0, 4);
+
+  tv0->computeAt(tv4, 1);
+  tv1->computeAt(tv4, 1);
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  // d4 = d0 * d1
+  // d5 = d2 - d3
+  // t2 = t1 - d4
+  // t3 = t0 + d5
+  // t4 = t3 * t2
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  float fl0 = 0.1;
+  float fl1 = -0.2;
+  float fl2 = 0.3;
+  float fl3 = -0.4;
+  float fl4 = fl0 * fl1;
+  float fl5 = fl2 - fl3;
+
+  at::Tensor t0 = at::randn({129, 127}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  auto t2 = t1.sub(fl4);
+  auto t3 = t0.add(fl5);
+  auto aten_output = t3.mul(t2);
+
+  at::Tensor cg_output = at::empty_like(t0, options);
+
+  at::Scalar test(fl0);
+
+  std::vector<IValue> aten_inputs = {
+      t0,
+      t1,
+      at::Scalar(fl0),
+      at::Scalar(fl1),
+      at::Scalar(fl2),
+      at::Scalar(fl3)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  fe.runFusion(aten_inputs, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionLoopUnroll_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(3);
+  TensorView* tv1 = makeSymbolicTensor(3);
+
+  // Register your inputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // Do math with it, it returns a `Val*` but can be static_casted back to
+  // TensorView
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = add(tv0, tv2);
+
+  // Register your outputs
+  fusion.addOutput(tv3);
+
+  int block_size = 16;
+
+  tv3->merge(0, 1);
+  tv3->merge(0, 1);
+
+  tv3->split(0, block_size);
+  tv3->split(0, 4);
+
+  // For all inputs, computeAt the output inline, temporaries should be squeezed
+  // between them
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  // Parallelize
+  tv2->axis(1)->parallelize(ParallelType::Unroll);
+  tv3->axis(1)->parallelize(ParallelType::Unroll);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input0 = at::randn({129, 13, 3}, options);
+  at::Tensor input1 = at::randn({129, 13, 3}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input0, input1});
+  auto outputs = fe.runFusion({input0, input1});
+
+  TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0))));
+}
+
+/*
+ * Helper function for single op testing that generates a codegen operand
+ */
+
+Val* gen_jit_operand(std::pair<ValType, DataType> desc) {
+  if (desc.first == ValType::TensorView) {
+    return makeSymbolicTensor(2, desc.second);
+  } else if (desc.first == ValType::Scalar) {
+    if (desc.second == DataType::Float) {
+      return IrBuilder::create<Double>();
+    } else if (desc.second == DataType::Double) {
+      return IrBuilder::create<Double>();
+    } else if (desc.second == DataType::ComplexFloat) {
+      return IrBuilder::create<ComplexDouble>();
+    } else if (desc.second == DataType::ComplexDouble) {
+      return IrBuilder::create<ComplexDouble>();
+    } else if (desc.second == DataType::Int) {
+      return IrBuilder::create<Int>();
+    } else {
+      TORCH_CHECK(false, "Not currently supported type: ", desc.first);
+    }
+  } else {
+    TORCH_CHECK(false, "Not currently supported type: ", desc.first);
+  }
+  return nullptr;
+}
+
+/*
+ * Helper function for single op testing that generates an ATen operand
+ */
+
+IValue gen_aten_operand(
+    std::pair<ValType, DataType> desc,
+    int blocks,
+    int threads,
+    bool rand) {
+  if (desc.first == ValType::TensorView) {
+    if (desc.second == DataType::Double || desc.second == DataType::Float ||
+        desc.second == DataType::ComplexDouble ||
+        desc.second == DataType::ComplexFloat ||
+        desc.second == DataType::Half || desc.second == DataType::BFloat16) {
+      auto options = at::TensorOptions()
+                         .dtype(data_type_to_aten(desc.second))
+                         .device(at::kCUDA, 0);
+      if (rand) {
+        return IValue(at::rand({blocks, threads}, options));
+      } else {
+        return IValue(at::empty({blocks, threads}, options));
+      }
+    } else if (desc.second == DataType::Int || desc.second == DataType::Int32) {
+      auto dtype = desc.second == DataType::Int32 ? at::kInt : at::kLong;
+      if (rand) {
+        auto options =
+            at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+        return IValue(at::randn({blocks, threads}, options).mul(5).to(dtype));
+      } else {
+        auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+        return IValue(at::empty({blocks, threads}, options));
+      }
+    } else if (desc.second == DataType::Bool) {
+      if (rand) {
+        auto options =
+            at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+        return IValue(
+            at::rand({blocks, threads}, options).round().to(at::kBool));
+      } else {
+        auto options =
+            at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0);
+        return IValue(at::empty({blocks, threads}, options));
+      }
+    } else {
+      TORCH_CHECK(false, "Not currently supported type: ", desc.second)
+    }
+  } else if (desc.first == ValType::Scalar) {
+    // IValue scalars can only be double int64 or bool
+    if (desc.second == DataType::ComplexDouble ||
+        desc.second == DataType::ComplexFloat) {
+      return IValue(at::Scalar(c10::complex<double>(1.0, 0.0)));
+    } else if (
+        desc.second == DataType::Double || desc.second == DataType::Float ||
+        desc.second == DataType::Half || desc.second == DataType::BFloat16) {
+      return IValue(at::Scalar(1.0));
+    } else if (desc.second == DataType::Int) {
+      return IValue(at::Scalar(1));
+    } else {
+      TORCH_CHECK(false, "Not currently supported type: ", desc.first);
+    }
+  } else {
+    TORCH_CHECK(false, "Not currently supported type: ", desc.first);
+  }
+  return nullptr;
+}
+
+/*
+ * Templatized Helper Function To generate single Op comparison between the
+ * JIT codegen for Cuda and the ATen Library.
+ */
+
+using OutputPair = std::pair<ValType, DataType>;
+template <
+    typename AtenFunc,
+    typename JitFunc,
+    typename InputTuple,
+    size_t... NumInputs>
+void test_op(
+    int blocks,
+    int threads,
+    std::string op_str,
+    AtenFunc af,
+    JitFunc jf,
+    OutputPair op,
+    InputTuple it,
+    std::index_sequence<NumInputs...>) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Generate Input JIT function Inputs and add them as Inputs to the Fusion
+  // Graph
+  std::array<Val*, sizeof...(NumInputs)> jit_inputs = {
+      gen_jit_operand(std::get<NumInputs>(it))...};
+  std::for_each(jit_inputs.begin(), jit_inputs.end(), [&fusion](Val* v) {
+    fusion.addInput(v);
+  });
+  TensorView* out =
+      static_cast<TensorView*>(jf(std::get<NumInputs>(jit_inputs)...));
+  fusion.addOutput(out);
+
+  std::for_each(jit_inputs.begin(), jit_inputs.end(), [out](Val* v) {
+    if (v->getValType() == ValType::TensorView)
+      static_cast<TensorView*>(v)->computeAt(out, -1);
+  });
+  out->axis(0)->parallelize(ParallelType::BIDx);
+  out->axis(-1)->parallelize(ParallelType::TIDx);
+
+  std::array<IValue, sizeof...(NumInputs)> aten_inputs = {gen_aten_operand(
+      std::get<NumInputs>(it), blocks, threads, /*rand*/ true)...};
+  const at::ArrayRef<IValue> aten_inputs_ivalues(aten_inputs);
+
+  at::Tensor cg_output =
+      gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor();
+  std::vector<at::Tensor> output_vect = {cg_output};
+  cudaDeviceSynchronize();
+  if (fusion.isStochastic())
+    at::manual_seed(0);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs_ivalues);
+  fe.runFusion(aten_inputs_ivalues, output_vect);
+  cudaDeviceSynchronize();
+
+  if (fusion.isStochastic())
+    at::manual_seed(0);
+  at::Tensor aten_output = af(aten_inputs);
+  cudaDeviceSynchronize(); // This sync shouldn't be necessary;
+
+  std::string op_msg = "Operation " + op_str;
+
+  testValidate(
+      &fusion,
+      {cg_output},
+      aten_inputs,
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      op_msg);
+}
+
+/*
+ *  Templatized Helper Function that uses variadic templates to
+ *  process a variable length Input Tuple of different Operand Type.
+ */
+template <typename AtenFunc, typename JitFunc, typename InputTuple>
+void test_op(
+    int blocks,
+    int threads,
+    std::string op_str,
+    AtenFunc af,
+    JitFunc jf,
+    OutputPair op,
+    InputTuple it) {
+  static constexpr auto size = std::tuple_size<InputTuple>::value;
+  test_op(
+      blocks,
+      threads,
+      op_str,
+      af,
+      jf,
+      op,
+      it,
+      std::make_index_sequence<size>{});
+}
+
+TEST_F(NVFuserTest, FusionUnaryOps_CUDA) {
+  using OpTuple =
+      std::tuple<at::Tensor (*)(const at::Tensor&), UnaryOpType, std::string>;
+
+  // [Note: explicit tuple type for uniform initialization list]
+  // Tuple type must be explicitly specified for each uniform initialization
+  // list within the vector to make this code compatible with some old env
+  // which we still need to support. eg. gcc 5.4 + cuda 9.2.
+  std::vector<OpTuple> ops{
+      OpTuple{at::acos, UnaryOpType::Acos, "acos"},
+      OpTuple{at::asin, UnaryOpType::Asin, "asin"},
+      OpTuple{at::atan, UnaryOpType::Atan, "atan"},
+      // There does not appear to be an appropriate ATen function for atanh
+      // OpTuple{at::atanh,      UnaryOpType::Atanh,      "atanh"      },
+      OpTuple{at::cos, UnaryOpType::Cos, "cos"},
+      OpTuple{at::cosh, UnaryOpType::Cosh, "cosh"},
+      OpTuple{at::exp, UnaryOpType::Exp, "exp"},
+      // OpTuple{at::gelu, UnaryOpType::Gelu, "gelu"},
+      OpTuple{at::log, UnaryOpType::Log, "log"},
+      OpTuple{at::log10, UnaryOpType::Log10, "log10"},
+      OpTuple{at::neg, UnaryOpType::Neg, "neg"},
+      OpTuple{at::reciprocal, UnaryOpType::Reciprocal, "reciprocal"},
+      OpTuple{at::sigmoid, UnaryOpType::Sigmoid, "sigmoid"},
+      OpTuple{at::sin, UnaryOpType::Sin, "sin"},
+      OpTuple{at::sinh, UnaryOpType::Sinh, "sinh"},
+      OpTuple{at::sqrt, UnaryOpType::Sqrt, "sqrt"},
+      OpTuple{at::tan, UnaryOpType::Tan, "tan"},
+      OpTuple{at::tanh, UnaryOpType::Tanh, "tanh"},
+      OpTuple{at::isfinite, UnaryOpType::IsFinite, "isfinite"},
+      OpTuple{at::isinf, UnaryOpType::IsInf, "isinf"},
+      OpTuple{at::isnan, UnaryOpType::IsNan, "isnan"},
+      OpTuple{at::isreal, UnaryOpType::IsReal, "isreal"},
+  };
+
+  // The following ops has no complex support in eager mode
+  std::vector<OpTuple> ops_without_complex{
+      OpTuple{at::ceil, UnaryOpType::Ceil, "ceil"},
+      OpTuple{at::floor, UnaryOpType::Floor, "floor"},
+      OpTuple{at::frac, UnaryOpType::Frac, "frac"},
+      OpTuple{at::trunc, UnaryOpType::Trunc, "trunc"},
+      OpTuple{at::round, UnaryOpType::Round, "round"},
+      OpTuple{at::relu, UnaryOpType::Relu, "relu"},
+      OpTuple{at::expm1, UnaryOpType::Expm1, "expm1"},
+      OpTuple{at::log1p, UnaryOpType::Log1p, "log1p"},
+      OpTuple{at::lgamma, UnaryOpType::Lgamma, "lgamma"},
+      OpTuple{at::erf, UnaryOpType::Erf, "erf"},
+      OpTuple{at::erfc, UnaryOpType::Erfc, "erfc"},
+      OpTuple{at::isneginf, UnaryOpType::IsNegInf, "isneginf"},
+      OpTuple{at::isposinf, UnaryOpType::IsPosInf, "isposinf"},
+  };
+
+  // The following ops only supports complex
+  std::vector<OpTuple> ops_complex_only{
+      // real is supported via UnaryOpType::Set for non-complex types, and
+      // UnaryOpType::Real requires input to be complex
+      OpTuple{at::real, UnaryOpType::Real, "real"},
+      OpTuple{at::imag, UnaryOpType::Imag, "imag"},
+  };
+
+  // Complex support for the following op is not working in nvFuser yet
+  std::vector<OpTuple> ops_skip_complex{
+      // TODO: abs is actually supported in nvFuser, but it has bug!!!
+      // In eager mode, abs(complex_tensor) returns floating point tensor
+      // but in nvFuser, it wrongly returns complex tensor!
+      // We need to:
+      //  1. change our type promotion logic to make a special case for abs
+      //  2. why this bug is not detected here? we should bump up test coverage
+      OpTuple{at::abs, UnaryOpType::Abs, "abs"},
+      // TODO: the following two ops fails with compilation error like
+      // "undefined function rsqrt(complex)", we could implement them in
+      // helpers.cu, but I think it is better to check with Jiterator first,
+      // because Jiterator uses the same string for complex support.
+      OpTuple{at::rsqrt, UnaryOpType::Rsqrt, "rsqrt"},
+      OpTuple{at::log2, UnaryOpType::Log2, "log2"}};
+
+  std::vector<DataType> dtypes = {
+      DataType::Float,
+      DataType::Double,
+      DataType::ComplexFloat,
+      DataType::ComplexDouble};
+
+  for (auto dtype : dtypes) {
+    auto ops_to_test = ops;
+    if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) {
+      ops_to_test.insert(
+          ops_to_test.end(),
+          ops_without_complex.begin(),
+          ops_without_complex.end());
+      ops_to_test.insert(
+          ops_to_test.end(), ops_skip_complex.begin(), ops_skip_complex.end());
+    } else {
+      ops_to_test.insert(
+          ops_to_test.end(), ops_complex_only.begin(), ops_complex_only.end());
+    }
+    std::for_each(ops.begin(), ops.end(), [&](OpTuple& op) {
+      test_op(
+          /*blocks*/ 640,
+          /*threads*/ 64,
+          /*name*/ std::get<2>(op),
+          /*Aten Func   */
+          [&op](std::array<IValue, 1>& vals) {
+            return std::get<0>(op)(vals[0].toTensor());
+          },
+          /*JIT  Func   */
+          [&op](Val* in1) -> Val* { return unaryOp(std::get<1>(op), in1); },
+          /*Output      */ std::make_pair(ValType::TensorView, dtype),
+          /*Inputs Tuple*/
+          std::make_tuple(std::make_pair(ValType::TensorView, dtype)));
+    });
+  }
+
+  dtypes = {DataType::Int, DataType::Int32, DataType::Bool};
+  for (auto dtype : dtypes) {
+    test_op(
+        /*blocks*/ 128,
+        /*threads*/ 64,
+        /*name*/ "bitwise_not",
+        /*Aten Func   */
+        [](std::array<IValue, 1>& vals) {
+          return at::bitwise_not(vals[0].toTensor());
+        },
+        /*JIT  Func   */
+        [](Val* in1) -> Val* { return unaryOp(UnaryOpType::Not, in1); },
+        /*Output      */ std::make_pair(ValType::TensorView, dtype),
+        /*Inputs Tuple*/
+        std::make_tuple(std::make_pair(ValType::TensorView, dtype)));
+  }
+}
+
+TEST_F(NVFuserTest, FusionBinaryOps_CUDA) {
+  using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&);
+  using OpTuple = std::tuple<AtenFuncSig, BinaryOpType, std::string>;
+
+  std::vector<DataType> dtypes = {
+      DataType::Double,
+      DataType::Float,
+      DataType::ComplexFloat,
+      DataType::ComplexDouble};
+
+  // see [Note: explicit tuple type for uniform initialization list]
+  std::vector<OpTuple> equal_ops{
+      OpTuple{at::eq, BinaryOpType::Eq, "eq"},
+      OpTuple{at::ne, BinaryOpType::NE, "ne"}};
+
+  // Complex numbers are not ordered
+  std::vector<OpTuple> order_ops{
+      OpTuple{at::ge, BinaryOpType::GE, "ge"},
+      OpTuple{at::gt, BinaryOpType::GT, "gt"},
+      OpTuple{at::le, BinaryOpType::LE, "le"},
+      OpTuple{at::lt, BinaryOpType::LT, "lt"}};
+
+  // see [Note: explicit tuple type for uniform initialization list]
+  std::vector<OpTuple> math_ops{
+      OpTuple{at::div, BinaryOpType::Div, "div"},
+      OpTuple{at::mul, BinaryOpType::Mul, "mul"},
+      OpTuple{at::pow, BinaryOpType::Pow, "pow"}};
+
+  // The following ops has no complex support in eager mode
+  std::vector<OpTuple> math_ops_without_complex{
+      OpTuple{at::atan2, BinaryOpType::Atan2, "atan2"},
+      OpTuple{at::max, BinaryOpType::Max, "max"},
+      OpTuple{at::min, BinaryOpType::Min, "min"},
+      OpTuple{at::fmod, BinaryOpType::Fmod, "fmod"},
+      // NOTE: Remainder does not match the Aten impl exactly
+      // despite using an identical function.
+      OpTuple{at::remainder, BinaryOpType::Remainder, "remainder"}};
+
+  for (auto dtype : dtypes) {
+    auto logic_ops = equal_ops;
+    if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) {
+      logic_ops.insert(logic_ops.end(), order_ops.begin(), order_ops.end());
+    }
+    std::for_each(logic_ops.begin(), logic_ops.end(), [&](OpTuple& op) {
+      test_op(
+          /*blocks*/ 640,
+          /*threads*/ 64,
+          /*name*/ std::get<2>(op),
+          /*Aten Func   */
+          [&op](std::array<IValue, 2>& vals) {
+            return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor());
+          },
+          /*JIT  Func   */
+          [&op](Val* in1, Val* in2) -> Val* {
+            return binaryOp(std::get<1>(op), in1, in2);
+          },
+          /*Output      */ std::make_pair(ValType::TensorView, DataType::Bool),
+          /*Inputs Tuple*/
+          std::make_tuple(
+              std::make_pair(ValType::TensorView, dtype),
+              std::make_pair(ValType::TensorView, dtype)));
+    });
+
+    auto enabled_math_ops = math_ops;
+    if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) {
+      enabled_math_ops.insert(
+          enabled_math_ops.end(),
+          math_ops_without_complex.begin(),
+          math_ops_without_complex.end());
+    }
+    std::for_each(
+        enabled_math_ops.begin(), enabled_math_ops.end(), [&](OpTuple& op) {
+          test_op(
+              /*blocks*/ 640,
+              /*threads*/ 64,
+              /*name*/ std::get<2>(op),
+              /*Aten Func   */
+              [&op](std::array<IValue, 2>& vals) {
+                return std::get<0>(op)(vals[0].toTensor(), vals[1].toTensor());
+              },
+              /*JIT  Func   */
+              [&op](Val* in1, Val* in2) -> Val* {
+                return binaryOp(std::get<1>(op), in1, in2);
+              },
+              /*Output      */ std::make_pair(ValType::TensorView, dtype),
+              /*Inputs Tuple*/
+              std::make_tuple(
+                  std::make_pair(ValType::TensorView, dtype),
+                  std::make_pair(ValType::TensorView, dtype)));
+        });
+
+    test_op(
+        /*blocks*/ 640,
+        /*threads*/ 64,
+        /*name*/ "add_alpha",
+        /*Aten Func   */
+        [](std::array<IValue, 3>& vals) {
+          return at::add(
+              vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar());
+        },
+        /*JIT  Func   */ static_cast<Val* (*)(Val*, Val*, Val*)>(&add_alpha),
+        /*Output      */ std::make_pair(ValType::TensorView, dtype),
+        /*Inputs Tuple*/
+        std::make_tuple(
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::Scalar, dtype)));
+
+    test_op(
+        /*blocks*/ 640,
+        /*threads*/ 64,
+        /*name*/ "sub_alpha",
+        /*Aten Func   */
+        [](std::array<IValue, 3>& vals) {
+          return at::sub(
+              vals[0].toTensor(), vals[1].toTensor(), vals[2].toScalar());
+        },
+        /*JIT  Func   */ static_cast<Val* (*)(Val*, Val*, Val*)>(&sub_alpha),
+        /*Output      */ std::make_pair(ValType::TensorView, dtype),
+        /*Inputs Tuple*/
+        std::make_tuple(
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::Scalar, dtype)));
+  }
+}
+
+TEST_F(NVFuserTest, FusionTernaryOps_CUDA) {
+  std::vector<DataType> dtypes = {
+      DataType::Double,
+      DataType::Float,
+      DataType::ComplexFloat,
+      DataType::ComplexDouble};
+
+  for (auto dtype : dtypes) {
+    // clamp and threshold are not supported for complex on eager mode
+    if (dtype != DataType::ComplexFloat && dtype != DataType::ComplexDouble) {
+      test_op(
+          /*blocks*/ 640,
+          /*threads*/ 64,
+          /*name*/ "clamp",
+          /*Aten Func   */
+          [](std::array<IValue, 1>& vals) {
+            return at::clamp(vals[0].toTensor(), 0.f, 1.f);
+          },
+          /*JIT  Func   */
+          [&](Val* in1) -> Val* {
+            if (dtype == DataType::Float) {
+              return clamp(
+                  in1,
+                  IrBuilder::create<Double>(0.f),
+                  IrBuilder::create<Double>(1.f));
+            } else {
+              return clamp(
+                  in1,
+                  IrBuilder::create<Double>(0.f),
+                  IrBuilder::create<Double>(1.f));
+            }
+          },
+          /*Output      */ std::make_pair(ValType::TensorView, dtype),
+          /*Inputs Tuple*/
+          std::make_tuple(std::make_pair(ValType::TensorView, dtype)));
+      test_op(
+          /*blocks*/ 640,
+          /*threads*/ 64,
+          /*name*/ "threshold",
+          /*Aten Func   */
+          [](std::array<IValue, 1>& vals) {
+            return at::threshold(vals[0].toTensor(), 0.f, 1.f);
+          },
+          /*JIT  Func   */
+          [&](Val* in1) -> Val* {
+            if (dtype == DataType::Float) {
+              return threshold(
+                  in1,
+                  IrBuilder::create<Double>(0.f),
+                  IrBuilder::create<Double>(1.f));
+            } else {
+              return threshold(
+                  in1,
+                  IrBuilder::create<Double>(0.f),
+                  IrBuilder::create<Double>(1.f));
+            }
+          },
+          /*Output      */ std::make_pair(ValType::TensorView, dtype),
+          /*Inputs Tuple*/
+          std::make_tuple(std::make_pair(ValType::TensorView, dtype)));
+    }
+    test_op(
+        /*blocks*/ 640,
+        /*threads*/ 64,
+        /*name*/ "where",
+        /*Aten Func   */
+        [](std::array<IValue, 3>& vals) {
+          return at::where(
+              vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor());
+        },
+        /*JIT  Func   */ static_cast<Val* (*)(Val*, Val*, Val*)>(&where),
+        /*Output      */ std::make_pair(ValType::TensorView, dtype),
+        /*Inputs Tuple*/
+        std::make_tuple(
+            std::make_pair(ValType::TensorView, DataType::Bool),
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::TensorView, dtype)));
+  }
+}
+
+TEST_F(NVFuserTest, FusionCompoundOps_CUDA) {
+  std::vector<DataType> dtypes = {
+      DataType::Double,
+      DataType::Float,
+      DataType::ComplexFloat,
+      DataType::ComplexDouble};
+
+  for (auto dtype : dtypes) {
+    test_op(
+        /*blocks*/ 640,
+        /*threads*/ 64,
+        /*name*/ "lerp",
+        /*Aten Func   */
+        [](std::array<IValue, 3>& vals) {
+          return at::lerp(
+              vals[0].toTensor(), vals[1].toTensor(), vals[2].toTensor());
+        },
+        /*JIT  Func   */ static_cast<Val* (*)(Val*, Val*, Val*)>(&lerp),
+        /*Output      */ std::make_pair(ValType::TensorView, dtype),
+        /*Inputs Tuple*/
+        std::make_tuple(
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::TensorView, dtype)));
+    test_op(
+        /*blocks*/ 640,
+        /*threads*/ 64,
+        /*name*/ "addcmul",
+        /*Aten Func   */
+        [](std::array<IValue, 4>& vals) {
+          return at::addcmul(
+              vals[0].toTensor(),
+              vals[1].toTensor(),
+              vals[2].toTensor(),
+              vals[3].toScalar());
+        },
+        /*JIT  Func   */
+        static_cast<Val* (*)(Val*, Val*, Val*, Val*)>(&addcmul),
+        /*Output      */ std::make_pair(ValType::TensorView, dtype),
+        /*Inputs Tuple*/
+        std::make_tuple(
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::TensorView, dtype),
+            std::make_pair(ValType::Scalar, dtype)));
+  }
+}
+
+TEST_F(NVFuserTest, FusionCastOps_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2, DataType::Half);
+
+  TensorView* intrm1 = castOp(DataType::Float, tv0);
+  TensorView* out = castOp(DataType::Half, intrm1);
+
+  fusion.addInput(tv0);
+  fusion.addOutput(out);
+  tv0->computeAt(out, -1);
+
+  out->axis(0)->parallelize(ParallelType::BIDx);
+  out->axis(-1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+
+  at::Tensor input1 = at::randn({1, 4}, options);
+  at::Tensor ref_output = at::empty_like(input1);
+
+  std::array<IValue, 1> inputs = {input1};
+  const at::ArrayRef<IValue> input_ivalues(inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, input_ivalues);
+  auto outputs = fe.runFusion(input_ivalues);
+
+  ref_output = at::_cast_Half(at::_cast_Double(input1));
+
+  TORCH_CHECK(
+      outputs[0].equal(ref_output),
+      "\nOp Type: -- ",
+      "cast FP16->FP32->FP16",
+      " -- had a mismatch.\n",
+      "\nABS MAX DIFF: ",
+      outputs[0].sub(ref_output).abs().max(),
+      "\n");
+}
+
+// Start off simple, block on the outer dim
+// block stride + thread all reduce + unrolling on inner dim
+TEST_F(NVFuserTest, FusionReduction1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion).size(),
+      "Could not detect reduction in fusion.");
+
+  tv1->split(1, 128);
+  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
+  tv1->split(1, 4);
+  // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({1});
+  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1]
+  // tv1[I0,        R1oi{4},  R1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}]
+
+  TensorView* tv3 = tv1->rFactor({1});
+  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}] = tv0[I0, I1]
+  // tv3[I0,        R1oi{4}, Ir1i{128}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{128}]
+  // tv1[I0,                  R1i{128}] = tv3[I0,        R1oi{4}, Ir1i{128}]
+
+  // Incrementally, can print in between for debugging
+  tv0->computeAt(tv2, 1);
+  tv2->computeAt(tv3, 1);
+  tv3->computeAt(tv1, 1);
+
+  // Re do it all at once, because why not.
+  tv0->computeAt(tv1, 1);
+
+  tv2->axis(2)->parallelize(ParallelType::Unroll);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  int numel_x = 65000;
+  int numel_y = 1025;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+  at::Tensor cg_output = at::empty({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReduction2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+
+  fusion.addOutput(tv1);
+
+  // switches to try some different scenarios. maybe we should iterate on all
+  // permutations.
+  bool bind_bidx = true;
+  bool bind_tidx = true;
+  bool bind_tidy = true;
+  bool bind_unroll = true;
+
+  int numel_x = 1025; // Cannot exceed block dim max size / tidy
+  int numel_y = 129;
+  int tidx = 16;
+  int tidy = 8;
+  int unroll_factor = 4;
+
+  tv1->split(1, tidx);
+  // tv1[I0, R1o, R1i{tidx}] = tv0[I0, I1]
+
+  tv1->split(1, unroll_factor);
+  // tv1[I0, R1oo, R1oi{unroll}, R1i{tidx}] = tv0[I0, I1]
+
+  tv1->split(0, tidy);
+
+  TensorView* tv2 = tv1->rFactor({-3});
+  // tv2[I0,             >R1oo<, Ir1oi{unroll}, Ir1i{tidx}]
+  // tv1[I0o, I0i{tidy},          R1oi{unroll},  R1i{tidx}]
+
+  TensorView* tv3 = tv1->rFactor({-2});
+  // tv2[I0,             >R1oo<, Ir1oi{unroll}, Ir1i{tidx}]
+  // tv3[I0,                      R1oi{unroll}, Ir1i{tidx}]
+  // tv1[I0o, I0i{tidy},                         R1i{tidx}]
+
+  tv0->computeAt(tv1, -2);
+
+  if (bind_unroll)
+    tv2->axis(-2)->parallelize(ParallelType::Unroll);
+  if (bind_bidx)
+    tv1->axis(0)->parallelize(ParallelType::BIDx);
+  if (bind_tidy)
+    tv1->axis(1)->parallelize(ParallelType::TIDy);
+
+  if (bind_tidx) {
+    tv2->axis(-1)->parallelize(ParallelType::TIDx);
+    tv3->axis(-1)->parallelize(ParallelType::TIDx);
+    tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReduction3_CUDA) {
+  // What if Z participates in the reduction with X?
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+
+  fusion.addOutput(tv1);
+
+  int numel_x = 1025; // Cannot exceed block dim max size / tidy
+  int numel_y = 129;
+  int tidx = 16;
+  int tidz = 8;
+
+  tv1->split(1, tidz);
+  // tv1[I0, R1o, R1i{tidz}] = tv0[I0, I1]
+
+  tv1->split(1, tidx);
+  // tv1[I0, R1oo, R1oi{tidx}, R1i{tidz}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({-3});
+  // tv2[I0,  >R1oo<, Ir1oi{tidx}, Ir1i{tidz}]
+  // tv1[I0o,          R1oi{tidx},  R1i{tidz}]
+
+  tv0->computeAt(tv1, -3);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(-2)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDz);
+
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDz);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
+  at::Tensor cg_output = at::empty({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, {cg_output});
+
+  auto aten_output = aten_input.to(at::kDouble).sum({1});
+
+  testValidate(
+      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReduction4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  TensorView* tv2 = add(tv0, tv1);
+  // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1]
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv3 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv2);
+  // tv3[I0, R1] = tv2[I0, I1]
+
+  TensorView* tv4 = makeSymbolicTensor(1);
+  fusion.addInput(tv4);
+
+  // tv5[I0] = tv3[I0, R1] * tv4[I0]
+  TensorView* tv5 = mul(tv3, tv4);
+  fusion.addOutput(tv5);
+
+  int tidx = 16;
+
+  // RFactor the reduction
+  tv3->split(1, tidx);
+  // tv3[I0, R1o, R1i{tidx}] = tv2[I0, I1]
+
+  TensorView* tv6 = tv3->rFactor({-2});
+  // tv6[I0, R1o, iR1i{tidx}] = tv2[I0, I1]
+  // tv3[I0,       R1i{tidx}] = tv3[I0, I1]
+  tv2->computeAt(tv6, 2);
+
+  // Compute at inline with tv5 (only 1D)
+  tv6->computeAt(tv3, 1);
+  tv3->computeAt(tv5, 1);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+
+  // Intermediate tensors only need this, but doesn't hurt to do on inputs
+  // tv0, 1, 4
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv6->axis(-1)->parallelize(ParallelType::TIDx);
+
+  int numel_x = 1025;
+  int numel_y = 129;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
+  at::Tensor t1 = at::randn({numel_x, numel_y}, options);
+  at::Tensor t4 = at::randn({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1, t4});
+  auto cg_outputs = fe.runFusion({t0, t1, t4});
+
+  auto t2 = t0.add(t1);
+  auto t3 = t2.to(at::kDouble).sum({1});
+  auto aten_output = t3.mul(t4);
+
+  testValidate(
+      &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReduction5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(3);
+
+  fusion.addInput(tv0);
+
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+
+  fusion.addOutput(tv1);
+
+  int bidy = 2;
+  int tidy = 4;
+  int tidx = 5;
+
+  int dim1 = 11;
+
+  tv1->split(-2, tidy);
+
+  TensorView* tv2 = tv1->rFactor({-3});
+
+  tv0->computeAt(tv1, 1);
+  tv1->axis(0)->parallelize(ParallelType::BIDy);
+
+  for (auto* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      val->as<TensorView>()->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  tv2->axis(-2)->parallelize(ParallelType::TIDy);
+  tv1->axis(-2)->parallelize(ParallelType::TIDy);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({bidy, dim1, tidx}, options);
+
+  at::Tensor cg_output = at::empty({bidy, tidx}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReduction6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int bdimx = 64;
+  const int bdimy = 8;
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(3);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1, R2] = tv0[I0, I1, I2]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1, 2}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion).size(),
+      "Could not detect reduction in fusion.");
+
+  tv1->split(2, bdimx);
+  // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2]
+  tv1->split(1, bdimy);
+  // tv1[I0, R1o, R1i{8}, R2o, R2i{128}] = tv0[I0, I1, I2]
+
+  TensorView* tv2 = tv1->rFactor({3});
+  // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2]
+  // tv1[I0, R1o, R1i{8},      R2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}]
+
+  TensorView* tv3 = tv1->rFactor({1});
+  // tv2[I0, I1o, I1i{8}, R2o, I2i{128}] = tv0[I0, I1, I2]
+  // tv3[I0, R1o, I1i{8},      I2i{128}] = tv2[I0, I1o, I1i{8}, R2o, I2i{128}]
+  // tv1[I0,      R1i{8},      R2i{128}] = tv3[I0, R1o, I1i{8},      I2i{128}]
+
+  tv3->computeAt(tv1, 1);
+  tv2->computeAt(tv3, 2);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv1->axis(-2)->parallelize(ParallelType::TIDy);
+  tv3->axis(-2)->parallelize(ParallelType::TIDy);
+  tv2->axis(-3)->parallelize(ParallelType::TIDy);
+
+  int numel_x = 650;
+  int numel_y = 1000;
+  int numel_z = 4;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto aten_output = input.to(at::kDouble).sum({1, 2});
+  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMultiGridReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = max(tv0, {0});
+  TensorView* tv2 = sum(tv0, {0});
+
+  fusion.addOutput(tv1);
+  fusion.addOutput(tv2);
+
+  int numel_x = 4;
+  int numel_y = 2;
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  std::vector<at::Tensor> aten_outputs = {
+      std::get<0>(input.to(at::kDouble).max(0)), input.to(at::kDouble).sum(0)};
+  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMultiGridReduction2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = sum(tv1, {0});
+  fusion.addOutput(tv2);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::BIDy);
+  tv2->axis(0)->parallelize(ParallelType::BIDy);
+
+  FusionExecutor fe;
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
+}
+
+TEST_F(NVFuserTest, FusionReductionTFT_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+
+  fusion.addOutput(tv1);
+
+  int numel_x = 1025;
+  int numel_y = 129;
+  int tidx = 16;
+  int tidy = 8;
+  int tidz = 8;
+
+  tv1->split(1, tidx);
+  // tv1[I0, R1o, R1i{tidx}]
+
+  tv1->split(1, tidz);
+  // tv1[I0, R1oo, R1Oi{tidz}, R1R1i{tidx}]
+
+  tv1->split(0, tidy);
+  // tv1[I0o, I0i, R1oo, R1Oi{tidz}, R1R1i{tidx}]
+
+  TensorView* tv2 = tv1->rFactor({2});
+  // tv2[I0o, I0i, R1oo, I1Oi{tidz}, I11i{tidx}]
+  // tv1[I0o, I0i,       R1Oi{tidz}, R1R1i{tidx}]
+
+  tv2->computeAt(tv1, 2);
+
+  tv1->axis(1)->parallelize(ParallelType::TIDy);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv1->axis(-2)->parallelize(ParallelType::TIDz);
+  tv2->axis(-2)->parallelize(ParallelType::TIDz);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+  at::Tensor cg_output = at::empty({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReductionOuterSplit_CUDA) {
+  // based off FusionReduction4
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  TensorView* tv2 = add(tv0, tv1);
+  // tv2[I0, I1] = tv0[I0, I1] + tv1[I0, I1]
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv3 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv2);
+  // tv3[I0, R1] = tv2[I0, I1]
+
+  TensorView* tv4 = makeSymbolicTensor(1);
+  fusion.addInput(tv4);
+
+  // tv5[I0] = tv3[I0, R1] * tv4[I0]
+  TensorView* tv5 = mul(tv3, tv4);
+  fusion.addOutput(tv5);
+
+  // RFactor the reduction
+  tv3->split(1, 16, false);
+  // tv3[I0, R1o{16}, R1i{tidx}] = tv2[I0, I1]
+
+  TensorView* tv6 = tv3->rFactor({-2});
+  // tv6[I0, R1o{16}, iR1i{tidx}] = tv2[I0, I1]
+  // tv3[I0,           R1i{tidx}] = tv3[I0, I1]
+  tv2->computeAt(tv6, 2);
+
+  // Compute at inline with tv5 (only 1D)
+  tv6->computeAt(tv3, 1);
+  tv3->computeAt(tv5, 1);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+
+  // Intermediate tensors only need this, but doesn't hurt to do on inputs
+  // tv0, 1, 4
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv6->axis(-1)->parallelize(ParallelType::TIDx);
+
+  int numel_x = 1025;
+  int numel_y = 129;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
+  at::Tensor t1 = at::randn({numel_x, numel_y}, options);
+  at::Tensor t4 = at::randn({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1, t4});
+  auto cg_outputs = fe.runFusion({t0, t1, t4});
+
+  auto t2 = t0.add(t1);
+  auto t3 = t2.to(at::kDouble).sum({1});
+  auto aten_output = t3.mul(t4);
+
+  testValidate(
+      &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBranches_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  TensorView* tv2 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addInput(tv2);
+
+  auto tv3 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv4 = add(tv3, tv1);
+  auto tv5 = add(tv3, tv2);
+  auto tv6 = add(tv4, tv5);
+
+  fusion.addOutput(tv6);
+
+  constexpr int x = 63, y = 33;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y}, options);
+  at::Tensor t1 = at::randn({x, y}, options);
+  at::Tensor t2 = at::randn({x, y}, options);
+
+  FusionExecutor fe;
+  tv6->merge(0);
+  tv6->split(0, 128);
+  tv6->split(0, 4);
+
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv0->computeAt(tv6, 1);
+  tv1->computeAt(tv6, 1);
+  tv2->computeAt(tv6, 1);
+
+  tv3->axis(-2)->parallelize(ParallelType::Unroll);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->axis(-2)->parallelize(ParallelType::Unroll);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv5->axis(-2)->parallelize(ParallelType::Unroll);
+  tv5->axis(-1)->parallelize(ParallelType::TIDx);
+  tv6->axis(-1)->parallelize(ParallelType::TIDx);
+
+  std::vector<IValue> aten_inputs = {t0, t1, t2};
+
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto t3 = t0.add(1.0);
+  auto t4 = t3.add(t1);
+  auto t5 = t3.add(t2);
+  auto aten_output = t4.add(t5);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSimpleBCast1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.5));
+
+  TensorView* tv2 = makeSymbolicTensor(2);
+  fusion.addInput(tv2);
+  TensorView* tv3 = makeSymbolicTensor(2);
+  fusion.addInput(tv3);
+  TensorView* tv4 = sub(tv2, tv3);
+
+  TensorView* tv5 = broadcast(tv1, {false, false, true});
+  TensorView* tv6 = broadcast(tv4, {true, false, false});
+
+  TensorView* tv7 = add(tv5, tv6);
+  fusion.addOutput(tv7);
+
+  tv7->split(-1, 4);
+  tv7->split(0, 8);
+
+  tv0->computeAt(tv7, -1);
+  tv2->computeAt(tv7, -1);
+
+  tv7->axis(0)->parallelize(ParallelType::BIDx);
+  tv7->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int x = 63, y = 33, z = 15;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y}, options);
+  at::Tensor t1 = t0.add(1.5);
+
+  at::Tensor t2 = at::randn({y, z}, options);
+  at::Tensor t3 = at::randn({y, z}, options);
+
+  at::Tensor t4 = t2.sub(t3);
+  at::Tensor t5 = t1.unsqueeze(-1).expand({x, y, z});
+
+  at::Tensor t6 = t4.expand({x, y, z});
+
+  at::Tensor aten_output = t5.add(t6);
+
+  std::vector<IValue> aten_inputs = {t0, t2, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSimpleBCast2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv0, tv1);
+
+  TensorView* tv3 = broadcast(tv2, {false, false, true});
+
+  TensorView* tv4 = makeSymbolicTensor(2);
+  fusion.addInput(tv4);
+
+  TensorView* tv5 = sub(tv4, IrBuilder::create<Double>(0.1));
+
+  TensorView* tv6 = broadcast(tv5, {true, false, false});
+
+  TensorView* tv7 = add(tv3, tv6);
+
+  fusion.addOutput(tv7);
+
+  tv7->merge(0, 1);
+
+  tv0->computeAt(tv7, -1);
+  tv4->computeAt(tv7, -1);
+
+  tv7->axis(0)->parallelize(ParallelType::BIDx);
+  tv7->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int x = 63, y = 33, z = 15;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y}, options);
+  at::Tensor t1 = at::randn({x, y}, options);
+  at::Tensor t2 = t0.add(t1);
+  at::Tensor t3 = t2.unsqueeze(-1).expand({x, y, z});
+
+  at::Tensor t4 = at::randn({y, z}, options);
+  at::Tensor t5 = t4.sub(0.1);
+  at::Tensor t6 = t5.expand({x, y, z});
+  at::Tensor aten_output = t3.add(t6);
+
+  at::Tensor cg_output = at::empty({x, y, z}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1, t4};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  fe.runFusion(aten_inputs, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSimpleBCast3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up input tensor views
+  // tv0[I1, B{1}]
+  TensorView* tv0 = makeConcreteTensor({-1, 1});
+  fusion.addInput(tv0);
+
+  // tv1[I0, I1, I2]
+  TensorView* tv2 = makeSymbolicTensor(3);
+  fusion.addInput(tv2);
+
+  TensorView* tv3 = add(tv0, tv2);
+
+  fusion.addOutput(tv3);
+
+  tv3->merge(0);
+  tv3->merge(0);
+
+  tv0->computeAt(tv3, -1);
+  tv2->computeAt(tv3, -1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  constexpr int x = 2, y = 3, z = 4;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({y, 1}, options);
+  at::Tensor t2 = at::randn({x, y, z}, options);
+  auto aten_output = t0.add(t2);
+
+  std::vector<IValue> aten_inputs = {t0, t2};
+  at::Tensor cg_output = at::empty({x, y, z}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  fe.runFusion(aten_inputs, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSimpleBCast4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeConcreteTensor({1, -1});
+
+  TensorView* tv1 = makeSymbolicTensor(3);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv3 = add(tv0, tv1);
+
+  tv3->merge(0);
+  tv3->merge(0);
+  tv3->split(0, 128);
+  tv3->split(0, 4);
+
+  fusion.addOutput(tv3);
+
+  tv0->computeAt(tv3, -1);
+  tv1->computeAt(tv3, -1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-2)->parallelize(ParallelType::Unroll);
+
+  constexpr int x = 63, y = 33, z = 15;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({1, z}, options);
+  at::Tensor t1 = at::randn({x, y, z}, options);
+
+  auto aten_output = t0.add(t1);
+
+  at::Tensor cg_output = at::empty({x, y, z}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  fe.runFusion(aten_inputs, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSimpleBCast5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  constexpr int m = 2, k = 3, n = 4;
+  auto tv0 = makeConcreteTensor({m, k});
+  auto tv1 = makeConcreteTensor({k, n});
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = broadcast(tv0, {false, false, true});
+  TensorView* tv3 = broadcast(tv1, {true, false, false});
+
+  TensorView* tv4 = add(tv2, tv3);
+
+  fusion.addOutput(tv4);
+
+  tv4->merge(0);
+  tv4->merge(0);
+
+  tv0->computeAt(tv4, -1);
+  tv1->computeAt(tv4, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({m, k}, options);
+  at::Tensor t1 = at::randn({k, n}, options);
+
+  auto t2 = t0.unsqueeze(-1).expand({m, k, n});
+  auto t3 = t1.expand({m, k, n});
+  auto aten_output = t2.add(t3);
+
+  at::Tensor cg_output = at::empty({m, k, n}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  fe.runFusion(aten_inputs, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionComplexBCast1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int x = 2, y = 3, z = 4;
+
+  auto tv0 = makeConcreteTensor({y});
+  auto tv1 = div(tv0, IrBuilder::create<Double>(2.0));
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = makeConcreteTensor({y, z});
+  auto tv4 = mul(tv2, tv3);
+  auto tv5 = broadcast(tv4, {true, false, false});
+  auto tv6 = makeConcreteTensor({x, y, z});
+  auto tv7 = add(tv5, tv6);
+
+  // tv0[    i1    ] = input
+  // tv1[    i1    ] = tv0/2.0
+  // tv2[    i1, b2] = bcast(tv1)
+  // tv3[    i1, i2] = input
+  // tv4[    i1, i2] = tv2 * tv3
+  // tv5[b0, i1, i2] = bcast(tv4)
+  // tv6[i0, i1, i2] = input
+  // tv7[i0, i1, i2] = tv5 + tv6
+
+  // tv4 = bcast(tv1) * tv3
+  // tv7 = bcast(tv4) + tv6
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv3);
+  fusion.addInput(tv6);
+
+  fusion.addOutput(tv7);
+
+  tv7->merge(0);
+  tv7->merge(0);
+  tv0->computeAt(tv7, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({y}, options);
+  at::Tensor t3 = at::randn({y, z}, options);
+  at::Tensor t6 = at::randn({x, y, z}, options);
+
+  auto t4 = t0.div(2.0).unsqueeze(-1).expand({y, z}) * t3;
+  auto aten_output = t4.unsqueeze(0).expand({x, y, z}) + t6;
+
+  std::vector<IValue> aten_inputs = {t0, t3, t6};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionComplexBCast2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int x = 2, y = 3, z = 4;
+
+  auto tv0 = makeConcreteTensor({y, z});
+  auto tv1 = div(tv0, IrBuilder::create<Double>(2.0));
+  auto tv2 = sum(tv1, {1});
+  auto tv3 = broadcast(tv2, {true, false});
+  auto tv4 = makeConcreteTensor({x, y});
+  auto tv5 = add(tv3, tv4);
+
+  // tv0[    i1, i2] = input
+  // tv1[    i1, i2] = tv0/2.0
+  // tv2[    i1    ] = sum(tv1, 1)
+  // tv3[b0, i1    ] = bcast(tv2)
+  // tv4[i0, i1    ] = input
+  // tv5[i0, i1    ] = tv3 + tv4
+
+  // tv2 = sum(tv0/2.0, 1)
+  // tv5 = bcast(tv2) + tv4
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv4);
+
+  fusion.addOutput(tv5);
+
+  tv5->merge(0);
+  tv0->computeAt(tv5, -1);
+  tv1->computeAt(tv2, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({y, z}, options);
+  at::Tensor t4 = at::randn({x, y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t4});
+  auto cg_outputs = fe.runFusion({t0, t4});
+
+  auto t1 = t0.div(2.0);
+  auto t2 = t1.to(at::kDouble).sum(1);
+  auto t3 = t2.unsqueeze(0).expand({x, y});
+  auto aten_output = t3.add(t4);
+
+  testValidate(
+      &fusion, {cg_outputs}, {t0, t4}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int w = 3, x = 4, y = 7, z = 8;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  auto tv0 = makeSymbolicTensor(3);
+  auto tv1 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv3 = broadcast(tv2, {true, false, false, false});
+  auto tv4 = add(tv3, tv1);
+
+  fusion.addOutput(tv4);
+
+  tv4->merge(0);
+  tv4->merge(0);
+  tv4->merge(0);
+
+  tv4->split(0, 128);
+  tv4->split(0, 4);
+
+  tv2->computeAt(tv4, 1);
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(1)->parallelize(ParallelType::Unroll);
+  tv4->axis(2)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(1)->parallelize(ParallelType::Unroll);
+  tv3->axis(2)->parallelize(ParallelType::TIDx);
+
+  tv2->axis(1)->parallelize(ParallelType::Unroll);
+  tv2->axis(2)->parallelize(ParallelType::TIDx);
+
+  FusionExecutor fe;
+
+  at::Tensor t0 = at::randn({x, y, z}, options);
+  at::Tensor t1 = at::randn({w, x, y, z}, options);
+
+  auto t3 = t0.add(1.0);
+  auto aten_output = t3.add(t1);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int w = 3, x = 4, y = 7, z = 8;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  auto tv0 = makeSymbolicTensor(3);
+  auto tv1 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv3 = broadcast(tv2, {true, false, false, false});
+  auto tv4 = add(tv3, tv1);
+
+  fusion.addOutput(tv4);
+
+  tv4->merge(-2);
+  tv4->merge(-2);
+  tv4->merge(-2);
+
+  tv4->split(0, 128);
+  tv4->split(0, 4);
+
+  tv2->computeAt(tv4, 1);
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(1)->parallelize(ParallelType::Unroll);
+  tv4->axis(2)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(1)->parallelize(ParallelType::Unroll);
+  tv3->axis(2)->parallelize(ParallelType::TIDx);
+
+  tv2->axis(1)->parallelize(ParallelType::Unroll);
+  tv2->axis(2)->parallelize(ParallelType::TIDx);
+
+  FusionExecutor fe;
+
+  at::Tensor t0 = at::randn({x, y, z}, options);
+  at::Tensor t1 = at::randn({w, x, y, z}, options);
+
+  auto t3 = t0.add(1.0);
+  auto aten_output = t3.add(t1);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int w = 3, x = 4, y = 7, z = 8;
+
+  auto tv0 = makeSymbolicTensor(3);
+  auto tv1 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv3 = add(tv2, tv1);
+  fusion.addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({x, y, z}, options);
+  at::Tensor t1 = at::randn({w, x, y, z}, options);
+
+  auto t2 = t0.add(1.0);
+  auto aten_output = t2.add(t1);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  auto lparams = schedulePointwise(&fusion, aten_inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs, lparams);
+  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeConcreteTensor({4, 8});
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeConcreteTensor({4, 4, 8});
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv3 = broadcast(tv2, {true, false, false});
+  TensorView* tv4 = add(tv3, tv1);
+  fusion.addOutput(tv4);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({4, 8}, options);
+  at::Tensor t1 = at::randn({4, 4, 8}, options);
+
+  auto t2 = t0.add(1.0);
+  auto aten_output = t2.add(t1);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(3);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv3 = broadcast(tv2, {true, false, true});
+  TensorView* tv4 = add(tv3, tv1);
+  fusion.addOutput(tv4);
+
+  tv3->merge(0)->merge(0)->split(0, 2)->split(0, 3);
+  tv4->merge(0)->merge(0)->split(0, 2)->split(0, 3);
+
+  tv0->computeAt(tv4, 1);
+  tv1->computeAt(tv4, 1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({7}, options);
+  at::Tensor t1 = at::randn({5, 7, 11}, options);
+
+  auto t2 = t0.add(1.0);
+  auto aten_output = t2.unsqueeze(-1).add(t1);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> tensor0_shape{7, 4, 7};
+  std::vector<int64_t> tensor1_shape{4, 7};
+
+  TensorView* tv0 = makeSymbolicTensor(tensor0_shape.size());
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(tensor1_shape.size());
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv0, tv1);
+  TensorView* tv3 = sum(tv2, {0, 1});
+  fusion.addOutput(tv3);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input0 = at::randn(tensor0_shape, options);
+  at::Tensor input1 = at::randn(tensor1_shape, options);
+
+  std::vector<int64_t> reduction_axes{0, 1};
+  auto reduction_params = getReductionHeuristics(&fusion, {input0, input1});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  scheduleReduction(&fusion, *reduction_params);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input0, input1}, reduction_params->lparams);
+  auto cg_outputs = fe.runFusion({input0, input1}, reduction_params->lparams);
+
+  auto aten_output = input0.add(input1).to(at::kDouble).sum(reduction_axes);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {input0, input1},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      reduction_params->lparams);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing7_CUDA) {
+  // Might be able to use this one without 6 as the heuristics in 6 may change
+  // and this test is to cover the same issue.
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {false, true});
+
+  auto tv2 = makeSymbolicTensor(2);
+  fusion.addInput(tv2);
+
+  auto tv3 = add(tv1, tv2);
+  auto tv4 = sum(tv3, {0, 1});
+  fusion.addOutput(tv4);
+
+  tv4->merge(0, 1);
+  tv4->split(0, 128);
+  tv4->split(0, 4);
+
+  auto tv5 = tv4->rFactor({0, 1});
+
+  tv5->computeAt(tv4, -1);
+  tv0->computeAt(tv5, -1);
+
+  tv4->axis(0)->parallelize(ParallelType::TIDx);
+
+  const int numel_x = 100;
+  const int numel_y = 200;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto at_t0 = at::randn({numel_x}, options);
+  auto at_t1 = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {at_t0, at_t1});
+  auto cg_outputs = fe.runFusion({at_t0, at_t1});
+
+  auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1)
+                         .to(at::kDouble)
+                         .sum();
+
+  testValidate(
+      &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing8_CUDA) {
+  // Same as 7 but with outer splits instead of inner
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {false, true});
+
+  auto tv2 = makeSymbolicTensor(2);
+  fusion.addInput(tv2);
+
+  auto tv3 = add(tv1, tv2);
+  auto tv4 = sum(tv3, {0, 1});
+  fusion.addOutput(tv4);
+
+  tv4->merge(0, 1);
+  tv4->split(0, 128, false);
+  tv4->split(0, 4, false);
+
+  auto tv5 = tv4->rFactor({0, 1});
+
+  tv5->computeAt(tv4, -1);
+  tv0->computeAt(tv5, -1);
+
+  tv4->axis(0)->parallelize(ParallelType::TIDx);
+
+  const int numel_x = 100;
+  const int numel_y = 200;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto at_t0 = at::randn({numel_x}, options);
+  auto at_t1 = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {at_t0, at_t1});
+  auto cg_outputs = fe.runFusion({at_t0, at_t1});
+
+  auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1)
+                         .to(at::kDouble)
+                         .sum();
+
+  testValidate(
+      &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing9_CUDA) {
+  // Same as 7 but with outer splits instead of inner
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {false, true});
+
+  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
+  fusion.addOutput(tv2);
+
+  auto tv3 = makeSymbolicTensor(3);
+  fusion.addInput(tv3);
+
+  auto tv4 = add(tv3, tv2);
+  fusion.addOutput(tv4);
+
+  const int numel_x = 200;
+  const int numel_y = 300;
+  const int numel_z = 400;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto at_t0 = at::randn({numel_y}, options);
+  auto at_t3 = at::randn({numel_x, numel_y, numel_z}, options);
+  std::vector<IValue> aten_inputs = {at_t0, at_t3};
+
+  auto lparams = schedulePointwise(&fusion, aten_inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs, lparams);
+  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
+
+  auto at_t1 = at_t0.unsqueeze(-1);
+  auto at_t2 = at_t1.mul(2.0);
+
+  auto at_t4 = at_t3.add(at_t2);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {at_t2, at_t4}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing10_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeContigTensor(2);
+  TensorView* tv1 = makeContigTensor(2);
+
+  // Register your inputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // Do math with it, it returns a `Val*` but can be static_casted back to
+  // TensorView
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = add(tv0, tv2);
+
+  // Register your outputs
+  fusion.addOutput(tv3);
+
+  auto tv0_cache = tv0->cacheAfter();
+  auto tv1_cache = tv1->cacheAfter();
+
+  std::vector<TensorView*> tvs = {tv0_cache, tv1_cache, tv2, tv3};
+
+  for (auto tv : tvs) {
+    tv->split(1, 2, false);
+    tv->split(1, 1);
+    tv->split(-1, 4);
+    // [I0, 2, 1, I1/2/4, 4]
+    tv->reorder({{1, 2}, {2, 3}, {3, 1}});
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::TIDx);
+  }
+
+  // For all inputs, computeAt the output inline, temporaries should be squeezed
+  // between them
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  tv0_cache->axis(-1)->parallelize(ParallelType::Vectorize);
+  tv1_cache->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input1 = at::randn({64, 128}, options);
+  at::Tensor input2 = at::rand_like(input1);
+  at::Tensor output = at::empty_like(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1, input2});
+  fe.runFusion({input1, input2}, {output});
+
+  at::Tensor tv2_ref = input2 + 2.0;
+  at::Tensor output_ref = input1 + tv2_ref;
+
+  TORCH_CHECK(output_ref.equal(output));
+}
+
+TEST_F(NVFuserTest, FusionAdvancedIndexing11_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int w = 3, x = 4, y = 7, z = 8;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  auto tv0 = makeSymbolicTensor(4);
+  auto tv1 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
+  auto tv3 = broadcast(tv2, {true, false, true, true});
+  auto tv4 = add(tv3, tv0);
+
+  fusion.addOutput(tv4);
+
+  tv4->merge(0);
+  tv4->merge(1);
+
+  tv4->split(1, 32);
+  tv4->split(0, 1);
+
+  tv4->reorder({{2, 1}});
+
+  tv2->computeAt(tv4, 3);
+
+  tv2->setMemoryType(MemoryType::Global);
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(1)->parallelize(ParallelType::BIDy);
+  tv4->axis(2)->parallelize(ParallelType::Unswitch);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  FusionExecutor fe;
+
+  at::Tensor t0 = at::randn({w, x, y, z}, options);
+  at::Tensor t1 = at::randn({x}, options);
+
+  auto t3 = t1.add(1.0).unsqueeze(-1).unsqueeze(-1);
+  auto aten_output = t3.add(t0);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+// Intended to stress the lowering of our code generator
+TEST_F(NVFuserTest, FusionAdvancedLowering1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({9, 5});
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
+  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
+  TensorView* tv4 = sum(tv3, {1});
+
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv4);
+
+  tv4->split(1, 4);
+  auto tv5 = tv4->rFactor({2});
+
+  tv1->computeAt(tv5, 2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(1);
+  at::Tensor aten_input = at::randn({9, 5}, options);
+
+  auto t1 = aten_input.add(1.0);
+  auto t2 = t1.add(2.0);
+  auto t3 = t1.add(3.0);
+  auto t4 = t3.sum(1);
+
+  std::vector<at::Tensor> aten_outputs = {t2, t4};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedLowering2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Progressively broadcast tensors
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  TensorView* tv2 = makeSymbolicTensor(3);
+  fusion.addInput(tv2);
+
+  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv4 = broadcast(tv3, {false, true});
+  TensorView* tv5 = add(tv4, tv1);
+  TensorView* tv6 = add(tv5, tv2);
+
+  fusion.addOutput(tv6);
+
+  // Split inner dimension
+  tv6->split(1, 4);
+  // Merge middle dims with outer dimensions
+  tv6->merge(2);
+  tv6->merge(0);
+
+  // tv6[I0*I1o, I1i*I2]
+
+  // Compute everything inline
+  tv0->computeAt(tv6, -1);
+
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+  tv6->axis(1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  int x = 13, y = 9, z = 5;
+  at::Tensor t0 = at::randn({y}, options);
+  at::Tensor t1 = at::randn({y, z}, options);
+  at::Tensor t2 = at::randn({x, y, z}, options);
+
+  auto t3 = t0.add(1.0);
+  auto t4 = t3.unsqueeze(-1);
+  auto t5 = t4.add(t1);
+  auto t6 = t5.add(t2);
+
+  std::vector<IValue> aten_inputs = {t0, t1, t2};
+  std::vector<at::Tensor> aten_outputs = {t6};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+// TODO: Complete test
+TEST_F(NVFuserTest, FusionAdvancedLowering3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({1, -1});
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // [b0, i1]
+  auto tv2 = add(tv0, IrBuilder::create<Double>(2.0));
+
+  // [i0, i1]
+  auto tv3 = add(tv1, IrBuilder::create<Double>(3.0));
+
+  // [b0, i1]
+  auto tv4 = add(tv2, IrBuilder::create<Double>(4.0));
+
+  // [io, i1]
+  auto tv5 = add(tv2, tv3);
+
+  fusion.addOutput(tv4);
+  fusion.addOutput(tv5);
+
+  tv0->computeAt(tv4, -1);
+
+  tv3->setMemoryType(MemoryType::Global);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  int x = 13, y = 9;
+  at::Tensor t0 = at::randn({1, y}, options);
+  at::Tensor t1 = at::randn({x, y}, options);
+
+  auto t4 = t0 + 2 + 4;
+  auto t5 = t0 + 2 + t1 + 3;
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+  std::vector<at::Tensor> aten_outputs = {t4, t5};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+// This excercises indexing with broadcast root axes. Non-broadcast
+// axes need to be preferred when propagating index exprs to root
+// axes. See, e.g., Index::getConsumerIndex_impl.
+TEST_F(NVFuserTest, FusionAdvancedLowering4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = broadcast(tv0, {false, true});
+  auto tv2 = broadcast(tv1, {false, false, true});
+  auto tv3 = makeSymbolicTensor(3);
+  fusion.addInput(tv3);
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  tv4->merge(1)->merge(0);
+  tv4->split(0, 8);
+  tv0->computeAt(tv4, 1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int bx = 10;
+  const int by = 20;
+  const int bz = 30;
+  at::Tensor t0 = at::randn({bx}, options);
+  at::Tensor t3 = at::randn({bx, by, bz}, options);
+  std::vector<IValue> aten_inputs = {t0, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output =
+      t0.unsqueeze(-1).expand({bx, by}).unsqueeze(-1).expand({bx, by, bz}) + t3;
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedLowering5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({5, 4, 3});
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = makeConcreteTensor({5, 3});
+  fusion.addInput(tv1);
+
+  auto tv2 = broadcast(tv1, {false, true, false});
+
+  auto tv3 = add(tv0, tv2);
+
+  fusion.addOutput(tv3);
+
+  tv2->merge(0);
+  tv1->computeAt(tv2, 1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(1);
+  at::Tensor t0 = at::randn({5, 4, 3}, options);
+  at::Tensor t1 = at::randn({5, 3}, options);
+  auto t2 = t1.unsqueeze(1);
+  auto t3 = t0 + t2;
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+  std::vector<at::Tensor> aten_outputs = {t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedLowering6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({5, 4, 3});
+  fusion.addInput(tv0);
+  auto tv1 = makeConcreteTensor({4});
+  fusion.addInput(tv1);
+  auto tv2 = unaryOp(UnaryOpType::Set, tv0);
+  auto tv3 = unaryOp(UnaryOpType::Set, tv1);
+
+  auto tv4 = sum(tv2, {0, 2});
+  auto tv5 = add(tv4, tv3);
+  fusion.addOutput(tv5);
+
+  auto tv6 = broadcast(tv3, {true, false, true});
+  auto tv7 = add(tv2, tv6);
+  fusion.addOutput(tv7);
+
+  tv2->computeAt(tv4, -1, ComputeAtMode::BestEffort);
+  tv3->computeAt(tv7, -1, ComputeAtMode::BestEffort);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(1);
+  at::Tensor t0 = at::randn({5, 4, 3}, options);
+  at::Tensor t1 = at::randn({4}, options);
+
+  auto t2 = t0;
+  auto t3 = t1;
+
+  std::vector<int64_t> reduction_axes{0, 2};
+  auto t4 = t2.sum(reduction_axes);
+  auto t5 = add(t4, t3);
+  auto t6 = t3.unsqueeze(0).unsqueeze(-1);
+  auto t7 = t2.add(t6);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+  std::vector<at::Tensor> aten_outputs = {t5, t7};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+// Test a simple Gemm but also play around with fusion executor features
+TEST_F(NVFuserTest, FusionSimpleGemm_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2); // M, K
+  TensorView* tv1 = makeSymbolicTensor(2); // K, N
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = broadcast(tv0, {false, false, true});
+  // tv2[I0, I1, B] = tv0[I0, I1]
+
+  TensorView* tv3 = broadcast(tv1, {true, false, false});
+  // tv3[B, I1, I2] = tv1[I1, I2]
+
+  // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2]
+  TensorView* tv4 = mul(tv2, tv3);
+  // tv5[I0, R1, I2] = tv4[I0, I1, I2]
+  TensorView* tv5 = sum(tv4, {1});
+  fusion.addOutput(tv5);
+
+  tv5->split(1, 32);
+  // tv5[I0, R1o, R1i{32}, I2]
+
+  auto tv6 = tv5->rFactor({1});
+  // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2]
+  // tv5[I0,    , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2]
+
+  tv5->split(0, 4);
+  tv5->split(-1, 4);
+  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
+  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
+
+  tv0->computeAt(tv5, -1);
+  tv1->computeAt(tv5, -1);
+
+  // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
+  // tv5[I0o, I0i{4},    , R1i{32}, I2o, I2i{4}]
+  //--> (line symbolizes compute at location)
+  // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
+  // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
+  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
+
+  tv0->computeAt(tv6, -1);
+  tv1->computeAt(tv6, -1);
+  // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
+  // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
+  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
+
+  tv5->axis(0)->parallelize(ParallelType::BIDz);
+  tv5->axis(1)->parallelize(ParallelType::TIDz);
+
+  tv5->axis(-2)->parallelize(ParallelType::BIDy);
+  tv5->axis(-1)->parallelize(ParallelType::TIDy);
+
+  tv5->axis(2)->parallelize(ParallelType::TIDx);
+  tv6->axis(2)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 65, K = 33, N = 17;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4));
+  // Lets specify a few bounds in launch params to make sure it works
+  fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4));
+
+  // Make sure bad launch params throws
+  // TODO: Re-enable once we have parallelization validation in.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6)));
+
+  // Don't specify any launch params
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto aten_output = t0.to(at::kDouble).matmul(t1.to(at::kDouble));
+
+  testValidate(
+      &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Softmax with a 1D tensor. Parallelized only with a single thread block.
+TEST_F(NVFuserTest, FusionSoftmax1D_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int tidx = 128;
+  const int dimx = 1000;
+
+  // Set up your input tensor views
+  TensorView* input_tv0 = makeSymbolicTensor(1);
+  fusion.addInput(input_tv0);
+
+  TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0);
+  TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
+  TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {true});
+
+  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
+  // computed at sum_exp_rf_tv8.
+  TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0);
+
+  TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);
+
+  fusion.addOutput(output_tv4);
+
+  bcast_sum_tv3->split(0, tidx);
+
+  sum_exp_tv2->split(-1, tidx);
+  TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});
+
+  output_tv4->split(-1, tidx);
+
+  exp_tv1->computeAt(sum_exp_rf_tv5, -1);
+  exp_tv1_copy->computeAt(output_tv4, -1);
+
+  TensorView* tensors_to_parallelize[] = {
+      sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};
+
+  for (auto tv : tensors_to_parallelize) {
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({dimx}, options);
+  at::Tensor cg_output = at::empty({dimx}, options);
+  at::Tensor t3_output = at::empty_like(cg_output, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  fe.runFusion({t0}, {cg_output});
+
+  auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false);
+
+  testValidate(&fusion, {cg_output}, {t0}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Softmax with a 1D tensor with input normalization.
+TEST_F(NVFuserTest, FusionSoftmax1DNormalized_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int tidx = 128;
+  const int dimx = 1000;
+
+  // Set up your input tensor views
+  TensorView* input_tv0 = makeSymbolicTensor(1);
+  fusion.addInput(input_tv0);
+
+  // Normalize with the max value before computing exp.
+  TensorView* max_val_tv1 = reductionOp(
+      BinaryOpType::Max, {-1}, IrBuilder::create<Double>(0), input_tv0);
+  TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {true});
+  TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2);
+  TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3);
+  TensorView* sum_exp_tv5 = sum(exp_tv4, {-1});
+  TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {true});
+
+  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
+  // computed at sum_exp_rf_tv8.
+  TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2);
+  TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy);
+
+  TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6);
+
+  fusion.addOutput(output_tv7);
+  bcast_max_tv2->split(0, tidx);
+  bcast_sum_tv6->split(0, tidx);
+
+  max_val_tv1->split(-1, tidx);
+  TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2});
+
+  sum_exp_tv5->split(-1, tidx);
+  TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2});
+
+  output_tv7->split(-1, tidx);
+
+  sub_tv3->computeAt(sum_exp_rf_tv9, -1);
+  sub_tv3_copy->computeAt(output_tv7, -1);
+
+  TensorView* tensors_to_parallelize[] = {
+      max_val_tv1,
+      bcast_max_tv2,
+      sum_exp_tv5,
+      bcast_sum_tv6,
+      output_tv7,
+      max_val_rf_tv8,
+      sum_exp_rf_tv9};
+
+  for (auto tv : tensors_to_parallelize) {
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({dimx}, options);
+  at::Tensor t3_output = at::empty({dimx}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto aten_output = at::_softmax(input.to(at::kDouble), -1, false);
+
+  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Softmax with a 3D tensor, where the inner-most 3rd dimension is
+// normalized. Pallelized with multiple thread blocks.
+TEST_F(NVFuserTest, FusionSoftmax3D_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int tidx = 32;
+  const int dimx = 32;
+  const int dimy = 16;
+  const int dimz = 130;
+
+  // Set up your input tensor views
+  TensorView* input_tv0 = makeSymbolicTensor(3);
+  fusion.addInput(input_tv0);
+
+  TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_tv0);
+  TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
+  TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true});
+
+  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
+  // computed at sum_exp_rf_tv8.
+  TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_tv0);
+
+  TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);
+
+  fusion.addOutput(output_tv4);
+
+  bcast_sum_tv3->split(-1, tidx);
+
+  sum_exp_tv2->split(-1, tidx);
+  TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});
+
+  output_tv4->split(-1, tidx);
+
+  exp_tv1->computeAt(sum_exp_rf_tv5, -1);
+  exp_tv1_copy->computeAt(output_tv4, -1);
+
+  TensorView* tensors_to_parallelize[] = {
+      sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};
+
+  for (auto tv : tensors_to_parallelize) {
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::BIDy);
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({dimx, dimy, dimz}, options);
+
+  at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_output = at::_softmax(input.to(at::kDouble), -1, false);
+
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Softmax with a 3D tensor with input normalization.
+TEST_F(NVFuserTest, FusionSoftmax3DNormalized_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int tidx = 32;
+  const int dimx = 32;
+  const int dimy = 16;
+  const int dimz = 130;
+
+  // Set up your input tensor views
+  TensorView* input_tv0 = makeSymbolicTensor(3);
+  fusion.addInput(input_tv0);
+
+  // Normalize with the max value before computing exp.
+  TensorView* max_val_tv1 = reductionOp(
+      BinaryOpType::Max, {-1}, IrBuilder::create<Double>(0), input_tv0);
+  TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {false, false, true});
+  TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2);
+  TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3);
+  TensorView* sum_exp_tv5 = sum(exp_tv4, {-1});
+  TensorView* bcast_sum_tv6 = broadcast(sum_exp_tv5, {false, false, true});
+
+  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
+  // computed at sum_exp_rf_tv8.
+  TensorView* sub_tv3_copy = sub(input_tv0, bcast_max_tv2);
+  TensorView* exp_tv4_copy = unaryOp(UnaryOpType::Exp, sub_tv3_copy);
+
+  TensorView* output_tv7 = div(exp_tv4_copy, bcast_sum_tv6);
+
+  fusion.addOutput(output_tv7);
+
+  bcast_max_tv2->split(-1, tidx);
+  bcast_sum_tv6->split(-1, tidx);
+
+  max_val_tv1->split(-1, tidx);
+  TensorView* max_val_rf_tv8 = max_val_tv1->rFactor({-2});
+
+  sum_exp_tv5->split(-1, tidx);
+  TensorView* sum_exp_rf_tv9 = sum_exp_tv5->rFactor({-2});
+
+  output_tv7->split(-1, tidx);
+
+  sub_tv3->computeAt(sum_exp_rf_tv9, -1);
+  sub_tv3_copy->computeAt(output_tv7, -1);
+
+  TensorView* tensors_to_parallelize[] = {
+      max_val_tv1,
+      bcast_max_tv2,
+      sum_exp_tv5,
+      bcast_sum_tv6,
+      output_tv7,
+      max_val_rf_tv8,
+      sum_exp_rf_tv9};
+
+  for (auto tv : tensors_to_parallelize) {
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::BIDy);
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({dimx, dimy, dimz}, options);
+  at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto aten_output = at::_softmax(input.to(at::kDouble), -1, false);
+
+  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSoftmaxComputeAt_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+
+  auto tv3 = add(tv0, IrBuilder::create<Double>(1.0));
+
+  auto tv4 = mul(tv2, tv3);
+
+  auto tv5 = sum(tv4, {1});
+  auto tv6 = broadcast(tv5, {false, true});
+
+  auto tv7 = sub(tv6, tv4);
+  fusion.addOutput(tv7);
+
+  tv1->computeAt(tv7, 1);
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(tv1->computeAt(tv7, -1));
+}
+
+// Similar to FusionReduction but uses grid reduction
+TEST_F(NVFuserTest, FusionGridReduction1_CUDA) {
+  const int gdimx = 32;
+  const int bdimx = 128;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion).size(),
+      "Could not detect reduction in fusion.");
+
+  tv1->split(1, bdimx);
+  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
+  tv1->split(1, gdimx);
+  // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({1});
+  // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1]
+  // tv1[I0,        R1oi{32},  R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}]
+
+  // Incrementally, can print in between for debugging
+  tv0->computeAt(tv2, 1);
+  tv2->computeAt(tv1, 1);
+
+  // Re do it all at once, because why not.
+  tv0->computeAt(tv1, 1);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDy);
+  tv1->axis(1)->parallelize(ParallelType::BIDx);
+  tv2->axis(2)->parallelize(ParallelType::BIDx);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  int numel_x = 10000;
+  int numel_y = 65000;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+  at::Tensor cg_output = at::empty({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Same test as the above but uses BIDy and TIDx for reduction
+TEST_F(NVFuserTest, FusionGridReduction2_CUDA) {
+  const int gdimy = 32;
+  const int bdimx = 128;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion).size(),
+      "Could not detect reduction in fusion.");
+
+  tv1->split(1, bdimx);
+  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
+  tv1->split(1, gdimy);
+  // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({1});
+  // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1]
+  // tv1[I0,        R1oi{32},  R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}]
+
+  // Incrementally, can print in between for debugging
+  tv0->computeAt(tv2, 1);
+  tv2->computeAt(tv1, 1);
+
+  // Re do it all at once, because why not.
+  tv0->computeAt(tv1, 1);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::BIDy);
+  tv2->axis(2)->parallelize(ParallelType::BIDy);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  int numel_x = 10000;
+  int numel_y = 65000;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+
+  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Same test but uses BIDy and BIDz for reduction. No TID used.
+TEST_F(NVFuserTest, FusionGridReduction3dim1_CUDA) {
+  // Grid reductions when there aren't any threads are serial reductions
+  // keep these numbers low so our error isn't too high compared to normal cuda
+  // reductions
+  const int gdimz = 15;
+  const int gdimy = 9;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion).size(),
+      "Could not detect reduction in fusion.");
+
+  tv1->split(1, gdimy);
+  // tv1[I0, R1o, R1i{128}] = tv0[I0, I1]
+  tv1->split(1, gdimz);
+  // tv1[I0, R1oo, R1oi{32}, R1i{128}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({1});
+  // tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}] = tv0[I0, I1]
+  // tv1[I0,        R1oi{32},  R1i{128}] = tv2[I0, R1oo, Ir1oi{32}, Ir1i{128}]
+
+  // Incrementally, can print in between for debugging
+  tv0->computeAt(tv2, 1);
+  tv2->computeAt(tv1, 1);
+
+  // Re do it all at once, because why not.
+  tv0->computeAt(tv1, 1);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::BIDz);
+  tv2->axis(2)->parallelize(ParallelType::BIDz);
+  tv1->axis(-1)->parallelize(ParallelType::BIDy);
+  tv2->axis(-1)->parallelize(ParallelType::BIDy);
+
+  int numel_x = 100;
+  int numel_y = 6500;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+  at::Tensor cg_output = at::empty({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Same as testGPU_FusionGridReduction3dim1 but reduces dimension 0
+TEST_F(NVFuserTest, FusionGridReduction3dim0_CUDA) {
+  // Grid reductions when there aren't any threads are serial reductions
+  // keep these numbers low so our error isn't too high compared to normal cuda
+  // reductions
+  const int gdimz = 15;
+  const int gdimy = 9;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[R0, I1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {0}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion).size(),
+      "Could not detect reduction in fusion.");
+
+  tv1->split(0, gdimy);
+  // tv1[R0o, R0i{128}, I1] = tv0[I0, I1]
+  tv1->split(0, gdimz);
+  // tv1[R0oo, R0oi{32}, R0i{128}, I1] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({0});
+  // tv2[R0oo, I0oi{32}, I0i{128}, I1] = tv0[I0, I1]
+  // tv1[      R0oi{32}, R0i{128}, I1] = tv2[R0oo, I0oi{32}, I0i{128}, I1]
+
+  // Note that computeAt isn't going to make anything better as there
+  // is no dynamically sized dimension.
+
+  // Map parallelism as [Serial, BIDz, BIDy, BIDx]
+  tv1->axis(-1)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::BIDx);
+  tv1->axis(-2)->parallelize(ParallelType::BIDy);
+  tv2->axis(-2)->parallelize(ParallelType::BIDy);
+  tv1->axis(-3)->parallelize(ParallelType::BIDz);
+  tv2->axis(-3)->parallelize(ParallelType::BIDz);
+
+  int numel_x = 6500;
+  int numel_y = 100;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto aten_output = input.to(at::kDouble).sum({0});
+
+  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// This is similar to the FusionReduction, but swaps BIDx and TIDx
+TEST_F(NVFuserTest, FusionGridReduction4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int bdimx = 128;
+  const int gdimx = 1024;
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion).size(),
+      "Could not detect reduction in fusion.");
+
+  tv1->split(1, gdimx);
+  // tv1[I0, R1o, R1i{1024}] = tv0[I0, I1]
+  tv1->split(1, 4);
+  // tv1[I0, R1oo, R1oi{4}, R1i{128}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({1});
+  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1]
+  // tv1[I0,        R1oi{4},  R1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}]
+
+  TensorView* tv3 = tv1->rFactor({1});
+  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}] = tv0[I0, I1]
+  // tv3[I0,        R1oi{4}, Ir1i{1024}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{1024}]
+  // tv1[I0,                  R1i{1024}] = tv3[I0,        R1oi{4}, Ir1i{1024}]
+
+  // Incrementally, can print in between for debugging
+  tv0->computeAt(tv2, 1);
+  tv2->computeAt(tv3, 1);
+  tv3->computeAt(tv1, 1);
+
+  // Re do it all at once, because why not.
+  tv0->computeAt(tv1, 1);
+
+  tv2->axis(2)->parallelize(ParallelType::Unroll);
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+
+  tv1->axis(-1)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::BIDx);
+  tv3->axis(-1)->parallelize(ParallelType::BIDx);
+
+  int numel_x = bdimx;
+  int numel_y = 65000;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+  at::Tensor cg_output = at::empty({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Grid reduction with 2D thread blocks but only TIDx and BIDx are
+// mapped to a reduction dim
+TEST_F(NVFuserTest, FusionGridReduction5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int bdimx = 64;
+  const int bdimy = 16;
+  const int gdimx = 4;
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion).size(),
+      "Could not detect reduction in fusion.");
+
+  tv1->split(1, bdimx);
+  // tv1[I0, R1o, R1i{64}] = tv0[I0, I1]
+  tv1->split(1, gdimx);
+  // tv1[I0, R1oo, R1oi{4}, R1i{64}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({1});
+  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}] = tv0[I0, I1]
+  // tv1[I0,        R1oi{4},  R1i{64}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{64}]
+
+  tv0->computeAt(tv1, 1);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv1->axis(-2)->parallelize(ParallelType::BIDx);
+  tv2->axis(-2)->parallelize(ParallelType::BIDx);
+
+  tv1->axis(0)->parallelize(ParallelType::TIDy);
+
+  int numel_x = bdimy;
+  int numel_y = 6500;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Similar to FusionGridReduction1 but with 3D tensors
+TEST_F(NVFuserTest, FusionGridReduction6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(3);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1, R2] = tv0[I0, I1, I2]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1, 2}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion).size(),
+      "Could not detect reduction in fusion.");
+
+  // Splitting for TID
+  tv1->split(2, 128);
+  // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2]
+
+  // Splitting for BID
+  tv1->split(1, 128);
+
+  // tv1[I0, R1o, R1i{128}, R2o, R2i{128}] = tv0[I0, I1, I2]
+
+  TensorView* tv2 = tv1->rFactor({3});
+  // tv2[I0, I1o, I1i{128}, R2o, I2i{128}]
+  // tv1[I0, R1o, R1i{128},      R2i{128}]
+
+  TensorView* tv3 = tv1->rFactor({1});
+  // tv2[I0, I1o, I1i{128}, R2o, I2i{128}]
+  // tv3[I0, R1o, I1i{128},      I2i{128}]
+  // tv1[I0,      R1i{128},      R2i{128}]
+
+  tv3->computeAt(tv1, 1);
+  tv2->computeAt(tv3, 3);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDy);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv1->axis(-2)->parallelize(ParallelType::BIDx);
+  tv2->axis(-3)->parallelize(ParallelType::BIDx);
+  tv3->axis(-2)->parallelize(ParallelType::BIDx);
+
+  int numel_x = 6500;
+  int numel_y = 200;
+  int numel_z = numel_y;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options);
+  at::Tensor cg_output = at::empty({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_output = input.to(at::kDouble).sum({1, 2});
+
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// See issue #1049
+TEST_F(NVFuserTest, FusionGridReduction7_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  fusion.addOutput(tv1);
+
+  tv1->split(0, 1000);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::BIDy);
+
+  const int numel_x = 1;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x}, options);
+  at::Tensor cg_output = at::empty({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto out = fe.runFusion({input});
+
+  auto aten_output = input.sum({0});
+
+  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionGridReduction8_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  fusion.addOutput(tv1);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+
+  const int numel_x = 2;
+  const int numel_y = 4;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto out = fe.runFusion({input});
+
+  auto aten_output = input.sum({0});
+
+  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionGridReduction9_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1});
+
+  auto tv2 = makeSymbolicTensor(1);
+  fusion.addInput(tv2);
+
+  auto tv3 = add(tv2, tv1);
+  fusion.addOutput(tv3);
+
+  tv1->split(1, 2);
+
+  tv1->axis(1)->parallelize(ParallelType::BIDx);
+  tv1->axis(2)->parallelize(ParallelType::BIDy);
+
+  tv1->computeAt(tv3, 1);
+
+  const int numel_x = 4;
+  const int numel_y = 10;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
+  at::Tensor t2 = at::randn({numel_x}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t2};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_output = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0.sum({1}).add(t2);
+
+  testValidate(&fusion, cg_output, {t0, t2}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionGridReduction10_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {-1});
+  auto tv2 = sum(tv1, {-1});
+  auto tv3 = sum(tv2, {-1});
+
+  fusion.addOutput(tv3);
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  tv1->axis(1)->parallelize(ParallelType::BIDx);
+  tv1->axis(2)->parallelize(ParallelType::TIDy);
+  tv1->axis(3)->parallelize(ParallelType::TIDz);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::BIDx);
+  tv2->axis(2)->parallelize(ParallelType::TIDy);
+
+  tv3->axis(0)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::BIDx);
+
+  tv0->computeAt(tv3, 1);
+
+  const int numel_w = 2;
+  const int numel_x = 3;
+  const int numel_y = 4;
+  const int numel_z = 5;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_output = fe.runFusion({t0});
+
+  auto aten_output = t0.sum({1, 2, 3});
+
+  testValidate(&fusion, cg_output, {t0}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionNonRedAxisBind_CUDA) {
+  int bid_x = 3;
+  int tid_x = 2;
+  int red_dim = 0;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = reductionOp(
+      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  tv1->split(-1, tid_x);
+  tv1->axis(-2)->parallelize(ParallelType::BIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({16, bid_x * tid_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto aten_output = input.to(at::kDouble).sum({red_dim});
+
+  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSplitBCast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* input_tv0 = makeSymbolicTensor(3);
+  TensorView* input_tv1 = makeSymbolicTensor(3);
+  fusion.addInput(input_tv0);
+  fusion.addInput(input_tv1);
+
+  TensorView* sum_tv2 = reductionOp(
+      BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), input_tv0);
+  TensorView* bcast_tv3 = broadcast(sum_tv2, {false, false, true});
+  TensorView* output_tv4 = div(input_tv1, bcast_tv3);
+
+  sum_tv2->split(-1, 32);
+  TensorView* sum_rf_tv5 = sum_tv2->rFactor({-2});
+
+  bcast_tv3->split(-1, 32);
+  output_tv4->split(-1, 32);
+
+  sum_rf_tv5->axis(0)->parallelize(ParallelType::BIDx);
+  sum_tv2->axis(0)->parallelize(ParallelType::BIDx);
+  bcast_tv3->axis(0)->parallelize(ParallelType::BIDx);
+  output_tv4->axis(0)->parallelize(ParallelType::BIDx);
+
+  sum_rf_tv5->axis(1)->parallelize(ParallelType::BIDy);
+  sum_tv2->axis(1)->parallelize(ParallelType::BIDy);
+  bcast_tv3->axis(1)->parallelize(ParallelType::BIDy);
+  output_tv4->axis(1)->parallelize(ParallelType::BIDy);
+
+  sum_rf_tv5->axis(-1)->parallelize(ParallelType::TIDx);
+  sum_tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  bcast_tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  output_tv4->axis(-1)->parallelize(ParallelType::TIDx);
+
+  fusion.addOutput(output_tv4);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({32, 32, 128}, options);
+  at::Tensor t1 = at::randn({32, 32, 128}, options);
+  at::Tensor cg_output = at::empty({32, 32, 128}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  fe.runFusion({t0, t1}, {cg_output});
+}
+
+TEST_F(NVFuserTest, FusionBCastInnerDim_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // reduce then broadcast
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = broadcast(tv1, {false, true});
+
+  TORCH_CHECK(!tv2->axis(0)->isReduction() && tv2->axis(1)->isBroadcast());
+}
+
+TEST_F(NVFuserTest, FusionBCastReduce_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+
+  auto tv1 = broadcast(tv0, {true, false, false});
+  auto tv2 = sum(tv1, {1});
+  TORCH_CHECK(
+      tv2->axis(0)->isBroadcast() && tv2->axis(1)->isReduction() &&
+      !tv2->axis(2)->isBroadcast() && !tv2->axis(2)->isReduction());
+}
+
+// Multiple consumer reduction with computeAt
+// https://github.com/csarofeen/pytorch/issues/110
+TEST_F(NVFuserTest, FusionReductionMultiConsumer_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = unaryOp(UnaryOpType::Exp, tv0);
+  auto tv2 =
+      reductionOp(BinaryOpType::Max, {-1}, IrBuilder::create<Double>(0), tv1);
+  auto tv3 =
+      reductionOp(BinaryOpType::Min, {-1}, IrBuilder::create<Double>(0), tv1);
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+  tv1->computeAt(tv2, -1, ComputeAtMode::BestEffort);
+
+  TORCH_CHECK(tv1->getComputeAtPosition() == 2);
+}
+
+TEST_F(NVFuserTest, FusionComputeAtExprOrder1_CUDA) {
+  for (const auto i : c10::irange(2)) {
+    Fusion fusion;
+    FusionGuard fg(&fusion);
+
+    // Set up your input tensor views
+    TensorView* tv0 = makeSymbolicTensor(1);
+    fusion.addInput(tv0);
+
+    auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+    auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+    TensorView* tv3 = add(tv1, tv2);
+    // Set outputs tv2 or tv1 and then tv3
+    if (i == 0) {
+      fusion.addOutput(tv2);
+    } else {
+      fusion.addOutput(tv1);
+    }
+    fusion.addOutput(tv3);
+
+    if (i == 0) {
+      tv1->computeAt(tv3, -1);
+    } else {
+      tv2->computeAt(tv3, -1);
+    }
+
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::Tensor aten_input = at::randn({100}, options);
+    std::vector<at::Tensor> aten_outputs = {
+        aten_input + 1, (aten_input + 1) * 2};
+
+    FusionExecutor fe;
+    fe.compileFusion(&fusion, {aten_input});
+    auto cg_outputs = fe.runFusion({aten_input});
+
+    testValidate(
+        &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv3 = add(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  tv3->split(-1, 32);
+
+  tv1->computeAt(tv3, -1);
+  tv2->computeAt(tv3, -2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({100, 100}, options);
+  auto aten_output = (aten_input + 1) * 2;
+
+  at::Tensor cg_output = at::empty_like(aten_input, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int64_t dimx = 13;
+  const int64_t dimy = 15;
+
+  TensorView* tv0 = makeConcreteTensor({dimx, dimy});
+  fusion.addInput(tv0);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
+  TensorView* tv3 = add(tv2, IrBuilder::create<Double>(3));
+  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
+  TensorView* tv5 = mul(tv2, tv4);
+  fusion.addOutput(tv5);
+
+  tv1->computeAt(tv2, 2);
+  tv3->computeAt(tv4, 1);
+  tv4->computeAt(tv5, 2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({dimx, dimy}, options);
+  auto t1 = aten_input.add(1.);
+  auto t2 = t1.add(2.);
+  auto t3 = t2.add(3.);
+  auto t4 = t3.add(4.);
+  auto aten_output = t2.mul(t4);
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionZeroDimComputeAt_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+  TORCH_CHECK(tv2->nDims() == 0);
+  tv1->computeAt(tv2, 0);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({100}, options);
+  auto aten_output = aten_input.to(at::kDouble).sum() + 1;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionZeroDimBroadcast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(0);
+  fusion.addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {true, true});
+  TORCH_CHECK(tv1->nDims() == 2);
+
+  TensorView* tv2 = makeSymbolicTensor(2);
+  fusion.addInput(tv2);
+
+  auto tv3 = add(tv1, tv2);
+  auto tv4 = sum(tv3, {0, 1});
+  fusion.addOutput(tv4);
+
+  tv3->computeAt(tv4, -1);
+  tv3->axis(-2)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDy);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({}, options);
+  at::Tensor t1 = at::randn({10, 10}, options);
+
+  auto aten_output = (t0.unsqueeze(-1).unsqueeze(-1).expand({10, 10}) + t1)
+                         .to(at::kDouble)
+                         .sum();
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+  at::Tensor cg_output = at::empty({}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  fe.runFusion(aten_inputs, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionZeroDimReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int bdimx = 32;
+  const int gdimx = 32;
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  fusion.addOutput(tv1);
+
+  tv1->split(0, bdimx);
+  tv1->split(0, gdimx);
+  auto tv2 = tv1->rFactor({0});
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-2)->parallelize(ParallelType::BIDx);
+  tv2->axis(-2)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({1000}, options);
+  auto aten_output = aten_input.to(at::kDouble).sum();
+
+  at::Tensor cg_output = at::empty({}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBCastAfterReduce_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+  const int tidx = 128;
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+
+  tv1->split(1, tidx);
+  auto tv3 = tv1->rFactor({-2});
+
+  TensorView* tv4 = makeSymbolicTensor(2);
+  fusion.addInput(tv4);
+
+  auto tv5 = add(tv2, tv4);
+  fusion.addOutput(tv5);
+  tv5->split(1, tidx);
+
+  tv3->computeAt(tv5, 1);
+
+  tv2->split(1, tidx);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv5->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+
+  int x = 63, y = 200;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y}, options);
+  at::Tensor t4 = at::randn({x, y}, options);
+
+  auto t3 = t0.to(at::kDouble).sum({1}).unsqueeze(-1).expand({x, y});
+  auto aten_output = t3.add(t4);
+
+  std::vector<IValue> aten_inputs = {t0, t4};
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t4});
+  auto cg_outputs = fe.runFusion({t0, t4});
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionOutputBroadcast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({2, 3});
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = broadcast(tv0, {true, false, true, false, true});
+
+  fusion.addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({2, 3}, options);
+  auto aten_output = aten_input.unsqueeze(2).unsqueeze(1).unsqueeze(0);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReductionKeepDimBasic_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({2, 3, 4, 5, 6});
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = sum(tv0, {0, 2, -1}, /*keep_dim=*/true);
+
+  fusion.addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({2, 3, 4, 5, 6}, options);
+  auto aten_output =
+      aten_input.to(at::kDouble).sum({0, 2, -1}, /*keepdim=*/true);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) {
+  constexpr int bid_x = 80;
+  constexpr int tid_x = 4096;
+  constexpr int red_dim = 1;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x});
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = reductionOp(
+      BinaryOpType::Add,
+      {red_dim},
+      IrBuilder::create<Double>(0),
+      tv0,
+      /*keep_dim=*/true);
+
+  fusion.addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({bid_x, tid_x}, options);
+  auto aten_output =
+      aten_input.to(at::kDouble).sum({red_dim}, /*keepdim=*/true);
+
+  // Apply reduction heuristic
+  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  scheduleReduction(&fusion, *reduction_params);
+
+  auto lparams = reduction_params->lparams;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionSumTo_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> tensor_shape{2, 3, 4, 5, 6};
+  std::vector<int64_t> sum_to_shape{1, 5, 6};
+
+  std::vector<int64_t> tensor_shape_ref{2, 3, 4, 5, 6};
+  std::vector<int64_t> sum_to_shape_ref{1, 5, 6};
+
+  std::vector<Int*> sum_to_symb;
+  std::transform(
+      sum_to_shape.begin(),
+      sum_to_shape.end(),
+      std::back_inserter(sum_to_symb),
+      [](int s) -> Int* { return IrBuilder::create<Int>(s); });
+
+  TensorView* tv0 = makeConcreteTensor(tensor_shape);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = sum_to(tv0, sum_to_symb);
+  fusion.addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn(tensor_shape_ref, options);
+  auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  TORCH_CHECK(
+      cg_outputs[0].dim() == static_cast<int64_t>(sum_to_shape.size()),
+      "sum_to not keeping the final dimension");
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSumToNoop_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> tensor_shape{4, 5, 6};
+  std::vector<int64_t> sum_to_shape{4, 5, 6};
+
+  std::vector<int64_t> tensor_shape_ref{4, 5, 6};
+  std::vector<int64_t> sum_to_shape_ref{4, 5, 6};
+
+  std::vector<Int*> sum_to_symb;
+  std::transform(
+      sum_to_shape.begin(),
+      sum_to_shape.end(),
+      std::back_inserter(sum_to_symb),
+      [](int s) -> Int* { return IrBuilder::create<Int>(s); });
+
+  TensorView* tv0 = makeConcreteTensor(tensor_shape);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = sum_to(tv0, sum_to_symb);
+
+  // Dummy operator to avoid tv0 both input and output
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(0));
+  fusion.addOutput(tv2);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn(tensor_shape_ref, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+  auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref);
+
+  TORCH_CHECK(
+      cg_outputs[0].dim() == static_cast<int64_t>(sum_to_shape.size()),
+      "sum_to not keeping the final dimension");
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReductionScheduler_CUDA) {
+  constexpr int bid_x = 80;
+  constexpr int tid_x = 4096;
+  constexpr int red_dim = 1;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = reductionOp(
+      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({bid_x, tid_x}, options);
+  auto aten_output = aten_input.to(at::kDouble).sum({red_dim});
+
+  // Apply reduction heuristic
+  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  scheduleReduction(&fusion, *reduction_params);
+
+  auto lparams = reduction_params->lparams;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  // no broadcasting needed, omitting the last optional argument;
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+// This test checks if our system could correctly handles the case where both
+// reduction and trivial reduction exist in the fusion. Trivial reduction
+// deserve testing because trivial reduction is handled more like a broadcasting
+// rather than a reduction.
+TEST_F(NVFuserTest, FusionReductionWithTrivialReduction_CUDA) {
+  constexpr int bid_x = 80;
+  constexpr int tid_x = 4096;
+
+  std::vector<std::vector<int64_t>> shapes = {
+      {-1, -1, 1}, {-1, 1, -1}, {1, -1, -1}};
+
+  for (auto shape : shapes) {
+    std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+    Fusion& fusion = *fusion_ptr;
+    FusionGuard fg(&fusion);
+
+    std::vector<std::vector<int64_t>> reduction_dims = {
+        {0},
+        {1},
+        {2},
+        {0, 1},
+        {0, 2},
+        {1, 2},
+        {0, 1, 2},
+    };
+
+    // Set up your input tensor views
+    TensorView* tv0 = makeConcreteTensor(shape);
+    fusion.addInput(tv0);
+
+    for (auto rdims : reduction_dims) {
+      std::vector<int> rdims_(rdims.begin(), rdims.end());
+      auto tv = sum(tv0, rdims_);
+      fusion.addOutput(tv);
+    }
+
+    const auto options =
+        at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+    auto concrete_shape = shape;
+    std::deque<int64_t> concrete_values = {bid_x, tid_x};
+    for (auto& s : concrete_shape) {
+      if (s == -1) {
+        s = concrete_values.front();
+        concrete_values.pop_front();
+      }
+    }
+
+    at::Tensor aten_input = at::randn(concrete_shape, options);
+    std::vector<at::Tensor> aten_outputs;
+    for (auto rdims : reduction_dims) {
+      aten_outputs.push_back(aten_input.sum(rdims));
+    }
+
+    FusionExecutorCache executor_cache(std::move(fusion_ptr));
+    auto cg_outputs = executor_cache.runFusionWithInputs({aten_input});
+
+    testValidate(
+        &fusion,
+        cg_outputs,
+        {aten_input},
+        aten_outputs,
+        __LINE__,
+        __FILE__,
+        "");
+  }
+}
+
+// Simple reduction parallelized on a symbolic size.
+TEST_F(NVFuserTest, FusionSymbolicReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  // tv1[I0, R1] = tv0[I0, I1]
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  // Interface should just be a direct split with a Parallel type. We can
+  // include the parallelize call if we do this.
+  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
+  // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({1});
+  // tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}] = tv0[I0, I1]
+  // tv1[I0,        R1oi{4},  R1i{BIDx}] = tv2[I0, R1oo, Ir1oi{4}, Ir1i{BIDx}]
+
+  // Incrementally, can print in between for debugging
+  tv0->computeAt(tv2, 1);
+  tv2->computeAt(tv1, 1);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+
+  int numel_x = 65000;
+  int numel_y = 1025;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
+  auto aten_output = aten_input.to(at::kDouble).sum({1});
+
+  // How many threads to use for the block reduction
+  int runtime_threadIdx_dim = 128;
+
+  LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) {
+  const std::vector<int> red_dims = {0, 2};
+  // Copy is because CodeGen requires int and Pytorch requires int64_t
+  // for a vector of reduction dimensions
+  const std::vector<int64_t> red_dims64 = {0, 2};
+  const std::vector<int64_t> tensor_dims_in = {5, 10, 15, 20};
+  const std::vector<int64_t> tensor_dims_out = {10, 20};
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = reductionOp(
+      BinaryOpType::Add, red_dims, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn(tensor_dims_in, options);
+  auto aten_output = aten_input.to(at::kDouble).sum(red_dims64);
+  at::Tensor cg_output = at::empty(tensor_dims_out, options);
+
+  // Apply reduction heuristic
+  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  scheduleReduction(&fusion, *reduction_params);
+  auto lparams = reduction_params->lparams;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  fe.runFusion({aten_input}, {cg_output}, lparams);
+
+  testValidate(
+      &fusion,
+      {cg_output},
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) {
+  const std::vector<int> red_dims = {1, 3};
+  // Copy is because CodeGen requires int and Pytorch requires int64_t
+  // for a vector of reduction dimensions
+  const std::vector<int64_t> red_dims64 = {1, 3};
+  const std::vector<int64_t> tensor_dims_in = {5, 10, 15, 20};
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = reductionOp(
+      BinaryOpType::Add, red_dims, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn(tensor_dims_in, options);
+  auto aten_output = aten_input.to(at::kDouble).sum(red_dims64);
+
+  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  scheduleReduction(&fusion, *reduction_params);
+  auto lparams = reduction_params->lparams;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) {
+  std::vector<DataType> dtypes = {
+      DataType::Double, DataType::Float, DataType::Half};
+  // TODO: add test for complex. Currently complex fails with the following
+  // NVRTC compilation error message:
+  //   error: no suitable user-defined conversion from
+  //   "CudaCodeGen::std::complex<double>" to "CudaCodeGen::std::complex<float>"
+  //   exists
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+  if (at::cuda::getDeviceProperties(0)->major >= 8) {
+    dtypes.insert(dtypes.end(), DataType::BFloat16);
+  }
+#endif
+
+  std::vector<int> red_dims;
+
+  // Tried to cut down the number iterations with just
+  // doing every other power of 2.
+  for (int i = 1; i <= 1024 * 1024; i <<= 2) {
+    red_dims.push_back(i);
+  }
+
+  for (auto dtype : dtypes) {
+    at::ScalarType aten_dtype = data_type_to_aten(dtype);
+    for (auto& rdim : red_dims) {
+      Fusion fusion;
+      FusionGuard fg(&fusion);
+
+      bool is_fp16 = dtype == DataType::Half;
+      bool is_bf16 = dtype == DataType::BFloat16;
+
+      TensorView* tv0 = makeSymbolicTensor(1, dtype);
+      fusion.addInput(tv0);
+
+      TensorView* tv0_cast = tv0;
+      if (is_fp16 || is_bf16) {
+        tv0_cast = castOp(DataType::Float, tv0);
+      }
+
+      TensorView* tv1 = sum(tv0_cast, {0});
+
+      TensorView* tv1_cast = tv1;
+      if (is_fp16) {
+        tv1_cast = castOp(DataType::Half, tv1);
+      }
+      if (is_bf16) {
+        tv1_cast = castOp(DataType::BFloat16, tv1);
+      }
+
+      fusion.addOutput(tv1_cast);
+
+      auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
+
+      at::Tensor aten_input = at::randn({rdim}, options);
+      auto aten_output = aten_input.to(at::kDouble).sum({0});
+
+      auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+      TORCH_CHECK(reduction_params != nullptr, "Reduction is not found!");
+      scheduleReduction(&fusion, *reduction_params);
+      auto lparams = reduction_params->lparams;
+
+      FusionExecutor fe;
+      fe.compileFusion(&fusion, {aten_input}, lparams);
+      auto cg_outputs = fe.runFusion({aten_input}, lparams);
+
+      testValidate(
+          &fusion,
+          cg_outputs,
+          {aten_input},
+          {aten_output},
+          __LINE__,
+          __FILE__,
+          "",
+          lparams);
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) {
+  std::vector<DataType> dtypes = {
+      DataType::Double, DataType::Float, DataType::Half};
+  // TODO: add complex support. Currently, complex fails with the following
+  // NVRTC compilation error:
+  //   error: no instance of overloaded function "__shfl_xor_sync" matches the
+  //   argument list
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+  if (at::cuda::getDeviceProperties(0)->major >= 8) {
+    dtypes.insert(dtypes.end(), DataType::BFloat16);
+  }
+#endif
+
+  std::vector<int> red_axis = {1, 0};
+  std::vector<int> output_dims = {160, 320};
+  std::vector<int> red_dims;
+
+  // Tried to cut down the number iterations with just
+  // doing every other power of 2.
+  for (int i = 1; i <= 1024 * 1024; i <<= 2) {
+    red_dims.push_back(i);
+  }
+
+  for (auto dtype : dtypes) {
+    at::ScalarType aten_dtype = data_type_to_aten(dtype);
+    for (auto& axis : red_axis) {
+      for (auto& odim : output_dims) {
+        for (auto& rdim : red_dims) {
+          Fusion fusion;
+          FusionGuard fg(&fusion);
+
+          bool is_fp16 = dtype == DataType::Half;
+          bool is_bf16 = dtype == DataType::BFloat16;
+
+          TensorView* tv0 = makeSymbolicTensor(2, dtype);
+          fusion.addInput(tv0);
+
+          TensorView* tv0_cast = tv0;
+          if (is_fp16 || is_bf16) {
+            tv0_cast = castOp(DataType::Float, tv0);
+          }
+
+          TensorView* tv1 = sum(tv0_cast, {axis});
+
+          TensorView* tv1_cast = tv1;
+          if (is_fp16) {
+            tv1_cast = castOp(DataType::Half, tv1);
+          }
+          if (is_bf16) {
+            tv1_cast = castOp(DataType::BFloat16, tv1);
+          }
+          fusion.addOutput(tv1_cast);
+
+          auto options =
+              at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
+
+          at::Tensor aten_input =
+              (axis ? at::randn({odim, rdim}, options)
+                    : at::randn({rdim, odim}, options));
+
+          auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+          TORCH_CHECK(reduction_params != nullptr, "Reduction is not found!");
+          scheduleReduction(&fusion, *reduction_params);
+          auto lparams = reduction_params->lparams;
+
+          FusionExecutor fe;
+          fe.compileFusion(&fusion, {aten_input}, lparams);
+          auto cg_outputs = fe.runFusion({aten_input}, lparams);
+          auto aten_output = aten_input.to(at::kDouble).sum({axis});
+          testValidate(
+              &fusion,
+              cg_outputs,
+              {aten_input},
+              {aten_output},
+              __LINE__,
+              __FILE__,
+              "",
+              lparams);
+        }
+      }
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionCacheBefore_CUDA) {
+  // TVM Cache Write
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(3.0));
+  fusion.addInput(tv0);
+  fusion.addOutput(tv2);
+
+  // Before: TV2 = TV1 * 3
+  // After:  TV3 = TV1 * 3;
+  //         TV2 = TV3;
+  TensorView* tv3 = tv2->cacheBefore();
+
+  constexpr int BSX = 32;
+  tv2->split(-1, BSX);
+  tv0->computeAt(tv2, -1);
+
+  // Thread and Block binding
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 32, N = 750;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({M, N}, options);
+  at::Tensor aten_output = (aten_input + 1.0) * 3.0;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionCacheAfter_CUDA) {
+  // TVM Cache Read
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(3.0));
+  fusion.addInput(tv0);
+  fusion.addOutput(tv2);
+
+  // Before: TV1 = TV0 + 1
+  // After:  TV3 = TV0;
+  //         TV1 = TV3 + 1
+  TensorView* tv3 = tv0->cacheAfter();
+
+  constexpr int BSX = 32;
+  tv2->split(-1, BSX);
+  tv0->computeAt(tv2, -1);
+
+  // Thread and Block binding
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 32, N = 457;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({M, N}, options);
+  at::Tensor aten_output = (aten_input + 1.0) * 3.0;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionCacheFork_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(3.0));
+  fusion.addInput(tv0);
+  fusion.addOutput(tv1);
+  fusion.addOutput(tv2);
+  // Before:  TV1 = TV0 + 1
+  //          TV2 = TV1 * 1
+  // Output:  TV1, TV2
+
+  // After:   TV1 = TV0 + 1
+  //          TV3 = TV1
+  //          TV2 = TV1 * 1
+  // Output:  TV3, TV2
+
+  // cacheFork !!does not!! automatically apply ComputeAt to the cache
+  auto tv3 = tv1->cacheFork();
+
+  constexpr int BSX = 32;
+  tv2->split(-1, BSX);
+  tv0->computeAt(tv2, -1);
+
+  // Thread and Block binding
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 32, N = 457;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({M, N}, options);
+  at::Tensor aten_output1 = aten_input + 1.0;
+  at::Tensor aten_output2 = aten_output1 * 3.0;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output1, aten_output2},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionCacheIndirect_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  TensorView* tv2 = makeSymbolicTensor(2);
+  TensorView* tv3 = makeSymbolicTensor(2);
+  TensorView* tv4 = sub(tv2, tv3);
+  TensorView* tv5 = add(tv1, tv4);
+  TensorView* tv6 = sub(tv5, tv0);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addInput(tv2);
+  fusion.addInput(tv3);
+  fusion.addOutput(tv6);
+  // t6 = ((t1 + (t2 - t3)) - t0)
+
+  tv5->cacheAfter();
+  tv5->cacheBefore();
+
+  // cacheAfter on inputs placed before schedule
+  constexpr int BSX = 32;
+  tv6->split(-1, BSX);
+  tv2->computeAt(tv6, -1);
+
+  // Thread and Block binding
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+  tv6->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 32, N = 810;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M, N}, options);
+  at::Tensor t1 = at::randn({M, N}, options);
+  at::Tensor t2 = at::randn({M, N}, options);
+  at::Tensor t3 = at::randn({M, N}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
+  at::Tensor aten_output = (t1 + (t2 - t3)) - t0;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionCacheBcast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Algorithm
+  TensorView* tv0 = makeSymbolicTensor(1); // (M, 1)
+  TensorView* tv1 = broadcast(tv0, {false, true});
+  TensorView* tv2 = makeSymbolicTensor(1); // (1, N)
+  TensorView* tv3 = broadcast(tv2, {true, false});
+  TensorView* tv4 = mul(tv1, tv3);
+  fusion.addInput(tv0);
+  fusion.addInput(tv2);
+  fusion.addOutput(tv4);
+
+  // Case 1
+  tv0->cacheAfter();
+
+  // Case 2
+  tv1->cacheBefore();
+
+  // Case 3
+  tv1->cacheAfter();
+
+  // Case 4
+  TensorView* tv8 = tv4->cacheBefore();
+
+  constexpr int BSX = 128;
+  tv4->split(0, BSX);
+  tv4->split(-1, BSX);
+  tv4->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}});
+  // M/BSX, N/BSY, BSX, BSY
+  tv0->computeAt(tv4, 2);
+  tv2->computeAt(tv4, 2);
+  // 0, 1 | 2, 3, 4
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(1)->parallelize(ParallelType::BIDy);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  // Manual Replay on TV3
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv8->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 92, N = 500;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M}, options);
+  at::Tensor t1 = at::randn({N}, options);
+  std::vector<IValue> aten_inputs = {t0, t1};
+  at::Tensor aten_output =
+      t0.to(at::kDouble).unsqueeze(1).matmul(t1.to(at::kDouble).unsqueeze(0));
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionCacheMultiConsumer_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
+  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(2));
+
+  fusion.addInput(tv0);
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv4);
+
+  auto tv5 = tv1->cacheBefore();
+  auto tv6 = tv3->cacheBefore();
+  tv5->setMemoryType(MemoryType::Shared);
+  tv6->setMemoryType(MemoryType::Shared);
+
+  tv1->computeAt(tv2, -1);
+  tv3->computeAt(tv4, -1);
+
+  // Fails because tensor must be recomputed twice
+  // auto tv7 = tv0->cacheAfter();
+
+  constexpr int N = 800;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({N}, options);
+  auto aten_output = (aten_input + 1) + 2;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output, aten_output},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSmem_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Algorithm
+  TensorView* tv0 = makeSymbolicTensor(2); // (M, N)
+  TensorView* tv1 = makeSymbolicTensor(2); // (M, N)
+  TensorView* tv2 = mul(tv0, tv1);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv2);
+
+  // Schedule
+  TensorView* tv3 = tv0->cacheAfter();
+  TensorView* tv4 = tv1->cacheAfter();
+  tv3->setMemoryType(MemoryType::Shared);
+  tv4->setMemoryType(MemoryType::Shared);
+
+  constexpr int BSY = 32;
+  constexpr int BSX = 128;
+  tv2->split(0, BSY);
+  tv2->split(2, BSX);
+  // M/BSX, BSX, N/BSX, BSX
+  tv2->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}});
+  // M/BSX, N/BSX, BSX, BSX
+
+  tv0->computeAt(tv2, 2);
+  tv1->computeAt(tv2, 2);
+
+  // Thread and Block binding
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::BIDy);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  // Manual Binding
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 128, N = 10240;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M, N}, options);
+  at::Tensor t1 = at::randn({M, N}, options);
+  at::Tensor aten_output = mul(t0, t1);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+
+  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+}
+
+TEST_F(NVFuserTest, FusionSmemReduce_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Algorithm
+  TensorView* tv0 = makeSymbolicTensor(3); // M, K, N
+  TensorView* tv1 = sum(tv0, {1}); // M, R, N
+  fusion.addInput(tv0);
+  fusion.addOutput(tv1);
+
+  TensorView* tv2 = tv0->cacheAfter();
+  tv2->setMemoryType(MemoryType::Shared);
+
+  // Schedule
+  constexpr int BSX = 32;
+  tv1->split(2, BSX);
+  tv1->split(1, 128);
+  tv1->split(0, BSX);
+  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
+  tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}});
+  TensorView* tv3 = tv1->rFactor({-2});
+
+  tv0->computeAt(tv1, -2);
+  tv0->computeAt(tv3, -2);
+
+  // Thread and Block binding
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::BIDy);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  // Manual Binding
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 154, K = 45, N = 1524;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({M, K, N}, options);
+  at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1});
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+}
+
+TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Algorithm
+  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
+  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
+  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
+  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
+  TensorView* tv4 = mul(tv2, tv3); // M, K, N
+  TensorView* tv5 = sum(tv4, {1}); // M, R, N
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv5);
+
+  // Schedule
+  constexpr int BSX = 16;
+  tv5->split(2, BSX - 1);
+  tv5->split(1, BSX);
+  tv5->split(0, BSX + 1);
+  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
+  tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}});
+  // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX
+  TensorView* tv6 = tv5->rFactor({-1});
+
+  tv2->setMemoryType(MemoryType::Shared);
+  tv3->setMemoryType(MemoryType::Shared);
+  tv4->setMemoryType(MemoryType::Shared);
+  tv6->setMemoryType(MemoryType::Shared);
+
+  tv0->computeAt(tv5, 3);
+  tv1->computeAt(tv5, 3);
+
+  // Thread and Block binding
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+  tv5->axis(1)->parallelize(ParallelType::BIDy);
+  tv5->axis(-2)->parallelize(ParallelType::TIDy);
+  tv5->axis(-1)->parallelize(ParallelType::TIDx);
+  // Manual Binding
+  tv2->axis(-3)->parallelize(ParallelType::TIDy);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->axis(-3)->parallelize(ParallelType::TIDy);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv6->axis(-3)->parallelize(ParallelType::TIDy);
+  tv6->axis(-2)->parallelize(ParallelType::TIDx);
+
+  // Make sure BIDx is makred as exact (see issue #1119)
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(gpulw.parallelDimensionMap().isExact(ParallelType::BIDx));
+
+  constexpr int M = 154, K = 45, N = 1524;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+  at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+
+  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+}
+
+TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Algorithm
+  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
+  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
+  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
+  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
+  TensorView* tv4 = mul(tv2, tv3); // M, K, N
+  TensorView* tv5 = sum(tv4, {1}); // M, R, N
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv5);
+
+  // Schedule
+  // Remove reduction axis from tv5
+  // tv6 = (M, R, N)
+  // tv5 = (M, N)
+  TensorView* tv6 = tv5->cacheBefore();
+
+  constexpr int BSX = 16;
+  tv5->split(1, BSX);
+  tv5->split(0, BSX);
+  // M/BSX, BSX, N/BSX, BSX
+  tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}});
+  // tv5 = M/BSX, N/BSX, MSX, NSX
+
+  tv6->computeAt(tv5, 2);
+  tv6->computeAt(tv5, 2);
+
+  tv6->split(-1, BSX);
+  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
+  tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}});
+  // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX
+  TensorView* tv7 = tv6->rFactor({-1});
+  // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr
+  // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX
+
+  tv0->computeAt(tv6, 3);
+  tv1->computeAt(tv6, 3);
+
+  tv0->computeAt(tv7, 3);
+  tv1->computeAt(tv7, 3);
+
+  tv2->setMemoryType(MemoryType::Shared);
+  tv3->setMemoryType(MemoryType::Shared);
+  tv4->setMemoryType(MemoryType::Shared);
+  tv6->setMemoryType(MemoryType::Shared);
+  tv7->setMemoryType(MemoryType::Shared);
+  // Memory Type
+
+  // Thread and Block binding
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+  tv5->axis(1)->parallelize(ParallelType::BIDy);
+  tv5->axis(-2)->parallelize(ParallelType::TIDy);
+  tv5->axis(-1)->parallelize(ParallelType::TIDx);
+  // Manual Binding
+  tv2->axis(-3)->parallelize(ParallelType::TIDy);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->axis(-3)->parallelize(ParallelType::TIDy);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv7->axis(-3)->parallelize(ParallelType::TIDy);
+  tv7->axis(-2)->parallelize(ParallelType::TIDx);
+
+  tv6->axis(-2)->parallelize(ParallelType::TIDy);
+  tv6->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 154, K = 45, N = 1524;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+  at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+
+  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+}
+
+TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* x = makeSymbolicTensor(2);
+  fusion.addInput(x);
+  TensorView* max_val = reductionOp(
+      BinaryOpType::Max,
+      {-1},
+      IrBuilder::create<Double>(std::numeric_limits<float>::lowest()),
+      x); // (M)
+  TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B)
+  TensorView* x_max_sub = sub(x, bcast_max); // (M, N)
+  TensorView* exp = unaryOp(UnaryOpType::Exp, x_max_sub); // (M, N)
+  TensorView* sum_exp = sum(exp, {-1}); // (M, R)
+  TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B)
+  TensorView* softmax = div(exp, bcast_sum); // (M, N)
+  fusion.addOutput(softmax);
+
+  // Read Input into Shared Memory
+  // Load Input + Pwise into shared memory
+  auto cache_x = x->cacheAfter();
+  cache_x->setMemoryType(MemoryType::Shared);
+  exp->setMemoryType(MemoryType::Shared);
+
+  std::vector<TensorView*> all_tensors(
+      {x,
+       cache_x,
+       max_val,
+       bcast_max,
+       x_max_sub,
+       exp,
+       sum_exp,
+       bcast_sum,
+       softmax});
+
+  auto tidx = IrBuilder::create<Int>();
+  fusion.addInput(tidx);
+
+  for (auto tensor : all_tensors) {
+    tensor->split(-1, tidx);
+  }
+
+  auto sum_exp_rf = sum_exp->rFactor({1});
+  all_tensors.push_back(sum_exp_rf);
+
+  // computeAt
+  x->computeAt(x_max_sub, 1);
+  exp->computeAt(softmax, 1);
+  x_max_sub->computeAt(exp, 2);
+
+  softmax->axis(0)->parallelize(ParallelType::BIDx);
+  for (auto tensor : all_tensors) {
+    tensor->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  const int64_t dimx = 1024;
+  const int64_t dimy = 4096;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({dimx, dimy}, options);
+  auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false);
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input, 128});
+  auto cg_outputs = fe.runFusion({aten_input, 128});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input, 128},
+      {aten_output},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int kReductionAxis = 3;
+  std::vector<int64_t> input_shape{10, 10, 10, 67};
+  TensorView* input = makeSymbolicTensor(input_shape.size());
+  fusion.addInput(input);
+
+  auto output = softmax(input, kReductionAxis);
+
+  fusion.addOutput(output);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn(input_shape, options);
+  auto aten_output =
+      at::_softmax(aten_input.to(at::kDouble), kReductionAxis, false);
+
+  auto reduction_params = getPersistentHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+
+  schedulePersistentKernel(&fusion, *reduction_params);
+
+  auto lparams = reduction_params->lparams;
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionTestMaskSoftmax_CUDA) {
+  // This test is testing the usage of all padding tokens
+  // with softmax like Bert might might use in a full padding
+  // sequence.
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int kReductionAxis = 3;
+  std::vector<int64_t> input_shape{256, 16, 128, 128};
+  TensorView* input = makeSymbolicTensor(input_shape.size());
+  TensorView* mask = makeSymbolicTensor(input_shape.size());
+  fusion.addInput(input);
+  fusion.addInput(mask);
+
+  auto out1 = add(input, mask);
+  auto output = softmax(out1, kReductionAxis);
+
+  fusion.addOutput(output);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn(input_shape, options);
+  at::Tensor aten_mask = at::ones(input_shape, options);
+  // -10,000 is used here as a magic number because the padding
+  // tokens need to be a value that gives a value close to zero
+  // as to not influence softmax.  Bert, in particular, does
+  // not use -Infinity because sometimes it will have a
+  // softmax of all padding tokkens that can result a divide by
+  // zero that creates NaN result.
+  aten_mask = aten_mask * -10000.0;
+  auto aten_out1 = aten_input + aten_mask;
+  auto aten_output = at::_softmax(aten_out1, kReductionAxis, false);
+
+  auto reduction_params =
+      getPersistentHeuristics(&fusion, {aten_input, aten_mask});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+
+  schedulePersistentKernel(&fusion, *reduction_params);
+
+  auto lparams = reduction_params->lparams;
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input, aten_mask}, lparams);
+  auto cg_outputs = fe.runFusion({aten_input, aten_mask}, lparams);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input, aten_mask},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> shape{20, 100, 35, 67};
+  std::vector<int64_t> norm_shape{67};
+
+  const size_t kM = shape.size();
+  const size_t kN = norm_shape.size();
+  const size_t kOuterNumDims = kM - kN;
+
+  std::vector<int64_t> outer_shape;
+  for (const auto idx : c10::irange(kOuterNumDims)) {
+    outer_shape.push_back(shape[idx]);
+  }
+  for (const auto idx : c10::irange(kOuterNumDims, kM)) {
+    outer_shape.push_back(1);
+  }
+
+  auto grad_out = makeSymbolicTensor(shape.size());
+  auto input = makeSymbolicTensor(shape.size());
+  auto mean = makeConcreteTensor(outer_shape);
+  auto rstd = makeConcreteTensor(outer_shape);
+  auto weight = makeSymbolicTensor(norm_shape.size());
+  auto bias = makeSymbolicTensor(norm_shape.size());
+  fusion.addInput(grad_out);
+  fusion.addInput(input);
+  fusion.addInput(mean);
+  fusion.addInput(rstd);
+  fusion.addInput(weight);
+  fusion.addInput(bias);
+
+  auto grads = layer_norm_backward(
+      grad_out,
+      input,
+      norm_shape,
+      mean,
+      rstd,
+      weight,
+      bias,
+      {true, true, true});
+
+  fusion.addOutput(grads.grad_input);
+  fusion.addOutput(grads.grad_weight);
+  fusion.addOutput(grads.grad_bias);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_grad_out = at::randn(shape, options);
+  at::Tensor aten_input = at::randn(shape, options);
+  at::Tensor aten_weight = at::randn(norm_shape, options);
+  at::Tensor aten_bias = at::randn(norm_shape, options);
+  auto at_weight = c10::optional<at::Tensor>(aten_weight);
+  auto at_bias = c10::optional<at::Tensor>(aten_bias);
+
+  const float kEps = 1e-5;
+  auto aten_results =
+      at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
+  auto aten_output = std::get<0>(aten_results);
+  auto aten_mean = std::get<1>(aten_results);
+  auto aten_rstd = std::get<2>(aten_results);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  std::vector<IValue> aten_inputs = {
+      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
+  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
+
+  auto aten_gradients = at::native_layer_norm_backward(
+      aten_grad_out.to(at::kDouble),
+      aten_input.to(at::kDouble),
+      norm_shape,
+      aten_mean.to(at::kDouble),
+      aten_rstd.to(at::kDouble),
+      c10::optional<at::Tensor>(aten_weight.to(at::kDouble)),
+      c10::optional<at::Tensor>(aten_bias.to(at::kDouble)),
+      {true, true, true});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      aten_inputs,
+      {std::get<0>(aten_gradients),
+       std::get<1>(aten_gradients),
+       std::get<2>(aten_gradients)},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormBackward_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+  const int64_t NORM_SIZE = 1024;
+  std::vector<int64_t> shape{8, 56, NORM_SIZE};
+  std::vector<int64_t> norm_shape{NORM_SIZE};
+
+  const size_t kM = shape.size();
+  const size_t kN = norm_shape.size();
+  const size_t kOuterNumDims = kM - kN;
+
+  std::vector<int64_t> outer_shape;
+  for (const auto idx : c10::irange(kOuterNumDims)) {
+    outer_shape.push_back(shape[idx]);
+  }
+  for (const auto idx : c10::irange(kOuterNumDims, kM)) {
+    outer_shape.push_back(1);
+  }
+
+  auto grad_out = makeContigTensor(shape.size());
+  auto input = makeContigTensor(shape.size());
+  auto rstd = makeConcreteTensor(outer_shape);
+  auto weight = makeContigTensor(norm_shape.size());
+  fusion.addInput(grad_out);
+  fusion.addInput(input);
+  fusion.addInput(rstd);
+  fusion.addInput(weight);
+
+  auto grads = rms_norm_backward(
+      grad_out, input, norm_shape, rstd, weight, {true, true});
+
+  fusion.addOutput(grads.grad_input);
+  fusion.addOutput(grads.grad_weight);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_grad_out = at::randn(shape, options);
+  at::Tensor aten_input = at::randn(shape, options);
+  at::Tensor aten_weight = at::randn(norm_shape, options);
+  auto at_weight = c10::optional<at::Tensor>(aten_weight);
+
+  const float kEps = 1e-6;
+  auto pow2 = at::pow(aten_input, 2);
+  auto sum = at::sum(pow2, -1, true);
+  auto var = at::mul(sum, 1.0 / NORM_SIZE);
+  auto aten_rstd = at::pow(at::add(var, kEps), -0.5);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  std::vector<IValue> aten_inputs = {
+      aten_grad_out, aten_input, aten_rstd, aten_weight};
+  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
+
+  auto in_mul_rstd = at::mul(aten_input, aten_rstd);
+  auto grad_out_mul = at::mul(aten_grad_out, in_mul_rstd);
+  auto aten_grad_weight = at::sum(grad_out_mul, c10::IntArrayRef{0, 1});
+  auto sum_loss1 = at::sum(at::mul(aten_grad_out, aten_weight), -1, true);
+  auto sum_loss2 = at::sum(
+      at::mul(
+          at::mul(at::mul(aten_grad_out, aten_weight), aten_input), aten_rstd),
+      -1,
+      true);
+
+  const float fH = NORM_SIZE;
+  auto term1 = at::mul(aten_rstd, 1.0 / fH);
+  auto aten_grad_input = at::mul(at::mul(aten_grad_out, fH), aten_weight);
+  aten_grad_input = at::sub(aten_grad_input, sum_loss1);
+  aten_grad_input = at::sub(
+      aten_grad_input, at::mul(at::mul(aten_input, aten_rstd), sum_loss2));
+  aten_grad_input = at::mul(aten_grad_input, term1);
+  testValidate(
+      &fusion,
+      cg_outputs,
+      aten_inputs,
+      {aten_grad_input, aten_grad_weight},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  const float kEps = 1e-5;
+  Double* eps_ptr = IrBuilder::create<Double>(kEps);
+
+  std::vector<int64_t> input_shape{20, 100, 35, 67};
+  std::vector<int64_t> norm_shape{67};
+
+  auto input = makeSymbolicTensor(input_shape.size());
+  fusion.addInput(input);
+
+  auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);
+
+  fusion.addOutput(result.output);
+  fusion.addOutput(result.mean);
+  fusion.addOutput(result.invstd);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn(input_shape, options);
+  c10::optional<at::Tensor> aten_weight = c10::nullopt;
+  c10::optional<at::Tensor> aten_bias = c10::nullopt;
+  auto aten_outputs = at::native_layer_norm(
+      aten_input, norm_shape, aten_weight, aten_bias, kEps);
+
+  // Check reduction axis is same for all reductions
+  // Generate Launch Parameters
+  auto reduction_params = getPersistentHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  auto cg_outputs = fec.runFusionWithInputs({aten_input});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {std::get<0>(aten_outputs),
+       std::get<1>(aten_outputs),
+       std::get<2>(aten_outputs)},
+      __LINE__,
+      __FILE__,
+      "");
+}
+
+TEST_F(NVFuserTest, FusionMagicSchedulerRMSNormalization_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int64_t NORM_SIZE = 1024;
+  const float kEps = 1e-6;
+  Double* eps_ptr = IrBuilder::create<Double>(kEps);
+
+  std::vector<int64_t> input_shape{8, 56, NORM_SIZE};
+  std::vector<int64_t> norm_shape{NORM_SIZE};
+
+  auto input = makeContigTensor(input_shape.size());
+  fusion.addInput(input);
+  auto result = rms_norm(input, norm_shape, nullptr, eps_ptr);
+
+  fusion.addOutput(result.output);
+  fusion.addOutput(result.invstd);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn(input_shape, options);
+  c10::optional<at::Tensor> aten_weight = c10::nullopt;
+
+  auto pow2 = at::pow(aten_input, 2);
+
+  auto sum = at::sum(pow2, -1, true);
+  auto var = at::mul(sum, 1.0 / NORM_SIZE);
+  auto invstd = at::pow(at::add(var, kEps), -0.5);
+  auto output = at::mul(aten_input, invstd);
+  //// Check reduction axis is same for all reductions
+  //// Generate Launch Parameters
+  auto reduction_params = getPersistentHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  auto cg_outputs = fec.runFusionWithInputs({aten_input});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {output, invstd},
+      __LINE__,
+      __FILE__,
+      "");
+}
+
+TEST_F(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) {
+  if (!deviceMajorMinorCheck(7)) {
+    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
+    return;
+  }
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  const float kMomentum = 0.1;
+  const float kEps = 1e-5;
+  const bool kTraining = true;
+  std::vector<int64_t> input_shape{20, 100, 35, 45};
+
+  auto input = makeSymbolicTensor(input_shape.size());
+  auto weight = makeSymbolicTensor(1);
+  auto bias = makeSymbolicTensor(1);
+  auto running_mean = makeSymbolicTensor(1);
+  auto running_var = makeSymbolicTensor(1);
+  fusion->addInput(input);
+  fusion->addInput(weight);
+  fusion->addInput(bias);
+  fusion->addInput(running_mean);
+  fusion->addInput(running_var);
+
+  Double* momentum = IrBuilder::create<Double>(kMomentum);
+  Double* eps = IrBuilder::create<Double>(kEps);
+
+  auto result = batch_norm(
+      input, weight, bias, running_mean, running_var, kTraining, momentum, eps);
+
+  fusion->addOutput(result.output);
+  fusion->addOutput(result.mean);
+  fusion->addOutput(result.invstd);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto at_input = at::randn(input_shape, options);
+  auto at_weight = at::ones({input_shape[1]}, options);
+  auto at_bias = at::zeros({input_shape[1]}, options);
+  auto at_run_mean = at::zeros({input_shape[1]}, options);
+  auto at_run_var = at::ones({input_shape[1]}, options);
+
+  std::vector<IValue> aten_inputs = {
+      at_input, at_weight, at_bias, at_run_mean, at_run_var};
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  auto aten_outputs = at::native_batch_norm(
+      at_input,
+      c10::optional<at::Tensor>(at_weight),
+      c10::optional<at::Tensor>(at_bias),
+      c10::optional<at::Tensor>(at_run_mean),
+      c10::optional<at::Tensor>(at_run_var),
+      kTraining,
+      kMomentum,
+      kEps);
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      aten_inputs,
+      {std::get<0>(aten_outputs),
+       std::get<1>(aten_outputs),
+       std::get<2>(aten_outputs)},
+      __LINE__,
+      __FILE__,
+      "");
+}
+
+TEST_F(NVFuserTest, FusionMagicSchedulerInstanceNormalization_CUDA) {
+  if (!deviceMajorMinorCheck(7)) {
+    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
+    return;
+  }
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  const float kMomentum = 0.1;
+  const float kEps = 1e-5;
+  const bool kUseInputStats = true;
+  std::vector<int64_t> input_shape{20, 100, 35, 45};
+
+  auto input = makeSymbolicTensor(input_shape.size());
+  auto weight = makeSymbolicTensor(1);
+  auto bias = makeSymbolicTensor(1);
+  auto running_mean = makeSymbolicTensor(1);
+  auto running_var = makeSymbolicTensor(1);
+  fusion->addInput(input);
+  fusion->addInput(weight);
+  fusion->addInput(bias);
+  fusion->addInput(running_mean);
+  fusion->addInput(running_var);
+
+  Double* momentum = IrBuilder::create<Double>(kMomentum);
+  Double* eps = IrBuilder::create<Double>(kEps);
+
+  auto result = instance_norm(
+      input,
+      weight,
+      bias,
+      running_mean,
+      running_var,
+      kUseInputStats,
+      momentum,
+      eps);
+
+  fusion->addOutput(result.output);
+  // fusion->addOutput(result.mean);
+  // fusion->addOutput(result.invstd);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto at_input = at::randn(input_shape, options);
+  auto at_weight = at::ones({input_shape[1]}, options);
+  auto at_bias = at::zeros({input_shape[1]}, options);
+  auto at_run_mean = at::zeros({input_shape[1]}, options);
+  auto at_run_var = at::ones({input_shape[1]}, options);
+
+  std::vector<IValue> aten_inputs = {
+      at_input, at_weight, at_bias, at_run_mean, at_run_var};
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+  auto cg_outputs_full = {at_run_mean, at_run_var, cg_outputs[0]};
+
+  auto aten_outputs = at::instance_norm(
+      at_input,
+      c10::optional<at::Tensor>(at_weight),
+      c10::optional<at::Tensor>(at_bias),
+      c10::optional<at::Tensor>(at_run_mean),
+      c10::optional<at::Tensor>(at_run_var),
+      kUseInputStats,
+      kMomentum,
+      kEps,
+      false);
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      aten_inputs,
+      // TODO: can run_mean/run_var be checked here?
+      // fusion_outputs.size() == aten_outputs.size() && aten_outputs.size() ==
+      // fusion->outputs().size() - output_alias_indices.size()
+      {aten_outputs},
+      __LINE__,
+      __FILE__,
+      "");
+}
+
+TEST_F(NVFuserTest, FusionMagicSchedulerInstanceNormalizationBackward_CUDA) {
+  if (!deviceMajorMinorCheck(7)) {
+    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
+    return;
+  }
+  auto fusion_forward = std::make_unique<Fusion>();
+  FusionGuard fg_forward(fusion_forward.get());
+
+  const float kMomentum = 0.1;
+  const float kEps = 1e-5;
+  const bool kUseInputStats = true;
+  const bool channels_last = true;
+  const int B = 2;
+  const int C = 5;
+  const int S = 3;
+  std::vector<int64_t> input_shape{B, C, S, S, S};
+  // explicit channels-last for NVFuser
+  std::vector<int64_t> nvfuser_input_shape{B, S, S, S, C};
+
+  auto input = makeContigTensor(input_shape.size());
+  auto weight = makeContigTensor(1);
+  auto bias = makeContigTensor(1);
+  fusion_forward->addInput(input);
+  fusion_forward->addInput(weight);
+  fusion_forward->addInput(bias);
+
+  Double* momentum = IrBuilder::create<Double>(kMomentum);
+  Double* eps = IrBuilder::create<Double>(kEps);
+  auto result_forward = instance_norm(
+      input,
+      weight,
+      bias,
+      nullptr,
+      nullptr,
+      kUseInputStats,
+      momentum,
+      eps,
+      channels_last);
+  fusion_forward->addOutput(result_forward.output);
+  fusion_forward->addOutput(result_forward.mean);
+  fusion_forward->addOutput(result_forward.invstd);
+
+  FusionExecutorCache executor_cache_forward(std::move(fusion_forward));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto at_input = at::randn(input_shape, options)
+                      .to(at::MemoryFormat::ChannelsLast3d)
+                      .set_requires_grad(true);
+  auto at_input_nvfuser = at_input.clone().detach().permute({0, 2, 3, 4, 1});
+  auto at_weight = at::ones({input_shape[1]}, options).set_requires_grad(true);
+  auto at_weight_nvfuser = at_weight.clone().detach();
+  auto at_bias = at::zeros({input_shape[1]}, options).set_requires_grad(true);
+  auto at_bias_nvfuser = at_bias.clone().detach();
+  std::vector<torch::jit::IValue> aten_inputs_forward = {
+      at_input_nvfuser, at_weight_nvfuser, at_bias_nvfuser};
+  // out, mean, invstd
+  auto outputs_forward =
+      executor_cache_forward.runFusionWithInputs(aten_inputs_forward);
+  auto at_out = at::instance_norm(
+      at_input,
+      c10::optional<at::Tensor>(at_weight),
+      c10::optional<at::Tensor>(at_bias),
+      c10::optional<at::Tensor>(c10::nullopt),
+      c10::optional<at::Tensor>(c10::nullopt),
+      kUseInputStats,
+      kMomentum,
+      kEps,
+      false);
+  auto at_grad =
+      at::randn(input_shape, options).to(at::MemoryFormat::ChannelsLast3d);
+  auto at_grad_nvfuser = at_grad.clone().detach().permute({0, 2, 3, 4, 1});
+  at_out.backward(at_grad);
+  auto fusion_backward = std::make_unique<Fusion>();
+  FusionGuard fg_backward(fusion_backward.get());
+
+  input = makeContigTensor(input_shape.size());
+  auto grad_output = makeContigTensor(input_shape.size());
+  weight = makeContigTensor(1);
+  auto save_mean = makeContigTensor(2);
+  auto save_invstd = makeContigTensor(2);
+  auto dummy = makeContigTensor(0);
+
+  fusion_backward->addInput(input);
+  fusion_backward->addInput(grad_output);
+  fusion_backward->addInput(weight);
+  fusion_backward->addInput(dummy); // dummy for run_mean
+  fusion_backward->addInput(dummy); // dummy for run_var
+  fusion_backward->addInput(save_mean);
+  fusion_backward->addInput(save_invstd);
+
+  auto result_backward = instance_norm_backward(
+      input,
+      grad_output,
+      weight,
+      nullptr,
+      nullptr,
+      save_mean,
+      save_invstd,
+      kUseInputStats,
+      eps,
+      {true, true, true},
+      channels_last);
+
+  fusion_backward->addOutput(result_backward.grad_input);
+  fusion_backward->addOutput(result_backward.grad_weight);
+  fusion_backward->addOutput(result_backward.grad_bias);
+
+  FusionExecutorCache executor_cache_backward(std::move(fusion_backward));
+  std::vector<torch::jit::IValue> aten_inputs_backward = {
+      at_input_nvfuser,
+      at_grad_nvfuser,
+      at_weight_nvfuser,
+      at::empty({}),
+      at::empty({}),
+      outputs_forward[1],
+      outputs_forward[2]};
+  auto outputs_backward =
+      executor_cache_backward.runFusionWithInputs(aten_inputs_backward);
+  outputs_backward[0] = outputs_backward[0].permute({0, 4, 1, 2, 3});
+  testValidate(
+      executor_cache_backward.fusion(),
+      outputs_backward,
+      aten_inputs_backward,
+      {at_input.grad(), at_weight.grad(), at_bias.grad()},
+      __LINE__,
+      __FILE__,
+      "");
+}
+
+TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalShared_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int pixels_per_thread = 64;
+  const int TIDX = 128;
+  const int static_size = pixels_per_thread * TIDX;
+
+  TensorView* sx = makeConcreteTensor({-1, static_size});
+  TensorView* dx = makeSymbolicTensor(2);
+  fusion.addInput(sx);
+  fusion.addInput(dx);
+
+  TensorView* max_sx = reductionOp(
+      BinaryOpType::Max,
+      {-1},
+      IrBuilder::create<Double>(std::numeric_limits<float>::lowest()),
+      sx); // (M)
+  TensorView* max_dx = reductionOp(
+      BinaryOpType::Max,
+      {-1},
+      IrBuilder::create<Double>(std::numeric_limits<float>::lowest()),
+      dx); // (M)
+
+  // Reduction => merge local and shared memory TensorViews
+  TensorView* max_val = binaryOp(BinaryOpType::Max, max_sx, max_dx);
+  TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B)
+
+  TensorView* sx_max_sub = sub(sx, bcast_max); // (M, N)
+  TensorView* dx_max_sub = sub(dx, bcast_max); // (M, N)
+
+  TensorView* sx_exp = unaryOp(UnaryOpType::Exp, sx_max_sub); // (M, N)
+  TensorView* dx_exp = unaryOp(UnaryOpType::Exp, dx_max_sub); // (M, N)
+
+  TensorView* sx_sum_exp = sum(sx_exp, {-1}); // (M, R)
+  TensorView* dx_sum_exp = sum(dx_exp, {-1}); // (M, R)
+
+  // Reduction => merge local and shared memory TensorViews
+  TensorView* sum_exp = binaryOp(BinaryOpType::Add, sx_sum_exp, dx_sum_exp);
+  TensorView* bcast_sum = broadcast(sum_exp, {false, true}); // (M, B)
+
+  TensorView* sx_softmax = div(sx_exp, bcast_sum); // (M, N)
+  TensorView* dx_softmax = div(dx_exp, bcast_sum); // (M, N)
+  fusion.addOutput(sx_softmax);
+  fusion.addOutput(dx_softmax);
+
+  auto sx_cache = sx->cacheAfter();
+  auto dx_cache = dx->cacheAfter();
+  dx_cache->setMemoryType(MemoryType::Shared);
+  dx_exp->setMemoryType(MemoryType::Shared);
+
+  // Reduction and Broadcast Tensors common to both memory TVs
+  std::vector<TensorView*> common_tensors(
+      {max_val, sum_exp, bcast_max, bcast_sum});
+
+  // Static Local Memory TVs
+  std::vector<TensorView*> static_tensors(
+      {sx, sx_cache, max_sx, sx_max_sub, sx_exp, sx_sum_exp, sx_softmax});
+
+  // Dynamic Local Memory TVs
+  std::vector<TensorView*> dynamic_tensors(
+      {dx, dx_cache, max_dx, dx_max_sub, dx_exp, dx_sum_exp, dx_softmax});
+
+  std::vector<TensorView*> all_tensors;
+  all_tensors.insert(
+      all_tensors.end(), common_tensors.begin(), common_tensors.end());
+  all_tensors.insert(
+      all_tensors.end(), static_tensors.begin(), static_tensors.end());
+  all_tensors.insert(
+      all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end());
+
+  // M => M
+  // M, N => M, N/128, 128
+  for (auto tensor : all_tensors) {
+    if (tensor->nDims() > 1) {
+      tensor->split(-1, TIDX);
+    }
+  }
+
+  auto sx_sum_exp_rf = sx_sum_exp->rFactor({1});
+  auto dx_sum_exp_rf = dx_sum_exp->rFactor({1});
+  all_tensors.push_back(sx_sum_exp_rf);
+  all_tensors.push_back(dx_sum_exp_rf);
+
+  // computeAt
+  sx->computeAt(sx_max_sub, 1);
+  dx->computeAt(dx_max_sub, 1);
+
+  sx_exp->computeAt(sx_softmax, 1);
+  dx_exp->computeAt(dx_softmax, 1);
+
+  sx_max_sub->computeAt(sx_exp, 2);
+  dx_max_sub->computeAt(dx_exp, 2);
+
+  sx_softmax->axis(0)->parallelize(ParallelType::BIDx);
+  dx_softmax->axis(0)->parallelize(ParallelType::BIDx);
+  for (auto tensor : all_tensors) {
+    if (tensor->nDims() > 1) {
+      tensor->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  const int64_t dimx = 1024;
+  const int64_t dimy = 16384;
+
+  auto properties = at::cuda::getDeviceProperties(0);
+  const size_t required_smem_size =
+      (dimy - static_size) * sizeof(float) + TIDX * sizeof(float);
+  if (properties->sharedMemPerBlockOptin < required_smem_size) {
+    GTEST_SKIP() << "not enough shared memory space on device to run test: "
+                 << properties->sharedMemPerBlock;
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({dimx, dimy}, options);
+  at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size);
+  at::Tensor aten_dynamic_in =
+      aten_input.narrow(1, static_size, dimy - static_size);
+
+  at::Tensor out = at::zeros({dimx, dimy}, options);
+  at::Tensor cg_static_out = out.narrow(1, 0, static_size);
+  at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size);
+
+  std::vector<at::Tensor> aten_outputs;
+
+  auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false);
+  at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size);
+  at::Tensor aten_dynamic_out =
+      aten_output.narrow(1, static_size, dimy - static_size);
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_static_in, aten_dynamic_in});
+  fe.runFusion(
+      {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out});
+
+  testValidate(
+      &fusion,
+      {cg_static_out, cg_dynamic_out},
+      {aten_static_in, aten_dynamic_in},
+      {cg_static_out, cg_dynamic_out},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int pixels_per_thread = 64;
+  const int TIDX = 128;
+  const int static_size = pixels_per_thread * TIDX;
+
+  TensorView* sx = makeConcreteTensor({-1, static_size});
+  TensorView* dx = makeSymbolicTensor(2);
+  fusion.addInput(sx);
+  fusion.addInput(dx);
+
+  Double* gamma = IrBuilder::create<Double>();
+  Double* beta = IrBuilder::create<Double>();
+  Double* eps = IrBuilder::create<Double>();
+  Int* N = IrBuilder::create<Int>();
+  fusion.addInput(gamma);
+  fusion.addInput(beta);
+  fusion.addInput(eps);
+  fusion.addInput(N);
+
+  // Reduction
+  auto sx_sum = sum(sx, {-1}); // (M, R)
+  auto dx_sum = sum(dx, {-1}); // (M, R)
+  // Reduction => merge local and shared memory TensorViews
+  auto x_sum = binaryOp(BinaryOpType::Add, sx_sum, dx_sum);
+
+  // Broadcast
+  auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B)
+  // Pwise
+  auto x_mean = div(x_sum_bcast, N); // (M, B)
+
+  auto sx_mean_sub = sub(sx, x_mean); // (M, N)
+  auto dx_mean_sub = sub(dx, x_mean); // (M, N)
+
+  auto sx_mean_sub_pow = mul(sx_mean_sub, sx_mean_sub); // (M, N)
+  auto dx_mean_sub_pow = mul(dx_mean_sub, dx_mean_sub); // (M, N)
+
+  // Reduction
+  auto sx_var_sum = sum(sx_mean_sub_pow, {-1}); // (M, R)
+  auto dx_var_sum = sum(dx_mean_sub_pow, {-1}); // (M, R)
+  // Reduction => merge local and shared memory TensorViews
+  auto var_sum = binaryOp(BinaryOpType::Add, sx_var_sum, dx_var_sum);
+
+  // Broadcast
+  auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B)
+  // Pwise
+  auto var = div(var_sum_bcast, N); // (M, B)
+  auto var_eps = add(var, eps); // (M, B)
+  auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B)
+
+  auto sx_norm = mul(sx_mean_sub, rvar);
+  auto dx_norm = mul(dx_mean_sub, rvar);
+
+  auto sx_norm_gamma = mul(sx_norm, gamma);
+  auto dx_norm_gamma = mul(dx_norm, gamma);
+
+  auto sx_norm_gamma_beta = add(sx_norm_gamma, beta);
+  auto dx_norm_gamma_beta = add(dx_norm_gamma, beta);
+
+  fusion.addOutput(sx_norm_gamma_beta);
+  fusion.addOutput(dx_norm_gamma_beta);
+
+  sx_norm_gamma_beta->setContiguity(false);
+  dx_norm_gamma_beta->setContiguity(false);
+
+  // Read Input into Shared Memory
+  // Read Input minus Input_Mean into Shared Memory
+  auto sx_cache = sx->cacheAfter();
+  auto dx_cache = dx->cacheAfter();
+  dx_cache->setMemoryType(MemoryType::Shared);
+  dx_mean_sub->setMemoryType(MemoryType::Shared);
+
+  std::vector<TensorView*> common_tensors(
+      {x_sum, x_sum_bcast, x_mean, var_sum, var_sum_bcast, var, var_eps, rvar});
+
+  std::vector<TensorView*> static_tensors(
+      {sx,
+       sx_cache,
+       sx_sum,
+       sx_mean_sub,
+       sx_mean_sub_pow,
+       sx_var_sum,
+       sx_norm,
+       sx_norm_gamma,
+       sx_norm_gamma_beta});
+
+  std::vector<TensorView*> dynamic_tensors(
+      {dx,
+       dx_cache,
+       dx_sum,
+       dx_mean_sub,
+       dx_mean_sub_pow,
+       dx_var_sum,
+       dx_norm,
+       dx_norm_gamma,
+       dx_norm_gamma_beta});
+
+  std::vector<TensorView*> all_tensors;
+  all_tensors.insert(
+      all_tensors.end(), common_tensors.begin(), common_tensors.end());
+  all_tensors.insert(
+      all_tensors.end(), static_tensors.begin(), static_tensors.end());
+  all_tensors.insert(
+      all_tensors.end(), dynamic_tensors.begin(), dynamic_tensors.end());
+
+  // M => M
+  // M, N => M, N/128, 128
+  for (auto tensor : all_tensors) {
+    if (tensor->nDims() > 1) {
+      tensor->split(-1, TIDX);
+    }
+  }
+
+  // Local Sum => Block Broadcast
+  TensorView* sx_sum_rf = sx_sum->rFactor({1});
+  TensorView* sx_var_sum_rf = sx_var_sum->rFactor({1});
+  TensorView* dx_sum_rf = dx_sum->rFactor({1});
+  TensorView* dx_var_sum_rf = dx_var_sum->rFactor({1});
+  all_tensors.push_back(sx_sum_rf);
+  all_tensors.push_back(sx_var_sum_rf);
+  all_tensors.push_back(dx_sum_rf);
+  all_tensors.push_back(dx_var_sum_rf);
+
+  // ComputeAt
+  sx->computeAt(sx_mean_sub_pow, 1);
+  dx->computeAt(dx_mean_sub_pow, 1);
+
+  var_sum->computeAt(rvar, 1);
+
+  sx_mean_sub_pow->computeAt(sx_var_sum_rf, 2);
+  dx_mean_sub_pow->computeAt(dx_var_sum_rf, 2);
+
+  sx_norm->computeAt(sx_norm_gamma_beta, 2);
+  dx_norm->computeAt(dx_norm_gamma_beta, 2);
+
+  sx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx);
+  dx_norm_gamma_beta->axis(0)->parallelize(ParallelType::BIDx);
+  for (auto tensor : all_tensors) {
+    if (tensor->nDims() > 1) {
+      tensor->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  const int dimx = 1024;
+  const int dimy = 16384;
+  const float kGamma = 1.0f;
+  const float kBeta = 0.0f;
+  const float kEps = 1e-5;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  auto properties = at::cuda::getDeviceProperties(0);
+  const size_t required_smem_size =
+      (dimy - static_size) * sizeof(float) + TIDX * sizeof(float);
+  if (properties->sharedMemPerBlockOptin < required_smem_size) {
+    GTEST_SKIP() << "not enough shared memory space on device to run test: "
+                 << properties->sharedMemPerBlock;
+  }
+
+  at::Tensor aten_input = at::randn({dimx, dimy}, options);
+  at::Tensor aten_static_in = aten_input.narrow(1, 0, static_size);
+  at::Tensor aten_dynamic_in =
+      aten_input.narrow(1, static_size, dimy - static_size);
+
+  at::Tensor out = at::zeros({dimx, dimy}, options);
+  at::Tensor cg_static_out = out.narrow(1, 0, static_size);
+  at::Tensor cg_dynamic_out = out.narrow(1, static_size, dimy - static_size);
+
+  std::vector<IValue> aten_inputs = {
+      aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy};
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+
+  fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out});
+
+  auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1);
+  auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1);
+  auto at_rvar = at::rsqrt(at::add(at_var, kEps));
+  auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar);
+  auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta);
+  at::Tensor aten_static_out = aten_output.narrow(1, 0, static_size);
+  at::Tensor aten_dynamic_out =
+      aten_output.narrow(1, static_size, dimy - static_size);
+
+  testValidate(
+      &fusion,
+      {cg_static_out, cg_dynamic_out},
+      aten_inputs,
+      {aten_static_out, aten_dynamic_out},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  auto x = makeSymbolicTensor(2);
+  Double* gamma = IrBuilder::create<Double>();
+  Double* beta = IrBuilder::create<Double>();
+  Double* eps = IrBuilder::create<Double>();
+  Int* N = IrBuilder::create<Int>();
+  fusion.addInput(x);
+  fusion.addInput(gamma);
+  fusion.addInput(beta);
+  fusion.addInput(eps);
+  fusion.addInput(N);
+
+  // Reduction
+  auto x_sum = sum(x, {-1}); // (M, R)
+  // Broadcast
+  auto x_sum_bcast = broadcast(x_sum, {false, true}); // (M, B)
+  // Pwise
+  auto x_mean = div(x_sum_bcast, N); // (M, B)
+  auto x_mean_sub = sub(x, x_mean); // (M, N)
+  auto x_mean_sub_pow = mul(x_mean_sub, x_mean_sub); // (M, N)
+  // Reduction
+  auto var_sum = sum(x_mean_sub_pow, {-1}); // (M, R)
+  // Broadcast
+  auto var_sum_bcast = broadcast(var_sum, {false, true}); // (M, B)
+  // Pwise
+  auto var = div(var_sum_bcast, N); // (M, B)
+  auto var_eps = add(var, eps); // (M, B)
+  auto rvar = unaryOp(UnaryOpType::Rsqrt, var_eps); // (M, B)
+  auto norm = mul(x_mean_sub, rvar);
+  auto norm_gamma = mul(norm, gamma);
+  auto norm_gamma_beta = add(norm_gamma, beta);
+  fusion.addOutput(norm_gamma_beta);
+
+  // Read Input into Shared Memory
+  // Read Input minus Input_Mean into Shared Memory
+  auto cache_x = x->cacheAfter();
+  cache_x->setMemoryType(MemoryType::Shared);
+  x_mean_sub->setMemoryType(MemoryType::Shared);
+
+  std::vector<TensorView*> all_tensors(
+      {x_sum,
+       x_mean,
+       cache_x,
+       x_sum_bcast,
+       x_mean_sub,
+       x_mean_sub_pow,
+       var_sum,
+       var_sum_bcast,
+       var,
+       var_eps,
+       rvar,
+       norm,
+       norm_gamma,
+       norm_gamma_beta});
+
+  auto tidx = IrBuilder::create<Int>();
+  fusion.addInput(tidx);
+
+  for (auto tensor : all_tensors) {
+    tensor->split(-1, tidx);
+  }
+
+  // Local Sum => Block Broadcast
+  TensorView* x_sum_rf = x_sum->rFactor({1});
+  TensorView* var_sum_rf = var_sum->rFactor({1});
+  all_tensors.push_back(x_sum_rf);
+  all_tensors.push_back(var_sum_rf);
+
+  // ComputeAt
+  x->computeAt(x_mean_sub_pow, 1);
+  var_sum->computeAt(rvar, 1);
+  x_mean_sub_pow->computeAt(var_sum_rf, 2);
+  norm->computeAt(norm_gamma_beta, 2);
+
+  for (auto tv : all_tensors) {
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  const int dimx = 128;
+  const int dimy = 2048;
+  const float kGamma = 1.0f;
+  const float kBeta = 0.0f;
+  const float kEps = 1e-5;
+  const int TIDX = 128;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({dimx, dimy}, options);
+  auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1);
+  auto at_var = at::var(aten_input.to(at::kDouble), -1).unsqueeze(1);
+  auto at_rvar = at::rsqrt(at::add(at_var, kEps));
+  auto at_norm = at::mul(at::sub(aten_input, at_mu), at_rvar);
+  auto aten_output = at::add(at::mul(at_norm, kGamma), kBeta);
+
+  std::vector<IValue> aten_inputs = {
+      aten_input, kGamma, kBeta, kEps, dimy, TIDX};
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  fusion.addInput(tv0);
+  fusion.addOutput(tv1);
+  // tv1[I0, R1] = tv0[I0, I1]
+
+  // Interface should just be a direct split with a Parallel type. We can
+  // include the parallelize call if we do this.
+  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
+  // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({2});
+  tv2->setMemoryType(MemoryType::Shared);
+  // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1]
+  // tv1[I0,        R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}]
+
+  tv0->computeAt(tv1, 1);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+
+  constexpr int numel_x = 65000, numel_y = 1024;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
+  auto aten_output = aten_input.to(at::kDouble).sum({1});
+
+  // How many threads to use for the block reduction
+  constexpr int runtime_threadIdx_dim = 128;
+
+  LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+}
+
+TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Algorithm
+  Int* sym_bsx = IrBuilder::create<Int>();
+  TensorView* tv0 = makeSymbolicTensor(3); // M, K, N
+  fusion.addInput(tv0);
+  fusion.addInput(sym_bsx);
+
+  TensorView* tv1 = sum(tv0, {1}); // M, R, N
+  fusion.addOutput(tv1);
+
+  TensorView* tv2 = tv0->cacheAfter();
+  tv2->setMemoryType(MemoryType::Shared);
+
+  // Schedule
+  constexpr int BSX = 32;
+  tv1->split(2, BSX);
+  tv1->split(1, sym_bsx);
+  tv1->split(0, BSX);
+  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
+  tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}});
+  TensorView* tv3 = tv1->rFactor({-2});
+
+  tv0->computeAt(tv1, -2);
+  tv0->computeAt(tv3, -2);
+
+  // Thread and Block binding
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::BIDy);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  // Manual Binding
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 154, K = 45, N = 1524;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({M, K, N}, options);
+  at::Tensor aten_output = aten_input.to(at::kDouble).sum({1});
+
+  // How many threads to use for the block reduction
+  constexpr int runtime_threadIdx_dim = 128;
+
+  auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input, runtime_threadIdx_dim}, lparams);
+  auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input, runtime_threadIdx_dim},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+
+  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0);
+}
+
+TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  Int* sym_bsx = IrBuilder::create<Int>();
+  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
+  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
+  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
+  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
+  TensorView* tv4 = mul(tv2, tv3); // M, K, N
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addInput(sym_bsx);
+  fusion.addOutput(tv4);
+  // Algorithm
+
+  tv2->setMemoryType(MemoryType::Shared);
+  tv3->setMemoryType(MemoryType::Shared);
+
+  constexpr int BSX = 32;
+  tv4->split(2, BSX);
+  tv4->split(1, sym_bsx);
+  tv4->split(0, BSX);
+  // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX
+  tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}});
+  // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX
+
+  tv0->computeAt(tv4, 3);
+  tv1->computeAt(tv4, 3);
+  // Schedule
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(2)->parallelize(ParallelType::BIDy);
+  // Manual Binding
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  // Thread and Block binding
+
+  constexpr int M = 128, K = 457, N = 1024;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+  at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0));
+  std::vector<IValue> aten_inputs = {t0, t1, BSX};
+
+  LaunchParams lparams(-1, -1, -1, BSX, -1, -1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs, lparams);
+  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      aten_inputs,
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+
+  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
+}
+
+TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Symbolic integers we will use for runtime tiling
+  Int* symbolic_m_tile_dim = IrBuilder::create<Int>(); // bound to threadIdx.z
+  Int* symbolic_split_k_tile_dim =
+      IrBuilder::create<Int>(); // bound to blockIdx.x
+  Int* symbolic_block_k_tile_dim =
+      IrBuilder::create<Int>(); // bound to threadIdx.x
+  // Compile-time integer for tiling
+  int n_smem_tile = 8; // bound to threadIdx.y
+
+  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  // Broadcast tv0 to [M, K, *]
+  TensorView* tv2 = broadcast(tv0, {false, false, true});
+  // Broadcast tv1 to [*, K, N]
+  TensorView* tv3 = broadcast(tv1, {true, false, false});
+
+  // Pointwise multiplication resulting in tv3[M, K, N]
+  TensorView* tv4 = mul(tv2, tv3);
+
+  // Turn the K-dimension of tv4 into a reduction dimension
+  TensorView* tv5 = sum(tv4, {1});
+
+  // Register inputs and outputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv5);
+
+  // Register runtime tile dims as inputs
+  fusion.addInput(symbolic_m_tile_dim);
+  fusion.addInput(symbolic_split_k_tile_dim);
+  fusion.addInput(symbolic_block_k_tile_dim);
+
+  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
+  // dims are inserted
+  // [M, K, N]
+  tv5->split(2, n_smem_tile);
+  tv5->split(1, symbolic_block_k_tile_dim);
+  tv5->split(1, symbolic_split_k_tile_dim);
+  tv5->split(0, symbolic_m_tile_dim);
+  // [Mo, Mi, Koo, Koi, Ki, No, Ni]
+
+  // Reorder so all outer tiles are in the leftmost 3 positions
+  tv5->reorder({{1, 5}, {5, 1}});
+  // [Mo, No, Koo, Koi, Ki, Mi, Ni]
+
+  // Factor out the outer reduction IterDomain, then run the inter-cta
+  // reduction, and intra-cta reduction
+  auto tv6 = tv5->rFactor({2});
+  // [Mo, No, rKoo, rKoi, rKi, Mi, Ni]
+  // [Mo, No,       rKoi, rKi, Mi, Ni]
+
+  // Scope computations
+  tv6->computeAt(tv5, 2);
+  // [Mo, No, rKoo,  Koi,  Ki, Mi, Ni]
+  // [Mo, No,       rKoi, rKi, Mi, Ni]
+
+  // Setup compute at schedule
+  tv0->computeAt(tv6, 3);
+  tv1->computeAt(tv6, 3);
+  tv4->computeAt(tv6, -1);
+  //
+  // T2[Mo,  bNo, Koo, Koi,  Kii,  Mi, bNi] CA(4, 3)
+  // T3[bMo,  No, Koo, Koi,  Kii, bMi,  Ni] CA(4, 3)
+  // T4[ Mo,  No, Koo, Koi,  Kii,  Mi,  Ni]
+  // T6[ Mo,  No, rKoo, Koi, Kii,  Mi,  Ni]
+  // T5[ Mo,  No,      rKoi, rKii, Mi,  Ni]
+
+  // Cache smem tiles
+  tv2->setMemoryType(MemoryType::Shared);
+  tv3->setMemoryType(MemoryType::Shared);
+  tv4->setMemoryType(MemoryType::Local);
+  tv6->setMemoryType(MemoryType::Local);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDz);
+  tv5->axis(1)->parallelize(ParallelType::BIDy);
+
+  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6};
+  for (auto tv : tv_list) {
+    tv->axis(-2)->parallelize(ParallelType::TIDz);
+    tv->axis(-1)->parallelize(ParallelType::TIDy);
+  }
+  tv2->axis(3)->parallelize(ParallelType::TIDx);
+  tv3->axis(3)->parallelize(ParallelType::TIDx);
+  tv4->axis(3)->parallelize(ParallelType::TIDx);
+  tv6->axis(3)->parallelize(ParallelType::TIDx);
+  tv5->axis(2)->parallelize(ParallelType::TIDx);
+
+  tv2->axis(4)->parallelize(ParallelType::BIDx);
+  tv3->axis(4)->parallelize(ParallelType::BIDx);
+  tv4->axis(4)->parallelize(ParallelType::BIDx);
+  tv6->axis(4)->parallelize(ParallelType::BIDx);
+  tv5->axis(3)->parallelize(ParallelType::BIDx);
+
+  constexpr int M = 31, K = 65, N = 33;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+
+  // Runtime tiling
+  int m_tile = 4; // bound to threadIdx.z
+  int split_k = 7; // bound to blockIdx.x
+  int intra_cta = 8; // bound to threadIdx.x
+
+  std::vector<IValue> aten_inputs = {t0, t1, m_tile, split_k, intra_cta};
+  at::Tensor aten_output =
+      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
+
+  FusionExecutor fe;
+  // Generate CUDA and compile with nvRTC
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+
+  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1);
+}
+
+} // namespace jit
+} // namespace torch
+#endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
new file mode 100644
index 0000000000000..d154b454281e1
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
@@ -0,0 +1,9801 @@
+#if defined(USE_CUDA)
+#include <gmock/gmock-matchers.h>
+#include <gtest/gtest.h>
+
+#include <torch/csrc/jit/codegen/cuda/arith.h>
+#include <torch/csrc/jit/codegen/cuda/codegen.h>
+#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
+#include <torch/csrc/jit/codegen/cuda/executor.h>
+#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
+#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
+#include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
+#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
+#include <torch/csrc/jit/codegen/cuda/interface.h>
+#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
+#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
+#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
+#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
+#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
+#include <torch/csrc/jit/codegen/cuda/mutator.h>
+#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
+#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+
+#include <test/cpp/jit/test_utils.h>
+#include <torch/csrc/jit/api/function_impl.h>
+#include <torch/csrc/jit/codegen/cuda/parser.h>
+#include <torch/csrc/jit/ir/irparser.h>
+#include <torch/torch.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include <thread>
+
+// Tests go in torch::jit
+namespace torch {
+namespace jit {
+
+using namespace torch::jit::fuser::cuda;
+using namespace at::indexing;
+
+TEST_F(NVFuserTest, FusionGlobalIntermediate_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  fusion.addInput(tv0);
+  fusion.addOutput(tv1);
+  // tv1[I0, R1] = tv0[I0, I1]
+
+  // Interface should just be a direct split with a Parallel type. We can
+  // include the parallelize call if we do this.
+  tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
+  // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1]
+
+  TensorView* tv2 = tv1->rFactor({2});
+  tv2->setMemoryType(MemoryType::Global);
+  // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1]
+  // tv1[I0,        R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}]
+
+  tv0->computeAt(tv1, 1);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+
+  constexpr int numel_x = 65000, numel_y = 1024;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+
+  // How many threads to use for the block reduction
+  constexpr int runtime_threadIdx_dim = 128;
+
+  auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input}, lparams);
+  auto cg_outputs = fe.runFusion({input}, lparams);
+
+  auto aten_output = input.to(at::kDouble).sum({1});
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  TensorView* tv2 = makeSymbolicTensor(2);
+  TensorView* tv3 = makeSymbolicTensor(2);
+  TensorView* tv4 = sub(tv2, tv3);
+  TensorView* tv5 = add(tv1, tv4);
+  TensorView* tv6 = sub(tv5, tv0);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addInput(tv2);
+  fusion.addInput(tv3);
+  fusion.addOutput(tv6);
+  // t6 = ((t1 + (t2 - t3)) - t0)
+
+  tv4->setMemoryType(MemoryType::Global);
+  tv5->setMemoryType(MemoryType::Global);
+  tv6->setMemoryType(MemoryType::Global);
+
+  constexpr int M = 32, N = 810;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M, N}, options);
+  at::Tensor t1 = at::randn({M, N}, options);
+  at::Tensor t2 = at::randn({M, N}, options);
+  at::Tensor t3 = at::randn({M, N}, options);
+
+  at::Tensor aten_output = (t1 + (t2 - t3)) - t0;
+
+  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1, t2, t3});
+  auto cg_outputs = fe.runFusion({t0, t1, t2, t3});
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionConstCheck_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto one = IrBuilder::create<Int>(1);
+  TORCH_CHECK(one->isConstScalar());
+
+  auto one_x2 = mul(one, one);
+  TORCH_CHECK(one_x2->isConstScalar());
+
+  auto one_x3 = mul(one_x2, one);
+  TORCH_CHECK(one_x3->isConstScalar());
+
+  auto one_x4 = mul(one_x3, one);
+  TORCH_CHECK(one_x4->isConstScalar());
+}
+
+TEST_F(NVFuserTest, FusionUnrollWithAlloc_CUDA) {
+  const std::vector<int64_t> tensor_dims_in = {128, 128};
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size());
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(0));
+  TensorView* tv2 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv1);
+  fusion.addOutput(tv2);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn(tensor_dims_in, options);
+  at::Tensor cg_output = at::empty({tensor_dims_in[0]}, options);
+
+  // Schedule
+  tv2->split(1, 32);
+  tv2->split(1, 4); // unroll
+
+  auto tv2_rf = tv2->rFactor({-3, -2});
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv2_rf->axis(0)->parallelize(ParallelType::BIDx);
+  tv2_rf->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2_rf->axis(-2)->parallelize(ParallelType::Unroll);
+
+  tv1->computeAt(tv2_rf, -1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto aten_output = (input + 0).to(at::kDouble).sum(1);
+
+  testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Test isZeroInt
+TEST_F(NVFuserTest, FusionIsZeroInt_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  Int* x = IrBuilder::create<Int>(0);
+  Int* y = IrBuilder::create<Int>(1);
+  Val* z = mul(x, y);
+  TORCH_CHECK(x->isZeroInt());
+  TORCH_CHECK(!y->isZeroInt());
+  TORCH_CHECK(!z->isZeroInt());
+}
+
+// Test isOneInt
+TEST_F(NVFuserTest, FusionIsOneInt_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  Int* x = IrBuilder::create<Int>(1);
+  Int* y = IrBuilder::create<Int>(1);
+  Val* z = mul(x, y);
+  TORCH_CHECK(x->isOneInt());
+  TORCH_CHECK(y->isOneInt());
+  TORCH_CHECK(!z->isOneInt());
+}
+
+// This is to verify no cycle of computeAt is created. A more complex
+// variation of this pattern appears in one of the Python tests
+// (test_random_topo).
+TEST_F(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  // Common intermediate tensor
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  // tv1 -> tv2
+  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
+  // tv1 -> tv3 -> tv4
+  auto tv3 = add(tv1, IrBuilder::create<Double>(3));
+  auto tv4 = add(tv3, IrBuilder::create<Double>(4));
+
+  // NOTE: This should no longer occur as of PR #201.
+  // The order of adding outputs matters. If tv3 is added before tv4,
+  // it should be fine. However, if tv4 is added before tv3, there
+  // will be a cycle of tv3->tv4 and tv4->tv3. tv3->tv4 is created
+  // first, and then tv4->tv3 is created at the final phase of
+  // computeAt (ComputeAt::setupOutputs).
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv4);
+  fusion.addOutput(tv3);
+
+  tv0->computeAt(tv2, -1);
+
+  TORCH_CHECK(tv3->hasComputeAt());
+  TORCH_CHECK(!tv4->hasComputeAt());
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn(100, options);
+
+  auto t1 = aten_input + 1;
+  auto t2 = t1 + 2;
+  auto t3 = t1 + 3;
+  auto t4 = t3 + 4;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  std::vector<at::Tensor> aten_outputs = {t2, t4, t3};
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTraversalOrder1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2));
+  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
+  TensorView* tv4 = add(tv1, IrBuilder::create<Double>(4));
+
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv4);
+
+  tv1->computeAt(tv3, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({10, 10}, options);
+
+  auto t1 = aten_input + 1;
+  auto t2 = aten_input + 2;
+  auto t3 = t1 + 3;
+  auto t4 = t1 + 4;
+
+  std::vector<at::Tensor> aten_outputs = {t2, t3, t4};
+
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTraversalOrder2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
+
+  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
+  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
+
+  TensorView* tv5 = add(tv1, tv3);
+
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv4);
+  fusion.addOutput(tv5);
+
+  tv1->computeAt(tv5, -1);
+  tv3->computeAt(tv5, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({10, 10}, options);
+
+  auto t1 = aten_input + 1;
+  auto t2 = t1 + 2;
+  auto t3 = aten_input + 3;
+  auto t4 = t3 + 4;
+  auto t5 = t1 + t3;
+
+  std::vector<at::Tensor> aten_outputs = {t2, t4, t5};
+
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTraversalOrder3_CUDA) {
+  for (const auto i : c10::irange(2)) {
+    Fusion fusion;
+    FusionGuard fg(&fusion);
+
+    TensorView* tv0 = makeSymbolicTensor(1);
+    fusion.addInput(tv0);
+
+    TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+    TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
+
+    TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
+    TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
+
+    TensorView* tv5 = add(tv1, tv3);
+
+    fusion.addOutput(tv2);
+    fusion.addOutput(tv4);
+    fusion.addOutput(tv5);
+
+    const int tile = 32;
+
+    tv1->split(-1, tile);
+    tv2->split(-1, tile);
+    tv3->split(-1, tile);
+    tv4->split(-1, tile);
+    tv5->split(-1, tile);
+
+    auto compute_at_outer = tv1;
+    auto compute_at_inner = tv3;
+    if (i == 1) {
+      std::swap(compute_at_inner, compute_at_outer);
+    }
+
+    compute_at_outer->computeAt(tv5, -2);
+    compute_at_inner->computeAt(tv5, -1);
+
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::Tensor aten_input = at::randn({100}, options);
+    auto t1 = aten_input + 1;
+    auto t2 = t1 + 2;
+    auto t3 = aten_input + 3;
+    auto t4 = t3 + 4;
+    auto t5 = t1 + t3;
+
+    std::vector<at::Tensor> aten_outputs = {t2, t4, t5};
+
+    std::vector<at::Tensor> cg_outputs = {
+        at::empty_like(aten_input, options),
+        at::empty_like(aten_input, options),
+        at::empty_like(aten_input, options)};
+
+    FusionExecutor fe;
+    fe.compileFusion(&fusion, {aten_input});
+    fe.runFusion({aten_input}, cg_outputs);
+
+    testValidate(
+        &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionTraversalOrder4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // First tree
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
+  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3));
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv3);
+
+  // Second tree
+  TensorView* tv4 = makeSymbolicTensor(1);
+  fusion.addInput(tv4);
+  TensorView* tv5 = add(tv4, IrBuilder::create<Double>(5));
+  TensorView* tv6 = add(tv5, IrBuilder::create<Double>(6));
+  TensorView* tv7 = add(tv5, IrBuilder::create<Double>(7));
+  fusion.addOutput(tv6);
+  fusion.addOutput(tv7);
+
+  tv1->computeAt(tv2, -1);
+  tv5->computeAt(tv6, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({100}, options);
+  at::Tensor t4 = at::rand_like(t0, options);
+
+  auto t1 = t0 + 1;
+  auto t2 = t1 + 2;
+  auto t3 = t1 + 3;
+  auto t5 = t4 + 5;
+  auto t6 = t5 + 6;
+  auto t7 = t5 + 7;
+
+  std::vector<at::Tensor> aten_outputs = {t2, t3, t6, t7};
+  std::vector<IValue> aten_inputs = {t0, t4};
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(t0, options),
+      at::empty_like(t0, options),
+      at::empty_like(t0, options),
+      at::empty_like(t0, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  fe.runFusion(aten_inputs, cg_outputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTraversalOrder5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
+  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
+  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
+  TensorView* tv5 = add(tv2, tv4);
+
+  fusion.addOutput(tv1);
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv5);
+
+  tv2->computeAt(tv5, -1);
+  tv4->computeAt(tv5, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({100}, options);
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options),
+      at::empty_like(aten_input, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+
+  auto t1 = aten_input + 1;
+  auto t2 = t1 + 2;
+  auto t3 = aten_input + 3;
+  auto t4 = t3 + 4;
+  auto t5 = t2 + t4;
+
+  std::vector<at::Tensor> aten_outputs = {t1, t3, t5};
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTraversalOrder6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2));
+  TensorView* tv3 = add(tv1, tv2);
+  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
+
+  fusion.addOutput(tv4);
+
+  tv1->split(0, 32);
+  tv2->split(0, 32);
+  tv3->split(0, 32);
+  tv4->split(0, 32);
+
+  tv3->computeAt(tv4, -2);
+  tv1->computeAt(tv3, -1);
+  tv2->computeAt(tv3, -2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({100}, options);
+
+  auto t1 = aten_input + 1;
+  auto t2 = aten_input + 2;
+  auto t3 = t1 + t2;
+  auto aten_output = t3 + 4;
+
+  at::Tensor cg_output = at::empty_like(aten_input, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTraversalOrder7_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2));
+  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(3));
+  TensorView* tv4 = add(tv3, IrBuilder::create<Double>(4));
+  TensorView* tv5 = add(tv2, tv4);
+
+  fusion.addOutput(tv5);
+
+  TensorView* tvs[] = {tv1, tv2, tv3, tv4, tv5};
+  for (auto tv : tvs) {
+    tv->split(0, 2);
+    tv->split(0, 4);
+    tv->split(0, 8);
+  }
+
+  // computeAt into inner loop nests
+  tv1->computeAt(tv2, -1);
+  tv3->computeAt(tv4, -2);
+
+  tv2->computeAt(tv5, -4);
+  tv4->computeAt(tv5, -3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({100}, options);
+
+  auto t1 = aten_input + 1;
+  auto t2 = t1 + 2;
+  auto t3 = aten_input + 3;
+  auto t4 = t3 + 4;
+  auto aten_output = t2 + t4;
+
+  at::Tensor cg_output = at::empty_like(aten_input, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Test predication of grid reduction
+TEST_F(NVFuserTest, FusionThreadPredicate_CUDA) {
+  const int gdimx = 4;
+  const int bdimx = 128;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv0);
+  TensorView* tv2 = unaryOp(UnaryOpType::Neg, tv1);
+  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(2));
+
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv2);
+
+  tv1->split(1, bdimx);
+  tv1->split(1, gdimx);
+  tv3->split(1, bdimx);
+  tv3->split(1, gdimx);
+
+  TensorView* tv1_rf = tv1->rFactor({1});
+
+  tv1->computeAt(tv2, -1);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDy);
+  tv1_rf->axis(0)->parallelize(ParallelType::BIDy);
+  tv2->axis(0)->parallelize(ParallelType::BIDy);
+  tv1->axis(-2)->parallelize(ParallelType::BIDx);
+  tv1_rf->axis(-2)->parallelize(ParallelType::BIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(3)->parallelize(ParallelType::TIDx);
+  tv3->axis(2)->parallelize(ParallelType::BIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDy);
+
+  int numel_x = 100;
+  int numel_y = 1000;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
+
+  auto t2 = -aten_input.to(at::kDouble).sum({1});
+  auto t3 = aten_input + 2.0;
+
+  std::vector<at::Tensor> aten_outputs = {t3, t2};
+
+  std::vector<at::Tensor> cg_outputs = {
+      at::empty_like(aten_input, options), at::empty({numel_x}, options)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, cg_outputs);
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionLSTMCell_CUDA) {
+  const int hidden_features = 512;
+  const int batch_size = 64;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tvs[16];
+  for (const auto i : c10::irange(16)) {
+    tvs[i] = makeSymbolicTensor(2);
+    fusion.addInput(tvs[i]);
+  }
+
+  auto ingate = unaryOp(
+      UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3]));
+
+  auto forgetgate = unaryOp(
+      UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7]));
+
+  auto cellgate = unaryOp(
+      UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11]));
+
+  auto outgate = unaryOp(
+      UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15]));
+
+  auto cx = makeContigTensor(2);
+  fusion.addInput(cx);
+
+  auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate));
+
+  auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy));
+
+  fusion.addOutput(cy);
+  fusion.addOutput(hy);
+
+  std::vector<c10::IValue> aten_inputs;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor large_tensor0 =
+      at::randn({batch_size, hidden_features * 4}, options);
+  at::Tensor large_tensor1 =
+      at::randn({batch_size, hidden_features * 4}, options);
+  at::Tensor large_tensor2 =
+      at::randn({batch_size, hidden_features * 4}, options);
+  at::Tensor large_tensor3 =
+      at::randn({batch_size, hidden_features * 4}, options);
+
+  auto chunked0 = large_tensor0.chunk(4, 1);
+  auto chunked1 = large_tensor1.chunk(4, 1);
+  auto chunked2 = large_tensor2.chunk(4, 1);
+  auto chunked3 = large_tensor3.chunk(4, 1);
+
+  aten_inputs.insert(aten_inputs.end(), chunked0.begin(), chunked0.end());
+  aten_inputs.insert(aten_inputs.end(), chunked1.begin(), chunked1.end());
+  aten_inputs.insert(aten_inputs.end(), chunked2.begin(), chunked2.end());
+  aten_inputs.insert(aten_inputs.end(), chunked3.begin(), chunked3.end());
+
+  auto at_ingate =
+      chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid();
+  auto at_forgetgate =
+      chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid();
+  auto at_cellgate =
+      chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh();
+  auto at_outgate =
+      chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid();
+
+  auto at_cx = at::randn({batch_size, hidden_features}, options);
+  aten_inputs.push_back(at_cx);
+  auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate));
+  auto at_hy = at_outgate.mul(at_cy.tanh());
+
+  auto lparams = schedulePointwise(&fusion, aten_inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs, lparams);
+  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {at_cy, at_hy}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReductionHalf_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(3, DataType::Half);
+  fusion.addInput(tv0);
+
+  auto tv1 = castOp(DataType::Float, tv0);
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
+  auto tv3 = sum(tv2, {2});
+  auto tv4 = castOp(DataType::Half, tv3);
+
+  fusion.addOutput(tv4);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({8, 8, 16}, options);
+
+  auto reduction_tv = tv3;
+
+  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  scheduleReduction(&fusion, *reduction_params);
+
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+
+  auto lparams = reduction_params->lparams;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  // no broadcasting needed, omitting the last optional argument;
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+
+  auto aten_output = aten_input.add(1.0).to(at::kDouble).sum({2});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionReduceSingle_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeConcreteTensor({100, 1});
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1});
+  fusion.addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({100, 1}, options);
+
+  // Grab only tensor views, though there shouldn't be any other type
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  // no broadcasting needed, omitting the last optional argument;
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  auto aten_output = aten_input.to(at::kDouble).sum({1});
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) {
+  constexpr int bid_x = 80;
+  constexpr int tid_x = 4096;
+  constexpr int red_dim = 1;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = reductionOp(
+      BinaryOpType::Add, {red_dim, 2}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);
+
+  // Apply reduction heuristic
+  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  scheduleReduction(&fusion, *reduction_params);
+  auto lparams = reduction_params->lparams;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  // no broadcasting needed, omitting the last optional argument;
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+  auto aten_output = aten_input.to(at::kDouble).sum({red_dim, 2});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) {
+  constexpr int bid_x = 80;
+  constexpr int tid_x = 4096;
+  constexpr int red_dim = 1;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
+  fusion.addInput(tv0);
+
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv0);
+
+  TensorView* tv2 = reductionOp(
+      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv1);
+  fusion.addOutput(tv2);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);
+
+  // Apply reduction heuristic
+  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+
+  scheduleReduction(&fusion, *reduction_params);
+  auto lparams = reduction_params->lparams;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  // no broadcasting needed, omitting the last optional argument;
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+  auto aten_output = aten_input.to(at::kDouble).sum({1, 2});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) {
+  constexpr int bid_x = 80;
+  constexpr int tid_x = 4096;
+  constexpr int red_dim = 1;
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1});
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = reductionOp(
+      BinaryOpType::Add, {red_dim}, IrBuilder::create<Double>(0), tv0);
+
+  TensorView* tv2 =
+      reductionOp(BinaryOpType::Add, {1}, IrBuilder::create<Double>(0), tv1);
+  fusion.addOutput(tv2);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({bid_x, tid_x, 1}, options);
+
+  // Apply reduction heuristic
+  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  scheduleReduction(&fusion, *reduction_params);
+  auto lparams = reduction_params->lparams;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  // no broadcasting needed, omitting the last optional argument;
+  auto cg_outputs = fe.runFusion({aten_input}, lparams);
+  auto aten_output = aten_input.to(at::kDouble).sum({2, 1});
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {aten_input},
+      {aten_output},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionTrivialReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeConcreteTensor({10, 20, 1});
+  fusion.addInput(tv0);
+  TensorView* tv1 =
+      reductionOp(BinaryOpType::Add, {2}, IrBuilder::create<Double>(0), tv0);
+  fusion.addOutput(tv1);
+
+  TORCH_CHECK(
+      ir_utils::getReductionOps(&fusion, true /* ignore_trivial */).empty(),
+      "Trivial reduction picked up by fusion");
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({10, 20, 1}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+  auto aten_output = aten_input.to(at::kDouble).sum({2});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTrivialReduction2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int w = 1, x = 1, y = 7, z = 8;
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeConcreteTensor({w, x, y, z});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = sum(tv1, {0});
+  auto tv3 = sum(tv2, {0});
+  auto tv4 = add(tv3, tv0);
+
+  fusion.addOutput(tv4);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({y, z}, options);
+  at::Tensor t1 = at::randn({w, x, y, z}, options);
+  auto aten_output = t1.to(at::kDouble).sum({0}).sum({0}).add(t0);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  auto lparams = schedulePointwise(&fusion, aten_inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs, lparams);
+  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTrivialReduction3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int v = 1, w = 1, x = 1, y = 7, z = 8;
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeConcreteTensor({v, w, x, y, z});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = sum(tv1, {0, 1, 2});
+  auto tv3 = add(tv2, tv0);
+
+  fusion.addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({y, z}, options);
+  at::Tensor t1 = at::randn({v, w, x, y, z}, options);
+  auto aten_output = t1.sum({0, 1, 2}).add(t0);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  auto lparams = schedulePointwise(&fusion, aten_inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs, lparams);
+  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+// Make sure trivial reductions are correctly detected even with
+// scheduling applied.
+TEST_F(NVFuserTest, FusionDetectTrivialReduction1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {false, true});
+  auto tv2 = sum(tv1, {1});
+  fusion.addOutput(tv2);
+
+  tv2->split(1, 4);
+  tv2->split(1, 8);
+  auto tv3 = tv2->rFactor({-1});
+  auto tv4 = tv2->rFactor({-1});
+
+  auto tv5 = broadcast(tv0, {true, false});
+  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
+  auto tv7 = sub(tv6, IrBuilder::create<Double>(1));
+  auto tv8 = sum(tv7, {0});
+  fusion.addOutput(tv8);
+
+  auto tv9 = broadcast(tv0, {false, true, true});
+  auto tv10 = sum(tv9, {1});
+  auto tv11 = sum(tv10, {1});
+  fusion.addOutput(tv11);
+
+  tv8->split(0, 3);
+  tv10->split(1, 4);
+  tv11->split(1, 5);
+
+  tv0->computeAt(tv2, -1);
+  tv0->computeAt(tv8, -1);
+  tv0->computeAt(tv11, 1);
+
+  // Test indexing to gmem-backed tensors
+  tv3->setMemoryType(MemoryType::Global);
+  tv8->setMemoryType(MemoryType::Global);
+
+  GpuLower gpulw(&fusion);
+
+  // No ReductionOp should be generated as all the reduction
+  // exprs should be replaced with a unary set op.
+  for (const auto expr : gpulw.kernel()->as<Fusion>()->exprs()) {
+    TORCH_CHECK(!expr->isA<ReductionOp>());
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({100}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {t0, t0, t0}, __LINE__, __FILE__);
+}
+
+// Test detection of partially trivial reduction
+TEST_F(NVFuserTest, FusionDetectTrivialReduction2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  tv1->split(1, 1);
+  // tv1->axis(1): non-trivial
+  // tv1->axis(2): trivial
+
+  auto tv3 = tv1->rFactor({-1});
+
+  // Just to suppress register-allocation warning
+  tv0->computeAt(tv2, 1);
+  tv3->computeAt(tv1, -1);
+
+  GpuLower gpulw(&fusion);
+
+  // tv3's reduction axis is a trivial reduction. The only
+  // ReductionOp should be for tv1.
+  for (const auto expr : gpulw.kernel()->as<Fusion>()->exprs()) {
+    if (expr->isA<ReductionOp>()) {
+      auto reduction_out =
+          expr->as<ReductionOp>()->outputs()[0]->as<TensorView>();
+      TORCH_CHECK(reduction_out->name() == 1);
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionInputsIdLookup_CUDA) {
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({16, 8, 8}, options);
+  at::Tensor t1 = at::randn({8, 8}, options);
+  at::Tensor t2 = at::randn({6, 4}, options);
+
+  // create a cache with max size 2;
+  torch::jit::fuser::cuda::InputsIdLookup inputs_id_lookup(2);
+
+  // testing basic function, same encoding for identical inputs
+  auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0});
+  auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5});
+  TORCH_CHECK(id_0.id == id_0_lookup.id);
+  TORCH_CHECK(inputs_id_lookup.size() == 1);
+  TORCH_CHECK(id_0.eviction == false);
+
+  // new input (even tho same shape, but we have different signature because of
+  // missing scalar input
+  auto id_1 = inputs_id_lookup.lookupId({t0, t1});
+  auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1});
+  TORCH_CHECK(id_1.id == id_1_lookup.id);
+  TORCH_CHECK(inputs_id_lookup.size() == 2);
+  TORCH_CHECK(id_1.eviction == false);
+
+  // eviction should happen at this point
+  auto id_2 = inputs_id_lookup.lookupId({t2, t1});
+  TORCH_CHECK(id_2.id != id_0.id);
+  TORCH_CHECK(id_2.id != id_1.id);
+  TORCH_CHECK(inputs_id_lookup.size() == 2);
+  TORCH_CHECK(id_2.eviction == true);
+  TORCH_CHECK(id_2.evict_id == id_0.id);
+
+  // look at input 1 again
+  auto id_1_relook = inputs_id_lookup.lookupId({t0, t1});
+  TORCH_CHECK(id_1_relook.id == id_1.id);
+  TORCH_CHECK(id_1_relook.eviction == false);
+}
+
+TEST_F(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) {
+  std::vector<int64_t> sizes_vec({16, 8, 8});
+  std::vector<int64_t> strides_vec({64, 8, 1});
+  auto tensor_type = TensorType::create(
+      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  // pass with identical shape
+  auto t0 = at::randn({16, 8, 8}, options);
+  TORCH_CHECK(complyWith(t0, tensor_type));
+
+  // pass with dynamic shape
+  auto t1 = at::randn({16, 16, 8}, options);
+  TORCH_CHECK(complyWith(t1, tensor_type));
+
+  // broadcasting semantic change failure
+  auto t2 = at::randn({16, 1, 8}, options);
+  TORCH_CHECK(!complyWith(t2, tensor_type));
+
+  // contiguity failure via slicing
+  auto t3 = t0.slice(1, 0, 8, 2);
+  TORCH_CHECK(!complyWith(t3, tensor_type));
+
+  // contiguity failure via slicing
+  auto t4 = t0.slice(2, 0, 8, 2);
+  TORCH_CHECK(!complyWith(t4, tensor_type));
+
+  // rank failure
+  auto t5 = at::randn({16, 8, 8, 8}, options);
+  TORCH_CHECK(!complyWith(t5, tensor_type));
+
+  // contiguity on stride 1 dimension with implicit broadcasting
+  auto t = at::randn({4}, options);
+  auto t6 = t.unsqueeze(1).expand({4, 8});
+  TORCH_CHECK(complyWith(t6, TensorType::create(t6)));
+}
+
+TEST_F(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) {
+  std::vector<int64_t> sizes_vec({16, 1, 8});
+  std::vector<int64_t> strides_vec({8, 8, 1});
+  auto tensor_type = TensorType::create(
+      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  // broadcasting semantic change
+  auto t0 = at::randn({16, 8, 8}, options);
+  TORCH_CHECK(!complyWith(t0, tensor_type));
+
+  // dtype failure
+  auto t1 = at::randn({16, 1, 8}, options.dtype(at::kHalf));
+  TORCH_CHECK(!complyWith(t1, tensor_type));
+
+  // dtype failure
+  auto t2 = at::randn({16, 1, 8}, options);
+  TORCH_CHECK(complyWith(t2, tensor_type));
+
+  // device inconsistency shouldn't fail
+  auto t3 = at::randn({16, 1, 8}, options.device(at::kCPU, 0));
+  TORCH_CHECK(complyWith(t3, tensor_type));
+}
+
+TEST_F(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) {
+  std::vector<int64_t> sizes_vec({16, 8, 8});
+  std::vector<int64_t> strides_vec({64, 1, 8});
+  auto tensor_type = TensorType::create(
+      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  // failing permutation
+  auto t0 = at::randn({16, 8, 8}, options);
+  TORCH_CHECK(!complyWith(t0, tensor_type));
+
+  // passing with dynamic shape
+  auto t1 = t0.permute({0, 2, 1});
+  TORCH_CHECK(complyWith(t1, tensor_type));
+}
+
+TEST_F(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) {
+  std::vector<int64_t> sizes_vec({16, 8, 8});
+  std::vector<int64_t> strides_vec({128, 16, 1});
+  auto tensor_type = TensorType::create(
+      at::kFloat, c10::nullopt, sizes_vec, strides_vec, c10::nullopt);
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  // contiguity check passes although it differs
+  auto t0 = at::randn({16, 16, 8}, options);
+  TORCH_CHECK(complyWith(t0, tensor_type));
+
+  // passing with dynamic shape
+  auto t1 = t0.slice(1, 0, 16, 2);
+  TORCH_CHECK(complyWith(t1, tensor_type));
+}
+
+TEST_F(NVFuserTest, FusionDisjointSet_CUDA) {
+  DisjointSets<int> set;
+
+  const std::set<int> group_x({0, 1, 2});
+  const std::set<int> group_y({3, 4, 5});
+  const std::set<int> group_z({6, 7, 8});
+  const std::vector<std::set<int>> groups({group_x, group_y, group_z});
+  std::set<int> group_all;
+  std::for_each(groups.begin(), groups.end(), [&](const auto& g) {
+    group_all.insert(g.begin(), g.end());
+  });
+
+  // Initially, nothing should be considered equivalent
+  for (auto i : group_all) {
+    for (auto j : group_all) {
+      TORCH_CHECK(!set.permissiveAreMapped(i, j));
+    }
+  }
+
+  // Sets values in group_x are equivalent
+  for (auto i : group_x) {
+    for (auto j : group_x) {
+      set.mapEntries(i, j);
+      TORCH_CHECK(set.mappingExists(i));
+      TORCH_CHECK(set.mappingExists(j));
+    }
+  }
+
+  // All values in group_x shoudl be equivalent with each other
+  for (auto i : group_x) {
+    for (auto j : group_x) {
+      TORCH_CHECK(set.permissiveAreMapped(i, j));
+    }
+  }
+  // But nothing else should be equivalent
+  for (auto i : group_all) {
+    for (auto j : group_y) {
+      TORCH_CHECK(!set.permissiveAreMapped(i, j));
+    }
+    for (auto j : group_z) {
+      TORCH_CHECK(!set.permissiveAreMapped(i, j));
+    }
+  }
+
+  // Sets values in group_y are equivalent
+  for (auto i : group_y) {
+    for (auto j : group_y) {
+      set.mapEntries(i, j);
+      TORCH_CHECK(set.mappingExists(i));
+      TORCH_CHECK(set.mappingExists(j));
+    }
+  }
+
+  // group_x should be still equivalent
+  for (auto i : group_x) {
+    for (auto j : group_x) {
+      TORCH_CHECK(set.permissiveAreMapped(i, j));
+    }
+  }
+  // group_y should be now equivalent
+  for (auto i : group_y) {
+    for (auto j : group_y) {
+      TORCH_CHECK(set.permissiveAreMapped(i, j));
+    }
+  }
+  // But group_z should not be equivalent with anything yet
+  for (auto i : group_all) {
+    for (auto j : group_z) {
+      TORCH_CHECK(!set.permissiveAreMapped(i, j));
+    }
+  }
+
+  // Sets values in group_z are equivalent
+  for (auto i : group_z) {
+    for (auto j : group_z) {
+      set.mapEntries(i, j);
+      TORCH_CHECK(set.mappingExists(i));
+      TORCH_CHECK(set.mappingExists(j));
+    }
+  }
+
+  // Now each of the three groups should be equivalent within each
+  // group
+  for (const auto gi : c10::irange(groups.size())) {
+    for (const auto gj : c10::irange(groups.size())) {
+      for (auto i : groups[gi]) {
+        for (auto j : groups[gj]) {
+          TORCH_CHECK(
+              (gi == gj && set.permissiveAreMapped(i, j)) ||
+              (gi != gj && !set.permissiveAreMapped(i, j)));
+        }
+      }
+    }
+  }
+
+  std::vector<int> all_elements = set.getAllElements().vector();
+  std::sort(all_elements.begin(), all_elements.end());
+  std::vector<int> group_all_vec(group_all.begin(), group_all.end());
+  std::sort(group_all_vec.begin(), group_all_vec.end());
+  TORCH_CHECK(all_elements == group_all_vec);
+
+  set.clear();
+  TORCH_CHECK(set.getAllElements().vector().size() == 0);
+
+  // All cleared. Nothing should be considered equivalent.
+  for (auto i : group_all) {
+    for (auto j : group_all) {
+      TORCH_CHECK(!set.permissiveAreMapped(i, j));
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  auto tv1 = makeSymbolicTensor(2);
+  auto tv2 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addInput(tv2);
+
+  auto tv3 = broadcast(tv0, {true, false});
+  auto tv4 = add(tv3, tv1);
+  auto tv5 = add(tv3, tv2);
+
+  fusion.addOutput(tv4);
+  fusion.addOutput(tv5);
+
+  // In order to do this, tv1->axis(1) and tv2->axis(1) must have the
+  // same size, but we can't prove it, so this should throw an error.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(tv3->computeAt(tv4, -1));
+}
+
+TEST_F(NVFuserTest, FusionBiasGeluFwd_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const float k_079 = 0.79788456;
+  const float k_004 = 0.044715;
+
+  // bias vector
+  auto t0 = makeSymbolicTensor(1, DataType::Half);
+  fusion.addInput(t0);
+  auto t1 = castOp(DataType::Float, t0);
+  // input tensor
+  auto t2 = makeSymbolicTensor(3, DataType::Half);
+  fusion.addInput(t2);
+  auto t3 = castOp(DataType::Float, t2);
+  auto t4 = broadcast(t1, {true, true, false});
+  auto t5 = add(t4, t3);
+  auto t6 = mul(t5, IrBuilder::create<Double>(0.5));
+  auto t7 = mul(t5, IrBuilder::create<Double>(k_079));
+  auto t8 = mul(t5, IrBuilder::create<Double>(k_004));
+  auto t9 = mul(t8, t5);
+  auto t10 = add(t9, IrBuilder::create<Int>(1));
+  auto t11 = mul(t7, t10);
+  auto t12 = unaryOp(UnaryOpType::Tanh, t11);
+  auto t13 = add(t12, IrBuilder::create<Double>(1));
+  auto t14 = mul(t6, t13);
+  auto t15 = castOp(DataType::Half, t14);
+  fusion.addOutput(t15);
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  std::vector<int64_t> input_shape{6, 512, 4096};
+  std::vector<int64_t> bias_shape{4096};
+
+  auto at_input = at::randn(input_shape, options);
+  auto at_bias = at::randn(bias_shape, options);
+
+  auto at_x =
+      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
+  auto aten_output_float =
+      at_x * 0.5 * (1.0 + (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh());
+  auto aten_output = aten_output_float.to(c10::ScalarType::Half);
+
+  std::vector<IValue> aten_inputs = {at_bias, at_input};
+  auto lparams = schedulePointwise(&fusion, aten_inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs, lparams);
+  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBiasGeluBwd_CUDA) {
+  if (at::cuda::getDeviceProperties(0)->major < 6) {
+    return;
+  }
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const float k_079 = 0.79788456;
+  const float k_004 = 0.044715;
+  const float k_010 = 0.1070322243;
+
+  // gradient tensor
+  auto t0 = makeSymbolicTensor(3, DataType::Half);
+  fusion.addInput(t0);
+  auto t1 = castOp(DataType::Float, t0);
+  // bias tensor
+  auto t2 = makeSymbolicTensor(1, DataType::Half);
+  fusion.addInput(t2);
+  auto t3 = castOp(DataType::Float, t2);
+  // input tensor
+  auto t4 = makeSymbolicTensor(3, DataType::Half);
+  fusion.addInput(t4);
+  auto t5 = castOp(DataType::Float, t4);
+  auto t6 = broadcast(t3, {true, true, false});
+  auto t7 = add(t6, t5);
+  auto t8 = mul(t7, IrBuilder::create<Double>(k_079));
+  auto t9 = mul(t7, IrBuilder::create<Double>(k_004));
+  auto t10 = mul(t9, t7);
+  auto t11 = add(t10, IrBuilder::create<Int>(1));
+  auto t12 = mul(t8, t11);
+  auto t13 = unaryOp(UnaryOpType::Tanh, t12);
+  auto t14 = mul(t7, IrBuilder::create<Double>(0.5));
+  auto t15 = mul(t13, t13);
+  auto t16 = unaryOp(UnaryOpType::Neg, t15);
+  auto t17 = add(t16, IrBuilder::create<Int>(1));
+  auto t18 = mul(t7, IrBuilder::create<Double>(k_010));
+  auto t19 = mul(t18, t7);
+  auto t20 = add(t19, IrBuilder::create<Double>(k_079));
+  auto t21 = mul(t17, t20);
+  auto t22 = mul(t14, t21);
+  auto t23 = add(t13, IrBuilder::create<Int>(1));
+  auto t24 = mul(t23, IrBuilder::create<Double>(0.5));
+  auto t25 = add(t22, t24);
+  auto t26 = mul(t25, t1);
+  // Save float output for validation
+  fusion.addOutput(t26);
+  auto t27 = castOp(DataType::Half, t26);
+  fusion.addOutput(t27);
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::manual_seed(1);
+  std::vector<int64_t> input_shape{6, 512, 4096};
+  std::vector<int64_t> bias_shape{4096};
+  auto at_input = at::randn(input_shape, options);
+  auto at_bias = at::randn(bias_shape, options);
+  auto at_grad = at::randn(input_shape, options);
+
+  auto at_x =
+      at_bias.to(c10::ScalarType::Float) + at_input.to(c10::ScalarType::Float);
+  auto at_tanh_out = (k_079 * at_x * (1 + k_004 * at_x * at_x)).tanh();
+  auto at_ff = 0.5 * at_x *
+          ((1 - at_tanh_out * at_tanh_out) * (k_079 + k_010 * at_x * at_x)) +
+      0.5 * (1 + at_tanh_out);
+  auto at_out = at_ff * at_grad;
+  auto at_out_half = at_out.to(c10::ScalarType::Half);
+
+  std::vector<IValue> aten_inputs = {at_grad, at_bias, at_input};
+  std::vector<at::Tensor> aten_outputs = {at_out, at_out_half};
+
+  auto lparams = schedulePointwise(&fusion, aten_inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs, lparams);
+  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+// Reproducer of issue #459
+TEST_F(NVFuserTest, FusionIssue459_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv3 = broadcast(tv2, {true, false});
+  auto tv4 = add(tv1, tv3);
+
+  // Create two outputs from the final arithmetic result
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+  auto tv6 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv6);
+
+  // Scheduling
+  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
+    output->merge(-2, -1);
+  }
+  for (auto output : ir_utils::filterByType<TensorView>(fusion.outputs())) {
+    output->split(0, 128);
+  }
+
+  tv0->computeAt(tv5, -1);
+
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+  tv6->axis(1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  const int numel_x = 10;
+  const int numel_y = 20;
+  auto t0 = at::randn({numel_x}, options);
+  auto t1 = at::randn({numel_y, numel_x}, options);
+  auto aten_output = (t0 + 1).unsqueeze(0) + t1 + 1;
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      aten_inputs,
+      {aten_output, aten_output},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSmemIndexingSimple_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv3);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv0->computeAt(tv3, -1);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv2->setMemoryType(MemoryType::Global);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  auto aten_input = at::randn({12, 34}, options);
+  at::Tensor aten_output = aten_input + 1.0 + 1.0 + 1.0;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSmemIndexing_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Symbolic integers we will use for runtime tiling
+  Int* symbolic_m_tile_dim = IrBuilder::create<Int>();
+  Int* symbolic_split_k_tile_dim = IrBuilder::create<Int>();
+  Int* symbolic_block_k_tile_dim = IrBuilder::create<Int>();
+  // Compile-time integer for tiling
+  int n_smem_tile = 32;
+
+  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  // Broadcast tv0 to [M, K, *]
+  TensorView* tv2 = broadcast(tv0, {false, false, true});
+  // Broadcast tv1 to [*, K, N]
+  TensorView* tv3 = broadcast(tv1, {true, false, false});
+
+  // Pointwise multiplication resulting in tv3[M, K, N]
+  TensorView* tv4 = mul(tv2, tv3);
+
+  // Sum the K-dim
+  TensorView* tv5 = sum(tv4, {1});
+
+  // Register inputs and outputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv5);
+
+  // Register runtime tile dims as inputs
+  fusion.addInput(symbolic_m_tile_dim);
+  fusion.addInput(symbolic_split_k_tile_dim);
+  fusion.addInput(symbolic_block_k_tile_dim);
+
+  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
+  // dims are inserted
+  // [M, rK, N]
+  tv5->split(2, n_smem_tile);
+  // [M, rK, No, Ni{32}]
+  tv5->split(1, symbolic_block_k_tile_dim);
+  // [M, rKo, rKi{i2}, No, Ni{32}]
+  tv5->split(1, symbolic_split_k_tile_dim);
+  // [M, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
+  tv5->split(0, symbolic_m_tile_dim);
+  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2}, No, Ni{32}]
+
+  // Reorder so all outer tiles are in the leftmost 3 positions
+  // [Mo, Mi{i0}, rKoo, rKoi{i1}, rKi{i2},     No, Ni{32}]
+  // [Mo,     No, rKoo, rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
+  tv5->reorder({{1, 5}, {5, 1}});
+
+  // Factor out the outer reduction IterDomain, then run the inter-cta
+  // reduction, and intra-cta reduction
+  // [Mo, No, rKoo,  Koi{i1},  Ki{i2}, Mi{i0}, Ni{32}]
+  // [Mo, No,       rKoi{i1}, rKi{i2}, Mi{i0}, Ni{32}]
+  auto tv6 = tv5->rFactor({2});
+
+  // Scope computations
+  tv6->computeAt(tv5, 2);
+
+  // [Mo, No, rKoo, Koi{i1},  Ki{i2}, Mi{i0}, Ni{32}]
+  // [Mo, No, Ki{i2}, Mi{i0}, Ni{32}, rKoo, Koi{i1}]
+  tv6->reorder({
+      {5, -2},
+      {6, -1},
+      {2, 2},
+      {3, 3},
+      {4, 4},
+  });
+
+  // Setup compute at schedule
+  tv0->computeAt(tv6, 3);
+  tv1->computeAt(tv6, 3);
+  tv4->computeAt(tv6, -1);
+
+  // Cache smem tiles
+  tv2->setMemoryType(MemoryType::Shared);
+  tv3->setMemoryType(MemoryType::Shared);
+  tv4->setMemoryType(MemoryType::Shared);
+  tv6->setMemoryType(MemoryType::Shared);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDz);
+  tv5->axis(1)->parallelize(ParallelType::BIDy);
+
+  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6};
+  for (auto tv : tv_list) {
+    tv->axis(-2)->parallelize(ParallelType::TIDz);
+    tv->axis(-1)->parallelize(ParallelType::TIDy);
+  }
+
+  constexpr int M = 31, K = 65, N = 32;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+
+  at::Tensor aten_output =
+      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
+
+  // A, B, m_tile_dim, split_k, intra_cta_tile
+  std::vector<IValue> aten_inputs = {t0, t1, 3, 4, 5};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+// Reproducer of issue 408
+TEST_F(NVFuserTest, FusionCacheBeforeReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {1});
+  fusion.addOutput(tv2);
+
+  tv2->split(0, 4);
+
+  auto tv3 = tv2->cacheBefore();
+
+  tv0->computeAt(tv3, -1);
+  tv3->computeAt(tv2, -1);
+
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  const int numel_x = 100;
+  const int numel_y = 200;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({numel_x, numel_y}, options);
+  at::Tensor cg_output = at::empty({numel_x}, options);
+
+  auto aten_output = (aten_input + 1).to(at::kDouble).sum({1});
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  fe.runFusion({aten_input}, {cg_output});
+
+  testValidate(
+      &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionCacheBeforeReduction2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(3);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {1});
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv3);
+
+  auto tv4 = tv2->cacheBefore();
+
+  tv4->computeAt(tv3, 1);
+  tv0->computeAt(tv4, -1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+
+  const int numel_x = 10;
+  const int numel_y = 20;
+  const int numel_z = 30;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({numel_x, numel_y, numel_z}, options);
+  auto t2 = (aten_input + 1).to(at::kDouble).sum({1});
+  auto t3 = t2 + 1;
+  std::vector<at::Tensor> aten_outputs = {t2, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue367_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Symbolic integers we will use for runtime tiling
+  Int* symbolic_m_tile_dim = IrBuilder::create<Int>();
+  Int* symbolic_split_k_tile_dim = IrBuilder::create<Int>();
+  Int* symbolic_block_k_tile_dim = IrBuilder::create<Int>();
+  // Compile-time integer for tiling
+  int n_smem_tile = 32;
+
+  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  // Broadcast tv0 to [M, K, *]
+  TensorView* tv2 = broadcast(tv0, {false, false, true});
+  // Broadcast tv1 to [*, K, N]
+  TensorView* tv3 = broadcast(tv1, {true, false, false});
+
+  // Pointwise multiplication resulting in tv3[M, K, N]
+  TensorView* tv4 = mul(tv2, tv3);
+
+  // Sum the K-dim
+  TensorView* tv5 = sum(tv4, {1});
+
+  // Register inputs and outputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv5);
+
+  // Register runtime tile dims as inputs
+  fusion.addInput(symbolic_m_tile_dim);
+  fusion.addInput(symbolic_split_k_tile_dim);
+  fusion.addInput(symbolic_block_k_tile_dim);
+
+  // Make a 3D tile, mix of symbolic and constant, do in reverse order because
+  // dims are inserted
+  // [M, K, N]
+  tv5->split(2, n_smem_tile);
+  tv5->split(1, symbolic_block_k_tile_dim);
+  tv5->split(1, symbolic_split_k_tile_dim);
+  tv5->split(0, symbolic_m_tile_dim);
+  // [Mo, Mi, Koo, Koi, Ki, No, Ni]
+  tv5->reorder({{1, 5}, {5, 1}});
+  // [Mo, No, Koo, Koi, Ki, Mi, Ni]
+
+  auto tv6 = tv5->rFactor({2});
+  auto tv7 = tv5->rFactor({2});
+  // [Mo, No, rKoo,  Koi,  Ki, Mi, Ni]
+  // [Mo, No,       rKoi, rKi, Mi, Ni]
+
+  // Scope computations
+  tv6->computeAt(tv5, 2);
+
+  tv0->computeAt(tv6, 3);
+  tv1->computeAt(tv6, 3);
+  tv4->computeAt(tv6, -1);
+
+  // Cache smem tiles
+  tv2->setMemoryType(MemoryType::Shared);
+  tv3->setMemoryType(MemoryType::Shared);
+  tv4->setMemoryType(MemoryType::Local);
+  tv6->setMemoryType(MemoryType::Local);
+  tv7->setMemoryType(MemoryType::Local);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDz);
+  tv5->axis(1)->parallelize(ParallelType::BIDy);
+
+  std::vector<TensorView*> tv_list = {tv2, tv3, tv4, tv5, tv6, tv7};
+  for (auto tv : tv_list) {
+    tv->axis(-2)->parallelize(ParallelType::TIDz);
+    tv->axis(-1)->parallelize(ParallelType::TIDy);
+  }
+  tv2->axis(3)->parallelize(ParallelType::TIDx);
+  tv3->axis(3)->parallelize(ParallelType::TIDx);
+  tv4->axis(3)->parallelize(ParallelType::TIDx);
+  tv6->axis(3)->parallelize(ParallelType::TIDx);
+  tv7->axis(2)->parallelize(ParallelType::TIDx);
+
+  tv2->axis(4)->parallelize(ParallelType::BIDx);
+  tv3->axis(4)->parallelize(ParallelType::BIDx);
+  tv4->axis(4)->parallelize(ParallelType::BIDx);
+  tv6->axis(4)->parallelize(ParallelType::BIDx);
+  tv7->axis(3)->parallelize(ParallelType::BIDx);
+  tv5->axis(2)->parallelize(ParallelType::BIDx);
+
+  constexpr int M = 3, K = 6, N = 16;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+
+  // A, B, m, split_k, block_k
+  std::vector<IValue> aten_inputs = {t0, t1, 2, 2, 3};
+  at::Tensor aten_output =
+      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue468_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = sum(tv1, {0});
+  fusion.addOutput(tv2);
+
+  tv1->axis(0)->parallelize(ParallelType::TIDy);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDy);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({10, 100}, options);
+  at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}).sum({0});
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue363_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Symbolic 2D tensors TV0[M, K], TV1[K, N]
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(2);
+
+  // Broadcast tv0 to [M, K, *]
+  TensorView* tv2 = broadcast(tv0, {false, false, true});
+  // Broadcast tv1 to [*, K, N]
+  TensorView* tv3 = broadcast(tv1, {true, false, false});
+
+  // Pointwise multiplication resulting in tv3[M, K, N]
+  TensorView* tv4 = mul(tv2, tv3);
+
+  // Sum the K-dim
+  TensorView* tv5 = sum(tv4, {1});
+
+  // Register inputs and outputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv5);
+
+  tv2->setMemoryType(MemoryType::Global);
+  tv3->setMemoryType(MemoryType::Global);
+  tv4->setMemoryType(MemoryType::Global);
+
+  tv0->computeAt(tv5, -1);
+  tv1->computeAt(tv5, -1);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDz);
+  tv5->axis(1)->parallelize(ParallelType::BIDy);
+
+  tv5->axis(2)->parallelize(ParallelType::BIDx);
+
+  constexpr int M = 3, K = 6, N = 16;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+  at::Tensor aten_output =
+      mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue484_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = add(tv1, IrBuilder::create<Double>(0));
+  fusion.addOutput(tv2);
+
+  tv1->setMemoryType(MemoryType::Global);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 100;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({M, M}, options);
+  at::Tensor aten_output = aten_input.to(at::kDouble).sum({1});
+
+  torch::jit::fuser::cuda::FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue329_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {1});
+  fusion.addOutput(tv2);
+  auto tv3 = sum(tv1, {1});
+  fusion.addOutput(tv3);
+
+  tv1->computeAt(tv2, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  std::vector<int64_t> t0_shape{17, 19};
+  auto aten_input = at::randn(t0_shape, options);
+  auto t2 = (aten_input + 1).to(at::kDouble).sum({1});
+  auto t3 = (aten_input + 1).to(at::kDouble).sum({1});
+  std::vector<at::Tensor> aten_outputs = {t2, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue382_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = broadcast(tv1, {false, false, true});
+  auto tv3 = makeSymbolicTensor(3);
+  fusion.addInput(tv3);
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  tv2->merge(1);
+  tv4->merge(1);
+
+  tv1->computeAt(tv4, 1);
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv1->setMemoryType(MemoryType::Global);
+  tv2->setMemoryType(MemoryType::Global);
+
+  const int numel_x = 12;
+  const int numel_y = 34;
+  const int numel_z = 56;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({numel_x, numel_y}, options);
+  auto t3 = at::randn({numel_x, numel_y, numel_z}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t3};
+  auto aten_output = (t0 + 1).unsqueeze(-1) + t3;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue507_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  std::vector<int64_t> t0_shape{17, 19};
+  auto aten_input = at::randn(t0_shape, options);
+  auto t1 = (aten_input + 1);
+  auto aten_output = (t1 + 1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue532_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Algorithm
+  TensorView* tv0 = makeSymbolicTensor(1);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addInput(tv0);
+  fusion.addOutput(tv2);
+
+  const int M_BLOCK = 64;
+  const int M_THREAD = 4;
+
+  tv2->split(0, M_BLOCK);
+  // tv2: [M/M_BLOCK, M_BLOCK]
+  tv1->computeAt(tv2, 1);
+  // tv1: [M/M_BLOCK, M_BLOCK]
+
+  tv1->split(-1, M_BLOCK / M_THREAD);
+  // tv1: [M/M_BLOCK, M_THREAD, M_BLOCK / M_THREAD]
+
+  tv2->split(-1, M_THREAD);
+  // tv2: [M/M_BLOCK, M_BLOCK / M_THREAD, M_THREAD]
+
+  constexpr int M = 1000;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  at::Tensor aten_output = t0 + 1 + 1;
+
+  testValidate(
+      &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionLoopUnswitch_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Algorithm
+  TensorView* tv0 = makeSymbolicTensor(1);
+  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1));
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addInput(tv0);
+  fusion.addOutput(tv2);
+
+  tv2->split(0, 32);
+  tv1->computeAt(tv2, -1);
+
+  tv2->axis(1)->parallelize(ParallelType::Unswitch);
+
+  constexpr int M = 1000;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  at::Tensor aten_output = t0 + 1 + 1;
+
+  testValidate(
+      &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue549_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2); // M, K
+  TensorView* tv1 = makeSymbolicTensor(2); // K, N
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+
+  TensorView* tv3 = broadcast(tv2, {false, false, true});
+  // tv3[I0, I1, B] = tv0[I0, I1]
+
+  TensorView* tv4 = broadcast(tv1, {true, false, false});
+  // tv4[B, I1, I2] = tv1[I1, I2]
+
+  // tv5[I0, I1, I2] = tv3[I0, I1, B] * tv4[B, I1, I2]
+  TensorView* tv5 = mul(tv3, tv4);
+  // tv6[I0, R1, I2] = tv5[I0, I1, I2]
+  TensorView* tv6 = sum(tv5, {1});
+  fusion.addOutput(tv6);
+
+  tv6->split(1, 32);
+  // tv6[I0, R1o, R1i{32}, I2]
+
+  auto tv7 = tv6->rFactor({1});
+  // tv7[I0, R1o, I1i{32}, I2] = tv5[I0, I1, I2]
+  // tv6[I0,    , R1i{32}, I2] = tv7[I0, R1o, I1i{32}, I2]
+
+  tv6->split(0, 4);
+  tv6->split(-1, 4);
+  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
+  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
+
+  tv0->computeAt(tv6, -1);
+  tv1->computeAt(tv6, -1);
+
+  // tv7[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
+  // tv6[I0o, I0i{4},    , R1i{32}, I2o, I2i{4}]
+  //--> (line symbolizes compute at location)
+  // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
+  // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
+  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
+
+  tv0->computeAt(tv7, -1);
+  tv1->computeAt(tv7, -1);
+  // tv5[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
+  // tv7[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
+  // tv6[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
+
+  tv6->axis(0)->parallelize(ParallelType::BIDz);
+  tv6->axis(1)->parallelize(ParallelType::TIDz);
+
+  tv6->axis(-2)->parallelize(ParallelType::BIDy);
+  tv6->axis(-1)->parallelize(ParallelType::TIDy);
+
+  tv6->axis(2)->parallelize(ParallelType::TIDx);
+  tv7->axis(2)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 65, K = 33, N = 17;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+
+  // Lets specify a few bounds in launch params to make sure it works
+  LaunchParams lparams(1, -1, -1, 32, 4, 4);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1}, lparams);
+  fe.runFusion({t0, t1}, lparams);
+
+  // Make sure bad launch params throws
+  // TODO: Re-enable once we have parallelization validation in.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  // ASSERT_ANY_THROW(fe.runFusion({t0, t1}, LaunchParams(1, 2, 3, 4, 5, 6)));
+
+  // Don't specify any launch params
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto aten_output = (t0 + 1).to(at::kDouble).matmul(t1.to(at::kDouble));
+
+  testValidate(
+      &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSimpleCompileRtc_CUDA) {
+  FusionExecutor fe;
+  std::string kernel = R"(
+__global__ void kernel1(Tensor<float, 1> T0, Tensor<float, 1> T1) {
+  if(threadIdx.x==0){
+    for(size_t ki28 = 0; ki28 < T0.size[0]; ++ki28) {
+      T1[ki28*T1.stride[0]] = T0[ki28*T0.stride[0]]*2;
+    }
+  }
+}
+    )";
+  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
+  LaunchParams lp(
+      256, // gdimx
+      1, // gdimy
+      1, // gdimz
+      1, // bdimx
+      1, // bdimy
+      1 // bdimz
+  );
+  lp.setSmem(0);
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const std::vector<int64_t> tensor_dims = {8};
+  auto in0 = at::randn(tensor_dims, options);
+  auto out0 = at::empty_like(in0);
+  fe.runRtc(lp, {in0, out0});
+
+  auto out_ref = in0 * 2;
+  TORCH_CHECK(out_ref.allclose(out0));
+}
+
+TEST_F(NVFuserTest, FusionSerialWelford_CUDA) {
+  FusionExecutor fe;
+  int x = 128, y = 64, z = 64;
+
+  std::string kernel = R"(
+__global__ void kernel1(
+    Tensor<float,3> inp,
+    Tensor<float,1> out_var,
+    Tensor<float,1> out_avg
+){
+    for(int i0=0;i0<inp.size[0];i0++){
+        float tmp_M2=0;
+        float tmp_avg=0;
+        long tmp_N=0;
+        for(int i1=0;i1<inp.size[1];i1++){
+            for(int i2=0;i2<inp.size[2];i2++){
+                welfordCombine(
+                    tmp_avg,
+                    tmp_M2,
+                    tmp_N,
+                    inp[i0*inp.stride[0]+
+                        i1*inp.stride[1]+
+                        i2*inp.stride[2]],
+                    0.f,
+                    (long)1
+                );
+            }
+        }
+        out_var[i0*out_var.stride[0]]=
+            tmp_M2/(tmp_N);
+        out_avg[i0*out_avg.stride[0]]=
+            tmp_avg;
+    }
+}
+    )";
+  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
+  LaunchParams lp(
+      1, // gdimx
+      1, // gdimy
+      1, // gdimz
+      1, // bdimx
+      1, // bdimy
+      1 // bdimz
+  );
+  lp.setSmem(0);
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const std::vector<int64_t> tensor_dims = {x, y, z};
+  auto in0 = at::randn(tensor_dims, options);
+  auto out_var = at::empty({x}, options);
+  auto out_avg = at::empty({x}, options);
+  fe.runRtc(lp, {in0, out_var, out_avg});
+
+  TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var));
+  TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
+}
+
+TEST_F(NVFuserTest, FusionBlockWelford_CUDA) {
+  FusionExecutor fe;
+  int x = 7, y = 8, z = 9;
+
+  std::string kernel = R"(
+__global__ void kernel1(
+    Tensor<float,2> inp,
+    Tensor<float,1> out_avg,
+    Tensor<float,1> out_var,
+    Tensor<float,1> init_avg,
+    Tensor<float,1> init_var,
+    Tensor<long,0> init_N
+){
+    //actual generated kernel will use dynamic shared mem,
+    // here is just for prototype
+    __shared__ float mem_avg[512];
+    __shared__ float mem_M2[512];
+    __shared__ long mem_N[512];
+    float in=inp[threadIdx.x*inp.stride[0]+
+                        threadIdx.y*inp.stride[1]];
+    float tmp_avg=0;
+    float tmp_M2=0;
+    long tmp_N=0;
+    blockWelford<false,true,false>(
+        tmp_avg,
+        tmp_M2,
+        tmp_N,
+        in,
+        0.f,
+        (long)1,
+        threadIdx,
+        blockDim,
+        (float*)mem_avg,
+        (float*)mem_M2,
+        (long*)mem_N,
+        (bool)(threadIdx.x<inp.size[0]),
+        0.f);
+    __syncthreads();
+    if(threadIdx.x<out_var.size[0] && threadIdx.y==0){
+        welfordCombine(
+                    tmp_avg,
+                    tmp_M2,
+                    tmp_N,
+                    init_avg[threadIdx.x*init_avg.stride[0]],
+                    init_var[threadIdx.x*init_var.stride[0]]*init_N[0],
+                    init_N[0]
+                );
+        out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
+        out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/(tmp_N);
+    }
+}
+    )";
+  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
+  LaunchParams lp(
+      1, // gdimx
+      1, // gdimy
+      1, // gdimz
+      x, // bdimx
+      y, // bdimy
+      1 // bdimz
+  );
+  lp.setSmem(0);
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const std::vector<int64_t> tensor_dims = {x, y};
+  const std::vector<int64_t> init_dims = {x, z};
+
+  // generate initial values
+  auto init_in = at::randn(init_dims, options);
+  auto init_var = init_in.var({1}, false);
+  auto init_avg = init_in.mean({1});
+  auto init_N =
+      at::tensor(z, at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0));
+
+  auto in0 = at::randn(tensor_dims, options);
+
+  // run kernel
+  auto out_var = at::zeros({x}, options);
+  auto out_avg = at::zeros({x}, options);
+  fe.runRtc(lp, {in0, out_avg, out_var, init_avg, init_var, init_N});
+
+  // compare with reference output
+  auto cat_tensor = at::cat({init_in, in0}, 1);
+  TORCH_CHECK(cat_tensor.var({1}, false).allclose(out_var));
+  TORCH_CHECK(
+      cat_tensor.mean({1}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
+}
+
+TEST_F(NVFuserTest, FusionBlockWelfordNoInit_CUDA) {
+  FusionExecutor fe;
+  int x = 7, y = 8, z = 9;
+
+  // need support IValue for integer input as initial count
+  std::string kernel = R"(
+__global__ void kernel1(
+    Tensor<float,3> inp,
+    Tensor<float,1> out_avg,
+    Tensor<float,1> out_var
+){
+    //actual generated kernel will use dynamic shared mem,
+    // here is just for prototype
+    __shared__ float mem_avg[512];
+    __shared__ float mem_M2[512];
+    __shared__ long mem_N[512];
+    float in=inp[threadIdx.x*inp.stride[0]+
+                        threadIdx.y*inp.stride[1]+
+                        threadIdx.z*inp.stride[2]];
+    float tmp_avg=0;
+    float tmp_M2=0;
+    long tmp_N=0;
+    block_sync::init();
+    blockWelford<false,true,true>(
+        tmp_avg,
+        tmp_M2,
+        tmp_N,
+        in,
+        0.f,
+        (long) 1,
+        threadIdx,
+        blockDim,
+        (float*)mem_avg,
+        (float*)mem_M2,
+        (long*)mem_N,
+        (bool)(threadIdx.x<inp.size[0]),
+        0.f);
+    __syncthreads();
+    if(threadIdx.x<out_var.size[0] && threadIdx.y==0 && threadIdx.z==0){
+        out_avg[threadIdx.x*out_var.stride[0]]=tmp_avg;
+        out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/(tmp_N);
+    }
+}
+    )";
+  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
+  LaunchParams lp(
+      1, // gdimx
+      1, // gdimy
+      1, // gdimz
+      x, // bdimx
+      y, // bdimy
+      z // bdimz
+  );
+  lp.setSmem(0);
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const std::vector<int64_t> tensor_dims = {x, y, z};
+  auto in0 = at::randn(tensor_dims, options);
+  auto out_var = at::empty({x}, options);
+  auto out_avg = at::empty({x}, options);
+  fe.runRtc(lp, {in0, out_avg, out_var});
+
+  TORCH_CHECK(in0.var({1, 2}, false).allclose(out_var));
+  TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
+}
+
+TEST_F(NVFuserTest, FusionGridWelfordNoInit_CUDA) {
+  FusionExecutor fe;
+  int x = 128, y = 64, z = 128;
+
+  std::string kernel = R"(
+__global__ void kernel1(
+    Tensor<float,3> inp,
+    Tensor<float,1> out_avg,
+    Tensor<float,1> out_var,
+    Tensor<float,1> work_buf_avg,
+    Tensor<float,1> work_buf_M2,
+    Tensor<long,1> work_buf_N,
+    Tensor<int64_t,1> sync_flag
+){
+    __shared__ float shared_buf_avg[512];
+    __shared__ float shared_buf_M2[512];
+    __shared__ long shared_buf_N[512];
+    float tmp_avg=0;
+    float tmp_M2=0;
+    long tmp_N=0;
+    float in = inp[ blockIdx.x  * inp.stride[0]+
+                    blockIdx.y  * inp.stride[1]+
+                    threadIdx.x * inp.stride[2]];
+    block_sync::init();
+    welford::gridWelford<
+        true,true,false,
+        true,false,false,
+        false
+    >(
+        tmp_avg,
+        tmp_M2,
+        tmp_N,
+        in,
+        0.f,
+        (long) 1,
+        &work_buf_avg[0],
+        &work_buf_M2[0],
+        &work_buf_N[0],
+        sync_flag,
+        (float*)shared_buf_avg,
+        (float*)shared_buf_M2,
+        (long*)shared_buf_N,
+        threadIdx.x<out_var.size[0],
+        threadIdx.x<out_var.size[0],
+        0.f,
+        0,
+        1);
+    if(blockIdx.x == gridDim.x - 1 && blockIdx.y == gridDim.y - 1){
+        out_avg[threadIdx.x*out_avg.stride[0]]=tmp_avg;
+        out_var[threadIdx.x*out_var.stride[0]]=tmp_M2/tmp_N;
+    }
+}
+    )";
+  fe.compileRtc(kernel, "CudaCodeGen::kernel1");
+  LaunchParams lp(
+      x, // gdimx
+      y, // gdimy
+      1, // gdimz
+      z, // bdimx
+      1, // bdimy
+      1 // bdimz
+  );
+  lp.setSmem(0);
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const auto options_int =
+      at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+
+  const std::vector<int64_t> tensor_dims = {x, y, z};
+  auto in0 = at::randn(tensor_dims, options);
+
+  auto out_avg = at::empty({z}, options);
+  auto out_var = at::empty({z}, options);
+  auto work_buf_avg = at::empty({x * y * z}, options);
+  auto work_buf_var = at::empty({x * y * z}, options);
+  auto work_buf_N = at::empty({x * y * z}, options_int);
+  auto sync_flag = at::zeros({1}, options_int);
+  fe.runRtc(
+      lp,
+      {in0,
+       out_avg,
+       out_var,
+       work_buf_avg,
+       work_buf_var,
+       work_buf_N,
+       sync_flag});
+  std::vector<int64_t> dims{0, 1};
+
+  TORCH_CHECK(in0.mean(dims).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
+  TORCH_CHECK(in0.var(dims, false).allclose(out_var));
+}
+
+TEST_F(NVFuserTest, FusionWelfordOp_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int M = 64, N = 128;
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
+  auto tvs = Welford(tv1, {1});
+  auto tv_avg = tvs.avg;
+  auto tv_M2 = tvs.var_sum;
+  auto tv_N = tvs.n;
+  fusion.addOutput(tv_avg);
+  fusion.addOutput(tv_M2);
+  fusion.addOutput(tv_N);
+
+  tv_avg->split(1, 32);
+  tv_avg->split(0, 32);
+  tv_avg->split(0, 4);
+  tv_avg->reorder({{-1, -3}, {-3, -1}});
+  tv1->computeAt(tv_avg, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M, N}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto outputs = fe.runFusion({t0});
+
+  // by default Welford outputs sum of square diff so need to divide to get var
+  outputs[1] /= N;
+
+  testValidate(
+      fe.kernel(),
+      outputs,
+      {t0},
+      {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int M = 64, N = 128;
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
+  auto tvs = Welford(tv1, {1});
+  auto tv_avg = tvs.avg;
+  auto tv_M2 = tvs.var_sum;
+  auto tv_N = tvs.n;
+  fusion.addOutput(tv_avg);
+  fusion.addOutput(tv_M2);
+  fusion.addOutput(tv_N);
+
+  tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv1->computeAt(tv_avg, -1);
+
+  //
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M, N}, options);
+  at::Tensor t_var = at::empty({M}, options);
+  at::Tensor t_avg = at::empty({M}, options);
+  at::Tensor t_N = at::empty({M}, options_int);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto outputs = fe.runFusion({t0});
+
+  // by default Welford outputs sum of square diff so need to divide to get var
+  outputs[1] /= N;
+
+  testValidate(
+      fe.kernel(),
+      outputs,
+      {t0},
+      {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int M = 64, N = 128;
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
+  auto tvs = Welford(tv1, {1});
+  auto tv_avg = tvs.avg;
+  auto tv_M2 = tvs.var_sum;
+  auto tv_N = tvs.n;
+  fusion.addOutput(tv_avg);
+  fusion.addOutput(tv_M2);
+  fusion.addOutput(tv_N);
+
+  tv_avg->axis(0)->parallelize(ParallelType::TIDx);
+  tv_avg->axis(-1)->parallelize(ParallelType::BIDx);
+
+  tv1->computeAt(tv_avg, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M, N}, options);
+  at::Tensor t_avg = at::empty({M}, options);
+  at::Tensor t_var = at::empty({M}, options);
+  at::Tensor t_N = at::empty({M}, options_int);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto outputs = fe.runFusion({t0});
+
+  // by default Welford outputs sum of square diff so need to divide to get var
+  outputs[1] /= N;
+
+  testValidate(
+      fe.kernel(),
+      outputs,
+      {t0},
+      {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int M = 64, N = 128;
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
+  auto tvs = Welford(tv1, {1});
+  auto tv_avg = tvs.avg;
+  auto tv_M2 = tvs.var_sum;
+  auto tv_N = tvs.n;
+  fusion.addOutput(tv_avg);
+  fusion.addOutput(tv_M2);
+  fusion.addOutput(tv_N);
+
+  tv_avg->split(1, 4);
+  ir_utils::rfactorHelper(tvs.avg, {2});
+  tv1->computeAt(tv_avg, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M, N}, options);
+  at::Tensor t_avg = at::empty({M}, options);
+  at::Tensor t_var = at::empty({M}, options);
+  at::Tensor t_N = at::empty({M}, options_int);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto outputs = fe.runFusion({t0});
+
+  // by default Welford outputs sum of square diff so need to divide to get var
+  outputs[1] /= N;
+
+  testValidate(
+      fe.kernel(),
+      outputs,
+      {t0},
+      {t0.mean({1}), t0.var({1}, false), at::ones({M}, options_int) * N},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionWelfordSchedule_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int M = 64, N = 128;
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = mul(tv0, IrBuilder::create<Double>(1));
+  auto tvs = Welford(tv1, {1});
+  auto tv_avg = tvs.avg;
+  auto tv_M2 = tvs.var_sum;
+  auto tv_N = tvs.n;
+  fusion.addOutput(tv_avg);
+  fusion.addOutput(tv_M2);
+  fusion.addOutput(tv_N);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M, N}, options);
+  // TODO: Why do we use launch params from here, but not scheduling???
+  auto reduction_params = getReductionHeuristics(&fusion, {t0});
+  scheduleReduction(&fusion, *reduction_params);
+
+  auto lparams = reduction_params->lparams;
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0}, lparams);
+  auto outputs = fe.runFusion({t0}, lparams);
+
+  // by default Welford outputs sum of square diff so need to divide to get var
+  outputs[1] /= N;
+
+  auto at_avg = t0.mean({1});
+  auto at_var = t0.var({1}, false);
+  auto at_n = at::ones({M}, options_int) * N;
+
+  testValidate(
+      fe.kernel(),
+      outputs,
+      {t0},
+      {at_avg, at_var, at_n},
+      __LINE__,
+      __FILE__,
+      "validate welford",
+      reduction_params->lparams);
+}
+
+namespace {
+void testWelford(DataType dtype, int red_axis, int odim, int rdim) {
+  const int axis = red_axis;
+  at::ScalarType aten_dtype = data_type_to_aten(dtype);
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+  TensorView* tv0 = makeSymbolicTensor(2, dtype);
+  bool is_fp16 = dtype == DataType::Half;
+  bool is_bf16 = dtype == DataType::BFloat16;
+  TensorView* tv0_cast = tv0;
+  if (is_fp16 || is_bf16) {
+    tv0_cast = castOp(DataType::Float, tv0);
+  }
+  fusion.addInput(tv0);
+  auto tv1 = mul(tv0_cast, IrBuilder::create<Double>(1));
+  auto tvs = Welford(tv1, {axis});
+  auto tv_avg = tvs.avg;
+  auto tv_M2 = tvs.var_sum;
+  auto tv_N = tvs.n;
+
+  TensorView* avg_cast = tv_avg;
+  TensorView* M2_cast = tv_M2;
+
+  if (is_fp16) {
+    avg_cast = castOp(DataType::Half, tv_avg);
+    M2_cast = castOp(DataType::Half, tv_M2);
+  }
+  if (is_bf16) {
+    avg_cast = castOp(DataType::BFloat16, tv_avg);
+    M2_cast = castOp(DataType::BFloat16, tv_M2);
+  }
+
+  fusion.addOutput(avg_cast);
+  fusion.addOutput(M2_cast);
+  fusion.addOutput(tv_N);
+
+  auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
+  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  std::vector<TensorView*> outputs_of_red;
+  at::Tensor aten_input =
+      (axis ? at::randn({odim, rdim}, options)
+            : at::randn({rdim, odim}, options));
+
+  if (is_fp16 || is_bf16) {
+    outputs_of_red.push_back(avg_cast);
+    outputs_of_red.push_back(M2_cast);
+  }
+
+  auto reduction_params = getReductionHeuristics(&fusion, {aten_input});
+  scheduleReduction(&fusion, *reduction_params);
+
+  auto lparams = reduction_params->lparams;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input}, lparams);
+  auto outputs = fe.runFusion({aten_input}, lparams);
+
+  // by default Welford outputs sum of square diff so need to divide to
+  // get var
+
+  outputs[1] /= rdim;
+
+  auto at_avg = aten_input.mean({axis});
+  auto at_var = aten_input.var({axis}, false);
+  auto at_n =
+      (axis ? at::ones({odim, rdim}, options)
+            : at::ones({rdim, odim}, options));
+  at_n = at_n.sum({axis});
+
+  testValidate(
+      fe.kernel(),
+      outputs,
+      {aten_input},
+      {at_avg, at_var, at_n},
+      __LINE__,
+      __FILE__,
+      "validate welford",
+      reduction_params->lparams);
+}
+} // namespace
+
+TEST_F(NVFuserTest, FusionWelfordShmoo_CUDA) {
+  std::vector<DataType> dtypes = {
+      DataType::Double, DataType::Float, DataType::Half};
+  // TODO: enable this for complex. Currently, complex yields
+  // silent wrong results:
+  //   Detected abs error of: 3.8062
+  //     absolute tolerance was set to 2.23704e-06
+  //     and relative tolerance set to 2.23704e-08
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+  if (at::cuda::getDeviceProperties(0)->major >= 8) {
+    dtypes.insert(dtypes.end(), DataType::BFloat16);
+  }
+#endif
+
+  std::vector<int> red_axis = {1, 0};
+  std::vector<int> output_dims = {160, 320};
+  std::vector<int> red_dims;
+
+  // Tried to cut down the number iterations with just
+  // doing every other power of 2.
+  for (int i = 1; i <= 1024 * 1024; i <<= 2) {
+    red_dims.push_back(i);
+  }
+
+  for (auto dtype : dtypes) {
+    for (auto& axis : red_axis) {
+      for (auto& odim : output_dims) {
+        for (auto& rdim : red_dims) {
+          // TODO: original welford algorithm actually keeps a running sum of
+          // squares, i.e. M_{2n} in the
+          //       cf:
+          //       https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+          //       algorithm notation, and it can reach inf for large numbers
+          //       with half precision. skipping too large volumes for half for
+          //       nwo might need further numerical experiments to re-design
+          //       this.
+          if (rdim > 32768 &&
+              (dtype == DataType::Half || dtype == DataType::BFloat16)) {
+            continue;
+          }
+          testWelford(dtype, axis, odim, rdim);
+        }
+      }
+    }
+  }
+}
+
+namespace {
+void testVarMean(at::ScalarType dtype, int correction, bool keepdim) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  int M = 64, N = 128;
+
+  auto tv0 = makeSymbolicTensor(2, aten_to_data_type(dtype));
+  fusion->addInput(tv0);
+  auto tvs = variance_mean(tv0, {1}, correction, keepdim);
+  auto tv_mean = tvs.mean;
+  auto tv_var = tvs.var;
+  fusion->addOutput(tv_var);
+  fusion->addOutput(tv_mean);
+
+  auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M, N}, options);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto outputs = executor_cache.runFusionWithInputs({t0});
+
+  auto at_var_mean = at::var_mean(t0, {1}, correction, keepdim);
+  std::vector<at::Tensor> aten_outputs = {
+      std::get<0>(at_var_mean), std::get<1>(at_var_mean)};
+
+  testValidate(
+      executor_cache.fusion(), outputs, {t0}, aten_outputs, __LINE__, __FILE__);
+}
+} // namespace
+
+TEST_F(NVFuserTest, FusionVarMean_CUDA) {
+  std::vector<at::ScalarType> dtypes = {at::kFloat, at::kDouble};
+  std::vector<int> corrections = {0, 1};
+  std::vector<bool> keepdims = {false, true};
+  for (auto correction : corrections) {
+    for (auto keepdim : keepdims) {
+      for (auto dtype : dtypes) {
+        testVarMean(dtype, correction, keepdim);
+      }
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+
+  TensorView* tv0 = makeSymbolicTensor(2); // K, M
+  TensorView* tv1 = makeSymbolicTensor(2); // N, K
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv0_t = transpose(tv0);
+  TensorView* tv1_t = transpose(tv1);
+
+  TensorView* tv2 = broadcast(tv0_t, {false, false, true});
+  // tv2[I0, I1, B] = tv0[I0, I1]
+
+  TensorView* tv3 = broadcast(tv1_t, {true, false, false});
+  // tv3[B, I1, I2] = tv1[I1, I2]
+
+  // tv4[I0, I1, I2] = tv2[I0, I1, B] * tv3[B, I1, I2]
+  TensorView* tv4 = mul(tv2, tv3);
+  // tv5[I0, R1, I2] = tv4[I0, I1, I2]
+  TensorView* tv5 = sum(tv4, {1});
+  fusion.addOutput(tv5);
+
+  tv5->split(1, 32);
+  // tv5[I0, R1o, R1i{32}, I2]
+
+  auto tv6 = tv5->rFactor({1});
+  // tv6[I0, R1o, I1i{32}, I2] = tv4[I0, I1, I2]
+  // tv5[I0,    , R1i{32}, I2] = tv6[I0, R1o, I1i{32}, I2]
+
+  tv5->split(0, 4);
+  tv5->split(-1, 4);
+  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
+  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}]
+
+  tv0_t->computeAt(tv5, -1);
+  tv1_t->computeAt(tv5, -1);
+
+  // tv6[I0o, I0i{4}, R1o, I1i{32}, I2o, I2i{4}]
+  // tv5[I0o, I0i{4},    , R1i{32}, I2o, I2i{4}]
+  //--> (line symbolizes compute at location)
+  // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, I1o]
+  // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}|, R1o]
+  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
+
+  tv0_t->computeAt(tv6, -1);
+  tv1_t->computeAt(tv6, -1);
+  // tv4[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, I1o |]
+  // tv6[I0o, I0i{4}, I1i{32}, I2o, I2i{4}, R1o |]
+  // tv5[I0o, I0i{4}, R1i{32}, I2o, I2i{4}|]
+
+  tv5->axis(0)->parallelize(ParallelType::BIDz);
+  tv5->axis(1)->parallelize(ParallelType::TIDz);
+
+  tv5->axis(-2)->parallelize(ParallelType::BIDy);
+  tv5->axis(-1)->parallelize(ParallelType::TIDy);
+
+  tv5->axis(2)->parallelize(ParallelType::TIDx);
+  tv6->axis(2)->parallelize(ParallelType::TIDx);
+
+  constexpr int M = 65, K = 33, N = 17;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({K, M}, options);
+  at::Tensor t1 = at::randn({N, K}, options);
+
+  // Lets specify a few bounds in launch params to make sure it works
+  LaunchParams lparams(1, -1, -1, 32, 4, 4);
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1}, lparams);
+  fe.runFusion({t0, t1}, lparams);
+
+  // Don't specify any launch params
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto aten_output = t0.t().to(at::kDouble).matmul(t1.t().to(at::kDouble));
+
+  testValidate(
+      &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSoftmax3DTransposed_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int tidx = 32;
+  const int dimx = 32;
+  const int dimy = 16;
+  const int dimz = 130;
+
+  // Set up your input tensor views
+  TensorView* input_tv0 = makeSymbolicTensor(3);
+  fusion.addInput(input_tv0);
+
+  TensorView* input_t = transpose(input_tv0, 1, 2);
+
+  TensorView* exp_tv1 = unaryOp(UnaryOpType::Exp, input_t);
+  TensorView* sum_exp_tv2 = sum(exp_tv1, {-1});
+  TensorView* bcast_sum_tv3 = broadcast(sum_exp_tv2, {false, false, true});
+
+  // Replicate exp_tv4 as exp_tv4_copy because exp_tv4 is going to be
+  // computed at sum_exp_rf_tv8.
+  TensorView* input_t_copy = transpose(input_tv0, 1, 2);
+  TensorView* exp_tv1_copy = unaryOp(UnaryOpType::Exp, input_t_copy);
+
+  TensorView* output_tv4 = div(exp_tv1_copy, bcast_sum_tv3);
+
+  fusion.addOutput(output_tv4);
+
+  bcast_sum_tv3->split(-1, tidx);
+
+  sum_exp_tv2->split(-1, tidx);
+  TensorView* sum_exp_rf_tv5 = sum_exp_tv2->rFactor({-2});
+
+  output_tv4->split(-1, tidx);
+
+  input_t->computeAt(sum_exp_rf_tv5, -1);
+  input_t_copy->computeAt(output_tv4, -1);
+
+  TensorView* tensors_to_parallelize[] = {
+      sum_exp_tv2, bcast_sum_tv3, output_tv4, sum_exp_rf_tv5};
+
+  for (auto tv : tensors_to_parallelize) {
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::BIDy);
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({dimx, dimz, dimy}, options);
+
+  at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_input_t = at::transpose(input, 1, 2);
+  auto aten_output = at::_softmax(aten_input_t.to(at::kDouble), -1, false);
+
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) {
+  // Case 1
+  // tv1 = tv0 * 0.5
+  // tv2 = tv1 * -1
+  // tv3 = tv1 + 3
+  // tv4 = tv1 * 2
+  // tv5 = tv3 + tv2
+  // tv6 = tv5 + tv4
+  // tv7 = tv1 + tv4
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  tv0 = transpose(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(0.5));
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(-1.0));
+  TensorView* tv3 = add(tv1, IrBuilder::create<Double>(3.0));
+  TensorView* tv4 = mul(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv5 = add(tv3, tv2);
+
+  TensorView* tv6 = add(tv5, tv4);
+  TensorView* tv7 = add(tv1, tv4);
+
+  fusion.addOutput(tv6);
+  fusion.addOutput(tv7);
+
+  // Lets setup to actually run
+  tv7->merge(0);
+  tv7->split(0, 128);
+  tv7->split(0, 4);
+
+  tv7->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv0->computeAt(tv7, 1);
+
+  // The this-position of the last tensor should be zero.
+  TORCH_CHECK(
+      tv7->nDims() == 3 && tv7->getComputeAtPosition() == 0 &&
+      tv7->getMaxProducerPosition() == 1);
+  TORCH_CHECK(
+      tv6->nDims() == 3 && tv6->getComputeAtPosition() == 0 &&
+      tv6->getMaxProducerPosition() == 1);
+  // The position of every other tensor should be 1.
+  for (auto tv : {tv1, tv2, tv3, tv4, tv5}) {
+    TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1);
+  }
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::randn({129, 127}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  at::Tensor aten_input_t = aten_input.t();
+
+  auto t1 = aten_input_t.mul({0.5});
+  auto t2 = t1.mul({-1.0});
+  auto t3 = t1.add({3.0});
+  auto t4 = t1.mul({2.0});
+  auto t5 = t3.add(t2);
+  auto t6 = t5.add(t4);
+  auto t7 = t1.add(t4);
+
+  std::vector<at::Tensor> aten_outputs = {t6, t7};
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) {
+  // Case 2
+  // tv1 = tv0 * -1
+  // tv2 = tv0 + 3
+  // tv3 = tv0 * 2
+  // tv4 = tv2 + tv1
+  // tv5 = tv4 + tv3
+  // tv6 = tv5 + tv3
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  tv0 = transpose(tv0);
+
+  TensorView* tv1 = mul(tv0, IrBuilder::create<Double>(-1.0));
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(3.0));
+  TensorView* tv3 = mul(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv4 = add(tv2, tv1);
+
+  TensorView* tv5 = add(tv4, tv3);
+  TensorView* tv6 = add(tv5, tv3);
+
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv6);
+
+  // Lets setup to actually run
+  tv6->merge(0);
+  tv6->split(0, 128);
+  tv6->split(0, 4);
+
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv0->computeAt(tv6, 1);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({129, 127}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  auto input_t = input.t();
+  auto t1 = input_t.mul({-1.0});
+  auto t2 = input_t.add({3.0});
+  auto t3 = input_t.mul({2.0});
+  auto t4 = t2.add(t1);
+  auto t5 = t4.add(t3);
+  auto t6 = t5.add(t3);
+
+  std::vector<at::Tensor> aten_outputs = {t5, t6};
+
+  testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) {
+  // Case 3
+  // T2 = T1 * 0.979361
+  // T3 = T2 * T0
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+
+  tv0 = permute(tv0, {3, 0, 1, 2});
+
+  TensorView* tv1 = makeSymbolicTensor(4);
+  fusion.addInput(tv1);
+
+  tv1 = permute(tv1, {3, 0, 1, 2});
+
+  TensorView* tv2 = mul(tv1, IrBuilder::create<Double>(.979361));
+  TensorView* tv3 = mul(tv2, tv0);
+
+  fusion.addOutput(tv3);
+
+  // Lets setup to actually run
+  while (tv3->nDims() > 1)
+    tv3->merge(0);
+  tv3->split(0, 128);
+  tv3->split(0, 4);
+
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto t0_t = t0.permute({3, 0, 1, 2});
+  auto t1_t = t1.permute({3, 0, 1, 2});
+  auto t2 = t1_t.mul({0.979361});
+  auto aten_output = t2.mul(t0_t);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) {
+  // Case 4
+  // T4 = T2 - T3
+  // T5 = T1 + T4
+  // T6 = T5 - T0
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(4);
+  fusion.addInput(tv0);
+
+  tv0 = permute(tv0, {3, 0, 1, 2});
+
+  TensorView* tv1 = makeSymbolicTensor(4);
+  fusion.addInput(tv1);
+
+  tv1 = permute(tv1, {3, 0, 1, 2});
+
+  TensorView* tv2 = makeSymbolicTensor(4);
+  fusion.addInput(tv2);
+
+  tv2 = permute(tv2, {3, 0, 1, 2});
+
+  TensorView* tv3 = makeSymbolicTensor(4);
+  fusion.addInput(tv3);
+
+  tv3 = permute(tv3, {3, 0, 1, 2});
+
+  TensorView* tv4 = sub(tv2, tv3);
+  TensorView* tv5 = add(tv1, tv4);
+  TensorView* tv6 = sub(tv5, tv0);
+
+  fusion.addOutput(tv6);
+
+  // Lets setup to actually run
+  while (tv6->nDims() > 1)
+    tv6->merge(0);
+  tv6->split(0, 128);
+  tv6->split(0, 4);
+
+  tv0->computeAt(tv6, 1);
+  tv1->computeAt(tv6, 1);
+  tv2->computeAt(tv6, 1);
+  tv3->computeAt(tv6, 1);
+
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+
+  for (Val* val : fusion.vals()) {
+    if (!val->isFusionInput() &&
+        val->getValType().value() == ValType::TensorView) {
+      TensorView* tv = static_cast<TensorView*>(val);
+
+      tv->axis(1)->parallelize(ParallelType::Unroll);
+      tv->axis(-1)->parallelize(ParallelType::TIDx);
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({129, 127, 63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+  at::Tensor t2 = at::rand_like(t0, options);
+  at::Tensor t3 = at::rand_like(t0, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1, t2, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto t0_t = t0.permute({3, 0, 1, 2});
+  auto t1_t = t1.permute({3, 0, 1, 2});
+  auto t2_t = t2.permute({3, 0, 1, 2});
+  auto t3_t = t3.permute({3, 0, 1, 2});
+  auto t4 = t2_t.sub(t3_t);
+  auto t5 = t1_t.add(t4);
+  auto aten_output = t5.sub(t0_t);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) {
+  // Case 5
+  // tv2 = tv0 + 2.0
+  // tv3 = tv1 * tv2
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  tv0 = transpose(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  tv1 = transpose(tv1);
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = mul(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  tv3->merge(0);
+  tv3->split(-1, 8);
+  tv3->split(-1, 4);
+
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto t2 = t0.t().add(2.0);
+  auto aten_output = t1.t().mul(t2);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  tv0 = transpose(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  tv1 = transpose(tv1);
+  TensorView* tv2 = add(tv0, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = mul(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  tv2->merge(0);
+  tv2->split(-1, 8);
+  tv2->split(-1, 4);
+  tv3->merge(0);
+  tv3->split(-1, 8);
+
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({63, 65}, options);
+  at::Tensor t1 = at::rand_like(t0, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto t2 = t0.t().add(2.0);
+  auto aten_output = t1.t().mul(t2);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSegmentReducePointwise_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(1);
+  TensorView* tv2 = makeSymbolicTensor(2);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  fusion->addInput(tv2);
+
+  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
+  TensorView* tv4 =
+      max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
+  TensorView* tv5 = add(tv4, tv1); //  Group 0 (Non Broadcast after reduce,
+                                   //  keeps normalization scheduler away)
+  TensorView* tv6 = add(tv5, tv2); //  Group 1 (Broadcast after reduce)
+
+  fusion->addOutput(tv6);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({128, 65}, options);
+  at::Tensor t1 = at::randn({65}, options);
+  at::Tensor t2 = at::randn({128, 65}, options);
+
+  auto t3 = t0.add(1.0);
+  auto t4 = std::get<0>(at::max(t3, 0));
+  auto t5 = t4.add(t1);
+  auto t6 = t5.add(t2);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
+
+  TORCH_CHECK(
+      executor_cache.getMostRecentKernelRuntime()->isSegmented(),
+      "segmentation didn't happen");
+  TORCH_CHECK(
+      executor_cache.getMostRecentKernelRuntime()
+              ->fusionSegments()
+              ->groups()
+              .size() == 2,
+      "segmentation didn't happen as expected");
+
+  testValidate(
+      executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMultipleVectorize_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* tv0 = makeContigTensor(1);
+  TensorView* tv1 = makeContigTensor(1);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  TensorView* tv3 = add(tv0, tv1);
+  fusion->addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({40960}, options);
+  at::Tensor t1 = at::randn({40960}, options);
+  auto t2 = t0 + t1;
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  executor_cache.profile(true);
+
+  auto outputs = executor_cache.runFusionWithInputs({t0, t1});
+  auto runtime1 = executor_cache.getMostRecentKernelRuntime();
+  auto log1 =
+      executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
+  TORCH_CHECK(log1 != nullptr);
+  TORCH_CHECK(log1->vectorize);
+
+  testValidate(
+      executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
+
+  t0 = at::randn({40964}, options);
+  t1 = at::randn({40964}, options);
+  t2 = t0 + t1;
+
+  outputs = executor_cache.runFusionWithInputs({t0, t1});
+  auto runtime2 = executor_cache.getMostRecentKernelRuntime();
+  auto log2 =
+      executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
+  TORCH_CHECK(log2 != nullptr);
+  TORCH_CHECK(log2->vectorize);
+
+  testValidate(
+      executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
+
+  t0 = at::randn({40962}, options);
+  t1 = at::randn({40962}, options);
+  t2 = t0 + t1;
+
+  outputs = executor_cache.runFusionWithInputs({t0, t1});
+  auto runtime3 = executor_cache.getMostRecentKernelRuntime();
+  auto log3 =
+      executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
+  TORCH_CHECK(log3 != nullptr);
+  TORCH_CHECK(log3->vectorize);
+
+  testValidate(
+      executor_cache.fusion(), outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
+
+  TORCH_CHECK(runtime1 == runtime2);
+  TORCH_CHECK(runtime1 != runtime3);
+}
+
+TEST_F(NVFuserTest, FusionVectorizeSimple_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeContigTensor(3);
+
+  fusion.addInput(tv0);
+
+  auto tv1 = unaryOp(UnaryOpType::Sin, tv0);
+
+  fusion.addOutput(tv1);
+
+  auto tv0_cache = tv0->cacheAfter();
+
+  auto tv1_cache = tv1->cacheBefore();
+
+  tv1->merge(0);
+  tv1->merge(0);
+  tv1->split(0, 4);
+  tv1->split(0, 128);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv0->computeAt(tv1, 2);
+
+  tv0_cache->axis(2)->parallelize(ParallelType::Vectorize);
+  tv1->axis(2)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor aten_input = at::empty({2, 6, 32}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {aten_input});
+  auto cg_outputs = fe.runFusion({aten_input});
+
+  at::Tensor aten_output = aten_input.sin();
+
+  testValidate(
+      &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+  // dimensionality of the problem
+  int nDims = 3;
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeContigTensor(nDims);
+  TensorView* tv1 = makeContigTensor(nDims);
+
+  // Register your inputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // Do math with it, it returns a `Val*` but can be static_casted back to
+  // TensorView
+  TensorView* tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  TensorView* tv3 = add(tv0, tv2);
+
+  // Register your outputs
+  fusion.addOutput(tv3);
+
+  auto tv0_cache = tv0->cacheAfter();
+  auto tv1_cache = tv1->cacheAfter();
+  auto tv3_cache = tv3->cacheBefore();
+
+  // Do transformations, remember, transformations are outputs to inputs
+  // This doesn't have to be in this order
+  tv3->merge(1);
+
+  // Split by n_threads
+  tv3->split(1, 2);
+  tv3->split(0, 3);
+  tv3->split(0, 1);
+
+  // [bidx, unswitch, unroll{2}, tidx, vectorize{2}]
+
+  // Parallelize TV3
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(1)->parallelize(ParallelType::Unswitch);
+  tv3->axis(2)->parallelize(ParallelType::Unroll);
+  tv3->axis(3)->parallelize(ParallelType::TIDx);
+
+  tv3->reorder({{4, 2}});
+  // [bidx, unswitch, vectorize{2}, unroll{2}, tidx]
+
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+  scheduler_utils::parallelizeAllLike(tv3);
+
+  tv0_cache->axis(2)->parallelize(ParallelType::Vectorize);
+  tv1_cache->axis(2)->parallelize(ParallelType::Vectorize);
+  tv3->axis(2)->parallelize(ParallelType::Vectorize);
+
+  // For all inputs, computeAt the output inline, temporaries should be squeezed
+  // between them
+  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+  tv1->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input1 = at::randn({64, 2, 128}, options);
+  at::Tensor input2 = at::rand_like(input1);
+  at::Tensor output = at::empty_like(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1, input2});
+  fe.runFusion({input1, input2}, {output});
+
+  at::Tensor tv2_ref = input2 + 2.0;
+  at::Tensor output_ref = input1 + tv2_ref;
+
+  TORCH_CHECK(output_ref.equal(output));
+}
+
+TEST_F(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  std::vector<int64_t> input_shape{32, 64, 8};
+  const int kReductionAxis = 1;
+
+  auto tv0 = TensorViewBuilder()
+                 .ndims(input_shape.size())
+                 .dtype(DataType::Double)
+                 .build();
+
+  fusion->addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv2 = sum(tv1, {2}); // Group 0
+
+  auto output = softmax(tv2, kReductionAxis); // Group 1
+  fusion->addOutput(output);
+
+  auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0);
+  at::Tensor at_x = at::randn(input_shape, options);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto outputs = executor_cache.runFusionWithInputs({at_x});
+
+  auto t1 = at_x.add(1.0);
+  auto t2 = t1.sum({2});
+  auto t3 = at::_softmax(t2.to(at::kDouble), -1, false);
+
+  auto optimized_fusion = executor_cache.getMostRecentKernelRuntime();
+  TORCH_CHECK(optimized_fusion->isSegmented(), "segmentation didn't happen");
+  TORCH_CHECK(
+      optimized_fusion->fusionSegments()->groups().size() == 2,
+      "segmentation didn't happen as expected");
+
+  testValidate(
+      executor_cache.fusion(), outputs, {at_x}, {t3}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSwizzle1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
+  fusion.addOutput(tv2);
+
+  tv2->split(0, 7);
+  tv2->split(0, 9);
+
+  tv0->computeAt(tv2, 1);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->swizzle(SwizzleType::Transpose, {1, 2});
+
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+  tv1->axis(2)->parallelize(ParallelType::TIDy);
+
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+  tv2->axis(2)->parallelize(ParallelType::TIDy);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({100}, options);
+
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = (t0 + 1) * 2;
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSwizzle2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
+  fusion.addOutput(tv2);
+
+  tv1->split(-1, 4);
+  tv1->split(-2, 4);
+
+  tv2->split(-1, 4);
+  tv2->split(-2, 4);
+
+  tv0->computeAt(tv2, 1);
+
+  tv2->reorder({{-1, -2}});
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->swizzle(SwizzleType::Transpose, {-2, -1});
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-2)->parallelize(ParallelType::TIDy);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-2)->parallelize(ParallelType::TIDy);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({123}, options);
+
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = (t0 + 1) * 2;
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionGridPersistence_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = broadcast(tv1, {true});
+  auto tv3 = add(tv0, tv2);
+  fusion.addOutput(tv3);
+
+  std::vector<TensorView*> tvs = {tv1, tv2, tv3};
+  for (auto tv : tvs) {
+    tv->split(0, 2);
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::BIDy);
+  }
+
+  const int numel_x = 10;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto out = fe.runFusion({input});
+
+  auto aten_output = input.sum({0}).unsqueeze(-1).add(input);
+
+  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionGridPersistence2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = broadcast(tv1, {true, false});
+  auto tv3 = add(tv0, tv2);
+  fusion.addOutput(tv3);
+
+  std::vector<TensorView*> tvs = {tv1, tv2, tv3};
+  for (auto tv : tvs) {
+    tv->split(0, 2);
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::TIDy);
+    tv->axis(2)->parallelize(ParallelType::TIDx);
+  }
+
+  const int numel_x = 10;
+  const int numel_y = 3;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto out = fe.runFusion({input});
+
+  auto aten_output = input.sum({0}).unsqueeze(0).add(input);
+
+  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionWelfordPersistence_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tvs = Welford(tv0, {0});
+  auto tv4 = add(tvs.avg, tvs.var_sum);
+  auto tv5 = broadcast(tv4, {true});
+  auto tv6 = add(tv0, tv5);
+  fusion.addOutput(tv6);
+
+  std::vector<TensorView*> schedule_tvs = {
+      tvs.avg, tvs.var_sum, tvs.n, tv5, tv6};
+
+  for (auto tv : schedule_tvs) {
+    tv->split(0, 2);
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::BIDy);
+  }
+
+  const int numel_x = 10;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto out = fe.runFusion({input});
+
+  auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x))
+                         .unsqueeze(-1)
+                         .add(input);
+
+  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionWelfordPersistence2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tvs = Welford(tv0, {0});
+  auto tv4 = add(tvs.avg, tvs.var_sum);
+  auto tv5 = broadcast(tv4, {true, false});
+  auto tv6 = add(tv0, tv5);
+  fusion.addOutput(tv6);
+
+  std::vector<TensorView*> schedule_tvs = {
+      tvs.avg, tvs.var_sum, tvs.n, tv5, tv6};
+  for (auto tv : schedule_tvs) {
+    tv->split(0, 2);
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::TIDy);
+    tv->axis(2)->parallelize(ParallelType::TIDx);
+  }
+  tv4->axis(0)->parallelize(ParallelType::TIDx);
+
+  const int numel_x = 10;
+  const int numel_y = 3;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto out = fe.runFusion({input});
+
+  auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x))
+                         .unsqueeze(0)
+                         .add(input);
+
+  testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue633_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int dx = 10;
+  const int dy = 11;
+  const int dz = 12;
+
+  auto tv0 = makeConcreteTensor({dx, dy, dz});
+  fusion.addInput(tv0);
+  auto tv1 = makeConcreteTensor({dx, dy, 1});
+  fusion.addInput(tv1);
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  tv2->merge(1);
+  tv2->merge(0);
+  tv2->split(-1, 128);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({dx, dy, dz}, options);
+  at::Tensor t1 = at::randn({dx, dy, 1}, options);
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0 + t1;
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> shape{17, 19};
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  auto tv2 = broadcast(tv0, {false, true});
+  auto tv3 = add(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  tv3->split(1, 128);
+  tv0->computeAt(tv3, 2);
+
+  for (auto tv : {tv2, tv3}) {
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({shape[0]}, options);
+  at::Tensor t1 = at::randn(shape, options);
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto t3 = t0.unsqueeze(-1).expand(shape) + t1;
+
+  testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(2);
+  auto tv1 = makeContigTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  const int kTDX = 64;
+  const int kVecSize = 4;
+  const int kNumElems = kTDX * kVecSize;
+
+  tv2->split(1, kNumElems);
+
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+  auto c2 = tv2->cacheBefore();
+
+  tv2->split(-1, kVecSize);
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int bx = 128;
+  const int by = 457;
+  at::Tensor t0 = at::randn({bx, by}, options);
+  at::Tensor t1 = at::randn({bx, by}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0 + t1;
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(4);
+  auto tv1 = makeContigTensor(4);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  tv2->reorder({{0, 1}, {1, 0}});
+  tv2->merge(-2);
+
+  const int kTDX = 64;
+  const int kVecSize = 2;
+  const int kNumElems = kTDX * kVecSize;
+
+  tv2->split(-1, kNumElems);
+
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+  auto c2 = tv2->cacheBefore();
+
+  tv2->split(0, 128);
+  tv2->split(-1, kVecSize);
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::BIDy);
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int n = 32;
+  const int c = 127;
+  const int h = 51;
+  const int w = 23;
+  at::Tensor t0 = at::randn({n, c, h, w}, options);
+  at::Tensor t1 = at::randn({n, c, h, w}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0 + t1;
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  constexpr int kNumDims = 4;
+  constexpr int kTDX = 64;
+  constexpr int kVecSize = 2;
+  constexpr int kNumElems = kTDX * kVecSize;
+
+  auto tv0 = makeSymbolicTensor(kNumDims);
+  auto tv1 = makeSymbolicTensor(kNumDims);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  // Create caches for vectorization
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+  auto c2 = tv2->cacheBefore();
+
+  // Merge all dimensions together except inner-most dim
+  for (const auto idx : c10::irange(kNumDims - 2)) {
+    tv2->merge(0);
+  }
+  // Split inner-most dim
+  tv2->split(-1, kNumElems);
+  tv2->split(-1, kVecSize);
+  TransformPropagatorWithCheck propagator(tv2);
+  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  // Parallelization Strategy
+  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(2)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int n = 5;
+  const int c = 3;
+  const int h = 51;
+  const int w = 257;
+  at::Tensor t0 = at::randn({n, c, h, w}, options);
+  at::Tensor t1 = at::randn({n, c, h, w}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0 + t1;
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  constexpr int kNumDims = 4;
+  constexpr int kTDX = 64;
+  constexpr int kVecSize = 2;
+  constexpr int kNumElems = kTDX * kVecSize;
+  std::vector<int64_t> bcast_shape{1, 1, 1, -1};
+
+  auto tv0 = makeContigTensor(kNumDims);
+  auto tv1 = TensorViewBuilder().shape(bcast_shape).build();
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  // Create caches for vectorization
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+  auto c2 = tv2->cacheBefore();
+
+  // Merge all dimensions together
+  // Backward merge order is necessary for vectorize validation
+  for (int idx = kNumDims - 1; idx > 0; --idx) {
+    tv2->merge(idx - 1);
+  }
+  tv2->split(-1, kNumElems);
+  tv2->split(-1, kVecSize);
+  TransformPropagatorWithCheck propagator(tv2);
+  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  // Parallelization Strategy
+  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int n = 32;
+  const int c = 128;
+  const int h = 51;
+  const int w = 23;
+  at::Tensor t0 = at::randn({n, c, h, w}, options);
+  at::Tensor t1 = at::randn({1, 1, 1, w}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  // TODO: throw assertion - cannot merge non-contiguous vectorization axes
+  // Make sure compilation fails
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
+}
+
+TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(2);
+  auto tv1 = makeContigTensor(2);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+
+  auto tv3 = sum(tv2, {-1});
+
+  fusion.addOutput(tv3);
+
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+
+  tv3->split(-1, 128 * 4);
+  tv3->split(-1, 4);
+  // Reduce outer dim first
+  auto tv4 = tv3->rFactor({-3, -1});
+  // Tv3 will reduce threads
+
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv0->computeAt(tv4, -2);
+  tv1->computeAt(tv4, -2);
+
+  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  tv4->axis(-2)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv2->computeAt(tv4, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int bx = 128;
+  const int by = 2050;
+  at::Tensor t0 = at::randn({bx, by}, options);
+  at::Tensor t1 = at::randn({bx, by}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0.add(t1).sum(1);
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(2);
+  auto tv1 = makeContigTensor(2);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  tv2->split(1, 16);
+  tv2->split(1, 64);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(2)->parallelize(ParallelType::TIDx);
+
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+  auto c2 = tv2->cacheBefore();
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
+  for (auto tv : vectorized_tvs) {
+    tv->split(-1, 4);
+    // Vectorize the wrong dimension
+    tv->axis(-2)->parallelize(ParallelType::MisalignedVectorize);
+  }
+
+  FusionExecutor fe;
+  // Make sure compilation fails
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
+}
+
+TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(2);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  const int kTDX = 64;
+  const int kVecSize = 4;
+  const int kNumElems = kTDX * kVecSize;
+
+  tv2->split(1, kNumElems);
+
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+
+  tv2->split(-1, kVecSize);
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int bx = 128;
+  const int by = 2049;
+  at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)});
+  at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)});
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0 + t1;
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(2);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  const int kTDX = 64;
+  const int kVecSize = 4;
+  const int kNumElems = kTDX * kVecSize;
+
+  tv2->split(1, kNumElems);
+
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+  auto c2 = tv2->cacheBefore();
+
+  tv2->split(-1, kVecSize);
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  c0->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+  c1->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::MisalignedVectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int bx = 128;
+  const int by = 2049;
+  at::Tensor t0 = at::randn({bx, by}, options).index({"...", Slice(3)});
+  at::Tensor t1 = at::randn({bx, by}, options).index({"...", Slice(3)});
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+
+  // Failure because the input + output tensors do not have the same stride
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
+}
+
+TEST_F(NVFuserTest, FusionVectorization1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  tv2->split(1, 16);
+  tv2->split(1, 64);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(2)->parallelize(ParallelType::TIDx);
+
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+  auto c2 = tv2->cacheBefore();
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
+  for (auto tv : vectorized_tvs) {
+    tv->split(-1, 4);
+    tv->axis(-1)->parallelize(ParallelType::Vectorize);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int bx = 128;
+  const int by = 2048;
+  at::Tensor t0 = at::randn({bx, by}, options);
+  at::Tensor t1 = at::randn({bx, by}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0 + t1;
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionVectorization2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  tv2->split(1, 16);
+  tv2->split(1, 64);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(2)->parallelize(ParallelType::TIDx);
+
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+  auto c2 = tv2->cacheBefore();
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
+  for (auto tv : vectorized_tvs) {
+    tv->split(-1, 4);
+    // Vectorize the wrong dimension
+    tv->axis(-2)->parallelize(ParallelType::Vectorize);
+  }
+
+  FusionExecutor fe;
+  // Make sure compilation fails
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
+}
+
+TEST_F(NVFuserTest, FusionVectorization3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  tv2->split(1, 16);
+  tv2->split(1, 64);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(2)->parallelize(ParallelType::TIDx);
+
+  auto c0 = tv0->cacheAfter();
+  auto c1 = tv1->cacheAfter();
+  auto c2 = tv2->cacheBefore();
+
+  c0->computeAt(tv2, -2);
+  c1->computeAt(tv2, -2);
+
+  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
+  for (auto tv : vectorized_tvs) {
+    tv->split(-1, 4);
+    tv->axis(-1)->parallelize(ParallelType::Vectorize);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int bx = 128;
+  const int by = 2049;
+  at::Tensor t0 = at::randn({bx, by}, options);
+  at::Tensor t1 = at::randn({bx, by}, options);
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
+
+  aten_inputs[0] = t0.index({"...", Slice(1)});
+  aten_inputs[1] = t1.index({"...", Slice(1)});
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion(aten_inputs));
+
+  t0 = at::randn({bx, 2048}, options).index({"...", Slice(4)});
+  t1 = at::randn({bx, 2048}, options).index({"...", Slice(4)});
+  aten_inputs = {t0, t1};
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0 + t1;
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+
+  auto tv3 = sum(tv2, {-1});
+
+  fusion.addOutput(tv3);
+
+  tv3->split(-1, 128 * 4);
+  tv3->split(-1, 4);
+  // Reduce outer dim first
+  auto tv4 = tv3->rFactor({-3, -1});
+  // Tv3 will reduce threads
+
+  auto tv6 = tv0->cacheAfter();
+  auto tv7 = tv1->cacheAfter();
+
+  tv0->computeAt(tv3, 1);
+  tv1->computeAt(tv3, 1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv0->computeAt(tv4, -2);
+  tv1->computeAt(tv4, -2);
+
+  tv6->axis(-1)->parallelize(ParallelType::Vectorize);
+  tv7->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  tv4->axis(-2)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const int bx = 128;
+  const int by = 2048;
+  at::Tensor t0 = at::randn({bx, by}, options);
+  at::Tensor t1 = at::randn({bx, by}, options);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto aten_output = t0.add(t1).sum(1);
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+
+  auto t3 = t0.add(t1).sum(1);
+
+  testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__);
+}
+
+// Unswitched loops with extent one may omit else clause.
+TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Progressively broadcast tensors
+  TensorView* tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  TensorView* tv2 = makeSymbolicTensor(3);
+  fusion.addInput(tv2);
+
+  TensorView* tv3 = broadcast(tv0, {false, true});
+  TensorView* tv4 = add(tv3, tv1);
+  TensorView* tv5 = add(tv4, tv2);
+
+  fusion.addOutput(tv5);
+
+  // Split inner dimension
+  tv5->split(1, 8);
+  // Merge middle dims with outer dimensions
+  tv5->merge(2);
+  tv5->merge(0);
+
+  // tv5[I0*I1o, I1i*I2]
+  // Get a dim of size 1 to unswitch
+  tv5->split(0, 1, false);
+
+  // Compute everything inline
+  tv0->computeAt(tv5, -1);
+
+  tv5->axis(0)->parallelize(ParallelType::Unswitch);
+  tv5->axis(1)->parallelize(ParallelType::BIDx);
+  tv5->axis(2)->parallelize(ParallelType::TIDx);
+
+  // Make sure the unswitched loop does not have an else clause.
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(!UnswitchInElseChecker::check(gpulw));
+
+  const int x = 11;
+  const int y = 12;
+  const int z = 13;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({x}, options);
+  at::Tensor t1 = at::randn({x, y}, options);
+  at::Tensor t2 = at::randn({z, x, y}, options);
+  std::vector<IValue> aten_inputs = {t0, t1, t2};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+  auto t6 = (t0.unsqueeze(-1) + t1).unsqueeze(0) + t2;
+
+  testValidate(&fusion, cg_outputs, aten_inputs, {t6}, __LINE__, __FILE__);
+}
+
+// The unswitched loop has extent one but inner loops don't. The else
+// part should not be omitted.
+TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int x = 15;
+  auto tv0 = makeConcreteTensor({x});
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv1);
+
+  tv1->split(-1, 4);
+  tv1->split(-2, 1);
+
+  tv1->axis(-2)->parallelize(ParallelType::Unswitch);
+
+  // Make sure the size-one unswitched loop does not omit the else clause.
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(UnswitchInElseChecker::check(gpulw));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({x}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+  auto t1 = t0 + 1;
+
+  testValidate(&fusion, cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDy);
+
+  // Invalid as tv1 and tv2 do have the same ParallelType
+  FusionExecutor fe;
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
+}
+
+TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDy);
+  tv1->setMemoryType(MemoryType::Shared);
+
+  // tv1 and tv2 do have the same ParallelType, but tv1 is on shared
+  // memory, so it is valid
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+}
+
+TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  tv1->split(-1, 4);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->split(-1, 4);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv1->setMemoryType(MemoryType::Global);
+
+  // tv1 and tv2 have the same shape and ParallelType
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+}
+
+TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  tv1->split(-1, 4);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->split(-1, 8);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv1->setMemoryType(MemoryType::Global);
+
+  // tv1 and tv2 do not have the same shape but global memory comm is supported.
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+}
+
+TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  tv1->split(-1, 4);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->setMemoryType(MemoryType::Shared);
+
+  tv2->split(-1, 8);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  // tv1 and tv2 do not have the same shape, but tv1 is on shared
+  // memory, so it is valid
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+}
+
+// See issue #995
+TEST_F(NVFuserTest, FusionValidateParallelize6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int64_t W = 5, X = 6, Y = 7, Z = 8;
+
+  auto tv0 = makeConcreteTensor({X, Y, Z});
+  auto tv1 = makeConcreteTensor({W, X, Y, Z});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv3 = broadcast(tv2, {true, false, false, false});
+  auto tv4 = add(tv3, tv1);
+  fusion.addOutput(tv4);
+
+  tv4->merge(0);
+  tv4->merge(0);
+  tv4->merge(0);
+  tv4->split(0, 4);
+  tv4->split(0, 3);
+  tv4->split(0, 2);
+
+  TransformPropagatorWithCheck propagator(tv4);
+  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
+
+  tv0->computeAt(tv2, 2);
+  tv3->computeAt(tv4, 2);
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  // Validation should throw an exception saying the first axes of tv2
+  // and tv3 have incompatible parallelization. See also issue #995.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fusion.printKernel());
+}
+
+// Repro of #2046
+TEST_F(NVFuserTest, FusionValidateParallelize7_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = set(tv1);
+  auto tv3 = set(tv1);
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Global);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv2->axis(1)->parallelize(ParallelType::TIDy);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  // tv2 uses tv1 but is not parallelized with BIDx, so a grid sync is
+  // required. It should be placed as a top-level expression.
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      std::any_of(
+          gpulw.kernel()->topLevelExprs().begin(),
+          gpulw.kernel()->topLevelExprs().end(),
+          [](Expr* expr) { return expr->isA<kir::GridSync>(); }),
+      "Grid sync not found");
+}
+
+TEST_F(NVFuserTest, FusionDAGMerging_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(5);
+  auto tv1 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // Branch 0
+  auto tv2 = sum(tv0, {0}); // 0
+  auto tv3 = sum(tv2, {0}); // 1
+  auto tv4 = sum(tv3, {0}); // 2
+  auto tv5 = sum(tv4, {0}); // 3
+
+  // Branch 1
+  auto tv6 = add(tv1, IrBuilder::create<Double>(1)); // 4
+
+  // Merge
+  auto tv7 = add(tv6, tv5); // 5
+
+  // Maximum expected output groups (can improve overtime):
+  //  {0}, {1}, {2}, {3,4,5}
+  //  without final merge would have been {0}, {1}, {2}, {3,4}, {5}
+
+  fusion.addOutput(tv7);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({2, 2, 2, 2, 2}, options);
+  at::Tensor t1 = at::randn({2}, options);
+
+  std::vector<at::Tensor> aten_inputs = {t0, t1};
+
+  KernelArgumentHolder args(KernelIndexMode::INT32);
+  args.setDeviceIndex(0);
+  args.push(aten_inputs);
+
+  auto fusion_segments = fusion.segment(args);
+  TORCH_CHECK(fusion_segments->groups().size() <= 4);
+}
+
+TEST_F(NVFuserTest, FusionDAGScalarMerging_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(3);
+  auto i0 = IrBuilder::create<Double>();
+
+  fusion->addInput(tv0);
+  fusion->addInput(i0);
+
+  auto i1 = add(i0, IrBuilder::create<Double>(1.0));
+  auto i2 = mul(i1, i1);
+  auto i3 = add(i2, i1);
+
+  // Branch 0
+  auto tv1 = sum(tv0, {0}); // 0
+  auto tv2 = add(tv1, i2);
+  // Branch 1
+  auto tv3 = sum(tv2, {0}); // 1
+  auto tv4 = add(tv3, i3);
+
+  auto tv5 = add(tv4, i0);
+
+  fusion->addOutput(tv5);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({16, 16, 16}, options);
+  double s0 = 0.5;
+
+  auto s1 = s0 + 1.0;
+  auto s2 = s1 * s1;
+  auto s3 = s2 + s1;
+  auto t1 = t0.sum({0});
+  auto t2 = t1 + s2;
+  auto t3 = sum(t2, {0});
+  auto t4 = t3 + s3;
+  auto t5 = t4 + s0;
+
+  auto outputs = executor_cache.runFusionWithInputs({t0, s0});
+
+  TORCH_CHECK(
+      executor_cache.getMostRecentKernelRuntime()->isSegmented(),
+      "segmentation didn't happen");
+  TORCH_CHECK(
+      executor_cache.getMostRecentKernelRuntime()
+              ->fusionSegments()
+              ->groups()
+              .size() == 2,
+      "segmentation didn't happen as expected");
+
+  testValidate(
+      executor_cache.fusion(), outputs, {t0, s0}, {t5}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  constexpr int M = 10;
+  constexpr int N = 20;
+  constexpr int K = 20;
+
+  auto tv0 = makeSymbolicTensor(3);
+  auto tv1 = sum(tv0, {{1, 2}});
+  fusion.addInput(tv0);
+  fusion.addOutput(tv1);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M, N, K}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+  at::Tensor aten_output = t0.sum({1, 2});
+  testValidate(
+      &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  constexpr int M = 10;
+  constexpr int N = 20;
+  constexpr int K = 20;
+
+  auto tv0 = makeSymbolicTensor(3);
+  auto tvs = Welford(tv0, {{1, 2}});
+  fusion.addInput(tv0);
+  auto tv_avg = tvs.avg;
+  auto tv_M2 = tvs.var_sum;
+  auto tv_N = tvs.n;
+  fusion.addOutput(tv_avg);
+  fusion.addOutput(tv_M2);
+
+  tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
+  tv_avg->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M, N, K}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+  at::Tensor aten_avg = t0.mean({1, 2});
+  at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K;
+  testValidate(
+      &fusion, outputs, aten_inputs, {aten_avg, aten_M2}, __LINE__, __FILE__);
+}
+
+// See Issue #716
+TEST_F(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  constexpr int M = 10;
+  constexpr int N = 11;
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  std::vector<int> reduction_axes = {1};
+  std::vector<bool> broadcast_mask = {false, true};
+
+  auto tv0_bcast = broadcast(tv0, broadcast_mask);
+  auto path1_bcast = add(tv0_bcast, IrBuilder::create<Double>(1.0));
+  auto path1 = sum(path1_bcast, reduction_axes);
+  fusion.addOutput(path1);
+
+  auto p = path1->split(1, 1);
+  path1->rFactor({1});
+  path1->axis(0)->parallelize(ParallelType::BIDx);
+  tv0->computeAt(path1, 1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({M}, options);
+  at::Tensor t0_ref = t0.clone();
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+
+  // inplace op, we are adding t0 to itself
+  auto outputs = fe.runFusion(aten_inputs, {t0});
+
+  TORCH_CHECK(outputs[0].allclose(t0_ref.add(1)));
+}
+
+TEST_F(NVFuserTest, FusionReductionPredicate_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {0});
+  fusion.addOutput(tv1);
+
+  auto tv2 = tv0->cacheAfter();
+
+  const int bdimx = 128;
+  tv1->split(1, bdimx);
+  tv1->split(1, 4);
+  tv1->split(1, 1);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(2)->parallelize(ParallelType::Unroll);
+  tv1->split(0, 10);
+  tv0->computeAt(tv1, 4);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  int numel_x = 650;
+  int numel_y = 102;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({numel_x, numel_y}, options);
+  at::Tensor cg_output = at::empty({numel_y}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  fe.runFusion({input}, {cg_output});
+
+  auto aten_output = input.to(at::kDouble).sum({0});
+
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue728_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addOutput(tv0);
+  auto tv1 = makeSymbolicTensor(1);
+  fusion.addOutput(tv1);
+  auto tv2 = makeSymbolicTensor(1);
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv4 = add(tv3, tv1);
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  auto tv6 = add(tv2, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv6);
+
+  // tv0 -> tv3 -+
+  // tv1 --------+-> tv4 -> tv5
+  //
+  // tv2 -> tv6
+
+  auto all_vals_under_tv3 =
+      DependencyCheck::getAllValsBetween({tv3}, fusion.outputs());
+  std::unordered_set<Val*> included_tensors({tv3, tv4, tv5});
+  for (auto tv : included_tensors) {
+    TORCH_CHECK(
+        std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) !=
+            all_vals_under_tv3.end(),
+        "TV",
+        tv->name(),
+        " not found");
+  }
+  for (auto tv : ir_utils::filterByType<TensorView>(fusion.vals())) {
+    if (included_tensors.find(tv) == included_tensors.end()) {
+      TORCH_CHECK(
+          std::find(all_vals_under_tv3.begin(), all_vals_under_tv3.end(), tv) ==
+              all_vals_under_tv3.end(),
+          "TV",
+          tv->name(),
+          " should not be found");
+    }
+  }
+
+  auto no_dependency = DependencyCheck::getAllValsBetween({}, fusion.outputs());
+  TORCH_CHECK(no_dependency.empty(), "No val should be returned");
+
+  auto no_dep_path = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv6});
+  TORCH_CHECK(no_dep_path.empty(), "No val should be returned");
+
+  auto no_dep_path2 = DependencyCheck::getAllValsBetween({tv2}, {tv5});
+  TORCH_CHECK(no_dep_path2.empty(), "No val should be returned");
+
+  auto just_tv3 = DependencyCheck::getAllValsBetween({tv3}, {tv3});
+  TORCH_CHECK(
+      just_tv3.size() == 1 && *(just_tv3.begin()) == tv3,
+      "Only tv3 should be included");
+}
+
+TEST_F(NVFuserTest, FusionIssue757_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = makeSymbolicTensor(2);
+  fusion.addInput(tv3);
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  tv1->computeAt(tv4, -1);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+
+  int numel_x = 650;
+  int numel_y = 102;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
+  at::Tensor t3 = at::randn({numel_x, numel_y}, options);
+  std::vector<IValue> inputs = {t0, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, inputs);
+  auto outputs = fe.runFusion(inputs);
+
+  auto t1 = t0.sum({1});
+  auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y});
+  auto t4 = t2 + t3;
+
+  testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
+}
+
+// See issue #759
+TEST_F(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = makeSymbolicTensor(2);
+  fusion.addInput(tv3);
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  tv4->split(0, 4);
+  tv1->computeAt(tv4, -1);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDy);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->axis(1)->parallelize(ParallelType::TIDy);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+
+  int numel_x = 100;
+  int numel_y = 101;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
+  at::Tensor t3 = at::randn({numel_x, numel_y}, options);
+  std::vector<IValue> inputs = {t0, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, inputs);
+  auto outputs = fe.runFusion(inputs);
+
+  auto t1 = t0.sum({1});
+  auto t2 = t1.unsqueeze(-1).expand({numel_x, numel_y});
+  auto t4 = t2 + t3;
+
+  testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSegmentVerticalMerge_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(3);
+
+  fusion->addInput(tv0);
+  // {first kernel}
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = add(tv1, tv0);
+  auto tv3 = sum(tv2, {0});
+  auto tv4 = add(tv3, tv0);
+  auto tv5 = sum(tv4, {0});
+  auto tv6 = sum(tv5, {0});
+  // {second kernel}
+  auto tv7 = add(tv6, tv5);
+  auto tv8 = add(tv7, tv5);
+  auto tv9 = sum(tv8, {0});
+
+  fusion->addOutput(tv9);
+
+  SegmentCandidateFinderOptions segment_options;
+  segment_options.run_herrmann_merge = false;
+  segment_options.run_final_merge = false;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({2, 2, 2}, options);
+
+  KernelArgumentHolder args(KernelIndexMode::INT32);
+  args.setDeviceIndex(0);
+  args.push(t0);
+
+  auto segmented_fusion =
+      SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
+
+  TORCH_CHECK(segmented_fusion->groups().size() == 2);
+}
+
+TEST_F(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(3);
+  auto i0 = IrBuilder::create<Double>();
+
+  fusion->addInput(tv0);
+  fusion->addInput(i0);
+
+  // Branch 0 {first kernel}
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = add(tv0, i0);
+  auto tv3 = unaryOp(UnaryOpType::Rsqrt, tv2);
+  auto tv4 = sum(tv3, {0});
+
+  // Branch 1 {first kernel}
+  auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv3);
+  auto tv6 = sum(tv5, {0});
+
+  // Incompatible {second kernel}
+  auto tv7 = sum(tv6, {0});
+
+  fusion->addOutput(tv1);
+  fusion->addOutput(tv4);
+  fusion->addOutput(tv7);
+
+  SegmentCandidateFinderOptions segment_options;
+  segment_options.run_herrmann_merge = false;
+  segment_options.run_final_merge = false;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({2, 2, 2}, options);
+
+  KernelArgumentHolder args(KernelIndexMode::INT32);
+  args.setDeviceIndex(0);
+  args.push(t0);
+  c10::IValue scalar = 1.0;
+  args.push(scalar);
+
+  auto segmented_fusion =
+      SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
+
+  TORCH_CHECK(segmented_fusion->groups().size() == 2);
+}
+
+TEST_F(NVFuserTest, FusionSegmentMixReduction_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(3);
+
+  fusion->addInput(tv0);
+
+  // def of tv1 in kernel 1 through horizontal
+  auto tv1 = sum(tv0, {0, 1});
+  // kernel 2
+  auto tv2 = sum(tv0, {2});
+  auto tv3 = broadcast(tv2, {false, false, true});
+  auto tv4 = add(tv0, tv3);
+  auto tv5 = sum(tv4, {2});
+  // end of kernel 2
+  // kernel 1
+  auto tv6 = unaryOp(UnaryOpType::Rsqrt, tv0);
+  auto tv7 = sum(tv6, {0, 1});
+  auto tv8 = sum(tv6, {0, 1});
+
+  fusion->addOutput(tv1);
+  fusion->addOutput(tv5);
+  fusion->addOutput(tv7);
+  fusion->addOutput(tv8);
+
+  SegmentCandidateFinderOptions segment_options;
+  segment_options.run_herrmann_merge = false;
+  segment_options.run_final_merge = false;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({2, 2, 2}, options);
+
+  KernelArgumentHolder args(KernelIndexMode::INT32);
+  args.setDeviceIndex(0);
+  args.push(t0);
+
+  auto segmented_fusion =
+      SegmentCandidateFinder::segment(fusion.get(), args, segment_options);
+
+  TORCH_CHECK(segmented_fusion->groups().size() <= 2);
+}
+
+TEST_F(NVFuserTest, FusionSBAR_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // N, H, W, C format
+  std::vector<int64_t> input_shape{656, 7, 7, 64};
+
+  auto x = makeContigTensor(4);
+  auto y = makeContigTensor(4);
+  auto weight = makeContigTensor(1);
+  auto bias = makeContigTensor(1);
+
+  fusion.addInput(x);
+  fusion.addInput(y);
+  fusion.addInput(weight);
+  fusion.addInput(bias);
+
+  const size_t kNumberOfDims = x->nDims();
+  std::vector<bool> broadcast_mask(kNumberOfDims, false);
+  for (const auto axis : c10::irange(kNumberOfDims - 1)) {
+    broadcast_mask[axis] = true;
+  }
+
+  auto weight_bcast = broadcast(weight, broadcast_mask);
+  auto scale = mul(x, weight_bcast);
+  auto bias_bcast = broadcast(bias, broadcast_mask);
+  auto scale_bias = add(scale, bias_bcast);
+  auto scale_bias_add = add(scale_bias, y);
+  auto scale_bias_add_relu = unaryOp(UnaryOpType::Relu, scale_bias_add);
+
+  fusion.addOutput(scale_bias_add_relu);
+
+  // inputs
+  at::manual_seed(0);
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_x = at::randn(input_shape, options);
+  at::Tensor at_y = at::randn(input_shape, options);
+  at::Tensor at_weight = at::ones({input_shape[3]}, options);
+  at::Tensor at_bias = at::zeros({input_shape[3]}, options);
+
+  // inputs
+  std::vector<c10::IValue> inputs = {at_x, at_y, at_weight, at_bias};
+
+  // outputs
+  std::vector<at::Tensor> outputs;
+
+  auto lparams = schedulePointwise(&fusion, inputs);
+
+  FusionExecutor executor;
+  executor.compileFusion(&fusion, inputs, lparams);
+  outputs = executor.runFusion(inputs, lparams);
+
+  auto at_scale = at::mul(at_x, at_weight);
+  auto at_scale_bias = at::add(at_scale, at_bias);
+  auto pwise_add = at::add(at_scale_bias, at_y);
+  auto output = at::relu(pwise_add);
+
+  testValidate(&fusion, outputs, inputs, {output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSingleElement_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(0);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(2.5));
+
+  auto tv2 = add(tv1, IrBuilder::create<Double>(3.5));
+  fusion.addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({}, options);
+
+  at::Tensor cg_output = at::empty({}, options);
+
+  auto lparams = schedulePointwise(&fusion, {input});
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input}, lparams);
+  fe.runFusion({input}, {cg_output}, lparams);
+
+  auto aten_output = input.add(2.5).add(3.5);
+
+  testValidate(
+      &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBNBackwardRepro_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int batch = 4;
+  int c = 4;
+  int h = 4;
+  int w = 4;
+  int numDims = 4;
+
+  auto input = makeSymbolicTensor(numDims);
+  fusion.addInput(input);
+  auto weight = makeSymbolicTensor(1);
+  fusion.addInput(weight);
+  auto running_mean = makeSymbolicTensor(1);
+  fusion.addInput(running_mean);
+  auto running_var = makeSymbolicTensor(1);
+  fusion.addInput(running_var);
+  auto save_mean = makeSymbolicTensor(1);
+  fusion.addInput(save_mean);
+  auto save_invstd = makeSymbolicTensor(1);
+  fusion.addInput(save_invstd);
+
+  auto grad_out_prev = makeSymbolicTensor(numDims);
+  fusion.addInput(grad_out_prev);
+  auto gt_0 =
+      makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous.
+  fusion.addInput(gt_0);
+
+  auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create<Int>(1));
+  auto gt_float = castOp(DataType::Float, gt_bool);
+
+  auto grad_out = mul(grad_out_prev, gt_float);
+
+  Val* eps_ptr = IrBuilder::create<Double>(1e-5);
+
+  auto grads = batch_norm_backward(
+      input,
+      grad_out,
+      weight,
+      running_mean,
+      running_var,
+      save_mean,
+      save_invstd,
+      true,
+      eps_ptr,
+      {true, true, true});
+
+  fusion.addOutput(grads.grad_input);
+  fusion.addOutput(grads.grad_weight);
+  fusion.addOutput(grads.grad_bias);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input0 = at::randn({batch, c, h, w}, options);
+  at::Tensor input1 = at::randn({c}, options);
+  at::Tensor input2 = at::randn_like(input1);
+  at::Tensor input3 = at::randn_like(input1);
+  at::Tensor input4 = at::randn_like(input1);
+  at::Tensor input5 = at::randn_like(input1);
+  at::Tensor input6 = at::randn_like(input0);
+  at::Tensor input7 = at::randn_like(input0);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  std::vector<IValue> inputs = {
+      input0, input1, input2, input3, input4, input5, input6, input7};
+  auto outputs = fec.runFusionWithInputs(inputs);
+}
+
+// TODO: We only changed inputs, merge this with the test above.
+TEST_F(NVFuserTest, FusionBNBackwardRepro2_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int batch = 2;
+  int c = 81;
+  int h = 1;
+  int w = 1;
+  int numDims = 4;
+
+  // auto input = makeSymbolicTensor(numDims);
+  auto input = makeConcreteTensor({-1, -1, 1, 1});
+  fusion.addInput(input);
+  auto weight = makeSymbolicTensor(1);
+  fusion.addInput(weight);
+  auto running_mean = makeSymbolicTensor(1);
+  fusion.addInput(running_mean);
+  auto running_var = makeSymbolicTensor(1);
+  fusion.addInput(running_var);
+  auto save_mean = makeSymbolicTensor(1);
+  fusion.addInput(save_mean);
+  auto save_invstd = makeSymbolicTensor(1);
+  fusion.addInput(save_invstd);
+
+  // auto grad_out_prev = makeSymbolicTensor(numDims);
+  auto grad_out_prev = makeConcreteTensor({-1, -1, 1, 1});
+  fusion.addInput(grad_out_prev);
+  // auto gt_0 =
+  //     makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous.
+  auto gt_0 = makeConcreteTensor({-1, -1, 1, 1});
+  fusion.addInput(gt_0);
+
+  auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create<Int>(1));
+  auto gt_float = castOp(DataType::Float, gt_bool);
+
+  auto grad_out = mul(grad_out_prev, gt_float);
+
+  Val* eps_ptr = IrBuilder::create<Double>(1e-5);
+
+  auto grads = batch_norm_backward(
+      input,
+      grad_out,
+      weight,
+      running_mean,
+      running_var,
+      save_mean,
+      save_invstd,
+      true,
+      eps_ptr,
+      {true, true, true});
+
+  fusion.addOutput(grads.grad_input);
+  fusion.addOutput(grads.grad_weight);
+  fusion.addOutput(grads.grad_bias);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input0 = at::randn({batch, c, h, w}, options);
+  at::Tensor input1 = at::randn({c}, options);
+  at::Tensor input2 = at::randn_like(input1);
+  at::Tensor input3 = at::randn_like(input1);
+  at::Tensor input4 = at::randn_like(input1);
+  at::Tensor input5 = at::randn_like(input1);
+  at::Tensor input6 = at::randn_like(input0);
+  at::Tensor input7 = at::randn_like(input0);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  std::vector<IValue> inputs = {
+      input0, input1, input2, input3, input4, input5, input6, input7};
+  auto outputs = fec.runFusionWithInputs(inputs);
+}
+
+TEST_F(NVFuserTest, FusionBNRepro_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  const bool kTraining = true;
+  const float kMomentum = 0.1;
+  const float kEps = 1e-5;
+
+  int batch = 14;
+  int c = 65;
+  int h = 7;
+  int w = 7;
+  int numDims = 4;
+
+  auto input = makeSymbolicTensor(numDims);
+  fusion.addInput(input);
+  auto weight = makeSymbolicTensor(1);
+  fusion.addInput(weight);
+  auto bias = makeSymbolicTensor(1);
+  fusion.addInput(bias);
+  auto running_mean = makeSymbolicTensor(1);
+  fusion.addInput(running_mean);
+  auto running_var = makeSymbolicTensor(1);
+  fusion.addInput(running_var);
+
+  auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
+  auto eps_ptr = IrBuilder::create<Double>(kEps);
+
+  auto result = batch_norm(
+      input,
+      weight,
+      bias,
+      running_mean,
+      running_var,
+      kTraining,
+      momentum_ptr,
+      eps_ptr);
+
+  fusion.addOutput(result.output);
+  fusion.addOutput(result.mean);
+  fusion.addOutput(result.invstd);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({batch, c, h, w}, options);
+  at::Tensor input2 = at::randn({c}, options);
+  at::Tensor input3 = at::randn_like(input2);
+  at::Tensor input4 = at::randn_like(input2);
+  at::Tensor input5 = at::randn_like(input2);
+
+  auto input1_ref = input1.clone();
+  auto input2_ref = input2.clone();
+  auto input3_ref = input3.clone();
+  auto input4_ref = input4.clone();
+  auto input5_ref = input5.clone();
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  std::vector<IValue> aten_inputs = {input1, input2, input3, input4, input5};
+  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
+
+  auto at_results = at::native_batch_norm(
+      input1_ref,
+      input2_ref,
+      input3_ref,
+      input4_ref,
+      input5_ref,
+      kTraining,
+      kMomentum,
+      kEps);
+
+  auto at_output = std::get<0>(at_results);
+  auto at_mean = std::get<1>(at_results);
+  auto at_invstd = std::get<2>(at_results);
+
+  std::vector<at::Tensor> aten_outputs = {at_output, at_mean, at_invstd};
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBNRepro2_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  const bool kTraining = true;
+  const float kMomentum = 0.1;
+  const float kEps = 1e-5;
+
+  int batch = 2;
+  int c = 4;
+  int h = 17;
+  int w = 17;
+  int numDims = 4;
+
+  auto input = makeSymbolicTensor(numDims);
+  fusion.addInput(input);
+
+  Val* momentum_ptr = IrBuilder::create<Double>(kMomentum);
+  Val* eps_ptr = IrBuilder::create<Double>(kEps);
+
+  auto result = batch_norm(
+      input,
+      nullptr,
+      nullptr,
+      nullptr,
+      nullptr,
+      kTraining,
+      momentum_ptr,
+      eps_ptr);
+
+  fusion.addOutput(result.output);
+  fusion.addOutput(result.mean);
+  fusion.addOutput(result.invstd);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({batch, c, h, w}, options);
+
+  auto input1_ref = input1.clone();
+  at::Tensor r_m;
+  at::Tensor r_v;
+  at::Tensor weight;
+  at::Tensor bias;
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  std::vector<IValue> aten_inputs = {input1};
+  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
+
+  auto at_results = at::native_batch_norm(
+      input1_ref, r_m, r_v, weight, bias, kTraining, kMomentum, kEps);
+
+  auto at_output = std::get<0>(at_results);
+  auto at_mean = std::get<1>(at_results);
+  auto at_invstd = std::get<2>(at_results);
+
+  std::vector<at::Tensor> aten_outputs = {at_output, at_mean, at_invstd};
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionZeroSizeTensorPW_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeConcreteTensor({0});
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(2.5));
+  fusion.addOutput(tv2);
+
+  // This test used to just have:
+  // auto tv3 = makeConcreteTensor({0});
+  // and somehow that was running through our system fine, but size-0 tensors
+  // are not supported, so making sure this fails.
+  auto tv3 = set(tv1);
+  fusion.addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input0 = at::randn({2}, options);
+  at::Tensor input1 = at::randn({0}, options);
+  at::Tensor cg_output2 = at::empty({2}, options);
+  at::Tensor cg_output3 = at::empty({0}, options);
+
+  // Fails at schedule pointwise because our (maybe only) size-0 check is in
+  // binding input sizes which the scheduler ends up calling.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(schedulePointwise(&fusion, {input0, input1}));
+}
+
+TEST_F(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeConcreteTensor({0});
+  fusion.addInput(tv1);
+
+  auto tv2 = sum(tv0, {1});
+  fusion.addOutput(tv2);
+
+  auto tv3 = makeConcreteTensor({0});
+  fusion.addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input0 = at::randn({2, 4}, options);
+  at::Tensor input1 = at::randn({0}, options);
+  at::Tensor cg_output2 = at::empty({2}, options);
+  at::Tensor cg_output3 = at::empty({0}, options);
+
+  auto reduction_params = getReductionHeuristics(&fusion, {input0, input1});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  scheduleReduction(&fusion, *reduction_params);
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+
+  auto lparams = reduction_params->lparams;
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input0, input1}, lparams);
+  auto cg_outputs = fe.runFusion({input0, input1}, lparams);
+  auto aten_output2 = input0.sum({1});
+  at::Tensor aten_output3 = at::empty({0}, options);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {input0, input1},
+      {aten_output2, aten_output3},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeConcreteTensor({0});
+  fusion.addInput(tv1);
+
+  auto tv2 = sum(tv0, {0});
+  auto tv3 = broadcast(tv2, {true, false});
+  auto tv4 = add(tv0, tv3);
+  fusion.addOutput(tv4);
+
+  auto tv5 = makeConcreteTensor({0});
+  fusion.addOutput(tv5);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input0 = at::randn({2, 4}, options);
+  at::Tensor input1 = at::randn({0}, options);
+  at::Tensor cg_output2 = at::empty({2, 4}, options);
+  at::Tensor cg_output3 = at::empty({0}, options);
+
+  auto reduction_params = getPersistentHeuristics(&fusion, {input0, input1});
+  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+  schedulePersistentKernel(&fusion, *reduction_params);
+
+  auto lparams = reduction_params->lparams;
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input0, input1}, lparams);
+  auto cg_outputs = fe.runFusion({input0, input1}, lparams);
+  auto aten_output2 = input0.sum({0}).add(input0);
+  at::Tensor aten_output3 = at::empty({0}, options);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {input0, input1},
+      {aten_output2, aten_output3},
+      __LINE__,
+      __FILE__,
+      "",
+      lparams);
+}
+
+TEST_F(NVFuserTest, FusionSegmentIoAlias_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(1);
+  TensorView* tv2 = makeSymbolicTensor(2);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  fusion->addInput(tv2);
+
+  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
+  TensorView* tv4 =
+      max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
+  TensorView* tv5 = add(tv4, tv1); //  Group 0 (Non Broadcast after reduce,
+                                   //  keeps normalization scheduler away)
+  TensorView* tv6 = add(tv5, tv2); //  Group 1 (Broadcast after reduce)
+
+  // Note: test alias;
+  fusion->aliasOutputToInput(tv6, tv0);
+  // TODO: support output on aliased fusion #1488
+  // remove tv7 after #1488
+  // fusion->addOutput(tv6);
+  TensorView* tv7 = add(tv6, IrBuilder::create<Double>(1)); // Group 0
+  fusion->addOutput(tv7);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({128, 65}, options);
+  at::Tensor t1 = at::randn({65}, options);
+  at::Tensor t2 = at::randn({128, 65}, options);
+
+  auto t3 = t0.add(1.0);
+  auto t4 = std::get<0>(at::max(t3, 0));
+  auto t5 = t4.add(t1);
+  auto t6 = t5.add(t2);
+  auto t7 = t6.add(1.0);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto outputs = executor_cache.runFusionWithInputs({t0, t1, t2});
+
+  // TODO: support output on aliased fusion #1488
+  // validating aliasing
+  // TORCH_INTERNAL_ASSERT(outputs[0].data_ptr() == t0.data_ptr());
+
+  TORCH_CHECK(
+      executor_cache.getMostRecentKernelRuntime()->isSegmented(),
+      "segmentation didn't happen");
+  TORCH_CHECK(
+      executor_cache.getMostRecentKernelRuntime()
+              ->fusionSegments()
+              ->groups()
+              .size() == 2,
+      "segmentation didn't happen as expected");
+
+  testValidate(
+      executor_cache.fusion(), outputs, {t0, t1, t2}, {t7}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionWelford1Output_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+
+  auto tvs = Welford(tv0, {1});
+  fusion->addOutput(tvs.var_sum);
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({128, 65}, options);
+  auto outputs = executor_cache.runFusionWithInputs({t0});
+
+  auto t1 = t0.var({1}, false) * 65;
+  testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTranslate1Welford_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+
+  auto tvs = Welford(tv0, {1});
+  auto tv_out = add(tv0, broadcast(tvs.avg, {false, true}));
+  fusion->addOutput(tv_out);
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+
+  auto run_test = [&executor_cache,
+                   fusion](auto inner_size) -> FusionKernelRuntime* {
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::Tensor t0 = at::randn({128, inner_size}, options);
+    auto outputs = executor_cache.runFusionWithInputs({t0});
+    // Square sums does not fit well in the testValidate assumptions,
+    //  so we just compare the divided output here.
+    testValidate(
+        fusion,
+        outputs,
+        {t0},
+        {t0.add(t0.mean({1}).unsqueeze(1))},
+        __LINE__,
+        __FILE__);
+
+    return executor_cache.getMostRecentKernelRuntime();
+  };
+
+  // Run a translated welford
+  auto runtime1 = run_test(64);
+  // Check it was translated
+  TORCH_CHECK(
+      runtime1->fusionSegments()->groups().size() == 1 &&
+      runtime1->fusionSegments()->groups()[0]->exprs().size() > 2);
+
+  // Run an un-translated welford
+  auto runtime2 = run_test(65536);
+
+  bool found_welford = false;
+  for (auto group : runtime2->fusionSegments()->groups()) {
+    for (auto expr : group->exprs()) {
+      if (expr->isA<WelfordOp>()) {
+        found_welford = true;
+      }
+    }
+  }
+  TORCH_CHECK(found_welford);
+}
+
+TEST_F(NVFuserTest, FusionTranslate2Welford_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+
+  auto tvs1 = Welford(tv0, {1});
+  auto tv_out1 = add(tv0, broadcast(tvs1.avg, {false, true}));
+  fusion->addOutput(tv_out1);
+
+  auto tvs2 = Welford(tv0, {1});
+  auto tv_out2 = add(tv0, broadcast(tvs2.avg, {false, true}));
+  fusion->addOutput(tv_out2);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+
+  auto run_test = [&executor_cache,
+                   fusion](auto inner_size) -> FusionKernelRuntime* {
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::Tensor t0 = at::randn({128, inner_size}, options);
+    auto outputs = executor_cache.runFusionWithInputs({t0});
+
+    // Square sums does not fit well in the testValidate assumptions,
+    //  so we just compare the divided output here.
+    auto out = t0.add(t0.mean({1}).unsqueeze(1));
+    testValidate(fusion, outputs, {t0}, {out, out}, __LINE__, __FILE__);
+
+    return executor_cache.getMostRecentKernelRuntime();
+  };
+
+  // Run a translated welford
+  auto runtime1 = run_test(64);
+  // Check it was translated
+  TORCH_CHECK(
+      runtime1->fusionSegments()->groups().size() == 1 &&
+      runtime1->fusionSegments()->groups()[0]->exprs().size() > 4);
+
+  // Run an un-translated welford
+  auto runtime2 = run_test(65536);
+  // // Check it was not translated
+  bool found_welford = false;
+  for (auto group : runtime2->fusionSegments()->groups()) {
+    for (auto expr : group->exprs()) {
+      if (expr->isA<WelfordOp>()) {
+        found_welford = true;
+      }
+    }
+  }
+  TORCH_CHECK(found_welford);
+}
+
+TEST_F(NVFuserTest, FusionLargeWelfordNormalization_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+
+  auto tvs1 = Welford(tv0, {1});
+  auto sum_of_tv0 = sum(tv0, {1});
+
+  fusion->addOutput(tvs1.var_sum);
+  fusion->addOutput(sum_of_tv0);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+
+  auto run_test = [&executor_cache,
+                   fusion](auto inner_size) -> FusionKernelRuntime* {
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::Tensor t0 = at::randn({128, inner_size}, options);
+    auto outputs = executor_cache.runFusionWithInputs({t0});
+
+    auto t1 = t0.var({1}, false) * inner_size;
+    auto t2 = t0.sum({1});
+    testValidate(fusion, outputs, {t0}, {t1, t2}, __LINE__, __FILE__);
+
+    return executor_cache.getMostRecentKernelRuntime();
+  };
+
+  auto runtime = run_test(65536);
+  TORCH_CHECK(!runtime->isSegmented());
+}
+
+TEST_F(NVFuserTest, FusionWelfordOuterPersistence_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+
+  auto tvs1 = Welford(tv0, {1});
+  auto sum_of_tv0 = sum(tv0, {1});
+  auto sum_bcasted = broadcast(sum_of_tv0, {false, true});
+  auto avg_bcasted = broadcast(tvs1.avg, {false, true});
+  auto tv0_plus_sum = add(tv0, sum_bcasted);
+  auto tv0_plus_avg = add(tv0, avg_bcasted);
+
+  fusion->addOutput(tv0_plus_sum);
+  fusion->addOutput(tv0_plus_avg);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+
+  auto run_test = [&executor_cache,
+                   fusion](auto inner_size) -> FusionKernelRuntime* {
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::Tensor t0 = at::randn({128, inner_size}, options);
+    auto outputs = executor_cache.runFusionWithInputs({t0});
+
+    auto t1 = t0.to(c10::kDouble).mean({1}).unsqueeze(1) + t0;
+    auto t2 = t0.to(c10::kDouble).sum({1}).unsqueeze(1) + t0;
+    testValidate(fusion, outputs, {t0}, {t2, t1}, __LINE__, __FILE__);
+
+    return executor_cache.getMostRecentKernelRuntime();
+  };
+
+  for (auto inner_size : {4096, 8192, 32768}) {
+    auto runtime = run_test(inner_size);
+    TORCH_CHECK(!runtime->isSegmented());
+  }
+}
+
+TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  auto tv2 = sum(tv0, {0});
+  auto tv3 = sum(tv1, {1});
+  fusion->addOutput(tv2);
+  fusion->addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({16, 16}, options);
+  at::Tensor t1 = at::randn({16, 16}, options);
+
+  FusionExecutorCache fusion_executor_cache(std::move(fusion));
+  fusion_executor_cache.runFusionWithInputs({t0, t1});
+}
+
+TEST_F(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(1);
+  auto tv1 = makeSymbolicTensor(2);
+  auto tv2 = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  auto tv3 = broadcast(tv0, {false, true, true, true});
+  auto tv4 = broadcast(tv1, {false, false, true, true});
+  auto tv5 = unaryOp(UnaryOpType::Rsqrt, tv2);
+
+  auto tv6 = add(tv3, tv5);
+  auto tv7 = add(tv4, tv5);
+  auto tv8 = add(tv3, tv4);
+
+  auto tv9 = add(tv6, tv7);
+  auto tv10 = add(tv9, tv8);
+
+  fusion->addOutput(tv10);
+
+  tv0->computeAt(tv10, -2);
+  tv1->computeAt(tv10, -2);
+  tv2->computeAt(tv10, -2);
+
+  TORCH_CHECK(tv3->getComputeAtPosition() == 1);
+  TORCH_CHECK(tv4->getComputeAtPosition() == 2);
+  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
+
+  TORCH_CHECK(tv6->getMaxProducerPosition() == 3);
+  TORCH_CHECK(tv7->getMaxProducerPosition() == 3);
+  TORCH_CHECK(tv8->getMaxProducerPosition() == 2);
+}
+
+TEST_F(NVFuserTest, FusionBackOffInnerBroadcast2_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(3);
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  auto tv2 = broadcast(tv0, {false, false, true});
+  auto tv3 = add(tv2, tv1);
+
+  fusion->addOutput(tv3);
+  tv3->split(-2, 4);
+  tv3->reorder({{-1, -2}});
+  tv0->computeAt(tv3, -2);
+  tv1->computeAt(tv3, -2);
+  TORCH_CHECK(tv2->getComputeAtPosition() == 2);
+  TORCH_CHECK(tv3->getMaxProducerPosition() == 2);
+}
+
+TEST_F(NVFuserTest, FusionBackOffInnerBroadcast3_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(4);
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  auto tv2 = broadcast(tv0, {false, false, true});
+  auto tv3 = broadcast(tv2, {false, true, false, false});
+  auto tv4 = add(tv3, tv1);
+
+  fusion->addOutput(tv4);
+  tv0->computeAt(tv4, -1);
+  tv1->computeAt(tv4, -1);
+  TORCH_CHECK(tv2->getComputeAtPosition() == 2);
+  TORCH_CHECK(tv3->getMaxProducerPosition() == 3);
+}
+
+TEST_F(NVFuserTest, FusionSimpleWarp_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = add(tv2, tv0);
+
+  fusion->addOutput(tv3);
+
+  tv1->split(1, 32);
+  auto tv1_rf = tv1->rFactor({1});
+  TransformPropagatorWithCheck propagator(tv1_rf);
+  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
+  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({16, 128}, options);
+
+  auto at_output = input1.sum({1}, true).add(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+
+  testValidate(
+      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSimpleWarpPad_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = add(tv2, tv0);
+
+  fusion->addOutput(tv3);
+
+  // Schedule a persistent kernel
+  auto tv0_cache = tv0->cacheAfter();
+  tv1->split(1, 8, false);
+  auto tv1_rf = tv1->rFactor({1});
+  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
+  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1_rf->axis(-1)->padToMultipleOfWarp(32);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->padToMultipleOfWarp(32);
+  TransformPropagatorWithCheck propagator(tv1_rf);
+  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
+  tv0->axis(-1)->parallelize(ParallelType::TIDx);
+  tv0->axis(-1)->padToMultipleOfWarp(32);
+  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
+  tv0_cache->axis(-1)->padToMultipleOfWarp(32);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->padToMultipleOfWarp(32);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->padToMultipleOfWarp(32);
+
+  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({16, 127}, options);
+
+  auto at_output = input1.sum({1}, true).add(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+  testValidate(
+      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionWarpPadMergeSplit_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(3);
+
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {1, 2});
+  auto tv2 = broadcast(tv1, {false, true, true});
+  auto tv3 = add(tv2, tv0);
+
+  fusion->addOutput(tv3);
+
+  // Schedule a persistent kernel
+  auto tv0_cache = tv0->cacheAfter();
+  tv1->merge(1);
+  tv1->split(1, 8, false);
+
+  auto tv1_rf = tv1->rFactor({1});
+  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
+  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->padToMultipleOfWarp();
+  TransformPropagatorWithCheck propagator(tv1_rf);
+  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
+  tv0->axis(-1)->parallelize(ParallelType::TIDx);
+  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({16, 17, 128}, options);
+
+  auto at_output = input1.sum({1, 2}, true).add(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+  testValidate(
+      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSerialWarpReduction_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(3);
+
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {1, 2});
+  auto tv2 = broadcast(tv1, {false, true, true});
+  auto tv3 = add(tv2, tv0);
+
+  fusion->addOutput(tv3);
+
+  // Schedule a persistent kernel
+  auto tv0_cache = tv0->cacheAfter();
+  tv1->merge(1);
+  tv1->split(1, 8, false);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->padToMultipleOfWarp();
+  TransformPropagatorWithCheck propagator(tv1);
+  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
+  tv0->axis(-1)->parallelize(ParallelType::TIDx);
+  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({16, 17, 128}, options);
+
+  auto at_output = input1.sum({1, 2}, true).add(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+  testValidate(
+      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTrivialWarpReduction_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeConcreteTensor({17, 18, 128, 1});
+
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {1, 2, 3});
+  auto tv2 = broadcast(tv1, {false, true, true, true});
+  auto tv3 = add(tv2, tv0);
+
+  fusion->addOutput(tv3);
+
+  // Schedule a persistent kernel
+  auto tv0_cache = tv0->cacheAfter();
+  tv1->merge(1);
+  tv1->split(1, 8, false);
+
+  auto tv1_rf = tv1->rFactor({1});
+  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
+  tv1_rf->axis(-2)->parallelize(ParallelType::TIDx);
+  tv1->axis(-2)->parallelize(ParallelType::TIDx);
+  tv1->axis(-2)->padToMultipleOfWarp();
+  TransformPropagatorWithCheck propagator(tv1_rf);
+  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
+  tv0->axis(-2)->parallelize(ParallelType::TIDx);
+  tv0_cache->axis(-2)->parallelize(ParallelType::TIDx);
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+  tv3->axis(-2)->parallelize(ParallelType::TIDx);
+
+  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({17, 18, 128, 1}, options);
+
+  auto at_output = input1.sum({1, 2, 3}, true).add(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+  testValidate(
+      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMultipleDimBinding_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv_add = makeSymbolicTensor(2);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv_add);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = add(tv2, tv0);
+  auto tv4 = add(tv0, tv_add);
+
+  fusion->addOutput(tv3);
+  fusion->addOutput(tv4);
+
+  // Schedule a persistent kernel
+  auto tv0_cache = tv0->cacheAfter();
+  tv1->split(1, 8, false);
+  auto tv1_rf = tv1->rFactor({1});
+  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
+  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1_rf->axis(-1)->padToMultipleOfWarp(32);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->padToMultipleOfWarp(32);
+  TransformPropagatorWithCheck propagator(tv1_rf);
+  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
+  tv0->axis(-1)->parallelize(ParallelType::TIDx);
+  tv0->axis(-1)->padToMultipleOfWarp(32);
+  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
+  tv0_cache->axis(-1)->padToMultipleOfWarp(32);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->padToMultipleOfWarp(32);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->padToMultipleOfWarp(32);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->axis(-1)->padToMultipleOfWarp(64);
+
+  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({16, 128}, options);
+  at::Tensor input2 = at::randn({16, 128}, options);
+
+  auto at_output = input1.sum({1}, true).add(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1, input2});
+  auto outputs = fe.runFusion({input1, input2});
+  testValidate(
+      fusion.get(),
+      outputs,
+      {input1, input2},
+      {at_output, input1 + input2},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionPadNoWarpReduce_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = add(tv2, tv0);
+
+  fusion->addOutput(tv3);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->padToMultipleOfWarp();
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv1->axis(0)->parallelize(ParallelType::TIDy);
+  tv2->axis(0)->parallelize(ParallelType::TIDy);
+  tv3->axis(0)->parallelize(ParallelType::TIDy);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({16, 31}, options);
+
+  auto at_output = input1.sum({1}, true).add(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+  testValidate(
+      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {1});
+  fusion->addOutput(tv2);
+
+  tv2->split(1, 8);
+  auto tv2_rf = tv2->rFactor({-1});
+  tv2_rf->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2_rf->axis(-1)->padToMultipleOfWarp();
+
+  TransformPropagatorWithCheck propagator(tv2_rf);
+  MaxRootDomainInfoSpanningTree(tv2_rf).traverse(&propagator);
+
+  tv0->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDy);
+  tv0->computeAt(tv2, 2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({16, 31}, options);
+
+  auto at_output = (input1 + 1).sum({1});
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+  testValidate(
+      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = add(tv2, tv0);
+
+  fusion->addOutput(tv3);
+
+  // Schedule a persistent kernel
+  auto tv0_cache = tv0->cacheAfter();
+  tv1->split(1, 8, false);
+  tv1->split(0, 4);
+  auto tv1_rf = tv1->rFactor({2});
+
+  tv1_rf->axis(0)->parallelize(ParallelType::BIDx);
+  tv1_rf->axis(1)->parallelize(ParallelType::Unroll);
+  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->padToMultipleOfWarp();
+  tv1->axis(1)->parallelize(ParallelType::Unroll);
+  TransformPropagatorWithCheck propagator(tv1_rf);
+  MaxRootDomainInfoSpanningTree(tv1_rf).traverse(&propagator);
+  tv0->axis(-1)->parallelize(ParallelType::TIDx);
+  tv0->axis(1)->parallelize(ParallelType::Unroll);
+  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
+  tv0_cache->axis(1)->parallelize(ParallelType::Unroll);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::Unroll);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::Unroll);
+
+  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({16, 128}, options);
+
+  auto at_output = input1.sum({1}, true).add(input1);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+  testValidate(
+      fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__);
+}
+
+// Repro of issue #1579
+TEST_F(NVFuserTest, FusionWarpReducePredication_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> shape1 = {1024};
+  std::vector<int64_t> shape2 = {50};
+
+  auto tv0 = makeConcreteTensor(shape1);
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {0});
+  fusion.addOutput(tv1);
+
+  auto tv2 = makeConcreteTensor(shape2);
+  fusion.addInput(tv2);
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  auto tv4 = sum(tv3, {0});
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+
+  // Just to fill the smem buffer by a thread block of 1024 threads
+  // with some values
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+
+  // Make the tv4_rf reduction a warp reduction to trigger the
+  // bug. Since the smem buffer is filled with some values due to the
+  // reduction of tv1, those values would be used by predicated-out
+  // threads.
+  tv4->split(-1, 10);
+  auto tv4_rf = tv4->rFactor({-1});
+  tv4_rf->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4_rf->axis(-1)->padToMultipleOfWarp();
+
+  tv4_rf->computeAt(tv4, 1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape1, options);
+  auto t2 = at::randn(shape2, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t2});
+  auto cg_outputs = fe.runFusion({t0, t2});
+
+  auto t1 = t0.sum({0});
+  auto t4 = (t2 + 1).sum({0}) + 1;
+
+  testValidate(&fusion, cg_outputs, {t0, t2}, {t1, t4}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSegfaultReduction_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int batch = 2;
+  int c = 1;
+  int h = 1;
+  int w = 1;
+  int numDims = 4;
+
+  auto input = makeConcreteTensor({-1, 1, 1, 1});
+  fusion.addInput(input);
+  auto bcast_bias = makeConcreteTensor({-1, 1, 1, 1});
+  fusion.addInput(bcast_bias);
+
+  std::vector<int64_t> at_sum_axes;
+  std::vector<int> outer_reduction_axes;
+  std::vector<bool> outer_broadcast_mask(numDims, false);
+  Val* N = IrBuilder::create<Double>(1);
+  for (const auto axis : c10::irange(numDims)) {
+    if (axis != 1) {
+      outer_reduction_axes.push_back(axis);
+      at_sum_axes.push_back(axis);
+      outer_broadcast_mask[axis] = true;
+      N = mul(N, input->domain()->domain()[axis]->extent());
+    }
+  }
+
+  auto output0 = mul(input, bcast_bias);
+  fusion.addOutput(output0);
+  auto output1 = sum(output0, outer_reduction_axes);
+  fusion.addOutput(output1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input0 = at::randn({batch, c, h, w}, options);
+  at::Tensor input1 = at::randn({batch, c, h, w}, options);
+
+  auto at_output0 = input0.mul(input1);
+  auto at_output1 = at_output0.sum(at_sum_axes);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  std::vector<IValue> inputs = {input0, input1};
+  auto outputs = fec.runFusionWithInputs(inputs);
+
+  testValidate(
+      &fusion, outputs, inputs, {at_output0, at_output1}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionPredicateElimination1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(3));
+
+  fusion.addOutput(tv3);
+
+  tv3->split(0, 32);
+  tv0->computeAt(tv3, 1);
+
+  tv2->axis(1)->parallelize(ParallelType::Unswitch);
+
+  {
+    GpuLower gpulw(&fusion);
+    TORCH_CHECK(!PredicatedChecker::isPredicated(tv2, gpulw));
+  }
+
+  tv2->axis(1)->parallelize(ParallelType::Serial);
+  tv2->split(1, 5);
+
+  {
+    GpuLower gpulw(&fusion);
+    TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
+  }
+}
+
+// Repro of issue #1571
+TEST_F(NVFuserTest, FusionPredicateElimination2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> shape({10, 11});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {1});
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv3);
+
+  tv1->split(1, 4);
+  tv1->split(0, 4);
+  tv2->split(1, 4);
+  tv2->split(0, 4);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = (t0 + 1).sum({1}) + 1;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionPredicateElimination3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  auto tv3 = tv0->cacheAfter();
+
+  tv1->split(0, 10);
+  tv1->split(0, 33);
+  TransformPropagatorWithCheck propagator(tv1);
+  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
+
+  auto tv4 = tv1->rFactor({-1});
+  auto tv5 = tv1->rFactor({-1});
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(1)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(tv4);
+
+  GpuLower gpulw(&fusion);
+
+  // The fusion has three reductions: one within each thread, one
+  // within each block, and another with the whole grid. All of them
+  // should not need to be predicated as they use the same init value
+  // and same reduction op.
+  TORCH_CHECK(!PredicatedChecker::isPredicated(tv4, gpulw));
+  TORCH_CHECK(!PredicatedChecker::isPredicated(tv5, gpulw));
+  TORCH_CHECK(!PredicatedChecker::isPredicated(tv1, gpulw));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  for (auto size : {1, 2, 999, 1001, 1234, 10000}) {
+    auto t0 = at::randn({size}, options);
+
+    FusionExecutor fe;
+    fe.compileFusion(&fusion, {t0});
+    auto cg_outputs = fe.runFusion({t0});
+
+    auto ref = sum(t0) + 1;
+    testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionPredicateElimination4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+
+  auto tv2 = sum(tv1, {0});
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv3);
+
+  auto tv4 = max(tv1, {0});
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+
+  tv1->split(1, 7);
+  tv1->split(0, 11);
+  tv1->reorder({{1, 2}, {2, 1}});
+  TransformPropagatorWithCheck propagator(tv1);
+  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
+
+  tv1->axis(0)->parallelize(ParallelType::TIDy);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(tv1);
+
+  GpuLower gpulw(&fusion);
+
+  // tv2 uses the same op and init with tv1, so tv2 should be fine
+  // without a predicate. However, tv4, while it uses the tv1 as its
+  // input, the reduction op and init value is different from those of
+  // tv1, so tv4 needs to be predicated.
+  TORCH_CHECK(!PredicatedChecker::isPredicated(tv2, gpulw));
+  TORCH_CHECK(PredicatedChecker::isPredicated(tv4, gpulw));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  std::vector<int64_t> sizes = {1, 2, 33, 34, 64, 99};
+  for (auto s0 : sizes) {
+    for (auto s1 : sizes) {
+      auto t0 = at::randn({s0, s1}, options);
+
+      FusionExecutor fe;
+      fe.compileFusion(&fusion, {t0});
+      auto cg_outputs = fe.runFusion({t0});
+
+      auto t1 = t0.sum({1});
+      auto t3 = t1.sum({0}) + 1;
+      auto t5 = std::get<0>(t1.max(0)) + 1;
+
+      testValidate(&fusion, cg_outputs, {t0}, {t3, t5}, __LINE__, __FILE__);
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionPredicateElimination5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tvs2 = Welford(tv1, {0});
+  auto tv3 = set(tvs2.avg);
+  fusion.addOutput(tv3);
+
+  tvs2.avg->split(0, 4);
+  TransformPropagatorWithCheck propagator(tvs2.avg);
+  MaxRootDomainInfoSpanningTree(tvs2.avg).traverse(&propagator);
+  auto avg_rf = ir_utils::rfactorHelper(tvs2.avg, {1});
+
+  avg_rf->axis(0)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(avg_rf);
+
+  GpuLower gpulw(&fusion);
+
+  // The first per-thread welford needs to be predicated as the N
+  // input is different from its init value. The second welford op
+  // does not need a predicate.
+  TORCH_CHECK(PredicatedChecker::isPredicated(avg_rf, gpulw));
+  TORCH_CHECK(!PredicatedChecker::isPredicated(tvs2.avg, gpulw));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  std::vector<int64_t> sizes = {1, 2, 33, 34, 64, 99};
+  for (auto s0 : sizes) {
+    auto t0 = at::randn({s0}, options);
+
+    FusionExecutor fe;
+    fe.compileFusion(&fusion, {t0});
+    auto cg_outputs = fe.runFusion({t0});
+
+    auto ref = t0.mean({0});
+
+    testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionPredicateElimination6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 3});
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv4);
+
+  tv4->split(1, 5);
+  TransformPropagatorWithCheck propagator(tv4);
+  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
+
+  tv4->reorder({{0, 1}, {1, 0}});
+  tv3->computeAt(tv4, 1);
+
+  GpuLower gpulw(&fusion);
+
+  // The expression for tv2 is a local-to-local expression. It
+  // satisfies all the requirements of predicate elimination, except
+  // for the on on split root domains. As the second root axis of tv2
+  // is split, its index exceeds its extent (i.e., 3 in this case)
+  // without its predicate.
+  TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
+
+  // Unlike tv2, tv3 is computed at tv4, so the second root axis does
+  // have a zero domain. Its index should look like "i * 5 + j", where
+  // i comes from the first root domain and j comes from the split
+  // inner domain.
+  TORCH_CHECK(!PredicatedChecker::isPredicated(tv3, gpulw));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({2, 3}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 4;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionPredicateElimination7_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv3);
+
+  tv3->split(-1, 5);
+  tv3->split(-1, 4);
+  tv3->split(-1, 3);
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+  tv0->computeAt(tv3, 1);
+
+  // The last split of tv2 is a non-divisible split, and omitting it
+  // is invalid.
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({123}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 3;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionForceFp16Simple_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(2);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  // Group 1
+  auto tv2 = sum(tv0, {1});
+  auto tv3 = broadcast(tv2, {false, true});
+
+  // Group 2
+  auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast
+  auto tv5 = castOp(DataType::Half, tv4);
+
+  fusion->addOutput(tv5);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+
+  std::vector<int64_t> shape{15, 16};
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn(shape, options);
+  auto in1 = at::randn(shape, options);
+  fec.runFusionWithInputs({in0, in1});
+
+  // Check the segmented edge is fp16
+  auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
+  for (auto edge : segmented_fusion->edges()) {
+    auto edge_tv = edge->val->as<TensorView>();
+    TORCH_CHECK(edge_tv->getDataType() == DataType::Half);
+  }
+}
+
+TEST_F(NVFuserTest, FusionForceBf16Simple_CUDA) {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+  // requires ampere+ GPU
+  if (!deviceMajorMinorCheck(8)) {
+    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
+    return;
+  }
+
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(2);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  // Group 1
+  auto tv2 = sum(tv0, {1});
+  auto tv3 = broadcast(tv2, {false, true});
+
+  // Group 2
+  auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast
+  auto tv5 = castOp(DataType::BFloat16, tv4);
+
+  fusion->addOutput(tv5);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+
+  std::vector<int64_t> shape{15, 16};
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn(shape, options);
+  auto in1 = at::randn(shape, options);
+  fec.runFusionWithInputs({in0, in1});
+
+  // Check the segmented edge is bf16
+  auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
+  for (auto edge : segmented_fusion->edges()) {
+    auto edge_tv = edge->val->as<TensorView>();
+    TORCH_CHECK(edge_tv->getDataType() == DataType::BFloat16);
+  }
+#else
+  GTEST_SKIP() << "requires cuda 11.0 or newer toolkit";
+#endif
+}
+
+TEST_F(NVFuserTest, FusionForceFp16NotAllCast_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeSymbolicTensor(3);
+  auto tv1 = makeSymbolicTensor(3);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  // Group 1
+  auto tv3 = sum(tv0, {1});
+  auto tv4 = broadcast(tv3, {false, true, false});
+  auto tv5 = sum(tv0, {1});
+
+  // Group 2
+  auto tv6 = add(tv4, tv1); // edge tv4, expect cast
+  auto tv7 = castOp(DataType::Half, tv6);
+
+  // Group 3
+  auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast
+
+  fusion->addOutput(tv7);
+  fusion->addOutput(tv8);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+
+  std::vector<int64_t> shape{16, 16, 16};
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn(shape, options);
+  auto in1 = at::randn(shape, options);
+  fec.runFusionWithInputs({in0, in1});
+
+  auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
+  auto complete_fusion = segmented_fusion->completeFusion();
+
+  // Check that the edge that wasn't fp16 is the producer of the
+  //  reduction op, i.e. tv8 = sum(tv5,{1});.
+  for (auto edge : segmented_fusion->edges()) {
+    auto edge_tv = edge->val->as<TensorView>();
+    if (edge_tv->getDataType() == DataType::Float) {
+      auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
+      TORCH_CHECK(consumer->isA<ReductionOp>());
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionForceBf16NotAllCast_CUDA) {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+  // requires ampere+ GPU
+  if (!deviceMajorMinorCheck(8)) {
+    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
+    return;
+  }
+
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeSymbolicTensor(3);
+  auto tv1 = makeSymbolicTensor(3);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  // Group 1
+  auto tv3 = sum(tv0, {1});
+  auto tv4 = broadcast(tv3, {false, true, false});
+  auto tv5 = sum(tv0, {1});
+
+  // Group 2
+  auto tv6 = add(tv4, tv1); // edge tv4, expect cast
+  auto tv7 = castOp(DataType::BFloat16, tv6);
+
+  // Group 3
+  auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast
+
+  fusion->addOutput(tv7);
+  fusion->addOutput(tv8);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+
+  std::vector<int64_t> shape{16, 16, 16};
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn(shape, options);
+  auto in1 = at::randn(shape, options);
+  fec.runFusionWithInputs({in0, in1});
+
+  auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments();
+  auto complete_fusion = segmented_fusion->completeFusion();
+
+  // Check that the edge that wasn't fp16 is the producer of the
+  //  reduction op, i.e. tv8 = sum(tv5,{1});.
+  for (auto edge : segmented_fusion->edges()) {
+    auto edge_tv = edge->val->as<TensorView>();
+    if (edge_tv->getDataType() == DataType::Float) {
+      auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin());
+      TORCH_CHECK(consumer->isA<ReductionOp>());
+    }
+  }
+#else
+  GTEST_SKIP() << "requires cuda 11.0 or newer toolkit";
+#endif
+}
+
+TEST_F(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeConcreteTensor({2, 2});
+  auto tv1 = makeConcreteTensor({2, 2, 2});
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
+  auto tv3 = broadcast(tv2, {false, false, true});
+  auto tv4 = add(tv3, tv1);
+  auto tv5 = mul(tv4, IrBuilder::create<Double>(3));
+  fusion->addOutput(tv5);
+
+  // t4 cannot inner re-use t2, because there's a broadcast
+  //  between them.
+  tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort);
+  tv3->computeAt(tv5, 2, ComputeAtMode::BestEffort);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn({2, 2}, options);
+  auto in1 = at::randn({2, 2, 2}, options);
+
+  auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3;
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {in0, in1});
+  auto outputs = fe.runFusion({in0, in1});
+
+  testValidate(fusion, outputs, {in0, in1}, {at_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBufferReuseStressTest_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeConcreteTensor({2, 2});
+  auto tv1 = makeConcreteTensor({2, 2, 2});
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
+  auto tv3 = mul(tv0, IrBuilder::create<Double>(3));
+  auto tv4 = mul(tv2, tv3);
+  // Broadcast buffer can be reused through outer sharing
+  auto tv5 = broadcast(tv4, {true, false, false});
+  auto tv6 = mul(tv5, IrBuilder::create<Double>(5));
+  auto tv7 = mul(tv6, tv1);
+  auto tv8 = mul(tv7, IrBuilder::create<Double>(7));
+  // tv9 shouldn't alias to avoid buffer over-subscription
+  auto tv9 = broadcast(tv4, {true, false, false});
+  auto tv10 = mul(tv9, IrBuilder::create<Double>(9));
+  auto tv11 = add(tv5, tv9);
+  fusion->addOutput(tv7);
+  fusion->addOutput(tv11);
+
+  tv0->computeAt(tv5, 1, ComputeAtMode::BestEffort);
+  tv0->computeAt(tv9, 1, ComputeAtMode::BestEffort);
+
+  tv5->computeAt(tv7, 1, ComputeAtMode::BestEffort);
+  tv5->computeAt(tv11, 1, ComputeAtMode::BestEffort);
+  tv9->computeAt(tv11, 1, ComputeAtMode::BestEffort);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn({2, 2}, options);
+  auto in1 = at::randn({2, 2, 2}, options);
+  auto t2 = in0 * 2;
+  auto t3 = in0 * 3;
+  auto t4 = t2 * t3;
+  auto t5 = t4.unsqueeze(0);
+  auto t6 = t5 * 5;
+  auto t7 = t6 * in1;
+  auto t8 = t7 * 7;
+  auto t9 = t4.unsqueeze(0);
+  auto t10 = t9 * 9;
+  auto t11 = t5 + t9;
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {in0, in1});
+
+  auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3;
+  auto outputs = fe.runFusion({in0, in1});
+
+  testValidate(fusion, outputs, {in0, in1}, {t7, t11}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeConcreteTensor({256, 512});
+
+  fusion->addInput(tv0);
+
+  auto tv1 = mul(tv0, IrBuilder::create<Double>(2));
+  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
+  auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
+  auto tv4 = mul(tv3, IrBuilder::create<Double>(2));
+  auto tv5 = mul(tv4, IrBuilder::create<Double>(2));
+  auto tv6 = mul(tv5, IrBuilder::create<Double>(2));
+
+  fusion->addOutput(tv6);
+
+  tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort);
+  tv6->axis(0)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn({256, 512}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {in0});
+  auto outputs = fe.runFusion({in0});
+
+  auto at_out = in0.mul(2).mul(2).mul(2).mul(2).mul(2).mul(2);
+
+  testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBufferReuseNo2hop_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeConcreteTensor({2, 2});
+  auto tv1 = makeConcreteTensor({2, 2, 2});
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
+  auto tv3 = broadcast(tv2, {false, false, true});
+  auto tv4 = add(tv3, tv1); // T4 to be inner aliased first, and
+                            //  shouldn't outer alias on top
+  auto tv5 = mul(tv4, IrBuilder::create<Double>(3));
+  auto tv6 = mul(tv5, IrBuilder::create<Double>(3));
+  fusion->addOutput(tv6);
+
+  tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort);
+  tv4->computeAt(tv6, 2, ComputeAtMode::BestEffort);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn({2, 2}, options);
+  auto in1 = at::randn({2, 2, 2}, options);
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {in0, in1});
+  auto outputs = fe.runFusion({in0, in1});
+
+  auto at_out = (in0.mul(2.0).unsqueeze(2) + in1).mul(3.0).mul(3.0);
+
+  testValidate(fusion, outputs, {in0, in1}, {at_out}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeConcreteTensor({3, 3, 3});
+
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
+  auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
+
+  fusion->addOutput(tv3);
+
+  // In this case tv1 "reuses" allocation of tv2
+  //  due to the switched allocation order
+  tv1->computeAt(tv2, 1, ComputeAtMode::BestEffort);
+
+  tv0->axis(0)->parallelize(ParallelType::TIDx);
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv3->axis(0)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn({3, 3, 3}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {in0});
+  auto outputs = fe.runFusion({in0});
+
+  auto at_out = in0.sum(1).mul(2).mul(2);
+
+  testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeConcreteTensor({16, 16});
+
+  fusion->addInput(tv0);
+
+  auto tv1 = mul(tv0, IrBuilder::create<Double>(3));
+  auto tv2 = mul(tv1, IrBuilder::create<Double>(2));
+  auto tv3 = mul(tv2, IrBuilder::create<Double>(2));
+  // tv1 used till here, cannot be reused by tv2 or tv3
+  auto tv4 = mul(tv3, tv1);
+
+  fusion->addOutput(tv4);
+
+  tv0->computeAt(tv4, 1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn({16, 16}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {in0});
+  auto cg_outputs = fe.runFusion({in0});
+
+  auto at_t0 = in0 * 3.0;
+  auto at_out = at_t0 * 2.0 * 2.0 * at_t0;
+
+  testValidate(fusion, cg_outputs, {in0}, {at_out}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  auto tv0 = makeConcreteTensor({2, 2});
+  auto tv1 = makeConcreteTensor({2, 2, 2});
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  auto tv2 = mul(tv0, IrBuilder::create<Double>(2));
+  auto tv3 = mul(tv0, IrBuilder::create<Double>(3));
+  auto tv4 = mul(tv2, tv3);
+  auto tv5 = broadcast(tv4, {false, false, true});
+  auto tv6 = mul(tv5, tv1);
+  auto tv7 = mul(tv6, IrBuilder::create<Double>(7));
+  fusion->addOutput(tv7);
+
+  // tv6 shouldn't re-use t2 or t3 because of
+  //  the broadcast in between
+  tv0->computeAt(tv4, 1, ComputeAtMode::BestEffort);
+  tv4->computeAt(tv7, 2, ComputeAtMode::BestEffort);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto in0 = at::randn({2, 2}, options);
+  auto in1 = at::randn({2, 2, 2}, options);
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {in0, in1});
+  auto outputs = fe.runFusion({in0, in1});
+
+  auto t2 = in0 * 2;
+  auto t3 = in0 * 3;
+  auto t4 = t2 * t3;
+  auto t5 = t4.unsqueeze(2);
+  auto t6 = t5 * in1;
+  auto t7 = t6 * 7;
+  testValidate(fusion, outputs, {in0, in1}, {t7}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue970_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int nelm = 10;
+
+  // tv3 = tv0 + sum(tv0)
+  auto tv0 = makeConcreteTensor({nelm, nelm});
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = add(tv2, tv0);
+  fusion.addOutput(tv3);
+
+  tv1->split(1, 4);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({nelm, nelm}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto outputs = fe.runFusion({t0});
+
+  auto ref = sum(t0, {1}).unsqueeze(-1).expand({nelm, nelm}) + t0;
+
+  testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Reproducer of #1016
+TEST_F(NVFuserTest, FusionIssue1016_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
+
+  fusion.addOutput(tv2);
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  tv2->split(-1, 8);
+
+  int numel_x = 10;
+  int numel_y = 11;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
+  std::vector<IValue> inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, inputs);
+  auto outputs = fe.runFusion(inputs);
+
+  auto ref = t0 + 1 + 2;
+
+  testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Reproducer of #1021
+TEST_F(NVFuserTest, FusionIssue1021_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = broadcast(tv1, {false, true});
+  fusion.addOutput(tv2);
+
+  auto tv3 = tv2->cacheBefore();
+
+  tv2->split(0, 2);
+
+  tv1->computeAt(tv2, 1);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10}, options);
+  std::vector<IValue> inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, inputs);
+  auto outputs = fe.runFusion(inputs);
+
+  auto ref = (t0 + 1).unsqueeze(-1);
+
+  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
+}
+
+// Reproducer of issue #1053
+TEST_F(NVFuserTest, FusionNonUniqueThreadDim_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+  auto tv1 = sum(tv0, {0});
+  fusion->addOutput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  fusion->addOutput(tv2);
+
+  tv1->split(0, 8);
+  auto tv1_rf = tv1->rFactor({-1});
+
+  tv1_rf->computeAt(tv1, 1);
+
+  tv1_rf->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({32}, options);
+
+  auto at_tv1 = (input1).sum({0});
+  auto at_tv2 = input1 + 1;
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+  testValidate(
+      fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionParallelDimensionMap1_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  fusion->addOutput(tv1);
+  fusion->addOutput(tv2);
+
+  tv1->split(0, 8, false);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+  tv2->split(0, 8, false);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+
+  // The extents of tv1 and tv2 axes are equal even though their
+  // actual values are not statically known
+  GpuLower gpulw(fusion.get());
+  const auto& pdmap = gpulw.parallelDimensionMap();
+  for (const auto i : c10::irange(tv1->domain()->domain().size())) {
+    auto dom1 = tv1->domain()->domain()[i];
+    auto dom2 = tv2->domain()->domain()[i];
+    TORCH_INTERNAL_ASSERT(pdmap.equalDim(dom1->extent(), dom2->extent()));
+  }
+
+  TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
+  TORCH_CHECK(
+      pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
+      pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({32}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+
+  testValidate(
+      fusion.get(),
+      outputs,
+      {input1},
+      {input1 + 1, input1 + 1},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionParallelDimensionMap2_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion->addInput(tv1);
+  auto tv2 = broadcast(tv0, {false, true});
+  auto tv3 = add(tv1, tv2);
+  fusion->addOutput(tv3);
+
+  tv3->split(-1, 8, false);
+  tv2->computeAt(tv3, -1);
+
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  GpuLower gpulw(fusion.get());
+  const auto& pdmap = gpulw.parallelDimensionMap();
+  TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
+  TORCH_CHECK(
+      pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
+      pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({11}, options);
+  at::Tensor input2 = at::randn({11, 13}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1, input2});
+  auto outputs = fe.runFusion({input1, input2});
+
+  auto ref = input1.unsqueeze(-1) + input2;
+
+  testValidate(
+      fusion.get(), outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
+}
+
+// Mix symbolic and concrete tensors
+TEST_F(NVFuserTest, FusionParallelDimensionMap3_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  fusion->addOutput(tv2);
+  auto tv3 = add(tv0, IrBuilder::create<Double>(1));
+  fusion->addOutput(tv3);
+
+  tv2->split(0, 10);
+  tv3->split(0, 20);
+
+  auto tv4 = add(tv0, IrBuilder::create<Double>(1));
+  fusion->addOutput(tv4);
+  auto tv5 = add(tv0, IrBuilder::create<Double>(1));
+  fusion->addOutput(tv5);
+
+  // Not mapped but equal extent
+  tv4->split(0, 10);
+  tv5->split(0, 10);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv4->axis(-1)->parallelize(ParallelType::TIDy);
+  tv5->axis(-1)->parallelize(ParallelType::TIDy);
+
+  GpuLower gpulw(fusion.get());
+  const auto& pdmap = gpulw.parallelDimensionMap();
+  TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
+  TORCH_CHECK(
+      pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
+      pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
+  TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
+  TORCH_CHECK(
+      pdmap.get(ParallelType::TIDy)->isConst() &&
+      pdmap.get(ParallelType::TIDy)->as<Int>()->value().value() == 10);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({13}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {input1});
+  auto outputs = fe.runFusion({input1});
+
+  testValidate(
+      fusion.get(),
+      outputs,
+      {input1},
+      {input1 + 1, input1 + 1, input1 + 1, input1 + 1},
+      __LINE__,
+      __FILE__);
+}
+
+// Parallelizing merged broadcast domains
+TEST_F(NVFuserTest, FusionParallelDimensionMap4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv3 = broadcast(tv2, {true, false});
+  auto tv4 = add(tv3, tv1);
+  fusion.addOutput(tv4);
+
+  tv4->split(1, 4);
+  tv4->reorder({{1, 2}, {2, 1}});
+  tv4->merge(0);
+  tv0->computeAt(tv4, 1);
+  tv1->computeAt(tv4, 1);
+
+  // TIDx is mapped to tv4.axis(0) as well as tv2.axis(0), so it's not
+  // exact.
+  tv4->axis(0)->parallelize(ParallelType::TIDx);
+
+  tv2->setMemoryType(MemoryType::Shared);
+  tv3->setMemoryType(MemoryType::Shared);
+
+  GpuLower gpulw(&fusion);
+  const auto& pdmap = gpulw.parallelDimensionMap();
+  TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx));
+  TORCH_CHECK(
+      pdmap.get(ParallelType::TIDx)->isA<NamedScalar>() &&
+      pdmap.get(ParallelType::TIDx)->as<NamedScalar>()->name() == "blockDim.x");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({13}, options);
+  at::Tensor input2 = at::randn({15, 13}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1, input2});
+  auto outputs = fe.runFusion({input1, input2});
+
+  auto ref = (input1 + 1).unsqueeze(0) + input2;
+
+  testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionParallelDimensionMap5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  auto tv3 = broadcast(tv0, {false, true});
+  auto tv4 = add(tv3, tv1);
+  fusion.addOutput(tv4);
+
+  tv4->split(1, 4);
+  tv0->computeAt(tv4, -1);
+  tv1->computeAt(tv4, -1);
+
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->axis(-2)->parallelize(ParallelType::TIDy);
+  tv3->axis(-2)->parallelize(ParallelType::TIDy);
+
+  GpuLower gpulw(&fusion);
+  const auto& pdmap = gpulw.parallelDimensionMap();
+  TORCH_CHECK(pdmap.isExact(ParallelType::TIDx));
+  TORCH_CHECK(pdmap.isExact(ParallelType::TIDy));
+  TORCH_CHECK(
+      pdmap.get(ParallelType::TIDx)->isConst() &&
+      pdmap.get(ParallelType::TIDx)->as<Int>()->value().value() == 4);
+  TORCH_CHECK(
+      pdmap.get(ParallelType::TIDy)->isA<NamedScalar>() &&
+      pdmap.get(ParallelType::TIDy)->as<NamedScalar>()->name() == "blockDim.y");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({13}, options);
+  at::Tensor input2 = at::randn({13, 15}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1, input2});
+  auto outputs = fe.runFusion({input1, input2});
+
+  auto ref = (input1).unsqueeze(-1) + input2;
+
+  testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  auto t0 = makeSymbolicTensor(3, DataType::Float);
+  auto t1 = makeSymbolicTensor(3, DataType::Half);
+  auto t3 = makeSymbolicTensor(3, DataType::Half);
+  auto t5 = makeSymbolicTensor(3, DataType::Half);
+  auto t7 = makeSymbolicTensor(1, DataType::Half);
+  auto t11 = makeSymbolicTensor(3, DataType::Half);
+  auto t13 = makeSymbolicTensor(3, DataType::Half);
+  auto t15 = makeSymbolicTensor(3, DataType::Half);
+  auto t17 = makeSymbolicTensor(3, DataType::Half);
+  auto d56 = IrBuilder::create<Double>();
+
+  fusion.addInput(t0);
+  fusion.addInput(t1);
+  fusion.addInput(t3);
+  fusion.addInput(t5);
+  fusion.addInput(t7);
+  fusion.addInput(t11);
+  fusion.addInput(t13);
+  fusion.addInput(t15);
+  fusion.addInput(t17);
+  fusion.addInput(d56);
+
+  auto t2 = castOp(DataType::Float, t1);
+  auto t4 = castOp(DataType::Float, t3);
+  auto t22 = sub(t2, t4);
+  auto t6 = castOp(DataType::Float, t5);
+  auto t23 = mul(t22, t6);
+  auto t16 = castOp(DataType::Float, t15);
+  auto t18 = castOp(DataType::Float, t17);
+  auto t19 = add(t16, t18);
+  auto t14 = castOp(DataType::Float, t13);
+  auto t20 = add(t19, t14);
+  auto t12 = castOp(DataType::Float, t11);
+  auto t21 = add(t20, t12);
+  auto t8 = castOp(DataType::Float, t7);
+  auto t24 = broadcast(t8, {true, true, false});
+  auto t25 = mul(t21, t24);
+  auto t27 = sum(t25, {2});
+  auto t28 = broadcast(t27, {false, false, true});
+  auto t29 = mul(t25, t23);
+  auto t30 = sum(t29, {2});
+  auto t31 = broadcast(t30, {false, false, true});
+  auto d59 =
+      mul(t1->getRootDomain()[2]->extent(), IrBuilder::create<Double>(1));
+  auto t26 = mul(d59, t25);
+  auto txx = mul(t26, IrBuilder::create<Double>(1));
+  auto t33 = sub(txx, t28);
+  auto d70 = unaryOp(UnaryOpType::Reciprocal, d59);
+  auto t35 = mul(d70, t6);
+  auto t39 = sum(t21, {0, 1});
+  auto t47 = castOp(DataType::Half, t39);
+  auto t37 = mul(t21, t23);
+  auto t38 = sum(t37, {0, 1});
+  auto t46 = castOp(DataType::Half, t38);
+  auto t32 = mul(t23, t31);
+  auto t34 = sub(t33, t32);
+  auto t36 = mul(t35, t34);
+  auto t45 = castOp(DataType::Half, t36);
+  auto t40 = mul(t36, t0);
+  auto t41 = mul(t40, d56);
+  auto t44 = castOp(DataType::Half, t41);
+  auto t42 = sum(t41, {0, 1});
+  auto t43 = castOp(DataType::Half, t42);
+
+  fusion.addOutput(t43);
+  fusion.addOutput(t44);
+  fusion.addOutput(t45);
+  fusion.addOutput(t46);
+  fusion.addOutput(t47);
+
+  auto options_half = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  auto options_float =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_t0 = at::randn({128, 64, 1024}, options_float);
+  at::Tensor at_t1 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t3 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t5 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t7 = at::randn({1024}, options_half);
+  at::Tensor at_t11 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t13 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t15 = at::randn({128, 64, 1024}, options_half);
+  at::Tensor at_t17 = at::randn({128, 64, 1024}, options_half);
+  double at_d56 = 1.1111;
+
+  std::vector<at::Tensor> aten_inputs = {
+      at_t0, at_t1, at_t3, at_t5, at_t7, at_t11, at_t13, at_t15, at_t17};
+
+  c10::IValue val = at_d56;
+
+  KernelArgumentHolder args(KernelIndexMode::INT32);
+  args.setDeviceIndex(0);
+  args.push(aten_inputs);
+  args.push(val);
+
+  for (auto _ : c10::irange(5)) {
+    auto segmented_fusion =
+        SegmentCandidateFinder::segment(fusion_ptr.get(), args);
+  }
+}
+
+TEST_F(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv4);
+
+  auto tv5 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv6);
+
+  // Case 1: local memory tensor computed serially and used by
+  // parallel threads
+  tv2->split(-1, 4);
+  tv1->computeAt(tv2, -2);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  // Case 2: shared memory tensor computed serially and used by BID
+  tv4->split(-1, 4);
+  tv3->computeAt(tv4, -2);
+  tv4->axis(-1)->parallelize(ParallelType::BIDx);
+  tv3->setMemoryType(MemoryType::Shared);
+
+  // Case 3: shared memory tensor computed by TID and used by BID
+  tv6->split(-1, 4);
+  tv5->computeAt(tv6, -2);
+  tv6->axis(-1)->parallelize(ParallelType::BIDx);
+  tv5->axis(-1)->parallelize(ParallelType::TIDx);
+  tv5->setMemoryType(MemoryType::Shared);
+
+  const int nx = 11;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({nx}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref = t0 + 2;
+
+  testValidate(
+      &fusion, outputs, aten_inputs, {ref, ref, ref}, __LINE__, __FILE__);
+}
+
+// Repro of issue #1105
+TEST_F(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv2->setMemoryType(MemoryType::Shared);
+
+  tv3->split(0, 4);
+  tv0->computeAt(tv3, 1);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDy);
+  tv3->axis(-1)->parallelize(ParallelType::TIDz);
+
+  // Make sure a WAR sync is inserted at the end of the outer loop
+  GpuLower gpulw(&fusion);
+  for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) {
+    if (auto loop = dynamic_cast<kir::ForLoop*>(kir_node)) {
+      const auto& body = loop->body().exprs();
+      TORCH_CHECK(!body.empty());
+      auto last_expr = dynamic_cast<kir::BlockSync*>(body.back());
+      TORCH_CHECK(last_expr != nullptr, "Invalid expr found");
+      TORCH_CHECK(last_expr->isWarHazardSync(), "Not a sync for WAR hazard");
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({17}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref1 = t0 + 3;
+
+  testValidate(&fusion, outputs, aten_inputs, {ref1}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue1099_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  auto tv3 = makeSymbolicTensor(1);
+  fusion.addInput(tv3);
+
+  // Just to make TIDx/y/z non-exact
+  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv6);
+
+  tv2->split(0, 4);
+  tv0->computeAt(tv2, 1);
+
+  tv0->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->axis(-1)->parallelize(ParallelType::TIDy);
+  tv2->axis(-1)->parallelize(ParallelType::TIDz);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  tv4->split(0, 5);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv4->setMemoryType(MemoryType::Shared);
+  tv5->split(0, 6);
+  tv5->axis(-1)->parallelize(ParallelType::TIDy);
+  tv5->setMemoryType(MemoryType::Shared);
+  tv6->split(0, 7);
+  tv6->axis(-1)->parallelize(ParallelType::TIDz);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({17}, options);
+  at::Tensor t3 = at::randn({19}, options);
+  std::vector<IValue> aten_inputs = {t0, t3};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref_t2 = t0 + 2;
+  auto ref_t3 = t3 + 3;
+
+  testValidate(
+      &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__);
+}
+
+// Repro of issue #1080
+TEST_F(NVFuserTest, FusionUnswitchPredicate_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  tv2->split(0, 4);
+  tv0->computeAt(tv2, 2);
+
+  tv2->split(-1, 8);
+  tv1->split(-1, 8);
+
+  tv2->axis(1)->parallelize(ParallelType::Unswitch);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-2)->parallelize(ParallelType::TIDy);
+
+  // swap TIDx and TIDy
+  tv1->axis(-1)->parallelize(ParallelType::TIDy);
+  tv1->axis(-2)->parallelize(ParallelType::TIDx);
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  const int nx = 4;
+  const int ny = 10;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({nx, ny}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref = t0 + 2;
+
+  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue1189_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({16, 16});
+  auto tv1 = makeConcreteTensor({16, 16});
+
+  auto tv0b = broadcast(tv0, {false, false, true});
+  auto tv1b = broadcast(tv1, {false, false, true});
+
+  fusion.addInput(tv0b);
+  fusion.addInput(tv1b);
+
+  auto tv2 = add(tv0b, tv1b);
+  auto tv3 = sum(tv2, {1});
+  fusion.addOutput(tv3);
+
+  auto parallelize = [](auto tv) {
+    tv->axis(0)->parallelize(ParallelType::TIDx);
+    tv->axis(1)->parallelize(ParallelType::BIDx);
+    tv->axis(2)->parallelize(ParallelType::BIDy);
+  };
+
+  parallelize(tv0b);
+  parallelize(tv1b);
+  parallelize(tv2);
+  parallelize(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({16, 16, 1}, options);
+  at::Tensor t1 = at::randn({16, 16, 1}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto outputs = fe.runFusion({t0, t1});
+
+  auto ref = (t0 + t1).sum({1});
+
+  testValidate(&fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue1052_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(1);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv3);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  scheduler_utils::parallelizeAllLike(tv2, {tv0});
+  scheduler_utils::parallelizeAllLike(tv3, {tv1});
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10}, options);
+  at::Tensor t1 = at::randn({100}, options);
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref_t2 = t0 + 1;
+  auto ref_t3 = t1 + 1;
+
+  testValidate(
+      &fusion, outputs, aten_inputs, {ref_t2, ref_t3}, __LINE__, __FILE__);
+}
+
+// Repro of issue #1115
+TEST_F(NVFuserTest, FusionPointwiseBroadcast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> input_shape{3, 17, 80};
+  std::vector<int64_t> output_shape{3, 17, 1, 80};
+
+  TensorView* x = makeSymbolicTensor(input_shape.size());
+  TensorView* bias = makeSymbolicTensor(input_shape.size());
+  fusion.addInput(x);
+  fusion.addInput(bias);
+
+  auto x_add_bias = add(x, bias);
+  auto x_bcast = broadcast(x_add_bias, {false, false, true, false});
+  auto y = gelu(x_bcast);
+  fusion.addOutput(y);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_x = at::randn(input_shape, options);
+  at::Tensor at_bias = at::randn(input_shape, options);
+  std::vector<IValue> aten_inputs = {at_x, at_bias};
+
+  schedulePointwise(&fusion, aten_inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto at_x_add_bias = at_x + at_bias;
+  auto at_x_view = at::native::view(at_x_add_bias, output_shape);
+  auto aten_y = at::gelu(at_x_view);
+
+  testValidate(&fusion, outputs, aten_inputs, {aten_y}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionPointwiseVectorize_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int size = 1024 * 64;
+
+  TensorView* x = makeContigTensor(1);
+  fusion.addInput(x);
+  auto y = sin(x);
+  fusion.addOutput(y);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  // PyTorch's CUDA caching allocator should always return aligned pointer for
+  // freshly allocated tensor
+  at::Tensor at_x = at::randn({size}, options);
+
+  schedulePointwise(&fusion, {at_x});
+
+  for (auto x_consumer : ir_utils::consumerTvsOf(x)) {
+    bool found_vec_in_input = false;
+    for (auto id : x_consumer->domain()->domain()) {
+      if (isParallelTypeVectorize(id->getParallelType())) {
+        found_vec_in_input = true;
+        break;
+      }
+    }
+    TORCH_CHECK(found_vec_in_input, "Expect input to be vectorized");
+  }
+
+  for (auto id : y->domain()->domain()) {
+    if (isParallelTypeVectorize(id->getParallelType())) {
+      return;
+    }
+  }
+  TORCH_CHECK(false, "Expect output to be vectorized");
+}
+
+TEST_F(NVFuserTest, FusionSmemAliasSerial_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv3);
+
+  // Just set the dimension of TIDx
+  auto tv4 = makeSymbolicTensor(1);
+  fusion.addInput(tv4);
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv2->setMemoryType(MemoryType::Shared);
+
+  tv5->axis(0)->parallelize(ParallelType::TIDx);
+
+  // tv1 and tv2 are on shared memory and are not parallelized with
+  // TIDx. They should be predicated as they are redundant and can
+  // interfere with smem aliasing (issue #1100).
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10}, options);
+  at::Tensor t4 = at::randn({1024}, options);
+  std::vector<IValue> aten_inputs = {t0, t4};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref1 = t0 + 3;
+  auto ref2 = t4 + 1;
+
+  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv1);
+
+  auto tv2 = makeSymbolicTensor(1);
+  fusion.addInput(tv2);
+  auto tv3 = sum(tv2, {0});
+  fusion.addOutput(tv3);
+
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({17}, options);
+  at::Tensor t2 = at::randn({19}, options);
+  std::vector<IValue> aten_inputs = {t0, t2};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref1 = t0 + 1;
+  auto ref2 = sum(t2);
+
+  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv1);
+
+  auto tv2 = makeSymbolicTensor(1);
+  fusion.addInput(tv2);
+  auto tv3 = Welford(tv2, {0}).avg;
+  fusion.addOutput(tv3);
+
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({17}, options);
+  at::Tensor t2 = at::randn({19}, options);
+  std::vector<IValue> aten_inputs = {t0, t2};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref1 = t0 + 1;
+  auto ref2 = mean(t2, {0});
+
+  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0, 1});
+  fusion.addOutput(tv1);
+
+  auto tv2 = makeSymbolicTensor(3);
+  fusion.addInput(tv2);
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv3);
+
+  auto tv4 = makeSymbolicTensor(3);
+  fusion.addInput(tv4);
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(0)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDy);
+  tv3->axis(2)->parallelize(ParallelType::TIDz);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+  tv5->axis(1)->parallelize(ParallelType::BIDy);
+  tv5->axis(2)->parallelize(ParallelType::BIDz);
+
+  // TODO: This needs a fix for issue #1102.
+  // Also, need to allow predicated grid reductions.
+#if 0
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({2, 3}, options);
+  at::Tensor t2 = at::randn({5, 6, 7}, options);
+  at::Tensor t4 = at::randn({8, 9, 10}, options);
+  std::vector<IValue> aten_inputs = {t0, t2, t4};
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref1 = t0.sum(at::IntArrayRef{0, 1});
+  auto ref2 = t2 + 1;
+  auto ref3 = t4 + 1;
+
+  testValidate(
+      &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__);
+#endif
+}
+
+TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tvs = Welford(tv0, {0, 1});
+  fusion.addOutput(tvs.avg);
+
+  auto tv2 = makeSymbolicTensor(3);
+  fusion.addInput(tv2);
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv3);
+
+  auto tv4 = makeSymbolicTensor(3);
+  fusion.addInput(tv4);
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+
+  tvs.avg->axis(0)->parallelize(ParallelType::BIDx);
+  tvs.avg->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(0)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDy);
+  tv3->axis(2)->parallelize(ParallelType::TIDz);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+  tv5->axis(1)->parallelize(ParallelType::BIDy);
+  tv5->axis(2)->parallelize(ParallelType::BIDz);
+
+  // TODO: needs a fix for issue #1102
+  // Also, need to allow predicated grid reductions.
+#if 0
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({2, 3}, options);
+  at::Tensor t2 = at::randn({5, 6, 7}, options);
+  at::Tensor t4 = at::randn({8, 9, 10}, options);
+  std::vector<IValue> aten_inputs = {t0, t2, t4};
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref1 = t0.mean(at::IntArrayRef{0, 1});
+  auto ref2 = t2 + 1;
+  auto ref3 = t4 + 1;
+
+  testValidate(
+      &fusion, outputs, aten_inputs, {ref1, ref2, ref3}, __LINE__, __FILE__);
+#endif
+}
+
+// Repro of issue #1102
+TEST_F(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  // Just to make TIDx/y/z non-exact
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv3);
+
+  auto tv4 = makeSymbolicTensor(1);
+  fusion.addInput(tv4);
+
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  auto tv6 = add(tv5, IrBuilder::create<Double>(1));
+  auto tv7 = add(tv6, IrBuilder::create<Double>(1));
+  auto tv8 = add(tv7, IrBuilder::create<Double>(1));
+  auto tv9 = sum(tv8, {0});
+  fusion.addOutput(tv9);
+
+  tv1->split(0, 5);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv1->setMemoryType(MemoryType::Shared);
+  tv2->split(0, 6);
+  tv2->axis(-1)->parallelize(ParallelType::TIDy);
+  tv2->setMemoryType(MemoryType::Shared);
+  tv3->split(0, 7);
+  tv3->axis(-1)->parallelize(ParallelType::TIDz);
+
+  tv9->split(0, 4);
+  tv4->computeAt(tv9, 1);
+
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv5->axis(-1)->parallelize(ParallelType::TIDy);
+  tv6->axis(-1)->parallelize(ParallelType::TIDz);
+  tv7->axis(-1)->parallelize(ParallelType::TIDz);
+  tv8->axis(-1)->parallelize(ParallelType::TIDz);
+  tv9->axis(-1)->parallelize(ParallelType::TIDz);
+  tv9->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv5->setMemoryType(MemoryType::Shared);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({17}, options);
+  at::Tensor t4 = at::randn({19}, options);
+  std::vector<IValue> aten_inputs = {t0, t4};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref1 = t0 + 3;
+  auto ref2 = sum(t4 + 4);
+
+  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
+}
+
+// Repro of #1102 and #1129
+TEST_F(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) {
+  if (!deviceMajorMinorCheck(7)) {
+    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
+    return;
+  }
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(1);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+
+  // Just to make TIDx/y/z non-exact
+  auto tvx = add(tv1, IrBuilder::create<Double>(1));
+  auto tvy = add(tvx, IrBuilder::create<Double>(1));
+  auto tvz = add(tvy, IrBuilder::create<Double>(1));
+  fusion.addOutput(tvz);
+
+  tv5->split(0, 4);
+  tv0->computeAt(tv5, 1);
+
+  tv0->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDy);
+  tv3->axis(-1)->parallelize(ParallelType::TIDz);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv5->axis(-1)->parallelize(ParallelType::TIDy);
+  tv5->axis(0)->parallelize(ParallelType::Unswitch);
+
+  tvx->split(0, 5);
+  tvx->axis(-1)->parallelize(ParallelType::TIDx);
+  tvy->split(0, 6);
+  tvy->axis(-1)->parallelize(ParallelType::TIDy);
+  tvz->split(0, 7);
+  tvz->axis(-1)->parallelize(ParallelType::TIDz);
+
+  for (auto tv : {tv2, tv3, tv4, tvx, tvy}) {
+    tv->setMemoryType(MemoryType::Shared);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({17}, options);
+  at::Tensor t1 = at::randn({19}, options);
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref1 = t0 + 4;
+  auto ref2 = t1 + 3;
+
+  testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__);
+}
+
+// Repro of issue #1136
+TEST_F(NVFuserTest, FusionFloatPow_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(4));
+  // To check if pow(tv0, 2) is replaced with tv0 * tv0
+  auto tv2 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(2));
+  // To check if pow(tv0, 2.0) is replaced with tv0 * tv0
+  auto tv3 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Double>(2));
+  auto tv4 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Int>(3));
+  auto tv5 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create<Double>(3));
+  auto s = binaryOp(
+      BinaryOpType::Pow,
+      IrBuilder::create<Double>(3),
+      IrBuilder::create<Double>(3));
+  auto tv6 = add(tv0, s);
+
+  fusion.addOutput(tv1);
+  fusion.addOutput(tv2);
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv4);
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv6);
+
+  tv1->split(0, 32);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+
+  TransformPropagatorWithCheck propagator(tv1);
+  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
+  scheduler_utils::parallelizeAllLike(tv1, {tv2, tv3, tv4, tv5, tv6});
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({1000}, options);
+  // Negative inputs cause nan in Fuesr as use_fast_math is enabled
+  t0 = abs(t0);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto p4 = at::pow(t0, 4);
+  auto p2 = at::pow(t0, 2);
+  auto p3 = at::pow(t0, 3);
+  auto t6 = t0 + std::pow(3, 3);
+
+  testValidate(
+      &fusion,
+      outputs,
+      aten_inputs,
+      {p4, p2, p2, p3, p3, t6},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue1127_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const int numel = 4;
+
+  auto tv0 = makeConcreteTensor({numel});
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = broadcast(tv1, {true});
+
+  auto tv3 = makeConcreteTensor({numel, numel});
+  fusion.addInput(tv3);
+
+  auto tv4 = sum(tv3, {1});
+
+  auto tv5 = add(tv2, tv4);
+  fusion.addOutput(tv5);
+
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv4->axis(1)->parallelize(ParallelType::TIDx);
+  tv5->axis(0)->parallelize(ParallelType::TIDx);
+
+  // Lowering should fail since tv5 is predicated and paralellized with TIDx.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fusion.printKernel());
+}
+
+TEST_F(NVFuserTest, FusionChannelsLastParser_CUDA) {
+  // This test may not pass if using a custom block sync as there may
+  // be additional calls. Skip the test as it's not specifically
+  // relevant with block synchronizatin.
+  if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) {
+    return;
+  }
+  auto g = std::make_shared<Graph>();
+  const auto graph0_string = R"IR(
+  graph(%0 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]),
+        %1 : Half(8, 4, 10, 16, strides=[640, 160, 16, 1])):
+    %o.1 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::mul(%0, %1) # sum_dyn.py:5:6
+    %3 : Half(8, 4, 10, 16, strides=[640, 1, 64, 4]) = aten::relu(%o.1) # sum_dyn.py:6:9
+    return (%3))IR";
+  parseIR(graph0_string, g.get());
+
+  // strides are not yet supported in the irparser.
+  {
+    auto val = g->block()->inputs()[0];
+    val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
+        {8, 4, 10, 16}, {640, 1, 64, 4}));
+  }
+
+  {
+    auto val = g->block()->inputs()[1];
+    val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
+        {8, 4, 10, 16}, {640, 160, 16, 1}));
+  }
+
+  for (auto node : g->block()->nodes()) {
+    for (auto val : node->outputs()) {
+      if (val->isCompleteTensor())
+        val->setType(val->type()->castRaw<TensorType>()->withSizesStrides(
+            {8, 4, 10, 16}, {640, 1, 64, 4}));
+    }
+  }
+
+  auto fusion = parseJitIR(g);
+  FusionGuard fg(fusion.get());
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor input0 =
+      at::randn({2, 2, 2, 16}, options).clone(c10::MemoryFormat::ChannelsLast);
+  at::Tensor input1 = at::randn({2, 2, 2, 16}, options);
+  auto lparams = schedulePointwise(fusion.get(), {input0, input1});
+
+  // CONSIDER:
+  // 1. this can be moved to a dedicated "golden" file
+  // 2. use a fuzzy compare (ignore non-significant whitespaces for example)
+  const std::string expected_kernel = R"(
+__global__ void CUDAGeneratedKernel(Tensor<__half, 4> T0, Tensor<__half, 4> T2, Tensor<__half, 4> T7) {
+  int64_t i165;
+  i165 = (((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x);
+  if ((i165 < (T0.size[0] * (T0.size[1] * (T0.size[2] * T0.size[3]))))) {
+    __half T9[1];
+    T9[0] = 0;
+    T9[0]
+       = T2[((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) / (T0.size[1] * (T0.size[2] * T0.size[3]))) * ((T0.size[2] * T0.size[1]) * T0.size[3])) + ((((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) % T0.size[3]) * (T0.size[2] * T0.size[1])) + (((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) / (T0.size[2] * T0.size[3])) * T0.size[2]) + (((((((nvfuser_index_t)blockIdx.x) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) / T0.size[3])];
+    __half T8[1];
+    T8[0] = 0;
+    T8[0]
+       = T0[i165];
+    float T3[1];
+    T3[0]
+       = __half2float(T9[0]);
+    float T4[1];
+    T4[0]
+       = T3[0];
+    float T1[1];
+    T1[0]
+       = __half2float(T8[0]);
+    float T5[1];
+    T5[0]
+      = T1[0]
+      * T4[0];
+    float T6[1];
+    T6[0]
+       = relu(T5[0]);
+    __half T10[1];
+    T10[0]
+       = __float2half(T6[0]);
+    T7[i165]
+       = T10[0];
+  }
+}
+)";
+
+  const std::string actual_kernel =
+      "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel());
+
+  if (expected_kernel.size() != actual_kernel.size() ||
+      expected_kernel.compare(actual_kernel) != 0) {
+    std::cerr
+        << " Codegen mismatch, codegen possibly changed, or is incorrect. "
+        << " \n ========= EXPECTED ========= \n"
+        << expected_kernel << "\n========= ACTUAL ========== \n"
+        << actual_kernel << "\n=================" << std::endl;
+    auto it = std::mismatch(
+        expected_kernel.begin(),
+        expected_kernel.end(),
+        actual_kernel.begin(),
+        actual_kernel.end());
+    std::string actual_mismatched_snippet(it.second, actual_kernel.end());
+    actual_mismatched_snippet = actual_mismatched_snippet.substr(0, 10);
+    std::string expected_mismatched_snippet(it.first, expected_kernel.end());
+    expected_mismatched_snippet = expected_mismatched_snippet.substr(0, 10);
+    std::cerr << "First mismatch found at: " << actual_mismatched_snippet
+              << ", expected: " << expected_mismatched_snippet << std::endl;
+    TORCH_CHECK(false);
+  }
+
+  // TODO: runFusion hits assertion. I'm probably doing something wrong here.
+  // FusionExecutor fe;
+  // fe.compileFusion(fusion.get());
+  // auto outputs = fe.runFusion({input0, input1}, lparams);
+  // at::Tensor output_ref = (input0 * input1).relu();
+  // TORCH_CHECK(output_ref.equal(outputs[0]));
+}
+
+TEST_F(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({10, 1024});
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv3);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->computeAt(tv3, -1);
+  tv3->axis(0)->parallelize(ParallelType::Unswitch);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10, 1024}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref = sum(t0, {1}) + 2;
+
+  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionNonContigOutputs_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv1);
+
+  tv1->setContiguity(false);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_input = at::randn({10}, options);
+  at::Tensor at_output = at::empty_strided({10}, {2}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {at_input});
+  auto returned_outputs = fe.runFusion({at_input}, {at_output});
+
+  // Returned outputs should only contain one tensor that is the same
+  // as the output tensor given to runFusion
+  TORCH_CHECK(returned_outputs.size() == 1);
+  TORCH_CHECK(returned_outputs[0].is_same(at_output));
+  TORCH_CHECK(!returned_outputs[0].is_contiguous());
+
+  auto at_ref = at_input + 1;
+
+  testValidate(&fusion, {at_output}, {at_input}, {at_ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTestWarpSoftMax_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Setup softmax fusion
+  auto input = makeContigTensor(2);
+  fusion.addInput(input);
+  auto output = softmax(input, 1);
+  fusion.addOutput(output);
+
+  // Setup runtime input
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_input = at::randn({8, 16 * 197}, options);
+  std::vector<c10::IValue> aten_inputs({aten_input});
+
+  // Schedule through magic scheduler
+  SchedulerRuntimeInfo runtime_info(&fusion, aten_inputs, true);
+  TORCH_CHECK(SchedulerEntry::canSchedule(
+      ScheduleHeuristic::Persistent, &fusion, runtime_info));
+  auto scheduler = SchedulerEntry::makeEntry(
+      ScheduleHeuristic::Persistent, &fusion, runtime_info);
+  scheduler->schedule(&fusion);
+
+  // Modify the schedule to use warp reduction
+  auto used_vals = fusion.usedMathVals();
+  for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
+    for (IterDomain* id : tv->domain()->domain()) {
+      if (id->getParallelType() == ParallelType::TIDx) {
+        id->padToMultipleOfWarp();
+      }
+    }
+  }
+
+  // Test result
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+  auto ref_output = at::_softmax(aten_input, 1, false);
+  testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue1133_CUDA) {
+  if (!deviceMajorMinorCheck(7)) {
+    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
+    return;
+  }
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {1});
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv3);
+
+  tv0->computeAt(tv3, 1);
+
+  const int split_factor = 32;
+
+  tv2->split(-1, split_factor);
+  tv1->computeAt(tv2, -2);
+
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(0)->parallelize(ParallelType::Unswitch);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv2->setMemoryType(MemoryType::Shared);
+
+  // Both tv1 and tv2 should be allocated at the top-level scope
+  GpuLower gpulw(&fusion);
+  bool tv1_validated = false;
+  bool tv2_validated = false;
+  for (const auto& kir_node : gpulw.kernel()->topLevelExprs()) {
+    if (auto alloc = dynamic_cast<kir::Allocate*>(kir_node)) {
+      auto size = alloc->size();
+      if (!(alloc->buffer()->name() == 1 || alloc->buffer()->name() == 2)) {
+        // There should be no allocation other than those for tv1 and tv2
+        TORCH_CHECK(false, "Invalid allocation detected");
+      }
+      TORCH_CHECK(size->isA<Int>(), "Invalid allocation size");
+      TORCH_CHECK(size->as<Int>()->isConst(), "Allocation not constant");
+      auto size_int = size->as<Int>()->value().value();
+      if (alloc->buffer()->name() == 1) {
+        TORCH_CHECK(
+            size_int == split_factor,
+            "Invalid allocation size: ",
+            size->as<Int>()->value().value());
+        tv1_validated = true;
+      } else {
+        TORCH_CHECK(
+            size_int == 1,
+            "Invalid allocation size: ",
+            size->as<Int>()->value().value());
+        tv2_validated = true;
+      }
+    }
+  }
+
+  TORCH_CHECK(tv1_validated, "Failed to validate tv1 allocation");
+  TORCH_CHECK(tv2_validated, "Failed to validate tv2 allocation");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({99, 101}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref = (t0 + 1).sum({1}) + 1;
+
+  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionRfactorContigIDs_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  fusion.addOutput(tv1);
+
+  tv1->split(1, 32);
+
+  auto tv2 = tv1->rFactor({1});
+
+  // This merged domain is not contiguous.
+  tv2->merge(0, 2);
+
+  tv2->setMemoryType(MemoryType::Shared);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({99, 101}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto ref = t0.sum({1});
+
+  testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionPersistentBufferCalculation1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = sum(tv1, {1});
+  auto tv3 = broadcast(tv2, {false, true});
+  auto tv4 = set(tv1);
+  auto tv5 = add(tv3, tv4);
+  fusion.addOutput(tv5);
+
+  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
+
+  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
+    return std::find(vec.begin(), vec.end(), tv) != vec.end();
+  };
+
+  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
+                            std::vector<TensorView*>& buffer_vec,
+                            TensorView* tv) {
+    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
+    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
+  };
+
+  auto& buffers = persistent_buffer_info.persistent_buffers;
+  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
+  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
+  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;
+
+  TORCH_INTERNAL_ASSERT(buffers.size() == 1);
+  TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1);
+  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
+  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);
+
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1));
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1));
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));
+
+  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
+  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end())
+
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor aten_t0 = at::randn({99, 101}, options);
+
+  // Schedule through magic scheduler
+  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
+  auto persistent_buffer_size =
+      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
+
+  TORCH_INTERNAL_ASSERT(
+      persistent_buffer_size.persistent_buffer_size ==
+      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
+  TORCH_INTERNAL_ASSERT(
+      persistent_buffer_size.projected_persistent_buffer_size ==
+      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
+}
+
+TEST_F(NVFuserTest, FusionPersistentBufferCalculation2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2, DataType::Half);
+  fusion.addInput(tv0);
+
+  auto tv1 = castOp(DataType::Float, tv0);
+  auto tv2 = sum(tv1, {1});
+  auto tv3 = broadcast(tv2, {false, true});
+  auto tv4 = set(tv1);
+  auto tv5 = add(tv3, tv4);
+  auto tv6 = castOp(DataType::Half, tv5);
+  fusion.addOutput(tv6);
+
+  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
+
+  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
+    return std::find(vec.begin(), vec.end(), tv) != vec.end();
+  };
+
+  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
+                            std::vector<TensorView*>& buffer_vec,
+                            TensorView* tv) {
+    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
+    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
+  };
+
+  auto& buffers = persistent_buffer_info.persistent_buffers;
+  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
+  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
+  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;
+
+  TORCH_INTERNAL_ASSERT(buffers.size() == 1);
+  TORCH_INTERNAL_ASSERT(resolution.size() == 1 && resolution[0].size() == 1);
+  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
+  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);
+
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(buffers, tv1));
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable, tv1));
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));
+
+  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
+  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end())
+
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv5));
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor aten_t0 = at::randn({99, 101}, options);
+
+  // Schedule through magic scheduler
+  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
+  auto persistent_buffer_size =
+      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
+
+  TORCH_INTERNAL_ASSERT(
+      persistent_buffer_size.persistent_buffer_size ==
+      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Float)));
+  TORCH_INTERNAL_ASSERT(
+      persistent_buffer_size.projected_persistent_buffer_size ==
+      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Half)));
+}
+
+TEST_F(NVFuserTest, FusionPersistentBufferCalculation3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2, DataType::Half);
+  fusion.addInput(tv0);
+
+  auto tv1 = castOp(DataType::Float, tv0);
+  auto tv2 = set(tv1);
+  auto tv3 = sum(tv2, {1});
+  auto tv4 = broadcast(tv3, {false, true});
+
+  auto tv5 = makeSymbolicTensor(2, DataType::Half);
+  fusion.addInput(tv5);
+
+  auto tv6 = castOp(DataType::Float, tv5);
+
+  auto tv7 = add(tv6, tv4);
+  auto tv8 = set(tv1);
+  auto tv9 = add(tv7, tv8);
+  auto tv10 = sum(tv9, {1});
+  auto tv11 = broadcast(tv10, {false, true});
+  auto tv12 = set(tv7);
+  auto tv13 = add(tv12, tv11);
+
+  fusion.addOutput(tv13);
+
+  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
+
+  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
+    return std::find(vec.begin(), vec.end(), tv) != vec.end();
+  };
+
+  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
+                            std::vector<TensorView*>& buffer_vec,
+                            TensorView* tv) {
+    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
+    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
+  };
+
+  auto& buffers = persistent_buffer_info.persistent_buffers;
+  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
+  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
+  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;
+
+  TORCH_INTERNAL_ASSERT(buffers.size() == 2);
+  TORCH_INTERNAL_ASSERT(
+      resolution.size() == 2 && resolution[0].size() == 1 &&
+      resolution[1].size() == 1);
+  TORCH_INTERNAL_ASSERT(projectable.size() == 1);
+  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);
+
+  TORCH_INTERNAL_ASSERT(
+      isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv7));
+  TORCH_INTERNAL_ASSERT(
+      isTvWithinVec(projectable, tv1) && !isTvWithinVec(projectable, tv7));
+
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));
+
+  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
+  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end())
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv9));
+
+  auto tv7_resolution_it = tvEntryInVecVec(resolution, buffers, tv7);
+  TORCH_INTERNAL_ASSERT(tv7_resolution_it != resolution.end())
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv7_resolution_it, tv13));
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor aten_t0 = at::randn({99, 101}, options);
+  at::Tensor aten_t5 = at::randn({99, 101}, options);
+
+  // Schedule through magic scheduler
+  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0, aten_t5}, true);
+  auto persistent_buffer_size =
+      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
+
+  TORCH_INTERNAL_ASSERT(
+      persistent_buffer_size.persistent_buffer_size ==
+      static_cast<int64_t>(
+          aten_t0.size(1) * dataTypeSize(DataType::Float) * 2));
+  TORCH_INTERNAL_ASSERT(
+      persistent_buffer_size.projected_persistent_buffer_size ==
+      static_cast<int64_t>(
+          aten_t0.size(1) *
+          (dataTypeSize(DataType::Half) + dataTypeSize(DataType::Float))));
+}
+
+TEST_F(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2, DataType::Half);
+  fusion.addInput(tv0);
+
+  auto tv1 = castOp(DataType::Float, tv0);
+  auto tv2 = set(tv1);
+  auto tv3 = sum(tv2, {1});
+  auto tv4 = broadcast(tv3, {false, true});
+  auto tv5 = set(tv1);
+  auto tv6 = add(tv4, tv5);
+  auto tv7 = set(tv2);
+  auto tv8 = add(tv7, tv6);
+  auto tv9 = castOp(DataType::Half, tv8);
+
+  fusion.addOutput(tv9);
+
+  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
+
+  auto isTvWithinVec = [](std::vector<TensorView*>& vec, TensorView* tv) {
+    return std::find(vec.begin(), vec.end(), tv) != vec.end();
+  };
+
+  auto tvEntryInVecVec = [](std::vector<std::vector<TensorView*>>& vec_o_vec,
+                            std::vector<TensorView*>& buffer_vec,
+                            TensorView* tv) {
+    auto buffer_it = std::find(buffer_vec.begin(), buffer_vec.end(), tv);
+    return vec_o_vec.begin() + std::distance(buffer_vec.begin(), buffer_it);
+  };
+
+  auto& buffers = persistent_buffer_info.persistent_buffers;
+  auto& resolution = persistent_buffer_info.persistent_buffer_resolution_points;
+  auto& projectable = persistent_buffer_info.projectable_persistent_buffers;
+  auto& projectable_inputs = persistent_buffer_info.projectable_buffer_inputs;
+
+  TORCH_INTERNAL_ASSERT(buffers.size() == 2);
+  TORCH_INTERNAL_ASSERT(
+      resolution.size() == 2 && resolution[0].size() == 1 &&
+      resolution[1].size() == 1);
+
+  TORCH_INTERNAL_ASSERT(projectable.size() == 2);
+  TORCH_INTERNAL_ASSERT(projectable_inputs.size() == 1);
+
+  TORCH_INTERNAL_ASSERT(
+      isTvWithinVec(buffers, tv1) && isTvWithinVec(buffers, tv2));
+  TORCH_INTERNAL_ASSERT(
+      isTvWithinVec(projectable, tv1) && isTvWithinVec(projectable, tv2));
+
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(projectable_inputs, tv0));
+
+  auto tv1_resolution_it = tvEntryInVecVec(resolution, buffers, tv1);
+  TORCH_INTERNAL_ASSERT(tv1_resolution_it != resolution.end())
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv1_resolution_it, tv6));
+
+  auto tv2_resolution_it = tvEntryInVecVec(resolution, buffers, tv2);
+  TORCH_INTERNAL_ASSERT(tv2_resolution_it != resolution.end())
+  TORCH_INTERNAL_ASSERT(isTvWithinVec(*tv2_resolution_it, tv8));
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor aten_t0 = at::randn({99, 101}, options);
+
+  // Schedule through magic scheduler
+  SchedulerRuntimeInfo runtime_info(&fusion, {aten_t0}, true);
+  auto persistent_buffer_size =
+      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
+
+  TORCH_INTERNAL_ASSERT(
+      persistent_buffer_size.persistent_buffer_size ==
+      static_cast<int64_t>(
+          aten_t0.size(1) * dataTypeSize(DataType::Float) * 2));
+
+  TORCH_INTERNAL_ASSERT(
+      persistent_buffer_size.projected_persistent_buffer_size ==
+      static_cast<int64_t>(aten_t0.size(1) * dataTypeSize(DataType::Half)));
+}
+
+TEST_F(NVFuserTest, FusionPersistentBufferProjection_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2, DataType::Half);
+  fusion.addInput(tv0);
+
+  auto tv1 = castOp(DataType::Float, tv0);
+  auto tv2 = set(tv1);
+  auto tv3 = sum(tv2, {1});
+  auto tv4 = broadcast(tv3, {false, true});
+  auto tv5 = set(tv1);
+  auto tv6 = add(tv4, tv5);
+  auto tv7 = set(tv2);
+  auto tv8 = add(tv7, tv6);
+  auto tv9 = castOp(DataType::Half, tv8);
+
+  fusion.addOutput(tv9);
+
+  reduction_scheduler_utils::projectPersistentBuffers(&fusion);
+
+  auto tv5_producers = ir_utils::producerTvsOf(tv5);
+  auto tv7_producers = ir_utils::producerTvsOf(tv7);
+
+  // Projection should have broken these dependencies
+
+  TORCH_INTERNAL_ASSERT(
+      std::find(tv5_producers.begin(), tv5_producers.end(), tv1) ==
+      tv5_producers.end());
+  TORCH_INTERNAL_ASSERT(
+      std::find(tv7_producers.begin(), tv7_producers.end(), tv2) ==
+      tv7_producers.end());
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor aten_t0 = at::randn({99, 101}, options);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  auto cg_outputs = fec.runFusionWithInputs({aten_t0});
+
+  auto aten_t1 = aten_t0.to(c10::kDouble);
+  auto aten_t3 = aten_t1.sum({1});
+  auto aten_t4 = aten_t3.unsqueeze(1);
+  auto aten_t7 = aten_t4.add(aten_t1).add(aten_t1);
+
+  testValidate(&fusion, cg_outputs, {aten_t0}, {aten_t7}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue1223_CUDA) {
+  if (!deviceMajorMinorCheck(7)) {
+    GTEST_SKIP() << "skipping tests on pre-Volta GPUs";
+    return;
+  }
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {0, 1});
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv0, IrBuilder::create<Double>(0));
+  fusion.addOutput(tv3);
+
+  tv2->split(0, 4);
+  tv2->split(1, 1, false);
+  tv2->split(-1, 4);
+
+  tv2->axis(1)->parallelize(ParallelType::Unswitch);
+  tv2->axis(-3)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDy);
+
+  tv1->computeAt(tv2, -1);
+
+  // Make TIDx and TIDy non-exact
+  tv3->split(0, 32);
+  tv3->split(-1, 32);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+  tv3->axis(3)->parallelize(ParallelType::TIDy);
+
+  // The second axis of both tv1 and tv2 are fully unswitched, so they
+  // don't need to predicate the parallel type usage of TIDy, whereas
+  // the first axis is only partially unswitched, i.e., part of its
+  // split output domains is outside the unswitched axis, so the first
+  // axis, which uses TIDx, needs to predicate the parallel
+  // dimension. Previously, as reported in issue #1223, unswitched
+  // expressions didn't predicate parallel dimensions. It should be
+  // fixed by PR #1222.
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_t0 = at::ones({11, 10}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {at_t0});
+  auto cg_outputs = fe.runFusion({at_t0});
+
+  auto at_t1 = (at_t0 + 1).sum();
+
+  testValidate(
+      &fusion, cg_outputs, {at_t0}, {at_t1, at_t0}, __LINE__, __FILE__);
+}
+
+// See #1247 and #1250
+TEST_F(NVFuserTest, FusionRfactorPredication1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = min(tv1, {0});
+
+  fusion.addOutput(tv2);
+
+  // Make TIDx non-exact
+  auto tv3 = makeContigTensor(1);
+  fusion.addInput(tv3);
+
+  auto tv4 = add(tv3, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv4);
+
+  tv2->split(0, 4);
+  auto tv5 = tv2->rFactor({1});
+
+  tv0->computeAt(tv2, 1);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+
+  tv4->axis(0)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_t0 = at::randn({9}, options);
+  at_t0 = at::abs(at_t0);
+  at::Tensor at_t3 = at::randn({128}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {at_t0, at_t3});
+  auto cg_outputs = fe.runFusion({at_t0, at_t3});
+
+  auto at_t2 = (at_t0 + 1).min();
+  auto at_t4 = at_t3 + 1;
+
+  testValidate(
+      &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionRfactorPredication2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = min(tv0, {0});
+  fusion.addOutput(tv1);
+
+  // Make TIDx non-exact
+  auto tv2 = makeContigTensor(1);
+  fusion.addInput(tv2);
+
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv3);
+
+  tv1->split(0, 4);
+  auto tv4 = tv1->rFactor({0});
+
+  tv1->split(0, 3);
+
+  // tv0->computeAt(tv1, 3);
+  tv4->reorder({{0, 1}});
+  tv4->split(0, 3);
+  tv4->setMemoryType(MemoryType::Shared);
+
+  // tv0: [I]
+  // tv4: [4/3, 3, I/4]
+  // tv1: [4/3, 3]
+
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(tv1, {tv4});
+
+  tv3->axis(0)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor at_t0 = at::randn({9}, options);
+  at_t0 = at::abs(at_t0);
+  at::Tensor at_t3 = at::randn({128}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {at_t0, at_t3});
+  auto cg_outputs = fe.runFusion({at_t0, at_t3});
+
+  auto at_t2 = std::get<0>(at_t0.min(0));
+  auto at_t4 = at_t3 + 1;
+
+  testValidate(
+      &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionRfactorIndirectRoot_CUDA) {
+  // https://github.com/csarofeen/pytorch/issues/1692
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(3);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {1, 2});
+  fusion.addOutput(tv1);
+
+  tv1->split(2, 4);
+  tv1->split(1, 3);
+  tv1->merge(2, 3);
+  auto rf = tv1->rFactor({-1});
+
+  tv1->split(0, 256);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+  rf->computeAt(tv1, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+
+  auto at_in = at::randn({6, 6, 6}, options);
+  auto at_out = at_in.sum({1, 2});
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {at_in});
+  auto cg_outputs = fe.runFusion({at_in});
+
+  testValidate(&fusion, cg_outputs, {at_in}, {at_out}, __LINE__, __FILE__);
+}
+
+} // namespace jit
+} // namespace torch
+#endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
new file mode 100644
index 0000000000000..a8fb439af14f5
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
@@ -0,0 +1,6538 @@
+#if defined(USE_CUDA)
+#include <gmock/gmock-matchers.h>
+#include <gtest/gtest.h>
+
+#include <torch/csrc/jit/codegen/cuda/arith.h>
+#include <torch/csrc/jit/codegen/cuda/codegen.h>
+#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
+#include <torch/csrc/jit/codegen/cuda/executor.h>
+#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
+#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
+#include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
+#include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
+#include <torch/csrc/jit/codegen/cuda/interface.h>
+#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
+#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
+#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
+#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
+#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
+#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
+#include <torch/csrc/jit/codegen/cuda/mutator.h>
+#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
+#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
+
+#include <test/cpp/jit/test_utils.h>
+#include <torch/csrc/jit/api/function_impl.h>
+#include <torch/csrc/jit/codegen/cuda/parser.h>
+#include <torch/csrc/jit/ir/irparser.h>
+#include <torch/torch.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include <thread>
+
+// Tests go in torch::jit
+namespace torch {
+namespace jit {
+
+using namespace torch::jit::fuser::cuda;
+using namespace at::indexing;
+
+TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  fusion.addOutput(tv1);
+
+  // [I]
+  tv1->split(0, 5);
+  // [ceilDiv(I, 5), 5]
+
+  // This second split is non-divisible. The split domain must be predicated.
+  tv1->split(1, 3);
+  // [ceilDiv(I, 5), 2, 3]
+
+  auto tv2 = sum(tv0, {0});
+  fusion.addOutput(tv2);
+
+  // tv2 shouldn't need to have another predicate
+  tv2->split(0, 4);
+  tv2->split(1, 2);
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
+      "There must be no split to validate");
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1,
+      "Only tv1 should have a non-divisible predicate.");
+  for (auto tv : {loweredTv(tv1, gpulw)}) {
+    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
+    TORCH_CHECK(
+        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
+        "No info found for ",
+        tv);
+    const auto& splits_to_predicate = it->second;
+    TORCH_CHECK(
+        splits_to_predicate.size() == 1,
+        "There must be one split to predicate");
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({24}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0.sum();
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref, ref}, __LINE__, __FILE__);
+}
+
+// Repro of issue #1074
+TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  tv2->split(0, 2);
+  tv2->split(-1, 4);
+  tv2->reorder({{1, 2}, {2, 1}});
+  tv0->computeAt(tv2, 2);
+
+  tv2->split(-1, 3);
+
+  // To make the sanitizer catch the invalid accesses. Not necessary
+  // to expose the bug.
+  tv1->setMemoryType(MemoryType::Shared);
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
+      "There must be no split to validate");
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1,
+      "Only tv2 should have a non-divisible predicate.");
+  for (auto tv : {loweredTv(tv2, gpulw)}) {
+    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
+    TORCH_CHECK(
+        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
+        "No info found for ",
+        tv);
+    const auto& splits_to_predicate = it->second;
+    TORCH_CHECK(
+        splits_to_predicate.size() == 1,
+        "There must be one split to predicate");
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({13, 17}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 2;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Similar to FusionNonDivisibleSplit1 but with unswitch
+TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {0});
+  fusion.addOutput(tv2);
+
+  tv2->split(0, 5);
+  tv2->split(1, 3);
+
+  tv0->computeAt(tv2, -1);
+
+  tv2->axis(0)->parallelize(ParallelType::Unswitch);
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
+      "There must be no split to validate");
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2,
+      "Both tv1 and tv2 should have a non-divisible predicate.");
+  for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
+    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
+    TORCH_CHECK(
+        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
+        "No info found for ",
+        tv);
+    const auto& splits_to_predicate = it->second;
+    TORCH_CHECK(
+        splits_to_predicate.size() == 1,
+        "There must be one split to predicate");
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({24}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = (t0 + 1).sum();
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Non-divisible split through merge
+TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {0, 1});
+  fusion.addOutput(tv2);
+
+  tv2->split(0, 5);
+  tv2->merge(1, 2);
+  tv2->split(1, 3);
+
+  tv0->computeAt(tv2, -1);
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
+      "There must be no split to validate");
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2,
+      "Both tv1 and tv2 should have a non-divisible predicate.");
+  for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
+    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
+    TORCH_CHECK(
+        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
+        "No info found for ",
+        tv);
+    const auto& splits_to_predicate = it->second;
+    TORCH_CHECK(
+        splits_to_predicate.size() == 1,
+        "There must be one split to predicate");
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({24, 2}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = (t0 + 1).sum();
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Nested splits
+TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {0});
+  fusion.addOutput(tv2);
+
+  // [I]
+  tv2->split(0, 8);
+  // [I/8, 8]
+  tv2->split(1, 2);
+  // [I/8, 4, 2]
+  tv2->split(1, 3); // non-divisible split of outer output
+  // [I/8, 2, 3, 2]
+
+  tv0->computeAt(tv2, -1);
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToValidate().empty(),
+      "There must be no split to validate");
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2,
+      "Both tv1 and tv2 should have a non-divisible predicate.");
+  for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) {
+    auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv);
+    TORCH_CHECK(
+        it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(),
+        "No info found for ",
+        tv);
+    const auto& splits_to_predicate = it->second;
+    TORCH_CHECK(
+        splits_to_predicate.size() == 1,
+        "There must be one split to predicate");
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({24}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = (t0 + 1).sum();
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Vectorized non-divisible split. Must be validated at run time
+TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  fusion.addOutput(tv1);
+
+  tv1->split(0, 8, false);
+  tv1->split(1, 4);
+
+  tv1->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1,
+      "There should be one split to validate");
+  for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) {
+    const auto& splits_to_predicate = kv.second;
+    TORCH_CHECK(
+        splits_to_predicate.empty(),
+        "There must be no split to predicate, but tensor t",
+        kv.first->name(),
+        " has:",
+        splits_to_predicate);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({32}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+
+  auto t0_non_divisible = at::randn({8}, options);
+  // Since ceilDiv(8, 8) is not divisible by 4, the vectorization is
+  // illegal. The run-time validation of vectorization should throw an error.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion({t0_non_divisible}));
+}
+
+// If a split is validated at run time, it's not necessary to predicate.
+TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = sum(tv2, {0});
+  fusion.addOutput(tv3);
+
+  tv3->split(0, 8, false);
+  tv3->split(1, 4);
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});
+
+  tv1->axis(2)->parallelize(ParallelType::Vectorize);
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      gpulw.nonDivisibleSplitInfo().splitsToValidate().size() == 1,
+      "There should be one split to validate");
+  for (const auto& kv : gpulw.nonDivisibleSplitInfo().splitsToPredicate()) {
+    const auto& splits_to_predicate = kv.second;
+    TORCH_CHECK(
+        splits_to_predicate.empty(),
+        "There must be no split to predicate, but tensor t",
+        kv.first->name(),
+        " has:",
+        splits_to_predicate);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+
+  auto t0 = at::randn({1024}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = (t0 + 1).sum();
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue1284Repro_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> input_shape_0 = {10, 20};
+  std::vector<int64_t> input_shape_1 = {15};
+
+  TensorView* in_0 = makeSymbolicTensor(input_shape_0.size());
+  TensorView* in_1 = makeSymbolicTensor(input_shape_1.size());
+  fusion.addInput(in_0);
+  fusion.addInput(in_1);
+
+  TensorView* out_0 = add(in_0, IrBuilder::create<Double>(0.f));
+  TensorView* out_1 = add(in_1, IrBuilder::create<Double>(2.f));
+
+  fusion.addOutput(out_0);
+  fusion.addOutput(out_1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_in_0 = at::randn(input_shape_0, options);
+  at::Tensor at_in_1 = at::randn(input_shape_1, options);
+  std::vector<IValue> aten_inputs = {at_in_0, at_in_1};
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  auto outputs = fec.runFusionWithInputs(aten_inputs);
+
+  auto t1 = at_in_1 + 2;
+
+  auto runtime = fec.getMostRecentKernelRuntime();
+  TORCH_INTERNAL_ASSERT(runtime->isSegmented());
+  TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2);
+
+  testValidate(
+      &fusion, outputs, {at_in_0, at_in_1}, {at_in_0, t1}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue1284Repro2_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> input_shape_0 = {4, 4};
+  std::vector<int64_t> input_shape_1 = {3, 4, 4};
+  std::vector<int64_t> input_shape_2 = {2, 8, 4, 4};
+
+  TensorView* in_0 = makeSymbolicTensor(input_shape_0.size());
+  TensorView* in_1 = makeSymbolicTensor(input_shape_1.size());
+  TensorView* in_2 = makeSymbolicTensor(input_shape_2.size());
+
+  fusion.addInput(in_0);
+  fusion.addInput(in_1);
+  fusion.addInput(in_2);
+
+  TensorView* out_0 = add(in_0, in_1);
+  TensorView* out_1 = add(in_0, in_2);
+
+  fusion.addOutput(out_0);
+  fusion.addOutput(out_1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_in_0 = at::randn(input_shape_0, options);
+  at::Tensor at_in_1 = at::randn(input_shape_1, options);
+  at::Tensor at_in_2 = at::randn(input_shape_2, options);
+
+  std::vector<IValue> aten_inputs = {at_in_0, at_in_1, at_in_2};
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  auto outputs = fec.runFusionWithInputs(aten_inputs);
+
+  auto t0 = at_in_0 + at_in_1;
+  auto t1 = at_in_0 + at_in_2;
+
+  auto runtime = fec.getMostRecentKernelRuntime();
+  TORCH_INTERNAL_ASSERT(runtime->isSegmented());
+  TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2);
+
+  testValidate(
+      &fusion,
+      outputs,
+      {at_in_0, at_in_1, at_in_2},
+      {t0, t1},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionIssue1305Repro_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  auto t0 = makeContigTensor(1);
+  auto t1 = makeContigTensor(2);
+
+  fusion.addInput(t0);
+  fusion.addInput(t1);
+
+  auto t2 = broadcast(t0, {true, false});
+  auto t3 = add(t1, t2);
+  auto t4 = add(t3, t2);
+  auto t5 = sum(t4, {1});
+  auto t6 = broadcast(t5, {false, true});
+  auto t7 = add(t3, t6);
+
+  fusion.addOutput(t7);
+
+  t3->computeAt(t7, -1, ComputeAtMode::MostInlined);
+
+  TORCH_INTERNAL_ASSERT(t3->getComputeAtPosition() == 1);
+}
+
+TEST_F(NVFuserTest, FusionDoubleBuffering1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  tv3->split(-1, 128);
+  tv3->split(-1, 32);
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+  tv0->computeAt(tv3, 1);
+
+  tv3->axis(-2)->parallelize(ParallelType::BIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(tv3);
+
+  tv1->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({1000}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 1;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionDoubleBuffering2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv3->split(-1, 128);
+  tv3->split(-1, 32);
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+  tv0->computeAt(tv3, -1);
+
+  tv3->axis(-2)->parallelize(ParallelType::BIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(tv3);
+
+  tv1->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({1000}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 1;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionDoubleBuffering3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv2 = set(tv1);
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1.0));
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  tv3->split(-1, 128);
+  tv3->split(-1, 32);
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+  tv0->computeAt(tv3, 1);
+
+  // tv2 is invalid to double-buffer as its producer, tv1, is
+  // computed inside the double-buffering loop.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(tv2->doubleBuffer());
+
+  // Moving tv2 inner makes tv1 large enough to double-buffer tv2
+  tv2->computeAt(tv3, 2);
+
+  tv2->doubleBuffer();
+
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({1000}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 2;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Double buffering smem to local and unswitch
+TEST_F(NVFuserTest, FusionDoubleBuffering4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv2 = set(tv1);
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1.0));
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  tv3->split(-1, 128);
+  tv3->split(-1, 32);
+  tv3->split(-1, 8);
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+  tv0->computeAt(tv3, 2);
+  tv2->computeAt(tv3, -1);
+
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::Unswitch);
+  scheduler_utils::parallelizeAllLike(tv3);
+
+  tv2->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({1000}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 2;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Double buffering gmem to shared and unswitch
+TEST_F(NVFuserTest, FusionDoubleBuffering5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
+  fusion.addOutput(tv2);
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  tv2->split(-1, 128);
+  tv2->split(-1, 32);
+  tv2->split(-1, 8);
+  TransformPropagatorWithCheck propagator(tv2);
+  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
+
+  tv0->computeAt(tv2, 2);
+  tv1->computeAt(tv2, -1);
+
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::Unswitch);
+  scheduler_utils::parallelizeAllLike(tv2);
+
+  tv1->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({1000}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 1;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Double buffering smem to local and unroll
+TEST_F(NVFuserTest, FusionDoubleBuffering6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv2 = set(tv1);
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1.0));
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  tv3->split(-1, 128);
+  tv3->split(-1, 16);
+  tv3->split(-2, 4);
+  tv3->split(-2, 2);
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+  tv0->computeAt(tv3, 1);
+  tv2->computeAt(tv3, -1);
+
+  tv3->axis(2)->parallelize(ParallelType::Unroll);
+  tv3->axis(4)->parallelize(ParallelType::TIDx);
+
+  tv2->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({199}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 2;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Double buffering and vectorize
+TEST_F(NVFuserTest, FusionDoubleBuffering7_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1.0));
+  fusion.addOutput(tv2);
+
+  tv2->split(-1, 128);
+  tv2->split(-1, 4);
+  TransformPropagatorWithCheck propagator(tv2);
+  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
+
+  tv1->computeAt(tv2, 2);
+
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+
+  tv1->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  tv1->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({200}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 1;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Multiple tensors to double-buffer
+TEST_F(NVFuserTest, FusionDoubleBuffering8_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeContigTensor(1);
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv0);
+  auto tv3 = set(tv1);
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  tv4->split(0, 32);
+  tv4->split(0, 4);
+  TransformPropagatorWithCheck propagator(tv4);
+  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
+
+  tv0->computeAt(tv4, 1);
+  tv1->computeAt(tv4, 1);
+
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(tv4);
+
+  tv2->doubleBuffer();
+  tv3->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({100}, options);
+  auto t1 = at::randn({100}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// Nested double buffering from gmem to smem and smem to register
+TEST_F(NVFuserTest, FusionDoubleBuffering9_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto out = tv1;
+  fusion.addOutput(out);
+
+  auto tv2 = tv0->cacheAfter();
+  auto tv3 = tv2->cacheAfter();
+
+  out->split(0, 32);
+  out->split(0, 4);
+  TransformPropagatorWithCheck propagator(out);
+  MaxRootDomainInfoSpanningTree(out).traverse(&propagator);
+
+  tv2->setMemoryType(MemoryType::Shared);
+
+  tv2->computeAt(out, 1);
+  tv3->computeAt(out, -1);
+
+  out->axis(-1)->parallelize(ParallelType::TIDx);
+  scheduler_utils::parallelizeAllLike(out);
+
+  tv2->doubleBuffer();
+  tv3->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({1001}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0 + 1;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// FusionSmemBlockGemmCache + double buffering at both smem and local
+TEST_F(NVFuserTest, FusionSmemBlockGemmCacheDoubleBuffer_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Algorithm
+  TensorView* tv0 = makeSymbolicTensor(2); // (M, K)
+  TensorView* tv1 = makeSymbolicTensor(2); // (K, N)
+  TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B)
+  TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N)
+  TensorView* tv4 = mul(tv2, tv3); // M, K, N
+  TensorView* tv5 = sum(tv4, {1}); // M, R, N
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addOutput(tv5);
+
+  TensorView* tv6 = tv5->cacheBefore();
+
+  // For smem double buffering
+  auto tv0_cache_local = tv0->cacheAfter();
+  auto tv1_cache_local = tv1->cacheAfter();
+
+  // For register double buffering
+  auto tv0_cache_smem = tv0->cacheAfter();
+  auto tv1_cache_smem = tv1->cacheAfter();
+
+  const int BSX = 32;
+  const int TSX = 8;
+
+  // [M, K, N]
+  tv6->split(-1, BSX);
+  tv6->split(-1, TSX);
+  tv6->split(1, BSX);
+  tv6->split(0, BSX);
+  tv6->split(1, TSX);
+  // [M/BSX, BSX/TSX, TSX, K/BSX, BSX, N/BSX, BSX/TSX, TSX]
+  tv6->reorder(
+      {{4, 7}, {7, 6}, {6, 5}, {2, 4}, {1, 3}, {3, 2}, {5, 1}, {0, 0}});
+  // [M/BSX, N/BSX, K/BSX, BSX/TSX, BSX/TSX, TSX, TSX, BSX]
+
+  auto tv6_rf = tv6->rFactor({-1});
+
+  TransformPropagatorWithCheck propagator(tv6_rf);
+  MaxRootDomainInfoSpanningTree(tv6_rf).traverse(&propagator);
+
+  tv0->computeAt(tv6, 3);
+  tv1->computeAt(tv6, 3);
+
+  tv6_rf->computeAt(tv6, -1);
+  tv0_cache_local->computeAt(tv6_rf, -1);
+  tv1_cache_local->computeAt(tv6_rf, -1);
+
+  tv0_cache_smem->setMemoryType(MemoryType::Shared);
+  tv1_cache_smem->setMemoryType(MemoryType::Shared);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+  tv5->axis(1)->parallelize(ParallelType::BIDy);
+  tv5->axis(-3)->parallelize(ParallelType::TIDy);
+  tv5->axis(-1)->parallelize(ParallelType::TIDx);
+
+  scheduler_utils::parallelizeAllLike(tv5);
+
+  tv0_cache_local->doubleBuffer();
+  tv1_cache_local->doubleBuffer();
+
+  tv0_cache_smem->doubleBuffer();
+  tv1_cache_smem->doubleBuffer();
+
+  constexpr int M = 154, K = 45, N = 1524;
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({M, K}, options);
+  at::Tensor t1 = at::randn({K, N}, options);
+  at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble));
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+  // The smem cache write in this test case is redundant predicated,
+  //   and also double buffered. Currently we are relying on WAR sync
+  //   insertion to ensure ordering of double buffered tensor access.
+  // The check below makes sure that the sync is inserted so that the
+  //   test isn't running on a race condition.
+  TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count > 0);
+}
+
+TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) {
+  std::vector<MemoryType> mem_types = {MemoryType::Shared, MemoryType::Local};
+
+  for (auto mem_type : mem_types) {
+    Fusion fusion;
+    FusionGuard fg(&fusion);
+
+    auto tv0 = makeContigTensor(1);
+    fusion.addInput(tv0);
+
+    auto tv1 = set(tv0);
+    auto tv2 = set(tv1);
+    auto tv3 = set(tv2);
+    fusion.addOutput(tv3);
+
+    tv1->setMemoryType(mem_type);
+
+    tv3->split(-1, 4);
+    TransformPropagatorWithCheck propagator(tv3);
+    MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+    tv1->computeAt(tv3, -2);
+
+    tv2->axis(-1)->parallelize(ParallelType::Vectorize);
+
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::manual_seed(0);
+    auto t0 = at::randn({15}, options);
+    FusionExecutor fe;
+    fe.compileFusion(&fusion);
+
+    // This should throw an exception as the extent of t0 is not
+    // divisible by the vector width
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+    ASSERT_ANY_THROW(fe.runFusion({t0}));
+
+    auto t1 = at::randn({16}, options);
+    auto cg_outputs = fe.runFusion({t1});
+
+    auto ref = t1;
+
+    testValidate(&fusion, cg_outputs, {t1}, {ref}, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({10, 1});
+  fusion.addInput(tv0);
+  auto tv1 = makeConcreteTensor({10, 20});
+  fusion.addInput(tv1);
+  auto tv2 = makeConcreteTensor({10, 10});
+  fusion.addInput(tv2);
+
+  // Not concretized
+  auto tv3 = sum(tv2, {1});
+  auto tv4 = broadcast(tv3, {false, true});
+  auto tv5 = add(tv0, tv4);
+  fusion.addOutput(tv5);
+
+  // Concretized
+  auto tv6 = sum(tv2, {1});
+  auto tv7 = broadcast(tv6, {false, true});
+  auto tv8 = add(tv1, tv7);
+  fusion.addOutput(tv8);
+
+  for (auto tv : {tv3, tv4, tv5, tv6, tv7, tv8}) {
+    tv->axis(1)->parallelize(ParallelType::TIDx);
+  }
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(!gpulw.concretizedBroadcastDomains()->isConcretized(
+      loweredTv(tv4, gpulw)->axis(1)));
+  TORCH_CHECK(gpulw.concretizedBroadcastDomains()->isConcretized(
+      loweredTv(tv7, gpulw)->axis(1)));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({10, 1}, options);
+  auto t1 = at::randn({10, 20}, options);
+  auto t2 = at::randn({10, 10}, options);
+  std::vector<IValue> aten_inputs = {t0, t1, t2};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto t5 = t0 + t2.sum({1}).unsqueeze(-1);
+  auto t8 = t1 + t2.sum({1}).unsqueeze(-1);
+
+  testValidate(&fusion, outputs, aten_inputs, {t5, t8}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {0, 1});
+  auto tv2 = broadcast(tv1, {true});
+  auto tv3 = broadcast(tv2, {false, true});
+  fusion.addOutput(tv3);
+
+  // tv1 is thread-predicated with TIDx and TIDy
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  tv1->axis(1)->parallelize(ParallelType::TIDy);
+  // tv2 broadcasts along TIDx
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  // tv3 broadcasts along TIDy
+  tv3->axis(0)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDy);
+
+  // Both tv2 and tv3 broadcast along predicated TID dimensions, but
+  // since the broadcast domains are not concretized, there should be
+  // no actual parallel broadcast
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      !gpulw.kernel()->summary().has_block_broadcasts &&
+          !gpulw.kernel()->summary().has_grid_broadcasts,
+      "There must be no parallel broadcast in this fusion");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({10, 11}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto t3 = t0.sum().unsqueeze(-1).unsqueeze(-1);
+
+  testValidate(&fusion, outputs, aten_inputs, {t3}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> input_shape({10, 4, 8});
+  std::vector<int64_t> output_shape({8, 4, 1});
+
+  auto tv0 = makeConcreteTensor(input_shape);
+  fusion.addInput(tv0);
+
+  auto tv2 = sum(tv0, {0});
+  auto tv3 = set(tv2);
+  auto tv4 =
+      view(tv3, {input_shape.begin() + 1, input_shape.end()}, output_shape);
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv5->axis(-1)->parallelize(ParallelType::TIDx);
+
+  // The view op adds a broadcast domain in tv4, which is
+  // parallelized. Howver, it is never materialized, so there should
+  // be no parallel broadcast.
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      !gpulw.kernel()->summary().has_block_broadcasts &&
+          !gpulw.kernel()->summary().has_grid_broadcasts,
+      "There must be no parallel broadcast in this fusion");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn(input_shape, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  auto t5 = at::native::view(t0.sum(0), output_shape) + 1;
+
+  testValidate(&fusion, outputs, aten_inputs, {t5}, __LINE__, __FILE__);
+}
+
+// Merging non-broadcast and broadcast domains
+// TODO: Fix use case see issue https://github.com/csarofeen/pytorch/issues/1418
+// validateParallelize does not pass. Even if it's skipped,
+// generated code is invalid as blockBroadcast is not used.
+#if 0
+TEST_F(NVFuserTest, FusionBroadcastConcretization4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = broadcast(tv1, {false, true});
+  auto tv3 = add(tv2, tv0);
+  fusion.addOutput(tv3);
+
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv2->merge(0, 1);
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  // TODO: When set to shared memory, this kernel should be correct, but fails
+  // validation and when skipped produces incorrect code
+  tv2->setMemoryType(MemoryType::Shared);
+
+  tv3->merge(0, 1);
+  tv3->axis(0)->parallelize(ParallelType::TIDx);
+
+  fusion.printMath();
+  fusion.printKernel();
+}
+#endif
+
+TEST_F(NVFuserTest, FusionBroadcastConcretization5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(1);
+  fusion.addInput(tv1);
+  auto tv2 = makeSymbolicTensor(1);
+  fusion.addInput(tv2);
+  auto tv3 = makeSymbolicTensor(1);
+  fusion.addInput(tv3);
+
+  // Assert tv2 and tv3 have the same shape
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  // Concretize a broadcast domain to multiple non-concrete domains
+  // through a multi-output expression. It should be considered to be
+  // non-uniquely concretized.
+  auto tv5 = broadcast(tv0, {false, true});
+  // Reduce only the non-broadcast domain.
+  auto tvs = Welford(tv5, {0});
+  auto tv9 = add(tvs.avg, tv1);
+  auto tv10 = add(tvs.var_sum, tv2);
+  fusion.addOutput(tv9);
+  fusion.addOutput(tv10);
+
+  // Same pattern as the above, but concretize the broadcast domain
+  // with tv2 and tv3, which have the exactly same shape, so the
+  // broadcast should be considered uniquely concretized.
+  auto tv11 = broadcast(tv0, {false, true});
+  // Reduce only the non-broadcast domain.
+  auto tvs2 = Welford(tv11, {0});
+  auto tv15 = add(tvs2.avg, tv2);
+  auto tv16 = add(tvs2.var_sum, tv3);
+  fusion.addOutput(tv15);
+  fusion.addOutput(tv16);
+
+  // Reduce only the broadcast domain. Since it's reduced, it should
+  // not be considered to be concretized.
+  auto tv17 = broadcast(tv0, {false, true});
+  auto tvs3 = Welford(tv17, {1});
+  fusion.addOutput(tvs3.avg);
+
+  ConcretizedBroadcastDomains bcast_concretization_info(&fusion);
+
+  TORCH_CHECK(
+      bcast_concretization_info.maybeNonUniquelyConcretized(tv5->axis(1)),
+      "Failed to detect non-unique concretization of ",
+      tv5->toString());
+
+  TORCH_CHECK(
+      bcast_concretization_info.isUniquelyConcretized(tv11->axis(1)),
+      "Failed to detect unique concretization of ",
+      tv11->toString());
+
+  TORCH_CHECK(
+      !bcast_concretization_info.isConcretized(tv17->axis(1)),
+      "Failed to detect non-concretization of ",
+      tv17->toString());
+}
+
+TEST_F(NVFuserTest, FusionIssue1430_CUDA) {
+  // Derived from an expression sorting issue when using loop map, now expr
+  // sorting uses parallel map.
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int V = 2, W = 3, X = 4, Y = 5, Z = 6;
+
+  // setup fusion
+  auto tv0 = TensorViewBuilder()
+                 .ndims(5)
+                 .dtype(DataType::Half)
+                 .contiguity(std::vector<bool>(5, true))
+                 .shape({V, W, X, Y, Z})
+                 .build();
+
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = castOp(DataType::Float, tv1);
+
+  auto tvs = Welford(tv2, {1, 2, 3, 4});
+  auto tv3 = tvs.avg;
+  auto tv4 = tvs.var_sum;
+  auto tv5 = tvs.n;
+
+  // avg
+  auto tv6 = broadcast(tvs.avg, {false, true, true, true, true});
+
+  // var
+  auto tv7 = mul(tv4, IrBuilder::create<Double>(1. / (W * X * Y * Z)));
+  auto tv8 = add(tv7, IrBuilder::create<Double>(1.e-6));
+  auto tv9 = broadcast(tv8, {false, true, true, true, true});
+  auto tv10 = rsqrt(tv9);
+
+  auto tv11 = castOp(DataType::Float, tv1);
+  auto tv12 = sub(tv11, tv6);
+  auto tv13 = mul(tv12, tv10);
+
+  auto tv14 = set(tv13);
+  fusion.addOutput(tv14);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDy);
+  tv3->axis(2)->parallelize(ParallelType::BIDx);
+  tv3->axis(3)->parallelize(ParallelType::TIDx);
+  tv3->axis(4)->parallelize(ParallelType::Vectorize);
+
+  // tv3->reorder({{1, -2}});
+
+  auto rfactor = ir_utils::rfactorHelper(tv3, {1, 4});
+
+  scheduler_utils::parallelizeAllLike(rfactor);
+
+  for (auto tv : ir_utils::allTvs(&fusion)) {
+    if (tv != tv1 || tv != tv3) {
+      for (auto i : c10::irange(tv->nDims())) {
+        if (isParallelTypeVectorize(tv->axis(i)->getParallelType())) {
+          tv->axis(i)->parallelize(ParallelType::Serial);
+        }
+      }
+    }
+  }
+
+  tv0->computeAt(tv14, 1);
+  tv13->computeAt(tv14, -2);
+  tv2->computeAt(tv14, -1, ComputeAtMode::MostInlined);
+  tv11->computeAt(tv14, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({V, W, X, Y, Z}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+  auto cg_outputs = fe.runFusion({t0}, LaunchParams(X, V, -1, Y, -1, -1));
+
+  auto t0_double = t0.to(at::kDouble);
+
+  auto at_mu = at::mean(t0_double, {1, 2, 3, 4})
+                   .unsqueeze(-1)
+                   .unsqueeze(-1)
+                   .unsqueeze(-1)
+                   .unsqueeze(-1);
+  auto at_var = at::var(t0_double, {1, 2, 3, 4}, false)
+                    .unsqueeze(-1)
+                    .unsqueeze(-1)
+                    .unsqueeze(-1)
+                    .unsqueeze(-1);
+
+  auto at_out = t0_double.sub(at_mu).div(at_var.add(1.e-6).sqrt());
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {t0},
+      {at_out},
+      __LINE__,
+      __FILE__,
+      "",
+      LaunchParams(X, V, -1, Y, -1, -1));
+}
+
+// Test code generation of allocated scalars
+TEST_F(NVFuserTest, FusionCodegenAllocatedScalars_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Fusion is just a dummy container in this test, just used for
+  // getting a Kernel container
+  auto tv0 = makeSymbolicTensor(0);
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  fusion.addOutput(tv1);
+
+  GpuLower gpulw(&fusion);
+  auto kernel = gpulw.kernel();
+
+  // Set the kernel as the current fusion
+  FusionGuard kg(kernel);
+
+  // Create alocated scalars
+  auto ks0 = add(kernel->zeroVal(), kernel->oneVal());
+  auto ks0_alloc = IrBuilder::create<kir::Allocate>(
+      ks0, MemoryType::Local, kernel->oneVal());
+
+  auto ks1 = add(ks0, kernel->oneVal());
+  auto ks1_alloc = IrBuilder::create<kir::Allocate>(
+      ks1, MemoryType::Local, kernel->oneVal());
+
+  auto tk0 = kernel->inputs()[0]->as<TensorView>();
+  auto tki0 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks0});
+  auto tki1 = IrBuilder::create<kir::TensorIndex>(tk0, std::vector<Val*>{ks1});
+  auto tk0_expr = IrBuilder::create<UnaryOp>(UnaryOpType::Set, tki0, tki1);
+
+  // Insert the scalar expression and the allocation of the
+  // output directly to the kernel
+  auto proxy = kir::KernelInternalProxy(kernel);
+
+  const auto indent = "  ";
+  const auto ks0_name = "i" + std::to_string(ks0->name());
+  const auto ks1_name = "i" + std::to_string(ks1->name());
+  const auto tk0_name = "T" + std::to_string(tk0->name());
+
+  auto& exprs = proxy.topLevelExprs();
+  exprs.push_back(tk0_expr);
+
+  // Invalid code gen
+  const auto no_alloc_code = codegen::generateCudaKernel(kernel);
+
+  // Without alloc, Int vals are just inlined, resulting in:
+  // t0[(0 + 1)] = t0[((0 + 1) + 1)]
+  std::stringstream no_alloc_ref;
+  no_alloc_ref << "\n"
+               << indent << tk0_name << "[(0 + 1)]\n"
+               << indent << indent << " = " << tk0_name << "[((0 + 1) + 1)];\n";
+
+  TORCH_CHECK(
+      no_alloc_code.find(no_alloc_ref.str()) != std::string::npos,
+      "Invalid code generation. Expected:",
+      no_alloc_ref.str(),
+      "Actual:\n",
+      no_alloc_code);
+
+  // Insert proper allocations and definitions
+  exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks0_alloc);
+  exprs.insert(
+      std::find(exprs.begin(), exprs.end(), tk0_expr), ks0->definition());
+  exprs.insert(std::find(exprs.begin(), exprs.end(), tk0_expr), ks1_alloc);
+  exprs.insert(
+      std::find(exprs.begin(), exprs.end(), tk0_expr), ks1->definition());
+
+  const auto valid_code = codegen::generateCudaKernel(kernel);
+
+  std::stringstream valid_ref;
+  valid_ref << "\n"
+            << indent << tk0_name << "[" << ks0_name << "]\n"
+            << indent << indent << " = " << tk0_name << "[" << ks1_name
+            << "];\n";
+
+  TORCH_CHECK(
+      valid_code.find(valid_ref.str()) != std::string::npos,
+      "Invalid code generation. Expected:",
+      valid_ref.str(),
+      "Actual:\n",
+      valid_code);
+}
+
+TEST_F(NVFuserTest, FusionIndexHoist1_CUDA) {
+  if (isOptionDisabled(DisableOption::IndexHoist)) {
+    GTEST_SKIP() << "Index hoisting disabled";
+  }
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = set(tv1);
+  auto tv3 = set(tv2);
+  auto tv4 = set(tv3);
+  auto tv5 = set(tv4);
+  fusion.addOutput(tv5);
+
+  tv1->split(-1, 4);
+  tv2->split(-1, 4);
+  tv3->merge(0, 1);
+  tv3->split(0, 8);
+  tv5->merge(0, 1);
+  tv5->split(0, 8);
+  tv4->computeAt(tv5, -1);
+
+  tv1->setMemoryType(MemoryType::Global);
+  tv2->setMemoryType(MemoryType::Global);
+  tv3->setMemoryType(MemoryType::Global);
+
+  // Use Int32 as the index type to verify Int32 is used as the type
+  // of hoisted indices
+  GpuLower gpulw(&fusion, DataType::Int32);
+  auto kernel = gpulw.kernel();
+
+  auto is_index_times_ns = [](Val* val, Val* index, std::string name) -> bool {
+    auto def = dynamic_cast<BinaryOp*>(val->definition());
+    if (def == nullptr) {
+      return false;
+    }
+    return def->getBinaryOpType() == BinaryOpType::Mul &&
+        def->rhs()->isA<NamedScalar>() &&
+        def->rhs()->as<NamedScalar>()->name() == name && def->lhs() == index;
+  };
+
+  // Validate indices in the kernel are hoisted as
+  // intended. Validation could be also done by just string comparison
+  // as the parser test, but updating such tests would be tedious.
+  for (auto top_level_loop :
+       ir_utils::filterByType<kir::ForLoop>(kernel->topLevelExprs())) {
+    auto innermost_loop = top_level_loop;
+    while (auto first_expr_loop = dynamic_cast<kir::ForLoop*>(
+               innermost_loop->body().exprs().at(0))) {
+      innermost_loop = first_expr_loop;
+    }
+    const auto& exprs = innermost_loop->body().exprs();
+    TORCH_CHECK(!exprs.empty(), "No expression found");
+    TORCH_CHECK(
+        exprs.at(0)->isA<kir::Allocate>(),
+        "Invalid expression: ",
+        exprs.at(0)->toString());
+    auto hoisted_index = exprs.at(0)->as<kir::Allocate>()->buffer();
+    TORCH_CHECK(
+        hoisted_index->dtype() == DataType::Int32,
+        "Invalid data type of hoisted indices. Should be Int32 but: ",
+        hoisted_index->dtype());
+    kir::Predicate* pred = nullptr;
+    for (auto expr : exprs) {
+      if (expr->isA<kir::IfThenElse>()) {
+        pred = expr->as<kir::IfThenElse>()->predicate();
+        auto arith_expr = expr->as<kir::IfThenElse>()->thenBody().exprs().at(0);
+        auto out_ti = arith_expr->outputs()[0]->as<kir::TensorIndex>();
+        if (out_ti->view()->name() == 1) {
+          // Ref: T1[*, hoisted_index] = T0[*, hoisted_index * T0.stride];
+          auto t1_index = out_ti->index(1);
+          TORCH_CHECK(
+              t1_index == hoisted_index,
+              "Invalid index: ",
+              t1_index->toInlineString());
+          // Pred: hoisted_index < T0.size[1]
+          TORCH_CHECK(
+              pred->value()->definition()->as<BinaryOp>()->lhs() ==
+                  hoisted_index,
+              "Invalid predicate: ",
+              pred->value()->toInlineString(),
+              ", ",
+              expr->toString());
+          TORCH_CHECK(arith_expr->inputs().size() == 1);
+          auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
+          TORCH_CHECK(in0->view()->name() == 0);
+          // hoisted_index * T0.stride[1]
+          auto t0_index = in0->index(1);
+          TORCH_CHECK(
+              is_index_times_ns(t0_index, hoisted_index, "T0.stride[1]"),
+              "Invalid index: ",
+              t0_index->toInlineString(),
+              ", ",
+              expr->toString());
+        } else if (out_ti->view()->name() == 2) {
+          // Ref: T3[*, hoisted_index] = T2[*, hoisted_index];
+          auto out_index = out_ti->index(1);
+          TORCH_CHECK(
+              out_index == hoisted_index,
+              "Invalid index: ",
+              out_index->toInlineString(),
+              ", ",
+              expr->toString());
+          TORCH_CHECK(
+              pred->value()->definition()->as<BinaryOp>()->lhs() ==
+                  hoisted_index,
+              "Invalid predicate: ",
+              pred->value()->toInlineString(),
+              ", ",
+              expr->toString());
+          TORCH_CHECK(arith_expr->inputs().size() == 1);
+          auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
+          TORCH_CHECK(in0->view()->name() == 1);
+          auto in0_index = in0->index(1);
+          TORCH_CHECK(
+              in0_index == hoisted_index,
+              "Invalid index: ",
+              in0_index->toInlineString(),
+              ", ",
+              expr->toString());
+        } else if (out_ti->view()->name() == 3) {
+          // Ref: T3[hoisted_index] = T2[hoisted_index];
+          auto out_index = out_ti->index(0);
+          TORCH_CHECK(
+              out_index == hoisted_index,
+              "Invalid index: ",
+              out_index->toInlineString(),
+              ", ",
+              expr->toString());
+          TORCH_CHECK(
+              pred->value()->definition()->as<BinaryOp>()->lhs() ==
+                  hoisted_index,
+              "Invalid predicate: ",
+              pred->value()->toInlineString(),
+              ", ",
+              expr->toString());
+          TORCH_CHECK(arith_expr->inputs().size() == 1);
+          auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
+          TORCH_CHECK(in0->view()->name() == 2);
+          auto in0_index = in0->index(0);
+          TORCH_CHECK(
+              in0_index == hoisted_index,
+              "Invalid index: ",
+              in0_index->toInlineString(),
+              ", ",
+              expr->toString());
+        } else if (out_ti->view()->name() == 4) {
+          // Ref: T4[0] = T3[hoisted_index];
+          TORCH_CHECK(
+              pred->value()->definition()->as<BinaryOp>()->lhs() ==
+                  hoisted_index,
+              "Invalid predicate: ",
+              pred->value()->toInlineString(),
+              ", ",
+              expr->toString());
+          TORCH_CHECK(arith_expr->inputs().size() == 1);
+          auto in0 = arith_expr->inputs().front()->as<kir::TensorIndex>();
+          TORCH_CHECK(in0->view()->name() == 3);
+          auto in0_index = in0->index(0);
+          TORCH_CHECK(
+              in0_index == hoisted_index,
+              "Invalid index: ",
+              in0_index->toInlineString(),
+              ", ",
+              expr->toString());
+        } else if (out_ti->view()->name() == 5) {
+          // Ref: T5[hoisted_index] = T4[0]
+          auto out_index = out_ti->index(0);
+          TORCH_CHECK(
+              out_index == hoisted_index,
+              "Invalid index: ",
+              out_index->toInlineString(),
+              ", ",
+              expr->toString());
+          TORCH_CHECK(
+              pred->value()->definition()->as<BinaryOp>()->lhs() ==
+                  hoisted_index,
+              "Invalid predicate: ",
+              pred->value()->toInlineString(),
+              ", ",
+              expr->toString());
+        }
+      }
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({15, 17}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Hoist indices for vectorized tensors
+TEST_F(NVFuserTest, FusionIndexHoist2_CUDA) {
+  if (isOptionDisabled(DisableOption::IndexHoist)) {
+    GTEST_SKIP() << "Index hoisting disabled";
+  }
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeContigTensor(1);
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv0);
+  auto tv3 = set(tv1);
+  auto tv4 = add(tv2, tv3);
+  auto tv5 = set(tv4);
+  fusion.addOutput(tv5);
+
+  tv5->split(-1, 4);
+  TransformPropagatorWithCheck propagator(tv5);
+  MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator);
+
+  tv4->split(-1, 3);
+
+  tv0->computeAt(tv5, 1);
+  tv1->computeAt(tv5, 1);
+
+  tv2->axis(-1)->parallelize(ParallelType::Vectorize);
+  tv3->axis(-1)->parallelize(ParallelType::Vectorize);
+  tv5->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({16}, options);
+  auto t1 = at::randn({16}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTestGridComm_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+  int X = 3, Y = 4, Z = 2;
+  auto tv0 = makeConcreteTensor({X, Y, Z});
+  fusion.addInput(tv0);
+  auto tv1 = makeConcreteTensor({X, Y, Z});
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv0);
+  auto tv3 = add(tv2, tv1);
+  auto tv4 = set(tv3);
+  auto tv5 = set(tv4);
+  fusion.addOutput(tv5);
+
+  tv2->setMemoryType(MemoryType::Global);
+  tv3->setMemoryType(MemoryType::Global);
+  tv4->setMemoryType(MemoryType::Global);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDy);
+  tv2->axis(1)->parallelize(ParallelType::BIDx);
+  tv2->axis(2)->parallelize(ParallelType::Vectorize);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(1)->parallelize(ParallelType::BIDy);
+
+  tv4->axis(0)->parallelize(ParallelType::BIDy);
+  tv4->axis(1)->parallelize(ParallelType::BIDx);
+
+  tv5->axis(0)->parallelize(ParallelType::BIDy);
+  tv5->axis(1)->parallelize(ParallelType::BIDx);
+  tv5->axis(2)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({X, Y, Z}, options);
+  auto t1 = at::randn({X, Y, Z}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// See issue https://github.com/csarofeen/pytorch/issues/1497
+TEST_F(NVFuserTest, FusionTestGridComm2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int64_t W = 3, X = 4;
+
+  auto tv0 = makeConcreteTensor({X});
+  auto tv1 = makeConcreteTensor({W, X});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv3 = broadcast(tv2, {true, false});
+  auto tv4 = add(tv3, tv1);
+  fusion.addOutput(tv4);
+
+  tv4->merge(0);
+  tv4->split(0, 2);
+
+  TransformPropagatorWithCheck propagator(tv4);
+  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
+
+  tv3->computeAt(tv4, 1);
+
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv2->setMemoryType(MemoryType::Global);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({X}, options);
+  auto t1 = at::randn({W, X}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1 + 1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// Vectorized reset test for double buffered registers
+TEST_F(NVFuserTest, FusionDoubleBufferVector_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv2 = sum(tv1, {0});
+  auto tv2c = tv2->cacheBefore();
+
+  fusion.addOutput(tv2);
+
+  auto tv1cw = tv1->cacheAfter();
+  auto tv1cr = tv1cw->cacheAfter();
+
+  tv1cw->split(-1, 32);
+  tv1cr->split(-1, 32);
+  tv1cr->split(-1, 4);
+  tv1cr->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  tv1cw->computeAt(tv1cr, 1);
+  tv0->computeAt(tv1cw, -1);
+  tv2c->split(-1, 32);
+  tv2c->split(-1, 4);
+  tv1cr->computeAt(tv2c, 2);
+
+  tv1cw->setMemoryType(MemoryType::Shared);
+  tv1cr->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::manual_seed(0);
+  auto t0 = at::randn({200}, options);
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+  auto ref = (t0 + 1).sum({0});
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Request 48KB of data in shared mem,
+//  should be large enough not to fit in
+//  static allocations, but small enough
+//  to fit in supported devices (sm70+).
+TEST_F(NVFuserTest, FusionLargeSmem_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  fusion.addOutput(tv2);
+
+  tv2->split(0, 12288);
+  tv2->split(1, 128);
+  tv1->computeAt(tv2, 1);
+  tv1->split(1, 128);
+  tv0->computeAt(tv1, -1);
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::manual_seed(0);
+  auto t0 = at::randn({12288 * 4}, options);
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+  auto ref = t0 + 1 + 2;
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Request a smem allocation that is equal to the device limit
+TEST_F(NVFuserTest, FusionTooLargeSmem_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto properties = at::cuda::getDeviceProperties(
+      c10::Device(c10::DeviceType::CUDA, 0).index());
+  int device_limit = properties->sharedMemPerBlockOptin;
+
+  auto tv0 = makeContigTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1.0));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(2.0));
+  fusion.addOutput(tv2);
+
+  // 4 byte per float
+  tv2->split(0, device_limit / 4);
+  tv2->split(1, 128);
+  tv1->computeAt(tv2, 1);
+  tv1->split(1, 128);
+  tv0->computeAt(tv1, -1);
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::manual_seed(0);
+  auto t0 = at::randn({12288 * 4}, options);
+  FusionExecutor fe;
+
+  // First compile gets a compiled kernel
+  fe.compileFusion(&fusion, {t0});
+
+  // Should be throwing because the kernel
+  //  requested absolute device limit
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion({t0}));
+}
+
+// Try to test alignment when multiple tensors are
+//  in shared mem.
+TEST_F(NVFuserTest, FusionSmemAlignment_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({3, 4, 7, 2, 5});
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {4});
+  auto tv2 = sum(tv1, {3});
+  auto tv3 = sum(tv2, {2});
+  auto tv4 = sum(tv3, {1});
+  fusion.addOutput(tv4);
+
+  auto tv0c = tv0->cacheAfter();
+  auto tv1bc = tv1->cacheBefore();
+  auto tv2bc = tv2->cacheBefore();
+  auto tv3bc = tv3->cacheBefore();
+  auto tv4bc = tv4->cacheBefore();
+
+  tv0c->setMemoryType(MemoryType::Shared);
+  tv1bc->setMemoryType(MemoryType::Shared);
+  tv2bc->setMemoryType(MemoryType::Shared);
+  tv3bc->setMemoryType(MemoryType::Shared);
+  tv4bc->setMemoryType(MemoryType::Shared);
+
+  tv1->axis(-1)->parallelize(ParallelType::Vectorize);
+  tv3->axis(-1)->parallelize(ParallelType::Vectorize);
+  tv0->computeAt(tv4, 0);
+  tv0->computeAt(tv2, 2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::manual_seed(0);
+  auto t0 = at::randn({3, 4, 7, 2, 5}, options);
+  FusionExecutor fe;
+
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+  auto tref = t0.sum({1, 2, 3, 4});
+
+  testValidate(&fusion, cg_outputs, {t0}, {tref}, __LINE__, __FILE__);
+}
+
+// Repro of #1521
+TEST_F(NVFuserTest, FusionImmediateValueAsInput_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto immediate_scalr = IrBuilder::create<Double>(0.1);
+  // Adding an immediate scalar value as an input is not allowed
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fusion.addInput(immediate_scalr));
+
+  // Instead, use a symbolic value
+  auto symbolic_scalar = IrBuilder::create<Double>();
+  fusion.addInput(symbolic_scalar);
+
+  auto tv1 = add(tv0, symbolic_scalar);
+  fusion.addOutput(tv1);
+
+  // Make sure the kernel is compiled.
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+}
+
+// Repro of #1506
+TEST_F(NVFuserTest, FusionVectorizeContigIndex_CUDA) {
+  std::vector<int64_t> shape{14, 14};
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = set(tv1);
+  fusion.addOutput(tv2);
+
+  tv2->merge(0);
+
+  // Vectorize by 4 should be allowed
+  tv2->split(0, 4);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv0->computeAt(tv2, 1);
+
+  tv1->axis(1)->parallelize(ParallelType::Vectorize);
+  tv2->axis(1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  TORCH_CHECK(t0.equal(cg_outputs[0]));
+}
+
+// Make sure the same fusion as FusionVectorizeContigIndex fails if
+// not contig.
+TEST_F(NVFuserTest, FusionVectorizeContigIndexFail_CUDA) {
+  std::vector<int64_t> shape{14, 14};
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = set(tv1);
+  fusion.addOutput(tv2);
+
+  tv2->merge(0);
+
+  tv2->split(0, 4);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv0->computeAt(tv2, 1);
+
+  tv1->axis(1)->parallelize(ParallelType::Vectorize);
+  tv2->axis(1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+
+  // This should fail at the launch time as 14 is not divisible by the
+  // vector word size. The two domains are merged, but they are not
+  // contiguous, so contig indexing is not involved in this case.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion({t0}));
+}
+
+TEST_F(NVFuserTest, FusionVectorizeInputToOutput_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  fusion.addOutput(tv1);
+
+  tv1->split(0, 4);
+
+  tv1->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+
+  const int n = 12;
+  auto t0 = at::randn({n}, options);
+  // Shift by one to make it non-aligned
+  auto t0_misaligned = at::randn({n + 1}, options).index({Slice(1)});
+  auto t1_misaligned = at::empty({n + 1}, options).index({Slice(1)});
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+  TORCH_CHECK(t0.equal(cg_outputs[0]));
+
+  // Pass misaligned input. This must fail.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion({t0_misaligned}));
+
+  // Pass misaligned output. This must fail too.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion({t0}, {t1_misaligned}));
+}
+
+// Repro of issue #1530
+TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail_CUDA) {
+  std::vector<int64_t> shape{1, 2, 1};
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(shape.size());
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  fusion.addOutput(tv1);
+
+  tv1->merge(1);
+  tv1->merge(0);
+
+  auto invalid_vec_size = shape[0] * shape[1] * shape[2];
+  invalid_vec_size *= invalid_vec_size;
+
+  tv1->split(0, invalid_vec_size);
+
+  tv1->axis(1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion({t0}));
+}
+
+TEST_F(NVFuserTest, FusionContigIndexingWithBroadcast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({4});
+  fusion.addInput(tv0);
+  auto tv1 = makeConcreteTensor({3, 4});
+  fusion.addInput(tv1);
+
+  auto tv2 = broadcast(tv0, {true, false});
+  auto tv3 = add(tv2, tv1);
+  fusion.addOutput(tv3);
+
+  tv3->merge(0);
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+  tv2->setMemoryType(MemoryType::Local);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({4}, options);
+  auto t1 = at::randn({3, 4}, options);
+
+  auto t3 = t0.unsqueeze(0).add(t1);
+  {
+    FusionExecutor fe;
+    fe.compileFusion(&fusion, {t0, t1});
+    auto cg_outputs = fe.runFusion({t0, t1});
+
+    testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__);
+  }
+
+  // Make sure tv2 indexing also works when it's stored in global memory
+  tv2->setMemoryType(MemoryType::Global);
+  {
+    FusionExecutor fe;
+    fe.compileFusion(&fusion, {t0, t1});
+    auto cg_outputs = fe.runFusion({t0, t1});
+
+    testValidate(&fusion, cg_outputs, {t0, t1}, {t3}, __LINE__, __FILE__);
+  }
+}
+
+// Repro of #1534. Validation should detect invalid vectorization.
+TEST_F(NVFuserTest, FusionVectorizeContigIndexValidationFail2_CUDA) {
+  std::vector<int64_t> shape1{2, 3, 2};
+  std::vector<int64_t> shape2{2, 2};
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigConcreteTensor(shape1);
+  fusion.addInput(tv0);
+  auto tv1 = makeContigConcreteTensor(shape2);
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv1);
+  auto tv3 = broadcast(tv2, {false, true, false});
+  auto tv4 = add(tv0, tv3);
+  fusion.addOutput(tv4);
+
+  tv4->merge(1, 2);
+  tv4->merge(0, 1);
+  tv4->split(0, 4);
+  TransformPropagatorWithCheck propagator(tv4);
+  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
+
+  tv0->computeAt(tv4, -2);
+  tv1->computeAt(tv4, -2);
+
+  tv2->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape1, options);
+  auto t1 = at::randn(shape2, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+
+  // Vectorization of tv2 should be detected as invalid.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fe.runFusion({t0, t1}));
+}
+
+TEST_F(NVFuserTest, FusionVectorizeContigIndexWithBroadcast_CUDA) {
+  std::vector<int64_t> shape1{2, 2, 2};
+  std::vector<int64_t> shape2{1, 2, 2};
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // [I0, I1, I2]
+  auto tv0 = makeContigTensor(shape1.size());
+  fusion.addInput(tv0);
+
+  // [B3, I1, I2]
+  auto tv1 = makeContigConcreteTensor(shape2);
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv1);
+  auto tv3 = add(tv0, tv2);
+  fusion.addOutput(tv3);
+
+  tv3->merge(1, 2);
+  tv3->merge(0, 1);
+  tv3->split(0, 4);
+
+  // Don't modify tv1 so that it's replayed as tv2 with actual
+  // transformations. It would create temporary IterDomains, and the
+  // validation should still be able to detect vectorization by 4 is valid.
+  // TransformPropagatorWithCheck propagator(tv3);
+  // MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+
+  tv2->merge(1, 2);
+  tv2->merge(0, 1);
+  tv2->split(0, 4);
+
+  tv2->computeAt(tv3, -2);
+
+  tv2->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape1, options);
+  auto t1 = at::randn(shape2, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionVectorizeContigIndexPointwiseSchedule_CUDA) {
+  std::vector<int64_t> shape0{100, 14, 2, 14};
+  std::vector<int64_t> shape1{100, 2, 14};
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(shape0.size());
+  fusion.addInput(tv0);
+  auto tv1 = makeContigTensor(shape1.size());
+  fusion.addInput(tv1);
+
+  auto tv2 = broadcast(tv1, {false, true, false, false});
+  auto tv3 = add(tv0, tv2);
+  fusion.addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape0, options);
+  auto t1 = at::randn(shape1, options);
+
+  auto lparams = schedulePointwise(&fusion, {t0, t1});
+
+  GpuLower gpulw(&fusion);
+  auto kernel = gpulw.kernel();
+
+  // The innermost two dimensions are merged and contiguous, so
+  // vectorization can be done against 2*14=28 rather than 14, so
+  // vector word size should be 4. Broadcasting of tv1 should not
+  // matter.
+  for (const auto& vec_info : kernel->summary().vectorized_set_info) {
+    TORCH_CHECK(
+        vec_info.word_size == 4,
+        "Invalid vector word size: ",
+        vec_info.word_size);
+  }
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1}, lparams);
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1.unsqueeze(-3);
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// Repro of issue #1539.
+TEST_F(NVFuserTest, FusionTrivialReductionForwarding1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {true, false});
+  auto tv2 = sum(tv1, {0});
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv2->merge(0);
+  tv2->split(0, 4);
+
+  TransformPropagatorWithCheck propagator(tv2);
+  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
+
+  // All tensors must be transformed to a 2D tensor with each axis
+  // mapped with each other in the LOOP map.
+  ComputeAtMap ca_map(&fusion);
+  for (auto tv : ir_utils::allTvs(&fusion)) {
+    TORCH_CHECK(
+        tv->nDims() == 2, "Expected to be a 2D tensor but: ", tv->toString());
+    for (const auto i : c10::irange(2)) {
+      TORCH_CHECK(ca_map.areMapped(
+          tv->axis(i), tv3->axis(i), IdMappingMode::PERMISSIVE));
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionTrivialReductionForwarding2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {true, false});
+  auto tv2 = sum(tv1, {0});
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv3);
+
+  // Merging a trivial reduction with a non-reduction domain
+  tv2->merge(0, 1);
+  tv2->split(0, 4);
+
+  tv3->split(0, 4);
+
+  // tv2 and tv3 are different as tv3 lacks the trivial reduction, but
+  // they are mapped with each other by BestEffortReplay as the merge
+  // of trivial reduciton dim is forwarded.
+
+  PairwiseRootDomainMap root_map(tv2, tv3);
+
+  auto p2c = BestEffortReplay::replayCasP(tv3, tv2, 2, root_map).getReplay();
+  for (const auto i : c10::irange(tv2->nDims())) {
+    auto tv2_id = tv2->axis(i);
+    auto it = p2c.find(tv2_id);
+    TORCH_CHECK(
+        it != p2c.end(),
+        "Expected mapped consumer ID but not found: ",
+        tv2_id->toString());
+    auto tv3_mapped_id = it->second;
+    TORCH_CHECK(
+        tv3_mapped_id == tv3->axis(i),
+        "Unexpected mapped consumer ID: ",
+        tv3_mapped_id->toString());
+  }
+
+  auto c2p = BestEffortReplay::replayPasC(tv2, tv3, 2, root_map).getReplay();
+  for (const auto i : c10::irange(tv3->nDims())) {
+    auto tv3_id = tv3->axis(i);
+    auto it = c2p.find(tv3_id);
+    TORCH_CHECK(
+        it != c2p.end(),
+        "Expected mapped producer ID but not found: ",
+        tv3_id->toString());
+    auto tv2_mapped_id = it->second;
+    TORCH_CHECK(
+        tv2_mapped_id == tv2->axis(i),
+        "Unexpected mapped consumer ID: ",
+        tv2_mapped_id->toString());
+  }
+}
+
+TEST_F(NVFuserTest, FusionTrivialReductionForwarding3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = sum(tv0, {1});
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv2);
+
+  // Similar pattern as FusionTrivialReductionForwarding2 but trivial
+  // reduciton at non-root domain
+
+  // Create a trivial reduction by splitting with a factor of 1
+  tv1->split(1, 1, false);
+  // Merging with a trivial reduction
+  tv1->merge(0, 1);
+  auto tv1_merge_out_id = tv1->axis(0);
+  tv1->split(0, 5);
+
+  tv2->split(0, 5);
+
+  // The merge of tv1 is done with a non-root trivial
+  // reduciton. BestEffortReplay should forward the merge.
+
+  PairwiseRootDomainMap root_map(tv1, tv2);
+  auto p2c = BestEffortReplay::replayCasP(tv2, tv1, 2, root_map).getReplay();
+
+  // The two tensors should look like:
+  // tv1: [I1*1//5, 5, I2//1]
+  // tv2: [I1//5, 5]
+  //
+  // BestEffortRepaly should forward the merge of (I1 * 1) and create
+  // mappings of:
+  // I1*1//5 -> I1//5
+  // 5 -> 5
+  // I1*1 -> I1
+
+  TORCH_CHECK(p2c.size() == 3, "Unexpected number of mappings");
+  TORCH_CHECK(p2c.count(tv1->axis(0)) && p2c[tv1->axis(0)] == tv2->axis(0));
+  TORCH_CHECK(p2c.count(tv1->axis(1)) && p2c[tv1->axis(1)] == tv2->axis(1));
+  TORCH_CHECK(
+      p2c.count(tv1_merge_out_id) &&
+      p2c[tv1_merge_out_id] == tv2->getRootDomain()[0]);
+}
+
+TEST_F(NVFuserTest, FusionTrivialReductionForwarding4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+
+  auto tv2 = broadcast(tv0, {true, false});
+  auto tv3 = add(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  // tv4 has a trivial reduction axis
+  auto tv4 = sum(tv2, {0});
+  auto tv5 = add(tv4, IrBuilder::create<Double>(1));
+  fusion.addOutput(tv5);
+
+  tv3->merge(0, 1);
+  tv3->split(0, 32);
+
+  // This causes the trivial reduction of tv4 to be merged with
+  // another axis of tv4, and then forward computeAt is done from tv4
+  // to tv5. The split of the merged id of tv4 should be done on tv5
+  // by forwarding the merge of the trivial reduction.
+  tv0->computeAt(tv3, -1);
+
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({111}, options);
+  auto t1 = at::randn({123, 111}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto t2 = t0.unsqueeze(0);
+  auto t3 = t1 + t2;
+  auto t5 = sum(t2, {0}) + 1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {t3, t5}, __LINE__, __FILE__);
+}
+
+// See issue #1598
+TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv0);
+  auto tv3 = set(tv1);
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  // Place tv2 on shared memory
+  tv2->split(0, 2);
+  tv2->split(-1, 4);
+  tv2->setMemoryType(MemoryType::Shared);
+  tv2->axis(-2)->parallelize(ParallelType::TIDy);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv3->split(0, 2);
+  tv3->split(-1, 4);
+  // swap tidx and tidy
+  tv3->axis(-2)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDy);
+
+  tv4->split(0, 2);
+  tv4->split(-1, 4);
+  tv4->axis(-2)->parallelize(ParallelType::TIDx);
+  tv4->axis(-1)->parallelize(ParallelType::TIDy);
+
+  tv0->computeAt(tv4, 1);
+  tv3->computeAt(tv4, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({10, 64}, options);
+  auto t1 = at::randn({10, 64}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// See issue #1598
+TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv0);
+  auto tv3 = set(tv1);
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  tv2->split(0, 2);
+  tv2->split(-1, 4);
+  tv2->setMemoryType(MemoryType::Shared);
+
+  tv2->axis(-2)->parallelize(ParallelType::TIDy);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  tv4->split(0, 2);
+  tv4->split(-1, 4);
+  // Also do unroll for tv3 and tv4
+  tv4->split(-2, 8, false);
+  tv4->axis(-3)->parallelize(ParallelType::Unroll);
+  // swap tidx and tidy
+  tv4->axis(-2)->parallelize(ParallelType::TIDx);
+  tv4->axis(-1)->parallelize(ParallelType::TIDy);
+
+  tv0->computeAt(tv4, 1);
+  tv3->computeAt(tv4, -1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({10, 64}, options);
+  auto t1 = at::randn({10, 64}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// See issue #1599
+TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv0);
+  auto tv3 = set(tv1);
+  auto tv4 = add(tv2, tv3);
+  fusion.addOutput(tv4);
+
+  // Use unroll where a RAW-sync tensor is stored
+
+  tv4->split(0, 2);
+  tv4->split(0, 3);
+  tv4->split(-1, 4);
+  tv4->axis(1)->parallelize(ParallelType::Unroll);
+  tv4->axis(-2)->parallelize(ParallelType::TIDx);
+  tv4->axis(-1)->parallelize(ParallelType::TIDy);
+
+  tv0->computeAt(tv4, 3);
+  tv3->computeAt(tv4, -1);
+
+  tv2->split(-1, 4);
+  tv2->axis(-2)->parallelize(ParallelType::TIDy);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->setMemoryType(MemoryType::Shared);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({50, 64}, options);
+  auto t1 = at::randn({50, 64}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// See #1618
+TEST_F(NVFuserTest, FusionRAWSyncInsertionPlace4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({16, 128});
+  auto tv1 = makeConcreteTensor({16, 128});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv0);
+  auto tv3 = set(tv1);
+  auto tv4 = set(tv2);
+  auto tv5 = set(tv3);
+  auto tv6 = add(tv4, tv5);
+  fusion.addOutput(tv6);
+
+  tv2->setMemoryType(MemoryType::Shared);
+  tv3->setMemoryType(MemoryType::Shared);
+
+  tv2->computeAt(tv6, 0);
+  tv3->computeAt(tv6, 1);
+  tv4->computeAt(tv6, 1);
+  tv5->computeAt(tv6, -1);
+  tv2->split(1, 64);
+  tv3->split(1, 64);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv3->axis(-1)->parallelize(ParallelType::TIDx);
+  tv6->axis(-1)->parallelize(ParallelType::TIDx);
+
+  // Check the block sync is inserted at the correct location.
+  //  There is exactly one block sync needed in this test case
+  //    and the sync needs to be after the 2 expressions
+  //    that modify shared memory.
+  class SyncInsertionPointChecker : public kir::IrVisitor {
+   public:
+    using kir::IrVisitor::handle;
+
+   private:
+    void handle(UnaryOp* uop) final {
+      // Record number of unary ops that modifies shared memory.
+      if (uop->out()->isA<kir::TensorIndex>() &&
+          uop->out()->as<kir::TensorIndex>()->view()->getMemoryType() ==
+              MemoryType::Shared &&
+          // Filter out initialization expressions
+          uop->in()->isA<kir::TensorIndex>()) {
+        number_of_writes_++;
+      }
+    }
+    void handle(kir::BlockSync* bsync) final {
+      // Make sure both shared memory modifying expressions
+      //  have been observed at the sync insertion point.
+      TORCH_INTERNAL_ASSERT(
+          number_of_writes_ == 2,
+          "FusionRAWSyncInsertionPlace4 test fail:",
+          "only 1 sync after the 2 shared mem writes is needed in this test,"
+          "either a redundant sync has been inserted or the block sync is not inserted at the right place");
+    }
+
+   private:
+    int number_of_writes_ = 0;
+  } sync_insertion_checker;
+  GpuLower gpulw(&fusion);
+  sync_insertion_checker.handle(gpulw.kernel()->topLevelExprs());
+}
+
+// Test serial write and parallel read of shared mem: mapped case
+TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({128, 6});
+  TensorView* tv1 = makeConcreteTensor({128, 6});
+  TensorView* tv2 = makeConcreteTensor({128, 6});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addInput(tv2);
+
+  TensorView* tv3 = add(tv0, tv1);
+  TensorView* tv4 = add(tv3, tv2);
+
+  fusion.addOutput(tv4);
+
+  //  Use shared memory
+  tv3->setMemoryType(MemoryType::Shared);
+
+  // Parallelize t4, in this case dim 0 on tv3 will
+  //  not be parallelized but dim0 of t4 will be.
+  // We will need to make sure a sync is inserted
+  //  even if these dimensions are mapped.
+  tv4->axis(0)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({128, 6}, options);
+  at::Tensor t1 = at::randn({128, 6}, options);
+  at::Tensor t2 = at::randn({128, 6}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1, t2});
+  auto cg_outputs = fe.runFusion({t0, t1, t2});
+
+  auto ref = t0 + t1 + t2;
+
+  testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__);
+}
+
+// Test serial write and parallel read of shared mem: un-mapped case
+TEST_F(NVFuserTest, FusionSerialSmemWriteParallelRead2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({128, 6});
+  TensorView* tv1 = makeConcreteTensor({128, 6});
+  TensorView* tv2 = makeConcreteTensor({128, 6});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  fusion.addInput(tv2);
+
+  TensorView* tv3 = add(tv0, tv1);
+  TensorView* tv4 = add(tv3, tv2);
+
+  fusion.addOutput(tv4);
+
+  //  Use shared memory
+  tv3->setMemoryType(MemoryType::Shared);
+
+  // Split and parallelize t4,
+  //  the parallelized dimension in t4 will not
+  // map across to the shared mem tensor, t3. So
+  // there will need to be a sync before use of t3.
+  tv4->split(0, 2);
+  tv4->axis(0)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({128, 6}, options);
+  at::Tensor t1 = at::randn({128, 6}, options);
+  at::Tensor t2 = at::randn({128, 6}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1, t2});
+  auto cg_outputs = fe.runFusion({t0, t1, t2});
+
+  auto ref = t0 + t1 + t2;
+
+  testValidate(&fusion, cg_outputs, {t0, t1, t2}, {ref}, __LINE__, __FILE__);
+}
+
+// Simple test of async copy primitive
+TEST_F(NVFuserTest, FusionSimpleCpAsync_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int m = 33, n = 31;
+
+  TensorView* tv0 = makeConcreteTensor({m, n});
+  TensorView* tv1 = makeConcreteTensor({m, n});
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv0, tv1);
+
+  fusion.addOutput(tv2);
+
+  auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
+  tv0_shared->setMemoryType(MemoryType::Shared);
+
+  tv0->computeAt(tv2, 1);
+  tv0_shared->axis(1)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({m, n}, options);
+  at::Tensor t1 = at::randn({m, n}, options);
+
+  FusionExecutor fe;
+
+  // requires ampere+ GPU
+  if (!deviceMajorMinorCheck(8)) {
+    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
+    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
+  }
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// Simple test of async copy primitive: double buffered
+//   Double buffer case 1, both block sync and async wait
+//  are needed.
+TEST_F(NVFuserTest, FusionDoubleBufferCpAsync1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Using vectorization so need to keep n multiple of 4.
+  int m = 33, n = 48;
+
+  TensorView* tv0 = makeConcreteTensor({m, n});
+  TensorView* tv1 = makeConcreteTensor({m, n});
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv0, tv1);
+
+  fusion.addOutput(tv2);
+
+  auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
+  tv0_shared->setMemoryType(MemoryType::Shared);
+  tv0->computeAt(tv2, 1);
+
+  // Asynchronously load a tile in one schedule
+  tv0_shared->split(1, 4);
+  tv0_shared->axis(-1)->parallelize(ParallelType::Vectorize);
+  tv0_shared->axis(-2)->parallelize(ParallelType::TIDx);
+
+  // Consume the loaded tile in another schedule,
+  //   triggering the need for a sync.
+  tv2->split(1, 12);
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+
+  // Double buffer the shared mem tensor.
+  tv0_shared->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({m, n}, options);
+  at::Tensor t1 = at::randn({m, n}, options);
+
+  FusionExecutor fe;
+  // requires ampere+ GPU
+  if (!deviceMajorMinorCheck(8)) {
+    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
+    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
+  }
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// Simple test of async copy primitive: double buffered
+//   Double buffer case 2, only async wait is needed
+TEST_F(NVFuserTest, FusionDoubleBufferCpAsync2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Using vectorization so need to keep n multiple of 4.
+  int m = 33, n = 48;
+
+  TensorView* tv0 = makeConcreteTensor({m, n});
+  TensorView* tv1 = makeConcreteTensor({m, n});
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv0, tv1);
+
+  fusion.addOutput(tv2);
+
+  auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
+  tv0_shared->setMemoryType(MemoryType::Shared);
+  tv0->computeAt(tv2, 1);
+
+  // Asynchronously load a tile in one schedule
+  tv0_shared->split(1, 4);
+  tv0_shared->axis(-2)->parallelize(ParallelType::TIDx);
+
+  // Consume the loaded tile in another schedule,
+  //   triggering the need for a sync.
+  tv2->split(1, 4);
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+
+  // Double buffer the shared mem tensor.
+  tv0_shared->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({m, n}, options);
+  at::Tensor t1 = at::randn({m, n}, options);
+
+  FusionExecutor fe;
+  // requires ampere+ GPU
+  if (!deviceMajorMinorCheck(8)) {
+    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));
+    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
+  }
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// Simple test for double buffer in shared mem,
+//  where we should not insert redundant syncs when
+//  they are not needed.
+TEST_F(NVFuserTest, FusionDoubleBufferNoSync_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Using vectorization so need to keep n multiple of 4.
+  int m = 33, n = 48;
+
+  TensorView* tv0 = makeConcreteTensor({m, n});
+  TensorView* tv1 = makeConcreteTensor({m, n});
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  TensorView* tv2 = add(tv0, tv1);
+
+  fusion.addOutput(tv2);
+
+  auto tv0_shared = tv0->cacheAfter();
+  tv0_shared->setMemoryType(MemoryType::Shared);
+  tv0->computeAt(tv2, 1);
+
+  // Asynchronously load a tile in one schedule
+  tv0_shared->split(1, 4);
+  tv0_shared->axis(-2)->parallelize(ParallelType::TIDx);
+
+  // Consume the loaded tile in another schedule,
+  //   triggering the need for a sync.
+  tv2->split(1, 4);
+  tv2->axis(-2)->parallelize(ParallelType::TIDx);
+
+  // Double buffer the shared mem tensor.
+  tv0_shared->doubleBuffer();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({m, n}, options);
+  at::Tensor t1 = at::randn({m, n}, options);
+
+  GpuLower gpulw(&fusion);
+  auto flattened_exprs =
+      ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs());
+  bool sync_inserted = std::any_of(
+      flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) {
+        return expr->isA<kir::BlockSync>();
+      });
+  TORCH_INTERNAL_ASSERT(!sync_inserted, "Un-expected block sync inserted");
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// Test predicate inversion for cp.async
+TEST_F(NVFuserTest, FusionCpAsyncPredicate_CUDA) {
+  // requires ampere+ GPU
+
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Using vectorization so need to keep n multiple of 4.
+  int m = 33, n = 48;
+
+  TensorView* tv0 = makeConcreteTensor({m, n});
+
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1});
+  fusion.addOutput(tv1);
+
+  auto tv0_shared = tv0->cacheAfter(LoadStoreOpType::CpAsync);
+  auto tv0_reg = tv0_shared->cacheAfter();
+  tv0_shared->setMemoryType(MemoryType::Shared);
+  tv0->computeAt(tv1, 1);
+
+  tv0_shared->split(-1, 32);
+  tv0_shared->split(-1, 4);
+  tv0_shared->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({m, n}, options);
+
+  FusionExecutor fe;
+  if (!deviceMajorMinorCheck(8)) {
+    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0}));
+    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
+  }
+
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0.sum({1});
+
+  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Test predicate removal on reg-to-reg expressions
+TEST_F(NVFuserTest, FusionPredRemovalCheck_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeContigTensor(2);
+  fusion.addInput(tv0);
+
+  TensorView* tv1 = set(tv0);
+  TensorView* tv2 = set(tv1);
+  TensorView* tv3 = set(tv2);
+  TensorView* tv4 = set(tv3);
+
+  fusion.addOutput(tv4);
+  tv4->split(1, 4);
+  tv0->computeAt(tv4, -2);
+  tv3->axis(-1)->parallelize(ParallelType::Vectorize);
+
+  class PredicateRemovalChecker : public kir::IrVisitor {
+   public:
+    using kir::IrVisitor::handle;
+
+   private:
+    void handle(UnaryOp* uop) final {
+      assertOnLocalToLocal(uop);
+    }
+
+    // Utility to assert any local-to-local expr is only trivially predicated.
+    void assertOnLocalToLocal(Expr* expr) {
+      bool is_local = true;
+      for (auto in : ir_utils::filterByType<kir::TensorIndex>(expr->inputs())) {
+        if (in->view()->getMemoryType() != MemoryType::Local) {
+          is_local = false;
+        }
+      }
+      for (auto in :
+           ir_utils::filterByType<kir::TensorIndex>(expr->outputs())) {
+        if (in->view()->getMemoryType() != MemoryType::Local) {
+          is_local = false;
+        }
+      }
+
+      if (is_local) {
+        if (auto ite = dynamic_cast<kir::IfThenElse*>(scope_exprs_.back())) {
+          TORCH_INTERNAL_ASSERT(
+              ite->predicate()->value()->isConst(),
+              "redundant predicate on: ",
+              expr);
+        }
+      }
+    }
+
+   private:
+    bool within_ite_ = false;
+  } pred_checker;
+
+  GpuLower gpulw(&fusion);
+  pred_checker.handle(gpulw.kernel()->topLevelExprs());
+}
+
+TEST_F(NVFuserTest, FusionPropagateParallelTypesToSiblings_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tvs = Welford(tv0, {0});
+  auto tv_avg = tvs.avg;
+  fusion.addOutput(tv_avg);
+
+  tv_avg->split(0, 128);
+  TransformPropagatorWithCheck propagator(tv_avg);
+  MaxRootDomainInfoSpanningTree(tv_avg).traverse(&propagator);
+
+  tv_avg->axis(0)->parallelize(ParallelType::BIDx);
+  tv_avg->axis(1)->parallelize(ParallelType::TIDx);
+
+  // Make sure the parallelization of tv_avg is propagated to the var
+  // and count tensors.
+  GpuLower gpulw(&fusion);
+  for (const auto expr : gpulw.kernel()->exprs()) {
+    auto wop = dynamic_cast<WelfordOp*>(expr);
+    if (wop == nullptr) {
+      continue;
+    }
+    auto ref = wop->outAvg()->as<TensorView>();
+    for (auto sibling : ir_utils::filterByType<TensorView>(wop->outputs())) {
+      if (ref == sibling) {
+        continue;
+      }
+      TORCH_CHECK(
+          ref->nDims() == sibling->nDims(),
+          "Invalid sibling: ",
+          sibling->toString());
+      for (const auto i : c10::irange(ref->nDims())) {
+        TORCH_CHECK(
+            ref->axis(i)->getParallelType() ==
+                sibling->axis(i)->getParallelType(),
+            "Mismatched parallel types between siblings. ",
+            ref->toString(),
+            ", ",
+            sibling->toString());
+      }
+    }
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  at::Tensor t0 = at::randn({9999}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto outputs = fe.runFusion({t0});
+
+  testValidate(fe.kernel(), outputs, {t0}, {t0.mean({0})}, __LINE__, __FILE__);
+}
+
+// Test ExactRootDomainMap
+TEST_F(NVFuserTest, FusionExactRootDomainMap_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+
+  auto tv2 = broadcast(tv0, {false, true});
+  auto tv3 = transpose(tv2);
+  auto tv4 = add(tv2, tv1);
+  auto tv5 = add(tv2, tv3);
+  auto tv6 = add(tv3, tv1);
+  fusion.addOutput(tv4);
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv6);
+
+  const auto exact_map = ExactRootDomainMap(&fusion);
+
+  // In the exact mapping, the broadcast domain introduced at tv2 is
+  // only mapped with the another one in tv3, which is just transposed
+  // from tv2. Any other domain, including the second domain of tv4,
+  // must not be mapped.
+
+  auto tv2_bc = tv2->axis(1);
+  auto tv3_bc = tv3->axis(0);
+
+  TORCH_CHECK(
+      exact_map.areMapped(tv2_bc, tv3_bc),
+      "Invalid exact root domain map: ",
+      exact_map.toString());
+
+  // They must not be mapped with anything else.
+  for (auto tv : ir_utils::allTvs(&fusion)) {
+    for (auto root_id : tv->getRootDomain()) {
+      if (root_id == tv2_bc || root_id == tv3_bc) {
+        continue;
+      }
+      TORCH_CHECK(
+          !exact_map.areMapped(root_id, tv2_bc),
+          "Invalid exact root domain map: ",
+          exact_map.toString());
+      TORCH_CHECK(
+          !exact_map.areMapped(root_id, tv3_bc),
+          "Invalid exact root domain map: ",
+          exact_map.toString());
+    }
+  }
+}
+
+class NVFuserMultithreadedTest : public ::testing::Test {
+ protected:
+  bool was_enabled = false;
+
+  void SetUp() override {
+    was_enabled = fuser::cuda::setEnabled(true);
+  }
+
+  void TearDown() override {
+    fuser::cuda::setEnabled(was_enabled);
+  }
+};
+
+TEST_F(NVFuserMultithreadedTest, SingleFunction_CUDA) {
+  std::string ir = R"IR(
+graph(%x.1 : Tensor,
+      %y.1 : Tensor):
+  %12 : NoneType = prim::Constant()
+  %11 : bool = prim::Constant[value=0]()
+  %9 : int = prim::Constant[value=1]()
+  %3 : Tensor = aten::exp(%x.1)
+  %5 : Tensor = aten::relu(%y.1)
+  %6 : Tensor = aten::sin(%5)
+  %8 : Tensor = aten::add(%3, %6, %9)
+  %10 : int[] = prim::ListConstruct(%9)
+  %13 : Tensor = aten::sum(%8, %10, %11, %12)
+  return (%13)
+)IR";
+  auto g = std::make_shared<Graph>();
+  torch::jit::parseIR(ir, g.get());
+  GraphFunction fn("nvfuser_test", g, nullptr);
+
+  auto run_kernel = [&fn]() {
+    auto x = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
+    auto y = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
+    std::vector<IValue> results;
+    for (const auto& _ : c10::irange(10)) {
+      auto stack = createStack({x.clone(), y.clone()});
+      fn.run(stack);
+      results.push_back(stack.back());
+    }
+    for (const auto& i : c10::irange(1, 10)) {
+      auto t0 = results[0].toTensor();
+      auto ti = results[i].toTensor();
+      ASSERT_TRUE(at::allclose(t0, ti));
+    }
+  };
+
+  constexpr size_t kNumThreads = 4;
+  std::vector<std::thread> threads;
+  for (size_t id = 0; id < kNumThreads; ++id) {
+    threads.emplace_back(run_kernel);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
+TEST_F(NVFuserMultithreadedTest, MultipleFunctions_CUDA) {
+  auto run_kernel = []() {
+    const std::string ir = R"IR(
+  graph(%x.1 : Tensor,
+        %y.1 : Tensor):
+    %12 : NoneType = prim::Constant()
+    %11 : bool = prim::Constant[value=0]()
+    %9 : int = prim::Constant[value=1]()
+    %3 : Tensor = aten::exp(%x.1)
+    %5 : Tensor = aten::relu(%y.1)
+    %6 : Tensor = aten::sin(%5)
+    %8 : Tensor = aten::add(%3, %6, %9)
+    %10 : int[] = prim::ListConstruct(%9)
+    %13 : Tensor = aten::sum(%8, %10, %11, %12)
+    return (%13)
+  )IR";
+    auto g = std::make_shared<Graph>();
+    torch::jit::parseIR(ir, g.get());
+    GraphFunction fn("nvfuser_test", g, nullptr);
+
+    auto x = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
+    auto y = torch::rand({32, 32}, at::TensorOptions(at::kCUDA));
+    std::vector<IValue> results;
+    constexpr size_t numRuns = 10;
+    for (const auto& _ : c10::irange(numRuns)) {
+      auto stack = createStack({x.clone(), y.clone()});
+      fn.run(stack);
+      results.push_back(stack.back());
+    }
+    for (const auto& i : c10::irange(1, numRuns)) {
+      auto t0 = results[0].toTensor();
+      auto ti = results[i].toTensor();
+      ASSERT_TRUE(at::allclose(t0, ti));
+    }
+  };
+
+  constexpr size_t kNumThreads = 4;
+  std::vector<std::thread> threads;
+  for (size_t id = 0; id < kNumThreads; ++id) {
+    threads.emplace_back(run_kernel);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
+// Repro of issue #1655
+TEST_F(NVFuserTest, FusionIncompleteConcreteID_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(2);
+  fusion.addInput(tv1);
+  auto tv2 = makeSymbolicTensor(2);
+  fusion.addInput(tv2);
+
+  auto tv3 = broadcast(tv0, {true, true, false});
+  auto tv4 = broadcast(tv1, {false, true, false});
+  auto tv5 = broadcast(tv2, {true, false, false});
+
+  auto tv6 = add(tv3, tv4);
+  auto tv7 = add(tv3, tv5);
+
+  fusion.addOutput(tv6);
+  fusion.addOutput(tv7);
+
+  tv6->merge(0);
+  tv6->merge(0);
+
+  TransformPropagatorWithCheck propagator(tv6);
+  MaxRootDomainInfoSpanningTree(tv6).traverse(&propagator);
+
+  tv0->computeAt(tv6, -1, ComputeAtMode::MostInlined);
+  tv1->computeAt(tv6, -1, ComputeAtMode::MostInlined);
+  tv2->computeAt(tv7, -1, ComputeAtMode::MostInlined);
+
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(fusion.printKernel());
+}
+
+TEST_F(NVFuserTest, FusionTestReEntrantGridWelford_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int X = 256, Y = 7, Z = 2048;
+
+  // setup fusion
+  auto tv0 = makeContigTensor(4, DataType::Half);
+  fusion.addInput(tv0);
+  auto tv1 = castOp(DataType::Float, tv0);
+
+  auto tvs = Welford(tv1, {0, 1, 2});
+  auto tv_avg = tvs.avg;
+  auto tv_M2 = tvs.var_sum;
+  auto tv_N = tvs.n;
+  fusion.addOutput(tv_avg);
+  fusion.addOutput(tv_M2);
+
+  auto cached_input = tv0->cacheAfter();
+  auto cached_avg = tv_avg->cacheBefore();
+  auto cached_M2 = tv_M2->cacheBefore();
+
+  auto reduction_tv = scheduler_utils::getReductionTvs(&fusion)[0];
+
+  reduction_tv->merge(0);
+  reduction_tv->merge(0);
+
+  int TIDx = 16;
+  int vec = 4;
+
+  int TIDy = 16;
+  int outer_tidy_fact = 16;
+
+  reduction_tv->split(-1, TIDx * vec);
+  reduction_tv->split(-1, vec);
+  reduction_tv->axis(-2)->parallelize(ParallelType::TIDx);
+  reduction_tv->axis(-1)->parallelize(ParallelType::Vectorize);
+  reduction_tv->axis(-3)->parallelize(ParallelType::BIDx);
+
+  reduction_tv->split(0, TIDy);
+  reduction_tv->axis(1)->parallelize(ParallelType::TIDy);
+  reduction_tv->split(0, outer_tidy_fact);
+  reduction_tv->axis(0)->parallelize(ParallelType::BIDy);
+
+  // T2_g[ rblockIdx.y, rS{16}, rthreadIdx.y, iblockIdx.x, ithreadIdx.x24,
+  // iV25{4} ]
+  reduction_tv->reorder({{3, 0}, {4, 1}, {0, 2}, {2, 3}, {1, 4}, {5, 5}});
+  // T2_g[iblockIdx.x, ithreadIdx.x24, rblockIdx.y, rthreadIdx.y, rS{16},
+  // iV25{4}]
+
+  TransformPropagatorWithCheck propagator(reduction_tv);
+  MaxRootDomainInfoSpanningTree(reduction_tv).traverse(&propagator);
+  auto rfactor_tv = ir_utils::rfactorHelper(reduction_tv, {4});
+  scheduler_utils::parallelizeAllLike(rfactor_tv);
+
+  tv0->computeAt(tv_avg, 2);
+  tv0->computeAt(cached_input, -2);
+
+  cached_input->computeAt(rfactor_tv, 4, ComputeAtMode::BestEffort);
+
+  for (auto tv : ir_utils::allTvs(&fusion)) {
+    if (tv == cached_input || tv == tv_avg || tv == tv_M2) {
+      continue;
+    }
+    tv->axis(-1)->parallelize(ParallelType::Serial);
+  }
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {}, LaunchParams());
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({X, Y, Y, Z}, options);
+
+  auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, -1, -1, -1, -1, -1));
+
+  // by default Welford outputs sum of square diff so need to divide to get var
+  cg_outputs[1] = cg_outputs[1].div((float)(X * Y * Y));
+
+  auto at_mu = at::mean(t0.to(at::kDouble), {0, 1, 2});
+  auto at_var = at::var(t0.to(at::kDouble), {0, 1, 2}, false);
+
+  testValidate(
+      &fusion,
+      cg_outputs,
+      {t0},
+      {at_mu, at_var},
+      __LINE__,
+      __FILE__,
+      "",
+      LaunchParams(-1, -1, -1, -1, -1, -1));
+}
+
+// Test sync insertion with redundant predicates
+TEST_F(NVFuserTest, FusionRedundantPredSync_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({32});
+  TensorView* tv1 = makeConcreteTensor({32, 32});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = broadcast(tv0, {true, false});
+  auto tv3 = add(tv2, tv1);
+
+  fusion.addOutput(tv3);
+
+  auto tv0c = tv0->cacheAfter();
+
+  // Make a redundant write through smem
+  tv0c->setMemoryType(MemoryType::Shared);
+
+  tv0->computeAt(tv3, 0);
+  tv1->computeAt(tv3, 0);
+
+  tv0c->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::TIDy);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(0)->parallelize(ParallelType::TIDy);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+
+  GpuLower gpulw(&fusion);
+  auto flattened_exprs =
+      ir_utils::flattenScopedExprs(gpulw.kernel()->topLevelExprs());
+  bool sync_inserted = std::any_of(
+      flattened_exprs.begin(), flattened_exprs.end(), [](Expr* expr) {
+        return expr->isA<kir::BlockSync>();
+      });
+  TORCH_INTERNAL_ASSERT(sync_inserted, "Expected block sync not inserted");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({32}, options);
+  at::Tensor t1 = at::randn({32, 32}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// Test case for removing syncs on chain of redundant uses.
+TEST_F(NVFuserTest, FusionRedundantPredSync2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({32});
+  TensorView* tv1 = makeConcreteTensor({32, 32});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = broadcast(tv0, {true, false});
+  auto tv3 = add(tv2, tv1);
+
+  fusion.addOutput(tv3);
+
+  auto tv0c = tv0->cacheAfter();
+
+  // Make a redundant write through smem
+  tv0c->setMemoryType(MemoryType::Shared);
+  tv2->setMemoryType(MemoryType::Shared);
+
+  tv0->computeAt(tv3, 0);
+  tv1->computeAt(tv3, 0);
+
+  tv0c->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::TIDy);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(0)->parallelize(ParallelType::TIDy);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+
+  // Utility class to make sure one block sync
+  //  is inserted by RAW pass.
+  class SyncChecker : public kir::IrVisitor {
+   public:
+    using kir::IrVisitor::handle;
+    int result() {
+      return sync_seen_;
+    }
+
+   private:
+    void handle(kir::BlockSync*) final {
+      sync_seen_++;
+    }
+
+   private:
+    int sync_seen_ = 0;
+  } checker;
+
+  GpuLower gpulw(&fusion);
+  checker.handle(gpulw.kernel()->topLevelExprs());
+  TORCH_INTERNAL_ASSERT(
+      checker.result() < 2, "More syncs were inserted than expected");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({32}, options);
+  at::Tensor t1 = at::randn({32, 32}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
+// Test case for sync insertion after redundant predicated smem write
+//  Check that syncs are removed only when all paths are redundant.
+TEST_F(NVFuserTest, FusionRedundantPredSync3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({32});
+  TensorView* tv1 = makeConcreteTensor({32, 32});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = broadcast(tv0, {true, false});
+  auto tv3 = set(tv2);
+  auto tv4 = add(tv3, tv1);
+  auto tv5 = add(tv2, tv1);
+
+  fusion.addOutput(tv4);
+  fusion.addOutput(tv5);
+
+  auto tv0c = tv0->cacheAfter();
+
+  // In this scheduling config,
+  //  tv0c -> tv2 -> tv3 is a redundant path for tidy
+  //  tv0c -> tv2 -> tv5 is not.
+  //  So we need a RAW sync in tv0c->tv2 to make sure
+  //  tv2 has the correct value to produce tv5.
+  tv0c->setMemoryType(MemoryType::Shared);
+  tv3->setMemoryType(MemoryType::Shared);
+
+  tv0c->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::TIDy);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv3->axis(0)->parallelize(ParallelType::TIDy);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+
+  tv5->axis(0)->parallelize(ParallelType::TIDy);
+  tv5->axis(1)->parallelize(ParallelType::TIDx);
+
+  // Utility class to make sure one block sync
+  //  is inserted by RAW pass.
+  class SyncChecker : public kir::IrVisitor {
+   public:
+    using kir::IrVisitor::handle;
+    int result() {
+      return sync_seen_;
+    }
+
+   private:
+    void handle(kir::BlockSync* sync) final {
+      if (!sync->isWarHazardSync()) {
+        sync_seen_++;
+      }
+    }
+
+   private:
+    int sync_seen_ = 0;
+  } checker;
+
+  GpuLower gpulw(&fusion);
+  checker.handle(gpulw.kernel()->topLevelExprs());
+
+  // This is implicit checking. There are exactly 2 places
+  //  where RAW hazards happen: one producing tv2 and the other
+  //  producing tv3. This test case expect syncs in both of
+  //  these places so we check that 2 RAW syncs are inserted.
+  TORCH_INTERNAL_ASSERT(
+      checker.result() == 2,
+      "Exactly 2 RAW sync expected for the two shared memory transfers");
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({32}, options);
+  at::Tensor t1 = at::randn({32, 32}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+
+  auto ref = t0 + t1;
+
+  testValidate(&fusion, cg_outputs, {t0, t1}, {ref, ref}, __LINE__, __FILE__);
+}
+
+// Unit test case for detecting thread redundant usage of shared tensors.
+TEST_F(NVFuserTest, FusionRedundantUseCheck_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeConcreteTensor({32, 32});
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+  auto tv2 = set(tv1);
+  auto tv3 = set(tv2);
+  auto tv4 = set(tv3);
+
+  auto tv5 = set(tv4);
+
+  auto tv6 = set(tv4);
+  auto tv7 = set(tv6);
+
+  fusion.addOutput(tv5);
+  fusion.addOutput(tv7);
+
+  tv2->setMemoryType(MemoryType::Shared);
+  tv4->setMemoryType(MemoryType::Shared);
+
+  tv7->axis(-1)->parallelize(ParallelType::TIDx);
+
+  // Thread pred map cannot be built without an active lower
+  //  object. So would need to lower the whole fusion for
+  //  testing. However, lower also keeps an copy of the fusion
+  //  so the original pointers cannot be used to querry the
+  //  thread pred map. So have to traverse the new expr list
+  //  to find the pointers;
+  GpuLower gpulw(&fusion);
+
+  TensorView *lowered_tv2 = nullptr, *lowered_tv4 = nullptr;
+  auto used_vals = gpulw.kernel()->usedMathVals();
+
+  for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
+    if (tv->name() == 2) {
+      lowered_tv2 = tv;
+    }
+    if (tv->name() == 4) {
+      lowered_tv4 = tv;
+    }
+  }
+
+  TORCH_INTERNAL_ASSERT(
+      lowered_tv2 != nullptr && lowered_tv4 != nullptr,
+      "tv2 or tv4 not lowered or mangled");
+
+  auto tv2_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv2);
+  auto tv4_info = gpulw.threadPredMap().getPredicateInfo(lowered_tv4);
+
+  // tv2 -> tv3 -> tv4 (shared) is the only use chain for tv2,
+  //  and tv4 is redundantly written in tidx so tv2 is redundantly
+  //  consumed in tidx.
+  TORCH_INTERNAL_ASSERT(
+      tv2_info.redundant_use_types.get(ParallelType::TIDx),
+      "TV2 is redundantly used but not detected.");
+
+  // tv4->tv5 (global) is a redundant use chain, but
+  // tv4->tv6->tv7 is not, so tv4 should not be detected as
+  // a redundant used tensor in tidx.
+  TORCH_INTERNAL_ASSERT(
+      !tv4_info.redundant_use_types.get(ParallelType::TIDx),
+      "TV4 is not redundantly used but not detected.");
+}
+
+// Test a basic swizzle pattern
+TEST_F(NVFuserTest, FusionSimpleSwizzle0_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 32});
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv2);
+
+  // Make a 2x8 Zshape tile
+  tv1->split(-1, 16);
+  tv1->split(-1, 8);
+  // [O, 2, 8]
+
+  tv2->split(-1, 16);
+  tv2->split(-1, 4);
+  //[O, 4, 4]
+
+  tv1->computeAt(tv2, 1);
+  tv1->swizzle(Swizzle2DType::ZShape, -2, -1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({2, 32}, options);
+  auto t2 = t0 + 2.0;
+  auto cg_outputs = fe.runFusion({t0});
+
+  testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
+}
+
+// Test swizzle inlining
+TEST_F(NVFuserTest, FusionSimpleSwizzle1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 32});
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv3);
+
+  // Make a 2x8 Zshape tile
+  tv2->split(-1, 16);
+  tv2->split(-1, 8);
+  // [O, 2, 8]
+
+  tv3->split(-1, 16);
+  tv3->split(-1, 4);
+  //[O, 4, 4]
+
+  tv2->computeAt(tv3, 1);
+  tv2->swizzle(Swizzle2DType::ZShape, -2, -1);
+
+  // Inlining a producer into a swizzled consumer is ok
+  tv1->computeAt(tv2, -1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({2, 32}, options);
+  auto t3 = t0 + 3.0;
+  auto cg_outputs = fe.runFusion({t0});
+
+  testValidate(&fusion, cg_outputs, {t0}, {t3}, __LINE__, __FILE__);
+}
+
+// Test sync insertion and memory check in parallelized swizzles.
+//  In this test, data is parallel written into smem in zcurve
+//   pattern and then read out and output to global mem unswizzled.
+TEST_F(NVFuserTest, FusionSimpleSwizzle2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({32, 32});
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv2);
+
+  tv1->swizzle(Swizzle2DType::ZShape, -2, -1);
+
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  tv1->axis(1)->parallelize(ParallelType::TIDy);
+
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDy);
+
+  // Validation should fail since TV1 is not in shared
+  //  memory as required by sync info pass.
+  ASSERT_ANY_THROW(GpuLower gpulw_throw(&fusion));
+
+  tv1->setMemoryType(MemoryType::Shared);
+
+  // Make sure that a sync is inserted:
+  bool sync_found = false;
+  GpuLower gpu_lw(&fusion);
+  auto flattened_exps =
+      ir_utils::flattenScopedExprs(gpu_lw.kernel()->topLevelExprs());
+
+  for (auto expr : flattened_exps) {
+    if (expr->isA<kir::BlockSync>()) {
+      sync_found = true;
+    }
+    // Will require a sync thread before any shared memory read.
+    for (auto inp_tv : ir_utils::filterByType<TensorView>(expr->inputs())) {
+      if (inp_tv->getMemoryType() == MemoryType::Shared) {
+        TORCH_INTERNAL_ASSERT(
+            sync_found, "Block sync required but not inserted");
+      }
+    }
+  }
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({32, 32}, options);
+  auto t2 = t0 + 2.0;
+  auto cg_outputs = fe.runFusion({t0});
+
+  testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
+}
+
+// Test BestEffortReplay behavior with swizzle op
+TEST_F(NVFuserTest, FusionSwizzleMapping_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 32});
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv3);
+
+  // Make a 2x8 Zshape tile
+  tv2->split(-1, 16);
+  tv2->split(-1, 8);
+  // [O, 2, 8]
+
+  tv3->split(-1, 16);
+  tv3->split(-1, 4);
+  //[O, 4, 4]
+
+  tv2->computeAt(tv3, 1);
+  tv2->swizzle(Swizzle2DType::ZShape, -2, -1);
+
+  // Inlining a producer into a swizzled consumer is ok
+  tv1->computeAt(tv2, -1);
+
+  // Check BestEffortReplay behavior with skip swizzles option on.
+  PairwiseRootDomainMap root_map(tv1, tv2);
+
+  // Check producer to consumer map,
+  //  i.e. unswizzled tensor to swizzled tensor map
+  //----------------------------------------------------------
+  auto p2c = BestEffortReplay::replayCasP(tv2, tv1, -1, root_map).getReplay();
+  auto swizzle_x_it0 = p2c.find(tv1->axis(-2));
+  auto swizzle_y_it0 = p2c.find(tv1->axis(-1));
+  // P2C map should exist and both the x and y map should
+  //  map to the output of the swizzle op.
+  TORCH_INTERNAL_ASSERT(
+      swizzle_x_it0 != p2c.end() && swizzle_y_it0 != p2c.end());
+  TORCH_INTERNAL_ASSERT(
+      swizzle_x_it0->second == tv2->axis(-2) &&
+      swizzle_y_it0->second == tv2->axis(-1));
+
+  // Check consumer to producer map,
+  //  i.e. swizzled tensor to unswizzled tensor map
+  //----------------------------------------------------------
+  auto c2p = BestEffortReplay::replayPasC(tv1, tv2, -1, root_map).getReplay();
+
+  auto swizzle_op = tv2->axis(-1)->definition()->as<Swizzle2D>();
+
+  // Find mapping for swizzle inputs
+  auto swizzle_x_it1 = c2p.find(swizzle_op->inX());
+  auto swizzle_y_it1 = c2p.find(swizzle_op->inY());
+
+  // Find mapping for swizzle outputs
+  auto swizzle_x_it2 = c2p.find(swizzle_op->outX());
+  auto swizzle_y_it2 = c2p.find(swizzle_op->outY());
+
+  // Input of swizzle ops will not be mapped to any
+  //  by BestEffortReplay, as BestEffortReplay has to be
+  //  one to one. IdGraph will further map them together.
+  TORCH_INTERNAL_ASSERT(
+      swizzle_x_it1 == c2p.end() && swizzle_y_it1 == c2p.end());
+
+  // Mapping for swizzle outputs should be mapped and should
+  //  also map to the corresponding axes on the unswizzled tensor.
+  TORCH_INTERNAL_ASSERT(
+      swizzle_x_it2 != c2p.end() && swizzle_y_it2 != c2p.end());
+  TORCH_INTERNAL_ASSERT(
+      swizzle_x_it2->second == tv1->axis(-2) &&
+      swizzle_y_it2->second == tv1->axis(-1));
+
+  // Check id graph behavior
+  //----------------------------------------------------------
+  ComputeAtMap ca_map(&fusion);
+  // Corresponding inputs and outputs of swizzle ops are
+  //  map through by exact and permissive map.
+  TORCH_INTERNAL_ASSERT(
+      ca_map.areMapped(tv1->axis(-2), swizzle_op->inX(), IdMappingMode::EXACT));
+  TORCH_INTERNAL_ASSERT(
+      ca_map.areMapped(tv1->axis(-1), swizzle_op->inY(), IdMappingMode::EXACT));
+  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
+      tv1->axis(-2), swizzle_op->outX(), IdMappingMode::EXACT));
+  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
+      tv1->axis(-1), swizzle_op->outY(), IdMappingMode::EXACT));
+
+  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
+      tv1->axis(-2), swizzle_op->inX(), IdMappingMode::PERMISSIVE));
+  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
+      tv1->axis(-1), swizzle_op->inY(), IdMappingMode::PERMISSIVE));
+  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
+      tv1->axis(-2), swizzle_op->outX(), IdMappingMode::PERMISSIVE));
+  TORCH_INTERNAL_ASSERT(ca_map.areMapped(
+      tv1->axis(-1), swizzle_op->outY(), IdMappingMode::PERMISSIVE));
+}
+
+// Test a basic loop swizzle pattern
+TEST_F(NVFuserTest, FusionLoopSwizzle0_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 32});
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv2);
+
+  tv2->split(-1, 16);
+  tv2->split(-1, 4);
+  //[O, 4, 4]
+
+  tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);
+
+  tv0->computeAt(tv2, -1);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({2, 32}, options);
+  auto t2 = t0 + 2.0;
+  auto cg_outputs = fe.runFusion({t0});
+
+  testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
+}
+
+// Outer block zshape pattern
+TEST_F(NVFuserTest, FusionLoopSwizzle1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv2);
+
+  tv2->split(-2, 8);
+  tv2->split(-1, 4);
+  //[I0o, I0i, I1o, I1i]
+  tv2->reorder({{1, 2}, {2, 1}});
+  //[I0o, I1o, I0i, I1i]
+
+  tv2->swizzle(Swizzle2DType::ZShape, 0, 1, SwizzleMode::Loop);
+  tv0->computeAt(tv2, -1);
+
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(1)->parallelize(ParallelType::BIDy);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({45, 77}, options);
+  auto t2 = t0 + 2.0;
+  auto cg_outputs = fe.runFusion({t0});
+
+  testValidate(&fusion, cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
+}
+
+// Test assertion in unsupported pattern: non-leaf loop swizzle.
+TEST_F(NVFuserTest, FusionLoopSwizzleCheck0_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 32});
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv2);
+
+  tv2->split(-1, 16);
+  tv2->split(-1, 4);
+  //[O, 4, 4]
+
+  // Swizzle the inner tile.
+  tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);
+
+  // Make swizzle output not a leaf domain.
+  tv2->merge(-2);
+
+  tv0->computeAt(tv2, -1);
+
+  FusionExecutor fe;
+  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
+}
+
+// Test assertion in unsupported pattern: half-inlined loop swizzle.
+TEST_F(NVFuserTest, FusionLoopSwizzleCheck1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 32});
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv3);
+
+  //[O, 4, 4]
+  tv2->split(-1, 16);
+  tv2->split(-1, 4);
+
+  //[O, 4, 4]
+  tv3->split(-1, 16);
+  tv3->split(-1, 4);
+
+  // Swizzle inner tile of tv2
+  tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);
+
+  // Make tv2 swizzled and partially-inlined (unsupported).
+  tv0->computeAt(tv3, -2);
+
+  FusionExecutor fe;
+  ASSERT_ANY_THROW(fe.compileFusion(&fusion));
+}
+
+TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> shape({10, 11});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  // [I, R]
+  auto tv1 = sum(tv0, {1});
+  // [I, B]
+  auto tv2 = unsqueeze(tv1, -1);
+  fusion.addOutput(tv2);
+
+  TORCH_CHECK(
+      tv2->nDims() == 2, "Unpected unsqueeze result: ", tv2->toString());
+  TORCH_CHECK(
+      tv2->axis(1)->isBroadcast(),
+      "Unexpected unsqueeze result: ",
+      tv2->toString());
+
+  // tv1 has only one non-reduction axis. An exception should be
+  // thrown.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(unsqueeze(tv1, 2));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10, 11}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto ref = t0.sum(1).unsqueeze(-1);
+
+  testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSqueeze1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> shape({10, 11});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  // [I, B]
+  auto tv1 = sum(tv0, {1}, true);
+  // [I]
+  auto tv2 = squeeze(tv1, {shape[0], 1});
+  fusion.addOutput(tv2);
+
+  TORCH_CHECK(
+      tv2->nDims() == 2, "Unexpected squeeze result: ", tv2->toString());
+
+  // [I, R]
+  auto tv3 = sum(tv0, {1});
+  // tv3 has only one non-reduction axis. The extent of the first axis
+  // is not one, so squeeze should fail.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
+  ASSERT_ANY_THROW(squeeze(tv3, {shape[0], 1}));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10, 11}, options);
+  std::vector<IValue> aten_inputs = {t0};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs);
+  auto cg_outputs = fe.runFusion(aten_inputs);
+
+  auto ref = t0.sum(1, true).squeeze(-1);
+
+  testValidate(&fusion, cg_outputs, aten_inputs, {ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionContigPredicate_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = broadcast(tv1, {false, true, false});
+  fusion.addOutput(tv2);
+
+  tv2->merge(-2, -1);
+  tv2->merge(-2, -1);
+  tv2->split(-1, 100);
+  tv0->computeAt(tv2, -1);
+
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(PredicatedChecker::isPredicated(tv1, gpulw));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({3, 4}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  auto ref = t0.unsqueeze(1);
+
+  testValidate(fe.kernel(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Repro of https://github.com/csarofeen/pytorch/issues/1777
+TEST_F(NVFuserTest, FusionDivScalarLhs_CUDA) {
+  // tv1 = 2.0 / tv0
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+  TensorView* tv1 = div(IrBuilder::create<Double>(2.0), tv0);
+  fusion.addOutput(tv1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({3, 3}, options);
+  // There's no overload div(Scalar, Tensor) in ATen
+  auto aten_output = at::div(
+      at::native::wrapped_scalar_tensor(at::Scalar(2.0), options.device()), t0);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+
+  testValidate(&fusion, cg_outputs, {t0}, {aten_output}, __LINE__, __FILE__);
+}
+
+// Repro of an issue of the reduction scheduler with a broadcast
+// domain concretized to multiple domains that are not proven to have
+// the same extent
+TEST_F(NVFuserTest, FusionRepro1713_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(2);
+  auto tv1 = makeSymbolicTensor(2);
+  auto tv2 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  fusion->addInput(tv2);
+  auto tv3 = broadcast(tv2, {false, true});
+
+  auto tv4 = add(tv3, tv0);
+
+  auto tv5 = add(tv3, tv1);
+  auto tv6 = sum(tv5, {0});
+  fusion->addOutput(tv4);
+  fusion->addOutput(tv6);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({1024, 204800}, options);
+  // Original repro had the same shape as t0, but this should work
+  // with a different extent at the second axis
+  at::Tensor t1 = at::randn({1024, 123}, options);
+  at::Tensor t2 = at::randn({1024}, options);
+  std::vector<IValue> aten_inputs({t0, t1, t2});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  auto t3 = t2.unsqueeze(-1);
+  auto t4 = t3 + t0;
+  auto t5 = t3 + t1;
+  auto t6 = sum(t5, {0});
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      {t0, t1, t2},
+      {t4, t6},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionExpand_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto w = 2, x = 3, y = 4, z = 5;
+
+  // Test
+  // a simple expand
+  // Expand that's propagated
+  // expand_as
+  // symbolic expand
+
+  // x
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {false, true});
+  auto tv2 = expand(tv1, {tv0->axis(0)->extent(), IrBuilder::create<Int>(y)});
+
+  // x
+  auto tv3 = makeSymbolicTensor(1);
+  fusion->addInput(tv3);
+  auto tv4 = broadcast(tv3, {false, true});
+  auto tv5 = add(tv4, tv2);
+  // [x, e_y]
+
+  // [x, y, z]
+  auto tv6 = makeSymbolicTensor(3);
+  fusion->addInput(tv6);
+
+  // Disjoint set op will cause a segmentation for just this op.
+  auto tmp_7 = set(tv6);
+  fusion->addOutput(tmp_7);
+
+  auto tv7 = broadcast(tv5, {false, false, true});
+
+  auto tv8 = expand_as(tv7, tv6);
+  // [x, e_y, e_z]
+
+  auto w_symbolic = IrBuilder::create<Int>();
+  fusion->addInput(w_symbolic);
+
+  auto tv9 = broadcast(tv8, {true, false, false, false});
+  //[1, x, e_y, e_z]
+
+  auto tv10 = expand(
+      tv9,
+      {w_symbolic,
+       tv9->axis(1)->extent(),
+       tv9->axis(2)->expandedExtent(),
+       tv9->axis(3)->expandedExtent()});
+
+  fusion->addOutput(tv10);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({x}, options);
+  at::Tensor t3 = at::randn({x}, options);
+  at::Tensor t6 = at::randn({x, y, z}, options);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t6, w});
+  auto cg_out = cg_outputs[1];
+
+  TORCH_INTERNAL_ASSERT(cg_out.size(0) == w);
+  TORCH_INTERNAL_ASSERT(cg_out.size(1) == x);
+  TORCH_INTERNAL_ASSERT(cg_out.size(2) == y);
+  TORCH_INTERNAL_ASSERT(cg_out.size(3) == z);
+  TORCH_INTERNAL_ASSERT(cg_out.stride(0) == 0);
+  TORCH_INTERNAL_ASSERT(cg_out.stride(1) == 1);
+  TORCH_INTERNAL_ASSERT(cg_out.stride(2) == 0);
+  TORCH_INTERNAL_ASSERT(cg_out.stride(3) == 0);
+
+  auto t10 = t0.unsqueeze(-1)
+                 .expand({x, y})
+                 .add(t3.unsqueeze(-1))
+                 .unsqueeze(-1)
+                 .expand_as(t6)
+                 .unsqueeze(0)
+                 .expand({w, x, y, z});
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      {t0, t3, t6, w},
+      {t6, t10},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionExpandIssue1751_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto x = 3, y = 4, z = 5;
+
+  // y, z
+  auto tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {true, false, false});
+
+  // Two ways to propagate extents as is: use -1 or explicitly pass
+  // the extent vals.
+
+  auto tv2 = expand(
+      tv1,
+      {IrBuilder::create<Int>(x),
+       IrBuilder::create<Int>(-1),
+       IrBuilder::create<Int>(-1)});
+
+  auto tv3 = expand(
+      tv1,
+      {IrBuilder::create<Int>(x),
+       tv0->axis(0)->extent(),
+       tv0->axis(1)->extent()});
+
+  fusion->addOutput(tv2);
+  fusion->addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({y, z}, options);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
+
+  for (const auto& cg_out : cg_outputs) {
+    TORCH_INTERNAL_ASSERT(cg_out.size(0) == x);
+    TORCH_INTERNAL_ASSERT(cg_out.size(1) == y);
+    TORCH_INTERNAL_ASSERT(cg_out.size(2) == z);
+  }
+
+  auto t2 = t0.expand({x, y, z});
+
+  testValidate(
+      executor_cache.fusion(), cg_outputs, {t0}, {t2, t2}, __LINE__, __FILE__);
+}
+
+// TODO: Make sure the kernel uses the expanded concrete size instead
+// of the symbolic size
+TEST_F(NVFuserTest, FusionExpandToConcrete_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto x = 3, y = 4;
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+
+  auto tv1 = broadcast(tv0, {true, false});
+
+  auto tv2 =
+      expand(tv1, {IrBuilder::create<Int>(x), IrBuilder::create<Int>(y)});
+
+  fusion->addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({y}, options);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
+
+  for (const auto& cg_out : cg_outputs) {
+    TORCH_INTERNAL_ASSERT(cg_out.size(0) == x);
+    TORCH_INTERNAL_ASSERT(cg_out.size(1) == y);
+  }
+
+  auto t2 = t0.expand({x, y});
+
+  testValidate(
+      executor_cache.fusion(), cg_outputs, {t0}, {t2}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionReproNoncontigBroadcast_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({4, 32, 16, 112, 112}, options).transpose(-1, -2);
+  at::Tensor t1 = at::randn({32, 1, 112, 1}, options).transpose(-1, -2);
+
+  auto tv0 = TensorViewBuilder()
+                 .ndims(5)
+                 .contiguity({true, true, false, false, false}) // ttfff
+                 .shape({-1, -1, -1, -1, -1})
+                 .dtype(DataType::Half)
+                 .build();
+  auto tv1 = TensorViewBuilder()
+                 .ndims(4)
+                 .contiguity({true, false, false, true}) // tfft
+                 .shape({-1, 1, 1, -1})
+                 .dtype(DataType::Half)
+                 .build();
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+
+  fusion->addOutput(tv2);
+
+  std::vector<IValue> aten_inputs({t0, t1});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  auto t2 = t0 + t1;
+
+  testValidate(
+      executor_cache.fusion(), cg_outputs, {t0, t1}, {t2}, __LINE__, __FILE__);
+}
+
+namespace {
+
+// check that the resulting sibling are identical
+void checkSiblingConsistency(TensorView* replay, TensorView* target) {
+  auto replay_root = replay->getRootDomain();
+  auto replay_dom = replay->domain()->domain();
+  auto target_root = target->getRootDomain();
+  auto target_dom = target->domain()->domain();
+  std::unordered_map<IterDomain*, IterDomain*> target2replay_map;
+  TORCH_CHECK(replay_root.size() == target_root.size());
+  target2replay_map.reserve(replay_root.size());
+  std::transform(
+      target_root.begin(),
+      target_root.end(),
+      replay_root.begin(),
+      std::inserter(target2replay_map, target2replay_map.begin()),
+      [](auto a, auto b) { return std::make_pair(a, b); });
+  BestEffortReplay replay_(replay_dom, target_dom, target2replay_map);
+  auto r = replay_.getReplay();
+  for (int64_t i = 0; i < replay_dom.size(); i++) {
+    auto target_id = target_dom[i];
+    auto replay_it = r.find(target_id);
+    TORCH_CHECK(replay_it != r.end());
+    TORCH_CHECK(
+        replay_it->second == replay_dom[i],
+        "IterDomain mismatch when checking ",
+        replay,
+        " and ",
+        target,
+        " at ",
+        i,
+        ", got ",
+        replay_it->second,
+        " and ",
+        replay_dom[i]);
+  }
+};
+
+} // namespace
+
+TEST_F(NVFuserTest, FusionTransformPropagateSibling_CUDA) {
+  // https://github.com/csarofeen/pytorch/issues/1760
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tvs = Welford(tv0, {1});
+  fusion.addOutput(tvs.var_sum);
+
+  tvs.avg->split(1, 1);
+  tvs.avg->split(1, 2);
+  tvs.avg->split(1, 3);
+  tvs.var_sum->split(1, 1);
+  tvs.var_sum->split(1, 2);
+  tvs.var_sum->split(1, 3);
+  tvs.n->split(1, 1);
+  tvs.n->split(1, 2);
+  tvs.n->split(1, 3);
+
+  auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {1, 4});
+
+  TransformPropagatorWithCheck propagator(var_sum_rf);
+  MaxRootDomainInfoSpanningTree(var_sum_rf).traverse(&propagator);
+
+  auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum);
+
+  std::vector<TensorView*> siblings[] = {{tvs.avg, tvs.var_sum, tvs.n}, rf_tvs};
+  for (auto tensors : siblings) {
+    for (auto t1 : tensors) {
+      for (auto t2 : tensors) {
+        TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2));
+      }
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionTransformPropagateSelectorSibling_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tvs = Welford(tv0, {1});
+  fusion.addOutput(tvs.var_sum);
+
+  tvs.avg->split(1, 1);
+  tvs.avg->split(1, 2);
+  tvs.avg->split(1, 3);
+  tvs.var_sum->split(1, 1);
+  tvs.var_sum->split(1, 2);
+  tvs.var_sum->split(1, 3);
+  tvs.n->split(1, 1);
+  tvs.n->split(1, 2);
+  tvs.n->split(1, 3);
+
+  auto var_sum_rf = ir_utils::rfactorHelper(tvs.var_sum, {1, 4});
+
+  struct DisableTv0 : public MaxInfoSpanningTree::Selector {
+    TensorView* tv0;
+    virtual bool allowC2P(TensorView* from, TensorView* to) override {
+      return from != tv0 && to != tv0;
+    };
+    virtual bool allowP2C(TensorView* from, TensorView* to) override {
+      return from != tv0 && to != tv0;
+    };
+    virtual bool allowSibling(TensorView* from, TensorView* to) override {
+      return true;
+    }
+    DisableTv0(TensorView* tv0) : tv0(tv0) {}
+  } selector1(tv0);
+
+  struct DisableTv0AndSibling : public DisableTv0 {
+    virtual bool allowSibling(TensorView* from, TensorView* to) override {
+      return false;
+    }
+    using DisableTv0::DisableTv0;
+  } selector2(tv0);
+
+  TransformPropagatorWithCheck propagator(var_sum_rf);
+  MaxRootDomainInfoSpanningTree good_path(var_sum_rf, &selector1);
+  MaxRootDomainInfoSpanningTree bad_path(var_sum_rf, &selector2);
+
+  auto rf_tvs = ir_utils::producerTvsOf(tvs.var_sum);
+
+  auto check = [&]() {
+    std::vector<TensorView*> siblings[] = {
+        {tvs.avg, tvs.var_sum, tvs.n}, rf_tvs};
+    for (auto tensors : siblings) {
+      for (auto t1 : tensors) {
+        for (auto t2 : tensors) {
+          TORCH_CHECK(TransformReplay::fullSelfMatching(t1, t2));
+        }
+      }
+    }
+  };
+
+  bad_path.traverse(&propagator);
+  ASSERT_ANY_THROW(check());
+  good_path.traverse(&propagator);
+  check();
+}
+
+TEST_F(NVFuserTest, FusionTransformPropagatePosition_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(4);
+  auto tv1 = makeSymbolicTensor(6);
+  fusion.addInput(tv0);
+
+  auto tv2 = broadcast(tv0, {false, false, true, false, false, true});
+  auto tv3 = add(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  tv0->merge(2);
+  tv0->merge(0);
+  TransformPropagatorWithCheck propagator(tv0);
+  MaxRootDomainInfoSpanningTree(tv0).traverse(&propagator);
+
+  TORCH_CHECK(tv1->nDims() == 4);
+}
+
+TEST_F(NVFuserTest, FusionIgnoreZeroDimReduction_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+  auto tv1 = sum(tv0, {0});
+  // tv1 is effectively a zero-dim tensor as it only has a reduction
+  // axis.
+  // Reducing it further is converted to just a set op.
+  auto tv2 = sum(tv1, {0});
+  fusion->addOutput(tv2);
+
+  auto tv2_def = dynamic_cast<UnaryOp*>(tv2->definition());
+  TORCH_CHECK(
+      tv2_def != nullptr,
+      "Expected UnaryOp but found ",
+      tv2->definition()->toString());
+
+  TORCH_CHECK(
+      tv2_def->getUnaryOpType() == UnaryOpType::Set,
+      "Expected UnaryOpType::Set but found ",
+      tv2_def->getUnaryOpType());
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({12345}, options);
+  std::vector<IValue> aten_inputs({t0});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  auto ref = sum(t0, {0});
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      aten_inputs,
+      {ref},
+      __LINE__,
+      __FILE__);
+}
+
+// Repro of issue #1770
+TEST_F(NVFuserTest, FusionIssue1770Repro_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+  auto tv1 = makeSymbolicTensor(1);
+  fusion->addInput(tv1);
+
+  auto tv2 = ge(tv0, tv1);
+  auto tv3 =
+      where(tv2, IrBuilder::create<Double>(1), IrBuilder::create<Double>(2));
+  fusion->addOutput(tv3);
+
+  std::vector<int64_t> shape({999});
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn(shape, options);
+  at::Tensor t1 = at::randn(shape, options);
+  std::vector<IValue> aten_inputs({t0, t1});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  auto ref = where(t0 >= t1, 1.0, 2.0);
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      aten_inputs,
+      {ref},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTransformPropagatorSelector_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+  auto tv1 = makeSymbolicTensor(1);
+  fusion->addInput(tv1);
+
+  auto tv2 = add(tv0, tv1);
+
+  auto tv3 = sin(tv2);
+  auto tv4 = cos(tv2);
+
+  fusion->addOutput(tv3);
+  fusion->addOutput(tv4);
+
+  tv2->split(0, 10);
+
+  struct Selector : public MaxInfoSpanningTree::Selector {
+    TensorView* tv0;
+    TensorView* tv3;
+    virtual bool allowC2P(TensorView* from, TensorView* to) override {
+      return to == tv0;
+    }
+    virtual bool allowP2C(TensorView* from, TensorView* to) override {
+      return to == tv3;
+    }
+    virtual bool allowSibling(TensorView* from, TensorView* to) override {
+      return false;
+    }
+    Selector(TensorView* tv0, TensorView* tv3) : tv0(tv0), tv3(tv3) {}
+  } selector(tv0, tv3);
+
+  TransformPropagatorWithCheck propagator(tv2);
+  MaxRootDomainInfoSpanningTree(tv2, &selector).traverse(&propagator);
+
+  TORCH_CHECK(tv0->nDims() == 2);
+  TORCH_CHECK(tv1->nDims() == 1);
+  TORCH_CHECK(tv2->nDims() == 2);
+  TORCH_CHECK(tv3->nDims() == 2);
+  TORCH_CHECK(tv4->nDims() == 1);
+}
+
+TEST_F(NVFuserTest, FusionTransformPropagatorPos_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeConcreteTensor({22, 105});
+  fusion->addInput(tv0);
+
+  auto tv1 = sin(tv0);
+  fusion->addOutput(tv1);
+
+  tv1->split(0, 2);
+  tv1->split(-1, 3);
+  tv1->split(-1, 5);
+
+  TransformPropagatorWithCheck propagator(tv1, 2);
+  MaxRootDomainInfoSpanningTree(tv1, 2).traverse(&propagator);
+
+  auto expect = makeConcreteTensor({22, 105});
+  expect->split(0, 2);
+  TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv0));
+}
+
+TEST_F(NVFuserTest, FusionMaxRootDomainInfoSpanningTreePrintTwice_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(3);
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = neg(tv1);
+
+  fusion->addOutput(tv2);
+
+  tv1->split(0, 10);
+
+  struct Printer : public MaxInfoSpanningTree::Propagator {
+    std::stringstream ss;
+    virtual void propagateC2P(TensorView* from, TensorView* to) override {
+      ss << "propagateC2P" << std::endl;
+      ss << "from: " << from->name() << std::endl;
+      ss << "to: " << to->name() << std::endl;
+    }
+    virtual void propagateP2C(TensorView* from, TensorView* to) override {
+      ss << "propagateP2C" << std::endl;
+      ss << "from: " << from->name() << std::endl;
+      ss << "to: " << to->name() << std::endl;
+    }
+    virtual void propagateSibling(TensorView* from, TensorView* to) override {
+      ss << "propagateSibling" << std::endl;
+      ss << "from: " << from->name() << std::endl;
+      ss << "to: " << to->name() << std::endl;
+    }
+  } printer1, printer2;
+  printer1.ss << std::endl;
+  printer2.ss << std::endl;
+
+  MaxRootDomainInfoSpanningTree path(tv1);
+  path.traverse(&printer1);
+  path.traverse(&printer2);
+
+  auto expect = R"ESCAPE(
+propagateC2P
+from: 1
+to: 0
+propagateP2C
+from: 1
+to: 2
+)ESCAPE";
+  TORCH_CHECK(printer1.ss.str() == expect);
+  TORCH_CHECK(printer2.ss.str() == expect);
+}
+
+TEST_F(NVFuserTest, FusionTransformPropagatorNoOverwrite_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeSymbolicTensor(1);
+  fusion->addInput(tv0);
+  auto tv1 = broadcast(tv0, {true, false, true});
+  auto tv2 = sin(tv1);
+  fusion->addOutput(tv2);
+
+  tv0->split(0, 2);
+  tv2->split(1, 2);
+  tv2->split(0, 4);
+
+  MaxRootDomainInfoSpanningTree path1(tv2);
+  TransformPropagatorWithCheck propagator1(tv2);
+  path1.traverse(&propagator1);
+
+  MaxRootDomainInfoSpanningTree path2(tv0);
+  TransformPropagatorWithCheck propagator2(tv0);
+  path2.traverse(&propagator2);
+
+  TORCH_CHECK(tv1->axis(0)->isBroadcast());
+  TORCH_CHECK(tv1->axis(1)->isBroadcast());
+  TORCH_CHECK(!tv1->axis(2)->isBroadcast());
+  TORCH_CHECK(!tv1->axis(3)->isBroadcast());
+  TORCH_CHECK(tv1->axis(4)->isBroadcast());
+
+  auto expect = makeSymbolicTensor(3);
+  expect->split(1, 2);
+  expect->split(0, 4);
+  TORCH_CHECK(TransformReplay::fullSelfMatching(expect, tv1));
+}
+
+TEST_F(NVFuserTest, FusionIssue1785Repro_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // Set up your input tensor views
+  TensorView* tv0 = makeContigTensor(1);
+  TensorView* tv1 = makeContigTensor(2);
+
+  // Register your inputs
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  auto tv2 = set(tv0);
+  // [B, I]
+  auto tv3 = broadcast(tv2, {true, false});
+  auto tv4 = add(tv3, tv1);
+  auto tv5 = set(tv4);
+
+  // Register your outputs
+  fusion.addOutput(tv5);
+
+  tv5->split(0, 8);
+  tv5->split(-1, 8);
+
+  // [Serial, TIDy, TIDX, Serial]
+
+  tv4->computeAt(tv5, -2);
+  tv3->computeAt(tv4, -1);
+  tv2->computeAt(tv3, 0);
+  tv2->split(0, 8);
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv1->computeAt(tv5, -2);
+
+  tv5->axis(1)->parallelize(ParallelType::TIDy);
+  tv5->axis(2)->parallelize(ParallelType::TIDx);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor in1 = at::randn({16}, options);
+  at::Tensor in2 = at::randn({12, 16}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {in1, in2});
+  auto cg_outputs = fe.runFusion({in1, in2});
+
+  auto tv_ref = in1 + in2;
+
+  testValidate(&fusion, cg_outputs, {in1, in2}, {tv_ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSkipReplay_CUDA) {
+  {
+    Fusion fusion;
+    FusionGuard fg(&fusion);
+
+    TensorView* tv0 = makeContigTensor(1);
+    TensorView* tv1 = makeContigTensor(2);
+    fusion.addInput(tv0);
+    fusion.addInput(tv1);
+
+    auto tv2 = broadcast(tv0, {false, true});
+    auto tv3 = add(tv2, tv1);
+    fusion.addOutput(tv3);
+
+    tv3->split(1, 2, false);
+
+    TransformPropagatorWithCheck propagator(tv3);
+    MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+  }
+
+  {
+    Fusion fusion;
+    FusionGuard fg(&fusion);
+
+    TensorView* tv0 = makeContigTensor(3);
+    fusion.addInput(tv0);
+
+    auto tv1 = sum(tv0, {0, 2});
+    auto tv2 = sin(tv1);
+    fusion.addOutput(tv2);
+
+    tv0->split(1, 2, false);
+
+    TransformPropagatorWithCheck propagator(tv0);
+    MaxRootDomainInfoSpanningTree(tv0).traverse(&propagator);
+  }
+}
+
+TEST_F(NVFuserTest, FusionInlineRepro1803_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeContigTensor(2);
+
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tvs = Welford(tv1, {1});
+  auto tvo = set(tvs.var_sum);
+  fusion.addOutput(tvo);
+
+  tvo->split(0, 16);
+  tvo->axis(1)->parallelize(ParallelType::Unroll);
+
+  tv0->computeAt(tvo, -1, ComputeAtMode::BestEffort);
+
+  TORCH_CHECK(
+      tvs.var_sum->getComputeAtPosition() == tvs.avg->getComputeAtPosition());
+  TORCH_CHECK(
+      tvs.var_sum->getComputeAtPosition() == tvs.n->getComputeAtPosition());
+  TORCH_CHECK(tvs.var_sum->getComputeAtPosition() == 1);
+}
+
+// Unit test for the transform selection logic
+TEST_F(NVFuserTest, FusionBoundedDirectionSelection1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeContigTensor(2);
+
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = set(tv1);
+  auto tv3 = add(tv2, tv1);
+  fusion.addOutput(tv3);
+
+  tv3->split(-1, 5);
+  tv3->split(-1, 8);
+
+  scheduler_utils::BoundedDirectionalTransformPropagator::backward(
+      tv3, -1, {tv0, tv2});
+
+  // Check that the splits are replayed on tv2
+  TORCH_INTERNAL_ASSERT(
+      tv2->nDims() == tv3->nDims(),
+      "Propagator didn't propagate to tv2: ",
+      tv2->toString());
+
+  // Check that the splits are replayed on tv1 as well. Even though
+  //  one of its consumers, tv2, is part of the boundary, another
+  //  consumer is not a boundary, so tv1 should be transformed as well.
+  TORCH_INTERNAL_ASSERT(
+      tv1->nDims() == tv3->nDims(),
+      "Propagator didn't propagate to tv1: ",
+      tv1->toString());
+}
+
+TEST_F(NVFuserTest, FusionIssueRepro1844_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  std::vector<int64_t> shape = {2, 1, 768};
+  std::vector<int64_t> sum_to_shape = {768};
+  std::vector<int64_t> sum_to_axes = {0, 1};
+  double kProb = 0.5;
+
+  std::vector<Int*> sum_to_symb;
+  std::transform(
+      sum_to_shape.begin(),
+      sum_to_shape.end(),
+      std::back_inserter(sum_to_symb),
+      [](int s) -> Int* { return IrBuilder::create<Int>(s); });
+
+  TensorView* tv0 = makeContigConcreteTensor(shape);
+  TensorView* tv1 = makeContigConcreteTensor(shape);
+  TensorView* tv2 = makeContigConcreteTensor(shape, DataType::Bool);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  fusion->addInput(tv2);
+
+  Double* prob = IrBuilder::create<Double>(kProb);
+  auto grad_input = dropout_backward(tv1, tv2, prob);
+  auto grad_gelu = gelu_backward(grad_input, tv0);
+  auto grad_bias = sum_to(grad_gelu, sum_to_symb);
+
+  fusion->addOutput(grad_gelu);
+  fusion->addOutput(grad_bias);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  const auto mask_options =
+      at::TensorOptions().dtype(at::kBool).device(at::kCUDA, 0);
+  at::manual_seed(0);
+
+  at::Tensor a = at::randn(shape, options);
+  at::Tensor b = at::randn(shape, options);
+  at::Tensor c = at::randn(shape, options);
+  auto mask = at::gt(c, 0.0f);
+  std::vector<IValue> aten_inputs = {a, b, mask};
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  auto dinput = at::native_dropout_backward(b, mask, kProb);
+  auto dgelu = at::gelu_backward(dinput, a, "none");
+  auto dbias = dgelu.sum(sum_to_axes);
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      aten_inputs,
+      {dgelu, dbias},
+      __LINE__,
+      __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionInsertMagicZero1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = set(tv1);
+  fusion.addOutput(tv2);
+
+  tv2->split(0, 32);
+  tv2->split(-1, 2);
+  tv2->reorder({{1, 2}, {2, 1}});
+  tv2->merge(0);
+
+  TransformPropagatorWithCheck propagator(tv2);
+  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
+
+  tv0->computeAt(tv2, 1);
+
+  // The predicate of tv2 should be protected with magic zero
+  GpuLower gpulw(&fusion);
+  TORCH_CHECK(
+      PredicateMagicZeroChecker::isProtected(tv2, gpulw),
+      "Failed to protect the predicates of ",
+      tv2->toString());
+}
+
+TEST_F(NVFuserTest, FusionRepro1860_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(&fusion);
+  std::vector<bool> contiguity{true, false, false};
+
+  std::vector<int64_t> shape{1, -1, -1};
+  TensorView* tv0 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv0);
+  TensorView* tv1 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv1);
+  TensorView* tv2 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv2);
+
+  std::vector<IterDomain*> domain1(3, nullptr);
+  for (const auto i : c10::irange(3)) {
+    if (i == 0) {
+      domain1[i] =
+          IterDomainBuilder(
+              FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1))
+              .iter_type(IterType::Broadcast)
+              .build();
+    } else {
+      domain1[i] =
+          IterDomainBuilder(
+              FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1))
+              .expanded_extent(IrBuilder::create<Int>(1 + i))
+              .iter_type(IterType::Broadcast)
+              .build();
+    }
+  }
+
+  TensorView* tv22 = IrBuilder::create<TensorView>(
+      IrBuilder::create<TensorDomain>(domain1, contiguity), DataType::Float);
+
+  fusion.addInput(tv22);
+
+  auto tv3 = add(tv0, tv1);
+  auto tv4 = softmax(tv3, 0);
+  auto tv5 = add(tv4, tv22);
+  fusion.addOutput(tv5);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor input1 = at::randn({1, 2, 3}, options);
+  at::Tensor input2 = at::randn({1, 2, 3}, options);
+  at::Tensor input3 = at::randn({1, 2, 3}, options);
+  at::Tensor input4 = at::randn({1, 1, 1}, options).expand({1, 2, 3});
+  std::vector<IValue> aten_inputs = {input1, input2, input3, input4};
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
+}
+
+TEST_F(NVFuserTest, FusionExpandReduce_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeConcreteTensor({1, 8});
+  fusion->addInput(tv0);
+
+  auto tv1 =
+      expand(tv0, {IrBuilder::create<Int>(12), IrBuilder::create<Int>(8)});
+
+  auto tv2 = sum(tv1, {0});
+  fusion->addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({1, 8}, options);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
+
+  auto ref = t0.expand({12, 8}).sum({0});
+
+  testValidate(
+      executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+// Predicate elimination issue repro:
+TEST_F(NVFuserTest, FusionExpandReduce2_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeConcreteTensor({1, 4});
+  fusion->addInput(tv0);
+
+  auto tv1 =
+      expand(tv0, {IrBuilder::create<Int>(3), IrBuilder::create<Int>(4)});
+
+  auto tv2 = sum(tv1, {0});
+  fusion->addOutput(tv2);
+
+  // tv2[r{3}, i{4}]
+  tv2->split(0, NamedScalar::getParallelDim(ParallelType::TIDy));
+  tv2->axis(1)->parallelize(ParallelType::TIDy);
+  tv2->split(0, NamedScalar::getParallelDim(ParallelType::BIDy), false);
+  tv2->axis(0)->parallelize(ParallelType::BIDy);
+  tv2->split(-1, NamedScalar::getParallelDim(ParallelType::TIDx));
+  tv2->axis(-1)->parallelize(ParallelType::TIDx);
+  tv2->axis(-2)->parallelize(ParallelType::BIDx);
+  // [rBIDy, rO, rTIDy, iBIDx, iTIDx]
+  tv2->reorder({{-2, 0}, {-1, 1}, {2, 2}});
+  // [iBIDx, iTIDx, rTIDy, rBIDy, rO]
+  auto tv3 = tv2->rFactor({-1});
+
+  TransformPropagatorWithCheck propagator(tv3);
+  MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator);
+  scheduler_utils::parallelizeAllLike(tv3);
+  tv0->computeAt(tv3, -1, ComputeAtMode::MostInlined);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::manual_seed(0);
+  auto t0 = at::randn({1, 4}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion.get(), {t0}, LaunchParams(-1, 2, -1, 4, 2, 1));
+  auto cg_outputs = fe.runFusion({t0}, LaunchParams(-1, 2, -1, 4, 2, 1));
+
+  auto ref = t0.expand({3, 4}).sum({0});
+
+  testValidate(
+      fusion.get(),
+      cg_outputs,
+      {t0},
+      {ref},
+      __LINE__,
+      __FILE__,
+      "",
+      LaunchParams(-1, 2, -1, 4, 2, 1));
+}
+
+TEST_F(NVFuserTest, FusionExpandBadShapeTest_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(&fusion);
+  std::vector<bool> contiguity{false, false};
+
+  auto tv0 = makeSymbolicTensor(2);
+  fusion.addInput(tv0);
+
+  std::vector<IterDomain*> domains = {
+      IterDomainBuilder(
+          FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>())
+          .build(),
+      IterDomainBuilder(
+          FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create<Int>(1))
+          .expanded_extent(IrBuilder::create<Int>(10))
+          .iter_type(IterType::Broadcast)
+          .build()};
+
+  // expand to 10
+  TensorView* tv22 = IrBuilder::create<TensorView>(
+      IrBuilder::create<TensorDomain>(domains, contiguity), DataType::Float);
+
+  fusion.addInput(tv22);
+
+  auto tv3 = add(tv0, tv22);
+  fusion.addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  // Incompatible shapes
+  at::Tensor input1 = at::randn({2, 3}, options);
+  // Passing expand size of 5, not 10. Should cause an error
+  at::Tensor input4 = at::randn({2, 1}, options).expand({2, 5});
+
+  std::vector<IValue> aten_inputs = {input1, input4};
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  ASSERT_ANY_THROW(executor_cache.runFusionWithInputs(aten_inputs));
+}
+
+TEST_F(
+    NVFuserTest,
+    FusionPointwiseScheduleWithBroadcastAndTrivialReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(3);
+  auto tv1 = makeContigTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  auto tv2 = broadcast(tv0, {false, true, false, true, false, true});
+  auto tv3 = sin(tv2);
+  auto tv4 = add(tv3, tv1);
+  auto tv5 = sum(tv4, {1});
+  fusion.addOutput(tv5);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({100, 100, 10}, options);
+  at::Tensor t1 = at::randn({10, 20}, options);
+
+  auto aten_output = (t0.view({100, 1, 100, 1, 10, 1}).sin() + t1).squeeze(1);
+
+  std::vector<IValue> aten_inputs = {t0, t1};
+
+  auto lparams = schedulePointwise(&fusion, aten_inputs);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, aten_inputs, lparams);
+  auto cg_outputs = fe.runFusion(aten_inputs, lparams);
+
+  testValidate(
+      &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionInliningMismatchedDims1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 3, 4});
+  fusion.addInput(tv0);
+  auto tv1 = sin(tv0);
+  auto tv2 = cos(tv1);
+  auto tv3 = transpose(tv2, 1, 2);
+  auto tv4 = exp(tv3);
+  auto tv5 = tan(tv4);
+  fusion.addOutput(tv5);
+
+  inlineMost();
+
+  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv2->getComputeAtPosition() == 1);
+  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({2, 3, 4}, options);
+  auto output = input.sin().cos().transpose(1, 2).exp().tan();
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionInliningMismatchedDims2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 3, 4});
+  fusion.addInput(tv0);
+  auto tv1 = sin(tv0);
+  auto tv2 = cos(tv1);
+  auto tv3 = transpose(tv2, 1, 2);
+  auto tv4 = exp(tv3);
+  auto tv5 = tan(tv4);
+  fusion.addOutput(tv5);
+
+  inlineAllAt(tv5, -1, true);
+
+  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv2->getComputeAtPosition() == 1);
+  TORCH_CHECK(tv1->getComputeAtPosition() == 1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({2, 3, 4}, options);
+  auto output = input.sin().cos().transpose(1, 2).exp().tan();
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionInliningMismatchedDims3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 3, 4});
+  fusion.addInput(tv0);
+  auto tv1 = sin(tv0);
+  // broadcasting
+  auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
+  auto tv3 = relu(tv2);
+  // trivial reduction
+  auto tv4 = sum(tv3, {1, 3, 5});
+  auto tv5 = cos(tv4);
+  auto tv6 = transpose(tv5, 1, 2);
+  auto tv7 = exp(tv6);
+  auto tv8 = tan(tv7);
+  fusion.addOutput(tv8);
+
+  for (auto tv : {tv2, tv3, tv4}) {
+    tv->merge(0);
+    tv->merge(1);
+    tv->merge(2);
+  }
+
+  inlineMost();
+
+  TORCH_CHECK(tv8->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv7->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv6->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv5->getComputeAtPosition() == 1);
+  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv2->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({2, 3, 4}, options);
+  auto output = input.sin().relu().cos().transpose(1, 2).exp().tan();
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionInliningMismatchedDims4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 3, 4});
+  fusion.addInput(tv0);
+  auto tv1 = sin(tv0);
+  auto tv2 = exp(tv1);
+  auto tv3 = relu(tv2);
+  auto tv4 = cos(tv3);
+  auto tv5 = tan(tv4);
+  fusion.addOutput(tv5);
+
+  tv3->merge(1);
+  inlineMost();
+
+  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv3->getComputeAtPosition() == 1);
+  TORCH_CHECK(tv2->getComputeAtPosition() == 1);
+  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({2, 3, 4}, options);
+  auto output = input.sin().exp().relu().cos().tan();
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionInliningBroadcast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 3, 4});
+  fusion.addInput(tv0);
+  auto tv1 = sin(tv0);
+  // broadcasting
+  auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
+  auto tv3 = cos(tv2);
+  auto tv4 = tan(tv3);
+  fusion.addOutput(tv4);
+
+  for (auto tv : {tv2, tv3, tv4}) {
+    tv->merge(0);
+    tv->merge(1);
+    tv->merge(2);
+  }
+
+  inlineMost();
+
+  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv2->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({2, 3, 4}, options);
+  auto output = input.sin().view({2, 1, 3, 1, 4, 1}).cos().tan();
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionInliningBroadcastTrivialReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 3, 4});
+  fusion.addInput(tv0);
+  auto tv1 = sin(tv0);
+  // broadcasting
+  auto tv2 = broadcast(tv1, {false, true, false, true, false, true});
+  auto tv3 = tan(tv2);
+  // trivial reduction
+  auto tv4 = sum(tv3, {1, 3, 5});
+  auto tv5 = cos(tv4);
+  auto tv6 = exp(tv5);
+  fusion.addOutput(tv6);
+
+  for (auto tv : {tv2, tv3, tv4}) {
+    tv->merge(0);
+    tv->merge(1);
+    tv->merge(2);
+  }
+
+  inlineMost();
+
+  TORCH_CHECK(tv6->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv5->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv4->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv3->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv2->getComputeAtPosition() == 3);
+  TORCH_CHECK(tv1->getComputeAtPosition() == 3);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input = at::randn({2, 3, 4}, options);
+  auto output = input.sin().tan().cos().exp();
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input});
+  auto cg_outputs = fe.runFusion({input});
+
+  testValidate(&fusion, cg_outputs, {input}, {output}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayTrivialReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 1, 3, 1, 4, 1});
+  fusion.addInput(tv0);
+  auto tv1 = sum(tv0, {1, 3, 5});
+  auto tv2 = sin(tv1);
+  fusion.addOutput(tv1);
+
+  for (auto tv : {tv0, tv1}) {
+    tv->merge(0);
+    tv->merge(1);
+    tv->merge(2);
+  }
+
+  TORCH_CHECK(
+      TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, 3) == 3);
+  TORCH_CHECK(
+      TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, 3) == 3);
+  TORCH_CHECK(
+      TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, 3) == 3);
+  TORCH_CHECK(
+      TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, 3) == 3);
+}
+
+TEST_F(NVFuserTest, FusionMatchedLeafPosWithoutReplayBroadcast_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 3, 4});
+  fusion.addInput(tv0);
+  auto tv1 = broadcast(tv0, {false, true, false, true, false, true});
+  auto tv2 = sin(tv1);
+  fusion.addOutput(tv2);
+
+  for (auto tv : {tv1, tv2}) {
+    tv->merge(0);
+    tv->merge(1);
+    tv->merge(2);
+  }
+
+  TORCH_CHECK(
+      TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv0, tv1, 3) == 3);
+  TORCH_CHECK(
+      TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv1, tv0, 3) == 3);
+  TORCH_CHECK(
+      TransformReplay::getMatchedLeafPosWithoutReplayPasC(tv1, tv2, 3) == 3);
+  TORCH_CHECK(
+      TransformReplay::getMatchedLeafPosWithoutReplayCasP(tv2, tv1, 3) == 3);
+}
+
+TEST_F(NVFuserTest, FusionIdGraphTrivialReduction_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({2, 3, 4});
+  fusion.addInput(tv0);
+  auto tv1 = broadcast(tv0, {false, true, false, true, false, true});
+  auto tv2 = sum(tv1, {1, 3, 5});
+  auto tv3 = sin(tv2);
+  fusion.addOutput(tv3);
+
+  for (auto tv : {tv1, tv2}) {
+    tv->merge(0);
+    tv->merge(1);
+    tv->merge(2);
+  }
+
+  inlineMost();
+
+  ComputeAtMap ca_map(&fusion);
+
+  auto all_tvs = ir_utils::allTvs(&fusion);
+  for (auto tv1 : all_tvs) {
+    for (auto tv2 : all_tvs) {
+      if (tv1->isFusionInput() || tv2->isFusionInput()) {
+        continue;
+      }
+      for (int i : c10::irange(3)) {
+        auto id1 = tv1->axis(i);
+        auto id2 = tv2->axis(i);
+        TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::LOOP));
+        TORCH_CHECK(ca_map.areMapped(id1, id2, IdMappingMode::PERMISSIVE));
+      }
+    }
+  }
+}
+
+TEST_F(NVFuserTest, FusionPrint_CUDA) {
+  auto dtypes = {
+      at::kFloat,
+      at::kDouble,
+      at::kHalf,
+      at::kBFloat16,
+      at::kInt,
+      at::kLong,
+      at::kBool};
+  for (auto dtype : dtypes) {
+    auto fusion = std::make_unique<Fusion>();
+    FusionGuard fg(fusion.get());
+
+    auto tv0 = makeSymbolicTensor(1, aten_to_data_type(dtype));
+    fusion->addInput(tv0);
+    auto tv1 = print(tv0);
+    auto tv2 = sin(tv1);
+    fusion->addOutput(tv2);
+
+    // There is no way to check if anything is printed to the console, but we
+    // can validate that when print exist, compilation and computation are not
+    // broken.
+    auto options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
+    at::Tensor t0 = at::arange(2, options).to(dtype);
+
+    FusionExecutorCache executor_cache(std::move(fusion));
+    auto cg_outputs = executor_cache.runFusionWithInputs({t0});
+
+    testValidate(
+        executor_cache.fusion(),
+        cg_outputs,
+        {t0},
+        {t0.sin()},
+        __LINE__,
+        __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionCheckedSymbolicShape_CUDA) {
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor a = at::randn({123, 456}, options);
+  at::Tensor b = at::randn({123, 456}, options);
+  at::Tensor c = at::randn({321, 654}, options);
+
+  using return_t =
+      std::pair<std::unique_ptr<FusionExecutorCache>, std::vector<at::Tensor>>;
+  auto matched_add = [](at::Tensor a, at::Tensor b) -> return_t {
+    auto fusion = std::make_unique<Fusion>();
+    FusionGuard fg(fusion.get());
+
+    Val* s1 = IrBuilder::create<Int>();
+    Val* s2 = IrBuilder::create<Int>();
+    auto builder = TensorViewBuilder().shape(std::vector<Val*>{s1, s2});
+    TensorView* tv0 = builder.build();
+    TensorView* tv1 = builder.build();
+
+    fusion->addInput(tv0);
+    fusion->addInput(tv1);
+
+    auto tv2 = add(tv0, tv1);
+
+    fusion->addOutput(tv2);
+
+    auto executor_cache =
+        std::make_unique<FusionExecutorCache>(std::move(fusion));
+    auto cg_outputs = executor_cache->runFusionWithInputs({a, b});
+    return {std::move(executor_cache), std::move(cg_outputs)};
+  };
+
+  {
+    auto ret1 = matched_add(a, b);
+    testValidate(
+        ret1.first->fusion(), ret1.second, {a, b}, {a + b}, __LINE__, __FILE__);
+  }
+
+  {
+    EXPECT_THAT(
+        [&]() { matched_add(a, c); },
+        ::testing::ThrowsMessage<c10::Error>(
+            ::testing::HasSubstr("Attempting to bind")));
+  }
+}
+
+TEST_F(NVFuserTest, FusionSizeDependentData_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  Val* s1 = IrBuilder::create<Int>();
+  auto builder = TensorViewBuilder().shape(std::vector<Val*>{s1});
+  TensorView* tv0 = builder.build();
+
+  fusion->addInput(tv0);
+
+  auto tv1 = add(tv0, s1);
+
+  fusion->addOutput(tv1);
+
+  const auto options =
+      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor a = at::zeros({123}, options);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs({a});
+
+  testValidate(
+      executor_cache.fusion(), cg_outputs, {a}, {a + 123}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionDependencyCheck_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(1);
+  TensorView* tv1 = makeSymbolicTensor(1);
+  TensorView* tv2 = makeSymbolicTensor(1);
+  TensorView* tv3 = makeSymbolicTensor(1);
+
+  auto tv4 = add(tv0, tv1);
+  auto tv5 = add(tv0, tv2);
+  auto tv6 = add(tv0, tv3);
+
+  auto tv7 = add(tv1, tv2);
+  auto tv8 = add(tv1, tv3);
+
+  auto tv9 = add(tv2, tv3);
+
+  {
+    auto all_vals = DependencyCheck::getAllValsBetween(
+        {tv0, tv1}, {tv4, tv5, tv6, tv7, tv8, tv9});
+    std::unordered_set<Val*> all_vals_set(all_vals.begin(), all_vals.end());
+    std::vector<Val*> results({tv0, tv1, tv4, tv5, tv6, tv7, tv8});
+    for (auto result : results) {
+      TORCH_CHECK(all_vals_set.count(result) > 0);
+      all_vals_set.erase(result);
+    }
+    TORCH_CHECK(all_vals_set.empty());
+  }
+
+  auto tv10 = add(tv6, tv7);
+  {
+    auto all_vals = DependencyCheck::getAllValsBetween({tv0, tv1}, {tv10});
+    std::unordered_set<Val*> all_vals_set(all_vals.begin(), all_vals.end());
+    std::vector<Val*> results({tv0, tv1, tv6, tv7, tv10});
+    for (auto result : results) {
+      TORCH_CHECK(all_vals_set.count(result) > 0);
+      all_vals_set.erase(result);
+    }
+    TORCH_CHECK(all_vals_set.empty());
+  }
+}
+
+// Repro for issue #1925
+TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeSymbolicTensor(4);
+  auto tv1 = makeConcreteTensor({-1, -1, -1, 1});
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input0 = at::randn({1, 1, 333, 1}, options);
+  at::Tensor input1 = at::randn({1, 1, 333, 1}, options);
+
+  auto lparams = scheduleTranspose(&fusion, {input0, input1});
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input0, input1}, lparams);
+  auto outputs = fe.runFusion({input0, input1}, lparams);
+
+  auto tv_ref = input0 + input1;
+
+  testValidate(
+      &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__);
+}
+
+// Repro for issue #1873
+TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(1);
+  auto tv1 = makeContigTensor(2);
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+  auto tv2 = set(tv0);
+  auto tv3 = broadcast(tv2, {true, false});
+  auto tv4 = add(tv3, tv1);
+  fusion.addOutput(tv4);
+
+  tv4->merge(0);
+  tv4->split(0, 32);
+
+  tv0->computeAt(tv4, 1);
+
+  tv2->split(-1, 8);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({123}, options);
+  at::Tensor t1 = at::randn({3, 123}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t1});
+
+  auto outputs = fe.runFusion({t0, t1});
+
+  auto tv_ref = t0 + t1;
+
+  testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) {
+  // https://github.com/csarofeen/pytorch/issues/1926
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = set(tv1);
+  fusion->addOutput(tv2);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  for (auto tv : {tv1, tv2}) {
+    tv->split(0, 4);
+    tv->reorder({{1, -1}});
+    tv->split(1, 8);
+    tv->merge(0);
+    tv->split(0, 1);
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+    tv->axis(1)->parallelize(ParallelType::Unswitch);
+  }
+  tv1->merge(2);
+  tv2->reorder({{2, 3}});
+  tv2->merge(2);
+  for (auto tv : {tv1, tv2}) {
+    tv->axis(-1)->parallelize(ParallelType::TIDx);
+  }
+
+  inlineMost();
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({5, 5}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+  auto out = cg_outputs[0];
+
+  testValidate(fusion, {out}, {t0}, {t0}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, AsyncCompilation_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  TensorView* tv1 = makeSymbolicTensor(1);
+  TensorView* tv2 = makeSymbolicTensor(2);
+
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  fusion->addInput(tv2);
+
+  TensorView* tv3 = add(tv0, IrBuilder::create<Double>(1)); // Group 0
+  TensorView* tv4 =
+      max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues)
+  TensorView* tv5 = add(tv4, tv1); //  Group 0 (Non Broadcast after reduce,
+                                   //  keeps normalization scheduler away)
+  TensorView* tv6 = add(tv5, tv2); //  Group 1 (Broadcast after reduce)
+
+  fusion->addOutput(tv6);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({8, 5}, options);
+  at::Tensor t1 = at::randn({5}, options);
+  at::Tensor t2 = at::randn({8, 5}, options);
+
+  auto t3 = t0.add(1.0);
+  auto t4 = std::get<0>(at::max(t3, 0));
+  auto t5 = t4.add(t1);
+  auto t6 = t5.add(t2);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  std::vector<IValue> aten_inputs = {t0, t1, t2};
+
+  executor_cache.compileFusionAsync(aten_inputs);
+
+  while (!executor_cache.isCompiled(aten_inputs)) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(20));
+    printf(".");
+  }
+
+  auto outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  TORCH_CHECK(
+      executor_cache.getMostRecentKernelRuntime()->isSegmented(),
+      "segmentation didn't happen");
+  TORCH_CHECK(
+      executor_cache.getMostRecentKernelRuntime()
+              ->fusionSegments()
+              ->groups()
+              .size() == 2,
+      "segmentation didn't happen as expected");
+
+  testValidate(
+      executor_cache.fusion(), outputs, aten_inputs, {t6}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeConcreteTensor({1, 1});
+  TensorView* tv1 = makeConcreteTensor({-1});
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  auto tv2 = sum(tv0, {1});
+  auto tv3 = add(tv2, tv1);
+  fusion->addOutput(tv3);
+
+  tv0->merge(0);
+
+  MaxRootDomainInfoSpanningTree tree(tv0);
+  TransformPropagatorWithCheck tp(tv0);
+  tree.traverse(&tp);
+
+  inlineMost();
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({1, 1}, options);
+  at::Tensor t1 = at::randn({10}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+  auto out = cg_outputs[0];
+
+  testValidate(
+      fusion, {out}, {t0, t1}, {t1 + t0.flatten()}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction2_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeConcreteTensor({-1, 1, 1});
+  TensorView* tv1 = makeConcreteTensor({-1, -1});
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  auto tv2 = sum(tv0, {1});
+  auto tv3 = add(tv2, tv1);
+  fusion->addOutput(tv3);
+
+  tv2->merge(1);
+  tv2->merge(0);
+
+  MaxRootDomainInfoSpanningTree tree(tv0);
+  TransformPropagatorWithCheck tp(tv0);
+  tree.traverse(&tp);
+
+  inlineMost();
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10, 1, 1}, options);
+  at::Tensor t1 = at::randn({10, 10}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+  auto out = cg_outputs[0];
+
+  testValidate(
+      fusion, {out}, {t0, t1}, {t1 + t0.squeeze(-1)}, __LINE__, __FILE__);
+}
+
+// Simple test case exercising the null scheduler path.
+TEST_F(NVFuserTest, FusionNullScheduler_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeConcreteTensor({1, 1, 1});
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {0, 1, 2});
+
+  fusion->addOutput(tv1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({1, 1, 1}, options);
+
+  std::vector<IValue> aten_inputs({t0});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  auto t1 = t0.sum({0, 1, 2});
+
+  testValidate(
+      executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__);
+
+  auto groups =
+      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
+
+  // Check that all groups on the resulting runtime are null.
+  for (auto group : groups) {
+    TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
+  }
+}
+
+// Simple test case exercising the null scheduler path.
+TEST_F(NVFuserTest, FusionNullScheduler2_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeConcreteTensor({0, 1, 9223372036854775807L});
+  fusion->addInput(tv0);
+
+  auto tv1 = sum(tv0, {0, 1, 2});
+
+  fusion->addOutput(tv1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({0, 1, 9223372036854775807L}, options);
+
+  std::vector<IValue> aten_inputs({t0});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  auto t1 = t0.sum({0, 1, 2});
+
+  testValidate(
+      executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__);
+
+  auto groups =
+      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
+
+  // Check that all groups on the resulting runtime are null.
+  for (auto group : groups) {
+    TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
+  }
+}
+
+// Simple test case exercising the null scheduler path.
+TEST_F(NVFuserTest, FusionNullScheduler3_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = TensorViewBuilder().ndims(0).build();
+  auto tv1 = TensorViewBuilder().ndims(0).build();
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  auto tv2 = add(tv0, tv1);
+  fusion->addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({}, options);
+  at::Tensor t1 = at::randn({}, options);
+
+  std::vector<IValue> aten_inputs({t0, t1});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      {t0, t1},
+      {t0 + t1},
+      __LINE__,
+      __FILE__);
+
+  auto groups =
+      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
+
+  // Check that all groups on the resulting runtime are null.
+  for (auto group : groups) {
+    TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
+  }
+}
+
+TEST_F(NVFuserTest, FusionEmpty_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeConcreteTensor({10, 10, 10});
+  auto tv1 = makeConcreteTensor({10, 10, 10});
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  fusion->addOutput(tv0);
+  fusion->addOutput(tv1);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10, 10, 10}, options);
+  at::Tensor t1 = at::randn({10, 10, 10}, options);
+
+  std::vector<IValue> aten_inputs({t0, t1});
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  testValidate(
+      executor_cache.fusion(),
+      cg_outputs,
+      {t0, t1},
+      {t0, t1},
+      __LINE__,
+      __FILE__);
+
+  auto groups =
+      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();
+
+  // Check that all groups on the resulting runtime are null.
+  for (auto group : groups) {
+    TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
+  }
+}
+
+TEST_F(NVFuserTest, FusionMappingRelation_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeConcreteTensor({1, 1});
+  TensorView* tv1 = makeConcreteTensor({-1, 1, 1});
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  auto tv2 = set(tv0);
+  auto tv3 = broadcast(tv2, {true, false, false});
+  auto tv4 = add(tv3, tv1);
+
+  fusion->addOutput(tv4);
+
+  tv4->merge(-2);
+  tv4->merge(-1);
+
+  tv0->computeAt(tv4, -1);
+  tv1->computeAt(tv4, -1);
+
+  ComputeAtMap ca_map(fusion);
+
+  // FIXME: This is the concerning part that would motivate some
+  //  more formalization on concrete/permissive mapping:
+  //   exact mapping should ideally imply permissive mapping.
+  auto tv4_inner_node = tv4->axis(0)->definition()->input(1)->as<IterDomain>();
+  TORCH_CHECK(
+      ca_map.areMapped(tv2->axis(0), tv4_inner_node, IdMappingMode::EXACT));
+  TORCH_CHECK(!ca_map.areMapped(
+      tv2->axis(0), tv4_inner_node, IdMappingMode::PERMISSIVE));
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({1, 1}, options);
+  at::Tensor t1 = at::randn({2, 1, 1}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+  auto out = cg_outputs[0];
+
+  testValidate(
+      fusion, {out}, {t0, t1}, {t1 + t0.squeeze(0)}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionInlineAt_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2);
+  fusion->addInput(tv0);
+  auto tv1 = sin(tv0);
+  auto tv2 = cos(tv1);
+  fusion->addOutput(tv2);
+
+  tv1->inlineAt(-1);
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({100, 2}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {t0});
+  auto cg_outputs = fe.runFusion({t0});
+  auto out = cg_outputs[0];
+
+  testValidate(fusion, {out}, {t0}, {t0.sin().cos()}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTrivialInputForwarding_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeConcreteTensor({-1, -1});
+  TensorView* tv1 = makeConcreteTensor({-1, -1});
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  // Note: tv2 is not needed. Kept it here since previously there was an
+  // assertion from sorting in codegen.
+  auto tv2 = add(tv1, IrBuilder::create<Double>(3.141));
+  fusion->addOutput(tv0);
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10, 4}, options);
+  at::Tensor t1 = at::randn({10, 4}, options);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  auto cg_outputs = fec.runFusionWithInputs({t0, t1});
+
+  testValidate(fusion, cg_outputs, {t0, t1}, {t0}, __LINE__, __FILE__);
+
+  // Second run to ensure cache hit handles trivial forwarding properly
+  TORCH_CHECK(fec.isCompiled({t0, t1}));
+  auto cg_outputs2 = fec.runFusionWithInputs({t0, t1});
+  testValidate(fusion, cg_outputs2, {t0, t1}, {t0}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionTrivialInputForwarding2_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(0);
+  fusion->addInput(tv0);
+  fusion->addOutput(tv0);
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({}, options);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  auto cg_outputs = fec.runFusionWithInputs({t0});
+
+  testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
+
+  // Second run to ensure cache hit handles trivial forwarding properly
+  TORCH_CHECK(fec.isCompiled({t0}));
+  auto cg_outputs2 = fec.runFusionWithInputs({t0});
+  testValidate(fusion, cg_outputs2, {t0}, {t0}, __LINE__, __FILE__);
+}
+
+// Simplified repro of issue #2008
+TEST_F(NVFuserTest, FusionReplayTrivialReductionAndBroadcast2_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({10, 1, 1});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
+  auto tv2 = sum(tv1, {1, 2});
+  auto tv3 = broadcast(tv2, {false, true, true});
+  fusion.addOutput(tv3);
+
+  tv0->merge(-2, -1)->merge(-2, -1)->split(0, 4);
+
+  MaxRootDomainInfoSpanningTree tree(tv0);
+  TransformPropagator tp(tv0);
+  tree.traverse(&tp);
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn(shape, options);
+  std::vector<IValue> aten_inputs({t0});
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion_ptr.get(), aten_inputs);
+  auto outputs = fe.runFusion(aten_inputs);
+
+  testValidate(&fusion, outputs, aten_inputs, {t0 + 1}, __LINE__, __FILE__);
+}
+
+namespace {
+
+size_t getVecSizeForPointwise(FusionExecutorCache& fec) {
+  auto most_recent_params =
+      fec.getMostRecentKernelRuntime()->getMostRecentExecutorLog().params;
+  auto params = dynamic_cast<PointwiseParams*>(most_recent_params.get());
+  if (params->vectorize) {
+    return params->unroll_factor;
+  }
+  return 1;
+}
+
+} // namespace
+
+TEST_F(NVFuserTest, FusionVectorizeStrideContiguity2D_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 =
+      TensorViewBuilder().ndims(2).contiguity({false, true}).build();
+  fusion->addInput(tv0);
+  auto tv1 = set(tv0);
+  fusion->addOutput(tv1);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  fec.profile(true);
+
+  std::vector<std::pair<int, int>> size_and_vec{{17, 1}, {18, 2}, {32, 4}};
+
+  for (auto pair : size_and_vec) {
+    auto size = pair.first;
+    auto vec = pair.second;
+    auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+    at::Tensor t0 = at::randn({1000000, size}, options).narrow(1, 0, 16);
+    auto cg_outputs = fec.runFusionWithInputs({t0});
+
+    TORCH_CHECK(getVecSizeForPointwise(fec) == vec);
+
+    testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionVectorizeStrideContiguity3D_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 =
+      TensorViewBuilder().ndims(3).contiguity({false, true, true}).build();
+  fusion->addInput(tv0);
+  auto tv1 = set(tv0);
+  fusion->addOutput(tv1);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  fec.profile(true);
+
+  std::vector<std::pair<int, int>> size_and_vec{{17, 1}, {10, 2}, {16, 4}};
+
+  for (auto pair : size_and_vec) {
+    auto size = pair.first;
+    auto vec = pair.second;
+    auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+    at::Tensor t0 = at::randn({1000000, size, 3}, options).narrow(1, 0, 8);
+    auto cg_outputs = fec.runFusionWithInputs({t0});
+
+    TORCH_CHECK(getVecSizeForPointwise(fec) == vec);
+
+    testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionVectorizeStrideContiguity5D_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = TensorViewBuilder()
+                        .ndims(5)
+                        .contiguity({false, true, false, true, true})
+                        .build();
+  fusion->addInput(tv0);
+  auto tv1 = set(tv0);
+  fusion->addOutput(tv1);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  fec.profile(true);
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+
+  std::vector<std::tuple<int, int, int>> sizes_and_vec{
+      {9, 17, 1}, {9, 10, 2}, {9, 16, 4}};
+
+  for (auto tup : sizes_and_vec) {
+    auto size1 = std::get<0>(tup);
+    auto size2 = std::get<1>(tup);
+    auto vec = std::get<2>(tup);
+    at::Tensor t0 = at::randn({4, size1, 12345, size2, 3}, options)
+                        .narrow(1, 0, 8)
+                        .narrow(3, 0, 4);
+    auto cg_outputs = fec.runFusionWithInputs({t0});
+
+    TORCH_CHECK(getVecSizeForPointwise(fec) == vec);
+
+    testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionVectorizeStrideContiguitySelfOverlapping_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = TensorViewBuilder()
+                        .ndims(5)
+                        .contiguity({false, true, false, true, true})
+                        .build();
+  fusion->addInput(tv0);
+  auto tv1 = set(tv0);
+  fusion->addOutput(tv1);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  fec.profile(true);
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+
+  std::vector<std::tuple<int, int, int, int>> sizes_strides_and_vec{
+      {4, 4, 4, 4},
+      {4, 4, 2, 2},
+      {4, 2, 4, 2},
+      {2, 4, 4, 2},
+      {4, 4, 1, 1},
+      {4, 1, 4, 1},
+      {1, 4, 4, 1},
+      {2, 2, 2, 2},
+      {2, 2, 1, 1},
+      {2, 1, 2, 1},
+      {1, 2, 2, 1}};
+
+  for (auto tup : sizes_strides_and_vec) {
+    auto size = std::get<0>(tup);
+    auto stride1 = std::get<1>(tup);
+    auto stride2 = std::get<2>(tup);
+    auto vec = std::get<3>(tup);
+    std::vector<int64_t> shape = {4, 4, 12345, size, 3};
+    std::vector<int64_t> stride = {stride1, stride2 * 12345, stride2, 3, 1};
+    at::Tensor t0 = at::empty_strided(shape, stride, options);
+    t0.random_();
+    auto cg_outputs = fec.runFusionWithInputs({t0});
+    TORCH_CHECK(getVecSizeForPointwise(fec) == vec);
+    testValidate(fusion, cg_outputs, {t0}, {t0}, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // requires ampere+ GPU
+  if (!deviceMajorMinorCheck(8)) {
+    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
+    return;
+  }
+
+  auto tv0 = makeContigTensor(1);
+
+  fusion.addInput(tv0);
+
+  auto tv1 = set(tv0);
+
+  fusion.addOutput(tv1);
+
+  auto tv_cache = tv0->cacheAfter(LoadStoreOpType::CpAsync);
+  tv_cache->setMemoryType(MemoryType::Shared);
+
+  tv1->split(0, 16);
+  tv0->computeAt(tv1, 1);
+
+  tv_cache->circularBuffer(10);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor input1 = at::randn({255}, options);
+
+  // Add check that the cp async op has an inlined predicate.
+  class InlinedCpAsyncPredChecker : public kir::IrVisitor {
+   public:
+    using kir::IrVisitor::handle;
+
+   private:
+    void handle(kir::IfThenElse* ite) final {
+      auto prev_within_ite = within_ite_;
+      within_ite_ = true;
+      kir::IrVisitor::handle(ite);
+      within_ite_ = prev_within_ite;
+    }
+
+    void handle(LoadStoreOp* ldst) final {
+      if (ldst->opType() == LoadStoreOpType::CpAsync) {
+        TORCH_INTERNAL_ASSERT(!within_ite_, "CPASYNC predicate not inlined");
+        TORCH_INTERNAL_ASSERT(
+            ldst->predicate()->hasValue() &&
+                !ldst->predicate()->value()->isConst(),
+            "CPASYNC predicate is not generated");
+      }
+    }
+
+   private:
+    bool within_ite_ = false;
+  } pred_checker;
+
+  // Check that cp async is inlined:
+  GpuLower gpulw(&fusion);
+  pred_checker.handle(gpulw.kernel()->topLevelExprs());
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {input1});
+  auto cg_outputs = fe.runFusion({input1});
+
+  testValidate(&fusion, cg_outputs, {input1}, {input1}, __LINE__, __FILE__);
+}
+
+// Test file size should be up to 10K LoC. Create a new file for more tests.
+
+} // namespace jit
+} // namespace torch
+#endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
index 3b9e7cbd962c6..e827de56e56bd 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
@@ -10,7 +10,7 @@
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
 #include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
 #include <torch/csrc/jit/codegen/cuda/grouped_reduction.h>
-#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
 #include <torch/csrc/jit/codegen/cuda/interface.h>
 #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
 #include <torch/csrc/jit/codegen/cuda/ir_builder.h>
@@ -2391,10 +2391,7 @@ TEST_F(NVFuserTest, FusionCrossIterationGroupedGridAllreduceWelfordShmoo_CUDA) {
 
     transform_ref_rf->axis(unswitch_id)->parallelize(ParallelType::Unswitch);
 
-    InlinePropagator inline_propagator(
-        transform_ref_rf, -1, ComputeAtMode::MostInlined);
-    MaxRootDomainInfoSpanningTree(transform_ref_rf)
-        .traverse(&inline_propagator);
+    inlineMost();
 
     // Make sure the reduction expr is converted to GroupedGridReduciton
     // and the non-reduction domains of the output TV are either
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu b/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu
index e6acc4c5307a1..3e5968c3e0840 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu
@@ -141,7 +141,7 @@ TEST_F(NVFuserTest, FusionRNGManualScheduleValidateWithCURand_CUDA) {
 
   TensorView* tv0 = makeSymbolicTensor(1, aten_to_data_type(dtype));
   fusion->addInput(tv0);
-  auto tv1 = randlike(tv0);
+  auto tv1 = rand_like(tv0);
   auto tv2 = set(tv1);
   fusion->addOutput(tv2);
 
@@ -166,6 +166,38 @@ TEST_F(NVFuserTest, FusionRNGManualScheduleValidateWithCURand_CUDA) {
   testValidate(fusion, {out}, {t0}, {ref}, __LINE__, __FILE__);
 }
 
+TEST_F(NVFuserTest, FusionRNGManualScheduleValidateWithCURand2_CUDA) {
+  auto dtype = kFloat;
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  Int* size1 = IrBuilder::create<Int>();
+  Int* size2 = IrBuilder::create<Int>();
+  Int* size3 = IrBuilder::create<Int>();
+  Int* size4 = IrBuilder::create<Int>();
+  fusion->addInput(size1);
+  fusion->addInput(size2);
+  fusion->addInput(size3);
+  fusion->addInput(size4);
+  TensorView* tv0 = rand({size1, size2, size3, size4}, DataType::Float);
+  fusion->addOutput(tv0);
+
+  auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {10, 10, 10, 10});
+
+  at::manual_seed(0);
+  auto cg_outputs = fe.runFusion({10, 10, 10, 10});
+  auto out = cg_outputs[0];
+
+  at::manual_seed(0);
+  auto ref = generate_uniform(10000, dtype).view({10, 10, 10, 10});
+
+  testValidate(fusion, {out}, {10, 10, 10, 10}, {ref}, __LINE__, __FILE__);
+}
+
 TEST_F(NVFuserTest, FusionBroadcastingRNG_CUDA) {
   for (auto dtype : {kFloat, kDouble}) {
     std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
@@ -176,7 +208,7 @@ TEST_F(NVFuserTest, FusionBroadcastingRNG_CUDA) {
     TensorView* tv1 = makeConcreteTensor({5, 5}, aten_to_data_type(dtype));
     fusion->addInput(tv0);
     fusion->addInput(tv1);
-    auto tv2 = randlike(tv0);
+    auto tv2 = rand_like(tv0);
     auto tv3 = add(tv1, tv2);
     auto tv4 = add(tv0, tv3);
     fusion->addOutput(tv4);
@@ -207,7 +239,7 @@ TEST_F(NVFuserTest, FusionBroadcastingRNG2_CUDA) {
       TensorView* tv1 = makeSymbolicTensor(1, aten_to_data_type(dtype));
       fusion->addInput(tv0);
       fusion->addInput(tv1);
-      auto tv2 = randlike(tv0);
+      auto tv2 = rand_like(tv0);
       auto tv3 = add(tv1, tv2);
       fusion->addOutput(tv3);
 
@@ -239,7 +271,7 @@ TEST_F(NVFuserTest, FusionBroadcastingRNGSmem_CUDA) {
     TensorView* tv1 = makeConcreteTensor({5, 5}, aten_to_data_type(dtype));
     fusion->addInput(tv0);
     fusion->addInput(tv1);
-    auto tv2 = randlike(tv0);
+    auto tv2 = rand_like(tv0);
     auto tv3 = add(tv1, tv2);
     auto tv4 = add(tv0, tv3);
     fusion->addOutput(tv4);
@@ -272,7 +304,7 @@ TEST_F(NVFuserTest, FusionBroadcastingRNGSmemNonSquareTile_CUDA) {
   TensorView* tv1 = makeConcreteTensor({5, 5});
   fusion->addInput(tv0);
   fusion->addInput(tv1);
-  auto tv2 = randlike(tv0);
+  auto tv2 = rand_like(tv0);
   auto tv3 = add(tv1, tv2);
   auto tv4 = add(tv0, tv3);
   fusion->addOutput(tv4);
@@ -297,5 +329,71 @@ TEST_F(NVFuserTest, FusionBroadcastingRNGSmemNonSquareTile_CUDA) {
   TORCH_CHECK((out.select(1, 0) == out.select(1, 4)).all().item<bool>());
 }
 
+TEST_F(NVFuserTest, FusionUniform_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  Int* size_val = IrBuilder::create<Int>();
+  Double* low = IrBuilder::create<Double>();
+  Double* high = IrBuilder::create<Double>();
+  fusion->addInput(size_val);
+  fusion->addInput(low);
+  fusion->addInput(high);
+  TensorView* tv0 = uniform({size_val}, low, high, DataType::Float);
+  TensorView* tv1 = uniform({size_val}, low, high, DataType::Double);
+  fusion->addOutput(tv0);
+  fusion->addOutput(tv1);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+
+  for (int64_t size : {16, 1024, 10001, 10002, 10003, 100000, 10000001}) {
+    at::manual_seed(0);
+    auto cg_outputs = fec.runFusionWithInputs({size, -1.0, 1.0});
+
+    at::manual_seed(0);
+    auto ref0 = generate_uniform(size, kFloat) * 2 - 1;
+    auto ref1 = generate_uniform(size, kDouble) * 2 - 1;
+
+    testValidate(
+        fec.fusion(),
+        cg_outputs,
+        {size, -1.0, 1.0},
+        {ref0, ref1},
+        __LINE__,
+        __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionRandLikeReduction_CUDA) {
+  auto dtype = kFloat;
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeSymbolicTensor(2, aten_to_data_type(dtype));
+  fusion->addInput(tv0);
+  auto tv1 = sum(tv0, {0});
+  auto tv2 = rand_like(tv1);
+  auto tv3 = add(tv1, tv2);
+  fusion->addOutput(tv3);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+
+  auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+  at::Tensor t0 = at::zeros({2, 3}, options);
+
+  at::manual_seed(0);
+  auto cg_outputs = fec.runFusionWithInputs({t0});
+  auto out = cg_outputs[0];
+
+  at::manual_seed(0);
+  auto t1 = t0.sum(0);
+  auto t2 = generate_uniform(3, dtype).expand_as(t1);
+  auto t3 = t1.add(t2);
+
+  testValidate(fec.fusion(), {out}, {t0}, {t3}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_scheduler_utils.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_scheduler_utils.cpp
deleted file mode 100644
index 8e611364bd521..0000000000000
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_scheduler_utils.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-#if defined(USE_CUDA)
-#include <gmock/gmock-matchers.h>
-#include <gtest/gtest.h>
-
-#include <torch/csrc/jit/codegen/cuda/fusion.h>
-#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
-#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
-#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
-
-// Tests go in torch::jit
-namespace torch {
-namespace jit {
-
-using namespace torch::jit::fuser::cuda;
-
-TEST_F(NVFuserTest, FusionSplitDims_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int64_t* p = prime_numbers;
-  auto tv = makeConcreteTensor(
-      {p[0] * p[1] * p[2], p[3], p[4], p[5] * p[6], p[7], p[8], p[9] * p[10]});
-  std::vector<size_t> dims{0, 1, 2, 3, 4, 5, 6};
-  scheduler_utils::splitDims(
-      tv, {{0, p[2]}, {0, p[1]}, {3, p[6]}, {6, p[10]}}, dims);
-  TORCH_CHECK(tv->nDims() == 11);
-  for (auto i : c10::irange(11)) {
-    TORCH_CHECK(tv->axis(i)->extent()->evaluateInt() == p[i]);
-  }
-  std::vector<size_t> expect{0, 3, 4, 5, 7, 8, 9};
-  TORCH_CHECK(dims == expect);
-}
-
-TEST_F(NVFuserTest, FusionMergeDims_CUDA) {
-  Fusion fusion;
-  FusionGuard fg(&fusion);
-
-  int64_t* p = prime_numbers;
-  auto tv = makeConcreteTensor(
-      {p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10]});
-  std::vector<size_t> dims{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
-  auto merged = scheduler_utils::mergeDims(tv, {2, 3, 7, 8, 9}, dims);
-  TORCH_CHECK(merged == 2);
-  std::vector<int64_t> expect_shape{
-      p[0], p[1], p[2] * p[3] * p[7] * p[8] * p[9], p[4], p[5], p[6], p[10]};
-  TORCH_CHECK(tv->nDims() == expect_shape.size());
-  for (auto i : c10::irange(expect_shape.size())) {
-    TORCH_CHECK(tv->axis(i)->extent()->evaluateInt() == expect_shape[i]);
-  }
-  std::vector<size_t> expect_dims{0, 1, 2, 2, 3, 4, 5, 2, 2, 2, 6};
-  TORCH_CHECK(dims == expect_dims);
-}
-
-} // namespace jit
-} // namespace torch
-#endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
index b2302013f5fd9..d1f185011826e 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
@@ -2976,6 +2976,7 @@ TEST_F(NVFuserTest, FusionConv2D_CUDA) {
 TEST_F(NVFuserTest, FusionConv2DNoPadding_CUDA) {
   Fusion fusion;
   FusionGuard fg(&fusion);
+  ContextCudnnTF32Disabled disabling_tf32_cudnn;
 
   // Input: [C, H, W]
   auto inp = makeSymbolicTensor(3);
@@ -5394,6 +5395,72 @@ TEST_F(NVFuserTest, FusionGatherIterTypePromotion_CUDA) {
   testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
 }
 
+TEST_F(NVFuserTest, FusionContigPredicateShift_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<int64_t> shape({2, 2});
+
+  auto tv0 = makeConcreteTensor(shape);
+  // [0:I]
+  fusion.addInput(tv0);
+
+  // Below, tv2 and tv3 are mostly the same, except for tv2 is padded
+  // with 0, whereas tv3 is not, so the valid range of tv3 is [0:I-1]
+
+  // [0:I]
+  auto tv1 = shift(tv0, {-1, 0});
+
+  // [0:I-1]
+  auto tv2 = shift(tv0, {-1, 0}, false);
+
+  // tv3 is not an output of shift, but it gets a partial root
+  // domain from tv2, so it must be predicated at the root domain
+  auto tv3 = add(tv2, IrBuilder::create<Double>(1));
+
+  fusion.addOutput(tv1);
+  fusion.addOutput(tv3);
+
+  // contig merge
+  tv1->merge(0);
+  tv1->split(0, 4);
+  TransformPropagator propagator(tv1);
+  MaxRootDomainInfoSpanningTree(tv1).traverse(&propagator);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  // Create 3x2 and trim to 2x2. This would cause the output tensor
+  // non-zero values if not properly predicated.
+  at::Tensor t0 = at::randn({3, 2}, options);
+  t0 = t0.index(
+      {at::indexing::Slice(0, 2), at::indexing::Slice(0, at::indexing::None)});
+
+  // Use random output to detect invalid writes
+  at::Tensor t1 = at::rand_like(t0, options);
+  // Use zero-cleared output to detect invalid writes
+  at::Tensor t3 = at::zeros_like(t0, options);
+
+  std::vector<IValue> inputs = {t0};
+  std::vector<at::Tensor> outputs = {t1, t3};
+
+  std::vector<at::indexing::TensorIndex> indices{
+      at::indexing::Slice(0, -1), at::indexing::Slice(0, at::indexing::None)};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, inputs);
+  fe.runFusion(inputs, outputs);
+
+  // Make sure the padded region is zero filled
+  TORCH_CHECK(t1[1].equal(at::zeros(2, options)));
+  // Make sure not touched as the shift is not padded
+  TORCH_CHECK(t3[1].equal(at::zeros(2, options)));
+
+  auto ref = shift(t0, {-1, 0});
+
+  TORCH_CHECK(t1.equal(ref));
+  TORCH_CHECK(t3.index(indices).equal((ref + 1).index(indices)));
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp
index 15bdda0c0ec1c..06e93fcd579e3 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp
@@ -18,12 +18,193 @@ namespace jit {
 
 using namespace torch::jit::fuser::cuda;
 
+TEST_F(NVFuserTest, FusionStandaloneFull_CUDA) {
+  auto sizes = {0, 1, 10, 17, 1024};
+  auto dtypes = {
+      kBool,
+      kFloat,
+      kLong,
+      kDouble,
+      kHalf,
+      kBFloat16,
+      kInt,
+      kComplexFloat,
+      kComplexDouble};
+
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  Val* size = IrBuilder::create<Int>();
+  Val* fill_val1 = IrBuilder::create<Int>();
+  Val* fill_val2 = IrBuilder::create<Int>();
+  Val* fill_val3 = IrBuilder::create<Int>();
+  fusion->addInput(size);
+  fusion->addInput(fill_val1);
+  fusion->addInput(fill_val2);
+  fusion->addInput(fill_val3);
+  for (auto dtype : dtypes) {
+    if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) {
+      continue;
+    }
+    auto out_tv = full({size}, fill_val1, aten_to_data_type(dtype));
+    fusion->addOutput(out_tv);
+    out_tv = full({size, size}, fill_val2, aten_to_data_type(dtype));
+    fusion->addOutput(out_tv);
+    out_tv = full_like(out_tv, fill_val3);
+    fusion->addOutput(out_tv);
+  }
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  for (auto size : sizes) {
+    std::vector<at::Tensor> expect;
+    expect.reserve(dtypes.size());
+    for (auto dtype : dtypes) {
+      if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) {
+        continue;
+      }
+      const auto options =
+          at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+      expect.emplace_back(at::full({size}, 11, options));
+      expect.emplace_back(at::full({size, size}, 12, options));
+      expect.emplace_back(at::full({size, size}, 13, options));
+    }
+    auto cg_outputs = executor_cache.runFusionWithInputs({size, 11, 12, 13});
+
+    testValidate(
+        executor_cache.fusion(),
+        cg_outputs,
+        {size, 11, 12, 13},
+        expect,
+        __LINE__,
+        __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionStandaloneZeros_CUDA) {
+  auto sizes = {0, 1, 10, 17, 1024};
+  auto dtypes = {
+      kBool,
+      kFloat,
+      kLong,
+      kDouble,
+      kHalf,
+      kBFloat16,
+      kInt,
+      kComplexFloat,
+      kComplexDouble};
+
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  Val* size = IrBuilder::create<Int>();
+  fusion->addInput(size);
+  for (auto dtype : dtypes) {
+    if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) {
+      continue;
+    }
+    auto out_tv = zeros({size}, aten_to_data_type(dtype));
+    fusion->addOutput(out_tv);
+    out_tv = zeros({size, size}, aten_to_data_type(dtype));
+    fusion->addOutput(out_tv);
+    out_tv = zeros_like(out_tv);
+    fusion->addOutput(out_tv);
+  }
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  for (auto size : sizes) {
+    std::vector<at::Tensor> expect;
+    expect.reserve(dtypes.size());
+    for (auto dtype : dtypes) {
+      if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) {
+        continue;
+      }
+      const auto options =
+          at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+      expect.emplace_back(at::zeros({size}, options));
+      expect.emplace_back(at::zeros({size, size}, options));
+      expect.emplace_back(at::zeros({size, size}, options));
+    }
+    auto cg_outputs = executor_cache.runFusionWithInputs({size});
+
+    testValidate(
+        executor_cache.fusion(),
+        cg_outputs,
+        {size},
+        expect,
+        __LINE__,
+        __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionStandaloneOnes_CUDA) {
+  auto sizes = {0, 1, 10, 17, 1024};
+  auto dtypes = {
+      kBool,
+      kFloat,
+      kLong,
+      kDouble,
+      kHalf,
+      kBFloat16,
+      kInt,
+      kComplexFloat,
+      kComplexDouble};
+
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  Val* size = IrBuilder::create<Int>();
+  fusion->addInput(size);
+  for (auto dtype : dtypes) {
+    if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) {
+      continue;
+    }
+    auto out_tv = ones({size}, aten_to_data_type(dtype));
+    fusion->addOutput(out_tv);
+    out_tv = ones({size, size}, aten_to_data_type(dtype));
+    fusion->addOutput(out_tv);
+    out_tv = ones_like(out_tv);
+    fusion->addOutput(out_tv);
+  }
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  for (auto size : sizes) {
+    std::vector<at::Tensor> expect;
+    expect.reserve(dtypes.size());
+    for (auto dtype : dtypes) {
+      if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) {
+        continue;
+      }
+      const auto options =
+          at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+      expect.emplace_back(at::ones({size}, options));
+      expect.emplace_back(at::ones({size, size}, options));
+      expect.emplace_back(at::ones({size, size}, options));
+    }
+    auto cg_outputs = executor_cache.runFusionWithInputs({size});
+
+    testValidate(
+        executor_cache.fusion(),
+        cg_outputs,
+        {size},
+        expect,
+        __LINE__,
+        __FILE__);
+  }
+}
+
 TEST_F(NVFuserTest, FusionStandaloneARange_CUDA) {
   auto starts_ends = {-1., 0., 10.3, 1024. * 256};
   auto steps = {-1.5, 1., 2.};
   auto dtypes = {kFloat, kLong, kDouble};
 
   for (auto dtype : dtypes) {
+    if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) {
+      continue;
+    }
+
     auto fusion = std::make_unique<Fusion>();
     FusionGuard fg(fusion.get());
 
@@ -97,6 +278,62 @@ TEST_F(NVFuserTest, FusionStandaloneARange_CUDA) {
   }
 }
 
+TEST_F(NVFuserTest, FusionStandaloneEye_CUDA) {
+  auto sizes = {0, 1, 10, 17, 1024};
+  auto dtypes = {
+      kBool,
+      kFloat,
+      kLong,
+      kDouble,
+      kHalf,
+      kBFloat16,
+      kInt,
+      kComplexFloat,
+      kComplexDouble};
+
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  Val* size = IrBuilder::create<Int>();
+  Val* maybe_m = IrBuilder::create<Int>();
+  fusion->addInput(size);
+  fusion->addInput(maybe_m);
+  for (auto dtype : dtypes) {
+    if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) {
+      continue;
+    }
+    auto out_tv1 = eye(size, aten_to_data_type(dtype));
+    fusion->addOutput(out_tv1);
+    auto out_tv2 = eye(size, maybe_m, aten_to_data_type(dtype));
+    fusion->addOutput(out_tv2);
+  }
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+
+  for (auto size : sizes) {
+    std::vector<at::Tensor> expect;
+    expect.reserve(dtypes.size());
+    for (auto dtype : dtypes) {
+      if (!isSupportedTypeByDevice(aten_to_data_type(dtype))) {
+        continue;
+      }
+      const auto options =
+          at::TensorOptions().dtype(dtype).device(at::kCUDA, 0);
+      expect.emplace_back(at::eye(size, options));
+      expect.emplace_back(at::eye(size, 15, options));
+    }
+    auto cg_outputs = executor_cache.runFusionWithInputs({size, 15});
+
+    testValidate(
+        executor_cache.fusion(),
+        cg_outputs,
+        {size, 15},
+        expect,
+        __LINE__,
+        __FILE__);
+  }
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
index 5e8b6bc1bda69..b10360f00315e 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
@@ -3,7 +3,8 @@
 #include <gtest/gtest.h>
 
 #include <torch/csrc/jit/codegen/cuda/executor.h>
-#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
 #include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
 #include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
 #include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>
@@ -261,9 +262,11 @@ TEST_F(NVFuserTest, FusionScheduleTransposeSinTransposeCos_CUDA) {
   testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__);
 }
 
-// t0->transpose--.
-//                 |
-// t1->transpose---add-->sin->t5
+/*
+ * t0->transpose--.
+ *                 \
+ * t1->transpose---add-->sin->t5
+ */
 TEST_F(NVFuserTest, FusionScheduleTransposeMultipleInput_CUDA) {
   Fusion fusion;
   FusionGuard fg(&fusion);
@@ -325,10 +328,12 @@ TEST_F(NVFuserTest, FusionScheduleTransposeMultipleOutput_CUDA) {
       &fusion, outputs, {input}, {tv_ref1, tv_ref2}, __LINE__, __FILE__);
 }
 
-// t0->transpose->sin->t3
-//   \_.-->cos->t5
-//   /
-// t1
+/*
+ * t0->transpose->sin->t3
+ *   \_.-->cos->t5
+ *   /
+ * t1
+ */
 TEST_F(NVFuserTest, FusionScheduleTransposeMultipleInputOutput_CUDA) {
   Fusion fusion;
   FusionGuard fg(&fusion);
@@ -366,9 +371,11 @@ TEST_F(NVFuserTest, FusionScheduleTransposeMultipleInputOutput_CUDA) {
       __FILE__);
 }
 
-//             .------>sin------>z
-// x->transpose->transpose->add->y
-//  \_______________________/
+/*
+ *             .------>sin------>z
+ * x->transpose->transpose->add->y
+ *  \_______________________/
+ */
 TEST_F(NVFuserTest, FusionScheduleTransposeMatchingSkipConnection_CUDA) {
   Fusion fusion;
   FusionGuard fg(&fusion);
@@ -743,9 +750,7 @@ TEST_F(NVFuserTest, FusionManualScheduleTransposeComplexDAG1_CUDA) {
   }
 
   // inline
-  MaxRootDomainInfoSpanningTree entire_dag(tv9);
-  InlinePropagator inline_propagator(tv9, -1, ComputeAtMode::MostInlined);
-  entire_dag.traverse(&inline_propagator);
+  inlineMost();
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
   at::Tensor input0 = at::randn({512, 1024, 256}, options);
@@ -789,6 +794,61 @@ TEST_F(NVFuserTest, FusionViewNoTranspose_CUDA) {
   TORCH_CHECK(!hasAtLeastTwoValidGroups(&fusion));
 }
 
+TEST_F(NVFuserTest, FusionTransposeSelfMapping_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = transpose(tv0, 0, 1);
+  auto tv2 = add(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  EXPECT_THAT(
+      [&]() { IterDomainGraph(fusion_ptr.get()); },
+      testing::ThrowsMessage<c10::Error>(
+          testing::HasSubstr("Unsupported domain mapping detected")));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({5, 5}, options);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
+
+  auto ref = t0.transpose(0, 1) + t0;
+
+  testValidate(
+      executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+
+#if 0
+// silent wrong result
+TEST_F(NVFuserTest, FusionTransposeViewSelfMapping_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeContigTensor(2);
+  fusion.addInput(tv0);
+  auto tv1 = transpose(tv0, 0, 1);
+  auto tv2 = view(tv0, {2, 3}, {3, 2});
+  auto tv3 = add(tv1, tv2);
+  fusion.addOutput(tv3);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({2, 3}, options);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
+
+  auto ref = t0.transpose(0, 1) + t0.view({3, 2});
+
+  testValidate(
+      executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
+}
+#endif
+
 // t0------------.
 // t2->broadcast->sub->mul->relu->t6
 // t1------------------'
@@ -932,6 +992,269 @@ TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize3_CUDA) {
   testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__);
 }
 
+// x->sin->transpose->cos->y
+TEST_F(NVFuserTest, FusionScheduleTranspose2DSmallInnerSize_CUDA) {
+  std::array<std::vector<int64_t>, 2> shapes{
+      std::vector<int64_t>{1024 * 1024 * 128, 2},
+      std::vector<int64_t>{2, 1024 * 1024 * 128}};
+  for (const auto& shape : shapes) {
+    Fusion fusion;
+    FusionGuard fg(&fusion);
+
+    auto tv0 = makeContigTensor(2);
+    fusion.addInput(tv0);
+    auto tv1 = sin(tv0);
+    auto tv2 = transpose(tv1, 0, 1);
+    auto tv3 = cos(tv2);
+    fusion.addOutput(tv3);
+
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::Tensor input = at::randn(shape, options);
+
+    auto lparams = scheduleTranspose(&fusion, {input});
+
+    FusionExecutor fe;
+    fe.compileFusion(&fusion, {input}, lparams);
+    auto outputs = fe.runFusion({input}, lparams);
+
+    auto tv_ref = input.sin().transpose(0, 1).cos();
+
+    testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__);
+  }
+}
+
+TEST_F(NVFuserTest, FusionTransposeBankConflict1_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({32, 32});
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = transpose(tv1, 0, 1);
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+
+  auto bank_conflict_info = fusion.bankConflictInfo();
+
+  TORCH_CHECK(!bank_conflict_info.empty());
+  for (auto info : bank_conflict_info) {
+    std::pair<int, int> expect{32, 0};
+    TORCH_CHECK(info.second == expect);
+  }
+}
+
+TEST_F(NVFuserTest, FusionTransposeBankConflict2_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({32, 32});
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = transpose(tv1, 0, 1);
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  tv2->axis(0)->parallelize(ParallelType::TIDx);
+  tv3->axis(0)->parallelize(ParallelType::TIDx);
+
+  auto bank_conflict_info = fusion.bankConflictInfo();
+
+  TORCH_CHECK(!bank_conflict_info.empty());
+  for (auto info : bank_conflict_info) {
+    std::pair<int, int> expect{0, 32};
+    TORCH_CHECK(info.second == expect);
+  }
+}
+
+TEST_F(NVFuserTest, FusionTransposeBankConflict3_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({32, 32}, DataType::Bool);
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = transpose(tv1, 0, 1);
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+
+  auto bank_conflict_info = fusion.bankConflictInfo();
+
+  TORCH_CHECK(!bank_conflict_info.empty());
+  for (auto info : bank_conflict_info) {
+    std::pair<int, int> expect{8, 0};
+    TORCH_CHECK(info.second == expect);
+  }
+}
+
+TEST_F(NVFuserTest, FusionTransposeBankConflict4_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({32, 32});
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = transpose(tv1, 0, 1);
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->merge(0);
+  tv1->split(0, 4);
+  tv1->split(0, 8);
+  tv1->axis(-1)->parallelize(ParallelType::Vectorize);
+  tv1->axis(0)->parallelize(ParallelType::TIDx);
+  // T1 [TIDx(32), 8, V(4)]
+
+  tv2->setMemoryType(MemoryType::Shared);
+  tv2->merge(0);
+  tv2->split(0, 4);
+  tv2->split(0, 32);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+  // T2 [8, TIDx(32), 4]
+
+  tv3->merge(0);
+  tv3->split(0, 2);
+  tv3->split(0, 32);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+  // T3 [16, TIDx(32), 2]
+
+  auto bank_conflict_info = fusion.bankConflictInfo();
+
+  TORCH_CHECK(!bank_conflict_info.empty());
+  for (auto info : bank_conflict_info) {
+    std::pair<int, int> expect1{0, 8};
+    std::pair<int, int> expect2{8, 4};
+    std::pair<int, int> expect3{2, 0};
+    TORCH_CHECK(
+        info.second == expect1 || info.second == expect2 ||
+        info.second == expect3);
+  }
+}
+
+TEST_F(NVFuserTest, FusionTransposeBankConflict5_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({1024, 32, 32});
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = transpose(tv1, 1, 2);
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->axis(2)->parallelize(ParallelType::TIDx);
+  tv2->axis(2)->parallelize(ParallelType::TIDx);
+  tv3->axis(2)->parallelize(ParallelType::TIDx);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto bank_conflict_info = fusion.bankConflictInfo();
+
+  TORCH_CHECK(!bank_conflict_info.empty());
+  for (auto info : bank_conflict_info) {
+    std::pair<int, int> expect{32, 0};
+    TORCH_CHECK(info.second == expect);
+  }
+}
+
+TEST_F(NVFuserTest, FusionTransposeBankConflict6_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({1024, 32, 32});
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = transpose(tv1, 1, 2);
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->axis(2)->parallelize(ParallelType::TIDy);
+  tv2->axis(2)->parallelize(ParallelType::TIDy);
+  tv3->axis(2)->parallelize(ParallelType::TIDy);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto bank_conflict_info = fusion.bankConflictInfo();
+
+  TORCH_CHECK(!bank_conflict_info.empty());
+  for (auto info : bank_conflict_info) {
+    std::pair<int, int> expect{32, 0};
+    TORCH_CHECK(info.second == expect);
+  }
+}
+
+TEST_F(NVFuserTest, FusionTransposeBankConflict7_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({1024, 8, 8});
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = transpose(tv1, 1, 2);
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->axis(1)->parallelize(ParallelType::TIDx);
+  tv2->axis(1)->parallelize(ParallelType::TIDx);
+  tv3->axis(1)->parallelize(ParallelType::TIDx);
+  tv1->axis(2)->parallelize(ParallelType::TIDy);
+  tv2->axis(2)->parallelize(ParallelType::TIDy);
+  tv3->axis(2)->parallelize(ParallelType::TIDy);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto bank_conflict_info = fusion.bankConflictInfo();
+
+  TORCH_CHECK(!bank_conflict_info.empty());
+  for (auto info : bank_conflict_info) {
+    std::pair<int, int> expect{0, 2};
+    TORCH_CHECK(info.second == expect);
+  }
+}
+
+TEST_F(NVFuserTest, FusionTransposeBankConflict8_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({1024, 8, 8});
+  fusion.addInput(tv0);
+  auto tv1 = set(tv0);
+  auto tv2 = transpose(tv1, 1, 2);
+  auto tv3 = set(tv2);
+  fusion.addOutput(tv3);
+
+  tv1->setMemoryType(MemoryType::Shared);
+  tv1->axis(2)->parallelize(ParallelType::TIDx);
+  tv2->axis(2)->parallelize(ParallelType::TIDy);
+  tv3->axis(2)->parallelize(ParallelType::TIDy);
+  tv1->axis(0)->parallelize(ParallelType::BIDx);
+  tv2->axis(0)->parallelize(ParallelType::BIDx);
+  tv3->axis(0)->parallelize(ParallelType::BIDx);
+
+  auto bank_conflict_info = fusion.bankConflictInfo();
+
+  // no bank confliction
+  TORCH_CHECK(bank_conflict_info.empty());
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp
new file mode 100644
index 0000000000000..19c3c6f9bf6db
--- /dev/null
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp
@@ -0,0 +1,273 @@
+#if defined(USE_CUDA)
+#include <gmock/gmock-matchers.h>
+#include <gtest/gtest.h>
+
+#include <torch/csrc/jit/codegen/cuda/fusion.h>
+#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
+#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
+#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h>
+#include <torch/csrc/jit/codegen/cuda/test/test_utils.h>
+
+// Tests go in torch::jit
+namespace torch {
+namespace jit {
+
+using namespace torch::jit::fuser::cuda;
+
+TEST_F(NVFuserTest, FusionSplitDims_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int64_t* p = prime_numbers;
+  auto tv = makeConcreteTensor(
+      {p[0] * p[1] * p[2], p[3], p[4], p[5] * p[6], p[7], p[8], p[9] * p[10]});
+  std::vector<size_t> dims{0, 1, 2, 3, 4, 5, 6};
+  scheduler_utils::splitDims(
+      tv, {{0, p[2]}, {0, p[1]}, {3, p[6]}, {6, p[10]}}, dims);
+  TORCH_CHECK(tv->nDims() == 11);
+  for (auto i : c10::irange(11)) {
+    TORCH_CHECK(tv->axis(i)->extent()->evaluateInt() == p[i]);
+  }
+  std::vector<size_t> expect{0, 3, 4, 5, 7, 8, 9};
+  TORCH_CHECK(dims == expect);
+}
+
+TEST_F(NVFuserTest, FusionMergeDims_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int64_t* p = prime_numbers;
+  auto tv = makeConcreteTensor(
+      {p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10]});
+  std::vector<size_t> dims{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  auto merged = scheduler_utils::mergeDims(tv, {2, 3, 7, 8, 9}, dims);
+  TORCH_CHECK(merged == 2);
+  std::vector<int64_t> expect_shape{
+      p[0], p[1], p[2] * p[3] * p[7] * p[8] * p[9], p[4], p[5], p[6], p[10]};
+  TORCH_CHECK(tv->nDims() == expect_shape.size());
+  for (auto i : c10::irange(expect_shape.size())) {
+    TORCH_CHECK(tv->axis(i)->extent()->evaluateInt() == expect_shape[i]);
+  }
+  std::vector<size_t> expect_dims{0, 1, 2, 2, 3, 4, 5, 2, 2, 2, 6};
+  TORCH_CHECK(dims == expect_dims);
+}
+
+TEST_F(NVFuserTest, FusionReorderAsRFactor_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int a = 1, b = 2, c = 3, d = 4;
+
+  TensorView* tv0 = makeConcreteTensor({a, b, c, d});
+  fusion.addInput(tv0);
+  fusion.addOutput(tv0);
+
+  // [a, b, c, d]
+  tv0->merge(0, 2);
+  // [a*c, b, d]
+  tv0->split(1, 2);
+  // [a*c, bo, bi, d]
+  tv0->split(3, 3);
+  // [a*c, bo, bi, do, di]
+  tv0->reorder({{1, 4}, {2, 1}, {3, 3}, {4, 2}});
+  // [a*c, bi, di, do, bo]
+  tv0->merge(3);
+  tv0->merge(1);
+  // [a*c, bi*di, do*bo]
+  tv0->reorder({{0, 2}});
+  // [bi*di, do*bo, a*c]
+  // Order we want is:
+  // [a*c, do*bo, bi*di]
+  auto old2new = scheduler_utils::domainReorderAsRfactorMap(tv0);
+  TORCH_CHECK(old2new[0] == 2);
+  TORCH_CHECK(old2new[1] == 1);
+  TORCH_CHECK(old2new[2] == 0);
+}
+
+TEST_F(NVFuserTest, FusionDisjointViewSet_CUDA) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeConcreteTensor({2, 3, 4});
+  fusion->addInput(tv0);
+
+  auto tv1 = view(tv0, {2, 3, 4}, {2, 12});
+
+  auto tv2 = makeConcreteTensor({2, 12});
+  fusion->addInput(tv2);
+
+  auto tv3 = add(tv2, tv1);
+  fusion->addOutput(tv3);
+
+  auto disjoint_exact = scheduler_utils::disjointViewSets(fusion.get());
+
+  TORCH_INTERNAL_ASSERT(
+      disjoint_exact.strictAreMapped(tv0->axis(1), tv0->axis(2)));
+}
+
+TEST_F(NVFuserTest, FusionMatchingViews_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int x = 2, y = 3, z = 4;
+
+  auto tv0 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv0);
+
+  auto tv1 = view(tv0, {x, y, z}, {x * y, z});
+
+  auto tv2 = sin(tv1);
+
+  auto tv3 = view(tv2, {x * y, z}, {x, y * z});
+  fusion.addOutput(tv3);
+
+  auto tv4 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv4);
+
+  auto tv5 = view(tv4, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv5);
+
+  // Link 0 and 3 together for view analysis done based on before the views
+  // actually happened.
+  auto tv6 = add(tv0, tv4);
+  fusion.addOutput(tv6);
+
+  TORCH_INTERNAL_ASSERT(!scheduler_utils::allMatchingViews(&fusion));
+}
+
+TEST_F(NVFuserTest, FusionBroadcastViewMultiples_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int a = 2, b = 3, c = 5, d = 7, e = 11, f = 13;
+
+  auto tv0 = makeConcreteTensor({a, b, c, d, e, f});
+  fusion.addInput(tv0);
+
+  // tie e and f together (swapping values next to eachother enforces they'll be
+  // merged then split by view)
+  auto tv1 = view(tv0, {a, b, c, d, e, f}, {a, b, c, d, f, e});
+  fusion.addOutput(tv1);
+
+  // swap d and e
+  auto tv2 = transpose(tv1, 3, 4);
+  // tie c and e together
+  auto tv3 = view(tv2, {a, b, c, e, d, f}, {a, b, e, c, d, f});
+
+  fusion.addOutput(tv3);
+
+  auto tv4 = set(tv0);
+  // Use tv4 as the reference
+  fusion.addOutput(tv4);
+
+  // a, b, d aren't tied to anything so they are valid broadcasts from the
+  // perspective of broadcast multiples analysis.
+  auto tv5 = makeConcreteTensor({1, 1, c, 1, e, f});
+  fusion.addInput(tv5);
+
+  // c, e, and f are tied together so this shouldn't be counted as a broadcast
+  // dim in the reference since it's a partial bcast
+  auto tv6 = makeConcreteTensor({a, b, c, 1, 1, 1});
+  fusion.addInput(tv6);
+
+  // c, e, and f are tied together this should be counted as a broadcast dim in
+  // the reference since it's a partial bcast
+  auto tv7 = makeConcreteTensor({a, b, 1, 1, 1, 1});
+  fusion.addInput(tv7);
+
+  // plug the broadcasts into the fusion
+  auto tv8 = add(tv5, tv4);
+  auto tv9 = add(tv6, tv8);
+  auto tv10 = add(tv7, tv9);
+  fusion.addOutput(tv10);
+
+  auto bcast_info =
+      scheduler_utils::getBroadcastMultiples(tv4, DataType::Int32);
+
+  // linked c, e, and f together so they should have the same id.
+  TORCH_CHECK(bcast_info.view_disjoint_set_ids[5] == 0);
+  TORCH_CHECK(bcast_info.view_disjoint_set_ids[4] == 0);
+  TORCH_CHECK(bcast_info.view_disjoint_set_ids[3] == 1);
+  TORCH_CHECK(bcast_info.view_disjoint_set_ids[2] == 0);
+  TORCH_CHECK(bcast_info.view_disjoint_set_ids[1] == 2);
+  TORCH_CHECK(bcast_info.view_disjoint_set_ids[0] == 3);
+
+  TORCH_CHECK(
+      scheduler_utils::breakIsDisjoint(bcast_info.view_disjoint_set_ids, 0));
+  TORCH_CHECK(
+      scheduler_utils::breakIsDisjoint(bcast_info.view_disjoint_set_ids, 1));
+  TORCH_CHECK(
+      scheduler_utils::breakIsDisjoint(bcast_info.view_disjoint_set_ids, 2));
+  TORCH_CHECK(
+      !scheduler_utils::breakIsDisjoint(bcast_info.view_disjoint_set_ids, 3));
+  TORCH_CHECK(
+      !scheduler_utils::breakIsDisjoint(bcast_info.view_disjoint_set_ids, 4));
+  TORCH_CHECK(
+      !scheduler_utils::breakIsDisjoint(bcast_info.view_disjoint_set_ids, 5));
+
+  // tv0  [a, b, c, d, e, f]
+  // tv1  [a, b, c, d, e, f]
+  // tv3  [a, b, c, d, e, f]
+  // tv4  [a, b, c, d, e, f]
+  // tv5  [1, 1, c, 1, e, f] -> Left bcasts should show up in some multiples
+  // tv6  [a, b, c, 1, 1, 1] -> view interferes with bcasts, non of these should
+  //                            show up
+  // tv7  [a, b, 1, 1, 1, 1] -> These broadcasts could be recognized
+  // tv10 [a, b, c, d, e, f]
+
+  TORCH_CHECK(
+      bcast_info.broadcast_multiples[0].lhs_multiple == 0 &&
+      bcast_info.broadcast_multiples[0].rhs_multiple == 8 * 4);
+
+  TORCH_CHECK(
+      bcast_info.broadcast_multiples[1].lhs_multiple == 7 * 4 &&
+      bcast_info.broadcast_multiples[1].rhs_multiple == 8 * 4);
+
+  TORCH_CHECK(
+      bcast_info.broadcast_multiples[2].lhs_multiple == 7 * 4 &&
+      bcast_info.broadcast_multiples[2].rhs_multiple == 7 * 4);
+
+  TORCH_CHECK(
+      bcast_info.broadcast_multiples[3].lhs_multiple == 8 * 4 &&
+      bcast_info.broadcast_multiples[3].rhs_multiple == 7 * 4);
+
+  TORCH_CHECK(
+      bcast_info.broadcast_multiples[4].lhs_multiple == 8 * 4 &&
+      bcast_info.broadcast_multiples[4].rhs_multiple == 7 * 4);
+
+  TORCH_CHECK(
+      bcast_info.broadcast_multiples[5].lhs_multiple == 8 * 4 &&
+      bcast_info.broadcast_multiples[5].rhs_multiple == 7 * 4);
+}
+
+TEST_F(NVFuserTest, FusionTVDomainGuard_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  std::vector<bool> all_true = {true, true};
+  std::vector<bool> all_false = {false, false};
+  std::vector<bool> false_true = {false, true};
+  auto tv = TensorViewBuilder().ndims(2).contiguity(false_true).build();
+  TORCH_CHECK(tv->domain()->contiguity() == false_true);
+  {
+    auto guard = ir_utils::overrideContiguityGuard(tv, true);
+    TORCH_CHECK(tv->domain()->contiguity() == all_true);
+  }
+  TORCH_CHECK(tv->domain()->contiguity() == false_true);
+  {
+    auto guard = ir_utils::overrideContiguityGuard(tv, false);
+    TORCH_CHECK(tv->domain()->contiguity() == all_false);
+  }
+  TORCH_CHECK(tv->domain()->contiguity() == false_true);
+  {
+    auto guard1 = ir_utils::overrideContiguityGuard(tv, true);
+    auto guard2 = std::move(guard1);
+    TORCH_CHECK(tv->domain()->contiguity() == all_true);
+  }
+  TORCH_CHECK(tv->domain()->contiguity() == false_true);
+}
+
+} // namespace jit
+} // namespace torch
+#endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h
index 0adaaa9786c30..f70c7a80f76fb 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h
@@ -1,3 +1,5 @@
+#pragma once
+
 #include <torch/csrc/jit/codegen/cuda/executor_utils.h>
 #include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
@@ -5,43 +7,16 @@
 #include <torch/csrc/jit/codegen/cuda/lower_utils.h>
 
 #include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <torch/torch.h>
 
 #include <unordered_map>
 
+// Tests go in torch::jit
 namespace torch {
 namespace jit {
-namespace fuser {
-namespace cuda {
-
-inline bool deviceMajorMinorCheck(int major, int minor = 0) {
-  auto dev_prop = at::cuda::getCurrentDeviceProperties();
-  if (dev_prop->major < major ||
-      (dev_prop->major == major && dev_prop->minor < minor)) {
-    return false;
-  }
-  return true;
-}
 
-inline int deviceSMCount() {
-  int sm_count = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
-  return sm_count;
-}
+using namespace torch::jit::fuser::cuda;
 
-class NVFuserTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    // requires PASCAL or newer
-    if (!deviceMajorMinorCheck(6)) {
-      GTEST_SKIP() << "skipping tests on pre-PASCAL GPUs";
-    }
-  }
-
-  void TearDown() override {
-    c10::cuda::CUDACachingAllocator::emptyCache();
-  }
-};
+namespace {
 
 struct ValidationConstants {
   // Tolerances generated from randn + add + sum fusion
@@ -72,8 +47,6 @@ struct ValidationConstants {
   double base_float_rel_tol = -1;
 };
 
-namespace {
-
 // Returns abs and relative values to use for validation
 std::pair<double, double> getTolerance(
     DataType dtype,
@@ -336,15 +309,13 @@ ExpressionEvaluator bindInputsAndLaunchParams(
   return expr_eval;
 }
 
-} // namespace
-
 // Validation will look through the fusion and figure out how many elements were
 // reduced to create each output. It will then compute a tolernace to use for
 // allclose based on experimental results. The experimental results were based
 // on adding two tensors then summing them. This of course has an assumption
 // that we're always summing values between -2 and 2. If we start summing values
 // larger than that this approach might not hold.
-inline void testValidate(
+void testValidate(
     Fusion* fusion,
     const std::vector<at::Tensor>& fusion_outputs,
     const at::ArrayRef<IValue>& aten_inputs,
@@ -464,18 +435,6 @@ inline void testValidate(
   }
 }
 
-inline void clearL2Cache() {
-  torch::NoGradGuard no_grad;
-  auto l2_cache_size = at::cuda::getCurrentDeviceProperties()->l2CacheSize;
-  auto options =
-      torch::TensorOptions().dtype(torch::kFloat32).device(at::kCUDA, 0);
-
-  auto l2_elems = l2_cache_size / 4;
-  torch::Tensor t0 = torch::empty(l2_elems, options);
-  torch::Tensor t1 = torch::clone(t0);
-};
-
-} // namespace cuda
-} // namespace fuser
+} // namespace
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp
index 60194ade674d1..3892762298e14 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp
@@ -10,6 +10,7 @@
 #include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
 #include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
+#include <torch/csrc/jit/codegen/cuda/inlining.h>
 #include <torch/csrc/jit/codegen/cuda/interface.h>
 #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
 #include <torch/csrc/jit/codegen/cuda/ir_builder.h>
@@ -22,6 +23,7 @@
 #include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
 #include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
 #include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <torch/csrc/jit/codegen/cuda/lower_divisible_split.h>
 #include <torch/csrc/jit/codegen/cuda/mutator.h>
 #include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
 #include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
@@ -1341,6 +1343,517 @@ TEST_F(NVFuserTest, FusionReductionFlatten1_CUDA) {
       executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
 }
 
+TEST_F(NVFuserTest, FusionPwiseViewSchedule_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int x = 31, y = 65, z = 103;
+
+  auto tv0 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv2);
+
+  auto tv3 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv3);
+
+  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv4);
+
+  // Link 0 and 3 together for view analysis done based on before the views
+  // actually happened.
+  auto tv5 = add(tv0, tv3);
+  fusion.addOutput(tv5);
+
+  TORCH_INTERNAL_ASSERT(scheduler_utils::allMatchingViews(&fusion));
+  {
+    TransformPropagator propagator(tv4);
+    MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
+  }
+
+  for (auto i : c10::irange(tv5->nDims() - 1)) {
+    tv5->merge(0);
+  }
+  tv5->split(0, 32);
+  tv5->split(0, 4);
+  tv5->axis(0)->parallelize(ParallelType::BIDx);
+  tv5->axis(1)->parallelize(ParallelType::Unroll);
+  tv5->axis(2)->parallelize(ParallelType::TIDx);
+
+  {
+    TransformPropagator propagator(tv5);
+    MaxRootDomainInfoSpanningTree spanning_tree(tv5);
+    spanning_tree.traverse(&propagator);
+    scheduler_utils::parallelizeAllLike(tv5);
+
+    // Inline the schedule
+    inlineMost();
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y, z}, options);
+  at::Tensor t3 = at::randn({x, y, z}, options);
+  auto t1 = sin(t0);
+  auto t2 = at::native::view(t1, {x, y * z});
+  auto t4 = at::native::view(t3, {x, y * z});
+  auto t5 = t0 + t3;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t3});
+  auto cg_outputs = fe.runFusion({t0, t3});
+
+  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t4, t5}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionSumViewSchedule_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  int x = 31, y = 65, z = 103;
+
+  auto tv0 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv2);
+
+  auto tv3 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv3);
+
+  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
+  auto tv5 = sum(tv4, {1});
+  fusion.addOutput(tv5);
+
+  // Link 0 and 3 together for view analysis done based on before the views
+  // actually happened.
+  auto tv6 = add(tv0, tv3);
+  fusion.addOutput(tv6);
+
+  TORCH_INTERNAL_ASSERT(scheduler_utils::allMatchingViews(&fusion));
+  {
+    TransformPropagator propagator(tv4);
+    MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
+  }
+
+  tv5->split(1, 128);
+  tv5->split(1, 4);
+
+  auto tv5_rf = tv5->rFactor({1, 2});
+  tv5_rf->axis(0)->parallelize(ParallelType::BIDx);
+  tv5_rf->axis(2)->parallelize(ParallelType::Unroll);
+  tv5_rf->axis(3)->parallelize(ParallelType::TIDx);
+
+  {
+    TransformPropagator propagator(tv5_rf);
+    MaxRootDomainInfoSpanningTree spanning_tree(tv5_rf);
+    spanning_tree.traverse(&propagator);
+    scheduler_utils::parallelizeAllLike(tv5_rf);
+
+    // Inline the schedule
+    inlineMost();
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y, z}, options);
+  at::Tensor t3 = at::randn({x, y, z}, options);
+  auto t1 = sin(t0);
+  auto t2 = at::native::view(t1, {x, y * z});
+  auto t4 = at::native::view(t3, {x, y * z});
+  auto t5 = t4.sum({1});
+  auto t6 = t0 + t3;
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t3});
+  auto cg_outputs = fe.runFusion({t0, t3});
+
+  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t5, t6}, __LINE__, __FILE__);
+}
+
+// Make sure matching views are segmented into the same kernel
+TEST_F(NVFuserTest, FusionViewMagicSchedule1_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int x = 31, y = 65, z = 103;
+
+  auto tv0 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv2);
+
+  auto tv3 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv3);
+
+  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv4);
+
+  // Link 0 and 3 together for view analysis done based on before the views
+  // actually happened.
+  auto tv5 = add(tv0, tv3);
+  fusion.addOutput(tv5);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y, z}, options);
+  at::Tensor t3 = at::randn({x, y, z}, options);
+  auto t1 = sin(t0);
+  auto t2 = at::native::view(t1, {x, y * z});
+  auto t4 = at::native::view(t3, {x, y * z});
+  auto t5 = t0 + t3;
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3});
+  TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
+
+  testValidate(&fusion, cg_outputs, {t0, t3}, {t2, t4, t5}, __LINE__, __FILE__);
+}
+
+// Make sure views of views are correct
+TEST_F(NVFuserTest, FusionViewMagicSchedule2_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int x = 31, y = 65, z = 103;
+
+  auto tv0 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
+  auto tv3 = view(tv2, {x, y * z}, {x * y, z});
+  auto tv4 = view(tv3, {x * y, z}, {y, x * z});
+  auto tv5 = view(tv4, {y, x * z}, {x, y, z});
+  fusion.addOutput(tv5);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y, z}, options);
+  auto aten_out = sin(t0);
+
+  // For now pointwise scheduler only accepts a single view at a time, so this
+  // will be broken up into multiple kernels. This is due to the reference check
+  // looking for all mappings to all input IDs.
+  // TODO: Fix the reference check for this case
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
+
+  testValidate(&fusion, cg_outputs, {t0}, {aten_out}, __LINE__, __FILE__);
+}
+
+// Make sure broadcasts not on the view path that don't interfere with view are
+// segmented in one kernel and correctly trigger 2D pointwise scheduling
+TEST_F(NVFuserTest, FusionViewMagicSchedule3_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int w = 15, x = 31, y = 49, z = 65;
+
+  auto tv0 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv2);
+
+  auto tv3 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv3);
+
+  auto tv4 = view(tv3, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv4);
+
+  // Link 0 and 3 together for view analysis done based on before the views
+  // actually happened.
+  auto tv5 = add(tv0, tv3);
+  fusion.addOutput(tv5);
+
+  // Broadcast on another branch to drive the pointwise reference to not be on
+  // the view paths.
+
+  auto tv6 = makeConcreteTensor({w, x, y, z});
+  fusion.addInput(tv6);
+  auto tv7 = broadcast(tv0, {true, false, false, false});
+  auto tv8 = add(tv6, tv7);
+  // tv8 should be the reference for the pointwise fusion. This broadcast
+  // pattern doesn't interfere with the views, so this should also be scheduled
+  // as 2D.
+  fusion.addOutput(tv8);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y, z}, options);
+  at::Tensor t3 = at::randn({x, y, z}, options);
+  auto t1 = sin(t0);
+  auto t2 = at::native::view(t1, {x, y * z});
+  auto t4 = at::native::view(t3, {x, y * z});
+  auto t5 = t0 + t3;
+  at::Tensor t6 = at::randn({w, x, y, z}, options);
+  auto t8 = t6.add(t0);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  // Collect the heuristic params
+  executor_cache.profile(true);
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t6});
+
+  TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
+  TORCH_CHECK(executor_cache.getMostRecentExecutorInfo()
+                  .params->isA<PointwiseParams>());
+  auto pparams =
+      executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
+  TORCH_CHECK(pparams->break_point == 1);
+
+  testValidate(
+      &fusion, cg_outputs, {t0, t3, t6}, {t2, t4, t5, t8}, __LINE__, __FILE__);
+}
+
+// Make sure broadcasts through views when not conflicting with view are
+// segmented into one kernel and trigger 2D pointwise scheduler.
+TEST_F(NVFuserTest, FusionViewMagicSchedule4_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int w = 15, x = 31, y = 49, z = 65;
+
+  auto tv0 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = view(tv1, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv2);
+
+  auto tv3 = makeConcreteTensor({x, y, z});
+  fusion.addInput(tv3);
+
+  auto tv4 = makeConcreteTensor({x, 1, 1});
+  fusion.addInput(tv4);
+
+  auto tv5 = add(tv4, tv3);
+
+  auto tv6 = view(tv5, {x, y, z}, {x, y * z});
+  fusion.addOutput(tv6);
+
+  // Link 0 and 3 together for view analysis done based on before the views
+  // actually happened.
+  auto tv7 = add(tv0, tv3);
+  fusion.addOutput(tv7);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, y, z}, options);
+  at::Tensor t3 = at::randn({x, y, z}, options);
+  at::Tensor t4 = at::randn({x, 1, 1}, options);
+  auto t1 = sin(t0);
+  auto t2 = at::native::view(t1, {x, y * z});
+  auto t5 = t4 + t3;
+  auto t6 = at::native::view(t5, {x, y * z});
+  auto t7 = t0 + t3;
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  // Collect the heuristic params
+  executor_cache.profile(true);
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3, t4});
+
+  TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
+  TORCH_CHECK(executor_cache.getMostRecentExecutorInfo()
+                  .params->isA<PointwiseParams>());
+  auto pparams =
+      executor_cache.getMostRecentExecutorInfo().params->as<PointwiseParams>();
+  TORCH_CHECK(pparams->break_point == 1);
+
+  testValidate(
+      &fusion, cg_outputs, {t0, t3, t4}, {t2, t6, t7}, __LINE__, __FILE__);
+}
+
+// Make sure different views that are consumed by the reference are segmented
+// into a single kernel.
+TEST_F(NVFuserTest, FusionViewMagicSchedule5_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int w = 15, x = 31, y = 49, z = 65;
+
+  auto tv0 = makeConcreteTensor({w, x, y * z});
+  fusion.addInput(tv0);
+  auto tv1 = sin(tv0);
+  auto tv2 = view(tv1, {w, x, y * z}, {z, y, x, w});
+
+  auto tv3 = makeConcreteTensor({w, x * y, z});
+  fusion.addInput(tv3);
+  auto tv4 = cos(tv3);
+  auto tv5 = view(tv4, {w, x * y, z}, {z, y, x, w});
+
+  auto tv6 = add(tv2, tv5);
+  fusion.addOutput(tv6);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({w, x, y * z}, options);
+  auto t1 = sin(t0);
+  auto t2 = at::native::view(t1, {z, y, x, w});
+  at::Tensor t3 = at::randn({w, x * y, z}, options);
+  auto t4 = cos(t3);
+  auto t5 = at::native::view(t4, {z, y, x, w});
+  auto t6 = add(t2, t5);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  // Collect the heuristic params
+  executor_cache.profile(true);
+  auto cg_outputs = executor_cache.runFusionWithInputs({t0, t3});
+
+  TORCH_CHECK(!executor_cache.getMostRecentKernelRuntime()->isSegmented());
+  TORCH_CHECK(executor_cache.getMostRecentExecutorInfo()
+                  .params->isA<PointwiseParams>());
+
+  testValidate(&fusion, cg_outputs, {t0, t3}, {t6}, __LINE__, __FILE__);
+}
+
+// Make sure different views that are consumed by the reference are segmented
+// into a single kernel.
+TEST_F(NVFuserTest, FusionViewMapping_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int w = 15, x = 31, y = 49, z = 65;
+
+  auto tv0 = makeConcreteTensor({w, x, y * z});
+  fusion.addInput(tv0);
+  auto tv1 = sin(tv0);
+  auto tv2 = view(tv1, {w, x, y * z}, {z, y, x, w});
+
+  auto tv3 = makeConcreteTensor({w, x * y, z});
+  fusion.addInput(tv3);
+  auto tv4 = cos(tv3);
+  auto tv5 = view(tv4, {w, x * y, z}, {z, y, x, w});
+
+  auto tv6 = add(tv2, tv5);
+  fusion.addOutput(tv6);
+
+  tv6->merge(0);
+  tv6->merge(0);
+  tv6->merge(0);
+  tv6->split(0, 128);
+  tv6->split(0, 4);
+  tv6->axis(0)->parallelize(ParallelType::BIDx);
+  tv6->axis(1)->parallelize(ParallelType::Unroll);
+  tv6->axis(2)->parallelize(ParallelType::TIDx);
+
+  TransformPropagator propagator(tv6);
+  MaxRootDomainInfoSpanningTree spanning_tree(tv6);
+  spanning_tree.traverse(&propagator);
+  scheduler_utils::parallelizeAllLike(tv6);
+
+  // Inline the schedule
+  inlineMost();
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({w, x, y * z}, options);
+  auto t1 = sin(t0);
+  auto t2 = at::native::view(t1, {z, y, x, w});
+  at::Tensor t3 = at::randn({w, x * y, z}, options);
+  auto t4 = cos(t3);
+  auto t5 = at::native::view(t4, {z, y, x, w});
+  auto t6 = add(t2, t5);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, {t0, t3});
+  auto cg_outputs = fe.runFusion({t0, t3});
+
+  testValidate(&fusion, cg_outputs, {t0, t3}, {t6}, __LINE__, __FILE__);
+}
+
+TEST_F(NVFuserTest, FusionLowerDivisibleSplits_CUDA) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr.get();
+  FusionGuard fg(&fusion);
+
+  int w = 15, x = 31, y = 49, z = 65;
+
+  auto tv0 = makeContigTensor(4);
+  fusion.addInput(tv0);
+  auto tv1 = sin(tv0);
+  auto tv2 = view(tv1, {w, x, y, z}, {z, y, x, w});
+
+  fusion.addOutput(tv2);
+
+  tv2->merge(0)->merge(0)->merge(0)->split(0, 4)->split(0, 8, false);
+
+  TransformPropagator propagator(tv2);
+  MaxRootDomainInfoSpanningTree spanning_tree(tv2);
+  spanning_tree.traverse(&propagator);
+  scheduler_utils::parallelizeAllLike(tv2);
+
+  // Inline the schedule
+  inlineMost();
+
+  auto divisible_splits = getAllDivisibleSplits(&fusion);
+
+  // Operations on all tensors are basically:
+  // [10] merge(0)          [9]->outer->definition
+  // [9] merge(0)           [8]->outer->definition
+  // [8] merge(0)           [7]->in->definition
+  // [7] split(0, z, false) [6]->in->definition
+  // [6] split(1, y, false) [5]->in->definition
+  // [5] split(2, x, false) [3]->inner->definition
+  // RFactor of tv2
+  // [4] merge(0)           [3]->outer->definition
+  // [3] merge(0)           [2]->outer->definition
+  // [2] merge(0)           [1]->in->definition
+  // [1] split(0, 4)        [0]->in->definition
+  // [0] split(0, 8, false) tv->axis(0)->definition
+
+  for (auto tv : std::vector<TensorView*>({tv2, tv1, tv0})) {
+    auto transform_0 = tv->axis(0)->definition()->as<Split>();
+    auto transform_1 = transform_0->in()->definition()->as<Split>();
+    auto transform_2 = transform_1->in()->definition()->as<Merge>();
+    auto transform_3 = transform_2->outer()->definition()->as<Merge>();
+
+    auto transform_5 = transform_3->inner()->definition()->as<Split>();
+    auto transform_6 = transform_5->in()->definition()->as<Split>();
+    auto transform_7 = transform_6->in()->definition()->as<Split>();
+
+    TORCH_CHECK(
+        divisible_splits.find(transform_5) != divisible_splits.end(),
+        "Expecting: ",
+        transform_5->toString(),
+        "\nFrom TV: ",
+        tv,
+        "\nTo be a divisible split.");
+    TORCH_CHECK(
+        divisible_splits.find(transform_6) != divisible_splits.end(),
+        "Expecting: ",
+        transform_6->toString(),
+        "\nFrom TV: ",
+        tv,
+        "\nTo be a divisible split.");
+    TORCH_CHECK(
+        divisible_splits.find(transform_7) != divisible_splits.end(),
+        "Expecting: ",
+        transform_7->toString(),
+        "\nFrom TV: ",
+        tv,
+        "\nTo be a divisible split.");
+  }
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)
diff --git a/torch/csrc/jit/codegen/cuda/test/test_utils.h b/torch/csrc/jit/codegen/cuda/test/test_utils.h
index c8bf546daf4a0..8b199b930f247 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_utils.h
+++ b/torch/csrc/jit/codegen/cuda/test/test_utils.h
@@ -1,8 +1,21 @@
 #pragma once
 
-#include <cstddef>
-
+#include <torch/csrc/jit/codegen/cuda/executor.h>
+#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
 #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
+#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
+#include <torch/csrc/jit/codegen/cuda/lower2device.h>
+#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
+#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
+
+#include <ATen/Context.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <torch/torch.h>
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
 
 // Tests go in torch::jit
 namespace torch {
@@ -11,7 +24,7 @@ namespace jit {
 using namespace torch::jit::fuser::cuda;
 
 namespace {
-
+bool var;
 // Make a tensor that is known to be fully contiguous of dimensionality=ndims,
 // but unknown sizes
 TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float) {
@@ -84,6 +97,277 @@ int64_t prime_numbers[] = {
     1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,
     1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223};
 
+bool deviceMajorMinorCheck(int major, int minor = 0) {
+  auto dev_prop = at::cuda::getCurrentDeviceProperties();
+  if (dev_prop->major < major ||
+      (dev_prop->major == major && dev_prop->minor < minor)) {
+    return false;
+  }
+  return true;
+}
+
+int deviceSMCount() {
+  int sm_count = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  return sm_count;
+}
+
+void clearL2Cache() {
+  torch::NoGradGuard no_grad;
+  auto l2_cache_size = at::cuda::getCurrentDeviceProperties()->l2CacheSize;
+  auto options =
+      torch::TensorOptions().dtype(torch::kFloat32).device(at::kCUDA, 0);
+
+  auto l2_elems = l2_cache_size / 4;
+  torch::Tensor t0 = torch::empty(l2_elems, options);
+  torch::Tensor t1 = torch::clone(t0);
+};
+
+TensorView* loweredTv(TensorView* tv, GpuLower& gpulw) {
+  auto used_tvs = ir_utils::allTvs(gpulw.kernel()->as<Fusion>());
+  TensorView* matching_tv = nullptr;
+  for (auto lowered_tv : used_tvs) {
+    if (lowered_tv->name() == tv->name()) {
+      matching_tv = lowered_tv;
+    }
+  }
+  TORCH_INTERNAL_ASSERT(matching_tv != nullptr);
+  return matching_tv;
+}
+
+class PredicatedChecker : public kir::IrVisitor {
+ public:
+  // Checks if the provided tv is written to within a non-trivial conditional
+  static bool isPredicated(TensorView* tv, GpuLower& gpulw) {
+    PredicatedChecker checker(
+        loweredTv(tv, gpulw), gpulw.kernel()->topLevelExprs());
+    return checker.is_predicated_;
+  }
+
+ private:
+  PredicatedChecker() = delete;
+
+  PredicatedChecker(TensorView* tv, std::vector<Expr*> exprs) : tv_(tv) {
+    kir::IrVisitor::handle(exprs);
+  }
+
+  using kir::IrVisitor::handle;
+  bool is_predicated_ = false;
+  bool predicated_ite_ = false;
+  TensorView* tv_ = nullptr;
+
+  void handle(kir::IfThenElse* ite) final {
+    auto prev_ite = predicated_ite_;
+    predicated_ite_ = !ite->predicate()->value()->isConstScalar();
+    kir::IrVisitor::handle(ite);
+    predicated_ite_ = prev_ite;
+  }
+
+  void handle(Expr* expr) final {
+    if (expr->outputs().size() && expr->outputs()[0]->isA<kir::TensorIndex>()) {
+      auto ti = expr->outputs()[0]->as<kir::TensorIndex>();
+      if (ti->view() == tv_) {
+        is_predicated_ = is_predicated_ | predicated_ite_;
+        if (expr->predicate() != nullptr &&
+            !expr->predicate()->value()->isConst()) {
+          is_predicated_ = true;
+        }
+      }
+    }
+    kir::IrVisitor::handle(expr);
+  }
+};
+
+class UnswitchInElseChecker : public kir::IrVisitor {
+ public:
+  // Checks if there are any unswitched for loops within an else clause
+  static bool check(GpuLower& gpulw) {
+    UnswitchInElseChecker checker(gpulw.kernel()->topLevelExprs());
+    return checker.found_in_else_;
+  }
+
+ private:
+  UnswitchInElseChecker() = delete;
+  UnswitchInElseChecker(std::vector<Expr*> exprs) {
+    kir::IrVisitor::handle(exprs);
+  }
+
+  using kir::IrVisitor::handle;
+  bool within_else_ = false;
+  bool found_in_else_ = false;
+
+  void handle(kir::IfThenElse* ite) final {
+    auto prev_within_else = within_else_;
+    within_else_ = true;
+    kir::IrVisitor::handle(ite->elseBody().exprs());
+    within_else_ = prev_within_else;
+  }
+
+  void handle(kir::ForLoop* for_loop) final {
+    if (for_loop->iter_domain()->getParallelType() == ParallelType::Unswitch) {
+      found_in_else_ = found_in_else_ || within_else_;
+    }
+    kir::IrVisitor::handle(for_loop);
+  }
+};
+
+class PredicateMagicZeroChecker : public kir::IrVisitor {
+ public:
+  // Checks if all predicated domains of the provided tv are protected with
+  // magic zero
+  static bool isProtected(TensorView* tv, GpuLower& gpulw) {
+    PredicateMagicZeroChecker checker(
+        loweredTv(tv, gpulw), gpulw.kernel()->topLevelExprs());
+    return checker.is_protected_;
+  }
+
+ private:
+  using kir::IrVisitor::handle;
+
+  PredicateMagicZeroChecker(TensorView* tv, std::vector<Expr*> exprs)
+      : tv_(tv) {
+    handle(exprs);
+  }
+
+  void handle(kir::IfThenElse* ite) final {
+    auto prev_predicate = predicate_;
+    predicate_ = ite->predicate()->value();
+    kir::IrVisitor::handle(ite);
+    predicate_ = prev_predicate;
+  }
+
+  void handle(Expr* expr) final {
+    if (expr->outputs().size() && expr->outputs()[0]->isA<kir::TensorIndex>()) {
+      auto ti = expr->outputs()[0]->as<kir::TensorIndex>();
+      if (ti->view() == tv_) {
+        is_protected_ = checkPredicateOfTensor(predicate_);
+        return;
+      }
+    }
+
+    if (expr->isA<kir::ForLoop>()) {
+      handle(expr->as<kir::ForLoop>());
+    } else if (expr->isA<kir::IfThenElse>()) {
+      handle(expr->as<kir::IfThenElse>());
+    } else {
+      for (auto input : expr->inputs()) {
+        handle(input);
+      }
+    }
+  }
+
+  // Return true If all predicated domains are protected
+  bool checkPredicateOfTensor(Val* predicate) {
+    auto id_predicates = decomposeCompoundPredicate(predicate);
+    for (auto id_predicate : id_predicates) {
+      // Just check if nvfuser_zero is used. Not perfect but probably
+      // good enough.
+      is_magic_zero_found_ = false;
+      handle(id_predicate);
+      if (!is_magic_zero_found_) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Decompose "X && Y" to a vector of {X, Y}.
+  std::vector<Val*> decomposeCompoundPredicate(Val* predicate) {
+    if (auto binary_op = dynamic_cast<BinaryOp*>(predicate->definition())) {
+      if (binary_op->getBinaryOpType() == BinaryOpType::And) {
+        auto pred = decomposeCompoundPredicate(binary_op->lhs());
+        auto rhs_pred = decomposeCompoundPredicate(binary_op->rhs());
+        pred.insert(pred.end(), rhs_pred.begin(), rhs_pred.end());
+        return pred;
+      }
+    }
+
+    return {predicate};
+  }
+
+  void handle(Val* val) final {
+    if (isMagicZero(val)) {
+      is_magic_zero_found_ = true;
+      return;
+    }
+
+    auto def = val->definition();
+    if (def != nullptr) {
+      handle(def);
+    }
+  }
+
+ private:
+  bool is_protected_ = false;
+  Val* predicate_ = nullptr;
+  TensorView* tv_ = nullptr;
+  bool is_magic_zero_found_ = false;
+};
+
+// Basically just TransformPropagator, except that it checks the consistency
+// replayPasC with getMatchedLeafPosWithoutReplayPasC, replayCasP with
+// getMatchedLeafPosWithoutReplayCasP, and fullSelfReplay with fullSelfMatching:
+// - After replayPasC, getMatchedLeafPosWithoutReplayPasC should return the same
+//   replayed position
+// - After replayCasP, getMatchedLeafPosWithoutReplayCasP should return the same
+//   replayed position
+// - After fullSelfReplay, fullSelfMatching should return true
+struct TransformPropagatorWithCheck : public TransformPropagator {
+ public:
+  virtual void propagateC2P(TensorView* from, TensorView* to) override {
+    TransformPropagator::propagateC2P(from, to);
+    auto from_pos = replayed_pos_.at(from);
+    auto to_pos = replayed_pos_.at(to);
+    TORCH_CHECK(
+        TransformReplay::getMatchedLeafPosWithoutReplayPasC(
+            to, from, from_pos) == to_pos);
+  }
+  virtual void propagateP2C(TensorView* from, TensorView* to) override {
+    TransformPropagator::propagateP2C(from, to);
+    auto from_pos = replayed_pos_.at(from);
+    auto to_pos = replayed_pos_.at(to);
+    TORCH_CHECK(
+        TransformReplay::getMatchedLeafPosWithoutReplayCasP(
+            to, from, from_pos) == to_pos);
+  }
+  virtual void propagateSibling(TensorView* from, TensorView* to) override {
+    TransformPropagator::propagateSibling(from, to);
+    auto from_pos = replayed_pos_.at(from);
+    auto to_pos = replayed_pos_.at(to);
+    TORCH_CHECK(from_pos == to_pos);
+    TORCH_CHECK(TransformReplay::fullSelfMatching(from, to));
+  }
+  using TransformPropagator::TransformPropagator;
+};
+
 } // namespace
+
+class ContextCudnnTF32Disabled {
+ public:
+  ContextCudnnTF32Disabled() {
+    flag_ = at::globalContext().allowTF32CuDNN();
+    at::globalContext().setAllowTF32CuDNN(false);
+  }
+
+  ~ContextCudnnTF32Disabled() {
+    at::globalContext().setAllowTF32CuDNN(flag_);
+  }
+
+ private:
+  bool flag_;
+};
+
+// Fixture class must be uniquely identified, i.e., can't be in an
+// anonymous namespace
+class NVFuserTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // requires PASCAL or newer
+    if (!deviceMajorMinorCheck(6)) {
+      GTEST_SKIP() << "skipping tests on pre-PASCAL GPUs";
+    }
+    setFillAllocationWithNan(true);
+  }
+};
+
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/codegen/cuda/transform_iter.cpp b/torch/csrc/jit/codegen/cuda/transform_iter.cpp
index 3e0473665b478..ab683e79ce9aa 100644
--- a/torch/csrc/jit/codegen/cuda/transform_iter.cpp
+++ b/torch/csrc/jit/codegen/cuda/transform_iter.cpp
@@ -137,7 +137,7 @@ void ReplayTransformations::handle(Swizzle2D* swizzle_2d) {
   auto id_in_y = swizzle_2d->inY();
 
   // Make sure we have a corresponding entry in our map pointing to the ID we're
-  // going to replay the split on
+  // going to replay the swizzle on
   auto it_x = id_map_.find(id_in_x);
   auto it_y = id_map_.find(id_in_y);
 
@@ -162,7 +162,7 @@ void ReplayTransformations::handle(Swizzle2D* swizzle_2d) {
   auto outs = std::make_pair(mapped_x, mapped_y);
 
   if (replay_swizzle_) {
-    // Replay the split onto mapped
+    // Replay the swizzle onto mapped
     outs = IterDomain::swizzle(swizzle_2d->swizzleType(), mapped_x, mapped_y);
 
     // Remove mapped from the leaf IDs
@@ -224,7 +224,7 @@ void ReplayTransformations::runReplay() {
   // Switch outDomain to a vector to start the traversal
   std::vector<Val*> traversal_vals(
       target_domain_.begin(), target_domain_.end());
-  traverseFrom(traversal_vals[0]->fusion(), traversal_vals);
+  traverseTo(traversal_vals[0]->fusion(), traversal_vals);
 
   if (error_on_failure_)
     TORCH_INTERNAL_ASSERT(
@@ -762,14 +762,6 @@ struct ProducerForwardingInfo {
           (outer->isTrivialReduction() && !inner->isReduction())) {
         auto compliment_id = inner->isTrivialReduction() ? inner : outer;
         auto forwarded_id = inner->isTrivialReduction() ? outer : inner;
-        // Only allow forwarding when the trivial reduction domain is
-        // an root domain
-        if (std::find(
-                producer->getMaybeRFactorDomain().begin(),
-                producer->getMaybeRFactorDomain().end(),
-                compliment_id) == producer->getMaybeRFactorDomain().end()) {
-          continue;
-        }
         forwarding_map.emplace(std::make_pair(forwarded_id, merge->out()));
         compliment_map.emplace(std::make_pair(
             forwarded_id, std::vector<IterDomain*>{compliment_id}));
diff --git a/torch/csrc/jit/codegen/cuda/type.cpp b/torch/csrc/jit/codegen/cuda/type.cpp
index e3d61efac9722..3b8f380683ed2 100644
--- a/torch/csrc/jit/codegen/cuda/type.cpp
+++ b/torch/csrc/jit/codegen/cuda/type.cpp
@@ -1,5 +1,7 @@
 #include <torch/csrc/jit/codegen/cuda/type.h>
 
+#include <ATen/cuda/CUDAContext.h>
+
 #include <stdexcept>
 #include <unordered_map>
 
@@ -160,6 +162,17 @@ DataType getTypeFromComplexType(DataType dtype) {
   }
 }
 
+bool isSupportedTypeByDevice(DataType dtype) {
+  auto prop = at::cuda::getCurrentDeviceProperties();
+  auto major_ver = prop->major;
+  switch (dtype) {
+    case DataType::BFloat16:
+      return major_ver >= 8;
+    default:
+      return true;
+  }
+}
+
 bool isIntegerOp(const BinaryOpType bopt) {
   return bopt >= BinaryOpType::Mod && bopt <= BinaryOpType::Rshift;
 }
@@ -290,8 +303,12 @@ static const char* predicate_type2string(PredicateType t) {
 
 static const char* expr_type2string(ExprType t) {
   switch (t) {
+    case ExprType::FullOp:
+      return "FullOp";
     case ExprType::ARangeOp:
       return "ARangeOp";
+    case ExprType::EyeOp:
+      return "EyeOp";
     case ExprType::UnaryOp:
       return "UnaryOp";
     case ExprType::BinaryOp:
@@ -656,6 +673,8 @@ static const char* rng_op_type_inline_op2string(RNGOpType t) {
   switch (t) {
     case RNGOpType::Uniform:
       return "rng_uniform";
+    case RNGOpType::UniformRange:
+      return "rng_uniform_range";
     default:
       break;
   }
@@ -694,6 +713,8 @@ static const char* rng_op_type2string(RNGOpType t) {
   switch (t) {
     case RNGOpType::Uniform:
       return "rng_uniform";
+    case RNGOpType::UniformRange:
+      return "rng_uniform_range";
     default:
       TORCH_INTERNAL_ASSERT(false, "Unexpected RNGOpType");
   }
diff --git a/torch/csrc/jit/codegen/cuda/type.h b/torch/csrc/jit/codegen/cuda/type.h
index 224922febc3fa..4aa894113e993 100644
--- a/torch/csrc/jit/codegen/cuda/type.h
+++ b/torch/csrc/jit/codegen/cuda/type.h
@@ -101,10 +101,14 @@ int getVectorSizeFromType(DataType dtype);
 DataType getTypeFromVectorType(DataType dtype);
 // Return the corresponding scalar of a complex type
 DataType getTypeFromComplexType(DataType dtype);
+// Return if the datatype is supported on the current device
+TORCH_CUDA_CU_API bool isSupportedTypeByDevice(DataType dtype);
 
 enum class ExprType {
   Invalid,
+  FullOp,
   ARangeOp,
+  EyeOp,
   UnaryOp,
   BinaryOp,
   TernaryOp,
@@ -244,7 +248,8 @@ enum class BinaryOpType {
 };
 
 enum class RNGOpType {
-  Uniform,
+  Uniform, // Uniform in [0, 1)
+  UniformRange, // Uniform in [low, high]
 };
 
 // Return if output of operator should be a boolean
diff --git a/torch/csrc/jit/codegen/cuda/type_inference.cpp b/torch/csrc/jit/codegen/cuda/type_inference.cpp
index 534fa91488cee..7422cf20d7c2b 100644
--- a/torch/csrc/jit/codegen/cuda/type_inference.cpp
+++ b/torch/csrc/jit/codegen/cuda/type_inference.cpp
@@ -445,13 +445,16 @@ class NaiveTypePropagator {
         copyScalarTypeAndDeviceToOutput(out_type->withDim(c10::nullopt), node);
         break;
       }
-      case prim::unsqueeze_copy:
       case prim::expand_copy:
       case prim::expand_as_copy:
-      case prim::squeeze_copy:
+      case prim::flatten_copy:
+      case prim::permute_copy:
       case prim::reshape_copy:
-      case prim::view_copy:
-      case prim::flatten_copy: {
+      case prim::squeeze_copy:
+      case prim::t_copy:
+      case prim::transpose_copy:
+      case prim::unsqueeze_copy:
+      case prim::view_copy: {
         auto out_type = node->input(0)->type()->cast<TensorType>();
         copyScalarTypeAndDeviceToOutput(out_type, node);
         break;
diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp
index d7409c98db658..33395692fb39e 100644
--- a/torch/csrc/jit/codegen/cuda/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/utils.cpp
@@ -41,9 +41,10 @@ auto parseDebugDumpOptions() {
       {DebugDumpOption::PythonDefinition, false},
       {DebugDumpOption::PythonFrontendDebug, false},
       {DebugDumpOption::TransformPropagator, false},
-      {DebugDumpOption::InlinePropagator, false},
       {DebugDumpOption::Cubin, false},
-      {DebugDumpOption::Ptx, false}};
+      {DebugDumpOption::Ptx, false},
+      {DebugDumpOption::BankConflictInfo, false},
+      {DebugDumpOption::SyncMap, false}};
 
   if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DUMP")) {
     c10::string_view options_view(dump_options);
@@ -100,12 +101,14 @@ auto parseDebugDumpOptions() {
         options_map[DebugDumpOption::PythonFrontendDebug] = true;
       } else if (token == "transform_propagator") {
         options_map[DebugDumpOption::TransformPropagator] = true;
-      } else if (token == "inline_propagator") {
-        options_map[DebugDumpOption::InlinePropagator] = true;
       } else if (token == "cubin") {
         options_map[DebugDumpOption::Cubin] = true;
       } else if (token == "ptx") {
         options_map[DebugDumpOption::Ptx] = true;
+      } else if (token == "bank_conflict") {
+        options_map[DebugDumpOption::BankConflictInfo] = true;
+      } else if (token == "sync_map") {
+        options_map[DebugDumpOption::SyncMap] = true;
       } else {
         TORCH_CHECK(
             false,
@@ -118,7 +121,7 @@ auto parseDebugDumpOptions() {
             "\tdraw_segmented_fusion, scheduler_params, parallel_dimensions,\n",
             "\tbuffer_reuse_verbose, ptxas_verbose, halo, segmenter_logging,\n",
             "\tperf_debug_verbose, python_definition, python_frontend_debug,\n",
-            "\ttransform_propagator, inline_propagator, cubin, ptx\n");
+            "\ttransform_propagator, cubin, ptx, bank_conflict, sync_map\n");
       }
       options_view = (end_pos != c10::string_view::npos)
           ? options_view.substr(end_pos + 1)
@@ -132,6 +135,7 @@ auto parseDebugDumpOptions() {
 auto parseDisableOptions() {
   std::unordered_map<DisableOption, bool> options_map = {
       {DisableOption::ArchCheck, false},
+      {DisableOption::CompileToSass, false},
       {DisableOption::Fallback, false},
       {DisableOption::Fma, false},
       {DisableOption::IndexHoist, false},
@@ -145,6 +149,8 @@ auto parseDisableOptions() {
       const auto token = options_view.substr(0, end_pos);
       if (token == "arch_check") {
         options_map[DisableOption::ArchCheck] = true;
+      } else if (token == "compile_to_sass") {
+        options_map[DisableOption::CompileToSass] = true;
       } else if (token == "fallback") {
         options_map[DisableOption::Fallback] = true;
       } else if (token == "fma") {
@@ -179,8 +185,7 @@ auto parseEnableOptions() {
       {EnableOption::Complex, false},
       {EnableOption::KernelProfile, false},
       {EnableOption::LinearDecomposition, false},
-      {EnableOption::ConvDecomposition, false},
-      {EnableOption::TransposeScheduler, false}};
+      {EnableOption::ConvDecomposition, false}};
 
   if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_ENABLE")) {
     c10::string_view options_view(dump_options);
@@ -195,8 +200,6 @@ auto parseEnableOptions() {
         options_map[EnableOption::LinearDecomposition] = true;
       } else if (token == "conv_decomposition") {
         options_map[EnableOption::ConvDecomposition] = true;
-      } else if (token == "transpose_scheduler") {
-        options_map[EnableOption::TransposeScheduler] = true;
       } else {
         TORCH_CHECK(
             false,
@@ -204,7 +207,7 @@ auto parseEnableOptions() {
             token,
             "'\nAvailable options:\n",
             "\tcomplex, kernel_profile, linear_decomposition,",
-            "conv_decomposition, transpose_scheduler");
+            "conv_decomposition");
       }
       options_view = (end_pos != c10::string_view::npos)
           ? options_view.substr(end_pos + 1)
diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h
index 5b5c794f3810d..61f7fee7cd4cf 100644
--- a/torch/csrc/jit/codegen/cuda/utils.h
+++ b/torch/csrc/jit/codegen/cuda/utils.h
@@ -57,10 +57,10 @@ enum class DebugDumpOption {
   PythonFrontendDebug, //! Python Frontend debug information.
   TransformPropagator, //! When running TransformPropagator, print propagation
                        //! path and replay result
-  InlinePropagator, //! When running InlinePropagator, print propagation
-                    //! path and inlining result
   Cubin, //! Dump compiled CUBIN
-  Ptx //! Dump compiled PTX
+  Ptx, //! Dump compiled PTX
+  BankConflictInfo, //! Dump bank confliction info
+  SyncMap //! RAW dependency info
 };
 
 TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
@@ -71,6 +71,8 @@ TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
 //!
 enum class DisableOption {
   ArchCheck, //! Disable hardware-specific checks to enable cross arch debug
+  CompileToSass, //! Disable direct compilation to sass so the ptx can be
+                 //! examined
   Fallback, //! Disable fallback
   Fma, //! Disable FMA instructions
   IndexHoist, //! Disable index hoisting
@@ -89,7 +91,6 @@ enum class EnableOption {
   KernelProfile, //! Enable intra-kernel performance profiling
   LinearDecomposition, //! Enable linear-bias decomposition
   ConvDecomposition, //! Enable conv-bias decomposition
-  TransposeScheduler //! Enable the experimental transpose scheduler
 };
 
 TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
diff --git a/torch/csrc/jit/codegen/fuser/codegen.cpp b/torch/csrc/jit/codegen/fuser/codegen.cpp
index 0665d21a7a4fc..c28ad2ba1ae09 100644
--- a/torch/csrc/jit/codegen/fuser/codegen.cpp
+++ b/torch/csrc/jit/codegen/fuser/codegen.cpp
@@ -490,7 +490,7 @@ std::string generateKernel(
         env.s("access", format("t${formal}.data[t${formal}_offset]", env));
         env.s("access_vec4", format("t${formal}_buf[i]", env));
       }
-      env.s("lhs_type", calcScalarTypeName(input.second.value().scalar_type));
+      env.s("lhs_type", calcScalarTypeName(input.second->scalar_type));
 
       // load input in vectorized code path
       auto ele_size = at::elementSize((*input.second).scalar_type);

From 66571b6eb0da4b80a779499d829fd4f6095e1390 Mon Sep 17 00:00:00 2001
From: Mor Tzur <mortzur@meta.com>
Date: Fri, 4 Nov 2022 20:18:08 +0000
Subject: [PATCH 0579/1922] [fx2ait] fixes for AITSplitter (#87805)

Summary: propagate lower settings to AITSplitter settings.

Reviewed By: yinghai, qxy11

Differential Revision: D40568216

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87805
Approved by: https://github.com/yinghai
---
 torch/fx/passes/splitter_base.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/torch/fx/passes/splitter_base.py b/torch/fx/passes/splitter_base.py
index e2149052339ec..0f357c38dcb7a 100644
--- a/torch/fx/passes/splitter_base.py
+++ b/torch/fx/passes/splitter_base.py
@@ -30,13 +30,21 @@
 __all__ = ['FxNetAccNodesFinder', 'FxNetSplitterInternalError', 'Subgraph', 'SplitResult', 'generate_inputs_for_submodules']
 _LOGGER = logging.getLogger(__name__)
 
+DEFAULT_MIN_ACC_MODULE_SIZE = 1
+DEFAULT_SKIP_FUSION = False
+DEFAULT_ALLOW_NON_TENSOR = False
 
 class _SplitterSettingBase:
-    def __init__(self):
+    def __init__(
+        self,
+        min_acc_module_size=DEFAULT_MIN_ACC_MODULE_SIZE,
+        skip_fusion=DEFAULT_SKIP_FUSION,
+        allow_non_tensor=DEFAULT_ALLOW_NON_TENSOR
+    ):
         parser = argparse.ArgumentParser()
         parser.add_argument(
             "--min_acc_module_size",
-            default=1,
+            required=False,
             type=int,
             help="Minimum size limit of an accelerator subgraph.",
         )
@@ -62,9 +70,9 @@ def __init__(self):
         )
         args, unknown = parser.parse_known_args()
 
-        self.min_acc_module_size: int = args.min_acc_module_size
-        self.skip_fusion: bool = args.skip_fusion
-        self.allow_non_tensor: bool = args.allow_non_tensor
+        self.min_acc_module_size: int = args.min_acc_module_size if args.min_acc_module_size else min_acc_module_size
+        self.skip_fusion: bool = args.skip_fusion if args.skip_fusion else skip_fusion
+        self.allow_non_tensor: bool = args.allow_non_tensor if args.allow_non_tensor else allow_non_tensor
 
 
 @compatibility(is_backward_compatible=False)

From 418963b75bdb14aad07ff4f97cc9550427899961 Mon Sep 17 00:00:00 2001
From: Kurt Mohler <kmohler@quansight.com>
Date: Fri, 4 Nov 2022 20:23:56 +0000
Subject: [PATCH 0580/1922] Add nondeterministic error for `scatter` (#88244)

Fixes #88096

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88244
Approved by: https://github.com/ezyang, https://github.com/mruberry
---
 .../ATen/native/TensorAdvancedIndexing.cpp    |  4 ++
 test/test_torch.py                            | 40 +++++++++++++++++++
 torch/__init__.py                             |  1 +
 3 files changed, 45 insertions(+)

diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
index 3004dc1b31c79..fa78b60c66848 100644
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@@ -1512,6 +1512,10 @@ TORCH_IMPL_FUNC(scatter_src_out)
  const Tensor& index,
  const Tensor& src,
  const Tensor& out) {
+  // See note [Writing Nondeterministic Operations]
+  // Nondeterministic when index contains duplicate entries, src is a tensor,
+  // and reduce=None
+  at::globalContext().alertNotDeterministic("scatter with src tensor and reduce=None");
   scatter_impl(self, dim, index, src, out,
                scatter_reduce_stub,
                scatter_stub);
diff --git a/test/test_torch.py b/test/test_torch.py
index 7fdcf3f235399..2247d18285d55 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -1478,6 +1478,46 @@ def test_nondeterministic_alert_put_accumulate(self, device):
                 'put_',
                 torch.device(device).type == 'cuda')
 
+    @expectedFailureMeta  # expected a non-determinitic error, but it was not raised
+    @onlyNativeDeviceTypes
+    def test_nondeterministic_alert_scatter(self, device):
+        a = torch.randn(10, device=device)
+        indices = torch.tensor([0, 0], device=device)
+        values = torch.tensor([0., 1.], device=device)
+        result = torch.empty_like(a)
+
+        error_msg = 'scatter with src tensor and reduce=None'
+
+        error_cases = [
+            lambda: torch.Tensor.scatter(a, 0, indices, values),
+            lambda: torch.Tensor.scatter_(a, 0, indices, values),
+            lambda: torch.scatter(a, 0, indices, values),
+            lambda: torch.scatter(a, 0, indices, values, out=result),
+        ]
+
+        no_error_cases = [
+            lambda: torch.Tensor.scatter(a, 0, indices, 0),
+            lambda: torch.Tensor.scatter_(a, 0, indices, 0),
+            lambda: torch.scatter(a, 0, indices, 0),
+            lambda: torch.scatter(a, 0, indices, 0, out=result),
+
+            lambda: torch.Tensor.scatter(a, 0, indices, values, reduce='add'),
+            lambda: torch.Tensor.scatter_(a, 0, indices, values, reduce='add'),
+            lambda: torch.scatter(a, 0, indices, values, reduce='add'),
+            lambda: torch.scatter(a, 0, indices, values, out=result, reduce='add'),
+        ]
+
+        for error_case in error_cases:
+            self.check_nondeterministic_alert(
+                error_case,
+                error_msg)
+
+        for no_error_case in no_error_cases:
+            self.check_nondeterministic_alert(
+                no_error_case,
+                error_msg,
+                False)
+
     @skipIfMps
     def test_nondeterministic_alert_histc(self, device):
         a = torch.tensor([], device=device)
diff --git a/torch/__init__.py b/torch/__init__.py
index 632437c04c8ca..1a645f53a8a2d 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -500,6 +500,7 @@ def use_deterministic_algorithms(mode, *, warn_only=False):
           ``mode='max'``
         * :func:`torch.Tensor.put_` when ``accumulate=False``
         * :func:`torch.Tensor.put_` when ``accumulate=True`` and called on a CUDA tensor
+        * :func:`torch.Tensor.scatter` when ``src`` is a tensor and ``reduce=None``
         * :func:`torch.histc` when called on a CUDA tensor
         * :func:`torch.bincount` when called on a CUDA tensor
         * :func:`torch.kthvalue` with called on a CUDA tensor

From 36160fb29bc4cb6c8a5e6f8726415f66cd513fc8 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Fri, 4 Nov 2022 18:00:28 +0000
Subject: [PATCH 0581/1922] [inductor] Handle the case where kwargs contains
 tensor (#88417)

Summary: Fix https://github.com/pytorch/torchdynamo/issues/1805;
currently inductor does not allow any tensor in kwargs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88417
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py        | 14 +++++++++++
 test/inductor/test_torchinductor_opinfo.py |  8 -------
 torch/_inductor/ir.py                      | 28 +++++++++++-----------
 torch/_inductor/lowering.py                |  6 +++--
 4 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index dd9e6767a805e..0c778c254ab10 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4293,6 +4293,20 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                 else:
                     self.assertEqual(len(inps), 0)
 
+    @unittest.skipIf(HAS_CUDA, "histogramdd only supports cpu")
+    def test_kwargs(self):
+        def fn(x, y):
+            return torch.histogramdd(
+                x,
+                bins=[3, 3],
+                weight=y,
+            )
+
+        self.common(
+            fn,
+            [torch.randn((4, 2)), torch.randn((4))],
+        )
+
 
 if HAS_CPU:
 
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index b705a36d75425..b06d372b20d7b 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -205,8 +205,6 @@ def process(device_type):
     "index_put": {f16, f32, f64},
     "index_reduce": {f16, f32, f64},
     "istft": {f32, f64},
-    "linalg.cholesky": {f32, f64},
-    "linalg.cholesky_ex": {f32, f64},
     "linalg.eig": {f32, f64},
     "linalg.eigh": {f32, f64},
     "linalg.eigvals": {f32, f64},
@@ -246,7 +244,6 @@ def process(device_type):
     "normal": {f16, f32, f64},
     "normal.number_mean": {f16, f32, f64},
     "pca_lowrank": {f32, f64},
-    "pinverse": {f32, f64},
     "polar": {f32, f64},
     "quantile": {f32, f64},
     "rand_like": {f16, f32, f64},
@@ -257,7 +254,6 @@ def process(device_type):
     "scatter_reduce.sum": {f16},
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
-    "segment_reduce.offsets": {f16, f32, f64},
     "sgn": {f16, f32, f64},
     "sparse.sampled_addmm": {f32, f64},
     "stft": {f32, f64},
@@ -316,8 +312,6 @@ def process(device_type):
     "index_put": {f16, f32, f64},
     "index_reduce": {f16, f32, f64},
     "istft": {f32, f64},
-    "linalg.cholesky": {f32, f64},
-    "linalg.cholesky_ex": {f32, f64},
     "linalg.eig": {f32, f64},
     "linalg.eigh": {f32, f64},
     "linalg.eigvals": {f32, f64},
@@ -346,7 +340,6 @@ def process(device_type):
     "normal": {f16, f32, f64},
     "normal.number_mean": {f16, f32, f64},
     "pca_lowrank": {f32, f64},
-    "pinverse": {f32, f64},
     "polar": {f32, f64},
     "pow": {i32, i64},
     "rand_like": {f16, f32, f64},
@@ -356,7 +349,6 @@ def process(device_type):
     "round.decimals_3": {f16},
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
-    "segment_reduce.offsets": {f16, f32, f64},
     "sgn": {f16, f32, f64},
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index f4c912137812c..8fd458cb72eb4 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -9,6 +9,7 @@
 from contextlib import nullcontext
 from enum import Enum
 from functools import partial
+from inspect import signature
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Set, Tuple, Union
 from unittest.mock import patch
 
@@ -2243,7 +2244,8 @@ def copy_input(x):
 
     @classmethod
     def process_kernel(cls, kernel, *args, **kwargs):
-        args_flat, args_spec = pytree.tree_flatten(args)
+        binded_args = signature(kernel).bind(*args, **kwargs).arguments
+        args_flat, args_spec = pytree.tree_flatten(binded_args)
 
         is_arg_tensor = []
         tensor_args = []
@@ -2256,15 +2258,16 @@ def process_kernel(cls, kernel, *args, **kwargs):
                 non_tensor_args.append(arg)
 
         def unflatten_args(new_tensor_args, new_non_tensor_args):
-            new_args = []
+            result = []
             it_tensors = iter(new_tensor_args)
             it_non_tensors = iter(new_non_tensor_args)
             for is_tensor in is_arg_tensor:
                 if is_tensor:
-                    new_args.append(next(it_tensors))
+                    result.append(next(it_tensors))
                 else:
-                    new_args.append(next(it_non_tensors))
-            return pytree.tree_unflatten(new_args, args_spec)
+                    result.append(next(it_non_tensors))
+            result = pytree.tree_unflatten(result, args_spec)
+            return result.get("args", []), result.get("kwargs", {})
 
         tensor_args = [cls.realize_input(x) for x in tensor_args]
 
@@ -2290,9 +2293,8 @@ def unflatten_args(new_tensor_args, new_non_tensor_args):
             ).zero_()
             example_args.append(arg)
 
-        example_output = kernel(
-            *unflatten_args(example_args, non_tensor_args), **kwargs
-        )
+        new_args, new_kwargs = unflatten_args(example_args, non_tensor_args)
+        example_output = kernel(*new_args, **new_kwargs)
 
         return example_output, tensor_args, non_tensor_args, unflatten_args
 
@@ -2885,15 +2887,13 @@ class Shim:
             def __repr__(self):
                 return self.ref
 
-        tensor_args = [Shim(x.codegen_reference()) for x in self.inputs]
-        constant_args = [Shim(repr(x)) for x in self.constant_args]
-
         def gen_kwarg(k, v):
             return f"{k}={repr(v)}"
 
-        kwargs = list(gen_kwarg(k, v) for k, v in self.kwargs.items())
-
-        return list(map(repr, self.unflatten_args(tensor_args, constant_args))) + kwargs
+        tensor_args = [Shim(x.codegen_reference()) for x in self.inputs]
+        constant_args = [Shim(repr(x)) for x in self.constant_args]
+        args, kwargs = self.unflatten_args(tensor_args, constant_args)
+        return list(map(repr, args)) + list(gen_kwarg(k, v) for k, v in kwargs.items())
 
     @classmethod
     def create(cls, kernel, *args, **kwargs):
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index d2b27d702ebec..029d59f8615e9 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -169,8 +169,10 @@ def wrapped(*args, **kwargs):
         assert not any(
             x == "out" for x in kwargs.keys()
         ), "out= ops aren't yet supported"
-        # kwargs tensors not supported yet
-        assert not any(isinstance(x, TensorBox) for x in kwargs.values())
+        # kwargs tensors not supported yet unless it's a fallback op
+        assert not any(isinstance(x, TensorBox) for x in kwargs.values()) or all(
+            fn in fallbacks for fn in aten_fn
+        )
 
         if (type_promotion_kind or convert_input_to_bool) and indices:
             if convert_input_to_bool:

From e3ddb242c9f5aa78a2236ac50918d5507ca04aa9 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@meta.com>
Date: Fri, 4 Nov 2022 20:34:23 +0000
Subject: [PATCH 0582/1922] Don't print fatal:... in generate_torch_version.py
 (#88335)

During build, users commonly see a message like
```
fatal: no tag exactly matches 'd8b4f33324b1eb6c1103874764116fb68e0d0af4'
```
which is usually ignored when builds succeed, but has confused users when build fails (due to a different issue). This PR removes the red herring, since this usually prints for local development when tags are not found.

We catch the exception anyway and handle it under the hood, so we don't need to print it and confuse the user.

Test plan:
Note that builds on trunk current have this line, cmd-F 'fatal: no tag exactly matches' in https://github.com/pytorch/pytorch/actions/runs/3379162092/jobs/5610355820.

Then check in the PR build to see that the line no longer appears.

I also tagged my commit locally and printed what tag would be--this code and the old code printed the same results for what tag would be.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88335
Approved by: https://github.com/seemethere
---
 tools/generate_torch_version.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/tools/generate_torch_version.py b/tools/generate_torch_version.py
index 96970bd2b1c35..1586ff15fd207 100644
--- a/tools/generate_torch_version.py
+++ b/tools/generate_torch_version.py
@@ -25,13 +25,12 @@ def get_sha(pytorch_root: Union[str, Path]) -> str:
 
 def get_tag(pytorch_root: Union[str, Path]) -> str:
     try:
-        tag = (
-            subprocess.check_output(
-                ["git", "describe", "--tags", "--exact"], cwd=pytorch_root
-            )
-            .decode("ascii")
-            .strip()
-        )
+        tag = subprocess.run(
+            ["git", "describe", "--tags", "--exact"],
+            cwd=pytorch_root,
+            encoding="ascii",
+            capture_output=True,
+        ).stdout.strip()
         if RELEASE_PATTERN.match(tag):
             return tag
         else:

From 1df0f32a23f4dac1a3b65f415467ea4817dfa656 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 4 Nov 2022 20:35:11 +0000
Subject: [PATCH 0583/1922] Re-enable inductor models tests as periodical jobs
 (#88509)

Run every 4 hour same as periodic, but offset by an hour. This should give us some signals instead of completely disabling these jobs on master after https://github.com/pytorch/pytorch/pull/88374

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88509
Approved by: https://github.com/malfet
---
 .github/workflows/inductor.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index bd696795d2697..3a6c750bbf141 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -1,9 +1,12 @@
 name: inductor
 
 on:
+  schedule:
+    - cron: 45 1,5,9,13,17,21 * * *
   push:
     tags:
       - ciflow/inductor/*
+      - ciflow/periodic/*
   workflow_dispatch:
 
 concurrency:

From 8f864c87fe3cc4ba960cd7be0a8e31addb61b14a Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Fri, 4 Nov 2022 20:47:42 +0000
Subject: [PATCH 0584/1922] Disable mem leak check (#88373)

tbh at this point it might be easier to make a new workflow and copy the relevant jobs...

Changes:
* Disable cuda mem leak check except for on scheduled workflows
* Make pull and trunk run on a schedule which will run the memory leak check
* Periodic will always run the memory leak check -> periodic does not have parallelization anymore
* Concurrency check changed to be slightly more generous
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88373
Approved by: https://github.com/ZainRizvi, https://github.com/huydhn
---
 .../actions/filter-test-configs/action.yml    |  3 ++-
 .github/scripts/ensure_actions_will_cancel.py | 20 +++++++++----------
 .github/scripts/filter_test_configs.py        |  5 +++++
 .github/workflows/_linux-test.yml             |  2 ++
 .github/workflows/_mac-test.yml               |  2 ++
 .github/workflows/_rocm-test.yml              |  2 ++
 .github/workflows/_win-test.yml               |  1 +
 .github/workflows/periodic.yml                |  2 +-
 .github/workflows/pull.yml                    |  4 +++-
 .github/workflows/trunk.yml                   |  4 +++-
 .jenkins/pytorch/test.sh                      |  8 --------
 tools/testing/test_selections.py              |  2 +-
 torch/testing/_internal/common_utils.py       |  4 ++--
 13 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml
index 6ec9e48c2df8e..76399e325ef21 100644
--- a/.github/actions/filter-test-configs/action.yml
+++ b/.github/actions/filter-test-configs/action.yml
@@ -52,7 +52,8 @@ runs:
         .github/scripts/filter_test_configs.py \
           --test-matrix "${{ inputs.test-matrix }}" \
           --pr-number "${{ github.event.pull_request.number }}" \
-          --tag "${{ steps.parse-ref.outputs.tag }}"
+          --tag "${{ steps.parse-ref.outputs.tag }}" \
+          --event-name "${{ github.event_name }}"
 
     - name: Print the filtered test matrix
       shell: bash
diff --git a/.github/scripts/ensure_actions_will_cancel.py b/.github/scripts/ensure_actions_will_cancel.py
index c479aefb9fc43..729d02f560fa1 100755
--- a/.github/scripts/ensure_actions_will_cancel.py
+++ b/.github/scripts/ensure_actions_will_cancel.py
@@ -42,26 +42,26 @@ def should_check(filename: Path) -> bool:
             print("ERROR: duplicate workflow name:", name, file=sys.stderr)
             errors_found = True
         names.add(name)
-
-        expected = {
-            "group": EXPECTED_GROUP,
-            "cancel-in-progress": True,
-        }
-        actual = data.get("concurrency", None)
-        if actual != expected:
+        actual = data.get("concurrency", {})
+        if not actual.get("group", "").startswith(EXPECTED_GROUP):
             print(
                 f"'concurrency' incorrect or not found in '{filename.relative_to(REPO_ROOT)}'",
                 file=sys.stderr,
             )
             print(
-                f"expected: {expected}",
+                f"concurrency group should start with {EXPECTED_GROUP} but found {actual.get('group', None)}",
                 file=sys.stderr,
             )
+            errors_found = True
+        if not actual.get("cancel-in-progress", False):
             print(
-                f"actual:   {actual}",
+                f"'concurrency' incorrect or not found in '{filename.relative_to(REPO_ROOT)}'",
+                file=sys.stderr,
+            )
+            print(
+                f"concurrency cancel-in-progress should be True but found {actual.get('cancel-in-progress', None)}",
                 file=sys.stderr,
             )
-            errors_found = True
 
     if errors_found:
         sys.exit(1)
diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
index 10170161554c7..89f80a00adaf5 100755
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@@ -38,6 +38,7 @@ def parse_args() -> Any:
     parser.add_argument("--test-matrix", type=str, required=True, help="the original test matrix")
     parser.add_argument("--pr-number", type=str, help="the pull request number")
     parser.add_argument("--tag", type=str, help="the associated tag if it exists")
+    parser.add_argument("--event-name", type=str, help="name of the event that triggered the job (pull, schedule, etc)")
     return parser.parse_args()
 
 
@@ -159,6 +160,10 @@ def main() -> None:
         # No PR number, no tag, we can just return the test matrix as it is
         filtered_test_matrix = test_matrix
 
+    if args.event_name == "schedule":
+        for config in filtered_test_matrix.get("include", []):
+            config["mem_leak_check"] = "mem_leak_check"
+
     # Set the filtered test matrix as the output
     set_output("test-matrix", json.dumps(filtered_test_matrix))
 
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index e1e95ee5e7892..995fa4b727935 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -122,6 +122,7 @@ jobs:
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
         timeout-minutes: 240
         run: |
           set -x
@@ -175,6 +176,7 @@ jobs:
             -e SCCACHE_S3_KEY_PREFIX \
             -e XLA_CUDA \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --security-opt seccomp=unconfined \
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index 52aba9c68cd83..82dee7b548412 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -128,6 +128,8 @@ jobs:
 
       - name: Test
         id: test
+        env:
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
         run: |
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
 
diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml
index 62d487ffe3441..0d8ff874ba034 100644
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@@ -97,6 +97,7 @@ jobs:
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
           PYTORCH_JIT_ENABLE_NVFUSER: 1
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
         timeout-minutes: 270
         run: |
           set -x
@@ -146,6 +147,7 @@ jobs:
             -e MAX_JOBS="$(nproc --ignore=2)" \
             -e SCCACHE_BUCKET \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
+            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --security-opt seccomp=unconfined \
diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index 4a099b742cce7..a0047abbc0f55 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -124,6 +124,7 @@ jobs:
           TEST_CONFIG: ${{ matrix.config }}
           PR_BODY: ${{ github.event.pull_request.body }}
           TORCH_CUDA_ARCH_LIST: "7.0"
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
         run: |
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
 
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 58e379e0b5fd2..6e1722b4b6c01 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -9,7 +9,7 @@ on:
   workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
   cancel-in-progress: true
 
 jobs:
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 94210c5ccbc5d..c3d530e3e7189 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -9,9 +9,11 @@ on:
       - release/*
       - landchecks/*
   workflow_dispatch:
+  schedule:
+    - cron: 29 8 * * *  # about 1:29am PDT
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
   cancel-in-progress: true
 
 jobs:
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 48a89e47ed7bc..14bfcd0900687 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -10,9 +10,11 @@ on:
     tags:
       - ciflow/trunk/*
   workflow_dispatch:
+  schedule:
+    - cron: 29 8 * * *  # about 1:29am PDT
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
   cancel-in-progress: true
 
 jobs:
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 56b8d7dfcc108..5dfb1dfe413ba 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -109,14 +109,6 @@ if [[ "$TEST_CONFIG" == *inductor* ]]; then
   export PYTORCH_TEST_WITH_INDUCTOR=1
 fi
 
-# TODO: this condition is never true, need to fix this.
-if [[ -n "$PR_NUMBER" ]] && [[ -z "$CI_MASTER" || "$CI_MASTER" == "false" ]]; then
-  # skip expensive checks when on PR and CI_MASTER flag is not set
-  export PYTORCH_TEST_SKIP_CUDA_MEM_LEAK_CHECK=1
-else
-  export PYTORCH_TEST_SKIP_CUDA_MEM_LEAK_CHECK=0
-fi
-
 if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
   # Print GPU info
   rocminfo
diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py
index 766ec0ff1fe7f..950d686d8dacc 100644
--- a/tools/testing/test_selections.py
+++ b/tools/testing/test_selections.py
@@ -5,7 +5,7 @@
 
 from tools.stats.import_test_stats import get_disabled_tests, get_slow_tests
 
-NUM_PROCS = 2
+NUM_PROCS = 1 if os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1" else 2
 
 
 class ShardJob:
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index ba8e8db2e4a63..5eb924288c7a3 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -984,7 +984,7 @@ def wrapper(*args, **kwargs):
 # If this is True then CUDA memory leak checks are skipped. If this is false
 #   then CUDA memory leak checks are performed.
 # See: https://github.com/pytorch/pytorch/pull/59402#issuecomment-858811135
-TEST_SKIP_CUDA_MEM_LEAK_CHECK = os.getenv('PYTORCH_TEST_SKIP_CUDA_MEM_LEAK_CHECK', '0') == '1'
+TEST_CUDA_MEM_LEAK_CHECK = os.getenv('PYTORCH_TEST_CUDA_MEM_LEAK_CHECK', '0') == '1'
 
 # True if CI is running TBB-enabled Pytorch
 IS_TBB = "tbb" in os.getenv("BUILD_ENVIRONMENT", "")
@@ -1952,7 +1952,7 @@ def __init__(self, method_name='runTest'):
         test_method = getattr(self, method_name, None)
         if test_method is not None:
             # Wraps the tested method if we should do CUDA memory check.
-            if not TEST_SKIP_CUDA_MEM_LEAK_CHECK:
+            if TEST_CUDA_MEM_LEAK_CHECK:
                 self._do_cuda_memory_leak_check &= getattr(test_method, '_do_cuda_memory_leak_check', True)
                 # FIXME: figure out the flaky -1024 anti-leaks on windows. See #8044
                 if self._do_cuda_memory_leak_check and not IS_WINDOWS:

From 7b7dbadf6a032218855b9f8ab6aa5a066220f399 Mon Sep 17 00:00:00 2001
From: "Andrew M. James" <andrew.m.james2@gmail.com>
Date: Thu, 3 Nov 2022 13:55:53 -0500
Subject: [PATCH 0585/1922] Add sparse layout support for torch.frac (#88153)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88153
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml            | 8 ++++++++
 aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp   | 3 +++
 aten/src/ATen/native/sparse/SparseUnaryOps.cpp        | 3 +++
 test/test_sparse_csr.py                               | 3 ++-
 torch/testing/_internal/common_methods_invocations.py | 5 +++++
 5 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index baee6dc398419..ebf18cdf4734e 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -2485,11 +2485,17 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: frac.out
   variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: frac_sparse
+    SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr
 
 - func: frac_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: frac.out
   variants: function, method
+  dispatch:
+    SparseCPU, SparseCUDA: frac_sparse_
+    SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_
 
 - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2498,6 +2504,8 @@
   dispatch:
     CPU, CUDA: frac_out
     MPS: frac_out_mps
+    SparseCPU, SparseCUDA: frac_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_out
 
 - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   device_check: NoCheck
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
index acc95564e6ddb..44cefdb0c3bb5 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
@@ -58,6 +58,8 @@
 #include <ATen/ops/fill_native.h>
 #include <ATen/ops/floor.h>
 #include <ATen/ops/floor_native.h>
+#include <ATen/ops/frac.h>
+#include <ATen/ops/frac_native.h>
 #include <ATen/ops/isinf.h>
 #include <ATen/ops/isinf_native.h>
 #include <ATen/ops/isnan.h>
@@ -359,6 +361,7 @@ CREATE_UNARY_UFUNC(erf);
 CREATE_UNARY_UFUNC(erfinv);
 CREATE_UNARY_UFUNC(expm1);
 CREATE_UNARY_UFUNC(floor);
+CREATE_UNARY_UFUNC(frac);
 CREATE_UNARY_UFUNC(log1p);
 CREATE_UNARY_UFUNC(neg);
 CREATE_UNARY_UFUNC(rad2deg);
diff --git a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
index ed6df15fe7795..bd10530486c0b 100644
--- a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
+++ b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
@@ -27,6 +27,8 @@
 #include <ATen/ops/expm1_native.h>
 #include <ATen/ops/floor.h>
 #include <ATen/ops/floor_native.h>
+#include <ATen/ops/frac.h>
+#include <ATen/ops/frac_native.h>
 #include <ATen/ops/isinf.h>
 #include <ATen/ops/isinf_native.h>
 #include <ATen/ops/isnan.h>
@@ -169,6 +171,7 @@ COALESCED_UNARY_UFUNC(erf);
 COALESCED_UNARY_UFUNC(erfinv);
 COALESCED_UNARY_UFUNC(expm1);
 COALESCED_UNARY_UFUNC(floor);
+COALESCED_UNARY_UFUNC(frac);
 COALESCED_UNARY_UFUNC(log1p);
 COALESCED_UNARY_UFUNC(round);
 COALESCED_UNARY_UFUNC(sign);
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index 595e3f4e35880..1a7610c4da07b 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -62,7 +62,8 @@ def _check_cusparse_sddmm_available():
     'abs',
     'conj_physical',
     'neg',
-    'positive'
+    'positive',
+    'frac',
 ]
 
 # This should be just an import from test_linalg instead of code duplication
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index fed764994cc64..81f9967559f82 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -9319,6 +9319,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    assert_autodiffed=True,
                    supports_forward_ad=True,
                    supports_fwgrad_bwgrad=True,
+                   supports_sparse=True,
+                   supports_sparse_csr=True,
+                   supports_sparse_csc=True,
+                   supports_sparse_bsr=True,
+                   supports_sparse_bsc=True,
                    skips=(
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
                                     dtypes=(torch.bfloat16, torch.float16, torch.float32, torch.float64)),

From b03330ec583a05c1295ac64f86107645dd72c4b6 Mon Sep 17 00:00:00 2001
From: "Andrew M. James" <andrew.m.james2@gmail.com>
Date: Thu, 3 Nov 2022 13:55:53 -0500
Subject: [PATCH 0586/1922] Enable sparse_csr autograd testing for relu
 (#88154)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88154
Approved by: https://github.com/cpuhrsch
---
 test/test_sparse_csr.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index 1a7610c4da07b..1c4c0a6f6efad 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -64,6 +64,7 @@ def _check_cusparse_sddmm_available():
     'neg',
     'positive',
     'frac',
+    'nn.functional.relu'
 ]
 
 # This should be just an import from test_linalg instead of code duplication
@@ -2476,6 +2477,9 @@ def test_autograd_sparse_csr_unary(self, device, dtype, op):
             raise ValueError("Expected at least one 2D tensor in samples.")
 
         for sample in samples:
+            # We must skip samples of low dimensionality, we can't covert them to sparsed compressed layouts
+            if sample.input.ndim < 2:
+                continue
             sparse_input = sample.input.to_sparse_csr().requires_grad_(True)
 
             def fn(input):

From 0906e718d5380e10fe37eb1421a016dc9feaef14 Mon Sep 17 00:00:00 2001
From: "Andrew M. James" <andrew.m.james2@gmail.com>
Date: Thu, 3 Nov 2022 13:55:54 -0500
Subject: [PATCH 0587/1922] Support sparse inputs to deg2rad (#88156)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88156
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml            | 6 ++++++
 aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp   | 3 +++
 aten/src/ATen/native/sparse/SparseUnaryOps.cpp        | 3 +++
 test/test_sparse_csr.py                               | 1 +
 torch/csrc/autograd/FunctionsManual.cpp               | 2 +-
 torch/testing/_internal/common_methods_invocations.py | 5 +++++
 6 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index ebf18cdf4734e..d915ee1015700 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4034,15 +4034,21 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: deg2rad
+    SparseCPU, SparseCUDA: deg2rad_sparse
+    SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr
 
 - func: deg2rad_(Tensor(a!) self) -> Tensor(a!)
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: deg2rad_
+    SparseCPU, SparseCUDA: deg2rad_sparse_
+    SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_
 
 - func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CompositeExplicitAutograd: deg2rad_out
+    SparseCPU, SparseCUDA: deg2rad_sparse_out
+    SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_out
 
 - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
index 44cefdb0c3bb5..20e4aff1784cb 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
@@ -48,6 +48,8 @@
 #include <ATen/ops/conj_physical.h>
 #include <ATen/ops/conj_physical_native.h>
 #include <ATen/ops/copy_native.h>
+#include <ATen/ops/deg2rad.h>
+#include <ATen/ops/deg2rad_native.h>
 #include <ATen/ops/empty.h>
 #include <ATen/ops/erf.h>
 #include <ATen/ops/erf_native.h>
@@ -357,6 +359,7 @@ CREATE_UNARY_UFUNC(asinh);
 CREATE_UNARY_UFUNC(atan);
 CREATE_UNARY_UFUNC(atanh);
 CREATE_UNARY_UFUNC(ceil);
+CREATE_UNARY_UFUNC(deg2rad);
 CREATE_UNARY_UFUNC(erf);
 CREATE_UNARY_UFUNC(erfinv);
 CREATE_UNARY_UFUNC(expm1);
diff --git a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
index bd10530486c0b..084daed4df4e9 100644
--- a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
+++ b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
@@ -19,6 +19,8 @@
 #include <ATen/ops/atanh_native.h>
 #include <ATen/ops/ceil.h>
 #include <ATen/ops/ceil_native.h>
+#include <ATen/ops/deg2rad.h>
+#include <ATen/ops/deg2rad_native.h>
 #include <ATen/ops/erf.h>
 #include <ATen/ops/erf_native.h>
 #include <ATen/ops/erfinv.h>
@@ -167,6 +169,7 @@ COALESCED_UNARY_UFUNC(asinh);
 COALESCED_UNARY_UFUNC(atan);
 COALESCED_UNARY_UFUNC(atanh);
 COALESCED_UNARY_UFUNC(ceil);
+COALESCED_UNARY_UFUNC(deg2rad);
 COALESCED_UNARY_UFUNC(erf);
 COALESCED_UNARY_UFUNC(erfinv);
 COALESCED_UNARY_UFUNC(expm1);
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index 1c4c0a6f6efad..f64a46d315645 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -61,6 +61,7 @@ def _check_cusparse_sddmm_available():
 UNARY_EWISE_CSR_ALLOW_AUTOGRAD = [
     'abs',
     'conj_physical',
+    'deg2rad',
     'neg',
     'positive',
     'frac',
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 3358d96569598..2495f921226b7 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -576,7 +576,7 @@ Tensor rad2deg_backward(const Tensor& grad) {
 Tensor deg2rad_backward(const Tensor& grad) {
   constexpr double M_PI_180 =
       0.017453292519943295769236907684886127134428718885417;
-  return at::mul(grad, at::native::wrapped_scalar_tensor(Scalar(M_PI_180)));
+  return at::mul(grad, Scalar(M_PI_180));
 }
 
 Tensor unsqueeze_multiple(
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 81f9967559f82..2b5ca56a9b4ad 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -9055,6 +9055,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    dtypes=all_types_and(torch.bool, torch.half, torch.bfloat16),
                    supports_forward_ad=True,
                    supports_fwgrad_bwgrad=True,
+                   supports_sparse=True,
+                   supports_sparse_csr=True,
+                   supports_sparse_csc=True,
+                   supports_sparse_bsr=True,
+                   supports_sparse_bsc=True,
                    skips=(
                        # Reference: https://github.com/pytorch/pytorch/pull/51283#issuecomment-770614273
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',

From ffcb99d1fa44de1c6ae12a512d2b18f406b24582 Mon Sep 17 00:00:00 2001
From: "Andrew M. James" <andrew.m.james2@gmail.com>
Date: Thu, 3 Nov 2022 13:55:54 -0500
Subject: [PATCH 0588/1922] enable backward for log1p (sparse layouts) (#88155)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88155
Approved by: https://github.com/cpuhrsch
---
 test/test_sparse.py                           | 11 +-----
 test/test_sparse_csr.py                       |  3 +-
 torch/csrc/autograd/FunctionsManual.cpp       | 35 ++++++++++++++-----
 .../_internal/common_methods_invocations.py   |  4 ---
 4 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/test/test_sparse.py b/test/test_sparse.py
index 125fb6d83b300..a2b623e2508eb 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -2184,16 +2184,7 @@ def is_integral(dtype):
             with self.assertRaisesRegex(RuntimeError, "log1p_ requires coalesced input"):
                 sparse_tensor.log1p_()
 
-        if not is_integral_dtype:
-            sparse_tensor.requires_grad_()
-            self.assertTrue(sparse_tensor.requires_grad)
-
-            # test autograd
-            x = sparse_tensor.clone()
-            y = sparse_tensor.log1p()
-            with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"):
-                y.backward(x)
-        else:
+        if is_integral_dtype:
             with self.assertRaisesRegex(RuntimeError, "only Tensors of floating point dtype can require gradients"):
                 sparse_tensor.requires_grad_()
 
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index f64a46d315645..cc5044da0bd58 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -65,7 +65,8 @@ def _check_cusparse_sddmm_available():
     'neg',
     'positive',
     'frac',
-    'nn.functional.relu'
+    'nn.functional.relu',
+    'log1p'
 ]
 
 # This should be just an import from test_linalg instead of code duplication
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 2495f921226b7..3b0c73678a76d 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -30,6 +30,7 @@
 #include <ciso646>
 #include <functional>
 #include <numeric>
+
 // Helper functions for autogenerated code
 // These used to be inlined into the codegened Functions.cpp
 
@@ -4833,14 +4834,32 @@ std::tuple<Tensor, Tensor, Tensor> _trilinear_backward(
 }
 
 Tensor log1p_backward(const Tensor& grad, const Tensor& self) {
-  if (self.is_sparse()) {
-    AT_ERROR(
-        "log1p of a sparse tensor is made to be non-differentiable since ",
-        "local gradient of zero is 1 / (0 + 1) = 1 and it makes the tensor dense. ",
-        "Use a different mathematical operation which preserves sparsity of gradients, ",
-        "or report a bug if you think this is an error.");
-  }
-  return grad / (self + 1).conj();
+  // We must conditionally initalize this using to_dense if sparse, sparse
+  // addition is not supported without exact shape match
+  Tensor self_p1_conj;
+  if (self.layout() == c10::kSparse || self.layout() == c10::kSparseCsr ||
+      self.layout() == c10::kSparseCsc || self.layout() == c10::kSparseBsr ||
+      self.layout() == c10::kSparseBsc) {
+    // The warning only applies to the sparsity of self, dense grad is never
+    // materialized so if self is strided and grad is sparse nothing unepected
+    // happens memory wise
+    TORCH_WARN(
+        "log1p_backward: recived self with sparse layout, but backward requires materialization of a dense tensor with this shape");
+    self_p1_conj = (self.to_dense() + 1).conj();
+  } else {
+    // Although calling self.to_dense() would just return self when it has
+    // strided layout, that would breaks functorch tests.
+    self_p1_conj = (self + 1).conj();
+  }
+  if (grad.layout() == c10::kSparse || grad.layout() == c10::kSparseCsr ||
+      grad.layout() == c10::kSparseCsc || grad.layout() == c10::kSparseBsr ||
+      grad.layout() == c10::kSparseBsc) {
+    // If grad is sparse we can't divide by the n-d (self + 1).conj(), so we
+    // must multiply by the recipricol, layout of grad is preserved which is
+    // important to gradcheck
+    return grad * self_p1_conj.reciprocal_();
+  }
+  return grad / self_p1_conj;
 }
 
 Tensor sinc_backward(const Tensor& grad, const Tensor& self) {
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 2b5ca56a9b4ad..f643559fe34fb 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -9538,10 +9538,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    dtypes=all_types_and(torch.bool, torch.bfloat16),
                    dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16),
                    decorators=(precisionOverride({torch.bfloat16: 1e-1}),),
-                   skips=(
-                       DecorateInfo(unittest.skip("Skipped! sparse backward not supported"),
-                                    'TestSparseUnaryUfuncs', 'test_sparse_fn_grad'),
-                   ),
                    supports_forward_ad=True,
                    supports_fwgrad_bwgrad=True,
                    supports_sparse=True,

From 5d26cea85154f537390190229e73ee84dedecb98 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 4 Nov 2022 16:27:48 +0000
Subject: [PATCH 0589/1922] Support DDP ignored parameters in DDPOptimizer
 (#88460)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88460
Approved by: https://github.com/aazzolini
---
 test/distributed/test_dynamo_distributed.py | 111 +++++++++++++-------
 torch/_dynamo/eval_frame.py                 |   1 -
 torch/_dynamo/optimizations/distributed.py  |  35 ++++--
 torch/nn/parallel/distributed.py            |   6 ++
 4 files changed, 104 insertions(+), 49 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index daecc0092b78b..c08b090a2f947 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -6,6 +6,7 @@
 import numpy as np
 import torch
 import torch._dynamo
+from torch._dynamo.optimizations.distributed import DDPOptimizer
 import torch._dynamo.test_case
 import torch.distributed as dist
 from contextlib import contextmanager
@@ -35,11 +36,12 @@ def init_weights(m):
         m.bias.data.fill_(0.01)
 
 class ToyModel(nn.Module):
-    def __init__(self, in_feat=10, hidden_feat=5000, num_hidden=2, out_feat=5):
+    def __init__(self, in_feat=10, hidden_feat=5000, out_feat=5):
         super().__init__()
         self.net = nn.Sequential(
             *[nn.Linear(in_feat, hidden_feat), nn.ReLU()]
-            + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()] * num_hidden
+            + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()]
+            + [nn.Linear(hidden_feat, hidden_feat), nn.ReLU()]
             + [nn.Linear(hidden_feat, out_feat), nn.ReLU()]
         )
 
@@ -53,6 +55,43 @@ def get_model(device, bsz=20, in_feat=10, hidden_feat=5000, out_feat=5):
     outputs = m(inputs)
     return m, inputs, outputs
 
+def get_custom_model(device):
+    class MyCustomLinear(torch.nn.Module):
+        def __init__(self):
+            super(MyCustomLinear, self).__init__()
+            self.weight = nn.Parameter(torch.randn(512, 512))
+
+        def forward(self, x):
+            return torch.mm(x, self.weight.t())
+
+    class MyLinear(torch.nn.Module):
+        def __init__(self):
+            super(MyLinear, self).__init__()
+            self.linear = torch.nn.Linear(512, 512)
+
+        def forward(self, x):
+            return self.linear(x)
+
+    class MyModule(torch.nn.Module):
+        def __init__(self):
+            super(MyModule, self).__init__()
+            mods = [
+                (MyLinear(), torch.nn.ReLU()),
+                # sandwitch the custom in the middle so it comes before and after
+                (MyCustomLinear(), torch.nn.ReLU()),
+                (MyLinear(), torch.nn.ReLU()),
+            ]
+            self.seq = torch.nn.Sequential(*[x for items in mods for x in items])
+
+        def forward(self, x):
+            return self.seq(x)
+
+    m = MyModule().to(device)
+    m.apply(init_weights)
+    inputs = torch.rand((512, 512)).to(device)
+    correct_outputs = m(inputs)
+    return m, inputs, correct_outputs
+
 def get_hf_bert(rank):
     # Note: use @import_transformers_or_skip on your test case if you use this
     try:
@@ -325,41 +364,7 @@ def test_custom_layer(self):
         the user-provided compiler is called by the DDPOptimizer which is
         doing the graph splitting
         """
-
-        class MyCustomLinear(torch.nn.Module):
-            def __init__(self):
-                super(MyCustomLinear, self).__init__()
-                self.weight = nn.Parameter(torch.randn(512, 512))
-
-            def forward(self, x):
-                return torch.mm(x, self.weight.t())
-
-        class MyLinear(torch.nn.Module):
-            def __init__(self):
-                super(MyLinear, self).__init__()
-                self.linear = torch.nn.Linear(512, 512)
-
-            def forward(self, x):
-                return self.linear(x)
-
-        class MyModule(torch.nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-                mods = [
-                    (MyLinear(), torch.nn.ReLU()),
-                    # sandwitch the custom in the middle so it comes before and after
-                    (MyCustomLinear(), torch.nn.ReLU()),
-                    (MyLinear(), torch.nn.ReLU()),
-                ]
-                self.seq = torch.nn.Sequential(*[x for items in mods for x in items])
-
-            def forward(self, x):
-                return self.seq(x)
-
-        m = MyModule().to(self.device)
-        m.apply(init_weights)
-        inputs = torch.rand((512, 512)).to(self.device)
-        correct_outputs = m(inputs)
+        m, inputs, correct_outputs = get_custom_model(self.device)
         ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=1)
 
         check_splits_compiler = CheckSplitsCompiler()
@@ -386,6 +391,38 @@ def fn():
             pass
         self.assertEqual(res, 1)
 
+    @patch.object(config, "optimize_ddp", False)
+    def test_ignored_parameters(self):
+        """
+        Verifies ddp graph-split logic ignores parameters marked to ignore on DDP module.
+        Hooks up graph-split optimizer manually so it can peek at internal state.
+        """
+        m, inputs, correct_outputs = get_custom_model(self.device)
+        parameters_to_ignore = ["seq.2.weight", "seq.4.linear.bias"]
+        DDP._set_params_and_buffers_to_ignore_for_model(m, parameters_to_ignore)
+        ddp_m = DDP(m, device_ids=self.device_ids, bucket_cap_mb=25)
+        parameter_ids_to_ignore = [
+            id(ddp_m.module.get_parameter(p))
+            for p in ddp_m.parameters_to_ignore
+        ]
+
+        check_splits_compiler = CheckSplitsCompiler()
+        ddp_optimizer = DDPOptimizer(
+            bucket_bytes_cap=ddp_m.bucket_bytes_cap,
+            backend_compile_fn=check_splits_compiler.compile_fn
+        )
+
+        @torch._dynamo.optimize(ddp_optimizer.compile_fn)
+        def opt_fn(inputs):
+            return ddp_m(inputs)
+
+        opt_outputs = opt_fn(inputs)
+        self.assertTrue(same(correct_outputs, opt_outputs))
+        self.assertEqual(check_splits_compiler.compiler_called, 2)
+        for b in ddp_optimizer.buckets:
+            for p_id in b.param_ids:
+                self.assertFalse(p_id in parameter_ids_to_ignore)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 9f115481582b8..cd6aedee6045c 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -240,7 +240,6 @@ def catch_errors(frame, cache_size):
                 with compile_lock:
                     ddp_optimizer = DDPOptimizer(
                         bucket_bytes_cap=ddp_module.bucket_bytes_cap,
-                        parameters_to_ignore=ddp_module.parameters_to_ignore,
                         backend_compile_fn=callback._torchdynamo_orig_callable,
                     )
                     hijacked_callback = convert_frame.convert_frame(
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index bd3f07b635f3f..4a038a365e2ce 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -25,14 +25,18 @@ class Bucket:
     params: List[str] = field(default_factory=list)
     nodes: List[fx.Node] = field(default_factory=list)
 
+    # param_ids is just used for unit testing
+    param_ids: List = field(default_factory=list)
+
 
 def pretty_print_buckets(buckets: List[Bucket]):
     headers = ("Index", "Size (b)", "Param Names")
     rows = []
     for idx, bucket in enumerate(reversed(buckets)):
-        rows.append((idx, bucket.size, bucket.params[0]))
-        for param in bucket.params[1:]:
-            rows.append((None, None, param))
+        if len(bucket.params) > 0:
+            rows.append((idx, bucket.size, bucket.params[0]))
+            for param in bucket.params[1:]:
+                rows.append((None, None, param))
     try:
         from tabulate import tabulate
 
@@ -45,7 +49,6 @@ class DDPOptimizer:
     def __init__(
         self,
         bucket_bytes_cap: int,
-        parameters_to_ignore: List[str],
         backend_compile_fn,
         debug=False,
         first_bucket_cap: Optional[int] = None,
@@ -62,14 +65,19 @@ def __init__(
         assert (
             self.first_bucket_cap <= self.bucket_bytes_cap
         ), "First bucket should be smaller/equal to other buckets to get comms warmed up ASAP"
-        self.parameters_to_ignore = parameters_to_ignore
+
         self.backend_compile_fn = backend_compile_fn
         self.debug = debug
 
+    def _ignore_parameter(self, parameter):
+        return hasattr(parameter, "_ddp_ignored") and parameter._ddp_ignored
+
     def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
         """
-        TODO:
-        - handle params_and_buffers_to_ignore
+        Implements graph splitting, first determining a set of of buckets by counting
+        parameter sizes in reverse graph order, then invoking the user/backend compiler
+        to compile each subgraph. Finally, stiches compiled graphs into one graphmodule
+        and returns its callable.
         """
 
         # 1: compute the partition map according to DDP bucket logic
@@ -88,17 +96,22 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
             if node.op == "call_module":
                 target = gm.get_submodule(node.target)
                 for name, p in target.named_parameters():
-                    if p.requires_grad:
+                    param = target.get_parameter(name)
+                    if p.requires_grad and not self._ignore_parameter(param):
                         buckets[0].size += p.storage().nbytes()
-                        # TODO correct FQ name?
-                        buckets[0].params.append(f"{node}_{name}")
+                        buckets[0].params.append(f"{node.target}_{name}")
+                        buckets[0].param_ids.append(id(param))
             elif node.op == "get_attr":
                 maybe_param = getattr(gm, node.target)
-                if maybe_param.requires_grad:
+                if maybe_param.requires_grad and not self._ignore_parameter(
+                    maybe_param
+                ):
                     buckets[0].size += maybe_param.storage().nbytes()
                     buckets[0].params.append(node.target)
+                    buckets[0].param_ids.append(id(maybe_param))
 
             # All nodes have to be mapped to a bucket, even if they don't have their own params
+            # Ignored params still end up in buckets, we just don't count them towards the capacity
             buckets[0].nodes.append(node)
 
         # stash buckets for testing/debugging purposes
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index 30a20c86a1ac2..f3de5037f60d8 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -1797,6 +1797,12 @@ def _set_params_and_buffers_to_ignore_for_model(
         # during synchronization. It will be removed when the API is finalized
         # as part of addressing https://github.com/pytorch/pytorch/issues/43690.
         module._ddp_params_and_buffers_to_ignore = params_and_buffers_to_ignore
+        for name, param in module.named_parameters():
+            if name in params_and_buffers_to_ignore:
+                param._ddp_ignored = True
+        for name, buffer in module.named_buffers():
+            if name in params_and_buffers_to_ignore:
+                buffer._ddp_ignored = True
 
     def _get_ddp_logging_data(self):
         r"""

From 859e3c1de026c64221ff9da8c0393837117d34f6 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@meta.com>
Date: Fri, 4 Nov 2022 21:48:26 +0000
Subject: [PATCH 0590/1922] [BE] Update native_functions.yaml README; we do not
 support Tensor! (#88513)

Just a doc update to minimize confusion
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88513
Approved by: https://github.com/bdhirsh
---
 aten/src/ATen/native/README.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md
index c355423ea7501..651b21ae01863 100644
--- a/aten/src/ATen/native/README.md
+++ b/aten/src/ATen/native/README.md
@@ -47,10 +47,9 @@ signature.
   if one argument is a `FloatTensor`, all other arguments are checked
   to be `FloatTensor`s).
   `Tensor` or `Tensor?` must sometimes be annotated to indicate aliasing and mutability.
-  In general annotations can be defined via the following four situations:
-  - `Tensor(a)` - `a` is a set of Tensors that may alias to the same data.
+  In general annotations can be defined via the following situations:
+  - `Tensor(a)` - `a` is a set of Tensors that may alias to the same data. The set could have a size of one.
   - `Tensor(a!)` - members of `a` may be written to thus mutating the underlying data.
-  - `Tensor!` - shorthand for Tensor(fresh\_identifier!)
   - `Tensor(a! -> a|b)` - Tensor is in set `a`, written to, and after the write is in set `a` AND `b`.
   For more details on when and why this needs to happen, please see the section on annotations.
 - `Tensor[]`.  A `Tensor[]` argument translates into a C++ argument of type `ArrayRef<Tensor>`

From 26f7ff9b84cc489c41e2e02040b055cc39e5cb12 Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Fri, 4 Nov 2022 09:01:23 -0700
Subject: [PATCH 0591/1922] [Quant] Respect non_leaf_module_list for activation
 modules (#88498)

Summary: This commit fixes the bug where `non_leaf_module_list`
was not respected for activation modules like `torch.nn.Sigmoid`
and `torch.nn.Tanh`. Today, these modules default to
`default_fixed_qparams_range_0to1_fake_quant`, and there is no
way to configure them to use any other activation_post_process
(e.g. FixedQParamsObserver) (see this [mapping](https://github.com/pytorch/pytorch/blob/dc00bb51b8d370bf3891f0edb2c6e0c2914e329a/torch/ao/quantization/quantization_mappings.py#L188-L193)).
`non_leaf_module_list` is a "list of non-leaf modules we want
to add observer" (see prepare docstring). If the user explicitly
specified to insert observers for these modules, we should respect
that instead of continuing to use the default.

Test Plan:
python test/test_quantization.py TestQuantizeEagerPTQStatic.test_activations_in_non_leaf_module_list

Reviewers: vkuzo, jerryzh168

Subscribers: vkuzo, jerryzh168

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88498
Approved by: https://github.com/jerryzh168
---
 .../eager/test_quantize_eager_ptq.py          | 44 +++++++++++++++++++
 torch/ao/quantization/quantize.py             |  6 +--
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py
index d414d09bd5f2c..e0ad793df68a8 100644
--- a/test/quantization/eager/test_quantize_eager_ptq.py
+++ b/test/quantization/eager/test_quantize_eager_ptq.py
@@ -19,8 +19,10 @@
     float16_dynamic_qconfig,
     float_qparams_weight_only_qconfig,
     float_qparams_weight_only_qconfig_4bit,
+    FixedQParamsObserver,
     PerChannelMinMaxObserver,
     default_dynamic_quant_observer,
+    default_weight_observer,
     QConfig,
 )
 
@@ -1022,6 +1024,48 @@ def test_quantwrapper_attaches_qconfig_to_dequant(self):
         mq = torch.ao.quantization.convert(mp)
         self.assertTrue(isinstance(mq[0].dequant, nnq.DeQuantize))
 
+    def test_activations_in_non_leaf_module_list(self):
+        """
+        Ensure activations like `nn.Sigmoid` and `nn.Tanh` are properly handled in
+        `non_leaf_module_list`.
+        """
+        class MyModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.quant = QuantStub()
+                self.sigmoid = torch.nn.Sigmoid()
+                self.hardsigmoid = torch.nn.Hardsigmoid()
+                self.softmax = torch.nn.Softmax()
+                self.tanh = torch.nn.Tanh()
+                self.dequant = DeQuantStub()
+
+            def forward(self, x):
+                x = self.quant(x)
+                x = self.sigmoid(x)
+                x = self.hardsigmoid(x)
+                x = self.softmax(x)
+                x = self.tanh(x)
+                x = self.dequant(x)
+                return x
+
+        qconfig = QConfig(
+            activation=FixedQParamsObserver.with_args(scale=123.0, zero_point=0),
+            weight=default_weight_observer
+        )
+        m = MyModel()
+        m.qconfig = qconfig
+        m = prepare(m, observer_non_leaf_module_list=[
+            torch.nn.Sigmoid,
+            torch.nn.Hardsigmoid,
+            torch.nn.Softmax,
+            torch.nn.Tanh,
+        ])
+
+        # Should use the observer specified in the QConfig instead of the default (FixedQParamsFakeQuantize)
+        self.assertTrue(isinstance(m.sigmoid.activation_post_process, FixedQParamsObserver))
+        self.assertTrue(isinstance(m.hardsigmoid.activation_post_process, FixedQParamsObserver))
+        self.assertTrue(isinstance(m.softmax.activation_post_process, FixedQParamsObserver))
+        self.assertTrue(isinstance(m.tanh.activation_post_process, FixedQParamsObserver))
 
 @skipIfNoFBGEMM
 class TestQuantizeEagerPTQDynamic(QuantizationTestCase):
diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
index 69711aa370605..4e28b07084d5b 100644
--- a/torch/ao/quantization/quantize.py
+++ b/torch/ao/quantization/quantize.py
@@ -214,12 +214,12 @@ def insert_activation_post_process(m, special_act_post_process=None):
             # activation_post_process are now added directly to nn.Sequentail/_FusedModule
             if needs_observation(child):
                 insert_activation_post_process(child)
-        elif _has_special_act_post_process(child):
-            special_act_post_process = _get_special_act_post_process(child)
-            insert_activation_post_process(child, special_act_post_process)
         elif non_leaf_module_list is not None and type_before_parametrizations(child) in non_leaf_module_list:
             if needs_observation(child):
                 insert_activation_post_process(child)
+        elif _has_special_act_post_process(child):
+            special_act_post_process = _get_special_act_post_process(child)
+            insert_activation_post_process(child, special_act_post_process)
         elif needs_observation(child) and type_before_parametrizations(child) in custom_module_class_mapping:
             observed_child = custom_module_class_mapping[type_before_parametrizations(child)].from_float(child)
             setattr(module, name, observed_child)

From 85cd18ead6ba07d0ef413d0cc786fbd656d5f2c3 Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Fri, 4 Nov 2022 23:01:45 +0000
Subject: [PATCH 0592/1922] [QNNPACK] Add unaligned attributes where asan fails
 (#88276)

Summary: Bypass "Runtime error: store to misaligned address [...] for type 'uint16_t' (aka 'unsigned short'), which requires 2 byte alignment"

Test Plan:
One of the failing tests, now passes
`buck test fbsource//arvr/mode/platform010/dev-asan fbsource//arvr/libraries/eye/engine:sys_test_eyetrackingenginevisioninterface`

Reviewed By: kimishpatel, salilsdesai

Differential Revision: D40918376

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88276
Approved by: https://github.com/manuelcandales
---
 .../quantized/cpu/qnnpack/src/q8gemm/4x4c2-sse2.c    |  9 +++++----
 .../quantized/cpu/qnnpack/src/qnnpack/common.h       | 12 ++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x4c2-sse2.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x4c2-sse2.c
index 0b2da5a62bed5..398496e081156 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x4c2-sse2.c
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/4x4c2-sse2.c
@@ -327,14 +327,15 @@ void pytorch_q8gemm_ukernel_4x4c2__sse2(
         (uint32_t)_mm_cvtsi128_si32(_mm_unpackhi_epi32(vout, vout));
     *((uint32_t*)c3) = (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(vout, 12));
   } else {
+    typedef PYTORCH_QNNP_UNALIGNED uint16_t unaligned_uint16_t;
     if (nr >= 2) {
-      *((uint16_t*)c0) = (uint16_t)_mm_extract_epi16(vout, 0);
+      *((unaligned_uint16_t*)c0) = (uint16_t)_mm_extract_epi16(vout, 0);
       c0 += 2;
-      *((uint16_t*)c1) = (uint16_t)_mm_extract_epi16(vout, 2);
+      *((unaligned_uint16_t*)c1) = (uint16_t)_mm_extract_epi16(vout, 2);
       c1 += 2;
-      *((uint16_t*)c2) = (uint16_t)_mm_extract_epi16(vout, 4);
+      *((unaligned_uint16_t*)c2) = (uint16_t)_mm_extract_epi16(vout, 4);
       c2 += 2;
-      *((uint16_t*)c3) = (uint16_t)_mm_extract_epi16(vout, 6);
+      *((unaligned_uint16_t*)c3) = (uint16_t)_mm_extract_epi16(vout, 6);
       c3 += 2;
       vout = _mm_srli_epi32(vout, 16);
       nr -= 2;
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/common.h b/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/common.h
index 14bcc01d21ed0..fbfaa85904c78 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/common.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/common.h
@@ -80,3 +80,15 @@
 #if defined(_MSC_VER)
 #define __builtin_prefetch
 #endif
+
+#if defined(__GNUC__)
+  #define PYTORCH_QNNP_UNALIGNED __attribute__((__aligned__(1)))
+#elif defined(_MSC_VER)
+  #if defined(_M_IX86)
+    #define PYTORCH_QNNP_UNALIGNED
+  #else
+    #define PYTORCH_QNNP_UNALIGNED __unaligned
+  #endif
+#else
+  #error "Platform-specific implementation of PYTORCH_QNNP_UNALIGNED required"
+#endif

From efd9f58b40a8f921f229ab32535a763ef0578819 Mon Sep 17 00:00:00 2001
From: Mergen Nachin <mergennachin@gmail.com>
Date: Fri, 4 Nov 2022 13:03:00 -0700
Subject: [PATCH 0593/1922] Populate self.export in InstructionTranslatorBase
 (#88508)

Summary:

This is a followup to https://github.com/pytorch/pytorch/pull/88354/files#diff-622913fdb49db90d6f3a8ab225b4badb7996023e6498e9f7c6d03fe9f32d0986R836

Reference to self.export got added to InstructionTranslatorBase (i.e. STORE_ATTR) but self.export is populated only for InstructionTranslators.

Here's an example failure

```
   File "/scratch/williamwen/work/pytorch/torch/_dynamo/symbolic_convert.py", line 322, in step
    getattr(self, inst.opname)(inst)
  File "/scratch/williamwen/work/pytorch/torch/_dynamo/symbolic_convert.py", line 844, in STORE_ATTR
    not self.export
AttributeError: 'InliningInstructionTranslator' object has no attribute 'export'
```

Let's populate with the base class with export flag.

Test Plan:

python test/dynamo/test_export_mutations.py
python test/dynamo/test_export.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88508
Approved by: https://github.com/tugsbayasgalan
---
 test/dynamo/test_export_mutations.py | 15 +++++++++++++++
 torch/_dynamo/symbolic_convert.py    |  5 +++++
 2 files changed, 20 insertions(+)

diff --git a/test/dynamo/test_export_mutations.py b/test/dynamo/test_export_mutations.py
index f0689aff5028f..218935d3f8cb8 100644
--- a/test/dynamo/test_export_mutations.py
+++ b/test/dynamo/test_export_mutations.py
@@ -54,6 +54,21 @@ def forward(self, x):
 
         self.check_failure_on_export(Foo(), torch.Tensor(3, 2))
 
+    def test_module_attribute_mutation_violation_positive_4(self):
+        # Mutating attribute with an inline function
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def add(self, a, b):
+                return a + b
+
+            def forward(self, x):
+                self.a = self.add(1, 2) * self.add(3, 4)
+                return x.sum() + self.a
+
+        self.check_failure_on_export(Foo(), torch.Tensor(3, 2))
+
     def test_module_attribute_mutation_violation_negative_1(self):
         # Mutating attribute with a Tensor type inside __init__ but
         # not in forward()
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index b644b9ee439e0..e06f62a6bf628 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -1328,6 +1328,7 @@ def __init__(
         symbolic_locals: Dict[str, VariableTracker],
         symbolic_globals: Dict[str, VariableTracker],
         f_code: types.CodeType,
+        export: bool,
     ):
         super(InstructionTranslatorBase, self).__init__()
 
@@ -1357,6 +1358,8 @@ def __init__(
         self.exec_recorder = ExecutionRecorder(code=f_code, code_options=code_options)
         # Stack of module being parsed, current nn.module is at the end of ordered dict
         self.nn_module_stack: Dict[str, str] = {}
+        # Flag to indicate whether tracing is used for export.
+        self.export = export
 
         if fake_tensors_available:
             with torch._subclasses.FakeTensorMode(
@@ -1407,6 +1410,7 @@ def __init__(
             # A global var is inserted only after a STORE_GLOBAL happens to it
             symbolic_globals=collections.OrderedDict(),
             f_code=f_code,
+            export=export,
         )
         self.one_graph: bool = one_graph
         self.export = export
@@ -1634,6 +1638,7 @@ def __init__(
             instructions=cleaned_instructions(code),
             code_options={k: getattr(code, k) for k in dir(code)},
             f_code=code,
+            export=parent.export,
         )
         self.parent = parent
         self.symbolic_result = None

From c5661d7ed8c951bde3829dbd07af5d906feb8f67 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Fri, 4 Nov 2022 23:26:44 +0000
Subject: [PATCH 0594/1922] Fix minifier accuracy msg (#88515)

Fixes https://github.com/pytorch/torchdynamo/issues/1809

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88515
Approved by: https://github.com/yanboliang, https://github.com/williamwen42
---
 torch/_dynamo/convert_frame.py |  2 +-
 torch/_dynamo/debug_utils.py   | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index eb16a84d04f2e..f1ce83727a19f 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -244,7 +244,7 @@ def augment_exc_message(exc, msg="\n"):
         exc.inner_exception, "minifier_path"
     ):
         msg += (
-            f"\nMinifier script written to {exc.inner_exception.minifier_path}. Run"
+            f"\nMinifier script written to {exc.inner_exception.minifier_path}. Run "
             "this script to find the smallest traced graph which reproduces this error.\n"
         )
 
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index b1495789d2eae..f09991f9bf348 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -761,13 +761,19 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                 # Check Accuracy
                 compiled_gm = compiler_fn(gm, example_inputs, **kwargs)
                 if backend_accuracy_fails(gm, example_inputs, compiler_fn):
-                    log.warning("Accuracy failed for the TorchDyanmo produced graph")
+                    log.warning(
+                        "Accuracy failed for the TorchDyanmo produced graph. Creating script to minify the error."
+                    )
                     dump_to_minify_after_dynamo(
                         fx.GraphModule(gm, copy.deepcopy(gm.graph)),
                         example_inputs,
                         compiler_name,
                     )
-                    raise ValueError("Bad accuracy detected")
+                    exc = ValueError("Bad accuracy detected.")
+                    exc.minifier_path = os.path.join(
+                        minifier_dir(), "minifier_launcher.py"
+                    )
+                    raise exc
             else:
                 try:
                     compiled_gm = compiler_fn(gm, example_inputs, **kwargs)

From 2227246fa0e63d903b19fa23c3b481e3d08cf4bd Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Sat, 5 Nov 2022 00:17:15 +0000
Subject: [PATCH 0595/1922] [Dynamo] Improve BuiltinVariable log when incorrect
 arg count happens (#88409)

Fixes https://github.com/pytorch/torchdynamo/issues/1832

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88409
Approved by: https://github.com/mlazos
---
 torch/_dynamo/variables/builtin.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py
index cc64e009d094c..5a88f375c9c28 100644
--- a/torch/_dynamo/variables/builtin.py
+++ b/torch/_dynamo/variables/builtin.py
@@ -326,7 +326,10 @@ def call_function(
             try:
                 inspect.signature(handler).bind(tx, *args, **kwargs)
             except TypeError as exc:
-                log.warning(f"incorrect arg count {handler} {exc}")
+                if not has_constant_handler:
+                    log.warning(
+                        f"incorrect arg count {handler} {exc} and no constant handler"
+                    )
                 handler = None
 
         if handler:

From 226da26a8331a51570a71f585566e4dec13620bf Mon Sep 17 00:00:00 2001
From: jjsjann123 <alex.jann2012@gmail.com>
Date: Sat, 5 Nov 2022 02:22:27 +0000
Subject: [PATCH 0596/1922] nvprims native batch norm patch (#88455)

Cherry-picking: https://github.com/csarofeen/pytorch/pull/2104

- [x] Added explicit cast on inputs to nvprims.native_batch_norm. This avoids the explicit cast, which gives us issue on fusion definition.
- [x] add python repro with dynamo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88455
Approved by: https://github.com/mruberry, https://github.com/IvanYashchuk
---
 test/test_nvfuser_dynamo.py   | 19 +++++++++++
 torch/_prims/nvfuser_prims.py | 60 ++++++++++++++++++++++++++++++++---
 2 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/test/test_nvfuser_dynamo.py b/test/test_nvfuser_dynamo.py
index b0c28386185d4..749cae87411b9 100644
--- a/test/test_nvfuser_dynamo.py
+++ b/test/test_nvfuser_dynamo.py
@@ -45,6 +45,25 @@ def func(a, b):
         eager_result = func.__wrapped__(input1, input2)
         self.assertEqual(eager_result, nvfuser_result)
 
+    def test_batch_norm_implicit_dtype_promotion(self):
+        input1 = make_tensor((2, 3, 4, 5), device="cuda", dtype=torch.float32)
+        input2 = make_tensor((5, 5), device="cuda", dtype=torch.float32)
+        w = make_tensor((3), device="cuda", dtype=torch.float32)
+        b = make_tensor((3), device="cuda", dtype=torch.float32)
+
+        @torchdynamo.optimize("nvprims_nvfuser")
+        def func(mat1, mat2, w, b):
+            o = torch.matmul(mat1, mat2)
+            return torch.batch_norm(o, w, b, None, None, True, 1e-2, 1e-5, True)
+
+        # No warnings and no errors
+        with torch.cuda.amp.autocast():
+            with warnings.catch_warnings(record=True) as warning:
+                nvfuser_result = func(input1, input2, w, b)
+                self.assertEqual(len(warning), 0)
+            eager_result = func.__wrapped__(input1, input2, w, b)
+            self.assertEqual(eager_result, nvfuser_result)
+
     def test_dtype_correctness(self):
         input1 = make_tensor((2, 4, 8), device="cuda", dtype=torch.float16)
 
diff --git a/torch/_prims/nvfuser_prims.py b/torch/_prims/nvfuser_prims.py
index 391a7feee91b3..59a88203651e7 100644
--- a/torch/_prims/nvfuser_prims.py
+++ b/torch/_prims/nvfuser_prims.py
@@ -5,12 +5,13 @@
 # can be added in the future for the corresponding higher-level torch/aten
 # functions.
 
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Tuple
 
 import torch
 
 from torch._prims_common import (
     DimsSequenceType,
+    elementwise_dtypes,
     ELEMENTWISE_TYPE_PROMOTION_KIND,
     getnvFuserDtype,
     make_contiguous_strides_for,
@@ -19,6 +20,7 @@
 )
 
 from torch._prims_common.wrappers import (
+    _maybe_convert_to_dtype,
     backwards_not_supported,
     elementwise_type_promotion_wrapper,
 )
@@ -373,12 +375,60 @@ def _prim_impl(
         )
 
     nvprim_impl.impl(name, _prim_impl)
-    nvprim_autograd_impl.impl(
-        name, backwards_not_supported(torch.ops.nvprims.native_batch_norm.default)
-    )
-
     prim_packet = torch.ops.nvprims.native_batch_norm
     prim = prim_packet.default
+
+    def _native_batch_norm_ref(
+        input: torch.Tensor,
+        weight: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor],
+        running_mean: Optional[torch.Tensor],
+        running_var: Optional[torch.Tensor],
+        training: bool,
+        momentum: float,
+        eps: float,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
+        if torch._prims_common.is_complex_dtype(input.dtype):
+            raise NotImplementedError("Complex tensors are not supported")
+
+        # note: BN only promotes input to dtype of weight/bias, but keeps the same output dtype
+        result_dtype = input.dtype
+        computation_dtype, _ = elementwise_dtypes(
+            input,
+            weight,
+            bias,
+            type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH,
+        )
+
+        input_ = _maybe_convert_to_dtype(input, computation_dtype)
+        output, mean, rstd = prim(
+            input_, weight, bias, running_mean, running_var, training, momentum, eps
+        )
+        output_ = _maybe_convert_to_dtype(output, result_dtype)  # type: ignore[arg-type]
+        return (output_, mean, rstd)  # type: ignore[return-value]
+
+    def _native_batch_norm_autograd(
+        input: torch.Tensor,
+        weight: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor],
+        running_mean: Optional[torch.Tensor],
+        running_var: Optional[torch.Tensor],
+        training: bool,
+        momentum: float,
+        eps: float,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        # This wrapper is needed to convert prims calls inside
+        # _native_batch_norm_ref to nvprims calls
+        from torch._prims.context import NvfuserPrimsMode
+
+        with NvfuserPrimsMode():
+            return backwards_not_supported(_native_batch_norm_ref)(
+                input, weight, bias, running_mean, running_var, training, momentum, eps
+            )
+
+    nvprim_autograd_impl.impl(name, _native_batch_norm_autograd)
+
     for p in (prim_packet, prim):
         p.__doc__ = "Computes batch normalization."
         p.impl_nvfuser = _nvfuser_impls["native_batch_norm"]

From 00d2f28c1f12d3ccdab89a6239a1c7eaf560f5d7 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 4 Nov 2022 16:27:48 +0000
Subject: [PATCH 0597/1922] DDPOptimizer replace debug=True/False with using
 torchdynamo logger (#88480)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Example output:

```
2022-11-04 05:09:29,525] torch._dynamo.optimizations.distributed: [INFO]
DDPOptimizer bucket assignments
┌─────────┬────────────┬───────────────────┐
│   Index │   Size (b) │ Param Names       │
├─────────┼────────────┼───────────────────┤
│       0 │  100120020 │ self_net_6_weight │
├─────────┼────────────┼───────────────────┤
│         │            │ self_net_6_bias   │
├─────────┼────────────┼───────────────────┤
│         │            │ self_net_4_weight │
├─────────┼────────────┼───────────────────┤
│         │            │ self_net_4_bias   │
├─────────┼────────────┼───────────────────┤
│       1 │  100020000 │ self_net_2_weight │
├─────────┼────────────┼───────────────────┤
│         │            │ self_net_2_bias   │
├─────────┼────────────┼───────────────────┤
│       2 │     220000 │ self_net_0_weight │
├─────────┼────────────┼───────────────────┤
│         │            │ self_net_0_bias   │
└─────────┴────────────┴───────────────────┘
[2022-11-04 05:09:29,527] torch._dynamo.optimizations.distributed: [DEBUG]
---orig graph---
graph():
    %inputs : torch.Tensor [#users=1] = placeholder[target=inputs]
    %self_net_0 : [#users=1] = call_module[target=self_net_0](args = (%inputs,), kwargs = {})
    %self_net_1 : [#users=1] = call_module[target=self_net_1](args = (%self_net_0,), kwargs = {})
    %self_net_2 : [#users=1] = call_module[target=self_net_2](args = (%self_net_1,), kwargs = {})
    %self_net_3 : [#users=1] = call_module[target=self_net_3](args = (%self_net_2,), kwargs = {})
    %self_net_4 : [#users=1] = call_module[target=self_net_4](args = (%self_net_3,), kwargs = {})
    %self_net_5 : [#users=1] = call_module[target=self_net_5](args = (%self_net_4,), kwargs = {})
    %self_net_6 : [#users=1] = call_module[target=self_net_6](args = (%self_net_5,), kwargs = {})
    %self_net_7 : [#users=1] = call_module[target=self_net_7](args = (%self_net_6,), kwargs = {})
    return (self_net_7,)

---split graph---
graph():
    %inputs : torch.Tensor [#users=1] = placeholder[target=inputs]
    %submod_0 : [#users=1] = call_module[target=submod_0](args = (%inputs,), kwargs = {})
    %submod_1 : [#users=1] = call_module[target=submod_1](args = (%submod_0,), kwargs = {})
    %submod_2 : [#users=1] = call_module[target=submod_2](args = (%submod_1,), kwargs = {})
    return (submod_2,)

---submod_0 graph---
graph():
    %inputs : [#users=1] = placeholder[target=inputs]
    %self_net_0 : [#users=1] = call_module[target=self_net_0](args = (%inputs,), kwargs = {})
    %self_net_1 : [#users=1] = call_module[target=self_net_1](args = (%self_net_0,), kwargs = {})
    return self_net_1

---submod_1 graph---
graph():
    %self_net_1 : [#users=1] = placeholder[target=self_net_1]
    %self_net_2 : [#users=1] = call_module[target=self_net_2](args = (%self_net_1,), kwargs = {})
    %self_net_3 : [#users=1] = call_module[target=self_net_3](args = (%self_net_2,), kwargs = {})
    return self_net_3

---submod_2 graph---
graph():
    %self_net_3 : [#users=1] = placeholder[target=self_net_3]
    %self_net_4 : [#users=1] = call_module[target=self_net_4](args = (%self_net_3,), kwargs = {})
    %self_net_5 : [#users=1] = call_module[target=self_net_5](args = (%self_net_4,), kwargs = {})
    %self_net_6 : [#users=1] = call_module[target=self_net_6](args = (%self_net_5,), kwargs = {})
    %self_net_7 : [#users=1] = call_module[target=self_net_7](args = (%self_net_6,), kwargs = {})
    return self_net_7

---------------
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88480
Approved by: https://github.com/anj-s, https://github.com/davidberard98
---
 test/distributed/test_dynamo_distributed.py |  2 +
 torch/_dynamo/optimizations/distributed.py  | 63 ++++++++++-----------
 2 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index c08b090a2f947..3dd3c5de77253 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -1,4 +1,5 @@
 # Owner(s): ["module: dynamo"]
+import logging
 import os
 import random
 import unittest
@@ -217,6 +218,7 @@ def setUpClass(cls):
                 },
             )
         )
+        cls._exit_stack.enter_context(patch.object(config, "log_level", logging.DEBUG))
         cls.rank = 0
         cls.device = f"cuda:{cls.rank}"
         cls.device_ids = None if "cuda" in cls.device else [cls.rank]
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index 4a038a365e2ce..1af52219d4957 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -1,3 +1,4 @@
+import logging
 from dataclasses import dataclass, field
 from typing import Any, List, Optional
 
@@ -6,6 +7,8 @@
 from torch import fx
 from torch.fx.node import Node
 
+log = logging.getLogger(__name__)
+
 
 def args_str(args):
     # a debug helper
@@ -40,9 +43,14 @@ def pretty_print_buckets(buckets: List[Bucket]):
     try:
         from tabulate import tabulate
 
-        print(tabulate(rows, headers=headers, tablefmt="simple_grid"))
+        log.info(
+            "\nDDPOptimizer bucket assignments\n"
+            + tabulate(rows, headers=headers, tablefmt="simple_grid")
+        )
     except ImportError:
-        print("Please `pip install tabulate` in order to pretty-print ddp bucket sizes")
+        log.info(
+            "Please `pip install tabulate` in order to pretty-print ddp bucket sizes"
+        )
 
 
 class DDPOptimizer:
@@ -50,7 +58,6 @@ def __init__(
         self,
         bucket_bytes_cap: int,
         backend_compile_fn,
-        debug=False,
         first_bucket_cap: Optional[int] = None,
     ):
         if first_bucket_cap is not None:
@@ -67,7 +74,6 @@ def __init__(
         ), "First bucket should be smaller/equal to other buckets to get comms warmed up ASAP"
 
         self.backend_compile_fn = backend_compile_fn
-        self.debug = debug
 
     def _ignore_parameter(self, parameter):
         return hasattr(parameter, "_ddp_ignored") and parameter._ddp_ignored
@@ -116,11 +122,10 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
 
         # stash buckets for testing/debugging purposes
         self.buckets = buckets
-        if self.debug:
-            print(
-                f"DDPOptimizer used bucket cap {self.bucket_bytes_cap} and produced the following buckets:"
-            )
-            pretty_print_buckets(buckets)
+        log.info(
+            f"DDPOptimizer used bucket cap {self.bucket_bytes_cap} and produced the following buckets:"
+        )
+        pretty_print_buckets(buckets)
 
         if len(buckets) == 1:
             # bypass split/fuse logic if there is only one bucket
@@ -135,24 +140,23 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
         split_gm = fx.passes.split_module.split_module(
             gm, None, lambda node: partition_map[node]
         )
-        if self.debug:
-            print("---orig graph---")
-            print(str(gm.graph))
-            print("\n---split graph---")
-            print(str(split_gm.graph))
-            for name, module in split_gm.named_modules():
-                if "." not in name:
-                    # only print the submod graphs, not their children
-                    print(f"\n---{name} graph---")
-                    print(str(module.graph))
-            print("---------------")
+
+        debug_str = (
+            f"\n---orig graph---\n{gm.graph}\n"
+            + f"\n---split graph---\n{split_gm.graph}\n"
+        )
+        for name, module in split_gm.named_modules():
+            if "." not in name and len(name):
+                # only print the submod graphs, not their children
+                debug_str += f"\n---{name} graph---\n{module.graph}\n"
+        debug_str += "\n---------------\n"
+        log.debug(debug_str)
 
         # 3: compile each of the partitioned submodules using the user-provided compiler
         class SubmodCompiler(torch.fx.interpreter.Interpreter):
-            def __init__(self, module, compiler, debug=False):
+            def __init__(self, module, compiler):
                 super().__init__(module)
                 self.compiler = compiler
-                self.debug = debug
 
             def compile_submod(self, submod, args, kwargs):
                 """
@@ -195,8 +199,7 @@ def forward(self, *args):
             def run_node(self, n: Node) -> Any:
                 with fx_traceback.append_stack_trace(n.stack_trace):
                     args, kwargs = self.fetch_args_kwargs_from_env(n)
-                    if self.debug:
-                        print(f"run_node {n.op}, {n.target} got args {args_str(args)}")
+                    log.debug(f"run_node {n.op}, {n.target} got args {args_str(args)}")
                     assert isinstance(args, tuple)
                     assert isinstance(kwargs, dict)
 
@@ -204,10 +207,7 @@ def run_node(self, n: Node) -> Any:
                     # maybe this isn't sound in general, but only changing the target of a node might be ok?
                     if n.op == "call_module":
                         submod = self.fetch_attr(n.target)
-                        if self.debug:
-                            with open("debug_ddp_optimizer.log", "a") as dump_file:
-                                dump_file.write(f"\n---{n.target} graph---")
-                                dump_file.write(str(submod.graph))
+                        log.debug(f"\n---{n.target} graph---\n" + str(submod.graph))
                         compiled_submod = self.compile_submod(submod, args, kwargs)
                         self.module.delete_submodule(n.target)
                         n.target = "compiled_" + n.target
@@ -215,13 +215,10 @@ def run_node(self, n: Node) -> Any:
                     # then we execute the modified node using the usual logic
                     return getattr(self, n.op)(n.target, args, kwargs)
 
-        submod_compiler = SubmodCompiler(split_gm, self.backend_compile_fn, self.debug)
+        submod_compiler = SubmodCompiler(split_gm, self.backend_compile_fn)
         submod_compiler.run(*example_inputs)
         split_gm.recompile()
 
-        if self.debug:
-            print("\n---final graph---")
-            print(str(split_gm.graph))
-            print("---------------")
+        log.debug("\n---final graph---\n" + str(split_gm.graph) + "\n---------------\n")
 
         return split_gm

From 158269729787a805faaa1cbf10ca4c1a2e168eb5 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 4 Nov 2022 21:00:01 +0000
Subject: [PATCH 0598/1922] Add single-process DDP accuracy support to dynamo
 benchmark suite (#88511)

- does not intend to support multi-process, as that is more complex
  and we have torchbench scripts for that
- currently only works in accuracy mode as this was the main goal,
  but could be extended for measuring single-gpu perf impact of
  graph breaks

Run with

`python benchmarks/dynamo/torchbench.py --inductor --training --accuracy --only hf_Bert --ddp`

Example output
```
cuda train hf_Bert
[2022-11-04 18:52:08,304] torch._inductor.compile_fx: [WARNING] skipping cudagraphs due to complex input striding
PASS
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88511
Approved by: https://github.com/davidberard98, https://github.com/aazzolini
---
 benchmarks/dynamo/common.py | 76 +++++++++++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 8 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index dcbdfa6778bbd..b41085d099a2f 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -13,6 +13,7 @@
 import sys
 import time
 import warnings
+from contextlib import contextmanager
 
 import numpy as np
 import pandas as pd
@@ -20,6 +21,7 @@
 
 import torch._dynamo
 import torch._dynamo.utils
+import torch.distributed
 from microbenchmarks.operator_inp_utils import OperatorInputsMode
 from scipy.stats import gmean, ttest_ind
 from torch._dynamo.optimizations import backends
@@ -30,6 +32,7 @@
 from torch._inductor import config as inductor_config
 from torch._inductor.utils import fresh_inductor_cache
 from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils._pytree import tree_map
 
 try:
@@ -787,6 +790,24 @@ def inner(*args, **kwargs):
     return inner
 
 
+@contextmanager
+def maybe_init_distributed(should_init_distributed, port="6789", rank=0, world_size=1):
+    # To avoid multiple inheritance from _dynamo.test_case.TestCase and MultiProcessTestCase,
+    # Just manually implement the most important part of the dynamo behavior to reset/clear.
+    try:
+        if should_init_distributed:
+            torch.cuda.set_device(rank)
+            os.environ["MASTER_ADDR"] = "localhost"
+            os.environ["MASTER_PORT"] = port
+            torch.distributed.init_process_group(
+                "nccl", rank=rank, world_size=world_size
+            )
+        yield
+    finally:
+        if should_init_distributed:
+            torch.distributed.destroy_process_group()
+
+
 def xla_wrapper(model_iter_fn):
     """
     Wrap the model_iter_fn to run the model on XLA devices.
@@ -1045,12 +1066,18 @@ def record_status(accuracy_status):
         if name in self.skip_accuracy_checks_large_models_dashboard:
             return record_status("pass_due_to_skip")
 
+        def deepcopy_and_maybe_ddp(model):
+            model = copy.deepcopy(model)
+            if self.args.ddp:
+                model = DDP(model)
+            return model
+
         # Collect the fp64 reference outputs to be used later for accuracy checking.
         fp64_outputs = None
         try:
             fp64_outputs = self.run_n_iterations(
                 *cast_to_fp64(
-                    copy.deepcopy(model),
+                    deepcopy_and_maybe_ddp(model),
                     clone_inputs(example_inputs),
                 )
             )
@@ -1062,20 +1089,19 @@ def record_status(accuracy_status):
 
         # Cast the model to float16/float32 as necessary
         model, example_inputs = self.maybe_cast(model, example_inputs)
-
         accuracy_status = "pass"
 
         with self.pick_grad(name, self.args.training):
             # Get results of native pytorch
             reset_rng_state()
             correct_result = self.run_n_iterations(
-                copy.deepcopy(model), clone_inputs(example_inputs)
+                deepcopy_and_maybe_ddp(model), clone_inputs(example_inputs)
             )
 
             # Rerun native pytorch
             reset_rng_state()
             correct_rerun_result = self.run_n_iterations(
-                copy.deepcopy(model), clone_inputs(example_inputs)
+                deepcopy_and_maybe_ddp(model), clone_inputs(example_inputs)
             )
             if not same(
                 correct_result,
@@ -1092,7 +1118,10 @@ def record_status(accuracy_status):
             torch._dynamo.reset()
             try:
                 optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
-                new_result = optimized_model_iter_fn(model, example_inputs)
+
+                new_result = optimized_model_iter_fn(
+                    deepcopy_and_maybe_ddp(model), example_inputs
+                )
             except Exception as e:
                 accuracy_status = "fail_to_run"
                 print(
@@ -1363,6 +1392,21 @@ def parse_args():
         action="store_true",
         help="Performs training",
     )
+    parser.add_argument(
+        "--ddp",
+        action="store_true",
+        help="Wraps model in DDP before running it, and uses dynamo DDPOptmizer (graph breaks) by default.",
+    )
+    parser.add_argument(
+        "--no-optimize-ddp",
+        action="store_true",
+        help="Disables dynamo DDPOptimizer (graph breaks). (Applies only when using --ddp benchmark mode).",
+    )
+    parser.add_argument(
+        "--distributed-master-port",
+        default="6789",
+        help="Port to bind for for torch.distributed.  Use the default unless it's conflicting with another user",
+    )
     parser.add_argument(
         "--dynamic-shapes",
         action="store_true",
@@ -1529,9 +1573,12 @@ def parse_args():
 
 def main(runner, original_dir=None):
     args = parse_args()
-    return maybe_fresh_cache(run, args.cold_start_latency and args.only)(
-        runner, args, original_dir
-    )
+    with maybe_init_distributed(
+        args.ddp and args.only, port=args.distributed_master_port
+    ):
+        return maybe_fresh_cache(run, args.cold_start_latency and args.only)(
+            runner, args, original_dir
+        )
 
 
 def run(runner, args, original_dir=None):
@@ -1557,6 +1604,19 @@ def run(runner, args, original_dir=None):
                 if args.training
                 else CI_SKIP_INDCUTOR_INFERENCE
             )
+    if args.ddp:
+        # TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf,
+        # but just to measure impact on singlenode of performing graph-breaks.
+        # Left it as a follow up to keep this PR isolated.
+        assert (
+            args.accuracy
+        ), "DDP benchmark is currently only hooked up to --accuracy bench"
+        assert args.training, "DDP benchmark requires --training mode"
+        if args.no_optimize_ddp:
+            torch._dynamo.config.optimize_ddp = False
+        else:
+            # TODO(whc) after enabling DDPOptimizer by default this could be removed or assert
+            torch._dynamo.config.optimize_ddp = True
 
     if args.accuracy:
         # Use small batch size. We use >1 batch size to ensure we test

From 0340aa39922042a71ecd8976358c40aac292f7e1 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 4 Nov 2022 22:05:21 +0000
Subject: [PATCH 0599/1922] Add docstring to DDPOptimizer (#88521)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88521
Approved by: https://github.com/aazzolini
---
 torch/_dynamo/optimizations/distributed.py | 53 ++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index 1af52219d4957..bde786979fcfe 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -54,6 +54,59 @@ def pretty_print_buckets(buckets: List[Bucket]):
 
 
 class DDPOptimizer:
+    """
+    DDPOptimizer applies when dynamo compiles models wrapped in DistributedDataParallel (DDP),
+    breaking the dynamo graph into chunks to compile separately, with the breaks aligning to
+    the boundaries of gradient-allreduce buckets chosen by DDP.
+
+    Background/Motivation
+     - DDP uses allreduce collectives to synchronize partial gradients computed on different workers
+     - DDP groups gradient allreduces into 'buckets' to optimize communication efficiency of all-reduce
+     - Parameters grouped into buckets are assumed to be adjacent in time, so they become ready
+       at around the same time during backward and thus can share the same allreduce efficently
+     - Allreduces must overlap with backward compute for optimal training performance
+     - DDP schedules allreduces using 'hooks' fired from the c++ autograd engine in pytorch, which
+       operates when individual grads become 'ready'
+     - Dynamo+AOTAutograd produces a single fused graph that runs 'atomically' from the perspective of the
+       autograd engine, such that all gradients become 'ready' at the same time.  Hooks fire after the whole
+       fused backward function executes, preventing any overlap of compute and communication
+
+    Algorithm
+     - DDPOptimizer starts off with an FX graph traced by dynamo which represents forward.  It can traverse
+       this graph in reverse order to determine the true order that gradients will become ready during backward.
+     - Parameter sizes are counted in reverse order, up to a bucket size limit, at which point a new bucket is started
+       and a graph break introduced
+     - Each of the subgraphs is compiled by the compiler provided to dynamo by the user, and then fused back together
+       into an outer module that is returned to the user
+
+    Notes
+     - It would be better to enforce (by adding an API to DDP) that the bucket splits chosen here are used by DDP,
+       and that DDP does not need to detect or optimize bucket order by observing execution at runtime, as it does
+       in eager.
+     - If Dynamo can't capture a whole graph for the portion of the model wrapped by DDP, this algorithm will currently
+       produce splits that do not necessarily align with the buckets used by DDP.  This should result in performance
+       degradation approaching the baseline case where graph-splits are not used, but not worse.
+     - If the backend compiler fails to compile a single subgraph, it will execute eagerly despite the rest of the
+       subgraphs being compiled
+     - DDP has a 'parameters_and_buffers_to_ignore' field, which DDPOptimizer attempts to honor by reading markers
+       left by DDP on individual parameters.  In cases where other transformations, such as reparameterization, are
+       also used, the ignore markers could be lost.  If DDPOptimizer fails to ignore a parameter ignored by DDP,
+       it is not catastrophic but could impact performance by choosing sub-optimal bucket splits.
+     - DDPOptimizer always ignores all buffers, regardless of their ignore flag, since buffers do not require gradients,
+       and therefore aren't allreduced by DDP.  (They are broadcast during forward, but this is not covered by
+       DDPOptimizer)
+
+    Args:
+        bucket_bytes_cap (int): Controls the size of buckets, in bytes, used to determine graphbreaks.  Should be
+            set to match the equivalent parameter on the original DDP module.
+
+        backend_compile_fn (callable): A dynamo compiler function, to be invoked to compile each subgraph.
+
+        first_bucket_cap (int): Controls the size of the first bucket.  Should match DDP's first bucket cap.  DDP
+            special-cases the first bucket size since it is sometimes optimal to start a small allreduce early.
+
+    """
+
     def __init__(
         self,
         bucket_bytes_cap: int,

From 744f00b48d85e3e54ae627e419e37cfdec558bf6 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Fri, 4 Nov 2022 05:01:27 +0000
Subject: [PATCH 0600/1922] fx.replace_pattern accepts pattern/replacement as
 GraphModule (#88479)

Symbolic tracer is no longer the default tracer to produce fx graph.
SubgraphRewriter should thus accept a raw GraphModule, rather than use symbolic tracer by default.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88479
Approved by: https://github.com/jerryzh168
---
 ...t-fx_backcompat_function_signatures.expect |  2 +-
 torch/fx/subgraph_rewriter.py                 | 28 +++++++++++++------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
index 9c2a5d7b762f8..7bdd777ad4512 100644
--- a/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
+++ b/test/expect/TestFXAPIBackwardCompatibility.test_function_back_compat-fx_backcompat_function_signatures.expect
@@ -71,4 +71,4 @@ torch.fx.proxy.TracerBase.iter(self, obj: 'Proxy') -> Iterator
 torch.fx.proxy.TracerBase.keys(self, obj: 'Proxy') -> Any
 torch.fx.proxy.TracerBase.proxy(self, node: torch.fx.node.Node) -> 'Proxy'
 torch.fx.proxy.TracerBase.to_bool(self, obj: 'Proxy') -> bool
-torch.fx.subgraph_rewriter.replace_pattern(gm: torch.fx.graph_module.GraphModule, pattern: Callable, replacement: Callable) -> List[torch.fx.subgraph_rewriter.Match]
+torch.fx.subgraph_rewriter.replace_pattern(gm: torch.fx.graph_module.GraphModule, pattern: Union[Callable, torch.fx.graph_module.GraphModule], replacement: Union[Callable, torch.fx.graph_module.GraphModule]) -> List[torch.fx.subgraph_rewriter.Match]
diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py
index 72bb7fd373516..c46de13cc9859 100644
--- a/torch/fx/subgraph_rewriter.py
+++ b/torch/fx/subgraph_rewriter.py
@@ -5,7 +5,7 @@
 from ._compatibility import compatibility
 
 import copy
-from typing import Callable, Dict, List, NamedTuple, Optional, Set
+from typing import Callable, Dict, List, NamedTuple, Optional, Set, Union
 import torch
 
 __all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filters']
@@ -65,7 +65,11 @@ def try_get_submodule(mod: torch.nn.Module, target: str) -> Optional[torch.nn.Mo
 
 
 @compatibility(is_backward_compatible=True)
-def replace_pattern(gm: GraphModule, pattern: Callable, replacement: Callable) -> List[Match]:
+def replace_pattern(
+    gm: GraphModule,
+    pattern: Union[Callable, GraphModule],
+    replacement: Union[Callable, GraphModule]
+) -> List[Match]:
     """
     Matches all possible non-overlapping sets of operators and their
     data dependencies (``pattern``) in the Graph of a GraphModule
@@ -187,8 +191,8 @@ def forward(self, x, w1, w2):
 @compatibility(is_backward_compatible=False)
 def replace_pattern_with_filters(
     gm: GraphModule,
-    pattern: Callable,
-    replacement: Callable,
+    pattern: Union[Callable, GraphModule],
+    replacement: Union[Callable, GraphModule],
     match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]],  # type: ignore[name-defined]
 ) -> List[Match]:
     """
@@ -205,8 +209,8 @@ def replace_pattern_with_filters(
 
 def _replace_pattern(
     gm: GraphModule,
-    pattern: Callable,
-    replacement: Callable,
+    pattern: Union[Callable, GraphModule],
+    replacement: Union[Callable, GraphModule],
     match_filters: List[Callable[["InternalMatch", Graph, Graph], bool]] = None  # type: ignore[name-defined]
 ) -> List[Match]:
 
@@ -217,8 +221,16 @@ def _replace_pattern(
 
     # Get the graphs for `gm`, `pattern`, `replacement`
     original_graph: Graph = gm.graph
-    pattern_graph: Graph = symbolic_trace(pattern).graph
-    replacement_graph: Graph = symbolic_trace(replacement).graph
+
+    if isinstance(pattern, GraphModule):
+        pattern_graph = pattern.graph
+    else:
+        pattern_graph = symbolic_trace(pattern).graph
+
+    if isinstance(replacement, GraphModule):
+        replacement_graph = replacement.graph
+    else:
+        replacement_graph = symbolic_trace(replacement).graph
 
     matcher = SubgraphMatcher(pattern_graph, match_output=False, match_placeholder=False,
                               remove_overlapping_matches=True)

From cfd8a7843a9c57d9a70cf4b0e34d191fb723e6a5 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 4 Nov 2022 12:31:51 -0700
Subject: [PATCH 0601/1922] Functionalize and compute joint simultaneously.
 (#88063)

This also comes with some bug fixes that were uncovered from doing
this:

- Forward device calls to inner tensor in FunctionalTensorWrapper

- Make legacyExtractDispatchKey exclude Functionalize, so that
  it can get at the real device type key.  This is noncontroversial.

- Stop stripping dense from key set.  The reason for this is
  FunctionalWrapperTensor may be used in contexts where people
  query if it is dense or not.  If it doesn't report this correctly
  (from the dispatch key), it will cause errors.  This caused some
  torchbench models to fail when I did one-pass tracing.

- Save and restore reapply views TLS correctly

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88063
Approved by: https://github.com/bdhirsh
---
 aten/src/ATen/FunctionalTensorWrapper.cpp |  17 +---
 aten/src/ATen/FunctionalTensorWrapper.h   |   1 +
 aten/src/ATen/ThreadLocalState.cpp        |   6 +-
 aten/src/ATen/ThreadLocalState.h          |   2 +
 c10/core/DispatchKeySet.h                 |   5 +-
 c10/core/TensorImpl.h                     |   8 +-
 functorch/_src/aot_autograd.py            | 119 ++++++++++++++++++----
 test/functorch/test_aotdispatch.py        |  18 +++-
 test/test_functionalization.py            |  27 +++--
 9 files changed, 148 insertions(+), 55 deletions(-)

diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp
index b7b1c51e83b11..2c3a12020eb68 100644
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@@ -37,22 +37,10 @@ void FunctionalTensorWrapper::set_constructor_metadata() {
   // Functorch transforms all have their own wrapper tensors (e.g. BatchedTensorImpl) which expect
   // to participate in the functorch transforms.
   key_set_ = key_set_ - c10::functorch_transforms_ks - c10::python_ks;
-  // For better error handling,
-  // we also don't want our wrapper tensor to be able to dispatch directly
-  // to a backend kernel.
-  // Dispatching directly to e.g. a CPU kernel would always segfault,
-  // because wrapper tensors don't have any real data.
-  // (This should never happen because we should always hit a functionalization kernel,
-  // but can help make bugs less nasty).
-  // Here, we defensively remove any backend keys from the wrapper's keyset.
-  // We don't want to remove actual backend bits though (say we're redispatching to autograd;
-  // we need to know if we're dispatching to AutogradCPU or AutogradXLA).
-  // Instead, it's sufficient to remove the `Dense` dispatch key,
-  // which prevents us from accidentally trying to directly run a CPU/CUDA kernel.
-  key_set_ = key_set_.remove(c10::DispatchKey::Dense);
   // We override a bunch of _custom(), so make sure they get called
   // TODO: metadata copying may not actually be necessary then
   set_custom_sizes_strides(SizesStridesPolicy::CustomSizes);
+  set_custom_device(true);
 }
 
 FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& value)
@@ -335,6 +323,9 @@ c10::intrusive_ptr<TensorImpl> FunctionalTensorWrapper::shallow_copy_and_detach(
       std::move(version_counter), allow_tensor_metadata_change);
 }
 
+c10::Device FunctionalTensorWrapper::device_custom() const {
+  return value_.unsafeGetTensorImpl()->device();
+}
 at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
   return value_.unsafeGetTensorImpl()->sizes();
 }
diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h
index 27a88f13f8722..0762fb1f7f9b0 100644
--- a/aten/src/ATen/FunctionalTensorWrapper.h
+++ b/aten/src/ATen/FunctionalTensorWrapper.h
@@ -148,6 +148,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
   c10::SymInt sym_size_custom(int64_t d) const override;
   c10::SymIntArrayRef sym_strides_custom() const override;
   c10::SymInt sym_storage_offset_custom() const override;
+  c10::Device device_custom() const override;
 
  private:
   const char* tensorimpl_type_name() const override;
diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp
index 16c0aa42232f8..5c8214b7d8829 100644
--- a/aten/src/ATen/ThreadLocalState.cpp
+++ b/aten/src/ATen/ThreadLocalState.cpp
@@ -6,6 +6,7 @@
 
 #include <ATen/record_function.h>
 #include <ATen/SavedTensorHooks.h>
+#include <ATen/FunctionalTensorWrapper.h>
 
 namespace at {
 
@@ -15,7 +16,8 @@ ThreadLocalState::ThreadLocalState()
       functorch_tls_(functorch::getCopyOfFuncTorchTLS()),
       autograd_tls_(c10::AutogradState::get_tls_state()),
       python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()),
-      python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()) {
+      python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()),
+      functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()) {
   rf_tls_ = at::get_record_function_tls_();
 
   saved_tensors_default_hooks_state_ = at::SavedTensorDefaultHooks::get_tls_state();
@@ -53,6 +55,8 @@ void ThreadLocalState::setThreadLocalState(
   c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_);
 
   functorch::setFuncTorchTLS(state.functorch_tls_);
+
+  at::functionalization::impl::setFunctionalizationReapplyViewsTLS(state.functionalization_reapply_views_state_);
 }
 
 } // namespace at
diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h
index 9e5f70a4224f3..0184cc9b82c47 100644
--- a/aten/src/ATen/ThreadLocalState.h
+++ b/aten/src/ATen/ThreadLocalState.h
@@ -74,6 +74,8 @@ class TORCH_API ThreadLocalState {
   // TLS for saved tensors default hooks
   at::impl::SavedTensorDefaultHooksTLS saved_tensors_default_hooks_state_;
 
+  bool functionalization_reapply_views_state_;
+
   friend class ThreadLocalStateGuard;
 };
 
diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h
index 33916492a0ef5..6a0be21f8fe07 100644
--- a/c10/core/DispatchKeySet.h
+++ b/c10/core/DispatchKeySet.h
@@ -876,7 +876,10 @@ static inline DispatchKey legacyExtractDispatchKey(DispatchKeySet s) {
   // treatment;
   return (s - autograd_dispatch_keyset_with_ADInplaceOrView -
           autocast_dispatch_keyset -
-          DispatchKeySet({DispatchKey::PythonTLSSnapshot, DispatchKey::Python}))
+          DispatchKeySet(
+              {DispatchKey::Functionalize,
+               DispatchKey::PythonTLSSnapshot,
+               DispatchKey::Python}))
       .highestPriorityTypeId();
 }
 
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index a360a65d42a37..27d65e2d86739 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -1307,7 +1307,13 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    * It can be expanded as needed in the future, e.g sparse Tensor.
    */
   inline bool support_as_strided() const {
-    return is_nested() ? false : device().supports_as_strided();
+    if (is_nested()) {
+      return false;
+    }
+    if (key_set_.has(DispatchKey::Functionalize)) {
+      return false;
+    }
+    return device().supports_as_strided();
   }
 
   // ~~~~~ Autograd API ~~~~~
diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index e0c9d09d821d6..d682d8b4b71b9 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -18,7 +18,6 @@
 from torch.nn.utils import stateless
 
 from functorch import make_fx
-from functorch.experimental import functionalize
 from torch._dispatch.python import enable_python_dispatcher
 from . import config
 from .named_members_polyfill import _named_buffers, _named_parameters
@@ -133,7 +132,99 @@ def posthook(grad_input, grad_output):
         node.register_hook(get_posthook(special_stack))
 
 
-def create_joint_forward_backward(fn):
+# This is a version of functionalization that is specifically designed
+# for the AOTAutograd use case.  It might be generally applicable though
+# (if so, move it out of this file), so I've tried to give it a name that
+# describes what it does.
+#
+# Given a function f, it produces a new function g that:
+#
+#   - Detaches all inputs before running f; the inner function
+#     does not directly participate in any pre-existing autograd.
+#     preserve_requires_grad is provided as a convenience to set the
+#     requires_grad on the new detached leaves in sync with the originals.
+#     (NB: In principle, you could backward through the pure operations
+#     produced by functionalization; this is not used for AOTAutograd
+#     and we have not tested it.)
+#
+#   - Functionalizes all operations on f, under the assumption that the passed
+#     in function f must be "observationally pure"; that is, it cannot perform any
+#     mutations (inplace data or view operations) on the passed in inputs, nor is
+#     it allowed to directly close over tensors that aren't passed via its
+#     arguments.  See
+#     https://docs.google.com/document/d/19UoIh_SVrMy_b2Sx5ZaeOJttm6P0Qmyss2rdBuyfoic/edit
+#     for discussion how how to implement the more complicated case.
+#
+# Unlike functorch's variant, this doesn't use the functorch level system,
+# instead it directly uses PyTorch's conventional dispatcher to hit the
+# functionalization key.  In particular, this means that FunctionalTensorWrapper
+# can have autograd data stored directly on it.
+#
+# In typical AOTAutograd usage, the dispatch key order will look like:
+#
+#   Autograd - Functionalization ~~~~> Proxy Mode - Fake Tensor
+#       outer tensor                        inner tensor
+#
+# TODO: Provide a faster version of this that assumes flat arguments
+# (so no pytree necessary)
+def detach_and_functionalize_pure(f, preserve_requires_grad=True):
+    @wraps(f)
+    def inner(*args, **kwargs):
+        def to_fun(t):
+            if isinstance(t, Tensor):
+                r = torch._to_functional_tensor(t)
+                # NB: r is a leaf; it has no grad_fn relating
+                # it to t.  If t has autograd metadata, that
+                # metadata was preserved *inside* the r wrapper
+                if preserve_requires_grad:
+                    r.requires_grad = t.requires_grad
+                return r
+            else:
+                return t
+
+        f_args, f_kwargs = pytree.tree_map(to_fun, (args, kwargs))
+
+        torch._enable_functionalization(reapply_views=True)
+        try:
+            outs = f(*f_args, **f_kwargs)
+        finally:
+            torch._disable_functionalization()
+
+        # Detect input mutation and error if found
+        flat_args, _ = pytree.tree_flatten((args, kwargs))
+        flat_f_args, _ = pytree.tree_flatten((f_args, f_kwargs))
+
+        # This is just for sanity checking, can be skipped
+        for arg, f_arg in zip(flat_args, flat_f_args):
+            if not isinstance(arg, Tensor):
+                continue
+            torch._sync(f_arg)
+            new_arg = torch._from_functional_tensor(f_arg)
+            # I want to do this assert, but it is annoying because
+            # we have operator tests that have mutating inputs.  So
+            # I do something unsound instead
+            # assert arg is new_arg, "input argument was mutated, this is not valid"
+            if arg is not new_arg:
+                assert arg.shape == new_arg.shape
+                arg.copy_(new_arg)
+
+        def from_fun(t):
+            if not isinstance(t, Tensor) or not torch._is_functional_tensor(t):
+                return t
+            torch._sync(t)
+            return torch._from_functional_tensor(t)
+
+        return pytree.tree_map(from_fun, outs)
+    return inner
+
+
+# This creates a joint forwards-backwards function given both
+# the primals (to run forwards) and tangents (to run backwards).
+#
+# It has a precondition which is that the passed in function
+# must be observationally pure; it is not permitted to mutate
+# the primals or tangents.
+def create_joint_forward_backward_pure(fn):
     def joint_forward_backward(
         primals: List[Any], tangents: List[Any]
     ) -> Tuple[List[Any], List[Any]]:
@@ -366,7 +457,7 @@ def add_dupe_args(args):
 
     deduped_flat_args = remove_dupe_args(flat_args)
 
-    joint_forward_backward = create_joint_forward_backward(lambda *args: flat_fn(*add_dupe_args(args)))
+    joint_forward_backward = create_joint_forward_backward_pure(lambda *args: flat_fn(*add_dupe_args(args)))
 
     out = flat_fn(*flat_args)
     # Collect info on which output tensors require gradients,
@@ -392,27 +483,13 @@ def add_dupe_args(args):
 
     if config.use_functionalize:
         with enable_python_dispatcher():
-            # Trace once without decompositions, into a graph of ATen ops.
-            # NB: tracing_mode is real, as it's assumed the calling context setup
-            # fake tensor mode / symbolic shapes if that is needed
-            fx_g = make_fx(joint_forward_backward)(*joint_inputs)
-
-            context = disable_autocast_manager if disable_amp else nullcontext
-
-            def fake_fn(primals, tangents):
-                with torch.fx.traceback.override_stack_trace():
-                    return torch.fx.Interpreter(fx_g).run(primals, tangents)
-
-            # Trace a second time, running functionalization, and THEN running decompositions.
-            # functionalization only acts on ATen today, and doesn't currently handle
-            # view and inplace ops that come from primtorch.
-            # Eventually, functionalization should support primtorch view/inplace ops,
-            # which will make it ok to run decompositions before functionalization.
-            with context():
-                fx_g = make_fx(functionalize(fake_fn), aot_config.decompositions)(*joint_inputs)
+            fx_g = make_fx(
+                detach_and_functionalize_pure(joint_forward_backward), aot_config.decompositions
+            )(*joint_inputs)
         fx_g.graph.eliminate_dead_code()
         fx_g.recompile()
     else:
+        warnings.warn("graph partitioning without functionalization is not sound, we may introduce errors")
         fx_g = make_fx(joint_forward_backward, aot_config.decompositions)(*joint_inputs)
 
     if config.debug_joint:
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index d8d330b4f3fc9..2b457cd05c6cd 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -416,6 +416,21 @@ def f(x, y):
 
         self.assertEqual(ref_out, test_out)
 
+    def test_custom_autograd(self):
+        class CustomFn(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, x):
+                return x.clone()
+
+            @staticmethod
+            def backward(ctx, grad_output):
+                return grad_output + 1
+
+        def f(x):
+            return CustomFn.apply(x)
+
+        self.verify_aot_autograd(f, [torch.randn(3)])
+
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA is unavailable")
     def test_autocast_disable_guard(self):
         guard = torch._C._DisableAutocast()
@@ -1099,12 +1114,10 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('masked.var', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=to...
     xfail('matmul', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decompo...
-    xfail('max', 'reduction_no_dim'),  # aten.logical_or_.default - couldn't find symbolic meta function/dec...
     xfail('max', 'reduction_with_dim'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('median', ''),  # could not find kernel
     xfail('meshgrid', 'list_of_tensors'),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('meshgrid', 'variadic_tensors'),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('min', 'reduction_no_dim'),  # aten.logical_or_.default - couldn't find symbolic meta function/dec...
     xfail('min', 'reduction_with_dim'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('mode', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('msort', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1116,7 +1129,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
 
     # Deleting this in a followup
     xfail('nn.functional.feature_alpha_dropout', 'with_train'),
-    xfail('nn.functional.pad', 'circular'),
     xfail('nn.functional.poisson_nll_loss', ''),
 
     xfail('nn.functional._scaled_dot_product_attention', ''),  # Cannot call sizes() on tensor with symbolic ...
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index bfb79675c7eb0..c6c3d991771ba 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -141,26 +141,23 @@ def g(x):
 def forward(self, a_1):
     view_copy = torch.ops.aten.view_copy.default(a_1, [1, 1024, 128, 128]);  a_1 = None
     clone = torch.ops.aten.clone.default(view_copy);  view_copy = None
-    view_copy_1 = torch.ops.aten.view_copy.default(clone, [16, 64, 128, 128]);  clone = None
+    view_copy_1 = torch.ops.aten.view_copy.default(clone, [16, 64, 128, 128])
     relu = torch.ops.aten.relu.default(view_copy_1);  view_copy_1 = None
+    view_copy_2 = torch.ops.aten.view_copy.default(clone, [16, 64, 128, 128]);  clone = None
     sum_1 = torch.ops.aten.sum.default(relu)
     ones_like = torch.ops.aten.ones_like.default(sum_1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False, memory_format = torch.preserve_format);  sum_1 = None
     expand_copy = torch.ops.aten.expand_copy.default(ones_like, [16, 64, 128, 128]);  ones_like = None
-    new_zeros = torch.ops.aten.new_zeros.default(expand_copy, [16777216])
-    as_strided_copy = torch.ops.aten.as_strided_copy.default(new_zeros, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0)
-    as_strided_copy_1 = torch.ops.aten.as_strided_copy.default(new_zeros, [1, 1024, 128, 128], [16777216, 16384, 128, 1], 0)
-    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(new_zeros, expand_copy, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0);  new_zeros = expand_copy = None
-    as_strided_copy_2 = torch.ops.aten.as_strided_copy.default(as_strided_scatter, [1, 1024, 128, 128], [16777216, 16384, 128, 1], 0);  as_strided_scatter = None
-    new_empty_strided = torch.ops.aten.new_empty_strided.default(as_strided_copy_2, [1, 1024, 128, 128], [16777216, 16384, 128, 1])
-    as_strided_copy_3 = torch.ops.aten.as_strided_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0)
-    as_strided_copy_4 = torch.ops.aten.as_strided_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0)
-    clone_1 = torch.ops.aten.clone.default(as_strided_copy_4, memory_format = torch.contiguous_format);  as_strided_copy_4 = None
+    _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(expand_copy, [1, 1024, 128, 128], [16777216, 16384, 128, 1]);  expand_copy = None
+    new_empty_strided = torch.ops.aten.new_empty_strided.default(_reshape_alias_copy, [1, 1024, 128, 128], [16777216, 16384, 128, 1])
+    view_copy_3 = torch.ops.aten.view_copy.default(_reshape_alias_copy, [16, 64, 128, 128])
+    view_copy_4 = torch.ops.aten.view_copy.default(_reshape_alias_copy, [16, 64, 128, 128])
+    clone_1 = torch.ops.aten.clone.default(view_copy_4, memory_format = torch.contiguous_format);  view_copy_4 = None
     threshold_backward = torch.ops.aten.threshold_backward.default(clone_1, relu, 0);  clone_1 = relu = None
-    _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(as_strided_copy_2, [16, 64, 128, 128], [1048576, 16384, 128, 1])
-    detach_copy = torch.ops.aten.detach_copy.default(_reshape_alias_copy);  _reshape_alias_copy = None
-    as_strided_scatter_1 = torch.ops.aten.as_strided_scatter.default(as_strided_copy_2, threshold_backward, [16, 64, 128, 128], [1048576, 16384, 128, 1], 0);  as_strided_copy_2 = threshold_backward = None
-    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(as_strided_scatter_1, [16, 64, 128, 128], [1048576, 16384, 128, 1]);  as_strided_scatter_1 = None
-    detach_copy_1 = torch.ops.aten.detach_copy.default(_reshape_alias_copy_1);  _reshape_alias_copy_1 = None
+    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(_reshape_alias_copy, [16, 64, 128, 128], [1048576, 16384, 128, 1]);  _reshape_alias_copy = None
+    detach_copy = torch.ops.aten.detach_copy.default(_reshape_alias_copy_1);  _reshape_alias_copy_1 = None
+    view_copy_5 = torch.ops.aten.view_copy.default(threshold_backward, [1, 1024, 128, 128]);  threshold_backward = None
+    _reshape_alias_copy_2 = torch.ops.aten._reshape_alias_copy.default(view_copy_5, [16, 64, 128, 128], [1048576, 16384, 128, 1]);  view_copy_5 = None
+    detach_copy_1 = torch.ops.aten.detach_copy.default(_reshape_alias_copy_2);  _reshape_alias_copy_2 = None
     return detach_copy_1
     """)  # noqa: B950
 

From 4fcc22da1527080185159e892526604f2b73f74e Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Fri, 4 Nov 2022 23:11:17 +0000
Subject: [PATCH 0602/1922] [Reland] Fix primTorch
 compute_elementwise_output_strides (#88525)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88525
Approved by: https://github.com/desertfire
---
 test/test_meta.py               | 35 ---------------------------------
 torch/_prims_common/__init__.py | 24 +++++++++++++---------
 2 files changed, 15 insertions(+), 44 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index 88644a6552b1b..ef25d184c8428 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -295,55 +295,20 @@ def test_tensor_outlives_converter(self):
     aten._fft_r2c.default,
     aten._linalg_svd.default,
     aten._scaled_dot_product_attention_forward.default,
-    aten.add.Tensor,
-    aten.atan2.default,
     aten.binary_cross_entropy.default,
-    aten.bitwise_and.Tensor,
-    aten.bitwise_left_shift.Tensor,
-    aten.bitwise_or.Tensor,
-    aten.bitwise_right_shift.Tensor,
-    aten.bitwise_xor.Tensor,
-    aten.clamp_max.Tensor,
-    aten.clamp_min.Tensor,
     aten.complex.default,
     aten.copysign.Tensor,
     aten.div.Tensor_mode,
-    aten.div.Tensor,
-    aten.eq.Tensor,
     aten.floor_divide.default,
-    aten.fmax.default,
-    aten.fmin.default,
-    aten.fmod.Tensor,
-    aten.gcd.default,
-    aten.ge.Tensor,
-    aten.gt.Tensor,
     aten.heaviside.default,
-    aten.hypot.default,
-    aten.igamma.default,
-    aten.igammac.default,
-    aten.lcm.default,
-    aten.le.Tensor,
     aten.lerp.Scalar,
     aten.lerp.Tensor,
     aten.logical_and.default,
     aten.logical_or.default,
     aten.logical_xor.default,
-    aten.lt.Tensor,
-    aten.maximum.default,
-    aten.minimum.default,
-    aten.mul.Tensor,
-    aten.ne.Tensor,
-    aten.nextafter.default,
     aten.pow.Scalar,
-    aten.pow.Tensor_Scalar,
-    aten.pow.Tensor_Tensor,
     aten.prelu.default,
-    aten.remainder.Tensor,
-    aten.rsub.Tensor,
     aten.special_xlog1py.default,
-    aten.special_zeta.default,
-    aten.sub.Tensor,
-    aten.where.self,
     aten.xlogy.Tensor,
 
     # channel_last and channel_last_3d related failures
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index ee4dd38a655c6..90777ed6601aa 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -360,7 +360,7 @@ def compute_elementwise_output_strides(*tensors) -> Tuple[int, ...]:
 
     shape = tensors[0].shape
 
-    def _cmp(idx_a, idx_b):
+    def should_swap(idx_a, idx_b):
         for tensor in tensors:
             stride_a = tensor.stride()[idx_a]
             stride_b = tensor.stride()[idx_b]
@@ -378,24 +378,30 @@ def _cmp(idx_a, idx_b):
             if shape[idx_a] > shape[idx_b]:
                 return 1
 
-            # NOTE: this case is missing in the C++ impl
-            if shape[idx_a] < shape[idx_b]:
-                return -1
-
         # Note: this case is hit if all strides are zero,
         # or all strides are equal and all dimensions have the same length
         return 0
 
-    perm = tuple(range(ndim))
-    perm = tuple(sorted(perm, key=cmp_to_key(_cmp), reverse=True))
+    perm = list(reversed(range(ndim)))
+
+    # insertion sort with support for ambiguous comparisons
+    for i in range(1, ndim):
+        dim1 = i
+        for dim0 in reversed(range(i)):
+            comparison = should_swap(perm[dim0], perm[dim1])
+            if comparison > 0:
+                perm[dim0], perm[dim1] = perm[dim1], perm[dim0]
+                dim1 = dim0
+            elif comparison < 0:
+                break
 
     permuted_shape = [-1] * ndim
-    for idx, x in enumerate(perm):
+    for idx, x in enumerate(reversed(perm)):
         permuted_shape[idx] = shape[x]
 
     new_strides = make_contiguous_strides_for(permuted_shape)
     permuted_strides = [-1] * ndim
-    for idx, x in enumerate(perm):
+    for idx, x in enumerate(reversed(perm)):
         permuted_strides[x] = new_strides[idx]
 
     return tuple(permuted_strides)

From d1cdb5bdbdfadf25141f3be777c73d4519bfcd8d Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 2 Nov 2022 16:39:49 -0400
Subject: [PATCH 0603/1922] Handle case when candidate is empty (#88359)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88359
Approved by: https://github.com/wconstab
---
 torch/fx/graph.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/fx/graph.py b/torch/fx/graph.py
index 4fdd64f900a92..5a25d5100f68d 100644
--- a/torch/fx/graph.py
+++ b/torch/fx/graph.py
@@ -140,6 +140,9 @@ def create_name(self, candidate: str, obj: Optional[Any]) -> str:
         # delete all characters that are illegal in a Python identifier
         candidate = self._illegal_char_regex.sub('_', candidate)
 
+        if not candidate:
+            candidate = '_unnamed'
+
         if candidate[0].isdigit():
             candidate = f'_{candidate}'
 

From 520c52fbaa3ed57853d7a3ded3cd43a4b465178a Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Thu, 3 Nov 2022 18:33:14 +0000
Subject: [PATCH 0604/1922] Reenable optimizer overlap tests (#88439)

Closes https://github.com/pytorch/pytorch/issues/73259. Not sure the root cause but CI seems fine with these tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88439
Approved by: https://github.com/awgu
---
 torch/testing/_internal/distributed/distributed_test.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 8c44cc0482cc4..6923a6160ea6e 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -4494,8 +4494,7 @@ def _test_ddp_hook_with_optimizer_parity(
                     dist.barrier()
 
         @sandcastle_skip_if(
-            BACKEND == "nccl" or BACKEND == "ucc",
-            "Issues with async error handling, see https://github.com/pytorch/pytorch/issues/73259"
+            BACKEND == "ucc",
         )
         @skip_if_lt_x_gpu(2)
         @parametrize("grad_as_bucket_view", [True, False])
@@ -4521,8 +4520,7 @@ def test_ddp_hook_with_optimizer_parity_adamw(
             )
 
         @sandcastle_skip_if(
-            BACKEND == "nccl" or BACKEND == "ucc",
-            "Issues with async error handling, see https://github.com/pytorch/pytorch/issues/73259"
+            BACKEND == "ucc",
         )
         @skip_if_lt_x_gpu(2)
         @parametrize("optimize_subset", [True, False])
@@ -4541,8 +4539,7 @@ def test_ddp_hook_with_optimizer_parity_adam(self, optimize_subset):
             )
 
         @sandcastle_skip_if(
-            BACKEND == "nccl" or BACKEND == "ucc",
-            "Issues with async error handling, see https://github.com/pytorch/pytorch/issues/73259"
+            BACKEND == "ucc",
         )
         @skip_if_lt_x_gpu(2)
         @parametrize("optimize_subset", [True, False])

From 8f73fbd25cf451ecb98adb9cac403f598394c68f Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Sat, 5 Nov 2022 08:31:02 +0000
Subject: [PATCH 0605/1922] Upstream apply_optim_in_backward from TorchRec
 (#87397) (#88539)

Summary:

Upstreaming this as part of sharing common APIs. This is just a plain
move, any changes needed to support DDP / FSDP will come in follow up diffs.

Test Plan: CI

Reviewed By: zhaojuanmao

Differential Revision: D40564646

fbshipit-source-id: 619c434e02196812f8d4db1e40d07290e08b18f9
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88539
Approved by: https://github.com/awgu
---
 .../optim/test_apply_optimizer_in_backward.py | 113 ++++++++++++++++++
 torch/distributed/optim/__init__.py           |   1 +
 .../optim/apply_optimizer_in_backward.py      |  78 ++++++++++++
 3 files changed, 192 insertions(+)
 create mode 100644 test/distributed/optim/test_apply_optimizer_in_backward.py
 create mode 100644 torch/distributed/optim/apply_optimizer_in_backward.py

diff --git a/test/distributed/optim/test_apply_optimizer_in_backward.py b/test/distributed/optim/test_apply_optimizer_in_backward.py
new file mode 100644
index 0000000000000..344d8c81a18cd
--- /dev/null
+++ b/test/distributed/optim/test_apply_optimizer_in_backward.py
@@ -0,0 +1,113 @@
+# Owner(s): ["oncall: distributed"]
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import unittest
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+
+from torch.distributed.optim import _apply_optimizer_in_backward
+
+# TODO (rohan-varma): Add FSDP & DDP tests once supported
+
+def _validate_params(params_list, fn):
+    ref_params = params_list[0]
+    for param_list in params_list[1:]:
+        for p1, p2 in zip(ref_params, param_list):
+            fn(p1, p2)
+
+
+class ApplyOverlappedOptimizerTest(unittest.TestCase):
+
+    def _run_training_loop_and_validate(self, inp, models, optimizers):
+        for i in range(6):
+            for model in models:
+                model(inp).sum().backward()
+            for opt in optimizers:
+                opt.step()
+
+            with self.subTest(i):
+                _validate_params(
+                    [model.parameters() for model in models],
+                    torch.testing.assert_allclose,
+                )
+
+            for opt in optimizers:
+                opt.zero_grad(set_to_none=True)
+
+    def _test_apply_optimizer_in_backward(self, share_params) -> None:
+        weight_optimizer_kwargs = {"lr": 1.0}
+        bias_optimizer_kwargs = {"lr": 0.5}
+        model = nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 10))
+        if share_params:
+            model[0].weight = model[1].weight
+
+        # Use different optimizers for weights & biases.
+        weights = [m.weight for m in model]
+        biases = [m.bias for m in model]
+        optim_weight = torch.optim.SGD(weights, **weight_optimizer_kwargs)
+        optim_bias = torch.optim.SGD(biases, **bias_optimizer_kwargs)
+        model_with_opt_in_bwd = deepcopy(model)
+
+        # Apply different optimizer in backwards for weights and biases.
+        _apply_optimizer_in_backward(
+            torch.optim.SGD,
+            [m.weight for m in model_with_opt_in_bwd],
+            optimizer_kwargs=weight_optimizer_kwargs
+        )
+
+        _apply_optimizer_in_backward(
+            torch.optim.SGD,
+            [m.bias for m in model_with_opt_in_bwd],
+            optimizer_kwargs=bias_optimizer_kwargs
+        )
+
+        _validate_params(
+            [
+                model.parameters(),
+                model_with_opt_in_bwd.parameters(),
+            ],
+            torch.testing.assert_allclose,
+        )
+
+        self._run_training_loop_and_validate(
+            torch.randn(4, 10),
+            [model, model_with_opt_in_bwd],
+            [optim_weight, optim_bias],
+        )
+
+    def test_apply_optimizer_in_backward(self) -> None:
+        self._test_apply_optimizer_in_backward(share_params=False)
+
+    def test_apply_optimizer_in_backward_shared_params(self) -> None:
+        self._test_apply_optimizer_in_backward(share_params=True)
+
+    def test_multiple_optim_for_params(self) -> None:
+        model = nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 10))
+        opt_0_kwargs = {"lr": 0.03}
+        opt_1_kwargs = {"lr": 0.01}
+        opt_0 = torch.optim.SGD(model.parameters(), **opt_0_kwargs)
+        opt_1 = torch.optim.SGD(model.parameters(), **opt_1_kwargs)
+        model_with_opt_in_bwd = deepcopy(model)
+        _apply_optimizer_in_backward(
+            torch.optim.SGD,
+            model_with_opt_in_bwd.parameters(),
+            optimizer_kwargs=opt_0_kwargs,
+        )
+        _apply_optimizer_in_backward(
+            torch.optim.SGD,
+            model_with_opt_in_bwd.parameters(),
+            optimizer_kwargs=opt_1_kwargs,
+        )
+        self._run_training_loop_and_validate(
+            torch.randn(4, 10),
+            [model, model_with_opt_in_bwd],
+            [opt_0, opt_1],
+        )
diff --git a/torch/distributed/optim/__init__.py b/torch/distributed/optim/__init__.py
index 7f5086a7a83bc..950222b8d5fa8 100644
--- a/torch/distributed/optim/__init__.py
+++ b/torch/distributed/optim/__init__.py
@@ -17,6 +17,7 @@
 from .functional_rprop import _FunctionalRprop
 from .functional_adamax import _FunctionalAdamax
 from .utils import as_functional_optim
+from .apply_optimizer_in_backward import _apply_optimizer_in_backward
 
 
 # DistributedOptimizer imports torch.distributed.rpc names, so gate availability
diff --git a/torch/distributed/optim/apply_optimizer_in_backward.py b/torch/distributed/optim/apply_optimizer_in_backward.py
new file mode 100644
index 0000000000000..ff72f28e6be1e
--- /dev/null
+++ b/torch/distributed/optim/apply_optimizer_in_backward.py
@@ -0,0 +1,78 @@
+from typing import Any, Dict, Iterable, Type, List, no_type_check
+
+import torch
+
+__all__: List[str] = []
+
+@no_type_check
+def _apply_optimizer_in_backward(
+    optimizer_class: Type[torch.optim.Optimizer],
+    params: Iterable[torch.nn.Parameter],
+    optimizer_kwargs: Dict[str, Any],
+) -> None:
+    """
+    Upon ``backward()``, parameters will fire the corresponding optimizer.
+
+    Note - gradients for these parameters will be set to None after ``backward()``.
+    This means that any other (non applied) optimizer over this parameter will be
+    a no-op.
+
+    Args:
+        optimizer_class: (Type[torch.optim.Optimizer]): Optimizer to apply to parameter
+        params: (Iterator[nn.Parameter]): parameters to apply optimizer state to
+        optimizer_kwargs: (Dict[str, Any]): kwargs to pass to optimizer constructor
+
+    Example::
+        params_generator = model.parameters()
+        param_1 = next(params_generator)
+        remainder_params = list(params_generator)
+
+        apply_optimizer_in_backward(torch.optim.SGD, [param_1], {"lr": .02})
+        apply_optimizer_in_backward(torch.optim.Adam, remainder_params, {"lr": .04})
+
+        model(...).sum().backward() # after backward, parameters will already
+        # have their registered optimizer applied.
+
+    """
+
+    @no_type_check
+    def _apply_optimizer_in_backward_to_param(param: torch.nn.Parameter) -> None:
+        # view_as creates a node in autograd graph that allows us access to the
+        # parameter's AccumulateGrad autograd function object. We register a
+        # hook on this object to fire the optimizer when the gradient for
+        # this parameter is ready (has been accumulated into .grad field)
+
+        # Don't create a new acc_grad if we already have one
+        # i.e.f or shared parameters or attaching multiple optimizers to a param.
+        if not hasattr(param, 'acc_grad'):
+            acc_grad = param.view_as(param).grad_fn.next_functions[0][0]
+        else:
+            acc_grad = param._acc_grad
+
+        optimizer = optimizer_class([param], **optimizer_kwargs)
+
+        # Keep the grad accumulator around for the lifetime of the Tensor,
+        # store it on the param to avoid uncollectable ref-cycle
+        if not hasattr(param, 'acc_grad'):
+            param._acc_grad = acc_grad  # type: ignore[attr-defined]
+
+        if not hasattr(param, '_in_backward_optimizers'):
+            param._in_backward_optimizers = []  # type: ignore[attr-defined]
+            # TODO: investigate whether we really need these attributes.
+            param._optimizer_classes = []  # type: ignore[attr-defined]
+            param._optimizer_kwargs = []  # type: ignore[attr-defined]
+
+        param._in_backward_optimizers.append(optimizer)  # type: ignore[attr-defined]
+        param._optimizer_classes.append(optimizer_class)  # type: ignore[attr-defined]
+        param._optimizer_kwargs.append(optimizer_kwargs)  # type: ignore[attr-defined]
+
+        def optimizer_hook(*_unused) -> None:
+            for opt in param._in_backward_optimizers:  # type: ignore[attr-defined]
+                opt.step()
+
+            param.grad = None
+
+        param._acc_grad.register_hook(optimizer_hook)  # type: ignore[attr-defined]
+
+    for param in params:
+        _apply_optimizer_in_backward_to_param(param)

From 3283c6d165110530a57b20d50839fd3169c7bb46 Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Fri, 4 Nov 2022 11:50:18 +0100
Subject: [PATCH 0606/1922] Add error inputs to `gaussian_nll_loss` `OpInfo`
 (#88486)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88486
Approved by: https://github.com/lezcano
---
 test/test_nn.py                                | 14 --------------
 .../_internal/common_methods_invocations.py    | 18 ++++++++++++++++++
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 13036ef18740f..f704c530f92d9 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -6144,20 +6144,6 @@ def test_poisson_nll_loss_reduction_modes(self):
         with self.assertRaisesRegex(ValueError, 'is not valid'):
             F.poisson_nll_loss(input, target, reduction='total')
 
-    def test_gaussian_nll_loss_reduction_modes(self):
-        input = torch.tensor([[0.5, 1.5, 2.5], [2., 4., 6.]])
-        target = torch.tensor([[1., 2., 3.], [4., 5., 6.]])
-        var = torch.tensor([[0.5, 1., 1.5], [1., 1.5, 2.]])
-        component_wise_loss = 0.5 * (torch.log(var) + (input - target)**2 / var)
-        self.assertEqual(component_wise_loss,
-                         F.gaussian_nll_loss(input, target, var, reduction='none'))
-        self.assertEqual(torch.sum(component_wise_loss),
-                         F.gaussian_nll_loss(input, target, var, reduction='sum'))
-        self.assertEqual(torch.mean(component_wise_loss),
-                         F.gaussian_nll_loss(input, target, var, reduction='mean'))
-        with self.assertRaisesRegex(ValueError, 'is not valid'):
-            F.gaussian_nll_loss(input, target, var, reduction='total')
-
     def test_gaussian_nll_loss_broadcasting(self):
         input = torch.tensor([[0.5, 1.5, 2.5], [2., 4., 6.]])
         target_full = torch.tensor([[1., 2., 3.], [1., 2., 3.]])
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f643559fe34fb..4cfdbd4114eca 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -6890,6 +6890,23 @@ def gen_shape_kwargs():
     for input, target, var, kwargs in gen_shape_kwargs():
         yield SampleInput(input, args=(target, var, ), kwargs=kwargs)
 
+def error_inputs_gaussian_nll_loss(op_info, device, **kwargs):
+    _make = partial(make_tensor, device=device, dtype=torch.float32)
+
+    # invalid reduction value
+    yield ErrorInput(SampleInput(_make(10, 2, 3), _make(10, 2, 3), _make((10, 2, 3), low=0), reduction="abc"),
+                     error_type=ValueError, error_regex="abc is not valid")
+
+    # var is of incorrect shape
+    yield ErrorInput(SampleInput(_make(10, 2, 3), _make(10, 2, 3), _make((10, 2, 2), low=0)),
+                     error_type=ValueError, error_regex="var is of incorrect size")
+
+    # target is of incorrect shape
+    yield ErrorInput(SampleInput(_make(10, 2, 3), _make(10, 2, 2), _make((10, 2, 3), low=0)),
+                     error_type=RuntimeError,
+                     error_regex=(r"The size of tensor a \(3\) must match the size of tensor b \(2\) "
+                                  r"at non-singleton dimension 2"))
+
 def _generate_sample_inputs_nn_loss(op_info, device, dtype, requires_grad, **kwargs):
     _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
 
@@ -16193,6 +16210,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         supports_forward_ad=True,
         supports_fwgrad_bwgrad=True,
         sample_inputs_func=sample_inputs_gaussian_nll_loss,
+        error_inputs_func=error_inputs_gaussian_nll_loss,
         skips=(
             # Pre-existing condition (calls .item); needs to be fixed
             DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_backward'),

From c74a8f14ef636350194a387c63b2176996027734 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sun, 6 Nov 2022 02:29:53 +0000
Subject: [PATCH 0607/1922] Revert "Reenable optimizer overlap tests (#88439)"

This reverts commit da452bcadbc6f34989c6b3b0db6075a272aa9891.

Reverted https://github.com/pytorch/pytorch/pull/88439 on behalf of https://github.com/huydhn due to This change breaks trunk due to a land race missing reason parameter to sandcastle_skip_if https://hud.pytorch.org/pytorch/pytorch/commit/da452bcadbc6f34989c6b3b0db6075a272aa9891
---
 torch/testing/_internal/distributed/distributed_test.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 6923a6160ea6e..8c44cc0482cc4 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -4494,7 +4494,8 @@ def _test_ddp_hook_with_optimizer_parity(
                     dist.barrier()
 
         @sandcastle_skip_if(
-            BACKEND == "ucc",
+            BACKEND == "nccl" or BACKEND == "ucc",
+            "Issues with async error handling, see https://github.com/pytorch/pytorch/issues/73259"
         )
         @skip_if_lt_x_gpu(2)
         @parametrize("grad_as_bucket_view", [True, False])
@@ -4520,7 +4521,8 @@ def test_ddp_hook_with_optimizer_parity_adamw(
             )
 
         @sandcastle_skip_if(
-            BACKEND == "ucc",
+            BACKEND == "nccl" or BACKEND == "ucc",
+            "Issues with async error handling, see https://github.com/pytorch/pytorch/issues/73259"
         )
         @skip_if_lt_x_gpu(2)
         @parametrize("optimize_subset", [True, False])
@@ -4539,7 +4541,8 @@ def test_ddp_hook_with_optimizer_parity_adam(self, optimize_subset):
             )
 
         @sandcastle_skip_if(
-            BACKEND == "ucc",
+            BACKEND == "nccl" or BACKEND == "ucc",
+            "Issues with async error handling, see https://github.com/pytorch/pytorch/issues/73259"
         )
         @skip_if_lt_x_gpu(2)
         @parametrize("optimize_subset", [True, False])

From c921736f7d48c0c5bdf488f778d0c1c7966a42e6 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Thu, 3 Nov 2022 16:22:50 +0000
Subject: [PATCH 0608/1922] [inductor] Add lowering for as_strided_scatter
 (#88379)

Ref pytorch/torchdynamo#327

The use of as_strided does require in-memory manipulations, however this
 lowering allows those memory ops to be fused with any preceding calculations.
e.g.

```
def f(a, b):
    return torch.as_strided_scatter(
        a * 8 + 10,
        b * 2 - 4,
        size=(a.numel() // 2,),
        stride=(2,))
```

Before this compiles to two kernels and a call to `aten.as_strided_scatter` and
with this PR it compiles to just two kernels and no additional operator calls.

In theory I think this could be a decomposition, but in practice I saw the
`output_view.copy_(src)` being optimized out in some cases when this was
implemented as a decomposition.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88379
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 12 ++++++++++++
 torch/_inductor/lowering.py         |  9 ++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 0c778c254ab10..9054279c91a5b 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -3466,6 +3466,18 @@ def fn(x):
 
         self.common(fn, [torch.randn(64, 64)])
 
+    def test_as_strided_scatter(self):
+        def fn(a, b):
+            return aten.as_strided_scatter(
+                a * 8 + 10,
+                b * 2 - 4,
+                size=(a.shape[0], a.shape[1] // 2),
+                stride=(a.shape[1], 2),
+                storage_offset=0,
+            )
+
+        self.common(fn, [torch.randn(10, 1024), torch.randn(10, 512)])
+
     def test_select_scatter(self):
         def fn(x, a, b):
             return (
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 029d59f8615e9..71f038b231259 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -1143,7 +1143,6 @@ def inner_fn(index):
 # TODO(jansel): we should implement decomps or lowerings for these
 # https://github.com/pytorch/torchdynamo/issues/327
 make_fallback(aten._adaptive_avg_pool2d_backward)
-make_fallback(aten.as_strided_scatter)
 make_fallback(aten.convolution_backward)
 make_fallback(aten._cudnn_rnn)
 make_fallback(aten._cudnn_rnn_backward)
@@ -1893,6 +1892,14 @@ def output_indexer(index):
     return self
 
 
+@register_lowering(aten.as_strided_scatter, type_promotion_kind=None)
+def as_strided_scatter(self, src, size, stride, storage_offset=None):
+    output = clone(self)
+    output_view = as_strided(output, size, stride, storage_offset)
+    copy_(output_view, src)
+    return output
+
+
 @register_lowering(aten.scatter, type_promotion_kind=None)
 def scatter(x, dim: int, index, src, **kwargs):
     return scatter_(clone(x), dim, index, src, **kwargs)

From 68bbf7cce50e510d913a8b723a982160d4ac692d Mon Sep 17 00:00:00 2001
From: YJ Shi <yuanjing@octoml.ai>
Date: Mon, 7 Nov 2022 01:33:57 +0000
Subject: [PATCH 0609/1922] [Dynamo] fix torchdynamo's TVM meta schedule
 backend (#88249)

Note that the previous `optimize_torch` functionality of pytorch is not working with default pytorch release with  CXX11 ABI off as TVM by default needs CXX11 ABI for builds. Source: [1](https://discuss.tvm.apache.org/t/can-someone-please-give-me-the-steps-to-use-pt-tvmdsoop/12525), [2](https://discuss.pytorch.org/t/undefined-symbol-when-import-lltm-cpp-extension/32627). It would be easier for user to tune with meta schedule instead of finding a CXX11-compatible pytorch, turning on the `pt-tvmdsoop` flag in TVM and rebuilding it. This could be revisited once the `pt-tvmdsoop` flag is updated and tuned on by default in TVM.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88249
Approved by: https://github.com/jansel
---
 torch/_dynamo/optimizations/backends.py | 80 ++++++++++++-------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 56555f123e286..55974c69d76e3 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -681,7 +681,7 @@ def tvm_compile_inner(
         elif tuning_option == "meta_schedule":
             from os import path as osp
 
-            from tvm.contrib.torch import optimize_torch
+            from tvm import meta_schedule as ms
 
             with tempfile.TemporaryDirectory() as work_dir:
                 if log_file is not None:
@@ -689,14 +689,22 @@ def tvm_compile_inner(
                         log_file
                     ), "TVM's meta_schedule requires a directory for storing log files."
                     work_dir = log_file
-
-                lib = optimize_torch(
-                    jit_mod,
-                    example_inputs,
-                    max_trials_global=20000,
+                # TODO(shingjan): This could be replaced by tvm.contrib.torch.optimize_torch
+                # once USE_PT_TVMDSOOP is updated and turned on by default in TVM.
+                database = ms.relay_integration.tune_relay(
+                    mod=mod,
+                    target=target,
                     work_dir=work_dir,
+                    max_trials_global=20000,
+                    num_trials_per_iter=64,
+                    params=params,
+                    strategy="evolutionary",
+                )
+                lib = ms.relay_integration.compile_relay(
+                    database=database,
+                    mod=mod,
                     target=target,
-                    max_trials_per_task=64,
+                    params=params,
                 )
 
         elif tuning_option is None:
@@ -708,41 +716,33 @@ def tvm_compile_inner(
                 "This tuning option is invalid/not implemented for torchdynamo's TVM-related backend. "
                 "There are three available options including None, auto_scheduler and meta_schedule."
             )
-        if tune_option != "meta_schedule":
-            m = graph_executor.GraphModule(lib["default"](dev))
-
-            def to_torch_tensor(nd_tensor):
-                """A helper function to transfer a NDArray to torch.tensor."""
-                if nd_tensor.dtype == "bool":
-                    # DLPack does not support boolean so it can't be handled by
-                    # torch.utils.dlpack.from_pack. Workaround by going through
-                    # numpy, although this brings additional data copy overhead.
-                    return torch.from_numpy(nd_tensor.numpy())
-                return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack())
-
-            def exec_tvm(*args):
-                args = [a.contiguous() for a in args]
-                for idx, arg in enumerate(args, 0):
-                    if arg.dim() != 0:
-                        if arg.requires_grad:
-                            arg = arg.detach()
-                        m.set_input(
-                            f"inp_{idx}",
-                            tvm.nd.array(arg.numpy(), dev),
-                        )
-                m.run()
-                return [
-                    to_torch_tensor(m.get_output(i)) for i in range(m.get_num_outputs())
-                ]
-
-        else:
-
-            def exec_tvm(*args):
-                args = [a.contiguous() for a in args]
-                return lib(*args)
+        m = graph_executor.GraphModule(lib["default"](dev))
+
+        def to_torch_tensor(nd_tensor):
+            """A helper function to transfer a NDArray to torch.tensor."""
+            if nd_tensor.dtype == "bool":
+                # DLPack does not support boolean so it can't be handled by
+                # torch.utils.dlpack.from_pack. Workaround by going through
+                # numpy, although this brings additional data copy overhead.
+                return torch.from_numpy(nd_tensor.numpy())
+            return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack())
+
+        def exec_tvm(*args):
+            args = [a.contiguous() for a in args]
+            for idx, arg in enumerate(args, 0):
+                if arg.dim() != 0:
+                    if arg.requires_grad:
+                        arg = arg.detach()
+                    m.set_input(
+                        f"inp_{idx}",
+                        tvm.nd.array(arg.numpy(), dev),
+                    )
+            m.run()
+            return [
+                to_torch_tensor(m.get_output(i)) for i in range(m.get_num_outputs())
+            ]
 
         return exec_tvm
-
     except Exception:
         log.exception("tvm error")
         return jit_mod  # explicit fall back to eager

From a70a3bf09f3c6e6d91af7a414ec1f98c68d21361 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Mon, 7 Nov 2022 05:48:22 +0000
Subject: [PATCH 0610/1922] use faster cache flush in triton benchmarking
 (#88557)

Speeds up autotuning a little bit more (about 90s -> 75s for coat_lite_mini)

@bertmaher, I've put in workaround so that internal doesn't break, but it can be removed once triton is updated internally.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88557
Approved by: https://github.com/anijain2305
---
 .github/ci_commit_pins/triton.txt      | 2 +-
 torch/_inductor/triton_ops/autotune.py | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/.github/ci_commit_pins/triton.txt b/.github/ci_commit_pins/triton.txt
index d46172e16b66b..7c5e80098f7b7 100644
--- a/.github/ci_commit_pins/triton.txt
+++ b/.github/ci_commit_pins/triton.txt
@@ -1 +1 @@
-f16138d447bccc54641a9c48ffedbd449a1a40a7
+0d7e7532279e45672555e344646f5c19c3972331
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index a0bc76569c338..f4d1d06b8f183 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -132,9 +132,14 @@ def kernel_call():
                 stream=stream,
             )
 
+        import inspect
+
         from triton.testing import do_bench
 
-        return do_bench(kernel_call, rep=40)
+        if "fast_flush" in inspect.signature(do_bench).parameters.keys():
+            return do_bench(kernel_call, rep=40, fast_flush=True)
+        else:
+            return do_bench(kernel_call, rep=40)
 
     @dynamo_utils.dynamo_timed
     def autotune_to_one_config(self, *args, **kwargs):

From baefd40825d9edc621f6879f2aa94bea4516ad94 Mon Sep 17 00:00:00 2001
From: "Wang, Eikan" <eikan.wang@intel.com>
Date: Fri, 4 Nov 2022 05:28:15 +0000
Subject: [PATCH 0611/1922] Explicit vectorization support for TorchInductor
 (#87068)

In this PR, we replace OMP SIMD with `aten::vec` to optimize TorchInductor vectorization performance. Take `res=torch.exp(torch.add(x, y))` as the example. The generated code is as follows if `config.cpp.simdlen` is 8.

```C++
extern "C" void kernel(const float* __restrict__ in_ptr0,
                       const float* __restrict__ in_ptr1,
                       float* __restrict__ out_ptr0,
                       const long ks0,
                       const long ks1)
{
    #pragma omp parallel num_threads(48)
    {
        #pragma omp for
        for(long i0=0; i0<((ks0*ks1) / 8); ++i0)
        {
            auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + 8*i0);
            auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + 8*i0);
            auto tmp2 = tmp0 + tmp1;
            auto tmp3 = tmp2.exp();
            tmp3.store(out_ptr0 + 8*i0);
        }
        #pragma omp for simd simdlen(4)
        for(long i0=8*(((ks0*ks1) / 8)); i0<ks0*ks1; ++i0)
        {
            auto tmp0 = in_ptr0[i0];
            auto tmp1 = in_ptr1[i0];
            auto tmp2 = tmp0 + tmp1;
            auto tmp3 = std::exp(tmp2);
            out_ptr0[i0] = tmp3;
        }
    }
}

```

The major pipeline is as follows.
- Check whether the loop body could be vectorized by `aten::vec`. The checker consists of two parts. [One ](https://github.com/pytorch/pytorch/blob/bf66991fc4860724368c5289d3db81de591b4cb2/torch/_inductor/codegen/cpp.py#L702)is to check whether all the `ops` have been supported. The [other one](https://github.com/pytorch/pytorch/blob/355326faa35405565ddb6ff8a2a945c7fce83db8/torch/_inductor/codegen/cpp.py#L672) is to check whether the data access could be vectorized.
  - [`CppSimdVecKernelChecker`](https://github.com/pytorch/pytorch/blob/355326faa35405565ddb6ff8a2a945c7fce83db8/torch/_inductor/codegen/cpp.py#L655)
- Create the `aten::vec` kernel and original omp simd kernel. Regarding the original omp simd kernel, it serves for the tail loop when the loop is vectorized.
  - [`CppSimdVecKernel`](https://github.com/pytorch/pytorch/blob/355326faa35405565ddb6ff8a2a945c7fce83db8/torch/_inductor/codegen/cpp.py#L601)
  - [`CppSimdVecOverrides`](https://github.com/pytorch/pytorch/blob/355326faa35405565ddb6ff8a2a945c7fce83db8/torch/_inductor/codegen/cpp.py#L159): The ops that we have supported on the top of `aten::vec`
  - Create kernel
    - [`aten::vec` kernel](https://github.com/pytorch/pytorch/blob/355326faa35405565ddb6ff8a2a945c7fce83db8/torch/_inductor/codegen/cpp.py#L924)
    - [`Original CPP kernel - OMP SIMD`](https://github.com/pytorch/pytorch/blob/355326faa35405565ddb6ff8a2a945c7fce83db8/torch/_inductor/codegen/cpp.py#L929)
- Generate code
  - [`CppKernelProxy`](https://github.com/pytorch/pytorch/blob/355326faa35405565ddb6ff8a2a945c7fce83db8/torch/_inductor/codegen/cpp.py#L753) is used to combine the `aten::vec` kernel and original cpp kernel
    - [Vectorize the most inner loop](https://github.com/pytorch/pytorch/blob/355326faa35405565ddb6ff8a2a945c7fce83db8/torch/_inductor/codegen/cpp.py#L753)
    - [Generate code](https://github.com/pytorch/pytorch/blob/355326faa35405565ddb6ff8a2a945c7fce83db8/torch/_inductor/codegen/cpp.py#L821)

Next steps:
- [x] Support reduction
- [x] Vectorize the tail loop with `aten::vec`
- [ ] Support BF16
- [ ] Optimize the loop condition and loop index calculation by replacing `div` with `add`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87068
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 setup.py                             |   1 +
 test/inductor/test_torchinductor.py  |  72 +++-
 torch/_inductor/codecache.py         |  81 +++-
 torch/_inductor/codegen/cpp.py       | 583 +++++++++++++++++++++++++--
 torch/_inductor/codegen/cpp_prefix.h |   5 +-
 torch/_inductor/metrics.py           |   4 +
 6 files changed, 707 insertions(+), 39 deletions(-)

diff --git a/setup.py b/setup.py
index ed0f032716574..bc8badb9b2e46 100644
--- a/setup.py
+++ b/setup.py
@@ -1036,6 +1036,7 @@ def main():
         'lib/*.pdb',
         'lib/torch_shm_manager',
         'lib/*.h',
+        'include/*.h',
         'include/ATen/*.h',
         'include/ATen/cpu/*.h',
         'include/ATen/cpu/vec/vec256/*.h',
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 9054279c91a5b..7a5d291691d42 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -37,7 +37,7 @@
     import torch._inductor.config
     from functorch.compile import config as functorch_config
     from torch._decomp import get_decompositions
-    from torch._inductor import config
+    from torch._inductor import codecache, config, metrics
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
     from torch._inductor.sizevars import SizeVarAllocator
@@ -53,7 +53,6 @@
         sys.exit(0)
     raise unittest.SkipTest("requires sympy/functorch/filelock")
 
-
 HAS_CPU = False
 try:
     from subprocess import CalledProcessError
@@ -4416,6 +4415,75 @@ def test_complex_memory_overlap(self):
             self.assertFalse(complex_memory_overlap(gathered))
             self.assertFalse(complex_memory_overlap(gathered.t()))
 
+        # Currently, we enabled AVX2 and AVX512 for vectorization. If the platform is not
+        # supported, the vectorization will not work and skip this test case. For ARM or
+        # other platforms support, we just need to add the ISA info to the supported_vector_isa
+        # and include proper aten vectorization head file.
+        @unittest.skipIf(
+            not codecache.get_cpu_proc_info(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_vec_kernel_cpu_only(self):
+            def fn(x1, x2):
+                # Current, there are some limitations as follows.
+                #   rsqrt:
+                #     assert [both a fallback and a decomp for same kernel: aten.rsqrt.default]
+                #   round:
+                #     couldn't find symbolic meta function/decomposition
+                #   fmod/logical_and/logic_or:
+                #     vec kernel has not support to_type
+                x = torch.abs(x1)
+                x = torch.sin(x)
+                x = torch.neg(x)
+                x = torch.square(x)
+                x = torch.sigmoid(x)
+                x = torch.relu(x)
+                x = torch.cos(x)
+                x = torch.exp(x)
+                x = torch.sqrt(x)
+                x = torch.add(x, x1)
+                x = torch.sub(x, x2)
+                x = torch.mul(x, x1)
+                x = torch.div(x, x1)
+                x = torch.pow(x, 10)
+                x = torch.log(x)
+                x = torch.floor(x)
+                x = torch.ceil(x)
+                x = torch.trunc(x)
+                x = torch.lgamma(x)
+                x = torch.fmod(x, x2)
+                res = x + x2
+                return (res,)
+
+            x1 = torch.randn((10, 20))
+            x2 = torch.randn((10, 20))
+
+            with patch.object(config.cpp, "simdlen", 8):
+                torch._dynamo.reset()
+                metrics.reset()
+                traced = make_fx(fn)(x1, x2)
+                compiled = compile_fx_inner(traced, [x1, x2])
+                assert same(fn(x1, x2)[0], compiled([x1, x2])[0], equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count == 1
+
+                torch._dynamo.reset()
+                metrics.reset()
+                x1 = x1.permute(1, 0)
+                x2 = torch.randn((20, 10))
+                traced = make_fx(fn)(x1, x2)
+                compiled = compile_fx_inner(traced, [x1, x2])
+                assert same(fn(x1, x2)[0], compiled([x1, x2])[0], equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count == 1
+
+                torch._dynamo.reset()
+                metrics.reset()
+                x1 = torch.randn((10, 7))
+                x2 = torch.randn((10, 7))
+                traced = make_fx(fn)(x1, x2)
+                compiled = compile_fx_inner(traced, ([x1, x2]))
+                assert same(fn(x1, x2)[0], compiled([x1, x2])[0], equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count == 1
+
 
 if HAS_CUDA:
     import triton
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 7f1e8bc61814b..842ea2300e5b4 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -1,4 +1,5 @@
 import base64
+import enum
 import functools
 import getpass
 import hashlib
@@ -146,11 +147,81 @@ def is_gcc():
     return re.search(r"(gcc|g\+\+)", cpp_compiler())
 
 
+class _SupportedVecIsa(enum.Enum):
+    AVX512 = 1
+    AVX2 = 2
+    INVALID = -1
+
+    def __bool__(self):
+        return self != _SupportedVecIsa.INVALID
+
+    @staticmethod
+    def isa_str(supported_isa: enum.Enum):
+        if supported_isa == _SupportedVecIsa.AVX512:
+            return "avx512"
+        elif supported_isa == _SupportedVecIsa.AVX2:
+            return "avx2"
+        else:
+            return ""
+
+    @staticmethod
+    def vec_macro(supported_isa: enum.Enum):
+        if supported_isa == _SupportedVecIsa.AVX512:
+            return "CPU_CAPABILITY_AVX512"
+        elif supported_isa == _SupportedVecIsa.AVX2:
+            return "CPU_CAPABILITY_AVX2"
+        else:
+            return ""
+
+
+# Cache the cpuinfo to avoid I/O overhead. Meanwhile, the cpuinfo content
+# might have too much redundant content that is useless for ISA check. Hence,
+# we only cache some key isa information.
+@functools.lru_cache(1)
+def get_cpu_proc_info():
+    if sys.platform != "linux":
+        return []
+
+    isa_info = []
+    with open("/proc/cpuinfo") as _cpu_info:
+        _cpu_info_content = _cpu_info.read()
+        if _SupportedVecIsa.isa_str(_SupportedVecIsa.AVX512) in _cpu_info_content:
+            isa_info.append(_SupportedVecIsa.AVX512)
+
+        if _SupportedVecIsa.isa_str(_SupportedVecIsa.AVX2) in _cpu_info_content:
+            isa_info.append(_SupportedVecIsa.AVX2)
+
+        return isa_info
+
+
+def supported_vector_isa():
+    # TODO: Add ARM Vec here.
+    # Dict(k: isa, v: number of float element)
+    vec_isa_info = {
+        _SupportedVecIsa.AVX512: 16,
+        _SupportedVecIsa.AVX2: 8,
+    }
+
+    if config.cpp.simdlen is None or config.cpp.simdlen <= 1:
+        return _SupportedVecIsa.INVALID
+
+    cpu_info_content = get_cpu_proc_info()
+    for isa in vec_isa_info.keys():
+        if isa in cpu_info_content and config.cpp.simdlen == vec_isa_info[isa]:
+            return isa
+
+    return _SupportedVecIsa.INVALID
+
+
 def cpp_compile_command(input, output, include_pytorch=False):
-    if include_pytorch:
+    valid_isa = supported_vector_isa()
+    if include_pytorch or valid_isa:
         ipaths = cpp_extension.include_paths() + [sysconfig.get_path("include")]
         lpaths = cpp_extension.library_paths() + [sysconfig.get_config_var("LIBDIR")]
         libs = ["c10", "torch", "torch_cpu", "torch_python", "gomp"]
+        macros = _SupportedVecIsa.vec_macro(valid_isa)
+        if macros:
+            macros = f"-D{macros}"
     else:
         # Note - this is effectively a header only inclusion. Usage of some header files may result in
         # symbol not found, if those header files require a library.
@@ -159,17 +230,19 @@ def cpp_compile_command(input, output, include_pytorch=False):
         ipaths = cpp_extension.include_paths() + [sysconfig.get_path("include")]
         lpaths = []
         libs = ["gomp"]
+        macros = ""
     ipaths = " ".join(["-I" + p for p in ipaths])
     lpaths = " ".join(["-L" + p for p in lpaths])
     libs = " ".join(["-l" + p for p in libs])
+
     return re.sub(
         r"[ \n]+",
         " ",
         f"""
-            {cpp_compiler()} -shared -fPIC -Wall -std=c++14 -Wno-unused-variable
-            {ipaths} {lpaths} {libs}
+            {cpp_compiler()} {input} -shared -fPIC -Wall -std=c++14 -Wno-unused-variable
+            {ipaths} {lpaths} {libs} {macros}
             -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp
-            -o{output} {input}
+            -o{output}
         """,
     ).strip()
 
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 39dd6519d926c..1248d263239f2 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -1,6 +1,7 @@
 import contextlib
 import dataclasses
 import functools
+from copy import deepcopy
 from pathlib import Path
 from typing import Dict, List
 
@@ -9,8 +10,9 @@
 import torch
 from torch._prims_common import is_float_dtype
 
-from .. import codecache, config
-from ..utils import sympy_product, sympy_symbol
+from .. import codecache, config, ir, metrics
+from ..codegen.wrapper import WrapperCodeGen
+from ..utils import sympy_product, sympy_subs, sympy_symbol
 from ..virtualized import ops, V
 from .common import (
     BracesBuffer,
@@ -120,6 +122,13 @@ def float16_reduction_prefix(rtype):
     return prefix
 
 
+def parallel_num_threads():
+    threads = config.cpp.threads
+    if threads < 1:
+        threads = torch.get_num_threads()
+    return threads
+
+
 @functools.lru_cache()
 def cpp_prefix():
     path = Path(__file__).parent / "cpp_prefix.h"
@@ -151,6 +160,135 @@ def _print_IndexingDiv(self, expr):
 cexpr = CppPrinter().doprint
 
 
+class CppVecOverrides(OpOverrides):
+    """Map element-wise ops to aten vectorization C++"""
+
+    @staticmethod
+    def add(a, b):
+        return f"{a} + {b}"
+
+    @staticmethod
+    def sub(a, b):
+        return f"{a} - {b}"
+
+    @staticmethod
+    def mul(a, b):
+        return f"{a} * {b}"
+
+    @staticmethod
+    def div(a, b):
+        return f"{a} / {b}"
+
+    @staticmethod
+    def abs(x):
+        return f"{x}.abs()"
+
+    @staticmethod
+    def sin(x):
+        return f"{x}.sin()"
+
+    @staticmethod
+    def cos(x):
+        return f"{x}.cos()"
+
+    @staticmethod
+    def exp(x):
+        return f"{x}.exp()"
+
+    @staticmethod
+    def sqrt(x):
+        return f"{x}.sqrt()"
+
+    @staticmethod
+    def rsqrt(x):
+        return f"{x}.rsqrt()"
+
+    @staticmethod
+    def pow(a, b):
+        return f"{a}.pow({b})"
+
+    @staticmethod
+    def log(x):
+        return f"{x}.log()"
+
+    @staticmethod
+    def round(x):
+        return f"{x}.round()"
+
+    @staticmethod
+    def floor(x):
+        return f"{x}.floor()"
+
+    @staticmethod
+    def ceil(x):
+        return f"{x}.ceil()"
+
+    @staticmethod
+    def trunc(x):
+        return f"{x}.trunc()"
+
+    @staticmethod
+    def fmod(a, b):
+        return f"{a}.fmod({b})"
+
+    @staticmethod
+    def lgamma(x):
+        return f"{x}.lgamma()"
+
+    @staticmethod
+    def logical_and(a, b):
+        return f"{a} && {b}"
+
+    @staticmethod
+    def logical_or(a, b):
+        return f"{a} || {b}"
+
+    @staticmethod
+    def tanh(a):
+        return f"{a}.tanh()"
+
+    @staticmethod
+    def reciprocal(a):
+        return f"{a}.reciprocal()"
+
+    @staticmethod
+    def constant(val, dtype):
+        if val == float("inf"):
+            quote = f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
+        elif val == float("-inf"):
+            quote = f"-std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
+        elif val is True or val is False:
+            quote = f"static_cast<{DTYPE_TO_CPP[dtype]}>({str(val).lower()})"
+        else:
+            quote = f"static_cast<{DTYPE_TO_CPP[dtype]}>({repr(val)})"
+        return f"at::vec::Vectorized<{DTYPE_TO_CPP[dtype]}>({quote})"
+
+    @staticmethod
+    def relu(x):
+        return f"at::vec::clamp_min({x}, decltype({x})(0))"
+
+    @staticmethod
+    def sigmoid(x):
+        return f"decltype({x})(1)/(decltype({x})(1) + {x}.neg().exp())"
+
+    @staticmethod
+    def neg(x):
+        return f"{x}.neg()"
+
+    @staticmethod
+    def floordiv(a, b):
+        # a and b are integer type
+        _t = f"decltype({a})"
+        quot = f"{a} / {b}"
+        rem = f"{a} % {b}"
+        return f"(({a} < {_t}(0)) != ({b} < {_t}(0)) ? ({rem} != {_t}(0) ? {quot} - {_t}(1) : {quot}) : {quot})"
+
+    @staticmethod
+    def truncdiv(a, b):
+        # a and b are integer type
+        return f"{a} / {b}"
+
+
 class CppOverrides(OpOverrides):
     """Map element-wise ops to C++"""
 
@@ -413,9 +551,7 @@ def size_hint(self):
         return V.graph.sizevars.size_hint(sympy_product(self.call_ranges))
 
     def codegen_loops(self, code, worksharing):
-        threads = config.cpp.threads
-        if threads < 1:
-            threads = torch.get_num_threads()
+        threads = parallel_num_threads()
 
         loops = [LoopLevel(var, size) for var, size in zip(self.itervars, self.ranges)]
         loops, reductions = LoopNest(loops[: self.reduction_depth]), LoopNest(
@@ -427,7 +563,7 @@ def codegen_loops(self, code, worksharing):
             # TODO(jansel): detect stride-1 dimension and vectorize that
             if reductions:
                 reductions.loops[-1].simd = True
-            else:
+            elif loops:
                 loops.loops[-1].simd = True
 
         par_depth = 0
@@ -509,6 +645,265 @@ def write_to_suffix(self):
         (self.loads, self.compute, self.stores, self.cse) = prior
 
 
+class CppVecKernel(CppKernel):
+    overrides = CppVecOverrides
+
+    def __init__(self, args, num_threads):
+        super(CppVecKernel, self).__init__(args, num_threads)
+        self.simd_len = config.cpp.simdlen
+        metrics.generated_cpp_vec_kernel_count += 1
+
+    def is_single_step_var(self, var: sympy.Symbol, index: sympy.Expr):
+        replacement = {var: var + 1}
+        new_index = sympy_subs(index, replacement)
+        delta = sympy.simplify(new_index - index)
+        return delta == 1
+
+    def is_var_irrevelant(self, var: sympy.Symbol, index: sympy.Expr):
+        expanded_index = sympy.expand(index)
+        return not expanded_index.has(var)
+
+    def transform_index(self, index: sympy.Expr):
+        expanded_index = sympy.expand(index)
+        assert self.simd_len
+        assert self.simd_len > 0
+        most_inner_var = self.itervars[-1]
+        replacement = {most_inner_var: most_inner_var * self.simd_len}
+        new_index = sympy_subs(expanded_index, replacement)
+        return new_index
+
+    def load(self, name: str, index: sympy.Expr):
+        var = self.args.input(name)
+        index = self.rename_indexing(index)
+
+        expanded_index = sympy.expand(index)
+        new_index = self.transform_index(index)
+
+        if expanded_index == new_index:
+            line = f"at::vec::Vectorized<float>({var}[{cexpr(index)}])"
+        else:
+            line = f"at::vec::Vectorized<float>::loadu({var} + {cexpr(new_index)})"
+
+        return self.cse.generate(self.loads, line)
+
+    def store(self, name, index, value, mode=None):
+        assert "buf" in name
+        var = self.args.output(name)
+        index = self.rename_indexing(index)
+        assert mode is None
+
+        expanded_index = sympy.expand(index)
+        new_index = self.transform_index(index)
+        assert new_index != expanded_index
+        line = f"{value}.store({var} + {cexpr(new_index)});"
+        self.stores.writeline(name, line)
+
+
+class CppVecKernelChecker(CppVecKernel):
+    def __init__(self, args, num_threads):
+        super(CppVecKernelChecker, self).__init__(args, num_threads)
+
+        # Since this kernel is only for checker but does not genreate any
+        # code, so we need to decrease the kernel count.
+        metrics.generated_kernel_count -= 1
+        metrics.generated_cpp_vec_kernel_count -= 1
+
+        # Used to recorde the graph wrapper code as the wrapper_code status could be
+        # changed during graph run.
+        self._orig_wrapper_code = None
+
+        self.simd_vec = True
+        self.fast_vec_list = []
+        for k, v in CppVecOverrides.__dict__.items():
+            if isinstance(v, staticmethod):
+                self.fast_vec_list.append(k)
+        self.exit_stack = contextlib.ExitStack()
+
+    def is_legal_data_access(self, var: sympy.Symbol, index: sympy.Expr):
+        return self.is_var_irrevelant(var, index) or self.is_single_step_var(var, index)
+
+    def could_vec(self, name: str, index: sympy.Expr):
+        if V.graph.get_dtype(name) is not torch.float:
+            return False
+
+        assert self.itervars is not None
+        # Not a loop
+        if len(self.itervars) == 0:
+            return False
+
+        most_inner_var = self.itervars[-1]
+        return self.is_legal_data_access(most_inner_var, index)
+
+    def load(self, name: str, index: sympy.Expr):
+        index = self.rename_indexing(index)
+
+        self.simd_vec = self.simd_vec and self.could_vec(name, index)
+        return self.simd_vec
+
+    def store(self, name, index, value, mode=None):
+        assert "buf" in name
+        index = self.rename_indexing(index)
+
+        if mode:
+            self.simd_vec = False
+            return False
+
+        self.simd_vec = self.simd_vec and self.could_vec(name, index)
+        return self.simd_vec
+
+    def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
+        self.simd_vec = False
+        return self.simd_vec
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        assert self._orig_wrapper_code is not None
+        # Restore the wrapper_code
+        V.graph.wrapper_code = self._orig_wrapper_code
+        self.exit_stack.__exit__(exc_type, exc_val, exc_tb)
+
+    def __enter__(self):
+        # Recorde the graph wrapper code. The wrapper_code status could be
+        # changed during graph run. Regarding this checker, we also need to
+        # run the graph but we don't expect to change any status that would
+        # impact the code generation. Hence, we record the graph wapper code
+        # and replace it with a dummy warpper_code and then restore to the
+        # original one as long as the checker is finished.
+        self._orig_wrapper_code = V.graph.wrapper_code
+        V.graph.wrapper_code = WrapperCodeGen()
+
+        class VecCheckerProxy:
+            @staticmethod
+            def __getattr__(name):
+                def inner(*args, **kwargs):
+                    if not (name in self.fast_vec_list):
+                        self.simd_vec = False
+                    return self.simd_vec
+
+                return inner
+
+            @staticmethod
+            def load(name: str, index: sympy.Expr):
+                return self.load(name, index)
+
+            @staticmethod
+            def store(name, index, value, mode=None):
+                return self.store(name, index, value, mode=mode)
+
+            @staticmethod
+            def reduction(name, dtype, src_dtype, reduction_type, index, value):
+                return self.reduction(
+                    name, dtype, src_dtype, reduction_type, index, value
+                )
+
+            @staticmethod
+            def constant(val, dtype):
+                supported_dtype = (torch.float32, torch.int32)
+                is_supported_dtype = dtype in (supported_dtype)
+                if not is_supported_dtype:
+                    self.simd_vec = False
+                return is_supported_dtype
+
+            @staticmethod
+            def index_expr(expr, dtype):
+                self.simd_vec = False
+                return self.cse.newvar()
+
+            @staticmethod
+            def indirect_indexing(index_var):
+                return sympy.Symbol(str(index_var))
+
+            @staticmethod
+            def masked(mask, body, other):
+                return V.kernel.cse.newvar()
+
+        self.exit_stack.enter_context(V.set_ops_handler(VecCheckerProxy()))
+        self.exit_stack.enter_context(V.set_kernel_handler(self))
+        return self
+
+
+class CppKernelProxy(CppKernel):
+    def __init__(self, args=None, num_threads=None):
+        super(CppKernelProxy, self).__init__(args, num_threads)
+        self.simd_vec_kernel = None
+        self.simd_omp_kernel = None
+
+    def vectorize_most_inner_loop(self, loop_nest):
+        loop_nest.split_most_inner_loop(config.cpp.simdlen)
+        loop_with_tail = loop_nest.loops[-1]
+        assert isinstance(loop_with_tail, LoopLevelWithTail)
+
+        self.simd_vec_kernel.simd = False
+        self.simd_vec_kernel.fast_vec = True
+
+        loop_with_tail.tail_loop.simd_omp = True
+        # We chope the loop into two cubes by the config.cpp.simdlen - main loop and tail loop.
+        # Regarding the main loop, it is straightforward that it could be vectorized with
+        # config.cpp.simdlen. But for the tail loop, it still could be vectorized. For example,
+        # if the config.cpp.simdlen is 8(256bits), then the tail loop still could be vectorized
+        # as 4(128bits).
+        loop_with_tail.tail_loop.simd_len = int(config.cpp.simdlen / 2)
+        loop_with_tail.tail_loop.simd_vec = False
+
+        loop_with_tail.main_loop_body = self.simd_vec_kernel
+        loop_with_tail.tail_loop_body = self.simd_omp_kernel
+        return loop_nest
+
+    def codegen_loops(self, code, worksharing):
+        threads = parallel_num_threads()
+
+        if self.simd_vec_kernel is None:
+            assert self.simd_omp_kernel
+            return self.simd_omp_kernel.codegen_loops(code, worksharing)
+
+        assert self.simd_vec_kernel.itervars == self.simd_omp_kernel.itervars
+        assert self.simd_vec_kernel.ranges == self.simd_omp_kernel.ranges
+
+        itervars = self.simd_vec_kernel.itervars
+        rangs = self.simd_vec_kernel.ranges
+        loops = [LoopLevel(var, size) for var, size in zip(itervars, rangs)]
+
+        # TODO: Support reductions
+        loops_nest_non_reduc, _ = LoopNest(loops[: self.reduction_depth]), LoopNest(
+            loops[self.reduction_depth :]
+        )
+
+        assert config.cpp.simdlen
+        loops_nest_non_reduc.loops[-1].simd_omp = True
+
+        par_depth = 0
+        if loops_nest_non_reduc:
+            par_depth = self.simd_vec_kernel.decide_parallel_depth(
+                self.simd_vec_kernel.call_ranges[: self.reduction_depth], threads
+            )
+
+        with contextlib.ExitStack() as stack:
+            if par_depth:
+                worksharing.parallel(threads)
+                loops_nest_non_reduc.mark_parallel(par_depth)
+            elif threads > 1:
+                if worksharing.single():
+                    stack.enter_context(code.indent())
+
+            self.vectorize_most_inner_loop(loops_nest_non_reduc)
+
+            for loop in loops_nest_non_reduc.loops[0:-1]:
+                code.writelines(loop.lines())
+                stack.enter_context(code.indent())
+
+            loop_with_tail: LoopLevelWithTail = loops_nest_non_reduc.loops[-1]
+            for loop, kernel in (
+                (loop_with_tail.main_loop, loop_with_tail.main_loop_body),
+                (loop_with_tail.tail_loop, loop_with_tail.tail_loop_body),
+            ):
+
+                code.writelines(loop.lines())
+                with contextlib.ExitStack() as stack:
+                    stack.enter_context(code.indent())
+                    code.splice(kernel.loads)
+                    code.splice(kernel.compute)
+                    code.splice(kernel.stores)
+
+
 class CppScheduling:
     def __init__(self, scheduler):
         self.scheduler = scheduler
@@ -532,38 +927,113 @@ def can_fuse_horizontal(node1, node2):
     def can_fuse_vertical(cls, node1, node2):
         return cls.can_fuse_horizontal(node1, node2) and not node1.is_reduction()
 
-    def codegen_nodes(self, nodes):
-        """
-        Turn an set of pre-fused nodes into a C++ kernel.
-        """
-        kernel_group = self.kernel_group
-        scheduler = self.scheduler
+    def can_vec(self, nodes):
+        # TODO: Query cpu arch and vec length from aten
+        if not codecache.supported_vector_isa():
+            return False
+
         _, (group, reduction_group) = max(
             nodes, key=lambda x: int(x.is_reduction())
         ).group
-        in_suffix = False
-
-        with kernel_group.new_kernel() as kernel:
-            vars, reduction_vars = kernel.set_ranges(group, reduction_group)
 
+        with CppVecKernelChecker(
+            deepcopy(self.kernel_group.args), parallel_num_threads()
+        ) as kernel_checker:
+            vars, reduction_vars = kernel_checker.set_ranges(group, reduction_group)
             for node in nodes:
                 if node.group[1] in [
                     (group, reduction_group),
                     (group + reduction_group, ()),
                 ]:
-                    assert not in_suffix
                     node.run(vars, reduction_vars)
                 else:
-                    in_suffix = True
                     assert node.group[1] == (
                         group,
                         (),
                     ), f"unexpected group: {node.group[1]} != {group}, {reduction_group}"
-                    # we can fuse in some extra pointwise into the suffix
-                    with kernel.write_to_suffix():
-                        node.run(vars, ())
+                    node.run(vars, ())
+
+            return kernel_checker.simd_vec
+
+    def _codegen_nodes_impl(self, nodes, is_simd_vec=False):
+        """
+        Turn an set of pre-fused nodes into a C++ kernel.
+        """
+        kernel_group = self.kernel_group
+        _, (group, reduction_group) = max(
+            nodes, key=lambda x: int(x.is_reduction())
+        ).group
 
-        kernel_group.finalize_kernel(kernel, scheduler)
+        def create_kernel(_is_simd_vec):
+            in_suffix = False
+
+            with kernel_group.new_kernel(_is_simd_vec) as kernel:
+                vars, reduction_vars = kernel.set_ranges(group, reduction_group)
+
+                for node in nodes:
+                    if node.group[1] in [
+                        (group, reduction_group),
+                        (group + reduction_group, ()),
+                    ]:
+                        assert not in_suffix
+                        node.run(vars, reduction_vars)
+                    else:
+                        in_suffix = True
+                        assert node.group[1] == (
+                            group,
+                            (),
+                        ), f"unexpected group: {node.group[1]} != {group}, {reduction_group}"
+                        # we can fuse in some extra pointwise into the suffix
+                        with kernel.write_to_suffix():
+                            node.run(vars, ())
+                return kernel
+
+        org_inplace_buffers_flag = config.inplace_buffers
+        if is_simd_vec:
+            # Create vectorization kernel
+            cpp_vec_kernel = create_kernel(True)
+
+            # Since a kernel is divided into two parts - vectorization and non-vectorization.
+            # And the two parts share the same global contexts like V.graph.wrapper_code,
+            # V.kernel.args. But the vectorization kernel generation has updated these global
+            # contexts. Hence, the non-vectorization kernel should not do this again to avoid
+            # conext conflict. By now, we only control the config.inplace_buffers. In the future,
+            # we could maintain more contexts.
+            config.inplace_buffers = False
+
+            # Create non-vectorization kernel
+            cpp_kernel = create_kernel(False)
+
+            # Restore the inplace_buffers flag
+            config.inplace_buffers = org_inplace_buffers_flag
+            return (cpp_vec_kernel, cpp_kernel)
+        else:
+            return (None, create_kernel(False))
+
+    def codegen_nodes(self, nodes):
+        """
+        Turn an set of pre-fused nodes into a C++ kernel.
+        """
+        kernel_group = self.kernel_group
+
+        can_be_simd_vec = self.can_vec(nodes)
+        simd_vec_kernel, simd_omp_kernel = self._codegen_nodes_impl(
+            nodes, can_be_simd_vec
+        )
+
+        assert simd_omp_kernel
+        metrics.generated_kernel_count -= 1
+        # Maitain the metrics kernel count
+        if simd_vec_kernel:
+            metrics.generated_kernel_count -= 1
+
+        cpp_kernel_proxy = CppKernelProxy(
+            kernel_group.args, kernel_group.ws.num_threads
+        )
+        cpp_kernel_proxy.simd_vec_kernel = simd_vec_kernel
+        cpp_kernel_proxy.simd_omp_kernel = simd_omp_kernel
+
+        kernel_group.finalize_kernel(cpp_kernel_proxy, None)
 
     def flush(self):
         self.kernel_group.codegen_define_and_call(V.graph.wrapper_code)
@@ -580,8 +1050,11 @@ def __init__(self):
         self.stack.enter_context(self.ws)
         self.count = 0
 
-    def new_kernel(self):
-        return CppKernel(self.args, self.ws.num_threads)
+    def new_kernel(self, simd_vec=False):
+        if simd_vec:
+            return CppVecKernel(self.args, parallel_num_threads())
+        else:
+            return CppKernel(self.args, parallel_num_threads())
 
     def finalize_kernel(self, new_kernel, scheduler):
         self.count += 1
@@ -660,10 +1133,14 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 @dataclasses.dataclass
 class LoopLevel:
-    var: sympy.Expr
-    size: sympy.Expr
+    var: sympy.Expr = None
+    size: sympy.Expr = None
+    offset: sympy.Expr = sympy.Integer(0)
+    steps: sympy.Expr = sympy.Integer(1)
     parallel: int = 0
-    simd: bool = False
+    simd_omp: bool = False
+    simd_len: int = config.cpp.simdlen
+    simd_vec: bool = False
     collapsed: bool = False
     reduction_vars: Dict[str, str] = None
 
@@ -675,26 +1152,40 @@ def lines(self):
             )
         else:
             reduction = ""
-        simd = f"simd simdlen({config.cpp.simdlen})"
+        simd = f"simd simdlen({self.simd_len})" if self.simd_omp else ""
         if self.parallel:
             # TODO(jansel): look into chunk size and other schedules
             line1 = f"#pragma omp for{reduction} "
             if self.parallel > 1:
                 line1 += f" collapse({self.parallel})"
-            if self.simd:
+            if self.simd_omp:
                 line1 = line1.replace(" for ", f" for {simd}")
-        elif self.simd:
+        elif self.simd_vec:
+            line1 = ""
+        elif self.simd_omp:
             line1 = f"#pragma omp {simd}{reduction}"
         elif not self.reduction_vars and codecache.is_gcc():
             line1 = "#pragma GCC ivdep"
         else:
             line1 = ""
-        line2 = f"for({INDEX_TYPE} {self.var}=0; {self.var}<{cexpr(self.size)}; ++{self.var})"
+        line2 = f"for({INDEX_TYPE} {self.var}={cexpr(self.offset)}; {self.var}<{cexpr(self.size)}; {self.var}+={cexpr(self.steps)})"
         if self.collapsed or not line1:
             return [line2]
         return [line1, line2]
 
 
+class LoopLevelWithTail(LoopLevel):
+    def __init__(self, main_loop: LoopLevel, tail_loop: LoopLevel):
+        super().__init__()
+        self.main_loop = main_loop
+        self.tail_loop = tail_loop
+        self.main_loop_body = None
+        self.tail_loop_body = None
+
+    def lines(self):
+        raise AssertionError("Not Implemented")
+
+
 @dataclasses.dataclass
 class LoopNest:
     loops: List[LoopLevel]
@@ -711,7 +1202,35 @@ def mark_parallel(self, par_depth):
         loops[0].parallel = par_depth
         for i in range(1, par_depth):
             loops[i].collapsed = True
-        loops[0].simd = loops[par_depth - 1].simd
+
+    def split_most_inner_loop(self, factor):
+        sympy_factor = sympy.Integer(factor)
+
+        most_inner_loop = self.loops[-1]
+
+        # If the most inner loop needs to be collapsed, we need to
+        # exclude it since we need to split it into two loops. Meanwhile,
+        # we still mark it as parallized.
+        if most_inner_loop.collapsed:
+            assert self.loops[0].parallel == len(self.loops)
+            self.loops[0].parallel -= 1
+
+        main_loop_range = ir.IndexingDiv(most_inner_loop.size, sympy_factor)
+
+        main_loop = LoopLevel(most_inner_loop.var, main_loop_range)
+        main_loop.parallel = 1 if most_inner_loop.parallel > 0 else 0
+        main_loop.collapsed = False
+
+        offset = main_loop_range * sympy_factor
+        tail_loop = LoopLevel(most_inner_loop.var, most_inner_loop.size)
+        tail_loop.offset = offset
+        tail_loop.parallel = 1 if most_inner_loop.parallel > 0 else 0
+        tail_loop.collapsed = False
+
+        loop_with_tail = LoopLevelWithTail(main_loop, tail_loop)
+        loop_with_tail.parallel = 0
+        loop_with_tail.collapsed = False
+        self.loops[-1] = loop_with_tail
 
     def codegen(self, code, stack):
         for loop in self.loops:
diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h
index 346de52563883..8de087b45be2e 100644
--- a/torch/_inductor/codegen/cpp_prefix.h
+++ b/torch/_inductor/codegen/cpp_prefix.h
@@ -6,8 +6,11 @@
 #include <omp.h>
 
 #include <ATen/core/PhiloxRNGEngine.h>
-#include <c10/util/Half.h>
+#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
+#include <ATen/cpu/vec/vec.h>
+#endif
 #include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
 
 typedef at::Half half;
 typedef at::BFloat16 bfloat16;
diff --git a/torch/_inductor/metrics.py b/torch/_inductor/metrics.py
index b94badf93289e..582c5aca7f880 100644
--- a/torch/_inductor/metrics.py
+++ b/torch/_inductor/metrics.py
@@ -1,8 +1,12 @@
 # counter for tracking how many kernels have been generated
 generated_kernel_count = 0
+generated_cpp_vec_kernel_count = 0
 
 
 # reset all counters
 def reset():
     global generated_kernel_count
+    global generated_cpp_vec_kernel_count
+
     generated_kernel_count = 0
+    generated_cpp_vec_kernel_count = 0

From 1c7f614818f7b3a961c994500255d9938f82c7cb Mon Sep 17 00:00:00 2001
From: "Wang, Eikan" <eikan.wang@intel.com>
Date: Fri, 4 Nov 2022 05:28:17 +0000
Subject: [PATCH 0612/1922] Support reduction vectorization (#87356)

This PR is to optimize reduction implementation by `at::vec`. The main idea is as same as the aten implementation.
- Step1: Parallelize and vectorize the reduction implementation
- Step2: Invoke `at::vec::vec_reduce_all` to reduce the vector generated at step 1 to a single scalar
- Step3: Handle the tail elements

For the implementation, we create two kernels - `CppVecKernel` and `CppKernel`. The code block generation is as follows step by step.

- Gen the non-reduction loop - [Code](https://github.com/pytorch/pytorch/blob/gh/EikanWang/9/head/torch/_inductor/codegen/cpp.py#L1008-L1010)
- Gen the reduction initialization both for vectorization and non-vectorization kernel - [Code](https://github.com/pytorch/pytorch/blob/gh/EikanWang/9/head/torch/_inductor/codegen/cpp.py#L1015)
- Gen the reduction loop for the vectorization kernel - [Code](https://github.com/pytorch/pytorch/blob/gh/EikanWang/9/head/torch/_inductor/codegen/cpp.py#L1021-L1023)
- Gen the code to reduce the vector to scalar - [Code](https://github.com/pytorch/pytorch/blob/gh/EikanWang/9/head/torch/_inductor/codegen/cpp.py#L1033)
- Gen the reduction loop for the non-vectorization kernel - [Code](https://github.com/pytorch/pytorch/blob/gh/EikanWang/9/head/torch/_inductor/codegen/cpp.py#L1042)
- Do some post-reduction things like store reduction value - [Code](https://github.com/pytorch/pytorch/blob/gh/EikanWang/9/head/torch/_inductor/codegen/cpp.py#L1049)

```python
# Gen the non-reduction loop
for loop in CppVecKernel.NoneReductionLoop:
    # Gen the reduction initialization both for vectorization and non-vectorization kernel
    CppVecKernel.ReductionPrefix
    # Gen the reduction loop for the vectorization kernel
    for loop in CppVecKernel.ReductionLoop
        CppVecKernel.Loads
        CppVecKernel.Compute
        CppVecKernel.Stores
    # Gen the code to reduce the vector to scalar
    CppVecKernel.ReductionSuffix
    # Gen the reduction loop for the non-vectorization kernel
    for loop in CppKernel.ReductionLoop
        CppKernel.Loads
        CppKernel.Compute
        CppKernel.Stores
    # The reduction is almost finished. To do some post-reduction things like store reduction value.
    CppKernel.ReductionSuffix
```
The code snippet for maximum reduction exemplifies the idea. More detailed comments are inlined.

```C++
    {
        // Declare reduction for at::vec::Vectorized since it is not built-in data type.
        #pragma omp declare reduction(+:at::vec::Vectorized<float>:omp_out += omp_in) initializer(omp_priv={{0}})

        float tmp4 = 0;
        // tmp4_vec is used to vectorize the sum reduction for tmp4
        auto tmp4_vec = at::vec::Vectorized<float>(tmp4);
        float tmp6 = 0;
        // tmp6_vec is used to vectorize the sum reduction for tmp6
        auto tmp6_vec = at::vec::Vectorized<float>(tmp6);
        #pragma omp parallel num_threads(48)
        {
            // Parallelize the vectorized reduction
            #pragma omp for reduction(+:tmp4_vec) reduction(+:tmp6_vec)
            for(long i0=0; i0<192; i0+=1)
            {
                auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + 8*i0);
                auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr1 + 8*i0);
                auto tmp2 = tmp0 - tmp1;
                auto tmp3 = tmp2.abs();
                auto tmp5 = tmp2 * tmp2;
                tmp4_vec += tmp3;
                tmp6_vec += tmp5;
            }
            // Reduce the tmp4_vec as a scalar and store at tmp4
            tmp4 = at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>&y) {return x + y;}, tmp4_vec);
            // Reduce the tmp6_vec as a scalar and store at tmp6
            tmp6 = at::vec::vec_reduce_all<float>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>&y) {return x + y;}, tmp6_vec);
            // Handle the tail elements that could not be vectorized by aten.
            #pragma omp for simd simdlen(4) reduction(+:tmp4) reduction(+:tmp6)
            for(long i0=1536; i0<1536; i0+=1)
            {
                auto tmp0 = in_ptr0[i0];
                auto tmp1 = in_ptr1[i0];
                auto tmp2 = tmp0 - tmp1;
                auto tmp3 = std::abs(tmp2);
                auto tmp5 = tmp2 * tmp2;
                tmp4 += tmp3;
                tmp6 += tmp5;
            }
        }
        out_ptr0[0] = tmp4;
        out_ptr1[0] = tmp6;
    }
```

Performance(Measured by operatorbench and the base line of speedup ratio is aten operator performance):
Softmax (1,16,384,384,dim=3) | Speedup ratio (simdlen=None) |  Speedup ratio (simdlen=8) + this PR
-- | -- | --
24c | 0.37410838067524177 | 0.9036240100351164
4c | 0.24655829520907663 | 1.0255329993674518
1c | 0.21595768114988007 | 1.000587368005134

HW Configuration:
SKU: SKX Intel(R) Xeon(R) Platinum 8260 CPU @ 2.40GHz
MemTotal:       196708148 kB
MemFree:        89318532 kB
MemBandwidth:  112195.1MB/S

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87356
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 torch/_inductor/codegen/cpp.py       | 240 +++++++++++++++++++++++----
 torch/_inductor/codegen/cpp_prefix.h |   1 +
 2 files changed, 211 insertions(+), 30 deletions(-)

diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 1248d263239f2..17b87f3f86392 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -74,6 +74,17 @@ def reduction_combine(reduction_type, var, next_value):
     return f"{var} = std::{reduction_type}({var}, {next_value})"
 
 
+def reduction_combine_vec(reduction_type, var, next_value):
+    if reduction_type == "max":
+        return f"{var} = at::vec::maximum({var}, {next_value})"
+    elif reduction_type == "min":
+        return f"{var} = at::vec::minimum({var}, {next_value})"
+    elif reduction_type == "sum":
+        return f"{var} += {next_value}"
+    else:
+        raise NotImplementedError()
+
+
 index_value_name_counter = 1
 
 
@@ -288,6 +299,18 @@ def truncdiv(a, b):
         # a and b are integer type
         return f"{a} / {b}"
 
+    @staticmethod
+    def minimum(a, b):
+        return f"at::vec::minimum({a}, {b})"
+
+    @staticmethod
+    def maximum(a, b):
+        return f"at::vec::maximum({a}, {b})"
+
+    @staticmethod
+    def square(a):
+        return f"{a}.pow(2)"
+
 
 class CppOverrides(OpOverrides):
     """Map element-wise ops to C++"""
@@ -651,6 +674,7 @@ class CppVecKernel(CppKernel):
     def __init__(self, args, num_threads):
         super(CppVecKernel, self).__init__(args, num_threads)
         self.simd_len = config.cpp.simdlen
+        self.reduction_omp_dec: Dict[str, str] = {}
         metrics.generated_cpp_vec_kernel_count += 1
 
     def is_single_step_var(self, var: sympy.Symbol, index: sympy.Expr):
@@ -698,6 +722,62 @@ def store(self, name, index, value, mode=None):
         line = f"{value}.store({var} + {cexpr(new_index)});"
         self.stores.writeline(name, line)
 
+    def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
+        assert reduction_type in {"max", "min", "sum"}
+        assert dtype == torch.float
+        assert src_dtype == torch.float
+        reduce_map = {"max": "maximum", "min": "minimum"}
+
+        vec_ns = "at::vec"
+        vec = f"{vec_ns}::Vectorized<{DTYPE_TO_CPP[dtype]}>"
+
+        if reduction_type not in self.reduction_omp_dec:
+            vec_reduc_prefix = "#pragma omp declare reduction("
+            vec_reduc_prefix += f"{RTYPE_TO_CPP[reduction_type]}:{vec}:"
+            if reduction_type == "sum":
+                vec_reduc_prefix += "omp_out += omp_in"
+            else:
+                vec_reduc_prefix += (
+                    f"omp_out = {vec_ns}::{reduce_map[reduction_type]}(omp_out, omp_in)"
+                )
+            vec_reduc_prefix += ")"
+            vec_reduc_prefix += " initializer("
+            vec_reduc_prefix += "omp_priv={{"
+            vec_reduc_prefix += f"{reduction_init(reduction_type, dtype)}"
+            vec_reduc_prefix += "}})"
+            self.reduction_omp_dec[reduction_type] = RTYPE_TO_CPP[reduction_type]
+            self.reduction_prefix.writeline(vec_reduc_prefix)
+
+        tmpvar = self.cse.generate(
+            self.loads, f"reduction {name} {cexpr(index)}", write=False
+        )
+        tmpvar_vec = f"{tmpvar}_vec"
+
+        index = self.rename_indexing(index)
+        self.reduction_vars[tmpvar] = reduction_type
+        self.reduction_prefix.writeline(
+            f"{DTYPE_TO_CPP[dtype]} {tmpvar} = {reduction_init(reduction_type, dtype)};"
+        )
+        self.reduction_prefix.writeline(
+            f"auto {tmpvar_vec} = at::vec::Vectorized<{DTYPE_TO_CPP[dtype]}>({tmpvar});"
+        )
+        self.stores.writeline(
+            None, f"{reduction_combine_vec(reduction_type, tmpvar_vec, value)};"
+        )
+
+        reduce_all_body = "{"
+        if reduction_type == "sum":
+            reduce_all_body += "return x + y;"
+        else:
+            reduce_all_body += f"return {vec_ns}::{reduce_map[reduction_type]}(x, y);"
+        reduce_all_body += "}"
+        vec_reduce_all_func = f"{vec_ns}::vec_reduce_all<{DTYPE_TO_CPP[dtype]}>"
+        self.reduction_suffix.writeline(
+            name,
+            f"{tmpvar} = {vec_reduce_all_func}([]({vec}& x, {vec}&y) {reduce_all_body}, {tmpvar_vec});",
+        )
+        self.cse.store_cache[name] = tmpvar
+
 
 class CppVecKernelChecker(CppVecKernel):
     def __init__(self, args, num_threads):
@@ -752,7 +832,14 @@ def store(self, name, index, value, mode=None):
         return self.simd_vec
 
     def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
-        self.simd_vec = False
+        if (
+            dtype == torch.float
+            and src_dtype == torch.float
+            and reduction_type in ["max", "min", "sum"]
+        ):
+            pass
+        else:
+            self.simd_vec = False
         return self.simd_vec
 
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -824,16 +911,15 @@ def masked(mask, body, other):
 class CppKernelProxy(CppKernel):
     def __init__(self, args=None, num_threads=None):
         super(CppKernelProxy, self).__init__(args, num_threads)
-        self.simd_vec_kernel = None
-        self.simd_omp_kernel = None
+        self.simd_vec_kernel: CppVecKernel = None
+        self.simd_omp_kernel: CppKernel = None
 
     def vectorize_most_inner_loop(self, loop_nest):
         loop_nest.split_most_inner_loop(config.cpp.simdlen)
         loop_with_tail = loop_nest.loops[-1]
         assert isinstance(loop_with_tail, LoopLevelWithTail)
 
-        self.simd_vec_kernel.simd = False
-        self.simd_vec_kernel.fast_vec = True
+        loop_with_tail.main_loop.simd_vec = True
 
         loop_with_tail.tail_loop.simd_omp = True
         # We chope the loop into two cubes by the config.cpp.simdlen - main loop and tail loop.
@@ -857,51 +943,141 @@ def codegen_loops(self, code, worksharing):
 
         assert self.simd_vec_kernel.itervars == self.simd_omp_kernel.itervars
         assert self.simd_vec_kernel.ranges == self.simd_omp_kernel.ranges
+        assert (
+            self.simd_vec_kernel.reduction_vars == self.simd_omp_kernel.reduction_vars
+        )
 
         itervars = self.simd_vec_kernel.itervars
         rangs = self.simd_vec_kernel.ranges
         loops = [LoopLevel(var, size) for var, size in zip(itervars, rangs)]
-
-        # TODO: Support reductions
-        loops_nest_non_reduc, _ = LoopNest(loops[: self.reduction_depth]), LoopNest(
-            loops[self.reduction_depth :]
+        assert (
+            self.simd_vec_kernel.reduction_depth == self.simd_omp_kernel.reduction_depth
         )
+        reduction_depth = self.simd_vec_kernel.reduction_depth
+        loops_nest_non_reduce, loops_nest_reduce = LoopNest(
+            loops[:reduction_depth]
+        ), LoopNest(loops[reduction_depth:])
+        loops_nest_reduce.mark_reduction(self.simd_vec_kernel.reduction_vars)
 
-        assert config.cpp.simdlen
-        loops_nest_non_reduc.loops[-1].simd_omp = True
+        if config.cpp.simdlen:
+            # TODO(jansel): detect stride-1 dimension and vectorize that
+            if loops_nest_reduce:
+                loops_nest_reduce.loops[-1].simd = True
+            elif loops_nest_non_reduce:
+                loops_nest_non_reduce.loops[-1].simd = True
 
         par_depth = 0
-        if loops_nest_non_reduc:
+        reduction_par_depth = 0
+        if loops_nest_non_reduce:
             par_depth = self.simd_vec_kernel.decide_parallel_depth(
-                self.simd_vec_kernel.call_ranges[: self.reduction_depth], threads
+                self.simd_vec_kernel.call_ranges[:reduction_depth], threads
             )
+        else:
+            reduction_par_depth = self.simd_vec_kernel.decide_parallel_depth(
+                self.simd_vec_kernel.call_ranges[reduction_depth:], threads
+            )
+
+        # If the most inner loop of the reduction will be vectorized, the vectorization
+        # will add a vec variable for reduction. Take the code snippet as an example:
+        #     float tmp1 = 0;
+        #     for(long i1=0; i1<8; i1+=1) {
+        #        auto tmp0 = in_ptr0[i1];
+        #        tmp1 += tmp0;
+        #     }
+        # The vectorization will add tmp1_vec for reduction and then the loop will be transformed
+        # as follows.
+        #     float tmp1 = 0;
+        #     auto tmp1_vec = at::vec::Vectorized<float>(tmp1);
+        #     for(long i1=0; i1<1; i1+=1) {
+        #        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (8*i1));
+        #        tmp1_vec += tmp0;
+        #     }
+        #     tmp1 = at::vec::vec_reduce_all<float>([]
+        #       (at::vec::Vectorized<float>& x, at::vec::Vectorized<float>&y) {return x + y;},
+        #       tmp1_vec);
+        #     for(long i1=8; i1<8; i1+=1) {
+        #        auto tmp0 = in_ptr0[i1];
+        #        tmp1 += tmp0;
+        #     }
+        # It means that the vectorization introduce another reduction variable(tmp1_vec).
+        # If the most inner loop of the reduction is not a parallelized but its parent reduction
+        # loop is parallized, the new added reduction variable(tmp1_vec) could not be added
+        # to the parallelized loop reduction. So we skip this case and does not vectorize it.
+        if reduction_par_depth > 0 and reduction_par_depth != len(
+            loops_nest_reduce.loops
+        ):
+            return self.simd_omp_kernel.codegen_loops(code, worksharing)
 
         with contextlib.ExitStack() as stack:
             if par_depth:
                 worksharing.parallel(threads)
-                loops_nest_non_reduc.mark_parallel(par_depth)
+                loops_nest_non_reduce.mark_parallel(par_depth)
+            elif reduction_par_depth:
+                # need to close the worksharing scope to define reduction vars outside it
+                worksharing.close()
+                loops_nest_reduce.mark_parallel(reduction_par_depth)
             elif threads > 1:
                 if worksharing.single():
                     stack.enter_context(code.indent())
 
-            self.vectorize_most_inner_loop(loops_nest_non_reduc)
+            non_reduce_loops = loops_nest_non_reduce.loops
+            reduce_loops = loops_nest_reduce.loops
+            loop_with_tail: LoopLevelWithTail = None
 
-            for loop in loops_nest_non_reduc.loops[0:-1]:
+            if loops_nest_reduce:
+                self.vectorize_most_inner_loop(loops_nest_reduce)
+                loop_with_tail = loops_nest_reduce.loops[-1]
+                # The most inner loop will be vectorized
+                reduce_loops = reduce_loops[0:-1]
+            else:
+                self.vectorize_most_inner_loop(loops_nest_non_reduce)
+                loop_with_tail = loops_nest_non_reduce.loops[-1]
+                # The most inner loop will be vectorized
+                non_reduce_loops = non_reduce_loops[0:-1]
+
+            # The reductions loops are always the loop body of non-reduction loops
+            for loop in non_reduce_loops:
                 code.writelines(loop.lines())
                 stack.enter_context(code.indent())
 
-            loop_with_tail: LoopLevelWithTail = loops_nest_non_reduc.loops[-1]
-            for loop, kernel in (
-                (loop_with_tail.main_loop, loop_with_tail.main_loop_body),
-                (loop_with_tail.tail_loop, loop_with_tail.tail_loop_body),
-            ):
+            with contextlib.ExitStack() as stack_outer:
+                if self.simd_vec_kernel.reduction_prefix:
+                    stack_outer.enter_context(code.indent())
+                code.splice(self.simd_vec_kernel.reduction_prefix)
+
+                if reduction_par_depth:
+                    worksharing.parallel(threads)
 
-                code.writelines(loop.lines())
                 with contextlib.ExitStack() as stack:
-                    stack.enter_context(code.indent())
-                    code.splice(kernel.loads)
-                    code.splice(kernel.compute)
-                    code.splice(kernel.stores)
+                    for loop in reduce_loops:
+                        code.writelines(loop.lines())
+                        stack.enter_context(code.indent())
+
+                    def gen_vectorized_loop(loop, kernel, write_reduction_suffix=False):
+                        code.writelines(loop.lines())
+                        with contextlib.ExitStack() as stack:
+                            stack.enter_context(code.indent())
+                            code.splice(kernel.loads)
+                            code.splice(kernel.compute)
+                            code.splice(kernel.stores)
+                        if write_reduction_suffix:
+                            code.splice(kernel.reduction_suffix)
+
+                    # Regarding the vectorized reduction loop, we need to call reduce_all to to reduce
+                    # the vectorize as a single scalar. Hence, we set write_reduction_suffix to True to
+                    # gen the code.
+                    gen_vectorized_loop(
+                        loop_with_tail.main_loop, loop_with_tail.main_loop_body, True
+                    )
+
+                    gen_vectorized_loop(
+                        loop_with_tail.tail_loop, loop_with_tail.tail_loop_body, False
+                    )
+
+                if reduction_par_depth:
+                    worksharing.close()
+
+                code.splice(loop_with_tail.tail_loop_body.reduction_suffix)
 
 
 class CppScheduling:
@@ -1146,13 +1322,14 @@ class LoopLevel:
 
     def lines(self):
         if self.reduction_vars:
+            suffix = "_vec" if self.simd_vec else ""
             reduction = " " + " ".join(
-                f"reduction({RTYPE_TO_CPP[rtype]}:{var})"
+                f"reduction({RTYPE_TO_CPP[rtype]}:{var}{suffix})"
                 for var, rtype in self.reduction_vars.items()
             )
         else:
             reduction = ""
-        simd = f"simd simdlen({self.simd_len})" if self.simd_omp else ""
+        simd = f"simd simdlen({self.simd_len}) " if self.simd_omp else ""
         if self.parallel:
             # TODO(jansel): look into chunk size and other schedules
             line1 = f"#pragma omp for{reduction} "
@@ -1218,18 +1395,21 @@ def split_most_inner_loop(self, factor):
         main_loop_range = ir.IndexingDiv(most_inner_loop.size, sympy_factor)
 
         main_loop = LoopLevel(most_inner_loop.var, main_loop_range)
-        main_loop.parallel = 1 if most_inner_loop.parallel > 0 else 0
+        main_loop.parallel = most_inner_loop.parallel
         main_loop.collapsed = False
+        main_loop.reduction_vars = most_inner_loop.reduction_vars
 
         offset = main_loop_range * sympy_factor
         tail_loop = LoopLevel(most_inner_loop.var, most_inner_loop.size)
         tail_loop.offset = offset
-        tail_loop.parallel = 1 if most_inner_loop.parallel > 0 else 0
+        tail_loop.parallel = most_inner_loop.parallel
         tail_loop.collapsed = False
+        tail_loop.reduction_vars = most_inner_loop.reduction_vars
 
         loop_with_tail = LoopLevelWithTail(main_loop, tail_loop)
         loop_with_tail.parallel = 0
         loop_with_tail.collapsed = False
+
         self.loops[-1] = loop_with_tail
 
     def codegen(self, code, stack):
diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h
index 8de087b45be2e..1905aefcda5c0 100644
--- a/torch/_inductor/codegen/cpp_prefix.h
+++ b/torch/_inductor/codegen/cpp_prefix.h
@@ -7,6 +7,7 @@
 
 #include <ATen/core/PhiloxRNGEngine.h>
 #if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
+#include <ATen/cpu/vec/functional.h>
 #include <ATen/cpu/vec/vec.h>
 #endif
 #include <c10/util/BFloat16.h>

From b324327f523f5c69a38e423cfc76afcd4a556314 Mon Sep 17 00:00:00 2001
From: "Wang, Eikan" <eikan.wang@intel.com>
Date: Fri, 4 Nov 2022 05:28:18 +0000
Subject: [PATCH 0613/1922] Support sign for HF models like
 ElectraForQuestionAnswering (#88160)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88160
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 21 +++++++++++++++++++
 torch/_inductor/codegen/cpp.py      | 32 +++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 7a5d291691d42..b0b5f884b44de 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4415,6 +4415,26 @@ def test_complex_memory_overlap(self):
             self.assertFalse(complex_memory_overlap(gathered))
             self.assertFalse(complex_memory_overlap(gathered.t()))
 
+        @unittest.skipIf(
+            not codecache.get_cpu_proc_info(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_sign_cpu_only(self):
+            def fn(x):
+                return (torch.sign(x),)
+
+            x = torch.randn((2, 9))
+            x[0, 0] = torch.nan
+            x[1, -1] = torch.nan
+
+            with patch.object(config.cpp, "simdlen", 8):
+                torch._dynamo.reset()
+                metrics.reset()
+                traced = make_fx(fn)(x)
+                compiled = compile_fx_inner(traced, [x])
+                assert same(fn(x)[0], compiled([x])[0], equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count == 1
+
         # Currently, we enabled AVX2 and AVX512 for vectorization. If the platform is not
         # supported, the vectorization will not work and skip this test case. For ARM or
         # other platforms support, we just need to add the ISA info to the supported_vector_isa
@@ -4452,6 +4472,7 @@ def fn(x1, x2):
                 x = torch.trunc(x)
                 x = torch.lgamma(x)
                 x = torch.fmod(x, x2)
+                x = torch.sign(x)
                 res = x + x2
                 return (res,)
 
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 17b87f3f86392..bd7dc9dba88f5 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -311,6 +311,25 @@ def maximum(a, b):
     def square(a):
         return f"{a}.pow(2)"
 
+    @staticmethod
+    def sign(x):
+        code = BracesBuffer()
+        # auto tmp5 = tmp4 < 0 ? -1 : 1;
+        vec_zero = f"decltype({x})(0)"
+        vec_one = f"decltype({x})(1)"
+        blendv = f"decltype({x})::blendv({vec_zero}, {vec_one}, {vec_zero} < {x})"
+        left = V.kernel.cse.newvar()
+        code.writeline(f"auto {left} = {blendv};")
+
+        # auto tmp6 = tmp4 == 0 ? 0 : tmp5;
+        blendv = f"decltype({x})::blendv({vec_zero}, {vec_one}, {x} < {vec_zero})"
+        right = V.kernel.cse.newvar()
+        code.writeline(f"auto {right} = {blendv};")
+        result = V.kernel.cse.newvar()
+        code.writeline(f"auto {result} = {left} - {right};")
+        V.kernel.compute.splice(code)
+        return result
+
 
 class CppOverrides(OpOverrides):
     """Map element-wise ops to C++"""
@@ -473,6 +492,19 @@ def sigmoid(x):
         x = ops.exp(f"-{x}")
         return f"1 / (1 + {x})"
 
+    @staticmethod
+    def sign(x):
+        code = BracesBuffer()
+        # auto tmp5 = tmp4 < 0 ? -1 : 1;
+        left = V.kernel.cse.newvar()
+        right = V.kernel.cse.newvar()
+        result = V.kernel.cse.newvar()
+        code.writeline(f"auto {left} = {x} > 0 ? 1 : 0;")
+        code.writeline(f"auto {right} = {x} < 0 ? 1 : 0;")
+        code.writeline(f"auto {result} = {left} - {right};")
+        V.kernel.compute.splice(code)
+        return result
+
 
 class CppKernel(Kernel):
     overrides = CppOverrides

From 01f19edb7d67d24f4c9d008f67695ca546dea7a5 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 7 Nov 2022 10:22:44 +0000
Subject: [PATCH 0614/1922] [xla hash update] update the pinned xla hash
 (#88070)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88070
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/xla.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index ec894b7a5f4b0..ea8cc0b128f6d 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-1e9b8bdc75114ac6c16305c970be37a1cd2fdb1c
+5ff192eac058d54cc5f22c8ddf9cf6acfa51325d

From 8d6216475665ea4cd7f1e96f1935076153b90367 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@meta.com>
Date: Mon, 7 Nov 2022 14:36:39 +0000
Subject: [PATCH 0615/1922] [Static Runtime] Fix tensor_split sections overload
 (#88113)

Summary:
D40798763 broke this op. Unfortunately, it wasn't caught at land time due to the recent OSS Static Runtime test problems.

The problem is C++ overload resolution. After D40798763, the int that we were passing to `at::native::tensor_split` was getting implicitly converted to `IntArrayRef`. Fix this by converting the int to a `SymInt` and calling the correct overload.

Test Plan:
```
buck2 test caffe2/benchmarks/static_runtime:static_runtime_cpptest -- Tensor_Split --run-disabled
```

Differential Revision: D40862394

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88113
Approved by: https://github.com/hlu1
---
 benchmarks/static_runtime/test_static_runtime.cc | 4 ++--
 benchmarks/static_runtime/test_utils.cc          | 6 +++---
 torch/csrc/jit/runtime/static/native_ops.cpp     | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index 2cb50d48ff636..dc4ce01df72cf 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -2560,8 +2560,8 @@ TEST(StaticRuntime, Tensor_Split) {
   std::vector<IValue> args2{at::randn({8}), torch::tensor(3), 0};
 
   const auto tensor_split_str3 = R"JIT(
-    def forward(self, a: Tensor, indicies: List[int], dim: int):
-        return torch.tensor_split(a, indicies, dim)
+    def forward(self, a: Tensor, indices: List[int], dim: int):
+        return torch.tensor_split(a, indices, dim)
   )JIT";
   std::vector<IValue> args3{at::randn({8}), c10::List<int64_t>({1, 6}), 0};
 
diff --git a/benchmarks/static_runtime/test_utils.cc b/benchmarks/static_runtime/test_utils.cc
index 7e0733fbc8af4..b096d1a0ba9f5 100644
--- a/benchmarks/static_runtime/test_utils.cc
+++ b/benchmarks/static_runtime/test_utils.cc
@@ -172,7 +172,7 @@ void compareResults(
     EXPECT_TRUE(actual.isTuple());
     auto lhs = expect.toTupleRef().elements();
     auto rhs = actual.toTupleRef().elements();
-    EXPECT_TRUE(lhs.size() == rhs.size());
+    ASSERT_TRUE(lhs.size() == rhs.size());
     for (size_t i = 0; i < lhs.size(); i++) {
       compareResults(lhs[i], rhs[i]);
     }
@@ -180,7 +180,7 @@ void compareResults(
     EXPECT_TRUE(actual.isList());
     auto lhs = expect.toList();
     auto rhs = actual.toList();
-    EXPECT_TRUE(lhs.size() == rhs.size());
+    ASSERT_TRUE(lhs.size() == rhs.size());
     for (size_t i = 0; i < lhs.size(); i++) {
       compareResults(lhs[i], rhs[i]);
     }
@@ -191,7 +191,7 @@ void compareResults(
     EXPECT_TRUE(lhs.size() == rhs.size());
     for (auto& lh : lhs) {
       auto f = rhs.find(lh.key());
-      EXPECT_FALSE(f == rhs.end());
+      ASSERT_FALSE(f == rhs.end());
       compareResults(lh.value(), f->value());
     }
   } else {
diff --git a/torch/csrc/jit/runtime/static/native_ops.cpp b/torch/csrc/jit/runtime/static/native_ops.cpp
index 790d54b5c0023..1c8fb0791389c 100644
--- a/torch/csrc/jit/runtime/static/native_ops.cpp
+++ b/torch/csrc/jit/runtime/static/native_ops.cpp
@@ -870,9 +870,9 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR(aten::tensor_split, aten_tensor_split, [](Node*
           "aten::tensor_split.sections(Tensor(a -> *) self, int sections, int dim=0) -> Tensor(a)[]"))) {
     return [](ProcessedNode* pnode) {
       const auto& a = pnode->Input(0).toTensor();
-      const auto b = pnode->Input(1).toInt();
+      const auto b = pnode->Input(1).toSymInt();
       const auto c = pnode->Input(2).toInt();
-      pnode->Output(0) = at::native::tensor_split(a, b, c);
+      pnode->Output(0) = at::native::tensor_split_sections_symint(a, b, c);
     };
   }
 

From a27575b5cddc90b4c9186491e7040c7baa694217 Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Fri, 4 Nov 2022 14:20:19 -0700
Subject: [PATCH 0616/1922] reland "fix as_strided_scatter_backward (#87646)"
 (#88342)

This reverts commit 71fb763e5452881cb3be8fefa9419b785d0a61e2.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88342
Approved by: https://github.com/zou3519
---
 test/functorch/test_aotdispatch.py                    | 1 -
 test/functorch/test_ops.py                            | 4 ----
 torch/csrc/autograd/FunctionsManual.cpp               | 6 ++++--
 torch/testing/_internal/common_methods_invocations.py | 4 ----
 4 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 2b457cd05c6cd..909e57bc1e083 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -967,7 +967,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('linalg.eig'),
     xfail('scatter_reduce', 'prod'),
 
-    # non-deterministic
     skip('as_strided_scatter'),
 
     # Too annoying to generate random inputs
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index bda05d970a5e9..74085941c6c88 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -343,7 +343,6 @@ class TestOperators(TestCase):
     @skipOps('TestOperators', 'test_grad', vjp_fail.union({
         xfail('linalg.eig'),  # diagonal_scatter does not support complex
         xfail('chalf', '', device_type='cpu'),  # RuntimeError: "sum_cpu" not implemented for 'ComplexHalf'
-        skip('as_strided_scatter', ''),  # silent incorrectness; seems flaky
         xfail('sparse.sampled_addmm', ''),  # RuntimeError: Sparse CSR tensors do not have strides
     }))
     @opsToleranceOverride('TestOperators', 'test_grad', (
@@ -476,7 +475,6 @@ def maybe_clone_inputs():
 
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vjp', vjp_fail.union({
-        skip('as_strided_scatter', ''),  # silent incorrectness; also might be flaky
         xfail('sparse.sampled_addmm', ''),
     }))
     @opsToleranceOverride('TestOperators', 'test_vjp', (
@@ -1215,7 +1213,6 @@ def get_vjp(cotangents, *primals):
         xfail('nn.functional.multi_margin_loss', ''),  # NYI: forward AD with multi_margin_loss
         skip('linalg.householder_product', '', device_type='cuda'),  # flaky, I'm not sure why
         xfail('sparse.sampled_addmm', ''),  # Sparse tensors have no strides
-        skip('as_strided_scatter', ''),  # seems flaky
         xfail('segment_reduce', 'offsets'),  # NYI: forward-AD for segment_reduce
         xfail('index_reduce', ''),  # NYI: forward-AD for index_reduce
         xfail('segment_reduce', 'lengths'),  # NYI: forward-AD for segment_reduce
@@ -1614,7 +1611,6 @@ def fn(input, weight, bias):
         skip('linalg.multi_dot', '', device_type='cpu'),
         skip('sparse.sampled_addmm', ''),
         skip('native_layer_norm', '', device_type='cpu'),
-        xfail('as_strided_scatter', ''),
     })
     @opsToleranceOverride('TestOperators', 'test_vmap_autograd_grad', (
         tol1('linalg.householder_product',
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 3b0c73678a76d..5a3f96d47e30e 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -2907,8 +2907,10 @@ Tensor as_strided_scatter_backward(
   // take the perf hit and contiguify grad for now.
   auto grad_ = grad.contiguous();
   auto grad_slice = grad_.as_strided_symint(sizes, strides, storage_offset);
-  auto result = grad_.new_empty_strided_symint(
-      input_geometry.sym_sizes(), input_geometry.sym_strides());
+  auto result =
+      grad_.new_zeros_symint(input_geometry.sym_sizes())
+          .as_strided_symint(
+              input_geometry.sym_sizes(), input_geometry.sym_strides());
   auto result_slice = result.as_strided_symint(sizes, strides, storage_offset);
   result_slice.copy_(grad_slice);
   return result;
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 4cfdbd4114eca..2626f777d1803 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -10431,12 +10431,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            check_inplace_batched_forward_grad=False,
            sample_inputs_func=sample_inputs_as_strided_scatter,
            skips=(
-               DecorateInfo(unittest.skip('Works only for CPU complex64'), 'TestMathBits', 'test_conj_view'),
-               DecorateInfo(unittest.skip('Works for float64, fails for everything else'), 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.skip('Works for int64, fails for everything else'), 'TestCommon', 'test_noncontiguous_samples'),  # noqa: B950
                DecorateInfo(unittest.skip('Fails in most cases, passes on LAZY for some reason'), 'TestCommon', 'test_variant_consistency_eager'),  # noqa: B950
-               DecorateInfo(unittest.skip('Only fails for LAZY, passes on everything else'), 'TestCompositeCompliance', 'test_backward'),  # noqa: B950
-               DecorateInfo(unittest.skip('Passes on complex64 and float32 only'), 'TestJit', 'test_variant_consistency_jit'),
                DecorateInfo(unittest.skip('Fails on cuda + rocm'), 'TestCommon', 'test_complex_half_reference_testing'),
                DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_grad'),
                DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_gradgrad'),

From cbbdad255125fa4d3abf18ba993398d88da8eaae Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Mon, 7 Nov 2022 17:02:08 +0000
Subject: [PATCH 0617/1922] allow XLA folks update the pin (#88527)

this is one of the files XLA team needs to update ocassionally.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88527
Approved by: https://github.com/wconstab
---
 .github/merge_rules.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index 6e04f9b4f041d..1837cce32b2f6 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -330,6 +330,7 @@
   - torchgen/dest/lazy_ts_lowering.py
   - torchgen/gen_lazy_tensor.py
   - aten/src/ATen/native/ts_native_functions.yaml
+  - .github/ci_commit_pins/xla.txt
   approved_by:
   - alanwaketan
   - JackCaoG

From 521594ff470811859a075cc09167e07eb425736f Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 7 Nov 2022 17:19:02 +0000
Subject: [PATCH 0618/1922] Use sudo when reset NVIDIA devices (#88531)

Per title, I should have known, i.e. https://ossci-raw-job-status.s3.amazonaws.com/log/9307292415

```
2022-11-04T23:52:18.2921665Z + echo 1
2022-11-04T23:52:18.2921862Z Reseting 0000:00:1e.0 (enabled state: 0)
2022-11-04T23:52:18.2922186Z .github/scripts/install_nvidia_utils_linux.sh: line 77: /sys/bus/pci/devices/0000:00:1e.0/reset: Permission denied
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88531
Approved by: https://github.com/ZainRizvi
---
 .github/scripts/install_nvidia_utils_linux.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index b1fdd468a7488..cbf3df9c2d4e0 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -74,7 +74,8 @@ install_nvidia_driver_amzn2() {
                     DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)
 
                     echo "Reseting $PCI_ID (enabled state: $DEVICE_ENABLED)"
-                    echo "1" > /sys/bus/pci/devices/$PCI_ID/reset
+                    # This requires sudo permission of course
+                    sudo echo "1" > /sys/bus/pci/devices/$PCI_ID/reset
                     sleep 1
                 done
             fi

From a59465a2b3464e809b9465f2288107ef155c18db Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 7 Nov 2022 17:26:28 +0000
Subject: [PATCH 0619/1922] Upgrade Linux NVIDIA driver to the latest prod
 version (#88517)

The driver (515.76) is downloaded from https://www.nvidia.com/en-us/drivers/unix. This should help address the issue with A10G GPU on G5 runners according to NVIDIA. This is to address https://github.com/pytorch/pytorch/issues/88352

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88517
Approved by: https://github.com/ZainRizvi
---
 .circleci/scripts/setup_ci_environment.sh     | 2 +-
 .github/scripts/install_nvidia_utils_linux.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.circleci/scripts/setup_ci_environment.sh b/.circleci/scripts/setup_ci_environment.sh
index 8ac4f5b43a9a2..e8dd9ab7195b9 100755
--- a/.circleci/scripts/setup_ci_environment.sh
+++ b/.circleci/scripts/setup_ci_environment.sh
@@ -32,7 +32,7 @@ if ! command -v aws >/dev/null; then
 fi
 
 if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then
-  DRIVER_FN="NVIDIA-Linux-x86_64-515.57.run"
+  DRIVER_FN="NVIDIA-Linux-x86_64-515.76.run"
   wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
   sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
   nvidia-smi
diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index cbf3df9c2d4e0..9d74720514ec7 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -4,7 +4,7 @@ set -eou pipefail
 
 
 DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
-DRIVER_VERSION="515.57"
+DRIVER_VERSION="515.76"
 DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
 YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
 

From 0e46af62b4b29c9ea9bbcb236e78e2d04e957c2d Mon Sep 17 00:00:00 2001
From: Zain Rizvi <ZainRizvi@users.noreply.github.com>
Date: Mon, 7 Nov 2022 17:38:42 +0000
Subject: [PATCH 0620/1922] Clarify rules for which commit is used in CI
 (#88425)

The old information was out of date.  Updating it as per @janeyx99's feedback

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88425
Approved by: https://github.com/malfet
---
 CONTRIBUTING.md | 50 ++++++++++++++++---------------------------------
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c43d64c4610d6..eaf81b19eefaf 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1278,8 +1278,9 @@ our [CI wiki](https://github.com/pytorch/pytorch/wiki/Debugging-using-with-ssh-f
 ### Which commit is used in CI?
 
 For CI run on `master`, this repository is checked out for a given `master`
-commit, and CI is run on that commit (there isn't really any other choice). For
-PRs, however, it's a bit more complicated. Consider this commit graph, where
+commit, and CI is run on that commit (there isn't really any other choice).
+
+For PRs, however, it's a bit more complicated. Consider this commit graph, where
 `master` is at commit `A`, and the branch for PR #42 (just a placeholder) is at
 commit `B`:
 
@@ -1288,7 +1289,7 @@ commit `B`:
       /         \
      /           C (refs/pull/42/merge)
     /           /
----o---o---o---A (refs/heads/master)
+---o---o---o---A (merge-destination) - usually master
 ```
 
 There are two possible choices for which commit to use:
@@ -1296,37 +1297,18 @@ There are two possible choices for which commit to use:
 1. Checkout commit `B`, the head of the PR (manually committed by the PR
    author).
 2. Checkout commit `C`, the hypothetical result of what would happen if the PR
-   were merged into `master` (automatically generated by GitHub).
-
-This choice depends on several factors; here is the decision tree as of
-2021-03-30:
-
-- For CI jobs on CircleCI:
-  - If the name of the job (or one of its ancestors in the workflow DAG)
-    contains "xla" or "gcc5", choice **2** is used. This includes the following
-    jobs:
-    - pytorch_linux_xenial_py3_6_gcc5_4_build
-      - pytorch_cpp_doc_build
-      - pytorch_doc_test
-      - pytorch_linux_forward_backward_compatibility_check_test
-      - pytorch_linux_xenial_py3_6_gcc5_4_jit_legacy_test
-      - pytorch_linux_xenial_py3_6_gcc5_4_test
-      - pytorch_python_doc_build
-    - pytorch_xla_linux_bionic_py3_6_clang9_build
-      - pytorch_xla_linux_bionic_py3_6_clang9_test
-  - Otherwise, choice **1** is used.
-- For CI jobs on GitHub Actions:
-  - If the PR was created using [`ghstack`](https://github.com/ezyang/ghstack),
-    choice **1** is used.
-  - Otherwise, choice **2** is used.
-
-This is important to be aware of, because if you see a CI failure on your PR and
-choice **2** is being used for that CI job, it is possible that the failure is
-nondeterministically caused by a commit that does not exist in the ancestry of
-your PR branch. If you happen to have write access to this repo, you can choose
-to use `ghstack` to eliminate this nondeterminism for GitHub Actions jobs on
-your PRs, but it will still be present for the select CircleCI jobs listed
-above.
+   were merged into it's destination (usually `master`).
+
+For all practical purposes, most people can think of the commit being used as
+commit `B` (choice **1**).
+
+However, if workflow files (which govern CI behavior) were modified (either by your PR or since dev branch were created ) there's
+a nuance to know about:
+The workflow files themselves get taken from checkpoint `C`, the merger of your
+PR and the `master` branch. But only the workflow files get taken from that merged
+checkpoint. Everything else (tests, code, etc) all get taken directly from your
+PR's commit (commit `B`). Please note, this scenario would never affect PRs authored by `ghstack` as they would not automatically ingest the updates from default branch.
+
 
 ## Dev Infra Office Hours
 [Dev Infra Office Hours](https://github.com/pytorch/pytorch/wiki/Dev-Infra-Office-Hours) are hosted every Friday to answer any questions regarding developer experience, Green HUD, and CI.

From b14008f04831fee92723fca0f95f9e27b265194c Mon Sep 17 00:00:00 2001
From: Rodrigo Kumpera <kumpera@fb.com>
Date: Mon, 7 Nov 2022 17:56:40 +0000
Subject: [PATCH 0621/1922] [2/n] Thread PG: add class _World to
 distributed_c10d.py (#781) (#88471)

Summary:
X-link: https://github.com/pytorch/torchrec/pull/781

Move a bunch of globals to instance methods and replace all use to them.

We move all PG related globals under World and use a singleton instance under _world.

This creates an undocumented extension point to inject full control of how how c10d
state behaves.

One simple hack is to change _world to an implementation that uses a threadlocal
and enable per-thread PGs.

It almost get DDP working and the PG is missing an implementation of all_reduce.

This enables notebook usage of PTD, which is a big deal for learning it:
https://gist.github.com/kumpera/32cb051fa26b8cad8bdf671f968dcd68

This change ensures BC by keeping the global variables around and have the default _World wrap it.

I have relinked this diff to a new github PR, so that I can update it. The original PR is
> Pull Request resolved: https://github.com/pytorch/pytorch/pull/86348

Differential Revision: D40236769

Pulled By: yhcharles

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88471
Approved by: https://github.com/gnadathur, https://github.com/rohan-varma
---
 torch/distributed/distributed_c10d.py         | 202 ++++++++++++------
 .../_internal/distributed/distributed_test.py |   5 +-
 2 files changed, 140 insertions(+), 67 deletions(-)

diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index af6ede4c7aeb2..32b0949a3e348 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -273,32 +273,113 @@ def __getattribute__(self, key):
 reduce_op = _reduce_op()
 
 
-class group(object):
+# DO NOT USE THESE FIELDS DIRECTLY.
+# Use them through the _world object to make sure the _world override mechanism
+_pg_map: Dict[ProcessGroup, Tuple[str, Optional[Store]]] = {}
+_pg_names: Dict[ProcessGroup, str] = {}
+_pg_group_ranks: Dict[ProcessGroup, Dict[int, int]] = {}
+_group_count = 0
+
+class _World:
+    """
+    Container class for c10d process group state.
+    This is used during registration and lookup of PG state.
+
+    .. warning:: This is an experimental API inteded to expose the inner workings
+       of c10d and is subject to change..
+    """
+    def __init__(self):
+        self._default_pg = None
+
+    @property
+    def default_pg(self):
+        """
+        The default ProcessGroup includes all ranks of the cluster.
+        This is used by c10d APIs when a ProcessGroup is needed but None is provided.
+        """
+        return self._default_pg
+
+    @default_pg.setter
+    def default_pg(self, value):
+        self._default_pg = value
+
+    @property
+    def pg_map(self) -> Dict[ProcessGroup, Tuple[str, Optional[Store]]]:
+        """
+        Cached process groups
+        For NCCL and GLOO pg, it is a map from ProcessGroup to (Backend, Store)
+        For MPI pg, it is a map from ProcessGroup to (Backend, None)
+
+        TODO don't expose the map, expose fine grained ops
+        """
+        global _pg_map
+        return _pg_map
+
+    @property
+    def pg_names(self) -> Dict[ProcessGroup, str]:
+        """
+        Process group's names, map from ProcessGroup to str.
+
+        TODO don't expose the map, expose fine grained ops
+        """
+        global _pg_names
+        return _pg_names
+
+    @property
+    def pg_group_ranks(self) -> Dict[ProcessGroup, Dict[int, int]]:
+        """
+        Process group's global rank to local rank mapping
+        TODO don't expose the map, expose fine grained ops
+        """
+        global _pg_group_ranks
+        return _pg_group_ranks
+
+    @property
+    def group_count(self) -> int:
+        """
+        Process group count for default naming.
+
+        TODO don't expose group_count, use something else instead
+        """
+        global _group_count
+        return _group_count
+
+    @group_count.setter
+    def group_count(self, value):
+        """
+        Count is used when computing the name of ProcessGroups when using global synchronization.
+        """
+        global _group_count
+        _group_count = value
+
+
+_world = _World()
+"""Holds the singleton instance of ``_World`` used by c10. Experimental extension point to override it"""
+
+class _WorldMeta(type):
+    """
+    Meta class of ``group`` and ``GroupMember`` so they
+    can have the class property ``WORLD``.
+    """
     # Points to the default PG once initialized.
-    WORLD: Optional[ProcessGroup] = None
+    @property
+    def WORLD(cls) -> Optional[ProcessGroup]:
+        return _world.default_pg
 
+    @WORLD.setter
+    def WORLD(cls, pg: Optional[ProcessGroup]):
+        _world.default_pg = pg
 
-class GroupMember(object):
-    # Alias to group.WORLD for backward compatibility
-    WORLD = group.WORLD
-    NON_GROUP_MEMBER = object()
+class group(object, metaclass=_WorldMeta):
+    pass
 
+class GroupMember(object, metaclass=_WorldMeta):
+    NON_GROUP_MEMBER = object()
 
-# Cached process groups
-# For NCCL and GLOO pg, it is a map from ProcessGroup to (Backend, Store)
-# For MPI pg, it is a map from ProcessGroup to (Backend, None)
-_pg_map: Dict[ProcessGroup, Tuple[str, Optional[Store]]] = {}
-# Process group's names, map from ProcessGroup to str
-_pg_names: Dict[ProcessGroup, str] = {}
-# Process group's global rank to local rank mapping
-_pg_group_ranks: Dict[ProcessGroup, Dict[int, int]] = {}
 
 # Default process group state
 _default_pg_init_method = None
 
-# Process group count for default naming
-_group_count = 0
-
 STORE_BASED_BARRIER_PREFIX = "store_based_barrier_key"
 
 
@@ -318,7 +399,7 @@ def _store_based_barrier(rank, store, timeout):
     ``init_process_group`` or ``new_group``. Intended to be used only with
     those two methods and is not a generic alternative to ``barrier()``.
     """
-    store_key = "{}:{}".format(STORE_BASED_BARRIER_PREFIX, _group_count)
+    store_key = "{}:{}".format(STORE_BASED_BARRIER_PREFIX, _world.group_count)
     store.add(store_key, 1)
     logger.info("Added key: {} to store for rank: {}".format(store_key, rank))
 
@@ -393,9 +474,9 @@ def get_group_rank(group: ProcessGroup, global_rank: int) -> int:
     """
     if group is GroupMember.WORLD:
         return global_rank
-    if group not in _pg_group_ranks:
+    if group not in _world.pg_group_ranks:
         raise RuntimeError(f"Group {group} is not registered, please create group with torch.distributed.new_group API")
-    group_ranks = _pg_group_ranks[group]
+    group_ranks = _world.pg_group_ranks[group]
     if global_rank not in group_ranks:
         raise RuntimeError(f"Global rank {global_rank} is not part of group {group}")
 
@@ -418,9 +499,9 @@ def get_global_rank(group: ProcessGroup, group_rank: int) -> int:
     """
     if group is GroupMember.WORLD:
         return group_rank
-    if group not in _pg_group_ranks:
+    if group not in _world.pg_group_ranks:
         raise RuntimeError(f"Group {group} is not registered, please create group with torch.distributed.new_group API")
-    for rank, grp_rank in _pg_group_ranks[group].items():
+    for rank, grp_rank in _world.pg_group_ranks[group].items():
         if grp_rank == group_rank:
             return rank
     raise RuntimeError(f"Group rank {group_rank} is not part of group {group}")
@@ -447,7 +528,7 @@ def get_process_group_ranks(group: ProcessGroup):
     Returns:
         List of global ranks ordered by group rank.
     """
-    return list(_pg_group_ranks[group].keys())
+    return list(_world.pg_group_ranks[group].keys())
 
 def _get_group_size(group):
     """
@@ -602,13 +683,12 @@ def _get_default_store():
             "please make sure to call init_process_group."
         )
     default_pg = _get_default_group()
-    _, default_store = _pg_map[default_pg]
+    _, default_store = _world.pg_map[default_pg]
     return default_store
 
 
 def _update_default_pg(pg):
-    GroupMember.WORLD = group.WORLD = pg
-
+    _world.default_pg = pg
 
 def get_backend(group: Optional[ProcessGroup] = None) -> str:
     """
@@ -629,7 +709,7 @@ def get_backend(group: Optional[ProcessGroup] = None) -> str:
         pg = group
     if _rank_not_in_group(pg):
         raise RuntimeError("Invalid process group specified")
-    pg_store = _pg_map.get(pg, None)
+    pg_store = _world.pg_map.get(pg, None)
     assert pg_store is not None
     return pg_store[0]
 
@@ -713,7 +793,8 @@ def init_process_group(
         on a system that supports MPI.
 
     """
-    global _pg_group_ranks
+    global _world
+
     global _backend
     global _default_pg_init_method
 
@@ -774,8 +855,8 @@ def init_process_group(
         )
         _update_default_pg(default_pg)
 
-    _pg_group_ranks[GroupMember.WORLD] = {i: i for i in range(GroupMember.WORLD.size())}  # type: ignore[attr-defined, index]
-    _backend = _pg_map[GroupMember.WORLD][0]  # type: ignore[index]
+    _world.pg_group_ranks[GroupMember.WORLD] = {i: i for i in range(GroupMember.WORLD.size())}  # type: ignore[attr-defined, index]
+    _backend = _world.pg_map[GroupMember.WORLD][0]  # type: ignore[index]
     _default_pg_init_method = init_method
 
     # barrier at the end to ensure that once we return from this method, all
@@ -812,15 +893,13 @@ def _new_process_group_helper(
 
     This function is called with ``group_ranks == []`` for the default group.
     """
-    global _pg_map
-    global _group_count
-    global _pg_names
+    global _world
 
     if not group_name:
-        group_name = str(_group_count)
-        _group_count += 1
+        group_name = str(_world.group_count)
+        _world.group_count = _world.group_count + 1
 
-    if group_name in _pg_names.values():
+    if group_name in _world.pg_names.values():
         raise RuntimeError(
             "The specified group name has already been "
             "created, please use a different group name"
@@ -846,8 +925,8 @@ def _new_process_group_helper(
         pg = ProcessGroupMPI.create(global_ranks_in_group)
         if not pg:
             return GroupMember.NON_GROUP_MEMBER
-        _pg_map[pg] = (Backend.MPI, None)
-        _pg_names[pg] = group_name
+        _world.pg_map[pg] = (Backend.MPI, None)
+        _world.pg_names[pg] = group_name
     else:
         # If this is a subgroup (which means group_ranks is specified),
         # we check if the current process is a member of the new group.
@@ -883,8 +962,8 @@ def _new_process_group_helper(
                         world_size=group_size,
                         timeout=timeout,
                     )
-            _pg_map[pg] = (Backend.GLOO, store)
-            _pg_names[pg] = group_name
+            _world.pg_map[pg] = (Backend.GLOO, store)
+            _world.pg_names[pg] = group_name
         elif backend == Backend.NCCL:
             if not is_nccl_available():
                 raise RuntimeError("Distributed package doesn't have NCCL " "built in")
@@ -918,8 +997,8 @@ def _new_process_group_helper(
                         world_size=group_size,
                         timeout=timeout,
                     )
-            _pg_map[pg] = (Backend.NCCL, store)
-            _pg_names[pg] = group_name
+            _world.pg_map[pg] = (Backend.NCCL, store)
+            _world.pg_names[pg] = group_name
         elif backend == Backend.UCC and is_ucc_available():
             # TODO: once UCC plugin is fully deprecated, remove
             # is_ucc_available() from above elif-condition and raise
@@ -945,8 +1024,8 @@ def _new_process_group_helper(
                         world_size=group_size,
                         timeout=timeout,
                     )
-            _pg_map[pg] = (Backend.UCC, store)
-            _pg_names[pg] = group_name
+            _world.pg_map[pg] = (Backend.UCC, store)
+            _world.pg_names[pg] = group_name
         else:
             assert backend.upper() in Backend._plugins, (
                 f"Unknown c10d backend type {backend.upper()}"
@@ -968,8 +1047,8 @@ def _new_process_group_helper(
                 dist_backend_opts.global_ranks_in_group = global_ranks_in_group
 
                 pg = creator_fn(dist_backend_opts, pg_options)
-            _pg_map[pg] = (backend, store)
-            _pg_names[pg] = group_name
+            _world.pg_map[pg] = (backend, store)
+            _world.pg_names[pg] = group_name
 
     return pg
 
@@ -984,11 +1063,7 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
                                         groups including the default one will
                                         be destroyed.
     """
-    global _pg_map
-    global _pg_names
-    global _pg_group_ranks
-    global _default_pg_init_method
-    global _group_count
+    global _world
 
     if group == GroupMember.NON_GROUP_MEMBER:
         return
@@ -999,29 +1074,28 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
         pg = group
 
     assert pg is not None
-    if _pg_map.get(pg, None) is None:
+    if _world.pg_map.get(pg, None) is None:
         raise RuntimeError("Invalid process group specified")
 
     if group is None or group == GroupMember.WORLD:
         _update_default_pg(None)
-        _default_pg_init_method = None
-        _pg_map.clear()
-        _pg_names.clear()
-        _pg_group_ranks.clear()
+        _world.pg_map.clear()
+        _world.pg_names.clear()
+        _world.pg_group_ranks.clear()
 
         # when process group doesn't have an explicit name (only WORLD (default)
-        # process group can have an explicit name), we use global _group_counter
+        # process group can have an explicit name), we use global _world.group_count
         # to generate the name. We need to reset the counter on destruction to
         # allow consistent value to be generated when we re-create process
         # groups after some trainers recover from failure
         #
         # We only reset this when WORLD is being destroyed because if this
         # process group is in good state, we aren't dealing with failures.
-        _group_count = 0
+        _world.group_count = 0
     else:
-        del _pg_map[pg]
-        del _pg_names[pg]
-        del _pg_group_ranks[pg]
+        del _world.pg_map[pg]
+        del _world.pg_names[pg]
+        del _world.pg_group_ranks[pg]
 
 
 def get_rank(group: Optional[ProcessGroup] = None) -> int:
@@ -3341,10 +3415,10 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None, pg_options=N
         A handle of distributed group that can be given to collective calls.
     """
 
-    global _pg_group_ranks
+    global _world
 
     default_pg = _get_default_group()
-    default_backend, default_store = _pg_map[default_pg]
+    default_backend, default_store = _world.pg_map[default_pg]
     global_rank = default_pg.rank()
     global_world_size = default_pg.size()
 
@@ -3393,7 +3467,7 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None, pg_options=N
         )
 
     # Create the global rank to group rank mapping
-    _pg_group_ranks[pg] = {
+    _world.pg_group_ranks[pg] = {
         global_rank: group_rank for group_rank, global_rank in enumerate(ranks)
     }
 
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 8c44cc0482cc4..43a49b0489dc9 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -1147,7 +1147,6 @@ def test_1_level_hierarchical_model_averager_equivalent_to_periodic_model_averag
         @require_world_size(4)
         @skip_if_lt_x_gpu(4)
         def test_3_level_hierarchical_model_averager(self):
-            from torch.distributed.distributed_c10d import _pg_group_ranks
             rank = dist.get_rank()
             world_size = dist.get_world_size()
             rank_to_GPU = init_multigpu_helper(world_size, BACKEND)
@@ -1178,8 +1177,8 @@ def test_3_level_hierarchical_model_averager(self):
             subgroup1 = averager.period_process_group_dict[subgroup_avg_period1]
             subgroup2 = averager.period_process_group_dict[subgroup_avg_period2]
 
-            real_group_ranks_res1 = list(_pg_group_ranks[subgroup1].keys())
-            real_group_ranks_res2 = list(_pg_group_ranks[subgroup2].keys())
+            real_group_ranks_res1 = dist.get_process_group_ranks(subgroup1)
+            real_group_ranks_res2 = dist.get_process_group_ranks(subgroup2)
             expect_group_ranks_res1 = (rank // subgroup_size1 * subgroup_size1 + np.array(list(range(subgroup_size1)))).tolist()
             expect_group_ranks_res2 = (rank // subgroup_size2 * subgroup_size2 + np.array(list(range(subgroup_size2)))).tolist()
             self.assertEqual(real_group_ranks_res1, expect_group_ranks_res1)

From b003af25da5746dd3c0865fb02a74cceeb27a051 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Mon, 7 Nov 2022 18:03:31 +0000
Subject: [PATCH 0622/1922] [Dynamo] Fix bug: GradMode doesn't carry grad state
 correctly after graph break (#88537)

Fixes https://github.com/pytorch/torchdynamo/issues/1446

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88537
Approved by: https://github.com/jansel
---
 test/dynamo/test_repros.py      | 15 +++++++++++++++
 torch/_dynamo/variables/misc.py |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index f669a6c8c68e9..2103e075fffc9 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1271,6 +1271,21 @@ def fn(x):
             res = opt_fn(x)
         self.assertTrue(same(ref, res))
 
+    # https://github.com/pytorch/torchdynamo/issues/1446
+    def test_grad_mode_carrying_correct_state_after_graph_break(self):
+        def fn(x):
+            with torch.no_grad():
+                y = x * 3
+                print("Break")
+                z = x + 2
+            return y, z
+
+        x = torch.randn(3, requires_grad=True)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        y, z = opt_fn(x)
+        self.assertFalse(y.requires_grad)
+        self.assertFalse(z.requires_grad)
+
     def test_abc_setattr(self):
         # tests that we correctly bail out of __setattr__ calls
 
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index 8dd3478114396..da327122a6a70 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -283,7 +283,7 @@ def _func_name(self):
         return "_C._set_grad_enabled"
 
     def fn_name(self):
-        if self.target_values:
+        if self.target_values[0]:
             return "enable_grad"
         else:
             return "no_grad"

From 50ab3b5d5b8450c7a8450e86d9b54713faefebad Mon Sep 17 00:00:00 2001
From: Angela Yi <angelayi@meta.com>
Date: Mon, 7 Nov 2022 18:42:41 +0000
Subject: [PATCH 0623/1922] [exir][fx] PassManager error handling (#88520)

Summary:
* Added an error message for when the result is not a PassResult
* Modified the error handling to capture exceptions that happen in the check() function
* consolidated inplace_wrapper and pass_result_wrapper

Test Plan: CI

Differential Revision: D40950135

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88520
Approved by: https://github.com/SherlockNoMad
---
 test/fx/test_pass_infra.py            |  2 +-
 torch/fx/passes/infra/pass_manager.py | 72 ++++++++++++---------------
 2 files changed, 32 insertions(+), 42 deletions(-)

diff --git a/test/fx/test_pass_infra.py b/test/fx/test_pass_infra.py
index 947c80d66dcee..7a7039979bebe 100644
--- a/test/fx/test_pass_infra.py
+++ b/test/fx/test_pass_infra.py
@@ -169,5 +169,5 @@ def pass_fail(graph_module):
         pm = PassManager(passes=[replace_add_with_mul_pass, replace_mul_with_div_pass, pass_fail])
 
         # Comment out this line to see the actual error message
-        with self.assertRaises(RuntimeError):
+        with self.assertRaisesRegex(Exception, "pass_fail"):
             pm(traced_m)
diff --git a/torch/fx/passes/infra/pass_manager.py b/torch/fx/passes/infra/pass_manager.py
index 01a1cb6c81f5e..265c6263da540 100644
--- a/torch/fx/passes/infra/pass_manager.py
+++ b/torch/fx/passes/infra/pass_manager.py
@@ -1,4 +1,5 @@
 import inspect
+import logging
 from queue import Queue
 from functools import wraps
 from typing import Callable, Dict, List
@@ -8,31 +9,10 @@
 from torch.fx._compatibility import compatibility
 from torch.fx.passes.infra.pass_base import PassResult
 
-__all__ = ['inplace_wrapper', 'pass_result_wrapper', 'this_before_that_pass_constraint', 'PassManager']
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
 
-@compatibility(is_backward_compatible=False)
-def inplace_wrapper(fn: Callable) -> Callable:
-    """
-    Convenience wrapper for passes which modify an object inplace. This
-    wrapper makes them return a PassResult containing the modified object and
-    True for the "modified" flag.
-
-    Args:
-        fn (Callable[Module, Any])
-
-    Returns:
-        wrapped_fn (Callable[Module, PassResult])
-    """
-    if fn is None:
-        return None
-
-    @wraps(fn)
-    def wrapped_fn(gm):
-        return fn(gm) or PassResult(gm, True)
-
-    if wrapped_fn.__name__ == 'wrapped_fn':
-        wrapped_fn.__name__ = str(fn)
-    return wrapped_fn
+__all__ = ['pass_result_wrapper', 'this_before_that_pass_constraint', 'PassManager']
 
 @compatibility(is_backward_compatible=False)
 def pass_result_wrapper(fn: Callable) -> Callable:
@@ -52,8 +32,16 @@ def pass_result_wrapper(fn: Callable) -> Callable:
 
     @wraps(fn)
     def wrapped_fn(gm):
-        gm = fn(gm)
-        return PassResult(gm, True)
+        res = fn(gm)
+        if res is None:
+            return PassResult(gm, True)
+        if isinstance(res, PassResult):
+            return res
+        elif isinstance(res, nn.Module):
+            return PassResult(res, True)
+
+    if not inspect.isfunction(fn):
+        wrapped_fn.__name__ = type(fn).__name__
 
     return wrapped_fn
 
@@ -189,7 +177,6 @@ def __init__(
         steps=None,
         run_checks_after_each_pass: bool = False,
         suppress_check_failures: bool = False,
-        debug: bool = False,
     ):
         if passes:
             self.passes = passes
@@ -200,7 +187,6 @@ def __init__(
 
         self.run_checks_after_each_pass = run_checks_after_each_pass
         self.suppress_check_failures = suppress_check_failures
-        self.debug = debug
 
     def add_pass(self, _pass: Callable):
         """
@@ -279,25 +265,29 @@ def __call__(self, module: nn.Module) -> PassResult:
 
             # Run the set of passes on the graph module
             for i, fn in enumerate(self.passes):
-                if self.debug:
-                    print(f"Running pass \'{fn.__name__}\'")
+                logger.debug(f"Running pass \'{fn.__name__}\'")
 
                 try:
                     res = fn(module)
-                except Exception as e:
-                    prev_pass_names = [p.__name__ for p in self.passes[:i]]
-                    msg = f"An error occurred when running the \'{fn.__name__}\' pass after the following passes: {prev_pass_names}"
-                    raise type(e)(msg) from e
 
-                module = res.graph_module
-                modified = modified or res.modified
+                    if not isinstance(res, PassResult) and not hasattr(res, "graph_module"):
+                        raise TypeError(f"The result of the pass {fn.__name__} should be type PassResult. \
+                                          Please wrap it with pass_result_wrapper()")
+                    module = res.graph_module
+                    modified = modified or res.modified
+
+                    if isinstance(module, GraphModule):
+                        logger.debug(f"Graph after pass \'{fn.__name__}\':", module.graph)
+                        module.recompile()
 
-                if isinstance(module, GraphModule):
-                    module.recompile()
+                    # Check graph invariants
+                    if self.run_checks_after_each_pass:
+                        self.check(module)
 
-                # Check graph invariants
-                if self.run_checks_after_each_pass:
-                    self.check(module)
+                except Exception as e:
+                    prev_pass_names = [p.__name__ for p in self.passes[:i]]
+                    msg = f"An error occurred when running the \'{fn.__name__}\' pass after the following passes: {prev_pass_names}"
+                    raise Exception(msg) from e
 
             # If the graph no longer changes, then we can stop running these passes
             overall_modified = overall_modified or modified

From e78111065f316414f6b5b2a84419290fb901fcb3 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 7 Nov 2022 18:51:16 +0000
Subject: [PATCH 0624/1922] Revert "[fix] allow saving python attr on Tensor
 and Parameter via torch.save (#81616)"

This reverts commit 54b6188cc6dee45b775d688223b847dc8ea85bff.

Reverted https://github.com/pytorch/pytorch/pull/81616 on behalf of https://github.com/mehtanirav due to Internal publishing is broken
---
 test/test_serialization.py                 | 18 ------
 torch/_tensor.py                           | 42 ++++++++++++-
 torch/_utils.py                            | 58 ------------------
 torch/_weights_only_unpickler.py           |  5 --
 torch/csrc/jit/serialization/unpickler.cpp | 71 ----------------------
 torch/csrc/jit/serialization/unpickler.h   |  4 --
 torch/nn/parameter.py                      |  6 +-
 7 files changed, 42 insertions(+), 162 deletions(-)

diff --git a/test/test_serialization.py b/test/test_serialization.py
index 7279db8f6f27b..3a18f8a45ad04 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -896,24 +896,6 @@ def test_meta_serialization(self, weights_only):
 
         self.assertEqual(state['weight'].size(), big_model.weight.size())
 
-    def test_serialization_python_attr(self):
-        def _test_save_load_attr(t):
-            t.foo = 'foo'
-            t.pi = 3.14
-
-            with BytesIOContext() as f:
-                torch.save(t, f)
-                f.seek(0)
-                loaded_t = torch.load(f)
-
-            self.assertEqual(t, loaded_t)
-            self.assertEqual(t.foo, loaded_t.foo)
-            self.assertEqual(t.pi, loaded_t.pi)
-
-        t = torch.zeros(3, 3)
-        _test_save_load_attr(t)
-        _test_save_load_attr(torch.nn.Parameter(t))
-
     def test_weights_only_assert(self):
         class HelloWorld:
             def __reduce__(self):
diff --git a/torch/_tensor.py b/torch/_tensor.py
index e77324e2baf27..d0af241c8a221 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -55,6 +55,9 @@ def _rebuild_from_type(func, type, args, dict):
 
 
 def _rebuild_from_type_v2(func, new_type, args, state):
+    if new_type is Tensor:
+        return func(*args)
+
     ret = func(*args)
     if type(ret) is not new_type:
         ret = ret.as_subclass(new_type)
@@ -67,7 +70,21 @@ def _rebuild_from_type_v2(func, new_type, args, state):
     ):
         ret.__setstate__(state)
     else:
-        ret = torch._utils._set_obj_state(ret, state)
+        if isinstance(state, tuple):
+            if not len(state) == 2:
+                raise RuntimeError(f"Invalid serialized state: {state}")
+            dict_state = state[0]
+            slots_state = state[1]
+        else:
+            dict_state = state
+            slots_state = None
+
+        for k, v in dict_state.items():
+            setattr(ret, k, v)
+
+        if slots_state:
+            for k, v in slots_state.items():
+                setattr(ret, k, v)
     return ret
 
 
@@ -204,10 +221,31 @@ def __deepcopy__(self, memo):
             return new_tensor
 
     def __reduce_ex__(self, proto):
+        if type(self) is Tensor:
+            return self._reduce_ex_internal(proto)
         if has_torch_function_unary(self):
             return handle_torch_function(Tensor.__reduce_ex__, (self,), self, proto)
         func, args = self._reduce_ex_internal(proto)
-        state = torch._utils._get_obj_state(self)
+        # Get the state of the python subclass
+        # This loosely mimicks the function on the object class but since Tensor do not inherit
+        # from it, we cannot call that function directly
+        # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891
+        getstate_fn = getattr(self, "__getstate__", None)
+        if getstate_fn:
+            state = getstate_fn()
+        else:
+            slots_to_save = copyreg._slotnames(self.__class__)  # type: ignore[attr-defined]
+            if slots_to_save:
+                state = (
+                    self.__dict__,
+                    {
+                        name: getattr(self, name)
+                        for name in slots_to_save
+                        if hasattr(self, name)
+                    },
+                )
+            else:
+                state = self.__dict__
         return (_rebuild_from_type_v2, (func, type(self), args, state))
 
     def storage(self):
diff --git a/torch/_utils.py b/torch/_utils.py
index 98b1a67ec3a88..8a539d75f5657 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -1,4 +1,3 @@
-import copyreg
 import sys
 import traceback
 import warnings
@@ -308,7 +307,6 @@ def _rebuild_qtensor(
     return tensor
 
 
-# Should not be used, this is kept only for BC of loading old serialized parameters
 def _rebuild_parameter(data, requires_grad, backward_hooks):
     param = torch.nn.Parameter(data, requires_grad)
     # NB: This line exists only for backwards compatibility; the
@@ -319,62 +317,6 @@ def _rebuild_parameter(data, requires_grad, backward_hooks):
     return param
 
 
-def _rebuild_parameter_v2(data, requires_grad, backward_hooks, state):
-    param = torch.nn.Parameter(data, requires_grad)
-    # NB: This line exists only for backwards compatibility; the
-    # general expectation is that backward_hooks is an empty
-    # OrderedDict.  See Note [Don't serialize hooks]
-    param._backward_hooks = backward_hooks
-
-    # Restore state on Parameter like python attr.
-    param = _set_obj_state(param, state)
-    return param
-
-
-def _get_obj_state(obj):
-    # Get the state of the python subclass
-    # This loosely mimicks the function on the object class but since Tensor do not inherit
-    # from it, we cannot call that function directly
-    # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891
-    getstate_fn = getattr(obj, "__getstate__", None)
-    if getstate_fn:
-        state = getstate_fn()
-    else:
-        slots_to_save = copyreg._slotnames(obj.__class__)  # type: ignore[attr-defined]
-        if slots_to_save:
-            state = (
-                obj.__dict__,
-                {
-                    name: getattr(obj, name)
-                    for name in slots_to_save
-                    if hasattr(obj, name)
-                },
-            )
-        else:
-            state = obj.__dict__
-
-    return state
-
-
-def _set_obj_state(obj, state):
-    if isinstance(state, tuple):
-        if not len(state) == 2:
-            raise RuntimeError(f"Invalid serialized state: {state}")
-        dict_state = state[0]
-        slots_state = state[1]
-    else:
-        dict_state = state
-        slots_state = None
-
-    for k, v in dict_state.items():
-        setattr(obj, k, v)
-
-    if slots_state:
-        for k, v in slots_state.items():
-            setattr(obj, k, v)
-    return obj
-
-
 def _import_dotted_name(name):
     components = name.split(".")
     obj = __import__(components[0])
diff --git a/torch/_weights_only_unpickler.py b/torch/_weights_only_unpickler.py
index 498d3a607f3aa..ee00db937fc3d 100644
--- a/torch/_weights_only_unpickler.py
+++ b/torch/_weights_only_unpickler.py
@@ -103,11 +103,6 @@ def _get_allowed_globals():
         torch._utils._rebuild_sparse_csr_tensor,
     ]:
         rc[f"torch._utils.{f.__name__}"] = f
-
-    # Default rebuild function
-    # Handles Tensor Subclasses, Tensor's with attributes.
-    # NOTE: It calls into above rebuild functions for regular Tensor types.
-    rc["torch._tensor._rebuild_from_type_v2"] = torch._tensor._rebuild_from_type_v2
     return rc
 
 
diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp
index b47045aa23ae8..7b40f138c600f 100644
--- a/torch/csrc/jit/serialization/unpickler.cpp
+++ b/torch/csrc/jit/serialization/unpickler.cpp
@@ -532,21 +532,6 @@ PickleOpCode Unpickler::readInstruction() {
       }
       stack_.emplace_back(std::move(tensor));
     } break;
-    case PickleOpCode::SETITEM: {
-      // At this OpCode, stack looks like
-      // | Stack Top |
-      // | ......    |
-      // | Dict      | -> (stack_size - 3)
-      // | Key       | -> (stack_size - 2)
-      // | Value     | -> (stack_size - 1)
-      auto stack_size = stack_.size();
-      auto dict_pos = stack_size - 3;
-      auto key_pos = stack_size - 2;
-      auto val_pos = stack_size - 1;
-      auto dict = stack_.at(dict_pos).toGenericDict();
-      dict.insert_or_assign(stack_.at(key_pos), stack_.at(val_pos));
-      stack_.erase(stack_.begin() + (key_pos), stack_.end());
-    } break;
     default: {
       AT_ERROR(
           "Unknown opcode for unpickling at ",
@@ -561,23 +546,6 @@ PickleOpCode Unpickler::readInstruction() {
 void Unpickler::readGlobal(
     const std::string& module_name,
     const std::string& class_name) {
-  if (this->skip_next_read_global) {
-    // See [NOTE] skip_next_read_global
-    this->skip_next_read_global--;
-    if (this->skip_next_read_global == 1) {
-      // Pass through to the correct handler
-    } else if (this->skip_next_read_global == 0) {
-      // Corresponds to the type of `Tensor` being unpickled
-      if (module_name != "torch" || class_name != "Tensor") {
-        TORCH_WARN(
-            "Trying to load a Subclassed Tensor, it will be converted to at::Tensor in C++");
-      }
-      stack_.emplace_back(int64_t(globals_.size() - 1));
-      return;
-    } else {
-      TORCH_CHECK(false, "INVALID VALUES")
-    }
-  }
   // TODO [unpickler refactor] __main__ isn't used by the pickler anymore, this
   // is only here for bc-compatibility reasons
   if (module_name == "__main__") {
@@ -663,12 +631,6 @@ void Unpickler::readGlobal(
     // Unpickle a tensor
     bool quantized = class_name == "_rebuild_qtensor";
     rebuildTensor(quantized);
-  } else if (
-      module_name == "torch._tensor" &&
-      (class_name == "_rebuild_from_type_v2")) {
-    // Unpickle a Tensor with Python attributes or
-    // a Subclassed Tensor.
-    rebuildTensorFromTypeV2();
   } else if (
       module_name == "torch._utils" && class_name == "_rebuild_sparse_tensor") {
     rebuildSparseTensor();
@@ -872,39 +834,6 @@ void Unpickler::rebuildTensor(bool quantized) {
   });
 }
 
-void Unpickler::rebuildTensorFromTypeV2() {
-  // [NOTE] skip_next_read_global
-  // When rebuilding Tensor with Python Attr or Subclassed Tensor,
-  // we receive `(func, type(self), args, state)` on stack for
-  // `rebuildTensorFromTypeV2`.
-  // Thus next call to readGlobal corresponds to `func` which is
-  // the function to rebuild the base tensor.
-  // The call after `func` to readGlobal corresponds to `type` of the
-  // Tensor where we raise warning if the type is not `torch.Tensor`.
-  this->skip_next_read_global = 2;
-  auto curr_globals_idx = globals_.size();
-  globals_.emplace_back([this, curr_globals_idx] {
-    // args is a tuple with following data
-    //  (function to rebuild base tensor, type of tensor,
-    //   arguments to construct base tensor, Python State (as dict))
-    auto args = pop(stack_).toTuple();
-    size_t tup_idx = 0;
-    const auto args_elems = args->elements();
-    auto base_tensor_args = args_elems.at(tup_idx + 2).toTuple();
-    auto py_state = args_elems.at(tup_idx + 3).toGenericDict();
-    if (py_state.size() > 0) {
-      TORCH_WARN(
-          "Loading Tensor with Python attributes will return at::Tensor with Python attributes being discarded");
-    }
-    // This calls the function to rebuild the
-    // base tensor.
-    // Eg. `rebuildTensor`, `rebuildSpareTensor`.
-    stack_.emplace_back(base_tensor_args);
-    globals_[curr_globals_idx + 1]();
-    stack_.emplace_back(pop(stack_));
-  });
-}
-
 #ifdef USE_RPC
 void Unpickler::rebuildRRef() {
   globals_.emplace_back([this] {
diff --git a/torch/csrc/jit/serialization/unpickler.h b/torch/csrc/jit/serialization/unpickler.h
index de00e7eacff21..5411d421a0c57 100644
--- a/torch/csrc/jit/serialization/unpickler.h
+++ b/torch/csrc/jit/serialization/unpickler.h
@@ -120,7 +120,6 @@ class TORCH_API Unpickler {
       const std::string& module_name,
       const std::string& class_name);
   void rebuildTensor(bool quantized);
-  void rebuildTensorFromTypeV2();
   void rebuildSparseTensor();
 #ifdef USE_DISTRIBUTED
   void rebuildRRef();
@@ -177,9 +176,6 @@ class TORCH_API Unpickler {
 
   // See [type tag serialization]
   uint64_t version_;
-
-  // See [NOTE] skip_next_read_global
-  uint8_t skip_next_read_global = 0;
 };
 
 void restoreAccurateTypeTags(const IValue& root, const c10::TypePtr& type_tag);
diff --git a/torch/nn/parameter.py b/torch/nn/parameter.py
index 4821adae17263..e0f400f2642bf 100644
--- a/torch/nn/parameter.py
+++ b/torch/nn/parameter.py
@@ -60,12 +60,10 @@ def __repr__(self):
         return 'Parameter containing:\n' + super(Parameter, self).__repr__()
 
     def __reduce_ex__(self, proto):
-        state = torch._utils._get_obj_state(self)
-
         # See Note [Don't serialize hooks]
         return (
-            torch._utils._rebuild_parameter_v2,
-            (self.data, self.requires_grad, OrderedDict(), state)
+            torch._utils._rebuild_parameter,
+            (self.data, self.requires_grad, OrderedDict())
         )
 
     __torch_function__ = _disabled_torch_function_impl

From 545e7eae06b88ce4e192b133ad01102552b7879d Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 7 Nov 2022 19:59:42 +0000
Subject: [PATCH 0625/1922] Revert "Use sudo when reset NVIDIA devices
 (#88531)"

This reverts commit 505486ce9321bc22d2156a1a9b97fe474a05b53b.

Reverted https://github.com/pytorch/pytorch/pull/88531 on behalf of https://github.com/huydhn due to Wrong sudo echo usage, should use tee instead
---
 .github/scripts/install_nvidia_utils_linux.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index 9d74720514ec7..3a2805d91d5ab 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -74,8 +74,7 @@ install_nvidia_driver_amzn2() {
                     DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)
 
                     echo "Reseting $PCI_ID (enabled state: $DEVICE_ENABLED)"
-                    # This requires sudo permission of course
-                    sudo echo "1" > /sys/bus/pci/devices/$PCI_ID/reset
+                    echo "1" > /sys/bus/pci/devices/$PCI_ID/reset
                     sleep 1
                 done
             fi

From 4501a9025d21d4e27d46222a3c4315d50cc83163 Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@meta.com>
Date: Mon, 7 Nov 2022 08:57:51 -0800
Subject: [PATCH 0626/1922] [nvfuser] skip extremal tests on rocm (#88587)

Summary:

These are failing in rocm so disable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88587
Approved by: https://github.com/ZainRizvi, https://github.com/huydhn
---
 test/test_jit_cuda_fuser.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
index 8e8af85faf5f2..7de6f17716323 100644
--- a/test/test_jit_cuda_fuser.py
+++ b/test/test_jit_cuda_fuser.py
@@ -5249,6 +5249,7 @@ def test_nvfuser_correctness(self, device, dtype, op):
         # if the CU is not cleared.
         torch.jit._state._python_cu.drop_all_functions()
 
+    @skipIfRocm
     @slowTest
     @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,

From 73c4e405a7917427243a959c2e4a4cc26b10d13b Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Fri, 4 Nov 2022 17:10:21 +0000
Subject: [PATCH 0627/1922] Handle pin_memory in refs.randn (#88473)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88473
Approved by: https://github.com/mruberry
---
 torch/_meta_registrations.py | 1 -
 torch/_refs/__init__.py      | 7 +++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index ea9ac51450e22..55be711e22feb 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1681,7 +1681,6 @@ def activate_meta():
             "aten::empty_strided",  # causing infinite recursion, test_meta.py
             "aten::clone",  # causing infinite recursion
             "aten::_to_copy",  # causing infinite recursion, test_serialization.py -k test_tensor_subclass_getstate_overwrite  # noqa: B950
-            "aten::randn",  # pin_memory parameter is not supported!, test_proxy_tensor.py -k test_make_fx_symbolic_exhaustive_randn_cpu_float32  # noqa: B950
             "aten::copy_",  # Exception not raised, test_torch.py -k test_storage_meta_errors_cpu_int64  # noqa: B950
             "aten::constant_pad_nd",  # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32  # noqa: B950
             "aten::rot90",  # requires_grad mismatch! test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32  # noqa: B950
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 96a11c207ae58..28153321db59c 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -4469,7 +4469,7 @@ def full_like(
 
 ones_like = partial(full_like, fill_value=True)
 
-# TODO: add pin_memory support
+
 @register_decomposition(torch.ops.aten.randn.default)
 @out_wrapper()
 def randn(
@@ -4478,10 +4478,9 @@ def randn(
     device: Optional[torch.device] = None,
     layout: Optional[torch.layout] = None,
     requires_grad: bool = False,
-    pin_memory: Optional[bool] = None,
+    pin_memory: bool = False,
 ) -> TensorLikeType:
-
-    check(pin_memory is None, lambda: "pin_memory parameter is not supported!")
+    utils.check_pin_memory(pin_memory)
 
     shape_ = utils.extract_shape_from_varargs(shape)
 

From b5e3f4f6ef155ec2d6d9d3b668959a8032be9f92 Mon Sep 17 00:00:00 2001
From: Yu Guo <yuguo@fb.com>
Date: Fri, 4 Nov 2022 16:51:35 -0700
Subject: [PATCH 0628/1922] [torchdynamo] support
 torch.autograd._profiler_enabled (#88378)

fix https://github.com/pytorch/torchdynamo/issues/1826

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88378
Approved by: https://github.com/voznesenskym
---
 test/dynamo/test_misc.py         | 24 ++++++++++++++++++++++++
 torch/_dynamo/variables/torch.py |  2 ++
 2 files changed, 26 insertions(+)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index c309335f0a9d1..8f195f60d15f3 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -1385,6 +1385,30 @@ def fn(x):
         self.assertTrue(same(ref, res))
         self.assertEqual(cnts.frame_count, 2)
 
+    def test_autograd_profiler_enabled(self):
+        def fn(x):
+            if torch.autograd._profiler_enabled():
+                return x + 1
+            else:
+                return x - 1
+
+        x = torch.randn((2, 2), requires_grad=True)
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts)(fn)
+
+        if torch.autograd._profiler_enabled():
+            torch.autograd._disable_profiler()
+        assert not torch.autograd._profiler_enabled()
+        ref = fn(x)
+        res = opt_fn(x)
+        self.assertTrue(same(ref, res))
+
+        with torch.autograd.profiler.profile():
+            assert torch.autograd._profiler_enabled()
+            ref = fn(x)
+            res = opt_fn(x)
+            self.assertTrue(same(ref, res))
+
     def test_python_slice(self):
         def f1(input):
             y = 0
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 36ca6591189de..c55a64cff50c7 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -288,6 +288,8 @@ def call_function(
         ):
             log.warning("Profiler will be ignored")
             return ProfilerContextWrapperVariable(**options)
+        elif self.value is torch.autograd._profiler_enabled:
+            unimplemented("torch.autograd._profiler_enabled not supported yet")
         elif self.value is torch.jit.annotate:
             assert len(args) == 2
             return args[1]

From a25404ebda8fea0f9dbcb6398d87c81c7edbcd0d Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 7 Nov 2022 21:04:02 +0000
Subject: [PATCH 0629/1922] Upload test stats for inductor workflow (#88535)

We miss this new workflow, so none of its test stats are uploaded to rockset
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88535
Approved by: https://github.com/desertfire
---
 .github/workflows/upload-test-stats.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml
index 3b84f15214eac..27289983e2707 100644
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@@ -2,7 +2,7 @@ name: Upload test stats
 
 on:
   workflow_run:
-    workflows: [pull, trunk, periodic]
+    workflows: [pull, trunk, periodic, inductor]
     types:
       - completed
 

From 670665ee82ad4e18bcf212e55714dd0ef1858ce5 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@meta.com>
Date: Mon, 7 Nov 2022 21:15:07 +0000
Subject: [PATCH 0630/1922] Publicly expose _LRScheduler to LRScheduler
 (#88503)

Fixes #61232

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88503
Approved by: https://github.com/soulitzer
---
 test/test_optim.py          | 12 ++++++------
 torch/optim/lr_scheduler.py | 38 +++++++++++++++++++++----------------
 torch/optim/swa_utils.py    |  4 ++--
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/test/test_optim.py b/test/test_optim.py
index e611f75b67dee..31c5add46d9d2 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -15,7 +15,7 @@
 from torch import sparse
 from torch.optim.lr_scheduler import LambdaLR, MultiplicativeLR, SequentialLR, StepLR, \
     MultiStepLR, ConstantLR, LinearLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \
-    _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR, ChainedScheduler, PolynomialLR, \
+    LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR, ChainedScheduler, PolynomialLR, \
     EPOCH_DEPRECATION_WARNING
 from torch.optim.swa_utils import AveragedModel, SWALR, update_bn
 from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests, \
@@ -1199,7 +1199,7 @@ def _check_warning_is_epoch_deprecation_warning(self, w, *, num_warnings: int =
             self.assertEqual(warning.message.args[0], EPOCH_DEPRECATION_WARNING)
 
     def test_error_when_getlr_has_epoch(self):
-        class MultiStepLR(torch.optim.lr_scheduler._LRScheduler):
+        class MultiStepLR(torch.optim.lr_scheduler.LRScheduler):
             def __init__(self, optimizer, gamma, milestones, last_epoch=-1):
                 self.init_lr = [group['lr'] for group in optimizer.param_groups]
                 self.gamma = gamma
@@ -2572,7 +2572,7 @@ def _check_scheduler_state_dict(self, constr, constr2, epochs=10):
         self.assertEqual(scheduler.get_last_lr(), scheduler_copy.get_last_lr())
 
     def _test_get_last_lr(self, schedulers, targets, epochs=10):
-        if isinstance(schedulers, _LRScheduler):
+        if isinstance(schedulers, LRScheduler):
             schedulers = [schedulers]
         optimizers = {scheduler.optimizer for scheduler in schedulers}
         for epoch in range(epochs):
@@ -2586,7 +2586,7 @@ def _test_get_last_lr(self, schedulers, targets, epochs=10):
                                      epoch, t, r), atol=1e-5, rtol=0)
 
     def _test_with_epoch(self, schedulers, targets, epochs=10):
-        if isinstance(schedulers, _LRScheduler):
+        if isinstance(schedulers, LRScheduler):
             schedulers = [schedulers]
         optimizers = {scheduler.optimizer for scheduler in schedulers}
         for epoch in range(epochs):
@@ -2600,7 +2600,7 @@ def _test_with_epoch(self, schedulers, targets, epochs=10):
                                      epoch, target[epoch], param_group['lr']), atol=1e-5, rtol=0)
 
     def _test(self, schedulers, targets, epochs=10):
-        if isinstance(schedulers, _LRScheduler):
+        if isinstance(schedulers, LRScheduler):
             schedulers = [schedulers]
         for epoch in range(epochs):
             for param_group, target in zip(self.opt.param_groups, targets):
@@ -2645,7 +2645,7 @@ def _test_against_closed_form(self, scheduler, closed_form_scheduler, epochs=10)
                                      epoch, targets[epoch][i], param_group['lr']), atol=1e-5, rtol=0)
 
     def _test_reduce_lr_on_plateau(self, schedulers, targets, metrics, epochs=10, verbose=False):
-        if isinstance(schedulers, _LRScheduler) or isinstance(schedulers, ReduceLROnPlateau):
+        if isinstance(schedulers, LRScheduler) or isinstance(schedulers, ReduceLROnPlateau):
             schedulers = [schedulers]
         for epoch in range(epochs):
             self.opt.step()
diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py
index 655c08789e0ad..b5abd49c717d8 100644
--- a/torch/optim/lr_scheduler.py
+++ b/torch/optim/lr_scheduler.py
@@ -11,7 +11,7 @@
 
 __all__ = ['LambdaLR', 'MultiplicativeLR', 'StepLR', 'MultiStepLR', 'ConstantLR', 'LinearLR',
            'ExponentialLR', 'SequentialLR', 'CosineAnnealingLR', 'ChainedScheduler', 'ReduceLROnPlateau',
-           'CyclicLR', 'CosineAnnealingWarmRestarts', 'OneCycleLR', 'PolynomialLR']
+           'CyclicLR', 'CosineAnnealingWarmRestarts', 'OneCycleLR', 'PolynomialLR', 'LRScheduler']
 
 EPOCH_DEPRECATION_WARNING = (
     "The epoch parameter in `scheduler.step()` was not necessary and is being "
@@ -22,7 +22,7 @@
     "https://github.com/pytorch/pytorch/issues/new/choose."
 )
 
-class _LRScheduler(object):
+class LRScheduler(object):
 
     def __init__(self, optimizer, last_epoch=-1, verbose=False):
 
@@ -163,6 +163,12 @@ def step(self, epoch=None):
         self._last_lr = [group['lr'] for group in self.optimizer.param_groups]
 
 
+# Including _LRScheduler for backwards compatibility
+# Subclass instead of assign because we want __name__ of _LRScheduler to be _LRScheduler (assigning would make it LRScheduler).
+class _LRScheduler(LRScheduler):
+    pass
+
+
 class _enable_get_lr_call:
 
     def __init__(self, o):
@@ -176,7 +182,7 @@ def __exit__(self, type, value, traceback):
         self.o._get_lr_called_within_step = False
 
 
-class LambdaLR(_LRScheduler):
+class LambdaLR(LRScheduler):
     """Sets the learning rate of each parameter group to the initial lr
     times a given function. When last_epoch=-1, sets initial lr as lr.
 
@@ -262,7 +268,7 @@ def get_lr(self):
                 for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)]
 
 
-class MultiplicativeLR(_LRScheduler):
+class MultiplicativeLR(LRScheduler):
     """Multiply the learning rate of each parameter group by the factor given
     in the specified function. When last_epoch=-1, sets initial lr as lr.
 
@@ -343,7 +349,7 @@ def get_lr(self):
             return [group['lr'] for group in self.optimizer.param_groups]
 
 
-class StepLR(_LRScheduler):
+class StepLR(LRScheduler):
     """Decays the learning rate of each parameter group by gamma every
     step_size epochs. Notice that such decay can happen simultaneously with
     other changes to the learning rate from outside this scheduler. When
@@ -392,7 +398,7 @@ def _get_closed_form_lr(self):
                 for base_lr in self.base_lrs]
 
 
-class MultiStepLR(_LRScheduler):
+class MultiStepLR(LRScheduler):
     """Decays the learning rate of each parameter group by gamma once the
     number of epoch reaches one of the milestones. Notice that such decay can
     happen simultaneously with other changes to the learning rate from outside
@@ -441,7 +447,7 @@ def _get_closed_form_lr(self):
                 for base_lr in self.base_lrs]
 
 
-class ConstantLR(_LRScheduler):
+class ConstantLR(LRScheduler):
     """Decays the learning rate of each parameter group by a small constant factor until the
     number of epoch reaches a pre-defined milestone: total_iters. Notice that such decay can
     happen simultaneously with other changes to the learning rate from outside this scheduler.
@@ -499,7 +505,7 @@ def _get_closed_form_lr(self):
                 for base_lr in self.base_lrs]
 
 
-class LinearLR(_LRScheduler):
+class LinearLR(LRScheduler):
     """Decays the learning rate of each parameter group by linearly changing small
     multiplicative factor until the number of epoch reaches a pre-defined milestone: total_iters.
     Notice that such decay can happen simultaneously with other changes to the learning rate
@@ -567,7 +573,7 @@ def _get_closed_form_lr(self):
                 for base_lr in self.base_lrs]
 
 
-class ExponentialLR(_LRScheduler):
+class ExponentialLR(LRScheduler):
     """Decays the learning rate of each parameter group by gamma every epoch.
     When last_epoch=-1, sets initial lr as lr.
 
@@ -598,7 +604,7 @@ def _get_closed_form_lr(self):
                 for base_lr in self.base_lrs]
 
 
-class SequentialLR(_LRScheduler):
+class SequentialLR(LRScheduler):
     """Receives the list of schedulers that is expected to be called sequentially during
     optimization process and milestone points that provides exact intervals to reflect
     which scheduler is supposed to be called at a given epoch.
@@ -708,7 +714,7 @@ def load_state_dict(self, state_dict):
             self._schedulers[idx].load_state_dict(s)
 
 
-class PolynomialLR(_LRScheduler):
+class PolynomialLR(LRScheduler):
     """Decays the learning rate of each parameter group using a polynomial function
     in the given total_iters. When last_epoch=-1, sets initial lr as lr.
 
@@ -758,7 +764,7 @@ def _get_closed_form_lr(self):
         ]
 
 
-class CosineAnnealingLR(_LRScheduler):
+class CosineAnnealingLR(LRScheduler):
     r"""Set the learning rate of each parameter group using a cosine annealing
     schedule, where :math:`\eta_{max}` is set to the initial lr and
     :math:`T_{cur}` is the number of epochs since the last restart in SGDR:
@@ -831,7 +837,7 @@ def _get_closed_form_lr(self):
                 for base_lr in self.base_lrs]
 
 
-class ChainedScheduler(_LRScheduler):
+class ChainedScheduler(LRScheduler):
     """Chains list of learning rate schedulers. It takes a list of chainable learning
     rate schedulers and performs consecutive step() functions belonging to them by just
     one call.
@@ -1078,7 +1084,7 @@ def load_state_dict(self, state_dict):
         self._init_is_better(mode=self.mode, threshold=self.threshold, threshold_mode=self.threshold_mode)
 
 
-class CyclicLR(_LRScheduler):
+class CyclicLR(LRScheduler):
     r"""Sets the learning rate of each parameter group according to
     cyclical learning rate policy (CLR). The policy cycles the learning
     rate between two boundaries with a constant frequency, as detailed in
@@ -1314,7 +1320,7 @@ def get_lr(self):
         return lrs
 
 
-class CosineAnnealingWarmRestarts(_LRScheduler):
+class CosineAnnealingWarmRestarts(LRScheduler):
     r"""Set the learning rate of each parameter group using a cosine annealing
     schedule, where :math:`\eta_{max}` is set to the initial lr, :math:`T_{cur}`
     is the number of epochs since the last restart and :math:`T_{i}` is the number
@@ -1437,7 +1443,7 @@ def __exit__(self, type, value, traceback):
         self._last_lr = [group['lr'] for group in self.optimizer.param_groups]
 
 
-class OneCycleLR(_LRScheduler):
+class OneCycleLR(LRScheduler):
     r"""Sets the learning rate of each parameter group according to the
     1cycle learning rate policy. The 1cycle policy anneals the learning
     rate from an initial learning rate to some maximum learning rate and then
diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py
index f7a530f5ad0f1..9a1e88d89426f 100644
--- a/torch/optim/swa_utils.py
+++ b/torch/optim/swa_utils.py
@@ -5,7 +5,7 @@
 
 import torch
 from torch.nn import Module
-from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim.lr_scheduler import LRScheduler
 
 __all__ = ['AveragedModel', 'update_bn', 'SWALR']
 
@@ -196,7 +196,7 @@ def update_bn(loader, model, device=None):
     model.train(was_training)
 
 
-class SWALR(_LRScheduler):
+class SWALR(LRScheduler):
     r"""Anneals the learning rate in each parameter group to a fixed value.
 
     This learning rate scheduler is meant to be used with Stochastic Weight

From 723f52af289e4a611ca51a16f31a584ab90f67a7 Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Mon, 7 Nov 2022 21:25:55 +0000
Subject: [PATCH 0631/1922] fallback for scatter_(scalar) (#88210)

`scatter_reduce_` overloads can only accept `Tensor src`.
`scatter_`, on the other hand, can accept `Number src`. Switching a fallback from `scatter_reduce_` to `scatter_`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88210
Approved by: https://github.com/desertfire
---
 torch/_inductor/lowering.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 71f038b231259..1cf9fc660acf6 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -1907,12 +1907,22 @@ def scatter(x, dim: int, index, src, **kwargs):
 
 @register_lowering(aten.scatter_, type_promotion_kind=None)
 def scatter_(self, dim: int, index, src, *, reduce: str = None):
+
+    # TODO: Need to support more reduction type
+    # For reduction of "sum", tl.atomic_add doesn't support bool or int64
+    if reduce not in {None, "add"} or (
+        reduce == "add" and self.get_dtype() in {torch.bool, torch.int64}
+    ):
+        self.realize()
+        return fallback_scatter_(self, dim, index, src, reduce=reduce)
+
     if reduce == "add":
         reduce = "sum"
     elif reduce == "multiply":
         reduce = "prod"
     else:
         assert reduce is None
+
     return scatter_reduce_(self, dim, index, src, reduce)
 
 
@@ -1931,23 +1941,13 @@ def scatter_reduce(x, dim: int, index, src, reduction_type, **kwargs):
     return scatter_reduce_(clone(x), dim, index, src, reduction_type, **kwargs)
 
 
-fallback_scatter_reduce_ = fallback_handler(aten.scatter_reduce_)
+fallback_scatter_ = fallback_handler(aten.scatter_)
 
 
 @register_lowering(aten.scatter_reduce_, type_promotion_kind=None)
 def scatter_reduce_(self, dim: int, index, src, reduce, *, include_self: bool = True):
     assert reduce in {None, "sum", "prod", "mean", "amax", "amin"}
 
-    # TODO: Need to support more reduction type
-    # For reduction of "sum", tl.atomic_add doesn't support bool or int64
-    if reduce not in {None, "sum"} or (
-        reduce == "sum" and self.get_dtype() in {torch.bool, torch.int64}
-    ):
-        self.realize()
-        return fallback_scatter_reduce_(
-            self, dim, index, src, reduce, include_self=include_self
-        )
-
     assert isinstance(self, TensorBox)
     assert "int" in str(index.get_dtype())
 

From b9b877dfe6a9b07cf4363a00a46288ccadffa278 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 7 Nov 2022 08:51:15 -0800
Subject: [PATCH 0632/1922] SymIntify _copy functionalization kernels (and
 _copy_out too) (#88572)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88572
Approved by: https://github.com/anjali411, https://github.com/bdhirsh
---
 .github/ci_commit_pins/xla.txt             |  2 +-
 aten/src/ATen/native/native_functions.yaml | 20 ++---
 aten/src/ATen/test/math_kernel_test.cpp    | 10 ---
 test/functorch/test_aotdispatch.py         |  1 -
 torch/_subclasses/fake_tensor.py           | 14 ++--
 torchgen/gen.py                            | 17 ++++-
 torchgen/gen_functionalization_type.py     | 87 ++++++++++++----------
 7 files changed, 79 insertions(+), 72 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index ea8cc0b128f6d..d6866ce9977c2 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-5ff192eac058d54cc5f22c8ddf9cf6acfa51325d
+48365abee394c325a3d17c5e234c1b36b878dea3
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index d915ee1015700..6b1deaffb47b4 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -3770,7 +3770,7 @@
   dispatch:
     CPU: narrow_copy_dense_cpu
     SparseCPU, SparseCUDA: narrow_copy_sparse
-    CompositeExplicitAutogradNonFunctional: narrow_copy_dense
+    CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
   tags: view_copy
 
 - func: narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)
@@ -7020,7 +7020,7 @@
 - func: lift_fresh_copy(Tensor self) -> Tensor
   tags: view_copy
   dispatch:
-    CompositeExplicitAutograd: lift_fresh_copy
+    CompositeExplicitAutogradNonFunctional: lift_fresh_copy
   autogen: lift_fresh_copy.out
 
 - func: is_set_to(Tensor self, Tensor tensor) -> bool
@@ -12939,7 +12939,7 @@
 - func: as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutogradNonFunctional: as_strided_copy
+    CompositeExplicitAutogradNonFunctional: as_strided_copy_symint
   tags: view_copy
 
 - func: _sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor
@@ -12957,7 +12957,7 @@
 - func: expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutogradNonFunctional: expand_copy
+    CompositeExplicitAutogradNonFunctional: expand_copy_symint
   tags: view_copy
 
 - func: permute_copy(Tensor self, int[] dims) -> Tensor
@@ -12969,7 +12969,7 @@
 - func: _reshape_alias_copy(Tensor self, SymInt[] size, SymInt[] stride) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutogradNonFunctional: _reshape_alias_copy
+    CompositeExplicitAutogradNonFunctional: _reshape_alias_copy_symint
   tags: view_copy
 
 - func: select_copy.int(Tensor self, int dim, int index) -> Tensor
@@ -12987,19 +12987,19 @@
 - func: slice_copy.Tensor(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutogradNonFunctional: slice_copy_Tensor
+    CompositeExplicitAutogradNonFunctional: slice_copy_Tensor_symint
   tags: view_copy
 
 - func: split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
   variants: function
   dispatch:
-    CompositeExplicitAutogradNonFunctional: split_copy_Tensor
+    CompositeExplicitAutogradNonFunctional: split_copy_Tensor_symint
   tags: view_copy
 
 - func: split_with_sizes_copy(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
   variants: function
   dispatch:
-    CompositeExplicitAutogradNonFunctional: split_with_sizes_copy
+    CompositeExplicitAutogradNonFunctional: split_with_sizes_copy_symint
   tags: view_copy
 
 - func: squeeze_copy(Tensor self) -> Tensor
@@ -13071,14 +13071,14 @@
 - func: ccol_indices_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutograd: ccol_indices_copy
+    CompositeExplicitAutogradNonFunctional: ccol_indices_copy
   tags: view_copy
   autogen: ccol_indices_copy.out
 
 - func: row_indices_copy(Tensor self) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutograd: row_indices_copy
+    CompositeExplicitAutogradNonFunctional: row_indices_copy
   tags: view_copy
   autogen: row_indices_copy.out
 
diff --git a/aten/src/ATen/test/math_kernel_test.cpp b/aten/src/ATen/test/math_kernel_test.cpp
index 15ce0af4001d5..8875e72a6af9b 100644
--- a/aten/src/ATen/test/math_kernel_test.cpp
+++ b/aten/src/ATen/test/math_kernel_test.cpp
@@ -114,16 +114,6 @@ TEST(MathKernelTest, MishBackward) {
   ASSERT_ALLCLOSE_TOLERANCES(out, math_out, 1e-4, 1e-6);
 }
 
-TEST(MathKernelTest, NarrowCopy)  {
-  auto x = rand({5, 8, 7});
-  for (const auto dim : c10::irange(3)) {
-    const int64_t start = 1, length = 4;
-    auto y_ref = x.narrow(dim, start, length);
-    auto y_test = at::native::narrow_copy_dense(x, dim, start, length);
-    ASSERT_ALLCLOSE_TOLERANCES(y_ref, y_test, 0, 0);
-  }
-}
-
 TEST(MathKernelTest, Bmm)  {
   auto test_bmm = [](int64_t last_dim) {
     auto x = rand({1, 4, 4}, at::kFloat);
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 909e57bc1e083..20bb64176b8d9 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1182,7 +1182,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.rrelu', ''),  # aten.rrelu_with_noise.default - couldn't find symbolic meta function...
     xfail('nn.functional.smooth_l1_loss', ''),  # could not find kernel
     xfail('nn.functional.unfold', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('unfold', ''),  # aten.squeeze_copy.dim - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.upsample_bilinear', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.upsample_nearest', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 5ed23f8020fd0..c1cb65160cacd 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -830,10 +830,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                 return r
 
         # IDK: feels bad man, sym_numel on as_strided infinite loops otherwise
-        if (
-            has_symbolic_sizes
-            and func not in self.functions_with_cpp_meta_impl_that_support_symint
-        ):
+        if has_symbolic_sizes and not self.cpp_meta_supports_symint(func):
             from torch._decomp import meta_table as meta_table
 
             if func == aten.size.default:
@@ -873,7 +870,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                 return func.prim_meta_impl(*args, **kwargs)
 
         if has_symbolic_sizes:
-            if func not in self.functions_with_cpp_meta_impl_that_support_symint:
+            if not self.cpp_meta_supports_symint(func):
                 raise RuntimeError(
                     f"{func} - couldn't find symbolic meta function/decomposition"
                 )
@@ -964,9 +961,10 @@ def wrap(e, device=None):
 
         return wrap
 
-    @property
-    def functions_with_cpp_meta_impl_that_support_symint(self):
-        return [
+    def cpp_meta_supports_symint(self, func):
+        if torch.Tag.view_copy in func.tags:  # type: ignore[attr-defined]
+            return True
+        return func in [
             aten.empty_strided.default,
             aten.as_strided_scatter.default,
             aten.as_strided.default,
diff --git a/torchgen/gen.py b/torchgen/gen.py
index db207169d4d48..7552451a5135c 100644
--- a/torchgen/gen.py
+++ b/torchgen/gen.py
@@ -34,10 +34,10 @@
     with_native_function_and_indices,
 )
 from torchgen.gen_functionalization_type import (
-    gen_composite_view_copy_kernel,
     gen_functionalization_definition,
     gen_functionalization_registration,
     gen_functionalization_view_inverse_declaration,
+    GenCompositeViewCopyKernel,
 )
 from torchgen.gen_vmap_plumbing import gen_all_vmap_plumbing
 
@@ -2502,7 +2502,11 @@ def gen_op_headers(
         lambda: {
             "ops_headers": [
                 "\n".join(
-                    f"#include <ATen/ops/{f.root_name}_ops.h>"
+                    f"#include <ATen/ops/{f.root_name}_ops.h>\n"
+                    # NB: this include is important as it ensures we
+                    # set the visibility on generated view_copy kernels
+                    # correctly
+                    f"#include <ATen/ops/{f.root_name}_native.h>"
                     for f in (
                         [g.view] if g.view_copy is None else [g.view, g.view_copy]
                     )
@@ -2518,7 +2522,14 @@ def gen_op_headers(
                 for g in structured_native_functions
             ],
             "CompositeViewCopyKernel_Definitions": list(
-                mapMaybe(gen_composite_view_copy_kernel, view_groups)
+                mapMaybe(
+                    GenCompositeViewCopyKernel(
+                        backend_indices[
+                            DispatchKey.CompositeExplicitAutogradNonFunctional
+                        ]
+                    ),
+                    view_groups,
+                )
             ),
             "GeneratedCompositeFunctional_Definitions": list(
                 mapMaybe(
diff --git a/torchgen/gen_functionalization_type.py b/torchgen/gen_functionalization_type.py
index a27b4f327b2ac..33b4e4d86bb90 100644
--- a/torchgen/gen_functionalization_type.py
+++ b/torchgen/gen_functionalization_type.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from typing import Callable, List, Optional, Tuple, Union
 
 from torchgen.api import cpp, dispatcher
@@ -16,6 +17,7 @@
     ViewInverseSignature,
 )
 from torchgen.context import (
+    method_with_native_function,
     native_function_manager,
     with_native_function,
     with_native_function_and,
@@ -73,18 +75,25 @@
 
 # Generates the body of the default composite C++ kernel for a {view}_copy NativeFunction
 # See Note [view_copy NativeFunctions]
-@with_native_function
-def gen_composite_view_copy_kernel(g: NativeFunctionsViewGroup) -> Optional[str]:
-
-    if g.view_copy is None:
-        return None
-
-    # We can make view_copy work in more cases by using reshape()
-    # when a normal view call would ordinarily fail.
-    # This also makes LTC more efficient, because they don't need to include
-    # clone() calls in their graph (which is normally needed by reshape).
-    if str(g.view_copy.func.name) == "view_copy":
-        return """\
+@dataclass(frozen=True)
+class GenCompositeViewCopyKernel:
+    backend_index: BackendIndex
+
+    @method_with_native_function
+    def __call__(self, g: NativeFunctionsViewGroup) -> Optional[str]:
+        if g.view_copy is None:
+            return None
+
+        metadata = self.backend_index.get_kernel(g.view_copy)
+        assert metadata is not None
+
+        # We can make view_copy work in more cases by using reshape()
+        # when a normal view call would ordinarily fail.
+        # This also makes LTC more efficient, because they don't need to include
+        # clone() calls in their graph (which is normally needed by reshape).
+        if str(g.view_copy.func.name) == "view_copy":
+            assert metadata.kernel == "view_copy_symint"
+            return """\
 at::Tensor view_copy_symint(const at::Tensor & self, at::SymIntArrayRef size) {
   c10::SymDimVector shape = infer_size_dv(size, self.sym_numel());
   if (!at::detail::computeStride(self.sym_sizes(), self.sym_strides(), shape).has_value()) {
@@ -95,42 +104,42 @@ def gen_composite_view_copy_kernel(g: NativeFunctionsViewGroup) -> Optional[str]
   }
 }
 """
-    # view_copy is a native signature, since we're generating an at::native:: kernel
-    # Functionalization always operates on symints though
-    view_copy_sig = NativeSignature(
-        g.view_copy.func, symint=False
-    )  # TODO: flag day this True
-
-    # view is a dispatcher signature, since we're calling into the at::_ops API
-    view_sig = DispatcherSignature(g.view.func)
-
-    view_api_name = g.view.func.name.unambiguous_name()
-    exprs = ", ".join(
-        [e.expr for e in translate(view_copy_sig.arguments(), view_sig.arguments())]
-    )
+        # view_copy is a native signature, since we're generating an at::native:: kernel
+        # Functionalization always operates on symints though
+        view_copy_sig = NativeSignature(
+            g.view_copy.func, symint=metadata.supports_symint()
+        )
 
-    # view ops today always return either a Tensor or a list of Tensors
-    assert len(g.view.func.returns) == 1
-    assert g.view.func.returns[0].type == BaseType(
-        BaseTy.Tensor
-    ) or g.view.func.returns[0].type == ListType(BaseType(BaseTy.Tensor), None)
+        # view is a dispatcher signature, since we're calling into the at::_ops API
+        view_sig = DispatcherSignature(g.view.func)
 
-    if g.view.func.returns[0].type == BaseType(BaseTy.Tensor):
-        return_cloned_output = """\
+        view_api_name = g.view.func.name.unambiguous_name()
+        exprs = ", ".join(
+            [e.expr for e in translate(view_copy_sig.arguments(), view_sig.arguments())]
+        )
+
+        # view ops today always return either a Tensor or a list of Tensors
+        assert len(g.view.func.returns) == 1
+        assert g.view.func.returns[0].type == BaseType(
+            BaseTy.Tensor
+        ) or g.view.func.returns[0].type == ListType(BaseType(BaseTy.Tensor), None)
+
+        if g.view.func.returns[0].type == BaseType(BaseTy.Tensor):
+            return_cloned_output = """\
   return output.clone(/*memory_format=*/at::MemoryFormat::Contiguous);"""
-    else:
-        # If the return type is a list, we need to clone each tensor in the list.
-        return_cloned_output = f"""\
+        else:
+            # If the return type is a list, we need to clone each tensor in the list.
+            return_cloned_output = f"""\
   {view_copy_sig.returns_type().cpp_type()} out_clone;
   for (const auto i : c10::irange(output.size())) {{
     out_clone.push_back(output[i].clone(/*memory_format=*/at::MemoryFormat::Contiguous));
   }}
   return out_clone;"""
 
-    # The default generated composite kernel for {view}_copy() operators just clones
-    # the input tensor, and runs the underlying view on the clone.
-    return f"""
-{view_copy_sig.defn()} {{
+        # The default generated composite kernel for {view}_copy() operators just clones
+        # the input tensor, and runs the underlying view on the clone.
+        return f"""
+{view_copy_sig.defn(name=metadata.kernel)} {{
   auto output = at::_ops::{view_api_name}::call({exprs});
   {return_cloned_output}
 }}

From f21b4097d77481b91a5ba3d88b63182c7e05fda1 Mon Sep 17 00:00:00 2001
From: Vitaly Fedyunin <vitaly.fedyunin@gmail.com>
Date: Mon, 7 Nov 2022 10:30:55 -0500
Subject: [PATCH 0633/1922] [DataPipes] Add group support to the
 sharding_filter (#88424)

Differential Revision: [D41006747](https://our.internmc.facebook.com/intern/diff/D41006747)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88424
Approved by: https://github.com/ejguan
---
 test/test_datapipe.py                       | 38 +++++++++++++++++
 torch/utils/data/dataloader.py              | 12 +++---
 torch/utils/data/datapipes/iter/grouping.py | 45 ++++++++++++++++++---
 torch/utils/data/graph_settings.py          |  8 +++-
 4 files changed, 91 insertions(+), 12 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index 49d2ba1ee79cb..dbc5a5ae8071f 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -54,6 +54,7 @@
 )
 from torch.utils.data.datapipes.dataframe import CaptureDataFrame
 from torch.utils.data.datapipes.dataframe import dataframe_wrapper as df_wrapper
+from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES
 
 try:
     import dill
@@ -2352,6 +2353,9 @@ def __iter__(self):
         for i in range(self.size):
             yield i
 
+    def __len__(self):
+        return self.size
+
 
 class TestGraph(TestCase):
     class CustomIterDataPipe(IterDataPipe):
@@ -2663,6 +2667,40 @@ def test_simple_sharding(self):
             items += list(sharded_dp)
         self.assertEqual(sorted(all_items), sorted(items))
 
+    def test_sharding_groups(self):
+        def construct_sharded_pipe():
+            sharding_pipes = []
+            dp = NumbersDataset(size=90)
+            dp = dp.sharding_filter(sharding_group_filter=SHARDING_PRIORITIES.DISTRIBUTED)
+            sharding_pipes.append(dp)
+            dp = dp.sharding_filter(sharding_group_filter=SHARDING_PRIORITIES.MULTIPROCESSING)
+            sharding_pipes.append(dp)
+            dp = dp.sharding_filter(sharding_group_filter=300)
+            sharding_pipes.append(dp)
+            return dp, sharding_pipes
+
+        dp, sharding_pipes = construct_sharded_pipe()
+
+        for pipe in sharding_pipes:
+            pipe.apply_sharding(2, 1, sharding_group=SHARDING_PRIORITIES.DISTRIBUTED)
+            pipe.apply_sharding(5, 3, sharding_group=SHARDING_PRIORITIES.MULTIPROCESSING)
+            pipe.apply_sharding(3, 1, sharding_group=300)
+
+        actual = list(dp)
+        expected = [17, 47, 77]
+        self.assertEquals(expected, actual)
+        self.assertEquals(3, len(dp))
+
+        dp, _ = construct_sharded_pipe()
+        dp.apply_sharding(2, 1, sharding_group=SHARDING_PRIORITIES.DEFAULT)
+        with self.assertRaises(Exception):
+            dp.apply_sharding(5, 3, sharding_group=SHARDING_PRIORITIES.MULTIPROCESSING)
+
+        dp, _ = construct_sharded_pipe()
+        dp.apply_sharding(5, 3, sharding_group=SHARDING_PRIORITIES.MULTIPROCESSING)
+        with self.assertRaises(Exception):
+            dp.apply_sharding(2, 1, sharding_group=SHARDING_PRIORITIES.DEFAULT)
+
     def test_sharding_length(self):
         numbers_dp = dp.iter.IterableWrapper(range(13))
         sharded_dp0 = numbers_dp.sharding_filter()
diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py
index 4b05dac1af945..4c141eccc3be7 100644
--- a/torch/utils/data/dataloader.py
+++ b/torch/utils/data/dataloader.py
@@ -35,6 +35,7 @@
     Dataset,)
 
 from torch.utils.data.datapipes.datapipe import _IterDataPipeSerializationWrapper, _MapDataPipeSerializationWrapper
+from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES
 
 from . import _utils
 
@@ -104,7 +105,6 @@ def _get_distributed_settings():
 
 
 def _sharding_worker_init_fn(worker_init_fn, world_size, rank_id, worker_id):
-    global_worker_id = worker_id
     info = torch.utils.data.get_worker_info()
     assert info is not None
     total_workers = info.num_workers
@@ -112,9 +112,10 @@ def _sharding_worker_init_fn(worker_init_fn, world_size, rank_id, worker_id):
     assert isinstance(datapipe, (IterDataPipe, MapDataPipe))
     # To distribute elements across distributed process evenly, we should shard data on distributed
     # processes first then shard on worker processes
-    total_workers *= world_size
-    global_worker_id = global_worker_id * world_size + rank_id
-    torch.utils.data.graph_settings.apply_sharding(datapipe, total_workers, global_worker_id)
+    torch.utils.data.graph_settings.apply_sharding(
+        datapipe, world_size, rank_id, sharding_group=SHARDING_PRIORITIES.DISTRIBUTED)
+    torch.utils.data.graph_settings.apply_sharding(
+        datapipe, total_workers, worker_id, sharding_group=SHARDING_PRIORITIES.MULTIPROCESSING)
     if worker_init_fn is not None:
         worker_init_fn(worker_id)
 
@@ -663,7 +664,8 @@ def __init__(self, loader):
         # Adds forward compatibilities so classic DataLoader can work with DataPipes:
         #   Taking care of distributed sharding
         if isinstance(self._dataset, (IterDataPipe, MapDataPipe)):
-            torch.utils.data.graph_settings.apply_sharding(self._dataset, self._world_size, self._rank)
+            torch.utils.data.graph_settings.apply_sharding(
+                self._dataset, self._world_size, self._rank, sharding_group=SHARDING_PRIORITIES.DISTRIBUTED)
 
         self._dataset_fetcher = _DatasetKind.create_fetcher(
             self._dataset_kind, self._dataset, self._auto_collation, self._collate_fn, self._drop_last)
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index d7ad673991c90..58d1509f3ee9c 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -1,20 +1,28 @@
 from collections import defaultdict
+from enum import IntEnum
 
 from torch.utils.data.datapipes._decorator import functional_datapipe
 from torch.utils.data.datapipes.datapipe import IterDataPipe, DataChunk
 from torch.utils.data.datapipes.utils.common import _check_unpickable_fn
-from typing import Any, Callable, DefaultDict, Iterator, List, Optional, Sized, TypeVar
+from typing import Any, Callable, DefaultDict, Dict, Iterator, List, Optional, Sized, Tuple, TypeVar
 
 __all__ = [
     "BatcherIterDataPipe",
     "GrouperIterDataPipe",
     "ShardingFilterIterDataPipe",
+    "SHARDING_PRIORITIES",
     "UnBatcherIterDataPipe",
 ]
 
 T_co = TypeVar('T_co', covariant=True)
 
 
+class SHARDING_PRIORITIES(IntEnum):
+    DEFAULT = 1
+    DISTRIBUTED = 2
+    MULTIPROCESSING = 3
+
+
 @functional_datapipe('sharding_filter')
 class ShardingFilterIterDataPipe(IterDataPipe):
     r"""
@@ -25,17 +33,44 @@ class ShardingFilterIterDataPipe(IterDataPipe):
     Args:
         source_datapipe: Iterable DataPipe that will be sharded
     """
-    def __init__(self, source_datapipe: IterDataPipe):
+
+    def __init__(self, source_datapipe: IterDataPipe, sharding_group_filter=None):
         self.source_datapipe = source_datapipe
+        self.sharding_group_filter = sharding_group_filter
+        self.groups: Dict[int, Tuple[int, int]] = {}
         self.num_of_instances = 1
         self.instance_id = 0
+        self._update_num_of_instances()
 
     def is_shardable(self):
         return True
 
-    def apply_sharding(self, num_of_instances, instance_id):
-        self.num_of_instances = num_of_instances
-        self.instance_id = instance_id
+    def apply_sharding(self, num_of_instances, instance_id, sharding_group=SHARDING_PRIORITIES.DEFAULT):
+        if instance_id >= num_of_instances:
+            raise ValueError(f"instance_id({instance_id}) should be smaller than num_of_instances({num_of_instances})")
+        if sharding_group == SHARDING_PRIORITIES.DEFAULT:
+            if len(self.groups) and SHARDING_PRIORITIES.DEFAULT not in self.groups:
+                raise Exception('ShardingFilter cannot mix DEFAULT and non DEFAULT groups')
+        else:
+            if SHARDING_PRIORITIES.DEFAULT in self.groups:
+                raise Exception('ShardingFilter cannot mix DEFAULT and non DEFAULT groups')
+        self.groups[sharding_group] = (num_of_instances, instance_id)
+        self._update_num_of_instances()
+
+    def _update_num_of_instances(self):
+        sorted_sharding_groups = []
+        for key in sorted(self.groups.keys()):
+            if self.sharding_group_filter is None or key == self.sharding_group_filter:
+                sorted_sharding_groups.append(self.groups[key])
+
+        sorted_sharding_groups.reverse()
+
+        self.num_of_instances = 1
+        self.instance_id = 0
+
+        for group_num_of_instances, group_instance_id in sorted_sharding_groups:
+            self.instance_id += self.num_of_instances * group_instance_id
+            self.num_of_instances *= group_num_of_instances
 
     def __iter__(self):
         for i, item in enumerate(self.source_datapipe):
diff --git a/torch/utils/data/graph_settings.py b/torch/utils/data/graph_settings.py
index bb89fe5c6e08a..618e666968982 100644
--- a/torch/utils/data/graph_settings.py
+++ b/torch/utils/data/graph_settings.py
@@ -6,6 +6,7 @@
 import torch
 
 from torch.utils.data.graph import DataPipe, DataPipeGraph, traverse_dps
+from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES
 
 __all__ = [
     "apply_random_seed",
@@ -31,7 +32,10 @@ def _get_all_graph_pipes_helper(graph: DataPipeGraph, id_cache: Set[int]) -> Lis
     return results
 
 
-def apply_sharding(datapipe: DataPipe, num_of_instances: int, instance_id: int) -> DataPipe:
+def apply_sharding(datapipe: DataPipe,
+                   num_of_instances: int,
+                   instance_id: int,
+                   sharding_group=SHARDING_PRIORITIES.DEFAULT) -> DataPipe:
     graph = traverse_dps(datapipe)
     all_pipes = get_all_graph_pipes(graph)
     already_applied_to = None
@@ -42,7 +46,7 @@ def apply_sharding(datapipe: DataPipe, num_of_instances: int, instance_id: int)
                     if already_applied_to is not None:
                         raise RuntimeError('This implementation of sharding can be only applied once per instance of DataPipeline.',
                                            'Already applied to', already_applied_to, 'while trying to apply to', pipe)
-                    pipe.apply_sharding(num_of_instances, instance_id)
+                    pipe.apply_sharding(num_of_instances, instance_id, sharding_group=sharding_group)
                     already_applied_to = pipe
     return datapipe
 

From a1cd2b36acb6f44503c6958983f16b05ec795060 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Mon, 7 Nov 2022 19:21:24 +0000
Subject: [PATCH 0634/1922] Generalize gesvdjBatched to run whith
 full_matrices==false (#88502)

As brought up in https://github.com/pytorch/pytorch/issues/86234#issuecomment-1268296036, our heuristic for which SVD backend to choose was not great in some cases.
The case in which there could be some improvements is when we have a
large batch of very small non-square matrices.

This PR, adapts the calling code to gesvdj by creating two temporary
square buffers to allow to call gesvdjBatched, and then copies back the
result into the output buffers.

We then modify the heuristic that chooses between gesvdj and
gesvdjBatched.

Fixes https://github.com/pytorch/pytorch/issues/86234
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88502
Approved by: https://github.com/IvanYashchuk, https://github.com/nikitaved, https://github.com/mruberry, https://github.com/xwang233
---
 .../cuda/linalg/BatchLinearAlgebraLib.cpp     | 58 ++++++++++++++-----
 test/functorch/test_vmap.py                   |  2 -
 test/test_linalg.py                           | 24 +-------
 3 files changed, 46 insertions(+), 38 deletions(-)

diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
index 01788e0bdffee..89c1246a32d14 100644
--- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
+++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
@@ -656,23 +656,21 @@ inline static void apply_svd_cusolver_gesvdjBatched(const Tensor& A, const Tenso
   using value_t = typename c10::scalar_value_type<scalar_t>::type;
   int m = cuda_int_cast(A.size(-2), "m");
   int n = cuda_int_cast(A.size(-1), "n");
-  int k = std::min(m, n);
   int batchsize = cuda_int_cast(batchCount(A), "batch size");
+  int lda = A.stride(-1);
+  int ldu = compute_uv ? U.stride(-1) : m;
+  int ldv = compute_uv ? V.stride(-1) : n;
 
   // Need to pass allocated memory to the function, otherwise it fails
   auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
-  auto dataPtr_U = !compute_uv ? allocator.allocate(sizeof(scalar_t) * batchsize * m * k) : c10::DataPtr{};
-  auto dataPtr_V = !compute_uv ? allocator.allocate(sizeof(scalar_t) * batchsize * n * k) : c10::DataPtr{};
+  auto dataPtr_U = !compute_uv ? allocator.allocate(sizeof(scalar_t) * batchsize * m * ldu) : c10::DataPtr{};
+  auto dataPtr_V = !compute_uv ? allocator.allocate(sizeof(scalar_t) * batchsize * n * ldv) : c10::DataPtr{};
 
   auto A_data = A.data_ptr<scalar_t>();
   auto U_data = compute_uv ? U.data_ptr<scalar_t>() : reinterpret_cast<scalar_t*>(dataPtr_U.get());
   auto S_data = S.data_ptr<value_t>();
   auto V_data = compute_uv ? V.data_ptr<scalar_t>() : reinterpret_cast<scalar_t*>(dataPtr_V.get());
 
-  int lda = A.stride(-1);
-  int ldu = compute_uv ? U.stride(-1) : m;
-  int ldv = compute_uv ? V.stride(-1) : n;
-
   TORCH_INTERNAL_ASSERT(m <= 32 && n <= 32, "gesvdjBatched requires both matrix dimensions not greater than 32, but got "
                         "m = ", m, " n = ", n);
 
@@ -695,10 +693,42 @@ inline static void apply_svd_cusolver_gesvdjBatched(const Tensor& A, const Tenso
   TORCH_CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params));
 }
 
-inline static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool compute_uv) {
+inline static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) {
+  auto m = A.size(-2);
+  auto n = A.size(-1);
+  auto k = std::min(m, n);
+  // The kernel assumes full_matrices == true
+  // If full_matrices == false and m != n, we create auxiliary tensors of the right size and copy the results back
+  auto U_ = U;
+  auto V_ = V;
+  if (compute_uv && !full_matrices) {
+    auto sizes = A.sizes().vec();
+    if (m > n) {
+      // Size of U with full_matrices == True
+      sizes.end()[-1] = m;
+      // U, V should be a batch of Fortran contiguous arrays
+      U_ = U.new_empty(sizes).mT();
+    } else if (m < n) {
+      // Size of V with full_matrices == True
+      sizes.end()[-2] = n;
+      V_ = V.new_empty(sizes).mT();
+    }
+  }
+  // Here U_ and V_ are batches of F-contig square matrices
+
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "svd_cuda_gesvdjBatched", [&] {
-    apply_svd_cusolver_gesvdjBatched<scalar_t>(A, U, S, V, infos, compute_uv);
+    apply_svd_cusolver_gesvdjBatched<scalar_t>(A, U_, S, V_, infos, compute_uv);
   });
+
+  // Copy the result back if we created any new matrix
+  if (compute_uv && !full_matrices) {
+    if (!U_.is_alias_of(U)) {
+      U.copy_(U_.narrow(-1, 0, k));
+    }
+    if (!V_.is_alias_of(V)) {
+      V.copy_(V_.narrow(-1, 0, k));
+    }
+  }
 }
 
 template<typename scalar_t>
@@ -832,21 +862,23 @@ void svd_cusolver(const Tensor& A,
                   const Tensor& V,
                   const Tensor& info) {
   // Here U and V are F-contig whenever they are defined (i.e. whenever compute_uv=true)
-  const auto batch_size = batchCount(A);
   const auto m = A.size(-2);
   const auto n = A.size(-1);
   const auto k = std::min(m, n);
 
   static const char* check_svd_doc = "Check doc at https://pytorch.org/docs/stable/generated/torch.linalg.svd.html";
 
-  // The default heuristic is to use gesvdj driver
+  // The default heuristic is to use the gesvdj driver
   const auto driver_v = driver.value_or("gesvdj");
 
   if (driver_v == "gesvd") {
     svd_cusolver_gesvd(A, U, S, V, info, full_matrices, compute_uv);
   } else if (driver_v == "gesvdj") {
-    if (m <= 32 && n <= 32 && batch_size > 1 && (full_matrices || m == n)) {
-      svd_cusolver_gesvdjBatched(cloneBatchedColumnMajor(A), U, S, V, info, compute_uv);
+    // See the benchmarks in
+    // https://github.com/pytorch/pytorch/pull/88502#issuecomment-1303860789
+    // The m <= 32 && n <= 32 restrictions come from the limitations of the cusolver backend. See the cusolver docs
+    if (m <= 32 && n <= 32) {
+      svd_cusolver_gesvdjBatched(cloneBatchedColumnMajor(A), U, S, V, info, full_matrices, compute_uv);
     } else {
       // gesvdj driver may be numerically unstable for large sized matrix
       svd_cusolver_gesvdj(cloneBatchedColumnMajor(A), U, S, V, info, full_matrices, compute_uv);
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index e8863781ad306..3acab4172fce1 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3236,8 +3236,6 @@ def test():
         xfail('broadcast_shapes', ''),  # test runner can't handle non-Tensor ops
         xfail('sparse.sampled_addmm'),  # sparse
         xfail('cross'),  # The default value of dim in op is *very* weird. No wonder it doesn't work
-        xfail('svd', device_type='cuda'),  # not unique, see test_linalg_svd for manual test
-        xfail('linalg.svd', device_type='cuda'),  # not unique, see test_linalg_svd for manual test
         skip('linalg.eigh', ''),  # not unique, see test_linalg_eigh for manual test
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         # ----------------------------------------------------------------------
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 86790677f56a4..273c74d4e6146 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -1540,7 +1540,7 @@ def run_error_test_case(input, ord, dim, keepdim, error_type, error_regex):
     @skipCUDAIfNoMagma
     @skipCPUIfNoLapack
     @dtypes(torch.cfloat, torch.cdouble)
-    @precisionOverride({torch.cfloat: 2e-4})
+    @precisionOverride({torch.cfloat: 5e-4})
     def test_norm_complex(self, device, dtype):
         def gen_error_message(input_size, ord, keepdim, dim=None):
             return "complex norm failed for input size %s, ord=%s, keepdim=%s, dim=%s" % (
@@ -2476,28 +2476,6 @@ def test_svd_memory_allocation(self, device, dtype):
         result = torch.linalg.svd(a, full_matrices=False)
         self.assertEqual(result.S, S)
 
-    # This test doesn't work with MAGMA backend https://github.com/pytorch/pytorch/issues/72106
-    @skipMeta
-    @skipCUDAIfRocm
-    @skipCUDAIfNoCusolver
-    @skipCPUIfNoLapack
-    @dtypes(*floating_and_complex_types())
-    def test_svd_nan_error(self, device, dtype):
-        for svd in [torch.svd, torch.linalg.svd]:
-            # if input contains NaN then an error is triggered for svd
-            # When cuda < 11.5, cusolver raises CUSOLVER_STATUS_EXECUTION_FAILED when input contains nan.
-            # When cuda >= 11.5, cusolver normally finishes execution and sets info array indicating convergence issue.
-            error_msg = r'(CUSOLVER_STATUS_EXECUTION_FAILED|The algorithm failed to converge)'
-            a = torch.full((3, 3), float('nan'), dtype=dtype, device=device)
-            a[0] = float('nan')
-            with self.assertRaisesRegex(torch.linalg.LinAlgError, error_msg):
-                svd(a)
-            error_msg = r'(CUSOLVER_STATUS_EXECUTION_FAILED|\(Batch element 1\): The algorithm failed to converge)'
-            a = torch.randn(3, 33, 33, dtype=dtype, device=device)
-            a[1, 0, 0] = float('nan')
-            with self.assertRaisesRegex(torch.linalg.LinAlgError, error_msg):
-                svd(a)
-
     def cholesky_solve_test_helper(self, A_dims, b_dims, upper, device, dtype):
         from torch.testing._internal.common_utils import random_hermitian_pd_matrix
 

From df14aa7a232ef72b7e49a072c132a9e98727949e Mon Sep 17 00:00:00 2001
From: Aaron Gokaslan <aaronGokaslan@gmail.com>
Date: Mon, 7 Nov 2022 22:17:10 +0000
Subject: [PATCH 0635/1922] (fix): Add some missing std::moves to C10 (#88512)

I saw some missed optimization opportunities in C10 using std::move and thought I would submit a PR to fix them. There are particularly a lot of them dealing with the symbolic operators which are used in quite a few places including in loops.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88512
Approved by: https://github.com/ezyang
---
 c10/core/SymFloat.cpp             | 17 +++++++-----
 c10/core/SymFloat.h               | 10 +++----
 c10/core/SymInt.cpp               | 43 +++++++++++++++--------------
 c10/core/SymInt.h                 | 45 ++++++++++++++++---------------
 c10/core/TensorImpl.cpp           |  6 +++--
 c10/cuda/CUDACachingAllocator.cpp |  8 +++---
 c10/util/ThreadLocalDebugInfo.cpp |  4 ++-
 7 files changed, 73 insertions(+), 60 deletions(-)

diff --git a/c10/core/SymFloat.cpp b/c10/core/SymFloat.cpp
index 3c1fea2ee3503..81e8f25d5bb64 100644
--- a/c10/core/SymFloat.cpp
+++ b/c10/core/SymFloat.cpp
@@ -1,6 +1,7 @@
 #include <c10/core/SymFloat.h>
 #include <c10/core/SymNodeImpl.h>
 #include <array>
+#include <utility>
 
 namespace c10 {
 
@@ -9,7 +10,9 @@ SymNode SymFloat::toSymNodeImpl() const {
   return SymNode::reclaim_copy(toSymNodeImplUnowned());
 }
 
-static std::array<SymNode, 2> normalize_symfloats(SymFloat a_, SymFloat b_) {
+static std::array<SymNode, 2> normalize_symfloats(
+    const SymFloat& a_,
+    const SymFloat& b_) {
   SymNode a, b;
   if (a_.is_symbolic())
     a = a_.toSymNodeImpl();
@@ -23,10 +26,10 @@ static std::array<SymNode, 2> normalize_symfloats(SymFloat a_, SymFloat b_) {
   if (!b) {
     b = common->wrap_float(b_.as_float_unchecked());
   }
-  return {a, b};
+  return {std::move(a), std::move(b)};
 }
 
-SymFloat SymFloat::operator+(SymFloat sci) const {
+SymFloat SymFloat::operator+(const SymFloat& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return SymFloat(data_ + sci.data_);
   }
@@ -34,7 +37,7 @@ SymFloat SymFloat::operator+(SymFloat sci) const {
   return SymFloat(res[0]->add(res[1]));
 }
 
-SymFloat SymFloat::operator-(SymFloat sci) const {
+SymFloat SymFloat::operator-(const SymFloat& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return SymFloat(data_ - sci.data_);
   }
@@ -42,7 +45,7 @@ SymFloat SymFloat::operator-(SymFloat sci) const {
   return SymFloat(res[0]->sub(res[1]));
 }
 
-SymFloat SymFloat::operator*(SymFloat sci) const {
+SymFloat SymFloat::operator*(const SymFloat& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return SymFloat(data_ * sci.data_);
   }
@@ -50,7 +53,7 @@ SymFloat SymFloat::operator*(SymFloat sci) const {
   return SymFloat(res[0]->mul(res[1]));
 }
 
-SymFloat SymFloat::operator/(SymFloat sci) const {
+SymFloat SymFloat::operator/(const SymFloat& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return SymFloat(data_ / sci.data_);
   }
@@ -58,7 +61,7 @@ SymFloat SymFloat::operator/(SymFloat sci) const {
   return SymFloat(res[0]->truediv(res[1]));
 }
 
-std::ostream& operator<<(std::ostream& os, SymFloat s) {
+std::ostream& operator<<(std::ostream& os, const SymFloat& s) {
   if (s.is_symbolic()) {
     os << s.toSymNodeImpl()->str();
   } else {
diff --git a/c10/core/SymFloat.h b/c10/core/SymFloat.h
index b787c020fd757..7da364ce127ad 100644
--- a/c10/core/SymFloat.h
+++ b/c10/core/SymFloat.h
@@ -35,10 +35,10 @@ class C10_API SymFloat {
     return data_;
   }
 
-  SymFloat operator+(SymFloat) const;
-  SymFloat operator-(SymFloat) const;
-  SymFloat operator*(SymFloat) const;
-  SymFloat operator/(SymFloat) const;
+  SymFloat operator+(const SymFloat&) const;
+  SymFloat operator-(const SymFloat&) const;
+  SymFloat operator*(const SymFloat&) const;
+  SymFloat operator/(const SymFloat&) const;
 
   // N.B. It's important to keep this definition in the header
   // as we expect if checks to be folded for mobile builds
@@ -57,5 +57,5 @@ class C10_API SymFloat {
   SymNode ptr_;
 };
 
-C10_API std::ostream& operator<<(std::ostream& os, SymFloat s);
+C10_API std::ostream& operator<<(std::ostream& os, const SymFloat& s);
 } // namespace c10
diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp
index b32157e4a94e1..977397be1264b 100644
--- a/c10/core/SymInt.cpp
+++ b/c10/core/SymInt.cpp
@@ -2,10 +2,13 @@
 #include <c10/core/SymInt.h>
 #include <c10/core/SymNodeImpl.h>
 #include <array>
+#include <utility>
 
 namespace c10 {
 
-static std::array<SymNode, 2> normalize_symints(SymInt a_, SymInt b_) {
+static std::array<SymNode, 2> normalize_symints(
+    const SymInt& a_,
+    const SymInt& b_) {
   SymNode a, b;
   if (a_.is_symbolic())
     a = a_.toSymNodeImpl();
@@ -20,7 +23,7 @@ static std::array<SymNode, 2> normalize_symints(SymInt a_, SymInt b_) {
   if (!b) {
     b = common->wrap_int(b_.as_int_unchecked());
   }
-  return {a, b};
+  return {std::move(a), std::move(b)};
 }
 
 SymNode SymInt::toSymNodeImpl() const {
@@ -51,7 +54,7 @@ SymInt::operator SymFloat() const {
   return SymFloat(toSymNodeImpl()->sym_float());
 }
 
-SymInt SymInt::operator+(SymInt sci) const {
+SymInt SymInt::operator+(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return SymInt(data_ + sci.data_);
   }
@@ -59,7 +62,7 @@ SymInt SymInt::operator+(SymInt sci) const {
   return SymInt(res[0]->add(res[1]));
 }
 
-SymInt SymInt::operator-(SymInt sci) const {
+SymInt SymInt::operator-(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return SymInt(data_ - sci.data_);
   }
@@ -67,7 +70,7 @@ SymInt SymInt::operator-(SymInt sci) const {
   return SymInt(res[0]->sub(res[1]));
 }
 
-SymInt SymInt::operator*(SymInt sci) const {
+SymInt SymInt::operator*(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return SymInt(data_ * sci.data_);
   }
@@ -75,7 +78,7 @@ SymInt SymInt::operator*(SymInt sci) const {
   return SymInt(res[0]->mul(res[1]));
 }
 
-SymInt SymInt::operator/(SymInt sci) const {
+SymInt SymInt::operator/(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return SymInt(data_ / sci.data_);
   }
@@ -83,7 +86,7 @@ SymInt SymInt::operator/(SymInt sci) const {
   return SymInt(res[0]->floordiv(res[1]));
 }
 
-SymInt SymInt::operator%(SymInt sci) const {
+SymInt SymInt::operator%(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return SymInt(data_ % sci.data_);
   }
@@ -91,7 +94,7 @@ SymInt SymInt::operator%(SymInt sci) const {
   return SymInt(res[0]->mod(res[1]));
 }
 
-bool SymInt::operator==(SymInt sci) const {
+bool SymInt::operator==(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return data_ == sci.data_;
   }
@@ -99,11 +102,11 @@ bool SymInt::operator==(SymInt sci) const {
   return res[0]->eq(res[1])->bool_();
 }
 
-bool SymInt::operator!=(SymInt sci) const {
+bool SymInt::operator!=(const SymInt& sci) const {
   return !(*this == sci);
 }
 
-bool SymInt::operator<(SymInt sci) const {
+bool SymInt::operator<(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return data_ < sci.data_;
   }
@@ -111,7 +114,7 @@ bool SymInt::operator<(SymInt sci) const {
   return res[0]->lt(res[1])->bool_();
 }
 
-bool SymInt::operator<=(SymInt sci) const {
+bool SymInt::operator<=(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return data_ <= sci.data_;
   }
@@ -119,7 +122,7 @@ bool SymInt::operator<=(SymInt sci) const {
   return res[0]->le(res[1])->bool_();
 }
 
-bool SymInt::operator>(SymInt sci) const {
+bool SymInt::operator>(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return data_ > sci.data_;
   }
@@ -127,7 +130,7 @@ bool SymInt::operator>(SymInt sci) const {
   return res[0]->gt(res[1])->bool_();
 }
 
-bool SymInt::operator>=(SymInt sci) const {
+bool SymInt::operator>=(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return data_ >= sci.data_;
   }
@@ -135,14 +138,14 @@ bool SymInt::operator>=(SymInt sci) const {
   return res[0]->ge(res[1])->bool_();
 }
 
-SymInt SymInt::min(SymInt sci) const {
+SymInt SymInt::min(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return std::min(data_, sci.data_);
   }
   auto res = normalize_symints(*this, sci);
   return SymInt(res[0]->min(res[1]));
 }
-SymInt SymInt::max(SymInt sci) const {
+SymInt SymInt::max(const SymInt& sci) const {
   if (!is_symbolic() && !sci.is_symbolic()) {
     return std::max(data_, sci.data_);
   }
@@ -150,15 +153,15 @@ SymInt SymInt::max(SymInt sci) const {
   return SymInt(res[0]->max(res[1]));
 }
 
-void SymInt::operator*=(SymInt sci) {
+void SymInt::operator*=(const SymInt& sci) {
   *this = *this * sci;
 }
 
-void SymInt::operator/=(SymInt sci) {
+void SymInt::operator/=(const SymInt& sci) {
   *this = *this / sci;
 }
 
-void SymInt::operator+=(SymInt sci) {
+void SymInt::operator+=(const SymInt& sci) {
   *this = *this + sci;
 }
 
@@ -190,7 +193,7 @@ SymInt SymInt::operator*(int64_t sci) const {
   return *this * c10::SymInt(sci);
 }
 
-std::ostream& operator<<(std::ostream& os, SymInt s) {
+std::ostream& operator<<(std::ostream& os, const SymInt& s) {
   if (s.is_symbolic()) {
     os << s.toSymNodeImpl()->str();
   } else {
@@ -199,7 +202,7 @@ std::ostream& operator<<(std::ostream& os, SymInt s) {
   return os;
 }
 
-SymInt operator-(SymInt s) {
+SymInt operator-(const SymInt& s) {
   if (s.is_symbolic()) {
     return SymInt(s.toSymNodeImpl()->neg());
   } else {
diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h
index a10775196d86b..0c7c69fe9553b 100644
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@@ -7,6 +7,7 @@
 
 #include <memory>
 #include <numeric>
+#include <utility>
 
 namespace c10 {
 
@@ -56,7 +57,7 @@ class C10_API SymInt {
       data_ = s.data_;
     }
   }
-  SymInt(SymInt&& s) : data_(s.data_) {
+  SymInt(SymInt&& s) noexcept : data_(s.data_) {
     s.data_ = 0;
   }
 
@@ -70,7 +71,7 @@ class C10_API SymInt {
     }
     return *this;
   }
-  SymInt& operator=(SymInt&& s) {
+  SymInt& operator=(SymInt&& s) noexcept {
     if (this != &s) {
       release_(); // release the current SymNode if any
       data_ = s.data_;
@@ -151,23 +152,23 @@ class C10_API SymInt {
 #endif
   }
 
-  SymInt operator+(SymInt sci) const;
-  SymInt operator-(SymInt sci) const;
-  SymInt operator*(SymInt sci) const;
-  SymInt operator/(SymInt sci) const;
-  SymInt operator%(SymInt sci) const;
-  bool operator==(SymInt sci) const;
-  bool operator!=(SymInt p2) const;
-  bool operator<(SymInt sci) const;
-  bool operator<=(SymInt sci) const;
-  bool operator>(SymInt sci) const;
-  bool operator>=(SymInt sci) const;
-  void operator*=(SymInt sci);
-  void operator+=(SymInt sci);
-  void operator/=(SymInt sci);
-
-  SymInt min(SymInt sci) const;
-  SymInt max(SymInt sci) const;
+  SymInt operator+(const SymInt& sci) const;
+  SymInt operator-(const SymInt& sci) const;
+  SymInt operator*(const SymInt& sci) const;
+  SymInt operator/(const SymInt& sci) const;
+  SymInt operator%(const SymInt& sci) const;
+  bool operator==(const SymInt& sci) const;
+  bool operator!=(const SymInt& p2) const;
+  bool operator<(const SymInt& sci) const;
+  bool operator<=(const SymInt& sci) const;
+  bool operator>(const SymInt& sci) const;
+  bool operator>=(const SymInt& sci) const;
+  void operator*=(const SymInt& sci);
+  void operator+=(const SymInt& sci);
+  void operator/=(const SymInt& sci);
+
+  SymInt min(const SymInt& sci) const;
+  SymInt max(const SymInt& sci) const;
 
   SymInt operator*(int64_t sci) const;
   bool operator<(int64_t sci) const;
@@ -231,9 +232,9 @@ inline c10::SymInt multiply_integers(const C& container) {
       container.begin(),
       container.end(),
       c10::SymInt(1),
-      [](c10::SymInt a, c10::SymInt b) { return a * b; });
+      [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
 }
 
-C10_API std::ostream& operator<<(std::ostream& os, SymInt s);
-C10_API SymInt operator-(SymInt s);
+C10_API std::ostream& operator<<(std::ostream& os, const SymInt& s);
+C10_API SymInt operator-(const SymInt& s);
 } // namespace c10
diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp
index 976382cf2ee7f..fdd16605634b3 100644
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@@ -10,6 +10,8 @@
 #include <c10/util/Optional.h>
 #include <c10/util/irange.h>
 
+#include <utility>
+
 C10_DEFINE_bool(
     caffe2_keep_on_shrink,
     true,
@@ -786,7 +788,7 @@ void TensorImpl::Extend(int64_t num, float growthPct) {
           sizes_and_strides_.size_at_unchecked(0) * (1 + growthPct / 100))));
   auto oldData = std::move(storage_.data_ptr());
   auto oldSize = numel_;
-  Resize(newCapacity);
+  Resize(std::move(newCapacity));
   auto* newData = raw_mutable_data(data_type_);
   if (data_type_.copy()) {
     TORCH_CHECK(
@@ -838,7 +840,7 @@ void TensorImpl::ReserveSpace(int64_t outer_dim) {
   auto oldSize = numel_;
   SmallVector<int64_t, 5> oldDims(
       sizes_and_strides.begin(), sizes_and_strides.end());
-  Resize(newCapacity);
+  Resize(std::move(newCapacity));
   // Allocate new memory but don't copy over the data
   raw_mutable_data(data_type_);
   sizes_and_strides_.set_sizes(oldDims);
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 1b38f08b1e90b..9876259522721 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -19,6 +19,7 @@
 #include <mutex>
 #include <regex>
 #include <set>
+#include <utility>
 #include <vector>
 
 namespace c10 {
@@ -882,7 +883,7 @@ class DeviceCachingAllocator {
             device_free,
             params.size(),
             params.stream(),
-            context);
+            std::move(context));
       }
       stats.num_ooms += 1;
 
@@ -2187,7 +2188,8 @@ class NativeCachingAllocator : public CUDAAllocator {
       CaptureId_t graph_id,
       MempoolId_t mempool_id) override {
     assertValidDevice(device);
-    device_allocator[device]->notifyCaptureBegin(graph_id, mempool_id);
+    device_allocator[device]->notifyCaptureBegin(
+        graph_id, std::move(mempool_id));
   }
 
   void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) override {
@@ -2199,7 +2201,7 @@ class NativeCachingAllocator : public CUDAAllocator {
 
   void notifyCaptureDestroy(int device, MempoolId_t mempool_id) override {
     assertValidDevice(device);
-    device_allocator[device]->notifyCaptureDestroy(mempool_id);
+    device_allocator[device]->notifyCaptureDestroy(std::move(mempool_id));
   }
 
   void* raw_alloc(size_t nbytes) override {
diff --git a/c10/util/ThreadLocalDebugInfo.cpp b/c10/util/ThreadLocalDebugInfo.cpp
index 85cb839c6107a..e79ee00d1a61f 100644
--- a/c10/util/ThreadLocalDebugInfo.cpp
+++ b/c10/util/ThreadLocalDebugInfo.cpp
@@ -1,6 +1,8 @@
 #include <c10/util/ThreadLocal.h>
 #include <c10/util/ThreadLocalDebugInfo.h>
 
+#include <utility>
+
 namespace c10 {
 
 C10_DEFINE_TLS_static(std::shared_ptr<ThreadLocalDebugInfo>, tls_debug_info);
@@ -67,7 +69,7 @@ DebugInfoGuard::DebugInfoGuard(
     return;
   }
   prev_info_ = debug_info;
-  ThreadLocalDebugInfo::_push(kind, info);
+  ThreadLocalDebugInfo::_push(kind, std::move(info));
   active_ = true;
 }
 

From 2a995271959f25ef18b84487e1cfe985dd6d1ea0 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Mon, 7 Nov 2022 22:24:44 +0000
Subject: [PATCH 0636/1922] Dynamo Dashboard Improvements (#88516)

Implement various features in https://github.com/pytorch/torchdynamo/issues/1644:
- Upload nightly run logs to /fsx before parsing - for backing up parsing failures.
- Flag models with (1) < 0.95x speedup, (2) > 2min compile time, (3) < 0.9x compression ratio
- Flag models that were passing yesterday but failed today.
- Other small bug fixes.

See https://github.com/pytorch/torchdynamo/issues/1831 for sample outputs.
Also tested by running run.sh:
```bash
# Setup the output directory
rm -rf ../test-dynamo-runner-logs-3/
mkdir ../test-dynamo-runner-logs-3/

# Commands for torchbench for device=cuda, dtype=float32 for training and for performance testing
python benchmarks/dynamo/torchbench.py --performance --float32 -dcuda --output=../test-dynamo-runner-logs-3//inductor_torchbench_float32_training_cuda_performance.csv --training --inductor   --no-skip --dashboard --only mobilenet_v2 --cold_start_latency

# Commands for torchbench for device=cuda, dtype=float32 for training and for accuracy testing
python benchmarks/dynamo/torchbench.py --accuracy --float32 -dcuda --output=../test-dynamo-runner-logs-3//inductor_torchbench_float32_training_cuda_accuracy.csv --training --inductor   --no-skip --dashboard --only mobilenet_v2
```

with the command
`python benchmarks/dynamo/runner.py --output-dir ../test-dynamo-runner-logs-3/ --dashboard-archive-path /data/home/williamwen/dynamo-runner-logs-copy --training --run --compilers inductor --flag-compilers inductor --suites torchbench --update-dashboard` (need to comment out the `generate_commands` line and change the github issue ID from 681 to something else).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88516
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 338 ++++++++++++++++++++++++++++++++----
 1 file changed, 302 insertions(+), 36 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 95c40ac8760e3..f29877d64a720 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -26,12 +26,14 @@
 
 import argparse
 import dataclasses
+import functools
 import glob
 import importlib
 import io
 import itertools
 import logging
 import os
+import re
 import shutil
 import subprocess
 from collections import defaultdict
@@ -40,6 +42,8 @@
 from random import randint
 
 import matplotlib.pyplot as plt
+
+import numpy as np
 import pandas as pd
 import torch
 
@@ -90,6 +94,10 @@
         "inductor_no_cudagraphs",
     ],
     "inference": ["ts_nvfuser_cudagraphs", "inductor"],
+    "flag_compilers": {
+        "training": ["inductor", "inductor_no_cudagraphs"],
+        "inference": ["inductor"],
+    },
     "dtypes": [
         "float32",
     ],
@@ -112,6 +120,25 @@
 }
 
 
+def flag_speedup(x):
+    return pd.isna(x) or x < 0.95
+
+
+def flag_compilation_latency(x):
+    return pd.isna(x) or x == 0 or x > 120
+
+
+def flag_compression_ratio(x):
+    return pd.isna(x) or x < 0.9
+
+
+FLAG_FNS = {
+    "speedup": flag_speedup,
+    "compilation_latency": flag_compilation_latency,
+    "compression_ratio": flag_compression_ratio,
+}
+
+
 def percentage(part, whole, decimals=2):
     if whole == 0:
         return 0
@@ -128,6 +155,12 @@ def parse_args():
         action="append",
         help=f"For --inference, options are {INFERENCE_COMPILERS}. For --training, options are {TRAINING_COMPILERS}",
     )
+
+    parser.add_argument(
+        "--flag-compilers",
+        action="append",
+        help="List of compilers to flag issues. Same format as --compilers.",
+    )
     parser.add_argument(
         "--quick", action="store_true", help="Just runs one model. Helps in debugging"
     )
@@ -192,6 +225,11 @@ def parse_args():
         default=DASHBOARD_DEFAULTS["dashboard_archive_path"],
         help="Archived directory path",
     )
+    parser.add_argument(
+        "--archive-name",
+        help="Directory name under dashboard-archive-path to copy output-dir to. "
+        "If not provided, a generated name is used.",
+    )
     parser.add_argument(
         "--dashboard-gh-cli-path",
         default=DASHBOARD_DEFAULTS["dashboard_gh_cli_path"],
@@ -226,6 +264,11 @@ def get_skip_tests(suite):
     return skip_str
 
 
+def generate_csv_name(args, dtype, suite, device, compiler, testing):
+    mode = get_mode(args)
+    return f"{compiler}_{suite}_{dtype}_{mode}_{device}_{testing}.csv"
+
+
 def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
     mode = get_mode(args)
     with open("run.sh", "w") as runfile:
@@ -245,7 +288,7 @@ def generate_commands(args, dtypes, suites, devices, compilers, output_dir):
                 info = TABLE[mode]
                 for compiler in compilers:
                     base_cmd = info[compiler]
-                    output_filename = f"{output_dir}/{compiler}_{suite}_{dtype}_{mode}_{device}_{testing}.csv"
+                    output_filename = f"{output_dir}/{generate_csv_name(args, dtype, suite, device, compiler, testing)}"
                     cmd = f"python benchmarks/dynamo/{suite}.py --{testing} --{dtype} -d{device} --output={output_filename}"
                     cmd = f"{cmd} {base_cmd} {args.extra_args} --no-skip --dashboard"
 
@@ -334,12 +377,49 @@ def env_var(name):
         gh_fh.write(comment)
 
 
+@functools.lru_cache(None)
+def archive_data(archive_name):
+    if archive_name is not None:
+        prefix_match = re.search(r"\w+(?=_performance)", archive_name)
+        if prefix_match is not None:
+            prefix = prefix_match.group(0)
+        else:
+            prefix = ""
+        day_match = re.search(r"day_(\d+)_", archive_name)
+        if day_match is not None:
+            day = day_match.group(1)
+        else:
+            day = "000"
+    else:
+        day = datetime.today().strftime("%j")
+        prefix = datetime.today().strftime(f"day_{day}_%d_%m_%y")
+    return day, prefix
+
+
+@functools.lru_cache(None)
+def default_archive_name(dtype):
+    _, prefix = archive_data(None)
+    return f"{prefix}_performance_{dtype}_{randint(100, 999)}"
+
+
+def archive(src_dir, dest_dir_prefix, archive_name, dtype):
+    if archive_name is None:
+        archive_name = default_archive_name(dtype)
+    # Copy the folder to archived location
+    dest = os.path.join(dest_dir_prefix, archive_name)
+    shutil.copytree(src_dir, dest, dirs_exist_ok=True)
+    print(f"copied contents of {src_dir} to {dest}")
+
+
 class Parser:
-    def __init__(self, suites, devices, dtypes, compilers, mode, output_dir):
+    def __init__(
+        self, suites, devices, dtypes, compilers, flag_compilers, mode, output_dir
+    ):
         self.suites = suites
         self.devices = devices
         self.dtypes = dtypes
         self.compilers = compilers
+        self.flag_compilers = flag_compilers
         self.output_dir = output_dir
         self.mode = mode
 
@@ -353,8 +433,12 @@ def has_header(self, output_filename):
 
 
 class ParsePerformanceLogs(Parser):
-    def __init__(self, suites, devices, dtypes, compilers, mode, output_dir):
-        super().__init__(suites, devices, dtypes, compilers, mode, output_dir)
+    def __init__(
+        self, suites, devices, dtypes, compilers, flag_compilers, mode, output_dir
+    ):
+        super().__init__(
+            suites, devices, dtypes, compilers, flag_compilers, mode, output_dir
+        )
         self.parsed_frames = defaultdict(lambda: defaultdict(None))
         self.untouched_parsed_frames = defaultdict(lambda: defaultdict(None))
         self.metrics = ["speedup", "compilation_latency", "compression_ratio"]
@@ -401,12 +485,6 @@ def parse(self):
         self.extract_df("accuracy", "accuracy")
         for metric in self.metrics:
             self.extract_df(metric, "performance")
-        self.generate_executive_summary()
-        for suite in self.suites:
-            self.plot_graph(
-                self.untouched_parsed_frames[suite]["speedup"],
-                f"{suite}_{self.dtypes[0]}",
-            )
 
     def clean_batch_sizes(self, frames):
         # Clean up batch sizes when its 0
@@ -460,17 +538,17 @@ def extract_df(self, metric, testing):
                 df_accuracy = self.parsed_frames[suite]["accuracy"]
                 perf_rows = []
                 for model_name in df["name"]:
-                    perf_row = df[df["name"] == model_name]
+                    perf_row = df[df["name"] == model_name].copy()
                     acc_row = df_accuracy[df_accuracy["name"] == model_name]
                     for compiler in self.compilers:
                         if not perf_row.empty:
                             if acc_row.empty:
-                                perf_row[compiler].iloc[0] = 0.0
+                                perf_row.loc[0, compiler] = 0.0
                             elif acc_row[compiler].iloc[0] not in (
                                 "pass",
                                 "pass_due_to_skip",
                             ):
-                                perf_row[compiler].iloc[0] = 0.0
+                                perf_row.loc[0, compiler] = 0.0
                     perf_rows.append(perf_row)
                 df = pd.concat(perf_rows)
             df = df.sort_values(by=list(reversed(self.compilers)), ascending=False)
@@ -592,6 +670,57 @@ def generate_executive_summary(self):
         str_io.write(peak_memory_summary)
         self.executive_summary = str_io.getvalue()
 
+    def flag_bad_entries(self, suite, metric, flag_fn):
+        df = self.untouched_parsed_frames[suite][metric]
+        df = df.drop("dev", axis=1)
+        df = df.rename(columns={"batch_size": "bs"})
+        # apply flag_fn elementwise to flag_compilers columns,
+        # if one element fails, the entire row is flagged
+        flag = np.logical_or.reduce(
+            df[self.flag_compilers].applymap(flag_fn),
+            axis=1,
+        )
+        df = df[flag]
+        df = df.assign(suite=suite)
+        return df.reindex(columns=["suite", "name"] + self.flag_compilers)
+
+    def get_metric_title(self, metric):
+        if metric == "speedup":
+            return "Performance speedup"
+        elif metric == "accuracy":
+            return "Accuracy"
+        elif metric == "compilation_latency":
+            return "Compilation latency (sec)"
+        elif metric == "compression_ratio":
+            return "Peak Memory Compression Ratio"
+        raise RuntimeError("unknown metric")
+
+    def generate_warnings(self):
+        title = "## Warnings ##"
+        body = ""
+        for metric in [
+            "speedup",
+            "compilation_latency",
+            "compression_ratio",
+        ]:
+            dfs = []
+            for suite in self.suites:
+                dfs.append(self.flag_bad_entries(suite, metric, FLAG_FNS[metric]))
+            df = pd.concat(dfs, axis=0)
+            if df.empty:
+                continue
+            tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never")
+            str_io = io.StringIO()
+            str_io.write("\n")
+            str_io.write(self.get_metric_title(metric) + " warnings\n")
+            str_io.write("~~~\n")
+            str_io.write(f"{tabform}\n")
+            str_io.write("~~~\n")
+            body += str_io.getvalue()
+
+        comment = generate_dropdown_comment(title, body)
+        return comment
+
     def prepare_message(self, suite):
         title = f"## {suite} suite with {self.dtypes[0]} precision ##"
         body = ""
@@ -607,14 +736,7 @@ def prepare_message(self, suite):
             tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never")
             str_io = io.StringIO()
             str_io.write("\n")
-            if metric == "speedup":
-                str_io.write("Performance speedup\n")
-            elif metric == "accuracy":
-                str_io.write("Accuracy\n")
-            elif metric == "compilation_latency":
-                str_io.write("Compilation latency (sec)\n")
-            elif metric == "compression_ratio":
-                str_io.write("Peak Memory Compression Ratio\n")
+            str_io.write(self.get_metric_title(metric) + "\n")
             str_io.write("~~~\n")
             str_io.write(f"{tabform}\n")
             str_io.write("~~~\n")
@@ -624,6 +746,13 @@ def prepare_message(self, suite):
         return comment
 
     def gen_summary_files(self):
+        self.generate_executive_summary()
+        for suite in self.suites:
+            self.plot_graph(
+                self.untouched_parsed_frames[suite]["speedup"],
+                f"{suite}_{self.dtypes[0]}",
+            )
+
         with open(f"{self.output_dir}/gh_title.txt", "w") as gh_fh:
             str_io = io.StringIO()
             str_io.write("\n")
@@ -635,6 +764,11 @@ def gen_summary_files(self):
             gh_fh.write(self.executive_summary)
         print(self.executive_summary)
 
+        with open(f"{self.output_dir}/gh_warnings.txt", "w") as gh_fh:
+            warnings_body = self.generate_warnings()
+            gh_fh.write(warnings_body)
+            print(warnings_body)
+
         str_io = io.StringIO()
         for suite in self.suites:
             str_io.write(self.prepare_message(suite))
@@ -644,12 +778,14 @@ def gen_summary_files(self):
             gh_fh.write(str_io.getvalue())
 
 
-def parse_logs(args, dtypes, suites, devices, compilers, output_dir):
+def parse_logs(args, dtypes, suites, devices, compilers, flag_compilers, output_dir):
     mode = get_mode(args)
     build_summary()
 
     parser_class = ParsePerformanceLogs
-    parser = parser_class(suites, devices, dtypes, compilers, mode, output_dir)
+    parser = parser_class(
+        suites, devices, dtypes, compilers, flag_compilers, mode, output_dir
+    )
     parser.gen_summary_files()
     return
 
@@ -667,6 +803,101 @@ def get_date(log_info):
     return datetime.strptime(f"{log_info.day}", "%j").strftime("%m-%d")
 
 
+class AccuracyRegressionTracker:
+    """
+    Compares the most recent 2 accuracy benchmarks to find previously
+    passing models that now fail.
+    """
+
+    def __init__(self, args):
+        self.args = args
+        self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv")
+        assert os.path.exists(self.lookup_file)
+
+    def find_last_2(self, suite, device, dtype, compiler):
+        df = pd.read_csv(self.lookup_file, names=("day", "mode", "prec", "path"))
+        df = df[df["mode"] == "performance"]
+        df = df[df["prec"] == dtype]
+        df = df[::-1]
+        parsers = []
+        for path in df["path"]:
+            output_dir = os.path.join(self.args.dashboard_archive_path, path)
+            if os.path.exists(
+                os.path.join(
+                    output_dir,
+                    generate_csv_name(
+                        self.args, dtype, suite, device, compiler, "accuracy"
+                    ),
+                )
+            ):
+                parsers.append(
+                    ParsePerformanceLogs(
+                        [suite],
+                        [device],
+                        [dtype],
+                        [compiler],
+                        [compiler],
+                        get_mode(self.args),
+                        output_dir,
+                    )
+                )
+            if len(parsers) >= 2:
+                return parsers
+        return None
+
+    def generate_comment(self):
+        title = "## Accuracy Regressions ##\n"
+        body = ""
+        dtype = self.args.dtypes[0]
+        device = self.args.devices[0]
+        for suite in self.args.suites:
+            dfs = []
+            for compiler in self.args.flag_compilers:
+                last2 = self.find_last_2(suite, device, dtype, compiler)
+                if last2 is None:
+                    continue
+
+                df_cur, df_prev = [
+                    last2[i].untouched_parsed_frames[suite]["accuracy"] for i in (0, 1)
+                ]
+                df_merge = df_cur.merge(df_prev, on="name", suffixes=("_cur", "_prev"))
+                flag = np.logical_and(
+                    df_merge[compiler + "_prev"].apply(lambda x: "pass" in x),
+                    df_merge[compiler + "_cur"].apply(lambda x: "pass" not in x),
+                )
+                df_bad = df_merge[flag]
+                dfs.append(
+                    pd.DataFrame(
+                        data={
+                            "compiler": compiler,
+                            "name": df_bad["name"],
+                            "prev_status": df_bad[compiler + "_prev"],
+                            "cur_status": df_bad[compiler + "_cur"],
+                        }
+                    )
+                )
+
+            if not dfs:
+                continue
+            df = pd.concat(dfs, axis=0)
+            if df.empty:
+                continue
+            tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never")
+            str_io = io.StringIO()
+            str_io.write("\n")
+            str_io.write(f"Accuracy regressions for {suite}\n")
+            str_io.write("~~~\n")
+            str_io.write(f"{tabform}\n")
+            str_io.write("~~~\n")
+            body += str_io.getvalue()
+
+        comment = generate_dropdown_comment(title, body)
+
+        with open(f"{self.args.output_dir}/gh_accuracy_regression.txt", "w") as gh_fh:
+            gh_fh.write(comment)
+            print(comment)
+
+
 class RegressionTracker:
     """
     Plots progress of different metrics over time to detect regressions.
@@ -726,6 +957,8 @@ def diff(self):
                     if not os.path.exists(gmean_filename):
                         continue
                     df = pd.read_csv(gmean_filename)
+                    if suite not in df:
+                        continue
                     if metric == "geomean":
                         df[suite] = df[suite].str.replace("x", "").astype(float)
                     elif metric == "passrate":
@@ -741,6 +974,7 @@ def diff(self):
                     dfs.append(df)
 
                 df = pd.concat(dfs)
+                df = df.interpolate(method="linear")
                 ax = df.plot(
                     ax=axes[idx],
                     kind="line",
@@ -771,19 +1005,24 @@ def __init__(self, args):
         self.output_dir = args.output_dir
         self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv")
         assert os.path.exists(self.lookup_file)
-        self.archive()
 
     def archive(self):
+        dtype = self.args.dtypes[0]
         # Copy the folder to archived location
-        src = self.output_dir
-        day = datetime.today().strftime("%j")
-        prefix = datetime.today().strftime(f"day_{day}_%d_%m_%y")
-        target_dir = f"{prefix}_performance_{self.args.dtypes[0]}_{randint(100, 999)}"
-        target = os.path.join(self.args.dashboard_archive_path, target_dir)
-        shutil.copytree(src, target)
+        archive(
+            self.output_dir,
+            self.args.dashboard_archive_path,
+            self.args.archive_name,
+            dtype,
+        )
+        day, _ = archive_data(self.args.archive_name)
+        target_dir = (
+            default_archive_name(dtype)
+            if self.args.archive_name is None
+            else self.args.archive_name
+        )
 
         # Update lookup csv the folder to arhived logs
-        dtype = self.args.dtypes[0]
         subprocess.check_call(
             f'echo "{day},performance,{dtype},{target_dir}" >> {self.lookup_file}',
             shell=True,
@@ -809,14 +1048,19 @@ def gen_comment(self):
         files = [
             "gh_title.txt",
             "gh_executive_summary.txt",
+            "gh_warnings.txt",
             "gh_regression.txt",
+            "gh_accuracy_regression.txt",
             "gh_training.txt",
             "gh_graphs.txt",
         ]
         all_lines = []
         for f in files:
-            with open(os.path.join(self.output_dir, f), "r") as fh:
-                all_lines.extend(fh.readlines())
+            try:
+                with open(os.path.join(self.output_dir, f), "r") as fh:
+                    all_lines.extend(fh.readlines())
+            except FileNotFoundError:
+                pass
 
         return "\n".join([x.rstrip() for x in all_lines])
 
@@ -838,15 +1082,19 @@ def comment_on_gh(self, comment):
 
     def update(self):
         self.upload_graphs()
+        AccuracyRegressionTracker(self.args).generate_comment()
         try:
             RegressionTracker(self.args).diff()
-        except Exception:
+        except Exception as e:
+            logging.exception(e)
             with open(f"{self.args.output_dir}/gh_regression.txt", "w") as gh_fh:
                 gh_fh.write("")
 
         comment = self.gen_comment()
         self.comment_on_gh(comment)
 
+        self.archive()
+
 
 if __name__ == "__main__":
     args = parse_args()
@@ -860,18 +1108,31 @@ def extract(key):
 
     if args.inference:
         compilers = DEFAULTS["inference"] if args.compilers is None else args.compilers
+        flag_compilers = (
+            DEFAULTS["flag_compilers"]["inference"]
+            if args.flag_compilers is None
+            else args.flag_compilers
+        )
     else:
         assert args.training
         compilers = DEFAULTS["training"] if args.compilers is None else args.compilers
+        flag_compilers = (
+            DEFAULTS["flag_compilers"]["training"]
+            if args.flag_compilers is None
+            else args.flag_compilers
+        )
 
     output_dir = args.output_dir
     args.compilers = compilers
+    args.devices = devices
+    args.dtypes = dtypes
+    args.flag_compilers = flag_compilers
     args.suites = suites
 
     if args.print_run_commands:
         generate_commands(args, dtypes, suites, devices, compilers, output_dir)
     elif args.visualize_logs:
-        parse_logs(args, dtypes, suites, devices, compilers, output_dir)
+        parse_logs(args, dtypes, suites, devices, compilers, flag_compilers, output_dir)
     elif args.run:
         generate_commands(args, dtypes, suites, devices, compilers, output_dir)
         # TODO - Do we need to worry about segfaults
@@ -883,7 +1144,12 @@ def extract(key):
             )
             raise e
         if not args.log_operator_inputs:
-            parse_logs(args, dtypes, suites, devices, compilers, output_dir)
+            archive(
+                output_dir, args.dashboard_archive_path, args.archive_name, dtypes[0]
+            )
+            parse_logs(
+                args, dtypes, suites, devices, compilers, flag_compilers, output_dir
+            )
 
     if args.update_dashboard:
         DashboardUpdater(args).update()

From 0dae075e28d76f97fb3187653d4921c2c019d0f0 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 7 Nov 2022 22:29:56 +0000
Subject: [PATCH 0637/1922] Revert "fallback for scatter_(scalar) (#88210)"

This reverts commit 896fa8c5c9b0191c9621e04ab5e20057614d48ad.

Reverted https://github.com/pytorch/pytorch/pull/88210 on behalf of https://github.com/suo due to this broke inductor tests, see: https://hud.pytorch.org/pytorch/pytorch/commit/896fa8c5c9b0191c9621e04ab5e20057614d48ad
---
 torch/_inductor/lowering.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 1cf9fc660acf6..71f038b231259 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -1907,22 +1907,12 @@ def scatter(x, dim: int, index, src, **kwargs):
 
 @register_lowering(aten.scatter_, type_promotion_kind=None)
 def scatter_(self, dim: int, index, src, *, reduce: str = None):
-
-    # TODO: Need to support more reduction type
-    # For reduction of "sum", tl.atomic_add doesn't support bool or int64
-    if reduce not in {None, "add"} or (
-        reduce == "add" and self.get_dtype() in {torch.bool, torch.int64}
-    ):
-        self.realize()
-        return fallback_scatter_(self, dim, index, src, reduce=reduce)
-
     if reduce == "add":
         reduce = "sum"
     elif reduce == "multiply":
         reduce = "prod"
     else:
         assert reduce is None
-
     return scatter_reduce_(self, dim, index, src, reduce)
 
 
@@ -1941,13 +1931,23 @@ def scatter_reduce(x, dim: int, index, src, reduction_type, **kwargs):
     return scatter_reduce_(clone(x), dim, index, src, reduction_type, **kwargs)
 
 
-fallback_scatter_ = fallback_handler(aten.scatter_)
+fallback_scatter_reduce_ = fallback_handler(aten.scatter_reduce_)
 
 
 @register_lowering(aten.scatter_reduce_, type_promotion_kind=None)
 def scatter_reduce_(self, dim: int, index, src, reduce, *, include_self: bool = True):
     assert reduce in {None, "sum", "prod", "mean", "amax", "amin"}
 
+    # TODO: Need to support more reduction type
+    # For reduction of "sum", tl.atomic_add doesn't support bool or int64
+    if reduce not in {None, "sum"} or (
+        reduce == "sum" and self.get_dtype() in {torch.bool, torch.int64}
+    ):
+        self.realize()
+        return fallback_scatter_reduce_(
+            self, dim, index, src, reduce, include_self=include_self
+        )
+
     assert isinstance(self, TensorBox)
     assert "int" in str(index.get_dtype())
 

From 59a834210e1b4395a822e52fe93419c225035ba7 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Mon, 7 Nov 2022 19:40:25 +0000
Subject: [PATCH 0638/1922] Add a note on the stability of linalg functions.
 (#88313)

This was long-due, as it keeps comming up in issues.

Fixes https://github.com/pytorch/pytorch/issues/85950
Fixes https://github.com/pytorch/pytorch/issues/59720
Fixes https://github.com/pytorch/pytorch/issues/59782

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88313
Approved by: https://github.com/soumith, https://github.com/mruberry
---
 docs/source/linalg.rst                   |  6 ++++
 docs/source/notes/numerical_accuracy.rst | 35 ++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst
index 02950ff971a62..aec7031e2248e 100644
--- a/docs/source/linalg.rst
+++ b/docs/source/linalg.rst
@@ -6,6 +6,8 @@ torch.linalg
 
 Common linear algebra operations.
 
+See :ref:`Linear Algebra Stability` for some common numerical edge-cases.
+
 .. automodule:: torch.linalg
 .. currentmodule:: torch.linalg
 
@@ -43,6 +45,8 @@ Decompositions
     svd
     svdvals
 
+.. _linalg solvers:
+
 Solvers
 -------
 
@@ -55,6 +59,8 @@ Solvers
     lu_solve
     lstsq
 
+.. _linalg inverses:
+
 Inverses
 --------
 
diff --git a/docs/source/notes/numerical_accuracy.rst b/docs/source/notes/numerical_accuracy.rst
index fad14ed912027..82e0bb253129e 100644
--- a/docs/source/notes/numerical_accuracy.rst
+++ b/docs/source/notes/numerical_accuracy.rst
@@ -37,6 +37,7 @@ identical to the slice of the result of the same operation applied to the full t
 ``A`` be a 2-dimensional tensor. ``A.sum(-1)[0]`` is not guaranteed to be bitwise equal to
 ``A[:,0].sum()``.
 
+
 Extremal values
 ---------------
 
@@ -51,6 +52,40 @@ datatype. E.g.:
     a.norm() # produces tensor(inf)
     a.double().norm() # produces tensor(1.4142e+20, dtype=torch.float64), representable in fp32
 
+.. _Linear Algebra Stability:
+
+Linear algebra (``torch.linalg``)
+---------------------------------
+
+Non-finite values
+"""""""""""""""""
+
+The external libraries (backends) that ``torch.linalg`` uses provide no guarantees on their behaviour
+when the inputs have non-finite values like ``inf`` or ``NaN``. As such, neither does PyTorch.
+The operations may return a tensor with non-finite values, or raise an exception, or even segfault.
+
+Consider using :func:`torch.isfinite` before calling these functions to detect this situation.
+
+Extremal values in linalg
+"""""""""""""""""""""""""
+
+Functions within ``torch.linalg`` have more `Extremal Values`_ than other PyTorch functions.
+
+:ref:`linalg solvers` and :ref:`linalg inverses` assume that the input matrix ``A`` is invertible. If it is close to
+being non-invertible (for example, if it has a very small singular value), then these algorithms may silently return
+incorrect results. These matrices are said to be `ill-conditioned <https://nhigham.com/2020/03/19/what-is-a-condition-number/>`_.
+If provided with ill-conditioned inputs, the result of these functions they may vary when using the same inputs on different
+devices or when using different backends via the keyword ``driver``.
+
+Spectral operations like ``svd``, ``eig``, and ``eigh`` may also return incorrect results (and their gradients may be infinite)
+when their inputs have singular values that are close to each other. This is because the algorithms used to compute these decompositions
+struggle to converge for these inputs.
+
+Running the computation in ``float64`` (as NumPy does by default) often helps, but it does not solve these issues in all cases.
+Analyzing the spectrum of the inputs via :func:`torch.linalg.svdvals` or their condition number via :func:`torch.linalg.cond`
+may help to detect these issues.
+
+
 TensorFloat-32(TF32) on Nvidia Ampere devices
 ---------------------------------------------
 

From 4098cbc3384707265a472fdc608d660717836a75 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 7 Nov 2022 23:05:11 +0000
Subject: [PATCH 0639/1922] Fix pull docs build running with a schedule and
 increase cpp doc timeout to 4h (#88589)

* After https://github.com/pytorch/pytorch/pull/88373, pull workflow can now be triggered with a schedule. This changes the assumption in the doc build workflow when schedule event is used to determine if the docs should be pushed
* I'll create a follow-up issue to see if it's possible to improve the performance of cpp doc build job.  At the moment, it uses a linux.12xlarge runner and still couldn't finish the job after 3h

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88589
Approved by: https://github.com/seemethere, https://github.com/ZainRizvi
---
 .github/workflows/_docs.yml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml
index d46e28f844f2d..2aa1e48188a5d 100644
--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@@ -48,8 +48,9 @@ jobs:
             # to the next available tier of 12xlarge. So much memory just to generate cpp
             # doc
             runner: linux.12xlarge
-            # Nightly cpp docs take about 150m to finish, and the number is stable
-            timeout-minutes: 180
+            # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now)
+            # Let's try to figure out how this can be improved
+            timeout-minutes: 240
           - docs_type: python
             runner: linux.2xlarge
             # It takes less than 30m to finish python docs unless there are issues
@@ -58,6 +59,9 @@ jobs:
             runner: linux.2xlarge
             # It takes less than 15m to finish functorch docs unless there are issues
             timeout-minutes: 15
+    # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
+    # The current name requires updating the Rockset last docs push query from test-infra every time the matrix is updated
+    name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
     steps:
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
@@ -95,7 +99,10 @@ jobs:
         timeout-minutes: ${{ matrix.timeout-minutes }}
         id: build-docs
         env:
-          WITH_PUSH: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }}
+          # After https://github.com/pytorch/pytorch/pull/88373, pull workflow can now be run periodically,
+          # so using a schedule event to determine if the docs should be pushed or not doesn't hold true
+          # anymore
+          WITH_PUSH: ${{ inputs.push }}
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           DOCS_TYPE: ${{ matrix.docs_type }}
           RUN_DOXYGEN: ${{ inputs.run-doxygen }}

From 74d285e2f333bd0a3d1fa2cce99097e8dabdce75 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Sun, 6 Nov 2022 23:38:12 +0000
Subject: [PATCH 0640/1922] Fix segfault in has_torch_function (#88559)

Fixes #83908

`PySequence_Fast` may return `NULL` to indicate an error was raised, in which
case `sequence_has_torch_function` will dereference a null pointer.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88559
Approved by: https://github.com/ezyang, https://github.com/Skylion007, https://github.com/hameerabbasi
---
 test/test_overrides.py                      | 4 ++++
 torch/csrc/utils/disable_torch_function.cpp | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/test/test_overrides.py b/test/test_overrides.py
index 879b27277f0d8..7082f75a2141f 100644
--- a/test/test_overrides.py
+++ b/test/test_overrides.py
@@ -387,6 +387,10 @@ def test_mean_semantics(self):
         self.assertEqual(torch.mean(t3), 4.0)
         self.assertEqual(bar(t3), 0)
 
+    def test_has_torch_function_non_sequence(self):
+        with self.assertRaisesRegex(TypeError, "expected a sequence"):
+            has_torch_function(object())
+
     def test_mm_semantics(self):
         """Test that a function with multiple arguments can be overrided"""
         t1 = DiagonalTensor(5, 2)
diff --git a/torch/csrc/utils/disable_torch_function.cpp b/torch/csrc/utils/disable_torch_function.cpp
index 3031493a704f6..682120d7e6223 100644
--- a/torch/csrc/utils/disable_torch_function.cpp
+++ b/torch/csrc/utils/disable_torch_function.cpp
@@ -263,6 +263,9 @@ PyObject* THPModule_has_torch_function(PyObject*, PyObject* arg) {
   } else {
     auto args = py::reinterpret_steal<py::object>(
         PySequence_Fast(arg, "expected a sequence"));
+    if (!args) {
+      return nullptr;
+    }
     result = sequence_has_torch_function(args.ptr());
   }
 

From b87cad559e3f1a8199068240ea12800d8e54de15 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 7 Nov 2022 23:49:19 +0000
Subject: [PATCH 0641/1922] Use test/test-reports for inductor (#88533)

So that the test reports can be picked up automatically by the CI and uploaded to S3. Later on, this will allows the querying of these test reports from our Rockset DB.

For example https://github.com/pytorch/pytorch/actions/runs/3382363153/jobs/5617382531 `Upload test statistics` shows:

```
+ python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test
No tests in reports found in test
```

https://hud.pytorch.org/pytorch/pytorch/commit/678d038001b0bd61501739ea97989d28f758343e inductor artifacts are also empty zip at the moment

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88533
Approved by: https://github.com/desertfire
---
 .jenkins/pytorch/test.sh | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 5dfb1dfe413ba..eee57613090fe 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -244,12 +244,12 @@ test_dynamo_shard() {
 test_inductor_distributed() {
   # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
   # with if required # gpus aren't available
-  PYTORCH_TEST_WITH_INDUCTOR=0 PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include distributed/test_dynamo_distributed
+  PYTORCH_TEST_WITH_INDUCTOR=0 PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include distributed/test_dynamo_distributed --verbose
   assert_git_not_dirty
 }
 
 test_inductor() {
-  python test/test_modules.py --verbose
+  python test/run_test.py --include test_modules --verbose
   # TODO: investigate "RuntimeError: CUDA driver API confirmed a leak"
   # seen intest_ops_gradients.py
   # pytest test/test_ops_gradients.py --verbose -k "not _complex and not test_inplace_grad_acos_cuda_float64"
@@ -260,7 +260,10 @@ test_inductor_huggingface_shard() {
     echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
     exit 1
   fi
-  TEST_REPORTS_DIR=/tmp/test-reports
+  # Use test-reports directory under test folder will allow the CI to automatically pick up
+  # the test reports and upload them to S3. Need to use full path here otherwise the script
+  # will bark about file not found later on
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
   python benchmarks/dynamo/huggingface.py --ci --training --accuracy \
     --device cuda --inductor --float32 --total-partitions 1 --partition-id "$1" \
@@ -273,7 +276,10 @@ test_inductor_timm_shard() {
     echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
     exit 1
   fi
-  TEST_REPORTS_DIR=/tmp/test-reports
+  # Use test-reports directory under test folder will allow the CI to automatically pick up
+  # the test reports and upload them to S3. Need to use full path here otherwise the script
+  # will bark about file not found later on
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
   python benchmarks/dynamo/timm_models.py --ci --training --accuracy \
     --device cuda --inductor --float32 --total-partitions 5 --partition-id "$1" \

From 872f90168fd5e04ba836b6081d5f3904ed9f95f5 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Mon, 7 Nov 2022 23:53:17 +0000
Subject: [PATCH 0642/1922] Pin linux ninja dep to 1.10.2 (#88548)

The latest version 1.11.1 breaks PyTorch CI.  A bunch of tests are failing now in master https://hud.pytorch.org/pytorch/pytorch/commit/d1ee0730410ac910760c0a21156e574093a0d15a.  Curiously, the latest commit https://hud.pytorch.org/pytorch/pytorch/commit/81042d3a53335259c60e5aa8c9b9614c3d87b05f looks green, but it's good to pin this dependency anyway

https://github.com/pytorch/pytorch/blob/master/.circleci/docker/requirements-ci.txt#L95-L97 has a curious note about ninja and why it's not part of the docker container (need to revisit this later on):

```
#ninja
#Description: build system.  Note that it install from
#here breaks things so it is commented out
```

This is one more reason to justify the effort to consolidating all pip and conda dependencies to get rid of this family of issue.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88548
Approved by: https://github.com/clee2000
---
 .github/workflows/_buck-build-test.yml   | 20 ++++++++++----------
 .github/workflows/_run_android_tests.yml | 20 ++++++++++----------
 .jenkins/pytorch/test.sh                 |  2 +-
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/_buck-build-test.yml b/.github/workflows/_buck-build-test.yml
index 52b5d4b3c6f45..f52bb6017c587 100644
--- a/.github/workflows/_buck-build-test.yml
+++ b/.github/workflows/_buck-build-test.yml
@@ -34,16 +34,16 @@ jobs:
           max_attempts: 5
           command: |
             conda install -y \
-              cffi \
-              cmake \
-              mkl \
-              mkl-include \
-              ninja \
-              numpy \
-              pyyaml \
-              requests \
-              setuptools \
-              typing_extensions
+              cffi=1.15.1 \
+              cmake=3.22.1 \
+              mkl=2022.1.0 \
+              mkl-include=2022.1.0 \
+              ninja=1.10.2 \
+              numpy=1.23.3 \
+              pyyaml=6.0 \
+              requests=2.28.1 \
+              setuptools=65.5.0 \
+              typing_extensions=4.3.0
 
       - name: Install Buck
         uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
diff --git a/.github/workflows/_run_android_tests.yml b/.github/workflows/_run_android_tests.yml
index 273ec2db81aed..ae992baab11a4 100644
--- a/.github/workflows/_run_android_tests.yml
+++ b/.github/workflows/_run_android_tests.yml
@@ -21,16 +21,16 @@ jobs:
       - name: Install dependencies
         run: |
           conda install -y \
-            cffi \
-            cmake \
-            mkl \
-            mkl-include \
-            ninja \
-            numpy \
-            pyyaml \
-            requests \
-            setuptools \
-            typing_extensions
+            cffi=1.15.1 \
+            cmake=3.22.1 \
+            mkl=2022.1.0 \
+            mkl-include=2022.1.0 \
+            ninja=1.10.2 \
+            numpy=1.23.3 \
+            pyyaml=6.0 \
+            requests=2.28.1 \
+            setuptools=65.5.0 \
+            typing_extensions=4.3.0
 
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index eee57613090fe..d8f020182710c 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -117,7 +117,7 @@ fi
 
 if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
   # JIT C++ extensions require ninja.
-  pip_install --user ninja
+  pip_install --user "ninja==1.10.2"
   # ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins
   # but this script should be runnable by any user, including root
   export PATH="$HOME/.local/bin:$PATH"

From 6266283ab34967e0348be583570b913601576af7 Mon Sep 17 00:00:00 2001
From: Antoni Viros i Martin <aviros@meta.com>
Date: Tue, 8 Nov 2022 00:03:14 +0000
Subject: [PATCH 0643/1922] Implement a constructor for nested_tensor that is
 similar to torch.tensor() (#88213)

Summary: This diff merges both previous implementations of constructors for nested tensors, the one from lists of tensors and the one with arbitrary python lists, adn implements it in pytorch core so no extensions are needed to construct NT.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88213
Approved by: https://github.com/cpuhrsch
---
 .../native/nested/NestedTensorFactories.cpp   |   3 +-
 .../ATen/native/nested/NestedTensorUtils.h    |  94 ++++++++++++++--
 build_variables.bzl                           |   2 +
 test/allowlist_for_publicAPI.json             |   1 +
 test/test_nestedtensor.py                     |  84 +++++++++++++--
 .../templates/python_nested_functions.cpp     |   4 +-
 torch/csrc/api/include/torch/nested.h         |  76 +++++++++----
 torch/csrc/autograd/python_nested_functions.h |   4 +-
 .../python_nested_functions_manual.cpp        |  44 ++++++++
 torch/csrc/utils/nested.cpp                   |  91 ++++++++++++++++
 torch/csrc/utils/nested.h                     |  17 +++
 torch/nested/__init__.py                      | 101 ++++++++++--------
 12 files changed, 433 insertions(+), 88 deletions(-)
 create mode 100644 torch/csrc/autograd/python_nested_functions_manual.cpp
 create mode 100644 torch/csrc/utils/nested.cpp
 create mode 100644 torch/csrc/utils/nested.h

diff --git a/aten/src/ATen/native/nested/NestedTensorFactories.cpp b/aten/src/ATen/native/nested/NestedTensorFactories.cpp
index 15473c02be19e..b45fbb24880ce 100644
--- a/aten/src/ATen/native/nested/NestedTensorFactories.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorFactories.cpp
@@ -114,7 +114,8 @@ Tensor& copy_nested_(Tensor& self, const Tensor& src, bool non_blocking) {
   const auto* nt_self = get_nested_tensor_impl(self);
   const auto* nt_src = get_nested_tensor_impl(src);
   TORCH_CHECK(
-      at::equal(nt_self->get_nested_size_tensor(), nt_src->get_nested_size_tensor()),
+      at::equal(
+          nt_self->get_nested_size_tensor(), nt_src->get_nested_size_tensor()),
       "copy_ only supports tensors that are the same size for Nested implementations");
   nt_self->get_buffer().copy_(nt_src->get_buffer(), non_blocking);
   return self;
diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.h b/aten/src/ATen/native/nested/NestedTensorUtils.h
index ff8ec37dfc521..6590db9116e09 100644
--- a/aten/src/ATen/native/nested/NestedTensorUtils.h
+++ b/aten/src/ATen/native/nested/NestedTensorUtils.h
@@ -1,6 +1,8 @@
 #pragma once
 
+#include <ATen/Dispatch.h>
 #include <ATen/NestedTensorImpl.h>
+#include <ATen/Parallel.h>
 #include <ATen/core/Tensor.h>
 #include <c10/core/DispatchKeySet.h>
 #include <c10/core/TensorImpl.h>
@@ -8,10 +10,12 @@
 #include <c10/util/Exception.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
+
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/cat.h>
+#include <ATen/ops/empty.h>
 #include <ATen/ops/ones_native.h>
 #include <ATen/ops/prod.h>
 #include <ATen/ops/stack_native.h>
@@ -56,10 +60,11 @@ inline at::Tensor wrap_buffer(
     at::Tensor nested_stride_tensor,
     const std::vector<int64_t>& offsets) {
   std::vector<int64_t> offsets_copy(offsets);
-  return wrap_buffer(buffer,
-                     nested_size_tensor,
-                     nested_stride_tensor,
-                     std::move(offsets_copy));
+  return wrap_buffer(
+      buffer,
+      nested_size_tensor,
+      nested_stride_tensor,
+      std::move(offsets_copy));
 }
 
 inline at::Tensor get_buffer(const at::Tensor& tensor) {
@@ -320,17 +325,84 @@ inline Tensor wrap_tensor_node(
   if (tensor_node.degree() == 0) {
     return wrap_buffer(ones({0}, dtype, layout, device), ones({}));
   }
-  std::vector<Tensor> sizes;
-  std::vector<Tensor> flat_tensors;
+
+  // Fast path: if all tensors are on CPU, have contiguous memory, and the same
+  // dtype, copying can be done much faster.
+  bool all_tensors_cpu = true;
+  bool all_tensors_contiguous = true;
+  bool all_tensors_same_dtype = true;
+  auto first_dtype = tensor_node.children(0).dtype();
+  std::vector<long> start_offsets(tensor_node.degree());
+  start_offsets[0] = 0;
+  long total_size = 0;
   for (const auto i : c10::irange(tensor_node.degree())) {
-    flat_tensors.push_back(tensor_node.children(i).reshape(-1).contiguous());
-    sizes.push_back(tensor(c10::IntArrayRef(tensor_node.children(i).sizes())));
+    all_tensors_cpu = all_tensors_cpu && tensor_node.children(i).is_cpu();
+    all_tensors_contiguous =
+        all_tensors_contiguous && tensor_node.children(i).is_contiguous();
+    all_tensors_same_dtype = all_tensors_same_dtype &&
+        (first_dtype == tensor_node.children(i).dtype());
+    if (!(all_tensors_cpu && all_tensors_contiguous &&
+          all_tensors_same_dtype)) {
+      break;
+    }
+    if (i > 0) {
+      start_offsets[i] =
+          start_offsets[i - 1] + tensor_node.children(i - 1).numel();
+    }
+    total_size += tensor_node.children(i).numel();
   }
 
-  TensorOptions options = flat_tensors[0].options().merge_in(options_);
+  TensorOptions options;
+  Tensor nt_buffer, nt_sizes;
+  if (all_tensors_cpu && all_tensors_contiguous && all_tensors_same_dtype) {
+    nt_buffer = at::empty({total_size}, tensor_node.children(0).options());
+    nt_sizes = at::empty(
+        {static_cast<long>(tensor_node.degree()),
+         static_cast<long>(tensor_node.children(0).sizes().size())},
+        TensorOptions().dtype(kLong));
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
+        at::ScalarType::Half,
+        at::ScalarType::Bool,
+        at::ScalarType::BFloat16,
+        c10::typeMetaToScalarType(first_dtype),
+        "create_nt_buffer",
+        [&]() {
+          at::parallel_for(
+              0, tensor_node.degree(), 1, [&](int64_t begin, int64_t end) {
+                for (int64_t i = begin; i < end; ++i) {
+                  // Only try copying memory if there is more than 0 elements
+                  // for a certain tensor
+                  if (tensor_node.children(i).numel() > 0) {
+                    memcpy(
+                        nt_buffer.data_ptr<scalar_t>() + start_offsets[i],
+                        tensor_node.children(i).data_ptr<scalar_t>(),
+                        tensor_node.children(i).numel() * sizeof(scalar_t));
+                  }
+                }
+              });
+        });
+    long sizes_offset = 0;
+    for (size_t i = 0; i < tensor_node.degree(); ++i) {
+      auto tensor_sizes = tensor_node.children(i).sizes();
+      for (size_t j = 0; j < tensor_sizes.size(); ++j) {
+        nt_sizes.data_ptr<int64_t>()[sizes_offset++] = tensor_sizes[j];
+      }
+    }
+    options = nt_buffer.options().merge_in(options_);
+  } else { // Slow path
+    std::vector<Tensor> flat_tensors;
+    std::vector<Tensor> sizes;
+    for (const auto i : c10::irange(tensor_node.degree())) {
+      flat_tensors.push_back(tensor_node.children(i).reshape(-1).contiguous());
+      sizes.push_back(
+          tensor(c10::IntArrayRef(tensor_node.children(i).sizes())));
+    }
+    options = flat_tensors[0].options().merge_in(options_);
+    nt_buffer = at::cat(flat_tensors);
+    nt_sizes = at::native::stack(sizes);
+  }
 
-  return wrap_buffer(
-      at::cat(flat_tensors).to(options), at::native::stack(sizes));
+  return wrap_buffer(nt_buffer.to(options), nt_sizes);
 }
 
 } // namespace impl
diff --git a/build_variables.bzl b/build_variables.bzl
index fe72453878ed7..e476341b9ac0e 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -899,6 +899,7 @@ libtorch_python_core_sources = [
     "torch/csrc/autograd/python_function.cpp",
     "torch/csrc/autograd/python_hook.cpp",
     "torch/csrc/autograd/python_legacy_variable.cpp",
+    "torch/csrc/autograd/python_nested_functions_manual.cpp",
     "torch/csrc/autograd/python_torch_functions_manual.cpp",
     "torch/csrc/autograd/python_variable.cpp",
     "torch/csrc/autograd/python_variable_indexing.cpp",
@@ -960,6 +961,7 @@ libtorch_python_core_sources = [
     "torch/csrc/utils.cpp",
     "torch/csrc/utils/cuda_lazy_init.cpp",
     "torch/csrc/utils/invalid_arguments.cpp",
+    "torch/csrc/utils/nested.cpp",
     "torch/csrc/utils/object_ptr.cpp",
     "torch/csrc/utils/python_arg_parser.cpp",
     "torch/csrc/utils/python_dispatch.cpp",
diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index 20edd93d7dc2c..a8dd659a4edd1 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -525,6 +525,7 @@
     "Optional"
   ],
   "torch.nested": [
+    "nested_tensor",
     "to_padded_tensor"
   ],
   "torch.nn.common_types": [
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index f914fa57dd9a6..590f517eaceef 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -1,9 +1,10 @@
 # Owner(s): ["module: nestedtensor"]
 
-import torch
-import torch.nn
 import unittest
+
 import numpy as np
+import torch
+import torch.nn
 from torch.testing._internal.common_device_type import (
     dtypes,
     dtypesIfCUDA,
@@ -16,11 +17,12 @@
 from torch.testing._internal.common_utils import (
     freeze_rng_state,
     gradcheck,
+    instantiate_parametrized_tests,
     IS_FBCODE,
     parametrize,
     run_tests,
-    TestCase,
     subtest,
+    TestCase,
 )
 
 # Tests are ported from pytorch/nestedtensor.
@@ -94,6 +96,76 @@ def random_nt(device, dtype, num_tensors, max_dims, min_dims=None):
 
 
 class TestNestedTensor(TestCase):
+    @parametrize("batch_size", [2, 4])
+    @parametrize("max_seq_len", [3, 5])
+    @parametrize("vocab_size", [10, 20])
+    def test_2d_nested_tensor(self, batch_size, max_seq_len, vocab_size):
+        data = []
+        nested_tensor_ref_list = []
+        for _ in range(batch_size):
+            if max_seq_len == 0:
+                length = 0
+            else:
+                length = np.random.randint(low=1, high=max_seq_len)
+            row = list(np.random.randint(low=0, high=vocab_size, size=(length,)))
+            data.append(row)
+            nested_tensor_ref_list.append(torch.tensor(row))
+        nested_tensor = torch.nested.nested_tensor(data, dtype=torch.int64)
+        nested_tensor_list = nested_tensor.unbind()
+        for id in range(batch_size):
+            self.assertEqual(
+                nested_tensor_list[id],
+                nested_tensor_ref_list[id].type(torch.int64)
+            )
+
+    @parametrize("batch_size", [2, 4])
+    @parametrize("max_seq_len", [3, 5])
+    @parametrize("vocab_size", [10, 20])
+    def test_3d_nested_tensor(self, batch_size, max_seq_len, vocab_size):
+        data = []
+        nested_tensor_ref_list = []
+        for _ in range(batch_size):
+            if max_seq_len == 0:
+                length = 0
+            else:
+                length = np.random.randint(low=1, high=max_seq_len)
+            row = list(np.random.randint(low=0, high=vocab_size, size=(length,)))
+            row = [list(item * np.arange(max_seq_len)) for item in row]
+            data.append(row)
+            nested_tensor_ref_list.append(torch.Tensor(row))
+        nested_tensor = torch.nested.nested_tensor(data, dtype=torch.int64)
+        nested_tensor_list = nested_tensor.unbind()
+        for id in range(batch_size):
+            self.assertEqual(
+                nested_tensor_list[id],
+                nested_tensor_ref_list[id].type(torch.int64)
+            )
+
+    @parametrize("batch_size", [2, 4])
+    @parametrize("max_seq_len", [3, 5])
+    @parametrize("vocab_size", [10, 20])
+    def test_3d_nested_tensor_float(self, batch_size, max_seq_len, vocab_size):
+        data = []
+        nested_tensor_ref_list = []
+        for _ in range(batch_size):
+            if max_seq_len == 0:
+                length = 0
+            else:
+                length = np.random.randint(low=1, high=max_seq_len)
+            row = list(
+                np.random.randint(low=0, high=vocab_size, size=(length,)).astype(float)
+            )
+            row = [list(item * np.arange(max_seq_len)) for item in row]
+            data.append(row)
+            nested_tensor_ref_list.append(torch.Tensor(row))
+        nested_tensor = torch.nested.nested_tensor(data, dtype=torch.float)
+        nested_tensor_list = nested_tensor.unbind()
+        for id in range(batch_size):
+            self.assertEqual(
+                nested_tensor_list[id],
+                nested_tensor_ref_list[id].type(torch.float)
+            )
+
 
     @torch.inference_mode()
     def _test_unbind_case(self, a, b):
@@ -151,7 +223,6 @@ def _test_fn(unbind_fn):
 
     @torch.inference_mode()
     def test_nested_tensor(self):
-        self.assertRaises(TypeError, lambda: torch.nested.nested_tensor([3.0]))
         self.assertRaises(TypeError, lambda: torch.nested.nested_tensor(torch.tensor([3.0])))
         self.assertRaises(TypeError, lambda: torch.nested.nested_tensor(4.0))
 
@@ -227,9 +298,7 @@ def test_size(self):
             a1 = constructor([])
             self.assertRaisesRegex(
                 RuntimeError,
-                "Tensors of type NestedTensorImpl do not have sym sizes"
-                if IS_FBCODE
-                else "NestedTensorImpl doesn't support sizes",
+                "NestedTensorImpl doesn't support sizes",
                 lambda: a1.size(),
             )
 
@@ -2241,6 +2310,7 @@ def test_indexing_backward(self, device):
         self.assertEqual(nt.grad, expected_grad)
 
 
+instantiate_parametrized_tests(TestNestedTensor)
 instantiate_device_type_tests(TestNestedTensorDeviceType, globals())
 instantiate_device_type_tests(TestNestedTensorAutograd, globals())
 
diff --git a/tools/autograd/templates/python_nested_functions.cpp b/tools/autograd/templates/python_nested_functions.cpp
index cdfc4336163f4..5515ca6f8a0b3 100644
--- a/tools/autograd/templates/python_nested_functions.cpp
+++ b/tools/autograd/templates/python_nested_functions.cpp
@@ -4,7 +4,7 @@
 #include "torch/csrc/Device.h"
 #include "torch/csrc/DynamicTypes.h"
 #include "torch/csrc/Exceptions.h"
-#include "torch/csrc/autograd/python_special_functions.h"
+#include "torch/csrc/autograd/python_nested_functions.h"
 #include "torch/csrc/autograd/python_return_types.h"
 #include "torch/csrc/autograd/python_variable.h"
 #include "torch/csrc/autograd/utils/wrap_outputs.h"
@@ -47,6 +47,7 @@ namespace torch { namespace autograd {
 ${py_forwards}
 
 static PyMethodDef nested_functions[] = {
+  {NULL, NULL, 0, NULL},
   ${py_method_defs}
   {NULL}
 };
@@ -54,6 +55,7 @@ static PyMethodDef nested_functions[] = {
 static PyObject* THPNestedVariableFunctionsModule = NULL;
 
 void initNestedFunctions(PyObject* module) {
+  nested_functions[0] = get_nested_functions_manual()[0];
   static struct PyModuleDef def = {
      PyModuleDef_HEAD_INIT,
      "torch._C._nested",
diff --git a/torch/csrc/api/include/torch/nested.h b/torch/csrc/api/include/torch/nested.h
index 1730583d5e149..d91c878348bd5 100644
--- a/torch/csrc/api/include/torch/nested.h
+++ b/torch/csrc/api/include/torch/nested.h
@@ -1,7 +1,9 @@
 #pragma once
 
 #include <ATen/ATen.h>
-#include <torch/torch.h>
+#include <ATen/core/ATen_fwd.h>
+#include <torch/csrc/api/include/torch/detail/TensorDataContainer.h>
+#include <algorithm>
 
 namespace torch {
 namespace nested {
@@ -12,19 +14,51 @@ namespace nested {
 /// https://pytorch.org/docs/master/nested.html#torch.nested.nested_tensor
 ///
 /// ```
-inline Tensor nested_tensor(
-    TensorList list,
-    c10::optional<ScalarType> dtype = c10::nullopt,
-    c10::optional<Device> device = c10::nullopt,
-    c10::optional<bool> requires_grad = false,
-    c10::optional<bool> pin_memory = false) {
-  std::vector<Tensor> new_list;
-  for (const auto i : c10::irange(list.size())) {
-    new_list.push_back(list[i].clone().detach());
+// implemented on python object to allow torch.nested.nested_tensor to be
+// constructed with arbitrarily nested python objects - for now, only arbitrary
+// python lists and lists of Tensors
+// See torch/csrc/autograd/python_nested_functions_manual.cpp for Python
+// implementation
+// See here for C++ implementation
+inline at::Tensor nested_tensor(
+    at::TensorList nested_tensor_data,
+    const at::TensorOptions& options = {}) {
+  auto out = at::_nested_tensor_from_tensor_list(
+      nested_tensor_data,
+      c10::typeMetaToScalarType(options.dtype()),
+      c10::nullopt,
+      options.device(),
+      options.pinned_memory());
+  if (options.has_requires_grad() && options.requires_grad()) {
+    out.requires_grad_(true);
+  }
+  return out;
+}
+
+inline at::Tensor nested_tensor(
+    at::ArrayRef<detail::TensorDataContainer> nested_tensor_data,
+    const at::TensorOptions& options = {}) {
+  for (const auto& tdc : nested_tensor_data) {
+    TORCH_CHECK(
+        tdc.is_init_list(),
+        "nested_tensor() not implemented for these parameters");
   }
-  auto out = torch::_nested_tensor_from_tensor_list(
-      new_list, dtype, c10::nullopt, device, pin_memory);
-  if (requires_grad.has_value() && requires_grad.value()) {
+  // Construct a TensorList using nested_tensor_data
+  std::vector<at::Tensor> tensor_list(nested_tensor_data.size());
+  std::transform(
+      nested_tensor_data.begin(),
+      nested_tensor_data.end(),
+      tensor_list.begin(),
+      [&](const detail::TensorDataContainer& tdc) {
+        return tdc.convert_to_tensor(options);
+      });
+  auto out = at::_nested_tensor_from_tensor_list(
+      tensor_list,
+      c10::typeMetaToScalarType(options.dtype()),
+      c10::nullopt,
+      options.device(),
+      options.pinned_memory());
+  if (options.has_requires_grad() && options.requires_grad()) {
     out.requires_grad_(true);
   }
   return out;
@@ -36,10 +70,10 @@ inline Tensor nested_tensor(
 /// https://pytorch.org/docs/master/nested.html#torch.nested.as_nested_tensor
 ///
 /// ```
-inline Tensor as_nested_tensor(
-    TensorList list,
-    c10::optional<ScalarType> dtype = c10::nullopt,
-    c10::optional<Device> device = c10::nullopt) {
+inline at::Tensor as_nested_tensor(
+    at::TensorList list,
+    c10::optional<at::ScalarType> dtype = c10::nullopt,
+    c10::optional<at::Device> device = c10::nullopt) {
   return at::_nested_tensor_from_tensor_list(
       list, dtype, c10::nullopt, device, c10::nullopt);
 }
@@ -50,11 +84,11 @@ inline Tensor as_nested_tensor(
 /// https://pytorch.org/docs/master/nested.html#torch.nested.to_padded_tensor
 ///
 /// ```
-inline Tensor to_padded_tensor(
-    const Tensor& self,
+inline at::Tensor to_padded_tensor(
+    const at::Tensor& self,
     double padding,
-    OptionalIntArrayRef output_size = c10::nullopt) {
-  return torch::nested_to_padded_tensor(self, padding, output_size);
+    at::OptionalIntArrayRef output_size = c10::nullopt) {
+  return at::nested_to_padded_tensor(self, padding, output_size);
 }
 
 } // namespace nested
diff --git a/torch/csrc/autograd/python_nested_functions.h b/torch/csrc/autograd/python_nested_functions.h
index 8b0bf9c115d17..6a86a3a7a1fe0 100644
--- a/torch/csrc/autograd/python_nested_functions.h
+++ b/torch/csrc/autograd/python_nested_functions.h
@@ -3,7 +3,9 @@
 namespace torch {
 namespace autograd {
 
+PyMethodDef* get_nested_functions_manual();
+
 void initNestedFunctions(PyObject* module);
 
-}
+} // namespace autograd
 } // namespace torch
diff --git a/torch/csrc/autograd/python_nested_functions_manual.cpp b/torch/csrc/autograd/python_nested_functions_manual.cpp
new file mode 100644
index 0000000000000..0e1823e192b3a
--- /dev/null
+++ b/torch/csrc/autograd/python_nested_functions_manual.cpp
@@ -0,0 +1,44 @@
+#include <torch/csrc/utils/nested.h>
+#include <torch/csrc/utils/pycfunction_helpers.h>
+#include <torch/csrc/utils/python_arg_parser.h>
+#include <torch/torch.h>
+
+namespace torch {
+namespace autograd {
+
+static PyObject* THPVariable_nested_tensor(
+    PyObject* /*self*/,
+    PyObject* args,
+    PyObject* kwargs) {
+  HANDLE_TH_ERRORS
+  static PythonArgParser parser({
+      "nested_tensor(PyObject* data, *, ScalarType dtype=None, Device? device=None, bool pin_memory=False, bool requires_grad=False)",
+  });
+
+  constexpr int ctor_num_args = 5;
+  ParsedArgs<ctor_num_args> parsed_args;
+  auto r = parser.parse(args, kwargs, parsed_args);
+
+  jit::tracer::warn(
+      "torch.nested.nested_tensor", jit::tracer::WARN_CONSTRUCTOR);
+  return THPVariable_Wrap(torch::utils::nested_tensor_ctor(
+      torch::tensors::get_default_dispatch_key(),
+      torch::tensors::get_default_scalar_type(),
+      r));
+  END_HANDLE_TH_ERRORS
+}
+
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
+static PyMethodDef nested_functions_manual[] = {
+    {"nested_tensor",
+     castPyCFunctionWithKeywords(THPVariable_nested_tensor),
+     METH_VARARGS | METH_KEYWORDS,
+     nullptr},
+};
+
+PyMethodDef* get_nested_functions_manual() {
+  return nested_functions_manual;
+}
+
+} // namespace autograd
+} // namespace torch
diff --git a/torch/csrc/utils/nested.cpp b/torch/csrc/utils/nested.cpp
new file mode 100644
index 0000000000000..d0619bd1f6554
--- /dev/null
+++ b/torch/csrc/utils/nested.cpp
@@ -0,0 +1,91 @@
+#include <ATen/ATen.h>
+#include <ATen/NestedTensorImpl.h>
+#include <c10/core/ScalarType.h>
+#include <torch/csrc/python_headers.h>
+#include <torch/csrc/utils/nested.h>
+#include <torch/csrc/utils/pybind.h>
+#include <torch/csrc/utils/tensor_new.h>
+#include <torch/torch.h>
+#include <stdexcept>
+#include <vector>
+
+namespace torch {
+namespace utils {
+
+// NB: device_idx here is NOT a DeviceIndex, but index into PythonArgs
+c10::TensorOptions typeIdWithDefault(
+    PythonArgs& r,
+    int device_idx,
+    c10::DispatchKey dispatch_key) {
+  auto options = dispatchKeyToTensorOptions(dispatch_key);
+  if (!r.isNone(device_idx)) {
+    options = options.device(r.device(device_idx));
+  }
+  return options;
+}
+
+at::Tensor nested_tensor_ctor(
+    c10::DispatchKey dispatch_key,
+    at::ScalarType scalar_type,
+    torch::PythonArgs& r) {
+  TORCH_CHECK(r.idx == 0, "nested_tensor(): invalid arguments");
+
+  PyObject* data = r.pyobject(0);
+  // Check if data is a list: Only List[Tensor] and List[List...[Scalar]] are
+  // accepted for now
+  TORCH_CHECK_TYPE(
+      PyList_Check(data),
+      "Only lists (List[Tensor] and List[List...[Scalar]]) are accepted in nested_tensor");
+
+  auto dtype_val = r.scalartypeWithDefault(1, scalar_type);
+  auto tensor_options = typeIdWithDefault(r, 2, dispatch_key);
+  bool pin_memory = r.toBool(3);
+  bool args_requires_grad = r.toBool(4);
+
+  TORCH_CHECK(
+      PyList_Size(data) >= 0,
+      "Something went really wrong and your list has negative size");
+
+  // Check whether we are dealing with lists of tensors or not
+  std::vector<at::Tensor> new_list(PyList_Size(data));
+  for (const auto i : c10::irange(PyList_Size(data))) {
+    PyObject* elem = PyList_GetItem(data, i);
+    if (THPVariable_Check(elem)) {
+      new_list[i] = THPVariable_Unpack(PyList_GetItem(data, i)).detach();
+      TORCH_CHECK(
+          !new_list[i].is_nested(),
+          "We do not accept nested tensors as input to nested tensors");
+      TORCH_CHECK(
+          new_list[i].layout() == kStrided,
+          "We do not accept non-strided layouts as input to nested tensors");
+    } else {
+      PythonArgs elem_r(r);
+      std::array<PyObject*, 6> elem_args = {
+          elem, // data
+          r.args[1], // dtpye
+          nullptr, // device (cpu)
+          nullptr, // no pinned memory
+          r.args[4], // requires grad
+          nullptr // names
+      };
+      elem_r.args = elem_args.data();
+      new_list[i] = tensor_ctor(dispatch_key, scalar_type, elem_r);
+    }
+  }
+
+  at::ScalarType final_dtype = dtype_val;
+  if (r.isNone(1) && new_list.size() > 0) {
+    final_dtype = c10::typeMetaToScalarType(new_list[0].dtype());
+  }
+  at::Device final_device = tensor_options.device();
+  if (r.isNone(2) && new_list.size() > 0) {
+    final_device = new_list[0].device();
+  }
+  auto out = at::_nested_tensor_from_tensor_list(
+      new_list, final_dtype, c10::nullopt, final_device, pin_memory);
+  out.requires_grad_(args_requires_grad);
+  return out;
+}
+
+} // namespace utils
+} // namespace torch
diff --git a/torch/csrc/utils/nested.h b/torch/csrc/utils/nested.h
new file mode 100644
index 0000000000000..f3a1061e47123
--- /dev/null
+++ b/torch/csrc/utils/nested.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <torch/csrc/python_headers.h>
+#include <torch/csrc/utils/python_arg_parser.h>
+
+#include <ATen/core/Tensor.h>
+
+namespace torch {
+namespace utils {
+
+at::Tensor nested_tensor_ctor(
+    c10::DispatchKey dispatch_key,
+    at::ScalarType scalar_type,
+    PythonArgs& r);
+
+} // namespace utils
+} // namespace torch
diff --git a/torch/nested/__init__.py b/torch/nested/__init__.py
index 3e03f69803dec..71498187298dd 100644
--- a/torch/nested/__init__.py
+++ b/torch/nested/__init__.py
@@ -1,55 +1,25 @@
 from typing import List, Optional
+
 import torch
-from torch._C import _add_docstr, _nested  # type: ignore[attr-defined]
 from torch import Tensor
+from torch._C import _add_docstr, _nested  # type: ignore[attr-defined]
 
-from torch.types import _dtype as DType
-from torch.types import _device as Device
+from torch.types import _device as Device, _dtype as DType
 
 __all__ = [
-    'to_padded_tensor',
-    'as_nested_tensor',
-    'nested_tensor',
+    "to_padded_tensor",
+    "as_nested_tensor",
+    "nested_tensor",
 ]
 
 # Nested Tensor constructor functions
-# TODO: move these to pybind to accept numpy/nested lists as inputs in the future
-def nested_tensor(tensor_list: List[Tensor], *, dtype: Optional[DType] = None, device: Optional[Device] = None,
-                  requires_grad: Optional[bool] = False, pin_memory: Optional[bool] = False) -> Tensor:
-    r"""
-    Constructs a nested tensor with no autograd history (also known as a “leaf tensor”, see
-    :ref:`Autograd mechanics <autograd-mechanics>`) from :attr:`tensor_list` a list of tensors.
-
-    Args:
-        tensor_list (List[Tensor]): a list of tensors with the same ndim
 
-    Keyword arguments:
-        dtype (:class:`torch.dtype`, optional): the desired type of returned nested tensor.
-            Default: if None, same :class:`torch.dtype` as leftmost tensor in the list.
-        device (:class:`torch.device`, optional): the desired device of returned nested tensor.
-            Default: if None, same :class:`torch.device` as leftmost tensor in the list
-        requires_grad (bool, optional): If autograd should record operations on the
-            returned nested tensor. Default: ``False``.
-        pin_memory (bool, optional): If set, returned nested tensor would be allocated in
-            the pinned memory. Works only for CPU tensors. Default: ``False``.
-
-    Example::
 
-        >>> a = torch.arange(3, dtype=torch.float, requires_grad=True)
-        >>> b = torch.arange(5, dtype=torch.float, requires_grad=True)
-        >>> nt = torch.nested.nested_tensor([a, b], requires_grad=True)
-        >>> nt.is_leaf
-        True
-    """
-    if not isinstance(tensor_list, list) or any([not torch.is_tensor(t) for t in tensor_list]):
-        raise TypeError("nested_tensor(): Expected first argument to be a list of tensors ")
-    new_data = [t.detach() for t in tensor_list]
-    nt = torch._nested_tensor_from_tensor_list(new_data, dtype, None, device, pin_memory)
-    if (requires_grad):
-        nt.requires_grad_(requires_grad)
-    return nt
-
-def as_nested_tensor(tensor_list: List[Tensor], dtype: Optional[DType] = None, device: Optional[Device] = None) -> Tensor:
+def as_nested_tensor(
+    tensor_list: List[Tensor],
+    dtype: Optional[DType] = None,
+    device: Optional[Device] = None,
+) -> Tensor:
     r"""
     Constructs a nested tensor preserving autograd history from :attr:`tensor_list` a list of tensors.
 
@@ -79,15 +49,21 @@ def as_nested_tensor(tensor_list: List[Tensor], dtype: Optional[DType] = None, d
         >>> b.grad
         tensor([0., 0., 0., 0., 0.])
     """
-    if not isinstance(tensor_list, list) or any([not torch.is_tensor(t) for t in tensor_list]):
-        raise TypeError("nested_tensor(): Expected first argument to be a list of tensors ")
+    if not isinstance(tensor_list, list) or any(
+        [not torch.is_tensor(t) for t in tensor_list]
+    ):
+        raise TypeError(
+            "nested_tensor(): Expected first argument to be a list of tensors "
+        )
     return torch._nested_tensor_from_tensor_list(tensor_list, dtype, None, device, None)
 
+
 # Note: This not only adds doc strings for the nested ops, but
 # also connects the torch.nested Python namespace to the torch._C._nested builtins.
 
-to_padded_tensor = _add_docstr(_nested.nested_to_padded_tensor,
-                               r"""
+to_padded_tensor = _add_docstr(
+    _nested.nested_to_padded_tensor,
+    r"""
 to_padded_tensor(input, padding, output_size=None, out=None) -> Tensor
 
 Returns a new (non-nested) Tensor by padding the :attr:`input` nested tensor.
@@ -137,4 +113,37 @@ def as_nested_tensor(tensor_list: List[Tensor], dtype: Optional[DType] = None, d
     >>> pt_small = torch.nested.to_padded_tensor(nt, 2.0, (2, 2, 2))
     RuntimeError: Value in output_size is less than NestedTensor padded size. Truncation is not supported.
 
-""")
+""",
+)
+
+nested_tensor = _add_docstr(
+    _nested.nested_tensor,
+    r"""
+nested_tensor(tensor_list, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor
+
+Constructs a nested tensor with no autograd history (also known as a “leaf tensor”, see
+:ref:`Autograd mechanics <autograd-mechanics>`) from :attr:`tensor_list` a list of tensors.
+
+Args:
+    tensor_list (List[array_like]): a list of tensors (or anything that can be passed to torch.tensor)
+    where their first dimension can be of irregular size, but all other dimensions have to be equal.
+
+Keyword arguments:
+    dtype (:class:`torch.dtype`, optional): the desired type of returned nested tensor.
+        Default: if None, same :class:`torch.dtype` as leftmost tensor in the list.
+    device (:class:`torch.device`, optional): the desired device of returned nested tensor.
+        Default: if None, same :class:`torch.device` as leftmost tensor in the list
+    requires_grad (bool, optional): If autograd should record operations on the
+        returned nested tensor. Default: ``False``.
+    pin_memory (bool, optional): If set, returned nested tensor would be allocated in
+        the pinned memory. Works only for CPU tensors. Default: ``False``.
+
+Example::
+
+    >>> a = torch.arange(3, dtype=torch.float, requires_grad=True)
+    >>> b = torch.arange(5, dtype=torch.float, requires_grad=True)
+    >>> nt = torch.nested.nested_tensor([a, b], requires_grad=True)
+    >>> nt.is_leaf
+    True
+    """,
+)

From 6156ba469e8d30f4570e018c73a6cfb371af1b6e Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 8 Nov 2022 01:17:35 +0000
Subject: [PATCH 0644/1922] [Reland] Use sudo when reset NVIDIA devices 
 (#88605)

I accidentally delete my remote branch, so I need to create a new PR for this fix (instead of updating the reverted PR https://github.com/pytorch/pytorch/pull/88531)

TIL, sudo echo doesn't do that I think it does, the correct syntax should be `echo "1" | sudo tee /sys/bus/pci/devices/$PCI_ID/reset` granting sudo permission to the latter tee command.

### Testing

Due diligence and actually login to `i-07e62045d15df3629` and make sure that the command works
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88605
Approved by: https://github.com/ZainRizvi
---
 .github/scripts/install_nvidia_utils_linux.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
index 3a2805d91d5ab..37c6dccd4811f 100755
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ b/.github/scripts/install_nvidia_utils_linux.sh
@@ -74,7 +74,8 @@ install_nvidia_driver_amzn2() {
                     DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)
 
                     echo "Reseting $PCI_ID (enabled state: $DEVICE_ENABLED)"
-                    echo "1" > /sys/bus/pci/devices/$PCI_ID/reset
+                    # This requires sudo permission of course
+                    echo "1" | sudo tee /sys/bus/pci/devices/$PCI_ID/reset
                     sleep 1
                 done
             fi

From 4fcd308336edd37c0e6ce3dd5162cdb7c97d7f8c Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Tue, 8 Nov 2022 01:33:36 +0000
Subject: [PATCH 0645/1922] Fix typos used in documents under torch directory
 (#88483)

This PR fixes typos, in comments of Python files, that are found from a search box at https://pytorch.org/docs/master/search.html.
This is a follow-up of #88300.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88483
Approved by: https://github.com/kit1980
---
 torch/distributions/lkj_cholesky.py                             | 2 +-
 torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/distributions/lkj_cholesky.py b/torch/distributions/lkj_cholesky.py
index 1cc5bfa84081d..930d038161193 100644
--- a/torch/distributions/lkj_cholesky.py
+++ b/torch/distributions/lkj_cholesky.py
@@ -21,7 +21,7 @@ class LKJCholesky(Distribution):
     LKJ distribution for lower Cholesky factor of correlation matrices.
     The distribution is controlled by ``concentration`` parameter :math:`\eta`
     to make the probability of the correlation matrix :math:`M` generated from
-    a Cholesky factor propotional to :math:`\det(M)^{\eta - 1}`. Because of that,
+    a Cholesky factor proportional to :math:`\det(M)^{\eta - 1}`. Because of that,
     when ``concentration == 1``, we have a uniform distribution over Cholesky
     factors of correlation matrices. Note that this distribution samples the
     Cholesky factor of correlation matrices and not the correlation matrices
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
index 04663d915c26d..378dd27c65ba5 100644
--- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
@@ -251,7 +251,7 @@ def as_standardized(self) -> "CallgrindStats":
             -23234231 /tmp/second_build_dir/thing.c:foo(...)
 
         Stripping prefixes can ameliorate this issue by regularizing the
-        strings and causing better cancellation of equivilent call sites
+        strings and causing better cancellation of equivalent call sites
         when diffing.
         """
         def strip(stats: FunctionCounts) -> FunctionCounts:

From c742b73caaf946d3e77312c3210ace39999c2e0b Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 8 Nov 2022 01:46:25 +0000
Subject: [PATCH 0646/1922] Do not use double for single-prec upsample (#88277)

I'm not sure, what would be the best behaviour here, but it feels a bit strange to perform parts of `float32` computations as `float64` and then downcast them back to `float32`.

Use `at::opmath_type` rather than `at:acc_type` as no accumulation is used in the op.

I don't know much about double vs single precision scalar perf on x86 CPU, but before the change:
```
python -c "import timeit;import torch;x=torch.arange(100, dtype=torch.float32).reshape(1, 1, 10, 10); print(timeit.Timer(stmt='torch.nn.functional.interpolate(x, scale_factor=2.0, mode=\"bilinear\", align_corners=False)', globals={'x':x, 'torch':torch}).timeit())"
11.337517574429512
```
After the change:
```
$ python -c "import timeit;import torch;x=torch.arange(100, dtype=torch.float32).reshape(1, 1, 10, 10); print(timeit.Timer(stmt='torch.nn.functional.interpolate(x, scale_factor=2.0, mode=\"bilinear\", align_corners=False)', globals={'x':x, 'torch':torch}).timeit())"
10.513805857859552
```
I.e. roughly 7% perf degradation (measured on Intel(R) Xeon(R) Platinum 8275CL CPU @ 3.00GHz)

NOTE:
 - `aten::acc_type<float, false>` yields `double`
 - `aten::acc_type<float, true>` return `float`.

Fixes https://github.com/pytorch/pytorch/issues/87968

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88277
Approved by: https://github.com/mingfeima, https://github.com/ngimel, https://github.com/jgong5
---
 aten/src/ATen/native/UpSample.h               | 16 +++----
 aten/src/ATen/native/cpu/UpSampleKernel.cpp   | 44 +++++++++----------
 .../ATen/native/cpu/UpSampleMoreKernel.cpp    |  4 +-
 3 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h
index f3dd836444d13..144b5921eed33 100644
--- a/aten/src/ATen/native/UpSample.h
+++ b/aten/src/ATen/native/UpSample.h
@@ -2,7 +2,7 @@
 
 #include <math.h>
 
-#include <ATen/AccumulateType.h>
+#include <ATen/OpMathType.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/core/Tensor.h>
 #include <ATen/native/DispatchStub.h>
@@ -266,15 +266,13 @@ static inline scalar_t area_pixel_compute_scale(
     bool align_corners,
     const c10::optional<double> scale) {
   // see Note [area_pixel_compute_scale]
-  if(align_corners){
+  if(align_corners) {
     if(output_size > 1) {
       return static_cast<scalar_t>(input_size - 1) / (output_size - 1);
-    }
-    else {
+    } else {
       return static_cast<scalar_t>(0);
     }
-  }
-  else{
+  } else {
     return compute_scales_value<scalar_t>(scale, input_size, output_size);
   }
 }
@@ -447,9 +445,9 @@ static inline void compute_source_index_and_lambda(
     lambda0 = static_cast<scalar_t>(1);
     lambda1 = static_cast<scalar_t>(0);
   } else {
-    using accscalar_t = at::acc_type<scalar_t, false>;
-    const accscalar_t real_input_index =
-        area_pixel_compute_source_index<accscalar_t>(
+    using opmath_t = at::opmath_type<scalar_t>;
+    const auto real_input_index =
+        area_pixel_compute_source_index<opmath_t>(
             ratio, output_index, align_corners, /*cubic=*/false);
     input_index0 = static_cast<int64_t>(real_input_index);
     int64_t offset = (input_index0 < input_size - 1) ? 1 : 0;
diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
index 75cefe425ebbc..7eb7cf5e58bbf 100644
--- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp
+++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
@@ -471,7 +471,7 @@ void cpu_upsample_linear_channels_last(
   TORCH_CHECK(channels > 0, "expected input and output channels greater than 0 but got ", channels);
   int64_t output_slice_size = output_depth * output_height * output_width * channels;
 
-  using accscalar_t = at::acc_type<scalar_t, false>;
+  using opmath_t = at::opmath_type<scalar_t>;
   using Vec = vec::Vectorized<scalar_t>;
   auto loop2d = [&](int64_t begin, int64_t end) {
     const scalar_t height_scale = area_pixel_compute_scale<scalar_t>(
@@ -501,10 +501,10 @@ void cpu_upsample_linear_channels_last(
           scalar_t* i01 = input_indexr(n, ih0, iw1);
           scalar_t* i10 = input_indexr(n, ih1, iw0);
           scalar_t* i11 = input_indexr(n, ih1, iw1);
-          accscalar_t w00 = h0lambda * w0lambda;
-          accscalar_t w01 = h0lambda * w1lambda;
-          accscalar_t w10 = h1lambda * w0lambda;
-          accscalar_t w11 = h1lambda * w1lambda;
+          opmath_t w00 = h0lambda * w0lambda;
+          opmath_t w01 = h0lambda * w1lambda;
+          opmath_t w10 = h1lambda * w0lambda;
+          opmath_t w11 = h1lambda * w1lambda;
 
           int64_t size = channels;
           int64_t d = 0;
@@ -559,14 +559,14 @@ void cpu_upsample_linear_channels_last(
             scalar_t* i101 = input_indexr(n, id1, ih0, iw1);
             scalar_t* i110 = input_indexr(n, id1, ih1, iw0);
             scalar_t* i111 = input_indexr(n, id1, ih1, iw1);
-            accscalar_t w000 = d0lambda * h0lambda * w0lambda;
-            accscalar_t w001 = d0lambda * h0lambda * w1lambda;
-            accscalar_t w010 = d0lambda * h1lambda * w0lambda;
-            accscalar_t w011 = d0lambda * h1lambda * w1lambda;
-            accscalar_t w100 = d1lambda * h0lambda * w0lambda;
-            accscalar_t w101 = d1lambda * h0lambda * w1lambda;
-            accscalar_t w110 = d1lambda * h1lambda * w0lambda;
-            accscalar_t w111 = d1lambda * h1lambda * w1lambda;
+            opmath_t w000 = d0lambda * h0lambda * w0lambda;
+            opmath_t w001 = d0lambda * h0lambda * w1lambda;
+            opmath_t w010 = d0lambda * h1lambda * w0lambda;
+            opmath_t w011 = d0lambda * h1lambda * w1lambda;
+            opmath_t w100 = d1lambda * h0lambda * w0lambda;
+            opmath_t w101 = d1lambda * h0lambda * w1lambda;
+            opmath_t w110 = d1lambda * h1lambda * w0lambda;
+            opmath_t w111 = d1lambda * h1lambda * w1lambda;
 
             int64_t size = channels;
             int64_t d = 0;
@@ -775,10 +775,10 @@ struct HelperInterpNearest : public HelperInterpBase {
         // index_f32 = (output_index) * scale
         // input_index = floor(index_f32)
         // Same as OpenCV INTER_NEAREST
-        using accscalar_t = at::acc_type<scalar_t, false>;
+        using opmath_t = at::opmath_type<scalar_t>;
         for (const auto i : c10::irange(output_size)) {
-          const accscalar_t real_input_index =
-              area_pixel_compute_source_index<accscalar_t>(
+          const auto real_input_index =
+              area_pixel_compute_source_index<opmath_t>(
                   scale, i, /*align_corners=*/true, /*cubic=*/false);
           input_index = static_cast<int64_t>(floorf(real_input_index));
           input_index_ptr[i] = static_cast<int64_t>(std::min(input_index, input_size - 1)) * stride;
@@ -826,10 +826,10 @@ struct HelperInterpNearestExact : public HelperInterpNearest {
         // index_f32 = (output_index + 0.5) * scale - 0.5
         // input_index = round(index_f32)
         // Same as Pillow and Scikit-Image/Scipy ndi.zoom
-        using accscalar_t = at::acc_type<scalar_t, false>;
+        using opmath_t = at::opmath_type<scalar_t>;
         for (const auto i : c10::irange(output_size)) {
-          const accscalar_t real_input_index =
-              area_pixel_compute_source_index<accscalar_t>(
+          const auto real_input_index =
+              area_pixel_compute_source_index<opmath_t>(
                   scale, i, /*align_corners=*/align_corners, /*cubic=*/false);
           input_index = static_cast<int64_t>(floorf(real_input_index + 0.5));
           input_index_ptr[i] = static_cast<int64_t>(std::min(input_index, input_size - 1)) * stride;
@@ -975,10 +975,10 @@ struct HelperInterpCubic : public HelperInterpBase {
 
         int64_t * idx_ptr;
         scalar_t * wt_ptr;
-        using accscalar_t = at::acc_type<scalar_t, false>;
+        using opmath_t = at::opmath_type<scalar_t>;
         for (const auto i : c10::irange(output_size)) {
-          const accscalar_t real_input_index =
-              area_pixel_compute_source_index<accscalar_t>(
+          const auto real_input_index =
+              area_pixel_compute_source_index<opmath_t>(
                   scale, i, align_corners, /*cubic=*/true);
           input_index = static_cast<int64_t>(floorf(real_input_index));
           get_cubic_upsample_coefficients<scalar_t>(coeffs, real_input_index - input_index);
diff --git a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp
index a26cef72bb10c..c73e0249dee82 100644
--- a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp
+++ b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp
@@ -441,9 +441,9 @@ void cpu_upsample_linear_backward_channels_last(
   int64_t input_width = input_sizes[ndim - 1];
   int64_t output_width = output_sizes[ndim - 1];
 
-  using accscalar_t = at::acc_type<scalar_t, false>;
+  using opmath_t = at::opmath_type<scalar_t>;
   using Vec = vec::Vectorized<scalar_t>;
-  auto acc = [](scalar_t* gin, scalar_t* gout, accscalar_t w, int64_t size) {
+  auto acc = [](scalar_t* gin, scalar_t* gout, opmath_t w, int64_t size) {
     int64_t d = 0;
     for (; d < size - (size % Vec::size()); d += Vec::size()) {
       Vec gin_vec = Vec::loadu(gin + d) + Vec(w) * Vec::loadu(gout + d);

From 612921e5748062991417bc5424a5c09c771d1c7b Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Tue, 8 Nov 2022 02:22:02 +0000
Subject: [PATCH 0647/1922] [LTC] Remove view (#88445)

Summary:
This pull request removes the last view ops, the original view.

Test Plan:
./build/bin/test_lazy --gtest_filter=LazyOpsTest.TestView*

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88445
Approved by: https://github.com/JackCaoG, https://github.com/antoniojkim, https://github.com/Krovatkin
---
 .github/ci_commit_pins/xla.txt                |  2 +-
 aten/src/ATen/native/ts_native_functions.yaml |  5 ++--
 torch/csrc/lazy/core/ir_builder.h             |  8 -------
 torch/csrc/lazy/ts_backend/ir_builder.h       |  4 ----
 .../csrc/lazy/ts_backend/tensor_aten_ops.cpp  | 24 -------------------
 torch/csrc/lazy/ts_backend/tensor_aten_ops.h  |  7 +-----
 .../csrc/lazy/ts_backend/ts_node_lowering.cpp | 10 --------
 7 files changed, 4 insertions(+), 56 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index d6866ce9977c2..c0348df036f69 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-48365abee394c325a3d17c5e234c1b36b878dea3
+3a04b23e26e76b1a7351f382978de3d8a14307a1
diff --git a/aten/src/ATen/native/ts_native_functions.yaml b/aten/src/ATen/native/ts_native_functions.yaml
index d8ea64d56455d..f4c3ee8498960 100644
--- a/aten/src/ATen/native/ts_native_functions.yaml
+++ b/aten/src/ATen/native/ts_native_functions.yaml
@@ -227,10 +227,9 @@ non_native:
       - ShapeCompute
       - TreatScalarsAsConstants
       - CanBeReusedDeclOnly
+  # Even we have removed all the other view ops in favor of the *_copy version, expand
+  # is still kept because it's used in copy_.
   - func: expand(Tensor input, int[] size, bool is_scalar_expand) -> Tensor
-  - func: view(Tensor input, int[] output_size) -> Tensor
-    properties:
-      - ShapeCompute
   - func: cast(Tensor input, ScalarType dtype, ScalarType? stype) -> Tensor
     opkind: ltc_cast
     properties:
diff --git a/torch/csrc/lazy/core/ir_builder.h b/torch/csrc/lazy/core/ir_builder.h
index 95605eab1e995..9cc974236cd8f 100644
--- a/torch/csrc/lazy/core/ir_builder.h
+++ b/torch/csrc/lazy/core/ir_builder.h
@@ -58,9 +58,6 @@ struct IrBuilder {
       const Value& input0,
       const std::vector<int64_t>& size,
       const bool& is_scalar_expand) const = 0;
-  virtual NodePtr MakeView(
-      const Value& input0,
-      const std::vector<int64_t>& output_size) const = 0;
   virtual NodePtr MakeCast(
       const Value& input0,
       const at::ScalarType& dtype,
@@ -96,11 +93,6 @@ static inline NodePtr MakeExpand(
     const bool& is_scalar_expand) {
   return getIrBuilder()->MakeExpand(input0, size, is_scalar_expand);
 }
-static inline NodePtr MakeView(
-    const Value& input0,
-    const std::vector<int64_t>& output_size) {
-  return getIrBuilder()->MakeView(input0, output_size);
-}
 static inline NodePtr MakeCast(
     const Value& input0,
     const at::ScalarType& dtype,
diff --git a/torch/csrc/lazy/ts_backend/ir_builder.h b/torch/csrc/lazy/ts_backend/ir_builder.h
index 067efc784ee5a..1f32a3521ba8a 100644
--- a/torch/csrc/lazy/ts_backend/ir_builder.h
+++ b/torch/csrc/lazy/ts_backend/ir_builder.h
@@ -30,10 +30,6 @@ struct TorchScriptIrBuilder : IrBuilder {
       const bool& is_scalar_expand) const override {
     return ReuseOrMakeNode<Expand>(input0, size, is_scalar_expand);
   }
-  NodePtr MakeView(const Value& input0, const std::vector<int64_t>& output_size)
-      const override {
-    return ReuseOrMakeNode<View>(input0, output_size);
-  }
   NodePtr MakeCast(
       const Value& input0,
       const at::ScalarType& dtype,
diff --git a/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp b/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp
index 15dbebb0e67ad..8970e5354a7fe 100644
--- a/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp
+++ b/torch/csrc/lazy/ts_backend/tensor_aten_ops.cpp
@@ -36,35 +36,11 @@ torch::lazy::Value MaybeExpand(
       /*is_scalar_expand=*/false);
 }
 
-std::vector<int64_t> GetExpandDimensions(
-    const torch::lazy::Shape& shape,
-    std::vector<int64_t> dimensions) {
-  TORCH_CHECK_GE(dimensions.size(), shape.dim()) << shape;
-  int64_t base = dimensions.size() - shape.dim();
-  for (size_t i = 0; i < shape.dim(); ++i) {
-    if (dimensions[base + i] == -1) {
-      dimensions[base + i] = shape.size(i);
-    }
-  }
-  return dimensions;
-}
-
 } // namespace
 
 //////////////////////////////////////////////////////////////////////////////
 // ATEN operators follows here, listed in alphabetical order.
 //////////////////////////////////////////////////////////////////////////////
-torch::lazy::LazyTensorPtr expand(
-    const torch::lazy::LazyTensorPtr& input,
-    std::vector<int64_t> size) {
-  auto input_shape = input->shape();
-  return torch::lazy::LazyTensor::Create(
-      torch::lazy::MakeExpand(
-          input->GetIrValue(),
-          GetExpandDimensions(input_shape.Get(), std::move(size)),
-          /*is_scalar_expand=*/false),
-      input->GetDevice());
-}
 
 void fill_(torch::lazy::LazyTensorPtr& input, const at::Scalar& value) {
   torch::lazy::Value constant =
diff --git a/torch/csrc/lazy/ts_backend/tensor_aten_ops.h b/torch/csrc/lazy/ts_backend/tensor_aten_ops.h
index 0cb16faf6cacc..bf663f4ca6b1b 100644
--- a/torch/csrc/lazy/ts_backend/tensor_aten_ops.h
+++ b/torch/csrc/lazy/ts_backend/tensor_aten_ops.h
@@ -9,14 +9,9 @@ namespace lazy {
 // ATEN operators follows here, listed in alphabetical order.
 //////////////////////////////////////////////////////////////////////////////
 
-torch::lazy::LazyTensorPtr expand(
-    const torch::lazy::LazyTensorPtr& input,
-    std::vector<int64_t> size);
-
+void copy_(torch::lazy::LazyTensorPtr& input, torch::lazy::LazyTensorPtr& src);
 // Fills the input with the given value.
 void fill_(torch::lazy::LazyTensorPtr& input, const at::Scalar& value);
 
-void copy_(torch::lazy::LazyTensorPtr& input, torch::lazy::LazyTensorPtr& src);
-
 } // namespace lazy
 } // namespace torch
diff --git a/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp b/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp
index 8e64cfb6ac13e..12341b69e6543 100644
--- a/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_node_lowering.cpp
@@ -157,15 +157,5 @@ torch::lazy::TSOpVector Scalar::Lower(
   return {loctx->graph()->insertConstant(at::scalar_tensor(value, options))};
 }
 
-// View Ops
-torch::lazy::TSOpVector View::Lower(
-    std::shared_ptr<torch::jit::GraphFunction> function,
-    torch::lazy::TSLoweringContext* loctx) const {
-  std::vector<torch::jit::NamedValue> arguments;
-  arguments.emplace_back(loctx->GetOutputOp(operand(0)));
-  arguments.emplace_back(output_size);
-  return LowerBuiltin(at::aten::reshape, function, arguments);
-}
-
 } // namespace lazy
 } // namespace torch

From 46cd611825d8551fd4ac9cd70a1fdcb7301709a5 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 7 Nov 2022 11:13:07 -0800
Subject: [PATCH 0648/1922] Add support for symbolic shapes to sparse tensor
 (#88573)

Along the way, I undid making sparse/dense dim symint (they're
dimensions, so they should be static.)

Also symintify set_indices_and_values_unsafe

There is a little bit of a nontrivial infra change here: previously, we didn't populate the strides field on sparse tensors. It is now populated with "empty" strides, and this meant that sparse tensors were falsely reporting they were non-overlapping dense/contiguous. I added in a hack to work around this case.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88573
Approved by: https://github.com/anjali411
---
 aten/src/ATen/SparseTensorImpl.cpp           | 16 ++++----
 aten/src/ATen/SparseTensorImpl.h             | 39 ++++++++++++++------
 aten/src/ATen/native/native_functions.yaml   |  4 +-
 aten/src/ATen/native/sparse/SparseTensor.cpp |  8 ++--
 c10/core/DispatchKey.cpp                     |  1 +
 c10/core/TensorImpl.cpp                      | 18 +++++++++
 c10/core/TensorImpl.h                        | 19 ++++++++++
 tools/autograd/derivatives.yaml              |  2 +-
 torch/_subclasses/fake_tensor.py             |  1 +
 9 files changed, 83 insertions(+), 25 deletions(-)

diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp
index 197ae21438967..36c93b706db86 100644
--- a/aten/src/ATen/SparseTensorImpl.cpp
+++ b/aten/src/ATen/SparseTensorImpl.cpp
@@ -89,16 +89,16 @@ void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, cons
   TORCH_CHECK(indices.options().backend() == values.options().backend(), "backend of indices (", indices.options().backend(), ") must match backend of values (", values.options().backend(), ")");
   TORCH_CHECK(!indices.is_cuda() || indices.get_device() == values.get_device(), "device of indices (", indices.get_device(), ") must match device of values (", values.get_device(), ")");
 
-  TORCH_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sizes());
-  TORCH_CHECK(indices.size(1) == values.size(0), "indices and values must have same nnz, but got nnz from indices: ", indices.size(1), ", nnz from values: ", values.size(0));
-  TORCH_CHECK(indices.size(0) == sparse_dim_, "indices has incorrect first dimension, expected ", sparse_dim_, ", got ", indices.size(0));
+  TORCH_CHECK(indices.dim() == 2, "indices must be sparse_dim x nnz, but got: ", indices.sym_sizes());
+  TORCH_CHECK(indices.sym_size(1) == values.sym_size(0), "indices and values must have same nnz, but got nnz from indices: ", indices.sym_size(1), ", nnz from values: ", values.sym_size(0));
+  TORCH_CHECK(indices.sym_size(0) == sparse_dim_, "indices has incorrect first dimension, expected ", sparse_dim_, ", got ", indices.sym_size(0));
   TORCH_CHECK(values.dim() == dense_dim_ + 1, "values has incorrect number of dimensions, expected ", dense_dim_ + 1, ", got ", values.dim());
 
-  auto dense_size_original = sizes().slice(sparse_dim_);
-  std::vector<int64_t> expected_values_size_vec = {values.size(0)};
+  auto dense_size_original = sym_sizes().slice(sparse_dim_);
+  std::vector<c10::SymInt> expected_values_size_vec = {values.sym_size(0)};
   expected_values_size_vec.insert(expected_values_size_vec.end(), dense_size_original.begin(), dense_size_original.end());
-  IntArrayRef expected_values_size(expected_values_size_vec);
-  auto new_values_size = values.sizes();
+  SymIntArrayRef expected_values_size(expected_values_size_vec);
+  auto new_values_size = values.sym_sizes();
   TORCH_CHECK(
     std::equal(expected_values_size.begin(), expected_values_size.end(), new_values_size.begin()),
     "values has incorrect size, expected ", expected_values_size, ", got ", new_values_size
@@ -109,7 +109,7 @@ void SparseTensorImpl::set_indices_and_values_unsafe(const Tensor& indices, cons
   AT_ASSERT(device() == values_.device());
   AT_ASSERT(values_.device() == indices_.device());
 
-  coalesced_ = nnz() < 2;
+  coalesced_ = sym_nnz() < 2;
 }
 
 
diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h
index c36d89be5b610..d90734100ca6c 100644
--- a/aten/src/ATen/SparseTensorImpl.h
+++ b/aten/src/ATen/SparseTensorImpl.h
@@ -9,6 +9,7 @@
 #include <ATen/Functions.h>
 #else
 #include <ATen/ops/empty.h>
+#include <ATen/ops/resize.h>
 #endif
 
 namespace at {
@@ -51,6 +52,10 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
   int64_t nnz() const {
     return values_.size(0);
   }
+
+  c10::SymInt sym_nnz() const {
+    return values_.sym_size(0);
+  }
   int64_t sparse_dim() const {
     return sparse_dim_;
   }
@@ -85,7 +90,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
     TORCH_CHECK(
         !has_symbolic_sizes_strides_,
         "raw_resize_ called on tensor with symbolic shape")
-    sizes_and_strides_.set_sizes(size);
+    set_sizes_and_strides(size, std::vector<int64_t>(size.size()));
     sparse_dim_ = sparse_dim;
     dense_dim_ = dense_dim;
     refresh_numel();
@@ -116,7 +121,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
   // 4. When we attempt to shrink the size of any of the sparse dimensions on a
   // non-empty sparse tensor (this could make some of the stored indices
   // out-of-bound and thus unsafe).
-  void resize_(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size) {
+  template <typename T>
+  void _resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<T> size) {
     TORCH_CHECK(
         allow_tensor_metadata_change(),
         "resize_ ",
@@ -160,7 +166,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
 
       bool shrinking_sparse_dims = false;
       bool shrinking_dense_dim = false;
-      auto sparse_size_original = sizes().slice(0, sparse_dim);
+      auto sparse_size_original = generic_sizes<T>().slice(0, sparse_dim);
       auto sparse_size_new = size.slice(0, sparse_dim);
       for (const auto i : c10::irange(sparse_dim)) {
         if (sparse_size_new[i] < sparse_size_original[i]) {
@@ -168,7 +174,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
           break;
         }
       }
-      auto dense_size_original = sizes().slice(sparse_dim);
+      auto dense_size_original = generic_sizes<T>().slice(sparse_dim);
       auto dense_size_new = size.slice(sparse_dim);
       for (const auto i : c10::irange(dense_dim)) {
         if (dense_size_new[i] < dense_size_original[i]) {
@@ -196,7 +202,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
           alt_options_msg);
     }
 
-    IntArrayRef sizes_and_strides = sizes_and_strides_.sizes_arrayref();
+    auto sizes_and_strides = generic_sizes<T>();
     const bool size_equals_sizes = std::equal(
         size.begin(),
         size.end(),
@@ -204,23 +210,34 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
         sizes_and_strides.end());
     if ((!size_equals_sizes) || (sparse_dim != sparse_dim_) ||
         (dense_dim != dense_dim_)) {
-      auto nnz = values().size(0);
-      std::vector<int64_t> values_size = {nnz};
+      auto nnz = at::symint::sizes<T>(values())[0];
+      std::vector<T> values_size = {nnz};
       auto dense_size = size.slice(sparse_dim);
       values_size.insert(
           values_size.end(), dense_size.begin(), dense_size.end());
-      values_.resize_(values_size);
-      indices_.resize_({sparse_dim, nnz});
+      at::symint::resize_<T>(values_, values_size);
+      at::symint::resize_<T>(indices_, {T(sparse_dim), nnz});
     }
 
     if (!size_equals_sizes) {
-      sizes_and_strides_.set_sizes(size);
+      set_sizes_and_strides(size, std::vector<T>(size.size()));
     }
     sparse_dim_ = sparse_dim;
     dense_dim_ = dense_dim;
     refresh_numel();
   }
 
+  void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef<int64_t> size) {
+    return _resize_(sparse_dim, dense_dim, size);
+  }
+
+  void resize_(
+      int64_t sparse_dim,
+      int64_t dense_dim,
+      ArrayRef<c10::SymInt> size) {
+    return _resize_(sparse_dim, dense_dim, size);
+  }
+
   // NOTE: this function will resize the sparse tensor and also set `indices`
   // and `values` to empty.
   void resize_and_clear_(
@@ -243,7 +260,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
         "), but got ",
         size.size());
 
-    sizes_and_strides_.set_sizes(size);
+    set_sizes_and_strides(size, std::vector<int64_t>(size.size()));
     sparse_dim_ = sparse_dim;
     dense_dim_ = dense_dim;
 
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 6b1deaffb47b4..ef402e125c99c 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -6320,9 +6320,9 @@
     SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_sparse
   autogen: _sparse_coo_tensor_with_dims.out
 
-- func: _sparse_coo_tensor_with_dims_and_tensors(SymInt sparse_dim, SymInt dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
   dispatch:
-    SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_and_tensor_sparse
+    SparseCPU, SparseCUDA, SparseMeta, Meta: new_with_dims_and_tensor_sparse_symint
   autogen: _sparse_coo_tensor_with_dims_and_tensors.out
 
 - func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp
index 625f5b1c0b080..38f3e11f8fd4e 100644
--- a/aten/src/ATen/native/sparse/SparseTensor.cpp
+++ b/aten/src/ATen/native/sparse/SparseTensor.cpp
@@ -163,10 +163,10 @@ SparseTensor new_with_dims_sparse(
   return self;
 }
 
-SparseTensor new_with_dims_and_tensor_sparse(
+SparseTensor new_with_dims_and_tensor_sparse_symint(
     int64_t sparse_dim,
     int64_t dense_dim,
-    ArrayRef<int64_t> size,
+    c10::SymIntArrayRef size,
     const Tensor& indices,
     const Tensor& values,
     c10::optional<ScalarType> dtype,
@@ -444,7 +444,9 @@ Tensor _sparse_coo_tensor_unsafe_symint(const Tensor& indices, const Tensor& val
 
   Tensor values = expand_values_if_needed(values_);
 
-  auto sparse_dim = indices.sym_size(0);
+  // This guard is intentional: we don't support dynamic shapes along the
+  // indices dimension because that implies variable dimensionality
+  auto sparse_dim = indices.sym_size(0).guard_int(__FILE__, __LINE__);
   auto dense_dim = values.dim() - 1;
 
   return at::_sparse_coo_tensor_with_dims_and_tensors_symint(
diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp
index bffafc59168c6..e07d2ce6b051d 100644
--- a/c10/core/DispatchKey.cpp
+++ b/c10/core/DispatchKey.cpp
@@ -317,6 +317,7 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
       {"SparseHIP", c10::DispatchKey::SparseHIP},
       {"SparseXPU", c10::DispatchKey::SparseXPU},
       {"SparseVE", c10::DispatchKey::SparseVE},
+      {"SparseMeta", c10::DispatchKey::SparseMeta},
 
       {"AutogradCPU", c10::DispatchKey::AutogradCPU},
       {"AutogradCUDA", c10::DispatchKey::AutogradCUDA},
diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp
index fdd16605634b3..bee3fa32ec214 100644
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@@ -272,6 +272,9 @@ bool_is_contiguous _compute_contiguous(
              sizes_and_strides_.strides_arrayref()))
 
 bool_is_contiguous TensorImpl::compute_contiguous() const {
+  if (is_sparse()) {
+    return bool_is_contiguous(false);
+  }
   return COMPUTE_WITH_SIZES_STRIDES_NUMEL(_compute_contiguous);
 }
 
@@ -306,6 +309,9 @@ bool_is_channels_last_contiguous _compute_channels_last_contiguous_2d(
 
 bool_is_channels_last_contiguous TensorImpl::
     compute_channels_last_contiguous_2d() const {
+  if (is_sparse()) {
+    return bool_is_channels_last_contiguous(false);
+  }
   return COMPUTE_WITH_SIZES_STRIDES(_compute_channels_last_contiguous_2d);
 }
 
@@ -340,17 +346,26 @@ bool_is_channels_last_3d_contiguous _compute_channels_last_contiguous_3d(
 
 bool_is_channels_last_3d_contiguous TensorImpl::
     compute_channels_last_contiguous_3d() const {
+  if (is_sparse()) {
+    return bool_is_channels_last_3d_contiguous(false);
+  }
   return COMPUTE_WITH_SIZES_STRIDES(_compute_channels_last_contiguous_3d);
 }
 
 bool_is_channels_last TensorImpl::compute_strides_like_channels_last_2d()
     const {
+  if (is_sparse()) {
+    return bool_is_channels_last(false);
+  }
   return bool_is_channels_last(
       COMPUTE_WITH_SIZES_STRIDES(is_channels_last_strides_2d));
 }
 
 bool_is_channels_last_3d TensorImpl::compute_strides_like_channels_last_3d()
     const {
+  if (is_sparse()) {
+    return bool_is_channels_last_3d(false);
+  }
   return bool_is_channels_last_3d(
       COMPUTE_WITH_SIZES_STRIDES(is_channels_last_strides_3d));
 }
@@ -393,6 +408,9 @@ bool_is_non_overlapping_and_dense _compute_non_overlapping_and_dense(
 
 bool_is_non_overlapping_and_dense TensorImpl::
     compute_non_overlapping_and_dense() const {
+  if (is_sparse()) {
+    return bool_is_non_overlapping_and_dense(false);
+  }
   return COMPUTE_WITH_SIZES_STRIDES(_compute_non_overlapping_and_dense);
 }
 
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 27d65e2d86739..a6ba3f16e2a27 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -673,6 +673,25 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
     }
   }
 
+  // From https://stackoverflow.com/a/3057522/23845
+  // TODO: does C++14 have a stdlib template for this?
+  template <typename T>
+  struct identity {
+    typedef T type;
+  };
+
+  template <typename T>
+  ArrayRef<T> generic_sizes() {
+    return _generic_sizes(identity<T>());
+  }
+
+  ArrayRef<int64_t> _generic_sizes(identity<int64_t>) {
+    return sizes();
+  }
+  ArrayRef<c10::SymInt> _generic_sizes(identity<c10::SymInt>) {
+    return sym_sizes();
+  }
+
   /**
    * The number of elements in a tensor.
    *
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 018f1b9a280fa..dbe709007f738 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1766,7 +1766,7 @@
   self: grad.to_dense().sparse_mask(mask).to_dense()
   mask: non_differentiable
 
-- name: _sparse_coo_tensor_with_dims_and_tensors(SymInt sparse_dim, SymInt dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+- name: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
   values: sparse_constructor_values_backward(grad, indices)
 
 - name: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index c1cb65160cacd..796b15fedf103 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -971,6 +971,7 @@ def cpp_meta_supports_symint(self, func):
             aten.zeros.default,
             aten.detach.default,
             aten.set_.source_Storage_storage_offset,
+            aten._sparse_coo_tensor_with_dims_and_tensors.default,
         ]
 
     @property

From 8ba0835256d2425810043d2c0c74028cbd9a4532 Mon Sep 17 00:00:00 2001
From: BoringCrypto <b@rtje.net>
Date: Tue, 8 Nov 2022 03:26:44 +0000
Subject: [PATCH 0649/1922] Setting pickle_module isn't working (#88570)

When setting the pickle_module it currently always gets overwritten by the pickle module. This should only happen when the pickle_module isn't specified.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88570
Approved by: https://github.com/kit1980
---
 torch/serialization.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/serialization.py b/torch/serialization.py
index 65d936679ed5e..53d060019408d 100644
--- a/torch/serialization.py
+++ b/torch/serialization.py
@@ -766,7 +766,8 @@ def load(
         if pickle_module is not None:
             raise RuntimeError("Can not safely load weights when expiclit picke_module is specified")
     else:
-        pickle_module = pickle
+        if pickle_module is None:
+            pickle_module = pickle
 
     _check_dill_version(pickle_module)
 

From 4be8d1d0934e339f831239204d4b7c6862fa025e Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 8 Nov 2022 03:29:52 +0000
Subject: [PATCH 0650/1922] [vision hash update] update the pinned vision hash
 (#88465)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88465
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index a21a31bd4419e..b985bb4d5e300 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-e64784cdea465d833d9d0f66dc73d7abe217933d
+bf58902b2fd881c760cd2eeacfae2d7c468ebf1f

From 573204c1035e24ed552334141f85cf00592491ba Mon Sep 17 00:00:00 2001
From: biubiuX <4338192+biubiuX@users.noreply.github.com>
Date: Tue, 8 Nov 2022 04:49:45 +0000
Subject: [PATCH 0651/1922] [Pytorch] add an option to disable TORCH_WARN and
 TORCH_WARN_ONCE log (#87188)

Summary: Add an option to disable TORCH_WARN, some op could trigger spammy TOCH_WARN log which is not desired under certain scenario.

Test Plan:
Tested with
-pt.disable_warn = 1 and -pt.disable_warn = 0

verified TORCH_WARN and TORCH_WARN_ONCE are properly handled

tested with
-pt.strip_error_messages = 1, -pt.disable_warn = 0

verified strip error message is respected when warn is printed

Differential Revision: D40321550

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87188
Approved by: https://github.com/kurtamohler, https://github.com/ezyang
---
 buckbuild.bzl        |  9 ++++++---
 c10/util/Exception.h | 14 +++++++++++---
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/buckbuild.bzl b/buckbuild.bzl
index 6ce59928d6968..75c16ba006550 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -98,6 +98,9 @@ def get_strip_error_messages():
         return True  # always strip in OSS CI to expose potential issues
     return read_bool("pt", "strip_error_messages", not _is_build_mode_dev())
 
+def get_disable_warn():
+    return read_bool("pt", "disable_warn", False)
+
 def get_enable_eager_symbolication():
     return read_bool("pt", "enable_eager_symbolication", default = False, required = False)
 
@@ -200,6 +203,8 @@ _COMMON_PREPROCESSOR_FLAGS = [
     ["-DC10_MOBILE_TRIM_DISPATCH_KEYS"] if get_enable_mobile_dispatch_keys_trimming() else []
 ) + (
     ["-DSTRIP_ERROR_MESSAGES"] if get_strip_error_messages() else []
+) + (
+    ["-DDISABLE_WARN"] if get_disable_warn() else []
 )
 
 def get_aten_preprocessor_flags():
@@ -1727,9 +1732,7 @@ def define_buck_targets(
         name = "mobile_bytecode",
         header_namespace = "",
         exported_headers = {
-            ("torch/csrc/jit/serialization/mobile_bytecode_generated.h" if IS_OSS
-            else "torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h")
-            : ":mobile_bytecode_header[mobile_bytecode_generated_fbsource.h]",
+            ("torch/csrc/jit/serialization/mobile_bytecode_generated.h" if IS_OSS else "torch/csrc/jit/serialization/mobile_bytecode_generated_fbsource.h"): ":mobile_bytecode_header[mobile_bytecode_generated_fbsource.h]",
         },
         # Avoid leaking implementation details by only exposing this header to
         # the internals of the loader/serializer layer.
diff --git a/c10/util/Exception.h b/c10/util/Exception.h
index d86a85adbe4c4..dc5dcae7f0988 100644
--- a/c10/util/Exception.h
+++ b/c10/util/Exception.h
@@ -562,17 +562,21 @@ namespace detail {
 // Report a warning to the user.  Accepts an arbitrary number of extra
 // arguments which are concatenated into the warning message using operator<<
 //
-#define TORCH_WARN_WITH(warning_t, ...)                      \
+#ifdef DISABLE_WARN
+#define _TORCH_WARN_WITH(...) ((void)0);
+#else
+#define _TORCH_WARN_WITH(warning_t, ...)                     \
   ::c10::warn(::c10::Warning(                                \
       warning_t(),                                           \
       {__func__, __FILE__, static_cast<uint32_t>(__LINE__)}, \
       WARNING_MESSAGE_STRING(__VA_ARGS__),                   \
       false));
+#endif
 
-#define TORCH_WARN(...) TORCH_WARN_WITH(::c10::UserWarning, __VA_ARGS__);
+#define TORCH_WARN(...) _TORCH_WARN_WITH(::c10::UserWarning, __VA_ARGS__);
 
 #define TORCH_WARN_DEPRECATION(...) \
-  TORCH_WARN_WITH(::c10::DeprecationWarning, __VA_ARGS__);
+  _TORCH_WARN_WITH(::c10::DeprecationWarning, __VA_ARGS__);
 
 // Report a warning to the user only once.  Accepts an arbitrary number of extra
 // arguments which are concatenated into the warning message using operator<<
@@ -584,12 +588,16 @@ namespace detail {
         return true;                                                      \
       }()
 
+#ifdef DISABLE_WARN
+#define TORCH_WARN_ONCE(...) ((void)0);
+#else
 #define TORCH_WARN_ONCE(...)                   \
   if (::c10::WarningUtils::get_warnAlways()) { \
     TORCH_WARN(__VA_ARGS__);                   \
   } else {                                     \
     _TORCH_WARN_ONCE(__VA_ARGS__);             \
   }
+#endif
 
 // Report an error with a specific argument
 // NOTE: using the argument name in TORCH_CHECK's message is preferred

From 74454ffa8a57af4f34bea2342a03666e9135af83 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@meta.com>
Date: Tue, 8 Nov 2022 05:12:18 +0000
Subject: [PATCH 0652/1922] Add missing args to DDP constructor in
 distributed.pyi (#88209)

Summary: As title. And remove all unnecessary `pyre-fixme` for the unknown arg in call-site.

Test Plan: CI

Differential Revision: D40874013

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88209
Approved by: https://github.com/zhaojuanmao
---
 torch/nn/parallel/distributed.pyi | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/torch/nn/parallel/distributed.pyi b/torch/nn/parallel/distributed.pyi
index a75713afb8282..a5db301e433d5 100644
--- a/torch/nn/parallel/distributed.pyi
+++ b/torch/nn/parallel/distributed.pyi
@@ -1,7 +1,7 @@
-from ..modules import Module
 from typing import Any, Optional
-from .common_types import _devices_t, _device_t
 
+from ..modules import Module
+from .common_types import _device_t, _devices_t
 
 class DistributedDataParallel(Module):
     process_group: Any = ...
@@ -15,7 +15,17 @@ class DistributedDataParallel(Module):
     bucket_bytes_cap: float = ...
 
     # TODO type process_group once `distributed` module is stubbed
-    def __init__(self, module: Module, device_ids: Optional[_devices_t] = ...,
-                 output_device: Optional[_device_t] = ..., dim: int = ...,
-                 broadcast_buffers: bool = ..., process_group: Optional[Any] = ..., bucket_cap_mb: float = ...,
-                 find_unused_parameters: bool = ..., check_reduction: bool = ...) -> None: ...
+    def __init__(
+        self,
+        module: Module,
+        device_ids: Optional[_devices_t] = ...,
+        output_device: Optional[_device_t] = ...,
+        dim: int = ...,
+        broadcast_buffers: bool = ...,
+        process_group: Optional[Any] = ...,
+        bucket_cap_mb: float = ...,
+        find_unused_parameters: bool = ...,
+        check_reduction: bool = ...,
+        gradient_as_bucket_view: bool = ...,
+        static_graph: bool = ...,
+    ) -> None: ...

From 5fa65d048598f65ed8f333fefe20eb2a4e9e6549 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Mon, 7 Nov 2022 16:07:13 -0800
Subject: [PATCH 0653/1922] [Autograd] Use in-place input accumulation fast
 path for dense Tensors. (#88339)

There is a fast path in InputBuffer to steal memory when use count is zero, however it is only used for sparse Tensors. According to Natalia, this is just because it wasn't obvious that there would be a benefit for dense Tensors so there was no reason to live dangerously. However I've noticed large Tensors in internal models which would benefit from this optimization as well.

Differential Revision: [D40946601](https://our.internmc.facebook.com/intern/diff/D40946601/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88339
Approved by: https://github.com/ngimel
---
 torch/csrc/autograd/input_buffer.cpp | 54 ++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp
index 6cc6acefc9d45..7e6df0cea8da0 100644
--- a/torch/csrc/autograd/input_buffer.cpp
+++ b/torch/csrc/autograd/input_buffer.cpp
@@ -4,6 +4,7 @@
 #include <ATen/SparseCsrTensorUtils.h>
 #include <ATen/SparseTensorUtils.h>
 #include <ATen/TensorOperators.h>
+#include <ATen/TensorSubclassLikeUtils.h>
 
 #include <c10/core/DeviceGuard.h>
 #include <c10/core/Event.h>
@@ -66,6 +67,18 @@ void record_stream_any_impl(Variable& var, c10::Stream& stream) {
     }
   }
 }
+
+bool can_accumulate_inplace(const Variable& v) {
+  return (
+      // `v` is a "vanilla" Tensor
+      !(at::isTensorSubclassLike(v) || v._is_zerotensor() || v.is_nested()) &&
+
+      // with a favorable memory layout
+      v.is_non_overlapping_and_dense() &&
+
+      // and we hold the last reference
+      v.use_count() == 1 && v.storage().use_count() == 1);
+}
 } // anonymous namespace
 
 static void accumulate(
@@ -74,25 +87,38 @@ static void accumulate(
     Variable&& var) {
   TORCH_INTERNAL_ASSERT(pos < buffer.size());
   auto& old_var = buffer[pos];
-  // ATen doesn't route sparse additions correctly...
-  // do dense + sparse in-place if possible
-  if (old_var.is_sparse()) {
-    // It is safe to change the Tensor inplace if the Tensor is only used in
-    // this buffer (this could be the gradient passed by the user) and that no
-    // other Tensor is using the same storage.
-    if (!var.is_sparse() && var.is_contiguous() && var.use_count() == 1 &&
-        var.storage().use_count() == 1) {
+  // If we hold the last reference to `old_var` AND its storage we will try to
+  // repurpose it to store the output. (Or, if `old_var` is sparse then `var`
+  // becomes the candidate output Tensor.) We only do this if:
+  //  1) GradMode is disabled since Autograd has special handling for inplace
+  //     mutation which we don't want to trigger.
+  //
+  //  2) We hold the last reference.
+  //     (Both `.use_count` and `.storage().use_count()` are one)
+  //
+  //  3) The candidate tensor is a contiguous, non-overlapping, dense, and
+  //     otherwise stock standard Tensor.
+  //
+  //  4) The candidate is mutable. Currently only ZeroTensors are immutable.
+  //
+  //  5) The other Tensor is not a Tensor subclass (except sparse), since
+  //     it's hard to predict the semantics of arbitrary subclass behavior.
+
+  if (at::GradMode::is_enabled()) {
+    buffer[pos] = old_var + var;
+  } else if (
+      // ATen doesn't route sparse additions correctly...
+      old_var.is_sparse() || old_var.is_sparse_csr()) {
+    if (can_accumulate_inplace(var)) {
       buffer[pos] = var.add_(old_var);
     } else {
       buffer[pos] = var + old_var;
     }
+  } else if (
+      can_accumulate_inplace(old_var) && !at::isTensorSubclassLike(var)) {
+    buffer[pos] = old_var.add_(var);
   } else {
-    if (var.is_sparse() && !old_var.is_sparse() && old_var.is_contiguous() &&
-        old_var.use_count() == 1 && old_var.storage().use_count() == 1) {
-      buffer[pos] = old_var.add_(var);
-    } else {
-      buffer[pos] = old_var + var;
-    }
+    buffer[pos] = old_var + var;
   }
 }
 

From da961bf08f984a80171bef19789a66ea41b6da94 Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@fb.com>
Date: Tue, 8 Nov 2022 06:29:11 +0000
Subject: [PATCH 0654/1922] [ci] increase timeout time of ios test app build
 (#88611)

We were timing out; 5 minutes seems a bit short.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88611
Approved by: https://github.com/clee2000, https://github.com/huydhn, https://github.com/ZainRizvi
---
 .github/workflows/_ios-build-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_ios-build-test.yml b/.github/workflows/_ios-build-test.yml
index 665ff1b9ce16f..e9b5461dde7fd 100644
--- a/.github/workflows/_ios-build-test.yml
+++ b/.github/workflows/_ios-build-test.yml
@@ -106,7 +106,7 @@ jobs:
 
       - name: Build TestApp
         if: inputs.ios-platform == 'SIMULATOR'
-        timeout-minutes: 5
+        timeout-minutes: 15
         run: |
           # run the ruby build script
           if ! [ -x "$(command -v xcodebuild)" ]; then

From 6ad12ef4a169414a10965dfe6c8eda3175416902 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 8 Nov 2022 08:32:45 +0000
Subject: [PATCH 0655/1922] Revive static_runtime_benchmark build and test
 (#87660)

This build uses the wrong BUILD_ENVIRONMENT `pytorch-linux-focal-py3`, thus it hasn't been run for a long time (forgotten). The name was probably the old name of the build environment we used in the past.  The convention today doesn't have the `pytorch-` prefix. There is a TODO for this:

> TODO: this condition is never (BUILD_ENVIRONMENT doesn't start with pytorch-), need to fix this.

This is done as part of [T131829540](https://www.internalfb.com/intern/tasks/?t=131829540), where we want
 `static_runtime_benchmark` build and test jobs to run  in OSS CI to avoid breaking internal

* I also fix some compiler warning errors `-Werror=sign-compare`, `-Werror,-Wunused-const-variable`, and gcc7 compatibility issue along the way because this hasn't been run for a long time.
* Reviving this test also reveals a small bug in `PrepackWeights` test in `test_static_runtime.cc` added recently in https://github.com/pytorch/pytorch/pull/85289. The test refers to an internal ops and should only be run internally. This has been fixed by https://github.com/pytorch/pytorch/pull/87799 (To be merged)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87660
Approved by: https://github.com/malfet
---
 .jenkins/pytorch/build.sh                       | 13 ++-----------
 .jenkins/pytorch/test.sh                        |  9 +++------
 benchmarks/static_runtime/test_generated_ops.cc |  2 --
 benchmarks/static_runtime/test_static_module.cc |  7 -------
 benchmarks/static_runtime/test_utils.cc         | 13 +++++++------
 torch/csrc/jit/runtime/static/passes.h          |  2 +-
 6 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index 13ee6309c0655..bb7b2c5d03c88 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -182,17 +182,8 @@ if [[ "${BUILD_ENVIRONMENT}" == *linux-focal-py3.7-gcc7-build*  ]]; then
   export USE_GLOO_WITH_OPENSSL=ON
 fi
 
-# TODO: Remove after xenial->focal migration
-if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-py3* ]]; then
-  if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
-    export BUILD_STATIC_RUNTIME_BENCHMARK=ON
-  fi
-fi
-
-if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-focal-py3* ]]; then
-  if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
-    export BUILD_STATIC_RUNTIME_BENCHMARK=ON
-  fi
+if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
+  export BUILD_STATIC_RUNTIME_BENCHMARK=ON
 fi
 
 if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index d8f020182710c..ea219dd45f0b3 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -403,12 +403,9 @@ test_libtorch() {
     OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
     "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
 
-    # TODO: this condition is never (BUILD_ENVIRONMENT doesn't start with pytorch-), need to fix this.
-    if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-py3* ]]; then
-      if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* ]]; then
-        # TODO: Consider to run static_runtime_test from $TORCH_BIN_DIR (may need modify build script)
-        "$BUILD_BIN_DIR"/static_runtime_test --gtest_output=xml:$TEST_REPORTS_DIR/static_runtime_test.xml
-      fi
+    if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* ]]; then
+      # TODO: Consider to run static_runtime_test from $TORCH_BIN_DIR (may need modify build script)
+      "$BUILD_BIN_DIR"/static_runtime_test --gtest_output=xml:$TEST_REPORTS_DIR/static_runtime_test.xml
     fi
     assert_git_not_dirty
   fi
diff --git a/benchmarks/static_runtime/test_generated_ops.cc b/benchmarks/static_runtime/test_generated_ops.cc
index 80ffc5ac8441a..415bf464fbd13 100644
--- a/benchmarks/static_runtime/test_generated_ops.cc
+++ b/benchmarks/static_runtime/test_generated_ops.cc
@@ -7841,7 +7841,6 @@ TEST(StaticRuntime, autogen_diagonal) {
   auto offset0 = 0;
   auto dim10 = 2;
   auto dim20 = 1;
-  auto dim00 = 1;
   std::vector<IValue> args{self0, offset0, dim10, dim20};
   testStaticRuntime(script, args);
 }
@@ -7859,7 +7858,6 @@ TEST(StaticRuntime, autogen_linalg_diagonal) {
   auto offset0 = 0;
   auto dim10 = 2;
   auto dim20 = 1;
-  auto dim00 = 1;
   std::vector<IValue> args{A0, offset0, dim10, dim20};
   testStaticRuntime(script, args);
 }
diff --git a/benchmarks/static_runtime/test_static_module.cc b/benchmarks/static_runtime/test_static_module.cc
index 1574cda3ee24a..3c927c9c41d9d 100644
--- a/benchmarks/static_runtime/test_static_module.cc
+++ b/benchmarks/static_runtime/test_static_module.cc
@@ -77,13 +77,6 @@ const auto sigmoid_inplace_script = R"JIT(
       return (a)
 )JIT";
 
-const auto sigmoid_out_script = R"JIT(
-  def forward(self, inp: Tensor):
-      a = inp + inp
-      b = torch.sigmoid(inp, out=a).clone()
-      return (b)
-)JIT";
-
 } // namespace
 
 // Test that StaticModule::value_group groups values of the graph into
diff --git a/benchmarks/static_runtime/test_utils.cc b/benchmarks/static_runtime/test_utils.cc
index b096d1a0ba9f5..cc88801139334 100644
--- a/benchmarks/static_runtime/test_utils.cc
+++ b/benchmarks/static_runtime/test_utils.cc
@@ -124,7 +124,7 @@ void compareTensorLists(
     const bool use_allclose,
     const bool use_equalnan) {
   EXPECT_TRUE(l.size() == r.size());
-  for (int i = 0; i < l.size(); ++i) {
+  for (auto i : c10::irange(l.size())) {
     ASSERT_TRUE(l[i].isTensor());
     ASSERT_TRUE(r[i].isTensor());
     VLOG(2) << "expect " << i << ": \n" << l[i] << std::endl;
@@ -298,11 +298,12 @@ void testStaticRuntime(
         // 1st run: collect allocation profiles (args)
         // 2nd run: exercise memory planner and resizing with args2
         // 3rd run: run with args again
-        StaticModuleOptions opts{
-            .enable_out_variant = enable_out_variant,
-            .optimize_memory = enable_out_variant,
-            .manage_output_tensors = manage_output_tensors,
-            .enable_tensorexpr_fusion = enable_tensorexpr_fusion};
+        StaticModuleOptions opts;
+        opts.enable_out_variant = enable_out_variant;
+        opts.optimize_memory = enable_out_variant;
+        opts.manage_output_tensors = manage_output_tensors;
+        opts.enable_tensorexpr_fusion = enable_tensorexpr_fusion;
+
         auto smodule = test_context->makeStaticModule(opts);
         StaticRuntime runtime(smodule);
         auto actual = runtime(args, {});
diff --git a/torch/csrc/jit/runtime/static/passes.h b/torch/csrc/jit/runtime/static/passes.h
index 35c1678adca86..d61d7baa4947e 100644
--- a/torch/csrc/jit/runtime/static/passes.h
+++ b/torch/csrc/jit/runtime/static/passes.h
@@ -21,7 +21,7 @@ TORCH_API void ReplacePermuteWithCopy(
     std::shared_ptr<torch::jit::Graph>& graph,
     bool outputs_are_immutable = true);
 
-void ReplaceWithMaybeCopy(
+TORCH_API void ReplaceWithMaybeCopy(
     std::shared_ptr<torch::jit::Graph>& graph,
     bool outputs_are_immutable = true);
 

From d2e25e4ed0ac0f6817d0f6db85d3c52570bb4d8f Mon Sep 17 00:00:00 2001
From: jjsjann123 <alex.jann2012@gmail.com>
Date: Tue, 8 Nov 2022 12:06:35 +0000
Subject: [PATCH 0656/1922] nvprims bookend non compute (#88457)

Cherry-pickeding: https://github.com/csarofeen/pytorch/pull/2099

1. enabling bookend non-compute-ops pass on nvfuser
2. fixing bookend op check on intermediate tensor as partition inputs
3. python tests added for: `getitem` special handling bookend_non_compute removal
4. patching dfs by excluding dfs within partition to avoid going over recursion limitation
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88457
Approved by: https://github.com/SherlockNoMad
---
 test/test_fx_passes.py               | 78 +++++++++++++++++++++-------
 torch/_prims/nvfuser_executor.py     |  1 +
 torch/fx/passes/infra/partitioner.py | 61 +++++++++++++++++++++-
 3 files changed, 120 insertions(+), 20 deletions(-)

diff --git a/test/test_fx_passes.py b/test/test_fx_passes.py
index aa04fbac26187..d9e5abc921df7 100644
--- a/test/test_fx_passes.py
+++ b/test/test_fx_passes.py
@@ -188,47 +188,87 @@ def forward14(a, b, c):
         out = a0 + 1.0
         return out
 
+    @staticmethod
+    def forward15(a, b, c):
+        a0 = torch.ops.aten.view(a, [2, 2])
+        a1 = torch.ops.aten.permute(a0, [1, 0])
+        a2 = a1 + 1.0
+        a3 = torch.ops.aten.permute(a2, [1, 0])
+        a4 = a3 + 1.0
+        a5 = torch.ops.aten.permute(a4, [1, 0])
+        return torch.ops.aten.permute(a5, [1, 0])
+
+    @staticmethod
+    def forward16(a, b, c):
+        a0 = a - 1.0
+        a1 = torch.ops.aten.view(a0, [2, 2])
+        a2 = torch.ops.aten.permute(a1, [1, 0])
+        a3 = a2 + 1.0
+        a4 = torch.ops.aten.permute(a3, [1, 0])
+        a5 = a4 + 1.0
+        a6 = torch.ops.aten.permute(a5, [1, 0])
+        a7 = torch.ops.aten.permute(a6, [1, 0])
+        return a7 - 1.0
+
 # A mock OperatorSupport class, where only operator.add is supported
 class MockOperatorSupport(OperatorSupport):
     def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
-        return node.op == "call_function" and node.target in {operator.add, operator.getitem, torch.ops.aten.std_mean}
-
+        return (node.op == "call_function" and
+                node.target in {operator.add, operator.getitem,
+                                torch.ops.aten.view,
+                                torch.ops.aten.permute,
+                                torch.ops.aten.std_mean})
 
 @instantiate_parametrized_tests
 class TestFXGraphPasses(JitTestCase):
 
-    @parametrize("fn, expected_partition", [
-        (TestPartitionFunctions.forward1, [["add_7", "add_6"], ["add_5", "add_4", "add_3"], ["add_2", "add_1", "add"]]),
-        (TestPartitionFunctions.forward2, [["add_3", "add_2"], ["add_1", "add"]]),
+    @parametrize("fn, expected_partition, bookend_non_compute_pass", [
+        (TestPartitionFunctions.forward1, [["add_7", "add_6"], ["add_5", "add_4", "add_3"], ["add_2", "add_1", "add"]], False),
+        (TestPartitionFunctions.forward2, [["add_3", "add_2"], ["add_1", "add"]], False),
 
         # 1 horizontal fusion with common producer
-        (TestPartitionFunctions.forward3, [["add_2", "add_1", "add"]]),
-        (TestPartitionFunctions.forward4, [["add_2", "add_1", "add"]]),
+        (TestPartitionFunctions.forward3, [["add_2", "add_1", "add"]], False),
+        (TestPartitionFunctions.forward4, [["add_2", "add_1", "add"]], False),
 
         # 2 branches cases
-        (TestPartitionFunctions.forward5, [["add_1", "add"]]),
-        (TestPartitionFunctions.forward6, [["add"]]),
-        (TestPartitionFunctions.forward7, [["add_3", "add_2", "add", "add_1"]]),
-        (TestPartitionFunctions.forward8, [["add_3", "add_2", "add", "add_1"]]),
+        (TestPartitionFunctions.forward5, [["add_1", "add"]], False),
+        (TestPartitionFunctions.forward6, [["add"]], False),
+        (TestPartitionFunctions.forward7, [["add_3", "add_2", "add", "add_1"]], False),
+        (TestPartitionFunctions.forward8, [["add_3", "add_2", "add", "add_1"]], False),
 
         # 3 branch cases
-        (TestPartitionFunctions.forward9, [['add_3', 'add_2', 'add_1', 'add']]),
-        (TestPartitionFunctions.forward10, [['add_3', 'add_2', 'add', 'add_1']]),
-        (TestPartitionFunctions.forward11, [['add_1'], ['add']]),
+        (TestPartitionFunctions.forward9, [['add_3', 'add_2', 'add_1', 'add']], False),
+        (TestPartitionFunctions.forward10, [['add_3', 'add_2', 'add', 'add_1']], False),
+        (TestPartitionFunctions.forward11, [['add_1'], ['add']], False),
 
         # 4 not necessarily the only partition, just to verify that there's no cyclic dependency after partition
-        (TestPartitionFunctions.forward12, [["add_2"], ["add_3", "add_4", "add_1"], ["add"]]),
+        (TestPartitionFunctions.forward12, [["add_2"], ["add_3", "add_4", "add_1"], ["add"]], False),
 
         # 5 getitem special case
-        (TestPartitionFunctions.forward13, [["add_2", "add_1", "add"]]),
-        (TestPartitionFunctions.forward14, [["add", "std_mean", "getitem", "getitem_1"]]),
+        (TestPartitionFunctions.forward13, [["add_2", "add_1", "add"]], False),
+        (TestPartitionFunctions.forward14, [["add", "std_mean", "getitem", "getitem_1"]], False),
+
+        # 6 bookend non_compute pass
+        (TestPartitionFunctions.forward15, [["permute_1", "add_1", "add"]], True),
+        (TestPartitionFunctions.forward15, [['add_1', 'add', 'permute_1', 'view', 'permute_2', 'permute_3', 'permute']], False),
+        (TestPartitionFunctions.forward16, [["permute_1", "add_1", "add"]], True),
+        (TestPartitionFunctions.forward16, [['add_1', 'add', 'permute_1', 'view', 'permute_2', 'permute_3', 'permute']], False),
     ])
-    def test_partitioner(self, fn, expected_partition):
+    def test_partitioner(self, fn, expected_partition, bookend_non_compute_pass):
         traced = symbolic_trace(fn)
 
+        non_compute_ops = []
+        if bookend_non_compute_pass:
+            non_compute_ops = ["torch.ops.aten.view", "torch.ops.aten.permute"]
+
         supported_ops = MockOperatorSupport()
-        partitioner = CapabilityBasedPartitioner(traced, supported_ops, allows_single_node_partition=True)
+        partitioner = CapabilityBasedPartitioner(traced,
+                                                 supported_ops,
+                                                 allows_single_node_partition=True,
+                                                 non_compute_ops=non_compute_ops)
         partitions = partitioner.propose_partitions()
+        if bookend_non_compute_pass:
+            partitioner.remove_bookend_non_compute_ops(partitions)
 
         partitions_name = [[node.name for node in partition.nodes] for partition in partitions]
         assert len(partitions_name) == len(expected_partition)
diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py
index c3a7e8913ce29..0f4e7b49fa27c 100644
--- a/torch/_prims/nvfuser_executor.py
+++ b/torch/_prims/nvfuser_executor.py
@@ -383,6 +383,7 @@ def maybe_partition_graph(
             allowed_single_node_partition_ops=_allowed_single_node_partition_ops,
         )
         partitions = partitioner.propose_partitions()
+        partitioner.remove_bookend_non_compute_ops(partitions)
         if len(partitions) == 0:
             warn(
                 "No partition found for the graph. "
diff --git a/torch/fx/passes/infra/partitioner.py b/torch/fx/passes/infra/partitioner.py
index 5f5a808b85121..a19894ddc6854 100644
--- a/torch/fx/passes/infra/partitioner.py
+++ b/torch/fx/passes/infra/partitioner.py
@@ -68,6 +68,9 @@ def maybe_merge_partition(self_id: int, other_id: int):
             merged_nodes = copy(partitions_by_id[self_id].nodes)
             merged_nodes.update(partitions_by_id[other_id].nodes)
 
+            # Note it's ok to use `set` here, since we are only query if a node
+            # has been visited. We are NEVER going to iterate on nodes inside
+            # the set.
             visited: Set[Node] = set()
 
             def dfs_find_cycle(node):
@@ -85,7 +88,7 @@ def dfs_find_cycle(node):
                     # dependencies after the fusion
                     for p_node in partitions_by_id[assignment[node]].nodes:
                         for user_node in p_node.users:
-                            if dfs_find_cycle(user_node):
+                            if user_node not in partitions_by_id[assignment[node]].nodes and dfs_find_cycle(user_node):
                                 return True
                 else:
                     for user_node in node.users:
@@ -205,6 +208,62 @@ def fuse_partitions(self, partitions: List[Partition]) -> GraphModule:
         # fuse_by_partitions expects partitions in List[List[Node]]: [ [node0, node1], [node2, node3] ]
         return fuse_by_partitions(self.graph_module, [list(partition.nodes) for partition in partitions])
 
+    # remove non-compute-ops that sits at the boundary of a partition.
+    def remove_bookend_non_compute_ops(self, partitions: List[Partition]):
+        non_compute_ops = set(self.non_compute_ops)
+
+        def is_non_compute_node(node: Node):
+            return node.op == "call_function" and \
+                _get_qualified_name(node.target) in non_compute_ops  # type: ignore[arg-type]
+
+        # cache transparent nodes
+        transparent_input_nodes: Dict[Node, bool] = {}
+        transparent_output_nodes: Dict[Node, bool] = {}
+
+        def is_transparent_input_node(node: Node, partition: Set[Node], removed_nodes: Set[Node]):
+            if node.op == "placeholder" or (node not in partition) or (node in removed_nodes):
+                return True
+            if node in transparent_input_nodes:
+                return transparent_input_nodes[node]
+            if is_non_compute_node(node):
+                for input_n in node.all_input_nodes:
+                    if not is_transparent_input_node(input_n, partition, removed_nodes):
+                        transparent_input_nodes[node] = False
+                        return False
+                transparent_input_nodes[node] = True
+                return True
+            transparent_input_nodes[node] = False
+            return False
+
+        def is_transparent_output_node(node: Node, partition: Set[Node], removed_nodes: Set[Node]):
+            if node.op == "placeholder" or (node not in partition) or (node in removed_nodes):
+                return True
+            if node in transparent_output_nodes:
+                return transparent_output_nodes[node]
+            if is_non_compute_node(node):
+                for output_n in node.users:
+                    if not is_transparent_output_node(output_n, partition, removed_nodes):
+                        transparent_output_nodes[node] = False
+                        return False
+                transparent_output_nodes[node] = True
+                return True
+            transparent_output_nodes[node] = False
+            return False
+
+        for partition in partitions:
+            # Note it's ok to use `set` here, since we are only query if a node
+            # has been removed. We are NEVER going to iterate on nodes inside
+            # the set.
+            remove_node: Set[Node] = set()
+            for node in partition.nodes:
+                if is_non_compute_node(node) and \
+                    (is_transparent_input_node(node, partition.nodes, remove_node) or
+                     is_transparent_output_node(node, partition.nodes, remove_node)):
+                    remove_node.add(node)
+
+            if len(remove_node) != 0:
+                partition.nodes = partition.nodes - remove_node
+
     def partition_and_fuse(self) -> GraphModule:
         partitions = self.propose_partitions()
         fused_gm = self.fuse_partitions(partitions)

From 6b3802480624702d0e88db68c6d03cf31be66d86 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Mon, 7 Nov 2022 16:32:25 +0000
Subject: [PATCH 0657/1922] Create _make_alias to preserve the name of a
 function when creating an alias (#88114)

Before, we would inherit the name of the aliased function, which was
very confusing, and disallowed some homogeneous treatment of references,
as we do later in this stack

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88114
Approved by: https://github.com/mruberry
---
 torch/_refs/__init__.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 28153321db59c..57dc284361a00 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -414,6 +414,19 @@ def _ref(a: TensorLikeType) -> TensorLikeType:
     return inner
 
 
+def _make_alias(fn, name):
+    """
+    This function defines an alias of another function and sets its __name__argument
+    Note that when naïvely doing `alias = fn`, we have that `alias.__name__ == "fn"`.
+    """
+
+    def _fn(*args, **kwargs):
+        return fn(*args, **kwargs)
+
+    _fn.__name__ = name
+    return _fn
+
+
 @_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT)
 def abs(a):
     return prims.abs(a)
@@ -611,6 +624,10 @@ def isnan(a: TensorLikeType) -> TensorLikeType:
     return prims.ne(a, a)
 
 
+# alias
+mvlgamma = _make_alias(torch.special.multigammaln, "mvlgamma")  # type: ignore[has-type]
+
+
 @_make_elementwise_unary_reference(
     ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
     aten_op=None,  # CompositeImplicitAutograd
@@ -634,10 +651,6 @@ def lgamma(a):
     return prims.lgamma(a)
 
 
-# alias
-mvlgamma = torch.special.multigammaln  # type: ignore[has-type]
-
-
 @_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT)
 def log(a):
     return prims.log(a)

From d495ea43ac98a63145c3ee11e6e7baf2022696d3 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Mon, 7 Nov 2022 15:44:31 -0800
Subject: [PATCH 0658/1922] Add torch.distributed.DistBackendError exception
 type, thrown from C10D_NCCL_CHECK (#88134)

Currently all of the distributed errors are thrown from the `TORCH_CHECK` macro which throws a generic `RuntimeError`. This change introduced a new error type `DistBackendError` which derives from `RuntimeError` to signify there was an error with the backend communication library. This allows for better error handling and analysis at higher levels in the stack. Motivation: https://docs.google.com/document/d/1j6VPOkC6znscliFuiDWMuMV1_fH4Abgdq7TCHMcXai4/edit#heading=h.a9rc38misyx8

Changes:
- introduce new error type
- Update `C10D_NCCL_CHECK`

Sample script to demonstrate new error type

```python
# python -m torch.distributed.run --nproc_per_node=2 <script>.py

import torch
import torch.distributed as dist

if __name__ == "__main__":
    dist.init_process_group("nccl")
    dist.broadcast(torch.tensor([1, 2, 3]).cuda(), 0)
```

Differential Revision: [D40998803](https://our.internmc.facebook.com/intern/diff/D40998803)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88134
Approved by: https://github.com/rohan-varma
---
 c10/util/Exception.h                      |  6 ++++++
 docs/source/distributed.rst               |  7 +++++++
 test/distributed/test_c10d_nccl.py        | 10 ++++++++++
 torch/_C/__init__.pyi.in                  |  3 +++
 torch/csrc/Exceptions.cpp                 | 12 +++++++++++-
 torch/csrc/Exceptions.h                   |  4 +++-
 torch/csrc/distributed/c10d/NCCLUtils.hpp |  2 +-
 torch/distributed/__init__.py             |  2 ++
 8 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/c10/util/Exception.h b/c10/util/Exception.h
index dc5dcae7f0988..773107f668ae1 100644
--- a/c10/util/Exception.h
+++ b/c10/util/Exception.h
@@ -275,6 +275,12 @@ class C10_API OutOfMemoryError : public Error {
   using Error::Error;
 };
 
+// Used for collective communication library errors from the distributed module.
+// These turn into DistBackendError when they cross into Python.
+class C10_API DistBackendError : public Error {
+  using Error::Error;
+};
+
 // A utility function to return an exception std::string by prepending its
 // exception type before its what() content
 C10_API std::string GetExceptionString(const std::exception& e);
diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst
index 530ff88721048..62e16ebb8a7b7 100644
--- a/docs/source/distributed.rst
+++ b/docs/source/distributed.rst
@@ -832,6 +832,13 @@ following matrix shows how the log level can be adjusted via the combination of
 | ``INFO``                | ``DETAIL``                  | Trace (a.k.a. All)     |
 +-------------------------+-----------------------------+------------------------+
 
+Distributed has a custom Exception type derived from `RuntimeError` called `torch.distributed.DistBackendError`. This exception is thrown when a backend-specific error occurs. For example, if
+the `NCCL` backend is used and the user attempts to use a GPU that is not available to the `NCCL` library.
+
+.. autoclass:: torch.distributed.DistBackendError
+
+.. warning::
+    The DistBackendError exception type is an experimental feature is subject to change.
 
 .. Distributed modules that are missing specific entries.
 .. Adding them here for tracking purposes until they are more permanently fixed.
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 78b1cbbe676cf..5d412dd3fb1b0 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -1025,6 +1025,16 @@ def test_send_recv(self):
             with self.assertRaisesRegex(RuntimeError, 'Tensors must be contiguous'):
                 dist.send(send_tensor_view, 1)
 
+    @requires_nccl()
+    @sandcastle_skip_if(torch.cuda.device_count() < 1, "NCCL test requires 1 GPU")
+    @skip_if_lt_x_gpu(1)
+    def test_nccl_dist_backend_error(self):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        self._create_process_group_nccl(store, self.opts())
+
+        # Both rank 0 and 1 will use the same CUDA device resulting in ncclInvalidUsage
+        with self.assertRaises(dist.DistBackendError):
+            dist.broadcast(torch.tensor([1, 2, 3]).cuda(), 0)
 
 class DistributedDataParallelTest(
     test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 93df86a6e35be..2d20da2a04f30 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1504,3 +1504,6 @@ def _current_graph_task_id() -> _int: ...
 
 class _OutOfMemoryError:
     pass
+
+class _DistBackendError:
+    pass
diff --git a/torch/csrc/Exceptions.cpp b/torch/csrc/Exceptions.cpp
index b9e4c0a1fca72..67ac3decd6b13 100644
--- a/torch/csrc/Exceptions.cpp
+++ b/torch/csrc/Exceptions.cpp
@@ -13,7 +13,7 @@
 #include <c10/util/StringUtil.h>
 
 PyObject *THPException_FatalError, *THPException_LinAlgError,
-    *THPException_OutOfMemoryError;
+    *THPException_OutOfMemoryError, *THPException_DistBackendError;
 
 #define ASSERT_TRUE(cond) \
   if (!(cond))            \
@@ -63,6 +63,16 @@ could not be completed because the input matrix is singular.",
       PyModule_AddObject(
           module, "_OutOfMemoryError", THPException_OutOfMemoryError) == 0);
 
+  ASSERT_TRUE(
+      THPException_DistBackendError = PyErr_NewExceptionWithDoc(
+          "torch.distributed.DistBackendError",
+          "Exception raised when a backend error occurs in distributed",
+          PyExc_RuntimeError,
+          nullptr));
+  ASSERT_TRUE(
+      PyModule_AddObject(
+          module, "_DistBackendError", THPException_DistBackendError) == 0);
+
   return true;
 }
 
diff --git a/torch/csrc/Exceptions.h b/torch/csrc/Exceptions.h
index 666f240764217..c9069a4a7c5b7 100644
--- a/torch/csrc/Exceptions.h
+++ b/torch/csrc/Exceptions.h
@@ -75,6 +75,8 @@ static inline void PyErr_SetString(PyObject* type, const std::string& message) {
   _CATCH_GENERIC_ERROR(LinAlgError, THPException_LinAlgError, retstmnt) \
   _CATCH_GENERIC_ERROR(                                                 \
       OutOfMemoryError, THPException_OutOfMemoryError, retstmnt)        \
+  _CATCH_GENERIC_ERROR(                                                 \
+      DistBackendError, THPException_DistBackendError, retstmnt)        \
   _CATCH_GENERIC_ERROR(Error, PyExc_RuntimeError, retstmnt)             \
   catch (torch::PyTorchError & e) {                                     \
     auto msg = torch::processErrorMsg(e.what());                        \
@@ -146,7 +148,7 @@ static inline void PyErr_SetString(PyObject* type, const std::string& message) {
 #define END_HANDLE_TH_ERRORS END_HANDLE_TH_ERRORS_RET(nullptr)
 
 extern PyObject *THPException_FatalError, *THPException_LinAlgError,
-    *THPException_OutOfMemoryError;
+    *THPException_OutOfMemoryError, *THPException_DistBackendError;
 
 // Throwing this exception means that the python error flags have been already
 // set and control should be immediately returned to the interpreter.
diff --git a/torch/csrc/distributed/c10d/NCCLUtils.hpp b/torch/csrc/distributed/c10d/NCCLUtils.hpp
index a9bea0e67d7bb..fb5d91d2e11cf 100644
--- a/torch/csrc/distributed/c10d/NCCLUtils.hpp
+++ b/torch/csrc/distributed/c10d/NCCLUtils.hpp
@@ -55,7 +55,7 @@
       std::string err = "NCCL error in: " + std::string(__FILE__) + ":" +     \
           std::to_string(__LINE__) + ", " + ncclGetErrorWithVersion(result) + \
           "\n" + getNcclErrorDetailStr(result, failureReason);                \
-      TORCH_CHECK(false, err);                                                \
+      TORCH_CHECK_WITH(DistBackendError, false, err);                         \
     }                                                                         \
   } while (0)
 
diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py
index fb7edffb96010..26d58968a3819 100644
--- a/torch/distributed/__init__.py
+++ b/torch/distributed/__init__.py
@@ -19,6 +19,8 @@ def is_available() -> bool:
 if is_available() and not torch._C._c10d_init():
     raise RuntimeError("Failed to initialize torch.distributed")
 
+# Custom Runtime Errors thrown from the distributed package
+DistBackendError = torch._C._DistBackendError
 
 if is_available():
     from torch._C._distributed_c10d import (

From d35e88ef24b67a9060d01bf7be198f93505fc426 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Mon, 7 Nov 2022 10:23:18 -0500
Subject: [PATCH 0659/1922] Fix categorization of sym_int method (#88369)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88369
Approved by: https://github.com/ezyang, https://github.com/bdhirsh, https://github.com/anjali411
---
 test/test_dynamic_shapes.py              | 10 ----------
 torch/fx/experimental/symbolic_shapes.py |  1 +
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index 0d421b04008d2..e804c8ff7ff03 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -420,16 +420,6 @@ def print_seen():
     ('floordiv', 'int', 'SymFloat'),  # unsupported operand type(s) for //: 'int' and 'SymFloat'
     ('floordiv', 'SymInt', 'SymFloat'),  # Cannot convert complex to float
     ('mod', 'int', 'SymFloat'),  # unsupported operand type(s) for %: 'int' and 'SymFloat'
-    ('sym_int', 'int', 'float'),  # sym_int() takes 1 positional argument but 2 were given
-    ('sym_int', 'SymInt', 'float'),  # sym_int() takes 1 positional argument but 2 were given
-    ('sym_int', 'int', 'SymFloat'),  # sym_int() takes 1 positional argument but 2 were given
-    ('sym_int', 'SymInt', 'SymFloat'),  # sym_int() takes 1 positional argument but 2 were given
-    ('sym_int', 'int', 'int'),  # sym_int() takes 1 positional argument but 2 were given
-    ('sym_int', 'SymInt', 'int'),  # sym_int() takes 1 positional argument but 2 were given
-    ('sym_int', 'int', 'SymInt'),  # sym_int() takes 1 positional argument but 2 were given
-    ('sym_int', 'SymInt', 'SymInt'),  # sym_int() takes 1 positional argument but 2 were given
-
-
 }
 
 @skipIfTorchDynamo("Creating ShapeEnv fails for confusing reasons (also we never expect dynamo to see code like this)")
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index bb59d8f5470cf..7bc9f566bce32 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -265,6 +265,7 @@ def _nyi():
 
 unary_magic_methods = {
     'sym_float',
+    'sym_int',
     'ceil',
     'neg',
 }

From 2b565238cd7dc01a3407cf234b3e167e294e3643 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Mon, 7 Nov 2022 10:24:20 -0500
Subject: [PATCH 0660/1922] Clean up SymFloat binding to cover all functions
 (#88370)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88370
Approved by: https://github.com/ezyang
---
 test/functorch/test_aotdispatch.py       |  1 -
 test/test_dynamic_shapes.py              | 21 +++++++++++++++------
 torch/fx/experimental/symbolic_shapes.py |  5 +++--
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 20bb64176b8d9..82ababe8c947c 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1152,7 +1152,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.grid_sample', ''),  # prims::arange() Expected a value of type 'number' for argument...
     xfail('nn.functional.group_norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.zeros_like.default - couldn't find symbolic meta...
-    xfail('nn.functional.huber_loss', ''),  # Unable to cast Python instance to C++ type (#define PYBIND11_DE...
     xfail('nn.functional.interpolate', 'area'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'bicubic'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'bilinear'),  # Cannot call sizes() on tensor with symbolic sizes/str...
diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index e804c8ff7ff03..b23af9bbfb67c 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -406,7 +406,11 @@ def print_seen():
     out = []
     for key, reason in seen_failed:
         # Make sure the generated line is lint clean
-        out.append(f"    {key},  # {reason}"[:120])
+        msg = f"    {key},  # {reason}"
+        eol = msg.find("\n")
+        if eol != -1:
+            msg = msg[:eol]
+        out.append(msg[:120])
 
     print("expected_failure_sym_magic_methods = {")
     print("\n".join(out))
@@ -416,10 +420,15 @@ def print_seen():
     atexit.register(print_seen)
 
 expected_failure_sym_magic_methods = {
+    ('floordiv', 'SymFloat', 'float'),  # Cannot convert complex to float
+    ('floordiv', 'float', 'SymFloat'),  # Cannot convert complex to float
+    ('floordiv', 'SymFloat', 'SymFloat'),  # Cannot convert complex to float
+    ('floordiv', 'SymFloat', 'int'),  # Scalars are not close!
+    ('floordiv', 'float', 'SymInt'),  # Scalars are not close!
+    ('floordiv', 'SymFloat', 'SymInt'),  # Scalars are not close!
     ('floordiv', 'SymInt', 'float'),  # Cannot convert complex to float
-    ('floordiv', 'int', 'SymFloat'),  # unsupported operand type(s) for //: 'int' and 'SymFloat'
+    ('floordiv', 'int', 'SymFloat'),  # Cannot convert complex to float
     ('floordiv', 'SymInt', 'SymFloat'),  # Cannot convert complex to float
-    ('mod', 'int', 'SymFloat'),  # unsupported operand type(s) for %: 'int' and 'SymFloat'
 }
 
 @skipIfTorchDynamo("Creating ShapeEnv fails for confusing reasons (also we never expect dynamo to see code like this)")
@@ -441,12 +450,12 @@ def maybe_xfail(inp1, inp2):
                 def context():
                     try:
                         yield
-                    except TypeError as e:
+                    except (TypeError, AssertionError) as e:
                         seen_failed.append((key, str(e)))
                 return context()
 
             if key in expected_failure_sym_magic_methods:
-                return self.assertRaises(TypeError)
+                return self.assertRaises((TypeError, AssertionError))
             else:
                 return contextlib.nullcontext()
 
@@ -517,7 +526,7 @@ def guard_fn(v):
     @parametrize("first_type", ["int", "float"])
     @parametrize("second_type", ["int", "float"])
     def test_method(self, fn, first_type, second_type):
-        if first_type == "float" and fn not in symbolic_shapes.float_magic_methods:
+        if first_type == "float" and fn in symbolic_shapes.magic_methods_not_on_float:
             self.skipTest(f"{fn} is not a float magic method")
 
         is_unary_fn = fn in symbolic_shapes.unary_magic_methods
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 7bc9f566bce32..82e1d5107d790 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -270,7 +270,8 @@ def _nyi():
     'neg',
 }
 
-float_magic_methods = {"add", "sub", "mul", "truediv", "ceil", "floor", "eq", "gt", "lt", "le", "ge", "pow"}
+# TODO: sym_int should also work on floats
+magic_methods_not_on_float = {"sym_int"}
 
 magic_methods_on_builtins = {"min", "max"}
 magic_methods_on_math = {"ceil", "floor"}
@@ -384,7 +385,7 @@ def rbinary_magic_impl(self, other):
     _make_user_magic(method, SymInt)
 
 for method, func in magic_methods.items():
-    if method not in float_magic_methods:
+    if method in magic_methods_not_on_float:
         continue
     _make_user_magic(method, SymFloat)
 

From 89b5a19eda9193dcf2e949ff98cb3b613ce5dd77 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Tue, 8 Nov 2022 02:22:01 +0000
Subject: [PATCH 0661/1922] Dynamo DDP accuracy bench uses
 find_unused_parameters (#88645)

- find_unused_parameters adds a slight overhead, but is required
  in cases where users do not manually specify parameters to ignore
  which will not receive grads.  In some models, some parameters
  do not receive grads, and this causes DDP to throw an exception
  as it waits for a grad for each parameter

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88645
Approved by: https://github.com/soumith
---
 benchmarks/dynamo/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index b41085d099a2f..c332f47562982 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1069,7 +1069,7 @@ def record_status(accuracy_status):
         def deepcopy_and_maybe_ddp(model):
             model = copy.deepcopy(model)
             if self.args.ddp:
-                model = DDP(model)
+                model = DDP(model, find_unused_parameters=True)
             return model
 
         # Collect the fp64 reference outputs to be used later for accuracy checking.

From 157d3758df2d5f0624238fa7c1bdff3b8dea95a5 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Tue, 8 Nov 2022 17:06:28 +0000
Subject: [PATCH 0662/1922] enable scalar reduction with dim=-1 (#88628)

Tested with all samples for `sum`, but also fixes all samples errors on other reductions (amin, amax, any, all etc)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88628
Approved by: https://github.com/desertfire
---
 test/inductor/test_torchinductor_opinfo.py | 5 +----
 torch/_inductor/lowering.py                | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index b06d372b20d7b..03b5138f1dc2d 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -141,11 +141,7 @@ def process(device_type):
     "linalg.pinv.singular": {f32, f64},
     "linalg.householder_product": {f32},
     # These might be passing now?
-    "T": {b8, f16, f32, f64, i32, i64},
-    "H": {b8, f16, f32, f64, i32, i64},
     "__getitem__": {b8, f16, f32, f64, i32, i64},
-    "acos": {b8, f16, f32, f64, i32, i64},
-    "acosh": {b8, f16, f32, f64, i32, i64},
     "nn.functional.conv_transpose3d": {f16},
     "max.reduction_with_dim": {i32, i64},
     "min.reduction_with_dim": {i32, i64},
@@ -447,6 +443,7 @@ def wrapper_set_seed(op, *args, **kwargs):
     "select_scatter",
     "squeeze",
     "unsqueeze",
+    "sum",
 }
 
 
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 71f038b231259..d83fbba25ddaf 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -3002,7 +3002,7 @@ def _validate_reduction_axis(x, axis):
     axis = list(axis)
     for i in range(len(axis)):
         if axis[i] < 0:
-            axis[i] += len(size)
+            axis[i] += len(size) if len(size) else 1
         assert 0 <= axis[i] < len(size) or (len(size) == 0 and axis[i] == 0)
     assert len(set(axis)) == len(axis), "reduction axis not unique"
     return axis

From 4212ff91cb0e769cea75d670f3f6a31acb1233ca Mon Sep 17 00:00:00 2001
From: Kurt Mohler <kmohler@quansight.com>
Date: Tue, 8 Nov 2022 18:11:01 +0000
Subject: [PATCH 0663/1922] Deprecate TypedStorage, its derived classes, and
 all of their public methods (#85303)

Part of #85302

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85303
Approved by: https://github.com/ezyang
---
 docs/source/storage.rst                       |   4 +
 test/test_autograd.py                         |   2 +-
 test/test_cuda.py                             |   2 +-
 test/test_torch.py                            | 121 +++++++++
 test/test_view_ops.py                         |   2 +-
 .../templates/python_variable_methods.cpp     |   2 +-
 torch/__init__.py                             |  87 +++++-
 torch/_deploy.py                              |   2 +-
 torch/_dynamo/optimizations/analysis.py       |   2 +-
 torch/_dynamo/optimizations/distributed.py    |   4 +-
 torch/_dynamo/optimizations/training.py       |   4 +-
 torch/_prims/__init__.py                      |   4 +-
 torch/_subclasses/fake_tensor.py              |   8 +-
 torch/_subclasses/fake_utils.py               |   6 +-
 torch/_subclasses/meta_utils.py               |  16 +-
 torch/_tensor.py                              |  34 ++-
 torch/_utils.py                               |   4 +-
 torch/csrc/DynamicTypes.cpp                   |   2 +-
 torch/cuda/__init__.py                        |  63 ++++-
 torch/cuda/_dynamo_graphs.py                  |   4 +-
 .../_shard/checkpoint/filesystem.py           |   2 +-
 torch/distributed/distributed_c10d.py         |   2 +-
 torch/distributed/fsdp/_utils.py              |  14 +-
 torch/distributed/fsdp/flat_param.py          |  16 +-
 .../pipeline/sync/_balance/profile.py         |   2 +-
 torch/distributed/pipeline/sync/stream.py     |   2 +-
 torch/fx/passes/reinplace.py                  |  36 ++-
 torch/multiprocessing/reductions.py           |  21 +-
 torch/overrides.py                            |   1 +
 torch/package/package_exporter.py             |   2 +-
 torch/package/package_importer.py             |   6 +-
 torch/serialization.py                        |  39 +--
 torch/storage.py                              | 252 +++++++++++++-----
 torch/testing/_comparison.py                  |  27 +-
 torch/testing/_internal/schema_check_mode.py  |   6 +-
 torch/utils/bundled_inputs.py                 |   4 +-
 torch/utils/data/_utils/collate.py            |   2 +-
 37 files changed, 631 insertions(+), 176 deletions(-)

diff --git a/docs/source/storage.rst b/docs/source/storage.rst
index 28cf4444fbc97..84fed2f659a7b 100644
--- a/docs/source/storage.rst
+++ b/docs/source/storage.rst
@@ -22,6 +22,10 @@ holds the data as an untyped array of bytes.
 Every strided :class:`torch.Tensor` contains a :class:`torch.TypedStorage`,
 which stores all of the data that the :class:`torch.Tensor` views.
 
+.. warning::
+  All storage classes except for :class:`torch.UntypedStorage` will be removed
+  in the future, and :class:`torch.UntypedStorage` will be used in all cases.
+
 .. autoclass:: torch.TypedStorage
    :members:
    :undoc-members:
diff --git a/test/test_autograd.py b/test/test_autograd.py
index 7df0b1ddae388..dd3ecf3323d38 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -6805,8 +6805,8 @@ def pack(x):
         with torch.autograd.graph.saved_tensors_hooks(pack, lambda x: x):
             a = torch.ones(5, requires_grad=True)
 
-            warnings.simplefilter('always')
             with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter('always')
                 y = a * a
                 # should raise two warnings from a being saved twice
                 self.assertEqual(len(w), 2)
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 9128ea0937151..9ecafc45103b6 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -595,7 +595,7 @@ def test_serialization_array_with_storage(self):
         self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor))
         self.assertTrue(isinstance(q_copy[2], torch.cuda.FloatTensor))
         self.assertTrue(isinstance(q_copy[3], torch.storage.TypedStorage))
-        self.assertTrue(isinstance(q_copy[3]._storage, torch.UntypedStorage))
+        self.assertTrue(isinstance(q_copy[3]._untyped_storage, torch.UntypedStorage))
         q_copy[1].fill_(10)
         self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10))
 
diff --git a/test/test_torch.py b/test/test_torch.py
index 2247d18285d55..82d0807d81a72 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -6470,6 +6470,127 @@ def test_storage_casts(self):
         self.assertEqual(complexdouble_storage.type(), 'torch.ComplexDoubleStorage')
         self.assertIs(complexdouble_storage.dtype, torch.complex128)
 
+    # Test that internal versions of functions related to TypedStorage do not
+    # produce a deprecation warning
+    def test_typed_storage_internal_no_warning(self):
+        s0 = torch.FloatStorage(10)
+        s0_untyped = s0.untyped()
+        t0 = torch.randn(10)
+
+        funcs = [
+            lambda: torch.FloatStorage(_internal=True),
+            lambda: torch.TypedStorage(
+                dtype=torch.float,
+                device='cpu',
+                _internal=True),
+            lambda: torch.TypedStorage(
+                wrap_storage=s0_untyped,
+                dtype=s0.dtype,
+                _internal=True),
+            lambda: torch.FloatStorage._dtype,
+            lambda: s0._resize_(20),
+            lambda: s0._size(),
+            lambda: s0._untyped_storage,
+            lambda: s0._is_shared(),
+            lambda: s0._share_memory_(),
+            lambda: s0._pickle_storage_type(),
+            lambda: s0._setitem(slice(0, s0._size()), 1),
+            lambda: s0._element_size(),
+            lambda: s0._deepcopy({}),
+            lambda: s0._data_ptr(),
+            lambda: s0._nbytes(),
+            lambda: t0._typed_storage(),
+        ]
+
+        if torch.cuda.is_available():
+            s1 = torch.cuda.FloatStorage(10)
+            s1_untyped = s1.untyped()
+            t1 = torch.randn(10, device='cuda')
+
+            funcs += [
+                lambda: torch.cuda.FloatStorage(_internal=True),
+                lambda: torch.TypedStorage(
+                    dtype=torch.float,
+                    device='cuda',
+                    _internal=True),
+                lambda: torch.TypedStorage(
+                    wrap_storage=s1_untyped,
+                    dtype=s1.dtype,
+                    _internal=True),
+                lambda: torch.cuda.FloatStorage._dtype,
+                lambda: s1._resize_(20),
+                lambda: s1._size(),
+                lambda: s1._untyped_storage,
+                lambda: s1._is_shared(),
+                lambda: s1._share_memory_(),
+                lambda: s1._pickle_storage_type(),
+                lambda: s1._setitem(slice(0, s1._size()), 1),
+                lambda: s1._element_size(),
+                lambda: s1._deepcopy({}),
+                lambda: s1._data_ptr(),
+                lambda: s1._nbytes(),
+                lambda: t1._typed_storage(),
+            ]
+
+        # Check that each of the TypedStorage internal function calls do not
+        # produce a deprecation warning
+        for f in funcs:
+            with warnings.catch_warnings():
+                warnings.filterwarnings('error', "TypedStorage is deprecated")
+                f()
+
+    # Test that public functions related to TypedStorage produce a deprecation
+    # warning
+    def test_typed_storage_deprecation_warning(self):
+        s0 = torch.FloatStorage(10)
+        funcs = [
+            lambda: torch.FloatStorage(),
+            lambda: torch.FloatStorage.dtype,
+            lambda: s0.fill_(0),
+            lambda: s0.is_cuda,
+            lambda: s0.untyped(),
+            lambda: len(s0),
+            lambda: s0[0],
+        ]
+
+        if torch.cuda.is_available():
+            s1 = torch.cuda.FloatStorage(10)
+            funcs += [
+                lambda: torch.cuda.FloatStorage(),
+                lambda: torch.cuda.FloatStorage.dtype,
+                lambda: s1.fill_(0),
+                lambda: s1.is_cuda,
+                lambda: s1.untyped(),
+                lambda: len(s1),
+                lambda: s1[0],
+            ]
+
+        # Check that each of the TypedStorage function calls produce a warning
+        # if warnings are reset between each
+        for f in funcs:
+            with warnings.catch_warnings(record=True) as w:
+                warnings.resetwarnings()
+                f()
+                self.assertEqual(len(w), 1)
+                warning = w[0].message
+                self.assertTrue(warning, DeprecationWarning)
+                self.assertTrue(re.search(
+                    '^TypedStorage is deprecated',
+                    str(warning)))
+
+        # Check that only one warning is raised from calling multiple
+        # TypedStorage functions if warnings are not reset between each
+        with warnings.catch_warnings(record=True) as w:
+            warnings.resetwarnings()
+            for f in funcs:
+                f()
+            self.assertEqual(len(w), 1)
+            warning = w[0].message
+            self.assertTrue(warning, DeprecationWarning)
+            self.assertTrue(re.search(
+                '^TypedStorage is deprecated',
+                str(warning)))
+
     def test_from_file(self):
         def assert_with_filename(filename):
             size = 10000
diff --git a/test/test_view_ops.py b/test/test_view_ops.py
index 3c5987e65ae75..c4729557c416b 100644
--- a/test/test_view_ops.py
+++ b/test/test_view_ops.py
@@ -102,7 +102,7 @@ def is_view_of(self, base, other):
         # Note: only validates storage on native device types
         # because some accelerators, like XLA, do not expose storage
         if base.device.type == 'cpu' or base.device.type == 'cuda':
-            if base.storage().data_ptr() != other.storage().data_ptr():
+            if base._storage().data_ptr() != other._storage().data_ptr():
                 return False
 
         return True
diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp
index e3c0a8b987bd6..2cd847b734050 100644
--- a/tools/autograd/templates/python_variable_methods.cpp
+++ b/tools/autograd/templates/python_variable_methods.cpp
@@ -979,7 +979,7 @@ static PyObject * THPVariable_storage(PyObject* self, PyObject* arg)
 {
   HANDLE_TH_ERRORS
   if (check_has_torch_function(self)) {
-    return handle_torch_function(self, "storage");
+    return handle_torch_function(self, "_storage");
   }
   auto& self_ = THPVariable_Unpack(self);
   return createPyObject(self_.storage());
diff --git a/torch/__init__.py b/torch/__init__.py
index 1a645f53a8a2d..ae55f5975542f 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -709,7 +709,7 @@ def is_warn_always_enabled():
 ################################################################################
 
 from ._tensor import Tensor
-from .storage import _StorageBase, TypedStorage, _LegacyStorage, UntypedStorage
+from .storage import _StorageBase, TypedStorage, _LegacyStorage, UntypedStorage, _warn_typed_storage_removal
 
 # NOTE: New <type>Storage classes should never be added. When adding a new
 # dtype, use torch.storage.TypedStorage directly.
@@ -717,86 +717,171 @@ def is_warn_always_enabled():
 class ByteStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.uint8
 
 class DoubleStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.double
 
 class FloatStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.float
 
 class HalfStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.half
 
 class LongStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.long
 
 class IntStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.int
 
 class ShortStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.short
 
 class CharStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.int8
 
 class BoolStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.bool
 
 class BFloat16Storage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.bfloat16
 
 class ComplexDoubleStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.cdouble
 
 class ComplexFloatStorage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.cfloat
 
 class QUInt8Storage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.quint8
 
 class QInt8Storage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.qint8
 
 class QInt32Storage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.qint32
 
 class QUInt4x2Storage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.quint4x2
 
 class QUInt2x4Storage(_LegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.quint2x4
 
 _storage_classes = {
diff --git a/torch/_deploy.py b/torch/_deploy.py
index 53769538b6c11..30c022eac8793 100644
--- a/torch/_deploy.py
+++ b/torch/_deploy.py
@@ -23,7 +23,7 @@ def persistent_id(obj):
             if isinstance(obj, torch.storage.TypedStorage):
                 # TODO: Once we decide to break serialization FC, we can
                 # remove this case
-                storage = obj._storage
+                storage = obj._untyped_storage
                 dtype = obj.dtype
             else:
                 storage = obj
diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index 0af70bfa9581d..b3f6ed79eb06f 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -27,7 +27,7 @@ def __init__(self, *args, **kwargs):
 
     def tensor_alias_group(self, value: torch.Tensor):
         """Assign a unique identifier to the storage of a given tensor"""
-        storage = StorageWeakRef(value.storage())
+        storage = StorageWeakRef(value._typed_storage())
         alias_group = self.storage_to_alias_group.get(storage)
         if alias_group is None:
             alias_group = next(self.make_alias_group)
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index bde786979fcfe..b71d85c4e34f8 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -157,7 +157,7 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
                 for name, p in target.named_parameters():
                     param = target.get_parameter(name)
                     if p.requires_grad and not self._ignore_parameter(param):
-                        buckets[0].size += p.storage().nbytes()
+                        buckets[0].size += p._storage().nbytes()
                         buckets[0].params.append(f"{node.target}_{name}")
                         buckets[0].param_ids.append(id(param))
             elif node.op == "get_attr":
@@ -165,7 +165,7 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
                 if maybe_param.requires_grad and not self._ignore_parameter(
                     maybe_param
                 ):
-                    buckets[0].size += maybe_param.storage().nbytes()
+                    buckets[0].size += maybe_param._storage().nbytes()
                     buckets[0].params.append(node.target)
                     buckets[0].param_ids.append(id(maybe_param))
 
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 588956a898f41..af673a2b2c1e7 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -381,7 +381,7 @@ def meta_fk(meta):
     mutated_inputs = set()
     for n in g.nodes:
         if n.op == "placeholder":
-            inputs[StorageWeakRef(meta_fk(n.meta).storage())].add(input_idx)
+            inputs[StorageWeakRef(meta_fk(n.meta)._typed_storage())].add(input_idx)
             input_idx += 1
         elif n.op == "call_function":
             if n.target is operator.getitem:
@@ -402,7 +402,7 @@ def meta_fk(meta):
                     # TODO: not correct for args that contain tensors in a struct
                     # like list
                     mutated_inputs |= inputs[
-                        StorageWeakRef(meta_fk(argument.meta).storage())
+                        StorageWeakRef(meta_fk(argument.meta)._typed_storage())
                     ]
         # TODO: error on unrecognized nodes
     return mutated_inputs
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index 6d40e1071fb53..c40960a22445c 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1158,7 +1158,9 @@ def _as_strided_meta(
         # as_strided to shapes with no elements are trivially valid, so it's OK
         pass
     elif isinstance(a, torch.Tensor):
-        utils.check_in_bounds_for_storage(a.storage(), size, stride, storage_offset)
+        utils.check_in_bounds_for_storage(
+            a._typed_storage(), size, stride, storage_offset
+        )
 
     return TensorMeta(a, shape=size, strides=stride)
 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 796b15fedf103..fa58ce23c4437 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -156,7 +156,7 @@ def add_constant_storage_mapping(self, fake_tensor):
         # const_tensor.add_(torch.rand([1]))
         # all aliases of it must become no longer const
         assert isinstance(fake_tensor, FakeTensor) and fake_tensor.constant is not None
-        weak_st = StorageWeakRef(fake_tensor.constant.storage())
+        weak_st = StorageWeakRef(fake_tensor.constant._typed_storage())
 
         # we need a map from a weak storage to all of its corresponding
         # constant tensors. python doesn't have the weak value equivalent
@@ -168,7 +168,7 @@ def add_constant_storage_mapping(self, fake_tensor):
     def invalidate_constant_aliases(self, tensor):
         assert not isinstance(tensor, FakeTensor)
 
-        weak_st = StorageWeakRef(tensor.storage())
+        weak_st = StorageWeakRef(tensor._typed_storage())
         if weak_st not in self.constant_storage_mapping:
             return
 
@@ -1043,7 +1043,7 @@ def to_real_tensor(e):
     for e in tree_flatten((args, kwargs))[0]:
         if isinstance(e, torch.Tensor):
             if not e.is_sparse:
-                storages.add(e.storage()._cdata)
+                storages.add(e._typed_storage()._cdata)
 
     # TODO: also check metadata change on inputs
     # proper aliasing/metadata relationship between outputs and inputs will
@@ -1053,7 +1053,7 @@ def to_real_tensor(e):
         if id(e) not in inp_impls and (
             isinstance(e, torch.Tensor)
             and not e.is_sparse
-            and e.storage()._cdata in storages
+            and e._typed_storage()._cdata in storages
         ):
             raise orig_not_implemented_exception
 
diff --git a/torch/_subclasses/fake_utils.py b/torch/_subclasses/fake_utils.py
index 37ff260c9bd30..d23b12ca84409 100644
--- a/torch/_subclasses/fake_utils.py
+++ b/torch/_subclasses/fake_utils.py
@@ -18,12 +18,12 @@
 
 def outputs_alias_inputs(outputs, inputs):
     input_storages = {
-        inp.storage()._cdata
+        inp._typed_storage()._cdata
         for inp in tree_flatten_only(torch.Tensor, inputs)
         if torch._C._has_storage(inp)
     }
     return any(
-        torch._C._has_storage(out) and out.storage()._cdata in input_storages
+        torch._C._has_storage(out) and out._typed_storage()._cdata in input_storages
         for out in tree_flatten_only(torch.Tensor, outputs)
     )
 
@@ -38,7 +38,7 @@ def output_alias_each_other(outputs):
     for out in tree_flatten_only(torch.Tensor, outputs):
         if not torch._C._has_storage(out):
             continue
-        stor = out.storage()._cdata
+        stor = out._typed_storage()._cdata
         if stor in storages:
             return True
         storages.add(stor)
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 7e2039f1764f2..081f7aa632f91 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -143,7 +143,7 @@ def set_tensor_memo(self, t, v):
         if t.is_sparse:
             weak_st = None
         else:
-            weak_st = StorageWeakRef(t.storage())
+            weak_st = StorageWeakRef(t._typed_storage())
         tensor_ref_key = WeakTensorRefKey(t)
 
         def del_ten():
@@ -179,13 +179,9 @@ def meta_storage(self, s, callback):
         # Use a Weak Ref to s in order to not leak memory
         swr = StorageWeakRef(s)
         if swr not in self.storage_memo:
-            self.storage_memo[swr] = (
-                callback(
-                    lambda: torch.empty(s.size(), dtype=torch.uint8, device="meta")
-                )
-                .storage()
-                .untyped()
-            )
+            self.storage_memo[swr] = callback(
+                lambda: torch.empty(s.size(), dtype=torch.uint8, device="meta")
+            )._storage()
         return self.storage_memo[swr]
 
     # This function assumes that it's possible to do the conversion
@@ -362,7 +358,7 @@ def is_c_of_r(complex_dtype, real_dtype):
                                 # format here
                                 r = r.clone(memory_format=torch.preserve_format)
 
-                    s = t.storage().untyped()
+                    s = t._storage()
                     swr = StorageWeakRef(s)
                     if (
                         swr not in self.storage_memo
@@ -370,7 +366,7 @@ def is_c_of_r(complex_dtype, real_dtype):
                         and r.storage_offset() == storage_offset
                     ):
                         # You're normal and happy, install the fresh storage into the memo
-                        self.storage_memo[swr] = r.storage().untyped()
+                        self.storage_memo[swr] = r._storage()
                     else:
                         # You're in crazy town; somehow you gave us a tensor
                         # that wasn't a view, but had nonzero storage offset,
diff --git a/torch/_tensor.py b/torch/_tensor.py
index d0af241c8a221..8ac1ac1eb7361 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -132,7 +132,7 @@ def __deepcopy__(self, memo):
                         "different type."
                     )
             else:
-                new_storage = self.storage().__deepcopy__(memo)
+                new_storage = self._typed_storage()._deepcopy(memo)
                 if self.is_quantized:
                     # quantizer_params can be different type based on torch attribute
                     quantizer_params: Union[
@@ -163,7 +163,9 @@ def __deepcopy__(self, memo):
                     # need to wrap with TypedStorage
                     new_tensor = torch._utils._rebuild_qtensor(
                         torch.storage.TypedStorage(
-                            wrap_storage=new_storage.untyped(), dtype=self.dtype
+                            wrap_storage=new_storage._untyped_storage,
+                            dtype=self.dtype,
+                            _internal=True,
                         ),
                         self.storage_offset(),
                         self.size(),
@@ -257,7 +259,17 @@ def storage(self):
         if has_torch_function_unary(self):
             return handle_torch_function(Tensor.storage, (self,), self)
 
-        return torch.TypedStorage(wrap_storage=self._storage(), dtype=self.dtype)
+        torch.storage._warn_typed_storage_removal()
+        return self._typed_storage()
+
+    # For internal use only, to avoid raising deprecation warning
+    def _typed_storage(self):
+        _storage = self._storage()
+        if isinstance(_storage, torch.TypedStorage):
+            _storage = _storage._untyped_storage
+        return torch.TypedStorage(
+            wrap_storage=_storage, dtype=self.dtype, _internal=True
+        )
 
     def _reduce_ex_internal(self, proto):
         check_serializing_named_tensor(self)
@@ -331,7 +343,9 @@ def _reduce_ex_internal(self, proto):
             # need to wrap with TypedStorage
             args_qtensor = (
                 torch.storage.TypedStorage(
-                    wrap_storage=self.storage().untyped(), dtype=self.dtype
+                    wrap_storage=self._typed_storage()._untyped_storage,
+                    dtype=self.dtype,
+                    _internal=True,
                 ),
                 self.storage_offset(),
                 tuple(self.size()),
@@ -389,7 +403,9 @@ def _reduce_ex_internal(self, proto):
             # need to wrap with TypedStorage
             args = (
                 torch.storage.TypedStorage(
-                    wrap_storage=self.storage().untyped(), dtype=self.dtype
+                    wrap_storage=self._typed_storage()._untyped_storage,
+                    dtype=self.dtype,
+                    _internal=True,
                 ),
                 self.storage_offset(),
                 tuple(self.size()),
@@ -607,7 +623,7 @@ def is_shared(self):
         """
         if has_torch_function_unary(self):
             return handle_torch_function(Tensor.is_shared, (self,), self)
-        return self.storage().is_shared()
+        return self._typed_storage()._is_shared()
 
     def share_memory_(self):
         r"""Moves the underlying storage to shared memory.
@@ -617,7 +633,7 @@ def share_memory_(self):
         """
         if has_torch_function_unary(self):
             return handle_torch_function(Tensor.share_memory_, (self,), self)
-        self.storage().share_memory_()
+        self._typed_storage()._share_memory_()
         return self
 
     def __reversed__(self):
@@ -1059,7 +1075,9 @@ def storage_type(self):
         if has_torch_function_unary(self):
             return handle_torch_function(Tensor.storage_type, (self,), self)
 
-        return self.storage()._get_legacy_storage_class()
+        torch.storage._warn_typed_storage_removal()
+
+        return self._typed_storage()._get_legacy_storage_class()
 
     def refine_names(self, *names):
         r"""Refines the dimension names of :attr:`self` according to :attr:`names`.
diff --git a/torch/_utils.py b/torch/_utils.py
index 8a539d75f5657..f178cfbaea4ae 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -143,8 +143,8 @@ def _get_async_or_non_blocking(function_name, non_blocking, kwargs):
 # be a TypedStorage
 def _rebuild_tensor(storage, storage_offset, size, stride):
     # first construct a tensor with the correct dtype/device
-    t = torch.tensor([], dtype=storage.dtype, device=storage.untyped().device)
-    return t.set_(storage.untyped(), storage_offset, size, stride)
+    t = torch.tensor([], dtype=storage.dtype, device=storage._untyped_storage.device)
+    return t.set_(storage._untyped_storage, storage_offset, size, stride)
 
 
 def _rebuild_tensor_v2(
diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp
index b3021ffe0d8d8..93bb37017ce0b 100644
--- a/torch/csrc/DynamicTypes.cpp
+++ b/torch/csrc/DynamicTypes.cpp
@@ -135,7 +135,7 @@ at::Storage createStorageGetType(
     TORCH_INTERNAL_ASSERT(THPDtype_Check(dtype_obj));
     scalar_type = reinterpret_cast<THPDtype*>(dtype_obj)->scalar_type;
 
-    untyped_storage_obj = PyObject_GetAttrString(obj, "_storage");
+    untyped_storage_obj = PyObject_GetAttrString(obj, "_untyped_storage");
     TORCH_INTERNAL_ASSERT(untyped_storage_obj);
     Py_DECREF(untyped_storage_obj);
 
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 93fa8cf07ac2a..a684f2291de25 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -737,11 +737,12 @@ def type(self, *args, **kwargs):
 
     __new__ = _lazy_new
 
-from torch.storage import _LegacyStorage
+from torch.storage import _LegacyStorage, _warn_typed_storage_removal
 
 class _CudaLegacyStorage(_LegacyStorage):
     @classmethod
     def from_buffer(cls, *args, **kwargs):
+        _warn_typed_storage_removal()
         raise RuntimeError('from_buffer: Not available for CUDA storage')
 
     @classmethod
@@ -755,61 +756,121 @@ def _new_shared_filename(cls, manager, obj, size, *, device=None, dtype=None):
 class ByteStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.uint8
 
 class DoubleStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.double
 
 class FloatStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.float
 
 class HalfStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.half
 
 class LongStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.long
 
 class IntStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.int
 
 class ShortStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.short
 
 class CharStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.int8
 
 class BoolStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.bool
 
 class BFloat16Storage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.bfloat16
 
 class ComplexDoubleStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.cdouble
 
 class ComplexFloatStorage(_CudaLegacyStorage):
     @classproperty
     def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
         return torch.cfloat
 
 del _LegacyStorage
diff --git a/torch/cuda/_dynamo_graphs.py b/torch/cuda/_dynamo_graphs.py
index 07ebed6fadf0a..6c577c3177762 100644
--- a/torch/cuda/_dynamo_graphs.py
+++ b/torch/cuda/_dynamo_graphs.py
@@ -89,7 +89,7 @@ def find_input_mutations(g):
     mutated_inputs = set()
     for n in g.nodes:
         if n.op == 'placeholder':
-            inputs[StorageWeakRef(n.meta[FK].storage())].add(input_idx)
+            inputs[StorageWeakRef(n.meta[FK]._typed_storage())].add(input_idx)
             input_idx += 1
         elif n.op == 'call_function':
             if n.target is operator.getitem:
@@ -109,7 +109,7 @@ def find_input_mutations(g):
                 if mut_arg:
                     # TODO: not correct for args that contain tensors in a struct
                     # like list
-                    mutated_inputs |= inputs[StorageWeakRef(argument.meta[FK].storage())]
+                    mutated_inputs |= inputs[StorageWeakRef(argument.meta[FK]._typed_storage())]
         # TODO: error on unrecognized nodes
     return mutated_inputs
 
diff --git a/torch/distributed/_shard/checkpoint/filesystem.py b/torch/distributed/_shard/checkpoint/filesystem.py
index ece9000b3ddfb..9788853d9aa66 100644
--- a/torch/distributed/_shard/checkpoint/filesystem.py
+++ b/torch/distributed/_shard/checkpoint/filesystem.py
@@ -51,7 +51,7 @@ class _StoragePrefix:
 
 def _trim(tensor: torch.Tensor) -> torch.Tensor:
     tensor = tensor.detach().cpu()
-    if tensor.storage().size() != tensor.numel():
+    if tensor._typed_storage()._size() != tensor.numel():
         tensor = tensor.clone()
     return tensor
 
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 32b0949a3e348..41d0ee21d3e34 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -1896,7 +1896,7 @@ def all_gather_multigpu(
 def _object_to_tensor(obj, device):
     f = io.BytesIO()
     _pickler(f).dump(obj)
-    byte_storage = torch.ByteStorage.from_buffer(f.getvalue())  # type: ignore[attr-defined]
+    byte_storage = torch.ByteStorage._from_buffer(f.getvalue())  # type: ignore[attr-defined]
     # Do not replace `torch.ByteTensor` or `torch.LongTensor` with torch.tensor and specifying dtype.
     # Otherwise, it will casue 100X slowdown.
     # See: https://github.com/pytorch/pytorch/issues/65696
diff --git a/torch/distributed/fsdp/_utils.py b/torch/distributed/fsdp/_utils.py
index bf7937451a29a..5efb376e66458 100644
--- a/torch/distributed/fsdp/_utils.py
+++ b/torch/distributed/fsdp/_utils.py
@@ -69,14 +69,14 @@ def _alloc_storage(tensor: torch.Tensor, size: torch.Size) -> bool:
         bool: ``True`` if this method allocated storage and ``False`` if the
         storage was already allocated.
     """
-    already_allocated = tensor.storage().size() == size.numel()
+    already_allocated = tensor._typed_storage()._size() == size.numel()
     if not already_allocated:
-        tensor_storage_size = tensor.storage().size()
+        tensor_storage_size = tensor._typed_storage()._size()
         p_assert(
             tensor_storage_size == 0,
             f"Tensor storage should have been resized to be 0 but got {tensor_storage_size}",
         )
-        tensor.storage().resize_(size.numel())
+        tensor._typed_storage()._resize_(size.numel())
     return not already_allocated
 
 
@@ -89,23 +89,23 @@ def _free_storage(tensor: torch.Tensor) -> bool:
         bool: ``True`` if the method freed the storage and ``False`` if the
         storage was already freed.
     """
-    already_freed = tensor.storage().size() == 0
+    already_freed = tensor._typed_storage()._size() == 0
     if not already_freed:
         p_assert(
             tensor.storage_offset() == 0,
             "Freeing a tensor's storage is unsafe when it is not the sole occupant\n"
             f"storage offset: {tensor.storage_offset()}\n"
-            f"storage size: {tensor.storage().size()}\n"
+            f"storage size: {tensor._typed_storage()._size()}\n"
             f"tensor shape: {tensor.shape}",
         )
-        tensor.storage().resize_(0)
+        tensor._typed_storage()._resize_(0)
     return not already_freed
 
 
 def _same_storage(x: torch.Tensor, y: torch.Tensor) -> bool:
     """Returns if ``x`` and ``y`` share the same storage."""
     # NOTE: CPU and GPU tensors are ensured to have different data pointers.
-    return x.storage().data_ptr() == y.storage().data_ptr()
+    return x._typed_storage()._data_ptr() == y._typed_storage()._data_ptr()
 
 
 def p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> None:
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index ee693648fb346..0978f0875a28f 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -493,7 +493,7 @@ def shard(self):
                 flat_param.storage_offset() == 0,
                 "The `FlatParameter` is not the sole occupant of its storage",
             )
-            orig_storage = flat_param.storage()
+            orig_storage = flat_param._typed_storage()
             sharded_flat_param, numel_padded = FlatParamHandle._get_shard(
                 flat_param, self.rank, self.world_size
             )
@@ -501,8 +501,8 @@ def shard(self):
             start = sharded_flat_param.numel() * self.rank
             end = sharded_flat_param.numel() * (self.rank + 1) - 1  # inclusive
             self._init_shard_metadata(numel_padded, start, end)
-            if orig_storage.size() > 0:
-                orig_storage.resize_(0)
+            if orig_storage._size() > 0:
+                orig_storage._resize_(0)
         if self._use_orig_params:
             self._use_sharded_views()
 
@@ -838,7 +838,7 @@ def needs_unshard(self) -> bool:
             return False
         unsharded_flat_param = self._get_padded_unsharded_flat_param()
         already_unsharded = (
-            unsharded_flat_param.storage().size() == unsharded_flat_param.numel()
+            unsharded_flat_param._typed_storage()._size() == unsharded_flat_param.numel()
         )
         return not already_unsharded
 
@@ -1141,9 +1141,9 @@ def to_cpu(self):
         # the padded unsharded flattened parameter as expected
         # NOTE: This check is not strictly needed for correctness but is a
         # useful sanity check since the tensor should only be used internally.
-        unpadded_storage_ptr = self.flat_param.storage().data_ptr()
+        unpadded_storage_ptr = self.flat_param._typed_storage()._data_ptr()
         padded_storage_ptr = (
-            self._get_padded_unsharded_flat_param().storage().data_ptr()
+            self._get_padded_unsharded_flat_param()._typed_storage()._data_ptr()
         )
         p_assert(
             unpadded_storage_ptr == padded_storage_ptr,
@@ -1824,7 +1824,7 @@ def _check_on_cpu(self, tensor: Tensor):
 
     @staticmethod
     def _check_storage_freed(tensor: Tensor):
-        storage_size: int = tensor.storage().size()
+        storage_size: int = tensor._typed_storage()._size()
         p_assert(
             storage_size == 0,
             f"Expects storage to be freed but got storage with size {storage_size}",
@@ -1832,7 +1832,7 @@ def _check_storage_freed(tensor: Tensor):
 
     @staticmethod
     def _check_storage_allocated(tensor: Tensor):
-        storage_size: int = tensor.storage().size()
+        storage_size: int = tensor._typed_storage()._size()
         p_assert(storage_size > 0, "Expects storage to be allocated")
 
     def _check_low_precision_shard(self):
diff --git a/torch/distributed/pipeline/sync/_balance/profile.py b/torch/distributed/pipeline/sync/_balance/profile.py
index 9759a4b6262a1..fa1a0c06a8e3a 100644
--- a/torch/distributed/pipeline/sync/_balance/profile.py
+++ b/torch/distributed/pipeline/sync/_balance/profile.py
@@ -107,7 +107,7 @@ def profile_sizes(
         latent_size = memory_after - memory_before
 
         # Analyze size of parameters.
-        param_size = sum(p.storage().nbytes() for p in layer.parameters())
+        param_size = sum(p._typed_storage()._nbytes() for p in layer.parameters())
 
         # Combine size of parameters and activations with normalize scales.
         size = latent_size * latent_scale + param_size * param_scale
diff --git a/torch/distributed/pipeline/sync/stream.py b/torch/distributed/pipeline/sync/stream.py
index 56b6993437390..59fedf865a42b 100644
--- a/torch/distributed/pipeline/sync/stream.py
+++ b/torch/distributed/pipeline/sync/stream.py
@@ -104,7 +104,7 @@ def record_stream(tensor: torch.Tensor, stream: AbstractStream) -> None:
         #
         # Issue: https://github.com/pytorch/pytorch/issues/27366
         #
-        tensor = tensor.new_empty([0]).set_(tensor.storage())
+        tensor = tensor.new_empty([0]).set_(tensor._typed_storage())
 
         # Typechecking: torch.cuda.Stream is incompatible with torch._C.Stream
         tensor.record_stream(as_cuda(stream))  # type: ignore[arg-type]
diff --git a/torch/fx/passes/reinplace.py b/torch/fx/passes/reinplace.py
index ff24ef97f5459..86986a85acc8f 100644
--- a/torch/fx/passes/reinplace.py
+++ b/torch/fx/passes/reinplace.py
@@ -100,8 +100,8 @@ def run_node(self, node: Node):
             # Assert here that this is actually the case, and their storages are the same.
             assert isinstance(node.meta['fake_result'], FakeTensor)
             assert isinstance(node.meta['view_of'].meta['fake_result'], FakeTensor)
-            view_storage = StorageWeakRef(node.meta['fake_result'].storage())
-            base_storage = StorageWeakRef(node.meta['view_of'].meta['fake_result'].storage())
+            view_storage = StorageWeakRef(node.meta['fake_result']._typed_storage())
+            base_storage = StorageWeakRef(node.meta['view_of'].meta['fake_result']._typed_storage())
             assert view_storage == base_storage
         return result
 
@@ -176,7 +176,7 @@ def _maybe_get_inplace_op(op):
 def _get_all_later_node_usages(tensor_aliases: Set[Node], op_index: int):
     def _add_if_tensor(x, set_):
         if isinstance(x, FakeTensor):
-            set_.add(StorageWeakRef(x.storage()))
+            set_.add(StorageWeakRef(x._typed_storage()))
 
     nodes_used_after = set()
     for t in tensor_aliases:
@@ -452,7 +452,7 @@ def f(x):
     # Useful debug printing
     # def _print(x):
     # if isinstance(x, FakeTensor):
-    # print(f'fake_result: {StorageWeakRef(x.storage()).cdata}')
+    # print(f'fake_result: {StorageWeakRef(x._typed_storage()).cdata}')
 
     # for n in gm.graph.nodes:
     # print(n.format_node())
@@ -468,7 +468,10 @@ def f(x):
     # so we know not to re-inplace them.
     # NOTE: later, we'll need to add an optimization for fully recovering performance
     # on programs that mutate inputs.
-    input_storages = set(StorageWeakRef(node.meta['fake_result'].storage()) for node in gm.graph.nodes if node.op == 'placeholder')
+    input_storages = set(
+        StorageWeakRef(
+            node.meta['fake_result']._typed_storage()
+        ) for node in gm.graph.nodes if node.op == 'placeholder')
 
 
     # We also need to know for a given node, what are all of its aliasing nodes.
@@ -478,7 +481,7 @@ def f(x):
             # Tree-mapping because some ops can return lists of tensors.
             def _add_to_map(x):
                 if isinstance(x, FakeTensor):
-                    storage_to_nodes[StorageWeakRef(x.storage())].add(n)
+                    storage_to_nodes[StorageWeakRef(x._typed_storage())].add(n)
             tree_map(_add_to_map, n.meta['fake_result'])
 
     # inplace-ify functional ops, subject to the constraints written below.
@@ -529,7 +532,7 @@ def _add_to_map(x):
 
             # Step 1b: ensure that the op we're trying to re-inplace isn't a program input
             self_arg_name = self_arg.name
-            self_arg_storage = StorageWeakRef(self_arg.meta['fake_result'].storage())
+            self_arg_storage = StorageWeakRef(self_arg.meta['fake_result']._typed_storage())
             if self_arg_storage in input_storages:
                 # TODO: later, add the optimization for handling `copy_()` calls in the graph.
                 continue
@@ -539,7 +542,7 @@ def _add_to_map(x):
                 # so we prevent re-inplacing in this case.
                 continue
 
-            self_arg_storage = StorageWeakRef(self_arg.meta['fake_result'].storage())
+            self_arg_storage = StorageWeakRef(self_arg.meta['fake_result']._typed_storage())
             self_aliases = storage_to_nodes[self_arg_storage]
 
             # First, we find all later usages of any of the aliases of self_arg.
@@ -594,7 +597,7 @@ def _add_to_map(x):
             # Hmm... morally I think we also want to keep the `fake_result` metadata
             # up to date here, but I'm not sure how easy it is to do.
             # Maybe it's fine to wait until the end of the pass to update it.
-            curr_node_storage = StorageWeakRef(node.meta['fake_result'].storage())
+            curr_node_storage = StorageWeakRef(node.meta['fake_result']._typed_storage())
             storage_to_nodes[self_arg_storage].update(storage_to_nodes[curr_node_storage])
             storage_to_nodes[curr_node_storage].update(storage_to_nodes[self_arg_storage])
 
@@ -624,8 +627,14 @@ def replace_arg(a):
                     old_flattened_res, _ = tree_flatten(old.meta['fake_result'])
                     node_flattened_res, _ = tree_flatten(node_to_update.meta['fake_result'])
 
-                    old_res_storage = set(StorageWeakRef(x.storage()) for x in old_flattened_res if isinstance(x, FakeTensor))
-                    node_res_storage = set(StorageWeakRef(x.storage()) for x in node_flattened_res if isinstance(x, FakeTensor))
+                    old_res_storage = set(
+                        StorageWeakRef(
+                            x._typed_storage()
+                        ) for x in old_flattened_res if isinstance(x, FakeTensor))
+                    node_res_storage = set(
+                        StorageWeakRef(
+                            x._typed_storage()
+                        ) for x in node_flattened_res if isinstance(x, FakeTensor))
 
                     # This will happen if we're updating a view op, e.g.
                     # e.g. replacing
@@ -639,7 +648,10 @@ def replace_arg(a):
                     # We can't just check equality because we might encounter FX nodes that return zero tensor outputs.
                     if len(old_res_storage) == 1 and len(node_res_storage) == 1 and old_res_storage == node_res_storage:
                         new_flattened_res, _ = tree_flatten(new.meta['fake_result'])
-                        new_res_storage = set(StorageWeakRef(x.storage()) for x in new_flattened_res if isinstance(x, FakeTensor))
+                        new_res_storage = set(
+                            StorageWeakRef(
+                                x._typed_storage()
+                            ) for x in new_flattened_res if isinstance(x, FakeTensor))
                         assert len(new_res_storage) == 1
                         (old_ref,) = old_res_storage
                         (new_ref,) = new_res_storage
diff --git a/torch/multiprocessing/reductions.py b/torch/multiprocessing/reductions.py
index 403b28d6a63c6..4fcccb47685c9 100644
--- a/torch/multiprocessing/reductions.py
+++ b/torch/multiprocessing/reductions.py
@@ -113,7 +113,7 @@ def rebuild_cuda_tensor(tensor_cls, tensor_size, tensor_stride, tensor_offset,
                         requires_grad, ref_counter_handle, ref_counter_offset, event_handle, event_sync_required):
     # If storage_handle is None, storage points to nullptr.
     if storage_handle is None or storage_size_bytes == 0:
-        storage = storage_cls(0, dtype=dtype, device=storage_device)
+        storage = storage_cls(0, dtype=dtype, device=storage_device, _internal=True)
     else:
         storage = storage_from_cache(storage_cls, (storage_handle, storage_offset_bytes))
         if storage is None:
@@ -132,8 +132,10 @@ def rebuild_cuda_tensor(tensor_cls, tensor_size, tensor_stride, tensor_offset,
             # We already ref counting this Storage, but producer needs new ref-counters to be released.
             storage_cls._release_ipc_counter(ref_counter_handle, ref_counter_offset, device=storage_device)
 
+    _storage = storage if isinstance(storage, torch.UntypedStorage) else storage._untyped_storage
+
     t = torch._utils._rebuild_tensor(
-        torch.storage.TypedStorage(wrap_storage=storage.untyped(), dtype=dtype),
+        torch.storage.TypedStorage(wrap_storage=_storage, dtype=dtype, _internal=True),
         tensor_offset, tensor_size, tensor_stride)
 
     if tensor_cls == torch.nn.parameter.Parameter:
@@ -147,7 +149,7 @@ def rebuild_cuda_tensor(tensor_cls, tensor_size, tensor_stride, tensor_offset,
 
 
 def reduce_tensor(tensor):
-    storage = tensor.storage()
+    storage = tensor._typed_storage()
 
     if tensor.requires_grad and not tensor.is_leaf:
         raise RuntimeError("Cowardly refusing to serialize non-leaf tensor which requires_grad, "
@@ -248,7 +250,7 @@ def reduce_tensor(tensor):
     # eliminated it so that we could just use tensor views to implement the same
     # thing.
     #
-    if storage.is_cuda:
+    if storage._untyped_storage.device.type == 'cuda':
         (device,
          handle,
          storage_size_bytes,
@@ -325,7 +327,8 @@ def rebuild_storage_filename(cls, manager, handle, size, dtype=None):
         untyped_storage: torch.UntypedStorage = torch.UntypedStorage._new_shared_filename_cpu(manager, handle, byte_size)
         storage = torch.TypedStorage(
             wrap_storage=untyped_storage,
-            dtype=dtype)
+            dtype=dtype,
+            _internal=True)
     shared_cache[handle] = StorageWeakRef(storage)
     return storage._shared_decref()
 
@@ -334,18 +337,18 @@ def rebuild_storage_empty(cls):
     return cls()
 
 def rebuild_typed_storage(storage, dtype):
-    return torch.storage.TypedStorage(wrap_storage=storage, dtype=dtype)
+    return torch.storage.TypedStorage(wrap_storage=storage, dtype=dtype, _internal=True)
 
 # Use for torch.storage.TypedStorage
 def reduce_typed_storage(storage):
-    return (rebuild_typed_storage, (storage._storage, storage.dtype))
+    return (rebuild_typed_storage, (storage._untyped_storage, storage.dtype))
 
 def rebuild_typed_storage_child(storage, storage_type):
-    return storage_type(wrap_storage=storage)
+    return storage_type(wrap_storage=storage, _internal=True)
 
 # Use for child classes of torch.storage.TypedStorage, like torch.FloatStorage
 def reduce_typed_storage_child(storage):
-    return (rebuild_typed_storage_child, (storage._storage, type(storage)))
+    return (rebuild_typed_storage_child, (storage._untyped_storage, type(storage)))
 
 def reduce_storage(storage):
     from . import get_sharing_strategy
diff --git a/torch/overrides.py b/torch/overrides.py
index ce7872f9d1abe..cb4402235e1a2 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -273,6 +273,7 @@ def get_ignored_functions() -> Set[Callable]:
         Tensor.to_sparse_csc,
         Tensor.to_sparse_bsr,
         Tensor.to_sparse_bsc,
+        Tensor._typed_storage,
         Tensor._reduce_ex_internal,
         Tensor._fix_weakref,
         Tensor._make_wrapper_subclass,
diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py
index a95f105d2474c..7f6af38468e2f 100644
--- a/torch/package/package_exporter.py
+++ b/torch/package/package_exporter.py
@@ -887,7 +887,7 @@ def _persistent_id(self, obj):
             if isinstance(obj, torch.storage.TypedStorage):
                 # TODO: Once we decide to break serialization FC, we can
                 # remove this case
-                untyped_storage = obj._storage
+                untyped_storage = obj._untyped_storage
                 storage_type_str = obj.pickle_storage_type()
                 storage_type = getattr(torch, storage_type_str)
                 storage_numel = obj.size()
diff --git a/torch/package/package_importer.py b/torch/package/package_importer.py
index 7bf945c70c0b3..3db37128b03b3 100644
--- a/torch/package/package_importer.py
+++ b/torch/package/package_importer.py
@@ -208,14 +208,14 @@ def load_tensor(dtype, size, key, location, restore_location):
             name = f"{key}.storage"
 
             if storage_context.has_storage(name):
-                storage = storage_context.get_storage(name, dtype).storage()
+                storage = storage_context.get_storage(name, dtype)._typed_storage()
             else:
                 tensor = self.zip_reader.get_storage_from_record(
                     ".data/" + name, size, dtype
                 )
                 if isinstance(self.zip_reader, torch._C.PyTorchFileReader):
                     storage_context.add_storage(name, tensor)
-                storage = tensor.storage()
+                storage = tensor._typed_storage()
             loaded_storages[key] = restore_location(storage, location)
 
         def persistent_load(saved_id):
@@ -239,7 +239,7 @@ def persistent_load(saved_id):
                 # TODO: Once we decide to break serialization FC, we can
                 # stop wrapping with TypedStorage
                 return torch.storage.TypedStorage(
-                    wrap_storage=storage.untyped(), dtype=dtype
+                    wrap_storage=storage._untyped_storage, dtype=dtype, _internal=True
                 )
             elif typename == "reduce_package":
                 # to fix BC breaking change, objects on this load path
diff --git a/torch/serialization.py b/torch/serialization.py
index 53d060019408d..d123a955ad966 100644
--- a/torch/serialization.py
+++ b/torch/serialization.py
@@ -469,12 +469,12 @@ def persistent_id(obj: Any) -> Optional[Tuple]:
             if isinstance(obj, torch.storage.TypedStorage):
                 # TODO: Once we decide to break serialization FC, this case
                 # can be deleted
-                storage = obj._storage
+                storage = obj._untyped_storage
                 storage_dtype = obj.dtype
-                storage_type_str = obj.pickle_storage_type()
+                storage_type_str = obj._pickle_storage_type()
                 storage_type = getattr(torch, storage_type_str)
                 dtype = obj.dtype
-                storage_numel = obj.size()
+                storage_numel = obj._size()
 
             elif isinstance(obj, torch.UntypedStorage):
                 storage = obj
@@ -597,11 +597,11 @@ def persistent_id(obj):
             if isinstance(obj, torch.storage.TypedStorage):
                 # TODO: Once we decide to break serialization FC, this case
                 # can be deleted
-                storage = obj._storage
+                storage = obj._untyped_storage
                 storage_dtype = obj.dtype
-                storage_type_str = obj.pickle_storage_type()
+                storage_type_str = obj._pickle_storage_type()
                 storage_type = getattr(torch, storage_type_str)
-                storage_numel = obj.size()
+                storage_numel = obj._size()
 
             else:
                 storage = obj
@@ -893,14 +893,15 @@ def persistent_load(saved_id):
                 for i in range(num_storages):
                     args = pickle_module.load(f, **pickle_load_args)
                     key, location, storage_type = args
-                    dtype = storage_type.dtype
+                    dtype = storage_type._dtype
                     obj = cast(Storage, torch.UntypedStorage)._new_with_file(f, torch._utils._element_size(dtype))
                     obj = restore_location(obj, location)
                     # TODO: Once we decide to break serialization FC, we can
                     # stop wrapping with TypedStorage
                     deserialized_objects[key] = torch.storage.TypedStorage(
                         wrap_storage=obj,
-                        dtype=dtype)
+                        dtype=dtype,
+                        _internal=True)
 
                 storage_views = pickle_module.load(f, **pickle_load_args)
                 for target_cdata, root_cdata, offset, numel in storage_views:
@@ -910,8 +911,9 @@ def persistent_load(saved_id):
                     # TODO: Once we decide to break serialization FC, we can
                     # stop wrapping with TypedStorage
                     deserialized_objects[target_cdata] = torch.storage.TypedStorage(
-                        wrap_storage=root._storage[offset_bytes:offset_bytes + numel * element_size],
-                        dtype=root.dtype)
+                        wrap_storage=root._untyped_storage[offset_bytes:offset_bytes + numel * element_size],
+                        dtype=root.dtype,
+                        _internal=True)
 
             tar.extract('tensors', path=tmpdir)
             with open(os.path.join(tmpdir, 'tensors'), 'rb', 0) as f:
@@ -927,7 +929,7 @@ def persistent_load(saved_id):
                     stride = struct.unpack(f'<{ndim}q', f.read(8 * ndim))
                     storage_offset, = struct.unpack('<q', f.read(8))
                     tensor = torch.tensor([], dtype=storage.dtype).set_(
-                        storage._storage, storage_offset, numel, stride)
+                        storage._untyped_storage, storage_offset, numel, stride)
                     deserialized_objects[key] = tensor
 
             pickle_file = tar.extractfile('pickle')
@@ -962,7 +964,8 @@ def persistent_load(saved_id):
                 # stop wrapping with TypedStorage
                 deserialized_objects[root_key] = torch.storage.TypedStorage(
                     wrap_storage=restore_location(obj, location),
-                    dtype=dtype)
+                    dtype=dtype,
+                    _internal=True)
 
             typed_storage = deserialized_objects[root_key]
             if view_metadata is not None:
@@ -973,8 +976,9 @@ def persistent_load(saved_id):
                     # TODO: Once we decide to break serialization FC, we can
                     # stop wrapping with TypedStorage
                     deserialized_objects[view_key] = torch.storage.TypedStorage(
-                        wrap_storage=typed_storage._storage[offset_bytes:offset_bytes + view_size_bytes],
-                        dtype=dtype)
+                        wrap_storage=typed_storage._untyped_storage[offset_bytes:offset_bytes + view_size_bytes],
+                        dtype=dtype,
+                        _internal=True)
                 res = deserialized_objects[view_key]
 
             else:
@@ -1023,7 +1027,7 @@ def persistent_load(saved_id):
     for key in deserialized_storage_keys:
         assert key in deserialized_objects
         typed_storage = deserialized_objects[key]
-        typed_storage._storage._set_from_file(
+        typed_storage._untyped_storage._set_from_file(
             f, offset, f_should_read_directly,
             torch._utils._element_size(typed_storage.dtype))
         if offset is not None:
@@ -1082,12 +1086,13 @@ def _load(zip_file, map_location, pickle_module, pickle_file='data.pkl', **pickl
     def load_tensor(dtype, numel, key, location):
         name = f'data/{key}'
 
-        storage = zip_file.get_storage_from_record(name, numel, torch.UntypedStorage).storage().untyped()
+        storage = zip_file.get_storage_from_record(name, numel, torch.UntypedStorage)._typed_storage()._untyped_storage
         # TODO: Once we decide to break serialization FC, we can
         # stop wrapping with TypedStorage
         loaded_storages[key] = torch.storage.TypedStorage(
             wrap_storage=restore_location(storage, location),
-            dtype=dtype)
+            dtype=dtype,
+            _internal=True)
 
     def persistent_load(saved_id):
         assert isinstance(saved_id, tuple)
diff --git a/torch/storage.py b/torch/storage.py
index 6bfbab3733bc4..ef523bd7b97e0 100644
--- a/torch/storage.py
+++ b/torch/storage.py
@@ -7,6 +7,7 @@
 import copy
 import collections
 from functools import lru_cache
+import warnings
 try:
     import numpy as np
     HAS_NUMPY = True
@@ -131,7 +132,7 @@ def mps(self):
     def _to(self, dtype):
         if not isinstance(dtype, torch.dtype):
             raise TypeError(f"Argument 'dtype' must be torch.dtype, not {type(dtype)}")
-        storage = torch.tensor([], dtype=torch.uint8, device=self.device).set_(cast(Storage, self)).to(dtype).storage()
+        storage = torch.tensor([], dtype=torch.uint8, device=self.device).set_(cast(Storage, self)).to(dtype)._typed_storage()
         if storage.data_ptr() == self.data_ptr():
             storage = storage.clone()
         return storage
@@ -297,7 +298,7 @@ def _get_storage_from_sequence(sequence, dtype, device):
             dtype=dtype,
             device=device)
 
-    return tmp_tensor.storage().untyped()
+    return tmp_tensor._typed_storage()._untyped_storage
 
 def _isint(x):
     if HAS_NUMPY:
@@ -305,16 +306,32 @@ def _isint(x):
     else:
         return isinstance(x, int)
 
+def _warn_typed_storage_removal():
+    message = (
+        "TypedStorage is deprecated. It will be removed in the future and "
+        "UntypedStorage will be the only storage class. This should only matter "
+        "to you if you are using storages directly."
+    )
+    warnings.warn(message, UserWarning)
+
 class TypedStorage:
     is_sparse = False
 
     dtype: torch.dtype
 
+    @property
+    def _dtype(self):
+        return self.dtype
+
     def fill_(self, value):
-        self[0:len(self)] = value
+        _warn_typed_storage_removal()
+        self._setitem(slice(0, self._size()), value)
         return self
 
-    def __new__(cls, *args, wrap_storage=None, dtype=None, device=None):
+    def __new__(cls, *args, wrap_storage=None, dtype=None, device=None, _internal=False):
+        if not _internal:
+            _warn_typed_storage_removal()
+
         if cls == torch.storage._LegacyStorage:
             raise RuntimeError("Only child classes of _LegacyStorage can be instantiated")
 
@@ -353,8 +370,9 @@ def __new__(cls, *args, wrap_storage=None, dtype=None, device=None):
 
                 return TypedStorage(
                     *args,
-                    dtype=cls.dtype,
-                    device='cuda' if cls.__module__ == 'torch.cuda' else 'cpu')
+                    dtype=cls._dtype,
+                    device='cuda' if cls.__module__ == 'torch.cuda' else 'cpu',
+                    _internal=True)
 
             else:
                 if len(args) != 0:
@@ -379,9 +397,12 @@ def __new__(cls, *args, wrap_storage=None, dtype=None, device=None):
                 return TypedStorage(
                     *args,
                     wrap_storage=wrap_storage,
-                    dtype=cls.dtype)
+                    dtype=cls.dtype,
+                    _internal=True)
 
-    def __init__(self, *args, device=None, dtype=None, wrap_storage=None):
+    def __init__(self, *args, device=None, dtype=None, wrap_storage=None, _internal=False):
+        if not _internal:
+            _warn_typed_storage_removal()
         arg_error_msg = (
             'TypedStorage.__init__ received an invalid combination '
             'of arguments. Expected one of:\n'
@@ -419,7 +440,7 @@ def __init__(self, *args, device=None, dtype=None, wrap_storage=None):
                     arg_error_msg +
                     f"\nArgument 'wrap_storage' must be UntypedStorage, but got {type(wrap_storage)}")
 
-            self._storage = wrap_storage
+            self._untyped_storage = wrap_storage
 
         else:
             self.dtype = torch.get_default_dtype() if dtype is None else dtype
@@ -430,13 +451,13 @@ def __init__(self, *args, device=None, dtype=None, wrap_storage=None):
                     raise RuntimeError("Cannot create CUDA storage with quantized dtype")
 
             if len(args) == 0:
-                self._storage = torch.UntypedStorage(device=device)
+                self._untyped_storage = torch.UntypedStorage(device=device)
 
             elif len(args) == 1:
                 if _isint(args[0]):
-                    self._storage = torch.UntypedStorage(int(args[0]) * self.element_size(), device=device)
+                    self._untyped_storage = torch.UntypedStorage(int(args[0]) * self._element_size(), device=device)
                 elif isinstance(args[0], collections.abc.Sequence):
-                    self._storage = _get_storage_from_sequence(args[0], self.dtype, device)
+                    self._untyped_storage = _get_storage_from_sequence(args[0], self.dtype, device)
                 else:
                     raise TypeError(
                         arg_error_msg +
@@ -447,30 +468,35 @@ def __init__(self, *args, device=None, dtype=None, wrap_storage=None):
                     arg_error_msg +
                     "\nToo many positional arguments")
 
-
     @property
     def is_cuda(self):
+        _warn_typed_storage_removal()
         return self.device.type == 'cuda'
 
     def untyped(self):
         """Returns the internal :class:`torch.UntypedStorage`"""
-        return self._storage
+        _warn_typed_storage_removal()
+        return self._untyped_storage
 
     def _new_wrapped_storage(self, untyped_storage):
         assert type(untyped_storage) == torch.UntypedStorage
 
         if type(self) == TypedStorage:
-            return TypedStorage(wrap_storage=untyped_storage, dtype=self.dtype)
+            return TypedStorage(
+                wrap_storage=untyped_storage,
+                dtype=self.dtype,
+                _internal=True)
         else:
             return type(self)(wrap_storage=untyped_storage)
 
     def __len__(self):
-        return self._storage.nbytes() // self.element_size()
+        _warn_typed_storage_removal()
+        return self._size()
 
     def _maybe_wrap_index(self, idx, is_stop=False):
         if idx is None:
             if is_stop:
-                return self.size()
+                return self._size()
             else:
                 return 0
 
@@ -479,20 +505,24 @@ def _maybe_wrap_index(self, idx, is_stop=False):
                 raise TypeError(
                     f"can't index a {type(self)} with {type(idx)}")
             if is_stop:
-                if (idx > self.size()) or (idx < -self.size()):
+                if (idx > self._size()) or (idx < -self._size()):
                     raise IndexError(
                         f'index {idx} out of range for storage of size {self.size()}')
                 if idx > 0:
                     return idx
                 else:
-                    return idx % self.size()
+                    return idx % self._size()
             else:
-                if (idx >= self.size()) or (idx < -self.size()):
+                if (idx >= self._size()) or (idx < -self._size()):
                     raise IndexError(
                         f'index {idx} out of range for storage of size {self.size()}')
-                return idx % self.size()
+                return idx % self._size()
 
     def __setitem__(self, idx, value):
+        _warn_typed_storage_removal()
+        return self._setitem(idx, value)
+
+    def _setitem(self, idx, value):
         if not isinstance(idx, (int, slice)):
             raise RuntimeError(f"can't index a {type(self)} with {type(idx)}")
         if torch.is_storage(value):
@@ -506,16 +536,22 @@ def __setitem__(self, idx, value):
                 torch.qint8: torch.int8
             }
             tmp_dtype = interpret_dtypes[self.dtype]
-            tmp_tensor = torch.tensor([], dtype=tmp_dtype, device=self.device).set_(TypedStorage(
-                wrap_storage=self._storage,
-                dtype=tmp_dtype))
+            tmp_tensor = torch.tensor([], dtype=tmp_dtype, device=self._untyped_storage.device)
+            tmp_tensor.set_(TypedStorage(
+                wrap_storage=self._untyped_storage,
+                dtype=tmp_dtype,
+                _internal=True))
         else:
-            tmp_tensor = torch.tensor([], dtype=self.dtype, device=self.device).set_(self)
+            tmp_tensor = torch.tensor([], dtype=self.dtype, device=self._untyped_storage.device).set_(self)
 
         tmp_tensor[idx] = value
 
     def __getitem__(self, idx):
-        if self.device.type == 'meta':
+        _warn_typed_storage_removal()
+        return self._getitem(idx)
+
+    def _getitem(self, idx):
+        if self._untyped_storage.device.type == 'meta':
             raise NotImplementedError("Not available for 'meta' device type")
 
         # NOTE: Before TypedStorage existed, indexing with a slice used to be
@@ -536,21 +572,32 @@ def __getitem__(self, idx):
                 torch.qint8: torch.int8
             }
             return TypedStorage(
-                wrap_storage=self._storage,
-                dtype=interpret_dtypes[self.dtype])[idx]
+                wrap_storage=self._untyped_storage,
+                dtype=interpret_dtypes[self.dtype],
+                _internal=True)._getitem(idx)
 
         idx_wrapped = self._maybe_wrap_index(idx)
-        tmp_tensor = torch.tensor([], dtype=self.dtype, device=self.device).set_(self)
+        tmp_tensor = torch.tensor([], dtype=self.dtype, device=self._untyped_storage.device).set_(self)
         return tmp_tensor[idx_wrapped].item()
 
     def copy_(self, source: T, non_blocking: bool = None):
-        self._storage.copy_(source.untyped(), non_blocking)
+        _warn_typed_storage_removal()
+        if isinstance(source, TypedStorage):
+            self._untyped_storage.copy_(source._untyped_storage, non_blocking)
+        else:
+            self._untyped_storage.copy_(source, non_blocking)
         return self
 
     def nbytes(self):
-        return self._storage.nbytes()
+        _warn_typed_storage_removal()
+        return self._nbytes()
+
+    # For internal use only, to avoid deprecation warning
+    def _nbytes(self):
+        return self._untyped_storage.nbytes()
 
     def type(self, dtype: str = None, non_blocking: bool = False) -> Union[T, str]:
+        _warn_typed_storage_removal()
         if dtype is None:
             legacy_class = self._get_legacy_storage_class()
 
@@ -560,21 +607,29 @@ def type(self, dtype: str = None, non_blocking: bool = False) -> Union[T, str]:
             return '.'.join([self.__module__, type(self).__name__])
 
         else:
-            return self._storage.type(dtype, non_blocking)
+            return self._untyped_storage.type(dtype, non_blocking)
 
     def cuda(self, device=None, non_blocking=False, **kwargs) -> T:
+        _warn_typed_storage_removal()
         if self.dtype in [torch.quint8, torch.quint4x2, torch.quint2x4, torch.qint32, torch.qint8]:
             raise RuntimeError("Cannot create CUDA storage with quantized dtype")
-        cuda_storage: torch.UntypedStorage = self._storage.cuda(device, non_blocking, **kwargs)
+        cuda_storage: torch.UntypedStorage = self._untyped_storage.cuda(device, non_blocking, **kwargs)
         return self._new_wrapped_storage(cuda_storage)
 
     def element_size(self):
+        _warn_typed_storage_removal()
+        return self._element_size()
+
+    # For internal use only, to avoid deprecation warning
+    def _element_size(self):
         return torch._utils._element_size(self.dtype)
 
     def get_device(self) -> int:
-        return self._storage.get_device()
+        _warn_typed_storage_removal()
+        return self._untyped_storage.get_device()
 
     def __str__(self):
+        _warn_typed_storage_removal()
         info_str = (
             f'[{torch.typename(self)}(dtype={self.dtype}, '
             f'device={self.device}) of size {len(self)}]')
@@ -585,35 +640,48 @@ def __str__(self):
             return data_str + '\n' + info_str
 
     def __repr__(self):
+        _warn_typed_storage_removal()
         return str(self)
 
     def __iter__(self):
+        _warn_typed_storage_removal()
         return iter(map(lambda i: self[i], range(self.size())))
 
     def __copy__(self):
-        return self._new_wrapped_storage(copy.copy(self._storage))
+        _warn_typed_storage_removal()
+        return self._new_wrapped_storage(copy.copy(self._untyped_storage))
 
     def __deepcopy__(self, memo):
-        return self._new_wrapped_storage(copy.deepcopy(self._storage, memo))
+        _warn_typed_storage_removal()
+        return self._deepcopy(memo)
+
+    # For internal use only, to avoid deprecation warning
+    def _deepcopy(self, memo):
+        return self._new_wrapped_storage(copy.deepcopy(self._untyped_storage, memo))
 
     def __sizeof__(self):
+        _warn_typed_storage_removal()
         return super(TypedStorage, self).__sizeof__() + self.nbytes()
 
     def clone(self):
         """Returns a copy of this storage"""
-        return self._new_wrapped_storage(self._storage.clone())
+        _warn_typed_storage_removal()
+        return self._new_wrapped_storage(self._untyped_storage.clone())
 
     def tolist(self):
         """Returns a list containing the elements of this storage"""
+        _warn_typed_storage_removal()
         return list(self)
 
     def cpu(self):
         """Returns a CPU copy of this storage if it's not already on the CPU"""
-        return self._new_wrapped_storage(self._storage.cpu())
+        _warn_typed_storage_removal()
+        return self._new_wrapped_storage(self._untyped_storage.cpu())
 
     def pin_memory(self):
         """Coppies the  storage to pinned memory, if it's not already pinned."""
-        return self._new_wrapped_storage(self._storage.pin_memory())
+        _warn_typed_storage_removal()
+        return self._new_wrapped_storage(self._untyped_storage.pin_memory())
 
     def share_memory_(self):
         """Moves the storage to shared memory.
@@ -624,7 +692,12 @@ def share_memory_(self):
 
         Returns: self
         """
-        self._storage.share_memory_()
+        _warn_typed_storage_removal()
+        return self._share_memory_()
+
+    # For internal use only, to avoid deprecation warning
+    def _share_memory_(self):
+        self._untyped_storage.share_memory_()
         return self
 
     def _new_shared(self, size, *, device=None):
@@ -632,25 +705,37 @@ def _new_shared(self, size, *, device=None):
         if device is None:
             device = 'cpu'
         device = torch.device(device)
-        untyped_storage = torch.UntypedStorage._new_shared(size * self.element_size(), device=device)
+        untyped_storage = torch.UntypedStorage._new_shared(size * self._element_size(), device=device)
         return TypedStorage(
             wrap_storage=untyped_storage,
-            dtype=self.dtype)
+            dtype=self.dtype,
+            _internal=True)
 
     @property
     def _cdata(self):
-        return self._storage._cdata
+        return self._untyped_storage._cdata
 
     @property
     def device(self):
-        return self._storage.device
+        _warn_typed_storage_removal()
+        return self._untyped_storage.device
 
     def size(self):
+        _warn_typed_storage_removal()
+        return self._size()
+
+    # For internal use only, to avoid deprecation warning
+    def _size(self):
         # NB: don't indirect through __len__, as that requires
         # an int to be returned
-        return self.nbytes() // self.element_size()
+        return self._untyped_storage.nbytes() // self._element_size()
 
     def pickle_storage_type(self):
+        _warn_typed_storage_removal()
+        return self._pickle_storage_type()
+
+    # For internal use only, to avoid deprecation warning
+    def _pickle_storage_type(self):
         try:
             return _dtype_to_storage_type_map()[self.dtype]
         except KeyError:
@@ -662,20 +747,35 @@ def __reduce__(self):
         return (_load_from_bytes, (b.getvalue(),))
 
     def data_ptr(self):
-        return self._storage.data_ptr()
+        _warn_typed_storage_removal()
+        return self._data_ptr()
+
+    # For internal use only, to avoid deprecation warning
+    def _data_ptr(self):
+        return self._untyped_storage.data_ptr()
 
     def resize_(self, size):
-        self._storage.resize_(size * self.element_size())
+        _warn_typed_storage_removal()
+        self._resize_(size)
+
+    # For internal use only, to avoid deprecation warning
+    def _resize_(self, size):
+        self._untyped_storage.resize_(size * self._element_size())
 
     @classmethod
     def _free_weak_ref(cls, *args, **kwargs):
         return UntypedStorage._free_weak_ref(*args, **kwargs)
 
     def _weak_ref(self, *args, **kwargs):
-        return self._storage._weak_ref(*args, **kwargs)
+        return self._untyped_storage._weak_ref(*args, **kwargs)
 
     @classmethod
-    def from_buffer(cls, *args, dtype=None, device=None, **kwargs):
+    def from_buffer(cls, *args, **kwargs):
+        _warn_typed_storage_removal()
+        return cls._from_buffer(*args, **kwargs)
+
+    @classmethod
+    def _from_buffer(cls, *args, dtype=None, device=None, **kwargs):
         if cls == TypedStorage:
             dtype = torch.get_default_dtype() if dtype is None else dtype
             device = torch.device('cpu' if device is None else device)
@@ -693,65 +793,80 @@ def from_buffer(cls, *args, dtype=None, device=None, **kwargs):
                     "from_buffer: 'device' can only be specified in "
                     "UntypedStorage.from_buffer and TypedStorage.from_buffer"))
 
-            dtype = cls.dtype
+            dtype = cls._dtype
             untyped_storage = torch.UntypedStorage.from_buffer(*args, dtype=dtype, **kwargs)
 
-        return TypedStorage(wrap_storage=untyped_storage, dtype=dtype)
+        return TypedStorage(
+            wrap_storage=untyped_storage,
+            dtype=dtype,
+            _internal=True)
 
     def _to(self, dtype):
         if not isinstance(dtype, torch.dtype):
             raise TypeError(f"Argument 'dtype' must be torch.dtype, not {type(dtype)}")
-        storage = torch.tensor([], dtype=self.dtype, device=self.device).set_(self).to(dtype).storage()
+        storage = torch.tensor([], dtype=self.dtype, device=self.device).set_(self).to(dtype)._typed_storage()
         if storage.data_ptr() == self.data_ptr():
             storage = storage.clone()
         return storage
 
     def double(self):
         """Casts this storage to double type"""
+        _warn_typed_storage_removal()
         return self._to(torch.double)
 
     def float(self):
         """Casts this storage to float type"""
+        _warn_typed_storage_removal()
         return self._to(torch.float)
 
     def half(self):
         """Casts this storage to half type"""
+        _warn_typed_storage_removal()
         return self._to(torch.half)
 
     def long(self):
         """Casts this storage to long type"""
+        _warn_typed_storage_removal()
         return self._to(torch.long)
 
     def int(self):
         """Casts this storage to int type"""
+        _warn_typed_storage_removal()
         return self._to(torch.int)
 
     def short(self):
         """Casts this storage to short type"""
+        _warn_typed_storage_removal()
         return self._to(torch.short)
 
     def char(self):
         """Casts this storage to char type"""
+        _warn_typed_storage_removal()
         return self._to(torch.int8)
 
     def byte(self):
         """Casts this storage to byte type"""
+        _warn_typed_storage_removal()
         return self._to(torch.uint8)
 
     def bool(self):
         """Casts this storage to bool type"""
+        _warn_typed_storage_removal()
         return self._to(torch.bool)
 
     def bfloat16(self):
         """Casts this storage to bfloat16 type"""
+        _warn_typed_storage_removal()
         return self._to(torch.bfloat16)
 
     def complex_double(self):
         """Casts this storage to complex double type"""
+        _warn_typed_storage_removal()
         return self._to(torch.cdouble)
 
     def complex_float(self):
         """Casts this storage to complex float type"""
+        _warn_typed_storage_removal()
         return self._to(torch.cfloat)
 
     @classmethod
@@ -773,6 +888,7 @@ def from_file(cls, filename, shared, size):
             shared (bool): whether to share memory
             size (int): number of elements in the storage
         """
+        _warn_typed_storage_removal()
         if cls == TypedStorage:
             raise RuntimeError('from_file can only be called on derived classes')
         untyped_storage: UntypedStorage = UntypedStorage.from_file(
@@ -787,33 +903,39 @@ def _expired(cls, *args, **kwargs):
         return UntypedStorage._expired(*args, **kwargs)
 
     def is_pinned(self):
-        return self._storage.is_pinned()
+        _warn_typed_storage_removal()
+        return self._untyped_storage.is_pinned()
 
     def _write_file(self, *args, **kwargs):
-        return self._storage._write_file(*args, **kwargs)
+        return self._untyped_storage._write_file(*args, **kwargs)
 
     def _set_from_file(self, *args, **kwargs):
-        return self._storage._set_from_file(*args, **kwargs)
+        return self._untyped_storage._set_from_file(*args, **kwargs)
 
     def _set_cdata(self, *args, **kwargs):
-        return self._storage._set_cdata(*args, **kwargs)
+        return self._untyped_storage._set_cdata(*args, **kwargs)
 
     def _share_cuda_(self, *args, **kwargs):
-        return self._storage._share_cuda_(*args, **kwargs)
+        return self._untyped_storage._share_cuda_(*args, **kwargs)
 
     def is_shared(self):
-        return self._storage.is_shared()
+        _warn_typed_storage_removal()
+        return self._is_shared()
+
+    # For internal use only, to avoid deprecation warning
+    def _is_shared(self):
+        return self._untyped_storage.is_shared()
 
     @classmethod
     def _new_shared_cuda(cls, *args, **kwargs):
         return torch.UntypedStorage._new_shared_cuda(*args, **kwargs)
 
     def _share_filename_cpu_(self, *args, **kwargs):
-        manager_handle, storage_handle, size = self._storage._share_filename_cpu_(*args, **kwargs)
-        return manager_handle, storage_handle, size // self.element_size()
+        manager_handle, storage_handle, size = self._untyped_storage._share_filename_cpu_(*args, **kwargs)
+        return manager_handle, storage_handle, size // self._element_size()
 
     def _shared_decref(self):
-        self._storage._shared_decref()
+        self._untyped_storage._shared_decref()
         return self
 
     @classmethod
@@ -821,11 +943,11 @@ def _release_ipc_counter(cls, *args, device=None, **kwargs):
         return torch.UntypedStorage._release_ipc_counter_cuda(*args, **kwargs)
 
     def _shared_incref(self, *args, **kwargs):
-        return self._storage._shared_incref(*args, **kwargs)
+        return self._untyped_storage._shared_incref(*args, **kwargs)
 
     def _share_fd_cpu_(self, *args, **kwargs):
-        fd, size = self._storage._share_fd_cpu_(*args, **kwargs)
-        return fd, size // self.element_size()
+        fd, size = self._untyped_storage._share_fd_cpu_(*args, **kwargs)
+        return fd, size // self._element_size()
 
     def _get_legacy_storage_class(self):
         if self.dtype not in _dtype_to_storage_type_map():
@@ -859,7 +981,7 @@ class _LegacyStorage(TypedStorage, metaclass=_LegacyStorageMeta):
     @classmethod
     def _new_shared(cls, size):
         """Creates a new storage in shared memory with the same data type"""
-        untyped_storage = torch.UntypedStorage._new_shared(size * cls().element_size())
+        untyped_storage = torch.UntypedStorage._new_shared(size * cls()._element_size())
         return cls(wrap_storage=untyped_storage)
 
     @classmethod
diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py
index d15cae4b1bb5a..6999986f52945 100644
--- a/torch/testing/_comparison.py
+++ b/torch/testing/_comparison.py
@@ -927,9 +927,34 @@ def originate_pairs(
     Returns:
         (List[Pair]): Originated pairs.
     """
+    if (
+        isinstance(actual, torch.TypedStorage)
+        and isinstance(expected, torch.TypedStorage)
+    ):
+        actual_len = actual._size()
+        expected_len = expected._size()
+        if actual_len != expected_len:
+            raise ErrorMeta(
+                AssertionError, f"The length of the sequences mismatch: {actual_len} != {expected_len}", id=id
+            )
+
+        pairs = []
+        for idx in range(actual_len):
+            pairs.extend(
+                originate_pairs(
+                    actual._getitem(idx),
+                    expected._getitem(idx),
+                    pair_types=pair_types,
+                    sequence_types=sequence_types,
+                    mapping_types=mapping_types,
+                    id=(*id, idx),
+                    **options,
+                )
+            )
+        return pairs
     # We explicitly exclude str's here since they are self-referential and would cause an infinite recursion loop:
     # "a" == "a"[0][0]...
-    if (
+    elif (
         isinstance(actual, sequence_types)
         and not isinstance(actual, str)
         and isinstance(expected, sequence_types)
diff --git a/torch/testing/_internal/schema_check_mode.py b/torch/testing/_internal/schema_check_mode.py
index 9d118719af6b1..9fda9d95e1599 100644
--- a/torch/testing/_internal/schema_check_mode.py
+++ b/torch/testing/_internal/schema_check_mode.py
@@ -47,7 +47,7 @@ def has_mutated(before, after, md):
                     before.size() == after.size() and
                     torch.allclose(before, after, equal_nan=True) and
                     md[0] == after.stride() and
-                    md[1] == after.storage()._cdata
+                    md[1] == after._typed_storage()._cdata
                 )
             return False
 
@@ -76,12 +76,12 @@ def parse_metadata(e):
                 if not type(e) == torch.Tensor:
                     try:
                         current = e.elem
-                        return (deepcopy(current.stride()), current.storage()._cdata)
+                        return (deepcopy(current.stride()), current._typed_storage()._cdata)
                     except AttributeError as t:
                         return None
                 # Sparse CSR tensors do not have strides or storage
                 elif (e.layout != torch.sparse_csr):
-                    return (deepcopy(e.stride()), e.storage()._cdata)
+                    return (deepcopy(e.stride()), e._typed_storage()._cdata)
             return None
 
         self.ops.append(func._schema.name)
diff --git a/torch/utils/bundled_inputs.py b/torch/utils/bundled_inputs.py
index 1ca2d56616bc2..4ae39733ff2e4 100644
--- a/torch/utils/bundled_inputs.py
+++ b/torch/utils/bundled_inputs.py
@@ -391,7 +391,7 @@ def _inflate_expr(
 
     if isinstance(arg, torch.Tensor):
         # Small-storage tensors can just be saved directly.
-        if arg.storage().size() <= MAX_RAW_TENSOR_SIZE or skip_size_check:
+        if arg._typed_storage().size() <= MAX_RAW_TENSOR_SIZE or skip_size_check:
             return arg, ref, None
         # Small contiguous tensors can be cloned to have small storage.
         # TODO: Should we do this even for non-contiguous tensors?
@@ -407,7 +407,7 @@ def _inflate_expr(
         # TODO: Provide more useful diagnostics.
         raise Exception(
             f"Bundled input argument at position '{ref}' is "
-            f"a tensor with storage size {arg.storage().size()}. "
+            f"a tensor with storage size {arg._typed_storage().size()}. "
             f"You probably don't want to bundle this as an input. "
         )
     else:
diff --git a/torch/utils/data/_utils/collate.py b/torch/utils/data/_utils/collate.py
index 0ba9f25c2c9d2..1a00cd4514f58 100644
--- a/torch/utils/data/_utils/collate.py
+++ b/torch/utils/data/_utils/collate.py
@@ -158,7 +158,7 @@ def collate_tensor_fn(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[
         # If we're in a background process, concatenate directly into a
         # shared memory tensor to avoid an extra copy
         numel = sum(x.numel() for x in batch)
-        storage = elem.storage()._new_shared(numel, device=elem.device)
+        storage = elem._typed_storage()._new_shared(numel, device=elem.device)
         out = elem.new(storage).resize_(len(batch), *list(elem.size()))
     return torch.stack(batch, 0, out=out)
 

From 88f6d5c1711b4aff414b7a4f0cc9e7a57ed6aec9 Mon Sep 17 00:00:00 2001
From: Yidi Wu <yidi@meta.com>
Date: Tue, 8 Nov 2022 18:22:03 +0000
Subject: [PATCH 0664/1922] torchdynamo support modules() for nn_module
 (#88023)

Differential Revision: D40820879

This diff allows models to call self.modules() during dynamo tracing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88023
Approved by: https://github.com/tugsbayasgalan, https://github.com/voznesenskym, https://github.com/jansel
---
 test/dynamo/test_repros.py           | 20 ++++++++++++++++++++
 torch/_dynamo/variables/nn_module.py |  2 ++
 2 files changed, 22 insertions(+)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 2103e075fffc9..913d59322ac76 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1792,6 +1792,26 @@ def fn(x):
         res = opt_fn(a)
         self.assertTrue(same(ref, res))
 
+    def test_modules(self):
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc = torch.nn.Linear(4, 3)
+
+            def forward(self, inp):
+                res = torch.zeros(3, 3)
+                for mod in self.modules():
+                    res += self.fc(inp)
+                return res
+
+        mod = Foo()
+        args = (torch.ones(3, 4),)
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt, nopython=True)(mod)
+        self.assertTrue(same(mod(*args), opt_mod(*args)))
+        self.assertEqual(cnt.op_count, 5)
+        self.assertEqual(cnt.frame_count, 1)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 6f7c2ff287373..1922980fc957f 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -337,6 +337,8 @@ def named_embed(name, obj):
             ):
                 result.append(named_embed(name, submod))
             return ListIteratorVariable(result, mutable_local=MutableLocal(), **options)
+        elif name == "modules":
+            return wrap_values(module.named_modules())
         elif name == "parameters":
             return wrap_values(module.named_parameters(**get_kwargs("recurse")))
         elif name == "values":

From 127ea6bf12c48a74cd1f3bd6d7cc8e846abf37f3 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 8 Nov 2022 05:30:06 -0800
Subject: [PATCH 0665/1922] Propagate layout and pin memory in randint to inner
 constructor (#88673)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88673
Approved by: https://github.com/anjali411
---
 torch/_meta_registrations.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 55be711e22feb..0286c9b011091 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -76,13 +76,21 @@ def meta_randperm(n, *, generator=None, out):
 
 
 @register_meta(aten.randint.default)
-def meta_randint(high, size, *, dtype=torch.long, **kwargs):
-    return torch.empty(size, dtype=dtype, **kwargs)
+def meta_randint(
+    high, size, *, dtype=torch.long, layout=None, device=None, pin_memory=None
+):
+    return torch.empty(
+        size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory
+    )
 
 
 @register_meta(aten.randint.low)
-def meta_randint_low(low, high, size, *, dtype=torch.long, **kwargs):
-    return torch.empty(size, dtype=dtype, **kwargs)
+def meta_randint_low(
+    low, high, size, *, dtype=torch.long, layout=None, device=None, pin_memory=None
+):
+    return torch.empty(
+        size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory
+    )
 
 
 @register_meta([aten._fft_c2r.default, aten._fft_c2r.out])

From edb96932c89bbad4f92ac6fd7f59227249cd273e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 8 Nov 2022 05:28:04 -0800
Subject: [PATCH 0666/1922] Correctly test that dtype/device match in generated
 .out kernels for composites (#88672)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88672
Approved by: https://github.com/anjali411
---
 aten/src/ATen/templates/CompositeViewCopyKernels.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/templates/CompositeViewCopyKernels.cpp b/aten/src/ATen/templates/CompositeViewCopyKernels.cpp
index d6a7266952e9b..7548d7c1a3a8a 100644
--- a/aten/src/ATen/templates/CompositeViewCopyKernels.cpp
+++ b/aten/src/ATen/templates/CompositeViewCopyKernels.cpp
@@ -30,17 +30,25 @@ std::vector<at::Tensor> clone_arg(const at::TensorList& t_list) {
     return out;
 }
 
+// duped with gen_resize_out_helper from structured kernels
 void copy_arg(const at::Tensor& dst, const at::Tensor& src) {
+    TORCH_CHECK(src.dtype() == dst.dtype(),
+        "Expected out tensor to have dtype ", src.dtype(), ", but got ", dst.dtype(), " instead");
+    TORCH_CHECK(src.device() == dst.device(),
+        "Expected out tensor to have device ", src.device(), ", but got ", dst.device(), " instead");
     dst.copy_(src);
 }
 
 void copy_arg(const at::TensorList& dst, const at::TensorList& src) {
     TORCH_INTERNAL_ASSERT(dst.size() == src.size());
     for (const auto& i : c10::irange(dst.size())) {
-        dst[i].copy_(src[i]);
+        copy_arg(dst[i], src[i]);
     }
 }
 
+// TODO: this doesn't handle restriding empty tensors correctly; see
+// gen_resize_out_helper for the correct algorithm
+
 void resize_out_helper(const at::Tensor& dst, const at::Tensor& src) {
     at::native::resize_output(dst, src.sizes());
 }

From decb951cb1cf1f2016cf5d6d20f85fcae497bc51 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 8 Nov 2022 10:23:53 -0800
Subject: [PATCH 0667/1922] Support diag_embed.out decomposition (#88671)

This is a little tricky: there is a diag_embed.out, but its not bound
in Python because it's autogenerated, see https://github.com/pytorch/pytorch/issues/88598
So I can't "just" add the out variant to the ref, as this makes it
inconsistent with the torch API.  To workaround this, I mark the ref
as supporting out, but not the original function.

This is useful to do, because it means that diag_embed.out now supports
symbolic shapes.  However, this cannot be easily tested because
I can't mark the out variant as being supported in the normal OpInfo test.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88671
Approved by: https://github.com/mruberry
---
 torch/_refs/__init__.py                               | 1 +
 torch/testing/_internal/common_methods_invocations.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 57dc284361a00..cd0344eba7a91 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -3585,6 +3585,7 @@ def diagonal(
 
 
 @register_decomposition(torch.ops.aten.diag_embed)
+@out_wrapper()
 def diag_embed(
     t: TensorLikeType,
     offset: int = 0,
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 2626f777d1803..352132d0b662f 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -9215,6 +9215,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            error_inputs_func=error_inputs_diag),
     OpInfo('diag_embed',
            dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16, torch.chalf),
+           # TODO: this is very questionable, because we do have
+           # diag_embed.out but it's not bound to Python somehow
+           # https://github.com/pytorch/pytorch/issues/88598
            supports_out=False,
            # Runs very slowly on slow gradcheck - alternatively reduce input sizes
            gradcheck_fast_mode=True,
@@ -17870,6 +17873,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     PythonRefInfo(
         "_refs.diag_embed",
         torch_opinfo_name="diag_embed",
+        supports_out=True,
         supports_nvfuser=False,
     ),
     PythonRefInfo(

From aa8503f97a4fbdc48b2765b64bb04a2fa617f0c1 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 8 Nov 2022 18:37:48 +0000
Subject: [PATCH 0668/1922] Revert "torchdynamo support modules() for nn_module
 (#88023)"

This reverts commit 96104c7b7e908634a473792b6b2e9279d79d23d8.

Reverted https://github.com/pytorch/pytorch/pull/88023 on behalf of https://github.com/ydwu4 due to [Internal breakages] https://www.internalfb.com/intern/sandcastle/job/9007200067589062/
---
 test/dynamo/test_repros.py           | 20 --------------------
 torch/_dynamo/variables/nn_module.py |  2 --
 2 files changed, 22 deletions(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 913d59322ac76..2103e075fffc9 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1792,26 +1792,6 @@ def fn(x):
         res = opt_fn(a)
         self.assertTrue(same(ref, res))
 
-    def test_modules(self):
-        class Foo(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.fc = torch.nn.Linear(4, 3)
-
-            def forward(self, inp):
-                res = torch.zeros(3, 3)
-                for mod in self.modules():
-                    res += self.fc(inp)
-                return res
-
-        mod = Foo()
-        args = (torch.ones(3, 4),)
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_mod = torch._dynamo.optimize(cnt, nopython=True)(mod)
-        self.assertTrue(same(mod(*args), opt_mod(*args)))
-        self.assertEqual(cnt.op_count, 5)
-        self.assertEqual(cnt.frame_count, 1)
-
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 1922980fc957f..6f7c2ff287373 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -337,8 +337,6 @@ def named_embed(name, obj):
             ):
                 result.append(named_embed(name, submod))
             return ListIteratorVariable(result, mutable_local=MutableLocal(), **options)
-        elif name == "modules":
-            return wrap_values(module.named_modules())
         elif name == "parameters":
             return wrap_values(module.named_parameters(**get_kwargs("recurse")))
         elif name == "values":

From 821f2fa12dc23555b951990f82b12e827c1684c9 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 8 Nov 2022 05:33:18 -0800
Subject: [PATCH 0669/1922] Mark as_strided_ as supporting SymInt in C++
 (#88674)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88674
Approved by: https://github.com/anjali411
---
 test/test_proxy_tensor.py        | 1 -
 torch/_subclasses/fake_tensor.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 9059f9ef458a3..f45063adac70e 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1379,7 +1379,6 @@ def f(a, b, c, d, e):
     xfail('addcmul', ''),  # aten.addcmul_.default - couldn't find symbolic meta function/decomposition
     xfail('addmm', ''),  # aten.addmm_.default - couldn't find symbolic meta function/decomposition
     xfail('addmm', 'decomposed'),  # aten.addmm_.default - couldn't find symbolic meta function/decomposition
-    xfail('as_strided', ''),  # aten.as_strided_.default - couldn't find symbolic meta function/decomposition
     xfail('asin', ''),  # aten.asin_.default - couldn't find symbolic meta function/decomposition
     xfail('asinh', ''),  # aten.asinh_.default - couldn't find symbolic meta function/decomposition
     xfail('atan2', ''),  # aten.atan2_.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index fa58ce23c4437..14f5cd2de0a7a 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -968,6 +968,7 @@ def cpp_meta_supports_symint(self, func):
             aten.empty_strided.default,
             aten.as_strided_scatter.default,
             aten.as_strided.default,
+            aten.as_strided_.default,
             aten.zeros.default,
             aten.detach.default,
             aten.set_.source_Storage_storage_offset,

From f85c557801d9ab0d870a2f5b0cf90d775938db90 Mon Sep 17 00:00:00 2001
From: zyq8709 <zyq8709@gmail.com>
Date: Tue, 8 Nov 2022 18:46:56 +0000
Subject: [PATCH 0670/1922] Most recently used cache management for TorchDynamo
 (#88076)

Modify the lookup procedure for TorchDynamo caches to keep the head of the single linked list as the most recently used cache entry, which may potentially improve probability for cache hitting.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88076
Approved by: https://github.com/jansel
---
 torch/csrc/dynamo/eval_frame.c | 40 +++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/torch/csrc/dynamo/eval_frame.c b/torch/csrc/dynamo/eval_frame.c
index e81457e4a2487..bbfc1bb2897d2 100644
--- a/torch/csrc/dynamo/eval_frame.c
+++ b/torch/csrc/dynamo/eval_frame.c
@@ -191,6 +191,17 @@ static void destroy_cache_entry(CacheEntry* e) {
   free(e);
 }
 
+inline static CacheEntry* get_extra(PyCodeObject* code) {
+  CacheEntry* extra = NULL;
+  _PyCode_GetExtra((PyObject*)code, extra_index, (void*)&extra);
+  return extra;
+}
+
+inline static void set_extra(PyCodeObject* code, CacheEntry* extra) {
+  // TODO(jansel): would it be faster to bypass this?
+  _PyCode_SetExtra((PyObject*)code, extra_index, extra);
+}
+
 #ifdef TORCHDYNAMO_DEBUG
 inline static const char* name(PyFrameObject* frame) {
   DEBUG_CHECK(PyUnicode_Check(frame->f_code->co_name));
@@ -216,10 +227,11 @@ static void call_guard_fail_hook(
   Py_DECREF(args);
 }
 
-static PyCodeObject* lookup(CacheEntry* e, PyObject* f_locals) {
+static PyCodeObject* lookup(CacheEntry* e, PyFrameObject *frame, CacheEntry* prev) {
   if (e == NULL) {
     return NULL;
   }
+  PyObject *f_locals = frame->f_locals;
   PyObject* dotzero = PyDict_GetItem(f_locals, dotzerokey);
   PyObject* valid = NULL;
   if (unlikely(dotzero != NULL)) {
@@ -240,12 +252,21 @@ static PyCodeObject* lookup(CacheEntry* e, PyObject* f_locals) {
   }
   Py_DECREF(valid);
   if (valid == Py_True) {
+    // Keep the head as the most recently used cache entry.
+    // If the hit cache entry is not the head of the linked list,
+    // move it to the head
+    if (prev != NULL) {
+        CacheEntry* extra = get_extra(frame->f_code);
+        prev->next = e->next;
+        e->next = extra;
+        set_extra(frame->f_code, e);
+    }
     return e->code;
   }
   if (unlikely(guard_fail_hook != NULL)) {
     call_guard_fail_hook(guard_fail_hook, e, f_locals);
   }
-  return lookup(e->next, f_locals);
+  return lookup(e->next, frame, e);
 }
 
 static long cache_size(CacheEntry* e) {
@@ -255,17 +276,6 @@ static long cache_size(CacheEntry* e) {
   return 1 + cache_size(e->next);
 }
 
-inline static CacheEntry* get_extra(PyCodeObject* code) {
-  CacheEntry* extra = NULL;
-  _PyCode_GetExtra((PyObject*)code, extra_index, (void*)&extra);
-  return extra;
-}
-
-inline static void set_extra(PyCodeObject* code, CacheEntry* extra) {
-  // TODO(jansel): would it be faster to bypass this?
-  _PyCode_SetExtra((PyObject*)code, extra_index, extra);
-}
-
 inline static PyObject* eval_custom_code(
     PyThreadState* tstate,
     PyFrameObject* frame,
@@ -358,7 +368,7 @@ static PyObject* _custom_eval_frame(
   // we never compile.
   if (callback == Py_False) {
     DEBUG_TRACE("In run only mode %s", name(frame));
-    PyCodeObject* cached_code = lookup(extra, frame->f_locals);
+    PyCodeObject* cached_code = lookup(extra, frame, NULL);
     if (cached_code != NULL) {
       // used cached version
       DEBUG_TRACE("cache hit %s", name(frame));
@@ -377,7 +387,7 @@ static PyObject* _custom_eval_frame(
   // in the shim.
   eval_frame_callback_set(Py_None);
 
-  PyCodeObject* cached_code = lookup(extra, frame->f_locals);
+  PyCodeObject* cached_code = lookup(extra, frame, NULL);
   if (cached_code != NULL) {
     // used cached version
     DEBUG_TRACE("cache hit %s", name(frame));

From d8a17190e0d133fcb57393839704056e787324eb Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 8 Nov 2022 18:52:56 +0000
Subject: [PATCH 0671/1922] Run libtorch trunk build on linux.4xlarge (#88683)

Add optional `runner`  input to `_linux-build.yml`
Move `libtorch-linux-bionic-cuda11_6-py3_7-gcc7-build` to `linux.4xlarge` as it occasionally OOMS on 2xlarge one

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88683
Approved by: https://github.com/atalman, https://github.com/weiwangmeta
---
 .github/workflows/_linux-build.yml | 8 +++++++-
 .github/workflows/trunk.yml        | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml
index cc7945c98760b..3719069895b3b 100644
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@@ -34,6 +34,12 @@ on:
         default: "5.2"
         description: |
           List of CUDA architectures CI build should target.
+      runner:
+        required: false
+        type: string
+        default: "linux.2xlarge"
+        description: |
+          List of CUDA architectures CI build should target.
 
       test-matrix:
         required: false
@@ -55,7 +61,7 @@ jobs:
   build:
     # Don't run on forked repos
     if: github.repository_owner == 'pytorch'
-    runs-on: [self-hosted, linux.2xlarge]
+    runs-on: ${{ inputs.runner }}
     timeout-minutes: 240
     outputs:
       docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 14bfcd0900687..cb5d1291833a2 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -111,6 +111,7 @@ jobs:
       build-environment: libtorch-linux-bionic-cuda11.6-py3.7-gcc7
       docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
       build-generates-artifacts: false
+      runner: linux.4xlarge
 
   # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
   linux-bionic-cuda11_7-py3_10-gcc7-no-ops-build:

From 38549d6b525c058bd410323741d5dbb37ff39f4e Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 8 Nov 2022 19:04:25 +0000
Subject: [PATCH 0672/1922] Revert "[cuDNN] (re-open) Enable cuDNN Frontend v8
 API by Default (#87669)"

This reverts commit 3c6bddc3f6347ce7d1ed33aee94cdaa953cbc387.

Reverted https://github.com/pytorch/pytorch/pull/87669 on behalf of https://github.com/eqy due to investigating convnext benchmark regressions
---
 aten/src/ATen/native/ConvUtils.h        | 4 ++--
 aten/src/ATen/native/cudnn/ConvShared.h | 2 +-
 test/nn/test_convolution.py             | 2 +-
 test/test_cuda.py                       | 8 ++++----
 torch/testing/_internal/common_utils.py | 4 +---
 5 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
index 9510fbc95e12d..675f701c8582d 100644
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@@ -66,11 +66,11 @@ namespace {
 }
 
 static inline bool cudnnv8_enabled_check_debug() {
-  static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true;
+  static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_ENABLED") == true;
   static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true;
   static uint8_t cudnnv8_debugcount = 0;
   if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) {
-    TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8 ON: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", cudnnv8_heuristic_mode_b);
+    TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8_FLAG: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", cudnnv8_heuristic_mode_b);
     cudnnv8_debugcount++;
   }
   return cudnnv8_flag == 1;
diff --git a/aten/src/ATen/native/cudnn/ConvShared.h b/aten/src/ATen/native/cudnn/ConvShared.h
index fa06d09404711..9a576de285ce4 100644
--- a/aten/src/ATen/native/cudnn/ConvShared.h
+++ b/aten/src/ATen/native/cudnn/ConvShared.h
@@ -113,7 +113,7 @@ void raw_cudnn_convolution_add_relu_fallback_out(
 
 #if HAS_CUDNN_V8()
 // v7 functions are preserved here to allow for runtime switching to v7
-// (e.g., TORCH_CUDNN_V8_API_DISABLED=1).
+// (e.g., TORCH_CUDNN_V8_API_ENABLED=0).
 // Note that v7 forward/backward out can have different behavior from the v8
 // versions, as v7 explicitly splits large tensors as a 32-bit indexing
 // workaround whereas v8 expects cuDNN to handle large tensors.
diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py
index c621510674fa6..c94eb5447d5ad 100644
--- a/test/nn/test_convolution.py
+++ b/test/nn/test_convolution.py
@@ -1034,7 +1034,7 @@ def test_Conv3d_depthwise_naive_groups(self, device, dtype):
             self.assertEqual(m.weight.grad.data,
                              torch.cat([m1.weight.grad.data,
                                         m2.weight.grad.data], 0),
-                             atol=atol, rtol=rtol)
+                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
 
 
     @onlyCUDA
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 9ecafc45103b6..fada440a72932 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -2897,10 +2897,10 @@ def test_autocast_torch_bf16(self):
                 op, args = op_with_args[0], op_with_args[1]
                 if len(op_with_args) == 3:
                     skip_test = op_with_args[2]  # TEST_WITH_ROCM
-                should_error_from_cudnn = 'cudnn' in op and \
-                    ('TORCH_CUDNN_V8_API_DISABLED' in os.environ and
-                     int(os.environ['TORCH_CUDNN_V8_API_DISABLED']) or
-                     torch.cuda.get_device_capability() < (8, 0))
+                should_error_from_cudnn = 'cudnn' in op and not\
+                    ('TORCH_CUDNN_V8_API_ENABLED' in os.environ and
+                     int(os.environ['TORCH_CUDNN_V8_API_ENABLED']) and
+                     torch.cuda.get_device_capability() >= (8, 0))
                 should_error_from_not_implemented = should_error_from_cudnn or 'prelu' in op or 'thnn' in op \
                     or 'fused' in op or 'gru' in op or op == '_thnn_fused_lstm_cell' or op == 'lstm_cell'
                 if not skip_test:
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 5eb924288c7a3..6fd64187581f3 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -901,9 +901,7 @@ def _check_module_exists(name: str) -> bool:
 if TEST_CUDA and 'NUM_PARALLEL_PROCS' in os.environ:
     num_procs = int(os.getenv("NUM_PARALLEL_PROCS", "2"))
     # other libraries take up about 11% of space per process
-    # + leave some additional buffer e.g., for runtime compilation
-    # or allocations outside of the caching allocator
-    torch.cuda.set_per_process_memory_fraction(round(1 / num_procs - .15, 2))
+    torch.cuda.set_per_process_memory_fraction(round(1 / num_procs - .11, 2))
 
 
 def skipIfCrossRef(fn):

From 0186a8049455facf28715561714691be1c68d993 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 8 Nov 2022 06:12:03 -0800
Subject: [PATCH 0673/1922] Minor error message improvements on meta functions
 (#88677)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88677
Approved by: https://github.com/SherlockNoMad
---
 torch/_meta_registrations.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 0286c9b011091..89395729ad446 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1463,13 +1463,14 @@ def meta_slice_scatter(self, src, dim=0, start=None, end=None, step=1):
     return torch.empty_like(self)
 
 
+# TODO: Deduplicate this with canonicalize_dim
 def maybe_wrap_dim(dim: int, dim_post_expr: int, wrap_scalar: bool = True):
     if dim_post_expr <= 0:
         assert wrap_scalar
         dim_post_expr = 1
     min = -dim_post_expr
     max = dim_post_expr - 1
-    assert not (dim < min or dim > max)
+    assert not (dim < min or dim > max), f"dim {dim} out of bounds ({min}, {max})"
     if dim < 0:
         dim += dim_post_expr
     return dim
@@ -1503,7 +1504,7 @@ def meta_gather(self, dim, index, sparse_grad=False):
     if not is_index_empty:
         check(
             index.dtype == torch.long,
-            lambda: "gather(): Expected dtype int64 for index",
+            lambda: f"gather(): Expected dtype int64 for index, but got {index.dtype}",
         )
         gather_shape_check(self, wrapped_dim, index)
     return self.new_empty(index.shape)

From 4165034138abd223dcf3e0157a62c4b020c654c7 Mon Sep 17 00:00:00 2001
From: Eddie Yan <eddiey@nvidia.com>
Date: Tue, 8 Nov 2022 19:44:23 +0000
Subject: [PATCH 0674/1922] Fix `CUDA_MAX_THREADS_PER_SM` for `sm_87` (#88644)

#88326
CC @ngimel @ptrblck

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88644
Approved by: https://github.com/ngimel
---
 c10/macros/Macros.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
index 31cd2219d10e6..b6912004bd77c 100644
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@@ -255,13 +255,13 @@ using namespace c10::hip;
 // constants from
 // (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
 // The maximum number of threads per multiprocessor is 1024 for Turing
-// architecture (7.5), 1536 for Geforce Ampere (8.6), and 2048 for all other
-// architectures. You'll get warnings if you exceed these constants. Hence, the
-// following macros adjust the input values from the user to resolve potential
-// warnings.
+// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and
+// 2048 for all other architectures. You'll get warnings if you exceed these
+// constants. Hence, the following macros adjust the input values from the user
+// to resolve potential warnings.
 #if __CUDA_ARCH__ == 750
 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
-#elif __CUDA_ARCH__ == 860
+#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870
 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536;
 #else
 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;

From 0dc8c7e5c4b27bd7bed663c0ebf9f0ec2adf29fa Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Tue, 8 Nov 2022 19:53:11 +0000
Subject: [PATCH 0675/1922] [torch] Implement aten::native_batch_norm.out for
 CPU (#88604)

Summary:
Implement `native_batch_norm.out` for CPU. Reuses the main logic for `native_batch_norm` but extract out the Tensor creation logic for outputs. There are 3 outputs: `output`, `save_mean` and `save_var`. `batch_norm_cpu` calls `batch_norm_cpu_update_stats_template` to get `save_mean` and `save_var`, and then calls into `batch_norm_cpu_transform_input_template` which initializes `output`.

In the implementation of `batch_norm_cpu_out`, I did the following:

* Let `batch_norm_cpu_transform_input_template` to take another argument `output`, ask the call sites to pass in a output Tensor.

* Overload `batch_norm_cpu_update_stats_template` to take `save_mean` and `save_var`, ask the call sites to pass in those Tensors.

* In `batch_norm_cpu_out`, pass `output`, `save_mean` and `save_var` all the way to our new `batch_norm_cpu_transform_input_template` and `batch_norm_cpu_update_stats_template`.

* In `batch_norm_cpu`, prepare for these outputs and call `batch_norm_cpu_out`.

Test Plan: Enable unit tests for `native_batch_norm.out`.

Differential Revision: D40992036

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88604
Approved by: https://github.com/iseeyuan, https://github.com/jjsjann123
---
 aten/src/ATen/native/Normalization.cpp        | 131 +++++++++++++-----
 aten/src/ATen/native/native_functions.yaml    |   1 +
 .../_internal/common_methods_invocations.py   |   3 -
 3 files changed, 97 insertions(+), 38 deletions(-)

diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp
index 5169c5e58e9ad..69196a3cd8210 100644
--- a/aten/src/ATen/native/Normalization.cpp
+++ b/aten/src/ATen/native/Normalization.cpp
@@ -14,6 +14,7 @@
 #include <ATen/native/cpu/Loops.h>
 #include <ATen/native/batch_norm.h>
 #include <ATen/native/Normalization.h>
+#include <ATen/native/Resize.h>
 #include <ATen/native/cpu/mixed_data_type.h>
 #include <c10/util/irange.h>
 
@@ -123,17 +124,17 @@ std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
     const Tensor& input, const Tensor& weight, const Tensor& bias,
     const Tensor& save_mean /* optional */, const Tensor& save_invstd /* optional */,
     const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */,
-    bool train, double eps) {
+    bool train, double eps, Tensor& output) {
 
   bool all_contiguous = is_contiguous(input)
-      && (!weight.defined() || weight.is_contiguous())
-      && (!bias.defined() || bias.is_contiguous())
-      && running_mean.is_contiguous()
-      && running_var.is_contiguous();
+    && is_contiguous(output)
+    && (!weight.defined() || weight.is_contiguous())
+    && (!bias.defined() || bias.is_contiguous())
+    && running_mean.is_contiguous()
+    && running_var.is_contiguous();
 
   // inference contiguous path
   if (all_contiguous) {
-    Tensor output = at::empty_like(input, suggest_memory_format_contig(input));
     batch_norm_cpu_stub(kCPU, output, input, weight, bias,
         save_mean, save_invstd, running_mean, running_var, train, eps);
     return std::make_tuple(output, save_mean, save_invstd);
@@ -165,7 +166,6 @@ std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
   auto b = bias.defined() ? as_nd(bias) :
       at::detail::scalar_tensor_static(0, dtype, kCPU);
 
-  Tensor output = at::empty_like(input, input.suggest_memory_format());
   auto iter = TensorIteratorConfig()
     .add_output(output)
     .add_input(input)
@@ -185,30 +185,17 @@ std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
 template<typename scalar_t, typename param_t, template<typename T> class VarTransform>
 std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
     const Tensor& input, const Tensor& running_mean, const Tensor& running_var,
-    double momentum, double eps) {
+    double momentum, double eps, Tensor& save_mean, Tensor& save_var_transform) {
 
   using accscalar_t = at::acc_type<scalar_t, false>;
 
   int64_t n_input = input.size(1);
   int64_t n = input.numel() / n_input;
-  const int64_t ndim = input.dim();
-
-  // Reduce all dimensions except dim=1
-  DimVector reduce_dims(ndim - 1);
-  reduce_dims[0] = 0;
-  for (const auto i : c10::irange(2, ndim)) {
-    reduce_dims[i - 1] = i;
-  }
 
   bool all_contiguous = is_contiguous(input);
   const bool mixed_type = !std::is_same<scalar_t, param_t>::value;
   const auto dtype = mixed_type ? kFloat : input.scalar_type();
 
-  // For contiguous case, leave 'mean' computation to kernel
-  Tensor save_mean = all_contiguous
-      ? at::empty({n_input}, input.options().dtype(dtype))
-      : at::mean(input, /*dim=*/reduce_dims, /*keepdim=*/false, dtype);
-  Tensor save_var_transform = at::empty({n_input}, input.options().dtype(dtype));
   auto save_mean_a = save_mean.accessor<param_t, 1>();
   auto save_var_transform_a = save_var_transform.accessor<param_t, 1>();
 
@@ -278,6 +265,25 @@ std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
   return std::make_tuple(save_mean, save_var_transform);
 }
 
+template<typename scalar_t, typename param_t, template<typename T> class VarTransform>
+std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
+    const Tensor& input, const Tensor& running_mean, const Tensor& running_var,
+    double momentum, double eps) {
+  int64_t n_input = input.size(1);
+  const int64_t ndim = input.dim();
+  DimVector reduce_dims(ndim - 1);
+  reduce_dims[0] = 0;
+  for (const auto i : c10::irange(2, ndim)) {
+    reduce_dims[i - 1] = i;
+  }
+
+  const bool mixed_type = !std::is_same<scalar_t, param_t>::value;
+  const auto dtype = mixed_type ? kFloat : input.scalar_type();
+  Tensor save_mean = is_contiguous(input) ? at::empty({n_input}, input.options().dtype(dtype)) : at::mean(input, /*dim=*/reduce_dims, /*keepdim=*/false, dtype);
+  Tensor save_var_transform = at::empty({n_input}, input.options().dtype(dtype));
+  return batch_norm_cpu_update_stats_template<scalar_t, param_t, VarTransform>(input, running_mean, running_var, momentum, eps, save_mean, save_var_transform);
+}
+
 template<typename scalar_t, typename param_t>
 std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
     const Tensor& grad_out_, const Tensor& input, const Tensor& weight,
@@ -690,8 +696,8 @@ std::tuple<Tensor, Tensor> batch_norm_update_stats_cpu(
   });
 }
 
-std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, const c10::optional<Tensor>& running_mean_opt, const c10::optional<Tensor>& running_var_opt,
-                                                  bool train, double momentum, double eps) {
+std::tuple<Tensor&, Tensor&, Tensor&> batch_norm_cpu_out(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, const c10::optional<Tensor>& running_mean_opt, const c10::optional<Tensor>& running_var_opt,
+                                                  bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) {
   // See [Note: hacky wrapper removal for optional tensor]
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
@@ -699,31 +705,86 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const c10:
   const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();});
   const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();});
 
-  checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU);
+  checkBackend("batch_norm_cpu_out", {self, weight, bias, running_mean, running_var}, Backend::CPU);
+  // Resize out
+  at::native::resize_output(out, self.sizes());
 
   const bool mixed_type = is_mixed_type(self, weight, bias, running_mean, running_var);
-  return AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "batch_norm", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "batch_norm", [&] {
     if (mixed_type) {
       check_mixed_data_type(self, weight, bias, running_mean, running_var);
       if (!train) {
-        auto save_mean = at::empty({0}, self.options().dtype(kFloat));
-        auto save_var = at::empty({0}, self.options().dtype(kFloat));
-        return batch_norm_cpu_transform_input_template<BFloat16, float>(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps);
+        return batch_norm_cpu_transform_input_template<BFloat16, float>(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps, out);
       } else {
-        auto save_stats = batch_norm_cpu_update_stats_template<BFloat16, float, InvStd>(self, running_mean, running_var, momentum, eps);
-        return batch_norm_cpu_transform_input_template<BFloat16, float>(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps);
+        // Resize save_mean and save_var
+        at::native::resize_output(save_mean, {self.size(1)});
+        at::native::resize_output(save_var, {self.size(1)});
+        auto save_stats = batch_norm_cpu_update_stats_template<BFloat16, float, InvStd>(self, running_mean, running_var, momentum, eps, save_mean, save_var);
+        return batch_norm_cpu_transform_input_template<BFloat16, float>(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps, out);
       }
     } else {
       if (!train) {
-        auto save_mean = at::empty({0}, self.options());
-        auto save_var = at::empty({0}, self.options());
-        return batch_norm_cpu_transform_input_template<scalar_t, scalar_t>(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps);
+        return batch_norm_cpu_transform_input_template<scalar_t, scalar_t>(self, weight, bias, save_mean, save_var, running_mean, running_var, train, eps, out);
       } else {
-        auto save_stats = batch_norm_cpu_update_stats_template<scalar_t, scalar_t, InvStd>(self, running_mean, running_var, momentum, eps);
-        return batch_norm_cpu_transform_input_template<scalar_t, scalar_t>(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps);
+        // Resize save_mean and save_var
+        at::native::resize_output(save_mean, {self.size(1)});
+        at::native::resize_output(save_var, {self.size(1)});
+        auto save_stats = batch_norm_cpu_update_stats_template<scalar_t, scalar_t, InvStd>(self, running_mean, running_var, momentum, eps, save_mean, save_var);
+        return batch_norm_cpu_transform_input_template<scalar_t, scalar_t>(self, weight, bias, std::get<0>(save_stats), std::get<1>(save_stats), running_mean, running_var, train, eps, out);
       }
     }
   });
+
+  return std::tuple<Tensor& ,Tensor&, Tensor&>(out, save_mean, save_var);
+}
+
+std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, const c10::optional<Tensor>& running_mean_opt, const c10::optional<Tensor>& running_var_opt,
+                                                  bool train, double momentum, double eps) {
+  // See [Note: hacky wrapper removal for optional tensor]
+  c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
+  const Tensor& weight = *weight_maybe_owned;
+  const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
+  const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();});
+  const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();});
+
+  checkBackend("batch_norm_cpu", {self, weight, bias, running_mean, running_var}, Backend::CPU);
+
+  // Prepare output tensor
+  const bool all_contiguous = is_contiguous(self)
+    && (!weight.defined() || weight.is_contiguous())
+    && (!bias.defined() || bias.is_contiguous())
+    && running_mean.is_contiguous()
+    && running_var.is_contiguous();
+  Tensor output = at::empty_like(self, all_contiguous ? suggest_memory_format_contig(self) : self.suggest_memory_format());
+
+  // Prepare save_mean and save_var
+  Tensor save_var;
+  Tensor save_mean;
+  const bool mixed_type = is_mixed_type(self, weight, bias, running_mean, running_var);
+  const int64_t ndim = self.dim();
+  DimVector reduce_dims(ndim - 1);
+  reduce_dims[0] = 0;
+  for (const auto i : c10::irange(2, ndim)) {
+    reduce_dims[i - 1] = i;
+  }
+  if (mixed_type) {
+    if (!train) {
+      save_mean = at::empty({0}, self.options().dtype(kFloat));
+      save_var = at::empty({0}, self.options().dtype(kFloat));
+    } else {
+      save_mean = is_contiguous(self) ? at::empty({self.size(1)}, self.options().dtype(kFloat)) : at::mean(self, /*dim=*/reduce_dims, /*keepdim=*/false, kFloat);
+      save_var = at::empty({self.size(1)}, self.options().dtype(kFloat));
+    }
+  } else {
+    if (!train) {
+      save_mean = at::empty({0}, self.options());
+      save_var = at::empty({0}, self.options());
+    } else {
+      save_mean = is_contiguous(self) ? at::empty({self.size(1)}, self.options()) : at::mean(self, /*dim=*/reduce_dims, /*keepdim=*/false);
+      save_var = at::empty({self.size(1)}, self.options());
+    }
+  }
+  return batch_norm_cpu_out(self, weight_opt, bias_opt, running_mean_opt, running_var_opt, train, momentum, eps, output, save_mean, save_var);
 }
 
 std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& running_mean_opt, const c10::optional<Tensor>& running_var_opt, const c10::optional<Tensor>& save_mean_opt, const c10::optional<Tensor>& save_invstd_opt,
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index ef402e125c99c..4c7a351e8f1bf 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -3803,6 +3803,7 @@
   dispatch:
     CUDA: batch_norm_cuda_out
     MPS: batch_norm_mps_out
+    CPU: batch_norm_cpu_out
 
 - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)
   dispatch:
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 352132d0b662f..6ef49a5473d8c 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -10477,9 +10477,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            assert_jit_shape_analysis=True,
            sample_inputs_func=sample_inputs_native_batch_norm,
            skips=(
-               # NotImplementedError: Could not run
-               # 'aten::native_batch_norm.out' with arguments from the 'CPU' backend.
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type="cpu"),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning', device_type="cpu"),
                # RuntimeError: out_invstd.dim() == 1 && out_invstd.is_contiguous() && out_invstd.sizes()[0]
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type="cuda"),

From e07a96b58eaf1dd456e526008ab9da96098a8fe0 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 7 Nov 2022 22:00:15 +0000
Subject: [PATCH 0676/1922] [inductor] Fix aten.fmod lowering (#88602)

Currently the lowering for aten.fmod promotes integral types to float and calls
`tl.libdevice.fmod` whereas the ATen behavior is to use the modulo operator.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88602
Approved by: https://github.com/jansel
---
 torch/_inductor/codegen/triton.py |  2 +-
 torch/_inductor/lowering.py       | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 58ac425a95e8a..3471d23a72130 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -240,7 +240,7 @@ def signbit(x):
 
     @staticmethod
     def fmod(a, b):
-        return f"tl.libdevice.fmod({a}, ({b}).to(tl.float32))"
+        return f"tl.libdevice.fmod({a}, {b})"
 
     @staticmethod
     def pow(a, b):
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index d83fbba25ddaf..ece23f1063334 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -3299,6 +3299,23 @@ def div_prim(a, b):
         return div(a, b)
 
 
+@register_lowering([aten.fmod, prims.fmod])
+def fmod(a, b):
+    is_integral = is_boolean_type(a) or is_integer_type(a)
+
+    if is_integral:
+
+        def fn(a, b):
+            return ops.mod(a, b)
+
+    else:
+
+        def fn(a, b):
+            return ops.fmod(a, b)
+
+    return make_pointwise(fn)(a, b)
+
+
 # TODO - enable builtin and disable decomp to lower to ptx instruction
 # Causes compilation to not complete on timm_vision_transformers inference
 # @register_lowering(aten.rsqrt)
@@ -3391,7 +3408,6 @@ def sum_(x, axis=None, keepdims=False, *, dtype=None):
 register_pointwise(aten.remainder)
 register_pointwise(aten.sign, override_fn_when_input_bool="identity")
 register_pointwise(aten.ceil)
-register_pointwise(aten.fmod)
 register_pointwise(aten.signbit, override_return_dtype=torch.bool)
 
 register_pointwise(aten.le, type_promotion_kind=None, override_return_dtype=torch.bool)

From e67f39417908a010de22c64840f6b1764f81f73c Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Tue, 8 Nov 2022 06:57:30 +0000
Subject: [PATCH 0677/1922] PatternMatcher supports matching list-typed args
 (#88656)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88656
Approved by: https://github.com/jerryzh168
---
 test/fx/test_subgraph_rewriter.py      | 21 +++++++++++++++++++++
 torch/fx/passes/utils/matcher_utils.py | 26 ++++++++++++++++++--------
 torch/fx/subgraph_rewriter.py          |  5 ++++-
 3 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/test/fx/test_subgraph_rewriter.py b/test/fx/test_subgraph_rewriter.py
index ed6d50e44b4ac..4568eaa33bd61 100644
--- a/test/fx/test_subgraph_rewriter.py
+++ b/test/fx/test_subgraph_rewriter.py
@@ -840,3 +840,24 @@ def num_repalcement_node_found(traced):
             [second_input_is_scalar])
         self.assertEqual(len(matches), 1)
         self.assertEqual(num_repalcement_node_found(traced), 1)
+
+    def test_matching_pattern_with_list_type_arg(self):
+        class M(torch.nn.Module):
+            def forward(self, x):
+                return torch.ops.aten._reshape_alias_copy.default(x, [1, 2], [3, 4])
+
+        def pattern(x, arg0, arg1):
+            return torch.ops.aten._reshape_alias_copy.default(x, arg0, arg1)
+
+        def replacement(x, arg0, arg1):
+            return torch.ops.aten._reshape_alias_copy.default(x, arg1, arg0)
+
+        traced = symbolic_trace(M())
+        matches = subgraph_rewriter.replace_pattern(traced, pattern, replacement)
+
+        self.assertEqual(len(matches), 1)
+
+        self.assertExpectedInline(traced.code.strip(), """\
+def forward(self, x):
+    _reshape_alias_copy_default_1 = torch.ops.aten._reshape_alias_copy.default(x, [3, 4], [1, 2]);  x = None
+    return _reshape_alias_copy_default_1""")  # noqa: B950
diff --git a/torch/fx/passes/utils/matcher_utils.py b/torch/fx/passes/utils/matcher_utils.py
index 00a3b16833025..04938ac6e3e15 100644
--- a/torch/fx/passes/utils/matcher_utils.py
+++ b/torch/fx/passes/utils/matcher_utils.py
@@ -4,7 +4,6 @@
 from torch.fx.graph import Graph
 from torch.fx.node import Node
 from torch.fx._compatibility import compatibility
-import torch.utils._pytree as pytree
 from typing import Dict, List, Set, Any
 import logging
 import os
@@ -106,12 +105,11 @@ def _nodes_are_equal(self, pn: Node, gn: Node) -> bool:
     def _is_contained(self, nodes_map: Dict[Node, Node]) -> bool:
         # `lookup` represents all the nodes in `original_graph`
         # that are part of `pattern`
-        lookup: Dict[Node, Node] = {gn : pn for pn, gn in nodes_map.items()}
-        for gn, pn in lookup.items():
-            # Placeholders can be used by other nodes in the graphs
-            if pn.op == "placeholder":
-                continue
 
+        # Placeholders can be used by other nodes in the graphs
+        lookup: Dict[Node, Node] = {gn : pn for pn, gn in nodes_map.items() if pn.op != "placeholder"}
+
+        for gn, pn in lookup.items():
             # nodes returned by output are allowed to be used in other areas of the graph
             if pn in self.pattern_returning_nodes:
                 continue
@@ -188,8 +186,20 @@ def _match_nodes(self, pn: Node, gn: Node, match: InternalMatch) -> bool:
         # match for `gn`
         match_found = True
 
-        pn_flatten_args, _ = pytree.tree_flatten(pn.args)
-        gn_flatten_args, _ = pytree.tree_flatten(gn.args)
+        def flatten_args(args) -> List[Any]:
+            # Recursively flatten args
+            result : List[Any] = []
+            for arg in args:
+                # flatten the list, if only it's a list/tuple of nodes
+                if isinstance(arg, (list, tuple)) and len(arg) > 0 and isinstance(arg[0], Node):
+                    result.extend(flatten_args(arg))
+                else:
+                    result.append(arg)
+
+            return result
+
+        pn_flatten_args = flatten_args(pn.args)
+        gn_flatten_args = flatten_args(gn.args)
 
         if pn.kwargs.keys() == gn.kwargs.keys():
             for key in pn.kwargs.keys():
diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py
index c46de13cc9859..e7d239d4699c9 100644
--- a/torch/fx/subgraph_rewriter.py
+++ b/torch/fx/subgraph_rewriter.py
@@ -257,7 +257,10 @@ def _replace_pattern(
         assert len(match.placeholder_nodes) == len(replacement_placeholders)
         val_map: Dict[Node, Node] = {}
         for rn, gn in zip(replacement_placeholders, match.placeholder_nodes):
-            val_map[rn] = match_changed_node.get(gn, gn)
+            if isinstance(gn, Node):
+                val_map[rn] = match_changed_node.get(gn, gn)
+            else:
+                val_map[rn] = gn
 
         # Copy the replacement graph over
         user_nodes: Set[Node] = set()

From aee0bfb7130d5eeb5d27f1c82050c69a77d66dff Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 8 Nov 2022 21:10:07 +0000
Subject: [PATCH 0678/1922] [MPS][BE] Code cleanup (#88529)

Various code cleanup in MPS operations:
 - Per @kulinseth suggestion move `mpsSupportsCumsum` to `MPSDevice.h` and rename it to
   `is_macos_13_or_newer()`
 - Move Ventura MPSGraph new operators to `MPSGraphVenturaOps.h` header
 - Use `LookupAs` and `CreateCachedGraphAs` to make code more compact
 - Formatting

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88529
Approved by: https://github.com/kulinseth
---
 aten/src/ATen/mps/MPSDevice.h                 |  6 ++
 aten/src/ATen/mps/MPSDevice.mm                | 11 ++++
 aten/src/ATen/native/mps/MPSGraphVenturaOps.h |  9 +++
 aten/src/ATen/native/mps/OperationUtils.h     |  1 +
 aten/src/ATen/native/mps/OperationUtils.mm    |  1 +
 aten/src/ATen/native/mps/operations/Eye.mm    |  5 +-
 .../ATen/native/mps/operations/Indexing.mm    | 56 ++++++++-----------
 .../native/mps/operations/PointwiseOps.mm     |  5 +-
 .../ATen/native/mps/operations/ReduceOps.mm   | 37 +++++-------
 .../ATen/native/mps/operations/UnaryOps.mm    | 21 ++-----
 test/test_mps.py                              |  2 +-
 11 files changed, 74 insertions(+), 80 deletions(-)
 create mode 100644 aten/src/ATen/native/mps/MPSGraphVenturaOps.h

diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h
index 77e93ea1234a4..48e1904346c10 100644
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@@ -53,6 +53,10 @@ class TORCH_API MPSDevice {
   MTLDevice_t device() {
     return _mtl_device;
   }
+  /**
+   * Returns whether running on Ventura or newer
+   */
+  bool isMacOS13Plus() const;
 
   MTLFunction_t metalIndexingFunction(const std::string &kernel, MTLFunctionConstantValues_t constantValues);
 
@@ -61,11 +65,13 @@ class TORCH_API MPSDevice {
  private:
   static MPSDevice* _device;
   MTLDevice_t _mtl_device;
+  bool _macos13plus;
   MTLLibrary_t _mtl_indexing_library;
   MPSDevice();
 };
 
 TORCH_API bool is_available();
+TORCH_API bool is_macos_13_or_newer();
 
 TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
 
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index 6569d57420c87..c11621b3f3540 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -66,6 +66,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   // Create the MPSGraph and check method introduced in 12.3+
   // which is used by MPS backend.
   id mpsCD = NSClassFromString(@"MPSGraph");
+  _macos13plus = [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
   if ([mpsCD instancesRespondToSelector:@selector(LSTMWithSourceTensor:
                                                        recurrentWeight:
                                                            inputWeight:
@@ -76,6 +77,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
                                                                   name:)] == NO) {
     return;
   }
+
   NSArray* devices = [MTLCopyAllDevices() autorelease];
   for (unsigned long i = 0 ; i < [devices count] ; i++) {
     id<MTLDevice>  device = devices[i];
@@ -85,6 +87,11 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
     }
   }
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_mtl_device);
+
+}
+
+bool MPSDevice::isMacOS13Plus() const {
+  return _macos13plus;
 }
 
 at::Allocator* getMPSSharedAllocator();
@@ -97,5 +104,9 @@ bool is_available() {
   return MPSDevice::getInstance()->device() != nil;
 }
 
+bool is_macos_13_or_newer() {
+  return MPSDevice::getInstance()->isMacOS13Plus();
+}
+
 } // namespace mps
 } // namespace at
diff --git a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
new file mode 100644
index 0000000000000..86153b58ed87e
--- /dev/null
+++ b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
@@ -0,0 +1,9 @@
+#pragma once
+#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
+
+// TODO: Remove me when moved to MacOS 13
+@interface MPSGraph (VenturaOps)
+- (MPSGraphTensor *)cumulativeSumWithTensor:(MPSGraphTensor *)tensor
+                                       axis:(NSInteger)axis
+                                       name:(NSString *)name;
+@end
diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 13c817b4c45bf..8d868386705ac 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -263,6 +263,7 @@ struct MPSGraphCache
 
 };
 
+
 } // namespace mps
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index fd18c1f4a95e4..13a88efbfb5d1 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -12,6 +12,7 @@
   this->set_current_seed(random);
   return random;
 }
+
 uint64_t MPSGeneratorImpl::current_seed() const {
   return seed_;
 }
diff --git a/aten/src/ATen/native/mps/operations/Eye.mm b/aten/src/ATen/native/mps/operations/Eye.mm
index 45b3fdf68b07f..6b72c0686caa4 100644
--- a/aten/src/ATen/native/mps/operations/Eye.mm
+++ b/aten/src/ATen/native/mps/operations/Eye.mm
@@ -70,9 +70,9 @@
   @autoreleasepool {
     // A key is used to identify the MPSGraph which was created once, and can be reused if the parameters, data types etc match the earlier created MPSGraph
     string key = "eye_out_mps:" + getTensorsStringKey({result});
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
 
         CachedGraph *newCachedGraph = nil;
 
@@ -94,7 +94,6 @@
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
     // Create placeholders which use the keys of the CachedGraph to create inputs and outputs of the operation
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 9c27aae9b0b01..78e93fc991756 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -252,12 +252,7 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
 
   MPSStream* stream = getCurrentMPSStream();
 
-  struct CachedGraph : public MPSCachedGraph
-  {
-    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-    MPSGraphTensor* inputTensor_ = nil;
-    MPSGraphTensor* outputTensor_ = nil;
-  };
+  using CachedGraph = mps::MPSUnaryCachedGraph;
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
@@ -265,9 +260,9 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
     NSString* ns_dims_key = [[ns_dims valueForKey:@"description"] componentsJoinedByString:@","];
     // A key is used to identify the MPSGraph which was created once, and can be reused if the parameters, data types etc match the earlier created MPSGraph
     string key = "flip_mps:" + getTensorsStringKey({self}) + ":" + string([ns_dims_key UTF8String]);
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    auto cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
 
         CachedGraph *newCachedGraph = nil;
 
@@ -284,7 +279,6 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
     // Create placeholders which use the keys of the CachedGraph to create inputs and outputs of the operation
@@ -341,10 +335,10 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
   @autoreleasepool {
 
     string key = "index_add_mps_out" + getTensorsStringKey({self, index, source}) + ":" + std::to_string(dim);
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
     if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
         CachedGraph *newCachedGraph = nil;
 
         @autoreleasepool {
@@ -372,7 +366,6 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
     Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
@@ -460,10 +453,10 @@ Tensor index_select_mps(const Tensor & self,
   @autoreleasepool {
 
     string key = "index_select_out_mps" + getTensorsStringKey({self, index}) + ":" + std::to_string(dim);
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
     if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
         CachedGraph *newCachedGraph = nil;
 
         @autoreleasepool {
@@ -485,7 +478,6 @@ Tensor index_select_mps(const Tensor & self,
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
     Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
@@ -530,9 +522,9 @@ Tensor index_select_mps(const Tensor & self,
   MPSStream* stream = getCurrentMPSStream();
   @autoreleasepool {
     string key = "masked_fill" + getTensorsStringKey({self, mask}) + ":" + std::to_string(value.toDouble());
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
 
         CachedGraph *newCachedGraph = nil;
 
@@ -566,7 +558,6 @@ Tensor index_select_mps(const Tensor & self,
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
     Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self);
@@ -615,7 +606,7 @@ Tensor embedding_dense_backward_mps(
     int64_t D = incoming_gradient_shape[num_incoming_gradient_dims - 1];
     c10::SmallVector<int64_t, 2> outgoing_gradient_shape{num_weights, D};
     Tensor outgoing_gradient = at::native::empty_mps(
-                                IntArrayRef(outgoing_gradient_shape.data(), outgoing_gradient_shape.size()),
+                                IntArrayRef(outgoing_gradient_shape),
                                 grad_.scalar_type(),
                                 c10::nullopt,
                                 kMPS,
@@ -630,10 +621,10 @@ Tensor embedding_dense_backward_mps(
 
     @autoreleasepool {
         string key = "edb_mps:" + native_mps::getMPSTypeString(grad_.scalar_type()) + ":indices" + std::to_string(num_indices_dims) + ":num_weights" + std::to_string(num_weights) + ":padding_idx" + std::to_string(padding_idx) + ":scaled" + std::to_string(scale_grad_by_freq);
-      CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+      CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
       // Initialize once if configuration not found in cache
       if(!cachedGraph) {
-        native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+        cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
           CachedGraph *newCachedGraph = nil;
 
@@ -647,18 +638,18 @@ Tensor embedding_dense_backward_mps(
 
             MPSGraphTensor* reshapedIndicesTensor = indicesTensor;
 
-            if (num_indices_dims != 0)
-              reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor:indicesTensor
-                              axes:@[@-1]
-                              name:nil];
+            if (num_indices_dims != 0) {
+              reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor: indicesTensor
+                                                               axes: @[@-1]
+                                                               name: nil];
+            }
 
-            MPSGraphTensor* outgoingGradTensor;
-            outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor:incomingGradTensor
-                            indicesTensor:reshapedIndicesTensor
-                                    shape:native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape.data(), outgoing_gradient_shape.size()))
-                          batchDimensions:0
-                                     mode:MPSGraphScatterModeAdd
-                                     name:@"edb"];
+            auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: incomingGradTensor
+                                                             indicesTensor: reshapedIndicesTensor
+                                                                     shape: native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape))
+                                                           batchDimensions: 0
+                                                                      mode: MPSGraphScatterModeAdd
+                                                                      name: @"edb"];
 
             newCachedGraph->incomingGradTensor_ = incomingGradTensor;
             newCachedGraph->indicesTensor_ = indicesTensor;
@@ -667,7 +658,6 @@ Tensor embedding_dense_backward_mps(
           }
           return newCachedGraph;
         });
-        cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
       }
       auto incomingGradPlaceholder = native_mps::Placeholder(cachedGraph->incomingGradTensor_, grad_);
       auto indicesPlaceholder = native_mps::Placeholder(cachedGraph->indicesTensor_, indices);
diff --git a/aten/src/ATen/native/mps/operations/PointwiseOps.mm b/aten/src/ATen/native/mps/operations/PointwiseOps.mm
index eb68239ecedd6..8da6b94dd8569 100644
--- a/aten/src/ATen/native/mps/operations/PointwiseOps.mm
+++ b/aten/src/ATen/native/mps/operations/PointwiseOps.mm
@@ -36,10 +36,10 @@
   @autoreleasepool {
     string key = op_name + getTensorsStringKey({self, tensor1, tensor2}, false);
 
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
     if(!cachedGraph) {
-        MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+        cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
 
           CachedGraph* newCachedGraph = nil;
           @autoreleasepool {
@@ -72,7 +72,6 @@
           }
           return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
     // Inputs as placeholders
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 8c5d8b4d22875..91aa245b89911 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -185,7 +185,7 @@ void set_axes_and_shapes(const Tensor& input_t,
     auto cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
     if(!cachedGraph) {
-      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
         CachedGraph *newCachedGraph = nil;
 
@@ -254,15 +254,15 @@ void set_axes_and_shapes(const Tensor& input_t,
         }
         return newCachedGraph;
       });
-      cachedGraph = tmpCachedGraph->as<CachedGraph>();
     }
 
     auto inputPlaceholder = native_mps::Placeholder();
 
-    if(apparent_input_shape)
+    if (apparent_input_shape) {
       inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape);
-    else
+    } else {
       inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
+    }
     auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape);
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
       inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
@@ -451,7 +451,7 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
     auto cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
     if(!cachedGraph) {
-      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
         CachedGraph *newCachedGraph = nil;
 
@@ -521,7 +521,6 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
         }
         return newCachedGraph;
       });
-      cachedGraph = tmpCachedGraph->as<CachedGraph>();
     }
 
     auto inputPlaceholder = native_mps::Placeholder();
@@ -728,7 +727,7 @@ Tensor std_var_common_impl_mps(
     auto cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     // Initialize once if configuration not found in cache
   if(!cachedGraph) {
-      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
       CachedGraph *newCachedGraph = nil;
 
@@ -763,7 +762,6 @@ Tensor std_var_common_impl_mps(
       }
       return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
   }
   auto inputPlaceholder = native_mps::Placeholder();
 
@@ -846,7 +844,7 @@ Tensor std_mps(
         CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {
-          native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+          cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
             CachedGraph *newCachedGraph = nil;
             @autoreleasepool {
@@ -886,7 +884,6 @@ Tensor std_mps(
             }
             return newCachedGraph;
           });
-          cachedGraph = tmpCachedGraph->as<CachedGraph>();
         }
 
         auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
@@ -921,7 +918,7 @@ Tensor std_mps(
         CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {
-          native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+          cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
             CachedGraph *newCachedGraph = nil;
 
@@ -962,7 +959,6 @@ Tensor std_mps(
             }
             return newCachedGraph;
           });
-          cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
         }
 
         auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
@@ -1017,7 +1013,7 @@ Tensor std_mps(
         CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {
-          native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+          cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
             CachedGraph *newCachedGraph = nil;
             @autoreleasepool {
@@ -1057,7 +1053,6 @@ Tensor std_mps(
             }
             return newCachedGraph;
           });
-          cachedGraph = tmpCachedGraph->as<CachedGraph>();
         }
 
         auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
@@ -1092,7 +1087,7 @@ Tensor std_mps(
         CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {
-          native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+          cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
             CachedGraph *newCachedGraph = nil;
 
@@ -1133,7 +1128,6 @@ Tensor std_mps(
             }
             return newCachedGraph;
           });
-          cachedGraph = tmpCachedGraph->as<CachedGraph>();
         }
 
         auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
@@ -1185,7 +1179,7 @@ Tensor std_mps(
     CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     // Initialize once if configuration not found in cache
     if(!cachedGraph) {
-      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
         CachedGraph *newCachedGraph = nil;
 
@@ -1212,7 +1206,6 @@ Tensor std_mps(
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
     auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape);
@@ -1296,10 +1289,10 @@ Tensor min_mps(const Tensor& input_t) {
 
     @autoreleasepool {
         string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type());
-        CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+        CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {
-          native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+          cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
             CachedGraph *newCachedGraph = nil;
 
@@ -1349,7 +1342,6 @@ Tensor min_mps(const Tensor& input_t) {
             }
             return newCachedGraph;
           });
-          cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
         }
 
         auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
@@ -1463,7 +1455,7 @@ Tensor min_mps(const Tensor& input_t) {
         CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {
-          native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+          cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
             CachedGraph *newCachedGraph = nil;
 
@@ -1504,7 +1496,6 @@ Tensor min_mps(const Tensor& input_t) {
             }
             return newCachedGraph;
           });
-          cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
         }
 
         native_mps::Placeholder inputPlaceholder = native_mps::Placeholder();
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 07173dc8b2ac8..3d641d3af82cc 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -5,15 +5,9 @@
 #include <ATen/Utils.h>
 #include <ATen/mps/MPSStream.h>
 #include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
 #include <torch/library.h>
 
-// TODO: Remove me when moved to MacOS 13
-@interface MPSGraph (VenturaOps)
-- (MPSGraphTensor *)cumulativeSumWithTensor:(MPSGraphTensor *)tensor
-                                       axis:(NSInteger)axis
-                                       name:(NSString *)name;
-@end
-
 namespace at {
 namespace native {
 namespace mps {
@@ -41,7 +35,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
     auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
 
     if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () {
+      cachedGraph = cache_->CreateCachedGraphAs<MPSUnaryCachedGraph>(key, ^ MPSCachedGraph* () {
         MPSUnaryCachedGraph *newCachedGraph = nil;
         @autoreleasepool {
           MPSGraph* mpsGraph = make_mps_graph();
@@ -56,7 +50,6 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
         }
         return newCachedGraph;
       });
-      cachedGraph = tmpCachedGraph->as<MPSUnaryCachedGraph>();
     }
 
     Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
@@ -205,7 +198,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
       auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
 
       if(!cachedGraph) {
-        MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () {
+        cachedGraph = cache_->CreateCachedGraphAs<MPSUnaryCachedGraph>(key, ^ MPSCachedGraph* () {
           MPSUnaryCachedGraph *newCachedGraph = nil;
           @autoreleasepool {
             MPSGraph* mpsGraph = make_mps_graph();
@@ -222,7 +215,6 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
           }
           return newCachedGraph;
         });
-        cachedGraph = tmpCachedGraph->as<MPSUnaryCachedGraph>();
       }
 
       Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
@@ -271,11 +263,6 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
 }
 
 
-static bool mpsSupportsCumsum() {
-  id mpsCD = NSClassFromString(@"MPSGraph");
-  return [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
-}
-
 
 TORCH_IMPL_FUNC(cumsum_out_mps)
 (const Tensor& self,
@@ -283,7 +270,7 @@ static bool mpsSupportsCumsum() {
  c10::optional<ScalarType> dtype,
  const Tensor& result) {
   TORCH_CHECK(dim >=0 && dim < std::max(1LL, self.ndimension()), "Expected dim to be between 0 and ", self.ndimension(), " but got ", dim);
-  if (!mpsSupportsCumsum()) {
+  if (!is_macos_13_or_newer()) {
     TORCH_WARN_ONCE("torch.cumsum supported by MPS on MacOS 13+, please upgrade");
     auto cpu_result = self.to(at::Device(kCPU)).cumsum(dim, dtype);
     at::_copy_from_and_resize(cpu_result, result);
diff --git a/test/test_mps.py b/test/test_mps.py
index e4a86e9377249..3f8cce384c1bb 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -6980,7 +6980,7 @@ def test_no_warning_on_import(self):
             # On Windows, opening the subprocess with the default CWD makes `import torch`
             # fail, so just set CWD to this script's directory
             cwd=os.path.dirname(os.path.realpath(__file__)),).decode("utf-8")
-        self.assertEquals(out, "")
+        self.assertEqual(out, "")
 
     def _get_not_implemented_op(self):
         # This can be changed once we actually implement `torch.bincount`

From db72ce6cbed2aa54f0975b01fa8cc2d01aff34d5 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Mon, 7 Nov 2022 11:24:25 -0800
Subject: [PATCH 0679/1922] [Profiler] Handle ABA for TensorImpl* when
 assigning IDs (#87133)

Part of the current ID assingment algorithm groups any Storages which are associated with the same TensorImpl*. This isn't sound (which I knew but deferred until it actually became a problem) because pointers can be reused by different objects. (ABA problem)

ABA is easy to handle for Storage because we see allocations and frees, but ~TensorImpl is very hot and cannot tolerate profiling code without significant increases in overhead.

This PR narrows the conditions under which ID assignment will join on TensorImpl*. Two storages which are associated with the same TensorImpl* are grouped IFF they were live at the same time. (Note that this still allows storages with disjoint lifetimes to be joined transitively through a third storage which overlaps with both.)

The need for this PR arose in memory profiling. The Python argument parser creates short lived Tensors for (some) scalar arguments which triggers this issue. (Which is stochastic and platform dependent since optimizations like reusing recently freed allocations is implementation defined.) Spurious connections can lead to confusing and long range interactions when building up the memory profile, so it makes sense to harden ID assignment to avoid any issues.

Differential Revision: [D40445121](https://our.internmc.facebook.com/intern/diff/D40445121/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87133
Approved by: https://github.com/slgong-fb, https://github.com/chaekit
---
 test/profiler/test_profiler.py      | 252 ++++++++++++++++++++++++++++
 torch/_C/_profiler.pyi              |   1 +
 torch/csrc/profiler/collection.h    |   4 +-
 torch/csrc/profiler/data_flow.cpp   | 127 ++++++++------
 torch/csrc/profiler/data_flow.h     |  12 +-
 torch/csrc/profiler/python/init.cpp |   2 +
 torch/csrc/profiler/python/init.h   |   1 +
 7 files changed, 341 insertions(+), 58 deletions(-)

diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py
index 22db16d1943af..296ce500c7e44 100644
--- a/test/profiler/test_profiler.py
+++ b/test/profiler/test_profiler.py
@@ -6,6 +6,7 @@
 import os
 import re
 import tempfile
+import textwrap
 import unittest
 from dataclasses import dataclass, field
 from typing import List, Optional
@@ -1370,6 +1371,237 @@ def get_fields(op_name, index):
         self.assertEqual(c_id, c_id_new)
         self.assertEqual(d_id, c_id_new)
 
+    @staticmethod
+    def _format_allocations(profiled_code):
+        gc.collect()
+        with profile(profile_memory=True, record_shapes=True) as prof:
+            profiled_code()
+            gc.collect()
+
+        root_events = prof.profiler.kineto_results.experimental_event_tree()
+        events = sorted(_utils.traverse_dfs(root_events), key=lambda x: x.start_time_ns)
+        allocations = tuple(
+            event.extra_fields
+            for event in events
+            if isinstance(event.extra_fields, torch._C._profiler._ExtraFields_Allocation)
+        )
+
+        return textwrap.indent("\n".join(
+            f"{repr(i.id):>5}{' ' * 6}"
+            f"{repr(i.allocation_id):>5}{' ' * 6}"
+            f"{'Allocation' if i.alloc_size > 0 else 'Free'}"
+            for i in allocations
+        ), " " * 12)
+
+    def test_tensorimpl_invalidation_set(self) -> None:
+        def profiled_code(add_empty_set: bool):
+            x = torch.ones((1,))
+
+            # Determines if new storage is created before or after the old one
+            # is destroyed.
+            if add_empty_set:
+                x.set_()
+
+            x.set_(torch.ones((1,)).storage())
+            x.view_as(x)
+
+        self.assertExpectedInline(
+            self._format_allocations(lambda: profiled_code(add_empty_set=False)),
+            """\
+                0          1      Allocation
+                0          2      Allocation
+                0          1      Free
+                0          2      Free"""
+        )
+
+        self.assertExpectedInline(
+            self._format_allocations(lambda: profiled_code(add_empty_set=True)),
+            """\
+                0          1      Allocation
+                0          1      Free
+                0          2      Allocation
+                0          2      Free"""
+        )
+
+    def test_tensorimpl_invalidation_keep_alive(self) -> None:
+        def profiled_code(add_empty_set: bool):
+            x = torch.ones((1,))
+            x_storages = [x.storage()]
+            for _ in range(3):
+                x.set_()
+                x.set_(torch.ones((1,)).storage())
+
+                # This keeps the StorageImpls alive and preserves the chain.
+                # (Despite the `set_()` call.)
+                x_storages.append(x.storage())
+            x.view_as(x)
+
+            # Free storage in a deterministic fashion.
+            while x_storages:
+                x_storages.pop()
+                gc.collect()
+
+            # Determines if new storage is created before or after the old one
+            # is destroyed.
+            if add_empty_set:
+                x.set_()
+
+            for _ in range(3):
+                x.set_(torch.ones((1,)).storage())
+            x.view_as(x)
+
+            del x
+            gc.collect()
+
+        self.assertExpectedInline(
+            self._format_allocations(lambda: profiled_code(add_empty_set=False)),
+            """\
+                0          1      Allocation
+                0          2      Allocation
+                0          4      Allocation
+                0          5      Allocation
+                0          4      Free
+                0          2      Free
+                0          1      Free
+                0          6      Allocation
+                0          5      Free
+                0          7      Allocation
+                0          6      Free
+                0          8      Allocation
+                0          7      Free
+                0          8      Free"""
+        )
+
+        self.assertExpectedInline(
+            self._format_allocations(lambda: profiled_code(add_empty_set=True)),
+            """\
+                0          1      Allocation
+                0          2      Allocation
+                0          4      Allocation
+                0          5      Allocation
+                0          4      Free
+                0          2      Free
+                0          1      Free
+                0          5      Free
+                0          6      Allocation
+                0          7      Allocation
+                0          6      Free
+                0          8      Allocation
+                0          7      Free
+                0          8      Free"""
+        )
+
+    def test_tensorimpl_invalidation_full(self) -> None:
+        def profiled_code():
+            x = torch.ones((1,))
+            x_storages = [x.storage()]
+            for _ in range(3):
+                x.set_()
+                x.set_(torch.ones((1,)).storage())
+                x_storages.append(x.storage())
+            x.view_as(x)
+
+            # Free storage in a deterministic fashion.
+            while x_storages:
+                x_storages.pop()
+                gc.collect()
+
+            for _ in range(3):
+                x.set_(torch.ones((1,)).storage())
+
+            for _ in range(3):
+                x.set_()
+                x.set_(torch.ones((1,)).storage())
+
+            for i in range(4):
+                x.resize_((1 + i,))
+            x.view_as(x)
+
+        self.assertExpectedInline(
+            self._format_allocations(profiled_code),
+            """\
+                0          1      Allocation
+                0          2      Allocation
+                0          4      Allocation
+                0          5      Allocation
+                0          4      Free
+                0          2      Free
+                0          1      Free
+                0          6      Allocation
+                0          5      Free
+                0          7      Allocation
+                0          6      Free
+                0          8      Allocation
+                0          7      Free
+                0          8      Free
+                0          9      Allocation
+                0          9      Free
+                0         10      Allocation
+                0         10      Free
+                0         11      Allocation
+                0         12      Allocation
+                0         11      Free
+                0         13      Allocation
+                0         12      Free
+                0         14      Allocation
+                0         13      Free
+                0         14      Free"""
+        )
+
+    def test_tensorimpl_invalidation_scalar_args(self) -> None:
+        def profiled_code():
+            with torch.no_grad():
+                x = torch.ones((1,))
+                for _ in range(10):
+                    x.add_(2)
+
+        self.assertExpectedInline(
+            self._format_allocations(profiled_code),
+            """\
+                0          1      Allocation
+                1          2      Allocation
+                2          3      Allocation
+                2          3      Free
+                1          2      Free
+                3          4      Allocation
+                4          5      Allocation
+                4          5      Free
+                3          4      Free
+                5          6      Allocation
+                6          7      Allocation
+                6          7      Free
+                5          6      Free
+                7          8      Allocation
+                8          9      Allocation
+                8          9      Free
+                7          8      Free
+                9         10      Allocation
+               10         11      Allocation
+               10         11      Free
+                9         10      Free
+               11         12      Allocation
+               12         13      Allocation
+               12         13      Free
+               11         12      Free
+               13         14      Allocation
+               14         15      Allocation
+               14         15      Free
+               13         14      Free
+               15         16      Allocation
+               16         17      Allocation
+               16         17      Free
+               15         16      Free
+               17         18      Allocation
+               18         19      Allocation
+               18         19      Free
+               17         18      Free
+               19         20      Allocation
+               20         21      Allocation
+               20         21      Free
+               19         20      Free
+                0          1      Free""")
+
+
     def test_module_and_optimizer_ids(self) -> None:
         model = torch.nn.Linear(2, 1, bias=True)
         optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
@@ -1522,6 +1754,26 @@ def test_impl_reuse(self) -> None:
         self.assertEqual(len(tensor_impls), repeats)
         self.assertEqual(len(set(tensor_impls)), repeats)
 
+    def test_allocation_id_uniqueness(self) -> None:
+        repeats = 1_000
+        with profile(profile_memory=True, record_shapes=True) as p:
+            for _ in range(repeats):
+                torch.ones((1,))
+            gc.collect()
+
+        roots = p.profiler.kineto_results.experimental_event_tree()
+        id_set = set()
+        for e in _utils.traverse_dfs(roots):
+            fields = e.extra_fields
+            if isinstance(fields, torch._C._profiler._ExtraFields_TorchOp):
+                id_set |= {t.allocation_id for t in fields.inputs.tensor_metadata if t is not None}
+
+            elif isinstance(fields, torch._C._profiler._ExtraFields_Allocation):
+                id_set.add(fields.allocation_id)
+
+        id_set.difference_update([None])
+        self.assertEqual(repeats, len(id_set))
+
     def test_extra_fields(self):
         with profile(with_stack=True, profile_memory=True) as p:
             _ = torch.ones((1,))
diff --git a/torch/_C/_profiler.pyi b/torch/_C/_profiler.pyi
index da6cfb165fb36..01f78053c7d2b 100644
--- a/torch/_C/_profiler.pyi
+++ b/torch/_C/_profiler.pyi
@@ -128,6 +128,7 @@ class _ExtraFields_Backend: ...
 class _ExtraFields_Allocation:
     ptr: int
     id: Optional[int]
+    allocation_id: Optional[int]
     alloc_size: int
     total_allocated: int
     total_reserved: int
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index a2f1291bc8dc2..f90e927bb9e3b 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -79,12 +79,13 @@ struct TensorMetadata : public RawTensorMetadataBase {
     return {device_type_, device_index_};
   }
 
-  TensorImplAddress impl() {
+  TensorImplAddress impl() const {
     return weak_self_.get();
   }
 
   WeakTensor weak_self_;
   c10::optional<TensorID> id_;
+  c10::optional<AllocationID> allocation_id_;
 };
 
 struct Inputs {
@@ -194,6 +195,7 @@ struct ExtraFields<EventType::Allocation> : RawAllocation {
   }
 
   c10::optional<TensorID> id_;
+  c10::optional<AllocationID> allocation_id_;
 };
 
 template <>
diff --git a/torch/csrc/profiler/data_flow.cpp b/torch/csrc/profiler/data_flow.cpp
index 894bf96ed0f58..e1970edbc9508 100644
--- a/torch/csrc/profiler/data_flow.cpp
+++ b/torch/csrc/profiler/data_flow.cpp
@@ -8,47 +8,39 @@ namespace torch {
 namespace profiler {
 namespace impl {
 
+namespace {
+static constexpr TensorImplAddress NoTensorImpl{nullptr};
+
+struct RawTensorInfo {
+  TensorImplAddress impl_;
+  StorageImplData storage_;
+  c10::Device device_;
+  bool is_free_;
+
+  // Used to assign back to the original structs.
+  std::reference_wrapper<c10::optional<AllocationID>> allocation_id_ref_;
+  std::reference_wrapper<c10::optional<TensorID>> id_ref_;
+};
+} // namespace
+
 void calculateUniqueTensorIDs(
     std::vector<std::shared_ptr<Result>>& sorted_results) {
   // This task is equivilent to https://leetcode.com/problems/number-of-islands/
   // We first cluster events with a greedy index assignment, and then merge
   // groups that overlap.
+  std::vector<RawTensorInfo> tensors;
 
-  using storage_id_t = strong::type<
-      size_t,
-      struct _StorageID,
-      strong::regular,
-      strong::hashable,
-      strong::arithmetic,
-      strong::ordered>;
-
-  struct TensorStoragePair {
-    TensorImplAddress impl_;
-    storage_id_t storage_id_;
-
-    // Used to assign the result.
-    std::reference_wrapper<c10::optional<TensorID>> id_ref_;
-  };
-  std::vector<TensorStoragePair> tensors;
-
-  // Step 1) Flatten and convert storage data pointers. (Handle address reuse.)
+  // Flatten results to a uniform representation.
   // --------------------------------------------------------------------------
   {
-    storage_id_t current_id{0};
-    ska::flat_hash_map<StorageImplData, storage_id_t> live_storage;
-    auto lookup = [&current_id, &live_storage](const StorageImplData data) {
-      auto inserted = live_storage.insert({data, current_id});
-      current_id += storage_id_t(inserted.second);
-      return inserted.first->second;
-    };
-
-    ska::flat_hash_set<storage_id_t> tensor_set;
-    auto insert_tensor = [&lookup, &tensors, &tensor_set](TensorMetadata& m) {
-      if (m.impl() && m.data_) {
-        const auto id = lookup(m.data_);
-        tensor_set.insert(id);
-        tensors.emplace_back(TensorStoragePair{m.impl(), id, m.id_});
-      }
+    auto insert_tensor = [&tensors](TensorMetadata& m) {
+      tensors.emplace_back(RawTensorInfo{
+          m.impl(),
+          m.data_,
+          m.device(),
+          /*is_free_=*/false,
+          m.allocation_id_,
+          m.id_});
     };
 
     for (auto& result : sorted_results) {
@@ -63,15 +55,13 @@ void calculateUniqueTensorIDs(
           [&](ExtraFields<EventType::Allocation>& alloc_op) {
             // We won't know which allocations are for Tensor storage yet.
             // We'll filter after we see all of the op inputs.
-            tensors.emplace_back(TensorStoragePair{
-                TensorImplAddress(nullptr),
-                lookup(StorageImplData(alloc_op.ptr_)),
+            tensors.emplace_back(RawTensorInfo{
+                NoTensorImpl,
+                StorageImplData(alloc_op.ptr_),
+                alloc_op.device(),
+                /*is_free_=*/alloc_op.alloc_size_ < 0,
+                alloc_op.allocation_id_,
                 alloc_op.id_});
-
-            // Handle deallocation
-            if (alloc_op.alloc_size_ < 0) {
-              live_storage.erase(StorageImplData(alloc_op.ptr_));
-            }
           },
           [&](ExtraFields<EventType::PyCall>& py_call) {
             // torch.nn.Module
@@ -99,25 +89,50 @@ void calculateUniqueTensorIDs(
           },
           [](const auto&) {}));
     }
+  }
 
-    // Handle any allocation events which we cannot prove are for
-    // `StorageImpl`s.
+  // Assign IDs to solve ABA for Storage.
+  // --------------------------------------------------------------------------
+  {
+    size_t counter{1};
+    using key_t = std::pair<StorageImplData, c10::Device>;
+    ska::flat_hash_map<key_t, size_t, HashCombine> versions;
+    for (auto& t : tensors) {
+      auto inserted = versions.insert({{t.storage_, t.device_}, counter});
+      counter += inserted.second;
+      t.allocation_id_ref_.get().emplace(AllocationID(inserted.first->second));
+      if (t.is_free_) {
+        versions.erase(inserted.first);
+      }
+    }
+  }
+
+  // Handle any allocation events which we cannot prove are for Tensor storage.
+  // --------------------------------------------------------------------------
+  {
+    ska::flat_hash_set<AllocationID> tensor_set;
+    for (const auto& t : tensors) {
+      if (t.impl_ != NoTensorImpl) {
+        tensor_set.insert(*t.allocation_id_ref_.get());
+      }
+    }
     tensors.erase(
         std::remove_if(
             tensors.begin(),
             tensors.end(),
             [&tensor_set](const auto& i) {
-              return tensor_set.find(i.storage_id_) == tensor_set.end();
+              auto it = tensor_set.find(*i.allocation_id_ref_.get());
+              return it == tensor_set.end();
             }),
         tensors.end());
   }
 
-  // Step 2) Handle the case that the storage of a TensorImpl changed.
+  // Handle the case that the storage of a TensorImpl changed.
   // --------------------------------------------------------------------------
-  using storage_id_pair_t = std::pair<storage_id_t, storage_id_t>;
+  using storage_id_pair_t = std::pair<AllocationID, AllocationID>;
   ska::flat_hash_set<storage_id_pair_t, HashCombine> same_group_set;
   {
-    ska::flat_hash_map<TensorImplAddress, storage_id_t> impl_map;
+    ska::flat_hash_map<TensorImplAddress, AllocationID> impl_map;
     for (const auto& t : tensors) {
       // Storage allocations / frees don't have an associated TensorImpl, so
       // we don't want all storages to merge through nullptr.
@@ -125,18 +140,19 @@ void calculateUniqueTensorIDs(
         continue;
       }
 
-      const auto it = impl_map.insert({t.impl_, t.storage_id_}).first;
+      const auto allocation_id = *t.allocation_id_ref_.get();
+      const auto it = impl_map.insert({t.impl_, allocation_id}).first;
 
       // The pair needs to be sorted for the coalesce step to work properly.
-      it->second < t.storage_id_
-          ? same_group_set.insert({it->second, t.storage_id_})
-          : same_group_set.insert({t.storage_id_, it->second});
+      it->second < allocation_id
+          ? same_group_set.insert({it->second, allocation_id})
+          : same_group_set.insert({allocation_id, it->second});
     }
   }
 
-  // Step 3) Coalesce groups and assign final IDs.
+  // Coalesce groups and assign final IDs.
   // --------------------------------------------------------------------------
-  ska::flat_hash_map<storage_id_t, size_t> id_map;
+  ska::flat_hash_map<AllocationID, size_t> id_map;
   {
     std::vector<storage_id_pair_t> unique_pairs;
     for (const auto& i : same_group_set) {
@@ -152,10 +168,11 @@ void calculateUniqueTensorIDs(
     }
   }
 
-  // Step 4) Write back to metadata
+  // Write back to Tensor IDs.
   // --------------------------------------------------------------------------
   for (const auto& t : tensors) {
-    t.id_ref_.get() = TensorID(id_map.at(t.storage_id_));
+    const auto id = id_map.at(*t.allocation_id_ref_.get());
+    t.id_ref_.get().emplace(TensorID(id));
   }
 }
 
diff --git a/torch/csrc/profiler/data_flow.h b/torch/csrc/profiler/data_flow.h
index 378f37707988e..cb72756a1a5b1 100644
--- a/torch/csrc/profiler/data_flow.h
+++ b/torch/csrc/profiler/data_flow.h
@@ -30,6 +30,14 @@ namespace impl {
 // can only be assigned when memory profiling is enabled.
 using TensorID = strong::type<size_t, struct TensorID_, strong::regular>;
 
+// Uniquely identifies an allocation. (Generally a StorageImpl's data ptr.)
+using AllocationID = strong::type<
+    size_t,
+    struct StorageID_,
+    strong::ordered,
+    strong::regular,
+    strong::hashable>;
+
 // We use a Tensor's TensorImpl adress and StorageImpl data start to build the
 // data flow graph. We do not hold an owning reference so we wrap them in strong
 // types to prevent direct access.
@@ -78,9 +86,9 @@ class WeakTensor {
 };
 
 struct Result;
-using result_ptr_t = std::shared_ptr<Result>;
 
-void calculateUniqueTensorIDs(std::vector<result_ptr_t>& sorted_results);
+void calculateUniqueTensorIDs(
+    std::vector<std::shared_ptr<Result>>& sorted_results);
 
 } // namespace impl
 } // namespace profiler
diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index fc6d8c3dbec80..4a033b2da1f33 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -155,6 +155,7 @@ void initPythonBindings(PyObject* module) {
       .def_property_readonly("impl_ptr", &TensorMetadata::impl)
       .def_readonly("storage_data_ptr", &TensorMetadata::data_)
       .def_readonly("id", &TensorMetadata::id_)
+      .def_readonly("allocation_id", &TensorMetadata::allocation_id_)
       .def_property_readonly(
           "layout",
           [](const TensorMetadata& metadata) {
@@ -189,6 +190,7 @@ void initPythonBindings(PyObject* module) {
             return reinterpret_cast<intptr_t>(a.ptr_);
           })
       .def_readonly("id", &allocation_t::id_)
+      .def_readonly("allocation_id", &allocation_t::allocation_id_)
       .def_readonly("alloc_size", &allocation_t::alloc_size_)
       .def_readonly("total_allocated", &allocation_t::total_allocated_)
       .def_readonly("total_reserved", &allocation_t::total_reserved_)
diff --git a/torch/csrc/profiler/python/init.h b/torch/csrc/profiler/python/init.h
index 226bf1a3f3bb3..d38e59b13e274 100644
--- a/torch/csrc/profiler/python/init.h
+++ b/torch/csrc/profiler/python/init.h
@@ -14,6 +14,7 @@ using torch::profiler::impl::TensorID;
   struct type_caster<T> : public strong_pointer_type_caster<T> {};
 
 STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::StorageImplData);
+STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::AllocationID);
 STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::TensorImplAddress);
 STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::PyModuleSelf);
 STRONG_POINTER_TYPE_CASTER(torch::profiler::impl::PyModuleCls);

From 0163841efd54cb1899d2e914572097d801350b95 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Mon, 7 Nov 2022 11:24:27 -0800
Subject: [PATCH 0680/1922] [Profiler] Restructure inputs and capture
 TensorLists. (#87825)

This PR unifies and rationalizes some of the input representation in Result. The current approach of storing separate types in separate vectors is tedious for two types (Tensors and scalars), but would be even more annoying with the addition of TensorLists. A similar disconnection exists with sizes and strides which the user is also expected to zip with tensor_metadata.

I simplified things by moving inputs to a variant and moving sizes and strides into TensorMetadata. This also forced collection of sizes and strides in python tracer which helps to bring it in line with op profiling. Collection of TensorLists is fairly straightforward; `InputOutputEncoder` already has a spot for them (I actually collected them in the original TorchTidy prototype) so it was just a matter of plumbing things through.

Differential Revision: [D40734451](https://our.internmc.facebook.com/intern/diff/D40734451/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87825
Approved by: https://github.com/slgong-fb, https://github.com/chaekit
---
 test/profiler/test_profiler.py          | 87 +++++++++++++------------
 torch/_C/_profiler.pyi                  | 17 ++---
 torch/csrc/autograd/profiler_kineto.cpp | 64 +++++++++++++++---
 torch/csrc/autograd/profiler_kineto.h   |  4 ++
 torch/csrc/autograd/profiler_python.cpp |  5 +-
 torch/csrc/profiler/collection.cpp      | 83 +++++++++++++----------
 torch/csrc/profiler/collection.h        | 50 +++++++-------
 torch/csrc/profiler/data_flow.cpp       | 80 +++++++++++++----------
 torch/csrc/profiler/python/init.cpp     | 39 ++++++-----
 torch/profiler/_pattern_matcher.py      | 13 ++--
 10 files changed, 259 insertions(+), 183 deletions(-)

diff --git a/test/profiler/test_profiler.py b/test/profiler/test_profiler.py
index 296ce500c7e44..acaa1f9667579 100644
--- a/test/profiler/test_profiler.py
+++ b/test/profiler/test_profiler.py
@@ -34,6 +34,7 @@
     record_function,
     supported_activities,
 )
+from torch._C._profiler import _TensorMetadata
 from torch.profiler._pattern_matcher import (
     Conv2dBiasFollowedByBatchNorm2dPattern,
     ExtraCUDACopyPattern,
@@ -1306,7 +1307,8 @@ def _get_tensor_fields(self, node, index):
         self.assertIsInstance(
             node.extra_fields,
             torch._C._profiler._ExtraFields_TorchOp)
-        tensor_info = node.extra_fields.inputs.tensor_metadata[index]
+        tensor_info = node.extra_fields.inputs[index]
+        self.assertIsInstance(tensor_info, _TensorMetadata)
         self.assertIsNotNone(tensor_info.impl_ptr)
         self.assertIsNotNone(tensor_info.storage_data_ptr)
         self.assertIsNotNone(tensor_info.id)
@@ -1633,7 +1635,7 @@ def get_fields(op_name, index):
             # Use linear op to identify weight ground truth.
             linear_op_node = find_node_with_name(nodes, "aten::linear")
             self.assertIsNotNone(linear_op_node)
-            x_metadata, weight_metadata, _ = linear_op_node.extra_fields.inputs.tensor_metadata
+            x_metadata, weight_metadata, _ = linear_op_node.extra_fields.inputs
             self.assertEqual(x_id, x_metadata.id)
 
             # Module
@@ -1746,7 +1748,7 @@ def test_impl_reuse(self) -> None:
 
         roots = p.profiler.kineto_results.experimental_event_tree()
         tensor_impls = tuple(
-            e.extra_fields.inputs.tensor_metadata[0].impl_ptr
+            e.extra_fields.inputs[0].impl_ptr
             for e in _utils.traverse_dfs(roots)
             if e.name == "aten::fill_"
         )
@@ -1766,7 +1768,7 @@ def test_allocation_id_uniqueness(self) -> None:
         for e in _utils.traverse_dfs(roots):
             fields = e.extra_fields
             if isinstance(fields, torch._C._profiler._ExtraFields_TorchOp):
-                id_set |= {t.allocation_id for t in fields.inputs.tensor_metadata if t is not None}
+                id_set |= {t.allocation_id for t in fields.inputs if isinstance(t, _TensorMetadata)}
 
             elif isinstance(fields, torch._C._profiler._ExtraFields_Allocation):
                 id_set.add(fields.allocation_id)
@@ -1812,18 +1814,14 @@ def test_tensor_properties(self):
             node.extra_fields,
             torch._C._profiler._ExtraFields_TorchOp)
 
-        self.assertEqual(node.extra_fields.inputs.shapes, [[4, 4], [4, 1], []])
-        self.assertEqual(node.extra_fields.inputs.strides, [[12, 3], [1, 1], []])
+        def getattr_inputs(name, default):
+            return [getattr(i, name, default) for i in node.extra_fields.inputs]
 
-        input_info = node.extra_fields.inputs
-        self.assertEqual(input_info.dtypes, ['float', 'float', 'Scalar'])
-
-        layout_info = [x.layout if x else None for x in input_info.tensor_metadata]
-        self.assertEqual(layout_info, [torch.strided, torch.strided, None])
-        device_info = [x.device if x else None for x in input_info.tensor_metadata]
-        self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
-        tensor_dtypes = [x.dtype if x else None for x in input_info.tensor_metadata]
-        self.assertEqual(tensor_dtypes, [torch.float32, torch.float32, None])
+        self.assertEqual(getattr_inputs("sizes", []), [[4, 4], [4, 1], []])
+        self.assertEqual(getattr_inputs("strides", []), [[12, 3], [1, 1], []])
+        self.assertEqual(getattr_inputs("layout", None), [torch.strided, torch.strided, None])
+        self.assertEqual(getattr_inputs("device", None), [torch.device("cpu"), torch.device("cpu"), None])
+        self.assertEqual(getattr_inputs("dtype", None), [torch.float32, torch.float32, None])
         self.assertEqual(node.extra_fields.scope, torch.profiler.RecordScope.FUNCTION)
 
         mul_node = find_node_with_name(nodes, "aten::mul")
@@ -1848,19 +1846,13 @@ def test_sparse_tensors(self):
             node.extra_fields,
             torch._C._profiler._ExtraFields_TorchOp)
 
-        self.assertEqual(node.extra_fields.inputs.shapes, [[2, 3], [2, 3], []])
-        self.assertEqual(node.extra_fields.inputs.strides, [[], [], []])
-
-        input_info = node.extra_fields.inputs
-
-        # FIXME: Different systems have different names for int64_t
-        # below are example names I have found. This is not guaranteed to be exhaustive.
-        # self.assertIn(input_info.dtypes[0], ["long long", "long int", "long", "__int64"])
+        def getattr_inputs(name, default):
+            return [getattr(i, name, default) for i in node.extra_fields.inputs]
 
-        layout_info = [x.layout if x else None for x in input_info.tensor_metadata]
-        self.assertEqual(layout_info, [torch.sparse_coo, torch.sparse_coo, None])
-        device_info = [x.device if x else None for x in input_info.tensor_metadata]
-        self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
+        self.assertEqual(getattr_inputs("sizes", []), [[2, 3], [2, 3], []])
+        self.assertEqual(getattr_inputs("strides", []), [[], [], []])
+        self.assertEqual(getattr_inputs("layout", None), [torch.sparse_coo, torch.sparse_coo, None])
+        self.assertEqual(getattr_inputs("device", None), [torch.device("cpu"), torch.device("cpu"), None])
 
     @unittest.skipIf(not torch._C.has_mkldnn, "MKL-DNN build is disabled")
     def test_mkldnn_tensors(self):
@@ -1877,16 +1869,13 @@ def test_mkldnn_tensors(self):
             node.extra_fields,
             torch._C._profiler._ExtraFields_TorchOp)
 
-        self.assertEqual(node.extra_fields.inputs.shapes, [[4, 3], [4, 3], []])
-        self.assertEqual(node.extra_fields.inputs.strides, [[], [], []])
+        def getattr_inputs(name, default):
+            return [getattr(i, name, default) for i in node.extra_fields.inputs]
 
-        input_info = node.extra_fields.inputs
-        self.assertEqual(input_info.dtypes, ['float', 'float', 'Scalar'])
-
-        layout_info = [x.layout if x else None for x in input_info.tensor_metadata]
-        self.assertEqual(layout_info, [torch._mkldnn, torch._mkldnn, None])
-        device_info = [x.device if x else None for x in input_info.tensor_metadata]
-        self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
+        self.assertEqual(getattr_inputs("sizes", []), [[4, 3], [4, 3], []])
+        self.assertEqual(getattr_inputs("strides", []), [[], [], []])
+        self.assertEqual(getattr_inputs("layout", None), [torch._mkldnn, torch._mkldnn, None])
+        self.assertEqual(getattr_inputs("device", None), [torch.device("cpu"), torch.device("cpu"), None])
 
     def test_scalar_ins(self):
         x = torch.ones(5, 5)
@@ -1899,11 +1888,29 @@ def test_scalar_ins(self):
         node = find_node_with_name(nodes, "aten::add")
         self.assertIsNotNone(node)
 
+        def getattr_inputs(name, default):
+            return [getattr(i, name, default) for i in node.extra_fields.inputs]
+
         # The second argument to the add gets promotoed to a zerodim Tensor
-        input_info = node.extra_fields.inputs
-        self.assertEqual(input_info.dtypes, ['float', 'double', 'Scalar'])
-        self.assertEqual(input_info.shapes, [[5, 5], [], []])
-        self.assertEqual(input_info.ivalues, [None, None, alpha])
+        self.assertEqual(getattr_inputs("dtype", None), [torch.float32, torch.float64, None])
+        self.assertEqual(getattr_inputs("sizes", []), [[5, 5], [], []])
+        self.assertEqual(node.extra_fields.inputs[2], alpha)
+
+    def test_tensor_lists(self):
+        x = torch.ones((1,))
+        y = torch.ones((1,))
+        with profile(with_stack=True, profile_memory=True, record_shapes=True) as p:
+            _ = torch.stack((x, y))
+
+        nodes = p.profiler.kineto_results.experimental_event_tree()
+        node = find_node_with_name(nodes, "aten::stack")
+        inputs = node.extra_fields.inputs
+        self.assertEqual(len(inputs), 2)
+        self.assertIsInstance(inputs[0], list)
+        self.assertEqual(len(inputs[0]), 2)
+        self.assertEqual(x.storage().data_ptr(), inputs[0][0].storage_data_ptr)
+        self.assertEqual(y.storage().data_ptr(), inputs[0][1].storage_data_ptr)
+
 
     def test_nnmodule_params(self):
 
diff --git a/torch/_C/_profiler.pyi b/torch/_C/_profiler.pyi
index 01f78053c7d2b..2843090d61f7b 100644
--- a/torch/_C/_profiler.pyi
+++ b/torch/_C/_profiler.pyi
@@ -96,13 +96,6 @@ class _ProfilerEvent:
     @property
     def duration_time_ns(self) -> int: ...
 
-class _Inputs:
-    shapes: List[List[int]]
-    dtypes: List[str]
-    strides: List[List[int]]
-    ivalues: List[Union[int, float, bool, complex]]
-    tensor_metadata: List[Optional[_TensorMetadata]]
-
 class _TensorMetadata:
     impl_ptr: Optional[int]
     storage_data_ptr: Optional[int]
@@ -114,12 +107,20 @@ class _TensorMetadata:
     def device(self) -> device: ...
     @property
     def dtype(self) -> dtype: ...
+    @property
+    def sizes(self) -> List[int]: ...
+    @property
+    def strides(self) -> List[int]: ...
+
+Scalar = Union[int, float, bool, complex]
+Input = Optional[Union[_TensorMetadata, List[_TensorMetadata], Scalar]]
 
 class _ExtraFields_TorchOp:
-    inputs: _Inputs
     sequence_number: int
     allow_tf32_cublas: bool
 
+    @property
+    def inputs(self) -> List[Input]: ...
     @property
     def scope(self) -> RecordScope: ...
 
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 5f62e12d211bb..c9b7b9fa92960 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -64,11 +64,40 @@ using torch::profiler::impl::ActiveProfilerType;
 using torch::profiler::impl::dtypesToStr;
 using torch::profiler::impl::EventType;
 using torch::profiler::impl::ExtraFields;
+using torch::profiler::impl::op_input_t;
 using torch::profiler::impl::ProfilerStateBase;
 using torch::profiler::impl::PyExtraFieldsBase;
 using torch::profiler::impl::Result;
 using torch::profiler::impl::shapesToStr;
 using torch::profiler::impl::stacksToStr;
+using torch::profiler::impl::TensorMetadata;
+
+auto shapesAndDtypes(const std::vector<op_input_t>& inputs) {
+  std::vector<std::vector<int64_t>> shapes;
+  std::vector<std::string> dtypes;
+  for (const auto& i : inputs) {
+    c10::visit(
+        c10::overloaded(
+            [&](const TensorMetadata& t) {
+              shapes.emplace_back(t.sizes_);
+              dtypes.emplace_back(scalarTypeToTypeMeta(t.dtype_).name());
+            },
+            [&](const std::vector<TensorMetadata>&) {
+              shapes.emplace_back();
+              dtypes.emplace_back("TensorList");
+            },
+            [&](const c10::IValue&) {
+              shapes.emplace_back();
+              dtypes.emplace_back("Scalar");
+            },
+            [&](const auto&) {
+              shapes.emplace_back();
+              dtypes.emplace_back();
+            }),
+        i);
+  }
+  return std::make_pair(shapes, dtypes);
+}
 
 struct MetadataBase {
   MetadataBase(const std::shared_ptr<Result>& result)
@@ -151,14 +180,13 @@ struct AddGenericMetadata : public MetadataBase {
   }
 
   void operator()(ExtraFields<EventType::TorchOp>& op_event) {
-    auto& shapes = op_event.inputs_.shapes_;
-    if (!shapes.empty()) {
-      addMetadata("Input Dims", shapesToStr(shapes));
+    const auto shapes_and_dtypes = shapesAndDtypes(op_event.inputs_);
+    if (!shapes_and_dtypes.first.empty()) {
+      addMetadata("Input Dims", shapesToStr(shapes_and_dtypes.first));
     }
 
-    auto& dtypes = op_event.inputs_.dtypes_;
-    if (!dtypes.empty()) {
-      addMetadata("Input type", dtypesToStr(dtypes));
+    if (!shapes_and_dtypes.second.empty()) {
+      addMetadata("Input type", dtypesToStr(shapes_and_dtypes.second));
     }
 
     if (config_ && !config_->experimental_config.performance_events.empty()) {
@@ -685,6 +713,10 @@ KinetoEvent::KinetoEvent(
       parent = parent->parent_.lock();
     }
   }
+
+  result->visit_if_base<ExtraFields<EventType::TorchOp>>([&](const auto& op) {
+    std::tie(shapes_, dtypes_) = shapesAndDtypes(op.inputs_);
+  });
 }
 
 bool KinetoEvent::isPythonFunction() const {
@@ -693,6 +725,22 @@ bool KinetoEvent::isPythonFunction() const {
   return out;
 }
 
+bool KinetoEvent::hasShapes() const {
+  return !shapes_.empty();
+}
+
+const c10::ArrayRef<std::vector<int64_t>> KinetoEvent::shapes() const {
+  return shapes_;
+}
+
+bool KinetoEvent::hasTypes() const {
+  return !dtypes_.empty();
+}
+
+const c10::ArrayRef<std::string> KinetoEvent::dtypes() const {
+  return dtypes_;
+}
+
 const c10::ArrayRef<std::string> KinetoEvent::stack() const {
   auto get = [&](const auto& i) -> auto& {
     return !i.jit_stack_.empty() ? i.jit_stack_ : python_stack_;
@@ -810,10 +858,6 @@ FORWARD_FROM_RESULT(deviceResourceId, kineto_info_.resource)
 
 TYPED_ATTR_WITH_DEFAULT(TorchOp, sequenceNr, e.sequence_number_, -1)
 TYPED_ATTR(TorchOp, fwdThreadId, e.sequence_number_ >= 0 ? e.forward_tid_ : 0)
-TYPED_ATTR(TorchOp, hasShapes, !e.inputs_.shapes_.empty())
-TYPED_ATTR(TorchOp, shapes, e.inputs_.shapes_)
-TYPED_ATTR(TorchOp, hasTypes, !e.inputs_.dtypes_.empty())
-TYPED_ATTR(TorchOp, dtypes, e.inputs_.dtypes_)
 TYPED_ATTR(TorchOp, scope, static_cast<uint8_t>(e.scope_))
 TYPED_ATTR(TorchOp, hasModuleHierarchy, !e.jit_modules_.empty())
 TYPED_ATTR(TorchOp, isAsync, e.is_async_)
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h
index 180657df63ee1..d85232f96cb58 100644
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -63,6 +63,10 @@ struct TORCH_API KinetoEvent {
 
   std::shared_ptr<const torch::profiler::impl::Result> result_;
   std::vector<std::string> python_stack_;
+
+  // Copy fields from result so we can return ArrayRefs.
+  std::vector<std::vector<int64_t>> shapes_;
+  std::vector<std::string> dtypes_;
 };
 
 // Consolidating events returned directly from Kineto
diff --git a/torch/csrc/autograd/profiler_python.cpp b/torch/csrc/autograd/profiler_python.cpp
index 43479c3f15668..5cf08afcbd1f0 100644
--- a/torch/csrc/autograd/profiler_python.cpp
+++ b/torch/csrc/autograd/profiler_python.cpp
@@ -340,7 +340,10 @@ TensorMetadata toTensorMetadata(PyObject* self) {
   TORCH_INTERNAL_ASSERT(THPVariable_CheckExact(self));
   const auto& t = THPVariable_Unpack(self);
   RawTensorMetadata m{t};
-  return TensorMetadata{m};
+  return TensorMetadata{
+      m,
+      t.sizes().vec(),
+      m.layout_ == at::kStrided ? t.strides().vec() : std::vector<int64_t>()};
 }
 
 c10::optional<TensorMetadata> ValueCache::recordIfTensor(py::handle p) {
diff --git a/torch/csrc/profiler/collection.cpp b/torch/csrc/profiler/collection.cpp
index 30a71e9437383..932f7ad81f6c3 100644
--- a/torch/csrc/profiler/collection.cpp
+++ b/torch/csrc/profiler/collection.cpp
@@ -33,8 +33,6 @@ using trace_ptr_t =
 
 RawTensorMetadataBase::RawTensorMetadataBase(const at::Tensor& t)
     : data_{t.has_storage() ? t.storage().data() : nullptr},
-      device_type_{t.device().type()},
-      device_index_{t.device().index()},
       dtype_{t.scalar_type()},
       layout_{t.layout()},
       dim_{static_cast<uint32_t>(t.sizes().size())} {
@@ -44,6 +42,24 @@ RawTensorMetadataBase::RawTensorMetadataBase(const at::Tensor& t)
       t.sizes().size());
 }
 
+RawTensorMetadata::RawTensorMetadata(const at::Tensor& t)
+    : RawTensorMetadataBase(t),
+      weak_self_{WeakTensor(t)},
+      device_type_{t.device().type()},
+      device_index_{t.device().index()} {}
+
+TensorMetadata::TensorMetadata(
+    const RawTensorMetadata& r,
+    const std::vector<int64_t>& sizes,
+    const std::vector<int64_t>& strides)
+    : RawTensorMetadataBase(r),
+      weak_self_{r.weak_self_.value_or(WeakTensor(at::Tensor()))},
+      device_{r.device_type_, r.device_index_},
+      sizes_{sizes},
+      strides_{strides} {
+  SOFT_ASSERT(r.weak_self_.has_value());
+}
+
 // ============================================================================
 // == PyTorch Ops =============================================================
 // ============================================================================
@@ -64,7 +80,9 @@ void InputOutputEncoder::push(c10::ArrayRef<const c10::IValue> values) {
       ivalues_.emplace_back(value);
     } else if (value.isTensorList()) {
       tags_.emplace_back(Tag::TensorListBegin);
-      // TODO: Skip TensorList for now.
+      for (const auto& t : value.toTensorList()) {
+        push(t);
+      }
       tags_.emplace_back(Tag::TERMINATOR);
     } else {
       tags_.emplace_back(Tag::Other);
@@ -94,54 +112,49 @@ auto InputOutputEncoder::getNextShapesAndDtypes() {
           tensor_metadata_it = tensor_metadata_.begin(),
           tensor_size_strides_it = tensor_sizes_strides_.begin(),
           ivals_it = ivalues_.begin()]() mutable {
-    struct Inputs out;
+    auto decode_tensor = [&]() -> TensorMetadata {
+      const auto& raw_metadata = *tensor_metadata_it++;
+      std::vector<int64_t> sizes;
+      std::vector<int64_t> strides;
+      for (C10_UNUSED const auto _ : c10::irange(raw_metadata.dim_)) {
+        sizes.push_back(*tensor_size_strides_it++);
+      }
+      if (raw_metadata.layout_ == at::kStrided) {
+        for (C10_UNUSED const auto _ : c10::irange(raw_metadata.dim_)) {
+          strides.push_back(*tensor_size_strides_it++);
+        }
+      }
+      return {raw_metadata, sizes, strides};
+    };
+
+    std::vector<op_input_t> out;
     bool terminate = false;
     while (!terminate && tag_it != tags_.end()) {
-      out.shapes_.emplace_back();
-      out.strides_.emplace_back();
       switch (*tag_it) {
-        case Tag::Tensor: {
-          const TensorMetadata md{*tensor_metadata_it++};
-          for (C10_UNUSED const auto _ : c10::irange(md.dim_)) {
-            out.shapes_.back().push_back(*tensor_size_strides_it++);
-          }
-          if (md.layout_ == at::kStrided) {
-            for (const auto _ : c10::irange(md.dim_)) {
-              (void)_; // Suppress unused variable warning
-              out.strides_.back().push_back(*tensor_size_strides_it++);
-            }
-          }
-          out.tensor_metadata_.emplace_back(TensorMetadata(md));
-          out.ivalues_.emplace_back();
-          out.dtypes_.emplace_back(scalarTypeToTypeMeta(md.dtype_).name());
-        } break;
+        case Tag::Tensor:
+          out.emplace_back(decode_tensor());
+          break;
 
-        case Tag::TensorListBegin:
+        case Tag::TensorListBegin: {
+          std::vector<TensorMetadata> arg;
           while (*(++tag_it) != Tag::TERMINATOR) {
-            // TODO: Skip TensorLists for now.
+            TORCH_INTERNAL_ASSERT(*tag_it == Tag::Tensor, (int)(*tag_it));
+            arg.emplace_back(decode_tensor());
           }
-          out.dtypes_.emplace_back("TensorList");
-          out.ivalues_.emplace_back();
-          out.tensor_metadata_.emplace_back();
-          break;
+          out.emplace_back(std::move(arg));
+        } break;
 
         case Tag::Scalar:
-          out.dtypes_.emplace_back("Scalar");
-          out.ivalues_.emplace_back(*ivals_it++);
-          out.tensor_metadata_.emplace_back();
+          out.emplace_back(*ivals_it++);
           break;
 
         case Tag::UndefinedTensor:
         case Tag::Other:
-          out.dtypes_.emplace_back();
-          out.ivalues_.emplace_back();
-          out.tensor_metadata_.emplace_back();
+          out.emplace_back(c10::nullopt);
           break;
 
         case Tag::TERMINATOR:
           // This marks the end of this op.
-          out.shapes_.pop_back();
-          out.strides_.pop_back();
           terminate = true;
           break;
 
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index f90e927bb9e3b..cef614bd98612 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -45,12 +45,6 @@ struct TORCH_API RawTensorMetadataBase {
   explicit RawTensorMetadataBase(const at::Tensor& t);
 
   StorageImplData data_;
-
-  // Device is separated into DeviceType and DeviceIndex as Device
-  // doesn't have a default initializer (which the std::array initializer needs)
-  c10::DeviceType device_type_;
-  c10::DeviceIndex device_index_;
-
   c10::ScalarType dtype_;
   c10::Layout layout_;
   uint32_t dim_;
@@ -60,41 +54,41 @@ struct TORCH_API RawTensorMetadataBase {
 struct TORCH_API RawTensorMetadata : RawTensorMetadataBase {
   RawTensorMetadata() = default;
   RawTensorMetadata(const RawTensorMetadata&) = default;
-  explicit RawTensorMetadata(const at::Tensor& t)
-      : RawTensorMetadataBase(t), weak_self_{WeakTensor(t)} {};
+  explicit RawTensorMetadata(const at::Tensor& t);
 
-  // Wrap in `c10::optional` to make `weak_self_` default constructable.
+  // Wrap `weak_self_` in `c10::optional` and split device into components to
+  // keep struct default constructable. (which the std::array initializer needs)
   c10::optional<WeakTensor> weak_self_;
+  c10::DeviceType device_type_;
+  c10::DeviceIndex device_index_;
 };
 
 // Used during post processing.
-struct TensorMetadata : public RawTensorMetadataBase {
-  explicit TensorMetadata(const RawTensorMetadata& r)
-      : RawTensorMetadataBase(r),
-        weak_self_{r.weak_self_.value_or(WeakTensor(at::Tensor()))} {
-    SOFT_ASSERT(r.weak_self_.has_value());
-  }
-
-  c10::Device device() const {
-    return {device_type_, device_index_};
-  }
+struct TORCH_API TensorMetadata : public RawTensorMetadataBase {
+  TensorMetadata(
+      const RawTensorMetadata& r,
+      const std::vector<int64_t>& sizes,
+      const std::vector<int64_t>& strides);
 
   TensorImplAddress impl() const {
     return weak_self_.get();
   }
 
   WeakTensor weak_self_;
+  c10::Device device_;
+  std::vector<int64_t> sizes_;
+  std::vector<int64_t> strides_;
+
+  // Set during `calculateUniqueTensorIDs`.
   c10::optional<TensorID> id_;
   c10::optional<AllocationID> allocation_id_;
 };
 
-struct Inputs {
-  std::vector<std::vector<int64_t>> shapes_;
-  std::vector<std::vector<int64_t>> strides_;
-  std::vector<c10::IValue> ivalues_;
-  std::vector<std::string> dtypes_;
-  std::vector<c10::optional<TensorMetadata>> tensor_metadata_;
-};
+using op_input_t = c10::variant<
+    TensorMetadata,
+    std::vector<TensorMetadata>,
+    c10::IValue,
+    c10::nullopt_t>;
 
 // ============================================================================
 // == ExtraFields =============================================================
@@ -131,7 +125,7 @@ struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
       TorchOpBasicFields&& f,
       uint64_t correlation_id,
       time_t end_time_ns,
-      Inputs&& inputs,
+      std::vector<op_input_t>&& inputs,
       jit_stack_t&& jit_stack,
       jit_modules_t&& jit_modules,
       extra_args_t&& extra_args,
@@ -150,7 +144,7 @@ struct ExtraFields<EventType::TorchOp> : TorchOpBasicFields {
         perf_event_counters_{std::move(perf_event_counters)} {}
   uint64_t correlation_id_;
   time_t end_time_ns_;
-  Inputs inputs_;
+  std::vector<op_input_t> inputs_;
   jit_stack_t jit_stack_;
   jit_modules_t jit_modules_;
   extra_args_t extra_args_;
diff --git a/torch/csrc/profiler/data_flow.cpp b/torch/csrc/profiler/data_flow.cpp
index e1970edbc9508..5432027867887 100644
--- a/torch/csrc/profiler/data_flow.cpp
+++ b/torch/csrc/profiler/data_flow.cpp
@@ -21,6 +21,41 @@ struct RawTensorInfo {
   std::reference_wrapper<c10::optional<AllocationID>> allocation_id_ref_;
   std::reference_wrapper<c10::optional<TensorID>> id_ref_;
 };
+
+struct RawTensors {
+  std::vector<RawTensorInfo>& get() {
+    return tensors_;
+  }
+
+  void operator()(TensorMetadata& t) {
+    tensors_.emplace_back(RawTensorInfo{
+        t.impl(), t.data_, t.device_, false, t.allocation_id_, t.id_});
+  }
+
+  void operator()(c10::optional<TensorMetadata>& t) {
+    if (t.has_value()) {
+      (*this)(*t);
+    }
+  }
+
+  void operator()(ExtraFields<EventType::Allocation>& a) {
+    const StorageImplData ptr{a.ptr_};
+    const auto is_free = a.alloc_size_ < 0;
+    tensors_.emplace_back(RawTensorInfo{
+        NoTensorImpl, ptr, a.device(), is_free, a.allocation_id_, a.id_});
+  }
+
+  void operator()(std::vector<TensorMetadata>& t) {
+    for (auto& ti : t) {
+      (*this)(ti);
+    }
+  }
+
+  template <typename T>
+  void operator()(T&) {}
+
+  std::vector<RawTensorInfo> tensors_;
+};
 } // namespace
 
 void calculateUniqueTensorIDs(
@@ -33,62 +68,37 @@ void calculateUniqueTensorIDs(
   // Flatten results to a uniform representation.
   // --------------------------------------------------------------------------
   {
-    auto insert_tensor = [&tensors](TensorMetadata& m) {
-      tensors.emplace_back(RawTensorInfo{
-          m.impl(),
-          m.data_,
-          m.device(),
-          /*is_free_=*/false,
-          m.allocation_id_,
-          m.id_});
-    };
-
+    RawTensors raw_tensors;
     for (auto& result : sorted_results) {
       result->visit(c10::overloaded(
           [&](ExtraFields<EventType::TorchOp>& torch_op) {
-            for (auto& m : torch_op.inputs_.tensor_metadata_) {
-              if (m.has_value()) {
-                insert_tensor(*m);
-              }
+            for (auto& i : torch_op.inputs_) {
+              c10::visit(raw_tensors, i);
             }
           },
-          [&](ExtraFields<EventType::Allocation>& alloc_op) {
-            // We won't know which allocations are for Tensor storage yet.
-            // We'll filter after we see all of the op inputs.
-            tensors.emplace_back(RawTensorInfo{
-                NoTensorImpl,
-                StorageImplData(alloc_op.ptr_),
-                alloc_op.device(),
-                /*is_free_=*/alloc_op.alloc_size_ < 0,
-                alloc_op.allocation_id_,
-                alloc_op.id_});
-          },
           [&](ExtraFields<EventType::PyCall>& py_call) {
             // torch.nn.Module
             if (py_call.module_.has_value()) {
               for (auto& p : py_call.module_->parameters_) {
-                insert_tensor(p.metadata_);
-                if (p.grad_metadata_.has_value()) {
-                  insert_tensor(*p.grad_metadata_);
-                }
+                raw_tensors(p.metadata_);
+                raw_tensors(p.grad_metadata_);
               }
             }
 
             // torch.optim.Optimizer
             if (py_call.optimizer_.has_value()) {
               for (auto& p : py_call.optimizer_->parameters_) {
-                insert_tensor(p.metadata_);
-                if (p.grad_metadata_.has_value()) {
-                  insert_tensor(*p.grad_metadata_);
-                }
+                raw_tensors(p.metadata_);
+                raw_tensors(p.grad_metadata_);
                 for (auto& state_i : p.state_) {
-                  insert_tensor(state_i.second);
+                  raw_tensors(state_i.second);
                 }
               }
             }
           },
-          [](const auto&) {}));
+          [&](auto& i) { raw_tensors(i); }));
     }
+    tensors = std::move(raw_tensors.tensors_);
   }
 
   // Assign IDs to solve ABA for Storage.
diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index 4a033b2da1f33..7084a1b598dad 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -1,6 +1,7 @@
 #include <torch/csrc/profiler/python/init.h>
 
 #include <ATen/record_function.h>
+#include <c10/util/overloaded.h>
 #include <torch/csrc/DynamicTypes.h>
 #include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
@@ -136,21 +137,6 @@ void initPythonBindings(PyObject* module) {
       .value("PyCCall", EventType::PyCCall)
       .value("Kineto", EventType::Kineto);
 
-  py::class_<Inputs>(m, "_Inputs")
-      .def_readonly("shapes", &Inputs::shapes_)
-      .def_readonly("dtypes", &Inputs::dtypes_)
-      .def_readonly("strides", &Inputs::strides_)
-      .def_property_readonly(
-          "ivalues",
-          [](const Inputs& inputs) {
-            py::list list;
-            for (auto& v : inputs.ivalues_) {
-              list.append(torch::jit::toPyObject(v));
-            }
-            return list;
-          })
-      .def_readonly("tensor_metadata", &Inputs::tensor_metadata_);
-
   py::class_<TensorMetadata>(m, "_TensorMetadata")
       .def_property_readonly("impl_ptr", &TensorMetadata::impl)
       .def_readonly("storage_data_ptr", &TensorMetadata::data_)
@@ -163,7 +149,7 @@ void initPythonBindings(PyObject* module) {
                 torch::autograd::utils::wrap(metadata.layout_);
             return py::reinterpret_borrow<py::object>(layout_obj);
           })
-      .def_property_readonly("device", &TensorMetadata::device)
+      .def_readonly("device", &TensorMetadata::device_)
       .def_property_readonly(
           "dtype",
           [](const TensorMetadata& metadata) {
@@ -171,11 +157,28 @@ void initPythonBindings(PyObject* module) {
                 torch::autograd::utils::wrap(
                     torch::getTHPDtype(metadata.dtype_)));
           })
-      .def_readonly("dim", &TensorMetadata::dim_);
+      .def_readonly("dim", &TensorMetadata::dim_)
+      .def_readonly("sizes", &TensorMetadata::sizes_)
+      .def_readonly("strides", &TensorMetadata::strides_);
 
   using torch_op_t = ExtraFields<EventType::TorchOp>;
   py::class_<torch_op_t>(m, "_ExtraFields_TorchOp")
-      .def_readonly("inputs", &torch_op_t::inputs_)
+      .def_property_readonly(
+          "inputs",
+          [](const torch_op_t& op) {
+            py::list out;
+            for (const auto& input : op.inputs_) {
+              c10::visit(
+                  c10::overloaded(
+                      [&](const c10::IValue& v) {
+                        out.append(torch::jit::toPyObject(v));
+                      },
+                      [&](const c10::nullopt_t&) { out.append(py::none()); },
+                      [&](const auto& v) { out.append(py::cast(v)); }),
+                  input);
+            }
+            return out;
+          })
       .def_readonly("scope", &torch_op_t::scope_)
       .def_readonly("sequence_number", &torch_op_t::sequence_number_)
       .def_readonly("allow_tf32_cublas", &torch_op_t::allow_tf32_cublas_);
diff --git a/torch/profiler/_pattern_matcher.py b/torch/profiler/_pattern_matcher.py
index fce4b0662ed34..3cec84df219ba 100644
--- a/torch/profiler/_pattern_matcher.py
+++ b/torch/profiler/_pattern_matcher.py
@@ -190,7 +190,7 @@ def match(self, event):
         dtypes = input_dtypes(event)
         if len(dtypes) < 2:
             return False
-        if dtypes[0] != dtypes[1]:
+        if dtypes[0] is None or dtypes[0] != dtypes[1]:
             return False
         event = to_event
         # Up one level
@@ -495,7 +495,7 @@ def skip(self):
     def match(self, event: _ProfilerEvent):
         if event.name != "aten::conv2d":
             return False
-        if len(input_dtypes(event)) < 3 or input_dtypes(event)[2] == "":
+        if len(input_dtypes(event)) < 3 or input_dtypes(event)[2] is None:
             return False
         # This means bias=True
         event = self.go_up_until(
@@ -531,10 +531,7 @@ def mutiple_of(shapes, multiple):
         if not input_dtypes(event):
             return False
         arg_dtype = input_dtypes(event)[0]
-        # TODO: Have a better way to check dtype_size
-        if (arg_dtype.endswith("c10::BFloat16")
-                or arg_dtype.endswith("c10::Half")) and not mutiple_of(
-                    input_shapes(event), 8):
+        if arg_dtype in (torch.bfloat16, torch.half) and not mutiple_of(input_shapes(event), 8):
             return True
         return False
 
@@ -586,12 +583,12 @@ def source_code_location(event: Optional[_ProfilerEvent]):
 
 def input_shapes(event: _ProfilerEvent):
     assert isinstance(event.extra_fields, _ExtraFields_TorchOp)
-    return tuple([tuple(shape) for shape in event.extra_fields.inputs.shapes])
+    return tuple(tuple(getattr(i, "sizes", ())) for i in event.extra_fields.inputs)
 
 
 def input_dtypes(event: _ProfilerEvent):
     assert isinstance(event.extra_fields, _ExtraFields_TorchOp)
-    return tuple(t for t in event.extra_fields.inputs.dtypes)
+    return tuple(getattr(i, "dtype", None) for i in event.extra_fields.inputs)
 
 
 def report_all_anti_patterns(prof,

From 7e2bb16446b784aa0424a8c7a5d67b28ec896acd Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki <mikaylagawarecki@gmail.com>
Date: Tue, 8 Nov 2022 19:14:48 +0000
Subject: [PATCH 0681/1922] Slight fix in error message for
 check_for_seq_len_1_nested_tensor (#88690)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88690
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/transformers/cuda/sdp_utils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 10b3d9d9cf487..564adb2d51ea8 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -75,7 +75,7 @@ inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
   for (const auto i : c10::irange(n_tensors)) {
     if (sizes_ptr[(i * size_tensor_stride) + 1] <= 1) {
       TORCH_CHECK(
-          !debug, "Flash Attention does not support sequence_length < 1");
+          !debug, "Flash Attention does not support sequence_length <= 1");
       return false;
     }
   }

From dca3c85fd1cf33895546c34895e33529d425f463 Mon Sep 17 00:00:00 2001
From: Michael Suo <suo@fb.com>
Date: Mon, 7 Nov 2022 22:23:01 -0800
Subject: [PATCH 0682/1922] [dynamo] only error out on nested fx trace if
 dynamo is optimizing (#88640)

I think this is the final resolution to issue caused by
https://github.com/pytorch/pytorch/pull/87797. The nvfuser issue that PR
tripped up was because, even though we're correctly disabling
torchdynamo via a `DisableContext`, the nested fx trace check was still
firing. This PR properly narrows it to only fire if we're not disabled.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88640
Approved by: https://github.com/yf225
---
 test/dynamo/test_misc.py    | 16 ++++++++++++++++
 torch/_dynamo/eval_frame.py | 14 +++++---------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 8f195f60d15f3..45433b6795cc1 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2799,6 +2799,22 @@ def f(x):
         with self.assertRaisesRegex(RuntimeError, "Detected that you are using FX"):
             gm = torch.fx.symbolic_trace(optimized)
 
+    @patch.object(torch._dynamo.config, "error_on_nested_fx_trace", False)
+    def test_no_error_on_nested_fx_trace(self):
+        input = torch.rand(2, 3)
+
+        def f(x):
+            x + x
+
+        real = f(input)
+
+        optimized = torch._dynamo.optimize("eager")(f)
+        self.assertTrue(same(optimized(input), real))
+
+        # should not error
+        gm = torch.fx.symbolic_trace(optimized)
+        self.assertTrue(same(gm(input), real))
+
     def test_inference_mode(self):
         @torch.inference_mode()
         def func(x, y):
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index cd6aedee6045c..09bfa572d77dd 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -2,7 +2,6 @@
 import copy
 import functools
 import inspect
-import itertools
 import logging
 import os
 import sys
@@ -150,20 +149,17 @@ def __call__(self, *args, **kwargs):
 
         @functools.wraps(fn)
         def _fn(*args, **kwargs):
-            any_arg_is_proxy = any(
-                map(
-                    lambda arg: isinstance(arg, torch.fx.Proxy),
-                    itertools.chain(args, kwargs.values()),
-                )
-            )
-            if any_arg_is_proxy:
+            if (
+                not isinstance(self, DisableContext)
+                and torch.fx._symbolic_trace.is_fx_tracing()
+            ):
                 if config.error_on_nested_fx_trace:
                     raise RuntimeError(
                         "Detected that you are using FX to symbolically trace "
                         "a dynamo-optimized function. This is not supported at the moment."
                     )
                 else:
-                    return fn
+                    return fn(*args, **kwargs)
 
             on_enter()
             prior = set_eval_frame(callback)

From 2a8a8444c7bd670800f6b618ac617284520f1178 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Mon, 7 Nov 2022 15:48:35 -0800
Subject: [PATCH 0683/1922] [Profiler] Memory profiler part 1: Gradient
 identification (#86802)

There are multiple ways to indentify that a Tensor is a gradient. (A subset of which also give additional context.) So to start off I've made a utility to handle that determination.

Differential Revision: [D39920730](https://our.internmc.facebook.com/intern/diff/D39920730/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86802
Approved by: https://github.com/chaekit
---
 mypy-strict.ini                       |   1 +
 test/profiler/test_memory_profiler.py | 224 ++++++++++++++++++++++++++
 torch/_C/_profiler.pyi                |  63 ++++++--
 torch/csrc/profiler/python/init.cpp   |   7 +
 torch/profiler/_memory_profiler.py    | 114 +++++++++++++
 5 files changed, 400 insertions(+), 9 deletions(-)
 create mode 100644 test/profiler/test_memory_profiler.py
 create mode 100644 torch/profiler/_memory_profiler.py

diff --git a/mypy-strict.ini b/mypy-strict.ini
index 460599699c46f..81c66d5239ebc 100644
--- a/mypy-strict.ini
+++ b/mypy-strict.ini
@@ -40,6 +40,7 @@ files =
     .github,
     benchmarks/instruction_counts,
     tools,
+    torch/profiler/_memory_profiler.py,
     torch/utils/_pytree.py,
     torch/utils/benchmark/utils/common.py,
     torch/utils/benchmark/utils/timer.py,
diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
new file mode 100644
index 0000000000000..c725f8bec51a4
--- /dev/null
+++ b/test/profiler/test_memory_profiler.py
@@ -0,0 +1,224 @@
+# Owner(s): ["oncall: profiler"]
+import functools
+from typing import Iterator, Optional
+
+import torch
+from torch._C._profiler import _EventType
+from torch.profiler import _memory_profiler, _utils
+from torch.testing._internal.common_utils import run_tests, skipIfTorchDynamo, TestCase
+
+
+profile = functools.partial(
+    torch.profiler.profile, record_shapes=True, profile_memory=True, with_stack=True
+)
+
+
+class ScaleLayer(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.scale = torch.nn.Parameter(torch.rand(()), requires_grad=True)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x * self.scale
+
+
+@skipIfTorchDynamo("TorchDynamo changes Python calls that memory profiling relies on.")
+class TestIdentifyGradients(TestCase):
+    def gradient_detected(
+        self,
+        prof: torch.profiler.profile,
+        ctx: _EventType,
+        grad_tensor: torch.Tensor,
+        parameter: Optional[torch.Tensor] = None,
+    ) -> None:
+
+        # This is not an exhaustive check, but for the purpose of unit testing
+        # it is sufficient.
+        def key_matches_tensor(key, tensor) -> bool:
+            # Vacuous case.
+            if tensor is None:
+                return True
+
+            if key is None:
+                return False
+
+            return tensor.storage().data_ptr() == key.storage.ptr
+
+        tree = prof.profiler.kineto_results.experimental_event_tree()
+        for node in _utils.traverse_dfs(tree):
+            for p_key, p_grad_key in _memory_profiler.extract_gradients(node):
+                if node.tag == ctx and key_matches_tensor(p_grad_key, grad_tensor):
+                    if parameter is None:
+                        return True  # Don't need to check parameter; we're done.
+
+                    elif p_key is not None:
+                        # For a complex workflow a gradient could correspond to
+                        # different parameters at different points in a trace.
+                        # However this will not happen in the relatively simple
+                        # cases tested here, so if `extract_gradients` identifies
+                        # the parameter corresponding to a particular gradient it
+                        # must be the one we expect.
+                        self.assertTrue(key_matches_tensor(p_key, parameter))
+                        return True
+
+        return False
+
+    def assertGradientDetected(self, name: str, *args, **kwargs) -> None:
+        self.assertTrue(
+            self.gradient_detected(*args, **kwargs),
+            f"Failed to identify gradient `{name}` from profile.",
+        )
+
+    def assertOnlyGradients(
+        self, prof: torch.profiler.profile, tensors: Iterator[torch.Tensor]
+    ) -> None:
+        allowed_set = {t.storage().data_ptr() for t in tensors}
+
+        tree = prof.profiler.kineto_results.experimental_event_tree()
+        for node in _utils.traverse_dfs(tree):
+            for _, p_grad_key in _memory_profiler.extract_gradients(node):
+                self.assertTrue(
+                    p_grad_key.storage.ptr in allowed_set,
+                    f"Tensor wrongly marked as gradient: {node.name}: {p_grad_key}",
+                )
+
+    def test_extract_gradients_low_level(self) -> None:
+        x = torch.ones((1,))
+        w0 = torch.ones((1,), requires_grad=True)
+        w1 = torch.ones((1,), requires_grad=True)
+
+        def check(cold_start: bool):
+            self.assertEqual(w0.grad is None, cold_start)
+            self.assertEqual(w1.grad is None, cold_start)
+            with profile() as prof:
+                z = x.expand(4) * w0
+                (z * w1).sum().backward()
+
+            # Gradient detection through op inspection does not provide a
+            # reference to the parameter corresponding to the gradient.
+            self.assertGradientDetected("w0", prof, _EventType.TorchOp, w0.grad)
+            self.assertGradientDetected("w1", prof, _EventType.TorchOp, w1.grad)
+            self.assertOnlyGradients(prof, (w0.grad, w1.grad))
+
+        check(cold_start=True)
+        check(cold_start=False)
+
+    def test_extract_gradients_from_module(self) -> None:
+        model = torch.nn.Sequential(torch.nn.Linear(2, 1), ScaleLayer())
+        named_parameters = {name: p for name, p in model.named_parameters()}
+        self.assertEqual(len(named_parameters), 3)
+
+        def assert_only_gradients(prof: torch.profiler.profile):
+            gradients = tuple(i.grad for i in named_parameters.values())
+            self.assertFalse(any(i is None for i in gradients))
+            self.assertOnlyGradients(prof, gradients)
+
+        def check(cold_start: bool):
+            x = torch.ones((2, 2))
+            with profile() as prof:
+                model(x).sum().backward()
+
+            for name, p in named_parameters.items():
+                # The first time we run a module none of the `.grad` fields
+                # have been initialized. This is fine; in that case we can
+                # detect everything we need in the profiled section.
+                self.assertNotEqual(
+                    self.gradient_detected(prof, _EventType.PyCall, p.grad, p),
+                    cold_start,
+                    name,
+                )
+
+                # Op based detection should still identify the gradients.
+                self.assertGradientDetected(name, prof, _EventType.TorchOp, p.grad)
+            assert_only_gradients(prof)
+
+            # We can detect gradients even when `.backward()` is not called.
+            with profile() as prof:
+                model(torch.ones((2, 2)))
+
+            for name, p in named_parameters.items():
+                self.assertGradientDetected(name, prof, _EventType.PyCall, p.grad, p)
+                self.assertFalse(
+                    self.gradient_detected(prof, _EventType.TorchOp, p.grad), name
+                )
+            assert_only_gradients(prof)
+
+        check(cold_start=True)
+        check(cold_start=False)
+
+    def _test_extract_gradients_from_optimizer(self, set_to_none: bool) -> None:
+
+        x = torch.ones((1,))
+        w0 = torch.ones((1,), requires_grad=True)
+        w1 = torch.ones((1,), requires_grad=True)
+        optimizer = torch.optim.SGD((w0, w1), lr=0.1, momentum=0.9)
+
+        def check(cold_start: bool):
+            self.assertEqual(w0.grad is None, cold_start)
+            self.assertEqual(w1.grad is None, cold_start)
+            with profile() as prof:
+                optimizer.zero_grad(set_to_none=set_to_none)
+                z = x.expand(4) * w0
+                (z * w1).sum().backward()
+                optimizer.step()
+
+            # Optimizer instrumentation runs late in the step, so we can detect
+            # gradients for both cold and warm start.
+            self.assertGradientDetected("w0", prof, _EventType.PyCall, w0.grad, w0)
+            self.assertGradientDetected("w1", prof, _EventType.PyCall, w1.grad, w1)
+
+            self.assertGradientDetected("w0", prof, _EventType.TorchOp, w0.grad)
+            self.assertGradientDetected("w1", prof, _EventType.TorchOp, w1.grad)
+            self.assertOnlyGradients(prof, (w0.grad, w1.grad))
+
+            with profile() as prof:
+                for _ in range(2):
+                    optimizer.zero_grad(set_to_none=set_to_none)
+                    z = x.expand(4) * w0
+                    (z * w1).sum().backward()
+                    optimizer.step()
+
+            # Inspected state is cached, so if we replace gradients (as is the
+            # case for `set_to_none=True`) our python instrumentation will not
+            # see them.
+            # TODO(robieta): Should `.step()` be excluded from caching?
+            self.assertNotEqual(
+                self.gradient_detected(prof, _EventType.PyCall, w0.grad, w0),
+                set_to_none,
+            )
+
+            self.assertNotEqual(
+                self.gradient_detected(prof, _EventType.PyCall, w1.grad, w1),
+                set_to_none,
+            )
+
+            if set_to_none:
+                with self.assertRaisesRegex(AssertionError, "Tensor wrongly marked"):
+                    self.assertOnlyGradients(prof, (w0.grad, w1.grad))
+
+        check(cold_start=True)
+        check(cold_start=False)
+
+    def test_extract_gradients_from_optimizer(self) -> None:
+        self._test_extract_gradients_from_optimizer(set_to_none=False)
+
+    def test_extract_gradients_from_optimizer_set_to_none(self) -> None:
+        self._test_extract_gradients_from_optimizer(set_to_none=True)
+
+    def test_extract_gradients_from_module_and_optimizer(self) -> None:
+        # Module and optimizer are thoroughly tested individually and should be
+        # additive. Thus we can manage with a lightweight check that they don't
+        # interact adversely.
+        model = torch.nn.Sequential(torch.nn.Linear(2, 1), ScaleLayer())
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
+        with profile() as prof:
+            model(torch.ones((2, 2))).sum().backward()
+            optimizer.step()
+
+        self.assertGradientDetected(
+            "weight", prof, _EventType.PyCall, model[0].weight.grad, model[0].weight
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/_C/_profiler.pyi b/torch/_C/_profiler.pyi
index 2843090d61f7b..da0f191e26b53 100644
--- a/torch/_C/_profiler.pyi
+++ b/torch/_C/_profiler.pyi
@@ -3,6 +3,8 @@ from typing import List, Optional, Tuple, Union
 
 from torch._C import device, dtype, layout
 
+from typing_extensions import Literal
+
 # defined in torch/csrc/profiler/python/init.cpp
 
 class RecordScope(Enum):
@@ -38,11 +40,12 @@ class ProfilerActivity(Enum):
     CUDA = ...
 
 class _EventType(Enum):
-    Allocation = ...
+    TorchOp = ...
     Backend = ...
+    Allocation = ...
+    OutOfMemory = ...
     PyCall = ...
     PyCCall = ...
-    TorchOp = ...
     Kineto = ...
 
 class _ExperimentalConfig:
@@ -71,6 +74,8 @@ class _ProfilerEvent:
     start_tid: int
     start_time_ns: int
     children: List[_ProfilerEvent]
+
+    # TODO(robieta): remove in favor of `self.typed`
     extra_fields: Union[
         _ExtraFields_TorchOp,
         _ExtraFields_Backend,
@@ -81,6 +86,18 @@ class _ProfilerEvent:
         _ExtraFields_Kineto,
     ]
 
+    @property
+    def typed(
+        self,
+    ) -> Union[
+        Tuple[Literal[_EventType.TorchOp], _ExtraFields_TorchOp],
+        Tuple[Literal[_EventType.Backend], _ExtraFields_Backend],
+        Tuple[Literal[_EventType.Allocation], _ExtraFields_Allocation],
+        Tuple[Literal[_EventType.OutOfMemory], _ExtraFields_OutOfMemory],
+        Tuple[Literal[_EventType.PyCall], _ExtraFields_PyCall],
+        Tuple[Literal[_EventType.PyCCall], _ExtraFields_PyCCall],
+        Tuple[Literal[_EventType.Kineto], _ExtraFields_Kineto],
+    ]: ...
     @property
     def name(self) -> str: ...
     @property
@@ -101,6 +118,8 @@ class _TensorMetadata:
     storage_data_ptr: Optional[int]
     id: Optional[int]
 
+    @property
+    def allocation_id(self) -> Optional[int]: ...
     @property
     def layout(self) -> layout: ...
     @property
@@ -129,11 +148,12 @@ class _ExtraFields_Backend: ...
 class _ExtraFields_Allocation:
     ptr: int
     id: Optional[int]
-    allocation_id: Optional[int]
     alloc_size: int
     total_allocated: int
     total_reserved: int
 
+    @property
+    def allocation_id(self) -> Optional[int]: ...
     @property
     def device(self) -> device: ...
 
@@ -147,22 +167,47 @@ class _PyFrameState:
     def file_name(self) -> str: ...
 
 class _NNModuleInfo:
-    @property
-    def params(self) -> List[Tuple[str, int]]: ...
     @property
     def self_ptr(self) -> int: ...
     @property
     def cls_ptr(self) -> int: ...
     @property
     def cls_name(self) -> str: ...
+    @property
+    def parameters(
+        self,
+    ) -> List[Tuple[str, _TensorMetadata, Optional[_TensorMetadata]]]: ...
+
+class _OptimizerInfo:
+    @property
+    def parameters(
+        self,
+    ) -> List[
+        Tuple[
+            # Parameter
+            _TensorMetadata,
+            #
+            # Gradient (if present during optimizer.step())
+            Optional[_TensorMetadata],
+            #
+            # Optimizer state for Parameter as (name, tensor) pairs
+            List[Tuple[str, _TensorMetadata]],
+        ]
+    ]: ...
 
 class _ExtraFields_PyCCall:
-    callsite: _PyFrameState
-    caller: _PyFrameState
-    module: Optional[_NNModuleInfo]
+    @property
+    def caller(self) -> _PyFrameState: ...
 
 class _ExtraFields_PyCall:
-    caller: _PyFrameState
+    @property
+    def callsite(self) -> _PyFrameState: ...
+    @property
+    def caller(self) -> _PyFrameState: ...
+    @property
+    def module(self) -> Optional[_NNModuleInfo]: ...
+    @property
+    def optimizer(self) -> Optional[_OptimizerInfo]: ...
 
 class _ExtraFields_Kineto: ...
 
diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index 7084a1b598dad..2a5839fc6a229 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -251,6 +251,13 @@ void initPythonBindings(PyObject* module) {
       .def_property_readonly("name", &Result::name)
       .def_property_readonly("tag", &Result::tag)
       .def_readonly("extra_fields", &Result::extra_fields_)
+      .def_property_readonly(
+          "typed",
+          [](const Result& r) {
+            return py::make_tuple(
+                r.tag(),
+                py::cast(r.extra_fields_, py::return_value_policy::reference));
+          })
       .def_property_readonly(
           "id",
           [](const Result& r) {
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
new file mode 100644
index 0000000000000..cab771931489c
--- /dev/null
+++ b/torch/profiler/_memory_profiler.py
@@ -0,0 +1,114 @@
+import dataclasses
+from typing import Any, Iterator, Optional, Tuple
+
+import torch
+from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata, RecordScope
+
+
+@dataclasses.dataclass
+class _Storage:
+    """Bundle storage pointer and id.
+
+    All profiling logic should use `allocation_id`, however it is useful to
+    print storage pointers for debugging and unit tests sometimes look up
+    values using the storage data pointer of a live Tensor."""
+
+    ptr: int
+    allocation_id: int
+
+    def __repr__(self) -> str:
+        return f"{hex(self.ptr):>18} ({self.allocation_id})"
+
+    def __eq__(self, other: Any) -> bool:
+        return isinstance(other, _Storage) and self.allocation_id == other.allocation_id
+
+    def __hash__(self) -> int:
+        return hash(self.allocation_id)
+
+
+@dataclasses.dataclass(eq=True, unsafe_hash=True, frozen=True)
+class TensorKey:
+    """Hashable identifier for a storage which has been asigned an ID.
+
+    A detailed description of Tensor IDs and why they are needed is given in
+    `torch/csrc/profiler/collection.h` when `TensorID` is declared. To
+    summarize, multiple Storage buffers can map to the same logical Tensor.
+    This dataclass is used to refer to a concrete in-memory StorageImpl of
+    a Tensor.
+    """
+
+    id: int
+    storage: _Storage
+    device: torch.device
+
+    def __repr__(self) -> str:
+        return f"id={self.id}: {repr(self.storage):<24} ({self.device})"
+
+    @staticmethod
+    def _make(
+        tensor_id: Optional[int],
+        storage_ptr: Optional[int],
+        allocation_id: Optional[int],
+        device: torch.device,
+    ) -> Optional["TensorKey"]:
+        if (
+            tensor_id is not None
+            and storage_ptr is not None
+            and allocation_id is not None
+        ):
+            return TensorKey(tensor_id, _Storage(storage_ptr, allocation_id), device)
+        return None
+
+    @classmethod
+    def from_tensor(cls, t: Optional[_TensorMetadata]) -> Optional["TensorKey"]:
+        if t is not None:
+            return cls._make(t.id, t.storage_data_ptr, t.allocation_id, t.device)
+        return None
+
+
+def extract_gradients(
+    node: _ProfilerEvent,
+) -> Iterator[Tuple[Optional[TensorKey], TensorKey]]:
+    children = node.children
+
+    # AccumulateGrad is used in the Autograd engine to handle gradient updates.
+    # There are two possible cases:
+    # 1) This is a newly created gradient Tensor. In that case there is nothing
+    #    to accumulate, so autograd simply detaches the Tensor.
+    #
+    # 2) There is a preexisting gradient Tensor and we need to add the newly
+    #    computed update. This is done with an in-place add (aten::add_) op.
+    #    (The underscore suffix denotes "in-place".)
+    if (
+        node.typed[0] == _EventType.TorchOp
+        and node.typed[1].scope == RecordScope.BACKWARD_FUNCTION
+        # TODO(robieta): Move away from load bearing names
+        and node.name == "torch::autograd::AccumulateGrad"
+        and children
+        and children[0].typed[0] == _EventType.TorchOp
+        and children[0].name in ("aten::detach", "aten::add_")
+        and children[0].typed[1].inputs
+        and isinstance(children[0].typed[1].inputs[0], _TensorMetadata)
+    ):
+        key = TensorKey.from_tensor(children[0].typed[1].inputs[0])
+        if key:
+            yield None, key
+
+    # We directly instrument `torch.nn.Module` and `torch.optim.Optimizer`
+    # NOTE: The values captured by the python tracer are cached; they can be
+    #       used to build up labels but do not imply that a Tensor was live at
+    #       a particular time.
+    elif node.typed[0] == _EventType.PyCall:
+        typed_fields = node.typed[1]
+        assert typed_fields.module is None or typed_fields.optimizer is None
+        if typed_fields.module is not None:
+            for _, p, p_grad in typed_fields.module.parameters:
+                p_grad_key = TensorKey.from_tensor(p_grad)
+                if p_grad_key is not None:
+                    yield TensorKey.from_tensor(p), p_grad_key
+
+        if typed_fields.optimizer is not None:
+            for p, p_grad, _ in typed_fields.optimizer.parameters:
+                p_grad_key = TensorKey.from_tensor(p_grad)
+                if p_grad_key is not None:
+                    yield TensorKey.from_tensor(p), p_grad_key

From db8b55ba02387a6f9ca6443dd268c35196a08184 Mon Sep 17 00:00:00 2001
From: Samantha Andow <samdow@meta.com>
Date: Wed, 9 Nov 2022 00:09:20 +0000
Subject: [PATCH 0684/1922] [nn] add remove_duplicate flag to named_parameters
 (#759) (#88090)

Summary:
X-link: https://github.com/pytorch/torchrec/pull/759

Since the remove_duplicate flag was added to named_buffers in D39493161 (https://github.com/pytorch/pytorch/commit/c12f829cce29eb6971094a9bbb0f8971aed86f5c), this adds the same flag to named_parameters

Test Plan:
python test/test_nn.py -k test_buffers_and_named_buffers

OSS Tests

Differential Revision: D40801899

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88090
Approved by: https://github.com/albanD
---
 test/test_nn.py                           | 28 +++++++++++++++++++++++
 torch/distributed/nn/api/remote_module.py |  5 +++-
 torch/nn/modules/module.py                | 11 +++++++--
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index f704c530f92d9..d2eac6a277d7e 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -766,6 +766,34 @@ def names(named_parameters):
             names(s.named_parameters()),
             ['0.dummy_param', '0.l1.layer_dummy_param'])
 
+    def test_named_parameters_remove_duplicate(self):
+        def names(named_parameters):
+            return [k for k, _ in named_parameters]
+
+        class M1(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.param1 = nn.Parameter(torch.empty(3, 3))
+                self.param2 = self.param1
+
+        m1 = M1()
+        self.assertEqual(names(m1.named_parameters()),
+                         ["param1"])
+        self.assertEqual(names(m1.named_parameters(remove_duplicate=False)),
+                         ["param1", "param2"])
+
+        class M2(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod1 = nn.Linear(3, 4, bias=False)
+                self.mod2 = self.mod1
+
+        m2 = M2()
+        self.assertEqual(names(m2.named_parameters()),
+                         ["mod1.weight"])
+        self.assertEqual(names(m2.named_parameters(remove_duplicate=False)),
+                         ["mod1.weight", "mod2.weight"])
+
     def test_buffers_and_named_buffers(self):
         def names(named_buffers):
             return [k for k, _ in named_buffers]
diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py
index 2b12959d331a1..3fd8539d529dc 100644
--- a/torch/distributed/nn/api/remote_module.py
+++ b/torch/distributed/nn/api/remote_module.py
@@ -391,7 +391,10 @@ def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
         )
 
     def named_parameters(  # type: ignore[return]
-        self, prefix: str = "", recurse: bool = True
+        self,
+        prefix: str = "",
+        recurse: bool = True,
+        remove_duplicate: bool = True
     ) -> Iterator[Tuple[str, Parameter]]:
         _raise_not_supported(self.named_parameters.__name__)
 
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index 0b47d61defde8..fea0ca7b8de81 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -1958,7 +1958,12 @@ def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
         for name, param in self.named_parameters(recurse=recurse):
             yield param
 
-    def named_parameters(self, prefix: str = '', recurse: bool = True) -> Iterator[Tuple[str, Parameter]]:
+    def named_parameters(
+            self,
+            prefix: str = '',
+            recurse: bool = True,
+            remove_duplicate: bool = True
+    ) -> Iterator[Tuple[str, Parameter]]:
         r"""Returns an iterator over module parameters, yielding both the
         name of the parameter as well as the parameter itself.
 
@@ -1967,6 +1972,8 @@ def named_parameters(self, prefix: str = '', recurse: bool = True) -> Iterator[T
             recurse (bool): if True, then yields parameters of this module
                 and all submodules. Otherwise, yields only parameters that
                 are direct members of this module.
+            remove_duplicate (bool, optional): whether to remove the duplicated
+                parameters in the result. Defaults to True.
 
         Yields:
             (str, Parameter): Tuple containing the name and parameter
@@ -1981,7 +1988,7 @@ def named_parameters(self, prefix: str = '', recurse: bool = True) -> Iterator[T
         """
         gen = self._named_members(
             lambda module: module._parameters.items(),
-            prefix=prefix, recurse=recurse)
+            prefix=prefix, recurse=recurse, remove_duplicate=remove_duplicate)
         for elem in gen:
             yield elem
 

From 0faf018781ef753c5aa3f76346c678a99ed08e59 Mon Sep 17 00:00:00 2001
From: Antoni Viros i Martin <aviros@meta.com>
Date: Wed, 9 Nov 2022 00:19:36 +0000
Subject: [PATCH 0685/1922] Add implementation for irregular dimension
 selection for nested tensors. (#88585)

Summary: This diff modifies the implementation of the select operator so slices of the irregular dimension can be selected (e.g. nt[:,0,:]).

Test Plan:
Added new unit tests to test that the new functions work as intended (see them in diff). To test,
`buck test mode/dev-nosan //caffe2/test:nested`

Differential Revision: D41083993

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88585
Approved by: https://github.com/cpuhrsch
---
 .../ATen/native/nested/NestedTensorMath.cpp   | 65 +++++++++++++++----
 test/test_nestedtensor.py                     |  4 ++
 2 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp
index 84efa837ceffe..9a47322644ca2 100644
--- a/aten/src/ATen/native/nested/NestedTensorMath.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp
@@ -494,23 +494,61 @@ Tensor NestedTensor_sum_dim_CPU(
 
 Tensor select_nested(const Tensor& self, int64_t dim, int64_t index) {
   auto self_ptr = get_nested_tensor_impl(self);
+  std::vector<IntArrayRef> sizes = NestedTensor_get_sizes(self_ptr),
+                           strides = NestedTensor_get_strides(self_ptr);
+  const std::vector<int64_t>& offsets = self_ptr->get_storage_offsets();
+  const at::Tensor& buffer = self_ptr->get_unsafe_storage_as_tensor();
   int64_t positive_dim = at::maybe_wrap_dim(dim, self_ptr->dim());
+  int64_t ntensors = self_ptr->size(0);
+  TORCH_CHECK_INDEX(ntensors > 0, "You can only select when the NT is not empty.");
+  int64_t ndims = static_cast<long>(sizes[0].size());
   TORCH_CHECK(
-    positive_dim == 0,
-    "NestedTensor can only be selected along dimension 0 ",
+    positive_dim == 0 || positive_dim == 1,
+    "NestedTensor can only be selected along dimension 0 or 1",
     "got dimension ", dim, " instead."
   );
-  int64_t ntensors = self_ptr->size(0);
-  TORCH_CHECK_INDEX(
-      index >= -ntensors && index < ntensors,
-      "index ", index,
-      " is out of bounds for dimension 0 with size ", ntensors);
-  int64_t positive_index = index < 0 ? index + ntensors : index;
-  const at::Tensor& buffer = self_ptr->get_unsafe_storage_as_tensor();
-  std::vector<IntArrayRef> sizes = NestedTensor_get_sizes(self_ptr),
-      strides = NestedTensor_get_strides(self_ptr);
-  const std::vector<int64_t>& offsets = self_ptr->get_storage_offsets();
-  return buffer.as_strided(sizes[positive_index], strides[positive_index], offsets[positive_index]);
+  if (positive_dim == 0) {
+    TORCH_CHECK_INDEX(
+        index >= -ntensors && index < ntensors,
+        "index ",
+        index,
+        " is out of bounds for dimension 0 with size ",
+        ntensors);
+    int64_t positive_index = index < 0 ? index + ntensors : index;
+    return buffer.as_strided(
+        sizes[positive_index],
+        strides[positive_index],
+        offsets[positive_index]);
+  } else {
+    auto new_sizes = at::empty({ntensors, ndims-1}, TensorOptions().dtype(kLong));
+    auto new_strides = at::empty({ntensors, ndims-1}, TensorOptions().dtype(kLong));
+    auto new_offsets = std::vector<int64_t>(offsets);
+    std::vector<Tensor> tensor_slices(ntensors);
+    for (int64_t i : c10::irange(ntensors)) {
+      int64_t *size_ptr = new_sizes[i].data_ptr<int64_t>();
+      int64_t *stride_ptr = new_strides[i].data_ptr<int64_t>();
+
+      int64_t dim_idx = 0;
+      for (int64_t j : c10::irange(ndims)) {
+        if (j != dim - 1) {
+          size_ptr[dim_idx] = sizes[i][j];
+          stride_ptr[dim_idx] = strides[i][j];
+          ++dim_idx;
+        }
+        else {
+          TORCH_CHECK_INDEX(
+              index >= 0 && index < sizes[i][j],
+              "index ",
+              index,
+              " is out of bounds for irregular dimension 1 with size ",
+              sizes[i][j]);
+          new_offsets[i] = offsets[i] + index * strides[i][j];
+        }
+      }
+    }
+    return create_nested_view_tensor(self, new_sizes, new_strides, std::move(new_offsets));
+  }
+
 }
 
 Tensor clone_nested(
@@ -704,7 +742,6 @@ Tensor unsqueeze_nested(const Tensor& self, int64_t dim) {
       self, sizemat_unsqueezed, stridemat_unsqueezed, std::vector<int64_t>(self_ptr->get_storage_offsets()));
 }
 
-
 // utilities supporting `view_nested` and `reshape_nested`
 namespace {
 // Args:
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index 590f517eaceef..f1f211cdafcac 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -800,6 +800,10 @@ def test_nested_tensor_indexing(self, device, dtype):
         self.assertEqual(nt[1, ...], x1)
         self.assertRaises(IndexError, lambda: nt[1, 4, 2])
         self.assertRaises(NotImplementedError, lambda: nt[:, 1, 1])
+        # test select on the irregular dimension only
+        self.assertEqual(nt.select(1, 0)[0], x0.select(0, 0))
+        self.assertEqual(nt.select(1, 0)[1], x1.select(0, 0))
+        self.assertRaises(IndexError, lambda: nt.select(1, 3))
         # make sure indexing returns a view
         nt[0].fill_(100.0)
         answer = torch.tensor(100.0, device=device, dtype=dtype).expand((2, 5))

From ecf71f778f441f6d1b8989b6d662eaa371b7bc34 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Tue, 8 Nov 2022 15:31:15 +0000
Subject: [PATCH 0686/1922] Reduce the number of shards inductor uses for model
 tests (#88610)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88610
Approved by: https://github.com/huydhn
---
 .github/scripts/filter_test_configs.py |  2 ++
 .github/workflows/inductor.yml         | 11 ++++-------
 .jenkins/pytorch/test.sh               | 16 ++++++++--------
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
index 89f80a00adaf5..06c8f90441eb9 100755
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@@ -23,6 +23,8 @@
     "force_on_cpu",
     "functorch",
     "inductor",
+    "inductor_distributed",
+    "inductor_timm",
     "jit_legacy",
     "multigpu",
     "nogpu_AVX512",
diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index 3a6c750bbf141..e8390681e4abd 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -23,13 +23,10 @@ jobs:
       cuda-arch-list: 8.6
       test-matrix: |
         { include: [
-          { config: "inductor", shard: 1, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 2, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 3, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 4, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 5, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 6, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 7, num_shards: 7, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
         ]}
 
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index ea219dd45f0b3..6bbda7f4d7071 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -282,7 +282,7 @@ test_inductor_timm_shard() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
   python benchmarks/dynamo/timm_models.py --ci --training --accuracy \
-    --device cuda --inductor --float32 --total-partitions 5 --partition-id "$1" \
+    --device cuda --inductor --float32 --total-partitions 2 --partition-id "$1" \
     --output "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv
   python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv
 }
@@ -741,6 +741,13 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHAR
   install_filelock
   install_triton
   test_dynamo_shard 2
+elif [[ "${TEST_CONFIG}" == *inductor_timm* && $SHARD_NUMBER -lt 3 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  install_torchvision
+  install_filelock
+  install_triton
+  install_timm
+  id=$((SHARD_NUMBER-1))
+  test_inductor_timm_shard $id
 elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
   install_filelock
@@ -753,13 +760,6 @@ elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SH
   install_triton
   install_huggingface
   test_inductor_huggingface_shard 0
-elif [[ "${TEST_CONFIG}" == *inductor* && $SHARD_NUMBER -lt 8 && $NUM_TEST_SHARDS -gt 1 ]]; then
-  install_torchvision
-  install_filelock
-  install_triton
-  install_timm
-  id=$((SHARD_NUMBER-3))
-  test_inductor_timm_shard $id
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_without_numpy
   install_torchvision

From 1ac7a424e04588791be9a76cf140c3d441325185 Mon Sep 17 00:00:00 2001
From: Zhengxu Chen <zhxchen17@meta.com>
Date: Wed, 9 Nov 2022 01:02:07 +0000
Subject: [PATCH 0687/1922] [export] Preserve meta["val"] on placeholders in
 dynamo.export(). (#88651)

Summary:
Today when we transform the captured graph in the last step in export(aten_graph=True), we construct a new graph which doesn't have the all the metadata to be preserved, for example, node.meta["val"].
meta["val"] is important for writing passes and analysis on the graph later in the pipeline, we may want to preserve that on placeholder nodes.

Test Plan: test_export.py:test_export_meta_val

Differential Revision: D41110864

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88651
Approved by: https://github.com/tugsbayasgalan, https://github.com/jansel
---
 test/dynamo/test_export.py  | 16 ++++++++++++++++
 torch/_dynamo/eval_frame.py |  9 ++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index 1afd7c8c0c2de..a157926422c8b 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -1445,6 +1445,22 @@ def forward(self, pred, x):
         dynamo_result_2 = out_graph(pred, x)
         self.assertTrue(torch._dynamo.utils.same(real_result_2, dynamo_result_2))
 
+    def test_export_meta_val(self):
+        def f(x, y, z):
+            return x * y + z
+
+        gm, _ = torch._dynamo.export(
+            f,
+            torch.ones(3, 2),
+            torch.zeros(3, 2),
+            torch.ones(3, 2),
+            aten_graph=True,
+            tracing_mode="symbolic",
+        )
+        for node in gm.graph.nodes:
+            if node.op == "placeholder":
+                self.assertIn("val", node.meta)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 09bfa572d77dd..8d9e3b7b6aa14 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -554,7 +554,10 @@ def __init__(
             )
 
         def placeholder(self, target, args, kwargs):
-            return next(self.old_args_gen)
+            arg = next(self.old_args_gen)
+            if "val" in self.current_node.meta:
+                arg.node.meta["val"] = self.current_node.meta["val"]
+            return arg
 
         def output(self, target, args, kwargs):
             dynamo_result_flat = args[0]
@@ -564,6 +567,10 @@ def output(self, target, args, kwargs):
 
             return super().output(target, (new_result,), {})
 
+        def run_node(self, n):
+            self.current_node = n
+            return super().run_node(n)
+
     if aten_graph:
         # Running graph with interpreter is needed for propagating the stack_trace
         def graph_with_interpreter(*args):

From 1034a115f2f284797b62487ebe49395093eee725 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 9 Nov 2022 01:04:29 +0000
Subject: [PATCH 0688/1922] Update gloo submodule (#88530)

Also, add an explicit cudart dependency to `torch_cuda` if Kineto is used with GPU support (it used to be somehow inherited from a wrong `gloo` setup)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88530
Approved by: https://github.com/osalpekar
---
 caffe2/CMakeLists.txt | 6 ++++++
 third_party/gloo      | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index aa6dfd2841bac..4182797fc78e5 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1521,6 +1521,12 @@ endif()
 
 # ---[ CUDA library.
 if(USE_CUDA)
+  # FIXME: If kineto is linked with CUPTI it pollutes torch_cpu with CUDA dependencies
+  # Even worse, it never declares that it depends on cudart, but calls the API, see
+  # https://github.com/pytorch/kineto/blob/aef2f5c0f15e3be52406ac0b885e8689de6bc9f6/libkineto/src/CudaDeviceProperties.cpp#L24
+  if(USE_KINETO AND NOT MSVC)
+    target_link_libraries(torch_cpu PRIVATE torch::cudart)
+  endif()
   target_link_libraries(torch_cuda INTERFACE torch::cudart)
   target_link_libraries(torch_cuda PUBLIC c10_cuda torch::nvtoolsext)
 
diff --git a/third_party/gloo b/third_party/gloo
index 5b14351326313..4a5e339b76426 160000
--- a/third_party/gloo
+++ b/third_party/gloo
@@ -1 +1 @@
-Subproject commit 5b143513263133af2b95547e97c07cebeb72bf72
+Subproject commit 4a5e339b764261d20fc409071dc7a8b8989aa195

From a2e6d727a2b87d8d6389be77ab0f918113da35ce Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 8 Nov 2022 13:47:26 -0500
Subject: [PATCH 0689/1922] Meta implementation for bernoulli (#88676)

For some reason bernoulli uses legacy memory format, see linked issue.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88676
Approved by: https://github.com/SherlockNoMad
---
 test/functorch/test_aotdispatch.py |  1 -
 test/test_proxy_tensor.py          |  1 -
 torch/_meta_registrations.py       | 19 +++++++++++++------
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 82ababe8c947c..68429764df3ad 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -996,7 +996,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('amin', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('as_strided', ''),  # Tensor-likes are not close!
     xfail('baddbmm', ''),  # aten.baddbmm.default - couldn't find symbolic meta function/decomposition
-    xfail('bernoulli', ''),  # aten.bernoulli.default - couldn't find symbolic meta function/decomposition
     xfail('block_diag', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('cartesian_prod', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('cdouble'),  # RuntimeError: aten.view_as_real.default - couldn't find symbolic meta function/decomposition
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index f45063adac70e..a3b981074d74e 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1140,7 +1140,6 @@ def f(a, b, c, d, e):
     xfail('argsort', ''),  # aten.sort.default - couldn't find symbolic meta function/decomposition
     xfail('argwhere', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
     xfail('baddbmm', ''),  # aten.baddbmm.default - couldn't find symbolic meta function/decomposition
-    xfail('bernoulli', ''),  # aten.bernoulli.default - couldn't find symbolic meta function/decomposition
     xfail('bucketize', ''),  # aten.bucketize.Tensor - couldn't find symbolic meta function/decomposition
     xfail('cartesian_prod', ''),  # Tensors of type TensorImpl do not have numel
     xfail('cdist', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 89395729ad446..20e1651aafb46 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -252,11 +252,24 @@ def meta_pad2d(self, padding):
         return self.new_empty((nbatch, nplane, output_h, output_w))
 
 
+@register_meta([aten.bernoulli.default, aten.bernoulli.out])
+@out_wrapper()
+def meta_bernoulli(self, *, generator=None):
+    # https://github.com/pytorch/pytorch/issues/88612
+    return torch.empty_like(self).contiguous()
+
+
 @register_meta(aten.bernoulli_.float)
 def meta_bernoulli_(self, p=0.5, generator=None):
     return self
 
 
+@register_meta(aten.bernoulli.p)
+def meta_bernoulli_p(self, p=0.5, generator=None):
+    # https://github.com/pytorch/pytorch/issues/88612
+    return torch.empty_like(self).contiguous()
+
+
 @register_meta(aten._fused_moving_avg_obs_fq_helper.default)
 def meta__fused_moving_avg_obs_fq_helper(
     self,
@@ -311,12 +324,6 @@ def _compute_reduction_shape(self, dims, keepdim):
     return utils.compute_reduction_output_shape(self.shape, dims)
 
 
-@register_meta(aten.bernoulli.out)
-def meta_bernoulli(self, *, generator=None, out):
-    torch._resize_output_(out, self.size(), self.device)
-    return out
-
-
 # FakeTensors (meta tensors with a device) will report device as meta
 # when running meta kernels. Here, access the "fake device" of FakeTensor if it
 # exists so meta kernels which have diverge per device will be more

From 6f7715a093b6d1d2cc86ec411104ff97dc82a221 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 8 Nov 2022 13:47:26 -0500
Subject: [PATCH 0690/1922] Meta implementation for unsqueeze_ (#88675)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88675
Approved by: https://github.com/SherlockNoMad
---
 test/test_proxy_tensor.py    |  2 --
 torch/_meta_registrations.py | 17 +++++++++++++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index a3b981074d74e..f724ea6c9f0ed 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -719,7 +719,6 @@ def deco(cls):
 
 @skipIfNoSympy
 @xfail_inherited_tests([
-    "test_inplace_metadata",
     "test_mode_tracing_factory_function",
     "test_make_fx_overloads",
     "test_resnet18_backward_trace",
@@ -1463,7 +1462,6 @@ def f(a, b, c, d, e):
     xfail('trunc', ''),  # aten.trunc_.default - couldn't find symbolic meta function/decomposition
     xfail('uniform', ''),  # aten.uniform_.default - couldn't find symbolic meta function/decomposition
     xfail('unique', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
-    xfail('unsqueeze', ''),  # aten.unsqueeze_.default - couldn't find symbolic meta function/decomposition
     xfail('xlogy', ''),  # aten.xlogy_.Tensor - couldn't find symbolic meta function/decomposition
 }
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 20e1651aafb46..48b523fc000c7 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -107,6 +107,23 @@ def meta_copy_(self, src, non_blocking=False):
     return self
 
 
+def inferUnsqueezeGeometry(tensor, dim):
+    result_sizes = list(tensor.size())
+    result_strides = list(tensor.stride())
+    new_stride = 1 if dim >= tensor.dim() else result_sizes[dim] * result_strides[dim]
+    result_sizes.insert(dim, 1)
+    result_strides.insert(dim, new_stride)
+    return result_sizes, result_strides
+
+
+@register_meta(aten.unsqueeze_.default)
+def meta_unsqueeze_(self, dim):
+    dim = maybe_wrap_dim(dim, self.dim() + 1)
+    g_sizes, g_strides = inferUnsqueezeGeometry(self, dim)
+    self.as_strided_(g_sizes, g_strides)
+    return self
+
+
 # Implementations below are taken from https://github.com/albanD/subclass_zoo/blob/main/python_meta_tensor.py
 @register_meta(aten.index_select.default)
 def meta_index_select(self, dim, index):

From 39a30839d2cc4f6c3d2e2352008b770bd7904104 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 8 Nov 2022 13:47:27 -0500
Subject: [PATCH 0691/1922] Meta registrations for inplace operators (#88678)

Also, handle non-default alpha correctly.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88678
Approved by: https://github.com/SherlockNoMad, https://github.com/albanD
---
 test/functorch/test_aotdispatch.py |  7 -------
 test/test_proxy_tensor.py          |  9 +--------
 torch/_meta_registrations.py       | 27 +++++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 68429764df3ad..ebd4a25602839 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -989,7 +989,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
 
 symbolic_aot_autograd_failures = {
     xfail('__rmatmul__', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('addcdiv', ''),  # aten.fill_.Scalar - couldn't find symbolic meta function/decomposition
     xfail('addmv', ''),  # aten.addmv.default - couldn't find symbolic meta function/decomposition
     xfail('addr', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('amax', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1038,8 +1037,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('fft.rfft2', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('fft.rfft', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('fft.rfftn', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('fmax', ''),  # aten.logical_or_.default - couldn't find symbolic meta function/decomposition
-    xfail('fmin', ''),  # aten.logical_or_.default - couldn't find symbolic meta function/decomposition
     xfail('frexp', ''),  # aten.frexp.Tensor - couldn't find symbolic meta function/decomposition
     xfail('gradient', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('hsplit', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1126,7 +1123,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nanmedian', ''),  # aten.logical_or_.default - couldn't find symbolic meta function/decomposition
 
     # Deleting this in a followup
-    xfail('nn.functional.feature_alpha_dropout', 'with_train'),
     xfail('nn.functional.poisson_nll_loss', ''),
 
     xfail('nn.functional._scaled_dot_product_attention', ''),  # Cannot call sizes() on tensor with symbolic ...
@@ -1142,9 +1138,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.cosine_similarity', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.cross_entropy', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.ctc_loss', ''),  # aten._ctc_loss.Tensor - couldn't find symbolic meta function/deco...
-    xfail('nn.functional.dropout2d', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('nn.functional.dropout3d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.dropout', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('nn.functional.embedding_bag', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.fractional_max_pool2d', ''),  # rand() received an invalid combination of arguments - g...
     xfail('nn.functional.fractional_max_pool3d', ''),  # rand() received an invalid combination of arguments - g...
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index f724ea6c9f0ed..acb0856f860b5 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1259,9 +1259,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.cosine_similarity', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.cross_entropy', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.ctc_loss'),  # aten._ctc_loss.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.dropout2d', ''),  # Tensors of type TensorImpl do not have numel
-    xfail('nn.functional.dropout3d', ''),  # Tensors of type TensorImpl do not have numel
-    xfail('nn.functional.dropout', ''),  # Tensors of type TensorImpl do not have numel
     xfail('nn.functional.embedding_bag', ''),  # aten._embedding_bag_forward_only.default - couldn't find symbolic meta fun...
     xfail('nn.functional.embedding', ''),  # argument 'size' must be tuple of ints, but found element of type tor...
     xfail('nn.functional.fractional_max_pool2d', ''),  # argument 'size' must be tuple of ints, but found element of t...
@@ -1393,7 +1390,6 @@ def f(a, b, c, d, e):
     xfail('cumsum', ''),  # aten.cumsum_.default - couldn't find symbolic meta function/decomposition
     xfail('digamma', ''),  # aten.digamma_.default - couldn't find symbolic meta function/decomposition
     xfail('div', 'floor_rounding'),  # aten.div_.Tensor_mode - couldn't find symbolic meta function/decomposition
-    xfail('div', 'no_rounding_mode'),  # aten.div_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('div', 'trunc_rounding'),  # aten.div_.Tensor_mode - couldn't find symbolic meta function/decomposition
     xfail('eq', ''),  # aten.eq_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('erf', ''),  # aten.erf_.default - couldn't find symbolic meta function/decomposition
@@ -1420,9 +1416,6 @@ def f(a, b, c, d, e):
     xfail('log1p', ''),  # aten.log1p_.default - couldn't find symbolic meta function/decomposition
     xfail('log2', ''),  # aten.log2_.default - couldn't find symbolic meta function/decomposition
     xfail('log', ''),  # aten.log_.default - couldn't find symbolic meta function/decomposition
-    xfail('logical_and', ''),  # aten.logical_and_.default - couldn't find symbolic meta function/decomposition
-    xfail('logical_or', ''),  # aten.logical_or_.default - couldn't find symbolic meta function/decomposition
-    xfail('logical_xor', ''),  # aten.logical_xor_.default - couldn't find symbolic meta function/decomposition
     xfail('logit', ''),  # aten.logit_.default - couldn't find symbolic meta function/decomposition
     xfail('lt', ''),  # aten.lt_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('mvlgamma', 'mvlgamma_p_1'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
@@ -1433,6 +1426,7 @@ def f(a, b, c, d, e):
     xfail('neg', ''),  # aten.neg_.default - couldn't find symbolic meta function/decomposition
     xfail('nextafter', ''),  # aten.nextafter_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.celu', ''),  # aten.celu_.default - couldn't find symbolic meta function/decomposition
+    xfail('nn.functional.dropout3d', ''),  # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.elu', ''),  # aten.elu_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.hardsigmoid', ''),  # aten.hardsigmoid_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.mish', ''),  # aten.mish_.default - couldn't find symbolic meta function/decomposition
@@ -1458,7 +1452,6 @@ def f(a, b, c, d, e):
     xfail('transpose', ''),  # aten.transpose_.default - couldn't find symbolic meta function/decomposition
     xfail('tril', ''),  # aten.tril_.default - couldn't find symbolic meta function/decomposition
     xfail('triu', ''),  # aten.triu_.default - couldn't find symbolic meta function/decomposition
-    xfail('true_divide', ''),  # aten.div_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('trunc', ''),  # aten.trunc_.default - couldn't find symbolic meta function/decomposition
     xfail('uniform', ''),  # aten.uniform_.default - couldn't find symbolic meta function/decomposition
     xfail('unique', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 48b523fc000c7..da18ba45a1ebb 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1139,6 +1139,33 @@ def meta_zero_(self):
     return self
 
 
+@register_meta(
+    [
+        aten.mul_.Scalar,
+        aten.div_.Scalar,
+        aten.mul_.Tensor,
+        aten.div_.Tensor,
+        aten.logical_and_.default,
+        aten.logical_or_.default,
+        aten.logical_xor_.default,
+    ],
+)
+def meta_binop_inplace(self, other):
+    return self
+
+
+@register_meta(
+    [
+        aten.add_.Scalar,
+        aten.sub_.Scalar,
+        aten.add_.Tensor,
+        aten.sub_.Tensor,
+    ],
+)
+def meta_binop_inplace_alpha(self, other, alpha=1):
+    return self
+
+
 @register_meta(aten.zero.default)
 def meta_zero(self):
     return self.new_empty(self.shape)

From 20b07e50bc39bc9f4982b17a83230fa1feb521ca Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin <wschin@outlook.com>
Date: Wed, 9 Nov 2022 01:31:42 +0000
Subject: [PATCH 0692/1922] Fix Python-bound function signature
 (torch._C.Graph.addInput) (#88528)

In pytorch/torch/_C/__init__.pyi, Graph.addInput has signature
```python
  def addInput(self, name: str) -> Value: ...
```
which doesn't match the corresponding function
```cpp
  Value* addInput(const std::string& name = "") {
    return block_->addInput(name);
  }

```

in python_ir.cpp. This PR aligns the bound function on both C++ and Python sides. Without this PR, mypy will compain whenever a change contains some calls to `addInput`; for example,
![image](https://user-images.githubusercontent.com/3524474/200092086-429b8d63-9321-4d03-b0d6-f4c9bd361756.png)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88528
Approved by: https://github.com/davidberard98
---
 test/jit/test_python_bindings.py    | 5 +++++
 torch/csrc/jit/python/python_ir.cpp | 6 +++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/test/jit/test_python_bindings.py b/test/jit/test_python_bindings.py
index 37c2ef7f85af7..51c5e0383b2ca 100644
--- a/test/jit/test_python_bindings.py
+++ b/test/jit/test_python_bindings.py
@@ -84,6 +84,11 @@ def test_graph_create(self):
         with self.assertRaises(ValueError):
             gr.create("prim::Constant", [None])
 
+    def test_add_input(self):
+        gr = torch._C.Graph()
+        foo_value = gr.addInput("foo")
+        assert foo_value in gr.inputs()
+
     def test_canonicalize(self):
         ir = """
 graph(%p207 : Tensor,
diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp
index a19a8cd011db3..c1cae6eb300c6 100644
--- a/torch/csrc/jit/python/python_ir.cpp
+++ b/torch/csrc/jit/python/python_ir.cpp
@@ -382,7 +382,11 @@ void initPythonIRBindings(PyObject* module_) {
           "Find all nodes",
           py::arg("kind"),
           py::arg("recurse") = true)
-      .def("addInput", [](Graph& g) { return g.addInput(); })
+      .def(
+          "addInput",
+          [](Graph& g, const std::string& name) { return g.addInput(name); },
+          "Add input to graph with optional name seed",
+          py::arg("name") = "")
       .def("copy", [](Graph& g) { return g.copy(); })
       .GS(eraseInput)
       .GS(eraseOutput)

From 2ec59a69338139021a1cd34dc573b9a262f4afcf Mon Sep 17 00:00:00 2001
From: Eddie Yan <eddiey@nvidia.com>
Date: Wed, 9 Nov 2022 01:49:50 +0000
Subject: [PATCH 0693/1922] Hopper (`sm90`) support (#87736)

Essentially a followup of #87436

CC @xwang233 @ptrblck
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87736
Approved by: https://github.com/xwang233, https://github.com/malfet
---
 .../upstream/FindCUDA/select_compute_arch.cmake     | 13 ++++++++++++-
 torch/utils/cpp_extension.py                        |  3 ++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
index 822c041ee5268..65e7a6ac8993c 100644
--- a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
+++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
@@ -94,23 +94,28 @@ if(CUDA_VERSION VERSION_GREATER "10.5")
 endif()
 
 if(NOT CUDA_VERSION VERSION_LESS "11.1")
-  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6" "8.6+PTX")
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6")
   list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.6")
   set(CUDA_LIMIT_GPU_ARCHITECUTRE "8.6")
 
   if(CUDA_VERSION VERSION_LESS "11.8")
     set(CUDA_LIMIT_GPU_ARCHITECTURE "8.9")
+    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6+PTX")
   endif()
 endif()
 
 if(NOT CUDA_VERSION VERSION_LESS "11.8")
   list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ada")
+  list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Hopper")
   list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9")
+  list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0")
   list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.9")
+  list(APPEND CUDA_ALL_GPU_ARCHITECTURES "9.0")
 
   if(CUDA_VERSION VERSION_LESS "12.0")
     set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0")
     list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.9+PTX")
+    list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "9.0+PTX")
   endif()
 endif()
 
@@ -248,6 +253,12 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
       elseif(${arch_name} STREQUAL "Ampere")
         set(arch_bin 8.0)
         set(arch_ptx 8.0)
+      elseif(${arch_name} STREQUAL "Ada")
+        set(arch_bin 8.9)
+        set(arch_ptx 8.9)
+      elseif(${arch_name} STREQUAL "Hopper")
+        set(arch_bin 9.0)
+        set(arch_ptx 9.0)
       else()
         message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
       endif()
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 612ae9fdf0785..aa03da23b38da 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -1730,10 +1730,11 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
         ('Turing', '7.5+PTX'),
         ('Ampere', '8.0;8.6+PTX'),
         ('Ada', '8.9+PTX'),
+        ('Hopper', '9.0+PTX'),
     ])
 
     supported_arches = ['3.5', '3.7', '5.0', '5.2', '5.3', '6.0', '6.1', '6.2',
-                        '7.0', '7.2', '7.5', '8.0', '8.6', '8.9']
+                        '7.0', '7.2', '7.5', '8.0', '8.6', '8.9', '9.0']
     valid_arch_strings = supported_arches + [s + "+PTX" for s in supported_arches]
 
     # The default is sm_30 for CUDA 9.x and 10.x

From fd6472a1ae414a145af3c12628ef671501e9a828 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 9 Nov 2022 06:55:22 +0000
Subject: [PATCH 0694/1922] [BE] Move `setup-ssh` step ahead of clone PyTorch
 (#88715)

It allows one to SSH faster rather than having to wait for repo clone to
finish.

I.e. right now one usually have to wait for a few minutes fore PyTorch clone is finished, but with this change you can SSH ahead of time (thanks to `setup-ssh` being a composite action

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88715
Approved by: https://github.com/clee2000, https://github.com/izaitsevfb
---
 .github/workflows/_android-build-test.yml      | 10 +++++-----
 .github/workflows/_android-full-build-test.yml | 10 +++++-----
 .github/workflows/_bazel-build-test.yml        | 10 +++++-----
 .github/workflows/_binary-build-linux.yml      |  8 ++++----
 .github/workflows/_binary-test-linux.yml       |  8 ++++----
 .github/workflows/_docs.yml                    | 10 +++++-----
 .github/workflows/_linux-build.yml             | 10 +++++-----
 .github/workflows/_linux-test.yml              | 10 +++++-----
 .github/workflows/_win-build.yml               | 10 +++++-----
 .github/workflows/build-triton-wheel.yml       | 10 +++++-----
 .github/workflows/docker-release.yml           |  8 ++++----
 11 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/.github/workflows/_android-build-test.yml b/.github/workflows/_android-build-test.yml
index 5538bc58cf425..dfa48daa84acd 100644
--- a/.github/workflows/_android-build-test.yml
+++ b/.github/workflows/_android-build-test.yml
@@ -28,6 +28,11 @@ jobs:
     if: github.repository_owner == 'pytorch'
     runs-on: [self-hosted, linux.2xlarge]
     steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -35,11 +40,6 @@ jobs:
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
 
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Calculate docker image
         id: calculate-docker-image
         uses: ./.github/actions/calculate-docker-image
diff --git a/.github/workflows/_android-full-build-test.yml b/.github/workflows/_android-full-build-test.yml
index 9f110db307aea..ea07fda814b1d 100644
--- a/.github/workflows/_android-full-build-test.yml
+++ b/.github/workflows/_android-full-build-test.yml
@@ -28,6 +28,11 @@ jobs:
     if: github.repository_owner == 'pytorch'
     runs-on: [self-hosted, linux.2xlarge]
     steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -35,11 +40,6 @@ jobs:
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
 
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Calculate docker image
         id: calculate-docker-image
         uses: ./.github/actions/calculate-docker-image
diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml
index a64758c2b1182..79445e1dad6c1 100644
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@@ -28,6 +28,11 @@ jobs:
     if: github.repository_owner == 'pytorch'
     runs-on: [self-hosted, linux.2xlarge]
     steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -35,11 +40,6 @@ jobs:
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
 
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Calculate docker image
         id: calculate-docker-image
         uses: ./.github/actions/calculate-docker-image
diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml
index b023ad6701c61..192ca251b79ff 100644
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@@ -126,16 +126,16 @@ jobs:
       - name: List the env
         shell: bash
         run: env
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.github-token }}
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
       - name: Chown workspace
         uses: ./.github/actions/chown-workspace
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.github-token }}
       - name: Clean workspace
         shell: bash
         run: |
diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml
index c18afe1b5b6ce..12b3d4c648228 100644
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@@ -122,6 +122,10 @@ jobs:
             echo "SHA1=${{ env.SHA1 }}"
           } >> "${GITHUB_ENV} }}"
 
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.github-token }}
         # Setup the environment
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -129,10 +133,6 @@ jobs:
         uses: ./.github/actions/setup-linux
       - name: Chown workspace
         uses: ./.github/actions/chown-workspace
-      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.github-token }}
       - name: Clean workspace
         shell: bash
         run: |
diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml
index 2aa1e48188a5d..cb4dc71c8c897 100644
--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@@ -63,6 +63,11 @@ jobs:
     # The current name requires updating the Rockset last docs push query from test-infra every time the matrix is updated
     name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
     steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -70,11 +75,6 @@ jobs:
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
 
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Pull docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml
index 3719069895b3b..be3d2ce98c030 100644
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@@ -66,6 +66,11 @@ jobs:
     outputs:
       docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
     steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
       # [pytorch repo ref]
       # Use a pytorch/pytorch reference instead of a reference to the local
       # checkout because when we run this action we don't *have* a local
@@ -76,11 +81,6 @@ jobs:
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
 
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Calculate docker image
         id: calculate-docker-image
         uses: ./.github/actions/calculate-docker-image
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index 995fa4b727935..d2f48acca4e85 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -57,17 +57,17 @@ jobs:
       fail-fast: false
     runs-on: ${{ matrix.runner }}
     steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
 
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
 
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Pull docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml
index faa37060d321d..3ce41afaf7cc1 100644
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@@ -46,6 +46,11 @@ jobs:
     runs-on: [self-hosted, windows.4xlarge]
     timeout-minutes: 240
     steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -57,11 +62,6 @@ jobs:
         with:
           cuda-version: ${{ inputs.cuda-version }}
 
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Parse ref
         id: parse-ref
         run: .github/scripts/parse_ref.py
diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml
index e3f02e6b77b36..fac2a1340b42c 100644
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@@ -31,6 +31,11 @@ jobs:
       DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.6
       PY_VERS: ${{ matrix.py_vers }}
     steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
         with:
@@ -39,11 +44,6 @@ jobs:
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
 
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml
index d61b1b2c1242b..fc10f58344438 100644
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@@ -45,6 +45,10 @@ jobs:
       BUILD_IMAGE_TYPE: ${{ matrix.image_type }}
       BUILD_PLATFORMS: ${{ matrix.platform }}
     steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
       # [see note: pytorch repo ref]
       # deep clone (fetch-depth 0) required for git merge-base
       - name: Checkout PyTorch
@@ -54,10 +58,6 @@ jobs:
           submodules: 'recursive'
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
       - name: Login to GitHub Container Registry
         if: ${{ env.WITH_PUSH == 'true' }}
         uses: docker/login-action@v2

From a382afc20331cc5a7542bb56e8b243626ccdef88 Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Wed, 9 Nov 2022 08:08:11 +0000
Subject: [PATCH 0695/1922] [Quant] Support lowering of channel shuffle in FX
 (#83731)

## Description
Support lowering of channel shuffle in FX by adding its module and functional op to `is_copy_node` list in `torch/ao/quantization/fx/_lower_to_native_backend.py`

## Validation
UTs added to test
- correctness of quantized `ChannelShuffle` module.
- FX lowering of `ChannelShuffle` module and functional `channel_shuffle`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/83731
Approved by: https://github.com/jerryzh168
---
 .../core/test_quantized_module.py             | 27 ++++++++++++
 test/quantization/fx/test_quantize_fx.py      | 43 +++++++++++++++++++
 .../fx/_lower_to_native_backend.py            |  4 ++
 3 files changed, 74 insertions(+)

diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py
index 5964de70b8e39..780f1ebb6cd57 100644
--- a/test/quantization/core/test_quantized_module.py
+++ b/test/quantization/core/test_quantized_module.py
@@ -1036,6 +1036,33 @@ def test_prelu(self):
         self.assertEqual(qy_ref, qy,
                          msg="PReLU module API failed")
 
+    def test_channel_shuffle(self):
+        """Tests the correctness of the ChannelShuffle module.
+        """
+        x_scale = 10.0 / 256
+        x_zero_point = 1
+        y_scale = x_scale
+        y_zero_point = x_zero_point
+
+        dims = (1, 4, 4, 8)
+        groups = 2
+
+        X = (torch.randn(dims, dtype=torch.float) - 0.5) * 10
+        qX = torch.quantize_per_tensor(X, x_scale, x_zero_point, dtype=torch.quint8)
+        dqX = qX.dequantize()
+
+        float_mod = torch.nn.ChannelShuffle(groups).float()
+        dqY_ref = float_mod(dqX)
+        qY_ref = torch.quantize_per_tensor(
+            dqY_ref, y_scale, y_zero_point, dtype=torch.quint8)
+
+        quant_mod = torch.nn.ChannelShuffle(groups)
+        qY = quant_mod(qX)
+
+        self.assertEqual(qY_ref.int_repr().numpy(), qY.int_repr().numpy(),
+                         msg="ChannelShuffle module API failed, qY_ref\n{} vs qY\n{}"
+                         .format(qY_ref, qY))
+
 class TestDynamicQuantizedModule(QuantizationTestCase):
     def _test_qconv_impl(self, q_mod, dq_mod, dim, dtype, bias):
         in_channels = 3
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 2bc1ed4fc43ee..236a5587d859a 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -5295,6 +5295,49 @@ def forward(self, x: torch.Tensor):
             backend_config=backend_config
         )
 
+    def test_channel_shuffle_lowering(self):
+        # Three versions of channel shuffle
+        class M1(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.op = torch.nn.ChannelShuffle(2)
+
+            def forward(self, x):
+                return self.op(x + x) + x
+
+        class M2(torch.nn.Module):
+            def forward(self, x):
+                return torch.channel_shuffle(x + x, 2) + x
+
+        class M3(torch.nn.Module):
+            def forward(self, x):
+                return torch.nn.functional.channel_shuffle(x + x, 2) + x
+
+        x = torch.randn(4, 4, 4, 4)
+        # torch.channel_shuffle is equivalent to torch.nn.functional.channel_shuffle
+        model_node_pairs = [
+            (M1().eval(), ns.call_module(torch.nn.ChannelShuffle)),
+            (M2().eval(), ns.call_function(torch.channel_shuffle)),
+            (M3().eval(), ns.call_function(torch.channel_shuffle))
+        ]
+        for m, node in model_node_pairs:
+            m = prepare_fx(m, {"": default_qconfig}, example_inputs=(x,))
+            m_copy = copy.deepcopy(m)
+            m = convert_fx(m)
+            m_ref = convert_to_reference_fx(m_copy)
+            node_occurrence = {
+                node: 1,
+                ns.call_function(torch.quantize_per_tensor): 1,
+                ns.call_method("dequantize"): 1
+            }
+            node_occurrence_ref = {
+                node: 1,
+                ns.call_function(torch.quantize_per_tensor): 4,
+                ns.call_method("dequantize"): 4
+            }
+            self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence)
+            self.checkGraphModuleNodes(m_ref, expected_node_occurrence=node_occurrence_ref)
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     def setUp(self):
diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py
index aa71fafbf00e9..f08f5c6073b7a 100644
--- a/torch/ao/quantization/fx/_lower_to_native_backend.py
+++ b/torch/ao/quantization/fx/_lower_to_native_backend.py
@@ -111,6 +111,9 @@ def is_copy_node(node, modules):
         torch.flatten,
         torch.mean,
         operator.floordiv,
+        # F.channel_shuffle and torch.channel_shuffle are essentially the same thing
+        # so we only need to put one of them here
+        torch.channel_shuffle,
     ]
     method_list = [
         "clamp",
@@ -131,6 +134,7 @@ def is_copy_node(node, modules):
         torch.nn.MaxPool3d,
         torch.nn.ReLU,
         torch.nn.ReLU6,
+        torch.nn.ChannelShuffle,
     ]
     return _is_node_in_list(node, modules, func_list, method_list, module_type_list)
 

From 17b152aae62bc123a8cfd5590c4b0a86f51750bf Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Wed, 9 Nov 2022 00:53:37 +0100
Subject: [PATCH 0696/1922] [primTorch] Improve `narrow` and `narrow_copy`:
 refs, tests, docs (#87045)

Fixes #87019.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87045
Approved by: https://github.com/mruberry
---
 aten/src/ATen/native/TensorShape.cpp          |  95 +------------
 aten/src/ATen/native/native_functions.yaml    |   4 +-
 test/test_meta.py                             |   1 -
 torch/_refs/__init__.py                       |  38 +++--
 torch/_tensor_docs.py                         |  13 +-
 torch/_torch_docs.py                          |  27 ++--
 torch/csrc/jit/runtime/static/ops.cpp         |   8 +-
 .../_internal/common_methods_invocations.py   | 133 ++++++++++++++----
 8 files changed, 166 insertions(+), 153 deletions(-)

diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 31b4011c12813..15c22dc7d4111 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -1196,18 +1196,14 @@ Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t
   return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous);
 }
 
-Tensor narrow_copy_dense_cpu(const Tensor& self, int64_t dim, int64_t start, int64_t length){
-  auto output = at::empty_like(self);
-  return narrow_copy_dense_cpu_out(self, dim, start, length, output);
-}
-
 Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length) {
   int64_t allDim = self.dim();
   int64_t end = start+length;
   TORCH_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor.");
+  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
   TORCH_CHECK(dim >= 0 && dim < allDim,
     "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, ".");
-  TORCH_CHECK(start >= 0 && length >= 0 && end <= self.size(dim),
+  TORCH_CHECK(start >= 0 && end <= self.size(dim),
     "Invalid range to narrow. range(start, start+length) must be a subset of range(0, ", self.size(dim), ").")
   Tensor indices = self._indices();
   int64_t sparse_dim = self.sparse_dim();
@@ -1235,105 +1231,26 @@ Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_
   return newTensor._coalesced_(self.is_coalesced());
 }
 
-Tensor& narrow_copy_dense_cpu_out(
-  const Tensor& self, int64_t dim, int64_t start, int64_t length, Tensor& output
-) {
-
-  TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
-  TORCH_CHECK(self.dtype() == output.dtype());
-
-  auto self_contig = self.expect_contiguous();
-  const auto self_sizes = self_contig->sizes();
-
-  // wrap dim if negative and do bound check
-  if (dim < 0) {
-    dim = at::maybe_wrap_dim(dim, self_sizes.size());
-  } else {
-    TORCH_CHECK(dim < static_cast<int64_t>(self_sizes.size()));
-  }
-
-  // wrap start and do bound check
-  const auto cur_size = self_sizes[dim];
-  if (start != cur_size && start < 0) { // start being the end is valid, but
-                                        // not a valid dim specification.
-    start = at::maybe_wrap_dim(start, cur_size);
-  }
-  TORCH_CHECK(
-      length >= 0 && start <= cur_size - length,
-      "start (",
-      start,
-      ") + length (",
-      length,
-      ") exceeds dimension size (",
-      cur_size,
-      ").");
-
-  // resize output
-  auto output_sizes = self_sizes.vec();
-  output_sizes[dim] = length;
-  at::native::resize_(output, output_sizes);
-
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  const int64_t unit = c10::size_from_dim_(dim + 1, self_sizes);
-  const int64_t num_blocks = c10::size_to_dim_(dim, self_sizes);
-
-  const auto itemsize = self_contig->dtype().itemsize();
-  // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
-  size_t src_nbytes = itemsize * self_contig->numel();
-  // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
-  size_t dst_nbytes = itemsize * output.numel();
-
-  size_t src_block_size = unit * self_sizes[dim];
-  size_t dst_block_size = unit * length;
-
-  if (num_blocks == 0 || dst_block_size == 0) {
-    return output;
-  }
-
-  char* src_bytes = static_cast<char*>(self_contig->data_ptr());
-  char* dst_bytes = static_cast<char*>(output.data_ptr());
-
-  size_t src_block_size_bytes = itemsize * src_block_size;
-  size_t dst_block_size_bytes = itemsize * dst_block_size;
-  size_t src_offset = unit * start;
-
-  char* src_offset_bytes = src_bytes + itemsize * src_offset;
-  char* dst_offset_bytes = dst_bytes;
-
-  for (const auto i : c10::irange(num_blocks)) {
-    char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes;
-    char* local_dst_offset_bytes = dst_offset_bytes + i * dst_block_size_bytes;
-    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-        static_cast<void*>(local_src_offset_bytes + dst_block_size_bytes) <=
-        static_cast<void*>(src_bytes + src_nbytes));
-    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-        static_cast<void*>(local_dst_offset_bytes + dst_block_size_bytes) <=
-        static_cast<void*>(dst_bytes + dst_nbytes));
-
-    memcpy(
-        local_dst_offset_bytes, local_src_offset_bytes, dst_block_size_bytes);
-  }
-  return output;
-}
-
 Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) {
   TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
+  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
   auto cur_size = self.size(dim);
   if (start != cur_size) {  // start being the end is valid, but not a valid dim specification.
     start = maybe_wrap_dim(start, cur_size);
   }
-  TORCH_CHECK(length >= 0 && start <= cur_size - length,
+  TORCH_CHECK(start <= cur_size - length,
            "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ").");
   return at::slice(self, dim, start, start + length, 1);
 }
 
 Tensor narrow_symint(const Tensor& self, int64_t dim, SymInt start, SymInt length) {
   TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
+  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
   auto cur_size = self.sym_size(dim);
   if (start != cur_size) {  // start being the end is valid, but not a valid dim specification.
     start = maybe_wrap_dim(start, cur_size);
   }
-  TORCH_CHECK(length >= 0 && start <= cur_size - length,
+  TORCH_CHECK(start <= cur_size - length,
            "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ").");
   return at::slice_symint(self, dim, start, start + length, 1);
 }
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 4c7a351e8f1bf..2a48f5de8fcc3 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -3768,14 +3768,14 @@
 - func: narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor
   variants: function, method
   dispatch:
-    CPU: narrow_copy_dense_cpu
     SparseCPU, SparseCUDA: narrow_copy_sparse
     CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
   tags: view_copy
 
 - func: narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
   dispatch:
-    CPU: narrow_copy_dense_cpu_out
+    CompositeExplicitAutogradNonFunctional: narrow_copy_out
 
 - func: narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)
   variants: function, method
diff --git a/test/test_meta.py b/test/test_meta.py
index ef25d184c8428..ae248a90cffb7 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -745,7 +745,6 @@ def run_meta_crossref(
 }
 
 meta_function_device_skips['cpu'] = {
-    torch.narrow_copy: {b8, bf16, c128, c32, c64, f16, f32, f64, i16, i32, i64, i8, u8},
     torch.native_batch_norm: {f32, f64},
 }
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index cd0344eba7a91..ecb6a86d5f911 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2749,19 +2749,39 @@ def flipud(a: TensorLikeType) -> TensorLikeType:
 
 
 # CompositeImplicitAutograd - don't register decomp
-def narrow(a: TensorLikeType, dim: int, start: int, length: int) -> TensorLikeType:
+def narrow(
+    a: TensorLikeType, dim: int, start: Union[int, TensorLikeType], length: int
+) -> TensorLikeType:
+    # Supports Tensor overload that was added for XLA:
+    # https://github.com/pytorch/pytorch/issues/31558
+    if isinstance(start, TensorLike):
+        check(
+            start.dim() == 0 and utils.is_integer_dtype(start.dtype),
+            lambda: "start must be an 0-dim integral Tensor.",
+        )
+        start = start.item()  # type: ignore[assignment]
+    check(a.dim() > 0, lambda: "narrow() cannot be applied to a 0-dim tensor.")
+    check(length >= 0, lambda: "narrow(): length must be non-negative.")
     dim = utils.canonicalize_dim(a.ndim, dim)
+    dim_length = a.size(dim)
+    # Start being the end is usually invalid since it's out of bounds. So it's
+    # not allowed by canonicalize_dim. But for narrow it's valid as long as
+    # the length is 0, which is handled by the check below.
+    if start != dim_length:
+        # Negative start means indexing from the end of dim.
+        # Note: a dimension isn't being canonicalized here, this reuses
+        # canonicalize_dim because the semantics are similar.
+        start = utils.canonicalize_dim(dim_length, start)  # type: ignore[arg-type]
+    check(
+        start <= dim_length - length,  # type: ignore[arg-type]
+        lambda: f"start ({start}) + length ({length}) exceeds dimension size ({dim_length}).",
+    )
     return prims.slice_in_dim(a, start, start + length, axis=dim)
 
 
-@register_decomposition(torch.ops.aten.narrow_copy)
-@out_wrapper()
-def narrow_copy(a: TensorLikeType, dim: int, start: int, length: int) -> TensorLikeType:
-    # TODO: This must return a sparse tensor if the input is sparse, but refs
-    # have no sparse support.  See narrow_copy_sparse in core.
-    if a.is_sparse:
-        raise NotImplementedError("narrow_copy ref doesn't support sparse tensors")
-    return torch.clone(torch.narrow(a=a, dim=dim, start=start, length=length))  # type: ignore[call-overload]
+# TODO: This must return a sparse tensor if the input is sparse, but refs have
+# no sparse support. See narrow_copy_sparse in core.
+narrow_copy = _make_copy_from_view(narrow)
 
 
 def _normalize(
diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index 8c734a1f3774b..726ae5137e6a4 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -3436,18 +3436,7 @@ def callable(a, b) -> number
     r"""
 narrow(dimension, start, length) -> Tensor
 
-See :func:`torch.narrow`
-
-Example::
-
-    >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
-    >>> x.narrow(0, 0, 2)
-    tensor([[ 1,  2,  3],
-            [ 4,  5,  6]])
-    >>> x.narrow(1, 1, 2)
-    tensor([[ 2,  3],
-            [ 5,  6],
-            [ 8,  9]])
+See :func:`torch.narrow`.
 """,
 )
 
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 40375bae3e274..2ff2e9be315de 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -7980,8 +7980,10 @@ def merge_dicts(*dicts):
 Args:
     input (Tensor): the tensor to narrow
     dim (int): the dimension along which to narrow
-    start (Tensor or int): the starting dimension
-    length (int): the distance to the ending dimension
+    start (int or Tensor): index of the element to start the narrowed dimension
+        from. Can be negative, which means indexing from the end of `dim`. If
+        `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed)
+    length (int): length of the narrowed dimension, must be weakly positive
 
 Example::
 
@@ -7993,6 +7995,10 @@ def merge_dicts(*dicts):
     tensor([[ 2,  3],
             [ 5,  6],
             [ 8,  9]])
+    >>> torch.narrow(x, -1, torch.tensor(-1), 1)
+    tensor([[3],
+            [6],
+            [9]])
 """,
 )
 
@@ -8008,8 +8014,9 @@ def merge_dicts(*dicts):
 Args:
     input (Tensor): the tensor to narrow
     dim (int): the dimension along which to narrow
-    start (int): the starting offset
-    length (int): the distance to the ending dimension
+    start (int): index of the element to start the narrowed dimension from. Can
+        be negative, which means indexing from the end of `dim`
+    length (int): length of the narrowed dimension, must be weakly positive
 
 Keyword args:
     {out}
@@ -8027,13 +8034,13 @@ def merge_dicts(*dicts):
     >>> s = torch.arange(16).reshape(2, 2, 2, 2).to_sparse(2)
     >>> torch.narrow_copy(s, 0, 0, 1)
     tensor(indices=tensor([[0, 0],
-                        [0, 1]]),
-        values=tensor([[[0, 1],
-                        [2, 3]],
+                           [0, 1]]),
+           values=tensor([[[0, 1],
+                           [2, 3]],
 
-                        [[4, 5],
-                        [6, 7]]]),
-        size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo)
+                          [[4, 5],
+                           [6, 7]]]),
+           size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo)
 
 .. seealso::
 
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 92044ca565a9c..e80c7c1460b80 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1223,8 +1223,8 @@ REGISTER_OPERATOR_FUNCTOR(aten::narrow_copy, aten_narrow_copy, [](Node* n) -> SR
     return nullptr;
   }
   return [](ProcessedNode* p_node) {
-    const auto& self = p_node->Input(0).toTensor(); // self
-    const auto dim = p_node->Input(1).toInt(); // dim
+    auto& self = p_node->Input(0).toTensor(); // self
+    auto dim = p_node->Input(1).toInt(); // dim
     int64_t start = 0;
     if (p_node->Input(2).isScalar()) {
       start = p_node->Input(2).toInt();
@@ -1236,12 +1236,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::narrow_copy, aten_narrow_copy, [](Node* n) -> SR
 
     if (p_node->Output(0).isNone()) {
       p_node->Output(0) =
-          at::native::narrow_copy_dense_cpu(self, dim, start, length);
+          at::native::narrow_copy_dense_symint(self, dim, start, length);
       return;
     }
     auto& output = p_node->Output(0).toTensor();
     fastResizeToZero(output);
-    at::native::narrow_copy_dense_cpu_out(self, dim, start, length, output);
+    at::narrow_copy_out(output, self, dim, start, length);
   };
 });
 REGISTER_OPERATOR_FUNCTOR(aten::index, aten_index, [](Node* n) -> SROperator {
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 6ef49a5473d8c..a7a492dcd526e 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -4296,29 +4296,109 @@ def sample_repeat_tile(op_info, device, dtype, requires_grad, **kwargs):
         yield SampleInput(make_arg(shape), rep_dim)
 
 
-def sample_inputs_narrow_copy(op_info, device, dtype, requires_grad, **kwargs):
+def sample_inputs_narrow_narrow_copy(op_info, device, dtype, requires_grad, *, is_narrow, **kwargs):
     shapes_and_args = (
-        ((S, S, S), (1, 2, 2)),
-        ((S, S, S), (-1, 2, 2)),
-        ((S, S, S), (1, 0, 0)),
-        ((S, S, S), (-1, 0, 0)),
-        ((S, S, S), (2, 1, 2)),
+        ((S, S, S), 1, 2, 2),
+        ((S, S, S), -1, 2, 2),
+        ((S, S, S), 1, 0, 0),
+        ((S, S, S), -1, 0, 0),
+        ((S, S, S), 2, 1, 2),
     )
 
-    for shape, args in shapes_and_args:
+    for shape, dim, start, length in shapes_and_args:
         tensor = make_tensor(shape, dtype=dtype, device=device, low=None, high=None,
                              requires_grad=requires_grad)
-        yield SampleInput(tensor, args=args)
+        yield SampleInput(tensor, dim, start, length)
+        # narrow also accepts the start argument being a Tensor
+        if is_narrow:
+            yield SampleInput(tensor, dim, torch.tensor(start), length)
 
+def reference_inputs_narrow_narrow_copy(op_info, device, dtype, requires_grad, *, is_narrow, **kwargs):
+    yield from sample_inputs_narrow_narrow_copy(op_info, device, dtype, requires_grad, is_narrow=is_narrow, **kwargs)
 
-def sample_inputs_narrow(op_info, device, dtype, requires_grad, **kwargs):
-    '''
-    sample_inputs_narrow accepts the same inputs as narrow_copy, in addition
-    narrow also accepts `start` argument to be a Tensor.
-    '''
-    for sample in sample_inputs_narrow_copy(op_info, device, dtype, requires_grad, **kwargs):
-        yield sample
-        yield SampleInput(sample.input, args=(sample.args[0], torch.tensor(sample.args[1]), sample.args[2]))
+    shapes_and_args = (
+        # 1-dim
+        ((M,), 0, 0, 0),    # 0 elems from the left
+        ((M,), -1, -1, 0),  # 0 elems from the right
+        ((M,), 0, 5, 3),    # 3 elems from the left
+        ((M,), 0, -5, 2),   # 2 elems from the right
+        ((M,), -1, 0, M),   # M elems from the left
+        ((M,), 0, -M, M),   # M elems from the right
+
+        # 2-dim
+        ((M, S), 1, 0, 0),    # dim 1, 0 elems from the left
+        ((S, M), -2, -1, 0),  # dim 0, 0 elems from the right
+        ((L, S), 1, 2, 3),    # dim 1, 3 elems from the left
+        ((L, S), -1, 3, 2),   # dim 1, 2 elems from the left
+        ((M, L), 0, 0, M),    # dim 0, M elems from the left
+        ((M, L), -1, -L, L),  # dim 1, L elems from the right
+
+        # 3-dim
+        ((L, M, S), 2, 0, 0),    # dim 2, 0 elems from the left
+        ((M, S, L), -1, -1, 0),  # dim 2, 0 elems from the right
+        ((S, L, M), 2, 0, M),    # dim 2, M elems from the left
+        ((L, S, M), -1, -M, M),  # dim 2, M elems from the right
+        ((S, L, M), 1, 0, 0),    # dim 1, 0 elems from the left
+        ((S, L, M), 0, 2, 1),    # dim 0, 1 elem from the left
+        ((M, S, M), -1, -5, 4),  # dim 2, 4 elems from the right
+    )
+
+    for shape, dim, start, length in shapes_and_args:
+        tensor = make_tensor(shape, dtype=dtype, device=device, low=None, high=None,
+                             requires_grad=requires_grad)
+        yield SampleInput(tensor, dim, start, length)
+        # narrow also accepts the start argument being a Tensor
+        if is_narrow:
+            yield SampleInput(tensor, dim, torch.tensor(start), length)
+
+def error_inputs_narrow_narrow_copy(op_info, device, *, is_narrow):
+    make_arg = partial(make_tensor, device=device, dtype=torch.float32)
+
+    # 0-dim
+    yield ErrorInput(SampleInput(make_arg(()), 0, 0, 1),
+                     error_type=RuntimeError,
+                     error_regex=r"narrow\(\) cannot be applied to a 0-dim tensor\.")
+
+    # out of bounds dim
+    yield ErrorInput(SampleInput(make_arg((M, S, L)), 3, 0, 0),
+                     error_type=IndexError,
+                     error_regex=r"Dimension out of range \(expected to be in range of \[-3, 2\], but got 3\)")
+    # out of bounds dim (negative)
+    yield ErrorInput(SampleInput(make_arg((L, S, M)), -4, 0, 0),
+                     error_type=IndexError,
+                     error_regex=r"Dimension out of range \(expected to be in range of \[-3, 2\], but got -4\)")
+
+    # out of bounds start
+    yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, M + 1, 0),
+                     error_type=IndexError,
+                     error_regex=r"Dimension out of range \(expected to be in range of \[-10, 9\], but got 11\)")
+    # out of bounds start (negative)
+    yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, -M - 1, 0),
+                     error_type=IndexError,
+                     error_regex=r"Dimension out of range \(expected to be in range of \[-10, 9\], but got -11\)")
+
+    # out of bounds length
+    yield ErrorInput(SampleInput(make_arg((S, L, M)), 2, 0, M + 1),
+                     error_type=RuntimeError,
+                     error_regex=r"start \(0\) \+ length \(11\) exceeds dimension size \(10\)\.")
+    # out of bounds length (negative)
+    yield ErrorInput(SampleInput(make_arg((M,)), 0, 0, -1),
+                     error_type=RuntimeError,
+                     error_regex=r"narrow\(\): length must be non-negative\.")
+
+    # Test Tensor overload that was added for XLA. Start must be an 0-dim
+    # integral Tensor. narrow_copy doesn't have this overload.
+    # https://github.com/pytorch/pytorch/issues/31558
+    if is_narrow:
+        # *1-dim* integral Tensor
+        yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, make_arg(S, dtype=torch.int), 2),
+                         error_type=RuntimeError,
+                         error_regex=r"start must be an 0-dim integral Tensor\.")
+
+        # 0-dim *bool* Tensor (bools are not allowed)
+        yield ErrorInput(SampleInput(make_arg((L, M, S)), -3, make_arg((), dtype=torch.bool), 3),
+                         error_type=RuntimeError,
+                         error_regex=r"start must be an 0-dim integral Tensor\.")
 
 
 def sample_trapezoid(op_info, device, dtype, requires_grad, **kwargs):
@@ -12314,7 +12394,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_out=False,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
-           sample_inputs_func=sample_inputs_narrow,
+           sample_inputs_func=partial(sample_inputs_narrow_narrow_copy, is_narrow=True),
+           reference_inputs_func=partial(reference_inputs_narrow_narrow_copy, is_narrow=True),
+           error_inputs_func=partial(error_inputs_narrow_narrow_copy, is_narrow=True),
            skips=(
                # Use of .item()
                DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_operator'),
@@ -12330,15 +12412,18 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_fwgrad_bwgrad=False,
            supports_autograd=False,
            # https://github.com/pytorch/pytorch/issues/86931
-           sample_inputs_func=sample_inputs_narrow_copy,
+           sample_inputs_func=partial(sample_inputs_narrow_narrow_copy, is_narrow=False),
+           reference_inputs_func=partial(reference_inputs_narrow_narrow_copy, is_narrow=False),
+           error_inputs_func=partial(error_inputs_narrow_narrow_copy, is_narrow=False),
            skips=(
                # https://github.com/pytorch/pytorch/issues/84577
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
-               # Not implemented
-               DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_meta_outplace', device_type='cuda'),
-               DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_outplace', device_type='cuda'),
-               DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta', device_type='cuda'),
+               # Lazy tensor failures: mutating and aliasing ops should all have codegen'd kernels
+               DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness'),
+               DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness_with_reusing_ir'),
+               # TypeError: must be real number, not SymFloat
+               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive'),
            )),
     UnaryUfuncInfo('neg',
                    aliases=('negative', ),
@@ -17924,10 +18009,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.narrow",
         torch_opinfo_name="narrow",
         supports_nvfuser=False,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_meta'),
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_executor'),
-        )
     ),
     PythonRefInfo(
         "_refs.narrow_copy",

From eeb2730a5e7f041bd2eae51d922013225cf38624 Mon Sep 17 00:00:00 2001
From: Fabio Rocha <frocha@quansight.com>
Date: Tue, 8 Nov 2022 19:25:30 +0000
Subject: [PATCH 0697/1922] upsample_*.vec ops are now CompositeImplicit
 (#85638)

It was previously CompositeExplicit but it was not really necessary.
See discussion in https://github.com/pytorch/pytorch/issues/85405

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85638
Approved by: https://github.com/ezyang, https://github.com/lezcano, https://github.com/malfet, https://github.com/jansel
---
 .github/ci_commit_pins/xla.txt                |   2 +-
 aten/src/ATen/functorch/BatchRulesModules.cpp |  24 ++--
 aten/src/ATen/native/UpSampleBicubic2d.cpp    |  24 ----
 aten/src/ATen/native/UpSampleBilinear2d.cpp   |  24 ----
 aten/src/ATen/native/UpSampleLinear1d.cpp     |  11 --
 aten/src/ATen/native/UpSampleNearest1d.cpp    |  20 ----
 aten/src/ATen/native/UpSampleNearest2d.cpp    |  22 ----
 aten/src/ATen/native/UpSampleNearest3d.cpp    |  29 +----
 aten/src/ATen/native/UpSampleTrilinear3d.cpp  |  13 ---
 .../src/ATen/native/cuda/UpSampleNearest3d.cu |  47 --------
 aten/src/ATen/native/native_functions.yaml    | 104 ------------------
 .../quantized/cpu/UpSampleNearest3d.cpp       |  22 ----
 .../check_forward_backward_compatibility.py   |  24 ++++
 test/inductor/test_torchinductor.py           |  30 ++---
 test/test_autograd.py                         |   4 +-
 test/test_decomp.py                           |   3 +-
 tools/autograd/derivatives.yaml               |  94 ----------------
 torch/_inductor/decomposition.py              |   2 +
 torch/_inductor/lowering.py                   |  60 +++++-----
 torch/_meta_registrations.py                  |  26 -----
 20 files changed, 83 insertions(+), 502 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index c0348df036f69..7ec9661a1ce4d 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-3a04b23e26e76b1a7351f382978de3d8a14307a1
+7889d2d3be16675943d84e4a4133ed7c245a623f
diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp
index 3968e2400397d..2419fde8a7a96 100644
--- a/aten/src/ATen/functorch/BatchRulesModules.cpp
+++ b/aten/src/ATen/functorch/BatchRulesModules.cpp
@@ -295,7 +295,7 @@ template <typename F, F Func, typename A, typename B, typename C, typename... T>
 struct UpsampleBackwardBatchRuleHelper<F, Func, typelist<A, B, C, T...>> {
   static std::tuple<Tensor,optional<int64_t>> apply(
       const Tensor& grad_output, optional<int64_t> grad_output_bdim,
-      OptionalSymIntArrayRef output_size, c10::SymIntArrayRef input_size,
+      c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size,
       T... extra_args) {
     auto grad_output_ = reshape_dim_into(*grad_output_bdim, 0, grad_output);
     TORCH_INTERNAL_ASSERT(input_size.size() > 0);
@@ -375,11 +375,11 @@ struct CudnnGridSampleBackwardBatchRuleHelper {
 #define CUDNN_GRID_SAMPLE_BW_BATCH_RULE(fn)\
     CudnnGridSampleBackwardBatchRuleHelper<decltype(&ATEN_FN(fn)), &ATEN_FN(fn)>::apply
 
-#define UPSAMPLE_BACKWARD(op, overload) VMAP_SUPPORT2(op, overload, SINGLE_ARG(\
+#define UPSAMPLE_BACKWARD(op) VMAP_SUPPORT(op, SINGLE_ARG(\
     UpsampleBackwardBatchRuleHelper<\
-      decltype(&ATEN_FN2(op, overload)),\
-      &ATEN_FN2(op, overload),\
-      c10::guts::function_traits<decltype(ATEN_FN2(op, overload))>::parameter_types>::apply))
+      decltype(&ATEN_FN(op)),\
+      &ATEN_FN(op),\
+      c10::guts::function_traits<decltype(ATEN_FN(op))>::parameter_types>::apply))
 
 #define UPSAMPLE_BATCH(op) \
   EXISTING_BDIM2(op, vec); \
@@ -430,13 +430,13 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   UPSAMPLE_BATCH(upsample_nearest3d);
   UPSAMPLE_BATCH(upsample_trilinear3d);
 
-  UPSAMPLE_BACKWARD(upsample_bicubic2d_backward, vec);
-  UPSAMPLE_BACKWARD(upsample_bilinear2d_backward, vec);
-  UPSAMPLE_BACKWARD(upsample_linear1d_backward, vec);
-  UPSAMPLE_BACKWARD(upsample_nearest1d_backward, vec);
-  UPSAMPLE_BACKWARD(upsample_nearest2d_backward, vec);
-  UPSAMPLE_BACKWARD(upsample_nearest3d_backward, vec);
-  UPSAMPLE_BACKWARD(upsample_trilinear3d_backward, vec);
+  UPSAMPLE_BACKWARD(upsample_bicubic2d_backward);
+  UPSAMPLE_BACKWARD(upsample_bilinear2d_backward);
+  UPSAMPLE_BACKWARD(upsample_linear1d_backward);
+  UPSAMPLE_BACKWARD(upsample_nearest1d_backward);
+  UPSAMPLE_BACKWARD(upsample_nearest2d_backward);
+  UPSAMPLE_BACKWARD(upsample_nearest3d_backward);
+  UPSAMPLE_BACKWARD(upsample_trilinear3d_backward);
   m.impl("one_hot", one_hot_decomposition_hack);
 }
 }}
diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp
index 3a0fa941a4d4a..035bea5629547 100644
--- a/aten/src/ATen/native/UpSampleBicubic2d.cpp
+++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp
@@ -287,18 +287,6 @@ Tensor upsample_bicubic2d(
   return at::upsample_bicubic2d(input, osize, align_corners, scale_h, scale_w);
 }
 
-Tensor upsample_bicubic2d_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    bool align_corners,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_h = get_scale_value(scale_factors, 0);
-  auto scale_w = get_scale_value(scale_factors, 1);
-  return at::upsample_bicubic2d_backward(grad_output, osize, input_size, align_corners, scale_h, scale_w);
-}
-
 Tensor _upsample_bicubic2d_aa(
     const Tensor& input,
     at::OptionalIntArrayRef output_size,
@@ -310,18 +298,6 @@ Tensor _upsample_bicubic2d_aa(
   return at::_upsample_bicubic2d_aa(input, osize, align_corners, scale_h, scale_w);
 }
 
-Tensor _upsample_bicubic2d_aa_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    bool align_corners,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_h = get_scale_value(scale_factors, 0);
-  auto scale_w = get_scale_value(scale_factors, 1);
-  return at::_upsample_bicubic2d_aa_backward(grad_output, osize, input_size, align_corners, scale_h, scale_w);
-}
-
 DEFINE_DISPATCH(upsample_bicubic2d_kernel);
 DEFINE_DISPATCH(_upsample_bicubic2d_aa_kernel);
 DEFINE_DISPATCH(_upsample_bicubic2d_aa_backward_kernel);
diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp b/aten/src/ATen/native/UpSampleBilinear2d.cpp
index 69c856f06fcbf..5d91e93e016df 100644
--- a/aten/src/ATen/native/UpSampleBilinear2d.cpp
+++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp
@@ -169,18 +169,6 @@ Tensor upsample_bilinear2d(
   return at::upsample_bilinear2d(input, osize, align_corners, scale_h, scale_w);
 }
 
-Tensor upsample_bilinear2d_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    bool align_corners,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_h = get_scale_value(scale_factors, 0);
-  auto scale_w = get_scale_value(scale_factors, 1);
-  return at::upsample_bilinear2d_backward(grad_output, osize, input_size, align_corners, scale_h, scale_w);
-}
-
 Tensor _upsample_bilinear2d_aa(
     const Tensor& input,
     at::OptionalIntArrayRef output_size,
@@ -192,18 +180,6 @@ Tensor _upsample_bilinear2d_aa(
   return at::_upsample_bilinear2d_aa(input, osize, align_corners, scale_h, scale_w);
 }
 
-Tensor _upsample_bilinear2d_aa_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    bool align_corners,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_h = get_scale_value(scale_factors, 0);
-  auto scale_w = get_scale_value(scale_factors, 1);
-  return at::_upsample_bilinear2d_aa_backward(grad_output, osize, input_size, align_corners, scale_h, scale_w);
-}
-
 DEFINE_DISPATCH(upsample_bilinear2d_kernel);
 DEFINE_DISPATCH(upsample_bilinear2d_backward_kernel);
 DEFINE_DISPATCH(_upsample_bilinear2d_aa_kernel);
diff --git a/aten/src/ATen/native/UpSampleLinear1d.cpp b/aten/src/ATen/native/UpSampleLinear1d.cpp
index 048d4b5a3d9c1..aed082b685638 100644
--- a/aten/src/ATen/native/UpSampleLinear1d.cpp
+++ b/aten/src/ATen/native/UpSampleLinear1d.cpp
@@ -99,17 +99,6 @@ Tensor upsample_linear1d(
   return at::upsample_linear1d(input, osize, align_corners, scale_w);
 }
 
-Tensor upsample_linear1d_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    bool align_corners,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_w = get_scale_value(scale_factors, 0);
-  return at::upsample_linear1d_backward(grad_output, osize, input_size, align_corners, scale_w);
-}
-
 DEFINE_DISPATCH(upsample_linear1d_kernel);
 DEFINE_DISPATCH(upsample_linear1d_backward_kernel);
 
diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp
index 5cc53dea349b7..1bdbda8f66c41 100644
--- a/aten/src/ATen/native/UpSampleNearest1d.cpp
+++ b/aten/src/ATen/native/UpSampleNearest1d.cpp
@@ -141,26 +141,6 @@ Tensor _upsample_nearest_exact1d(
   return at::_upsample_nearest_exact1d(input, osize, scale_w);
 }
 
-Tensor upsample_nearest1d_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_w = get_scale_value(scale_factors, 0);
-  return at::upsample_nearest1d_backward(grad_output, osize, input_size, scale_w);
-}
-
-Tensor _upsample_nearest_exact1d_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_w = get_scale_value(scale_factors, 0);
-  return at::_upsample_nearest_exact1d_backward(grad_output, osize, input_size, scale_w);
-}
-
 DEFINE_DISPATCH(upsample_nearest1d_kernel);
 DEFINE_DISPATCH(_upsample_nearest_exact1d_kernel);
 DEFINE_DISPATCH(upsample_nearest1d_backward_kernel);
diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp
index 14c7a7d1b74f0..65e20b78f868e 100644
--- a/aten/src/ATen/native/UpSampleNearest2d.cpp
+++ b/aten/src/ATen/native/UpSampleNearest2d.cpp
@@ -167,28 +167,6 @@ Tensor _upsample_nearest_exact2d(
   return at::_upsample_nearest_exact2d(input, osize, scale_h, scale_w);
 }
 
-Tensor upsample_nearest2d_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_h = get_scale_value(scale_factors, 0);
-  auto scale_w = get_scale_value(scale_factors, 1);
-  return at::upsample_nearest2d_backward(grad_output, osize, input_size, scale_h, scale_w);
-}
-
-Tensor _upsample_nearest_exact2d_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_h = get_scale_value(scale_factors, 0);
-  auto scale_w = get_scale_value(scale_factors, 1);
-  return at::_upsample_nearest_exact2d_backward(grad_output, osize, input_size, scale_h, scale_w);
-}
-
 DEFINE_DISPATCH(upsample_nearest2d_kernel);
 DEFINE_DISPATCH(_upsample_nearest_exact2d_kernel);
 DEFINE_DISPATCH(upsample_nearest2d_backward_kernel);
diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp
index 73948f66fa769..27ca6745655c9 100644
--- a/aten/src/ATen/native/UpSampleNearest3d.cpp
+++ b/aten/src/ATen/native/UpSampleNearest3d.cpp
@@ -162,7 +162,7 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact3d_backward_out_cpu) (
 using at::native::upsample::compute_output_size;
 using at::native::upsample::get_scale_value;
 
-Tensor upsample_nearest3d_cpu(
+Tensor upsample_nearest3d(
     const Tensor& input,
     at::OptionalIntArrayRef output_size,
     c10::optional<ArrayRef<double>> scale_factors) {
@@ -173,7 +173,7 @@ Tensor upsample_nearest3d_cpu(
   return at::upsample_nearest3d(input, osize, scale_d, scale_h, scale_w);
 }
 
-Tensor _upsample_nearest_exact3d_cpu(
+Tensor _upsample_nearest_exact3d(
     const Tensor& input,
     at::OptionalIntArrayRef output_size,
     c10::optional<ArrayRef<double>> scale_factors) {
@@ -184,31 +184,6 @@ Tensor _upsample_nearest_exact3d_cpu(
   return at::_upsample_nearest_exact3d(input, osize, scale_d, scale_h, scale_w);
 }
 
-// when structured kernels can handle QuantizedCPU, update these overloads to be CompositeExplicitAutograd
-Tensor upsample_nearest3d_backward_cpu(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_d = get_scale_value(scale_factors, 0);
-  auto scale_h = get_scale_value(scale_factors, 1);
-  auto scale_w = get_scale_value(scale_factors, 2);
-  return at::upsample_nearest3d_backward(grad_output, osize, input_size, scale_d, scale_h, scale_w);
-}
-
-Tensor _upsample_nearest_exact3d_backward_cpu(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_d = get_scale_value(scale_factors, 0);
-  auto scale_h = get_scale_value(scale_factors, 1);
-  auto scale_w = get_scale_value(scale_factors, 2);
-  return at::_upsample_nearest_exact3d_backward(grad_output, osize, input_size, scale_d, scale_h, scale_w);
-}
-
 DEFINE_DISPATCH(upsample_nearest3d_kernel);
 DEFINE_DISPATCH(_upsample_nearest_exact3d_kernel);
 DEFINE_DISPATCH(upsample_nearest3d_backward_kernel);
diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp
index 76bc4da85addb..1bf9c8f6cb4ee 100644
--- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp
+++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp
@@ -111,19 +111,6 @@ Tensor upsample_trilinear3d(
   return at::upsample_trilinear3d(input, osize, align_corners, scale_d, scale_h, scale_w);
 }
 
-Tensor upsample_trilinear3d_backward(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    bool align_corners,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_d = get_scale_value(scale_factors, 0);
-  auto scale_h = get_scale_value(scale_factors, 1);
-  auto scale_w = get_scale_value(scale_factors, 2);
-  return at::upsample_trilinear3d_backward(grad_output, osize, input_size, align_corners, scale_d, scale_h, scale_w);
-}
-
 DEFINE_DISPATCH(upsample_trilinear3d_kernel);
 DEFINE_DISPATCH(upsample_trilinear3d_backward_kernel);
 
diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
index 1a4afa012d780..58f14ad491a69 100644
--- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
+++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
@@ -337,52 +337,5 @@ TORCH_IMPL_FUNC(_upsample_nearest_exact3d_backward_out_cuda) (
 using at::native::upsample::compute_output_size;
 using at::native::upsample_cuda::get_scale_value;
 
-Tensor upsample_nearest3d_cuda(
-    const Tensor& input,
-    at::OptionalIntArrayRef output_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input.sizes(), output_size, scale_factors);
-  auto scale_d = get_scale_value(scale_factors, 0);
-  auto scale_h = get_scale_value(scale_factors, 1);
-  auto scale_w = get_scale_value(scale_factors, 2);
-  return at::upsample_nearest3d(input, osize, scale_d, scale_h, scale_w);
-}
-
-Tensor _upsample_nearest_exact3d_cuda(
-    const Tensor& input,
-    at::OptionalIntArrayRef output_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input.sizes(), output_size, scale_factors);
-  auto scale_d = get_scale_value(scale_factors, 0);
-  auto scale_h = get_scale_value(scale_factors, 1);
-  auto scale_w = get_scale_value(scale_factors, 2);
-  return at::_upsample_nearest_exact3d(input, osize, scale_d, scale_h, scale_w);
-}
-
-// when structured kernels can handle QuantizedCPU, update these overloads to be CompositeExplicitAutograd
-Tensor upsample_nearest3d_backward_cuda(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_d = get_scale_value(scale_factors, 0);
-  auto scale_h = get_scale_value(scale_factors, 1);
-  auto scale_w = get_scale_value(scale_factors, 2);
-  return at::upsample_nearest3d_backward(grad_output, osize, input_size, scale_d, scale_h, scale_w);
-}
-
-Tensor _upsample_nearest_exact3d_backward_cuda(
-    const Tensor& grad_output,
-    at::OptionalIntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input_size, output_size, scale_factors);
-  auto scale_d = get_scale_value(scale_factors, 0);
-  auto scale_h = get_scale_value(scale_factors, 1);
-  auto scale_w = get_scale_value(scale_factors, 2);
-  return at::_upsample_nearest_exact3d_backward(grad_output, osize, input_size, scale_d, scale_h, scale_w);
-}
-
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 2a48f5de8fcc3..74521341b64e4 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -11043,158 +11043,54 @@
 
 - func: upsample_linear1d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_linear1d
   autogen: upsample_linear1d.vec_out
 
-- func: upsample_linear1d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_linear1d_backward
-  autogen: upsample_linear1d_backward.vec_out
-
 - func: upsample_bilinear2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_bilinear2d
   autogen: upsample_bilinear2d.vec_out
   tags: canonical
 
-- func: upsample_bilinear2d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_bilinear2d_backward
-  autogen: upsample_bilinear2d_backward.vec_out
-  tags: canonical
-
 - func: _upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: _upsample_bilinear2d_aa
   autogen: _upsample_bilinear2d_aa.vec_out
 
-- func: _upsample_bilinear2d_aa_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: _upsample_bilinear2d_aa_backward
-  autogen: _upsample_bilinear2d_aa_backward.vec_out
-
 - func: upsample_trilinear3d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_trilinear3d
   autogen: upsample_trilinear3d.vec_out
 
-- func: upsample_trilinear3d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_trilinear3d_backward
-  autogen: upsample_trilinear3d_backward.vec_out
-
 - func: upsample_bicubic2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_bicubic2d
   autogen: upsample_bicubic2d.vec_out
 
-- func: upsample_bicubic2d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_bicubic2d_backward
-  autogen: upsample_bicubic2d_backward.vec_out
-
 - func: _upsample_bicubic2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: _upsample_bicubic2d_aa
   autogen: _upsample_bicubic2d_aa.vec_out
 
-- func: _upsample_bicubic2d_aa_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: _upsample_bicubic2d_aa_backward
-  autogen: _upsample_bicubic2d_aa_backward.vec_out
-
 - func: upsample_nearest1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_nearest1d
   autogen: upsample_nearest1d.vec_out
 
 - func: _upsample_nearest_exact1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: _upsample_nearest_exact1d
   autogen: _upsample_nearest_exact1d.vec_out
 
-- func: upsample_nearest1d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_nearest1d_backward
-  autogen: upsample_nearest1d_backward.vec_out
-
-- func: _upsample_nearest_exact1d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: _upsample_nearest_exact1d_backward
-  autogen: _upsample_nearest_exact1d_backward.vec_out
-
 - func: upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_nearest2d
   autogen: upsample_nearest2d.vec_out
   tags: canonical
 
 - func: _upsample_nearest_exact2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: _upsample_nearest_exact2d
   autogen: _upsample_nearest_exact2d.vec_out
 
-- func: upsample_nearest2d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: upsample_nearest2d_backward
-  autogen: upsample_nearest2d_backward.vec_out
-  tags: canonical
-
-- func: _upsample_nearest_exact2d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CompositeExplicitAutograd: _upsample_nearest_exact2d_backward
-  autogen: _upsample_nearest_exact2d_backward.vec_out
-
 - func: upsample_nearest3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CPU: upsample_nearest3d_cpu
-    CUDA: upsample_nearest3d_cuda
-    QuantizedCPU: upsample_nearest3d_quantized_cpu
   autogen: upsample_nearest3d.vec_out
 
 - func: _upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
   python_module: nn
-  dispatch:
-    CPU: _upsample_nearest_exact3d_cpu
-    CUDA: _upsample_nearest_exact3d_cuda
-    QuantizedCPU: _upsample_nearest_exact3d_quantized_cpu
   autogen: _upsample_nearest_exact3d.vec_out
 
-- func: upsample_nearest3d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CPU: upsample_nearest3d_backward_cpu
-    CUDA: upsample_nearest3d_backward_cuda
-  autogen: upsample_nearest3d_backward.vec_out
-
-- func: _upsample_nearest_exact3d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  python_module: nn
-  dispatch:
-    CPU: _upsample_nearest_exact3d_backward_cpu
-    CUDA: _upsample_nearest_exact3d_backward_cuda
-  autogen: _upsample_nearest_exact3d_backward.vec_out
-
 # NOTE: all of the non-"vec" upsample overloads are only kept for backward compatibility.
 - func: upsample_linear1d.out(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp
index 871f700ef4fb1..4b4c63eb7c3d3 100644
--- a/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp
+++ b/aten/src/ATen/native/quantized/cpu/UpSampleNearest3d.cpp
@@ -238,27 +238,5 @@ Tensor _upsample_nearest_exact3d_quantized_cpu(
       input, osize, scale_d, scale_h, scale_w);
 }
 
-Tensor upsample_nearest3d_quantized_cpu(
-    const Tensor& input,
-    at::OptionalIntArrayRef output_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input.sizes(), output_size, scale_factors);
-  auto scale_d = get_scale_value(scale_factors, 0);
-  auto scale_h = get_scale_value(scale_factors, 1);
-  auto scale_w = get_scale_value(scale_factors, 2);
-  return upsample_nearest3d_quantized_cpu(input, osize, scale_d, scale_h, scale_w);
-}
-
-Tensor _upsample_nearest_exact3d_quantized_cpu(
-    const Tensor& input,
-    at::OptionalIntArrayRef output_size,
-    c10::optional<ArrayRef<double>> scale_factors) {
-  auto osize = compute_output_size(input.sizes(), output_size, scale_factors);
-  auto scale_d = get_scale_value(scale_factors, 0);
-  auto scale_h = get_scale_value(scale_factors, 1);
-  auto scale_w = get_scale_value(scale_factors, 2);
-  return _upsample_nearest_exact3d_quantized_cpu(input, osize, scale_d, scale_h, scale_w);
-}
-
 } // namespace native
 } // namespace at
diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
index 30e398dbf1e0d..e9d0834a812c0 100644
--- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py
+++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
@@ -293,6 +293,30 @@
     ("aten::_nested_tensor_layer_norm", datetime.date(2022, 10, 15)),
     ("aten::_torch_cuda_cu_linker_symbol_op", datetime.date(2022, 11, 1)),
 
+    ("aten::upsample_linear1d_backward", datetime.date(2022, 12, 15)),
+    ("aten::upsample_bicubic2d_backward", datetime.date(2022, 12, 15)),
+    ("aten::upsample_trilinear3d", datetime.date(2022, 12, 15)),
+    ("aten::upsample_bilinear2d", datetime.date(2022, 12, 15)),
+    ("aten::upsample_nearest3d", datetime.date(2022, 12, 15)),
+    ("aten::upsample_nearest2d_backward", datetime.date(2022, 12, 15)),
+    ("aten::upsample_bilinear2d_backward", datetime.date(2022, 12, 15)),
+    ("aten::upsample_trilinear3d_backward", datetime.date(2022, 12, 15)),
+    ("aten::upsample_nearest2d", datetime.date(2022, 12, 15)),
+    ("aten::upsample_bicubic2d", datetime.date(2022, 12, 15)),
+    ("aten::upsample_nearest1d_backward", datetime.date(2022, 12, 15)),
+    ("aten::upsample_nearest3d_backward", datetime.date(2022, 12, 15)),
+    ("aten::upsample_linear1d", datetime.date(2022, 12, 15)),
+    ("aten::upsample_nearest1d", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_nearest_exact3d", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_nearest_exact3d_backward", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_bilinear2d_aa", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_bilinear2d_aa_backward", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_bicubic2d_aa", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_bicubic2d_aa_backward", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_nearest_exact1d", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_nearest_exact1d_backward", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_nearest_exact2d", datetime.date(2022, 12, 15)),
+    ("aten::_upsample_nearest_exact2d_backward", datetime.date(2022, 12, 15)),
 ]
 
 ALLOW_LIST_COMPILED = [
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index b0b5f884b44de..21d1b5854cf25 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -2824,10 +2824,10 @@ def fn(a):
     def test_upsample_nearest2d(self):
         def fn(a):
             return (
-                aten.upsample_nearest2d(a, [74, 76], None),
-                aten.upsample_nearest2d(a, [70, 75], None),
-                aten.upsample_nearest2d(a, [45, 74], None),
-                aten.upsample_nearest2d(a, [36, 39], None),
+                aten.upsample_nearest2d(a, [74, 76]),
+                aten.upsample_nearest2d(a, [70, 75]),
+                aten.upsample_nearest2d(a, [45, 74]),
+                aten.upsample_nearest2d(a, [36, 39]),
                 aten.upsample_nearest2d(a, None, [2.0, 2.0]),
             )
 
@@ -2846,25 +2846,15 @@ def fn(a):
         self.common(fn, (torch.randn([2, 4, 37, 38, 39]),))
 
     def test_upsample_nearest2d_backward(self):
-        func = torch.ops.aten.upsample_nearest2d_backward.vec
+        func = torch.ops.aten.upsample_nearest2d_backward
 
         def fn(a):
             return (
-                func(
-                    a, output_size=[6, 12], input_size=[3, 3, 3, 6], scale_factors=None
-                ),
-                func(
-                    a, output_size=[6, 12], input_size=[3, 3, 4, 5], scale_factors=None
-                ),
-                func(
-                    a, output_size=[6, 12], input_size=[3, 3, 2, 8], scale_factors=None
-                ),
-                func(
-                    a, output_size=[6, 12], input_size=[3, 3, 2, 8], scale_factors=None
-                ),
-                func(
-                    a, output_size=[6, 12], input_size=[3, 3, 4, 7], scale_factors=None
-                ),
+                func(a, output_size=[6, 12], input_size=[3, 3, 3, 6]),
+                func(a, output_size=[6, 12], input_size=[3, 3, 4, 5]),
+                func(a, output_size=[6, 12], input_size=[3, 3, 2, 8]),
+                func(a, output_size=[6, 12], input_size=[3, 3, 2, 8]),
+                func(a, output_size=[6, 12], input_size=[3, 3, 4, 7]),
             )
 
         self.common(fn, (torch.randn([3, 3, 6, 12]),))
diff --git a/test/test_autograd.py b/test/test_autograd.py
index dd3ecf3323d38..f5d890fad2d7f 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -5447,9 +5447,7 @@ def test_grad_fn_attr_bindings(self):
 
         a = torch.ones(1, 1, 2, requires_grad=True)
         out = torch.nn.functional.interpolate(a, scale_factor=0.5, mode="linear")
-        self.assertIsNone(out.grad_fn._saved_output_size)
-        self.assertEqual(out.grad_fn._saved_scale_factors, (0.5,))
-        self.assertIsInstance(out.grad_fn._saved_scale_factors[0], float)
+        self.assertEqual(out.grad_fn._saved_scales, 0.5)
 
         a = torch.ones(2, 2, requires_grad=True)
         out = torch.pdist(a, p=1)
diff --git a/test/test_decomp.py b/test/test_decomp.py
index dff62bea17db1..67e99d5eb8291 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -198,8 +198,9 @@ def op_assert_equal(test_case, op, test_dtype, orig, decomp, args, kwargs):
         (torch.float32, torch.ops.aten.grid_sampler_2d.default) : (7e-6, 3e-5),
         # Exceeds tolerances on CUDA, likely due to fma
         (torch.float32, torch.ops.aten.mv.default) : (1e-5, 3e-5),
-        (torch.float64, torch.ops.aten.upsample_bicubic2d.vec) : (1e-5, 1e-6),
         (torch.complex64, torch.ops.aten.mv.default): (5e-5, 5e-5),
+        (torch.float64, torch.ops.aten.upsample_bicubic2d.vec) : (1e-5, 5e-4),
+        (torch.float64, torch.ops.aten.upsample_bicubic2d.default) : (1e-5, 5e-4),
     }
     if (test_dtype, op) in tol_table:
         rtol, atol = tol_table[(decomp.dtype, op)]
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index dbe709007f738..8349a308be35a 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2082,53 +2082,6 @@
   self: _upsample_nearest_exact3d_backward_symint(grad, output_size, self.sym_sizes(), scales_d, scales_h, scales_w)
   result: auto_linear
 
-- name: upsample_linear1d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
-  input: upsample_linear1d_backward_symint(grad, output_size, input.sym_sizes(), align_corners, scale_factors)
-  result: auto_linear
-
-- name: upsample_bilinear2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
-  input: upsample_bilinear2d_backward_symint(grad, output_size, input.sym_sizes(), align_corners, scale_factors)
-  result: auto_linear
-
-- name: _upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
-  input: _upsample_bilinear2d_aa_backward_symint(grad, output_size, input.sym_sizes(), align_corners, scale_factors)
-  result: auto_linear
-
-- name: upsample_trilinear3d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
-  input: upsample_trilinear3d_backward_symint(grad, output_size, input.sym_sizes(), align_corners, scale_factors)
-  result: auto_linear
-
-- name: upsample_bicubic2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
-  input: upsample_bicubic2d_backward_symint(grad, output_size, input.sym_sizes(), align_corners, scale_factors)
-  result: auto_linear
-
-- name: _upsample_bicubic2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
-  input: _upsample_bicubic2d_aa_backward_symint(grad, output_size, input.sym_sizes(), align_corners, scale_factors)
-
-- name: upsample_nearest1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
-  input: upsample_nearest1d_backward_symint(grad, output_size, input.sym_sizes(), scale_factors)
-  result: auto_linear
-
-- name: _upsample_nearest_exact1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
-  input: _upsample_nearest_exact1d_backward_symint(grad, output_size, input.sym_sizes(), scale_factors)
-  result: auto_linear
-
-- name: upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
-  input: upsample_nearest2d_backward_symint(grad, output_size, input.sym_sizes(), scale_factors)
-  result: auto_linear
-
-- name: _upsample_nearest_exact2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
-  input: _upsample_nearest_exact2d_backward_symint(grad, output_size, input.sym_sizes(), scale_factors)
-  result: auto_linear
-
-- name: upsample_nearest3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
-  input: upsample_nearest3d_backward_symint(grad, output_size, input.sym_sizes(), scale_factors)
-  result: auto_linear
-
-- name: _upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
-  input: _upsample_nearest_exact3d_backward_symint(grad, output_size, input.sym_sizes(), scale_factors)
-  result: auto_linear
-
 - name: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
   self: pixel_unshuffle(grad, upscale_factor)
   result: auto_linear
@@ -2507,53 +2460,6 @@
   grad_output: _upsample_nearest_exact3d_symint(grad, output_size, scales_d, scales_h, scales_w)
   result: auto_linear
 
-- name: upsample_linear1d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  grad_output: upsample_linear1d_symint(grad, output_size, align_corners, scale_factors)
-  result: auto_linear
-
-- name: upsample_bilinear2d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  grad_output: upsample_bilinear2d_symint(grad, output_size, align_corners, scale_factors)
-  result: auto_linear
-
-- name: _upsample_bilinear2d_aa_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  grad_output: _upsample_bilinear2d_aa_symint(grad, output_size, align_corners, scale_factors)
-  result: auto_linear
-
-- name: upsample_trilinear3d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  grad_output: upsample_trilinear3d_symint(grad, output_size, align_corners, scale_factors)
-  result: auto_linear
-
-- name: upsample_bicubic2d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  grad_output: upsample_bicubic2d_symint(grad, output_size, align_corners, scale_factors)
-  result: auto_linear
-
-- name: _upsample_bicubic2d_aa_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, bool align_corners, float[]? scale_factors) -> Tensor
-  grad_output: _upsample_bicubic2d_aa_symint(grad, output_size, align_corners, scale_factors)
-
-- name: upsample_nearest1d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  grad_output: upsample_nearest1d_symint(grad, output_size, scale_factors)
-  result: auto_linear
-
-- name: _upsample_nearest_exact1d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  grad_output: _upsample_nearest_exact1d_symint(grad, output_size, scale_factors)
-  result: auto_linear
-
-- name: upsample_nearest2d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  grad_output: upsample_nearest2d_symint(grad, output_size, scale_factors)
-  result: auto_linear
-
-- name: _upsample_nearest_exact2d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  grad_output: _upsample_nearest_exact2d_symint(grad, output_size, scale_factors)
-  result: auto_linear
-
-- name: upsample_nearest3d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  grad_output: upsample_nearest3d_symint(grad, output_size, scale_factors)
-  result: auto_linear
-
-- name: _upsample_nearest_exact3d_backward.vec(Tensor grad_output, SymInt[]? output_size, SymInt[] input_size, float[]? scale_factors) -> Tensor
-  grad_output: _upsample_nearest_exact3d_symint(grad, output_size, scale_factors)
-  result: auto_linear
-
 - name: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor
   grad_output: sigmoid_backward(grad, output.conj())
   output: grad.conj() * grad_output * (-2 * output.conj() + 1)
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index b4c8087537c6f..d7aa5e35f5010 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -101,6 +101,8 @@
         aten.unfold_backward,
         aten.upsample_bilinear2d.vec,
         aten.upsample_nearest2d_backward,
+        aten.softplus,
+        aten.softplus_backward,
     ]
 )
 
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index ece23f1063334..5ec83dcf507bd 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -2028,20 +2028,20 @@ def backend_reduce_str(reduce):
     return self
 
 
-def upsample_nearestnd(x, output_size=None, scale_factors=None, n=2):
+def upsample_nearestnd(x, output_size, scales_x: Tuple[float] = None, n: int = 2):
     x.realize_hint()  # elements are reused
     x_loader = x.make_loader()
     i_sizes = x.get_size()[-n:]
     batch = x.get_size()[:-n]
     i_sizes = [V.graph.sizevars.guard_static_shape(i) for i in i_sizes]
 
-    if scale_factors:
-        assert not output_size
-        o_sizes = [int(i * s) for i, s in zip(i_sizes, scale_factors)]
-    else:
-        o_sizes = output_size
+    assert len(scales_x) == n
+    o_sizes = output_size
 
     scales = [i / o for i, o in zip(i_sizes, o_sizes)]
+    for i, scale in enumerate(scales):
+        if scale:
+            scales[i] = scale
 
     def scale(x, scale):
         x = ops.index_expr(x, torch.float32)
@@ -2062,9 +2062,27 @@ def fn(idx):
     )
 
 
-register_lowering(aten.upsample_nearest1d)(functools.partial(upsample_nearestnd, n=1))
-register_lowering(aten.upsample_nearest2d)(functools.partial(upsample_nearestnd, n=2))
-register_lowering(aten.upsample_nearest3d)(functools.partial(upsample_nearestnd, n=3))
+@register_lowering(aten.upsample_nearest1d.default)
+def upsample_nearest1d(x, output_size, scales: Optional[float] = None):
+    return upsample_nearestnd(x, output_size, (scales,), n=1)
+
+
+@register_lowering(aten.upsample_nearest2d.default)
+def upsample_nearest2d(
+    x, output_size, scales_h: Optional[float] = None, scales_w: Optional[float] = None
+):
+    return upsample_nearestnd(x, output_size, (scales_h, scales_w), n=2)
+
+
+@register_lowering(aten.upsample_nearest3d.default)
+def upsample_nearest3d(
+    x,
+    output_size,
+    scales_d: Optional[float] = None,
+    scales_h: Optional[float] = None,
+    scales_w: Optional[float] = None,
+):
+    return upsample_nearestnd(x, output_size, (scales_d, scales_h, scales_w), n=3)
 
 
 @register_lowering(aten.upsample_bicubic2d.default)
@@ -2182,26 +2200,6 @@ def get_x_interp(y):
     )
 
 
-@register_lowering(aten.upsample_bicubic2d.vec)
-def upsample_bicubic2d_vec(
-    a,
-    output_size,
-    align_corners: bool,
-    scale_factors: Optional[Tuple[float, float]] = None,
-):
-    _, _, iH, iW = a.get_size()
-    iH = V.graph.sizevars.guard_static_shape(iH)
-    iW = V.graph.sizevars.guard_static_shape(iW)
-
-    if bool(output_size) + bool(scale_factors) != 1:
-        raise RuntimeError("Must specify exactly one of output_size and scale_factor.")
-    if output_size is None:
-        assert scale_factors is not None
-        output_size = (int(iH * scale_factors[0]), int(iW * scale_factors[1]))
-    scale_h, scale_w = scale_factors if scale_factors else (None, None)
-    return upsample_bicubic2d_default(a, output_size, align_corners, scale_h, scale_w)
-
-
 @register_lowering(aten.reflection_pad2d)
 def reflection_pad2d(x, padding):
     assert len(padding) == 4
@@ -2719,9 +2717,9 @@ def fn(idx):
     return rv
 
 
-@register_lowering(aten.upsample_nearest2d_backward.vec)
+@register_lowering(aten.upsample_nearest2d_backward.default)
 def upsample_nearest2d_backward(
-    x, output_size=None, input_size=None, scale_factors=None
+    x, output_size=None, input_size=None, scales_h=None, scales_w=None
 ):
     x.realize_hint()
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index da18ba45a1ebb..f4fd6e77db9c0 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1670,32 +1670,6 @@ def meta_scatter_add(self, dim, index, src):
     return self.new_empty(self.shape)
 
 
-@register_meta(aten.upsample_nearest2d.vec)
-def upsample_nearest2d_vec(input, output_size, scale_factors):
-    mem_format = utils.suggest_memory_format(input)
-    spatial_dimensions = input.dim() - 2
-
-    input_shape = input.shape
-    if output_size is not None:
-        assert scale_factors is None
-        out_size = output_size
-    elif scale_factors is not None:
-        assert output_size is None
-        out_size = []
-        for i in range(spatial_dimensions):
-            sym_float = (input_shape[i + 2] / 1) * scale_factors[i]
-            assert sym_float >= 0
-            out_size.append(math.floor(sym_float))
-
-    output_height = out_size[0]
-    output_width = out_size[1]
-    nbatch = input_shape[0]
-    channels = input_shape[1]
-    return input.new_empty((nbatch, channels, output_height, output_width)).to(
-        memory_format=mem_format
-    )
-
-
 # We must also trigger meta registrations from PrimTorch ref
 # decompositions
 import torch._refs

From 14b53fa864deb76192677716c1e770c30f64a70a Mon Sep 17 00:00:00 2001
From: blzheng <beilei.zheng@intel.com>
Date: Wed, 9 Nov 2022 10:40:23 +0000
Subject: [PATCH 0698/1922] [Inductor] fix c++ compile error with masked float
 value init (#88298)

Fixes #88201

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88298
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 10 ++++++++++
 torch/_inductor/codegen/cpp.py      |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 21d1b5854cf25..d001f7b0c2181 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4294,6 +4294,16 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                 else:
                     self.assertEqual(len(inps), 0)
 
+    def test_dtype_mismatch_issue(self):
+        def fn(x):
+            attn = torch.nn.functional.pad(x, [0, 1])
+            return attn.softmax(dim=-1)
+
+        x = torch.rand(128, 32, 63)
+        res_ref = fn(x)
+        res = torch._dynamo.optimize("inductor")(fn)(x)
+        self.assertEqual(res, res_ref)
+
     @unittest.skipIf(HAS_CUDA, "histogramdd only supports cpu")
     def test_kwargs(self):
         def fn(x, y):
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index bd7dc9dba88f5..90ae4b44d5795 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -462,6 +462,8 @@ def masked(mask, body, other):
             code.writeline(f"float {var} = -std::numeric_limits<float>::infinity();")
         elif other == float("inf"):
             code.writeline(f"float {var} = std::numeric_limits<float>::infinity();")
+        elif isinstance(other, float):
+            code.writeline(f"float {var} = {other};")
         else:
             code.writeline(f"auto {var} = {other!r};")
         code.writeline(f"if({mask})")

From 87da3e2faf6985c1da6a6530573fc7539eadf154 Mon Sep 17 00:00:00 2001
From: Sean Ross-Ross <srossross@gmail.com>
Date: Tue, 8 Nov 2022 12:35:40 -0600
Subject: [PATCH 0699/1922] Changing as_strided_scatter to deterministic inputs
 (#85583)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85583
Approved by: https://github.com/mruberry
---
 .../_internal/common_methods_invocations.py   | 26 ++++++++++++++-----
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index a7a492dcd526e..f69bbe20ed36a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -274,8 +274,11 @@ def sample_inputs_as_strided_scatter(op_info, device, dtype, requires_grad, **kw
         ((1,), (1,), (1,), 0),
         ((3, 3), (2, 2), (1, 2), 0),
         ((3, 3), (2, 2), (1, 2), 1),
-        ((16,), (2, 2, 2, 2), (1, 1, 1, 1), 0),
-        ((16,), (2, 1, 1, 2), (1, 7, 7, 1), 0),
+        ((3, 3), (2, 2), (2, 1), 0),
+        # Scatter to larger dimentions
+        ((16,), (2, 2, 2, 2), (8, 4, 2, 1), 0),
+        # Scatter to larger dimentions with strides inverted
+        ((16,), (2, 1, 1, 2), (1, 2, 4, 8), 0),
     ]
 
     for input_shape, output_shape, stride, storage_offset in test_cases:
@@ -283,6 +286,18 @@ def sample_inputs_as_strided_scatter(op_info, device, dtype, requires_grad, **kw
         input_src = make_arg(output_shape)
         yield SampleInput(input_t, input_src, output_shape, stride, storage_offset=storage_offset)
 
+def error_inputs_as_strided_scatter(op_info, device, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=torch.float32, requires_grad=False)
+
+    # Create a small tensor and try to scatter it out of bounds
+    input_t = make_arg([4, 4])
+    input_src = make_arg([2, 2])
+    yield ErrorInput(
+        SampleInput(input_t, input_src, [2, 2], [200, 200], storage_offset=0),
+        error_regex="itemsize 4 requiring a storage size of 1604 are out of bounds for storage of size 64"
+    )
+
+
 def sample_inputs_combinations(op_info, device, dtype, requires_grad, **kwargs):
     inputs = (
         (0,),
@@ -10513,18 +10528,15 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            # vmap does not support inplace views
            check_inplace_batched_forward_grad=False,
            sample_inputs_func=sample_inputs_as_strided_scatter,
+           error_inputs_func=error_inputs_as_strided_scatter,
            skips=(
                DecorateInfo(unittest.skip('Works for int64, fails for everything else'), 'TestCommon', 'test_noncontiguous_samples'),  # noqa: B950
                DecorateInfo(unittest.skip('Fails in most cases, passes on LAZY for some reason'), 'TestCommon', 'test_variant_consistency_eager'),  # noqa: B950
                DecorateInfo(unittest.skip('Fails on cuda + rocm'), 'TestCommon', 'test_complex_half_reference_testing'),
                DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_grad'),
-               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_fn_gradgrad'),
                DecorateInfo(unittest.skip('Passes on complex128 and float64 only'), 'TestFwdGradients', 'test_fn_fwgrad_bwgrad'),
                # AssertionError: Tensor-likes are not close! (new_empty_strided.default)
-               DecorateInfo(unittest.skip("Expected: new_empty_strided is not comparable"), 'TestDecomp', 'test_comprehensive'),
-               DecorateInfo(
-                   unittest.skip("Some stride values write multiple values to the same location e.g. (1,1,1,1)"),
-                   'TestCommon', 'test_compare_cpu'),)),
+               DecorateInfo(unittest.skip("Expected: new_empty_strided is not comparable"), 'TestDecomp', 'test_comprehensive'),)),
     OpInfo('native_layer_norm',
            aten_name='native_layer_norm',
            ref=reference_native_layer_norm,

From 0a0dd9883bb97e1ab145fcfc8f42416ae44088d5 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Wed, 9 Nov 2022 12:56:55 +0000
Subject: [PATCH 0700/1922] Add min cut partitioner for AOT+nvFuser (#88204)

Here we mark most of `torch.ops.nvprims` as something that can be recomputed in the backward passes (and hopefully fused).

TODO:
- [x] Add a test after https://github.com/pytorch/pytorch/pull/88186 is merged

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88204
Approved by: https://github.com/jjsjann123, https://github.com/jansel
---
 test/test_nvfuser_dynamo.py             | 65 +++++++++++++++++++-
 torch/_dynamo/optimizations/training.py | 47 +++++++++++++-
 torch/_prims/nvfuser_prims.py           | 81 +++++++++++++++++++++++++
 3 files changed, 189 insertions(+), 4 deletions(-)

diff --git a/test/test_nvfuser_dynamo.py b/test/test_nvfuser_dynamo.py
index 749cae87411b9..e59ead80fe13c 100644
--- a/test/test_nvfuser_dynamo.py
+++ b/test/test_nvfuser_dynamo.py
@@ -2,16 +2,17 @@
 
 import unittest
 import warnings
+from functools import partial
 
 import torch
 import torch._dynamo as torchdynamo
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
+    IS_WINDOWS,
     run_tests,
     skipIfTorchDynamo,
     TEST_WITH_ROCM,
     TestCase,
-    IS_WINDOWS,
 )
 from torch.testing._internal.jit_utils import RUN_CUDA
 
@@ -25,6 +26,15 @@ def is_pre_volta():
     return prop.major < 7
 
 
+def is_networkx_available():
+    try:
+        import networkx  # noqa: F401
+
+        return True
+    except ImportError:
+        return False
+
+
 @skipIfTorchDynamo("Not a suitable test for TorchDynamo")
 @unittest.skipIf(IS_WINDOWS, "TorchDynamo is not supported on Windows")
 @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
@@ -45,6 +55,59 @@ def func(a, b):
         eager_result = func.__wrapped__(input1, input2)
         self.assertEqual(eager_result, nvfuser_result)
 
+    @unittest.skipIf(not is_networkx_available(), "networkx not available")
+    def test_min_cut(self):
+        from functorch.compile import default_partition
+        from torch._dynamo.optimizations.training import nvprims_fw_bw_partition_fn
+
+        def get_fw_bw_graph(f, inps, partitioner):
+            from functorch.compile import aot_function
+
+            # Helper functions are taken from functorch/test_aotdispatch.py
+            def extract_graph(fx_g, _, graph_cell):
+                graph_cell[0] = fx_g
+                return fx_g
+
+            fw_graph_cell = [None]
+            bw_graph_cell = [None]
+            aot_function(
+                f,
+                fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell),
+                bw_compiler=partial(extract_graph, graph_cell=bw_graph_cell),
+                partition_fn=partitioner,
+            )(*inps).sum().backward()
+            return (fw_graph_cell[0], bw_graph_cell[0])
+
+        def get_ins_outs(fx_g):
+            ins = []
+            outs = []
+            for n in fx_g.graph.nodes:
+                if n.op == "placeholder":
+                    ins.append(n)
+                elif n.op == "output":
+                    outs = tuple(n.args[0])
+            return ins, outs
+
+        def get_num_ins_outs(fx_g):
+            return tuple(len(i) for i in get_ins_outs(fx_g))
+
+        def func(x):
+            return x * x * x
+
+        input1 = make_tensor(
+            (3,), device="cpu", dtype=torch.float32, requires_grad=True
+        )
+        fw_graph, bw_graph = get_fw_bw_graph(func, [input1], default_partition)
+        self.assertEqual(get_num_ins_outs(fw_graph), (1, 3))
+        self.assertEqual(get_num_ins_outs(bw_graph), (3, 1))
+
+        input1 = make_tensor(
+            (3,), device="cpu", dtype=torch.float32, requires_grad=True
+        )
+        fw_graph, bw_graph = get_fw_bw_graph(func, [input1], nvprims_fw_bw_partition_fn)
+        self.assertEqual(get_num_ins_outs(fw_graph), (1, 2))
+        self.assertEqual(get_num_ins_outs(bw_graph), (2, 1))
+
     def test_batch_norm_implicit_dtype_promotion(self):
         input1 = make_tensor((2, 3, 4, 5), device="cuda", dtype=torch.float32)
         input2 = make_tensor((5, 5), device="cuda", dtype=torch.float32)
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index af673a2b2c1e7..49f9a4397dd99 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -275,13 +275,51 @@ def prims_executor(gm, inputs, *, executor):
     from torch._prims.executor import execute
     from torch.fx.experimental.proxy_tensor import make_fx
 
+    # AOT Autograd might not use the partitioner, so we need to make sure that
+    # the graph is transformed to use nvFuser-compatible nodes.
+    if not getattr(gm, "_nvprim_transformed", False):
+        with TorchRefsNvfuserCapabilityMode():
+            gm = make_fx(gm)(*inputs)
+
+    # Then we return a callable that executes the "gm" graph
+    return make_boxed_func(partial(execute, gm, executor=executor))
+
+
+def nvprims_fw_bw_partition_fn(joint_module, joint_inputs):
+    # This function is called once per forward+backward pass of a graph in AOT
+    # Autograd. We use it to set up the nvFuser-specific FX graph that is later
+    # passed to the executor.
+    from functorch.compile import min_cut_rematerialization_partition
+
+    from torch._prims.context import TorchRefsNvfuserCapabilityMode
+    from torch.fx.experimental.proxy_tensor import make_fx
+
+    # AOT Autograd expects arguments of the traced function to be named exactly
+    # "primals, tangents"
+    def func(primals, tangents):
+        return joint_module(primals, tangents)
+
     # First we trace the graph conditionally decomposing nodes
     # that can be sent to the nvfuser executor
     with TorchRefsNvfuserCapabilityMode():
-        prim_gm = make_fx(gm)(*inputs)
+        prim_gm = make_fx(func)(*joint_inputs)
+
+    # all nvprims for now
+    recomputable_ops = {
+        getattr(torch.ops.nvprims, prim)
+        for prim in dir(torch.ops.nvprims)
+        if isinstance(getattr(torch.ops.nvprims, prim), torch._ops.OpOverloadPacket)
+        and getattr(torch.ops.nvprims, prim).is_recomputable
+    }
 
-    # Then we return a callable that executes the "prim_gm" graph
-    return make_boxed_func(partial(execute, prim_gm, executor=executor))
+    fw_gm, bw_gm = min_cut_rematerialization_partition(
+        prim_gm, joint_inputs, recomputable_ops=recomputable_ops
+    )
+    # AOT Autograd might not use the partitioner, so we need to make sure that
+    # the graph is marked as already transformed to use nvFuser-compatible nodes
+    fw_gm._nvprim_transformed = True
+    bw_gm._nvprim_transformed = True
+    return fw_gm, bw_gm
 
 
 def create_nvprims_backend(*, executor):
@@ -291,11 +329,14 @@ def __init__(self, gm: torch.fx.GraphModule, example_inputs):
             self.executor = executor
 
         def candidate(self):
+            from torch._dynamo import disable
+
             return BACKENDS["aot_autograd"](
                 self.gm,
                 self.example_inputs,
                 fw_compiler=partial(prims_executor, executor=self.executor),
                 bw_compiler=partial(prims_executor, executor=self.executor),
+                partition_fn=disable(nvprims_fw_bw_partition_fn),
             )
 
     return NvPrims
diff --git a/torch/_prims/nvfuser_prims.py b/torch/_prims/nvfuser_prims.py
index 59a88203651e7..3da16ab3aa275 100644
--- a/torch/_prims/nvfuser_prims.py
+++ b/torch/_prims/nvfuser_prims.py
@@ -356,6 +356,82 @@ def _clone_nvfuser(fd: Any, input: TensorLikeType, *, memory_format=None):
 _nvfuser_impls["amax"] = _amax_nvfuser
 _nvfuser_impls["amin"] = _amin_nvfuser
 
+# functorch.compile.min_cut_rematerialization_partition accepts a list of
+# operators that can be recomputed in the backward pass. This list is used to
+# determine which operators can be recomputed. If an operator is not in this
+# list, it will not be recomputed.
+_nvfuser_is_recomputable: Dict[str, bool] = {
+    # Reductions are not allowed to be recomputed
+    "amax": False,
+    "amin": False,
+    "sum": False,
+    "var": False,
+    "var_mean": False,
+    # Normalizations are not allowed to be recomputed
+    "native_batch_norm": False,
+    # Random ops are not allowed to be recomputed
+    "rand_like": False,
+    # Everything else is allowed to be recomputed
+    "abs": True,
+    "acos": True,
+    "add": True,
+    "asin": True,
+    "atan": True,
+    "atan2": True,
+    "atanh": True,
+    "bitwise_and": True,
+    "bitwise_not": True,
+    "bitwise_or": True,
+    "bitwise_xor": True,
+    "broadcast_in_dim": True,
+    "ceil": True,
+    "clone": True,
+    "convert_element_type": True,
+    "cos": True,
+    "cosh": True,
+    "div": True,
+    "eq": True,
+    "erf": True,
+    "erfc": True,
+    "exp": True,
+    "expm1": True,
+    "floor": True,
+    "fmod": True,
+    "ge": True,
+    "gt": True,
+    "imag": True,
+    "isfinite": True,
+    "le": True,
+    "lgamma": True,
+    "log": True,
+    "log10": True,
+    "log1p": True,
+    "log2": True,
+    "lt": True,
+    "mul": True,
+    "ne": True,
+    "neg": True,
+    "pow": True,
+    "real": True,
+    "reciprocal": True,
+    "remainder": True,
+    "round": True,
+    "rsqrt": True,
+    "sign": True,
+    "sin": True,
+    "sinh": True,
+    "sqrt": True,
+    "squeeze": True,
+    "sub": True,
+    "tan": True,
+    "tanh": True,
+    "transpose": True,
+    "trunc": True,
+    "view": True,
+    "view_of": True,
+    "where": True,
+}
+
 
 def register_native_batch_norm():
     """This function is used to register the native_batch_norm function in torch.ops.nvprims module."""
@@ -432,6 +508,7 @@ def _native_batch_norm_autograd(
     for p in (prim_packet, prim):
         p.__doc__ = "Computes batch normalization."
         p.impl_nvfuser = _nvfuser_impls["native_batch_norm"]
+        p.is_recomputable = _nvfuser_is_recomputable["native_batch_norm"]
         p.return_type = torch._prims_common.RETURN_TYPE.NEW  # type: ignore[attr-defined]
 
 
@@ -490,6 +567,7 @@ def _prim_impl(
     for p in (prim_packet, prim):
         p.__doc__ = "Computes rand_like"
         p.impl_nvfuser = _nvfuser_impls["rand_like"]
+        p.is_recomputable = _nvfuser_is_recomputable["rand_like"]
         p.return_type = torch._prims_common.RETURN_TYPE.NEW  # type: ignore[attr-defined]
 
 
@@ -588,6 +666,7 @@ def _var_mean_autograd(
     for p in (prim_packet, prim):
         p.__doc__ = "Computes the variance and mean of x over the list of dimensions specified in the dim argument"
         p.impl_nvfuser = _nvfuser_impls["var_mean"]
+        p.is_recomputable = _nvfuser_is_recomputable["var_mean"]
         p.return_type = torch._prims_common.RETURN_TYPE.NEW  # type: ignore[attr-defined]
 
 
@@ -625,6 +704,7 @@ def _view_no_original_shape_overload_impl(a, shape):
     for p in (prim_packet, prim):
         p.__doc__ = "Creates a tensor with the specified shape containing a copy of the data in a."
         p.impl_nvfuser = _nvfuser_impls["view"]
+        p.is_recomputable = _nvfuser_is_recomputable["view"]
         p.return_type = torch._prims_common.RETURN_TYPE.VIEW  # type: ignore[attr-defined]
         p.impl_aten = _nvprims_view_impl_aten
 
@@ -651,5 +731,6 @@ def register_nvprims():
         for p in (prim_packet, prim):
             p.__doc__ = main_prim.__doc__
             p.impl_nvfuser = _nvfuser_impls[name]
+            p.is_recomputable = _nvfuser_is_recomputable.get(name, False)
             p.return_type = main_prim.return_type  # type: ignore[attr-defined]
             p.impl_aten = main_prim.impl_aten

From 75903400d61795700240136177186bbe9efc9653 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 8 Nov 2022 13:33:51 -0500
Subject: [PATCH 0701/1922] Mark diag.out composite (#88670)

It's implementation just redispatches, it works for more than CPU/CUDA.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88670
Approved by: https://github.com/anjali411
---
 aten/src/ATen/native/native_functions.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 74521341b64e4..b45e98425ce31 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -7733,8 +7733,6 @@
   autogen: geometric, geometric.out
 
 - func: diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
-  dispatch:
-    CPU, CUDA: diag_out
 
 - func: diag(Tensor self, int diagonal=0) -> Tensor
   variants: method, function

From 78e2be1fee4593c04675a87b20860bade111c256 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 7 Nov 2022 13:34:38 -0500
Subject: [PATCH 0702/1922] Add voznesenskym to symbolic-shapes group, move
 wconstab to listener (#88593)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88593
Approved by: https://github.com/anjali411
---
 .github/auto_request_review.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml
index 75f181228d177..91dbf35ae7694 100644
--- a/.github/auto_request_review.yml
+++ b/.github/auto_request_review.yml
@@ -4,16 +4,17 @@ reviewers:
     symbolic-shapes:
       - ezyang
       - Chillee
-      - wconstab
       - anjali411
       - albanD
       - miladm
       - bdhirsh
+      - voznesenskym
 
   per_author:
     symbolic-shapes:
       - symbolic-shapes
       - antoniojkim
+      - wconstab
 
 files:
   # none yet, TODO: migrate CODEOWNERS here

From acbe433e7a122d0f6b3cb61eff91c8d58d8fc8e6 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 9 Nov 2022 08:13:03 -0500
Subject: [PATCH 0703/1922] Add SherlockNoMad to symbolic-shapes reviewer list
 (#88739)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88739
Approved by: https://github.com/anjali411
---
 .github/auto_request_review.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml
index 91dbf35ae7694..339f085d939af 100644
--- a/.github/auto_request_review.yml
+++ b/.github/auto_request_review.yml
@@ -9,6 +9,7 @@ reviewers:
       - miladm
       - bdhirsh
       - voznesenskym
+      - SherlockNoMad
 
   per_author:
     symbolic-shapes:

From b4be7ff541ab0bbe497d537909e4c2ec7e4df184 Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Tue, 8 Nov 2022 10:06:03 -0500
Subject: [PATCH 0704/1922] extract out the clean workspace test to its own
 file (#88682)

Summary:
This test relies on what the root workspace is before any other code
is run. However, some of the test cases change it. If the order the
tests are run is randomized, then the test can fail if run after one
of them.

Having it on its own ensures that it always sees a pristine state.

Test Plan:
Verified locally and confirmed in internal and external CI.

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88682
Approved by: https://github.com/r-barnes, https://github.com/malfet
---
 caffe2/python/clean_workspace_test.py | 15 +++++++++++++++
 caffe2/python/workspace_test.py       |  6 ------
 2 files changed, 15 insertions(+), 6 deletions(-)
 create mode 100644 caffe2/python/clean_workspace_test.py

diff --git a/caffe2/python/clean_workspace_test.py b/caffe2/python/clean_workspace_test.py
new file mode 100644
index 0000000000000..c8285f4a1c5bd
--- /dev/null
+++ b/caffe2/python/clean_workspace_test.py
@@ -0,0 +1,15 @@
+import unittest
+
+from caffe2.python import workspace
+
+
+# This test is extracted out from workspace_test.py because it relies on the pristine
+# state of the initial workspace. When tests are run in different orders, this test may
+# become flaky because of global state modifications impacting what the root folder is
+# after a reset.
+class TestWorkspace(unittest.TestCase):
+    def testRootFolder(self):
+        self.assertEqual(workspace.ResetWorkspace(), True)
+        self.assertEqual(workspace.RootFolder(), ".")
+        self.assertEqual(workspace.ResetWorkspace("/tmp/caffe-workspace-test"), True)
+        self.assertEqual(workspace.RootFolder(), "/tmp/caffe-workspace-test")
diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py
index 2e2d284f92e43..b434b5e748cc1 100644
--- a/caffe2/python/workspace_test.py
+++ b/caffe2/python/workspace_test.py
@@ -24,12 +24,6 @@ def setUp(self):
         )
         workspace.ResetWorkspace()
 
-    def testRootFolder(self):
-        self.assertEqual(workspace.ResetWorkspace(), True)
-        self.assertEqual(workspace.RootFolder(), ".")
-        self.assertEqual(workspace.ResetWorkspace("/tmp/caffe-workspace-test"), True)
-        self.assertEqual(workspace.RootFolder(), "/tmp/caffe-workspace-test")
-
     def testWorkspaceHasBlobWithNonexistingName(self):
         self.assertEqual(workspace.HasBlob("non-existing"), False)
 

From 411e44d89d886d324b98997c473ba82aa674c032 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 9 Nov 2022 13:05:32 +0000
Subject: [PATCH 0705/1922] [inductor] Handle nested tuple/list output in
 fallback kernel (#88495)

Summary: Currently fallback kernel in inductor assumes its output is
either a tensor or a tuple/list of tensors. This PR makes it handle more
generic output data structure.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88495
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor_opinfo.py |  2 -
 torch/_inductor/ir.py                      | 77 +++++++++++-----------
 torch/_inductor/lowering.py                | 17 ++---
 3 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 03b5138f1dc2d..5cee29920b777 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -205,7 +205,6 @@ def process(device_type):
     "linalg.eigh": {f32, f64},
     "linalg.eigvals": {f32, f64},
     "linalg.eigvalsh": {f32, f64},
-    "linalg.ldl_factor": {f32, f64},
     "linalg.lstsq": {f32, f64},
     "linalg.lstsq.grad_oriented": {f32, f64},
     "linalg.matrix_rank": {f32, f64},
@@ -312,7 +311,6 @@ def process(device_type):
     "linalg.eigh": {f32, f64},
     "linalg.eigvals": {f32, f64},
     "linalg.eigvalsh": {f32, f64},
-    "linalg.ldl_factor": {f32, f64},
     "linalg.lstsq": {f32, f64},
     "linalg.lstsq.grad_oriented": {f32, f64},
     "linalg.matrix_rank": {f32, f64},
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 8fd458cb72eb4..3fc61b29fa062 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2917,45 +2917,42 @@ def create(cls, kernel, *args, **kwargs):
                 unflatten_args,
             ) = cls.process_kernel(kernel, *args, **kwargs)
 
-        if isinstance(example_output, (list, tuple)):
-            packed = FallbackKernel(
-                MultiOutputLayout(tensor_args[0].get_device()),
-                kernel,
-                tensor_args,
-                non_tensor_args,
-                unflatten_args,
-            )
-            return [
-                (
-                    MultiOutput(
-                        FixedLayout(
-                            example_output[i].device,
-                            example_output[i].dtype,
-                            [sympy.Integer(s) for s in example_output[i].size()],
-                            [sympy.Integer(s) for s in example_output[i].stride()],
-                        ),
-                        packed,
-                        i,
-                    )
-                    if example_output[i] is not None
-                    else None
+        assert tensor_args or isinstance(
+            example_output, torch.Tensor
+        ), "Not sure where to find device info"
+        packed = FallbackKernel(
+            MultiOutputLayout(
+                tensor_args[0].get_device() if tensor_args else example_output.device
+            ),
+            kernel,
+            tensor_args,
+            non_tensor_args,
+            unflatten_args,
+            kwargs,
+        )
+
+        def generate_output(output, index=""):
+            if isinstance(output, (list, tuple)):
+                return type(output)(
+                    generate_output(output[i], f"{index}[{i}]")
+                    for i in range(len(output))
                 )
-                for i in range(len(example_output))
-            ]
-        else:
-            return FallbackKernel(
-                FixedLayout(
-                    example_output.device,
-                    example_output.dtype,
-                    [sympy.Integer(s) for s in example_output.size()],
-                    [sympy.Integer(s) for s in example_output.stride()],
-                ),
-                kernel,
-                tensor_args,
-                non_tensor_args,
-                unflatten_args,
-                kwargs,
-            )
+            elif isinstance(output, torch.Tensor):
+                return MultiOutput(
+                    FixedLayout(
+                        output.device,
+                        output.dtype,
+                        [sympy.Integer(s) for s in output.size()],
+                        [sympy.Integer(s) for s in output.stride()],
+                    ),
+                    packed,
+                    index,
+                )
+            else:
+                assert output is None, "FallbackKernel output type is not supported"
+                return None
+
+        return generate_output(example_output)
 
     def apply_constraint(self):
         return super().apply_constraint()
@@ -2969,11 +2966,11 @@ class MultiOutputLayout(IRNode):
 class MultiOutput(ExternKernel):
     def codegen(self, wrapper):
         wrapper.writeline(
-            f"{self.get_name()} = {self.inputs[0].get_name()}[{self.index}]"
+            f"{self.get_name()} = {self.inputs[0].get_name()}{self.index}"
         )
         self.codegen_size_asserts(wrapper)
 
-    def __init__(self, layout, input, index):
+    def __init__(self, layout, input, index: str):
         super().__init__(None, layout, [input], ())
         self.name = V.graph.register_buffer(self)
         self.index = index
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 5ec83dcf507bd..694d2939285de 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -9,6 +9,7 @@
 
 import torch
 import torch.fx
+import torch.utils._pytree as pytree
 from torch._prims_common import (
     elementwise_dtypes,
     ELEMENTWISE_TYPE_PROMOTION_KIND,
@@ -965,11 +966,9 @@ def fallback_handler(kernel):
     fallbacks.add(kernel)
 
     def handler(*args, **kwargs):
-        result = ir.FallbackKernel.create(kernel, *args, **kwargs)
-        if isinstance(result, (list, tuple)):
-            return list(map(TensorBox.create, result))
-        else:
-            return TensorBox.create(result)
+        return pytree.tree_map(
+            TensorBox.create, ir.FallbackKernel.create(kernel, *args, **kwargs)
+        )
 
     return handler
 
@@ -993,12 +992,10 @@ def native_dropout(x, p, train):
         config.fallback_random
     ), "this should be handled in decomps unless config.fallback_random"
     if train:
-        return list(
-            map(
-                TensorBox.create,
-                ir.FallbackKernel.create(aten.native_dropout, x, p, train),
-            )
+        return pytree.tree_map(
+            TensorBox.create, ir.FallbackKernel.create(aten.native_dropout, x, p, train)
         )
+
     return x, ones_like(x, dtype=torch.bool)
 
 
From df28ee2959eb7201192237df296cf4933865599e Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 9 Nov 2022 16:13:56 +0000
Subject: [PATCH 0706/1922] [BE] Delete push_nightly_docker_ghcr (#88748)

As it seems to be duplicating the functionality of `docker-release.yml` and have not produced a valid build in last 16 days, according to https://github.com/pytorch/pytorch/actions/workflows/push_nightly_docker_ghcr.yml

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88748
Approved by: https://github.com/seemethere
---
 .../scripts/build_publish_nightly_docker.sh   | 44 -------------------
 .../workflows/push_nightly_docker_ghcr.yml    | 39 ----------------
 2 files changed, 83 deletions(-)
 delete mode 100644 .github/scripts/build_publish_nightly_docker.sh
 delete mode 100644 .github/workflows/push_nightly_docker_ghcr.yml

diff --git a/.github/scripts/build_publish_nightly_docker.sh b/.github/scripts/build_publish_nightly_docker.sh
deleted file mode 100644
index c60e31eec500e..0000000000000
--- a/.github/scripts/build_publish_nightly_docker.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env bash
-
-set -xeuo pipefail
-
-PYTORCH_DOCKER_TAG=$(git describe --tags --always)-devel
-CUDA_VERSION=11.6.2
-
-# Build PyTorch nightly docker
-make -f docker.Makefile \
-     DOCKER_REGISTRY=ghcr.io \
-     DOCKER_ORG=pytorch \
-     CUDA_VERSION=${CUDA_VERSION} \
-     DOCKER_IMAGE=pytorch-nightly \
-     DOCKER_TAG=${PYTORCH_DOCKER_TAG} \
-     INSTALL_CHANNEL=pytorch-nightly BUILD_TYPE=official devel-image
-
-# Get the PYTORCH_NIGHTLY_COMMIT from the docker image
-PYTORCH_NIGHTLY_COMMIT=$(docker run \
-       ghcr.io/pytorch/pytorch-nightly:${PYTORCH_DOCKER_TAG} \
-       python -c 'import torch; print(torch.version.git_version)' | head -c 7)
-
-docker tag ghcr.io/pytorch/pytorch-nightly:${PYTORCH_DOCKER_TAG} \
-       ghcr.io/pytorch/pytorch-nightly:${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}
-
-docker tag ghcr.io/pytorch/pytorch-nightly:${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION} \
-       ghcr.io/pytorch/pytorch-nightly:latest
-
-if [[ ${WITH_PUSH:-} == "true" ]]; then
-    # Push the nightly docker to GitHub Container Registry
-    echo $GHCR_PAT | docker login ghcr.io -u pytorch --password-stdin
-    make -f docker.Makefile \
-         DOCKER_REGISTRY=ghcr.io \
-         DOCKER_ORG=pytorch \
-         DOCKER_IMAGE=pytorch-nightly \
-         DOCKER_TAG=${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION} \
-         devel-push
-
-    make -f docker.Makefile \
-         DOCKER_REGISTRY=ghcr.io \
-         DOCKER_ORG=pytorch \
-         DOCKER_IMAGE=pytorch-nightly \
-         DOCKER_TAG=latest \
-         devel-push
-fi
diff --git a/.github/workflows/push_nightly_docker_ghcr.yml b/.github/workflows/push_nightly_docker_ghcr.yml
deleted file mode 100644
index ac443a4d558c1..0000000000000
--- a/.github/workflows/push_nightly_docker_ghcr.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-name: docker-release-builds
-on:
-  schedule:
-    # Push the nightly docker daily at 1 PM UTC
-    - cron: '0 13 * * *'
-  # Trigger when we modify something related to these images
-  pull_request:
-    paths:
-      - .github/scripts/build_publish_nightly_docker.sh
-      - .github/workflows/push_nightly_docker_ghcr.yml
-      - Dockerfile
-      - docker.Makefile
-  # Have the ability to trigger this job manually using the API as well
-  workflow_dispatch:
-
-jobs:
-  docker-release-build:
-    if: ${{ github.repository == 'pytorch/pytorch' }}
-    runs-on: linux.2xlarge
-    env:
-      GHCR_PAT: ${{ secrets.GHCR_PAT }}
-      WITH_PUSH: ${{ github.event_name == 'schedule' }}
-    steps:
-      - name: Checkout PyTorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      - uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
-        name: Build and upload nightly docker
-        with:
-          timeout_minutes: 30
-          max_attempts: 3
-          command: |
-            set -ex
-            bash .github/scripts/build_publish_nightly_docker.sh
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true

From 625aca9027e0c775ffa989ea5fda48a040965ad1 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Wed, 9 Nov 2022 17:15:12 +0000
Subject: [PATCH 0707/1922] [fix] MathBits: serialization (#88182)

Fixes #81690

TODO:

* [x] C++ Unpickler Fix (locally tested pickled in Python and unpickled in C++)
* [x] C++ Pickler Fix (locally tested pickled in C++ and unpickled in Python)
* [x] Do quant_tensor, sparse_tensor, etc require similar changes? (Sparse and Quant don't need this)
* [x] Add Comments
* [x] How to make sure C++ and Python are in sync? (Functions in `pickler.h` help in getting and setting Tensor Metadata (math-bits for now) on a tensor. They are the only place which should handle this.)

Notes:
Quant Tensor don't support complex dtypes and for float they segfault with `_neg_view` : https://github.com/pytorch/pytorch/issues/88484

Sparse Tensor:
```python
>>> a = torch.tensor([[0, 2.], [3j, 0]]).to_sparse()
>>> a.conj().is_conj()
False
>>> a._neg_view()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NotImplementedError: Cannot access storage of SparseTensorImpl
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88182
Approved by: https://github.com/ezyang, https://github.com/anjali411
---
 test/cpp/api/serialize.cpp                 | 33 +++++++++++++++
 test/test_serialization.py                 | 20 +++++++++
 torch/_tensor.py                           |  4 ++
 torch/_utils.py                            | 20 ++++++++-
 torch/csrc/Module.cpp                      |  7 ++++
 torch/csrc/jit/serialization/pickler.cpp   | 14 +++++++
 torch/csrc/jit/serialization/pickler.h     | 49 ++++++++++++++++++++++
 torch/csrc/jit/serialization/unpickler.cpp | 19 ++++++++-
 torch/utils/model_dump/__init__.py         |  5 ++-
 9 files changed, 167 insertions(+), 4 deletions(-)

diff --git a/test/cpp/api/serialize.cpp b/test/cpp/api/serialize.cpp
index 0cf8ed88c4188..05bb0f941d402 100644
--- a/test/cpp/api/serialize.cpp
+++ b/test/cpp/api/serialize.cpp
@@ -257,6 +257,39 @@ TEST(SerializeTest, Basic) {
   ASSERT_TRUE(x.allclose(y));
 }
 
+TEST(SerializeTest, MathBits) {
+  torch::manual_seed(0);
+
+  auto options = torch::TensorOptions{}.dtype(torch::kComplexFloat);
+  auto x = torch::randn({5, 5}, options);
+  {
+    auto expected = torch::conj(x);
+    auto actual = save_and_load(expected);
+
+    ASSERT_TRUE(actual.defined());
+    ASSERT_EQ(actual.sizes().vec(), expected.sizes().vec());
+    ASSERT_TRUE(actual.allclose(expected));
+  }
+
+  {
+    auto expected = torch::_neg_view(x);
+    auto actual = save_and_load(expected);
+
+    ASSERT_TRUE(actual.defined());
+    ASSERT_EQ(actual.sizes().vec(), expected.sizes().vec());
+    ASSERT_TRUE(actual.allclose(expected));
+  }
+
+  {
+    auto expected = torch::conj(torch::_neg_view(x));
+    auto actual = save_and_load(expected);
+
+    ASSERT_TRUE(actual.defined());
+    ASSERT_EQ(actual.sizes().vec(), expected.sizes().vec());
+    ASSERT_TRUE(actual.allclose(expected));
+  }
+}
+
 TEST(SerializeTest, BasicToFile) {
   torch::manual_seed(0);
 
diff --git a/test/test_serialization.py b/test/test_serialization.py
index 3a18f8a45ad04..af0317e87a145 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -911,6 +911,26 @@ def __reduce__(self):
             with self.assertRaisesRegex(pickle.UnpicklingError, "Unsupported class"):
                 torch.load(f, weights_only=True)
 
+    @parametrize('weights_only', (False, True))
+    def test_serialization_math_bits(self, weights_only):
+        t = torch.randn(1, dtype=torch.cfloat)
+
+        def _save_load_check(t):
+            with BytesIOContext() as f:
+                torch.save(t, f)
+                f.seek(0)
+                # Unsafe load should work
+                self.assertEqual(torch.load(f, weights_only=weights_only), t)
+
+        t_conj = torch.conj(t)
+        _save_load_check(t_conj)
+
+        t_neg = torch._neg_view(t)
+        _save_load_check(t_neg)
+
+        t_n_c = torch._neg_view(torch.conj(t))
+        _save_load_check(t_n_c)
+
     def run(self, *args, **kwargs):
         with serialization_method(use_zip=True):
             return super(TestSerialization, self).run(*args, **kwargs)
diff --git a/torch/_tensor.py b/torch/_tensor.py
index 8ac1ac1eb7361..793034bb64ede 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -413,6 +413,10 @@ def _reduce_ex_internal(self, proto):
                 self.requires_grad,
                 backward_hooks,
             )  # previously was self._backward_hooks
+
+            metadata = torch._utils.get_tensor_metadata(self)
+            if metadata:
+                args = args + (metadata,)  # type: ignore[assignment]
             return (torch._utils._rebuild_tensor_v2, args)
 
     def __setstate__(self, state):
diff --git a/torch/_utils.py b/torch/_utils.py
index f178cfbaea4ae..3bc8a749b3e66 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -147,11 +147,29 @@ def _rebuild_tensor(storage, storage_offset, size, stride):
     return t.set_(storage._untyped_storage, storage_offset, size, stride)
 
 
+def get_tensor_metadata(tensor):
+    # Tensor's Metadata for serializing.
+    # Currently, this only returns a dict[string, bool] specifing whether
+    # `conj` or `neg` bit is set.
+    assert isinstance(tensor, torch.Tensor)
+    return torch._C._get_tensor_metadata(tensor)  # type: ignore[attr-defined]
+
+
+def set_tensor_metadata(tensor, metadata):
+    # See `get_tensor_metadata` above
+    assert isinstance(metadata, dict)
+    assert isinstance(tensor, torch.Tensor)
+    torch._C._set_tensor_metadata(tensor, metadata)  # type: ignore[attr-defined]
+
+
 def _rebuild_tensor_v2(
-    storage, storage_offset, size, stride, requires_grad, backward_hooks
+    storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None
 ):
     tensor = _rebuild_tensor(storage, storage_offset, size, stride)
     tensor.requires_grad = requires_grad
+    if metadata:
+        set_tensor_metadata(tensor, metadata)
+
     # NB: This line exists only for backwards compatibility; the
     # general expectation is that backward_hooks is an empty
     # OrderedDict.  See Note [Don't serialize hooks]
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index b17d8ae07de70..b8693a484ed9d 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -56,6 +56,7 @@
 #include <torch/csrc/jit/python/init.h>
 #include <torch/csrc/jit/python/python_ir.h>
 #include <torch/csrc/jit/python/python_tracer.h>
+#include <torch/csrc/jit/serialization/pickler.h>
 #include <torch/csrc/lazy/python/init.h>
 #include <torch/csrc/monitor/python_init.h>
 #include <torch/csrc/multiprocessing/init.h>
@@ -1544,6 +1545,12 @@ Call this whenever a new thread is created in order to propagate values from
       "_set_conj", [](const at::Tensor& x, bool conj) { x._set_conj(conj); });
   py_module.def(
       "_set_neg", [](const at::Tensor& x, bool neg) { x._set_neg(neg); });
+  py_module.def("_get_tensor_metadata", &torch::jit::getTensorMetadata);
+  py_module.def(
+      "_set_tensor_metadata",
+      static_cast<void (*)(
+          const at::Tensor&, std::unordered_map<std::string, bool>)>(
+          torch::jit::setTensorMetadata));
   py_module.def("_dispatch_key_set", [](const at::Tensor& x) {
     return toString(x.key_set());
   });
diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp
index 22efbf1b47607..364d603b4c43c 100644
--- a/torch/csrc/jit/serialization/pickler.cpp
+++ b/torch/csrc/jit/serialization/pickler.cpp
@@ -480,6 +480,20 @@ void Pickler::pushLiteralTensor(const IValue& ivalue) {
   // Construct the collections.OrderedDict for the backward_hooks
   push<PickleOpCode>(PickleOpCode::REDUCE);
 
+  if (!quantized) {
+    // Only push it for regular tensor if the dictionary is not empty.
+    auto metadata = torch::jit::getTensorMetadata(tensor);
+    if (!metadata.empty()) {
+      // IValues based on std::unordered_map<K, V> are slow and deprecated.
+      // Thus, pass a c10::Dict to pushDict.
+      c10::Dict<std::string, bool> math_bits_;
+      for (const auto& pair : metadata) {
+        math_bits_.insert(pair.first, pair.second);
+      }
+      pushDict(math_bits_);
+    }
+  }
+
   push<PickleOpCode>(PickleOpCode::TUPLE);
 
   // Call torch._utils._rebuild_tensor_v2
diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h
index e6ba2d281ada0..c289cae12b649 100644
--- a/torch/csrc/jit/serialization/pickler.h
+++ b/torch/csrc/jit/serialization/pickler.h
@@ -296,5 +296,54 @@ uint64_t getStorageKey(const at::Tensor& tensor);
 // otherwise return false
 bool checkHasValidSetGetState(const std::shared_ptr<c10::ClassType>& cls);
 
+// Return a map of Tensor Metadata for serialization.
+// For now, it only takes care of `conj` and `neg` bit.
+inline std::unordered_map<std::string, bool> getTensorMetadata(
+    const at::Tensor& t) {
+  std::unordered_map<std::string, bool> metadata{};
+
+  // Only add meta-data if the value is not default.
+  if (t.is_conj()) {
+    metadata["conj"] = true;
+  }
+  if (t.is_neg()) {
+    metadata["neg"] = true;
+  }
+  return metadata;
+}
+
+// set Tensor Metadata based on the map.
+// Refer: getTensorMathdata
+inline void setTensorMetadata(
+    const at::Tensor& t,
+    std::unordered_map<std::string, bool> metadata) {
+  for (auto& key_value_pair : metadata) {
+    if (key_value_pair.first == "conj") {
+      t._set_conj(true);
+    } else if (key_value_pair.first == "neg") {
+      t._set_neg(true);
+    } else {
+      TORCH_CHECK(
+          false,
+          "Unexpected key `",
+          key_value_pair.first,
+          "` passed to setTensorMetadata.");
+    }
+  }
+}
+
+// set Tensor metadata based on the map.
+// NOTE: This overload is required by unpickler.cpp
+inline void setTensorMetadata(
+    const at::Tensor& t,
+    c10::Dict<c10::IValue, c10::IValue> metadata_idict) {
+  std::unordered_map<std::string, bool> metadata;
+  for (auto& pair : metadata_idict) {
+    auto key = *pair.key().toString();
+    metadata[key] = pair.value().toBool();
+  }
+  setTensorMetadata(t, metadata);
+}
+
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp
index 7b40f138c600f..f7e974919f03d 100644
--- a/torch/csrc/jit/serialization/unpickler.cpp
+++ b/torch/csrc/jit/serialization/unpickler.cpp
@@ -823,13 +823,28 @@ void Unpickler::rebuildTensor(bool quantized) {
     } else {
       result = at::empty({0}, storage_tensor.options());
     }
-    bool requires_grad = elements.at(idx).toBool();
-    // elements[idx++] is empty backwards hooks
+    bool requires_grad = elements.at(idx++).toBool();
+    idx++; // backwards hooks is empty
     at::TensorImpl* impl = result.unsafeGetTensorImpl();
     impl->set_storage_keep_dtype(storage_tensor.storage());
     impl->set_storage_offset(storage_offset);
     impl->set_sizes_and_strides(size, stride);
     result = autograd::make_variable(result, requires_grad);
+
+    // Handle if math_bits were pickled.
+    // See `args` of _reduce_ex_internal
+    // for a regular tensor (final else case).
+    // Tensors pickled before this patch didn't
+    // have this argument for storing MathBits,
+    // in that case, we do nothing.
+    // NOTE: `math_bits` is the 7th arg.
+    // NOTE: This is only meant for regular tensor and not quantized
+    //       which also has 7 args serialized.
+    if (!quantized && elements.size() == 7) {
+      auto math_bits = elements.at(idx++).toGenericDict();
+      torch::jit::setTensorMetadata(result, math_bits);
+    }
+
     stack_.emplace_back(std::move(result));
   });
 }
diff --git a/torch/utils/model_dump/__init__.py b/torch/utils/model_dump/__init__.py
index 8a230800099bc..bbb456a6f14b9 100644
--- a/torch/utils/model_dump/__init__.py
+++ b/torch/utils/model_dump/__init__.py
@@ -134,7 +134,10 @@ def hierarchical_pickle(data):
             }
         if typename == "torch._utils._rebuild_tensor_v2":
             assert data.state is None
-            storage, offset, size, stride, requires_grad, hooks = data.args
+            if len(data.args) == 6:
+                storage, offset, size, stride, requires_grad, hooks = data.args
+            else:
+                storage, offset, size, stride, requires_grad, hooks, metadata = data.args
             storage_info = get_storage_info(storage)
             return {"__tensor_v2__": [storage_info, offset, size, stride, requires_grad]}
         if typename == "torch._utils._rebuild_qtensor":

From 5f971e9061321a64563d84addd50168c8958ba0d Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Wed, 9 Nov 2022 16:41:04 +0000
Subject: [PATCH 0708/1922] Mark dynamo torchbench dlrm as unsupported (#88712)

- DLRM requires special configuration of embedding layers which are sparse
  and not compatible with DDP.
- I could mark the embedding params as ignored in DDP
  to make the benchmark pass, but this isn't a representative benchmark.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88712
Approved by: https://github.com/ezyang
---
 benchmarks/dynamo/common.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index c332f47562982..3cd17c446c225 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1617,7 +1617,11 @@ def run(runner, args, original_dir=None):
         else:
             # TODO(whc) after enabling DDPOptimizer by default this could be removed or assert
             torch._dynamo.config.optimize_ddp = True
-
+        if args.only == "dlrm":
+            log.error(
+                "DLRM+DDP is unsupported as it requires sharding the embedding layer separately from DDP"
+            )
+            return sys.exit(-1)
     if args.accuracy:
         # Use small batch size. We use >1 batch size to ensure we test
         # batch_norm type of operators that work on batch dims.

From 14ae2b0b5d4f8d6afdad2296f59b083bf9b00337 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 9 Nov 2022 08:24:44 -0500
Subject: [PATCH 0709/1922] Meta function for aten.sort and aten.scatter*
 (#88705)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88705
Approved by: https://github.com/ezyang
---
 test/functorch/test_aotdispatch.py |  9 -----
 test/test_proxy_tensor.py          | 10 -----
 torch/_meta_registrations.py       | 65 ++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index ebd4a25602839..09b65a32bfee9 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1109,18 +1109,15 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('masked.var', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=to...
     xfail('matmul', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decompo...
-    xfail('max', 'reduction_with_dim'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('median', ''),  # could not find kernel
     xfail('meshgrid', 'list_of_tensors'),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('meshgrid', 'variadic_tensors'),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('min', 'reduction_with_dim'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('mode', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('msort', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('mv', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('mvlgamma', 'mvlgamma_p_1'),  # aten.digamma_.default - couldn't find symbolic meta function/decom...
     xfail('mvlgamma', 'mvlgamma_p_3'),  # aten.digamma_.default - couldn't find symbolic meta function/decom...
     xfail('mvlgamma', 'mvlgamma_p_5'),  # aten.digamma_.default - couldn't find symbolic meta function/decom...
-    xfail('nanmedian', ''),  # aten.logical_or_.default - couldn't find symbolic meta function/decomposition
 
     # Deleting this in a followup
     xfail('nn.functional.poisson_nll_loss', ''),
@@ -1201,15 +1198,9 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('round', 'decimals_0'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
     xfail('round', 'decimals_3'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
     xfail('round', 'decimals_neg_3'),  # aten.round.decimals - couldn't find symbolic meta function/decompos...
-    xfail('scatter', ''),  # aten.scatter.src - couldn't find symbolic meta function/decomposition
-    xfail('scatter_reduce', 'amax'),  # aten.scatter_reduce.two - couldn't find symbolic meta function/decom...
-    xfail('scatter_reduce', 'amin'),  # aten.scatter_reduce.two - couldn't find symbolic meta function/decom...
-    xfail('scatter_reduce', 'mean'),  # aten.scatter_reduce.two - couldn't find symbolic meta function/decom...
-    xfail('scatter_reduce', 'sum'),  # aten.scatter_reduce.two - couldn't find symbolic meta function/decomp...
     xfail('segment_reduce', 'lengths'),  # aten.segment_reduce.default - couldn't find symbolic meta functio...
     xfail('segment_reduce', 'offsets'),  # aten.segment_reduce.default - couldn't find symbolic meta functio...
     xfail('sgn', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('sort', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('special.i1', ''),  # aten.i0.default - couldn't find symbolic meta function/decomposition
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic ...
     xfail('split', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index acb0856f860b5..242be9c78939e 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1136,7 +1136,6 @@ def f(a, b, c, d, e):
     xfail('aminmax', ''),  # aten.aminmax.default - couldn't find symbolic meta function/decomposition
     xfail('argmax', ''),  # aten.argmax.default - couldn't find symbolic meta function/decomposition
     xfail('argmin', ''),  # aten.argmin.default - couldn't find symbolic meta function/decomposition
-    xfail('argsort', ''),  # aten.sort.default - couldn't find symbolic meta function/decomposition
     xfail('argwhere', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
     xfail('baddbmm', ''),  # aten.baddbmm.default - couldn't find symbolic meta function/decomposition
     xfail('bucketize', ''),  # aten.bucketize.Tensor - couldn't find symbolic meta function/decomposition
@@ -1245,7 +1244,6 @@ def f(a, b, c, d, e):
     xfail('meshgrid', 'variadic_tensors'),  # Tensors of type TensorImpl do not have numel
     xfail('min', 'reduction_with_dim'),  # aten.min.dim - couldn't find symbolic meta function/decomposition
     xfail('mode', ''),  # aten.mode.default - couldn't find symbolic meta function/decomposition
-    xfail('msort', ''),  # aten.sort.default - couldn't find symbolic meta function/decomposition
     xfail('nanquantile', ''),  # Could not run 'aten::equal' with arguments from the 'Meta' backend.
     xfail('narrow', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.adaptive_avg_pool3d', ''),  # aten._adaptive_avg_pool3d.default - couldn't find symbolic meta func...
@@ -1316,15 +1314,8 @@ def f(a, b, c, d, e):
     xfail('round', 'decimals_0'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
     xfail('round', 'decimals_3'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
     xfail('round', 'decimals_neg_3'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
-    xfail('scatter', ''),  # aten.scatter.src - couldn't find symbolic meta function/decomposition
-    xfail('scatter_reduce', 'amax'),  # aten.scatter_reduce.two - couldn't find symbolic meta function/decomposition
-    xfail('scatter_reduce', 'amin'),  # aten.scatter_reduce.two - couldn't find symbolic meta function/decomposition
-    xfail('scatter_reduce', 'mean'),  # aten.scatter_reduce.two - couldn't find symbolic meta function/decomposition
-    xfail('scatter_reduce', 'prod'),  # aten.scatter_reduce.two - couldn't find symbolic meta function/decomposition
-    xfail('scatter_reduce', 'sum'),  # aten.scatter_reduce.two - couldn't find symbolic meta function/decomposition
     xfail('searchsorted', ''),  # Could not run 'aten::searchsorted.Tensor' with arguments from the 'Meta' backend. ...
     xfail('segment_reduce', 'offsets'),  # aten.segment_reduce.default - couldn't find symbolic meta function/decomposition
-    xfail('sort', ''),  # aten.sort.default - couldn't find symbolic meta function/decomposition
     xfail('special.airy_ai', ''),  # aten.special_airy_ai.default - couldn't find symbolic meta function/decomposition
     xfail('special.bessel_y0', ''),  # aten.special_bessel_y0.default - couldn't find symbolic meta function/decomposition
     xfail('special.bessel_y1', ''),  # aten.special_bessel_y1.default - couldn't find symbolic meta function/decomposition
@@ -1436,7 +1427,6 @@ def f(a, b, c, d, e):
     xfail('reciprocal', ''),  # aten.reciprocal_.default - couldn't find symbolic meta function/decomposition
     xfail('remainder', ''),  # aten.remainder_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('rsqrt', ''),  # aten.rsqrt_.default - couldn't find symbolic meta function/decomposition
-    xfail('scatter_add', ''),  # aten.scatter_add_.default - couldn't find symbolic meta function/decomposition
     xfail('sgn', ''),  # aten.sgn_.default - couldn't find symbolic meta function/decomposition
     xfail('sigmoid', ''),  # aten.sigmoid_.default - couldn't find symbolic meta function/decomposition
     xfail('sign', ''),  # aten.sign_.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index f4fd6e77db9c0..bb53cd268bda2 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1670,6 +1670,71 @@ def meta_scatter_add(self, dim, index, src):
     return self.new_empty(self.shape)
 
 
+@register_meta(aten.scatter_add_)
+def meta_scatter_add_(self, dim, index, src):
+    scatter_meta_impl(self, dim, index, src, "add")
+    return self
+
+
+@register_meta(aten.scatter)
+@out_wrapper()
+def meta_scatter(self, dim, index, src_or_value, reduce=None):
+    src = src_or_value if isinstance(src_or_value, torch.Tensor) else None
+    scatter_meta_impl(self, dim, index, src, reduce)
+    return self.new_empty(self.shape)
+
+
+@register_meta(aten.scatter_)
+def meta_scatter_(self, dim, index, src_or_value, reduce=None):
+    src = src_or_value if isinstance(src_or_value, torch.Tensor) else None
+    scatter_meta_impl(self, dim, index, src, reduce)
+    return self
+
+
+@register_meta([aten.scatter_reduce.two, aten.scatter_reduce.two_out])
+@out_wrapper()
+def meta_scatter_reduce_two(self, dim, index, src, reduce, include_self=True):
+    scatter_meta_impl(self, dim, index, src, reduce, use_new_options=True)
+    return self.new_empty(self.shape)
+
+
+@register_meta(aten.scatter_reduce_.two)
+def meta_scatter_reduce__two(self, dim, index, src, reduce, include_self=True):
+    scatter_meta_impl(self, dim, index, src, reduce, use_new_options=True)
+    return self
+
+
+@register_meta(aten.upsample_nearest2d.vec)
+def upsample_nearest2d_vec(input, output_size, scale_factors):
+    mem_format = utils.suggest_memory_format(input)
+    spatial_dimensions = input.dim() - 2
+
+    input_shape = input.shape
+    if output_size is not None:
+        assert scale_factors is None
+        out_size = output_size
+    elif scale_factors is not None:
+        assert output_size is None
+        out_size = []
+        for i in range(spatial_dimensions):
+            sym_float = (input_shape[i + 2] / 1) * scale_factors[i]
+            assert sym_float >= 0
+            out_size.append(math.floor(sym_float))
+
+    output_height = out_size[0]
+    output_width = out_size[1]
+    nbatch = input_shape[0]
+    channels = input_shape[1]
+    return input.new_empty((nbatch, channels, output_height, output_width)).to(
+        memory_format=mem_format
+    )
+
+
+@register_meta([aten.sort.default, aten.sort.stable])
+def meta_sort(self, stable=None, dim=-1, descending=False):
+    return torch.empty_like(self), torch.empty_like(self, dtype=torch.int64)
+
+
 # We must also trigger meta registrations from PrimTorch ref
 # decompositions
 import torch._refs

From 86a132b1e811bb4cc839d33b25326df85d631d60 Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@fb.com>
Date: Tue, 8 Nov 2022 13:44:43 -0800
Subject: [PATCH 0710/1922] [vulkan] Add option for buffer representations in
 vTensor (#87622)

This diff adds the option to use a Buffer to store data for a `vTensor` by passing `StorageType::BUFFER` to the constructor of `vTensor`. To enable this change, the construction of `vTensor` and `vTensorStorage` had to be slightly refactored to properly support strides. To summarize the changes:

* `vTensorStorage` now contains no Tensor metadata (such as tensor sizes, strides, and `TensorOptions`) - it now only contains the image extents (if texture storage is used) and the buffer length. Tensor metadata is now managed by `vTensor`. The reason for this is to allow multiple `vTensor` objects to point to the same `vTensorStorage` but with different metadata which may be a useful feature now that Buffer storage is enabled.
* `vTensor` will now compute the strides upon construction based on the requested sizes and memory layout if Buffer storage is requested. Previously, strides were faked by setting them all to 0 as strides do not apply to image textures (this behavior is preserved for texture storage).

Differential Revision: [D40604163](https://our.internmc.facebook.com/intern/diff/D40604163/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87622
Approved by: https://github.com/digantdesai
---
 .lintrunner.toml                              |   1 +
 aten/src/ATen/native/vulkan/api/Context.cpp   |  57 +++
 aten/src/ATen/native/vulkan/api/Context.h     |   6 +-
 aten/src/ATen/native/vulkan/api/Resource.cpp  |  19 +-
 aten/src/ATen/native/vulkan/api/Resource.h    |  21 +-
 .../native/vulkan/glsl/buffer_to_buffer.glsl  |  78 +++
 aten/src/ATen/native/vulkan/glsl/indexing.h   |  13 +
 aten/src/ATen/native/vulkan/ops/Common.cpp    |   9 +
 aten/src/ATen/native/vulkan/ops/Common.h      |   6 +
 .../ATen/native/vulkan/ops/Convolution.cpp    |  10 +-
 aten/src/ATen/native/vulkan/ops/Copy.cpp      |  38 +-
 aten/src/ATen/native/vulkan/ops/Copy.h        |   8 +-
 aten/src/ATen/native/vulkan/ops/Mm.cpp        |   6 +-
 aten/src/ATen/native/vulkan/ops/Shape.cpp     |   2 +-
 aten/src/ATen/native/vulkan/ops/Tensor.cpp    | 446 +++++++++++++-----
 aten/src/ATen/native/vulkan/ops/Tensor.h      | 171 ++++---
 aten/src/ATen/native/vulkan/ops/Utils.cpp     | 135 +++++-
 aten/src/ATen/test/vulkan_api_test.cpp        |   3 +
 .../ATen/test/vulkan_quantized_api_test.cpp   |  86 ++++
 tools/gen_vulkan_spv.py                       |   3 +-
 20 files changed, 881 insertions(+), 237 deletions(-)
 create mode 100644 aten/src/ATen/native/vulkan/glsl/buffer_to_buffer.glsl
 create mode 100644 aten/src/ATen/native/vulkan/glsl/indexing.h

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 62b13822e4ad6..34b673c7e09ac 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -372,6 +372,7 @@ include_patterns = [
 exclude_patterns = [
     'aten/src/ATen/native/quantized/cpu/qnnpack/**',
     'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h',
+    'aten/src/ATen/native/vulkan/glsl/**',
     'torch/csrc/jit/serialization/mobile_bytecode_generated.h',
 ]
 command = [
diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp
index 73bbb4b21c4ad..06038b9e4ecfa 100644
--- a/aten/src/ATen/native/vulkan/api/Context.cpp
+++ b/aten/src/ATen/native/vulkan/api/Context.cpp
@@ -154,6 +154,63 @@ Context* context() {
   return context.get();
 }
 
+//
+// UniformParamsBuffer
+//
+
+namespace {
+
+void memcpy_to_buffer(const VulkanBuffer& src, VulkanBuffer& dst) {
+  MemoryMap dst_mapping(dst, MemoryAccessType::WRITE);
+
+  MemoryMap src_mapping(src, api::MemoryAccessType::READ);
+  src_mapping.invalidate();
+
+  void* dst_ptr = dst_mapping.template data<void>();
+  void* src_ptr = src_mapping.template data<void>();
+
+  memcpy(dst_ptr, src_ptr, src.mem_size());
+}
+
+} // namespace
+
+UniformParamsBuffer::UniformParamsBuffer(const UniformParamsBuffer& other)
+    : context_p_(other.context_p_), vulkan_buffer_{} {
+  if (other.vulkan_buffer_) {
+    vulkan_buffer_ = context_p_->adapter_ptr()->vma().create_uniform_buffer(
+        other.vulkan_buffer_.mem_size());
+
+    memcpy_to_buffer(other.vulkan_buffer_, vulkan_buffer_);
+  }
+}
+
+UniformParamsBuffer& UniformParamsBuffer::operator=(
+    const UniformParamsBuffer& other) {
+  if (&other != this) {
+    context_p_ = other.context_p_;
+
+    // Move vulkan_buffer_ to another VulkanBuffer for cleanup
+    if (vulkan_buffer_) {
+      VulkanBuffer temp_buffer(std::move(vulkan_buffer_));
+      context_p_->register_buffer_cleanup(temp_buffer);
+    }
+    // vulkan_buffer_ should now be empty
+
+    if (other.vulkan_buffer_) {
+      vulkan_buffer_ = context_p_->adapter_ptr()->vma().create_uniform_buffer(
+          other.vulkan_buffer_.mem_size());
+
+      memcpy_to_buffer(other.vulkan_buffer_, vulkan_buffer_);
+    }
+  }
+
+  return *this;
+}
+
+//
+// VulkanImpl
+//
+
 struct VulkanImpl final : public at::vulkan::VulkanImplInterface {
   bool is_vulkan_available() const override {
     return available();
diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h
index 56db8fa6a173b..ce0525abda573 100644
--- a/aten/src/ATen/native/vulkan/api/Context.h
+++ b/aten/src/ATen/native/vulkan/api/Context.h
@@ -206,14 +206,16 @@ class UniformParamsBuffer final {
   VulkanBuffer vulkan_buffer_;
 
  public:
+  UniformParamsBuffer() : context_p_{nullptr}, vulkan_buffer_{} {}
+
   template <typename Block>
   UniformParamsBuffer(Context* context_p, const Block& block)
       : context_p_(context_p),
         vulkan_buffer_(
             context_p_->adapter_ptr()->vma().create_params_buffer(block)) {}
 
-  UniformParamsBuffer(const UniformParamsBuffer&) = delete;
-  UniformParamsBuffer& operator=(const UniformParamsBuffer&) = delete;
+  UniformParamsBuffer(const UniformParamsBuffer&);
+  UniformParamsBuffer& operator=(const UniformParamsBuffer&);
 
   UniformParamsBuffer(UniformParamsBuffer&&) = default;
   UniformParamsBuffer& operator=(UniformParamsBuffer&&) = default;
diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp
index 9cfdbcdb03f3e..e47f85b9f556f 100644
--- a/aten/src/ATen/native/vulkan/api/Resource.cpp
+++ b/aten/src/ATen/native/vulkan/api/Resource.cpp
@@ -26,8 +26,8 @@ namespace api {
  * always created with the corresponding VkFormat. Consequently, kHalf tensors
  * are currently unsupported in favor of enforcing inputs to be of kFloat dtype.
  */
-VkFormat vk_format(const caffe2::TypeMeta dtype) {
-  switch (c10::typeMetaToScalarType(dtype)) {
+VkFormat vk_format(const at::ScalarType dtype) {
+  switch (dtype) {
     case kFloat:
 #ifdef USE_VULKAN_FP16_INFERENCE
       return VK_FORMAT_R16G16B16A16_SFLOAT;
@@ -663,6 +663,21 @@ VulkanBuffer MemoryAllocator::create_staging_buffer(const VkDeviceSize size) {
   return VulkanBuffer(allocator_, size, mem_props);
 }
 
+VulkanBuffer MemoryAllocator::create_uniform_buffer(const VkDeviceSize size) {
+  const VulkanBuffer::MemoryProperties mem_props{
+      DEFAULT_ALLOCATION_STRATEGY |
+          VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT,
+      VMA_MEMORY_USAGE_AUTO,
+      0u,
+      0u,
+      VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
+  };
+
+  VulkanBuffer uniform_buffer(allocator_, size, mem_props);
+
+  return uniform_buffer;
+}
+
 //
 // VulkanFence
 //
diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h
index 52153ebc0e05f..9180b3422db13 100644
--- a/aten/src/ATen/native/vulkan/api/Resource.h
+++ b/aten/src/ATen/native/vulkan/api/Resource.h
@@ -18,7 +18,7 @@ namespace api {
 
 typedef uint8_t MemoryAccessFlags;
 
-VkFormat vk_format(const caffe2::TypeMeta dtype);
+VkFormat vk_format(const at::ScalarType dtype);
 
 c10::ScalarType c10_scalartype(const VkFormat image_format);
 
@@ -401,6 +401,14 @@ class MemoryAllocator final {
 
   VulkanBuffer create_staging_buffer(const VkDeviceSize);
 
+  /*
+   * Create a uniform buffer with a specified size
+   */
+  VulkanBuffer create_uniform_buffer(const VkDeviceSize);
+
+  /*
+   * Create a uniform buffer containing the data in an arbitrary struct
+   */
   template <typename Block>
   VulkanBuffer create_params_buffer(const Block& block);
 };
@@ -486,16 +494,7 @@ struct FencePool final {
 
 template <typename Block>
 inline VulkanBuffer MemoryAllocator::create_params_buffer(const Block& block) {
-  const VulkanBuffer::MemoryProperties mem_props{
-      DEFAULT_ALLOCATION_STRATEGY |
-          VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT,
-      VMA_MEMORY_USAGE_AUTO,
-      0u,
-      0u,
-      VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
-  };
-
-  VulkanBuffer uniform_buffer(allocator_, sizeof(Block), mem_props);
+  VulkanBuffer uniform_buffer = create_uniform_buffer(sizeof(Block));
 
   // Fill the uniform buffer with data in block
   {
diff --git a/aten/src/ATen/native/vulkan/glsl/buffer_to_buffer.glsl b/aten/src/ATen/native/vulkan/glsl/buffer_to_buffer.glsl
new file mode 100644
index 0000000000000..7a67a8ca37372
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/buffer_to_buffer.glsl
@@ -0,0 +1,78 @@
+#version 450 core
+
+#define PRECISION $precision
+#define FORMAT $format
+
+#include "indexing.h"
+
+layout(std430) buffer;
+
+/*
+ * Output Buffer
+ */
+layout(set = 0, binding = 0) buffer PRECISION restrict writeonly OutBuffer {
+  float data[];
+}
+uOutput;
+
+/*
+ * Output Buffer Metadata
+ */
+layout(set = 0, binding = 1) uniform PRECISION restrict OutMeta {
+  uvec4 sizes;
+  uvec4 strides;
+  uint ndim;
+  uint buf_length;
+}
+uOutMeta;
+
+/*
+ * Input Buffer
+ */
+layout(set = 0, binding = 2) buffer PRECISION restrict readonly InBuffer {
+  float data[];
+}
+uInput;
+
+/*
+ * Input Buffer Metadata
+ */
+layout(set = 0, binding = 3) uniform PRECISION restrict InMeta {
+  uvec4 sizes;
+  uvec4 strides;
+  uint ndim;
+  uint buf_length;
+}
+uInMeta;
+
+/*
+ * Local Work Group Size
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Copies data from the tensor at uInput to the tensor at uOutput based on 4D
+ * coordinate. Each element at (x,y,c,n) in uInput will be copied to uOutput at
+ * (x,y,c,n). If (x,y,c,n) is outside the bounds of uInput then 0 will be
+ * written.
+ *
+ * Each shader invocation is responsible for one element of the output buffer.
+ */
+void main() {
+  const uint write_idx = ivec3(gl_GlobalInvocationID).x;
+
+  if (write_idx >= uOutMeta.buf_length) {
+    return;
+  }
+
+  uvec4 write_coord =
+      idx_to_coord(write_idx, uOutMeta.strides, uOutMeta.sizes);
+
+  float outval = 0u;
+  if (all(lessThan(write_coord, uInMeta.sizes))) {
+    uint read_idx = coord_to_idx(write_coord, uInMeta.strides);
+    outval = uInput.data[read_idx];
+  }
+
+  uOutput.data[write_idx] = outval;
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/indexing.h b/aten/src/ATen/native/vulkan/glsl/indexing.h
new file mode 100644
index 0000000000000..e7b6a29fc16ed
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/indexing.h
@@ -0,0 +1,13 @@
+/*
+ * Computes a 4D tensor co-ordinate from a linearized index
+ */
+uvec4 idx_to_coord(const uint idx, const uvec4 strides, const uvec4 sizes) {
+  return ivec4(mod(idx / strides, sizes));
+}
+
+/*
+ * Computes a linearized index from a 4D tensor co-ordinate
+ */
+uint coord_to_idx(const uvec4 coord, const uvec4 strides) {
+  return int(dot(coord * strides, ivec4(1)));
+}
diff --git a/aten/src/ATen/native/vulkan/ops/Common.cpp b/aten/src/ATen/native/vulkan/ops/Common.cpp
index 5a3daeb074288..4c645ba3b1423 100644
--- a/aten/src/ATen/native/vulkan/ops/Common.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Common.cpp
@@ -5,6 +5,15 @@ namespace native {
 namespace vulkan {
 namespace ops {
 
+api::utils::uvec4 make_nchw_uvec4(const IntArrayRef arr) {
+  uint32_t w = get_dim<Dim4D::Width>(arr);
+  uint32_t h = get_dim<Dim4D::Height>(arr);
+  uint32_t c = get_dim<Dim4D::Channel>(arr);
+  uint32_t n = get_dim<Dim4D::Batch>(arr);
+
+  return {w, h, c, n};
+}
+
 api::utils::uvec3 adaptive_work_group_size(
     const api::utils::uvec3& global_work_group) {
   api::utils::uvec3 local_group_size = {4, 4, 4};
diff --git a/aten/src/ATen/native/vulkan/ops/Common.h b/aten/src/ATen/native/vulkan/ops/Common.h
index 9d4e50c800955..4248417b3c991 100644
--- a/aten/src/ATen/native/vulkan/ops/Common.h
+++ b/aten/src/ATen/native/vulkan/ops/Common.h
@@ -106,6 +106,12 @@ uint32_t get_dim(const vTensor& v_in) {
   return get_dim<N>(v_in.sizes());
 }
 
+/*
+ * Given an IntArrayRef of up to 4 elements, constructs a uvec4 containing those
+ * elements in reverse order.
+ */
+api::utils::uvec4 make_nchw_uvec4(const IntArrayRef arr);
+
 inline c10::optional<Tensor> get_optional_tensor(
     const c10::impl::GenericList& gen_list,
     const uint32_t idx) {
diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
index 9a34169c4a99a..63fb00d6ee0a3 100644
--- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
@@ -125,7 +125,7 @@ at::Tensor rearrange_weights_dw(const Tensor& weight_in) {
   // reshape to stack the resulting batches vertically
   weight = weight.permute({1, 0, 2, 3}).reshape({4, N4 * C, H * W});
 
-  return weight;
+  return weight.contiguous();
 }
 
 /*
@@ -228,7 +228,7 @@ at::Tensor rearrange_weights_2d(const Tensor& weight_in, bool tconv) {
   // Collapse the outermost dim so that each group of 4 is stacked vertically
   weight = weight.permute({1, 0, 2, 3}).reshape({4, N4 * H, C_aligned * W});
 
-  return weight;
+  return weight.contiguous();
 }
 
 /*
@@ -272,7 +272,7 @@ at::Tensor rearrange_bias(
   bias = bias.reshape({L4, 4}).permute({1, 0});
   bias = bias.reshape({4, 1, L4});
 
-  return bias;
+  return bias.contiguous();
 }
 
 //
@@ -517,8 +517,8 @@ vTensor pack_weights(
   vTensor v_weight{
       api::context(),
       weight_rearranged.sizes(),
-      quantized ? api::StorageType::TEXTURE_3D : api::StorageType::TEXTURE_2D,
       weight_arg.options(),
+      quantized ? api::StorageType::TEXTURE_3D : api::StorageType::TEXTURE_2D,
   };
 
   if (quantized) {
@@ -542,8 +542,8 @@ vTensor pack_biases(
   vTensor v_bias{
       api::context(),
       bias_rearranged.sizes(),
-      quantized ? api::StorageType::TEXTURE_3D : api::StorageType::TEXTURE_2D,
       weight.options(),
+      quantized ? api::StorageType::TEXTURE_3D : api::StorageType::TEXTURE_2D,
   };
 
   if (quantized) {
diff --git a/aten/src/ATen/native/vulkan/ops/Copy.cpp b/aten/src/ATen/native/vulkan/ops/Copy.cpp
index dbac25e0c7ee3..06f9225fe47df 100644
--- a/aten/src/ATen/native/vulkan/ops/Copy.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Copy.cpp
@@ -1,3 +1,4 @@
+#include <ATen/ATen.h>
 #include <ATen/native/vulkan/ops/Copy.h>
 #include <ATen/native/vulkan/ops/Utils.h>
 
@@ -52,7 +53,7 @@ void transfer_cpu_to_vulkan(const Tensor& src, vTensor& v_dst) {
   // a 16 bit format will be used for at::kFloat.
   Tensor src_nc4hw = utils::nchw_to_nc4hw(src).to(v_dst.texture_dtype());
 
-  api::StorageBuffer staging(context, v_dst.texture_dtype(), v_dst.numcells());
+  api::StorageBuffer staging(context, v_dst.texture_dtype(), v_dst.gpu_numel());
   // Copy data into the staging buffer
   {
     api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE);
@@ -71,7 +72,7 @@ void transfer_vulkan_to_cpu(vTensor& v_src, Tensor& dst) {
   // Temporary tensor to receive copied NC4HW data
   at::Tensor dst_tmp = utils::create_staging_tensor(v_src);
 
-  api::StorageBuffer staging(context, v_src.texture_dtype(), v_src.numcells());
+  api::StorageBuffer staging(context, v_src.texture_dtype(), v_src.gpu_numel());
 
   api::VulkanFence fence = context->fences().get_fence();
 
@@ -135,13 +136,16 @@ void transfer_vulkan_to_vulkan(vTensor& src, vTensor& dst) {
 void pack_cpu_to_vulkan(const Tensor& src, vTensor& dst) {
   api::Context* const context = api::context();
 
+  // Ensure that src is contiguous in its memory format
+  Tensor src_contig = src.contiguous(src.suggest_memory_format());
+
   // Note that the float data type has been enforced for the storage buffer
   // below. The reason for this is that the nchw_to_image and image_to_nchw
   // shaders which perform the transfer to/from an image texture expect a buffer
   // of floats as input. GLSL/Vulkan does not natively support 16 bit arithmetic
   // types, so for now storage buffers created for compute shaders must define
   // floats as their base data type.
-  api::StorageBuffer staging(context, at::kFloat, dst.numcells());
+  api::StorageBuffer staging(context, at::kFloat, dst.gpu_numel());
   {
     api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE);
 
@@ -150,9 +154,9 @@ void pack_cpu_to_vulkan(const Tensor& src, vTensor& dst) {
     // buffer as input (note that at::kFloat is used to create the StorageBuffer
     // above).
     if (src.dtype() == at::kHalf) {
-      memcpy_to_mapping(src.to(at::kFloat), mapping);
+      memcpy_to_mapping(src_contig.to(at::kFloat), mapping);
     } else {
-      memcpy_to_mapping(src, mapping);
+      memcpy_to_mapping(src_contig, mapping);
     }
   }
   utils::pack_staging_to_vtensor(staging.buffer(), dst);
@@ -163,7 +167,7 @@ void pack_vulkan_to_cpu(vTensor& src, Tensor& dst) {
 
   // Refer to the comment in pack_cpu_to_vulkan for why at::kFloat is specified
   // for the storage buffer below.
-  api::StorageBuffer staging(context, at::kFloat, src.numcells());
+  api::StorageBuffer staging(context, at::kFloat, src.gpu_numel());
 
   api::VulkanFence fence = context->fences().get_fence();
 
@@ -245,6 +249,28 @@ Tensor& copy_(Tensor& dst, const Tensor& src) {
   return dst;
 }
 
+ops::vTensor to_vulkan(at::Tensor& src, const api::StorageType storage_type) {
+  TORCH_CHECK(
+      src.device().type() == at::kCPU,
+      "Vulkan to_vulkan(): input tensor must be a CPU tensor!")
+
+  ops::vTensor v_ret{
+      api::context(),
+      src.sizes(),
+      src.options().memory_format(src.suggest_memory_format()),
+      storage_type};
+
+  ops::pack_cpu_to_vulkan(src, v_ret);
+
+  return v_ret;
+}
+
+at::Tensor from_vulkan(ops::vTensor& v_src) {
+  at::Tensor ret = at::empty(v_src.sizes(), v_src.options().device(at::kCPU));
+  ops::pack_vulkan_to_cpu(v_src, ret);
+  return ret;
+}
+
 } // namespace ops
 } // namespace vulkan
 } // namespace native
diff --git a/aten/src/ATen/native/vulkan/ops/Copy.h b/aten/src/ATen/native/vulkan/ops/Copy.h
index bf72a96b219fb..a91d500a1a343 100644
--- a/aten/src/ATen/native/vulkan/ops/Copy.h
+++ b/aten/src/ATen/native/vulkan/ops/Copy.h
@@ -19,6 +19,12 @@ void pack_vulkan_to_cpu(vTensor& src, Tensor& dst);
 
 Tensor& copy_(Tensor& dst, const Tensor& src);
 
+ops::vTensor to_vulkan(
+    at::Tensor& src,
+    const api::StorageType storage_type = api::StorageType::TEXTURE_3D);
+
+at::Tensor from_vulkan(ops::vTensor& v_src);
+
 //
 // Utility functions for memcpy
 //
@@ -28,7 +34,7 @@ void memcpy_to_mapping_impl(const Tensor& src, api::MemoryMap& dst_mapping) {
   T* data_ptr = dst_mapping.template data<T>();
   memcpy(
       data_ptr,
-      src.contiguous().data_ptr<T>(),
+      src.data_ptr<T>(),
       std::min(src.nbytes(), dst_mapping.nbytes()));
 }
 
diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp
index b003a322804ad..c8225f08354ef 100644
--- a/aten/src/ATen/native/vulkan/ops/Mm.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp
@@ -43,7 +43,7 @@ vTensor pack_weights(const Tensor& weight_arg) {
       weight.options(),
   };
 
-  api::StorageBuffer staging(context, at::kFloat, v_weight.numcells());
+  api::StorageBuffer staging(context, at::kFloat, v_weight.gpu_numel());
   {
     api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE);
 
@@ -106,7 +106,7 @@ vTensor pack_biases(
         bias_arg->options(),
     };
 
-    api::StorageBuffer staging(context, at::kFloat, v_bias.numcells());
+    api::StorageBuffer staging(context, at::kFloat, v_bias.gpu_numel());
     {
       api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE);
 
@@ -135,7 +135,7 @@ vTensor pack_biases(
         weight_arg.options(),
     };
 
-    api::StorageBuffer staging(context, at::kFloat, v_bias.numcells());
+    api::StorageBuffer staging(context, at::kFloat, v_bias.gpu_numel());
     {
       api::MemoryMap mapping(staging.buffer(), api::MemoryAccessType::WRITE);
 
diff --git a/aten/src/ATen/native/vulkan/ops/Shape.cpp b/aten/src/ATen/native/vulkan/ops/Shape.cpp
index d8263e59668e6..4209a3781cd28 100644
--- a/aten/src/ATen/native/vulkan/ops/Shape.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Shape.cpp
@@ -22,7 +22,7 @@ Tensor view_internal(const Tensor& self_arg, const IntArrayRef shape) {
       self.options(),
   };
 
-  api::StorageBuffer buffer(context, at::kFloat, v_self.numcells(), true);
+  api::StorageBuffer buffer(context, at::kFloat, v_self.gpu_numel(), true);
 
   utils::pack_vtensor_to_staging(v_self, buffer.buffer());
 
diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.cpp b/aten/src/ATen/native/vulkan/ops/Tensor.cpp
index b9ce7a0caf5fe..315462ac0d1df 100644
--- a/aten/src/ATen/native/vulkan/ops/Tensor.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Tensor.cpp
@@ -1,3 +1,4 @@
+#include <ATen/native/vulkan/api/Utils.h>
 #include <ATen/native/vulkan/ops/Common.h>
 #include <ATen/native/vulkan/ops/Tensor.h>
 #include <c10/util/accumulate.h>
@@ -9,45 +10,207 @@ namespace ops {
 
 namespace {
 
-api::utils::uvec3 image_extents(const IntArrayRef sizes) {
-  int64_t width = 1;
-  int64_t height = 1;
-  int64_t depth = 1;
+/*
+ * Extracts the memory format member of a TensorOptions struct. If there is no
+ * empty format listed, then a contiguous format is assumed.
+ */
+at::MemoryFormat get_memory_format(const TensorOptions& options) {
+  return options.memory_format_opt() ? *(options.memory_format_opt())
+                                     : at::MemoryFormat::Contiguous;
+}
 
-  switch (sizes.size()) {
-    case 1:
-      width = sizes[0];
-      break;
+/*
+ * Calculates the strides of a contiguous tensor. empty_tensor_restride from
+ * TensorImpl.h was used as a reference.
+ */
+c10::SmallVector<int64_t, 6u> calc_contiguous_strides(const IntArrayRef sizes) {
+  int64_t ndim = sizes.size();
+  c10::SmallVector<int64_t, 6u> strides(ndim);
+
+  int64_t running_product = 1;
+  if (ndim >= 1) {
+    strides[ndim - 1] = running_product;
+    for (int i = sizes.size() - 2; i >= 0; --i) {
+      running_product *= sizes[i + 1];
+      strides[i] = running_product;
+    }
+  }
 
-    case 2:
-      width = sizes[1];
-      height = sizes[0];
-      break;
+  return strides;
+}
 
-    case 3:
-      width = sizes[2];
-      height = sizes[1];
-      depth = sizes[0];
-      break;
+c10::SmallVector<int64_t, 6u> calc_channels_last_strides(
+    const IntArrayRef sizes) {
+  c10::SmallVector<int64_t, 6u> strides(sizes.size());
 
+  switch (sizes.size()) {
     case 4:
-      width = sizes[3];
-      height = sizes[2];
-      depth = sizes[0] * sizes[1];
-      break;
-
+      strides[1] = 1;
+      strides[3] = sizes[1];
+      strides[2] = strides[3] * sizes[3];
+      strides[0] = strides[2] * sizes[2];
+      return strides;
+    case 3:
+      strides[0] = 1;
+      strides[2] = sizes[0];
+      strides[1] = strides[2] * sizes[2];
+      return strides;
     default:
-      TORCH_INTERNAL_ASSERT(
-          false,
-          "Only Tensors with 1 <= dim <= 4 can be represented as a Vulkan Image!");
+      TORCH_CHECK(
+          false, "ChannelsLast format only available for 3 <= ndim <= 4!");
   }
 
-  return {
-      api::utils::safe_downcast<uint32_t>(width),
-      api::utils::safe_downcast<uint32_t>(height),
-      api::utils::safe_downcast<uint32_t>(
-          api::utils::div_up(depth, INT64_C(4))),
+  return strides;
+}
+
+/*
+ * Calculates the strides of a tensor based on the sizes and memory format. Note
+ * that strides are only valid for vTensors that are backed by buffer storage;
+ * if texture storage is used then the strides are invalid and set to zeros.
+ */
+c10::SmallVector<int64_t, 6u> calc_strides(
+    const IntArrayRef sizes,
+    const at::MemoryFormat memory_format,
+    const api::StorageType storage_type) {
+  if (storage_type == api::StorageType::BUFFER) {
+    switch (memory_format) {
+      case MemoryFormat::Contiguous:
+        return calc_contiguous_strides(sizes);
+        break;
+      case MemoryFormat::ChannelsLast:
+        return calc_channels_last_strides(sizes);
+        break;
+      default:
+        TORCH_CHECK(false, "Invalid memory format used to create vTensor!");
+    }
+  } else {
+    c10::SmallVector<int64_t, 6u> strides(sizes.size());
+    return strides;
+  }
+}
+
+/*
+ * When stored on the GPU, one dimension will be aligned to the next multiple of
+ * 4 in order to take advantage of vec4 data types. This function adjusts one of
+ * the dimensions based on the desired memory format and storage type.
+ */
+c10::SmallVector<int64_t, 6u> calc_gpu_sizes(
+    const IntArrayRef sizes,
+    const at::MemoryFormat memory_format,
+    const api::StorageType storage_type) {
+  size_t ndim = sizes.size();
+
+  // For buffer formats, the innermost dim (i.e. where the stride is 1) will be
+  // aligned up.
+  if (storage_type == api::StorageType::BUFFER) {
+    c10::SmallVector<int64_t, 6u> gpu_sizes{sizes};
+
+    switch (memory_format) {
+      case at::MemoryFormat::Contiguous:
+        gpu_sizes[ndim - 1] = api::utils::align_up(sizes[ndim - 1], INT64_C(4));
+        break;
+
+      case at::MemoryFormat::ChannelsLast:
+        switch (ndim) {
+          case 3:
+            gpu_sizes[0] = api::utils::align_up(sizes[0], INT64_C(4));
+            break;
+
+          case 4:
+            gpu_sizes[1] = api::utils::align_up(sizes[1], INT64_C(4));
+            break;
+        }
+        break;
+
+      default:
+        TORCH_CHECK(false, "Invalid memory format used to create vTensor!");
+        break;
+    }
+
+    return gpu_sizes;
+  } else {
+    TORCH_CHECK(
+        ndim >= 1 && ndim <= 4,
+        "Texture storage only valid for 1 <= ndim <= 4!");
+
+    c10::SmallVector<int64_t, 6u> gpu_sizes(3);
+
+    // Channel dim will be always be aligned. For 4 dimensional tensors, batch
+    // and channel are combined, then aligned.
+    switch (ndim) {
+      case 1:
+        gpu_sizes[0] = 4;
+        gpu_sizes[1] = 1;
+        gpu_sizes[2] = sizes[0];
+        break;
+
+      case 2:
+        gpu_sizes[0] = 4;
+        gpu_sizes[1] = sizes[0];
+        gpu_sizes[2] = sizes[1];
+        break;
+
+      case 3:
+        gpu_sizes[0] = api::utils::align_up(sizes[0], INT64_C(4));
+        gpu_sizes[1] = sizes[1];
+        gpu_sizes[2] = sizes[2];
+        break;
+
+      case 4:
+        int64_t combined_depth = sizes[0] * sizes[1];
+        gpu_sizes[0] = api::utils::align_up(combined_depth, INT64_C(4));
+        gpu_sizes[1] = sizes[2];
+        gpu_sizes[2] = sizes[3];
+        break;
+    }
+    return gpu_sizes;
+  }
+}
+
+/*
+ * Creates a uvec3 denoting the extents of the image texture that will be
+ * created to store a tensor of a given size.
+ */
+api::utils::uvec3 create_image_extents(
+    const IntArrayRef gpu_sizes,
+    const api::StorageType storage_type) {
+  size_t ndim = gpu_sizes.size();
+
+  if (storage_type == api::StorageType::BUFFER) {
+    // image extents do not apply to buffer storage
+    return {0u, 0u, 0u};
+  } else {
+    TORCH_CHECK(
+        ndim >= 1 && ndim <= 3,
+        "Texture storage only valid for 1 <= ndim <= 3!");
+
+    uint32_t width = get_dim<Dim4D::Width>(gpu_sizes);
+    uint32_t height = get_dim<Dim4D::Height>(gpu_sizes);
+    uint32_t depth = get_dim<Dim4D::Channel>(gpu_sizes);
+
+    TORCH_CHECK(depth % 4 == 0, "Channels must be divisible by 4!")
+
+    return {width, height, depth / 4u};
+  }
+}
+
+api::UniformParamsBuffer make_metadata_uniform(
+    api::Context* const context,
+    const IntArrayRef sizes,
+    const IntArrayRef strides,
+    const api::StorageType storage_type) {
+  if (storage_type != api::StorageType::BUFFER) {
+    return api::UniformParamsBuffer();
+  }
+
+  vTensor::BufferMetadata metadata{
+      ops::make_nchw_uvec4(sizes),
+      ops::make_nchw_uvec4(strides),
+      api::utils::safe_downcast<uint32_t>(sizes.size()),
+      api::utils::safe_downcast<uint32_t>(c10::multiply_integers(sizes)),
   };
+
+  return api::UniformParamsBuffer(context, metadata);
 }
 
 } // namespace
@@ -59,58 +222,69 @@ api::utils::uvec3 image_extents(const IntArrayRef sizes) {
 vTensor::vTensor(
     api::Context* const context,
     const IntArrayRef sizes,
-    const TensorOptions& options)
-    : view_(std::make_shared<vTensorStorage>(
+    const TensorOptions& options,
+    const api::StorageType storage_type)
+    : options_(options),
+      memory_format_(get_memory_format(options)),
+      // Calculate sizes and strides
+      sizes_{sizes},
+      strides_{calc_strides(sizes, memory_format_, storage_type)},
+      gpu_sizes_{calc_gpu_sizes(sizes, memory_format_, storage_type)},
+      gpu_strides_{calc_strides(gpu_sizes_, memory_format_, storage_type)},
+      // Vulkan uniform buffer containing sizes and stride info
+      metadata_uniform_{make_metadata_uniform(
           context,
-          sizes,
-          api::StorageType::TEXTURE_3D,
-          options)) {}
-
-vTensor::vTensor(
-    api::Context* const context,
-    const IntArrayRef sizes,
-    const api::StorageType storage_type,
-    const TensorOptions& options)
-    : view_(std::make_shared<vTensorStorage>(
+          gpu_sizes_,
+          gpu_strides_,
+          storage_type)},
+      // Construct Tensor storage
+      view_(std::make_shared<vTensorStorage>(
           context,
-          sizes,
           storage_type,
-          options)) {}
+          gpu_sizes_,
+          dtype())) {
+  ops::verify(options);
+}
 
 vTensor::vTensor(
     api::Context* const context,
     const IntArrayRef sizes,
     const TensorOptions& options,
     double q_scale,
-    int64_t q_zero_point)
-    : view_(std::make_shared<vTensorStorage>(
+    int64_t q_zero_point,
+    const api::StorageType storage_type)
+    : options_(options),
+      memory_format_(get_memory_format(options)),
+      // Calculate sizes and strides
+      sizes_{sizes},
+      strides_{calc_strides(sizes, memory_format_, storage_type)},
+      gpu_sizes_{calc_gpu_sizes(sizes, memory_format_, storage_type)},
+      gpu_strides_{calc_strides(gpu_sizes_, memory_format_, storage_type)},
+      // Vulkan uniform buffer containing sizes and stride info
+      metadata_uniform_{make_metadata_uniform(
           context,
-          sizes,
-          api::StorageType::TEXTURE_3D,
-          options,
-          q_scale,
-          q_zero_point)) {}
-
-vTensor::vTensor(
-    api::Context* const context,
-    const IntArrayRef sizes,
-    const api::StorageType storage_type,
-    const TensorOptions& options,
-    double q_scale,
-    int64_t q_zero_point)
-    : view_(std::make_shared<vTensorStorage>(
+          gpu_sizes_,
+          gpu_strides_,
+          storage_type)},
+      // Quantization params
+      is_quantized_{true},
+      q_scale_{q_scale},
+      q_zero_point_{q_zero_point},
+      // Construct Tensor storage
+      view_(std::make_shared<vTensorStorage>(
           context,
-          sizes,
           storage_type,
-          options,
-          q_scale,
-          q_zero_point)) {}
+          gpu_sizes_,
+          dtype())) {
+  verify(options);
+}
 
 api::VulkanImage& vTensor::image(
     api::PipelineBarrier& pipeline_barrier,
     const api::PipelineStageFlags stage) const& {
-  view_->transition(pipeline_barrier, stage, api::MemoryAccessType::READ);
+  TORCH_CHECK(view_->image_, "vTensor has empty image texture!");
 
+  view_->transition(pipeline_barrier, stage, api::MemoryAccessType::READ);
   return view_->image_;
 }
 
@@ -118,11 +292,40 @@ api::VulkanImage& vTensor::image(
     api::PipelineBarrier& pipeline_barrier,
     const api::PipelineStageFlags stage,
     const api::MemoryAccessFlags access) & {
-  view_->transition(pipeline_barrier, stage, access);
+  TORCH_CHECK(view_->image_, "vTensor has empty image texture!");
 
+  view_->transition(pipeline_barrier, stage, access);
   return view_->image_;
 }
 
+api::VulkanBuffer& vTensor::buffer(
+    api::PipelineBarrier& pipeline_barrier,
+    const api::PipelineStageFlags stage) const& {
+  TORCH_CHECK(view_->buffer_, "vTensor has empty buffer!");
+
+  view_->transition(pipeline_barrier, stage, api::MemoryAccessType::READ);
+  return view_->buffer_;
+}
+
+api::VulkanBuffer& vTensor::buffer(
+    api::PipelineBarrier& pipeline_barrier,
+    const api::PipelineStageFlags stage,
+    const api::MemoryAccessFlags access) & {
+  TORCH_CHECK(view_->buffer_, "vTensor has empty buffer!");
+
+  view_->transition(pipeline_barrier, stage, access);
+  return view_->buffer_;
+}
+
+vTensor::BufferMetadata vTensor::get_cpu_buffer_metadata() const {
+  return {
+      ops::make_nchw_uvec4(sizes_),
+      ops::make_nchw_uvec4(strides_),
+      api::utils::safe_downcast<uint32_t>(sizes_.size()),
+      api::utils::safe_downcast<uint32_t>(c10::multiply_integers(sizes_)),
+  };
+}
+
 //
 // vTensorStorage
 //
@@ -130,7 +333,7 @@ api::VulkanImage& vTensor::image(
 api::VulkanImage allocate_image(
     api::Context* const context_ptr,
     api::utils::uvec3& extents,
-    api::StorageType storage_type,
+    const api::StorageType storage_type,
     const VkFormat image_format) {
   api::Adapter* adapter_ptr = context_ptr->adapter_ptr();
 
@@ -153,9 +356,9 @@ api::VulkanImage allocate_image(
       image_type = VK_IMAGE_TYPE_2D;
       image_view_type = VK_IMAGE_VIEW_TYPE_2D;
       break;
-    case api::StorageType::BUFFER:
-    case api::StorageType::UNKNOWN:
-      TORCH_CHECK(false, "Requested storage type must be a texture type.");
+    default:
+      // Return an empty VulkanImage by default
+      return api::VulkanImage();
   }
 
   VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props);
@@ -170,53 +373,48 @@ api::VulkanImage allocate_image(
       true);
 }
 
-vTensorStorage::vTensorStorage(
-    api::Context* const context,
-    const IntArrayRef sizes,
+api::VulkanBuffer allocate_buffer(
+    api::Context* const context_ptr,
+    const int64_t numel,
     const api::StorageType storage_type,
-    const TensorOptions& options)
-    : context_(context),
-      extents_(image_extents(sizes)),
-      options_(options),
-      sizes_(sizes),
-      strides_(sizes.size()),
-      storage_type_{storage_type},
-      image_(allocate_image(
-          context_,
-          extents_,
-          storage_type_,
-          api::vk_format(options_.dtype()))),
-      last_access_{} {
-  ops::verify(options);
+    const c10::ScalarType dtype) {
+  api::Adapter* adapter_ptr = context_ptr->adapter_ptr();
+
+  switch (storage_type) {
+    case api::StorageType::BUFFER:
+      break;
+    default:
+      // Return an empty VulkanBuffer if Buffer storage is not used
+      return api::VulkanBuffer();
+  }
+
+  return adapter_ptr->vma().create_storage_buffer(
+      c10::elementSize(dtype) * numel, true);
 }
 
 vTensorStorage::vTensorStorage(
     api::Context* const context,
-    const IntArrayRef sizes,
     const api::StorageType storage_type,
-    const TensorOptions& options,
-    double q_scale_in,
-    int64_t q_zero_point_in)
+    const IntArrayRef gpu_sizes,
+    const at::ScalarType dtype)
     : context_(context),
-      extents_(image_extents(sizes)),
-      options_(options),
-      sizes_(sizes),
-      strides_(sizes.size()),
-      is_quantized_{true},
-      q_scale{q_scale_in},
-      q_zero_point{q_zero_point_in},
       storage_type_{storage_type},
+      extents_(create_image_extents(gpu_sizes, storage_type)),
+      buffer_length_{c10::multiply_integers(gpu_sizes)},
       image_(allocate_image(
           context_,
           extents_,
           storage_type_,
-          api::vk_format(options_.dtype()))),
-      last_access_{} {
-  ops::verify(options);
-}
+          api::vk_format(dtype))),
+      buffer_(allocate_buffer(context_, buffer_length_, storage_type_, dtype)),
+      last_access_{} {}
 
 vTensorStorage::~vTensorStorage() {
-  context_->register_image_cleanup(image_);
+  if (image_) {
+    context_->register_image_cleanup(image_);
+  } else if (buffer_) {
+    context_->register_buffer_cleanup(buffer_);
+  }
 }
 
 void vTensorStorage::transition(
@@ -227,12 +425,18 @@ void vTensorStorage::transition(
   api::PipelineStageFlags prev_stage = last_access_.stage;
   api::MemoryAccessFlags prev_access = last_access_.access;
 
-  const VkImageLayout cur_layout = image_.layout();
-  const VkImageLayout new_layout = api::vk_layout(cur_stage, cur_access);
-
-  const bool layout_changed = cur_layout != new_layout;
   const bool prev_written = (prev_access & api::MemoryAccessType::WRITE) != 0;
 
+  VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+  VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+  bool layout_changed = false;
+  if (image_) {
+    cur_layout = image_.layout();
+    new_layout = api::vk_layout(cur_stage, cur_access);
+
+    layout_changed = cur_layout != new_layout;
+  }
+
   if (prev_written || layout_changed) {
     VkPipelineStageFlags src_stage = api::vk_stage(prev_stage);
     if (0u == src_stage) {
@@ -246,14 +450,21 @@ void vTensorStorage::transition(
     pipeline_barrier.stage.src |= src_stage;
     pipeline_barrier.stage.dst |= dst_stage;
 
-    pipeline_barrier.images.push_back(api::ImageMemoryBarrier(
-        api::vk_access(prev_stage, prev_access),
-        api::vk_access(cur_stage, cur_access),
-        cur_layout,
-        new_layout,
-        image_));
-
-    image_.set_layout(new_layout);
+    if (image_) {
+      pipeline_barrier.images.push_back(api::ImageMemoryBarrier(
+          api::vk_access(prev_stage, prev_access),
+          api::vk_access(cur_stage, cur_access),
+          cur_layout,
+          new_layout,
+          image_));
+
+      image_.set_layout(new_layout);
+    } else if (buffer_) {
+      pipeline_barrier.buffers.push_back(api::BufferMemoryBarrier(
+          api::vk_access(prev_stage, prev_access),
+          api::vk_access(cur_stage, cur_access),
+          buffer_));
+    }
   }
 
   last_access_.stage = cur_stage;
@@ -306,9 +517,10 @@ void verify(const TensorOptions& options) {
       !options.has_layout() || (c10::kStrided == options.layout()),
       "'layout' tensor option is not yet supported under Vulkan!");
 
+  at::MemoryFormat memory_format = get_memory_format(options);
   TORCH_CHECK(
-      !options.has_memory_format() ||
-          (c10::MemoryFormat::Contiguous == options.memory_format_opt()),
+      memory_format == at::MemoryFormat::ChannelsLast ||
+          memory_format == at::MemoryFormat::Contiguous,
       "'memory_format' tensor option is not yet supported under Vulkan!");
 }
 
diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.h b/aten/src/ATen/native/vulkan/ops/Tensor.h
index 6a41638057701..241d2c839b80a 100644
--- a/aten/src/ATen/native/vulkan/ops/Tensor.h
+++ b/aten/src/ATen/native/vulkan/ops/Tensor.h
@@ -33,16 +33,9 @@ class vTensorStorage final {
 
   vTensorStorage(
       api::Context* context,
-      IntArrayRef sizes,
-      const api::StorageType storage_type,
-      const TensorOptions& options);
-  vTensorStorage(
-      api::Context* context,
-      IntArrayRef sizes,
       const api::StorageType storage_type,
-      const TensorOptions& options,
-      double q_scale,
-      int64_t q_zero_point);
+      const IntArrayRef sizes,
+      const at::ScalarType dtype);
 
   vTensorStorage(const vTensorStorage&) = delete;
   vTensorStorage& operator=(const vTensorStorage&) = delete;
@@ -58,18 +51,15 @@ class vTensorStorage final {
   // Context
   api::Context* context_;
 
-  // Metadata
+  api::StorageType storage_type_;
+
+  // Resource sizings
   api::utils::uvec3 extents_;
-  TensorOptions options_;
-  c10::SmallVector<int64_t, 6u> sizes_;
-  c10::SmallVector<int64_t, 6u> strides_;
-  bool is_quantized_{false};
-  double q_scale{1.0f};
-  int64_t q_zero_point{0u};
+  int64_t buffer_length_;
 
   // Image Texture
-  api::StorageType storage_type_;
   mutable api::VulkanImage image_;
+  mutable api::VulkanBuffer buffer_;
 
   // Last Access - used to insert memory barriers
   LastAccess last_access_;
@@ -95,33 +85,53 @@ class vTensor final {
   // Do not allow empty vTensor construction
   vTensor() = default;
 
+  // Default constructor
   vTensor(
       api::Context* context,
       IntArrayRef sizes,
-      const TensorOptions& options);
-
-  vTensor(
-      api::Context* context,
-      IntArrayRef sizes,
-      const api::StorageType storage_type,
-      const TensorOptions& options);
-
-  vTensor(
-      api::Context* const context,
-      const IntArrayRef sizes,
       const TensorOptions& options,
-      double q_scale,
-      int64_t q_zero_point);
+      const api::StorageType storage_type = api::StorageType::TEXTURE_3D);
 
+  // Default constructor with quantization parameters
   vTensor(
       api::Context* const context,
       const IntArrayRef sizes,
-      const api::StorageType storage_type,
       const TensorOptions& options,
       double q_scale,
-      int64_t q_zero_point);
+      int64_t q_zero_point,
+      const api::StorageType storage_type = api::StorageType::TEXTURE_3D);
+
+  // Used for passing buffer sizes and strides data to shaders
+  struct BufferMetadata {
+    api::utils::uvec4 sizes;
+    api::utils::uvec4 strides;
+    uint32_t ndim;
+    uint32_t buffer_length;
+  };
 
  private:
+  // Tensor Options
+  TensorOptions options_;
+  at::MemoryFormat memory_format_;
+
+  // Sizes and Strides
+  c10::SmallVector<int64_t, 6u> sizes_;
+  c10::SmallVector<int64_t, 6u> strides_;
+
+  // Storage Dimensions. When stored on the GPU, one dimension will be aligned
+  // to the next multiple of 4 in order to take advantage of vec4 data types.
+  c10::SmallVector<int64_t, 6u> gpu_sizes_;
+  c10::SmallVector<int64_t, 6u> gpu_strides_;
+
+  // A Vulkan uniform buffer containing sizes and strides of the GPU buffer that
+  // can be passed into a shader.
+  api::UniformParamsBuffer metadata_uniform_;
+
+  // Quantization params
+  bool is_quantized_{false};
+  double q_scale_{1.0f};
+  int64_t q_zero_point_{0u};
+
   // Even at the cost of a heap allocation plus the resulting negative impact
   // on cache locality due to the subsequent pointer chasing, it is still
   // critcal to share the view across vTensor implementations to minimize
@@ -158,6 +168,15 @@ class vTensor final {
       const api::PipelineStageFlags,
       const api::MemoryAccessFlags) &;
 
+  api::VulkanBuffer& buffer(
+      api::PipelineBarrier&,
+      const api::PipelineStageFlags) const&;
+
+  api::VulkanBuffer& buffer(
+      api::PipelineBarrier&,
+      const api::PipelineStageFlags,
+      const api::MemoryAccessFlags) &;
+
   /*
     Metadata
   */
@@ -166,6 +185,13 @@ class vTensor final {
     return view_->extents_;
   }
 
+  /*
+   * Extract a ScalarType from the TensorOptions member
+   */
+  inline c10::ScalarType dtype() const {
+    return c10::typeMetaToScalarType(options_.dtype());
+  }
+
   /*
    * Get a c10::ScalarType that corresponds to the image format of the texture
    */
@@ -173,77 +199,96 @@ class vTensor final {
     return api::c10_scalartype(view_->texture_format());
   }
 
+  inline at::MemoryFormat memory_format() const {
+    return memory_format_;
+  }
+
   inline const TensorOptions& options() const {
-    return view_->options_;
+    return options_;
   }
 
   inline IntArrayRef sizes() const {
-    return view_->sizes_;
+    return sizes_;
   }
 
   inline IntArrayRef strides() const {
-    return view_->strides_;
+    return strides_;
   }
 
-  inline void set_is_quantized() const {
-    view_->is_quantized_ = true;
+  inline IntArrayRef gpu_sizes() const {
+    return gpu_sizes_;
+  }
+
+  inline IntArrayRef gpu_strides() const {
+    return gpu_strides_;
+  }
+
+  /*
+   * Get a uniform buffer containing sizes and strides information of the GPU
+   * buffer
+   */
+  inline api::VulkanBuffer& buffer_metadata() {
+    return metadata_uniform_.buffer();
+  }
+
+  /*
+   * Constructs a BufferMetdata struct based on the original sizes and strides
+   * to pass into a shader.
+   */
+  BufferMetadata get_cpu_buffer_metadata() const;
+
+  inline void set_is_quantized() {
+    is_quantized_ = true;
   }
 
   inline bool is_quantized() const {
-    return view_->is_quantized_;
+    return is_quantized_;
   }
 
-  inline void set_scale(const double q_scale) const {
-    view_->q_scale = q_scale;
+  inline void set_scale(const double q_scale) {
+    q_scale_ = q_scale;
   }
 
   inline double get_scale() const {
-    return view_->q_scale;
+    return q_scale_;
   }
 
   inline float get_scale_float() const {
-    return api::utils::safe_downcast<float>(view_->q_scale);
+    return api::utils::safe_downcast<float>(q_scale_);
   }
 
-  inline void set_zero_point(const int64_t q_zero_point) const {
-    view_->q_zero_point = q_zero_point;
+  inline void set_zero_point(const int64_t q_zero_point) {
+    q_zero_point_ = q_zero_point;
   }
 
   inline int64_t get_zero_point() const {
-    return view_->q_zero_point;
+    return q_zero_point_;
   }
 
   inline int32_t get_zero_point_int32() const {
-    return api::utils::safe_downcast<int32_t>(view_->q_zero_point);
+    return api::utils::safe_downcast<int32_t>(q_zero_point_);
   }
 
-  inline size_t nbytes() const {
-    return c10::elementSize(c10::typeMetaToScalarType(options().dtype())) *
-        c10::multiply_integers(sizes());
+  inline size_t numel() const {
+    return c10::multiply_integers(sizes());
   }
 
   /*
-   * Number of texels in the image texture.
+   * Returns numel but based on gpu_sizes_ instead of sizes_
    */
-  inline VkDeviceSize numtexels() {
-    return view_->extents_.data[0u] * view_->extents_.data[1u] *
-        view_->extents_.data[2u];
+  inline size_t gpu_numel() const {
+    return view_->buffer_length_;
   }
 
-  /*
-   * Number of "cells" in the image texture. 4 cells make up a texel.
-   */
-  inline VkDeviceSize numcells() {
-    return view_->extents_.data[0u] * view_->extents_.data[1u] *
-        (4u * view_->extents_.data[2u]);
+  inline size_t nbytes() const {
+    return c10::elementSize(dtype()) * numel();
   }
 
   /*
-   * Number of bytes needed for a buffer to receive all data in the texture
+   * Return nbytes but bnased on gpu_sizes_ instead of sizes_
    */
-  inline VkDeviceSize buffer_bytes() {
-    return c10::elementSize(this->texture_dtype()) * view_->extents_.data[0u] *
-        view_->extents_.data[1u] * (4u * view_->extents_.data[2u]);
+  inline VkDeviceSize gpu_nbytes() const {
+    return c10::elementSize(dtype()) * gpu_numel();
   }
 };
 
diff --git a/aten/src/ATen/native/vulkan/ops/Utils.cpp b/aten/src/ATen/native/vulkan/ops/Utils.cpp
index 30e8d727ff6ad..4c0f866ca9a39 100644
--- a/aten/src/ATen/native/vulkan/ops/Utils.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Utils.cpp
@@ -21,7 +21,7 @@ static api::ShaderSource get_nchw_to_image_shader(const vTensor& v_dst) {
     switch (v_dst.storage_type()) {
       case api::StorageType::TEXTURE_3D:
         return VK_KERNEL(nchw_to_image_quantized);
-      case api::StorageType::TEXTURE_2D:
+      default:
         TORCH_CHECK(false, "No kernel available!");
       case api::StorageType::BUFFER:
       case api::StorageType::UNKNOWN:
@@ -34,9 +34,8 @@ static api::ShaderSource get_nchw_to_image_shader(const vTensor& v_dst) {
       return VK_KERNEL(nchw_to_image);
     case api::StorageType::TEXTURE_2D:
       return VK_KERNEL(nchw_to_image2d);
-    case api::StorageType::BUFFER:
-    case api::StorageType::UNKNOWN:
-      TORCH_CHECK(false, "Requested storage type must be a texture type.");
+    default:
+      TORCH_CHECK(false, "No kernel available!");
   }
 }
 
@@ -45,7 +44,7 @@ static api::ShaderSource get_image_to_nchw_shader(const vTensor& v_src) {
     switch (v_src.storage_type()) {
       case api::StorageType::TEXTURE_3D:
         return VK_KERNEL(image_to_nchw_quantized);
-      case api::StorageType::TEXTURE_2D:
+      default:
         TORCH_CHECK(false, "No kernel available!");
       case api::StorageType::BUFFER:
       case api::StorageType::UNKNOWN:
@@ -58,13 +57,12 @@ static api::ShaderSource get_image_to_nchw_shader(const vTensor& v_src) {
       return VK_KERNEL(image_to_nchw);
     case api::StorageType::TEXTURE_2D:
       return VK_KERNEL(image2d_to_nchw);
-    case api::StorageType::BUFFER:
-    case api::StorageType::UNKNOWN:
-      TORCH_CHECK(false, "Requested storage type must be a texture type.");
+    default:
+      TORCH_CHECK(false, "No kernel available!");
   }
 }
 
-struct Params final {
+struct ToFromTextureParams final {
   api::utils::ivec3 extents;
   int32_t plane_size;
 };
@@ -85,7 +83,7 @@ void record_nchw_to_image_op(
       api::utils::safe_downcast<int32_t>(get_dim<Dim4D::Width>(v_dst));
   int32_t plane_size = height * width;
 
-  Params block{
+  ToFromTextureParams block{
       api::utils::make_ivec3(v_dst.extents()),
       plane_size,
   };
@@ -128,7 +126,7 @@ void record_image_to_nchw_op(
       api::utils::safe_downcast<int32_t>(get_dim<Dim4D::Width>(v_src));
   int32_t plane_size = height * width;
 
-  Params block{
+  ToFromTextureParams block{
       api::utils::make_ivec3(v_src.extents()),
       plane_size,
   };
@@ -155,6 +153,76 @@ void record_image_to_nchw_op(
       params.buffer());
 }
 
+void record_nchw_to_buffer_op(
+    api::Context* const context,
+    api::VulkanBuffer& src_buffer,
+    vTensor& v_dst,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle) {
+  uint32_t gpu_buf_len = api::utils::safe_downcast<uint32_t>(v_dst.gpu_numel());
+
+  api::utils::uvec3 global_size = {gpu_buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {32u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_dst.get_cpu_buffer_metadata());
+
+  context->submit_compute_job(
+      // shader descriptor
+      VK_KERNEL(buffer_to_buffer),
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_size,
+      // local work group size
+      local_size,
+      // fence handle
+      fence_handle,
+      // shader arguments
+      v_dst.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_dst.buffer_metadata(),
+      src_buffer,
+      cpu_buffer_metadata.buffer());
+}
+
+void record_buffer_to_nchw_op(
+    api::Context* const context,
+    vTensor& v_src,
+    api::VulkanBuffer& dst_buffer,
+    api::PipelineBarrier pipeline_barrier,
+    const VkFence fence_handle) {
+  uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_src.numel());
+
+  api::utils::uvec3 global_size = {buf_len, 1u, 1u};
+  api::utils::uvec3 local_size = {4u, 1u, 1u};
+
+  api::UniformParamsBuffer cpu_buffer_metadata(
+      context, v_src.get_cpu_buffer_metadata());
+
+  context->submit_compute_job(
+      // shader descriptor
+      VK_KERNEL(buffer_to_buffer),
+      // pipeline barrier
+      pipeline_barrier,
+      // global work group size
+      global_size,
+      // local work group size
+      local_size,
+      // fence handle
+      fence_handle,
+      // shader arguments
+      dst_buffer,
+      cpu_buffer_metadata.buffer(),
+      v_src.buffer(
+          pipeline_barrier,
+          api::PipelineStage::COMPUTE,
+          api::MemoryAccessType::WRITE),
+      v_src.buffer_metadata());
+}
+
 } // namespace packing
 
 namespace utils {
@@ -259,7 +327,7 @@ void copy_buffer_to_vtensor(
   api::Context* const context = api::context();
 
   TORCH_CHECK(
-      src_buffer.mem_size() == v_dst.buffer_bytes(),
+      src_buffer.mem_size() == v_dst.gpu_nbytes(),
       "Vulkan copy_buffer_to_vtensor: source buffer and destination texture "
       "do not have the same number of bytes");
 
@@ -309,7 +377,7 @@ void copy_vtensor_to_buffer(
   api::Context* const context = api::context();
 
   TORCH_CHECK(
-      v_src.buffer_bytes() == dst_buffer.mem_size(),
+      v_src.gpu_nbytes() == dst_buffer.mem_size(),
       "Vulkan copy_vtensor_to_buffer: source texture and destination buffer "
       "do not have the same number of bytes");
 
@@ -336,14 +404,20 @@ void pack_buffer_to_vtensor(
     api::PipelineBarrier& pipeline_barrier) {
   api::Context* const context = api::context();
 
-  api::ShaderSource compute_shader = packing::get_nchw_to_image_shader(v_self);
-  packing::record_nchw_to_image_op(
-      context,
-      compute_shader,
-      buffer,
-      v_self,
-      pipeline_barrier,
-      VK_NULL_HANDLE);
+  if (v_self.storage_type() == api::StorageType::BUFFER) {
+    packing::record_nchw_to_buffer_op(
+        context, buffer, v_self, pipeline_barrier, VK_NULL_HANDLE);
+  } else {
+    api::ShaderSource compute_shader =
+        packing::get_nchw_to_image_shader(v_self);
+    packing::record_nchw_to_image_op(
+        context,
+        compute_shader,
+        buffer,
+        v_self,
+        pipeline_barrier,
+        VK_NULL_HANDLE);
+  }
 }
 
 void pack_staging_to_vtensor(api::VulkanBuffer& staging, vTensor& v_self) {
@@ -356,11 +430,22 @@ void pack_vtensor_to_staging(
     api::VulkanBuffer& staging,
     const VkFence fence_handle) {
   api::Context* const context = api::context();
-  api::ShaderSource compute_shader = packing::get_image_to_nchw_shader(v_self);
-
   api::PipelineBarrier pipeline_barrier{};
-  packing::record_image_to_nchw_op(
-      context, compute_shader, v_self, staging, pipeline_barrier, fence_handle);
+
+  if (v_self.storage_type() == api::StorageType::BUFFER) {
+    packing::record_buffer_to_nchw_op(
+        context, v_self, staging, pipeline_barrier, fence_handle);
+  } else {
+    api::ShaderSource compute_shader =
+        packing::get_image_to_nchw_shader(v_self);
+    packing::record_image_to_nchw_op(
+        context,
+        compute_shader,
+        v_self,
+        staging,
+        pipeline_barrier,
+        fence_handle);
+  }
 }
 
 } // namespace utils
diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index a9dc1908100b0..6870ec4e049f0 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -252,6 +252,7 @@ class VulkanAPITest : public ::testing::Test {
 };
 
 TEST_F(VulkanAPITest, copy_to_texture) {
+  using namespace at::native::vulkan;
   at::Tensor test_tensors[] = {
     // 4D
     at::rand({7, 17, 134, 213}, at::TensorOptions(at::kCPU).dtype(at::kFloat)),
@@ -273,6 +274,8 @@ TEST_F(VulkanAPITest, copy_to_texture) {
       std::cout << "Copy failed on size " << in_cpu.sizes()
                 << "with dtype" << in_cpu.dtype() << std::endl;
     }
+
+    ASSERT_TRUE(check_copy);
   }
 }
 
diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
index 50cceafdb5ff2..3372417e29f40 100644
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@@ -12,6 +12,13 @@
 
 #include <c10/util/irange.h>
 
+/*
+ * TODO: rename this file to something like vulkan_experimental_test and move
+ * this under caffe2/fb/vulkan. This file should be used to test experimental
+ * features of the Vulkan backend. vulkan_api_test cannot serve this purpose
+ * because it cannot link against symbols in the ATen/native/vulkan folder.
+ */
+
 namespace {
 
 bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor>& inputs) {
@@ -164,6 +171,85 @@ at::Tensor vulkan_to_cpu(at::Tensor vulkan, at::Tensor in_cpu) {
   }
 }
 
+TEST_F(VulkanAPITest, uniform_buffer_copy) {
+  using namespace at::native::vulkan;
+
+  struct TestStruct{
+    int a;
+    int b;
+    int c;
+  };
+
+  TestStruct test_struct{4, 9, 10};
+
+  api::UniformParamsBuffer params(api::context(), test_struct);
+  api::UniformParamsBuffer params_copy = params;
+
+  api::MemoryMap copy_mapping(
+      params_copy.buffer(), api::MemoryAccessType::READ);
+
+  TestStruct* test_copy_p = copy_mapping.template data<TestStruct>();
+
+  ASSERT_TRUE(test_copy_p->a == test_struct.a);
+  ASSERT_TRUE(test_copy_p->b == test_struct.b);
+  ASSERT_TRUE(test_copy_p->c == test_struct.c);
+}
+
+TEST_F(VulkanAPITest, copy_to_buffer) {
+  using namespace at::native::vulkan;
+
+  at::Tensor test_tensors[] = {
+    // 4D
+    at::rand({7, 17, 134, 213}, at::TensorOptions(at::kCPU).dtype(at::kFloat)),
+    // 3D
+    at::rand({67, 134, 213}, at::TensorOptions(at::kCPU).dtype(at::kFloat)),
+    // 2D
+    at::rand({229, 213}, at::TensorOptions(at::kCPU).dtype(at::kFloat)),
+    // 1D
+    at::rand({1902}, at::TensorOptions(at::kCPU).dtype(at::kFloat)),
+  };
+
+  for (auto in_cpu : test_tensors) {
+    ops::vTensor in_vk_copied = ops::to_vulkan(in_cpu, api::StorageType::BUFFER);
+    at::Tensor out_copied = ops::from_vulkan(in_vk_copied);
+
+    const auto check_copy = almostEqual(out_copied, in_cpu);
+
+    if(!check_copy) {
+      std::cout << "Copy failed on size " << in_cpu.sizes()
+                << "with dtype" << in_cpu.dtype() << std::endl;
+    }
+
+    ASSERT_TRUE(check_copy);
+  }
+}
+
+TEST_F(VulkanAPITest, copy_to_buffer_channels_last) {
+  using namespace at::native::vulkan;
+
+  at::TensorOptions options(at::kCPU);
+  options = options.dtype(at::kFloat);
+
+  at::Tensor test_tensors[] = {
+    // 4D
+    at::rand({7, 17, 134, 213}, options).to(at::MemoryFormat::ChannelsLast),
+  };
+
+  for (auto in_cpu : test_tensors) {
+    ops::vTensor in_vk_copied = ops::to_vulkan(in_cpu, api::StorageType::BUFFER);
+    at::Tensor out_copied = ops::from_vulkan(in_vk_copied);
+
+    const auto check_copy = almostEqual(out_copied, in_cpu);
+
+    if(!check_copy) {
+      std::cout << "Copy failed on size " << in_cpu.sizes()
+                << "with dtype" << in_cpu.dtype() << std::endl;
+    }
+
+    ASSERT_TRUE(check_copy);
+  }
+}
+
 TEST_F(VulkanAPITest, support_vulkan) {
   const double scale = 0.1;
   const int64_t zero_point = 10;
diff --git a/tools/gen_vulkan_spv.py b/tools/gen_vulkan_spv.py
index f7522a091ec9b..cc317eba7d4a7 100644
--- a/tools/gen_vulkan_spv.py
+++ b/tools/gen_vulkan_spv.py
@@ -63,7 +63,7 @@ def getBiasStorageType(lineStr):
     r"image[123]D\b": "VK_DESCRIPTOR_TYPE_STORAGE_IMAGE",
     r"sampler[123]D\b": "VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER",
     r"\bbuffer\b": "VK_DESCRIPTOR_TYPE_STORAGE_BUFFER",
-    r"\buniform\b.*\bBlock\b": "VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER",
+    r"\buniform\b": "VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER",
 }
 
 storageTypeToEnum = {
@@ -151,6 +151,7 @@ def genCppH(hFilePath, cppFilePath, srcDirPath, glslcPath, tmpDirPath, env):
             glslcPath, "-fshader-stage=compute",
             srcPath, "-o", spvPath,
             "--target-env=vulkan1.0",
+            "-I", srcDirPath,
             "-Werror"
         ]
 

From 77973c89a6520a1bc45dba5063dcec621d982e65 Mon Sep 17 00:00:00 2001
From: Jason Ansel <jansel@meta.com>
Date: Wed, 9 Nov 2022 18:13:06 +0000
Subject: [PATCH 0711/1922] Fix TorchInductor benchmarking in fbcode (#88689)

Summary: Makes the C++ TorchInductor benchmarking work in fbcode plus some minor fixed to enable that.

Test Plan: Test added

Differential Revision: D41045910

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88689
Approved by: https://github.com/soumith
---
 benchmarks/dynamo/common.py            | 24 ++++++++++--------
 benchmarks/dynamo/test.py              | 35 ++++++++++++++++++++++++++
 torch/_inductor/codecache.py           | 15 ++++++++++-
 torch/_inductor/cuda_properties.py     | 13 +++++++---
 torch/_inductor/triton_ops/autotune.py |  3 ++-
 5 files changed, 73 insertions(+), 17 deletions(-)
 create mode 100644 benchmarks/dynamo/test.py

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 3cd17c446c225..3ebed5cee43f6 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -22,7 +22,7 @@
 import torch._dynamo
 import torch._dynamo.utils
 import torch.distributed
-from microbenchmarks.operator_inp_utils import OperatorInputsMode
+from functorch._src.aot_autograd import set_model_name
 from scipy.stats import gmean, ttest_ind
 from torch._dynamo.optimizations import backends
 from torch._dynamo.optimizations.log_args import conv_args_analysis
@@ -36,11 +36,9 @@
 from torch.utils._pytree import tree_map
 
 try:
-    from functorch._src.aot_autograd import set_model_name
+    from .microbenchmarks.operator_inp_utils import OperatorInputsMode
 except ImportError:
-
-    def set_model_name(name):
-        pass
+    from microbenchmarks.operator_inp_utils import OperatorInputsMode
 
 
 log = logging.getLogger(__name__)
@@ -1308,8 +1306,7 @@ def help(fn):
     return fn.__doc__
 
 
-def parse_args():
-
+def parse_args(args=None):
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--filter", "-k", action="append", help="filter benchmarks with regexp"
@@ -1330,7 +1327,10 @@ def parse_args():
         default=0,
         help="ID of the benchmark suite partition to be run. Used to divide CI tasks",
     )
-    parser.add_argument("--devices", "-d", action="append", help="cpu or cuda")
+    parser.add_argument(
+        "--devices", "--device", "-d", action="append", help="cpu or cuda"
+    )
+    parser.add_argument("--device-index", help="CUDA device index")
     parser.add_argument(
         "--repeat", "-n", type=int, default=30, help="number of timing runs"
     )
@@ -1567,8 +1567,7 @@ def parse_args():
     mode_group.add_argument(
         "--performance", action="store_true", help="Measures performance speedup"
     )
-    args = parser.parse_args()
-    return args
+    return parser.parse_args(args)
 
 
 def main(runner, original_dir=None):
@@ -1640,11 +1639,14 @@ def run(runner, args, original_dir=None):
 
         # Some models e.g. yolov3 assert batch size on n_gpus
         if "CUDA_VISIBLE_DEVICES" not in os.environ:
-            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+            args.device_index = "0"
 
         # Stricter check to disable fallbacks
         args.suppress_errors = False
 
+    if args.device_index is not None:
+        os.environ["CUDA_VISIBLE_DEVICES"] = args.device_index
+
     elif args.performance:
         # Ensure that we test on real scenarios
         args.use_eval_mode = False
diff --git a/benchmarks/dynamo/test.py b/benchmarks/dynamo/test.py
new file mode 100644
index 0000000000000..317e8e4ea50e7
--- /dev/null
+++ b/benchmarks/dynamo/test.py
@@ -0,0 +1,35 @@
+import os
+import unittest
+
+from .common import parse_args, run
+
+from .torchbench import setup_torchbench_cwd, TorchBenchmarkRunner
+
+
+class TestDynamoBenchmark(unittest.TestCase):
+    def test_benchmark_infra_runs(self) -> None:
+        """
+        Basic smoke test that TorchBench runs.
+
+        This test is mainly meant to check that our setup in fbcode
+        doesn't break.
+
+        If you see a failure here related to missing CPP headers, then
+        you likely need to update the resources list in:
+            //caffe2:inductor
+        """
+        original_dir = setup_torchbench_cwd()
+        try:
+            args = parse_args(
+                [
+                    "-dcpu",
+                    "--inductor",
+                    "--performance",
+                    "--only=BERT_pytorch",
+                    "-n1",
+                    "--batch_size=1",
+                ]
+            )
+            run(TorchBenchmarkRunner(), args, original_dir)
+        finally:
+            os.chdir(original_dir)
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 842ea2300e5b4..2826f35999126 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -242,6 +242,7 @@ def cpp_compile_command(input, output, include_pytorch=False):
             {cpp_compiler()} {input} -shared -fPIC -Wall -std=c++14 -Wno-unused-variable
             {ipaths} {lpaths} {libs} {macros}
             -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp
+            -D C10_USING_CUSTOM_GENERATED_MACROS
             -o{output}
         """,
     ).strip()
@@ -251,6 +252,18 @@ class CppCodeCache:
     cache = dict()
     clear = staticmethod(cache.clear)
 
+    @staticmethod
+    def _load_library(path):
+        try:
+            return cdll.LoadLibrary(path)
+        except OSError as e:
+            if "gomp" in str(e) and os.path.exists("/usr/lib64/libgomp.so.1"):
+                # hacky workaround for fbcode/buck
+                global _libgomp
+                _libgomp = cdll.LoadLibrary("/usr/lib64/libgomp.so.1")
+                return cdll.LoadLibrary(path)
+            raise
+
     @classmethod
     def load(cls, source_code):
         key, input_path = write(source_code, "cpp", extra=cpp_compile_command("i", "o"))
@@ -270,7 +283,7 @@ def load(cls, source_code):
                     except subprocess.CalledProcessError as e:
                         raise exc.CppCompileError(cmd, e.output)
 
-                cls.cache[key] = cdll.LoadLibrary(output_path)
+                cls.cache[key] = cls._load_library(output_path)
                 cls.cache[key].key = key
 
         return cls.cache[key]
diff --git a/torch/_inductor/cuda_properties.py b/torch/_inductor/cuda_properties.py
index de5349b568971..e42b2c5b5c676 100644
--- a/torch/_inductor/cuda_properties.py
+++ b/torch/_inductor/cuda_properties.py
@@ -11,10 +11,15 @@
 
 @functools.lru_cache(None)
 def _properties():
-    r = {
-        i: torch.cuda.get_device_properties(i) for i in range(torch.cuda.device_count())
-    }
-    return r
+    if not torch.cuda.is_available():
+        return {}
+    try:
+        return {
+            i: torch.cuda.get_device_properties(i)
+            for i in range(torch.cuda.device_count())
+        }
+    except RuntimeError:
+        return {}
 
 
 _compile_worker_current_device = None
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index f4d1d06b8f183..0fbdd2d4591b6 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -213,7 +213,8 @@ def load_cached_autotuning(
     if not os.path.exists(cache_filename):
         return None
 
-    best_config = json.loads(open(cache_filename).read())
+    with open(cache_filename, "r") as fd:
+        best_config = json.loads(fd.read())
     if best_config.get("configs_hash") != configs_hash:
         return None
 

From c45ad37ce9564006bcdfd32c52442bf9174395a0 Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Wed, 9 Nov 2022 18:20:04 +0000
Subject: [PATCH 0712/1922] skip environment collection test in fbcode (#88744)

Summary: This runs pip, which we don't have in the fbcode environment.

Test Plan: Rely on CI.

Differential Revision: D41156589

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88744
Approved by: https://github.com/zou3519
---
 test/test_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 3ad9bf73aaf78..b745e771abd12 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -19,7 +19,7 @@
 import torch.utils.cpp_extension
 from torch.autograd._functions.utils import check_onnx_broadcast
 from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings
-from torch.testing._internal.common_utils import load_tests, IS_SANDCASTLE, IS_WINDOWS
+from torch.testing._internal.common_utils import load_tests, IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS
 
 # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for
 # sharding on sandcastle. This line silences flake warnings
@@ -610,6 +610,7 @@ def test_bottleneck_cuda(self):
 from torch.utils.collect_env import get_pretty_env_info
 
 
+@unittest.skipIf(IS_FBCODE, "runs pip which is not available internally")
 class TestCollectEnv(TestCase):
     def test_smoke(self):
         info_output = get_pretty_env_info()

From 4055ad5de027a754b907304a063bfd18e4de6743 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Wed, 9 Nov 2022 06:47:53 -0800
Subject: [PATCH 0713/1922] [12/N] Update scatter with CPU/CUDA implementations
 (#86408)

Differential Revision: [D40181613](https://our.internmc.facebook.com/intern/diff/D40181613)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86408
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_common.py    |  5 +++-
 torch/csrc/distributed/c10d/OpsImpl.cpp | 38 +++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 02ec0247f9754..321480477f4d9 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1468,7 +1468,9 @@ def _call_collective_with_varying_tensors(self, backend, collective, *args):
             collective()
         elif collective == dist.all_gather:
             collective([tensor], tensor, *args)
-        elif collective == dist.reduce_scatter or collective == dist.all_to_all:
+        elif collective == dist.scatter:
+            collective(tensor, [tensor], *args)
+        elif collective in (dist.reduce_scatter, dist.all_to_all):
             # gloo does not support reduce_scatter or all_to_all
             if backend != "gloo":
                 if collective == dist.reduce_scatter:
@@ -1495,6 +1497,7 @@ def _test_collectives(self, backend):
             (dist.reduce_scatter,),
             (dist.barrier,),
             (dist.all_to_all,),
+            (dist.scatter,),
         ]
         for collective, *args in collectives_and_args:
             with self.subTest(collective=collective, args=args):
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index ec66042235e95..26b93da34411e 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -219,6 +219,36 @@ reduce_scatter_cuda_(
       output_tensors, work);
 }
 
+std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> scatter_cpu_(
+    const std::vector<at::Tensor>& output_tensors,
+    const std::vector<std::vector<at::Tensor>>& input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t root_rank,
+    int64_t timeout) {
+  auto work = process_group->scatter(
+      const_cast<std::vector<at::Tensor>&>(output_tensors),
+      const_cast<std::vector<std::vector<at::Tensor>>&>(input_tensors),
+      ScatterOptions{root_rank, std::chrono::milliseconds(timeout)});
+
+  return std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
+      output_tensors, work);
+}
+
+std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> scatter_cuda_(
+    const std::vector<at::Tensor>& output_tensors,
+    const std::vector<std::vector<at::Tensor>>& input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t root_rank,
+    int64_t timeout) {
+  auto work = process_group->scatter(
+      const_cast<std::vector<at::Tensor>&>(output_tensors),
+      const_cast<std::vector<std::vector<at::Tensor>>&>(input_tensors),
+      ScatterOptions{root_rank, std::chrono::milliseconds(timeout)});
+
+  return std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
+      output_tensors, work);
+}
+
 c10::intrusive_ptr<Work> alltoall_cpu_(
     at::TensorList output_tensors,
     at::TensorList input_tensors,
@@ -329,6 +359,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("reduce_scatter_", reduce_scatter_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("scatter_", scatter_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("scatter_", scatter_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("alltoall_", alltoall_cpu_);
 }

From b1b2776ad99baf77f47f5d87d47f92bbbe618cad Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Wed, 9 Nov 2022 04:51:04 +0000
Subject: [PATCH 0714/1922] OpOverload is_view (#88722)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88722
Approved by: https://github.com/ezyang
---
 torch/_meta_registrations.py |  5 +----
 torch/_ops.py                | 13 +++++++++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index bb53cd268bda2..5035eadf84a47 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1768,10 +1768,7 @@ def activate_meta():
             # Instead, we should be letting those decompositions run, and writing meta kernels
             # only for the base operators.
             pass
-        elif any(
-            a.alias_info is not None and not a.alias_info.is_write
-            for a in op_overload._schema.arguments
-        ):
+        elif op_overload.is_view:
             # Attempting to register a python meta kernel for a view operator.
             # We shouldn't do this, because the output will report as not having aliased storages.
             # All view ops have meta kernels in C++ today, so we should use those instead.
diff --git a/torch/_ops.py b/torch/_ops.py
index f7ebba590aee0..4c194e9d938bb 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -246,6 +246,19 @@ def __init__(self, overloadpacket, op, op_dk, schema, tags):
         # NB: This name is hard-coded in torch/csrc/autograd/python_variable.cpp
         self._dispatch_cache = {}
 
+        # Logic replicated from aten/src/ATen/native/MathBitsFallback.h
+        is_write = None
+        for a in self._schema.arguments:
+            if a.alias_info is None:
+                continue
+            if is_write is None:
+                is_write = a.alias_info.is_write
+            else:
+                # We will conservatively call mixed mutable/non-mutable
+                # aliased inputs as NOT a view
+                is_write = a.alias_info.is_write or is_write
+        self.is_view = is_write is not None and not is_write
+
     # it's a no-op since OpOverload object is immutable and must be unique for a given op overload.
     def __deepcopy__(self, memo=None):
         return self

From fe9bd4d113b36a940808d0b21957ae5ae714a0d5 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@meta.com>
Date: Wed, 9 Nov 2022 20:29:34 +0000
Subject: [PATCH 0715/1922] Delete stub file to enable mypy check (#4649)
 (#88701)

Summary:
X-link: https://github.com/facebookresearch/detectron2/pull/4649

Context in https://fburl.com/4irjskbe

This change deletes distributed.pyi, so that lintrunner will run mypy on distributed.py for typing check.

Test Plan: CI

Differential Revision: D41028360

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88701
Approved by: https://github.com/zhaojuanmao
---
 torch/nn/parallel/distributed.py  | 21 ++++++++++++++-------
 torch/nn/parallel/distributed.pyi | 31 -------------------------------
 2 files changed, 14 insertions(+), 38 deletions(-)
 delete mode 100644 torch/nn/parallel/distributed.pyi

diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index f3de5037f60d8..c29a0a7ef46bb 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -37,7 +37,7 @@
 
 from ..modules import Module
 from ._replicated_tensor_ddp_utils import _ddp_with_replicated_tensor_enabled
-from .scatter_gather import gather, is_namedtuple, scatter_kwargs  # noqa: F401
+from .scatter_gather import gather, scatter_kwargs  # noqa: F401
 
 __all__ = ["DistributedDataParallel"]
 
@@ -194,6 +194,7 @@ def __init__(self, ddp, divide_by_initial_world_size):
             "DDP join hook requires passing in a DistributedDataParallel "
             "instance as the state"
         )
+        assert ddp.logger is not None
         ddp.logger._set_uneven_input_join()
         self.ddp = ddp
         self.ddp._divide_by_initial_world_size = divide_by_initial_world_size
@@ -836,6 +837,7 @@ def __setstate__(self, state):
         )
         if self.static_graph:
             self.reducer._set_static_graph()
+            assert self.logger is not None
             self.logger._set_static_graph()
 
     def _build_params_for_reducer(self):
@@ -863,7 +865,7 @@ def _build_params_for_reducer(self):
             # "not memo.add(p)" is always True, and it's only there to cause "add(p)" if needed.
             (m, p)
             for m, p in modules_and_parameters
-            if p not in memo and not memo.add(p)
+            if p not in memo and not memo.add(p)  # type: ignore[func-returns-value]
         ]
 
         # Build list of parameters.
@@ -1044,7 +1046,7 @@ def _run_ddp_forward(self, *inputs, **kwargs):
                 self.use_side_stream_for_tensor_copies,
             )
             with self._inside_ddp_forward():
-                return module_to_run(*inputs[0], **kwargs[0])
+                return module_to_run(*inputs[0], **kwargs[0])  # type: ignore[index]
         else:
             with self._inside_ddp_forward():
                 return module_to_run(*inputs, **kwargs)
@@ -1054,6 +1056,7 @@ def forward(self, *inputs, **kwargs):
             "DistributedDataParallel.forward"
         ):
             if torch.is_grad_enabled() and self.require_backward_grad_sync:
+                assert self.logger is not None
                 self.logger.set_runtime_stats_and_log()
                 self.num_iterations += 1
                 self.reducer.prepare_for_forward()
@@ -1063,7 +1066,7 @@ def forward(self, *inputs, **kwargs):
             work = Join.notify_join_context(self)
             if work:
                 self.reducer._set_forward_pass_work_handle(
-                    work, self._divide_by_initial_world_size
+                    work, self._divide_by_initial_world_size  # type: ignore[arg-type]
                 )
 
             # Calling _rebuild_buckets before forward compuation,
@@ -1171,7 +1174,7 @@ def gather(self, outputs, output_device):
     def train(self, mode=True):
         super(DistributedDataParallel, self).train(mode)
         if self._use_replicated_tensor_module:
-            self._replicated_tensor_module.train(mode)
+            self._replicated_tensor_module.train(mode)  # type: ignore[union-attr]
         return self
 
     # When running in join mode, schedules an allreduce to notify joined ranks
@@ -1392,7 +1395,7 @@ def join_process_group(self):
     def _register_buffer_comm_hook(
         self,
         state,
-        hook: callable,
+        hook: Callable,
         comm_hook_location=_BufferCommHookLocation.POST_FORWARD,
     ):
         r"""
@@ -1438,7 +1441,7 @@ def _register_buffer_comm_hook(
             buffer_comm_hook_location=comm_hook_location,
         )
 
-    def register_comm_hook(self, state: object, hook: callable):
+    def register_comm_hook(self, state: object, hook: Callable):
         r"""
         Registers a communication hook which is an enhancement that provides a
         flexible hook to users where they can specify how DDP aggregates gradients
@@ -1518,6 +1521,7 @@ def register_comm_hook(self, state: object, hook: callable):
             >>> ddp.register_comm_hook(state=None, hook=encode_and_decode)
         """
         self._check_comm_hook(hook)
+        assert self.logger is not None
         self.logger._set_comm_hook_name(hook.__qualname__)
         dist._register_comm_hook(self.reducer, state, hook)
 
@@ -1544,6 +1548,7 @@ def _register_builtin_comm_hook(self, comm_hook_type):
             >>> ddp._register_builtin_comm_hook(dist.BuiltinCommHookType.FP16_COMPRESS)
 
         """
+        assert self.logger is not None
         self.logger._set_comm_hook_name(str(comm_hook_type))
         dist._register_builtin_comm_hook(self.reducer, comm_hook_type)
 
@@ -1814,6 +1819,7 @@ def _get_ddp_logging_data(self):
         these metrics are.
         This is a prototype interface and subject to change in the future.
         """
+        assert self.logger is not None
         ddp_logging_data = self.logger._get_ddp_logging_data()
         return {**ddp_logging_data.strs_map, **ddp_logging_data.ints_map}
 
@@ -1848,6 +1854,7 @@ def _set_static_graph(self):
             return
         self.static_graph = True
         self.reducer._set_static_graph()
+        assert self.logger is not None
         self.logger._set_static_graph()
         if self.find_unused_parameters:
             warnings.warn(
diff --git a/torch/nn/parallel/distributed.pyi b/torch/nn/parallel/distributed.pyi
deleted file mode 100644
index a5db301e433d5..0000000000000
--- a/torch/nn/parallel/distributed.pyi
+++ /dev/null
@@ -1,31 +0,0 @@
-from typing import Any, Optional
-
-from ..modules import Module
-from .common_types import _device_t, _devices_t
-
-class DistributedDataParallel(Module):
-    process_group: Any = ...
-    dim: int = ...
-    module: Module = ...
-    device_ids: _devices_t = ...
-    output_device: _device_t = ...
-    broadcast_buffers: bool = ...
-    check_reduction: bool = ...
-    broadcast_bucket_size: float = ...
-    bucket_bytes_cap: float = ...
-
-    # TODO type process_group once `distributed` module is stubbed
-    def __init__(
-        self,
-        module: Module,
-        device_ids: Optional[_devices_t] = ...,
-        output_device: Optional[_device_t] = ...,
-        dim: int = ...,
-        broadcast_buffers: bool = ...,
-        process_group: Optional[Any] = ...,
-        bucket_cap_mb: float = ...,
-        find_unused_parameters: bool = ...,
-        check_reduction: bool = ...,
-        gradient_as_bucket_view: bool = ...,
-        static_graph: bool = ...,
-    ) -> None: ...

From 942fde2e20d6fb18c4598081d1f707db6a27ec91 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 9 Nov 2022 20:48:32 +0000
Subject: [PATCH 0716/1922] Revert "[primTorch] Improve `narrow` and
 `narrow_copy`: refs, tests, docs (#87045)"

This reverts commit aa8279bcb8687e025a666e18828a436eb7ef7b45.

Reverted https://github.com/pytorch/pytorch/pull/87045 on behalf of https://github.com/izaitsevfb due to BC-breaking change, D41161182
---
 aten/src/ATen/native/TensorShape.cpp          |  95 ++++++++++++-
 aten/src/ATen/native/native_functions.yaml    |   4 +-
 test/test_meta.py                             |   1 +
 torch/_refs/__init__.py                       |  38 ++---
 torch/_tensor_docs.py                         |  13 +-
 torch/_torch_docs.py                          |  27 ++--
 torch/csrc/jit/runtime/static/ops.cpp         |   8 +-
 .../_internal/common_methods_invocations.py   | 133 ++++--------------
 8 files changed, 153 insertions(+), 166 deletions(-)

diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 15c22dc7d4111..31b4011c12813 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -1196,14 +1196,18 @@ Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t
   return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous);
 }
 
+Tensor narrow_copy_dense_cpu(const Tensor& self, int64_t dim, int64_t start, int64_t length){
+  auto output = at::empty_like(self);
+  return narrow_copy_dense_cpu_out(self, dim, start, length, output);
+}
+
 Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_t length) {
   int64_t allDim = self.dim();
   int64_t end = start+length;
   TORCH_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor.");
-  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
   TORCH_CHECK(dim >= 0 && dim < allDim,
     "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, ".");
-  TORCH_CHECK(start >= 0 && end <= self.size(dim),
+  TORCH_CHECK(start >= 0 && length >= 0 && end <= self.size(dim),
     "Invalid range to narrow. range(start, start+length) must be a subset of range(0, ", self.size(dim), ").")
   Tensor indices = self._indices();
   int64_t sparse_dim = self.sparse_dim();
@@ -1231,26 +1235,105 @@ Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_
   return newTensor._coalesced_(self.is_coalesced());
 }
 
+Tensor& narrow_copy_dense_cpu_out(
+  const Tensor& self, int64_t dim, int64_t start, int64_t length, Tensor& output
+) {
+
+  TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
+  TORCH_CHECK(self.dtype() == output.dtype());
+
+  auto self_contig = self.expect_contiguous();
+  const auto self_sizes = self_contig->sizes();
+
+  // wrap dim if negative and do bound check
+  if (dim < 0) {
+    dim = at::maybe_wrap_dim(dim, self_sizes.size());
+  } else {
+    TORCH_CHECK(dim < static_cast<int64_t>(self_sizes.size()));
+  }
+
+  // wrap start and do bound check
+  const auto cur_size = self_sizes[dim];
+  if (start != cur_size && start < 0) { // start being the end is valid, but
+                                        // not a valid dim specification.
+    start = at::maybe_wrap_dim(start, cur_size);
+  }
+  TORCH_CHECK(
+      length >= 0 && start <= cur_size - length,
+      "start (",
+      start,
+      ") + length (",
+      length,
+      ") exceeds dimension size (",
+      cur_size,
+      ").");
+
+  // resize output
+  auto output_sizes = self_sizes.vec();
+  output_sizes[dim] = length;
+  at::native::resize_(output, output_sizes);
+
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  const int64_t unit = c10::size_from_dim_(dim + 1, self_sizes);
+  const int64_t num_blocks = c10::size_to_dim_(dim, self_sizes);
+
+  const auto itemsize = self_contig->dtype().itemsize();
+  // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
+  size_t src_nbytes = itemsize * self_contig->numel();
+  // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
+  size_t dst_nbytes = itemsize * output.numel();
+
+  size_t src_block_size = unit * self_sizes[dim];
+  size_t dst_block_size = unit * length;
+
+  if (num_blocks == 0 || dst_block_size == 0) {
+    return output;
+  }
+
+  char* src_bytes = static_cast<char*>(self_contig->data_ptr());
+  char* dst_bytes = static_cast<char*>(output.data_ptr());
+
+  size_t src_block_size_bytes = itemsize * src_block_size;
+  size_t dst_block_size_bytes = itemsize * dst_block_size;
+  size_t src_offset = unit * start;
+
+  char* src_offset_bytes = src_bytes + itemsize * src_offset;
+  char* dst_offset_bytes = dst_bytes;
+
+  for (const auto i : c10::irange(num_blocks)) {
+    char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes;
+    char* local_dst_offset_bytes = dst_offset_bytes + i * dst_block_size_bytes;
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        static_cast<void*>(local_src_offset_bytes + dst_block_size_bytes) <=
+        static_cast<void*>(src_bytes + src_nbytes));
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        static_cast<void*>(local_dst_offset_bytes + dst_block_size_bytes) <=
+        static_cast<void*>(dst_bytes + dst_nbytes));
+
+    memcpy(
+        local_dst_offset_bytes, local_src_offset_bytes, dst_block_size_bytes);
+  }
+  return output;
+}
+
 Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) {
   TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
-  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
   auto cur_size = self.size(dim);
   if (start != cur_size) {  // start being the end is valid, but not a valid dim specification.
     start = maybe_wrap_dim(start, cur_size);
   }
-  TORCH_CHECK(start <= cur_size - length,
+  TORCH_CHECK(length >= 0 && start <= cur_size - length,
            "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ").");
   return at::slice(self, dim, start, start + length, 1);
 }
 
 Tensor narrow_symint(const Tensor& self, int64_t dim, SymInt start, SymInt length) {
   TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
-  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
   auto cur_size = self.sym_size(dim);
   if (start != cur_size) {  // start being the end is valid, but not a valid dim specification.
     start = maybe_wrap_dim(start, cur_size);
   }
-  TORCH_CHECK(start <= cur_size - length,
+  TORCH_CHECK(length >= 0 && start <= cur_size - length,
            "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ").");
   return at::slice_symint(self, dim, start, start + length, 1);
 }
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index b45e98425ce31..02b073a1ce785 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -3768,14 +3768,14 @@
 - func: narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor
   variants: function, method
   dispatch:
+    CPU: narrow_copy_dense_cpu
     SparseCPU, SparseCUDA: narrow_copy_sparse
     CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
   tags: view_copy
 
 - func: narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)
-  variants: function
   dispatch:
-    CompositeExplicitAutogradNonFunctional: narrow_copy_out
+    CPU: narrow_copy_dense_cpu_out
 
 - func: narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)
   variants: function, method
diff --git a/test/test_meta.py b/test/test_meta.py
index ae248a90cffb7..ef25d184c8428 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -745,6 +745,7 @@ def run_meta_crossref(
 }
 
 meta_function_device_skips['cpu'] = {
+    torch.narrow_copy: {b8, bf16, c128, c32, c64, f16, f32, f64, i16, i32, i64, i8, u8},
     torch.native_batch_norm: {f32, f64},
 }
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index ecb6a86d5f911..cd0344eba7a91 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2749,39 +2749,19 @@ def flipud(a: TensorLikeType) -> TensorLikeType:
 
 
 # CompositeImplicitAutograd - don't register decomp
-def narrow(
-    a: TensorLikeType, dim: int, start: Union[int, TensorLikeType], length: int
-) -> TensorLikeType:
-    # Supports Tensor overload that was added for XLA:
-    # https://github.com/pytorch/pytorch/issues/31558
-    if isinstance(start, TensorLike):
-        check(
-            start.dim() == 0 and utils.is_integer_dtype(start.dtype),
-            lambda: "start must be an 0-dim integral Tensor.",
-        )
-        start = start.item()  # type: ignore[assignment]
-    check(a.dim() > 0, lambda: "narrow() cannot be applied to a 0-dim tensor.")
-    check(length >= 0, lambda: "narrow(): length must be non-negative.")
+def narrow(a: TensorLikeType, dim: int, start: int, length: int) -> TensorLikeType:
     dim = utils.canonicalize_dim(a.ndim, dim)
-    dim_length = a.size(dim)
-    # Start being the end is usually invalid since it's out of bounds. So it's
-    # not allowed by canonicalize_dim. But for narrow it's valid as long as
-    # the length is 0, which is handled by the check below.
-    if start != dim_length:
-        # Negative start means indexing from the end of dim.
-        # Note: a dimension isn't being canonicalized here, this reuses
-        # canonicalize_dim because the semantics are similar.
-        start = utils.canonicalize_dim(dim_length, start)  # type: ignore[arg-type]
-    check(
-        start <= dim_length - length,  # type: ignore[arg-type]
-        lambda: f"start ({start}) + length ({length}) exceeds dimension size ({dim_length}).",
-    )
     return prims.slice_in_dim(a, start, start + length, axis=dim)
 
 
-# TODO: This must return a sparse tensor if the input is sparse, but refs have
-# no sparse support. See narrow_copy_sparse in core.
-narrow_copy = _make_copy_from_view(narrow)
+@register_decomposition(torch.ops.aten.narrow_copy)
+@out_wrapper()
+def narrow_copy(a: TensorLikeType, dim: int, start: int, length: int) -> TensorLikeType:
+    # TODO: This must return a sparse tensor if the input is sparse, but refs
+    # have no sparse support.  See narrow_copy_sparse in core.
+    if a.is_sparse:
+        raise NotImplementedError("narrow_copy ref doesn't support sparse tensors")
+    return torch.clone(torch.narrow(a=a, dim=dim, start=start, length=length))  # type: ignore[call-overload]
 
 
 def _normalize(
diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index 726ae5137e6a4..8c734a1f3774b 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -3436,7 +3436,18 @@ def callable(a, b) -> number
     r"""
 narrow(dimension, start, length) -> Tensor
 
-See :func:`torch.narrow`.
+See :func:`torch.narrow`
+
+Example::
+
+    >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    >>> x.narrow(0, 0, 2)
+    tensor([[ 1,  2,  3],
+            [ 4,  5,  6]])
+    >>> x.narrow(1, 1, 2)
+    tensor([[ 2,  3],
+            [ 5,  6],
+            [ 8,  9]])
 """,
 )
 
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 2ff2e9be315de..40375bae3e274 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -7980,10 +7980,8 @@ def merge_dicts(*dicts):
 Args:
     input (Tensor): the tensor to narrow
     dim (int): the dimension along which to narrow
-    start (int or Tensor): index of the element to start the narrowed dimension
-        from. Can be negative, which means indexing from the end of `dim`. If
-        `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed)
-    length (int): length of the narrowed dimension, must be weakly positive
+    start (Tensor or int): the starting dimension
+    length (int): the distance to the ending dimension
 
 Example::
 
@@ -7995,10 +7993,6 @@ def merge_dicts(*dicts):
     tensor([[ 2,  3],
             [ 5,  6],
             [ 8,  9]])
-    >>> torch.narrow(x, -1, torch.tensor(-1), 1)
-    tensor([[3],
-            [6],
-            [9]])
 """,
 )
 
@@ -8014,9 +8008,8 @@ def merge_dicts(*dicts):
 Args:
     input (Tensor): the tensor to narrow
     dim (int): the dimension along which to narrow
-    start (int): index of the element to start the narrowed dimension from. Can
-        be negative, which means indexing from the end of `dim`
-    length (int): length of the narrowed dimension, must be weakly positive
+    start (int): the starting offset
+    length (int): the distance to the ending dimension
 
 Keyword args:
     {out}
@@ -8034,13 +8027,13 @@ def merge_dicts(*dicts):
     >>> s = torch.arange(16).reshape(2, 2, 2, 2).to_sparse(2)
     >>> torch.narrow_copy(s, 0, 0, 1)
     tensor(indices=tensor([[0, 0],
-                           [0, 1]]),
-           values=tensor([[[0, 1],
-                           [2, 3]],
+                        [0, 1]]),
+        values=tensor([[[0, 1],
+                        [2, 3]],
 
-                          [[4, 5],
-                           [6, 7]]]),
-           size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo)
+                        [[4, 5],
+                        [6, 7]]]),
+        size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo)
 
 .. seealso::
 
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index e80c7c1460b80..92044ca565a9c 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1223,8 +1223,8 @@ REGISTER_OPERATOR_FUNCTOR(aten::narrow_copy, aten_narrow_copy, [](Node* n) -> SR
     return nullptr;
   }
   return [](ProcessedNode* p_node) {
-    auto& self = p_node->Input(0).toTensor(); // self
-    auto dim = p_node->Input(1).toInt(); // dim
+    const auto& self = p_node->Input(0).toTensor(); // self
+    const auto dim = p_node->Input(1).toInt(); // dim
     int64_t start = 0;
     if (p_node->Input(2).isScalar()) {
       start = p_node->Input(2).toInt();
@@ -1236,12 +1236,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::narrow_copy, aten_narrow_copy, [](Node* n) -> SR
 
     if (p_node->Output(0).isNone()) {
       p_node->Output(0) =
-          at::native::narrow_copy_dense_symint(self, dim, start, length);
+          at::native::narrow_copy_dense_cpu(self, dim, start, length);
       return;
     }
     auto& output = p_node->Output(0).toTensor();
     fastResizeToZero(output);
-    at::narrow_copy_out(output, self, dim, start, length);
+    at::native::narrow_copy_dense_cpu_out(self, dim, start, length, output);
   };
 });
 REGISTER_OPERATOR_FUNCTOR(aten::index, aten_index, [](Node* n) -> SROperator {
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f69bbe20ed36a..731dc008ccce7 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -4311,109 +4311,29 @@ def sample_repeat_tile(op_info, device, dtype, requires_grad, **kwargs):
         yield SampleInput(make_arg(shape), rep_dim)
 
 
-def sample_inputs_narrow_narrow_copy(op_info, device, dtype, requires_grad, *, is_narrow, **kwargs):
+def sample_inputs_narrow_copy(op_info, device, dtype, requires_grad, **kwargs):
     shapes_and_args = (
-        ((S, S, S), 1, 2, 2),
-        ((S, S, S), -1, 2, 2),
-        ((S, S, S), 1, 0, 0),
-        ((S, S, S), -1, 0, 0),
-        ((S, S, S), 2, 1, 2),
+        ((S, S, S), (1, 2, 2)),
+        ((S, S, S), (-1, 2, 2)),
+        ((S, S, S), (1, 0, 0)),
+        ((S, S, S), (-1, 0, 0)),
+        ((S, S, S), (2, 1, 2)),
     )
 
-    for shape, dim, start, length in shapes_and_args:
-        tensor = make_tensor(shape, dtype=dtype, device=device, low=None, high=None,
-                             requires_grad=requires_grad)
-        yield SampleInput(tensor, dim, start, length)
-        # narrow also accepts the start argument being a Tensor
-        if is_narrow:
-            yield SampleInput(tensor, dim, torch.tensor(start), length)
-
-def reference_inputs_narrow_narrow_copy(op_info, device, dtype, requires_grad, *, is_narrow, **kwargs):
-    yield from sample_inputs_narrow_narrow_copy(op_info, device, dtype, requires_grad, is_narrow=is_narrow, **kwargs)
-
-    shapes_and_args = (
-        # 1-dim
-        ((M,), 0, 0, 0),    # 0 elems from the left
-        ((M,), -1, -1, 0),  # 0 elems from the right
-        ((M,), 0, 5, 3),    # 3 elems from the left
-        ((M,), 0, -5, 2),   # 2 elems from the right
-        ((M,), -1, 0, M),   # M elems from the left
-        ((M,), 0, -M, M),   # M elems from the right
-
-        # 2-dim
-        ((M, S), 1, 0, 0),    # dim 1, 0 elems from the left
-        ((S, M), -2, -1, 0),  # dim 0, 0 elems from the right
-        ((L, S), 1, 2, 3),    # dim 1, 3 elems from the left
-        ((L, S), -1, 3, 2),   # dim 1, 2 elems from the left
-        ((M, L), 0, 0, M),    # dim 0, M elems from the left
-        ((M, L), -1, -L, L),  # dim 1, L elems from the right
-
-        # 3-dim
-        ((L, M, S), 2, 0, 0),    # dim 2, 0 elems from the left
-        ((M, S, L), -1, -1, 0),  # dim 2, 0 elems from the right
-        ((S, L, M), 2, 0, M),    # dim 2, M elems from the left
-        ((L, S, M), -1, -M, M),  # dim 2, M elems from the right
-        ((S, L, M), 1, 0, 0),    # dim 1, 0 elems from the left
-        ((S, L, M), 0, 2, 1),    # dim 0, 1 elem from the left
-        ((M, S, M), -1, -5, 4),  # dim 2, 4 elems from the right
-    )
-
-    for shape, dim, start, length in shapes_and_args:
+    for shape, args in shapes_and_args:
         tensor = make_tensor(shape, dtype=dtype, device=device, low=None, high=None,
                              requires_grad=requires_grad)
-        yield SampleInput(tensor, dim, start, length)
-        # narrow also accepts the start argument being a Tensor
-        if is_narrow:
-            yield SampleInput(tensor, dim, torch.tensor(start), length)
-
-def error_inputs_narrow_narrow_copy(op_info, device, *, is_narrow):
-    make_arg = partial(make_tensor, device=device, dtype=torch.float32)
+        yield SampleInput(tensor, args=args)
 
-    # 0-dim
-    yield ErrorInput(SampleInput(make_arg(()), 0, 0, 1),
-                     error_type=RuntimeError,
-                     error_regex=r"narrow\(\) cannot be applied to a 0-dim tensor\.")
-
-    # out of bounds dim
-    yield ErrorInput(SampleInput(make_arg((M, S, L)), 3, 0, 0),
-                     error_type=IndexError,
-                     error_regex=r"Dimension out of range \(expected to be in range of \[-3, 2\], but got 3\)")
-    # out of bounds dim (negative)
-    yield ErrorInput(SampleInput(make_arg((L, S, M)), -4, 0, 0),
-                     error_type=IndexError,
-                     error_regex=r"Dimension out of range \(expected to be in range of \[-3, 2\], but got -4\)")
-
-    # out of bounds start
-    yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, M + 1, 0),
-                     error_type=IndexError,
-                     error_regex=r"Dimension out of range \(expected to be in range of \[-10, 9\], but got 11\)")
-    # out of bounds start (negative)
-    yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, -M - 1, 0),
-                     error_type=IndexError,
-                     error_regex=r"Dimension out of range \(expected to be in range of \[-10, 9\], but got -11\)")
-
-    # out of bounds length
-    yield ErrorInput(SampleInput(make_arg((S, L, M)), 2, 0, M + 1),
-                     error_type=RuntimeError,
-                     error_regex=r"start \(0\) \+ length \(11\) exceeds dimension size \(10\)\.")
-    # out of bounds length (negative)
-    yield ErrorInput(SampleInput(make_arg((M,)), 0, 0, -1),
-                     error_type=RuntimeError,
-                     error_regex=r"narrow\(\): length must be non-negative\.")
-
-    # Test Tensor overload that was added for XLA. Start must be an 0-dim
-    # integral Tensor. narrow_copy doesn't have this overload.
-    # https://github.com/pytorch/pytorch/issues/31558
-    if is_narrow:
-        # *1-dim* integral Tensor
-        yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, make_arg(S, dtype=torch.int), 2),
-                         error_type=RuntimeError,
-                         error_regex=r"start must be an 0-dim integral Tensor\.")
 
-        # 0-dim *bool* Tensor (bools are not allowed)
-        yield ErrorInput(SampleInput(make_arg((L, M, S)), -3, make_arg((), dtype=torch.bool), 3),
-                         error_type=RuntimeError,
-                         error_regex=r"start must be an 0-dim integral Tensor\.")
+def sample_inputs_narrow(op_info, device, dtype, requires_grad, **kwargs):
+    '''
+    sample_inputs_narrow accepts the same inputs as narrow_copy, in addition
+    narrow also accepts `start` argument to be a Tensor.
+    '''
+    for sample in sample_inputs_narrow_copy(op_info, device, dtype, requires_grad, **kwargs):
+        yield sample
+        yield SampleInput(sample.input, args=(sample.args[0], torch.tensor(sample.args[1]), sample.args[2]))
 
 
 def sample_trapezoid(op_info, device, dtype, requires_grad, **kwargs):
@@ -12406,9 +12326,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_out=False,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
-           sample_inputs_func=partial(sample_inputs_narrow_narrow_copy, is_narrow=True),
-           reference_inputs_func=partial(reference_inputs_narrow_narrow_copy, is_narrow=True),
-           error_inputs_func=partial(error_inputs_narrow_narrow_copy, is_narrow=True),
+           sample_inputs_func=sample_inputs_narrow,
            skips=(
                # Use of .item()
                DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_operator'),
@@ -12424,18 +12342,15 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_fwgrad_bwgrad=False,
            supports_autograd=False,
            # https://github.com/pytorch/pytorch/issues/86931
-           sample_inputs_func=partial(sample_inputs_narrow_narrow_copy, is_narrow=False),
-           reference_inputs_func=partial(reference_inputs_narrow_narrow_copy, is_narrow=False),
-           error_inputs_func=partial(error_inputs_narrow_narrow_copy, is_narrow=False),
+           sample_inputs_func=sample_inputs_narrow_copy,
            skips=(
                # https://github.com/pytorch/pytorch/issues/84577
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
-               # Lazy tensor failures: mutating and aliasing ops should all have codegen'd kernels
-               DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness'),
-               DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness_with_reusing_ir'),
-               # TypeError: must be real number, not SymFloat
-               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive'),
+               # Not implemented
+               DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_meta_outplace', device_type='cuda'),
+               DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_outplace', device_type='cuda'),
+               DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta', device_type='cuda'),
            )),
     UnaryUfuncInfo('neg',
                    aliases=('negative', ),
@@ -18021,6 +17936,10 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.narrow",
         torch_opinfo_name="narrow",
         supports_nvfuser=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_meta'),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_executor'),
+        )
     ),
     PythonRefInfo(
         "_refs.narrow_copy",

From 82038305197bff80780f1499bcf5ae8ec84d3304 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@meta.com>
Date: Wed, 9 Nov 2022 20:51:11 +0000
Subject: [PATCH 0717/1922] [3/n] Thread PG: add threaded PG implementation
 (#88627)

Summary: After the previous 2 diffs, finally we can add the threaded ProcessGroup implementation.

Test Plan: TBD

Reviewed By: XilunWu

Differential Revision: D40992593

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88627
Approved by: https://github.com/XilunWu, https://github.com/H-Huang
---
 test/distributed/test_multi_threaded_pg.py    |  45 +++
 torch/testing/_internal/common_distributed.py | 149 +++++++--
 .../distributed/multi_threaded_pg.py          | 288 ++++++++++++++++++
 3 files changed, 456 insertions(+), 26 deletions(-)
 create mode 100644 test/distributed/test_multi_threaded_pg.py
 create mode 100644 torch/testing/_internal/distributed/multi_threaded_pg.py

diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
new file mode 100644
index 0000000000000..6a0fe33cd8ad6
--- /dev/null
+++ b/test/distributed/test_multi_threaded_pg.py
@@ -0,0 +1,45 @@
+# Owner(s): ["oncall: distributed"]
+
+import sys
+import torch.distributed as dist
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+from torch.testing._internal.common_distributed import (
+    spawn_threads_and_init_comms,
+    MultiThreadedTestCase
+
+)
+from torch.testing._internal.common_utils import TestCase, run_tests
+
+DEFAULT_WORLD_SIZE = 4
+
+class TestObjectCollectivesWithWrapper(TestCase):
+    @spawn_threads_and_init_comms(world_size=4)
+    def test_broadcast_object_list(self):
+        val = 99 if dist.get_rank() == 0 else None
+        object_list = [val] * dist.get_world_size()
+
+        dist.broadcast_object_list(object_list=object_list)
+        self.assertEqual(99, object_list[0])
+
+class TestObjectCollectivesWithBaseClass(MultiThreadedTestCase):
+    @property
+    def world_size(self):
+        return 4
+
+    def test_broadcast_object_list(self):
+        val = 99 if dist.get_rank() == 0 else None
+        object_list = [val] * dist.get_world_size()
+        print(f"{dist.get_rank()} -> {dist.get_world_size()}")
+
+        dist.broadcast_object_list(object_list=object_list)
+        self.assertEqual(99, object_list[0])
+
+    def test_something_else(self):
+        pass
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 607211087ddc7..883a48a5a5fef 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -2,10 +2,10 @@
 import logging
 import multiprocessing
 import os
+import subprocess
 import sys
 import tempfile
 import threading
-import subprocess
 import time
 import traceback
 import types
@@ -14,11 +14,7 @@
 from dataclasses import dataclass
 from datetime import timedelta
 from enum import Enum
-from functools import (
-    partial,
-    reduce,
-    wraps
-)
+from functools import partial, reduce, wraps
 from io import StringIO
 from typing import NamedTuple, Optional, Union
 
@@ -26,16 +22,17 @@
 import torch.cuda.nccl
 import torch.distributed as c10d
 from torch.testing._internal.common_utils import (
-    TestCase,
-    TEST_WITH_ROCM,
-    TEST_WITH_TSAN,
     FILE_SCHEMA,
     find_free_port,
-    retry_on_connect_failures,
     IS_SANDCASTLE,
-    sandcastle_skip_if,
+    retry_on_connect_failures,
     sandcastle_skip,
+    sandcastle_skip_if,
+    TEST_WITH_ROCM,
+    TEST_WITH_TSAN,
+    TestCase,
 )
+from torch.testing._internal.distributed.multi_threaded_pg import run_with_threaded_pg
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -67,11 +64,10 @@ class TestSkip(NamedTuple):
     "generic": TestSkip(
         86, "Test skipped at subprocess level, look at subprocess log for skip reason"
     ),
-    "importerror": TestSkip(
-        88, "Test skipped due to missing import"
-    ),
+    "importerror": TestSkip(88, "Test skipped due to missing import"),
 }
 
+
 @dataclass
 class DistTestCases:
     # Backends that do not support a specific collective
@@ -93,6 +89,7 @@ class DistTestCases:
 def skip_if_no_gpu(func):
     """Skips if the world size exceeds the number of GPUs, ensuring that if the
     test is run, each rank has its own GPU via ``torch.cuda.device(rank)``."""
+
     @wraps(func)
     def wrapper(*args, **kwargs):
         if not torch.cuda.is_available():
@@ -116,6 +113,7 @@ def wrapper(*args, **kwargs):
 
     return wrapper
 
+
 def skip_if_odd_worldsize(func):
     @wraps(func)
     def wrapper(*args, **kwargs):
@@ -126,6 +124,7 @@ def wrapper(*args, **kwargs):
 
     return wrapper
 
+
 def require_n_gpus_for_nccl_backend(n, backend):
     def decorator(func):
         @wraps(func)
@@ -139,12 +138,17 @@ def wrapper(*args, **kwargs):
 
     return decorator
 
+
 def import_transformers_or_skip():
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
             try:
-                from transformers import BertConfig, AutoModelForMaskedLM  # noqa: Unused
+                from transformers import (  # noqa: Unused
+                    AutoModelForMaskedLM,
+                    BertConfig,
+                )
+
                 return func(*args, **kwargs)
             except ImportError:
                 sys.exit(TEST_SKIPS["importerror"].exit_code)
@@ -153,6 +157,7 @@ def wrapper(*args, **kwargs):
 
     return decorator
 
+
 def skip_if_lt_x_gpu(x):
     def decorator(func):
         @wraps(func)
@@ -191,10 +196,13 @@ def verify_ddp_error_logged(model_DDP, err_substr):
     logging_err = ddp_logging_data["error"]
     # Remove C++ stacktrace if needed.
     actual = (
-        err_substr if err_substr.find("\nException raised from ") == -1
+        err_substr
+        if err_substr.find("\nException raised from ") == -1
         else err_substr.split("\nException raised from ")[0]
     )
-    assert actual in logging_err, f"Did not find expected {actual} in ddp logging data error: {logging_err}"
+    assert (
+        actual in logging_err
+    ), f"Did not find expected {actual} in ddp logging data error: {logging_err}"
 
 
 def with_nccl_blocking_wait(func):
@@ -319,7 +327,7 @@ def wrapper(*args, **kwargs):
 
 def skip_if_win32():
     return sandcastle_skip_if(
-        sys.platform == 'win32',
+        sys.platform == "win32",
         "This unit test case is not supportted on Windows platform",
     )
 
@@ -352,13 +360,14 @@ def create_tcp_store(
     # TSAN runs much slower.
     TIMEOUT_DEFAULT = 500
 else:
-    TIMEOUT_DEFAULT = int(os.getenv('DISTRIBUTED_TESTS_DEFAULT_TIMEOUT', '300'))
+    TIMEOUT_DEFAULT = int(os.getenv("DISTRIBUTED_TESTS_DEFAULT_TIMEOUT", "300"))
 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400}
 
 # https://github.com/pytorch/pytorch/issues/75665
 if TEST_WITH_ROCM:
     TIMEOUT_OVERRIDE["test_join_kwargs"] = 200
 
+
 def create_device(interface=None):
     if sys.platform == "win32" or interface is None:
         return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1")
@@ -449,9 +458,7 @@ def init_multigpu_helper(world_size: int, backend: str):
     if world_size > nGPUs:
         nGPUs_per_process = nGPUs // world_size
     rank_to_GPU = {
-        i: list(
-            visible_devices[i * nGPUs_per_process : (i + 1) * nGPUs_per_process]
-        )
+        i: list(visible_devices[i * nGPUs_per_process : (i + 1) * nGPUs_per_process])
         for i in range(world_size)
     }
     return rank_to_GPU
@@ -482,6 +489,9 @@ def cleanup_temp_dir() -> None:
         tmp_dir.cleanup()
 
 
+# Most tests operate with this worldsize
+DEFAULT_WORLD_SIZE = 4
+
 # [How does MultiProcessTestCase work?]
 # Each MultiProcessTestCase instance uses 1 + `world_size()` processes, by
 # default `world_size()` returns 4. Let's take `test_rpc_spawn.py` as an
@@ -508,7 +518,7 @@ def _should_stop_test_suite(self) -> bool:
 
     @property
     def world_size(self) -> int:
-        return 4
+        return DEFAULT_WORLD_SIZE
 
     def join_or_run(self, fn):
         @wraps(fn)
@@ -607,7 +617,10 @@ def _event_listener(parent_pipe, signal_pipe, rank: int):
     @classmethod
     def _run(cls, rank: int, test_name: str, file_name: str, parent_pipe) -> None:
         # Enable DDP + ReplicatedTensor
-        from torch.nn.parallel._replicated_tensor_ddp_utils import _set_ddp_with_replicated_tensor
+        from torch.nn.parallel._replicated_tensor_ddp_utils import (
+            _set_ddp_with_replicated_tensor,
+        )
+
         _set_ddp_with_replicated_tensor(True)
 
         self = cls(test_name)
@@ -815,16 +828,20 @@ def _check_return_codes(self, elapsed_time) -> None:
         self.assertEqual(
             first_process.exitcode,
             0,
-            msg="Expected zero exit code but got {} for pid: {}".format(first_process.exitcode, first_process.pid)
+            msg="Expected zero exit code but got {} for pid: {}".format(
+                first_process.exitcode, first_process.pid
+            ),
         )
 
     @property
     def is_master(self) -> bool:
         return self.rank == 0
 
+
 # Cannot use functools.cache as it requires python 3.9
 EFA_PROBE_RESULT = None
 
+
 def has_efa() -> bool:
     """
     If shell command `fi_info -p efa -t FI_EP_RDM` returns exit code 0 then we assume that the machine has
@@ -836,7 +853,9 @@ def has_efa() -> bool:
         return EFA_PROBE_RESULT
 
     try:
-        EFA_PROBE_RESULT = subprocess.run(["fi_info", "-p", "efa", "-t", "FI_EP_RDM"]).returncode == 0
+        EFA_PROBE_RESULT = (
+            subprocess.run(["fi_info", "-p", "efa", "-t", "FI_EP_RDM"]).returncode == 0
+        )
     except FileNotFoundError:
         EFA_PROBE_RESULT = False
     return EFA_PROBE_RESULT
@@ -850,3 +869,81 @@ def tp_transports():
     see https://github.com/pytorch/pytorch/issues/73885 and https://github.com/pytorch/pytorch/issues/65022
     """
     return ["shm", "uv"] if has_efa() else None
+
+
+def _run_test_with_mt_pg(self, timeout, world_size, callback):
+    failed_ranks = run_with_threaded_pg(world_size, timeout, callback)
+    for rank, exc_info in failed_ranks:
+        print(f"Rank {rank} raised:")
+        for line in traceback.format_exception(*exc_info):
+            sys.stdout.write(line)
+    self.assertEqual([], failed_ranks, "Some ranks failed")
+
+
+def spawn_threads_and_init_comms(
+    func=None, timeout=TIMEOUT_DEFAULT, world_size=DEFAULT_WORLD_SIZE
+):
+    """
+    Wrapper to use with a test method
+    """
+    if func is None:
+        return partial(
+            spawn_threads_and_init_comms, timeout=timeout, world_size=world_size
+        )
+
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        _run_test_with_mt_pg(
+            self, timeout, world_size, lambda: func(self, *args, **kwargs)
+        )
+
+    return wrapper
+
+
+class MultiThreadedTestCase(TestCase):
+    """
+    Simple test runner that executes all tests with the in-proc process group.
+
+    A single instance of the TestCase object for all threads.
+
+    Difference from regular test runner:
+    Cannot use setUp / tearDown (must use perThreadSetup / perThreadShutdown)
+        Not sure what these two would be good for though.
+    No global state possible
+        How bad of a limitation is this?
+    """
+
+    def __init__(self, method_name: str = "runTest") -> None:
+        super().__init__(method_name)
+        self._test_method = getattr(self, method_name, None)
+        setattr(self, method_name, self.threaded_run_test)
+        if TestCase.setUp != type(self).setUp:
+            raise RuntimeError(
+                f"Test class {type(self)} overrides disabled method setUp. Use perThreadSetUp instead"
+            )
+        if TestCase.tearDown != type(self).tearDown:
+            raise RuntimeError(
+                f"Test class {type(self)} overrides disabled method tearDown. Use perThreadTearDown instead"
+            )
+
+    def threaded_run_test(self):
+        self.perThreadSetUp()
+        try:
+            _run_test_with_mt_pg(
+                self=self,
+                timeout=TIMEOUT_DEFAULT,
+                world_size=self.world_size,
+                callback=self._test_method,
+            )
+        finally:
+            self.perThreadTearDown()
+
+    def perThreadSetUp(self):
+        pass
+
+    def perThreadTearDown(self):
+        pass
+
+    @property
+    def world_size(self) -> int:
+        raise RuntimeError("world size not implemented")
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
new file mode 100644
index 0000000000000..7e18f870f2e76
--- /dev/null
+++ b/torch/testing/_internal/distributed/multi_threaded_pg.py
@@ -0,0 +1,288 @@
+import queue
+import sys
+import threading
+import time
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+from torch._C._distributed_c10d import _create_work_from_future, Store
+from torch.futures import Future
+from torch.utils._pytree import tree_flatten
+
+"""
+TODO:
+Lots of missing collectives.
+Collectives validation.
+Make timeout robust by making collectives respect the test deadline.
+Make tests robuts by making collectives interruptible.
+We need some synchronization around cleanup to ensure that timedout ranks don't cause spurious failures.
+
+"""
+
+
+def flatten_list(lst):
+    return tree_flatten(lst)[0]
+
+
+def ret_work(ret):
+    fut = Future()
+    fut.set_result(ret)
+    return _create_work_from_future(fut)
+
+
+class AllGather:
+    def work(self, data):
+        for src_rank in range(len(data)):
+            in_tensor_list = data[src_rank][1]
+            # Can't handle all_gather with multiple tensors
+            assert len(in_tensor_list) == 1
+            src_tensor = in_tensor_list[0]
+
+            for dest in data:
+                dest_tensor = dest[0][0][src_rank]
+                with torch.no_grad():
+                    dest_tensor.copy_(src_tensor)
+
+
+class Broadcast:
+    def __init__(self, src):
+        self.src = src
+
+    def work(self, data):
+        in_tensor_list = flatten_list(data[self.src])
+        for i in range(len(data)):
+            out_tensor_list = flatten_list(data[i])
+            for j in range(len(in_tensor_list)):
+                with torch.no_grad():
+                    out_tensor_list[j].copy_(in_tensor_list[j])
+
+
+class Collective:
+    def __init__(self, world_size, collective):
+        self._world_size = world_size
+        self._collective = collective
+
+        self._start_cond = threading.Condition()
+        self._done_cond = threading.Condition()
+
+        self._data = [None] * world_size
+        self._count = 0
+        self._done = False
+
+    def join(self, rank, data):
+        with self._start_cond:
+            self._data[rank] = data
+            self._count += 1
+
+            # notify rank 0
+            if self._count == self._world_size:
+                if rank > 0:
+                    self._start_cond.notify()
+
+            if rank == 0:
+                while self._count < self._world_size:
+                    self._start_cond.wait()
+
+        with self._done_cond:
+            # wait for rank 0 to finish
+            if rank > 0:
+                while not self._done:
+                    self._done_cond.wait()
+            else:
+                # copy data around
+                self._collective.work(self._data)
+                self._done = True
+                self._done_cond.notify_all()
+        return ret_work(data)
+
+
+class ProcessLocalGroup(dist.ProcessGroup):
+    _pg_lock = threading.Lock()
+    _pg_list = []
+    _count = 0
+    _ready = False
+
+    _coll_lock = threading.Lock()
+    _cur_coll = None
+
+    @classmethod
+    def _register(cls, pg):
+        with cls._pg_lock:
+            while len(cls._pg_list) <= pg._rank:
+                cls._pg_list.append(None)
+            cls._pg_list[pg._rank] = pg
+            cls._count += 1
+            if cls._count == pg._world:
+                cls._ready = True
+
+    @classmethod
+    def _start_coll(cls, world_size, collective):
+        with cls._coll_lock:
+            if not cls._ready:
+                raise Exception(
+                    f"world not ready, only {cls._count} PG's registered but world has {world_size} ranks"
+                )
+            if cls._cur_coll is None:
+                cls._cur_coll = Collective(world_size, collective)
+            return cls._cur_coll
+
+    @classmethod
+    def _end_coll(cls, collective):
+        # This is racily called by all ranks, so only one will work
+        with cls._coll_lock:
+            if cls._cur_coll == collective:
+                cls._cur_coll = None
+
+    def allgather(self, output_tensors, input_tensor, options):
+        coll = ProcessLocalGroup._start_coll(self._world, AllGather())
+        res = coll.join(self._rank, (output_tensors, input_tensor))
+        ProcessLocalGroup._end_coll(coll)
+        return res
+
+    def broadcast(self, tensor_list, opts):
+        coll = ProcessLocalGroup._start_coll(self._world, Broadcast(opts.rootRank))
+        res = coll.join(self._rank, tensor_list)
+        ProcessLocalGroup._end_coll(coll)
+        return res
+
+    def __init__(self, rank, world):
+        super(ProcessLocalGroup, self).__init__(rank, world)
+        self._rank = rank
+        self._world = world
+        ProcessLocalGroup._register(self)
+
+    def size(self):
+        return self._world
+
+    def getBackendName(self):
+        return "local"
+
+    def __repr__(self):
+        return f"PLG w:{self._world} r:{self._rank}"
+
+
+def _create_threaded_pg(prefix_store, rank, world_size, timeout):
+    return ProcessLocalGroup(rank, world_size)
+
+
+dist.Backend.register_backend("threaded", _create_threaded_pg)
+
+
+@dataclass
+class WorldData:
+    default_pg: dist.ProcessGroup
+    pg_map: Dict[dist.ProcessGroup, Tuple[str, Optional[Store]]]
+    pg_names: Dict[dist.ProcessGroup, str]
+    pg_group_ranks: Dict[dist.ProcessGroup, Dict[int, int]]
+    group_count: int
+
+
+class ThreadLocalWorld:
+    _world = threading.local()
+
+    def _get_world(self) -> WorldData:
+        if not hasattr(ThreadLocalWorld._world, "world"):
+            ThreadLocalWorld._world.world = WorldData(None, {}, {}, {}, 0)
+        return ThreadLocalWorld._world.world
+
+    @property
+    def default_pg(self):
+        return self._get_world().default_pg
+
+    @default_pg.setter
+    def default_pg(self, value):
+        self._get_world().default_pg = value
+
+    @property
+    def pg_map(self):
+        return self._get_world().pg_map
+
+    @property
+    def pg_names(self):
+        return self._get_world().pg_names
+
+    @property
+    def pg_group_ranks(self):
+        return self._get_world().pg_group_ranks
+
+    @property
+    def group_count(self) -> int:
+        return self._get_world().group_count
+
+    @group_count.setter
+    def group_count(self, value):
+        self._get_world().group_count = value
+
+
+_old_pg_world = None
+
+
+def _install_threaded_pg():
+    global _old_pg_world
+    _old_pg_world = dist.distributed_c10d._world
+    dist.distributed_c10d._world = ThreadLocalWorld()
+    return dist.distributed_c10d._world
+
+
+def _uninstall_threaded_pg():
+    dist.distributed_c10d._world = _old_pg_world
+
+
+def run_with_threaded_pg(world_size, timeout, callback):
+    """
+    Run ``callback`` with ``world_size`` threads using the in-proc process group
+    """
+    world = _install_threaded_pg()
+
+    def world_is_valid():
+        return world == dist.distributed_c10d._world
+
+    global_store = dist.HashStore()
+    exception_queue = queue.Queue()
+
+    def worker(rank):
+        if not world_is_valid():
+            raise TimeoutError("Invalid world")
+        dist.init_process_group(
+            backend="threaded", rank=rank, world_size=world_size, store=global_store
+        )
+        try:
+            callback()
+        except BaseException as ex:
+            exception_queue.put((rank, sys.exc_info()))
+        finally:
+            if world_is_valid():
+                dist.destroy_process_group()
+
+    try:
+        threads = [
+            threading.Thread(target=worker, args=(rank,)) for rank in range(world_size)
+        ]
+        for thread in threads:
+            thread.start()
+
+        deadline = time.time() + timeout
+        for idx, thread in enumerate(threads):
+            thread.join(max(0, deadline - time.time()))
+            if thread.is_alive():
+                exception_queue.put(
+                    (
+                        idx,
+                        (
+                            TimeoutError,
+                            TimeoutError(
+                                f"Rank failed to join in under {timeout} seconds"
+                            ),
+                            None,
+                        ),
+                    )
+                )
+        failed_ranks = []
+        while not exception_queue.empty():
+            failure = exception_queue.get()
+            failed_ranks.append(failure)
+        return failed_ranks
+    finally:
+        _uninstall_threaded_pg()

From 4364cf97893e53057bc5479adaeab8e68ca42fcc Mon Sep 17 00:00:00 2001
From: Zhengxu Chen <zhxchen17@meta.com>
Date: Wed, 9 Nov 2022 21:39:46 +0000
Subject: [PATCH 0718/1922] [fx] Fix GraphModule.print_readable() (#88730)

Summary: `__nested_code()` seems removed.

Test Plan: CI

Differential Revision: D41149662

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88730
Approved by: https://github.com/SherlockNoMad
---
 torch/fx/graph_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py
index bc07952cf6fe6..5f839983c5f3f 100644
--- a/torch/fx/graph_module.py
+++ b/torch/fx/graph_module.py
@@ -725,7 +725,7 @@ def print_readable(self, print_output=True):
         submodule_code_list = [""]
         for submodule in self.children():
             if isinstance(submodule, GraphModule):
-                submodule_code_list.append(submodule.__nested_code())
+                submodule_code_list.append(submodule.print_readable(print_output=False))
         submodule_code = "\n".join(submodule_code_list)
         submodule_code = _addindent(submodule_code, 4)
 

From 4842bf940973bda822535edddd36a9a59263a389 Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Wed, 9 Nov 2022 14:48:20 +0000
Subject: [PATCH 0719/1922] Symintify embedding_sparse_backward (#88746)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88746
Approved by: https://github.com/ezyang
---
 aten/src/ATen/native/Embedding.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp
index f23594022991e..5972ce0d2404c 100644
--- a/aten/src/ATen/native/Embedding.cpp
+++ b/aten/src/ATen/native/Embedding.cpp
@@ -89,20 +89,20 @@ Tensor embedding_sparse_backward(
     grad = grad.index(c);
   }
 
-  int64_t num_features = grad_.size(-1);
-  auto weight_size = std::array<int64_t, 2>{{ num_weights, num_features }};
+  auto num_features = grad_.sym_size(-1);
+  auto weight_size = std::array<c10::SymInt, 2>{{ num_weights, num_features }};
   auto dense_options = grad.options();
 
   // check if all our grad come from padding_idx
-  if (grad.numel() == 0) {
-    return at::_sparse_coo_tensor_unsafe(at::empty({1, 0}, indices_.options().dtype(kLong)),
-                                         at::empty({0, num_features}, dense_options),
+  if (grad.sym_numel() == 0) {
+    return at::_sparse_coo_tensor_unsafe_symint(at::empty({1, 0}, indices_.options().dtype(kLong)),
+                                         at::empty_symint({c10::SymInt(0), num_features}, dense_options),
                                          weight_size);
   }
 
   auto index = indices.reshape({1, -1});
-  auto values = grad.reshape({-1, num_features});
-  return at::_sparse_coo_tensor_unsafe(index.to(kLong), values, weight_size);
+  auto values = grad.reshape_symint({c10::SymInt(-1), num_features});
+  return at::_sparse_coo_tensor_unsafe_symint(index.to(kLong), values, weight_size);
 }
 
 Tensor embedding_dense_backward_cpu(

From b8922cebb6595aea5af16dc775eab7d9022187ce Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Wed, 9 Nov 2022 06:47:53 -0800
Subject: [PATCH 0720/1922] [13/N] Update gather with CPU/CUDA implementations
 (#86409)

Differential Revision: [D40181612](https://our.internmc.facebook.com/intern/diff/D40181612)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86409
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_common.py    |  2 +-
 torch/csrc/distributed/c10d/OpsImpl.cpp | 32 +++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 321480477f4d9..cf46f89b353cd 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1466,7 +1466,7 @@ def _call_collective_with_varying_tensors(self, backend, collective, *args):
         # multi tensor collectives
         if collective == dist.barrier:
             collective()
-        elif collective == dist.all_gather:
+        elif collective in (dist.all_gather, dist.gather):
             collective([tensor], tensor, *args)
         elif collective == dist.scatter:
             collective(tensor, [tensor], *args)
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index 26b93da34411e..03ec6892857e7 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -219,6 +219,30 @@ reduce_scatter_cuda_(
       output_tensors, work);
 }
 
+c10::intrusive_ptr<Work> gather_cpu_(
+    const std::vector<std::vector<at::Tensor>>& output_tensors,
+    const std::vector<at::Tensor>& input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t root_rank,
+    int64_t timeout) {
+  return process_group->gather(
+      const_cast<std::vector<std::vector<at::Tensor>>&>(output_tensors),
+      const_cast<std::vector<at::Tensor>&>(input_tensors),
+      GatherOptions{root_rank, std::chrono::milliseconds(timeout)});
+}
+
+c10::intrusive_ptr<Work> gather_cuda_(
+    const std::vector<std::vector<at::Tensor>>& output_tensors,
+    const std::vector<at::Tensor>& input_tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t root_rank,
+    int64_t timeout) {
+  return process_group->gather(
+      const_cast<std::vector<std::vector<at::Tensor>>&>(output_tensors),
+      const_cast<std::vector<at::Tensor>&>(input_tensors),
+      GatherOptions{root_rank, std::chrono::milliseconds(timeout)});
+}
+
 std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> scatter_cpu_(
     const std::vector<at::Tensor>& output_tensors,
     const std::vector<std::vector<at::Tensor>>& input_tensors,
@@ -359,6 +383,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("reduce_scatter_", reduce_scatter_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("gather_", gather_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("gather_", gather_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("scatter_", scatter_cpu_);
 }

From a40e3dd569cb24c09b44cfb716962d83494cb609 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 8 Nov 2022 07:59:10 -0800
Subject: [PATCH 0721/1922] [ao] qconfig.py fix public v private (#87515)

Summary: made is_reuse_input_qconfig, _activation_is_memoryless,
_partial_wrapper_equals, _obs_or_fq_ctr_equals,
_add_module_to_qconfig_obs_ctr, _assert_valid_qconfig private

Test Plan: python test/test_public_bindings.py

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D40709280](https://our.internmc.facebook.com/intern/diff/D40709280)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87515
Approved by: https://github.com/jcaip
---
 test/allowlist_for_publicAPI.json             | 10 +++----
 .../ao_migration/test_quantization.py         |  4 +--
 torch/ao/nn/qat/dynamic/modules/linear.py     |  2 +-
 torch/ao/quantization/__init__.py             |  5 ----
 .../quantization/fx/_model_report/detector.py |  4 +--
 torch/ao/quantization/fx/prepare.py           | 16 ++++++------
 .../quantization/fx/qconfig_mapping_utils.py  | 10 +++----
 torch/ao/quantization/qconfig.py              | 26 +++++++------------
 torch/ao/quantization/quantize.py             | 10 +++----
 torch/quantization/qconfig.py                 |  4 +--
 10 files changed, 40 insertions(+), 51 deletions(-)

diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index a8dd659a4edd1..ba4a2e96df219 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -623,7 +623,7 @@
     "OrderedDict"
   ],
   "torch.nn.qat.dynamic.modules.linear": [
-    "activation_is_memoryless"
+    "_activation_is_memoryless"
   ],
   "torch.nn.qat.modules.conv": [
     "Tuple",
@@ -736,10 +736,10 @@
     "QuantType",
     "QuantWrapper",
     "RecordingObserver",
-    "add_module_to_qconfig_obs_ctr",
+    "_add_module_to_qconfig_obs_ctr",
     "add_observer_",
     "add_quant_dequant",
-    "assert_valid_qconfig",
+    "_assert_valid_qconfig",
     "convert",
     "convert_dynamic_jit",
     "convert_jit",
@@ -866,8 +866,8 @@
     "QConfig",
     "QConfigAny",
     "QConfigDynamic",
-    "add_module_to_qconfig_obs_ctr",
-    "assert_valid_qconfig",
+    "_add_module_to_qconfig_obs_ctr",
+    "_assert_valid_qconfig",
     "get_default_qat_qconfig",
     "get_default_qconfig",
     "qconfig_equals"
diff --git a/test/quantization/ao_migration/test_quantization.py b/test/quantization/ao_migration/test_quantization.py
index 89b69d1ef1829..52b8f631711f1 100644
--- a/test/quantization/ao_migration/test_quantization.py
+++ b/test/quantization/ao_migration/test_quantization.py
@@ -177,9 +177,9 @@ def test_function_import_qconfig(self):
             "default_qat_qconfig_v2",
             "get_default_qconfig",
             "get_default_qat_qconfig",
-            "assert_valid_qconfig",
+            "_assert_valid_qconfig",
             "QConfigAny",
-            "add_module_to_qconfig_obs_ctr",
+            "_add_module_to_qconfig_obs_ctr",
             "qconfig_equals"
         ]
         self._test_function_import('qconfig', function_list)
diff --git a/torch/ao/nn/qat/dynamic/modules/linear.py b/torch/ao/nn/qat/dynamic/modules/linear.py
index a6642b5d2df54..89c5567315956 100644
--- a/torch/ao/nn/qat/dynamic/modules/linear.py
+++ b/torch/ao/nn/qat/dynamic/modules/linear.py
@@ -17,7 +17,7 @@ class Linear(torch.ao.nn.qat.Linear):
     def __init__(self, in_features, out_features, bias=True,
                  qconfig=None, device=None, dtype=None) -> None:
         super().__init__(in_features, out_features, bias, qconfig, device, dtype)
-        if not torch.ao.quantization.activation_is_memoryless(qconfig):
+        if not torch.ao.quantization.qconfig._activation_is_memoryless(qconfig):
             raise ValueError(
                 "Dynamic QAT requires a memoryless observer." +
                 "This means a MovingAverage observer with averaging constant equal to 1"
diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py
index dc0e0a07381f5..abc0bd24d97b6 100644
--- a/torch/ao/quantization/__init__.py
+++ b/torch/ao/quantization/__init__.py
@@ -40,11 +40,8 @@
     "RecordingObserver",
     "ReuseInputObserver",
     "UniformQuantizationObserverBase",
-    "activation_is_memoryless",
-    "add_module_to_qconfig_obs_ctr",
     "add_observer_",
     "add_quant_dequant",
-    "assert_valid_qconfig",
     "convert",
     "convert_dynamic_jit",
     "convert_jit",
@@ -119,10 +116,8 @@
     "get_unique_devices_",
     "get_valid_patterns",
     "is_activation_post_process",
-    "is_reuse_input_qconfig",
     "load_observer_state_dict",
     "no_observer_set",
-    "obs_or_fq_ctr_equals",
     "per_channel_weight_observer_range_neg_127_to_127",
     "prepare",
     "prepare_dynamic_jit",
diff --git a/torch/ao/quantization/fx/_model_report/detector.py b/torch/ao/quantization/fx/_model_report/detector.py
index 239137aaaabba..c92733bbc1c32 100644
--- a/torch/ao/quantization/fx/_model_report/detector.py
+++ b/torch/ao/quantization/fx/_model_report/detector.py
@@ -10,7 +10,7 @@
 from torch.ao.quantization.qconfig import (
     QConfig,
     default_qconfig,
-    assert_valid_qconfig,
+    _assert_valid_qconfig,
 )
 from torch.ao.quantization.observer import (
     ObserverBase,
@@ -84,7 +84,7 @@ def generate_quantization_qconfig(self, module: torch.nn.Module) -> QConfig:
             weight = default_per_channel_weight_observer if rec[1] else default_weight_observer
             test_config = QConfig(activation, weight)
             try:
-                assert_valid_qconfig(test_config, module)
+                _assert_valid_qconfig(test_config, module)
                 module_qconfig = test_config
                 break
             except AssertionError:
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 0459685edb571..160b80a8807f8 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -18,10 +18,10 @@
     ObserverBase,
 )
 from ..qconfig import (
-    obs_or_fq_ctr_equals,
+    _obs_or_fq_ctr_equals,
     float16_dynamic_qconfig,
     float16_static_qconfig,
-    is_reuse_input_qconfig,
+    _is_reuse_input_qconfig,
     QConfigAny,
 )
 from ..qconfig_mapping import (
@@ -585,7 +585,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg(
         # regular flow for most nodes, except standalone modules
         is_weight = node_arg_is_weight(node, arg, backend_config)
 
-        is_reuse_input_qconfig_ = is_reuse_input_qconfig(qconfig)
+        _is_reuse_input_qconfig_ = _is_reuse_input_qconfig(qconfig)
 
         act_post_process_ctr = qconfig.weight if is_weight else \
             qconfig.activation
@@ -613,7 +613,7 @@ def maybe_insert_input_observer_for_arg_or_kwarg(
                 # if arg output dtype is in DO_NOT_OBS_DTYPE_LIST do not insert observer
                 (arg_as_output_target_dtype not in DO_NOT_OBS_DTYPE_LIST) and
                 # if qconfig is reuse_input qconfig, we won't insert extra observer for input
-                not is_reuse_input_qconfig_
+                not _is_reuse_input_qconfig_
             ) or (
                 # need to add input observer for dynamic quantization
                 # only add observer for first input for now, we may need to extend
@@ -1312,7 +1312,7 @@ def insert_observers_for_model(
                     is_last_node_of_pattern = node is last_node
                     is_general_tensor_value_op = \
                         (qhandler is not None and qhandler.is_general_tensor_value_op())
-                    is_reuse_input_qconfig_ = is_reuse_input_qconfig(qconfig)
+                    _is_reuse_input_qconfig_ = _is_reuse_input_qconfig(qconfig)
 
                     if is_last_node_of_pattern:
                         if _is_custom_module_lstm(node, modules, qconfig, qhandler):
@@ -1364,7 +1364,7 @@ def insert_observers_for_model(
                                 # to make all inputs and outputs use the first input's
                                 # observer
                                 if (is_general_tensor_value_op and is_observer_in_same_graph_) or \
-                                        is_reuse_input_qconfig_:
+                                        _is_reuse_input_qconfig_:
                                     if not maybe_make_input_output_share_observers(node, model, modules):
                                         remove_output_observer(node, model, modules)
 
@@ -1423,10 +1423,10 @@ def _validate_fixed_qparams_qconfigs(
             else:
                 for observer_ctr in allowed_observer_ctrs + [
                         fixed_qparams_op_to_overwrite_output_observer[module_type_or_function_or_method]]:
-                    if obs_or_fq_ctr_equals(
+                    if _obs_or_fq_ctr_equals(
                             qconfig.activation,
                             FixedQParamsFakeQuantize.with_args(observer=observer_ctr)) or \
-                            obs_or_fq_ctr_equals(qconfig.activation, observer_ctr):
+                            _obs_or_fq_ctr_equals(qconfig.activation, observer_ctr):
                         bad_observer = False
             if bad_observer:
                 raise ValueError("QConfigMapping must specify fixed qparams observer for fixed qparams op "
diff --git a/torch/ao/quantization/fx/qconfig_mapping_utils.py b/torch/ao/quantization/fx/qconfig_mapping_utils.py
index 16f61f78daffe..2abfaf826c425 100644
--- a/torch/ao/quantization/fx/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -2,7 +2,7 @@
 from collections import defaultdict, OrderedDict
 from typing import Callable, Any, Dict, Tuple, Set, List
 from torch.ao.quantization import QConfig
-from torch.ao.quantization.qconfig import add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals
+from torch.ao.quantization.qconfig import _add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals
 from torch.ao.quantization.quantize import (
     is_activation_post_process,
 )
@@ -123,7 +123,7 @@ def generate_node_name_to_qconfig(
             module_name, _ = _parent_name(node.target)
             qconfig = maybe_adjust_qconfig_for_module_type_or_name(
                 qconfig_mapping, type(modules[module_name]), module_name, global_qconfig)
-            qconfig_with_device_check = add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
+            qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
         elif node.op == "call_function":
             # precedence: module_name_qconfig
             # > function_qconfig > global_qconfig
@@ -139,7 +139,7 @@ def generate_node_name_to_qconfig(
             submodule_to_object_type_to_cur_idx[module_path][node.target] += 1
             qconfig = maybe_adjust_qconfig_for_module_name_object_type_order(
                 qconfig_mapping, module_path, node.target, cur_object_type_idx, qconfig)
-            qconfig_with_device_check = add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
+            qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
 
         elif node.op == "call_method":
             module_path, module_type = node_name_to_scope[node.name]
@@ -154,7 +154,7 @@ def generate_node_name_to_qconfig(
                 qconfig_mapping, module_type, module_path, qconfig)
             # currently call_method does not support modifying qconfig
             # by order, we can add this later if it is needed.
-            qconfig_with_device_check = add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
+            qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
 
         elif node.op == 'call_module':
             # if the node is an observer, just continue - don't add it to the qconfig_map
@@ -174,7 +174,7 @@ def generate_node_name_to_qconfig(
             qconfig = maybe_adjust_qconfig_for_module_name_object_type_order(
                 qconfig_mapping, parent_name, module_type, cur_object_type_idx,
                 qconfig)
-            qconfig_with_device_check = add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
+            qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
 
             # regex is not supported eager mode propagate_qconfig_, we'll
             # need to set the qconfig explicitly here in case regex
diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py
index d1eb0a64a125d..b75e16ef044f4 100644
--- a/torch/ao/quantization/qconfig.py
+++ b/torch/ao/quantization/qconfig.py
@@ -72,13 +72,8 @@
     "get_default_qat_qconfig",
     "get_default_qconfig_dict",
     "get_default_qat_qconfig_dict",
-    "assert_valid_qconfig",
-    "add_module_to_qconfig_obs_ctr",
     "QConfigAny",
-    "obs_or_fq_ctr_equals",
     "qconfig_equals",
-    "activation_is_memoryless",
-    "is_reuse_input_qconfig",
 ]
 
 class QConfig(namedtuple('QConfig', ['activation', 'weight'])):
@@ -419,8 +414,8 @@ def get_default_qat_qconfig_dict(backend='fbgemm', version=1):
         "a future version. Please use torch.ao.quantization.get_default_qat_qconfig_mapping instead.")
     return torch.ao.quantization.get_default_qat_qconfig_mapping(backend, version).to_dict()
 
-def assert_valid_qconfig(qconfig: Optional[QConfig],
-                         mod: torch.nn.Module) -> None:
+def _assert_valid_qconfig(qconfig: Optional[QConfig],
+                          mod: torch.nn.Module) -> None:
     """
     Verifies that this `qconfig` is valid.
     """
@@ -442,11 +437,10 @@ def assert_valid_qconfig(qconfig: Optional[QConfig],
         assert not is_per_channel, \
             'Per channel weight observer is not supported yet for ConvTranspose{n}d.'
 
-# TODO: remove QConfigAny and replace it with Optional[QConfig]
 QConfigAny = Optional[QConfig]
 QConfigAny.__module__ = "torch.ao.quantization.qconfig"
 
-def add_module_to_qconfig_obs_ctr(
+def _add_module_to_qconfig_obs_ctr(
         qconfig: QConfigAny,
         module: Optional[nn.Module]) -> Any:
     r"""This is a helper function for use in quantization prepare that updates a qconfig so that
@@ -490,7 +484,7 @@ def configure_constructor_to_put_obs_on_module_device(original_constructor):
 
 _ObserverOrFakeQuantizeConstructor = Union[_PartialWrapper, ObserverBase, FakeQuantizeBase]
 
-def obs_or_fq_ctr_equals(obs_or_fq1: _ObserverOrFakeQuantizeConstructor, obs_or_fq2: _ObserverOrFakeQuantizeConstructor):
+def _obs_or_fq_ctr_equals(obs_or_fq1: _ObserverOrFakeQuantizeConstructor, obs_or_fq2: _ObserverOrFakeQuantizeConstructor):
     if isinstance(obs_or_fq1, _PartialWrapper) and isinstance(obs_or_fq2, _PartialWrapper):
         return _partial_wrapper_equals(obs_or_fq1, obs_or_fq2)
     return obs_or_fq1 == obs_or_fq2
@@ -503,9 +497,9 @@ def _partial_wrapper_equals(obs_or_fq1: _PartialWrapper, obs_or_fq2: _PartialWra
     obs_or_fq1_keywords = copy.copy(obs_or_fq1.p.keywords)
     obs_or_fq2_keywords = copy.copy(obs_or_fq2.p.keywords)
     keywords_equal = True
-    # compare observer constructor with obs_or_fq_ctr_equals since direct compare would fail
+    # compare observer constructor with _obs_or_fq_ctr_equals since direct compare would fail
     if "observer" in obs_or_fq1_keywords and "observer" in obs_or_fq2_keywords:
-        keywords_equal = keywords_equal and obs_or_fq_ctr_equals(obs_or_fq1_keywords["observer"], obs_or_fq2_keywords["observer"])
+        keywords_equal = keywords_equal and _obs_or_fq_ctr_equals(obs_or_fq1_keywords["observer"], obs_or_fq2_keywords["observer"])
         obs_or_fq1_keywords.pop("observer")
         obs_or_fq2_keywords.pop("observer")
     keywords_equal = keywords_equal and obs_or_fq1_keywords == obs_or_fq2_keywords
@@ -523,13 +517,13 @@ def qconfig_equals(q1: QConfigAny, q2: QConfigAny):
             # Qconfig weight and activation can be either a partial wrapper,
             # or an observer class. Special handling is required (above) for
             # comparing partial wrappers.
-            activation_same = obs_or_fq_ctr_equals(q1.activation, q2.activation)
-            weight_same = obs_or_fq_ctr_equals(q1.weight, q2.weight)
+            activation_same = _obs_or_fq_ctr_equals(q1.activation, q2.activation)
+            weight_same = _obs_or_fq_ctr_equals(q1.weight, q2.weight)
             return activation_same and weight_same
         except AttributeError:
             return q1 == q2
 
-def activation_is_memoryless(qconfig: QConfig):
+def _activation_is_memoryless(qconfig: QConfig):
     """
     Return whether the observer for activations defined in the given QConfig is memoryless.
     This means a MovingAverage observer with averaging constant equal to 1.
@@ -542,7 +536,7 @@ def _is_memoryless(observer):
     else:
         return _is_memoryless(act)
 
-def is_reuse_input_qconfig(qconfig: Optional[QConfig]):
+def _is_reuse_input_qconfig(qconfig: Optional[QConfig]):
     return qconfig is not None and \
         isinstance(qconfig.activation(), ReuseInputObserver) and \
         isinstance(qconfig.weight(), NoopObserver)
diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
index 4e28b07084d5b..9f5537ec85615 100644
--- a/torch/ao/quantization/quantize.py
+++ b/torch/ao/quantization/quantize.py
@@ -20,12 +20,12 @@
 from .utils import get_qparam_dict, has_no_children_ignoring_parametrizations
 from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper
 from torch.ao.quantization.qconfig import (
-    add_module_to_qconfig_obs_ctr,
+    _add_module_to_qconfig_obs_ctr,
     default_dynamic_qconfig,
     float16_dynamic_qconfig,
     float_qparams_weight_only_qconfig,
     float_qparams_weight_only_qconfig_4bit,
-    activation_is_memoryless)
+    _activation_is_memoryless)
 from torch.nn.utils.parametrize import type_before_parametrizations
 
 __all__ = [
@@ -91,9 +91,9 @@ def _propagate_qconfig_helper(module, qconfig_dict,
     module_qconfig = qconfig_dict.get(prefix, module_qconfig)
     module_qconfig = getattr(module, 'qconfig', module_qconfig)
 
-    torch.ao.quantization.qconfig.assert_valid_qconfig(module_qconfig, module)
+    torch.ao.quantization.qconfig._assert_valid_qconfig(module_qconfig, module)
 
-    qconfig_with_device_check = add_module_to_qconfig_obs_ctr(module_qconfig, module)
+    qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(module_qconfig, module)
     module.qconfig = qconfig_with_device_check
 
     for name, child in module.named_children():
@@ -201,7 +201,7 @@ def insert_activation_post_process(m, special_act_post_process=None):
                 m.qconfig, device, special_act_post_process))
             # Register observer as the first entry in the hook list
             # All post forward hooks are preserved and will be executed after the observer before convert
-            register_activation_post_process_hook(m, pre_hook=activation_is_memoryless(m.qconfig))
+            register_activation_post_process_hook(m, pre_hook=_activation_is_memoryless(m.qconfig))
 
     for name, child in module.named_children():
         # TODO remove Dropout special after codebase stable
diff --git a/torch/quantization/qconfig.py b/torch/quantization/qconfig.py
index 7552d437466d5..9da450abd67b1 100644
--- a/torch/quantization/qconfig.py
+++ b/torch/quantization/qconfig.py
@@ -23,8 +23,8 @@
     default_qat_qconfig_v2,
     get_default_qconfig,
     get_default_qat_qconfig,
-    assert_valid_qconfig,
+    _assert_valid_qconfig,
     QConfigAny,
-    add_module_to_qconfig_obs_ctr,
+    _add_module_to_qconfig_obs_ctr,
     qconfig_equals
 )

From 33856abd1b61e60c305e7b8cac110413954f8061 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 9 Nov 2022 22:38:41 +0000
Subject: [PATCH 0722/1922] Revert "[3/n] Thread PG: add threaded PG
 implementation (#88627)"

This reverts commit 6dd081846e3ae6192b375d658d4b4f3d6bd9df6e.

Reverted https://github.com/pytorch/pytorch/pull/88627 on behalf of https://github.com/huydhn due to This breaks one macos m1 test https://hud.pytorch.org/pytorch/pytorch/commit/6dd081846e3ae6192b375d658d4b4f3d6bd9df6e in trunk. PR also fails with the same issue so I think trymerge code has a bug here letting this one merged
---
 test/distributed/test_multi_threaded_pg.py    |  45 ---
 torch/testing/_internal/common_distributed.py | 149 ++-------
 .../distributed/multi_threaded_pg.py          | 288 ------------------
 3 files changed, 26 insertions(+), 456 deletions(-)
 delete mode 100644 test/distributed/test_multi_threaded_pg.py
 delete mode 100644 torch/testing/_internal/distributed/multi_threaded_pg.py

diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
deleted file mode 100644
index 6a0fe33cd8ad6..0000000000000
--- a/test/distributed/test_multi_threaded_pg.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Owner(s): ["oncall: distributed"]
-
-import sys
-import torch.distributed as dist
-
-if not dist.is_available():
-    print("Distributed not available, skipping tests", file=sys.stderr)
-    sys.exit(0)
-
-from torch.testing._internal.common_distributed import (
-    spawn_threads_and_init_comms,
-    MultiThreadedTestCase
-
-)
-from torch.testing._internal.common_utils import TestCase, run_tests
-
-DEFAULT_WORLD_SIZE = 4
-
-class TestObjectCollectivesWithWrapper(TestCase):
-    @spawn_threads_and_init_comms(world_size=4)
-    def test_broadcast_object_list(self):
-        val = 99 if dist.get_rank() == 0 else None
-        object_list = [val] * dist.get_world_size()
-
-        dist.broadcast_object_list(object_list=object_list)
-        self.assertEqual(99, object_list[0])
-
-class TestObjectCollectivesWithBaseClass(MultiThreadedTestCase):
-    @property
-    def world_size(self):
-        return 4
-
-    def test_broadcast_object_list(self):
-        val = 99 if dist.get_rank() == 0 else None
-        object_list = [val] * dist.get_world_size()
-        print(f"{dist.get_rank()} -> {dist.get_world_size()}")
-
-        dist.broadcast_object_list(object_list=object_list)
-        self.assertEqual(99, object_list[0])
-
-    def test_something_else(self):
-        pass
-
-if __name__ == "__main__":
-    run_tests()
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 883a48a5a5fef..607211087ddc7 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -2,10 +2,10 @@
 import logging
 import multiprocessing
 import os
-import subprocess
 import sys
 import tempfile
 import threading
+import subprocess
 import time
 import traceback
 import types
@@ -14,7 +14,11 @@
 from dataclasses import dataclass
 from datetime import timedelta
 from enum import Enum
-from functools import partial, reduce, wraps
+from functools import (
+    partial,
+    reduce,
+    wraps
+)
 from io import StringIO
 from typing import NamedTuple, Optional, Union
 
@@ -22,17 +26,16 @@
 import torch.cuda.nccl
 import torch.distributed as c10d
 from torch.testing._internal.common_utils import (
+    TestCase,
+    TEST_WITH_ROCM,
+    TEST_WITH_TSAN,
     FILE_SCHEMA,
     find_free_port,
-    IS_SANDCASTLE,
     retry_on_connect_failures,
-    sandcastle_skip,
+    IS_SANDCASTLE,
     sandcastle_skip_if,
-    TEST_WITH_ROCM,
-    TEST_WITH_TSAN,
-    TestCase,
+    sandcastle_skip,
 )
-from torch.testing._internal.distributed.multi_threaded_pg import run_with_threaded_pg
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -64,10 +67,11 @@ class TestSkip(NamedTuple):
     "generic": TestSkip(
         86, "Test skipped at subprocess level, look at subprocess log for skip reason"
     ),
-    "importerror": TestSkip(88, "Test skipped due to missing import"),
+    "importerror": TestSkip(
+        88, "Test skipped due to missing import"
+    ),
 }
 
-
 @dataclass
 class DistTestCases:
     # Backends that do not support a specific collective
@@ -89,7 +93,6 @@ class DistTestCases:
 def skip_if_no_gpu(func):
     """Skips if the world size exceeds the number of GPUs, ensuring that if the
     test is run, each rank has its own GPU via ``torch.cuda.device(rank)``."""
-
     @wraps(func)
     def wrapper(*args, **kwargs):
         if not torch.cuda.is_available():
@@ -113,7 +116,6 @@ def wrapper(*args, **kwargs):
 
     return wrapper
 
-
 def skip_if_odd_worldsize(func):
     @wraps(func)
     def wrapper(*args, **kwargs):
@@ -124,7 +126,6 @@ def wrapper(*args, **kwargs):
 
     return wrapper
 
-
 def require_n_gpus_for_nccl_backend(n, backend):
     def decorator(func):
         @wraps(func)
@@ -138,17 +139,12 @@ def wrapper(*args, **kwargs):
 
     return decorator
 
-
 def import_transformers_or_skip():
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
             try:
-                from transformers import (  # noqa: Unused
-                    AutoModelForMaskedLM,
-                    BertConfig,
-                )
-
+                from transformers import BertConfig, AutoModelForMaskedLM  # noqa: Unused
                 return func(*args, **kwargs)
             except ImportError:
                 sys.exit(TEST_SKIPS["importerror"].exit_code)
@@ -157,7 +153,6 @@ def wrapper(*args, **kwargs):
 
     return decorator
 
-
 def skip_if_lt_x_gpu(x):
     def decorator(func):
         @wraps(func)
@@ -196,13 +191,10 @@ def verify_ddp_error_logged(model_DDP, err_substr):
     logging_err = ddp_logging_data["error"]
     # Remove C++ stacktrace if needed.
     actual = (
-        err_substr
-        if err_substr.find("\nException raised from ") == -1
+        err_substr if err_substr.find("\nException raised from ") == -1
         else err_substr.split("\nException raised from ")[0]
     )
-    assert (
-        actual in logging_err
-    ), f"Did not find expected {actual} in ddp logging data error: {logging_err}"
+    assert actual in logging_err, f"Did not find expected {actual} in ddp logging data error: {logging_err}"
 
 
 def with_nccl_blocking_wait(func):
@@ -327,7 +319,7 @@ def wrapper(*args, **kwargs):
 
 def skip_if_win32():
     return sandcastle_skip_if(
-        sys.platform == "win32",
+        sys.platform == 'win32',
         "This unit test case is not supportted on Windows platform",
     )
 
@@ -360,14 +352,13 @@ def create_tcp_store(
     # TSAN runs much slower.
     TIMEOUT_DEFAULT = 500
 else:
-    TIMEOUT_DEFAULT = int(os.getenv("DISTRIBUTED_TESTS_DEFAULT_TIMEOUT", "300"))
+    TIMEOUT_DEFAULT = int(os.getenv('DISTRIBUTED_TESTS_DEFAULT_TIMEOUT', '300'))
 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400}
 
 # https://github.com/pytorch/pytorch/issues/75665
 if TEST_WITH_ROCM:
     TIMEOUT_OVERRIDE["test_join_kwargs"] = 200
 
-
 def create_device(interface=None):
     if sys.platform == "win32" or interface is None:
         return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1")
@@ -458,7 +449,9 @@ def init_multigpu_helper(world_size: int, backend: str):
     if world_size > nGPUs:
         nGPUs_per_process = nGPUs // world_size
     rank_to_GPU = {
-        i: list(visible_devices[i * nGPUs_per_process : (i + 1) * nGPUs_per_process])
+        i: list(
+            visible_devices[i * nGPUs_per_process : (i + 1) * nGPUs_per_process]
+        )
         for i in range(world_size)
     }
     return rank_to_GPU
@@ -489,9 +482,6 @@ def cleanup_temp_dir() -> None:
         tmp_dir.cleanup()
 
 
-# Most tests operate with this worldsize
-DEFAULT_WORLD_SIZE = 4
-
 # [How does MultiProcessTestCase work?]
 # Each MultiProcessTestCase instance uses 1 + `world_size()` processes, by
 # default `world_size()` returns 4. Let's take `test_rpc_spawn.py` as an
@@ -518,7 +508,7 @@ def _should_stop_test_suite(self) -> bool:
 
     @property
     def world_size(self) -> int:
-        return DEFAULT_WORLD_SIZE
+        return 4
 
     def join_or_run(self, fn):
         @wraps(fn)
@@ -617,10 +607,7 @@ def _event_listener(parent_pipe, signal_pipe, rank: int):
     @classmethod
     def _run(cls, rank: int, test_name: str, file_name: str, parent_pipe) -> None:
         # Enable DDP + ReplicatedTensor
-        from torch.nn.parallel._replicated_tensor_ddp_utils import (
-            _set_ddp_with_replicated_tensor,
-        )
-
+        from torch.nn.parallel._replicated_tensor_ddp_utils import _set_ddp_with_replicated_tensor
         _set_ddp_with_replicated_tensor(True)
 
         self = cls(test_name)
@@ -828,20 +815,16 @@ def _check_return_codes(self, elapsed_time) -> None:
         self.assertEqual(
             first_process.exitcode,
             0,
-            msg="Expected zero exit code but got {} for pid: {}".format(
-                first_process.exitcode, first_process.pid
-            ),
+            msg="Expected zero exit code but got {} for pid: {}".format(first_process.exitcode, first_process.pid)
         )
 
     @property
     def is_master(self) -> bool:
         return self.rank == 0
 
-
 # Cannot use functools.cache as it requires python 3.9
 EFA_PROBE_RESULT = None
 
-
 def has_efa() -> bool:
     """
     If shell command `fi_info -p efa -t FI_EP_RDM` returns exit code 0 then we assume that the machine has
@@ -853,9 +836,7 @@ def has_efa() -> bool:
         return EFA_PROBE_RESULT
 
     try:
-        EFA_PROBE_RESULT = (
-            subprocess.run(["fi_info", "-p", "efa", "-t", "FI_EP_RDM"]).returncode == 0
-        )
+        EFA_PROBE_RESULT = subprocess.run(["fi_info", "-p", "efa", "-t", "FI_EP_RDM"]).returncode == 0
     except FileNotFoundError:
         EFA_PROBE_RESULT = False
     return EFA_PROBE_RESULT
@@ -869,81 +850,3 @@ def tp_transports():
     see https://github.com/pytorch/pytorch/issues/73885 and https://github.com/pytorch/pytorch/issues/65022
     """
     return ["shm", "uv"] if has_efa() else None
-
-
-def _run_test_with_mt_pg(self, timeout, world_size, callback):
-    failed_ranks = run_with_threaded_pg(world_size, timeout, callback)
-    for rank, exc_info in failed_ranks:
-        print(f"Rank {rank} raised:")
-        for line in traceback.format_exception(*exc_info):
-            sys.stdout.write(line)
-    self.assertEqual([], failed_ranks, "Some ranks failed")
-
-
-def spawn_threads_and_init_comms(
-    func=None, timeout=TIMEOUT_DEFAULT, world_size=DEFAULT_WORLD_SIZE
-):
-    """
-    Wrapper to use with a test method
-    """
-    if func is None:
-        return partial(
-            spawn_threads_and_init_comms, timeout=timeout, world_size=world_size
-        )
-
-    @wraps(func)
-    def wrapper(self, *args, **kwargs):
-        _run_test_with_mt_pg(
-            self, timeout, world_size, lambda: func(self, *args, **kwargs)
-        )
-
-    return wrapper
-
-
-class MultiThreadedTestCase(TestCase):
-    """
-    Simple test runner that executes all tests with the in-proc process group.
-
-    A single instance of the TestCase object for all threads.
-
-    Difference from regular test runner:
-    Cannot use setUp / tearDown (must use perThreadSetup / perThreadShutdown)
-        Not sure what these two would be good for though.
-    No global state possible
-        How bad of a limitation is this?
-    """
-
-    def __init__(self, method_name: str = "runTest") -> None:
-        super().__init__(method_name)
-        self._test_method = getattr(self, method_name, None)
-        setattr(self, method_name, self.threaded_run_test)
-        if TestCase.setUp != type(self).setUp:
-            raise RuntimeError(
-                f"Test class {type(self)} overrides disabled method setUp. Use perThreadSetUp instead"
-            )
-        if TestCase.tearDown != type(self).tearDown:
-            raise RuntimeError(
-                f"Test class {type(self)} overrides disabled method tearDown. Use perThreadTearDown instead"
-            )
-
-    def threaded_run_test(self):
-        self.perThreadSetUp()
-        try:
-            _run_test_with_mt_pg(
-                self=self,
-                timeout=TIMEOUT_DEFAULT,
-                world_size=self.world_size,
-                callback=self._test_method,
-            )
-        finally:
-            self.perThreadTearDown()
-
-    def perThreadSetUp(self):
-        pass
-
-    def perThreadTearDown(self):
-        pass
-
-    @property
-    def world_size(self) -> int:
-        raise RuntimeError("world size not implemented")
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
deleted file mode 100644
index 7e18f870f2e76..0000000000000
--- a/torch/testing/_internal/distributed/multi_threaded_pg.py
+++ /dev/null
@@ -1,288 +0,0 @@
-import queue
-import sys
-import threading
-import time
-from dataclasses import dataclass
-from typing import Dict, Optional, Tuple
-
-import torch
-import torch.distributed as dist
-from torch._C._distributed_c10d import _create_work_from_future, Store
-from torch.futures import Future
-from torch.utils._pytree import tree_flatten
-
-"""
-TODO:
-Lots of missing collectives.
-Collectives validation.
-Make timeout robust by making collectives respect the test deadline.
-Make tests robuts by making collectives interruptible.
-We need some synchronization around cleanup to ensure that timedout ranks don't cause spurious failures.
-
-"""
-
-
-def flatten_list(lst):
-    return tree_flatten(lst)[0]
-
-
-def ret_work(ret):
-    fut = Future()
-    fut.set_result(ret)
-    return _create_work_from_future(fut)
-
-
-class AllGather:
-    def work(self, data):
-        for src_rank in range(len(data)):
-            in_tensor_list = data[src_rank][1]
-            # Can't handle all_gather with multiple tensors
-            assert len(in_tensor_list) == 1
-            src_tensor = in_tensor_list[0]
-
-            for dest in data:
-                dest_tensor = dest[0][0][src_rank]
-                with torch.no_grad():
-                    dest_tensor.copy_(src_tensor)
-
-
-class Broadcast:
-    def __init__(self, src):
-        self.src = src
-
-    def work(self, data):
-        in_tensor_list = flatten_list(data[self.src])
-        for i in range(len(data)):
-            out_tensor_list = flatten_list(data[i])
-            for j in range(len(in_tensor_list)):
-                with torch.no_grad():
-                    out_tensor_list[j].copy_(in_tensor_list[j])
-
-
-class Collective:
-    def __init__(self, world_size, collective):
-        self._world_size = world_size
-        self._collective = collective
-
-        self._start_cond = threading.Condition()
-        self._done_cond = threading.Condition()
-
-        self._data = [None] * world_size
-        self._count = 0
-        self._done = False
-
-    def join(self, rank, data):
-        with self._start_cond:
-            self._data[rank] = data
-            self._count += 1
-
-            # notify rank 0
-            if self._count == self._world_size:
-                if rank > 0:
-                    self._start_cond.notify()
-
-            if rank == 0:
-                while self._count < self._world_size:
-                    self._start_cond.wait()
-
-        with self._done_cond:
-            # wait for rank 0 to finish
-            if rank > 0:
-                while not self._done:
-                    self._done_cond.wait()
-            else:
-                # copy data around
-                self._collective.work(self._data)
-                self._done = True
-                self._done_cond.notify_all()
-        return ret_work(data)
-
-
-class ProcessLocalGroup(dist.ProcessGroup):
-    _pg_lock = threading.Lock()
-    _pg_list = []
-    _count = 0
-    _ready = False
-
-    _coll_lock = threading.Lock()
-    _cur_coll = None
-
-    @classmethod
-    def _register(cls, pg):
-        with cls._pg_lock:
-            while len(cls._pg_list) <= pg._rank:
-                cls._pg_list.append(None)
-            cls._pg_list[pg._rank] = pg
-            cls._count += 1
-            if cls._count == pg._world:
-                cls._ready = True
-
-    @classmethod
-    def _start_coll(cls, world_size, collective):
-        with cls._coll_lock:
-            if not cls._ready:
-                raise Exception(
-                    f"world not ready, only {cls._count} PG's registered but world has {world_size} ranks"
-                )
-            if cls._cur_coll is None:
-                cls._cur_coll = Collective(world_size, collective)
-            return cls._cur_coll
-
-    @classmethod
-    def _end_coll(cls, collective):
-        # This is racily called by all ranks, so only one will work
-        with cls._coll_lock:
-            if cls._cur_coll == collective:
-                cls._cur_coll = None
-
-    def allgather(self, output_tensors, input_tensor, options):
-        coll = ProcessLocalGroup._start_coll(self._world, AllGather())
-        res = coll.join(self._rank, (output_tensors, input_tensor))
-        ProcessLocalGroup._end_coll(coll)
-        return res
-
-    def broadcast(self, tensor_list, opts):
-        coll = ProcessLocalGroup._start_coll(self._world, Broadcast(opts.rootRank))
-        res = coll.join(self._rank, tensor_list)
-        ProcessLocalGroup._end_coll(coll)
-        return res
-
-    def __init__(self, rank, world):
-        super(ProcessLocalGroup, self).__init__(rank, world)
-        self._rank = rank
-        self._world = world
-        ProcessLocalGroup._register(self)
-
-    def size(self):
-        return self._world
-
-    def getBackendName(self):
-        return "local"
-
-    def __repr__(self):
-        return f"PLG w:{self._world} r:{self._rank}"
-
-
-def _create_threaded_pg(prefix_store, rank, world_size, timeout):
-    return ProcessLocalGroup(rank, world_size)
-
-
-dist.Backend.register_backend("threaded", _create_threaded_pg)
-
-
-@dataclass
-class WorldData:
-    default_pg: dist.ProcessGroup
-    pg_map: Dict[dist.ProcessGroup, Tuple[str, Optional[Store]]]
-    pg_names: Dict[dist.ProcessGroup, str]
-    pg_group_ranks: Dict[dist.ProcessGroup, Dict[int, int]]
-    group_count: int
-
-
-class ThreadLocalWorld:
-    _world = threading.local()
-
-    def _get_world(self) -> WorldData:
-        if not hasattr(ThreadLocalWorld._world, "world"):
-            ThreadLocalWorld._world.world = WorldData(None, {}, {}, {}, 0)
-        return ThreadLocalWorld._world.world
-
-    @property
-    def default_pg(self):
-        return self._get_world().default_pg
-
-    @default_pg.setter
-    def default_pg(self, value):
-        self._get_world().default_pg = value
-
-    @property
-    def pg_map(self):
-        return self._get_world().pg_map
-
-    @property
-    def pg_names(self):
-        return self._get_world().pg_names
-
-    @property
-    def pg_group_ranks(self):
-        return self._get_world().pg_group_ranks
-
-    @property
-    def group_count(self) -> int:
-        return self._get_world().group_count
-
-    @group_count.setter
-    def group_count(self, value):
-        self._get_world().group_count = value
-
-
-_old_pg_world = None
-
-
-def _install_threaded_pg():
-    global _old_pg_world
-    _old_pg_world = dist.distributed_c10d._world
-    dist.distributed_c10d._world = ThreadLocalWorld()
-    return dist.distributed_c10d._world
-
-
-def _uninstall_threaded_pg():
-    dist.distributed_c10d._world = _old_pg_world
-
-
-def run_with_threaded_pg(world_size, timeout, callback):
-    """
-    Run ``callback`` with ``world_size`` threads using the in-proc process group
-    """
-    world = _install_threaded_pg()
-
-    def world_is_valid():
-        return world == dist.distributed_c10d._world
-
-    global_store = dist.HashStore()
-    exception_queue = queue.Queue()
-
-    def worker(rank):
-        if not world_is_valid():
-            raise TimeoutError("Invalid world")
-        dist.init_process_group(
-            backend="threaded", rank=rank, world_size=world_size, store=global_store
-        )
-        try:
-            callback()
-        except BaseException as ex:
-            exception_queue.put((rank, sys.exc_info()))
-        finally:
-            if world_is_valid():
-                dist.destroy_process_group()
-
-    try:
-        threads = [
-            threading.Thread(target=worker, args=(rank,)) for rank in range(world_size)
-        ]
-        for thread in threads:
-            thread.start()
-
-        deadline = time.time() + timeout
-        for idx, thread in enumerate(threads):
-            thread.join(max(0, deadline - time.time()))
-            if thread.is_alive():
-                exception_queue.put(
-                    (
-                        idx,
-                        (
-                            TimeoutError,
-                            TimeoutError(
-                                f"Rank failed to join in under {timeout} seconds"
-                            ),
-                            None,
-                        ),
-                    )
-                )
-        failed_ranks = []
-        while not exception_queue.empty():
-            failure = exception_queue.get()
-            failed_ranks.append(failure)
-        return failed_ranks
-    finally:
-        _uninstall_threaded_pg()

From 76d9b56d673d45d0c7b8fdb5231905121d5fb744 Mon Sep 17 00:00:00 2001
From: Akshit Khurana <axit@meta.com>
Date: Tue, 8 Nov 2022 10:29:39 -0800
Subject: [PATCH 0723/1922] [Dynamo] Fix Tensor.T trace (#88642)

Summary:

Tensor.T considered T as a GetAttr and didn't progate "example_value"

Via https://pytorch.org/docs/stable/tensors.html#torch.Tensor.T
> If n is the number of dimensions in x, x.T is equivalent to
> x.permute(n-1, n-2, ..., 0).

Fixes pytorch/torchdynamo#1476

Test Plan:

pytest test/dynamo/test_functions.py::FunctionTests::test_T

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D41130306](https://our.internmc.facebook.com/intern/diff/D41130306)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88642
Approved by: https://github.com/tugsbayasgalan, https://github.com/yanboliang, https://github.com/jansel
---
 test/dynamo/test_functions.py     | 4 ++++
 torch/_dynamo/variables/tensor.py | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py
index d428a4369fc1e..685393bc6766c 100644
--- a/test/dynamo/test_functions.py
+++ b/test/dynamo/test_functions.py
@@ -329,6 +329,10 @@ def test_ndim(x):
         if x.ndim == 2 and x.ndimension() == 2 and x.dim() == 2:
             return x + 1
 
+    @make_test
+    def test_T(x):
+        return torch.ones_like(x.T)
+
     @make_test
     def test_is_sparse(x):
         if not x.is_sparse:
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index ab4cbf62ce36a..315c2b1a7e074 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -438,6 +438,9 @@ def var_getattr(self, tx, name):
             result = self.call_method(tx, "size", [], {})
         elif name == "ndim" and self.ndim is None:
             result = self.call_method(tx, "dim", [], {})
+        elif name == "T":
+            args = [variables.ConstantVariable(i) for i in range(self.ndim - 1, -1, -1)]
+            result = self.call_method(tx, "permute", args, {})
 
         if name == "__class__":
             return TorchVariable(self.python_type(), **options)

From c6ae80f8893f88d142d553da6adad0b51b7ee373 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Thu, 10 Nov 2022 00:06:31 +0000
Subject: [PATCH 0724/1922] Fix Docker image generation (#88741)

Pass install channel when building nightly images
Pass `TRITON_VERSION` argument to install triton for nightly images

Fix `generate_pytorch_version.py` to work with unannotated tags and avoid failures like the following:
```
% git checkout nightly
% ./.github/scripts/generate_pytorch_version.py

fatal: No annotated tags can describe '93f15b1b54ca5fb4a7ca9c21a813b4b86ebaeafa'.
However, there were unannotated tags: try --tags.
Traceback (most recent call last):
  File "/Users/nshulga/git/pytorch/pytorch-release/./.github/scripts/generate_pytorch_version.py", line 120, in <module>
    main()
  File "/Users/nshulga/git/pytorch/pytorch-release/./.github/scripts/generate_pytorch_version.py", line 115, in main
    print(version_obj.get_release_version())
  File "/Users/nshulga/git/pytorch/pytorch-release/./.github/scripts/generate_pytorch_version.py", line 75, in get_release_version
    if not get_tag():
  File "/Users/nshulga/git/pytorch/pytorch-release/./.github/scripts/generate_pytorch_version.py", line 37, in get_tag
    dirty_tag = subprocess.check_output(
  File "/Users/nshulga/miniforge3/lib/python3.9/subprocess.py", line 424, in check_output
    return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
  File "/Users/nshulga/miniforge3/lib/python3.9/subprocess.py", line 528, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['git', 'describe']' returned non-zero exit status 128.
```
After the change nightly is reported as(due to autolabelling issue,
should be fixed by ttps://github.com/pytorch/test-infra/pull/1047 ):
```
 % ./.github/scripts/generate_pytorch_version.py
ciflow/inductor/26921+cpu
```

Even for tagged release commits version generation was wrong:
```
% git checkout release/1.13
% ./.github/scripts/generate_pytorch_version.py
ciflow/periodic/79617-4848-g7c98e70d44+cpu
```
After the fix, it is as expected:
```
% ./.github/scripts/generate_pytorch_version.py
1.13.0+cpu
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88741
Approved by: https://github.com/dagitses, https://github.com/msaroufim
---
 .github/scripts/generate_pytorch_version.py | 31 +++++++++------------
 .github/workflows/docker-release.yml        |  9 ++++--
 Dockerfile                                  |  2 ++
 docker.Makefile                             |  5 +++-
 4 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/.github/scripts/generate_pytorch_version.py b/.github/scripts/generate_pytorch_version.py
index 0655df137e07c..02c19844cd09f 100755
--- a/.github/scripts/generate_pytorch_version.py
+++ b/.github/scripts/generate_pytorch_version.py
@@ -23,27 +23,22 @@ def get_pytorch_root() -> Path:
 
 def get_tag() -> str:
     root = get_pytorch_root()
-    # We're on a tag
-    am_on_tag = (
-        subprocess.run(
-            ['git', 'describe', '--tags', '--exact'],
-            cwd=root,
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL
-        ).returncode == 0
-    )
-    tag = ""
-    if am_on_tag:
+    try:
         dirty_tag = subprocess.check_output(
-            ['git', 'describe'],
+            ['git', 'describe', '--tags', '--exact'],
             cwd=root
         ).decode('ascii').strip()
-        # Strip leading v that we typically do when we tag branches
-        # ie: v1.7.1 -> 1.7.1
-        tag = re.sub(LEADING_V_PATTERN, "", dirty_tag)
-        # Strip trailing rc pattern
-        # ie: 1.7.1-rc1 -> 1.7.1
-        tag = re.sub(TRAILING_RC_PATTERN, "", tag)
+    except subprocess.CalledProcessError:
+        return ""
+    # Strip leading v that we typically do when we tag branches
+    # ie: v1.7.1 -> 1.7.1
+    tag = re.sub(LEADING_V_PATTERN, "", dirty_tag)
+    # Strip trailing rc pattern
+    # ie: 1.7.1-rc1 -> 1.7.1
+    tag = re.sub(TRAILING_RC_PATTERN, "", tag)
+    # Ignore ciflow tags
+    if tag.startswith("ciflow/"):
+        return ""
     return tag
 
 def get_base_version() -> str:
diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml
index fc10f58344438..d1b9209c4076e 100644
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@@ -80,10 +80,13 @@ jobs:
           # Generate PyTorch version to use
           echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py)" >> "${GITHUB_ENV}"
       - name: Setup nightly specific variables
-        if: ${{ github.event.ref == 'refs/heads/nightly' }}
+        if: ${{ github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/ciflow/nightly/') }}
         run: |
-          # Use nightly image if building for nightly
-          echo "DOCKER_IMAGE=pytorch-nightly" >> "${GITHUB_ENV}"
+          {
+            echo "DOCKER_IMAGE=pytorch-nightly";
+            echo "INSTALL_CHANNEL=pytorch-nightly";
+            echo "TRITON_VERSION=2.0.0+$(cut -c -10 .github/ci_commit_pins/triton.txt)";
+          } >> "${GITHUB_ENV}"
       - name: Run docker build / push
         # WITH_PUSH is used here to determine whether or not to add the --push flag
         run: |
diff --git a/Dockerfile b/Dockerfile
index e49e0a44e816b..e125271607c93 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -66,6 +66,7 @@ ARG INSTALL_CHANNEL=pytorch-nightly
 RUN /opt/conda/bin/conda update -y conda
 RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERSION}
 ARG TARGETPLATFORM
+ARG TRITON_VERSION
 
 # On arm64 we can only install wheel packages
 RUN case ${TARGETPLATFORM} in \
@@ -74,6 +75,7 @@ RUN case ${TARGETPLATFORM} in \
     esac && \
     /opt/conda/bin/conda clean -ya
 RUN /opt/conda/bin/pip install torchelastic
+RUN if test -n "${TRITON_VERSION}" -a "${TARGETPLATFORM}" != "linux/arm64"; then /opt/conda/bin/pip install "torchtriton==${TRITON_VERSION}" --extra-index-url https://download.pytorch.org/whl/nightly/cpu ; fi
 
 FROM ${BASE_IMAGE} as official
 ARG PYTORCH_VERSION
diff --git a/docker.Makefile b/docker.Makefile
index 9f433af435ed6..f85a3c3a3fc15 100644
--- a/docker.Makefile
+++ b/docker.Makefile
@@ -23,12 +23,15 @@ PYTORCH_VERSION          ?= $(shell git describe --tags --always)
 # Can be either official / dev
 BUILD_TYPE               ?= dev
 BUILD_PROGRESS           ?= auto
+# Intentionally left blank
+TRITON_VERSION           ?=
 BUILD_ARGS                = --build-arg BASE_IMAGE=$(BASE_IMAGE) \
 							--build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
 							--build-arg CUDA_VERSION=$(CUDA_VERSION) \
 							--build-arg CUDA_CHANNEL=$(CUDA_CHANNEL) \
 							--build-arg PYTORCH_VERSION=$(PYTORCH_VERSION) \
-							--build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL)
+							--build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) \
+							--build-arg TRITON_VERSION=$(TRITON_VERSION)
 EXTRA_DOCKER_BUILD_FLAGS ?=
 
 BUILD                    ?= build

From 22d2cae5ed003ca5aeec2b0e42ead0f8a7c9bdc5 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Thu, 10 Nov 2022 00:26:58 +0000
Subject: [PATCH 0725/1922] Fix dynamo dashboard passrate denominator (#88777)

Before the dashboard improvements, the passrate table looked like this:
~~~
+------------------------+------------+-------------+-------------+
|        Compiler        | torchbench | huggingface | timm_models |
+------------------------+------------+-------------+-------------+
|         eager          | 98%, 54/55 | 100%, 43/43 | 100%, 61/61 |
|       aot_eager        | 95%, 52/55 | 100%, 43/43 | 97%, 59/61  |
|     aot_cudagraphs     | 75%, 41/55 | 49%, 21/43  | 38%, 23/61  |
|    nvprims_nvfuser     | 71%, 39/55 |  16%, 7/43  | 48%, 29/61  |
|        inductor        | 87%, 48/55 | 93%, 40/43  | 95%, 58/61  |
| inductor_no_cudagraphs | 93%, 51/55 | 93%, 40/43  | 95%, 58/61  |
+------------------------+------------+-------------+-------------+
~~~
After the change, the table looked like:
~~~
+------------------------+------------+-------------+-------------+
|        Compiler        | torchbench | huggingface | timm_models |
+------------------------+------------+-------------+-------------+
|         eager          | 82%, 53/65 | 84%, 43/51  | 82%, 61/74  |
|       aot_eager        | 83%, 54/65 | 84%, 43/51  | 82%, 61/74  |
|     aot_cudagraphs     | 69%, 45/65 | 65%, 33/51  | 38%, 28/74  |
|    nvprims_nvfuser     | 48%, 31/65 | 78%, 40/51  | 26%, 19/74  |
|        inductor        | 75%, 49/65 | 82%, 42/51  | 81%, 60/74  |
| inductor_no_cudagraphs | 82%, 53/65 | 82%, 42/51  | 82%, 61/74  |
+------------------------+------------+-------------+-------------+
~~~
There is no actual regression, but the passrate is lower since the denominator is wrong. Check fix by running locally (e.g. `python benchmarks/dynamo/runner.py --output-dir ../test-dynamo-runner-logs-5 --training --visualize_logs`) and comparing passrate table output to previously correct one.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88777
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index f29877d64a720..90f30a286ec4f 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -543,12 +543,12 @@ def extract_df(self, metric, testing):
                     for compiler in self.compilers:
                         if not perf_row.empty:
                             if acc_row.empty:
-                                perf_row.loc[0, compiler] = 0.0
+                                perf_row[compiler] = 0.0
                             elif acc_row[compiler].iloc[0] not in (
                                 "pass",
                                 "pass_due_to_skip",
                             ):
-                                perf_row.loc[0, compiler] = 0.0
+                                perf_row[compiler] = 0.0
                     perf_rows.append(perf_row)
                 df = pd.concat(perf_rows)
             df = df.sort_values(by=list(reversed(self.compilers)), ascending=False)

From c3643d982053f8eb3c43ba7e69cae187a266c71c Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Thu, 10 Nov 2022 00:27:59 +0000
Subject: [PATCH 0726/1922] disable test that times out in fbcode (#88758)

Test Plan: Rely on CI.

Differential Revision: D41162966

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88758
Approved by: https://github.com/zou3519
---
 test/test_xnnpack_integration.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py
index 9e510d1715b10..17ac2d9e7fc3a 100644
--- a/test/test_xnnpack_integration.py
+++ b/test/test_xnnpack_integration.py
@@ -14,7 +14,7 @@
 import io
 import itertools
 
-from torch.testing._internal.common_utils import TEST_WITH_TSAN
+from torch.testing._internal.common_utils import IS_FBCODE, TEST_WITH_TSAN
 
 @unittest.skipUnless(torch.backends.xnnpack.enabled,
                      " XNNPACK must be enabled for these tests."
@@ -987,6 +987,7 @@ def validate_transform_conv1d_to_conv2d(
             torch.testing.assert_close(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)
 
 
+    @unittest.skipIf(IS_FBCODE, "T137513244")
     def test_conv1d_basic(self):
         batch_size_list = range(1, 3)
         input_channels_per_group_list = range(10, 12)

From fc321837cae0bd0bad134f16f71479f1c73d28ca Mon Sep 17 00:00:00 2001
From: Eddie Yan <eddiey@nvidia.com>
Date: Thu, 10 Nov 2022 00:49:07 +0000
Subject: [PATCH 0727/1922] [cuDNN][cuDNN V8 API] Match V7 API behavior for
 `channels_last` stride coercion for cuDNN (#88699)

For ConvNeXt failure in https://github.com/pytorch/torchdynamo/issues/1833

cuDNN V7 has some stride "fixing" code to coerce cuDNN to use channels-last in cases when allowed by size 1 strides that was omitted in V8, which seems to seems to lead to performance regressions. This PR patches in the same fix for V8.

CC @ngimel @ptrblck
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88699
Approved by: https://github.com/ngimel
---
 aten/src/ATen/cudnn/Descriptors.h      |  5 +++--
 aten/src/ATen/native/cudnn/Conv_v8.cpp | 31 ++++++++++++++------------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h
index a393548bd4d3f..e111987785cc5 100644
--- a/aten/src/ATen/cudnn/Descriptors.h
+++ b/aten/src/ATen/cudnn/Descriptors.h
@@ -46,7 +46,8 @@ inline int dataSize(cudnnDataType_t dataType)
 // that the stride for dim i is the product of the sizes of dims
 // i+1 to the end.  This stride is indeed uniquely determined.  This
 // function modifies 'stride' in place so this invariant holds.
-static inline void fixSizeOneDimStride(int dim, const int *size, int *stride, bool nhwc) {
+template <typename T>
+static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) {
   int64_t z = 1;
   int index = 0;
   std::vector<int> permutation(dim);
@@ -150,7 +151,7 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
   void set(cudnnDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc);
 
   void set(cudnnDataType_t dataType, int dim, int* size, int* stride, bool nhwc) {
-    fixSizeOneDimStride(dim, size, stride, nhwc);
+    fixSizeOneDimStride<int>(dim, size, stride, nhwc);
     AT_CUDNN_CHECK(cudnnSetTensorNdDescriptor(mut_desc(), dataType, dim, size, stride));
   }
 };
diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp
index 17834e9df173a..11fe5be8298e1 100644
--- a/aten/src/ATen/native/cudnn/Conv_v8.cpp
+++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp
@@ -54,9 +54,12 @@ uint8_t getAlignment(const Tensor &t) {
   return alignment;
 }
 
-cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const int64_t id, const uint8_t alignment, const cudnnDataType_t dataType, const bool _virtual) {
+cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const int64_t id, const uint8_t alignment, const cudnnDataType_t dataType, const at::MemoryFormat memory_format, const bool _virtual) {
   auto sizes = t.sizes();
   auto strides = t.strides();
+  bool channels_last = memory_format == at::MemoryFormat::ChannelsLast ||
+    memory_format == at::MemoryFormat::ChannelsLast3d;
+  fixSizeOneDimStride<int64_t>(sizes.size(), &sizes[0], (int64_t *) &strides[0], channels_last);
   auto r = cudnn_frontend::TensorBuilder()
     .setDim(sizes.size(), sizes.data())
     .setStrides(strides.size(), strides.data())
@@ -68,8 +71,8 @@ cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const
   return r;
 }
 
-cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, const int64_t id, const uint8_t alignment) {
-  return getTensorDescriptorWithTypeVirtual(t, id, alignment, getCudnnDataType(t), false);
+cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, const int64_t id, const uint8_t alignment, const at::MemoryFormat memory_format) {
+  return getTensorDescriptorWithTypeVirtual(t, id, alignment, getCudnnDataType(t), memory_format, false);
 }
 
 cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, const at::ScalarType scalar_type) {
@@ -209,9 +212,9 @@ void run_conv_plan_fused(cudnnHandle_t handle, const Tensor& x, const Tensor& y,
 
 auto build_opgraph(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKey& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation) {
   auto op = cudnn_frontend::OperationBuilder(desc)
-      .setxDesc(getTensorDescriptor(x, 'x', key.x_alignment))
-      .setyDesc(getTensorDescriptor(y, 'y', key.y_alignment))
-      .setwDesc(getTensorDescriptor(w, 'w', key.w_alignment))
+      .setxDesc(getTensorDescriptor(x, 'x', key.x_alignment, key.params.memory_format))
+      .setyDesc(getTensorDescriptor(y, 'y', key.y_alignment, key.params.memory_format))
+      .setwDesc(getTensorDescriptor(w, 'w', key.w_alignment, key.params.memory_format))
       .setcDesc(getConvDescriptor(key.params.dataType, padding, stride, dilation, x.scalar_type()))
       .build();
   std::array<cudnn_frontend::Operation const *, 1> ops = {&op};
@@ -241,33 +244,33 @@ auto build_opgraph_fused(const cudnnHandle_t handle, const Tensor & x, const Ten
   const float alpha1 = 1.0;
   const float alpha2 = alpha;
   auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
-                   .setxDesc(getTensorDescriptor(x, 'x', key.x_alignment))
+                   .setxDesc(getTensorDescriptor(x, 'x', key.x_alignment, key.params.memory_format))
                    // virtual output of conv
-                   .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'C', key.y_alignment, precision, true))
-                   .setwDesc(getTensorDescriptor(w, 'w', key.w_alignment))
+                   .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'C', key.y_alignment, precision, key.params.memory_format, true))
+                   .setwDesc(getTensorDescriptor(w, 'w', key.w_alignment, key.params.memory_format))
                    .setAlpha(alpha1)
                    .setcDesc(convDesc)
                    .build();
   auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(conv_op.getOutputTensor())
-                           .setbDesc(getTensorDescriptor(z, 'z', key.z_alignment))
+                           .setbDesc(getTensorDescriptor(z, 'z', key.z_alignment, key.params.memory_format))
                            // another virtual output (of add)
-                           .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'A', key.y_alignment, precision, true))
+                           .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'A', key.y_alignment, precision, key.params.memory_format, true))
                            .setpwDesc(addDesc)
                            .setAlpha(alpha1)
                            .setAlpha2(alpha2)
                            .build();
   auto add_bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                            .setxDesc(add_op.getOutputTensor())
-                           .setbDesc(getTensorDescriptor(b, 'b', key.b_alignment))
+                           .setbDesc(getTensorDescriptor(b, 'b', key.b_alignment, key.params.memory_format))
                            // another virtual output (of add bias)
-                           .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'B', key.y_alignment, precision, true))
+                           .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'B', key.y_alignment, precision, key.params.memory_format, true))
                            .setpwDesc(addBiasDesc)
                            .build();
   auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
                           .setxDesc(add_bias_op.getOutputTensor())
                           // final output is in original datatype
-                          .setyDesc(getTensorDescriptor(y, 'y', key.y_alignment))
+                          .setyDesc(getTensorDescriptor(y, 'y', key.y_alignment, key.params.memory_format))
                           .setpwDesc(actDesc)
                           .build();
   std::array<cudnn_frontend::Operation const*, 4> ops = {&conv_op, &add_op, &add_bias_op, &act_op};

From ff8afa301dfac3baa4bb51c8051d6d4f14797760 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Wed, 9 Nov 2022 14:21:21 -0800
Subject: [PATCH 0728/1922] add stride constraints to fallbacks (#88534)

Add stride/contiguity constraints to fallbacks so that inputs will be in the right stride permutation for the fallback kernel.

Improves perf of coat_lite_mini from 1.48415536054865 -> 2.010956856330101.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88534
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py |  39 +++++++++
 torch/_inductor/ir.py               | 129 ++++++++++++++--------------
 torch/_inductor/lowering.py         | 105 +++++++++++++++++++---
 torch/_inductor/utils.py            |   4 +
 4 files changed, 198 insertions(+), 79 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index d001f7b0c2181..db6c5dfc2bd15 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4333,6 +4333,45 @@ class CpuTests(TestCase):
     CommonTemplate.install(CpuTests, "cpu")
 
     class CPUReproTests(TestCase):
+        def test_conv_stride_constraints(self):
+            for fmt in [torch.channels_last, torch.contiguous_format]:
+                # TorchDispatch doesn't work in our cuda invocation for some reason
+                m = torch.nn.Conv2d(5, 6, [3, 3])
+
+                def fn(inp, weight):
+                    return (
+                        F.conv2d(
+                            inp, weight, None, m.stride, m.padding, m.dilation, m.groups
+                        ),
+                    )
+
+                inp = torch.randn([2, 5, 16, 16])
+                inps = [inp, m.weight.to(memory_format=fmt)]
+                fn_fx = make_fx(fn)(*inps)
+                fn_compiled = compile_fx_inner(fn_fx, inps)
+                test_self = self
+                conv_seen = False
+
+                class RecordFunctions(TorchDispatchMode):
+                    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+                        kwargs = kwargs if kwargs else {}
+                        if func == torch.ops.aten.convolution.default:
+                            test_self.assertTrue(
+                                args[0].is_contiguous(memory_format=fmt)
+                            )
+                            test_self.assertTrue(
+                                args[1].is_contiguous(memory_format=fmt)
+                            )
+                            nonlocal conv_seen
+                            conv_seen = True
+
+                        return func(*args, **kwargs)
+
+                with RecordFunctions():
+                    out = fn_compiled(inps)
+
+                self.assertTrue(conv_seen)
+
         def test_inplace_squeeze_needed(self):
             mod = torch.nn.Sequential(
                 torch.nn.Linear(10, 10),
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 3fc61b29fa062..629a8e94534d2 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -26,7 +26,14 @@
 from .codegen.common import index_prevent_reordering
 from .cuda_properties import get_device_properties
 from .dependencies import extract_read_writes, var_builder
-from .utils import cache_on_self, sympy_dot, sympy_product, sympy_subs, sympy_symbol
+from .utils import (
+    argsort,
+    cache_on_self,
+    sympy_dot,
+    sympy_product,
+    sympy_subs,
+    sympy_symbol,
+)
 from .virtualized import ops, V
 
 log = logging.getLogger(__name__)
@@ -70,6 +77,17 @@ def stride_order2fill_order(order):
     return fill_order
 
 
+def get_stride_order(seq):
+    """
+    Convert strides to stride order
+    """
+    sorted_idx = argsort(seq)
+    out = [None for _ in range(len(seq))]
+    for i, elem in enumerate(sorted_idx):
+        out[elem] = i
+    return out
+
+
 def reads_from_conv(buf, var_ranges):
     """
     return:
@@ -105,6 +123,25 @@ def reads_from_conv(buf, var_ranges):
     return False, None
 
 
+def ir_node_to_tensor(x, guard_shape=True):
+    shape_fn = (
+        V.graph.sizevars.guard_static_shape
+        if guard_shape
+        else V.graph.sizevars.size_hint
+    )
+    size = [shape_fn(s) for s in x.get_size()]
+    if is_storage_and_layout(x):
+        stride = [shape_fn(s) for s in x.get_layout().stride]
+    else:
+        stride = torch._prims_common.make_contiguous_strides_for(size)
+    dtype = x.get_dtype()
+    device = x.get_device()
+    t = torch.empty_strided(
+        size=size, stride=stride, dtype=dtype, device=device
+    ).zero_()
+    return t
+
+
 def layout_priority_idx(reads_bufs, memory_addrs, var_ranges):
     """
     if reads from conv that needs to use specific layout
@@ -2281,17 +2318,9 @@ def unflatten_args(new_tensor_args, new_non_tensor_args):
         # shapes and run an example input.
         # TODO(jansel): replace this with dynamic shape formulas
         example_args = []
+
         for x in tensor_args:
-            size = [V.graph.sizevars.guard_static_shape(s) for s in x.get_size()]
-            stride = [
-                V.graph.sizevars.guard_static_shape(s) for s in x.get_layout().stride
-            ]
-            dtype = x.get_dtype()
-            device = x.get_device()
-            arg = torch.empty_strided(
-                size=size, stride=stride, dtype=dtype, device=device
-            ).zero_()
-            example_args.append(arg)
+            example_args.append(ir_node_to_tensor(x, guard_shape=True))
 
         new_args, new_kwargs = unflatten_args(example_args, non_tensor_args)
         example_output = kernel(*new_args, **new_kwargs)
@@ -2373,33 +2402,39 @@ def realize_input(cls, x):
 
     @classmethod
     def require_stride1(cls, x):
-        if len(x.get_stride()) == 0:
-            return x
-        for stride in x.get_stride():
-            if stride == 1:
+        if is_storage_and_layout(x):
+            if len(x.get_stride()) == 0:
                 return x
+            for stride in x.get_stride():
+                if stride == 1:
+                    return x
         return cls.copy_input(x)
 
     @classmethod
     def require_stride_order(cls, x, order):
         # require x to have the layout as strided_ordered as order
-        if isinstance(
-            x.get_layout(), FlexibleLayout
-        ) and is_stride_order_storage_and_layout(x, order):
-            # fix flexiblelayout to be FixedLayout with stride_order
-            as_storage_and_layout(
-                x, freeze=True, want_contiguous=False, stride_order=order
-            )
-            return x
-        elif isinstance(x.get_layout(), FixedLayout) and x.layout.is_stride_ordered(
-            order
-        ):
-            return x
+        if is_storage_and_layout(x):
+            if isinstance(
+                x.get_layout(), FlexibleLayout
+            ) and is_stride_order_storage_and_layout(x, order):
+                # fix flexiblelayout to be FixedLayout with stride_order
+                as_storage_and_layout(
+                    x, freeze=True, want_contiguous=False, stride_order=order
+                )
+                return x
+            elif isinstance(
+                x.get_layout(), FixedLayout
+            ) and x.get_layout().is_stride_ordered(order):
+                return x
         x = cls.copy_input(x)
         as_storage_and_layout(x, freeze=True, want_contiguous=False, stride_order=order)
         assert is_stride_order_storage_and_layout(x, order)
         return x
 
+    @classmethod
+    def require_contiguous(cls, x):
+        return cls.require_stride_order(x, list(reversed(range(len(x.get_size())))))
+
     def apply_constraint(self):
         pass
 
@@ -2815,43 +2850,6 @@ def get_reads(self):
         return ()
 
 
-class AdaptiveAvgPool2d(ExternKernelAlloc):
-    kernel = "aten._adaptive_avg_pool2d"
-
-    @classmethod
-    def create(cls, x, target_size):
-        # x = cls.require_stride1(cls.realize_input(x))
-        x = cls.realize_input(x)
-        output_size = [
-            *x.get_size()[: -len(target_size)],
-            *map(sympy.Integer, target_size),
-        ]
-        # contigouse stride order
-        stride_order = list(reversed(range(len(output_size))))
-        return cls(
-            FlexibleLayout(
-                x.get_device(),
-                x.get_dtype(),
-                output_size,
-                # TODO(jansel): fix channels last case
-                # FlexibleLayout.contiguous_strides(output_size),
-                stride_order,
-            ),
-            (x,),
-            (tuple(target_size),),
-        )
-
-    def apply_constraint(self):
-        x = self.inputs[0]
-        if isinstance(x.get_layout(), FixedLayout):
-            # fix self's layout to be the same order as x
-            self.freeze_layout_with_same_order(x.get_layout().stride)
-        else:
-            x = self.require_stride_order(x, self.layout.preferred_stride_order)
-            self.inputs[0] = x
-            self.freeze_layout_with_stride_order(self.layout.preferred_stride_order)
-
-
 @dataclasses.dataclass
 class FallbackKernel(ExternKernelAlloc):
     def __init__(
@@ -3018,8 +3016,9 @@ def create(
         output_padding_: List[int],
         groups: int,
     ):
-        x = cls.require_stride1(cls.realize_input(x))
+
         weight = cls.require_stride1(cls.realize_input(weight))
+        x = cls.require_stride_order(x, get_stride_order(weight.get_stride()))
         stride = tuple(stride_)
         padding = tuple(padding_)
         dilation = tuple(dilation_)
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 694d2939285de..0ede696828a32 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -23,6 +23,7 @@
 from .decomposition import decompositions, get_decompositions
 from .ir import (
     ExpandView,
+    get_stride_order,
     IndexingConstant,
     IndexingDiv,
     PermuteView,
@@ -962,10 +963,12 @@ def linear_binary(x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr):
 register_onednn_fusion_ops()
 
 
-def fallback_handler(kernel):
+def fallback_handler(kernel, inps_hook=None):
     fallbacks.add(kernel)
 
     def handler(*args, **kwargs):
+        if inps_hook is not None:
+            args, kwargs = inps_hook(*args, **kwargs)
         return pytree.tree_map(
             TensorBox.create, ir.FallbackKernel.create(kernel, *args, **kwargs)
         )
@@ -973,7 +976,7 @@ def handler(*args, **kwargs):
     return handler
 
 
-def make_fallback(kernel):
+def make_fallback(kernel, inps_hook=None):
     assert (
         kernel not in decompositions
     ), f"both a fallback and a decomp for same kernel: {kernel}"
@@ -983,7 +986,9 @@ def make_fallback(kernel):
         )
 
     add_needs_realized_inputs(kernel)
-    return register_lowering(kernel, type_promotion_kind=None)(fallback_handler(kernel))
+    return register_lowering(kernel, type_promotion_kind=None)(
+        fallback_handler(kernel, inps_hook)
+    )
 
 
 @register_lowering(aten.native_dropout, type_promotion_kind=None)
@@ -1134,29 +1139,101 @@ def inner_fn(index):
     )
 
 
+def conv_backward(*args, **kwargs):
+    # output striding complex and has a lot of build dependent options,
+    # take the output strides to determine what to set the inputs
+    with torch._subclasses.FakeTensorMode():
+        args_fake, kwargs_fake = pytree.tree_map_only(
+            ir.IRNode,
+            lambda t: ir.ir_node_to_tensor(t, guard_shape=False),
+            (args, kwargs),
+        )
+        output = aten.convolution_backward(*args_fake, **kwargs_fake)
+
+    def constraints(
+        grad_output,
+        input,
+        weight,
+        bias_sizes,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+        output_mask,
+    ):
+        out = (
+            output[0]
+            if output[0] is not None
+            else output[1]
+            if output[1] is not None
+            else output[2]
+        )
+        if out is not None:
+            stride_order = get_stride_order(out.stride())
+            grad_output = ir.ExternKernel.require_stride_order(
+                grad_output, stride_order
+            )
+            weight = ir.ExternKernel.require_stride_order(weight, stride_order)
+            # Only make input contiguous when it is necessary for the backwards computation
+            if output_mask[1]:
+                input = ir.ExternKernel.require_stride_order(input, stride_order)
+
+        return (
+            grad_output,
+            input,
+            weight,
+            bias_sizes,
+            stride,
+            padding,
+            dilation,
+            transposed,
+            output_padding,
+            groups,
+            output_mask,
+        ), {}
+
+    return constraints(*args, **kwargs)
+
+
+def require_dense(*args, **kwargs):
+    args, kwargs = pytree.tree_map_only(
+        ir.IRNode, lambda t: ir.ExternKernel.require_stride1(t), (args, kwargs)
+    )
+    return args, kwargs
+
+
+def require_contiguous(*args, **kwargs):
+    args, kwargs = pytree.tree_map_only(
+        ir.IRNode, lambda t: ir.ExternKernel.require_contiguous(t), (args, kwargs)
+    )
+    return args, kwargs
+
+
 if has_torchvision_roi_align():
     make_fallback(torch.ops.torchvision.roi_align)
 
 # TODO(jansel): we should implement decomps or lowerings for these
 # https://github.com/pytorch/torchdynamo/issues/327
-make_fallback(aten._adaptive_avg_pool2d_backward)
-make_fallback(aten.convolution_backward)
-make_fallback(aten._cudnn_rnn)
-make_fallback(aten._cudnn_rnn_backward)
-make_fallback(aten.cumsum)
-make_fallback(aten._embedding_bag)
-make_fallback(aten._embedding_bag_forward_only)
+make_fallback(aten._adaptive_avg_pool2d_backward, require_dense)
+make_fallback(aten.convolution_backward, inps_hook=conv_backward)
+make_fallback(aten._cudnn_rnn, require_dense)
+make_fallback(aten._cudnn_rnn_backward, inps_hook=require_contiguous)
+make_fallback(aten.cumsum, inps_hook=require_dense)
+make_fallback(aten._embedding_bag, inps_hook=require_contiguous)
+make_fallback(aten._embedding_bag_forward_only, inps_hook=require_contiguous)
 make_fallback(aten._fused_moving_avg_obs_fq_helper)
 make_fallback(aten._fused_moving_avg_obs_fq_helper_functional)
-make_fallback(aten.grid_sampler_2d_backward)
+make_fallback(aten.grid_sampler_2d_backward, inps_hook=require_dense)
 make_fallback(aten.randperm)
 make_fallback(aten.sort)
 make_fallback(aten.sort.stable)
 make_fallback(aten._sparse_coo_tensor_with_dims_and_tensors)
-make_fallback(aten._thnn_fused_lstm_cell)
+make_fallback(aten._thnn_fused_lstm_cell, inps_hook=require_dense)
 make_fallback(aten.topk)
-make_fallback(aten.upsample_bicubic2d_backward)
-make_fallback(aten.upsample_bilinear2d_backward)
+make_fallback(aten.upsample_bicubic2d_backward, inps_hook=require_contiguous)
+make_fallback(aten.upsample_bilinear2d_backward, inps_hook=require_dense)
 
 
 @register_lowering(aten.convolution)
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index 60bba3dcf7dc3..829fbd2897d5f 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -267,3 +267,7 @@ def fresh_inductor_cache(cache_entries=None):
                                 if ".lock" not in f
                             }
                         )
+
+
+def argsort(seq):
+    return sorted(range(len(seq)), key=seq.__getitem__)

From f217df4ca4b54ebf191aab9ba7c03d12a40a2432 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Thu, 10 Nov 2022 01:45:52 +0000
Subject: [PATCH 0729/1922] Add absolute latency to dashboard (#88790)

Add absolute latency to dashboard, as requested by https://github.com/pytorch/torchdynamo/issues/1833#issuecomment-1302742914

Tested by setting `run.sh` to
```
# Setup the output directory
rm -rf ../test-dynamo-runner-logs-7/
mkdir ../test-dynamo-runner-logs-7/

# Commands for torchbench for device=cuda, dtype=float32 for training and for performance testing
python benchmarks/dynamo/torchbench.py --performance --float32 -dcuda --output=../test-dynamo-runner-logs-7//inductor_torchbench_float32_training_cuda_performance.csv --training --inductor   --no-skip --dashboard --only mobilenet_v2 --cold_start_latency

# Commands for torchbench for device=cuda, dtype=float32 for training and for accuracy testing
python benchmarks/dynamo/torchbench.py --accuracy --float32 -dcuda --output=../test-dynamo-runner-logs-7//inductor_torchbench_float32_training_cuda_accuracy.csv --training --inductor   --no-skip --dashboard --only mobilenet_v2
```
and running `python benchmarks/dynamo/runner.py --output-dir ../test-dynamo-runner-logs-7/ --dashboard-archive-path /data/home/williamwen/dynamo-runner-logs-copy --training --run --compilers inductor --flag-compilers inductor --suites torchbench --update-dashboard`  (need to comment out the `generate_commands` line and change the github issue ID from 681 to something else).

Sample comment: https://github.com/pytorch/torchdynamo/issues/1831#issuecomment-1309645562

NOTE: this change breaks processing old logs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88790
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/common.py | 10 ++++++++--
 benchmarks/dynamo/runner.py | 13 ++++++++++++-
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 3ebed5cee43f6..758f4396b5b1b 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -411,8 +411,14 @@ def maybe_profile(*args, **kwargs):
             timings,
         )
 
-    headers = ("dev", "name", "batch_size", "speedup")
-    row = [current_device, current_name, current_batch_size, float(speedup)]
+    headers = ("dev", "name", "batch_size", "speedup", "abs_latency")
+    row = [
+        current_device,
+        current_name,
+        current_batch_size,
+        float(speedup),
+        median[1] * 1000,
+    ]
     if "compilation_latency" in kwargs:
         headers = headers + ("compilation_latency", "compression_ratio")
         row.append(kwargs["compilation_latency"])
diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 90f30a286ec4f..ea54bb9489621 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -441,7 +441,12 @@ def __init__(
         )
         self.parsed_frames = defaultdict(lambda: defaultdict(None))
         self.untouched_parsed_frames = defaultdict(lambda: defaultdict(None))
-        self.metrics = ["speedup", "compilation_latency", "compression_ratio"]
+        self.metrics = [
+            "speedup",
+            "abs_latency",
+            "compilation_latency",
+            "compression_ratio",
+        ]
         self.bottom_k = 50
         self.parse()
 
@@ -474,6 +479,7 @@ def read_csv(self, output_filename):
                     "name",
                     "batch_size",
                     "speedup",
+                    "abs_latency",
                     "compilation_latency",
                     "compression_ratio",
                 ],
@@ -511,6 +517,8 @@ def extract_df(self, metric, testing):
             for compiler in self.compilers:
                 output_filename = f"{self.output_dir}/{compiler}_{suite}_{dtype}_{self.mode}_{device}_{testing}.csv"
                 df = self.read_csv(output_filename)
+                if metric not in df:
+                    df.insert(len(df.columns), metric, np.nan)
                 df = df[["dev", "name", "batch_size", metric]]
                 df.rename(columns={metric: compiler}, inplace=True)
                 df["batch_size"] = df["batch_size"].astype(int)
@@ -693,6 +701,8 @@ def get_metric_title(self, metric):
             return "Compilation latency (sec)"
         elif metric == "compression_ratio":
             return "Peak Memory Compression Ratio"
+        elif metric == "abs_latency":
+            return "Absolute latency (ms)"
         raise RuntimeError("unknown metric")
 
     def generate_warnings(self):
@@ -729,6 +739,7 @@ def prepare_message(self, suite):
             "accuracy",
             "compilation_latency",
             "compression_ratio",
+            "abs_latency",
         ]:
             df = self.untouched_parsed_frames[suite][metric]
             df = df.drop("dev", axis=1)

From 32f2721581584b91b6ec10ea03443ed56a5ef567 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Thu, 10 Nov 2022 01:48:04 +0000
Subject: [PATCH 0730/1922] Do not flag models in dashboard due to NaN values
 (#88792)

Title.

Tested by running `python benchmarks/dynamo/runner.py --output-dir ../test-dynamo-runner-logs-4 --training --visualize_logs` on a copy of a recent set of logs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88792
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index ea54bb9489621..9c0538368b449 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -121,15 +121,15 @@
 
 
 def flag_speedup(x):
-    return pd.isna(x) or x < 0.95
+    return x < 0.95
 
 
 def flag_compilation_latency(x):
-    return pd.isna(x) or x == 0 or x > 120
+    return x > 120
 
 
 def flag_compression_ratio(x):
-    return pd.isna(x) or x < 0.9
+    return x < 0.9
 
 
 FLAG_FNS = {

From 0ff19b0e38e8f07966f689bf7131c99888e5d11a Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Thu, 10 Nov 2022 01:57:17 +0000
Subject: [PATCH 0731/1922] [dynamo] fixes dict changed during runtime error 
 (#87526)

Fixes https://github.com/pytorch/torchdynamo/issues/1744

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87526
Approved by: https://github.com/ezyang
---
 test/dynamo/test_aot_cudagraphs.py |  3 ---
 torch/_dynamo/convert_frame.py     | 15 ++++++++++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_aot_cudagraphs.py
index cb1d2a0e601ff..fdb7c88762b8b 100644
--- a/test/dynamo/test_aot_cudagraphs.py
+++ b/test/dynamo/test_aot_cudagraphs.py
@@ -71,7 +71,6 @@ def fn(x, y):
         y = torch.randn(3, device="cuda")
         fn(x, y)
 
-    @patch("torch._dynamo.config.suppress_errors", True)
     @patch_all()
     def test_dtoh(self):
         def model(x, y):
@@ -105,7 +104,6 @@ def fn(x, y):
         y = torch.randn((), device="cpu")
         fn(x, y)
 
-    @patch("torch._dynamo.config.suppress_errors", True)
     @patch("functorch._src.config.use_functionalize", True)
     @patch_all(ok=False)  # input mutation not supported yet
     def test_mutate_input(self):
@@ -145,7 +143,6 @@ def fn(x, y):
         y = torch.randn(1, device="cuda")
         fn(x, y)
 
-    @patch("torch._dynamo.config.suppress_errors", True)
     @patch_all()
     def test_factory(self):
         def model(y):
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index f1ce83727a19f..db9b23f2da7e3 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -156,7 +156,11 @@ def has_tensor(obj):
             seen_ids[obj_id] = any([has_tensor(v) for v in obj])
             return seen_ids[obj_id]
         elif istype(obj, dict):
-            seen_ids[obj_id] = any([has_tensor(v) for v in obj.values()])
+            # Some packages like pytest can be updated during runtime. So, make a
+            # copy of values to avoid issues like "RuntimeError: dictionary
+            # changed size during iteration"
+            values = list(obj.values())
+            seen_ids[obj_id] = any([has_tensor(v) for v in values])
             return seen_ids[obj_id]
         elif istype(obj, (str, int, float, type(None), bool)):
             seen_ids[obj_id] = False
@@ -164,8 +168,13 @@ def has_tensor(obj):
         elif is_namedtuple(obj):
             seen_ids[obj_id] = any([has_tensor(getattr(obj, v)) for v in obj._fields])
             return seen_ids[obj_id]
-        elif not is_allowed(obj) and hasattr(obj, "__dict__") and len(obj.__dict__):
-            seen_ids[obj_id] = any([has_tensor(v) for v in obj.__dict__.values()])
+        elif (
+            not is_allowed(obj)
+            and not hasattr(obj, "__get__")  # overridden get can mutate the object
+            and hasattr(obj, "__dict__")
+            and istype(obj.__dict__, dict)
+        ):
+            seen_ids[obj_id] = has_tensor(obj.__dict__)
             return seen_ids[obj_id]
         else:
             # if config.debug:

From 716550332accc70972c48ac5ba3096c1bd2dd874 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 10 Nov 2022 03:04:57 +0000
Subject: [PATCH 0732/1922] [vision hash update] update the pinned vision hash
 (#88742)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88742
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index b985bb4d5e300..d8180093d8859 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-bf58902b2fd881c760cd2eeacfae2d7c468ebf1f
+ffd5a567eb90abf6b5555063da434d3c130d540f

From 7f53c699c46d2b699ef06e03b72a92c6036ac5f5 Mon Sep 17 00:00:00 2001
From: efiks <5167930+efiks@users.noreply.github.com>
Date: Thu, 10 Nov 2022 06:11:05 +0000
Subject: [PATCH 0733/1922] [caffe2][tourch] Optimize BatchBoxCox (#87585)

Differential Revision: D40215424

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87585
Approved by: https://github.com/hyuen
---
 caffe2/perfkernels/batch_box_cox_avx2.cc | 117 ++++++++++++++++++++---
 caffe2/perfkernels/lstm_unit_cpu-impl.h  |  22 +----
 caffe2/perfkernels/vectorizer.h          |  28 ++++++
 3 files changed, 131 insertions(+), 36 deletions(-)
 create mode 100644 caffe2/perfkernels/vectorizer.h

diff --git a/caffe2/perfkernels/batch_box_cox_avx2.cc b/caffe2/perfkernels/batch_box_cox_avx2.cc
index cf0801b4733ef..8b93293646dba 100644
--- a/caffe2/perfkernels/batch_box_cox_avx2.cc
+++ b/caffe2/perfkernels/batch_box_cox_avx2.cc
@@ -3,6 +3,35 @@
 #include <caffe2/perfkernels/common.h>
 #include <folly/SingletonThreadLocal.h>
 
+#include "vectorizer.h"
+
+#ifndef VECTORIZED_KERNEL
+#define CPU_CAPABILITY_AVX2
+#include <ATen/cpu/vec/vec.h>
+
+namespace at::vec {
+
+template <typename scalar_t>
+Vectorized<scalar_t> max(const Vectorized<scalar_t>& a, const Vectorized<scalar_t>& b);
+
+// Implements the vectorized version of std::max() operation,
+// which DOESNOT propagates NaN for second argument
+template <>
+Vectorized<double> max(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // std::max(NaN, nonNan) -> NaN
+  return _mm256_max_pd(b, a);
+}
+
+
+template <>
+Vectorized<float> max(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // std::max(NaN, nonNan) -> NaN
+  return _mm256_max_ps(b, a);
+}
+
+}
+#endif
+
 #include <cstdint>
 #include <cmath>
 #include <vector>
@@ -65,6 +94,7 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, Ln, vsLn)
 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Ln, vdLn)
 #undef DELEGATE_SIMPLE_UNARY_FUNCTION
 
+#ifndef VECTORIZED_KERNEL
 template <typename T>
 void box_cox_zero_lambda(
     size_t D,
@@ -72,36 +102,93 @@ void box_cox_zero_lambda(
     const T* const lambda2_data,
     T k_eps,
     T* const output_data) {
-  Add(D, self_data, lambda2_data, output_data);
-  for (const auto j : c10::irange(D)) {
-    output_data[j] = std::max(output_data[j], k_eps);
+  int j = 0;
+  using Vec = at::vec::Vectorized<T>;
+  constexpr int64_t VLEN = Vec::size();
+  auto k_eps_vec = Vec(k_eps);
+  for(; j + VLEN < D; j += VLEN) {
+    auto data = Vec::loadu(self_data + j);
+    auto lambda2 = Vec::loadu(lambda2_data + j);
+    auto sum = data + lambda2;
+    auto max = at::vec::max(sum, k_eps_vec);
+    auto res = max.log();
+    res.store(output_data + j);
+  }
+  for ( ;j < D; ++j) {
+    auto sum = self_data[j] + lambda2_data[j];
+    auto max = std::max(sum, k_eps);
+    output_data[j] = std::log(max);
   }
-
-  Ln(D, output_data, output_data);
 }
 
 template <typename T>
 void box_cox_nonzero_lambda(
+    int64_t D,
+    const T* data_ptr,
+    const T* lambda1_ptr,
+    const T* lambda2_ptr,
+    T k_eps,
+    T* out) {
+
+  int j = 0;
+  using Vec = at::vec::Vectorized<T>;
+  constexpr int64_t VLEN = Vec::size();
+  auto k_eps_vec = Vec(k_eps);
+  for(; j + VLEN < D; j += VLEN) {
+    auto data = Vec::loadu(data_ptr + j);
+    auto lambda2 = Vec::loadu(lambda2_ptr + j);
+    auto sum = data + lambda2;
+    auto max = at::vec::max(sum, k_eps_vec);
+    auto lambda1 = Vec::loadu(lambda1_ptr + j);
+    auto lambda_over_1 = lambda1.reciprocal();
+    auto pow = max.pow(lambda1);
+    auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
+    res.store(out + j);
+  }
+  for ( ;j < D; ++j) {
+    auto sum = data_ptr[j] + lambda2_ptr[j];
+    auto max = std::max(sum, k_eps);
+    auto lambda_over_1 = 1 / lambda1_ptr[j];
+    auto pow = std::pow(max, lambda1_ptr[j]);
+    out[j] = pow * lambda_over_1 - lambda_over_1;
+  }
+}
+#else
+template <typename T>
+void box_cox_zero_lambda(
     size_t D,
     const T* const self_data,
-    const T* const lambda1_data,
     const T* const lambda2_data,
     T k_eps,
     T* const output_data) {
-  Add(D, self_data, lambda2_data, output_data);
-  for (const auto j : c10::irange(D)) {
-    output_data[j] = std::max(output_data[j], k_eps);
+  VECTOR_LOOP for (auto j=0 ;j < D; ++j) {
+    auto sum = self_data[j] + lambda2_data[j];
+    auto max = std::max(sum, k_eps);
+    output_data[j] = std::log(max);
   }
+}
 
-  // output = output ^ lambda1
-  Pow(D, output_data, lambda1_data, output_data);
-  // output = (output  - 1)/ lambda1
-  for (const auto j : c10::irange(D)) {
-    output_data[j] -= 1.0;
+template <typename T>
+void box_cox_nonzero_lambda(
+    int64_t D,
+    const T* data_ptr,
+    const T* lambda1_ptr,
+    const T* lambda2_ptr,
+    T k_eps,
+    T* out) {
+
+  VECTOR_LOOP for (auto j=0 ;j < D; ++j) {
+    FAST_MATH
+    auto sum = data_ptr[j] + lambda2_ptr[j];
+    auto max = std::max(sum, k_eps);
+    auto lambda_over_1 = 1 / lambda1_ptr[j];
+    auto pow = std::pow(max, lambda1_ptr[j]);
+    out[j] = pow * lambda_over_1 - lambda_over_1;
   }
-  Div(D, output_data, lambda1_data, output_data);
 }
 
+#endif
+
 template <typename T>
 void box_cox_mixed_lambda(
     const T* const self_data,
diff --git a/caffe2/perfkernels/lstm_unit_cpu-impl.h b/caffe2/perfkernels/lstm_unit_cpu-impl.h
index 5e76e1aa39fe5..239d2807f7788 100644
--- a/caffe2/perfkernels/lstm_unit_cpu-impl.h
+++ b/caffe2/perfkernels/lstm_unit_cpu-impl.h
@@ -5,27 +5,7 @@
 #include "c10/util/irange.h"
 #include "caffe2/utils/conversions.h"
 
-#if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG)
-#if defined(__clang__) && (__clang_major__ > 7)
-#define IS_SANITIZER                          \
-  ((__has_feature(address_sanitizer) == 1) || \
-   (__has_feature(memory_sanitizer) == 1) ||  \
-   (__has_feature(thread_sanitizer) == 1) ||  \
-   (__has_feature(undefined_sanitizer) == 1))
-
-#if IS_SANITIZER == 0
-#define VECTOR_LOOP _Pragma("clang loop vectorize(enable)")
-#endif
-#elif defined(_OPENMP) && (_OPENMP >= 201511)
-// Support with OpenMP4.5 and above
-#define VECTOR_LOOP _Pragma("omp for simd")
-#endif
-#endif
-
-#ifndef VECTOR_LOOP
-// Not supported
-#define VECTOR_LOOP
-#endif
+#include "vectorizer.h"
 
 namespace caffe2 {
 namespace perfkernels {
diff --git a/caffe2/perfkernels/vectorizer.h b/caffe2/perfkernels/vectorizer.h
new file mode 100644
index 0000000000000..be4e6bbc280f0
--- /dev/null
+++ b/caffe2/perfkernels/vectorizer.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG)
+#if defined(__clang__) && (__clang_major__ > 7)
+#define IS_SANITIZER                          \
+  ((__has_feature(address_sanitizer) == 1) || \
+   (__has_feature(memory_sanitizer) == 1) ||  \
+   (__has_feature(thread_sanitizer) == 1) ||  \
+   (__has_feature(undefined_sanitizer) == 1))
+
+#if IS_SANITIZER == 0
+#define VECTOR_LOOP _Pragma("clang loop vectorize(enable)")
+#define FAST_MATH _Pragma("clang fp contract(fast)")
+#define VECTORIZED_KERNEL 1
+#endif
+#elif defined(_OPENMP) && (_OPENMP >= 201511)
+// Support with OpenMP4.5 and above
+#define VECTOR_LOOP _Pragma("omp for simd")
+#define VECTORIZED_KERNEL 1
+#define FAST_MATH
+#endif
+#endif
+
+#ifndef VECTOR_LOOP
+// Not supported
+#define VECTOR_LOOP
+#define FAST_MATH
+#endif

From e2e970eee4f6a42c151a43ea8b6a1147bc42ca32 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grigorysizov@fb.com>
Date: Thu, 10 Nov 2022 08:12:56 +0000
Subject: [PATCH 0734/1922] Support src_mask and src_key_padding_mask for
 Better Transformer (#88488)

Fixes T135842750 (follow-up for #87377)

## Description

At present, having both `src_key_padding_mask` and `src_mask` at the same time is not supported on the fastpath in Transformer and Multi-Head Attention.

This PR enables using both masks on the fastpath on CPU and GPU: if both masks are passed, we merge them into a 4D mask in Python and change mask type to 2 before passing downstream.

Downstream processing in native code is not changed, as it already supports 4D mask. Indeed, it is done depending on the device:
- on CUDA, by `SoftMax.cu::masked_softmax_cuda`. When mask type is 2, it calls either `dispatch_softmax_forward` -> `softmax_warp_forward` or `at::softmax` (depending on the input size). In both cases 4D mask is supported.
- on CPU, by `SoftMax.cpp::masked_softmax_cpp`. It calls `hosted_softmax` which supports 4D mask.

## Tests
- Extended `test_mask_check_fastpath` to check that fast path is indeed taken in Transformer when two masks are passed
- Added `test_multihead_self_attn_two_masks_fast_path_mock` to check that fast path is taken in MHA when two masks are passed
- Added `test_multihead_self_attn_two_masks_fast_path` to check that fast and slow paths give the same result when two masks are passed in MHA
- `test_masked_softmax_mask_types` now covers mask type 2
- `test_transformerencoderlayer_fast_path` (CPU smoke test) is expanded to the case of both masks provided simultaneously
- `test_masked_softmax_devices_parity` checks that mask type 2 is accepted by CPU and CUDA paths

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88488
Approved by: https://github.com/mikekgfb
---
 test/test_nn.py                 | 132 ++++++++++++++++++++++++++++----
 test/test_transformers.py       |  31 ++++----
 torch/nn/modules/activation.py  |  48 ++++++++++--
 torch/nn/modules/transformer.py |  12 +--
 4 files changed, 182 insertions(+), 41 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index d2eac6a277d7e..b07793e79f48f 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -13122,10 +13122,10 @@ def _slow_masked_softmax(self, input, mask):
         s = exp.sum(dim=3, keepdim=True).expand(exp.size())
         return exp / s
 
-    def test_masked_softmax_mask_types_0_1(self, device):
-        # Test that mask type 0 (LxL attention mask) and mask type 1 (BxL padding mask)
-        # are processed correctly on the fast path and the results match explicit slow
-        # calculation.
+    def test_masked_softmax_mask_types(self, device):
+        # Test that mask type 0 (LxL attention mask), mask type 1 (BxL padding mask),
+        # and mask type 2 (generic BxHxLxL mask) are processed correctly on the
+        # fast path and the results match explicit slow calculation.
         sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]
 
         for (B, num_heads, L) in sizes:
@@ -13138,7 +13138,12 @@ def test_masked_softmax_mask_types_0_1(self, device):
             src_key_padding_mask_orig = torch.randint(0, 2, (B, L)).bool()
             src_key_padding_mask = src_key_padding_mask_orig.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool()
 
-            masks = [(src_mask_orig, src_mask, 0), (src_key_padding_mask_orig, src_key_padding_mask, 1)]
+            # mask_type == 2 =>  shape BxHxLxL
+            generic_mask = torch.randint(0, 2, (B, num_heads, L, L)).bool()
+            masks = [(src_mask_orig, src_mask, 0),
+                     (src_key_padding_mask_orig, src_key_padding_mask, 1),
+                     (generic_mask, generic_mask, 2)
+                     ]
             for dim in [0, 3]:
                 for mask_orig, mask, mask_type in masks:
                     if (self.device_type == "cuda") and (num_heads % 2) and (mask_type == 1):
@@ -13173,8 +13178,8 @@ def slow_masked_softmax(input, mask):
 
     @onlyCUDA
     def test_masked_softmax_devices_parity(self):
-        # Test that softmax with mask type 0 (LxL attention mask) and mask type 1 (BxL padding mask)
-        # gives the same result on CPU and on CUDA
+        # Test that softmax with mask type 0 (LxL attention mask), mask type 1 (BxL padding mask),
+        # and mask type 2 (BxHxLxL generic mask) gives the same result on CPU and on CUDA.
 
         sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]
         for (B, num_heads, L) in sizes:
@@ -13182,7 +13187,9 @@ def test_masked_softmax_devices_parity(self):
             src_mask = torch.randint(0, 2, (L, L)).bool()
             # mask_type == 1 => padding mask of shape BxL
             src_key_padding_mask = torch.randint(0, 2, (B, L)).bool()
-            masks = [(src_mask, 0), (src_key_padding_mask, 1)]
+            # mask_type == 2 => generic mask of shape BxHxLxL
+            generic_mask = torch.randint(0, 2, (B, num_heads, L, L)).bool()
+            masks = [(src_mask, 0), (src_key_padding_mask, 1), (generic_mask, 2)]
             input = torch.randn((B, num_heads, L, L))
             for dim in [0, 3]:
                 for mask, mask_type in masks:
@@ -13197,8 +13204,10 @@ def softmax_on_device(mask, input, device):
                         softmax_res = torch._masked_softmax(input_device, mask_device, dim, mask_type)
                         if mask_type == 0:
                             mask_expanded = mask_device.reshape(1, 1, L, L).expand(B, num_heads, L, L).bool()
-                        else:
+                        elif mask_type == 1:
                             mask_expanded = mask_device.reshape(B, 1, 1, L).expand(B, num_heads, L, L).bool()
+                        else:
+                            mask_expanded = mask_device
                         # In result, should only fill the entirely masked out rows since those are non-deterministic (*may* be 0)
                         # Fill rows with all True's with 0
                         mask_out = mask_expanded.all(dim, keepdim=True).expand(mask_expanded.shape)
@@ -13209,6 +13218,93 @@ def softmax_on_device(mask, input, device):
                     cuda_res = softmax_on_device(mask, input, "cuda")
                     self.assertEqual(cpu_res, cuda_res, exact_dtype=True)
 
+    def test_multihead_self_attn_two_masks_fast_path(self, device):
+        """
+        Multihead self-attention should give the same result on the fast path (BetterTransformer) as on the slow path
+        when both attention mask (mask type 0) and key padding mask (mask type 1) are provided
+        """
+        with torch.no_grad():
+            embed_dim = 14
+            num_heads = 7
+            batch_size = 8
+            src_len = 5
+
+            query = value = key = torch.rand(batch_size, src_len, embed_dim).to(device)
+            # Create masks of two different types
+            attn_mask = torch.randint(0, 2, (src_len, src_len)).bool().to(device)
+            key_padding_mask = torch.randint(0, 2, (batch_size, src_len)).bool().to(device)
+
+            # We'll need expanded versions of the masks for masking out the outputs below
+            attn_mask_expanded = attn_mask.reshape(1, 1, src_len, src_len) \
+                                          .expand(batch_size, num_heads, src_len, src_len)
+            key_padding_mask_expanded = key_padding_mask.reshape(batch_size, 1, 1, src_len) \
+                                                        .expand(batch_size, num_heads, src_len, src_len)
+            merged_mask = attn_mask_expanded.logical_or(key_padding_mask_expanded)
+
+            # Compute attention on the fast path
+            mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads, batch_first=True, device=device)
+            mta_model.training = False
+            result_fast_path, _ = mta_model(query, key, value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
+
+            # Compute attention on the slow path
+            result_ref, _ = torch.nn.functional.multi_head_attention_forward(query.transpose(0, 1),
+                                                                             key.transpose(0, 1),
+                                                                             value.transpose(0, 1),
+                                                                             embed_dim, num_heads,
+                                                                             mta_model.in_proj_weight,
+                                                                             mta_model.in_proj_bias,
+                                                                             mta_model.bias_k, mta_model.bias_v,
+                                                                             mta_model.add_zero_attn,
+                                                                             mta_model.dropout,
+                                                                             mta_model.out_proj.weight,
+                                                                             mta_model.out_proj.bias,
+                                                                             training=mta_model.training,
+                                                                             key_padding_mask=key_padding_mask,
+                                                                             need_weights=False,
+                                                                             attn_mask=attn_mask,
+                                                                             use_separate_proj_weight=False,
+                                                                             q_proj_weight=mta_model.q_proj_weight,
+                                                                             k_proj_weight=mta_model.k_proj_weight,
+                                                                             v_proj_weight=mta_model.v_proj_weight,
+                                                                             average_attn_weights=False,
+                                                                             )
+            result_ref = result_ref.transpose(0, 1)  # Convert to batch-first
+
+            # Rows which are completely masked out are nan, we need to exclude them from comparison
+            mask_out = merged_mask[:, 0, :, :].all(-1, keepdim=True).expand(batch_size, src_len, embed_dim)
+            result_fast_path_masked = result_fast_path.masked_fill(mask_out, 0)
+            result_ref_masked = result_ref.masked_fill(mask_out, 0)
+
+            self.assertEqual(result_fast_path_masked, result_ref_masked)
+
+    @torch.no_grad()
+    @unittest.skipIf(TEST_WITH_CROSSREF, 'CrossRef turns on TorchFunctionMode, and so disables fastpath.')
+    def test_multihead_self_attn_two_masks_fast_path_mock(self, device):
+        """
+        Multihead self-attention should take fast path when both attention mask (mask type 0)
+        and key padding mask (mask type 1) are provided at the same time on CPU and CUDA
+        """
+        if device not in ['cpu', 'cuda']:
+            self.skipTest("Fastpath only runs on CPU and CUDA.")
+        with torch.autocast(device_type=device, enabled=False):
+            embed_dim = 14
+            num_heads = 7
+            batch_size = 8
+            src_len = 5
+
+            query = value = key = torch.rand(batch_size, src_len, embed_dim).to(device)
+            # Create masks of two different types
+            attn_mask = torch.randint(0, 2, (src_len, src_len)).bool().to(device)
+            key_padding_mask = torch.randint(0, 2, (batch_size, src_len)).bool().to(device)
+
+            with mock.patch('torch._native_multi_head_attention') as fastpath_mock:
+                # Compute attention on the fast path
+                mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads, batch_first=True, device=device).eval()
+                mta_model.training = False
+                mta_model(query, key, value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
+                # If mock was called, fastpath was taken
+                self.assertTrue(fastpath_mock.called)
+
     def test_masked_softmax(self, device):
         sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]
         for (B, num_heads, L) in sizes:
@@ -15567,22 +15663,32 @@ def test_transformerencoderlayer_fast_path(self, device, dtype):
         """
         Test transformer fast path on CPU with different valid mask types and shapes
         """
-        model = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True, device=device, dtype=dtype)
+        d_model = 512
+        nhead = 8
+        batch_size = 32
+        src_len = 10
+
+        model = torch.nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True,
+                                                 device=device, dtype=dtype, dropout=0)
         model.eval()
 
         # Batched inputs
-        src = torch.rand(32, 10, 512)
+        src = torch.rand(batch_size, src_len, 512)
 
         # Attention mask of shape (src_len, src_len)
-        src_mask = torch.zeros(10, 10).to(torch.bool)
+        src_mask = torch.zeros(src_len, src_len).to(torch.bool)
         with torch.no_grad():
             model(src, src_mask=src_mask)
 
         # Padding mask of shape (batch_size, src_len)
-        src_key_padding_mask = torch.zeros(32, 10).to(torch.bool)
+        src_key_padding_mask = torch.zeros(batch_size, src_len).to(torch.bool)
         with torch.no_grad():
             model(src, src_key_padding_mask=src_key_padding_mask)
 
+        # Provide both masks
+        with torch.no_grad():
+            model(src, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
+
 
     @dtypes(torch.float)
     @dtypesIfCUDA(torch.half, torch.float)
diff --git a/test/test_transformers.py b/test/test_transformers.py
index 656191c9ddda7..a9d0d960fb9a6 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -869,17 +869,18 @@ def rand_tensor(*shape):
     @torch.no_grad()
     def test_mask_check_fastpath(self):
         """
-        Test that fastpath is executed independently of the mask that is passed.
-        If the passed mask is left aligned or mask_check=False, test that nested tensors are used (sparsity fastpath),
-        otherwise use fastpath with traditional tensors.
+        Test that fastpath is executed independently of the masks that are passed.
+        If the passed key padding mask is left aligned or mask_check=False, test that nested tensors are used
+        (sparsity fastpath), otherwise use fastpath with traditional tensors.
+        Also test that fast path is executed with both key padding mask and attention mask passed at the same time.
         """
 
         x = torch.Tensor([[[1, 2], [3, 4], [5, 6]]]).to(torch.float)
 
-        def _test_fastpath(model, mask, mock_return_value, nested_tensors=True):
+        def _test_fastpath(model, key_padding_mask, mock_return_value, attn_mask=None, nested_tensors=True):
             with patch('torch._transformer_encoder_layer_fwd') as fastpath_mock:
                 fastpath_mock.return_value = mock_return_value
-                model(x, src_key_padding_mask=mask)
+                model(x, src_key_padding_mask=key_padding_mask, mask=attn_mask)
 
                 # If mock was called, fastpath was taken
                 self.assertTrue(fastpath_mock.called)
@@ -893,31 +894,33 @@ def _test_fastpath(model, mask, mock_return_value, nested_tensors=True):
         model = torch.nn.TransformerEncoder(encoder_layer, num_layers=2, enable_nested_tensor=True, mask_check=True)
         model.eval()
 
-        aligned_mask = torch.Tensor([[0, 0, 1]]).to(torch.bool)
-        not_aligned_mask = torch.Tensor([[1, 0, 1]]).to(torch.bool)
+        aligned_key_padding_mask = torch.Tensor([[0, 0, 1]]).to(torch.bool)
+        not_aligned_key_padding_mask = torch.Tensor([[1, 0, 1]]).to(torch.bool)
+        attn_mask = torch.Tensor([[1, 0, 1], [0, 1, 0], [1, 0, 1]]).to(torch.bool)
         nested_tensor_return_value = torch.nested.nested_tensor([torch.ones((2, 2), dtype=torch.float)])
         tensor_return_value = torch.ones((1, 3, 2), dtype=torch.float)
 
         # Left aligned mask results in sparsity fastpath
-        _test_fastpath(model, aligned_mask, nested_tensor_return_value, nested_tensors=True)
+        _test_fastpath(model, aligned_key_padding_mask, nested_tensor_return_value, nested_tensors=True)
 
         # Not aligned mask results in fastpath
-        _test_fastpath(model, not_aligned_mask, tensor_return_value, nested_tensors=False)
+        _test_fastpath(model, not_aligned_key_padding_mask, tensor_return_value, nested_tensors=False)
 
         model = torch.nn.TransformerEncoder(encoder_layer, num_layers=2, enable_nested_tensor=False, mask_check=True)
         model.eval()
 
         # If nested tensor disabled, fastpath is always taken
-        _test_fastpath(model, aligned_mask, tensor_return_value, nested_tensors=False)
-        _test_fastpath(model, not_aligned_mask, tensor_return_value, nested_tensors=False)
-
+        _test_fastpath(model, aligned_key_padding_mask, tensor_return_value, nested_tensors=False)
+        _test_fastpath(model, not_aligned_key_padding_mask, tensor_return_value, nested_tensors=False)
+        # Fast path is taken if both attention mask and key padding mask are present
+        _test_fastpath(model, aligned_key_padding_mask, tensor_return_value, attn_mask=attn_mask, nested_tensors=False)
 
         model = torch.nn.TransformerEncoder(encoder_layer, num_layers=2, enable_nested_tensor=True, mask_check=False)
         model.eval()
 
         # Mask check disabled results in sparisty fastpath, independently of the mask
-        _test_fastpath(model, aligned_mask, nested_tensor_return_value, nested_tensors=True)
-        _test_fastpath(model, not_aligned_mask, nested_tensor_return_value, nested_tensors=True)
+        _test_fastpath(model, aligned_key_padding_mask, nested_tensor_return_value, nested_tensors=True)
+        _test_fastpath(model, not_aligned_key_padding_mask, nested_tensor_return_value, nested_tensors=True)
 
     @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
     @parametrize("type", ["dense", "nested"])
diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index d7a9f13809d67..5f5615b496d7d 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -901,6 +901,7 @@ class MultiheadAttention(Module):
 
     - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor. This
       restriction will be loosened in the future.)
+    - inputs are batched (3D) with ``batch_first==True``
     - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
     - training is disabled (using ``.eval()``)
     - dropout is 0
@@ -908,9 +909,9 @@ class MultiheadAttention(Module):
     - ``add_zero_attn`` is ``False``
     - ``batch_first`` is ``True`` and the input is batched
     - ``kdim`` and ``vdim`` are equal to ``embed_dim``
-    - at most one of ``key_padding_mask`` or ``attn_mask`` is passed
     - if a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ is passed, neither ``key_padding_mask``
       nor ``attn_mask`` is passed
+    - autocast is disabled
 
     If the optimized implementation is in use, a
     `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be passed for
@@ -1094,9 +1095,8 @@ def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: O
         elif not self._qkv_same_embed_dim:
             why_not_fast_path = "_qkv_same_embed_dim was not True"
         elif query.is_nested and (key_padding_mask is not None or attn_mask is not None):
-            why_not_fast_path = "key_padding_mask and attn_mask are not supported with NestedTensor input"
-        elif not query.is_nested and key_padding_mask is not None and attn_mask is not None:
-            why_not_fast_path = "key_padding_mask and attn_mask were both supplied"
+            why_not_fast_path = "supplying both src_key_padding_mask and src_mask at the same time \
+                                 is not supported with NestedTensor input"
         elif torch.is_autocast_enabled():
             why_not_fast_path = "autocast is enabled"
 
@@ -1120,6 +1120,8 @@ def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: O
                 why_not_fast_path = ("grad is enabled and at least one of query or the "
                                      "input/output projection weights or biases requires_grad")
             if not why_not_fast_path:
+                merged_mask, mask_type = self.merge_masks(attn_mask, key_padding_mask, query)
+
                 return torch._native_multi_head_attention(
                     query,
                     key,
@@ -1130,10 +1132,10 @@ def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: O
                     self.in_proj_bias,
                     self.out_proj.weight,
                     self.out_proj.bias,
-                    key_padding_mask if key_padding_mask is not None else attn_mask,
+                    merged_mask,
                     need_weights,
                     average_attn_weights,
-                    1 if key_padding_mask is not None else 0 if attn_mask is not None else None)
+                    mask_type)
 
         any_nested = query.is_nested or key.is_nested or value.is_nested
         assert not any_nested, ("MultiheadAttention does not support NestedTensor outside of its fast path. " +
@@ -1175,6 +1177,40 @@ def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: O
         else:
             return attn_output, attn_output_weights
 
+    def merge_masks(self, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor],
+                    query: Tensor) -> Tuple[Optional[Tensor], Optional[int]]:
+        r"""
+        Determine mask type and combine masks if necessary. If only one mask is provided, that mask
+        and the corresponding mask type will be returned. If both masks are provided, they will be both
+        expanded to shape ``(batch_size, num_heads, seq_len, seq_len)``, combined with logical ``or``
+        and mask type 2 will be returned
+        Args:
+            attn_mask: attention mask of shape ``(seq_len, seq_len)``, mask type 0
+            key_padding_mask: padding mask of shape ``(batch_size, seq_len)``, mask type 1
+            query: query embeddings of shape ``(batch_size, seq_len, embed_dim)``
+        Returns:
+            merged_mask: merged mask
+            mask_type: merged mask type (0, 1, or 2)
+        """
+        mask_type: Optional[int] = None
+        merged_mask: Optional[Tensor] = None
+        if attn_mask is not None:
+            mask_type = 0
+            merged_mask = attn_mask
+        if key_padding_mask is not None:
+            mask_type = 1
+            merged_mask = key_padding_mask
+        if (attn_mask is not None) and (key_padding_mask is not None):
+            # In this branch query can't be a nested tensor, so it has a shape
+            batch_size, seq_len, _ = query.shape
+            mask_type = 2
+            key_padding_mask_expanded = key_padding_mask.view(batch_size, 1, 1, seq_len) \
+                                                        .expand(-1, self.num_heads, -1, -1)
+            attn_mask_expanded = attn_mask.view(1, 1, seq_len, seq_len).expand(batch_size, self.num_heads, -1, -1)
+            merged_mask = attn_mask_expanded.logical_or(key_padding_mask_expanded)
+        return merged_mask, mask_type
+
+
 class PReLU(Module):
     r"""Applies the element-wise function:
 
diff --git a/torch/nn/modules/transformer.py b/torch/nn/modules/transformer.py
index 34dde6fc224f1..37e8823edf2c2 100644
--- a/torch/nn/modules/transformer.py
+++ b/torch/nn/modules/transformer.py
@@ -467,9 +467,7 @@ def forward(self, src: Tensor, src_mask: Optional[Tensor] = None,
         elif not (self.norm1.eps == self.norm2.eps):
             why_not_sparsity_fast_path = "norm1.eps is not equal to norm2.eps"
         elif src.is_nested and (src_key_padding_mask is not None or src_mask is not None):
-            why_not_sparsity_fast_path = "src_key_padding_mask and src_mask are not supported with NestedTensor input"
-        elif (not src.is_nested) and (src_key_padding_mask is not None and src_mask is not None):
-            why_not_sparsity_fast_path = "src_key_padding_mask and src_mask were both supplied"
+            why_not_sparsity_fast_path = "neither src_key_padding_mask nor src_mask are not supported with NestedTensor input"
         elif self.self_attn.num_heads % 2 == 1:
             why_not_sparsity_fast_path = "num_head is odd"
         elif torch.is_autocast_enabled():
@@ -502,6 +500,7 @@ def forward(self, src: Tensor, src_mask: Optional[Tensor] = None,
                                               "input/output projection weights or biases requires_grad")
 
             if not why_not_sparsity_fast_path:
+                merged_mask, mask_type = self.self_attn.merge_masks(src_mask, src_key_padding_mask, src)
                 return torch._transformer_encoder_layer_fwd(
                     src,
                     self.self_attn.embed_dim,
@@ -521,11 +520,8 @@ def forward(self, src: Tensor, src_mask: Optional[Tensor] = None,
                     self.linear1.bias,
                     self.linear2.weight,
                     self.linear2.bias,
-                    # TODO: if src_mask and src_key_padding_mask merge to single 4-dim mask
-                    src_mask if src_mask is not None else src_key_padding_mask,
-                    1 if src_key_padding_mask is not None else
-                    0 if src_mask is not None else
-                    None,
+                    merged_mask,
+                    mask_type,
                 )
 
 
From 2e284d48bb033a550297226281193d653ed4dac9 Mon Sep 17 00:00:00 2001
From: "Li-Huai (Allan) Lin" <qqaatw@gmail.com>
Date: Thu, 10 Nov 2022 09:40:05 +0000
Subject: [PATCH 0735/1922] [MPS] Fix softplus (#88555)

1. Fixes #87780
2. Fixes mps graph cache issue
3. Adds proper tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88555
Approved by: https://github.com/kulinseth
---
 aten/src/ATen/native/mps/OperationUtils.h     |  1 +
 aten/src/ATen/native/mps/OperationUtils.mm    |  8 ++++-
 .../ATen/native/mps/operations/Activation.mm  | 30 +++++++++++--------
 test/test_mps.py                              | 11 ++++---
 4 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 8d868386705ac..93b0141243397 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -109,6 +109,7 @@ void printTensorNDArray(const Tensor& t);
 MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType);
 MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape);
 MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, const Tensor& tensor);
+MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType);
 MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, const Scalar& scalar);
 
 string get_mem_format_string(c10::MemoryFormat memory_format);
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 13a88efbfb5d1..f41484b27b143 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -339,6 +339,12 @@ void resize_tensor(Tensor* output) {
                                      name:nil];
 }
 
+MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType) {
+    return [mpsGraph placeholderWithShape:@[@1]
+                                 dataType:dataType
+                                     name:nil];
+}
+
 MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, const Scalar& scalar) {
     return [mpsGraph placeholderWithShape:@[@1]
                                  dataType:getMPSScalarType(scalar.type())
@@ -382,4 +388,4 @@ void executeMPSAllocatorCallback(void* ptr, EventType event) override { }
 
 } // namespace mps
 } // namespace native
-} // namespace at
+} // namespace at
\ No newline at end of file
diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index fca3f3f81b33b..3837e407a76bf 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -1464,13 +1464,15 @@ Tensor glu_backward_mps (const Tensor& grad_output,
         CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
         MPSGraphTensor *inputTensor_ = nil;
         MPSGraphTensor *betaTensor_ = nil;
+        MPSGraphTensor *thresholdTensor_ = nil;
         MPSGraphTensor *outputTensor_ = nil;
       };
 
       MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
       MPSStream* stream = getCurrentMPSStream();
-      MPSScalar beta_scalar = getMPSScalar(beta, ScalarType::Float);;
+      MPSScalar beta_scalar = getMPSScalar(beta, ScalarType::Float);
+      MPSScalar threshold_scalar = getMPSScalar(threshold, ScalarType::Float);
 
       @autoreleasepool {
         string key = "softplus_out_mps:" + getTensorsStringKey({self});
@@ -1486,7 +1488,9 @@ Tensor glu_backward_mps (const Tensor& grad_output,
               newCachedGraph = new CachedGraph(mpsGraph);
               MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
 
-              MPSGraphTensor* betaTensor = mpsGraphScalarPlaceHolder(mpsGraph, beta);
+              MPSGraphTensor* betaTensor = mpsGraphScalarPlaceHolder(mpsGraph, getMPSDataType(ScalarType::Float));
+
+              MPSGraphTensor* thresholdTensor = mpsGraphScalarPlaceHolder(mpsGraph, getMPSDataType(ScalarType::Float));
 
               MPSGraphTensor* reluTensor = [mpsGraph reLUWithTensor:inputTensor
                                                                name:nil];
@@ -1499,9 +1503,6 @@ Tensor glu_backward_mps (const Tensor& grad_output,
               MPSGraphTensor* bxTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
                                                                   secondaryTensor:betaTensor
                                                                   name:nil];
-              MPSGraphTensor* thresholdTensor = [mpsGraph constantWithScalar:threshold.to<double>()
-                                                                       shape:@[@1]
-                                                               dataType:getMPSDataType(self.scalar_type())];
               MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:bxTensor
                                                                        secondaryTensor:thresholdTensor
                                                                                   name:nil];
@@ -1524,6 +1525,7 @@ Tensor glu_backward_mps (const Tensor& grad_output,
 
               newCachedGraph->inputTensor_ = inputTensor;
               newCachedGraph->betaTensor_ = betaTensor;
+              newCachedGraph->thresholdTensor_ = thresholdTensor;
               newCachedGraph->outputTensor_ = outputTensor;
             }
             return newCachedGraph;
@@ -1536,7 +1538,8 @@ Tensor glu_backward_mps (const Tensor& grad_output,
         // Create dictionary of inputs and outputs
         NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
           selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
-          cachedGraph->betaTensor_ : getMPSGraphTensorFromScalar(stream, beta_scalar)
+          cachedGraph->betaTensor_ : getMPSGraphTensorFromScalar(stream, beta_scalar),
+          cachedGraph->thresholdTensor_ : getMPSGraphTensorFromScalar(stream, threshold_scalar),
         };
         NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
           outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
@@ -1559,7 +1562,8 @@ Tensor glu_backward_mps (const Tensor& grad_output,
       if(grad_input.numel() == 0)
         return;
 
-      MPSScalar beta_scalar = getMPSScalar(beta, ScalarType::Float);;
+      MPSScalar beta_scalar = getMPSScalar(beta, ScalarType::Float);
+      MPSScalar threshold_scalar = getMPSScalar(threshold, ScalarType::Float);
 
       struct CachedGraph : public MPSCachedGraph
       {
@@ -1567,6 +1571,7 @@ Tensor glu_backward_mps (const Tensor& grad_output,
         MPSGraphTensor *gradOutputTensor_ = nil;
         MPSGraphTensor *inputTensor_ = nil;
         MPSGraphTensor *betaTensor_ = nil;
+        MPSGraphTensor *thresholdTensor_ = nil;
         MPSGraphTensor *outputTensor_ = nil;
       };
 
@@ -1590,7 +1595,9 @@ Tensor glu_backward_mps (const Tensor& grad_output,
 
               MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
 
-              MPSGraphTensor* betaTensor = mpsGraphScalarPlaceHolder(mpsGraph, beta);
+              MPSGraphTensor* betaTensor = mpsGraphScalarPlaceHolder(mpsGraph, getMPSScalarType(ScalarType::Float));
+
+              MPSGraphTensor* thresholdTensor = mpsGraphScalarPlaceHolder(mpsGraph, getMPSScalarType(ScalarType::Float));
 
               MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0
                                                                   shape:@[@1]
@@ -1609,9 +1616,6 @@ Tensor glu_backward_mps (const Tensor& grad_output,
               rTensor = [mpsGraph divisionWithPrimaryTensor:rTensor
                                             secondaryTensor:unitExpBxTensor
                                                        name:nil];
-              MPSGraphTensor* thresholdTensor = [mpsGraph constantWithScalar:threshold.to<double>()
-                                                                       shape:@[@1]
-                                                               dataType:getMPSDataType(self.scalar_type())];
               MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:bxTensor
                                                                        secondaryTensor:thresholdTensor
                                                                                  name:nil];
@@ -1623,6 +1627,7 @@ Tensor glu_backward_mps (const Tensor& grad_output,
               newCachedGraph->gradOutputTensor_ = gradOutputTensor;
               newCachedGraph->inputTensor_ = inputTensor;
               newCachedGraph->betaTensor_ = betaTensor;
+              newCachedGraph->thresholdTensor_ = thresholdTensor;
               newCachedGraph->outputTensor_ = outputTensor;
             }
             return newCachedGraph;
@@ -1637,7 +1642,8 @@ Tensor glu_backward_mps (const Tensor& grad_output,
         NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
           gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
           selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
-          cachedGraph->betaTensor_ : getMPSGraphTensorFromScalar(stream, beta_scalar)
+          cachedGraph->betaTensor_ : getMPSGraphTensorFromScalar(stream, beta_scalar),
+          cachedGraph->thresholdTensor_ : getMPSGraphTensorFromScalar(stream, threshold_scalar),
         };
         NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
           gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
diff --git a/test/test_mps.py b/test/test_mps.py
index 3f8cce384c1bb..2f07652f8b5bd 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -3832,12 +3832,12 @@ def helper(shape, dim=0):
 
     # Test softplus
     def test_softplus(self):
-        def helper(shape):
+        def helper(shape, beta=0.5, threshold=0.5):
             cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
             x = cpu_x.detach().clone().to('mps').requires_grad_()
 
-            softplus_result = torch.nn.Softplus(beta=0.5, threshold=0.5)(x)
-            softplus_result_cpu = torch.nn.Softplus(beta=0.5, threshold=0.5)(cpu_x)
+            softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x)
+            softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x)
 
             cpu_grad = torch.randn(softplus_result.shape)
             grad = cpu_grad.to('mps')
@@ -3851,6 +3851,8 @@ def helper(shape):
         # Test empty shape too
         for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
             helper(shape)
+            helper(shape, beta=0.6, threshold=0.6)  # relu path
+            helper(shape, beta=1, threshold=20)  # softplus path
 
     # Test silu
 
@@ -7301,6 +7303,7 @@ class TestConsistency(TestCase):
         'nn.functional.smooth_l1_loss': ['f16', 'f32'],
         'nn.functional.soft_margin_loss': ['f32'],
         'nn.functional.softmin': ['f32'],
+        'nn.functional.softplus': ['f32'],
         'nn.functional.softsign': ['f16', 'f32', 'i16', 'u8'],
         'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'u8'],
         'nn.functional.threshold': ['f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7501,6 +7504,7 @@ class TestConsistency(TestCase):
         'nn.functional.silu': ['f32'],
         'nn.functional.soft_margin_loss': ['f32'],
         'nn.functional.softmin': ['f32'],
+        'nn.functional.softplus': ['f32'],
         'nn.functional.softsign': ['f16', 'f32'],
         'nn.functional.threshold': ['f32'],
         'nn.functional.triplet_margin_loss': ['f32'],
@@ -7593,7 +7597,6 @@ class TestConsistency(TestCase):
         'nn.functional.huber_loss': [torch.float16],
         'nn.functional.local_response_norm': [torch.int64],
         'nn.functional.padcircular': [torch.uint8],
-        'nn.functional.softplus': [torch.float32],
         'pow': [torch.int64],
         'select_scatter': [torch.uint8],
         'sigmoid': [torch.int64],

From 70c61943623eb0fed36b1a0b9f1b3d24be220c35 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Thu, 10 Nov 2022 13:44:45 +0000
Subject: [PATCH 0736/1922] add hack to allow hybrid compressed sparse
 comparison in assertEqual (#88749)

Hybrid sparse CSR tensors can currently not be compared to strided ones since `.to_dense` does not work:

```py
import torch
from torch.testing._internal.common_utils import TestCase

assertEqual = TestCase().assertEqual

actual = torch.sparse_csr_tensor([0, 2, 4], [0, 1, 0, 1], [[1, 11], [2, 12] ,[3, 13] ,[4, 14]])
expected = torch.stack([actual[0].to_dense(), actual[1].to_dense()])
assertEqual(actual, expected)
```

```
main.py:4: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at ../aten/src/ATen/SparseCsrTensorImpl.cpp:54.)
  actual = torch.sparse_csr_tensor([0, 2, 4], [0, 1, 0, 1], [[1, 11], [2, 12] ,[3, 13] ,[4, 14]])
Traceback (most recent call last):
  File "/home/philip/git/pytorch/torch/torch/testing/_comparison.py", line 1098, in assert_equal
    pair.compare()
  File "/home/philip/git/pytorch/torch/torch/testing/_comparison.py", line 619, in compare
    actual, expected = self._equalize_attributes(actual, expected)
  File "/home/philip/git/pytorch/torch/torch/testing/_comparison.py", line 706, in _equalize_attributes
    actual = actual.to_dense() if actual.layout != torch.strided else actual
RuntimeError: sparse_compressed_to_dense: Hybrid tensors are not supported

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "main.py", line 10, in <module>
    assertEqual(actual, expected)
  File "/home/philip/git/pytorch/torch/torch/testing/_internal/common_utils.py", line 2503, in assertEqual
    msg=(lambda generated_msg: f"{generated_msg}\n{msg}") if isinstance(msg, str) and self.longMessage else msg,
  File "/home/philip/git/pytorch/torch/torch/testing/_comparison.py", line 1112, in assert_equal
    ) from error

RuntimeError: Comparing

TensorOrArrayPair(
    id=(),
    actual=tensor(crow_indices=tensor([0, 2, 4]),
       col_indices=tensor([0, 1, 0, 1]),
       values=tensor([[ 1, 11],
                      [ 2, 12],
                      [ 3, 13],
                      [ 4, 14]]), size=(2, 2, 2), nnz=4,
       layout=torch.sparse_csr),
    expected=tensor([[[ 1, 11],
         [ 2, 12]],

        [[ 3, 13],
         [ 4, 14]]]),
    rtol=0.0,
    atol=0.0,
    equal_nan=True,
    check_device=False,
    check_dtype=True,
    check_layout=False,
    check_stride=False,
    check_is_coalesced=False,
)

resulted in the unexpected exception above. If you are a user and see this message during normal operation please file an issue at https://github.com/pytorch/pytorch/issues. If you are a developer and working on the comparison functions, please except the previous error and raise an expressive `ErrorMeta` instead.
```

This adds a temporary hack to `TestCase.assertEqual` to enable this. Basically, we are going through the individual CSR subtensors, call `.to_dense()` on them, and stack everything back together. I opted to not do this in the common machinery, since that way users are not affected by this (undocumented) hack.

I also added an xfailed test that will trigger as soon as the behavior is supported natively so we don't forget to remove the hack when it is no longer needed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88749
Approved by: https://github.com/mruberry, https://github.com/pearu
---
 test/test_testing.py                    | 11 +++++++++++
 torch/testing/_internal/common_utils.py | 25 +++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/test/test_testing.py b/test/test_testing.py
index ccb2471e71e7b..5ce07ce454dc2 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -1178,6 +1178,17 @@ def test_mismatching_values_msg(self):
             with self.assertRaisesRegex(AssertionError, re.escape("Sparse CSR values")):
                 fn()
 
+    @unittest.expectedFailure
+    def test_hybrid_support(self):
+        # If you read this after the test unexpectedly succeeded, this is a good thing. It means that you added support
+        # for `.to_dense()` for hybrid sparse CSR tensors and in turn enabled support for them in
+        # `torch.testing.assert_close` if comparing to strided tensors. You can safely remove this test as well as the
+        # patch on `TensorOrArrayPair` in `torch.testing._internal.common_utils`.
+        actual = torch.sparse_csr_tensor([0, 2, 4], [0, 1, 0, 1], [[1, 11], [2, 12], [3, 13], [4, 14]])
+        expected = torch.stack([actual[0].to_dense(), actual[1].to_dense()])
+
+        torch.testing.assert_close(actual, expected, check_layout=False)
+
 
 @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Not all sandcastle jobs support CSC testing")
 class TestAssertCloseSparseCSC(TestCase):
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 6fd64187581f3..8f497d515eb5d 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1820,6 +1820,31 @@ def __init__(self, actual, expected, *, rtol_override=0.0, atol_override=0.0, **
         self.rtol = max(self.rtol, rtol_override)
         self.atol = max(self.atol, atol_override)
 
+        # This is a slow and ugly hack to allow the comparison of hybrid sparse CSR tensors with strided ones. If
+        # `check_layout=False` (default), the tensors will be converted to strided by calling `.to_dense()` on them.
+        # However, this is not yet supported for hybrid sparse CSR and thus we need to do it manually for now.
+        # FIXME: Remove this as soon as `.to_dense` is supported for hybrid sparse CSR tensors
+        if not self.check_layout:
+            self.actual, self.expected = self._handle_hybrid_sparse_csr(self.actual, self.expected)
+
+    def _handle_hybrid_sparse_csr(self, actual, expected):
+        compressed_sparse_layouts = {torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc}
+        if not ((actual.layout in compressed_sparse_layouts) ^ (expected.layout in compressed_sparse_layouts)):
+            return actual, expected
+
+        def to_dense(tensor):
+            if tensor.layout not in compressed_sparse_layouts:
+                return tensor
+
+            def partial_to_dense(tensor):
+                if tensor.layout not in compressed_sparse_layouts or tensor.values().ndim == 1:
+                    return tensor.to_dense()
+                return torch.stack([partial_to_dense(sub_tensor) for sub_tensor in tensor])
+
+            return partial_to_dense(tensor)
+
+        return [to_dense(input) for input in [actual, expected]]
+
     def _process_inputs(self, actual, expected, *, id, allow_subclasses):
         self._check_inputs_isinstance(actual, expected, cls=(torch.Tensor, np.ndarray))
 

From 6796acd9066a14171b3dc48c877fd7dd48008820 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Thu, 10 Nov 2022 01:30:03 -0500
Subject: [PATCH 0737/1922] torchdynamo: add convolution add(relu) inplace
 fusion kernel (#88048)

This PR is about add convolution add(relu) inplace fusion kernel which  works for **other.add_(conv)**.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88048
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 aten/src/ATen/native/mkldnn/Conv.cpp          | 250 ++++++++++++++----
 aten/src/ATen/native/mkldnn/Linear.cpp        |   5 +-
 .../mkldnn/RegisterMkldnnOpContextClass.cpp   |   4 +-
 aten/src/ATen/native/mkldnn/Utils.cpp         |   9 +-
 aten/src/ATen/native/mkldnn/Utils.h           |   4 +-
 .../check_forward_backward_compatibility.py   |   1 +
 test/test_mkldnn_fusion.py                    |  22 +-
 torch/_inductor/ir.py                         |  14 +-
 torch/_inductor/lowering.py                   |  20 +-
 torch/_inductor/overrides.py                  |  18 +-
 10 files changed, 286 insertions(+), 61 deletions(-)

diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp
index 508aefe787ad7..ec62715129f4d 100644
--- a/aten/src/ATen/native/mkldnn/Conv.cpp
+++ b/aten/src/ATen/native/mkldnn/Conv.cpp
@@ -9,6 +9,7 @@
 #include <ATen/Functions.h>
 #else
 #include <ATen/ops/_to_dense_native.h>
+#include <ATen/ops/_add_relu_native.h>
 #include <ATen/ops/conv2d.h>
 #include <ATen/ops/conv3d.h>
 #include <ATen/ops/empty.h>
@@ -175,51 +176,23 @@ static inline at::MemoryFormat mkldnn_convolution_memory_format(int64_t dims, bo
    return memory_format;
 }
 
-Tensor _mkldnn_convolution(
+void _mkldnn_convolution_out (
     const Tensor& input_t,
     const Tensor& weight_t,
-    const c10::optional<Tensor>& bias_opt,
-    IntArrayRef padding,
+    const Tensor& bias,
+    std::vector<int64_t>& output_sizes,
+    ideep::tensor& y,
     IntArrayRef stride,
     IntArrayRef dilation,
+    IntArrayRef padding,
     int64_t groups,
-    c10::string_view attr = "none",
-    torch::List<c10::optional<at::Scalar>> scalars =
-        torch::List<c10::optional<at::Scalar>>(),
-    c10::optional<c10::string_view> algorithm = c10::nullopt) {
-  ideep::attr_t op_attr = ideep::attr_t();
-  if (attr != "none") {
-    auto it = fx_fusion_attr_map().find(attr);
-    TORCH_CHECK(it != fx_fusion_attr_map().end(), "Fusion behavior undefined.");
-    op_attr = it->second(scalars, algorithm);
-  }
-  // See [Note: hacky wrapper removal for optional tensor]
-  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
-  const Tensor& bias = *bias_maybe_owned;
-
-  if (input_t.scalar_type() == ScalarType::BFloat16) {
-    TORCH_CHECK(mkldnn_bf16_device_check(),
-        "mkldnn_convolution: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq");
-  }
-
-  check_shape_forward(input_t, weight_t, bias, padding, stride, dilation, groups);
-
-  bool is_channels_last = mkldnn_conv_use_channels_last(input_t, weight_t);
+    bool is_channels_last,
+    const ideep::attr_t& op_attr) {
   auto memory_format = mkldnn_convolution_memory_format(input_t.ndimension(), is_channels_last);
-
   auto input = input_t.is_mkldnn() ? input_t : input_t.contiguous(memory_format);
   auto weight = weight_t.is_mkldnn() ? weight_t : weight_t.contiguous(memory_format);
-  auto output_sizes = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation);
-  auto output = at::empty({0}, input.options());
-
   const ideep::tensor x = itensor_from_tensor(input);
   const ideep::tensor w = itensor_from_tensor(weight);
-
-  ideep::tensor y;
-  if (is_channels_last) {
-    output.resize_(output_sizes, memory_format);
-    y = itensor_from_tensor(output);
-  }
   if (bias.defined()) {
     const ideep::tensor b = itensor_from_tensor(bias);
     ideep::convolution_forward::compute_v3(
@@ -249,11 +222,55 @@ Tensor _mkldnn_convolution(
         is_channels_last,
         op_attr);
   }
+}
+
+Tensor _mkldnn_convolution(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    c10::string_view attr = "none",
+    torch::List<c10::optional<at::Scalar>> scalars =
+        torch::List<c10::optional<at::Scalar>>(),
+    c10::optional<c10::string_view> algorithm = c10::nullopt) {
+  ideep::attr_t op_attr = ideep::attr_t();
+  if (attr != "none") {
+    auto it = fusion_unary_attr_map().find(attr);
+    TORCH_CHECK(
+        it != fusion_unary_attr_map().end(), "Fusion behavior undefined.");
+    op_attr = it->second(scalars, algorithm);
+  }
+  // See [Note: hacky wrapper removal for optional tensor]
+  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+
+  if (input_t.scalar_type() == ScalarType::BFloat16) {
+    TORCH_CHECK(mkldnn_bf16_device_check(),
+        "mkldnn_convolution: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq");
+  }
+
+  check_shape_forward(input_t, weight_t, bias, padding, stride, dilation, groups);
+
+  bool is_channels_last = mkldnn_conv_use_channels_last(input_t, weight_t);
+  auto memory_format = mkldnn_convolution_memory_format(input_t.ndimension(), is_channels_last);
+
+  auto output_sizes = conv_output_size(input_t.sizes(), weight_t.sizes(), padding, stride, dilation);
+  auto output = at::empty({0}, input_t.options());
+  ideep::tensor y;
+  if (is_channels_last) {
+    output.resize_(output_sizes, memory_format);
+    y = itensor_from_tensor(output);
+  }
+  _mkldnn_convolution_out(
+    input_t, weight_t, bias, output_sizes, y, stride, dilation, padding, groups, is_channels_last, op_attr);
 
-  if (input.is_mkldnn()) {
-    return MKLDNNTensor(y, input.options());
+  if (input_t.is_mkldnn()) {
+    return MKLDNNTensor(y, input_t.options());
   } else if (!is_channels_last) {
-    return mkldnn_to_dense(MKLDNNTensor(y, input.options()));
+    return mkldnn_to_dense(MKLDNNTensor(y, input_t.options()));
   } else {
     TORCH_INTERNAL_ASSERT(y.get_desc().is_nhwc());
     return output;
@@ -297,6 +314,14 @@ Tensor mkldnn_convolution_pointwise(
       algorithm);
 }
 
+// Fuse convolution+binary_op+unary_op for good performance, which doing such
+// operation: output=unary_op(binary_op(conv(input_t, ...), other_t, alpha)).
+// The binary_attr means which binary_op is, it can be "add", or
+// other binary operation. the unary_attr means which unary_op is,
+// it can be "relu" or other unary operation, if it is none, meaning that
+// there doesn't have a unary post op. unary_scalars and unary_algorithm
+// are the parameters of the unary op, such as "hardtanh" has scalar parameters,
+// "gelu" has algorithm parameters.
 Tensor mkldnn_convolution_pointwise_binary(
     const Tensor& input_t,
     const Tensor& other_t,
@@ -306,10 +331,17 @@ Tensor mkldnn_convolution_pointwise_binary(
     IntArrayRef stride,
     IntArrayRef dilation,
     int64_t groups,
-    c10::string_view attr) {
+    c10::string_view binary_attr,
+    c10::optional<at::Scalar> alpha,
+    c10::optional<c10::string_view> unary_attr,
+    torch::List<c10::optional<at::Scalar>> unary_scalars,
+    c10::optional<c10::string_view> unary_algorithm) {
   TORCH_CHECK(
       input_t.ndimension() == 4 || input_t.ndimension() == 5,
       "mkldnn_convolution_pointwise_binary: currently only support 2d and 3d")
+  TORCH_CHECK(
+      !alpha.has_value() || alpha.value().to<float>() == 1.0,
+      "mkldnn_convolution_pointwise_binary: the alpha value should be none or 1.0");
 
   c10::MaybeOwned<Tensor> bias_maybe_owned =
       at::borrow_from_optional_tensor(bias_opt);
@@ -334,9 +366,22 @@ Tensor mkldnn_convolution_pointwise_binary(
   bool can_be_fused =
       groups == 1 && mkldnn_conv_use_channels_last(input_t, weight_t);
 
-  auto it_binary = fusion_binary_alg_map().find(attr);
+  c10::string_view unary_attr_value = "none";
+  ideep::algorithm unary_alg;
+  if (unary_attr.has_value()) {
+    auto it_unary = fusion_unary_alg_map().find(unary_attr.value());
+    // Now, we only support conv+binary+relu.
+    TORCH_CHECK(
+        it_unary != fusion_unary_alg_map().end(),
+        "Unary Fusion behavior undefined.");
+    unary_attr_value = unary_attr.value();
+    unary_alg = it_unary->second;
+  }
+  auto it_binary = fusion_binary_alg_map().find(binary_attr);
   TORCH_CHECK(
-      it_binary != fusion_binary_alg_map().end(), "Fusion behavior undefined.");
+      it_binary != fusion_binary_alg_map().end(),
+      "Binary Fusion behavior undefined.");
+
   if (can_be_fused) {
     c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
     auto memory_format =
@@ -356,7 +401,15 @@ Tensor mkldnn_convolution_pointwise_binary(
     }
     auto other_desc = ideep::tensor::desc(
         output_size, get_mkldnn_dtype(weight.scalar_type()), format_tag);
-    auto op_attr = ideep::attr_t::fuse_binary(it_binary->second, other_desc);
+
+    ideep::attr_t op_attr;
+    ideep::post_ops po;
+    po.append_binary(it_binary->second, other_desc);
+    if (unary_attr_value != "none") {
+      po.append_eltwise(1.0, unary_alg, 0.f, 0.f);
+    }
+    op_attr.set_post_ops(po);
+
     if (bias.defined()) {
       const ideep::tensor b = itensor_from_tensor(bias);
       ideep::convolution_forward::compute_binary(
@@ -400,19 +453,123 @@ Tensor mkldnn_convolution_pointwise_binary(
       output = at::conv3d(
           input_t, weight_t, bias_opt, stride, padding, dilation, groups);
     }
-    if (attr == "add") {
+    if (binary_attr == "add" && unary_attr_value != "none") {
+      output = at::native::add_relu_(output, other_t);
+      return output;
+    }
+    if (binary_attr == "add") {
       output.add_(other_t);
-    } else if (attr == "sub") {
+    } else if (binary_attr == "sub") {
       output.sub_(other_t);
-    } else if (attr == "mul") {
+    } else if (binary_attr == "mul") {
       output.mul_(other_t);
     } else {
       output.div_(other_t);
     }
+    if (unary_attr_value != "none") {
+      output.relu_();
+    }
     return output;
   }
 }
 
+// Fuse convolution+binary_op+unary_op for good performance, which doing
+// such operation: other_t=unary_op(binary_op(conv(input_t, ...), other_t,
+// alpha)). The binary_attr means which binary_op is, it can be "add", or other
+// binary operation. the unary_attr means which unary_op is, it can be "relu" or
+// other unary operation, if it is none, meaning that there doesn't have a unary
+// post op. unary_scalars and unary_algorithm are the parameters of the unary
+// op, such as "hardtanh" has scalar parameters "gelu" has algorithm parameters.
+
+Tensor& mkldnn_convolution_pointwise_binary_(
+    const Tensor& input_t,
+    Tensor& other_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    c10::string_view binary_attr,
+    c10::optional<at::Scalar> alpha,
+    c10::optional<c10::string_view> unary_attr,
+    torch::List<c10::optional<at::Scalar>> unary_scalars,
+    c10::optional<c10::string_view> unary_algorithm) {
+  // other_t += convolution(...), other_t = unary(other_t)
+  TORCH_CHECK(
+      input_t.ndimension() == 4 || input_t.ndimension() == 5,
+      "mkldnn_convolution_add_: currently only support 2d and 3d")
+  TORCH_CHECK(
+      binary_attr == "add",
+      "mkldnn_convolution_pointwise_binary_: only support binary op fusion")
+  TORCH_CHECK(
+      !alpha.has_value() || alpha.value().to<float>() == 1.0,
+      "mkldnn_convolution_pointwise_binary: the alpha value for the binary op should be none(meaning 1.0) or 1.0");
+  TORCH_CHECK(
+      !unary_attr.has_value() || unary_attr.value() == "relu",
+      "mkldnn_convolution_pointwise_binary: only support none or relu unary op fusion after binary op");
+
+  c10::MaybeOwned<Tensor> bias_maybe_owned =
+      at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+
+  // Make sure inputs have same type(device, layout, dtype), device is cpu and
+  // dtype is float or bfloat16.
+  check_mkldnn_binary_fusion_inputs(input_t, other_t, weight_t, bias);
+
+  check_shape_forward(
+      input_t, weight_t, bias, padding, stride, dilation, groups);
+
+  auto output_sizes = conv_output_size(
+      input_t.sizes(), weight_t.sizes(), padding, stride, dilation);
+  TORCH_CHECK(
+      output_sizes == other_t.sizes(),
+      "Add Fusion's inputs should have same shape");
+  // Only calling fusion path for channels_last path and the output is contiguous tensor(channels_last).
+  bool can_be_fused = mkldnn_conv_use_channels_last(input_t, weight_t)
+                      && (other_t.is_contiguous(at::MemoryFormat::ChannelsLast)
+                          || other_t.is_contiguous(at::MemoryFormat::ChannelsLast3d));
+  if (can_be_fused) {
+    c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
+    ideep::tensor y = itensor_from_tensor(other_t);
+    ideep::attr_t op_attr;
+    if (unary_attr.has_value()) {
+      op_attr = ideep::attr_t::residual();
+    } else {
+      op_attr = ideep::attr_t::fuse_sum();
+    }
+    _mkldnn_convolution_out(
+        input_t,
+        weight_t,
+        bias,
+        output_sizes,
+        y,
+        stride,
+        dilation,
+        padding,
+        groups,
+        true,
+        op_attr);
+  } else {
+    // Fallback case, if inputs are not channels last or have different dtype,
+    // OneDNN fusion may have performance regression.
+    Tensor output;
+    if (input_t.ndimension() == 4) {
+      output = at::conv2d(
+          input_t, weight_t, bias_opt, stride, padding, dilation, groups);
+    } else {
+      output = at::conv3d(
+          input_t, weight_t, bias_opt, stride, padding, dilation, groups);
+    }
+    if (unary_attr.has_value()) {
+      other_t = at::native::add_relu_(other_t, output);
+    } else {
+      other_t.add_(output);
+    }
+  }
+  return other_t;
+}
+
 Tensor mkldnn_convolution_backward_input(
     IntArrayRef input_size,
     const Tensor& grad_output,
@@ -540,6 +697,9 @@ TORCH_LIBRARY_IMPL(mkldnn, CPU, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("mkldnn::_convolution_pointwise.binary"),
       TORCH_FN(mkldnn_convolution_pointwise_binary));
+  m.impl(
+      TORCH_SELECTIVE_NAME("mkldnn::_convolution_pointwise_.binary"),
+      TORCH_FN(mkldnn_convolution_pointwise_binary_));
 }
 
 }}  // namespace at::native
diff --git a/aten/src/ATen/native/mkldnn/Linear.cpp b/aten/src/ATen/native/mkldnn/Linear.cpp
index b57d8e56a16d1..24bf1282bfd6b 100644
--- a/aten/src/ATen/native/mkldnn/Linear.cpp
+++ b/aten/src/ATen/native/mkldnn/Linear.cpp
@@ -215,8 +215,9 @@ Tensor mkldnn_linear_pointwise(
   }
   const ideep::tensor w = itensor_from_tensor(weight_t);
 
-  auto it = fx_fusion_attr_map().find(attr);
-  TORCH_CHECK(it != fx_fusion_attr_map().end(), "Fusion behavior undefined.");
+  auto it = fusion_unary_attr_map().find(attr);
+  TORCH_CHECK(
+      it != fusion_unary_attr_map().end(), "Fusion behavior undefined.");
   ideep::attr_t op_attr = it->second(scalars, algorithm);
 
   if (mkldnn_bias.has_value()) {
diff --git a/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp b/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp
index 0be8d8a100cd6..08230827b58e5 100644
--- a/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp
+++ b/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp
@@ -42,7 +42,9 @@ TORCH_LIBRARY(mkldnn, m) {
   m.def(TORCH_SELECTIVE_SCHEMA(
       "mkldnn::_convolution_pointwise(Tensor X, Tensor W, Tensor? B, int[] padding, int[] stride, int[] dilation, int groups, str attr, Scalar?[] scalars, str? algorithm) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA(
-      "mkldnn::_convolution_pointwise.binary(Tensor X, Tensor other, Tensor W, Tensor? B, int[] padding, int[] stride, int[] dilation, int groups, str attr) -> Tensor Y"));
+      "mkldnn::_convolution_pointwise.binary(Tensor X, Tensor other, Tensor W, Tensor? B, int[] padding, int[] stride, int[] dilation, int groups, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor Y"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "mkldnn::_convolution_pointwise_.binary(Tensor X, Tensor(a!) other, Tensor W, Tensor? B, int[] padding, int[] stride, int[] dilation, int groups, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor(a!) Y"));
 }
 
 TORCH_LIBRARY(mkldnn_prepacked, m) {
diff --git a/aten/src/ATen/native/mkldnn/Utils.cpp b/aten/src/ATen/native/mkldnn/Utils.cpp
index 42f855d75cbe8..5db6e0b07ff15 100644
--- a/aten/src/ATen/native/mkldnn/Utils.cpp
+++ b/aten/src/ATen/native/mkldnn/Utils.cpp
@@ -127,7 +127,7 @@ AttrFunction attr_func_gelu = [](torch::List<c10::optional<at::Scalar>> scalars,
   return ideep::attr_t::fuse_gelu(1.0, 0.f, 0.f, gelu_type);
 };
 
-const std::map<c10::string_view, AttrFunction>& fx_fusion_attr_map() {
+const std::map<c10::string_view, AttrFunction>& fusion_unary_attr_map() {
   static const std::map<c10::string_view, AttrFunction> fusion_attr_map{
       {"relu", ATTR_FUNC(relu)},
       {"sigmoid", ATTR_FUNC(sigmoid)},
@@ -140,6 +140,13 @@ const std::map<c10::string_view, AttrFunction>& fx_fusion_attr_map() {
   return fusion_attr_map;
 };
 
+const std::map<c10::string_view, ideep::algorithm>& fusion_unary_alg_map() {
+  static const std::map<c10::string_view, ideep::algorithm> fusion_attr_map{
+      {"relu", {ideep::algorithm::eltwise_relu}},
+  };
+  return fusion_attr_map;
+};
+
 const std::map<c10::string_view, ideep::algorithm>& fusion_binary_alg_map() {
   static const std::map<c10::string_view, ideep::algorithm> fusion_attr_map{
       {"add", {ideep::algorithm::binary_add}},
diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h
index 314a7efc950ef..a25be13c46dab 100644
--- a/aten/src/ATen/native/mkldnn/Utils.h
+++ b/aten/src/ATen/native/mkldnn/Utils.h
@@ -39,7 +39,9 @@ using AttrFunction = std::function<ideep::attr_t(
     torch::List<c10::optional<at::Scalar>>,
     c10::optional<c10::string_view>)>;
 
-const std::map<c10::string_view, AttrFunction>& fx_fusion_attr_map();
+const std::map<c10::string_view, AttrFunction>& fusion_unary_attr_map();
+
+const std::map<c10::string_view, ideep::algorithm>& fusion_unary_alg_map();
 
 const std::map<c10::string_view, ideep::algorithm>& fusion_binary_alg_map();
 
diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
index e9d0834a812c0..90080ab0934f4 100644
--- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py
+++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
@@ -317,6 +317,7 @@
     ("aten::_upsample_nearest_exact1d_backward", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d_backward", datetime.date(2022, 12, 15)),
+    ("mkldnn::_convolution_pointwise.binary", datetime.date(2022, 12, 15)),
 ]
 
 ALLOW_LIST_COMPILED = [
diff --git a/test/test_mkldnn_fusion.py b/test/test_mkldnn_fusion.py
index cdef4bcfd6a57..9f264337d9567 100644
--- a/test/test_mkldnn_fusion.py
+++ b/test/test_mkldnn_fusion.py
@@ -271,8 +271,8 @@ def forward(self, x, other):
         for pointwise_name, pointwise_fn in self._binary_list().items():
             for dim in [2, 3]:
                 channels_last = torch.channels_last if dim == 2 else torch.channels_last_3d
-                options = itertools.product([True, False], [1, 2], [1, 4], [torch.contiguous_format, channels_last])
-                for bias, dilation, groups, memory_format in options:
+                options = itertools.product([False, True], [True, False], [1, 2], [1, 4], [torch.contiguous_format, channels_last])
+                for fuse_relu, bias, dilation, groups, memory_format in options:
                     oC = 32 * groups
                     iC = 3 * groups
                     x_shape = (1, iC) + input_shapes[dim]
@@ -282,12 +282,26 @@ def forward(self, x, other):
                     other = torch.randn_like(mod.conv(x))
                     with torch.no_grad():
                         ref = mod(x, other)
+                        unary_attr = None
+                        if fuse_relu:
+                            ref.relu_()
+                            unary_attr = "relu"
                         attr = pointwise_name
                         fused = torch.ops.mkldnn._convolution_pointwise(
                             x, other, mod.conv.weight, mod.conv.bias, mod.conv.padding, mod.conv.stride, mod.conv.dilation,
-                            mod.conv.groups, attr
+                            mod.conv.groups, attr, None, unary_attr, [], None
                         )
-                    self.assertEqual(ref, fused)
+                        # for binary add, we support inplace version.
+                        if attr == "add":
+                            fused_inplace = torch.ops.mkldnn._convolution_pointwise_(
+                                x, other, mod.conv.weight, mod.conv.bias, mod.conv.padding, mod.conv.stride, mod.conv.dilation,
+                                mod.conv.groups, attr, None, unary_attr, [], None
+                            )
+                            self.assertEqual(ref, other)
+                            self.assertEqual(ref, fused_inplace)
+
+                        self.assertEqual(ref, fused)
+
 
     def test_linear_binary_fusion_ops(self):
         class M(nn.Module):
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 629a8e94534d2..0353bcc8b0bec 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3425,7 +3425,11 @@ def create(
         stride_: List[int],
         dilation_: List[int],
         groups: int,
-        attr,
+        binary_attr: str,
+        binary_alpha: Optional[float],
+        unary_attr: Optional[str],
+        unary_scalars: Optional[List],
+        unary_algorithm: Optional[str],
     ):
         kernel = "torch.ops.mkldnn._convolution_pointwise.binary"
         (inputs, constant_args, kernel_layout,) = _prepare_convolution_fusion_create(
@@ -3433,7 +3437,13 @@ def create(
         )
         other = cls.require_stride1(cls.realize_input(other))
         inputs.insert(1, other)
-        constant_args = constant_args + [attr]
+        constant_args = constant_args + [
+            binary_attr,
+            binary_alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        ]
         return ConvolutionBinary(
             layout=kernel_layout,
             inputs=inputs,
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 0ede696828a32..dedd39cd91c46 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -936,11 +936,27 @@ def convolution_binary(
             stride,
             dilation,
             groups,
-            attr,
+            binary_attr,
+            binary_alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
         ):
             return TensorBox.create(
                 ir.ConvolutionBinary.create(
-                    x, other, weight, bias, padding, stride, dilation, groups, attr
+                    x,
+                    other,
+                    weight,
+                    bias,
+                    padding,
+                    stride,
+                    dilation,
+                    groups,
+                    binary_attr,
+                    binary_alpha,
+                    unary_attr,
+                    unary_scalars,
+                    unary_algorithm,
                 )
             )
 
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 1ab55142619c0..581e1996a436c 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -151,7 +151,11 @@ def __init__(
 
     def _update_module_params(self, conv, binary_op_name):
         self.__dict__ = copy.deepcopy(conv.__dict__)
-        self.attr = binary_op_name
+        self.binary_attr = binary_op_name
+        self.binary_alpha = None
+        self.unary_attr = None
+        self.unary_scalars = []
+        self.unary_algorithm = None
 
     def _conv_forward(self, input, other, weight, bias):
         if self.padding_mode != "zeros":
@@ -166,7 +170,11 @@ def _conv_forward(self, input, other, weight, bias):
                 self.stride,
                 self.dilation,
                 self.groups,
-                self.attr,
+                self.binary_attr,
+                self.binary_alpha,
+                self.unary_attr,
+                self.unary_scalars,
+                self.unary_algorithm,
             )
         return torch.ops.mkldnn._convolution_pointwise(
             input,
@@ -177,7 +185,11 @@ def _conv_forward(self, input, other, weight, bias):
             self.stride,
             self.dilation,
             self.groups,
-            self.attr,
+            self.binary_attr,
+            self.binary_alpha,
+            self.unary_attr,
+            self.unary_scalars,
+            self.unary_algorithm,
         )
 
     def forward(self, input, other):

From 06db1c9cd52efb4e870a1c8c20d2134ebd912670 Mon Sep 17 00:00:00 2001
From: samdow <samdow@fb.com>
Date: Tue, 1 Nov 2022 18:35:38 -0400
Subject: [PATCH 0738/1922] rename DisableTorchFunction to
 DisableTorchFunctionSubclass (#88218)

First half of #87990. This doesn't change any of the behavior and is just a rename

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88218
Approved by: https://github.com/ezyang, https://github.com/zou3519
---
 test/allowlist_for_publicAPI.json             |  2 +-
 test/profiler/test_profiler_tree.py           |  2 +-
 test/test_overrides.py                        |  4 +--
 test/test_public_bindings.py                  |  2 +-
 torch/_C/__init__.pyi.in                      |  2 +-
 torch/__init__.py                             |  2 +-
 torch/_dynamo/variables/builder.py            |  2 +-
 torch/_dynamo/variables/misc.py               |  2 +-
 torch/_dynamo/variables/tensor.py             |  2 +-
 torch/_subclasses/fake_tensor.py              |  2 +-
 torch/_tensor.py                              |  2 +-
 torch/csrc/Module.cpp                         |  4 +--
 torch/csrc/autograd/init.cpp                  |  1 -
 torch/csrc/utils/disable_torch_function.cpp   | 32 ++++++++++---------
 torch/csrc/utils/disable_torch_function.h     |  2 +-
 torch/distributed/_shard/common_op_utils.py   |  4 +--
 torch/distributed/_shard/partial_tensor.py    |  2 +-
 torch/distributed/_shard/replicated_tensor.py |  4 +--
 .../_shard/sharded_tensor/_ops/tensor_ops.py  |  2 +-
 torch/masked/maskedtensor/core.py             |  2 +-
 20 files changed, 39 insertions(+), 38 deletions(-)

diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index ba4a2e96df219..8a66dc12d4b6f 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -1128,7 +1128,7 @@
     "BFloat16Tensor",
     "ComplexDoubleStorage",
     "ComplexFloatStorage",
-    "DisableTorchFunction",
+    "DisableTorchFunctionSubclass",
     "Generator",
     "HalfStorage",
     "HalfTensor",
diff --git a/test/profiler/test_profiler_tree.py b/test/profiler/test_profiler_tree.py
index d4a31c6456131..210530250f924 100644
--- a/test/profiler/test_profiler_tree.py
+++ b/test/profiler/test_profiler_tree.py
@@ -26,7 +26,7 @@
     "torch/profiler/profiler.py(...): start": KEEP_ELLIPSES,
     "torch/profiler/profiler.py(...): stop_trace": KEEP_ELLIPSES,
     "torch/profiler/profiler.py(...): _transit_action": KEEP_ELLIPSES,
-    "<built-in method __exit__ of torch._C.DisableTorchFunction object at 0xXXXXXXXXXXXX>": PRUNE_ALL,
+    "<built-in method __exit__ of torch._C.DisableTorchFunctionSubclass object at 0xXXXXXXXXXXXX>": PRUNE_ALL,
     "cudaStreamIsCapturing": PRUNE_ALL,
     "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags": PRUNE_ALL,
 }
diff --git a/test/test_overrides.py b/test/test_overrides.py
index 7082f75a2141f..01c763a548fc8 100644
--- a/test/test_overrides.py
+++ b/test/test_overrides.py
@@ -1448,7 +1448,7 @@ class B(torch.Tensor):
 
         x = B(torch.randn(5))
         with A():
-            with torch._C.DisableTorchFunction():
+            with torch._C.DisableTorchFunctionSubclass():
                 self.assertNotIsInstance(torch.sum(x), B)
 
         self.assertTrue(called)
@@ -1460,7 +1460,7 @@ class A(torch.Tensor):
             pass
 
         x = A(torch.randn(5))
-        with torch._C.DisableTorchFunction():
+        with torch._C.DisableTorchFunctionSubclass():
             g = torch._C._EnableTorchFunction()
             try:
                 self.assertIsInstance(torch.sum(x), A)
diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py
index 4d2df65126983..6897c3102df60 100644
--- a/test/test_public_bindings.py
+++ b/test/test_public_bindings.py
@@ -99,7 +99,7 @@ def test_no_new_bindings(self):
             "device",
             "DeviceObjType",
             "DictType",
-            "DisableTorchFunction",
+            "DisableTorchFunctionSubclass",
             "DispatchKey",
             "DispatchKeySet",
             "dtype",
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 2d20da2a04f30..79dd6386c3789 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -108,7 +108,7 @@ class layout:
     ...
 
 # Defined in torch/csrc/utils/disable_torch_function.cpp
-def DisableTorchFunction(): ...
+def DisableTorchFunctionSubclass(): ...
 
 # Defined in torch/csrc/utils/tensor_layouts.cpp
 strided : layout = ...
diff --git a/torch/__init__.py b/torch/__init__.py
index ae55f5975542f..ef6138cb48661 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -288,7 +288,7 @@ def get_pyobj(self):
         if (isinstance(obj, Callable) or inspect.isclass(obj)):  # type: ignore[arg-type]
             if (obj.__module__ != 'torch'):
                 # TODO: fix their module from C++ side
-                if name not in ['DisableTorchFunction', 'Generator']:
+                if name not in ['DisableTorchFunctionSubclass', 'Generator']:
                     obj.__module__ = 'torch'
 
 if not TYPE_CHECKING:
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index d3c5140fa4a97..9d87897468554 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -506,7 +506,7 @@ def wrap_tensor(self, value: torch.Tensor):
                 )
             # Disable __torch_function__ to prevent cloning of `value` to hit
             # us
-            with torch._C.DisableTorchFunction():
+            with torch._C.DisableTorchFunctionSubclass():
                 if is_constant_source(self.get_source()):
                     return self.tx.output.register_attr_or_module(
                         value,
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index da327122a6a70..6e4325b6c0f43 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -538,7 +538,7 @@ def call_function(
             options = VariableTracker.propagate(self, new_args, new_kwargs.values())
             # Disable __torch_function__ here to prevent the clone of the
             # example tensor from going into the override.
-            with torch._C.DisableTorchFunction():
+            with torch._C.DisableTorchFunctionSubclass():
                 if isinstance(args[0], TorchVariable):
                     return TensorVariable.create(
                         tx=tx,
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 315c2b1a7e074..5a30f838e3f35 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -704,7 +704,7 @@ def inline_torch_function_unwrapped(
 
         # Disable __torch_function__ here to prevent the clone of the
         # example tensor from going into the override.
-        with torch._C.DisableTorchFunction():
+        with torch._C.DisableTorchFunctionSubclass():
             return tx.inline_user_function_return(tf_func_var, tf_args, {})
 
 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 14f5cd2de0a7a..79af51efc5b8e 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1093,5 +1093,5 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
             memo[id(tensor)] = out
             return out
         else:
-            with torch._C.DisableTorchFunction():
+            with torch._C.DisableTorchFunctionSubclass():
                 return func(*args, **kwargs)
diff --git a/torch/_tensor.py b/torch/_tensor.py
index 793034bb64ede..41b6569c06d86 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -1297,7 +1297,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
         if not all(issubclass(cls, t) for t in types):
             return NotImplemented
 
-        with _C.DisableTorchFunction():
+        with _C.DisableTorchFunctionSubclass():
             ret = func(*args, **kwargs)
             if func in get_default_nowrap_functions():
                 return ret
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index b8693a484ed9d..efe6c18ea0cd4 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -1594,8 +1594,8 @@ Call this whenever a new thread is created in order to propagate values from
       (PyObject*)THPDefaultCPUGenerator,
       /* incref= */ false));
   ASSERT_TRUE(set_module_attr(
-      "DisableTorchFunction",
-      (PyObject*)THPModule_DisableTorchFunctionType(),
+      "DisableTorchFunctionSubclass",
+      (PyObject*)THPModule_DisableTorchFunctionSubclassType(),
       /* incref= */ false));
   torch::set_disabled_torch_function_impl(
       PyObject_GetAttrString(module, "_disabled_torch_function_impl"));
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index ee963232d3166..d26db95f1295c 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -343,7 +343,6 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
       _C_m, "_RestorePythonTLSSnapshot")
       .def(py::init<>());
 
-  // TODO: line up this binding with DisableTorchFunction
   py::class_<torch::DisableTorchDispatch>(_C_m, "_DisableTorchDispatch")
       .def(py::init<>());
   py::class_<EnableTorchFunction>(_C_m, "_EnableTorchFunction")
diff --git a/torch/csrc/utils/disable_torch_function.cpp b/torch/csrc/utils/disable_torch_function.cpp
index 682120d7e6223..516e6b89d43af 100644
--- a/torch/csrc/utils/disable_torch_function.cpp
+++ b/torch/csrc/utils/disable_torch_function.cpp
@@ -35,18 +35,20 @@ typedef struct {
   PyObject_HEAD
       /* Type-specific fields go here. */
       bool old_state;
-} DisableTorchFunction;
+} DisableTorchFunctionSubclass;
 
-PyObject* DisableTorchFunction__enter(PyObject* self, PyObject* unused) {
-  ((DisableTorchFunction*)self)->old_state =
+PyObject* DisableTorchFunctionSubclass__enter(
+    PyObject* self,
+    PyObject* unused) {
+  ((DisableTorchFunctionSubclass*)self)->old_state =
       at::impl::PythonTorchFunctionTLS::is_disabled();
   at::impl::PythonTorchFunctionTLS::set_disabled(true);
   Py_RETURN_NONE;
 }
 
-PyObject* DisableTorchFunction__exit(PyObject* self, PyObject* unused) {
+PyObject* DisableTorchFunctionSubclass__exit(PyObject* self, PyObject* unused) {
   at::impl::PythonTorchFunctionTLS::set_disabled(
-      ((DisableTorchFunction*)self)->old_state);
+      ((DisableTorchFunctionSubclass*)self)->old_state);
   Py_RETURN_NONE;
 }
 
@@ -58,16 +60,16 @@ PyObject* THPModule_isEnabledTorchFunction(PyObject* self, PyObject* unused) {
   }
 }
 
-static PyMethodDef DisableTorchFunction_methods[] = { // NOLINT
-    {"__enter__", DisableTorchFunction__enter, METH_NOARGS, nullptr},
-    {"__exit__", DisableTorchFunction__exit, METH_VARARGS, nullptr},
+static PyMethodDef DisableTorchFunctionSubclass_methods[] = { // NOLINT
+    {"__enter__", DisableTorchFunctionSubclass__enter, METH_NOARGS, nullptr},
+    {"__exit__", DisableTorchFunctionSubclass__exit, METH_VARARGS, nullptr},
     {nullptr, nullptr, 0, nullptr}};
 
-PyTypeObject DisableTorchFunctionType = {
+PyTypeObject DisableTorchFunctionSubclassType = {
     PyVarObject_HEAD_INIT(
         nullptr,
-        0) "torch._C.DisableTorchFunction", /* tp_name */
-    sizeof(DisableTorchFunction), /* tp_basicsize */
+        0) "torch._C.DisableTorchFunctionSubclass", /* tp_name */
+    sizeof(DisableTorchFunctionSubclass), /* tp_basicsize */
     0, /* tp_itemsize */
     nullptr, /* tp_dealloc */
     0, /* tp_vectorcall_offset */
@@ -92,7 +94,7 @@ PyTypeObject DisableTorchFunctionType = {
     0, /* tp_weaklistoffset */
     nullptr, /* tp_iter */
     nullptr, /* tp_iternext */
-    DisableTorchFunction_methods, /* tp_methods */
+    DisableTorchFunctionSubclass_methods, /* tp_methods */
     nullptr, /* tp_members */
     nullptr, /* tp_getset */
     nullptr, /* tp_base */
@@ -105,12 +107,12 @@ PyTypeObject DisableTorchFunctionType = {
     PyType_GenericNew, /* tp_new */
 };
 
-PyObject* THPModule_DisableTorchFunctionType() {
-  if (PyType_Ready(&DisableTorchFunctionType) < 0) {
+PyObject* THPModule_DisableTorchFunctionSubclassType() {
+  if (PyType_Ready(&DisableTorchFunctionSubclassType) < 0) {
     return nullptr;
   }
 
-  return (PyObject*)(&DisableTorchFunctionType);
+  return (PyObject*)(&DisableTorchFunctionSubclassType);
 }
 
 PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* a) {
diff --git a/torch/csrc/utils/disable_torch_function.h b/torch/csrc/utils/disable_torch_function.h
index 3cdc33e90681b..881a7adb13ebf 100644
--- a/torch/csrc/utils/disable_torch_function.h
+++ b/torch/csrc/utils/disable_torch_function.h
@@ -29,7 +29,7 @@ struct DisableTorchDispatch {
 } // namespace torch
 
 PyObject* THPModule_isEnabledTorchFunction(PyObject* self, PyObject* unused);
-PyObject* THPModule_DisableTorchFunctionType();
+PyObject* THPModule_DisableTorchFunctionSubclassType();
 PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* args);
 PyObject* THPModule_disable_torch_dispatch(PyObject* self, PyObject* args);
 PyObject* THPModule_has_torch_function(PyObject*, PyObject* arg);
diff --git a/torch/distributed/_shard/common_op_utils.py b/torch/distributed/_shard/common_op_utils.py
index 08aa13282abcd..42d65923a5365 100644
--- a/torch/distributed/_shard/common_op_utils.py
+++ b/torch/distributed/_shard/common_op_utils.py
@@ -53,11 +53,11 @@ def tensor_default_op(types, args=(), kwargs=None, pg=None):
         Handles ``__torch_function__`` dispatch for the default tensor ops that
         behave the same as ``torch.Tensor`` such as ``torch.Tensor.shape`` or
         ``torch.Tensor.dtype``. We simply lower to the real op call with
-        DisableTorchFunction context like ``torch.Tensor.__torch_function__``
+        DisableTorchFunctionSubclass context like ``torch.Tensor.__torch_function__``
         to avoid recursions.
         """
         if kwargs is None:
             kwargs = {}
 
-        with torch._C.DisableTorchFunction():
+        with torch._C.DisableTorchFunctionSubclass():
             return op(*args, **kwargs)
diff --git a/torch/distributed/_shard/partial_tensor.py b/torch/distributed/_shard/partial_tensor.py
index dc8d09bdd7f30..6a48163082c5e 100644
--- a/torch/distributed/_shard/partial_tensor.py
+++ b/torch/distributed/_shard/partial_tensor.py
@@ -236,7 +236,7 @@ def find_process_group(e):
         # Need to disable all dispatch to print args and kwargs appropriately.
         guard = torch._C._DisableTorchDispatch()  # type: ignore[attr-defined]
         try:
-            with torch._C.DisableTorchFunction():
+            with torch._C.DisableTorchFunctionSubclass():
                 raise RuntimeError(
                     f"torch function '{func.__name__}', with args: {args} and "
                     f"kwargs: {kwargs} not supported for PartialTensor!")
diff --git a/torch/distributed/_shard/replicated_tensor.py b/torch/distributed/_shard/replicated_tensor.py
index 1327f89e00aaf..e3db6b0fac664 100644
--- a/torch/distributed/_shard/replicated_tensor.py
+++ b/torch/distributed/_shard/replicated_tensor.py
@@ -109,7 +109,7 @@ def dispatch_arg(arg):
         # We cann't do super().__torch_function__() as it implicitly convert the result
         # back to tensor subclasses, where in our case, we need to control the output type
         # base on the inter-op rules we defined.
-        with torch._C.DisableTorchFunction():
+        with torch._C.DisableTorchFunctionSubclass():
             rs = func(*args, **kwargs)
             if func in get_default_nowrap_functions():
                 return rs
@@ -157,7 +157,7 @@ def validate(self) -> bool:
         return True
 
     def __setstate__(self, state):
-        with torch._C.DisableTorchFunction():
+        with torch._C.DisableTorchFunctionSubclass():
             self.data = state
             self.requires_grad = state.requires_grad
             from torch.distributed._shard.api import _get_current_process_group
diff --git a/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py b/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
index e52c29238a62b..9ed83ee33f619 100644
--- a/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
+++ b/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
@@ -203,7 +203,7 @@ def tensor_requires_grad_set(types, args=(), kwargs=None, pg=None):
         local_shard.tensor.requires_grad_(requires_grad)
 
         # update the wrapper class property
-    with torch._C.DisableTorchFunction():
+    with torch._C.DisableTorchFunctionSubclass():
         self_st.requires_grad_(requires_grad)
     # update the metadata in the meanwhile
     self_st._metadata.tensor_properties.requires_grad = requires_grad
diff --git a/torch/masked/maskedtensor/core.py b/torch/masked/maskedtensor/core.py
index 3274ef2ef9569..0459f24587bd7 100644
--- a/torch/masked/maskedtensor/core.py
+++ b/torch/masked/maskedtensor/core.py
@@ -270,7 +270,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
 
         if not all(issubclass(cls, t) for t in types):
             return NotImplemented
-        with torch._C.DisableTorchFunction():
+        with torch._C.DisableTorchFunctionSubclass():
             ret = func(*args, **kwargs)
             if func in get_default_nowrap_functions():
                 return ret

From fca673aea8f772eca4a94acc2555579ad338923c Mon Sep 17 00:00:00 2001
From: samdow <samdow@fb.com>
Date: Mon, 7 Nov 2022 15:43:39 -0500
Subject: [PATCH 0739/1922] add DisableTorchFunction that matches
 DisableTorchDispatch (#88219)

Closes #87990. This implements a new disable guard that matches DisableTorchDispatch (disables all subclasses and modes)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88219
Approved by: https://github.com/ezyang
---
 aten/src/ATen/PythonTorchFunctionTLS.cpp    |  11 ++-
 aten/src/ATen/PythonTorchFunctionTLS.h      |  12 ++-
 test/allowlist_for_publicAPI.json           |   1 +
 test/test_overrides.py                      |  21 ++++
 test/test_public_bindings.py                |   1 +
 torch/_C/__init__.pyi.in                    |   1 +
 torch/__init__.py                           |   2 +-
 torch/csrc/Module.cpp                       |   4 +
 torch/csrc/autograd/init.cpp                |   9 +-
 torch/csrc/utils/disable_torch_function.cpp | 100 ++++++++++++++++++--
 torch/csrc/utils/disable_torch_function.h   |   1 +
 11 files changed, 139 insertions(+), 24 deletions(-)

diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp
index c9487c6958cbf..00f372f370e62 100644
--- a/aten/src/ATen/PythonTorchFunctionTLS.cpp
+++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp
@@ -26,12 +26,12 @@ int64_t PythonTorchFunctionTLS::stack_len() {
   return pythonTorchFunctionState.stack_.size();
 }
 
-void PythonTorchFunctionTLS::set_disabled(bool disabled) {
-  pythonTorchFunctionState.disabled_ = disabled;
+void PythonTorchFunctionTLS::set_disabled_state(TorchFunctionDisabledState disabled_state) {
+  pythonTorchFunctionState.disabled_state_ = disabled_state;
 }
 
-bool PythonTorchFunctionTLS::is_disabled() {
-  return pythonTorchFunctionState.disabled_;
+TorchFunctionDisabledState PythonTorchFunctionTLS::get_disabled_state() {
+  return pythonTorchFunctionState.disabled_state_;
 }
 
 void PythonTorchFunctionTLS::set_state(const PythonTorchFunctionTLS& state) {
@@ -43,7 +43,8 @@ const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() {
 }
 
 bool torch_function_mode_enabled() {
-  return PythonTorchFunctionTLS::stack_len() > 0;
+  return PythonTorchFunctionTLS::get_disabled_state() != TorchFunctionDisabledState::ALL_DISABLED &&
+         PythonTorchFunctionTLS::stack_len() > 0;
 }
 
 } // namespace impl
diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h
index 5940fb6f2dee2..a1e3a61ea2023 100644
--- a/aten/src/ATen/PythonTorchFunctionTLS.h
+++ b/aten/src/ATen/PythonTorchFunctionTLS.h
@@ -6,9 +6,11 @@
 namespace at {
 namespace impl {
 
+enum TorchFunctionDisabledState { ENABLED, SUBCLASSES_DISABLED, ALL_DISABLED };
+
 struct TORCH_API PythonTorchFunctionTLS {
-  static void set_disabled(bool);
-  static bool is_disabled();
+  static void set_disabled_state(TorchFunctionDisabledState disabled_state_);
+  static TorchFunctionDisabledState get_disabled_state();
 
   static void push_onto_stack(std::shared_ptr<SafePyObject> mode);
   static const std::shared_ptr<SafePyObject> pop_stack();
@@ -20,11 +22,11 @@ struct TORCH_API PythonTorchFunctionTLS {
 
  private:
   // The mode TLS is split into
-  //   - disabled_, which says whether or not to disable all torch function
-  //   modes
+  //   - disabled_state, which says which part of torch function are disabled
   //   - stack_, which is a vector of modes representing the stack of user
   //   defined modes
-  bool disabled_;
+  TorchFunctionDisabledState disabled_state_ =
+      TorchFunctionDisabledState::ENABLED;
   std::vector<std::shared_ptr<c10::SafePyObject>> stack_;
 };
 
diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index 8a66dc12d4b6f..45ba9ae94676d 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -1128,6 +1128,7 @@
     "BFloat16Tensor",
     "ComplexDoubleStorage",
     "ComplexFloatStorage",
+    "DisableTorchFunction",
     "DisableTorchFunctionSubclass",
     "Generator",
     "HalfStorage",
diff --git a/test/test_overrides.py b/test/test_overrides.py
index 01c763a548fc8..3b3a5ed063c70 100644
--- a/test/test_overrides.py
+++ b/test/test_overrides.py
@@ -1453,6 +1453,27 @@ class B(torch.Tensor):
 
         self.assertTrue(called)
 
+    def test_disable_subclass_mode(self):
+        called = False
+
+        class A(TorchFunctionMode):
+            def __torch_function__(self, func, types, args=(), kwargs=None):
+                nonlocal called
+                if kwargs is None:
+                    kwargs = {}
+                called = True
+                return func(*args, **kwargs)
+
+        class B(torch.Tensor):
+            pass
+
+        x = B(torch.randn(5))
+        with A():
+            with torch._C.DisableTorchFunction():
+                self.assertNotIsInstance(torch.sum(x), B)
+
+        self.assertFalse(called)
+
     def test_disable_enable_subclass(self):
         called = False
 
diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py
index 6897c3102df60..46c7396b9b07f 100644
--- a/test/test_public_bindings.py
+++ b/test/test_public_bindings.py
@@ -99,6 +99,7 @@ def test_no_new_bindings(self):
             "device",
             "DeviceObjType",
             "DictType",
+            "DisableTorchFunction",
             "DisableTorchFunctionSubclass",
             "DispatchKey",
             "DispatchKeySet",
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 79dd6386c3789..bc4bf03d8161f 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -108,6 +108,7 @@ class layout:
     ...
 
 # Defined in torch/csrc/utils/disable_torch_function.cpp
+def DisableTorchFunction(): ...
 def DisableTorchFunctionSubclass(): ...
 
 # Defined in torch/csrc/utils/tensor_layouts.cpp
diff --git a/torch/__init__.py b/torch/__init__.py
index ef6138cb48661..2abf4ba4b07de 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -288,7 +288,7 @@ def get_pyobj(self):
         if (isinstance(obj, Callable) or inspect.isclass(obj)):  # type: ignore[arg-type]
             if (obj.__module__ != 'torch'):
                 # TODO: fix their module from C++ side
-                if name not in ['DisableTorchFunctionSubclass', 'Generator']:
+                if name not in ['DisableTorchFunctionSubclass', 'DisableTorchFunction', 'Generator']:
                     obj.__module__ = 'torch'
 
 if not TYPE_CHECKING:
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index efe6c18ea0cd4..0a9aa53a0bbc4 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -1597,6 +1597,10 @@ Call this whenever a new thread is created in order to propagate values from
       "DisableTorchFunctionSubclass",
       (PyObject*)THPModule_DisableTorchFunctionSubclassType(),
       /* incref= */ false));
+  ASSERT_TRUE(set_module_attr(
+      "DisableTorchFunction",
+      (PyObject*)THPModule_DisableTorchFunctionType(),
+      /* incref= */ false));
   torch::set_disabled_torch_function_impl(
       PyObject_GetAttrString(module, "_disabled_torch_function_impl"));
   ASSERT_TRUE(torch::disabled_torch_function_impl() != nullptr);
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index d26db95f1295c..6271cfd5cb997 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -60,13 +60,14 @@ struct DisableAutocast {
 
 struct EnableTorchFunction {
   EnableTorchFunction()
-      : old_(at::impl::PythonTorchFunctionTLS::is_disabled()) {
-    at::impl::PythonTorchFunctionTLS::set_disabled(false);
+      : old_(at::impl::PythonTorchFunctionTLS::get_disabled_state()) {
+    at::impl::PythonTorchFunctionTLS::set_disabled_state(
+        at::impl::TorchFunctionDisabledState::ENABLED);
   }
   ~EnableTorchFunction() {
-    at::impl::PythonTorchFunctionTLS::set_disabled(old_);
+    at::impl::PythonTorchFunctionTLS::set_disabled_state(old_);
   }
-  bool old_;
+  at::impl::TorchFunctionDisabledState old_;
 };
 
 struct EnablePythonDispatcher {
diff --git a/torch/csrc/utils/disable_torch_function.cpp b/torch/csrc/utils/disable_torch_function.cpp
index 516e6b89d43af..589b069250a36 100644
--- a/torch/csrc/utils/disable_torch_function.cpp
+++ b/torch/csrc/utils/disable_torch_function.cpp
@@ -11,7 +11,8 @@ PyObject* disabled_torch_function = nullptr;
 PyObject* disabled_torch_dispatch = nullptr;
 
 bool torch_function_enabled() {
-  return !at::impl::PythonTorchFunctionTLS::is_disabled();
+  return at::impl::PythonTorchFunctionTLS::get_disabled_state() ==
+      at::impl::TorchFunctionDisabledState::ENABLED;
 }
 
 PyObject* disabled_torch_function_impl() {
@@ -34,20 +35,23 @@ void set_disabled_torch_dispatch_impl(PyObject* value) {
 typedef struct {
   PyObject_HEAD
       /* Type-specific fields go here. */
-      bool old_state;
+      at::impl::TorchFunctionDisabledState old_state;
 } DisableTorchFunctionSubclass;
 
 PyObject* DisableTorchFunctionSubclass__enter(
     PyObject* self,
     PyObject* unused) {
-  ((DisableTorchFunctionSubclass*)self)->old_state =
-      at::impl::PythonTorchFunctionTLS::is_disabled();
-  at::impl::PythonTorchFunctionTLS::set_disabled(true);
+  const auto old_state = at::impl::PythonTorchFunctionTLS::get_disabled_state();
+  ((DisableTorchFunctionSubclass*)self)->old_state = old_state;
+  if (old_state == at::impl::TorchFunctionDisabledState::ENABLED) {
+    at::impl::PythonTorchFunctionTLS::set_disabled_state(
+        at::impl::TorchFunctionDisabledState::SUBCLASSES_DISABLED);
+  }
   Py_RETURN_NONE;
 }
 
 PyObject* DisableTorchFunctionSubclass__exit(PyObject* self, PyObject* unused) {
-  at::impl::PythonTorchFunctionTLS::set_disabled(
+  at::impl::PythonTorchFunctionTLS::set_disabled_state(
       ((DisableTorchFunctionSubclass*)self)->old_state);
   Py_RETURN_NONE;
 }
@@ -115,6 +119,81 @@ PyObject* THPModule_DisableTorchFunctionSubclassType() {
   return (PyObject*)(&DisableTorchFunctionSubclassType);
 }
 
+typedef struct {
+  PyObject_HEAD
+      /* Type-specific fields go here. */
+      at::impl::TorchFunctionDisabledState old_state;
+} DisableTorchFunction;
+
+PyObject* DisableTorchFunction__enter(PyObject* self, PyObject* unused) {
+  ((DisableTorchFunctionSubclass*)self)->old_state =
+      at::impl::PythonTorchFunctionTLS::get_disabled_state();
+  at::impl::PythonTorchFunctionTLS::set_disabled_state(
+      at::impl::TorchFunctionDisabledState::ALL_DISABLED);
+  Py_RETURN_NONE;
+}
+
+PyObject* DisableTorchFunction__exit(PyObject* self, PyObject* unused) {
+  at::impl::PythonTorchFunctionTLS::set_disabled_state(
+      ((DisableTorchFunctionSubclass*)self)->old_state);
+  Py_RETURN_NONE;
+}
+
+static PyMethodDef DisableTorchFunction_methods[] = { // NOLINT
+    {"__enter__", DisableTorchFunction__enter, METH_NOARGS, nullptr},
+    {"__exit__", DisableTorchFunction__exit, METH_VARARGS, nullptr},
+    {nullptr, nullptr, 0, nullptr}};
+
+PyTypeObject DisableTorchFunctionType = {
+    PyVarObject_HEAD_INIT(
+        nullptr,
+        0) "torch._C.DisableTorchFunction", /* tp_name */
+    sizeof(DisableTorchFunction), /* tp_basicsize */
+    0, /* tp_itemsize */
+    nullptr, /* tp_dealloc */
+    0, /* tp_vectorcall_offset */
+    nullptr, /* tp_getattr */
+    nullptr, /* tp_setattr */
+    nullptr, /* tp_reserved */
+    nullptr, /* tp_repr */
+    nullptr, /* tp_as_number */
+    nullptr, /* tp_as_sequence */
+    nullptr, /* tp_as_mapping */
+    nullptr, /* tp_hash  */
+    nullptr, /* tp_call */
+    nullptr, /* tp_str */
+    nullptr, /* tp_getattro */
+    nullptr, /* tp_setattro */
+    nullptr, /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT, /* tp_flags */
+    nullptr, /* tp_doc */
+    nullptr, /* tp_traverse */
+    nullptr, /* tp_clear */
+    nullptr, /* tp_richcompare */
+    0, /* tp_weaklistoffset */
+    nullptr, /* tp_iter */
+    nullptr, /* tp_iternext */
+    DisableTorchFunction_methods, /* tp_methods */
+    nullptr, /* tp_members */
+    nullptr, /* tp_getset */
+    nullptr, /* tp_base */
+    nullptr, /* tp_dict */
+    nullptr, /* tp_descr_get */
+    nullptr, /* tp_descr_set */
+    0, /* tp_dictoffset */
+    nullptr, /* tp_init */
+    PyType_GenericAlloc, /* tp_alloc */
+    PyType_GenericNew, /* tp_new */
+};
+
+PyObject* THPModule_DisableTorchFunctionType() {
+  if (PyType_Ready(&DisableTorchFunctionType) < 0) {
+    return nullptr;
+  }
+
+  return (PyObject*)(&DisableTorchFunctionType);
+}
+
 PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* a) {
   HANDLE_TH_ERRORS
   PyObject *func = nullptr, *types = nullptr, *args = nullptr,
@@ -137,11 +216,14 @@ PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* a) {
   // These are all C-API calls so no exceptions will be raised
   // and therefore no need for RAII approach to storing
   // the old value.
-  bool old_value = at::impl::PythonTorchFunctionTLS::is_disabled();
-  at::impl::PythonTorchFunctionTLS::set_disabled(true);
+  auto old_value = at::impl::PythonTorchFunctionTLS::get_disabled_state();
+  if (old_value == at::impl::TorchFunctionDisabledState::ENABLED) {
+    at::impl::PythonTorchFunctionTLS::set_disabled_state(
+        at::impl::TorchFunctionDisabledState::SUBCLASSES_DISABLED);
+  }
   // kwargs can safely be nullptr here.
   PyObject* result = PyObject_Call(func, py_args.ptr(), kwargs);
-  at::impl::PythonTorchFunctionTLS::set_disabled(old_value);
+  at::impl::PythonTorchFunctionTLS::set_disabled_state(old_value);
   return result;
   END_HANDLE_TH_ERRORS
 }
diff --git a/torch/csrc/utils/disable_torch_function.h b/torch/csrc/utils/disable_torch_function.h
index 881a7adb13ebf..8fc5118830eb7 100644
--- a/torch/csrc/utils/disable_torch_function.h
+++ b/torch/csrc/utils/disable_torch_function.h
@@ -29,6 +29,7 @@ struct DisableTorchDispatch {
 } // namespace torch
 
 PyObject* THPModule_isEnabledTorchFunction(PyObject* self, PyObject* unused);
+PyObject* THPModule_DisableTorchFunctionType();
 PyObject* THPModule_DisableTorchFunctionSubclassType();
 PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* args);
 PyObject* THPModule_disable_torch_dispatch(PyObject* self, PyObject* args);

From 8c1eed415babcfc3346da9e21f54f0bed999929a Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Thu, 10 Nov 2022 11:48:31 +0100
Subject: [PATCH 0740/1922] Symintify `broadcast_to` (#88776)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88776
Approved by: https://github.com/ezyang
---
 .../src/ATen/functorch/BatchRulesDecompositions.cpp |  2 +-
 aten/src/ATen/native/TensorShape.cpp                |  4 ++--
 aten/src/ATen/native/native_functions.yaml          |  4 +++-
 test/functorch/test_aotdispatch.py                  |  8 --------
 test/test_proxy_tensor.py                           | 13 -------------
 5 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index 24a1c4ab507a0..af58da07e0488 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -63,7 +63,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE2(bitwise_or, Scalar);
   OP_DECOMPOSE2(bitwise_xor, Scalar);
   OP_DECOMPOSE(broadcast_tensors);
-  OP_DECOMPOSE(broadcast_to);
+  m.impl("broadcast_to", native::broadcast_to_symint);
   OP_DECOMPOSE(cartesian_prod);
   OP_DECOMPOSE(cdist);
   OP_DECOMPOSE(clip);
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 31b4011c12813..deb9b949aa5d3 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -537,8 +537,8 @@ Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) {
   return at::sparse_coo_tensor(new_indices, new_values, size)._coalesced_(is_coalesced);
 }
 
-Tensor broadcast_to(const Tensor& self, IntArrayRef size) {
-  return self.expand(size);
+Tensor broadcast_to_symint(const Tensor& self, SymIntArrayRef size) {
+  return self.expand_symint(size);
 }
 
 std::vector<Tensor> broadcast_tensors(TensorList tensors) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 02b073a1ce785..94c56ce59fcd7 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1195,8 +1195,10 @@
   device_check: NoCheck
   device_guard: False
 
-- func: broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
+- func: broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a)
   variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: broadcast_to_symint
 
 - func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
   variants: function
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 09b65a32bfee9..22d0136423799 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1093,20 +1093,12 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('masked.cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('masked.cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('masked_fill', ''),  # could not find kernel
-    xfail('masked.log_softmax', ''),  # argument 'size' (position 2) must be tuple of ints, not ...
     xfail('masked.logaddexp', ''),  # aten.logaddexp.default - couldn't find symbolic meta function/decomposi...
     xfail('masked.logsumexp', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('masked.mean', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=t...
     xfail('masked.median', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('masked.norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked.prod', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_scatter', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decompos...
-    xfail('masked.softmax', ''),  # argument 'size' (position 2) must be tuple of ints, not torc...
-    xfail('masked.softmin', ''),  # argument 'size' (position 2) must be tuple of ints, not torc...
-    xfail('masked.std', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=to...
-    xfail('masked.sum', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('masked.var', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=to...
     xfail('matmul', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decompo...
     xfail('median', ''),  # could not find kernel
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 242be9c78939e..8caf41a73906a 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1114,23 +1114,10 @@ def f(a, b, c, d, e):
     xfail('linalg.eig'),
     xfail('linalg.eigvals'),
     skip('masked.logsumexp', ''),  # Tensors of type TensorImpl do not have numel
-    xfail('masked.amax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.amin', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
     xfail('masked.argmax', ''),  # aten.argmax.default - couldn't find symbolic meta function/decomposition
     xfail('masked.argmin', ''),  # aten.argmin.default - couldn't find symbolic meta function/decomposition
     xfail('masked.cumprod', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.cumsum', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.log_softmax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
     xfail('masked.logaddexp', ''),  # aten.logaddexp.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.mean', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=torch.device, ...
-    xfail('masked.median', ''),  # aten.nanmedian.dim - couldn't find symbolic meta function/decomposition
-    xfail('masked.norm', ''),  # aten.linalg_vector_norm.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.prod', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.softmax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.softmin', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.std', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=torch.device, d...
-    xfail('masked.sum', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.var', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=torch.device, d...
     xfail('addmv', ''),  # aten.addmv.default - couldn't find symbolic meta function/decomposition
     xfail('addr', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('aminmax', ''),  # aten.aminmax.default - couldn't find symbolic meta function/decomposition

From faabe583c794507d3ec8a7fb189251474f47ddaf Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Thu, 10 Nov 2022 01:07:50 +0100
Subject: [PATCH 0741/1922] Symintify `adaptive_avg_pool3d` (#88783)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88783
Approved by: https://github.com/ezyang
---
 aten/src/ATen/functorch/BatchRulesDecompositions.cpp | 2 +-
 aten/src/ATen/native/AdaptiveAveragePooling3d.cpp    | 4 ++--
 aten/src/ATen/native/native_functions.yaml           | 6 ++++--
 test/test_proxy_tensor.py                            | 1 -
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index af58da07e0488..e31b36d112418 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -45,7 +45,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE(adaptive_max_pool1d);
   OP_DECOMPOSE(adaptive_avg_pool1d);
   m.impl("adaptive_avg_pool2d", native::adaptive_avg_pool2d_symint);
-  OP_DECOMPOSE(adaptive_avg_pool3d);
+  m.impl("adaptive_avg_pool3d", native::adaptive_avg_pool3d_symint);
   OP_DECOMPOSE(adjoint);
   OP_DECOMPOSE(arccos);
   OP_DECOMPOSE(arccosh);
diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp
index 427368e2c06ae..a0a02ca531600 100644
--- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp
+++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp
@@ -313,7 +313,7 @@ Tensor adaptive_avg_pool3d_cpu(Tensor const& input, IntArrayRef output_size) {
   return output;
 }
 
-Tensor adaptive_avg_pool3d(Tensor const& input, IntArrayRef output_size) {
+Tensor adaptive_avg_pool3d_symint(Tensor const& input, SymIntArrayRef output_size) {
   TORCH_CHECK(output_size.size() == 3, "adaptive_avg_pool3d: output_size must be 3");
   TORCH_CHECK(
         (output_size[0] >= 0 && output_size[1] >= 0 && output_size[2] >= 0),
@@ -326,7 +326,7 @@ Tensor adaptive_avg_pool3d(Tensor const& input, IntArrayRef output_size) {
     Tensor out = input.mean({-1, -2, -3}, /* keepdim = */ true);
     return out;
   } else {
-    return _adaptive_avg_pool3d(input, output_size);
+    return _adaptive_avg_pool3d_symint(input, output_size);
   }
 }
 
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 94c56ce59fcd7..de087c0b8a896 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -10595,15 +10595,17 @@
   autogen: _adaptive_avg_pool2d_backward.out
   tags: canonical
 
-- func: adaptive_avg_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+- func: adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   dispatch:
     CPU: adaptive_avg_pool3d_out_cpu
     CUDA: adaptive_avg_pool3d_out_cuda
     QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu
 
-- func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor
+- func: adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor
   python_module: nn
+  dispatch:
+    CompositeImplicitAutograd: adaptive_avg_pool3d_symint
 
 - func: _adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor
   dispatch:
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 8caf41a73906a..fcaefbed66352 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1233,7 +1233,6 @@ def f(a, b, c, d, e):
     xfail('mode', ''),  # aten.mode.default - couldn't find symbolic meta function/decomposition
     xfail('nanquantile', ''),  # Could not run 'aten::equal' with arguments from the 'Meta' backend.
     xfail('narrow', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.adaptive_avg_pool3d', ''),  # aten._adaptive_avg_pool3d.default - couldn't find symbolic meta func...
     xfail('nn.functional.adaptive_max_pool1d', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.adaptive_max_pool2d', ''),  # aten.adaptive_max_pool2d.default - couldn't find symbolic meta funct...
     xfail('nn.functional.adaptive_max_pool3d', ''),  # argument 'output_size' (position 2) must be tupl...

From cda2f972954559e4ef0764e82176267d2ecbce5b Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Thu, 10 Nov 2022 17:42:20 +0000
Subject: [PATCH 0742/1922] Make TorchElastic timer importable on Windows
 (#88522)

Also, add `torch.distributed` to test imports, so that we would not
regress in the future

Fixes https://github.com/pytorch/pytorch/issues/85427
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88522
Approved by: https://github.com/d4l3k
---
 test/test_testing.py                                   | 10 ++++++++--
 .../elastic/timer/file_based_local_timer.py            |  4 +++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/test/test_testing.py b/test/test_testing.py
index 5ce07ce454dc2..8fe66043e5a16 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -1794,8 +1794,14 @@ def test_circular_dependencies(self) -> None:
         if not sys.version_info >= (3, 9):
             ignored_modules.append("torch.utils.benchmark")
         if IS_WINDOWS or IS_MACOS:
-            # Distributed does not work on Windows or by default on Mac
-            ignored_modules.append("torch.distributed.")
+            # Distributed should be importable on Windows(except nn.api.), but not on Mac
+            if IS_MACOS:
+                ignored_modules.append("torch.distributed.")
+            else:
+                ignored_modules.append("torch.distributed.nn.api.")
+                ignored_modules.append("torch.distributed.optim.")
+                ignored_modules.append("torch.distributed.pipeline.")
+                ignored_modules.append("torch.distributed.rpc.")
             ignored_modules.append("torch.testing._internal.dist_utils")
             # And these both end up with transitive dependencies on distributed
             ignored_modules.append("torch.nn.parallel._replicated_tensor_ddp_interop")
diff --git a/torch/distributed/elastic/timer/file_based_local_timer.py b/torch/distributed/elastic/timer/file_based_local_timer.py
index 36ae944ec8e4f..88fefe1dab811 100644
--- a/torch/distributed/elastic/timer/file_based_local_timer.py
+++ b/torch/distributed/elastic/timer/file_based_local_timer.py
@@ -10,6 +10,7 @@
 import os
 import select
 import signal
+import sys
 import threading
 import time
 from typing import Callable, Dict, List, Optional, Set, Tuple
@@ -78,7 +79,8 @@ class FileTimerClient(TimerClient):
         signal: signal, the signal to use to kill the process. Using a
                         negative or zero signal will not kill the process.
     """
-    def __init__(self, file_path: str, signal=signal.SIGKILL) -> None:
+    def __init__(self, file_path: str, signal=(signal.SIGKILL if sys.platform != "win32" else
+                                               signal.CTRL_C_EVENT)) -> None:  # type: ignore[attr-defined]
         super().__init__()
         self._file_path = file_path
         self.signal = signal

From 8ede2fd1a5e40490762a3ead92eda05b6e1be630 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Thu, 10 Nov 2022 17:48:16 +0000
Subject: [PATCH 0743/1922] Switch to setup-nvidia action (#88757)

Use the new [setup-nvidia](https://github.com/pytorch/test-infra/blob/main/.github/actions/setup-nvidia/action.yml) action from test-infra. The new action is created so that it can be shared across different PyTorch repos. For examples:

* [pytorch/pytorch](https://github.com/pytorch/pytorch/blob/master/.github/scripts/install_nvidia_utils_linux.sh) (fixed by this PR)
* [pytorch/tau](https://github.com/pytorch/tau/blob/main/.github/workflows/install_nvidia_utils_linux.sh) (fixed by  https://github.com/pytorch/tau/pull/595)
* [pytorch/torchsnapshot](https://github.com/pytorch/torchsnapshot/blob/main/.github/scripts/install_nvidia_utils_linux.sh) (fixed by https://github.com/pytorch/torchsnapshot/pull/130)
* [torch/multiply](https://github.com/pytorch/multipy/blob/main/.github/scripts/install_nvidia_utils_linux.sh) (fixed by https://github.com/pytorch/multipy/pull/264)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88757
Approved by: https://github.com/seemethere, https://github.com/atalman
---
 .github/scripts/install_nvidia_utils_linux.sh | 131 ------------------
 .github/workflows/_binary-test-linux.yml      |  11 +-
 .github/workflows/_linux-test.yml             |   9 +-
 3 files changed, 2 insertions(+), 149 deletions(-)
 delete mode 100755 .github/scripts/install_nvidia_utils_linux.sh

diff --git a/.github/scripts/install_nvidia_utils_linux.sh b/.github/scripts/install_nvidia_utils_linux.sh
deleted file mode 100755
index 37c6dccd4811f..0000000000000
--- a/.github/scripts/install_nvidia_utils_linux.sh
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env bash
-
-set -eou pipefail
-
-
-DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID)
-DRIVER_VERSION="515.76"
-DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
-
-install_nvidia_docker2_amzn2() {
-    (
-        set -x
-        # Needed for yum-config-manager
-        sudo yum install -y yum-utils
-        sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
-        sudo yum install -y nvidia-docker2
-        sudo systemctl restart docker
-    )
-}
-
-install_nvidia_driver_amzn2() {
-    (
-        set -x
-
-        # Purge any nvidia driver installed from RHEL repo
-        sudo yum remove -y nvidia-driver-latest-dkms
-
-        # Try to gather more information about the runner and its existing NVIDIA driver if any
-        echo "Before installing NVIDIA driver"
-        lspci
-        lsmod
-        modinfo nvidia || true
-
-        HAS_NVIDIA_DRIVER=0
-        # Check if NVIDIA driver has already been installed
-        if [ -x "$(command -v nvidia-smi)" ]; then
-            set +e
-            # The driver exists, check its version next. Also check only the first GPU if there are more than one of them
-            # so that the same driver version is not print over multiple lines
-            INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0)
-            NVIDIA_SMI_STATUS=$?
-
-            if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
-                echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
-            elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
-                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
-            else
-                HAS_NVIDIA_DRIVER=1
-                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
-            fi
-            set -e
-        fi
-
-        if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
-            sudo yum groupinstall -y "Development Tools"
-            # ensure our kernel install is the same as our underlying kernel,
-            # groupinstall "Development Tools" has a habit of mismatching kernel headers
-            sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
-            sudo modprobe backlight
-            sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
-
-            set +e
-            sudo /bin/bash /tmp/nvidia_driver -s --no-drm
-            NVIDIA_INSTALLATION_STATUS=$?
-
-            if [ "$NVIDIA_INSTALLATION_STATUS" -ne 0 ]; then
-                sudo cat /var/log/nvidia-installer.log
-
-                NVIDIA_DEVICES=$(lspci -D | grep -i NVIDIA | cut -d' ' -f1)
-                # The GPU can get stuck in a failure state if somehow the test crashs the GPU microcode. When this
-                # happens, we'll try to reset all NVIDIA devices https://github.com/pytorch/pytorch/issues/88388
-                for PCI_ID in "$NVIDIA_DEVICES"; do
-                    DEVICE_ENABLED=$(cat /sys/bus/pci/devices/$PCI_ID/enable)
-
-                    echo "Reseting $PCI_ID (enabled state: $DEVICE_ENABLED)"
-                    # This requires sudo permission of course
-                    echo "1" | sudo tee /sys/bus/pci/devices/$PCI_ID/reset
-                    sleep 1
-                done
-            fi
-
-            sudo rm -fv /tmp/nvidia_driver
-            set -e
-        fi
-
-        sudo modprobe nvidia || true
-        echo "After installing NVIDIA driver"
-        lspci
-        lsmod
-        modinfo nvidia || true
-
-        (
-            set +e
-            nvidia-smi
-            NVIDIA_SMI_STATUS=$?
-
-            # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
-            if [ "$NVIDIA_SMI_STATUS" -eq 0 ] || [ "$NVIDIA_SMI_STATUS" -eq 14 ]; then
-                echo "INFO: Ignoring allowed status ${NVIDIA_SMI_STATUS}"
-            else
-                echo "ERROR: nvidia-smi exited with unresolved status ${NVIDIA_SMI_STATUS}"
-                exit ${NVIDIA_SMI_STATUS}
-            fi
-            set -e
-        )
-    )
-}
-
-echo "== Installing nvidia driver ${DRIVER_FN} =="
-case "${DISTRIBUTION}" in
-    amzn*)
-        install_nvidia_driver_amzn2
-        ;;
-    *)
-        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
-        exit 1
-        ;;
-esac
-
-# Install container toolkit based on distribution
-echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
-case "${DISTRIBUTION}" in
-    amzn*)
-        install_nvidia_docker2_amzn2
-        ;;
-    *)
-        echo "ERROR: Unknown distribution ${DISTRIBUTION}"
-        exit 1
-        ;;
-esac
diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml
index 12b3d4c648228..471a2af88b8f5 100644
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@@ -171,17 +171,8 @@ jobs:
           path: "${{ runner.temp }}/artifacts/"
 
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
         if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' }}
-        with:
-          timeout_minutes: 10
-          max_attempts: 3
-          command: |
-            set -ex
-            pushd pytorch
-            bash .github/scripts/install_nvidia_utils_linux.sh
-            echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
-            popd
 
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index d2f48acca4e85..dc1346205e636 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -74,15 +74,8 @@ jobs:
           docker-image: ${{ inputs.docker-image }}
 
       - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
         if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu')
-        with:
-          timeout_minutes: 10
-          max_attempts: 3
-          command: |
-            set -ex
-            bash .github/scripts/install_nvidia_utils_linux.sh
-            echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
 
       - name: Start monitoring script
         id: monitor-script

From d9144b1fd2281d02941559d8d717a508459cc664 Mon Sep 17 00:00:00 2001
From: Panagiotis Antoniadis <pantoniadis97@gmail.com>
Date: Thu, 10 Nov 2022 18:11:29 +0000
Subject: [PATCH 0744/1922] Change TORCH_INTERNAL_ASSERT to TORCH_CHECK and add
 a nice error message (#88804)

Fixes #87672

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88804
Approved by: https://github.com/ezyang
---
 tools/autograd/templates/python_variable_methods.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp
index 2cd847b734050..6ad042c0b903a 100644
--- a/tools/autograd/templates/python_variable_methods.cpp
+++ b/tools/autograd/templates/python_variable_methods.cpp
@@ -1193,7 +1193,7 @@ static PyObject* THPVariable_set_(
     case 3: {
       // aten::set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
       auto dispatch_set_ = [](const Tensor& self, const Tensor& source) -> Tensor {
-        TORCH_INTERNAL_ASSERT(source.dtype() == self.dtype());
+        TORCH_CHECK(source.dtype() == self.dtype(), "Could not set tensor of type ", source.dtype(), " to a tensor of type ", self.dtype());
         pybind11::gil_scoped_release no_gil;
         return self.set_(source);
       };

From e666b7da0267ae82c01836ff0511e842f9991ddf Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Thu, 10 Nov 2022 04:42:37 +0000
Subject: [PATCH 0745/1922] [nnc] Disable opaque pointers mode in LLVM backend
 to allow getPointerElementType (#88798)

As of LLVM 15 typed pointers are going away:
https://llvm.org/docs/OpaquePointers.html.  Thus
`getPointerElementType` is no longer legal, since pointers are all
opaque.  I don't totally remember why we use it so prolifically, or
whether there's an easy change to get rid of it, or whether we'd need
a significant refactor to carry around `Type`s alongside `Value`s.

But in any case, NNC is deprecated (see: TorchInductor) and will
hopefully be gone before LLVM 16 is a thing.  For now, we can apply
the hack of turning off opaque pointer mode on the LLVMContext.

Differential Revision: [D41176215](https://our.internmc.facebook.com/intern/diff/D41176215)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88798
Approved by: https://github.com/desertfire
---
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index 78521efc240ee..1ca5665b4432a 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -446,6 +446,9 @@ LLVMCodeGenImpl::LLVMCodeGenImpl(
       irb_(getContext()),
       kernel_func_name_(std::move(kernel_func_name)),
       bufsExtAlloc_(ExternalAllocBufFinder::find(stmt)) {
+#if LLVM_VERSION_MAJOR >= 15
+  context_->setOpaquePointers(false);
+#endif
   if (!triple) {
     triple = LLVMTargetTriple();
   }

From 3f9b3a8e9fece7478cf13ca5b420288218be77bc Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@meta.com>
Date: Thu, 10 Nov 2022 18:17:20 +0000
Subject: [PATCH 0746/1922] [dynamo] `VariableTracker.call_method` requires a
 name (#88311)

Summary: as title

Test Plan: Before: N2743445, After: N2748186.  Note there's a new error, but at least we got past the easy one.

Differential Revision: D40938415

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88311
Approved by: https://github.com/brad-mengchi
---
 test/test_datapipe.py                   | 3 ++-
 torch/_dynamo/variables/user_defined.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index dbc5a5ae8071f..b5de6a5f4006c 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -33,7 +33,7 @@
 import torch.utils.data.datapipes as dp
 import torch.utils.data.graph
 import torch.utils.data.graph_settings
-from torch.testing._internal.common_utils import TestCase, run_tests, suppress_warnings
+from torch.testing._internal.common_utils import TestCase, run_tests, suppress_warnings, skipIfTorchDynamo
 from torch.utils.data import (
     DataLoader,
     DataChunk,
@@ -220,6 +220,7 @@ def test_dir(self):
         for api in ['open', 'read', 'close']:
             self.assertTrue(api in s)
 
+    @skipIfTorchDynamo
     def test_api(self):
         fd = TestStreamWrapper._FakeFD("")
         wrap_fd = StreamWrapper(fd)
diff --git a/torch/_dynamo/variables/user_defined.py b/torch/_dynamo/variables/user_defined.py
index 2d33c8328268a..09d7893bef665 100644
--- a/torch/_dynamo/variables/user_defined.py
+++ b/torch/_dynamo/variables/user_defined.py
@@ -68,7 +68,7 @@ def call_method(
 
             return variables.ListVariable(subs_as_vars, **options)
 
-        return super().call_method(tx, args, kwargs)
+        return super().call_method(tx, name, args, kwargs)
 
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"

From a51345eba91880f1dd9656a417d040773c4e3d72 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 10 Nov 2022 15:17:51 +0000
Subject: [PATCH 0747/1922] [FSDP][Perf] Do not call `pad` in no-padding case
 (#88769)

- Calling `F.pad()` issues a pad kernel from the CPU even if there is no padding needed, which can incur some non-negligible overhead. This PR removes that unnecessary call for the no-padding case.
- This PR also does not zero the newly-allocated sharded gradient tensor before the reduce-scatter if `use_orig_params=True` because there is no need. The reduce-scatter will fill the tensor anyway, and we do not care about the values in the padding. For `use_orig_params=False`, the padding is exposed to the user, so we preserve the existing semantics of zeroing it. I left a to-do to follow-up since we may optimize that.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88769
Approved by: https://github.com/zhaojuanmao
---
 torch/distributed/fsdp/_runtime_utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index e0fa12e19c2a2..9aee15a016c44 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -537,8 +537,12 @@ def _post_backward_hook(
                 numel_to_pad = (
                     state.world_size * chunks[0].numel() - unsharded_grad.numel()
                 )
-                padded_unsharded_grad = F.pad(unsharded_grad, [0, numel_to_pad])
-                new_sharded_grad = torch.zeros_like(chunks[0])  # padded
+                padded_unsharded_grad = (
+                    F.pad(unsharded_grad, [0, numel_to_pad])
+                    if numel_to_pad > 0
+                    else unsharded_grad
+                )
+                new_sharded_grad = torch.empty_like(chunks[0])  # padded
                 state._communication_hook(
                     state._communication_hook_state,
                     padded_unsharded_grad,

From 2274e7322381e718c28fed03261048ef7b5a597b Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 10 Nov 2022 18:19:51 +0000
Subject: [PATCH 0748/1922] Revert "Symintify `broadcast_to` (#88776)"

This reverts commit 3a09d9a129406a05ca7e82c1438f9aa83019f48d.

Reverted https://github.com/pytorch/pytorch/pull/88776 on behalf of https://github.com/malfet due to Broke functorch/test_aotdispatch on M1, see https://hud.pytorch.org/pytorch/pytorch/commit/3a09d9a129406a05ca7e82c1438f9aa83019f48d
---
 .../src/ATen/functorch/BatchRulesDecompositions.cpp |  2 +-
 aten/src/ATen/native/TensorShape.cpp                |  4 ++--
 aten/src/ATen/native/native_functions.yaml          |  4 +---
 test/functorch/test_aotdispatch.py                  |  8 ++++++++
 test/test_proxy_tensor.py                           | 13 +++++++++++++
 5 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index e31b36d112418..66aaa53bfcc1f 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -63,7 +63,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE2(bitwise_or, Scalar);
   OP_DECOMPOSE2(bitwise_xor, Scalar);
   OP_DECOMPOSE(broadcast_tensors);
-  m.impl("broadcast_to", native::broadcast_to_symint);
+  OP_DECOMPOSE(broadcast_to);
   OP_DECOMPOSE(cartesian_prod);
   OP_DECOMPOSE(cdist);
   OP_DECOMPOSE(clip);
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index deb9b949aa5d3..31b4011c12813 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -537,8 +537,8 @@ Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) {
   return at::sparse_coo_tensor(new_indices, new_values, size)._coalesced_(is_coalesced);
 }
 
-Tensor broadcast_to_symint(const Tensor& self, SymIntArrayRef size) {
-  return self.expand_symint(size);
+Tensor broadcast_to(const Tensor& self, IntArrayRef size) {
+  return self.expand(size);
 }
 
 std::vector<Tensor> broadcast_tensors(TensorList tensors) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index de087c0b8a896..0ea606f5e1fb5 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1195,10 +1195,8 @@
   device_check: NoCheck
   device_guard: False
 
-- func: broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a)
+- func: broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
   variants: function, method
-  dispatch:
-    CompositeImplicitAutograd: broadcast_to_symint
 
 - func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
   variants: function
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 22d0136423799..09b65a32bfee9 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1093,12 +1093,20 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('masked.cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('masked.cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('masked_fill', ''),  # could not find kernel
+    xfail('masked.log_softmax', ''),  # argument 'size' (position 2) must be tuple of ints, not ...
     xfail('masked.logaddexp', ''),  # aten.logaddexp.default - couldn't find symbolic meta function/decomposi...
     xfail('masked.logsumexp', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('masked.mean', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=t...
     xfail('masked.median', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('masked.norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked.prod', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_scatter', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decompos...
+    xfail('masked.softmax', ''),  # argument 'size' (position 2) must be tuple of ints, not torc...
+    xfail('masked.softmin', ''),  # argument 'size' (position 2) must be tuple of ints, not torc...
+    xfail('masked.std', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=to...
+    xfail('masked.sum', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('masked.var', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=to...
     xfail('matmul', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decompo...
     xfail('median', ''),  # could not find kernel
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index fcaefbed66352..fbeaa04aa65d9 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1114,10 +1114,23 @@ def f(a, b, c, d, e):
     xfail('linalg.eig'),
     xfail('linalg.eigvals'),
     skip('masked.logsumexp', ''),  # Tensors of type TensorImpl do not have numel
+    xfail('masked.amax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.amin', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
     xfail('masked.argmax', ''),  # aten.argmax.default - couldn't find symbolic meta function/decomposition
     xfail('masked.argmin', ''),  # aten.argmin.default - couldn't find symbolic meta function/decomposition
     xfail('masked.cumprod', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.cumsum', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.log_softmax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
     xfail('masked.logaddexp', ''),  # aten.logaddexp.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.mean', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=torch.device, ...
+    xfail('masked.median', ''),  # aten.nanmedian.dim - couldn't find symbolic meta function/decomposition
+    xfail('masked.norm', ''),  # aten.linalg_vector_norm.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.prod', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.softmax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.softmin', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.std', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=torch.device, d...
+    xfail('masked.sum', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.var', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=torch.device, d...
     xfail('addmv', ''),  # aten.addmv.default - couldn't find symbolic meta function/decomposition
     xfail('addr', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('aminmax', ''),  # aten.aminmax.default - couldn't find symbolic meta function/decomposition

From f8b652758c32b3eb650b69bb2a32641abe704eee Mon Sep 17 00:00:00 2001
From: Jiawen Liu <jiawenl@meta.com>
Date: Thu, 10 Nov 2022 18:32:25 +0000
Subject: [PATCH 0749/1922] [Inductor] Build FX Linear + Permute Vertical
 Fusion in Inductor (#88566)

Summary:
Build fx-based linear/matmul/bmm + permute/transpose vertical fusion in Inductor

For an internal Ads model: 1.15x -> 1.36x speedup

Differential Revision: D41071665

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88566
Approved by: https://github.com/jansel, https://github.com/jianyuh
---
 test/inductor/test_torchinductor.py | 106 +++++++++++++++
 torch/_inductor/config.py           |   3 +
 torch/_inductor/overrides.py        | 199 ++++++++++++++++++++++++++++
 3 files changed, 308 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index db6c5dfc2bd15..064f04291a8e7 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -10,6 +10,7 @@
 import typing
 import unittest
 import weakref
+from typing import Any, Callable
 from unittest.mock import patch
 
 import torch
@@ -18,6 +19,7 @@
 from torch._dynamo.debug_utils import same_two_models
 from torch._dynamo.testing import rand_strided, same
 from torch.fx.experimental.proxy_tensor import make_fx
+from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import (
     IS_FBCODE,
@@ -40,6 +42,14 @@
     from torch._inductor import codecache, config, metrics
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
+    from torch._inductor.overrides import (
+        linear_permute_fusion,
+        linear_transpose,
+        permute_linear_fusion,
+        permute_matmul_fusion,
+        transpose_linear,
+        transpose_matmul,
+    )
     from torch._inductor.sizevars import SizeVarAllocator
     from torch._inductor.utils import has_torchvision_roi_align, has_triton, timed
 
@@ -129,6 +139,29 @@ def maybe_test(*args, **kwargs):
     return wrap_test
 
 
+PassFunc = Callable[[torch.fx.GraphModule, Any], torch.fx.GraphModule]
+
+
+def chain_passes(*passes: PassFunc) -> PassFunc:
+    def parent_pass(module: torch.fx.GraphModule, input: Any) -> torch.fx.GraphModule:
+        for pass_ in passes:
+            if isinstance(module, torch.fx.GraphModule):
+                ShapeProp(module).propagate(*input)
+            module = pass_(module)
+        return module
+
+    return parent_pass
+
+
+def count_call_function(module: torch.fx.GraphModule, target_op: Any) -> int:
+    return sum(
+        [
+            1 if (n.op == "call_function" and n.target == target_op) else 0
+            for n in module.graph.nodes
+        ]
+    )
+
+
 class TestCase(TorchTestCase):
     @classmethod
     def setUpClass(cls):
@@ -1582,6 +1615,79 @@ def fn(a, b):
         y = torch.tensor(0)
         self.assertEqual(fn(x, y), x + x)
 
+    def test_linear_permute_fusion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, k: int, n: int):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.randn(n, k))
+                self.bias = torch.nn.Parameter(torch.randn(n))
+
+            def forward(self, input: torch.Tensor):
+                a0 = torch.nn.functional.linear(input, self.weight, self.bias)
+                b0 = a0.permute(0, 2, 1)
+                return b0
+
+        m, k, n = 16, 8, 4
+        trace_func = chain_passes(torch.fx.symbolic_trace, linear_permute_fusion)
+        module = TestModule(k, n).eval()
+        input = torch.randn(6, m, k)
+        traced = trace_func(module, [input])
+        num_linear = count_call_function(traced, torch.nn.functional.linear)
+        num_linear_transpose = count_call_function(traced, linear_transpose)
+        self.assertEqual(num_linear, 0)
+        self.assertEqual(num_linear_transpose, 1)
+
+        self.assertTrue(torch.allclose(module(input), traced(input)))
+
+    def test_permute_linear_fusion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, k: int, n: int):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.randn(n, k))
+                self.bias = torch.nn.Parameter(torch.randn(n))
+
+            def forward(self, input: torch.Tensor):
+                input1 = input.permute(0, 2, 1)
+                output = torch.nn.functional.linear(input1, self.weight, self.bias)
+                return output
+
+        m, k, n = 16, 8, 4
+
+        trace_func = chain_passes(torch.fx.symbolic_trace, permute_linear_fusion)
+        module = TestModule(k, n).eval()
+        input = torch.randn(6, k, m)
+        traced = trace_func(module, [input])
+        num_linear = count_call_function(traced, torch.nn.functional.linear)
+        num_transpose_linear = count_call_function(traced, transpose_linear)
+        self.assertEqual(num_linear, 0)
+        self.assertEqual(num_transpose_linear, 1)
+
+        self.assertTrue(torch.allclose(module(input), traced(input)))
+
+    def test_permute_bmm_fusion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, batch: int, k: int, n: int):
+                super().__init__()
+                self.other = torch.randn(batch, k, n)
+
+            def forward(self, input: torch.Tensor):
+                input1 = input.permute(0, 2, 1)
+                output = torch.bmm(input1, self.other)
+                return output
+
+        batch, m, k, n = 6, 16, 8, 4
+
+        trace_func = chain_passes(torch.fx.symbolic_trace, permute_matmul_fusion)
+        module = TestModule(batch, k, n).eval()
+        input = torch.randn(batch, k, m)
+        traced = trace_func(module, [input])
+        num_bmm = count_call_function(traced, torch.bmm)
+        num_transpose_matmul = count_call_function(traced, transpose_matmul)
+        self.assertEqual(num_bmm, 0)
+        self.assertEqual(num_transpose_matmul, 1)
+
+        self.assertTrue(torch.allclose(module(input), traced(input)))
+
     def test_slice1(self):
         def fn(a):
             return (
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 910e6d20b4d6f..c9b7623cf5287 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -67,6 +67,9 @@
 # How to import torchdynamo, either torchdynamo or torch.dynamo
 dynamo_import = inductor_import.replace("inductor", "dynamo")
 
+# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
+permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"
+
 
 # config specific to codegen/cpp.pp
 class cpp:
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 581e1996a436c..69a5bc6710f8c 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -19,6 +19,8 @@
 from torch.nn.utils.fusion import fuse_conv_bn_eval
 from torch.overrides import TorchFunctionMode
 
+from . import config
+
 log = logging.getLogger(__name__)
 
 
@@ -313,6 +315,14 @@ def check_node_is_binary(node):
 
 
 def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
+    if config.permute_fusion:
+        # For linear permute fusion, we need to check input info to identify
+        # and perform proper permutation/transpose
+        ShapeProp(gm).propagate(*example_inputs)
+        gm = linear_permute_fusion(gm)
+        gm = permute_linear_fusion(gm)
+        gm = permute_matmul_fusion(gm)
+
     # make sure the autograd is disabled.
     if torch.is_grad_enabled():
         return gm
@@ -408,6 +418,195 @@ def _philox_rand_like(input, seed, offset):
     return torch.rand_like(input)
 
 
+class NormalizedLinearNode:
+    def __init__(self, node: torch.fx.Node) -> None:
+        assert node.op == "call_function"
+        assert node.target in [torch.nn.functional.linear]
+        self.node: torch.fx.Node = node
+
+    def get_input(self) -> torch.fx.Node:
+        if len(self.node.args) > 0:
+            return self.node.args[0]
+        else:
+            return self.node.kwargs["input"]
+
+    def get_weight(self) -> torch.fx.Node:
+        if len(self.node.args) > 1:
+            return self.node.args[1]
+        else:
+            return self.node.kwargs["weight"]
+
+    def get_bias(self) -> torch.fx.Node:
+        if len(self.node.args) > 2:
+            return self.node.args[2]
+        else:
+            return self.node.kwargs["bias"]
+
+
+class NormalizedMatmulNode:
+    def __init__(self, node: torch.fx.Node) -> None:
+        assert node.op == "call_function"
+        assert node.target in [torch.bmm, torch.matmul]
+        self.node: torch.fx.Node = node
+
+    def get_input(self) -> torch.fx.Node:
+        if len(self.node.args) > 0:
+            return self.node.args[0]
+        else:
+            return self.node.kwargs["input"]
+
+    def get_other(self) -> torch.fx.Node:
+        if len(self.node.args) > 1:
+            return self.node.args[1]
+        else:
+            return self.node.kwargs["other"]
+
+
+def check_permute(node: torch.fx.Node):
+    ranks = len(node.meta["tensor_meta"].shape)
+    if len(node.args) > 3:
+        permutation = [node.args[i] % ranks for i in range(1, ranks + 1)]
+    elif (
+        "permutation" in node.kwargs
+        and node.kwargs["permutation"] is not None
+        and len(node.kwargs["permutation"]) > 2
+    ):
+        permutation = [i % ranks for i in node.kwargs["permutation"]]
+    else:
+        return False
+    allowed_permutation = list(range(ranks))
+    allowed_permutation[-1] = ranks - 2
+    allowed_permutation[-2] = ranks - 1
+    return permutation == allowed_permutation
+
+
+def linear_permute_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for node in module.graph.nodes:
+        if (
+            node.op == "call_method"
+            and node.target == "permute"
+            and check_permute(node)
+        ):
+            if len(node.args) > 0:
+                input_node = node.args[0]
+            else:
+                input_node = node.kwargs["input"]
+            if (
+                input_node.op == "call_function"
+                and input_node.target == torch.nn.functional.linear
+            ):
+                normalized = NormalizedLinearNode(input_node)
+                input = normalized.get_input()
+                weight = normalized.get_weight()
+                bias = normalized.get_bias()
+                with module.graph.inserting_before(node):
+                    fused_node = module.graph.call_function(
+                        linear_transpose, args=(input, weight, bias)
+                    )
+                    node.replace_all_uses_with(fused_node)
+
+    module.graph.lint()
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+# Y1 = X * W^T + bias
+# Y2 = Y1.permute(0, 2, 1)
+# ---->
+# Y2 = (W * X^T + bias.unsqueeze(-1))^T
+def linear_transpose(
+    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    return torch.matmul(weight, input.transpose(-1, -2)) + bias.unsqueeze(-1)
+
+
+def permute_linear_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for node in module.graph.nodes:
+        if node.op == "call_function" and node.target == torch.nn.functional.linear:
+            if len(node.args) > 0:
+                input_node = node.args[0]
+            else:
+                input_node = node.kwargs["input"]
+            if (
+                input_node.op == "call_method"
+                and input_node.target == "permute"
+                and check_permute(input_node)
+            ):
+                normalized = NormalizedLinearNode(node)
+                if len(input_node.args) > 0:
+                    input = input_node.args[0]
+                else:
+                    input = input_node.kwargs["input"]
+                weight = normalized.get_weight()
+                bias = normalized.get_bias()
+                with module.graph.inserting_before(node):
+                    fused_node = module.graph.call_function(
+                        transpose_linear, args=(input, weight, bias)
+                    )
+                    node.replace_all_uses_with(fused_node)
+
+    module.graph.lint()
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+def permute_matmul_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for node in module.graph.nodes:
+        if node.op == "call_function" and (
+            node.target == torch.bmm or node.target == torch.matmul
+        ):
+            normalized = NormalizedMatmulNode(node)
+            A = normalized.get_input()
+            B = normalized.get_other()
+            Atrans = Btrans = False
+            if A.op == "call_method" and A.target == "permute" and check_permute(A):
+                Atrans = True
+                if len(A.args) > 0:
+                    A = A.args[0]
+                else:
+                    A = A.kwargs["input"]
+
+            if B.op == "call_method" and B.target == "permute" and check_permute(B):
+                Btrans = True
+                if len(B.args) > 0:
+                    B = B.args[0]
+                else:
+                    B = B.kwargs["input"]
+
+            if Atrans or Btrans:
+                with module.graph.inserting_before(node):
+                    fused_node = module.graph.call_function(
+                        transpose_matmul,
+                        args=(A, B, Atrans, Btrans),
+                    )
+                node.replace_all_uses_with(fused_node)
+
+    module.graph.lint()
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+# X1 = X.permute(0, 2, 1)
+# Y1 = X1 * W1^T + bias1
+# ---->
+# Y2 = X1.transpose(-1, -2) * W1^T + bias1
+def transpose_linear(
+    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    return torch.matmul(input.transpose(-1, -2), weight.t()) + bias
+
+
+def transpose_matmul(A: torch.Tensor, B: torch.Tensor, Atrans: bool, Btrans: bool):
+    if Atrans:
+        A = A.transpose(-1, -2)
+    if Btrans:
+        B = B.transpose(-1, -2)
+    return torch.matmul(A, B)
+
+
 def replace_and_fuse_for_binary(
     computation_node, node, fuse_func, attr, modules, index_node, index_pointwise
 ):

From 450fc8216c2b2675f757d698fc03862559c9c8d6 Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Thu, 10 Nov 2022 18:34:19 +0000
Subject: [PATCH 0750/1922] [LTC] Make ComputePostOrder accept const T pointers
 (#88773)

Summary:
Since `c10::ArrayRef` now support `c10::ArrayRef<const T>`, let's restore `ComputePostOrder` to accept `const Node*` again, which is more suitable for the context of the given helpers.

Test Plan:
CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88773
Approved by: https://github.com/JackCaoG
---
 .github/ci_commit_pins/xla.txt                |  2 +-
 test/cpp/lazy/test_ir_util.cpp                |  2 +-
 torch/csrc/lazy/backend/backend_interface.cpp |  2 +-
 torch/csrc/lazy/backend/backend_interface.h   |  4 +--
 torch/csrc/lazy/backend/lowering_context.cpp  |  2 +-
 torch/csrc/lazy/backend/lowering_context.h    |  4 +--
 torch/csrc/lazy/core/debug_util.cpp           |  2 +-
 torch/csrc/lazy/core/ir_dump_util.cpp         | 16 +++++-----
 torch/csrc/lazy/core/ir_dump_util.h           | 12 ++++----
 torch/csrc/lazy/core/ir_util.cpp              | 30 +++++++++----------
 torch/csrc/lazy/core/ir_util.h                | 11 +++----
 torch/csrc/lazy/core/lazy_graph_executor.cpp  |  2 +-
 torch/csrc/lazy/core/lazy_graph_executor.h    |  2 +-
 torch/csrc/lazy/python/init.cpp               | 10 +++----
 .../csrc/lazy/ts_backend/ts_backend_impl.cpp  |  7 +++--
 .../lazy/ts_backend/ts_lowering_context.cpp   |  2 +-
 .../lazy/ts_backend/ts_lowering_context.h     |  2 +-
 17 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 7ec9661a1ce4d..957272e8578b8 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-7889d2d3be16675943d84e4a4133ed7c245a623f
+08121e41079319cd369f82f523f5a714a0563f9d
diff --git a/test/cpp/lazy/test_ir_util.cpp b/test/cpp/lazy/test_ir_util.cpp
index 2befb04236ab5..0b2bfc7614b10 100644
--- a/test/cpp/lazy/test_ir_util.cpp
+++ b/test/cpp/lazy/test_ir_util.cpp
@@ -52,7 +52,7 @@ TEST(IrUtilTest, BasicTest) {
   dynamic_cast<IrUtilNode*>(b.get())->AddOperand(Value(d, 0));
   dynamic_cast<IrUtilNode*>(c.get())->AddOperand(Value(d, 0));
 
-  std::vector<Node*> postorder = Util::ComputePostOrder({a.get()});
+  auto postorder = Util::ComputePostOrder({a.get()});
   EXPECT_EQ(postorder.size(), 4);
   EXPECT_EQ(postorder.at(0), d.get());
   EXPECT_EQ(postorder.at(1), c.get());
diff --git a/torch/csrc/lazy/backend/backend_interface.cpp b/torch/csrc/lazy/backend/backend_interface.cpp
index 250a8847351c6..0fb3257c90a91 100644
--- a/torch/csrc/lazy/backend/backend_interface.cpp
+++ b/torch/csrc/lazy/backend/backend_interface.cpp
@@ -38,7 +38,7 @@ at::Tensor MakeTensorFromComputationData(
 std::unique_ptr<LoweringContext> LoweringContext::Create(
     const std::string& name,
     BackendDevice device,
-    c10::ArrayRef<Node*> post_order,
+    c10::ArrayRef<const Node*> post_order,
     Util::EmissionMap emit_status) {
   return getBackend()->CreateLoweringContext(
       name, device, post_order, emit_status);
diff --git a/torch/csrc/lazy/backend/backend_interface.h b/torch/csrc/lazy/backend/backend_interface.h
index a70591c2a19c8..f94d3b602e52c 100644
--- a/torch/csrc/lazy/backend/backend_interface.h
+++ b/torch/csrc/lazy/backend/backend_interface.h
@@ -59,7 +59,7 @@ class TORCH_API BackendImplInterface {
 
   // Gets backend data if the node is a device data node. Otherwise returns
   // nullptr
-  virtual BackendDataPtr GetComputationDataFromNode(Node*) const = 0;
+  virtual BackendDataPtr GetComputationDataFromNode(const Node*) const = 0;
 
   virtual at::Tensor MakeTensorFromComputationData(
       const BackendDataPtr data,
@@ -72,7 +72,7 @@ class TORCH_API BackendImplInterface {
   virtual std::unique_ptr<LoweringContext> CreateLoweringContext(
       const std::string& name,
       BackendDevice device,
-      c10::ArrayRef<torch::lazy::Node*> post_order,
+      c10::ArrayRef<const torch::lazy::Node*> post_order,
       Util::EmissionMap emit_status) const = 0;
 
   virtual std::unique_ptr<LoweringContext> CreateLoweringContext(
diff --git a/torch/csrc/lazy/backend/lowering_context.cpp b/torch/csrc/lazy/backend/lowering_context.cpp
index 64922a1b3e136..635ee4891cc7f 100644
--- a/torch/csrc/lazy/backend/lowering_context.cpp
+++ b/torch/csrc/lazy/backend/lowering_context.cpp
@@ -9,7 +9,7 @@ LoweringContext::LoweringContext(const std::string& name, BackendDevice device)
 LoweringContext::LoweringContext(
     const std::string& name,
     BackendDevice device,
-    c10::ArrayRef<torch::lazy::Node*> post_order,
+    c10::ArrayRef<const torch::lazy::Node*> post_order,
     Util::EmissionMap emit_status)
     : device_(std::move(device)), emit_status_(std::move(emit_status)) {}
 
diff --git a/torch/csrc/lazy/backend/lowering_context.h b/torch/csrc/lazy/backend/lowering_context.h
index 6f487aef7f741..49e7b8be58cbf 100644
--- a/torch/csrc/lazy/backend/lowering_context.h
+++ b/torch/csrc/lazy/backend/lowering_context.h
@@ -42,7 +42,7 @@ class TORCH_API LoweringContext {
   LoweringContext(
       const std::string& name,
       BackendDevice device,
-      c10::ArrayRef<torch::lazy::Node*> post_order,
+      c10::ArrayRef<const torch::lazy::Node*> post_order,
       Util::EmissionMap emit_status);
 
   virtual ~LoweringContext() = default;
@@ -50,7 +50,7 @@ class TORCH_API LoweringContext {
   static std::unique_ptr<LoweringContext> Create(
       const std::string& name,
       BackendDevice device,
-      c10::ArrayRef<torch::lazy::Node*> post_order,
+      c10::ArrayRef<const torch::lazy::Node*> post_order,
       Util::EmissionMap emit_status);
 
   static std::unique_ptr<LoweringContext> Create(
diff --git a/torch/csrc/lazy/core/debug_util.cpp b/torch/csrc/lazy/core/debug_util.cpp
index 50f42b718128e..50077d498a751 100644
--- a/torch/csrc/lazy/core/debug_util.cpp
+++ b/torch/csrc/lazy/core/debug_util.cpp
@@ -88,7 +88,7 @@ std::string DebugUtil::GetTensorsGraphInfo(
     c10::ArrayRef<torch::lazy::LazyTensorPtr> tensors,
     const std::vector<size_t>* indices,
     GraphFormat format) {
-  std::vector<torch::lazy::Node*> root_nodes;
+  std::vector<const torch::lazy::Node*> root_nodes;
   std::vector<torch::lazy::Value> root_values;
   std::vector<torch::lazy::hash_t> root_hashes;
   torch::lazy::Unique<torch::lazy::BackendDevice> unique_device;
diff --git a/torch/csrc/lazy/core/ir_dump_util.cpp b/torch/csrc/lazy/core/ir_dump_util.cpp
index eff2873d668d7..19cb2ae7b1624 100644
--- a/torch/csrc/lazy/core/ir_dump_util.cpp
+++ b/torch/csrc/lazy/core/ir_dump_util.cpp
@@ -80,7 +80,7 @@ c10::optional<AttrTag> ParseAttrTag(
   return tag;
 }
 
-NodeIdMap GenerateIdMap(c10::ArrayRef<Node*> post_order) {
+NodeIdMap GenerateIdMap(c10::ArrayRef<const Node*> post_order) {
   NodeIdMap id_map;
   for (auto node : post_order) {
     TORCH_CHECK(id_map.emplace(node, id_map.size()).second, node->ToString());
@@ -89,7 +89,7 @@ NodeIdMap GenerateIdMap(c10::ArrayRef<Node*> post_order) {
 }
 
 std::unordered_map<const Node*, size_t> GetRootsIds(
-    c10::ArrayRef<Node*> roots) {
+    c10::ArrayRef<const Node*> roots) {
   std::unordered_map<const Node*, size_t> roots_ids;
   for (const auto i : c10::irange(roots.size())) {
     roots_ids[roots[i]] = i;
@@ -178,14 +178,14 @@ std::string GenerateTextNodeSpec(const Node* node, const NodeIdMap& id_map) {
 
 } // namespace
 
-std::string DumpUtil::ToDot(c10::ArrayRef<Node*> nodes) {
+std::string DumpUtil::ToDot(c10::ArrayRef<const Node*> nodes) {
   auto post_order = Util::ComputePostOrder(nodes);
   return PostOrderToDot(post_order, nodes);
 }
 
 std::string DumpUtil::PostOrderToDot(
-    c10::ArrayRef<Node*> post_order,
-    c10::ArrayRef<Node*> roots) {
+    c10::ArrayRef<const Node*> post_order,
+    c10::ArrayRef<const Node*> roots) {
   std::unordered_map<const Node*, size_t> roots_ids = GetRootsIds(roots);
   NodeIdMap id_map = GenerateIdMap(post_order);
   std::stringstream ss;
@@ -218,14 +218,14 @@ std::string DumpUtil::PostOrderToDot(
   return ss.str();
 }
 
-std::string DumpUtil::ToText(c10::ArrayRef<Node*> nodes) {
+std::string DumpUtil::ToText(c10::ArrayRef<const Node*> nodes) {
   auto post_order = Util::ComputePostOrder(nodes);
   return PostOrderToText(post_order, nodes);
 }
 
 std::string DumpUtil::PostOrderToText(
-    c10::ArrayRef<Node*> post_order,
-    c10::ArrayRef<Node*> roots) {
+    c10::ArrayRef<const Node*> post_order,
+    c10::ArrayRef<const Node*> roots) {
   std::unordered_map<const Node*, size_t> roots_ids = GetRootsIds(roots);
   NodeIdMap id_map = GenerateIdMap(post_order);
   std::stringstream ss;
diff --git a/torch/csrc/lazy/core/ir_dump_util.h b/torch/csrc/lazy/core/ir_dump_util.h
index 22cf139bfbd64..4b4e1e0749b24 100644
--- a/torch/csrc/lazy/core/ir_dump_util.h
+++ b/torch/csrc/lazy/core/ir_dump_util.h
@@ -11,17 +11,17 @@ class BackendDevice;
 
 class TORCH_API DumpUtil {
  public:
-  static std::string ToDot(c10::ArrayRef<Node*> nodes);
+  static std::string ToDot(c10::ArrayRef<const Node*> nodes);
 
   static std::string PostOrderToDot(
-      c10::ArrayRef<Node*> post_order,
-      c10::ArrayRef<Node*> roots);
+      c10::ArrayRef<const Node*> post_order,
+      c10::ArrayRef<const Node*> roots);
 
-  static std::string ToText(c10::ArrayRef<Node*> nodes);
+  static std::string ToText(c10::ArrayRef<const Node*> nodes);
 
   static std::string PostOrderToText(
-      c10::ArrayRef<Node*> post_order,
-      c10::ArrayRef<Node*> roots);
+      c10::ArrayRef<const Node*> post_order,
+      c10::ArrayRef<const Node*> roots);
 
   static std::string ToBackend(
       c10::ArrayRef<Value> values,
diff --git a/torch/csrc/lazy/core/ir_util.cpp b/torch/csrc/lazy/core/ir_util.cpp
index 2d463bb99d5f5..b2a2a8ecfa20a 100644
--- a/torch/csrc/lazy/core/ir_util.cpp
+++ b/torch/csrc/lazy/core/ir_util.cpp
@@ -5,13 +5,12 @@
 namespace torch {
 namespace lazy {
 
-std::vector<Node*> Util::ComputePostOrder(const Node* node, EmissionMap* emap) {
-  std::vector<Node*> post_order;
-  std::vector<Node*> queue;
-  // std::vector<const T> to c10::ArrayRef<T> conversion is not supported,
-  // so we need to drop const in the return vector and use const_cast here.
-  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-  queue.push_back(const_cast<Node*>(node));
+std::vector<const Node*> Util::ComputePostOrder(
+    const Node* node,
+    EmissionMap* emap) {
+  std::vector<const Node*> post_order;
+  std::vector<const Node*> queue;
+  queue.push_back(node);
   while (!queue.empty()) {
     node = queue.back();
     auto it = emap->find(node);
@@ -20,8 +19,7 @@ std::vector<Node*> Util::ComputePostOrder(const Node* node, EmissionMap* emap) {
       for (auto& output : node->operands()) {
         auto oit = emap->find(output.node);
         if (oit == emap->end()) {
-          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-          queue.push_back(const_cast<Node*>(output.node));
+          queue.push_back(output.node);
         } else {
           TORCH_CHECK(
               oit->second != kEmitting,
@@ -38,8 +36,7 @@ std::vector<Node*> Util::ComputePostOrder(const Node* node, EmissionMap* emap) {
             output.node->ToString());
       }
       (*emap)[node] = kEmitted;
-      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-      post_order.push_back(const_cast<Node*>(node));
+      post_order.push_back(node);
       queue.pop_back();
     } else {
       TORCH_CHECK(it->second == kEmitted);
@@ -49,10 +46,10 @@ std::vector<Node*> Util::ComputePostOrder(const Node* node, EmissionMap* emap) {
   return post_order;
 }
 
-std::vector<Node*> Util::ComputePostOrder(
-    c10::ArrayRef<Node*> nodes,
+std::vector<const Node*> Util::ComputePostOrder(
+    c10::ArrayRef<const Node*> nodes,
     EmissionMap* emap) {
-  std::vector<Node*> post_order;
+  std::vector<const Node*> post_order;
   for (auto node : nodes) {
     auto node_post_order = ComputePostOrder(node, emap);
     post_order.insert(
@@ -61,12 +58,13 @@ std::vector<Node*> Util::ComputePostOrder(
   return post_order;
 }
 
-std::vector<Node*> Util::ComputePostOrder(c10::ArrayRef<Node*> nodes) {
+std::vector<const Node*> Util::ComputePostOrder(
+    c10::ArrayRef<const Node*> nodes) {
   EmissionMap emap;
   return ComputePostOrder(nodes, &emap);
 }
 
-size_t Util::GetGraphSize(c10::ArrayRef<Node*> nodes) {
+size_t Util::GetGraphSize(c10::ArrayRef<const Node*> nodes) {
   return ComputePostOrder(nodes).size();
 }
 
diff --git a/torch/csrc/lazy/core/ir_util.h b/torch/csrc/lazy/core/ir_util.h
index a95b1a523bfa9..df3d0fd7ac406 100644
--- a/torch/csrc/lazy/core/ir_util.h
+++ b/torch/csrc/lazy/core/ir_util.h
@@ -25,21 +25,22 @@ class TORCH_API Util {
   // this API. The returned post-order can be empty if the node has already been
   // emitted inside the emission map. An error is generated if a loop is
   // detected.
-  static std::vector<Node*> ComputePostOrder(
+  static std::vector<const Node*> ComputePostOrder(
       const Node* node,
       EmissionMap* emap);
 
-  static std::vector<Node*> ComputePostOrder(
-      c10::ArrayRef<Node*> nodes,
+  static std::vector<const Node*> ComputePostOrder(
+      c10::ArrayRef<const Node*> nodes,
       EmissionMap* emap);
 
   // Same as above, but computes the post order on the set of nodes specified as
   // argument.
-  static std::vector<Node*> ComputePostOrder(c10::ArrayRef<Node*> nodes);
+  static std::vector<const Node*> ComputePostOrder(
+      c10::ArrayRef<const Node*> nodes);
 
   // Retrieves the number of nodes within the graph whose sink are passed in the
   // nodes argument.
-  static size_t GetGraphSize(c10::ArrayRef<Node*> nodes);
+  static size_t GetGraphSize(c10::ArrayRef<const Node*> nodes);
 };
 
 } // namespace lazy
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp
index 4989ce24a0ef1..1201971f3bc2d 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.cpp
+++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp
@@ -721,7 +721,7 @@ std::vector<BackendDataPtr> LazyGraphExecutor::FetchTensorData(
 LazyGraphExecutor::PostOrderData LazyGraphExecutor::RunPostOrder(
     const std::vector<LazyTensorPtr>& tensors,
     SyncTensorCollection* coll) {
-  std::vector<Node*> roots;
+  std::vector<const Node*> roots;
   roots.reserve(coll->indices.size());
   for (auto index : coll->indices) {
     Value ir_value = tensors.at(index)->CurrentIrValue();
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.h b/torch/csrc/lazy/core/lazy_graph_executor.h
index b7e10374fbb76..9894295f3b32a 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.h
+++ b/torch/csrc/lazy/core/lazy_graph_executor.h
@@ -158,7 +158,7 @@ class TORCH_API LazyGraphExecutor {
   };
 
   struct PostOrderData {
-    std::vector<Node*> post_order;
+    std::vector<const Node*> post_order;
     Util::EmissionMap emission_map;
     std::vector<BackendDataPtr> parameters_data;
     std::vector<size_t> parameter_sequence;
diff --git a/torch/csrc/lazy/python/init.cpp b/torch/csrc/lazy/python/init.cpp
index 2d421a3eb2ae7..774df68e26def 100644
--- a/torch/csrc/lazy/python/init.cpp
+++ b/torch/csrc/lazy/python/init.cpp
@@ -42,9 +42,9 @@ std::ptrdiff_t GetTensorId(const at::Tensor& tensor) {
 
 std::string GetTensorsDump(
     const std::vector<at::Tensor>& tensors,
-    const std::function<std::string(c10::ArrayRef<torch::lazy::Node*>)>&
+    const std::function<std::string(c10::ArrayRef<const torch::lazy::Node*>)>&
         coverter) {
-  std::vector<torch::lazy::Node*> nodes;
+  std::vector<const torch::lazy::Node*> nodes;
   std::vector<torch::lazy::Value> values;
   for (auto& tensor : tensors) {
     auto inner = at::functionalization::impl::from_functional_tensor(tensor);
@@ -142,7 +142,7 @@ void initLazyBindings(PyObject* module) {
   lazy.def(
       "_get_tensors_text",
       [](const std::vector<at::Tensor>& tensors) -> std::string {
-        auto coverter = [](c10::ArrayRef<torch::lazy::Node*> nodes) {
+        auto coverter = [](c10::ArrayRef<const torch::lazy::Node*> nodes) {
           return torch::lazy::DumpUtil::ToText(nodes);
         };
         return GetTensorsDump(tensors, coverter);
@@ -150,7 +150,7 @@ void initLazyBindings(PyObject* module) {
   lazy.def(
       "_get_tensors_dot",
       [](const std::vector<at::Tensor>& tensors) -> std::string {
-        auto coverter = [](c10::ArrayRef<torch::lazy::Node*> nodes) {
+        auto coverter = [](c10::ArrayRef<const torch::lazy::Node*> nodes) {
           return torch::lazy::DumpUtil::ToDot(nodes);
         };
         return GetTensorsDump(tensors, coverter);
@@ -222,7 +222,7 @@ void initLazyBindings(PyObject* module) {
       [](const std::vector<at::Tensor>& tensors)
           -> std::pair<std::vector<int64_t>, std::vector<at::IValue>> {
 #if !(defined(FBCODE_CAFFE2) || defined(OVRSOURCE))
-        std::vector<Node*> roots;
+        std::vector<const Node*> roots;
         for (auto& tensor : tensors) {
           auto xtensor = TryGetLtcTensor(tensor);
           roots.push_back(xtensor->GetIrValue().node.get());
diff --git a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp
index 4003a005fbfab..488dd9f24d9d9 100644
--- a/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_backend_impl.cpp
@@ -61,7 +61,7 @@ class TSBackendImpl : public torch::lazy::BackendImplInterface {
   std::unique_ptr<torch::lazy::LoweringContext> CreateLoweringContext(
       const std::string& name,
       torch::lazy::BackendDevice device,
-      c10::ArrayRef<torch::lazy::Node*> post_order,
+      c10::ArrayRef<const torch::lazy::Node*> post_order,
       torch::lazy::Util::EmissionMap emit_status) const override {
     return std::make_unique<torch::lazy::TSLoweringContext>(
         name, device, post_order, emit_status);
@@ -113,8 +113,9 @@ class TSBackendImpl : public torch::lazy::BackendImplInterface {
     return std::make_shared<TSData>(scalar, device);
   }
 
-  torch::lazy::BackendDataPtr GetComputationDataFromNode(Node* node) const {
-    auto* device_data_node = dynamic_cast<DeviceData*>(node);
+  torch::lazy::BackendDataPtr GetComputationDataFromNode(
+      const Node* node) const {
+    auto* device_data_node = DeviceData::Cast(node);
     if (!device_data_node) {
       return nullptr;
     }
diff --git a/torch/csrc/lazy/ts_backend/ts_lowering_context.cpp b/torch/csrc/lazy/ts_backend/ts_lowering_context.cpp
index ff3d1aa07b78e..ad1cac4870f51 100644
--- a/torch/csrc/lazy/ts_backend/ts_lowering_context.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_lowering_context.cpp
@@ -17,7 +17,7 @@ TSLoweringContext::TSLoweringContext(
 TSLoweringContext::TSLoweringContext(
     const std::string& name,
     BackendDevice device,
-    c10::ArrayRef<Node*> post_order,
+    c10::ArrayRef<const Node*> post_order,
     Util::EmissionMap emit_status)
     : torch::lazy::LoweringContext(name, device, post_order, emit_status),
       graph_(std::make_shared<torch::jit::Graph>()),
diff --git a/torch/csrc/lazy/ts_backend/ts_lowering_context.h b/torch/csrc/lazy/ts_backend/ts_lowering_context.h
index 700f27d505fd3..0ad2b669c0e6b 100644
--- a/torch/csrc/lazy/ts_backend/ts_lowering_context.h
+++ b/torch/csrc/lazy/ts_backend/ts_lowering_context.h
@@ -71,7 +71,7 @@ class TORCH_API TSLoweringContext : public LoweringContext {
   TSLoweringContext(
       const std::string& name,
       BackendDevice device,
-      c10::ArrayRef<Node*> post_order,
+      c10::ArrayRef<const Node*> post_order,
       Util::EmissionMap emit_status);
 
   size_t AddResult(const Output& output) override {

From 7440a02eca4899023c0abd34b28247fb1e07a3b1 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Thu, 10 Nov 2022 19:22:09 +0000
Subject: [PATCH 0751/1922] Add warnings & regressions info text (#88837)

Add text about what warnings and accuracy regressions dropdowns mean.

Sample: https://github.com/pytorch/torchdynamo/issues/1831#issuecomment-1310770285

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88837
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 9c0538368b449..99c70426cd36e 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -707,7 +707,12 @@ def get_metric_title(self, metric):
 
     def generate_warnings(self):
         title = "## Warnings ##"
-        body = ""
+        body = (
+            "We flag models where:\n\n"
+            " - speedup < 0.95x\n"
+            " - compilation latency > 120 sec.\n"
+            " - compression ratio < 0.9\n\n"
+        )
         for metric in [
             "speedup",
             "compilation_latency",
@@ -858,9 +863,14 @@ def find_last_2(self, suite, device, dtype, compiler):
 
     def generate_comment(self):
         title = "## Accuracy Regressions ##\n"
-        body = ""
+        body = (
+            "For each relevant compiler, we compare the most recent 2 reports "
+            "(that run actually the compiler) to find models where previously "
+            "successful accuracy tests now fail.\n\n"
+        )
         dtype = self.args.dtypes[0]
         device = self.args.devices[0]
+        regressions_present = False
         for suite in self.args.suites:
             dfs = []
             for compiler in self.args.flag_compilers:
@@ -893,6 +903,7 @@ def generate_comment(self):
             df = pd.concat(dfs, axis=0)
             if df.empty:
                 continue
+            regressions_present = True
             tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never")
             str_io = io.StringIO()
             str_io.write("\n")
@@ -902,6 +913,9 @@ def generate_comment(self):
             str_io.write("~~~\n")
             body += str_io.getvalue()
 
+        if not regressions_present:
+            body += "No accuracy regressions found.\n"
+
         comment = generate_dropdown_comment(title, body)
 
         with open(f"{self.args.output_dir}/gh_accuracy_regression.txt", "w") as gh_fh:

From d9e7fdf79b919deec8565cd700492fdc0bdd92c6 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Thu, 10 Nov 2022 06:31:46 -0800
Subject: [PATCH 0752/1922] [14/N] Refactor _new_process_group_helper() to
 remove repeated code (#88351)

Changes:
- refactor parts of `_new_process_group_helper()` to remove repeated code

Differential Revision: [D41188274](https://our.internmc.facebook.com/intern/diff/D41188274)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88351
Approved by: https://github.com/kwen2501
---
 torch/distributed/distributed_c10d.py | 92 ++++++++-------------------
 1 file changed, 25 insertions(+), 67 deletions(-)

diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 41d0ee21d3e34..4a132d141e00a 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -925,8 +925,6 @@ def _new_process_group_helper(
         pg = ProcessGroupMPI.create(global_ranks_in_group)
         if not pg:
             return GroupMember.NON_GROUP_MEMBER
-        _world.pg_map[pg] = (Backend.MPI, None)
-        _world.pg_names[pg] = group_name
     else:
         # If this is a subgroup (which means group_ranks is specified),
         # we check if the current process is a member of the new group.
@@ -943,27 +941,6 @@ def _new_process_group_helper(
             if pg_options is not None:
                 raise RuntimeError("GLOO options not supported")
             pg = ProcessGroupGloo(prefix_store, group_rank, group_size, timeout=timeout)
-            # In debug mode and if GLOO is available, wrap in a wrapper PG that
-            # enables enhanced collective checking for debugability.
-            if get_debug_level() == DebugLevel.DETAIL:
-                if not _GLOO_AVAILABLE:
-                    logger.info(
-                        """TORCH_DISTRIBUTED_DEBUG was set to DETAIL, but
-                                GLOO is not available. Build with Gloo to
-                                create a wrapper process group in debug mode
-                                to aid collective desynchronization debugging."""
-                    )
-                else:
-                    pg = _create_process_group_wrapper(
-                        wrapped_pg=pg,
-                        store_prefix=group_name,
-                        store=store,
-                        rank=group_rank,
-                        world_size=group_size,
-                        timeout=timeout,
-                    )
-            _world.pg_map[pg] = (Backend.GLOO, store)
-            _world.pg_names[pg] = group_name
         elif backend == Backend.NCCL:
             if not is_nccl_available():
                 raise RuntimeError("Distributed package doesn't have NCCL " "built in")
@@ -978,54 +955,12 @@ def _new_process_group_helper(
                 pg_options._timeout = timeout
 
             pg = ProcessGroupNCCL(prefix_store, group_rank, group_size, pg_options)
-            # In debug mode and if GLOO is available, wrap in a wrapper PG that
-            # enables enhanced collective checking for debugability.
-            if get_debug_level() == DebugLevel.DETAIL:
-                if not _GLOO_AVAILABLE:
-                    logger.info(
-                        """TORCH_DISTRIBUTED_DEBUG was set to DETAIL, but
-                                GLOO is not available. Build with Gloo to
-                                create a wrapper process group in debug mode
-                                to aid collective desynchronization debugging."""
-                    )
-                else:
-                    pg = _create_process_group_wrapper(
-                        wrapped_pg=pg,
-                        store_prefix=group_name,
-                        store=store,
-                        rank=group_rank,
-                        world_size=group_size,
-                        timeout=timeout,
-                    )
-            _world.pg_map[pg] = (Backend.NCCL, store)
-            _world.pg_names[pg] = group_name
         elif backend == Backend.UCC and is_ucc_available():
             # TODO: once UCC plugin is fully deprecated, remove
             # is_ucc_available() from above elif-condition and raise
             # RuntimeError if is_ucc_available() returns false.
 
             pg = ProcessGroupUCC(prefix_store, group_rank, group_size, timeout=timeout)
-            # In debug mode and if GLOO is available, wrap in a wrapper PG that
-            # enables enhanced collective checking for debugability.
-            if get_debug_level() == DebugLevel.DETAIL:
-                if not _GLOO_AVAILABLE:
-                    logger.info(
-                        """TORCH_DISTRIBUTED_DEBUG was set to DETAIL, but
-                                GLOO is not available. Build with Gloo to
-                                create a wrapper process group in debug mode
-                                to aid collective desynchronization debugging."""
-                    )
-                else:
-                    pg = _create_process_group_wrapper(
-                        wrapped_pg=pg,
-                        store_prefix=group_name,
-                        store=store,
-                        rank=group_rank,
-                        world_size=group_size,
-                        timeout=timeout,
-                    )
-            _world.pg_map[pg] = (Backend.UCC, store)
-            _world.pg_names[pg] = group_name
         else:
             assert backend.upper() in Backend._plugins, (
                 f"Unknown c10d backend type {backend.upper()}"
@@ -1047,9 +982,32 @@ def _new_process_group_helper(
                 dist_backend_opts.global_ranks_in_group = global_ranks_in_group
 
                 pg = creator_fn(dist_backend_opts, pg_options)
-            _world.pg_map[pg] = (backend, store)
-            _world.pg_names[pg] = group_name
 
+    # Process group wrapper initialization for supported PGs when TORCH_DISTRIBUTED_DEBUG is set
+    if backend in [Backend.GLOO, Backend.NCCL, Backend.UCC]:
+        # In debug mode and if GLOO is available, wrap in a wrapper PG that
+        # enables enhanced collective checking for debuggability.
+        if get_debug_level() == DebugLevel.DETAIL:
+            if not _GLOO_AVAILABLE:
+                logger.info(
+                    """TORCH_DISTRIBUTED_DEBUG was set to DETAIL, but
+                            GLOO is not available. Build with Gloo to
+                            create a wrapper process group in debug mode
+                            to aid collective desynchronization debugging."""
+                )
+            else:
+                pg = _create_process_group_wrapper(
+                    wrapped_pg=pg,
+                    store_prefix=group_name,
+                    store=store,
+                    rank=group_rank,
+                    world_size=group_size,
+                    timeout=timeout,
+                )
+
+    # update global state
+    _world.pg_map[pg] = (backend, store)
+    _world.pg_names[pg] = group_name
     return pg
 
 
From f5d8386be0f4f39e8c052436dd43e2d4193a0b93 Mon Sep 17 00:00:00 2001
From: Felix Divo <4403130+felixdivo@users.noreply.github.com>
Date: Thu, 10 Nov 2022 19:29:29 +0000
Subject: [PATCH 0753/1922] Bring Unfold/Fold param doc order in line with code
 (#88819)

Now the first parameter (if used as a positional argument) is the first that is listed in the docs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88819
Approved by: https://github.com/ngimel
---
 torch/nn/modules/fold.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torch/nn/modules/fold.py b/torch/nn/modules/fold.py
index 5380cf155c907..a7b1f758dd5a3 100644
--- a/torch/nn/modules/fold.py
+++ b/torch/nn/modules/fold.py
@@ -50,13 +50,13 @@ class Fold(Module):
         output_size (int or tuple): the shape of the spatial dimensions of the
                                     output (i.e., ``output.sizes()[2:]``)
         kernel_size (int or tuple): the size of the sliding blocks
-        stride (int or tuple): the stride of the sliding blocks in the input
-                               spatial dimensions. Default: 1
-        padding (int or tuple, optional): implicit zero padding to be added on
-                                          both sides of input. Default: 0
         dilation (int or tuple, optional): a parameter that controls the
                                            stride of elements within the
                                            neighborhood. Default: 1
+        padding (int or tuple, optional): implicit zero padding to be added on
+                                          both sides of input. Default: 0
+        stride (int or tuple): the stride of the sliding blocks in the input
+                               spatial dimensions. Default: 1
 
     * If :attr:`output_size`, :attr:`kernel_size`, :attr:`dilation`,
       :attr:`padding` or :attr:`stride` is an int or a tuple of length 1 then
@@ -192,13 +192,13 @@ class Unfold(Module):
 
     Args:
         kernel_size (int or tuple): the size of the sliding blocks
-        stride (int or tuple, optional): the stride of the sliding blocks in the input
-                                         spatial dimensions. Default: 1
-        padding (int or tuple, optional): implicit zero padding to be added on
-                                          both sides of input. Default: 0
         dilation (int or tuple, optional): a parameter that controls the
                                            stride of elements within the
                                            neighborhood. Default: 1
+        padding (int or tuple, optional): implicit zero padding to be added on
+                                          both sides of input. Default: 0
+        stride (int or tuple, optional): the stride of the sliding blocks in the input
+                                         spatial dimensions. Default: 1
 
     * If :attr:`kernel_size`, :attr:`dilation`, :attr:`padding` or
       :attr:`stride` is an int or a tuple of length 1, their values will be

From 3e22187f27a31648c1c6028c6e577bf89e3328b0 Mon Sep 17 00:00:00 2001
From: erjia <erjia@fb.com>
Date: Thu, 10 Nov 2022 19:54:19 +0000
Subject: [PATCH 0754/1922] [DataPipe] Deprecating drop_empty_batches from
 Filter and other functional APIs (#88693)

- Deprecating based on https://github.com/pytorch/data/issues/163

Corresponding PRs from TorchData: https://github.com/pytorch/data/pull/890
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88693
Approved by: https://github.com/NivekT
---
 torch/utils/data/datapipes/iter/selecting.py | 43 +++++---------------
 torch/utils/data/datapipes/utils/common.py   | 16 +-------
 2 files changed, 12 insertions(+), 47 deletions(-)

diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py
index 2ba91b36fffb5..470d2952241f5 100644
--- a/torch/utils/data/datapipes/iter/selecting.py
+++ b/torch/utils/data/datapipes/iter/selecting.py
@@ -1,11 +1,10 @@
-from typing import Callable, Iterator, Optional, TypeVar
+from typing import Callable, Iterator, Tuple, TypeVar
 
 from torch.utils.data.datapipes._decorator import functional_datapipe
 from torch.utils.data.datapipes.datapipe import IterDataPipe
 from torch.utils.data.datapipes.dataframe import dataframe_wrapper as df_wrapper
 from torch.utils.data.datapipes.utils.common import (
     _check_unpickable_fn,
-    _deprecation_warning,
     StreamWrapper,
     validate_input_col
 )
@@ -13,6 +12,7 @@
 
 __all__ = ["FilterIterDataPipe", ]
 
+T = TypeVar('T')
 T_co = TypeVar('T_co', covariant=True)
 
 
@@ -24,7 +24,6 @@ class FilterIterDataPipe(IterDataPipe[T_co]):
     Args:
         datapipe: Iterable DataPipe being filtered
         filter_fn: Customized function mapping an element to a boolean.
-        drop_empty_batches (Deprecated): By default, drops a batch if it is empty after filtering instead of keeping an empty list
         input_col: Index or indices of data which ``filter_fn`` is applied, such as:
 
             - ``None`` as default to apply ``filter_fn`` to the data directly.
@@ -41,15 +40,13 @@ class FilterIterDataPipe(IterDataPipe[T_co]):
         >>> list(filter_dp)
         [0, 2, 4]
     """
-    datapipe: IterDataPipe
+    datapipe: IterDataPipe[T_co]
     filter_fn: Callable
-    drop_empty_batches: bool
 
     def __init__(
         self,
-        datapipe: IterDataPipe,
+        datapipe: IterDataPipe[T_co],
         filter_fn: Callable,
-        drop_empty_batches: Optional[bool] = None,
         input_col=None,
     ) -> None:
         super().__init__()
@@ -58,17 +55,6 @@ def __init__(
         _check_unpickable_fn(filter_fn)
         self.filter_fn = filter_fn  # type: ignore[assignment]
 
-        if drop_empty_batches is None:
-            drop_empty_batches = True
-        else:
-            _deprecation_warning(
-                type(self).__name__,
-                deprecation_version="1.12",
-                removal_version="1.14",
-                old_argument_name="drop_empty_batches",
-            )
-        self.drop_empty_batches = drop_empty_batches
-
         self.input_col = input_col
         validate_input_col(filter_fn, input_col)
 
@@ -83,13 +69,13 @@ def _apply_filter_fn(self, data) -> bool:
 
     def __iter__(self) -> Iterator[T_co]:
         for data in self.datapipe:
-            filtered = self._returnIfTrue(data)
-            if self._isNonEmpty(filtered):
+            condition, filtered = self._returnIfTrue(data)
+            if condition:
                 yield filtered
             else:
                 StreamWrapper.close_streams(data)
 
-    def _returnIfTrue(self, data):
+    def _returnIfTrue(self, data: T) -> Tuple[bool, T]:
         condition = self._apply_filter_fn(data)
 
         if df_wrapper.is_column(condition):
@@ -99,18 +85,11 @@ def _returnIfTrue(self, data):
                 if mask:
                     result.append(df_wrapper.get_item(data, idx))
             if len(result):
-                return df_wrapper.concat(result)
+                return True, df_wrapper.concat(result)
             else:
-                return None
+                return False, None  # type: ignore[return-value]
 
         if not isinstance(condition, bool):
             raise ValueError("Boolean output is required for `filter_fn` of FilterIterDataPipe, got", type(condition))
-        if condition:
-            return data
-
-    def _isNonEmpty(self, data):
-        if df_wrapper.is_dataframe(data):
-            return True
-        r = data is not None and \
-            not (isinstance(data, list) and len(data) == 0 and self.drop_empty_batches)
-        return r
+
+        return condition, data
diff --git a/torch/utils/data/datapipes/utils/common.py b/torch/utils/data/datapipes/utils/common.py
index 20c61c0ead11a..75d9a5cf173c4 100644
--- a/torch/utils/data/datapipes/utils/common.py
+++ b/torch/utils/data/datapipes/utils/common.py
@@ -227,21 +227,7 @@ def validate_pathname_binary_tuple(data: Tuple[str, IOBase]):
 
 
 # Deprecated function names and its corresponding DataPipe type and kwargs for the `_deprecation_warning` function
-_iter_deprecated_functional_names: Dict[str, Dict] = {"open_file_by_fsspec":
-                                                      {"old_class_name": "FSSpecFileOpener",
-                                                       "deprecation_version": "0.4.0",
-                                                       "removal_version": "0.6.0",
-                                                       "old_functional_name": "open_file_by_fsspec",
-                                                       "new_functional_name": "open_files_by_fsspec",
-                                                       "deprecate_functional_name_only": True},
-                                                      "open_file_by_iopath":
-                                                      {"old_class_name": "IoPathFileOpener",
-                                                       "deprecation_version": "0.4.0",
-                                                       "removal_version": "0.6.0",
-                                                       "old_functional_name": "open_file_by_iopath",
-                                                       "new_functional_name": "open_files_by_iopath",
-                                                       "deprecate_functional_name_only": True}}
-
+_iter_deprecated_functional_names: Dict[str, Dict] = {}
 _map_deprecated_functional_names: Dict[str, Dict] = {}
 
 
From 64de288f101e9a1e1ca9224a1d1108066ed6b8e8 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 10 Nov 2022 20:56:30 +0000
Subject: [PATCH 0755/1922] Revert "[Inductor] Build FX Linear + Permute
 Vertical Fusion in Inductor (#88566)"

This reverts commit 48b58930cbfa725ac25a9303d496c76bf983574d.

Reverted https://github.com/pytorch/pytorch/pull/88566 on behalf of https://github.com/huydhn due to This change breaks trunk https://hud.pytorch.org/pytorch/pytorch/commit/48b58930cbfa725ac25a9303d496c76bf983574d
---
 test/inductor/test_torchinductor.py | 106 ---------------
 torch/_inductor/config.py           |   3 -
 torch/_inductor/overrides.py        | 199 ----------------------------
 3 files changed, 308 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 064f04291a8e7..db6c5dfc2bd15 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -10,7 +10,6 @@
 import typing
 import unittest
 import weakref
-from typing import Any, Callable
 from unittest.mock import patch
 
 import torch
@@ -19,7 +18,6 @@
 from torch._dynamo.debug_utils import same_two_models
 from torch._dynamo.testing import rand_strided, same
 from torch.fx.experimental.proxy_tensor import make_fx
-from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import (
     IS_FBCODE,
@@ -42,14 +40,6 @@
     from torch._inductor import codecache, config, metrics
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
-    from torch._inductor.overrides import (
-        linear_permute_fusion,
-        linear_transpose,
-        permute_linear_fusion,
-        permute_matmul_fusion,
-        transpose_linear,
-        transpose_matmul,
-    )
     from torch._inductor.sizevars import SizeVarAllocator
     from torch._inductor.utils import has_torchvision_roi_align, has_triton, timed
 
@@ -139,29 +129,6 @@ def maybe_test(*args, **kwargs):
     return wrap_test
 
 
-PassFunc = Callable[[torch.fx.GraphModule, Any], torch.fx.GraphModule]
-
-
-def chain_passes(*passes: PassFunc) -> PassFunc:
-    def parent_pass(module: torch.fx.GraphModule, input: Any) -> torch.fx.GraphModule:
-        for pass_ in passes:
-            if isinstance(module, torch.fx.GraphModule):
-                ShapeProp(module).propagate(*input)
-            module = pass_(module)
-        return module
-
-    return parent_pass
-
-
-def count_call_function(module: torch.fx.GraphModule, target_op: Any) -> int:
-    return sum(
-        [
-            1 if (n.op == "call_function" and n.target == target_op) else 0
-            for n in module.graph.nodes
-        ]
-    )
-
-
 class TestCase(TorchTestCase):
     @classmethod
     def setUpClass(cls):
@@ -1615,79 +1582,6 @@ def fn(a, b):
         y = torch.tensor(0)
         self.assertEqual(fn(x, y), x + x)
 
-    def test_linear_permute_fusion(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self, k: int, n: int):
-                super().__init__()
-                self.weight = torch.nn.Parameter(torch.randn(n, k))
-                self.bias = torch.nn.Parameter(torch.randn(n))
-
-            def forward(self, input: torch.Tensor):
-                a0 = torch.nn.functional.linear(input, self.weight, self.bias)
-                b0 = a0.permute(0, 2, 1)
-                return b0
-
-        m, k, n = 16, 8, 4
-        trace_func = chain_passes(torch.fx.symbolic_trace, linear_permute_fusion)
-        module = TestModule(k, n).eval()
-        input = torch.randn(6, m, k)
-        traced = trace_func(module, [input])
-        num_linear = count_call_function(traced, torch.nn.functional.linear)
-        num_linear_transpose = count_call_function(traced, linear_transpose)
-        self.assertEqual(num_linear, 0)
-        self.assertEqual(num_linear_transpose, 1)
-
-        self.assertTrue(torch.allclose(module(input), traced(input)))
-
-    def test_permute_linear_fusion(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self, k: int, n: int):
-                super().__init__()
-                self.weight = torch.nn.Parameter(torch.randn(n, k))
-                self.bias = torch.nn.Parameter(torch.randn(n))
-
-            def forward(self, input: torch.Tensor):
-                input1 = input.permute(0, 2, 1)
-                output = torch.nn.functional.linear(input1, self.weight, self.bias)
-                return output
-
-        m, k, n = 16, 8, 4
-
-        trace_func = chain_passes(torch.fx.symbolic_trace, permute_linear_fusion)
-        module = TestModule(k, n).eval()
-        input = torch.randn(6, k, m)
-        traced = trace_func(module, [input])
-        num_linear = count_call_function(traced, torch.nn.functional.linear)
-        num_transpose_linear = count_call_function(traced, transpose_linear)
-        self.assertEqual(num_linear, 0)
-        self.assertEqual(num_transpose_linear, 1)
-
-        self.assertTrue(torch.allclose(module(input), traced(input)))
-
-    def test_permute_bmm_fusion(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self, batch: int, k: int, n: int):
-                super().__init__()
-                self.other = torch.randn(batch, k, n)
-
-            def forward(self, input: torch.Tensor):
-                input1 = input.permute(0, 2, 1)
-                output = torch.bmm(input1, self.other)
-                return output
-
-        batch, m, k, n = 6, 16, 8, 4
-
-        trace_func = chain_passes(torch.fx.symbolic_trace, permute_matmul_fusion)
-        module = TestModule(batch, k, n).eval()
-        input = torch.randn(batch, k, m)
-        traced = trace_func(module, [input])
-        num_bmm = count_call_function(traced, torch.bmm)
-        num_transpose_matmul = count_call_function(traced, transpose_matmul)
-        self.assertEqual(num_bmm, 0)
-        self.assertEqual(num_transpose_matmul, 1)
-
-        self.assertTrue(torch.allclose(module(input), traced(input)))
-
     def test_slice1(self):
         def fn(a):
             return (
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index c9b7623cf5287..910e6d20b4d6f 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -67,9 +67,6 @@
 # How to import torchdynamo, either torchdynamo or torch.dynamo
 dynamo_import = inductor_import.replace("inductor", "dynamo")
 
-# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
-permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"
-
 
 # config specific to codegen/cpp.pp
 class cpp:
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 69a5bc6710f8c..581e1996a436c 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -19,8 +19,6 @@
 from torch.nn.utils.fusion import fuse_conv_bn_eval
 from torch.overrides import TorchFunctionMode
 
-from . import config
-
 log = logging.getLogger(__name__)
 
 
@@ -315,14 +313,6 @@ def check_node_is_binary(node):
 
 
 def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
-    if config.permute_fusion:
-        # For linear permute fusion, we need to check input info to identify
-        # and perform proper permutation/transpose
-        ShapeProp(gm).propagate(*example_inputs)
-        gm = linear_permute_fusion(gm)
-        gm = permute_linear_fusion(gm)
-        gm = permute_matmul_fusion(gm)
-
     # make sure the autograd is disabled.
     if torch.is_grad_enabled():
         return gm
@@ -418,195 +408,6 @@ def _philox_rand_like(input, seed, offset):
     return torch.rand_like(input)
 
 
-class NormalizedLinearNode:
-    def __init__(self, node: torch.fx.Node) -> None:
-        assert node.op == "call_function"
-        assert node.target in [torch.nn.functional.linear]
-        self.node: torch.fx.Node = node
-
-    def get_input(self) -> torch.fx.Node:
-        if len(self.node.args) > 0:
-            return self.node.args[0]
-        else:
-            return self.node.kwargs["input"]
-
-    def get_weight(self) -> torch.fx.Node:
-        if len(self.node.args) > 1:
-            return self.node.args[1]
-        else:
-            return self.node.kwargs["weight"]
-
-    def get_bias(self) -> torch.fx.Node:
-        if len(self.node.args) > 2:
-            return self.node.args[2]
-        else:
-            return self.node.kwargs["bias"]
-
-
-class NormalizedMatmulNode:
-    def __init__(self, node: torch.fx.Node) -> None:
-        assert node.op == "call_function"
-        assert node.target in [torch.bmm, torch.matmul]
-        self.node: torch.fx.Node = node
-
-    def get_input(self) -> torch.fx.Node:
-        if len(self.node.args) > 0:
-            return self.node.args[0]
-        else:
-            return self.node.kwargs["input"]
-
-    def get_other(self) -> torch.fx.Node:
-        if len(self.node.args) > 1:
-            return self.node.args[1]
-        else:
-            return self.node.kwargs["other"]
-
-
-def check_permute(node: torch.fx.Node):
-    ranks = len(node.meta["tensor_meta"].shape)
-    if len(node.args) > 3:
-        permutation = [node.args[i] % ranks for i in range(1, ranks + 1)]
-    elif (
-        "permutation" in node.kwargs
-        and node.kwargs["permutation"] is not None
-        and len(node.kwargs["permutation"]) > 2
-    ):
-        permutation = [i % ranks for i in node.kwargs["permutation"]]
-    else:
-        return False
-    allowed_permutation = list(range(ranks))
-    allowed_permutation[-1] = ranks - 2
-    allowed_permutation[-2] = ranks - 1
-    return permutation == allowed_permutation
-
-
-def linear_permute_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
-    for node in module.graph.nodes:
-        if (
-            node.op == "call_method"
-            and node.target == "permute"
-            and check_permute(node)
-        ):
-            if len(node.args) > 0:
-                input_node = node.args[0]
-            else:
-                input_node = node.kwargs["input"]
-            if (
-                input_node.op == "call_function"
-                and input_node.target == torch.nn.functional.linear
-            ):
-                normalized = NormalizedLinearNode(input_node)
-                input = normalized.get_input()
-                weight = normalized.get_weight()
-                bias = normalized.get_bias()
-                with module.graph.inserting_before(node):
-                    fused_node = module.graph.call_function(
-                        linear_transpose, args=(input, weight, bias)
-                    )
-                    node.replace_all_uses_with(fused_node)
-
-    module.graph.lint()
-    module.graph.eliminate_dead_code()
-    module.recompile()
-    return module
-
-
-# Y1 = X * W^T + bias
-# Y2 = Y1.permute(0, 2, 1)
-# ---->
-# Y2 = (W * X^T + bias.unsqueeze(-1))^T
-def linear_transpose(
-    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
-) -> torch.Tensor:
-    return torch.matmul(weight, input.transpose(-1, -2)) + bias.unsqueeze(-1)
-
-
-def permute_linear_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
-    for node in module.graph.nodes:
-        if node.op == "call_function" and node.target == torch.nn.functional.linear:
-            if len(node.args) > 0:
-                input_node = node.args[0]
-            else:
-                input_node = node.kwargs["input"]
-            if (
-                input_node.op == "call_method"
-                and input_node.target == "permute"
-                and check_permute(input_node)
-            ):
-                normalized = NormalizedLinearNode(node)
-                if len(input_node.args) > 0:
-                    input = input_node.args[0]
-                else:
-                    input = input_node.kwargs["input"]
-                weight = normalized.get_weight()
-                bias = normalized.get_bias()
-                with module.graph.inserting_before(node):
-                    fused_node = module.graph.call_function(
-                        transpose_linear, args=(input, weight, bias)
-                    )
-                    node.replace_all_uses_with(fused_node)
-
-    module.graph.lint()
-    module.graph.eliminate_dead_code()
-    module.recompile()
-    return module
-
-
-def permute_matmul_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
-    for node in module.graph.nodes:
-        if node.op == "call_function" and (
-            node.target == torch.bmm or node.target == torch.matmul
-        ):
-            normalized = NormalizedMatmulNode(node)
-            A = normalized.get_input()
-            B = normalized.get_other()
-            Atrans = Btrans = False
-            if A.op == "call_method" and A.target == "permute" and check_permute(A):
-                Atrans = True
-                if len(A.args) > 0:
-                    A = A.args[0]
-                else:
-                    A = A.kwargs["input"]
-
-            if B.op == "call_method" and B.target == "permute" and check_permute(B):
-                Btrans = True
-                if len(B.args) > 0:
-                    B = B.args[0]
-                else:
-                    B = B.kwargs["input"]
-
-            if Atrans or Btrans:
-                with module.graph.inserting_before(node):
-                    fused_node = module.graph.call_function(
-                        transpose_matmul,
-                        args=(A, B, Atrans, Btrans),
-                    )
-                node.replace_all_uses_with(fused_node)
-
-    module.graph.lint()
-    module.graph.eliminate_dead_code()
-    module.recompile()
-    return module
-
-
-# X1 = X.permute(0, 2, 1)
-# Y1 = X1 * W1^T + bias1
-# ---->
-# Y2 = X1.transpose(-1, -2) * W1^T + bias1
-def transpose_linear(
-    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
-) -> torch.Tensor:
-    return torch.matmul(input.transpose(-1, -2), weight.t()) + bias
-
-
-def transpose_matmul(A: torch.Tensor, B: torch.Tensor, Atrans: bool, Btrans: bool):
-    if Atrans:
-        A = A.transpose(-1, -2)
-    if Btrans:
-        B = B.transpose(-1, -2)
-    return torch.matmul(A, B)
-
-
 def replace_and_fuse_for_binary(
     computation_node, node, fuse_func, attr, modules, index_node, index_pointwise
 ):

From 6e791951b41dacfcd56d7b313c058863462ff946 Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Wed, 9 Nov 2022 15:31:44 -0800
Subject: [PATCH 0756/1922] [xnnpack][on-device] executor class (#88778)

# Executor Class

Executor object used to wrap our xnn_runtime object. The ideal flow of this object looks as such:

```
executor.set_inputs(vector<tensor> inputs, vector<tensor> outputs)
executor.forward()
```

This will likely be returned by our delegate compile and given over to execute in order to run inference using the xnn runtime

##### Executorch Considerations
```
#include <ATen/Functions.h>
#include <ATen/Utils.h>
```
These Aten functions are included in order to use at::Tensor when setting the inputs, this will change when used for Executorch because we will be switching from at::Tensor to whatever tensor abstraction is used for ET. Seems like they have the same call for `.data_ptr<float>()`, so realistically all logic here will be the same.

ATen/Utils is used for TORCH_CHECK. We will switch to ET_CHECK_MESSAGE for executorch.

Differential Revision: [D40733121](https://our.internmc.facebook.com/intern/diff/D40733121/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88778
Approved by: https://github.com/digantdesai
---
 .../backends/xnnpack/executor/xnn_executor.h  | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h

diff --git a/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h b/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h
new file mode 100644
index 0000000000000..f82bde231c90f
--- /dev/null
+++ b/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h
@@ -0,0 +1,69 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <xnnpack.h>
+#include <memory>
+#include <vector>
+
+namespace torch {
+namespace jit {
+namespace xnnpack {
+namespace delegate {
+
+class XNNExecutor {
+ private:
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> runtime_;
+  std::vector<uint32_t> input_ids_;
+  std::vector<uint32_t> output_ids_;
+  std::vector<xnn_external_value> externals_;
+
+ public:
+  XNNExecutor(xnn_runtime_t runtime_ptr)
+      : runtime_(runtime_ptr, xnn_delete_runtime){};
+
+  template <typename T>
+  bool set_inputs(std::vector<T*>& inputs, std::vector<T*>& outputs) {
+    externals_.clear();
+
+    if (inputs.size() != input_ids_.size()) {
+      return false;
+    }
+
+    for (int i = 0; i < inputs.size(); i++) {
+      externals_.emplace_back(xnn_external_value{input_ids_[i], inputs[i]});
+    }
+
+    if (outputs.size() != output_ids_.size()) {
+      return false;
+    }
+
+    for (int i = 0; i < outputs.size(); i++) {
+      externals_.emplace_back(xnn_external_value{output_ids_[i], outputs[i]});
+    }
+
+    return true;
+  };
+
+  bool forward() {
+    xnn_status status =
+        xnn_setup_runtime(runtime_.get(), externals_.size(), externals_.data());
+
+    if (status != xnn_status_success) {
+      return false;
+    }
+
+    status = xnn_invoke_runtime(runtime_.get());
+
+    if (status != xnn_status_success) {
+      return false;
+    }
+
+    return true;
+  };
+
+  friend class XNNCompiler;
+};
+
+} // namespace delegate
+} // namespace xnnpack
+} // namespace jit
+} // namespace torch

From f9b621ba86a7a106cad5a39cd8257aa5bbde4511 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@fb.com>
Date: Thu, 10 Nov 2022 21:04:35 +0000
Subject: [PATCH 0757/1922] sub setup.py install -> develop (#88507)

If someone is building the project from source they're likely a contributor for which develop will be much more useful. For people that want to try the latest and greatest they can leverage the nightlies

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88507
Approved by: https://github.com/malfet
---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 3a80c8083a499..bcce2997b25b6 100644
--- a/README.md
+++ b/README.md
@@ -234,7 +234,7 @@ python tools/amd_build/build_amd.py
 Install PyTorch
 ```bash
 export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
-python setup.py install
+python setup.py develop
 ```
 
 Note that if you are using [Anaconda](https://www.anaconda.com/distribution/#download-section), you may experience an error caused by the linker:
@@ -251,7 +251,7 @@ This is caused by `ld` from the Conda environment shadowing the system `ld`. You
 
 ```bash
 export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
-MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install
+MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py develop
 ```
 
 **On Windows**
@@ -274,7 +274,7 @@ In this mode PyTorch computations will run on your CPU, not your GPU
 
 ```cmd
 conda activate
-python setup.py install
+python setup.py develop
 ```
 
 Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/master/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
@@ -315,7 +315,7 @@ for /f "usebackq tokens=*" %i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\
 :: [Optional] If you want to override the CUDA host compiler
 set CUDAHOSTCXX=C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\HostX64\x64\cl.exe
 
-python setup.py install
+python setup.py develop
 
 ```
 

From 6f32d6584468c8a426228fc457e76fd759441ff9 Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Wed, 9 Nov 2022 15:33:00 -0800
Subject: [PATCH 0758/1922] [xnnpack][on-device] compiler --> executor object
 (#88779)

#### XNN Compiler Object
This is purely to abstract away the subgraph rebuild from the flatbuffer object. CompileModel return an executor object which we can use to setup inputs and run forward with.

#### Executorch Considerations
We Include ATen/utils for torch_check, this will be changed when moving to executorch

Differential Revision: [D40733163](https://our.internmc.facebook.com/intern/diff/D40733163/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88779
Approved by: https://github.com/digantdesai
---
 .../xnnpack/compiler/xnn_compiler.cpp         | 128 ++++++++++++++++++
 .../backends/xnnpack/compiler/xnn_compiler.h  |  25 ++++
 2 files changed, 153 insertions(+)
 create mode 100644 torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
 create mode 100644 torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h

diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
new file mode 100644
index 0000000000000..395d59a1cf21d
--- /dev/null
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
@@ -0,0 +1,128 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <caffe2/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h>
+#include <torch/csrc/jit/backends/xnnpack/serialization/schema_generated.h>
+
+#include <ATen/Utils.h>
+#include <unordered_set>
+
+namespace torch {
+namespace jit {
+namespace xnnpack {
+namespace delegate {
+
+XNNExecutor XNNCompiler::compileModel(std::string ser_model) {
+  const char* buffer_pointer = ser_model.data();
+
+  auto output_min = -std::numeric_limits<float>::infinity();
+  auto output_max = std::numeric_limits<float>::infinity();
+
+  auto flatbuffer_graph = fb_xnnpack::GetXNNGraph(buffer_pointer);
+  // initialize xnnpack
+  xnn_status status = xnn_initialize(/*allocator =*/nullptr);
+  TORCH_CHECK(xnn_status_success == status, "Failed to initialize xnnpack");
+
+  // create xnnpack subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+
+  // TODO: @maxren serialize extern_ids in flatbuffer schema
+  std::unordered_set<uint32_t> extern_ids;
+  for (auto input_id : *flatbuffer_graph->input_ids()) {
+    extern_ids.insert(input_id);
+  }
+  for (auto output_id : *flatbuffer_graph->output_ids()) {
+    extern_ids.insert(output_id);
+  }
+  status = xnn_create_subgraph(
+      /*external_value_ids=*/extern_ids.size(),
+      /*flags=*/0,
+      &subgraph_ptr);
+  TORCH_CHECK(xnn_status_success == status, "Failed to create xnn subgraph");
+
+  // mapping from old ids to new created value ids
+  // The old ids that were serialied were generated AoT, since
+  // we are re-defining tensor values, the defined IDs could be
+  // different from the ones generated AoT, as a result, we need
+  // a new mapping from the old ids to the newly created ones
+  std::unordered_map<uint32_t, uint32_t> remapped_ids;
+
+  for (auto value : *flatbuffer_graph->values()) {
+    switch (value->value_type()) {
+      case fb_xnnpack::ValueUnion::XNNTensorValue: {
+        auto tensor_value = value->value_as_XNNTensorValue();
+
+        const void* data_ptr = nullptr;
+        auto buffer_idx = tensor_value->constant_buffer_idx();
+        if (buffer_idx != 0) {
+          // TODO: @maxren implement data handling
+          TORCH_CHECK(false, "Cosntant data handling not yet implemented")
+        }
+        std::vector<size_t> dims_data;
+        for (auto dim : *tensor_value->dims()) {
+          dims_data.push_back(static_cast<size_t>(dim));
+        }
+
+        uint32_t id = XNN_INVALID_VALUE_ID;
+        status = xnn_define_tensor_value(
+            /*subgraph=*/subgraph_ptr,
+            /*datatype=*/xnn_datatype_fp32,
+            /*num_dims=*/tensor_value->num_dims(),
+            /*dims=*/dims_data.data(),
+            /*data=*/data_ptr,
+            /*external_id=*/tensor_value->external_id(),
+            /*flags=*/tensor_value->flags(),
+            /*id_out=*/&id);
+        TORCH_CHECK(
+            status == xnn_status_success,
+            "Failed to define tensor values in graph")
+        // map serialized id to newly generated id
+        remapped_ids.emplace(std::make_pair(tensor_value->id_out(), id));
+        break;
+      }
+      default: {
+        TORCH_CHECK(false, "Unhandled value type found in deserialization");
+      }
+    }
+  }
+
+  for (auto node : *flatbuffer_graph->nodes()) {
+    switch (node->node_type()) {
+      case fb_xnnpack::NodeUnion::XNNAdd: {
+        auto graph_node = node->node_as_XNNAdd();
+        status = xnn_define_add2(
+            subgraph_ptr,
+            output_min,
+            output_max,
+            remapped_ids.at(graph_node->input1_id()),
+            remapped_ids.at(graph_node->input2_id()),
+            remapped_ids.at(graph_node->output_id()),
+            graph_node->flags());
+        TORCH_CHECK(status == xnn_status_success, "Failed to create add node")
+        break;
+      }
+      default:
+        TORCH_CHECK(false, "Unhandled node type found in deserialization");
+    }
+  }
+
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, nullptr, 0, &runtime_ptr);
+  TORCH_CHECK(xnn_status_success == status);
+
+  XNNExecutor executor(runtime_ptr);
+
+  for (auto old_id : *flatbuffer_graph->input_ids()) {
+    executor.input_ids_.push_back(remapped_ids.at(old_id));
+  }
+
+  for (auto old_id : *flatbuffer_graph->output_ids()) {
+    executor.output_ids_.push_back(remapped_ids.at(old_id));
+  }
+
+  return executor;
+};
+
+} // namespace delegate
+} // namespace xnnpack
+} // namespace jit
+} // namespace torch
diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
new file mode 100644
index 0000000000000..99eecfdcaa45d
--- /dev/null
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
@@ -0,0 +1,25 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <caffe2/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h>
+#include <xnnpack.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace torch {
+namespace jit {
+namespace xnnpack {
+namespace delegate {
+
+class XNNCompiler {
+ public:
+  // Takes Flatbuffer Serialized XNNPack Model and rebuilds the xnn-subgraph
+  // returns an executor object that holds the xnn runtime object which we
+  // can then use to set inputs and run inference using the xnn graph.
+  static XNNExecutor compileModel(std::string ser_model);
+};
+
+} // namespace delegate
+} // namespace xnnpack
+} // namespace jit
+} // namespace torch

From d987aee3f937027fda40b1037e447fe56ca8dbb2 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Thu, 10 Nov 2022 06:56:26 -0800
Subject: [PATCH 0759/1922] [inductor] Remove import check for fast_flush
 (#88812)

https://github.com/pytorch/pytorch/pull/88557/ has a guard to make sure that triton's `do_bench` includes the `fast_flush` argument.  Since we've updated Triton to a sufficiently recent revision, we can remove that guard.

Differential Revision: [D41185280](https://our.internmc.facebook.com/intern/diff/D41185280/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88812
Approved by: https://github.com/soumith
---
 torch/_inductor/triton_ops/autotune.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index 0fbdd2d4591b6..808241cd02a2f 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -132,14 +132,9 @@ def kernel_call():
                 stream=stream,
             )
 
-        import inspect
-
         from triton.testing import do_bench
 
-        if "fast_flush" in inspect.signature(do_bench).parameters.keys():
-            return do_bench(kernel_call, rep=40, fast_flush=True)
-        else:
-            return do_bench(kernel_call, rep=40)
+        return do_bench(kernel_call, rep=40, fast_flush=True)
 
     @dynamo_utils.dynamo_timed
     def autotune_to_one_config(self, *args, **kwargs):

From dcbac8fd9830ece12fb569127d2d1416d93a114e Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Thu, 10 Nov 2022 21:32:41 +0000
Subject: [PATCH 0760/1922] Use run_test in MPS (#88829)

Run mps through run_test to get disable test infra, create xml files (which can then be used for flakiness detection), and reruns

Also added the workflow steps for uploading the xml files
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88829
Approved by: https://github.com/malfet, https://github.com/huydhn
---
 .github/workflows/_mac-test-mps.yml | 18 ++++++++++++++++--
 test/run_test.py                    | 17 ++++++++++++++---
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index 3f7ba04f3e847..24203e0051538 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -66,6 +66,7 @@ jobs:
           ${CONDA_RUN} python3 -mpip install --no-index --no-deps dist/*.whl
 
       - name: Run MPS tests
+        id: test
         env:
           ENV_NAME: conda-test-env-${{ github.run_id }}
         shell: arch -arch arm64 bash {0}
@@ -74,5 +75,18 @@ jobs:
           set -ex
           # TODO(https://github.com/pytorch/pytorch/issues/79293)
 
-          ${CONDA_RUN} --cwd test python3 test_mps.py -v
-          ${CONDA_RUN} --cwd test python3 test_metal.py -v
+          ${CONDA_RUN} python3 test/run_test.py --mps --verbose
+
+      - name: Get workflow job id
+        id: get-job-id
+        uses: ./.github/actions/get-workflow-job-id
+        if: always()
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Upload test artifacts
+        uses: ./.github/actions/upload-test-artifacts
+        if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')
+        with:
+          use-gha: true
+          file-suffix: ${{ github.job }}-mps-1-1-macos-m1-12_${{ steps.get-job-id.outputs.job-id }}
diff --git a/test/run_test.py b/test/run_test.py
index 307b83dfdcd76..59454c6aaa3f6 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -101,9 +101,6 @@ def skip_test_p(name: str) -> bool:
         'test_jit_simple',
         'test_jit_string',
         'test_kernel_launch_checks',
-        'test_metal',
-        # Right now we have a separate CI job for running MPS
-        'test_mps',
         'test_nnapi',
         'test_segment_reductions',
         'test_static_runtime',
@@ -843,6 +840,14 @@ def parse_args():
             "This requires functorch to already be installed."
         )
     )
+    parser.add_argument(
+        "--mps",
+        "--mps",
+        action="store_true",
+        help=(
+            "If this flag is present, we will only run test_mps and test_metal"
+        )
+    )
     parser.add_argument(
         "-core",
         "--core",
@@ -1052,6 +1057,12 @@ def get_selected_tests(options):
         # Exclude all functorch tests otherwise
         options.exclude.extend(FUNCTORCH_TESTS)
 
+    if options.mps:
+        selected_tests = ['test_mps', 'test_metal']
+    else:
+        # Exclude all mps tests otherwise
+        options.exclude.extend(['test_mps', 'test_metal'])
+
     # process reordering
     if options.bring_to_front:
         to_front = set(options.bring_to_front)

From d071b92082b142cd6685c934f62391b4af38e8a9 Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Wed, 9 Nov 2022 15:33:57 -0800
Subject: [PATCH 0761/1922] [xnnpack][lite-int][on-device] rebuild serialized
 modules at runtime (#88780)

This is the on-device runtime work. We modify the compile and execute from our hacky solution from before to what will actually be running at runtime.

First we rebuild our graph from the serialized flatbuffer string. We also introduce a runtime wrapper that inherits CustomClassHolder that allows us to forward along the built xnngraph runtime to our execute function

Once the subgraph object has been rebuilt by our we pass it along to the runtime wrapper for us to forward along to execute

At execute we prep the input/outputs and invoke the runtime using our runtime wrapper. Finally we forward those results to our execution

Differential Revision: [D39413031](https://our.internmc.facebook.com/intern/diff/D39413031/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D39413031/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88780
Approved by: https://github.com/digantdesai
---
 test/jit/xnnpack/test_xnnpack_delegate.py     |  2 +-
 .../backends/xnnpack/xnnpack_backend_lib.cpp  | 68 +++++++++++++++++--
 2 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/test/jit/xnnpack/test_xnnpack_delegate.py b/test/jit/xnnpack/test_xnnpack_delegate.py
index 167a049ec0ccd..997cc757e629d 100644
--- a/test/jit/xnnpack/test_xnnpack_delegate.py
+++ b/test/jit/xnnpack/test_xnnpack_delegate.py
@@ -91,7 +91,7 @@ def forward(self, x, y):
             add_module,
             {
                 "forward": {
-                    "inputs" : [sample_inputs[0], sample_inputs[1]],
+                    "inputs" : [sample_inputs[0].clone(), sample_inputs[1].clone()],
                     "outputs": [sample_output]
                 }
             }
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
index d55e89ed216fa..a5718820fc198 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
@@ -1,15 +1,27 @@
+#include <ATen/Functions.h>
 #include <ATen/Utils.h>
 #include <c10/core/TensorImpl.h>
 #include <torch/csrc/jit/backends/backend.h>
 #include <torch/csrc/jit/backends/backend_exception.h>
 
-#include <xnnpack.h>
+#include <caffe2/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h>
+#include <torch/csrc/jit/backends/xnnpack/serialization/schema_generated.h>
 
 namespace torch {
 namespace jit {
 namespace xnnpack {
 namespace delegate {
 
+class XNNModelWrapper : public CustomClassHolder {
+ public:
+  XNNExecutor executor_;
+  XNNModelWrapper(XNNExecutor executor) : executor_(std::move(executor)){};
+
+  XNNModelWrapper() = delete;
+
+  XNNModelWrapper(const XNNModelWrapper& oldObject) = delete;
+};
+
 class XNNPackBackend : public PyTorchBackendInterface {
  public:
   // Constructor.
@@ -26,9 +38,27 @@ class XNNPackBackend : public PyTorchBackendInterface {
       c10::IValue processed,
       c10::impl::GenericDict method_compile_spec) override {
     auto dict = processed.toGenericDict();
+
+    // Compiling and wrapping exeuction object
+    std::string ser_model = dict.at("ser_model").toStringRef();
+    XNNExecutor executor = XNNCompiler::compileModel(ser_model);
+
+    auto model_ptr = c10::make_intrusive<XNNModelWrapper>(std::move(executor));
+    auto runtime_handle = IValue::make_capsule(model_ptr);
+    auto wrapper = c10::static_intrusive_pointer_cast<XNNModelWrapper>(
+        runtime_handle.toCapsule());
+
+    // Packing outputs into generic dict
     c10::Dict<c10::IValue, c10::IValue> handles(
         c10::StringType::get(), c10::AnyType::get());
-    handles.insert("forward", dict);
+
+    c10::Dict<c10::IValue, c10::IValue> ret(
+        c10::StringType::get(), c10::AnyType::get());
+
+    ret.insert("runtime", runtime_handle);
+    ret.insert("output_shapes", dict.at("outputs"));
+
+    handles.insert("forward", ret);
 
     return handles;
   }
@@ -41,9 +71,39 @@ class XNNPackBackend : public PyTorchBackendInterface {
   c10::impl::GenericList execute(
       c10::IValue handle,
       c10::impl::GenericList inputs) override {
-    auto answer = handle.toGenericDict().at("Answer");
+    auto dict = handle.toGenericDict();
+    auto output_shapes = dict.at("output_shapes").toList();
+
+    auto capsule = dict.at("runtime").toCapsule();
+    auto model_wrapper =
+        c10::static_intrusive_pointer_cast<XNNModelWrapper>(capsule);
+
+    XNNExecutor& executor = model_wrapper->executor_;
+
+    std::vector<float*> input_pointers;
+    for (int i = 0; i < inputs.size(); ++i) {
+      at::IValue val = inputs.get(i);
+      TORCH_CHECK(val.isTensor(), "Non-tensor inputs not supported");
+      input_pointers.push_back(val.toTensor().data_ptr<float>());
+    }
+
+    std::vector<at::Tensor> output_tensors;
+    std::vector<float*> output_pointers;
+    output_tensors.reserve(output_shapes.size());
+    for (int i = 0; i < output_shapes.size(); i++) {
+      auto o_shape = output_shapes.get(i).toIntVector();
+      auto output = at::empty(o_shape, c10::ScalarType::Float);
+      output_tensors.push_back(output);
+      output_pointers.push_back(output.data_ptr<float>());
+    }
+
+    TORCH_CHECK(
+        executor.set_inputs(input_pointers, output_pointers),
+        "Number of inputs/outputs does not match expected number of inputs/outputs");
+    TORCH_CHECK(executor.forward(), "Failed to invoke XNNPack runtime");
 
-    return answer.toList();
+    c10::List<at::Tensor> output_list(output_tensors);
+    return c10::impl::toList(output_list);
   }
 };
 

From 5fc7ea6edf23e3a6ed5a77af4e7ae7d4a0c641f0 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 8 Nov 2022 07:59:11 -0800
Subject: [PATCH 0762/1922] [ao] fuser_method_mappings.py fixing public v
 private (#87516)

Summary: made _get_valid_patterns, _DEFAULT_PATTERN_TO_FUSER_METHOD,
_reverse3, _reverse2, _reverse_sequential_wrapper2,
_DEFAULT_OP_LIST_TO_FUSER_METHOD, _sequential_wrapper2 private

Test Plan: python test/test_public_bindings.py

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D40709281](https://our.internmc.facebook.com/intern/diff/D40709281)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87516
Approved by: https://github.com/jcaip
---
 .../ao_migration/test_quantization.py         |  2 +-
 test/quantization/core/test_backend_config.py |  4 +-
 torch/ao/quantization/__init__.py             |  5 --
 .../_common_operator_config_utils.py          | 28 ++++----
 .../backend_config/backend_config.py          |  4 +-
 .../quantization/backend_config/executorch.py |  6 +-
 .../ao/quantization/fuser_method_mappings.py  | 72 +++++++++----------
 torch/ao/quantization/fx/README.md            |  2 +-
 torch/quantization/fuser_method_mappings.py   |  2 +-
 9 files changed, 57 insertions(+), 68 deletions(-)

diff --git a/test/quantization/ao_migration/test_quantization.py b/test/quantization/ao_migration/test_quantization.py
index 52b8f631711f1..2617e7a1187d3 100644
--- a/test/quantization/ao_migration/test_quantization.py
+++ b/test/quantization/ao_migration/test_quantization.py
@@ -225,7 +225,7 @@ def test_function_import_fuser_method_mappings(self):
             "get_fuser_method",
         ]
         dict_list = [
-            "DEFAULT_OP_LIST_TO_FUSER_METHOD"
+            "_DEFAULT_OP_LIST_TO_FUSER_METHOD"
         ]
         self._test_function_import('fuser_method_mappings', function_list)
         self._test_dict_import('fuser_method_mappings', dict_list)
diff --git a/test/quantization/core/test_backend_config.py b/test/quantization/core/test_backend_config.py
index e1e7067d4135b..aa9de64824bce 100644
--- a/test/quantization/core/test_backend_config.py
+++ b/test/quantization/core/test_backend_config.py
@@ -14,7 +14,7 @@
     ObservationType,
 )
 from torch.ao.quantization.fake_quantize import FixedQParamsFakeQuantize
-from torch.ao.quantization.fuser_method_mappings import reverse_sequential_wrapper2
+from torch.ao.quantization.fuser_method_mappings import _reverse_sequential_wrapper2
 from torch.ao.quantization.fx.quantization_patterns import _default_root_node_getter
 from torch.ao.quantization.observer import default_fixed_qparams_range_0to1_observer
 
@@ -106,7 +106,7 @@ def test_dtype_config_to_dict(self):
     #  BackendPatternConfig
     # ======================
 
-    _fuser_method = reverse_sequential_wrapper2(nni.LinearReLU)
+    _fuser_method = _reverse_sequential_wrapper2(nni.LinearReLU)
 
     _num_tensor_args_to_observation_type = {
         0: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py
index abc0bd24d97b6..2e8390c1acc7b 100644
--- a/torch/ao/quantization/__init__.py
+++ b/torch/ao/quantization/__init__.py
@@ -114,7 +114,6 @@
     "get_quantized_operator",
     "get_static_quant_module_class",
     "get_unique_devices_",
-    "get_valid_patterns",
     "is_activation_post_process",
     "load_observer_state_dict",
     "no_observer_set",
@@ -132,12 +131,8 @@
     "quantize_jit",
     "quantize_qat",
     "register_activation_post_process_hook",
-    "reverse2",
-    "reverse3",
-    "reverse_sequential_wrapper2",
     "script_qconfig",
     "script_qconfig_dict",
-    "sequential_wrapper2",
     "swap_module",
     "weight_observer_range_neg_127_to_127",
 ]
diff --git a/torch/ao/quantization/backend_config/_common_operator_config_utils.py b/torch/ao/quantization/backend_config/_common_operator_config_utils.py
index bc6f678485fb6..c2f0f7227b10b 100644
--- a/torch/ao/quantization/backend_config/_common_operator_config_utils.py
+++ b/torch/ao/quantization/backend_config/_common_operator_config_utils.py
@@ -15,9 +15,9 @@
 )
 from ..fake_quantize import FixedQParamsFakeQuantize
 from ..fuser_method_mappings import (
-    reverse_sequential_wrapper2,
-    reverse2,
-    reverse3,
+    _reverse_sequential_wrapper2,
+    _reverse2,
+    _reverse3,
     fuse_conv_bn,
     fuse_conv_bn_relu,
     fuse_linear_bn,
@@ -115,13 +115,13 @@ def _get_linear_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPattern
     linear_configs.append(
         BackendPatternConfig((torch.nn.ReLU, torch.nn.Linear))
             .set_dtype_configs(dtype_configs)  # noqa: E131
-            .set_fuser_method(reverse_sequential_wrapper2(nni.LinearReLU))
+            .set_fuser_method(_reverse_sequential_wrapper2(nni.LinearReLU))
             .set_fused_module(nni.LinearReLU))
     # linear relu, linear module + functional relu
     linear_configs.append(
         BackendPatternConfig((torch.nn.functional.relu, torch.nn.Linear))
             .set_dtype_configs(dtype_configs)  # noqa: E131
-            .set_fuser_method(reverse_sequential_wrapper2(nni.LinearReLU))
+            .set_fuser_method(_reverse_sequential_wrapper2(nni.LinearReLU))
             .set_fused_module(nni.LinearReLU))
 
     # 2.2 linear module + relu, fused module configs
@@ -158,7 +158,7 @@ def _get_linear_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPattern
     linear_configs.append(
         BackendPatternConfig((nn.BatchNorm1d, nn.Linear))
             .set_dtype_configs(dtype_configs)  # noqa: E131
-            .set_fuser_method(reverse2(fuse_linear_bn))
+            .set_fuser_method(_reverse2(fuse_linear_bn))
             .set_fused_module(nni.LinearBn1d))
 
     # 3.2 linear bn fused
@@ -218,13 +218,13 @@ def _get_conv_configs(dtype_configs):
         conv_configs.append(
             BackendPatternConfig((torch.nn.ReLU, convs.root))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
-                .set_fuser_method(reverse_sequential_wrapper2(convs.fused_conv_relu))
+                .set_fuser_method(_reverse_sequential_wrapper2(convs.fused_conv_relu))
                 .set_fused_module(convs.fused_conv_relu))
         # conv relu fusion, conv module + functional relu
         conv_configs.append(
             BackendPatternConfig((F.relu, convs.root))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
-                .set_fuser_method(reverse_sequential_wrapper2(convs.fused_conv_relu))
+                .set_fuser_method(_reverse_sequential_wrapper2(convs.fused_conv_relu))
                 .set_fused_module(convs.fused_conv_relu))
         # 2.2 conv module + relu fused module configs
         # conv relu, fused module
@@ -273,20 +273,20 @@ def _get_conv_configs(dtype_configs):
         conv_configs.append(
             BackendPatternConfig((convs.bn, convs.root))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
-                .set_fuser_method(reverse2(fuse_conv_bn))
+                .set_fuser_method(_reverse2(fuse_conv_bn))
                 .set_fused_module(convs.fused_conv_bn))
         # conv + bn + relu module fusion
         conv_configs.append(
             BackendPatternConfig((nn.ReLU, (convs.bn, convs.root)))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
-                .set_fuser_method(reverse3(fuse_conv_bn_relu))
+                .set_fuser_method(_reverse3(fuse_conv_bn_relu))
                 .set_fused_module(convs.fused_conv_bn_relu))
         # conv + bn + relu functional fusion
         conv_configs.append(
             BackendPatternConfig((F.relu, (convs.bn, convs.root)))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
                 .set_root_module(convs.root)
-                .set_fuser_method(reverse3(fuse_conv_bn_relu))
+                .set_fuser_method(_reverse3(fuse_conv_bn_relu))
                 .set_fused_module(convs.fused_conv_bn_relu))
         # TODO: we can add fusion for torch.relu as well
 
@@ -330,7 +330,7 @@ def _get_conv_configs(dtype_configs):
         conv_configs.append(
             BackendPatternConfig((convs.bn, convs.transpose))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
-                .set_fuser_method(reverse2(fuse_convtranspose_bn))
+                .set_fuser_method(_reverse2(fuse_convtranspose_bn))
                 .set_root_module(convs.transpose)
                 .set_reference_quantized_module(convs.transpose_reference))
 
@@ -497,13 +497,13 @@ def _get_bn_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConf
         bn_configs.append(
             BackendPatternConfig((torch.nn.ReLU, bn))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
-                .set_fuser_method(reverse_sequential_wrapper2(fused_bn))
+                .set_fuser_method(_reverse_sequential_wrapper2(fused_bn))
                 .set_fused_module(fused_bn))
         # bn module + F.relu fusion config
         bn_configs.append(
             BackendPatternConfig((torch.nn.functional.relu, bn))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
-                .set_fuser_method(reverse_sequential_wrapper2(bn_to_fused_bn[bn]))
+                .set_fuser_method(_reverse_sequential_wrapper2(bn_to_fused_bn[bn]))
                 .set_fused_module(fused_bn))
         bn_configs.append(
             BackendPatternConfig(bn)
diff --git a/torch/ao/quantization/backend_config/backend_config.py b/torch/ao/quantization/backend_config/backend_config.py
index 2f491b1624048..1305c32a4ea8f 100644
--- a/torch/ao/quantization/backend_config/backend_config.py
+++ b/torch/ao/quantization/backend_config/backend_config.py
@@ -229,7 +229,7 @@ class BackendConfig:
 
         import torch
         from torch.ao.quantization.backend_config import BackendConfig, BackendPatternConfig, DTypeConfig, ObservationType
-        from torch.ao.quantization.fuser_method_mappings import reverse_sequential_wrapper2
+        from torch.ao.quantization.fuser_method_mappings import _reverse_sequential_wrapper2
 
         weighted_int8_dtype_config = DTypeConfig(
             input_dtype=torch.quint8,
@@ -248,7 +248,7 @@ class BackendConfig:
             .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
             .add_dtype_config(weighted_int8_dtype_config) \
             .set_fused_module(torch.nn.intrinsic.ConvReLU2d) \
-            .set_fuser_method(reverse_sequential_wrapper2(torch.nn.intrinsic.ConvReLU2d))
+            .set_fuser_method(_reverse_sequential_wrapper2(torch.nn.intrinsic.ConvReLU2d))
 
         backend_config = BackendConfig("my_backend") \
             .set_backend_pattern_config(linear_config) \
diff --git a/torch/ao/quantization/backend_config/executorch.py b/torch/ao/quantization/backend_config/executorch.py
index 4c0f2a48b552e..3c729327de760 100644
--- a/torch/ao/quantization/backend_config/executorch.py
+++ b/torch/ao/quantization/backend_config/executorch.py
@@ -7,7 +7,7 @@
 import torch.nn.quantized._reference as nnqr
 from .backend_config import BackendConfig, BackendPatternConfig, DTypeConfig, ObservationType
 from ._common_operator_config_utils import _Conv2dMetadata
-from ..fuser_method_mappings import reverse_sequential_wrapper2
+from ..fuser_method_mappings import _reverse_sequential_wrapper2
 
 
 __all__ = [
@@ -105,13 +105,13 @@ def _get_conv_configs() -> List[BackendPatternConfig]:
         conv_configs.append(
             BackendPatternConfig((torch.nn.ReLU, convs.root))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
-                .set_fuser_method(reverse_sequential_wrapper2(convs.fused_conv_relu))
+                .set_fuser_method(_reverse_sequential_wrapper2(convs.fused_conv_relu))
                 .set_fused_module(convs.fused_conv_relu))
         # conv module + functional relu
         conv_configs.append(
             BackendPatternConfig((F.relu, convs.root))
                 .set_dtype_configs(dtype_configs)  # noqa: E131
-                .set_fuser_method(reverse_sequential_wrapper2(convs.fused_conv_relu))
+                .set_fuser_method(_reverse_sequential_wrapper2(convs.fused_conv_relu))
                 .set_fused_module(convs.fused_conv_relu))
         # fused conv relu module
         conv_configs.append(
diff --git a/torch/ao/quantization/fuser_method_mappings.py b/torch/ao/quantization/fuser_method_mappings.py
index 2e39f87321d41..db4cc9a04d767 100644
--- a/torch/ao/quantization/fuser_method_mappings.py
+++ b/torch/ao/quantization/fuser_method_mappings.py
@@ -10,13 +10,7 @@
     "fuse_conv_bn_relu",
     "fuse_linear_bn",
     "fuse_convtranspose_bn",
-    "sequential_wrapper2",
     "get_fuser_method",
-    "reverse_sequential_wrapper2",
-    "reverse2",
-    "reverse3",
-    "DEFAULT_PATTERN_TO_FUSER_METHOD",
-    "get_valid_patterns",
     "get_fuser_method_new",
 ]
 
@@ -156,7 +150,7 @@ def fuse_convtranspose_bn(is_qat, convt, bn):
     else:
         return nn.utils.fusion.fuse_conv_bn_eval(convt, bn, transpose=True)
 
-def sequential_wrapper2(sequential):
+def _sequential_wrapper2(sequential):
     """ Given a sequential class for two modules, return a function that takes
     is_qat, and then two modules as argument, that ignores the is_qat flag
     and always returns the sequential that combines the two input modules
@@ -165,20 +159,20 @@ def fuser_method(is_qat, m1, m2):
         return sequential(m1, m2)
     return fuser_method
 
-DEFAULT_OP_LIST_TO_FUSER_METHOD: Dict[Tuple, Union[nn.Sequential, Callable]] = {
+_DEFAULT_OP_LIST_TO_FUSER_METHOD: Dict[Tuple, Union[nn.Sequential, Callable]] = {
     (nn.Conv1d, nn.BatchNorm1d): fuse_conv_bn,
     (nn.Conv1d, nn.BatchNorm1d, nn.ReLU): fuse_conv_bn_relu,
     (nn.Conv2d, nn.BatchNorm2d): fuse_conv_bn,
     (nn.Conv2d, nn.BatchNorm2d, nn.ReLU): fuse_conv_bn_relu,
     (nn.Conv3d, nn.BatchNorm3d): fuse_conv_bn,
     (nn.Conv3d, nn.BatchNorm3d, nn.ReLU): fuse_conv_bn_relu,
-    (nn.Conv1d, nn.ReLU): sequential_wrapper2(nni.ConvReLU1d),
-    (nn.Conv2d, nn.ReLU): sequential_wrapper2(nni.ConvReLU2d),
-    (nn.Conv3d, nn.ReLU): sequential_wrapper2(nni.ConvReLU3d),
+    (nn.Conv1d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU1d),
+    (nn.Conv2d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU2d),
+    (nn.Conv3d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU3d),
     (nn.Linear, nn.BatchNorm1d): fuse_linear_bn,
-    (nn.Linear, nn.ReLU): sequential_wrapper2(nni.LinearReLU),
-    (nn.BatchNorm2d, nn.ReLU): sequential_wrapper2(nni.BNReLU2d),
-    (nn.BatchNorm3d, nn.ReLU): sequential_wrapper2(nni.BNReLU3d),
+    (nn.Linear, nn.ReLU): _sequential_wrapper2(nni.LinearReLU),
+    (nn.BatchNorm2d, nn.ReLU): _sequential_wrapper2(nni.BNReLU2d),
+    (nn.BatchNorm3d, nn.ReLU): _sequential_wrapper2(nni.BNReLU3d),
     (nn.ConvTranspose1d, nn.BatchNorm1d): fuse_convtranspose_bn,
     (nn.ConvTranspose2d, nn.BatchNorm2d): fuse_convtranspose_bn,
     (nn.ConvTranspose3d, nn.BatchNorm3d): fuse_convtranspose_bn,
@@ -190,13 +184,13 @@ def get_fuser_method(op_list, additional_fuser_method_mapping=None):
     '''
     if additional_fuser_method_mapping is None:
         additional_fuser_method_mapping = {}
-    all_mappings = get_combined_dict(DEFAULT_OP_LIST_TO_FUSER_METHOD,
+    all_mappings = get_combined_dict(_DEFAULT_OP_LIST_TO_FUSER_METHOD,
                                      additional_fuser_method_mapping)
     fuser_method = all_mappings.get(op_list, None)
     assert fuser_method is not None, "did not find fuser method for: {} ".format(op_list)
     return fuser_method
 
-def reverse_sequential_wrapper2(sequential):
+def _reverse_sequential_wrapper2(sequential):
     """ Given a sequential class for two modules, return a function that takes
     is_qat, and then two modules as argument, that ignores the is_qat flag
     and always returns the sequential that combines the two input modules, with
@@ -206,37 +200,37 @@ def fuser_method(is_qat, m1, m2):
         return sequential(m2, m1)
     return fuser_method
 
-def reverse2(f):
+def _reverse2(f):
     def reversed(is_qat, x, y):
         return f(is_qat, y, x)
     return reversed
 
-def reverse3(f):
+def _reverse3(f):
     def reversed(is_qat, x, w):
         y, z = w
         return f(is_qat, z, y, x)
     return reversed
 
-DEFAULT_PATTERN_TO_FUSER_METHOD: Dict[Pattern, Union[nn.Sequential, Callable]] = {
-    (nn.BatchNorm1d, nn.Conv1d): reverse2(fuse_conv_bn),
-    (nn.ReLU, (nn.BatchNorm1d, nn.Conv1d)): reverse3(fuse_conv_bn_relu),
-    (nn.BatchNorm2d, nn.Conv2d): reverse2(fuse_conv_bn),
-    (nn.ReLU, (nn.BatchNorm2d, nn.Conv2d)): reverse3(fuse_conv_bn_relu),
-    (nn.BatchNorm3d, nn.Conv3d): reverse2(fuse_conv_bn),
-    (nn.ReLU, (nn.BatchNorm3d, nn.Conv3d)): reverse3(fuse_conv_bn_relu),
-    (nn.ReLU, nn.Conv1d): reverse_sequential_wrapper2(nni.ConvReLU1d),
-    (nn.ReLU, nn.Conv2d): reverse_sequential_wrapper2(nni.ConvReLU2d),
-    (nn.ReLU, nn.Conv3d): reverse_sequential_wrapper2(nni.ConvReLU3d),
-    (nn.BatchNorm1d, nn.Linear): reverse2(fuse_linear_bn),
-    (nn.ReLU, nn.Linear): reverse_sequential_wrapper2(nni.LinearReLU),
-    (nn.ReLU, nn.BatchNorm2d): reverse_sequential_wrapper2(nni.BNReLU2d),
-    (nn.ReLU, nn.BatchNorm3d): reverse_sequential_wrapper2(nni.BNReLU3d),
-    (nn.BatchNorm1d, nn.ConvTranspose1d): reverse2(fuse_convtranspose_bn),
-    (nn.BatchNorm2d, nn.ConvTranspose2d): reverse2(fuse_convtranspose_bn),
-    (nn.BatchNorm3d, nn.ConvTranspose3d): reverse2(fuse_convtranspose_bn),
+_DEFAULT_PATTERN_TO_FUSER_METHOD: Dict[Pattern, Union[nn.Sequential, Callable]] = {
+    (nn.BatchNorm1d, nn.Conv1d): _reverse2(fuse_conv_bn),
+    (nn.ReLU, (nn.BatchNorm1d, nn.Conv1d)): _reverse3(fuse_conv_bn_relu),
+    (nn.BatchNorm2d, nn.Conv2d): _reverse2(fuse_conv_bn),
+    (nn.ReLU, (nn.BatchNorm2d, nn.Conv2d)): _reverse3(fuse_conv_bn_relu),
+    (nn.BatchNorm3d, nn.Conv3d): _reverse2(fuse_conv_bn),
+    (nn.ReLU, (nn.BatchNorm3d, nn.Conv3d)): _reverse3(fuse_conv_bn_relu),
+    (nn.ReLU, nn.Conv1d): _reverse_sequential_wrapper2(nni.ConvReLU1d),
+    (nn.ReLU, nn.Conv2d): _reverse_sequential_wrapper2(nni.ConvReLU2d),
+    (nn.ReLU, nn.Conv3d): _reverse_sequential_wrapper2(nni.ConvReLU3d),
+    (nn.BatchNorm1d, nn.Linear): _reverse2(fuse_linear_bn),
+    (nn.ReLU, nn.Linear): _reverse_sequential_wrapper2(nni.LinearReLU),
+    (nn.ReLU, nn.BatchNorm2d): _reverse_sequential_wrapper2(nni.BNReLU2d),
+    (nn.ReLU, nn.BatchNorm3d): _reverse_sequential_wrapper2(nni.BNReLU3d),
+    (nn.BatchNorm1d, nn.ConvTranspose1d): _reverse2(fuse_convtranspose_bn),
+    (nn.BatchNorm2d, nn.ConvTranspose2d): _reverse2(fuse_convtranspose_bn),
+    (nn.BatchNorm3d, nn.ConvTranspose3d): _reverse2(fuse_convtranspose_bn),
 }
 
-def get_valid_patterns(op_pattern):
+def _get_valid_patterns(op_pattern):
     """
     Returns a list of valid patterns generated from the op_pattern,
     since MatchAllNode can match all types of nodes,
@@ -261,7 +255,7 @@ def get_valid_patterns(op_pattern):
     if isinstance(op_pattern, (tuple, list)):
         sub_combs = []
         for sub_pattern in op_pattern:
-            sub_combs.append(get_valid_patterns(sub_pattern))
+            sub_combs.append(_get_valid_patterns(sub_pattern))
         result = list(itertools.product(*sub_combs))
     else:
         result = [op_pattern, MatchAllNode]
@@ -274,9 +268,9 @@ def get_fuser_method_new(
     Would like to implement this first and have a separate PR for deprecation
     """
     if fuser_method_mapping is None:
-        fuser_method_mapping = DEFAULT_PATTERN_TO_FUSER_METHOD
+        fuser_method_mapping = _DEFAULT_PATTERN_TO_FUSER_METHOD
 
-    op_patterns = get_valid_patterns(op_pattern)
+    op_patterns = _get_valid_patterns(op_pattern)
     fuser_method = None
     for op_pattern in op_patterns:
         fuser_method = fuser_method_mapping.get(op_pattern, None)
diff --git a/torch/ao/quantization/fx/README.md b/torch/ao/quantization/fx/README.md
index cba11e9d36413..622acd30956cd 100644
--- a/torch/ao/quantization/fx/README.md
+++ b/torch/ao/quantization/fx/README.md
@@ -81,7 +81,7 @@ What we did in this example are:
 
 ```
 BackendPatternConfig((torch.nn.ReLU, torch.nn.Linear))
-    .set_fuser_method(reverse_sequential_wrapper2(nni.LinearReLU))
+    .set_fuser_method(_reverse_sequential_wrapper2(nni.LinearReLU))
     ._set_root_node_getter(my_root_node_getter)
     ._set_extra_inputs_getter(my_extra_inputs_getter)
 ```
diff --git a/torch/quantization/fuser_method_mappings.py b/torch/quantization/fuser_method_mappings.py
index 50520b3f79674..22f4e638ea698 100644
--- a/torch/quantization/fuser_method_mappings.py
+++ b/torch/quantization/fuser_method_mappings.py
@@ -10,6 +10,6 @@
     fuse_conv_bn,
     fuse_conv_bn_relu,
     fuse_linear_bn,
-    DEFAULT_OP_LIST_TO_FUSER_METHOD,
+    _DEFAULT_OP_LIST_TO_FUSER_METHOD,
     get_fuser_method,
 )

From 08f383c587d4e31ebb7a0bee4168dd06c37d45ab Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Thu, 10 Nov 2022 21:38:04 +0000
Subject: [PATCH 0763/1922] Have kernel names include fused ops (#88624)

- Propagates origin fx nodes through inlining during lowering
- Concatenates op names into kernel name
- Adds config to cap the number of ops in the kernel name so they don't get too long

Caveats:
- The ordering in the name may not match the order that the ops are executed in the kernel

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88624
Approved by: https://github.com/anijain2305, https://github.com/jansel
---
 test/inductor/test_torchinductor.py        | 15 +++---
 torch/_inductor/codegen/cpp.py             |  2 +-
 torch/_inductor/codegen/triton.py          |  7 ++-
 torch/_inductor/codegen/triton_template.py |  2 +-
 torch/_inductor/codegen/wrapper.py         |  4 +-
 torch/_inductor/config.py                  |  4 ++
 torch/_inductor/graph.py                   | 59 +++++++++++-----------
 torch/_inductor/ir.py                      |  1 +
 torch/_inductor/utils.py                   | 27 ++++++++++
 9 files changed, 79 insertions(+), 42 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index db6c5dfc2bd15..229f0fa83dd4e 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4893,14 +4893,13 @@ def get_kernels(self, fn, args) -> typing.List[CachingAutotuner]:
             with V.set_graph_handler(graph), V.set_debug_handler(DebugContext()):
                 graph.run(*(cxt.example_args))
                 mod = graph.compile_to_module()
-                i = 0
-                while True:
-                    attribute = f"kernel{i}"
-                    if not hasattr(mod, attribute):
-                        break
-                    else:
-                        kernels.append(getattr(mod, attribute))
-                        i = i + 1
+
+                for val in mod.__dict__.values():
+                    if isinstance(
+                        val, torch._inductor.triton_ops.autotune.CachingAutotuner
+                    ):
+                        kernels.append(val)
+
             return kernels
 
         def test_divisibile_by_16_covers_numel_args(self):
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 90ae4b44d5795..65a9335d6cbfc 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -1291,7 +1291,7 @@ def codegen_define_and_call(self, wrapper):
         codecache_def.splice(code)
         codecache_def.writeline("''')")
 
-        kernel_name = wrapper.next_kernel_name()
+        kernel_name = "kernel_cpp_" + wrapper.next_kernel_suffix()
         codecache_str = codecache_def.getvalue()
         # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does
         # not use BracesBuffer, so we have no good indicator of a C++ buffer atm.
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 3471d23a72130..88a0ad4977be4 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -16,6 +16,7 @@
 from ..ir import ReductionHint
 from ..utils import (
     free_symbol_startswith,
+    get_fused_kernel_name,
     instance_descriptor,
     sympy_product,
     sympy_subs,
@@ -1281,7 +1282,11 @@ def codegen_node_schedule(self, node_schedule, numel, reduction_numel):
         if src_code in wrapper.kernels:
             kernel_name = wrapper.kernels[src_code]
         else:
-            kernel_name = wrapper.next_kernel_name()
+            kernel_name = (
+                "triton_"
+                + get_fused_kernel_name(node_schedule)
+                + wrapper.next_kernel_suffix()
+            )
             wrapper.kernels[src_code] = kernel_name
             subs_name = kernel_name if config.triton.ordered_kernel_names else "kernel"
             src_code = src_code.replace("KERNEL_NAME", subs_name)
diff --git a/torch/_inductor/codegen/triton_template.py b/torch/_inductor/codegen/triton_template.py
index 4d86feeccec86..0de771ff65749 100644
--- a/torch/_inductor/codegen/triton_template.py
+++ b/torch/_inductor/codegen/triton_template.py
@@ -335,7 +335,7 @@ def template_codegen(scheduler, scheduler_node, epilogue):
                 break
         assert kernel_buf_replace_name is not None
 
-    kernel_name = wrapper.next_kernel_name()
+    kernel_name = "triton_template_" + wrapper.next_kernel_suffix()
     # code gen kernel
     wrapper.header.splice(
         kernel.codegen_kernel(
diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py
index 7efc1cf1aa8c6..cf8fb46c84bdc 100644
--- a/torch/_inductor/codegen/wrapper.py
+++ b/torch/_inductor/codegen/wrapper.py
@@ -255,8 +255,8 @@ def write_get_cuda_stream(self, index):
         self.writeline(f"{name} = get_cuda_stream({index})")
         return name
 
-    def next_kernel_name(self):
-        return f"kernel{next(self._names_iter)}"
+    def next_kernel_suffix(self):
+        return f"{next(self._names_iter)}"
 
     def codegen_allocation(self, buffer):
         name = buffer.get_name()
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 910e6d20b4d6f..87e2793782be8 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -61,6 +61,10 @@
 
 compile_threads = min(32, os.cpu_count()) if sys.platform != "win32" else 1
 
+# If kernel is fused, the name is generated from the origin node op names
+# for larger kernels limit this
+kernel_name_max_ops = 10
+
 # How to import torchinductor, either torchinductor or torch.inductor
 inductor_import = __name__.replace(".config", "")
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index adf8ed9614211..f69a891fca7ba 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -21,7 +21,7 @@
 from .ir import Constant, FixedLayout, InputBuffer, Pointwise, Reduction, TensorBox
 from .lowering import lowerings, make_fallback, needs_realized_inputs
 from .sizevars import SizeVarAllocator
-from .utils import dynamo_utils
+from .utils import dynamo_utils, gather_origins
 from .virtualized import V
 
 log = logging.getLogger(__name__)
@@ -212,34 +212,35 @@ def placeholder(self, target, args, kwargs):
         return tensor
 
     def call_function(self, target, args, kwargs):
-        if target is operator.getitem and isinstance(args[0], (list, tuple)):
-            return super().call_function(target, args, kwargs)
-
-        if target not in lowerings:
-            if config.implicit_fallbacks:
-                error = (
-                    MissingOperatorWithDecomp
-                    if get_decompositions([target])
-                    else MissingOperatorWithoutDecomp
-                )
-                log.warning(
-                    "Creating implicit fallback for:\n%s",
-                    error.operator_str(target, args, kwargs),
-                )
-                make_fallback(target)
-            elif get_decompositions([target]):
-                # There isn't a good way to dynamically patch this in
-                # since AOT Autograd already ran.  The error message tells
-                # the user how to fix it.
-                raise MissingOperatorWithDecomp(target, args, kwargs)
-            else:
-                raise MissingOperatorWithoutDecomp(target, args, kwargs)
-
-        try:
-            out = lowerings[target](*args, **kwargs)
-            return out
-        except Exception as e:
-            raise LoweringException(e, target, args, kwargs) from e
+        with ir.IRNode.current_origins(gather_origins(args, kwargs)):
+            if target is operator.getitem and isinstance(args[0], (list, tuple)):
+                return super().call_function(target, args, kwargs)
+
+            if target not in lowerings:
+                if config.implicit_fallbacks:
+                    error = (
+                        MissingOperatorWithDecomp
+                        if get_decompositions([target])
+                        else MissingOperatorWithoutDecomp
+                    )
+                    log.warning(
+                        "Creating implicit fallback for:\n%s",
+                        error.operator_str(target, args, kwargs),
+                    )
+                    make_fallback(target)
+                elif get_decompositions([target]):
+                    # There isn't a good way to dynamically patch this in
+                    # since AOT Autograd already ran.  The error message tells
+                    # the user how to fix it.
+                    raise MissingOperatorWithDecomp(target, args, kwargs)
+                else:
+                    raise MissingOperatorWithoutDecomp(target, args, kwargs)
+
+            try:
+                out = lowerings[target](*args, **kwargs)
+                return out
+            except Exception as e:
+                raise LoweringException(e, target, args, kwargs) from e
 
     def get_attr(self, target, args, kwargs):
         # this is a constant
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 0353bcc8b0bec..924ec7aaa7b2e 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3626,6 +3626,7 @@ def realize(self):
             data=self.data,
         )
         self.data.name = V.graph.register_buffer(self.data)
+        self.data.origins = self.origins
         return self.data.name
 
     def realize_hint(self):
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index 829fbd2897d5f..5bfda50dd6f7e 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -178,6 +178,33 @@ def wrapper(self):
     return wrapper
 
 
+def get_fused_kernel_name(node_schedule):
+    return "_".join(
+        ["fused"]
+        + [
+            str(origin.name)
+            for origin in functools.reduce(
+                operator.or_,
+                [node.node.origins for node in node_schedule if hasattr(node, "node")],
+            )
+            if origin.op == "call_function"
+        ][0 : config.kernel_name_max_ops]
+    )
+
+
+def gather_origins(args, kwargs):
+    import itertools
+
+    from .ir import ComputedBuffer, IRNode
+
+    def is_unrealized_node(n):
+        return isinstance(n, IRNode) and not isinstance(n, ComputedBuffer)
+
+    kwarg_origins = [val.origins for val in kwargs.values() if is_unrealized_node(val)]
+    arg_origins = [arg.origins for arg in args if is_unrealized_node(arg)]
+    return set(itertools.chain(*arg_origins, *kwarg_origins))
+
+
 def sympy_str(expr: sympy.Expr):
     """
     Normal sympy str is very slow, this is a lot faster.  The result are

From ecc520b1ffa99c16abe9f7dcee630c641aacc3a9 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 8 Nov 2022 07:59:11 -0800
Subject: [PATCH 0764/1922] [ao] qconfig_mapping_utils.py fixing public v
 private (#87517)

Summary: made _get_object_type_qconfig, _get_module_name_regex_qconfig,
_get_module_name_qconfig, _maybe_adjust_qconfig_for_module_type_or_name,
_get_flattened_qconfig_dict _update_qconfig_for_qat private

Test Plan: python test/test_public_bindings.py

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D40709279](https://our.internmc.facebook.com/intern/diff/D40709279)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87517
Approved by: https://github.com/jcaip
---
 test/quantization/fx/test_quantize_fx.py      | 28 ++++++++---------
 torch/ao/quantization/fx/convert.py           |  4 +--
 torch/ao/quantization/fx/prepare.py           |  8 ++---
 .../quantization/fx/qconfig_mapping_utils.py  | 16 +++++-----
 .../ao/quantization/qconfig_mapping_utils.py  | 31 +++++++------------
 5 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 236a5587d859a..8c75658a04e1b 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -99,9 +99,9 @@
 )
 
 from torch.ao.quantization.qconfig_mapping_utils import (
-    get_object_type_qconfig,
-    get_module_name_qconfig,
-    get_module_name_regex_qconfig,
+    _get_object_type_qconfig,
+    _get_module_name_qconfig,
+    _get_module_name_regex_qconfig,
 )
 
 from torch.ao.quantization.fx.pattern_utils import (
@@ -1876,9 +1876,9 @@ def test_qconfig_mapping_set_object_type(self):
         qconfig_mapping.set_object_type(torch.nn.Linear, qconfig3)
         self.assertEqual(qconfig_mapping.object_type_qconfigs[torch.nn.Linear], qconfig3)
         self.assertEqual(qconfig_mapping.object_type_qconfigs[torch.nn.ReLU], qconfig2)
-        self.assertEqual(get_object_type_qconfig(qconfig_mapping, torch.nn.Linear, None), qconfig3)
-        self.assertEqual(get_object_type_qconfig(qconfig_mapping, torch.nn.ReLU, None), qconfig2)
-        self.assertEqual(get_object_type_qconfig(qconfig_mapping, "nomatch", None), None)
+        self.assertEqual(_get_object_type_qconfig(qconfig_mapping, torch.nn.Linear, None), qconfig3)
+        self.assertEqual(_get_object_type_qconfig(qconfig_mapping, torch.nn.ReLU, None), qconfig2)
+        self.assertEqual(_get_object_type_qconfig(qconfig_mapping, "nomatch", None), None)
 
     def test_qconfig_mapping_set_module_name_regex(self):
         qconfig1 = get_default_qconfig()
@@ -1898,11 +1898,11 @@ def test_qconfig_mapping_set_module_name_regex(self):
         qconfig_mapping.set_module_name_regex("foo.*bar", qconfig3)
         self.assertEqual(qconfig_mapping.module_name_regex_qconfigs["foo.*bar"], qconfig3)
         self.assertEqual(qconfig_mapping.module_name_regex_qconfigs["foo.*"], qconfig2)
-        self.assertEqual(get_module_name_regex_qconfig(qconfig_mapping, "foo123bar", None), qconfig3)
-        self.assertEqual(get_module_name_regex_qconfig(qconfig_mapping, "foobar", None), qconfig3)
-        self.assertEqual(get_module_name_regex_qconfig(qconfig_mapping, "foobaz", None), qconfig2)
-        self.assertEqual(get_module_name_regex_qconfig(qconfig_mapping, "foo", None), qconfig2)
-        self.assertEqual(get_module_name_regex_qconfig(qconfig_mapping, "nomatch", None), None)
+        self.assertEqual(_get_module_name_regex_qconfig(qconfig_mapping, "foo123bar", None), qconfig3)
+        self.assertEqual(_get_module_name_regex_qconfig(qconfig_mapping, "foobar", None), qconfig3)
+        self.assertEqual(_get_module_name_regex_qconfig(qconfig_mapping, "foobaz", None), qconfig2)
+        self.assertEqual(_get_module_name_regex_qconfig(qconfig_mapping, "foo", None), qconfig2)
+        self.assertEqual(_get_module_name_regex_qconfig(qconfig_mapping, "nomatch", None), None)
 
     def test_qconfig_mapping_set_module_name(self):
         qconfig1 = get_default_qconfig()
@@ -1922,9 +1922,9 @@ def test_qconfig_mapping_set_module_name(self):
         qconfig_mapping.set_module_name("mod1", qconfig3)
         self.assertEqual(qconfig_mapping.module_name_qconfigs["mod1"], qconfig3)
         self.assertEqual(qconfig_mapping.module_name_qconfigs["mod2"], qconfig2)
-        self.assertEqual(get_module_name_qconfig(qconfig_mapping, "mod1", None), qconfig3)
-        self.assertEqual(get_module_name_qconfig(qconfig_mapping, "mod2", None), qconfig2)
-        self.assertEqual(get_module_name_qconfig(qconfig_mapping, "nomatch", None), None)
+        self.assertEqual(_get_module_name_qconfig(qconfig_mapping, "mod1", None), qconfig3)
+        self.assertEqual(_get_module_name_qconfig(qconfig_mapping, "mod2", None), qconfig2)
+        self.assertEqual(_get_module_name_qconfig(qconfig_mapping, "nomatch", None), None)
 
     def test_qconfig_mapping_set_module_name_object_type_order(self):
         qconfig1 = get_default_qconfig()
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index 74eb8f1ca542b..b5e9cf3bbcb34 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -24,7 +24,7 @@
 )
 from ..qconfig_mapping import QConfigMapping
 from ..qconfig_mapping_utils import (
-    update_qconfig_for_qat,
+    _update_qconfig_for_qat,
 )
 from .qconfig_mapping_utils import (
     generate_node_name_to_qconfig,
@@ -563,7 +563,7 @@ def convert(
         modules_copy = copy.deepcopy(modules)
 
         if model._is_qat:
-            update_qconfig_for_qat(qconfig_mapping, {})
+            _update_qconfig_for_qat(qconfig_mapping, {})
         update_qconfig_for_fusion(model, qconfig_mapping)
 
         compare_prepare_convert_qconfig_mappings(prepare_qconfig_mapping, qconfig_mapping)  # type: ignore[arg-type]
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 160b80a8807f8..281bd960ed7b9 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -28,8 +28,8 @@
     QConfigMapping,
 )
 from ..qconfig_mapping_utils import (
-    get_flattened_qconfig_dict,
-    update_qconfig_for_qat,
+    _get_flattened_qconfig_dict,
+    _update_qconfig_for_qat,
 )
 from .qconfig_mapping_utils import (
     generate_node_name_to_qconfig,
@@ -1587,14 +1587,14 @@ def prepare(
 
     update_qconfig_for_fusion(model, qconfig_mapping)
     update_qconfig_for_fusion(model, _equalization_config)
-    flattened_qconfig_dict = get_flattened_qconfig_dict(qconfig_mapping)
+    flattened_qconfig_dict = _get_flattened_qconfig_dict(qconfig_mapping)
     # TODO: support regex as well
     propagate_qconfig_(model, flattened_qconfig_dict, prepare_custom_config.to_dict())
 
     if is_qat:
         module_to_qat_module = get_module_to_qat_module(backend_config)
         qat_swap_modules(model, module_to_qat_module)
-        update_qconfig_for_qat(qconfig_mapping, {})
+        _update_qconfig_for_qat(qconfig_mapping, {})
 
     # mapping from fully qualified module name to module instance
     # for example,
diff --git a/torch/ao/quantization/fx/qconfig_mapping_utils.py b/torch/ao/quantization/fx/qconfig_mapping_utils.py
index 2abfaf826c425..66dffd50cd008 100644
--- a/torch/ao/quantization/fx/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -29,8 +29,8 @@
     QConfigMapping,
 )
 from ..qconfig_mapping_utils import (
-    get_object_type_qconfig,
-    maybe_adjust_qconfig_for_module_type_or_name,
+    _get_object_type_qconfig,
+    _maybe_adjust_qconfig_for_module_type_or_name,
 )
 
 
@@ -121,17 +121,17 @@ def generate_node_name_to_qconfig(
         qconfig = None
         if node.op == "get_attr":
             module_name, _ = _parent_name(node.target)
-            qconfig = maybe_adjust_qconfig_for_module_type_or_name(
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
                 qconfig_mapping, type(modules[module_name]), module_name, global_qconfig)
             qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
         elif node.op == "call_function":
             # precedence: module_name_qconfig
             # > function_qconfig > global_qconfig
             # module_name takes precedence over function qconfig
-            function_qconfig = get_object_type_qconfig(
+            function_qconfig = _get_object_type_qconfig(
                 qconfig_mapping, node.target, global_qconfig)
             module_path, module_type = node_name_to_scope[node.name]
-            qconfig = maybe_adjust_qconfig_for_module_type_or_name(
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
                 qconfig_mapping, module_type, module_path, function_qconfig)
 
             cur_object_type_idx = \
@@ -146,11 +146,11 @@ def generate_node_name_to_qconfig(
             # first use node.target (string) to get the qconfig
             # this is to support configs like
             # "object_type": [("reshpe", qconfig)]
-            qconfig = maybe_adjust_qconfig_for_module_type_or_name(
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
                 qconfig_mapping, node.target, module_path, global_qconfig)
             # if there is no special config for the method, we'll fall back to the
             # config for the module that contains the call_method node
-            qconfig = maybe_adjust_qconfig_for_module_type_or_name(
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
                 qconfig_mapping, module_type, module_path, qconfig)
             # currently call_method does not support modifying qconfig
             # by order, we can add this later if it is needed.
@@ -160,7 +160,7 @@ def generate_node_name_to_qconfig(
             # if the node is an observer, just continue - don't add it to the qconfig_map
             if is_activation_post_process(modules[node.target]):
                 continue
-            qconfig = maybe_adjust_qconfig_for_module_type_or_name(
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
                 qconfig_mapping, type(modules[node.target]), node.target, global_qconfig)
 
             module_path, module_type = node_name_to_scope[node.name]
diff --git a/torch/ao/quantization/qconfig_mapping_utils.py b/torch/ao/quantization/qconfig_mapping_utils.py
index 09bce4fbebb09..0109729e580c8 100644
--- a/torch/ao/quantization/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/qconfig_mapping_utils.py
@@ -1,5 +1,5 @@
 import re
-from typing import Dict, Callable, Union
+from typing import Dict, Callable, Union, List
 
 from .utils import (
     get_combined_dict,
@@ -12,25 +12,18 @@
 from .qconfig_mapping import QConfigMapping
 
 
-# TODO: revisit this list. Many helper methods shouldn't be public
-__all__ = [
-    "get_flattened_qconfig_dict",
-    "get_object_type_qconfig",
-    "get_module_name_qconfig",
-    "get_module_name_regex_qconfig",
-    "maybe_adjust_qconfig_for_module_type_or_name",
-    "update_qconfig_for_qat",
+__all__: List[str] = [
 ]
 
 
-def get_object_type_qconfig(
+def _get_object_type_qconfig(
         qconfig_mapping: QConfigMapping,
         object_type: Union[Callable, str],
         fallback_qconfig: QConfigAny) -> QConfigAny:
     return qconfig_mapping.object_type_qconfigs.get(object_type, fallback_qconfig)
 
 
-def get_module_name_regex_qconfig(qconfig_mapping, module_name, fallback_qconfig):
+def _get_module_name_regex_qconfig(qconfig_mapping, module_name, fallback_qconfig):
     for regex_pattern, qconfig in qconfig_mapping.module_name_regex_qconfigs.items():
         if re.match(regex_pattern, module_name):
             # first match wins
@@ -38,7 +31,7 @@ def get_module_name_regex_qconfig(qconfig_mapping, module_name, fallback_qconfig
     return fallback_qconfig
 
 
-def get_module_name_qconfig(qconfig_mapping, module_name, fallback_qconfig):
+def _get_module_name_qconfig(qconfig_mapping, module_name, fallback_qconfig):
     if module_name == '':
         # module name qconfig not found
         return fallback_qconfig
@@ -46,23 +39,23 @@ def get_module_name_qconfig(qconfig_mapping, module_name, fallback_qconfig):
         return qconfig_mapping.module_name_qconfigs[module_name]
     else:
         parent, _ = _parent_name(module_name)
-        return get_module_name_qconfig(qconfig_mapping, parent, fallback_qconfig)
+        return _get_module_name_qconfig(qconfig_mapping, parent, fallback_qconfig)
 
 
-def maybe_adjust_qconfig_for_module_type_or_name(qconfig_mapping, module_type, module_name, global_qconfig):
+def _maybe_adjust_qconfig_for_module_type_or_name(qconfig_mapping, module_type, module_name, global_qconfig):
     # get qconfig for module_name,
     # fallback to module_name_regex_qconfig, module_type_qconfig,
     # global_qconfig if necessary
-    module_type_qconfig = get_object_type_qconfig(
+    module_type_qconfig = _get_object_type_qconfig(
         qconfig_mapping, module_type, global_qconfig)
-    module_name_regex_qconfig = get_module_name_regex_qconfig(
+    module_name_regex_qconfig = _get_module_name_regex_qconfig(
         qconfig_mapping, module_name, module_type_qconfig)
-    module_name_qconfig = get_module_name_qconfig(
+    module_name_qconfig = _get_module_name_qconfig(
         qconfig_mapping, module_name, module_name_regex_qconfig)
     return module_name_qconfig
 
 
-def get_flattened_qconfig_dict(qconfig_mapping: QConfigMapping) -> Dict[Union[Callable, str], QConfigAny]:
+def _get_flattened_qconfig_dict(qconfig_mapping: QConfigMapping) -> Dict[Union[Callable, str], QConfigAny]:
     """ flatten the global, object_type and module_name qconfig
     to the same qconfig_dict so that it can be used by
     propagate_qconfig_ function.
@@ -94,7 +87,7 @@ def get_flattened_qconfig_dict(qconfig_mapping: QConfigMapping) -> Dict[Union[Ca
     return flattened
 
 
-def update_qconfig_for_qat(
+def _update_qconfig_for_qat(
         qconfig_mapping: QConfigMapping,
         additional_qat_module_mapping: Dict[Callable, Callable]):
     """

From aaa028ac61a6c1f658cfc59a4e0059a0408e73b4 Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Tue, 8 Nov 2022 10:22:31 -0800
Subject: [PATCH 0765/1922] [ONNX] Improve diagnostic message formatting
 (#87830)

* Reflect required arguments in method signature for each diagnostic rule. Previous design accepts arbitrary sized tuple which is hard to use and prone to error.
     ![image](https://user-images.githubusercontent.com/9376104/200381982-d1e905f0-a159-4ef5-8d2e-070524e8f5bf.png)
* Removed `DiagnosticTool` to keep things compact.
* Removed specifying supported rule set for tool(context) and checking if rule of reported diagnostic falls inside the set, to keep things compact.
* Initial overview markdown file.
* Change `full_description` definition. Now `text` field should not be empty. And its markdown should be stored in `markdown` field.
* Change `message_default_template` to allow only named fields (excluding numeric fields). `field_name` provides clarity on what argument is expected.
* Added `diagnose` api to `torch.onnx._internal.diagnostics`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87830
Approved by: https://github.com/abock
---
 test/onnx/internal/test_diagnostics.py        |  38 +---
 tools/onnx/gen_diagnostics.py                 |  83 +++++++--
 tools/onnx/templates/rules.py.in              |   6 +-
 .../jit/passes/onnx/shape_type_inference.cpp  |   2 +-
 torch/csrc/onnx/diagnostics/diagnostics.h     |  18 +-
 torch/onnx/_internal/diagnostics/OVERVIEW.md  |  83 +++++++++
 torch/onnx/_internal/diagnostics/__init__.py  |   4 +-
 .../onnx/_internal/diagnostics/_diagnostic.py |  53 +++---
 torch/onnx/_internal/diagnostics/_rules.py    |  88 +++++++--
 .../_internal/diagnostics/infra/__init__.py   |   2 -
 .../_internal/diagnostics/infra/_infra.py     | 174 ++++++++++--------
 .../_internal/diagnostics/infra/engine.py     |  51 ++---
 torch/onnx/_internal/diagnostics/rules.yaml   |  21 ++-
 torch/onnx/errors.py                          |  44 ++---
 14 files changed, 414 insertions(+), 253 deletions(-)
 create mode 100644 torch/onnx/_internal/diagnostics/OVERVIEW.md

diff --git a/test/onnx/internal/test_diagnostics.py b/test/onnx/internal/test_diagnostics.py
index fbe79216d0879..fbd888329a50e 100644
--- a/test/onnx/internal/test_diagnostics.py
+++ b/test/onnx/internal/test_diagnostics.py
@@ -176,7 +176,7 @@ def test_diagnostics_engine_records_diagnosis_reported_outside_of_export(
             sample_rule,
             sample_level,
         ):
-            diagnostics.context.diagnose(sample_rule, sample_level, ("foo",))
+            diagnostics.context.diagnose(sample_rule, sample_level)
 
 
 @dataclasses.dataclass
@@ -196,31 +196,17 @@ class TestDiagnosticsInfra(common_utils.TestCase):
     def setUp(self):
         self.engine = infra.DiagnosticEngine()
         self.rules = _RuleCollectionForTest()
-        self.diagnostic_tool = infra.DiagnosticTool("test_tool", "1.0.0", self.rules)
         with contextlib.ExitStack() as stack:
             self.context = stack.enter_context(
-                self.engine.create_diagnostic_context(self.diagnostic_tool)
+                self.engine.create_diagnostic_context("test", "1.0.0")
             )
             self.addCleanup(stack.pop_all().close)
         return super().setUp()
 
-    def test_diagnose_raises_value_error_when_rule_not_supported(self):
-        rule_id = "0"
-        rule_name = "nonexistent-rule"
-        with self.assertRaisesRegex(
-            ValueError,
-            f"Rule '{rule_id}:{rule_name}' is not supported by this tool "
-            f"'{self.diagnostic_tool.name} {self.diagnostic_tool.version}'.",
-        ):
-            self.context.diagnose(
-                infra.Rule(id=rule_id, name=rule_name, message_default_template=""),
-                infra.Level.WARNING,
-            )
-
     def test_diagnostics_engine_records_diagnosis_reported_in_nested_contexts(
         self,
     ):
-        with self.engine.create_diagnostic_context(self.diagnostic_tool) as context:
+        with self.engine.create_diagnostic_context("inner_test", "1.0.1") as context:
             context.diagnose(self.rules.rule_without_message_args, infra.Level.WARNING)
             sarif_log = self.engine.sarif_log()
             self.assertEqual(len(sarif_log.runs), 2)
@@ -250,9 +236,7 @@ def test_diagnostics_engine_records_diagnosis_with_custom_rules(self):
         )
 
         with self.engine.create_diagnostic_context(
-            tool=infra.DiagnosticTool(
-                name="custom_tool", version="1.0", rules=custom_rules
-            )
+            "custom_rules", "1.0"
         ) as diagnostic_context:
             with assert_all_diagnostics(
                 self,
@@ -269,20 +253,6 @@ def test_diagnostics_engine_records_diagnosis_with_custom_rules(self):
                     custom_rules.custom_rule_2, infra.Level.ERROR  # type: ignore[attr-defined]
                 )
 
-    def test_diagnostic_tool_raises_type_error_when_diagnostic_type_is_invalid(
-        self,
-    ):
-        with self.assertRaisesRegex(
-            TypeError,
-            "Expected diagnostic_type to be a subclass of Diagnostic, but got",
-        ):
-            _ = infra.DiagnosticTool(
-                "custom_tool",
-                "1.0",
-                self.rules,
-                diagnostic_type=int,
-            )
-
 
 if __name__ == "__main__":
     common_utils.run_tests()
diff --git a/tools/onnx/gen_diagnostics.py b/tools/onnx/gen_diagnostics.py
index ba6fd43bee292..92960024e048d 100644
--- a/tools/onnx/gen_diagnostics.py
+++ b/tools/onnx/gen_diagnostics.py
@@ -14,6 +14,7 @@
 
 import argparse
 import os
+import string
 import subprocess
 import textwrap
 from typing import Any, Mapping, Sequence
@@ -30,19 +31,37 @@
 Diagnostic rules for PyTorch ONNX export.
 """
 
-_PY_RULE_TEMPLATE = """\
-{0}: infra.Rule = dataclasses.field(
-    default=infra.Rule.from_sarif(**{1}),
+_PY_RULE_CLASS_COMMENT = """\
+GENERATED CODE - DO NOT EDIT DIRECTLY
+The purpose of generating a class for each rule is to override the `format_message`
+method to provide more details in the signature about the format arguments.
+"""
+
+_PY_RULE_CLASS_TEMPLATE = """\
+class _{pascal_case_name}(infra.Rule):
+    \"\"\"{short_description}\"\"\"
+    def format_message(self, {message_arguments}) -> str:  # type: ignore[override]
+        \"\"\"Returns the formatted default message of this Rule.
+
+        Message template: {message_template}
+        \"\"\"
+        return self.message_default_template.format({message_arguments_assigned})
+
+"""
+
+_PY_RULE_COLLECTION_FIELD_TEMPLATE = """\
+{snake_case_name}: _{pascal_case_name} = dataclasses.field(
+    default=_{pascal_case_name}.from_sarif(**{sarif_dict}),
     init=False,
 )
-\"\"\"{2}\"\"\"
+\"\"\"{short_description}\"\"\"
 """
 
 _CPP_RULE_TEMPLATE = """\
 /**
- * @brief {1}
+ * @brief {short_description}
  */
-{0},
+{name},
 """
 
 _RuleType = Mapping[str, Any]
@@ -56,24 +75,62 @@ def _kebab_case_to_pascal_case(name: str) -> str:
     return "".join(word.capitalize() for word in name.split("-"))
 
 
-def _format_rule_for_python(rule: _RuleType) -> str:
-    name = _kebab_case_to_snake_case(rule["name"])
+def _format_rule_for_python_class(rule: _RuleType) -> str:
+    pascal_case_name = _kebab_case_to_pascal_case(rule["name"])
     short_description = rule["short_description"]["text"]
+    message_template = rule["message_strings"]["default"]["text"]
+    field_names = [
+        field_name
+        for _, field_name, _, _ in string.Formatter().parse(message_template)
+        if field_name is not None
+    ]
+    for field_name in field_names:
+        assert isinstance(
+            field_name, str
+        ), f"Unexpected field type {type(field_name)} from {field_name}. "
+        "Field name must be string.\nFull message template: {message_template}"
+        assert (
+            not field_name.isnumeric()
+        ), f"Unexpected numeric field name {field_name}. "
+        "Only keyword name formatting is supported.\nFull message template: {message_template}"
+    message_arguments = ", ".join(field_names)
+    message_arguments_assigned = ", ".join(
+        [f"{field_name}={field_name}" for field_name in field_names]
+    )
+    return _PY_RULE_CLASS_TEMPLATE.format(
+        pascal_case_name=pascal_case_name,
+        short_description=short_description,
+        message_template=repr(message_template),
+        message_arguments=message_arguments,
+        message_arguments_assigned=message_arguments_assigned,
+    )
+
 
-    return _PY_RULE_TEMPLATE.format(name, rule, short_description)
+def _format_rule_for_python_field(rule: _RuleType) -> str:
+    snake_case_name = _kebab_case_to_snake_case(rule["name"])
+    pascal_case_name = _kebab_case_to_pascal_case(rule["name"])
+    short_description = rule["short_description"]["text"]
+
+    return _PY_RULE_COLLECTION_FIELD_TEMPLATE.format(
+        snake_case_name=snake_case_name,
+        pascal_case_name=pascal_case_name,
+        sarif_dict=rule,
+        short_description=short_description,
+    )
 
 
 def _format_rule_for_cpp(rule: _RuleType) -> str:
     name = f"k{_kebab_case_to_pascal_case(rule['name'])}"
     short_description = rule["short_description"]["text"]
-    return _CPP_RULE_TEMPLATE.format(name, short_description)
+    return _CPP_RULE_TEMPLATE.format(name=name, short_description=short_description)
 
 
 def gen_diagnostics_python(
     rules: Sequence[_RuleType], out_py_dir: str, template_dir: str
 ) -> None:
 
-    rule_lines = [_format_rule_for_python(rule) for rule in rules]
+    rule_class_lines = [_format_rule_for_python_class(rule) for rule in rules]
+    rule_field_lines = [_format_rule_for_python_field(rule) for rule in rules]
 
     fm = torchgen_utils.FileManager(
         install_dir=out_py_dir, template_dir=template_dir, dry_run=False
@@ -83,7 +140,9 @@ def gen_diagnostics_python(
         "rules.py.in",
         lambda: {
             "generated_comment": _RULES_GENERATED_COMMENT,
-            "rules": textwrap.indent("\n".join(rule_lines), " " * 4),
+            "generated_rule_class_comment": _PY_RULE_CLASS_COMMENT,
+            "rule_classes": "\n".join(rule_class_lines),
+            "rules": textwrap.indent("\n".join(rule_field_lines), " " * 4),
         },
     )
     _lint_file(os.path.join(out_py_dir, "_rules.py"))
diff --git a/tools/onnx/templates/rules.py.in b/tools/onnx/templates/rules.py.in
index e29c202dc6a70..2137119d14c23 100644
--- a/tools/onnx/templates/rules.py.in
+++ b/tools/onnx/templates/rules.py.in
@@ -7,10 +7,14 @@ import dataclasses
 # flake8: noqa
 from torch.onnx._internal.diagnostics import infra
 
+"""
+${generated_rule_class_comment}
+"""
+
+${rule_classes}
 
 @dataclasses.dataclass
 class _POERules(infra.RuleCollection):
 ${rules}
 
-
 rules = _POERules()
diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
index d2873ddf464cb..f646fe77e07ac 100644
--- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
+++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
@@ -1897,7 +1897,7 @@ void UpdateReliable(
     diagnostics::Diagnose(
         diagnostics::Rule::kNodeMissingOnnxShapeInference,
         diagnostics::Level::kWarning,
-        {output->node()->kind().toDisplayString()});
+        {{"op_name", output->node()->kind().toDisplayString()}});
   }
   auto reliable = false;
   if (inferred) {
diff --git a/torch/csrc/onnx/diagnostics/diagnostics.h b/torch/csrc/onnx/diagnostics/diagnostics.h
index 65f59d4f1f9f8..65ca626b843b7 100644
--- a/torch/csrc/onnx/diagnostics/diagnostics.h
+++ b/torch/csrc/onnx/diagnostics/diagnostics.h
@@ -34,14 +34,6 @@ inline py::object _PyDiagnostics() {
   return py::module::import("torch.onnx._internal.diagnostics");
 }
 
-inline py::object _PyEngine() {
-  return _PyDiagnostics().attr("engine");
-}
-
-inline py::object _PyContext() {
-  return _PyDiagnostics().attr("context");
-}
-
 inline py::object _PyRule(Rule rule) {
   return _PyDiagnostics().attr("rules").attr(
       kPyRuleNames[static_cast<uint32_t>(rule)]);
@@ -55,15 +47,15 @@ inline py::object _PyLevel(Level level) {
 inline void Diagnose(
     Rule rule,
     Level level,
-    std::vector<std::string> messageArgs = {}) {
+    std::unordered_map<std::string, std::string> messageArgs = {}) {
   py::object py_rule = _PyRule(rule);
   py::object py_level = _PyLevel(level);
-  py::object py_context = _PyContext();
 
-  py::dict kwargs = py::dict();
   // TODO: statically check that size of messageArgs matches with rule.
-  kwargs["message_args"] = messageArgs;
-  py_context.attr("diagnose")(py_rule, py_level, **kwargs);
+  py::object py_message =
+      py_rule.attr("format_message")(**py::cast(messageArgs));
+
+  _PyDiagnostics().attr("diagnose")(py_rule, py_level, py_message);
 }
 
 } // namespace diagnostics
diff --git a/torch/onnx/_internal/diagnostics/OVERVIEW.md b/torch/onnx/_internal/diagnostics/OVERVIEW.md
new file mode 100644
index 0000000000000..0dffb0d20b459
--- /dev/null
+++ b/torch/onnx/_internal/diagnostics/OVERVIEW.md
@@ -0,0 +1,83 @@
+# PyTorch ONNX Exporter Diagnostics
+
+NOTE: This feature is underdevelopment and is subject to change.
+
+Summary of source tree:
+- [OVERVIEW.md](OVERVIEW.md): Technical overview of the diagnostics infrastructure.
+- [generated/](generated): Generated diagnostics rules from [rules.yaml](rules.yaml).
+- [infra/](infra): Generic diagnostics infrastructure built on top of [SARIF](https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html).
+- [_diagnostic.py](diagnostic.py): Python API for diagnostics.
+- [rules.yaml](rules.yaml): Single source of truth for diagnostics rules. Used to generate C++ and Python interfaces, and documentation pages.
+- [tools/onnx/](/tools/onnx): Scripts for generating source code and documentation for diagnostics rules.
+
+## Table of Contents
+
+<!-- toc -->
+
+- [Introduction](#introduction)
+  - [Motivation](#motivation)
+    - [Diagnostics as documentation](#diagnostics-as-documentation)
+    - [Different context and background](#different-context-and-background)
+    - [Machine parsable](#machine-parsable)
+  - [Design](#design)
+    - [Adopting SARIF for diagnostic structure](#adopting-sarif-for-diagnostic-structure)
+    - [Single source of truth for diagnostic rules](#single-source-of-truth-for-diagnostic-rules)
+- [Internal Details](#internal-details)
+  - [Rules](#rules)
+  - [Infrastructure](#infrastructure)
+  - [Documentation](#documentation)
+- [Usage](#usage)
+  - [Python](#python)
+  - [C++](#c)
+
+<!-- tocstop -->
+
+# Introduction
+
+The goal is to improve the diagnostics to help users debug and improve their model export.
+* The diagnostics are emitted in machine parsable [Static Analysis Results Interchange Format (SARIF)](https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html).
+* A new clearer, structured way to add new and keep track of diagnostic rules.
+* Serve as foundation for more future improvements consuming the diagnostics.
+
+## Motivation ##
+
+The previous diagnostics were only scattered warning or error messages. They are not structured and are not machine parsable. This makes it hard to consume the diagnostics in a systematic way. This is a blocker for improving the diagnostics and for building tools on top of them. The diagnostics are also not very helpful for users to debug their model export. They are often not actionable and do not provide enough context to help users debug their model export. Some unsupported patterns or code are documented in the [PyTorch ONNX doc](https://pytorch.org/docs/stable/onnx.html#limitations). The information is scattered, hard to find, and hard to maintain and thus often outdated. The new diagnostics system aim to address these issues with the following key properties.
+
+### Diagnostics as documentation
+
+The diagnostics are the source of truth for the documentation of export issues. Documentations are no longer separated. Any changes are directly reflected as the diagnostic progress. The diagnostic itself serves as the means to track the history and progress of any specific issue. Linking the source code, the issues, the PRs, the fix, the docs, etc together through this single entity.
+
+### Different context and background
+
+There are two very different audiences: users and converter developers. The users care more about where the error is coming from the model, and how to resolve it for a successful export. They are not experts in the internal of exporter or JIT. The converter developers on the other hand need more info of the internal state of the converter to debug the issue. The diagnostics should be actionable for users and provide enough context for converter developers to debug and fix the issues. It should display the right information and context to the right audience, in a clean and concise way.
+
+### Machine parsable
+
+The diagnostics are emitted in machine parsable SARIF format. This opens the door for the diagnostics to be consumed by tools and systems. Future applications like auto fixing, formatted displaying, auto reporting, etc are possible.
+
+## Design ##
+
+### Adopting SARIF for diagnostic structure
+
+The diagnostics are emitted in machine parsable [Static Analysis Results Interchange Format (SARIF)](https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html), with [python classes for SARIF object model](https://github.com/microsoft/sarif-python-om) as starting point. This is a standard format for the output of static analysis tools, and can be consumed by the SARIF Viewer, [VSCode extension](https://marketplace.visualstudio.com/items?itemName=MS-SarifVSCode.sarif-viewer) for example. The diagnostics are also emitted in a human readable format for users to read. The human readable format is a subset of the SARIF format. The human readable format is emitted to stdout and the SARIF format is emitted to a file. [Authoring rule metadata and result messages](https://github.com/microsoft/sarif-tutorials/blob/main/docs/Authoring-rule-metadata-and-result-messages.md) is a good starting point for understanding the SARIF format.
+
+### Single source of truth for diagnostic rules
+
+The diagnostic rules are defined in a single location, in [SARIF `reportingDescriptor` format](https://docs.oasis-open.org/sarif/sarif/v2.1.0/os/sarif-v2.1.0-os.html#_Toc34317836). From it, respective C++, python and documentation files are generated during build. With a bit of redundancy, this approach makes all the rules statically accessible under both Python and C++, while maintaining a single source of truth.
+
+# Internal Details
+
+## Rules ##
+
+
+## Infrastructure ##
+
+
+## Documentation ##
+
+
+# Usage
+
+## Python ##
+
+## C++ ##
diff --git a/torch/onnx/_internal/diagnostics/__init__.py b/torch/onnx/_internal/diagnostics/__init__.py
index 822e6a3482e62..304978dbe22da 100644
--- a/torch/onnx/_internal/diagnostics/__init__.py
+++ b/torch/onnx/_internal/diagnostics/__init__.py
@@ -1,19 +1,19 @@
 from ._diagnostic import (
     context,
     create_export_diagnostic_context,
+    diagnose,
     engine,
     ExportDiagnostic,
-    ExportDiagnosticTool,
 )
 from ._rules import rules
 from .infra import levels
 
 __all__ = [
     "ExportDiagnostic",
-    "ExportDiagnosticTool",
     "rules",
     "levels",
     "engine",
     "context",
     "create_export_diagnostic_context",
+    "diagnose",
 ]
diff --git a/torch/onnx/_internal/diagnostics/_diagnostic.py b/torch/onnx/_internal/diagnostics/_diagnostic.py
index 6b1c1216cd141..ae6615e831cb2 100644
--- a/torch/onnx/_internal/diagnostics/_diagnostic.py
+++ b/torch/onnx/_internal/diagnostics/_diagnostic.py
@@ -1,10 +1,10 @@
 """Diagnostic components for PyTorch ONNX export."""
 
 import contextlib
-from typing import Any, Optional, Tuple, TypeVar
+from typing import Optional, TypeVar
 
 import torch
-from torch.onnx._internal.diagnostics import _rules, infra
+from torch.onnx._internal.diagnostics import infra
 
 # This is a workaround for mypy not supporting Self from typing_extensions.
 _ExportDiagnostic = TypeVar("_ExportDiagnostic", bound="ExportDiagnostic")
@@ -20,12 +20,10 @@ class ExportDiagnostic(infra.Diagnostic):
 
     def __init__(
         self,
-        rule: infra.Rule,
-        level: infra.Level,
-        message_args: Optional[Tuple[Any, ...]],
+        *args,
         **kwargs,
     ) -> None:
-        super().__init__(rule, level, message_args, **kwargs)
+        super().__init__(*args, **kwargs)
 
     def with_cpp_stack(self: _ExportDiagnostic) -> _ExportDiagnostic:
         # TODO: Implement this.
@@ -56,22 +54,6 @@ def with_export_source_location(
         return self
 
 
-class ExportDiagnosticTool(infra.DiagnosticTool):
-    """Base class for all export diagnostic tools.
-
-    This class is used to represent all export diagnostic tools. It is a subclass
-    of infra.DiagnosticTool.
-    """
-
-    def __init__(self) -> None:
-        super().__init__(
-            name="torch.onnx.export",
-            version=torch.__version__,
-            rules=_rules.rules,
-            diagnostic_type=ExportDiagnostic,
-        )
-
-
 class ExportDiagnosticEngine(infra.DiagnosticEngine):
     """PyTorch ONNX Export diagnostic engine.
 
@@ -93,7 +75,10 @@ class ExportDiagnosticEngine(infra.DiagnosticEngine):
     def __init__(self) -> None:
         super().__init__()
         self._background_context = infra.DiagnosticContext(
-            ExportDiagnosticTool(), options=None
+            name="torch.onnx",
+            version=torch.__version__,
+            diagnostic_type=ExportDiagnostic,
+            options=None,
         )
 
     @property
@@ -102,7 +87,7 @@ def background_context(self) -> infra.DiagnosticContext:
 
     def clear(self):
         super().clear()
-        self._background_context._diagnostics.clear()
+        self._background_context.diagnostics.clear()
 
     def sarif_log(self):
         log = super().sarif_log()
@@ -122,8 +107,26 @@ def create_export_diagnostic_context():
     export internals via global variable. See `ExportDiagnosticEngine` for more details.
     """
     global context
-    context = engine.create_diagnostic_context(ExportDiagnosticTool())
+    context = engine.create_diagnostic_context(
+        "torch.onnx.export", torch.__version__, diagnostic_type=ExportDiagnostic
+    )
     try:
         yield context
     finally:
         context = engine.background_context
+
+
+def diagnose(
+    rule: infra.Rule,
+    level: infra.Level,
+    message: Optional[str] = None,
+    **kwargs,
+) -> ExportDiagnostic:
+    """Creates a diagnostic and record it in the global diagnostic context.
+
+    This is a wrapper around `context.record` that uses the global diagnostic context.
+    """
+    global context
+    diagnostic = ExportDiagnostic(rule, level, message, **kwargs)
+    context.add_diagnostic(diagnostic)
+    return diagnostic
diff --git a/torch/onnx/_internal/diagnostics/_rules.py b/torch/onnx/_internal/diagnostics/_rules.py
index 430fe3ea4fe58..f9948388d5da4 100644
--- a/torch/onnx/_internal/diagnostics/_rules.py
+++ b/torch/onnx/_internal/diagnostics/_rules.py
@@ -11,22 +11,78 @@
 # flake8: noqa
 from torch.onnx._internal.diagnostics import infra
 
+"""
+GENERATED CODE - DO NOT EDIT DIRECTLY
+The purpose of generating a class for each rule is to override the `format_message`
+method to provide more details in the signature about the format arguments.
+"""
+
+
+class _NodeMissingOnnxShapeInference(infra.Rule):
+    """Node is missing ONNX shape inference."""
+
+    def format_message(self, op_name) -> str:  # type: ignore[override]
+        """Returns the formatted default message of this Rule.
+
+        Message template: 'The shape inference of {op_name} type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function.'
+        """
+        return self.message_default_template.format(op_name=op_name)
+
+
+class _MissingCustomSymbolicFunction(infra.Rule):
+    """Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX."""
+
+    def format_message(self, op_name) -> str:  # type: ignore[override]
+        """Returns the formatted default message of this Rule.
+
+        Message template: 'ONNX export failed on an operator with unrecognized namespace {op_name}. If you are trying to export a custom operator, make sure you registered it with the right domain and version.'
+        """
+        return self.message_default_template.format(op_name=op_name)
+
+
+class _MissingStandardSymbolicFunction(infra.Rule):
+    """Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX."""
+
+    def format_message(self, op_name, opset_version, issue_url) -> str:  # type: ignore[override]
+        """Returns the formatted default message of this Rule.
+
+        Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Please feel free to request support or submit a pull request on PyTorch GitHub: {issue_url}."
+        """
+        return self.message_default_template.format(
+            op_name=op_name, opset_version=opset_version, issue_url=issue_url
+        )
+
+
+class _OperatorSupportedInNewerOpsetVersion(infra.Rule):
+    """Operator is supported in newer opset version."""
+
+    def format_message(self, op_name, opset_version, supported_opset_version) -> str:  # type: ignore[override]
+        """Returns the formatted default message of this Rule.
+
+        Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Support for this operator was added in version {supported_opset_version}, try exporting with this version."
+        """
+        return self.message_default_template.format(
+            op_name=op_name,
+            opset_version=opset_version,
+            supported_opset_version=supported_opset_version,
+        )
+
 
 @dataclasses.dataclass
 class _POERules(infra.RuleCollection):
-    node_missing_onnx_shape_inference: infra.Rule = dataclasses.field(
-        default=infra.Rule.from_sarif(
+    node_missing_onnx_shape_inference: _NodeMissingOnnxShapeInference = dataclasses.field(
+        default=_NodeMissingOnnxShapeInference.from_sarif(
             **{
                 "id": "POE0001",
                 "name": "node-missing-onnx-shape-inference",
                 "short_description": {"text": "Node is missing ONNX shape inference."},
                 "full_description": {
-                    "text": "",
+                    "text": "Node is missing ONNX shape inference. This usually happens when the node is not valid under standard ONNX operator spec.",
                     "markdown": "Node is missing ONNX shape inference.\nThis usually happens when the node is not valid under standard ONNX operator spec.\n",
                 },
                 "message_strings": {
                     "default": {
-                        "text": "The shape inference of {0} type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function."
+                        "text": "The shape inference of {op_name} type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function."
                     }
                 },
                 "help_uri": None,
@@ -37,8 +93,8 @@ class _POERules(infra.RuleCollection):
     )
     """Node is missing ONNX shape inference."""
 
-    missing_custom_symbolic_function: infra.Rule = dataclasses.field(
-        default=infra.Rule.from_sarif(
+    missing_custom_symbolic_function: _MissingCustomSymbolicFunction = dataclasses.field(
+        default=_MissingCustomSymbolicFunction.from_sarif(
             **{
                 "id": "POE0002",
                 "name": "missing-custom-symbolic-function",
@@ -46,12 +102,12 @@ class _POERules(infra.RuleCollection):
                     "text": "Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX."
                 },
                 "full_description": {
-                    "text": "",
+                    "text": "Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX.",
                     "markdown": "Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX.\n",
                 },
                 "message_strings": {
                     "default": {
-                        "text": "ONNX export failed on an operator with unrecognized namespace {0}. If you are trying to export a custom operator, make sure you registered it with the right domain and version."
+                        "text": "ONNX export failed on an operator with unrecognized namespace {op_name}. If you are trying to export a custom operator, make sure you registered it with the right domain and version."
                     }
                 },
                 "help_uri": None,
@@ -62,8 +118,8 @@ class _POERules(infra.RuleCollection):
     )
     """Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX."""
 
-    missing_standard_symbolic_function: infra.Rule = dataclasses.field(
-        default=infra.Rule.from_sarif(
+    missing_standard_symbolic_function: _MissingStandardSymbolicFunction = dataclasses.field(
+        default=_MissingStandardSymbolicFunction.from_sarif(
             **{
                 "id": "POE0003",
                 "name": "missing-standard-symbolic-function",
@@ -71,12 +127,12 @@ class _POERules(infra.RuleCollection):
                     "text": "Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX."
                 },
                 "full_description": {
-                    "text": "",
+                    "text": "Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX.",
                     "markdown": "Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX.\n",
                 },
                 "message_strings": {
                     "default": {
-                        "text": "Exporting the operator '{0}' to ONNX opset version {1} is not supported. Please feel free to request support or submit a pull request on PyTorch GitHub: {2}."
+                        "text": "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Please feel free to request support or submit a pull request on PyTorch GitHub: {issue_url}."
                     }
                 },
                 "help_uri": None,
@@ -87,8 +143,8 @@ class _POERules(infra.RuleCollection):
     )
     """Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX."""
 
-    operator_supported_in_newer_opset_version: infra.Rule = dataclasses.field(
-        default=infra.Rule.from_sarif(
+    operator_supported_in_newer_opset_version: _OperatorSupportedInNewerOpsetVersion = dataclasses.field(
+        default=_OperatorSupportedInNewerOpsetVersion.from_sarif(
             **{
                 "id": "POE0004",
                 "name": "operator-supported-in-newer-opset-version",
@@ -96,12 +152,12 @@ class _POERules(infra.RuleCollection):
                     "text": "Operator is supported in newer opset version."
                 },
                 "full_description": {
-                    "text": "",
+                    "text": "Operator is supported in newer opset version.",
                     "markdown": "Operator is supported in newer opset version.\n\nExample:\n```python\ntorch.onnx.export(model, args, ..., opset_version=9)\n```\n",
                 },
                 "message_strings": {
                     "default": {
-                        "text": "Exporting the operator '{0}' to ONNX opset version {1} is not supported. Support for this operator was added in version {2}, try exporting with this version."
+                        "text": "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Support for this operator was added in version {supported_opset_version}, try exporting with this version."
                     }
                 },
                 "help_uri": None,
diff --git a/torch/onnx/_internal/diagnostics/infra/__init__.py b/torch/onnx/_internal/diagnostics/infra/__init__.py
index 6a51350871f92..ac9e6e99a9746 100644
--- a/torch/onnx/_internal/diagnostics/infra/__init__.py
+++ b/torch/onnx/_internal/diagnostics/infra/__init__.py
@@ -2,7 +2,6 @@
     Diagnostic,
     DiagnosticContext,
     DiagnosticOptions,
-    DiagnosticTool,
     Level,
     levels,
     Location,
@@ -17,7 +16,6 @@
     "DiagnosticContext",
     "DiagnosticEngine",
     "DiagnosticOptions",
-    "DiagnosticTool",
     "Level",
     "levels",
     "Location",
diff --git a/torch/onnx/_internal/diagnostics/infra/_infra.py b/torch/onnx/_internal/diagnostics/infra/_infra.py
index 14be9d205dbb6..6966ccccbb264 100644
--- a/torch/onnx/_internal/diagnostics/infra/_infra.py
+++ b/torch/onnx/_internal/diagnostics/infra/_infra.py
@@ -4,7 +4,7 @@
 
 import dataclasses
 import enum
-from typing import Any, FrozenSet, List, Optional, Sequence, Set, Tuple, Type, TypeVar
+from typing import FrozenSet, List, Optional, Sequence, Tuple, Type, TypeVar
 
 from torch.onnx._internal.diagnostics.infra import formatter, sarif
 
@@ -32,6 +32,21 @@ class Tag(enum.Enum):
     pass
 
 
+class PatchedPropertyBag(sarif.PropertyBag):
+    """Key/value pairs that provide additional information about the object.
+
+    The definition of PropertyBag via SARIF spec is "A property bag is an object (§3.6)
+    containing an unordered set of properties with arbitrary names." However it is not
+    reflected in the json file, and therefore not captured by the python representation.
+    This patch adds additional **kwargs to the `__init__` method to allow recording
+    arbitrary key/value pairs.
+    """
+
+    def __init__(self, tags: Optional[List[str]] = None, **kwargs):
+        super().__init__(tags=tags)
+        self.__dict__.update(kwargs)
+
+
 @dataclasses.dataclass(frozen=True)
 class Rule:
     id: str
@@ -39,22 +54,16 @@ class Rule:
     message_default_template: str
     short_description: Optional[str] = None
     full_description: Optional[str] = None
+    full_description_markdown: Optional[str] = None
     help_uri: Optional[str] = None
 
     @classmethod
-    def from_sarif(cls, **kwargs) -> Rule:
+    def from_sarif(cls, **kwargs):
         """Returns a rule from the SARIF reporting descriptor."""
-        short_description = (
-            kwargs["short_description"]["text"]
-            if "short_description" in kwargs
-            else None
-        )
-        full_description = (
-            kwargs["full_description"]["markdown"]
-            if "full_description" in kwargs
-            else None
-        )
-        help_uri = kwargs["help_uri"] if "help_uri" in kwargs else None
+        short_description = kwargs.get("short_description", {}).get("text")
+        full_description = kwargs.get("full_description", {}).get("text")
+        full_description_markdown = kwargs.get("full_description", {}).get("markdown")
+        help_uri = kwargs.get("help_uri")
 
         rule = cls(
             id=kwargs["id"],
@@ -62,6 +71,7 @@ def from_sarif(cls, **kwargs) -> Rule:
             message_default_template=kwargs["message_strings"]["default"]["text"],
             short_description=short_description,
             full_description=full_description,
+            full_description_markdown=full_description_markdown,
             help_uri=help_uri,
         )
         return rule
@@ -74,7 +84,9 @@ def sarif(self) -> sarif.ReportingDescriptor:
             else None
         )
         full_description = (
-            sarif.MultiformatMessageString(text="", markdown=self.full_description)
+            sarif.MultiformatMessageString(
+                text=self.full_description, markdown=self.full_description_markdown
+            )
             if self.full_description is not None
             else None
         )
@@ -86,6 +98,15 @@ def sarif(self) -> sarif.ReportingDescriptor:
             help_uri=self.help_uri,
         )
 
+    def format_message(self, *args, **kwargs) -> str:
+        """Returns the formatted default message of this Rule.
+
+        This method should be overridden (with code generation) by subclasses to reflect
+        the exact arguments needed by the message template. This is a helper method to
+        create the default message for a diagnostic.
+        """
+        return self.message_default_template.format(*args, **kwargs)
+
 
 @dataclasses.dataclass
 class Location:
@@ -147,21 +168,40 @@ def add_frame(
 _Diagnostic = TypeVar("_Diagnostic", bound="Diagnostic")
 
 
+@dataclasses.dataclass
+class Graph:
+    """A graph of diagnostics.
+
+    This class stores the string representation of a model graph.
+    The `nodes` and `edges` fields are unused in the current implementation.
+    """
+
+    graph_str: str
+    name: str
+    description: Optional[str] = None
+
+    def sarif(self) -> sarif.Graph:
+        """Returns the SARIF representation of this graph."""
+        return sarif.Graph(
+            description=sarif.Message(text=self.graph_str),
+            properties=PatchedPropertyBag(name=self.name, description=self.description),
+        )
+
+
 @dataclasses.dataclass
 class Diagnostic:
     rule: Rule
     level: Level
-    message_args: Optional[Tuple[Any, ...]]
+    message: Optional[str] = None
     locations: List[Location] = dataclasses.field(default_factory=list)
     stacks: List[Stack] = dataclasses.field(default_factory=list)
+    graphs: List[Graph] = dataclasses.field(default_factory=list)
     additional_message: Optional[str] = None
     tags: List[Tag] = dataclasses.field(default_factory=list)
 
     def sarif(self) -> sarif.Result:
         """Returns the SARIF Result representation of this diagnostic."""
-        if self.message_args is None:
-            self.message_args = tuple()
-        message = self.rule.message_default_template.format(*self.message_args)
+        message = self.message or self.rule.message_default_template
         if self.additional_message is not None:
             message = f"{message}\n{self.additional_message}"
         sarif_result = sarif.Result(
@@ -171,6 +211,7 @@ def sarif(self) -> sarif.Result:
         )
         sarif_result.locations = [location.sarif() for location in self.locations]
         sarif_result.stacks = [stack.sarif() for stack in self.stacks]
+        sarif_result.graphs = [graph.sarif() for graph in self.graphs]
         sarif_result.properties = sarif.PropertyBag(
             tags=[tag.value for tag in self.tags]
         )
@@ -186,6 +227,11 @@ def with_stack(self: _Diagnostic, stack: Stack) -> _Diagnostic:
         self.stacks.append(stack)
         return self
 
+    def with_graph(self: _Diagnostic, graph: Graph) -> _Diagnostic:
+        """Adds a graph to the diagnostic."""
+        self.graphs.append(graph)
+        return self
+
     def with_additional_message(self: _Diagnostic, message: str) -> _Diagnostic:
         """Adds an additional message to the diagnostic."""
         if self.additional_message is None:
@@ -231,61 +277,6 @@ def custom_collection_from_list(
         )()
 
 
-@dataclasses.dataclass(frozen=True)
-class DiagnosticTool:
-    name: str
-    version: str
-    rules: RuleCollection
-    diagnostic_type: Type[Diagnostic] = dataclasses.field(default=Diagnostic)
-    _triggered_rules: Set[Rule] = dataclasses.field(init=False, default_factory=set)
-
-    def __post_init__(self) -> None:
-        if not issubclass(self.diagnostic_type, Diagnostic):
-            raise TypeError(
-                "Expected diagnostic_type to be a subclass of Diagnostic, "
-                f"but got {self.diagnostic_type}"
-            )
-
-    def sarif(self) -> sarif.Tool:
-        """Returns the SARIF Tool representation."""
-        return sarif.Tool(
-            driver=sarif.ToolComponent(
-                name=self.name,
-                version=self.version,
-                rules=[rule.sarif() for rule in self._triggered_rules],
-            )
-        )
-
-    def create_diagnostic(
-        self,
-        rule: Rule,
-        level: Level,
-        message_args: Optional[Tuple[Any, ...]],
-        **kwargs,
-    ) -> Diagnostic:
-        """Creates a diagnostic for the given arguments.
-
-        Args:
-            rule: The rule that triggered the diagnostic.
-            level: The level of the diagnostic.
-            message_args: The arguments to format the rule's message template.
-            **kwargs: Additional arguments to pass to the Diagnostic constructor.
-
-        Returns:
-            The created diagnostic.
-
-        Raises:
-            ValueError: If the rule is not supported by the tool.
-        """
-        if rule not in self.rules:
-            raise ValueError(
-                f"Rule '{rule.id}:{rule.name}' is not supported by this tool '{self.name} {self.version}'."
-                f" Supported rules are: {self.rules._rule_id_name_set}"
-            )
-        self._triggered_rules.add(rule)
-        return self.diagnostic_type(rule, level, message_args, **kwargs)
-
-
 class Invocation:
     # TODO: Implement this.
     def __init__(self) -> None:
@@ -301,9 +292,11 @@ class DiagnosticOptions:
 
 @dataclasses.dataclass
 class DiagnosticContext:
-    tool: DiagnosticTool
+    name: str
+    version: str
     options: Optional[DiagnosticOptions] = None
-    _diagnostics: List[Diagnostic] = dataclasses.field(init=False, default_factory=list)
+    diagnostic_type: Type[Diagnostic] = dataclasses.field(default=Diagnostic)
+    diagnostics: List[Diagnostic] = dataclasses.field(init=False, default_factory=list)
     _invocation: Invocation = dataclasses.field(init=False)
 
     def __enter__(self):
@@ -315,15 +308,34 @@ def __exit__(self, exc_type, exc_val, exc_tb):
     def sarif(self) -> sarif.Run:
         """Returns the SARIF Run object."""
         return sarif.Run(
-            tool=self.tool.sarif(),
-            results=[diagnostic.sarif() for diagnostic in self._diagnostics],
+            tool=sarif.Tool(
+                driver=sarif.ToolComponent(
+                    name=self.name,
+                    version=self.version,
+                    rules=[diagnostic.rule.sarif() for diagnostic in self.diagnostics],
+                )
+            ),
+            results=[diagnostic.sarif() for diagnostic in self.diagnostics],
         )
 
+    def add_diagnostic(self, diagnostic: Diagnostic) -> None:
+        """Adds a diagnostic to the context.
+
+        Use this method to add diagnostics that are not created by the context.
+        Args:
+            diagnostic: The diagnostic to add.
+        """
+        if not isinstance(diagnostic, self.diagnostic_type):
+            raise TypeError(
+                f"Expected diagnostic of type {self.diagnostic_type}, got {type(diagnostic)}"
+            )
+        self.diagnostics.append(diagnostic)
+
     def diagnose(
         self,
         rule: Rule,
         level: Level,
-        message_args: Optional[Tuple[Any, ...]] = None,
+        message: Optional[str] = None,
         **kwargs,
     ) -> Diagnostic:
         """Creates a diagnostic for the given arguments.
@@ -331,7 +343,7 @@ def diagnose(
         Args:
             rule: The rule that triggered the diagnostic.
             level: The level of the diagnostic.
-            message_args: The arguments to format the rule's message template.
+            message: The message of the diagnostic.
             **kwargs: Additional arguments to pass to the Diagnostic constructor.
 
         Returns:
@@ -340,6 +352,6 @@ def diagnose(
         Raises:
             ValueError: If the rule is not supported by the tool.
         """
-        diagnostic = self.tool.create_diagnostic(rule, level, message_args, **kwargs)
-        self._diagnostics.append(diagnostic)
+        diagnostic = self.diagnostic_type(rule, level, message, **kwargs)
+        self.add_diagnostic(diagnostic)
         return diagnostic
diff --git a/torch/onnx/_internal/diagnostics/infra/engine.py b/torch/onnx/_internal/diagnostics/infra/engine.py
index 19fd846c35de7..2678268fbaf9a 100644
--- a/torch/onnx/_internal/diagnostics/infra/engine.py
+++ b/torch/onnx/_internal/diagnostics/infra/engine.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import List, Optional
+from typing import List, Optional, Type
 
 from torch.onnx._internal.diagnostics import infra
 from torch.onnx._internal.diagnostics.infra import formatter, sarif
@@ -14,9 +14,7 @@ class DiagnosticEngine:
 
     This class is the main interface for diagnostics. It manages the creation of diagnostic contexts.
     A DiagnosticContext provides the entry point for recording Diagnostics.
-    Each DiagnosticContext is powered by a DiagnosticTool, which can be customized with
-    custom RuleCollection and Diagnostic type.
-    See infra.DiagnosticContext and infra.DiagnosticTool for more details.
+    See infra.DiagnosticContext for more details.
 
     Examples:
         Step 1: Create a set of rules.
@@ -31,36 +29,29 @@ class DiagnosticEngine:
         ...     ],
         ... )
 
-        Step 2: Create a diagnostic tool.
-        >>> tool = infra.DiagnosticTool(
-        ...     name="tool",
-        ...     version="1.0.0",
-        ...     rules=rules,
-        ... )
-
-        Step 3: Create a diagnostic engine.
+        Step 2: Create a diagnostic engine.
         >>> engine = DiagnosticEngine()
 
-        Step 4: Start a new diagnostic context.
-        >>> with engine.start_diagnostic_context(tool) as context:
+        Step 3: Start a new diagnostic context.
+        >>> with engine.create_diagnostic_context("torch.onnx.export", version="1.0") as context:
 
-        Step 5: Add diagnostics in your code.
+        Step 4: Add diagnostics in your code.
         ...     context.diagnose(rules.rule1, infra.Level.ERROR)
 
-        Step 6: Afterwards, get the SARIF log.
+        Step 5: Afterwards, get the SARIF log.
         >>> sarif_log = engine.sarif_log()
     """
 
-    _contexts: List[infra.DiagnosticContext]
+    contexts: List[infra.DiagnosticContext]
 
     def __init__(self) -> None:
-        self._contexts = []
+        self.contexts = []
 
     def sarif_log(self) -> sarif.SarifLog:
         return sarif.SarifLog(
             version=sarif_version.SARIF_VERSION,
             schema_uri=sarif_version.SARIF_SCHEMA_LINK,
-            runs=[context.sarif() for context in self._contexts],
+            runs=[context.sarif() for context in self.contexts],
         )
 
     def __str__(self) -> str:
@@ -75,13 +66,27 @@ def to_json(self) -> str:
 
     def clear(self) -> None:
         """Clears all diagnostic contexts."""
-        self._contexts.clear()
+        self.contexts.clear()
 
     def create_diagnostic_context(
         self,
-        tool: infra.DiagnosticTool,
+        name: str,
+        version: str,
         options: Optional[infra.DiagnosticOptions] = None,
+        diagnostic_type: Type[infra.Diagnostic] = infra.Diagnostic,
     ) -> infra.DiagnosticContext:
-        context = infra.DiagnosticContext(tool, options)
-        self._contexts.append(context)
+        """Creates a new diagnostic context.
+
+        Args:
+            name: The subject name for the diagnostic context.
+            version: The subject version for the diagnostic context.
+            options: The options for the diagnostic context.
+
+        Returns:
+            A new diagnostic context.
+        """
+        context = infra.DiagnosticContext(
+            name, version, options, diagnostic_type=diagnostic_type
+        )
+        self.contexts.append(context)
         return context
diff --git a/torch/onnx/_internal/diagnostics/rules.yaml b/torch/onnx/_internal/diagnostics/rules.yaml
index 717ce5e139fe0..9d527bccf1e25 100644
--- a/torch/onnx/_internal/diagnostics/rules.yaml
+++ b/torch/onnx/_internal/diagnostics/rules.yaml
@@ -11,13 +11,14 @@
   short_description:
     text: Node is missing ONNX shape inference.
   full_description:
-    text: ""
+    text: "Node is missing ONNX shape inference.
+      This usually happens when the node is not valid under standard ONNX operator spec."
     markdown: |
       Node is missing ONNX shape inference.
       This usually happens when the node is not valid under standard ONNX operator spec.
   message_strings:
     default:
-      text: "The shape inference of {0} type is missing, so it may result in wrong shape inference for the exported graph.
+      text: "The shape inference of {op_name} type is missing, so it may result in wrong shape inference for the exported graph.
       Please consider adding it in symbolic function."
   help_uri:
   properties:
@@ -29,12 +30,12 @@
   short_description:
     text: Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX.
   full_description:
-    text: ""
+    text: Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX.
     markdown: |
       Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX.
   message_strings:
     default:
-      text: "ONNX export failed on an operator with unrecognized namespace {0}.
+      text: "ONNX export failed on an operator with unrecognized namespace {op_name}.
       If you are trying to export a custom operator, make sure you registered
       it with the right domain and version."
   help_uri:
@@ -47,13 +48,13 @@
   short_description:
     text: Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX.
   full_description:
-    text: ""
+    text: Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX.
     markdown: |
       Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX.
   message_strings:
     default:
-      text: "Exporting the operator '{0}' to ONNX opset version {1} is not supported.
-      Please feel free to request support or submit a pull request on PyTorch GitHub: {2}."
+      text: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported.
+      Please feel free to request support or submit a pull request on PyTorch GitHub: {issue_url}."
   help_uri:
   properties:
     deprecated: false
@@ -65,7 +66,7 @@
   short_description:
     text: Operator is supported in newer opset version.
   full_description:
-    text: ""
+    text: Operator is supported in newer opset version.
     markdown: |
       Operator is supported in newer opset version.
 
@@ -75,8 +76,8 @@
       ```
   message_strings:
     default:
-      text: "Exporting the operator '{0}' to ONNX opset version {1} is not supported.
-      Support for this operator was added in version {2}, try exporting with this version."
+      text: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported.
+      Support for this operator was added in version {supported_opset_version}, try exporting with this version."
   help_uri:
   properties:
     deprecated: false
diff --git a/torch/onnx/errors.py b/torch/onnx/errors.py
index 467494c560447..f5ad684cf1686 100644
--- a/torch/onnx/errors.py
+++ b/torch/onnx/errors.py
@@ -46,49 +46,27 @@ class UnsupportedOperatorError(OnnxExporterError):
     """Raised when an operator is unsupported by the exporter."""
 
     def __init__(self, name: str, version: int, supported_version: Optional[int]):
-        msg = f"Exporting the operator '{name}' to ONNX opset version {version} is not supported. "
         if supported_version is not None:
-            msg += (
-                f"Support for this operator was added in version {supported_version}. "
-                "Please try exporting with this version."
-            )
-            diagnostics.context.diagnose(
-                diagnostics.rules.operator_supported_in_newer_opset_version,
-                diagnostics.levels.ERROR,
-                message_args=(
-                    name,
-                    version,
-                    supported_version,
-                ),
+            diagnostic_rule: diagnostics.infra.Rule = (
+                diagnostics.rules.operator_supported_in_newer_opset_version
             )
+            msg = diagnostic_rule.format_message(name, version, supported_version)
+            diagnostics.diagnose(diagnostic_rule, diagnostics.levels.ERROR, msg)
         else:
-            msg += "Please feel free to request support or submit a pull request on PyTorch GitHub: "
-            msg += _constants.PYTORCH_GITHUB_ISSUES_URL
-
             if (
                 name.startswith("aten::")
                 or name.startswith("prim::")
                 or name.startswith("quantized::")
             ):
-                diagnostics.context.diagnose(
-                    diagnostics.rules.missing_standard_symbolic_function,
-                    diagnostics.levels.ERROR,
-                    message_args=(
-                        name,
-                        version,
-                        _constants.PYTORCH_GITHUB_ISSUES_URL,
-                    ),
+                diagnostic_rule = diagnostics.rules.missing_standard_symbolic_function
+                msg = diagnostic_rule.format_message(
+                    name, version, _constants.PYTORCH_GITHUB_ISSUES_URL
                 )
+                diagnostics.diagnose(diagnostic_rule, diagnostics.levels.ERROR, msg)
             else:
-                msg += (
-                    "If you are trying to export a custom operator, make sure you registered "
-                    "it with the correct domain and version."
-                )
-                diagnostics.context.diagnose(
-                    diagnostics.rules.missing_custom_symbolic_function,
-                    diagnostics.levels.ERROR,
-                    message_args=(name,),
-                )
+                diagnostic_rule = diagnostics.rules.missing_custom_symbolic_function
+                msg = diagnostic_rule.format_message(name)
+                diagnostics.diagnose(diagnostic_rule, diagnostics.levels.ERROR, msg)
         super().__init__(msg)
 
 
From 30b86fc438f39b6279758b298944b44d744010bb Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Tue, 8 Nov 2022 10:22:31 -0800
Subject: [PATCH 0766/1922] [ONNX] Create common ExportTestCase base class
 (#88145)

Refactor out a common base class `ExportTestCase`, for common things in `setUp`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88145
Approved by: https://github.com/justinchuby, https://github.com/abock, https://github.com/AllenTiTaiWang
---
 test/onnx/onnx_test_common.py                 | 14 +++-------
 test/onnx/pytorch_test_common.py              | 26 ++++++++++++++++++
 test/onnx/test_autograd_funs.py               |  8 +++---
 test/onnx/test_custom_ops.py                  |  7 ++---
 test/onnx/test_export_modes.py                |  4 ++-
 test/onnx/test_models.py                      |  5 ++--
 test/onnx/test_models_onnxruntime.py          |  5 ++--
 test/onnx/test_onnx_opset.py                  |  3 ++-
 test/onnx/test_operators.py                   |  5 ++++
 test/onnx/test_pytorch_helper.py              |  3 ++-
 test/onnx/test_pytorch_jit_onnx.py            |  3 ++-
 test/onnx/test_pytorch_onnx_caffe2.py         | 27 +++++++------------
 .../test_pytorch_onnx_caffe2_quantized.py     |  3 ++-
 test/onnx/test_pytorch_onnx_no_runtime.py     |  4 +--
 .../onnx/test_pytorch_onnx_shape_inference.py |  3 ++-
 test/onnx/test_utility_funs.py                | 11 +++-----
 16 files changed, 76 insertions(+), 55 deletions(-)

diff --git a/test/onnx/onnx_test_common.py b/test/onnx/onnx_test_common.py
index 45f90d4193ce7..6963d16284ce6 100644
--- a/test/onnx/onnx_test_common.py
+++ b/test/onnx/onnx_test_common.py
@@ -3,15 +3,13 @@
 from __future__ import annotations
 
 import os
-import random
 from typing import Any, Mapping, Type
 
-import numpy as np
 import onnxruntime
+import pytorch_test_common
 
 import torch
 from torch.onnx import _constants, verification
-from torch.testing._internal import common_utils
 
 onnx_model_dir = os.path.join(
     os.path.dirname(os.path.realpath(__file__)),
@@ -54,13 +52,7 @@ def parameterize_class_name(cls: Type, idx: int, input_dicts: Mapping[Any, Any])
     return f"{cls.__name__}_{suffix}"
 
 
-def set_rng_seed(seed):
-    torch.manual_seed(seed)
-    random.seed(seed)
-    np.random.seed(seed)
-
-
-class _TestONNXRuntime(common_utils.TestCase):
+class _TestONNXRuntime(pytorch_test_common.ExportTestCase):
     opset_version = _constants.ONNX_DEFAULT_OPSET
     keep_initializers_as_inputs = True  # For IR version 3 type export.
     is_script = False
@@ -68,7 +60,7 @@ class _TestONNXRuntime(common_utils.TestCase):
     check_dtype = True
 
     def setUp(self):
-        set_rng_seed(0)
+        super().setUp()
         onnxruntime.set_seed(0)
         if torch.cuda.is_available():
             torch.cuda.manual_seed_all(0)
diff --git a/test/onnx/pytorch_test_common.py b/test/onnx/pytorch_test_common.py
index 4a44932fb1206..4e443c333f35f 100644
--- a/test/onnx/pytorch_test_common.py
+++ b/test/onnx/pytorch_test_common.py
@@ -2,12 +2,17 @@
 
 import functools
 import os
+import random
 import sys
 import unittest
 from typing import Optional
 
+import numpy as np
+
 import torch
 from torch.autograd import function
+from torch.onnx._internal import diagnostics
+from torch.testing._internal import common_utils
 
 pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 sys.path.insert(-1, pytorch_test_dir)
@@ -188,3 +193,24 @@ def wrapper(self, *args, **kwargs):
 
 def flatten(x):
     return tuple(function._iter_filter(lambda o: isinstance(o, torch.Tensor))(x))
+
+
+def set_rng_seed(seed):
+    torch.manual_seed(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+
+
+class ExportTestCase(common_utils.TestCase):
+    """Test case for ONNX export.
+
+    Any test case that tests functionalities under torch.onnx should inherit from this class.
+    """
+
+    def setUp(self):
+        super().setUp()
+        # TODO(#88264): Flaky test failures after changing seed.
+        set_rng_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+        diagnostics.engine.clear()
diff --git a/test/onnx/test_autograd_funs.py b/test/onnx/test_autograd_funs.py
index 97f0652ecf378..a5498f39d2da7 100644
--- a/test/onnx/test_autograd_funs.py
+++ b/test/onnx/test_autograd_funs.py
@@ -1,16 +1,16 @@
 # Owner(s): ["module: onnx"]
 
-import unittest
+import pytorch_test_common
 
 import torch
-
 from onnx_test_common import run_model_test
 from torch.onnx import OperatorExportTypes
 from torch.onnx._globals import GLOBALS
 from torch.onnx.utils import _model_to_graph
+from torch.testing._internal import common_utils
 
 
-class TestAutogradFuns(unittest.TestCase):
+class TestAutogradFuns(pytorch_test_common.ExportTestCase):
     opset_version = GLOBALS.export_onnx_opset_version
     keep_initializers_as_inputs = False
     onnx_shape_inference = True
@@ -209,4 +209,4 @@ def forward(self, input):
 
 
 if __name__ == "__main__":
-    unittest.main()
+    common_utils.run_tests()
diff --git a/test/onnx/test_custom_ops.py b/test/onnx/test_custom_ops.py
index 4242d70583ba3..5609b497535e9 100644
--- a/test/onnx/test_custom_ops.py
+++ b/test/onnx/test_custom_ops.py
@@ -4,6 +4,7 @@
 import numpy as np
 import onnx
 import onnx_test_common
+import pytorch_test_common
 import torch
 import torch.utils.cpp_extension
 from test_pytorch_onnx_caffe2 import do_export
@@ -11,7 +12,7 @@
 from torch.testing._internal import common_utils
 
 
-class TestCustomOps(common_utils.TestCase):
+class TestCustomOps(pytorch_test_common.ExportTestCase):
     def test_custom_add(self):
         op_source = """
         #include <torch/script.h>
@@ -56,7 +57,7 @@ def symbolic_custom_add(g, self, other):
         np.testing.assert_array_equal(caffe2_out[0], model(x, y).cpu().numpy())
 
 
-class TestCustomAutogradFunction(common_utils.TestCase):
+class TestCustomAutogradFunction(pytorch_test_common.ExportTestCase):
     opset_version = 9
     keep_initializers_as_inputs = False
     onnx_shape_inference = True
@@ -130,7 +131,7 @@ def symbolic_pythonop(ctx: torch.onnx.SymbolicContext, g, *args, **kwargs):
         onnx_test_common.run_model_test(self, model, input_args=(x,))
 
 
-class TestExportAsContribOps(common_utils.TestCase):
+class TestExportAsContribOps(pytorch_test_common.ExportTestCase):
     opset_version = 14
     keep_initializers_as_inputs = False
     onnx_shape_inference = True
diff --git a/test/onnx/test_export_modes.py b/test/onnx/test_export_modes.py
index 0f3024a2e366d..502f31b38b10a 100644
--- a/test/onnx/test_export_modes.py
+++ b/test/onnx/test_export_modes.py
@@ -15,11 +15,13 @@
 # Make the helper files in test/ importable
 pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 sys.path.append(pytorch_test_dir)
+import pytorch_test_common
+
 from torch.testing._internal import common_utils
 
 
 # Smoke tests for export methods
-class TestExportModes(common_utils.TestCase):
+class TestExportModes(pytorch_test_common.ExportTestCase):
     class MyModel(nn.Module):
         def __init__(self):
             super(TestExportModes.MyModel, self).__init__()
diff --git a/test/onnx/test_models.py b/test/onnx/test_models.py
index 7084bd75bace7..15904839957ee 100644
--- a/test/onnx/test_models.py
+++ b/test/onnx/test_models.py
@@ -2,8 +2,9 @@
 
 import unittest
 
-import torch
+import pytorch_test_common
 
+import torch
 from model_defs.dcgan import _netD, _netG, bsz, imgsz, nz, weights_init
 from model_defs.emb_seq import EmbeddingNetwork1, EmbeddingNetwork2
 from model_defs.mnist import MNIST
@@ -44,7 +45,7 @@ def toC(x):
 BATCH_SIZE = 2
 
 
-class TestModels(common_utils.TestCase):
+class TestModels(pytorch_test_common.ExportTestCase):
     opset_version = 9  # Caffe2 doesn't support the default.
     keep_initializers_as_inputs = False
 
diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py
index c84640e535e11..de1003ce449e0 100644
--- a/test/onnx/test_models_onnxruntime.py
+++ b/test/onnx/test_models_onnxruntime.py
@@ -8,6 +8,7 @@
 import onnx_test_common
 import parameterized
 import PIL
+import pytorch_test_common
 import test_models
 
 import torch
@@ -64,7 +65,7 @@ def exportTest(
 
 TestModels = type(
     "TestModels",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(
         test_models.TestModels.__dict__,
         is_script_test_enabled=False,
@@ -77,7 +78,7 @@ def exportTest(
 # model tests for scripting with new JIT APIs and shape inference
 TestModels_new_jit_API = type(
     "TestModels_new_jit_API",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(
         TestModels.__dict__,
         exportTest=exportTest,
diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py
index dab33bf00b09d..ef79e82ee266a 100644
--- a/test/onnx/test_onnx_opset.py
+++ b/test/onnx/test_onnx_opset.py
@@ -4,6 +4,7 @@
 import itertools
 
 import onnx
+import pytorch_test_common
 
 import torch
 import torch.onnx
@@ -70,7 +71,7 @@ def check_onnx_opsets_operator(
         check_onnx_opset_operator(model, ops[opset_version], opset_version)
 
 
-class TestONNXOpset(common_utils.TestCase):
+class TestONNXOpset(pytorch_test_common.ExportTestCase):
     def test_opset_fallback(self):
         class MyModule(Module):
             def forward(self, x):
diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py
index 9b743a50d3323..cfb36732af4d8 100644
--- a/test/onnx/test_operators.py
+++ b/test/onnx/test_operators.py
@@ -31,6 +31,7 @@
 )
 from torch.autograd import Function, Variable
 from torch.nn import functional, Module
+from torch.onnx._internal import diagnostics
 from torch.onnx.symbolic_helper import (
     _get_tensor_dim_size,
     _get_tensor_sizes,
@@ -71,6 +72,10 @@ def forward(self, *args):
 
 
 class TestOperators(common_utils.TestCase):
+    def setUp(self):
+        super().setUp()
+        diagnostics.engine.clear()
+
     def assertONNX(self, f, args, params=None, **kwargs):
         if params is None:
             params = ()
diff --git a/test/onnx/test_pytorch_helper.py b/test/onnx/test_pytorch_helper.py
index 362841d8bf90f..7d7f3ade7f581 100644
--- a/test/onnx/test_pytorch_helper.py
+++ b/test/onnx/test_pytorch_helper.py
@@ -4,6 +4,7 @@
 import unittest
 
 import numpy as np
+import pytorch_test_common
 
 import torch.nn.init as init
 import torch.onnx
@@ -15,7 +16,7 @@
 from torch.testing._internal.common_utils import skipIfNoLapack
 
 
-class TestCaffe2Backend(common_utils.TestCase):
+class TestCaffe2Backend(pytorch_test_common.ExportTestCase):
     @skipIfNoLapack
     @unittest.skip("test broken because Lapack was always missing.")
     def test_helper(self):
diff --git a/test/onnx/test_pytorch_jit_onnx.py b/test/onnx/test_pytorch_jit_onnx.py
index f069251ee064c..784bd0954b0ad 100644
--- a/test/onnx/test_pytorch_jit_onnx.py
+++ b/test/onnx/test_pytorch_jit_onnx.py
@@ -1,5 +1,6 @@
 # Owner(s): ["module: onnx"]
 import onnxruntime
+import pytorch_test_common
 
 import torch
 from pytorch_test_common import skipIfNoCuda
@@ -171,7 +172,7 @@ def MakeTestCase(opset_version: int) -> type:
     name = f"TestJITIRToONNX_opset{opset_version}"
     return type(
         str(name),
-        (common_utils.TestCase,),
+        (pytorch_test_common.ExportTestCase,),
         dict(_TestJITIRToONNX.__dict__, opset_version=opset_version),
     )
 
diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py
index 141d3683171f6..78440ac6ecb5b 100644
--- a/test/onnx/test_pytorch_onnx_caffe2.py
+++ b/test/onnx/test_pytorch_onnx_caffe2.py
@@ -12,6 +12,7 @@
 import model_defs.word_language_model as word_language_model
 import numpy as np
 import onnx
+import pytorch_test_common
 import torch.onnx
 import torch.onnx.operators
 import torch.utils.model_zoo as model_zoo
@@ -129,18 +130,10 @@ def do_export(model, inputs, *args, **kwargs):
 }
 
 
-class TestCaffe2Backend_opset9(common_utils.TestCase):
+class TestCaffe2Backend_opset9(pytorch_test_common.ExportTestCase):
     opset_version = 9
     embed_params = False
 
-    def setUp(self):
-        # the following should ideally be super().setUp(), https://github.com/pytorch/pytorch/issues/79630
-        common_utils.TestCase.setUp(self)
-        torch.manual_seed(0)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(0)
-        np.random.seed(seed=0)
-
     def convert_cuda(self, model, input):
         cuda_model = model.cuda()
         # input might be nested - we want to move everything to GPU
@@ -3198,44 +3191,44 @@ def setup_rnn_tests():
 # to embed_params=True
 TestCaffe2BackendEmbed_opset9 = type(
     "TestCaffe2BackendEmbed_opset9",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(TestCaffe2Backend_opset9.__dict__, embed_params=True),
 )
 
 # opset 7 tests
 TestCaffe2Backend_opset7 = type(
     "TestCaffe2Backend_opset7",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(TestCaffe2Backend_opset9.__dict__, opset_version=7),
 )
 TestCaffe2BackendEmbed_opset7 = type(
     "TestCaffe2BackendEmbed_opset7",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(TestCaffe2Backend_opset9.__dict__, embed_params=True, opset_version=7),
 )
 
 # opset 8 tests
 TestCaffe2Backend_opset8 = type(
     "TestCaffe2Backend_opset8",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(TestCaffe2Backend_opset9.__dict__, opset_version=8),
 )
 TestCaffe2BackendEmbed_opset8 = type(
     "TestCaffe2BackendEmbed_opset8",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(TestCaffe2Backend_opset9.__dict__, embed_params=True, opset_version=8),
 )
 
 # opset 10 tests
 TestCaffe2Backend_opset10 = type(
     "TestCaffe2Backend_opset10",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(TestCaffe2Backend_opset9.__dict__, opset_version=10),
 )
 
 TestCaffe2BackendEmbed_opset10 = type(
     "TestCaffe2BackendEmbed_opset10",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(TestCaffe2Backend_opset9.__dict__, embed_params=True, opset_version=10),
 )
 
@@ -3243,7 +3236,7 @@ def setup_rnn_tests():
 # to embed_params=True
 TestCaffe2BackendEmbed_opset9_new_jit_API = type(
     "TestCaffe2BackendEmbed_opset9_new_jit_API",
-    (common_utils.TestCase,),
+    (pytorch_test_common.ExportTestCase,),
     dict(TestCaffe2Backend_opset9.__dict__, embed_params=True),
 )
 
diff --git a/test/onnx/test_pytorch_onnx_caffe2_quantized.py b/test/onnx/test_pytorch_onnx_caffe2_quantized.py
index f6466aa0869e5..92079ebbe6d92 100644
--- a/test/onnx/test_pytorch_onnx_caffe2_quantized.py
+++ b/test/onnx/test_pytorch_onnx_caffe2_quantized.py
@@ -6,13 +6,14 @@
 
 import numpy as np
 import onnx
+import pytorch_test_common
 import torch.ao.nn.quantized as nnq
 import torch.nn as nn
 import torch.onnx
 from torch.testing._internal import common_utils
 
 
-class TestQuantizedOps(common_utils.TestCase):
+class TestQuantizedOps(pytorch_test_common.ExportTestCase):
     def generic_test(
         self, model, sample_inputs, input_names=None, decimal=3, relaxed_check=False
     ):
diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index 1ec86ce69515a..622f42effb4ab 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -11,9 +11,9 @@
 from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
-
 import onnx
 import onnx.numpy_helper
+import pytorch_test_common
 
 import torch
 import torch.nn.functional as F
@@ -74,7 +74,7 @@ def export_to_onnx(
     return onnx_model
 
 
-class TestONNXExport(common_utils.TestCase):
+class TestONNXExport(pytorch_test_common.ExportTestCase):
     def test_fuse_addmm(self):
         class AddmmModel(torch.nn.Module):
             def forward(self, x):
diff --git a/test/onnx/test_pytorch_onnx_shape_inference.py b/test/onnx/test_pytorch_onnx_shape_inference.py
index 86258fb1d0ec1..cf9ef2fd893e7 100644
--- a/test/onnx/test_pytorch_onnx_shape_inference.py
+++ b/test/onnx/test_pytorch_onnx_shape_inference.py
@@ -1,6 +1,7 @@
 # Owner(s): ["module: onnx"]
 
 import numpy as np
+import pytorch_test_common
 
 import torch
 from pytorch_test_common import skipIfUnsupportedMinOpsetVersion
@@ -19,7 +20,7 @@ def verify(actual_type):
     return verify
 
 
-class TestONNXShapeInference(common_utils.TestCase):
+class TestONNXShapeInference(pytorch_test_common.ExportTestCase):
     def setUp(self):
         self.opset_version = _constants.ONNX_MAX_OPSET
         symbolic_helper._set_onnx_shape_inference(True)
diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py
index 26467d54c1c6c..51adaef317af2 100644
--- a/test/onnx/test_utility_funs.py
+++ b/test/onnx/test_utility_funs.py
@@ -8,6 +8,7 @@
 
 import onnx
 import parameterized
+import pytorch_test_common
 
 import torch
 import torch.onnx
@@ -27,13 +28,7 @@
 from verify import verify
 
 
-class _BaseTestCase(common_utils.TestCase):
-    def setUp(self):
-        super().setUp()
-        torch.manual_seed(0)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed_all(0)
-
+class _BaseTestCase(pytorch_test_common.ExportTestCase):
     def _model_to_graph(
         self,
         model,
@@ -64,7 +59,7 @@ def _model_to_graph(
 
 
 @common_utils.instantiate_parametrized_tests
-class TestUnconvertibleOps(common_utils.TestCase):
+class TestUnconvertibleOps(pytorch_test_common.ExportTestCase):
     """Unit tests for the `unconvertible_ops` function."""
 
     def setUp(self):

From 423ceed635349923a9007fb8e24784c3e6644f61 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 10 Nov 2022 23:37:29 +0000
Subject: [PATCH 0767/1922] [Inductor] Fix lowmem_dropout() missing 1 required
 positional argument: 'p' (#88716)

Fixes error from 7k github models: https://github.com/jansel/pytorch-jit-paritybench/blob/master/generated/test_GuYuc_WS_DAN_PyTorch.py

Error:
```
TypeError: lowmem_dropout() missing 1 required positional argument: 'p'

While executing %lowmem_dropout : [#users=1] = call_function[target=torch._inductor.overrides.lowmem_dropout](args = (%avg_pool2d_9,), kwargs = {training: False})
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88716
Approved by: https://github.com/ngimel, https://github.com/jansel, https://github.com/desertfire
---
 test/inductor/test_torchinductor.py | 21 ++++++++++++++++-----
 torch/_inductor/overrides.py        |  2 +-
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 229f0fa83dd4e..8fd4fa29bf98a 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -3666,13 +3666,24 @@ def test_dropout(self):
         torch.manual_seed(1234)
 
         @torch._dynamo.optimize("inductor")
-        def fn(a):
-            return torch.nn.functional.dropout(a, 0.5, True)
+        def fn1(a):
+            return torch.nn.functional.dropout(a)
 
         x = torch.ones(1000, device=self.device, dtype=torch.float32)
-        result = fn(x)
-        self.assertTrue(400 < result.nonzero().shape[0] < 600)
-        self.assertTrue(0.9 < result.mean().item() < 1.1)
+        result1 = fn1(x)
+        self.assertTrue(400 < result1.nonzero().shape[0] < 600)
+        self.assertTrue(0.9 < result1.mean().item() < 1.1)
+
+        random.seed(1234)
+        torch.manual_seed(1234)
+
+        @torch._dynamo.optimize("inductor")
+        def fn2(a):
+            return torch.nn.functional.dropout(a, 0.5, True)
+
+        result2 = fn2(x)
+        self.assertTrue(400 < result2.nonzero().shape[0] < 600)
+        self.assertTrue(0.9 < result2.mean().item() < 1.1)
 
     def test_dropout_deterministic(self):
         @torch._dynamo.optimize("inductor")
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 581e1996a436c..d89ee82674dde 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -562,7 +562,7 @@ def backward(ctx, grad_output):
 
 
 @torch.fx.wrap
-def lowmem_dropout(input, p, training=True, inplace=False):
+def lowmem_dropout(input, p=0.5, training=True, inplace=False):
     if isinstance(input, torch.fx.Proxy):
         # double check we don't FX trace this
         return input.tracer.create_proxy(

From 0fef760e3001634b342cda59d8fef1c88f3f5b50 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Thu, 10 Nov 2022 20:35:52 +0000
Subject: [PATCH 0768/1922] Symbolic shape: sym_floor , sym_sqrt, sym_int
 (#88760)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88760
Approved by: https://github.com/ezyang
---
 test/test_dynamic_shapes.py              | 43 ++++++++++++++++++++--
 torch/__init__.py                        | 33 +++++++++++++++--
 torch/fx/experimental/symbolic_shapes.py | 45 +++++++++++++++---------
 3 files changed, 99 insertions(+), 22 deletions(-)

diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index b23af9bbfb67c..0f1f49d2e6ea5 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -20,7 +20,7 @@
 from torch.utils._pytree import tree_map
 from torch.fx.experimental import symbolic_shapes
 from torch.fx.experimental.proxy_tensor import make_fx
-from torch.fx.experimental.symbolic_shapes import ShapeEnv, sym_float, guard_int, SymNode
+from torch.fx.experimental.symbolic_shapes import ShapeEnv, sym_float, guard_int, SymNode, sym_sqrt, sym_int
 from torch.utils._python_dispatch import TorchDispatchMode
 from torch import SymInt
 
@@ -335,6 +335,45 @@ def test_guard_int(self):
         self.assertEqual(guard_int(a0), 2)
         self.assertEqual(str(shape_env.guards[0][0]), "Eq(s0, 2)")
 
+    @skipIfNoSympy
+    def test_sym_int(self):
+        shape_env = ShapeEnv()
+        a0 = create_symint(shape_env, 5)
+        r = sym_int(a0)
+        self.assertEqual(r, 5)
+        self.assertIsInstance(r, torch.SymInt, msg=type(r))
+        self.assertEqual(str(shape_env.guards[0][0]), "Eq(s0, 5)")
+
+        a1 = create_symint(shape_env, 7)
+        r = sym_int(a1 / 2)
+        self.assertEqual(guard_int(r), 3)
+        self.assertIsInstance(r, torch.SymInt, msg=type(r))
+        self.assertEqual(str(shape_env.guards[1][0]), "Eq(floor(s1/2), 3)")
+
+        a2 = create_symint(shape_env, -3)
+        r = sym_int(a2 / 2)
+        self.assertEqual(guard_int(r), -1)
+        self.assertIsInstance(r, torch.SymInt, msg=type(r))
+        self.assertEqual(str(shape_env.guards[2][0]), "Eq(ceiling(-s2/2), -1)")
+
+    @skipIfNoSympy
+    def test_sym_sqrt(self):
+        shape_env = ShapeEnv()
+        a0 = create_symint(shape_env, 4)
+        r = sym_sqrt(a0)
+        self.assertEqual(r, 2)
+        self.assertIsInstance(r, torch.SymFloat, msg=type(r))
+        self.assertEqual(str(shape_env.guards[0][0]), "Eq(sqrt(s0), 2)")
+
+    @skipIfNoSympy
+    def test_sym_floor(self):
+        shape_env = ShapeEnv()
+        a0 = create_symint(shape_env, 5)
+        r = math.floor(a0 / 2)
+        self.assertEqual(r, 2)
+        self.assertIsInstance(r, torch.SymInt, msg=type(r))
+        self.assertEqual(str(shape_env.guards[0][0]), "Eq(floor(s0/2), 2)")
+
     @skipIfNoSympy
     def test_int_conversion(self):
         shape_env = ShapeEnv()
@@ -526,7 +565,7 @@ def guard_fn(v):
     @parametrize("first_type", ["int", "float"])
     @parametrize("second_type", ["int", "float"])
     def test_method(self, fn, first_type, second_type):
-        if first_type == "float" and fn in symbolic_shapes.magic_methods_not_on_float:
+        if first_type == "float":
             self.skipTest(f"{fn} is not a float magic method")
 
         is_unary_fn = fn in symbolic_shapes.unary_magic_methods
diff --git a/torch/__init__.py b/torch/__init__.py
index 2abf4ba4b07de..ee271c0a975ac 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -218,8 +218,23 @@ def __int__(self):
 
     # Magic methods installed by torch.fx.experimental.symbolic_shapes
 
+    def __eq__(self, other: object) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __lt__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __gt__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __le__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __ge__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
     def __sym_float__(self):
-        ...
+        raise AssertionError("type stub not overridden")
 
     def __repr__(self):
         return self.node.str()
@@ -247,8 +262,20 @@ def __bool__(self):
 
     # Magic methods installed by torch.fx.experimental.symbolic_shapes
 
-    def __sym_int__(self):
-        ...
+    def __eq__(self, other: object) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __lt__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __gt__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __le__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
+
+    def __ge__(self, other) -> builtins.bool:
+        raise AssertionError("type stub not overridden")
 
     def __repr__(self):
         return self.node.str()
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 82e1d5107d790..d9b0a8fc2019e 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -1,6 +1,6 @@
 import torch
 import torch.utils._pytree as pytree
-from typing import Set, Dict, List, Type, Optional, cast
+from typing import Set, Dict, List, Type, Optional, cast, Union
 import sys
 import operator
 import builtins
@@ -24,7 +24,8 @@
 
 __all__ = [
     "has_symbolic_sizes_strides", "create_contiguous", "ShapeEnv",
-    "SymDispatchMode", "sym_float", "FloorDiv", "guard_int", "wrap_node"
+    "SymDispatchMode", "sym_int", "sym_float", "FloorDiv", "guard_int", "wrap_node",
+    "sym_sqrt",
 ]
 
 SYM_FUNCTION_MODE = None
@@ -103,11 +104,26 @@ def sym_float(a):
         return a.__sym_float__()
     return float(a)
 
+# Drop in replacement for math.sqrt
+def sym_sqrt(a):
+    if hasattr(a, '__sym_sqrt__'):
+        return a.__sym_sqrt__()
+    return math.sqrt(a)
+
+# Drop in replacement for math.floor/ceil.  Actually, math.floor/ceil
+# directly usable, but this has a more relaxed type signature for mypy
+# (mypy requires SupportFloat which is too strict)
+def sym_floor(a):
+    return math.floor(a)  # type: ignore[type]
+
+def sym_ceil(a):
+    return math.ceil(a)  # type: ignore[type]
+
 def sym_int(a):
     if isinstance(a, SymInt):
         return a
-    elif hasattr(a, '__sym_int__'):
-        return a.__sym_int__()
+    elif isinstance(a, SymFloat):
+        return sym_floor(a) if a > 0 else sym_ceil(a)
     return int(a)
 
 # TODO: An incomplete list
@@ -255,29 +271,28 @@ def _nyi():
     'lt': lambda a, b: sympy.Lt(a, b),
     'le': lambda a, b: sympy.Le(a, b),
     'ge': lambda a, b: sympy.Ge(a, b),
-    'sym_float': lambda a: a,  # TODO: why can't I wrap with sympy.Float?
-    'sym_int': lambda a: _nyi(),
+    'floor': lambda a: sympy.floor(a),
+    'sym_float': lambda a: a,  # Cannot use sympy.Float(a) here, coz it expects python literals
     'ceil': lambda a: sympy.ceiling(a),
     'neg': lambda a: -a,
     'min': lambda a, b: sympy.Min(a, b),
     'max': lambda a, b: sympy.Max(a, b),
+    'sym_sqrt': lambda a: sympy.sqrt(a),
 }
 
 unary_magic_methods = {
     'sym_float',
-    'sym_int',
     'ceil',
+    'floor',
     'neg',
+    'sym_sqrt',
 }
 
-# TODO: sym_int should also work on floats
-magic_methods_not_on_float = {"sym_int"}
-
 magic_methods_on_builtins = {"min", "max"}
 magic_methods_on_math = {"ceil", "floor"}
-magic_methods_on_submodule = {"sym_float", "sym_int"}
+magic_methods_on_submodule = {"sym_float", "sym_sqrt"}
 
-always_float_magic_methods = {"truediv", "sym_float"}
+always_float_magic_methods = {"truediv", "sym_float", "sym_sqrt"}
 always_int_magic_methods = {"ceil", "floor"}
 always_bool_magic_methods = {"eq", "gt", "lt", "le", "ge"}
 
@@ -383,10 +398,6 @@ def rbinary_magic_impl(self, other):
 
 for method, func in magic_methods.items():
     _make_user_magic(method, SymInt)
-
-for method, func in magic_methods.items():
-    if method in magic_methods_not_on_float:
-        continue
     _make_user_magic(method, SymFloat)
 
 del method
@@ -479,7 +490,7 @@ def create_symbolic_sizes_strides(self, ex: torch.Tensor):
         assert all(x is not None for x in stride)
         return [self.create_symintnode(i) for i in size], [self.create_symintnode(i) for i in stride]  # type: ignore[arg-type]
 
-    def create_symintnode(self, expr: "sympy.Expr"):
+    def create_symintnode(self, expr: Union["sympy.Expr", int]):
         return SymInt(SymNode(expr, self, int))
 
     def create_symbol(self, val: int) -> "sympy.Expr":

From 066d3c8cc3161930956a70e34b57c8c724a6e148 Mon Sep 17 00:00:00 2001
From: Dmytro Dzhulgakov <dima.v.dzhulgakov@gmail.com>
Date: Thu, 10 Nov 2022 23:44:49 +0000
Subject: [PATCH 0769/1922] Fix cupti search path in CMake (#88657)

Minor fix for when cuda is installed via conda. In this case the libraries are in `lib` and not `lib64`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88657
Approved by: https://github.com/kit1980, https://github.com/malfet
---
 cmake/Dependencies.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index cf3c2c2caafd2..104056ee07240 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1952,6 +1952,7 @@ if(USE_KINETO)
     find_library(CUPTI_LIBRARY_PATH ${CUPTI_LIB_NAME} PATHS
         ${CUDA_SOURCE_DIR}
         ${CUDA_SOURCE_DIR}/extras/CUPTI/lib64
+        ${CUDA_SOURCE_DIR}/lib
         ${CUDA_SOURCE_DIR}/lib64
         NO_DEFAULT_PATH)
 

From fea4c58c24805e98a3ced30d83cb07ab93991fcb Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 10 Nov 2022 23:47:21 +0000
Subject: [PATCH 0770/1922] [Dynamo] Add complete support for
 Tensor.is_contiguous (#88407)

Fixes https://github.com/pytorch/torchdynamo/issues/1783

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88407
Approved by: https://github.com/jansel
---
 test/dynamo/test_misc.py          | 12 +++++++
 torch/_dynamo/variables/tensor.py | 55 ++++++++++++++++++++++++++-----
 2 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 45433b6795cc1..4df7153b8fb2b 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -1409,6 +1409,18 @@ def fn(x):
             res = opt_fn(x)
             self.assertTrue(same(ref, res))
 
+    def test_tensor_is_contiguous(self):
+        def fn(x):
+            input = torch.randn((1, 16, 1, 1))
+            weight = torch.randn((8, 16, 3, 3))
+            weight = weight.to(memory_format=x)
+            output = torch.conv2d(input, weight, None, (2, 1), (1, 1), (1, 1), 1)
+            return output.is_contiguous(memory_format=x)
+
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        for x in [torch.contiguous_format, torch.channels_last]:
+            self.assertEqual(fn(x), opt_fn(x))
+
     def test_python_slice(self):
         def f1(input):
             y = 0
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 5a30f838e3f35..0974f24ee9694 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -409,7 +409,13 @@ def specialize(value: torch.Tensor):
         if not config.dynamic_shapes:
             props["size"] = tuple(value.size())
             props["stride"] = tuple(value.stride())
-            props["is_contiguous"] = value.is_contiguous()
+            props["is_contiguous"] = tuple(
+                [
+                    x
+                    for x in torch._prims_common._memory_formats
+                    if value.is_contiguous(memory_format=x)
+                ]
+            )
         return props
 
     def var_getattr(self, tx, name):
@@ -492,13 +498,13 @@ def call_method(
         elif name == "is_floating_point" and self.dtype is not None:
             constant_result = ConstantVariable(self.dtype.is_floating_point, **options)
         elif name == "is_contiguous" and self.is_contiguous is not None:
-            if (
-                "memory_format" in kwargs
-                and kwargs["memory_format"].as_python_constant()
-                == torch.contiguous_format
-            ):
-                kwargs.pop("memory_format")
-            constant_result = ConstantVariable(self.is_contiguous, **options)
+            if "memory_format" in kwargs:
+                memory_format = kwargs.pop("memory_format").as_python_constant()
+            else:
+                memory_format = torch.contiguous_format
+            constant_result = ConstantVariable(
+                memory_format in self.is_contiguous, **options
+            )
         else:
             constant_result = None
 
@@ -555,6 +561,39 @@ def call_method(
                 current_tx=tx,
             )
             return ConstantVariable(None, **options)
+        elif name in ("resize_", "resize_as_"):
+            if "memory_format" in kwargs:
+                memory_format = kwargs["memory_format"].as_python_constant()
+            else:
+                memory_format = torch.contiguous_format
+
+            if name == "resize_":
+                self.size = args[0].as_python_constant()
+                self.is_contiguous = (memory_format,)
+            else:
+                assert isinstance(args[0], TensorVariable)
+                if self.size and args[0].size:
+                    if (
+                        self.size == args[0].size
+                        or memory_format is torch.preserve_format
+                    ):
+                        self.is_contiguous = args[0].is_contiguous
+                    else:
+                        self.size = args[0].size
+                        self.stride = args[0].stride
+                        self.ndim = args[0].ndim
+                        self.is_contiguous = (memory_format,)
+
+            return self.__class__.create(
+                tx,
+                tx.output.create_proxy(
+                    "call_method",
+                    name,
+                    *proxy_args_kwargs([self] + args, kwargs),
+                    current_tx=tx,
+                ),
+                **options,
+            )
         else:
             # Convert x.new(torch.Size) into x.new_empty(torch.Size),
             # as Tensor.new acts differently with a Size input versus a tuple input.

From f1cf494e4cb790482f9c40972a55539cb69a2b03 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Thu, 10 Nov 2022 23:52:27 +0000
Subject: [PATCH 0771/1922] [MPS] Fix `test_embedding_dense_backward` (#88847)

By copying randomly initialized weights distribution from MPS `nn.Embedding` to `cpu`

Test plan: `python test_mps.py -k test_embedding_dense_backward --repeat 150`

Fixes https://github.com/pytorch/pytorch/issues/88679

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88847
Approved by: https://github.com/seemethere
---
 test/test_mps.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 2f07652f8b5bd..266c6a5c2c220 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4261,8 +4261,9 @@ def helper(shape, dim, index, idx_dtype=torch.int32):
     def test_embedding_dense_backward(self):
         def helper(n, d, m, idx):
             embeddingMPS = nn.Embedding(n, d, max_norm=True, device='mps')
+            emedding_weight = embeddingMPS.weight.detach().cpu()
             W_MPS = torch.randn((m, d), requires_grad=True, device='mps')
-            idx_MPS = torch.tensor(idx).to('mps')
+            idx_MPS = torch.tensor(idx, device='mps')
             a_MPS = embeddingMPS.weight.clone() @ W_MPS.t()  # weight must be cloned for this to be differentiable
             a_MPS.retain_grad()
             b_MPS = embeddingMPS(idx_MPS) @ W_MPS.t()  # modifies weight in-place
@@ -4271,7 +4272,7 @@ def helper(n, d, m, idx):
             loss_MPS = out_MPS.sigmoid().prod()
             loss_MPS.backward()
 
-            embeddingCPU = nn.Embedding(n, d, max_norm=True, scale_grad_by_freq=True)
+            embeddingCPU = nn.Embedding(n, d, max_norm=True, _weight=emedding_weight)
             W_CPU = W_MPS.to('cpu')
             idx_CPU = torch.tensor(idx)
             a_CPU = embeddingCPU.weight.clone() @ W_CPU.t()  # weight must be cloned for this to be differentiable

From afc66600c46dde9863533f39b12cb42587a34635 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 10 Nov 2022 23:56:49 +0000
Subject: [PATCH 0772/1922] Revert "Add nondeterministic error for `scatter`
 (#88244)"

This reverts commit e940a2f8e2a3aa9d98291e73b3d40fcffb6182c8.

Reverted https://github.com/pytorch/pytorch/pull/88244 on behalf of https://github.com/mehtanirav due to Internal test failures
---
 .../ATen/native/TensorAdvancedIndexing.cpp    |  4 --
 test/test_torch.py                            | 40 -------------------
 torch/__init__.py                             |  1 -
 3 files changed, 45 deletions(-)

diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
index fa78b60c66848..3004dc1b31c79 100644
--- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp
+++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@@ -1512,10 +1512,6 @@ TORCH_IMPL_FUNC(scatter_src_out)
  const Tensor& index,
  const Tensor& src,
  const Tensor& out) {
-  // See note [Writing Nondeterministic Operations]
-  // Nondeterministic when index contains duplicate entries, src is a tensor,
-  // and reduce=None
-  at::globalContext().alertNotDeterministic("scatter with src tensor and reduce=None");
   scatter_impl(self, dim, index, src, out,
                scatter_reduce_stub,
                scatter_stub);
diff --git a/test/test_torch.py b/test/test_torch.py
index 82d0807d81a72..3ebc92676fe02 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -1478,46 +1478,6 @@ def test_nondeterministic_alert_put_accumulate(self, device):
                 'put_',
                 torch.device(device).type == 'cuda')
 
-    @expectedFailureMeta  # expected a non-determinitic error, but it was not raised
-    @onlyNativeDeviceTypes
-    def test_nondeterministic_alert_scatter(self, device):
-        a = torch.randn(10, device=device)
-        indices = torch.tensor([0, 0], device=device)
-        values = torch.tensor([0., 1.], device=device)
-        result = torch.empty_like(a)
-
-        error_msg = 'scatter with src tensor and reduce=None'
-
-        error_cases = [
-            lambda: torch.Tensor.scatter(a, 0, indices, values),
-            lambda: torch.Tensor.scatter_(a, 0, indices, values),
-            lambda: torch.scatter(a, 0, indices, values),
-            lambda: torch.scatter(a, 0, indices, values, out=result),
-        ]
-
-        no_error_cases = [
-            lambda: torch.Tensor.scatter(a, 0, indices, 0),
-            lambda: torch.Tensor.scatter_(a, 0, indices, 0),
-            lambda: torch.scatter(a, 0, indices, 0),
-            lambda: torch.scatter(a, 0, indices, 0, out=result),
-
-            lambda: torch.Tensor.scatter(a, 0, indices, values, reduce='add'),
-            lambda: torch.Tensor.scatter_(a, 0, indices, values, reduce='add'),
-            lambda: torch.scatter(a, 0, indices, values, reduce='add'),
-            lambda: torch.scatter(a, 0, indices, values, out=result, reduce='add'),
-        ]
-
-        for error_case in error_cases:
-            self.check_nondeterministic_alert(
-                error_case,
-                error_msg)
-
-        for no_error_case in no_error_cases:
-            self.check_nondeterministic_alert(
-                no_error_case,
-                error_msg,
-                False)
-
     @skipIfMps
     def test_nondeterministic_alert_histc(self, device):
         a = torch.tensor([], device=device)
diff --git a/torch/__init__.py b/torch/__init__.py
index ee271c0a975ac..6049967b6f18e 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -527,7 +527,6 @@ def use_deterministic_algorithms(mode, *, warn_only=False):
           ``mode='max'``
         * :func:`torch.Tensor.put_` when ``accumulate=False``
         * :func:`torch.Tensor.put_` when ``accumulate=True`` and called on a CUDA tensor
-        * :func:`torch.Tensor.scatter` when ``src`` is a tensor and ``reduce=None``
         * :func:`torch.histc` when called on a CUDA tensor
         * :func:`torch.bincount` when called on a CUDA tensor
         * :func:`torch.kthvalue` with called on a CUDA tensor

From 353c400a658f49aee925668af0a59756666fb627 Mon Sep 17 00:00:00 2001
From: Ian Graves <iangraves@meta.com>
Date: Fri, 11 Nov 2022 00:19:20 +0000
Subject: [PATCH 0773/1922] [pytorch] Enable memory map file support for
 Android, Apple, and CXX (#88545)

Summary: See title.  Left Windows out so it still compiles.

Test Plan:
Add a `#fail` below [this line](https://fburl.com/code/p0mlhlw4) and build for various platforms and confirm it fails which proves the `#ifdef` was hit.

```
buck2 build xplat/langtech/tuna/cli:tuclixAndroid
buck2 build xplat/langtech/tuna/cli:tuclix
```

CI/CD for the rest.

Differential Revision: D41054824

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88545
Approved by: https://github.com/qihqi
---
 c2_defs.bzl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/c2_defs.bzl b/c2_defs.bzl
index 573ba9f6ad64c..0a89bb88093db 100644
--- a/c2_defs.bzl
+++ b/c2_defs.bzl
@@ -166,6 +166,7 @@ def get_c2_fbandroid_xplat_compiler_flags():
         # T95767731 -- remove this once all builds are on at least llvm-13
         "-Wno-unknown-warning-option",
         "-Wno-unused-but-set-variable",
+        "-DHAVE_MMAP",
     ]
 
     if get_c2_strip_glog():
@@ -392,6 +393,7 @@ def c2_cxx_library(**kwargs):
     args = get_c2_default_cxx_args()
     args.update(kwargs)
     args.setdefault("platforms", (ANDROID, APPLE, CXX, WINDOWS))
+
     fb_xplat_cxx_library(
         labels = [
             "supermodule:android/default/caffe2",

From 4e4786c7d6a677d4359ee1a651aca87514d66900 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 8 Nov 2022 07:59:12 -0800
Subject: [PATCH 0774/1922] [ao] qconfig_mapping.py fixing public v private
 (#87518)

Summary: made _GLOBAL_DICT_KEY, _OBJECT_TYPE_DICT_KEY,
_MODULE_NAME_REGEX_DICT_KEY, _MODULE_NAME_DICT_KEY,
_MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY private

Test Plan: python test/test_public_bindings.py

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D40709278](https://our.internmc.facebook.com/intern/diff/D40709278)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87518
Approved by: https://github.com/jcaip
---
 test/quantization/fx/test_quantize_fx.py      | 20 ++++++------
 .../quantization/fx/qconfig_mapping_utils.py  |  8 ++---
 torch/ao/quantization/qconfig_mapping.py      | 32 +++++++++----------
 3 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 8c75658a04e1b..6eb9246c85a7c 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -90,11 +90,11 @@
 
 from torch.ao.quantization.qconfig_mapping import (
     _get_symmetric_qnnpack_qconfig_mapping,
-    GLOBAL_DICT_KEY,
-    MODULE_NAME_DICT_KEY,
-    MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY,
-    MODULE_NAME_REGEX_DICT_KEY,
-    OBJECT_TYPE_DICT_KEY,
+    _GLOBAL_DICT_KEY,
+    _MODULE_NAME_DICT_KEY,
+    _MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY,
+    _MODULE_NAME_REGEX_DICT_KEY,
+    _OBJECT_TYPE_DICT_KEY,
     QConfigMapping,
 )
 
@@ -1972,20 +1972,20 @@ def _get_qconfig_dict_for_qconfig_mapping_test(self, global_qconfig, qconfig1, q
         Return a dummy qconfig_dict to test QConfigMapping's to_dict and from_dict methods.
         """
         return {
-            GLOBAL_DICT_KEY: global_qconfig,
-            OBJECT_TYPE_DICT_KEY: [
+            _GLOBAL_DICT_KEY: global_qconfig,
+            _OBJECT_TYPE_DICT_KEY: [
                 (torch.nn.Linear, qconfig1),
                 (torch.nn.ReLU, qconfig2),
             ],
-            MODULE_NAME_REGEX_DICT_KEY: [
+            _MODULE_NAME_REGEX_DICT_KEY: [
                 ("foo.*bar", qconfig1),
                 ("foo.*", qconfig2),
             ],
-            MODULE_NAME_DICT_KEY: [
+            _MODULE_NAME_DICT_KEY: [
                 ("bazbaz", qconfig1),
                 ("borbor", qconfig2),
             ],
-            MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY: [
+            _MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY: [
                 ("bazbaz", torch.nn.Linear, 0, qconfig1),
                 ("foofoo", torch.nn.ReLU, 1, qconfig2),
             ],
diff --git a/torch/ao/quantization/fx/qconfig_mapping_utils.py b/torch/ao/quantization/fx/qconfig_mapping_utils.py
index 66dffd50cd008..0b0407c0b106e 100644
--- a/torch/ao/quantization/fx/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -23,9 +23,9 @@
     get_qconfig_dtypes,
 )
 from ..qconfig_mapping import (
-    OBJECT_TYPE_DICT_KEY,
-    MODULE_NAME_DICT_KEY,
-    MODULE_NAME_REGEX_DICT_KEY,
+    _OBJECT_TYPE_DICT_KEY,
+    _MODULE_NAME_DICT_KEY,
+    _MODULE_NAME_REGEX_DICT_KEY,
     QConfigMapping,
 )
 from ..qconfig_mapping_utils import (
@@ -223,7 +223,7 @@ def compare_prepare_convert_qconfig_mappings(
         convert_qconfig_mapping.module_name_qconfigs,
         convert_qconfig_mapping.module_name_regex_qconfigs,
     ]
-    dict_names = [OBJECT_TYPE_DICT_KEY, MODULE_NAME_DICT_KEY, MODULE_NAME_REGEX_DICT_KEY]
+    dict_names = [_OBJECT_TYPE_DICT_KEY, _MODULE_NAME_DICT_KEY, _MODULE_NAME_REGEX_DICT_KEY]
     for i in range(len(prepare_dicts)):
         for name, qconfig in prepare_dicts[i].items():
             assert name in convert_dicts[i], "Missing key {} {} in convert QConfigMapping \
diff --git a/torch/ao/quantization/qconfig_mapping.py b/torch/ao/quantization/qconfig_mapping.py
index 418cbb334814c..e3410a52a9d83 100644
--- a/torch/ao/quantization/qconfig_mapping.py
+++ b/torch/ao/quantization/qconfig_mapping.py
@@ -33,11 +33,11 @@
 
 
 # TODO: replace all usages with these constants
-GLOBAL_DICT_KEY = ""
-OBJECT_TYPE_DICT_KEY = "object_type"
-MODULE_NAME_REGEX_DICT_KEY = "module_name_regex"
-MODULE_NAME_DICT_KEY = "module_name"
-MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY = "module_name_object_type_order"
+_GLOBAL_DICT_KEY = ""
+_OBJECT_TYPE_DICT_KEY = "object_type"
+_MODULE_NAME_REGEX_DICT_KEY = "module_name_regex"
+_MODULE_NAME_DICT_KEY = "module_name"
+_MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY = "module_name_object_type_order"
 
 _FIXED_QPARAMS_OP_TO_OBSERVER: Dict[Union[Callable, str], _PartialWrapper] = {
     torch.nn.Hardsigmoid: default_fixed_qparams_range_0to1_observer,
@@ -274,11 +274,11 @@ def to_dict(self) -> Dict[str, Any]:
         The values of this dictionary are lists of tuples.
         """
         return {
-            GLOBAL_DICT_KEY: self.global_qconfig,
-            OBJECT_TYPE_DICT_KEY: list(self.object_type_qconfigs.items()),
-            MODULE_NAME_REGEX_DICT_KEY: list(self.module_name_regex_qconfigs.items()),
-            MODULE_NAME_DICT_KEY: list(self.module_name_qconfigs.items()),
-            MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY: [
+            _GLOBAL_DICT_KEY: self.global_qconfig,
+            _OBJECT_TYPE_DICT_KEY: list(self.object_type_qconfigs.items()),
+            _MODULE_NAME_REGEX_DICT_KEY: list(self.module_name_regex_qconfigs.items()),
+            _MODULE_NAME_DICT_KEY: list(self.module_name_qconfigs.items()),
+            _MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY: [
                 (*k, v) for k, v in self.module_name_object_type_order_qconfigs.items()
             ],
         }
@@ -302,14 +302,14 @@ def from_dict(cls, qconfig_dict: Dict[str, Any]) -> QConfigMapping:
         The values of this dictionary are expected to be lists of tuples.
         """
         conf = cls()
-        if GLOBAL_DICT_KEY in qconfig_dict:
-            conf.set_global(qconfig_dict[GLOBAL_DICT_KEY])
-        for object_type, qconfig in qconfig_dict.get(OBJECT_TYPE_DICT_KEY, []):
+        if _GLOBAL_DICT_KEY in qconfig_dict:
+            conf.set_global(qconfig_dict[_GLOBAL_DICT_KEY])
+        for object_type, qconfig in qconfig_dict.get(_OBJECT_TYPE_DICT_KEY, []):
             conf.set_object_type(object_type, qconfig)
-        for module_name_regex, qconfig in qconfig_dict.get(MODULE_NAME_REGEX_DICT_KEY, []):
+        for module_name_regex, qconfig in qconfig_dict.get(_MODULE_NAME_REGEX_DICT_KEY, []):
             conf.set_module_name_regex(module_name_regex, qconfig)
-        for module_name, qconfig in qconfig_dict.get(MODULE_NAME_DICT_KEY, []):
+        for module_name, qconfig in qconfig_dict.get(_MODULE_NAME_DICT_KEY, []):
             conf.set_module_name(module_name, qconfig)
-        for module_name, object_type, index, qconfig in qconfig_dict.get(MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY, []):
+        for module_name, object_type, index, qconfig in qconfig_dict.get(_MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY, []):
             conf.set_module_name_object_type_order(module_name, object_type, index, qconfig)
         return conf

From f230699623eaa1af277bd97cc55ec2348e658483 Mon Sep 17 00:00:00 2001
From: Ryan Spring <rdspring1@gmail.com>
Date: Fri, 11 Nov 2022 01:08:16 +0000
Subject: [PATCH 0775/1922] [primTorch] Implement group norm reference (#87054)

Add group norm reference
Split from #81191
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87054
Approved by: https://github.com/mruberry
---
 test/test_fx.py                               |  4 +-
 test/test_ops.py                              |  5 +-
 torch/_decomp/decompositions.py               | 31 ------
 torch/_refs/__init__.py                       | 62 ++++++++++++
 torch/_refs/nn/functional/__init__.py         | 40 ++++++++
 torch/nn/functional.py                        |  2 +
 .../_internal/common_methods_invocations.py   | 97 ++++++++++++++++---
 7 files changed, 191 insertions(+), 50 deletions(-)

diff --git a/test/test_fx.py b/test/test_fx.py
index 0aa5b28a3de7d..0aff631b8e814 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -3925,7 +3925,6 @@ def tearDown(self):
         "max_pool2d": PROXY_ITERABLE,
         "max_pool3d": PROXY_ITERABLE,
 
-        "group_norm": PROXY_ITERATED,
         "lp_pool2d": PROXY_ITERATED,
         "max_unpool1d": PROXY_ITERATED,
         "max_unpool2d": PROXY_ITERATED,
@@ -3959,6 +3958,7 @@ def tearDown(self):
         "gaussian_nll_loss": CONTROL_FLOW,
         "glu": CONTROL_FLOW,
         "grid_sample": CONTROL_FLOW,
+        "group_norm": CONTROL_FLOW,
         "gumbel_softmax": CONTROL_FLOW,
         "hardsigmoid": CONTROL_FLOW,
         "hardswish": CONTROL_FLOW,
@@ -4029,7 +4029,7 @@ def tearDown(self):
         "max_pool2d": PROXY_ITERATED,
         "max_pool3d": PROXY_ITERATED,
 
-        "group_norm": LEN_ERROR
+        "group_norm": CONTROL_FLOW
     }
 
     @classmethod
diff --git a/test/test_ops.py b/test/test_ops.py
index d0aa0906784dc..73758bfc6b466 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -417,9 +417,10 @@ def test_python_ref_executor(self, device, dtype, op, executor):
 
         # skip zero-dim tensors for some composites of reduction operations and view
         skip_zero_dim_ops = [
-            "_refs.softmax",
             "_refs.logsumexp",
             "_refs.log_softmax",
+            "_refs.native_group_norm",
+            "_refs.softmax",
             "_refs.sum_to_size",
             "ops.nvprims.view",
         ]
@@ -1659,11 +1660,13 @@ class TestRefsOpsInfo(TestCase):
         '_refs.index_add_',
         '_refs.index_copy_',
         '_refs.index_fill_',
+        '_refs.native_group_norm',
     }
 
     not_in_decomp_table = {
         # duplicated in _decomp and _refs
         '_refs.nn.functional.elu',
+        '_refs.nn.functional.group_norm',
         '_refs.nn.functional.mse_loss',
         '_refs.rsub',
         # duplicated due to efficiency concerns of the ref vs the decomp
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 0e1d1cd1dd511..fe63e0db007a7 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1138,37 +1138,6 @@ def normalize(input, norm_dims, eps):
     return out, mean, rstd
 
 
-@register_decomposition(aten.native_group_norm.default)
-def native_group_norm(
-    input: Tensor,
-    weight: Optional[Tensor],
-    bias: Optional[Tensor],
-    N: int,
-    C: int,
-    HxW: int,
-    group: int,
-    eps: float,
-) -> Tuple[Tensor, Tensor, Tensor]:
-    orig_shape = input.shape
-    input = input.view(N, group, C // group, HxW)
-    reduction_dims = [2, 3]
-    out, mean, rstd = normalize(input, reduction_dims, eps)
-    mean = _squeeze_multiple(mean, reduction_dims)
-    rstd = _squeeze_multiple(rstd, reduction_dims)
-    out = out.view(orig_shape)
-    if weight is not None:
-        weight = _unsqueeze_to_dim(weight, out.dim() - 1)
-        out = out * weight
-    if bias is not None:
-        bias = _unsqueeze_to_dim(bias, out.dim() - 1)
-        out = out + bias
-
-    out = out.to(dtype=input.dtype)
-    mean = mean.to(dtype=input.dtype)
-    rstd = rstd.to(dtype=input.dtype)
-    return (out, mean, rstd)
-
-
 @register_decomposition(aten.native_group_norm_backward)
 @pw_cast_for_opmath
 def native_group_norm_backward(
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index cd0344eba7a91..36fef59df3757 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -238,6 +238,7 @@
     "movedim",
     "narrow",
     "narrow_copy",
+    "native_group_norm",
     "native_layer_norm",
     "permute",
     "ravel",
@@ -2781,6 +2782,7 @@ def _normalize(
         mean (Tensor): mean of the tensor along norm_dims.
         rstd (Tensor): 1/std of the tensor along norm_dims.
     """
+    norm_dims = utils.canonicalize_dims(a.ndim, norm_dims)
     computation_dtype = utils.get_computation_dtype(a.dtype)
     a_acc = _maybe_convert_to_dtype(a, computation_dtype)
     assert isinstance(a_acc, TensorLike)  # to avoid mypy error for var_mean
@@ -2792,6 +2794,66 @@ def _normalize(
     return out, mean, rstd
 
 
+# add all specified dimensions
+def _unsqueeze_multiple(x: TensorLikeType, dimensions: List[int]) -> TensorLikeType:
+    for dim in sorted(dimensions):
+        x = torch.unsqueeze(x, dim)
+    return x
+
+
+@register_decomposition(torch.ops.aten.native_group_norm.default)
+def native_group_norm(
+    input: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    batch_size: int,
+    num_channels: int,
+    flattened_inner_size: int,
+    num_groups: int,
+    eps: float,
+) -> Tuple[Tensor, Tensor, Tensor]:
+    utils.check(
+        input.ndim >= 2,
+        lambda: f"Expected at least 2 dimensions for input tensor but received {input.ndim}",
+    )
+    utils.check(
+        num_channels % num_groups == 0,
+        lambda: "Expected number of channels in input to be divisible by num_groups, "
+        + f"but got input of shape {input.shape} and num_groups = {num_groups}",
+    )
+
+    # num_channels / num_groups and flattened inner dimension are the reduction axes
+    reduction_dims = [2, 3]
+    input_reshaped = torch.reshape(
+        input,
+        [batch_size, num_groups, num_channels // num_groups, flattened_inner_size],
+    )
+    out, mean, rstd = _normalize(input_reshaped, reduction_dims, eps)
+    out = out.view(input.shape)
+
+    broadcast_dims = [0] + list(dim for dim in range(2, input.ndim))
+    unsqueeze_bias = None
+    if bias is not None:
+        unsqueeze_bias = _unsqueeze_multiple(bias, broadcast_dims)
+    unsqueeze_weight = None
+    if weight is not None:
+        unsqueeze_weight = _unsqueeze_multiple(weight, broadcast_dims)
+
+    if unsqueeze_weight is not None:
+        out = out * unsqueeze_weight
+    if unsqueeze_bias is not None:
+        out = out + unsqueeze_bias
+
+    out = _maybe_convert_to_dtype(out, input.dtype)  # type: ignore[assignment]
+    mean = _maybe_convert_to_dtype(mean, input.dtype)  # type: ignore[assignment]
+    rstd = _maybe_convert_to_dtype(rstd, input.dtype)  # type: ignore[assignment]
+
+    # remove broadcast dimensions from mean and rstd
+    mean = prims.squeeze(mean, reduction_dims)
+    rstd = prims.squeeze(rstd, reduction_dims)
+    return (out, mean, rstd)
+
+
 @register_decomposition(torch.ops.aten.native_layer_norm)
 def native_layer_norm(
     input: Tensor,
diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py
index 3cde678449476..dcd86d8952d26 100644
--- a/torch/_refs/nn/functional/__init__.py
+++ b/torch/_refs/nn/functional/__init__.py
@@ -171,6 +171,46 @@ def relu(a: TensorLikeType, inplace: bool = False) -> TensorLikeType:
     return torch.where(torch.le(a, 0), 0, a)
 
 
+def group_norm(
+    input: Tensor,
+    num_groups: int,
+    weight: Optional[Tensor] = None,
+    bias: Optional[Tensor] = None,
+    eps: float = 1e-5,
+) -> Tensor:
+    """
+    Reference implementation of :func:`torch.nn.functional.group_norm`.
+    """
+    utils.check(
+        input.ndim >= 2,
+        lambda: f"Expected at least 2 dimensions for input tensor but received {input.ndim}",
+    )
+
+    batch_size = input.shape[0]
+    num_channels = input.shape[1]
+    utils.check(
+        num_channels % num_groups == 0,
+        lambda: "Expected number of channels in input to be divisible by num_groups, "
+        + f"but got input of shape {input.shape} and num_groups = {num_groups}",
+    )
+
+    # input shape is (N, C, *), so we flatten all inner dimensions except (N, C)
+    flattened_inner_size = 1
+    for dim_length in input.shape[2:]:
+        flattened_inner_size *= dim_length
+
+    return torch.native_group_norm(
+        input,
+        weight,
+        bias,
+        batch_size,
+        num_channels,
+        flattened_inner_size,
+        num_groups,
+        eps,
+    )[0]
+
+
 def layer_norm(
     input: Tensor,
     normalized_shape: ShapeType,
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 79bf6297e5871..961dd83f57b2c 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -2524,6 +2524,8 @@ def group_norm(
     """
     if has_torch_function_variadic(input, weight, bias):
         return handle_torch_function(group_norm, (input, weight, bias,), input, num_groups, weight=weight, bias=bias, eps=eps)
+    if input.dim() < 2:
+        raise RuntimeError(f"Expected at least 2 dimensions for input tensor but received {input.dim()}")
     _verify_batch_size([input.size(0) * input.size(1) // num_groups, num_groups] + list(input.size()[2:]))
     return torch.group_norm(input, num_groups, weight, bias, eps, torch.backends.cudnn.enabled)
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 731dc008ccce7..b702c11618604 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -3334,27 +3334,72 @@ def sample_inputs_conv2d(op_info, device, dtype, requires_grad, jit_fail_sample=
 def sample_inputs_group_norm(opinfo, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
 
-    # Ordered as input shape, num groups, and eps
+    # Ordered as input shape, num groups, and kwargs for eps
     cases: Tuple[Tuple[int], int, float] = (  # type: ignore[assignment]
-        ((1, 6, 3), 2, 0.5),
-        ((2, 6, 3), 2, -0.5),
-        ((1, 2), 1, None),
-        ((0, 2), 1, None),
+        ((1, 6, 3), 2, {'eps' : 0.5}),
+        ((2, 6, 3), 2, {'eps' : -0.5}),
+        ((1, 3), 1, {'eps' : 1e-5}),
+        ((0, 2), 1, {'eps' : 1e-5}),
+        ((S, S, S), 1, {'eps' : 0.5}),
     )
 
-    for input_shape, num_groups, eps in cases:
+    # num_channels is inferred to be input.shape[1] dimension
+    for input_shape, num_groups, kwargs in cases:
         # Shape of weight and bias should be the same as num_channels
-        weight = make_arg(input_shape[1])
-        bias = make_arg(input_shape[1])
-        kwargs = {'weight': weight, 'bias': bias} if eps is None else {'weight': weight, 'bias': bias, 'eps': eps}
-        yield SampleInput(
-            make_arg(input_shape),
-            args=(num_groups,),
-            kwargs=kwargs
-        )
+        channels = input_shape[1] if len(input_shape) > 1 else 0
+        weight_tensor = make_arg(channels)
+        bias_tensor = make_arg(channels)
+
+        # Checking for permutations of weights and biases as `None`
+        weights = [weight_tensor, None]
+        biases = [bias_tensor, None]
+        for weight, bias in itertools.product(weights, biases):
+            kwargs = {
+                'weight': weight,
+                'bias': bias,
+                **kwargs
+            }
+            yield SampleInput(make_arg(input_shape), num_groups, **kwargs)
+
     # Without any optional args
     yield SampleInput(make_arg((1, 2)), args=(1,))
 
+def reference_inputs_group_norm(op_info, device, dtype, requires_grad, **kwargs):
+    yield from sample_inputs_group_norm(
+        op_info, device, dtype, requires_grad, **kwargs)
+
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    # Ordered as input shape, num groups, and kwargs for eps
+    cases: Tuple[Tuple[int], int, float] = (  # type: ignore[assignment]
+        ((20, 6, 10, 10), 3, {'eps' : 1e-5}),
+        # equivalent with InstanceNorm
+        # GroupNorm(C, num_groups=C) == InstanceNorm(num_features=C)
+        ((20, 6, 10, 10), 6, {'eps' : 1e-5}),
+        # equivalent with LayerNorm
+        # GroupNorm(C, num_groups=1, affine=False) == LayerNorm(normalized_shape=[C, H, W], elementwise_affine=False)
+        ((20, 6, 10, 10), 1, {'eps' : 1e-5}),
+    )
+
+    # num_channels is inferred to be input.shape[1] dimension
+    for input_shape, num_groups, kwargs in cases:
+        # Shape of weight and bias should be the same as num_channels
+        channels = input_shape[1] if len(input_shape) > 1 else 0
+        input_tensor = make_arg(input_shape)
+        weight_tensor = make_arg(channels)
+        bias_tensor = make_arg(channels)
+
+        # Checking for permutations of weights and biases as `None`
+        weights = [weight_tensor, None]
+        biases = [bias_tensor, None]
+        for weight, bias in itertools.product(weights, biases):
+            kwargs = {
+                'weight': weight,
+                'bias': bias,
+                **kwargs
+            }
+            yield SampleInput(input_tensor, num_groups, **kwargs)
+
 
 def sample_inputs_instance_norm(opinfo, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -3481,6 +3526,18 @@ def sample_inputs_native_layer_norm(opinfo, device, dtype, requires_grad, **kwar
             args=(normalized_shape, None, None, eps),
         )
 
+def error_inputs_group_norm(opinfo, device, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=torch.float32, requires_grad=False)
+
+    # check that input has minimum number of dimensions
+    err_msg1 = "Expected at least 2 dimensions for input tensor but received"
+    s1 = SampleInput(make_arg((1)), args=(1,))
+    yield ErrorInput(s1, error_regex=err_msg1)
+
+    # check that the channels dimension is compatible with number of groups
+    err_msg2 = "Expected number of channels in input to be divisible by num_groups, but got input of shape"
+    s2 = SampleInput(make_arg((2, 7, 4)), args=(2,))
+    yield ErrorInput(s2, error_regex=err_msg2)
 
 def error_inputs_native_layer_norm(opinfo, device, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=torch.float32, requires_grad=False)
@@ -7747,12 +7804,12 @@ def reference_group_norm(inp: np.ndarray, num_groups: int, weight=None, bias=Non
     if weight is not None:
         # weight is a vector of length equal to the channel
         if len(Y.shape) > 2:
-            weight = np.tile(np.expand_dims(weight, 1), [1] + list(inp.shape[2:]))
+            weight = np.expand_dims(weight, [0] + [idx + 2 for idx in range(inp.ndim - 2)])
         Y = Y * weight
     if bias is not None:
         # bias is a vector of length equal to the channel
         if len(Y.shape) > 2:
-            bias = np.tile(np.expand_dims(bias, 1), [1] + list(inp.shape[2:]))
+            bias = np.expand_dims(bias, [0] + [idx + 2 for idx in range(inp.ndim - 2)])
         Y = Y + bias
     return Y
 
@@ -10921,12 +10978,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_out=False,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
+           error_inputs_func=error_inputs_group_norm,
            decorators=[
                # RuntimeError: Cannot insert a Tensor that requires grad as a constant.
                # Consider making it a parameter or input, or detaching the gradient
                DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,))
            ],
            sample_inputs_func=sample_inputs_group_norm,
+           reference_inputs_func=reference_inputs_group_norm,
            supports_expanded_weight=True,),
     OpInfo('nn.functional.instance_norm',
            # no ref because instance_norm will often have numerical instability (large numbers or nan)
@@ -17941,6 +18000,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_executor'),
         )
     ),
+    PythonRefInfo(
+        "_refs.nn.functional.group_norm",
+        torch_opinfo_name="nn.functional.group_norm",
+        supports_nvfuser=False,
+        validate_view_consistency=False,
+    ),
     PythonRefInfo(
         "_refs.narrow_copy",
         torch_opinfo_name="narrow_copy",

From 2c4deb1820d75dc527a946884d7e7ef693e003be Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Thu, 10 Nov 2022 12:21:50 -0800
Subject: [PATCH 0776/1922] handle zero dims in reductions (#88280)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88280
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 21 +++++++++++++++++
 torch/_inductor/ir.py               | 36 +++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 8fd4fa29bf98a..121f3d31f39c2 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4224,6 +4224,27 @@ def forward(x):
         ]
         self.common(forward, args)
 
+    def test_zero_dim_reductions(self):
+        for kd in [True, False]:
+            inps0 = (torch.zeros(2, 0, device=self.device, dtype=torch.float16), 1, kd)
+            failed_ops = [aten.argmin, aten.argmax, aten.max, aten.min]
+            for fo in failed_ops:
+                with self.assertRaisesRegex(
+                    IndexError, "Expected reduction dim 1 to have non-zero size"
+                ):
+                    mod = make_fx(fo)(*inps0)
+                    _ = compile_fx_inner(mod, inps0)
+
+            pass_ops = [
+                lambda *x: fn(*x) for fn in [aten.sum, aten.prod, aten.any, aten.all]
+            ]
+            for po in pass_ops:
+                compiled = torch._dynamo.optimize("inductor")(po)
+                expected = po(*inps0)
+                actual = compiled(*inps0)
+
+            self.assertTrue(torch.allclose(actual, expected, atol=1e-3, rtol=1e-3))
+
     @requires_cuda()
     def test_unspec_inputs(self):
         def fn(x, y):
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 924ec7aaa7b2e..448c057ecb0e1 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -729,6 +729,42 @@ def create(
         reduction_hint: ReductionHint = ReductionHint.DEFAULT,
     ):
         reduction_numel = V.graph.sizevars.simplify(sympy_product(reduction_ranges))
+
+        if reduction_numel == 0:
+
+            # N.B. This is a hack to generate the literal of the given type
+            # Ideally, we should be fixing `def constant` in triton.py
+            # but it breaks due to hardcoded dtypes in other places
+            def py_cnst(val):
+                return (
+                    bool(val)
+                    if dst_dtype == torch.bool
+                    else float(val)
+                    if dst_dtype.is_floating_point
+                    else int(val)
+                )
+
+            rtypes_to_inits = {
+                "sum": py_cnst(0),
+                "prod": py_cnst(1),
+                "any": py_cnst(0),
+                # "all" is desugared to `!any(!val)`
+            }
+
+            assert (
+                reduction_type in rtypes_to_inits.keys()
+            ), f"{reduction_type} not supported for zero-dimension tensors!"
+
+            def const_fn(index):
+                return ops.constant(rtypes_to_inits[reduction_type], dst_dtype)
+
+            return Pointwise.create(
+                device=device,
+                dtype=src_dtype,
+                inner_fn=const_fn,
+                ranges=list(ranges),
+            )
+
         if reduction_numel == 1:
             # this reduction is actually a pointwise op
             if reduction_type in ("argmin", "argmax"):

From 2717d80e39c3192ade798ea1ded88bc47a900792 Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Thu, 10 Nov 2022 22:41:47 +0000
Subject: [PATCH 0777/1922] Add meta support for scalar_tensor and argmax
 (#88590)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88590
Approved by: https://github.com/albanD
---
 test/functorch/test_vmap.py                   |  1 +
 test/test_proxy_tensor.py                     |  6 +--
 torch/_meta_registrations.py                  | 42 +++++++++++++++++++
 .../_internal/common_methods_invocations.py   | 32 ++++++++++++--
 4 files changed, 74 insertions(+), 7 deletions(-)

diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 3acab4172fce1..5ba35de21b8b7 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3229,6 +3229,7 @@ def test():
         xfail('linspace', ''),  # test runner can't handle factory functions
         xfail('arange', ''),  # test runner can't handle factory functions
         xfail('logspace', ''),  # test runner can't handle factory functions
+        xfail('scalar_tensor'),  # test runner can't handle factory functions
         xfail('empty', ''),  # test runner can't handle factory functions
         xfail('ones', ''),  # test runner can't handle factory functions
         xfail('zeros', ''),  # test runner can't handle factory functions
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index fbeaa04aa65d9..72c7249f4f145 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1116,8 +1116,8 @@ def f(a, b, c, d, e):
     skip('masked.logsumexp', ''),  # Tensors of type TensorImpl do not have numel
     xfail('masked.amax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
     xfail('masked.amin', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.argmax', ''),  # aten.argmax.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.argmin', ''),  # aten.argmin.default - couldn't find symbolic meta function/decomposition
+    xfail('masked.argmax', ''),  # broadcast_to(): argument 'size' (position 2) must be tuple of ints, but found ...
+    xfail('masked.argmin', ''),  # broadcast_to(): argument 'size' (position 2) must be tuple of ints, but found ...
     xfail('masked.cumprod', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
     xfail('masked.cumsum', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
     xfail('masked.log_softmax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
@@ -1134,8 +1134,6 @@ def f(a, b, c, d, e):
     xfail('addmv', ''),  # aten.addmv.default - couldn't find symbolic meta function/decomposition
     xfail('addr', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('aminmax', ''),  # aten.aminmax.default - couldn't find symbolic meta function/decomposition
-    xfail('argmax', ''),  # aten.argmax.default - couldn't find symbolic meta function/decomposition
-    xfail('argmin', ''),  # aten.argmin.default - couldn't find symbolic meta function/decomposition
     xfail('argwhere', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
     xfail('baddbmm', ''),  # aten.baddbmm.default - couldn't find symbolic meta function/decomposition
     xfail('bucketize', ''),  # aten.bucketize.Tensor - couldn't find symbolic meta function/decomposition
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 5035eadf84a47..04c522ab9e3b4 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1735,6 +1735,48 @@ def meta_sort(self, stable=None, dim=-1, descending=False):
     return torch.empty_like(self), torch.empty_like(self, dtype=torch.int64)
 
 
+def zero_numel_check_dims(self, dim, fn_name):
+    if self.ndim == 0:
+        check(
+            dim == 0 or dim == -1,
+            lambda: f"{fn_name}: Expected reduction dim -1 or 0 for scalar but got {dim}",
+            IndexError,
+        )
+    else:
+        check(
+            self.size(dim) != 0,
+            lambda: f"{fn_name}: Expected reduction dim {dim} to have non-zero size.",
+            IndexError,
+        )
+
+
+# From aten/src/ATen/native/ReduceOps.cpp
+def check_argmax_argmin(name, self, dim):
+    if dim is not None:
+        dim = maybe_wrap_dim(dim, self.dim())
+        zero_numel_check_dims(self, dim, name)
+    else:
+        check(
+            self.numel() != 0,
+            lambda: f"{name}: Expected reduction dim to be specified for input.numel() == 0.",
+        )
+
+
+@register_meta([aten.argmax.default, aten.argmin.default])
+def argmax_argmin_meta(self, dim=None, keepdim=False):
+    check_argmax_argmin("argmax", self, dim)
+    dims = utils.reduction_dims(self.shape, (dim,) if dim is not None else None)
+    shape = _compute_reduction_shape(self, dims, keepdim)
+    return self.new_empty(shape, dtype=torch.int64)
+
+
+@register_meta(aten.scalar_tensor.default)
+def scalar_tensor(s, dtype=None, layout=None, device=None, pin_memory=None):
+    return torch.empty(
+        (), dtype=dtype, layout=layout, device=device, pin_memory=pin_memory
+    )
+
+
 # We must also trigger meta registrations from PrimTorch ref
 # decompositions
 import torch._refs
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index b702c11618604..b41e74a24c104 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -1372,6 +1372,15 @@ def sample_inputs_empty(op, device, dtype, requires_grad, **kwargs):
     for case in cases:
         yield SampleInput(case, device=device, dtype=dtype, requires_grad=requires_grad)
 
+def sample_inputs_scalar_tensor(op, device, dtype, requires_grad, **kwargs):
+    # Not including a scalar tensor in vals because meta tests start failing due to
+    # lack of meta support for _local_scalar_dense
+    # torch.tensor(2, device=device)
+    vals = (-5, 0, 1)
+
+    for item in vals:
+        yield SampleInput(item, device=device, dtype=dtype, requires_grad=requires_grad)
+
 def sample_inputs_eye(op, device, dtype, requires_grad, **kwargs):
     # only ints >= 0 are allowed for both arguments, unless m is omitted
     sizes = (None, 0, 1, 2, 3, 4, 7, L, M, S)
@@ -9287,9 +9296,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            error_inputs_func=error_inputs_diag),
     OpInfo('diag_embed',
            dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16, torch.chalf),
-           # TODO: this is very questionable, because we do have
-           # diag_embed.out but it's not bound to Python somehow
-           # https://github.com/pytorch/pytorch/issues/88598
            supports_out=False,
            # Runs very slowly on slow gradcheck - alternatively reduce input sizes
            gradcheck_fast_mode=True,
@@ -10546,6 +10552,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            assert_jit_shape_analysis=True,
            sample_inputs_func=sample_inputs_native_batch_norm,
            skips=(
+               # NotImplementedError: Could not run
+               # 'aten::native_batch_norm.out' with arguments from the 'CPU' backend.
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning', device_type="cpu"),
                # RuntimeError: out_invstd.dim() == 1 && out_invstd.is_contiguous() && out_invstd.sizes()[0]
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type="cuda"),
@@ -14511,6 +14519,24 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # UserWarning not triggered : Resized a non-empty tensor but did not warn about it.
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
            )),
+    OpInfo('scalar_tensor',
+           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),
+           sample_inputs_func=sample_inputs_scalar_tensor,
+           supports_autograd=False,
+           supports_out=False,
+           skips=(
+               DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
+               # fails to match any schemas despite working in the interpreter
+               DecorateInfo(unittest.expectedFailure, 'TestOperatorSignatures', 'test_get_torch_func_signature_exhaustive'),
+               # fails to match any schemas despite working in the interpreter
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+               # skip these tests since we have non tensor input
+               DecorateInfo(unittest.skip('Skipped!'), "TestCommon", "test_noncontiguous_samples"),
+               DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_conj_view'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_conj_view'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view'),
+           )),
     OpInfo('new_full',
            op=lambda x, *args, **kwargs: x.new_full(*args, **kwargs),
            dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),

From 96f7621565f44042ed0b416598e55e82422c8538 Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Fri, 11 Nov 2022 02:30:29 +0000
Subject: [PATCH 0778/1922] Assert we have triton before scheduling on triton
 (#88849)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88849
Approved by: https://github.com/wconstab, https://github.com/ngimel, https://github.com/jansel
---
 torch/_inductor/scheduler.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index 2f1c4b7c2e643..cb71a44438049 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -16,7 +16,7 @@
 from . import config, dependencies, ir
 from .dependencies import MemoryDep, StarDep
 from .sizevars import SimplifyIndexing
-from .utils import cache_on_self, cmp, dynamo_utils
+from .utils import cache_on_self, cmp, dynamo_utils, has_triton
 from .virtualized import V
 
 log = logging.getLogger(__name__)
@@ -1078,6 +1078,16 @@ def create_backend(self, device: torch.device):
 
             return CppScheduling(self)
         else:
+            if not has_triton():
+                device_props = torch.cuda.get_device_properties(device)
+                if device_props.major < 6:
+                    raise RuntimeError(
+                        f"Found {device_props.name} which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 6.0, but your device is of CUDA capability {device_props.major}.{device_props.minor}"  # noqa: B950
+                    )
+                else:
+                    raise RuntimeError(
+                        "Cannot find a working triton installation. More information on installing Triton can be found at https://github.com/openai/triton"  # noqa: B950
+                    )
             from .codegen.triton import TritonScheduling
 
             return TritonScheduling(self)

From 95ab212c920de321c311b8635b186a667fefc8e3 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Thu, 10 Nov 2022 21:22:29 +0000
Subject: [PATCH 0779/1922] Ref for aten.full; symint changes in prim (#88762)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88762
Approved by: https://github.com/ezyang
---
 test/functorch/test_vmap.py                   |  1 +
 test/test_ops.py                              |  1 -
 torch/_prims_common/__init__.py               |  5 ++-
 torch/_refs/__init__.py                       | 17 +++++---
 .../_internal/common_methods_invocations.py   | 40 +++++++++++++++++++
 5 files changed, 56 insertions(+), 8 deletions(-)

diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 5ba35de21b8b7..6d95077b627e2 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3233,6 +3233,7 @@ def test():
         xfail('empty', ''),  # test runner can't handle factory functions
         xfail('ones', ''),  # test runner can't handle factory functions
         xfail('zeros', ''),  # test runner can't handle factory functions
+        xfail('full', ''),  # test runner can't handle factory functions
         xfail('eye', ''),  # non-tensor input
         xfail('broadcast_shapes', ''),  # test runner can't handle non-Tensor ops
         xfail('sparse.sampled_addmm'),  # sparse
diff --git a/test/test_ops.py b/test/test_ops.py
index 73758bfc6b466..c688f6521af14 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1743,7 +1743,6 @@ class TestRefsOpsInfo(TestCase):
         '_refs.unflatten',
         '_refs.sum_to_size',
         # ref implementation missing kwargs
-        '_refs.full',  # missing "layout"
         '_refs.full_like',  # missing "layout"
         '_refs.ones_like',  # missing "layout"
         '_refs.round',  # missing "decimals"
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 90777ed6601aa..128796dfa3d07 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -837,10 +837,11 @@ def type_to_dtype(typ: type) -> torch.dtype:
 
     if typ is bool:
         return torch.bool
-    if typ is int:
+    if typ in [int, torch.SymInt]:
         return torch.long
-    if typ is float:
+    if typ in [float, torch.SymFloat]:
         return torch.get_default_dtype()
+    # TODO: sym_complex_float?
     if typ is complex:
         return corresponding_complex_dtype(torch.get_default_dtype())
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 36fef59df3757..43b0c74192dee 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -322,7 +322,7 @@ def _broadcast_shapes(*_shapes):
     common_shape = [
         1,
     ] * reduce(max, (len(shape) for shape in shapes))
-    for shape in shapes:
+    for arg_idx, shape in enumerate(shapes):
         for idx in range(-1, -1 - len(shape), -1):
             if common_shape[idx] == 1:
                 if shape[idx] < 0:
@@ -333,9 +333,9 @@ def _broadcast_shapes(*_shapes):
             elif shape[idx] != 1:
                 if common_shape[idx] != shape[idx]:
                     raise RuntimeError(
-                        "Attempting to broadcast a dimension of length ",
-                        str(shape[idx]),
-                        "!",
+                        f"Attempting to broadcast a dimension of length {shape[idx]} at {idx}! "
+                        f"Mismatching argument at index {arg_idx} had {shape}; but expected shape "
+                        f"should be broadcastable to {common_shape}"
                     )
 
     return common_shape
@@ -4495,6 +4495,7 @@ def eye(
     # result.requires_grad_(requires_grad)
 
 
+@register_decomposition(torch.ops.aten.full)
 @out_wrapper()
 def full(
     shape: ShapeType,
@@ -4506,6 +4507,12 @@ def full(
     pin_memory: bool = False,
     requires_grad: bool = False,
 ) -> TensorLikeType:
+    utils.check_layout(layout)
+    utils.check_pin_memory(pin_memory)
+
+    dtype = dtype if dtype is not None else utils.type_to_dtype(type(fill_value))
+    device = device if device is not None else torch.device("cpu")
+
     e = empty(
         shape,
         dtype=dtype,
@@ -4514,7 +4521,7 @@ def full(
         pin_memory=pin_memory,
         requires_grad=requires_grad,
     )
-    return fill(e, fill_value)
+    return torch.fill(e, fill_value)  # type: ignore[arg-type]
 
 
 def full_like(
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index b41e74a24c104..5178ec978bd1c 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -772,6 +772,20 @@ def sample_inputs_ones_zeros(op, device, dtype, requires_grad, **kwargs):
     for size in sizes:
         yield SampleInput(size, kwargs={'dtype': dtype, 'device': device})
 
+def sample_inputs_full(op, device, dtype, requires_grad, **kwargs):
+    def get_val(dtype):
+        return make_tensor([], dtype=dtype, device="cpu").item()
+
+    sizes = (
+        (M,),
+        (S, S),
+    )
+    fill_values = [get_val(dtype), get_val(torch.int)]
+
+    for size, fill_value in product(sizes, fill_values):
+        yield SampleInput(size, fill_value, dtype=dtype, device=device)
+
+
 def error_inputs_uniform(op, device, **kwargs):
     t = torch.zeros([10], device=device)
     yield ErrorInput(
@@ -14373,6 +14387,32 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # UserWarning not triggered : Resized a non-empty tensor but did not warn about it.
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
            )),
+    OpInfo('full',
+           op=torch.full,
+           supports_autograd=False,
+           is_factory_function=True,
+           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),
+           supports_out=True,
+           sample_inputs_func=sample_inputs_full,
+           skips=(
+               # Tests that assume input is a tensor or sequence of tensors
+               DecorateInfo(unittest.expectedFailure, "TestCommon", "test_noncontiguous_samples"),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
+               DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_conj_view'),
+               # Same failure as arange: cannot find linspace in captured graph
+               DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
+               # UserWarning not triggered : Resized a non-empty tensor but did not warn about it.
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
+               # boolean alpha not handled properly
+               DecorateInfo(unittest.expectedFailure,
+                            'TestCudaFuserOpInfo',
+                            'test_nvfuser_correctness',
+                            dtypes=(torch.bool,)),
+               # RuntimeError: UNSUPPORTED DTYPE: bool
+               DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness', dtypes=(torch.bool,)),
+           )),
     OpInfo('new_empty',
            op=lambda x, *args, **kwargs: x.new_empty(*args, **kwargs),
            dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf),

From 7526a243b0e7e044339ec4988bb0c8f0ebb4eccf Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 11 Nov 2022 03:33:55 +0000
Subject: [PATCH 0780/1922] [vision hash update] update the pinned vision hash
 (#88853)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88853
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index d8180093d8859..48685938a146b 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-ffd5a567eb90abf6b5555063da434d3c130d540f
+d72e90640ec8514e0369b5419d7f3b74a387b1d7

From 44cb6fd2dbe277e8f8a44812397d779afec7d45f Mon Sep 17 00:00:00 2001
From: Emil Lynegaard <eml@languagewire.com>
Date: Fri, 11 Nov 2022 03:34:54 +0000
Subject: [PATCH 0781/1922] Disable check for dropout in MultiheadAttention
 fast_path (#88831)

Since we already enforce eval mode for the fast_path, we do not need to also check for a falsy dropout value, as a model trained with dropout will have a non-zero dropout during eval mode, even though it won't be applied.

Fixes #88806

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88831
Approved by: https://github.com/drisspg
---
 torch/nn/modules/activation.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index 5f5615b496d7d..7b0e7e3effaac 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -904,7 +904,6 @@ class MultiheadAttention(Module):
     - inputs are batched (3D) with ``batch_first==True``
     - Either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor argument ``requires_grad``
     - training is disabled (using ``.eval()``)
-    - dropout is 0
     - ``add_bias_kv`` is ``False``
     - ``add_zero_attn`` is ``False``
     - ``batch_first`` is ``True`` and the input is batched
@@ -1088,8 +1087,6 @@ def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: O
             why_not_fast_path = "self.bias_k was not None"
         elif self.bias_v is not None:
             why_not_fast_path = "self.bias_v was not None"
-        elif self.dropout:
-            why_not_fast_path = f"dropout was {self.dropout}, required zero"
         elif self.add_zero_attn:
             why_not_fast_path = "add_zero_attn was enabled"
         elif not self._qkv_same_embed_dim:

From d1dfae761e9f67bbbacde2f0570ead7abe8cb11a Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Mon, 7 Nov 2022 23:46:29 +0000
Subject: [PATCH 0782/1922] [FSDP][state_dict][1/N] Moving state_dict logic to
 pre_state_dict_hook (#87900)

This is one step toward the ultimate goal: remove the overwritten state_dict in FSDP. All the logic should be either in `pre_state_dict_hook` or `post_state_dict_hook`.

Since current `nn.Module` does not support `pre_state_dict_hook`, this PR mimic `pre_state_dict_hook` by calling the pre hook inside post the hook, effectively ditching all the work done by `nn.Module.state_dict`. Once `pre_state_dict_hook` is supported by `nn.Module`, these pre hook calls can be moved out from the post hooks and be registered to `nn.Module.pre_state_dict_hook`.

The major issue of this temporary solution is that `post_state_dict_hook` is called from the leaf node to the root node. This makes the `module._lazy_init()` invalid as FSDP assumes `_lazy_init()` to be called from the root. As a result, `FSDP.state_dict` currently contains only one logic -- calling `module._lazy_init()`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87900
Approved by: https://github.com/rohan-varma
---
 test/distributed/fsdp/test_fsdp_state_dict.py |   2 +-
 torch/distributed/fsdp/_runtime_utils.py      |  19 +-
 torch/distributed/fsdp/_state_dict_utils.py   | 388 +++++++++++++-----
 .../fsdp/fully_sharded_data_parallel.py       | 101 +----
 4 files changed, 288 insertions(+), 222 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py
index 133405033730d..48dad3118db74 100644
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@@ -447,7 +447,7 @@ def test_state_dict_rank0_offload_save_load_flow(self, use_orig_params: bool):
     )
     @parametrize("fp16", [True, False])
     @parametrize("state_dict_rank0_and_offload", [True, False])
-    @parametrize("use_orig_params", [False, True])
+    @parametrize("use_orig_params", [True, False])
     def test_basic_save_and_load_state_dict(
         self,
         state_dict_type: StateDictType,
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 9aee15a016c44..e0986d300a65a 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -1113,28 +1113,23 @@ def _get_buffers_and_dtypes_for_computation(
 
 
 @no_type_check
-def _get_buffers_and_dtypes_for_checkpoint(
+def _get_buffer_dtypes(
     state: _FSDPState,
-    root_module: nn.Module,
-) -> Tuple[List[torch.Tensor], List[torch.dtype]]:
+    buffer_names: List[str],
+) -> List[torch.dtype]:
     """
-    Returns all buffers in the module tree rooted at ``root_module`` and a
-    corresponding list of the buffer dtypes for checkpointing. Each buffer
-    dtype is the original buffer dtype ignoring any buffer mixed precision.
+    Returns the original buffer types of the given buffer names.
     """
-    p_assert(state._is_root, "Expects the root to cast buffers")
-    buffers: List[torch.Tensor] = []
-    buffer_dtypes: List[Optional[torch.dtype]] = []
-    for buffer_name, buffer in root_module.named_buffers():
+    buffer_dtypes: List[torch.dtype] = []
+    for buffer_name in buffer_names:
         p_assert(
             buffer_name in state._buffer_name_to_orig_dtype,
             f"{buffer_name} is missing from pre-computed dict on rank "
             f"{state.rank}, which only has keys "
             f"{state._buffer_name_to_orig_dtype.keys()}",
         )
-        buffers.append(buffer)
         buffer_dtypes.append(state._buffer_name_to_orig_dtype[buffer_name])
-    return buffers, buffer_dtypes
+    return buffer_dtypes
 
 
 def _cast_buffers_to_dtype_and_device(
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index 0169aa8f10eb2..1109f1e881506 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -1,7 +1,7 @@
 import functools
 import math
 import warnings
-from typing import Any, cast, Dict
+from typing import Any, Callable, cast, Dict
 
 import torch
 import torch.distributed as dist
@@ -11,15 +11,22 @@
 import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
 import torch.nn as nn
 import torch.nn.functional as F
+
 from torch.distributed._shard.sharded_tensor import (
     init_from_local_shards,
     Shard,
     ShardedTensor,
 )
-from torch.distributed.fsdp._common_utils import clean_tensor_name
+from torch.distributed.fsdp._common_utils import (
+    clean_tensor_name,
+    FSDP_PREFIX,
+    TrainingState,
+)
 from torch.distributed.fsdp._runtime_utils import (
     _cast_buffers_to_dtype_and_device,
-    _get_buffers_and_dtypes_for_computation,
+    _clear_grads_if_needed,
+    _get_buffer_dtypes,
+    _lazy_init,
 )
 from torch.distributed.utils import _replace_by_prefix
 
@@ -31,49 +38,218 @@
 from .flat_param import FlatParamHandle
 
 
-def _full_post_state_dict_hook(
+def _enter_full_param_ctx(
+    module,
+    recurse: bool = False,
+    writeback: bool = False,
+    rank0_only: bool = False,
+    offload_to_cpu: bool = False,
+    with_grads: bool = False,
+) -> None:
+    """
+    state_dict hooks cannot use the pure context call as the checkpoint flow
+    requires to enter the context in the pre-hook but leave the context in the
+    post-hook. This API enters the context of ``summon_full_params``.
+    """
+    assert module._full_param_ctx is None, (
+        "Entering the ``summon_full_params`` context but module._full_param_ctx "
+        "is not None."
+    )
+    assert module.training_state != TrainingState.SUMMON_FULL_PARAMS, (
+        "Entering the summon_full_params context but the state is already "
+        "SUMMON_FULL_PARAMS."
+    )
+    module._full_param_ctx = module._summon_full_params(
+        recurse=recurse,
+        writeback=writeback,
+        rank0_only=rank0_only,
+        offload_to_cpu=offload_to_cpu,
+        with_grads=with_grads,
+    )
+    module._full_param_ctx.__enter__()
+
+
+def _exit_full_param_ctx(module) -> None:
+    """A helper function to exit ``summon_full_params`` context."""
+    module._assert_state([TrainingState.SUMMON_FULL_PARAMS])
+    assert module._full_param_ctx is not None
+    module._full_param_ctx.__exit__(None, None, None)
+    module._full_param_ctx = None
+
+
+def _common_pre_state_dict_hook(
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> None:
+    """Performs the pre-state_dict tasks shared by all state_dict types."""
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    _lazy_init(module, module)
+    # TODO: change to this call after pre_state_dict_hook is in `nn.Module`.
+    # if module.is_root:
+    #    _clear_grads_if_needed(module._fsdp_handles(module))
+    if module._has_params:
+        _clear_grads_if_needed([module._handles[0]])
+
+
+def _common_summon_pre_state_dict_hook(
+    module,
+    offload_to_cpu: bool,
+    rank0_only: bool,
+) -> None:
+    """
+    Performs the pre-state_dict tasks shared by all state_dict types that require
+    ``summon_full_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this hook.
+    """
+    _enter_full_param_ctx(
+        module,
+        recurse=False,
+        writeback=False,
+        offload_to_cpu=offload_to_cpu,
+        rank0_only=rank0_only,
+    )
+
+
+# TODO: change to the decorator style. See ``_full_pre_state_dict_hook``.
+def _common_summon_post_state_dict_hook(
     module,
     state_dict: Dict[str, Any],
     prefix: str,
+    param_hook: Callable,
 ) -> Dict[str, Any]:
     """
-    Hook that runs after model.state_dict() is called before returning result to
-    user. For FSDP, we may have to clone the tensors in state_dict as params go
-    back to sharded version after _summon_full_params ends, and also remove
-    the ``FSDP_WRAPPED_MODULE`` prefix.
+    The post-state_dict flow that shared by all state_dict types that require
+    ``summon_full_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this
+    hook.
     """
-    _replace_by_prefix(state_dict, prefix + f"{fsdp_file.FSDP_PREFIX}", prefix)
-    module._assert_state([fsdp_file.TrainingState.SUMMON_FULL_PARAMS])
+    _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix)
+    module._assert_state([TrainingState.SUMMON_FULL_PARAMS])
     # Return early for trivial cases
     if not state_dict or not module._has_params:
+        _exit_full_param_ctx(module)
         return state_dict
 
-    # If a rank has already exited the `summon_full_params()` context here
-    # (e.g. when `rank0_only=True` and `rank != 0`), then the rank only
-    # needed to participate in the all-gather and does not need to save the
-    # state dict. For `use_orig_params=False`, we can check this via
-    # `FlatParameter` registration.
-    # TODO: For `use_orig_params=True`, we check for the reshard upon
-    # exiting `summon_full_params()` via the parameter shape. However, for
-    # `NO_SHARD`, we cannot tell from the shape, so we do not return early.
-    if (
-        not module._use_orig_params
-        and fsdp_file.FLAT_PARAM in module.module._parameters
-    ) or (
-        module._use_orig_params
-        and module._handles
-        and module._handles[0].uses_sharded_strategy
-        and module._handles[0].is_sharded(module._handles[0].flat_param)
-    ):
-        return state_dict
+    # TODO: Once pre_state_dict hook is supported, this pop should be removed.
+    # For `use_orig_params=True`, the `FlatParameter` is not registered, so
+    # there is no entry in the state dict for it to pop.
+    if not module._use_orig_params:
+        state_dict.pop(f"{prefix}{fsdp_file.FLAT_PARAM}")
 
-    offload_to_cpu = module._state_dict_config.offload_to_cpu
-    cpu_device = torch.device("cpu")
+    # If a rank does not have unsharded parameters(when `rank0_only=True`
+    # and `rank != 0`), then the rank only needed to participate in the
+    # all-gather and does not need to save the # state dict. We simply check
+    # rank0_only to ensure this issue.
+    rank0_only = (
+        module._state_dict_type == fsdp_file.StateDictType.FULL_STATE_DICT
+        and cast(fsdp_file.FullStateDictConfig, module._state_dict_config).rank0_only
+    )
+    # no_fsdp_return means the state_dict returned by this rank should contain
+    # only non-FSDP controlled parameters and buffers.
+    no_fsdp_return = rank0_only and module.rank != 0
+    if no_fsdp_return and not module._use_orig_params:
+        for clean_key in module._buffer_names:
+            # This is a hack to support activation checkpoint.
+            clean_key = clean_key.replace(
+                f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", ""
+            )
+            state_dict.pop(f"{prefix}{clean_key}", None)
+        _exit_full_param_ctx(module)
+        return state_dict
 
     # Loop only the parameters saved in this instance's wrapped module to
     # avoid processing buffers.
     for fqn, param_name, module_name in module._param_fqns:
+        # TODO: remove the parameter retrieval. See ``_full_pre_state_dict_hook``.
+        param = functools.reduce(getattr, fqn.split("."), module.module)
         fqn = f"{prefix}{fqn}"
+        if no_fsdp_return:
+            state_dict.pop(fqn)
+            continue
+        state_dict[fqn] = param
+        assert fqn in state_dict, (
+            f"FSDP assumes {fqn} is in the state_dict but the state_dict only "
+            f"has {state_dict.keys()}. "
+            f"prefix={prefix}, module_name={module_name}, "
+            f"param_name={param_name} rank={module.rank}."
+        )
+
+        param_hook(module, state_dict, prefix, fqn)
+    _exit_full_param_ctx(module)
+
+    cpu_device = torch.device("cpu")
+    buffer_clean_fqns = []
+    buffers = []
+    for clean_key in module._buffer_names:
+        # This is a hack to support activation checkpoint.
+        clean_key = clean_tensor_name(clean_key)
+        fqn = f"{prefix}{clean_key}"
+        if fqn not in state_dict:
+            # A buffer can be registered as non-persistent.
+            continue
+        if no_fsdp_return:
+            state_dict.pop(fqn)
+        else:
+            buffer = state_dict[fqn]
+            if module._state_dict_config.offload_to_cpu and buffer.device != cpu_device:
+                state_dict[fqn] = buffer.to(cpu_device)
+            # TODO: for composable FSDP, this should be clean_tensor_name(clean_key),
+            buffer_clean_fqns.append(clean_key)
+            buffers.append(state_dict[fqn])
+    if buffers and module._mixed_precision_enabled_for_buffers():
+        buffer_dtypes = _get_buffer_dtypes(module, buffer_clean_fqns)
+        _cast_buffers_to_dtype_and_device(buffers, buffer_dtypes, module.compute_device)
+        for buffers, clean_fqn in zip(buffers, buffer_clean_fqns):
+            fqn = f"{prefix}{clean_fqn}"
+            state_dict[fqn] = buffer.clone()
+    return state_dict
+
+
+def _full_pre_state_dict_hook(
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> None:
+    """
+    Hook that runs before model.state_dict() is called. pre-state_dict hook is
+    not actually supported by ``nn.Module``. As a result, this API is called
+    from ``_full_post_state_dict_hook()`` to simulate the case. Once pre-state_dict
+    is supported in ``nn.Module``, this hook will be registered as a hook in
+    ``nn.Module``.
+
+    TODO: clean the callsites and hacks after ``pre_state_dict_hook` ` is supported
+    in ``nn.Module``.
+    """
+    _common_pre_state_dict_hook(module, state_dict, prefix)
+    _common_summon_pre_state_dict_hook(
+        module,
+        offload_to_cpu=module._state_dict_config.offload_to_cpu,
+        rank0_only=cast(
+            fsdp_file.FullStateDictConfig, module._state_dict_config
+        ).rank0_only,
+    )
+
+
+def _full_post_state_dict_hook(
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> Dict[str, Any]:
+    """
+    Hook that runs after model.state_dict() is called before returning result to
+    user. For FSDP, we may have to clone the tensors in state_dict as params go
+    back to sharded version after _summon_full_params ends, and also remove
+    the ``FSDP_WRAPPED_MODULE`` prefix.
+    """
+    # TODO: remove the hack. See ``_full_pre_state_dict_hook``.
+    _full_pre_state_dict_hook(module, state_dict, prefix)
+
+    def param_hook(
+        module,
+        state_dict: Dict[str, Any],
+        prefix: str,
+        fqn: str,
+    ) -> None:
         clean_key = fqn
         clean_prefix = clean_tensor_name(prefix)
         # Strip prefix out of key if needed as buffer names and param names
@@ -84,11 +260,6 @@ def _full_post_state_dict_hook(
 
         # Clone non-ignored parameters before exiting the
         # `_summon_full_params()` context
-        assert fqn in state_dict, (
-            f"FSDP assumes {fqn} is in the state_dict but the state_dict "
-            f"only has {state_dict.keys()}. prefix={prefix}, "
-            f"module_name={module_name} param_name={param_name} rank={module.rank}."
-        )
         if clean_key not in module._ignored_param_names and not getattr(
             state_dict[fqn], "_has_been_cloned", False
         ):
@@ -104,24 +275,7 @@ def _full_post_state_dict_hook(
                     f"implementation of {fqn}. Error: {str(e)}"
                 )
 
-    # Offload the buffer to CPU if needed -- we do not do this in
-    # `_summon_full_params()` since without care, that would free
-    # the original buffer's GPU memory and require reallocating
-    # that memory later; this only affects the state dict's buffer
-    # variable and leaves the original buffer's GPU memory intact
-    if offload_to_cpu:
-        for clean_key in module._buffer_names:
-            # This is a hack to support activation checkpoint.
-            clean_key = clean_key.replace(
-                f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", ""
-            )
-            fqn = f"{prefix}{clean_key}"
-            if fqn not in state_dict:
-                # A buffer can be registered as non-persistent.
-                continue
-            if state_dict[fqn].device != cpu_device:
-                state_dict[fqn] = state_dict[fqn].to(cpu_device)
-    return state_dict
+    return _common_summon_post_state_dict_hook(module, state_dict, prefix, param_hook)
 
 
 def _full_pre_load_state_dict_hook(
@@ -129,21 +283,30 @@ def _full_pre_load_state_dict_hook(
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
-    # We do not expect to be calling pre-hooks twice without post-hook
-    # call in between.
-    assert getattr(module, "_full_param_ctx", None) is None
-    # Note that it needs writeback=True to persist.
-    module._full_param_ctx = module._summon_full_params(recurse=False, writeback=True)
-    module._full_param_ctx.__enter__()
-    _replace_by_prefix(state_dict, prefix, prefix + f"{fsdp_file.FSDP_PREFIX}")
+    _enter_full_param_ctx(module, recurse=False, writeback=True)
+    _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
 
 
 def _full_post_load_state_dict_hook(module, *args, **kwargs) -> None:
-    # We should exit summon_full_params context.
-    module._assert_state([fsdp_file.TrainingState.SUMMON_FULL_PARAMS])
-    assert getattr(module, "_full_param_ctx", None) is not None
-    module._full_param_ctx.__exit__(None, None, None)
-    module._full_param_ctx = None
+    _exit_full_param_ctx(module)
+
+
+def _local_pre_state_dict_hook(
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> None:
+    """
+    Hook that runs before model.state_dict() is called. Right now, pre-state_dict
+    hook is not supported by the PyTorch core. So this API is called from
+    `_local_post_state_dict_hook()` to simulate the case.
+    """
+    if module._has_params and not module._handles[0].uses_sharded_strategy:
+        raise RuntimeError(
+            "``local_state_dict`` can only be used when parameters are flatten "
+            "and sharded."
+        )
+    _common_pre_state_dict_hook(module, state_dict, prefix)
 
 
 def _local_post_state_dict_hook(
@@ -156,7 +319,10 @@ def _local_post_state_dict_hook(
     the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy
     will happen. The underlying storage is the same.
     """
-    _replace_by_prefix(state_dict, f"{prefix}{fsdp_file.FSDP_PREFIX}", prefix)
+    # TODO: remove the hack. See ``_full_pre_state_dict_hook``.
+    _local_pre_state_dict_hook(module, state_dict, prefix)
+
+    _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix)
     if not module._has_params:
         return state_dict
 
@@ -198,8 +364,8 @@ def _local_pre_load_state_dict_hook(
     state_dict. The flat_param should be a ShardedTensor. This hook converts
     the ShardedTensor to a tensor. No copy happen unless padding is required.
     """
-    _replace_by_prefix(state_dict, prefix, f"{prefix}{fsdp_file.FSDP_PREFIX}")
-    fqn = f"{prefix}{fsdp_file.FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"
+    _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_PREFIX}")
+    fqn = f"{prefix}{FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"
     if fqn not in state_dict:
         assert not module._has_params, (
             "No `FlatParameter` in `state_dict` for this FSDP instance "
@@ -229,6 +395,30 @@ def _local_pre_load_state_dict_hook(
     state_dict[fqn] = load_tensor
 
 
+def _sharded_pre_state_dict_hook(
+    module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> None:
+    """
+    Hook that runs before model.state_dict() is called. Check
+    ``_full_pre_load_state_dict_hook`` for the detail.
+    """
+    if module._has_params and not module._handles[0].uses_sharded_strategy:
+        raise RuntimeError(
+            "``sharded_state_dict`` can only be used when parameters are flatten "
+            "and sharded."
+        )
+    _common_pre_state_dict_hook(module, state_dict, prefix)
+    # Setting offload_to_cpu here does not work even if offload_to_cpu is True.
+    # We have to create ShardedTensor first then move it to CPU.
+    _common_summon_pre_state_dict_hook(
+        module,
+        offload_to_cpu=False,
+        rank0_only=False,
+    )
+
+
 def _sharded_post_state_dict_hook(
     module,
     state_dict: Dict[str, Any],
@@ -238,33 +428,24 @@ def _sharded_post_state_dict_hook(
     The hook replaces the unflattened, unsharded parameter in the state_dict
     with a unflattened, sharded parameter (a ShardedTensor).
     """
-    _replace_by_prefix(state_dict, f"{prefix}{fsdp_file.FSDP_PREFIX}", prefix)
-    if not module._has_params:
-        return state_dict
 
-    assert module.training_state != fsdp_file.TrainingState.SUMMON_FULL_PARAMS, (
-        "Inside _sharded_post_state_dict_hook, the training_state must "
-        "not be SUMMON_FULL_PARAMS."
-    )
-    with module._summon_full_params(recurse=False, writeback=False):
-        for fqn, _, _ in module._param_fqns:
-            # Create a ShardedTensor for the unflattened, non-sharded parameter.
-            param = functools.reduce(getattr, fqn.split("."), module.module)
-            sharded_tensor = _ext_chunk_tensor(
-                tensor=param,
-                rank=module.rank,
-                world_size=module.world_size,
-                num_devices_per_node=torch.cuda.device_count(),
-                pg=module.process_group,
-            )
-            if module._state_dict_config.offload_to_cpu:
-                sharded_tensor = sharded_tensor.cpu()
-            state_dict[f"{prefix}{fqn}"] = sharded_tensor
-    # For `use_orig_params=True`, the `FlatParameter` is not registered, so
-    # there is no entry in the state dict for it to pop.
-    if not module._use_orig_params:
-        state_dict.pop(f"{prefix}{fsdp_file.FLAT_PARAM}")
-    return state_dict
+    # TODO: remove the hack. See ``_full_pre_state_dict_hook``.
+    _sharded_pre_state_dict_hook(module, state_dict, prefix)
+
+    def param_hook(module, state_dict: Dict[str, Any], prefix: str, fqn: str):
+        param = state_dict[fqn]
+        sharded_tensor = _ext_chunk_tensor(
+            tensor=param,
+            rank=module.rank,
+            world_size=module.world_size,
+            num_devices_per_node=torch.cuda.device_count(),
+            pg=module.process_group,
+        )
+        if module._state_dict_config.offload_to_cpu:
+            sharded_tensor = sharded_tensor.cpu()
+        state_dict[fqn] = sharded_tensor
+
+    return _common_summon_post_state_dict_hook(module, state_dict, prefix, param_hook)
 
 
 def _sharded_post_load_state_dict_hook(module, *args, **kwargs) -> None:
@@ -281,7 +462,7 @@ def _sharded_pre_load_state_dict_hook(
     The hook combines the unflattened, sharded parameters (ShardedTensor) to
     a new FlatParameter and shards the new FlatParameter to the local chunk.
     """
-    _replace_by_prefix(state_dict, prefix, prefix + f"{fsdp_file.FSDP_PREFIX}")
+    _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
     if not module._has_params:
         return
 
@@ -295,7 +476,7 @@ def _sharded_pre_load_state_dict_hook(
     shared_fqns = [fqn for fqn, _, _ in module._shared_param_fqns]
     loaded_shapes = []
     for fqn, _, _ in module._param_fqns:
-        full_fqn = f"{prefix}{fsdp_file.FSDP_PREFIX}{fqn}"
+        full_fqn = f"{prefix}{FSDP_PREFIX}{fqn}"
         param = state_dict.pop(full_fqn)
         if fqn in shared_fqns:
             continue
@@ -353,9 +534,7 @@ def _sharded_pre_load_state_dict_hook(
         f"The loaded local chunk has different padding({num_to_pad}) "
         f"from the local chunk {flat_param._shard_numel_padded}."
     )
-    state_dict[
-        f"{prefix}{fsdp_file.FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"
-    ] = loaded_flat_tensor
+    state_dict[f"{prefix}{FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"] = loaded_flat_tensor
     if module._use_orig_params:
         module._deregister_orig_params()
 
@@ -381,17 +560,6 @@ def _post_state_dict_hook(
     processed_state_dict = _post_state_dict_hook_fn[fsdp_module._state_dict_type](
         fsdp_module, state_dict, prefix
     )
-    # Restore buffers, which currently are in their full precision type,
-    # back to their mixed precision type. This is because buffers are cast
-    # during lazy_init() and stay at their mixed precision type before/after
-    # forward/backward. As a result state_dict() should maintain this.
-    if fsdp_module._is_root and fsdp_module._mixed_precision_enabled_for_buffers():
-        buffers, buffer_dtypes = _get_buffers_and_dtypes_for_computation(
-            fsdp_module, fsdp_module
-        )
-        _cast_buffers_to_dtype_and_device(
-            buffers, buffer_dtypes, fsdp_module.compute_device
-        )
     return processed_state_dict
 
 
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 6f5537aad5208..9934e71893425 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -50,9 +50,7 @@
     _init_state_dict_state,
 )
 from torch.distributed.fsdp._runtime_utils import (
-    _cast_buffers_to_dtype_and_device,
     _clear_grads_if_needed,
-    _get_buffers_and_dtypes_for_checkpoint,
     _lazy_init,
     _post_forward,
     _post_forward_reshard,
@@ -512,6 +510,7 @@ def __init__(
             _pre_load_state_dict_hook, with_module=True
         )
         self.register_load_state_dict_post_hook(_post_load_state_dict_hook)
+        self._full_param_ctx: Optional[Generator] = None
 
     @property
     def module(self) -> nn.Module:
@@ -813,104 +812,8 @@ def _shared_param_fqns(self) -> Iterator[Tuple[str, str, str]]:
             yield fqn, param_name, module_name
 
     def state_dict(self, *args, **kwargs):
-        """
-        This is the entry point of all three FSDP ``state_dict`` APIs: full,
-        local, and sharded. For the full state dict
-        (``StateDictType.FULL_STATE_DICT``), FSDP attempts to unshard the model
-        on all ranks, which may result in an OOM error if the full model cannot
-        fit on a single GPU. In that case, users may pass in a
-        :class:`FullStateDictConfig` to only save the checkpoint on rank 0 and/
-        or to offload it to CPU memory layer by layer, enabling much larger
-        checkpoints. If the full model cannot fit in CPU memory, then users may
-        instead take a local state dict (``StateDictType.LOCAL_STATE_DICT``)
-        that only saves the local shard of the model. The sharded state dict
-        (``StateDictType.SHARDED_STATE_DICT``) saves the model parameters as
-        ``ShardedTensor`` s. The ``state_dict`` type can be configured using
-        the :meth:`state_dict_type` context manager.
-
-        Example::
-
-            >>> # xdoctest: +SKIP("undefined variables")
-            >>> import torch
-            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-            >>> from torch.distributed.fsdp import StateDictType
-            >>> torch.cuda.set_device(device_id)
-            >>> my_module = nn.Linear(...)
-            >>> sharded_module = FSDP(my_module)
-            >>> full_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
-            >>> with FSDP.state_dict_type(sharded_module, StateDictType.FULL_STATE_DICT, full_state_dict_config):
-            >>>     full_dict = sharded_module.state_dict()
-            >>> full_dict.keys()
-            >>> odict_keys(['weight', 'bias'])
-            >>> # using local state dict
-            >>> with FSDP.state_dict_type(sharded_module, StateDictType.LOCAL_STATE_DICT):
-            >>>     local_dict = sharded_module.state_dict()
-            >>> local_dict.keys()
-            >>> odict_keys(['flat_param', 'inner.flat_param'])
-
-        .. warning:: This needs to be called on all ranks since it uses
-            collective communications.
-        """
-        # TODO (rohan-varma): separate these out once a state_dict pre-hook
-        # is available.
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
         _lazy_init(self, self)
-        if self._is_root:
-            _clear_grads_if_needed(self._fsdp_handles(self))
-        if self._state_dict_type == StateDictType.FULL_STATE_DICT:
-            # Get config args
-            full_state_dict_config = (
-                self._state_dict_config
-                if self._state_dict_config is not None
-                else FullStateDictConfig()
-            )
-            rank0_only = full_state_dict_config.rank0_only
-            offload_to_cpu = full_state_dict_config.offload_to_cpu
-            summon_ctx = (
-                self._summon_full_params(
-                    recurse=False,
-                    writeback=False,
-                    offload_to_cpu=offload_to_cpu,
-                    rank0_only=rank0_only,
-                )
-                if self.training_state != TrainingState.SUMMON_FULL_PARAMS
-                else contextlib.suppress()
-            )
-            with summon_ctx:
-                # Since buffers stay in their low precision throughout runtime,
-                # we must explicitly restore them to their original dtypes for
-                # model checkpointing. We have the root module cast for all
-                # submodules.
-                # TODO: Investigate if this can and should be refactored into
-                # `summon_full_params()`.
-                if self._is_root and self._mixed_precision_enabled_for_buffers():
-                    buffers, buffer_dtypes = _get_buffers_and_dtypes_for_checkpoint(
-                        self, self
-                    )
-                    _cast_buffers_to_dtype_and_device(
-                        buffers, buffer_dtypes, self.compute_device
-                    )
-                state_dict = super().state_dict(*args, **kwargs)
-
-            # TODO: support offload to CPU in post state dict hook.
-            if not rank0_only or self.rank == 0:
-                return state_dict
-            else:
-                return {}
-
-        elif (
-            self._state_dict_type == StateDictType.LOCAL_STATE_DICT
-            or self._state_dict_type == StateDictType.SHARDED_STATE_DICT
-        ):
-            if self._has_params and not self._handles[0].uses_sharded_strategy:
-                raise RuntimeError(
-                    "sharded_state_dict/local_state_dict can only be called "
-                    "when parameters are flatten and sharded."
-                )
-            return super().state_dict(*args, **kwargs)
-        else:
-            raise ValueError(f"Unknown StateDictType {self._state_dict_type}.")
+        return super().state_dict(*args, **kwargs)
 
     def forward(self, *args: Any, **kwargs: Any) -> Any:
         """

From 46e9879db1b5298446e62f1206001151355f80d7 Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin <wschin@outlook.com>
Date: Fri, 11 Nov 2022 03:49:27 +0000
Subject: [PATCH 0783/1922] Fix FakeTensorProp on Module with Parameters or
 Buffers (#88700)

In `FakeTensorMode.__torch_dispatch__`, the output is now always computed by meta kernels in
```python
        try:
            with in_kernel_invocation_manager(self):
                r = func(*args, **kwargs)  # <----- "r" can be a real tensor.
        except NotImplementedError as not_implemented_error:
            # no meta kernel registered, fallback to kernel for the device
            if not self.allow_fallback_kernels:
                raise not_implemented_error
            return run_fallback_kernel(self, func, args, kwargs, not_implemented_error)

        return self.wrap_meta_outputs_with_default_device_logic(r, func, args, kwargs)
```
For example, I observed a CPU tensor is generated when executing `aten.addmm` when running `FakeTensorProp`. Therefore, I'd like to allow `FakeTensorMode` to wrap real tensor as `FakeTensor` during the computation. Does this PR look a good direction to fix this problem? If yes, I can go ahead and add some tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88700
Approved by: https://github.com/eellison, https://github.com/ezyang
---
 test/test_fake_tensor.py            | 59 +++++++++++++++++++++++++++++
 torch/fx/passes/fake_tensor_prop.py | 12 +++++-
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index ad9042196bff1..3d47cc8ea0e51 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -2,6 +2,7 @@
 
 from torch.testing._internal.common_utils import TestCase, run_tests, skipIfCrossRef, skipIfRocm
 import torch
+import torch._dynamo
 import itertools
 import numpy as np
 from torch.testing._internal.jit_utils import RUN_CUDA
@@ -11,6 +12,7 @@
     FakeTensorConverter,
     DynamicOutputShapeException,
 )
+from torch.fx.passes.fake_tensor_prop import FakeTensorProp
 from torch.testing import FileCheck
 from torch import nn
 import unittest
@@ -663,5 +665,62 @@ def test_like_ops(self):
                 op = self.get_aten_op(schema)
                 self.assertIn(op, torch._subclasses.fake_tensor._like_tensor_constructors)
 
+class FakeTensorPropTest(TestCase):
+    def test_fake_tensor_prop_on_nn_module(self):
+        class ToyNnModuleWithParameters(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.layer1 = torch.nn.Linear(4, 3)
+                self.layer2 = torch.nn.Linear(3, 2)
+
+            def forward(self, value):
+                value = self.layer1(value)
+                value = torch.relu(value)
+                value = self.layer2(value)
+                return value
+
+        model = ToyNnModuleWithParameters()
+        value = torch.randn(5, 4)
+        # Convert nn.Module to GraphModule so that FakeTensorProp runs.
+        graph_model = torch.fx.symbolic_trace(model, (value,))
+        # The following block runs FakeTensorProp on graph_module w/to the same FakeTensorMode
+        #
+        # TODO(wschin): there should be an API to run FakeTensorProp for GraphModule
+        # with parameters and buffers.
+        with FakeTensorMode() as fake_tensor_mode:
+
+            def to_fake_tensor(x):
+                if isinstance(x, torch.Tensor) and not isinstance(x, FakeTensor):
+                    return fake_tensor_mode.from_tensor(x)
+                return x
+
+            fake_parameters_and_buffers = {
+                k: to_fake_tensor(v)
+                for k, v in itertools.chain(
+                    graph_model.named_parameters(), graph_model.named_buffers()
+                )
+            }
+            with torch.nn.utils.stateless._reparametrize_module(
+                graph_model, fake_parameters_and_buffers
+            ):
+                # This case uses the **same** fake tensor mode to
+                #  1. create fake parameters and fake buffers, and
+                #  2. run FakeTensorProp
+                # The result should be correct.
+                result = FakeTensorProp(graph_model, fake_tensor_mode).propagate(value)
+                self.assertTrue(isinstance(result, FakeTensor))
+                self.assertEqual(result.shape, (5, 2))
+                # This case uses the **different** fake tensor modes to
+                #  1. create fake parameters and fake buffers, and
+                #  2. run FakeTensorProp
+                # The following code should fail.
+                failed = False
+                try:
+                    FakeTensorProp(graph_model).propagate(value)
+                except AssertionError:
+                    # AssertionError: tensor's device must be `meta`, got cpu instead
+                    failed = True
+                self.assertTrue(failed)
+
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/fx/passes/fake_tensor_prop.py b/torch/fx/passes/fake_tensor_prop.py
index b034b5341b068..403db5b9a009b 100644
--- a/torch/fx/passes/fake_tensor_prop.py
+++ b/torch/fx/passes/fake_tensor_prop.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import torch.fx
 from torch.fx import Node
 from torch.fx._compatibility import compatibility
@@ -17,7 +19,13 @@ class FakeTensorProp(torch.fx.Interpreter):
 
     Args:
          module (GraphModule): The module to be executed
+         mode (Optional[FakeTensorMode]): The dispatch mode used to execute computation indicated by each FX Node.
     """
+    def __init__(self, module: torch.fx.GraphModule, mode: Optional[FakeTensorMode] = None):
+        super().__init__(module)
+        if mode is None:
+            mode = FakeTensorMode()
+        self._mode = mode
 
     def run_node(self, n: Node):
         result = super().run_node(n)
@@ -25,6 +33,6 @@ def run_node(self, n: Node):
         return result
 
     def propagate(self, *args):
-        with FakeTensorMode.push() as mode:
-            fake_args = [mode.from_tensor(a) for a in args]
+        with self._mode:
+            fake_args = [self._mode.from_tensor(a) for a in args]
             return super().run(*fake_args)

From e47272f3c3953714c256535197b67664c2002650 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@meta.com>
Date: Fri, 11 Nov 2022 04:02:44 +0000
Subject: [PATCH 0784/1922] Update lr_scheduler.pyi to match lr_scheduler.py
 (#88818)

Following #88503, we should also update the pyi file

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88818
Approved by: https://github.com/soulitzer
---
 torch/optim/lr_scheduler.pyi | 37 +++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/torch/optim/lr_scheduler.pyi b/torch/optim/lr_scheduler.pyi
index 97603e064a70c..00d9eb512ae11 100644
--- a/torch/optim/lr_scheduler.pyi
+++ b/torch/optim/lr_scheduler.pyi
@@ -1,7 +1,7 @@
 from typing import Iterable, Any, Optional, Callable, Union, List
 from .optimizer import Optimizer
 
-class _LRScheduler:
+class LRScheduler:
     optimizer: Optimizer = ...
     base_lrs: List[float] = ...
     last_epoch: int = ...
@@ -14,46 +14,49 @@ class _LRScheduler:
     def step(self, epoch: Optional[int] = ...) -> None: ...
     def print_lr(self, is_verbose: bool, group: dict, lr: float, epoch: Optional[int] = ...) -> None: ...
 
-class LambdaLR(_LRScheduler):
+class _LRScheduler(LRScheduler):
+    ...
+
+class LambdaLR(LRScheduler):
     lr_lambdas: List[Callable[[int], float]] = ...
     def __init__(self, optimizer: Optimizer, lr_lambda: Union[Callable[[int], float], List[Callable[[int], float]]], last_epoch: int = ..., verbose: bool = ...) -> None: ...
 
-class MultiplicativeLR(_LRScheduler):
+class MultiplicativeLR(LRScheduler):
     lr_lambdas: List[Callable[[int], float]] = ...
     def __init__(self, optimizer: Optimizer, lr_lambda: Union[Callable[[int], float], List[Callable[[int], float]]], last_epoch: int = ..., verbose: bool = ...) -> None: ...
 
-class StepLR(_LRScheduler):
+class StepLR(LRScheduler):
     step_size: int = ...
     gamma: float = ...
     def __init__(self, optimizer: Optimizer, step_size: int, gamma: float = ..., last_epoch: int = ..., verbose: bool = ...) -> None: ...
 
-class MultiStepLR(_LRScheduler):
+class MultiStepLR(LRScheduler):
     milestones: Iterable[int] = ...
     gamma: float = ...
     def __init__(self, optimizer: Optimizer, milestones: Iterable[int], gamma: float = ..., last_epoch: int = ..., verbose: bool = ...) -> None: ...
 
-class ConstantLR(_LRScheduler):
+class ConstantLR(LRScheduler):
     factor: float = ...
     total_iters: int = ...
     def __init__(self, optimizer: Optimizer, factor: float=..., total_iters: int=..., last_epoch: int=..., verbose: bool = ...) -> None: ...
 
-class LinearLR(_LRScheduler):
+class LinearLR(LRScheduler):
     start_factor: float = ...
     end_factor: float = ...
     total_iters: int = ...
     def __init__(self, optimizer: Optimizer, start_factor: float=..., end_factor: float= ..., total_iters: int= ..., last_epoch: int= ..., verbose: bool = ...) -> None: ...
 
-class ExponentialLR(_LRScheduler):
+class ExponentialLR(LRScheduler):
     gamma: float = ...
     def __init__(self, optimizer: Optimizer, gamma: float, last_epoch: int = ..., verbose: bool = ...) -> None: ...
 
-class ChainedScheduler(_LRScheduler):
-    def __init__(self, schedulers: List[_LRScheduler]) -> None: ...
+class ChainedScheduler(LRScheduler):
+    def __init__(self, schedulers: List[LRScheduler]) -> None: ...
 
-class SequentialLR(_LRScheduler):
-    def __init__(self, optimizer: Optimizer, schedulers: List[_LRScheduler], milestones: List[int], last_epoch: int=..., verbose: bool=...) -> None: ...
+class SequentialLR(LRScheduler):
+    def __init__(self, optimizer: Optimizer, schedulers: List[LRScheduler], milestones: List[int], last_epoch: int=..., verbose: bool=...) -> None: ...
 
-class CosineAnnealingLR(_LRScheduler):
+class CosineAnnealingLR(LRScheduler):
     T_max: int = ...
     eta_min: float = ...
     def __init__(self, optimizer: Optimizer, T_max: int, eta_min: float = ..., last_epoch: int = ..., verbose: bool = ...) -> None: ...
@@ -82,7 +85,7 @@ class ReduceLROnPlateau:
     def state_dict(self) -> dict: ...
     def load_state_dict(self, state_dict: dict) -> None: ...
 
-class CyclicLR(_LRScheduler):
+class CyclicLR(LRScheduler):
     max_lrs: List[float] = ...
     total_size: float = ...
     step_ratio: float = ...
@@ -95,7 +98,7 @@ class CyclicLR(_LRScheduler):
     def __init__(self, optimizer: Optimizer, base_lr: Union[float, List[float]], max_lr: Union[float, List[float]], step_size_up: int = ..., step_size_down: Optional[int] = ..., mode: str = ..., gamma: float = ..., scale_fn: Optional[Callable[[float], float]] = ..., scale_mode: str = ..., cycle_momentum: bool = ..., base_momentum: float = ..., max_momentum: float = ..., last_epoch: int = ..., verbose: bool = ...) -> None: ...
     def scale_fn(self, x: Any) -> float: ...
 
-class CosineAnnealingWarmRestarts(_LRScheduler):
+class CosineAnnealingWarmRestarts(LRScheduler):
     T_0: int = ...
     T_i: int = ...
     T_mult: Optional[int] = ...
@@ -104,14 +107,14 @@ class CosineAnnealingWarmRestarts(_LRScheduler):
     def __init__(self, optimizer: Optimizer, T_0: int, T_mult: int = ..., eta_min: float = ..., last_epoch: int = ..., verbose: bool = ...) -> None: ...
     def step(self, epoch: Optional[Any] = ...): ...
 
-class OneCycleLR(_LRScheduler):
+class OneCycleLR(LRScheduler):
     total_steps: int = ...
     anneal_func: Callable[[float, float, float], float] = ...
     cycle_momentum: bool = ...
     use_beta1: bool = ...
     def __init__(self, optimizer: Optimizer, max_lr: Union[float, List[float]], total_steps: int = ..., epochs: int = ..., steps_per_epoch: int = ..., pct_start: float = ..., anneal_strategy: str = ..., cycle_momentum: bool = ..., base_momentum: Union[float, List[float]] = ..., max_momentum: Union[float, List[float]] = ..., div_factor: float = ..., final_div_factor: float = ..., three_phase: bool = ..., last_epoch: int = ..., verbose: bool = ...) -> None: ...
 
-class PolynomialLR(_LRScheduler):
+class PolynomialLR(LRScheduler):
     total_iters: int = ...
     power: float = ...
     def __init__(self, optimizer: Optimizer, total_iters: int = ..., power: float = ..., last_epoch: int = ..., verbose: bool = ...) -> None: ...

From 4c9b4131cc3d74e85c1182bfe3ccef7685c0142d Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 11 Nov 2022 04:19:08 +0000
Subject: [PATCH 0785/1922] Revert "[dynamo] fixes dict changed during runtime
 error  (#87526)"

This reverts commit cf04b36ce8f531730210b03eaa347977a1c2d75c.

Reverted https://github.com/pytorch/pytorch/pull/87526 on behalf of https://github.com/anijain2305 due to error reported
---
 test/dynamo/test_aot_cudagraphs.py |  3 +++
 torch/_dynamo/convert_frame.py     | 15 +++------------
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_aot_cudagraphs.py
index fdb7c88762b8b..cb1d2a0e601ff 100644
--- a/test/dynamo/test_aot_cudagraphs.py
+++ b/test/dynamo/test_aot_cudagraphs.py
@@ -71,6 +71,7 @@ def fn(x, y):
         y = torch.randn(3, device="cuda")
         fn(x, y)
 
+    @patch("torch._dynamo.config.suppress_errors", True)
     @patch_all()
     def test_dtoh(self):
         def model(x, y):
@@ -104,6 +105,7 @@ def fn(x, y):
         y = torch.randn((), device="cpu")
         fn(x, y)
 
+    @patch("torch._dynamo.config.suppress_errors", True)
     @patch("functorch._src.config.use_functionalize", True)
     @patch_all(ok=False)  # input mutation not supported yet
     def test_mutate_input(self):
@@ -143,6 +145,7 @@ def fn(x, y):
         y = torch.randn(1, device="cuda")
         fn(x, y)
 
+    @patch("torch._dynamo.config.suppress_errors", True)
     @patch_all()
     def test_factory(self):
         def model(y):
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index db9b23f2da7e3..f1ce83727a19f 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -156,11 +156,7 @@ def has_tensor(obj):
             seen_ids[obj_id] = any([has_tensor(v) for v in obj])
             return seen_ids[obj_id]
         elif istype(obj, dict):
-            # Some packages like pytest can be updated during runtime. So, make a
-            # copy of values to avoid issues like "RuntimeError: dictionary
-            # changed size during iteration"
-            values = list(obj.values())
-            seen_ids[obj_id] = any([has_tensor(v) for v in values])
+            seen_ids[obj_id] = any([has_tensor(v) for v in obj.values()])
             return seen_ids[obj_id]
         elif istype(obj, (str, int, float, type(None), bool)):
             seen_ids[obj_id] = False
@@ -168,13 +164,8 @@ def has_tensor(obj):
         elif is_namedtuple(obj):
             seen_ids[obj_id] = any([has_tensor(getattr(obj, v)) for v in obj._fields])
             return seen_ids[obj_id]
-        elif (
-            not is_allowed(obj)
-            and not hasattr(obj, "__get__")  # overridden get can mutate the object
-            and hasattr(obj, "__dict__")
-            and istype(obj.__dict__, dict)
-        ):
-            seen_ids[obj_id] = has_tensor(obj.__dict__)
+        elif not is_allowed(obj) and hasattr(obj, "__dict__") and len(obj.__dict__):
+            seen_ids[obj_id] = any([has_tensor(v) for v in obj.__dict__.values()])
             return seen_ids[obj_id]
         else:
             # if config.debug:

From 5fd4285659e72a3df6dbd4c0a578988d61a78c7d Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Wed, 9 Nov 2022 17:27:22 +0000
Subject: [PATCH 0786/1922] [ONNX] Add onnx::Max into standard Op for scalar
 type alignment (#88750)

Easy fix for onnx::Max ScalarType
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88750
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 aten/src/ATen/core/interned_strings.h               | 1 +
 torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h
index 80919e52b58fd..2abc6217516de 100644
--- a/aten/src/ATen/core/interned_strings.h
+++ b/aten/src/ATen/core/interned_strings.h
@@ -239,6 +239,7 @@ namespace c10 {
   _(onnx, LSTM)                      \
   _(onnx, MatMul)                    \
   _(onnx, Min)                       \
+  _(onnx, Max)                       \
   _(onnx, Mul)                       \
   _(onnx, Pow)                       \
   _(onnx, RNN)                       \
diff --git a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp
index 657c27f70c7d9..3af0360b7e011 100644
--- a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp
+++ b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp
@@ -48,6 +48,7 @@ static const std::unordered_set<NodeKind> standardOps = {
     onnx::Div,
     onnx::Gemm,
     onnx::Min,
+    onnx::Max,
     onnx::Mod,
     onnx::Mul,
     onnx::Pow,

From 110f500fe23621575a998580fc63712d4084d5cd Mon Sep 17 00:00:00 2001
From: Eddie Yan <eddiey@nvidia.com>
Date: Fri, 11 Nov 2022 05:23:48 +0000
Subject: [PATCH 0787/1922] Use `atomicAdd` for `bfloat16` in Ampere and above
 (#84981)

WIP to fix extremely slow `scatter_add` issue vs. fp16. The current changes seem to improve performance, but it still appears to lag behind the fp16 equivalent.

CC @ngimel @ptrblck
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84981
Approved by: https://github.com/ngimel
---
 aten/src/ATen/cuda/Atomic.cuh             | 17 ++++++--
 aten/src/ATen/native/cuda/KernelUtils.cuh | 48 ++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/cuda/Atomic.cuh b/aten/src/ATen/cuda/Atomic.cuh
index 42975411e841e..3d60b672e9725 100644
--- a/aten/src/ATen/cuda/Atomic.cuh
+++ b/aten/src/ATen/cuda/Atomic.cuh
@@ -6,6 +6,10 @@
 
 #include <ATen/NumericUtils.h>
 
+#if !(defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION < 11000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))))
+#include <cuda_bf16.h>
+#endif
+
 template <typename T>
 struct AtomicFPOp;
 
@@ -219,10 +223,15 @@ static inline  __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val)
 }
 
 static inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) {
-  return AtomicFPOp<at::BFloat16>()(address, val,
-                                    [](at::BFloat16 bsum, at::BFloat16 val) {
-                                      return bsum + val;
-                                    });
+#if defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION < 11000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
+return AtomicFPOp<at::BFloat16>()(address, val,
+                                  [](at::BFloat16 bsum, at::BFloat16 val) {
+                                    return bsum + val;
+                                  });
+#else
+  __nv_bfloat16 r = atomicAdd(reinterpret_cast<__nv_bfloat16*>(address), *reinterpret_cast<__nv_bfloat16*>(&val));
+  return *reinterpret_cast<c10::BFloat16*>(&r);
+#endif
 }
 
 #if defined(CUDA_VERSION) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh
index 1e36e2db74d54..d2e956d1a3e44 100644
--- a/aten/src/ATen/native/cuda/KernelUtils.cuh
+++ b/aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -1,6 +1,10 @@
 #pragma once
 #include <ATen/cuda/Atomic.cuh>
 
+#if !(defined(USE_ROCM) || ((defined(CUDA_VERSION) && CUDA_VERSION < 11000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))))
+#include <cuda_bf16.h>
+#endif
+
 namespace at {
 namespace native {
 
@@ -66,7 +70,49 @@ __device__ __forceinline__ void fastSpecializedAtomicAdd(
 template <
     typename scalar_t,
     typename index_t,
-    typename std::enable_if<!std::is_same<c10::Half, scalar_t>::value>::type* =
+    typename std::enable_if<std::is_same<c10::BFloat16, scalar_t>::value>::type* =
+        nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(
+    scalar_t* tensor,
+    index_t index,
+    const index_t numel,
+    scalar_t value) {
+#if (                      \
+    (defined(USE_ROCM)) || \
+    (defined(CUDA_VERSION) && (CUDA_VERSION < 11000)) || \
+    (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
+  gpuAtomicAddNoReturn(
+      reinterpret_cast<at::BFloat16*>(tensor) + index,
+      static_cast<at::BFloat16>(value));
+#else
+  // Accounts for the chance tensor falls on an odd 16 bit alignment (ie, not 32 bit aligned)
+  __nv_bfloat16* target_addr = reinterpret_cast<__nv_bfloat16*>(tensor + index);
+  bool low_byte = (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__nv_bfloat162) == 0);
+
+  if (low_byte && index < (numel - 1)) {
+    __nv_bfloat162 value2;
+    value2.x = *reinterpret_cast<__nv_bfloat16*>(&value);
+    value2.y = __int2bfloat16_rz(0);
+    atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr), value2);
+
+  } else if (!low_byte && index > 0) {
+    __nv_bfloat162 value2;
+    value2.x = __int2bfloat16_rz(0);
+    value2.y = *reinterpret_cast<__nv_bfloat16*>(&value);
+    atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2);
+
+  } else {
+    atomicAdd(
+        reinterpret_cast<__nv_bfloat16*>(tensor) + index, *reinterpret_cast<__nv_bfloat16*>(&value));
+  }
+#endif
+}
+
+
+template <
+    typename scalar_t,
+    typename index_t,
+    typename std::enable_if<!std::is_same<c10::Half, scalar_t>::value && !std::is_same<c10::BFloat16, scalar_t>::value >::type* =
         nullptr>
 __device__ __forceinline__ void fastSpecializedAtomicAdd(
     scalar_t* tensor,

From f548a544d282cb9ecfc644676790cdf1fc02b22b Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Wed, 9 Nov 2022 17:41:10 +0000
Subject: [PATCH 0788/1922] [ONNX] Add test case for onnx::Max scalar type
 (#88751)

Referenced by minimum cases
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88751
Approved by: https://github.com/wschin, https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 1e36163d0394c..e4fc3f83b288d 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -8728,6 +8728,28 @@ def forward(self, x, y):
         y = torch.full_like(x, True)
         self.run_test(MinimumModel(), (x, y))
 
+    @skipIfUnsupportedMinOpsetVersion(12)
+    def test_maximum_dtypes(self):
+        class MaximumModel(torch.nn.Module):
+            def forward(self, x, y):
+                return torch.maximum(x, y)
+
+        x = torch.randn((5, 5), dtype=torch.float16)
+        y = torch.randn((5, 5), dtype=torch.float)
+        self.run_test(MaximumModel(), (x, y))
+
+        x = torch.randn((5, 5), dtype=torch.float16)
+        y = torch.randint(10, (5, 5), dtype=torch.int16)
+        self.run_test(MaximumModel(), (x, y))
+
+        x = torch.randint(10, (5, 5), dtype=torch.int16)
+        y = torch.randint(10, (5, 5), dtype=torch.int32)
+        self.run_test(MaximumModel(), (x, y))
+
+        x = torch.randint(10, (5, 5), dtype=torch.int)
+        y = torch.full_like(x, True)
+        self.run_test(MaximumModel(), (x, y))
+
     @skipIfUnsupportedMinOpsetVersion(9)
     def test_any(self):
         class M(torch.nn.Module):

From 43480bdd6ef7c4afff3831d2eadcd52aab7bddd8 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Fri, 11 Nov 2022 08:51:26 +0000
Subject: [PATCH 0789/1922] Error on ZeroTensor serialization (#88803)

Follow-up : https://github.com/pytorch/pytorch/pull/88182#issuecomment-1308628415

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88803
Approved by: https://github.com/anjali411
---
 test/cpp/api/serialize.cpp             |  8 ++++++++
 test/test_serialization.py             | 22 ++++++++++++++++++++++
 torch/csrc/jit/serialization/pickler.h |  6 ++++++
 3 files changed, 36 insertions(+)

diff --git a/test/cpp/api/serialize.cpp b/test/cpp/api/serialize.cpp
index 05bb0f941d402..20d572853d3a1 100644
--- a/test/cpp/api/serialize.cpp
+++ b/test/cpp/api/serialize.cpp
@@ -288,6 +288,14 @@ TEST(SerializeTest, MathBits) {
     ASSERT_EQ(actual.sizes().vec(), expected.sizes().vec());
     ASSERT_TRUE(actual.allclose(expected));
   }
+
+  {
+    // We don't support serializing `ZeroTensor` as it is not public facing yet.
+    // If in future, `ZeroTensor` serialization is supported, this test should
+    // start failing!
+    auto t = torch::_efficientzerotensor({5, 5});
+    ASSERT_THROWS_WITH(save_and_load(t), "ZeroTensor is not serializable,");
+  }
 }
 
 TEST(SerializeTest, BasicToFile) {
diff --git a/test/test_serialization.py b/test/test_serialization.py
index af0317e87a145..779d6fb5c20c5 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -931,6 +931,28 @@ def _save_load_check(t):
         t_n_c = torch._neg_view(torch.conj(t))
         _save_load_check(t_n_c)
 
+    @parametrize('weights_only', (False, True))
+    def test_serialization_efficient_zerotensor(self, weights_only):
+        # We don't support serializing `ZeroTensor` as it is not public
+        # facing yet.
+        # If in future, `ZeroTensor` serialization is supported, this test
+        # should start failing!
+        t = torch._efficientzerotensor((4, 5))
+
+        def _save_load_check(t):
+            with BytesIOContext() as f:
+                torch.save(t, f)
+                f.seek(0)
+                # Unsafe load should work
+                self.assertEqual(torch.load(f, weights_only=weights_only), t)
+
+        # NOTE: `torch.save` fails before we hit the TORCH_CHECK in `getTensoMetadata`
+        #       as nullptr storage is disabled.
+        err_msg = (r'python bindings to nullptr storage \(e.g., from torch.Tensor._make_wrapper_subclass\)'
+                   ' are currently unsafe and thus disabled')
+        with self.assertRaisesRegex(RuntimeError, err_msg):
+            _save_load_check(t)
+
     def run(self, *args, **kwargs):
         with serialization_method(use_zip=True):
             return super(TestSerialization, self).run(*args, **kwargs)
diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h
index c289cae12b649..26f9fcf423965 100644
--- a/torch/csrc/jit/serialization/pickler.h
+++ b/torch/csrc/jit/serialization/pickler.h
@@ -300,6 +300,12 @@ bool checkHasValidSetGetState(const std::shared_ptr<c10::ClassType>& cls);
 // For now, it only takes care of `conj` and `neg` bit.
 inline std::unordered_map<std::string, bool> getTensorMetadata(
     const at::Tensor& t) {
+  // We don't support serializing `ZeroTensor` as it is not public
+  // facing yet.
+  TORCH_CHECK(
+      !t._is_zerotensor(),
+      "ZeroTensor is not serializable,",
+      " please file an issue if required.");
   std::unordered_map<std::string, bool> metadata{};
 
   // Only add meta-data if the value is not default.

From 941dd4429f81210171200ad2885dcc1f78785860 Mon Sep 17 00:00:00 2001
From: Michael Gschwind <mikekg@meta.com>
Date: Fri, 11 Nov 2022 12:19:31 +0000
Subject: [PATCH 0790/1922] Fix cuda/cpu check on NoneType (#88854)

Summary: Fix cuda/cpu check on NoneType

Test Plan: sabdcastle/ github CI/CD

Differential Revision: D41203955

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88854
Approved by: https://github.com/drisspg, https://github.com/ngimel
---
 torch/nn/modules/activation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index 7b0e7e3effaac..e6b3b778e5fbc 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -1111,7 +1111,7 @@ def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: O
             # generator expressions.
             if torch.overrides.has_torch_function(tensor_args):
                 why_not_fast_path = "some Tensor argument has_torch_function"
-            elif not all([(x.is_cuda or 'cpu' in str(x.device)) for x in tensor_args]):
+            elif not all([(x is None or x.is_cuda or 'cpu' in str(x.device)) for x in tensor_args]):
                 why_not_fast_path = "some Tensor argument is neither CUDA nor CPU"
             elif torch.is_grad_enabled() and any([x.requires_grad for x in tensor_args]):
                 why_not_fast_path = ("grad is enabled and at least one of query or the "

From 97255754daf9a905cd150dfca9ec3c5896883dd0 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Tue, 8 Nov 2022 00:16:14 +0000
Subject: [PATCH 0791/1922] [FSDP][state_dict][2/N] Move state_dict related
 enums/dataclasses/states to state_dict_utils.py, api.py and init_state_dict()
 (#88481)

**Motivation**:
Several Enums, Dataclasses and states defined in fully_sharded_data_paralle.py should be moved to a place where the composable FSDP can access. This PR does the move.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88481
Approved by: https://github.com/rohan-varma, https://github.com/awgu
---
 torch/distributed/fsdp/_init_utils.py         |  12 +-
 torch/distributed/fsdp/_state_dict_utils.py   |  72 +++++++---
 torch/distributed/fsdp/api.py                 |  96 ++++++++++++-
 .../fsdp/fully_sharded_data_parallel.py       | 127 +-----------------
 4 files changed, 164 insertions(+), 143 deletions(-)

diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index c89f65c3a5b82..966e61f7fe123 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -3,6 +3,7 @@
 from typing import (
     Callable,
     Dict,
+    Generator,
     Iterable,
     Iterator,
     List,
@@ -33,8 +34,11 @@
 from torch.distributed.fsdp.api import (
     BackwardPrefetch,
     CPUOffload,
+    FullStateDictConfig,
     MixedPrecision,
     ShardingStrategy,
+    StateDictConfig,
+    StateDictType,
 )
 from torch.distributed.fsdp.flat_param import (
     _HandlesKey,
@@ -206,7 +210,13 @@ def _init_prefetching_state(
 
 
 def _init_state_dict_state(state: _FSDPState) -> _FSDPState:
-    # TODO: after rebase
+    state._state_dict_type = StateDictType.FULL_STATE_DICT
+    state_dict_config: StateDictConfig = FullStateDictConfig()
+    state._state_dict_config = state_dict_config
+    full_param_ctx: Optional[Generator] = None
+    # TODO: For composable API, this should be a dict that maps from a module to
+    # handles.
+    state._full_param_ctx = full_param_ctx
     return state
 
 
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index 1109f1e881506..c90bd4d409b1f 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -1,7 +1,7 @@
 import functools
 import math
 import warnings
-from typing import Any, Callable, cast, Dict
+from typing import Any, Callable, cast, Dict, Iterator, Tuple
 
 import torch
 import torch.distributed as dist
@@ -20,6 +20,7 @@
 from torch.distributed.fsdp._common_utils import (
     clean_tensor_name,
     FSDP_PREFIX,
+    FSDP_WRAPPED_MODULE,
     TrainingState,
 )
 from torch.distributed.fsdp._runtime_utils import (
@@ -28,6 +29,7 @@
     _get_buffer_dtypes,
     _lazy_init,
 )
+from torch.distributed.fsdp.api import FullStateDictConfig, StateDictType
 from torch.distributed.utils import _replace_by_prefix
 
 from ._fsdp_extensions import (
@@ -38,6 +40,33 @@
 from .flat_param import FlatParamHandle
 
 
+def _convert_to_wrapped_module_name(module_name: str) -> str:
+    module_name = module_name.replace(f"{FSDP_PREFIX}", "")
+    module_name = module_name.replace(f"{FSDP_WRAPPED_MODULE}", "")
+    if module_name:
+        module_name = f"{module_name}."
+    # Activation checkpoint adds a prefix that has to be
+    # removed as well.
+    module_name = module_name.replace(checkpoint_wrapper._CHECKPOINT_PREFIX, "")
+    return module_name
+
+
+def _param_fqns(module) -> Iterator[Tuple[str, str, str]]:
+    if not module._has_params:
+        return
+    for param_name, module_name in module._handles[0].parameter_module_names():
+        module_name = _convert_to_wrapped_module_name(module_name)
+        fqn = f"{module_name}{param_name}"
+        yield fqn, param_name, module_name
+
+
+def _shared_param_fqns(module) -> Iterator[Tuple[str, str, str]]:
+    for param_name, module_name in module._handles[0].shared_parameter_module_names():
+        module_name = _convert_to_wrapped_module_name(module_name)
+        fqn = f"{module_name}{param_name}"
+        yield fqn, param_name, module_name
+
+
 def _enter_full_param_ctx(
     module,
     recurse: bool = False,
@@ -71,7 +100,10 @@ def _enter_full_param_ctx(
 
 def _exit_full_param_ctx(module) -> None:
     """A helper function to exit ``summon_full_params`` context."""
-    module._assert_state([TrainingState.SUMMON_FULL_PARAMS])
+    assert module.training_state == TrainingState.SUMMON_FULL_PARAMS, (
+        "Exiting the summon_full_params context but the state is not "
+        "SUMMON_FULL_PARAMS."
+    )
     assert module._full_param_ctx is not None
     module._full_param_ctx.__exit__(None, None, None)
     module._full_param_ctx = None
@@ -124,7 +156,9 @@ def _common_summon_post_state_dict_hook(
     hook.
     """
     _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix)
-    module._assert_state([TrainingState.SUMMON_FULL_PARAMS])
+    assert (
+        module.training_state == TrainingState.SUMMON_FULL_PARAMS
+    ), "Inside the post_state_dict_hook but the state is not SUMMON_FULL_PARAMS."
     # Return early for trivial cases
     if not state_dict or not module._has_params:
         _exit_full_param_ctx(module)
@@ -141,8 +175,8 @@ def _common_summon_post_state_dict_hook(
     # all-gather and does not need to save the # state dict. We simply check
     # rank0_only to ensure this issue.
     rank0_only = (
-        module._state_dict_type == fsdp_file.StateDictType.FULL_STATE_DICT
-        and cast(fsdp_file.FullStateDictConfig, module._state_dict_config).rank0_only
+        module._state_dict_type == StateDictType.FULL_STATE_DICT
+        and cast(FullStateDictConfig, module._state_dict_config).rank0_only
     )
     # no_fsdp_return means the state_dict returned by this rank should contain
     # only non-FSDP controlled parameters and buffers.
@@ -159,7 +193,7 @@ def _common_summon_post_state_dict_hook(
 
     # Loop only the parameters saved in this instance's wrapped module to
     # avoid processing buffers.
-    for fqn, param_name, module_name in module._param_fqns:
+    for fqn, param_name, module_name in _param_fqns(module):
         # TODO: remove the parameter retrieval. See ``_full_pre_state_dict_hook``.
         param = functools.reduce(getattr, fqn.split("."), module.module)
         fqn = f"{prefix}{fqn}"
@@ -224,9 +258,7 @@ def _full_pre_state_dict_hook(
     _common_summon_pre_state_dict_hook(
         module,
         offload_to_cpu=module._state_dict_config.offload_to_cpu,
-        rank0_only=cast(
-            fsdp_file.FullStateDictConfig, module._state_dict_config
-        ).rank0_only,
+        rank0_only=cast(FullStateDictConfig, module._state_dict_config).rank0_only,
     )
 
 
@@ -473,9 +505,9 @@ def _sharded_pre_load_state_dict_hook(
         )
 
     nonsharded_tensors = []
-    shared_fqns = [fqn for fqn, _, _ in module._shared_param_fqns]
+    shared_fqns = [fqn for fqn, _, _ in _shared_param_fqns(module)]
     loaded_shapes = []
-    for fqn, _, _ in module._param_fqns:
+    for fqn, _, _ in _param_fqns(module):
         full_fqn = f"{prefix}{FSDP_PREFIX}{fqn}"
         param = state_dict.pop(full_fqn)
         if fqn in shared_fqns:
@@ -552,9 +584,9 @@ def _post_state_dict_hook(
     what postprocessing will be done.
     """
     _post_state_dict_hook_fn = {
-        fsdp_file.StateDictType.FULL_STATE_DICT: _full_post_state_dict_hook,
-        fsdp_file.StateDictType.LOCAL_STATE_DICT: _local_post_state_dict_hook,
-        fsdp_file.StateDictType.SHARDED_STATE_DICT: _sharded_post_state_dict_hook,
+        StateDictType.FULL_STATE_DICT: _full_post_state_dict_hook,
+        StateDictType.LOCAL_STATE_DICT: _local_post_state_dict_hook,
+        StateDictType.SHARDED_STATE_DICT: _sharded_post_state_dict_hook,
     }
     fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
     processed_state_dict = _post_state_dict_hook_fn[fsdp_module._state_dict_type](
@@ -576,9 +608,9 @@ def _pre_load_state_dict_hook(
     will be done.
     """
     _pre_load_state_dict_hook_fn = {
-        fsdp_file.StateDictType.FULL_STATE_DICT: _full_pre_load_state_dict_hook,
-        fsdp_file.StateDictType.LOCAL_STATE_DICT: _local_pre_load_state_dict_hook,
-        fsdp_file.StateDictType.SHARDED_STATE_DICT: _sharded_pre_load_state_dict_hook,
+        StateDictType.FULL_STATE_DICT: _full_pre_load_state_dict_hook,
+        StateDictType.LOCAL_STATE_DICT: _local_pre_load_state_dict_hook,
+        StateDictType.SHARDED_STATE_DICT: _sharded_pre_load_state_dict_hook,
     }
     # Code that is common for all state_dict impls
     fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
@@ -593,9 +625,9 @@ def _pre_load_state_dict_hook(
 @torch.no_grad()
 def _post_load_state_dict_hook(module: nn.Module, *args: Any) -> None:
     _post_load_state_dict_hook_fn = {
-        fsdp_file.StateDictType.FULL_STATE_DICT: _full_post_load_state_dict_hook,
-        fsdp_file.StateDictType.LOCAL_STATE_DICT: _local_post_load_state_dict_hook,
-        fsdp_file.StateDictType.SHARDED_STATE_DICT: _sharded_post_load_state_dict_hook,
+        StateDictType.FULL_STATE_DICT: _full_post_load_state_dict_hook,
+        StateDictType.LOCAL_STATE_DICT: _local_post_load_state_dict_hook,
+        StateDictType.SHARDED_STATE_DICT: _sharded_post_load_state_dict_hook,
     }
     # Code that is common for all state_dict impls
     fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
diff --git a/torch/distributed/fsdp/api.py b/torch/distributed/fsdp/api.py
index 9e1327c80633c..18f3cd3069ddf 100644
--- a/torch/distributed/fsdp/api.py
+++ b/torch/distributed/fsdp/api.py
@@ -10,7 +10,17 @@
 
 import torch
 
-__all__ = ["ShardingStrategy", "BackwardPrefetch", "MixedPrecision", "CPUOffload"]
+__all__ = [
+    "ShardingStrategy",
+    "BackwardPrefetch",
+    "MixedPrecision",
+    "CPUOffload",
+    "StateDictType",
+    "StateDictConfig",
+    "FullStateDictConfig",
+    "LocalStateDictConfig",
+    "ShardedStateDictConfig",
+]
 
 
 class ShardingStrategy(Enum):
@@ -149,3 +159,87 @@ class CPUOffload:
     """
 
     offload_params: bool = False
+
+
+class StateDictType(Enum):
+    """
+    This enum indicates that which type of ``state_dict`` the FSDP module is
+    currently processing (returning or loading).
+    The default value is FULL_STATE_DICT to comply the PyTorch convention.
+    ..note::
+        FSDP currently supports three types of ``state_dict``:
+            1. ``state_dict/load_state_dict`: this pair of APIs return and load
+               the non-sharded, unflattened parameters. The semantics is the
+               same as using DDP.
+            2. ``_local_state_dict/_load_local_state_dict``: this pair of APIs return
+               and load local sharded, flattened parameters. The values returned
+               by ``_local_state_dict`` can be directly used by FSDP and is only
+               meaningful to FSDP (because parameters are flattened). Note that
+               these APIs are meant for use via the :func:`state_dict_type`
+               context manager as follows:
+                   >>> # xdoctest: +SKIP("undefined variables")
+                   >>> with fsdp.state_dict_type(StateDictType.LOCAL_STATE_DICT):
+                   ...     state = fsdp.state_dict()  # loads local state dict
+            3. ``_sharded_state_dict/_load_sharded_state_dict``: this pair of APIs
+               return and load sharded, unflattened parameters. The ``state_dict``
+               return by ``sharded_state_dict`` can be used by all other parallel
+               schemes (resharding may be required).
+    """
+
+    FULL_STATE_DICT = auto()
+    LOCAL_STATE_DICT = auto()
+    SHARDED_STATE_DICT = auto()
+
+
+@dataclass
+class StateDictConfig:
+    """
+    ``StateDictConfig`` is the base class for all state_dict configuration classes.
+    Users should instantiate a child version (i.e. ``FullStateDictConfig``) in
+    order to configure settings for the particular type of ``state_dict``
+    implementation FSDP will use.
+    """
+
+    offload_to_cpu: bool = False
+
+
+@dataclass
+class FullStateDictConfig(StateDictConfig):
+    """
+    ``FullStateDictConfig`` is a config class meant to be used with
+    ``StateDictType.FULL_STATE_DICT``. Currently, it accepts two parameters,
+    ``offload_to_cpu`` and ``rank0_only`` which can be configured to offload
+    the full ``state_dict`` to CPU and to materialize the ``state_dict`` on
+    rank 0 only. When used, it is recommended to enable both of these flags
+    together to optimize memory savings when taking checkpoints. Note that
+    this config class is meant for user via the :func:`state_dict_type`
+    context manager as follows:
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> fsdp = FSDP(model, auto_wrap_policy=...)
+        >>> cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+        >>> with FullyShardedDataParallel.state_dict_type(fsdp, StateDictType.FULL_STATE_DICT, cfg):
+        >>>     state = fsdp.state_dict()
+        >>>     # state will be empty on non rank 0 and contain CPU tensors on rank 0.
+        >>> # To reload checkpoint for inference, finetuning, transfer learning, etc:
+        >>> model = model_fn() # Initialize model on CPU in preparation for wrapping with FSDP
+        >>> if dist.get_rank() == 0:
+        >>>     # Load checkpoint only on rank 0 to avoid memory redundancy
+        >>>     state_dict = torch.load("my_checkpoint.pt")
+        >>>     model.load_state_dict(state_dict)
+        >>> # All ranks initialize FSDP module as usual. ``sync_module_states`` argument
+        >>> # communicates loaded checkpoint states from rank 0 to rest of the world.
+        >>> fsdp = FSDP(model, device_id=torch.cuda.current_device(), auto_wrap_policy=..., sync_module_states=True)
+        >>> # After this point, all ranks have FSDP model with loaded checkpoint.
+    """
+
+    rank0_only: bool = False
+
+
+@dataclass
+class LocalStateDictConfig(StateDictConfig):
+    pass
+
+
+@dataclass
+class ShardedStateDictConfig(StateDictConfig):
+    pass
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 9934e71893425..773686081a4d2 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -5,7 +5,6 @@
 import traceback
 import warnings
 from contextlib import contextmanager
-from dataclasses import dataclass
 from enum import auto, Enum
 from typing import (
     Any,
@@ -25,7 +24,6 @@
 import torch.nn as nn
 from torch.distributed import ProcessGroup
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-    _CHECKPOINT_PREFIX,
     _CHECKPOINT_WRAPPED_MODULE,
     ActivationWrapper,
 )
@@ -68,8 +66,13 @@
 from torch.distributed.fsdp.api import (
     BackwardPrefetch,
     CPUOffload,
+    FullStateDictConfig,
+    LocalStateDictConfig,
     MixedPrecision,
+    ShardedStateDictConfig,
     ShardingStrategy,
+    StateDictConfig,
+    StateDictType,
 )
 
 from ._optim_utils import (
@@ -103,11 +106,6 @@
 
 __all__ = [
     "FullyShardedDataParallel",
-    "StateDictType",
-    "StateDictConfig",
-    "FullStateDictConfig",
-    "LocalStateDictConfig",
-    "ShardedStateDictConfig",
     "OptimStateKeyType",
 ]
 
@@ -115,90 +113,6 @@
 FLAT_PARAM = "_flat_param"
 
 
-class StateDictType(Enum):
-    """
-    This enum indicates that which type of ``state_dict`` the FSDP module is
-    currently processing (returning or loading).
-    The default value is FULL_STATE_DICT to comply the PyTorch convention.
-    ..note::
-        FSDP currently supports three types of ``state_dict``:
-            1. ``state_dict/load_state_dict`: this pair of APIs return and load
-               the non-sharded, unflattened parameters. The semantics is the
-               same as using DDP.
-            2. ``_local_state_dict/_load_local_state_dict``: this pair of APIs return
-               and load local sharded, flattened parameters. The values returned
-               by ``_local_state_dict`` can be directly used by FSDP and is only
-               meaningful to FSDP (because parameters are flattened). Note that
-               these APIs are meant for use via the :func:`state_dict_type`
-               context manager as follows:
-                   >>> # xdoctest: +SKIP("undefined variables")
-                   >>> with fsdp.state_dict_type(StateDictType.LOCAL_STATE_DICT):
-                   ...     state = fsdp.state_dict()  # loads local state dict
-            3. ``_sharded_state_dict/_load_sharded_state_dict``: this pair of APIs
-               return and load sharded, unflattened parameters. The ``state_dict``
-               return by ``sharded_state_dict`` can be used by all other parallel
-               schemes (resharding may be required).
-    """
-
-    FULL_STATE_DICT = auto()
-    LOCAL_STATE_DICT = auto()
-    SHARDED_STATE_DICT = auto()
-
-
-@dataclass
-class StateDictConfig:
-    """
-    ``StateDictConfig`` is the base class for all state_dict configuration classes.
-    Users should instantiate a child version (i.e. ``FullStateDictConfig``) in
-    order to configure settings for the particular type of ``state_dict``
-    implementation FSDP will use.
-    """
-
-    offload_to_cpu: bool = False
-
-
-@dataclass
-class FullStateDictConfig(StateDictConfig):
-    """
-    ``FullStateDictConfig`` is a config class meant to be used with
-    ``StateDictType.FULL_STATE_DICT``. Currently, it accepts two parameters,
-    ``offload_to_cpu`` and ``rank0_only`` which can be configured to offload
-    the full ``state_dict`` to CPU and to materialize the ``state_dict`` on
-    rank 0 only. When used, it is recommended to enable both of these flags
-    together to optimize memory savings when taking checkpoints. Note that
-    this config class is meant for user via the :func:`state_dict_type`
-    context manager as follows:
-        >>> # xdoctest: +SKIP("undefined variables")
-        >>> fsdp = FSDP(model, auto_wrap_policy=...)
-        >>> cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
-        >>> with FullyShardedDataParallel.state_dict_type(fsdp, StateDictType.FULL_STATE_DICT, cfg):
-        >>>     state = fsdp.state_dict()
-        >>>     # state will be empty on non rank 0 and contain CPU tensors on rank 0.
-        >>> # To reload checkpoint for inference, finetuning, transfer learning, etc:
-        >>> model = model_fn() # Initialize model on CPU in preparation for wrapping with FSDP
-        >>> if dist.get_rank() == 0:
-        >>>     # Load checkpoint only on rank 0 to avoid memory redundancy
-        >>>     state_dict = torch.load("my_checkpoint.pt")
-        >>>     model.load_state_dict(state_dict)
-        >>> # All ranks initialize FSDP module as usual. ``sync_module_states`` argument
-        >>> # communicates loaded checkpoint states from rank 0 to rest of the world.
-        >>> fsdp = FSDP(model, device_id=torch.cuda.current_device(), auto_wrap_policy=..., sync_module_states=True)
-        >>> # After this point, all ranks have FSDP model with loaded checkpoint.
-    """
-
-    rank0_only: bool = False
-
-
-@dataclass
-class LocalStateDictConfig(StateDictConfig):
-    pass
-
-
-@dataclass
-class ShardedStateDictConfig(StateDictConfig):
-    pass
-
-
 class OptimStateKeyType(Enum):
     PARAM_NAME = auto()
     PARAM_ID = auto()
@@ -502,15 +416,12 @@ def __init__(
 
         # `_state_dict_type` controls the `state_dict()` behavior, which is
         # implemented using post-save and pre-load hooks
-        _init_state_dict_state(self)  # TODO: currently a no-op; need to refactor below
-        self._state_dict_type = StateDictType.FULL_STATE_DICT
-        self._state_dict_config = FullStateDictConfig()
+        _init_state_dict_state(self)
         self._register_state_dict_hook(_post_state_dict_hook)
         self._register_load_state_dict_pre_hook(
             _pre_load_state_dict_hook, with_module=True
         )
         self.register_load_state_dict_post_hook(_post_load_state_dict_hook)
-        self._full_param_ctx: Optional[Generator] = None
 
     @property
     def module(self) -> nn.Module:
@@ -785,32 +696,6 @@ def state_dict_type(
                     module, prev_state_dict_type, prev_state_dict_config
                 )
 
-    def _convert_to_wrapped_module_name(self, module_name: str) -> str:
-        module_name = module_name.replace(f"{FSDP_PREFIX}", "")
-        module_name = module_name.replace(f"{FSDP_WRAPPED_MODULE}", "")
-        if module_name:
-            module_name = f"{module_name}."
-        # Activation checkpoint adds a prefix that has to be
-        # removed as well.
-        module_name = module_name.replace(_CHECKPOINT_PREFIX, "")
-        return module_name
-
-    @property
-    def _param_fqns(self) -> Iterator[Tuple[str, str, str]]:
-        if not self._has_params:
-            return
-        for param_name, module_name in self._handles[0].parameter_module_names():
-            module_name = self._convert_to_wrapped_module_name(module_name)
-            fqn = f"{module_name}{param_name}"
-            yield fqn, param_name, module_name
-
-    @property
-    def _shared_param_fqns(self) -> Iterator[Tuple[str, str, str]]:
-        for param_name, module_name in self._handles[0].shared_parameter_module_names():
-            module_name = self._convert_to_wrapped_module_name(module_name)
-            fqn = f"{module_name}{param_name}"
-            yield fqn, param_name, module_name
-
     def state_dict(self, *args, **kwargs):
         _lazy_init(self, self)
         return super().state_dict(*args, **kwargs)

From 1763f2f08601e59ca8c98d8b1cfb48e3a07dfe6e Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 9 Nov 2022 20:39:50 +0000
Subject: [PATCH 0792/1922] [dynamo] Add torch.device to is_safe_constant
 (#88766)

Test Plan:
```
PYTORCH_TEST_WITH_DYNAMO=1 python test/test_torch.py -k  test_advancedindex_mixed_cpu_devices_cuda
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88766
Approved by: https://github.com/jansel
---
 torch/_dynamo/utils.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index ef2c1c38ea8ba..067a808073743 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -583,7 +583,19 @@ def is_safe_constant(v):
     if istype(v, (tuple, frozenset)):
         return all(map(is_safe_constant, v))
     return istype(
-        v, (types.CodeType, int, float, bool, str, bytes, type(None), slice, type(type))
+        v,
+        (
+            types.CodeType,
+            int,
+            float,
+            bool,
+            str,
+            bytes,
+            type(None),
+            slice,
+            type(type),
+            torch.device,
+        ),
     )
 
 
From 3612c39fefacd847bd3d8745ce256375d51459e0 Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Thu, 10 Nov 2022 19:08:42 -0500
Subject: [PATCH 0793/1922] Add context manager to allow mutation on saved
 tensors (#79056)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/79056
Approved by: https://github.com/albanD
---
 test/test_autograd.py   | 178 ++++++++++++++++++++++++++++++++++++++++
 torch/autograd/graph.py | 163 +++++++++++++++++++++++++++++++++++-
 2 files changed, 338 insertions(+), 3 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index f5d890fad2d7f..e08047860e423 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -8778,6 +8778,184 @@ def test_warning_in_backward(self, device):
         with self.assertWarnsRegex(UserWarning, "Warn from backward"):
             b.backward()
 
+class TestAllowMutationOnSaved(TestCase):
+    def assertClonedLenEqual(self, ctx, n):
+        self.assertEqual(len(list(ctx.cloned.items())), n)
+
+    def assertTIDMapLenEqual(self, ctx, n):
+        self.assertEqual(len(list(ctx.tid_to_weakhandle.items())), n)
+
+    def test_basic(self):
+        a = torch.rand(2, 3, requires_grad=True)
+
+        def fn(a):
+            b = a.clone()
+            out = (b**2).sum()
+            b.sin_()
+            out.sum().backward()
+            return a.grad
+        msg = "variables needed for gradient computation has been modified by an inplace"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            fn(a)
+
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            da = fn(a)
+
+        self.assertTrue(torch.allclose(a * 2, da))
+        self.assertClonedLenEqual(ctx, 0)
+
+    def test_views(self):
+        a = torch.rand(2, 3, requires_grad=True)
+
+        def fn(a):
+            b = a.clone()
+            c = b.view_as(b)
+            out = (b**2).sum()  # How does this work?
+            c.sin_()
+            out.sum().backward()
+            return a.grad
+
+        msg = "variables needed for gradient computation has been modified by an inplace"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            fn(a)
+
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            da = fn(a)
+
+        self.assertClonedLenEqual(ctx, 0)
+        self.assertTrue(torch.allclose(a * 2, da))
+
+    def test_save_base_and_modify_view(self):
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.rand(2, 3, requires_grad=True)
+            b = a.clone()
+            c = b[:1]
+            out = b**2
+            # modify the view
+            c *= 10
+            # self.assertClonedLenEqual(ctx, 1)
+            out.sum().backward()
+            self.assertClonedLenEqual(ctx, 0)
+
+        self.assertClonedLenEqual(ctx, 0)
+        self.assertTrue(torch.allclose(a * 2, a.grad))
+
+    def test_save_view_modify_base(self):
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.rand(2, 3, requires_grad=True)
+            b = a.clone()
+            c = b[:]
+            out = (c**2).sum()
+            b *= 2
+            out.backward()
+            self.assertTrue(torch.allclose(a * 2, a.grad))
+
+    def test_double_backward(self):
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.rand(2, 3, requires_grad=True)
+            b = a.clone()
+            out = (b**2).sum()
+            b.sin_()
+            torch.autograd.grad(out, a, create_graph=True)
+            da, = torch.autograd.grad(out, a, create_graph=True)
+            d2a, = torch.autograd.grad(da.sum(), a)
+
+        self.assertTrue(torch.allclose(torch.ones_like(a) * 2, d2a))
+        self.assertClonedLenEqual(ctx, 0)
+
+    def test_saved_but_not_anymore(self):
+        # Make sure we don't clone if the tensor was once saved, but
+        # by the time we do in-place, it is no longer saved
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.randn(2, 3, requires_grad=True).clone()
+            out = (a**2).sum()
+            self.assertTIDMapLenEqual(ctx, 1)
+            self.assertClonedLenEqual(ctx, 0)
+            out.backward()
+            a.sin_()
+            self.assertClonedLenEqual(ctx, 0)
+            out = (a**2).sum()
+            a.sin_()
+            self.assertClonedLenEqual(ctx, 1)
+            del out
+            self.assertClonedLenEqual(ctx, 0)
+
+    def test_saved_same_tensor_many_times(self):
+        # We should only clone once
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.randn(2, 3, requires_grad=True).clone()
+            b = a**2
+            c = a**2
+            a.sin_()
+            self.assertClonedLenEqual(ctx, 1)
+            del b, c
+            self.assertClonedLenEqual(ctx, 0)
+
+    def test_saved_same_tensor_different_versions(self):
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.randn(2, 3, requires_grad=True).clone()
+            b = a**2
+            a.sin_()
+            c = a**2
+            a.sin_()
+            self.assertClonedLenEqual(ctx, 2)
+            del b
+            self.assertClonedLenEqual(ctx, 1)
+            del c
+            self.assertClonedLenEqual(ctx, 0)
+
+    def test_with_math_views(self):
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.tensor([1 + 1j], requires_grad=True).clone()
+            b = a.conj()
+            out = (b**2).sum()
+            a.sin_()
+            out.backward()
+
+            a = torch.tensor([1 + 1j], requires_grad=True).clone()
+            b = a.conj()
+            out = (b**2).sum()
+            # in this case, it is no longer a view it seems
+            b.sin_()
+            out.backward()
+
+    def test_with_out_variant(self):
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.tensor([1.], requires_grad=True)
+            b = torch.tensor([1.])
+            c = torch.tensor([2.])
+            out = a * b
+            self.assertTIDMapLenEqual(ctx, 1)
+            torch.sin(c, out=b)
+            self.assertClonedLenEqual(ctx, 1)
+            out.backward()
+            self.assertClonedLenEqual(ctx, 0)
+
+    def test_backward_out_of_context(self):
+        # Out of context
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.rand(2, 3, requires_grad=True)
+            out = (a**2).sum()
+
+        msg = "Trying to backward outside of the 'allow_mutation_on_saved_tensors' context"
+        with self.assertRaisesRegex(RuntimeError, msg):
+            out.backward()
+
+        # Different context
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            a = torch.rand(2, 3, requires_grad=True)
+            out = (a**2).sum()
+
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            with self.assertRaisesRegex(RuntimeError, msg):
+                out.backward()
+
+    def test_disallow_nesting(self):
+        with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+            msg = "allow_mutation_on_saved_tensors contexts cannot be nested"
+            with self.assertRaisesRegex(RuntimeError, msg):
+                with torch.autograd.graph.allow_mutation_on_saved_tensors() as ctx:
+                    pass
 
 class TestAutogradInferenceMode(TestCase):
     def _is_inference_tensor(self, tensor):
diff --git a/torch/autograd/graph.py b/torch/autograd/graph.py
index 9c333c70bcf22..fc490a9d8e31c 100644
--- a/torch/autograd/graph.py
+++ b/torch/autograd/graph.py
@@ -1,15 +1,17 @@
 import torch
 import contextlib
-from typing import Callable, Any, Dict, Tuple, Optional, Sequence, List
+from typing import Callable, Any, Dict, Tuple, Optional, Sequence, List, Set
 from torch.utils.hooks import RemovableHandle
-
-__all__ = ["saved_tensors_hooks", "save_on_cpu"]
+from torch.utils._python_dispatch import TorchDispatchMode
+from collections import defaultdict
+import weakref
 
 __all__ = [
     "saved_tensors_hooks",
     "save_on_cpu",
     "disable_saved_tensors_hooks",
     "register_multi_grad_hook",
+    "allow_mutation_on_saved_tensors",
 ]
 
 class saved_tensors_hooks():
@@ -270,3 +272,158 @@ def __setstate__(self, state):
         handles.append(t.register_hook(get_inner_hook(i)))
 
     return Handle(tuple(handles))
+
+
+# NOTE [Allow mutation on tensors saved for backward]
+#
+# 1. Tensor gets saved for backward
+#    - remember the python object id and the version of the tensor
+#    - remember aliasing information (data_ptr of base + version)
+#    - save the original so we control its lifetime
+# 2. Any time a tensor gets in-placed
+#    - for each tensor aliased to it:
+#      - check using its object id and version to see if it has been saved
+#      - if it has been saved, clone it
+#      - delete the reference to the original
+# 3. during backward
+#    - if the clone exists, the tensor must've been modified in-place
+_allow_mutation_on_saved_tensors_enabled = False
+
+def _get_tid(t) -> Tuple[int, int, int]:
+    return (id(t), t.data_ptr(), t._version)
+
+def _get_sid(t) -> Tuple[int, int]:
+    return (t.data_ptr(), t._version)
+
+class _Handle():
+    pass
+
+class _swap_with_cloned(saved_tensors_hooks):
+    def __init__(self, ctx):
+        def pack_hook(t):
+            tid = _get_tid(t)
+            sid = _get_sid(t)
+            # Tensors saved for backward have an entry in _tid_to_weakhandle
+            handle: Optional[_Handle] = None
+
+            # Save aliasing information
+            ctx.sid_to_tid[sid].add(tid)
+
+            # NB: The same tensor (of the same version) can be saved multiple times
+            if tid not in ctx.tid_to_weakhandle:
+                handle = _Handle()
+                ctx.tid_to_weakhandle[tid] = handle
+                ctx.original[handle] = t
+            else:
+                # Store an additional strong reference to the handle
+                handle = ctx.tid_to_weakhandle[tid]
+            return handle
+
+        def unpack_hook(tup):
+            handle = tup
+            error_msg = (
+                "Trying to backward outside of the 'allow_mutation_on_saved_tensors' context"
+                "in which the graph was originally recorded.")
+            assert _allow_mutation_on_saved_tensors_enabled, error_msg
+            if handle in ctx.cloned:
+                res = ctx.cloned[handle]
+            else:
+                assert handle in ctx.original, error_msg
+                res = ctx.original[handle]
+            return res
+
+        super().__init__(pack_hook, unpack_hook)
+
+class _CloneArgBeforeMutateMode(TorchDispatchMode):
+    def __init__(self, ctx):
+        self.ctx = ctx
+
+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+        kwargs = kwargs or {}
+
+        for idx, arg in enumerate(func._schema.arguments):
+            if arg.alias_info is not None and arg.alias_info.is_write:
+                t = kwargs["out"] if arg.is_out else args[idx]
+                tid = _get_tid(t)
+                sid = _get_sid(t)
+                ctx = self.ctx
+                if sid in ctx.sid_to_tid:
+                    for tid in ctx.sid_to_tid[sid]:
+                        if tid not in ctx.tid_to_weakhandle:
+                            # We know that if tid is in sid_to_tid, then it must also be in
+                            # tid_to_weakhandle. However, it is possible for the tensor to be
+                            # saved at one point, but cleared by backward before it is modified
+                            # in-place. Consider the following example:
+                            #
+                            # >>> a = torch.randn(2, 3, requires_grad=True).clone()
+                            # >>> out = (a**2).sum()
+                            # >>> out.backward()
+                            # >>> a.sin_()
+                            continue
+                        handle = ctx.tid_to_weakhandle[tid]
+                        if handle in ctx.cloned:
+                            # The same exact tensor has been cloned already
+                            continue
+                        ctx.cloned[handle] = ctx.original[handle].clone()
+                        del ctx.original[handle]
+
+        rs = func(*args, **kwargs)
+        return rs
+
+class _AllowMutationOnSavedContext():
+    def __init__(self):
+        self.cloned: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+        self.original: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+        self.tid_to_weakhandle: weakref.WeakValueDictionary = weakref.WeakValueDictionary()
+        self.sid_to_tid: Dict[Tuple[int, int], Set[Tuple[int, int, int]]] = defaultdict(set)
+
+    def clear(self):
+        self.cloned.clear()
+        self.original.clear()
+        self.tid_to_weakhandle.clear()
+        self.sid_to_tid.clear()
+
+@contextlib.contextmanager
+def allow_mutation_on_saved_tensors():
+    """Context manager under which mutating tensors saved for backward is allowed
+
+    Under this context manager, tensors saved for backward are cloned on mutation,
+    so the original version can still be used during backward. Normally, mutating a tensor
+    saved for backward will result in an error raised when it's used during backward.
+
+    To ensure the correct behavior, both the forward and backward should be run under
+    the same context manager.
+
+    returns:
+        An _AllowMutationOnSavedContext object storing the state managed by this
+        context manager. This object can be useful for debugging purposes. The state
+        managed by the context manager is automatically cleared upon exiting.
+
+    Example::
+
+        >>> import torch
+        >>> with torch.autograd.graph.allow_mutation_on_saved_tensors():
+        ...     # forward
+        ...     a = torch.ones(2, 3, requires_grad=True)
+        ...     b = a.clone()
+        ...     out = (b**2).sum()
+        ...     b.sin_()
+        ...     # backward
+        ...     out.sum().backward()
+        ...
+        tensor([[0.8415, 0.8415, 0.8415],
+                [0.8415, 0.8415, 0.8415]], grad_fn=<SinBackward0>)
+    """
+    global _allow_mutation_on_saved_tensors_enabled
+
+    ctx = _AllowMutationOnSavedContext()
+
+    with _swap_with_cloned(ctx), _CloneArgBeforeMutateMode(ctx):
+        try:
+            if _allow_mutation_on_saved_tensors_enabled:
+                raise RuntimeError("allow_mutation_on_saved_tensors contexts cannot be nested")
+            _allow_mutation_on_saved_tensors_enabled = True
+            yield ctx
+        finally:
+            ctx.clear()
+            _allow_mutation_on_saved_tensors_enabled = False

From 973f24b7c2569a47d6cd21864c862f1c7c796673 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Thu, 10 Nov 2022 19:54:56 +0000
Subject: [PATCH 0794/1922] [FSDP][state_dict][3/N] Change how state_dict utils
 access attributes in _FSDPState (#88635)

**What This PR Does**
_state_dict_utils currently accesses the FSDP states through module. To enable composable FSDP state_dict, these accesses need to go through _FSDPState. module is still required for most APIs as state_dict has to access per-module information.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88635
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/_common_utils.py     |  18 ++
 torch/distributed/fsdp/_state_dict_utils.py | 260 ++++++++++++--------
 2 files changed, 177 insertions(+), 101 deletions(-)

diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index c93c8abb5ebd8..f6ccc3e9243f8 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -61,6 +61,24 @@ def _all_handles(state: _FSDPState) -> List:
     )
 
 
+@no_type_check
+def _module_handles(state: _FSDPState, module: nn.Module) -> List:
+    """
+    Given a module and returns the flat handles that map to this module. If the
+    module is FullyShardedDataParallel, the module._handles will be returned.
+    """
+    if _is_composable(state):
+        return state._module_to_handles[module][:]
+    else:
+        return module._handles[:]
+
+
+@no_type_check
+def _has_fsdp_params(state: _FSDPState, module: nn.Module) -> bool:
+    """Given a module and returns if this module has parameters sharded by FSDP."""
+    return len(_module_handles(state, module)) > 0
+
+
 def clean_tensor_name(tensor_name: str) -> str:
     """
     Cleans the parameter or buffer name by removing any module wrapper
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index c90bd4d409b1f..0bfd149b0112c 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -1,7 +1,7 @@
 import functools
 import math
 import warnings
-from typing import Any, Callable, cast, Dict, Iterator, Tuple
+from typing import Any, Callable, cast, Dict, Iterator, no_type_check, Tuple
 
 import torch
 import torch.distributed as dist
@@ -18,6 +18,9 @@
     ShardedTensor,
 )
 from torch.distributed.fsdp._common_utils import (
+    _FSDPState,
+    _has_fsdp_params,
+    _module_handles,
     clean_tensor_name,
     FSDP_PREFIX,
     FSDP_WRAPPED_MODULE,
@@ -51,24 +54,28 @@ def _convert_to_wrapped_module_name(module_name: str) -> str:
     return module_name
 
 
-def _param_fqns(module) -> Iterator[Tuple[str, str, str]]:
-    if not module._has_params:
+def _param_fqns(module, fsdp_state: _FSDPState) -> Iterator[Tuple[str, str, str]]:
+    if not _has_fsdp_params(fsdp_state, module):
         return
-    for param_name, module_name in module._handles[0].parameter_module_names():
+    for param_name, module_name in _module_handles(fsdp_state, module)[
+        0
+    ].parameter_module_names():
         module_name = _convert_to_wrapped_module_name(module_name)
         fqn = f"{module_name}{param_name}"
         yield fqn, param_name, module_name
 
 
-def _shared_param_fqns(module) -> Iterator[Tuple[str, str, str]]:
-    for param_name, module_name in module._handles[0].shared_parameter_module_names():
+def _shared_param_fqns(module, fsdp_state) -> Iterator[Tuple[str, str, str]]:
+    for param_name, module_name in _module_handles(fsdp_state, module)[
+        0
+    ].shared_parameter_module_names():
         module_name = _convert_to_wrapped_module_name(module_name)
         fqn = f"{module_name}{param_name}"
         yield fqn, param_name, module_name
 
 
 def _enter_full_param_ctx(
-    module,
+    fsdp_state: _FSDPState,
     recurse: bool = False,
     writeback: bool = False,
     rank0_only: bool = False,
@@ -80,53 +87,56 @@ def _enter_full_param_ctx(
     requires to enter the context in the pre-hook but leave the context in the
     post-hook. This API enters the context of ``summon_full_params``.
     """
-    assert module._full_param_ctx is None, (
-        "Entering the ``summon_full_params`` context but module._full_param_ctx "
+    assert fsdp_state._full_param_ctx is None, (
+        "Entering the ``summon_full_params`` context but fsdp_state._full_param_ctx "
         "is not None."
     )
-    assert module.training_state != TrainingState.SUMMON_FULL_PARAMS, (
+    assert fsdp_state.training_state != TrainingState.SUMMON_FULL_PARAMS, (
         "Entering the summon_full_params context but the state is already "
         "SUMMON_FULL_PARAMS."
     )
-    module._full_param_ctx = module._summon_full_params(
+    fsdp_state._full_param_ctx = fsdp_state._summon_full_params(
         recurse=recurse,
         writeback=writeback,
         rank0_only=rank0_only,
         offload_to_cpu=offload_to_cpu,
         with_grads=with_grads,
     )
-    module._full_param_ctx.__enter__()
+    fsdp_state._full_param_ctx.__enter__()
 
 
-def _exit_full_param_ctx(module) -> None:
+@no_type_check
+def _exit_full_param_ctx(fsdp_state: _FSDPState) -> None:
     """A helper function to exit ``summon_full_params`` context."""
-    assert module.training_state == TrainingState.SUMMON_FULL_PARAMS, (
+    assert fsdp_state.training_state == TrainingState.SUMMON_FULL_PARAMS, (
         "Exiting the summon_full_params context but the state is not "
         "SUMMON_FULL_PARAMS."
     )
-    assert module._full_param_ctx is not None
-    module._full_param_ctx.__exit__(None, None, None)
-    module._full_param_ctx = None
+    assert fsdp_state._full_param_ctx is not None
+    fsdp_state._full_param_ctx.__exit__(None, None, None)
+    fsdp_state._full_param_ctx = None
 
 
 def _common_pre_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
     """Performs the pre-state_dict tasks shared by all state_dict types."""
     if torch.cuda.is_available():
         torch.cuda.synchronize()
-    _lazy_init(module, module)
+    # TODO: need to check if this is always correct for composable FSDP.
+    _lazy_init(fsdp_state, module)
     # TODO: change to this call after pre_state_dict_hook is in `nn.Module`.
-    # if module.is_root:
-    #    _clear_grads_if_needed(module._fsdp_handles(module))
-    if module._has_params:
-        _clear_grads_if_needed([module._handles[0]])
+    # if fsdp_state.is_root:
+    #    _clear_grads_if_needed(_all_handles(fsdp_state))
+    if _has_fsdp_params(fsdp_state, module):
+        _clear_grads_if_needed([_module_handles(fsdp_state, module)[0]])
 
 
 def _common_summon_pre_state_dict_hook(
-    module,
+    fsdp_state: _FSDPState,
     offload_to_cpu: bool,
     rank0_only: bool,
 ) -> None:
@@ -135,7 +145,7 @@ def _common_summon_pre_state_dict_hook(
     ``summon_full_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this hook.
     """
     _enter_full_param_ctx(
-        module,
+        fsdp_state,
         recurse=False,
         writeback=False,
         offload_to_cpu=offload_to_cpu,
@@ -144,8 +154,10 @@ def _common_summon_pre_state_dict_hook(
 
 
 # TODO: change to the decorator style. See ``_full_pre_state_dict_hook``.
+@no_type_check
 def _common_summon_post_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
     param_hook: Callable,
@@ -157,17 +169,17 @@ def _common_summon_post_state_dict_hook(
     """
     _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix)
     assert (
-        module.training_state == TrainingState.SUMMON_FULL_PARAMS
+        fsdp_state.training_state == TrainingState.SUMMON_FULL_PARAMS
     ), "Inside the post_state_dict_hook but the state is not SUMMON_FULL_PARAMS."
     # Return early for trivial cases
-    if not state_dict or not module._has_params:
-        _exit_full_param_ctx(module)
+    if not state_dict or not _has_fsdp_params(fsdp_state, module):
+        _exit_full_param_ctx(fsdp_state)
         return state_dict
 
     # TODO: Once pre_state_dict hook is supported, this pop should be removed.
     # For `use_orig_params=True`, the `FlatParameter` is not registered, so
     # there is no entry in the state dict for it to pop.
-    if not module._use_orig_params:
+    if not fsdp_state._use_orig_params:
         state_dict.pop(f"{prefix}{fsdp_file.FLAT_PARAM}")
 
     # If a rank does not have unsharded parameters(when `rank0_only=True`
@@ -175,25 +187,25 @@ def _common_summon_post_state_dict_hook(
     # all-gather and does not need to save the # state dict. We simply check
     # rank0_only to ensure this issue.
     rank0_only = (
-        module._state_dict_type == StateDictType.FULL_STATE_DICT
-        and cast(FullStateDictConfig, module._state_dict_config).rank0_only
+        fsdp_state._state_dict_type == StateDictType.FULL_STATE_DICT
+        and cast(FullStateDictConfig, fsdp_state._state_dict_config).rank0_only
     )
     # no_fsdp_return means the state_dict returned by this rank should contain
     # only non-FSDP controlled parameters and buffers.
-    no_fsdp_return = rank0_only and module.rank != 0
-    if no_fsdp_return and not module._use_orig_params:
-        for clean_key in module._buffer_names:
+    no_fsdp_return = rank0_only and fsdp_state.rank != 0
+    if no_fsdp_return and not fsdp_state._use_orig_params:
+        for clean_key in fsdp_state._buffer_names:
             # This is a hack to support activation checkpoint.
             clean_key = clean_key.replace(
                 f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", ""
             )
             state_dict.pop(f"{prefix}{clean_key}", None)
-        _exit_full_param_ctx(module)
+        _exit_full_param_ctx(fsdp_state)
         return state_dict
 
     # Loop only the parameters saved in this instance's wrapped module to
     # avoid processing buffers.
-    for fqn, param_name, module_name in _param_fqns(module):
+    for fqn, param_name, module_name in _param_fqns(module, fsdp_state):
         # TODO: remove the parameter retrieval. See ``_full_pre_state_dict_hook``.
         param = functools.reduce(getattr, fqn.split("."), module.module)
         fqn = f"{prefix}{fqn}"
@@ -205,16 +217,16 @@ def _common_summon_post_state_dict_hook(
             f"FSDP assumes {fqn} is in the state_dict but the state_dict only "
             f"has {state_dict.keys()}. "
             f"prefix={prefix}, module_name={module_name}, "
-            f"param_name={param_name} rank={module.rank}."
+            f"param_name={param_name} rank={fsdp_state.rank}."
         )
 
-        param_hook(module, state_dict, prefix, fqn)
-    _exit_full_param_ctx(module)
+        param_hook(state_dict, prefix, fqn)
+    _exit_full_param_ctx(fsdp_state)
 
     cpu_device = torch.device("cpu")
     buffer_clean_fqns = []
     buffers = []
-    for clean_key in module._buffer_names:
+    for clean_key in fsdp_state._buffer_names:
         # This is a hack to support activation checkpoint.
         clean_key = clean_tensor_name(clean_key)
         fqn = f"{prefix}{clean_key}"
@@ -225,22 +237,29 @@ def _common_summon_post_state_dict_hook(
             state_dict.pop(fqn)
         else:
             buffer = state_dict[fqn]
-            if module._state_dict_config.offload_to_cpu and buffer.device != cpu_device:
+            if (
+                fsdp_state._state_dict_config.offload_to_cpu
+                and buffer.device != cpu_device
+            ):
                 state_dict[fqn] = buffer.to(cpu_device)
             # TODO: for composable FSDP, this should be clean_tensor_name(clean_key),
             buffer_clean_fqns.append(clean_key)
             buffers.append(state_dict[fqn])
-    if buffers and module._mixed_precision_enabled_for_buffers():
-        buffer_dtypes = _get_buffer_dtypes(module, buffer_clean_fqns)
-        _cast_buffers_to_dtype_and_device(buffers, buffer_dtypes, module.compute_device)
+    if buffers and fsdp_state._mixed_precision_enabled_for_buffers():
+        buffer_dtypes = _get_buffer_dtypes(fsdp_state, buffer_clean_fqns)
+        _cast_buffers_to_dtype_and_device(
+            buffers, buffer_dtypes, fsdp_state.compute_device
+        )
         for buffers, clean_fqn in zip(buffers, buffer_clean_fqns):
             fqn = f"{prefix}{clean_fqn}"
             state_dict[fqn] = buffer.clone()
     return state_dict
 
 
+@no_type_check
 def _full_pre_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
@@ -254,16 +273,18 @@ def _full_pre_state_dict_hook(
     TODO: clean the callsites and hacks after ``pre_state_dict_hook` ` is supported
     in ``nn.Module``.
     """
-    _common_pre_state_dict_hook(module, state_dict, prefix)
+    _common_pre_state_dict_hook(module, fsdp_state, state_dict, prefix)
     _common_summon_pre_state_dict_hook(
-        module,
-        offload_to_cpu=module._state_dict_config.offload_to_cpu,
-        rank0_only=cast(FullStateDictConfig, module._state_dict_config).rank0_only,
+        fsdp_state,
+        offload_to_cpu=fsdp_state._state_dict_config.offload_to_cpu,
+        rank0_only=cast(FullStateDictConfig, fsdp_state._state_dict_config).rank0_only,
     )
 
 
+@no_type_check
 def _full_post_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> Dict[str, Any]:
@@ -274,10 +295,9 @@ def _full_post_state_dict_hook(
     the ``FSDP_WRAPPED_MODULE`` prefix.
     """
     # TODO: remove the hack. See ``_full_pre_state_dict_hook``.
-    _full_pre_state_dict_hook(module, state_dict, prefix)
+    _full_pre_state_dict_hook(module, fsdp_state, state_dict, prefix)
 
     def param_hook(
-        module,
         state_dict: Dict[str, Any],
         prefix: str,
         fqn: str,
@@ -292,7 +312,7 @@ def param_hook(
 
         # Clone non-ignored parameters before exiting the
         # `_summon_full_params()` context
-        if clean_key not in module._ignored_param_names and not getattr(
+        if clean_key not in fsdp_state._ignored_param_names and not getattr(
             state_dict[fqn], "_has_been_cloned", False
         ):
             try:
@@ -300,31 +320,37 @@ def param_hook(
                 state_dict[fqn]._has_been_cloned = True  # type: ignore[attr-defined]
             except BaseException as e:
                 warnings.warn(
-                    f"Failed to clone() tensor with name {fqn} on rank {module.rank}. "
+                    f"Failed to clone() tensor with name {fqn} on rank {fsdp_state.rank}. "
                     "This may mean that this state_dict entry could point to invalid "
                     "memory regions after returning from state_dict() call if this "
                     "parameter is managed by FSDP. Please check clone "
                     f"implementation of {fqn}. Error: {str(e)}"
                 )
 
-    return _common_summon_post_state_dict_hook(module, state_dict, prefix, param_hook)
+    return _common_summon_post_state_dict_hook(
+        module, fsdp_state, state_dict, prefix, param_hook
+    )
 
 
 def _full_pre_load_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
-    _enter_full_param_ctx(module, recurse=False, writeback=True)
+    _enter_full_param_ctx(fsdp_state, recurse=False, writeback=True)
     _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
 
 
-def _full_post_load_state_dict_hook(module, *args, **kwargs) -> None:
-    _exit_full_param_ctx(module)
+def _full_post_load_state_dict_hook(
+    module, fsdp_state: _FSDPState, *args, **kwargs
+) -> None:
+    _exit_full_param_ctx(fsdp_state)
 
 
 def _local_pre_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
@@ -333,16 +359,21 @@ def _local_pre_state_dict_hook(
     hook is not supported by the PyTorch core. So this API is called from
     `_local_post_state_dict_hook()` to simulate the case.
     """
-    if module._has_params and not module._handles[0].uses_sharded_strategy:
+    if (
+        _has_fsdp_params(fsdp_state, module)
+        and not _module_handles(fsdp_state, module)[0].uses_sharded_strategy
+    ):
         raise RuntimeError(
             "``local_state_dict`` can only be used when parameters are flatten "
             "and sharded."
         )
-    _common_pre_state_dict_hook(module, state_dict, prefix)
+    _common_pre_state_dict_hook(module, fsdp_state, state_dict, prefix)
 
 
+@no_type_check
 def _local_post_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> Dict[str, Any]:
@@ -352,42 +383,45 @@ def _local_post_state_dict_hook(
     will happen. The underlying storage is the same.
     """
     # TODO: remove the hack. See ``_full_pre_state_dict_hook``.
-    _local_pre_state_dict_hook(module, state_dict, prefix)
+    _local_pre_state_dict_hook(module, fsdp_state, state_dict, prefix)
 
     _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix)
-    if not module._has_params:
+    if not _has_fsdp_params(fsdp_state, module):
         return state_dict
 
     # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor
     # value as the flat_param but it is a pure Tensor because
     # nn.Module.state_dict() will detach the parameter. Therefore, we need
     # to get flat_param to get the metadata.
-    assert module._handles, "Should have returned early"
-    flat_param = module._handles[0].flat_param
+    assert _module_handles(fsdp_state, module), "Should have returned early"
+    flat_param = _module_handles(fsdp_state, module)[0].flat_param
     # Construct a ShardedTensor from the flat_param.
     full_numel = flat_param._unpadded_unsharded_size.numel()  # type: ignore[attr-defined]
-    shard_offset = flat_param.numel() * module.rank
+    shard_offset = flat_param.numel() * fsdp_state.rank
     valid_data_size = flat_param.numel() - flat_param._shard_numel_padded
     if valid_data_size > 0 and flat_param._shard_numel_padded > 0:
         flat_param = flat_param.narrow(0, 0, valid_data_size)
     local_shards = [
-        Shard.from_tensor_and_offsets(flat_param, [shard_offset], module.rank)
+        Shard.from_tensor_and_offsets(flat_param, [shard_offset], fsdp_state.rank)
     ]
     sharded_tensor = init_from_local_shards(
-        local_shards, full_numel, process_group=module.process_group
+        local_shards, full_numel, process_group=fsdp_state.process_group
     )  # type: ignore[assignment]
-    if module._state_dict_config.offload_to_cpu:
+    if fsdp_state._state_dict_config.offload_to_cpu:
         sharded_tensor = sharded_tensor.cpu()
     state_dict[f"{prefix}{fsdp_file.FLAT_PARAM}"] = sharded_tensor
     return state_dict
 
 
-def _local_post_load_state_dict_hook(module, *args, **kwargs) -> None:
+def _local_post_load_state_dict_hook(
+    module, fsdp_state: _FSDPState, *args, **kwargs
+) -> None:
     pass
 
 
 def _local_pre_load_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
@@ -399,7 +433,7 @@ def _local_pre_load_state_dict_hook(
     _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_PREFIX}")
     fqn = f"{prefix}{FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"
     if fqn not in state_dict:
-        assert not module._has_params, (
+        assert not _has_fsdp_params(fsdp_state, module), (
             "No `FlatParameter` in `state_dict` for this FSDP instance "
             "but it has parameters"
         )
@@ -416,7 +450,7 @@ def _local_pre_load_state_dict_hook(
 
     # Get the metadata of the flat_param to decide whether to pad the loaded
     # tensor.
-    flat_param = module._handles[0].flat_param
+    flat_param = _module_handles(fsdp_state, module)[0].flat_param
     assert flat_param is not None
     if flat_param._shard_numel_padded not in (0, flat_param.numel()):
         assert load_tensor.numel() < flat_param.numel(), (
@@ -429,6 +463,7 @@ def _local_pre_load_state_dict_hook(
 
 def _sharded_pre_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
@@ -436,23 +471,28 @@ def _sharded_pre_state_dict_hook(
     Hook that runs before model.state_dict() is called. Check
     ``_full_pre_load_state_dict_hook`` for the detail.
     """
-    if module._has_params and not module._handles[0].uses_sharded_strategy:
+    if (
+        _has_fsdp_params(fsdp_state, module)
+        and not _module_handles(fsdp_state, module)[0].uses_sharded_strategy
+    ):
         raise RuntimeError(
             "``sharded_state_dict`` can only be used when parameters are flatten "
             "and sharded."
         )
-    _common_pre_state_dict_hook(module, state_dict, prefix)
+    _common_pre_state_dict_hook(module, fsdp_state, state_dict, prefix)
     # Setting offload_to_cpu here does not work even if offload_to_cpu is True.
     # We have to create ShardedTensor first then move it to CPU.
     _common_summon_pre_state_dict_hook(
-        module,
+        fsdp_state,
         offload_to_cpu=False,
         rank0_only=False,
     )
 
 
+@no_type_check
 def _sharded_post_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> Dict[str, Any]:
@@ -462,31 +502,38 @@ def _sharded_post_state_dict_hook(
     """
 
     # TODO: remove the hack. See ``_full_pre_state_dict_hook``.
-    _sharded_pre_state_dict_hook(module, state_dict, prefix)
+    _sharded_pre_state_dict_hook(module, fsdp_state, state_dict, prefix)
 
-    def param_hook(module, state_dict: Dict[str, Any], prefix: str, fqn: str):
+    def param_hook(state_dict: Dict[str, Any], prefix: str, fqn: str):
         param = state_dict[fqn]
         sharded_tensor = _ext_chunk_tensor(
             tensor=param,
-            rank=module.rank,
-            world_size=module.world_size,
+            rank=fsdp_state.rank,
+            world_size=fsdp_state.world_size,
             num_devices_per_node=torch.cuda.device_count(),
-            pg=module.process_group,
+            pg=fsdp_state.process_group,
         )
-        if module._state_dict_config.offload_to_cpu:
+        if fsdp_state._state_dict_config.offload_to_cpu:
             sharded_tensor = sharded_tensor.cpu()
         state_dict[fqn] = sharded_tensor
 
-    return _common_summon_post_state_dict_hook(module, state_dict, prefix, param_hook)
+    return _common_summon_post_state_dict_hook(
+        module, fsdp_state, state_dict, prefix, param_hook
+    )
 
 
-def _sharded_post_load_state_dict_hook(module, *args, **kwargs) -> None:
-    if module._use_orig_params:
-        module._register_orig_params()
+@no_type_check
+def _sharded_post_load_state_dict_hook(
+    module, fsdp_state: _FSDPState, *args, **kwargs
+) -> None:
+    if fsdp_state._use_orig_params:
+        fsdp_state._register_orig_params()
 
 
+@no_type_check
 def _sharded_pre_load_state_dict_hook(
     module,
+    fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
@@ -495,19 +542,19 @@ def _sharded_pre_load_state_dict_hook(
     a new FlatParameter and shards the new FlatParameter to the local chunk.
     """
     _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
-    if not module._has_params:
+    if not _has_fsdp_params(fsdp_state, module):
         return
 
-    if not module._handles[0].uses_sharded_strategy:
+    if not _module_handles(fsdp_state, module)[0].uses_sharded_strategy:
         raise RuntimeError(
             "load_sharded_state_dict can only be called when parameters "
             "are flatten and sharded."
         )
 
     nonsharded_tensors = []
-    shared_fqns = [fqn for fqn, _, _ in _shared_param_fqns(module)]
+    shared_fqns = [fqn for fqn, _, _ in _shared_param_fqns(module, fsdp_state)]
     loaded_shapes = []
-    for fqn, _, _ in _param_fqns(module):
+    for fqn, _, _ in _param_fqns(module, fsdp_state):
         full_fqn = f"{prefix}{FSDP_PREFIX}{fqn}"
         param = state_dict.pop(full_fqn)
         if fqn in shared_fqns:
@@ -517,12 +564,12 @@ def _sharded_pre_load_state_dict_hook(
         loaded_shapes.append(param.size())
         assert len(shards) < 2, (
             "Expects 0 or 1 shard per rank "
-            f"but got {len(shards)} shards on rank {module.rank}."
+            f"but got {len(shards)} shards on rank {fsdp_state.rank}."
         )
         param_numel = param.size().numel()
         dim_0_size = param.size()[0]
         chunk_size = (
-            math.ceil(dim_0_size / module.world_size) * param_numel // dim_0_size
+            math.ceil(dim_0_size / fsdp_state.world_size) * param_numel // dim_0_size
         )
         if len(shards) == 1:
             local_tensor = shards[0].tensor.flatten()
@@ -534,14 +581,16 @@ def _sharded_pre_load_state_dict_hook(
         else:
             local_tensor = torch.zeros(chunk_size, dtype=param.dtype).cuda()
         tensor = torch.empty(
-            chunk_size * module.world_size, dtype=local_tensor.dtype
+            chunk_size * fsdp_state.world_size, dtype=local_tensor.dtype
         ).cuda()
-        dist.all_gather_into_tensor(tensor, local_tensor, group=module.process_group)
+        dist.all_gather_into_tensor(
+            tensor, local_tensor, group=fsdp_state.process_group
+        )
         tensor = tensor.narrow(0, 0, param_numel).reshape(param.size())
         nonsharded_tensors.append(tensor)
 
     # Create a new flat_param from the loaded, non-sharded tensors.
-    flat_param = module._handles[0].flat_param
+    flat_param = _module_handles(fsdp_state, module)[0].flat_param
     loaded_flat_param = FlatParamHandle.flatten_params(
         nonsharded_tensors, requires_grad=False
     )
@@ -549,8 +598,8 @@ def _sharded_pre_load_state_dict_hook(
     # Get the chunk from the loaded flat_param for the local rank.
     loaded_flat_tensor, num_to_pad = FlatParamHandle._get_shard(
         loaded_flat_param,
-        module.rank,
-        module.world_size,
+        fsdp_state.rank,
+        fsdp_state.world_size,
     )
     loaded_flat_tensor.to(flat_param.device)
     assert all(s1 == s2 for s1, s2 in zip(loaded_shapes, flat_param._shapes)), (
@@ -567,10 +616,11 @@ def _sharded_pre_load_state_dict_hook(
         f"from the local chunk {flat_param._shard_numel_padded}."
     )
     state_dict[f"{prefix}{FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"] = loaded_flat_tensor
-    if module._use_orig_params:
-        module._deregister_orig_params()
+    if fsdp_state._use_orig_params:
+        fsdp_state._deregister_orig_params()
 
 
+@no_type_check
 @torch.no_grad()
 def _post_state_dict_hook(
     module: nn.Module,
@@ -580,21 +630,24 @@ def _post_state_dict_hook(
 ) -> Dict[str, Any]:
     """
     _post_state_dict_hook() is called after the state_dict() of this
-    FSDP module is executed. ``module._state_dict_type`` is used to decide
+    FSDP module is executed. ``fsdp_state._state_dict_type`` is used to decide
     what postprocessing will be done.
     """
+    # TODO: get the composable state from module
+    fsdp_state: _FSDPState = module
     _post_state_dict_hook_fn = {
         StateDictType.FULL_STATE_DICT: _full_post_state_dict_hook,
         StateDictType.LOCAL_STATE_DICT: _local_post_state_dict_hook,
         StateDictType.SHARDED_STATE_DICT: _sharded_post_state_dict_hook,
     }
     fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
-    processed_state_dict = _post_state_dict_hook_fn[fsdp_module._state_dict_type](
-        fsdp_module, state_dict, prefix
+    processed_state_dict = _post_state_dict_hook_fn[fsdp_state._state_dict_type](
+        fsdp_module, fsdp_state, state_dict, prefix
     )
     return processed_state_dict
 
 
+@no_type_check
 @torch.no_grad()
 def _pre_load_state_dict_hook(
     module: nn.Module,
@@ -604,9 +657,11 @@ def _pre_load_state_dict_hook(
 ) -> None:
     """
     ``_pre_state_dict_hook` is called before ``module._load_from_state_dict()``
-    is called. ``module._state_dict_type`` is used to decide what preprocessing
+    is called. ``fsdp_state._state_dict_type`` is used to decide what preprocessing
     will be done.
     """
+    # TODO: get the composable state from module
+    fsdp_state: _FSDPState = module
     _pre_load_state_dict_hook_fn = {
         StateDictType.FULL_STATE_DICT: _full_pre_load_state_dict_hook,
         StateDictType.LOCAL_STATE_DICT: _local_pre_load_state_dict_hook,
@@ -617,13 +672,16 @@ def _pre_load_state_dict_hook(
     if torch.cuda.is_available():
         torch.cuda.synchronize()
     # Dispatch into state_dict specific implementation of pre-hook.
-    _pre_load_state_dict_hook_fn[fsdp_module._state_dict_type](
-        fsdp_module, state_dict, prefix
+    _pre_load_state_dict_hook_fn[fsdp_state._state_dict_type](
+        fsdp_module, fsdp_state, state_dict, prefix
     )
 
 
+@no_type_check
 @torch.no_grad()
 def _post_load_state_dict_hook(module: nn.Module, *args: Any) -> None:
+    # TODO: get the composable state from module
+    fsdp_state: _FSDPState = module
     _post_load_state_dict_hook_fn = {
         StateDictType.FULL_STATE_DICT: _full_post_load_state_dict_hook,
         StateDictType.LOCAL_STATE_DICT: _local_post_load_state_dict_hook,
@@ -633,4 +691,4 @@ def _post_load_state_dict_hook(module: nn.Module, *args: Any) -> None:
     fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
     # Dispatch into state_dict type specific implementation of post-hook for
     # loading state_dict.
-    _post_load_state_dict_hook_fn[fsdp_module._state_dict_type](fsdp_module)
+    _post_load_state_dict_hook_fn[fsdp_state._state_dict_type](fsdp_module, fsdp_state)

From 9053251efc37a11d95b156402e23bb64cdc4146d Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Fri, 11 Nov 2022 15:24:28 +0000
Subject: [PATCH 0795/1922] Add meta impl for topk (#88694)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88694
Approved by: https://github.com/ezyang
---
 test/functorch/test_aotdispatch.py |  1 -
 test/test_proxy_tensor.py          |  1 -
 torch/_meta_registrations.py       | 17 +++++++++++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 09b65a32bfee9..4da39210343e7 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1214,7 +1214,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('take_along_dim', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('take', ''),  # aten.take.default - couldn't find symbolic meta function/decomposition
     xfail('tensordot', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('topk', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('trace', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('trapz', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 72c7249f4f145..d1a5c9498bcaa 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1337,7 +1337,6 @@ def f(a, b, c, d, e):
     xfail('take_along_dim', ''),  # dtype of indices should be Long but got Float
     xfail('take', ''),  # aten.take.default - couldn't find symbolic meta function/decomposition
     xfail('tensordot', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
-    xfail('topk', ''),  # aten.topk.default - couldn't find symbolic meta function/decomposition
     xfail('trapz', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('trapezoid', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('triangular_solve', ''),  # aten.triangular_solve.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 04c522ab9e3b4..5d583de67d196 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1777,6 +1777,23 @@ def scalar_tensor(s, dtype=None, layout=None, device=None, pin_memory=None):
     )
 
 
+@register_meta(aten.topk.default)
+def topk_meta(self, k, dim=-1, largest=True, sorted=True):
+    # From aten/src/ATen/native/Sorting.cpp
+    dim = maybe_wrap_dim(dim, self.dim(), wrap_scalar=True)
+    check(
+        k >= 0 and k <= (self.size(dim) if self.dim() > 0 else 1),
+        lambda: "selected index k out of range",
+    )
+    sliceSize = 1 if self.dim() == 0 else self.size(dim)
+    check(k >= 0 and k <= sliceSize, lambda: "k not in range for dimension")
+
+    topKSize = list(self.shape)
+    if len(topKSize) > 0:
+        topKSize[dim] = k
+    return self.new_empty(topKSize), self.new_empty(topKSize, dtype=torch.int64)
+
+
 # We must also trigger meta registrations from PrimTorch ref
 # decompositions
 import torch._refs

From ed0a6a3127de31dc38ca1612591c2ccb2064ae2e Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Fri, 11 Nov 2022 13:51:18 +0100
Subject: [PATCH 0796/1922] Symintify `broadcast_to` (#88776)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88776
Approved by: https://github.com/ezyang
---
 .../ATen/functorch/BatchRulesDecompositions.cpp   |  2 +-
 aten/src/ATen/native/TensorShape.cpp              |  4 ++--
 aten/src/ATen/native/native_functions.yaml        |  4 +++-
 test/functorch/test_aotdispatch.py                | 11 ++---------
 test/test_proxy_tensor.py                         | 15 ---------------
 5 files changed, 8 insertions(+), 28 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index 66aaa53bfcc1f..e31b36d112418 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -63,7 +63,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE2(bitwise_or, Scalar);
   OP_DECOMPOSE2(bitwise_xor, Scalar);
   OP_DECOMPOSE(broadcast_tensors);
-  OP_DECOMPOSE(broadcast_to);
+  m.impl("broadcast_to", native::broadcast_to_symint);
   OP_DECOMPOSE(cartesian_prod);
   OP_DECOMPOSE(cdist);
   OP_DECOMPOSE(clip);
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index 31b4011c12813..deb9b949aa5d3 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -537,8 +537,8 @@ Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) {
   return at::sparse_coo_tensor(new_indices, new_values, size)._coalesced_(is_coalesced);
 }
 
-Tensor broadcast_to(const Tensor& self, IntArrayRef size) {
-  return self.expand(size);
+Tensor broadcast_to_symint(const Tensor& self, SymIntArrayRef size) {
+  return self.expand_symint(size);
 }
 
 std::vector<Tensor> broadcast_tensors(TensorList tensors) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 0ea606f5e1fb5..de087c0b8a896 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1195,8 +1195,10 @@
   device_check: NoCheck
   device_guard: False
 
-- func: broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
+- func: broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a)
   variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: broadcast_to_symint
 
 - func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
   variants: function
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 4da39210343e7..f4782b8a595df 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1093,20 +1093,13 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('masked.cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('masked.cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('masked_fill', ''),  # could not find kernel
-    xfail('masked.log_softmax', ''),  # argument 'size' (position 2) must be tuple of ints, not ...
     xfail('masked.logaddexp', ''),  # aten.logaddexp.default - couldn't find symbolic meta function/decomposi...
     xfail('masked.logsumexp', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('masked.mean', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=t...
-    xfail('masked.median', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('masked.norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    # Seems flaky: https://github.com/pytorch/pytorch/issues/88883
+    skip('masked.median', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked.prod', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_scatter', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decompos...
-    xfail('masked.softmax', ''),  # argument 'size' (position 2) must be tuple of ints, not torc...
-    xfail('masked.softmin', ''),  # argument 'size' (position 2) must be tuple of ints, not torc...
-    xfail('masked.std', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=to...
-    xfail('masked.sum', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('masked.var', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=to...
     xfail('matmul', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decompo...
     xfail('median', ''),  # could not find kernel
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index d1a5c9498bcaa..86beb651cb2d1 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1114,23 +1114,8 @@ def f(a, b, c, d, e):
     xfail('linalg.eig'),
     xfail('linalg.eigvals'),
     skip('masked.logsumexp', ''),  # Tensors of type TensorImpl do not have numel
-    xfail('masked.amax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.amin', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.argmax', ''),  # broadcast_to(): argument 'size' (position 2) must be tuple of ints, but found ...
-    xfail('masked.argmin', ''),  # broadcast_to(): argument 'size' (position 2) must be tuple of ints, but found ...
     xfail('masked.cumprod', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.cumsum', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.log_softmax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
     xfail('masked.logaddexp', ''),  # aten.logaddexp.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.mean', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=torch.device, ...
-    xfail('masked.median', ''),  # aten.nanmedian.dim - couldn't find symbolic meta function/decomposition
-    xfail('masked.norm', ''),  # aten.linalg_vector_norm.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.prod', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.softmax', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.softmin', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.std', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=torch.device, d...
-    xfail('masked.sum', ''),  # aten._to_copy.default - couldn't find symbolic meta function/decomposition
-    xfail('masked.var', ''),  # ones() received an invalid combination of arguments - got (torch.Size, device=torch.device, d...
     xfail('addmv', ''),  # aten.addmv.default - couldn't find symbolic meta function/decomposition
     xfail('addr', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('aminmax', ''),  # aten.aminmax.default - couldn't find symbolic meta function/decomposition

From aea224f015e92fd30a774d5e30fe05408fbd3373 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Tue, 8 Nov 2022 19:23:21 +0000
Subject: [PATCH 0797/1922] Regularize bernouilli_ with bernouilli decomp
 (#88349)

Fix for https://github.com/pytorch/torchdynamo/issues/1796. Just like the other [bernouilli decomp](https://github.com/pytorch/pytorch/blob/master/torch/_inductor/decomposition.py#L302) we need to pass `dtype=float32` to avoid `"check_uniform_bounds" not implemented` errors.

Are we planning on enabling `TEST_WITH_TORCHINDUCTOR` ? Do I need to change anything with the tests ?

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88349
Approved by: https://github.com/desertfire
---
 torch/_inductor/decomposition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index d7aa5e35f5010..e8a20c0dbd26e 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -325,7 +325,7 @@ def bernoulli_p(self, p=0.5, *, generator=None):
 
 @register_extra_random_decomp([aten.bernoulli_])
 def bernoulli_(self, p=0.5):
-    return self.copy_(torch.rand_like(self) < p)
+    return self.copy_(torch.rand_like(self, dtype=torch.float32) < p)
 
 
 @functools.lru_cache(None)

From 590b471a57bd3e378601361fdb520dd410643c7b Mon Sep 17 00:00:00 2001
From: Kurt Mohler <kmohler@quansight.com>
Date: Fri, 11 Nov 2022 16:57:05 +0000
Subject: [PATCH 0798/1922] Explicitly check filelike arg of `torch.save`
 (#88867)

Fixes #88793

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88867
Approved by: https://github.com/ezyang
---
 test/test_serialization.py | 9 +++++++++
 torch/serialization.py     | 7 +++++++
 2 files changed, 16 insertions(+)

diff --git a/test/test_serialization.py b/test/test_serialization.py
index 779d6fb5c20c5..5ccc6f47b4c5d 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -585,6 +585,15 @@ def test_serialization_filelike_exceptions(self):
         with self.assertRaises(TypeError):
             # Tries to serialize str into tensor with wrong callable write property
             torch.save('foo', x)
+        s_data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        s = torch.CharStorage(s_data)
+        with self.assertRaises(AttributeError):
+            # Tries to serialize list into CharStorage
+            torch.save(s_data, s)
+        x = torch.randint(10, (3, 3), dtype=torch.float).cpu().numpy()
+        with self.assertRaises(AttributeError):
+            # Tries to serialize ndarray into ndarray
+            torch.save(x, x)
 
 
     def test_serialization_storage_slice(self):
diff --git a/torch/serialization.py b/torch/serialization.py
index d123a955ad966..3078e57587be6 100644
--- a/torch/serialization.py
+++ b/torch/serialization.py
@@ -375,6 +375,12 @@ def _check_dill_version(pickle_module) -> None:
                 pickle_module.__version__
             ))
 
+def _check_save_filelike(f):
+    if not isinstance(f, (str, os.PathLike)) and not hasattr(f, 'write'):
+        raise AttributeError((
+            "expected 'f' to be string, path, or a file-like object with "
+            "a 'write' attribute"))
+
 def save(
     obj: object,
     f: FILE_LIKE,
@@ -422,6 +428,7 @@ def save(
         >>> torch.save(x, buffer)
     """
     _check_dill_version(pickle_module)
+    _check_save_filelike(f)
 
     if _use_new_zipfile_serialization:
         with _open_zipfile_writer(f) as opened_zipfile:

From 0318582d2647719ac7a7106b62628206531e17f5 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 11 Nov 2022 17:03:25 +0000
Subject: [PATCH 0799/1922] Revert "[Autograd] Use in-place input accumulation
 fast path for dense Tensors. (#88339)"

This reverts commit 8f66ae413f8c9d7f2418d7f0b9f69d409c455b46.

Reverted https://github.com/pytorch/pytorch/pull/88339 on behalf of https://github.com/mehtanirav due to Internal test failures
---
 torch/csrc/autograd/input_buffer.cpp | 54 ++++++++--------------------
 1 file changed, 14 insertions(+), 40 deletions(-)

diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp
index 7e6df0cea8da0..6cc6acefc9d45 100644
--- a/torch/csrc/autograd/input_buffer.cpp
+++ b/torch/csrc/autograd/input_buffer.cpp
@@ -4,7 +4,6 @@
 #include <ATen/SparseCsrTensorUtils.h>
 #include <ATen/SparseTensorUtils.h>
 #include <ATen/TensorOperators.h>
-#include <ATen/TensorSubclassLikeUtils.h>
 
 #include <c10/core/DeviceGuard.h>
 #include <c10/core/Event.h>
@@ -67,18 +66,6 @@ void record_stream_any_impl(Variable& var, c10::Stream& stream) {
     }
   }
 }
-
-bool can_accumulate_inplace(const Variable& v) {
-  return (
-      // `v` is a "vanilla" Tensor
-      !(at::isTensorSubclassLike(v) || v._is_zerotensor() || v.is_nested()) &&
-
-      // with a favorable memory layout
-      v.is_non_overlapping_and_dense() &&
-
-      // and we hold the last reference
-      v.use_count() == 1 && v.storage().use_count() == 1);
-}
 } // anonymous namespace
 
 static void accumulate(
@@ -87,38 +74,25 @@ static void accumulate(
     Variable&& var) {
   TORCH_INTERNAL_ASSERT(pos < buffer.size());
   auto& old_var = buffer[pos];
-  // If we hold the last reference to `old_var` AND its storage we will try to
-  // repurpose it to store the output. (Or, if `old_var` is sparse then `var`
-  // becomes the candidate output Tensor.) We only do this if:
-  //  1) GradMode is disabled since Autograd has special handling for inplace
-  //     mutation which we don't want to trigger.
-  //
-  //  2) We hold the last reference.
-  //     (Both `.use_count` and `.storage().use_count()` are one)
-  //
-  //  3) The candidate tensor is a contiguous, non-overlapping, dense, and
-  //     otherwise stock standard Tensor.
-  //
-  //  4) The candidate is mutable. Currently only ZeroTensors are immutable.
-  //
-  //  5) The other Tensor is not a Tensor subclass (except sparse), since
-  //     it's hard to predict the semantics of arbitrary subclass behavior.
-
-  if (at::GradMode::is_enabled()) {
-    buffer[pos] = old_var + var;
-  } else if (
-      // ATen doesn't route sparse additions correctly...
-      old_var.is_sparse() || old_var.is_sparse_csr()) {
-    if (can_accumulate_inplace(var)) {
+  // ATen doesn't route sparse additions correctly...
+  // do dense + sparse in-place if possible
+  if (old_var.is_sparse()) {
+    // It is safe to change the Tensor inplace if the Tensor is only used in
+    // this buffer (this could be the gradient passed by the user) and that no
+    // other Tensor is using the same storage.
+    if (!var.is_sparse() && var.is_contiguous() && var.use_count() == 1 &&
+        var.storage().use_count() == 1) {
       buffer[pos] = var.add_(old_var);
     } else {
       buffer[pos] = var + old_var;
     }
-  } else if (
-      can_accumulate_inplace(old_var) && !at::isTensorSubclassLike(var)) {
-    buffer[pos] = old_var.add_(var);
   } else {
-    buffer[pos] = old_var + var;
+    if (var.is_sparse() && !old_var.is_sparse() && old_var.is_contiguous() &&
+        old_var.use_count() == 1 && old_var.storage().use_count() == 1) {
+      buffer[pos] = old_var.add_(var);
+    } else {
+      buffer[pos] = old_var + var;
+    }
   }
 }
 

From 735169f9498c20c30b9906e16f8d20298dc736e0 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Fri, 11 Nov 2022 04:25:11 +0000
Subject: [PATCH 0800/1922] Take input striding for conv forward based on eager
 output (#88706)

From discussion with @Chillee and @ngimel we'll likely need further fixes to ensure that we hit channels last kernels but this is still worth landing in its own right.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88706
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 26 +++++++++++
 torch/_inductor/ir.py               | 72 +++++++++++++++++------------
 2 files changed, 69 insertions(+), 29 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 121f3d31f39c2..aea8013bdfac8 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4601,6 +4601,8 @@ def fn(a):
     CommonTemplate.install(CudaTests, "cuda")
 
     class CudaReproTests(TestCase):
+        common = check_model_cuda
+
         def test_index_put_issue(self):
             def forward(
                 self,
@@ -4637,6 +4639,30 @@ def forward(
             compiled = compile_fx_inner(mod, inps)
             compiled(inps)
 
+        @requires_cuda()
+        def test_input_channels_last(self):
+            m = torch.nn.Sequential(
+                torch.nn.Conv2d(3, 3, 1, 1),
+                ToTuple(),
+            ).cuda()
+            inp = (
+                torch.randn([2, 3, 16, 16]).to(memory_format=torch.channels_last).cuda()
+            )
+
+            self.common(
+                m,
+                (inp,),
+                check_lowp=False,
+            )
+
+            @torch._dynamo.optimize()
+            def foo(m, inp):
+                return m(inp)
+
+            self.assertTrue(
+                foo(m, inp)[0].is_contiguous(memory_format=torch.channels_last)
+            )
+
         # https://github.com/pytorch/torchdynamo/issues/1681#issuecomment-1283433527
         @requires_cuda()
         def test_unspec_inputs_interop(self):
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 448c057ecb0e1..240c196a73b6d 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -19,7 +19,12 @@
 
 import torch.fx
 import torch.utils._pytree as pytree
-from torch._prims_common import is_boolean_dtype, is_float_dtype
+from torch._prims_common import (
+    is_boolean_dtype,
+    is_float_dtype,
+    make_channels_last_strides_for,
+    make_contiguous_strides_for,
+)
 from torch._subclasses.fake_tensor import FakeTensorMode
 
 from . import config, dependencies
@@ -133,7 +138,7 @@ def ir_node_to_tensor(x, guard_shape=True):
     if is_storage_and_layout(x):
         stride = [shape_fn(s) for s in x.get_layout().stride]
     else:
-        stride = torch._prims_common.make_contiguous_strides_for(size)
+        stride = make_contiguous_strides_for(size)
     dtype = x.get_dtype()
     device = x.get_device()
     t = torch.empty_strided(
@@ -2462,6 +2467,9 @@ def require_stride_order(cls, x, order):
                 x.get_layout(), FixedLayout
             ) and x.get_layout().is_stride_ordered(order):
                 return x
+        # TODO - Storage to InputBuffer
+        if isinstance(x, InputBuffer) and x.get_layout().is_stride_ordered(order):
+            return x
         x = cls.copy_input(x)
         as_storage_and_layout(x, freeze=True, want_contiguous=False, stride_order=order)
         assert is_stride_order_storage_and_layout(x, order)
@@ -3052,9 +3060,32 @@ def create(
         output_padding_: List[int],
         groups: int,
     ):
+        with torch._subclasses.FakeTensorMode():
+            x_fake = ir_node_to_tensor(x, guard_shape=True)
+            weight_fake = ir_node_to_tensor(weight, guard_shape=True)
+            bias_fake = (
+                ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias
+            )
+            output = torch.ops.aten.convolution(
+                x_fake,
+                weight_fake,
+                bias_fake,
+                stride_,
+                padding_,
+                dilation_,
+                transposed,
+                output_padding_,
+                groups,
+            )
+            req_stride_order = get_stride_order(output.stride())
+
+        if config.triton.convolution == "aten":
+            weight = cls.require_stride_order(weight, req_stride_order)
+            x = cls.require_stride_order(x, req_stride_order)
+        else:
+            x = cls.require_stride1(cls.realize_input(x))
+            weight = cls.require_stride1(cls.realize_input(weight))
 
-        weight = cls.require_stride1(cls.realize_input(weight))
-        x = cls.require_stride_order(x, get_stride_order(weight.get_stride()))
         stride = tuple(stride_)
         padding = tuple(padding_)
         dilation = tuple(dilation_)
@@ -3062,22 +3093,6 @@ def create(
         output_padding = tuple(output_padding_)
         assert isinstance(groups, int)
 
-        # TODO - enable FakeTensorMode for propagation more globally. incorrect stride metas for fallback
-        # kernels will lead to runtime failures
-        with FakeTensorMode():
-            output, *_ = cls.process_kernel(
-                torch.ops.aten.convolution,
-                x,
-                weight,
-                bias,
-                stride,
-                padding,
-                dilation,
-                transposed,
-                output_padding,
-                groups,
-            )
-
         output_size = output.shape
 
         weight_shape = [
@@ -3122,6 +3137,7 @@ def create(
         # for conv2d or conv3d, prefer channels last format
         if kernel == "triton_ops.conv":
             output_layout_str = "torch.channels_last"
+
         elif config.tune_layout and len(x.get_size()) == 4:
             from .codegen.autotuner import tuned_conv_layout
 
@@ -3151,14 +3167,19 @@ def create(
             if len(stride_order) < len(output_size):
                 # add batch dim if it exists
                 stride_order = [len(stride_order)] + stride_order
+            strides = make_channels_last_strides_for(output_size)
         else:
             stride_order = list(reversed(range(len(output_size))))
+            strides = make_contiguous_strides_for(output_size)
 
-        output_layout = FlexibleLayout(
+        if config.triton.convolution != "aten":
+            x = cls.require_stride_order(x, stride_order)
+
+        output_layout = FixedLayout(
             x.get_device(),
             x.get_dtype(),
             output_size,
-            stride_order,
+            strides,
         )
 
         if bias is not None:
@@ -3178,13 +3199,6 @@ def create(
                 kernel,
             )
 
-    def apply_constraint(self):
-        x = self.inputs[0]
-        # FixedLayout of input
-        x = self.require_stride_order(x, self.layout.preferred_stride_order)
-        self.inputs[0] = x
-        self.freeze_layout_with_stride_order(self.layout.preferred_stride_order)
-
     def map_args(self):
         # x, w, bias
         in_args = [x.codegen_reference() for x in self.inputs]

From 1d2a2b52a44f58e1e31d8068cf228964426ecb28 Mon Sep 17 00:00:00 2001
From: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Date: Fri, 11 Nov 2022 17:43:46 +0000
Subject: [PATCH 0801/1922] Fix ATen Fallback for BUILD_CAFFE2=0 for ONNX-only
 ops (#88504)

Follow-up for #87735

Once again, because BUILD_CAFFE2=0 is not tested for ONNX exporter, one scenario slipped through. A use case where the model can be exported without aten fallback when operator_export_type=ONNX_ATEN_FALLBACK and BUILD_CAFFE2=0

A new unit test has been added, but it won't prevent regressions if BUILD_CAFFE2=0 is not executed on CI again

Fixes #87313

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88504
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_no_runtime.py | 220 +++++++++++++---------
 torch/onnx/utils.py                       |  19 +-
 2 files changed, 149 insertions(+), 90 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index 622f42effb4ab..89526c71ca387 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -18,7 +18,7 @@
 import torch
 import torch.nn.functional as F
 from torch import Tensor
-from torch.onnx import symbolic_helper, utils
+from torch.onnx import OperatorExportTypes, symbolic_helper, utils
 from torch.onnx._globals import GLOBALS
 from torch.onnx._internal import registration
 from torch.testing._internal import common_quantization, common_utils, jit_utils
@@ -935,6 +935,139 @@ def forward(self, x, w):
 
         torch.onnx.export_to_pretty_string(Mod(), (torch.rand(3, 4), torch.rand(4, 5)))
 
+    @common_utils.skipIfNoCaffe2
+    def test_caffe2_aten_fallback_must_fallback(self):
+        class ModelWithAtenNotONNXOp(torch.nn.Module):
+            def forward(self, x, y):
+                abcd = x + y
+                defg = torch.linalg.qr(abcd)
+                return defg
+
+        # TODO: Refactor common_utils._decide_skip_caffe2 to support parametrize
+        for operator_export_type in (
+            OperatorExportTypes.ONNX_ATEN,
+            OperatorExportTypes.ONNX_ATEN_FALLBACK,
+        ):
+            x = torch.rand(3, 4)
+            y = torch.rand(3, 4)
+            f = io.BytesIO()
+            torch.onnx.export(
+                ModelWithAtenNotONNXOp(),
+                (x, y),
+                f,
+                do_constant_folding=False,
+                operator_export_type=operator_export_type,
+                # support for linalg.qr was added in later op set versions.
+                opset_version=9,
+            )
+            onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+            self.assertAtenOp(onnx_model, "linalg_qr")
+
+    @common_utils.skipIfNoCaffe2
+    def test_caffe2_onnx_aten_must_not_fallback(self):
+        class ModelWithAtenFmod(torch.nn.Module):
+            def forward(self, x, y):
+                return torch.fmod(x, y)
+
+        # TODO: Refactor common_utils._decide_skip_caffe2 to support parametrize
+        for operator_export_type in (
+            OperatorExportTypes.ONNX_ATEN_FALLBACK,
+            OperatorExportTypes.ONNX_ATEN,
+        ):
+            x = torch.randn(3, 4, dtype=torch.float32)
+            y = torch.randn(3, 4, dtype=torch.float32)
+            f = io.BytesIO()
+            torch.onnx.export(
+                ModelWithAtenFmod(),
+                (x, y),
+                f,
+                do_constant_folding=False,
+                operator_export_type=operator_export_type,
+                opset_version=10,  # or higher
+            )
+            onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+            assert onnx_model.graph.node[0].op_type == "Mod"
+
+    @common_utils.skipIfCaffe2
+    def test_aten_fallback_must_fallback(self):
+        class ModelWithAtenNotONNXOp(torch.nn.Module):
+            def forward(self, x, y):
+                abcd = x + y
+                defg = torch.linalg.qr(abcd)
+                return defg
+
+        x = torch.rand(3, 4)
+        y = torch.rand(3, 4)
+        f = io.BytesIO()
+        torch.onnx.export(
+            ModelWithAtenNotONNXOp(),
+            (x, y),
+            f,
+            do_constant_folding=False,
+            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
+            # support for linalg.qr was added in later op set versions.
+            opset_version=9,
+        )
+        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+        self.assertAtenOp(onnx_model, "linalg_qr")
+
+    @common_utils.skipIfCaffe2
+    def test_onnx_aten(self):
+        class ModelWithAtenFmod(torch.nn.Module):
+            def forward(self, x, y):
+                return torch.fmod(x, y)
+
+        x = torch.randn(3, 4, dtype=torch.float32)
+        y = torch.randn(3, 4, dtype=torch.float32)
+        f = io.BytesIO()
+        torch.onnx.export(
+            ModelWithAtenFmod(),
+            (x, y),
+            f,
+            do_constant_folding=False,
+            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN,
+        )
+        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+        self.assertAtenOp(onnx_model, "fmod", "Tensor")
+
+    @common_utils.skipIfCaffe2
+    def test_onnx_aten_fallback_must_not_fallback(self):
+        # For BUILD_CAFFE2=0, aten fallback only when not exportable
+        class ONNXExportable(torch.nn.Module):
+            def __init__(self):
+                super(ONNXExportable, self).__init__()
+                self.quant = torch.quantization.QuantStub()
+                self.fc1 = torch.nn.Linear(12, 8)
+                self.fc2 = torch.nn.Linear(8, 4)
+                self.fc3 = torch.nn.Linear(4, 6)
+                self.dequant = torch.quantization.DeQuantStub()
+
+            def forward(self, x):
+                x = self.quant(x)
+                x = x.view((-1, 12))
+                h = F.relu(self.fc1(x))
+                h = F.relu(self.fc2(h))
+                h = F.relu(self.fc3(h))
+                h = self.dequant(h)
+                return h
+
+        dummy_input = torch.randn(12)
+        f = io.BytesIO()
+        torch.onnx.export(
+            ONNXExportable(),
+            (dummy_input,),
+            f,
+            do_constant_folding=False,
+            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
+        )
+        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
+        all_aten_nodes = [
+            p
+            for p in onnx_model.graph.node
+            if p.op_type == "ATen" and p.domain == "org.pytorch.aten"
+        ]
+        self.assertEqual(len(all_aten_nodes), 0)
+
 
 class TestQuantizeEagerONNXExport(common_utils.TestCase):
     def _test_lower_graph_impl(self, model, data):
@@ -997,91 +1130,6 @@ def test_lower_graph_conv3d(self):
         data = torch.from_numpy(data_numpy).to(dtype=torch.float)
         self._test_lower_graph_impl(model, data)
 
-    @common_utils.skipIfNoCaffe2
-    def test_caffe2_aten_fallback(self):
-        class ModelWithAtenNotONNXOp(torch.nn.Module):
-            def forward(self, x, y):
-                abcd = x + y
-                defg = torch.linalg.qr(abcd)
-                return defg
-
-        x = torch.rand(3, 4)
-        y = torch.rand(3, 4)
-        f = io.BytesIO()
-        torch.onnx.export(
-            ModelWithAtenNotONNXOp(),
-            (x, y),
-            f,
-            do_constant_folding=False,
-            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
-            # support for linalg.qr was added in later op set versions.
-            opset_version=9,
-        )
-        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
-        self.assertAtenOp(onnx_model, "linalg_qr")
-
-    @common_utils.skipIfNoCaffe2
-    def test_caffe2_onnx_aten(self):
-        class ModelWithAtenFmod(torch.nn.Module):
-            def forward(self, x, y):
-                return torch.fmod(x, y)
-
-        x = torch.randn(3, 4, dtype=torch.float32)
-        y = torch.randn(3, 4, dtype=torch.float32)
-        f = io.BytesIO()
-        torch.onnx.export(
-            ModelWithAtenFmod(),
-            (x, y),
-            f,
-            do_constant_folding=False,
-            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN,
-            opset_version=10,  # or higher
-        )
-        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
-        assert onnx_model.graph.node[0].op_type == "Mod"
-
-    @common_utils.skipIfCaffe2
-    def test_aten_fallback(self):
-        class ModelWithAtenNotONNXOp(torch.nn.Module):
-            def forward(self, x, y):
-                abcd = x + y
-                defg = torch.linalg.qr(abcd)
-                return defg
-
-        x = torch.rand(3, 4)
-        y = torch.rand(3, 4)
-        f = io.BytesIO()
-        torch.onnx.export(
-            ModelWithAtenNotONNXOp(),
-            (x, y),
-            f,
-            do_constant_folding=False,
-            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK,
-            # support for linalg.qr was added in later op set versions.
-            opset_version=9,
-        )
-        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
-        self.assertAtenOp(onnx_model, "linalg_qr")
-
-    @common_utils.skipIfCaffe2
-    def test_onnx_aten(self):
-        class ModelWithAtenFmod(torch.nn.Module):
-            def forward(self, x, y):
-                return torch.fmod(x, y)
-
-        x = torch.randn(3, 4, dtype=torch.float32)
-        y = torch.randn(3, 4, dtype=torch.float32)
-        f = io.BytesIO()
-        torch.onnx.export(
-            ModelWithAtenFmod(),
-            (x, y),
-            f,
-            do_constant_folding=False,
-            operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN,
-        )
-        onnx_model = onnx.load(io.BytesIO(f.getvalue()))
-        self.assertAtenOp(onnx_model, "fmod", "Tensor")
-
 
 if __name__ == "__main__":
     common_utils.run_tests()
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index ff0ef755968d3..b30b71812aaef 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -1752,10 +1752,21 @@ def _should_aten_fallback(
     )
     is_caffe2_build = _C_onnx._CAFFE2_ATEN_FALLBACK
 
-    return name.startswith("aten::") and (
-        ((is_onnx_aten_export or is_aten_fallback_export) and not is_caffe2_build)
-        or (not is_exportable_aten_op and is_aten_fallback_export)
-    )
+    if not name.startswith("aten::"):
+        return False
+
+    if is_caffe2_build:
+        if (
+            is_onnx_aten_export or is_aten_fallback_export
+        ) and not is_exportable_aten_op:
+            return True
+    else:
+        if is_onnx_aten_export or (
+            is_aten_fallback_export and not is_exportable_aten_op
+        ):
+            return True
+
+    return False
 
 
 @_beartype.beartype

From 331c7645782f5850e2a7028a0caf4c213a45ea37 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Thu, 10 Nov 2022 21:19:21 +0000
Subject: [PATCH 0802/1922] [FSDP][state_dict][4/N] Move the core logic of
 summon full parameters to _unshard_params_utils.py (#88636)

**What**
`_summon_full_parameters` is required for state_dict. To enable composable FSDP state_dict, `_summon_full_params` must be accessible without FullyShardedDataParall. This PR move the core logic of `_summon_full_params` to `_unshard_params_utils`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88636
Approved by: https://github.com/awgu
---
 test/distributed/fsdp/test_fsdp_state_dict.py |   2 +-
 .../fsdp/test_fsdp_summon_full_params.py      |   4 +-
 torch/distributed/fsdp/_state_dict_utils.py   |  34 ++-
 .../distributed/fsdp/_unshard_param_utils.py  | 254 ++++++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 201 ++------------
 5 files changed, 290 insertions(+), 205 deletions(-)
 create mode 100644 torch/distributed/fsdp/_unshard_param_utils.py

diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py
index 48dad3118db74..ba51ae66ed1b2 100644
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@@ -25,7 +25,7 @@
     StateDictType,
 )
 from torch.distributed.fsdp._shard_utils import _gather_state_dict
-from torch.distributed.fsdp.fully_sharded_data_parallel import FLAT_PARAM
+from torch.distributed.fsdp._unshard_param_utils import FLAT_PARAM
 from torch.distributed.fsdp.wrap import enable_wrap, transformer_auto_wrap_policy, wrap
 from torch.nn import Linear, Module, TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel import DistributedDataParallel
diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py
index 0d4e98069117a..18055dbebffbf 100644
--- a/test/distributed/fsdp/test_fsdp_summon_full_params.py
+++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py
@@ -212,7 +212,7 @@ def forward(self, fsdp_module):
 
         model = FSDP(MyModule()).cuda(self.rank)
         with self.assertRaisesRegex(
-            ValueError, "current state is TrainingState.FORWARD"
+            ValueError, "Current handle state is HandleTrainingState.FORWARD"
         ):
             model(model)
 
@@ -231,7 +231,7 @@ def bad_backwards_hook(tensor):
         output.register_hook(bad_backwards_hook)
 
         with self.assertRaisesRegex(
-            ValueError, "current state is TrainingState.FORWARD_BACKWARD"
+            ValueError, "Current handle state is HandleTrainingState.BACKWARD_PRE"
         ):
             output.backward()
 
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index 0bfd149b0112c..eee5522340b46 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -24,7 +24,6 @@
     clean_tensor_name,
     FSDP_PREFIX,
     FSDP_WRAPPED_MODULE,
-    TrainingState,
 )
 from torch.distributed.fsdp._runtime_utils import (
     _cast_buffers_to_dtype_and_device,
@@ -40,6 +39,11 @@
     _ext_pre_load_state_dict_transform,
     _extensions as _user_extensions,
 )
+from ._unshard_param_utils import (
+    _deregister_orig_params,
+    _register_orig_params,
+    FLAT_PARAM,
+)
 from .flat_param import FlatParamHandle
 
 
@@ -91,10 +95,6 @@ def _enter_full_param_ctx(
         "Entering the ``summon_full_params`` context but fsdp_state._full_param_ctx "
         "is not None."
     )
-    assert fsdp_state.training_state != TrainingState.SUMMON_FULL_PARAMS, (
-        "Entering the summon_full_params context but the state is already "
-        "SUMMON_FULL_PARAMS."
-    )
     fsdp_state._full_param_ctx = fsdp_state._summon_full_params(
         recurse=recurse,
         writeback=writeback,
@@ -108,10 +108,6 @@ def _enter_full_param_ctx(
 @no_type_check
 def _exit_full_param_ctx(fsdp_state: _FSDPState) -> None:
     """A helper function to exit ``summon_full_params`` context."""
-    assert fsdp_state.training_state == TrainingState.SUMMON_FULL_PARAMS, (
-        "Exiting the summon_full_params context but the state is not "
-        "SUMMON_FULL_PARAMS."
-    )
     assert fsdp_state._full_param_ctx is not None
     fsdp_state._full_param_ctx.__exit__(None, None, None)
     fsdp_state._full_param_ctx = None
@@ -168,9 +164,6 @@ def _common_summon_post_state_dict_hook(
     hook.
     """
     _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix)
-    assert (
-        fsdp_state.training_state == TrainingState.SUMMON_FULL_PARAMS
-    ), "Inside the post_state_dict_hook but the state is not SUMMON_FULL_PARAMS."
     # Return early for trivial cases
     if not state_dict or not _has_fsdp_params(fsdp_state, module):
         _exit_full_param_ctx(fsdp_state)
@@ -180,7 +173,7 @@ def _common_summon_post_state_dict_hook(
     # For `use_orig_params=True`, the `FlatParameter` is not registered, so
     # there is no entry in the state dict for it to pop.
     if not fsdp_state._use_orig_params:
-        state_dict.pop(f"{prefix}{fsdp_file.FLAT_PARAM}")
+        state_dict.pop(f"{prefix}{FLAT_PARAM}")
 
     # If a rank does not have unsharded parameters(when `rank0_only=True`
     # and `rank != 0`), then the rank only needed to participate in the
@@ -338,6 +331,7 @@ def _full_pre_load_state_dict_hook(
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
+    _lazy_init(fsdp_state, module)
     _enter_full_param_ctx(fsdp_state, recurse=False, writeback=True)
     _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
 
@@ -409,7 +403,7 @@ def _local_post_state_dict_hook(
     )  # type: ignore[assignment]
     if fsdp_state._state_dict_config.offload_to_cpu:
         sharded_tensor = sharded_tensor.cpu()
-    state_dict[f"{prefix}{fsdp_file.FLAT_PARAM}"] = sharded_tensor
+    state_dict[f"{prefix}{FLAT_PARAM}"] = sharded_tensor
     return state_dict
 
 
@@ -430,8 +424,9 @@ def _local_pre_load_state_dict_hook(
     state_dict. The flat_param should be a ShardedTensor. This hook converts
     the ShardedTensor to a tensor. No copy happen unless padding is required.
     """
+    _lazy_init(fsdp_state, module)
     _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_PREFIX}")
-    fqn = f"{prefix}{FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"
+    fqn = f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}"
     if fqn not in state_dict:
         assert not _has_fsdp_params(fsdp_state, module), (
             "No `FlatParameter` in `state_dict` for this FSDP instance "
@@ -527,7 +522,7 @@ def _sharded_post_load_state_dict_hook(
     module, fsdp_state: _FSDPState, *args, **kwargs
 ) -> None:
     if fsdp_state._use_orig_params:
-        fsdp_state._register_orig_params()
+        _register_orig_params(module, fsdp_state)
 
 
 @no_type_check
@@ -541,6 +536,7 @@ def _sharded_pre_load_state_dict_hook(
     The hook combines the unflattened, sharded parameters (ShardedTensor) to
     a new FlatParameter and shards the new FlatParameter to the local chunk.
     """
+    _lazy_init(fsdp_state, module)
     _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
     if not _has_fsdp_params(fsdp_state, module):
         return
@@ -605,7 +601,7 @@ def _sharded_pre_load_state_dict_hook(
     assert all(s1 == s2 for s1, s2 in zip(loaded_shapes, flat_param._shapes)), (
         f"The original shapes in FSDP are {flat_param._shapes}. "
         f"The loaded shapes are {loaded_shapes}. "
-        f"FSDP extension is {'NOT' if _user_extensions is None else ''} None."
+        f"FSDP extension is {'NOT' if _user_extensions is not None else ''} None."
     )
     assert flat_param.numel() == loaded_flat_tensor.numel(), (
         f"The loaded local chunk has different numel({loaded_flat_tensor.numel()}) "
@@ -615,9 +611,9 @@ def _sharded_pre_load_state_dict_hook(
         f"The loaded local chunk has different padding({num_to_pad}) "
         f"from the local chunk {flat_param._shard_numel_padded}."
     )
-    state_dict[f"{prefix}{FSDP_PREFIX}{fsdp_file.FLAT_PARAM}"] = loaded_flat_tensor
+    state_dict[f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}"] = loaded_flat_tensor
     if fsdp_state._use_orig_params:
-        fsdp_state._deregister_orig_params()
+        _deregister_orig_params(module, fsdp_state)
 
 
 @no_type_check
diff --git a/torch/distributed/fsdp/_unshard_param_utils.py b/torch/distributed/fsdp/_unshard_param_utils.py
new file mode 100644
index 0000000000000..950841850b620
--- /dev/null
+++ b/torch/distributed/fsdp/_unshard_param_utils.py
@@ -0,0 +1,254 @@
+import contextlib
+import warnings
+from typing import cast, Generator, List
+
+import torch
+import torch.nn as nn
+from torch.distributed.fsdp._common_utils import (
+    _FSDPState,
+    _has_fsdp_params,
+    _module_handles,
+    HandleTrainingState,
+)
+from torch.distributed.fsdp._runtime_utils import (
+    _clear_grads_if_needed,
+    _reshard,
+    _reshard_grads,
+    _unshard,
+    _unshard_grads,
+)
+from ._utils import p_assert
+from .flat_param import FlatParamHandle
+
+FLAT_PARAM = "_flat_param"
+
+
+@torch.no_grad()
+def _writeback_to_local_shard(
+    handles: List[FlatParamHandle],
+    writeback_grad: bool,
+):
+    """
+    For each handle, writes back the this rank's shard of the unsharded
+    flattened parameter to the sharded flattened parameter. If
+    ``writeback_grad=True``, then writes back to the sharded gradient as
+    well.
+
+    Precondition: Each handle's ``FlatParameter`` 's data points to the
+    padded unsharded flattened parameter.
+    """
+    for handle in handles:
+        # For `NO_SHARD`, `_local_shard` is the unsharded flattened
+        # parameter and `grad` is the unsharded gradient, so there is no
+        # need to writeback for either
+        if not handle.uses_sharded_strategy:
+            continue
+        assert (
+            handle.flat_param.ndim == 1
+        ), f"Expects `flat_param` to be flattened but got {handle.flat_param.shape}"
+
+        # Get the unpadded shard instead of the padded shard to persist
+        # user changes to the padding (though FSDP does not explicitly
+        # support this)
+        param_shard, _ = FlatParamHandle._get_unpadded_shard(
+            handle.flat_param,
+            handle.rank,
+            handle.world_size,
+        )
+        handle.flat_param._local_shard[: param_shard.numel()].copy_(param_shard)  # type: ignore[attr-defined]
+        if writeback_grad:
+            existing_grad = handle.sharded_grad
+            if existing_grad is not None:
+                assert handle.flat_param.grad is not None
+                grad_shard, _ = FlatParamHandle._get_unpadded_shard(
+                    handle.flat_param.grad,
+                    handle.rank,
+                    handle.world_size,
+                )
+                existing_grad[: grad_shard.numel()].copy_(grad_shard)
+
+
+def _deregister_flat_param(state: _FSDPState, module: nn.Module) -> None:
+    """
+    De-registers the flattened parameter from the wrapped module, hiding it
+    from ``nn.Module`` methods.
+
+    We do not use ``del`` because we want ``FLAT_PARAM`` to always be an
+    attribute but dynamically change whether it is visible to ``nn.Module``
+    methods.
+    """
+    if _has_fsdp_params(state, module):
+        # TODO: figure out the case for the composable APIs.
+        cast(nn.Module, module.module)._parameters.pop(FLAT_PARAM, None)
+
+
+def _register_flat_param(state: _FSDPState, module: nn.Module) -> None:
+    """
+    Registers the flattened parameter to the wrapped module, making it
+    visible to ``nn.Module`` methods.
+
+    We do not use :meth:`nn.Module.register_parameter` because we want
+    ``FLAT_PARAM`` to always be an attribute but dynamically change whether
+    it is visible to ``nn.Module`` methods.
+    """
+    handles = _module_handles(state, module)
+    if _has_fsdp_params(state, module):
+        # TODO: figure out the case for the composable APIs.
+        cast(nn.Module, module.module)._parameters[FLAT_PARAM] = handles[0].flat_param
+
+
+@contextlib.contextmanager
+def _unflatten_as_params(state: _FSDPState, module: nn.Module) -> Generator:
+    """
+    Assumes that the flattened parameter is unsharded. When in the context,
+    de-registers the flattened parameter and unflattens the original
+    parameters as ``nn.Parameter`` views into the flattened parameter.
+    After the context, re-registers the flattened parameter and restores
+    the original parameters as ``Tensor`` views into the flattened
+    parameter.
+    """
+    handles = _module_handles(state, module)
+    if not handles:
+        yield
+    else:
+        _deregister_flat_param(state, module)
+        try:
+            with handles[0].unflatten_as_params():
+                yield
+        finally:
+            if not handles[0]._use_orig_params:
+                _register_flat_param(state, module)
+
+
+@contextlib.contextmanager
+def _unshard_params(
+    module: nn.Module,
+    state: _FSDPState,
+    writeback: bool = True,
+    rank0_only: bool = False,
+    offload_to_cpu: bool = False,
+    with_grads: bool = False,
+):
+    if with_grads and (offload_to_cpu or not state._use_orig_params):
+        raise NotImplementedError(
+            f"with_grads={with_grads} "
+            f"use_orig_params={state._use_orig_params} "
+            f"offload_to_cpu={offload_to_cpu} "
+            f"is not supported yet"
+        )
+    if writeback and rank0_only:
+        raise ValueError(
+            "writeback=True and rank0_only=True is not supported, as model "
+            "parameter shapes will be different across ranks, and writing "
+            "to them can lead to inconsistencies across ranks when the "
+            "context is exited."
+        )
+    if offload_to_cpu and not rank0_only:
+        warnings.warn(
+            "offload_to_cpu and rank0_only=False will result in "
+            "full parameters being redundantly copied to CPU memory for "
+            "GPUs that reside on the same machine, which may incur the risk of "
+            "CPU OOM. It is recommended to use ``offload_to_cpu`` with "
+            "rank0_only=True."
+        )
+
+    torch.cuda.synchronize()
+    # If handles are shared by other module(s), the handle may be already unsharded.
+    handles = [
+        handle
+        for handle in _module_handles(state, module)
+        if handle._training_state != HandleTrainingState.SUMMON_FULL_PARAMS
+    ]
+    if not handles:
+        yield
+        return
+
+    for handle in handles:
+        if handle._training_state != HandleTrainingState.IDLE:
+            raise ValueError(f"Current handle state is {handle._training_state}")
+
+    for handle in handles:
+        handle._training_state = HandleTrainingState.SUMMON_FULL_PARAMS
+
+    _clear_grads_if_needed(handles)
+    free_unsharded_flat_params = [handle.needs_unshard() for handle in handles]
+    # No need to call `wait_stream()` since we unshard in the computation
+    # stream directly
+    computation_stream = torch.cuda.current_stream()
+    _unshard(state, handles, computation_stream, computation_stream)
+    if with_grads:
+        _unshard_grads(handles)
+
+    if rank0_only and state.rank != 0:
+        # Free the unsharded flattened parameter early
+        _reshard(state, handles, free_unsharded_flat_params)
+        if with_grads:
+            _reshard_grads(handles)
+        try:
+            yield
+        finally:
+            for handle in handles:
+                handle._training_state = HandleTrainingState.IDLE
+    else:
+        # Unflatten the unsharded flattened parameters
+        with contextlib.ExitStack() as stack:
+            # Invariant: rank == 0 or !rank0_only
+            for handle in handles:
+                if offload_to_cpu and handle.uses_sharded_strategy:
+                    stack.enter_context(handle.to_cpu())
+                    # TODO (awgu): Since PyTorch enforces that a parameter
+                    # and its gradients need to match metadata (e.g.
+                    # device), we must move gradients to CPU *after* we
+                    # move parameters.
+            # TODO (awgu): This FPW call assumes 1 `FlatParameter`
+            if not state._use_orig_params:
+                stack.enter_context(_unflatten_as_params(state, module))
+            try:
+                yield
+            finally:
+                stack.close()
+                if writeback:
+                    _writeback_to_local_shard(handles, with_grads)
+                _reshard(state, handles, free_unsharded_flat_params)
+                if with_grads:
+                    _reshard_grads(handles)
+                for handle in handles:
+                    handle._training_state = HandleTrainingState.IDLE
+
+
+def _deregister_orig_params(state: _FSDPState, module: nn.Module) -> None:
+    """
+    Deregisters the original parameters; registers the ``FlatParameter``.
+    """
+    handles = _module_handles(state, module)
+    p_assert(
+        len(handles) <= 1,
+        "Expects <=1 handle per FSDP instance; needs to be refactored "
+        "for >1 handle (e.g. non-recursive wrapping)",
+    )
+    if not handles:
+        return
+    handle = handles[0]
+    p_assert(
+        handle._use_orig_params,
+        f"Inconsistent `_use_orig_params` -- FSDP: {state._use_orig_params} "
+        f"handle: {handle._use_orig_params}",
+    )
+    handle._deregister_orig_params()
+    _register_flat_param(state, module)
+
+
+def _register_orig_params(state: _FSDPState, module: nn.Module) -> None:
+    """
+    Deregisters the ``FlatParameter``; registers the original parameters.
+    """
+    handles = _module_handles(state, module)
+    if not handles:
+        return
+    handle = handles[0]
+    _deregister_flat_param(state, module)
+    if handle.is_sharded(handle.flat_param):
+        handle._use_sharded_views()
+        handle._use_sharded_grad_views()
+    else:
+        handle._use_unsharded_views(as_params=True)
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 773686081a4d2..510f90de20234 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -48,18 +48,14 @@
     _init_state_dict_state,
 )
 from torch.distributed.fsdp._runtime_utils import (
-    _clear_grads_if_needed,
     _lazy_init,
     _post_forward,
     _post_forward_reshard,
     _pre_forward,
     _pre_forward_unshard,
     _reshard,
-    _reshard_grads,
     _root_pre_forward,
     _should_free_in_backward,
-    _unshard,
-    _unshard_grads,
     _wait_for_computation_stream,
 )
 from torch.distributed.fsdp._wrap_utils import _auto_wrap
@@ -92,6 +88,12 @@
     _post_state_dict_hook,
     _pre_load_state_dict_hook,
 )
+from ._unshard_param_utils import (
+    _deregister_orig_params,
+    _register_flat_param,
+    _register_orig_params,
+    _unshard_params,
+)
 from ._utils import p_assert
 from .flat_param import FlatParameter, FlatParamHandle
 from .wrap import ParamExecOrderWrapPolicy
@@ -409,7 +411,7 @@ def __init__(
         self._fsdp_wrapped_module = module
         if not use_orig_params:
             _check_orig_params_flattened(self, self._ignored_params)
-            self._register_flat_param()
+            _register_flat_param(self, self)
 
         # Delete to avoid keeping references after the constructor
         delattr(self, "_ignored_params")
@@ -864,153 +866,20 @@ def _summon_full_params(
                 yield
             return
 
-        torch.cuda.synchronize()
         _lazy_init(self, self)
-        self._assert_state([TrainingState.IDLE])
-        for handle in self._handles:
-            assert handle._training_state == HandleTrainingState.IDLE
-        self.training_state = TrainingState.SUMMON_FULL_PARAMS
-        for handle in self._handles:
-            handle._training_state = HandleTrainingState.SUMMON_FULL_PARAMS
-
-        if self._is_root:
-            _clear_grads_if_needed(self._fsdp_handles(self))
-        free_unsharded_flat_params = [
-            handle.needs_unshard() for handle in self._handles
-        ]
-        # No need to call `wait_stream()` since we unshard in the computation
-        # stream directly
-        computation_stream = torch.cuda.current_stream()
-        _unshard(self, self._handles, computation_stream, computation_stream)
-        if with_grads:
-            _unshard_grads(self._handles)
-
-        if rank0_only and self.rank != 0:
-            # Free the unsharded flattened parameter early
-            _reshard(self, self._handles, free_unsharded_flat_params)
-            if with_grads:
-                _reshard_grads(self._handles)
+        with _unshard_params(
+            module=self,
+            state=self,
+            writeback=writeback,
+            rank0_only=rank0_only,
+            offload_to_cpu=offload_to_cpu,
+            with_grads=with_grads,
+        ):
             try:
+                self.training_state = TrainingState.SUMMON_FULL_PARAMS
                 yield
             finally:
                 self.training_state = TrainingState.IDLE
-                for handle in self._handles:
-                    handle._training_state = HandleTrainingState.IDLE
-        else:
-            # Unflatten the unsharded flattened parameters
-            with contextlib.ExitStack() as stack:
-                # Invariant: rank == 0 or !rank0_only
-                for handle in self._handles:
-                    if offload_to_cpu and handle.uses_sharded_strategy:
-                        stack.enter_context(handle.to_cpu())
-                        # TODO (awgu): Since PyTorch enforces that a parameter
-                        # and its gradients need to match metadata (e.g.
-                        # device), we must move gradients to CPU *after* we
-                        # move parameters.
-                # TODO (awgu): This FPW call assumes 1 `FlatParameter`
-                if not self._use_orig_params:
-                    stack.enter_context(self._unflatten_as_params())
-                try:
-                    yield
-                finally:
-                    stack.close()
-                    if writeback:
-                        self._writeback_to_local_shard(self._handles, with_grads)
-                    _reshard(self, self._handles, free_unsharded_flat_params)
-                    if with_grads:
-                        _reshard_grads(self._handles)
-                    self.training_state = TrainingState.IDLE
-                    for handle in self._handles:
-                        handle._training_state = HandleTrainingState.IDLE
-
-    @torch.no_grad()
-    def _writeback_to_local_shard(
-        self,
-        handles: List[FlatParamHandle],
-        writeback_grad: bool,
-    ):
-        """
-        For each handle, writes back the this rank's shard of the unsharded
-        flattened parameter to the sharded flattened parameter. If
-        ``writeback_grad=True``, then writes back to the sharded gradient as
-        well.
-
-        Precondition: Each handle's ``FlatParameter`` 's data points to the
-        padded unsharded flattened parameter.
-        """
-        for handle in handles:
-            # For `NO_SHARD`, `_local_shard` is the unsharded flattened
-            # parameter and `grad` is the unsharded gradient, so there is no
-            # need to writeback for either
-            if not handle.uses_sharded_strategy:
-                continue
-            assert (
-                handle.flat_param.ndim == 1
-            ), f"Expects `flat_param` to be flattened but got {handle.flat_param.shape}"
-
-            # Get the unpadded shard instead of the padded shard to persist
-            # user changes to the padding (though FSDP does not explicitly
-            # support this)
-            param_shard, _ = FlatParamHandle._get_unpadded_shard(
-                handle.flat_param,
-                handle.rank,
-                handle.world_size,
-            )
-            handle.flat_param._local_shard[: param_shard.numel()].copy_(param_shard)
-            if writeback_grad:
-                existing_grad = handle.sharded_grad
-                if existing_grad is not None:
-                    grad_shard, _ = FlatParamHandle._get_unpadded_shard(
-                        handle.flat_param.grad,
-                        handle.rank,
-                        handle.world_size,
-                    )
-                    existing_grad[: grad_shard.numel()].copy_(grad_shard)
-
-    @contextlib.contextmanager
-    def _unflatten_as_params(self) -> Generator:
-        """
-        Assumes that the flattened parameter is unsharded. When in the context,
-        de-registers the flattened parameter and unflattens the original
-        parameters as ``nn.Parameter`` views into the flattened parameter.
-        After the context, re-registers the flattened parameter and restores
-        the original parameters as ``Tensor`` views into the flattened
-        parameter.
-        """
-        if not self._handles:
-            yield
-        else:
-            self._deregister_flat_param()
-            try:
-                with self._handles[0].unflatten_as_params():
-                    yield
-            finally:
-                if not self._handles[0]._use_orig_params:
-                    self._register_flat_param()
-
-    def _register_flat_param(self):
-        """
-        Registers the flattened parameter to the wrapped module, making it
-        visible to ``nn.Module`` methods.
-
-        We do not use :meth:`nn.Module.register_parameter` because we want
-        ``FLAT_PARAM`` to always be an attribute but dynamically change whether
-        it is visible to ``nn.Module`` methods.
-        """
-        if self._has_params:
-            self.module._parameters[FLAT_PARAM] = self._handles[0].flat_param
-
-    def _deregister_flat_param(self):
-        """
-        De-registers the flattened parameter from the wrapped module, hiding it
-        from ``nn.Module`` methods.
-
-        We do not use ``del`` because we want ``FLAT_PARAM`` to always be an
-        attribute but dynamically change whether it is visible to ``nn.Module``
-        methods.
-        """
-        if self._has_params:
-            self.module._parameters.pop(FLAT_PARAM, None)
 
     @contextlib.contextmanager
     def _deregister_orig_params_ctx(self):
@@ -1026,46 +895,12 @@ def _deregister_orig_params_ctx(self):
             "`_use_orig_params=True`",
         )
         for fsdp_module in self.fsdp_modules(self):
-            fsdp_module._deregister_orig_params()
+            _deregister_orig_params(fsdp_module, fsdp_module)
         try:
             yield
         finally:
             for fsdp_module in self.fsdp_modules(self):
-                fsdp_module._register_orig_params()
-
-    def _deregister_orig_params(self):
-        """
-        Deregisters the original parameters; registers the ``FlatParameter``.
-        """
-        p_assert(
-            len(self._handles) <= 1,
-            "Expects <=1 handle per FSDP instance; needs to be refactored "
-            "for >1 handle (e.g. non-recursive wrapping)",
-        )
-        if not self._handles:
-            return
-        handle = self._handles[0]
-        p_assert(
-            handle._use_orig_params,
-            f"Inconsistent `_use_orig_params` -- FSDP: {self._use_orig_params} "
-            f"handle: {handle._use_orig_params}",
-        )
-        handle._deregister_orig_params()
-        self._register_flat_param()
-
-    def _register_orig_params(self):
-        """
-        Deregisters the ``FlatParameter``; registers the original parameters.
-        """
-        if not self._handles:
-            return
-        handle = self._handles[0]
-        self._deregister_flat_param()
-        if handle.is_sharded(handle.flat_param):
-            handle._use_sharded_views()
-            handle._use_sharded_grad_views()
-        else:
-            handle._use_unsharded_views(as_params=True)
+                _register_orig_params(fsdp_module, fsdp_module)
 
     def _apply(self, *args, **kwargs):
         """

From ba1454e2076e8efc6f123ff7205d78b2a8c8acf7 Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Tue, 8 Nov 2022 10:22:31 -0800
Subject: [PATCH 0803/1922] [ONNX] Add stack info to diagnostics (#87258)

~~Investigating strange bug releasing 'graph' right when returning from `_C._jit_pass_onnx`.~~
~~Can be repro-ed locally via `test_cpp_diagnose`, with changes in this PR.~~
Resolved by https://github.com/pytorch/pytorch/pull/87829.
This PR adds methods to record stack backtrace information to diagnostics.

* #87830
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87258
Approved by: https://github.com/abock
---
 test/onnx/internal/test_diagnostics.py        | 77 +++++++++++++++----
 .../onnx/_internal/diagnostics/_diagnostic.py | 61 ++++++++++++---
 .../_internal/diagnostics/infra/__init__.py   |  2 +
 .../_internal/diagnostics/infra/_infra.py     | 49 ++++++------
 .../onnx/_internal/diagnostics/infra/utils.py | 35 +++++++++
 5 files changed, 169 insertions(+), 55 deletions(-)
 create mode 100644 torch/onnx/_internal/diagnostics/infra/utils.py

diff --git a/test/onnx/internal/test_diagnostics.py b/test/onnx/internal/test_diagnostics.py
index fbd888329a50e..ea9a789e91c1f 100644
--- a/test/onnx/internal/test_diagnostics.py
+++ b/test/onnx/internal/test_diagnostics.py
@@ -3,6 +3,7 @@
 import contextlib
 import dataclasses
 import io
+import typing
 import unittest
 from typing import AbstractSet, Tuple
 
@@ -110,23 +111,15 @@ class TestOnnxDiagnostics(common_utils.TestCase):
     def setUp(self):
         engine = diagnostics.engine
         engine.clear()
+        self._sample_rule = diagnostics.rules.missing_custom_symbolic_function
         super().setUp()
 
-    def test_assert_diagnostic_raises_when_diagnostic_not_found(self):
-        with self.assertRaises(AssertionError):
-            with assert_diagnostic(
-                self,
-                diagnostics.engine,
-                diagnostics.rules.node_missing_onnx_shape_inference,
-                diagnostics.levels.WARNING,
-            ):
-                pass
-
-    def test_cpp_diagnose_emits_warning(self):
+    def _trigger_node_missing_onnx_shape_inference_warning_diagnostic_from_cpp(
+        self,
+    ) -> diagnostics.ExportDiagnostic:
         class CustomAdd(torch.autograd.Function):
             @staticmethod
             def forward(ctx, x, y):
-                ctx.save_for_backward(x, y)
                 return x + y
 
             @staticmethod
@@ -137,6 +130,30 @@ class M(torch.nn.Module):
             def forward(self, x):
                 return CustomAdd.apply(x, x)
 
+        # trigger warning for missing shape inference.
+        rule = diagnostics.rules.node_missing_onnx_shape_inference
+        torch.onnx.export(M(), torch.randn(3, 4), io.BytesIO())
+
+        context = diagnostics.engine.contexts[-1]
+        for diagnostic in context.diagnostics:
+            if (
+                diagnostic.rule == rule
+                and diagnostic.level == diagnostics.levels.WARNING
+            ):
+                return typing.cast(diagnostics.ExportDiagnostic, diagnostic)
+        raise AssertionError("No diagnostic found.")
+
+    def test_assert_diagnostic_raises_when_diagnostic_not_found(self):
+        with self.assertRaises(AssertionError):
+            with assert_diagnostic(
+                self,
+                diagnostics.engine,
+                diagnostics.rules.node_missing_onnx_shape_inference,
+                diagnostics.levels.WARNING,
+            ):
+                pass
+
+    def test_cpp_diagnose_emits_warning(self):
         with assert_diagnostic(
             self,
             diagnostics.engine,
@@ -144,7 +161,7 @@ def forward(self, x):
             diagnostics.levels.WARNING,
         ):
             # trigger warning for missing shape inference.
-            torch.onnx.export(M(), torch.randn(3, 4), io.BytesIO())
+            self._trigger_node_missing_onnx_shape_inference_warning_diagnostic_from_cpp()
 
     def test_py_diagnose_emits_error(self):
         class M(torch.nn.Module):
@@ -168,15 +185,43 @@ def forward(self, x):
     def test_diagnostics_engine_records_diagnosis_reported_outside_of_export(
         self,
     ):
-        sample_rule = diagnostics.rules.missing_custom_symbolic_function
         sample_level = diagnostics.levels.ERROR
         with assert_diagnostic(
             self,
             diagnostics.engine,
-            sample_rule,
+            self._sample_rule,
             sample_level,
         ):
-            diagnostics.context.diagnose(sample_rule, sample_level)
+            diagnostics.context.diagnose(self._sample_rule, sample_level)
+
+    def test_diagnostics_records_python_call_stack(self):
+        diagnostic = diagnostics.ExportDiagnostic(
+            self._sample_rule, diagnostics.levels.NOTE
+        )
+        stack = diagnostic.python_call_stack
+        assert stack is not None  # for mypy
+        self.assertGreater(len(stack.frames), 0)
+        frame = stack.frames[0]
+        assert frame.location.snippet is not None  # for mypy
+        self.assertIn("self._sample_rule", frame.location.snippet)
+        assert frame.location.uri is not None  # for mypy
+        self.assertIn("test_diagnostics.py", frame.location.uri)
+
+    def test_diagnostics_records_cpp_call_stack(self):
+        diagnostic = (
+            self._trigger_node_missing_onnx_shape_inference_warning_diagnostic_from_cpp()
+        )
+        stack = diagnostic.cpp_call_stack
+        assert stack is not None  # for mypy
+        self.assertGreater(len(stack.frames), 0)
+        frame_messages = [frame.location.message for frame in stack.frames]
+        self.assertTrue(
+            any(
+                isinstance(message, str)
+                and "torch::jit::ONNXShapeTypeInference" in message
+                for message in frame_messages
+            )
+        )
 
 
 @dataclasses.dataclass
diff --git a/torch/onnx/_internal/diagnostics/_diagnostic.py b/torch/onnx/_internal/diagnostics/_diagnostic.py
index ae6615e831cb2..21e44f2b44671 100644
--- a/torch/onnx/_internal/diagnostics/_diagnostic.py
+++ b/torch/onnx/_internal/diagnostics/_diagnostic.py
@@ -5,11 +5,38 @@
 
 import torch
 from torch.onnx._internal.diagnostics import infra
+from torch.onnx._internal.diagnostics.infra import utils as infra_utils
+from torch.utils import cpp_backtrace
 
 # This is a workaround for mypy not supporting Self from typing_extensions.
 _ExportDiagnostic = TypeVar("_ExportDiagnostic", bound="ExportDiagnostic")
 
 
+def _cpp_call_stack(frames_to_skip: int = 0, frames_to_log: int = 32):
+    """Returns the current C++ call stack.
+
+    This function utilizes `torch.utils.cpp_backtrace` to get the current C++ call stack.
+    The returned C++ call stack is a concatenated string of the C++ call stack frames.
+    Each frame is separated by a newline character, in the same format of
+    r"frame #[0-9]+: (?P<frame_info>.*)". More info at `c10/util/Backtrace.cpp`.
+
+    """
+    frames = cpp_backtrace.get_cpp_backtrace(frames_to_skip, frames_to_log).split("\n")
+    frame_messages = []
+    for frame in frames:
+        segments = frame.split(":", 1)
+        if len(segments) == 2:
+            frame_messages.append(segments[1].strip())
+        else:
+            frame_messages.append("<unknown frame>")
+    return infra.Stack(
+        frames=[
+            infra.StackFrame(location=infra.Location(message=message))
+            for message in frame_messages
+        ]
+    )
+
+
 class ExportDiagnostic(infra.Diagnostic):
     """Base class for all export diagnostics.
 
@@ -18,24 +45,34 @@ class ExportDiagnostic(infra.Diagnostic):
     diagnostic.
     """
 
+    python_call_stack: Optional[infra.Stack] = None
+    cpp_call_stack: Optional[infra.Stack] = None
+
     def __init__(
         self,
         *args,
         **kwargs,
     ) -> None:
         super().__init__(*args, **kwargs)
-
-    def with_cpp_stack(self: _ExportDiagnostic) -> _ExportDiagnostic:
-        # TODO: Implement this.
-        # self.stacks.append(...)
-        raise NotImplementedError()
-        return self
-
-    def with_python_stack(self: _ExportDiagnostic) -> _ExportDiagnostic:
-        # TODO: Implement this.
-        # self.stacks.append(...)
-        raise NotImplementedError()
-        return self
+        self.record_python_call_stack(frames_to_skip=1)
+        self.record_cpp_call_stack(frames_to_skip=1)
+
+    def record_python_call_stack(self, frames_to_skip) -> None:
+        """Records the current Python call stack in the diagnostic."""
+        frames_to_skip += 1  # Skip this function.
+        stack = infra_utils.python_call_stack(frames_to_skip=frames_to_skip)
+        stack.message = "Python call stack"
+        self.with_stack(stack)
+        self.python_call_stack = stack
+
+    def record_cpp_call_stack(self, frames_to_skip) -> None:
+        """Records the current C++ call stack in the diagnostic."""
+        # No need to skip this function because python frame is not recorded
+        # in cpp call stack.
+        stack = _cpp_call_stack(frames_to_skip=frames_to_skip)
+        stack.message = "C++ call stack"
+        self.with_stack(stack)
+        self.cpp_call_stack = stack
 
     def with_model_source_location(
         self: _ExportDiagnostic,
diff --git a/torch/onnx/_internal/diagnostics/infra/__init__.py b/torch/onnx/_internal/diagnostics/infra/__init__.py
index ac9e6e99a9746..4f9dd9e5fa0b3 100644
--- a/torch/onnx/_internal/diagnostics/infra/__init__.py
+++ b/torch/onnx/_internal/diagnostics/infra/__init__.py
@@ -8,6 +8,7 @@
     Rule,
     RuleCollection,
     Stack,
+    StackFrame,
 )
 from .engine import DiagnosticEngine
 
@@ -22,4 +23,5 @@
     "Rule",
     "RuleCollection",
     "Stack",
+    "StackFrame",
 ]
diff --git a/torch/onnx/_internal/diagnostics/infra/_infra.py b/torch/onnx/_internal/diagnostics/infra/_infra.py
index 6966ccccbb264..b8a4c5032f523 100644
--- a/torch/onnx/_internal/diagnostics/infra/_infra.py
+++ b/torch/onnx/_internal/diagnostics/infra/_infra.py
@@ -110,11 +110,12 @@ def format_message(self, *args, **kwargs) -> str:
 
 @dataclasses.dataclass
 class Location:
-    uri: str
-    message: str
+    uri: Optional[str] = None
     line: Optional[int] = None
+    message: Optional[str] = None
     start_column: Optional[int] = None
     end_column: Optional[int] = None
+    snippet: Optional[str] = None
 
     def sarif(self) -> sarif.Location:
         """Returns the SARIF representation of this location."""
@@ -124,43 +125,37 @@ def sarif(self) -> sarif.Location:
                 region=sarif.Region(
                     start_line=self.line,
                     start_column=self.start_column,
-                    end_line=self.line,
                     end_column=self.end_column,
+                    snippet=sarif.ArtifactContent(text=self.snippet),
                 ),
             ),
-            message=sarif.Message(text=self.message),
+            message=sarif.Message(text=self.message)
+            if self.message is not None
+            else None,
         )
 
 
+@dataclasses.dataclass
+class StackFrame:
+    location: Location
+
+    def sarif(self) -> sarif.StackFrame:
+        """Returns the SARIF representation of this stack frame."""
+        return sarif.StackFrame(location=self.location.sarif())
+
+
 @dataclasses.dataclass
 class Stack:
-    frame_locations: List[Location] = dataclasses.field(default_factory=list)
+    frames: List[StackFrame] = dataclasses.field(default_factory=list)
+    message: Optional[str] = None
 
     def sarif(self) -> sarif.Stack:
         """Returns the SARIF representation of this stack."""
         return sarif.Stack(
-            frames=[
-                sarif.StackFrame(location=loc.sarif()) for loc in self.frame_locations
-            ]
-        )
-
-    def add_frame(
-        self,
-        uri: str,
-        message: str,
-        line: Optional[int] = None,
-        start_column: Optional[int] = None,
-        end_column: Optional[int] = None,
-    ) -> None:
-        """Adds a frame to the stack."""
-        self.frame_locations.append(
-            Location(
-                uri=uri,
-                message=message,
-                line=line,
-                start_column=start_column,
-                end_column=end_column,
-            )
+            frames=[frame.sarif() for frame in self.frames],
+            message=sarif.Message(text=self.message)
+            if self.message is not None
+            else None,
         )
 
 
diff --git a/torch/onnx/_internal/diagnostics/infra/utils.py b/torch/onnx/_internal/diagnostics/infra/utils.py
new file mode 100644
index 0000000000000..c32de1c6b8ad9
--- /dev/null
+++ b/torch/onnx/_internal/diagnostics/infra/utils.py
@@ -0,0 +1,35 @@
+import inspect
+
+from torch.onnx._internal.diagnostics.infra import _infra
+
+
+def python_frame(frame: inspect.FrameInfo) -> _infra.StackFrame:
+    """Returns a StackFrame for the given inspect.FrameInfo."""
+    snippet = (
+        frame.code_context[frame.index]
+        if frame.code_context is not None and frame.index is not None
+        else None
+    )
+
+    return _infra.StackFrame(
+        location=_infra.Location(
+            uri=frame.filename,
+            line=frame.lineno,
+            snippet=snippet,
+        )
+    )
+
+
+def python_call_stack(frames_to_skip: int = 0, frames_to_log: int = 32) -> _infra.Stack:
+    """Returns the current Python call stack."""
+    if frames_to_skip < 0:
+        raise ValueError("frames_to_skip must be non-negative")
+    if frames_to_log < 0:
+        raise ValueError("frames_to_log must be non-negative")
+    frames_to_skip += 1  # Skip this function.
+    stack = _infra.Stack()
+    stack.frames = [
+        python_frame(frame)
+        for frame in inspect.stack()[frames_to_skip : frames_to_skip + frames_to_log]
+    ]
+    return stack

From 2b8a3a28fd155524e3a1ac429cc17a04a4961482 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 11 Nov 2022 19:08:30 +0000
Subject: [PATCH 0804/1922] Revert "add DisableTorchFunction that matches
 DisableTorchDispatch (#88219)"

This reverts commit c0ecce15b5a54ff0185f9976e6bfb6f3a7de698d.

Reverted https://github.com/pytorch/pytorch/pull/88219 on behalf of https://github.com/izaitsevfb due to BC-breaking change, D41211901
---
 aten/src/ATen/PythonTorchFunctionTLS.cpp    |  11 +--
 aten/src/ATen/PythonTorchFunctionTLS.h      |  12 +--
 test/allowlist_for_publicAPI.json           |   1 -
 test/test_overrides.py                      |  21 ----
 test/test_public_bindings.py                |   1 -
 torch/_C/__init__.pyi.in                    |   1 -
 torch/__init__.py                           |   2 +-
 torch/csrc/Module.cpp                       |   4 -
 torch/csrc/autograd/init.cpp                |   9 +-
 torch/csrc/utils/disable_torch_function.cpp | 100 ++------------------
 torch/csrc/utils/disable_torch_function.h   |   1 -
 11 files changed, 24 insertions(+), 139 deletions(-)

diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp
index 00f372f370e62..c9487c6958cbf 100644
--- a/aten/src/ATen/PythonTorchFunctionTLS.cpp
+++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp
@@ -26,12 +26,12 @@ int64_t PythonTorchFunctionTLS::stack_len() {
   return pythonTorchFunctionState.stack_.size();
 }
 
-void PythonTorchFunctionTLS::set_disabled_state(TorchFunctionDisabledState disabled_state) {
-  pythonTorchFunctionState.disabled_state_ = disabled_state;
+void PythonTorchFunctionTLS::set_disabled(bool disabled) {
+  pythonTorchFunctionState.disabled_ = disabled;
 }
 
-TorchFunctionDisabledState PythonTorchFunctionTLS::get_disabled_state() {
-  return pythonTorchFunctionState.disabled_state_;
+bool PythonTorchFunctionTLS::is_disabled() {
+  return pythonTorchFunctionState.disabled_;
 }
 
 void PythonTorchFunctionTLS::set_state(const PythonTorchFunctionTLS& state) {
@@ -43,8 +43,7 @@ const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() {
 }
 
 bool torch_function_mode_enabled() {
-  return PythonTorchFunctionTLS::get_disabled_state() != TorchFunctionDisabledState::ALL_DISABLED &&
-         PythonTorchFunctionTLS::stack_len() > 0;
+  return PythonTorchFunctionTLS::stack_len() > 0;
 }
 
 } // namespace impl
diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h
index a1e3a61ea2023..5940fb6f2dee2 100644
--- a/aten/src/ATen/PythonTorchFunctionTLS.h
+++ b/aten/src/ATen/PythonTorchFunctionTLS.h
@@ -6,11 +6,9 @@
 namespace at {
 namespace impl {
 
-enum TorchFunctionDisabledState { ENABLED, SUBCLASSES_DISABLED, ALL_DISABLED };
-
 struct TORCH_API PythonTorchFunctionTLS {
-  static void set_disabled_state(TorchFunctionDisabledState disabled_state_);
-  static TorchFunctionDisabledState get_disabled_state();
+  static void set_disabled(bool);
+  static bool is_disabled();
 
   static void push_onto_stack(std::shared_ptr<SafePyObject> mode);
   static const std::shared_ptr<SafePyObject> pop_stack();
@@ -22,11 +20,11 @@ struct TORCH_API PythonTorchFunctionTLS {
 
  private:
   // The mode TLS is split into
-  //   - disabled_state, which says which part of torch function are disabled
+  //   - disabled_, which says whether or not to disable all torch function
+  //   modes
   //   - stack_, which is a vector of modes representing the stack of user
   //   defined modes
-  TorchFunctionDisabledState disabled_state_ =
-      TorchFunctionDisabledState::ENABLED;
+  bool disabled_;
   std::vector<std::shared_ptr<c10::SafePyObject>> stack_;
 };
 
diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index 45ba9ae94676d..8a66dc12d4b6f 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -1128,7 +1128,6 @@
     "BFloat16Tensor",
     "ComplexDoubleStorage",
     "ComplexFloatStorage",
-    "DisableTorchFunction",
     "DisableTorchFunctionSubclass",
     "Generator",
     "HalfStorage",
diff --git a/test/test_overrides.py b/test/test_overrides.py
index 3b3a5ed063c70..01c763a548fc8 100644
--- a/test/test_overrides.py
+++ b/test/test_overrides.py
@@ -1453,27 +1453,6 @@ class B(torch.Tensor):
 
         self.assertTrue(called)
 
-    def test_disable_subclass_mode(self):
-        called = False
-
-        class A(TorchFunctionMode):
-            def __torch_function__(self, func, types, args=(), kwargs=None):
-                nonlocal called
-                if kwargs is None:
-                    kwargs = {}
-                called = True
-                return func(*args, **kwargs)
-
-        class B(torch.Tensor):
-            pass
-
-        x = B(torch.randn(5))
-        with A():
-            with torch._C.DisableTorchFunction():
-                self.assertNotIsInstance(torch.sum(x), B)
-
-        self.assertFalse(called)
-
     def test_disable_enable_subclass(self):
         called = False
 
diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py
index 46c7396b9b07f..6897c3102df60 100644
--- a/test/test_public_bindings.py
+++ b/test/test_public_bindings.py
@@ -99,7 +99,6 @@ def test_no_new_bindings(self):
             "device",
             "DeviceObjType",
             "DictType",
-            "DisableTorchFunction",
             "DisableTorchFunctionSubclass",
             "DispatchKey",
             "DispatchKeySet",
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index bc4bf03d8161f..79dd6386c3789 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -108,7 +108,6 @@ class layout:
     ...
 
 # Defined in torch/csrc/utils/disable_torch_function.cpp
-def DisableTorchFunction(): ...
 def DisableTorchFunctionSubclass(): ...
 
 # Defined in torch/csrc/utils/tensor_layouts.cpp
diff --git a/torch/__init__.py b/torch/__init__.py
index 6049967b6f18e..ec23499dce659 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -315,7 +315,7 @@ def get_pyobj(self):
         if (isinstance(obj, Callable) or inspect.isclass(obj)):  # type: ignore[arg-type]
             if (obj.__module__ != 'torch'):
                 # TODO: fix their module from C++ side
-                if name not in ['DisableTorchFunctionSubclass', 'DisableTorchFunction', 'Generator']:
+                if name not in ['DisableTorchFunctionSubclass', 'Generator']:
                     obj.__module__ = 'torch'
 
 if not TYPE_CHECKING:
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 0a9aa53a0bbc4..efe6c18ea0cd4 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -1597,10 +1597,6 @@ Call this whenever a new thread is created in order to propagate values from
       "DisableTorchFunctionSubclass",
       (PyObject*)THPModule_DisableTorchFunctionSubclassType(),
       /* incref= */ false));
-  ASSERT_TRUE(set_module_attr(
-      "DisableTorchFunction",
-      (PyObject*)THPModule_DisableTorchFunctionType(),
-      /* incref= */ false));
   torch::set_disabled_torch_function_impl(
       PyObject_GetAttrString(module, "_disabled_torch_function_impl"));
   ASSERT_TRUE(torch::disabled_torch_function_impl() != nullptr);
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 6271cfd5cb997..d26db95f1295c 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -60,14 +60,13 @@ struct DisableAutocast {
 
 struct EnableTorchFunction {
   EnableTorchFunction()
-      : old_(at::impl::PythonTorchFunctionTLS::get_disabled_state()) {
-    at::impl::PythonTorchFunctionTLS::set_disabled_state(
-        at::impl::TorchFunctionDisabledState::ENABLED);
+      : old_(at::impl::PythonTorchFunctionTLS::is_disabled()) {
+    at::impl::PythonTorchFunctionTLS::set_disabled(false);
   }
   ~EnableTorchFunction() {
-    at::impl::PythonTorchFunctionTLS::set_disabled_state(old_);
+    at::impl::PythonTorchFunctionTLS::set_disabled(old_);
   }
-  at::impl::TorchFunctionDisabledState old_;
+  bool old_;
 };
 
 struct EnablePythonDispatcher {
diff --git a/torch/csrc/utils/disable_torch_function.cpp b/torch/csrc/utils/disable_torch_function.cpp
index 589b069250a36..516e6b89d43af 100644
--- a/torch/csrc/utils/disable_torch_function.cpp
+++ b/torch/csrc/utils/disable_torch_function.cpp
@@ -11,8 +11,7 @@ PyObject* disabled_torch_function = nullptr;
 PyObject* disabled_torch_dispatch = nullptr;
 
 bool torch_function_enabled() {
-  return at::impl::PythonTorchFunctionTLS::get_disabled_state() ==
-      at::impl::TorchFunctionDisabledState::ENABLED;
+  return !at::impl::PythonTorchFunctionTLS::is_disabled();
 }
 
 PyObject* disabled_torch_function_impl() {
@@ -35,23 +34,20 @@ void set_disabled_torch_dispatch_impl(PyObject* value) {
 typedef struct {
   PyObject_HEAD
       /* Type-specific fields go here. */
-      at::impl::TorchFunctionDisabledState old_state;
+      bool old_state;
 } DisableTorchFunctionSubclass;
 
 PyObject* DisableTorchFunctionSubclass__enter(
     PyObject* self,
     PyObject* unused) {
-  const auto old_state = at::impl::PythonTorchFunctionTLS::get_disabled_state();
-  ((DisableTorchFunctionSubclass*)self)->old_state = old_state;
-  if (old_state == at::impl::TorchFunctionDisabledState::ENABLED) {
-    at::impl::PythonTorchFunctionTLS::set_disabled_state(
-        at::impl::TorchFunctionDisabledState::SUBCLASSES_DISABLED);
-  }
+  ((DisableTorchFunctionSubclass*)self)->old_state =
+      at::impl::PythonTorchFunctionTLS::is_disabled();
+  at::impl::PythonTorchFunctionTLS::set_disabled(true);
   Py_RETURN_NONE;
 }
 
 PyObject* DisableTorchFunctionSubclass__exit(PyObject* self, PyObject* unused) {
-  at::impl::PythonTorchFunctionTLS::set_disabled_state(
+  at::impl::PythonTorchFunctionTLS::set_disabled(
       ((DisableTorchFunctionSubclass*)self)->old_state);
   Py_RETURN_NONE;
 }
@@ -119,81 +115,6 @@ PyObject* THPModule_DisableTorchFunctionSubclassType() {
   return (PyObject*)(&DisableTorchFunctionSubclassType);
 }
 
-typedef struct {
-  PyObject_HEAD
-      /* Type-specific fields go here. */
-      at::impl::TorchFunctionDisabledState old_state;
-} DisableTorchFunction;
-
-PyObject* DisableTorchFunction__enter(PyObject* self, PyObject* unused) {
-  ((DisableTorchFunctionSubclass*)self)->old_state =
-      at::impl::PythonTorchFunctionTLS::get_disabled_state();
-  at::impl::PythonTorchFunctionTLS::set_disabled_state(
-      at::impl::TorchFunctionDisabledState::ALL_DISABLED);
-  Py_RETURN_NONE;
-}
-
-PyObject* DisableTorchFunction__exit(PyObject* self, PyObject* unused) {
-  at::impl::PythonTorchFunctionTLS::set_disabled_state(
-      ((DisableTorchFunctionSubclass*)self)->old_state);
-  Py_RETURN_NONE;
-}
-
-static PyMethodDef DisableTorchFunction_methods[] = { // NOLINT
-    {"__enter__", DisableTorchFunction__enter, METH_NOARGS, nullptr},
-    {"__exit__", DisableTorchFunction__exit, METH_VARARGS, nullptr},
-    {nullptr, nullptr, 0, nullptr}};
-
-PyTypeObject DisableTorchFunctionType = {
-    PyVarObject_HEAD_INIT(
-        nullptr,
-        0) "torch._C.DisableTorchFunction", /* tp_name */
-    sizeof(DisableTorchFunction), /* tp_basicsize */
-    0, /* tp_itemsize */
-    nullptr, /* tp_dealloc */
-    0, /* tp_vectorcall_offset */
-    nullptr, /* tp_getattr */
-    nullptr, /* tp_setattr */
-    nullptr, /* tp_reserved */
-    nullptr, /* tp_repr */
-    nullptr, /* tp_as_number */
-    nullptr, /* tp_as_sequence */
-    nullptr, /* tp_as_mapping */
-    nullptr, /* tp_hash  */
-    nullptr, /* tp_call */
-    nullptr, /* tp_str */
-    nullptr, /* tp_getattro */
-    nullptr, /* tp_setattro */
-    nullptr, /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT, /* tp_flags */
-    nullptr, /* tp_doc */
-    nullptr, /* tp_traverse */
-    nullptr, /* tp_clear */
-    nullptr, /* tp_richcompare */
-    0, /* tp_weaklistoffset */
-    nullptr, /* tp_iter */
-    nullptr, /* tp_iternext */
-    DisableTorchFunction_methods, /* tp_methods */
-    nullptr, /* tp_members */
-    nullptr, /* tp_getset */
-    nullptr, /* tp_base */
-    nullptr, /* tp_dict */
-    nullptr, /* tp_descr_get */
-    nullptr, /* tp_descr_set */
-    0, /* tp_dictoffset */
-    nullptr, /* tp_init */
-    PyType_GenericAlloc, /* tp_alloc */
-    PyType_GenericNew, /* tp_new */
-};
-
-PyObject* THPModule_DisableTorchFunctionType() {
-  if (PyType_Ready(&DisableTorchFunctionType) < 0) {
-    return nullptr;
-  }
-
-  return (PyObject*)(&DisableTorchFunctionType);
-}
-
 PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* a) {
   HANDLE_TH_ERRORS
   PyObject *func = nullptr, *types = nullptr, *args = nullptr,
@@ -216,14 +137,11 @@ PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* a) {
   // These are all C-API calls so no exceptions will be raised
   // and therefore no need for RAII approach to storing
   // the old value.
-  auto old_value = at::impl::PythonTorchFunctionTLS::get_disabled_state();
-  if (old_value == at::impl::TorchFunctionDisabledState::ENABLED) {
-    at::impl::PythonTorchFunctionTLS::set_disabled_state(
-        at::impl::TorchFunctionDisabledState::SUBCLASSES_DISABLED);
-  }
+  bool old_value = at::impl::PythonTorchFunctionTLS::is_disabled();
+  at::impl::PythonTorchFunctionTLS::set_disabled(true);
   // kwargs can safely be nullptr here.
   PyObject* result = PyObject_Call(func, py_args.ptr(), kwargs);
-  at::impl::PythonTorchFunctionTLS::set_disabled_state(old_value);
+  at::impl::PythonTorchFunctionTLS::set_disabled(old_value);
   return result;
   END_HANDLE_TH_ERRORS
 }
diff --git a/torch/csrc/utils/disable_torch_function.h b/torch/csrc/utils/disable_torch_function.h
index 8fc5118830eb7..881a7adb13ebf 100644
--- a/torch/csrc/utils/disable_torch_function.h
+++ b/torch/csrc/utils/disable_torch_function.h
@@ -29,7 +29,6 @@ struct DisableTorchDispatch {
 } // namespace torch
 
 PyObject* THPModule_isEnabledTorchFunction(PyObject* self, PyObject* unused);
-PyObject* THPModule_DisableTorchFunctionType();
 PyObject* THPModule_DisableTorchFunctionSubclassType();
 PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* args);
 PyObject* THPModule_disable_torch_dispatch(PyObject* self, PyObject* args);

From 46fa6d8a43021e562cb903c219e429fc6dcba2ea Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 11 Nov 2022 19:13:05 +0000
Subject: [PATCH 0805/1922] Revert "rename DisableTorchFunction to
 DisableTorchFunctionSubclass (#88218)"

This reverts commit 7f28be10e5e71efda37800384fa897785499bed1.

Reverted https://github.com/pytorch/pytorch/pull/88218 on behalf of https://github.com/izaitsevfb due to BC-breaking change, D41211901
---
 test/allowlist_for_publicAPI.json             |  2 +-
 test/profiler/test_profiler_tree.py           |  2 +-
 test/test_overrides.py                        |  4 +--
 test/test_public_bindings.py                  |  2 +-
 torch/_C/__init__.pyi.in                      |  2 +-
 torch/__init__.py                             |  2 +-
 torch/_dynamo/variables/builder.py            |  2 +-
 torch/_dynamo/variables/misc.py               |  2 +-
 torch/_dynamo/variables/tensor.py             |  2 +-
 torch/_subclasses/fake_tensor.py              |  2 +-
 torch/_tensor.py                              |  2 +-
 torch/csrc/Module.cpp                         |  4 +--
 torch/csrc/autograd/init.cpp                  |  1 +
 torch/csrc/utils/disable_torch_function.cpp   | 32 +++++++++----------
 torch/csrc/utils/disable_torch_function.h     |  2 +-
 torch/distributed/_shard/common_op_utils.py   |  4 +--
 torch/distributed/_shard/partial_tensor.py    |  2 +-
 torch/distributed/_shard/replicated_tensor.py |  4 +--
 .../_shard/sharded_tensor/_ops/tensor_ops.py  |  2 +-
 torch/masked/maskedtensor/core.py             |  2 +-
 20 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index 8a66dc12d4b6f..ba4a2e96df219 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -1128,7 +1128,7 @@
     "BFloat16Tensor",
     "ComplexDoubleStorage",
     "ComplexFloatStorage",
-    "DisableTorchFunctionSubclass",
+    "DisableTorchFunction",
     "Generator",
     "HalfStorage",
     "HalfTensor",
diff --git a/test/profiler/test_profiler_tree.py b/test/profiler/test_profiler_tree.py
index 210530250f924..d4a31c6456131 100644
--- a/test/profiler/test_profiler_tree.py
+++ b/test/profiler/test_profiler_tree.py
@@ -26,7 +26,7 @@
     "torch/profiler/profiler.py(...): start": KEEP_ELLIPSES,
     "torch/profiler/profiler.py(...): stop_trace": KEEP_ELLIPSES,
     "torch/profiler/profiler.py(...): _transit_action": KEEP_ELLIPSES,
-    "<built-in method __exit__ of torch._C.DisableTorchFunctionSubclass object at 0xXXXXXXXXXXXX>": PRUNE_ALL,
+    "<built-in method __exit__ of torch._C.DisableTorchFunction object at 0xXXXXXXXXXXXX>": PRUNE_ALL,
     "cudaStreamIsCapturing": PRUNE_ALL,
     "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags": PRUNE_ALL,
 }
diff --git a/test/test_overrides.py b/test/test_overrides.py
index 01c763a548fc8..7082f75a2141f 100644
--- a/test/test_overrides.py
+++ b/test/test_overrides.py
@@ -1448,7 +1448,7 @@ class B(torch.Tensor):
 
         x = B(torch.randn(5))
         with A():
-            with torch._C.DisableTorchFunctionSubclass():
+            with torch._C.DisableTorchFunction():
                 self.assertNotIsInstance(torch.sum(x), B)
 
         self.assertTrue(called)
@@ -1460,7 +1460,7 @@ class A(torch.Tensor):
             pass
 
         x = A(torch.randn(5))
-        with torch._C.DisableTorchFunctionSubclass():
+        with torch._C.DisableTorchFunction():
             g = torch._C._EnableTorchFunction()
             try:
                 self.assertIsInstance(torch.sum(x), A)
diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py
index 6897c3102df60..4d2df65126983 100644
--- a/test/test_public_bindings.py
+++ b/test/test_public_bindings.py
@@ -99,7 +99,7 @@ def test_no_new_bindings(self):
             "device",
             "DeviceObjType",
             "DictType",
-            "DisableTorchFunctionSubclass",
+            "DisableTorchFunction",
             "DispatchKey",
             "DispatchKeySet",
             "dtype",
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 79dd6386c3789..2d20da2a04f30 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -108,7 +108,7 @@ class layout:
     ...
 
 # Defined in torch/csrc/utils/disable_torch_function.cpp
-def DisableTorchFunctionSubclass(): ...
+def DisableTorchFunction(): ...
 
 # Defined in torch/csrc/utils/tensor_layouts.cpp
 strided : layout = ...
diff --git a/torch/__init__.py b/torch/__init__.py
index ec23499dce659..19be59282cca4 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -315,7 +315,7 @@ def get_pyobj(self):
         if (isinstance(obj, Callable) or inspect.isclass(obj)):  # type: ignore[arg-type]
             if (obj.__module__ != 'torch'):
                 # TODO: fix their module from C++ side
-                if name not in ['DisableTorchFunctionSubclass', 'Generator']:
+                if name not in ['DisableTorchFunction', 'Generator']:
                     obj.__module__ = 'torch'
 
 if not TYPE_CHECKING:
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 9d87897468554..d3c5140fa4a97 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -506,7 +506,7 @@ def wrap_tensor(self, value: torch.Tensor):
                 )
             # Disable __torch_function__ to prevent cloning of `value` to hit
             # us
-            with torch._C.DisableTorchFunctionSubclass():
+            with torch._C.DisableTorchFunction():
                 if is_constant_source(self.get_source()):
                     return self.tx.output.register_attr_or_module(
                         value,
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index 6e4325b6c0f43..da327122a6a70 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -538,7 +538,7 @@ def call_function(
             options = VariableTracker.propagate(self, new_args, new_kwargs.values())
             # Disable __torch_function__ here to prevent the clone of the
             # example tensor from going into the override.
-            with torch._C.DisableTorchFunctionSubclass():
+            with torch._C.DisableTorchFunction():
                 if isinstance(args[0], TorchVariable):
                     return TensorVariable.create(
                         tx=tx,
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 0974f24ee9694..e87b1d87bac9b 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -743,7 +743,7 @@ def inline_torch_function_unwrapped(
 
         # Disable __torch_function__ here to prevent the clone of the
         # example tensor from going into the override.
-        with torch._C.DisableTorchFunctionSubclass():
+        with torch._C.DisableTorchFunction():
             return tx.inline_user_function_return(tf_func_var, tf_args, {})
 
 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 79af51efc5b8e..14f5cd2de0a7a 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1093,5 +1093,5 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
             memo[id(tensor)] = out
             return out
         else:
-            with torch._C.DisableTorchFunctionSubclass():
+            with torch._C.DisableTorchFunction():
                 return func(*args, **kwargs)
diff --git a/torch/_tensor.py b/torch/_tensor.py
index 41b6569c06d86..793034bb64ede 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -1297,7 +1297,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
         if not all(issubclass(cls, t) for t in types):
             return NotImplemented
 
-        with _C.DisableTorchFunctionSubclass():
+        with _C.DisableTorchFunction():
             ret = func(*args, **kwargs)
             if func in get_default_nowrap_functions():
                 return ret
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index efe6c18ea0cd4..b8693a484ed9d 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -1594,8 +1594,8 @@ Call this whenever a new thread is created in order to propagate values from
       (PyObject*)THPDefaultCPUGenerator,
       /* incref= */ false));
   ASSERT_TRUE(set_module_attr(
-      "DisableTorchFunctionSubclass",
-      (PyObject*)THPModule_DisableTorchFunctionSubclassType(),
+      "DisableTorchFunction",
+      (PyObject*)THPModule_DisableTorchFunctionType(),
       /* incref= */ false));
   torch::set_disabled_torch_function_impl(
       PyObject_GetAttrString(module, "_disabled_torch_function_impl"));
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index d26db95f1295c..ee963232d3166 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -343,6 +343,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
       _C_m, "_RestorePythonTLSSnapshot")
       .def(py::init<>());
 
+  // TODO: line up this binding with DisableTorchFunction
   py::class_<torch::DisableTorchDispatch>(_C_m, "_DisableTorchDispatch")
       .def(py::init<>());
   py::class_<EnableTorchFunction>(_C_m, "_EnableTorchFunction")
diff --git a/torch/csrc/utils/disable_torch_function.cpp b/torch/csrc/utils/disable_torch_function.cpp
index 516e6b89d43af..682120d7e6223 100644
--- a/torch/csrc/utils/disable_torch_function.cpp
+++ b/torch/csrc/utils/disable_torch_function.cpp
@@ -35,20 +35,18 @@ typedef struct {
   PyObject_HEAD
       /* Type-specific fields go here. */
       bool old_state;
-} DisableTorchFunctionSubclass;
+} DisableTorchFunction;
 
-PyObject* DisableTorchFunctionSubclass__enter(
-    PyObject* self,
-    PyObject* unused) {
-  ((DisableTorchFunctionSubclass*)self)->old_state =
+PyObject* DisableTorchFunction__enter(PyObject* self, PyObject* unused) {
+  ((DisableTorchFunction*)self)->old_state =
       at::impl::PythonTorchFunctionTLS::is_disabled();
   at::impl::PythonTorchFunctionTLS::set_disabled(true);
   Py_RETURN_NONE;
 }
 
-PyObject* DisableTorchFunctionSubclass__exit(PyObject* self, PyObject* unused) {
+PyObject* DisableTorchFunction__exit(PyObject* self, PyObject* unused) {
   at::impl::PythonTorchFunctionTLS::set_disabled(
-      ((DisableTorchFunctionSubclass*)self)->old_state);
+      ((DisableTorchFunction*)self)->old_state);
   Py_RETURN_NONE;
 }
 
@@ -60,16 +58,16 @@ PyObject* THPModule_isEnabledTorchFunction(PyObject* self, PyObject* unused) {
   }
 }
 
-static PyMethodDef DisableTorchFunctionSubclass_methods[] = { // NOLINT
-    {"__enter__", DisableTorchFunctionSubclass__enter, METH_NOARGS, nullptr},
-    {"__exit__", DisableTorchFunctionSubclass__exit, METH_VARARGS, nullptr},
+static PyMethodDef DisableTorchFunction_methods[] = { // NOLINT
+    {"__enter__", DisableTorchFunction__enter, METH_NOARGS, nullptr},
+    {"__exit__", DisableTorchFunction__exit, METH_VARARGS, nullptr},
     {nullptr, nullptr, 0, nullptr}};
 
-PyTypeObject DisableTorchFunctionSubclassType = {
+PyTypeObject DisableTorchFunctionType = {
     PyVarObject_HEAD_INIT(
         nullptr,
-        0) "torch._C.DisableTorchFunctionSubclass", /* tp_name */
-    sizeof(DisableTorchFunctionSubclass), /* tp_basicsize */
+        0) "torch._C.DisableTorchFunction", /* tp_name */
+    sizeof(DisableTorchFunction), /* tp_basicsize */
     0, /* tp_itemsize */
     nullptr, /* tp_dealloc */
     0, /* tp_vectorcall_offset */
@@ -94,7 +92,7 @@ PyTypeObject DisableTorchFunctionSubclassType = {
     0, /* tp_weaklistoffset */
     nullptr, /* tp_iter */
     nullptr, /* tp_iternext */
-    DisableTorchFunctionSubclass_methods, /* tp_methods */
+    DisableTorchFunction_methods, /* tp_methods */
     nullptr, /* tp_members */
     nullptr, /* tp_getset */
     nullptr, /* tp_base */
@@ -107,12 +105,12 @@ PyTypeObject DisableTorchFunctionSubclassType = {
     PyType_GenericNew, /* tp_new */
 };
 
-PyObject* THPModule_DisableTorchFunctionSubclassType() {
-  if (PyType_Ready(&DisableTorchFunctionSubclassType) < 0) {
+PyObject* THPModule_DisableTorchFunctionType() {
+  if (PyType_Ready(&DisableTorchFunctionType) < 0) {
     return nullptr;
   }
 
-  return (PyObject*)(&DisableTorchFunctionSubclassType);
+  return (PyObject*)(&DisableTorchFunctionType);
 }
 
 PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* a) {
diff --git a/torch/csrc/utils/disable_torch_function.h b/torch/csrc/utils/disable_torch_function.h
index 881a7adb13ebf..3cdc33e90681b 100644
--- a/torch/csrc/utils/disable_torch_function.h
+++ b/torch/csrc/utils/disable_torch_function.h
@@ -29,7 +29,7 @@ struct DisableTorchDispatch {
 } // namespace torch
 
 PyObject* THPModule_isEnabledTorchFunction(PyObject* self, PyObject* unused);
-PyObject* THPModule_DisableTorchFunctionSubclassType();
+PyObject* THPModule_DisableTorchFunctionType();
 PyObject* THPModule_disable_torch_function(PyObject* self, PyObject* args);
 PyObject* THPModule_disable_torch_dispatch(PyObject* self, PyObject* args);
 PyObject* THPModule_has_torch_function(PyObject*, PyObject* arg);
diff --git a/torch/distributed/_shard/common_op_utils.py b/torch/distributed/_shard/common_op_utils.py
index 42d65923a5365..08aa13282abcd 100644
--- a/torch/distributed/_shard/common_op_utils.py
+++ b/torch/distributed/_shard/common_op_utils.py
@@ -53,11 +53,11 @@ def tensor_default_op(types, args=(), kwargs=None, pg=None):
         Handles ``__torch_function__`` dispatch for the default tensor ops that
         behave the same as ``torch.Tensor`` such as ``torch.Tensor.shape`` or
         ``torch.Tensor.dtype``. We simply lower to the real op call with
-        DisableTorchFunctionSubclass context like ``torch.Tensor.__torch_function__``
+        DisableTorchFunction context like ``torch.Tensor.__torch_function__``
         to avoid recursions.
         """
         if kwargs is None:
             kwargs = {}
 
-        with torch._C.DisableTorchFunctionSubclass():
+        with torch._C.DisableTorchFunction():
             return op(*args, **kwargs)
diff --git a/torch/distributed/_shard/partial_tensor.py b/torch/distributed/_shard/partial_tensor.py
index 6a48163082c5e..dc8d09bdd7f30 100644
--- a/torch/distributed/_shard/partial_tensor.py
+++ b/torch/distributed/_shard/partial_tensor.py
@@ -236,7 +236,7 @@ def find_process_group(e):
         # Need to disable all dispatch to print args and kwargs appropriately.
         guard = torch._C._DisableTorchDispatch()  # type: ignore[attr-defined]
         try:
-            with torch._C.DisableTorchFunctionSubclass():
+            with torch._C.DisableTorchFunction():
                 raise RuntimeError(
                     f"torch function '{func.__name__}', with args: {args} and "
                     f"kwargs: {kwargs} not supported for PartialTensor!")
diff --git a/torch/distributed/_shard/replicated_tensor.py b/torch/distributed/_shard/replicated_tensor.py
index e3db6b0fac664..1327f89e00aaf 100644
--- a/torch/distributed/_shard/replicated_tensor.py
+++ b/torch/distributed/_shard/replicated_tensor.py
@@ -109,7 +109,7 @@ def dispatch_arg(arg):
         # We cann't do super().__torch_function__() as it implicitly convert the result
         # back to tensor subclasses, where in our case, we need to control the output type
         # base on the inter-op rules we defined.
-        with torch._C.DisableTorchFunctionSubclass():
+        with torch._C.DisableTorchFunction():
             rs = func(*args, **kwargs)
             if func in get_default_nowrap_functions():
                 return rs
@@ -157,7 +157,7 @@ def validate(self) -> bool:
         return True
 
     def __setstate__(self, state):
-        with torch._C.DisableTorchFunctionSubclass():
+        with torch._C.DisableTorchFunction():
             self.data = state
             self.requires_grad = state.requires_grad
             from torch.distributed._shard.api import _get_current_process_group
diff --git a/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py b/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
index 9ed83ee33f619..e52c29238a62b 100644
--- a/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
+++ b/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
@@ -203,7 +203,7 @@ def tensor_requires_grad_set(types, args=(), kwargs=None, pg=None):
         local_shard.tensor.requires_grad_(requires_grad)
 
         # update the wrapper class property
-    with torch._C.DisableTorchFunctionSubclass():
+    with torch._C.DisableTorchFunction():
         self_st.requires_grad_(requires_grad)
     # update the metadata in the meanwhile
     self_st._metadata.tensor_properties.requires_grad = requires_grad
diff --git a/torch/masked/maskedtensor/core.py b/torch/masked/maskedtensor/core.py
index 0459f24587bd7..3274ef2ef9569 100644
--- a/torch/masked/maskedtensor/core.py
+++ b/torch/masked/maskedtensor/core.py
@@ -270,7 +270,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
 
         if not all(issubclass(cls, t) for t in types):
             return NotImplemented
-        with torch._C.DisableTorchFunctionSubclass():
+        with torch._C.DisableTorchFunction():
             ret = func(*args, **kwargs)
             if func in get_default_nowrap_functions():
                 return ret

From dc5e3c7b1cef3a4f824ff6272b79659fed652252 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Fri, 11 Nov 2022 21:11:12 +0000
Subject: [PATCH 0806/1922] [fix] allow saving python attr on Tensor and
 Parameter via torch.save (#81616)

Fixes: https://github.com/pytorch/pytorch/issues/72129

TODO:
* [x] Fix for Parameter

Benchmark
(Measurable diff for small tensors)
```
[-------------- Save and Load --------------]
                    |  After PR  |  Before PR
1 threads: ----------------------------------
      ()            |    111.7   |     106.9
      (4, 4)        |    114.4   |     109.2
      (128, 128)    |    135.2   |     128.3
      (1024, 1024)  |   1431.9   |    1431.3

Times are in microseconds (us).
```

<details>

<summary> Benchmark Script </summary>

```python
import torch
from torch.testing._internal.common_utils import BytesIOContext
from torch.utils import benchmark
import pickle

shapes = ((), (4, 4), (128, 128), (1024, 1024))

sizes = [1, 64, 1024, 10000]
results = []

def save_load_fn(t):
    with BytesIOContext() as f:
        torch.save(t, f)
        f.seek(0)
        torch.load(f)

for shape in shapes:
    t = torch.randn(shape)
    label = 'Save and Load'
    sub_label = f'{shape}'
    results.append(benchmark.Timer(
        stmt='save_load_fn(t)',
        globals={'t': t, 'save_load_fn':save_load_fn},
        label=label,
        sub_label=sub_label,
        description='Before PR',
    ).blocked_autorange(min_run_time=2))

compare = benchmark.Compare(results)
compare.print()

with open('before_pr.pkl', 'wb') as f:
    pickle.dump(results, f)

# with open('after_pr.pkl', 'rb') as f:
#     after_pr = pickle.load(f)

# with open('before_pr.pkl', 'rb') as f:
#     before_pr = pickle.load(f)

# compare = benchmark.Compare(after_pr + before_pr)
# compare.print()
```

</details>

NOTE : **BC-Breaking** : After this PR, all tensors (also regular tensors) will be serialised using `_rebuild_from_type_v2`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/81616
Approved by: https://github.com/albanD, https://github.com/kurtamohler
---
 test/test_serialization.py                 | 22 +++++++
 torch/_tensor.py                           | 43 ++-----------
 torch/_utils.py                            | 59 ++++++++++++++++++
 torch/_weights_only_unpickler.py           |  4 ++
 torch/csrc/jit/serialization/unpickler.cpp | 71 ++++++++++++++++++++++
 torch/csrc/jit/serialization/unpickler.h   |  4 ++
 torch/nn/parameter.py                      |  1 +
 7 files changed, 165 insertions(+), 39 deletions(-)

diff --git a/test/test_serialization.py b/test/test_serialization.py
index 5ccc6f47b4c5d..dca926be60e70 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -905,6 +905,28 @@ def test_meta_serialization(self, weights_only):
 
         self.assertEqual(state['weight'].size(), big_model.weight.size())
 
+    def test_serialization_python_attr(self):
+        def _test_save_load_attr(t):
+            t.foo = 'foo'
+            t.pi = 3.14
+
+            with BytesIOContext() as f:
+                torch.save(t, f)
+                f.seek(0)
+                loaded_t = torch.load(f)
+
+            self.assertEqual(t, loaded_t)
+            self.assertEqual(t.foo, loaded_t.foo)
+            self.assertEqual(t.pi, loaded_t.pi)
+
+        t = torch.zeros(3, 3)
+        _test_save_load_attr(t)
+        # This should start failing once Parameter
+        # supports saving Python Attribute.
+        err_msg = "'Parameter' object has no attribute"
+        with self.assertRaisesRegex(AttributeError, err_msg):
+            _test_save_load_attr(torch.nn.Parameter(t))
+
     def test_weights_only_assert(self):
         class HelloWorld:
             def __reduce__(self):
diff --git a/torch/_tensor.py b/torch/_tensor.py
index 793034bb64ede..39fc56452f5a4 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -55,9 +55,6 @@ def _rebuild_from_type(func, type, args, dict):
 
 
 def _rebuild_from_type_v2(func, new_type, args, state):
-    if new_type is Tensor:
-        return func(*args)
-
     ret = func(*args)
     if type(ret) is not new_type:
         ret = ret.as_subclass(new_type)
@@ -70,21 +67,7 @@ def _rebuild_from_type_v2(func, new_type, args, state):
     ):
         ret.__setstate__(state)
     else:
-        if isinstance(state, tuple):
-            if not len(state) == 2:
-                raise RuntimeError(f"Invalid serialized state: {state}")
-            dict_state = state[0]
-            slots_state = state[1]
-        else:
-            dict_state = state
-            slots_state = None
-
-        for k, v in dict_state.items():
-            setattr(ret, k, v)
-
-        if slots_state:
-            for k, v in slots_state.items():
-                setattr(ret, k, v)
+        ret = torch._utils._set_obj_state(ret, state)
     return ret
 
 
@@ -223,31 +206,13 @@ def __deepcopy__(self, memo):
             return new_tensor
 
     def __reduce_ex__(self, proto):
-        if type(self) is Tensor:
+        state = torch._utils._get_obj_state(self)
+        if type(self) is Tensor and not state:
+            # Fast path for regular tensor without Python state.
             return self._reduce_ex_internal(proto)
         if has_torch_function_unary(self):
             return handle_torch_function(Tensor.__reduce_ex__, (self,), self, proto)
         func, args = self._reduce_ex_internal(proto)
-        # Get the state of the python subclass
-        # This loosely mimicks the function on the object class but since Tensor do not inherit
-        # from it, we cannot call that function directly
-        # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891
-        getstate_fn = getattr(self, "__getstate__", None)
-        if getstate_fn:
-            state = getstate_fn()
-        else:
-            slots_to_save = copyreg._slotnames(self.__class__)  # type: ignore[attr-defined]
-            if slots_to_save:
-                state = (
-                    self.__dict__,
-                    {
-                        name: getattr(self, name)
-                        for name in slots_to_save
-                        if hasattr(self, name)
-                    },
-                )
-            else:
-                state = self.__dict__
         return (_rebuild_from_type_v2, (func, type(self), args, state))
 
     def storage(self):
diff --git a/torch/_utils.py b/torch/_utils.py
index 3bc8a749b3e66..9c646a2f85e0c 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -1,3 +1,4 @@
+import copyreg
 import sys
 import traceback
 import warnings
@@ -335,6 +336,64 @@ def _rebuild_parameter(data, requires_grad, backward_hooks):
     return param
 
 
+# TODO(kshitij12345): Support serializing nn.Parameter with Python Attributes.
+# NOTE: We are just defining it here now for future use.
+def _rebuild_parameter_with_state(data, requires_grad, backward_hooks, state):
+    param = torch.nn.Parameter(data, requires_grad)
+    # NB: This line exists only for backwards compatibility; the
+    # general expectation is that backward_hooks is an empty
+    # OrderedDict.  See Note [Don't serialize hooks]
+    param._backward_hooks = backward_hooks
+
+    # Restore state on Parameter like python attr.
+    param = _set_obj_state(param, state)
+    return param
+
+
+def _get_obj_state(obj):
+    # Get the state of the python subclass
+    # This loosely mimicks the function on the object class but since Tensor do not inherit
+    # from it, we cannot call that function directly
+    # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891
+    getstate_fn = getattr(obj, "__getstate__", None)
+    if getstate_fn:
+        state = getstate_fn()
+    else:
+        slots_to_save = copyreg._slotnames(obj.__class__)  # type: ignore[attr-defined]
+        if slots_to_save:
+            state = (
+                obj.__dict__,
+                {
+                    name: getattr(obj, name)
+                    for name in slots_to_save
+                    if hasattr(obj, name)
+                },
+            )
+        else:
+            state = obj.__dict__
+
+    return state
+
+
+def _set_obj_state(obj, state):
+    if isinstance(state, tuple):
+        if not len(state) == 2:
+            raise RuntimeError(f"Invalid serialized state: {state}")
+        dict_state = state[0]
+        slots_state = state[1]
+    else:
+        dict_state = state
+        slots_state = None
+
+    for k, v in dict_state.items():
+        setattr(obj, k, v)
+
+    if slots_state:
+        for k, v in slots_state.items():
+            setattr(obj, k, v)
+    return obj
+
+
 def _import_dotted_name(name):
     components = name.split(".")
     obj = __import__(components[0])
diff --git a/torch/_weights_only_unpickler.py b/torch/_weights_only_unpickler.py
index ee00db937fc3d..acc3554768b0b 100644
--- a/torch/_weights_only_unpickler.py
+++ b/torch/_weights_only_unpickler.py
@@ -103,6 +103,10 @@ def _get_allowed_globals():
         torch._utils._rebuild_sparse_csr_tensor,
     ]:
         rc[f"torch._utils.{f.__name__}"] = f
+
+    # Handles Tensor Subclasses, Tensor's with attributes.
+    # NOTE: It calls into above rebuild functions for regular Tensor types.
+    rc["torch._tensor._rebuild_from_type_v2"] = torch._tensor._rebuild_from_type_v2
     return rc
 
 
diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp
index f7e974919f03d..4bbf7a783a232 100644
--- a/torch/csrc/jit/serialization/unpickler.cpp
+++ b/torch/csrc/jit/serialization/unpickler.cpp
@@ -532,6 +532,21 @@ PickleOpCode Unpickler::readInstruction() {
       }
       stack_.emplace_back(std::move(tensor));
     } break;
+    case PickleOpCode::SETITEM: {
+      // At this OpCode, stack looks like
+      // | Stack Bottom |
+      // | ......       |
+      // | Dict         | -> (stack_size - 3)
+      // | Key          | -> (stack_size - 2)
+      // | Value        | -> (stack_size - 1)
+      auto stack_size = stack_.size();
+      auto dict_pos = stack_size - 3;
+      auto key_pos = stack_size - 2;
+      auto val_pos = stack_size - 1;
+      auto dict = stack_.at(dict_pos).toGenericDict();
+      dict.insert_or_assign(stack_.at(key_pos), stack_.at(val_pos));
+      stack_.erase(stack_.begin() + (key_pos), stack_.end());
+    } break;
     default: {
       AT_ERROR(
           "Unknown opcode for unpickling at ",
@@ -546,6 +561,23 @@ PickleOpCode Unpickler::readInstruction() {
 void Unpickler::readGlobal(
     const std::string& module_name,
     const std::string& class_name) {
+  if (this->skip_next_read_global) {
+    // See [NOTE] skip_next_read_global
+    this->skip_next_read_global--;
+    if (this->skip_next_read_global == 1) {
+      // Pass through to the correct handler
+    } else if (this->skip_next_read_global == 0) {
+      // Corresponds to the type of `Tensor` being unpickled
+      if (module_name != "torch" || class_name != "Tensor") {
+        TORCH_WARN(
+            "Trying to load a Subclassed Tensor, it will be converted to at::Tensor in C++");
+      }
+      stack_.emplace_back(int64_t(globals_.size() - 1));
+      return;
+    } else {
+      TORCH_CHECK(false, "INVALID VALUES")
+    }
+  }
   // TODO [unpickler refactor] __main__ isn't used by the pickler anymore, this
   // is only here for bc-compatibility reasons
   if (module_name == "__main__") {
@@ -631,6 +663,12 @@ void Unpickler::readGlobal(
     // Unpickle a tensor
     bool quantized = class_name == "_rebuild_qtensor";
     rebuildTensor(quantized);
+  } else if (
+      module_name == "torch._tensor" &&
+      (class_name == "_rebuild_from_type_v2")) {
+    // Unpickle a Tensor with Python attributes or
+    // a Subclassed Tensor.
+    rebuildTensorFromTypeV2();
   } else if (
       module_name == "torch._utils" && class_name == "_rebuild_sparse_tensor") {
     rebuildSparseTensor();
@@ -849,6 +887,39 @@ void Unpickler::rebuildTensor(bool quantized) {
   });
 }
 
+void Unpickler::rebuildTensorFromTypeV2() {
+  // [NOTE] skip_next_read_global
+  // When rebuilding Tensor with Python Attr or Subclassed Tensor,
+  // we receive `(func, type(self), args, state)` on stack for
+  // `rebuildTensorFromTypeV2`.
+  // Thus next call to readGlobal corresponds to `func` which is
+  // the function to rebuild the base tensor.
+  // The call after `func` to readGlobal corresponds to `type` of the
+  // Tensor where we raise warning if the type is not `torch.Tensor`.
+  this->skip_next_read_global = 2;
+  auto curr_globals_idx = globals_.size();
+  globals_.emplace_back([this, curr_globals_idx] {
+    // args is a tuple with following data
+    //  (function to rebuild base tensor, type of tensor,
+    //   arguments to construct base tensor, Python State (as dict))
+    auto args = pop(stack_).toTuple();
+    size_t tup_idx = 0;
+    const auto args_elems = args->elements();
+    auto base_tensor_args = args_elems.at(tup_idx + 2).toTuple();
+    auto py_state = args_elems.at(tup_idx + 3).toGenericDict();
+    if (py_state.size() > 0) {
+      TORCH_WARN(
+          "Loading Tensor with Python attributes will return at::Tensor with Python attributes being discarded");
+    }
+    // This calls the function to rebuild the
+    // base tensor.
+    // Eg. `rebuildTensor`, `rebuildSpareTensor`.
+    stack_.emplace_back(base_tensor_args);
+    globals_[curr_globals_idx + 1]();
+    stack_.emplace_back(pop(stack_));
+  });
+}
+
 #ifdef USE_RPC
 void Unpickler::rebuildRRef() {
   globals_.emplace_back([this] {
diff --git a/torch/csrc/jit/serialization/unpickler.h b/torch/csrc/jit/serialization/unpickler.h
index 5411d421a0c57..de00e7eacff21 100644
--- a/torch/csrc/jit/serialization/unpickler.h
+++ b/torch/csrc/jit/serialization/unpickler.h
@@ -120,6 +120,7 @@ class TORCH_API Unpickler {
       const std::string& module_name,
       const std::string& class_name);
   void rebuildTensor(bool quantized);
+  void rebuildTensorFromTypeV2();
   void rebuildSparseTensor();
 #ifdef USE_DISTRIBUTED
   void rebuildRRef();
@@ -176,6 +177,9 @@ class TORCH_API Unpickler {
 
   // See [type tag serialization]
   uint64_t version_;
+
+  // See [NOTE] skip_next_read_global
+  uint8_t skip_next_read_global = 0;
 };
 
 void restoreAccurateTypeTags(const IValue& root, const c10::TypePtr& type_tag);
diff --git a/torch/nn/parameter.py b/torch/nn/parameter.py
index e0f400f2642bf..68908001238ec 100644
--- a/torch/nn/parameter.py
+++ b/torch/nn/parameter.py
@@ -60,6 +60,7 @@ def __repr__(self):
         return 'Parameter containing:\n' + super(Parameter, self).__repr__()
 
     def __reduce_ex__(self, proto):
+        # TODO(kshitij12345): Support saving Python Attribute
         # See Note [Don't serialize hooks]
         return (
             torch._utils._rebuild_parameter,

From 2d36f801520fba040fb508ef4c71b7ec72a27152 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Fri, 11 Nov 2022 21:19:26 +0000
Subject: [PATCH 0807/1922] Fix CUDNN_PATH handling on Windows (#88898)

Fixes https://github.com/pytorch/pytorch/issues/88873
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88898
Approved by: https://github.com/kit1980
---
 torch/utils/cpp_extension.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index aa03da23b38da..720935296504f 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -1686,7 +1686,7 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone):
             extra_ldflags.append(f'/LIBPATH:{_join_cuda_home("lib", "x64")}')
             extra_ldflags.append('cudart.lib')
             if CUDNN_HOME is not None:
-                extra_ldflags.append(os.path.join(CUDNN_HOME, "lib", "x64"))
+                extra_ldflags.append(f'/LIBPATH:{os.path.join(CUDNN_HOME, "lib", "x64")}')
         elif not IS_HIP_EXTENSION:
             extra_ldflags.append(f'-L{_join_cuda_home("lib64")}')
             extra_ldflags.append('-lcudart')

From 2a602e1fcfd3ffcc91210a3eb4c518080404a116 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Thu, 10 Nov 2022 21:19:21 +0000
Subject: [PATCH 0808/1922] [FSDP][state_dict][5/N] Remove the FSDP module
 dependency from _state_dict_utils (#88637)

**What**
This PR completely removes the `FullyShardedDataParallel` dependency from `_state_dict_utils` -- `_state_dict_utils` now depends only on `_FSDPState` and all the utils modules.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88637
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/_init_utils.py       |   6 +-
 torch/distributed/fsdp/_state_dict_utils.py | 108 ++++++++++----------
 2 files changed, 58 insertions(+), 56 deletions(-)

diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 966e61f7fe123..1265ee3578ed4 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -213,10 +213,8 @@ def _init_state_dict_state(state: _FSDPState) -> _FSDPState:
     state._state_dict_type = StateDictType.FULL_STATE_DICT
     state_dict_config: StateDictConfig = FullStateDictConfig()
     state._state_dict_config = state_dict_config
-    full_param_ctx: Optional[Generator] = None
-    # TODO: For composable API, this should be a dict that maps from a module to
-    # handles.
-    state._full_param_ctx = full_param_ctx
+    unshard_params_ctx: Dict[nn.Module, Generator] = {}
+    state._unshard_params_ctx = unshard_params_ctx
     return state
 
 
diff --git a/torch/distributed/fsdp/_state_dict_utils.py b/torch/distributed/fsdp/_state_dict_utils.py
index eee5522340b46..54191cb55ece8 100644
--- a/torch/distributed/fsdp/_state_dict_utils.py
+++ b/torch/distributed/fsdp/_state_dict_utils.py
@@ -8,7 +8,6 @@
 import torch.distributed.algorithms._checkpoint.checkpoint_wrapper as checkpoint_wrapper
 
 # Import the entire FSDP file to avoid circular imports
-import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
 import torch.nn as nn
 import torch.nn.functional as F
 
@@ -42,6 +41,7 @@
 from ._unshard_param_utils import (
     _deregister_orig_params,
     _register_orig_params,
+    _unshard_params,
     FLAT_PARAM,
 )
 from .flat_param import FlatParamHandle
@@ -58,7 +58,9 @@ def _convert_to_wrapped_module_name(module_name: str) -> str:
     return module_name
 
 
-def _param_fqns(module, fsdp_state: _FSDPState) -> Iterator[Tuple[str, str, str]]:
+def _param_fqns(
+    module: nn.Module, fsdp_state: _FSDPState
+) -> Iterator[Tuple[str, str, str]]:
     if not _has_fsdp_params(fsdp_state, module):
         return
     for param_name, module_name in _module_handles(fsdp_state, module)[
@@ -69,7 +71,7 @@ def _param_fqns(module, fsdp_state: _FSDPState) -> Iterator[Tuple[str, str, str]
         yield fqn, param_name, module_name
 
 
-def _shared_param_fqns(module, fsdp_state) -> Iterator[Tuple[str, str, str]]:
+def _shared_param_fqns(module: nn.Module, fsdp_state) -> Iterator[Tuple[str, str, str]]:
     for param_name, module_name in _module_handles(fsdp_state, module)[
         0
     ].shared_parameter_module_names():
@@ -78,7 +80,9 @@ def _shared_param_fqns(module, fsdp_state) -> Iterator[Tuple[str, str, str]]:
         yield fqn, param_name, module_name
 
 
-def _enter_full_param_ctx(
+@no_type_check
+def _enter_unshard_params_ctx(
+    module: nn.Module,
     fsdp_state: _FSDPState,
     recurse: bool = False,
     writeback: bool = False,
@@ -89,32 +93,32 @@ def _enter_full_param_ctx(
     """
     state_dict hooks cannot use the pure context call as the checkpoint flow
     requires to enter the context in the pre-hook but leave the context in the
-    post-hook. This API enters the context of ``summon_full_params``.
+    post-hook. This API enters the context of ``_unshard_params``.
     """
-    assert fsdp_state._full_param_ctx is None, (
-        "Entering the ``summon_full_params`` context but fsdp_state._full_param_ctx "
+    assert module not in fsdp_state._unshard_params_ctx, (
+        "Entering the ``_unshard_params`` context but _unshard_params_ctx[module] "
         "is not None."
     )
-    fsdp_state._full_param_ctx = fsdp_state._summon_full_params(
-        recurse=recurse,
+    fsdp_state._unshard_params_ctx[module] = _unshard_params(
+        module,
+        fsdp_state,
         writeback=writeback,
         rank0_only=rank0_only,
         offload_to_cpu=offload_to_cpu,
         with_grads=with_grads,
     )
-    fsdp_state._full_param_ctx.__enter__()
+    fsdp_state._unshard_params_ctx[module].__enter__()
 
 
 @no_type_check
-def _exit_full_param_ctx(fsdp_state: _FSDPState) -> None:
-    """A helper function to exit ``summon_full_params`` context."""
-    assert fsdp_state._full_param_ctx is not None
-    fsdp_state._full_param_ctx.__exit__(None, None, None)
-    fsdp_state._full_param_ctx = None
+def _exit_unshard_params_ctx(module: nn.Module, fsdp_state: _FSDPState) -> None:
+    """A helper function to exit ``_unshard_params`` context."""
+    fsdp_state._unshard_params_ctx[module].__exit__(None, None, None)
+    fsdp_state._unshard_params_ctx.pop(module)
 
 
 def _common_pre_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -131,16 +135,18 @@ def _common_pre_state_dict_hook(
         _clear_grads_if_needed([_module_handles(fsdp_state, module)[0]])
 
 
-def _common_summon_pre_state_dict_hook(
+def _common_unshard_pre_state_dict_hook(
+    module: nn.Module,
     fsdp_state: _FSDPState,
     offload_to_cpu: bool,
     rank0_only: bool,
 ) -> None:
     """
     Performs the pre-state_dict tasks shared by all state_dict types that require
-    ``summon_full_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this hook.
+    ``_unshard_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this hook.
     """
-    _enter_full_param_ctx(
+    _enter_unshard_params_ctx(
+        module,
         fsdp_state,
         recurse=False,
         writeback=False,
@@ -151,8 +157,8 @@ def _common_summon_pre_state_dict_hook(
 
 # TODO: change to the decorator style. See ``_full_pre_state_dict_hook``.
 @no_type_check
-def _common_summon_post_state_dict_hook(
-    module,
+def _common_unshard_post_state_dict_hook(
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -160,13 +166,13 @@ def _common_summon_post_state_dict_hook(
 ) -> Dict[str, Any]:
     """
     The post-state_dict flow that shared by all state_dict types that require
-    ``summon_full_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this
+    ``_unshard_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this
     hook.
     """
     _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix)
     # Return early for trivial cases
     if not state_dict or not _has_fsdp_params(fsdp_state, module):
-        _exit_full_param_ctx(fsdp_state)
+        _exit_unshard_params_ctx(module, fsdp_state)
         return state_dict
 
     # TODO: Once pre_state_dict hook is supported, this pop should be removed.
@@ -193,7 +199,7 @@ def _common_summon_post_state_dict_hook(
                 f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", ""
             )
             state_dict.pop(f"{prefix}{clean_key}", None)
-        _exit_full_param_ctx(fsdp_state)
+        _exit_unshard_params_ctx(module, fsdp_state)
         return state_dict
 
     # Loop only the parameters saved in this instance's wrapped module to
@@ -214,7 +220,7 @@ def _common_summon_post_state_dict_hook(
         )
 
         param_hook(state_dict, prefix, fqn)
-    _exit_full_param_ctx(fsdp_state)
+    _exit_unshard_params_ctx(module, fsdp_state)
 
     cpu_device = torch.device("cpu")
     buffer_clean_fqns = []
@@ -251,7 +257,7 @@ def _common_summon_post_state_dict_hook(
 
 @no_type_check
 def _full_pre_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -267,7 +273,8 @@ def _full_pre_state_dict_hook(
     in ``nn.Module``.
     """
     _common_pre_state_dict_hook(module, fsdp_state, state_dict, prefix)
-    _common_summon_pre_state_dict_hook(
+    _common_unshard_pre_state_dict_hook(
+        module,
         fsdp_state,
         offload_to_cpu=fsdp_state._state_dict_config.offload_to_cpu,
         rank0_only=cast(FullStateDictConfig, fsdp_state._state_dict_config).rank0_only,
@@ -276,7 +283,7 @@ def _full_pre_state_dict_hook(
 
 @no_type_check
 def _full_post_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -284,7 +291,7 @@ def _full_post_state_dict_hook(
     """
     Hook that runs after model.state_dict() is called before returning result to
     user. For FSDP, we may have to clone the tensors in state_dict as params go
-    back to sharded version after _summon_full_params ends, and also remove
+    back to sharded version after _unshard_params ends, and also remove
     the ``FSDP_WRAPPED_MODULE`` prefix.
     """
     # TODO: remove the hack. See ``_full_pre_state_dict_hook``.
@@ -303,8 +310,7 @@ def param_hook(
         if clean_key.startswith(clean_prefix):
             clean_key = clean_key[len(clean_prefix) :]
 
-        # Clone non-ignored parameters before exiting the
-        # `_summon_full_params()` context
+        # Clone non-ignored parameters before exiting the `_unshard_params()` context.
         if clean_key not in fsdp_state._ignored_param_names and not getattr(
             state_dict[fqn], "_has_been_cloned", False
         ):
@@ -320,30 +326,30 @@ def param_hook(
                     f"implementation of {fqn}. Error: {str(e)}"
                 )
 
-    return _common_summon_post_state_dict_hook(
+    return _common_unshard_post_state_dict_hook(
         module, fsdp_state, state_dict, prefix, param_hook
     )
 
 
 def _full_pre_load_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
 ) -> None:
     _lazy_init(fsdp_state, module)
-    _enter_full_param_ctx(fsdp_state, recurse=False, writeback=True)
+    _enter_unshard_params_ctx(module, fsdp_state, recurse=False, writeback=True)
     _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
 
 
 def _full_post_load_state_dict_hook(
-    module, fsdp_state: _FSDPState, *args, **kwargs
+    module: nn.Module, fsdp_state: _FSDPState, *args, **kwargs
 ) -> None:
-    _exit_full_param_ctx(fsdp_state)
+    _exit_unshard_params_ctx(module, fsdp_state)
 
 
 def _local_pre_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -366,7 +372,7 @@ def _local_pre_state_dict_hook(
 
 @no_type_check
 def _local_post_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -408,13 +414,13 @@ def _local_post_state_dict_hook(
 
 
 def _local_post_load_state_dict_hook(
-    module, fsdp_state: _FSDPState, *args, **kwargs
+    module: nn.Module, fsdp_state: _FSDPState, *args, **kwargs
 ) -> None:
     pass
 
 
 def _local_pre_load_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -457,7 +463,7 @@ def _local_pre_load_state_dict_hook(
 
 
 def _sharded_pre_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -477,7 +483,8 @@ def _sharded_pre_state_dict_hook(
     _common_pre_state_dict_hook(module, fsdp_state, state_dict, prefix)
     # Setting offload_to_cpu here does not work even if offload_to_cpu is True.
     # We have to create ShardedTensor first then move it to CPU.
-    _common_summon_pre_state_dict_hook(
+    _common_unshard_pre_state_dict_hook(
+        module,
         fsdp_state,
         offload_to_cpu=False,
         rank0_only=False,
@@ -486,7 +493,7 @@ def _sharded_pre_state_dict_hook(
 
 @no_type_check
 def _sharded_post_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -512,14 +519,14 @@ def param_hook(state_dict: Dict[str, Any], prefix: str, fqn: str):
             sharded_tensor = sharded_tensor.cpu()
         state_dict[fqn] = sharded_tensor
 
-    return _common_summon_post_state_dict_hook(
+    return _common_unshard_post_state_dict_hook(
         module, fsdp_state, state_dict, prefix, param_hook
     )
 
 
 @no_type_check
 def _sharded_post_load_state_dict_hook(
-    module, fsdp_state: _FSDPState, *args, **kwargs
+    module: nn.Module, fsdp_state: _FSDPState, *args, **kwargs
 ) -> None:
     if fsdp_state._use_orig_params:
         _register_orig_params(module, fsdp_state)
@@ -527,7 +534,7 @@ def _sharded_post_load_state_dict_hook(
 
 @no_type_check
 def _sharded_pre_load_state_dict_hook(
-    module,
+    module: nn.Module,
     fsdp_state: _FSDPState,
     state_dict: Dict[str, Any],
     prefix: str,
@@ -636,9 +643,8 @@ def _post_state_dict_hook(
         StateDictType.LOCAL_STATE_DICT: _local_post_state_dict_hook,
         StateDictType.SHARDED_STATE_DICT: _sharded_post_state_dict_hook,
     }
-    fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
     processed_state_dict = _post_state_dict_hook_fn[fsdp_state._state_dict_type](
-        fsdp_module, fsdp_state, state_dict, prefix
+        module, fsdp_state, state_dict, prefix
     )
     return processed_state_dict
 
@@ -664,12 +670,11 @@ def _pre_load_state_dict_hook(
         StateDictType.SHARDED_STATE_DICT: _sharded_pre_load_state_dict_hook,
     }
     # Code that is common for all state_dict impls
-    fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
     if torch.cuda.is_available():
         torch.cuda.synchronize()
     # Dispatch into state_dict specific implementation of pre-hook.
     _pre_load_state_dict_hook_fn[fsdp_state._state_dict_type](
-        fsdp_module, fsdp_state, state_dict, prefix
+        module, fsdp_state, state_dict, prefix
     )
 
 
@@ -684,7 +689,6 @@ def _post_load_state_dict_hook(module: nn.Module, *args: Any) -> None:
         StateDictType.SHARDED_STATE_DICT: _sharded_post_load_state_dict_hook,
     }
     # Code that is common for all state_dict impls
-    fsdp_module = cast(fsdp_file.FullyShardedDataParallel, module)
     # Dispatch into state_dict type specific implementation of post-hook for
     # loading state_dict.
-    _post_load_state_dict_hook_fn[fsdp_state._state_dict_type](fsdp_module, fsdp_state)
+    _post_load_state_dict_hook_fn[fsdp_state._state_dict_type](module, fsdp_state)

From b6b4023f894971d511303fd053e25bbf0986260b Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@fb.com>
Date: Fri, 11 Nov 2022 21:51:10 +0000
Subject: [PATCH 0809/1922] Fix unused variable 'options' warning in RNN.cpp
 (#88753)

Fixes
```
/home/rbarnes/pytorch/aten/src/ATen/native/cudnn/RNN.cpp:73:17: warning: unused variable 'options' [-Wunused-variable]
  TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory);
                ^
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88753
Approved by: https://github.com/soumith
---
 aten/src/ATen/native/cudnn/RNN.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp
index c08c5d26b63c7..426243392b6fc 100644
--- a/aten/src/ATen/native/cudnn/RNN.cpp
+++ b/aten/src/ATen/native/cudnn/RNN.cpp
@@ -70,7 +70,7 @@ Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_see
     c10::optional<Device> device,
     c10::optional<bool> pin_memory) {
   // See [Note: hacky wrapper removal for TensorOptions]
-  TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory);
+  TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory);
 
   AT_ERROR("_cudnn_init_dropout_state: ATen not compiled with cuDNN support");
 }

From b02fc4c66815b36874f1fac194ab5ec9257bca10 Mon Sep 17 00:00:00 2001
From: efiks <5167930+efiks@users.noreply.github.com>
Date: Fri, 11 Nov 2022 21:58:23 +0000
Subject: [PATCH 0810/1922] [tourch] BatchBoxCox - fix numerical issue in
 vectorized code (#88875)

Summary:
Usage of fast math in BatchBoxCox kernel provided different math results between dev and optimized versions which cause few internal test to fail.
For now disabling the compiler optimized version and relying on ATEN vectors

Differential Revision: D41211784

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88875
Approved by: https://github.com/hyuen
---
 caffe2/perfkernels/batch_box_cox_avx2.cc | 93 ++++++++++++++----------
 1 file changed, 53 insertions(+), 40 deletions(-)

diff --git a/caffe2/perfkernels/batch_box_cox_avx2.cc b/caffe2/perfkernels/batch_box_cox_avx2.cc
index 8b93293646dba..6171b5bfd0326 100644
--- a/caffe2/perfkernels/batch_box_cox_avx2.cc
+++ b/caffe2/perfkernels/batch_box_cox_avx2.cc
@@ -1,3 +1,4 @@
+#include <immintrin.h>
 #ifdef CAFFE2_PERF_USE_MKL
 #include <c10/util/irange.h>
 #include <caffe2/perfkernels/common.h>
@@ -5,30 +6,68 @@
 
 #include "vectorizer.h"
 
-#ifndef VECTORIZED_KERNEL
+// Enable compiler vectorized version only if numerical consistency is not
+// required between dev and opt versions - disabled for now
+#ifndef FAST_VECTORIZED_KERNEL
 #define CPU_CAPABILITY_AVX2
 #include <ATen/cpu/vec/vec.h>
 
 namespace at::vec {
 
+// Implements the vectorized version of std::max() operation,
+// which DOESNOT propagates NaN for second argument
 template <typename scalar_t>
 Vectorized<scalar_t> max(const Vectorized<scalar_t>& a, const Vectorized<scalar_t>& b);
 
-// Implements the vectorized version of std::max() operation,
-// which DOESNOT propagates NaN for second argument
 template <>
 Vectorized<double> max(const Vectorized<double>& a, const Vectorized<double>& b) {
   // std::max(NaN, nonNan) -> NaN
   return _mm256_max_pd(b, a);
 }
 
-
 template <>
 Vectorized<float> max(const Vectorized<float>& a, const Vectorized<float>& b) {
   // std::max(NaN, nonNan) -> NaN
   return _mm256_max_ps(b, a);
 }
 
+// Implements recieprocal method based on newton-rapson method
+// 1. user RCP approximiation
+// 2. update with RCP = RCP * (2 - X * RCP)
+template <typename scalar_t>
+Vectorized<scalar_t> fast_recieprocal(const Vectorized<scalar_t>& b);
+template <typename scalar_t>
+scalar_t fast_recieprocal(scalar_t b);
+
+template<>
+Vectorized<float> fast_recieprocal(const Vectorized<float>& b) {
+  auto minus2 = _mm256_set1_ps(-2.f);
+  auto rcp = _mm256_rcp_ps(b);
+  rcp = _mm256_mul_ps(rcp,  _mm256_fnmsub_ps(rcp, b, minus2));
+  rcp = _mm256_mul_ps(rcp,  _mm256_fnmsub_ps(rcp, b, minus2));
+  return rcp;
+}
+
+template <>
+float fast_recieprocal(float b) {
+  auto minus2 = _mm_set_ss(-2.f);
+  auto b_reg = _mm_set_ss(b);
+  auto rcp = _mm_rcp_ss(b_reg);
+  rcp = _mm_mul_ss(rcp,  _mm_fnmsub_ss(rcp, b_reg, minus2));
+  rcp = _mm_mul_ss(rcp,  _mm_fnmsub_ss(rcp, b_reg, minus2));
+  return _mm_cvtss_f32(rcp);
+}
+
+template<>
+Vectorized<double> fast_recieprocal(const Vectorized<double>& b) {
+  return b.reciprocal();
+}
+
+template <>
+double fast_recieprocal(double b) {
+  return 1./b;
+}
+
 }
 #endif
 
@@ -45,14 +84,6 @@ template <typename T>
 void PackV(const int N, const T* a, const int* ia, T* y);
 template <typename T>
 void UnpackV(const int N, const T* a, T* y, const int* iy);
-template <typename T>
-void Pow(const int N, const T* a, const T* b, T* y);
-template <typename T>
-void Add(const int N, const T* a, const T* b, T* y);
-template <typename T>
-void Div(const int N, const T* a, const T* b, T* y);
-template <typename T>
-void Ln(const int N, const T* a, T* y);
 
 #define DELEGATE_PACKV_FUNCTION(T, OriginalFunc)                \
   template <>                                                   \
@@ -72,29 +103,7 @@ DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV)
 DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV)
 #undef DELEGATE_UNPACKV_FUNCTION
 
-#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \
-  template <>                                                      \
-  void Funcname<T>(const int N, const T* a, const T* b, T* y) {    \
-    OriginalFunc(N, a, b, y);                                      \
-  }
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Pow, vsPow)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Pow, vdPow)
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd)
-DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv)
-DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv)
-#undef DELEGATE_SIMPLE_BINARY_FUNCTION
-
-#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc) \
-  template <>                                                     \
-  void Funcname<T>(const int N, const T* a, T* y) {               \
-    OriginalFunc(N, a, y);                                        \
-  }
-DELEGATE_SIMPLE_UNARY_FUNCTION(float, Ln, vsLn)
-DELEGATE_SIMPLE_UNARY_FUNCTION(double, Ln, vdLn)
-#undef DELEGATE_SIMPLE_UNARY_FUNCTION
-
-#ifndef VECTORIZED_KERNEL
+#ifndef FAST_VECTORIZED_KERNEL
 template <typename T>
 void box_cox_zero_lambda(
     size_t D,
@@ -140,7 +149,7 @@ void box_cox_nonzero_lambda(
     auto sum = data + lambda2;
     auto max = at::vec::max(sum, k_eps_vec);
     auto lambda1 = Vec::loadu(lambda1_ptr + j);
-    auto lambda_over_1 = lambda1.reciprocal();
+    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
     auto pow = max.pow(lambda1);
     auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
     res.store(out + j);
@@ -148,7 +157,7 @@ void box_cox_nonzero_lambda(
   for ( ;j < D; ++j) {
     auto sum = data_ptr[j] + lambda2_ptr[j];
     auto max = std::max(sum, k_eps);
-    auto lambda_over_1 = 1 / lambda1_ptr[j];
+    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
     auto pow = std::pow(max, lambda1_ptr[j]);
     out[j] = pow * lambda_over_1 - lambda_over_1;
   }
@@ -181,12 +190,16 @@ void box_cox_nonzero_lambda(
     FAST_MATH
     auto sum = data_ptr[j] + lambda2_ptr[j];
     auto max = std::max(sum, k_eps);
-    auto lambda_over_1 = 1 / lambda1_ptr[j];
-    auto pow = std::pow(max, lambda1_ptr[j]);
+    auto lamda1 = lambda1_ptr[j];
+    auto lambda_over_1 = 1 / lamda1;
+    if constexpr (std::is_same<T, float>::value) {
+      lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lamda1);
+      lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lamda1);
+    }
+    auto pow = std::pow(max, lamda1);
     out[j] = pow * lambda_over_1 - lambda_over_1;
   }
 }
-
 #endif
 
 template <typename T>

From 0c2fcf5069879a13153daebb424d698a15e5783c Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Tue, 8 Nov 2022 10:22:32 -0800
Subject: [PATCH 0811/1922] [ONNX] Pretty print diagnostic logging (#88261)

Adds pretty print diagnostic logging. For example
```python
import io
import torch
from torch.onnx._internal import diagnostics

class CustomAdd(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, y):
        return x + y

    @staticmethod
    def symbolic(g, x, y):
        return g.op("custom::CustomAdd", x, y)

class M(torch.nn.Module):
    def forward(self, x):
        return CustomAdd.apply(x, x)

# trigger warning for missing shape inference.
# rule = diagnostics.rules.node_missing_onnx_shape_inference
torch.onnx.export(M(), torch.randn(3, 4), io.BytesIO())
```

By default, observe minimum summary of diagnostics
```
========= Diagnostic Run torch.onnx.export version 1.14.0a0+git90a69c5 =========
verbose: False, log level: Level.ERROR
======================= 0 NONE 0 NOTE 3 WARNING 0 ERROR ========================
3 WARNING were not printed due to the log level.
```

Adjusting the `verbose` and `level` argument.
```python
diagnostics.engine.pretty_print(verbose=True, level=diagnostics.levels.WARNING)
```

Prints full log.
```
=============================== 1 Diagnostic Run ===============================
========= Diagnostic Run torch.onnx.export version 1.14.0a0+git90a69c5 =========
verbose: True, log level: Level.WARNING
======================= 0 NONE 0 NOTE 3 WARNING 0 ERROR ========================
WARNING: node-missing-onnx-shape-inference
==========================================
The shape inference of custom::CustomAdd type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function.
--------------------------- Stack: Python call stack ---------------------------
frame: diagnostic = ExportDiagnostic(rule, level, message, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/diagnostics/_diagnostic.py:151
frame: n, utils._params_dict, GLOBALS.export_onnx_opset_version /home/bowbao/pytorch_dev/torch/onnx/_patch_torch.py:82
frame: <@beartype(torch.onnx._patch_torch._graph_op) at 0x7f62184b6710>:78
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: return function(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_deprecation.py:30
frame: return g.op("custom::CustomAdd", x, y) test_pretty_print.py:14
frame: return symbolic_fn(g, *args) /home/bowbao/pytorch_dev/torch/onnx/utils.py:1716
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: graph = _C._jit_pass_onnx(graph, operator_export_type) /home/bowbao/pytorch_dev/torch/onnx/utils.py:663
frame: <@beartype(torch.onnx.utils._optimize_graph) at 0x7f62180e05f0>:85
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: module=module, /home/bowbao/pytorch_dev/torch/onnx/utils.py:1123
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: dynamic_axes=dynamic_axes, /home/bowbao/pytorch_dev/torch/onnx/utils.py:1539
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: export_modules_as_functions=export_modules_as_functions, /home/bowbao/pytorch_dev/torch/onnx/utils.py:519
frame: <@beartype(torch.onnx.utils.export) at 0x7f62180e0170>:347
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: torch.onnx.export(M(), torch.randn(3, 4), io.BytesIO()) test_pretty_print.py:22
---------------------------- Stack: C++ call stack -----------------------------
frame: (<unknown frame>)
frame: (<unknown function> + 0x88411b (0x7f625b36011b in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::UpdateReliable(torch::jit::Value*, std::pair<bool, bool> const&) + 0x7d3 (0x7f625b351743 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::UpdateReliable(torch::jit::Node*) + 0x4f (0x7f625b35198f in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::ONNXShapeTypeInference(torch::jit::Node*, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10::IValue, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, c10::IValue> > > const&, int) + 0xac9 (0x7f625b357179 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown function> + 0xabd026 (0x7f625b599026 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown function> + 0x3c0fda (0x7f625ae9cfda in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown frame>)

WARNING: node-missing-onnx-shape-inference
==========================================
The shape inference of custom::CustomAdd type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function.
--------------------------- Stack: Python call stack ---------------------------
frame: diagnostic = ExportDiagnostic(rule, level, message, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/diagnostics/_diagnostic.py:151
frame: graph, params_dict, GLOBALS.export_onnx_opset_version /home/bowbao/pytorch_dev/torch/onnx/utils.py:688
frame: <@beartype(torch.onnx.utils._optimize_graph) at 0x7f62180e05f0>:85
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: module=module, /home/bowbao/pytorch_dev/torch/onnx/utils.py:1123
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: dynamic_axes=dynamic_axes, /home/bowbao/pytorch_dev/torch/onnx/utils.py:1539
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: export_modules_as_functions=export_modules_as_functions, /home/bowbao/pytorch_dev/torch/onnx/utils.py:519
frame: <@beartype(torch.onnx.utils.export) at 0x7f62180e0170>:347
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: torch.onnx.export(M(), torch.randn(3, 4), io.BytesIO()) test_pretty_print.py:22
---------------------------- Stack: C++ call stack -----------------------------
frame: (<unknown frame>)
frame: (<unknown function> + 0x88411b (0x7f625b36011b in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::UpdateReliable(torch::jit::Value*, std::pair<bool, bool> const&) + 0x7d3 (0x7f625b351743 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::UpdateReliable(torch::jit::Node*) + 0x4f (0x7f625b35198f in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::ONNXShapeTypeInference(torch::jit::Node*, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10::IValue, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, c10::IValue> > > const&, int) + 0xac9 (0x7f625b357179 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown function> + 0x87d6d1 (0x7f625b3596d1 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::ONNXShapeTypeInference(std::shared_ptr<torch::jit::Graph>&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10::IValue, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, c10::IValue> > > const&, int) + 0x33 (0x7f625b359cf3 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown function> + 0xabdbae (0x7f625b599bae in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown function> + 0x3c0fda (0x7f625ae9cfda in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown frame>)

WARNING: node-missing-onnx-shape-inference
==========================================
The shape inference of custom::CustomAdd type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function.
--------------------------- Stack: Python call stack ---------------------------
frame: diagnostic = ExportDiagnostic(rule, level, message, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/diagnostics/_diagnostic.py:151
frame: graph, params_dict, GLOBALS.export_onnx_opset_version /home/bowbao/pytorch_dev/torch/onnx/utils.py:1179
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: dynamic_axes=dynamic_axes, /home/bowbao/pytorch_dev/torch/onnx/utils.py:1539
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: export_modules_as_functions=export_modules_as_functions, /home/bowbao/pytorch_dev/torch/onnx/utils.py:519
frame: <@beartype(torch.onnx.utils.export) at 0x7f62180e0170>:347
frame: return beartyped(*args, **kwargs) /home/bowbao/pytorch_dev/torch/onnx/_internal/_beartype.py:81
frame: torch.onnx.export(M(), torch.randn(3, 4), io.BytesIO()) test_pretty_print.py:22
---------------------------- Stack: C++ call stack -----------------------------
frame: (<unknown frame>)
frame: (<unknown function> + 0x88411b (0x7f625b36011b in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::UpdateReliable(torch::jit::Value*, std::pair<bool, bool> const&) + 0x7d3 (0x7f625b351743 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::UpdateReliable(torch::jit::Node*) + 0x4f (0x7f625b35198f in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::ONNXShapeTypeInference(torch::jit::Node*, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10::IValue, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, c10::IValue> > > const&, int) + 0xac9 (0x7f625b357179 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown function> + 0x87d6d1 (0x7f625b3596d1 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (torch::jit::ONNXShapeTypeInference(std::shared_ptr<torch::jit::Graph>&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10::IValue, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, c10::IValue> > > const&, int) + 0x33 (0x7f625b359cf3 in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown function> + 0xabdbae (0x7f625b599bae in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown function> + 0x3c0fda (0x7f625ae9cfda in /home/bowbao/pytorch_dev/torch/lib/libtorch_python.so))
frame: (<unknown frame>)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88261
Approved by: https://github.com/abock, https://github.com/justinchuby
---
 test/onnx/internal/test_diagnostics.py        |   2 +-
 .../onnx/_internal/diagnostics/_diagnostic.py |  18 +--
 .../_internal/diagnostics/infra/_infra.py     | 110 +++++++++++++++++-
 .../_internal/diagnostics/infra/engine.py     |  15 +++
 .../_internal/diagnostics/infra/formatter.py  |  18 +++
 .../onnx/_internal/diagnostics/infra/utils.py |   2 +-
 6 files changed, 140 insertions(+), 25 deletions(-)

diff --git a/test/onnx/internal/test_diagnostics.py b/test/onnx/internal/test_diagnostics.py
index ea9a789e91c1f..884b7cb1c3880 100644
--- a/test/onnx/internal/test_diagnostics.py
+++ b/test/onnx/internal/test_diagnostics.py
@@ -19,7 +19,7 @@ def _assert_has_diagnostics(
     rule_level_pairs: AbstractSet[Tuple[infra.Rule, infra.Level]],
 ):
     sarif_log = engine.sarif_log()
-    unseen_pairs = {(rule.id, level.value) for rule, level in rule_level_pairs}
+    unseen_pairs = {(rule.id, level.name.lower()) for rule, level in rule_level_pairs}
     actual_results = []
     for run in sarif_log.runs:
         if run.results is None:
diff --git a/torch/onnx/_internal/diagnostics/_diagnostic.py b/torch/onnx/_internal/diagnostics/_diagnostic.py
index 21e44f2b44671..efe5c0e34911c 100644
--- a/torch/onnx/_internal/diagnostics/_diagnostic.py
+++ b/torch/onnx/_internal/diagnostics/_diagnostic.py
@@ -74,22 +74,6 @@ def record_cpp_call_stack(self, frames_to_skip) -> None:
         self.with_stack(stack)
         self.cpp_call_stack = stack
 
-    def with_model_source_location(
-        self: _ExportDiagnostic,
-    ) -> _ExportDiagnostic:
-        # TODO: Implement this.
-        # self.locations.append(...)
-        raise NotImplementedError()
-        return self
-
-    def with_export_source_location(
-        self: _ExportDiagnostic,
-    ) -> _ExportDiagnostic:
-        # TODO: Implement this.
-        # self.locations.append(...)
-        raise NotImplementedError()
-        return self
-
 
 class ExportDiagnosticEngine(infra.DiagnosticEngine):
     """PyTorch ONNX Export diagnostic engine.
@@ -115,7 +99,6 @@ def __init__(self) -> None:
             name="torch.onnx",
             version=torch.__version__,
             diagnostic_type=ExportDiagnostic,
-            options=None,
         )
 
     @property
@@ -150,6 +133,7 @@ def create_export_diagnostic_context():
     try:
         yield context
     finally:
+        context.pretty_print(context.options.log_verbose, context.options.log_level)
         context = engine.background_context
 
 
diff --git a/torch/onnx/_internal/diagnostics/infra/_infra.py b/torch/onnx/_internal/diagnostics/infra/_infra.py
index b8a4c5032f523..3414574cce739 100644
--- a/torch/onnx/_internal/diagnostics/infra/_infra.py
+++ b/torch/onnx/_internal/diagnostics/infra/_infra.py
@@ -17,10 +17,10 @@ class Level(enum.Enum):
     please use infra.Tag instead.
     """
 
-    NONE = "none"
-    NOTE = "note"
-    WARNING = "warning"
-    ERROR = "error"
+    NONE = enum.auto()
+    NOTE = enum.auto()
+    WARNING = enum.auto()
+    ERROR = enum.auto()
 
 
 levels = Level
@@ -107,6 +107,9 @@ def format_message(self, *args, **kwargs) -> str:
         """
         return self.message_default_template.format(*args, **kwargs)
 
+    def pretty_print(self):
+        pass
+
 
 @dataclasses.dataclass
 class Location:
@@ -134,6 +137,25 @@ def sarif(self) -> sarif.Location:
             else None,
         )
 
+    def pretty_print(self):
+        """Prints the location in a human-readable format."""
+        location_strs = ["frame:"]
+        if self.snippet is not None:
+            location_strs.append(self.snippet)
+        if self.uri is not None:
+            line_strs = [self.uri]
+            line_strs.append(str(self.line)) if self.line is not None else "-1"
+            line_strs.append(
+                str(self.start_column)
+            ) if self.start_column is not None else "-1"
+            line_strs.append(
+                str(self.end_column)
+            ) if self.end_column is not None else "-1"
+            location_strs.append(":".join(line_strs))
+        if self.message is not None:
+            location_strs.append(f"({self.message})")
+        print(" ".join(location_strs))
+
 
 @dataclasses.dataclass
 class StackFrame:
@@ -143,6 +165,10 @@ def sarif(self) -> sarif.StackFrame:
         """Returns the SARIF representation of this stack frame."""
         return sarif.StackFrame(location=self.location.sarif())
 
+    def pretty_print(self):
+        """Prints the stack frame in a human-readable format."""
+        self.location.pretty_print()
+
 
 @dataclasses.dataclass
 class Stack:
@@ -158,6 +184,12 @@ def sarif(self) -> sarif.Stack:
             else None,
         )
 
+    def pretty_print(self):
+        """Prints the stack in a human-readable format."""
+        formatter.pretty_print_title(f"Stack: {self.message}", fill_char="-")
+        for frame in self.frames:
+            frame.pretty_print()
+
 
 # This is a workaround for mypy not supporting Self from typing_extensions.
 _Diagnostic = TypeVar("_Diagnostic", bound="Diagnostic")
@@ -182,6 +214,9 @@ def sarif(self) -> sarif.Graph:
             properties=PatchedPropertyBag(name=self.name, description=self.description),
         )
 
+    def pretty_print(self):
+        pass
+
 
 @dataclasses.dataclass
 class Diagnostic:
@@ -201,7 +236,7 @@ def sarif(self) -> sarif.Result:
             message = f"{message}\n{self.additional_message}"
         sarif_result = sarif.Result(
             message=sarif.Message(text=message),
-            level=self.level.value,
+            level=self.level.name.lower(),  # type: ignore[arg-type]
             rule_id=self.rule.id,
         )
         sarif_result.locations = [location.sarif() for location in self.locations]
@@ -235,6 +270,31 @@ def with_additional_message(self: _Diagnostic, message: str) -> _Diagnostic:
             self.additional_message = f"{self.additional_message}\n{message}"
         return self
 
+    def pretty_print(self, verbose: bool = False, log_level: Level = Level.ERROR):
+        """Prints the diagnostics in a human-readable format.
+
+        Args:
+            verbose: If True, prints all information. E.g. stack frames, graphs, etc.
+                Otherwise, only prints compact information. E.g., rule name and display message.
+            level: The minimum level of diagnostics to print.
+        """
+        if self.level.value < log_level.value:
+            return
+        formatter.pretty_print_item_title(f"{self.level.name}: {self.rule.name}")
+        print(self.message)
+
+        if not verbose:
+            print("<Set verbose=True to see more details>\n")
+            return
+
+        for location in self.locations:
+            location.pretty_print()
+        for stack in self.stacks:
+            stack.pretty_print()
+        for graph in self.graphs:
+            graph.pretty_print()
+        print()
+
 
 @dataclasses.dataclass
 class RuleCollection:
@@ -284,12 +344,15 @@ class DiagnosticOptions:
     Options for diagnostic context.
     """
 
+    log_verbose: bool = dataclasses.field(default=False)
+    log_level: Level = dataclasses.field(default=Level.ERROR)
+
 
 @dataclasses.dataclass
 class DiagnosticContext:
     name: str
     version: str
-    options: Optional[DiagnosticOptions] = None
+    options: DiagnosticOptions = dataclasses.field(default_factory=DiagnosticOptions)
     diagnostic_type: Type[Diagnostic] = dataclasses.field(default=Diagnostic)
     diagnostics: List[Diagnostic] = dataclasses.field(init=False, default_factory=list)
     _invocation: Invocation = dataclasses.field(init=False)
@@ -350,3 +413,38 @@ def diagnose(
         diagnostic = self.diagnostic_type(rule, level, message, **kwargs)
         self.add_diagnostic(diagnostic)
         return diagnostic
+
+    def pretty_print(
+        self, verbose: bool = False, log_level: Level = Level.ERROR
+    ) -> None:
+        """Prints the diagnostics in a human-readable format.
+
+        Args:
+            verbose: Whether to print the diagnostics in verbose mode. See Diagnostic.pretty_print.
+            level: The minimum level of diagnostics to print.
+        """
+        formatter.pretty_print_title(
+            f"Diagnostic Run {self.name} version {self.version}"
+        )
+        print(f"verbose: {verbose}, log level: {log_level}")
+        diagnostic_stats = {level: 0 for level in Level}
+        for diagnostic in self.diagnostics:
+            diagnostic_stats[diagnostic.level] += 1
+        formatter.pretty_print_title(
+            " ".join(f"{diagnostic_stats[level]} {level.name}" for level in Level)
+        )
+
+        for diagnostic in self.diagnostics:
+            diagnostic.pretty_print(verbose, log_level)
+
+        unprinted_diagnostic_stats = [
+            (level, count)
+            for level, count in diagnostic_stats.items()
+            if count > 0 and level.value < log_level.value
+        ]
+        if unprinted_diagnostic_stats:
+            print(
+                f"{' '.join(f'{count} {level.name}' for level, count in unprinted_diagnostic_stats)} "
+                "were not printed due to the log level."
+            )
+        print()
diff --git a/torch/onnx/_internal/diagnostics/infra/engine.py b/torch/onnx/_internal/diagnostics/infra/engine.py
index 2678268fbaf9a..51a6057565bba 100644
--- a/torch/onnx/_internal/diagnostics/infra/engine.py
+++ b/torch/onnx/_internal/diagnostics/infra/engine.py
@@ -85,8 +85,23 @@ def create_diagnostic_context(
         Returns:
             A new diagnostic context.
         """
+        if options is None:
+            options = infra.DiagnosticOptions()
         context = infra.DiagnosticContext(
             name, version, options, diagnostic_type=diagnostic_type
         )
         self.contexts.append(context)
         return context
+
+    def pretty_print(
+        self, verbose: bool = False, level: infra.Level = infra.Level.ERROR
+    ) -> None:
+        """Pretty prints all diagnostics in the diagnostic contexts.
+
+        Args:
+            verbose: Whether to print the diagnostics in verbose mode. See Diagnostic.pretty_print.
+            level: The minimum level of diagnostics to print.
+        """
+        formatter.pretty_print_title(f"{len(self.contexts)} Diagnostic Run")
+        for context in self.contexts:
+            context.pretty_print(verbose, level)
diff --git a/torch/onnx/_internal/diagnostics/infra/formatter.py b/torch/onnx/_internal/diagnostics/infra/formatter.py
index 2f35489f8d454..292a2b6a47a5a 100644
--- a/torch/onnx/_internal/diagnostics/infra/formatter.py
+++ b/torch/onnx/_internal/diagnostics/infra/formatter.py
@@ -57,3 +57,21 @@ def sarif_to_json(attr_cls_obj: _SarifClass) -> str:
     dict = dataclasses.asdict(attr_cls_obj)
     dict = _convert_key(dict, _camel_case_to_snake_case)
     return json.dumps(dict, indent=4)
+
+
+def pretty_print_title(title: str, width: int = 80, fill_char: str = "=") -> None:
+    """Pretty prints title in below format:
+
+    ==================== title ====================
+    """
+    print(f" {title} ".center(width, fill_char))
+
+
+def pretty_print_item_title(title: str, fill_char: str = "=") -> None:
+    """Pretty prints title in below format:
+
+    title
+    =====
+    """
+    print(title)
+    print(fill_char * len(title))
diff --git a/torch/onnx/_internal/diagnostics/infra/utils.py b/torch/onnx/_internal/diagnostics/infra/utils.py
index c32de1c6b8ad9..6a85df9104639 100644
--- a/torch/onnx/_internal/diagnostics/infra/utils.py
+++ b/torch/onnx/_internal/diagnostics/infra/utils.py
@@ -6,7 +6,7 @@
 def python_frame(frame: inspect.FrameInfo) -> _infra.StackFrame:
     """Returns a StackFrame for the given inspect.FrameInfo."""
     snippet = (
-        frame.code_context[frame.index]
+        frame.code_context[frame.index].strip()
         if frame.code_context is not None and frame.index is not None
         else None
     )

From cdde8cf2f40c5658309079f8b26b5e27ecad0c1d Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Fri, 11 Nov 2022 22:07:34 +0000
Subject: [PATCH 0812/1922] Make InductorCPU usable in internally (#88870)

Test Plan: `buck2 test mode/opt //caffe2/test:test_inductor -- --exact 'caffe2/test:test_inductor - test_dtype_mismatch_issue_cuda (caffe2.test.inductor.test_torchinductor.CudaTests)'`

Differential Revision: D41206109

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88870
Approved by: https://github.com/izaitsevfb
---
 torch/_inductor/config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 87e2793782be8..8f9f2c4f461dd 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -92,6 +92,7 @@ class cpp:
         "g++-10",
         "clang++",
         "g++",
+        "g++.par",
     )
 
 
From 069d6953aedbe0acc6dd6f8bdb023dc1d0341b76 Mon Sep 17 00:00:00 2001
From: Kevin Tse <ktse@fb.com>
Date: Thu, 10 Nov 2022 18:33:09 -0500
Subject: [PATCH 0813/1922] [DataLoader] Removing DataLoader2 related code
 (#88848)

Removing these lines of code as `DataLoader2` has been added to [TorchData](https://github.com/pytorch/data). I'm importing this to confirm it will not impact internal codes.

Differential Revision: [D41201578](https://our.internmc.facebook.com/intern/diff/D41201578)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88848
Approved by: https://github.com/ejguan
---
 docs/source/data.rst                        |   3 -
 test/test_dataloader.py                     | 111 -----------
 torch/utils/data/__init__.py                |   5 -
 torch/utils/data/communication/__init__.py  |   6 -
 torch/utils/data/communication/eventloop.py |  70 -------
 torch/utils/data/communication/iter.py      | 181 -----------------
 torch/utils/data/communication/map.py       | 159 ---------------
 torch/utils/data/communication/messages.py  |  75 -------
 torch/utils/data/communication/protocol.py  | 205 --------------------
 torch/utils/data/communication/queue.py     |  51 -----
 torch/utils/data/dataloader_experimental.py | 150 --------------
 11 files changed, 1016 deletions(-)
 delete mode 100644 torch/utils/data/communication/__init__.py
 delete mode 100644 torch/utils/data/communication/eventloop.py
 delete mode 100644 torch/utils/data/communication/iter.py
 delete mode 100644 torch/utils/data/communication/map.py
 delete mode 100644 torch/utils/data/communication/messages.py
 delete mode 100644 torch/utils/data/communication/protocol.py
 delete mode 100644 torch/utils/data/communication/queue.py
 delete mode 100644 torch/utils/data/dataloader_experimental.py

diff --git a/docs/source/data.rst b/docs/source/data.rst
index de2d44920f573..b44096d101964 100644
--- a/docs/source/data.rst
+++ b/docs/source/data.rst
@@ -441,9 +441,6 @@ Example::
 .. autoclass:: torch.utils.data.distributed.DistributedSampler
 
 
-.. This module is experimental and should be private, adding it here for now
-.. py:module:: torch.utils.data.communication
-
 .. These modules are documented as part of torch/data listing them here for
 .. now until we have a clearer fix
 .. py:module:: torch.utils.data.datapipes
diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 270ca89764ed1..6a7ff90527d3d 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -20,19 +20,16 @@
     ChainDataset,
     ConcatDataset,
     DataLoader,
-    DataLoader2,
     Dataset,
     IterableDataset,
     IterDataPipe,
     Subset,
     TensorDataset,
-    communication,
     _utils
 )
 from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL
 from torch.utils.data.dataset import random_split
 from torch.utils.data.datapipes.iter import IterableWrapper
-from torch.utils.data.datapipes.map import SequenceWrapper
 from torch._utils import ExceptionWrapper
 from torch.testing._internal.common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS,
                                                   IS_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, slowTest,
@@ -2222,114 +2219,6 @@ def test_excessive_thread_creation_warning(self):
                 r"excessive worker creation might get DataLoader running slow or even freeze"):
             dataloader = DataLoader(self.dataset, batch_size=2, num_workers=1000)
 
-# Define a global function for testing purposes since local functions cannot be pickled
-def identity(x):
-    return x
-
-@unittest.skipIf(
-    TEST_WITH_TSAN,
-    "Fails with TSAN with the following error: starting new threads after multi-threaded "
-    "fork is not supported. Dying (set die_after_fork=0 to override)")
-class TestDataLoader2(TestCase):
-    @skipIfNoDill
-    def test_basics(self):
-        # TODO(VitalyFedyunin): This test will start breaking if we remove guaranteed order
-        # of traversing workers
-        dp = IterableWrapper(list(range(1000))).sharding_filter()
-        dl = DataLoader(dp, batch_size=3, collate_fn=identity, num_workers=2)
-        dl2 = DataLoader2(dp, batch_size=3, collate_fn=identity, num_workers=2)
-        dl2_threading = DataLoader2(dp, batch_size=3, collate_fn=identity, num_workers=2, parallelism_mode='thread')
-        self.assertEqual(list(dl), list(dl2))
-        self.assertEqual(list(dl), list(dl2_threading))
-
-    class Sorter(IterDataPipe):
-        def __init__(self, datapipe):
-            self.datapipe = datapipe
-
-        def __iter__(self):
-            return iter(sorted(self.datapipe))
-
-    def test_shuffle(self):
-        items = list(range(1000))
-        dp = IterableWrapper(items).sharding_filter().shuffle()
-
-        dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=False)
-        self.assertEqual(items, list(dl))
-
-        dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=True)
-        self.assertNotEqual(items, list(dl))
-        self.assertEqual(items, sorted(list(dl)))
-
-        dl = DataLoader2(dp, batch_size=None, num_workers=2, shuffle=True)
-        self.assertNotEqual(items, list(dl))
-        self.assertEqual(items, sorted(list(dl)))
-
-        dl = DataLoader2(self.Sorter(dp), batch_size=None, num_workers=2, shuffle=True)
-        self.assertEqual(list(dl), items)
-
-        dl = DataLoader2(self.Sorter(dp), batch_size=None, num_workers=2, shuffle=True)
-        self.assertEqual(list(dl), items)
-
-
-@unittest.skipIf(
-    TEST_WITH_TSAN,
-    "Fails with TSAN with the following error: starting new threads after multi-threaded "
-    "fork is not supported. Dying (set die_after_fork=0 to override)")
-class TestDataLoader2_EventLoop(TestCase):
-    @skipIfNoDill
-    def test_basic_threading(self):
-        def clean_me(process, req_queue, res_queue):
-            req_queue.put(communication.messages.TerminateRequest())
-            _ = res_queue.get()
-            process.join()
-
-        it = list(range(100))
-        numbers_dp = IterableWrapper(it)
-        (process, req_queue, res_queue, _thread_local_datapipe) = communication.eventloop.SpawnThreadForDataPipeline(numbers_dp)
-
-        process.start()
-        local_datapipe = communication.iter.QueueWrapper(
-            communication.protocol.IterDataPipeQueueProtocolClient(req_queue, res_queue))
-
-        actual = list(local_datapipe)
-        clean_me(process, req_queue, res_queue)
-
-        self.assertEqual(list(range(100)), actual)
-
-    @skipIfNoDill
-    def test_basic_mapdatapipe_threading(self):
-        def clean_me(process, req_queue, res_queue):
-            req_queue.put(communication.messages.TerminateRequest())
-            _ = res_queue.get()
-            process.join()
-
-        input_len = 100
-        it = list(range(input_len))
-        numbers_dp = SequenceWrapper(it)
-        (process, req_queue, res_queue, _thread_local_datapipe) = communication.eventloop.SpawnThreadForDataPipeline(
-            numbers_dp)
-
-        process.start()
-
-        # Functional Test: Ensure that you can retrieve every element from the Queue and DataPipe
-        local_datapipe = communication.map.QueueWrapperForMap(
-            communication.protocol.MapDataPipeQueueProtocolClient(req_queue, res_queue))
-        actual = list(local_datapipe)
-        self.assertEqual([(x, x) for x in range(100)], actual)
-
-        # Functional Test: raise Error when input
-        local_datapipe = communication.map.QueueWrapperForMap(
-            communication.protocol.MapDataPipeQueueProtocolClient(req_queue, res_queue))
-        with self.assertRaisesRegex(IndexError, "out of bound"):
-            local_datapipe[1000]
-
-        # __len__ Test: Ensure that the correct length is returned
-        local_datapipe = communication.map.QueueWrapperForMap(
-            communication.protocol.MapDataPipeQueueProtocolClient(req_queue, res_queue))
-        self.assertEqual(input_len, len(local_datapipe))
-
-        clean_me(process, req_queue, res_queue)
-
 
 class IntegrationTestDataLoaderDataPipe(TestCase):
     r"""
diff --git a/torch/utils/data/__init__.py b/torch/utils/data/__init__.py
index 6fe6147ddc545..bc054a947069f 100644
--- a/torch/utils/data/__init__.py
+++ b/torch/utils/data/__init__.py
@@ -39,8 +39,6 @@
     runtime_validation,
     runtime_validation_disabled,
 )
-from torch.utils.data.dataloader_experimental import DataLoader2
-from torch.utils.data import communication
 
 __all__ = ['BatchSampler',
            'ChainDataset',
@@ -48,7 +46,6 @@
            'DFIterDataPipe',
            'DataChunk',
            'DataLoader',
-           'DataLoader2',
            'Dataset',
            'DistributedSampler',
            'IterDataPipe',
@@ -63,8 +60,6 @@
            'WeightedRandomSampler',
            '_DatasetKind',
            'argument_validation',
-           'collate',
-           'communication',
            'default_collate',
            'default_convert',
            'functional_datapipe',
diff --git a/torch/utils/data/communication/__init__.py b/torch/utils/data/communication/__init__.py
deleted file mode 100644
index 1b9cae4011897..0000000000000
--- a/torch/utils/data/communication/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from . import eventloop
-from . import iter
-from . import map
-from . import messages
-from . import protocol
-from . import queue
diff --git a/torch/utils/data/communication/eventloop.py b/torch/utils/data/communication/eventloop.py
deleted file mode 100644
index 9bf241d334dfe..0000000000000
--- a/torch/utils/data/communication/eventloop.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import torch
-import threading
-import pickle
-
-from torch.utils.data import IterDataPipe, communication, MapDataPipe
-
-try:
-    import dill
-    # XXX: By default, dill writes the Pickler dispatch table to inject its
-    # own logic there. This globally affects the behavior of the standard library
-    # pickler for any user who transitively depends on this module!
-    # Undo this extension to avoid altering the behavior of the pickler globally.
-    dill.extend(use_dill=False)
-    HAS_DILL = True
-except ImportError:
-    HAS_DILL = False
-
-__all__ = [
-    "DataPipeToQueuesLoop",
-    "SpawnProcessForDataPipeline",
-    "SpawnThreadForDataPipeline",
-]
-
-def DataPipeToQueuesLoop(source_datapipe, req_queue, res_queue):
-    if isinstance(source_datapipe, IterDataPipe):
-        pipe_type = communication.iter
-        protocol_type = communication.protocol.IterDataPipeQueueProtocolServer
-    elif isinstance(source_datapipe, MapDataPipe):
-        pipe_type = communication.map  # type: ignore[misc]
-        protocol_type = communication.protocol.MapDataPipeQueueProtocolServer  # type: ignore[assignment]
-    else:
-        raise Exception('Only supports IterDataPipe or MapDataPipe, got', source_datapipe)
-
-    torch.set_num_threads(1)
-    for _ in pipe_type.DataPipeBehindQueues(source_datapipe, protocol_type(req_queue, res_queue),
-                                            blocking_request_get=True):
-        pass
-
-
-def SpawnProcessForDataPipeline(multiprocessing_ctx, datapipe):
-    req_queue = multiprocessing_ctx.Queue()
-    res_queue = multiprocessing_ctx.Queue()
-    process = multiprocessing_ctx.Process(
-        target=DataPipeToQueuesLoop, args=(datapipe, req_queue, res_queue))
-    return process, req_queue, res_queue
-
-
-def SpawnThreadForDataPipeline(datapipe):
-    r"""
-        Given a DataPipe, creates a copy of the DataPipe, starts a new Thread with DataPipeToQueuesLoop as target,
-        and return the process, req_queue, res_queue, thread_local_datapipe.
-    """
-    req_queue = communication.queue.ThreadingQueue()
-    res_queue = communication.queue.ThreadingQueue()
-
-    try:
-        new_datapipe = pickle.loads(pickle.dumps(datapipe))
-    except Exception as pe:
-        if HAS_DILL:
-            try:
-                new_datapipe = dill.loads(dill.dumps(datapipe))
-            except Exception as de:
-                raise Exception('Unable to dill DataPipe to make thread local copy', de)
-
-        else:
-            raise Exception('Unable to pickle DataPipe to make thread local copy (consider installing `dill`)', pe)
-
-    process = threading.Thread(target=DataPipeToQueuesLoop, args=(
-        new_datapipe, req_queue, res_queue), daemon=True)
-    return process, req_queue, res_queue, new_datapipe
diff --git a/torch/utils/data/communication/iter.py b/torch/utils/data/communication/iter.py
deleted file mode 100644
index 94f7cd2ec7035..0000000000000
--- a/torch/utils/data/communication/iter.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import time
-import types
-
-from torch.utils.data import IterDataPipe, communication
-
-DEFAULT_NON_BLOCKING_SLEEP = 0.001
-
-__all__ = [
-    "DataPipeBehindQueues",
-    "EnsureNonBlockingDataPipe",
-    "InvalidStateResetRequired",
-    "NonBlocking",
-    "NotAvailable",
-    "QueueWrapper",
-    "default_not_available_hook",
-]
-
-
-def default_not_available_hook():
-    time.sleep(DEFAULT_NON_BLOCKING_SLEEP)
-
-
-class NotAvailable(Exception):
-    pass
-
-
-class InvalidStateResetRequired(Exception):
-    """
-        Returned by DataPipe when it is expecting to get reset request,
-        for example RouterDataPipe expecting all workers to request reset'
-    """
-    pass
-
-
-class NonBlocking(IterDataPipe):
-    not_available_hook = default_not_available_hook
-
-    def __iter__(self):
-        self.reset_iterator()
-        return self
-
-    def __next__(self):
-        while True:
-            try:
-                return self.nonblocking_next()
-            except StopIteration:
-                raise StopIteration
-            except NotAvailable:
-                if NonBlocking.not_available_hook is not None:
-                    NonBlocking.not_available_hook()
-
-    def nonblocking_next(self):
-        raise NotImplementedError(
-            "nonblocking_next is not implemented for %s" % self.__class__)
-
-    def reset_iterator(self):
-        raise NotImplementedError(
-            "reset_iterator is not implemented for %s" % self.__class__)
-
-    @staticmethod
-    def register_not_available_hook(hook_function):
-        NonBlocking.not_available_hook = hook_function
-
-
-def EnsureNonBlockingDataPipe(validated_datapipe):
-    if not isinstance(validated_datapipe, IterDataPipe):
-        raise Exception('Not Iterable DataPipe ' +
-                        str(validated_datapipe.__class__))
-    if isinstance(validated_datapipe, NonBlocking):
-        return validated_datapipe
-    if not hasattr(validated_datapipe, '_as_iterator'):
-        validated_datapipe._as_iterator = None  # type: ignore[attr-defined]
-    if not hasattr(validated_datapipe, 'nonblocking_next'):
-        def nonblocking_next(self):
-            if self._as_iterator is None:
-                self._as_iterator = iter(self)
-            return next(self._as_iterator)
-        validated_datapipe.nonblocking_next = types.MethodType(  # type: ignore[attr-defined]
-            nonblocking_next, validated_datapipe)
-    if not hasattr(validated_datapipe, 'reset_iterator'):
-        def reset_iterator(self):
-            self._as_iterator = None
-        validated_datapipe.reset_iterator = types.MethodType(  # type: ignore[attr-defined]
-            reset_iterator, validated_datapipe)
-    return validated_datapipe
-
-
-def DataPipeBehindQueues(source_datapipe, protocol, full_stop=False, blocking_request_get=False):
-    """
-        Indefinitely iterates over req_queue and passing values from source_datapipe to res_queue
-        If raise_stop is true, raises exception when StopIteration received from the source_datapipe
-    """
-    if not isinstance(protocol, communication.protocol.IterDataPipeQueueProtocolServer):
-        raise Exception('Expecting IterDataPipeQueueProtocolServer, got', protocol)
-    source_datapipe = EnsureNonBlockingDataPipe(source_datapipe)
-    forever = True
-    while forever:
-        try:
-            # Non-blocking call is Extremely slow here for python.mp, need to figure out a good workaround
-            request = protocol.get_new_request(block=blocking_request_get)
-        except communication.protocol.EmptyQueue:
-            yield True
-            continue
-
-        if isinstance(request, communication.messages.ResetIteratorRequest):
-            source_datapipe.reset_iterator()
-            protocol.response_reset_iterator()
-
-        elif isinstance(request, communication.messages.TerminateRequest):
-            forever = False
-            protocol.response_terminate()
-
-        elif isinstance(request, communication.messages.GetNextRequest):
-            while forever:
-                try:
-                    value = source_datapipe.nonblocking_next()
-                except NotAvailable:
-                    yield True
-                    continue
-                except StopIteration:
-                    protocol.response_stop_iteration()
-                    if full_stop:
-                        forever = False
-                    else:
-                        yield True
-                    break
-                except InvalidStateResetRequired:
-                    protocol.response_invalid_state()
-                    if full_stop:
-                        forever = False
-                    else:
-                        yield True
-                    break
-                protocol.response_next(value)
-                yield True  # Returns control
-                break
-        else:
-            raise Exception('Unrecognized type of request received', request)
-
-
-class QueueWrapper(NonBlocking):
-    """
-        Creates iter.DataPipe which reads data from the DataLoader.Queue
-    """
-
-    def __init__(self, protocol, response_wait_time=0.00001):
-        if not isinstance(protocol, communication.protocol.IterDataPipeQueueProtocolClient):
-            raise Exception('Got', protocol)
-        self.protocol = protocol
-        self.counter = 0
-        self._stop_iteration = False
-        self._response_wait_time = response_wait_time
-
-    def reset_iterator(self):
-        self._stop_iteration = False
-        self.counter = 0
-        self.protocol.request_reset_iterator()
-        while True:
-            try:
-                self.protocol.get_response_reset_iterator()
-                break
-            except communication.protocol.EmptyQueue:
-                if NonBlocking.not_available_hook is not None:
-                    NonBlocking.not_available_hook()
-
-    def nonblocking_next(self):
-        if self._stop_iteration:
-            raise Exception(
-                '`next` or `nonblocking_next` called after receiving StopIteration')
-        if self.protocol.can_take_request():
-            self.protocol.request_next()
-        try:
-            response = self.protocol.get_response_next(block=True, timeout=self._response_wait_time)
-        except communication.protocol.EmptyQueue:
-            raise NotAvailable
-        if isinstance(response, communication.messages.StopIterationResponse):
-            self._stop_iteration = True
-            raise StopIteration
-        if isinstance(response, communication.messages.InvalidStateResponse):
-            raise NotAvailable
-        return response.value
diff --git a/torch/utils/data/communication/map.py b/torch/utils/data/communication/map.py
deleted file mode 100644
index 8af63bf0c73ec..0000000000000
--- a/torch/utils/data/communication/map.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import time
-import types
-
-from torch.utils.data import communication, MapDataPipe
-
-DEFAULT_NON_BLOCKING_SLEEP = 0.001
-
-__all__ = [
-    "DataPipeBehindQueues",
-    "EnsureNonBlockingMapDataPipe",
-    "NonBlockingMap",
-    "NotAvailable",
-    "QueueWrapperForMap",
-    "default_not_available_hook",
-]
-
-
-def default_not_available_hook():
-    time.sleep(DEFAULT_NON_BLOCKING_SLEEP)
-
-
-class NotAvailable(Exception):
-    pass
-
-
-class NonBlockingMap(MapDataPipe):
-    not_available_hook = default_not_available_hook
-
-    def __getitem__(self, index):
-        while True:
-            try:
-                return self.nonblocking_getitem(index)
-            except NotAvailable:
-                if NonBlockingMap.not_available_hook is not None:
-                    NonBlockingMap.not_available_hook()
-
-    def __len__(self):
-        try:
-            return self.nonblocking_len()
-        except NotAvailable:
-            if NonBlockingMap.not_available_hook is not None:
-                NonBlockingMap.not_available_hook()
-
-    def nonblocking_len(self):
-        raise NotImplementedError(
-            "nonblocking_len is not implemented for %s" % self.__class__)
-
-    def nonblocking_getitem(self, index):
-        raise NotImplementedError(
-            "nonblocking_getitem is not implemented for %s" % self.__class__)
-
-    @staticmethod
-    def register_not_available_hook(hook_function):
-        NonBlockingMap.not_available_hook = hook_function
-
-
-def EnsureNonBlockingMapDataPipe(validated_datapipe):
-    if not isinstance(validated_datapipe, MapDataPipe):
-        raise Exception(f'Not Map DataPipe - got {validated_datapipe.__class__}')
-    if isinstance(validated_datapipe, NonBlockingMap):
-        return validated_datapipe
-    if not hasattr(validated_datapipe, 'nonblocking_len'):
-        def nonblocking_len(self):
-            return self.__len__()
-        validated_datapipe.nonblocking_len = types.MethodType(  # type: ignore[attr-defined]
-            nonblocking_len, validated_datapipe)
-    if not hasattr(validated_datapipe, 'nonblocking_getitem'):
-        def nonblocking_getitem(self, index):
-            return self.__getitem__(index)
-        validated_datapipe.nonblocking_getitem = types.MethodType(  # type: ignore[attr-defined]
-            nonblocking_getitem, validated_datapipe)
-    return validated_datapipe
-
-
-def DataPipeBehindQueues(source_datapipe, protocol, full_stop=False, blocking_request_get=False):
-    """
-        Indefinitely iterates over req_queue and passing values from source_datapipe to res_queue
-        If raise_stop is true, raises exception when StopIteration received from the source_datapipe
-    """
-    if not isinstance(protocol, communication.protocol.MapDataPipeQueueProtocolServer):
-        raise Exception('Expecting MapDataPipeQueueProtocolServer, got', protocol)
-    source_datapipe = EnsureNonBlockingMapDataPipe(source_datapipe)
-    forever = True
-    while forever:
-        try:
-            # Non-blocking call is Extremely slow here for python.mp, need to figure out a good workaround
-            request = protocol.get_new_request(block=blocking_request_get)
-        except communication.protocol.EmptyQueue:
-            yield True
-            continue
-
-        if isinstance(request, communication.messages.TerminateRequest):
-            forever = False
-            protocol.response_terminate()
-
-        elif isinstance(request, communication.messages.LenRequest):
-            size = source_datapipe.nonblocking_len()
-            protocol.response_len(size)
-
-        elif isinstance(request, communication.messages.GetItemRequest):
-            while forever:
-                try:
-                    value = source_datapipe.nonblocking_getitem(request.key)
-                except NotAvailable:
-                    yield True
-                    continue
-                except IndexError as e:
-                    # Alternatively, we can just allow the underlying DataPipe to throw an exception?
-                    protocol.response_index_out_of_bound()
-                    if full_stop:
-                        forever = False
-                    else:
-                        yield True
-                    break
-                protocol.response_item(request.key, value)
-                yield True  # Returns control
-                break
-        else:
-            raise Exception('Unrecognized type of request received', request)
-
-
-class QueueWrapperForMap(NonBlockingMap):
-    """
-        Creates map.DataPipe which reads data from the DataLoader.Queue
-    """
-    def __init__(self, protocol, response_wait_time=0.00001):
-        if not isinstance(protocol, communication.protocol.MapDataPipeQueueProtocolClient):
-            raise Exception('Got', protocol)
-        self.protocol = protocol
-        self.counter = 0
-        self._stop_iteration = False
-        self._response_wait_time = response_wait_time
-
-    def nonblocking_getitem(self, index):
-        if self._stop_iteration:
-            raise Exception(
-                '`getitem` or `nonblocking_getitem` called after receiving StopIteration')
-        if self.protocol.can_take_request():
-            self.protocol.request_item(index)
-        try:
-            response = self.protocol.get_response_item(block=True, timeout=self._response_wait_time)
-        except communication.protocol.EmptyQueue:
-            raise NotAvailable
-        if isinstance(response, communication.messages.StopIterationResponse):
-            self._stop_iteration = True
-            raise IndexError(f"Index {index} is out of bound.")
-        return response.key, response.value
-
-    def nonblocking_len(self):
-        if self._stop_iteration:
-            raise Exception(
-                '`len` or `nonblocking_len` called after receiving StopIteration')
-        if self.protocol.can_take_request():
-            self.protocol.request_len()
-        try:
-            response = self.protocol.get_response_len(block=True, timeout=self._response_wait_time)
-        except communication.protocol.EmptyQueue:
-            raise NotAvailable
-        return response.len
diff --git a/torch/utils/data/communication/messages.py b/torch/utils/data/communication/messages.py
deleted file mode 100644
index 449cf23cfc01c..0000000000000
--- a/torch/utils/data/communication/messages.py
+++ /dev/null
@@ -1,75 +0,0 @@
-class DataLoaderQueueMessage(object):
-    pass
-
-
-class Request(DataLoaderQueueMessage):
-    pass
-
-
-class Response(DataLoaderQueueMessage):
-    pass
-
-
-class ResetIteratorRequest(Request):
-    pass
-
-
-class ResetIteratorResponse(Response):
-    pass
-
-
-class TerminateRequest(Request):
-    pass
-
-
-class TerminateResponse(Response):
-    pass
-
-
-class LenRequest(Request):
-    pass
-
-
-class LenResponse(Response):
-    __slots__ = ('len')
-
-    def __init__(self, len):
-        self.len = len
-
-
-class GetItemRequest(Request):
-    __slots__ = ('key')
-
-    def __init__(self, key):
-        self.key = key
-
-
-class GetItemResponse(Response):
-    __slots__ = ('key', 'value')
-
-    def __init__(self, key, value):
-        self.key = key
-        self.value = value
-
-
-class GetNextRequest(Request):
-    pass
-
-
-class GetNextResponse(Response):
-    __slots__ = ('value')
-
-    def __init__(self, value):
-        self.value = value
-
-
-class StopIterationResponse(Response):
-    pass
-
-
-class InvalidStateResponse(Response):
-    """
-        Returned by DataPipe when it is expecting to get reset request,
-        for example RouterDataPipe expecting all workers to request reset'
-    """
-    pass
diff --git a/torch/utils/data/communication/protocol.py b/torch/utils/data/communication/protocol.py
deleted file mode 100644
index 5bf5fe1af0626..0000000000000
--- a/torch/utils/data/communication/protocol.py
+++ /dev/null
@@ -1,205 +0,0 @@
-from torch.utils.data import communication
-
-
-class Protocol(object):
-    __slots__ = ('request_queue', 'response_queue')
-
-    def __init__(self, request_queue, response_queue):
-        self.request_queue = request_queue
-        self.response_queue = response_queue
-
-
-class ProtocolClient(Protocol):
-    """
-        ProtocolClient takes charge of putting requests into req_queue and returning results from res_queue.
-    """
-    _req_sent = None
-
-    def __init__(self, request_queue, response_queue):
-        self.request_queue = request_queue
-        self.response_queue = response_queue
-        self._req_sent = None
-
-    def can_take_request(self):
-        return self._req_sent is None
-
-    def waiting_for_response(self):
-        return self._req_sent is not None
-
-    def request_sent(self, request=True):
-        if not self.can_take_request():
-            raise Exception('Protocol only supports one request in the Queue')
-        self._req_sent = request
-
-    def request_served(self, result=None):
-        if not self.waiting_for_response():
-            raise Exception(
-                'Expected no peding requests, but something got served', result)
-        self._req_sent = None
-
-
-class ProtocolServer(Protocol):
-    """
-        ProtocolServer takes charge of getting requests from req_queue and fetching data from source datapipe.
-    """
-    _req_received = None
-
-    def __init__(self, request_queue, response_queue):
-        self.request_queue = request_queue
-        self.response_queue = response_queue
-        self._req_received = None
-
-    def have_pending_request(self):
-        return self._req_received is not None
-
-    def get_new_request(self, block=False):
-        if self.have_pending_request():
-            raise Exception(
-                'Trying to get next request, while having one unserved')
-        try:
-            response = self.request_queue.get(block=block)
-        except Exception as e:  # TODO: Catch only timeout exceptions
-            raise EmptyQueue('queue is empty')
-        self._req_received = response
-        return response
-        # TODO: Validate supported requests
-
-    def response_terminate(self):
-        if not self.have_pending_request():
-            raise Exception("Attempting to reply with pending request")
-        if not isinstance(self._req_received, communication.messages.TerminateRequest):
-            raise Exception(
-                "Replaying with terminate status to other type of message")
-        self.response_queue.put(communication.messages.TerminateResponse())
-        self._req_received = None
-
-
-class MapDataPipeQueueProtocolServer(ProtocolServer):
-    def response_item(self, key, value):
-        if not self.have_pending_request():
-            raise Exception("Attempting to reply with pending request")
-        self.response_queue.put(communication.messages.GetItemResponse(key, value))
-        self._req_received = None
-
-    def response_len(self, size):
-        if not self.have_pending_request():
-            raise Exception("Attempting to reply with pending request")
-        self.response_queue.put(communication.messages.LenResponse(size))
-        self._req_received = None
-
-    def response_index_out_of_bound(self):
-        if not self.have_pending_request():
-            raise Exception("Attempting to reply with pending request")
-        self.response_queue.put(communication.messages.StopIterationResponse())
-        self._req_received = None
-
-class MapDataPipeQueueProtocolClient(ProtocolClient):
-    def request_len(self):
-        if not self.can_take_request():
-            raise Exception('Can not request len while we are still waiting response for previous request')
-        request = communication.messages.LenRequest()
-        self.request_queue.put(request)
-        self.request_sent(request)
-
-    def request_item(self, index):
-        if not self.can_take_request():
-            raise Exception('Can not request item while we are still waiting response for previous request')
-        request = communication.messages.GetItemRequest(index)
-        self.request_queue.put(request)
-        self.request_sent(request)
-
-    def get_response_len(self, block=False, timeout=None):
-        if not self.waiting_for_response():
-            raise Exception('Can not expect any response without submitted request')
-        try:
-            response = self.response_queue.get(block=block, timeout=timeout)
-        except TimeoutError:
-            raise EmptyQueue('queue is empty')
-        self.request_served(response)
-        if not isinstance(response, communication.messages.LenResponse):
-            raise Exception('Invalid response received')
-        return response
-
-    def get_response_item(self, block=False, timeout=None):
-        if not self.waiting_for_response():
-            raise Exception('Can not expect any response without submitted request')
-        try:
-            response = self.response_queue.get(block=block, timeout=timeout)
-        except TimeoutError:
-            raise EmptyQueue('queue is empty')
-        self.request_served(response)
-        # if not isinstance(response, communication.messages.GetItemResponse):
-        #     raise Exception('Invalid response received')
-        return response
-
-
-class EmptyQueue(Exception):
-    pass
-
-
-class IterDataPipeQueueProtocolServer(ProtocolServer):
-    def response_reset_iterator(self):
-        if not self.have_pending_request():
-            raise Exception("Attempting to reply with pending request")
-        if not isinstance(self._req_received, communication.messages.ResetIteratorRequest):
-            raise Exception(
-                "Replaying with reset status to other type of message")
-        self.response_queue.put(communication.messages.ResetIteratorResponse())
-        self._req_received = None
-
-    def response_next(self, value):
-        if not self.have_pending_request():
-            raise Exception("Attempting to reply with pending request")
-        self.response_queue.put(communication.messages.GetNextResponse(value))
-        self._req_received = None
-
-    def response_stop_iteration(self):
-        if not self.have_pending_request():
-            raise Exception("Attempting to reply with pending request")
-        self.response_queue.put(communication.messages.StopIterationResponse())
-        self._req_received = None
-
-    def response_invalid_state(self):
-        if not self.have_pending_request():
-            raise Exception("Attempting to reply with pending request")
-        self.response_queue.put(communication.messages.InvalidStateResponse())
-        self._req_received = None
-
-
-class IterDataPipeQueueProtocolClient(ProtocolClient):
-    def request_reset_iterator(self):
-        if not self.can_take_request():
-            raise Exception('Can not reset while we are still waiting response for previous request')
-        request = communication.messages.ResetIteratorRequest()
-        self.request_queue.put(request)
-        self.request_sent(request)
-
-    def request_next(self):
-        if not self.can_take_request():
-            raise Exception('Can not request next item while we are still waiting response for previous request')
-        request = communication.messages.GetNextRequest()
-        self.request_queue.put(request)
-        self.request_sent(request)
-
-    def get_response_reset_iterator(self, block=False):
-        try:
-            response = self.response_queue.get(block=block)
-        except Exception as e:  # TODO: Catch only timeout exceptions
-            raise EmptyQueue('queue is empty')
-        self.request_served(response)
-
-        if not isinstance(response, communication.messages.ResetIteratorResponse):
-            raise Exception('Invalid response received')
-
-    def get_response_next(self, block=False, timeout=None):
-        if not self.waiting_for_response():
-            raise Exception(
-                'Can not expect any response without submitted request')
-        try:
-            response = self.response_queue.get(block=block, timeout=timeout)
-        except Exception as e:  # TODO: Catch only timeout exceptions
-            raise EmptyQueue('queue is empty')
-        self.request_served(response)
-
-        # TODO(VitalyFedyunin): Add possible response types validation here
-        return response
diff --git a/torch/utils/data/communication/queue.py b/torch/utils/data/communication/queue.py
deleted file mode 100644
index 85c33d4799cd8..0000000000000
--- a/torch/utils/data/communication/queue.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import threading
-import time
-
-
-class LocalQueue():
-    ops = 0
-    stored = 0
-    uid = 0
-    empty = 0
-
-    def __init__(self, name='unnamed'):
-        self.items = []
-        self.name = name
-        self.uid = LocalQueue.uid
-        LocalQueue.uid += 1
-
-    def put(self, item, block=True):
-        LocalQueue.ops += 1
-        LocalQueue.stored += 1
-        self.items.append(item)
-
-    def get(self, block=True, timeout=0):
-        # TODO(VitalyFedyunin): Add support of block and timeout arguments
-        LocalQueue.ops += 1
-        if not len(self.items):
-            LocalQueue.empty += 1
-            raise Exception('LocalQueue is empty')
-        LocalQueue.stored -= 1
-        return self.items.pop()
-
-
-class ThreadingQueue():
-    def __init__(self, name='unnamed'):
-        self.lock = threading.Lock()
-        self.items = []
-        self.name = name
-
-    def put(self, item, block=True):
-        with self.lock:
-            self.items.append(item)
-
-    def get(self, block=True, timeout=0):
-        # TODO(VitalyFedyunin): Add support of block and timeout arguments
-        while True:
-            with self.lock:
-                if len(self.items) > 0:
-                    return self.items.pop()
-            if not block:
-                raise Exception("Not available")
-            # TODO(VitalyFedyunin): Figure out what to do if nothing in the queue
-            time.sleep(0.000001)
diff --git a/torch/utils/data/dataloader_experimental.py b/torch/utils/data/dataloader_experimental.py
deleted file mode 100644
index 8a8d536b79857..0000000000000
--- a/torch/utils/data/dataloader_experimental.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import time
-
-from typing import Any, List
-
-import torch.utils.data.backward_compatibility
-
-import torch.utils.data.graph_settings
-from torch.utils.data import DataLoader, IterDataPipe, communication
-from torch.utils.data.datapipes.iter import IterableWrapper
-
-__all__ = [
-    "DataLoader2",
-]
-
-
-class _ThreadingDataLoader2:
-
-    def __init__(self, datapipe, num_workers=0, collate_fn=None):
-        self.threads = []
-        self.datapipes = []
-        self.collate_fn = collate_fn
-        for worker_id in range(num_workers):
-            (thread, req_queue, res_queue, thread_localdatapipe) = communication.eventloop.SpawnThreadForDataPipeline(datapipe)
-            torch.utils.data.graph_settings.apply_sharding(thread_localdatapipe, num_workers, worker_id)
-            thread.start()
-            self.threads.append((thread, req_queue, res_queue))  # These queues are independent
-            local_datapipe = communication.iter.QueueWrapper(
-                communication.protocol.IterDataPipeQueueProtocolClient(req_queue, res_queue))
-            self.datapipes.append(local_datapipe)
-
-    def __iter__(self):
-        not_available = False
-        forever = True
-        exclude_datapipes: List[Any] = []
-        while len(exclude_datapipes) < len(self.datapipes):
-            for dp in self.datapipes:
-                if dp not in exclude_datapipes:
-                    try:
-                        value = dp.nonblocking_next()
-                        yield value
-                    except StopIteration:
-                        exclude_datapipes.append(dp)
-                    except communication.iter.NotAvailable:
-                        not_available = True
-            if not_available:
-                time.sleep(0.001)
-
-    def __del__(self):
-        self._cleanup_all_threads()
-
-    def _cleanup_all_threads(self):
-        def clean_me(thread, req_queue, res_queue):
-            req_queue.put(communication.messages.TerminateRequest())
-            _ = res_queue.get()
-            thread.join()
-
-        for thread, req_queue, res_queue in self.threads:
-            clean_me(thread, req_queue, res_queue)
-
-class DataLoader2:
-    def __new__(cls,
-                dataset,
-                batch_size=1,
-                shuffle=None,
-                sampler=None,
-                batch_sampler=None,
-                num_workers=0,
-                collate_fn=None,
-                pin_memory=False,
-                drop_last=False,
-                timeout=0,
-                worker_init_fn=None,
-                *,
-                prefetch_factor=2,
-                persistent_workers=False,
-                batch_outside_worker=False,
-                parallelism_mode='mp'):
-        if isinstance(dataset, IterDataPipe):
-            data_loader: Any = None
-            if batch_sampler is not None:
-                raise Exception(
-                    'batch_sampler is not yet supported by DataPipes')
-            if sampler is not None:
-                raise Exception(
-                    'sampler is not yet supported by DataPipes')
-            datapipe = dataset
-            datapipe = torch.utils.data.graph_settings.apply_shuffle_settings(datapipe, shuffle=shuffle)  # type: ignore[assignment]
-            if batch_outside_worker and pin_memory:
-                raise Exception(
-                    'pin_memory is not yet compatible with batch_outside_worker')
-            if not batch_outside_worker:
-                if batch_size is not None:
-                    datapipe = datapipe.batch(batch_size, drop_last=drop_last)
-                    if collate_fn is None:
-                        collate_fn = torch.utils.data._utils.collate.default_collate
-
-                # Note: It is safe to pass shuffle=True to the old DataLoader, as shuffle does nothing
-                # for Iterable, but required to set Pipes correctly.
-                data_loader = DataLoader(datapipe,
-                                         batch_size=None,  # Replaced by .batch DataPipe
-                                         shuffle=shuffle,
-                                         sampler=None,
-                                         batch_sampler=None,
-                                         num_workers=num_workers,
-                                         collate_fn=collate_fn,
-                                         pin_memory=pin_memory,
-                                         drop_last=False,  # Replaced by .batch DataPipe
-                                         timeout=timeout,
-                                         worker_init_fn=worker_init_fn,
-                                         prefetch_factor=prefetch_factor,
-                                         persistent_workers=persistent_workers)
-            elif parallelism_mode == 'thread':
-                if collate_fn is not None and not batch_outside_worker:
-                    datapipe = datapipe.map(collate_fn)
-                if pin_memory:
-                    raise Exception(
-                        'pin_memory is not yet supported by DataPipes with Threading')
-                if worker_init_fn is not None:
-                    raise Exception(
-                        'worker_init_fn is not yet supported by DataPipes with Threading')
-                data_loader = _ThreadingDataLoader2(datapipe,
-                                                    num_workers=num_workers,
-                                                    collate_fn=collate_fn)
-            else:
-                raise Exception('Unsupported parallelism mode', parallelism_mode)
-            if not batch_outside_worker:
-                return data_loader
-            else:
-                if collate_fn is None:
-                    collate_fn = torch.utils.data._utils.collate.default_collate
-                datapipe = IterableWrapper(data_loader).batch(
-                    batch_size, drop_last=drop_last).map(collate_fn)
-                return datapipe
-        else:
-            if parallelism_mode == 'thread':
-                raise Exception(
-                    'thread parallelism mode is not supported for old DataSets')
-            return DataLoader(dataset,
-                              batch_size=batch_size,
-                              shuffle=shuffle,
-                              sampler=sampler,
-                              batch_sampler=batch_sampler,
-                              num_workers=num_workers,
-                              collate_fn=collate_fn,
-                              pin_memory=pin_memory,
-                              drop_last=drop_last,
-                              timeout=timeout,
-                              worker_init_fn=worker_init_fn,
-                              prefetch_factor=prefetch_factor,
-                              persistent_workers=persistent_workers)

From bced4cf0d9e35eddf2cc7593858b013ef8ce26ae Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 11 Nov 2022 22:31:32 +0000
Subject: [PATCH 0814/1922] [Dynamo] Fix str(Guard.obj_weakref) bug to
 re-ennable support overriding __getattr__ (#88564)

See my inline comments!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88564
Approved by: https://github.com/ezyang, https://github.com/anijain2305
---
 test/dynamo/test_misc.py |  2 --
 torch/_dynamo/guards.py  | 27 ++++++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 4df7153b8fb2b..a8bf86e46411b 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -579,8 +579,6 @@ def fn(count):
         self.assertEqual(cnts.frame_count, 0)
         self.assertEqual(cnts.op_count, 0)
 
-    # KeyError: '__name__'
-    @patch.object(torch._dynamo.config, "suppress_errors", True)
     def test_user_getattr1(self):
         class MyConfig(dict):
             def __getattr__(self, name):
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index 9edd6f60560df..382734412b2ba 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -101,13 +101,38 @@ def sort_key(self):
     def __lt__(self, other):
         return self.sort_key() < other.sort_key()
 
+    @staticmethod
+    def weakref_to_str(obj_weakref):
+        """
+        This is a workaround of a Python weakref bug.
+
+        `obj_weakref` is instance returned by `weakref.ref`,
+        `str(obj_weakref)` is buggy if the original obj overrides __getattr__, e.g:
+
+            class MyConfig(dict):
+                def __getattr__(self, x):
+                    return self[x]
+
+            obj = MyConfig(offset=5)
+            obj_weakref = weakref.ref(obj)
+            str(obj_weakref)  # raise error: KeyError: '__name__'
+        """
+        if isinstance(obj_weakref, weakref.ReferenceType):
+            obj = obj_weakref()
+            if obj is not None:
+                return f"<weakref at {hex(id(obj_weakref))}; to '{obj.__class__.__name__}' at {hex(id(obj))}>"
+            else:
+                return f"<weakref at {hex(id(obj_weakref))}; dead>"
+        else:
+            return str(obj_weakref)
+
     def __str__(self):
         s = f"""
             {self.source.name.lower()} {repr(self.name)} {self.create_fn.__name__}
             {{
                 'guard_types': {self.guard_types},
                 'code': {self.code_list},
-                'obj_weakref': {self.obj_weakref}
+                'obj_weakref': {self.weakref_to_str(self.obj_weakref)}
                 'guarded_class': {self.guarded_class_weakref}
             }}
             """

From a05297293e34056da61e405b0fc7114525ee3f95 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Fri, 11 Nov 2022 14:41:35 +0000
Subject: [PATCH 0815/1922] copy_: Short-circuit when self and src view the
 same data (#88884)

This comes up if you use inplace operators on a slice, e.g.
```python
import torch
a = torch.rand(1000000, device="cuda")
a[::2] *= 2
```

The last line looks as if it should be fully inplace, but is actually
equivalent to:

```python
tmp = a[::2]
tmp *= 2
a[::2] = tmp
```

Which results in `mul_` and `copy_` being called. With this PR, the
redundant copy becomes a no-op and the above example is 2x faster.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88884
Approved by: https://github.com/ngimel
---
 aten/src/ATen/native/Copy.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
index a44f39c5bb2eb..c6b82426d3bf6 100644
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@@ -220,6 +220,18 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
     return at::metal::metal_copy_(self, src);
   }
 
+  // Exit early if self and src are views of the same data
+  const bool is_same_data = (
+      self.is_alias_of(src) &&
+      self.storage_offset() == src.storage_offset() &&
+      self.strides().equals(src.strides()) &&
+      self.sizes().equals(src.sizes()) &&
+      self.scalar_type() == src.scalar_type()
+    );
+  if (is_same_data) {
+    return self;
+  }
+
 
   auto iter = TensorIteratorConfig()
     .add_output(self)

From d60f5cb3ad19ffe2a4d6bef7b01394a2595d44ed Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Wed, 9 Nov 2022 12:20:16 -0800
Subject: [PATCH 0816/1922] [functorch] delete some unused files (#88763)

Some post-merge cleanup.
- packaging/ was for building standalone windows binaries
- our flake8 config got superceded by PyTorch's.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88763
Approved by: https://github.com/samdow
---
 functorch/.flake8                             |  20 -
 functorch/packaging/build_wheel.sh            |  19 -
 functorch/packaging/pkg_helpers.bash          | 414 ------------------
 .../windows/internal/cuda_install.bat         | 264 -----------
 .../windows/internal/driver_update.bat        |  25 --
 .../windows/internal/vc_env_helper.bat        |  43 --
 .../windows/internal/vc_install_helper.sh     |  16 -
 7 files changed, 801 deletions(-)
 delete mode 100644 functorch/.flake8
 delete mode 100644 functorch/packaging/build_wheel.sh
 delete mode 100644 functorch/packaging/pkg_helpers.bash
 delete mode 100644 functorch/packaging/windows/internal/cuda_install.bat
 delete mode 100644 functorch/packaging/windows/internal/driver_update.bat
 delete mode 100644 functorch/packaging/windows/internal/vc_env_helper.bat
 delete mode 100644 functorch/packaging/windows/internal/vc_install_helper.sh

diff --git a/functorch/.flake8 b/functorch/.flake8
deleted file mode 100644
index a6d73773e3b55..0000000000000
--- a/functorch/.flake8
+++ /dev/null
@@ -1,20 +0,0 @@
-[flake8]
-select = B,C,E,F,P,T4,W,B9
-max-line-length = 120
-# C408 ignored because we like the dict keyword argument syntax
-# E501 is not flexible enough, we're using B950 instead
-ignore =
-    E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
-    # shebang has extra meaning in fbcode lints, so I think it's not worth trying
-    # to line this up with executable bit
-    EXE001,
-    # these ignores are from flake8-bugbear; please fix!
-    B007,B008,
-    # these ignores are from flake8-comprehensions; please fix!
-    C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
-exclude =
-    ./.git,
-    ./benchmarks,
-    ./docs,
-    ./examples,
-    ./notebooks
diff --git a/functorch/packaging/build_wheel.sh b/functorch/packaging/build_wheel.sh
deleted file mode 100644
index 074e7dde77141..0000000000000
--- a/functorch/packaging/build_wheel.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-set -ex
-
-script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-. "$script_dir/pkg_helpers.bash"
-
-export BUILD_TYPE=wheel
-setup_env 0.2.0
-setup_wheel_python
-pip_install numpy pyyaml future ninja
-pip_install --upgrade setuptools
-setup_pip_pytorch_version
-python setup.py clean
-
-if [[ "$OSTYPE" == "msys" ]]; then
-    "$script_dir/windows/internal/vc_env_helper.bat" python setup.py bdist_wheel
-else
-    python setup.py bdist_wheel
-fi
diff --git a/functorch/packaging/pkg_helpers.bash b/functorch/packaging/pkg_helpers.bash
deleted file mode 100644
index 329891a07216c..0000000000000
--- a/functorch/packaging/pkg_helpers.bash
+++ /dev/null
@@ -1,414 +0,0 @@
-# A set of useful bash functions for common functionality we need to do in
-# many build scripts
-
-
-# Setup CUDA environment variables, based on CU_VERSION
-#
-# Inputs:
-#   CU_VERSION (cpu, cu92, cu100)
-#   NO_CUDA_PACKAGE (bool)
-#   BUILD_TYPE (conda, wheel)
-#
-# Outputs:
-#   VERSION_SUFFIX (e.g., "")
-#   PYTORCH_VERSION_SUFFIX (e.g., +cpu)
-#   WHEEL_DIR (e.g., cu100/)
-#   CUDA_HOME (e.g., /usr/local/cuda-9.2, respected by torch.utils.cpp_extension)
-#   FORCE_CUDA (respected by torchvision setup.py)
-#   NVCC_FLAGS (respected by torchvision setup.py)
-#
-# Precondition: CUDA versions are installed in their conventional locations in
-# /usr/local/cuda-*
-#
-# NOTE: Why VERSION_SUFFIX versus PYTORCH_VERSION_SUFFIX?  If you're building
-# a package with CUDA on a platform we support CUDA on, VERSION_SUFFIX ==
-# PYTORCH_VERSION_SUFFIX and everyone is happy.  However, if you are building a
-# package with only CPU bits (e.g., torchaudio), then VERSION_SUFFIX is always
-# empty, but PYTORCH_VERSION_SUFFIX is +cpu (because that's how you get a CPU
-# version of a Python package.  But that doesn't apply if you're on OS X,
-# since the default CU_VERSION on OS X is cpu.
-setup_cuda() {
-
-  # First, compute version suffixes.  By default, assume no version suffixes
-  export VERSION_SUFFIX=""
-  export PYTORCH_VERSION_SUFFIX=""
-  export WHEEL_DIR=""
-  # Wheel builds need suffixes (but not if they're on OS X, which never has suffix)
-  if [[ "$BUILD_TYPE" == "wheel" ]] && [[ "$(uname)" != Darwin ]]; then
-    export PYTORCH_VERSION_SUFFIX="+$CU_VERSION"
-    # Match the suffix scheme of pytorch, unless this package does not have
-    # CUDA builds (in which case, use default)
-    if [[ -z "$NO_CUDA_PACKAGE" ]]; then
-      export VERSION_SUFFIX="$PYTORCH_VERSION_SUFFIX"
-      export WHEEL_DIR="$CU_VERSION/"
-    fi
-  fi
-
-  # Now work out the CUDA settings
-  case "$CU_VERSION" in
-    cu115)
-      if [[ "$OSTYPE" == "msys" ]]; then
-        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.5"
-      else
-        export CUDA_HOME=/usr/local/cuda-11.5/
-      fi
-      export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
-      ;;
-    cu113)
-      if [[ "$OSTYPE" == "msys" ]]; then
-        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.3"
-      else
-        export CUDA_HOME=/usr/local/cuda-11.3/
-      fi
-      export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
-      ;;
-    cu112)
-      if [[ "$OSTYPE" == "msys" ]]; then
-        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.2"
-      else
-        export CUDA_HOME=/usr/local/cuda-11.2/
-      fi
-      export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
-      ;;
-    cu111)
-      if [[ "$OSTYPE" == "msys" ]]; then
-        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1"
-      else
-        export CUDA_HOME=/usr/local/cuda-11.1/
-      fi
-      export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6"
-      ;;
-    cu110)
-      if [[ "$OSTYPE" == "msys" ]]; then
-        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.0"
-      else
-        export CUDA_HOME=/usr/local/cuda-11.0/
-      fi
-      export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0"
-      ;;
-    cu102)
-      if [[ "$OSTYPE" == "msys" ]]; then
-        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2"
-      else
-        export CUDA_HOME=/usr/local/cuda-10.2/
-      fi
-      export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5"
-      ;;
-    cu101)
-      if [[ "$OSTYPE" == "msys" ]]; then
-        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.1"
-      else
-        export CUDA_HOME=/usr/local/cuda-10.1/
-      fi
-      export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5"
-      ;;
-    cu100)
-      if [[ "$OSTYPE" == "msys" ]]; then
-        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.0"
-      else
-        export CUDA_HOME=/usr/local/cuda-10.0/
-      fi
-      export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5"
-      ;;
-    cu92)
-      if [[ "$OSTYPE" == "msys" ]]; then
-        export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.2"
-      else
-        export CUDA_HOME=/usr/local/cuda-9.2/
-      fi
-      export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0"
-      ;;
-    cpu)
-      ;;
-    rocm*)
-      export FORCE_CUDA=1
-      ;;
-    *)
-      echo "Unrecognized CU_VERSION=$CU_VERSION"
-      exit 1
-      ;;
-  esac
-  if [[ -n "$CUDA_HOME" ]]; then
-    # Adds nvcc binary to the search path so that CMake's `find_package(CUDA)` will pick the right one
-    export PATH="$CUDA_HOME/bin:$PATH"
-    export FORCE_CUDA=1
-  fi
-}
-
-# Populate build version if necessary, and add version suffix
-#
-# Inputs:
-#   BUILD_VERSION (e.g., 0.2.0 or empty)
-#   VERSION_SUFFIX (e.g., +cpu)
-#
-# Outputs:
-#   BUILD_VERSION (e.g., 0.2.0.dev20190807+cpu)
-#
-# Fill BUILD_VERSION if it doesn't exist already with a nightly string
-# Usage: setup_build_version 0.2.0
-setup_build_version() {
-  if [[ -z "$BUILD_VERSION" ]]; then
-    export BUILD_VERSION="$1.dev$(date "+%Y%m%d")$VERSION_SUFFIX"
-  else
-    export BUILD_VERSION="$BUILD_VERSION$VERSION_SUFFIX"
-  fi
-
-  # Set build version based on tag if on tag
-  if [[ -n "${CIRCLE_TAG}" ]]; then
-    # Strip tag
-    export BUILD_VERSION="$(echo "${CIRCLE_TAG}" | sed -e 's/^v//' -e 's/-.*$//')${VERSION_SUFFIX}"
-  fi
-}
-
-# Set some useful variables for OS X, if applicable
-setup_macos() {
-  if [[ "$(uname)" == Darwin ]]; then
-    export MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++
-  fi
-}
-
-
-# Top-level entry point for things every package will need to do
-#
-# Usage: setup_env 0.2.0
-setup_env() {
-  setup_cuda
-  setup_build_version "$1"
-  setup_macos
-}
-
-# Function to retry functions that sometimes timeout or have flaky failures
-retry () {
-    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
-}
-
-# Inputs:
-#   PYTHON_VERSION (3.7, 3.8, 3.9)
-#   UNICODE_ABI (bool)
-#
-# Outputs:
-#   PATH modified to put correct Python version in PATH
-#
-# Precondition: If Linux, you are in a soumith/manylinux-cuda* Docker image
-setup_wheel_python() {
-  if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then
-    eval "$(conda shell.bash hook)"
-    conda env remove -n "env$PYTHON_VERSION" || true
-    conda create ${CONDA_CHANNEL_FLAGS} -yn "env$PYTHON_VERSION" python="$PYTHON_VERSION"
-    conda activate "env$PYTHON_VERSION"
-    # Install libpng from Anaconda (defaults)
-    conda install ${CONDA_CHANNEL_FLAGS} libpng "jpeg<=9b" -y
-  else
-    # Install native CentOS libJPEG, freetype and GnuTLS
-    yum install -y libjpeg-turbo-devel freetype gnutls
-    case "$PYTHON_VERSION" in
-      3.7) python_abi=cp37-cp37m ;;
-      3.8) python_abi=cp38-cp38 ;;
-      3.9) python_abi=cp39-cp39 ;;
-      3.10) python_abi=cp310-cp310 ;;
-      *)
-        echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION"
-        exit 1
-        ;;
-    esac
-    # Download all the dependencies required to compile image and video_reader
-    # extensions
-
-    mkdir -p ext_libraries
-    pushd ext_libraries
-    popd
-    export PATH="/opt/python/$python_abi/bin:$(pwd)/ext_libraries/bin:$PATH"
-  fi
-}
-
-# Install with pip a bit more robustly than the default
-pip_install() {
-  retry pip install --progress-bar off "$@"
-}
-
-# Install torch with pip, respecting PYTORCH_VERSION, and record the installed
-# version into PYTORCH_VERSION, if applicable
-setup_pip_pytorch_version() {
-  if [[ -z "$PYTORCH_VERSION" ]]; then
-    # Install latest prerelease version of torch, per our nightlies, consistent
-    # with the requested cuda version
-    pip_install --pre torch -f "https://download.pytorch.org/whl/nightly/${WHEEL_DIR}torch_nightly.html"
-    if [[ "$CUDA_VERSION" == "cpu" ]]; then
-      # CUDA and CPU are ABI compatible on the CPU-only parts, so strip
-      # in this case
-      export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version:  *//' | sed 's/+.\+//')"
-    else
-      export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version:  *//')"
-    fi
-  else
-    pip_install "torch==$PYTORCH_VERSION$PYTORCH_VERSION_SUFFIX" \
-      -f "https://download.pytorch.org/whl/${CU_VERSION}/torch_stable.html" \
-      -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${CU_VERSION}/torch_${UPLOAD_CHANNEL}.html"
-  fi
-}
-
-# Fill PYTORCH_VERSION with the latest conda nightly version, and
-# CONDA_CHANNEL_FLAGS with appropriate flags to retrieve these versions
-#
-# You MUST have populated PYTORCH_VERSION_SUFFIX before hand.
-setup_conda_pytorch_constraint() {
-  if [[ -z "$PYTORCH_VERSION" ]]; then
-    export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch-nightly -c pytorch"
-    export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \
-                              python -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \
-                               cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
-                               cuver_2 = (cuver[:-1] + '.' + cuver[-1]).replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
-                               print(re.sub(r'\\+.*$', '', \
-                                [x['version'] for x in json.load(sys.stdin)['pytorch'] \
-                                  if (x['platform'] == 'darwin' or cuver_1 in x['fn'] or cuver_2 in x['fn']) \
-                                    and 'py' + os.environ['PYTHON_VERSION'] in x['fn']][-1]))")"
-    if [[ -z "$PYTORCH_VERSION" ]]; then
-      echo "PyTorch version auto detection failed"
-      echo "No package found for CU_VERSION=$CU_VERSION and PYTHON_VERSION=$PYTHON_VERSION"
-      exit 1
-    fi
-  else
-    export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch -c pytorch-${UPLOAD_CHANNEL}"
-  fi
-  if [[ "$CU_VERSION" == cpu ]]; then
-    export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}"
-    export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION"
-  else
-    export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}"
-    export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}"
-  fi
-  if [[ "$OSTYPE" == msys && "$CU_VERSION" == cu92 ]]; then
-    export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c defaults -c numba/label/dev"
-  fi
-}
-
-# Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT
-setup_conda_cudatoolkit_constraint() {
-  export CONDA_BUILD_VARIANT="cuda"
-  if [[ "$(uname)" == Darwin ]]; then
-    export CONDA_BUILD_VARIANT="cpu"
-  else
-    case "$CU_VERSION" in
-      cu115)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.5,<11.6 # [not osx]"
-        ;;
-      cu113)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.3,<11.4 # [not osx]"
-        ;;
-      cu112)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.2,<11.3 # [not osx]"
-        ;;
-      cu111)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.1,<11.2 # [not osx]"
-        ;;
-      cu110)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.0,<11.1 # [not osx]"
-        ;;
-      cu102)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.2,<10.3 # [not osx]"
-        ;;
-      cu101)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.1,<10.2 # [not osx]"
-        ;;
-      cu100)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.0,<10.1 # [not osx]"
-        ;;
-      cu92)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=9.2,<9.3 # [not osx]"
-        ;;
-      cpu)
-        export CONDA_CUDATOOLKIT_CONSTRAINT=""
-        export CONDA_BUILD_VARIANT="cpu"
-        ;;
-      *)
-        echo "Unrecognized CU_VERSION=$CU_VERSION"
-        exit 1
-        ;;
-    esac
-  fi
-}
-
-setup_conda_cudatoolkit_plain_constraint() {
-  export CONDA_BUILD_VARIANT="cuda"
-  export CMAKE_USE_CUDA=1
-  if [[ "$(uname)" == Darwin ]]; then
-    export CONDA_BUILD_VARIANT="cpu"
-    export CMAKE_USE_CUDA=0
-  else
-    case "$CU_VERSION" in
-      cu115)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=11.5"
-        ;;
-      cu113)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=11.3"
-        ;;
-      cu112)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=11.2"
-        ;;
-      cu111)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=11.1"
-        ;;
-      cu102)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=10.2"
-        ;;
-      cu101)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=10.1"
-        ;;
-      cu100)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=10.0"
-        ;;
-      cu92)
-        export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=9.2"
-        ;;
-      cpu)
-        export CONDA_CUDATOOLKIT_CONSTRAINT=""
-        export CONDA_BUILD_VARIANT="cpu"
-        export CMAKE_USE_CUDA=0
-        ;;
-      *)
-        echo "Unrecognized CU_VERSION=$CU_VERSION"
-        exit 1
-        ;;
-    esac
-  fi
-}
-
-# Build the proper compiler package before building the final package
-setup_visual_studio_constraint() {
-  if [[ "$OSTYPE" == "msys" ]]; then
-      export VSTOOLCHAIN_PACKAGE=vs$VC_YEAR
-      conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload packaging/$VSTOOLCHAIN_PACKAGE
-      cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/torchvision/conda_build_config.yaml
-  fi
-}
-
-setup_junit_results_folder() {
-  if [[ "$CI" == "true" ]]; then
-    export CONDA_PYTORCH_BUILD_RESULTS_DIRECTORY="${SOURCE_ROOT_DIR}/build_results/results.xml"
-  fi
-}
-
-
-download_copy_ffmpeg() {
-  if [[ "$OSTYPE" == "msys" ]]; then
-    # conda install -yq ffmpeg=4.2 -c pytorch
-    # curl -L -q https://anaconda.org/pytorch/ffmpeg/4.3/download/win-64/ffmpeg-4.3-ha925a31_0.tar.bz2 --output ffmpeg-4.3-ha925a31_0.tar.bz2
-    # bzip2 --decompress --stdout ffmpeg-4.3-ha925a31_0.tar.bz2 | tar -x --file=-
-    # cp Library/bin/*.dll ../torchvision
-    echo "FFmpeg is disabled currently on Windows"
-  else
-    if [[ "$(uname)" == Darwin ]]; then
-      conda install -yq ffmpeg=4.2 -c pytorch
-      conda install -yq wget
-    else
-      # pushd ext_libraries
-      # wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/linux-64/ffmpeg-4.2-hf484d3e_0.tar.bz2
-      # tar -xjvf ffmpeg-4.2-hf484d3e_0.tar.bz2
-      # rm -rf ffmpeg-4.2-hf484d3e_0.tar.bz2
-      # ldconfig
-      # which ffmpeg
-      # popd
-      echo "FFmpeg is disabled currently on Linux"
-    fi
-  fi
-}
diff --git a/functorch/packaging/windows/internal/cuda_install.bat b/functorch/packaging/windows/internal/cuda_install.bat
deleted file mode 100644
index 41960224ebaed..0000000000000
--- a/functorch/packaging/windows/internal/cuda_install.bat
+++ /dev/null
@@ -1,264 +0,0 @@
-@echo on
-
-if "%CU_VERSION%" == "cpu" (
-    echo Skipping for CPU builds
-    exit /b 0
-)
-
-set SRC_DIR=%~dp0\..
-
-if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
-
-rem in unit test workflow, we get CUDA_VERSION, for example 11.1
-if defined CUDA_VERSION (
-    set CUDA_VER=%CUDA_VERSION:.=%
-) else (
-    set CUDA_VER=%CU_VERSION:cu=%
-)
-
-set /a CUDA_VER=%CU_VERSION:cu=%
-set CUDA_VER_MAJOR=%CUDA_VER:~0,-1%
-set CUDA_VER_MINOR=%CUDA_VER:~-1,1%
-set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR%
-
-
-if %CUDA_VER% EQU 92 goto cuda92
-if %CUDA_VER% EQU 100 goto cuda100
-if %CUDA_VER% EQU 101 goto cuda101
-if %CUDA_VER% EQU 102 goto cuda102
-if %CUDA_VER% EQU 110 goto cuda110
-if %CUDA_VER% EQU 111 goto cuda111
-if %CUDA_VER% EQU 112 goto cuda112
-if %CUDA_VER% EQU 113 goto cuda113
-if %CUDA_VER% EQU 115 goto cuda115
-
-
-echo CUDA %CUDA_VERSION_STR% is not supported
-exit /b 1
-
-:cuda92
-if not exist "%SRC_DIR%\temp_build\cuda_9.2.148_win10.exe" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/win2016/cuda_9.2.148_win10.exe --output "%SRC_DIR%\temp_build\cuda_9.2.148_win10.exe"
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_9.2.148_win10.exe"
-    set "ARGS=nvcc_9.2 cuobjdump_9.2 nvprune_9.2 cupti_9.2 cublas_9.2 cublas_dev_9.2 cudart_9.2 cufft_9.2 cufft_dev_9.2 curand_9.2 curand_dev_9.2 cusolver_9.2 cusolver_dev_9.2 cusparse_9.2 cusparse_dev_9.2 nvgraph_9.2 nvgraph_dev_9.2 npp_9.2 npp_dev_9.2 nvrtc_9.2 nvrtc_dev_9.2 nvml_dev_9.2"
-)
-
-if not exist "%SRC_DIR%\temp_build\cudnn-9.2-windows10-x64-v7.2.1.38.zip" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/win2016/cudnn-9.2-windows10-x64-v7.2.1.38.zip --output "%SRC_DIR%\temp_build\cudnn-9.2-windows10-x64-v7.2.1.38.zip"
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-9.2-windows10-x64-v7.2.1.38.zip"
-)
-
-goto cuda_common
-
-:cuda100
-
-if not exist "%SRC_DIR%\temp_build\cuda_10.0.130_411.31_win10.exe" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/win2016/cuda_10.0.130_411.31_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.0.130_411.31_win10.exe"
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.0.130_411.31_win10.exe"
-    set "ARGS=nvcc_10.0 cuobjdump_10.0 nvprune_10.0 cupti_10.0 cublas_10.0 cublas_dev_10.0 cudart_10.0 cufft_10.0 cufft_dev_10.0 curand_10.0 curand_dev_10.0 cusolver_10.0 cusolver_dev_10.0 cusparse_10.0 cusparse_dev_10.0 nvgraph_10.0 nvgraph_dev_10.0 npp_10.0 npp_dev_10.0 nvrtc_10.0 nvrtc_dev_10.0 nvml_dev_10.0"
-)
-
-if not exist "%SRC_DIR%\temp_build\cudnn-10.0-windows10-x64-v7.4.1.5.zip" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/win2016/cudnn-10.0-windows10-x64-v7.4.1.5.zip --output "%SRC_DIR%\temp_build\cudnn-10.0-windows10-x64-v7.4.1.5.zip"
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.0-windows10-x64-v7.4.1.5.zip"
-)
-
-goto cuda_common
-
-:cuda101
-
-if not exist "%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.1.243_426.00_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe"
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe"
-    set "ARGS=nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvjpeg_10.1 nvjpeg_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
-)
-
-if not exist "%SRC_DIR%\temp_build\cudnn-10.1-windows10-x64-v7.6.4.38.zip" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-10.1-windows10-x64-v7.6.4.38.zip --output "%SRC_DIR%\temp_build\cudnn-10.1-windows10-x64-v7.6.4.38.zip"
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.1-windows10-x64-v7.6.4.38.zip"
-)
-
-goto cuda_common
-
-:cuda102
-
-if not exist "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.2.89_441.22_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe"
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe"
-    set "ARGS=nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvjpeg_10.2 nvjpeg_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2"
-)
-
-if not exist "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-10.2-windows10-x64-v7.6.5.32.zip --output "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip"
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip"
-)
-
-rem The below only for cu102, if it's used in other version, e.g. cu111, torch.cuda.is_availabe() would be False.
-if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.7z" (
-    curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip"
-    if errorlevel 1 exit /b 1
-)
-
-echo Installing GPU driver DLLs
-7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -aoa -o"C:\Windows\System32"
-
-goto cuda_common
-
-:cuda110
-
-if not exist "%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.0.2_451.48_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe"
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe"
-    set "ARGS=nvcc_11.0 cuobjdump_11.0 nvprune_11.0 nvprof_11.0 cupti_11.0 cublas_11.0 cublas_dev_11.0 cudart_11.0 cufft_11.0 cufft_dev_11.0 curand_11.0 curand_dev_11.0 cusolver_11.0 cusolver_dev_11.0 cusparse_11.0 cusparse_dev_11.0 npp_11.0 npp_dev_11.0 nvjpeg_11.0 nvjpeg_dev_11.0 nvrtc_11.0 nvrtc_dev_11.0 nvml_dev_11.0"
-)
-
-if not exist "%SRC_DIR%\temp_build\cudnn-11.0-windows-x64-v8.0.4.30.zip" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-11.0-windows-x64-v8.0.4.30.zip --output "%SRC_DIR%\temp_build\cudnn-11.0-windows-x64-v8.0.4.30.zip"
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-11.0-windows-x64-v8.0.4.30.zip"
-)
-
-goto cuda_common
-
-:cuda111
-
-if not exist "%SRC_DIR%\temp_build\cuda_11.1.1_456.81_win10.exe" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.1.1_456.81_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.1.1_456.81_win10.exe"
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.1.1_456.81_win10.exe"
-    set "ARGS=nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvjpeg_11.1 nvjpeg_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
-)
-
-if not exist "%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-11.1-windows-x64-v8.0.5.39.zip --output "%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip"
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip"
-)
-
-goto cuda_common
-
-:cuda112
-
-if not exist "%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe" (
-    curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.2.0_460.89_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe"
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe"
-    set "ARGS=nvcc_11.2 cuobjdump_11.2 nvprune_11.2 nvprof_11.2 cupti_11.2 cublas_11.2 cublas_dev_11.2 cudart_11.2 cufft_11.2 cufft_dev_11.2 curand_11.2 curand_dev_11.2 cusolver_11.2 cusolver_dev_11.2 cusparse_11.2 cusparse_dev_11.2 npp_11.2 npp_dev_11.2 nvjpeg_11.2 nvjpeg_dev_11.2 nvrtc_11.2 nvrtc_dev_11.2 nvml_dev_11.2"
-)
-
-if not exist "%SRC_DIR%\temp_build\cudnn-11.2-windows-x64-v8.1.0.77.zip" (
-    curl -k -L http://s3.amazonaws.com/ossci-windows/cudnn-11.2-windows-x64-v8.1.0.77.zip --output "%SRC_DIR%\temp_build\cudnn-11.2-windows-x64-v8.1.0.77.zip"
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-11.2-windows-x64-v8.1.0.77.zip"
-)
-
-goto cuda_common
-
-:cuda113
-
-set CUDA_INSTALL_EXE=cuda_11.3.0_465.89_win10.exe
-if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
-    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS=thrust_11.3 nvcc_11.3 cuobjdump_11.3 nvprune_11.3 nvprof_11.3 cupti_11.3 cublas_11.3 cublas_dev_11.3 cudart_11.3 cufft_11.3 cufft_dev_11.3 curand_11.3 curand_dev_11.3 cusolver_11.3 cusolver_dev_11.3 cusparse_11.3 cusparse_dev_11.3 npp_11.3 npp_dev_11.3 nvjpeg_11.3 nvjpeg_dev_11.3 nvrtc_11.3 nvrtc_dev_11.3 nvml_dev_11.3"
-
-)
-
-set CUDNN_INSTALL_ZIP=cudnn-11.3-windows-x64-v8.2.0.53.zip
-if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
-    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
-)
-
-goto cuda_common
-
-:cuda115
-
-set CUDA_INSTALL_EXE=cuda_11.5.0_496.13_win10.exe
-if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
-    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS=thrust_11.5 nvcc_11.5 cuobjdump_11.5 nvprune_11.5 nvprof_11.5 cupti_11.5 cublas_11.5 cublas_dev_11.5 cudart_11.5 cufft_11.5 cufft_dev_11.5 curand_11.5 curand_dev_11.5 cusolver_11.5 cusolver_dev_11.5 cusparse_11.5 cusparse_dev_11.5 npp_11.5 npp_dev_11.5 nvrtc_11.5 nvrtc_dev_11.5 nvml_dev_11.5"
-)
-
-set CUDNN_INSTALL_ZIP=cudnn-11.3-windows-x64-v8.2.0.53.zip
-if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
-    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
-)
-
-goto cuda_common
-
-:cuda_common
-
-if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" (
-    curl -k -L https://www.dropbox.com/s/9mcolalfdj4n979/NvToolsExt.7z?dl=1 --output "%SRC_DIR%\temp_build\NvToolsExt.7z"
-    if errorlevel 1 exit /b 1
-)
-
-echo Installing CUDA toolkit...
-7z x %CUDA_SETUP_FILE% -o"%SRC_DIR%\temp_build\cuda"
-pushd "%SRC_DIR%\temp_build\cuda"
-sc config wuauserv start= disabled
-sc stop wuauserv
-sc query wuauserv
-
-start /wait setup.exe -s %ARGS% -loglevel:6 -log:"%cd%/cuda_install_logs"
-echo %errorlevel%
-
-popd
-
-echo Installing VS integration...
-rem It's for VS 2019
-if "%CUDA_VER_MAJOR%" == "10" (
-    xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations"
-)
-if "%CUDA_VER_MAJOR%" == "11" (
-    xcopy /Y "%SRC_DIR%\temp_build\cuda\visual_studio_integration\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations"
-)
-
-echo Installing NvToolsExt...
-7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt"
-mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
-mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
-mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
-xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
-xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
-xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
-
-echo Setting up environment...
-set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%"
-set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%"
-set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%"
-set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
-
-if not exist "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" (
-    echo CUDA %CUDA_VERSION_STR% installed failed.
-    echo --------- RunDll32.exe.log
-    type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.RunDll32.exe.log"
-    echo --------- setup.exe.log -------
-    type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.setup.exe.log"
-    exit /b 1
-)
-
-echo Installing cuDNN...
-7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn"
-xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin"
-xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\lib\x64\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64"
-xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include"
-
-echo Cleaning temp files
-rd /s /q "%SRC_DIR%\temp_build" || ver > nul
diff --git a/functorch/packaging/windows/internal/driver_update.bat b/functorch/packaging/windows/internal/driver_update.bat
deleted file mode 100644
index 00b43affc01cc..0000000000000
--- a/functorch/packaging/windows/internal/driver_update.bat
+++ /dev/null
@@ -1,25 +0,0 @@
-set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe"
-curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe
-if errorlevel 1 exit /b 1
-
-start /wait 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -s -noreboot
-if errorlevel 1 exit /b 1
-
-del 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe || ver > NUL
-
-setlocal EnableDelayedExpansion
-set NVIDIA_GPU_EXISTS=0
-for /F "delims=" %%i in ('wmic path win32_VideoController get name') do (
-    set GPUS=%%i
-    if not "x!GPUS:NVIDIA=!" == "x!GPUS!" (
-        SET NVIDIA_GPU_EXISTS=1
-        goto gpu_check_end
-    )
-)
-:gpu_check_end
-endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS%
-
-if "%NVIDIA_GPU_EXISTS%" == "0" (
-    echo "CUDA Driver installation Failed"
-    exit /b 1
-)
diff --git a/functorch/packaging/windows/internal/vc_env_helper.bat b/functorch/packaging/windows/internal/vc_env_helper.bat
deleted file mode 100644
index e85a372f93d58..0000000000000
--- a/functorch/packaging/windows/internal/vc_env_helper.bat
+++ /dev/null
@@ -1,43 +0,0 @@
-@echo on
-
-set VC_VERSION_LOWER=16
-set VC_VERSION_UPPER=17
-if "%VC_YEAR%" == "2017" (
-    set VC_VERSION_LOWER=15
-    set VC_VERSION_UPPER=16
-)
-
-for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do (
-    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
-        set "VS15INSTALLDIR=%%i"
-        set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat"
-        goto vswhere
-    )
-)
-
-:vswhere
-if "%VSDEVCMD_ARGS%" == "" (
-    call "%VS15VCVARSALL%" x64 || exit /b 1
-) else (
-    call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1
-)
-
-@echo on
-
-set DISTUTILS_USE_SDK=1
-
-set args=%1
-shift
-:start
-if [%1] == [] goto done
-set args=%args% %1
-shift
-goto start
-
-:done
-if "%args%" == "" (
-    echo Usage: vc_env_helper.bat [command] [args]
-    echo e.g. vc_env_helper.bat cl /c test.cpp
-)
-
-%args% || exit /b 1
diff --git a/functorch/packaging/windows/internal/vc_install_helper.sh b/functorch/packaging/windows/internal/vc_install_helper.sh
deleted file mode 100644
index cdae18065b9f6..0000000000000
--- a/functorch/packaging/windows/internal/vc_install_helper.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-if [[ "$CU_VERSION" == "cu92" ]]; then
-  export VC_YEAR=2017
-  export VSDEVCMD_ARGS="-vcvars_ver=14.13"
-  powershell packaging/windows/internal/vs2017_install.ps1
-elif [[ "$CU_VERSION" == "cu100" ]]; then
-  export VC_YEAR=2017
-  export VSDEVCMD_ARGS=""
-  powershell packaging/windows/internal/vs2017_install.ps1
-else
-  export VC_YEAR=2019
-  export VSDEVCMD_ARGS=""
-fi

From 7ab32b1ad78eaafcb37f6bf5511f8fae47468a38 Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Fri, 11 Nov 2022 19:17:47 +0000
Subject: [PATCH 0817/1922] Fix matmul decomp to use reshape instead of
 contiguous().view() (#88832)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88832
Approved by: https://github.com/bertmaher, https://github.com/ngimel
---
 torch/_decomp/decompositions.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index fe63e0db007a7..1a2d332e99fd9 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -2261,9 +2261,7 @@ def matmul(tensor1, tensor2):
         t2_is_matrix = t2.dim() == 2
         if t2_is_matrix:
             output_shape.append(t2.shape[1])
-        # HACK: We need reshape with symint support
-        t1 = t1.contiguous()
-        t1_folded = t1.view(folded_dim1, sizes_1[-1])
+        t1_folded = t1.reshape(folded_dim1, sizes_1[-1])
         if t2_is_matrix:
             # FIXME This path always does an unnecessary copy when transpose == True as the returned
             # result from BLAS is already C-transposed
@@ -2296,15 +2294,11 @@ def matmul(tensor1, tensor2):
         expand_batch_product = prod(expand_batch_portion)
 
         # HACK: We need reshape with symint support
-        tensor1_expanded = (
-            tensor1.expand(tensor1_expand_size)
-            .contiguous()
-            .view(expand_batch_product, n, m1)
+        tensor1_expanded = tensor1.expand(tensor1_expand_size).reshape(
+            expand_batch_product, n, m1
         )
-        tensor2_expanded = (
-            tensor2.expand(tensor2_expand_size)
-            .contiguous()
-            .view(expand_batch_product, m2, p)
+        tensor2_expanded = tensor2.expand(tensor2_expand_size).reshape(
+            expand_batch_product, m2, p
         )
 
         output_shape = expand_batch_portion

From 3971acfd12f3c1af41764147059327c7d93d5dbb Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Sat, 12 Nov 2022 00:22:25 +0000
Subject: [PATCH 0818/1922] Add comprehensive minifier tests (#88022)

Adds tests for https://github.com/pytorch/torchdynamo/issues/1241.

To run: `pytest test/dynamo/test_minifier.py`.

Actually runs minifier launcher script and repro scripts, rather than just checking for existence of the minifier launcher script.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88022
Approved by: https://github.com/mlazos, https://github.com/anijain2305
---
 test/dynamo/test_minifier.py | 630 +++++++++++++++++++++++++++++++----
 torch/_dynamo/debug_utils.py |  78 ++++-
 2 files changed, 632 insertions(+), 76 deletions(-)

diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index 0cec7d202a9d4..51b79a5e7511e 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -1,27 +1,138 @@
 # Owner(s): ["module: dynamo"]
+import functools
 import os
+import re
 import shutil
+import subprocess
+import textwrap
 import unittest
-from unittest.mock import patch
 
 import torch
 import torch._dynamo
 import torch._dynamo.test_case
 import torch._dynamo.testing
-from torch._dynamo.optimizations.backends import create_backend
+import torch._inductor.utils
+from torch._dynamo.debug_utils import TEST_REPLACEABLE_COMMENT
 
+_HAS_TRITON = torch._inductor.utils.has_triton()
+requires_cuda = functools.partial(unittest.skipIf, not _HAS_TRITON, "requires cuda")
 
-class MockModule(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
+RELU_COMPILE_ERROR_BACKEND = """\
+from torch._dynamo.optimizations.backends import register_backend
 
-    def forward(self, x):
-        for _ in range(10):
-            x = torch.sin(x)
-        x = torch._foobar(x)
-        for _ in range(10):
-            x = torch.cos(x)
-        return x
+class DynamoCompileError(Exception):
+    pass
+
+@register_backend
+def test_relu_compile_error(gm: torch.fx.GraphModule, example_inputs):
+    for node in gm.graph.nodes:
+        if node.target == torch.relu:
+            raise DynamoCompileError("relu found")
+    return gm
+"""
+
+RELU_RUNTIME_ERROR_BACKEND = """\
+import copy
+from torch._dynamo.optimizations.backends import register_backend
+
+@register_backend
+def test_relu_runtime_error(gm: torch.fx.GraphModule, example_inputs):
+    gm = copy.deepcopy(gm)
+    for node in gm.graph.nodes:
+        if node.target == torch.relu:
+            node.target = torch._assert
+            node.args = (False, "DynamoRuntimeError")
+    gm.recompile()
+    return gm
+"""
+
+RELU_ACCURACY_ERROR_BACKEND = """\
+import copy
+from torch._dynamo.optimizations.backends import register_backend
+
+@register_backend
+def test_relu_accuracy_error(gm: torch.fx.GraphModule, example_inputs):
+    gm = copy.deepcopy(gm)
+    for node in gm.graph.nodes:
+        if node.target == torch.relu:
+            node.target = torch.add
+            node.args = (node.args[0], 1)
+    gm.recompile()
+
+    return gm
+"""
+
+RELU_CUSTOM_ERROR_BACKEND = """\
+class CustomError(Exception):
+    pass
+
+def test_relu_custom_error(gm: torch.fx.GraphModule, example_inputs):
+    for node in gm.graph.nodes:
+        if node.target == torch.relu:
+            raise CustomError("relu found")
+    return gm
+"""
+
+CPP_COMPILE_ERROR = """\
+def cpp_compile_error(x):
+    return "compile error!"
+"""
+
+CPP_RUNTIME_ERROR = """\
+def cpp_runtime_error(x):
+    return f"{x}; throw 1"
+"""
+
+CPP_ACCURACY_ERROR = """\
+def cpp_accuracy_error(x):
+    return f"{x} + 1"
+"""
+
+TRITON_COMPILE_ERROR = """\
+def triton_compile_error(x):
+    return "compile error!"
+"""
+
+# NOTE: there is currently not an easy way to cause a triton runtime error.
+TRITON_RUNTIME_ERROR = """\
+def triton_runtime_error(x):
+    return f"{x}; assert?"
+"""
+
+TRITON_ACCURACY_ERROR = """\
+def triton_accuracy_error(x):
+    return f"{x} + 1"
+"""
+
+DEBUG_DIR = "/tmp/_torchdynamo_debug_/"
+
+# Search for the name of the first function defined in a code string.
+def get_fn_name(code):
+    fn_name_match = re.search(r"def (\w+)\(", code)
+    if fn_name_match is not None:
+        return fn_name_match.group(1)
+    return None
+
+
+# Generates code that patches CppOverrides/TritonOverrides.
+def gen_codegen_fn_patch_code(old_fn_name, new_fn_code, device):
+    new_fn_name = get_fn_name(new_fn_code)
+    if new_fn_name is not None:
+        patch_code = f"""\
+import torch._inductor.codegen.{"cpp" if device == "cpu" else "triton"} as codegen
+overrides = codegen.{"CppOverrides" if device == "cpu" else "TritonOverrides"}
+{new_fn_code}
+overrides.{old_fn_name} = staticmethod({new_fn_name})
+"""
+        return f"""\
+{patch_code}
+isolate_fails_code_str = \"\"\"\\
+{patch_code}
+torch._dynamo.config.debug_dir_root = "{DEBUG_DIR}"
+\"\"\"
+"""
+
+    return None
 
 
 class MinfierTests(torch._dynamo.test_case.TestCase):
@@ -32,9 +143,10 @@ def setUpClass(cls):
             unittest.mock.patch.object(
                 torch._dynamo.config,
                 "debug_dir_root",
-                "/tmp/_torchdynamo_debug_/",
+                DEBUG_DIR,
             )
         )
+        os.makedirs(DEBUG_DIR, exist_ok=True)
 
     @classmethod
     def tearDownClass(cls):
@@ -47,65 +159,455 @@ def setUp(self):
     def tearDown(self):
         super().tearDown()
 
-    def test_after_dynamo(self):
-        @create_backend
-        def bad_dynamo_backend(subgraph):
-            import sys
-
-            def f(*args):
-                # Shifted the forced exception to runtime as this is more common
-                # in JIT compilers.
-                for node in subgraph.model.graph.nodes:
-                    if node.op == "call_function" and node.target is torch._foobar:
-                        sys.stdout.write("Dynamo compiled failed\n")
-                        raise NotImplementedError("foobar is not implemented")
-                return subgraph.model(*args)
-
-            return f
-
-        mod = MockModule()
-        opt_mod = torch._dynamo.optimize("bad_dynamo_backend")(mod)
-        repro_file = torch._dynamo.debug_utils.get_minifier_repro_path()
-
-        @patch.object(torch._dynamo.config, "repro_after", "dynamo")
-        def inner():
-            x = torch.randn(4)
-            try:
-                opt_mod(x)
-            except Exception:
-                pass
-
-        inner()
-        self.assertTrue(os.path.exists(repro_file))
+    # Run `code` in a separate python process.
+    # Returns the completed process state and the directory containing the
+    # minifier launcher script, if `code` outputted it.
+    def _run_test_code(self, code):
+        proc = subprocess.run(
+            ["python3", "-c", code], capture_output=True, cwd=DEBUG_DIR
+        )
+
+        repro_dir_match = re.search(
+            r"(\S+)minifier_launcher.py", proc.stderr.decode("utf-8")
+        )
+        if repro_dir_match is not None:
+            # Print repro directory for debugging generated code.
+            # Make sure to comment out `shutil.rmtree...` above as well.
+            print("repro dir:", repro_dir_match.group(1))
+            return proc, repro_dir_match.group(1)
+        return proc, None
 
-    # If error_at_aot is True, an error will be produced when AOTAutograd
-    # attempts to generate the backward graph.
-    # If error_after_aot is False, an error will be produced in inductor.
-    def _test_around_aot(self, error_at_aot):
-        mod = MockModule()
-        opt_mod = torch._dynamo.optimize("inductor")(mod)
+    # Patch generated files with testing patches
+    def _inject_code(self, patch_code, filename):
+        patch_code = f"""\
+{patch_code}
+torch._dynamo.config.debug_dir_root = "{DEBUG_DIR}"
+"""
+        with open(filename, "r") as f:
+            code = f.read()
+        code = code.replace(TEST_REPLACEABLE_COMMENT, patch_code)
+        with open(filename, "w") as f:
+            f.write(code)
+        return code
 
-        repro_file = torch._dynamo.debug_utils.get_minifier_repro_path()
-        repro_after = "dynamo" if error_at_aot else "aot"
+    # Runs the minifier launcher script in `repro_dir`, patched with `patch_code`.
+    def _run_minifier_launcher(self, patch_code, repro_dir):
+        self.assertIsNotNone(repro_dir)
+        launch_file = os.path.join(repro_dir, "minifier_launcher.py")
+        self.assertTrue(os.path.exists(launch_file))
+        launch_code = self._inject_code(patch_code, launch_file)
 
-        @patch.object(torch._dynamo.config, "repro_after", repro_after)
-        def inner():
-            x = torch.randn(4)
-            x.requires_grad = error_at_aot
-            try:
-                opt_mod(x)
-            except Exception:
-                pass
+        launch_proc = subprocess.run(
+            ["python3", launch_file],
+            capture_output=True,
+            cwd=repro_dir,
+        )
 
-        inner()
+        return launch_proc, launch_code
 
+    # Runs the repro script in `repro_dir`, patched with `patch_code`
+    def _run_repro(self, patch_code, repro_dir):
+        self.assertIsNotNone(repro_dir)
+        repro_file = os.path.join(repro_dir, "repro.py")
         self.assertTrue(os.path.exists(repro_file))
+        repro_code = self._inject_code(patch_code, repro_file)
+
+        repro_proc = subprocess.run(
+            ["python3", repro_file], capture_output=True, cwd=repro_dir
+        )
+
+        return repro_proc, repro_code
+
+    # Template for testing code.
+    # `run_code` is the code to run for the test case.
+    # `patch_code` is the code to be patched in every generated file.
+    def _gen_test_code(self, run_code, repro_after, repro_level, patch_code):
+        return f"""\
+import torch
+import torch._dynamo
+{patch_code}
+torch._dynamo.config.repro_after = "{repro_after}"
+torch._dynamo.config.repro_level = {repro_level}
+torch._dynamo.config.debug_dir_root = "{DEBUG_DIR}"
+{run_code}
+"""
+
+    # Runs a full minifier test.
+    # Minifier tests generally consist of 3 stages:
+    # 1. Run the problematic code (in a separate process since it could segfault)
+    # 2. Run the generated minifier launcher script
+    # 3. Run the generated repro script
+    def _run_full_test(self, run_code, repro_after, repro_level, patch_code):
+        test_code = self._gen_test_code(run_code, repro_after, repro_level, patch_code)
+        test_proc, repro_dir = self._run_test_code(test_code)
+        self.assertIsNotNone(repro_dir)
+        launch_proc, launch_code = self._run_minifier_launcher(patch_code, repro_dir)
+        repro_proc, repro_code = self._run_repro(patch_code, repro_dir)
+        return ((test_proc, launch_proc, repro_proc), (launch_code, repro_code))
+
+    # Test that compile, runtime, and accuracy errors after dynamo can be repro'd (both CPU and CUDA)
+    def _test_after_dynamo(self, device, repro_level, backend_code, error_name):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("{get_fn_name(backend_code)}")
+            def inner(x):
+                for _ in range(10):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(10):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+
+        (test_proc, _, repro_proc), _ = self._run_full_test(
+            run_code, "dynamo", repro_level, backend_code
+        )
+
+        self.assertIn(error_name, test_proc.stderr.decode("utf-8"))
+        self.assertIn(error_name, repro_proc.stderr.decode("utf-8"))
+
+    def test_after_dynamo_cpu_compile_error(self):
+        self._test_after_dynamo(
+            "cpu", 2, RELU_COMPILE_ERROR_BACKEND, "DynamoCompileError"
+        )
+
+    def test_after_dynamo_cpu_runtime_error(self):
+        self._test_after_dynamo(
+            "cpu", 2, RELU_RUNTIME_ERROR_BACKEND, "DynamoRuntimeError"
+        )
+
+    def test_after_dynamo_cpu_accuracy_error(self):
+        self._test_after_dynamo("cpu", 4, RELU_ACCURACY_ERROR_BACKEND, "AccuracyError")
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_compile_error(self):
+        self._test_after_dynamo(
+            "cuda", 2, RELU_COMPILE_ERROR_BACKEND, "DynamoCompileError"
+        )
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_runtime_error(self):
+        self._test_after_dynamo(
+            "cuda", 2, RELU_RUNTIME_ERROR_BACKEND, "DynamoRuntimeError"
+        )
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_accuracy_error(self):
+        self._test_after_dynamo("cuda", 4, RELU_ACCURACY_ERROR_BACKEND, "AccuracyError")
+
+    # Ensure that the testing backends pass when relu is not present.
+    def _test_after_dynamo_backend_passes(self, device, repro_level, backend_code):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("{get_fn_name(backend_code)}")
+            def inner(x):
+                for _ in range(10):
+                    x = torch.sin(x)
+                for _ in range(10):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+
+        test_code = self._gen_test_code(run_code, "dynamo", repro_level, backend_code)
+        proc, repro_dir = self._run_test_code(test_code)
+        self.assertEqual(proc.returncode, 0)
+        self.assertIsNone(repro_dir)
+
+    def test_after_dynamo_cpu_compile_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cpu", 2, RELU_COMPILE_ERROR_BACKEND)
+
+    def test_after_dynamo_cpu_runtime_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cpu", 2, RELU_RUNTIME_ERROR_BACKEND)
+
+    def test_after_dynamo_cpu_accuracy_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cpu", 4, RELU_ACCURACY_ERROR_BACKEND)
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_compile_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cuda", 2, RELU_COMPILE_ERROR_BACKEND)
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_runtime_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cuda", 2, RELU_RUNTIME_ERROR_BACKEND)
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_accuracy_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cuda", 4, RELU_ACCURACY_ERROR_BACKEND)
+
+    # Ensure that generated code with a custom backends generates a runnable minifier
+    # launcher script that results in a RuntimeError
+    def test_after_dynamo_custom_backend(self):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize({get_fn_name(RELU_CUSTOM_ERROR_BACKEND)})
+            def inner(x):
+                for _ in range(10):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(10):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20))
+        """
+        )
+
+        test_code = self._gen_test_code(
+            run_code, "dynamo", 2, RELU_CUSTOM_ERROR_BACKEND
+        )
+        _, repro_dir = self._run_test_code(test_code)
+        launch_proc, launch_code = self._run_minifier_launcher("", repro_dir)
+        self.assertIn("RuntimeError", launch_proc.stderr.decode("utf-8"))
+
+    # Test that a module with mixed cpu/cuda parts with an error after dynamo can be repro'd
+    @requires_cuda()
+    def test_cpu_cuda_module_after_dynamo(self):
+        backend_name = get_fn_name(RELU_COMPILE_ERROR_BACKEND)
+
+        run_code = textwrap.dedent(
+            f"""\
+            class CpuCudaModule(torch.nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.m_x = torch.nn.Linear(20, 20).cuda()
+                    self.m_y = torch.nn.Linear(20, 20)
+                    self.p_x = torch.nn.Parameter(torch.randn(20, 20).cuda())
+                    self.p_y = torch.nn.Parameter(torch.randn(20, 20))
+                    self.register_buffer("b_x", torch.ones(20, 20).cuda())
+                    self.register_buffer("b_y", torch.ones(20, 20))
+
+                def forward(self, x, y):
+                    return self.m_x(x) + self.p_x + self.b_x, self.m_y(y) + self.p_y + self.b_y
+
+            mod = CpuCudaModule()
+
+            @torch._dynamo.optimize("{backend_name}")
+            def inner(x1, y1):
+                x2 = torch.randn(20, 20).cuda()
+                y2 = torch.randn(20, 20)
+                x3, y3 = mod(x1 + x2, y1 + y2)
+                return torch.relu(x3.cpu() + y3)
+
+            inner(torch.randn(20, 20).cuda(), torch.randn(20, 20))
+        """
+        )
+
+        (test_proc, _, repro_proc), (launch_code, _) = self._run_full_test(
+            run_code, "dynamo", 2, RELU_COMPILE_ERROR_BACKEND
+        )
+
+        tb1 = test_proc.stderr.decode("utf-8")
+        tb2 = repro_proc.stderr.decode("utf-8")
+
+        # Check if generated minifier code covers all cpu/cuda cases
+        self.assertIsNotNone(re.search(r"args.*cuda", launch_code))
+        self.assertIsNotNone(re.search(r"args.*cpu", launch_code))
+        # search for Linear(...).cuda()
+        self.assertIsNotNone(re.search(r"Linear.*cuda", launch_code))
+        # search for Linear(...)
+        self.assertIsNotNone(
+            re.search(r"Linear(?!.*cuda.*$)", launch_code, re.MULTILINE)
+        )
+        self.assertIsNotNone(re.search(r"register_buffer.*cuda", launch_code))
+        self.assertIsNotNone(
+            re.search(r"register_buffer(?!.*cuda.*$)", launch_code, re.MULTILINE)
+        )
+        self.assertIsNotNone(re.search(r"Parameter.*cuda", launch_code))
+        self.assertIsNotNone(
+            re.search(r"Parameter(?!.*cuda.*$)", launch_code, re.MULTILINE)
+        )
+        # search for
+        # <name> = torch.randn(...)
+        # ... = <name>.cuda()
+        self.assertIsNotNone(
+            re.search(r"(\w+) = torch.randn.*\1\.cuda", launch_code, re.DOTALL)
+        )
+        # search for
+        # <name> = torch.randn(...)
+        # no followup call to <name>.cuda()
+        self.assertIsNotNone(
+            re.search(
+                r"(\w+) = torch.randn(?!.*\1\.cuda\(\).*$)", launch_code, re.DOTALL
+            )
+        )
+
+        self.assertIn(backend_name, tb1)
+        self.assertIn(backend_name, tb2)
+
+    # Test if we can actually get a minified graph
+    def test_if_graph_minified(self):
+        backend_name = get_fn_name(RELU_COMPILE_ERROR_BACKEND)
+
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("{backend_name}")
+            def inner(x):
+                for _ in range(20):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(20):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20))
+        """
+        )
+
+        (test_proc, _, repro_proc), (launch_code, repro_code) = self._run_full_test(
+            run_code, "dynamo", 2, RELU_COMPILE_ERROR_BACKEND
+        )
+
+        tb1 = test_proc.stderr.decode("utf-8")
+        tb2 = repro_proc.stderr.decode("utf-8")
+
+        self.assertIn(backend_name, tb1)
+        self.assertIn(backend_name, tb2)
+
+        # compare the length of the forward functions
+        match = re.search(r"def forward.*return", launch_code, re.DOTALL)
+        self.assertIsNotNone(match)
+        self.assertGreater(match.group(0).count("\n"), 40)
+
+        match = re.search(r"def forward.*return", repro_code, re.DOTALL)
+        self.assertIsNotNone(match)
+        self.assertLess(match.group(0).count("\n"), 5)
+
+    # Test that compile and accuracy errors after aot can be repro'd (both CPU and CUDA)
+    def _test_after_aot(self, device, backend_code, repro_level):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("inductor")
+            def inner(x):
+                for _ in range(3):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(3):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+        patch_code = gen_codegen_fn_patch_code("relu", backend_code, device)
+        self.assertIsNotNone(patch_code)
+        (test_proc, _, repro_proc), _ = self._run_full_test(
+            run_code, "aot", repro_level, patch_code
+        )
+        return (
+            (test_proc.stderr.decode("utf-8"), repro_proc.stderr.decode("utf-8")),
+            (test_proc.returncode, repro_proc.returncode),
+        )
+
+    def test_after_aot_cpu_compile_error(self):
+        (tb1, tb2), _ = self._test_after_aot("cpu", CPP_COMPILE_ERROR, 2)
+        self.assertIn("CppCompileError", tb1)
+        self.assertIn("CppCompileError", tb2)
+
+    def test_after_aot_cpu_accuracy_error(self):
+        (tb1, tb2), _ = self._test_after_aot("cpu", CPP_ACCURACY_ERROR, 4)
+        self.assertIn("AccuracyError", tb1)
+        self.assertIn("AccuracyError", tb2)
+
+    @requires_cuda()
+    def test_after_aot_cuda_compile_error(self):
+        (tb1, tb2), _ = self._test_after_aot("cuda", TRITON_COMPILE_ERROR, 2)
+        self.assertIn("SyntaxError", tb1)
+        self.assertIn("SyntaxError", tb2)
+
+    @requires_cuda()
+    def test_after_aot_cuda_accuracy_error(self):
+        (tb1, tb2), _ = self._test_after_aot("cuda", TRITON_ACCURACY_ERROR, 4)
+        self.assertIn("AccuracyError", tb1)
+        self.assertIn("AccuracyError", tb2)
+
+    # Test that runtime errors after aot can be repro'd (CPU only for now)
+    def _test_after_aot_runtime_error(self, device, backend_code):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("inductor")
+            def inner(x):
+                for _ in range(3):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(3):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+        patch_code = gen_codegen_fn_patch_code("relu", backend_code, device)
+        self.assertIsNotNone(patch_code)
+
+        (test_proc, _, repro_proc), _ = self._run_full_test(
+            run_code, "aot", 3, patch_code
+        )
+
+        self.assertNotIn("CompilerError", test_proc.stderr.decode("utf-8"))
+
+        self.assertEqual(test_proc.returncode, repro_proc.returncode)
+        self.assertNotEqual(test_proc.returncode, 0)
+
+    def test_after_aot_cpu_runtime_error(self):
+        self._test_after_aot_runtime_error("cpu", CPP_RUNTIME_ERROR)
+
+    # NOTE: there is currently not an easy way to cause a triton runtime error.
+    @unittest.skip
+    @requires_cuda()
+    def test_after_aot_cuda_runtime_error(self):
+        self._test_after_aot_runtime_error("cuda", TRITON_RUNTIME_ERROR)
+
+    # Ensure that inductor codegen patches pass when relu is not present.
+    def _test_after_aot_backend_passes(self, device, repro_level, backend_code):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("inductor")
+            def inner(x):
+                for _ in range(3):
+                    x = torch.sin(x)
+                for _ in range(3):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+        patch_code = gen_codegen_fn_patch_code("relu", backend_code, device)
+        self.assertIsNotNone(patch_code)
+
+        test_code = self._gen_test_code(run_code, "aot", repro_level, patch_code)
+        proc, repro_dir = self._run_test_code(test_code)
+        self.assertEqual(proc.returncode, 0)
+        self.assertIsNone(repro_dir)
+
+    def test_after_aot_cpu_compile_backend_passes(self):
+        self._test_after_aot_backend_passes("cpu", 2, CPP_COMPILE_ERROR)
+
+    def test_after_aot_cpu_runtime_backend_passes(self):
+        self._test_after_aot_backend_passes("cpu", 2, CPP_RUNTIME_ERROR)
+
+    def test_after_aot_cpu_accuracy_backend_passes(self):
+        self._test_after_aot_backend_passes("cpu", 4, CPP_ACCURACY_ERROR)
+
+    @requires_cuda()
+    def test_after_aot_cuda_compile_backend_passes(self):
+        self._test_after_aot_backend_passes("cuda", 2, TRITON_COMPILE_ERROR)
 
-    def test_at_aot(self):
-        self._test_around_aot(True)
+    # NOTE: there is currently not an easy way to cause a triton runtime error.
+    @unittest.skip
+    @requires_cuda()
+    def test_after_aot_cuda_runtime_backend_passes(self):
+        self._test_after_aot_backend_passes("cuda", 2, TRITON_RUNTIME_ERROR)
 
-    def test_after_aot(self):
-        self._test_around_aot(False)
+    @requires_cuda()
+    def test_after_aot_cuda_accuracy_backend_passes(self):
+        self._test_after_aot_backend_passes("cuda", 4, TRITON_ACCURACY_ERROR)
 
 
 if __name__ == "__main__":
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index f09991f9bf348..98a269fe8c9eb 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -84,6 +84,11 @@ def __init__(self):
 
         for module_name, module in gm.named_children():
             module_str = f"{module.__repr__()}"
+            # module should be a core torch.nn.Module, so all parameters
+            # should be on the same device.
+            example_param = next(module.parameters(), None)
+            if example_param is not None and example_param.is_cuda:
+                module_str = f"{module_str}.cuda()"
             model_str += f"{tab*2}self.{module_name} = {module_str}\n"
 
         for buffer_name, buffer in gm._buffers.items():
@@ -95,12 +100,16 @@ def __init__(self):
                 tensor_str = (
                     f"torch.randint(1, size={list(buffer.shape)}, dtype={buffer.dtype})"
                 )
+            if buffer.is_cuda:
+                tensor_str = f"{tensor_str}.cuda()"
             model_str += f"{tab*2}self.register_buffer('{buffer_name}', {tensor_str})\n"
 
         for param_name, param in gm._parameters.items():
             if param is None:
                 continue
             tensor_str = f"torch.nn.Parameter(torch.randn({list(param.shape)}, dtype={param.dtype}))"
+            if param.is_cuda:
+                tensor_str = f"{tensor_str}.cuda()"
             model_str += f"{tab*2}self.{param_name} = {tensor_str}\n"
 
         # TODO - Keep this code for now. But, I don't think we will need this.
@@ -145,6 +154,9 @@ def _cuda_system_info_comment():
     return model_str
 
 
+TEST_REPLACEABLE_COMMENT = "# REPLACEABLE COMMENT FOR TESTING PURPOSES"
+
+
 def generate_compiler_repro_string(gm, args):
     model_str = textwrap.dedent(
         f"""
@@ -155,6 +167,8 @@ def generate_compiler_repro_string(gm, args):
         from math import inf
         from torch.fx.experimental.proxy_tensor import make_fx
 
+        {TEST_REPLACEABLE_COMMENT}
+
         """
     )
     model_str += f"# torch version: {torch.version.__version__}\n"
@@ -170,7 +184,7 @@ def generate_compiler_repro_string(gm, args):
     model_str += (
         "args = [rand_strided(sh, st, dt, dev) for (sh, st, dt, dev) in args]\n"
     )
-    model_str += 'mod = make_fx(Repro().to(device="cuda"))(*args)\n'
+    model_str += "mod = make_fx(Repro())(*args)\n"
     return model_str
 
 
@@ -197,7 +211,8 @@ def dump_compiler_graph_state(gm, args, compiler_name):
     log.warning(f"Writing checkpoint with {len(gm.graph.nodes)} nodes to {file_name}")
     with open(file_name, "w") as fd:
         save_graph_repro(fd, gm, args, compiler_name)
-    repro_path = os.path.join(config.base_dir, "repro.py")
+    curdir = os.getcwd()
+    repro_path = os.path.join(curdir, "repro.py")
     try:
         shutil.copyfile(file_name, repro_path)
         log.warning(f"Copying repro file for convenience to {repro_path}")
@@ -216,7 +231,10 @@ def save_graph_repro(fd, gm, args, compiler_name):
             textwrap.dedent(
                 f"""
                 compiled = {COMPILER_REPRO_OPTIONS[compiler_name][1]}(mod, args)
-                assert same_two_models(mod, compiled, args, only_fwd=True), "Accuracy failed"
+                class AccuracyError(Exception):
+                    pass
+                if not same_two_models(mod, compiled, args, only_fwd=True):
+                    raise AccuracyError("Bad accuracy detected")
                 """
             )
         )
@@ -231,7 +249,7 @@ def save_graph_repro(fd, gm, args, compiler_name):
         )
 
 
-def isolate_fails(fx_g, args, compiler_name: str, env=None):
+def isolate_fails(fx_g, args, compiler_name: str, env=None, patch_code=None):
     if env is None:
         env = {}
     subdir = os.path.join(os.getcwd(), "isolate")
@@ -239,7 +257,10 @@ def isolate_fails(fx_g, args, compiler_name: str, env=None):
         os.makedirs(subdir, exist_ok=True)
     file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py")
     with open(file_name, "w") as fd:
-        fd.write(generate_compiler_repro_string(fx_g, args))
+        repro_code = generate_compiler_repro_string(fx_g, args)
+        if patch_code is not None:
+            repro_code = repro_code.replace(TEST_REPLACEABLE_COMMENT, patch_code)
+        fd.write(repro_code)
         fail_fn = COMPILER_REPRO_OPTIONS[compiler_name][2]
         fd.write(
             textwrap.dedent(
@@ -263,6 +284,7 @@ def isolate_fails(fx_g, args, compiler_name: str, env=None):
     stdout, stderr = TemporaryFile(), TemporaryFile()
     p = subprocess.Popen(
         ["python", file_name],
+        cwd=subdir,
         stdout=stdout,
         stderr=stderr,
         env=new_env,
@@ -329,6 +351,8 @@ def dump_to_minify(gm, args, compiler_name: str):
 
     contents = textwrap.dedent(
         f"""
+isolate_fails_code_str = None
+
 {generate_compiler_repro_string(gm, args)}
 
 from functools import partial
@@ -343,7 +367,7 @@ def dump_to_minify(gm, args, compiler_name: str):
 minifier(
     mod,
     args,
-    module_fails=partial(isolate_fails, env=env_variables, compiler_name="{compiler_name}"),
+    module_fails=partial(isolate_fails, env=env_variables, compiler_name="{compiler_name}", patch_code=isolate_fails_code_str),
     dump_state=partial(dump_compiler_graph_state, compiler_name="{compiler_name}"),
 )
         """
@@ -351,6 +375,10 @@ def dump_to_minify(gm, args, compiler_name: str):
     return helper_for_dump_minify(contents)
 
 
+class AccuracyError(Exception):
+    pass
+
+
 def wrap_compiler_debug(compiler_fn, compiler_name: str):
     """
     Minifier for Fx Graph modules after Aot Autograd has finished. We wrap both
@@ -410,7 +438,7 @@ def deferred_for_real_inputs(real_inputs):
                         copy_tensor_attrs,
                         f"{compiler_name}_accuracy",
                     )
-                    raise ValueError("Bad accuracy detected")
+                    raise AccuracyError("Bad accuracy detected")
                 else:
                     # Call the compiled function with real inputs
                     return inner_compiled_fn(real_inputs)
@@ -435,7 +463,8 @@ def deferred_for_real_inputs(real_inputs):
                             copy_tensor_attrs,
                             compiler_name,
                         )
-                    raise e
+                    log.error("CompilerError")
+                    raise
 
         if config.repro_after == "aot":
             compiled_fn = deferred_for_real_inputs
@@ -544,9 +573,14 @@ def generate_dynamo_fx_repro_string(
             f"""
 mod.eval()
 opt_mod.eval()
+
+class AccuracyError(Exception):
+    pass
+
 with torch.cuda.amp.autocast(enabled={torch.is_autocast_enabled()}):
     assert same_two_models(mod, mod, args), "Eager itself failed"
-    assert same_two_models(mod, opt_mod, args), "Dynamo failed"
+    if not same_two_models(mod, opt_mod, args):
+        raise AccuracyError("Dynamo failed")
     """
         )
 
@@ -561,12 +595,14 @@ def generate_dynamo_fx_repro_string(
 from {config.dynamo_import}.debug_utils import run_fwd_maybe_bwd
 from {config.dynamo_import}.debug_utils import same_two_models
 
+{TEST_REPLACEABLE_COMMENT}
+
 args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]}
 args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
 
 {model_str}
 
-mod = Repro().cuda()
+mod = Repro()
 opt_mod = {config.dynamo_import}.optimize("{compiler_name}")(mod)
 
 {run_code}
@@ -705,6 +741,21 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
     if config.repro_level == 4:
         minifier_backend = "dynamo_accuracy_minifier_backend"
 
+    custom_compiler_error = (
+        textwrap.dedent(
+            """\
+        raise RuntimeError(
+            'Compiler name is None - this likely means that a custom compiler '
+            'was called by torchdynamo. Please remove this error, import your '
+            'custom compiler function, and replace the compiler_name="None" '
+            'line below to compiler_name=<my_imported_custom_function>'
+        )
+        """
+        )
+        if compiler_name is None
+        else ""
+    )
+
     contents = textwrap.dedent(
         f"""
 import os
@@ -718,14 +769,17 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
 from {config.dynamo_import}.optimizations.backends import BACKENDS
 from {config.dynamo_import}.testing import rand_strided
 
+{TEST_REPLACEABLE_COMMENT}
+
 args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]}
 args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
 
 {model_str}
-mod = Repro().cuda()
+mod = Repro()
 
 # Setup debug minifier compiler
 compiler_fn = BACKENDS["{minifier_backend}"]
+{custom_compiler_error}
 dynamo_minifier_backend = functools.partial(
     compiler_fn,
     compiler_name="{compiler_name}",
@@ -769,7 +823,7 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                         example_inputs,
                         compiler_name,
                     )
-                    exc = ValueError("Bad accuracy detected.")
+                    exc = AccuracyError("Bad accuracy detected.")
                     exc.minifier_path = os.path.join(
                         minifier_dir(), "minifier_launcher.py"
                     )

From 4671c70bdad51e3083769d3c5b255ab164828294 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Fri, 11 Nov 2022 21:22:49 +0000
Subject: [PATCH 0819/1922] [FSDP+dynamo]: forward treats parameter-views as
 params (#88781)

Dynamo+AotAutograd needs a way to wrap all tensors (whether
inputs or params/buffers) in FakeTensor wrappers, and
FSDP's mangling of parameters hides them from this wrapping.

This PR unblocks running hf_bert and hf_T5 with FSDP under dynamo, whether using recursive wrapping around transformer layers or only applying FSDP around the whole model.  Perf/memory validation and possibly optimization is the next step.
`python benchmarks/dynamo/distributed.py --torchbench_model hf_Bert --fsdp --dynamo aot_eager`
`python benchmarks/dynamo/distributed.py --torchbench_model hf_Bert --fsdp --dynamo aot_eager --fsdp_wrap`
`python benchmarks/dynamo/distributed.py --torchbench_model hf_T5 --fsdp --dynamo aot_eager`
`python benchmarks/dynamo/distributed.py --torchbench_model hf_T5 --fsdp --dynamo aot_eager --fsdp_wrap`

The problem:
Dynamo (Actually aot_autograd) trips up with FSDP becuase it must
wrap all input tensors in FakeTensor wrappers, and it only knows
to wrap graph inputs or named_(parameters, buffers).  FSDP's
pre_forward hook sets views (which are not nn.param) into the flatparam
as attrs on the module with the same name as the original param, but
they will not show up in named_parameters.

- in use_orig_params mode, FSDP still de-registers
  params during pre-forward hook, then re-registers them
  post-forward
- during forward (between the hooks), the params are setattr'd
  on the module as regular view tensors, not nn.Parameters
- note: use_orig_params is the recommended way to use FSDP,
  and use_orig_params=False is being deprecated.  So i only consider
  use_orig_params=True for this enablement

The solution:
- adding them to named_buffers is not possible because it interferes
  with how FSDP's `_apply` works
- since they are not actual nn.parameters, register_parameter will
  complain about registering them
- simply seting `module._parameters[name] = view` seems to be a viable
  workaround, despite being hacky, and FSDP code does modify _parameters
  directly already.

Note: Manual checkpointing still isn't working with FSDP+dynamo,
so that will have to be addressed in a follow up.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88781
Approved by: https://github.com/ezyang, https://github.com/awgu
---
 benchmarks/dynamo/dist_util.py              |  20 +--
 benchmarks/dynamo/distributed.py            |   5 +-
 test/distributed/test_dynamo_distributed.py | 131 ++++++++++++++++----
 torch/distributed/fsdp/flat_param.py        |   4 +
 4 files changed, 124 insertions(+), 36 deletions(-)

diff --git a/benchmarks/dynamo/dist_util.py b/benchmarks/dynamo/dist_util.py
index 9e2f086ca8b70..d30b5a63cfe5f 100644
--- a/benchmarks/dynamo/dist_util.py
+++ b/benchmarks/dynamo/dist_util.py
@@ -20,6 +20,9 @@
 except ImportError:
     from torchbench import setup_torchbench_cwd
 
+from transformers.models.bert.modeling_bert import BertLayer, BertLMPredictionHead
+from transformers.models.t5.modeling_t5 import T5Block
+
 
 def setup(rank, world_size):
     os.environ["MASTER_ADDR"] = "localhost"
@@ -122,26 +125,25 @@ def check_fn(submodule):
     )
 
 
-# from transformers.models.t5.modeling_t5 import T5Block
-
 MODEL_FSDP_WRAP = {
-    ToyModel: (MyModule,)
-    # TODO T5: (T5Block,)
+    "toy_model": (MyModule,),
+    "hf_Bert": (BertLayer, BertLMPredictionHead),
+    "hf_T5": (T5Block,),
 }
 
 
-def apply_fsdp(model, use_checkpointing=False, use_wrap_policy=True):
-    blocks = MODEL_FSDP_WRAP[model.__class__]
-
+def apply_fsdp(args, model, use_checkpointing=False, use_wrap_policy=True):
     wrap_policy = None
+    blocks = MODEL_FSDP_WRAP[
+        "toy_model" if model.__class__ is ToyModel else args.torchbench_model
+    ]
     if use_wrap_policy:
         # transformer policy is really a generic policy that wraps modules of specified classes
         wrap_policy = functools.partial(
             transformer_auto_wrap_policy, transformer_layer_cls=blocks
         )
 
-    model = FSDP(model, auto_wrap_policy=wrap_policy)
+    model = FSDP(model, auto_wrap_policy=wrap_policy, use_orig_params=True)
     if use_checkpointing:
         fsdp_checkpointing_base(model, blocks)
-
     return model
diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py
index c2db15563348a..32e3b544d87dd 100644
--- a/benchmarks/dynamo/distributed.py
+++ b/benchmarks/dynamo/distributed.py
@@ -50,6 +50,7 @@ def move_tensor(maybe_tensor):
 
     if args.fsdp:
         model = apply_fsdp(
+            args,
             model,
             use_checkpointing=args.fsdp_checkpoint,
             use_wrap_policy=args.fsdp_wrap,
@@ -160,7 +161,9 @@ def experiment(fn, key, world_size, results):
     )
     args = parser.parse_args()
 
-    model_name = "ToyModel" if args.toy_model else args.torchbench_model
+    model_name = args.torchbench_model
+    if args.toy_model:
+        model_name = "ToyModel"
     model, inputs = get_model(args)
 
     fn = partial(run_model, args, model, inputs)
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 3dd3c5de77253..b6bc16edb941a 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -1,4 +1,6 @@
 # Owner(s): ["module: dynamo"]
+import copy
+import functools
 import logging
 import os
 import random
@@ -16,7 +18,9 @@
 from torch._dynamo.utils import same
 from torch._dynamo.testing import collect_results
 from torch._inductor.utils import has_triton
+from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
 from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.testing._internal.common_distributed import (
     MultiProcessTestCase,
     import_transformers_or_skip,
@@ -175,6 +179,7 @@ def test_ddp_baseline_aot_eager_multiprocess(self):
 
     @skip_if_lt_x_gpu(2)
     @import_transformers_or_skip()
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", True)
     @patch.object(torch._inductor.config, "fallback_random", True)
     def test_hf_bert_ddp(self):
@@ -199,6 +204,106 @@ def test_hf_bert_ddp(self):
             opt_results = collect_results(opt_model, opt_outputs.logits, opt_loss, inputs_flat)
             self.assertTrue(same(correct_results, opt_results))
 
+
+    @skip_if_lt_x_gpu(1)
+    # TODO(whc)  delete aot_eager test, if inductor test lands stably
+    def test_fsdp_aot_eager(self):
+        with _per_rank_init(self.rank, self.world_size):
+            # Test with basic FSDP wrapping (outer wrap around whole model)
+            m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
+            fsdp_m = FSDP(m, use_orig_params=True)
+            fsdp_m = torch._dynamo.optimize("aot_eager")(fsdp_m)
+            outputs = fsdp_m(inputs)
+            self.assertTrue(same(correct_outputs, outputs))
+
+            # Test with recursive wrapping, nested FSDP around each Linear
+            m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
+            fsdp_m = FSDP(
+                m,
+                auto_wrap_policy=functools.partial(
+                    transformer_auto_wrap_policy, transformer_layer_cls=(nn.Linear, )
+                ),
+                use_orig_params=True
+            )
+            fsdp_m = torch._dynamo.optimize("aot_eager")(fsdp_m)
+            outputs = fsdp_m(inputs)
+            self.assertTrue(same(correct_outputs, outputs))
+
+    @skip_if_lt_x_gpu(1)
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    def test_fsdp_inductor(self):
+        with _per_rank_init(self.rank, self.world_size):
+            # Test with basic FSDP wrapping (outer wrap around whole model)
+            m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
+            fsdp_m = FSDP(m, use_orig_params=True)
+            fsdp_m = torch._dynamo.optimize("inductor")(fsdp_m)
+            outputs = fsdp_m(inputs)
+            self.assertTrue(same(correct_outputs, outputs))
+
+            # Test with recursive wrapping, nested FSDP around each Linear
+            m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
+            fsdp_m = FSDP(
+                m,
+                auto_wrap_policy=functools.partial(
+                    transformer_auto_wrap_policy, transformer_layer_cls=(nn.Linear, )
+                ),
+                use_orig_params=True
+            )
+            fsdp_m = torch._dynamo.optimize("inductor")(fsdp_m)
+            outputs = fsdp_m(inputs)
+            self.assertTrue(same(correct_outputs, outputs))
+
+    @import_transformers_or_skip()
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert
+    @patch.object(torch._inductor.config.triton, "cudagraphs", False)
+    @patch.object(torch._inductor.config, "fallback_random", True)
+    def test_hf_bert_fsdp(self):
+        from transformers.models.bert.modeling_bert import BertLayer
+
+        def apply_fsdp(model, wrap_policy):
+            model = FSDP(
+                copy.deepcopy(model),
+                auto_wrap_policy=wrap_policy,
+                use_orig_params=True
+            )
+            return model
+
+        with _per_rank_init(self.rank, self.world_size):
+            for (wrap_policy, test_instance) in (
+                (
+                    None,
+                    "FSDP without recursive wrapping"
+                ),
+                (
+                    functools.partial(
+                        transformer_auto_wrap_policy, transformer_layer_cls=(BertLayer, )
+                    ),
+                    "FSDP with recursive wrapping BertLayer instances"
+                )
+            ):
+                print(f"Running hf_bert test for {test_instance}")
+                model, inputs = get_hf_bert(self.rank)
+                reset_rng_state()
+                eager_model = apply_fsdp(model, wrap_policy)
+                correct_outputs = eager_model(**inputs)
+                correct_loss = correct_outputs.loss
+                correct_loss.backward()
+
+                reset_rng_state()
+                opt_model = apply_fsdp(model, wrap_policy)
+
+                opt_model = torch._dynamo.optimize("inductor")(opt_model)
+                opt_outputs = opt_model(**inputs)
+                opt_loss = opt_outputs.loss
+                opt_loss.backward()
+
+                inputs_flat = [inputs[k] for k in inputs]
+                correct_results = collect_results(eager_model, correct_outputs.logits, correct_loss, inputs_flat)
+                opt_results = collect_results(opt_model, opt_outputs.logits, opt_loss, inputs_flat)
+                self.assertTrue(same(correct_results, opt_results))
+
+
 @requires_nccl()
 class TestDistributed(torch._dynamo.test_case.TestCase):
     """
@@ -257,32 +362,6 @@ def test_ddp_baseline_inductor(self):
         outputs = ddp_m(inputs)
         self.assertTrue(same(correct_outputs, outputs))
 
-    # TODO(whc) move these tests to 'distributed' shard to get nccl, or see if it's available already in pytorch CI?
-    @unittest.skip(
-        "can't run with gloo (no support for _allgather_base) and nccl not available in CI"
-    )
-    @patch.object(config, "optimize_ddp", False)
-    def test_fsdp_baseline_aot_eager(self):
-        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-
-        m, inputs, correct_outputs = self.get_model()
-        fsdp_m = FSDP(m, device_id=self.device_ids[0] if self.device_ids else None)
-        fsdp_m = torch._dynamo.optimize("aot_eager")(fsdp_m)
-        outputs = fsdp_m(inputs)
-        self.assertTrue(same(correct_outputs, outputs))
-
-    @unittest.skip("hangs/crashes with inductor currently")
-    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
-    @patch.object(config, "optimize_ddp", False)
-    def test_fsdp_baseline_inductor(self):
-        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-
-        m, inputs, correct_outputs = self.get_model()
-        fsdp_m = FSDP(m, device_id=self.device_ids[0] if self.device_ids else None)
-        fsdp_m = torch._dynamo.optimize("inductor")(fsdp_m)
-        outputs = fsdp_m(inputs)
-        self.assertTrue(same(correct_outputs, outputs))
-
     @patch.object(config, "optimize_ddp", True)
     def test_graph_split(self):
         """
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 0978f0875a28f..b790590c7943f 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -1306,6 +1306,8 @@ def _use_unsharded_views(self, as_params: bool) -> None:
                         assert tensor is not None  # mypy
                         param_var = tensor
                 setattr(module, param_name, param_var)
+                if self._use_orig_params and self._training_state == HandleTrainingState.FORWARD:
+                    module._parameters[param_name] = param_var  # type: ignore[assignment]
         for i, (
             param_name,
             module,
@@ -1336,6 +1338,8 @@ def _use_unsharded_views(self, as_params: bool) -> None:
                 module.register_parameter(param_name, prim_param)
             else:
                 setattr(module, param_name, prim_param)
+                if self._use_orig_params and self._training_state == HandleTrainingState.FORWARD:
+                    module._parameters[param_name] = prim_param  # type: ignore[assignment]
 
     def _use_unsharded_grad_views(self) -> None:
         """

From 05f927f22dc6de1b576aaf9747219ba1818d6700 Mon Sep 17 00:00:00 2001
From: Jiaxu Zhu <jiaxuzhu@meta.com>
Date: Sat, 12 Nov 2022 01:20:52 +0000
Subject: [PATCH 0820/1922] Support torch.qint32 in Convert (#88871)

Enable the `torch.qint32` when creating `quantize_per_tensor` function call in `convert_fx`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88871
Approved by: https://github.com/jerryzh168
---
 torch/ao/quantization/fx/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index 61bb2cdc1b034..a5a989ec21480 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -183,7 +183,7 @@ def get_quantize_node_info(
     if hasattr(activation_post_process, "compute_dtype"):
         compute_dtype = activation_post_process.compute_dtype  # type: ignore[attr-defined]
     quantize_op : Optional[Union[Callable, str]] = None
-    if dtype in [torch.quint8, torch.qint8] and \
+    if dtype in [torch.quint8, torch.qint8, torch.qint32] and \
             not hasattr(activation_post_process, 'compute_dtype'):
         node_type = "call_function"
         scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined]

From 497e144290fe7688dc6685ccf0d3317742219bca Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 11 Nov 2022 09:00:55 -0500
Subject: [PATCH 0821/1922] Remove incorrect assert about hermetic state.
 (#88885)

I'm not sure why I thought this assert was valid in the first
place, and there's no comment about it.

The assert is tantamount to saying, "no tensor objects should
become dead via SafePyObject when hermetic mode is on."  But
suppose we run a Python GC while we're inside hermetic mode.
This could result in us disposing non-hermetic tensors, which
would hit decref.  So the assert seems invalid.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88885
Approved by: https://github.com/anjali411, https://github.com/malfet
---
 torch/csrc/autograd/python_variable.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 920d0e7344b58..002b904d40721 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -305,10 +305,6 @@ void ConcretePyInterpreterVTable::decref(PyObject* pyobj, bool is_tensor)
   // THPVariable_clear).
   // 2. We are decref-ing some other Python object. We don't do
   // PyObject resurrection on non-Tensors, so we just carry on as usual
-  if (is_tensor) {
-    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-        !c10::impl::HermeticPyObjectTLS::get_state());
-  }
   if (is_tensor && Py_REFCNT(pyobj) > 1) {
     // It's still alive!  This can happen if a weak ref resurrected
     // the PyObject without flipping ownership.  At this point it is

From d8f8f29f0e9a329ad56a2edf98826bd3c446ebea Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 11 Nov 2022 09:33:41 -0500
Subject: [PATCH 0822/1922] Fix bug in OptionalTensorList (#88887)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88887
Approved by: https://github.com/anjali411
---
 aten/src/ATen/core/PythonFallbackKernel.cpp |  5 ++++-
 test/test_python_dispatch.py                | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp
index e16874a83f966..2d8834afe59ef 100644
--- a/aten/src/ATen/core/PythonFallbackKernel.cpp
+++ b/aten/src/ATen/core/PythonFallbackKernel.cpp
@@ -74,10 +74,13 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
         (*interpreter)->dispatch(op, stack);
         return;
       }
-    } else if (ivalue.isTensorList() || (ivalue.isOptionalTensorList() && !ivalue.isNone())) {
+    } else if (ivalue.isTensorList() || ivalue.isOptionalTensorList()) {
       // NB: use toListRef as it doesn't induce refcount bumps (toTensorListRef
       // is not a thing)
       for (const auto& nv : ivalue.toListRef()) {
+        if (nv.isNone()) {
+          continue;
+        }
         auto* interpreter = nv.unsafeToTensorImpl()->pyobj_interpreter();
         if (interpreter) {
           (*interpreter)->dispatch(op, stack);
diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py
index 380f85f568f72..33465217bbbc0 100644
--- a/test/test_python_dispatch.py
+++ b/test/test_python_dispatch.py
@@ -390,6 +390,24 @@ def test_produce_real_type(self) -> None:
 $4 = torch._ops.aten.select.int($3, 1, 1)
 $5 = torch._ops.aten.clone.default($4, memory_format=torch.contiguous_format)''')
 
+    def test_optional_tensor_list(self) -> None:
+        def weird(xs):
+            print("woof")
+            return torch.empty(())
+
+        my_lib = Library("my_lib", "DEF")
+        my_lib.define("weird(Tensor?[] self) -> Tensor")
+        my_lib.impl("weird", weird, "CPU")
+        with capture_logs() as logs:
+            x = LoggingTensor(torch.ones(2, 2))
+            log_input("x", x)
+            torch.ops.my_lib.weird.default([None, x])
+
+        self.assertExpectedInline('\n'.join(logs), '''\
+$0 = input('x')
+$1 = torch._ops.my_lib.weird.default([None, LoggingTensor(tensor([[1., 1.],
+        [1., 1.]]))])''')
+
     def test_list_ret(self) -> None:
         # test all sequence types are permissible returns
         for list_type in (list, tuple):

From 29c5a458454999221ed3811203c6428fe200738d Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Sat, 12 Nov 2022 02:23:48 +0000
Subject: [PATCH 0823/1922] fix fx tests (#88886)

Summary:
Some source files are missing and TPX couldn't handle the default test
names.

Test Plan: Rely on CI.

Differential Revision: D41218564

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88886
Approved by: https://github.com/zou3519
---
 test/fx/test_common_passes.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/test/fx/test_common_passes.py b/test/fx/test_common_passes.py
index 9c59abce4da61..407e707db8797 100644
--- a/test/fx/test_common_passes.py
+++ b/test/fx/test_common_passes.py
@@ -73,10 +73,15 @@ def MutationMetadata(x):
 if torch.cuda.is_available():
     Devices.append("cuda")
 
+
+def name_fn(common_pass, f, device):
+    """Names parameterized test cases."""
+    return f'{type(common_pass()).__name__}_{f.__name__}_{device}'
+
 @instantiate_parametrized_tests
 class TestCommonPass(TestCase):
 
-    @parametrize("common_pass,f,device", itertools.product(Passes, Test_Cases, Devices))
+    @parametrize("common_pass,f,device", itertools.product(Passes, Test_Cases, Devices), name_fn)
     def test_correctness(self, common_pass, f, device):
         inp = torch.randn(10, device=device)
 
@@ -94,7 +99,7 @@ def test_correctness(self, common_pass, f, device):
         self.assertEqual(result, expected)
 
 
-    @parametrize("common_pass,f,device", itertools.product(Passes, Factory_Test_Cases, Devices))
+    @parametrize("common_pass,f,device", itertools.product(Passes, Factory_Test_Cases, Devices), name_fn)
     def test_correctness_factory(self, common_pass, f, device):
         inp = torch.randn(10, device=device)
         traced_m = make_fx(f)(inp, device)

From 78ef88302bc26f065ff5ef65e264e1ae2c605d29 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Fri, 11 Nov 2022 20:41:36 +0000
Subject: [PATCH 0824/1922] Exclude poolformer_m36 from the inductor model test
 (#88908)

Summary: The root cause is still to be investigated. Issue tracked at
https://github.com/pytorch/torchdynamo/issues/1856

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88908
Approved by: https://github.com/malfet
---
 benchmarks/dynamo/common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 758f4396b5b1b..198877e0313d8 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -156,6 +156,7 @@
     "hrnet_w18",  # accuracy
     "lcnet_0500",  # accuracy
     "levit_128",  # levit_128
+    "poolformer_m36",
     "rexnet_100",  # accuracy
     "swin_base_patch4_window7_224",
     "twins_pcpvt_base",  # time out

From fd3e7546caf8ec7bf51fa95e6126bb89b7ae2da0 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Thu, 10 Nov 2022 21:19:22 +0000
Subject: [PATCH 0825/1922] [FSDP][state_dict][6/N] Remove most FSDP module
 dependency from _optim_utils (#88638)

**What**
This PR removes most `FullyShardedDataParallel` dependencies from `optim_utils`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88638
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/_optim_utils.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index 530a8480d5522..70fb4156d5378 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -22,9 +22,11 @@
 import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
 import torch.nn as nn
 from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed.fsdp._common_utils import _get_param_to_fqns
 from torch.distributed.fsdp._fsdp_extensions import _ext_chunk_tensor
-from torch.distributed.fsdp._runtime_utils import _clear_grads_if_needed
+from torch.distributed.fsdp._runtime_utils import _clear_grads_if_needed, _lazy_init
 from torch.distributed.fsdp._shard_utils import _gather_state_dict
+from torch.distributed.fsdp.api import ShardingStrategy
 from torch.distributed.fsdp.flat_param import FlatParameter, FlatParamHandle
 
 
@@ -185,7 +187,7 @@ def _communicate_optim_state(
             # we take the target rank's value
             if (
                 fsdp_module.world_size == 1
-                or fsdp_module.sharding_strategy == fsdp_file.ShardingStrategy.NO_SHARD
+                or fsdp_module.sharding_strategy == ShardingStrategy.NO_SHARD
             ):
                 tensor_state[state_name] = value
                 continue
@@ -293,7 +295,7 @@ def _flatten_optim_state_dict(
             '"param_groups" to be a valid optimizer state dict'
         )
     flat_param_to_fsdp_module = _get_flat_param_to_fsdp_module(model)
-    param_to_fqns = fsdp_file._get_param_to_fqns(model)
+    param_to_fqns = _get_param_to_fqns(model)
 
     # Construct the "state" part
     flat_osd_state: Dict[_OptimStateKey, Any] = {}
@@ -897,7 +899,7 @@ def _rekey_sharded_optim_state_dict(
         if using_optim_input
         else _get_param_to_param_id(optim)
     )
-    param_to_fqns = fsdp_file._get_param_to_fqns(model)
+    param_to_fqns = _get_param_to_fqns(model)
     # All parameter keys in `param_to_flat_param_id` should be in
     # `param_to_fqns` -- strict inequality follows when not all parameters are
     # passed to the optimizer
@@ -951,7 +953,7 @@ def _get_flat_param_to_fsdp_module(model: torch.nn.Module):
     flat_param_to_fsdp_module = {}
     for module in model.modules():
         if isinstance(module, fsdp_file.FullyShardedDataParallel):
-            fsdp_file._lazy_init(module, module)
+            _lazy_init(module, module)
             for param in module.params:  # may have none
                 flat_param_to_fsdp_module[param] = module
     return flat_param_to_fsdp_module
@@ -1165,9 +1167,7 @@ def _optim_state_dict(
 
     # Construct the local mapping between unflattened parameter names
     # (`_OptimStateKey`s) and parameter IDs and broadcast rank 0's mapping
-    param_to_fqns: Dict[torch.nn.Parameter, List[str]] = fsdp_file._get_param_to_fqns(
-        model
-    )
+    param_to_fqns: Dict[torch.nn.Parameter, List[str]] = _get_param_to_fqns(model)
     flat_param_id_to_param: List[torch.nn.Parameter] = (
         _get_param_id_to_param_from_optim_input(model, optim_input)
         if using_optim_input

From f90a62e71a79240340acb36c54ddebddea8a3dce Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sat, 12 Nov 2022 03:21:06 +0000
Subject: [PATCH 0826/1922] [vision hash update] update the pinned vision hash
 (#88920)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88920
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 48685938a146b..b9eda365de0c5 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-d72e90640ec8514e0369b5419d7f3b74a387b1d7
+deba056203d009fec6b58afb9fa211f6ee3328c8

From f92f1c72731ac691150b418155cfe70cef624b59 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Sat, 12 Nov 2022 01:05:46 +0000
Subject: [PATCH 0827/1922] [FSDP] Introduce `ModuleWrapPolicy` for simplicity
 (#88450)

**BC Breaking Change**
This renames `unwrapped_params` to `nonwrapped_numel`. I prefer `nonwrapped` over `unwrapped` because "unwrap"  suggests that some wrapping has been undone. I prefer `numel` over `params` because that is unit of measurement; I think we should keep "params" to refer to `nn.Parameter`s themselves.

This only breaks anything that passes `unwrapped_params` as a keyword argument, but I did not see anything that did that (except the one internal benchmark file but that does not actually depend on our `pytorch` code).

In a follow-up, I want to rename `min_num_params` to `min_nonwrapped_numel` in `size_based_auto_wrap_policy`, which is also BC breaking. Again, this is to differentiate between "params" being `nn.Parameter`s and "numel" being the unit for `param.numel()`.

**Overview**
This PR introduces `ModuleWrapPolicy` as a lightweight layer over the existing `transformer_auto_wrap_policy`. The most common auto wrapping paradigm is:
```
module_classes: Set[Type[nn.Module]] = ...
auto_wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls=module_classes,
)
fsdp_model = FSDP(model, auto_wrap_policy=auto_wrap_policy, ...)
```
Now, users can instead write:
```
auto_wrap_policy = ModuleWrapPolicy(module_classes)
fsdp_model = FSDP(model, auto_wrap_policy=auto_wrap_policy, ...)
```
This hides the unused arguments expected from the callable (`recurse` and `unwrapped_params`/`nonwrapped_numel`).

`ModuleWrapPolicy` inherits from an abstract base class `FSDPPolicy` that expects a `policy` property. This decouples the construct of such `FSDPPolicy` classes and their actual `policy`, which must abide by the `_recursive_wrap` interface. Any existing auto wrap policy can be rewritten as a class that inherits from `FSDPPolicy`, so this approach is fully backward compatible from a functionality perspective.

I call this base class `FSDPPolicy` to generalize over the cases where we may not want to actually perform any nested wrapping. In reality, the policy is meant for constructing `FlatParameter`s, which just happened to be induced by a nested wrapping before. Given this, I am changing the constructor argument in `fully_shard()` to simply `policy` instead of `auto_wrap_policy`.

This PR migrates usages of `transformer_auto_wrap_policy` within our unit test suite to `ModuleWrapPolicy` as much as possible.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88450
Approved by: https://github.com/zhaojuanmao
---
 .../_composable/test_fully_shard.py           |  27 +-
 .../fsdp/test_fsdp_clip_grad_norm.py          |  10 +-
 test/distributed/fsdp/test_fsdp_misc.py       |  22 +-
 test/distributed/fsdp/test_fsdp_state_dict.py |  12 +-
 .../fsdp/test_fsdp_use_orig_params.py         |   9 +-
 test/distributed/fsdp/test_utils.py           |   7 +-
 test/distributed/fsdp/test_wrap.py            |  16 +
 torch/distributed/_composable/fully_shard.py  |   8 +-
 torch/distributed/fsdp/__init__.py            |   1 -
 torch/distributed/fsdp/_init_utils.py         |   5 +-
 torch/distributed/fsdp/_wrap_utils.py         |  17 +-
 torch/distributed/fsdp/flat_param.py          |   3 +-
 .../fsdp/fully_sharded_data_parallel.py       | 155 ++--------
 torch/distributed/fsdp/wrap.py                | 288 ++++++++----------
 torch/testing/_internal/common_fsdp.py        |  20 +-
 15 files changed, 244 insertions(+), 356 deletions(-)

diff --git a/test/distributed/_composable/test_fully_shard.py b/test/distributed/_composable/test_fully_shard.py
index 27e0fb855fba7..ba08deeafcdfb 100644
--- a/test/distributed/_composable/test_fully_shard.py
+++ b/test/distributed/_composable/test_fully_shard.py
@@ -1,7 +1,6 @@
 # Owner(s): ["oncall: distributed"]
 
 import copy
-import functools
 import sys
 from typing import Any, Tuple
 
@@ -12,7 +11,7 @@
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp._common_utils import _is_fsdp_flattened
 from torch.distributed.fsdp._runtime_utils import _root_pre_forward
-from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
@@ -62,10 +61,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return z
 
     @staticmethod
-    def auto_wrap_policy():
-        return functools.partial(
-            transformer_auto_wrap_policy, transformer_layer_cls={SubModel}
-        )
+    def policy():
+        return ModuleWrapPolicy({SubModel})
 
     def get_input(self, device=torch.device) -> Tuple[Any, ...]:
         return (torch.randn((8, 5), device=device),)
@@ -85,13 +82,13 @@ def test_auto_wrap_policy(self):
         local_model = Model(device=torch.device("cuda"))
         fsdp_wrapped_model = FSDP(
             copy.deepcopy(local_model),
-            auto_wrap_policy=Model.auto_wrap_policy(),
+            auto_wrap_policy=Model.policy(),
             use_orig_params=True,
         )
         composable_module = copy.deepcopy(local_model)
         fully_shard(
             composable_module,
-            auto_wrap_policy=Model.auto_wrap_policy(),
+            policy=Model.policy(),
         )
 
         # Check that the composable module has the same names as the local
@@ -138,7 +135,7 @@ def test_device_id(self):
             assert param.device == cpu_device
         fully_shard(
             composable_module,
-            auto_wrap_policy=Model.auto_wrap_policy(),
+            policy=Model.policy(),
             device_id=self.rank,
         )
         for param in composable_module.parameters():
@@ -157,12 +154,12 @@ def test_sync_module_states(self):
                     param.zero_()
         fsdp_wrapped_model = FSDP(
             copy.deepcopy(local_model),
-            auto_wrap_policy=Model.auto_wrap_policy(),
+            auto_wrap_policy=Model.policy(),
             use_orig_params=True,
         )
         fully_shard(
             composable_module,
-            auto_wrap_policy=Model.auto_wrap_policy(),
+            policy=Model.policy(),
             sync_module_states=True,
         )
         for (composable_param, fsdp_wrapped_param) in zip(
@@ -197,13 +194,13 @@ def _param_init_fn(module: nn.Module):
         composable_module = Model(device="meta")
         fsdp_wrapped_model = FSDP(
             Model(device="meta"),
-            auto_wrap_policy=Model.auto_wrap_policy(),
+            auto_wrap_policy=Model.policy(),
             param_init_fn=_param_init_fn,
             use_orig_params=True,
         )
         fully_shard(
             composable_module,
-            auto_wrap_policy=Model.auto_wrap_policy(),
+            policy=Model.policy(),
             param_init_fn=_param_init_fn,
         )
         for (composable_param, fsdp_wrapped_param) in zip(
@@ -227,13 +224,13 @@ def test_training(self):
         local_model = Model(device=device)
         fsdp_wrapped_model = FSDP(
             copy.deepcopy(local_model),
-            auto_wrap_policy=Model.auto_wrap_policy(),
+            auto_wrap_policy=Model.policy(),
             use_orig_params=True,
         )
         composable_module = copy.deepcopy(local_model)
         fully_shard(
             composable_module,
-            auto_wrap_policy=Model.auto_wrap_policy(),
+            policy=Model.policy(),
         )
         del local_model  # not needed anymore
         LR = 1e-2
diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
index ddba50a9e4561..e587065c5c77f 100644
--- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
+++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -1,6 +1,5 @@
 # Owner(s): ["oncall: distributed"]
 
-import functools
 import itertools
 import sys
 from typing import Union
@@ -12,7 +11,7 @@
     CPUOffload,
     FullyShardedDataParallel as FSDP,
 )
-from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
@@ -102,12 +101,11 @@ def _test_ddp_parity(
         )
         ddp_model = DDP(local_model, device_ids=[self.rank])
         fsdp_kwargs = {
-            "auto_wrap_policy": functools.partial(
-                transformer_auto_wrap_policy,
-                transformer_layer_cls={
+            "auto_wrap_policy": ModuleWrapPolicy(
+                {
                     TransformerEncoderLayer,
                     TransformerDecoderLayer,
-                },
+                }
             ),
             "cpu_offload": CPUOffload(offload_params=offload_params),
             "use_orig_params": use_orig_params,
diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index 79ed6da6240fa..8c972f8515634 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -15,7 +15,11 @@
     FullyShardedDataParallel as FSDP,
     ShardingStrategy,
 )
-from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy
+from torch.distributed.fsdp.wrap import (
+    always_wrap_policy,
+    ModuleWrapPolicy,
+    transformer_auto_wrap_policy,
+)
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
@@ -211,10 +215,20 @@ def forward(self, x, y):
     def test_device_id_auto_wrap(self):
         """Tests that ``auto_wrap_policy`` propagates ``device_id`` to all
         nested FSDP instances."""
-        auto_wrap_policy = functools.partial(
-            transformer_auto_wrap_policy,
-            transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer},
+        self.run_subtests(
+            {"use_callable": [False, True]},
+            self._test_device_id_auto_wrap,
         )
+
+    def _test_device_id_auto_wrap(self, use_callable: bool):
+        module_classes = {TransformerEncoderLayer, TransformerDecoderLayer}
+        if use_callable:
+            auto_wrap_policy = functools.partial(
+                transformer_auto_wrap_policy,
+                transformer_layer_cls=module_classes,
+            )
+        else:
+            auto_wrap_policy = ModuleWrapPolicy(module_classes)
         fsdp_kwargs = {
             "auto_wrap_policy": auto_wrap_policy,
             "device_id": torch.cuda.current_device(),
diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py
index ba51ae66ed1b2..6fafc8e8fdf4a 100644
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@@ -26,7 +26,7 @@
 )
 from torch.distributed.fsdp._shard_utils import _gather_state_dict
 from torch.distributed.fsdp._unshard_param_utils import FLAT_PARAM
-from torch.distributed.fsdp.wrap import enable_wrap, transformer_auto_wrap_policy, wrap
+from torch.distributed.fsdp.wrap import enable_wrap, ModuleWrapPolicy, wrap
 from torch.nn import Linear, Module, TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel import DistributedDataParallel
 from torch.optim import SGD
@@ -350,9 +350,8 @@ def test_state_dict_with_manual_ac_wrapper(
     @skip_if_lt_x_gpu(2)
     @parametrize("state_dict_type", _SUPPORTED_STATE_DICT_IMPLS)
     def test_state_dict_with_shared_parameters(self, state_dict_type):
-        auto_wrap_policy = partial(
-            transformer_auto_wrap_policy,
-            transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer},
+        auto_wrap_policy = ModuleWrapPolicy(
+            {TransformerEncoderLayer, TransformerDecoderLayer}
         )
         model_creator = partial(
             TransformerWithSharedParams.init,
@@ -377,9 +376,8 @@ def test_state_dict_rank0_offload_save_load_flow(self, use_orig_params: bool):
         """Tests saving a model checkpoint only on rank 0 and loading it only
         on rank 0 with ``sync_module_states=True`` to emulate the workflow to
         avoid redundant CPU memory usage."""
-        auto_wrap_policy = partial(
-            transformer_auto_wrap_policy,
-            transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer},
+        auto_wrap_policy = ModuleWrapPolicy(
+            {TransformerEncoderLayer, TransformerDecoderLayer}
         )
         fsdp_kwargs = {
             "auto_wrap_policy": auto_wrap_policy,
diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py
index 24829ff408d9b..0f5ffa564c2d4 100644
--- a/test/distributed/fsdp/test_fsdp_use_orig_params.py
+++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py
@@ -15,7 +15,7 @@
     ShardingStrategy,
 )
 from torch.distributed.fsdp._common_utils import clean_tensor_name
-from torch.distributed.fsdp.wrap import always_wrap_policy, transformer_auto_wrap_policy
+from torch.distributed.fsdp.wrap import always_wrap_policy, ModuleWrapPolicy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
@@ -117,12 +117,11 @@ def _get_fsdp_transformer_and_optim(
         # combination with the parameter group construction, ensures different
         # hyperparameter settings within one `FlatParameter`
         fsdp_kwargs = {
-            "auto_wrap_policy": functools.partial(
-                transformer_auto_wrap_policy,
-                transformer_layer_cls={
+            "auto_wrap_policy": ModuleWrapPolicy(
+                {
                     TransformerEncoderLayer,
                     TransformerDecoderLayer,
-                },
+                }
             ),
             "use_orig_params": True,
             "sharding_strategy": sharding_strategy,
diff --git a/test/distributed/fsdp/test_utils.py b/test/distributed/fsdp/test_utils.py
index e797325ccbc99..37c52547e8472 100644
--- a/test/distributed/fsdp/test_utils.py
+++ b/test/distributed/fsdp/test_utils.py
@@ -1,6 +1,5 @@
 # Owner(s): ["oncall: distributed"]
 
-import functools
 import random
 import sys
 import unittest
@@ -14,7 +13,7 @@
 from torch import distributed as dist
 from torch.distributed.fsdp._utils import _apply_to_tensors
 from torch.distributed.fsdp._wrap_utils import _get_submodule_to_states
-from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.distributed.utils import _replace_by_prefix
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
@@ -173,9 +172,7 @@ def test_module_wrap_policy(self):
         # Compute the mapping from submodule to states according to a logical
         # module wrap policy
         module_classes = (nn.Sequential,)
-        auto_wrap_policy = functools.partial(
-            transformer_auto_wrap_policy, transformer_layer_cls=set(module_classes)
-        )
+        auto_wrap_policy = ModuleWrapPolicy(set(module_classes))
         submodule_to_states = _get_submodule_to_states(
             model, auto_wrap_policy, set(), set()
         )
diff --git a/test/distributed/fsdp/test_wrap.py b/test/distributed/fsdp/test_wrap.py
index cd0d11ba9b4b1..e157f041ae1bd 100644
--- a/test/distributed/fsdp/test_wrap.py
+++ b/test/distributed/fsdp/test_wrap.py
@@ -5,6 +5,7 @@
 import tempfile
 import unittest
 from enum import auto, Enum
+from typing import Callable, Union
 
 import torch
 import torch.nn as nn
@@ -15,10 +16,12 @@
     FullyShardedDataParallel as FSDP,
 )
 from torch.distributed.fsdp.wrap import (
+    _FSDPPolicy,
     _or_policy,
     _wrap_batchnorm_individually,
     always_wrap_policy,
     enable_wrap,
+    ModuleWrapPolicy,
     size_based_auto_wrap_policy,
     transformer_auto_wrap_policy,
     wrap,
@@ -373,6 +376,19 @@ def test_transformer_auto_wrap_policy(self):
             transformer_auto_wrap_policy,
             transformer_layer_cls={TransformerEncoderLayer, TransformerDecoderLayer},
         )
+        self._test_transformer_wrapping(auto_wrap_policy)
+
+    @unittest.skipIf(torch.cuda.device_count() < 2, "Requires at least 2 GPUs")
+    def test_module_wrap_policy(self):
+        """Tests the ``ModuleWrapPolicy``."""
+        auto_wrap_policy = ModuleWrapPolicy(
+            {TransformerEncoderLayer, TransformerDecoderLayer}
+        )
+        self._test_transformer_wrapping(auto_wrap_policy)
+
+    def _test_transformer_wrapping(
+        self, auto_wrap_policy: Union[Callable, _FSDPPolicy]
+    ):
         fsdp_kwargs = {"auto_wrap_policy": auto_wrap_policy}
         fsdp_model = TransformerWithSharedParams.init(
             self.process_group,
diff --git a/torch/distributed/_composable/fully_shard.py b/torch/distributed/_composable/fully_shard.py
index 2d9e9329795bd..174b2ca89a788 100644
--- a/torch/distributed/_composable/fully_shard.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -24,6 +24,7 @@
     MixedPrecision,
     ShardingStrategy,
 )
+from torch.distributed.fsdp.wrap import _FSDPPolicy
 
 
 @contract
@@ -32,7 +33,7 @@ def fully_shard(
     process_group: Optional[dist.ProcessGroup] = None,
     mixed_precision: Optional[MixedPrecision] = None,
     cpu_offload: Optional[CPUOffload] = None,
-    auto_wrap_policy: Optional[Callable] = None,
+    policy: Optional[_FSDPPolicy] = None,
     ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
     device_id: Optional[Union[int, torch.device]] = None,
     param_init_fn: Optional[Callable[[nn.Module], None]] = None,
@@ -41,6 +42,9 @@ def fully_shard(
     """
     Applies ``FullyShardedDataParallel` (FSDP) semantics to ``module``.
     """
+    # Enforce the new auto wrap policy
+    if policy is not None and not isinstance(policy, _FSDPPolicy):
+        raise ValueError(f"Expects an `_FSDPPolicy` but got {policy}")
     state = fully_shard.state(module)
     state = _init_ignored_module_states(state, module, ignored_modules)
     state = _init_process_group_state(state, process_group)
@@ -64,7 +68,7 @@ def fully_shard(
     state = _init_param_handles_from_module(
         state,
         module,
-        auto_wrap_policy,
+        policy,
         device_id,
         param_init_fn,
         sync_module_states,
diff --git a/torch/distributed/fsdp/__init__.py b/torch/distributed/fsdp/__init__.py
index 324a3442dea95..b1bffdb25a0eb 100644
--- a/torch/distributed/fsdp/__init__.py
+++ b/torch/distributed/fsdp/__init__.py
@@ -11,4 +11,3 @@
     ShardingStrategy,
     StateDictType,
 )
-from .wrap import ParamExecOrderWrapPolicy
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 1265ee3578ed4..7e128251fcc49 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -47,6 +47,7 @@
     HandleConfig,
     HandleShardingStrategy,
 )
+from torch.distributed.fsdp.wrap import _FSDPPolicy
 from torch.distributed.utils import _sync_params_and_buffers
 from torch.utils.hooks import RemovableHandle
 
@@ -262,7 +263,7 @@ def _init_param_handle_from_module(
 def _init_param_handles_from_module(
     state: _FSDPState,
     root_module: nn.Module,
-    auto_wrap_policy: Callable,
+    policy: _FSDPPolicy,
     device_id: Optional[Union[int, torch.device]],
     param_init_fn: Optional[Callable[[nn.Module], None]],
     sync_module_states: bool,
@@ -273,7 +274,7 @@ def _init_param_handles_from_module(
     """
     submodule_to_states = _get_submodule_to_states(
         root_module,
-        auto_wrap_policy,
+        policy,
         state._ignored_modules,
         state._ignored_params,
     )
diff --git a/torch/distributed/fsdp/_wrap_utils.py b/torch/distributed/fsdp/_wrap_utils.py
index 34d1c9c1ac243..cdda065df1993 100644
--- a/torch/distributed/fsdp/_wrap_utils.py
+++ b/torch/distributed/fsdp/_wrap_utils.py
@@ -1,7 +1,7 @@
 import collections
 import functools
 import warnings
-from typing import Any, Callable, Deque, Dict, List, NamedTuple, Set, Tuple
+from typing import Any, Deque, Dict, List, NamedTuple, Set, Tuple
 
 import torch
 import torch.nn as nn
@@ -10,6 +10,7 @@
     _override_batchnorm_mixed_precision,
 )
 from torch.distributed.fsdp.wrap import (
+    _FSDPPolicy,
     _or_policy,
     _recursive_wrap,
     _wrap_batchnorm_individually,
@@ -45,6 +46,9 @@ def _auto_wrap(
     ``fsdp_kwargs`` contains all FSDP arguments except ``module``.
     """
     auto_wrap_policy = auto_wrap_kwargs["auto_wrap_policy"]
+    # Support new way to pass an auto wrap policy
+    if isinstance(auto_wrap_policy, _FSDPPolicy):
+        auto_wrap_policy = auto_wrap_policy.policy
     root_module = auto_wrap_kwargs["module"]
     assert auto_wrap_policy is not None
     # For auto wrapping, submodules should not already be wrapped with FSDP
@@ -68,13 +72,13 @@ def _auto_wrap(
             "instances with mixed precision disabled since some batch norm "
             "kernels do not support low precision."
         )
-        auto_wrap_kwargs["auto_wrap_policy"] = auto_wrap_policy
+    auto_wrap_kwargs["auto_wrap_policy"] = auto_wrap_policy
     _recursive_wrap(**auto_wrap_kwargs, **fsdp_kwargs)
 
 
 def _get_submodule_to_states(
     root_module: nn.Module,
-    auto_wrap_policy: Callable,
+    auto_wrap_policy: _FSDPPolicy,
     ignored_modules: Set[nn.Module],
     ignored_params: Set[nn.Parameter],
 ) -> Dict[nn.Module, SubmoduleState]:
@@ -99,7 +103,7 @@ def _get_submodule_to_states(
     wrapper_cls = functools.partial(_record_module_wrapper_cls, wrapped_modules)
     _recursive_wrap(
         root_module,
-        auto_wrap_policy=auto_wrap_policy,
+        auto_wrap_policy=auto_wrap_policy.policy,
         wrapper_cls=wrapper_cls,
         ignored_modules=ignored_modules,
         ignored_params=ignored_params,
@@ -158,8 +162,9 @@ def _record_module_wrapper_cls(
     **kwargs,
 ) -> nn.Module:
     """
-    This defines a wrapper class to be passed to ``_recursive_wrap()`` that
-    records the wrapped module to the input ``wrapped_modules``.
+    This defines a pseudo-wrapper class to be passed to ``_recursive_wrap()``
+    that records the wrapped module to the input ``wrapped_modules`` without
+    actually wrapping with a class.
     """
     wrapped_modules.append(module)
     return module
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index b790590c7943f..b5892bca683a2 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -838,7 +838,8 @@ def needs_unshard(self) -> bool:
             return False
         unsharded_flat_param = self._get_padded_unsharded_flat_param()
         already_unsharded = (
-            unsharded_flat_param._typed_storage()._size() == unsharded_flat_param.numel()
+            unsharded_flat_param._typed_storage()._size()
+            == unsharded_flat_param.numel()
         )
         return not already_unsharded
 
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 510f90de20234..69c8dd92ed8dc 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -96,14 +96,6 @@
 )
 from ._utils import p_assert
 from .flat_param import FlatParameter, FlatParamHandle
-from .wrap import ParamExecOrderWrapPolicy
-
-
-_TORCH_FX_AVAIL = True
-if not hasattr(torch, "fx"):
-    _TORCH_FX_AVAIL = False
-if _TORCH_FX_AVAIL:
-    from ._symbolic_trace import _init_execution_info, _patch_tracer, TracingConfig
 
 
 __all__ = [
@@ -207,37 +199,36 @@ class FullyShardedDataParallel(nn.Module):
             This configures CPU offloading. If this is set to ``None``, then
             no CPU offloading happens. See :class:`CPUOffload` for details.
             (Default: ``None``)
-        auto_wrap_policy (Optional[Callable[[nn.Module, bool, int], bool]]):
-            A callable specifying a policy to recursively wrap layers with FSDP.
-            Note that this policy currently will only apply to child modules of
-            the passed in module. The remainder modules are always wrapped in
-            the returned FSDP root instance.
-            ``size_based_auto_wrap_policy`` written in ``torch.distributed.fsdp.wrap`` is
-            an example of ``auto_wrap_policy`` callable, this policy wraps layers
-            with the number of parameters larger than 100M. ``transformer_auto_wrap_policy``
-            written in ``torch.distributed.fsdp.wrap`` is an example of ``auto_wrap_policy``
-            callable for transformer-like model architectures. Users can supply the customized
-            ``auto_wrap_policy`` callable that should accept following arguments:
-            ``module: nn.Module``, ``recurse: bool``, ``unwrapped_params: int``, and return
-            a ``bool`` specifying whether the passed in ``module``` should be wrapped
-            (if ``recurse=False``) or whether we should recurse down the subgraph of ``module``
-            children (if ``recurse=True``). Extra customized arguments could be added to
-            the customized ``auto_wrap_policy`` callable as well. It is a good practice to
-            print out the sharded model and check whether the sharded model is what
-            the application wants and then adjust accordingly.
+        auto_wrap_policy (Optional[Union[Callable[[nn.Module, bool, int], bool], _FSDPPolicy]]):
+            This is either ``None``, an ``_FSDPPolicy``, or a callable of
+            a fixed signature. If it is ``None``, then ``module`` is wrapped
+            with only a top-level FSDP instance without any nested wrapping. If
+            it is an ``_FSDPPolicy``, then the wrapping follows the given
+            policy. ``ModuleWrapPolicy`` in ``torch.distributed.fsdp.wrap.py``
+            is an example. If it is a callable, then it should take in three
+            arguments ``module: nn.Module``, ``recurse: bool``, and
+            ``nonwrapped_numel: int`` and should return a ``bool`` specifying
+            whether the passed-in ``module`` should be wrapped if
+            ``recurse=False`` or if the traversal should continue down the
+            subtree if ``recurse=True``. Additional custom arguments may be
+            added to the callable. The ``size_based_auto_wrap_policy`` in
+            ``torch.distributed.fsdp.wrap.py`` gives an example callable that
+            wraps a module if the parameters in its subtree exceed 100M numel.
+            A good practice is to print the model after wrapping and adjust as
+            needed.
 
             Example::
 
                 >>> def custom_auto_wrap_policy(
                 >>>     module: nn.Module,
                 >>>     recurse: bool,
-                >>>     unwrapped_params: int,
-                >>>     # These are customizable for this policy function.
+                >>>     nonwrapped_numel: int,
+                >>>     # Additional custom arguments
                 >>>     min_num_params: int = int(1e8),
                 >>> ) -> bool:
-                >>>     return unwrapped_params >= min_num_params
-                >>> # Configure a custom min_num_params
-                >>> my_auto_wrap_policy = functools.partial(custom_auto_wrap_policy, min_num_params=1e5)
+                >>>     return nonwrapped_numel >= min_num_params
+                >>> # Configure a custom `min_num_params`
+                >>> my_auto_wrap_policy = functools.partial(custom_auto_wrap_policy, min_num_params=int(1e5))
 
         backward_prefetch (Optional[BackwardPrefetch]):
             This configures explicit backward prefetching of all-gathers. See
@@ -337,25 +328,6 @@ def __init__(
         limit_all_gathers: bool = False,
         use_orig_params: bool = False,
     ):
-        if isinstance(auto_wrap_policy, ParamExecOrderWrapPolicy):
-            self._init_param_exec_order_wrap_policy(
-                module=module,
-                process_group=process_group,
-                sharding_strategy=sharding_strategy,
-                cpu_offload=cpu_offload,
-                auto_wrap_policy=auto_wrap_policy,
-                backward_prefetch=backward_prefetch,
-                mixed_precision=mixed_precision,
-                ignored_modules=ignored_modules,
-                param_init_fn=param_init_fn,
-                device_id=device_id,
-                sync_module_states=sync_module_states,
-                forward_prefetch=forward_prefetch,
-                limit_all_gathers=limit_all_gathers,
-                use_orig_params=use_orig_params,
-            )
-            return
-
         torch._C._log_api_usage_once("torch.distributed.fsdp")
         super().__init__()
 
@@ -1815,89 +1787,6 @@ def register_comm_hook(self, state: object, hook: callable):
             submodule._communication_hook_state = state
             submodule._communication_hook = hook
 
-    def _init_param_exec_order_wrap_policy(self, *args, **kwargs) -> None:
-        auto_wrap_policy = kwargs["auto_wrap_policy"]
-        module = kwargs["module"]
-        assert hasattr(auto_wrap_policy, "tracing_config")
-        if not _TORCH_FX_AVAIL:
-            assert (
-                auto_wrap_policy.tracing_config is None
-            ), "tracing_config should be None when torch.fx is not enabled"
-        elif isinstance(auto_wrap_policy.tracing_config, TracingConfig):
-            tracer = auto_wrap_policy.tracing_config.tracer
-            execution_info = _init_execution_info(module)
-
-            for m in module.modules():
-                assert not isinstance(
-                    m, FullyShardedDataParallel
-                ), "The input module of _patch_tracer should not contain FSDP modules"
-
-            with _patch_tracer(
-                tracer=tracer,
-                root_module=module,
-                execution_info=execution_info,
-            ):
-                try:
-                    tracer.trace(module, auto_wrap_policy.tracing_config.concrete_args)
-                except BaseException as e:
-                    raise RuntimeError(
-                        "tracer.trace failed inside _init_param_exec_order_wrap_policy"
-                        f" with the error: {e}."
-                    )
-        else:
-            assert (
-                auto_wrap_policy.tracing_config is None
-            ), "tracing_config should either be an instance of TracingConfig or be None"
-        # The initial FSDP wrapping is done with auto_wrap_policy.init_policy
-        kwargs["auto_wrap_policy"] = auto_wrap_policy.init_policy
-        self.__init__(*args, **kwargs)
-        self._param_exec_order_policy: bool = True
-        # self._param_exec_order_prep_stage is set to True before we get the execution order
-        self._param_exec_order_prep_stage: bool = True
-        # A list that stores the flatten parameters and its name based on the parameter execution order
-        self._fsdp_params_exec_order: List[FlatParameter] = []
-        if _TORCH_FX_AVAIL and isinstance(
-            auto_wrap_policy.tracing_config, TracingConfig
-        ):
-            # Initialize a dict that maps each module to its parent FSDP wrap
-            module_to_fsdp: Dict[nn.Module, FullyShardedDataParallel] = dict()
-            for wrap in self.fsdp_modules(self):
-                module_to_fsdp[wrap.module] = wrap
-            # Set self._fsdp_params_exec_order based on execution_info.module_forward_order.
-            # TODO (linjianma): self._fsdp_params_exec_order will be set based on
-            # the parameter execution order rather than module_forward_order,
-            # once the non-recursive wrapping policy is fully implemented.
-            for m in execution_info.module_forward_order:
-                if m in module_to_fsdp:
-                    for flat_param in module_to_fsdp[m].params:
-                        self._fsdp_params_exec_order.append(flat_param)
-            self._param_exec_order_prep_stage = False
-
-        for m in self.modules():
-            if m is not self and isinstance(m, FullyShardedDataParallel):
-                # Assignment by reference, so each children FSDP wrap has access to
-                # the _fsdp_params_exec_order of the root module
-                m._fsdp_params_exec_order = self._fsdp_params_exec_order
-                m._param_exec_order_policy = self._param_exec_order_policy
-                m._param_exec_order_prep_stage = self._param_exec_order_prep_stage
-
-    def _use_param_exec_order_policy(self) -> bool:
-        return (
-            hasattr(self, "_param_exec_order_policy") and self._param_exec_order_policy
-        )
-
-    def _is_param_exec_order_prep_stage(self) -> bool:
-        is_prep_stage = (
-            hasattr(self, "_param_exec_order_prep_stage")
-            and self._param_exec_order_prep_stage
-        )
-        if not is_prep_stage:
-            for p in self.parameters():
-                assert not hasattr(
-                    p, "_params_exec_order_hook_handle"
-                ), "When not in execution order prep stage, all _params_exec_order_hook_handle should be removed."
-        return is_prep_stage
-
 
 def _get_grad_norm(
     params: List[nn.Parameter],
diff --git a/torch/distributed/fsdp/wrap.py b/torch/distributed/fsdp/wrap.py
index c529bcde8c859..e20c07f18d132 100644
--- a/torch/distributed/fsdp/wrap.py
+++ b/torch/distributed/fsdp/wrap.py
@@ -4,7 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 
 import contextlib
-from dataclasses import dataclass
+import functools
+from abc import ABC, abstractmethod
 from typing import Any, Callable, cast, Dict, Generator, Optional, Set, Tuple, Type
 
 import torch.nn as nn
@@ -17,22 +18,84 @@
     "size_based_auto_wrap_policy",
     "enable_wrap",
     "wrap",
-    "ParamExecOrderWrapPolicy",
+    "ModuleWrapPolicy",
 ]
 
 
 def always_wrap_policy(*args, **kwargs) -> bool:
     """
-    A simple wrapper policy that always returns ``True``,
-    i.e. when passed as the `auto_wrap_policy` into FSDP,
-    this will result in all submodules being wrapped as
-    distinct FSDP instances.
+    A simple recursive wrap policy that always returns ``True``. This means
+    that every submodule is wrapped by the wrapper class in
+    :func:`_recursive_wrap`.
     """
     return True
 
 
+class _FSDPPolicy(ABC):
+    """
+    This defines an abstract base class that represents an FSDP policy for
+    constructing ``FlatParameter`` s.
+    """
+
+    # The motivation for this abstract base class is to hide the interface
+    # expected by `_recursive_wrap()` from users (i.e. the `recurse` argument).
+    def __init__(self):
+        ...
+
+    @property
+    @abstractmethod
+    def policy(self) -> Callable:
+        ...
+
+
+def _module_wrap_policy(
+    module: nn.Module,
+    recurse: bool,
+    nonwrapped_numel: int,
+    module_classes: Set[Type[nn.Module]],
+) -> bool:
+    """
+    This auto wrap policy wraps every module that is an instance of any type in
+    ``module_classes`` as its own FSDP instance. The root module given by
+    ``module`` is always wrapped as an FSDP instance regardless. Since the
+    wrapping proceeds bottom up, each FSDP instance manages the parameters in
+    its subtree excluding any already managed by a child FSDP instance.
+
+    Args:
+        module (nn.Module): Current module being considered.
+        recurse (bool): If ``False``, then this function must decide whether
+            ``module`` should be wrapped as an FSDP instance or not. If
+            ``True``, then the function is still recursing down the module
+            tree as a part of the DFS.
+        nonwrapped_numel (int): Parameter numel not yet wrapped.
+        module_classes (Set[Type[nn.Module]]): Set of module classes that are
+            wrapped as FSDP instances.
+
+    Returns:
+        ``True`` if ``recurse=True``, and whether ``module`` should be wrapped
+        if ``recurse=False``.
+    """
+    if recurse:
+        return True  # always recurse
+    return isinstance(module, tuple(module_classes))
+
+
+class ModuleWrapPolicy(_FSDPPolicy):
+    """This is a wrapper around :func:`_module_wrap_policy`."""
+
+    def __init__(self, module_classes: Set[Type[nn.Module]]):
+        self._policy: Callable = functools.partial(
+            _module_wrap_policy,
+            module_classes=module_classes,
+        )
+
+    @property
+    def policy(self):
+        return self._policy
+
+
 def lambda_auto_wrap_policy(
-    module: nn.Module, recurse: bool, unwrapped_params: int, lambda_fn: Callable
+    module: nn.Module, recurse: bool, nonwrapped_numel: int, lambda_fn: Callable
 ) -> bool:
     """
     A convenient auto wrap policy to wrap submodules based on an arbitrary user
@@ -44,70 +107,34 @@ def lambda_auto_wrap_policy(
     The first three parameters are required by :func:`_recursive_wrap`.
 
     Args:
-       module (nn.Module):
-           The module to be considered in this decision.
-       recurse (bool):
-           Indicate if this is called to make a decision on whether we
-           should recurse down a subgraph of the module structure.
-           If False, it means this function is called to make a decision
-           on whether we should wrap the said module.
-       unwrapped_params (int):
-           The number of parameters yet to be wrapped in this module.
-
-       lambda_fn (Callable[nn.Module] -> bool):
-           If this returns ``True``, this module will be wrapped by
-           wrapper_cls individually.
+        module (nn.Module): Current module being considered.
+        recurse (bool): If ``False``, then this function must decide whether
+            ``module`` should be wrapped as an FSDP instance or not. If
+            ``True``, then the function is still recursing down the module
+            tree as a part of the DFS.
+        nonwrapped_numel (int): Parameter numel not yet wrapped.
+
+        lambda_fn (Callable[[nn.Module], bool]): If this returns ``True``, then
+            this module will be wrapped.
     """
     if recurse:
-        # always recurse
-        return True
-    else:
-        # if not recursing, decide whether we should wrap for the leaf node or reminder
-        return lambda_fn(module)
+        return True  # always recurse
+    return lambda_fn(module)
 
 
 def transformer_auto_wrap_policy(
     module: nn.Module,
     recurse: bool,
-    unwrapped_params: int,
+    nonwrapped_numel: int,
     transformer_layer_cls: Set[Type[nn.Module]],
 ) -> bool:
     """
-    A convenient auto wrap policy for transformer models. If the submodule
-    is an instance of transformer_layer_cls, the submodule will be wrapped
-    as a FSDP unit. Otherwise, all the other remainder submodules are wrapped
-    by the outermost FSDP unit. Right now, FSDP requires submodules that share
-    weights to be wrapped in the same FSDP unit, this auto wrap policy can
-    conviniently wrap the shared embeddings into the same FSDP unit for transformer
-    models. In the near future, FSDP will support submodules that share weights
-    to be wrapped in the separated FSDP units.
-
-    Return if a module should be wrapped during FSDP auto wrapping.
-
-    The first three parameters are required by :func:`_recursive_wrap`.
-
-
-    Args:
-       module (nn.Module):
-           The module to be considered in this decision.
-       recurse (bool):
-           Indicate if this is called to make a decision on whether we
-           should recurse down a subgraph of the module structure.
-           If False, it means this function is called to make a decision
-           on whether we should wrap the said module.
-       unwrapped_params (int):
-           The number of parameters yet to be wrapped in this module.
-
-       transformer_layer_cls (int):
-           Submodules with one of the `transformer_layer_cls` names
-           will be wrapped as separated FSDP units
+    See :func:`_module_wrap_policy`, where ``transformer_layer_cls`` is the
+    same as ``module_classes``. Note that shared parameters must be wrapped in
+    the same FSDP instance, so this auto wrap policy can help wrap shared
+    embeddings into the same FSDP instance for transformer models.
     """
-    if recurse:
-        # always recurse
-        return True
-    else:
-        # if not recursing, decide whether we should wrap for the leaf node or reminder
-        return isinstance(module, tuple(transformer_layer_cls))
+    return _module_wrap_policy(module, recurse, nonwrapped_numel, transformer_layer_cls)
 
 
 def _wrap_batchnorm_individually(
@@ -117,7 +144,7 @@ def _wrap_batchnorm_individually(
     **kwargs,
 ) -> bool:
     """
-    A policy that wraps ``BatchNorm`` instances in their own FSDP unit.
+    A policy that wraps ``BatchNorm`` instances in their own FSDP instance.
     """
     if recurse:
         # always recurse
@@ -131,52 +158,46 @@ def _wrap_batchnorm_individually(
 def _or_policy(
     module: nn.Module,
     recurse: bool,
-    unwrapped_params: int,
+    nonwrapped_numel: int,
     policies,
 ) -> bool:
     """
     A policy that wraps ``module`` if any policy in the passed in iterable of
     ``policies`` returns ``True``.
     """
-    return any(policy(module, recurse, unwrapped_params) for policy in policies)
+    return any(policy(module, recurse, nonwrapped_numel) for policy in policies)
 
 
 def size_based_auto_wrap_policy(
     module: nn.Module,
     recurse: bool,
-    unwrapped_params: int,
-    # These are customizable for this policy function.
+    nonwrapped_numel: int,
+    # Additional custom arguments
     min_num_params: int = int(1e8),
     force_leaf_modules: Optional[Set[Type[nn.Module]]] = None,
     exclude_wrap_modules: Optional[Set[Type[nn.Module]]] = None,
 ) -> bool:
-    """A size based auto_wrap_policy function for FSDP API.
-
-       Return if a module should be wrapped during FSDP auto wrapping.
-
-       The first three parameters are used by :func:`_recursive_wrap`. If
-       you write a custom version of this policy function, your version
-       needs to at least accept the first three parameters and free
-       to do whatever you want in the function.
+    """
+    A size-based auto wrap policy.
 
     Args:
-       module (nn.Module):
-           The module to be considered in this decision.
-       recurse (bool):
-           Indicate if this is called to make a decision on whether we
-           should recurse down a subgraph of the module structure.
-           If False, it means this function is called to make a decision
-           on whether we should wrap the said module.
-       unwrapped_params (int):
-           The number of parameters yet to be wrapped in this module.
-
-       min_num_params (int):
-           Customizable policy input. It controls the size threshold
-           on how big should a module be to be considered wrapped.
-       force_leaf_modules (Set[Type[nn.Module]]): set of module types to
-           keep as leaves, i.e., their children will never be wrapped.
-       exclude_wrap_modules (Set[Type[nn.Module]]):
-           Customizable set of module types to be excluded in wrapping.
+        module (nn.Module): Current module being considered.
+        recurse (bool): If ``False``, then this function must decide whether
+            ``module`` should be wrapped as an FSDP instance or not. If
+            ``True``, then the function is still recursing down the module
+            tree as a part of the DFS.
+        nonwrapped_numel (int): Parameter numel not yet wrapped.
+
+        min_num_params (int): Customizable policy input that controls the size
+            threshold over which a module is ready to be wrapped. This is in
+            units of numel.
+        force_leaf_modules (Set[Type[nn.Module]]): Set of module types to keep
+            as leaves, i.e. their children will never be wrapped.
+        exclude_wrap_modules (Set[Type[nn.Module]]): Set of module types to be
+            excluded in wrapping.
+
+    Returns:
+        Whether ``module`` should be wrapped.
     """
     force_leaf_modules = (
         size_based_auto_wrap_policy.FORCE_LEAF_MODULES  # type: ignore[attr-defined]
@@ -189,7 +210,10 @@ def size_based_auto_wrap_policy(
         else exclude_wrap_modules
     )
 
-    is_large = unwrapped_params >= min_num_params
+    # Keep the argument `min_num_params` for BC for now, but it represents the
+    # minimum non-wrapped *numel* before triggering a wrapping
+    min_nonwrapped_numel = min_num_params
+    is_large = nonwrapped_numel >= min_nonwrapped_numel
     if recurse:
         # We should recurse if the module is big enough but not in force_leaf_modules list.
         return is_large and not isinstance(module, tuple(force_leaf_modules))
@@ -276,56 +300,6 @@ def wrap(module: nn.Module, **wrap_overrides: Any) -> nn.Module:
     return module
 
 
-@dataclass
-class ParamExecOrderWrapPolicy:
-    """
-    This is the class used for the wrapping policy that wraps parameters and performs
-    the communication scheduling based on the parameter execution order in the forward pass
-    (also called non-recursive wrapping policy).
-
-    The policy contains multiple wraps. Each wrap contains original parameters that will be executed together,
-    and the wrap transfers these parameters into one ``FlattenParameter``. In both forward and the backward passes,
-    the sharded parameters in each wrap will be gathered just before these parameters are used in the passes.
-    These parameters will then be reshaded once they have been used.
-
-    TODO (linjianma): For now, the parameters contained in each wrap of ``ParamExecOrderWrapPolicy``
-    are the parameters in each wrap of the ``init_policy`` (a recursive wrapping policy).
-    Later we will wrap parameters based on bucket size.
-
-    Args:
-        init_policy (Callable):
-            The initial recursive wrapping policy used to guide the wrapping of
-            this policy. If tracing_config is none, in the first forward and
-            backward iteration, ``init_policy`` is used to record parameter
-            execution order. Otherwise, init_policy is only used in FSDP
-            constructor for module level wrapping.
-
-            The default ``always_wrap_policy`` might not be the best choice for every model. For example, for
-            transformer based models, setting ``transformer_auto_wrap_policy`` as the ``init_policy`` will guarantee
-            wrapping each transformer layer into one FSDP unit, and can be easily combined with checkpointing
-            within each transformer layer.
-
-        tracing_config (Optional[TracingConfig]):
-            The configuration used to perform symbolic tracing at FSDP
-            constructor to get the module and parameter execution order. The
-            type of ``tracing_config`` needs to be either ``None`` or
-            ``TracingConfig``. If set as ``None``, then symbolic tracing is not
-            enabled, and one forward as well as backward iteration are needed to
-            get the parameter execution order.
-
-    ..warning :: Note that not all modules can be successfully traced when
-    ``tracing_config`` is not None and symbolic tracing is enabled. The two
-    cases below may be unable to trace: 1. when there is a data-dependent
-    branch, 2. when the forward pass contains operators that don't support
-    ``torch.fx.Proxy`` as the input type (e.g. ``arange``, ``zeros``, ``ones``,
-    ``full``, ``full_like``, ``eye``, ``empty``, ``tensor``). For those cases,
-    users can set ``tracing_config = None`` to disable symbolic tracing.
-    """
-
-    init_policy: Callable = always_wrap_policy
-    tracing_config: Any = None
-
-
 def _wrap(module: nn.Module, wrapper_cls: Callable, **kwargs) -> nn.Module:
     assert wrapper_cls is not None
     if hasattr(module, "_wrap_overrides"):
@@ -349,13 +323,13 @@ def _recursive_wrap(
     **kwargs: Any,
 ) -> Tuple[nn.Module, int]:
     """
-    Automatically wrap child modules of *module* that meet the given
-    criteria with :func:`auto_wrap`. Does not rely on _ConfigAutoWrap.
+    Wraps submodules of ``module`` for which ``auto_wrap_policy`` returns
+    ``True`` with ``wrapper_cls``.
+
     Args:
-        module (nn.Module):
-            module to recursively wrap
-        auto_wrap_policy (Callable):
-            A callable specifying a policy to recursively wrap layers with FSDP.
+        module (nn.Module): Module to recursively wrap.
+        auto_wrap_policy (Callable): A callable representing a policy that
+            determines which modules to recursively wrap with ``wrapper_cls``.
         ignored_modules (Set[torch.nn.Module]): Modules to ignore when
             wrapping.
         ignored_params (Set[torch.nn.Parameter]): Parameters to ignore when
@@ -363,7 +337,7 @@ def _recursive_wrap(
             in ``ignored_modules``.
     Returns:
         (nn.Module, int):
-            Wrapped module and the number parameters wrapped recursively.
+            ``module`` after wrapping and the numel recursively wrapped.
     """
     assert auto_wrap_policy is not None, "Must specify auto_wrap_policy."
     assert wrapper_cls is not None, "Must specify wrapper_cls"
@@ -378,11 +352,13 @@ def _recursive_wrap(
             pass
 
     # We count all params, assuming none of them are already wrapped.
-    num_params = sum(p.numel() for p in module.parameters() if p not in ignored_params)
+    nonwrapped_numel = sum(
+        p.numel() for p in module.parameters() if p not in ignored_params
+    )
 
     assert auto_wrap_policy is not None
-    if auto_wrap_policy(module=module, recurse=True, unwrapped_params=num_params):
-        total_wrapped_params = 0
+    if auto_wrap_policy(module=module, recurse=True, nonwrapped_numel=nonwrapped_numel):
+        total_wrapped_numel = 0
         # Iterate through the children, recursively wrap if necessary
         for name, child in module.named_children():
             if child in ignored_modules:
@@ -397,17 +373,17 @@ def _recursive_wrap(
             )
             setattr(module, name, wrapped_child)
             # Keep track of how many parameters have been wrapped
-            total_wrapped_params += num_wrapped_params
+            total_wrapped_numel += num_wrapped_params
         # decide if we need to wrap the current module,
         # since the left over parameters exceed the number of params to wrap
-        remainder = num_params - total_wrapped_params
+        remainder = nonwrapped_numel - total_wrapped_numel
         if not only_wrap_children and auto_wrap_policy(
-            module=module, recurse=False, unwrapped_params=remainder
+            module=module, recurse=False, nonwrapped_numel=remainder
         ):
             # Leaf node or final wrapping of the remainder both happen here.
-            return _wrap(module, wrapper_cls, **kwargs), num_params
+            return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel
         else:
-            return module, total_wrapped_params
+            return module, total_wrapped_numel
     return module, 0
 
 
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 0dca22f48092b..b4650adff569b 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -1,6 +1,5 @@
 # Owner(s): ["oncall: distributed"]
 
-import functools
 import itertools
 import sys
 from abc import ABC, abstractmethod
@@ -21,11 +20,7 @@
     ShardingStrategy,
 )
 from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
-from torch.distributed.fsdp.wrap import (
-    always_wrap_policy,
-    transformer_auto_wrap_policy,
-    wrap,
-)
+from torch.distributed.fsdp.wrap import always_wrap_policy, ModuleWrapPolicy, wrap
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
 from torch.nn.parallel.distributed import DistributedDataParallel as DDP
 from torch.testing._internal.common_distributed import MultiProcessTestCase, TEST_SKIPS
@@ -285,8 +280,8 @@ def init(
             fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                 any modules with FSDP. If ``RECURSIVE``, then wraps with
                 top-level FSDP. By default, the top-level FSDP uses the
-                ``transformer_auto_wrap_policy()`` for encoder and decoder
-                layers, but a different auto wrap policy may be specified via
+                ``ModuleWrapPolicy`` for encoder and decoder layers, but a
+                different auto wrap policy may be specified via
                 ``fsdp_kwargs``.
             cuda_init_mode (CUDAInitMode): Determines model movement to CUDA.
             fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
@@ -302,14 +297,13 @@ def init(
                 group, cuda_init_mode, add_bn, deterministic
             )
         elif fsdp_init_mode == FSDPInitMode.RECURSIVE:
-            # Default to the `transformer_auto_wrap_policy()`
+            # Default to the `ModuleWrapPolicy`
             if "auto_wrap_policy" not in fsdp_kwargs:
-                auto_wrap_policy = functools.partial(
-                    transformer_auto_wrap_policy,
-                    transformer_layer_cls={
+                auto_wrap_policy = ModuleWrapPolicy(
+                    {
                         TransformerEncoderLayer,
                         TransformerDecoderLayer,
-                    },
+                    }
                 )
             else:
                 auto_wrap_policy = fsdp_kwargs.pop("auto_wrap_policy")

From ce510a9d4c59e38a5ca3b2434fa528ae7426f14b Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Sat, 12 Nov 2022 04:45:17 +0000
Subject: [PATCH 0828/1922] [dynamo][api] Better support of torch.nn.Module
 (#88629)

This is an API change, so please review carefully.

With this PR, torchdynamo returns an `OptimizedModule` class object, a subclass of `torch.nn.Module`, when asked to optimize a `nn.Module` object. Most of the methods are redirected to the original `nn.Module`, which is installed as `_mod` in the `OptimizedModule`.

This is helpful for many cases

```
mod = MockModule()

opt_mod = torch._dynamo.optimize()(mod)

print(opt_mod) # Works

opt_mod = opt_mod.to(device="cuda")
print(opt_mod) # Works
opt_mod(input) # Triggers recompile if necessary, earlier we were shedding the TorchDynamo wrapper

opt_mod.parameters() # Refers to the original module

```

Topics unclear to me
* I have overridden many methods to raise NotImplementedError. A careful review of those will be good.
* hooks
* For the optimized forward, should we call torchdynamo optimization on `__call__` or `forward`
* What else to test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88629
Approved by: https://github.com/Chillee, https://github.com/jansel, https://github.com/msaroufim
---
 test/dynamo/test_modules.py  | 127 +++++++++++++++++++++++++++++++++++
 torch/_dynamo/__init__.py    |   2 +
 torch/_dynamo/debug_utils.py |   8 +++
 torch/_dynamo/eval_frame.py  |  74 ++++++++++++++------
 torch/_dynamo/testing.py     |  13 ++++
 5 files changed, 204 insertions(+), 20 deletions(-)

diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index 2fb83b3add6cf..930035f99a30c 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -904,6 +904,133 @@ def forward(self, x):
         self.assertTrue(torch._dynamo.testing.same(real, graph(rx)))
 
 
+class MockModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.relu = torch.nn.ReLU()
+        self.linear = torch.nn.Linear(10, 10)
+        self.register_buffer("buf0", torch.randn(10, 10))
+
+    def forward(self, x):
+        return self.relu(self.linear(x) + self.buf0)
+
+
+class OptimizedModuleTest(torch._dynamo.test_case.TestCase):
+    def test_nn_module(self):
+        mod = MockModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt)(mod)
+        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
+
+        x = torch.randn(10, 10)
+        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
+        self.assertEqual(cnt.frame_count, 1)
+
+    def test_to(self):
+        mod = MockModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt)(mod)
+        x = torch.randn(10, 10)
+        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
+        self.assertEqual(cnt.frame_count, 1)
+
+        # Ensure that there is no recompilation
+        opt_mod(x)
+        self.assertEqual(cnt.frame_count, 1)
+
+        opt_mod = opt_mod.to(device="cpu").to(dtype=torch.float64)
+        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
+        x = torch.randn(10, 10).to(dtype=torch.float64)
+        opt_mod(x)
+        # Ensure that there is a recompilation
+        self.assertEqual(cnt.frame_count, 2)
+
+    def test_attr(self):
+        class MockModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(10, 10)
+                self.register_buffer("buf0", torch.randn(10, 10))
+
+            def forward(self, x):
+                return self.r(torch.sin(x)) + self.buf0
+
+        mod = MockModule()
+        opt_mod = torch._dynamo.optimize("eager")(mod)
+
+        # Check parameteres and buffers
+        for (p1, p2) in zip(mod.parameters(), opt_mod.parameters()):
+            self.assertTrue(id(p1) == id(p2))
+
+    def test_recursion(self):
+        mod = MockModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt)(mod)
+
+        for _ in range(5):
+            opt_mod = torch._dynamo.optimize(cnt)(opt_mod)
+        opt_mod(torch.randn(10, 10))
+        self.assertEqual(cnt.frame_count, 1)
+
+    def test_composition(self):
+        class InnerModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(torch.sin(x))
+
+        opt_inner_mod = InnerModule()
+
+        class OuterModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = opt_inner_mod
+
+            def forward(self, x):
+                return self.mod(torch.cos(x))
+
+        outer_mod = OuterModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
+
+        x = torch.randn(4)
+        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
+        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
+        self.assertEqual(cnt.frame_count, 1)
+
+    def test_composition_with_opt_mod(self):
+        class InnerModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(torch.sin(x))
+
+        inner_mod = InnerModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_inner_mod = torch._dynamo.optimize(cnt)(inner_mod)
+
+        class OuterModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = opt_inner_mod
+
+            def forward(self, x):
+                return self.mod(torch.cos(x))
+
+        outer_mod = OuterModule()
+        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
+
+        x = torch.randn(4)
+        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
+        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
+        # There will be a graph break for the inner mod being OptimizedModule
+        self.assertEqual(cnt.frame_count, 2)
+
+
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py
index 80f927aeef2fa..5eee609b0852a 100644
--- a/torch/_dynamo/__init__.py
+++ b/torch/_dynamo/__init__.py
@@ -7,6 +7,7 @@
     export,
     optimize,
     optimize_assert,
+    OptimizedModule,
     reset_code,
     run,
     skip,
@@ -25,6 +26,7 @@
     "reset",
     "list_backends",
     "skip",
+    "OptimizedModule",
 ]
 
 
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 98a269fe8c9eb..29d830167b109 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -515,8 +515,16 @@ def same_two_models(gm, opt_gm, example_inputs, only_fwd=False):
     """
     Check two models have same accuracy.
     """
+    from .eval_frame import OptimizedModule
+    from .testing import named_parameters_for_optimized_module
     from .utils import same
 
+    if isinstance(gm, OptimizedModule):
+        gm.named_parameters = named_parameters_for_optimized_module(gm)
+
+    if isinstance(opt_gm, OptimizedModule):
+        opt_gm.named_parameters = named_parameters_for_optimized_module(opt_gm)
+
     ref = run_fwd_maybe_bwd(gm, example_inputs, only_fwd)
 
     try:
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 8d9e3b7b6aa14..20e8c7de085e0 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -5,6 +5,7 @@
 import logging
 import os
 import sys
+import textwrap
 import threading
 import traceback
 import types
@@ -44,6 +45,27 @@
 most_recent_backend = None
 
 
+class OptimizedModule(torch.nn.Module):
+    """
+    Wraps the original nn.Module object and later patches its
+    forward method to optimized self.forward method.
+    """
+
+    def __init__(self, mod):
+        super().__init__()
+        # Installs the params/buffer
+        self._orig_mod = mod
+
+    def __getattr__(self, name):
+        if name == "_orig_mod":
+            return self._modules["_orig_mod"]
+        return getattr(self._orig_mod, name)
+
+    def forward(self, *args, **kwargs):
+        # This will be monkey patched later
+        raise RuntimeError("Should not be here")
+
+
 def remove_from_cache(f):
     """
     Make sure f.__code__ is not cached to force a recompile
@@ -118,31 +140,15 @@ def __call__(self, fn):
         # Optimize the forward method of torch.nn.Module object
         if isinstance(fn, torch.nn.Module):
             mod = fn
-            optimized_forward = self(mod.forward)
-
-            class TorchDynamoNNModuleWrapper:
-                """
-                A wrapper that redirects the forward call to the optimized
-                forward, while for rest it redirects the calls to the original
-                module.
-                """
-
-                def __getattr__(self, name):
-                    return getattr(mod, name)
-
-                def forward(self, *args, **kwargs):
-                    return optimized_forward(*args, **kwargs)
-
-                def __call__(self, *args, **kwargs):
-                    return self.forward(*args, **kwargs)
-
-            new_mod = TorchDynamoNNModuleWrapper()
+            new_mod = OptimizedModule(mod)
+            new_mod.forward = self(mod.forward)
             # Save the function pointer to find the original callable while nesting
             # of decorators.
-            new_mod._torchdynamo_orig_callable = mod
+            new_mod._torchdynamo_orig_callable = mod.forward
             return new_mod
 
         assert callable(fn)
+
         callback = self.callback
         on_enter = self.on_enter
         backend_ctx_ctor = self.extra_ctx_ctor
@@ -184,6 +190,34 @@ def _fn(*args, **kwargs):
         # If the function is called using torch._dynamo.optimize decorator, we
         # should prevent any type of skipping.
         if callback not in (None, False):
+            if not hasattr(fn, "__code__"):
+                raise RuntimeError(
+                    textwrap.dedent(
+                        """
+
+                        torch._dynamo.optimize is called on a non function object.
+                        If this is a callable class, please optimize the individual methods that you are interested in optimizing.
+
+                        >> class CallableClass:
+                        >>     def __init__(self):
+                        >>         super().__init__()
+                        >>         self.relu = torch.nn.ReLU()
+                        >>
+                        >>     def __call__(self, x):
+                        >>         return self.relu(torch.sin(x))
+                        >>
+                        >>     def print_hello(self):
+                        >>         print("Hello world")
+                        >>
+                        >> mod = CallableClass()
+
+                        If you want to optimize the __call__ function
+
+                        >> mod.__call__ = torch._dynamo.optimize(mod.__call__)
+
+                        """
+                    )
+                )
             always_optimize_code_objects[fn.__code__] = True
 
         return _fn
diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py
index d6082ce48acf8..b37299ffd5791 100644
--- a/torch/_dynamo/testing.py
+++ b/torch/_dynamo/testing.py
@@ -32,6 +32,17 @@ def clone_me(x):
     return x.detach().clone().requires_grad_(x.requires_grad)
 
 
+def named_parameters_for_optimized_module(mod):
+    assert isinstance(mod, eval_frame.OptimizedModule)
+    return mod._orig_mod.named_parameters
+
+
+def remove_optimized_module_prefix(name):
+    prefix = "_orig_mod."
+    assert name.startswith(prefix)
+    return name[len(prefix) :]
+
+
 def collect_results(model, prediction, loss, example_inputs):
     results = []
     results.append(prediction)
@@ -44,6 +55,8 @@ def collect_results(model, prediction, loss, example_inputs):
     grads = dict()
     params = dict()
     for name, param in model.named_parameters():
+        if isinstance(model, eval_frame.OptimizedModule):
+            name = remove_optimized_module_prefix(name)
         param_copy = param
         grad = param.grad
         # Treat None and zero grad as same

From 73b7af965546008f870d9b4588f16d6af955e3cc Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sat, 12 Nov 2022 05:16:41 +0000
Subject: [PATCH 0829/1922] Revert "Add comprehensive minifier tests (#88022)"

This reverts commit 5ff600aa6e40c6b4d426594bbb1f446f005b7fb3.

Reverted https://github.com/pytorch/pytorch/pull/88022 on behalf of https://github.com/wconstab due to Seems to be causing CI failures relating to minifier test and some /tmp/ path not existing
---
 test/dynamo/test_minifier.py | 630 ++++-------------------------------
 torch/_dynamo/debug_utils.py |  78 +----
 2 files changed, 76 insertions(+), 632 deletions(-)

diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index 51b79a5e7511e..0cec7d202a9d4 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -1,138 +1,27 @@
 # Owner(s): ["module: dynamo"]
-import functools
 import os
-import re
 import shutil
-import subprocess
-import textwrap
 import unittest
+from unittest.mock import patch
 
 import torch
 import torch._dynamo
 import torch._dynamo.test_case
 import torch._dynamo.testing
-import torch._inductor.utils
-from torch._dynamo.debug_utils import TEST_REPLACEABLE_COMMENT
+from torch._dynamo.optimizations.backends import create_backend
 
-_HAS_TRITON = torch._inductor.utils.has_triton()
-requires_cuda = functools.partial(unittest.skipIf, not _HAS_TRITON, "requires cuda")
 
-RELU_COMPILE_ERROR_BACKEND = """\
-from torch._dynamo.optimizations.backends import register_backend
+class MockModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
 
-class DynamoCompileError(Exception):
-    pass
-
-@register_backend
-def test_relu_compile_error(gm: torch.fx.GraphModule, example_inputs):
-    for node in gm.graph.nodes:
-        if node.target == torch.relu:
-            raise DynamoCompileError("relu found")
-    return gm
-"""
-
-RELU_RUNTIME_ERROR_BACKEND = """\
-import copy
-from torch._dynamo.optimizations.backends import register_backend
-
-@register_backend
-def test_relu_runtime_error(gm: torch.fx.GraphModule, example_inputs):
-    gm = copy.deepcopy(gm)
-    for node in gm.graph.nodes:
-        if node.target == torch.relu:
-            node.target = torch._assert
-            node.args = (False, "DynamoRuntimeError")
-    gm.recompile()
-    return gm
-"""
-
-RELU_ACCURACY_ERROR_BACKEND = """\
-import copy
-from torch._dynamo.optimizations.backends import register_backend
-
-@register_backend
-def test_relu_accuracy_error(gm: torch.fx.GraphModule, example_inputs):
-    gm = copy.deepcopy(gm)
-    for node in gm.graph.nodes:
-        if node.target == torch.relu:
-            node.target = torch.add
-            node.args = (node.args[0], 1)
-    gm.recompile()
-
-    return gm
-"""
-
-RELU_CUSTOM_ERROR_BACKEND = """\
-class CustomError(Exception):
-    pass
-
-def test_relu_custom_error(gm: torch.fx.GraphModule, example_inputs):
-    for node in gm.graph.nodes:
-        if node.target == torch.relu:
-            raise CustomError("relu found")
-    return gm
-"""
-
-CPP_COMPILE_ERROR = """\
-def cpp_compile_error(x):
-    return "compile error!"
-"""
-
-CPP_RUNTIME_ERROR = """\
-def cpp_runtime_error(x):
-    return f"{x}; throw 1"
-"""
-
-CPP_ACCURACY_ERROR = """\
-def cpp_accuracy_error(x):
-    return f"{x} + 1"
-"""
-
-TRITON_COMPILE_ERROR = """\
-def triton_compile_error(x):
-    return "compile error!"
-"""
-
-# NOTE: there is currently not an easy way to cause a triton runtime error.
-TRITON_RUNTIME_ERROR = """\
-def triton_runtime_error(x):
-    return f"{x}; assert?"
-"""
-
-TRITON_ACCURACY_ERROR = """\
-def triton_accuracy_error(x):
-    return f"{x} + 1"
-"""
-
-DEBUG_DIR = "/tmp/_torchdynamo_debug_/"
-
-# Search for the name of the first function defined in a code string.
-def get_fn_name(code):
-    fn_name_match = re.search(r"def (\w+)\(", code)
-    if fn_name_match is not None:
-        return fn_name_match.group(1)
-    return None
-
-
-# Generates code that patches CppOverrides/TritonOverrides.
-def gen_codegen_fn_patch_code(old_fn_name, new_fn_code, device):
-    new_fn_name = get_fn_name(new_fn_code)
-    if new_fn_name is not None:
-        patch_code = f"""\
-import torch._inductor.codegen.{"cpp" if device == "cpu" else "triton"} as codegen
-overrides = codegen.{"CppOverrides" if device == "cpu" else "TritonOverrides"}
-{new_fn_code}
-overrides.{old_fn_name} = staticmethod({new_fn_name})
-"""
-        return f"""\
-{patch_code}
-isolate_fails_code_str = \"\"\"\\
-{patch_code}
-torch._dynamo.config.debug_dir_root = "{DEBUG_DIR}"
-\"\"\"
-"""
-
-    return None
+    def forward(self, x):
+        for _ in range(10):
+            x = torch.sin(x)
+        x = torch._foobar(x)
+        for _ in range(10):
+            x = torch.cos(x)
+        return x
 
 
 class MinfierTests(torch._dynamo.test_case.TestCase):
@@ -143,10 +32,9 @@ def setUpClass(cls):
             unittest.mock.patch.object(
                 torch._dynamo.config,
                 "debug_dir_root",
-                DEBUG_DIR,
+                "/tmp/_torchdynamo_debug_/",
             )
         )
-        os.makedirs(DEBUG_DIR, exist_ok=True)
 
     @classmethod
     def tearDownClass(cls):
@@ -159,455 +47,65 @@ def setUp(self):
     def tearDown(self):
         super().tearDown()
 
-    # Run `code` in a separate python process.
-    # Returns the completed process state and the directory containing the
-    # minifier launcher script, if `code` outputted it.
-    def _run_test_code(self, code):
-        proc = subprocess.run(
-            ["python3", "-c", code], capture_output=True, cwd=DEBUG_DIR
-        )
-
-        repro_dir_match = re.search(
-            r"(\S+)minifier_launcher.py", proc.stderr.decode("utf-8")
-        )
-        if repro_dir_match is not None:
-            # Print repro directory for debugging generated code.
-            # Make sure to comment out `shutil.rmtree...` above as well.
-            print("repro dir:", repro_dir_match.group(1))
-            return proc, repro_dir_match.group(1)
-        return proc, None
-
-    # Patch generated files with testing patches
-    def _inject_code(self, patch_code, filename):
-        patch_code = f"""\
-{patch_code}
-torch._dynamo.config.debug_dir_root = "{DEBUG_DIR}"
-"""
-        with open(filename, "r") as f:
-            code = f.read()
-        code = code.replace(TEST_REPLACEABLE_COMMENT, patch_code)
-        with open(filename, "w") as f:
-            f.write(code)
-        return code
-
-    # Runs the minifier launcher script in `repro_dir`, patched with `patch_code`.
-    def _run_minifier_launcher(self, patch_code, repro_dir):
-        self.assertIsNotNone(repro_dir)
-        launch_file = os.path.join(repro_dir, "minifier_launcher.py")
-        self.assertTrue(os.path.exists(launch_file))
-        launch_code = self._inject_code(patch_code, launch_file)
-
-        launch_proc = subprocess.run(
-            ["python3", launch_file],
-            capture_output=True,
-            cwd=repro_dir,
-        )
-
-        return launch_proc, launch_code
-
-    # Runs the repro script in `repro_dir`, patched with `patch_code`
-    def _run_repro(self, patch_code, repro_dir):
-        self.assertIsNotNone(repro_dir)
-        repro_file = os.path.join(repro_dir, "repro.py")
+    def test_after_dynamo(self):
+        @create_backend
+        def bad_dynamo_backend(subgraph):
+            import sys
+
+            def f(*args):
+                # Shifted the forced exception to runtime as this is more common
+                # in JIT compilers.
+                for node in subgraph.model.graph.nodes:
+                    if node.op == "call_function" and node.target is torch._foobar:
+                        sys.stdout.write("Dynamo compiled failed\n")
+                        raise NotImplementedError("foobar is not implemented")
+                return subgraph.model(*args)
+
+            return f
+
+        mod = MockModule()
+        opt_mod = torch._dynamo.optimize("bad_dynamo_backend")(mod)
+        repro_file = torch._dynamo.debug_utils.get_minifier_repro_path()
+
+        @patch.object(torch._dynamo.config, "repro_after", "dynamo")
+        def inner():
+            x = torch.randn(4)
+            try:
+                opt_mod(x)
+            except Exception:
+                pass
+
+        inner()
         self.assertTrue(os.path.exists(repro_file))
-        repro_code = self._inject_code(patch_code, repro_file)
-
-        repro_proc = subprocess.run(
-            ["python3", repro_file], capture_output=True, cwd=repro_dir
-        )
-
-        return repro_proc, repro_code
-
-    # Template for testing code.
-    # `run_code` is the code to run for the test case.
-    # `patch_code` is the code to be patched in every generated file.
-    def _gen_test_code(self, run_code, repro_after, repro_level, patch_code):
-        return f"""\
-import torch
-import torch._dynamo
-{patch_code}
-torch._dynamo.config.repro_after = "{repro_after}"
-torch._dynamo.config.repro_level = {repro_level}
-torch._dynamo.config.debug_dir_root = "{DEBUG_DIR}"
-{run_code}
-"""
-
-    # Runs a full minifier test.
-    # Minifier tests generally consist of 3 stages:
-    # 1. Run the problematic code (in a separate process since it could segfault)
-    # 2. Run the generated minifier launcher script
-    # 3. Run the generated repro script
-    def _run_full_test(self, run_code, repro_after, repro_level, patch_code):
-        test_code = self._gen_test_code(run_code, repro_after, repro_level, patch_code)
-        test_proc, repro_dir = self._run_test_code(test_code)
-        self.assertIsNotNone(repro_dir)
-        launch_proc, launch_code = self._run_minifier_launcher(patch_code, repro_dir)
-        repro_proc, repro_code = self._run_repro(patch_code, repro_dir)
-        return ((test_proc, launch_proc, repro_proc), (launch_code, repro_code))
-
-    # Test that compile, runtime, and accuracy errors after dynamo can be repro'd (both CPU and CUDA)
-    def _test_after_dynamo(self, device, repro_level, backend_code, error_name):
-        run_code = textwrap.dedent(
-            f"""\
-            @torch._dynamo.optimize("{get_fn_name(backend_code)}")
-            def inner(x):
-                for _ in range(10):
-                    x = torch.sin(x)
-                x = torch.relu(x)
-                for _ in range(10):
-                    x = torch.cos(x)
-                return x
-
-            inner(torch.randn(20, 20).to("{device}"))
-        """
-        )
-
-        (test_proc, _, repro_proc), _ = self._run_full_test(
-            run_code, "dynamo", repro_level, backend_code
-        )
-
-        self.assertIn(error_name, test_proc.stderr.decode("utf-8"))
-        self.assertIn(error_name, repro_proc.stderr.decode("utf-8"))
-
-    def test_after_dynamo_cpu_compile_error(self):
-        self._test_after_dynamo(
-            "cpu", 2, RELU_COMPILE_ERROR_BACKEND, "DynamoCompileError"
-        )
-
-    def test_after_dynamo_cpu_runtime_error(self):
-        self._test_after_dynamo(
-            "cpu", 2, RELU_RUNTIME_ERROR_BACKEND, "DynamoRuntimeError"
-        )
-
-    def test_after_dynamo_cpu_accuracy_error(self):
-        self._test_after_dynamo("cpu", 4, RELU_ACCURACY_ERROR_BACKEND, "AccuracyError")
-
-    @requires_cuda()
-    def test_after_dynamo_cuda_compile_error(self):
-        self._test_after_dynamo(
-            "cuda", 2, RELU_COMPILE_ERROR_BACKEND, "DynamoCompileError"
-        )
-
-    @requires_cuda()
-    def test_after_dynamo_cuda_runtime_error(self):
-        self._test_after_dynamo(
-            "cuda", 2, RELU_RUNTIME_ERROR_BACKEND, "DynamoRuntimeError"
-        )
-
-    @requires_cuda()
-    def test_after_dynamo_cuda_accuracy_error(self):
-        self._test_after_dynamo("cuda", 4, RELU_ACCURACY_ERROR_BACKEND, "AccuracyError")
-
-    # Ensure that the testing backends pass when relu is not present.
-    def _test_after_dynamo_backend_passes(self, device, repro_level, backend_code):
-        run_code = textwrap.dedent(
-            f"""\
-            @torch._dynamo.optimize("{get_fn_name(backend_code)}")
-            def inner(x):
-                for _ in range(10):
-                    x = torch.sin(x)
-                for _ in range(10):
-                    x = torch.cos(x)
-                return x
-
-            inner(torch.randn(20, 20).to("{device}"))
-        """
-        )
-
-        test_code = self._gen_test_code(run_code, "dynamo", repro_level, backend_code)
-        proc, repro_dir = self._run_test_code(test_code)
-        self.assertEqual(proc.returncode, 0)
-        self.assertIsNone(repro_dir)
-
-    def test_after_dynamo_cpu_compile_backend_passes(self):
-        self._test_after_dynamo_backend_passes("cpu", 2, RELU_COMPILE_ERROR_BACKEND)
-
-    def test_after_dynamo_cpu_runtime_backend_passes(self):
-        self._test_after_dynamo_backend_passes("cpu", 2, RELU_RUNTIME_ERROR_BACKEND)
-
-    def test_after_dynamo_cpu_accuracy_backend_passes(self):
-        self._test_after_dynamo_backend_passes("cpu", 4, RELU_ACCURACY_ERROR_BACKEND)
 
-    @requires_cuda()
-    def test_after_dynamo_cuda_compile_backend_passes(self):
-        self._test_after_dynamo_backend_passes("cuda", 2, RELU_COMPILE_ERROR_BACKEND)
+    # If error_at_aot is True, an error will be produced when AOTAutograd
+    # attempts to generate the backward graph.
+    # If error_after_aot is False, an error will be produced in inductor.
+    def _test_around_aot(self, error_at_aot):
+        mod = MockModule()
+        opt_mod = torch._dynamo.optimize("inductor")(mod)
 
-    @requires_cuda()
-    def test_after_dynamo_cuda_runtime_backend_passes(self):
-        self._test_after_dynamo_backend_passes("cuda", 2, RELU_RUNTIME_ERROR_BACKEND)
+        repro_file = torch._dynamo.debug_utils.get_minifier_repro_path()
+        repro_after = "dynamo" if error_at_aot else "aot"
 
-    @requires_cuda()
-    def test_after_dynamo_cuda_accuracy_backend_passes(self):
-        self._test_after_dynamo_backend_passes("cuda", 4, RELU_ACCURACY_ERROR_BACKEND)
+        @patch.object(torch._dynamo.config, "repro_after", repro_after)
+        def inner():
+            x = torch.randn(4)
+            x.requires_grad = error_at_aot
+            try:
+                opt_mod(x)
+            except Exception:
+                pass
 
-    # Ensure that generated code with a custom backends generates a runnable minifier
-    # launcher script that results in a RuntimeError
-    def test_after_dynamo_custom_backend(self):
-        run_code = textwrap.dedent(
-            f"""\
-            @torch._dynamo.optimize({get_fn_name(RELU_CUSTOM_ERROR_BACKEND)})
-            def inner(x):
-                for _ in range(10):
-                    x = torch.sin(x)
-                x = torch.relu(x)
-                for _ in range(10):
-                    x = torch.cos(x)
-                return x
+        inner()
 
-            inner(torch.randn(20, 20))
-        """
-        )
-
-        test_code = self._gen_test_code(
-            run_code, "dynamo", 2, RELU_CUSTOM_ERROR_BACKEND
-        )
-        _, repro_dir = self._run_test_code(test_code)
-        launch_proc, launch_code = self._run_minifier_launcher("", repro_dir)
-        self.assertIn("RuntimeError", launch_proc.stderr.decode("utf-8"))
-
-    # Test that a module with mixed cpu/cuda parts with an error after dynamo can be repro'd
-    @requires_cuda()
-    def test_cpu_cuda_module_after_dynamo(self):
-        backend_name = get_fn_name(RELU_COMPILE_ERROR_BACKEND)
-
-        run_code = textwrap.dedent(
-            f"""\
-            class CpuCudaModule(torch.nn.Module):
-                def __init__(self):
-                    super().__init__()
-                    self.m_x = torch.nn.Linear(20, 20).cuda()
-                    self.m_y = torch.nn.Linear(20, 20)
-                    self.p_x = torch.nn.Parameter(torch.randn(20, 20).cuda())
-                    self.p_y = torch.nn.Parameter(torch.randn(20, 20))
-                    self.register_buffer("b_x", torch.ones(20, 20).cuda())
-                    self.register_buffer("b_y", torch.ones(20, 20))
-
-                def forward(self, x, y):
-                    return self.m_x(x) + self.p_x + self.b_x, self.m_y(y) + self.p_y + self.b_y
-
-            mod = CpuCudaModule()
-
-            @torch._dynamo.optimize("{backend_name}")
-            def inner(x1, y1):
-                x2 = torch.randn(20, 20).cuda()
-                y2 = torch.randn(20, 20)
-                x3, y3 = mod(x1 + x2, y1 + y2)
-                return torch.relu(x3.cpu() + y3)
-
-            inner(torch.randn(20, 20).cuda(), torch.randn(20, 20))
-        """
-        )
-
-        (test_proc, _, repro_proc), (launch_code, _) = self._run_full_test(
-            run_code, "dynamo", 2, RELU_COMPILE_ERROR_BACKEND
-        )
-
-        tb1 = test_proc.stderr.decode("utf-8")
-        tb2 = repro_proc.stderr.decode("utf-8")
-
-        # Check if generated minifier code covers all cpu/cuda cases
-        self.assertIsNotNone(re.search(r"args.*cuda", launch_code))
-        self.assertIsNotNone(re.search(r"args.*cpu", launch_code))
-        # search for Linear(...).cuda()
-        self.assertIsNotNone(re.search(r"Linear.*cuda", launch_code))
-        # search for Linear(...)
-        self.assertIsNotNone(
-            re.search(r"Linear(?!.*cuda.*$)", launch_code, re.MULTILINE)
-        )
-        self.assertIsNotNone(re.search(r"register_buffer.*cuda", launch_code))
-        self.assertIsNotNone(
-            re.search(r"register_buffer(?!.*cuda.*$)", launch_code, re.MULTILINE)
-        )
-        self.assertIsNotNone(re.search(r"Parameter.*cuda", launch_code))
-        self.assertIsNotNone(
-            re.search(r"Parameter(?!.*cuda.*$)", launch_code, re.MULTILINE)
-        )
-        # search for
-        # <name> = torch.randn(...)
-        # ... = <name>.cuda()
-        self.assertIsNotNone(
-            re.search(r"(\w+) = torch.randn.*\1\.cuda", launch_code, re.DOTALL)
-        )
-        # search for
-        # <name> = torch.randn(...)
-        # no followup call to <name>.cuda()
-        self.assertIsNotNone(
-            re.search(
-                r"(\w+) = torch.randn(?!.*\1\.cuda\(\).*$)", launch_code, re.DOTALL
-            )
-        )
-
-        self.assertIn(backend_name, tb1)
-        self.assertIn(backend_name, tb2)
-
-    # Test if we can actually get a minified graph
-    def test_if_graph_minified(self):
-        backend_name = get_fn_name(RELU_COMPILE_ERROR_BACKEND)
-
-        run_code = textwrap.dedent(
-            f"""\
-            @torch._dynamo.optimize("{backend_name}")
-            def inner(x):
-                for _ in range(20):
-                    x = torch.sin(x)
-                x = torch.relu(x)
-                for _ in range(20):
-                    x = torch.cos(x)
-                return x
-
-            inner(torch.randn(20, 20))
-        """
-        )
-
-        (test_proc, _, repro_proc), (launch_code, repro_code) = self._run_full_test(
-            run_code, "dynamo", 2, RELU_COMPILE_ERROR_BACKEND
-        )
-
-        tb1 = test_proc.stderr.decode("utf-8")
-        tb2 = repro_proc.stderr.decode("utf-8")
-
-        self.assertIn(backend_name, tb1)
-        self.assertIn(backend_name, tb2)
-
-        # compare the length of the forward functions
-        match = re.search(r"def forward.*return", launch_code, re.DOTALL)
-        self.assertIsNotNone(match)
-        self.assertGreater(match.group(0).count("\n"), 40)
-
-        match = re.search(r"def forward.*return", repro_code, re.DOTALL)
-        self.assertIsNotNone(match)
-        self.assertLess(match.group(0).count("\n"), 5)
-
-    # Test that compile and accuracy errors after aot can be repro'd (both CPU and CUDA)
-    def _test_after_aot(self, device, backend_code, repro_level):
-        run_code = textwrap.dedent(
-            f"""\
-            @torch._dynamo.optimize("inductor")
-            def inner(x):
-                for _ in range(3):
-                    x = torch.sin(x)
-                x = torch.relu(x)
-                for _ in range(3):
-                    x = torch.cos(x)
-                return x
-
-            inner(torch.randn(20, 20).to("{device}"))
-        """
-        )
-        patch_code = gen_codegen_fn_patch_code("relu", backend_code, device)
-        self.assertIsNotNone(patch_code)
-        (test_proc, _, repro_proc), _ = self._run_full_test(
-            run_code, "aot", repro_level, patch_code
-        )
-        return (
-            (test_proc.stderr.decode("utf-8"), repro_proc.stderr.decode("utf-8")),
-            (test_proc.returncode, repro_proc.returncode),
-        )
-
-    def test_after_aot_cpu_compile_error(self):
-        (tb1, tb2), _ = self._test_after_aot("cpu", CPP_COMPILE_ERROR, 2)
-        self.assertIn("CppCompileError", tb1)
-        self.assertIn("CppCompileError", tb2)
-
-    def test_after_aot_cpu_accuracy_error(self):
-        (tb1, tb2), _ = self._test_after_aot("cpu", CPP_ACCURACY_ERROR, 4)
-        self.assertIn("AccuracyError", tb1)
-        self.assertIn("AccuracyError", tb2)
-
-    @requires_cuda()
-    def test_after_aot_cuda_compile_error(self):
-        (tb1, tb2), _ = self._test_after_aot("cuda", TRITON_COMPILE_ERROR, 2)
-        self.assertIn("SyntaxError", tb1)
-        self.assertIn("SyntaxError", tb2)
-
-    @requires_cuda()
-    def test_after_aot_cuda_accuracy_error(self):
-        (tb1, tb2), _ = self._test_after_aot("cuda", TRITON_ACCURACY_ERROR, 4)
-        self.assertIn("AccuracyError", tb1)
-        self.assertIn("AccuracyError", tb2)
-
-    # Test that runtime errors after aot can be repro'd (CPU only for now)
-    def _test_after_aot_runtime_error(self, device, backend_code):
-        run_code = textwrap.dedent(
-            f"""\
-            @torch._dynamo.optimize("inductor")
-            def inner(x):
-                for _ in range(3):
-                    x = torch.sin(x)
-                x = torch.relu(x)
-                for _ in range(3):
-                    x = torch.cos(x)
-                return x
-
-            inner(torch.randn(20, 20).to("{device}"))
-        """
-        )
-        patch_code = gen_codegen_fn_patch_code("relu", backend_code, device)
-        self.assertIsNotNone(patch_code)
-
-        (test_proc, _, repro_proc), _ = self._run_full_test(
-            run_code, "aot", 3, patch_code
-        )
-
-        self.assertNotIn("CompilerError", test_proc.stderr.decode("utf-8"))
-
-        self.assertEqual(test_proc.returncode, repro_proc.returncode)
-        self.assertNotEqual(test_proc.returncode, 0)
-
-    def test_after_aot_cpu_runtime_error(self):
-        self._test_after_aot_runtime_error("cpu", CPP_RUNTIME_ERROR)
-
-    # NOTE: there is currently not an easy way to cause a triton runtime error.
-    @unittest.skip
-    @requires_cuda()
-    def test_after_aot_cuda_runtime_error(self):
-        self._test_after_aot_runtime_error("cuda", TRITON_RUNTIME_ERROR)
-
-    # Ensure that inductor codegen patches pass when relu is not present.
-    def _test_after_aot_backend_passes(self, device, repro_level, backend_code):
-        run_code = textwrap.dedent(
-            f"""\
-            @torch._dynamo.optimize("inductor")
-            def inner(x):
-                for _ in range(3):
-                    x = torch.sin(x)
-                for _ in range(3):
-                    x = torch.cos(x)
-                return x
-
-            inner(torch.randn(20, 20).to("{device}"))
-        """
-        )
-        patch_code = gen_codegen_fn_patch_code("relu", backend_code, device)
-        self.assertIsNotNone(patch_code)
-
-        test_code = self._gen_test_code(run_code, "aot", repro_level, patch_code)
-        proc, repro_dir = self._run_test_code(test_code)
-        self.assertEqual(proc.returncode, 0)
-        self.assertIsNone(repro_dir)
-
-    def test_after_aot_cpu_compile_backend_passes(self):
-        self._test_after_aot_backend_passes("cpu", 2, CPP_COMPILE_ERROR)
-
-    def test_after_aot_cpu_runtime_backend_passes(self):
-        self._test_after_aot_backend_passes("cpu", 2, CPP_RUNTIME_ERROR)
-
-    def test_after_aot_cpu_accuracy_backend_passes(self):
-        self._test_after_aot_backend_passes("cpu", 4, CPP_ACCURACY_ERROR)
-
-    @requires_cuda()
-    def test_after_aot_cuda_compile_backend_passes(self):
-        self._test_after_aot_backend_passes("cuda", 2, TRITON_COMPILE_ERROR)
+        self.assertTrue(os.path.exists(repro_file))
 
-    # NOTE: there is currently not an easy way to cause a triton runtime error.
-    @unittest.skip
-    @requires_cuda()
-    def test_after_aot_cuda_runtime_backend_passes(self):
-        self._test_after_aot_backend_passes("cuda", 2, TRITON_RUNTIME_ERROR)
+    def test_at_aot(self):
+        self._test_around_aot(True)
 
-    @requires_cuda()
-    def test_after_aot_cuda_accuracy_backend_passes(self):
-        self._test_after_aot_backend_passes("cuda", 4, TRITON_ACCURACY_ERROR)
+    def test_after_aot(self):
+        self._test_around_aot(False)
 
 
 if __name__ == "__main__":
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 29d830167b109..089ef172d625d 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -84,11 +84,6 @@ def __init__(self):
 
         for module_name, module in gm.named_children():
             module_str = f"{module.__repr__()}"
-            # module should be a core torch.nn.Module, so all parameters
-            # should be on the same device.
-            example_param = next(module.parameters(), None)
-            if example_param is not None and example_param.is_cuda:
-                module_str = f"{module_str}.cuda()"
             model_str += f"{tab*2}self.{module_name} = {module_str}\n"
 
         for buffer_name, buffer in gm._buffers.items():
@@ -100,16 +95,12 @@ def __init__(self):
                 tensor_str = (
                     f"torch.randint(1, size={list(buffer.shape)}, dtype={buffer.dtype})"
                 )
-            if buffer.is_cuda:
-                tensor_str = f"{tensor_str}.cuda()"
             model_str += f"{tab*2}self.register_buffer('{buffer_name}', {tensor_str})\n"
 
         for param_name, param in gm._parameters.items():
             if param is None:
                 continue
             tensor_str = f"torch.nn.Parameter(torch.randn({list(param.shape)}, dtype={param.dtype}))"
-            if param.is_cuda:
-                tensor_str = f"{tensor_str}.cuda()"
             model_str += f"{tab*2}self.{param_name} = {tensor_str}\n"
 
         # TODO - Keep this code for now. But, I don't think we will need this.
@@ -154,9 +145,6 @@ def _cuda_system_info_comment():
     return model_str
 
 
-TEST_REPLACEABLE_COMMENT = "# REPLACEABLE COMMENT FOR TESTING PURPOSES"
-
-
 def generate_compiler_repro_string(gm, args):
     model_str = textwrap.dedent(
         f"""
@@ -167,8 +155,6 @@ def generate_compiler_repro_string(gm, args):
         from math import inf
         from torch.fx.experimental.proxy_tensor import make_fx
 
-        {TEST_REPLACEABLE_COMMENT}
-
         """
     )
     model_str += f"# torch version: {torch.version.__version__}\n"
@@ -184,7 +170,7 @@ def generate_compiler_repro_string(gm, args):
     model_str += (
         "args = [rand_strided(sh, st, dt, dev) for (sh, st, dt, dev) in args]\n"
     )
-    model_str += "mod = make_fx(Repro())(*args)\n"
+    model_str += 'mod = make_fx(Repro().to(device="cuda"))(*args)\n'
     return model_str
 
 
@@ -211,8 +197,7 @@ def dump_compiler_graph_state(gm, args, compiler_name):
     log.warning(f"Writing checkpoint with {len(gm.graph.nodes)} nodes to {file_name}")
     with open(file_name, "w") as fd:
         save_graph_repro(fd, gm, args, compiler_name)
-    curdir = os.getcwd()
-    repro_path = os.path.join(curdir, "repro.py")
+    repro_path = os.path.join(config.base_dir, "repro.py")
     try:
         shutil.copyfile(file_name, repro_path)
         log.warning(f"Copying repro file for convenience to {repro_path}")
@@ -231,10 +216,7 @@ def save_graph_repro(fd, gm, args, compiler_name):
             textwrap.dedent(
                 f"""
                 compiled = {COMPILER_REPRO_OPTIONS[compiler_name][1]}(mod, args)
-                class AccuracyError(Exception):
-                    pass
-                if not same_two_models(mod, compiled, args, only_fwd=True):
-                    raise AccuracyError("Bad accuracy detected")
+                assert same_two_models(mod, compiled, args, only_fwd=True), "Accuracy failed"
                 """
             )
         )
@@ -249,7 +231,7 @@ class AccuracyError(Exception):
         )
 
 
-def isolate_fails(fx_g, args, compiler_name: str, env=None, patch_code=None):
+def isolate_fails(fx_g, args, compiler_name: str, env=None):
     if env is None:
         env = {}
     subdir = os.path.join(os.getcwd(), "isolate")
@@ -257,10 +239,7 @@ def isolate_fails(fx_g, args, compiler_name: str, env=None, patch_code=None):
         os.makedirs(subdir, exist_ok=True)
     file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py")
     with open(file_name, "w") as fd:
-        repro_code = generate_compiler_repro_string(fx_g, args)
-        if patch_code is not None:
-            repro_code = repro_code.replace(TEST_REPLACEABLE_COMMENT, patch_code)
-        fd.write(repro_code)
+        fd.write(generate_compiler_repro_string(fx_g, args))
         fail_fn = COMPILER_REPRO_OPTIONS[compiler_name][2]
         fd.write(
             textwrap.dedent(
@@ -284,7 +263,6 @@ def isolate_fails(fx_g, args, compiler_name: str, env=None, patch_code=None):
     stdout, stderr = TemporaryFile(), TemporaryFile()
     p = subprocess.Popen(
         ["python", file_name],
-        cwd=subdir,
         stdout=stdout,
         stderr=stderr,
         env=new_env,
@@ -351,8 +329,6 @@ def dump_to_minify(gm, args, compiler_name: str):
 
     contents = textwrap.dedent(
         f"""
-isolate_fails_code_str = None
-
 {generate_compiler_repro_string(gm, args)}
 
 from functools import partial
@@ -367,7 +343,7 @@ def dump_to_minify(gm, args, compiler_name: str):
 minifier(
     mod,
     args,
-    module_fails=partial(isolate_fails, env=env_variables, compiler_name="{compiler_name}", patch_code=isolate_fails_code_str),
+    module_fails=partial(isolate_fails, env=env_variables, compiler_name="{compiler_name}"),
     dump_state=partial(dump_compiler_graph_state, compiler_name="{compiler_name}"),
 )
         """
@@ -375,10 +351,6 @@ def dump_to_minify(gm, args, compiler_name: str):
     return helper_for_dump_minify(contents)
 
 
-class AccuracyError(Exception):
-    pass
-
-
 def wrap_compiler_debug(compiler_fn, compiler_name: str):
     """
     Minifier for Fx Graph modules after Aot Autograd has finished. We wrap both
@@ -438,7 +410,7 @@ def deferred_for_real_inputs(real_inputs):
                         copy_tensor_attrs,
                         f"{compiler_name}_accuracy",
                     )
-                    raise AccuracyError("Bad accuracy detected")
+                    raise ValueError("Bad accuracy detected")
                 else:
                     # Call the compiled function with real inputs
                     return inner_compiled_fn(real_inputs)
@@ -463,8 +435,7 @@ def deferred_for_real_inputs(real_inputs):
                             copy_tensor_attrs,
                             compiler_name,
                         )
-                    log.error("CompilerError")
-                    raise
+                    raise e
 
         if config.repro_after == "aot":
             compiled_fn = deferred_for_real_inputs
@@ -581,14 +552,9 @@ def generate_dynamo_fx_repro_string(
             f"""
 mod.eval()
 opt_mod.eval()
-
-class AccuracyError(Exception):
-    pass
-
 with torch.cuda.amp.autocast(enabled={torch.is_autocast_enabled()}):
     assert same_two_models(mod, mod, args), "Eager itself failed"
-    if not same_two_models(mod, opt_mod, args):
-        raise AccuracyError("Dynamo failed")
+    assert same_two_models(mod, opt_mod, args), "Dynamo failed"
     """
         )
 
@@ -603,14 +569,12 @@ class AccuracyError(Exception):
 from {config.dynamo_import}.debug_utils import run_fwd_maybe_bwd
 from {config.dynamo_import}.debug_utils import same_two_models
 
-{TEST_REPLACEABLE_COMMENT}
-
 args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]}
 args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
 
 {model_str}
 
-mod = Repro()
+mod = Repro().cuda()
 opt_mod = {config.dynamo_import}.optimize("{compiler_name}")(mod)
 
 {run_code}
@@ -749,21 +713,6 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
     if config.repro_level == 4:
         minifier_backend = "dynamo_accuracy_minifier_backend"
 
-    custom_compiler_error = (
-        textwrap.dedent(
-            """\
-        raise RuntimeError(
-            'Compiler name is None - this likely means that a custom compiler '
-            'was called by torchdynamo. Please remove this error, import your '
-            'custom compiler function, and replace the compiler_name="None" '
-            'line below to compiler_name=<my_imported_custom_function>'
-        )
-        """
-        )
-        if compiler_name is None
-        else ""
-    )
-
     contents = textwrap.dedent(
         f"""
 import os
@@ -777,17 +726,14 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
 from {config.dynamo_import}.optimizations.backends import BACKENDS
 from {config.dynamo_import}.testing import rand_strided
 
-{TEST_REPLACEABLE_COMMENT}
-
 args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]}
 args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
 
 {model_str}
-mod = Repro()
+mod = Repro().cuda()
 
 # Setup debug minifier compiler
 compiler_fn = BACKENDS["{minifier_backend}"]
-{custom_compiler_error}
 dynamo_minifier_backend = functools.partial(
     compiler_fn,
     compiler_name="{compiler_name}",
@@ -831,7 +777,7 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                         example_inputs,
                         compiler_name,
                     )
-                    exc = AccuracyError("Bad accuracy detected.")
+                    exc = ValueError("Bad accuracy detected.")
                     exc.minifier_path = os.path.join(
                         minifier_dir(), "minifier_launcher.py"
                     )

From f8dc4cfc4d36f3a329442369d85b3adf36ca8dfa Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@meta.com>
Date: Sat, 12 Nov 2022 07:52:44 +0000
Subject: [PATCH 0830/1922] [quant][executorch] Support quant fusion for
 reshape in quant in executorch stack (#88858)

Summary: This diff added support for fusing "dq - reshape - q" to a reshape op, the op is needed in wakeword model

Test Plan: buck test executorch/exir/tests:quant_fusion_pass

Reviewed By: qihqi, JacobSzwejbka

Differential Revision: D41111069

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88858
Approved by: https://github.com/JacobSzwejbka
---
 torch/_C/__init__.pyi.in | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 2d20da2a04f30..5833d7d7f2a41 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -972,11 +972,14 @@ class AggregationType(Enum):
     AVG = 1
 
 class FileCheck(object):
-    # TODO (add more FileCheck signature)
-    def check_source_highlighted(self, highlight: str) -> 'FileCheck': ...
     def run(self, test_string: str) -> None: ...
     def check(self, test_string: str) -> 'FileCheck': ...
     def check_not(self, test_string: str) -> 'FileCheck': ...
+    def check_same(self, test_string: str) -> 'FileCheck': ...
+    def check_next(self, test_string: str) -> 'FileCheck': ...
+    def check_count(self, test_string: str, count: _int, exactly: _bool = False) -> 'FileCheck': ...
+    def check_dag(self, test_string: str) -> 'FileCheck': ...
+    def check_source_highlighted(self, test_string: str) -> 'FileCheck': ...
     ...
 
 # Defined in torch/csrc/jit/python/init.cpp

From 004d0854626405f326ea64ea884f25f2afd26a39 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sat, 12 Nov 2022 07:52:53 +0000
Subject: [PATCH 0831/1922] Revert "[dynamo][api] Better support of
 torch.nn.Module (#88629)"

This reverts commit c83348597b195f2da1cca0e8318c878b104bce5d.

Reverted https://github.com/pytorch/pytorch/pull/88629 on behalf of https://github.com/anijain2305 due to job failing on master https://github.com/pytorch/pytorch/actions/runs/3449914495/jobs/5758267231
---
 test/dynamo/test_modules.py  | 127 -----------------------------------
 torch/_dynamo/__init__.py    |   2 -
 torch/_dynamo/debug_utils.py |   8 ---
 torch/_dynamo/eval_frame.py  |  74 ++++++--------------
 torch/_dynamo/testing.py     |  13 ----
 5 files changed, 20 insertions(+), 204 deletions(-)

diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index 930035f99a30c..2fb83b3add6cf 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -904,133 +904,6 @@ def forward(self, x):
         self.assertTrue(torch._dynamo.testing.same(real, graph(rx)))
 
 
-class MockModule(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.relu = torch.nn.ReLU()
-        self.linear = torch.nn.Linear(10, 10)
-        self.register_buffer("buf0", torch.randn(10, 10))
-
-    def forward(self, x):
-        return self.relu(self.linear(x) + self.buf0)
-
-
-class OptimizedModuleTest(torch._dynamo.test_case.TestCase):
-    def test_nn_module(self):
-        mod = MockModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_mod = torch._dynamo.optimize(cnt)(mod)
-        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
-
-        x = torch.randn(10, 10)
-        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
-        self.assertEqual(cnt.frame_count, 1)
-
-    def test_to(self):
-        mod = MockModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_mod = torch._dynamo.optimize(cnt)(mod)
-        x = torch.randn(10, 10)
-        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
-        self.assertEqual(cnt.frame_count, 1)
-
-        # Ensure that there is no recompilation
-        opt_mod(x)
-        self.assertEqual(cnt.frame_count, 1)
-
-        opt_mod = opt_mod.to(device="cpu").to(dtype=torch.float64)
-        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
-        x = torch.randn(10, 10).to(dtype=torch.float64)
-        opt_mod(x)
-        # Ensure that there is a recompilation
-        self.assertEqual(cnt.frame_count, 2)
-
-    def test_attr(self):
-        class MockModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.linear = torch.nn.Linear(10, 10)
-                self.register_buffer("buf0", torch.randn(10, 10))
-
-            def forward(self, x):
-                return self.r(torch.sin(x)) + self.buf0
-
-        mod = MockModule()
-        opt_mod = torch._dynamo.optimize("eager")(mod)
-
-        # Check parameteres and buffers
-        for (p1, p2) in zip(mod.parameters(), opt_mod.parameters()):
-            self.assertTrue(id(p1) == id(p2))
-
-    def test_recursion(self):
-        mod = MockModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_mod = torch._dynamo.optimize(cnt)(mod)
-
-        for _ in range(5):
-            opt_mod = torch._dynamo.optimize(cnt)(opt_mod)
-        opt_mod(torch.randn(10, 10))
-        self.assertEqual(cnt.frame_count, 1)
-
-    def test_composition(self):
-        class InnerModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.relu = torch.nn.ReLU()
-
-            def forward(self, x):
-                return self.relu(torch.sin(x))
-
-        opt_inner_mod = InnerModule()
-
-        class OuterModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.mod = opt_inner_mod
-
-            def forward(self, x):
-                return self.mod(torch.cos(x))
-
-        outer_mod = OuterModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
-
-        x = torch.randn(4)
-        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
-        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
-        self.assertEqual(cnt.frame_count, 1)
-
-    def test_composition_with_opt_mod(self):
-        class InnerModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.relu = torch.nn.ReLU()
-
-            def forward(self, x):
-                return self.relu(torch.sin(x))
-
-        inner_mod = InnerModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_inner_mod = torch._dynamo.optimize(cnt)(inner_mod)
-
-        class OuterModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.mod = opt_inner_mod
-
-            def forward(self, x):
-                return self.mod(torch.cos(x))
-
-        outer_mod = OuterModule()
-        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
-
-        x = torch.randn(4)
-        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
-        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
-        # There will be a graph break for the inner mod being OptimizedModule
-        self.assertEqual(cnt.frame_count, 2)
-
-
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py
index 5eee609b0852a..80f927aeef2fa 100644
--- a/torch/_dynamo/__init__.py
+++ b/torch/_dynamo/__init__.py
@@ -7,7 +7,6 @@
     export,
     optimize,
     optimize_assert,
-    OptimizedModule,
     reset_code,
     run,
     skip,
@@ -26,7 +25,6 @@
     "reset",
     "list_backends",
     "skip",
-    "OptimizedModule",
 ]
 
 
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 089ef172d625d..f09991f9bf348 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -486,16 +486,8 @@ def same_two_models(gm, opt_gm, example_inputs, only_fwd=False):
     """
     Check two models have same accuracy.
     """
-    from .eval_frame import OptimizedModule
-    from .testing import named_parameters_for_optimized_module
     from .utils import same
 
-    if isinstance(gm, OptimizedModule):
-        gm.named_parameters = named_parameters_for_optimized_module(gm)
-
-    if isinstance(opt_gm, OptimizedModule):
-        opt_gm.named_parameters = named_parameters_for_optimized_module(opt_gm)
-
     ref = run_fwd_maybe_bwd(gm, example_inputs, only_fwd)
 
     try:
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 20e8c7de085e0..8d9e3b7b6aa14 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -5,7 +5,6 @@
 import logging
 import os
 import sys
-import textwrap
 import threading
 import traceback
 import types
@@ -45,27 +44,6 @@
 most_recent_backend = None
 
 
-class OptimizedModule(torch.nn.Module):
-    """
-    Wraps the original nn.Module object and later patches its
-    forward method to optimized self.forward method.
-    """
-
-    def __init__(self, mod):
-        super().__init__()
-        # Installs the params/buffer
-        self._orig_mod = mod
-
-    def __getattr__(self, name):
-        if name == "_orig_mod":
-            return self._modules["_orig_mod"]
-        return getattr(self._orig_mod, name)
-
-    def forward(self, *args, **kwargs):
-        # This will be monkey patched later
-        raise RuntimeError("Should not be here")
-
-
 def remove_from_cache(f):
     """
     Make sure f.__code__ is not cached to force a recompile
@@ -140,15 +118,31 @@ def __call__(self, fn):
         # Optimize the forward method of torch.nn.Module object
         if isinstance(fn, torch.nn.Module):
             mod = fn
-            new_mod = OptimizedModule(mod)
-            new_mod.forward = self(mod.forward)
+            optimized_forward = self(mod.forward)
+
+            class TorchDynamoNNModuleWrapper:
+                """
+                A wrapper that redirects the forward call to the optimized
+                forward, while for rest it redirects the calls to the original
+                module.
+                """
+
+                def __getattr__(self, name):
+                    return getattr(mod, name)
+
+                def forward(self, *args, **kwargs):
+                    return optimized_forward(*args, **kwargs)
+
+                def __call__(self, *args, **kwargs):
+                    return self.forward(*args, **kwargs)
+
+            new_mod = TorchDynamoNNModuleWrapper()
             # Save the function pointer to find the original callable while nesting
             # of decorators.
-            new_mod._torchdynamo_orig_callable = mod.forward
+            new_mod._torchdynamo_orig_callable = mod
             return new_mod
 
         assert callable(fn)
-
         callback = self.callback
         on_enter = self.on_enter
         backend_ctx_ctor = self.extra_ctx_ctor
@@ -190,34 +184,6 @@ def _fn(*args, **kwargs):
         # If the function is called using torch._dynamo.optimize decorator, we
         # should prevent any type of skipping.
         if callback not in (None, False):
-            if not hasattr(fn, "__code__"):
-                raise RuntimeError(
-                    textwrap.dedent(
-                        """
-
-                        torch._dynamo.optimize is called on a non function object.
-                        If this is a callable class, please optimize the individual methods that you are interested in optimizing.
-
-                        >> class CallableClass:
-                        >>     def __init__(self):
-                        >>         super().__init__()
-                        >>         self.relu = torch.nn.ReLU()
-                        >>
-                        >>     def __call__(self, x):
-                        >>         return self.relu(torch.sin(x))
-                        >>
-                        >>     def print_hello(self):
-                        >>         print("Hello world")
-                        >>
-                        >> mod = CallableClass()
-
-                        If you want to optimize the __call__ function
-
-                        >> mod.__call__ = torch._dynamo.optimize(mod.__call__)
-
-                        """
-                    )
-                )
             always_optimize_code_objects[fn.__code__] = True
 
         return _fn
diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py
index b37299ffd5791..d6082ce48acf8 100644
--- a/torch/_dynamo/testing.py
+++ b/torch/_dynamo/testing.py
@@ -32,17 +32,6 @@ def clone_me(x):
     return x.detach().clone().requires_grad_(x.requires_grad)
 
 
-def named_parameters_for_optimized_module(mod):
-    assert isinstance(mod, eval_frame.OptimizedModule)
-    return mod._orig_mod.named_parameters
-
-
-def remove_optimized_module_prefix(name):
-    prefix = "_orig_mod."
-    assert name.startswith(prefix)
-    return name[len(prefix) :]
-
-
 def collect_results(model, prediction, loss, example_inputs):
     results = []
     results.append(prediction)
@@ -55,8 +44,6 @@ def collect_results(model, prediction, loss, example_inputs):
     grads = dict()
     params = dict()
     for name, param in model.named_parameters():
-        if isinstance(model, eval_frame.OptimizedModule):
-            name = remove_optimized_module_prefix(name)
         param_copy = param
         grad = param.grad
         # Treat None and zero grad as same

From 770d01732bf491174955cba8474e0a12c6b198af Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Fri, 11 Nov 2022 08:21:48 -0800
Subject: [PATCH 0832/1922] [15/N] Add allreduce_coalesced custom op with
 CPU/CUDA implementations (#88846)

Differential Revision: [D41227740](https://our.internmc.facebook.com/intern/diff/D41227740)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88846
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_common.py    | 15 +++++++++++
 test/distributed/test_c10d_gloo.py      |  4 +++
 test/distributed/test_c10d_nccl.py      |  5 ++++
 torch/csrc/distributed/c10d/Ops.cpp     | 36 +++++++++++++++++++++++++
 torch/csrc/distributed/c10d/Ops.hpp     |  5 ++++
 torch/csrc/distributed/c10d/OpsImpl.cpp | 34 +++++++++++++++++++++++
 torch/csrc/distributed/c10d/init.cpp    |  6 ++---
 7 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index cf46f89b353cd..77ee7487a0afa 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1503,6 +1503,21 @@ def _test_collectives(self, backend):
             with self.subTest(collective=collective, args=args):
                 self._call_collective_with_varying_tensors(backend, collective, *args)
 
+    def _test_allreduce_coalesced(self, backend):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            backend,
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        # TODO: this will be updated in the future to not be backend specific
+        device = "cuda" if backend == "nccl" else "cpu"
+        tensors = [torch.ones(10, 10, device=torch.device(device))]
+        dist.all_reduce_coalesced(tensors, dist.ReduceOp.SUM)
+        for tensor in tensors:
+            self.assertEqual(tensor, torch.ones(10, 10) * self.world_size)
+
 class CompilerTest(MultiProcessTestCase):
     def setUp(self):
         super(CompilerTest, self).setUp()
diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index e0c7c64f7b836..ba214a02696f9 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -2363,6 +2363,10 @@ class GlooProcessGroupWithDispatchedCollectivesTests(test_c10d_common.ProcessGro
     def test_collectives(self):
         self._test_collectives(backend="gloo")
 
+    @requires_gloo()
+    def test_allreduce_coalesced(self):
+        self._test_allreduce_coalesced(backend="gloo")
+
 class CompilerTest(test_c10d_common.CompilerTest):
 
     @property
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 5d412dd3fb1b0..b3790b082ed57 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -2953,6 +2953,11 @@ class NcclProcessGroupWithDispatchedCollectivesTests(test_c10d_common.ProcessGro
     def test_collectives(self):
         self._test_collectives(backend="nccl")
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(1)
+    def test_allreduce_coalesced(self):
+        self._test_allreduce_coalesced(backend="nccl")
+
 if __name__ == "__main__":
     assert (
         not torch.cuda._initialized
diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp
index ea77bb337b4a8..15e186fe3d22d 100644
--- a/torch/csrc/distributed/c10d/Ops.cpp
+++ b/torch/csrc/distributed/c10d/Ops.cpp
@@ -40,6 +40,19 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> allreduce_(
       std::move(tensor_vec), work);
 }
 
+c10::intrusive_ptr<Work> allreduce_coalesced_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  auto tensor_vec = tensors.vec();
+  AllreduceCoalescedOptions opts = AllreduceCoalescedOptions{};
+  opts.reduceOp = *reduce_op.get();
+  opts.timeout = std::chrono::milliseconds(timeout);
+
+  return process_group->allreduce_coalesced(tensor_vec, opts);
+}
+
 c10::intrusive_ptr<Work> reduce_(
     at::TensorList tensors,
     const c10::intrusive_ptr<ProcessGroup>& process_group,
@@ -177,6 +190,10 @@ TORCH_LIBRARY(c10d, m) {
   m.def(
       "allreduce_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, allreduce_));
+  m.def(
+      "allreduce_coalesced_",
+      dispatch(
+          c10::DispatchKey::CompositeExplicitAutograd, allreduce_coalesced_));
   m.def(
       "allgather_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, allgather_));
@@ -249,6 +266,25 @@ c10::intrusive_ptr<Work> allreduce(
       opts.timeout.count()));
 }
 
+c10::intrusive_ptr<Work> allreduce_coalesced(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::TensorList tensors,
+    const AllreduceCoalescedOptions& opts) {
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("c10d::allreduce_coalesced_", "")
+                       .typed<c10::intrusive_ptr<::c10d::Work>(
+                           at::TensorList,
+                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
+                           const c10::intrusive_ptr<::c10d::ReduceOp>&,
+                           int64_t)>();
+
+  return op.call(
+      tensors,
+      process_group,
+      c10::make_intrusive<ReduceOp>(opts.reduceOp),
+      opts.timeout.count());
+}
+
 c10::intrusive_ptr<Work> allgather(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<std::vector<at::Tensor>>& output_tensors,
diff --git a/torch/csrc/distributed/c10d/Ops.hpp b/torch/csrc/distributed/c10d/Ops.hpp
index adc64066a885e..8ef78126e5b9e 100644
--- a/torch/csrc/distributed/c10d/Ops.hpp
+++ b/torch/csrc/distributed/c10d/Ops.hpp
@@ -21,6 +21,11 @@ TORCH_API c10::intrusive_ptr<Work> allreduce(
     at::TensorList tensors,
     const AllreduceOptions& opts = {});
 
+TORCH_API c10::intrusive_ptr<Work> allreduce_coalesced(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::TensorList tensors,
+    const AllreduceCoalescedOptions& opts = {});
+
 TORCH_API c10::intrusive_ptr<Work> allgather(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<std::vector<at::Tensor>>& output_tensors,
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index 03ec6892857e7..94f5febec14d0 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -149,6 +149,32 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> allreduce_cuda_(
       std::move(tensor_vec), work);
 }
 
+c10::intrusive_ptr<Work> allreduce_coalesced_cpu_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  auto tensor_vec = tensors.vec();
+  AllreduceCoalescedOptions opts = AllreduceCoalescedOptions{};
+  opts.reduceOp = *reduce_op.get();
+  opts.timeout = std::chrono::milliseconds(timeout);
+
+  return process_group->allreduce_coalesced(tensor_vec, opts);
+}
+
+c10::intrusive_ptr<Work> allreduce_coalesced_cuda_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  auto tensor_vec = tensors.vec();
+  AllreduceCoalescedOptions opts = AllreduceCoalescedOptions{};
+  opts.reduceOp = *reduce_op.get();
+  opts.timeout = std::chrono::milliseconds(timeout);
+
+  return process_group->allreduce_coalesced(tensor_vec, opts);
+}
+
 std::tuple<std::vector<std::vector<at::Tensor>>, c10::intrusive_ptr<Work>>
 allgather_cpu_(
     const std::vector<std::vector<at::Tensor>>& output_tensors,
@@ -367,6 +393,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("allreduce_", allreduce_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("allreduce_coalesced_", allreduce_coalesced_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("allreduce_coalesced_", allreduce_coalesced_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("allgather_", allgather_cpu_);
 }
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 6515a3d9a87d4..673f481d60251 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1134,10 +1134,10 @@ that adds a prefix to each key inserted to the store.
 
           .def(
               "allreduce_coalesced",
-              [](::c10d::ProcessGroup& self,
-                 std::vector<at::Tensor>& xs,
+              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
+                 const std::vector<at::Tensor>& xs,
                  ::c10d::AllreduceCoalescedOptions opts) {
-                return self.allreduce_coalesced(xs, opts);
+                return ::c10d::ops::allreduce_coalesced(self, xs, opts);
               },
               py::arg("tensors"),
               py::arg("opts") = ::c10d::AllreduceCoalescedOptions(),

From a7dd636100a51d181b469842fcb15f064c135ce8 Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Sat, 12 Nov 2022 00:41:57 +0100
Subject: [PATCH 0833/1922] [primTorch] Improve `narrow` and `narrow_copy`:
 refs, tests, docs (#87045)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87045
Approved by: https://github.com/mruberry
---
 aten/src/ATen/native/TensorShape.cpp          |  13 +-
 test/test_meta.py                             |   1 -
 torch/_refs/__init__.py                       |  38 +++-
 torch/_tensor_docs.py                         |  13 +-
 torch/_torch_docs.py                          |  27 +--
 .../_internal/common_methods_invocations.py   | 163 ++++++++++++++----
 6 files changed, 188 insertions(+), 67 deletions(-)

diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index deb9b949aa5d3..e8c87a2f1f5ce 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -1196,6 +1196,8 @@ Tensor narrow_copy_dense(const Tensor& self, int64_t dim, int64_t start, int64_t
   return self.narrow(dim, start, length).clone(at::MemoryFormat::Contiguous);
 }
 
+// Should just use narrow_copy_out, but this API is used internally at Meta:
+// https://github.com/pytorch/pytorch/pull/87045#issuecomment-1309353561
 Tensor narrow_copy_dense_cpu(const Tensor& self, int64_t dim, int64_t start, int64_t length){
   auto output = at::empty_like(self);
   return narrow_copy_dense_cpu_out(self, dim, start, length, output);
@@ -1205,9 +1207,10 @@ Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_
   int64_t allDim = self.dim();
   int64_t end = start+length;
   TORCH_CHECK(allDim > 0, "narrow() cannot be applied to a 0-dim tensor.");
+  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
   TORCH_CHECK(dim >= 0 && dim < allDim,
     "Dimension ", dim, " out of range. Expecting 0 <= dim < ", allDim, ".");
-  TORCH_CHECK(start >= 0 && length >= 0 && end <= self.size(dim),
+  TORCH_CHECK(start >= 0 && end <= self.size(dim),
     "Invalid range to narrow. range(start, start+length) must be a subset of range(0, ", self.size(dim), ").")
   Tensor indices = self._indices();
   int64_t sparse_dim = self.sparse_dim();
@@ -1235,6 +1238,8 @@ Tensor narrow_copy_sparse(const Tensor& self, int64_t dim, int64_t start, int64_
   return newTensor._coalesced_(self.is_coalesced());
 }
 
+// Should just use narrow_copy_out, but this API is used internally at Meta:
+// https://github.com/pytorch/pytorch/pull/87045#issuecomment-1309353561
 Tensor& narrow_copy_dense_cpu_out(
   const Tensor& self, int64_t dim, int64_t start, int64_t length, Tensor& output
 ) {
@@ -1318,22 +1323,24 @@ Tensor& narrow_copy_dense_cpu_out(
 
 Tensor narrow(const Tensor& self, int64_t dim, int64_t start, int64_t length) {
   TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
+  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
   auto cur_size = self.size(dim);
   if (start != cur_size) {  // start being the end is valid, but not a valid dim specification.
     start = maybe_wrap_dim(start, cur_size);
   }
-  TORCH_CHECK(length >= 0 && start <= cur_size - length,
+  TORCH_CHECK(start <= cur_size - length,
            "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ").");
   return at::slice(self, dim, start, start + length, 1);
 }
 
 Tensor narrow_symint(const Tensor& self, int64_t dim, SymInt start, SymInt length) {
   TORCH_CHECK(self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor.");
+  TORCH_CHECK(length >= 0, "narrow(): length must be non-negative.");
   auto cur_size = self.sym_size(dim);
   if (start != cur_size) {  // start being the end is valid, but not a valid dim specification.
     start = maybe_wrap_dim(start, cur_size);
   }
-  TORCH_CHECK(length >= 0 && start <= cur_size - length,
+  TORCH_CHECK(start <= cur_size - length,
            "start (", start, ") + length (", length, ") exceeds dimension size (", cur_size, ").");
   return at::slice_symint(self, dim, start, start + length, 1);
 }
diff --git a/test/test_meta.py b/test/test_meta.py
index ef25d184c8428..ae248a90cffb7 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -745,7 +745,6 @@ def run_meta_crossref(
 }
 
 meta_function_device_skips['cpu'] = {
-    torch.narrow_copy: {b8, bf16, c128, c32, c64, f16, f32, f64, i16, i32, i64, i8, u8},
     torch.native_batch_norm: {f32, f64},
 }
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 43b0c74192dee..70edbff2237f2 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2750,19 +2750,39 @@ def flipud(a: TensorLikeType) -> TensorLikeType:
 
 
 # CompositeImplicitAutograd - don't register decomp
-def narrow(a: TensorLikeType, dim: int, start: int, length: int) -> TensorLikeType:
+def narrow(
+    a: TensorLikeType, dim: int, start: Union[int, TensorLikeType], length: int
+) -> TensorLikeType:
+    # Supports Tensor overload that was added for XLA:
+    # https://github.com/pytorch/pytorch/issues/31558
+    if isinstance(start, TensorLike):
+        check(
+            start.dim() == 0 and utils.is_integer_dtype(start.dtype),
+            lambda: "start must be an 0-dim integral Tensor.",
+        )
+        start = start.item()  # type: ignore[assignment]
+    check(a.dim() > 0, lambda: "narrow() cannot be applied to a 0-dim tensor.")
+    check(length >= 0, lambda: "narrow(): length must be non-negative.")
     dim = utils.canonicalize_dim(a.ndim, dim)
+    dim_length = a.size(dim)
+    # Start being the end is usually invalid since it's out of bounds. So it's
+    # not allowed by canonicalize_dim. But for narrow it's valid as long as
+    # the length is 0, which is handled by the check below.
+    if start != dim_length:
+        # Negative start means indexing from the end of dim.
+        # Note: a dimension isn't being canonicalized here, this reuses
+        # canonicalize_dim because the semantics are similar.
+        start = utils.canonicalize_dim(dim_length, start)  # type: ignore[arg-type]
+    check(
+        start <= dim_length - length,  # type: ignore[arg-type]
+        lambda: f"start ({start}) + length ({length}) exceeds dimension size ({dim_length}).",
+    )
     return prims.slice_in_dim(a, start, start + length, axis=dim)
 
 
-@register_decomposition(torch.ops.aten.narrow_copy)
-@out_wrapper()
-def narrow_copy(a: TensorLikeType, dim: int, start: int, length: int) -> TensorLikeType:
-    # TODO: This must return a sparse tensor if the input is sparse, but refs
-    # have no sparse support.  See narrow_copy_sparse in core.
-    if a.is_sparse:
-        raise NotImplementedError("narrow_copy ref doesn't support sparse tensors")
-    return torch.clone(torch.narrow(a=a, dim=dim, start=start, length=length))  # type: ignore[call-overload]
+# TODO: This must return a sparse tensor if the input is sparse, but refs have
+# no sparse support. See narrow_copy_sparse in core.
+narrow_copy = _make_copy_from_view(narrow)
 
 
 def _normalize(
diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index 8c734a1f3774b..726ae5137e6a4 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -3436,18 +3436,7 @@ def callable(a, b) -> number
     r"""
 narrow(dimension, start, length) -> Tensor
 
-See :func:`torch.narrow`
-
-Example::
-
-    >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
-    >>> x.narrow(0, 0, 2)
-    tensor([[ 1,  2,  3],
-            [ 4,  5,  6]])
-    >>> x.narrow(1, 1, 2)
-    tensor([[ 2,  3],
-            [ 5,  6],
-            [ 8,  9]])
+See :func:`torch.narrow`.
 """,
 )
 
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 40375bae3e274..2ff2e9be315de 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -7980,8 +7980,10 @@ def merge_dicts(*dicts):
 Args:
     input (Tensor): the tensor to narrow
     dim (int): the dimension along which to narrow
-    start (Tensor or int): the starting dimension
-    length (int): the distance to the ending dimension
+    start (int or Tensor): index of the element to start the narrowed dimension
+        from. Can be negative, which means indexing from the end of `dim`. If
+        `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed)
+    length (int): length of the narrowed dimension, must be weakly positive
 
 Example::
 
@@ -7993,6 +7995,10 @@ def merge_dicts(*dicts):
     tensor([[ 2,  3],
             [ 5,  6],
             [ 8,  9]])
+    >>> torch.narrow(x, -1, torch.tensor(-1), 1)
+    tensor([[3],
+            [6],
+            [9]])
 """,
 )
 
@@ -8008,8 +8014,9 @@ def merge_dicts(*dicts):
 Args:
     input (Tensor): the tensor to narrow
     dim (int): the dimension along which to narrow
-    start (int): the starting offset
-    length (int): the distance to the ending dimension
+    start (int): index of the element to start the narrowed dimension from. Can
+        be negative, which means indexing from the end of `dim`
+    length (int): length of the narrowed dimension, must be weakly positive
 
 Keyword args:
     {out}
@@ -8027,13 +8034,13 @@ def merge_dicts(*dicts):
     >>> s = torch.arange(16).reshape(2, 2, 2, 2).to_sparse(2)
     >>> torch.narrow_copy(s, 0, 0, 1)
     tensor(indices=tensor([[0, 0],
-                        [0, 1]]),
-        values=tensor([[[0, 1],
-                        [2, 3]],
+                           [0, 1]]),
+           values=tensor([[[0, 1],
+                           [2, 3]],
 
-                        [[4, 5],
-                        [6, 7]]]),
-        size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo)
+                          [[4, 5],
+                           [6, 7]]]),
+           size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo)
 
 .. seealso::
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 5178ec978bd1c..8ab1ea8a047cd 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -4391,29 +4391,127 @@ def sample_repeat_tile(op_info, device, dtype, requires_grad, **kwargs):
         yield SampleInput(make_arg(shape), rep_dim)
 
 
-def sample_inputs_narrow_copy(op_info, device, dtype, requires_grad, **kwargs):
+def sample_inputs_narrow_narrow_copy(op_info, device, dtype, requires_grad, *, is_narrow, **kwargs):
     shapes_and_args = (
-        ((S, S, S), (1, 2, 2)),
-        ((S, S, S), (-1, 2, 2)),
-        ((S, S, S), (1, 0, 0)),
-        ((S, S, S), (-1, 0, 0)),
-        ((S, S, S), (2, 1, 2)),
+        ((S, S, S), 1, 2, 2),
+        ((S, S, S), -1, 2, 2),
+        ((S, S, S), 1, 0, 0),
+        ((S, S, S), -1, 0, 0),
+        ((S, S, S), 2, 1, 2),
     )
 
-    for shape, args in shapes_and_args:
+    for shape, dim, start, length in shapes_and_args:
         tensor = make_tensor(shape, dtype=dtype, device=device, low=None, high=None,
                              requires_grad=requires_grad)
-        yield SampleInput(tensor, args=args)
+        yield SampleInput(tensor, dim, start, length)
+        # narrow also accepts the start argument being a Tensor
+        if is_narrow:
+            yield SampleInput(tensor, dim, torch.tensor(start), length)
 
+def reference_inputs_narrow_narrow_copy(op_info, device, dtype, requires_grad, *, is_narrow, **kwargs):
+    yield from sample_inputs_narrow_narrow_copy(op_info, device, dtype, requires_grad, is_narrow=is_narrow, **kwargs)
 
-def sample_inputs_narrow(op_info, device, dtype, requires_grad, **kwargs):
-    '''
-    sample_inputs_narrow accepts the same inputs as narrow_copy, in addition
-    narrow also accepts `start` argument to be a Tensor.
-    '''
-    for sample in sample_inputs_narrow_copy(op_info, device, dtype, requires_grad, **kwargs):
-        yield sample
-        yield SampleInput(sample.input, args=(sample.args[0], torch.tensor(sample.args[1]), sample.args[2]))
+    shapes_and_args = (
+        # 1-dim
+        ((M,), 0, 0, 0),    # 0 elems from the left
+        ((M,), -1, -1, 0),  # 0 elems from the right
+        ((M,), 0, 5, 3),    # 3 elems from the left
+        ((M,), 0, -5, 2),   # 2 elems from the right
+        ((M,), -1, 0, M),   # M elems from the left
+        ((M,), 0, -M, M),   # M elems from the right
+
+        # 2-dim
+        ((M, S), 1, 0, 0),    # dim 1, 0 elems from the left
+        ((S, M), -2, -1, 0),  # dim 0, 0 elems from the right
+        ((L, S), 1, 2, 3),    # dim 1, 3 elems from the left
+        ((L, S), -1, 3, 2),   # dim 1, 2 elems from the left
+        ((M, L), 0, 0, M),    # dim 0, M elems from the left
+        ((M, L), -1, -L, L),  # dim 1, L elems from the right
+
+        # 3-dim
+        ((L, M, S), 2, 0, 0),    # dim 2, 0 elems from the left
+        ((M, S, L), -1, -1, 0),  # dim 2, 0 elems from the right
+        ((S, L, M), 2, 0, M),    # dim 2, M elems from the left
+        ((L, S, M), -1, -M, M),  # dim 2, M elems from the right
+        ((S, L, M), 1, 0, 0),    # dim 1, 0 elems from the left
+        ((S, L, M), 0, 2, 1),    # dim 0, 1 elem from the left
+        ((M, S, M), -1, -5, 4),  # dim 2, 4 elems from the right
+    )
+
+    for shape, dim, start, length in shapes_and_args:
+        tensor = make_tensor(shape, dtype=dtype, device=device, low=None, high=None,
+                             requires_grad=requires_grad)
+        yield SampleInput(tensor, dim, start, length)
+        # narrow also accepts the start argument being a Tensor
+        if is_narrow:
+            yield SampleInput(tensor, dim, torch.tensor(start), length)
+
+def error_inputs_narrow_narrow_copy(op_info, device, *, is_narrow, is_ref):
+    make_arg = partial(make_tensor, device=device, dtype=torch.float32)
+
+    # 0-dim
+    yield ErrorInput(SampleInput(make_arg(()), 0, 0, 1),
+                     error_type=RuntimeError,
+                     error_regex=r"narrow\(\) cannot be applied to a 0-dim tensor\.")
+
+    # out of bounds dim
+    if not is_narrow and not is_ref and torch.device(device).type == 'cpu':
+        # narrow_copy_dense_cpu_out
+        yield ErrorInput(SampleInput(make_arg((M, S, L)), 3, 0, 0),
+                         error_type=RuntimeError,
+                         error_regex=r"Expected dim < static_cast<int64_t>\(self_sizes.size\(\)\) to be true, but got false\.")
+    else:
+        yield ErrorInput(SampleInput(make_arg((M, S, L)), 3, 0, 0),
+                         error_type=IndexError,
+                         error_regex=r"Dimension out of range \(expected to be in range of \[-3, 2\], but got 3\)")
+    # out of bounds dim (negative)
+    yield ErrorInput(SampleInput(make_arg((L, S, M)), -4, 0, 0),
+                     error_type=IndexError,
+                     error_regex=r"Dimension out of range \(expected to be in range of \[-3, 2\], but got -4\)")
+
+    # out of bounds start
+    if not is_narrow and not is_ref and torch.device(device).type == 'cpu':
+        # narrow_copy_dense_cpu_out
+        yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, M + 1, 0),
+                         error_type=RuntimeError,
+                         error_regex=r"start \(11\) \+ length \(0\) exceeds dimension size \(10\)\.")
+    else:
+        yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, M + 1, 0),
+                         error_type=IndexError,
+                         error_regex=r"Dimension out of range \(expected to be in range of \[-10, 9\], but got 11\)")
+    # out of bounds start (negative)
+    yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, -M - 1, 0),
+                     error_type=IndexError,
+                     error_regex=r"Dimension out of range \(expected to be in range of \[-10, 9\], but got -11\)")
+
+    # out of bounds length
+    yield ErrorInput(SampleInput(make_arg((S, L, M)), 2, 0, M + 1),
+                     error_type=RuntimeError,
+                     error_regex=r"start \(0\) \+ length \(11\) exceeds dimension size \(10\)\.")
+    # out of bounds length (negative)
+    if not is_narrow and not is_ref and torch.device(device).type == 'cpu':
+        # narrow_copy_dense_cpu_out
+        yield ErrorInput(SampleInput(make_arg((M,)), 0, 0, -1),
+                         error_type=RuntimeError,
+                         error_regex=r"start \(0\) \+ length \(-1\) exceeds dimension size \(10\)\.")
+    else:
+        yield ErrorInput(SampleInput(make_arg((M,)), 0, 0, -1),
+                         error_type=RuntimeError,
+                         error_regex=r"narrow\(\): length must be non-negative\.")
+
+    # Test Tensor overload that was added for XLA. Start must be an 0-dim
+    # integral Tensor. narrow_copy doesn't have this overload.
+    # https://github.com/pytorch/pytorch/issues/31558
+    if is_narrow:
+        # *1-dim* integral Tensor
+        yield ErrorInput(SampleInput(make_arg((L, M, S)), 1, make_arg(S, dtype=torch.int), 2),
+                         error_type=RuntimeError,
+                         error_regex=r"start must be an 0-dim integral Tensor\.")
+
+        # 0-dim *bool* Tensor (bools are not allowed)
+        yield ErrorInput(SampleInput(make_arg((L, M, S)), -3, make_arg((), dtype=torch.bool), 3),
+                         error_type=RuntimeError,
+                         error_regex=r"start must be an 0-dim integral Tensor\.")
 
 
 def sample_trapezoid(op_info, device, dtype, requires_grad, **kwargs):
@@ -12407,7 +12505,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_out=False,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
-           sample_inputs_func=sample_inputs_narrow,
+           sample_inputs_func=partial(sample_inputs_narrow_narrow_copy, is_narrow=True),
+           reference_inputs_func=partial(reference_inputs_narrow_narrow_copy, is_narrow=True),
+           error_inputs_func=partial(error_inputs_narrow_narrow_copy, is_narrow=True, is_ref=False),
            skips=(
                # Use of .item()
                DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_operator'),
@@ -12423,15 +12523,16 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_fwgrad_bwgrad=False,
            supports_autograd=False,
            # https://github.com/pytorch/pytorch/issues/86931
-           sample_inputs_func=sample_inputs_narrow_copy,
+           sample_inputs_func=partial(sample_inputs_narrow_narrow_copy, is_narrow=False),
+           reference_inputs_func=partial(reference_inputs_narrow_narrow_copy, is_narrow=False),
+           error_inputs_func=partial(error_inputs_narrow_narrow_copy, is_narrow=False, is_ref=False),
            skips=(
                # https://github.com/pytorch/pytorch/issues/84577
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
-               # Not implemented
-               DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_meta_outplace', device_type='cuda'),
-               DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_outplace', device_type='cuda'),
-               DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta', device_type='cuda'),
+               # Lazy tensor failures: mutating and aliasing ops should all have codegen'd kernels
+               DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness'),
+               DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness_with_reusing_ir'),
            )),
     UnaryUfuncInfo('neg',
                    aliases=('negative', ),
@@ -18061,22 +18162,20 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.narrow",
         torch_opinfo_name="narrow",
         supports_nvfuser=False,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_meta'),
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_executor'),
-        )
-    ),
-    PythonRefInfo(
-        "_refs.nn.functional.group_norm",
-        torch_opinfo_name="nn.functional.group_norm",
-        supports_nvfuser=False,
-        validate_view_consistency=False,
+        error_inputs_func=partial(error_inputs_narrow_narrow_copy, is_narrow=True, is_ref=True),
     ),
     PythonRefInfo(
         "_refs.narrow_copy",
         torch_opinfo_name="narrow_copy",
         supports_out=True,
         supports_nvfuser=False,
+        error_inputs_func=partial(error_inputs_narrow_narrow_copy, is_narrow=False, is_ref=True),
+    ),
+    PythonRefInfo(
+        "_refs.nn.functional.group_norm",
+        torch_opinfo_name="nn.functional.group_norm",
+        supports_nvfuser=False,
+        validate_view_consistency=False,
     ),
     PythonRefInfo(
         "_refs.native_layer_norm",

From b10918528650660e1d402d5bc3090dedd382cf5e Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Fri, 11 Nov 2022 11:51:22 -0500
Subject: [PATCH 0834/1922] Turn internal assert when saved tensor is detached
 inplace into torch check (#88860)

Fixes https://github.com/pytorch/pytorch/issues/88809

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88860
Approved by: https://github.com/albanD
---
 test/test_autograd.py                  | 14 ++++++++++++++
 torch/csrc/autograd/saved_variable.cpp | 11 ++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index e08047860e423..33cf188af0659 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -6776,6 +6776,20 @@ def inplace_double(x):
         # not leaf, not output
         test(lambda: (1 + torch.randn(5, requires_grad=True)), False)
 
+    def test_saved_variable_saved_original_inplace_detach(self):
+        # Detaching a tensor that is saved input raises
+        a = torch.tensor(1., requires_grad=True).clone()
+        b = a.sin()
+        a.detach_()
+        with self.assertRaisesRegex(RuntimeError, "Trying to use a saved tensor that has been detached"):
+            b.backward()
+
+        # Detaching a tensor that is saved as output is OK
+        a = torch.tensor(1., requires_grad=True).clone()
+        b = a.exp()
+        a.detach_()
+        b.backward()
+
     def test_saved_variable_packing_unpacking_did_not_save_original_with_hooks(self):
         # Tests that packing/unpacking a SavedVariable works correctly with user-defined hooks
         # The saved_original / did_not_save_original distinction corresponds to the `save_original`
diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp
index a2e0f05b63943..d438205e8947f 100644
--- a/torch/csrc/autograd/saved_variable.cpp
+++ b/torch/csrc/autograd/saved_variable.cpp
@@ -144,7 +144,16 @@ Variable SavedVariable::unpack(std::shared_ptr<Node> saved_for) const {
                 : grad_fn_;
 
   if (!is_leaf_ && !grad_fn) {
-    TORCH_INTERNAL_ASSERT(saved_for, "No grad_fn for non-leaf saved tensor");
+    // This issue was introduced when we added logic to save the original
+    // because now we rely on data_.grad_fn(), but can be unreliable if the
+    // autograd_meta of that saved tensor is cleared with an in-place detach.
+    // As a simple fix, we choose to disallow that behavior here even though
+    // it makes behavior inconsistent depending on whether you are saving
+    // input or output.
+    TORCH_CHECK(
+        saved_for,
+        "Trying to use a saved tensor that has been detached in-place, i.e. with .detach_()."
+        "This is not supported, please use out-of-place `.detach()` instead");
     grad_fn = std::move(saved_for);
   }
 

From 21dc6665aa46efab77d5e62c3c08d270f0f6fe8d Mon Sep 17 00:00:00 2001
From: ydwu4 <yidi@meta.com>
Date: Sat, 12 Nov 2022 20:00:51 +0000
Subject: [PATCH 0835/1922] torchdynamo support self.modules() for nn_module
 (#88695)

This PR allows models to call self.modules() during dynamo tracing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88695
Approved by: https://github.com/voznesenskym
---
 test/dynamo/test_repros.py           | 20 ++++++++++++++++++++
 torch/_dynamo/guards.py              |  2 +-
 torch/_dynamo/variables/nn_module.py |  2 ++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 2103e075fffc9..913d59322ac76 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1792,6 +1792,26 @@ def fn(x):
         res = opt_fn(a)
         self.assertTrue(same(ref, res))
 
+    def test_modules(self):
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc = torch.nn.Linear(4, 3)
+
+            def forward(self, inp):
+                res = torch.zeros(3, 3)
+                for mod in self.modules():
+                    res += self.fc(inp)
+                return res
+
+        mod = Foo()
+        args = (torch.ones(3, 4),)
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt, nopython=True)(mod)
+        self.assertTrue(same(mod(*args), opt_mod(*args)))
+        self.assertEqual(cnt.op_count, 5)
+        self.assertEqual(cnt.frame_count, 1)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index 382734412b2ba..d4903964aac6c 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -543,7 +543,7 @@ def __init__(self, expr_to_tensor_ref, id_to_name_map):
         self.id_to_name_map = id_to_name_map
 
     def _print_Symbol(self, expr) -> str:
-        assert isinstance(expr, sympy.core.symbol.Symbol)
+        assert isinstance(expr, sympy.Symbol)
         if expr == 0:
             return "0"
         if expr == 1:
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 6f7c2ff287373..1922980fc957f 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -337,6 +337,8 @@ def named_embed(name, obj):
             ):
                 result.append(named_embed(name, submod))
             return ListIteratorVariable(result, mutable_local=MutableLocal(), **options)
+        elif name == "modules":
+            return wrap_values(module.named_modules())
         elif name == "parameters":
             return wrap_values(module.named_parameters(**get_kwargs("recurse")))
         elif name == "values":

From 94e38d002937735fc36b3046089d79ec7506a427 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Fri, 11 Nov 2022 11:44:00 -0800
Subject: [PATCH 0836/1922] [16/N] Add _allgather_base custom op with CPU/CUDA
 implementation (#88889)

Differential Revision: [D41227739](https://our.internmc.facebook.com/intern/diff/D41227739)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88889
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_nccl.py      | 17 +++++++++++++++++
 torch/csrc/distributed/c10d/Ops.cpp     | 25 +++++++++++++++++++++++++
 torch/csrc/distributed/c10d/Ops.hpp     |  6 ++++++
 torch/csrc/distributed/c10d/OpsImpl.cpp | 22 ++++++++++++++++++++++
 torch/csrc/distributed/c10d/init.cpp    |  8 +++++++-
 5 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index b3790b082ed57..c514ea4ab31fd 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -2958,6 +2958,23 @@ def test_collectives(self):
     def test_allreduce_coalesced(self):
         self._test_allreduce_coalesced(backend="nccl")
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(1)
+    def test_allgather_base(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "nccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        device = "cuda"
+        tensor = torch.ones(10, 10, device=torch.device(device))
+        output_tensor = torch.zeros(10, 10, device=torch.device(device))
+        dist.all_gather_into_tensor(output_tensor, tensor)
+        self.assertEqual(output_tensor, tensor)
+
+
 if __name__ == "__main__":
     assert (
         not torch.cuda._initialized
diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp
index 15e186fe3d22d..f825afca2a1d9 100644
--- a/torch/csrc/distributed/c10d/Ops.cpp
+++ b/torch/csrc/distributed/c10d/Ops.cpp
@@ -88,6 +88,13 @@ allgather_(
           output_tensors, work);
 }
 
+c10::intrusive_ptr<Work> _allgather_base_(
+    at::Tensor& output_tensor,
+    at::Tensor& input_tensor,
+    const c10::intrusive_ptr<ProcessGroup>& process_group) {
+  return process_group->_allgather_base(output_tensor, input_tensor);
+}
+
 std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> reduce_scatter_(
     const std::vector<at::Tensor>& output_tensors,
     const std::vector<std::vector<at::Tensor>>& input_tensors,
@@ -197,6 +204,9 @@ TORCH_LIBRARY(c10d, m) {
   m.def(
       "allgather_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, allgather_));
+  m.def(
+      "_allgather_base_",
+      dispatch(c10::DispatchKey::CompositeExplicitAutograd, _allgather_base_));
   m.def(
       "reduce_scatter_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, reduce_scatter_));
@@ -303,6 +313,21 @@ c10::intrusive_ptr<Work> allgather(
       output_tensors, input_tensors, process_group, opts.timeout.count()));
 }
 
+c10::intrusive_ptr<Work> _allgather_base(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::Tensor& output_tensor,
+    at::Tensor& input_tensor,
+    const AllgatherOptions& opts) {
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("c10d::_allgather_base_", "")
+                       .typed<c10::intrusive_ptr<Work>(
+                           at::Tensor&,
+                           at::Tensor&,
+                           const c10::intrusive_ptr<::c10d::ProcessGroup>&)>();
+
+  return op.call(output_tensor, input_tensor, process_group);
+}
+
 c10::intrusive_ptr<Work> reduce_scatter(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<at::Tensor>& output_tensors,
diff --git a/torch/csrc/distributed/c10d/Ops.hpp b/torch/csrc/distributed/c10d/Ops.hpp
index 8ef78126e5b9e..72f09e341d7df 100644
--- a/torch/csrc/distributed/c10d/Ops.hpp
+++ b/torch/csrc/distributed/c10d/Ops.hpp
@@ -32,6 +32,12 @@ TORCH_API c10::intrusive_ptr<Work> allgather(
     const std::vector<at::Tensor>& input_tensors,
     const AllgatherOptions& opts = {});
 
+TORCH_API c10::intrusive_ptr<Work> _allgather_base(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::Tensor& outputTensor,
+    at::Tensor& inputTensor,
+    const AllgatherOptions& opts = {});
+
 TORCH_API c10::intrusive_ptr<Work> reduce_scatter(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<at::Tensor>& output_tensors,
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index 94f5febec14d0..78e26c9656d8d 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -211,6 +211,20 @@ allgather_cuda_(
           output_tensors, work);
 }
 
+c10::intrusive_ptr<Work> _allgather_base_cpu_(
+    at::Tensor& output_tensor,
+    at::Tensor& input_tensor,
+    const c10::intrusive_ptr<ProcessGroup>& process_group) {
+  return process_group->_allgather_base(output_tensor, input_tensor);
+}
+
+c10::intrusive_ptr<Work> _allgather_base_cuda_(
+    at::Tensor& output_tensor,
+    at::Tensor& input_tensor,
+    const c10::intrusive_ptr<ProcessGroup>& process_group) {
+  return process_group->_allgather_base(output_tensor, input_tensor);
+}
+
 std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>
 reduce_scatter_cpu_(
     const std::vector<at::Tensor>& output_tensors,
@@ -409,6 +423,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("allgather_", allgather_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("_allgather_base_", _allgather_base_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("_allgather_base_", _allgather_base_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("reduce_scatter_", reduce_scatter_cpu_);
 }
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 673f481d60251..2424506eef0ff 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1187,7 +1187,13 @@ that adds a prefix to each key inserted to the store.
 
           .def(
               "_allgather_base",
-              &::c10d::ProcessGroup::_allgather_base,
+              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
+                 at::Tensor& output_tensor,
+                 at::Tensor& input_tensor,
+                 const ::c10d::AllgatherOptions& opts) {
+                return ::c10d::ops::_allgather_base(
+                    self, output_tensor, input_tensor, opts);
+              },
               py::arg("output"),
               py::arg("input"),
               py::arg("opts") = ::c10d::AllgatherOptions(),

From 3bee3e5fbaec3553c27548fd2ba0d95249ed8f54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleksandar=20Samard=C5=BEi=C4=87?=
 <asamardzic@quansight.com>
Date: Sun, 13 Nov 2022 00:31:11 +0000
Subject: [PATCH 0837/1922] Vectorized CPU code implementing left shift
 operator. (#88607)

This PR adds vectorized implementation for CPU version of left shift operator.

All of the tests run by `pytest test/test_ops.py -vk left_shift` pass.

Here are some additional details:

<details>
<summary>
Benchmarking script (writen by Philip, with small tweaks by Mario) comparing left shifts with multiplications - on par now
</summary>

```python
import torch
from torch import Tensor
from torch.utils.benchmark import Timer, Compare
from itertools import product
from functools import partial

# These functions exist, because torch.jit.script does not support `torch.iinfo`
def _num_value_bits(dtype):
    if dtype == torch.uint8:
        return 8
    else:  # torch.int32
        return 31

def _max_value(dtype):
    if dtype == torch.uint8:
        return 255
    else:  # torch.int32
        return 2147483647

def bitshift(image, dtype):
    num_value_bits_input = _num_value_bits(image.dtype)
    num_value_bits_output = _num_value_bits(dtype)

    return image.to(dtype).bitwise_left_shift_(num_value_bits_output - num_value_bits_input)

def mul(image, dtype):
    input_max = float(_max_value(image.dtype))
    output_max = float(_max_value(dtype))

    factor = int((output_max + 1) // (input_max + 1))
    image = image.to(dtype)
    return image * factor

size = 256
image = torch.randint(0, 256, (3, size, size), dtype=torch.uint8)
dtype = torch.int32

def gen_inputs():
    devices = ("cpu",)
    fns = (mul, bitshift)
    threads = (1,)
    for device, fn, threads in product(devices, fns, threads):
        yield f"Bitshift {device} {image.dtype}", str(tuple(image.shape)), threads, fn, image, dtype

def benchmark(label, sub_label, threads, f, *args, **kwargs):
    return Timer("f(*args, **kwargs)",
                 globals=locals(),
                 label=label,
                 description=f.__name__,
                 sub_label=sub_label,
                 num_threads=threads).blocked_autorange()

results = []
for args in gen_inputs():
    results.append(benchmark(*args))

compare = Compare(results)
compare.trim_significant_figures()
compare.print()
```
</details>

<details>
<summary>
Test script exercising large number of combinations of left shift operands that I've used for further testing (validates results through comparing with results generated by NumPy)
</summary>

```python
import numpy as np
import torch

# Testing shifting of non-negative numbers only, but will test all
# possible RHS shift values for given type.  For int8 and int16, we'll
# test shifting all of non-negative values represntable by type.  For
# the rest of data types, we'll test shifting some random numbers in
# the corresponding range.
def _create_inputs(dtype):
    info = torch.iinfo(dtype)
    if dtype == torch.int8 or dtype == torch.int16:
        ntests = info.max + 1
        x = torch.arange(info.max + 1, dtype=dtype, device="cpu", requires_grad=False)
    else:
        ntests = 100000
        x = torch.randint(info.max + 1 if dtype != torch.int64 else info.max, (ntests,), dtype=dtype, device="cpu", requires_grad=False)
    y = torch.tensor(range(info.bits), dtype=dtype, device="cpu", requires_grad=False)
    xy = torch.cartesian_prod(x, y)
    return (xy[:, 0], xy[:, 1])

torch.manual_seed(0)

# Perform testing for each datatype supported, and compare results
# with ones generated by numpy.
for dtype in (torch.int8, torch.int16, torch.int32, torch.int64):
    (x, y) = _create_inputs(dtype)
    z = x << y
    xnp = x.numpy()
    ynp = y.numpy()
    znp = z.numpy()
    assert((znp == (xnp << ynp)).all())
```
</details>

<details>
<summary>
Benchmarking script running the left shift operator on tensors of different length (and varying number of bits to shift)
</summary>

```python
import torch
import pickle
import itertools
from torch.utils.benchmark import Timer, Compare

torch.manual_seed(0)

# Edit this part if needed.
lengths = [1024, 4096, 16384, 65536]
rhss = [1, 2, 7, 8, 15, 16, 31, 32, 63, 64]

benchmark_name = "lshift"
label = ""
dtypes = [torch.int8, torch.int16, torch.int32, torch.int64]
results = []

# Create an argument pair for testing.  Argument are tensors of given
# datatype and length, LHS for each shift operation is a random
# number, and RHS is given value that is same for all of them.
def _make_args(dtype, length, rhs):
    info = torch.iinfo(dtype)
    imax = info.max
    return (torch.randint(info.max, (length,), dtype=dtype, device="cpu", requires_grad=False),
            rhs * torch.ones((length,), dtype=dtype, device="cpu", requires_grad=False))

# Run shift operation for vectors of given lenghts and for given
# number of bits to be shifted, and remember timings.
for dtype, length, rhs in itertools.product(dtypes, lengths, rhss):
    x, y = _make_args(dtype, length, rhs)
    timer = Timer("x << y",
                  globals=globals(),
                  label=benchmark_name,
                  description=label,
                  sub_label=f"dtype={dtype},length={length}",
                  num_threads=1)
    results.append(timer.blocked_autorange())

# Gather results.
compare = Compare(results)
compare.trim_significant_figures()
compare.print()

# Print results.
with open("{}.pickle".format(label), "wb") as f:
    pickle.dump(results, f)
```
</details>

<details>
<summary>
Results of running above benchmarking script - results manually merged for runs of viable/strict (labeled "master" in the table below) and my branch (labeled "mybranch" in the table below)
</summary>

```
[------------------- lshift -------------------------------]
                                      |  master	|  mybranch
1 threads: ------------------------------------------------
      dtype=torch.int8,length=1024    |     3  	|      3
      dtype=torch.int8,length=4096    |     5  	|      3
      dtype=torch.int8,length=16384   |    14  	|      5
      dtype=torch.int8,length=65536   |    51  	|     15
      dtype=torch.int16,length=1024   |     3  	|      3
      dtype=torch.int16,length=4096   |     4  	|      3
      dtype=torch.int16,length=16384  |    11  	|      5
      dtype=torch.int16,length=65536  |    39  	|     13
      dtype=torch.int32,length=1024   |     3  	|      2
      dtype=torch.int32,length=4096   |     4  	|      3
      dtype=torch.int32,length=16384  |    10  	|      4
      dtype=torch.int32,length=65536  |    35  	|     12
      dtype=torch.int64,length=1024   |     3  	|      3
      dtype=torch.int64,length=4096   |     4  	|      3
      dtype=torch.int64,length=16384  |    11  	|      6
      dtype=torch.int64,length=65536  |    36  	|     20

Times are in microseconds (us).
```
</details>

All of the testing/benchmarking was conducted on qpu3, that supports AVX2 only.  For basic validation of AVX-512 update of left shift implementation for 8-bit operands (that is the only one that is non-trivial in AVX-512 case), [Compiler Explorer](https://godbolt.org/) is used, with GCC trunk and `-mavx512f -mavx512bw` flags added.  Here are further details:

<details>
<summary>
C program used for basic validation of AVX-512 vectorized version for 8-bit operands
</summary>

```
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#include <immintrin.h>

static void print_m512i_int8(const __m512i* x)
{
    int8_t val[64];
    memcpy(val, x, sizeof(val));
    for (int i = 0; i < 64; ++i) {
        if (i > 0)
            printf(", ");
        printf("%d", (int)val[i]);
    }
    printf("\n");
}

int main()
{
    __m512i a = _mm512_set_epi8(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                1);
    __m512i b = _mm512_set_epi8(7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6,
                                5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
                                3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2,
                                1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
                                0);

  // ------- Copied code from vec512_int.h

  // Mask used to set upper 8 bits of each 16-bit value to 0, and keep
  // lower 8 bits.
  __m512i mask = _mm512_set_epi16(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff);

  // Convert 8-bit operands from lower lanes to 16-bit values, and
  // perform vectorized shift.  Make sure that upper 8 bits of 16-bit
  // results are all 0.
  __m256i a_lo_8 = _mm512_extracti64x4_epi64(a, 0);
  __m256i b_lo_8 = _mm512_extracti64x4_epi64(b, 0);
  __m512i a_lo_16 = _mm512_cvtepi8_epi16(a_lo_8);
  __m512i b_lo_16 = _mm512_cvtepi8_epi16(b_lo_8);
  __m512i c_lo_16 = _mm512_and_si512(_mm512_sllv_epi16(a_lo_16, b_lo_16), mask);

  // Convert 8-bit operands from upper lanes to 16-bit values, and
  // perform vectorized shift.  Make sure that upper 8 bits of 16-bit
  // results are all 0.
  __m256i a_hi_8 = _mm512_extracti64x4_epi64(a, 1);
  __m256i b_hi_8 = _mm512_extracti64x4_epi64(b, 1);
  __m512i a_hi_16 = _mm512_cvtepi8_epi16(a_hi_8);
  __m512i b_hi_16 = _mm512_cvtepi8_epi16(b_hi_8);
  __m512i c_hi_16 = _mm512_and_si512(_mm512_sllv_epi16(a_hi_16, b_hi_16), mask);

  // Cast 16-bit results back into 8-bit values and merge them
  // together (using unsigned saturation with higher 8 bits set to 0
  // above ensures that results are correct).  Values are merged per
  // lanes, so this is not yet the final result.
  __m512i c_perm = _mm512_packus_epi16(c_lo_16, c_hi_16);

  // Permute values so that final result is produced.
  __m512i idx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
  __m512i c = _mm512_permutexvar_epi64(idx, c_perm);

  // ------- End copied

    print_m512i_int8(&c);
    // Expected output: 1(x8), 2(x8), 4(x8), 8(x8), 16(x8), 32(x8), 64(x8), 128(x8), -128(x8)

    return 0;
}
```
</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88607
Approved by: https://github.com/jgong5, https://github.com/lezcano, https://github.com/peterbell10
---
 aten/src/ATen/cpu/vec/vec256/vec256_int.h    | 195 +++++++++++++++++++
 aten/src/ATen/cpu/vec/vec512/vec512_int.h    |  93 +++++++++
 aten/src/ATen/cpu/vec/vec_base.h             |  13 ++
 aten/src/ATen/native/cpu/BinaryOpsKernel.cpp |  11 +-
 4 files changed, 308 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
index 0cc36d590019d..7737f4a0037cd 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
@@ -1133,6 +1133,201 @@ inline Vectorized<int8_t> Vectorized<int8_t>::le(const Vectorized<int8_t>& other
   return (*this <= other) & Vectorized<int8_t>(1);
 }
 
+template <bool left_shift>
+Vectorized<int16_t> inline shift_256_16(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  // No vector instruction for shifting int16_t, so emulating it instead.
+
+  // Control masks for shuffle operation, treating 256 bits as an
+  // array of 16-bit elements, and considering pairs of neighboring
+  // elements.  Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
+  // M!=N) is set so that shuffle will move element with index M from
+  // input pair into element with index N in output pair, and element
+  // with index M in output pair will be set to all 0s.
+  __m256i ctl_0_1 = _mm256_set_epi8(29, 28, 0x80, 0x80, 25, 24, 0x80, 0x80,
+                                    21, 20, 0x80, 0x80, 17, 16, 0x80, 0x80,
+                                    13, 12, 0x80, 0x80, 9, 8, 0x80, 0x80,
+                                    5, 4, 0x80, 0x80, 1, 0, 0x80, 0x80);
+  __m256i ctl_1_0 = _mm256_set_epi8(0x80, 0x80, 31, 30, 0x80, 0x80, 27, 26,
+                                    0x80, 0x80, 23, 22, 0x80, 0x80, 19, 18,
+                                    0x80, 0x80, 15, 14, 0x80, 0x80, 11, 10,
+                                    0x80, 0x80, 7, 6, 0x80, 0x80, 3, 2);
+
+  // Masks for bitwise and operation, treating 256 bits as an array of
+  // 16-bit elements, and considering them in pairs of neighboring
+  // elements.  A mask named "keep_M" (M in [0,1]) is set so that
+  // bitwise and will copy element with index M from input pair into
+  // element with the same index in output pair, while the other
+  // element in output pair will be set to all 0s.
+  __m256i keep_0 = _mm256_set1_epi32(0xFFFF);
+  __m256i keep_1 = _mm256_set1_epi32(0xFFFF0000);
+
+  // Take each 16-bit element with idx%2==0 from input array to be
+  // shifted and extend it to 32 bits so that 0s are added to the
+  // right.  Then, perform shifting on this 32-bit number.  Upper 16
+  // bits will be proper result of shifting original 16-bit number, so
+  // write them to result array, into the same position from which
+  // corresponding input element is taken.  Also, make sure that
+  // result array elements with idx%2!=0 are set to all 0s.
+  //
+  // Note that number of bits to shift for is extended to 32 bits by
+  // adding 0s to the left.  That means this number is not properly
+  // sign-extended for negative values.  However, number of bits to
+  // shift is treated as an unsigned integer by respective shift
+  // intrinsics anyway so if negative then either with or without
+  // proper sign extension, it will be interpreted as a number greater
+  // than 32, and the shifting result will be the same.
+  __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_1);
+  __m256i b0 = _mm256_and_si256(b, keep_0);
+  __m256i c0;
+  if (left_shift)
+    c0 = _mm256_sllv_epi32(a0, b0);
+  c0 = _mm256_shuffle_epi8(c0, ctl_1_0);
+
+  // Peform shifting the same way for input array elements with
+  // idx%2==1.
+  __m256i a1 = _mm256_and_si256(a, keep_1);
+  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
+  __m256i c1;
+  if (left_shift)
+    c1 = _mm256_sllv_epi32(a1, b1);
+  c1 = _mm256_and_si256(c1, keep_1);
+
+  // Merge partial results into the final result.
+  __m256i c = _mm256_or_si256(c0, c1);
+
+  return c;
+}
+
+template <bool left_shift>
+Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  // No vector instruction for shifting int8_t, so emulating it instead.
+
+  // Control masks for shuffle operation, treating 256 bits as an
+  // array of 8-bit elements, and considering quadruples of
+  // neighboring elements.  Specifially, a mask named "ctl_M_N" (M,N
+  // in [0,1,2,3], and M!=N) is set so that shuffle will move element
+  // with index M from input quadruple into element with index N in
+  // output quadruple, and other elements in output quadruple will be
+  // set to all 0s.
+  __m256i ctl_0_3 = _mm256_set_epi8(28, 0x80, 0x80, 0x80, 24, 0x80, 0x80, 0x80,
+                                    20, 0x80, 0x80, 0x80, 16, 0x80, 0x80, 0x80,
+                                    12, 0x80, 0x80, 0x80, 8, 0x80, 0x80, 0x80,
+                                    4, 0x80, 0x80, 0x80, 0, 0x80, 0x80, 0x80);
+  __m256i ctl_1_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 29, 0x80, 0x80, 0x80, 25,
+                                    0x80, 0x80, 0x80, 21, 0x80, 0x80, 0x80, 17,
+                                    0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80, 9,
+                                    0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 1);
+  __m256i ctl_1_3 = _mm256_set_epi8(29, 0x80, 0x80, 0x80, 25, 0x80, 0x80, 0x80,
+                                    21, 0x80, 0x80, 0x80, 17, 0x80, 0x80, 0x80,
+                                    13, 0x80, 0x80, 0x80, 9, 0x80, 0x80, 0x80,
+                                    5, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80);
+  __m256i ctl_2_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 30, 0x80, 0x80, 0x80, 26,
+                                    0x80, 0x80, 0x80, 22, 0x80, 0x80, 0x80, 18,
+                                    0x80, 0x80, 0x80, 14, 0x80, 0x80, 0x80, 10,
+                                    0x80, 0x80, 0x80, 6, 0x80, 0x80, 0x80, 2);
+  __m256i ctl_2_3 = _mm256_set_epi8(30, 0x80, 0x80, 0x80, 26, 0x80, 0x80, 0x80,
+                                    22, 0x80, 0x80, 0x80, 18, 0x80, 0x80, 0x80,
+                                    14, 0x80, 0x80, 0x80, 10, 0x80, 0x80, 0x80,
+                                    6, 0x80, 0x80, 0x80, 2, 0x80, 0x80, 0x80);
+  __m256i ctl_3_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 31, 0x80, 0x80, 0x80, 27,
+                                    0x80, 0x80, 0x80, 23, 0x80, 0x80, 0x80, 19,
+                                    0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80, 11,
+                                    0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 3);
+  __m256i ctl_3_1 = _mm256_set_epi8(0x80, 0x80, 31, 0x80, 0x80, 0x80, 27, 0x80,
+                                    0x80, 0x80, 23, 0x80, 0x80, 0x80, 19, 0x80,
+                                    0x80, 0x80, 15, 0x80, 0x80, 0x80, 11, 0x80,
+                                    0x80, 0x80, 7, 0x80, 0x80, 0x80, 3, 0x80);
+  __m256i ctl_3_2 = _mm256_set_epi8(0x80, 31, 0x80, 0x80, 0x80, 27, 0x80, 0x80,
+                                    0x80, 23, 0x80, 0x80, 0x80, 19, 0x80, 0x80,
+                                    0x80, 15, 0x80, 0x80, 0x80, 11, 0x80, 0x80,
+                                    0x80, 7, 0x80, 0x80, 0x80, 3, 0x80, 0x80);
+
+  // Masks for bitwise and operation, treating 256 bits as an array of
+  // 8-bit elements, and considering them in quadruples of neighboring
+  // elements.  A mask named "keep_M" (M in [0,1,2,3]) is set so that
+  // bitwise and will copy element with index M from input quadruple
+  // into element with the same index in output quadruple, while the
+  // other elements in output quadruple will be set to all 0s.
+  __m256i keep_0 = _mm256_set1_epi32(0xFF);
+  __m256i keep_3 = _mm256_set1_epi32(0xFF000000);
+
+  // Take each 8-bit element with idx%4==0 from input array to be
+  // shifted and extend it to 32 bits so that 0s are added to the
+  // right.  Then, perform shifting on this 32-bit number.  Upper 8
+  // bits will be proper result of shifting original 8-bit number, so
+  // write them to result array, into the same position from which
+  // corresponding input element is taken.  Also, make sure that
+  // result array elements with idx%4!=0 are set to all 0s.
+  //
+  // Note that number of bits to shift for is extended to 32 bits by
+  // adding 0s to the left.  That means this number is not properly
+  // sign-extended for negative values.  However, number of bits to
+  // shift is treated as an unsigned integer by respective shift
+  // intrinsics anyway so if negative then either with or without
+  // proper sign extension, it will be interpreted as a number greater
+  // than 32, and the shifting result will be the same.
+  __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_3);
+  __m256i b0 = _mm256_and_si256(b, keep_0);
+  __m256i c0;
+  if (left_shift)
+    c0 = _mm256_sllv_epi32(a0, b0);
+  c0 = _mm256_shuffle_epi8(c0, ctl_3_0);
+
+  // Peform shifting the same way for input array elements with
+  // idx%4==1.
+  __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3);
+  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
+  __m256i c1;
+  if (left_shift)
+    c1 = _mm256_sllv_epi32(a1, b1);
+  c1 = _mm256_shuffle_epi8(c1, ctl_3_1);
+
+  // Peform shifting the same way for input array elements with
+  // idx%4==2.
+  __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3);
+  __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0);
+  __m256i c2;
+  if (left_shift)
+    c2 = _mm256_sllv_epi32(a2, b2);
+  c2 = _mm256_shuffle_epi8(c2, ctl_3_2);
+
+  // Peform shifting the same way for input array elements with
+  // idx%4==3.
+  __m256i a3 =  _mm256_and_si256(a, keep_3);
+  __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0);
+  __m256i c3;
+  if (left_shift)
+    c3 = _mm256_sllv_epi32(a3, b3);
+  c3 = _mm256_and_si256(c3, keep_3);
+
+  // Merge partial results into the final result.
+  __m256i c01 = _mm256_or_si256(c0, c1);
+  __m256i c23 = _mm256_or_si256(c2, c3);
+  __m256i c = _mm256_or_si256(c01, c23);
+
+  return c;
+}
+
+template <>
+Vectorized<int64_t> inline operator<<(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  return _mm256_sllv_epi64(a, b);
+}
+
+template <>
+Vectorized<int32_t> inline operator<<(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return _mm256_sllv_epi32(a, b);
+}
+
+template <>
+Vectorized<int16_t> inline operator<<(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return shift_256_16<true>(a, b);
+}
+
+template <>
+Vectorized<int8_t> inline operator<<(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  return shift_256_8<true>(a, b);
+}
+
 #endif
 
 }}}
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
index c2cbc0b1d7f94..590c3254e3790 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
@@ -1163,6 +1163,99 @@ inline Vectorized<int8_t> Vectorized<int8_t>::le(const Vectorized<int8_t>& other
   return (*this <= other) & Vectorized<int8_t>(1);
 }
 
+template <bool left_shift>
+Vectorized<int8_t> inline shift_512_8(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  // No vector instruction for shifting int8_t, so emulating it instead.
+
+  // Control masks for shuffle operation, treating 512 bits as an
+  // array of 8-bit elements, and considering pairs of neighboring
+  // elements.  Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
+  // M!=N) is set so that shuffle will move element with index M from
+  // input pair into element with index N in output pair, and element
+  // with index M in output pair will be set to all 0s.
+  __m512i ctl_0_1 = _mm512_set_epi8(62, 0x80, 60, 0x80, 58, 0x80, 56, 0x80,
+                                    54, 0x80, 52, 0x80, 50, 0x80, 48, 0x80,
+                                    46, 0x80, 44, 0x80, 42, 0x80, 40, 0x80,
+                                    38, 0x80, 36, 0x80, 34, 0x80, 32, 0x80,
+                                    30, 0x80, 28, 0x80, 26, 0x80, 24, 0x80,
+                                    22, 0x80, 20, 0x80, 18, 0x80, 16, 0x80,
+                                    14, 0x80, 12, 0x80, 10, 0x80, 8, 0x80,
+                                    6, 0x80, 4, 0x80, 2, 0x80, 0, 0x80);
+  __m512i ctl_1_0 = _mm512_set_epi8(0x80, 63, 0x80, 61, 0x80, 59, 0x80, 57,
+                                    0x80, 55, 0x80, 53, 0x80, 51, 0x80, 49,
+                                    0x80, 47, 0x80, 45, 0x80, 43, 0x80, 41,
+                                    0x80, 39, 0x80, 37, 0x80, 35, 0x80, 33,
+                                    0x80, 31, 0x80, 29, 0x80, 27, 0x80, 25,
+                                    0x80, 23, 0x80, 21, 0x80, 19, 0x80, 17,
+                                    0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9,
+                                    0x80, 7, 0x80, 5, 0x80, 3, 0x80, 1);
+
+  // Masks for bitwise and operation, treating 512 bits as an array of
+  // 8-bit elements, and considering them in pairs of neighboring
+  // elements.  A mask named "keep_M" (M in [0,1]) is set so that
+  // bitwise and will copy element with index M from input pair into
+  // element with the same index in output pair, while the other
+  // element in output pair will be set to all 0s.
+  __m512i keep_0 = _mm512_set1_epi16(0xFF);
+  __m512i keep_1 = _mm512_set1_epi16(0xFF00);
+
+  // Take each 8-bit element with idx%2==0 from input array to be
+  // shifted and extend it to 16 bits so that 0s are added to the
+  // right.  Then, perform shifting on this 16-bit number.  Upper 8
+  // bits will be proper result of shifting original 8-bit number, so
+  // write them to result array, into the same position from which
+  // corresponding input element is taken.  Also, make sure that
+  // result array elements with idx%2!=0 are set to all 0s.
+  //
+  // Note that number of bits to shift for is extended to 16 bits by
+  // adding 0s to the left.  That means this number is not properly
+  // sign-extended for negative values.  However, number of bits to
+  // shift is treated as an unsigned integer by respective shift
+  // intrinsics anyway so if negative then either with or without
+  // proper sign extension, it will be interpreted as a number greater
+  // than 32, and the shifting result will be the same.
+  __m512i a0 = _mm512_shuffle_epi8(a, ctl_0_1);
+  __m512i b0 = _mm512_and_si512(b, keep_0);
+  __m512i c0;
+  if (left_shift)
+    c0 = _mm512_sllv_epi16(a0, b0);
+  c0 = _mm512_shuffle_epi8(c0, ctl_1_0);
+
+  // Peform shifting the same way for input array elements with
+  // idx%2==1.
+  __m512i a1 = _mm512_and_si512(a, keep_1);
+  __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0);
+  __m512i c1;
+  if (left_shift)
+    c1 = _mm512_sllv_epi16(a1, b1);
+  c1 = _mm512_and_si512(c1, keep_1);
+
+  // Merge partial results into the final result.
+  __m512i c = _mm512_or_si512(c0, c1);
+
+  return c;
+}
+
+template <>
+Vectorized<int64_t> inline operator<<(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  return _mm512_sllv_epi64(a, b);
+}
+
+template <>
+Vectorized<int32_t> inline operator<<(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return _mm512_sllv_epi32(a, b);
+}
+
+template <>
+Vectorized<int16_t> inline operator<<(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return _mm512_sllv_epi16(a, b);
+}
+
+template <>
+Vectorized<int8_t> inline operator<<(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  return shift_512_8<true>(a, b);
+}
+
 #endif
 
 }}}
diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
index b9b3745e99d5f..f045437ac3689 100644
--- a/aten/src/ATen/cpu/vec/vec_base.h
+++ b/aten/src/ATen/cpu/vec/vec_base.h
@@ -799,6 +799,13 @@ inline Vectorized<T> operator~(const Vectorized<T>& a) {
   return a ^ ones;
 }
 
+template <class T> Vectorized<T> inline operator<<(const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = a[i] << b[i];
+  }
+  return c;
+}
 
 template <typename T>
 inline Vectorized<T>& operator += (Vectorized<T>& a, const Vectorized<T>& b) {
@@ -826,6 +833,12 @@ inline Vectorized<T>& operator *= (Vectorized<T>& a, const Vectorized<T>& b) {
   return a;
 }
 
+template <typename T>
+inline Vectorized<T>& operator <<= (Vectorized<T>& a, const Vectorized<T>& b) {
+  a = a << b;
+  return a;
+}
+
 template <typename T>
 inline Vectorized<T> fmadd(const Vectorized<T>& a, const Vectorized<T>& b, const Vectorized<T>& c) {
   return a * b + c;
diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
index a5dde16024ab6..c2497a6949f12 100644
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -314,10 +314,13 @@ void bitwise_xor_kernel(TensorIteratorBase& iter) {
 
 void lshift_kernel(TensorIteratorBase& iter) {
   AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "lshift_cpu", [&]() {
-    cpu_kernel(iter,
-      [](scalar_t a, scalar_t b) -> scalar_t {
-        return static_cast<std::make_unsigned_t<scalar_t>>(a) << b;
-    });
+    cpu_kernel_vec(iter,
+        [](scalar_t a, scalar_t b) -> scalar_t {
+          return static_cast<std::make_unsigned_t<scalar_t>>(a) << b;
+        },
+        [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) {
+            return a << b;
+        });
   });
 }
 

From c5bc5d6eab82e8fe03c0cb13148e07d5620c7929 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 12 Nov 2022 06:19:02 -0800
Subject: [PATCH 0838/1922] Fix XLA symbolic shapes binding (#88928)

Obsoletes https://github.com/pytorch/pytorch/pull/88772

Mostly revolves around NOT assuming that the inside is a SymNode,
but instead duck-typed to be a SymNode.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88928
Approved by: https://github.com/SherlockNoMad
---
 c10/core/SymNodeImpl.h                   |  3 -
 test/test_dynamic_shapes.py              |  6 +-
 torch/__init__.py                        |  2 -
 torch/csrc/jit/python/init.cpp           | 77 ++++++++++++++++--------
 torch/csrc/utils/pybind.cpp              | 14 ++++-
 torch/csrc/utils/python_symnode.h        |  4 --
 torch/fx/experimental/symbolic_shapes.py | 39 ++++++------
 7 files changed, 85 insertions(+), 60 deletions(-)

diff --git a/c10/core/SymNodeImpl.h b/c10/core/SymNodeImpl.h
index d2f3aafaad8b1..fcec452821d76 100644
--- a/c10/core/SymNodeImpl.h
+++ b/c10/core/SymNodeImpl.h
@@ -85,9 +85,6 @@ class C10_API SymNodeImpl : public c10::intrusive_ptr_target {
   virtual SymNode clone() {
     TORCH_CHECK(false, "NYI");
   };
-  virtual SymNode sym_int() {
-    TORCH_CHECK(false, "NYI");
-  }
   virtual SymNode sym_float() {
     TORCH_CHECK(false, "NYI");
   }
diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index 0f1f49d2e6ea5..3a8e31151bf37 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -20,7 +20,7 @@
 from torch.utils._pytree import tree_map
 from torch.fx.experimental import symbolic_shapes
 from torch.fx.experimental.proxy_tensor import make_fx
-from torch.fx.experimental.symbolic_shapes import ShapeEnv, sym_float, guard_int, SymNode, sym_sqrt, sym_int
+from torch.fx.experimental.symbolic_shapes import ShapeEnv, sym_float, guard_int, SymNode, sym_sqrt, sym_int, to_node
 from torch.utils._python_dispatch import TorchDispatchMode
 from torch import SymInt
 
@@ -478,9 +478,9 @@ def _do_test(self, fn, inp1, inp2, shape_env, is_unary_fn):
 
         def get_sym_inp(inp):
             if isinstance(inp, int):
-                return torch.SymInt(seed_node.to_node(inp))
+                return torch.SymInt(to_node(seed_node, inp))
             else:
-                return torch.SymFloat(seed_node.to_node(inp))
+                return torch.SymFloat(to_node(seed_node, inp))
 
         def maybe_xfail(inp1, inp2):
             key = (fn, type(inp1).__name__, type(inp2).__name__)
diff --git a/torch/__init__.py b/torch/__init__.py
index 19be59282cca4..6def80d1dc599 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -204,8 +204,6 @@ class SymInt:
     """
 
     def __init__(self, node):
-        from torch.fx.experimental.symbolic_shapes import SymNode
-        assert isinstance(node, SymNode)
         # This field MUST be named node; C++ binding code assumes that this
         # class has a field named node that stores SymNode
         self.node = node
diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp
index a72a8a2c11502..7ee48635cdffc 100644
--- a/torch/csrc/jit/python/init.cpp
+++ b/torch/csrc/jit/python/init.cpp
@@ -1148,38 +1148,65 @@ void initJITBindings(PyObject* module) {
   // NB: This isn't actually used for regular PyTorch symbolic tracing;
   // XLA is what needs this
 #define SYMNODE_UNARY(n) .def(#n, [](c10::SymNode a) { return a->n(); })
-#define SYMNODE_UNARY2(n2, n) .def(#n2, [](c10::SymNode a) { return a->n(); })
 #define SYMNODE_BINARY(n) \
   .def(#n, [](c10::SymNode a, c10::SymNode b) { return a->n(b); })
   auto symnode_class =
       py::class_<c10::SymNodeImpl, c10::SymNode>(m, "_SymNode")
+      // clang-format off
       // These DO NOT install magic methods; the SymInt/SymFloat wrapper in
       // Python is responsible for this
       SYMNODE_UNARY(clone)
-      // Named these for consistency with inner python class, but maybe
-      // should change the python side
-      SYMNODE_UNARY2(__bool__, bool_) SYMNODE_UNARY2(__int__, int_)
-          SYMNODE_UNARY2(__sym_int__, sym_int) SYMNODE_UNARY2(
-              __sym_float__, sym_float) SYMNODE_BINARY(add) SYMNODE_BINARY(sub)
-              SYMNODE_BINARY(mul) SYMNODE_BINARY(truediv) SYMNODE_BINARY(pow)
-                  SYMNODE_BINARY(floordiv) SYMNODE_BINARY(mod) SYMNODE_BINARY(
-                      eq) SYMNODE_BINARY(gt) SYMNODE_BINARY(lt)
-                      SYMNODE_BINARY(le) SYMNODE_BINARY(ge) SYMNODE_BINARY(min)
-                          SYMNODE_BINARY(max) SYMNODE_UNARY(ceil)
-                              SYMNODE_UNARY(floor) SYMNODE_UNARY(neg)
-                                  // Intentionally don't set file line, as the
-                                  // Python backtrace matters more here
-                                  .def(
-                                      "guard_int",
-                                      [](c10::SymNode a) {
-                                        return a->guard_int(nullptr, 0);
-                                      })
-                                  .def(
-                                      "__str__",
-                                      [](c10::SymNode a) { return a->str(); })
-                                  .def("__repr__", [](c10::SymNode a) {
-                                    return a->str();
-                                  });
+      SYMNODE_UNARY(is_int)
+      SYMNODE_UNARY(is_float)
+      SYMNODE_UNARY(bool_)
+      SYMNODE_UNARY(int_)
+      SYMNODE_UNARY(sym_float)
+      SYMNODE_BINARY(add)
+      SYMNODE_BINARY(sub)
+      SYMNODE_BINARY(mul)
+      SYMNODE_BINARY(truediv)
+      SYMNODE_BINARY(pow)
+      SYMNODE_BINARY(floordiv)
+      SYMNODE_BINARY(mod)
+      SYMNODE_BINARY(eq)
+      SYMNODE_BINARY(gt)
+      SYMNODE_BINARY(lt)
+      SYMNODE_BINARY(le)
+      SYMNODE_BINARY(ge)
+      SYMNODE_BINARY(min)
+      SYMNODE_BINARY(max)
+      SYMNODE_UNARY(ceil)
+      SYMNODE_UNARY(floor)
+      SYMNODE_UNARY(neg)
+      // Intentionally don't set file line, as the
+      // Python backtrace matters more here
+      .def(
+          "guard_int",
+          [](c10::SymNode a) {
+            return a->guard_int(nullptr, 0);
+          })
+      .def(
+          "guard_float",
+          [](c10::SymNode a) {
+            return a->guard_float(nullptr, 0);
+          })
+      .def(
+          "wrap_int",
+          [](c10::SymNode a, int64_t b) {
+            return a->wrap_int(b);
+          })
+      .def(
+          "wrap_float",
+          [](c10::SymNode a, double b) {
+            return a->wrap_float(b);
+          })
+      .def(
+          "__str__",
+          [](c10::SymNode a) { return a->str(); })
+      .def("__repr__", [](c10::SymNode a) {
+        return a->str();
+      });
+  // clang-format on
 
   // NOLINTNEXTLINE(bugprone-unused-raii)
   py::class_<CompleteArgumentSpec>(m, "CompleteArgumentSpec")
diff --git a/torch/csrc/utils/pybind.cpp b/torch/csrc/utils/pybind.cpp
index 37e37a873774b..4cd148fdfa91c 100644
--- a/torch/csrc/utils/pybind.cpp
+++ b/torch/csrc/utils/pybind.cpp
@@ -25,11 +25,19 @@ py::handle type_caster<c10::SymInt>::cast(
     return_value_policy /* policy */,
     handle /* parent */) {
   if (si.is_symbolic()) {
-    // TODO: generalize this to work with C++ backed class
     auto* py_node =
         dynamic_cast<torch::impl::PythonSymNodeImpl*>(si.toSymNodeImpl().get());
-    TORCH_INTERNAL_ASSERT(py_node);
-    return torch::get_symint_class()(py_node->getPyObj()).release();
+    if (py_node) {
+      // Return the Python directly (unwrap)
+      return torch::get_symint_class()(py_node->getPyObj()).release();
+    } else {
+      // Wrap the C++ into Python
+      auto inner = py::cast(si.toSymNodeImpl());
+      if (!inner) {
+        throw python_error();
+      }
+      return torch::get_symint_class()(inner).release();
+    }
   } else {
     return py::cast(si.as_int_unchecked()).release();
   }
diff --git a/torch/csrc/utils/python_symnode.h b/torch/csrc/utils/python_symnode.h
index be402e4d5439f..3a9fa79d37d6e 100644
--- a/torch/csrc/utils/python_symnode.h
+++ b/torch/csrc/utils/python_symnode.h
@@ -164,10 +164,6 @@ class PythonSymNodeImpl : public c10::SymNodeImpl {
     return dispatch_common_(__FUNCTION__);
   }
 
-  c10::SymNode sym_int() override {
-    return dispatch_common_(__FUNCTION__);
-  }
-
   c10::SymNode sym_float() override {
     return dispatch_common_(__FUNCTION__);
   }
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index d9b0a8fc2019e..9b55af3c555c4 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -126,6 +126,18 @@ def sym_int(a):
         return sym_floor(a) if a > 0 else sym_ceil(a)
     return int(a)
 
+def to_node(self, num):
+    if isinstance(num, (SymInt, SymFloat)):
+        return num.node
+    elif isinstance(num, int):
+        return self.wrap_int(num)
+    elif isinstance(num, float):
+        return self.wrap_float(num)
+    else:
+        # NotImplemented is important so that Python tries the
+        # other magic method
+        return NotImplemented
+
 # TODO: An incomplete list
 # 1. Set variables to be equal when we do equality
 # 2. Specialize on 0/1 when we do subtraction
@@ -148,18 +160,6 @@ def expr(self):
     def _update_expr(self):
         self._expr = self.shape_env.replace(self._expr)
 
-    def to_node(self, num):
-        if isinstance(num, (SymInt, SymFloat)):
-            return num.node
-        elif isinstance(num, int):
-            return self.wrap_int(num)
-        elif isinstance(num, float):
-            return self.wrap_float(num)
-        else:
-            # NotImplemented is important so that Python tries the
-            # other magic method
-            return NotImplemented
-
     def is_int(self):
         return self.pytype is int
 
@@ -297,16 +297,15 @@ def _nyi():
 always_bool_magic_methods = {"eq", "gt", "lt", "le", "ge"}
 
 def wrap_node(x):
-    if not isinstance(x, SymNode):
-        return x
-    if x.constant is not None:
+    # TODO: let C++ also take advantage of this
+    if isinstance(x, SymNode) and x.constant is not None:
         return x.constant
-    if x.pytype is int:
+    if x.is_int():
         return SymInt(x)
-    elif x.pytype is float:
+    elif x.is_float():
         return SymFloat(x)
     else:
-        raise AssertionError(f"unrecognized return type {x.pytype}")
+        raise AssertionError(f"unrecognized return type {x}")
 
 def _make_node_magic(method, func):
     func = lru_cache(256)(func)
@@ -378,13 +377,13 @@ def unary_magic_impl(self):
         return wrap_node(getattr(self.node, method)())
 
     def binary_magic_impl(self, other):
-        other_node = self.node.to_node(other)
+        other_node = to_node(self.node, other)
         if other_node is NotImplemented:
             return NotImplemented
         return wrap_node(getattr(self.node, method)(other_node))
 
     def rbinary_magic_impl(self, other):
-        other_node = self.node.to_node(other)
+        other_node = to_node(self.node, other)
         if other_node is NotImplemented:
             return NotImplemented
         return wrap_node(getattr(other_node, method)(self.node))

From 733e0d1e83adc601164caf3423b912b6ccbbdd2c Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Sat, 12 Nov 2022 20:27:00 +0000
Subject: [PATCH 0839/1922] [FSDP] Fix `FSDP.clip_grad_norm_()` for `NO_SHARD`
 (#88955)

This PR fixes `FSDP.clip_grad_norm_()` for `NO_SHARD`, which previously "double-counted" each gradient `world_size`-many times.

This does not address any discrepancies between `FULL_SHARD` and DDP. (Note that the unit tests do show parity between `FULL_SHARD` and DDP when using `FSDP.clip_grad_norm_()` and `nn.utils.clip_grad_norm_()` respectively on one iteration.)

The added unit test code path tests mixing nested FSDP instances with both `FULL_SHARD` and `NO_SHARD` to ensure that the `local_sharded_norm` and `local_nonsharded_norm` computations are interoperating correctly. I want to test non-FSDP root instance in the future, but this is BC breaking since we need to make `clip_grad_norm_()` a static method, which would require a different method call syntax (`FSDP.clip_grad_norm_(root_module, ...)` vs. `root_module.clip_grad_norm_(...)`).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88955
Approved by: https://github.com/zhaojuanmao
---
 .../fsdp/test_fsdp_clip_grad_norm.py          | 74 ++++++++++++++-----
 .../fsdp/fully_sharded_data_parallel.py       | 36 +++++++--
 2 files changed, 84 insertions(+), 26 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
index e587065c5c77f..1a742da889ac3 100644
--- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
+++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -7,6 +7,7 @@
 import torch
 import torch.nn as nn
 from torch import distributed as dist
+from torch.distributed.fsdp import ShardingStrategy
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     CPUOffload,
     FullyShardedDataParallel as FSDP,
@@ -42,10 +43,6 @@
 class TestClipGradNorm(FSDPTest):
     """Tests :meth:`FullyShardedDataParallel.clip_grad_norm_`."""
 
-    @property
-    def world_size(self) -> int:
-        return 2
-
     @skip_if_lt_x_gpu(2)
     def test_non_root(self):
         """
@@ -80,6 +77,11 @@ def test_ddp_parity(self):
             {
                 "max_norm": [1, 2.5],
                 "norm_type": [1, 2, float("inf")],
+                "sharding_strategy": [
+                    ShardingStrategy.FULL_SHARD,
+                    ShardingStrategy.NO_SHARD,
+                    "mixed_strategy",
+                ],
                 "use_orig_params": [False, True],
                 "offload_params": [False, True],
             },
@@ -90,8 +92,9 @@ def _test_ddp_parity(
         self,
         max_norm: Union[float, int],
         norm_type: Union[float, int],
-        offload_params: bool,
+        sharding_strategy: Union[ShardingStrategy, str],
         use_orig_params: bool,
+        offload_params: bool,
     ):
         local_model = TransformerWithSharedParams.init(
             self.process_group,
@@ -101,22 +104,52 @@ def _test_ddp_parity(
         )
         ddp_model = DDP(local_model, device_ids=[self.rank])
         fsdp_kwargs = {
-            "auto_wrap_policy": ModuleWrapPolicy(
-                {
-                    TransformerEncoderLayer,
-                    TransformerDecoderLayer,
-                }
-            ),
             "cpu_offload": CPUOffload(offload_params=offload_params),
             "use_orig_params": use_orig_params,
         }
-        fsdp_model = TransformerWithSharedParams.init(
-            self.process_group,
-            FSDPInitMode.RECURSIVE,
-            CUDAInitMode.CUDA_BEFORE,
-            deterministic=True,
-            fsdp_kwargs=fsdp_kwargs,
-        )
+        if sharding_strategy == "mixed_strategy":
+            fsdp_model = TransformerWithSharedParams.init(
+                self.process_group,
+                FSDPInitMode.NO_FSDP,
+                CUDAInitMode.CUDA_BEFORE,
+                deterministic=True,
+            )
+            # Apply `NO_SHARD` to the encoder
+            fsdp_model.transformer.encoder = FSDP(
+                fsdp_model.transformer.encoder,
+                sharding_strategy=ShardingStrategy.NO_SHARD,
+                **fsdp_kwargs,
+            )
+            # Apply `FULL_SHARD` to the decoder
+            fsdp_model.transformer.decoder = FSDP(
+                fsdp_model.transformer.decoder,
+                sharding_strategy=ShardingStrategy.FULL_SHARD,
+                **fsdp_kwargs,
+            )
+            # TODO: FSDP's `clip_grad_norm_()` is not a static method, so we
+            # must make the root module an FSDP instance
+            fsdp_model = FSDP(
+                fsdp_model, sharding_strategy=ShardingStrategy.FULL_SHARD, **fsdp_kwargs
+            )
+        else:
+            fsdp_kwargs.update(
+                {
+                    "sharding_strategy": sharding_strategy,
+                    "auto_wrap_policy": ModuleWrapPolicy(
+                        {
+                            TransformerEncoderLayer,
+                            TransformerDecoderLayer,
+                        }
+                    ),
+                }
+            )
+            fsdp_model = TransformerWithSharedParams.init(
+                self.process_group,
+                FSDPInitMode.RECURSIVE,
+                CUDAInitMode.CUDA_BEFORE,
+                deterministic=True,
+                fsdp_kwargs=fsdp_kwargs,
+            )
         LR = 1e-2
         ddp_optim = torch.optim.Adam(ddp_model.parameters(), lr=LR)
         fsdp_optim = torch.optim.Adam(fsdp_model.parameters(), lr=LR)
@@ -125,7 +158,10 @@ def _test_ddp_parity(
         inp = ddp_model.module.get_input(device)
         for model in (ddp_model, fsdp_model):
             out = model(*inp)
-            loss = model.module.get_loss(inp, out)
+            if isinstance(model, (DDP, FSDP)):
+                loss = model.module.get_loss(inp, out)
+            else:
+                loss = model.get_loss(inp, out)
             loss.backward()
 
         # Multiply gradients by a large factor to ensure that gradients will
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 69c8dd92ed8dc..3e84315a4e116 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1161,23 +1161,45 @@ def clip_grad_norm_(
             self._streams["unshard"],
             self._streams["pre_unshard"],
         )
-
         max_norm = float(max_norm)
         norm_type = float(norm_type)
-        # Compute the local gradient norm (only including this rank's shard
-        # of the gradients)
-        local_norm = _get_grad_norm(self.parameters(), norm_type).to(
+        # Perform local gradient norm computation, where sharded and
+        # non-sharded parameters must be handled separately
+        sharded_params = set()
+        nonsharded_params = set()  # `NO_SHARD` or not FSDP-managed
+        for handle in FullyShardedDataParallel._fsdp_handles(self):
+            target_set = (
+                sharded_params if handle.uses_sharded_strategy else nonsharded_params
+            )
+            if handle._use_orig_params:
+                for param in handle.flat_param._params:
+                    target_set.add(param)
+            else:
+                target_set.add(handle.flat_param)
+        for param in self.parameters():
+            not_fsdp_managed = (
+                param not in sharded_params and param not in nonsharded_params
+            )
+            if not_fsdp_managed:
+                nonsharded_params.add(param)
+        local_sharded_norm = _get_grad_norm(sharded_params, norm_type).to(
+            self.compute_device
+        )
+        local_nonsharded_norm = _get_grad_norm(nonsharded_params, norm_type).to(
             self.compute_device
         )
         # Reconstruct the total gradient norm depending on the norm type
         if norm_type == math.inf:
-            total_norm = local_norm
+            total_norm = torch.maximum(local_sharded_norm, local_nonsharded_norm)
             dist.all_reduce(
                 total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group
             )
         else:
-            total_norm = local_norm**norm_type
+            total_norm = local_sharded_norm**norm_type
             dist.all_reduce(total_norm, group=self.process_group)
+            # All-reducing the local non-sharded norm would count it an extra
+            # world-size-many times
+            total_norm += local_nonsharded_norm**norm_type
             total_norm = total_norm ** (1.0 / norm_type)
         if self.cpu_offload.offload_params:
             total_norm = total_norm.cpu()
@@ -1789,7 +1811,7 @@ def register_comm_hook(self, state: object, hook: callable):
 
 
 def _get_grad_norm(
-    params: List[nn.Parameter],
+    params: Iterable[nn.Parameter],
     norm_type: float,
 ) -> torch.Tensor:
     """

From 130a1343773bd3026e088d2bb58319c3845e2dc6 Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Sun, 13 Nov 2022 04:50:21 +0000
Subject: [PATCH 0840/1922] [dynamo] Port all pytorch/dynamo and test/dynamo
 pieces over from symbolic-shapes branch (#88768)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88768
Approved by: https://github.com/jansel, https://github.com/ezyang
---
 functorch/_src/compilers.py                 |  30 ++
 test/distributed/test_dynamo_distributed.py |   2 +
 test/dynamo/test_dynamic_shapes.py          | 109 ++----
 test/dynamo/test_export.py                  |  26 ++
 test/dynamo/test_misc.py                    |   2 +
 test/dynamo/test_no_fake_tensors.py         |   5 -
 test/dynamo/test_repros.py                  |  38 +-
 test/dynamo/test_unspec.py                  |   2 +
 test/inductor/test_torchinductor_opinfo.py  |   1 +
 torch/_dynamo/codegen.py                    |   2 +
 torch/_dynamo/guards.py                     |  49 ++-
 torch/_dynamo/optimizations/analysis.py     |  25 +-
 torch/_dynamo/optimizations/training.py     |   6 +-
 torch/_dynamo/output_graph.py               |  56 ++-
 torch/_dynamo/symbolic_convert.py           |  53 ++-
 torch/_dynamo/utils.py                      | 135 ++++++-
 torch/_dynamo/variables/__init__.py         |   1 +
 torch/_dynamo/variables/builder.py          | 238 +++++++++++-
 torch/_dynamo/variables/builtin.py          |  85 ++++-
 torch/_dynamo/variables/constant.py         |  32 +-
 torch/_dynamo/variables/lists.py            |  75 +++-
 torch/_dynamo/variables/misc.py             |   7 +-
 torch/_dynamo/variables/nn_module.py        |   7 +-
 torch/_dynamo/variables/tensor.py           | 387 ++++----------------
 torch/_dynamo/variables/torch.py            |  43 ++-
 torch/_subclasses/fake_tensor.py            |   6 +-
 torch/fx/experimental/symbolic_shapes.py    |   1 -
 27 files changed, 921 insertions(+), 502 deletions(-)

diff --git a/functorch/_src/compilers.py b/functorch/_src/compilers.py
index 3f52fede57ebf..55de63e5c344d 100644
--- a/functorch/_src/compilers.py
+++ b/functorch/_src/compilers.py
@@ -19,6 +19,8 @@
     draw_graph,
     min_cut_rematerialization_partition,
 )
+import torch.utils._pytree as pytree
+
 
 
 # These canonicalizations are needed here (and not decompositions), as the ops
@@ -113,6 +115,34 @@ def nop(fx_g: fx.GraphModule, _) -> Callable:
     """
     return fx_g
 
+class DebugInterpreter(fx.Interpreter):
+    def run_node(self, n):
+        # TODO: This will fail once we start caching in AOTAutograd
+        # again, because we need to remap SymInts to their new values
+        # in the presence of dynamism
+        r = super().run_node(n)
+        if 'val' in n.meta:
+            n_vals, n_spec = pytree.tree_flatten(n.meta['val'])
+            r_vals, r_spec = pytree.tree_flatten(r)
+            assert n_spec == r_spec, f"{n_spec} != {r_spec}"
+            assert len(n_vals) == len(r_vals), f"{len(n_vals)} != {len(r_vals)}"
+            for i, nv, rv in zip(range(len(n_vals)), n_vals, r_vals):
+                if not isinstance(rv, torch.Tensor):
+                    continue
+                assert nv.size() == rv.size(), f"output {i}: {nv.size()} != {rv.size()}"
+                assert nv.dtype == rv.dtype, f"output {i}: {nv.dtype} != {rv.dtype}"
+                assert torch._prims_common.check_significant_strides(nv, rv), f"output {i}: {nv.stride()} != {rv.stride()}"
+        return r
+
+
+@make_boxed_compiler
+def debug_nop(fx_g: fx.GraphModule, _) -> Callable:
+    """
+    Returns a (slow) interpreter over the FX graph module that also checks
+    various debugging properties (e.g., that tracing strides matched real
+    strides.)
+    """
+    return DebugInterpreter(fx_g).run
 
 @make_boxed_compiler
 def simple_ts_compile(fx_g, _):
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index b6bc16edb941a..21550a0120e46 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -258,6 +258,8 @@ def test_fsdp_inductor(self):
     # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert
     @patch.object(torch._inductor.config.triton, "cudagraphs", False)
     @patch.object(torch._inductor.config, "fallback_random", True)
+    # TODO(voz): Flaky on CI failure, consistent failure on local master.
+    @unittest.skipIf(True, "Flaky on CI failure, consistent failure on local master")
     def test_hf_bert_fsdp(self):
         from transformers.models.bert.modeling_bert import BertLayer
 
diff --git a/test/dynamo/test_dynamic_shapes.py b/test/dynamo/test_dynamic_shapes.py
index d82cc6925fe9d..294ea9e549522 100644
--- a/test/dynamo/test_dynamic_shapes.py
+++ b/test/dynamo/test_dynamic_shapes.py
@@ -51,22 +51,6 @@ def make_dynamic_cls(cls):
 )
 
 
-# DynamicShapesReproTests
-unittest.expectedFailure(
-    DynamicShapesReproTests.test_reformer_eval_dynamic_shapes
-    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
-)
-
-unittest.expectedFailure(
-    DynamicShapesReproTests.test_reformer_train_dynamic_shapes
-    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
-)
-
-unittest.expectedFailure(
-    DynamicShapesReproTests.test_issue175_dynamic_shapes
-    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
-)
-
 unittest.expectedFailure(
     DynamicShapesReproTests.test_do_paste_mask_dynamic_shapes
     # aten.min.dim - couldn't find symbolic meta function/decomposition
@@ -77,97 +61,66 @@ def make_dynamic_cls(cls):
     # Could not infer dtype of torch._C.SymIntNode
 )
 
-unittest.expectedFailure(
-    DynamicShapesReproTests.test_ellipsis_dynamic_shapes
-    # Cannot call sizes() on tensor with symbolic sizes/strides
-)
-
 unittest.expectedFailure(
     DynamicShapesReproTests.test_hf_t5_forward_dynamic_shapes
     # Cannot call sizes() on tensor with symbolic sizes/strides
 )
 
+# DynamicShapesExportTests
 unittest.expectedFailure(
-    DynamicShapesReproTests.test_reformer_sorting_dynamic_shapes
-    # Unable to cast Python instance to C++ type
-)
-
-unittest.expectedFailure(
-    DynamicShapesReproTests.test_guard_fail_tensor_bool_dynamic_shapes
-    # RuntimeError: aten.allclose.default - couldn't find symbolic meta function/decomposition
+    DynamicShapesExportTests.test_export_with_constant_list_nonzero_dynamic_shapes
 )
-
-# DynamicShapesMiscTests
 unittest.expectedFailure(
-    DynamicShapesMiscTests.test_unsupported_fake_tensor_dynamic_shapes
-    # aten.quantize_per_tensor.default - couldn't find symbolic meta function/decomposition
+    DynamicShapesExportTests.test_export_with_constant_list_nonzero_free_function_dynamic_shapes
 )
 unittest.expectedFailure(
-    DynamicShapesMiscTests.test_module_deepcopy_dynamic_shapes
-    # aten.squeeze_.dim - couldn't find symbolic meta function/decompositio
+    DynamicShapesExportTests.test_export_with_constant_tuple_nonzero_dynamic_shapes
 )
-
-# DynamicShapesUnspecTests
 unittest.expectedFailure(
-    DynamicShapesUnspecTests.test_unspec_float_precision_dynamic_shapes
-    # float() argument must be a string or a real number, not 'torch._C.SymIntNode'
+    DynamicShapesExportTests.test_export_with_constant_tuple_nonzero_dynamic_shapes
 )
 
 
-# DynamicShapesNNModuleTests
-unittest.expectedFailure(
-    DynamicShapesNNModuleTests.test_unsupportedmethod_dynamic_shapes
-    # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
-)
-
+# DynamicShapesSubGraphTests
 unittest.expectedFailure(
-    DynamicShapesNNModuleTests.test_unsupportedmodule_dynamic_shapes
-    # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
+    DynamicShapesSubGraphTests.test_enumerate_not_break_graph_dynamic_shapes
 )
+unittest.expectedFailure(DynamicShapesSubGraphTests.test_restore_state_dynamic_shapes)
 
+# DynamicShapesUnspecTests
+# Missing decomp
+# RuntimeError: Failed running call_function <function batch_norm at 0x7f7d1ce38310>
+# (*(FakeTensor(FakeTensor(..., device='meta', size=(5, 1, 28, 28)), cpu),
+# FakeTensor(FakeTensor(..., device='meta', size=(1,)), cpu),
+#  FakeTensor(FakeTensor(..., device='meta', size=(1,)), cpu),
+#  FakeTensor(Parameter(FakeTensor(..., device='meta', size=(1,),
+#  requires_grad=True)), cpu),
+#  FakeTensor(Parameter(FakeTensor(..., device='meta', size=(1,),
+#  requires_grad=True)), cpu), False, 0.1,
+# FakeTensor(FakeTensor(..., device='meta', size=()), cpu)), **{}):
+# aten._local_scalar_dense.default
+unittest.expectedFailure(test_unspec.UnspecReproTests.test_batch_norm_act_unspec)
+
+# SymIntArrayRef expected to contain only concrete integers
 unittest.expectedFailure(
-    DynamicShapesNNModuleTests.test_self_mutating1_dynamic_shapes
-    # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
+    DynamicShapesUnspecTests.test_unspec_float_precision_dynamic_shapes
 )
 
+# DynamicShapesReproTests
 unittest.expectedFailure(
-    DynamicShapesNNModuleTests.test_call_fn_with_non_const_inputs_safe_dynamic_shapes
-    # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
+    DynamicShapesReproTests.test_reformer_eval_dynamic_shapes
+    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
 )
 
-
-# DynamicShapesExportTests
-unittest.expectedFailure(
-    DynamicShapesExportTests.test_export_compare_optimize_with_make_fx_dynamic_shapes
-)
-unittest.expectedFailure(
-    DynamicShapesExportTests.test_export_with_constant_list_nonzero_dynamic_shapes
-)
-unittest.expectedFailure(
-    DynamicShapesExportTests.test_export_with_constant_list_nonzero_free_function_dynamic_shapes
-)
 unittest.expectedFailure(
-    DynamicShapesExportTests.test_export_with_constant_tuple_nonzero_dynamic_shapes
-)
-unittest.expectedFailure(
-    DynamicShapesExportTests.test_export_with_stack_trace_dynamic_shapes
-)
-unittest.expectedFailure(
-    DynamicShapesExportTests.test_zeroes_in_new_shape_scalar_out_dynamic_shapes
-)
-unittest.expectedFailure(
-    DynamicShapesExportTests.test_zeroes_in_new_shape_scalar_out_permute_dupe_and_bypass_dynamic_shapes
-)
-unittest.expectedFailure(
-    DynamicShapesExportTests.test_zeroes_in_new_shape_scalar_out_permute_dynamic_shapes
+    DynamicShapesReproTests.test_reformer_sorting_dynamic_shapes
+    # Unable to cast Python instance to C++ type
 )
 
-
-# DynamicShapesSubGraphTests
 unittest.expectedFailure(
-    DynamicShapesSubGraphTests.test_enumerate_not_break_graph_dynamic_shapes
+    DynamicShapesReproTests.test_reformer_train_dynamic_shapes
+    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
 )
-unittest.expectedFailure(DynamicShapesSubGraphTests.test_restore_state_dynamic_shapes)
 
 
 if __name__ == "__main__":
diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index a157926422c8b..21c0d2004bb9e 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -71,6 +71,32 @@ def func(x):
 
         self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    def test_export_shape_control_flow_1(self):
+        def func(x):
+            if x.shape[0] > 10:
+                return x.cos()
+            return x.sin()
+
+        opt_func = torch._dynamo.optimize("eager")(func)
+        real_result = opt_func(torch.ones(6, 4))
+
+        torch._dynamo.reset()
+
+        exported = torch._dynamo.export(func, torch.ones(6, 4))
+        out_graph, out_guards = exported
+
+        dynamo_result = out_graph(torch.ones(6, 4))
+
+        self.assertTrue(torch._dynamo.utils.same(real_result, dynamo_result))
+        hit = False
+        for guard in out_guards:
+            if guard.name == "symbolic_shape_expression":
+                hit = True
+                self.assertTrue("x.size()[0] <= 10" in guard.code_list)
+
+        self.assertTrue(hit)
+
     def test_export_graph_bypass(self):
         inp = [
             torch.tensor([0.1, 0.1]),
diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index a8bf86e46411b..e270852fc5269 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -1144,6 +1144,7 @@ def fn(x):
         torch._dynamo.run()(fn2)(torch.randn(4))
         self.assertEqual(cnts2.frame_count, 0)
 
+    @patch.object(torch._dynamo.config, "suppress_errors", True)
     def test_nested_disable_decorator(self):
         cnts = torch._dynamo.testing.CompileCounter()
 
@@ -1616,6 +1617,7 @@ def fn(x, func):
         self.assertEqual(cnts.op_count, 1)
 
     @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
+    @patch.object(torch._dynamo.config, "suppress_errors", True)
     def test_unsupported_fake_tensor(self):
         def f(x):
             return torch.quantize_per_tensor(x, 0.1, 10, torch.quint8)
diff --git a/test/dynamo/test_no_fake_tensors.py b/test/dynamo/test_no_fake_tensors.py
index df511f1affd55..f7943c1d7ab90 100644
--- a/test/dynamo/test_no_fake_tensors.py
+++ b/test/dynamo/test_no_fake_tensors.py
@@ -1,6 +1,4 @@
 # Owner(s): ["module: dynamo"]
-import unittest
-
 from torch._dynamo.testing import make_test_cls_with_patches
 
 try:
@@ -25,9 +23,6 @@ def make_no_fake_cls(cls):
 NoFakeTensorsNNModuleTests = make_no_fake_cls(test_modules.NNModuleTests)
 NoFakeTensorsUnspecTests = make_no_fake_cls(test_unspec.UnspecTests)
 
-unittest.expectedFailure(
-    NoFakeTensorsReproTests.test_guard_fail_tensor_bool_no_fake_tensors
-)
 NoFakeTensorsReproTests.test_numpy_list_no_fake_tensors.__unittest_expecting_failure__ = (
     False
 )
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 913d59322ac76..6a1c654a4873f 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -11,6 +11,8 @@
 from typing import List
 from unittest.mock import patch
 
+import functorch._src.config
+
 import numpy as np
 import torch
 
@@ -803,7 +805,6 @@ def test_do_paste_mask(self):
         )
 
         self.assertGreaterEqual(torch._dynamo.utils.counters["frames"]["ok"], 3)
-        # Graph break because of dynamic slicing
         self.assertEqual(
             torch._dynamo.utils.counters["frames"]["total"],
             torch._dynamo.utils.counters["frames"]["ok"] + 1,
@@ -961,7 +962,7 @@ def test_maml_item_capture(self):
 
         self.assertEqual(cnt.frame_count, ifdyn(3, 2))
         # TODO(jansel): figure out why op count depends on imports
-        self.assertIn(cnt.op_count, (36, 35, 29, 28))
+        self.assertIn(cnt.op_count, (36, 35, 34, 29, 28, 27))
 
     # see: https://github.com/pytorch/pytorch/issues/80067
     @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
@@ -980,7 +981,7 @@ def test_maml_no_item_capture(self):
 
         self.assertEqual(cnt.frame_count, ifdyn(5, 4))
         # TODO(jansel): figure out why op count depends on imports
-        self.assertIn(cnt.op_count, (31, 36, 35, 29, 28))
+        self.assertIn(cnt.op_count, (31, 36, 35, 34, 29, 28))
 
     def test_hf_model_output(self):
         ex = ModelOutput(a=torch.randn(10), b=torch.randn(10), c=torch.randn(10))
@@ -1316,6 +1317,7 @@ def blah(self, x):
         self.assertGreaterEqual(torch._dynamo.utils.counters["frames"]["ok"], 3)
         self.assertGreaterEqual(torch._dynamo.utils.counters["frames"]["total"], 3)
 
+    @patch.object(torch._dynamo.config, "suppress_errors", True)
     def test_guard_fail_tensor_bool(self):
         @torch._dynamo.skip
         def fn():
@@ -1402,8 +1404,17 @@ def fn(x):
         self.assertTrue(same(ref1, res1))
 
     @unittest.skipIf(not HAS_REFS, "requires recent PT version")
-    @unittest.expectedFailure
     def test_primtorch(self):
+        @torch._dynamo.optimize("eager")
+        def fn(x):
+            torch._refs.abs(x)
+
+        fn(torch.randn(3))
+
+    @unittest.skipIf(not HAS_REFS, "requires recent PT version")
+    @unittest.expectedFailure
+    # inline_call [('inline in skipfiles: bind ...python3.10/inspect.py', 1)]
+    def test_primtorch_no_graph_break(self):
         @torch._dynamo.optimize("eager", nopython=True)
         def fn(x):
             torch._refs.abs(x)
@@ -1456,14 +1467,14 @@ def fn(x):
 
         fn(torch.randn(3))
 
-    # AssertionError: ABCMeta
+    # Bug with storage meta - torch.BoolStorage is becoming torch.storage._LegacyStorageMeta
     @unittest.expectedFailure
     def test_isinstance_storage(self):
         @torch._dynamo.optimize("eager")
         def fn(x):
             f = bytearray([0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x40])
             bools = torch.BoolStorage.from_buffer(f, "big")
-            self.assertTrue(isinstance(bools, torch.BoolStorage))
+            assert isinstance(bools, torch.BoolStorage)
             return x
 
         fn(torch.randn(3))
@@ -1662,6 +1673,21 @@ def fn(x):
         opt_fn(x)
         self.assertEqual(cnt.frame_count, 1)
 
+    @patch.object(functorch._src.config, "use_dynamic_shapes", True)
+    def test_bigbird_unsqueeze_inplace(self):
+        def fn(reshape_2):
+            view_2 = reshape_2.clone()
+            view_2.unsqueeze_(2)
+            cat_11 = torch.cat([view_2], dim=2)
+            view_13 = cat_11.view((2, 12, 64, -1))
+            return (view_13,)
+
+        x = torch.randn(2, 12, 64, 64, requires_grad=True)
+        ref = fn(x)
+        opt_fn = torch._dynamo.optimize("aot_eager")(fn)
+        res = opt_fn(x)
+        self.assertTrue(same(ref, res))
+
     # This doesn't work without fake tensors but I don't care
     @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
     def test_issue1466_size_aot_autograd(self):
diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py
index fd5396981b740..e46d79208de02 100644
--- a/test/dynamo/test_unspec.py
+++ b/test/dynamo/test_unspec.py
@@ -50,6 +50,8 @@ class UnspecTest(cls):
 UnspecReproTests = make_unspec_cls(test_repros.ReproTests)
 UnspecNNModuleTests = make_unspec_cls(test_modules.NNModuleTests)
 
+unittest.expectedFailure(UnspecReproTests.test_batch_norm_act_unspec)
+
 
 @patch.object(torch._dynamo.config, "specialize_int_float", False)
 class UnspecTests(torch._dynamo.test_case.TestCase):
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 5cee29920b777..3d384efea0aec 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -279,6 +279,7 @@ def process(device_type):
     "baddbmm": {f16},
     "bernoulli": {f16, f32, f64},
     "bincount": {i32, i64},
+    "bucketize": {b8, f16, f32, f64, i32, i64},
     "chalf": {b8, f16, f32, f64, i32, i64},
     "cholesky": {f32, f64},
     "combinations": {b8, f16, f32, f64, i32, i64},
diff --git a/torch/_dynamo/codegen.py b/torch/_dynamo/codegen.py
index 2ba29981c3668..e469ce02ebd64 100644
--- a/torch/_dynamo/codegen.py
+++ b/torch/_dynamo/codegen.py
@@ -14,6 +14,7 @@
 from .variables.base import VariableTracker
 from .variables.nn_module import NNModuleVariable
 from .variables.tensor import (
+    DynamicShapeVariable,
     TensorVariable,
     TensorWithTFOverrideVariable,
     UnspecializedNumpyVariable,
@@ -95,6 +96,7 @@ def __call__(self, value, allow_cache=True):
             value,
             (
                 TensorVariable,
+                DynamicShapeVariable,
                 TensorWithTFOverrideVariable,
                 UnspecializedNumpyVariable,
                 UnspecializedPythonVariable,
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index d4903964aac6c..9cbcb93fcc5cc 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -92,7 +92,7 @@ def __hash__(self):
 
     def sort_key(self):
         return (
-            self.source.value,
+            self.source.value if self.source else -1,
             len(self.name),
             self.name,
             self.create_fn.__code__.co_firstlineno,
@@ -128,7 +128,7 @@ def __getattr__(self, x):
 
     def __str__(self):
         s = f"""
-            {self.source.name.lower()} {repr(self.name)} {self.create_fn.__name__}
+            {self.source.name.lower() if self.source else ""} {repr(self.name)} {self.create_fn.__name__}
             {{
                 'guard_types': {self.guard_types},
                 'code': {self.code_list},
@@ -438,6 +438,13 @@ def GRAD_MODE(self, guard: Guard):
             code = "not ___is_grad_enabled()"
         self._produce_guard_code(guard, [code])
 
+    # This is a bit of a crutch for export case for symbolic shape guards.
+    # SYMBOL_MATCH is only ever, and must only ever, be used for setting this value on
+    # the create_fn field for tracking guards in export.
+    @staticmethod
+    def SYMBOL_MATCH():
+        pass
+
     def TENSOR_MATCH(self, guard: Guard):
         if guard.is_nn_module():
             self.ID_MATCH(guard)
@@ -537,10 +544,14 @@ def tensor_ref_as_str(tensor_ref, id_to_name_map):
             return f"{id_to_name_map[tensor_ref.ref_id]}.{tensor_ref.kind}()[{tensor_ref.idx}]"
         return f"{id_to_name_map[tensor_ref.ref_id]}.{tensor_ref.kind}()"
 
-    def __init__(self, expr_to_tensor_ref, id_to_name_map):
+    def __init__(
+        self, expr_to_tensor_ref, id_to_name_map, shape_env, intermediary_symbols
+    ):
         super().__init__()
         self.expr_to_tensor_ref = expr_to_tensor_ref
         self.id_to_name_map = id_to_name_map
+        self.shape_env = shape_env
+        self.intermediary_symbols = intermediary_symbols
 
     def _print_Symbol(self, expr) -> str:
         assert isinstance(expr, sympy.Symbol)
@@ -548,7 +559,7 @@ def _print_Symbol(self, expr) -> str:
             return "0"
         if expr == 1:
             return "1"
-        assert expr in self.expr_to_tensor_ref, f"Unknown expression {expr}"
+        assert expr in (self.expr_to_tensor_ref) or (expr in self.intermediary_symbols)
         refs = self.expr_to_tensor_ref[expr]
         if len(refs) == 0:
             return super()._print_Symbol(expr)
@@ -599,7 +610,7 @@ def combine_scopes(left, right):
             if not config.guard_nn_modules and guard.is_nn_module():
                 continue
             guard.create(local_builder, global_builder)
-        self.check_fn = self.compile_check_fn(local_builder, global_builder)
+        self.check_fn = self.compile_check_fn(local_builder, global_builder, guards)
         self._seen_ids.clear()
 
     """
@@ -632,7 +643,12 @@ def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids
             return None
 
         expr_to_tensor_ref = {}
-        guard_printer = DynamoGuardPrinter(expr_to_tensor_ref, id_to_name_map)
+        guard_printer = DynamoGuardPrinter(
+            expr_to_tensor_ref,
+            id_to_name_map,
+            self.output_graph.shape_env,
+            self.output_graph.intermediary_symbols,
+        )
 
         # tensor_check_names is the primary tensor association mechanism in dynamo.
         # All other guards installations are driven off of it, so these ones will too.
@@ -649,7 +665,6 @@ def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids
                     if obj_expr not in expr_to_tensor_ref:
                         expr_to_tensor_ref[obj_expr] = {}
                     expr_to_tensor_ref[obj_expr][tensor_ref] = ""
-            finished_expressions.append(f"isinstance({name}, torch.Tensor)")
 
         guard_expression = self.output_graph.shape_env.get_guard_expr()
         expr_as_str = guard_printer.doprint(guard_expression)
@@ -668,7 +683,6 @@ def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids
 
             if len(equality_candidates) > 1:
                 equality_expr = " == ".join(equality_candidates)
-                # breakpoint()
                 finished_expressions.append(equality_expr)
 
         # Redundant with code_parts, but allows us to wrap it with parens nicely.
@@ -678,7 +692,7 @@ def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids
         expression = " and ".join(finished_expressions)
         return f"({expression})"
 
-    def compile_check_fn(self, local_builder, global_builder):
+    def compile_check_fn(self, local_builder, global_builder, guards_out):
         assert not (set(local_builder.argnames) & set(global_builder.argnames))
         # see parallel handling of ".0" / "___implicit0" in _eval_frame.c
         args = [a for a in local_builder.scope.keys() if a == "___implicit0"]
@@ -707,10 +721,6 @@ def compile_check_fn(self, local_builder, global_builder):
             symbolic_shape_expression = self._parse_symbolic_shape_expressions(
                 tensor_check_names, tensor_check_ids
             )
-            if symbolic_shape_expression:
-                code_parts.append(symbolic_shape_expression)
-                verbose_code_parts.append(symbolic_shape_expression)
-
             tensor_check_examples = (
                 local_builder.tensor_check_examples
                 + global_builder.tensor_check_examples
@@ -725,6 +735,17 @@ def compile_check_fn(self, local_builder, global_builder):
                 tensor_check_names + ["tensor_check_names=tensor_check_names"]
             )
             verbose_code_parts.append(f"___check_tensors_verbose({verbose_args})")
+            if symbolic_shape_expression:
+                code_parts.append(symbolic_shape_expression)
+                verbose_code_parts.append(symbolic_shape_expression)
+                guards_out.add(
+                    Guard(
+                        name="symbolic_shape_expression",
+                        source=None,
+                        create_fn=GuardBuilder.SYMBOL_MATCH,
+                        code_list=symbolic_shape_expression,
+                    )
+                )
 
         def direct_equality(a, b):
             return a == b
@@ -739,6 +760,8 @@ def direct_negation(a, b):
                 ("___check_tensors", check_tensors_fn),
                 ("___check_tensors_verbose", check_tensors_verbose_fn),
                 ("tensor_check_names", tensor_check_names),
+                ("floor", math.floor),
+                ("ceiling", math.ceil),
                 ("Eq", direct_equality),
                 ("Ne", direct_negation),
                 ("Mod", sympy.Mod),
diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index b3f6ed79eb06f..c4ed04ca8c39d 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -15,7 +15,7 @@
 if fake_tensors_available:
     from torch._subclasses import FakeTensorMode  # noqa: F401
 
-    from ..utils import deepcopy_to_fake_tensor, wrap_to_fake_tensor
+    from ..utils import deepcopy_to_fake_tensor
 
 
 class ShapeAliasingAndMutationProp(ShapeProp):
@@ -122,9 +122,26 @@ def has_mutation(gm, example_inputs, inputs_only=False):
     # TODO - moco gives bad accuracy with Aliasing. gm is getting mutated in a bad way.
 
     if fake_tensors_available and config.fake_tensor_propagation:
-        with FakeTensorMode() as fake_mode:
-            pass
-        fake_wrapper = functools.partial(wrap_to_fake_tensor, fake_mode=fake_mode)
+
+        def _wrap_to_fake_tensor(t, *, f_mode):
+            if type(t) in (torch.Tensor, torch.nn.Parameter):
+                static_shapes_ = config.dynamic_shapes is False
+                return fake_mode.from_tensor(
+                    t, static_shapes=config.dynamic_shapes is not False
+                )
+            else:
+                return t
+
+        # Our analysis pass should use dynamic shape tensor inputs
+        # when dynamic shapes are enabled.
+        # We don't actually care about the guards that are created
+        # on those shapes though, so just create a fresh ShapeEnv here.
+        from torch.fx.experimental.symbolic_shapes import ShapeEnv
+
+        fake_mode = FakeTensorMode(
+            shape_env=ShapeEnv() if config.dynamic_shapes else None
+        )
+        fake_wrapper = functools.partial(_wrap_to_fake_tensor, f_mode=fake_mode)
         example_inputs = tree_map(fake_wrapper, example_inputs)
         new_gm = deepcopy_to_fake_tensor(gm, fake_mode)
         with fake_mode.restore() if hasattr(fake_mode, "restore") else fake_mode:
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 49f9a4397dd99..a56a74ad5aeae 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -140,9 +140,13 @@ class AotNop(AotAutogradStrategy):
     """Useful for debugging purpose"""
 
     def candidate(self):
+        from functorch._src.compilers import debug_nop
         from functorch.compile import nop
 
-        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, fw_compiler=nop)
+        DEBUG = False
+        return BACKENDS["aot_autograd"](
+            self.gm, self.example_inputs, fw_compiler=debug_nop if DEBUG else nop
+        )
 
 
 aot_eager = AotNop.compile_fn
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 9dd9a713a25cd..ee5079581be76 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -6,7 +6,7 @@
 import re
 import traceback
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch.nn
 from torch import fx
@@ -15,7 +15,7 @@
 from . import config, logging as torchdynamo_logging, variables
 from .bytecode_transformation import create_instruction, Instruction, unique_id
 from .codegen import PyCodegen
-from .exc import BackendCompilerFailed, unimplemented
+from .exc import BackendCompilerFailed
 from .guards import GuardBuilder
 from .mutation_guard import is_dynamic_nn_module
 from .side_effects import SideEffects
@@ -27,9 +27,10 @@
     fake_tensors_available,
     format_graph_tabular,
 )
-from .variables.builder import VariableBuilder
+from .variables.builder import VariableBuilder, wrap_fx_proxy
 from .variables.nn_module import NNModuleVariable
 from .variables.tensor import (
+    DynamicShapeVariable,
     TensorVariable,
     UnspecializedNumpyVariable,
     UnspecializedPythonVariable,
@@ -93,7 +94,7 @@ def __init__(
         self.side_effects = SideEffects()
         self.code_options = dict(code_options)
         self.output_instructions = []
-        # Node => computed real value (see TensorVariable.get_real_value)
+        # Node => computed real value (see utils.get_real_value)
         self.real_value_cache = {}
 
         # Not checkpointed
@@ -107,6 +108,7 @@ def __init__(
         self.unspec_variable_map = {}
         self.shape_env = ShapeEnv() if config.dynamic_shapes else None
         self.tensor_id_to_sym_shape_ref = {}
+        self.intermediary_symbols = {}
 
     @property
     def output(self):
@@ -194,43 +196,63 @@ def update_co_names(self, name):
                 name,
             )
 
-    def register_attr_or_module(self, mod: torch.nn.Module, *names, **options):
-        if is_dynamic_nn_module(mod):
-            return variables.UnspecializedNNModuleVariable(mod, **options)
+    def register_attr_or_module(
+        self, target: Union[torch.nn.Module, torch.Tensor, Any], *names, **options
+    ):
+        if is_dynamic_nn_module(target):
+            return variables.UnspecializedNNModuleVariable(target, **options)
 
         options = dict(options)
         options["guards"] = set(options.get("guards", []))
         source: Source = options.get("source", None)
-        if isinstance(mod, torch.Tensor):
+        if isinstance(target, torch.Tensor):
             if source:
                 options["guards"].add(source.make_guard(GuardBuilder.TENSOR_MATCH))
 
             def wrap_name(module_key):
-                return TensorVariable.create(
+                return wrap_fx_proxy(
                     self,
                     self.create_proxy("get_attr", module_key, tuple(), {}),
-                    example_value=mod,
+                    example_value=target,
                     **options,
                 )
 
-        elif isinstance(mod, torch.nn.Module):
-            assert isinstance(mod, torch.nn.Module)
+        elif isinstance(target, torch.nn.Module):
+            assert isinstance(target, torch.nn.Module)
             options["guards"].add(source.make_guard(GuardBuilder.NN_MODULE))
 
             def wrap_name(module_key):
-                return NNModuleVariable(type(mod), module_key, **options)
+                return NNModuleVariable(type(target), module_key, **options)
+
+        elif isinstance(target, (torch.SymInt, torch.SymFloat)):
+            # HACKY CODE REGION BEGIN
+            # WE ARE PIGGYBACKING ON EXISTING INFRA TO REGISTER ATTRS
+            # This ultimately gets written to self.nn_modules, which is unfortunate
+            # Attrs that are tenors and symints and such need to be migrated to have their
+            # own storage
+            # alas, this is like this for now
+            self.intermediary_symbols.update({target.get_pyobj().expr: None})
+
+            def wrap_name(module_key):
+                return DynamicShapeVariable.create(
+                    self,
+                    self.create_proxy("get_attr", module_key, tuple(), {}),
+                    dyn_shape=target,
+                    **options,
+                )
 
+            # HACKY CODE REGION END
         else:
 
             def wrap_name(module_key):
                 self.output.update_co_names(module_key)
-                self.root_globals[module_key] = mod
+                self.root_globals[module_key] = target
                 return VariableBuilder(self, ConstantSource(source_name=module_key))(
-                    mod
+                    target
                 )
 
         for k, v in self.nn_modules.items():
-            if v is mod:
+            if v is target:
                 # it already exists
                 return wrap_name(k)
 
@@ -246,7 +268,7 @@ def wrap_name(module_key):
         base = name
         for i in itertools.count():
             if name not in self.nn_modules:
-                self.nn_modules[name] = mod
+                self.nn_modules[name] = target
                 return wrap_name(name)
             name = f"{base}_{i}"
 
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index e06f62a6bf628..88e0df5470bc8 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -55,7 +55,7 @@
     istype,
 )
 from .variables.base import MutableLocal, typestr, VariableTracker
-from .variables.builder import VariableBuilder
+from .variables.builder import VariableBuilder, wrap_fx_proxy
 from .variables.builtin import BuiltinVariable
 from .variables.constant import ConstantVariable
 from .variables.dicts import ConstDictVariable
@@ -81,7 +81,7 @@
     WithExitFunctionVariable,
 )
 from .variables.nn_module import NNModuleVariable
-from .variables.tensor import TensorVariable
+from .variables.tensor import DynamicShapeVariable, TensorVariable
 from .variables.torch import TorchVariable
 from .variables.user_defined import UserDefinedVariable
 
@@ -129,7 +129,9 @@ def inner(self: "InstructionTranslatorBase", inst: Instruction):
             if truth_fn(value.as_python_constant()):
                 push and self.push(value)
                 self.jump(inst)
-        elif isinstance(value, TensorVariable) and self.should_compile_partial_graph():
+        elif (
+            isinstance(value, (TensorVariable)) and self.should_compile_partial_graph()
+        ):
             # compile a partial subgraph prefix then jump into user code
             self.push(value)
             self.output.compile_subgraph(
@@ -155,6 +157,11 @@ def inner(self: "InstructionTranslatorBase", inst: Instruction):
             if truth_fn(len(value.unpack_var_sequence(self))):
                 push and self.push(value)
                 self.jump(inst)
+        elif isinstance(value, DynamicShapeVariable):
+            eval_result = value.evaluate_expr(self.output)
+            if truth_fn(eval_result):
+                push and self.push(value)
+                self.jump(inst)
         else:
             unimplemented(f"generic_jump {typestr(value)}")
 
@@ -700,6 +707,7 @@ def COMPARE_OP(self, inst):
                 left,
                 (
                     TensorVariable,
+                    DynamicShapeVariable,
                     NNModuleVariable,
                     BaseListVariable,
                     UserDefinedVariable,
@@ -717,16 +725,6 @@ def COMPARE_OP(self, inst):
                     supported_is_const[op](object(), right.value), **options
                 )
             )
-        elif (
-            isinstance(left, TensorVariable) or isinstance(right, TensorVariable)
-        ) and op in supported_tensors:
-            self.push(
-                TensorVariable.create(
-                    self,
-                    supported_tensors[op](left.as_proxy(), right.as_proxy()),
-                    **options,
-                )
-            )
         elif (
             left.is_python_constant()
             and right.is_python_constant()
@@ -741,6 +739,28 @@ def COMPARE_OP(self, inst):
                     **options,
                 )
             )
+        elif (
+            isinstance(left, TensorVariable) or isinstance(right, TensorVariable)
+        ) and op in supported_tensors:
+            self.push(
+                wrap_fx_proxy(
+                    self,
+                    supported_tensors[op](left.as_proxy(), right.as_proxy()),
+                    **options,
+                )
+            )
+        elif (
+            isinstance(left, DynamicShapeVariable)
+            or isinstance(right, DynamicShapeVariable)
+        ) and op in supported_tensors:
+            self.push(
+                DynamicShapeVariable.create(
+                    self,
+                    supported_tensors[op](left.as_proxy(), right.as_proxy()),
+                    dyn_shape=None,
+                    **options,
+                )
+            )
         elif op in ("in", "not in"):
             self.push(right.call_method(self, "__contains__", [left], {}))
             if op == "not in":
@@ -1029,12 +1049,12 @@ def UNPACK_SEQUENCE(self, inst):
         elif isinstance(seq, TensorVariable):
             proxy = seq.as_proxy()
             for i in reversed(range(inst.argval)):
-                self.push(TensorVariable.create(self, proxy[i], **options))
+                self.push(wrap_fx_proxy(self, proxy[i], **options))
         elif isinstance(seq, GetAttrVariable) and isinstance(seq.obj, TensorVariable):
             # x, y = a.shape
             proxy = getattr(seq.obj.as_proxy(), seq.name)
             for i in reversed(range(inst.argval)):
-                self.push(TensorVariable.create(self, proxy[i], **options))
+                self.push(wrap_fx_proxy(self, proxy[i], **options))
         else:
             unimplemented(f"UNPACK_SEQUENCE {seq}")
 
@@ -1109,7 +1129,8 @@ def FORMAT_VALUE(self, inst):
             fmt_spec = ConstantVariable("")
 
         value = self.pop()
-
+        if isinstance(value, DynamicShapeVariable):
+            value = ConstantVariable(str(value.dyn_shape))
         if (flags & 0x03) == 0x01:
             value = BuiltinVariable(str).call_function(self, [value], {})
         elif (flags & 0x03) == 0x02:
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 067a808073743..0b87be7393b52 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -29,7 +29,9 @@
 
 import torch
 from torch import fx
+from torch._dispatch.python import enable_python_dispatcher
 from torch.nn.modules.lazy import LazyModuleMixin
+from torch.utils._pytree import tree_map
 
 from . import config, logging as torchdynamo_logging
 
@@ -679,10 +681,8 @@ def rename_implicit(v):
         UnsupportedFakeTensorException,
     )
 
-    def make_fake_tensor(e, fake_mode, tx=None):
-        fake_tensor = fake_mode.from_tensor(
-            e, static_shapes=config.dynamic_shapes is False
-        )
+    def make_fake_tensor(e, fake_mode, static_shapes=False, tx=None):
+        fake_tensor = fake_mode.from_tensor(e, static_shapes=static_shapes)
         if tx is not None:
             from torch._dynamo.guards import TensorReference
 
@@ -728,13 +728,23 @@ def wrap_fake_exception(fn):
 
     def wrap_to_fake_tensor(e, fake_mode):
         if type(e) in (torch.Tensor, torch.nn.Parameter):
-            return wrap_fake_exception(lambda: make_fake_tensor(e, fake_mode))
+            return wrap_fake_exception(
+                lambda: make_fake_tensor(
+                    e, fake_mode, static_shapes=config.dynamic_shapes is False
+                )
+            )
         else:
             return e
 
     def wrap_to_fake_tensor_and_record(e, tx):
         if type(e) in (torch.Tensor, torch.nn.Parameter):
-            return wrap_fake_exception(lambda: make_fake_tensor(e, tx.fake_mode, tx))
+            static_shapes = config.dynamic_shapes is False
+            if type(e) is torch.nn.Parameter:
+                # Always static for params
+                static_shapes = True
+            return wrap_fake_exception(
+                lambda: make_fake_tensor(e, tx.fake_mode, static_shapes, tx)
+            )
         else:
             return e
 
@@ -997,3 +1007,116 @@ def _get_debug_dir(root_dir):
 def get_debug_dir():
     debug_root = config.debug_dir_root
     return _get_debug_dir(debug_root)
+
+
+def get_fake_value(node, tx):
+    """
+    Run the computation represented by `node` using fake tensors and return the result.
+    """
+    from .exc import TorchRuntimeError, unimplemented, Unsupported
+
+    op = node.op
+    fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
+
+    def visit(n: torch.fx.Node):
+        return n.meta["example_value"]
+
+    args, kwargs = torch.fx.node.map_arg((node.args, node.kwargs), visit)
+    args = tree_map(fake_wrapper, args)
+    kwargs = tree_map(fake_wrapper, kwargs)
+
+    nnmodule = None
+    if op == "call_module":
+        nnmodule = tx.output.nn_modules[node.target]
+
+        if not is_lazy_module(nnmodule):
+            nnmodule = deepcopy_to_fake_tensor(nnmodule, tx.fake_mode)
+
+    if op == "call_module" and is_lazy_module(nnmodule):
+        assert nnmodule is not None
+        # In the case of a lazy module, we want to run
+        # the pre-hooks which initialize it
+        nnmodule(*args, **kwargs)
+    try:
+        with tx.fake_mode, enable_python_dispatcher():
+            return wrap_fake_exception(
+                lambda: run_node(tx.output, node, args, kwargs, nnmodule)
+            )
+    except Unsupported:
+        raise
+    except RuntimeError as e:
+        if isinstance(e, torch._subclasses.fake_tensor.DataDependentOutputException):
+            if config.capture_scalar_outputs and node.target == "item":
+                return torch.zeros(size=(), dtype=args[0].dtype).item()
+            else:
+                unimplemented(f"data dependent operator: {e.func}")
+        elif isinstance(e, torch._subclasses.fake_tensor.DynamicOutputShapeException):
+            unimplemented(f"dynamic shape operator: {e.func}")
+        raise TorchRuntimeError() from e
+
+
+def run_node(output_graph, node, args, kwargs, nnmodule):
+    """
+    Runs a given node, with the given args and kwargs.
+
+    Behavior is dicatated by a node's op.
+
+    run_node is useful for extracting real values out of nodes.
+    See get_real_value for more info on common usage.
+
+    Note: The output_graph arg is only used for 'get_attr' ops
+    Note: The nnmodule arg is only used for 'call_module' ops
+
+    Nodes that are not call_function, call_method, call_module, or get_attr will
+    raise an AssertionError.
+    """
+    op = node.op
+    try:
+        if op == "call_function":
+            return node.target(*args, **kwargs)
+        elif op == "call_method":
+            return getattr(args[0], node.target)(*args[1:], **kwargs)
+        elif op == "call_module":
+            assert nnmodule is not None
+            return nnmodule(*args, **kwargs)
+        elif op == "get_attr":
+            return output_graph.get_submodule(node.target)
+    except Exception as e:
+        raise RuntimeError(
+            f"Failed running {op} {node.target}(*{args}, **{kwargs}):\n{e}\n(scroll up for backtrace)"
+        ) from e
+    raise AssertionError(op)
+
+
+def get_real_value(node, output_graph):
+    """
+    Run the actual computation represented by `node` and return the result.
+    This will execute any dependent nodes in the graph as well.
+    """
+    cache = output_graph.real_value_cache
+    if node in cache:
+        return cache[node]
+
+    op = node.op
+    args, kwargs = torch.fx.node.map_arg(
+        (node.args, node.kwargs),
+        lambda n: get_real_value(n, output_graph),
+    )
+
+    if op == "call_module":
+        nn_module = output_graph.nn_modules[node.target]
+        if not is_lazy_module(nn_module):
+            nn_module = copy.deepcopy(nn_module)
+        else:
+            # In the case of a lazy module, we want to run
+            # the pre-hooks which initialize it
+            nn_module(*args, **kwargs)
+    else:
+        nn_module = None
+
+    try:
+        real_value = run_node(output_graph, node, args, kwargs, nn_module)
+        cache[node] = real_value
+    except RuntimeError as e:
+        raise TorchRuntimeError() from e
+    return real_value
diff --git a/torch/_dynamo/variables/__init__.py b/torch/_dynamo/variables/__init__.py
index 8c80557e3fd01..2305afc226ac2 100644
--- a/torch/_dynamo/variables/__init__.py
+++ b/torch/_dynamo/variables/__init__.py
@@ -35,6 +35,7 @@
 )
 from .nn_module import NNModuleVariable, UnspecializedNNModuleVariable
 from .tensor import (
+    DynamicShapeVariable,
     FakeItemVariable,
     TensorVariable,
     UnspecializedNumpyVariable,
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index d3c5140fa4a97..67e506b5b435b 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -3,15 +3,19 @@
 import enum
 import functools
 import inspect
+import math
+import numbers
+import operator
 import re
 import types
 from abc import ABCMeta
-from typing import Any, List
+from typing import Any, List, Union
 
 import numpy as np
 from functorch.experimental.ops import PyOperator
 
 import torch
+from torch.fx.immutable_collections import immutable_list
 
 from .. import config, mutation_guard, replay_record, skipfiles
 from ..allowed_functions import is_allowed, is_builtin_callable, is_numpy
@@ -31,6 +35,10 @@
     TupleIteratorGetItemSource,
 )
 from ..utils import (
+    clone_input,
+    fake_tensors_available,
+    get_fake_value,
+    get_real_value,
     getfile,
     global_key_name,
     is_namedtuple,
@@ -38,11 +46,14 @@
     istensor,
     istype,
     odict_values,
+    preserve_rng_state,
     tuple_iterator,
     tuple_iterator_getitem,
     tuple_iterator_len,
+    wrap_to_fake_tensor_and_record,
 )
-from .base import MutableLocal
+
+from .base import MutableLocal, typestr
 from .builtin import BuiltinVariable
 from .constant import ConstantVariable, EnumVariable
 from .dicts import (
@@ -57,6 +68,7 @@
     ListVariable,
     NamedTupleVariable,
     RangeVariable,
+    SizeVariable,
     SliceVariable,
     TupleVariable,
 )
@@ -72,6 +84,7 @@
 )
 from .nn_module import UnspecializedNNModuleVariable
 from .tensor import (
+    DynamicShapeVariable,
     TensorVariable,
     TensorWithTFOverrideVariable,
     UnspecializedNumpyVariable,
@@ -86,6 +99,10 @@
 from .user_defined import UserDefinedClassVariable, UserDefinedObjectVariable
 
 
+class _missing:
+    pass
+
+
 @dataclasses.dataclass
 class GraphArg:
     source: Source
@@ -187,6 +204,8 @@ def make_guards(self, *guards):
 
     def _wrap(self, value):
         make_guards = self.make_guards
+        if istype(value, (torch.SymInt, torch.SymFloat)):
+            return self.wrap_sym(value)
         if istensor(value):
             return self.wrap_tensor(value)
         elif istype(value, (tuple, list, odict_values)) or is_namedtuple(value):
@@ -490,6 +509,26 @@ def tensor_should_specialize(self):
             )
         )
 
+    def wrap_sym(self, value: Union[torch.SymInt, torch.SymFloat]):
+        if not is_constant_source(self.get_source()):
+            self.tx.output.graphargs.append(GraphArg(self.get_source(), value, False))
+        elif is_constant_source(self.get_source()):
+            return self.tx.output.register_attr_or_module(
+                value,
+                re.sub(r"[^a-zA-Z0-9]+", "_", self.name),
+                source=None,
+                dyn_shape=value
+                # shape Guards live their own rich life via shape_env
+            )
+        return DynamicShapeVariable.create(
+            tx=self.tx,
+            proxy=self.tx.output.create_graph_input(
+                re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(value)
+            ),
+            dyn_shape=value
+            # shape Guards live their own rich life via shape_env
+        )
+
     def wrap_tensor(self, value: torch.Tensor):
         if self.get_source().guard_source().is_nn_module():
             return self.tx.output.register_attr_or_module(
@@ -514,7 +553,7 @@ def wrap_tensor(self, value: torch.Tensor):
                         source=None,
                         # Guards are added inside register_attr_or_module
                     )
-                tensor_variable = TensorVariable.create(
+                tensor_variable = wrap_fx_proxy(
                     tx=self.tx,
                     proxy=self.tx.output.create_graph_input(
                         re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(value)
@@ -556,14 +595,16 @@ def wrap_unspecialized_primitive(self, value):
             )
 
             if isinstance(value, np.number):
-                unspec_var = UnspecializedNumpyVariable.create(
+                unspec_var = wrap_fx_proxy_cls(
+                    UnspecializedNumpyVariable,
                     tx=self.tx,
                     proxy=proxy,
                     example_value=wrapped_value,
                     **options,
                 )
             else:
-                unspec_var = UnspecializedPythonVariable.create(
+                unspec_var = wrap_fx_proxy_cls(
+                    UnspecializedPythonVariable,
                     tx=self.tx,
                     proxy=proxy,
                     example_value=wrapped_value,
@@ -589,3 +630,190 @@ def _dataclasses_fields_lambda(obj):
             )
         items.append(UserDefinedObjectVariable(field, source=source).add_options(obj))
     return TupleVariable(items).add_options(obj)
+
+
+def wrap_fx_proxy(tx, proxy, example_value=None, **options):
+    return wrap_fx_proxy_cls(
+        target_cls=TensorVariable,
+        tx=tx,
+        proxy=proxy,
+        example_value=example_value,
+        **options,
+    )
+
+
+# Note: Unfortunate split due to some gross classes existing that subclass TensorVariable
+# Should be compositional instead
+def wrap_fx_proxy_cls(target_cls, tx, proxy, example_value=None, **options):
+    if "guards" in options and options["guards"] is not None:
+        tx.output.guards.update(options["guards"])
+
+    assert "example_value" not in proxy.node.meta
+    if not config.dynamic_propagation:
+        if isinstance(example_value, torch.Tensor):
+            options.update(target_cls.specialize(example_value))
+        return target_cls(proxy, **options)
+
+    use_fake_tensors = fake_tensors_available and config.fake_tensor_propagation
+
+    initial_example_value = example_value
+
+    def _clone_input(value):
+        if isinstance(value, torch.Tensor):
+            use_fake_tensors = fake_tensors_available and config.fake_tensor_propagation
+            # tensor subclasses will not be converted to FakeTensors and need to be cloned
+            if not use_fake_tensors or not isinstance(
+                value, torch._subclasses.fake_tensor.FakeTensor
+            ):
+                # NB: ensure strides are preserved
+                value = clone_input(value)
+
+        return value
+
+    with preserve_rng_state():
+        if example_value is None:
+            if use_fake_tensors:
+                example_value = get_fake_value(proxy.node, tx)
+            else:
+                example_value = get_real_value(proxy.node, tx.output)
+
+        else:
+            proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
+            if use_fake_tensors:
+                fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
+                example_value = fake_wrapper(example_value)
+
+    if isinstance(example_value, torch.Tensor):
+        is_parameter = isinstance(example_value, torch.nn.Parameter)
+        should_specialize = options.pop("should_specialize", False)
+        if is_parameter or should_specialize:
+            specialized_value = initial_example_value
+        else:
+            specialized_value = None
+
+        example_value = _clone_input(example_value)
+        proxy.node.meta["example_value"] = example_value
+        specialized_props = target_cls.specialize(example_value)
+        if use_fake_tensors and isinstance(
+            example_value, torch._subclasses.fake_tensor.FakeTensor
+        ):
+            specialized_props["class_type"] = (
+                torch.nn.Parameter if is_parameter else torch.Tensor
+            )
+
+        specialized_props["specialized_value"] = specialized_value
+
+        options.update(specialized_props)
+        return target_cls(proxy, **options)
+    elif (
+        hasattr(proxy.node.target, "__name__")
+        and proxy.node.target.__name__ == "set_state"
+        and isinstance(proxy.node.target.__self__, torch._C.Generator)
+        or proxy.node.target == torch.random.set_rng_state
+    ):
+        from . import TorchVariable
+
+        return TorchVariable(proxy.node.target)
+    elif (
+        proxy.node.target == torch._C._DisableFuncTorch
+        or proxy.node.target == torch.cuda._is_in_bad_fork
+    ):
+        from . import UserDefinedObjectVariable
+
+        return UserDefinedObjectVariable(example_value)
+    elif istype(example_value, (int, bool, float)) and config.dynamic_shapes:
+        proxy.node.meta["example_value"] = example_value
+        return DynamicShapeVariable.create(tx, proxy, example_value, **options)
+    elif istype(example_value, torch.Size) and config.dynamic_shapes:
+        proxy.node.meta["example_value"] = example_value
+        sizes = []
+        for i, v in enumerate(example_value):
+            proxy_i = proxy[i]
+            sizes.append(DynamicShapeVariable.create(tx, proxy_i, v, **options))
+        return SizeVariable(sizes, proxy, **options)
+    elif istype(example_value, int) and proxy.node.target in (
+        torch.seed,
+        operator.mod,
+        # some mac builds are missing torch.distributed.get_rank()
+        getattr(torch.distributed, "get_rank", _missing),
+        getattr(torch.distributed, "get_world_size", _missing),
+    ):
+        if config.dynamic_shapes:
+            proxy.node.meta["example_value"] = example_value
+            return DynamicShapeVariable.create(tx, proxy, example_value, **options)
+        else:
+            return ConstantVariable(example_value, **options)
+    elif istype(example_value, torch.Size) and all(
+        [isinstance(x, int) for x in example_value]
+    ):
+        sizes = [ConstantVariable(x) for x in example_value]
+        return SizeVariable(sizes, **options)
+    elif isinstance(example_value, (tuple, list)):
+        unpacked = []
+        for i, val in enumerate(example_value):
+            if val is None:
+                # nn.MultiheadAttention() can return None, see issue #175
+                unpacked.append(
+                    ConstantVariable(None, **options),
+                )
+            else:
+                unpacked.append(
+                    wrap_fx_proxy(
+                        tx,
+                        proxy.tracer.create_proxy(
+                            "call_function", operator.getitem, (proxy, i), {}
+                        ),
+                        example_value=val,
+                        **options,
+                    )
+                )
+        if istype(example_value, tuple):
+            return TupleVariable(unpacked, **options)
+        elif istype(example_value, (list, immutable_list)):
+            return ListVariable(unpacked, mutable_local=MutableLocal(), **options)
+        else:
+            assert (
+                example_value.__class__.__module__ == "torch.return_types"
+                or hasattr(example_value, "_fields")
+            ), ("namedtuple?")
+            return NamedTupleVariable(unpacked, example_value.__class__, **options)
+    elif example_value is None or proxy.node.target is torch.manual_seed:
+        return ConstantVariable(None, **options)
+    elif (
+        isinstance(example_value, int)
+        and proxy.node.target is torch._utils._element_size
+    ):
+        proxy.node.meta["example_value"] = example_value
+        return ConstantVariable(example_value, **options)
+    elif (
+        isinstance(example_value, numbers.Number)
+        and (proxy.node.target == "item" or proxy.node.target in {math.sqrt, math.pow})
+        and config.capture_scalar_outputs
+    ):
+        if use_fake_tensors:
+            # item raw value should not be accessed
+            return wrap_fx_proxy_cls(
+                FakeItemVariable,
+                tx=tx,
+                proxy=proxy,
+                example_value=torch.tensor(example_value),
+                **options,
+            )
+        else:
+            return wrap_fx_proxy_cls(
+                UnspecializedPythonVariable,
+                tx=tx,
+                proxy=proxy,
+                example_value=torch.tensor(example_value),
+                raw_value=None if use_fake_tensors else example_value,
+                need_unwrap=False,
+                **options,
+            )
+    elif isinstance(example_value, (torch.SymInt, torch.SymFloat)):
+        proxy.node.meta["example_value"] = example_value
+        return DynamicShapeVariable(proxy, example_value, **options)
+    else:
+        raise AssertionError(
+            "torch.* op returned non-Tensor "
+            + f"{typestr(example_value)} {proxy.node.op} {proxy.node.target}"
+        )
diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py
index 5a88f375c9c28..904ed8a49f81c 100644
--- a/torch/_dynamo/variables/builtin.py
+++ b/torch/_dynamo/variables/builtin.py
@@ -10,6 +10,7 @@
 import numpy as np
 
 import torch
+from torch.fx.experimental.symbolic_shapes import sym_float, sym_int
 
 from .. import config, variables
 from ..allowed_functions import is_allowed
@@ -26,7 +27,7 @@
 )
 from .base import MutableLocal, VariableTracker
 from .dicts import ConstDictVariable
-from .tensor import DynamicShapeVariable, FakeItemVariable
+from .tensor import DynamicShapeVariable, FakeItemVariable, UnspecializedPythonVariable
 
 log = logging.getLogger(__name__)
 
@@ -226,6 +227,7 @@ def unwrap_unspec_args_kwargs(args, kwargs):
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"
     ) -> "VariableTracker":
+        from .builder import wrap_fx_proxy, wrap_fx_proxy_cls
 
         constant_args = check_constant_args(args, kwargs)
         tensor_args = self.tensor_args(*args, **kwargs)
@@ -234,7 +236,7 @@ def call_function(
         has_constant_handler = self.can_constant_fold_through() and (
             constant_args or unspec_python_args
         )
-        assert isinstance(args, list)
+        assert isinstance(args, (list, tuple))
         assert isinstance(kwargs, dict)
 
         if (
@@ -274,7 +276,8 @@ def call_function(
                     "call_function", fn, *proxy_args_kwargs(args, kwargs), current_tx=tx
                 )
                 if any([isinstance(arg, FakeItemVariable) for arg in args]):
-                    return variables.FakeItemVariable.create(
+                    return wrap_fx_proxy_cls(
+                        FakeItemVariable,
                         tx,
                         proxy,
                         **options,
@@ -282,7 +285,8 @@ def call_function(
                 elif self.unspec_numpy_args(*args, **kwargs):
                     _args, _kwargs = self.unwrap_unspec_args_kwargs(args, kwargs)
                     raw_value = self.fn(*_args, **_kwargs)
-                    return variables.UnspecializedNumpyVariable.create(
+                    return wrap_fx_proxy_cls(
+                        variables.UnspecializedNumpyVariable,
                         tx,
                         proxy,
                         raw_value=raw_value,
@@ -298,7 +302,8 @@ def call_function(
                         if isinstance(x, variables.UnspecializedPythonVariable)
                     )
 
-                    return variables.UnspecializedPythonVariable.create(
+                    return wrap_fx_proxy_cls(
+                        UnspecializedPythonVariable,
                         tx,
                         proxy,
                         raw_value=raw_value,
@@ -312,14 +317,27 @@ def call_function(
                         args[0], variables.UnspecializedPythonVariable
                     ):
                         args[0] = args[0].convert_to_constant(tx)
-                    return variables.TensorVariable.create(tx, proxy, **options)
+                    return wrap_fx_proxy(tx, proxy, **options)
 
             except NotImplementedError:
                 unimplemented(f"partial tensor op: {self} {args} {kwargs}")
 
         # Handle cases like int(torch.seed())
-        if self.fn is int and isinstance(args[0], DynamicShapeVariable):
-            return args[0]
+        # Also handle sym_float to sym_int cases
+        if self.fn in (int, float) and isinstance(args[0], DynamicShapeVariable):
+            fn_ = sym_int if self.fn is int else sym_float
+            out = wrap_fx_proxy(
+                tx=tx,
+                proxy=tx.output.create_proxy(
+                    "call_function",
+                    fn_,
+                    (args[0].as_proxy(),),
+                    {},
+                    current_tx=tx,
+                ),
+                **options,
+            )
+            return out
 
         handler = getattr(self, f"call_{self.fn.__name__}", None)
         if handler:
@@ -353,7 +371,6 @@ def call_function(
                 ),
                 **options,
             )
-
         return super().call_function(tx, args, kwargs)
 
     def _call_min_max(self, tx, a, b):
@@ -368,7 +385,9 @@ def _call_min_max(self, tx, a, b):
 
             # Dynamic input does not get resolved, rather, gets stored as call_function
             if isinstance(a, DynamicShapeVariable):
-                return variables.TensorVariable.create(
+                from .builder import wrap_fx_proxy
+
+                return wrap_fx_proxy(
                     tx=tx,
                     proxy=tx.output.create_proxy(
                         "call_function",
@@ -437,7 +456,13 @@ def _call_min_max(self, tx, a, b):
                 return variables.ConstantVariable(max(a.value, b.value))
             else:
                 return variables.ConstantVariable(min(a.value, b.value))
+        elif isinstance(a, DynamicShapeVariable) or isinstance(b, DynamicShapeVariable):
+            proxy = tx.output.create_proxy(
+                "call_function", self.fn, *proxy_args_kwargs([a, b], {})
+            )
+            return DynamicShapeVariable.create(tx, proxy, None)
         else:
+
             unimplemented(f"unsupported min / max over args {str(a)}, {str(b)}")
 
     call_min = _call_min_max
@@ -454,11 +479,48 @@ def call_range(self, tx, *args, **kwargs):
                     **{k: v.value for k, v in kwargs.items()},
                 ),
             )
+        elif self._dynamic_args(*args, **kwargs):
+            assert len(kwargs) == 0
+
+            def guard_if_dyn(arg):
+                if isinstance(arg, DynamicShapeVariable):
+                    return arg.evaluate_expr(tx.output)
+                return arg
+
+            args = [guard_if_dyn(arg) for arg in args]
+            value = self.fn(*args)
+            return variables.RangeVariable(value=value)
+        # None no-ops this handler and lets the driving function proceed
+        return None
+
+    def _dynamic_args(self, *args, **kwargs):
+        return any([isinstance(x, DynamicShapeVariable) for x in args]) or any(
+            [isinstance(x, DynamicShapeVariable) for x in kwargs.values()]
+        )
 
     def call_slice(self, tx, *args):
         return variables.SliceVariable(args)
 
-    def _call_iter_tuple_list(self, tx, obj=None):
+    def _dyn_proxy(self, tx, *args, **kwargs):
+        assert self._dynamic_args(*args, **kwargs)
+        from .builder import wrap_fx_proxy
+
+        options = VariableTracker.propagate(self, args, kwargs.values())
+        return wrap_fx_proxy(
+            tx,
+            tx.output.create_proxy(
+                "call_function", self.fn, *proxy_args_kwargs(args, kwargs)
+            ),
+            **options,
+        )
+
+    def call_mod(self, tx, *args, **kwargs):
+        if self._dynamic_args(*args, **kwargs):
+            return self._dyn_proxy(tx, *args, **kwargs)
+
+    def _call_iter_tuple_list(self, tx, obj=None, *args, **kwargs):
+        if self._dynamic_args(*args, **kwargs):
+            return self._dyn_proxy(tx, *args, **kwargs)
         cls = variables.BaseListVariable.cls_for(self.fn)
         if obj is None:
             return cls(
@@ -551,6 +613,7 @@ def call_getitem(self, tx, *args, **kwargs):
 
     def call_isinstance(self, tx, arg, isinstance_type):
         arg_type = arg.python_type()
+
         isinstance_type = isinstance_type.as_python_constant()
 
         if isinstance(arg, variables.TensorVariable) and arg.dtype is not None:
diff --git a/torch/_dynamo/variables/constant.py b/torch/_dynamo/variables/constant.py
index d3366448e3799..63eed37ccbec2 100644
--- a/torch/_dynamo/variables/constant.py
+++ b/torch/_dynamo/variables/constant.py
@@ -13,6 +13,8 @@ class ConstantVariable(VariableTracker):
     def __init__(self, value, **kwargs):
         super(ConstantVariable, self).__init__(**kwargs)
         assert not isinstance(value, torch.Tensor)
+        assert not isinstance(value, torch.SymInt)
+        assert not isinstance(value, torch.SymFloat)
         self.value = value
 
     def as_proxy(self):
@@ -70,6 +72,8 @@ def call_method(
         args: "List[VariableTracker]",
         kwargs: "Dict[str, VariableTracker]",
     ) -> "VariableTracker":
+        from .tensor import DynamicShapeVariable
+
         options = VariableTracker.propagate(self, args, kwargs.values())
 
         if istype(self.value, tuple):
@@ -78,6 +82,20 @@ def call_method(
                 items=self.unpack_var_sequence(tx), source=self.source, **options
             ).call_method(tx, name, args, kwargs)
 
+        if any([isinstance(x, DynamicShapeVariable) for x in args]):
+            # NOTE! DANGER! THIS ONLY WORKS FOR COMMUTATIVE OPS
+            # we are relying on add to have arg[0] be a DynamicShapeVariable
+            # because we are in ConstantVariable land
+            # This transforms
+            # constant + dynamic
+            # into
+            # dynamic + constant
+            # Which already has infra built for writing to the graph
+            if name == "__add__":
+                assert len(args) == 1
+                return args[0].call_method(tx, name, [self], {})
+            # Unfortunate constant
+            return super(ConstantVariable, self).call_method(tx, name, args, kwargs)
         try:
             const_args = [a.as_python_constant() for a in args]
             const_kwargs = {k: v.as_python_constant() for k, v in kwargs.items()}
@@ -98,7 +116,19 @@ def has_arith_binop(num_ty):
             return ConstantVariable(method(*const_args, **const_kwargs), **options)
         elif has_arith_binop(int) or has_arith_binop(float):
             op = getattr(operator, name)
-            return ConstantVariable(op(self.value, const_args[0]), **options)
+            add_target = const_args[0]
+            if isinstance(add_target, (torch.SymInt, torch.SymFloat)):
+                from .tensor import DynamicShapeVariable
+
+                # Addition between a non sym and sym makes a sym
+                # dyn_shape = tx.output.register_attr_or_module(
+                #     add_target, f"sym_shape_{add_target}", source=None
+                # )
+                proxy = tx.output.create_proxy(
+                    "call_function", op, (self.value, add_target), {}
+                )
+                return DynamicShapeVariable.create(tx, proxy, add_target, **options)
+            return ConstantVariable(op(self.value, add_target), **options)
         elif name == "__len__" and not (args or kwargs):
             return ConstantVariable(len(self.value), **options)
         elif name == "__contains__" and len(args) == 1 and args[0].is_python_constant():
diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py
index f63283819f350..151619d0e4ab0 100644
--- a/torch/_dynamo/variables/lists.py
+++ b/torch/_dynamo/variables/lists.py
@@ -7,7 +7,7 @@
 from ..bytecode_transformation import create_instruction
 from ..exc import unimplemented
 from ..source import GetItemSource
-from ..utils import namedtuple_fields
+from ..utils import namedtuple_fields, proxy_args_kwargs
 from .base import MutableLocal, VariableTracker
 from .constant import ConstantVariable
 
@@ -308,6 +308,58 @@ def reconstruct(self, codegen):
         ]
         return build_torch_size
 
+    def unpack_var_sequence(self, tx):
+        return [x.add_options(self) for x in self.items]
+
+    def call_method(
+        self,
+        tx,
+        name,
+        args: "List[VariableTracker]",
+        kwargs: "Dict[str, VariableTracker]",
+    ) -> "VariableTracker":
+        options = VariableTracker.propagate(self, args, kwargs.values())
+        if name == "__getitem__":
+            assert not kwargs and len(args) == 1
+            if config.dynamic_shapes:
+                out = self.get_item_dyn(tx, args[0])
+            else:
+                out = self.getitem_const(args[0])
+            return out
+        return super(SizeVariable, self).call_method(tx, name, args, kwargs)
+
+    def get_item_dyn(self, tx, arg: VariableTracker):
+        from .tensor import DynamicShapeVariable
+
+        index = arg.as_python_constant()
+        if isinstance(index, slice):
+
+            def _dynamo_get_item_lambda(target, index):
+                return torch.Size.__getitem__(target, index)
+
+            parent_proxy = self.as_proxy()
+            proxy = tx.output.create_proxy(
+                "call_function",
+                _dynamo_get_item_lambda,
+                *proxy_args_kwargs([self, arg], {}),
+                current_tx=tx,
+            )
+            items = self.items[index]
+
+            def _unpack_into_example(item):
+                if isinstance(item, DynamicShapeVariable):
+                    return item.dyn_shape
+                return item.as_python_constant()
+
+            # Mirror the indexing into example_value for downstream correctness
+            proxy.node.meta["example_value"] = parent_proxy.node.meta["example_value"][
+                index
+            ]
+            return SizeVariable(items, proxy=proxy).add_options(arg, self)
+        else:
+            assert isinstance(index, int)
+            return self.items[index].add_options(arg, self)
+
 
 class ShapeVariable(TupleVariable):
     """
@@ -349,13 +401,20 @@ def call_hasattr(self, tx, name: str) -> "VariableTracker":
 
 class SliceVariable(BaseListVariable):
     def __init__(self, items, **kwargs):
+        from .tensor import DynamicShapeVariable
+
+        if any([isinstance(x, DynamicShapeVariable) for x in items]):
+            unimplemented("Dynamic slicing not supported")
+
+        items_to_map = items
         start, stop, step = [variables.ConstantVariable(None)] * 3
-        if len(items) == 1:
-            (stop,) = items
-        elif len(items) == 2:
-            start, stop = items
-        elif len(items) == 3:
-            start, stop, step = items
+
+        if len(items_to_map) == 1:
+            (stop,) = items_to_map
+        elif len(items_to_map) == 2:
+            start, stop = items_to_map
+        elif len(items_to_map) == 3:
+            start, stop, step = items_to_map
         else:
             raise AssertionError()
 
@@ -366,7 +425,7 @@ def __init__(self, items, **kwargs):
         # more complete support for breaking on data dependent operators.
         if not config.capture_scalar_outputs:
             for limit in (start, stop, step):
-                if isinstance(limit, variables.TensorVariable):
+                if isinstance(limit, (variables.TensorVariable, DynamicShapeVariable)):
                     unimplemented("Dynamic slicing not supported")
 
         super().__init__([start, stop, step], **kwargs)
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index da327122a6a70..5d7336cefeae7 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -513,6 +513,7 @@ def reconstruct(self, codegen):
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"
     ) -> "VariableTracker":
+        from .builder import wrap_fx_proxy
 
         # This variable is True when it corresponds to user code such as
         #
@@ -530,7 +531,7 @@ def call_function(
         if is_original_tensor_torch_function:
             # Instead of tracing inside torch.Tensor.__torch_function__,
             # record the `call_function` or `call_method` call into the graph.
-            from . import TensorVariable, TorchVariable
+            from . import TorchVariable
 
             original_torch_or_getattr_variable = args[0]
             new_args = args[2].items
@@ -540,7 +541,7 @@ def call_function(
             # example tensor from going into the override.
             with torch._C.DisableTorchFunction():
                 if isinstance(args[0], TorchVariable):
-                    return TensorVariable.create(
+                    return wrap_fx_proxy(
                         tx=tx,
                         proxy=tx.output.create_proxy(
                             "call_function",
@@ -551,7 +552,7 @@ def call_function(
                         **options,
                     )
                 elif isinstance(args[0], GetAttrVariable):
-                    return TensorVariable.create(
+                    return wrap_fx_proxy(
                         tx=tx,
                         proxy=tx.output.create_proxy(
                             "call_method",
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 1922980fc957f..848f022525d9e 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -197,8 +197,9 @@ def record_nn_module_stack():
                 # The module type will change after it is called
                 if is_lazy:
                     self.module_type = mod.cls_to_become
+                from .builder import wrap_fx_proxy
 
-                return variables.TensorVariable.create(
+                return wrap_fx_proxy(
                     tx=tx,
                     proxy=tx.output.create_proxy(
                         "call_module",
@@ -454,7 +455,9 @@ def make_attr(name):
 
             proxy_args, proxy_kwargs = proxy_args_kwargs(args, kwargs)
 
-            return variables.TensorVariable.create(
+            from .builder import wrap_fx_proxy
+
+            return wrap_fx_proxy(
                 tx=tx,
                 proxy=tx.output.create_proxy(
                     "call_method",
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index e87b1d87bac9b..8867f7e6cc93d 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -1,161 +1,28 @@
-import copy
-import functools
 import itertools
-import math
-import numbers
 import operator
 from typing import Dict, List
 
 import torch.fx
 import torch.random
 
-from ..utils import fake_tensors_available
-
-if fake_tensors_available:
-    from torch._subclasses import FakeTensor
-    from torch._subclasses.fake_tensor import (
-        DataDependentOutputException,
-        DynamicOutputShapeException,
-    )
-    from ..utils import deepcopy_to_fake_tensor, wrap_to_fake_tensor_and_record
-
-import torch.utils._python_dispatch as py_dispatch
-from torch.fx.immutable_collections import immutable_list
-from torch.utils._pytree import tree_map
-
 from .. import config, variables
-from ..exc import TorchRuntimeError, unimplemented, Unsupported
+from ..exc import unimplemented
 from ..guards import GuardBuilder
 from ..source import AttrSource
+
 from ..utils import (
-    clone_input,
-    is_lazy_module,
-    istype,
-    preserve_rng_state,
+    fake_tensors_available,
+    get_fake_value,
+    get_real_value,
     product,
     proxy_args_kwargs,
     tensortype_to_dtype,
 )
-from .base import MutableLocal, typestr, VariableTracker
+from .base import VariableTracker
 from .constant import ConstantVariable
 from .lists import ShapeVariable, SizeVariable
 
 
-class _missing:
-    pass
-
-
-def _run_node(output_graph, node, args, kwargs, nnmodule):
-    op = node.op
-    if op == "call_function":
-        return node.target(*args, **kwargs)
-    elif op == "call_method":
-        return getattr(args[0], node.target)(*args[1:], **kwargs)
-    elif op == "call_module":
-        assert nnmodule is not None
-        return nnmodule(*args, **kwargs)
-    elif op == "get_attr":
-        return output_graph.get_submodule(node.target)
-    raise AssertionError(op)
-
-
-def _get_real_value(node, output_graph):
-    """
-    Run the actual computation represented by `node` and return the result.
-    This will execute any dependent nodes in the graph as well.
-    """
-    cache = output_graph.real_value_cache
-    if node in cache:
-        return cache[node]
-
-    op = node.op
-    args, kwargs = torch.fx.node.map_arg(
-        (node.args, node.kwargs),
-        lambda n: _get_real_value(n, output_graph),
-    )
-
-    if op == "call_module":
-        nn_module = output_graph.nn_modules[node.target]
-        if not is_lazy_module(nn_module):
-            nn_module = copy.deepcopy(nn_module)
-        else:
-            # In the case of a lazy module, we want to run
-            # the pre-hooks which initialize it
-            nn_module(*args, **kwargs)
-    else:
-        nn_module = None
-
-    try:
-        real_value = _run_node(output_graph, node, args, kwargs, nn_module)
-        cache[node] = real_value
-    except RuntimeError as e:
-        raise TorchRuntimeError() from e
-    return real_value
-
-
-def _get_fake_value(node, tx):
-    """
-    Run the computation represented by `node` using fake tensors and return the result.
-    """
-    op = node.op
-    fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
-    from ..utils import wrap_fake_exception
-
-    def visit(n: torch.fx.Node):
-        return n.meta["example_value"]
-
-    args, kwargs = torch.fx.node.map_arg((node.args, node.kwargs), visit)
-    args = tree_map(fake_wrapper, args)
-    kwargs = tree_map(fake_wrapper, kwargs)
-
-    nnmodule = None
-    if op == "call_module":
-        nnmodule = tx.output.nn_modules[node.target]
-
-        if not is_lazy_module(nnmodule):
-            nnmodule = deepcopy_to_fake_tensor(nnmodule, tx.fake_mode)
-
-    def context():
-        if hasattr(py_dispatch, "enable_torch_dispatch_mode"):
-            return py_dispatch.enable_torch_dispatch_mode(tx.fake_mode)
-        else:
-            return tx.fake_mode
-
-    if op == "call_module" and is_lazy_module(nnmodule):
-        assert nnmodule is not None
-        # In the case of a lazy module, we want to run
-        # the pre-hooks which initialize it
-        nnmodule(*args, **kwargs)
-    try:
-        with context():
-            return wrap_fake_exception(
-                lambda: _run_node(tx.output, node, args, kwargs, nnmodule)
-            )
-    except Unsupported:
-        raise
-    except RuntimeError as e:
-        if isinstance(e, DataDependentOutputException):
-            if config.capture_scalar_outputs and node.target == "item":
-                return torch.zeros(size=(), dtype=args[0].dtype).item()
-            else:
-                unimplemented(f"data dependent operator: {e.func}")
-        elif isinstance(e, DynamicOutputShapeException):
-            unimplemented(f"dynamic shape operator: {e.func}")
-        else:
-            raise TorchRuntimeError() from e
-
-
-def _clone_input(value):
-    if isinstance(value, torch.Tensor):
-        use_fake_tensors = fake_tensors_available and config.fake_tensor_propagation
-        # tensor subclasses will not be converted to FakeTensors and need to be cloned
-        if not use_fake_tensors or not isinstance(value, FakeTensor):
-            # NB: ensure strides are preserved
-            value = clone_input(value)
-
-    return value
-
-
 class TensorVariable(VariableTracker):
     """A torch.Tensor input or an intermediate value in the FX graph"""
 
@@ -178,173 +45,7 @@ def get_real_value(self):
         NOTE: this runs actual tensor computation and may be
         slow and memory-intensive.
         """
-        return _get_real_value(self.proxy.node, self.proxy.tracer)
-
-    @classmethod
-    def create(cls, tx, proxy, example_value=None, **options):
-        if "guards" in options and options["guards"] is not None:
-            tx.output.guards.update(options["guards"])
-
-        assert "example_value" not in proxy.node.meta
-        if not config.dynamic_propagation:
-            if isinstance(example_value, torch.Tensor):
-                options.update(cls.specialize(example_value))
-            return cls(proxy, **options)
-
-        use_fake_tensors = fake_tensors_available and config.fake_tensor_propagation
-
-        initial_example_value = example_value
-
-        with preserve_rng_state():
-            if example_value is None:
-                if use_fake_tensors:
-                    example_value = _get_fake_value(proxy.node, tx)
-                else:
-                    example_value = _get_real_value(proxy.node, tx.output)
-
-            else:
-                proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
-                if use_fake_tensors:
-                    fake_wrapper = functools.partial(
-                        wrap_to_fake_tensor_and_record, tx=tx
-                    )
-                    example_value = fake_wrapper(example_value)
-
-        if isinstance(example_value, torch.Tensor):
-            is_parameter = isinstance(example_value, torch.nn.Parameter)
-            should_specialize = options.pop("should_specialize", False)
-            if is_parameter or should_specialize:
-                specialized_value = initial_example_value
-            else:
-                specialized_value = None
-
-            example_value = _clone_input(example_value)
-            proxy.node.meta["example_value"] = example_value
-            specialized_props = cls.specialize(example_value)
-            if use_fake_tensors and isinstance(example_value, FakeTensor):
-                specialized_props["class_type"] = (
-                    torch.nn.Parameter if is_parameter else torch.Tensor
-                )
-
-            specialized_props["specialized_value"] = specialized_value
-
-            options.update(specialized_props)
-            return cls(proxy, **options)
-        elif (
-            hasattr(proxy.node.target, "__name__")
-            and proxy.node.target.__name__ == "set_state"
-            and isinstance(proxy.node.target.__self__, torch._C.Generator)
-            or proxy.node.target == torch.random.set_rng_state
-        ):
-            from . import TorchVariable
-
-            return TorchVariable(proxy.node.target)
-        elif istype(example_value, (int, bool, float)) and config.dynamic_shapes:
-            proxy.node.meta["example_value"] = example_value
-            return DynamicShapeVariable(proxy, example_value, **options)
-        elif istype(example_value, torch.Size) and config.dynamic_shapes:
-            proxy.node.meta["example_value"] = example_value
-            sizes = []
-            for i, v in enumerate(example_value):
-                proxy_i = proxy[i]
-                proxy_i.node.meta["example_value"] = v
-                sizes.append(DynamicShapeVariable(proxy_i, v))
-            return SizeVariable(sizes, proxy, **options)
-        elif istype(example_value, int) and proxy.node.target in (
-            torch.seed,
-            operator.mod,
-            # some mac builds are missing torch.distributed.get_rank()
-            getattr(torch.distributed, "get_rank", _missing),
-            getattr(torch.distributed, "get_world_size", _missing),
-        ):
-            proxy.node.meta["example_value"] = example_value
-            return DynamicShapeVariable(proxy, example_value, **options)
-        elif istype(example_value, torch.Size) and all(
-            [isinstance(x, int) for x in example_value]
-        ):
-            sizes = [variables.ConstantVariable(x) for x in example_value]
-            return SizeVariable(sizes, **options)
-        elif isinstance(example_value, (tuple, list)):
-            unpacked = []
-            for i, val in enumerate(example_value):
-                if val is None:
-                    # nn.MultiheadAttention() can return None, see issue #175
-                    unpacked.append(
-                        variables.ConstantVariable(None, **options),
-                    )
-                else:
-                    unpacked.append(
-                        cls.create(
-                            tx,
-                            proxy.tracer.create_proxy(
-                                "call_function", operator.getitem, (proxy, i), {}
-                            ),
-                            example_value=val,
-                            **options,
-                        )
-                    )
-            if istype(example_value, tuple):
-                return variables.TupleVariable(unpacked, **options)
-            elif istype(example_value, (list, immutable_list)):
-                return variables.ListVariable(
-                    unpacked, mutable_local=MutableLocal(), **options
-                )
-            else:
-                assert (
-                    example_value.__class__.__module__ == "torch.return_types"
-                    or hasattr(example_value, "_fields")
-                ), "namedtuple?"
-                return variables.NamedTupleVariable(
-                    unpacked, example_value.__class__, **options
-                )
-        elif example_value is None or proxy.node.target is torch.manual_seed:
-            return variables.ConstantVariable(None, **options)
-        elif (
-            isinstance(example_value, int)
-            and proxy.node.target is torch._utils._element_size
-        ):
-            proxy.node.meta["example_value"] = example_value
-            return variables.ConstantVariable(example_value, **options)
-        elif (
-            isinstance(example_value, numbers.Number)
-            and (
-                proxy.node.target == "item"
-                or proxy.node.target in {math.sqrt, math.pow}
-            )
-            and config.capture_scalar_outputs
-        ):
-            if use_fake_tensors:
-                # item raw value should not be accessed
-                return FakeItemVariable.create(
-                    tx=tx,
-                    proxy=proxy,
-                    example_value=torch.tensor(example_value),
-                    **options,
-                )
-            else:
-                return UnspecializedPythonVariable.create(
-                    tx=tx,
-                    proxy=proxy,
-                    example_value=torch.tensor(example_value),
-                    raw_value=None if use_fake_tensors else example_value,
-                    need_unwrap=False,
-                    **options,
-                )
-        elif (
-            proxy.node.target == torch._C._DisableFuncTorch
-            or proxy.node.target == torch.cuda._is_in_bad_fork
-        ):
-            from . import UserDefinedObjectVariable
-
-            return UserDefinedObjectVariable(example_value)
-        elif isinstance(example_value, torch.SymInt):
-            proxy.node.meta["example_value"] = example_value
-            return cls(proxy, **options)
-        else:
-            raise AssertionError(
-                "torch.* op returned non-Tensor "
-                + f"{typestr(example_value)} {proxy.node.op} {proxy.node.target}"
-            )
+        return get_real_value(self.proxy.node, self.proxy.tracer)
 
     def __init__(
         self,
@@ -482,15 +183,26 @@ def call_method(
         kwargs: "Dict[str, VariableTracker]",
     ) -> "VariableTracker":
         from . import ConstantVariable, TupleVariable
+        from .builder import wrap_fx_proxy
 
         kwargs = dict(kwargs)
-
         options = VariableTracker.propagate(self, args, kwargs.values())
         if name == "stride" and self.stride is not None:
             constant_result = ConstantVariable(self.stride, **options)
         elif name == "size" and self.size is not None:
             sizes = [variables.ConstantVariable(x) for x in self.size]
             constant_result = SizeVariable(sizes, **options)
+        elif name == "size" and self.size is None and config.dynamic_shapes:
+            return wrap_fx_proxy(
+                tx,
+                tx.output.create_proxy(
+                    "call_method",
+                    name,
+                    *proxy_args_kwargs([self] + args, kwargs),
+                    current_tx=tx,
+                ),
+                **options,
+            )
         elif name == "numel" and self.size is not None:
             constant_result = ConstantVariable(product(self.size), **options)
         elif name in ("ndimension", "dim") and self.ndim is not None:
@@ -531,11 +243,19 @@ def call_method(
             unimplemented(f"Tensor.{name}")
         elif name == "item":
             if config.capture_scalar_outputs:
-                return self.__class__.create(
+                use_fake_tensors = (
+                    fake_tensors_available and config.fake_tensor_propagation
+                )
+                if use_fake_tensors:
+                    example_value = get_fake_value(self.proxy.node, tx)
+                else:
+                    example_value = get_real_value(self.proxy.node, tx.output).item()
+                return wrap_fx_proxy(
                     tx,
                     tx.output.create_proxy(
                         "call_method", "item", (self.as_proxy(),), {}, current_tx=tx
                     ),
+                    example_value=example_value,
                     **options,
                 )
             else:
@@ -545,7 +265,7 @@ def call_method(
                 assert not config.dynamic_shapes
                 return ConstantVariable(self.size[0], **options)
             else:
-                return self.__class__.create(
+                return wrap_fx_proxy(
                     tx,
                     tx.output.create_proxy(
                         "call_function", len, (self.as_proxy(),), {}, current_tx=tx
@@ -584,7 +304,7 @@ def call_method(
                         self.ndim = args[0].ndim
                         self.is_contiguous = (memory_format,)
 
-            return self.__class__.create(
+            return wrap_fx_proxy(
                 tx,
                 tx.output.create_proxy(
                     "call_method",
@@ -604,8 +324,7 @@ def call_method(
                 and not config.dynamic_shapes
             ):
                 name = "new_empty"
-
-            return self.__class__.create(
+            return wrap_fx_proxy(
                 tx,
                 tx.output.create_proxy(
                     "call_method",
@@ -617,13 +336,23 @@ def call_method(
             )
 
 
-class DynamicShapeVariable(TensorVariable):
+class DynamicShapeVariable(VariableTracker):
     """
     Represents a symbolic size, e.g., as returned by tensor.size(0)
     """
 
+    @classmethod
+    def create(cls, tx, proxy, dyn_shape, **options):
+        if "example_value" in proxy.node.meta:
+            assert proxy.node.meta["example_value"] == dyn_shape
+        if dyn_shape is None:
+            dyn_shape = get_fake_value(proxy.node, tx)
+        proxy.node.meta["example_value"] = dyn_shape
+        return DynamicShapeVariable(proxy, dyn_shape, **options)
+
     def __init__(self, proxy, dyn_shape, **kwargs):
-        super(DynamicShapeVariable, self).__init__(proxy, **kwargs)
+        super(DynamicShapeVariable, self).__init__(**kwargs)
+        self.proxy = proxy
         self.dyn_shape = dyn_shape
 
     def python_type(self):
@@ -632,6 +361,36 @@ def python_type(self):
     def unpack_var_sequence(self, tx):
         super(DynamicShapeVariable, self).unpack_var_sequence(tx)
 
+    def as_proxy(self):
+        return self.proxy
+
+    def evaluate_expr(self, output_graph):
+        if not isinstance(self.dyn_shape, torch.SymInt):
+            return self.dyn_shape
+        return output_graph.shape_env.evaluate_expr(self.dyn_shape.get_pyobj().expr)
+
+    def call_method(
+        self,
+        tx,
+        name,
+        args: "List[VariableTracker]",
+        kwargs: "Dict[str, VariableTracker]",
+    ) -> "VariableTracker":
+        from .builder import wrap_fx_proxy
+
+        options = VariableTracker.propagate(self, args, kwargs.values())
+
+        return wrap_fx_proxy(
+            tx,
+            tx.output.create_proxy(
+                "call_method",
+                name,
+                *proxy_args_kwargs([self] + list(args), kwargs),
+                current_tx=tx,
+            ),
+            **options,
+        )
+
 
 class TensorWithTFOverrideVariable(VariableTracker):
     """
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index c55a64cff50c7..0debfe9e9f3c9 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -1,4 +1,6 @@
 import logging
+
+import math
 import re
 import types
 from typing import Dict, List
@@ -170,7 +172,15 @@ def can_constant_fold_through(self):
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"
     ) -> "VariableTracker":
-        from . import ConstantVariable, GradModeVariable, TensorVariable
+        from . import (
+            ConstantVariable,
+            DynamicShapeVariable,
+            GradModeVariable,
+            TensorVariable,
+        )
+
+        # print("CALLING ON TORCH", self.value)
+        from .builder import wrap_fx_proxy
 
         constant_args = check_constant_args(args, kwargs)
         unspec_python_args = check_unspec_python_args(args, kwargs)
@@ -302,7 +312,7 @@ def call_function(
             def get_state_from_generator():
                 return self.value()
 
-            return TensorVariable.create(
+            return wrap_fx_proxy(
                 tx=tx,
                 proxy=tx.output.create_proxy(
                     "call_function",
@@ -338,7 +348,7 @@ def get_state_from_generator():
                 example_value = args[0].proxy.node.meta["example_value"]
 
             self.value.__module__ = self.__module__
-            return TensorVariable.create(
+            return wrap_fx_proxy(
                 tx=tx,
                 proxy=tx.output.create_proxy(
                     "call_function",
@@ -357,7 +367,7 @@ def get_state_from_generator():
         ):
             # TODO(voz): This is rewritten as a call_method because
             # torch.numel(x) w/ sym shapes raises a RuntimeError and x.numel() does not
-            return TensorVariable.create(
+            return wrap_fx_proxy(
                 tx=tx,
                 proxy=tx.output.create_proxy(
                     "call_method",
@@ -380,11 +390,21 @@ def get_state_from_generator():
                     if isinstance(x.value, numpy.generic):
                         x.value = x.value.item()
 
-            tensor_variable = TensorVariable.create(
+            # TODO(voz): Replace w/ dynamic shape rewrite table.
+            # Ideally, we would be able to do this at ctor time, but alas we need a combination
+            # of value + args to determine this.
+            fn_ = self.value
+            if any([isinstance(x, DynamicShapeVariable) for x in args]):
+                if self.value == math.sqrt:
+                    from torch.fx.experimental.symbolic_shapes import sym_sqrt
+
+                    fn_ = sym_sqrt
+
+            tensor_variable = wrap_fx_proxy(
                 tx=tx,
                 proxy=tx.output.create_proxy(
                     "call_function",
-                    self.value,
+                    fn_,
                     *proxy_args_kwargs(args, kwargs),
                     current_tx=tx,
                 ),
@@ -450,7 +470,9 @@ def _call_softmax(self, tx, args, kwargs, options):
         dim = args[0] if args else kwargs.get("dim", variables.ConstantVariable(None))
 
         def fake_softmax(input):
-            return variables.TensorVariable.create(
+            from .builder import wrap_fx_proxy
+
+            return wrap_fx_proxy(
                 tx=tx,
                 proxy=tx.output.create_proxy(
                     "call_function",
@@ -502,7 +524,9 @@ def normalize_args(
         ) = normalize_args(*args, **kwargs)
 
         def fake_cross_entropy_loss(input, target):
-            return variables.TensorVariable.create(
+            from .builder import wrap_fx_proxy
+
+            return wrap_fx_proxy(
                 tx=tx,
                 proxy=tx.output.create_proxy(
                     "call_function",
@@ -577,6 +601,7 @@ def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"
     ) -> "VariableTracker":
         from . import ListVariable, TensorVariable, UserFunctionVariable
+        from .builder import wrap_fx_proxy
 
         assert kwargs is None or len(kwargs) == 0, "kwargs are not supported, yet"
 
@@ -688,7 +713,7 @@ def register_as_subgraph(fn, name, args):
             p_args[2] = false_node
 
         # Store the invocation as a call
-        return variables.TensorVariable.create(
+        return wrap_fx_proxy(
             tx=tx,
             proxy=tx.output.create_proxy(
                 "call_function",
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 14f5cd2de0a7a..65f571f93ec09 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1081,7 +1081,9 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
 
         # clone will get called in Parameter deepcopy
         if func == torch._C._TensorBase.clone:
-            return func(self.fake_mode.from_tensor(args[0]), **kwargs)
+            return func(
+                self.fake_mode.from_tensor(args[0], static_shapes=True), **kwargs
+            )
         elif func == torch.Tensor.__deepcopy__:
             assert len(args) == 2 and len(kwargs) == 0
             tensor, memo = args
@@ -1089,7 +1091,7 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
             if id(tensor) in memo:
                 return memo[id(tensor)]
 
-            out = self.fake_mode.from_tensor(tensor)
+            out = self.fake_mode.from_tensor(tensor, static_shapes=True)
             memo[id(tensor)] = out
             return out
         else:
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 9b55af3c555c4..ae4427e2320e9 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -456,7 +456,6 @@ def create_symbolic_sizes_strides(self, ex: torch.Tensor):
         We try our best to express stride in terms of the sizes, so as to not
         introduce new symbolic variables.
         """
-
         size = [self.create_symbol(i) for i in ex.size()]
         stride: List[Optional[sympy.Expr]] = [None] * len(size)
         for i, val in enumerate(ex.stride()):

From 6eedff9e8cd3b00c61b2a140836b650f55758090 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Sun, 13 Nov 2022 08:19:45 +0000
Subject: [PATCH 0841/1922] [reland][dynamo] Better support for nn.Module
 (#88959)

Relanding https://github.com/pytorch/pytorch/pull/88629

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88959
Approved by: https://github.com/msaroufim
---
 test/dynamo/test_modules.py  | 127 +++++++++++++++++++++++++++++++++++
 torch/_dynamo/__init__.py    |   2 +
 torch/_dynamo/debug_utils.py |   8 +++
 torch/_dynamo/eval_frame.py  |  74 ++++++++++++++------
 torch/_dynamo/testing.py     |  14 ++++
 5 files changed, 205 insertions(+), 20 deletions(-)

diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index 2fb83b3add6cf..930035f99a30c 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -904,6 +904,133 @@ def forward(self, x):
         self.assertTrue(torch._dynamo.testing.same(real, graph(rx)))
 
 
+class MockModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.relu = torch.nn.ReLU()
+        self.linear = torch.nn.Linear(10, 10)
+        self.register_buffer("buf0", torch.randn(10, 10))
+
+    def forward(self, x):
+        return self.relu(self.linear(x) + self.buf0)
+
+
+class OptimizedModuleTest(torch._dynamo.test_case.TestCase):
+    def test_nn_module(self):
+        mod = MockModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt)(mod)
+        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
+
+        x = torch.randn(10, 10)
+        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
+        self.assertEqual(cnt.frame_count, 1)
+
+    def test_to(self):
+        mod = MockModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt)(mod)
+        x = torch.randn(10, 10)
+        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
+        self.assertEqual(cnt.frame_count, 1)
+
+        # Ensure that there is no recompilation
+        opt_mod(x)
+        self.assertEqual(cnt.frame_count, 1)
+
+        opt_mod = opt_mod.to(device="cpu").to(dtype=torch.float64)
+        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
+        x = torch.randn(10, 10).to(dtype=torch.float64)
+        opt_mod(x)
+        # Ensure that there is a recompilation
+        self.assertEqual(cnt.frame_count, 2)
+
+    def test_attr(self):
+        class MockModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(10, 10)
+                self.register_buffer("buf0", torch.randn(10, 10))
+
+            def forward(self, x):
+                return self.r(torch.sin(x)) + self.buf0
+
+        mod = MockModule()
+        opt_mod = torch._dynamo.optimize("eager")(mod)
+
+        # Check parameteres and buffers
+        for (p1, p2) in zip(mod.parameters(), opt_mod.parameters()):
+            self.assertTrue(id(p1) == id(p2))
+
+    def test_recursion(self):
+        mod = MockModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt)(mod)
+
+        for _ in range(5):
+            opt_mod = torch._dynamo.optimize(cnt)(opt_mod)
+        opt_mod(torch.randn(10, 10))
+        self.assertEqual(cnt.frame_count, 1)
+
+    def test_composition(self):
+        class InnerModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(torch.sin(x))
+
+        opt_inner_mod = InnerModule()
+
+        class OuterModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = opt_inner_mod
+
+            def forward(self, x):
+                return self.mod(torch.cos(x))
+
+        outer_mod = OuterModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
+
+        x = torch.randn(4)
+        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
+        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
+        self.assertEqual(cnt.frame_count, 1)
+
+    def test_composition_with_opt_mod(self):
+        class InnerModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(torch.sin(x))
+
+        inner_mod = InnerModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_inner_mod = torch._dynamo.optimize(cnt)(inner_mod)
+
+        class OuterModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = opt_inner_mod
+
+            def forward(self, x):
+                return self.mod(torch.cos(x))
+
+        outer_mod = OuterModule()
+        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
+
+        x = torch.randn(4)
+        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
+        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
+        # There will be a graph break for the inner mod being OptimizedModule
+        self.assertEqual(cnt.frame_count, 2)
+
+
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py
index 80f927aeef2fa..5eee609b0852a 100644
--- a/torch/_dynamo/__init__.py
+++ b/torch/_dynamo/__init__.py
@@ -7,6 +7,7 @@
     export,
     optimize,
     optimize_assert,
+    OptimizedModule,
     reset_code,
     run,
     skip,
@@ -25,6 +26,7 @@
     "reset",
     "list_backends",
     "skip",
+    "OptimizedModule",
 ]
 
 
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index f09991f9bf348..089ef172d625d 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -486,8 +486,16 @@ def same_two_models(gm, opt_gm, example_inputs, only_fwd=False):
     """
     Check two models have same accuracy.
     """
+    from .eval_frame import OptimizedModule
+    from .testing import named_parameters_for_optimized_module
     from .utils import same
 
+    if isinstance(gm, OptimizedModule):
+        gm.named_parameters = named_parameters_for_optimized_module(gm)
+
+    if isinstance(opt_gm, OptimizedModule):
+        opt_gm.named_parameters = named_parameters_for_optimized_module(opt_gm)
+
     ref = run_fwd_maybe_bwd(gm, example_inputs, only_fwd)
 
     try:
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 8d9e3b7b6aa14..20e8c7de085e0 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -5,6 +5,7 @@
 import logging
 import os
 import sys
+import textwrap
 import threading
 import traceback
 import types
@@ -44,6 +45,27 @@
 most_recent_backend = None
 
 
+class OptimizedModule(torch.nn.Module):
+    """
+    Wraps the original nn.Module object and later patches its
+    forward method to optimized self.forward method.
+    """
+
+    def __init__(self, mod):
+        super().__init__()
+        # Installs the params/buffer
+        self._orig_mod = mod
+
+    def __getattr__(self, name):
+        if name == "_orig_mod":
+            return self._modules["_orig_mod"]
+        return getattr(self._orig_mod, name)
+
+    def forward(self, *args, **kwargs):
+        # This will be monkey patched later
+        raise RuntimeError("Should not be here")
+
+
 def remove_from_cache(f):
     """
     Make sure f.__code__ is not cached to force a recompile
@@ -118,31 +140,15 @@ def __call__(self, fn):
         # Optimize the forward method of torch.nn.Module object
         if isinstance(fn, torch.nn.Module):
             mod = fn
-            optimized_forward = self(mod.forward)
-
-            class TorchDynamoNNModuleWrapper:
-                """
-                A wrapper that redirects the forward call to the optimized
-                forward, while for rest it redirects the calls to the original
-                module.
-                """
-
-                def __getattr__(self, name):
-                    return getattr(mod, name)
-
-                def forward(self, *args, **kwargs):
-                    return optimized_forward(*args, **kwargs)
-
-                def __call__(self, *args, **kwargs):
-                    return self.forward(*args, **kwargs)
-
-            new_mod = TorchDynamoNNModuleWrapper()
+            new_mod = OptimizedModule(mod)
+            new_mod.forward = self(mod.forward)
             # Save the function pointer to find the original callable while nesting
             # of decorators.
-            new_mod._torchdynamo_orig_callable = mod
+            new_mod._torchdynamo_orig_callable = mod.forward
             return new_mod
 
         assert callable(fn)
+
         callback = self.callback
         on_enter = self.on_enter
         backend_ctx_ctor = self.extra_ctx_ctor
@@ -184,6 +190,34 @@ def _fn(*args, **kwargs):
         # If the function is called using torch._dynamo.optimize decorator, we
         # should prevent any type of skipping.
         if callback not in (None, False):
+            if not hasattr(fn, "__code__"):
+                raise RuntimeError(
+                    textwrap.dedent(
+                        """
+
+                        torch._dynamo.optimize is called on a non function object.
+                        If this is a callable class, please optimize the individual methods that you are interested in optimizing.
+
+                        >> class CallableClass:
+                        >>     def __init__(self):
+                        >>         super().__init__()
+                        >>         self.relu = torch.nn.ReLU()
+                        >>
+                        >>     def __call__(self, x):
+                        >>         return self.relu(torch.sin(x))
+                        >>
+                        >>     def print_hello(self):
+                        >>         print("Hello world")
+                        >>
+                        >> mod = CallableClass()
+
+                        If you want to optimize the __call__ function
+
+                        >> mod.__call__ = torch._dynamo.optimize(mod.__call__)
+
+                        """
+                    )
+                )
             always_optimize_code_objects[fn.__code__] = True
 
         return _fn
diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py
index d6082ce48acf8..6e0d32d21f978 100644
--- a/torch/_dynamo/testing.py
+++ b/torch/_dynamo/testing.py
@@ -32,6 +32,18 @@ def clone_me(x):
     return x.detach().clone().requires_grad_(x.requires_grad)
 
 
+def named_parameters_for_optimized_module(mod):
+    assert isinstance(mod, eval_frame.OptimizedModule)
+    return mod._orig_mod.named_parameters
+
+
+def remove_optimized_module_prefix(name):
+    prefix = "_orig_mod."
+    assert name.startswith(prefix)
+    name = name[len(prefix) :]
+    return torch.distributed.fsdp._common_utils.clean_tensor_name(name)
+
+
 def collect_results(model, prediction, loss, example_inputs):
     results = []
     results.append(prediction)
@@ -44,6 +56,8 @@ def collect_results(model, prediction, loss, example_inputs):
     grads = dict()
     params = dict()
     for name, param in model.named_parameters():
+        if isinstance(model, eval_frame.OptimizedModule):
+            name = remove_optimized_module_prefix(name)
         param_copy = param
         grad = param.grad
         # Treat None and zero grad as same

From 9b1bfc641450c2c9d10acc3a4499f8b424307f2b Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Sun, 13 Nov 2022 09:53:38 +0000
Subject: [PATCH 0842/1922] [dynamo] Skip frame when graph break in a loop
 (#88857)

This fixes excessing recompilation issue in tacotron2 but has few caveats - https://github.com/pytorch/torchdynamo/issues/330

For tacotron2, the repro is something like this

~~~
        def inner(x):
            return torch.sin(x)

        def fn(x):
            for _ in range(100):
                inner(x)
                torch._dynamo.graph_break()
            return x
~~~

The problem here is that Dynamo has guards on the TUPLE_ITERATOR_LEN whenever a graph break happens. Therefore, we keep on recompiling.

This PR checks if there is a backedge (helps with while loop) in presence of a graph break. If there is, Dynamo skips processing this frame. Therefore, Dynamo gets called when inner is called, and we compile only once.

Note that, if there was no graph break, we will unroll the original loop, and see one graph with 100 sin operations (just as before, so no changes there).

The caveat is - We are skipping the frame, so if we have something like this

~~~
        def fn(x):
            for _ in range(100):
                # 1000s of lines of PyTorch code
                torch._dynamo.graph_break()
            return x
~~~

Dynamo will skip processing this frame, and might miss on the optimization.

Completely open for suggestions. Happy to re-implement if there is a better way to handle this.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88857
Approved by: https://github.com/jansel, https://github.com/yanboliang
---
 test/dynamo/test_optimizers.py    |  3 +-
 test/dynamo/test_repros.py        | 55 +++++++++++++++++++++++++++++++
 torch/_dynamo/symbolic_convert.py | 37 +++++++++++++++++----
 3 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py
index 92b163b76d6dc..2f204a7a11999 100644
--- a/test/dynamo/test_optimizers.py
+++ b/test/dynamo/test_optimizers.py
@@ -1,7 +1,6 @@
 # Owner(s): ["module: dynamo"]
 
 import inspect
-import sys
 import unittest
 
 import torch
@@ -126,7 +125,7 @@ def training_iter_fn(batch, model, optimizer):
         batch = {"x": input1, "y": input2}
         for _ in range(2):
             opt_training_iter_fn(batch, net, optimizer)
-        self.assertEqual(cnts.frame_count, (2 if sys.version_info < (3, 8) else 6))
+        self.assertEqual(cnts.frame_count, 2)
 
 
 if __name__ == "__main__":
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 6a1c654a4873f..aa30affd5144f 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1838,6 +1838,61 @@ def forward(self, inp):
         self.assertEqual(cnt.op_count, 5)
         self.assertEqual(cnt.frame_count, 1)
 
+    def test_for_loop_graph_break(self):
+        def inner(x):
+            return torch.sin(x)
+
+        def fn(x):
+            for _ in range(100):
+                inner(x)
+                torch._dynamo.graph_break()
+            return x
+
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnt)(fn)
+        x = torch.randn(4)
+        opt_fn(x)
+        self.assertEqual(cnt.frame_count, 1)
+        self.assertEqual(cnt.op_count, 1)
+
+    def test_for_loop_graph_break_before(self):
+        # Checks that the backedge is calculated correctly
+        def inner(x):
+            return torch.sin(x)
+
+        def fn(x):
+            torch._dynamo.graph_break()
+            for _ in range(100):
+                inner(x)
+            return x
+
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnt)(fn)
+        x = torch.randn(4)
+        opt_fn(x)
+        self.assertEqual(cnt.frame_count, 1)
+        self.assertEqual(cnt.op_count, 100)
+
+    def test_while_loop_graph_break(self):
+        # Repro of tacotron2 cache_size_recompilation
+        def inner(x):
+            return torch.sin(x)
+
+        def fn(x):
+            i = 20
+            while i > 10:
+                x = inner(x)
+                i -= 1
+                torch._dynamo.graph_break()
+            return x
+
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnt)(fn)
+        x = torch.randn(4)
+        opt_fn(x)
+        self.assertEqual(cnt.frame_count, 1)
+        self.assertEqual(cnt.op_count, 1)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 88e0df5470bc8..d707bee930ee8 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -133,6 +133,13 @@ def inner(self: "InstructionTranslatorBase", inst: Instruction):
             isinstance(value, (TensorVariable)) and self.should_compile_partial_graph()
         ):
             # compile a partial subgraph prefix then jump into user code
+            if self.has_backedge():
+                msg = (
+                    "Skipping frame because there is a graph break in a for/while loop"
+                )
+                log.debug(msg)
+                raise exc.SkipFrame(msg)
+
             self.push(value)
             self.output.compile_subgraph(
                 self,
@@ -179,10 +186,15 @@ def wrapper(self: "InstructionTranslatorBase", inst: Instruction):
             reason = None
             try:
                 return inner_fn(self, inst)
-            except Unsupported as exc:
+            except Unsupported as excp:
+                if self.has_backedge():
+                    msg = "Skipping frame because there is a graph break in a for/while loop"
+                    log.debug(msg)
+                    raise exc.SkipFrame(msg)
+
                 if not self.should_compile_partial_graph():
                     raise
-                user_stack = [self.frame_summary()] + list(reversed(exc.real_stack))
+                user_stack = [self.frame_summary()] + list(reversed(excp.real_stack))
                 user_stack_formatted = "".join(traceback.format_list(user_stack))
                 frame_loc = (user_stack[-1].filename, user_stack[-1].lineno)
                 # torch._dynamo.explain() formats this a little nicer, and presents a slightly
@@ -193,12 +205,12 @@ def wrapper(self: "InstructionTranslatorBase", inst: Instruction):
                     and graph_break_dup_warning_checker.add(frame_loc)
                 ):
                     log.warning(
-                        f"Graph break: {exc} from user code at {user_stack_formatted}"
+                        f"Graph break: {excp} from user code at {user_stack_formatted}"
                     )
 
-                exc.remove_from_stats()
-                exc.add_to_stats("graph_break")
-                reason = GraphCompileReason(exc.msg, user_stack)
+                excp.remove_from_stats()
+                excp.add_to_stats("graph_break")
+                reason = GraphCompileReason(excp.msg, user_stack)
             self.restore_graphstate(state)
             self.output.compile_subgraph(self, reason=reason)
             self.popn(push - dis.stack_effect(inst.opcode, inst.arg))
@@ -237,6 +249,19 @@ def wrapper(self: "InstructionTranslatorBase", inst: Instruction):
 
 
 class InstructionTranslatorBase(object):
+    def has_backedge(self):
+        cur_offset = self.current_instruction.offset
+        for inst in self.instructions[self.instruction_pointer :]:
+            if inst.opname in (
+                "JUMP_ABSOLUTE",
+                "POP_JUMP_IF_TRUE",
+                "POP_JUMP_IF_FALSE",
+            ):
+                jump_offset = inst.argval
+                if jump_offset < cur_offset:
+                    return True
+        return False
+
     def cell_and_freevars(self):
         if not hasattr(self, "_cell_and_freevars"):
             self._cell_and_freevars = tuple(

From 2d913f5ec03bf767c72cb9a4ba8778181c78148c Mon Sep 17 00:00:00 2001
From: Chen Lai <chenlai@fb.com>
Date: Sat, 12 Nov 2022 21:41:31 -0800
Subject: [PATCH 0843/1922] Move xnnpack taget to fb code base (#88909)

1. Move the source file list to the `build_variables.bzl`, as it's the source of truth for both internal buck build and oss build
2. Move target definitions to `fb` internal folder
3. Some changes are triggered from auto format.

Differential Revision: [D40906961](https://our.internmc.facebook.com/intern/diff/D40906961/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D40906961/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88909
Approved by: https://github.com/mcr229
---
 build_variables.bzl | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/build_variables.bzl b/build_variables.bzl
index e476341b9ac0e..473ed1c1de1b1 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -1489,3 +1489,33 @@ aten_cuda_with_sort_by_key_source_list = [
 aten_cuda_cu_with_sort_by_key_source_list = [
     "aten/src/ATen/native/cuda/Unique.cu",
 ]
+
+# Followings are source code for xnnpack delegate
+
+xnnpack_delegate_serializer_header = [
+    "torch/csrc/jit/backends/xnnpack/serialization/serializer.h",
+]
+
+xnnpack_delegate_serializer_source_list = [
+    "torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp",
+]
+
+xnnpack_delegate_core_source_list = [
+    "torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp",
+]
+
+xnnpack_delegate_core_header = [
+    "torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h",
+    "torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h",
+]
+
+xnnpack_backend_header = [
+    "torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.h",
+] + xnnpack_delegate_core_header
+
+xnnpack_backend_source_list = [
+    "torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp",
+    "torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp",
+    "torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp",
+    "torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp",
+] + xnnpack_delegate_core_source_list

From 31e0497b77a26a59948135aa4c4d2b6f046771b2 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Sat, 12 Nov 2022 19:26:28 +0000
Subject: [PATCH 0844/1922] [Dynamo][FSDP] Migrate to `ModuleWrapPolicy`
 (#88453)

Hello @wconstab! As you saw, `transformer_auto_wrap_policy()` is a misnomer and actually works for any module classes. The PR before this one tries to add a class `ModuleWrapPolicy` that takes in the `module_classes` in its constructor and works just like `transformer_auto_wrap_policy()` without requiring the `functools.partial()`. I hope you do not mind if we update the dynamo benchmarks util file with this migration.

The PR before this one might require some back and forth within FSDP devs, so I apologize for any consequent updates to this PR, which in itself is an easy change. I will request review once we know the previous PR is good for land.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88453
Approved by: https://github.com/wconstab
---
 benchmarks/dynamo/dist_util.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/benchmarks/dynamo/dist_util.py b/benchmarks/dynamo/dist_util.py
index d30b5a63cfe5f..d0267cbca3073 100644
--- a/benchmarks/dynamo/dist_util.py
+++ b/benchmarks/dynamo/dist_util.py
@@ -13,7 +13,7 @@
     CheckpointImpl,
 )
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 
 try:
     from .torchbench import setup_torchbench_cwd
@@ -138,10 +138,7 @@ def apply_fsdp(args, model, use_checkpointing=False, use_wrap_policy=True):
         "toy_model" if model.__class__ is ToyModel else args.torchbench_model
     ]
     if use_wrap_policy:
-        # transformer policy is really a generic policy that wraps modules of specified classes
-        wrap_policy = functools.partial(
-            transformer_auto_wrap_policy, transformer_layer_cls=blocks
-        )
+        wrap_policy = ModuleWrapPolicy(blocks)
 
     model = FSDP(model, auto_wrap_policy=wrap_policy, use_orig_params=True)
     if use_checkpointing:

From 144b2c96a209d7b9690e2a46c9df2c3996c42dd2 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Sun, 13 Nov 2022 16:20:45 +0000
Subject: [PATCH 0845/1922] [reland][dynamo] fixes dict changed during runtime
 error (#88877)

Reland https://github.com/pytorch/pytorch/pull/87526

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88877
Approved by: https://github.com/ezyang
---
 test/dynamo/test_aot_cudagraphs.py |  3 ---
 test/dynamo/test_repros.py         | 30 ++++++++++++++++++++++++++++++
 torch/_dynamo/convert_frame.py     | 10 ++++++----
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_aot_cudagraphs.py
index cb1d2a0e601ff..fdb7c88762b8b 100644
--- a/test/dynamo/test_aot_cudagraphs.py
+++ b/test/dynamo/test_aot_cudagraphs.py
@@ -71,7 +71,6 @@ def fn(x, y):
         y = torch.randn(3, device="cuda")
         fn(x, y)
 
-    @patch("torch._dynamo.config.suppress_errors", True)
     @patch_all()
     def test_dtoh(self):
         def model(x, y):
@@ -105,7 +104,6 @@ def fn(x, y):
         y = torch.randn((), device="cpu")
         fn(x, y)
 
-    @patch("torch._dynamo.config.suppress_errors", True)
     @patch("functorch._src.config.use_functionalize", True)
     @patch_all(ok=False)  # input mutation not supported yet
     def test_mutate_input(self):
@@ -145,7 +143,6 @@ def fn(x, y):
         y = torch.randn(1, device="cuda")
         fn(x, y)
 
-    @patch("torch._dynamo.config.suppress_errors", True)
     @patch_all()
     def test_factory(self):
         def model(y):
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index aa30affd5144f..fd0fcf9e08bc2 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1818,6 +1818,36 @@ def fn(x):
         res = opt_fn(a)
         self.assertTrue(same(ref, res))
 
+    def test_tokenization(self):
+        from collections import UserDict
+
+        class BatchEncoding(UserDict):
+            """
+            Copied from tokenization
+            """
+
+            def __init__(
+                self,
+                data,
+            ):
+                super().__init__(data)
+
+            def __getattr__(self, item: str):
+                try:
+                    return self.data[item]
+                except KeyError:
+                    raise AttributeError
+
+        def tokenization(x):
+            encoding = BatchEncoding({"key": x})
+            return encoding["key"]
+
+        opt_fn = torch._dynamo.optimize("eager")(tokenization)
+        x = torch.rand((1, 4))
+        ref = tokenization(x)
+        res = opt_fn(x)
+        self.assertTrue(same(ref, res))
+
     def test_modules(self):
         class Foo(torch.nn.Module):
             def __init__(self):
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index f1ce83727a19f..c612fe3c167d4 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -156,7 +156,11 @@ def has_tensor(obj):
             seen_ids[obj_id] = any([has_tensor(v) for v in obj])
             return seen_ids[obj_id]
         elif istype(obj, dict):
-            seen_ids[obj_id] = any([has_tensor(v) for v in obj.values()])
+            # Some packages like pytest can be updated during runtime. So, make a
+            # copy of values to avoid issues like "RuntimeError: dictionary
+            # changed size during iteration"
+            values = list(obj.values())
+            seen_ids[obj_id] = any([has_tensor(v) for v in values])
             return seen_ids[obj_id]
         elif istype(obj, (str, int, float, type(None), bool)):
             seen_ids[obj_id] = False
@@ -164,9 +168,6 @@ def has_tensor(obj):
         elif is_namedtuple(obj):
             seen_ids[obj_id] = any([has_tensor(getattr(obj, v)) for v in obj._fields])
             return seen_ids[obj_id]
-        elif not is_allowed(obj) and hasattr(obj, "__dict__") and len(obj.__dict__):
-            seen_ids[obj_id] = any([has_tensor(v) for v in obj.__dict__.values()])
-            return seen_ids[obj_id]
         else:
             # if config.debug:
             #     print(
@@ -302,6 +303,7 @@ def _convert_frame_assert(frame: types.FrameType, cache_size: int):
             # setattr could be tricky to handle generally,
             # but also not likely useful to compile- skip the whole frame
             return None
+
         # Check if the frame is generated by an exec builtin call
         # TODO - Running exec generated frame seems propagates f_globals to the
         # next frames.

From efb915bbd4abed8bb9a9d0dd26222809d3282293 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sun, 13 Nov 2022 16:21:12 +0000
Subject: [PATCH 0846/1922] Revert "[reland][dynamo] Better support for
 nn.Module (#88959)"

This reverts commit e950afc3958c9bae5d61cbc99bc088309141df6d.

Reverted https://github.com/pytorch/pytorch/pull/88959 on behalf of https://github.com/malfet due to Broke `test_accuracy_issue1`
---
 test/dynamo/test_modules.py  | 127 -----------------------------------
 torch/_dynamo/__init__.py    |   2 -
 torch/_dynamo/debug_utils.py |   8 ---
 torch/_dynamo/eval_frame.py  |  74 ++++++--------------
 torch/_dynamo/testing.py     |  14 ----
 5 files changed, 20 insertions(+), 205 deletions(-)

diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index 930035f99a30c..2fb83b3add6cf 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -904,133 +904,6 @@ def forward(self, x):
         self.assertTrue(torch._dynamo.testing.same(real, graph(rx)))
 
 
-class MockModule(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.relu = torch.nn.ReLU()
-        self.linear = torch.nn.Linear(10, 10)
-        self.register_buffer("buf0", torch.randn(10, 10))
-
-    def forward(self, x):
-        return self.relu(self.linear(x) + self.buf0)
-
-
-class OptimizedModuleTest(torch._dynamo.test_case.TestCase):
-    def test_nn_module(self):
-        mod = MockModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_mod = torch._dynamo.optimize(cnt)(mod)
-        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
-
-        x = torch.randn(10, 10)
-        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
-        self.assertEqual(cnt.frame_count, 1)
-
-    def test_to(self):
-        mod = MockModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_mod = torch._dynamo.optimize(cnt)(mod)
-        x = torch.randn(10, 10)
-        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
-        self.assertEqual(cnt.frame_count, 1)
-
-        # Ensure that there is no recompilation
-        opt_mod(x)
-        self.assertEqual(cnt.frame_count, 1)
-
-        opt_mod = opt_mod.to(device="cpu").to(dtype=torch.float64)
-        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
-        x = torch.randn(10, 10).to(dtype=torch.float64)
-        opt_mod(x)
-        # Ensure that there is a recompilation
-        self.assertEqual(cnt.frame_count, 2)
-
-    def test_attr(self):
-        class MockModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.linear = torch.nn.Linear(10, 10)
-                self.register_buffer("buf0", torch.randn(10, 10))
-
-            def forward(self, x):
-                return self.r(torch.sin(x)) + self.buf0
-
-        mod = MockModule()
-        opt_mod = torch._dynamo.optimize("eager")(mod)
-
-        # Check parameteres and buffers
-        for (p1, p2) in zip(mod.parameters(), opt_mod.parameters()):
-            self.assertTrue(id(p1) == id(p2))
-
-    def test_recursion(self):
-        mod = MockModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_mod = torch._dynamo.optimize(cnt)(mod)
-
-        for _ in range(5):
-            opt_mod = torch._dynamo.optimize(cnt)(opt_mod)
-        opt_mod(torch.randn(10, 10))
-        self.assertEqual(cnt.frame_count, 1)
-
-    def test_composition(self):
-        class InnerModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.relu = torch.nn.ReLU()
-
-            def forward(self, x):
-                return self.relu(torch.sin(x))
-
-        opt_inner_mod = InnerModule()
-
-        class OuterModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.mod = opt_inner_mod
-
-            def forward(self, x):
-                return self.mod(torch.cos(x))
-
-        outer_mod = OuterModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
-
-        x = torch.randn(4)
-        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
-        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
-        self.assertEqual(cnt.frame_count, 1)
-
-    def test_composition_with_opt_mod(self):
-        class InnerModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.relu = torch.nn.ReLU()
-
-            def forward(self, x):
-                return self.relu(torch.sin(x))
-
-        inner_mod = InnerModule()
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_inner_mod = torch._dynamo.optimize(cnt)(inner_mod)
-
-        class OuterModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.mod = opt_inner_mod
-
-            def forward(self, x):
-                return self.mod(torch.cos(x))
-
-        outer_mod = OuterModule()
-        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
-
-        x = torch.randn(4)
-        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
-        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
-        # There will be a graph break for the inner mod being OptimizedModule
-        self.assertEqual(cnt.frame_count, 2)
-
-
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py
index 5eee609b0852a..80f927aeef2fa 100644
--- a/torch/_dynamo/__init__.py
+++ b/torch/_dynamo/__init__.py
@@ -7,7 +7,6 @@
     export,
     optimize,
     optimize_assert,
-    OptimizedModule,
     reset_code,
     run,
     skip,
@@ -26,7 +25,6 @@
     "reset",
     "list_backends",
     "skip",
-    "OptimizedModule",
 ]
 
 
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 089ef172d625d..f09991f9bf348 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -486,16 +486,8 @@ def same_two_models(gm, opt_gm, example_inputs, only_fwd=False):
     """
     Check two models have same accuracy.
     """
-    from .eval_frame import OptimizedModule
-    from .testing import named_parameters_for_optimized_module
     from .utils import same
 
-    if isinstance(gm, OptimizedModule):
-        gm.named_parameters = named_parameters_for_optimized_module(gm)
-
-    if isinstance(opt_gm, OptimizedModule):
-        opt_gm.named_parameters = named_parameters_for_optimized_module(opt_gm)
-
     ref = run_fwd_maybe_bwd(gm, example_inputs, only_fwd)
 
     try:
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 20e8c7de085e0..8d9e3b7b6aa14 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -5,7 +5,6 @@
 import logging
 import os
 import sys
-import textwrap
 import threading
 import traceback
 import types
@@ -45,27 +44,6 @@
 most_recent_backend = None
 
 
-class OptimizedModule(torch.nn.Module):
-    """
-    Wraps the original nn.Module object and later patches its
-    forward method to optimized self.forward method.
-    """
-
-    def __init__(self, mod):
-        super().__init__()
-        # Installs the params/buffer
-        self._orig_mod = mod
-
-    def __getattr__(self, name):
-        if name == "_orig_mod":
-            return self._modules["_orig_mod"]
-        return getattr(self._orig_mod, name)
-
-    def forward(self, *args, **kwargs):
-        # This will be monkey patched later
-        raise RuntimeError("Should not be here")
-
-
 def remove_from_cache(f):
     """
     Make sure f.__code__ is not cached to force a recompile
@@ -140,15 +118,31 @@ def __call__(self, fn):
         # Optimize the forward method of torch.nn.Module object
         if isinstance(fn, torch.nn.Module):
             mod = fn
-            new_mod = OptimizedModule(mod)
-            new_mod.forward = self(mod.forward)
+            optimized_forward = self(mod.forward)
+
+            class TorchDynamoNNModuleWrapper:
+                """
+                A wrapper that redirects the forward call to the optimized
+                forward, while for rest it redirects the calls to the original
+                module.
+                """
+
+                def __getattr__(self, name):
+                    return getattr(mod, name)
+
+                def forward(self, *args, **kwargs):
+                    return optimized_forward(*args, **kwargs)
+
+                def __call__(self, *args, **kwargs):
+                    return self.forward(*args, **kwargs)
+
+            new_mod = TorchDynamoNNModuleWrapper()
             # Save the function pointer to find the original callable while nesting
             # of decorators.
-            new_mod._torchdynamo_orig_callable = mod.forward
+            new_mod._torchdynamo_orig_callable = mod
             return new_mod
 
         assert callable(fn)
-
         callback = self.callback
         on_enter = self.on_enter
         backend_ctx_ctor = self.extra_ctx_ctor
@@ -190,34 +184,6 @@ def _fn(*args, **kwargs):
         # If the function is called using torch._dynamo.optimize decorator, we
         # should prevent any type of skipping.
         if callback not in (None, False):
-            if not hasattr(fn, "__code__"):
-                raise RuntimeError(
-                    textwrap.dedent(
-                        """
-
-                        torch._dynamo.optimize is called on a non function object.
-                        If this is a callable class, please optimize the individual methods that you are interested in optimizing.
-
-                        >> class CallableClass:
-                        >>     def __init__(self):
-                        >>         super().__init__()
-                        >>         self.relu = torch.nn.ReLU()
-                        >>
-                        >>     def __call__(self, x):
-                        >>         return self.relu(torch.sin(x))
-                        >>
-                        >>     def print_hello(self):
-                        >>         print("Hello world")
-                        >>
-                        >> mod = CallableClass()
-
-                        If you want to optimize the __call__ function
-
-                        >> mod.__call__ = torch._dynamo.optimize(mod.__call__)
-
-                        """
-                    )
-                )
             always_optimize_code_objects[fn.__code__] = True
 
         return _fn
diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py
index 6e0d32d21f978..d6082ce48acf8 100644
--- a/torch/_dynamo/testing.py
+++ b/torch/_dynamo/testing.py
@@ -32,18 +32,6 @@ def clone_me(x):
     return x.detach().clone().requires_grad_(x.requires_grad)
 
 
-def named_parameters_for_optimized_module(mod):
-    assert isinstance(mod, eval_frame.OptimizedModule)
-    return mod._orig_mod.named_parameters
-
-
-def remove_optimized_module_prefix(name):
-    prefix = "_orig_mod."
-    assert name.startswith(prefix)
-    name = name[len(prefix) :]
-    return torch.distributed.fsdp._common_utils.clean_tensor_name(name)
-
-
 def collect_results(model, prediction, loss, example_inputs):
     results = []
     results.append(prediction)
@@ -56,8 +44,6 @@ def collect_results(model, prediction, loss, example_inputs):
     grads = dict()
     params = dict()
     for name, param in model.named_parameters():
-        if isinstance(model, eval_frame.OptimizedModule):
-            name = remove_optimized_module_prefix(name)
         param_copy = param
         grad = param.grad
         # Treat None and zero grad as same

From 2a56cdc43f46a2afd02b446d99081c39f1408442 Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Sun, 13 Nov 2022 15:56:16 +0000
Subject: [PATCH 0847/1922] meta function for max_pool2d_with_indices_backward
 (#88743)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88743
Approved by: https://github.com/lezcano, https://github.com/ezyang
---
 test/functorch/test_aotdispatch.py            |  3 +
 test/inductor/test_torchinductor.py           |  3 +
 test/test_proxy_tensor.py                     |  2 +-
 torch/_meta_registrations.py                  | 56 ++++++++++++++++---
 .../_internal/common_methods_invocations.py   | 35 ++++++++++++
 5 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index f4782b8a595df..ea00842a4e005 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -973,6 +973,9 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cholesky'),
     xfail('linalg.cholesky'),
 
+    # Given input size: (s0xs1x2). Calculated output size: ...
+    skip('max_pool2d_with_indices_backward'),
+
     # Misc
     xfail('to_sparse'),
     xfail('corrcoef'),
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index aea8013bdfac8..d331559a3a8b1 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4787,6 +4787,9 @@ def forward(self, x):
                     for param in model_opt.parameters():
                         param.add_(1.0)
 
+        # Probably fails due to the symint math issue caught while adding
+        # max_pool2d_with_indices_backward
+        @unittest.skip("Accuracy failure, needs debugging")
         def test_accuracy_issue1(self):
             class Repro(torch.nn.Module):
                 def __init__(self):
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 86beb651cb2d1..42ecc3d376ab8 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -721,7 +721,6 @@ def deco(cls):
 @xfail_inherited_tests([
     "test_mode_tracing_factory_function",
     "test_make_fx_overloads",
-    "test_resnet18_backward_trace",
     "test_trace_subclasses",
 ])
 class TestGenericProxyTensorSymbolic(TestGenericProxyTensor):
@@ -1229,6 +1228,7 @@ def f(a, b, c, d, e):
     xfail('mode', ''),  # aten.mode.default - couldn't find symbolic meta function/decomposition
     xfail('nanquantile', ''),  # Could not run 'aten::equal' with arguments from the 'Meta' backend.
     xfail('narrow', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
+    xfail('max_pool2d_with_indices_backward', ''),  # (symint math failure) Given input size: (s0xs1x2). Calculated ...
     xfail('nn.functional.adaptive_max_pool1d', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.adaptive_max_pool2d', ''),  # aten.adaptive_max_pool2d.default - couldn't find symbolic meta funct...
     xfail('nn.functional.adaptive_max_pool3d', ''),  # argument 'output_size' (position 2) must be tupl...
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 5d583de67d196..be7370e344f04 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1349,9 +1349,8 @@ def pool2d_shape_check(
     )
 
 
-@register_meta(aten.max_pool2d_with_indices.default)
-def meta_max_pool2d_with_indices(
-    input, kernel_size, stride=(), padding=(0,), dilation=(1,), ceil_mode=False
+def max_pool2d_checks_and_compute_shape(
+    input, kernel_size, stride, padding, dilation, ceil_mode
 ):
     # Reference: aten/src/ATen/native/DilatedMaxPool2d.cpp
     def unpack(name, val):
@@ -1376,6 +1375,9 @@ def unpack(name, val):
 
     padH, padW = unpack("padding", padding)
     dilationH, dilationW = unpack("dilation", dilation)
+    nInputPlane = input.size(-3)
+    inputHeight = input.size(-2)
+    inputWidth = input.size(-1)
 
     memory_format = utils.suggest_memory_format(input)
     if memory_format == torch.channels_last:
@@ -1394,11 +1396,6 @@ def unpack(name, val):
             lambda: "Unsupport memory format. Supports only ChannelsLast, Contiguous",
         )
 
-    nbatch = input.size(-4) if input.dim() == 4 else 1
-    nInputPlane = input.size(-3)
-    inputHeight = input.size(-2)
-    inputWidth = input.size(-1)
-
     outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode)
     outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode)
 
@@ -1420,6 +1417,49 @@ def unpack(name, val):
         memory_format,
     )
 
+    return nInputPlane, outputHeight, outputWidth
+
+
+@register_meta(aten.max_pool2d_with_indices_backward.default)
+def meta_max_pool2d_with_indices_backward(
+    grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices
+):
+    nInputPlane, outputHeight, outputWidth = max_pool2d_checks_and_compute_shape(
+        self, kernel_size, stride, padding, dilation, ceil_mode
+    )
+
+    check(
+        self.dtype == grad_output.dtype,
+        lambda: "expected dtype {self.dtype} for `gradOutput` but got dtype {grad_output.dtype}",
+    )
+
+    nOutputPlane = nInputPlane
+    ndim = self.ndim
+
+    def _check_dim_size(t):
+        check_dim_size(t, ndim, ndim - 3, nOutputPlane)
+        check_dim_size(t, ndim, ndim - 2, outputHeight)
+        check_dim_size(t, ndim, ndim - 1, outputWidth)
+
+    _check_dim_size(grad_output)
+    _check_dim_size(indices)
+
+    memory_format = utils.suggest_memory_format(self)
+    return torch.empty(
+        self.shape, dtype=self.dtype, device=self.device, memory_format=memory_format
+    )
+
+
+@register_meta(aten.max_pool2d_with_indices.default)
+def meta_max_pool2d_with_indices(
+    input, kernel_size, stride=(), padding=(0,), dilation=(1,), ceil_mode=False
+):
+    nInputPlane, outputHeight, outputWidth = max_pool2d_checks_and_compute_shape(
+        input, kernel_size, stride, padding, dilation, ceil_mode
+    )
+
+    nbatch = input.size(-4) if input.dim() == 4 else 1
+    memory_format = utils.suggest_memory_format(input)
     if input.dim() == 3:
         size = [nInputPlane, outputHeight, outputWidth]
     else:
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 8ab1ea8a047cd..441bc7adcf83a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -2925,6 +2925,7 @@ def sample_inputs_max_pool(op_info, device, dtype, requires_grad, **kwargs):
         'nn.functional.max_pool1d': _TestParamsMaxPool1d,
         'nn.functional.max_pool2d': _TestParamsMaxPool2d,
         'nn.functional.max_pool3d': _TestParamsMaxPool3d,
+        'max_pool2d_with_indices_backward': _TestParamsMaxPool2d,
     }
 
     params_generator = params_generator_type_dict[op_info.name]()
@@ -2932,6 +2933,15 @@ def sample_inputs_max_pool(op_info, device, dtype, requires_grad, **kwargs):
         arg = make_arg(shape).to(memory_format=memory_format).requires_grad_(requires_grad)
         yield SampleInput(arg, kwargs=kwargs)
 
+def max_pool2d_backward(*args, kernel_size=(), stride=(), padding=(0,), dilation=(1,), ceil_mode=False, **kwargs):
+    out, indices = torch.nn.functional.max_pool2d_with_indices(
+        *args, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, ceil_mode=ceil_mode, return_indices=True)
+    grad_out = torch.ones_like(out)
+    if stride is None:
+        stride = kernel_size
+    out_b = torch.ops.aten.max_pool2d_with_indices_backward.default(
+        grad_out, *args, kernel_size, stride, padding, dilation, ceil_mode, indices)
+    return out_b
 
 def error_inputs_max_pool1d(op_info, device, **kwargs):
     # Toggle requires_grad because `max_pool1d` has different path
@@ -11567,6 +11577,31 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
            error_inputs_func=error_inputs_max_pool2d,
            sample_inputs_func=sample_inputs_max_pool),
+    OpInfo('max_pool2d_with_indices_backward',
+           op=max_pool2d_backward,
+           # We've defined a custom op, so there's no corresponding aten op
+           aten_name=None,
+           method_variant=None,
+           inplace_variant=None,
+           operator_variant=None,
+           inplace_operator_variant=None,
+           check_batched_gradgrad=False,
+           supports_forward_ad=True,
+           supports_fwgrad_bwgrad=True,
+           check_batched_forward_grad=False,
+           assert_jit_shape_analysis=False,
+           dtypes=floating_types_and(torch.bfloat16),
+           dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
+           sample_inputs_func=sample_inputs_max_pool,
+           skips=(
+               # We've defined a custom op here, and we don't handle the case where we receive an out kwarg
+               DecorateInfo(unittest.skip("Skipped!"), "TestCommon", "test_out"),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
+               # FX failed to normalize op - add the op to the op_skip list.
+               DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
+               # object has no attribute max_pool2d_with_indices_backward (It's not available on torch -- so expected)
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit')
+           )),
     OpInfo('nn.functional.max_pool3d',
            aten_name='max_pool3d',
            # Runs very slowly on slow gradcheck - alternatively reduce input sizes

From 88d9469b016dc9698a18ae54bb2de0e56f8736b2 Mon Sep 17 00:00:00 2001
From: Jason Ansel <jansel@meta.com>
Date: Sun, 13 Nov 2022 19:42:42 +0000
Subject: [PATCH 0848/1922] Skip dynamo benchmark tests under TSAN (#88895)

Summary: Fixes T137546804

Test Plan:
```
buck2 test mode/opt-tsan //caffe2/benchmarks/dynamo:test
buck2 test mode/opt //caffe2/benchmarks/dynamo:test
```

Differential Revision: D41226384

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88895
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/test.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/benchmarks/dynamo/test.py b/benchmarks/dynamo/test.py
index 317e8e4ea50e7..438218462030f 100644
--- a/benchmarks/dynamo/test.py
+++ b/benchmarks/dynamo/test.py
@@ -5,8 +5,17 @@
 
 from .torchbench import setup_torchbench_cwd, TorchBenchmarkRunner
 
+try:
+    # fbcode only
+    from aiplatform.utils.sanitizer_status import is_asan_or_tsan
+except ImportError:
+
+    def is_asan_or_tsan():
+        return False
+
 
 class TestDynamoBenchmark(unittest.TestCase):
+    @unittest.skipIf(is_asan_or_tsan(), "ASAN/TSAN not supported")
     def test_benchmark_infra_runs(self) -> None:
         """
         Basic smoke test that TorchBench runs.

From 9b7b4d580dae88b868af9da45316a15f1d31e261 Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Sat, 12 Nov 2022 20:06:12 +0100
Subject: [PATCH 0849/1922] [primTorch] Add ref for `complex` (#88562)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88562
Approved by: https://github.com/ezyang
---
 torch/_prims/context.py                       |  3 +-
 torch/_refs/__init__.py                       |  1 -
 torch/_refs/_conversions.py                   | 45 ++++++++++++++++++-
 .../_internal/common_methods_invocations.py   | 34 ++++++++++++++
 4 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/torch/_prims/context.py b/torch/_prims/context.py
index 203d73fd948eb..b9f6e634bb49b 100644
--- a/torch/_prims/context.py
+++ b/torch/_prims/context.py
@@ -68,7 +68,8 @@ def torch_to_refs_map():
 
     # Support conversions
     for s in torch._refs._conversions.__all__:
-        r[getattr(torch.Tensor, s)] = torch._refs._conversions.__dict__.get(s)
+        tensor_attr = getattr(torch.Tensor, s, None) or getattr(torch, s)
+        r[tensor_attr] = torch._refs._conversions.__dict__.get(s)
 
     return r
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 70edbff2237f2..a1de9a438d774 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -122,7 +122,6 @@
     "bitwise_right_shift",
     "bitwise_xor",
     "clamp_min",
-    # "complex",
     "copysign",
     "div",
     "eq",
diff --git a/torch/_refs/_conversions.py b/torch/_refs/_conversions.py
index 11657f7058bd7..abcd5729818d7 100644
--- a/torch/_refs/_conversions.py
+++ b/torch/_refs/_conversions.py
@@ -1,6 +1,12 @@
 import torch
+import torch._prims_common as utils
 
-from torch._prims_common import TensorLikeType
+# Utilities should come BEFORE this import
+from torch._decomp import register_decomposition
+
+from torch._prims_common import check, TensorLikeType
+from torch._prims_common.wrappers import out_wrapper
+from torch._refs import _broadcast_shapes
 
 # Data conversion references.
 #
@@ -10,6 +16,7 @@
 # (like int).
 
 __all__ = [
+    # dtypes
     "bfloat16",
     "bool",
     "byte",
@@ -23,6 +30,8 @@
     "int",
     "long",
     "short",
+    # misc
+    "complex",
 ]
 
 
@@ -61,3 +70,37 @@ def fn(
 long = _make_conversion_method("long", torch.long)
 
 short = _make_conversion_method("short", torch.short)
+
+
+@register_decomposition(torch.ops.aten.complex)
+# Note: complex has type promotion tests disabled due to different semantics.
+# exact_dtype is for compat with complex_check_dtype from core.
+@out_wrapper(exact_dtype=True)
+def complex(real: TensorLikeType, imag: TensorLikeType) -> TensorLikeType:
+    allowed_dtypes = (torch.float32, torch.float64, torch.float16)
+    check(
+        real.dtype in allowed_dtypes and imag.dtype in allowed_dtypes,
+        lambda: (
+            f"Expected both inputs to be Half, Float or Double tensors but got "
+            f"{real.dtype} and {imag.dtype}"
+        ),
+    )
+    check(
+        real.dtype == imag.dtype,
+        lambda: (
+            f"Expected object of scalar type {real.dtype} but got "
+            f"scalar type {imag.dtype} for second argument"
+        ),
+    )
+    result_dtype = utils.corresponding_complex_dtype(real.dtype)  # type: ignore[arg-type]
+    common_shape = _broadcast_shapes(real.shape, imag.shape)
+    result = real.new_empty(
+        common_shape,
+        dtype=result_dtype,
+        layout=real.layout,
+        device=real.device,
+        # pin_memory=real.is_pinned(),  # NYI
+    )
+    result.real = real
+    result.imag = imag
+    return result
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 441bc7adcf83a..62c9b4750ae93 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -5239,6 +5239,28 @@ def sample_inputs_view_as_real(op_info, device, dtype, requires_grad, **kwargs):
     sizes = ((S, S), ())
     return (SampleInput(make_arg(size)) for size in sizes)
 
+def error_inputs_complex(op_info, device, is_ref=False, **kwargs):
+    make_arg = partial(make_tensor, dtype=torch.float32, device=device)
+
+    if is_ref:
+        error_float = "Expected both inputs to be Half, Float or Double tensors but got torch.float32 and torch.int32"
+        error_dtype = "Expected object of scalar type torch.float32 but got scalar type torch.float64 for second argument"
+        error_out = "Expected out tensor to have dtype torch.complex128 but got torch.complex64 instead"
+    else:
+        error_float = "Expected both inputs to be Half, Float or Double tensors but got Float and Int"
+        error_dtype = "Expected object of scalar type Float but got scalar type Double for second argument"
+        error_out = "Expected object of scalar type ComplexDouble but got scalar type ComplexFloat for argument 'out'"
+
+    yield ErrorInput(SampleInput(make_arg(M, S), make_arg(M, S, dtype=torch.int)),
+                     error_type=RuntimeError, error_regex=error_float)
+
+    yield ErrorInput(SampleInput(make_arg(M, S), make_arg(M, S, dtype=torch.float64)),
+                     error_type=RuntimeError, error_regex=error_dtype)
+
+    yield ErrorInput(SampleInput(make_arg(M, S, dtype=torch.float64), make_arg(M, S, dtype=torch.float64),
+                                 out=make_arg(M, S, dtype=torch.complex64)),
+                     error_type=RuntimeError, error_regex=error_out)
+
 def sample_inputs_prod(op_info, device, dtype, requires_grad, **kwargs):
     def make_arg(shape):
         # shrink values to be in the interval [-1, +1] for better precision in gradgradcheck
@@ -9097,6 +9119,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                     supports_forward_ad=True,
                     supports_fwgrad_bwgrad=True,
                     supports_rhs_python_scalar=False,
+                    error_inputs_func=error_inputs_complex,
                     skips=(
                         # Test doesn't account for complex's type promotion semantics
                         DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion'),
@@ -17933,6 +17956,17 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.skip('Overflow when downcasting signed type is undefined'), 'TestCommon', 'test_compare_cpu'),
         )
     ),
+    ElementwiseBinaryPythonRefInfo(
+        "_refs._conversions.complex",
+        torch_opinfo_name="complex",
+        error_inputs_func=partial(error_inputs_complex, is_ref=True),
+        # prims.empty_strided.default does not support nvfuser
+        supports_nvfuser=False,
+        skips=(
+            # Test doesn't account for complex's type promotion semantics
+            DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion'),
+        )
+    ),
     ElementwiseUnaryPythonRefInfo(
         "_refs._conversions.double",
         torch_opinfo_name="double",

From e7fcab0587ac3380224eb67536355cbc6d683e18 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Sun, 13 Nov 2022 06:06:24 +0000
Subject: [PATCH 0850/1922] Symintify decomps for split and upsample_bilinear;
 Fix decomp for _softmax_backward_data and native_dropout_backward (#88761)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88761
Approved by: https://github.com/ezyang
---
 test/functorch/test_aotdispatch.py            |  4 -
 test/functorch/test_ops.py                    |  4 +
 test/functorch/test_vmap.py                   |  3 +
 test/inductor/test_torchinductor_opinfo.py    |  1 +
 test/test_decomp.py                           |  3 +
 test/test_proxy_tensor.py                     | 22 +++--
 torch/_decomp/decompositions.py               | 98 +++++++++++++++----
 .../_internal/common_methods_invocations.py   | 94 ++++++++++++++----
 8 files changed, 177 insertions(+), 52 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index ea00842a4e005..e0ffcbe7d97d5 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1005,7 +1005,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cdist', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('cholesky_inverse', ''),  # could not find kernel
     xfail('cholesky_solve', ''),  # could not find kernel
-    xfail('chunk', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('column_stack', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('combinations', ''),  # aten.masked_select.default
     xfail('complex', ''),  # aten.view_as_real.default - couldn't find symbolic meta function/decomposition
@@ -1139,7 +1138,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.zeros_like.default - couldn't find symbolic meta...
     xfail('nn.functional.interpolate', 'area'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'bicubic'),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.interpolate', 'bilinear'),  # Cannot call sizes() on tensor with symbolic sizes/str...
     xfail('nn.functional.interpolate', 'linear'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'nearest'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'trilinear'),  # Cannot call sizes() on tensor with symbolic sizes/st...
@@ -1166,7 +1164,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.rrelu', ''),  # aten.rrelu_with_noise.default - couldn't find symbolic meta function...
     xfail('nn.functional.smooth_l1_loss', ''),  # could not find kernel
     xfail('nn.functional.unfold', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.upsample_bilinear', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.upsample_nearest', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', 'nuc'),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -1199,7 +1196,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('sgn', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('special.i1', ''),  # aten.i0.default - couldn't find symbolic meta function/decomposition
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic ...
-    xfail('split', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('std', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('std_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('stft', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 74085941c6c88..85ac70d74825e 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -1052,6 +1052,7 @@ def test():
         xfail('segment_reduce', 'lengths'),
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
+        xfail("native_dropout_backward"),
     }))
     def test_vmapvjp_has_batch_rule(self, device, dtype, op):
         if not op.supports_autograd:
@@ -1216,6 +1217,8 @@ def get_vjp(cotangents, *primals):
         xfail('segment_reduce', 'offsets'),  # NYI: forward-AD for segment_reduce
         xfail('index_reduce', ''),  # NYI: forward-AD for index_reduce
         xfail('segment_reduce', 'lengths'),  # NYI: forward-AD for segment_reduce
+        xfail('native_dropout_backward'),  # NYI
+
     }))
     @opsToleranceOverride('TestOperators', 'test_jvpvjp', (
         tol1('masked.prod',
@@ -1372,6 +1375,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         # input while the running_mean or running_var, which will be updated in
         # place, were not batched.
         xfail("native_batch_norm"),
+        xfail('native_dropout_backward',)
     }))
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 6d95077b627e2..9726b7feedb7f 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3238,6 +3238,7 @@ def test():
         xfail('broadcast_shapes', ''),  # test runner can't handle non-Tensor ops
         xfail('sparse.sampled_addmm'),  # sparse
         xfail('cross'),  # The default value of dim in op is *very* weird. No wonder it doesn't work
+        skip('_softmax_backward_data'),
         skip('linalg.eigh', ''),  # not unique, see test_linalg_eigh for manual test
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         # ----------------------------------------------------------------------
@@ -3379,6 +3380,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('bernoulli', ''),
         xfail('linalg.lu_factor', ''),
         xfail('nn.functional.feature_alpha_dropout', 'with_train'),
+        xfail('native_dropout_backward'),
         xfail('nn.functional.kl_div', ''),
         xfail('multinomial', ''),
         xfail('column_stack', ''),
@@ -3452,6 +3454,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('equal', ''),
         xfail('linalg.lu', ''),
         skip('linalg.ldl_solve', ''),
+        skip('_softmax_backward_data'),
     }))
     def test_op_has_batch_rule(self, device, dtype, op):
         # needs to be fixed
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 3d384efea0aec..89ea42c9fea7e 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -425,6 +425,7 @@ def wrapper_set_seed(op, *args, **kwargs):
     "randn": {"assert_equal": False},
     ("nn.functional.tanhshrink", "cuda", f16): {"atol": 3e-4, "rtol": 0.001},
     ("cummax", "cuda", f16): {"atol": 5e-4, "rtol": 0.002},
+    ("_softmax_backward_data", "cuda", f16): {"atol": 0.008, "rtol": 0.002},
     "gradient": {"check_gradient": False},  # segfault on check_gradient
     # Following tests failed, and causing subsequent tests failing with unrecoverable CUDA error
     "linalg.solve_triangular": {"check_gradient": False},
diff --git a/test/test_decomp.py b/test/test_decomp.py
index 67e99d5eb8291..a3658792c5e71 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -294,6 +294,9 @@ def normalize_op_input_output(f, sample, requires_grad=True):
     (None, None, "meshgrid"),
     # diag was not decomposed (it just registers a decomp for diag_out, torch.diag is CompImplicit)
     (None, None, "diag"),
+
+    # _softmax_backward_data's CPU kernel for bfloat16 always return the grad_input as float32
+    ("cpu", torch.bfloat16, "_softmax_backward_data"),
 }
 
 CROSS_REF_BACKWARD_EXCLUDE_SET = {
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 42ecc3d376ab8..894b35693430e 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1124,7 +1124,6 @@ def f(a, b, c, d, e):
     xfail('cartesian_prod', ''),  # Tensors of type TensorImpl do not have numel
     xfail('cdist', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('cholesky_solve', ''),  # Could not run 'aten::_cholesky_solve_helper' with arguments from the 'Meta' back...
-    xfail('chunk', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('column_stack', ''),  # Tensors of type TensorImpl do not have numel
     xfail('combinations', ''),
     xfail('count_nonzero', ''),  # Could not run 'aten::count_nonzero.dim_IntList' with arguments from the 'Meta' ba...
@@ -1247,7 +1246,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.empty_like.default - couldn't find symbolic meta function/deco...
     xfail('nn.functional.interpolate', 'area'),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.interpolate', 'bicubic'),  # aten.upsample_bicubic2d.vec - couldn't find symbolic meta function/d...
-    xfail('nn.functional.interpolate', 'bilinear'),  # aten.upsample_bilinear2d.vec - couldn't find symbolic meta function...
     xfail('nn.functional.interpolate', 'linear'),  # aten.upsample_linear1d.vec - couldn't find symbolic meta function/dec...
     xfail('nn.functional.interpolate', 'nearest'),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/d...
     xfail('nn.functional.interpolate', 'trilinear'),  # aten.upsample_trilinear3d.vec - couldn't find symbolic meta functi...
@@ -1267,7 +1265,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.unfold', ''),  # aten.im2col.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.upsample_bilinear', ''),  # aten.upsample_bilinear2d.vec - couldn't find symbolic meta function/de...
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
     xfail('nonzero', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
     xfail('norm', 'nuc'),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -1313,7 +1310,6 @@ def f(a, b, c, d, e):
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic meta function/...
     xfail('special.scaled_modified_bessel_k0', ''),  # aten.special_scaled_modified_bessel_k0.default - couldn't find symbo...
     xfail('special.scaled_modified_bessel_k1', ''),  # aten.special_scaled_modified_bessel_k1.default - couldn't find symbo...
-    xfail('split', ''),  # 'torch._C.SymIntNode' and 'int'
     xfail('stft', ''),  # argument 'size' must be tuple of ints, but found element of type torch._C.SymIntNode at...
     xfail('sum_to_size', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('svd', ''),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -1439,10 +1435,13 @@ def _fn(t, *args, **kwargs):
     return _fn
 
 def _test_make_fx_helper(self, device, dtype, op, tracing_mode, inplace=False):
-    def f(args, kwargs, extra_args):
+    def f(args, kwargs, extra_args, extra_kwargs):
         if extra_args:
             for i, t in extra_args:
                 args[i] = t.size()
+        if extra_kwargs:
+            for k, t in extra_kwargs.items():
+                kwargs[k] = t.size()
 
         fn = _get_safe_inplace(op.get_inplace()) if inplace else op.op
         return fn(*args, **kwargs)
@@ -1463,23 +1462,26 @@ def f(args, kwargs, extra_args):
         # - Unpack the size in the wrapper to get a torch.Size with dynamic shapes (in
         #   symbolic mode, a no-op otherwise)
         extra_args = []
+        extra_kwargs = {}
         for i, arg in enumerate(args):
             if isinstance(arg, torch.Size):
-                extra_args.append((i, torch.empty((), device="cpu").expand(arg)))
-        # TODO: support kwargs
+                extra_args.append((i, torch.empty(arg, device="cpu")))
+        for key, value in kwargs.items():
+            if isinstance(value, torch.Size):
+                extra_kwargs[key] = torch.empty(value, device="cpu")
 
         try:
-            new_f = make_fx(f, tracing_mode=tracing_mode)(args, kwargs, extra_args)
+            new_f = make_fx(f, tracing_mode=tracing_mode)(args, kwargs, extra_args, extra_kwargs)
         except DynamicOutputShapeException as e:
             self.skipTest("Dynamic output shape operation in trace")
         for arg in args:
             if isinstance(arg, torch.Tensor) and arg.dtype == torch.float:
                 arg.uniform_(0, 1)
         try:
-            old_out = f(args, kwargs, extra_args)
+            old_out = f(args, kwargs, extra_args, extra_kwargs)
         except Exception:
             continue
-        new_out = wrapper_set_seed(new_f, args, kwargs, extra_args)
+        new_out = wrapper_set_seed(new_f, args, kwargs, extra_args, extra_kwargs)
         self.assertEqual(new_out, old_out)
 
 class TestProxyTensorOpInfo(TestCase):
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 1a2d332e99fd9..7c84cb7e2ca8b 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -4,7 +4,7 @@
 from enum import Enum
 from functools import partial, reduce
 from itertools import product
-from typing import Callable, cast, Iterable, List, Optional, Tuple
+from typing import Callable, cast, Iterable, List, Optional, Tuple, Union
 
 import torch
 import torch._prims_common as utils
@@ -13,6 +13,7 @@
 from torch._decomp import register_decomposition
 from torch._prims_common import IntLike, NumberType, TensorLike, TensorSequenceType
 from torch._prims_common.wrappers import _maybe_resize_out, _safe_copy_out, out_wrapper
+from torch.fx.experimental.symbolic_shapes import guard_int, sym_float, sym_int
 from torch.utils._pytree import tree_flatten, tree_map
 
 DispatchKey = torch._C.DispatchKey  # type: ignore[attr-defined]
@@ -696,7 +697,12 @@ def _softmax_backward_data(
     grad_input = new_grad_output - output * torch.sum(
         new_grad_output, dim=dim, keepdim=True
     )
-    return _cast_grad_to_input_dtype(grad_output, grad_input, input_dtype)
+
+    # CPU kernel doesn't respect input_dtype, but following check doesn't work for meta tensor
+    # if grad_output.device == torch.device("cpu"):
+    #     return grad_input.contiguous()
+
+    return _cast_grad_to_input_dtype(grad_output, grad_input, input_dtype).contiguous()
 
 
 @register_decomposition(aten._log_softmax_backward_data)
@@ -912,9 +918,17 @@ def check_positive(param, param_name, strict=True):
 
 
 @register_decomposition(aten.native_dropout_backward)
-@pw_cast_for_opmath
 def native_dropout_backward(grad_output: Tensor, mask: Tensor, scale: float):
-    return grad_output * (mask.type_as(grad_output) * scale)
+    # According to the CUDA kernel implementation we should have this test;
+    # but it seems to fail tests!
+    # utils.check(mask.dtype == torch.bool, lambda: f"Mask should be Bool Scalar Type {mask.dtype}")
+
+    # Mimicking CUDA kernel's behavior for output stride: output follow input's memory format
+    # This different from TensorIterator's behavior
+    r = (grad_output * (mask.type_as(grad_output) * scale)).clone(
+        memory_format=utils.suggest_memory_format(grad_output)
+    )
+    return r
 
 
 @register_decomposition(aten.unfold_backward)
@@ -1095,8 +1109,9 @@ def split(self: Tensor, split_size: int, dim: int = 0) -> List[Tensor]:
         assert dim_size == 0
         return [self]
     chunks = (dim_size + split_size - 1) // split_size
+    chunks = guard_int(chunks)
     split_sizes = [split_size for i in range(chunks)]
-    split_sizes[chunks - 1] = split_size - (split_size * chunks - dim_size)
+    split_sizes[-1] = split_size - (split_size * chunks - dim_size)
     return torch.split(self, split_sizes, dim)
 
 
@@ -1786,29 +1801,74 @@ def norm(
     return torch.linalg.vector_norm(self, p, dim, keepdim, dtype=dtype)
 
 
+# aten/src/ATen/native/UpSample.cpp compute_output_size
+def upsample_compute_output_size(input_size, output_size, scale_factors):
+    spatial_dimensions = len(input_size) - 2
+    if output_size is not None:
+        utils.check(
+            scale_factors is None,
+            lambda: "Must specify exactly one of output_size and scale_factors",
+        )
+        utils.check(len(output_size) == spatial_dimensions, lambda: "")
+        return output_size
+    if scale_factors is not None:
+        # NB: this isn't necessary lol
+        utils.check(
+            output_size is None,
+            lambda: "Must specify exactly one of output_size and scale_factors",
+        )
+        utils.check(len(scale_factors) == spatial_dimensions, lambda: "")
+        return [
+            # Returning output_size as float. We cannot convert it to int directly,
+            # as latter computation of scale_factor is relying output size being float
+            sym_float(input_size[i + 2] * scale_factors[i])
+            for i in range(spatial_dimensions)
+        ]
+    utils.check(
+        False, lambda: "Must specify exactly one of output_size and scale_factors"
+    )
+
+
+def get_scale_value(scales, idx):
+    if scales is None:
+        return None
+    return scales[idx]
+
+
 @register_decomposition(torch.ops.aten.upsample_bilinear2d.vec)
-@register_decomposition(torch.ops.aten.upsample_bilinear2d.vec, type="pre_autograd")
+@torch.ops.aten.upsample_bilinear2d.vec.py_impl(DispatchKey.CompositeImplicitAutograd)
+@torch.ops.aten.upsample_bilinear2d.vec.py_impl(DispatchKey.Autograd)
+def upsample_bilinear2d_vec(input, output_size, align_corners, scale_factors):
+    osize = upsample_compute_output_size(input.size(), output_size, scale_factors)
+    scale_h = get_scale_value(scale_factors, 0)
+    scale_w = get_scale_value(scale_factors, 1)
+
+    # NB: osize could be a list of float when scale_factors is float
+    # so we cannot redispatch to aten.upsample_bilinear2d.default here
+    return upsample_bilinear2d(input, osize, align_corners, scale_h, scale_w)
+
+
+@register_decomposition(torch.ops.aten.upsample_bilinear2d.default)
+@torch.ops.aten.upsample_bilinear2d.default.py_impl(DispatchKey.Autograd)
 @pw_cast_for_opmath
-def upsample_bilinear2d_vec(
+def upsample_bilinear2d(
     input: Tensor,
-    output_size: Optional[List[int]],
+    output_size: List[Union[int, float]],
     align_corners: bool,
-    scale_factors: Optional[List[float]],
+    scales_h: Optional[float] = None,
+    scales_w: Optional[float] = None,
 ) -> Tensor:
     # get dimensions of original image
     n_batch, n_channels, in_h, in_w = input.shape
 
-    if output_size is not None:
-        out_h = float(output_size[0])
-        out_w = float(output_size[1])
-    elif scale_factors is not None:
-        out_h = in_h * scale_factors[0]
-        out_w = in_w * scale_factors[1]
+    out_h = sym_float(output_size[0])
+    out_w = sym_float(output_size[1])
 
     # Calculate horizontal and vertical scaling factor
+    # TODO: Figure out if scales_h/scales_w matters here
     if out_h > 1:
         if align_corners:
-            h_scale_factor = (in_h - 1) / (int(out_h) - 1)
+            h_scale_factor = (in_h - 1) / (sym_int(out_h) - 1)
         else:
             h_scale_factor = in_h / out_h
     else:
@@ -1816,14 +1876,14 @@ def upsample_bilinear2d_vec(
 
     if out_w > 1:
         if align_corners:
-            w_scale_factor = (in_w - 1) / (int(out_w) - 1)
+            w_scale_factor = (in_w - 1) / (sym_int(out_w) - 1)
         else:
             w_scale_factor = in_w / out_w
     else:
         w_scale_factor = 0.0
 
-    i = torch.arange(int(out_h), dtype=input.dtype, device=input.device)
-    j = torch.arange(int(out_w), dtype=input.dtype, device=input.device)
+    i = torch.arange(sym_int(out_h), dtype=input.dtype, device=input.device)
+    j = torch.arange(sym_int(out_w), dtype=input.dtype, device=input.device)
 
     if align_corners:
         x = h_scale_factor * i
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 62c9b4750ae93..8a7968cf57d24 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -406,6 +406,21 @@ def sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs):
     # running_mean and running_var are required in evaluation mode (training: False) but not in training mode
     yield SampleInput(make_arg((1, 2, 3)), args=(None, None, None, None), kwargs={'training': True})
 
+def sample_inputs_softmax_backward_data(op_info, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(
+        make_tensor, device=device, dtype=dtype, requires_grad=requires_grad
+    )
+    cases = [
+        ((S,), 0),
+        ((S, S), 0),
+        ((S, M, S), -1),
+    ]
+    input_dtypes = [dtype]
+    if dtype == torch.float and device == 'cuda':
+        input_dtypes += [torch.float16]
+
+    for (shape, dim), input_dtype in product(cases, input_dtypes):
+        yield SampleInput(make_arg(shape), make_arg(shape), dim, input_dtype)
 
 def sample_inputs_native_batch_norm(op_info, device, dtype, requires_grad, **kwargs):
     samples = sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs)
@@ -1173,7 +1188,7 @@ def sample_inputs_zero_(op_info, device, dtype, requires_grad, **kwargs):
     cases = ((), (S, S, S), (S,))
 
     for shape in cases:
-        yield(SampleInput(make_arg(shape)))
+        yield SampleInput(make_arg(shape))
 
 # TODO: add reduction kwargs
 def sample_inputs_multi_margin_loss(op_info, device, dtype, requires_grad, **kwargs):
@@ -3745,8 +3760,8 @@ def sample_inputs_upsample(mode, self, device, dtype, requires_grad, **kwargs):
 
     def shape(size, rank, with_batch_channel=True):
         if with_batch_channel:
-            return tuple([N, C] + ([size] * rank))
-        return tuple([size] * rank)
+            return torch.Size([N, C] + ([size] * rank))
+        return torch.Size([size] * rank)
 
     make_arg = partial(make_tensor, device=device, dtype=dtype,
                        requires_grad=requires_grad, low=-1, high=1)
@@ -5794,9 +5809,9 @@ def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=Fals
 
     if list_args:
         cases = (
-            ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)],)),
-            ((S, S, S), ([int(S / 2), S - int(S / 2) * 2, int(S / 2)], 2),),
-            ((S, S, S), ([int(S / 2), S - int(S / 2) * 2, int(S / 2)], -2),)
+            ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]),)),
+            ((S, S, S), (torch.Size([int(S / 2), S - int(S / 2) * 2, int(S / 2)]), 2),),
+            ((S, S, S), (torch.Size([int(S / 2), S - int(S / 2) * 2, int(S / 2)]), -2),)
         )
     else:
         cases = (  # type: ignore[assignment]
@@ -5811,10 +5826,10 @@ def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=Fals
 def sample_inputs_split_with_sizes(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
 
-    cases = (((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)],)),
-             ((S, S, S), ([int(S / 3), S - int(S / 3), 0],)),
-             ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)], 2)),
-             ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)], -2)),
+    cases = (((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]),)),
+             ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3), 0]),)),
+             ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]), 2)),
+             ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]), -2)),
              )
 
     for shape, args in cases:
@@ -6190,7 +6205,7 @@ def sample_inputs_resize_ops(op_info, device, dtype, requires_grad, **kwargs):
         else:
             raise ValueError("sample_inputs_resize_ops is being used with incorrect operator")
 
-        yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args))
+        yield SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)
 
 def sample_inputs_view_reshape(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -6446,7 +6461,7 @@ def sample_inputs_expand(op_info, device, dtype, requires_grad, **kwargs):
 
     for case in cases:
         shape, args = case
-        yield(SampleInput(make_arg(shape), args=(args, )))
+        yield SampleInput(make_arg(shape), args=(args,))
 
 def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -6469,8 +6484,8 @@ def sample_inputs_expand_as(op_info, device, dtype, requires_grad, **kwargs):
              )
 
     for shape, shape_other in cases:
-        yield(SampleInput(make_arg(shape, requires_grad=requires_grad),
-                          args=(make_arg(shape_other, requires_grad=False), )))
+        yield SampleInput(make_arg(shape, requires_grad=requires_grad),
+                          args=(make_arg(shape_other, requires_grad=False),))
 
 
 def sample_inputs_where(op_info, device, dtype, requires_grad, **kwargs):
@@ -6588,8 +6603,8 @@ def sample_inputs_nonzero(op_info, device, dtype, requires_grad, **kwargs):
         inputs.append(mixed)
 
     for input_t, as_tuple in product(inputs, [False, True]):
-        yield(SampleInput(input_t.clone().requires_grad_(requires_grad),
-                          kwargs=dict(as_tuple=as_tuple)))
+        yield SampleInput(input_t.clone().requires_grad_(requires_grad),
+                          kwargs=dict(as_tuple=as_tuple))
 
 def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -6600,7 +6615,7 @@ def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs):
 
     for case in cases:
         shape, args = case
-        yield(SampleInput(make_arg(shape), args=args))
+        yield SampleInput(make_arg(shape), args=args)
 
 def reference_inputs_chunk(op, device, dtype, requires_grad, **kwargs):
     yield from sample_inputs_chunk(op, device, dtype, requires_grad, **kwargs)
@@ -6678,6 +6693,15 @@ def sample_inputs_dropout(op_info, device, dtype, requires_grad, *,
         yield SampleInput(make_arg(case), p=p, training=training)
     yield SampleInput(make_arg(case))
 
+def sample_inputs_dropout_backward(op_info, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+    make_mask = partial(make_tensor, device=device, dtype=torch.bool, requires_grad=False)
+
+    cases = ((S, S, S, S), (S,), ())
+    scale_vals = [0.0, 1.0, 2.0]
+
+    for case, scale in product(cases, scale_vals):
+        yield SampleInput(make_arg(case), make_mask(case), scale)
 
 def sample_inputs_embedding_bag(op_info, device, dtype, requires_grad, **kwargs):
     def make_input(shape):
@@ -8095,7 +8119,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     in_shape = input.shape
     in_rank = len(in_shape)
     for d in start_dim, end_dim:
-        if not((in_rank == 0 and d in (-1, 0)) or -in_rank <= d < in_rank):
+        if not ((in_rank == 0 and d in (-1, 0)) or -in_rank <= d < in_rank):
             raise IndexError(f"Dimension out of range (expected to be in range of [{-in_rank}, {in_rank-1}], but got {d}")
     end_dim = end_dim if end_dim >= 0 else in_rank + end_dim
     start_dim = start_dim if start_dim >= 0 else in_rank + start_dim
@@ -8424,7 +8448,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            variant_test_name='decomposed',
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
-                                                       *[torch.bfloat16] if(CUDA11OrLater or TEST_WITH_ROCM) else []),
+                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
            assert_autodiffed=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -10554,6 +10578,22 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            supports_out=True),
+    OpInfo(
+        '_softmax_backward_data',
+        op=torch.ops.aten._softmax_backward_data,
+        aten_name='_softmax_backward_data',
+        dtypes=floating_types_and(torch.bfloat16),
+        dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16),
+        sample_inputs_func=sample_inputs_softmax_backward_data,
+        assert_autodiffed=True,
+        supports_forward_ad=True,
+        supports_fwgrad_bwgrad=True,
+        supports_out=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples', device_type='cpu'),
+            DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)),
+        ),
+    ),
     # `softmin` supports different dtypes based on whether `dtype` argument,
     # is passed or not. Hence two OpInfo entries, one with dtype and other without.
     # https://github.com/pytorch/pytorch/issues/68752
@@ -15927,6 +15967,22 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_dropout,
         inplace_variant=lambda input, *args, **kwargs:
             wrapper_set_seed(torch.nn.functional.dropout, input, *args, **kwargs, inplace=True)),
+    OpInfo(
+        "native_dropout_backward",
+        op=torch.ops.aten.native_dropout_backward.default,
+        aten_name="native_dropout_backward",
+        dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
+        dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
+        supports_out=False,
+        sample_inputs_func=sample_inputs_dropout_backward,
+        skips=(
+            DecorateInfo(unittest.skip('Skipped!'), 'TestJit', 'test_variant_consistency_jit'),
+            # Lazy tensor failures
+            DecorateInfo(unittest.skip('Skipped!'), 'TestLazyOpInfo', 'test_dispatched_to_lazy'),
+            DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness'),
+            DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness_with_reusing_ir'),
+        ),
+    ),
     OpInfo(
         "nn.functional.dropout2d",
         op=lambda input, *args, **kwargs:

From 3a436345de1d60f09085981cab3acc61af11c219 Mon Sep 17 00:00:00 2001
From: Aaron Gokaslan <aaronGokaslan@gmail.com>
Date: Sun, 13 Nov 2022 22:05:41 +0000
Subject: [PATCH 0851/1922] Fix: [ATen] Add some missing moves (#88514)

Related to #88512 , but for ATen. This should reduce a number of copies and inefficient atomic smart pointer increments.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88514
Approved by: https://github.com/jgong5, https://github.com/ezyang
---
 aten/src/ATen/InferSize.h                              | 2 +-
 aten/src/ATen/core/Formatting.cpp                      | 4 ++--
 aten/src/ATen/core/Formatting.h                        | 4 ++--
 aten/src/ATen/native/TensorShape.cpp                   | 5 +++--
 aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp   | 3 ++-
 aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp | 3 ++-
 c10/core/Storage.h                                     | 2 +-
 c10/core/StorageImpl.h                                 | 2 +-
 c10/core/WrapDimMinimal.cpp                            | 3 ++-
 c10/core/WrapDimMinimal.h                              | 2 +-
 10 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h
index 594b87373a209..111c7eb8f5fc7 100644
--- a/aten/src/ATen/InferSize.h
+++ b/aten/src/ATen/InferSize.h
@@ -80,7 +80,7 @@ inline at::SymDimVector infer_size_dv(
     c10::SymInt numel) {
   auto res = at::SymDimVector(shape);
   infer_size_impl<c10::SymIntArrayRef, c10::SymInt, at::SymDimVector>(
-      shape, numel, res);
+      shape, std::move(numel), res);
   return res;
 }
 
diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp
index 875b9ef3d0427..4537adff5aa4b 100644
--- a/aten/src/ATen/core/Formatting.cpp
+++ b/aten/src/ATen/core/Formatting.cpp
@@ -13,7 +13,7 @@ std::ostream& operator<<(std::ostream & out, Backend b) {
   return out << toString(b);
 }
 
-std::ostream& operator<<(std::ostream & out, Scalar s) {
+std::ostream& operator<<(std::ostream & out, const Scalar& s) {
   if (s.isFloatingPoint()) {
     return out << s.toDouble();
   }
@@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream & out, Scalar s) {
   throw std::logic_error("Unknown type in Scalar");
 }
 
-std::string toString(Scalar s) {
+std::string toString(const Scalar& s) {
   std::stringstream out;
   out << s;
   return out.str();
diff --git a/aten/src/ATen/core/Formatting.h b/aten/src/ATen/core/Formatting.h
index 6dcfc6c7b3cd1..9dcd14e1902ee 100644
--- a/aten/src/ATen/core/Formatting.h
+++ b/aten/src/ATen/core/Formatting.h
@@ -8,8 +8,8 @@
 
 namespace c10 {
 TORCH_API std::ostream& operator<<(std::ostream& out, Backend b);
-TORCH_API std::ostream& operator<<(std::ostream & out, Scalar s);
-TORCH_API std::string toString(Scalar s);
+TORCH_API std::ostream& operator<<(std::ostream & out, const Scalar& s);
+TORCH_API std::string toString(const Scalar& s);
 }
 namespace at {
 
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index e8c87a2f1f5ce..ccaf4b464252a 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -204,10 +204,11 @@
 #include <ATen/ops/zeros_native.h>
 #endif
 
+#include <c10/util/StringUtil.h>
 #include <algorithm>
 #include <cstdint>
+#include <utility>
 #include <vector>
-#include <c10/util/StringUtil.h>
 
 namespace at {
 namespace meta {
@@ -416,7 +417,7 @@ Tensor& set_storage_meta__symint(Tensor& result, Storage storage, c10::SymInt st
     const auto itemsize = result.dtype().itemsize();
     c10::SymInt size_bytes = at::detail::computeStorageNbytes(
         size, stride, itemsize, storage_offset);
-    storage.set_nbytes(size_bytes);
+    storage.set_nbytes(std::move(size_bytes));
   }
   return result;
 }
diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
index 2250e84ad7a6e..9d2f1a96c31ba 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
@@ -1,4 +1,5 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <utility>
 #include <vector>
 
 #include <ATen/core/Tensor.h>
@@ -444,7 +445,7 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsOnednn<
   exp_wgt.init(w_desc);
   exp_wgt.set_scale(wgt_scales); // Also for feed_from()
   exp_wgt.feed_from(wgt, transpose); // expect wgt to be in [OC IC KH KW] format
-  ideep::tensor * packed_weight_p = new ideep::tensor(exp_wgt);
+  ideep::tensor * packed_weight_p = new ideep::tensor(std::move(exp_wgt));
   packed_weight_p->set_scale(wgt_scales);
   packed_weight_p->set_zero_point(wgt_zero_points);
   std::unique_ptr<ideep::tensor> weight_ptr(packed_weight_p);
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
index dda600e9b41c0..36523bbd1b9bd 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
@@ -23,6 +23,7 @@
 #include <c10/util/irange.h>
 
 #include <algorithm>
+#include <utility>
 #include <vector>
 
 int register_linear_params();
@@ -249,7 +250,7 @@ c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeightsOnednn::prepack(
                                                              dnnl::memory::data_type::u8);
   ideep::tensor exp_wgt(w_desc);
   exp_wgt.feed_from(wgt);
-  ideep::tensor * packed_weight_p = new ideep::tensor(exp_wgt);
+  ideep::tensor * packed_weight_p = new ideep::tensor(std::move(exp_wgt));
   packed_weight_p->set_scale(wgt_scales);
   packed_weight_p->set_zero_point(wgt_zero_points);
   std::unique_ptr<ideep::tensor> weight_ptr(packed_weight_p);
diff --git a/c10/core/Storage.h b/c10/core/Storage.h
index a89a0039fdfe6..09c5920b56493 100644
--- a/c10/core/Storage.h
+++ b/c10/core/Storage.h
@@ -76,7 +76,7 @@ struct C10_API Storage {
   }
 
   void set_nbytes(c10::SymInt size_bytes) const {
-    storage_impl_.get()->set_nbytes(size_bytes);
+    storage_impl_.get()->set_nbytes(std::move(size_bytes));
   }
 
   bool resizable() const {
diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h
index bbf0803842537..1d80daed871a2 100644
--- a/c10/core/StorageImpl.h
+++ b/c10/core/StorageImpl.h
@@ -112,7 +112,7 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
   }
 
   void set_nbytes(c10::SymInt size_bytes) {
-    size_bytes_ = size_bytes;
+    size_bytes_ = std::move(size_bytes);
   }
 
   bool resizable() const {
diff --git a/c10/core/WrapDimMinimal.cpp b/c10/core/WrapDimMinimal.cpp
index 6703f0638901e..2375dc3ac5cf7 100644
--- a/c10/core/WrapDimMinimal.cpp
+++ b/c10/core/WrapDimMinimal.cpp
@@ -14,7 +14,8 @@ T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar) {
         "Dimension specified as ",
         dim,
         " but tensor has no dimensions");
-    return c10::maybe_wrap_dim(dim, /*dim_post_expr=*/1, /*wrap_scalar=*/false);
+    return c10::maybe_wrap_dim(
+        std::move(dim), /*dim_post_expr=*/1, /*wrap_scalar=*/false);
   }
 
   T min = dim_post_expr * -1;
diff --git a/c10/core/WrapDimMinimal.h b/c10/core/WrapDimMinimal.h
index 0f5949f65082b..dda01fbe18f0f 100644
--- a/c10/core/WrapDimMinimal.h
+++ b/c10/core/WrapDimMinimal.h
@@ -38,7 +38,7 @@ inline c10::SymInt maybe_wrap_dim(
     c10::SymInt dim,
     c10::SymInt dim_post_expr,
     bool wrap_scalar = true) {
-  return _maybe_wrap_dim(dim, dim_post_expr, wrap_scalar);
+  return _maybe_wrap_dim(std::move(dim), std::move(dim_post_expr), wrap_scalar);
 }
 
 } // namespace c10

From 3c30c687f17f89e1b5c09cd7cb1e496211d844b9 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 14 Nov 2022 01:58:47 +0000
Subject: [PATCH 0852/1922] Revert "Symintify decomps for split and
 upsample_bilinear; Fix decomp for _softmax_backward_data and
 native_dropout_backward (#88761)"

This reverts commit 9eabcc370f4c3a04be85cb1f878038f10716bdc3.

Reverted https://github.com/pytorch/pytorch/pull/88761 on behalf of https://github.com/suo due to much broken https://hud.pytorch.org/pytorch/pytorch/commit/9eabcc370f4c3a04be85cb1f878038f10716bdc3
---
 test/functorch/test_aotdispatch.py            |  4 +
 test/functorch/test_ops.py                    |  4 -
 test/functorch/test_vmap.py                   |  3 -
 test/inductor/test_torchinductor_opinfo.py    |  1 -
 test/test_decomp.py                           |  3 -
 test/test_proxy_tensor.py                     | 22 ++---
 torch/_decomp/decompositions.py               | 98 ++++---------------
 .../_internal/common_methods_invocations.py   | 94 ++++--------------
 8 files changed, 52 insertions(+), 177 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index e0ffcbe7d97d5..ea00842a4e005 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1005,6 +1005,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cdist', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('cholesky_inverse', ''),  # could not find kernel
     xfail('cholesky_solve', ''),  # could not find kernel
+    xfail('chunk', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('column_stack', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('combinations', ''),  # aten.masked_select.default
     xfail('complex', ''),  # aten.view_as_real.default - couldn't find symbolic meta function/decomposition
@@ -1138,6 +1139,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.zeros_like.default - couldn't find symbolic meta...
     xfail('nn.functional.interpolate', 'area'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'bicubic'),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('nn.functional.interpolate', 'bilinear'),  # Cannot call sizes() on tensor with symbolic sizes/str...
     xfail('nn.functional.interpolate', 'linear'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'nearest'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'trilinear'),  # Cannot call sizes() on tensor with symbolic sizes/st...
@@ -1164,6 +1166,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.rrelu', ''),  # aten.rrelu_with_noise.default - couldn't find symbolic meta function...
     xfail('nn.functional.smooth_l1_loss', ''),  # could not find kernel
     xfail('nn.functional.unfold', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('nn.functional.upsample_bilinear', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.upsample_nearest', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', 'nuc'),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -1196,6 +1199,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('sgn', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('special.i1', ''),  # aten.i0.default - couldn't find symbolic meta function/decomposition
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic ...
+    xfail('split', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('std', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('std_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('stft', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 85ac70d74825e..74085941c6c88 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -1052,7 +1052,6 @@ def test():
         xfail('segment_reduce', 'lengths'),
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
-        xfail("native_dropout_backward"),
     }))
     def test_vmapvjp_has_batch_rule(self, device, dtype, op):
         if not op.supports_autograd:
@@ -1217,8 +1216,6 @@ def get_vjp(cotangents, *primals):
         xfail('segment_reduce', 'offsets'),  # NYI: forward-AD for segment_reduce
         xfail('index_reduce', ''),  # NYI: forward-AD for index_reduce
         xfail('segment_reduce', 'lengths'),  # NYI: forward-AD for segment_reduce
-        xfail('native_dropout_backward'),  # NYI
-
     }))
     @opsToleranceOverride('TestOperators', 'test_jvpvjp', (
         tol1('masked.prod',
@@ -1375,7 +1372,6 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         # input while the running_mean or running_var, which will be updated in
         # place, were not batched.
         xfail("native_batch_norm"),
-        xfail('native_dropout_backward',)
     }))
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 9726b7feedb7f..6d95077b627e2 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3238,7 +3238,6 @@ def test():
         xfail('broadcast_shapes', ''),  # test runner can't handle non-Tensor ops
         xfail('sparse.sampled_addmm'),  # sparse
         xfail('cross'),  # The default value of dim in op is *very* weird. No wonder it doesn't work
-        skip('_softmax_backward_data'),
         skip('linalg.eigh', ''),  # not unique, see test_linalg_eigh for manual test
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         # ----------------------------------------------------------------------
@@ -3380,7 +3379,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('bernoulli', ''),
         xfail('linalg.lu_factor', ''),
         xfail('nn.functional.feature_alpha_dropout', 'with_train'),
-        xfail('native_dropout_backward'),
         xfail('nn.functional.kl_div', ''),
         xfail('multinomial', ''),
         xfail('column_stack', ''),
@@ -3454,7 +3452,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('equal', ''),
         xfail('linalg.lu', ''),
         skip('linalg.ldl_solve', ''),
-        skip('_softmax_backward_data'),
     }))
     def test_op_has_batch_rule(self, device, dtype, op):
         # needs to be fixed
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 89ea42c9fea7e..3d384efea0aec 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -425,7 +425,6 @@ def wrapper_set_seed(op, *args, **kwargs):
     "randn": {"assert_equal": False},
     ("nn.functional.tanhshrink", "cuda", f16): {"atol": 3e-4, "rtol": 0.001},
     ("cummax", "cuda", f16): {"atol": 5e-4, "rtol": 0.002},
-    ("_softmax_backward_data", "cuda", f16): {"atol": 0.008, "rtol": 0.002},
     "gradient": {"check_gradient": False},  # segfault on check_gradient
     # Following tests failed, and causing subsequent tests failing with unrecoverable CUDA error
     "linalg.solve_triangular": {"check_gradient": False},
diff --git a/test/test_decomp.py b/test/test_decomp.py
index a3658792c5e71..67e99d5eb8291 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -294,9 +294,6 @@ def normalize_op_input_output(f, sample, requires_grad=True):
     (None, None, "meshgrid"),
     # diag was not decomposed (it just registers a decomp for diag_out, torch.diag is CompImplicit)
     (None, None, "diag"),
-
-    # _softmax_backward_data's CPU kernel for bfloat16 always return the grad_input as float32
-    ("cpu", torch.bfloat16, "_softmax_backward_data"),
 }
 
 CROSS_REF_BACKWARD_EXCLUDE_SET = {
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 894b35693430e..42ecc3d376ab8 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1124,6 +1124,7 @@ def f(a, b, c, d, e):
     xfail('cartesian_prod', ''),  # Tensors of type TensorImpl do not have numel
     xfail('cdist', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('cholesky_solve', ''),  # Could not run 'aten::_cholesky_solve_helper' with arguments from the 'Meta' back...
+    xfail('chunk', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('column_stack', ''),  # Tensors of type TensorImpl do not have numel
     xfail('combinations', ''),
     xfail('count_nonzero', ''),  # Could not run 'aten::count_nonzero.dim_IntList' with arguments from the 'Meta' ba...
@@ -1246,6 +1247,7 @@ def f(a, b, c, d, e):
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.empty_like.default - couldn't find symbolic meta function/deco...
     xfail('nn.functional.interpolate', 'area'),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.interpolate', 'bicubic'),  # aten.upsample_bicubic2d.vec - couldn't find symbolic meta function/d...
+    xfail('nn.functional.interpolate', 'bilinear'),  # aten.upsample_bilinear2d.vec - couldn't find symbolic meta function...
     xfail('nn.functional.interpolate', 'linear'),  # aten.upsample_linear1d.vec - couldn't find symbolic meta function/dec...
     xfail('nn.functional.interpolate', 'nearest'),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/d...
     xfail('nn.functional.interpolate', 'trilinear'),  # aten.upsample_trilinear3d.vec - couldn't find symbolic meta functi...
@@ -1265,6 +1267,7 @@ def f(a, b, c, d, e):
     xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.unfold', ''),  # aten.im2col.default - couldn't find symbolic meta function/decomposition
+    xfail('nn.functional.upsample_bilinear', ''),  # aten.upsample_bilinear2d.vec - couldn't find symbolic meta function/de...
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
     xfail('nonzero', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
     xfail('norm', 'nuc'),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -1310,6 +1313,7 @@ def f(a, b, c, d, e):
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic meta function/...
     xfail('special.scaled_modified_bessel_k0', ''),  # aten.special_scaled_modified_bessel_k0.default - couldn't find symbo...
     xfail('special.scaled_modified_bessel_k1', ''),  # aten.special_scaled_modified_bessel_k1.default - couldn't find symbo...
+    xfail('split', ''),  # 'torch._C.SymIntNode' and 'int'
     xfail('stft', ''),  # argument 'size' must be tuple of ints, but found element of type torch._C.SymIntNode at...
     xfail('sum_to_size', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('svd', ''),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -1435,13 +1439,10 @@ def _fn(t, *args, **kwargs):
     return _fn
 
 def _test_make_fx_helper(self, device, dtype, op, tracing_mode, inplace=False):
-    def f(args, kwargs, extra_args, extra_kwargs):
+    def f(args, kwargs, extra_args):
         if extra_args:
             for i, t in extra_args:
                 args[i] = t.size()
-        if extra_kwargs:
-            for k, t in extra_kwargs.items():
-                kwargs[k] = t.size()
 
         fn = _get_safe_inplace(op.get_inplace()) if inplace else op.op
         return fn(*args, **kwargs)
@@ -1462,26 +1463,23 @@ def f(args, kwargs, extra_args, extra_kwargs):
         # - Unpack the size in the wrapper to get a torch.Size with dynamic shapes (in
         #   symbolic mode, a no-op otherwise)
         extra_args = []
-        extra_kwargs = {}
         for i, arg in enumerate(args):
             if isinstance(arg, torch.Size):
-                extra_args.append((i, torch.empty(arg, device="cpu")))
-        for key, value in kwargs.items():
-            if isinstance(value, torch.Size):
-                extra_kwargs[key] = torch.empty(value, device="cpu")
+                extra_args.append((i, torch.empty((), device="cpu").expand(arg)))
+        # TODO: support kwargs
 
         try:
-            new_f = make_fx(f, tracing_mode=tracing_mode)(args, kwargs, extra_args, extra_kwargs)
+            new_f = make_fx(f, tracing_mode=tracing_mode)(args, kwargs, extra_args)
         except DynamicOutputShapeException as e:
             self.skipTest("Dynamic output shape operation in trace")
         for arg in args:
             if isinstance(arg, torch.Tensor) and arg.dtype == torch.float:
                 arg.uniform_(0, 1)
         try:
-            old_out = f(args, kwargs, extra_args, extra_kwargs)
+            old_out = f(args, kwargs, extra_args)
         except Exception:
             continue
-        new_out = wrapper_set_seed(new_f, args, kwargs, extra_args, extra_kwargs)
+        new_out = wrapper_set_seed(new_f, args, kwargs, extra_args)
         self.assertEqual(new_out, old_out)
 
 class TestProxyTensorOpInfo(TestCase):
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 7c84cb7e2ca8b..1a2d332e99fd9 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -4,7 +4,7 @@
 from enum import Enum
 from functools import partial, reduce
 from itertools import product
-from typing import Callable, cast, Iterable, List, Optional, Tuple, Union
+from typing import Callable, cast, Iterable, List, Optional, Tuple
 
 import torch
 import torch._prims_common as utils
@@ -13,7 +13,6 @@
 from torch._decomp import register_decomposition
 from torch._prims_common import IntLike, NumberType, TensorLike, TensorSequenceType
 from torch._prims_common.wrappers import _maybe_resize_out, _safe_copy_out, out_wrapper
-from torch.fx.experimental.symbolic_shapes import guard_int, sym_float, sym_int
 from torch.utils._pytree import tree_flatten, tree_map
 
 DispatchKey = torch._C.DispatchKey  # type: ignore[attr-defined]
@@ -697,12 +696,7 @@ def _softmax_backward_data(
     grad_input = new_grad_output - output * torch.sum(
         new_grad_output, dim=dim, keepdim=True
     )
-
-    # CPU kernel doesn't respect input_dtype, but following check doesn't work for meta tensor
-    # if grad_output.device == torch.device("cpu"):
-    #     return grad_input.contiguous()
-
-    return _cast_grad_to_input_dtype(grad_output, grad_input, input_dtype).contiguous()
+    return _cast_grad_to_input_dtype(grad_output, grad_input, input_dtype)
 
 
 @register_decomposition(aten._log_softmax_backward_data)
@@ -918,17 +912,9 @@ def check_positive(param, param_name, strict=True):
 
 
 @register_decomposition(aten.native_dropout_backward)
+@pw_cast_for_opmath
 def native_dropout_backward(grad_output: Tensor, mask: Tensor, scale: float):
-    # According to the CUDA kernel implementation we should have this test;
-    # but it seems to fail tests!
-    # utils.check(mask.dtype == torch.bool, lambda: f"Mask should be Bool Scalar Type {mask.dtype}")
-
-    # Mimicking CUDA kernel's behavior for output stride: output follow input's memory format
-    # This different from TensorIterator's behavior
-    r = (grad_output * (mask.type_as(grad_output) * scale)).clone(
-        memory_format=utils.suggest_memory_format(grad_output)
-    )
-    return r
+    return grad_output * (mask.type_as(grad_output) * scale)
 
 
 @register_decomposition(aten.unfold_backward)
@@ -1109,9 +1095,8 @@ def split(self: Tensor, split_size: int, dim: int = 0) -> List[Tensor]:
         assert dim_size == 0
         return [self]
     chunks = (dim_size + split_size - 1) // split_size
-    chunks = guard_int(chunks)
     split_sizes = [split_size for i in range(chunks)]
-    split_sizes[-1] = split_size - (split_size * chunks - dim_size)
+    split_sizes[chunks - 1] = split_size - (split_size * chunks - dim_size)
     return torch.split(self, split_sizes, dim)
 
 
@@ -1801,74 +1786,29 @@ def norm(
     return torch.linalg.vector_norm(self, p, dim, keepdim, dtype=dtype)
 
 
-# aten/src/ATen/native/UpSample.cpp compute_output_size
-def upsample_compute_output_size(input_size, output_size, scale_factors):
-    spatial_dimensions = len(input_size) - 2
-    if output_size is not None:
-        utils.check(
-            scale_factors is None,
-            lambda: "Must specify exactly one of output_size and scale_factors",
-        )
-        utils.check(len(output_size) == spatial_dimensions, lambda: "")
-        return output_size
-    if scale_factors is not None:
-        # NB: this isn't necessary lol
-        utils.check(
-            output_size is None,
-            lambda: "Must specify exactly one of output_size and scale_factors",
-        )
-        utils.check(len(scale_factors) == spatial_dimensions, lambda: "")
-        return [
-            # Returning output_size as float. We cannot convert it to int directly,
-            # as latter computation of scale_factor is relying output size being float
-            sym_float(input_size[i + 2] * scale_factors[i])
-            for i in range(spatial_dimensions)
-        ]
-    utils.check(
-        False, lambda: "Must specify exactly one of output_size and scale_factors"
-    )
-
-
-def get_scale_value(scales, idx):
-    if scales is None:
-        return None
-    return scales[idx]
-
-
 @register_decomposition(torch.ops.aten.upsample_bilinear2d.vec)
-@torch.ops.aten.upsample_bilinear2d.vec.py_impl(DispatchKey.CompositeImplicitAutograd)
-@torch.ops.aten.upsample_bilinear2d.vec.py_impl(DispatchKey.Autograd)
-def upsample_bilinear2d_vec(input, output_size, align_corners, scale_factors):
-    osize = upsample_compute_output_size(input.size(), output_size, scale_factors)
-    scale_h = get_scale_value(scale_factors, 0)
-    scale_w = get_scale_value(scale_factors, 1)
-
-    # NB: osize could be a list of float when scale_factors is float
-    # so we cannot redispatch to aten.upsample_bilinear2d.default here
-    return upsample_bilinear2d(input, osize, align_corners, scale_h, scale_w)
-
-
-@register_decomposition(torch.ops.aten.upsample_bilinear2d.default)
-@torch.ops.aten.upsample_bilinear2d.default.py_impl(DispatchKey.Autograd)
+@register_decomposition(torch.ops.aten.upsample_bilinear2d.vec, type="pre_autograd")
 @pw_cast_for_opmath
-def upsample_bilinear2d(
+def upsample_bilinear2d_vec(
     input: Tensor,
-    output_size: List[Union[int, float]],
+    output_size: Optional[List[int]],
     align_corners: bool,
-    scales_h: Optional[float] = None,
-    scales_w: Optional[float] = None,
+    scale_factors: Optional[List[float]],
 ) -> Tensor:
     # get dimensions of original image
     n_batch, n_channels, in_h, in_w = input.shape
 
-    out_h = sym_float(output_size[0])
-    out_w = sym_float(output_size[1])
+    if output_size is not None:
+        out_h = float(output_size[0])
+        out_w = float(output_size[1])
+    elif scale_factors is not None:
+        out_h = in_h * scale_factors[0]
+        out_w = in_w * scale_factors[1]
 
     # Calculate horizontal and vertical scaling factor
-    # TODO: Figure out if scales_h/scales_w matters here
     if out_h > 1:
         if align_corners:
-            h_scale_factor = (in_h - 1) / (sym_int(out_h) - 1)
+            h_scale_factor = (in_h - 1) / (int(out_h) - 1)
         else:
             h_scale_factor = in_h / out_h
     else:
@@ -1876,14 +1816,14 @@ def upsample_bilinear2d(
 
     if out_w > 1:
         if align_corners:
-            w_scale_factor = (in_w - 1) / (sym_int(out_w) - 1)
+            w_scale_factor = (in_w - 1) / (int(out_w) - 1)
         else:
             w_scale_factor = in_w / out_w
     else:
         w_scale_factor = 0.0
 
-    i = torch.arange(sym_int(out_h), dtype=input.dtype, device=input.device)
-    j = torch.arange(sym_int(out_w), dtype=input.dtype, device=input.device)
+    i = torch.arange(int(out_h), dtype=input.dtype, device=input.device)
+    j = torch.arange(int(out_w), dtype=input.dtype, device=input.device)
 
     if align_corners:
         x = h_scale_factor * i
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 8a7968cf57d24..62c9b4750ae93 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -406,21 +406,6 @@ def sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs):
     # running_mean and running_var are required in evaluation mode (training: False) but not in training mode
     yield SampleInput(make_arg((1, 2, 3)), args=(None, None, None, None), kwargs={'training': True})
 
-def sample_inputs_softmax_backward_data(op_info, device, dtype, requires_grad, **kwargs):
-    make_arg = partial(
-        make_tensor, device=device, dtype=dtype, requires_grad=requires_grad
-    )
-    cases = [
-        ((S,), 0),
-        ((S, S), 0),
-        ((S, M, S), -1),
-    ]
-    input_dtypes = [dtype]
-    if dtype == torch.float and device == 'cuda':
-        input_dtypes += [torch.float16]
-
-    for (shape, dim), input_dtype in product(cases, input_dtypes):
-        yield SampleInput(make_arg(shape), make_arg(shape), dim, input_dtype)
 
 def sample_inputs_native_batch_norm(op_info, device, dtype, requires_grad, **kwargs):
     samples = sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs)
@@ -1188,7 +1173,7 @@ def sample_inputs_zero_(op_info, device, dtype, requires_grad, **kwargs):
     cases = ((), (S, S, S), (S,))
 
     for shape in cases:
-        yield SampleInput(make_arg(shape))
+        yield(SampleInput(make_arg(shape)))
 
 # TODO: add reduction kwargs
 def sample_inputs_multi_margin_loss(op_info, device, dtype, requires_grad, **kwargs):
@@ -3760,8 +3745,8 @@ def sample_inputs_upsample(mode, self, device, dtype, requires_grad, **kwargs):
 
     def shape(size, rank, with_batch_channel=True):
         if with_batch_channel:
-            return torch.Size([N, C] + ([size] * rank))
-        return torch.Size([size] * rank)
+            return tuple([N, C] + ([size] * rank))
+        return tuple([size] * rank)
 
     make_arg = partial(make_tensor, device=device, dtype=dtype,
                        requires_grad=requires_grad, low=-1, high=1)
@@ -5809,9 +5794,9 @@ def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=Fals
 
     if list_args:
         cases = (
-            ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]),)),
-            ((S, S, S), (torch.Size([int(S / 2), S - int(S / 2) * 2, int(S / 2)]), 2),),
-            ((S, S, S), (torch.Size([int(S / 2), S - int(S / 2) * 2, int(S / 2)]), -2),)
+            ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)],)),
+            ((S, S, S), ([int(S / 2), S - int(S / 2) * 2, int(S / 2)], 2),),
+            ((S, S, S), ([int(S / 2), S - int(S / 2) * 2, int(S / 2)], -2),)
         )
     else:
         cases = (  # type: ignore[assignment]
@@ -5826,10 +5811,10 @@ def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=Fals
 def sample_inputs_split_with_sizes(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
 
-    cases = (((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]),)),
-             ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3), 0]),)),
-             ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]), 2)),
-             ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]), -2)),
+    cases = (((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)],)),
+             ((S, S, S), ([int(S / 3), S - int(S / 3), 0],)),
+             ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)], 2)),
+             ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)], -2)),
              )
 
     for shape, args in cases:
@@ -6205,7 +6190,7 @@ def sample_inputs_resize_ops(op_info, device, dtype, requires_grad, **kwargs):
         else:
             raise ValueError("sample_inputs_resize_ops is being used with incorrect operator")
 
-        yield SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)
+        yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args))
 
 def sample_inputs_view_reshape(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -6461,7 +6446,7 @@ def sample_inputs_expand(op_info, device, dtype, requires_grad, **kwargs):
 
     for case in cases:
         shape, args = case
-        yield SampleInput(make_arg(shape), args=(args,))
+        yield(SampleInput(make_arg(shape), args=(args, )))
 
 def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -6484,8 +6469,8 @@ def sample_inputs_expand_as(op_info, device, dtype, requires_grad, **kwargs):
              )
 
     for shape, shape_other in cases:
-        yield SampleInput(make_arg(shape, requires_grad=requires_grad),
-                          args=(make_arg(shape_other, requires_grad=False),))
+        yield(SampleInput(make_arg(shape, requires_grad=requires_grad),
+                          args=(make_arg(shape_other, requires_grad=False), )))
 
 
 def sample_inputs_where(op_info, device, dtype, requires_grad, **kwargs):
@@ -6603,8 +6588,8 @@ def sample_inputs_nonzero(op_info, device, dtype, requires_grad, **kwargs):
         inputs.append(mixed)
 
     for input_t, as_tuple in product(inputs, [False, True]):
-        yield SampleInput(input_t.clone().requires_grad_(requires_grad),
-                          kwargs=dict(as_tuple=as_tuple))
+        yield(SampleInput(input_t.clone().requires_grad_(requires_grad),
+                          kwargs=dict(as_tuple=as_tuple)))
 
 def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -6615,7 +6600,7 @@ def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs):
 
     for case in cases:
         shape, args = case
-        yield SampleInput(make_arg(shape), args=args)
+        yield(SampleInput(make_arg(shape), args=args))
 
 def reference_inputs_chunk(op, device, dtype, requires_grad, **kwargs):
     yield from sample_inputs_chunk(op, device, dtype, requires_grad, **kwargs)
@@ -6693,15 +6678,6 @@ def sample_inputs_dropout(op_info, device, dtype, requires_grad, *,
         yield SampleInput(make_arg(case), p=p, training=training)
     yield SampleInput(make_arg(case))
 
-def sample_inputs_dropout_backward(op_info, device, dtype, requires_grad, **kwargs):
-    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
-    make_mask = partial(make_tensor, device=device, dtype=torch.bool, requires_grad=False)
-
-    cases = ((S, S, S, S), (S,), ())
-    scale_vals = [0.0, 1.0, 2.0]
-
-    for case, scale in product(cases, scale_vals):
-        yield SampleInput(make_arg(case), make_mask(case), scale)
 
 def sample_inputs_embedding_bag(op_info, device, dtype, requires_grad, **kwargs):
     def make_input(shape):
@@ -8119,7 +8095,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     in_shape = input.shape
     in_rank = len(in_shape)
     for d in start_dim, end_dim:
-        if not ((in_rank == 0 and d in (-1, 0)) or -in_rank <= d < in_rank):
+        if not((in_rank == 0 and d in (-1, 0)) or -in_rank <= d < in_rank):
             raise IndexError(f"Dimension out of range (expected to be in range of [{-in_rank}, {in_rank-1}], but got {d}")
     end_dim = end_dim if end_dim >= 0 else in_rank + end_dim
     start_dim = start_dim if start_dim >= 0 else in_rank + start_dim
@@ -8448,7 +8424,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            variant_test_name='decomposed',
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
-                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
+                                                       *[torch.bfloat16] if(CUDA11OrLater or TEST_WITH_ROCM) else []),
            assert_autodiffed=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -10578,22 +10554,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            supports_out=True),
-    OpInfo(
-        '_softmax_backward_data',
-        op=torch.ops.aten._softmax_backward_data,
-        aten_name='_softmax_backward_data',
-        dtypes=floating_types_and(torch.bfloat16),
-        dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16),
-        sample_inputs_func=sample_inputs_softmax_backward_data,
-        assert_autodiffed=True,
-        supports_forward_ad=True,
-        supports_fwgrad_bwgrad=True,
-        supports_out=False,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples', device_type='cpu'),
-            DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)),
-        ),
-    ),
     # `softmin` supports different dtypes based on whether `dtype` argument,
     # is passed or not. Hence two OpInfo entries, one with dtype and other without.
     # https://github.com/pytorch/pytorch/issues/68752
@@ -15967,22 +15927,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_dropout,
         inplace_variant=lambda input, *args, **kwargs:
             wrapper_set_seed(torch.nn.functional.dropout, input, *args, **kwargs, inplace=True)),
-    OpInfo(
-        "native_dropout_backward",
-        op=torch.ops.aten.native_dropout_backward.default,
-        aten_name="native_dropout_backward",
-        dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
-        dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
-        supports_out=False,
-        sample_inputs_func=sample_inputs_dropout_backward,
-        skips=(
-            DecorateInfo(unittest.skip('Skipped!'), 'TestJit', 'test_variant_consistency_jit'),
-            # Lazy tensor failures
-            DecorateInfo(unittest.skip('Skipped!'), 'TestLazyOpInfo', 'test_dispatched_to_lazy'),
-            DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness'),
-            DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness_with_reusing_ir'),
-        ),
-    ),
     OpInfo(
         "nn.functional.dropout2d",
         op=lambda input, *args, **kwargs:

From fab125dedad48d3dcedd9bce59f04bfda478616f Mon Sep 17 00:00:00 2001
From: iLeGend <824040212@qq.com>
Date: Mon, 14 Nov 2022 03:39:43 +0000
Subject: [PATCH 0853/1922] fix typo: AT_MKLDNN_EBABLED => AT_MKLDNN_ENABLED
 (#88952)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88952
Approved by: https://github.com/XiaobingSuper
---
 aten/src/ATen/native/mkldnn/Prelu.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mkldnn/Prelu.cpp b/aten/src/ATen/native/mkldnn/Prelu.cpp
index acc78211d83cc..dc7d239da7b68 100644
--- a/aten/src/ATen/native/mkldnn/Prelu.cpp
+++ b/aten/src/ATen/native/mkldnn/Prelu.cpp
@@ -17,7 +17,7 @@ std::tuple<Tensor, Tensor> mkldnn_prelu_backward(const Tensor& grad_output, cons
 
 }}
 
-#else // AT_MKLDNN_EBABLED
+#else // AT_MKLDNN_ENABLED
 
 #include <ATen/native/mkldnn/MKLDNNCommon.h>
 #include <ATen/native/mkldnn/Utils.h>
@@ -76,4 +76,4 @@ std::tuple<Tensor, Tensor> mkldnn_prelu_backward(const Tensor& grad_output, cons
 }
 }}
 
-#endif // AT_MKLDNN_EBABLED
+#endif // AT_MKLDNN_ENABLED

From 063a2fa8b59d11592e9264853b2413ed8533ee88 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Sun, 13 Nov 2022 22:09:53 -0500
Subject: [PATCH 0854/1922] TorchDynamo: Add convolution binary(inplace) fusion
 for cpu in inference mode (#88403)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88403
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 torch/_inductor/ir.py        | 112 +++++++++++++++++++++-
 torch/_inductor/lowering.py  |  34 +++++++
 torch/_inductor/overrides.py | 174 ++++++++++++++++++++++++++++++++---
 3 files changed, 303 insertions(+), 17 deletions(-)

diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 240c196a73b6d..ffb935ae440db 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -1552,11 +1552,23 @@ def loader(index):
 
 @dataclasses.dataclass
 class Layout(IRNode):
-    device: torch.device
-    dtype: torch.dtype
-    size: List[Expr]
-    stride: List[Expr]
-    offset: Expr = Integer(0)
+    def __init__(
+        self,
+        device: torch.device,
+        dtype: torch.dtype,
+        size: List[Expr],
+        stride: List[Expr],
+        offset: Expr = Integer(0),
+    ):
+        self.device = device
+        self.dtype = dtype
+        self.size = size
+        self._stride = stride
+        self.offset = offset
+
+    @property
+    def stride(self):
+        return self._stride
 
     def __str__(self):
         offset = ""
@@ -1772,6 +1784,15 @@ def __init__(self, target: IRNode):
         )
         self.target = target
 
+    @Layout.stride.getter
+    def stride(self):
+        return self.real_layout().stride
+
+    def real_layout(self):
+        if isinstance(self.target, MutationLayout):
+            return self.target.real_layout()
+        return self.target.data.layout
+
     @classmethod
     def realize_into(cls, src, dst):
         dst.realize()
@@ -2467,6 +2488,16 @@ def require_stride_order(cls, x, order):
                 x.get_layout(), FixedLayout
             ) and x.get_layout().is_stride_ordered(order):
                 return x
+            elif isinstance(x.get_layout(), MutationLayout):
+                if isinstance(x.get_layout().real_layout(), FlexibleLayout):
+                    raise AssertionError(
+                        "the MutationLayout's real layout shouldn't be FlexibleLayout"
+                    )
+                elif isinstance(
+                    x.get_layout().real_layout(), FixedLayout
+                ) and x.get_layout().real_layout().is_stride_ordered(order):
+                    return x
+
         # TODO - Storage to InputBuffer
         if isinstance(x, InputBuffer) and x.get_layout().is_stride_ordered(order):
             return x
@@ -3513,6 +3544,77 @@ def apply_constraint(self):
         self.freeze_layout_with_stride_order(self.layout.preferred_stride_order)
 
 
+class ConvolutionBinaryInplace(ExternKernelAlloc):
+    kernel = "torch.ops.mkldnn._convolution_pointwise_.binary"
+
+    def __init__(
+        self,
+        kernel_layout,
+        inputs_layout,
+        inputs,
+        constant_args=(),
+        kernel="torch.ops.mkldnn._convolution_pointwise_.binary",
+    ):
+        super().__init__(kernel_layout, inputs, constant_args)
+        self.kernel = kernel
+        self.inputs_layout = inputs_layout
+
+    def codegen(self, wrapper):
+        wrapper.writeline(
+            f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})"
+        )
+
+    def get_mutation_names(self):
+        assert isinstance(self.layout, MutationLayout)
+        return (self.layout.target.get_name(),)
+
+    @classmethod
+    def create(
+        cls,
+        x: "TensorBox",
+        other: "TensorBox",
+        weight: "TensorBox",
+        bias: "TensorBox",
+        padding_: List[int],
+        stride_: List[int],
+        dilation_: List[int],
+        groups: int,
+        binary_attr: str,
+        binary_alpha: Optional[float],
+        unary_attr: Optional[str],
+        unary_scalars: Optional[List],
+        unary_algorithm: Optional[str],
+    ):
+        kernel = "torch.ops.mkldnn._convolution_pointwise_.binary"
+        (inputs, constant_args, inputs_layout,) = _prepare_convolution_fusion_create(
+            cls, x, weight, bias, padding_, stride_, dilation_, groups
+        )
+        other = cls.realize_input(other)
+        V.graph.realize_users_of(other.get_name())
+        inputs.insert(1, other)
+        constant_args = constant_args + [
+            binary_attr,
+            binary_alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        ]
+        return ConvolutionBinaryInplace(
+            kernel_layout=MutationLayout(inputs[1]),
+            inputs_layout=inputs_layout,
+            inputs=inputs,
+            constant_args=constant_args,
+            kernel=kernel,
+        )
+
+    def apply_constraint(self):
+        x = self.inputs[0]
+        # FixedLayout of input
+        x = self.require_stride_order(x, self.inputs_layout.preferred_stride_order)
+        self.inputs[0] = x
+        self.freeze_layout_with_stride_order(self.inputs_layout.preferred_stride_order)
+
+
 class LinearUnary(ExternKernelAlloc):
     kernel = "torch.ops.mkldnn._linear_pointwise"
 
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index dedd39cd91c46..9924396075f6c 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -960,6 +960,40 @@ def convolution_binary(
                 )
             )
 
+        @register_lowering(torch.ops.mkldnn._convolution_pointwise_.binary)
+        def convolution_binary_inplace(
+            x: TensorBox,
+            other: TensorBox,
+            weight: TensorBox,
+            bias: TensorBox,
+            padding,
+            stride,
+            dilation,
+            groups,
+            binary_attr,
+            binary_alpha,
+            unary_attr,
+            unary_scalars,
+            unary_algorithm,
+        ):
+            return TensorBox.create(
+                ir.ConvolutionBinaryInplace.create(
+                    x,
+                    other,
+                    weight,
+                    bias,
+                    padding,
+                    stride,
+                    dilation,
+                    groups,
+                    binary_attr,
+                    binary_alpha,
+                    unary_attr,
+                    unary_scalars,
+                    unary_algorithm,
+                )
+            )
+
         @register_lowering(torch.ops.mkldnn._linear_pointwise)
         def linear_unary(
             x: TensorBox, w: TensorBox, b: TensorBox, attr, scalars, algorithm
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index d89ee82674dde..a4a29fb023824 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -196,6 +196,79 @@ def forward(self, input, other):
         return self._conv_forward(input, other, self.weight, self.bias)
 
 
+class ConvBinaryInplace2d(nn.Conv2d):
+    def __init__(
+        self,
+        conv: nn.Module,
+        binary_op_name: str,
+    ):
+        super(ConvBinaryInplace2d, self).__init__(
+            conv.in_channels,
+            conv.out_channels,
+            conv.kernel_size,
+            conv.stride,
+            conv.padding,
+            conv.dilation,
+            conv.groups,
+            conv.bias is not None,
+            conv.padding_mode,
+            conv.weight.device,
+            conv.weight.dtype,
+        )
+        self._update_module_params(conv, binary_op_name)
+
+    def _update_module_params(self, conv, binary_op_name):
+        self.__dict__ = copy.deepcopy(conv.__dict__)
+        self.binary_attr = binary_op_name
+        self.binary_alpha = None
+        self.unary_attr = None
+        self.unary_scalars = []
+        self.unary_algorithm = None
+
+    def _update_unary_params(self, unary):
+        self.attr, self.scalars, self.algorithm = unary_modules_map[unary.__class__](
+            unary
+        )
+
+    def _conv_forward(self, input, other, weight, bias):
+        if self.padding_mode != "zeros":
+            return torch.ops.mkldnn._convolution_pointwise_(
+                F.pad(
+                    input, self._reversed_padding_repeated_twice, mode=self.padding_mode
+                ),
+                other,
+                weight,
+                bias,
+                _pair(0),
+                self.stride,
+                self.dilation,
+                self.groups,
+                self.binary_attr,
+                self.binary_alpha,
+                self.unary_attr,
+                self.unary_scalars,
+                self.unary_algorithm,
+            )
+        return torch.ops.mkldnn._convolution_pointwise_(
+            input,
+            other,
+            weight,
+            bias,
+            self.padding,
+            self.stride,
+            self.dilation,
+            self.groups,
+            self.binary_attr,
+            self.binary_alpha,
+            self.unary_attr,
+            self.unary_scalars,
+            self.unary_algorithm,
+        )
+
+    def forward(self, input, other):
+        return self._conv_forward(input, other, self.weight, self.bias)
+
+
 class LinearUnary(nn.Linear):
     def __init__(
         self,
@@ -263,6 +336,14 @@ def fused_conv_binary_eval(conv: nn.Module, binary_op_name: str):
     )
 
 
+def fused_conv_binary_inplace_eval(conv: nn.Module, binary_op_name: str):
+    assert not (conv.training), "Fusion only for eval!"
+    return ConvBinaryInplace2d(
+        conv,
+        binary_op_name,
+    )
+
+
 def is_bfloat16_module(m):
     weight_is_bf16 = m.weight.dtype == torch.bfloat16
     bias_is_bf16 = m.bias is None or m.bias.dtype == torch.bfloat16
@@ -312,6 +393,25 @@ def check_node_is_binary(node):
     )
 
 
+def check_binary_op_kwargs_is_default(node):
+    # For binary op, we hope the kwargs values are the default value:
+    # torch.sub(add)(input, other, *, alpha=1, out=None).
+    if len(node.args) > 2:
+        return False
+    if len(node.kwargs) > 0:
+        if "out" in node.kwargs and node.kwargs["out"] is not None:
+            return False
+        if "alpha" in node.kwargs and node.kwargs["alpha"] != 1.0:
+            return False
+    return True
+
+
+def check_node_is_add_inplace(node):
+    return (node.op == "call_function" and node.target in [operator.iadd]) or (
+        node.op == "call_method" and node.target in ["add_"]
+    )
+
+
 def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     # make sure the autograd is disabled.
     if torch.is_grad_enabled():
@@ -328,6 +428,7 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     # the binary inputs have same tensor info(device, dtype, and layout).
     ShapeProp(gm).propagate(*example_inputs)
     gm = fuse_unary(gm)
+    gm = fuse_binary_inplace(gm)
     gm = fuse_binary(gm)
 
     return gm
@@ -419,26 +520,31 @@ def replace_and_fuse_for_binary(
     node.replace_all_uses_with(node.args[index_node])
 
 
+def binary_inputs_meta_is_same(binary_node):
+    tensor0_meta = binary_node.args[0].meta.get("tensor_meta")
+    tensor1_meta = binary_node.args[1].meta.get("tensor_meta")
+    if not tensor0_meta or not tensor1_meta:
+        return False
+    if (
+        tensor0_meta.shape != tensor1_meta.shape
+        or tensor0_meta.stride != tensor1_meta.stride
+        or tensor0_meta.dtype != tensor1_meta.dtype
+    ):
+        return False
+
+    return True
+
+
 def fuse_binary(gm: torch.fx.GraphModule):
     modules = dict(gm.named_modules())
     for node in gm.graph.nodes:
-        if check_node_is_binary(node) and (
-            len(node.kwargs) != 2 or node.kwargs["alpha"] == 1.0
-        ):
+        if check_node_is_binary(node) and check_binary_op_kwargs_is_default(node):
             for node_kind, fuse_func in computation_op_binary_op_fusion_map.items():
                 if not isinstance(node.args[0], torch.fx.Node) or not isinstance(
                     node.args[1], torch.fx.Node
                 ):
                     continue
-                tensor0_meta = node.args[0].meta.get("tensor_meta")
-                tensor1_meta = node.args[1].meta.get("tensor_meta")
-                if not tensor0_meta or not tensor1_meta:
-                    continue
-                if (
-                    tensor0_meta.shape != tensor1_meta.shape
-                    or tensor0_meta.stride != tensor1_meta.stride
-                    or tensor0_meta.dtype != tensor1_meta.dtype
-                ):
+                if not binary_inputs_meta_is_same(node):
                     continue
                 attr = binary_attr[node.target]
                 index_list = supported_index_list[attr]
@@ -473,6 +579,46 @@ def fuse_binary(gm: torch.fx.GraphModule):
     return gm
 
 
+def fuse_binary_inplace(gm: torch.fx.GraphModule):
+    modules = dict(gm.named_modules())
+    for node in gm.graph.nodes:
+        if check_node_is_add_inplace(node) and check_binary_op_kwargs_is_default(node):
+            for (
+                node_kind,
+                fuse_func,
+            ) in computation_op_binary_op_fusion_inplace_map.items():
+                if not isinstance(node.args[0], torch.fx.Node) or not isinstance(
+                    node.args[1], torch.fx.Node
+                ):
+                    continue
+                if not binary_inputs_meta_is_same(node):
+                    continue
+                if check_node_kind(node.args[1], modules, node_kind):
+                    if len(node.args[1].users) > 1:
+                        continue
+                    # make sure the output and input are not same tensor.
+                    if node.args[1].args[0] == node.args[0]:
+                        continue
+                    computation_node = modules[node.args[1].target]
+                    replace_and_fuse_for_binary(
+                        computation_node,
+                        node,
+                        fuse_func,
+                        "add",
+                        modules,
+                        1,  # conv module index
+                        0,  # binary op index
+                    )
+                    # Make sure the fused node is post node of node's inputs nodes.
+                    node.append(node.args[1])
+                    gm.graph.erase_node(node)
+                    gm.graph.lint()
+                    break
+
+    gm.recompile()
+    return gm
+
+
 philox_rand_like = _prims._make_prim(
     schema="philox_rand_like(Tensor input, Tensor seed, int offset) -> Tensor",
     return_type=_prims.RETURN_TYPE.NEW,
@@ -629,6 +775,10 @@ def rand_like(x, **kwargs):
 }
 
 
+computation_op_binary_op_fusion_inplace_map = {
+    nn.Conv2d: fused_conv_binary_inplace_eval,
+}
+
 # For add: we support conv/linear + other and other + conv
 # For sub/add_/sub_, we only support conv/linear - other
 # or conv/linear +(-)= other

From 5540571beba92bfd9a3953b2ccbbd64d3a697679 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Mon, 14 Nov 2022 09:50:50 +0000
Subject: [PATCH 0855/1922] Fix typos in messages under aten (#88964)

This PR fixes typos of messages and parms in c++ source files under `aten` directory.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88964
Approved by: https://github.com/lezcano
---
 aten/src/ATen/core/List_test.cpp                       |  4 ++--
 aten/src/ATen/core/class_type.cpp                      |  4 ++--
 aten/src/ATen/cuda/detail/CUDAHooks.cpp                |  2 +-
 aten/src/ATen/cudnn/Descriptors.cpp                    |  2 +-
 aten/src/ATen/native/LinearAlgebra.cpp                 |  2 +-
 aten/src/ATen/native/SpectralOps.cpp                   |  4 ++--
 aten/src/ATen/native/TensorShape.cpp                   |  2 +-
 .../ao_sparse/quantized/cpu/qlinear_deserialize.cpp    |  2 +-
 .../native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp |  2 +-
 aten/src/ATen/native/quantized/cpu/BinaryOps.cpp       |  4 ++--
 aten/src/ATen/native/quantized/cpu/qconv.cpp           |  2 +-
 aten/src/ATen/native/quantized/cpu/qmatmul.cpp         |  4 ++--
 aten/src/ATen/native/quantized/cpu/qmul.cpp            |  4 ++--
 aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp     |  4 ++--
 aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp   |  2 +-
 aten/src/ATen/native/sparse/SparseCsrTensor.cpp        | 10 +++++-----
 aten/src/ATen/native/vulkan/api/Adapter.cpp            |  2 +-
 aten/src/ATen/native/vulkan/ops/Clone.cpp              |  2 +-
 test/test_sparse_csr.py                                |  8 ++++----
 19 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp
index e16e26b6042e3..f37f3c0084932 100644
--- a/aten/src/ATen/core/List_test.cpp
+++ b/aten/src/ATen/core/List_test.cpp
@@ -1118,7 +1118,7 @@ TEST(ListTest, canAccessStringByReference) {
   List<std::string> list({"one", "two"});
   const auto& listRef = list;
   static_assert(std::is_same<decltype(listRef[1]), const std::string&>::value,
-                "const List<std::string> acccess should be by const reference");
+                "const List<std::string> access should be by const reference");
   std::string str = list[1];
   const std::string& strRef = listRef[1];
   EXPECT_EQ("two", str);
@@ -1130,7 +1130,7 @@ TEST(ListTest, canAccessOptionalStringByReference) {
   const auto& listRef = list;
   static_assert(
       std::is_same<decltype(listRef[1]), c10::optional<std::reference_wrapper<const std::string>>>::value,
-      "List<c10::optional<std::string>> acccess should be by const reference");
+      "List<c10::optional<std::string>> access should be by const reference");
   c10::optional<std::string> str1 = list[1];
   c10::optional<std::string> str2 = list[2];
   decltype(auto) strRef1 = listRef[1];
diff --git a/aten/src/ATen/core/class_type.cpp b/aten/src/ATen/core/class_type.cpp
index 9d7b38d4d67b6..2478bde034bc7 100644
--- a/aten/src/ATen/core/class_type.cpp
+++ b/aten/src/ATen/core/class_type.cpp
@@ -86,7 +86,7 @@ std::string ClassType::getForwardPreHookErrorMessage(int pre_hook_idx) const {
   std::string pre_hook_schema =
       pre_hook_name + "(self, input: Tuple[" + input_types + "])";
   std::string return_string =
-      "This error occured while scripting the forward pre-hook '" +
+      "This error occurred while scripting the forward pre-hook '" +
       pre_hook_name + "' on module '" + name()->name() +
       "'. If you did not want to script this pre-hook remove it from the "
       "original NN module before scripting. Pre-hooks for module '" +
@@ -111,7 +111,7 @@ std::string ClassType::getForwardHookErrorMessage(int hook_idx) const {
   std::string hook_schema = hook_name + "(self, input: Tuple[" +
                             input_types + "], output: " + output_types + ")";
   std::string return_string =
-      "This error occured while scripting the forward hook '"
+      "This error occurred while scripting the forward hook '"
       + hook_name + "' on module " + name()->name() +
       ". If you did not want to script this hook remove it from" +
       " the original NN module before scripting. This hook was" +
diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
index b5e685dac65f1..25e4c2b44fa99 100644
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@@ -82,7 +82,7 @@ void CUDAHooks::initCUDA() const {
   at::cuda::detail::init_p2p_access_cache(num_devices);
 
 #if AT_MAGMA_ENABLED()
-  TORCH_INTERNAL_ASSERT(magma_init_fn != nullptr, "Cannot initilaize magma, init routine not set");
+  TORCH_INTERNAL_ASSERT(magma_init_fn != nullptr, "Cannot initialize magma, init routine not set");
   magma_init_fn();
 #endif
 }
diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp
index f954bbf5623ad..0e739a49bb33c 100644
--- a/aten/src/ATen/cudnn/Descriptors.cpp
+++ b/aten/src/ATen/cudnn/Descriptors.cpp
@@ -164,7 +164,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo
       filter_format = CUDNN_TENSOR_NHWC;
       break;
     default:
-      TORCH_INTERNAL_ASSERT(false, "unsurpported memory_format for cuDNN filters");
+      TORCH_INTERNAL_ASSERT(false, "unsupported memory_format for cuDNN filters");
   }
   set(getDataType(t), (int) dim, size, filter_format);
 }
diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
index 8c5a6fc8f1955..c21bc4b475312 100644
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -877,7 +877,7 @@ std::vector<std::vector<int64_t>> matrix_chain_order(TensorList tensors) {
 /**
  * @brief Recursively multiplies the tensors i...j using the given order
  *
- * @param tensors matrices to multiply togther
+ * @param tensors matrices to multiply together
  * @param order optimal chain multiplication order from #matrix_chain_order
  * @param i index of first tensor to be multiplied
  * @param j index of last tensor to be multiplied
diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp
index 0acc3506cf515..e08e17af4d087 100644
--- a/aten/src/ATen/native/SpectralOps.cpp
+++ b/aten/src/ATen/native/SpectralOps.cpp
@@ -1053,13 +1053,13 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional<int64_t> ho
   if (onesided) {
     if (n_fft / 2 + 1 != fft_size) {
       std::ostringstream ss;
-      REPR(ss) << ": expected the frequency dimension (3rd to the last) of the input tensor to match n_fft / 2 + 1 when onsided=True, but got " << fft_size;
+      REPR(ss) << ": expected the frequency dimension (3rd to the last) of the input tensor to match n_fft / 2 + 1 when onesided=True, but got " << fft_size;
       AT_ERROR(ss.str());
     }
   } else {
     if (n_fft != fft_size) {
       std::ostringstream ss;
-      REPR(ss) << ": expected the frequency dimension (3rd to the last) of the input tensor to match n_fft when onsided=False, but got " << fft_size;
+      REPR(ss) << ": expected the frequency dimension (3rd to the last) of the input tensor to match n_fft when onesided=False, but got " << fft_size;
       AT_ERROR(ss.str());
     }
   }
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index ccaf4b464252a..ba6ff27661baf 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -1589,7 +1589,7 @@ Tensor _reshape_copy_symint(const Tensor& self, c10::SymIntArrayRef proposed_sha
   c10::SymDimVector shape = infer_size_dv(proposed_shape, self.sym_numel());
 
   if (self.is_mkldnn()) {
-    TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tesnors");
+    TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tensors");
   }
 
   if (self.is_contiguous()) {
diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp
index c5fa0210cd581..d367dbe011031 100644
--- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp
+++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp
@@ -209,7 +209,7 @@ PackedLinearWeightQnnp::PackedLinearWeightQnnp(
       std::get<serialization_version_index>(serialized);
   TORCH_CHECK(
       serialization_version <= SPARSE_LINEAR_PACKED_PARAM_SERIALIZATION_VERSION,
-      "Attemped to deserialize sparse qlinear packed params with an ",
+      "Attempted to deserialize sparse qlinear packed params with an ",
       "incompatible serialization version (",
       serialization_version,
       " > ",
diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp
index a430e81854519..64cab80790a99 100644
--- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_dynamic.cpp
@@ -45,7 +45,7 @@ at::Tensor PackedLinearWeightQnnp::apply_dynamic_impl<false>(
   const auto cols_input = static_cast<int64_t>(input.size(input.dim() - 1));
   TORCH_CHECK(
       cols_input == input_channels_,
-      "quantized_sparse_lienar: Input tensor's last and weight tensor's"
+      "quantized_sparse_linear: Input tensor's last and weight tensor's"
       " second dimension must match.");
 
   // On empty input, no output data will be generated,
diff --git a/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp b/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp
index 8444f9ca615be..58a7036bdd7e2 100644
--- a/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp
+++ b/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp
@@ -36,10 +36,10 @@ namespace {
 inline void check_inputs(const Tensor& qa, const Tensor& qb) {
   TORCH_CHECK(
       qa.qscheme() == kPerTensorAffine,
-      "Only per tensor quantization is suported in Add.");
+      "Only per tensor quantization is supported in Add.");
   TORCH_CHECK(
       qa.qscheme() == qb.qscheme(),
-      "Both inputs to Add must have the same quantization shceme.");
+      "Both inputs to Add must have the same quantization scheme.");
   TORCH_CHECK(
       qa.scalar_type() == qb.scalar_type(),
       "Add operands should have same data type.");
diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp
index 2cd7cd81b9034..b6fa57b9e3ede 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp
@@ -130,7 +130,7 @@ at::SmallVector<int64_t, kSpatialDim + 2> MakeDeConvOutputShape(
                 ", output padding: ", output_padding[idx],
                 ", dilation: ", dilation[idx])
     TORCH_CHECK(output_shape[idx + 2] < kReasonableMaxDim,
-                "Output dimension is beyound reasonable maximum for ", idx,
+                "Output dimension is beyond reasonable maximum for ", idx,
                 " axis;"
                 " kernel: ", kernel[idx],
                 ", stride: ", stride[idx],
diff --git a/aten/src/ATen/native/quantized/cpu/qmatmul.cpp b/aten/src/ATen/native/quantized/cpu/qmatmul.cpp
index c1e5041a5734c..4da714e0bcf0b 100644
--- a/aten/src/ATen/native/quantized/cpu/qmatmul.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qmatmul.cpp
@@ -21,7 +21,7 @@ inline void check_inputs(const Tensor& qa, const Tensor& qb) {
       "MatMul operands should have same data type.");
   TORCH_CHECK(
       qa.qscheme() == kPerTensorAffine || qa.qscheme() == kPerTensorSymmetric,
-      "Only per-tensor quantization is suported in Matmul.");
+      "Only per-tensor quantization is supported in Matmul.");
   TORCH_CHECK(
       qa.qscheme() == qb.qscheme(),
       "Both inputs to Matmul must have the same quantization scheme.");
@@ -45,7 +45,7 @@ Tensor qmatmul(
       " and ", b_num_dims, " provided)");
   TORCH_CHECK(
       num_dims >= 2,
-      "Quantized Matmul currently only suports operands which are at least 2-dimensional. (",
+      "Quantized Matmul currently only supports operands which are at least 2-dimensional. (",
       num_dims, " provided)");
 
   const int64_t m = qa.size(num_dims - 2);
diff --git a/aten/src/ATen/native/quantized/cpu/qmul.cpp b/aten/src/ATen/native/quantized/cpu/qmul.cpp
index 35d2139c6c142..aa6ad0e724f5b 100644
--- a/aten/src/ATen/native/quantized/cpu/qmul.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qmul.cpp
@@ -40,7 +40,7 @@ inline void check_inputs(const Tensor& qa, const Tensor& qb) {
   TORCH_CHECK(qa.scalar_type() == qb.scalar_type(),
               "Mul operands should have same data type.");
   TORCH_CHECK(qa.qscheme() == qb.qscheme(),
-              "Both inputs to Mul must have the same quantization shceme.");
+              "Both inputs to Mul must have the same quantization scheme.");
 }
 
 // Note: out is assumed to be the same size as self and other.
@@ -314,7 +314,7 @@ class QMulScalarTensor final {
   static Tensor run(Tensor qa, Tensor b) {
     TORCH_CHECK(qa.qscheme() == kPerTensorAffine ||
               qa.qscheme() == kPerTensorSymmetric,
-              "Only per tensor quantization is suported in Mul.");
+              "Only per tensor quantization is supported in Mul.");
     auto qc = at::empty_like(qa, qa.suggest_memory_format());
     return _mul_scalar_out<ReLUFused>(qc, qa, b.item());
   }
diff --git a/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp
index d9abd8bcfc797..fbb46b4b0174c 100644
--- a/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp
+++ b/aten/src/ATen/native/quantized/cudnn/BinaryOps.cpp
@@ -71,10 +71,10 @@ std::unordered_map<CacheKey, cudnn_frontend::ManagedOpaqueDescriptor, at::native
 inline void check_inputs(const Tensor& qa, const Tensor& qb) {
   TORCH_CHECK(
       qa.qscheme() == kPerTensorAffine,
-      "Only per tensor quantization is suported in Add.");
+      "Only per tensor quantization is supported in Add.");
   TORCH_CHECK(
       qa.qscheme() == qb.qscheme(),
-      "Both inputs to Add must have the same quantization shceme.");
+      "Both inputs to Add must have the same quantization scheme.");
   TORCH_CHECK(
       qa.scalar_type() == qb.scalar_type(),
       "Add operands should have same data type.");
diff --git a/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp b/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp
index b2a12832332c3..e214ab6492dfd 100644
--- a/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp
+++ b/aten/src/ATen/native/quantized/cudnn/ConvPrepack.cpp
@@ -33,7 +33,7 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightCudnn<
         int64_t groups,
         bool transpose) {
   // TODO: need to check out to implement groups for conv operator in Conv.cpp
-  TORCH_CHECK(groups == 1, "Quantized cudnn conv2d is currenty limited to groups = 1; received groups =", groups);
+  TORCH_CHECK(groups == 1, "Quantized cudnn conv2d is currently limited to groups = 1; received groups =", groups);
   TORCH_CHECK(weight.qscheme() == c10::kPerTensorAffine, "Unsupported qscheme: ", toString(weight.qscheme()));
   TORCH_CHECK(
       kSpatialDim == 2,  // 1D is packed as 2d, hence we don't need other checks
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
index 2bcbe00a87205..ef205c5673ae8 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
@@ -129,7 +129,7 @@ void _validate_sparse_compressed_tensor_args_worker(const Tensor& compressed_ind
   // 3.1
   TORCH_CHECK(
               static_cast<int>(size.size()) == batch_ndim + base_ndim + dense_ndim,
-              "tensor dimensionality must be sum of batch, base, and dense dimensionalites (=",
+              "tensor dimensionality must be sum of batch, base, and dense dimensionalities (=",
               batch_ndim, " + ", base_ndim, " + ", dense_ndim, ") but got ", size.size());
 
   // For CSR/CSC formats, we define blocksize=(1, 1) so that checking
@@ -380,7 +380,7 @@ DimVector _estimate_sparse_compressed_tensor_size(
   }
   TORCH_CHECK(
               static_cast<int>(size.size()) == batch_ndim + base_ndim + dense_ndim,
-              "tensor dimensionality must be sum of batch, base, and dense dimensionalites (=",
+              "tensor dimensionality must be sum of batch, base, and dense dimensionalities (=",
               batch_ndim, " + ", base_ndim, " + ", dense_ndim, ") but got ", size.size());
   return size;
 }
@@ -559,13 +559,13 @@ Tensor& copy_sparse_compressed_(Tensor& self, const Tensor& src, bool non_blocki
                 "torch.copy_: expected shapes of self and src to match along dimension ",
                 self_compressed_dim, " for ",
                 self.layout(), " layout but the corresponding dimensions of self and src are ",
-                self_compressed_dims, " and ", src_compressed_dims, ", respecitvely.");
+                self_compressed_dims, " and ", src_compressed_dims, ", respectively.");
   } else {
     TORCH_CHECK(self_compressed_dims == src_compressed_dims,
                 "torch.copy_: expected shapes of self and src to match along dimensions ",
                 self_compressed_dim, " and ", src_compressed_dim, ", respectively, for ",
                 self.layout(), " layout but the corresponding dimensions of self and src are ",
-                self_compressed_dims, " and ", src_compressed_dims, ", respecitvely.");
+                self_compressed_dims, " and ", src_compressed_dims, ", respectively.");
   }
   AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "copy_sparse_compressed_",
                                               [&]{},
@@ -576,7 +576,7 @@ Tensor& copy_sparse_compressed_(Tensor& self, const Tensor& src, bool non_blocki
                                                 auto src_blocksize = DimVector(src_values.sizes().slice(src_values.dim()-2, 2));
                                                 TORCH_CHECK(self_blocksize == src_blocksize,
                                                             "torch.copy_: copy of sparse compressed tensors having different block sizes is not supported.",
-                                                            " self and src block sizes are ", self_blocksize, " and ", src_blocksize, ", respectivly.");
+                                                            " self and src block sizes are ", self_blocksize, " and ", src_blocksize, ", respectively.");
                                               });
   AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "copy_sparse_compressed_",
                                             [&]{
diff --git a/aten/src/ATen/native/vulkan/api/Adapter.cpp b/aten/src/ATen/native/vulkan/api/Adapter.cpp
index 311648b6894ed..176236611c1d9 100644
--- a/aten/src/ATen/native/vulkan/api/Adapter.cpp
+++ b/aten/src/ATen/native/vulkan/api/Adapter.cpp
@@ -195,7 +195,7 @@ std::string get_device_type_str(const VkPhysicalDeviceType type) {
     case VK_PHYSICAL_DEVICE_TYPE_CPU:
       return "CPU";
     default:
-      return "UNKOWN";
+      return "UNKNOWN";
   }
 }
 
diff --git a/aten/src/ATen/native/vulkan/ops/Clone.cpp b/aten/src/ATen/native/vulkan/ops/Clone.cpp
index de353a10cb931..2601d785ddb52 100644
--- a/aten/src/ATen/native/vulkan/ops/Clone.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Clone.cpp
@@ -21,7 +21,7 @@ Tensor clone(
   TORCH_CHECK(
       (c10::MemoryFormat::Preserve == memory_format) ||
           (c10::MemoryFormat::Contiguous == memory_format),
-      "Vulkan supports Preserve and Contiguous memory foramts");
+      "Vulkan supports Preserve and Contiguous memory formats");
 
   Tensor self;
   if (memory_format == MemoryFormat::Preserve) {
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index cc5044da0bd58..d2e3c5fc38514 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -710,7 +710,7 @@ def _generate_invalid_input(self, layout, device):
                shape((2, 3)),
                'compressed_indices must have dimensionality >= 1 but got 0')
 
-        yield ('compressed/plain_indices mismatch of dimensionalites',
+        yield ('compressed/plain_indices mismatch of dimensionalities',
                tensor([[0, 2, 4]]),
                tensor([0, 1, 0, 2]),
                values([1, 2, 3, 4]),
@@ -718,14 +718,14 @@ def _generate_invalid_input(self, layout, device):
                'compressed_indices and plain_indices dimensionalities must be equal but got 2 and 1, respectively')
 
         if layout in {torch.sparse_csr, torch.sparse_csc}:
-            yield ('indices and values mismatch of dimensionalites',
+            yield ('indices and values mismatch of dimensionalities',
                    tensor([[0, 2, 4]]),
                    tensor([[0, 1, 0, 2]]),
                    values([1, 2, 3, 4]),
                    shape((2, 3)),
                    r'values must have dimensionality > sum of batch and block dimensionalities \(=1 \+ 0\) but got 1')
         else:
-            yield ('indices and values mismatch of dimensionalites',
+            yield ('indices and values mismatch of dimensionalities',
                    tensor([[0, 2, 4]]),
                    tensor([[0, 1, 0, 2]]),
                    values([1, 2, 3, 4]),
@@ -737,7 +737,7 @@ def _generate_invalid_input(self, layout, device):
                tensor([0, 1, 0, 2]),
                values([1, 2, 3, 4]),
                (2,),
-               r'tensor dimensionality must be sum of batch, base, and dense dimensionalites \(=0 \+ 2 \+ 0\) but got 1')
+               r'tensor dimensionality must be sum of batch, base, and dense dimensionalities \(=0 \+ 2 \+ 0\) but got 1')
 
         yield ('invalid batchsize',
                tensor([[0, 2, 4]]),

From b62c30a1850e87840b62feeb3ac9f5d106158a84 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 14 Nov 2022 10:29:24 +0000
Subject: [PATCH 0856/1922] [xla hash update] update the pinned xla hash
 (#88982)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88982
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/xla.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 957272e8578b8..6e29f8ee3c313 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-08121e41079319cd369f82f523f5a714a0563f9d
+dd9b67ff0d6ba4da6a46ca1b22e35c98dbed0d77

From 8cf7eb81832f487dbe6006282983b6d9f8c5a398 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Sun, 13 Nov 2022 22:09:54 -0500
Subject: [PATCH 0857/1922] TorchDynamo: Add convolution binary+unary fusion
 for cpu in inference mode (#88412)

This PR is about enabling the fusion of **conv+binary+relu**, which will improve the vision model's performance.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88412
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 15 +++++++++++++--
 torch/_inductor/overrides.py        | 23 ++++++++++++++++++++---
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index d331559a3a8b1..bf1b0a9e4b37b 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1449,6 +1449,7 @@ def __init__(
                 dilation,
                 groups,
                 bias,
+                has_relu,
                 **kwargs,
             ):
                 super(M, self).__init__()
@@ -1471,16 +1472,18 @@ def __init__(
                     )
                 )
                 self.binary_fn = binary_fn
+                self.relu = torch.nn.ReLU() if has_relu else torch.nn.Identity()
 
             def forward(self, x):
                 x1 = self.conv1(x)
                 x2 = self.conv2(x)
-                return self.binary_fn(x1, x2)
+                return self.relu(self.binary_fn(x1, x2))
 
         test_memory_format = [torch.contiguous_format, torch.channels_last]
         options = itertools.product(
             binary_list,
             [True, False],
+            [True, False],
             [1, 3],
             [1, 2],
             [1, 4],
@@ -1489,6 +1492,7 @@ def forward(self, x):
 
         for (
             binary_fn,
+            has_relu,
             bias,
             kernel_size,
             dilation,
@@ -1499,7 +1503,14 @@ def forward(self, x):
             iC = 3 * groups
             x_shape = (1, iC, 112, 112)
             mod = M(
-                binary_fn, iC, oC, dilation, groups, bias, kernel_size=kernel_size
+                binary_fn,
+                iC,
+                oC,
+                dilation,
+                groups,
+                bias,
+                has_relu,
+                kernel_size=kernel_size,
             ).eval()
             mod = mod.to(memory_format=memory_format)
             # TODO: add bf16 test
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index a4a29fb023824..8d99107d17c3d 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -157,6 +157,11 @@ def _update_module_params(self, conv, binary_op_name):
         self.unary_scalars = []
         self.unary_algorithm = None
 
+    def _update_unary_params(self, unary):
+        self.unary_attr, self.unary_scalars, self.unary_algorithm = unary_modules_map[
+            unary.__class__
+        ](unary)
+
     def _conv_forward(self, input, other, weight, bias):
         if self.padding_mode != "zeros":
             return torch.ops.mkldnn._convolution_pointwise(
@@ -226,9 +231,9 @@ def _update_module_params(self, conv, binary_op_name):
         self.unary_algorithm = None
 
     def _update_unary_params(self, unary):
-        self.attr, self.scalars, self.algorithm = unary_modules_map[unary.__class__](
-            unary
-        )
+        self.unary_attr, self.unary_scalars, self.unary_algorithm = unary_modules_map[
+            unary.__class__
+        ](unary)
 
     def _conv_forward(self, input, other, weight, bias):
         if self.padding_mode != "zeros":
@@ -344,6 +349,13 @@ def fused_conv_binary_inplace_eval(conv: nn.Module, binary_op_name: str):
     )
 
 
+def fused_binary_unary_eval(conv_binary: nn.Module, unary: nn.Module):
+    assert not (conv_binary.training), "Fusion only for eval!"
+    # reuse origin conv module, and just update its' unary attr.
+    conv_binary._update_unary_params(unary)
+    return conv_binary
+
+
 def is_bfloat16_module(m):
     weight_is_bf16 = m.weight.dtype == torch.bfloat16
     bias_is_bf16 = m.bias is None or m.bias.dtype == torch.bfloat16
@@ -430,6 +442,9 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     gm = fuse_unary(gm)
     gm = fuse_binary_inplace(gm)
     gm = fuse_binary(gm)
+    # why re-run fuse_unary? we want to enable conv+binary+unary fusion,
+    # such as conv+add+relu for vision model.
+    gm = fuse_unary(gm)
 
     return gm
 
@@ -741,6 +756,8 @@ def rand_like(x, **kwargs):
 computation_op_unary_op_fusion_map = {
     nn.Conv2d: fused_conv_unary_eval,
     nn.Linear: fused_linear_unary_eval,
+    ConvBinary2d: fused_binary_unary_eval,
+    ConvBinaryInplace2d: fused_binary_unary_eval,
 }
 
 
From 69ad527569d726d8aeccac2ea14b38f9368f3016 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Sun, 13 Nov 2022 22:33:13 +0000
Subject: [PATCH 0858/1922] Run test_torchinductor_opinfo CPU tests if triton
 not installed (#88934)

These test are not run currently because normal CI workers don't have
triton installed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88934
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py        | 20 ++--------------
 test/inductor/test_torchinductor_opinfo.py | 28 ++++++++++++++--------
 torch/testing/_internal/inductor_utils.py  | 23 ++++++++++++++++++
 3 files changed, 43 insertions(+), 28 deletions(-)
 create mode 100644 torch/testing/_internal/inductor_utils.py

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index bf1b0a9e4b37b..dfce58397c5c4 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -20,7 +20,6 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import (
-    IS_FBCODE,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
     TestCase as TorchTestCase,
@@ -41,7 +40,7 @@
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
     from torch._inductor.sizevars import SizeVarAllocator
-    from torch._inductor.utils import has_torchvision_roi_align, has_triton, timed
+    from torch._inductor.utils import has_torchvision_roi_align, timed
 
     # This will only pass on pytorch builds newer than roughly 5/15/2022
     assert get_decompositions([torch.ops.aten.trace])
@@ -53,25 +52,10 @@
         sys.exit(0)
     raise unittest.SkipTest("requires sympy/functorch/filelock")
 
-HAS_CPU = False
-try:
-    from subprocess import CalledProcessError
-
-    from torch._inductor.codecache import CppCodeCache
-
-    CppCodeCache.load("")
-    HAS_CPU = not IS_FBCODE
-except (
-    CalledProcessError,
-    OSError,
-    torch._inductor.exc.InvalidCxxCompiler,
-    torch._inductor.exc.CppCompileError,
-):
-    pass
+from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 aten = torch.ops.aten
 
-HAS_CUDA = has_triton()
 requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
 
 torch._inductor.config.triton.autotune = False  # too slow
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 3d384efea0aec..36c5aaacd1dde 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -16,20 +16,22 @@
     onlyNativeDeviceTypes,
     OpDTypes,
     ops,
+    skipCPUIf,
+    skipCUDAIf,
 )
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_utils import (
     dtype_abbrs,
     run_tests,
     skipCUDAMemoryLeakCheckIf,
+    skipIfCrossRef,
+    skipIfTorchDynamo,
     suppress_warnings,
-    TEST_WITH_ROCM,
     TestCase,
 )
+from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 try:
-    from torch._inductor.utils import has_triton
-
     try:
         from .test_torchinductor import check_model, check_model_cuda
     except ImportError:
@@ -120,6 +122,7 @@ def process(device_type):
 
 inductor_skips["cpu"] = {
     "linalg.ldl_solve": {b8, f16, f32, f64, i32, i64},  # segfault
+    "linalg.ldl_factor": {f32, f64},  # flaky
     "__rdiv__": {b8, f16, f32, f64, i32, i64},  # flaky
 }
 
@@ -169,6 +172,8 @@ def process(device_type):
     "argwhere": {b8, f16, f32, f64, i32, i64},
     "bernoulli": {f32, f64},
     "bincount": {i32, i64},
+    "cdouble": {b8, f16, f32, f64, i32, i64},
+    "cfloat": {b8, f16, f32, f64, i32, i64},
     "chalf": {b8, f16, f32, f64, i32, i64},
     "cholesky": {f32, f64},
     "combinations": {b8, f16, f32, f64, i32, i64},
@@ -209,11 +214,10 @@ def process(device_type):
     "linalg.lstsq.grad_oriented": {f32, f64},
     "linalg.matrix_rank": {f32, f64},
     "linalg.matrix_rank.hermitian": {f32, f64},
-    "linalg.lu_solve": {f32, f64},
-    "lu_solve": {f32, f64},
-    "lu_unpack": {f32, f64},
+    "linalg.pinv.singular": {f32, f64},
     "logdet": {f32, f64},
     "masked.norm": {f16},
+    "masked.normalize": {f16},
     "masked_fill": {f16},
     "masked_scatter": {f16, f32, f64},
     "masked_select": {b8, f16, f32, f64, i32, i64},
@@ -225,8 +229,8 @@ def process(device_type):
     "nan_to_num": {f16},
     "nanquantile": {f32, f64},
     "nn.functional.avg_pool1d": {i64},
-    "nn.functional.avg_pool2d": {i64},
-    "nn.functional.adaptive_avg_pool2d": {f16},
+    "nn.functional.avg_pool2d": {i64, f64},
+    "nn.functional.adaptive_avg_pool2d": {f16, f64},
     "nn.functional.ctc_loss": {f32, f64},
     "nn.functional.gaussian_nll_loss": {f32, f64},
     "nn.functional.gelu": {f64},
@@ -243,6 +247,7 @@ def process(device_type):
     "quantile": {f32, f64},
     "rand_like": {f16, f32, f64},
     "randint_like": {f16, f32, f64, i32, i64},
+    "randint": {f16, f32, f64, i32, i64},
     "randn_like": {f16, f32, f64},
     "repeat_interleave": {b8, f16, f32, f64, i32, i64},
     "scatter_add": {f16},
@@ -455,6 +460,10 @@ class TestInductorOpInfo(TestCase):
     @skipCUDAMemoryLeakCheckIf(
         True
     )  # inductor kernels failing this test intermittently
+    @skipCUDAIf(not HAS_CUDA, "Skipped! Triton not found")
+    @skipCPUIf(not HAS_CPU, "Skipped! Supported CPU compiler not found")
+    @skipIfTorchDynamo("Test uses dynamo already")
+    @skipIfCrossRef
     @_ops(op_db[START:END])
     @patch("torch._dynamo.config.raise_on_unsafe_aot_autograd", True)
     def test_comprehensive(self, device, dtype, op):
@@ -599,5 +608,4 @@ def fn(*args, **kwargs):
 instantiate_device_type_tests(TestInductorOpInfo, globals())
 
 if __name__ == "__main__":
-    if has_triton() and not TEST_WITH_ROCM:
-        run_tests()
+    run_tests()
diff --git a/torch/testing/_internal/inductor_utils.py b/torch/testing/_internal/inductor_utils.py
new file mode 100644
index 0000000000000..84750a2de3eee
--- /dev/null
+++ b/torch/testing/_internal/inductor_utils.py
@@ -0,0 +1,23 @@
+from subprocess import CalledProcessError
+
+from torch._inductor.codecache import CppCodeCache
+from torch._inductor.utils import has_triton
+from torch.testing._internal.common_utils import (
+    IS_FBCODE,
+    TEST_WITH_ROCM,
+)
+import torch
+
+HAS_CPU = False
+try:
+    CppCodeCache.load("")
+    HAS_CPU = not IS_FBCODE
+except (
+    CalledProcessError,
+    OSError,
+    torch._inductor.exc.InvalidCxxCompiler,
+    torch._inductor.exc.CppCompileError,
+):
+    pass
+
+HAS_CUDA = has_triton() and not TEST_WITH_ROCM

From f48f52b3c836a9448c176383b3babb46a0d2432c Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 14 Nov 2022 12:02:43 +0000
Subject: [PATCH 0859/1922] Revert "Run test_torchinductor_opinfo CPU tests if
 triton not installed (#88934)"

This reverts commit 8371bb8a3dddbead709bc1e9d26715818a34fa8a.

Reverted https://github.com/pytorch/pytorch/pull/88934 on behalf of https://github.com/peterbell10 due to Inductor tests failing on master
---
 test/inductor/test_torchinductor.py        | 20 ++++++++++++++--
 test/inductor/test_torchinductor_opinfo.py | 28 ++++++++--------------
 torch/testing/_internal/inductor_utils.py  | 23 ------------------
 3 files changed, 28 insertions(+), 43 deletions(-)
 delete mode 100644 torch/testing/_internal/inductor_utils.py

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index dfce58397c5c4..bf1b0a9e4b37b 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -20,6 +20,7 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import (
+    IS_FBCODE,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
     TestCase as TorchTestCase,
@@ -40,7 +41,7 @@
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
     from torch._inductor.sizevars import SizeVarAllocator
-    from torch._inductor.utils import has_torchvision_roi_align, timed
+    from torch._inductor.utils import has_torchvision_roi_align, has_triton, timed
 
     # This will only pass on pytorch builds newer than roughly 5/15/2022
     assert get_decompositions([torch.ops.aten.trace])
@@ -52,10 +53,25 @@
         sys.exit(0)
     raise unittest.SkipTest("requires sympy/functorch/filelock")
 
-from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
+HAS_CPU = False
+try:
+    from subprocess import CalledProcessError
+
+    from torch._inductor.codecache import CppCodeCache
+
+    CppCodeCache.load("")
+    HAS_CPU = not IS_FBCODE
+except (
+    CalledProcessError,
+    OSError,
+    torch._inductor.exc.InvalidCxxCompiler,
+    torch._inductor.exc.CppCompileError,
+):
+    pass
 
 aten = torch.ops.aten
 
+HAS_CUDA = has_triton()
 requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
 
 torch._inductor.config.triton.autotune = False  # too slow
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 36c5aaacd1dde..3d384efea0aec 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -16,22 +16,20 @@
     onlyNativeDeviceTypes,
     OpDTypes,
     ops,
-    skipCPUIf,
-    skipCUDAIf,
 )
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_utils import (
     dtype_abbrs,
     run_tests,
     skipCUDAMemoryLeakCheckIf,
-    skipIfCrossRef,
-    skipIfTorchDynamo,
     suppress_warnings,
+    TEST_WITH_ROCM,
     TestCase,
 )
-from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 try:
+    from torch._inductor.utils import has_triton
+
     try:
         from .test_torchinductor import check_model, check_model_cuda
     except ImportError:
@@ -122,7 +120,6 @@ def process(device_type):
 
 inductor_skips["cpu"] = {
     "linalg.ldl_solve": {b8, f16, f32, f64, i32, i64},  # segfault
-    "linalg.ldl_factor": {f32, f64},  # flaky
     "__rdiv__": {b8, f16, f32, f64, i32, i64},  # flaky
 }
 
@@ -172,8 +169,6 @@ def process(device_type):
     "argwhere": {b8, f16, f32, f64, i32, i64},
     "bernoulli": {f32, f64},
     "bincount": {i32, i64},
-    "cdouble": {b8, f16, f32, f64, i32, i64},
-    "cfloat": {b8, f16, f32, f64, i32, i64},
     "chalf": {b8, f16, f32, f64, i32, i64},
     "cholesky": {f32, f64},
     "combinations": {b8, f16, f32, f64, i32, i64},
@@ -214,10 +209,11 @@ def process(device_type):
     "linalg.lstsq.grad_oriented": {f32, f64},
     "linalg.matrix_rank": {f32, f64},
     "linalg.matrix_rank.hermitian": {f32, f64},
-    "linalg.pinv.singular": {f32, f64},
+    "linalg.lu_solve": {f32, f64},
+    "lu_solve": {f32, f64},
+    "lu_unpack": {f32, f64},
     "logdet": {f32, f64},
     "masked.norm": {f16},
-    "masked.normalize": {f16},
     "masked_fill": {f16},
     "masked_scatter": {f16, f32, f64},
     "masked_select": {b8, f16, f32, f64, i32, i64},
@@ -229,8 +225,8 @@ def process(device_type):
     "nan_to_num": {f16},
     "nanquantile": {f32, f64},
     "nn.functional.avg_pool1d": {i64},
-    "nn.functional.avg_pool2d": {i64, f64},
-    "nn.functional.adaptive_avg_pool2d": {f16, f64},
+    "nn.functional.avg_pool2d": {i64},
+    "nn.functional.adaptive_avg_pool2d": {f16},
     "nn.functional.ctc_loss": {f32, f64},
     "nn.functional.gaussian_nll_loss": {f32, f64},
     "nn.functional.gelu": {f64},
@@ -247,7 +243,6 @@ def process(device_type):
     "quantile": {f32, f64},
     "rand_like": {f16, f32, f64},
     "randint_like": {f16, f32, f64, i32, i64},
-    "randint": {f16, f32, f64, i32, i64},
     "randn_like": {f16, f32, f64},
     "repeat_interleave": {b8, f16, f32, f64, i32, i64},
     "scatter_add": {f16},
@@ -460,10 +455,6 @@ class TestInductorOpInfo(TestCase):
     @skipCUDAMemoryLeakCheckIf(
         True
     )  # inductor kernels failing this test intermittently
-    @skipCUDAIf(not HAS_CUDA, "Skipped! Triton not found")
-    @skipCPUIf(not HAS_CPU, "Skipped! Supported CPU compiler not found")
-    @skipIfTorchDynamo("Test uses dynamo already")
-    @skipIfCrossRef
     @_ops(op_db[START:END])
     @patch("torch._dynamo.config.raise_on_unsafe_aot_autograd", True)
     def test_comprehensive(self, device, dtype, op):
@@ -608,4 +599,5 @@ def fn(*args, **kwargs):
 instantiate_device_type_tests(TestInductorOpInfo, globals())
 
 if __name__ == "__main__":
-    run_tests()
+    if has_triton() and not TEST_WITH_ROCM:
+        run_tests()
diff --git a/torch/testing/_internal/inductor_utils.py b/torch/testing/_internal/inductor_utils.py
deleted file mode 100644
index 84750a2de3eee..0000000000000
--- a/torch/testing/_internal/inductor_utils.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from subprocess import CalledProcessError
-
-from torch._inductor.codecache import CppCodeCache
-from torch._inductor.utils import has_triton
-from torch.testing._internal.common_utils import (
-    IS_FBCODE,
-    TEST_WITH_ROCM,
-)
-import torch
-
-HAS_CPU = False
-try:
-    CppCodeCache.load("")
-    HAS_CPU = not IS_FBCODE
-except (
-    CalledProcessError,
-    OSError,
-    torch._inductor.exc.InvalidCxxCompiler,
-    torch._inductor.exc.CppCompileError,
-):
-    pass
-
-HAS_CUDA = has_triton() and not TEST_WITH_ROCM

From 4fa3c931845bd20aeffde5e7238d24f2dfbae77c Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Sun, 13 Nov 2022 22:09:56 -0500
Subject: [PATCH 0860/1922] Fake Tensor For (ConvFusion) Propagation (#88414)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88414
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 torch/_inductor/ir.py | 62 +++++++++++++++----------------------------
 1 file changed, 21 insertions(+), 41 deletions(-)

diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index ffb935ae440db..8a2e26ee9b94c 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3130,7 +3130,7 @@ def create(
             sympy.Integer(V.graph.sizevars.guard_static_shape(s))
             for s in weight.get_size()
         ]
-        _, _, *kernel_size = weight.get_size()
+        _, _, *kernel_size = weight_shape
 
         # choose runtime kernel
         config_conv = config.triton.convolution
@@ -3355,50 +3355,28 @@ def _prepare_convolution_fusion_create(
     padding = tuple(padding_)
     dilation = tuple(dilation_)
     assert isinstance(groups, int)
-
+    with FakeTensorMode():
+        output, *_ = cls.process_kernel(
+            torch.ops.aten.convolution,
+            x,
+            weight,
+            bias,
+            stride,
+            padding,
+            dilation,
+            False,
+            [0, 0],
+            groups,
+        )
+
+    output_size = output.shape
     weight_shape = [
         sympy.Integer(V.graph.sizevars.guard_static_shape(s)) for s in weight.get_size()
     ]
-
-    out_channels, in_channels1, *kernel_size = weight_shape
-    in_channels1 = in_channels1 * groups
-    assert len(x.get_size()) == 2 + len(kernel_size)
-    batch, in_channels2, *input_size = x.get_size()
-    output_size = [batch]
-    V.graph.sizevars.guard_equals(in_channels1, in_channels2)
-
-    output_size.append(out_channels)
-    assert (
-        len(stride)
-        == len(padding)
-        == len(dilation)
-        == len(kernel_size)
-        == len(input_size)
+    _, _, *kernel_size = weight_shape
+    output_layout_str = (
+        "torch.contiguous_format" if output.is_contiguous() else "torch.channels_last"
     )
-    for i in range(len(stride)):
-        output_size.append(
-            IndexingDiv(
-                input_size[i]
-                + 2 * padding[i]
-                - dilation[i] * (kernel_size[i] - 1)
-                - 1
-                + stride[i],
-                stride[i],
-            )
-        )
-        output_size[-1] = sympy.Integer(
-            V.graph.sizevars.guard_static_shape(output_size[-1])
-        )
-
-    output_layout_str = "torch.contiguous_format"
-    # If x or weight have one channels_last(2d or 3d) format, it will call channels_last path,
-    # which align with aten.convolutuion path(cpu only support 2d case now).
-    # TODO: after cpu 3d convolution support channels_last path, the size check can be removed.
-    if len(x.get_size()) == 4 and (
-        x.get_layout().is_channels_last_stride_ordered()
-        or weight.get_layout().is_channels_last_stride_ordered()
-    ):
-        output_layout_str = "torch.channels_last"
 
     if output_layout_str == "torch.channels_last":
         stride_order = [0] + list(reversed(range(1, len(kernel_size) + 1)))
@@ -3440,6 +3418,8 @@ def codegen(self, wrapper):
         wrapper.writeline(
             f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})"
         )
+        if isinstance(self.layout, Layout):
+            self.codegen_size_asserts(wrapper)
 
     @classmethod
     def create(

From b5f28659bae98794762f2d42557a5bd13e141fe6 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Sun, 13 Nov 2022 22:09:58 -0500
Subject: [PATCH 0861/1922] TorchDynamo: skip convolution fusion when
 convolution's padding is string (#88794)

Currently,  the fusion convolution doesn't support the case when padding is a string, we will support it at the next step.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88794
Approved by: https://github.com/jansel, https://github.com/jgong5
---
 test/inductor/test_torchinductor.py |  9 +++++++++
 torch/_inductor/overrides.py        | 16 +++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index bf1b0a9e4b37b..8c74b1090a236 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1400,6 +1400,7 @@ def test_conv2d_unary(self):
             [1, 3],
             [1, 2],
             [1, 4],
+            ["same", 0],
             test_memory_format,
         )
 
@@ -1409,6 +1410,7 @@ def test_conv2d_unary(self):
             kernel_size,
             dilation,
             groups,
+            padding,
             memory_format,
         ) in options:
             oC = 32 * groups
@@ -1419,6 +1421,7 @@ def test_conv2d_unary(self):
                     iC,
                     oC,
                     kernel_size=kernel_size,
+                    padding=padding,
                     dilation=dilation,
                     groups=groups,
                     bias=bias,
@@ -1448,6 +1451,7 @@ def __init__(
                 out_channels,
                 dilation,
                 groups,
+                padding,
                 bias,
                 has_relu,
                 **kwargs,
@@ -1458,6 +1462,7 @@ def __init__(
                     out_channels,
                     dilation=dilation,
                     groups=groups,
+                    padding=padding,
                     bias=bias,
                     **kwargs,
                 )
@@ -1467,6 +1472,7 @@ def __init__(
                         out_channels,
                         dilation=dilation,
                         groups=groups,
+                        padding=padding,
                         bias=bias,
                         **kwargs,
                     )
@@ -1487,6 +1493,7 @@ def forward(self, x):
             [1, 3],
             [1, 2],
             [1, 4],
+            ["same", 0],
             test_memory_format,
         )
 
@@ -1497,6 +1504,7 @@ def forward(self, x):
             kernel_size,
             dilation,
             groups,
+            padding,
             memory_format,
         ) in options:
             oC = 32 * groups
@@ -1508,6 +1516,7 @@ def forward(self, x):
                 oC,
                 dilation,
                 groups,
+                padding,
                 bias,
                 has_relu,
                 kernel_size=kernel_size,
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 8d99107d17c3d..3a95aa7ce8807 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -499,7 +499,11 @@ def fuse_unary(gm: torch.fx.GraphModule):
                 eval_mode = all(not n.training for n in [computation_node, unary_node])
                 if not eval_mode:
                     continue
-
+                # TODO: support padding str input("valid", "same").
+                if type(computation_node) in [nn.Conv2d] and isinstance(
+                    computation_node.padding, str
+                ):
+                    continue
                 # only fuse for linear when the dtype is bf16
                 if type(computation_node) in [nn.Linear] and not is_bfloat16_module(
                     computation_node
@@ -570,6 +574,11 @@ def fuse_binary(gm: torch.fx.GraphModule):
                         if len(node.args[index_node].users) > 1:
                             continue
                         computation_node = modules[node.args[index_node].target]
+                        # TODO: support padding str input("valid", "same").
+                        if type(computation_node) in [nn.Conv2d] and isinstance(
+                            computation_node.padding, str
+                        ):
+                            continue
                         # only fuse for linear when the dtype is bf16
                         if type(computation_node) in [
                             nn.Linear
@@ -615,6 +624,11 @@ def fuse_binary_inplace(gm: torch.fx.GraphModule):
                     if node.args[1].args[0] == node.args[0]:
                         continue
                     computation_node = modules[node.args[1].target]
+                    # TODO: support padding str input("valid", "same").
+                    if type(computation_node) in [nn.Conv2d] and isinstance(
+                        computation_node.padding, str
+                    ):
+                        continue
                     replace_and_fuse_for_binary(
                         computation_node,
                         node,

From de11d206e2e8d4ee1dc8fba496ed2766fd10bc04 Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Fri, 11 Nov 2022 14:13:01 -0800
Subject: [PATCH 0862/1922] reland "Do not use unsafe restriding for subclasses
 (#87610)" (#88343)

This reverts commit 5b75b19f51837e162cc0e5e5757dfd9bef437c67.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88343
Approved by: https://github.com/ezyang
---
 .../ATen/functorch/BatchRulesScatterOps.cpp   |  5 ++
 aten/src/ATen/native/TensorShape.cpp          |  3 +-
 test/functorch/test_aotdispatch.py            |  2 -
 test/functorch/test_eager_transforms.py       | 10 ++-
 test/test_functionalization.py                | 76 +++++++++----------
 5 files changed, 52 insertions(+), 44 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
index 5eecbedd93e7b..fc51e9d744099 100644
--- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
@@ -928,6 +928,11 @@ Tensor index_copy_decomp(
   return at::scatter(self, dim, index_, source);  ;
 }
 
+// Note [Fix vmap slice_scatter]
+// registers a decomposition for `slice_scatter` that calls into `slice.src`
+// *_scatter operators have some special semantics though, that we can't easily
+// through a decomposition: slice_scatter's output needs to have the same
+// size, size, strides and storage_offset as the input.
 Tensor slice_scatter_decomp(const Tensor &self, const Tensor &src,
                             int64_t dim, c10::optional<int64_t> start,
                             c10::optional<int64_t> end, int64_t step)
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index ba6ff27661baf..c44f3a921afc1 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -3,6 +3,7 @@
 #include <ATen/core/DimVector.h>
 #include <ATen/core/functional.h>
 #include <ATen/core/IListRef.h>
+#include <ATen/TensorSubclassLikeUtils.h>
 #include <ATen/AccumulateType.h>
 #include <ATen/Dispatch.h>
 #include <ATen/ExpandUtils.h>
@@ -1573,7 +1574,7 @@ Tensor reshape_symint(const Tensor& self, c10::SymIntArrayRef proposed_shape) {
     //
     // We need to do the checks here instead of in `native_functions.yaml`
     // to preserve backwards compatibility.
-    if (!self.is_xla() && !self.is_lazy() && !self.is_ipu()) {
+    if (!self.is_xla() && !self.is_lazy() && !self.is_ipu() && !at::isTensorSubclassLike(self)) {
       return self._reshape_alias_symint(shape, stride.value());
     } else {
       return self.view_symint(shape);
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index ea00842a4e005..e31ac58039ec8 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1098,8 +1098,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('masked_fill', ''),  # could not find kernel
     xfail('masked.logaddexp', ''),  # aten.logaddexp.default - couldn't find symbolic meta function/decomposi...
     xfail('masked.logsumexp', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    # Seems flaky: https://github.com/pytorch/pytorch/issues/88883
-    skip('masked.median', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked.prod', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_scatter', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decompos...
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index 26b64c5e70cca..ff69ed9df6e63 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -3130,13 +3130,16 @@ def normalize_devices(fx_g):
     return fx_g
 
 class TestFunctionalize(TestCase):
-    def _check_functionalize_correctness(self, f, inpt):
+    def _check_functionalize_correctness(self, f, inpt, *, skip_vmap=False):
         inpt1 = inpt.clone()
         inpt2 = inpt.clone()
         inpt3 = inpt.clone()
 
         expected_outputs = f(inpt1)
-        actual_outputs = vmap(functionalize(f))(inpt2.unsqueeze(0))[0].squeeze()
+        if skip_vmap:
+            actual_outputs = functionalize(f)(inpt2)
+        else:
+            actual_outputs = vmap(functionalize(f))(inpt2.unsqueeze(0))[0].squeeze()
         # Right now the flavor of functionalize that also removes view ops
         # isn't being used with vmap
         # That's because {view}_copy ops don't have batching rules yet
@@ -3206,7 +3209,8 @@ def f(x: torch.Tensor) -> torch.Tensor:
             z2, z3 = z1.split(2)
             z2.add_(tmp)
             return x
-        self._check_functionalize_correctness(f, torch.zeros(4, 2, device=device))
+        # See Note [Fix vmap slice_scatter]
+        self._check_functionalize_correctness(f, torch.zeros(4, 2, device=device), skip_vmap=True)
 
     # Ensure functionalize works with List[Optional[Tensor]] arguments.
     # See the fix / discussion at https://github.com/pytorch/pytorch/pull/76085
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index c6c3d991771ba..c5330664d1e83 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -147,17 +147,17 @@ def forward(self, a_1):
     sum_1 = torch.ops.aten.sum.default(relu)
     ones_like = torch.ops.aten.ones_like.default(sum_1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False, memory_format = torch.preserve_format);  sum_1 = None
     expand_copy = torch.ops.aten.expand_copy.default(ones_like, [16, 64, 128, 128]);  ones_like = None
-    _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(expand_copy, [1, 1024, 128, 128], [16777216, 16384, 128, 1]);  expand_copy = None
-    new_empty_strided = torch.ops.aten.new_empty_strided.default(_reshape_alias_copy, [1, 1024, 128, 128], [16777216, 16384, 128, 1])
-    view_copy_3 = torch.ops.aten.view_copy.default(_reshape_alias_copy, [16, 64, 128, 128])
-    view_copy_4 = torch.ops.aten.view_copy.default(_reshape_alias_copy, [16, 64, 128, 128])
-    clone_1 = torch.ops.aten.clone.default(view_copy_4, memory_format = torch.contiguous_format);  view_copy_4 = None
+    view_copy_3 = torch.ops.aten.view_copy.default(expand_copy, [1, 1024, 128, 128]);  expand_copy = None
+    new_empty_strided = torch.ops.aten.new_empty_strided.default(view_copy_3, [1, 1024, 128, 128], [16777216, 16384, 128, 1])
+    view_copy_4 = torch.ops.aten.view_copy.default(view_copy_3, [16, 64, 128, 128])
+    view_copy_5 = torch.ops.aten.view_copy.default(view_copy_3, [16, 64, 128, 128])
+    clone_1 = torch.ops.aten.clone.default(view_copy_5, memory_format = torch.contiguous_format);  view_copy_5 = None
     threshold_backward = torch.ops.aten.threshold_backward.default(clone_1, relu, 0);  clone_1 = relu = None
-    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(_reshape_alias_copy, [16, 64, 128, 128], [1048576, 16384, 128, 1]);  _reshape_alias_copy = None
-    detach_copy = torch.ops.aten.detach_copy.default(_reshape_alias_copy_1);  _reshape_alias_copy_1 = None
-    view_copy_5 = torch.ops.aten.view_copy.default(threshold_backward, [1, 1024, 128, 128]);  threshold_backward = None
-    _reshape_alias_copy_2 = torch.ops.aten._reshape_alias_copy.default(view_copy_5, [16, 64, 128, 128], [1048576, 16384, 128, 1]);  view_copy_5 = None
-    detach_copy_1 = torch.ops.aten.detach_copy.default(_reshape_alias_copy_2);  _reshape_alias_copy_2 = None
+    view_copy_6 = torch.ops.aten.view_copy.default(view_copy_3, [16, 64, 128, 128]);  view_copy_3 = None
+    detach_copy = torch.ops.aten.detach_copy.default(view_copy_6);  view_copy_6 = None
+    view_copy_7 = torch.ops.aten.view_copy.default(threshold_backward, [1, 1024, 128, 128]);  threshold_backward = None
+    view_copy_8 = torch.ops.aten.view_copy.default(view_copy_7, [16, 64, 128, 128]);  view_copy_7 = None
+    detach_copy_1 = torch.ops.aten.detach_copy.default(view_copy_8);  view_copy_8 = None
     return detach_copy_1
     """)  # noqa: B950
 
@@ -710,40 +710,40 @@ def forward(self, a_1):
     ones = torch.ops.aten.ones.default([2, 2], device = device(type='cpu'), pin_memory = False)
     add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
     view_copy = torch.ops.aten.view_copy.default(add, [8])
-    _reshape_alias_copy = torch.ops.aten._reshape_alias_copy.default(view_copy, [2, 4], [4, 1]);  view_copy = None
-    transpose_copy = torch.ops.aten.transpose_copy.int(_reshape_alias_copy, 1, 0)
+    view_copy_1 = torch.ops.aten.view_copy.default(view_copy, [2, 4]);  view_copy = None
+    transpose_copy = torch.ops.aten.transpose_copy.int(view_copy_1, 1, 0)
     unsqueeze_copy = torch.ops.aten.unsqueeze_copy.default(transpose_copy, 0);  transpose_copy = None
     squeeze_copy = torch.ops.aten.squeeze_copy.default(unsqueeze_copy);  unsqueeze_copy = None
     split_copy = torch.ops.aten.split_copy.Tensor(squeeze_copy, 2);  squeeze_copy = None
     getitem = split_copy[0]
     getitem_1 = split_copy[1];  split_copy = None
     add_1 = torch.ops.aten.add.Tensor(getitem, ones);  getitem = ones = None
-    select_copy = torch.ops.aten.select_copy.int(_reshape_alias_copy, 0, 0);  _reshape_alias_copy = None
-    _reshape_alias_copy_1 = torch.ops.aten._reshape_alias_copy.default(add_1, [4], [1])
-    view_copy_1 = torch.ops.aten.view_copy.default(add, [8]);  add = None
-    _reshape_alias_copy_2 = torch.ops.aten._reshape_alias_copy.default(view_copy_1, [2, 4], [4, 1]);  view_copy_1 = None
-    transpose_copy_1 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_2, 1, 0);  _reshape_alias_copy_2 = None
+    select_copy = torch.ops.aten.select_copy.int(view_copy_1, 0, 0);  view_copy_1 = None
+    view_copy_2 = torch.ops.aten.view_copy.default(add_1, [4])
+    view_copy_3 = torch.ops.aten.view_copy.default(add, [8]);  add = None
+    view_copy_4 = torch.ops.aten.view_copy.default(view_copy_3, [2, 4]);  view_copy_3 = None
+    transpose_copy_1 = torch.ops.aten.transpose_copy.int(view_copy_4, 1, 0);  view_copy_4 = None
     unsqueeze_copy_1 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_1, 0);  transpose_copy_1 = None
     squeeze_copy_1 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_1);  unsqueeze_copy_1 = None
     slice_scatter = torch.ops.aten.slice_scatter.default(squeeze_copy_1, add_1, 0, 0, 2);  squeeze_copy_1 = None
     unsqueeze_copy_2 = torch.ops.aten.unsqueeze_copy.default(slice_scatter, 0);  slice_scatter = None
     squeeze_copy_2 = torch.ops.aten.squeeze_copy.dim(unsqueeze_copy_2, 0);  unsqueeze_copy_2 = None
     transpose_copy_2 = torch.ops.aten.transpose_copy.int(squeeze_copy_2, 1, 0);  squeeze_copy_2 = None
-    _reshape_alias_copy_3 = torch.ops.aten._reshape_alias_copy.default(transpose_copy_2, [8], [1]);  transpose_copy_2 = None
-    view_copy_2 = torch.ops.aten.view_copy.default(_reshape_alias_copy_3, [4, 2]);  _reshape_alias_copy_3 = None
-    view_copy_3 = torch.ops.aten.view_copy.default(view_copy_2, [8])
-    _reshape_alias_copy_4 = torch.ops.aten._reshape_alias_copy.default(view_copy_3, [2, 4], [4, 1]);  view_copy_3 = None
-    select_copy_1 = torch.ops.aten.select_copy.int(_reshape_alias_copy_4, 0, 0);  _reshape_alias_copy_4 = None
-    view_copy_4 = torch.ops.aten.view_copy.default(view_copy_2, [8]);  view_copy_2 = None
-    _reshape_alias_copy_5 = torch.ops.aten._reshape_alias_copy.default(view_copy_4, [2, 4], [4, 1]);  view_copy_4 = None
-    transpose_copy_3 = torch.ops.aten.transpose_copy.int(_reshape_alias_copy_5, 1, 0);  _reshape_alias_copy_5 = None
+    view_copy_5 = torch.ops.aten.view_copy.default(transpose_copy_2, [8]);  transpose_copy_2 = None
+    view_copy_6 = torch.ops.aten.view_copy.default(view_copy_5, [4, 2]);  view_copy_5 = None
+    view_copy_7 = torch.ops.aten.view_copy.default(view_copy_6, [8])
+    view_copy_8 = torch.ops.aten.view_copy.default(view_copy_7, [2, 4]);  view_copy_7 = None
+    select_copy_1 = torch.ops.aten.select_copy.int(view_copy_8, 0, 0);  view_copy_8 = None
+    view_copy_9 = torch.ops.aten.view_copy.default(view_copy_6, [8]);  view_copy_6 = None
+    view_copy_10 = torch.ops.aten.view_copy.default(view_copy_9, [2, 4]);  view_copy_9 = None
+    transpose_copy_3 = torch.ops.aten.transpose_copy.int(view_copy_10, 1, 0);  view_copy_10 = None
     unsqueeze_copy_3 = torch.ops.aten.unsqueeze_copy.default(transpose_copy_3, 0);  transpose_copy_3 = None
     squeeze_copy_3 = torch.ops.aten.squeeze_copy.default(unsqueeze_copy_3);  unsqueeze_copy_3 = None
     split_copy_1 = torch.ops.aten.split_copy.Tensor(squeeze_copy_3, 2);  squeeze_copy_3 = None
     getitem_2 = split_copy_1[0]
     getitem_3 = split_copy_1[1];  split_copy_1 = None
-    _reshape_alias_copy_6 = torch.ops.aten._reshape_alias_copy.default(getitem_2, [4], [1]);  getitem_2 = None
-    add_2 = torch.ops.aten.add.Tensor(select_copy_1, _reshape_alias_copy_6);  select_copy_1 = _reshape_alias_copy_6 = None
+    view_copy_11 = torch.ops.aten.view_copy.default(getitem_2, [4]);  getitem_2 = None
+    add_2 = torch.ops.aten.add.Tensor(select_copy_1, view_copy_11);  select_copy_1 = view_copy_11 = None
     return add_1
     """)  # noqa: B950
 
@@ -756,30 +756,30 @@ def forward(self, a_1):
     ones = torch.ops.aten.ones.default([2, 2], device = device(type='cpu'), pin_memory = False)
     add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
     view = torch.ops.aten.view.default(add, [8])
-    _reshape_alias = torch.ops.aten._reshape_alias.default(view, [2, 4], [4, 1]);  view = None
-    transpose = torch.ops.aten.transpose.int(_reshape_alias, 1, 0)
+    view_1 = torch.ops.aten.view.default(view, [2, 4]);  view = None
+    transpose = torch.ops.aten.transpose.int(view_1, 1, 0)
     unsqueeze = torch.ops.aten.unsqueeze.default(transpose, 0);  transpose = None
     squeeze = torch.ops.aten.squeeze.default(unsqueeze);  unsqueeze = None
     split = torch.ops.aten.split.Tensor(squeeze, 2);  squeeze = None
     getitem = split[0]
     getitem_1 = split[1];  split = None
     add_1 = torch.ops.aten.add_.Tensor(getitem, ones);  ones = None
-    select = torch.ops.aten.select.int(_reshape_alias, 0, 0);  _reshape_alias = None
+    select = torch.ops.aten.select.int(view_1, 0, 0);  view_1 = None
     clone = torch.ops.aten.clone.default(getitem, memory_format = torch.contiguous_format)
     _unsafe_view = torch.ops.aten._unsafe_view.default(clone, [4]);  clone = None
-    view_1 = torch.ops.aten.view.default(add, [8]);  add = None
-    _reshape_alias_1 = torch.ops.aten._reshape_alias.default(view_1, [2, 4], [4, 1]);  view_1 = None
-    transpose_1 = torch.ops.aten.transpose.int(_reshape_alias_1, 1, 0);  _reshape_alias_1 = None
+    view_2 = torch.ops.aten.view.default(add, [8]);  add = None
+    view_3 = torch.ops.aten.view.default(view_2, [2, 4]);  view_2 = None
+    transpose_1 = torch.ops.aten.transpose.int(view_3, 1, 0);  view_3 = None
     unsqueeze_1 = torch.ops.aten.unsqueeze.default(transpose_1, 0);  transpose_1 = None
     squeeze_1 = torch.ops.aten.squeeze.default(unsqueeze_1);  unsqueeze_1 = None
     unsqueeze_2 = torch.ops.aten.unsqueeze.default(squeeze_1, 0);  squeeze_1 = None
     squeeze_2 = torch.ops.aten.squeeze.dim(unsqueeze_2, 0);  unsqueeze_2 = None
     transpose_2 = torch.ops.aten.transpose.int(squeeze_2, 1, 0);  squeeze_2 = None
-    _reshape_alias_2 = torch.ops.aten._reshape_alias.default(transpose_2, [8], [1]);  transpose_2 = None
-    view_2 = torch.ops.aten.view.default(_reshape_alias_2, [4, 2]);  _reshape_alias_2 = None
-    view_3 = torch.ops.aten.view.default(view_2, [8]);  view_2 = None
-    _reshape_alias_3 = torch.ops.aten._reshape_alias.default(view_3, [2, 4], [4, 1]);  view_3 = None
-    select_1 = torch.ops.aten.select.int(_reshape_alias_3, 0, 0);  _reshape_alias_3 = None
+    view_4 = torch.ops.aten.view.default(transpose_2, [8]);  transpose_2 = None
+    view_5 = torch.ops.aten.view.default(view_4, [4, 2]);  view_4 = None
+    view_6 = torch.ops.aten.view.default(view_5, [8]);  view_5 = None
+    view_7 = torch.ops.aten.view.default(view_6, [2, 4]);  view_6 = None
+    select_1 = torch.ops.aten.select.int(view_7, 0, 0);  view_7 = None
     add_2 = torch.ops.aten.add.Tensor(select_1, _unsafe_view);  select_1 = _unsafe_view = None
     return getitem
     """)

From bb20bf83a2d23d296e37b9886670caf2057495c8 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 14 Nov 2022 12:36:44 +0000
Subject: [PATCH 0863/1922] Run test_torchinductor_opinfo CPU tests if triton
 not installed (#88934)

These test are not run currently because normal CI workers don't have
triton installed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88934
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py        | 20 ++-------------
 test/inductor/test_torchinductor_opinfo.py | 30 ++++++++++++++--------
 torch/testing/_internal/inductor_utils.py  | 23 +++++++++++++++++
 3 files changed, 44 insertions(+), 29 deletions(-)
 create mode 100644 torch/testing/_internal/inductor_utils.py

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 8c74b1090a236..ba1f9032d97f0 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -20,7 +20,6 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import (
-    IS_FBCODE,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
     TestCase as TorchTestCase,
@@ -41,7 +40,7 @@
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
     from torch._inductor.sizevars import SizeVarAllocator
-    from torch._inductor.utils import has_torchvision_roi_align, has_triton, timed
+    from torch._inductor.utils import has_torchvision_roi_align, timed
 
     # This will only pass on pytorch builds newer than roughly 5/15/2022
     assert get_decompositions([torch.ops.aten.trace])
@@ -53,25 +52,10 @@
         sys.exit(0)
     raise unittest.SkipTest("requires sympy/functorch/filelock")
 
-HAS_CPU = False
-try:
-    from subprocess import CalledProcessError
-
-    from torch._inductor.codecache import CppCodeCache
-
-    CppCodeCache.load("")
-    HAS_CPU = not IS_FBCODE
-except (
-    CalledProcessError,
-    OSError,
-    torch._inductor.exc.InvalidCxxCompiler,
-    torch._inductor.exc.CppCompileError,
-):
-    pass
+from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 aten = torch.ops.aten
 
-HAS_CUDA = has_triton()
 requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
 
 torch._inductor.config.triton.autotune = False  # too slow
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 3d384efea0aec..3880b87c082cb 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -16,20 +16,22 @@
     onlyNativeDeviceTypes,
     OpDTypes,
     ops,
+    skipCPUIf,
+    skipCUDAIf,
 )
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_utils import (
     dtype_abbrs,
     run_tests,
     skipCUDAMemoryLeakCheckIf,
+    skipIfCrossRef,
+    skipIfTorchDynamo,
     suppress_warnings,
-    TEST_WITH_ROCM,
     TestCase,
 )
+from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 try:
-    from torch._inductor.utils import has_triton
-
     try:
         from .test_torchinductor import check_model, check_model_cuda
     except ImportError:
@@ -120,6 +122,7 @@ def process(device_type):
 
 inductor_skips["cpu"] = {
     "linalg.ldl_solve": {b8, f16, f32, f64, i32, i64},  # segfault
+    "linalg.ldl_factor": {f32, f64},  # flaky
     "__rdiv__": {b8, f16, f32, f64, i32, i64},  # flaky
 }
 
@@ -169,6 +172,9 @@ def process(device_type):
     "argwhere": {b8, f16, f32, f64, i32, i64},
     "bernoulli": {f32, f64},
     "bincount": {i32, i64},
+    "bucketize": {b8, f16, f32, f64, i32, i64},
+    "cdouble": {b8, f16, f32, f64, i32, i64},
+    "cfloat": {b8, f16, f32, f64, i32, i64},
     "chalf": {b8, f16, f32, f64, i32, i64},
     "cholesky": {f32, f64},
     "combinations": {b8, f16, f32, f64, i32, i64},
@@ -209,11 +215,10 @@ def process(device_type):
     "linalg.lstsq.grad_oriented": {f32, f64},
     "linalg.matrix_rank": {f32, f64},
     "linalg.matrix_rank.hermitian": {f32, f64},
-    "linalg.lu_solve": {f32, f64},
-    "lu_solve": {f32, f64},
-    "lu_unpack": {f32, f64},
+    "linalg.pinv.singular": {f32, f64},
     "logdet": {f32, f64},
     "masked.norm": {f16},
+    "masked.normalize": {f16},
     "masked_fill": {f16},
     "masked_scatter": {f16, f32, f64},
     "masked_select": {b8, f16, f32, f64, i32, i64},
@@ -225,8 +230,8 @@ def process(device_type):
     "nan_to_num": {f16},
     "nanquantile": {f32, f64},
     "nn.functional.avg_pool1d": {i64},
-    "nn.functional.avg_pool2d": {i64},
-    "nn.functional.adaptive_avg_pool2d": {f16},
+    "nn.functional.avg_pool2d": {i64, f64},
+    "nn.functional.adaptive_avg_pool2d": {f16, f64},
     "nn.functional.ctc_loss": {f32, f64},
     "nn.functional.gaussian_nll_loss": {f32, f64},
     "nn.functional.gelu": {f64},
@@ -243,6 +248,7 @@ def process(device_type):
     "quantile": {f32, f64},
     "rand_like": {f16, f32, f64},
     "randint_like": {f16, f32, f64, i32, i64},
+    "randint": {f16, f32, f64, i32, i64},
     "randn_like": {f16, f32, f64},
     "repeat_interleave": {b8, f16, f32, f64, i32, i64},
     "scatter_add": {f16},
@@ -366,7 +372,6 @@ def process(device_type):
     "asin": {f16},
     "cumprod": {f16},
     "linalg.vector_norm": {f64, f64},
-    "linalg.householder_product": {f32},
     "kron": {f16},
     "nanquantile": {f32, f64},
     "native_batch_norm": {f16, f32, f64},
@@ -455,6 +460,10 @@ class TestInductorOpInfo(TestCase):
     @skipCUDAMemoryLeakCheckIf(
         True
     )  # inductor kernels failing this test intermittently
+    @skipCUDAIf(not HAS_CUDA, "Skipped! Triton not found")
+    @skipCPUIf(not HAS_CPU, "Skipped! Supported CPU compiler not found")
+    @skipIfTorchDynamo("Test uses dynamo already")
+    @skipIfCrossRef
     @_ops(op_db[START:END])
     @patch("torch._dynamo.config.raise_on_unsafe_aot_autograd", True)
     def test_comprehensive(self, device, dtype, op):
@@ -599,5 +608,4 @@ def fn(*args, **kwargs):
 instantiate_device_type_tests(TestInductorOpInfo, globals())
 
 if __name__ == "__main__":
-    if has_triton() and not TEST_WITH_ROCM:
-        run_tests()
+    run_tests()
diff --git a/torch/testing/_internal/inductor_utils.py b/torch/testing/_internal/inductor_utils.py
new file mode 100644
index 0000000000000..84750a2de3eee
--- /dev/null
+++ b/torch/testing/_internal/inductor_utils.py
@@ -0,0 +1,23 @@
+from subprocess import CalledProcessError
+
+from torch._inductor.codecache import CppCodeCache
+from torch._inductor.utils import has_triton
+from torch.testing._internal.common_utils import (
+    IS_FBCODE,
+    TEST_WITH_ROCM,
+)
+import torch
+
+HAS_CPU = False
+try:
+    CppCodeCache.load("")
+    HAS_CPU = not IS_FBCODE
+except (
+    CalledProcessError,
+    OSError,
+    torch._inductor.exc.InvalidCxxCompiler,
+    torch._inductor.exc.CppCompileError,
+):
+    pass
+
+HAS_CUDA = has_triton() and not TEST_WITH_ROCM

From 63d7bde1302709431edde846490215419fd2eaca Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Mon, 14 Nov 2022 17:37:24 +0000
Subject: [PATCH 0864/1922] don't use prims.unsqueeze in group_norm (#88927)

inductor doesn't have prims.squeeze lowering, so this breaks it. Longer term, `squeeze` with multiple dimensions is not a prim, nvfuser implements it with a loop, inductor uses `_squeeze_multiple` helper which turns it into a loop. Prim should accept only a single dimension.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88927
Approved by: https://github.com/eellison
---
 torch/_refs/__init__.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index a1de9a438d774..f2817f0331ac5 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2820,6 +2820,12 @@ def _unsqueeze_multiple(x: TensorLikeType, dimensions: List[int]) -> TensorLikeT
     return x
 
 
+def _squeeze_multiple(x: TensorLikeType, dimensions: List[int]) -> TensorLikeType:
+    for dim in reversed(sorted(dimensions)):
+        x = torch.squeeze(x, dim)
+    return x
+
+
 @register_decomposition(torch.ops.aten.native_group_norm.default)
 def native_group_norm(
     input: Tensor,
@@ -2868,8 +2874,8 @@ def native_group_norm(
     rstd = _maybe_convert_to_dtype(rstd, input.dtype)  # type: ignore[assignment]
 
     # remove broadcast dimensions from mean and rstd
-    mean = prims.squeeze(mean, reduction_dims)
-    rstd = prims.squeeze(rstd, reduction_dims)
+    mean = _squeeze_multiple(mean, reduction_dims)
+    rstd = _squeeze_multiple(rstd, reduction_dims)
     return (out, mean, rstd)
 
 
From 716e4ba157cf74f4b44db9bd3443be308270ae78 Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Mon, 14 Nov 2022 17:49:30 +0000
Subject: [PATCH 0865/1922] Remove cpu path from lobpcg's basis helper (#88984)

Fixes https://github.com/pytorch/pytorch/issues/88650
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88984
Approved by: https://github.com/lezcano
---
 torch/_linalg_utils.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/torch/_linalg_utils.py b/torch/_linalg_utils.py
index 76b8ab532fcda..bdd22f395d2da 100644
--- a/torch/_linalg_utils.py
+++ b/torch/_linalg_utils.py
@@ -76,12 +76,7 @@ def qform(A: Optional[Tensor], S: Tensor):
 
 def basis(A):
     """Return orthogonal basis of A columns."""
-    if A.is_cuda:
-        # torch.orgqr is not available in CUDA
-        Q = torch.linalg.qr(A).Q
-    else:
-        Q = torch.orgqr(*torch.geqrf(A))
-    return Q
+    return torch.linalg.qr(A).Q
 
 
 def symeig(A: Tensor, largest: Optional[bool] = False) -> Tuple[Tensor, Tensor]:

From 5df60d2ff010619dcdce577e464f768d69b65418 Mon Sep 17 00:00:00 2001
From: Khushi Agrawal <khushiagrawal411@gmail.com>
Date: Mon, 14 Nov 2022 18:18:45 +0000
Subject: [PATCH 0866/1922] [primTorch] _refs & opinfo alpha_dropout (#87989)

Add _refs and OpInfo for `nn.functional.alpha_dropout`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87989
Approved by: https://github.com/mruberry
---
 test/functorch/test_ops.py                    |  5 ++
 test/functorch/test_vmap.py                   |  1 +
 torch/_refs/nn/functional/__init__.py         | 81 ++++++++++++++++---
 .../_internal/common_methods_invocations.py   | 47 +++++++++++
 4 files changed, 123 insertions(+), 11 deletions(-)

diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 74085941c6c88..2e303922dfa1c 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -612,6 +612,7 @@ def fn(inp, *args, **kwargs):
         skip("nn.functional.dropout"),  # calls random op
         skip("nn.functional.dropout2d"),  # calls random op
         skip("nn.functional.dropout3d"),  # calls random op
+        skip("nn.functional.alpha_dropout"),  # calls random op
         skip("nn.functional.feature_alpha_dropout", "with_train"),  # calls random op
         skip("nn.functional.fractional_max_pool2d"),  # calls random op
         skip("nn.functional.fractional_max_pool3d"),  # calls random op
@@ -719,6 +720,7 @@ def vjp_of_vjp(*args_and_cotangents):
         skip('nn.functional.dropout'),  # randomness
         skip('nn.functional.dropout2d'),  # randomness
         skip('nn.functional.dropout3d', ''),  # randomness
+        skip('nn.functional.alpha_dropout'),  # randomness
         skip('nn.functional._scaled_dot_product_attention'),  # randomness
         xfail('as_strided'),  # as_strided is too wild for us to support, wontfix
         xfail('index_put', ''),  # not possible due to dynamic shapes; we support a subset
@@ -808,6 +810,7 @@ def test_vmapvjp(self, device, dtype, op):
         skip('nn.functional.dropout2d', ''),
         skip('nn.functional.dropout3d', ''),
         skip('nn.functional._scaled_dot_product_attention'),  # randomness
+        skip('nn.functional.alpha_dropout'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'without_train'),
         skip('nn.functional.feature_alpha_dropout', 'with_train'),
         xfail('nn.functional.fractional_max_pool2d'),  # Cannot access data pointer of Tensor that doesn't have storage
@@ -1089,6 +1092,7 @@ def test():
         skip('nn.functional.rrelu'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'with_train'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'without_train'),  # randomness
+        skip('nn.functional.alpha_dropout'),  # randomness
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         skip('to_sparse', ''),  # non-dense output
         skip('ormqr', ''),  # takes too long
@@ -1330,6 +1334,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         xfail('nn.functional.dropout'),  # calls random op
         skip('nn.functional._scaled_dot_product_attention'),  # randomness
         xfail('nn.functional.embedding_bag'),  # Forward AD not implemented and no decomposition
+        xfail('nn.functional.alpha_dropout'),  # calls randomn op
         xfail('nn.functional.feature_alpha_dropout', 'with_train'),  # calls random op
         xfail('nn.functional.fractional_max_pool2d'),  # calls random op
         xfail('nn.functional.fractional_max_pool3d'),  # calls random op
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 6d95077b627e2..fb8722b8405bf 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3219,6 +3219,7 @@ def test():
         xfail('nn.functional.rrelu'),  # randomness
         xfail('nn.functional.dropout2d', ''),  # randomness
         xfail('nn.functional.dropout3d', ''),  # randomness
+        xfail('nn.functional.alpha_dropout', ''),  # randomness
         xfail('nn.functional.feature_alpha_dropout', 'with_train'),  # randomness
         xfail('as_strided'),  # Our test runner can't handle this; manual test exists
         skip('new_empty_strided'),  # empty tensor data is garbage so it's hard to make comparisons with it
diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py
index dcd86d8952d26..3848a738d5346 100644
--- a/torch/_refs/nn/functional/__init__.py
+++ b/torch/_refs/nn/functional/__init__.py
@@ -1,3 +1,4 @@
+import math
 from typing import Callable, Optional, Union
 
 import torch
@@ -27,6 +28,7 @@
 from torch._subclasses.fake_tensor import FakeTensor
 
 __all__ = [
+    "alpha_dropout",
     "celu",
     "dropout",
     "elu",
@@ -59,6 +61,65 @@
 
 Tensor = torch.Tensor
 
+
+def _dropout_helper(
+    self: TensorLikeType,
+    val: float,
+) -> TensorLikeType:
+    """
+    Helper function for all dropout-type operators. During training,
+    some of the elements of the input tensor are randomly masked.
+
+    Returns the masked tensor of the boolean values.
+
+    """
+
+    return (
+        refs.uniform(
+            self.shape, low=0.0, high=1.0, dtype=torch.float32, device=self.device
+        )
+        < val
+    )
+
+
+@register_decomposition(torch.ops.aten.alpha_dropout)
+def alpha_dropout(
+    self: TensorLikeType, p: float = 0.5, training: bool = False, inplace: bool = False
+) -> TensorLikeType:
+
+    if inplace:
+        raise NotImplementedError
+
+    if not training:
+        return self
+
+    utils.check(
+        p <= 1 and p >= 0,
+        lambda: f"dropout probability has to be between 0 and 1, but got, {p}",
+    )
+
+    if p == 1:
+        return torch.zeros_like(self)
+
+    if p == 0:
+        return self
+
+    dropout_mask = _dropout_helper(self, 1 - p)
+
+    # From paper: Self-Normalizing Neural Networks (https://arxiv.org/pdf/1706.02515.pdf)
+    # alpha = - SELU.alpha * SELU.scale, here
+    # SELU.alpha = 1.6732632423543772848170429916717 and
+    # SELU.scale = 1.0507009873554804934193349852946
+    alpha = -1.7580993408473766
+
+    a = 1.0 / math.sqrt((alpha * alpha * p + 1) * (1 - p))
+    b = torch.logical_not(dropout_mask)
+    b = b * (alpha * a) + alpha * a * p
+    dropout_mask = a * dropout_mask
+
+    return self * dropout_mask + b
+
+
 # celu is implemented specially because it has an alpha argument
 # celu is very similar to elu
 @register_decomposition(torch.ops.aten.celu)
@@ -93,7 +154,6 @@ def celu(
     return torch.where(a > 0, a, rhs)
 
 
-# TODO: should we allow the user to set a different dtype for the mask generation?
 @register_decomposition(torch.ops.aten.dropout)
 def dropout(
     a: TensorLikeType, p: float = 0.5, training: bool = True, inplace: bool = False
@@ -105,22 +165,21 @@ def dropout(
     if not training:
         return a
 
-    assert p <= 1
-    assert p >= 0
+    utils.check(
+        p <= 1 and p >= 0,
+        lambda: f"dropout probability has to be between 0 and 1, but got, {p}",
+    )
 
     if p == 1:
-        return refs.zeros_like(a)
+        return torch.zeros_like(a)
 
     if p == 0:
         return a
 
-    p1m = 1 - p
-    scale = 1 / p1m
-    mask = refs.lt(
-        refs.uniform(a.shape, low=0.0, high=1.0, dtype=torch.float32, device=a.device),
-        p1m,
-    )
-    return refs.mul(refs.mul(a, mask), scale)
+    scale = 1 / (1 - p)
+    dropout_mask = _dropout_helper(a, 1 - p)
+
+    return a * dropout_mask * scale
 
 
 # elu is implemented specially because it has an alpha argument
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 62c9b4750ae93..001fd455e82ee 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -15965,6 +15965,28 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=partial(sample_inputs_dropout, valid_input_dim=(4, 5)),
         inplace_variant=lambda input, *args, **kwargs:
             wrapper_set_seed(torch.nn.functional.dropout3d, input, *args, **kwargs, inplace=True)),
+    OpInfo(
+        "nn.functional.alpha_dropout",
+        op=lambda input, *args, **kwargs:
+            wrapper_set_seed(torch.nn.functional.alpha_dropout, input, *args, **kwargs),
+        dtypes=floating_types_and(torch.bfloat16),
+        dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
+        gradcheck_wrapper=wrapper_set_seed,
+        supports_forward_ad=True,
+        supports_fwgrad_bwgrad=True,
+        supports_out=False,
+        sample_inputs_func=sample_inputs_dropout,
+        check_batched_forward_grad=False,
+        inplace_variant=lambda input, *args, **kwargs:
+            wrapper_set_seed(torch.nn.functional.alpha_dropout, input, *args, **kwargs, inplace=True),
+        skips=(
+            # lambda impl
+            DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
+            # AssertionError: Tensor-likes are not close!
+            # Fails in cuda11.7
+            # Error Log: https://github.com/pytorch/pytorch/actions/runs/3440108478/jobs/5738475757
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type='cuda'),
+            DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),),),
     # In training mode, feature_alpha_dropout currently doesn't support inputs of complex dtype
     # unlike when `train=False`, it supports complex inputs, hence 2 OpInfos to cover all cases
     OpInfo(
@@ -17287,6 +17309,31 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     #
     # Elementwise Unary nn.functional OpInfos
     #
+    PythonRefInfo(
+        "_refs.nn.functional.alpha_dropout",
+        torch_opinfo_name="nn.functional.alpha_dropout",
+        supports_nvfuser=False,
+        decorators=(
+            DecorateInfo(unittest.skip("Expected: dropout is not comparable"),
+                         'TestCommon',
+                         'test_python_ref'),
+            # AssertionError: Tensor-likes are not close!
+            DecorateInfo(unittest.skip("Expected: dropout is not comparable"),
+                         'TestCommon',
+                         'test_python_ref_torch_fallback'),
+            DecorateInfo(unittest.skip("Expected: dropout is not comparable"),
+                         'TestCommon',
+                         'test_python_ref_executor', device_type='cuda'),
+            # AssertionError: Tensor-likes are not close!
+            DecorateInfo(unittest.skip("Expected: dropout is not comparable"),
+                         'TestMathBits',
+                         'test_neg_view'),
+            # AssertionError: Tensor-likes are not close!
+            DecorateInfo(unittest.skip("Expected: dropout is not comparable"),
+                         'TestCommon',
+                         'test_compare_cpu'),
+        )
+    ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.celu",
         torch_opinfo_name="nn.functional.celu",

From c3d9b32aebbd03a13fd88477f8730dee362b1947 Mon Sep 17 00:00:00 2001
From: Sean Ross-Ross <srossross@gmail.com>
Date: Mon, 14 Nov 2022 18:39:45 +0000
Subject: [PATCH 0867/1922] _get_nested_attr should return a value in the
 general case (#88822)

Fixes https://github.com/pytorch/functorch/issues/1053

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88822
Approved by: https://github.com/zou3519
---
 functorch/_src/make_functional.py       |  2 +-
 test/functorch/test_eager_transforms.py | 31 +++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/functorch/_src/make_functional.py b/functorch/_src/make_functional.py
index 7b8c15196e23b..abb3f07ca597f 100644
--- a/functorch/_src/make_functional.py
+++ b/functorch/_src/make_functional.py
@@ -44,7 +44,7 @@ def _get_nested_attr(obj: nn.Module, names: List[str]) -> None:
     if len(names) == 1:
         return getattr(obj, names[0])
     else:
-        _get_nested_attr(getattr(obj, names[0]), names[1:])
+        return _get_nested_attr(getattr(obj, names[0]), names[1:])
 
 
 def raise_parameter_tying_error():
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index ff69ed9df6e63..e88e8007e77ed 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -2669,6 +2669,37 @@ def test_combine_state_for_ensemble_smoke(self):
         models = [torch.nn.Linear(in_features, out_features) for i in range(num_models)]
         _ = combine_state_for_ensemble(models)
 
+    def test_state_correctly_returned_after_forward(self):
+        class Net(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = nn.Linear(3, 3)
+
+            def forward(self, x):
+                x = self.linear(x)
+                return x
+
+        mod = Net()
+        func, params = make_functional(mod)
+
+        # state in func.names_map
+        old_state_linear_weight = func.stateless_model.linear.weight
+        old_state_linear_bias = func.stateless_model.linear.bias
+
+        self.assertIsNotNone(old_state_linear_weight)
+        self.assertIsNotNone(old_state_linear_bias)
+
+        x = torch.randn(4, 3)
+        func(params, x)
+
+        new_state_linear_weight = func.stateless_model.linear.weight
+        new_state_linear_bias = func.stateless_model.linear.bias
+
+        self.assertIsNotNone(new_state_linear_weight)
+        self.assertIsNotNone(new_state_linear_bias)
+
+        self.assertEqual(old_state_linear_weight, new_state_linear_weight)
+        self.assertEqual(old_state_linear_bias, new_state_linear_bias)
 
 class TestExamplesCorrectness(TestCase):
     def test_maml_regression(self, device):

From 4d02923deb473ec6684bd619eac3f57dc93f672f Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Mon, 14 Nov 2022 18:43:50 +0000
Subject: [PATCH 0868/1922] Fix long comment error on dashboard (#89002)

Fix dashboard comment failure due to the following trace:
```
Traceback (most recent call last):
  File "/scratch/anijain/dashboard/work/pytorch/benchmarks/dynamo/runner.py", line 1180, in <module>
    DashboardUpdater(args).update()
  File "/scratch/anijain/dashboard/work/pytorch/benchmarks/dynamo/runner.py", line 1119, in update
    self.comment_on_gh(comment)
  File "/scratch/anijain/dashboard/work/pytorch/benchmarks/dynamo/runner.py", line 1096, in comment_on_gh
    subprocess.check_call(
  File "/scratch/anijain/dashboard/env/lib/python3.9/subprocess.py", line 368, in check_call
    retcode = call(*popenargs, **kwargs)
  File "/scratch/anijain/dashboard/env/lib/python3.9/subprocess.py", line 349, in call
    with Popen(*popenargs, **kwargs) as p:
  File "/scratch/anijain/dashboard/env/lib/python3.9/subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "/scratch/anijain/dashboard/env/lib/python3.9/subprocess.py", line 1821, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
OSError: [Errno 7] Argument list too long: '/data/home/anijain/miniconda/bin/gh'
srun: error: a100-st-p4d24xlarge-27: task 0: Exited with exit code 1
```
That is, we were trying to execute a gh command in the OS that was too long.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89002
Approved by: https://github.com/davidberard98
---
 benchmarks/dynamo/runner.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 99c70426cd36e..d27763c41b0b6 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -36,6 +36,7 @@
 import re
 import shutil
 import subprocess
+import tempfile
 from collections import defaultdict
 from datetime import datetime
 from os.path import abspath, exists
@@ -1093,6 +1094,10 @@ def comment_on_gh(self, comment):
         """
         Send a commment to dashboard
         """
+        with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
+            f.write(comment)
+            filename = f.name
+
         subprocess.check_call(
             [
                 self.args.dashboard_gh_cli_path,
@@ -1100,11 +1105,13 @@ def comment_on_gh(self, comment):
                 "comment",
                 "--repo=https://github.com/pytorch/torchdynamo.git",
                 "681",
-                "-b",
-                comment,
+                "-F",
+                filename,
             ]
         )
 
+        os.remove(filename)
+
     def update(self):
         self.upload_graphs()
         AccuracyRegressionTracker(self.args).generate_comment()

From a301e64ca76e9e20263e57ec32bdb70fc8b65f37 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Sat, 12 Nov 2022 14:20:41 +0000
Subject: [PATCH 0869/1922] wrap_pybind_function: support member function
 pointers (#88932)

This updates `wrap_pybind_function` to use `invoke` and adds the
`invoke_traits` object which is analogous to `function_traits` but
for member functions it includes the class as an explicit argument.

To test this is working properly, I've also applied it to the
`CUDAGraph` binding code.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88932
Approved by: https://github.com/albanD
---
 aten/src/ATen/detail/FunctionTraits.h | 24 ++++++++++++++++++++++++
 torch/csrc/Exceptions.h               | 11 ++++++-----
 torch/csrc/cuda/Graph.cpp             | 10 +++++-----
 3 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/detail/FunctionTraits.h b/aten/src/ATen/detail/FunctionTraits.h
index aab7300b585fe..f49a55e1326d5 100644
--- a/aten/src/ATen/detail/FunctionTraits.h
+++ b/aten/src/ATen/detail/FunctionTraits.h
@@ -76,3 +76,27 @@ struct binary_function_traits {
   using arg1_t = typename traits::template arg<0>::type;
   using arg2_t = typename traits::template arg<1>::type;
 };
+
+
+// Traits for calling with c10::guts::invoke, where member_functions have a first argument of ClassType
+template <typename T>
+struct invoke_traits : public function_traits<T>{
+};
+
+template <typename T>
+struct invoke_traits<T&> : public invoke_traits<T>{
+};
+
+template <typename T>
+struct invoke_traits<T&&> : public invoke_traits<T>{
+};
+
+template <typename ClassType, typename ReturnType, typename... Args>
+struct invoke_traits<ReturnType(ClassType::*)(Args...)> :
+  public function_traits<ReturnType(ClassType&, Args...)> {
+};
+
+template <typename ClassType, typename ReturnType, typename... Args>
+struct invoke_traits<ReturnType(ClassType::*)(Args...) const> :
+  public function_traits<ReturnType(const ClassType&, Args...)> {
+};
diff --git a/torch/csrc/Exceptions.h b/torch/csrc/Exceptions.h
index c9069a4a7c5b7..01caa6a702c0a 100644
--- a/torch/csrc/Exceptions.h
+++ b/torch/csrc/Exceptions.h
@@ -8,6 +8,7 @@
 #include <system_error>
 
 #include <ATen/detail/FunctionTraits.h>
+#include <c10/util/C++17.h>
 #include <c10/util/Exception.h>
 #include <c10/util/StringUtil.h>
 #include <pybind11/pybind11.h>
@@ -375,17 +376,17 @@ struct PyWarningHandler {
 
 namespace detail {
 template <typename Func, size_t i>
-using Arg = typename function_traits<Func>::template arg<i>::type;
+using Arg = typename invoke_traits<Func>::template arg<i>::type;
 
 template <typename Func, size_t... Is>
 auto wrap_pybind_function_impl_(Func&& f, std::index_sequence<Is...>) {
-  using traits = function_traits<Func>;
+  using result_type = typename invoke_traits<Func>::result_type;
   namespace py = pybind11;
 
   // f=f is needed to handle function references on older compilers
-  return [f = f](Arg<Func, Is>... args) -> typename traits::result_type {
+  return [f = std::forward<Func>(f)](Arg<Func, Is>... args) -> result_type {
     HANDLE_TH_ERRORS
-    return f(std::forward<Arg<Func, Is>>(args)...);
+    return c10::guts::invoke(f, std::forward<Arg<Func, Is>>(args)...);
     END_HANDLE_TH_ERRORS_PYBIND
   };
 }
@@ -395,7 +396,7 @@ auto wrap_pybind_function_impl_(Func&& f, std::index_sequence<Is...>) {
 // Returns a function object suitable for registering with pybind11.
 template <typename Func>
 auto wrap_pybind_function(Func&& f) {
-  using traits = function_traits<Func>;
+  using traits = invoke_traits<Func>;
   return torch::detail::wrap_pybind_function_impl_(
       std::forward<Func>(f), std::make_index_sequence<traits::arity>{});
 }
diff --git a/torch/csrc/cuda/Graph.cpp b/torch/csrc/cuda/Graph.cpp
index 0866b82f659dd..6d3a77c365e1b 100644
--- a/torch/csrc/cuda/Graph.cpp
+++ b/torch/csrc/cuda/Graph.cpp
@@ -30,23 +30,23 @@ void THCPGraph_init(PyObject* module) {
       // docs aren't clear. But it works.
       .def(
           "capture_begin",
-          &::at::cuda::CUDAGraph::capture_begin,
+          torch::wrap_pybind_function(&at::cuda::CUDAGraph::capture_begin),
           py::call_guard<py::gil_scoped_release>(),
           py::arg("pool") = c10::cuda::MempoolId_t{0, 0})
       .def(
           "capture_end",
-          &::at::cuda::CUDAGraph::capture_end,
+          torch::wrap_pybind_function(&at::cuda::CUDAGraph::capture_end),
           py::call_guard<py::gil_scoped_release>())
       .def(
           "replay",
-          &::at::cuda::CUDAGraph::replay,
+          torch::wrap_pybind_function(&at::cuda::CUDAGraph::replay),
           py::call_guard<py::gil_scoped_release>())
       .def(
           "reset",
-          &::at::cuda::CUDAGraph::reset,
+          torch::wrap_pybind_function(&at::cuda::CUDAGraph::reset),
           py::call_guard<py::gil_scoped_release>())
       .def(
           "pool",
-          &::at::cuda::CUDAGraph::pool,
+          torch::wrap_pybind_function(&at::cuda::CUDAGraph::pool),
           py::call_guard<py::gil_scoped_release>());
 }

From 71c7fef20a4e2fcae924c8254696d7c5b5f88186 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Mon, 14 Nov 2022 19:06:38 +0000
Subject: [PATCH 0870/1922] Fix typos in messages under torch (#88961)

This PR fixes typos of messages and parms in c++ source and head files under `torch` directory.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88961
Approved by: https://github.com/albanD
---
 torch/csrc/api/src/nn/modules/transformer.cpp |  4 ++--
 torch/csrc/api/src/optim/optimizer.cpp        |  4 ++--
 torch/csrc/autograd/FunctionsManual.cpp       |  2 +-
 torch/csrc/autograd/custom_function.h         |  2 +-
 torch/csrc/autograd/python_variable.cpp       |  2 +-
 .../distributed/c10d/ProcessGroupNCCL.cpp     |  4 ++--
 torch/csrc/distributed/c10d/UCCUtils.cpp      |  2 +-
 torch/csrc/distributed/c10d/init.cpp          |  2 +-
 torch/csrc/distributed/rpc/utils.cpp          |  2 +-
 .../xnnpack/compiler/xnn_compiler.cpp         |  2 +-
 torch/csrc/jit/codegen/cuda/arith.cpp         | 10 +++++-----
 .../csrc/jit/codegen/cuda/compute_at_map.cpp  |  2 +-
 torch/csrc/jit/codegen/cuda/disjoint_set.h    |  2 +-
 .../csrc/jit/codegen/cuda/executor_utils.cpp  |  6 +++---
 torch/csrc/jit/codegen/cuda/ir_nodes.cpp      |  8 ++++----
 torch/csrc/jit/codegen/cuda/kernel_cache.cpp  |  4 ++--
 .../csrc/jit/codegen/cuda/lower_expr_sort.cpp |  6 +++---
 .../cuda/lower_misaligned_vectorization.cpp   |  2 +-
 .../cuda/lower_predicate_elimination.cpp      |  2 +-
 .../jit/codegen/cuda/lower_validation.cpp     |  2 +-
 .../csrc/jit/codegen/cuda/root_domain_map.cpp |  2 +-
 .../jit/codegen/cuda/scheduler/mma_utils.cpp  |  2 +-
 .../csrc/jit/codegen/cuda/scheduler/utils.cpp |  2 +-
 torch/csrc/jit/codegen/cuda/tensor_view.cpp   |  2 +-
 .../csrc/jit/codegen/cuda/test/test_gpu3.cpp  |  2 +-
 .../jit/codegen/cuda/transform_rfactor.cpp    |  2 +-
 .../csrc/jit/codegen/cuda/transform_view.cpp  |  2 +-
 torch/csrc/jit/frontend/ir_emitter.cpp        |  2 +-
 torch/csrc/jit/ir/irparser.cpp                |  2 +-
 torch/csrc/jit/mobile/flatbuffer_loader.cpp   |  2 +-
 .../jit/passes/onnx/shape_type_inference.cpp  |  2 +-
 torch/csrc/jit/passes/peephole_non_tensor.cpp |  2 +-
 .../quantization/insert_quant_dequant.cpp     |  2 +-
 torch/csrc/jit/python/script_init.cpp         |  2 +-
 torch/csrc/jit/runtime/graph_executor.cpp     |  7 +++----
 torch/csrc/jit/runtime/static/ops.cpp         |  2 +-
 .../jit/serialization/export_bytecode.cpp     |  2 +-
 .../csrc/jit/serialization/export_module.cpp  |  2 +-
 torch/csrc/jit/serialization/import.cpp       |  2 +-
 torch/csrc/jit/tensorexpr/kernel.cpp          |  2 +-
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp    | 20 +++++++++----------
 torch/csrc/lazy/core/config.cpp               |  2 +-
 torch/csrc/profiler/util.cpp                  |  4 ++--
 43 files changed, 70 insertions(+), 71 deletions(-)

diff --git a/torch/csrc/api/src/nn/modules/transformer.cpp b/torch/csrc/api/src/nn/modules/transformer.cpp
index 6d643fc7354f0..df08c629da561 100644
--- a/torch/csrc/api/src/nn/modules/transformer.cpp
+++ b/torch/csrc/api/src/nn/modules/transformer.cpp
@@ -466,7 +466,7 @@ Tensor TransformerImpl::generate_square_subsequent_mask(int64_t sz) {
   // Treat 0 dim valid here
   TORCH_CHECK(
       sz >= 0,
-      "Input size must be non-negative to genearte a valid square subsequent mask, but got ",
+      "Input size must be non-negative to generate a valid square subsequent mask, but got ",
       sz);
 
   // check IEEE754 support here since -inf is not guaranteed to be valid on non
@@ -479,7 +479,7 @@ Tensor TransformerImpl::generate_square_subsequent_mask(int64_t sz) {
   // platform
   else {
     TORCH_WARN_ONCE(
-        "IEEE754 is not supporetd on this platform, generate_square_subsequent_mask will fill "
+        "IEEE754 is not supported on this platform, generate_square_subsequent_mask will fill "
         "the mask with smallest float number on this platform instead of -inf");
     return torch::triu(
         torch::full({sz, sz}, std::numeric_limits<float>::lowest()), 1);
diff --git a/torch/csrc/api/src/optim/optimizer.cpp b/torch/csrc/api/src/optim/optimizer.cpp
index 95165d850cf6f..f73e54d2835f2 100644
--- a/torch/csrc/api/src/optim/optimizer.cpp
+++ b/torch/csrc/api/src/optim/optimizer.cpp
@@ -64,13 +64,13 @@ void OptimizerParamState::serialize(
 double OptimizerOptions::get_lr() const {
   TORCH_CHECK(
       false,
-      "double get_lr() has not been overidden and implemented in subclass of torch::optim::OptimizerOptions, you must override it in your subclass.");
+      "double get_lr() has not been overridden and implemented in subclass of torch::optim::OptimizerOptions, you must override it in your subclass.");
 }
 
 void OptimizerOptions::set_lr(const double lr) {
   TORCH_CHECK(
       false,
-      "double set_lr() has not been overidden and implemented in subclass of torch::optim::OptimizerOptions, you must override it in your subclass.");
+      "double set_lr() has not been overridden and implemented in subclass of torch::optim::OptimizerOptions, you must override it in your subclass.");
 }
 
 std::unique_ptr<OptimizerOptions> OptimizerOptions::clone() const {
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 5a3f96d47e30e..c0fbf5f6c0aa6 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -4846,7 +4846,7 @@ Tensor log1p_backward(const Tensor& grad, const Tensor& self) {
     // materialized so if self is strided and grad is sparse nothing unepected
     // happens memory wise
     TORCH_WARN(
-        "log1p_backward: recived self with sparse layout, but backward requires materialization of a dense tensor with this shape");
+        "log1p_backward: received self with sparse layout, but backward requires materialization of a dense tensor with this shape");
     self_p1_conj = (self.to_dense() + 1).conj();
   } else {
     // Although calling self.to_dense() would just return self when it has
diff --git a/torch/csrc/autograd/custom_function.h b/torch/csrc/autograd/custom_function.h
index bc7489292c239..d7670d924b1fa 100644
--- a/torch/csrc/autograd/custom_function.h
+++ b/torch/csrc/autograd/custom_function.h
@@ -300,7 +300,7 @@ auto Function<T>::apply(Args&&... args)
     TORCH_CHECK(
         false,
         "jvp is not implemented for the c++ API of custom Function yet.",
-        "Please open a feature request on Github if you need this.");
+        "Please open a feature request on GitHub if you need this.");
   };
 
   auto wrapped_outputs = _wrap_outputs(
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index 002b904d40721..e3ab10c7499ca 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -857,7 +857,7 @@ static PyObject* THPVariable_make_wrapper_subclass(
     if (sizes_strides_policy.has_value()) {
       TORCH_CHECK(
           false,
-          "Setting sizes_strides_policy isn't suppored for this overload")
+          "Setting sizes_strides_policy isn't supported for this overload")
     }
   }
 
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index c92d24af21c84..1d788a2c2e0c7 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -1120,7 +1120,7 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID(
           "[",
           rank_,
           "] is setting up NCCL communicator and "
-          "retreiving ncclUniqueId from [0] via c10d key-value store by key '",
+          "retrieving ncclUniqueId from [0] via c10d key-value store by key '",
           storeKey,
           "', but store->get('",
           storeKey,
@@ -1133,7 +1133,7 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID(
               "Unknown exception while [",
               rank_,
               "] is setting up NCCL communicator and "
-              "retreiving ncclUniqueId from [0] via c10d key-value store by key '",
+              "retrieving ncclUniqueId from [0] via c10d key-value store by key '",
               storeKey,
               "'"));
     }
diff --git a/torch/csrc/distributed/c10d/UCCUtils.cpp b/torch/csrc/distributed/c10d/UCCUtils.cpp
index ef934d1597f9d..590a931f2f110 100644
--- a/torch/csrc/distributed/c10d/UCCUtils.cpp
+++ b/torch/csrc/distributed/c10d/UCCUtils.cpp
@@ -186,7 +186,7 @@ void CommUCC::free_request(ucc_coll_req_h request) {
 CommUCC::~CommUCC() {
   if (context != nullptr) {
     TORCH_UCC_CHECK(
-        ucc_context_destroy(context), "failed to destory UCC context");
+        ucc_context_destroy(context), "failed to destroy UCC context");
   }
   if (lib != nullptr) {
     TORCH_UCC_CHECK(ucc_finalize(lib), "failed to finalize UCC library");
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 2424506eef0ff..313aabee7cd98 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1906,7 +1906,7 @@ Example::
         Returns:
             A ``Work`` object which is associated with the completion of
             the ``torch.futures.Future``.
-        This is the prefered way of constructing Work objects when writing a custom ProcessGroup
+        This is the preferred way of constructing Work objects when writing a custom ProcessGroup
         in python.
         Example::
             >>> class SingleRankProcessGroup(torch.distributed.ProcessGroup):
diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp
index 12e3f2edf7558..c20145e82d038 100644
--- a/torch/csrc/distributed/rpc/utils.cpp
+++ b/torch/csrc/distributed/rpc/utils.cpp
@@ -38,7 +38,7 @@ void processRemoteProfiledEvents(
   TORCH_CHECK(
       enabled,
       "Profiler was expected to be enabled. This can happen in callback "
-      " continutations that run in different threads, and the TLS of the "
+      " continuations that run in different threads, and the TLS of the "
       " profiler was not propagated.");
   std::vector<LegacyEvent> events = rpcWithProfilingResp.getProfiledEvents();
   const auto& profilingId = rpcWithProfilingResp.getProfilingId();
diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
index 395d59a1cf21d..3bbff2309904d 100644
--- a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
@@ -55,7 +55,7 @@ XNNExecutor XNNCompiler::compileModel(std::string ser_model) {
         auto buffer_idx = tensor_value->constant_buffer_idx();
         if (buffer_idx != 0) {
           // TODO: @maxren implement data handling
-          TORCH_CHECK(false, "Cosntant data handling not yet implemented")
+          TORCH_CHECK(false, "Constant data handling not yet implemented")
         }
         std::vector<size_t> dims_data;
         for (auto dim : *tensor_value->dims()) {
diff --git a/torch/csrc/jit/codegen/cuda/arith.cpp b/torch/csrc/jit/codegen/cuda/arith.cpp
index 8e8d82128512b..d4e1348ee6933 100644
--- a/torch/csrc/jit/codegen/cuda/arith.cpp
+++ b/torch/csrc/jit/codegen/cuda/arith.cpp
@@ -1094,7 +1094,7 @@ static TensorView* newForReduction(
 
   TORCH_INTERNAL_ASSERT(
       !axes_set.empty(),
-      "Asked for ouput of reduction, but no reduction axis provided.");
+      "Asked for output of reduction, but no reduction axis provided.");
 
   TORCH_INTERNAL_ASSERT(
       (*(axes_set.rbegin())) < orig_domain.size(),
@@ -1183,7 +1183,7 @@ TensorView* reductionOp(
 
     TORCH_CHECK(
         axis >= 0 && axis < ndims,
-        "Reduction on invalid axis, recieved: ",
+        "Reduction on invalid axis, received: ",
         axis,
         " however tensor view only has ",
         ndims,
@@ -1518,7 +1518,7 @@ WelfordResult Welford(
 
     TORCH_CHECK(
         axis >= 0 && axis < ndims,
-        "Reduction on invalid axis, recieved: ",
+        "Reduction on invalid axis, received: ",
         axis,
         " however tensor view only has ",
         ndims,
@@ -2228,7 +2228,7 @@ static TensorView* newForMma(
 
   TORCH_INTERNAL_ASSERT(
       !axes_set.empty(),
-      "Asked for ouput of reduction, but no reduction axis provided.");
+      "Asked for output of reduction, but no reduction axis provided.");
 
   TORCH_INTERNAL_ASSERT(
       (*(axes_set.rbegin())) < orig_domain_a.size(),
@@ -2319,7 +2319,7 @@ TensorView* fusedMultiplySum(
 
     TORCH_CHECK(
         axis >= 0 && axis < ndims,
-        "Reduction on invalid axis, recieved: ",
+        "Reduction on invalid axis, received: ",
         axis,
         " however tensor view only has ",
         ndims,
diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp
index 7f3de6687eb3a..1c2ac627b5756 100644
--- a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp
+++ b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp
@@ -331,7 +331,7 @@ void IterDomainGraph::build(Fusion* fusion) {
             c_tv->getRootDomain().size() ==
                 first_output_tv->getRootDomain().size(),
             "Multiple outputs with mismatched dimensions is not supported. ",
-            "Only supported case is welford op where all outputs tvs have idential domains.");
+            "Only supported case is welford op where all outputs tvs have identical domains.");
         // p->f, c->c
         std::unordered_map<IterDomain*, IterDomain*> c2f_root_map;
         for (const auto i :
diff --git a/torch/csrc/jit/codegen/cuda/disjoint_set.h b/torch/csrc/jit/codegen/cuda/disjoint_set.h
index 09cf6e8de9504..8fd60dab5bd22 100644
--- a/torch/csrc/jit/codegen/cuda/disjoint_set.h
+++ b/torch/csrc/jit/codegen/cuda/disjoint_set.h
@@ -260,7 +260,7 @@ class DisjointSets {
         entry_it != disjointSetMap().end(),
         "Strict mapping failed on element: ",
         abstractToString(entry0),
-        " either an error occured, or non strict mapping should have been used.");
+        " either an error occurred, or non strict mapping should have been used.");
     return entry_it->second->has(entry1);
   }
 
diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
index 6da05cbf4dcba..217480a974edf 100644
--- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -155,7 +155,7 @@ bool validateKernelArgTensor(
   }
 
   if (!is_cpu_scalar(arg) && !arg.is_cuda()) {
-    msg << "Argumnet is a CPU tensor which is not supported in fusions.\n";
+    msg << "Argument is a CPU tensor which is not supported in fusions.\n";
     return false;
   }
 
@@ -824,7 +824,7 @@ void bindInputForExprEvaluation(
         if (root_domain[dim]->hasExpandedExtent()) {
           TORCH_INTERNAL_ASSERT(
               tensor_arg_stride == 0,
-              "Execting an expanded dimension on dimension ",
+              "Expecting an expanded dimension on dimension ",
               dim,
               " but found stride ",
               tensor_arg_stride);
@@ -838,7 +838,7 @@ void bindInputForExprEvaluation(
                 *maybe_expanded_size == tensor_arg_size,
                 "Expecting expanded extent of ",
                 *maybe_expanded_size,
-                " but recieved value of ",
+                " but received value of ",
                 tensor_arg_size);
           }
         }
diff --git a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp
index 3b51b807a727d..c4d994f272be1 100644
--- a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp
+++ b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp
@@ -600,7 +600,7 @@ BroadcastOp::BroadcastOp(
           id->isReduction() || id->isStride(),
           "Invalid broadcast op: ",
           id,
-          ". Non-reduction input dim does't match to output.");
+          ". Non-reduction input dim doesn't match to output.");
     }
   }
 
@@ -2060,7 +2060,7 @@ TensorDomain::TensorDomain(
                              : std::move(contiguity)) {
   TORCH_CHECK(
       contiguity_.size() == getMaybeRFactorDomain().size(),
-      "Invalid contiguity information provided, incorrect size. Recieved vector of size ",
+      "Invalid contiguity information provided, incorrect size. Received vector of size ",
       contiguity_.size(),
       " but needed one of size ",
       root_domain_.size());
@@ -2084,7 +2084,7 @@ TensorDomain::TensorDomain(
                              : std::move(contiguity)) {
   TORCH_CHECK(
       contiguity_.size() == getMaybeRFactorDomain().size(),
-      "Invalid contiguity information provided, incorrect size. Recieved vector of size ",
+      "Invalid contiguity information provided, incorrect size. Received vector of size ",
       contiguity_.size(),
       " but needed one of size ",
       root_domain_.size());
@@ -2124,7 +2124,7 @@ TensorDomain::TensorDomain(
                              : std::move(contiguity)) {
   TORCH_CHECK(
       contiguity_.size() == getMaybeRFactorDomain().size(),
-      "Invalid contiguity information provided, incorrect size. Recieved vector of size ",
+      "Invalid contiguity information provided, incorrect size. Received vector of size ",
       contiguity_.size(),
       " but needed one of size ",
       getMaybeRFactorDomain().size());
diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
index efcc51f231b26..c4604042bfaed 100644
--- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
+++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -493,7 +493,7 @@ void FusionKernelRuntime::startAsyncCompile(KernelArgumentHolder& args_old) {
 
     TORCH_INTERNAL_ASSERT(
         args.size() == segmented_fusion_->inputs().size(),
-        "Inputs were not set up correctly, recieved ",
+        "Inputs were not set up correctly, received ",
         args.size(),
         " inputs but expecting ",
         segmented_fusion_->inputs().size());
@@ -610,7 +610,7 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
 
   TORCH_INTERNAL_ASSERT(
       args.size() == segmented_fusion_->inputs().size(),
-      "Inputs were not set up correctly, recieved ",
+      "Inputs were not set up correctly, received ",
       args.size(),
       " inputs but expecting ",
       segmented_fusion_->inputs().size());
diff --git a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
index 7f06aea2f5423..1e2806b11fd42 100644
--- a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
@@ -927,8 +927,8 @@ bool ExprSegmentationSorter::interIterUpdate() {
     // If we didn't finish and we tried the fallback, throw.
     TORCH_INTERNAL_ASSERT(
         !fallback_mode_enabled_,
-        "Couldn't succcessfully sort out the fusion expressions. ",
-        "There are remaining connections of the heirarchical segmentation which should have been ",
+        "Couldn't successfully sort out the fusion expressions. ",
+        "There are remaining connections of the hierarchical segmentation which should have been ",
         "flattened to a single ordered group, or disjoint ordered groups.");
     // We didn't finish, but we haven't tried the fallback, try again with that.
     fallback_mode_enabled_ = true;
@@ -1066,7 +1066,7 @@ void ExprSegmentationSorter::initializeForLoopDependencies() {
       }
     }
 
-    std::cerr << "Depdencies: " << std::endl;
+    std::cerr << "Dependencies: " << std::endl;
     for (const auto& dep_entry : concrete_id_dependencies) {
       std::cerr << "  Deps of " << dep_entry.first->toString() << std::endl
                 << "   ";
diff --git a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp
index bd3c9baf66e1f..9e713f4cf3a23 100644
--- a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp
@@ -462,7 +462,7 @@ class MisalignedVectorizationModifier : public kir::ExprMutator {
 
       TORCH_INTERNAL_ASSERT(
           !gpu_lower->trivialReductionInfo().isDerived(producer_root_id),
-          "No trivial reduciton axis should exist: ",
+          "No trivial reduction axis should exist: ",
           producer_root_id);
 
       // If the producer ID is reduction or broadcast, it should be safe
diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp
index 38df8229bb777..294a2327bbba0 100644
--- a/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_predicate_elimination.cpp
@@ -925,7 +925,7 @@ bool PredicateElimination::setReductionInitValue(
   } else {
     TORCH_INTERNAL_ASSERT(
         false,
-        "Incosistent setting of initialization value for t",
+        "Inconsistent setting of initialization value for t",
         tv->name(),
         ". Prev: ",
         existing_val->toString(),
diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.cpp b/torch/csrc/jit/codegen/cuda/lower_validation.cpp
index da1def37cad84..f6f71c2ec123a 100644
--- a/torch/csrc/jit/codegen/cuda/lower_validation.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_validation.cpp
@@ -86,7 +86,7 @@ class ValidateSiblings : public IterVisitor {
         auto sibling_id = it->second;
         TORCH_INTERNAL_ASSERT(
             sibling->axis(i) == sibling_id,
-            "Invalid matching sinbling ID detected. Expr: ",
+            "Invalid matching sibling ID detected. Expr: ",
             expr->toString(),
             "Sibling ID: ",
             sibling_id->toString());
diff --git a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp
index 09a740d01097d..235d257e2351d 100644
--- a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp
+++ b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp
@@ -486,7 +486,7 @@ bool ComputeAtRootDomainMap::canMap(
     const IterDomain* id_b) const {
   TORCH_INTERNAL_ASSERT(
       id_b->definition() == nullptr || id_b->isRFactorProduct(),
-      "Non-root domain is not supproted: ",
+      "Non-root domain is not supported: ",
       id_b);
 
   if (!id_b->isBroadcast()) {
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp b/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp
index 1991cada00dda..ddf1061591ed0 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/mma_utils.cpp
@@ -208,7 +208,7 @@ std::vector<IterDomain*> getMmaDomains(MmaOp* mma, MmaDimension dimension) {
   TORCH_CHECK(
       a_domain.size() == b_domain.size() &&
           a_domain.size() == accumulator_domain.size(),
-      "Inconsisitent dimensions in mma op",
+      "Inconsistent dimensions in mma op",
       a_domain.size(),
       " ",
       b_domain.size(),
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
index d985da926354b..4ba6b241e455c 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
@@ -2366,7 +2366,7 @@ std::unordered_map<int, int> domainReorderAsRfactorMap(TensorView* tv) {
       // Should be impossible.
       TORCH_INTERNAL_ASSERT(
           pos0 != pos1,
-          "Didn't expect merge inputs to be the same iteratrion domain:\n",
+          "Didn't expect merge inputs to be the same iteration domain:\n",
           merge->toString());
 
       reordered_ids.erase(reordered_ids.begin() + pos0);
diff --git a/torch/csrc/jit/codegen/cuda/tensor_view.cpp b/torch/csrc/jit/codegen/cuda/tensor_view.cpp
index 633c98102e2e0..85f320fef2e43 100644
--- a/torch/csrc/jit/codegen/cuda/tensor_view.cpp
+++ b/torch/csrc/jit/codegen/cuda/tensor_view.cpp
@@ -757,7 +757,7 @@ TensorView* TensorView::rFactor(const std::vector<int>& axes) {
 
   TORCH_CHECK(
       !definition()->isA<GroupedReductionOp>(),
-      "For GroupedReducitonOp, use TensorView::rFactor(const std::vector<int>& axes, const std::vector<TensorView*>& tvs)");
+      "For GroupedReductionOp, use TensorView::rFactor(const std::vector<int>& axes, const std::vector<TensorView*>& tvs)");
 
   // Split tensor view into 2 parts
   auto domain_pair = domain()->rFactor(axes);
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
index a8fb439af14f5..8d24cc3803747 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
@@ -4096,7 +4096,7 @@ TEST_F(NVFuserTest, FusionUnsqueeze1_CUDA) {
   fusion.addOutput(tv2);
 
   TORCH_CHECK(
-      tv2->nDims() == 2, "Unpected unsqueeze result: ", tv2->toString());
+      tv2->nDims() == 2, "Unexpected unsqueeze result: ", tv2->toString());
   TORCH_CHECK(
       tv2->axis(1)->isBroadcast(),
       "Unexpected unsqueeze result: ",
diff --git a/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp b/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp
index dc5973c0ecd6a..8d5151074563e 100644
--- a/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp
+++ b/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp
@@ -262,7 +262,7 @@ std::pair<TensorDomain*, TensorDomain*> TransformRFactor::runReplay(
   std::transform(axes.begin(), axes.end(), axes.begin(), [ndims](int i) {
     TORCH_CHECK(
         i >= -ndims && i < ndims,
-        "Rfactor replay recieved an axis outside the number of dims in the tensor, acceptable inclusive range is ",
+        "Rfactor replay received an axis outside the number of dims in the tensor, acceptable inclusive range is ",
         -ndims,
         " to ",
         ndims - 1);
diff --git a/torch/csrc/jit/codegen/cuda/transform_view.cpp b/torch/csrc/jit/codegen/cuda/transform_view.cpp
index e5f9c068f16c1..a543c6d0f79cf 100644
--- a/torch/csrc/jit/codegen/cuda/transform_view.cpp
+++ b/torch/csrc/jit/codegen/cuda/transform_view.cpp
@@ -732,7 +732,7 @@ AnalyzeViewResult analyzeView(
   FUSER_PERF_SCOPE("analyzeView");
   TORCH_INTERNAL_ASSERT(
       original_sizes.size() > 0,
-      "Empty original size not supported for view operatioon.");
+      "Empty original size not supported for view operation.");
 
   TORCH_INTERNAL_ASSERT(
       TensorDomain::noReductions(original_view_tv->getMaybeRFactorDomain())
diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp
index d60dd77bc8dad..7c53dbd0b3392 100644
--- a/torch/csrc/jit/frontend/ir_emitter.cpp
+++ b/torch/csrc/jit/frontend/ir_emitter.cpp
@@ -5640,7 +5640,7 @@ void CompilationUnit::define_interface(
   for (const Stmt& stmt : classDef.body()) {
     if (stmt.kind() != TK_DEF) {
       throw ErrorReport(stmt)
-          << "interface declartions can only contain method definitions";
+          << "interface declarations can only contain method definitions";
     }
     auto method_def = Def(stmt);
     if (!method_def.decl().return_type().present()) {
diff --git a/torch/csrc/jit/ir/irparser.cpp b/torch/csrc/jit/ir/irparser.cpp
index 1f790de92cb1a..0673645731da0 100644
--- a/torch/csrc/jit/ir/irparser.cpp
+++ b/torch/csrc/jit/ir/irparser.cpp
@@ -237,7 +237,7 @@ ParsedLiteral IRParser::parseScalarLiteral(Node* n) {
       auto text = L.expect(TK_NUMBER);
       if (!parse_tensor_constants_) {
         throw ErrorReport(token.range)
-            << "Single-element tensor constant encoutered but "
+            << "Single-element tensor constant encountered but "
             << "`parse_tensor_constants` is set to false " << token.text();
       }
       L.expect('}');
diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.cpp b/torch/csrc/jit/mobile/flatbuffer_loader.cpp
index 45e31fb5e1747..29c29925ef099 100644
--- a/torch/csrc/jit/mobile/flatbuffer_loader.cpp
+++ b/torch/csrc/jit/mobile/flatbuffer_loader.cpp
@@ -718,7 +718,7 @@ void FlatbufferLoader::extractJitSourceAndConstants(
     std::vector<IValue>* constants) {
   AT_ASSERT(
       module_parsed_,
-      "Need to first parse a flatbuffer file before extracing jit_sources");
+      "Need to first parse a flatbuffer file before extracting jit_sources");
 
   const auto* ivalues = module_->ivalues();
   for (uint32_t i = mobile_ivalue_size_; i < ivalues->size(); i++) {
diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
index f646fe77e07ac..8baa439bdb58a 100644
--- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
+++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
@@ -2228,7 +2228,7 @@ size_t ONNXAssignOutputShape(
           auto& new_var = THPVariable_Unpack(list_elem);
           TORCH_CHECK(
               var.scalar_type() == new_var.scalar_type(),
-              "Unsupported sequence with mixed elment types in model outputs. "
+              "Unsupported sequence with mixed element types in model outputs. "
               "ONNX supports only sequences of elements of the same data type.");
         }
         auto elem_type = graph->outputs()
diff --git a/torch/csrc/jit/passes/peephole_non_tensor.cpp b/torch/csrc/jit/passes/peephole_non_tensor.cpp
index c114ea759e52c..10ff3db0586a0 100644
--- a/torch/csrc/jit/passes/peephole_non_tensor.cpp
+++ b/torch/csrc/jit/passes/peephole_non_tensor.cpp
@@ -15,7 +15,7 @@ namespace {
  * constant int value if there exists one.
  *
  * @pre node is integer arithmetic.
- * @post if there's one constant in two oprands, then the second operand is
+ * @post if there's one constant in two operands, then the second operand is
  *       constant.
  */
 c10::optional<int64_t> checkArithNode(Node& node) {
diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
index 54bd6679980e6..3270ef4ced82e 100644
--- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
+++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
@@ -1554,7 +1554,7 @@ Node* insertQuantDequantNodes<QuantOpParams>(
 void checkCalculateQParamsResultTypes(const Node* out) {
   TORCH_CHECK(
       out->outputs().size() == 2,
-      "cacluate_qparams should produce output of size 2 (scale, zero_point).");
+      "calculate_qparams should produce output of size 2 (scale, zero_point).");
   Value* scale = out->output(0);
   Value* zp = out->output(1);
   TORCH_CHECK(
diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp
index 030e2525a1635..2c6f8b1daca83 100644
--- a/torch/csrc/jit/python/script_init.cpp
+++ b/torch/csrc/jit/python/script_init.cpp
@@ -1774,7 +1774,7 @@ void initJitScriptBindings(PyObject* module) {
           if (def.kind() != TK_DEF) {
             throw ErrorReport(def.range())
                 << "Currently class bodies can only contain method "
-                   "definitions. File an issue on Github if you want "
+                   "definitions. File an issue on GitHub if you want "
                    "something else!";
           }
           methodDefs.emplace_back(Def(def));
diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp
index c2c84eb9e4e47..88a092c39fe05 100644
--- a/torch/csrc/jit/runtime/graph_executor.cpp
+++ b/torch/csrc/jit/runtime/graph_executor.cpp
@@ -922,13 +922,12 @@ void runNondiffOptimization(
     std::shared_ptr<Graph>& graph,
     bool strict_fuser_check) {
   GRAPH_DEBUG(
-      "Before customPrePassses (beginning of runNondiffOptimization)\n",
-      *graph);
+      "Before customPrePasses (beginning of runNondiffOptimization)\n", *graph);
   // Run custom passes that different backends can register.
   for (const auto& passPair : getCustomPrePasses()) {
     passPair.first(graph);
   }
-  GRAPH_DEBUG("After customPrePassses\n", *graph);
+  GRAPH_DEBUG("After customPrePasses\n", *graph);
 
   // decomposition pass, decompose certain ops that will be used in the
   // following passes (like batchmm and jit fusion)
@@ -960,7 +959,7 @@ void runNondiffOptimization(
     passPair.first(graph);
   }
   GRAPH_DEBUG(
-      "After customPostPassses (end of runNondiffOptimization)\n", *graph);
+      "After customPostPasses (end of runNondiffOptimization)\n", *graph);
 }
 
 void runOptimization(
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 92044ca565a9c..834a71b081614 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -45,7 +45,7 @@
 C10_DEFINE_bool(
     static_runtime_enable_fast_math,
     true,
-    "If on, static runtime may use use optimizations that cause accurary loss "
+    "If on, static runtime may use use optimizations that cause accuracy loss "
     "vs the jit interpreter");
 
 namespace at {
diff --git a/torch/csrc/jit/serialization/export_bytecode.cpp b/torch/csrc/jit/serialization/export_bytecode.cpp
index b56c4980211a8..6f30f82899ed4 100644
--- a/torch/csrc/jit/serialization/export_bytecode.cpp
+++ b/torch/csrc/jit/serialization/export_bytecode.cpp
@@ -212,7 +212,7 @@ mobile::Code compileGraphToMobileCode(
           for (const TypePtr& element_type : input_type->containedTypes()) {
             TORCH_CHECK(
                 element_type->kind() != TypeKind::ClassType,
-                "Returining a list or dictionary with pytorch class type ",
+                "Returning a list or dictionary with pytorch class type ",
                 "is not supported in mobile module "
                 "(List[Foo] or Dict[int, Foo] for class Foo(torch.nn.Module)). "
                 "Workaround: instead of using pytorch class as their element type, ",
diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp
index b29f1e2914c0c..90f9f9411b38e 100644
--- a/torch/csrc/jit/serialization/export_module.cpp
+++ b/torch/csrc/jit/serialization/export_module.cpp
@@ -95,7 +95,7 @@ ExportModuleExtraFilesHook& GetExtraFilesHook() {
  *         ]
  *     ]"
  *
- * @param compilation_unit Jit compilcation unit to look up function schema.
+ * @param compilation_unit Jit compilation unit to look up function schema.
  * @param type_ptr A type pointer and it can be possibly any type.
  * @param default_type_str The default string representation. The string can
  * either from type_ptr->str(), type_ptr->annotation_str(), or
diff --git a/torch/csrc/jit/serialization/import.cpp b/torch/csrc/jit/serialization/import.cpp
index a72abeaede8e1..b79d29726bef1 100644
--- a/torch/csrc/jit/serialization/import.cpp
+++ b/torch/csrc/jit/serialization/import.cpp
@@ -444,7 +444,7 @@ Module _load_jit_module_from_bytes(
     std::shared_ptr<CompilationUnit> cu,
     c10::optional<c10::Device> device,
     ExtraFilesMap& extra_files) {
-  TORCH_CHECK(size >= kFileFormatHeaderSize, "Unrecorgnized data format");
+  TORCH_CHECK(size >= kFileFormatHeaderSize, "Unrecognized data format");
   auto format = getFileFormat(data.get());
   switch (format) {
     case FileFormat::FlatbufferFileFormat: {
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index c30ed316e48b1..eb108abfb0296 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -29,7 +29,7 @@ namespace tensorexpr {
 
 std::string buildErrorMessage(const std::string& s) {
   static const std::string generic_error_message =
-      "This error occured in the fuser. You can turn off the fuser with "
+      "This error occurred in the fuser. You can turn off the fuser with "
       "torch.jit.enable_fusion(False).";
   if (s.empty()) {
     return generic_error_message;
diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index 1ca5665b4432a..f6801973dd6b1 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -694,7 +694,7 @@ void LLVMCodeGenImpl::visit(AddPtr v) {
   } else if (!lfp && !rfp) {
     value_ = irb_.CreateAdd(lhs, rhs);
   } else {
-    throw malformed_input("llvm_codgen: bad type in Add", v);
+    throw malformed_input("llvm_codegen: bad type in Add", v);
   }
 }
 
@@ -712,7 +712,7 @@ void LLVMCodeGenImpl::visit(SubPtr v) {
   } else if (!lfp && !rfp) {
     value_ = irb_.CreateSub(lhs, rhs);
   } else {
-    throw malformed_input("llvm_codgen: bad type in Sub", v);
+    throw malformed_input("llvm_codegen: bad type in Sub", v);
   }
 }
 
@@ -730,7 +730,7 @@ void LLVMCodeGenImpl::visit(MulPtr v) {
   } else if (!lfp && !rfp) {
     value_ = irb_.CreateMul(lhs, rhs);
   } else {
-    throw malformed_input("llvm_codgen: bad type in Mul", v);
+    throw malformed_input("llvm_codegen: bad type in Mul", v);
   }
 }
 
@@ -748,7 +748,7 @@ void LLVMCodeGenImpl::visit(DivPtr v) {
   } else if (!lfp && !rfp) {
     value_ = irb_.CreateSDiv(lhs, rhs);
   } else {
-    throw malformed_input("llvm_codgen: bad type in Div", v);
+    throw malformed_input("llvm_codegen: bad type in Div", v);
   }
 }
 
@@ -763,7 +763,7 @@ void LLVMCodeGenImpl::visit(AndPtr v) {
   if (!lfp && !rfp) {
     value_ = irb_.CreateAnd(lhs, rhs);
   } else {
-    throw malformed_input("llvm_codgen: bad type in And", v);
+    throw malformed_input("llvm_codegen: bad type in And", v);
   }
 }
 
@@ -778,7 +778,7 @@ void LLVMCodeGenImpl::visit(OrPtr v) {
   if (!lfp && !rfp) {
     value_ = irb_.CreateOr(lhs, rhs);
   } else {
-    throw malformed_input("llvm_codgen: bad type in Or", v);
+    throw malformed_input("llvm_codegen: bad type in Or", v);
   }
 }
 
@@ -793,7 +793,7 @@ void LLVMCodeGenImpl::visit(XorPtr v) {
   if (!lfp && !rfp) {
     value_ = irb_.CreateXor(lhs, rhs);
   } else {
-    throw malformed_input("llvm_codgen: bad type in Xor", v);
+    throw malformed_input("llvm_codegen: bad type in Xor", v);
   }
 }
 
@@ -808,7 +808,7 @@ void LLVMCodeGenImpl::visit(LshiftPtr v) {
   if (!lfp && !rfp) {
     value_ = irb_.CreateShl(lhs, rhs);
   } else {
-    throw malformed_input("llvm_codgen: bad type in Lshift", v);
+    throw malformed_input("llvm_codegen: bad type in Lshift", v);
   }
 }
 
@@ -827,7 +827,7 @@ void LLVMCodeGenImpl::visit(RshiftPtr v) {
       value_ = irb_.CreateLShr(lhs, rhs);
     }
   } else {
-    throw malformed_input("llvm_codgen: bad type in Rshift", v);
+    throw malformed_input("llvm_codegen: bad type in Rshift", v);
   }
 }
 
@@ -842,7 +842,7 @@ void LLVMCodeGenImpl::visit(ModPtr v) {
   if (!lfp && !rfp) {
     value_ = irb_.CreateSRem(lhs, rhs);
   } else {
-    throw malformed_input("llvm_codgen: bad type in Mod", v);
+    throw malformed_input("llvm_codegen: bad type in Mod", v);
   }
 }
 
diff --git a/torch/csrc/lazy/core/config.cpp b/torch/csrc/lazy/core/config.cpp
index d87036767be59..c39fd8fef75a4 100644
--- a/torch/csrc/lazy/core/config.cpp
+++ b/torch/csrc/lazy/core/config.cpp
@@ -10,7 +10,7 @@ C10_DEFINE_bool(
 C10_DEFINE_bool(
     torch_lazy_handle_special_scalars,
     false,
-    "Handle special scalars 0 and 1 diffrently");
+    "Handle special scalars 0 and 1 differently");
 
 C10_DEFINE_bool(
     torch_lazy_all_numbers_special_scalars,
diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp
index 08a20c84805e9..f4fb4dd1eee12 100644
--- a/torch/csrc/profiler/util.cpp
+++ b/torch/csrc/profiler/util.cpp
@@ -347,7 +347,7 @@ static bool validateInput(
     const c10::ArrayRef<int>& should_be_tensor) {
   std::stringstream ss;
   if (inputs.size() < min_size) {
-    ss << "Failed to save extra arguments for flops compuation of op "
+    ss << "Failed to save extra arguments for flops computation of op "
        << op_name << ", min size: " << min_size
        << ", actual size: " << inputs.size();
     TORCH_WARN(ss.str());
@@ -355,7 +355,7 @@ static bool validateInput(
   }
   for (auto index : should_be_tensor) {
     if (!inputs[index].isTensor()) {
-      ss << "Failed to save extra arguments for flops compuation of op "
+      ss << "Failed to save extra arguments for flops computation of op "
          << op_name << ", input[" << index << "] must be a tensor.";
       TORCH_WARN(ss.str());
       return false;

From 7da1ee66da3ce15b1f19fdcbfb9e3d336a76d075 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh@meta.com>
Date: Mon, 14 Nov 2022 19:27:46 +0000
Subject: [PATCH 0871/1922] [quant][executorch] Support quant fusion for cat in
 quant in executorch stack (#88960)

Summary:
* added cat in executorch backend config
* added quant fusion for "dq - cat - q" pattern

Test Plan: buck run executorch/exir/tests:quant_fusion_pass -- "executorch.exir.tests.test_quant_fusion_pass.TestQuantFusionPass.test_cat"

Reviewed By: qihqi

Differential Revision: D41111054

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88960
Approved by: https://github.com/JacobSzwejbka
---
 torch/ao/quantization/backend_config/executorch.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/torch/ao/quantization/backend_config/executorch.py b/torch/ao/quantization/backend_config/executorch.py
index 3c729327de760..627143c000991 100644
--- a/torch/ao/quantization/backend_config/executorch.py
+++ b/torch/ao/quantization/backend_config/executorch.py
@@ -200,6 +200,14 @@ def _get_bn_configs() -> List[BackendPatternConfig]:
             .set_dtype_configs(dtype_configs))
     return bn_configs
 
+def _get_cat_configs() -> List[BackendPatternConfig]:
+    dtype_configs = [executorch_default_op_quint8_dtype_config]
+    cat_configs = []
+    cat_configs.append(
+        BackendPatternConfig(torch.cat)
+        .set_observation_type(ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT)
+        .set_dtype_configs(dtype_configs))
+    return cat_configs
 
 # =====================
 # |  BACKEND CONFIGS  |
@@ -214,4 +222,5 @@ def get_executorch_backend_config() -> BackendConfig:
         .set_backend_pattern_configs(_get_conv_configs()) \
         .set_backend_pattern_configs(_get_binary_ops_configs()) \
         .set_backend_pattern_configs(_get_share_qparams_ops_configs()) \
-        .set_backend_pattern_configs(_get_bn_configs())
+        .set_backend_pattern_configs(_get_bn_configs()) \
+        .set_backend_pattern_configs(_get_cat_configs())

From 30f6f2f8a75b63cba4f9e82afcff2f0743ba2c42 Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Mon, 14 Nov 2022 14:43:15 +0000
Subject: [PATCH 0872/1922] Remove skip (#88979)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88979
Approved by: https://github.com/voznesenskym
---
 test/inductor/test_torchinductor.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index ba1f9032d97f0..23fb2f7712e04 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4791,9 +4791,6 @@ def forward(self, x):
                     for param in model_opt.parameters():
                         param.add_(1.0)
 
-        # Probably fails due to the symint math issue caught while adding
-        # max_pool2d_with_indices_backward
-        @unittest.skip("Accuracy failure, needs debugging")
         def test_accuracy_issue1(self):
             class Repro(torch.nn.Module):
                 def __init__(self):

From 45fd4215d0910d3129e377096ee573c7c24b6930 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 14 Nov 2022 14:34:01 -0500
Subject: [PATCH 0873/1922] woof (#89010)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Differential Revision: [D41276175](https://our.internmc.facebook.com/intern/diff/D41276175)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89010
Approved by: https://github.com/bigfootjon
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index bcce2997b25b6..49bd2dfed706d 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 ![PyTorch Logo](https://github.com/pytorch/pytorch/blob/master/docs/source/_static/img/pytorch-logo-dark.png)
 
+woof
+
 --------------------------------------------------------------------------------
 
 PyTorch is a Python package that provides two high-level features:

From 87477ad39843ec8d5fc087b7d0dab027e4da1892 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 14 Nov 2022 21:21:09 +0000
Subject: [PATCH 0874/1922] Revert "woof (#89010)"

This reverts commit 4570bd6030c97577d2fa994857d0a022ef7563a4.

Reverted https://github.com/pytorch/pytorch/pull/89010 on behalf of https://github.com/ezyang due to whoops this actually landed
---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index 49bd2dfed706d..bcce2997b25b6 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,5 @@
 ![PyTorch Logo](https://github.com/pytorch/pytorch/blob/master/docs/source/_static/img/pytorch-logo-dark.png)
 
-woof
-
 --------------------------------------------------------------------------------
 
 PyTorch is a Python package that provides two high-level features:

From 55c6870745726cf6fa9ef35d03a16431adcb2295 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Mon, 14 Nov 2022 21:54:46 +0000
Subject: [PATCH 0875/1922] [CI] Push `latest` and hash+CUDAver tags (#88971)

For nightly docker build to simulate the behavior of `push_nightly_docker_ghcr.yml`

Tested in https://github.com/pytorch/pytorch/actions/runs/3465221336/jobs/5787694933

Fixes https://github.com/pytorch/pytorch/issues/88833

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88971
Approved by: https://github.com/seemethere
---
 .github/workflows/docker-release.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml
index d1b9209c4076e..0f9638e210ade 100644
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@@ -91,6 +91,20 @@ jobs:
         # WITH_PUSH is used here to determine whether or not to add the --push flag
         run: |
           make -f docker.Makefile "${BUILD_IMAGE_TYPE}-image"
+      - name: Push nightly tags
+        if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' }}
+        run: |
+          PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-runtime"
+          CUDA_VERSION=$(python3 -c "import re;print(re.search('CUDA_VERSION\s+=\s+([0-9\.]+)',open('docker.Makefile').read())[1],end='')")
+          PYTORCH_NIGHTLY_COMMIT=$(docker run ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
+                                          python -c 'import torch; print(torch.version.git_version[:7],end="")')
+          docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \
+                 ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}"
+          docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}"
+
+          docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" \
+                 ghcr.io/pytorch/pytorch-nightly:latest
+          docker push ghcr.io/pytorch/pytorch-nightly:latest
       - name: Teardown Linux
         uses: pytorch/test-infra/.github/actions/teardown-linux@main
         if: always()

From 89c59550821c50fc20f4c4f3b435ef1221b85d02 Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Mon, 14 Nov 2022 22:11:29 +0000
Subject: [PATCH 0876/1922] support running test_mobile_profiler with
 buck1/buck2 and OSS (#89001)

Summary:
Internally we are switching to a new version of buck, but we also must
keep this working in OSS.

Test Plan: Rely on CI.

Differential Revision: D41270673

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89001
Approved by: https://github.com/r-barnes, https://github.com/osalpekar, https://github.com/malfet
---
 .../lite_interpreter_runtime/CMakeLists.txt   |  1 +
 test/cpp/lite_interpreter_runtime/resources.h | 19 +++++++++++
 .../test_mobile_profiler.cpp                  | 34 ++++++++-----------
 3 files changed, 34 insertions(+), 20 deletions(-)
 create mode 100644 test/cpp/lite_interpreter_runtime/resources.h

diff --git a/test/cpp/lite_interpreter_runtime/CMakeLists.txt b/test/cpp/lite_interpreter_runtime/CMakeLists.txt
index 6a2e6db6eaa97..b75ba4ed984ef 100644
--- a/test/cpp/lite_interpreter_runtime/CMakeLists.txt
+++ b/test/cpp/lite_interpreter_runtime/CMakeLists.txt
@@ -25,6 +25,7 @@ target_link_libraries(test_lite_interpreter_runtime PRIVATE torch gtest backend_
 
 if(LINUX)
   target_link_libraries(test_lite_interpreter_runtime PRIVATE "-Wl,--no-as-needed,$<TARGET_FILE:backend_with_compiler_runtime>,--as-needed")
+  target_link_libraries(test_lite_interpreter_runtime PRIVATE stdc++fs)
 endif()
 
 if(INSTALL_TEST)
diff --git a/test/cpp/lite_interpreter_runtime/resources.h b/test/cpp/lite_interpreter_runtime/resources.h
new file mode 100644
index 0000000000000..07f13ca8b86a0
--- /dev/null
+++ b/test/cpp/lite_interpreter_runtime/resources.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <experimental/filesystem>
+#include <string>
+
+namespace torch {
+namespace testing {
+
+/// Gets the path to the resource identified by name.
+///
+/// @param name identifies a resource, relative path starting from the
+///             repo root
+inline auto getResourcePath(std::string name)
+    -> std::experimental::filesystem::path {
+  return std::move(name);
+}
+
+} // namespace testing
+} // namespace torch
diff --git a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
index 08cb81ae78763..df9cb9cea28c6 100644
--- a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
+++ b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
@@ -11,6 +11,8 @@
 
 #include <torch/csrc/profiler/events.h>
 
+#include "test/cpp/lite_interpreter_runtime/resources.h"
+
 #ifdef EDGE_PROFILER_USE_KINETO
 namespace torch {
 namespace jit {
@@ -42,16 +44,15 @@ bool checkMetaData(
 } // namespace
 
 TEST(MobileProfiler, ModuleHierarchy) {
-  std::string filePath(__FILE__);
-  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
-  testModelFile.append("to_be_profiled_module.ptl");
+  auto testModelFile = torch::testing::getResourcePath(
+      "test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
   inputs.emplace_back(at::rand({64, 64}));
   std::string trace_file_name("/tmp/test_trace.trace");
 
-  mobile::Module bc = _load_for_mobile(testModelFile);
+  mobile::Module bc = _load_for_mobile(testModelFile.string());
   {
     KinetoEdgeCPUProfiler profiler(
         bc,
@@ -95,16 +96,15 @@ TEST(MobileProfiler, ModuleHierarchy) {
 }
 
 TEST(MobileProfiler, Backend) {
-  std::string filePath(__FILE__);
-  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
-  testModelFile.append("test_backend_for_profiling.ptl");
+  auto testModelFile = torch::testing::getResourcePath(
+      "test/cpp/lite_interpreter_runtime/test_backend_for_profiling.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
   inputs.emplace_back(at::rand({64, 64}));
   std::string trace_file_name("/tmp/test_trace_backend.trace");
 
-  mobile::Module bc = _load_for_mobile(testModelFile);
+  mobile::Module bc = _load_for_mobile(testModelFile.string());
   {
     KinetoEdgeCPUProfiler profiler(
         bc,
@@ -130,16 +130,15 @@ TEST(MobileProfiler, Backend) {
 }
 
 TEST(MobileProfiler, BackendMemoryEvents) {
-  std::string filePath(__FILE__);
-  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
-  testModelFile.append("test_backend_for_profiling.ptl");
+  auto testModelFile = torch::testing::getResourcePath(
+      "test/cpp/lite_interpreter_runtime/test_backend_for_profiling.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
   inputs.emplace_back(at::rand({64, 64}));
   std::string trace_file_name("/tmp/test_trace_backend_memory.trace");
 
-  mobile::Module bc = _load_for_mobile(testModelFile);
+  mobile::Module bc = _load_for_mobile(testModelFile.string());
   {
     mobile::KinetoEdgeCPUProfiler profiler(
         bc,
@@ -163,13 +162,8 @@ TEST(MobileProfiler, BackendMemoryEvents) {
 }
 
 TEST(MobileProfiler, ProfilerEvent) {
-  /*
-   * TODO: Using __FILE__ is unreliable e.g. it fails to resolve correctly when
-   * using buck2, works ok with buck1
-   */
-  std::string filePath(__FILE__);
-  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
-  testModelFile.append("test_backend_for_profiling.ptl");
+  auto testModelFile = torch::testing::getResourcePath(
+      "test/cpp/lite_interpreter_runtime/test_backend_for_profiling.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
@@ -180,7 +174,7 @@ TEST(MobileProfiler, ProfilerEvent) {
       torch::profiler::ProfilerPerfEvents.begin(),
       torch::profiler::ProfilerPerfEvents.end());
 
-  mobile::Module bc = _load_for_mobile(testModelFile);
+  mobile::Module bc = _load_for_mobile(testModelFile.string());
   {
     // Bail if something goes wrong here
     try {

From d52476f91bb25921b59fce8ef55c7cf8df465680 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Mon, 14 Nov 2022 22:45:50 +0000
Subject: [PATCH 0877/1922] [Dynamo] Fix torch.is_tensor and
 torch.overrides.is_tensor_like (#88704)

Fixes error from 7k github models: https://github.com/jansel/pytorch-jit-paritybench/blob/master/generated/test_arashwan_matrixnet.py

Error:
```
AssertionError: torch.* op returned non-Tensor bool call_function <function is_tensor at 0x7fca94d0faf0>

from user code:
   File "/scratch/ybliang/work/repos/pytorch-jit-paritybench/generated/test_arashwan_matrixnet.py", line 749, in scatter
      return scatter_map(inputs)
   File "/scratch/ybliang/work/repos/pytorch-jit-paritybench/generated/test_arashwan_matrixnet.py", line 741, in scatter_map
      assert not torch.is_tensor(obj), 'Tensors not supported in scatter.'
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88704
Approved by: https://github.com/jansel
---
 test/dynamo/test_misc.py         | 43 ++++++++++++++++++++++++++++++++
 torch/_dynamo/variables/torch.py | 21 +++++++++-------
 2 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index e270852fc5269..e27f7bc5198dd 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -400,6 +400,23 @@ def fn(a, b):
 
         return torch._dynamo.testing.standard_test(self, fn=fn, nargs=2, expected_ops=3)
 
+    def test_is_tensor2(self):
+        def fn(x):
+            if torch.is_tensor(x):
+                return x + 1
+            else:
+                return torch.ones([2, 3])
+
+        x1 = {"input": torch.rand(2, 3)}
+        x2 = torch.rand(2, 3)
+        ref1 = fn(x1)
+        ref2 = fn(x2)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        res1 = opt_fn(x1)
+        res2 = opt_fn(x2)
+        self.assertEqual(ref1, res1)
+        self.assertEqual(ref2, res2)
+
     def test_numel(self):
         def fn(a):
             return a + a.numel() + torch.numel(a)
@@ -1244,6 +1261,32 @@ def f(x):
         self.assertTrue(same(ref0, res0))
         self.assertTrue(same(ref1, res1))
 
+    def test_is_tensor_like2(self):
+        class MyTensor(object):
+            @classmethod
+            def __torch_function__(cls, func, types, args=(), kwargs=None):
+                if kwargs is None:
+                    kwargs = {}
+
+                if func is torch.max:
+                    return torch.tensor(123)
+                return func(*args, **kwargs)
+
+        def fn(x):
+            if torch.overrides.is_tensor_like(x):
+                return torch.max(x)
+            else:
+                return torch.zeros(1)
+
+        x = MyTensor()
+        ref0 = fn(x)
+        ref1 = fn(4)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        res0 = opt_fn(x)
+        res1 = opt_fn(4)
+        self.assertTrue(same(ref0, res0))
+        self.assertTrue(same(ref1, res1))
+
     def test_version_ci(self):
         # temporary test to check that the ci torch version is set correctly
         self.assertTrue(hasattr(torch, "_subclasses"))
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 0debfe9e9f3c9..3b9b552542ac0 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -163,8 +163,6 @@ def can_constant_fold_through(self):
             torch.finfo,
             torch.iinfo,
             torch.is_floating_point,
-            torch.is_tensor,
-            torch.overrides.is_tensor_like,
         ):
             return True
         return getattr(self.value, "__module__", None) == "math"
@@ -177,9 +175,9 @@ def call_function(
             DynamicShapeVariable,
             GradModeVariable,
             TensorVariable,
+            UserDefinedObjectVariable,
         )
 
-        # print("CALLING ON TORCH", self.value)
         from .builder import wrap_fx_proxy
 
         constant_args = check_constant_args(args, kwargs)
@@ -206,21 +204,26 @@ def call_function(
                 return self._call_cross_entropy_loss(tx, args, kwargs, options)
             else:
                 unimplemented(f"construct nn.Module: {self.value.__name__}")
+        elif self.value in (torch.is_tensor, torch.overrides.is_tensor_like):
+            assert len(args) == 1
+            if isinstance(args[0], TensorVariable) or (
+                self.value is torch.overrides.is_tensor_like
+                and isinstance(args[0], UserDefinedObjectVariable)
+                and hasattr(args[0].value, "__torch_function__")
+            ):
+                return ConstantVariable(True, **options)
+            else:
+                return ConstantVariable(False, **options)
         elif (
             self.value
             in (
-                torch.is_tensor,
                 torch.is_floating_point,
                 torch.is_complex,
-                torch.overrides.is_tensor_like,
-                torch.is_complex,
             )
             and isinstance(args[0], TensorVariable)
             and args[0].dtype is not None
         ):
-            if self.value in (torch.is_tensor, torch.overrides.is_tensor_like):
-                return ConstantVariable(True, **options)
-            elif self.value is torch.is_floating_point:
+            if self.value is torch.is_floating_point:
                 return ConstantVariable(args[0].dtype.is_floating_point, **options)
             elif self.value is torch.is_complex:
                 return ConstantVariable(args[0].dtype.is_complex, **options)

From 14ec19a1d852a668f3d10f66605a6d984b191a9f Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Mon, 14 Nov 2022 23:24:31 +0000
Subject: [PATCH 0878/1922] [testing] fix a key in parse_namespace() (#88969)

This PR fixes an incorrect key name of `mappings` dict in `parse_namespace()`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88969
Approved by: https://github.com/kit1980
---
 test/functorch/xfail_suggester.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/functorch/xfail_suggester.py b/test/functorch/xfail_suggester.py
index 4ae552a44bd3c..cdf2cca13671c 100644
--- a/test/functorch/xfail_suggester.py
+++ b/test/functorch/xfail_suggester.py
@@ -69,7 +69,7 @@ def parse_namespace(base):
         'linalg_': 'linalg',
         '_masked_': '_masked',
         'sparse_': 'sparse',
-        'speical_': 'special',
+        'special_': 'special',
     }
     for heading in mappings.keys():
         if base.startswith(heading):

From 2d96183d74c05e94821be0682f21372f282b9e75 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 14 Nov 2022 23:36:17 +0000
Subject: [PATCH 0879/1922] Revert "support running test_mobile_profiler with
 buck1/buck2 and OSS (#89001)"

This reverts commit 3b33a2794e07b5216aa473da67755af3aa6e6433.

Reverted https://github.com/pytorch/pytorch/pull/89001 on behalf of https://github.com/kit1980 due to Broke trunk / macos-12-py3-x86-64-lite-interpreter / build
---
 .../lite_interpreter_runtime/CMakeLists.txt   |  1 -
 test/cpp/lite_interpreter_runtime/resources.h | 19 -----------
 .../test_mobile_profiler.cpp                  | 34 +++++++++++--------
 3 files changed, 20 insertions(+), 34 deletions(-)
 delete mode 100644 test/cpp/lite_interpreter_runtime/resources.h

diff --git a/test/cpp/lite_interpreter_runtime/CMakeLists.txt b/test/cpp/lite_interpreter_runtime/CMakeLists.txt
index b75ba4ed984ef..6a2e6db6eaa97 100644
--- a/test/cpp/lite_interpreter_runtime/CMakeLists.txt
+++ b/test/cpp/lite_interpreter_runtime/CMakeLists.txt
@@ -25,7 +25,6 @@ target_link_libraries(test_lite_interpreter_runtime PRIVATE torch gtest backend_
 
 if(LINUX)
   target_link_libraries(test_lite_interpreter_runtime PRIVATE "-Wl,--no-as-needed,$<TARGET_FILE:backend_with_compiler_runtime>,--as-needed")
-  target_link_libraries(test_lite_interpreter_runtime PRIVATE stdc++fs)
 endif()
 
 if(INSTALL_TEST)
diff --git a/test/cpp/lite_interpreter_runtime/resources.h b/test/cpp/lite_interpreter_runtime/resources.h
deleted file mode 100644
index 07f13ca8b86a0..0000000000000
--- a/test/cpp/lite_interpreter_runtime/resources.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma once
-
-#include <experimental/filesystem>
-#include <string>
-
-namespace torch {
-namespace testing {
-
-/// Gets the path to the resource identified by name.
-///
-/// @param name identifies a resource, relative path starting from the
-///             repo root
-inline auto getResourcePath(std::string name)
-    -> std::experimental::filesystem::path {
-  return std::move(name);
-}
-
-} // namespace testing
-} // namespace torch
diff --git a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
index df9cb9cea28c6..08cb81ae78763 100644
--- a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
+++ b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
@@ -11,8 +11,6 @@
 
 #include <torch/csrc/profiler/events.h>
 
-#include "test/cpp/lite_interpreter_runtime/resources.h"
-
 #ifdef EDGE_PROFILER_USE_KINETO
 namespace torch {
 namespace jit {
@@ -44,15 +42,16 @@ bool checkMetaData(
 } // namespace
 
 TEST(MobileProfiler, ModuleHierarchy) {
-  auto testModelFile = torch::testing::getResourcePath(
-      "test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl");
+  std::string filePath(__FILE__);
+  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
+  testModelFile.append("to_be_profiled_module.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
   inputs.emplace_back(at::rand({64, 64}));
   std::string trace_file_name("/tmp/test_trace.trace");
 
-  mobile::Module bc = _load_for_mobile(testModelFile.string());
+  mobile::Module bc = _load_for_mobile(testModelFile);
   {
     KinetoEdgeCPUProfiler profiler(
         bc,
@@ -96,15 +95,16 @@ TEST(MobileProfiler, ModuleHierarchy) {
 }
 
 TEST(MobileProfiler, Backend) {
-  auto testModelFile = torch::testing::getResourcePath(
-      "test/cpp/lite_interpreter_runtime/test_backend_for_profiling.ptl");
+  std::string filePath(__FILE__);
+  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
+  testModelFile.append("test_backend_for_profiling.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
   inputs.emplace_back(at::rand({64, 64}));
   std::string trace_file_name("/tmp/test_trace_backend.trace");
 
-  mobile::Module bc = _load_for_mobile(testModelFile.string());
+  mobile::Module bc = _load_for_mobile(testModelFile);
   {
     KinetoEdgeCPUProfiler profiler(
         bc,
@@ -130,15 +130,16 @@ TEST(MobileProfiler, Backend) {
 }
 
 TEST(MobileProfiler, BackendMemoryEvents) {
-  auto testModelFile = torch::testing::getResourcePath(
-      "test/cpp/lite_interpreter_runtime/test_backend_for_profiling.ptl");
+  std::string filePath(__FILE__);
+  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
+  testModelFile.append("test_backend_for_profiling.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
   inputs.emplace_back(at::rand({64, 64}));
   std::string trace_file_name("/tmp/test_trace_backend_memory.trace");
 
-  mobile::Module bc = _load_for_mobile(testModelFile.string());
+  mobile::Module bc = _load_for_mobile(testModelFile);
   {
     mobile::KinetoEdgeCPUProfiler profiler(
         bc,
@@ -162,8 +163,13 @@ TEST(MobileProfiler, BackendMemoryEvents) {
 }
 
 TEST(MobileProfiler, ProfilerEvent) {
-  auto testModelFile = torch::testing::getResourcePath(
-      "test/cpp/lite_interpreter_runtime/test_backend_for_profiling.ptl");
+  /*
+   * TODO: Using __FILE__ is unreliable e.g. it fails to resolve correctly when
+   * using buck2, works ok with buck1
+   */
+  std::string filePath(__FILE__);
+  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
+  testModelFile.append("test_backend_for_profiling.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
@@ -174,7 +180,7 @@ TEST(MobileProfiler, ProfilerEvent) {
       torch::profiler::ProfilerPerfEvents.begin(),
       torch::profiler::ProfilerPerfEvents.end());
 
-  mobile::Module bc = _load_for_mobile(testModelFile.string());
+  mobile::Module bc = _load_for_mobile(testModelFile);
   {
     // Bail if something goes wrong here
     try {

From ade4e67e76333e2436f074ae8403c089d85e4c56 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 14 Nov 2022 10:49:20 -0500
Subject: [PATCH 0880/1922] Fix some naughty uses of reshape/flatten (#88999)

Mutating after reshape/flatten is bad! And it turns out
the corresponding view operations are guaranteed to work
too.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88999
Approved by: https://github.com/albanD
---
 torch/autograd/gradcheck.py            | 2 +-
 torch/testing/_internal/opinfo/core.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py
index 2f43423a2bd6f..46d4f370a99ae 100644
--- a/torch/autograd/gradcheck.py
+++ b/torch/autograd/gradcheck.py
@@ -1164,7 +1164,7 @@ def _vec_from_tensor(x, generator, downcast_complex=False):
         dtype = _to_real_dtype(x.dtype) if downcast_complex else x.dtype
         values = torch.rand(x_values.numel(), generator=generator) \
             .to(dtype=dtype, device=x.device) \
-            .reshape(x_values.shape)
+            .view(x_values.shape)
         values /= values.norm()
         vec = torch.sparse_coo_tensor(x._indices(), values, x.size())
     else:
diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py
index 1114d68518320..4f4ab79c2256f 100644
--- a/torch/testing/_internal/opinfo/core.py
+++ b/torch/testing/_internal/opinfo/core.py
@@ -1732,11 +1732,11 @@ def generate_elementwise_binary_extremal_value_tensors(
     lhs = make_tensor(
         (128, 128), device=device, dtype=dtype, requires_grad=requires_grad
     )
-    lhs.flatten()[::3] = nan
+    lhs.view(-1)[::3] = nan
     rhs = make_tensor(
         (128, 128), device=device, dtype=dtype, requires_grad=requires_grad
     )
-    rhs.flatten()[::3] = nan
+    rhs.view(-1)[::3] = nan
 
     yield SampleInput(lhs, args=(rhs,))
 

From b4f093892ecbc3c4f4865f2a1926572adc41c9f3 Mon Sep 17 00:00:00 2001
From: wswartworth <wswartworth@gmail.com>
Date: Mon, 14 Nov 2022 23:58:46 +0000
Subject: [PATCH 0881/1922] improving torch.linalg.lstsq documentation
 formatting (#89013)

Fixes #80441

The highlighting in the documentation for torch.linalg.lstsq was incorrect due to a newline that sphinx doesn't parse correctly.  Instead of writing the tensors directly, I used randn to generate the tensors.  This seems to be more consistent with how other documentation is written.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89013
Approved by: https://github.com/lezcano
---
 torch/linalg/__init__.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py
index e78cbbb3be357..3ec9a383546bf 100644
--- a/torch/linalg/__init__.py
+++ b/torch/linalg/__init__.py
@@ -1084,16 +1084,26 @@
 
 Examples::
 
-    >>> A = torch.tensor([[[10, 2, 3], [3, 10, 5], [5, 6, 12]]], dtype=torch.float) # shape (1, 3, 3)
-    >>> B = torch.tensor([[[2, 5, 1], [3, 2, 1], [5, 1, 9]],
-                          [[4, 2, 9], [2, 0, 3], [2, 5, 3]]], dtype=torch.float) # shape (2, 3, 3)
+    >>> A = torch.randn(1,3,3)
+    >>> A
+    tensor([[[-1.0838,  0.0225,  0.2275],
+         [ 0.2438,  0.3844,  0.5499],
+         [ 0.1175, -0.9102,  2.0870]]])
+    >>> B = torch.randn(2,3,3)
+    >>> B
+    tensor([[[-0.6772,  0.7758,  0.5109],
+         [-1.4382,  1.3769,  1.1818],
+         [-0.3450,  0.0806,  0.3967]],
+        [[-1.3994, -0.1521, -0.1473],
+         [ 1.9194,  1.0458,  0.6705],
+         [-1.1802, -0.9796,  1.4086]]])
     >>> X = torch.linalg.lstsq(A, B).solution # A is broadcasted to shape (2, 3, 3)
     >>> torch.dist(X, torch.linalg.pinv(A) @ B)
-    tensor(2.0862e-07)
+    tensor(1.5152e-06)
 
     >>> S = torch.linalg.lstsq(A, B, driver='gelsd').singular_values
     >>> torch.dist(S, torch.linalg.svdvals(A))
-    tensor(5.7220e-06)
+    tensor(2.3842e-07)
 
     >>> A[:, 0].zero_()  # Decrease the rank of A
     >>> rank = torch.linalg.lstsq(A, B).rank

From f2fdac25c03b244f5f744b600c6c129841c8bc7b Mon Sep 17 00:00:00 2001
From: Jongsoo Park <jongsoo@meta.com>
Date: Tue, 15 Nov 2022 00:48:49 +0000
Subject: [PATCH 0882/1922] [inductor] fix could not find as_strided with
 config.triton.mm=triton (#88946)

Summary: ReinterpretView doesn't seem to be handled properly with matrix multiply Triton kernels

Reviewed By: bertmaher

Differential Revision: D40836677

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88946
Approved by: https://github.com/jansel
---
 torch/_inductor/codegen/common.py          | 12 ++++++++++++
 torch/_inductor/codegen/triton_template.py |  2 +-
 torch/_inductor/graph.py                   |  4 ++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index a949effb26793..932e8c91bc7da 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -283,6 +283,8 @@ def input(self, name):
         assert name not in V.graph.removed_buffers, name
         if name in self.output_buffers:
             return self.output_buffers[name]
+        if name in self.inplace_buffers:
+            return self.inplace_buffers[name].inner_name
         if name.startswith("seed"):
             return self._lookup("seed", self.input_buffers, name)
         return self._lookup("in_ptr", self.input_buffers, name)
@@ -290,6 +292,8 @@ def input(self, name):
     def output(self, name):
         name = V.graph.scheduler.mutation_real_name.get(name, name)
         assert name not in V.graph.removed_buffers, name
+        if name in self.inplace_buffers:
+            return self.inplace_buffers[name].inner_name
         return self._lookup("out_ptr", self.output_buffers, name)
 
     def make_inplace(self, input_name, output_name):
@@ -392,6 +396,14 @@ def aliases(self):
                 if other in self.output_buffers:
                     yield self.output_buffers[other], inplaced.inner_name
 
+    def is_removed(self, name):
+        def _is_removed(name, buffers):
+            return name not in buffers or buffers[name] == "REMOVED"
+
+        return _is_removed(name, self.output_buffers) and _is_removed(
+            name, self.inplace_buffers
+        )
+
 
 class CSE:
     """Common subexpression elimination"""
diff --git a/torch/_inductor/codegen/triton_template.py b/torch/_inductor/codegen/triton_template.py
index 0de771ff65749..cd1c2bed6bb7c 100644
--- a/torch/_inductor/codegen/triton_template.py
+++ b/torch/_inductor/codegen/triton_template.py
@@ -330,7 +330,7 @@ def template_codegen(scheduler, scheduler_node, epilogue):
     kernel_buf_replace_name = None
     if could_remove_kernel_buf:
         for node in epilogue:
-            if kernel.args.output_buffers[node.get_name()] != "REMOVED":
+            if not kernel.args.is_removed(node.get_name()):
                 kernel_buf_replace_name = node.get_name()
                 break
         assert kernel_buf_replace_name is not None
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index f69a891fca7ba..e0e41fd8afa5d 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -1,6 +1,7 @@
 import logging
 import operator
 import os
+import re
 import time
 
 import sympy
@@ -90,6 +91,9 @@ def get_dtype(self, buffer_name):
             return self.name_to_buffer[buffer_name].get_dtype()
         if buffer_name in self.graph_inputs:
             return self.graph_inputs[buffer_name].get_dtype()
+        m = re.match(r"as_strided\(([a-zA-Z0-9_]+),", buffer_name)
+        if m:
+            return self.get_dtype(m.group(1))
         raise KeyError(f"could not find {buffer_name}")
 
     def random_seed_buffer(self, device: torch.device):

From fa93a0a323130845222f508014e44bbd6e39db15 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Mon, 14 Nov 2022 07:40:32 -0500
Subject: [PATCH 0883/1922] Take input striding for conv fusion op  based on
 eager output (#88864)

As https://github.com/pytorch/pytorch/pull/88706, we also change the input stride check using eager output.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88864
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 torch/_inductor/ir.py | 95 ++++++++++++++-----------------------------
 1 file changed, 30 insertions(+), 65 deletions(-)

diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 8a2e26ee9b94c..fdc10c9ca16a7 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3346,21 +3346,20 @@ def _prepare_convolution_fusion_create(
     function only supports the CPU device since conv post-op fusion kernel is only
     supported on CPU right now.
     """
-
-    x = cls.require_stride1(cls.realize_input(x))
-    weight = cls.require_stride1(cls.realize_input(weight))
-    assert x.get_device().type == "cpu" and weight.get_device().type == "cpu"
-    inputs = [x, weight]
     stride = tuple(stride_)
     padding = tuple(padding_)
     dilation = tuple(dilation_)
     assert isinstance(groups, int)
-    with FakeTensorMode():
-        output, *_ = cls.process_kernel(
-            torch.ops.aten.convolution,
-            x,
-            weight,
-            bias,
+    with torch._subclasses.FakeTensorMode():
+        x_fake = ir_node_to_tensor(x, guard_shape=True)
+        weight_fake = ir_node_to_tensor(weight, guard_shape=True)
+        bias_fake = (
+            ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias
+        )
+        output = torch.ops.aten.convolution(
+            x_fake,
+            weight_fake,
+            bias_fake,
             stride,
             padding,
             dilation,
@@ -3368,29 +3367,18 @@ def _prepare_convolution_fusion_create(
             [0, 0],
             groups,
         )
+        req_stride_order = get_stride_order(output.stride())
 
-    output_size = output.shape
-    weight_shape = [
-        sympy.Integer(V.graph.sizevars.guard_static_shape(s)) for s in weight.get_size()
-    ]
-    _, _, *kernel_size = weight_shape
-    output_layout_str = (
-        "torch.contiguous_format" if output.is_contiguous() else "torch.channels_last"
-    )
-
-    if output_layout_str == "torch.channels_last":
-        stride_order = [0] + list(reversed(range(1, len(kernel_size) + 1)))
-        if len(stride_order) < len(output_size):
-            # add batch dim if it exists
-            stride_order = [len(stride_order)] + stride_order
-    else:
-        stride_order = list(reversed(range(len(output_size))))
+    x = cls.require_stride_order(x, req_stride_order)
+    weight = cls.require_stride1(cls.realize_input(weight))
+    assert x.get_device().type == "cpu" and weight.get_device().type == "cpu"
+    inputs = [x, weight]
 
-    kernel_layout = FlexibleLayout(
-        device=inputs[0].get_device(),
-        dtype=inputs[0].get_dtype(),
-        size=output_size,
-        stride_order=stride_order,
+    kernel_layout = FixedLayout(
+        x.get_device(),
+        x.get_dtype(),
+        output.size(),
+        output.stride(),
     )
     constant_args = [padding, stride, dilation, groups]
 
@@ -3398,7 +3386,7 @@ def _prepare_convolution_fusion_create(
         inputs.append(bias)
     else:
         constant_args.insert(0, bias)
-    return inputs, constant_args, kernel_layout
+    return inputs, constant_args, kernel_layout, req_stride_order
 
 
 class ConvolutionUnary(ExternKernelAlloc):
@@ -3436,7 +3424,7 @@ def create(
         algorithm,
     ):
         kernel = "torch.ops.mkldnn._convolution_pointwise"
-        (inputs, constant_args, kernel_layout,) = _prepare_convolution_fusion_create(
+        (inputs, constant_args, kernel_layout, _) = _prepare_convolution_fusion_create(
             cls, x, weight, bias, padding_, stride_, dilation_, groups
         )
         constant_args = constant_args + [attr, scalars, algorithm]
@@ -3447,13 +3435,6 @@ def create(
             kernel=kernel,
         )
 
-    def apply_constraint(self):
-        x = self.inputs[0]
-        # FixedLayout of input
-        x = self.require_stride_order(x, self.layout.preferred_stride_order)
-        self.inputs[0] = x
-        self.freeze_layout_with_stride_order(self.layout.preferred_stride_order)
-
 
 class ConvolutionBinary(ExternKernelAlloc):
     kernel = "torch.ops.mkldnn._convolution_pointwise.binary"
@@ -3493,10 +3474,15 @@ def create(
         unary_algorithm: Optional[str],
     ):
         kernel = "torch.ops.mkldnn._convolution_pointwise.binary"
-        (inputs, constant_args, kernel_layout,) = _prepare_convolution_fusion_create(
+        (
+            inputs,
+            constant_args,
+            kernel_layout,
+            req_stride_order,
+        ) = _prepare_convolution_fusion_create(
             cls, x, weight, bias, padding_, stride_, dilation_, groups
         )
-        other = cls.require_stride1(cls.realize_input(other))
+        other = cls.require_stride_order(other, req_stride_order)
         inputs.insert(1, other)
         constant_args = constant_args + [
             binary_attr,
@@ -3512,17 +3498,6 @@ def create(
             kernel=kernel,
         )
 
-    def apply_constraint(self):
-        x = self.inputs[0]
-        # FixedLayout of input
-        x = self.require_stride_order(x, self.layout.preferred_stride_order)
-        self.inputs[0] = x
-        other = self.inputs[1]
-        # FixedLayout of other
-        other = self.require_stride_order(other, self.layout.preferred_stride_order)
-        self.inputs[1] = other
-        self.freeze_layout_with_stride_order(self.layout.preferred_stride_order)
-
 
 class ConvolutionBinaryInplace(ExternKernelAlloc):
     kernel = "torch.ops.mkldnn._convolution_pointwise_.binary"
@@ -3530,14 +3505,12 @@ class ConvolutionBinaryInplace(ExternKernelAlloc):
     def __init__(
         self,
         kernel_layout,
-        inputs_layout,
         inputs,
         constant_args=(),
         kernel="torch.ops.mkldnn._convolution_pointwise_.binary",
     ):
         super().__init__(kernel_layout, inputs, constant_args)
         self.kernel = kernel
-        self.inputs_layout = inputs_layout
 
     def codegen(self, wrapper):
         wrapper.writeline(
@@ -3566,7 +3539,7 @@ def create(
         unary_algorithm: Optional[str],
     ):
         kernel = "torch.ops.mkldnn._convolution_pointwise_.binary"
-        (inputs, constant_args, inputs_layout,) = _prepare_convolution_fusion_create(
+        (inputs, constant_args, _, _) = _prepare_convolution_fusion_create(
             cls, x, weight, bias, padding_, stride_, dilation_, groups
         )
         other = cls.realize_input(other)
@@ -3581,19 +3554,11 @@ def create(
         ]
         return ConvolutionBinaryInplace(
             kernel_layout=MutationLayout(inputs[1]),
-            inputs_layout=inputs_layout,
             inputs=inputs,
             constant_args=constant_args,
             kernel=kernel,
         )
 
-    def apply_constraint(self):
-        x = self.inputs[0]
-        # FixedLayout of input
-        x = self.require_stride_order(x, self.inputs_layout.preferred_stride_order)
-        self.inputs[0] = x
-        self.freeze_layout_with_stride_order(self.inputs_layout.preferred_stride_order)
-
 
 class LinearUnary(ExternKernelAlloc):
     kernel = "torch.ops.mkldnn._linear_pointwise"

From 90a062831f4b9381d8ae342162ac11c47b8295d7 Mon Sep 17 00:00:00 2001
From: Zain Rizvi <zainr@fb.com>
Date: Tue, 15 Nov 2022 01:01:37 +0000
Subject: [PATCH 0884/1922] Use same retry logic as macos binary builds
 (#89014)

Occasionally the command to download sccache via curl fails with network errors (example below). The default curl retry option only retries errors that are considered "transient", but but the set of actual transient commands is greater than what curl considers to be transient.

This PR modifies the retry logic for downloading sccache to match what's in https://github.com/pytorch/pytorch/blob/master/.github/templates/macos_binary_build_workflow.yml.j2#L79-L89, using the retry action to ensure we both retry all transient errors, and including a longer retry delay to give the transient issue time to resolve itself.

Example failure from [this run](https://github.com/pytorch/pytorch/actions/runs/3422664884/jobs/5700595220):
```
Run sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:04 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:05 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:06 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:07 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:08 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:10 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:11 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:12 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:13 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:14 --:--:--     0
curl: (35) OpenSSL SSL_connect: Connection reset by peer in connection to s3.amazonaws.com:443
Error: Process completed with exit code 35.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89014
Approved by: https://github.com/huydhn
---
 .github/workflows/_mac-build.yml | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index 9f0c988f4a311..faf069e7a7c35 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -109,12 +109,17 @@ jobs:
           brew link --force libomp
 
       - name: Install sccache (only for non-forked PRs, and pushes to trunk)
+        uses: nick-fields/retry@v2.8.2
         if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
-        run: |
-          sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
-          sudo chmod +x /usr/local/bin/sccache
-          echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
-          echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}"
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
+            echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}"
 
       - name: Get workflow job id
         id: get-job-id

From c3f5c8dc5cd1fb3b641d0912d7ec548c56db0f23 Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Tue, 15 Nov 2022 01:10:35 +0000
Subject: [PATCH 0885/1922] Add mem efficient backward (#88856)

# Registers the derivative for mem efficient backward

- Use gradcheck to test correctness. The kernel is not implemented for fp64 so run checks with bumped tolerances in fp32
- I also made updates based off of Xformer main branch and flash-attention cutlass branch.
- This will enable the fused backward to be called for scaled dot product attention

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88856
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml    |   5 +
 .../native/transformers/cuda/attention.cu     |  16 +-
 .../transformers/cuda/attention_backward.cu   | 261 ++++++++++++++++++
 .../transformers/cuda/flash_attn/fmha_api.cpp |   4 +
 .../attention_backward_generic.cu             | 166 -----------
 .../attention_forward_generic.cu              | 232 ----------------
 .../cuda/mem_eff_attention/find_default_mma.h |   7 +-
 .../cuda/mem_eff_attention/kernel_backward.h  | 250 +++++++++++------
 .../ATen/native/transformers/cuda/sdp_utils.h |  12 +-
 test/test_transformers.py                     |  44 ++-
 tools/autograd/derivatives.yaml               |   7 +-
 .../_internal/common_methods_invocations.py   |   4 +-
 12 files changed, 501 insertions(+), 507 deletions(-)
 create mode 100644 aten/src/ATen/native/transformers/cuda/attention_backward.cu
 delete mode 100644 aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu
 delete mode 100644 aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index de087c0b8a896..9572ccc56653d 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -13287,6 +13287,11 @@
   dispatch:
     CUDA: _efficient_attention_forward
 
+- func: _efficient_attention_backward(Tensor grad, Tensor query, Tensor key, Tensor value, Tensor logsumexp, Tensor out, bool is_causal=False) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_backward
+
 - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
   variants: function
   dispatch:
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index f65fedd6d7954..46543d4663fab 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -746,7 +746,9 @@ std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
 std::tuple<Tensor, Tensor> mem_eff_helper(
     const Tensor& query,
     const Tensor& key,
-    const Tensor& value){
+    const Tensor& value,
+    bool compute_log_sumexp,
+    bool is_causal) {
   // Query -> Query(Batch x Q_seq_len x Num_heads x Dim_per_head)
   // Key   -> Key(Batch x KV_seq_len x Num_heads x Dim_per_head)
   // Value -> Value(Batch x KV_seq_len x  Num_heads x Dim_per_head)
@@ -754,16 +756,18 @@ std::tuple<Tensor, Tensor> mem_eff_helper(
   Tensor k_t = key.transpose(1, 2);
   Tensor v_t = value.transpose(1, 2);
 
-  Tensor attention = std::get<0>(at::_efficient_attention_forward(
+  Tensor attention, log_sumexp;
+  std::tie(attention, log_sumexp) = at::_efficient_attention_forward(
       q_t,
       k_t,
       v_t,
       c10::nullopt,
       c10::nullopt,
       c10::nullopt,
-      false,
-      false)).transpose(1,2);
-  return std::make_tuple(attention, Tensor());
+      compute_log_sumexp,
+      is_causal);
+  attention = attention.transpose(1,2);
+  return std::make_tuple(std::move(attention), Tensor());
 }
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
@@ -776,7 +780,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
       case sdp::SDPBackend::flash_attention:
           return flash_attention_helper_dense_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
       case sdp::SDPBackend::efficient_attention:
-          return mem_eff_helper(query_, key , value);
+          return mem_eff_helper(query_, key , value, need_attn_weights, is_causal);
       case sdp::SDPBackend::math:
         return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
       default:
diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
new file mode 100644
index 0000000000000..af005b2669b29
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
@@ -0,0 +1,261 @@
+#include <type_traits>
+
+#include <ATen/ATen.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAMathCompat.h>
+
+#include <ATen/native/nested/NestedTensorTransformerFunctions.h>
+#include <ATen/native/nested/NestedTensorUtils.h>
+#include <ATen/native/transformers/attention.h>
+#include <ATen/native/transformers/cuda/sdp_utils.h>
+
+#ifdef USE_FLASH_ATTENTION
+#include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
+#endif
+
+#define ASSIGN_CHECK_OVERFLOW(A, B)                                            \
+  {                                                                            \
+    A = B;                                                                     \
+    TORCH_CHECK(B < std::numeric_limits<decltype(A)>::max(), #B " overflows"); \
+  }
+
+#define DISPATCH_MAXK(func)                                   \
+  {                                                           \
+    const auto maxK = std::max(query.size(3), value.size(3)); \
+    if (maxK <= 64) {                                         \
+      constexpr int kMaxK = 64;                               \
+      func();                                                 \
+    } else if (maxK <= 128) {                                 \
+      constexpr int kMaxK = 128;                              \
+      func();                                                 \
+    } else {                                                  \
+      constexpr int kMaxK = std::numeric_limits<int>::max();  \
+      func();                                                 \
+    }                                                         \
+  }
+
+#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC)                               \
+  {                                                                            \
+    cudaDeviceProp* properties =                                               \
+        at::cuda::getDeviceProperties(QUERY.device().index());                 \
+    const int computeCapability = properties->major * 10 + properties->minor;  \
+    DISPATCH_MAXK(([&] {                                                       \
+      DISPATCH_TYPES(                                                          \
+          QUERY, ([&]() {                                                      \
+            DISPATCH_ARCHTAG(                                                  \
+                computeCapability, ([&]() {                                    \
+                  using AlignedAK =                                            \
+                      AttentionBackwardKernel<ArchTag, scalar_t, true, kMaxK>; \
+                  bool isAligned =                                             \
+                      (QUERY.stride(2) % AlignedAK::kOptimalAlignement == 0 && \
+                       KEY.stride(2) % AlignedAK::kOptimalAlignement == 0 &&   \
+                       VALUE.stride(2) % AlignedAK::kOptimalAlignement == 0);  \
+                  DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {                \
+                                  using Kernel = AttentionBackwardKernel<      \
+                                      ArchTag,                                 \
+                                      scalar_t,                                \
+                                      kIsAligned,                              \
+                                      kMaxK>;                                  \
+                                  FUNC();                                      \
+                                }))                                            \
+                }))                                                            \
+          }))                                                                  \
+    }));                                                                       \
+  }
+
+namespace at {
+
+namespace native {
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
+    const at::Tensor& grad_out_,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& logsumexp,
+    const at::Tensor& out,
+    bool causal) {
+  #if defined(USE_FLASH_ATTENTION)
+  if (!grad_out_.defined()) {
+    return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
+  }
+    // ndim
+  TORCH_CHECK(query.dim() == grad_out_.dim());
+  TORCH_CHECK(query.dim() == key.dim());
+  TORCH_CHECK(query.dim() == value.dim());
+  TORCH_CHECK(query.dim() == 4);
+
+  // batch size
+  TORCH_CHECK(query.size(0) == grad_out_.size(0));
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // seqlen
+  TORCH_CHECK(key.size(1) == value.size(1));
+  TORCH_CHECK(query.size(1) == grad_out_.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) == key.size(2));
+  TORCH_CHECK(query.size(2) == value.size(2));
+  TORCH_CHECK(query.size(2) == grad_out_.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+  TORCH_CHECK(value.size(3) == grad_out_.size(3));
+
+  // handle potentially non-contiguous grad_out through a copy
+  auto grad_out = grad_out_.contiguous();
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
+
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  at::cuda::CUDAGuard device_guard(query.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t nH = query.size(2);
+  int64_t K = query.size(3);
+
+  // It does not make sense to use that in practice,
+  // but let's still make sure we are correct
+  // As we iterate through keys first, we skip
+  // keys with no query associated, so they are not
+  // initialized
+  bool grad_kv_needs_init = causal && N > M;
+  at::Tensor grad_q, grad_k, grad_v;
+  if (!grad_kv_needs_init && query.size(1) == key.size(1) &&
+      query.size(3) == value.size(3) &&
+      query.storage().is_alias_of(key.storage()) &&
+      query.storage().is_alias_of(value.storage())) {
+    // Create one big contiguous chunk
+    // This is because q, k and v usually come from a single
+    // output of a linear layer that is chunked.
+    // Creating the gradients with the right layout saves us
+    // a `torch.cat` call in the backward pass
+    at::Tensor chunk = at::empty({B, M, 3, nH, K}, query.options());
+    grad_q = chunk.select(2, 0);
+    grad_k = chunk.select(2, 1);
+    grad_v = chunk.select(2, 2);
+  } else {
+    grad_q = at::empty_like(query);
+    grad_k = grad_kv_needs_init ? at::zeros_like(key) : at::empty_like(key);
+    grad_v = grad_kv_needs_init ? at::zeros_like(value) : at::empty_like(value);
+  }
+
+  auto launchKernel = [&](auto _k, int computeCapability) {
+    using Kernel = decltype(_k);
+    using scalar_t = typename Kernel::scalar_t;
+    (void)_k;
+
+    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
+
+    // TODO: Fuse this into a kernel?
+    // This is a bottleneck for smaller sequences (M <= 128)
+    auto delta = Kernel::kKernelComputesDelta
+        ? at::empty({B, nH, M}, query.options().dtype(at::ScalarType::Float))
+        : (grad_out.to(at::kFloat) * out.to(at::kFloat))
+              .sum(-1)
+              .transpose(-2, -1)
+              .contiguous();
+    TORCH_INTERNAL_ASSERT(delta.size(0) == B);
+    TORCH_INTERNAL_ASSERT(delta.size(1) == nH);
+    TORCH_INTERNAL_ASSERT(delta.size(2) == M);
+
+    typename Kernel::Params p;
+    p.query_ptr = (scalar_t*)query.data_ptr();
+    p.key_ptr = (scalar_t*)key.data_ptr();
+    p.value_ptr = (scalar_t*)value.data_ptr();
+    p.logsumexp_ptr = (typename Kernel::lse_scalar_t*)logsumexp.data_ptr();
+    p.output_ptr = (scalar_t*)out.data_ptr();
+    p.grad_output_ptr = (scalar_t*)grad_out.data_ptr();
+    p.grad_query_ptr = (scalar_t*)grad_q.data_ptr();
+    p.grad_key_ptr = (scalar_t*)grad_k.data_ptr();
+    p.grad_value_ptr = (scalar_t*)grad_v.data_ptr();
+    p.delta_ptr = (float*)delta.data_ptr();
+    p.head_dim = query.size(3);
+    p.head_dim_value = value.size(3);
+    p.num_queries = query.size(1);
+    p.num_keys = key.size(1);
+    p.num_batches = B;
+    p.num_heads = nH;
+    p.causal = causal;
+
+    ASSIGN_CHECK_OVERFLOW(p.gO_strideB, grad_out.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.gO_strideM, grad_out.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.gO_strideH, grad_out.stride(2));
+
+    ASSIGN_CHECK_OVERFLOW(p.o_strideB, out.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.o_strideH, out.stride(2));
+
+    ASSIGN_CHECK_OVERFLOW(p.gQ_strideB, grad_q.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.gK_strideB, grad_k.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.gV_strideB, grad_v.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.gQ_strideH, grad_q.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.gK_strideH, grad_k.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.gV_strideH, grad_v.stride(2));
+    p.gQKV_strideM_multiplier = grad_q.is_contiguous() ? 1 : 3;
+    TORCH_INTERNAL_ASSERT(p.gQ_strideM() == grad_q.stride(1));
+    TORCH_INTERNAL_ASSERT(p.gK_strideM() == grad_k.stride(1));
+    TORCH_INTERNAL_ASSERT(p.gV_strideM() == grad_v.stride(1));
+
+    ASSIGN_CHECK_OVERFLOW(p.q_strideB, query.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.k_strideB, key.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.v_strideB, value.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.q_strideM, query.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.k_strideM, key.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.v_strideM, value.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.q_strideH, query.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.k_strideH, key.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.v_strideH, value.stride(2));
+
+    Kernel::check_supported(p);
+
+    constexpr auto kernel_fn = attention_kernel_backward_batched<Kernel>;
+
+    if (smem_bytes > 0xc000) {
+      TORCH_INTERNAL_ASSERT(
+          computeCapability >= 70,
+          "This kernel requires too much shared memory on this machine!");
+      cudaFuncSetAttribute(
+          kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
+    }
+
+    // second syntax resulted in the error below on windows
+    // error C3495: 'kernel_fn': a simple capture must be a variable
+    // with automatic storage duration declared
+    // in the reaching scope of the lambda
+#ifdef _WIN32
+    cudaFuncAttributes attr;
+    AT_CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel_fn));
+    TORCH_INTERNAL_ASSERT(
+        attr.binaryVersion >= Kernel::ArchTag::kMinComputeCapability,
+        "Something went wrong in the build process");
+#else
+    auto checkBinaryArchMatches = [&]() {
+      cudaFuncAttributes attr;
+      AT_CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel_fn));
+      return attr.binaryVersion >= Kernel::ArchTag::kMinComputeCapability;
+    };
+    TORCH_INTERNAL_ASSERT(
+        checkBinaryArchMatches(), "Something went wrong in the build process");
+#endif
+
+    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
+  };
+
+  DISPATCH_KERNEL(
+      query, key, value, ([&] { launchKernel(Kernel{}, computeCapability); }));
+  AT_CUDA_CHECK(cudaGetLastError());
+  return std::make_tuple(grad_q, grad_k, grad_v);
+  #endif
+  TORCH_CHECK(false, "USE_FLASH_ATTENTION was not enabled for build.")
+  return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
index a8d6110e951d9..6c86e1ff63b01 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
@@ -29,6 +29,7 @@
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <ATen/NativeFunctions.h>
 
 #include <ATen/native/transformers/cuda/flash_attn/fmha.h>
@@ -185,6 +186,9 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
     int max_seqlen_q = ((max_seqlen_q_ + 16 - 1) / 16) * 16;
     bool loop = max_seqlen_k > blocksize_c;
 
+    // Otherwise the kernel will be launched from cuda:0 device
+    at::cuda::CUDAGuard device_guard{q.get_device()};
+
     auto opts = q.options();
 
     auto o = at::empty({ total_q, num_heads, head_size }, opts);
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu
deleted file mode 100644
index 07c14ad8195dd..0000000000000
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu
+++ /dev/null
@@ -1,166 +0,0 @@
-#include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
-
-#define DISPATCH_MAXK(func)                                   \
-  {                                                           \
-    const auto maxK = std::max(query.size(2), value.size(2)); \
-    if (maxK <= 64) {                                         \
-      constexpr int kMaxK = 64;                               \
-      func();                                                 \
-    } else if (maxK <= 128) {                                 \
-      constexpr int kMaxK = 128;                              \
-      func();                                                 \
-    } else {                                                  \
-      constexpr int kMaxK = std::numeric_limits<int>::max();  \
-      func();                                                 \
-    }                                                         \
-  }
-
-#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC)                               \
-  {                                                                            \
-    cudaDeviceProp* properties =                                               \
-        at::cuda::getDeviceProperties(QUERY.device().index());                 \
-    const int computeCapability = properties->major * 10 + properties->minor;  \
-    DISPATCH_MAXK(([&] {                                                       \
-      DISPATCH_TYPES(                                                          \
-          QUERY, ([&]() {                                                      \
-            DISPATCH_ARCHTAG(                                                  \
-                computeCapability, ([&]() {                                    \
-                  using AlignedAK =                                            \
-                      AttentionBackwardKernel<ArchTag, scalar_t, true, kMaxK>; \
-                  bool isAligned =                                             \
-                      (QUERY.stride(1) % AlignedAK::kOptimalAlignement == 0 && \
-                       KEY.stride(1) % AlignedAK::kOptimalAlignement == 0 &&   \
-                       VALUE.stride(1) % AlignedAK::kOptimalAlignement == 0);  \
-                  DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {                \
-                                  using Kernel = AttentionBackwardKernel<      \
-                                      ArchTag,                                 \
-                                      scalar_t,                                \
-                                      kIsAligned,                              \
-                                      kMaxK>;                                  \
-                                  FUNC();                                      \
-                                }))                                            \
-                }))                                                            \
-          }))                                                                  \
-    }));                                                                       \
-  }
-
-namespace {
-std::tuple<at::Tensor, at::Tensor, at::Tensor>
-mem_efficient_attention_backward_cutlass(
-    const at::Tensor& grad_out_,
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const at::Tensor& logsumexp,
-    const at::Tensor& out,
-    bool causal) {
-  TORCH_CHECK(query.dim() == grad_out_.dim());
-  TORCH_CHECK(query.dim() == key.dim());
-  TORCH_CHECK(query.dim() == 3);
-
-  TORCH_CHECK(query.size(0) == grad_out_.size(0));
-  TORCH_CHECK(query.size(1) == grad_out_.size(1));
-  TORCH_CHECK(value.size(2) == grad_out_.size(2));
-
-  TORCH_CHECK(query.size(2) == key.size(2));
-  TORCH_CHECK(query.size(0) == key.size(0));
-
-  TORCH_CHECK(query.size(0) == value.size(0));
-  TORCH_CHECK(key.size(1) == value.size(1));
-
-  // handle potentially non-contiguous grad_out through a copy
-  auto grad_out = grad_out_.contiguous();
-
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(value);
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
-
-  at::cuda::CUDAGuard device_guard(query.device());
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t K = query.size(2);
-
-  // It does not make sense to use that in practice,
-  // but let's still make sure we are correct
-  // As we iterate through keys first, we skip
-  // keys with no query associated, so they are not
-  // initialized
-  bool grad_kv_needs_init = causal && N > M;
-  at::Tensor grad_q = at::empty_like(query);
-  at::Tensor grad_k =
-      grad_kv_needs_init ? at::zeros_like(key) : at::empty_like(key);
-  at::Tensor grad_v =
-      grad_kv_needs_init ? at::zeros_like(value) : at::empty_like(value);
-
-  auto launchKernel = [&](auto _k, int computeCapability) {
-    using Kernel = decltype(_k);
-    using scalar_t = typename Kernel::scalar_t;
-    (void)_k;
-
-    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
-
-    // TODO: Fuse this into a kernel?
-    // This is a bottleneck for smaller sequences (M <= 128)
-    auto delta = Kernel::kKernelComputesDelta
-        ? at::empty({B, M}, query.options().dtype(at::ScalarType::Float))
-        : (grad_out.to(at::kFloat) * out.to(at::kFloat)).sum(-1);
-    TORCH_INTERNAL_ASSERT(delta.size(0) == B);
-    TORCH_INTERNAL_ASSERT(delta.size(1) == M);
-
-    typename Kernel::Params params;
-    params.query_ptr = (scalar_t*)query.data_ptr();
-    params.key_ptr = (scalar_t*)key.data_ptr();
-    params.value_ptr = (scalar_t*)value.data_ptr();
-    params.logsumexp_ptr = (typename Kernel::lse_scalar_t*)logsumexp.data_ptr();
-    params.output_ptr = (scalar_t*)out.data_ptr();
-    params.grad_output_ptr = (scalar_t*)grad_out.data_ptr();
-    params.grad_query_ptr = (scalar_t*)grad_q.data_ptr();
-    params.grad_key_ptr = (scalar_t*)grad_k.data_ptr();
-    params.grad_value_ptr = (scalar_t*)grad_v.data_ptr();
-    params.delta_ptr = (float*)delta.data_ptr();
-    params.head_dim = query.size(2);
-    params.head_dim_value = value.size(2);
-    params.num_queries = query.size(1);
-    params.num_keys = key.size(1);
-    params.num_batches = B;
-    params.causal = causal;
-    Kernel::check_supported(params);
-
-    constexpr auto kernel_fn = attention_kernel_backward_batched<Kernel>;
-
-    if (smem_bytes > 0xc000) {
-      TORCH_INTERNAL_ASSERT(
-          computeCapability >= 70,
-          "This kernel requires too much shared memory on this machine!");
-      cudaFuncSetAttribute(
-          kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
-    }
-
-    auto checkBinaryArchMatches = [&]() {
-      cudaFuncAttributes attr;
-      AT_CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel_fn));
-      return attr.binaryVersion >= Kernel::ArchTag::kMinComputeCapability;
-    };
-    TORCH_INTERNAL_ASSERT(
-        checkBinaryArchMatches(), "Something went wrong in the build process");
-
-    kernel_fn<<<params.getBlocksGrid(), params.getThreadsGrid(), smem_bytes>>>(
-        params);
-  };
-
-  DISPATCH_KERNEL(
-      query, key, value, ([&] { launchKernel(Kernel{}, computeCapability); }));
-  AT_CUDA_CHECK(cudaGetLastError());
-  return std::make_tuple(grad_q, grad_k, grad_v);
-} // namespace
-
-} // namespace
-
-// TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-//   m.impl(
-//       TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_cutlass"),
-//       TORCH_FN(mem_efficient_attention_backward_cutlass));
-// }
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu
deleted file mode 100644
index 59b3637c8a438..0000000000000
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-#include <ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h>
-
-
-#define DISPATCH_BLOCKSIZE(VALUE_HEAD_DIM, FN)        \
-  {                                                   \
-    if (VALUE_HEAD_DIM <= 64) {                       \
-      constexpr bool kIs64x64 = true;                 \
-      constexpr bool kSingleValueIteration = true;    \
-      FN();                                           \
-    } else {                                          \
-      constexpr bool kIs64x64 = false;                \
-      if (VALUE_HEAD_DIM <= 128) {                    \
-        constexpr bool kSingleValueIteration = true;  \
-        FN();                                         \
-      } else {                                        \
-        constexpr bool kSingleValueIteration = false; \
-        FN();                                         \
-      }                                               \
-    }                                                 \
-  }
-
-#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC)                              \
-  {                                                                           \
-    cudaDeviceProp* properties =                                              \
-        at::cuda::getDeviceProperties(QUERY.device().index());                \
-    const int computeCapability = properties->major * 10 + properties->minor; \
-    DISPATCH_BLOCKSIZE(                                                       \
-        VALUE.size(-1), ([&]() {                                              \
-          static constexpr int64_t kQueriesPerBlock = kIs64x64 ? 64 : 32;     \
-          static constexpr int64_t kKeysPerBlock = kIs64x64 ? 64 : 128;       \
-          DISPATCH_TYPES(                                                     \
-              QUERY, ([&]() {                                                 \
-                DISPATCH_ARCHTAG(                                             \
-                    computeCapability, ([&]() {                               \
-                      using AlignedAK = AttentionKernel<                      \
-                          scalar_t,                                           \
-                          ArchTag,                                            \
-                          true,                                               \
-                          kQueriesPerBlock,                                   \
-                          kKeysPerBlock,                                      \
-                          kSingleValueIteration>;                             \
-                      /* Run a more efficient kernel (with `isAligned=True`)  \
-                      if memory is correctly aligned*/                        \
-                      bool isAligned =                                        \
-                          (QUERY.stride(2) % AlignedAK::kAlignmentQ == 0 &&   \
-                           KEY.stride(2) % AlignedAK::kAlignmentK == 0 &&     \
-                           VALUE.stride(2) % AlignedAK::kAlignmentV == 0);    \
-                      /* TODO: Should we warn or log somewhere when we use a  \
-                      less efficient kernel due to wrong alignment? */        \
-                      DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {           \
-                                      using Kernel = AttentionKernel<         \
-                                          scalar_t,                           \
-                                          ArchTag,                            \
-                                          kIsAligned,                         \
-                                          kQueriesPerBlock,                   \
-                                          kKeysPerBlock,                      \
-                                          kSingleValueIteration>;             \
-                                      FUNC();                                 \
-                                    }))                                       \
-                    }))                                                       \
-              }));                                                            \
-        }));                                                                  \
-  }
-
-namespace {
-/*
-  There are 2 modes for using this function.
-  (Mode BMHK) With all the heads having the same seqlen
-  (Mode 1MHK) `batch=1` with all tokens across batches concatenated
-*/
-std::tuple<at::Tensor, at::Tensor> efficient_attention_forward_cutlass(
-    const at::Tensor& query, // [b, seqlen, num_heads, K]
-    const at::Tensor& key, // [b, seqlen, num_heads, K]
-    const at::Tensor& value, // [b, seqlen, num_heads, Kv]
-    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
-    // position of the first query token for batch $b
-    const c10::optional<at::Tensor>& cu_seqlens_q,
-    // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
-    // position of the first key token for batch $b
-    const c10::optional<at::Tensor>& cu_seqlens_k,
-    // (Mode 1MHK only) Maximum sequence length across batches
-    const c10::optional<int64_t> max_seqlen_q_,
-    bool compute_logsumexp,
-    bool causal) {
-  TORCH_CHECK(query.dim() == 4);
-  TORCH_CHECK(key.dim() == 4);
-  TORCH_CHECK(value.dim() == 4);
-
-  // Batch sizes
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // Sequence length
-  TORCH_CHECK(key.size(1) == value.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) == key.size(2));
-  TORCH_CHECK(query.size(2) == value.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-
-  int64_t max_seqlen_q, max_seqlen_k;
-  TORCH_CHECK(cu_seqlens_q.has_value() == cu_seqlens_k.has_value());
-  if (cu_seqlens_q.has_value()) {
-    TORCH_CHECK(cu_seqlens_q->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(cu_seqlens_k->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(cu_seqlens_q->dim() == 1 && cu_seqlens_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*cu_seqlens_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*cu_seqlens_k));
-    TORCH_CHECK(cu_seqlens_q->size(0) == cu_seqlens_k->size(0));
-    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
-    TORCH_CHECK(max_seqlen_q_.has_value());
-    max_seqlen_q = *max_seqlen_q_;
-    max_seqlen_k = 0; // Will be set inside the kernel
-  } else {
-    max_seqlen_q = query.size(1);
-    max_seqlen_k = key.size(1);
-  }
-
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  at::cuda::CUDAGuard device_guard(query.device());
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t num_heads = query.size(-2);
-  int64_t K = query.size(-1);
-  int64_t Kv = value.size(-1);
-
-  at::Tensor res;
-  at::Tensor logsumexp;
-
-  auto launchKernel = [&](auto _k, int computeCapability) {
-    using Kernel = decltype(_k);
-    using scalar_t = typename Kernel::scalar_t;
-    (void)_k;
-
-    res = at::empty(
-        {B, M, num_heads, Kv},
-        query.options().dtype(
-            TypeTraits<typename Kernel::output_t>::atScalarType()));
-
-    // NOTE: Should be aligned (by padding) in case M is
-    // not a good number for loading during backward
-    constexpr decltype(M) kAlignLSE = Kernel::kAlignLSE;
-    logsumexp = at::empty(
-        {B,
-         num_heads,
-         compute_logsumexp ? ceil_div(max_seqlen_q, kAlignLSE) * kAlignLSE : 0},
-        query.options().dtype(at::ScalarType::Float));
-
-    typename Kernel::Params p;
-    p.query_ptr = (scalar_t*)query.data_ptr();
-    p.key_ptr = (scalar_t*)key.data_ptr();
-    p.value_ptr = (scalar_t*)value.data_ptr();
-    p.logsumexp_ptr = compute_logsumexp
-        ? (typename Kernel::lse_scalar_t*)logsumexp.data_ptr()
-        : nullptr;
-    at::Tensor output_accum;
-    if (Kernel::kNeedsOutputAccumulatorBuffer) {
-      output_accum = at::empty(
-          {B, M, num_heads, Kv},
-          query.options().dtype(
-              TypeTraits<typename Kernel::output_accum_t>::atScalarType()));
-      p.output_accum_ptr =
-          (typename Kernel::output_accum_t*)output_accum.data_ptr();
-    } else {
-      p.output_accum_ptr = nullptr;
-    }
-    p.output_ptr = (typename Kernel::output_t*)res.data_ptr();
-
-    if (cu_seqlens_q.has_value()) {
-      p.cu_seqlens_q_ptr = (int32_t*)cu_seqlens_q->data_ptr();
-      p.cu_seqlens_k_ptr = (int32_t*)cu_seqlens_k->data_ptr();
-    }
-
-#define ASSIGN_CHECK_OVERFLOW(A, B)                                            \
-  {                                                                            \
-    A = B;                                                                     \
-    TORCH_CHECK(B < std::numeric_limits<decltype(A)>::max(), #B " overflows"); \
-  }
-
-    p.num_heads = num_heads;
-    p.head_dim = query.size(3);
-    p.head_dim_value = value.size(3);
-    p.num_queries = max_seqlen_q;
-    p.num_keys = max_seqlen_k;
-    p.num_batches = cu_seqlens_q.has_value() ? cu_seqlens_q->size(0) - 1 : B;
-    p.causal = causal;
-
-    ASSIGN_CHECK_OVERFLOW(p.q_strideB, query.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.k_strideB, key.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.v_strideB, value.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.q_strideM, query.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.k_strideM, key.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.v_strideM, value.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.q_strideH, query.stride(2));
-    ASSIGN_CHECK_OVERFLOW(p.k_strideH, key.stride(2));
-    ASSIGN_CHECK_OVERFLOW(p.v_strideH, value.stride(2));
-
-    constexpr auto kernel_fn = attention_kernel_batched<Kernel>;
-    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
-    if (smem_bytes > 0xc000) {
-      TORCH_INTERNAL_ASSERT(
-          computeCapability >= 70,
-          "This kernel requires too much shared memory on this machine!");
-      AT_CUDA_CHECK(cudaFuncSetAttribute(
-          kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
-    }
-    Kernel::check_supported(p);
-    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
-  };
-  // Dispatch to the right kernel
-  DISPATCH_KERNEL(query, key, value, ([&]() {
-                    launchKernel(Kernel{}, computeCapability);
-                  }));
-
-  AT_CUDA_CHECK(cudaGetLastError());
-  return std::make_tuple(res, logsumexp);
-}
-} // namespace
-
-// TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-//   m.impl(
-//       TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_cutlass"),
-//       TORCH_FN(efficient_attention_forward_cutlass));
-// }
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h
index 399593fd09573..b0e7106f3cfc8 100644
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h
@@ -1,15 +1,16 @@
 /*! \file
     \brief Cutlass provides helper template functions to figure out the right
-   datastructures to instanciate to run a GEMM with various parameters (see
+   datastructures to instantiate to run a GEMM with various parameters (see
    `cutlass/gemm/threadblock/default_mma.h`). However, due to template
-   instanciation priority rules, it will only create an MmaMultiStage with
+   instantiation priority rules, it will only create an MmaMultiStage with
    kStages=3 (otherwise creates an MmePipelined - which is not compatible with
    FastF32). kStages=3 uses too much shared memory and we want to use kStages=2,
    so we just copy-pasted some code from `default_mma.h` and
-   `default_mma_core.h` files and wrapped this template to allow our usecase.
+   `default_mma_core.h` files and wrapped this template to allow our use case.
 
     This is really only for the FastF32 case - aka using TensorCores with fp32.
 */
+#pragma once
 
 #include <cutlass/gemm/threadblock/default_mma.h>
 #include <cutlass/gemm/threadblock/default_mma_core_simt.h>
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
index e25701a7588ac..c9652c40d38e4 100644
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
@@ -1,7 +1,5 @@
 #pragma once
-
 #include <ATen/ATen.h>
-#include <torch/library.h>
 #include <cmath>
 #include <vector>
 
@@ -75,46 +73,113 @@ struct AttentionBackwardKernel {
 
   struct Params {
     // Input tensors
-    scalar_t* query_ptr; // [num_queries, head_dim]
-    scalar_t* key_ptr; // [num_keys, head_dim]
-    scalar_t* value_ptr; // [num_keys, head_dim_value]
-    lse_scalar_t* logsumexp_ptr; // [num_queries]
-    scalar_t* output_ptr; // [num_queries, head_dim_value]
-    scalar_t* grad_output_ptr; // [num_queries, head_dim_value]
-    accum_t* delta_ptr; // [num_queries]
+    scalar_t* query_ptr; // [Mq, nH, K]
+    scalar_t* key_ptr; // [Mk, nH, K]
+    scalar_t* value_ptr; // [Mk, nH, Kv]
+    lse_scalar_t* logsumexp_ptr; // [nH, Mq]
+    scalar_t* output_ptr; // [Mq, nH, Kv]
+    scalar_t* grad_output_ptr; // [Mq, nH, Kv]
+    accum_t* delta_ptr; // [Mq, nH]
 
     // Output tensors
-    scalar_t* grad_query_ptr; // [num_queries, head_dim]
-    scalar_t* grad_key_ptr; // [num_keys, head_dim]
-    scalar_t* grad_value_ptr; // [num_keys, head_dim_value]
+    output_t* grad_query_ptr; //  [Mq, nH, K]
+    output_t* grad_key_ptr; //    [Mk, nH, K]
+    output_t* grad_value_ptr; //  [Mk, nH, Kv]
 
     // Dimensions/strides
     int32_t head_dim;
     int32_t head_dim_value;
     int32_t num_queries;
     int32_t num_keys;
-    int32_t num_batches;
+    int32_t num_heads;
     bool causal;
 
-    __device__ void advance_batches(int32_t batch_id) {
+    int32_t q_strideM;
+    int32_t k_strideM;
+    int32_t v_strideM;
+    int32_t gO_strideM;
+    int8_t gQKV_strideM_multiplier; // 3 for packed, 1 otherwise
+
+    CUTLASS_HOST_DEVICE int32_t o_strideM() const {
+      return head_dim_value * num_heads;
+    }
+    CUTLASS_HOST_DEVICE int32_t gQ_strideM() const {
+      return gQKV_strideM_multiplier * num_heads * head_dim;
+    }
+    CUTLASS_HOST_DEVICE int32_t gK_strideM() const {
+      return gQKV_strideM_multiplier * num_heads * head_dim;
+    }
+    CUTLASS_HOST_DEVICE int32_t gV_strideM() const {
+      return gQKV_strideM_multiplier * num_heads * head_dim_value;
+    }
+
+    // Everything below is only used in `advance_to_block`
+    // and shouldn't use registers
+    int64_t o_strideH;
+    int32_t q_strideH;
+    int32_t k_strideH;
+    int32_t v_strideH;
+    int64_t o_strideB;
+    int64_t q_strideB;
+    int64_t k_strideB;
+    int64_t v_strideB;
+    int32_t num_batches;
+
+    int64_t gO_strideB;
+    int64_t gQ_strideB;
+    int64_t gK_strideB;
+    int64_t gV_strideB;
+    int64_t gO_strideH;
+    int64_t gQ_strideH;
+    int64_t gK_strideH;
+    int64_t gV_strideH;
+
+    CUTLASS_DEVICE void advance_to_block() {
       constexpr int32_t kAlignLSE = 32; // block size of backward
       auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE;
 
-      query_ptr += batch_id * head_dim * num_queries;
-      key_ptr += batch_id * head_dim * num_keys;
-      value_ptr += batch_id * head_dim_value * num_keys;
-      logsumexp_ptr += batch_id * lse_dim;
-      output_ptr += batch_id * head_dim_value * num_queries;
-      grad_output_ptr += batch_id * head_dim_value * num_queries;
-      delta_ptr += batch_id * num_queries;
-
-      grad_query_ptr += batch_id * head_dim * num_queries;
-      grad_key_ptr += batch_id * head_dim * num_keys;
-      grad_value_ptr += batch_id * head_dim_value * num_keys;
+      int32_t batch_id = blockIdx.z;
+      int32_t head_id = blockIdx.y;
+
+      query_ptr += batch_id * q_strideB + head_id * q_strideH;
+      key_ptr += batch_id * k_strideB + head_id * k_strideH;
+      value_ptr += batch_id * v_strideB + head_id * v_strideH;
+      logsumexp_ptr += (batch_id * num_heads + head_id) * lse_dim;
+      output_ptr += batch_id * o_strideB + head_id * o_strideH;
+      grad_output_ptr += batch_id * gO_strideB + head_id * gO_strideH;
+      delta_ptr += (batch_id * num_heads + head_id) * num_queries;
+
+      grad_query_ptr += batch_id * gQ_strideB + head_id * gQ_strideH;
+      grad_key_ptr += batch_id * gK_strideB + head_id * gK_strideH;
+      grad_value_ptr += batch_id * gV_strideB + head_id * gV_strideH;
+
+      head_dim = warp_uniform(head_dim);
+      head_dim_value = warp_uniform(head_dim_value);
+      num_queries = warp_uniform(num_queries);
+      num_keys = warp_uniform(num_keys);
+      num_heads = warp_uniform(num_heads);
+
+      gO_strideM = warp_uniform(gO_strideM);
+      gQKV_strideM_multiplier = warp_uniform(gQKV_strideM_multiplier);
+      q_strideM = warp_uniform(q_strideM);
+      k_strideM = warp_uniform(k_strideM);
+      v_strideM = warp_uniform(v_strideM);
+
+      query_ptr = warp_uniform(query_ptr);
+      key_ptr = warp_uniform(key_ptr);
+      value_ptr = warp_uniform(value_ptr);
+      logsumexp_ptr = warp_uniform(logsumexp_ptr);
+      output_ptr = warp_uniform(output_ptr);
+      grad_output_ptr = warp_uniform(grad_output_ptr);
+      delta_ptr = warp_uniform(delta_ptr);
+
+      grad_query_ptr = warp_uniform(grad_query_ptr);
+      grad_key_ptr = warp_uniform(grad_key_ptr);
+      grad_value_ptr = warp_uniform(grad_value_ptr);
     }
 
     __host__ dim3 getBlocksGrid() const {
-      return dim3(1, 1, num_batches);
+      return dim3(1, num_heads, num_batches);
     }
     __host__ dim3 getThreadsGrid() const {
       return dim3(kWarpSize, kNumWarpsPerBlock, 1);
@@ -179,7 +244,6 @@ struct AttentionBackwardKernel {
     attn_T = k_j @ q_i.transpose(-2, -1) # matmul
     attn_T = (attn_T - logsumexp[i_start:i_end].unsqueeze(1).transpose(-2,
     -1)).exp() # epilogue
-
     with attn_T.shape = (kBlockSizeJ, kBlockSizeI)
     */
     using ThreadblockShape =
@@ -225,7 +289,6 @@ struct AttentionBackwardKernel {
   struct MatmulGradV {
     /*
     grad_v[j_start:j_end] += attn_T @ do_i # matmul
-
     Dimensions: (kBlockSizeJ * kNumWarpsPerBlock, kBlockSizeI, K)
     (we might need to iterate multiple times on K)
     */
@@ -601,7 +664,7 @@ struct AttentionBackwardKernel {
     typename MatmulGradV::Mma::FragmentC gradV;
     typename MatmulGradK::Mma::FragmentC gradK;
 
-    __device__ __forceinline__ void clear() {
+    CUTLASS_DEVICE void clear() {
       gradV.clear();
       gradK.clear();
     }
@@ -614,14 +677,14 @@ struct AttentionBackwardKernel {
     CHECK_ALIGNED_PTR(p.output_ptr, kMinimumAlignment);
     CHECK_ALIGNED_PTR(p.grad_output_ptr, kMinimumAlignment);
     TORCH_CHECK(
-        p.head_dim % kMinimumAlignment == 0,
-        "query/key is not correctly aligned");
+        p.q_strideH % kMinimumAlignment == 0, "query is not correctly aligned");
     TORCH_CHECK(
-        p.head_dim_value % kMinimumAlignment == 0,
-        "value is not correctly aligned");
+        p.k_strideH % kMinimumAlignment == 0, "key is not correctly aligned");
+    TORCH_CHECK(
+        p.v_strideH % kMinimumAlignment == 0, "value is not correctly aligned");
   }
 
-  static __device__ void kernel(Params& p_) {
+  static CUTLASS_DEVICE void kernel(Params& p_) {
     // Hint to nvcc to store points & tensor shapes in registers
     // as we use them a lot
     register const Params p = p_;
@@ -658,7 +721,7 @@ struct AttentionBackwardKernel {
       __syncthreads();
     }
 
-    OutputFragments output_frags;
+    OutputFragments register output_frags;
     int32_t key_start = 0;
     int32_t key_end = p.num_keys / kBlockSizeJ * kBlockSizeJ;
     for (; key_start < key_end; key_start += kBlockSizeJ) {
@@ -695,7 +758,7 @@ struct AttentionBackwardKernel {
     }
   }
 
-  static __device__ __forceinline__ void loadDi(
+  static CUTLASS_DEVICE void loadDi(
       cutlass::Array<accum_t, kBlockSizeI>& di,
       Params const& p,
       int32_t query_start) {
@@ -710,7 +773,7 @@ struct AttentionBackwardKernel {
   }
 
   template <bool skipBoundsChecks>
-  static __device__ __forceinline__ void processBlockIJ(
+  static CUTLASS_DEVICE void processBlockIJ(
       SharedStorage& shared_storage,
       OutputFragments& output_frags,
       Params const& p,
@@ -718,9 +781,9 @@ struct AttentionBackwardKernel {
       int32_t key_start) {
     cutlass::MatrixCoord no_offset{0, 0};
     accum_t scale = accum_t(1.0 / std::sqrt(float(p.head_dim)));
-    int32_t thread_id = threadIdx.x + threadIdx.y * blockDim.x;
-    int32_t warp_id = threadIdx.y;
-    int32_t lane_id = threadIdx.x;
+    int16_t thread_id = threadIdx.x + threadIdx.y * blockDim.x;
+    int8_t warp_id = warp_uniform(threadIdx.y);
+    int8_t lane_id = threadIdx.x;
     __syncthreads();
     loadDi(shared_storage.di(), p, query_start);
 
@@ -734,8 +797,8 @@ struct AttentionBackwardKernel {
 
     auto prologueGradV = [&](int col) {
       typename MatmulGradV::Mma::IteratorB iterator_dO(
-          {int32_t(p.head_dim_value)},
-          p.grad_output_ptr + query_start * p.head_dim_value + col,
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM + col,
           {num_queries_in_block, p.head_dim_value - col},
           thread_id,
           no_offset);
@@ -747,8 +810,8 @@ struct AttentionBackwardKernel {
     };
     auto prologueGradQ = [&](int col) {
       typename MatmulGradQ::Mma::IteratorB iterator_K(
-          {int32_t(p.head_dim)},
-          p.key_ptr + key_start * p.head_dim + col,
+          {int32_t(p.k_strideM)},
+          p.key_ptr + key_start * p.k_strideM + col,
           {num_keys_in_block, p.head_dim - col},
           thread_id,
           no_offset);
@@ -757,8 +820,8 @@ struct AttentionBackwardKernel {
     };
     auto prologueGradK = [&](int col) {
       typename MatmulGradK::Mma::IteratorB iterator_Q(
-          {int32_t(p.head_dim)},
-          p.query_ptr + query_start * p.head_dim + col,
+          {int32_t(p.q_strideM)},
+          p.query_ptr + query_start * p.q_strideM + col,
           {num_queries_in_block, p.head_dim - col},
           thread_id,
           no_offset);
@@ -770,14 +833,14 @@ struct AttentionBackwardKernel {
     };
     auto prologueDOV = [&]() {
       typename MatmulDOIVJ::Mma::IteratorA iterator_A(
-          {int32_t(p.head_dim_value)},
-          p.grad_output_ptr + query_start * p.head_dim_value,
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM,
           {num_queries_in_block, p.head_dim_value},
           thread_id,
           no_offset);
       typename MatmulDOIVJ::Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim_value)},
-          p.value_ptr + key_start * p.head_dim_value,
+          {int32_t(p.v_strideM)},
+          p.value_ptr + key_start * p.v_strideM,
           {p.head_dim_value, num_keys_in_block},
           thread_id,
           no_offset);
@@ -803,16 +866,16 @@ struct AttentionBackwardKernel {
 
       // k_j
       typename Mma::IteratorA iterator_A(
-          {int32_t(p.head_dim)},
-          p.key_ptr + key_start * p.head_dim,
+          {int32_t(p.k_strideM)},
+          p.key_ptr + key_start * p.k_strideM,
           {problem_size.m(), problem_size.k()},
           thread_id,
           no_offset);
 
       // q_i.transpose(-2, -1)
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim)},
-          p.query_ptr + query_start * p.head_dim,
+          {int32_t(p.q_strideM)},
+          p.query_ptr + query_start * p.q_strideM,
           {problem_size.k(), problem_size.n()},
           thread_id,
           no_offset);
@@ -893,14 +956,14 @@ struct AttentionBackwardKernel {
           num_keys_in_block, p.head_dim_value - col, num_queries_in_block);
       auto createEpilogueIter = [&]() {
         return typename MatmulGradV::OutputTileIterator(
-            typename MatmulGradV::OutputTileIterator::Params{p.head_dim_value},
-            p.grad_value_ptr + key_start * p.head_dim_value + col,
+            typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()},
+            p.grad_value_ptr + key_start * p.gV_strideM() + col,
             {num_keys_in_block, p.head_dim_value - col},
             thread_id);
       };
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim_value)},
-          p.grad_output_ptr + query_start * p.head_dim_value + col,
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM + col,
           {num_queries_in_block, p.head_dim_value - col},
           thread_id,
           no_offset);
@@ -951,16 +1014,16 @@ struct AttentionBackwardKernel {
       using Mma = typename MatmulDOIVJ::Mma;
       // do_i
       typename Mma::IteratorA iterator_A(
-          {int32_t(p.head_dim_value)},
-          p.grad_output_ptr + query_start * p.head_dim_value,
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM,
           {num_queries_in_block, p.head_dim_value},
           thread_id,
           no_offset);
 
       // v_j.transpose(-2, -1)
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim_value)},
-          p.value_ptr + key_start * p.head_dim_value,
+          {int32_t(p.v_strideM)},
+          p.value_ptr + key_start * p.v_strideM,
           {p.head_dim_value, num_keys_in_block},
           thread_id,
           no_offset);
@@ -1057,16 +1120,16 @@ struct AttentionBackwardKernel {
           num_keys_in_block);
       auto createEpilogueIter = [&]() {
         return typename MatmulGradQ::OutputTileIterator(
-            typename MatmulGradQ::OutputTileIterator::Params{p.head_dim},
-            p.grad_query_ptr + query_start * p.head_dim + col,
+            typename MatmulGradQ::OutputTileIterator::Params{p.gQ_strideM()},
+            p.grad_query_ptr + query_start * p.gQ_strideM() + col,
             {problem_size.m(), problem_size.n()},
             thread_id);
       };
 
       // k_j
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim)},
-          p.key_ptr + key_start * p.head_dim + col,
+          {int32_t(p.k_strideM)},
+          p.key_ptr + key_start * p.k_strideM + col,
           {problem_size.k(), problem_size.n()},
           thread_id,
           no_offset);
@@ -1153,8 +1216,8 @@ struct AttentionBackwardKernel {
           num_queries_in_block);
       auto createEpilogueIter = [&]() {
         return typename MatmulGradK::OutputTileIterator(
-            typename MatmulGradK::OutputTileIterator::Params{p.head_dim},
-            p.grad_key_ptr + key_start * p.head_dim + col,
+            typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()},
+            p.grad_key_ptr + key_start * p.gK_strideM() + col,
             {num_keys_in_block,
              false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col},
             thread_id);
@@ -1162,8 +1225,8 @@ struct AttentionBackwardKernel {
 
       // q_i
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim)},
-          p.query_ptr + query_start * p.head_dim + col,
+          {int32_t(p.q_strideM)},
+          p.query_ptr + query_start * p.q_strideM + col,
           {problem_size.k(), problem_size.n()},
           thread_id,
           no_offset);
@@ -1236,15 +1299,15 @@ struct AttentionBackwardKernel {
         kForceReloadK || !MatmulQK::Mma::kSmemContainsEntireMat;
     auto thread_id = get_thread_id();
     typename MatmulQK::Mma::IteratorA iterator_A(
-        {int32_t(p.head_dim)},
-        p.key_ptr + key_start * p.head_dim,
+        {int32_t(p.k_strideM)},
+        p.key_ptr + key_start * p.k_strideM,
         {p.num_keys - key_start, p.head_dim},
         thread_id,
         cutlass::MatrixCoord{0, 0});
 
     typename MatmulQK::Mma::IteratorB iterator_B(
-        {int32_t(p.head_dim)},
-        p.query_ptr + query_start * p.head_dim,
+        {int32_t(p.q_strideM)},
+        p.query_ptr + query_start * p.q_strideM,
         {p.head_dim, p.num_queries - query_start},
         thread_id,
         cutlass::MatrixCoord{0, 0});
@@ -1259,7 +1322,7 @@ struct AttentionBackwardKernel {
   }
 
   template <bool skipBoundsChecks>
-  static __device__ __forceinline__ void writeFragsToGmem(
+  static CUTLASS_DEVICE void writeFragsToGmem(
       SharedStorage& shared_storage,
       OutputFragments& output_frags,
       Params const& p,
@@ -1268,8 +1331,8 @@ struct AttentionBackwardKernel {
         ? MatmulQK::Mma::Shape::kM
         : std::min((int32_t)MatmulQK::Mma::Shape::kM, p.num_keys - key_start);
     typename MatmulGradV::OutputTileIterator outputV_it(
-        typename MatmulGradV::OutputTileIterator::Params{p.head_dim_value},
-        p.grad_value_ptr + key_start * p.head_dim_value,
+        typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()},
+        p.grad_value_ptr + key_start * p.gV_strideM(),
         {num_keys_in_block, p.head_dim_value},
         get_thread_id());
     accumulateInGmem<MatmulGradV>(
@@ -1279,8 +1342,8 @@ struct AttentionBackwardKernel {
         true);
 
     typename MatmulGradK::OutputTileIterator outputK_it(
-        typename MatmulGradK::OutputTileIterator::Params{p.head_dim},
-        p.grad_key_ptr + key_start * p.head_dim,
+        typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()},
+        p.grad_key_ptr + key_start * p.gK_strideM(),
         {num_keys_in_block,
          false ? MatmulGradK::ThreadblockShape::kN : p.head_dim},
         get_thread_id());
@@ -1292,7 +1355,7 @@ struct AttentionBackwardKernel {
   }
 
   template <typename MatmulT>
-  static __device__ __forceinline__ void accumulateInGmem(
+  static CUTLASS_DEVICE void accumulateInGmem(
       typename MatmulT::DefaultEpilogue::SharedStorage& epilogue_smem,
       typename MatmulT::Mma::FragmentC const& accum,
       typename MatmulT::OutputTileIterator output_it,
@@ -1334,7 +1397,9 @@ struct AttentionBackwardKernel {
   }
 
   template <int kElementsPerAccess>
-  static __device__ void computeDelta(Params const& p, int32_t query_start) {
+  static CUTLASS_DEVICE void computeDelta(
+      Params const& p,
+      int32_t query_start) {
     // Each thread computes one value for Delta
     // Depending on warp configuration, we might have multiple
     // threads of the same warp working on the same row
@@ -1349,13 +1414,15 @@ struct AttentionBackwardKernel {
     bool rowPred = (query_start + laneRow) < p.num_queries;
     bool pred = rowPred;
 
-    const __restrict__ AccessType* grad_output_ptr =
-        reinterpret_cast<const __restrict__ AccessType*>(
-            p.grad_output_ptr + (query_start + laneRow) * p.head_dim_value +
+    // on windows, previous syntax __restrict__ AccessType*
+    // resulted in error: "restrict" is not allowed
+    const AccessType* __restrict__ grad_output_ptr =
+        reinterpret_cast<const AccessType*>(
+            p.grad_output_ptr + (query_start + laneRow) * p.gO_strideM +
             laneFirstCol);
-    const __restrict__ AccessType* output_ptr =
-        reinterpret_cast<const __restrict__ AccessType*>(
-            p.output_ptr + (query_start + laneRow) * p.head_dim_value +
+    const AccessType* __restrict__ output_ptr =
+        reinterpret_cast<const AccessType*>(
+            p.output_ptr + (query_start + laneRow) * p.o_strideM() +
             laneFirstCol);
 
     static constexpr int64_t kMaxIters =
@@ -1430,13 +1497,13 @@ struct AttentionBackwardKernel {
     }
   }
 
-  static __device__ __forceinline__ int8_t get_lane_id() {
+  static CUTLASS_DEVICE int8_t get_lane_id() {
     return threadIdx.x;
   }
-  static __device__ __forceinline__ int8_t get_warp_id() {
+  static CUTLASS_DEVICE int8_t get_warp_id() {
     return threadIdx.y;
   }
-  static __device__ __forceinline__ int16_t get_thread_id() {
+  static CUTLASS_DEVICE int16_t get_thread_id() {
     return threadIdx.x + threadIdx.y * blockDim.x;
   }
 };
@@ -1457,8 +1524,7 @@ __global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
 #define INSTANTIATE_ATTENTION_KERNEL_BACKWARD(ARCH, ...)             \
   _ATTENTION_KERNEL_BACKWARD_BEGIN(                                  \
       AttentionBackwardKernel<cutlass::arch::Sm##ARCH, __VA_ARGS__>) \
-  auto batch_id = blockIdx.z;                                        \
-  p.advance_batches(batch_id);                                       \
+  p.advance_to_block();                                              \
   Kernel::kernel(p);                                                 \
   _ATTENTION_KERNEL_BACKWARD_END();
 
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 564adb2d51ea8..e9f3d5029aa86 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -62,6 +62,15 @@ inline bool check_for_attn_weights(sdp_params params, bool debug) {
   }
   return true;
 }
+
+inline bool check_for_non_zero_dropout(sdp_params params, bool debug) {
+  if (params.dropout != 0.0) {
+    TORCH_CHECK(!debug, "Mem_efficient does not support non_zero dropout. Dropout_p: ", params.dropout);
+    return false;
+  }
+  return true;
+}
+
 inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
   if (!params.query.is_nested()) {
     return true;
@@ -230,7 +239,8 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask,
-      check_for_seq_len_1_nested_tensor};
+      check_for_seq_len_1_nested_tensor,
+      check_for_non_zero_dropout};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
diff --git a/test/test_transformers.py b/test/test_transformers.py
index a9d0d960fb9a6..c86b89bed5efd 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -21,8 +21,11 @@
     TEST_WITH_ROCM,
     IS_WINDOWS,
     slowTest,
-    set_default_dtype
+    set_default_dtype,
+    gradcheck
 )
+
+from torch.testing._internal.common_methods_invocations import wrapper_set_seed
 from torch.testing._internal.common_cuda import TEST_CUDA, SM80OrLater
 
 if TEST_FAIRSEQ:
@@ -860,11 +863,22 @@ def rand_tensor(*shape):
                     actual = torch.ops.aten._scaled_dot_product_attention(
                         query, key, value, attn_mask, dropout_p, need_attn_weights, is_causal)
 
-            # freeze_rng_state() doesn't seem to work outside of CPU, so dropout makes the results incomparable.
-            # TODO: Do this skipping in a nicer way once the granular test skipping logic lands.
-            if dropout_p == 0.0 or device == 'cpu':
                 self.assertEqual(actual, expected)
 
+        if attn_mask_dim is None:
+            q = q.double().clone()
+            k = k.double().clone()
+            v = v.double().clone()
+            q.requires_grad_()
+            k.requires_grad_()
+            v.requires_grad_()
+
+            assert gradcheck(lambda *args, **kwargs: wrapper_set_seed(sdp_ref, *args, **kwargs),
+                             (q, k, v, attn_mask, dropout_p))
+            assert gradcheck(lambda *args, **kwargs:
+                             wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
+                             (q, k, v, attn_mask, dropout_p))
+
     @unittest.skipIf(TEST_WITH_CROSSREF, 'Fastpath not available with crossref')
     @torch.no_grad()
     def test_mask_check_fastpath(self):
@@ -1079,6 +1093,28 @@ def rand_tensor(shape):
         self.assertEqual(math_ref_test, math_ref_lp_test, atol=7e-3, rtol=7e-3)
         self.assertEqual(actual_test, math_ref_test, atol=5e-3, rtol=5e-3)
 
+    @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
+    @parametrize("contiguous_inputs", [True, False])
+    def test_efficient_attention_gradcheck(self, contiguous_inputs: bool):
+
+        batch_size, seq_len, num_heads, head_dim = 8, 8, 4, 64
+        query, key, value = torch.rand((batch_size, seq_len, 3 * num_heads * head_dim),
+                                       device="cuda", dtype=torch.float32, requires_grad=True).chunk(3, -1)
+        query = query.view(batch_size, -1, num_heads, head_dim)
+        key = key.view(batch_size, -1, num_heads, head_dim)
+        value = value.view(batch_size, -1, num_heads, head_dim)
+
+        if contiguous_inputs:
+            query = query.contiguous()
+            key = key.contiguous()
+            value = value.contiguous()
+
+        # Normally we would transpose the inputs but the fused kernels expect
+        # (batch, seq_len, num_heads, head_dim) bump the tolerance since we can only run kernel
+        # in fp32
+        assert gradcheck(lambda *args, **kwargs:
+                         wrapper_set_seed(torch.ops.aten._efficient_attention_forward, *args, **kwargs),
+                         (query, key, value, None, None, None, True, False), fast_mode=True, atol=8e-5, rtol=1e-3)
 
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     def test_sdp_runtime_dispatch(self):
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 8349a308be35a..a0892b32a8352 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2591,7 +2591,7 @@
 - name: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
   self: grad.reshape_symint(self.sym_sizes())
 
-# Nested Tensor
+# NestedTensor
 - name: _nested_tensor_from_tensor_list(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   list: "grad.defined()? at::unbind(grad) : std::vector<Tensor>(list.size())"
 
@@ -2612,6 +2612,11 @@
   nested_size: non_differentiable
   nested_strides: non_differentiable
 
+# Transformers
+- name:  _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
+  output_differentiability: [True, False]
+  query, key, value: _efficient_attention_backward(grad, query, key, value, result1, result0, causal)
+
 # fft
 - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
   self: fft_r2c_backward(grad, dim, normalization, onesided, self.sym_size(dim.back()))
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 001fd455e82ee..3b43b8fb48634 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -11944,8 +11944,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ),
     OpInfo(
         'nn.functional._scaled_dot_product_attention',
-        op=lambda inp, *args, **kwargs:
-               wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, inp, *args, **kwargs),
+        op=lambda *args, **kwargs:
+               wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
         sample_inputs_func=sample_inputs_scaled_dot_product_attention,
         dtypes=floating_types_and(torch.bfloat16),
         dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),

From 4681335403fffe1e0c6dc11532a5ee6cc12fbe17 Mon Sep 17 00:00:00 2001
From: Michael Gschwind <mikekg@meta.com>
Date: Tue, 15 Nov 2022 01:25:17 +0000
Subject: [PATCH 0886/1922] Fix cuda/cpu check on NoneType (Unit test) (#88970)

Summary: Fix cuda/cpu check on NoneType (unit test)

Test Plan: sabdcastle/ github CI/CD

Differential Revision: D41208798

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88970
Approved by: https://github.com/Skylion007, https://github.com/cpuhrsch
---
 test/test_transformers.py      | 9 +++++++++
 torch/nn/modules/activation.py | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/test/test_transformers.py b/test/test_transformers.py
index c86b89bed5efd..93a94a5604c91 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -1168,6 +1168,15 @@ def make_tensor(*size, device=device, dtype=dtype):
             self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
                 q, k, v, torch.ones_like(q), 0.0, False, False))
 
+    # Test failing MHA when bias was NoneType
+    def test_bias_is_none(self):
+        x = torch.rand((1, 5, 10))
+        model = torch.nn.modules.activation.MultiheadAttention(10, 1, bias=False, batch_first=True)
+        model.eval()
+        model(x, x, x)
+        # completes without error
+
+
 # TODO: Replace this with instantiate_device_type_tests() to take advantage of test framework support for
 # cross device / dtype testing.
 instantiate_parametrized_tests(TestTransformers)
diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py
index e6b3b778e5fbc..b00da06126a7a 100644
--- a/torch/nn/modules/activation.py
+++ b/torch/nn/modules/activation.py
@@ -1113,7 +1113,7 @@ def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: O
                 why_not_fast_path = "some Tensor argument has_torch_function"
             elif not all([(x is None or x.is_cuda or 'cpu' in str(x.device)) for x in tensor_args]):
                 why_not_fast_path = "some Tensor argument is neither CUDA nor CPU"
-            elif torch.is_grad_enabled() and any([x.requires_grad for x in tensor_args]):
+            elif torch.is_grad_enabled() and any([x is not None and x.requires_grad for x in tensor_args]):
                 why_not_fast_path = ("grad is enabled and at least one of query or the "
                                      "input/output projection weights or biases requires_grad")
             if not why_not_fast_path:

From d178732f77a13b6edc2851ceeee4b8ead19928d1 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Tue, 15 Nov 2022 02:32:55 +0000
Subject: [PATCH 0887/1922] Fix lookup file update in dashboard (#89024)

Lookup file should be updated before graphs are generated.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89024
Approved by: https://github.com/mlazos, https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index d27763c41b0b6..319ff677db4fb 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -866,7 +866,7 @@ def generate_comment(self):
         title = "## Accuracy Regressions ##\n"
         body = (
             "For each relevant compiler, we compare the most recent 2 reports "
-            "(that run actually the compiler) to find models where previously "
+            "(that actually run the compiler) to find models where previously "
             "successful accuracy tests now fail.\n\n"
         )
         dtype = self.args.dtypes[0]
@@ -1031,29 +1031,35 @@ def __init__(self, args):
         self.output_dir = args.output_dir
         self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv")
         assert os.path.exists(self.lookup_file)
+        try:
+            self.update_lookup_file()
+        except subprocess.CalledProcessError:
+            print("failed to update lookup file")
 
-    def archive(self):
+    def update_lookup_file(self):
         dtype = self.args.dtypes[0]
-        # Copy the folder to archived location
-        archive(
-            self.output_dir,
-            self.args.dashboard_archive_path,
-            self.args.archive_name,
-            dtype,
-        )
         day, _ = archive_data(self.args.archive_name)
         target_dir = (
             default_archive_name(dtype)
             if self.args.archive_name is None
             else self.args.archive_name
         )
-
         # Update lookup csv the folder to arhived logs
         subprocess.check_call(
             f'echo "{day},performance,{dtype},{target_dir}" >> {self.lookup_file}',
             shell=True,
         )
 
+    def archive(self):
+        dtype = self.args.dtypes[0]
+        # Copy the folder to archived location
+        archive(
+            self.output_dir,
+            self.args.dashboard_archive_path,
+            self.args.archive_name,
+            dtype,
+        )
+
     def upload_graphs(self):
         title = "## Performance graphs ##\n"
         str_io = io.StringIO()

From cac1d6e757e0e1a31fab731ed76d0fd4a815c0f7 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 14 Nov 2022 16:51:32 -0500
Subject: [PATCH 0888/1922] Add test that bias gradient is properly tested in
 same_two_models (#88995)

See
https://github.com/pytorch/pytorch/pull/88629#issuecomment-1313850324
for why this got broken.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88995
Approved by: https://github.com/albanD
---
 test/dynamo/test_repros.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index fd0fcf9e08bc2..503231b4cb120 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1769,6 +1769,21 @@ def forward(self, getitem_1, getitem_2, add):
         ]
         self.assertTrue(same_two_models(mod, opt_mod, args))
 
+    def test_optimized_deepcopy(self):
+        # See https://github.com/pytorch/pytorch/pull/88629
+        class Foo(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc = torch.nn.Linear(in_features=2, out_features=3, bias=True)
+
+            def forward(self, x):
+                return self.fc(x)
+
+        mod = Foo()
+        opt_mod = torch._dynamo.optimize("eager")(mod)
+        args = [torch.randn(1, 2)]
+        self.assertTrue(same_two_models(mod, opt_mod, args))
+
     def test_class_member(self):
         class Foo(torch.nn.Module):
             a = 4

From 84809755198961cc91e3dba38f4f662485afa5cc Mon Sep 17 00:00:00 2001
From: Jiawen Liu <jiawenl@meta.com>
Date: Tue, 15 Nov 2022 03:10:36 +0000
Subject: [PATCH 0889/1922] [Inductor] Build Shape Padding in Inductor (#88709)

Summary: Build shape padding for matmul/bmm/addmm in Inductor

Differential Revision: D41071282

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88709
Approved by: https://github.com/bertmaher, https://github.com/Chillee
---
 torch/_inductor/config.py        |   3 +
 torch/_inductor/decomposition.py | 149 ++++++++++++++++++++++++++++++-
 torch/_inductor/utils.py         |  77 ++++++++++++++++
 3 files changed, 226 insertions(+), 3 deletions(-)

diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 8f9f2c4f461dd..d376fe3e8bf7f 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -71,6 +71,9 @@
 # How to import torchdynamo, either torchdynamo or torch.dynamo
 dynamo_import = inductor_import.replace("inductor", "dynamo")
 
+# Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs
+shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "0") == "1"
+alignment_size = 4
 
 # config specific to codegen/cpp.pp
 class cpp:
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index e8a20c0dbd26e..0b29dd524cb78 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -8,8 +8,9 @@
 from torch import Tensor
 from torch._decomp import get_decompositions
 from torch._prims_common import is_boolean_dtype, is_integer_dtype
+from torch.utils._mode_utils import no_dispatch
 
-from . import config
+from . import config, utils
 
 log = logging.getLogger(__name__)
 aten = torch.ops.aten
@@ -135,6 +136,26 @@ def floordiv(a, b):
     return aten.div.Tensor_mode(a, b, rounding_mode="floor")
 
 
+def get_padded_length(x):
+    if x % config.alignment_size == 0:
+        return 0
+    return int((x // config.alignment_size + 1) * config.alignment_size) - x
+
+
+def pad_dim(x, padded_length, dim):
+    pad = x.new_zeros(*x.shape[:dim], padded_length, *x.shape[dim + 1 :])
+    return torch.cat([x, pad], dim=dim)
+
+
+def check_device_dtype(a: Tensor, b: Tensor):
+    return (
+        a.is_cuda
+        and b.is_cuda
+        and a.dtype == torch.float32
+        and b.dtype == torch.float32
+    )
+
+
 @register_decomposition([aten.addmm])
 def addmm(input, mat1, mat2, *, beta=1, alpha=1):
     if config.triton.mm != "aten":
@@ -144,8 +165,130 @@ def addmm(input, mat1, mat2, *, beta=1, alpha=1):
         if not isinstance(beta, numbers.Number) or beta != 1:
             input = input * beta
         return input + out
-    else:
-        return NotImplemented  # go directly to lowering
+
+    if (
+        config.shape_padding
+        and check_device_dtype(mat1, mat2)
+        and should_pad_bench(mat1, mat2, torch.ops.aten.addmm, input=input)
+    ):
+        m_padded_length = get_padded_length(mat1.shape[0])
+        k_padded_length = get_padded_length(mat1.shape[1])
+        n_padded_length = get_padded_length(mat2.shape[1])
+
+        if k_padded_length != 0:
+            mat1 = pad_dim(mat1, k_padded_length, 1)
+            mat2 = pad_dim(mat2, k_padded_length, 0)
+        elif m_padded_length != 0:
+            mat1 = pad_dim(mat1, m_padded_length, 0)
+        elif n_padded_length != 0:
+            mat2 = pad_dim(mat2, n_padded_length, 1)
+
+        if input is not None and k_padded_length == 0:
+            if m_padded_length != 0 and input.dim() == 2:
+                input = pad_dim(input, m_padded_length, 0)
+            elif n_padded_length != 0:
+                if input.dim() == 2:
+                    input = pad_dim(input, n_padded_length, 1)
+                elif input.dim() == 1:
+                    input = pad_dim(input, n_padded_length, 0)
+
+        if k_padded_length != 0:
+            return torch.ops.aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha)
+        elif m_padded_length != 0:
+            return torch.ops.aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha)[
+                :-m_padded_length, :
+            ]
+        elif n_padded_length != 0:
+            return torch.ops.aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha)[
+                :, :-n_padded_length
+            ]
+
+    return NotImplemented  # go directly to lowering
+
+
+def should_pad_bench(mat1, mat2, op, input=None):
+    with no_dispatch():
+        if op is torch.ops.aten.mm or op is torch.ops.aten.addmm:
+            m_padded_length = get_padded_length(mat1.shape[0])
+            k_padded_length = get_padded_length(mat1.shape[1])
+            n_padded_length = get_padded_length(mat2.shape[1])
+        elif op is torch.ops.aten.bmm:
+            m_padded_length = get_padded_length(mat1.shape[1])
+            k_padded_length = get_padded_length(mat1.shape[2])
+            n_padded_length = get_padded_length(mat2.shape[2])
+        else:
+            return False
+
+        if m_padded_length == k_padded_length == n_padded_length == 0:
+            return False
+
+        mat1 = torch.randn_like(mat1)
+        mat2 = torch.randn_like(mat2)
+        warmup = 5
+        rep = 100
+        if op is torch.ops.aten.bmm or op is torch.ops.aten.mm:
+            ori_time = utils.do_bench(
+                lambda: op(mat1, mat2), warmup=warmup, rep=rep, fast_flush=True
+            )[0]
+        else:
+            if input is not None:
+                input = torch.randn_like(input)
+            ori_time = utils.do_bench(
+                lambda: op(input, mat1, mat2), warmup=warmup, rep=rep, fast_flush=True
+            )[0]
+
+        mat1_pad = mat1.new_empty([get_padded_length(i) + i for i in mat1.shape])
+        mat2_pad = mat2.new_empty([get_padded_length(i) + i for i in mat2.shape])
+        if op is torch.ops.aten.addmm:
+            input_pad = None
+            if input is not None and input.is_cuda and input.dtype == torch.float32:
+                input_pad = input.new_empty(
+                    [get_padded_length(i) + i for i in input.shape]
+                )
+            pad_time = utils.do_bench(
+                lambda: op(input_pad, mat1_pad, mat2_pad),
+                warmup=warmup,
+                rep=rep,
+                fast_flush=True,
+            )[0]
+        else:
+            pad_time = utils.do_bench(
+                lambda: op(mat1_pad, mat2_pad), warmup=warmup, rep=rep, fast_flush=True
+            )[0]
+
+        # Shape padding introduces addtional memory ops. Based on microbenchmarks, 1.3x for
+        # aten.mm and aten.addmm and 2x for aten.bmm represent a reasonable tradeoff between
+        # performance improvement from shape padding and overhead from addtional memory ops
+        # TODO: Build a learned model which would be better than this heuristic
+        if op is torch.ops.aten.mm or op is torch.ops.aten.addmm:
+            return ori_time > pad_time * 1.3
+        else:
+            return ori_time > pad_time * 2
+
+
+@register_decomposition([aten.bmm])
+def bmm_decomp(mat1, mat2):
+    if (
+        config.shape_padding
+        and check_device_dtype(mat1, mat2)
+        and should_pad_bench(mat1, mat2, torch.ops.aten.bmm)
+    ):
+        m_padded_length = get_padded_length(mat1.shape[1])
+        k_padded_length = get_padded_length(mat1.shape[2])
+        n_padded_length = get_padded_length(mat2.shape[2])
+
+        if k_padded_length != 0:
+            mat1 = pad_dim(mat1, k_padded_length, 2)
+            mat2 = pad_dim(mat2, k_padded_length, 1)
+            return torch.ops.aten.bmm(mat1, mat2)
+        elif m_padded_length != 0:
+            mat1 = pad_dim(mat1, m_padded_length, 1)
+            return torch.ops.aten.bmm(mat1, mat2)[:, :-m_padded_length, :].contiguous()
+        elif n_padded_length != 0:
+            mat2 = pad_dim(mat2, n_padded_length, 2)
+            return torch.ops.aten.bmm(mat1, mat2)[:, :, :-n_padded_length].contiguous()
+
+    return NotImplemented  # go directly to lowering
 
 
 @register_decomposition([aten.rsqrt])
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index 5bfda50dd6f7e..08e95b9b5cc34 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -57,6 +57,83 @@ def conditional_product(*args):
     return functools.reduce(operator.mul, [x for x in args if x])
 
 
+def do_bench(
+    fn,
+    warmup=25,
+    rep=100,
+    grad_to_none=None,
+    percentiles=(0.5, 0.2, 0.8),
+    record_clocks=False,
+    fast_flush=False,
+):
+    """
+    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
+    the 20-th and 80-th performance percentile.
+
+    :param fn: Function to benchmark
+    :type fn: Callable
+    :param warmup: Warmup time (in ms)
+    :type warmup: int
+    :param rep: Repetition time (in ms)
+    :type rep: int
+    :param grad_to_none: Reset the gradient of the provided tensor to None
+    :type grad_to_none: torch.tensor, optional
+    :param percentiles: Performance percentile to return in addition to the median.
+    :type percentiles: list[float]
+    :param fast_flush: Use faster kernel to flush L2 between measurements
+    :type fast_flush: bool
+    """
+
+    # Estimate the runtime of the function
+    fn()
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+    for _ in range(5):
+        fn()
+    end_event.record()
+    torch.cuda.synchronize()
+    estimate_ms = start_event.elapsed_time(end_event) / 5
+    # compute number of warmup and repeat
+    n_warmup = max(1, int(warmup / estimate_ms))
+    n_repeat = max(1, int(rep / estimate_ms))
+    # We maintain a buffer of 256 MB that we clear
+    # before each kernel call to make sure that the L2
+    # doesn't contain any input data before the run
+    start_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)]
+    end_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)]
+    if fast_flush:
+        cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
+    else:
+        cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
+    # Warm-up
+    for _ in range(n_warmup):
+        fn()
+    # Benchmark
+    for i in range(n_repeat):
+        # we don't want `fn` to accumulate gradient values
+        # if it contains a backward pass. So we clear the
+        # provided gradients
+        if grad_to_none is not None:
+            for x in grad_to_none:
+                x.grad = None
+        # we clear the L2 cache before each run
+        cache.zero_()
+        # record time of `fn`
+        start_event[i].record()
+        fn()
+        end_event[i].record()
+    # Record clocks
+    torch.cuda.synchronize()
+    times = torch.tensor([s.elapsed_time(e) for s, e in zip(start_event, end_event)])
+    if percentiles:
+        percentiles = torch.quantile(times, torch.tensor(percentiles)).tolist()
+        return tuple(percentiles)
+    else:
+        return torch.mean(times).item()
+
+
 def sympy_product(it):
     return functools.reduce(operator.mul, it, sympy.Integer(1))
 

From 4bbfca96b691c671c24f237cbcad69c7c8fb16ce Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 15 Nov 2022 03:32:00 +0000
Subject: [PATCH 0890/1922] [vision hash update] update the pinned vision hash
 (#89026)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89026
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index b9eda365de0c5..c9bfe60001af3 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-deba056203d009fec6b58afb9fa211f6ee3328c8
+b1f6c9e271368cd84837522af39e68dd4b5768a7

From 81d76b9e7acafb1bd67d7ed0ed13a59f6520061c Mon Sep 17 00:00:00 2001
From: Everton Constantino <everton.constantino@linaro.org>
Date: Tue, 15 Nov 2022 04:10:49 +0000
Subject: [PATCH 0891/1922] Changing the use from ASSERT_EQ to ASSERT_FLOAT_EQ
 on nn_utils test. (#83693)

Changing the use from ASSERT_EQ to ASSERT_FLOAT_EQ on nn_utils.cpp:ClipGradNorm as this is the proper way to compare equality between floating point values. This avoids `test_api` ClipGradNorm failing for WoA.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/83693
Approved by: https://github.com/ngimel, https://github.com/kit1980
---
 test/cpp/api/nn_utils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cpp/api/nn_utils.cpp b/test/cpp/api/nn_utils.cpp
index 3d24749a96532..76aab44ac290d 100644
--- a/test/cpp/api/nn_utils.cpp
+++ b/test/cpp/api/nn_utils.cpp
@@ -92,7 +92,7 @@ TEST_F(NNUtilsTest, ClipGradNorm) {
     ASSERT_LE(norm_after, max_norm);
     auto scaled = compare_scaling(grads);
     ASSERT_NEAR(0, scaled.std().item().toFloat(), 1e-7);
-    ASSERT_EQ(scaled[0].item().toFloat(), 1);
+    ASSERT_FLOAT_EQ(scaled[0].item().toFloat(), 1);
   }
   // should accept a single tensor as input
   auto p1 = torch::randn({10, 10});

From 4b2946b5e68bcd5d6125ee6078e8eecbe9e1cf73 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Tue, 15 Nov 2022 00:21:52 +0000
Subject: [PATCH 0892/1922] [WIP] Unwrap View in Reinterpret View (#89016)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89016
Approved by: https://github.com/ngimel
---
 torch/_inductor/ir.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index fdc10c9ca16a7..8327fe0d7b521 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -998,11 +998,11 @@ def as_storage_and_layout(x, freeze=True, want_contiguous=False, stride_order=No
                 x.data.decide_layout()
         return x, x.data.layout
     if isinstance(x, ReinterpretView):
+        # making the base of x contiguous or stride_ordered will not necessarily make
+        # the ReinterpretedView either, so dont pass along those arguments
         buffer, _ = as_storage_and_layout(
             x.data,
             freeze=freeze,
-            want_contiguous=want_contiguous,
-            stride_order=stride_order,
         )
         return buffer, x.layout
     raise NotImplementedError
@@ -1402,6 +1402,10 @@ class ReinterpretView(BaseView):
 
     layout: "Layout"
 
+    def __post_init__(self):
+        if isinstance(self.data, BaseView):
+            self.data = self.data.unwrap_view()
+
     def __str__(self):
         return self.str_helper(
             [

From c678be28a72f2f3ba3049d9837427318d70dcf52 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 15 Nov 2022 05:08:26 +0000
Subject: [PATCH 0893/1922] Add a mode to rerun all disabled tests (without
 running anything else) (#88646)

Rerun all disabled test to gather their latest result so that we can close disabled tickets automatically. When running under this mode (RERUN_DISABLED_TESTS=true), only disabled tests are run while the rest are skipped `<skipped message="Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run" type="skip"/>`

The logic is roughly as follows, the test runs multiple times (n=50)

* If the disabled test passes, and it's flaky, do nothing because it's still flaky.  In the test report, we'll see the test passes with the following skipped message:
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
    <skipped message="{&quot;flaky&quot;: True, &quot;num_red&quot;: 4, &quot;num_green&quot;: 0, &quot;max_num_retries&quot;: 3, &quot;rerun_disabled_test&quot;: true}" type="skip"/>
</testcase>
```

* If the disabled test passes every single time, and it is not flaky anymore, mark it so that it can be closed later.  We will see the test runs and passes, i.e.
```
<testcase classname="TestCommonCUDA" name="test_out_warning_linalg_lu_factor_cuda" time="0.170" file="test_ops.py" />
```

* If the disabled test fails after all retries, this is also expected. So only report this but don't fail the job (because we don't care about red signals here), we'll see the test is skipped (without the `flaky` field), i.e.
```
<testcase classname="TestMultiprocessing" file="test_multiprocessing.py" line="357" name="test_fs" time="0.000" timestamp="0001-01-01T00:00:00">
    <skipped message="{&quot;num_red&quot;: 4, &quot;num_green&quot;: 0, &quot;max_num_retries&quot;: 3, &quot;rerun_disabled_test&quot;: true}" type="skip"/>
</testcase>
```

This runs at the same schedule as `mem_leak_check` (daily).  The change to update test stats, and (potentially) grouping on HUD will come in separated PRs.

### Testing

* pull https://github.com/pytorch/pytorch/actions/runs/3447434434
* trunk https://github.com/pytorch/pytorch/actions/runs/3447434928
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88646
Approved by: https://github.com/clee2000
---
 .github/scripts/filter_test_configs.py      | 27 +++++++-
 .github/scripts/test_filter_test_configs.py | 30 ++++++++-
 .github/workflows/_linux-test.yml           |  4 +-
 .github/workflows/_mac-test.yml             |  3 +-
 .github/workflows/_rocm-test.yml            |  4 +-
 .github/workflows/_win-test.yml             |  3 +-
 test/run_test.py                            |  5 +-
 test/test_dataloader.py                     |  3 +
 test/test_indexing.py                       |  7 +-
 torch/testing/_internal/common_utils.py     | 72 +++++++++++++++++++--
 10 files changed, 143 insertions(+), 15 deletions(-)

diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
index 06c8f90441eb9..bb5314434e077 100755
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@@ -34,6 +34,13 @@
     "xla",
 }}
 
+# Supported modes when running periodically
+SUPPORTED_PERIODICAL_MODES = {
+    "mem_leak_check",
+    "rerun_disabled_tests",
+}
+
+
 def parse_args() -> Any:
     from argparse import ArgumentParser
     parser = ArgumentParser("Filter all test configurations and keep only requested ones")
@@ -109,6 +116,23 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis
         return filtered_test_matrix
 
 
+def set_periodic_modes(test_matrix: Dict[str, List[Any]]) -> Dict[str, List[Any]]:
+    """
+    Apply all periodic modes when running under a schedule
+    """
+    scheduled_test_matrix: Dict[str, List[Any]] = {
+        "include": [],
+    }
+
+    for config in test_matrix.get("include", []):
+        for mode in SUPPORTED_PERIODICAL_MODES:
+            cfg = config.copy()
+            cfg[mode] = mode
+            scheduled_test_matrix["include"].append(cfg)
+
+    return scheduled_test_matrix
+
+
 def set_output(name: str, val: Any) -> None:
     if os.getenv("GITHUB_OUTPUT"):
         with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
@@ -163,8 +187,7 @@ def main() -> None:
         filtered_test_matrix = test_matrix
 
     if args.event_name == "schedule":
-        for config in filtered_test_matrix.get("include", []):
-            config["mem_leak_check"] = "mem_leak_check"
+        filtered_test_matrix = set_periodic_modes(filtered_test_matrix)
 
     # Set the filtered test matrix as the output
     set_output("test-matrix", json.dumps(filtered_test_matrix))
diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py
index a043a35355431..55410e846c972 100755
--- a/.github/scripts/test_filter_test_configs.py
+++ b/.github/scripts/test_filter_test_configs.py
@@ -4,7 +4,14 @@
 import yaml
 import json
 from unittest import TestCase, main, mock
-from filter_test_configs import get_labels, filter, PREFIX, VALID_TEST_CONFIG_LABELS
+from filter_test_configs import (
+    get_labels,
+    filter,
+    set_periodic_modes,
+    PREFIX,
+    VALID_TEST_CONFIG_LABELS,
+    SUPPORTED_PERIODICAL_MODES
+)
 import requests
 from requests.models import Response
 from typing import Any, Dict
@@ -86,5 +93,26 @@ def test_filter_with_valid_label(self) -> None:
             self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
 
 
+    def test_set_periodic_modes(self) -> None:
+        testcases = [
+            {
+                "test_matrix": "{include: []}",
+                "description": "Empty test matrix",
+            },
+            {
+                "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", runner: "macos"}]}',
+                "descripion": "Replicate each periodic mode in a different config",
+            },
+        ]
+
+        for case in testcases:
+            test_matrix = yaml.safe_load(case["test_matrix"])
+            scheduled_test_matrix = set_periodic_modes(test_matrix)
+            self.assertEqual(
+                len(test_matrix["include"]) * len(SUPPORTED_PERIODICAL_MODES),
+                len(scheduled_test_matrix["include"])
+            )
+
+
 if __name__ == '__main__':
     main()
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index dc1346205e636..6ad30080fd649 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -115,7 +115,8 @@ jobs:
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         timeout-minutes: 240
         run: |
           set -x
@@ -170,6 +171,7 @@ jobs:
             -e XLA_CUDA \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
+            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --security-opt seccomp=unconfined \
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index 82dee7b548412..cbc3372e1c42b 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -129,7 +129,8 @@ jobs:
       - name: Test
         id: test
         env:
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         run: |
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
 
diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml
index 0d8ff874ba034..dd1a0830275cd 100644
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@@ -97,7 +97,8 @@ jobs:
           DOCKER_IMAGE: ${{ inputs.docker-image }}
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
           PYTORCH_JIT_ENABLE_NVFUSER: 1
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         timeout-minutes: 270
         run: |
           set -x
@@ -148,6 +149,7 @@ jobs:
             -e SCCACHE_BUCKET \
             -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
             -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
+            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --security-opt seccomp=unconfined \
diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index a0047abbc0f55..0cabb8ec469aa 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -124,7 +124,8 @@ jobs:
           TEST_CONFIG: ${{ matrix.config }}
           PR_BODY: ${{ github.event.pull_request.body }}
           TORCH_CUDA_ARCH_LIST: "7.0"
-          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0'}}
+          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
+          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         run: |
           COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
 
diff --git a/test/run_test.py b/test/run_test.py
index 59454c6aaa3f6..1273ab45c4fbc 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -439,8 +439,11 @@ def run_test(
     if options.pytest:
         unittest_args = [arg if arg != "-f" else "-x" for arg in unittest_args]
     elif IS_CI:
+        ci_args = ["--import-slow-tests", "--import-disabled-tests"]
+        if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1":
+            ci_args.append("--rerun-disabled-tests")
         # use the downloaded test cases configuration, not supported in pytest
-        unittest_args.extend(["--import-slow-tests", "--import-disabled-tests"])
+        unittest_args.extend(ci_args)
 
     # Extra arguments are not supported with pytest
     executable = get_executable_command(
diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 6a7ff90527d3d..347f9be73e8b9 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -2716,6 +2716,9 @@ def __getitem__(self, index):
 
 
 @unittest.skipIf(IS_WINDOWS, "Needs fork")
+@unittest.skipIf(
+    TEST_WITH_ASAN,
+    "This test hangs when running with ASAN, see https://github.com/pytorch/pytorch/issues/75492")
 class TestConvAfterFork(TestCase):
     # Tests crash reported in https://github.com/pytorch/pytorch/issues/53565
     def test_conv_after_fork(self):
diff --git a/test/test_indexing.py b/test/test_indexing.py
index 1d5f2ea68ac21..5dc23a3d54653 100644
--- a/test/test_indexing.py
+++ b/test/test_indexing.py
@@ -11,7 +11,8 @@
 import numpy as np
 
 from torch.testing import make_tensor
-from torch.testing._internal.common_utils import TestCase, run_tests
+from torch.testing._internal.common_utils import (
+    TestCase, run_tests, TEST_WITH_TORCHDYNAMO)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, dtypes, dtypesIfCPU, dtypesIfCUDA,
     onlyNativeDeviceTypes)
@@ -737,6 +738,10 @@ def test_byte_mask_accumulate(self, device):
             self.assertEqual(y, torch.ones(size=(10, 10), device=device))
             self.assertEqual(len(w), 2)
 
+    @unittest.skipIf(
+        TEST_WITH_TORCHDYNAMO,
+        "This test causes SIGKILL when running with dynamo, https://github.com/pytorch/pytorch/issues/88472"
+    )
     def test_index_put_accumulate_large_tensor(self, device):
         # This test is for tensors with number of elements >= INT_MAX (2^31 - 1).
         N = (1 << 31) + 5
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 8f497d515eb5d..e0b703046c542 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -107,7 +107,6 @@
 RETRY_TEST_CASES = os.getenv('PYTORCH_RETRY_TEST_CASES') == '1'
 OVERRIDE_FLAKY_SIGNAL = os.getenv('PYTORCH_OVERRIDE_FLAKY_SIGNAL') == '1'
 DISABLE_RUNNING_SCRIPT_CHK = os.getenv('PYTORCH_DISABLE_RUNNING_SCRIPT_CHK') == '1'
-MAX_NUM_RETRIES = 3
 
 DEFAULT_DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json'
 DEFAULT_SLOW_TESTS_FILE = '.pytorch-slow-tests.json'
@@ -506,6 +505,7 @@ def _get_test_report_path():
 parser.add_argument('--run-parallel', type=int, default=1)
 parser.add_argument('--import-slow-tests', type=str, nargs='?', const=DEFAULT_SLOW_TESTS_FILE)
 parser.add_argument('--import-disabled-tests', type=str, nargs='?', const=DEFAULT_DISABLED_TESTS_FILE)
+parser.add_argument('--rerun-disabled-tests', action='store_true')
 
 # Only run when -h or --help flag is active to display both unittest and parser help messages.
 def run_unittest_help(argv):
@@ -527,6 +527,9 @@ def run_unittest_help(argv):
     # infer flags based on the default settings
     GRAPH_EXECUTOR = cppProfilingFlagsToProfilingMode()
 
+RERUN_DISABLED_TESTS = args.rerun_disabled_tests
+# Rerun disabled tests many more times to make sure that they are not flaky anymore
+MAX_NUM_RETRIES = 3 if not RERUN_DISABLED_TESTS else 50
 
 SLOW_TESTS_FILE = args.import_slow_tests
 DISABLED_TESTS_FILE = args.import_disabled_tests
@@ -1653,6 +1656,9 @@ def check_if_enable(test: unittest.TestCase):
             raise unittest.SkipTest("test is slow; run with PYTORCH_TEST_WITH_SLOW to enable test")
     sanitized_test_method_name = remove_device_and_dtype_suffixes(test._testMethodName)
     if not IS_SANDCASTLE:
+        should_skip = False
+        skip_msg = ""
+
         for disabled_test, (issue_url, platforms) in disabled_tests_dict.items():
             disable_test_parts = disabled_test.split()
             if len(disable_test_parts) > 1:
@@ -1687,11 +1693,22 @@ def check_if_enable(test: unittest.TestCase):
                         platforms = list(filter(lambda p: p in platform_to_conditional, platforms))
 
                     if platforms == [] or any([platform_to_conditional[platform] for platform in platforms]):
+                        should_skip = True
                         skip_msg = f"Test is disabled because an issue exists disabling it: {issue_url}" \
                             f" for {'all' if platforms == [] else ''}platform(s) {', '.join(platforms)}. " \
                             "If you're seeing this on your local machine and would like to enable this test, " \
                             "please make sure CI is not set and you are not using the flag --import-disabled-tests."
-                        raise unittest.SkipTest(skip_msg)
+                        break
+
+        if should_skip and not RERUN_DISABLED_TESTS:
+            # Skip the disabled test when not running under --rerun-disabled-tests verification mode
+            raise unittest.SkipTest(skip_msg)
+
+        if not should_skip and RERUN_DISABLED_TESTS:
+            skip_msg = "Test is enabled but --rerun-disabled-tests verification mode is set, so only" \
+                " disabled tests are run"
+            raise unittest.SkipTest(skip_msg)
+
     if TEST_SKIP_FAST:
         if not getattr(test, test._testMethodName).__dict__.get('slow_test', False):
             raise unittest.SkipTest("test is fast; we disabled it with PYTORCH_TEST_SKIP_FAST")
@@ -2039,9 +2056,48 @@ def wrap_with_cuda_memory_check(self, method):
     def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_red=0, num_green=0):
         using_unittest = isinstance(result, unittest.TestResult)
         if num_runs_left == 0:
+            # The logic when RERUN_DISABLED_TESTS is set to true is as follows:
+            # |-if the disabled test passes:
+            # |-- if it's flaky:
+            # |---  Do nothing because it's still flaky
+            # |-- elif it isn't flaky anymore:
+            # |---  Close the disabled ticket (later)
+            # |
+            # |- elif the disabled test fails after n retries:
+            # |--  This is expected, report this but don't fail the job
+            skipped_msg = {
+                "num_red": num_red,
+                "num_green": num_green,
+                "max_num_retries": MAX_NUM_RETRIES,
+                "rerun_disabled_test": RERUN_DISABLED_TESTS,
+            }
+
+            traceback_str = ""
+            if RERUN_DISABLED_TESTS and using_unittest:
+                # Hide all failures and errors when RERUN_DISABLED_TESTS is enabled. This is
+                # a verification check, we don't want more red signals coming from it
+                if result.failures:
+                    _, traceback_str = result.failures.pop(-1)
+                if result.errors:
+                    _, traceback_str = result.errors.pop(-1)
+
+                if traceback_str:
+                    skipped_msg["traceback_str"] = traceback_str
+
+                if num_green == 0:
+                    # The disabled test fails, report as skipped but don't fail the job
+                    result.addSkip(self, json.dumps(skipped_msg))
+
+                if num_red == 0:
+                    # The test passes after re-running multiple times. This acts as a signal
+                    # to confirm that it's not flaky anymore
+                    result.addSuccess(self)
+
             if num_green > 0 and num_red > 0 and using_unittest:
-                result.addSkip(self, f'{{"flaky": {True}, "num_red": {num_red}, "num_green": {num_green},' +
-                                     f'"max_num_retries": {MAX_NUM_RETRIES}}}')
+                skipped_msg["flaky"] = True
+                # Still flaky, do nothing
+                result.addSkip(self, json.dumps(skipped_msg))
+
             return
 
         if using_unittest:
@@ -2100,9 +2156,13 @@ def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_re
                 result.addExpectedFailure(self, err)
             self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
                                  num_red=num_red + 1, num_green=num_green)
-        elif report_only and num_retries_left < MAX_NUM_RETRIES:
+        elif (RERUN_DISABLED_TESTS or report_only) and num_retries_left < MAX_NUM_RETRIES:
+            # Always re-run up to MAX_NUM_RETRIES when running under report only or rerun disabled tests modes
             print(f"    {self._testMethodName} succeeded - num_retries_left: {num_retries_left}")
-            result.addUnexpectedSuccess(self)
+            if RERUN_DISABLED_TESTS:
+                result.addSuccess(self)
+            else:
+                result.addUnexpectedSuccess(self)
             self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
                                  num_red=num_red, num_green=num_green + 1)
         elif not report_only and num_retries_left < MAX_NUM_RETRIES:

From 474092afb3ea66aee2debd1216f49d121dc9582e Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Tue, 15 Nov 2022 06:13:15 +0000
Subject: [PATCH 0894/1922] [BE] [c10d][send] Improve error message on
 dist.send() with destination rank as itself (#89004)

This improves error msg on dist.send() and add corresponding test in test_c10d_common.py(https://github.com/pytorch/pytorch/blob/master/test/distributed/test_c10d_common.py).
Context in issue#83912: https://github.com/pytorch/pytorch/issues/83912

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89004
Approved by: https://github.com/H-Huang
---
 test/distributed/test_c10d_common.py  | 3 +++
 torch/distributed/distributed_c10d.py | 9 ++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 77ee7487a0afa..a43b1343923c8 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1427,6 +1427,9 @@ def test_send_recv(self):
         dist.send(input_tensor, (self.rank + 1) % self.world_size)
         self.assertEqual(input_tensor, torch.zeros(2, 2) + 1)
 
+        with self.assertRaises(ValueError):
+            dist.send(input_tensor, dist.get_rank())
+
         # test recv
         input_tensor = torch.zeros(2, 2)
         dist.recv(input_tensor, (self.rank + 1) % self.world_size)
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 4a132d141e00a..33569f5169e5d 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -1179,12 +1179,19 @@ def send(tensor: torch.Tensor, dst: int, group: Optional[ProcessGroup] = None, t
 
     Args:
         tensor (Tensor): Tensor to send.
-        dst (int): Destination rank.
+        dst (int): Destination rank. Destination rank should not be the same
+        as the rank of the current process.
         group (ProcessGroup, optional): The process group to work on. If None,
             the default process group will be used.
         tag (int, optional): Tag to match send with remote recv
 
     """
+    if get_rank() == dst:
+        raise ValueError(
+            "Invalid destination rank: destination rank should not be the same as "
+            "the rank of the current process."
+        )
+
     _check_single_tensor(tensor, "tensor")
     if _rank_not_in_group(group):
         _warn_not_in_group("send")

From a6a9f34e767df6f0cb9c3172df785d31e6fd41ab Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Tue, 15 Nov 2022 06:16:13 +0000
Subject: [PATCH 0895/1922] enable index_put test (#89019)

Per title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89019
Approved by: https://github.com/desertfire
---
 test/inductor/test_torchinductor_opinfo.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 3880b87c082cb..4e706bab0ea6c 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -168,6 +168,8 @@ def process(device_type):
     "__getitem__": {b8, f16, f32, f64, i32, i64},
     "addr": {f16},
     "allclose": {f16, f32, f64},
+    "amax": {f16},
+    "amin": {f16},
     "angle": {f16, f32, f64},
     "argwhere": {b8, f16, f32, f64, i32, i64},
     "bernoulli": {f32, f64},
@@ -204,7 +206,6 @@ def process(device_type):
     "fft.rfft2": {f32, f64},
     "fft.rfftn": {f32, f64},
     "index_add": {f16},
-    "index_put": {f16, f32, f64},
     "index_reduce": {f16, f32, f64},
     "istft": {f32, f64},
     "linalg.eig": {f32, f64},
@@ -311,7 +312,6 @@ def process(device_type):
     "fft.rfft": {f16, f32, f64},
     "fft.rfft2": {f16, f32, f64},
     "fft.rfftn": {f16, f32, f64},
-    "index_put": {f16, f32, f64},
     "index_reduce": {f16, f32, f64},
     "istft": {f32, f64},
     "linalg.eig": {f32, f64},
@@ -441,13 +441,15 @@ def wrapper_set_seed(op, *args, **kwargs):
 inductor_all_samples = {
     "softmax.with_dtype",
     "index_add",
-    "index_put",
     "index_copy",
     "scatter_reduce.sum",
     "select_scatter",
     "squeeze",
     "unsqueeze",
     "sum",
+    "amax",
+    "amin",
+    "all",
 }
 
 
@@ -549,7 +551,6 @@ def fn(*args, **kwargs):
                         "check_gradient": requires_grad,
                     }
                     adjusted_kwargs.update(overridden_kwargs)
-
                     self.check_model_cuda(
                         fn,
                         args,

From 75242065592441537b54bad96204096c028d517f Mon Sep 17 00:00:00 2001
From: peterjc123 <peterghost86@gmail.com>
Date: Tue, 15 Nov 2022 06:36:24 +0000
Subject: [PATCH 0896/1922] Fix get_default_qat_qconfig for PT 1.13 (#88876)

See https://github.com/pytorch/pytorch/pull/84329/files#r1019916766 for more context

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88876
Approved by: https://github.com/jgong5, https://github.com/vkuzo
---
 test/quantization/core/test_top_level_apis.py | 32 +++++++++++++++++++
 torch/ao/quantization/qconfig.py              |  2 +-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/test/quantization/core/test_top_level_apis.py b/test/quantization/core/test_top_level_apis.py
index 7343a16040d25..f76db1cd4139b 100644
--- a/test/quantization/core/test_top_level_apis.py
+++ b/test/quantization/core/test_top_level_apis.py
@@ -59,3 +59,35 @@ def test_fake_quants(self) -> None:
         for observer in self.fake_quants:
             obs = self._get_observer_ins(observer)
             obs.forward(t)
+
+
+class TestQConfig(TestCase):
+
+    REDUCE_RANGE_DICT = {
+        'fbgemm': (True, False),
+        'qnnpack': (False, False),
+        'onednn': (False, False),
+        'x86': (True, False),
+    }
+
+    def test_reduce_range_qat(self) -> None:
+        for backend, reduce_ranges in self.REDUCE_RANGE_DICT.items():
+            for version in range(2):
+                qconfig = torch.ao.quantization.get_default_qat_qconfig(backend, version)
+
+                fake_quantize_activ = qconfig.activation()
+                self.assertEqual(fake_quantize_activ.activation_post_process.reduce_range, reduce_ranges[0])
+
+                fake_quantize_weight = qconfig.weight()
+                self.assertEqual(fake_quantize_weight.activation_post_process.reduce_range, reduce_ranges[1])
+
+    def test_reduce_range(self) -> None:
+        for backend, reduce_ranges in self.REDUCE_RANGE_DICT.items():
+            for version in range(1):
+                qconfig = torch.ao.quantization.get_default_qconfig(backend, version)
+
+                fake_quantize_activ = qconfig.activation()
+                self.assertEqual(fake_quantize_activ.reduce_range, reduce_ranges[0])
+
+                fake_quantize_weight = qconfig.weight()
+                self.assertEqual(fake_quantize_weight.reduce_range, reduce_ranges[1])
diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py
index b75e16ef044f4..f52bf713c6f9b 100644
--- a/torch/ao/quantization/qconfig.py
+++ b/torch/ao/quantization/qconfig.py
@@ -339,7 +339,7 @@ def get_default_qat_qconfig(backend='fbgemm', version=1):
                                                                 quant_min=0,
                                                                 quant_max=255),
                               weight=default_per_channel_weight_fake_quant)
-        if backend == 'x86':
+        elif backend == 'x86':
             qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
                                                                 quant_min=0,
                                                                 quant_max=255,

From eadcb74acb51133460aa95e40106a3280265b32b Mon Sep 17 00:00:00 2001
From: Wenzhe Xue <wenzhe.xue@intel.com>
Date: Tue, 15 Nov 2022 07:29:52 +0000
Subject: [PATCH 0897/1922] Set correct size of `attr::output_layouts` when the
 graph has multiple outputs in JIT oneDNN fuser  (#88496)

Bug:
Previously, `initOutputLayouts()` was called after creating a graph and before merging other nodes. It is a vector with one element. So when a graph contains multiple outputs, e.g. using AOTAutograd compile in my case, layout_propagation pass try to access out of range elements in the vector. Then it comes to the second bug in `useOpaqueLayout()`, the out of range checks the index with the updated output size instead of the size of the vector. Then used `[]` to access the element, which is out of range.

Fixes the above two issues:

1. check the offset is within range with the size of `attr::output_layouts` vector instead of another variable. This check catches the error now.
2. change the place to initial `attr::output_layouts` after node merging. The graph may change with node merging. Thus we moved the initialization in layout_propagation with the complete graph.

Added test time:
`Ran 1 test in 0.383s`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88496
Approved by: https://github.com/jgong5, https://github.com/sanchitintel
---
 test/test_jit_llga_fuser.py                   | 30 +++++++++++++++++
 .../csrc/jit/codegen/onednn/graph_helper.cpp  | 33 ++++++++++---------
 torch/csrc/jit/codegen/onednn/graph_helper.h  |  5 +--
 .../jit/codegen/onednn/layout_propagation.cpp |  9 +++++
 4 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/test/test_jit_llga_fuser.py b/test/test_jit_llga_fuser.py
index 4804a442c1d66..12bd955043b96 100644
--- a/test/test_jit_llga_fuser.py
+++ b/test/test_jit_llga_fuser.py
@@ -774,6 +774,36 @@ def t3(x, y):
         self.assertGraphContainsExactly(t_jit_3.graph_for(x, y), LLGA_FUSION_GROUP, 0)
 
 
+@unittest.skipIf(LLGA_NOT_ENABLED, "MKL-DNN build is disabled")
+@unittest.skip("Enable when integration with dynamo aot_autograd is more stable")
+class TestDynamoAOT(JitTestCase):
+    def test_dynamo_aot_ts_onednn(self):
+        class Seq(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.layers = nn.Sequential(
+                    nn.Linear(10, 10),
+                    nn.ReLU(),
+                    nn.Linear(10, 10),
+                    nn.ReLU(),
+                )
+
+            def forward(self, x):
+                return self.layers(x)
+
+        mod = Seq()
+
+        import torch._dynamo
+        aot_mod = torch._dynamo.optimize("aot_ts", nopython=True)(mod)
+
+        for _ in range(10):
+            with torch.jit.fuser("fuser3"):
+                loss = aot_mod(torch.rand([10, 10])).sum()
+                loss.backward()
+
+        torch._dynamo.reset()
+
+
 @unittest.skipIf(IS_AVX512_UNSUPPORTED, "This test fails for BF16 on machines without AVX512.")
 @unittest.skipIf(LLGA_NOT_ENABLED, "MKL-DNN build is disabled")
 class TestModel(JitLlgaTestCase):
diff --git a/torch/csrc/jit/codegen/onednn/graph_helper.cpp b/torch/csrc/jit/codegen/onednn/graph_helper.cpp
index a8a202acf0dac..a14dce108dd12 100644
--- a/torch/csrc/jit/codegen/onednn/graph_helper.cpp
+++ b/torch/csrc/jit/codegen/onednn/graph_helper.cpp
@@ -505,7 +505,6 @@ Node* LlgaGraphHelper::createSingletonSubgraph(Node* n, AliasDb& aliasDb) {
   auto group = SubgraphUtils::createSingletonSubgraphAndUpdateAliasing(
       n, prim::oneDNNFusionGroup, aliasDb);
   opToOwningPartition_.add(group, partitionId);
-  LlgaNodeWrapper(group).initOutputLayouts();
   return group;
 }
 
@@ -585,25 +584,29 @@ LlgaNodeWrapper::LlgaNodeWrapper(const Node* node)
 }
 
 void LlgaNodeWrapper::setOpaqueLayout(size_t offset) {
-  TORCH_CHECK(offset < n->outputs().size(), "Invalid output offset ", offset);
+  const auto num_output = n->is(attr::output_layouts).size();
+  TORCH_CHECK(
+      offset < num_output,
+      "Out of range. (Invalid index ",
+      offset,
+      " for attr::output_layouts with size ",
+      num_output,
+      ")");
   auto& layouts =
       const_cast<std::vector<int64_t>&>(n->is(attr::output_layouts)); // NOLINT
-  layouts.at(offset) = 1;
+  layouts.at(offset) = OPAQUE_LAYOUT;
 }
 
 bool LlgaNodeWrapper::useOpaqueLayout(size_t offset) const {
-  TORCH_CHECK(offset < n->outputs().size(), "Invalid output offset ", offset);
-  return n->is(attr::output_layouts)[offset] == 1;
-}
-
-void LlgaNodeWrapper::initOutputLayouts() {
-  if (n->hasAttribute(attr::output_layouts)) {
-    return;
-  }
-
-  // Init all output layouts as undef
-  std::vector<int64_t> layouts(n->outputs().size(), 0);
-  n->is_(attr::output_layouts, layouts);
+  const auto num_output = n->is(attr::output_layouts).size();
+  TORCH_CHECK(
+      offset < num_output,
+      "Out of range. (Invalid index ",
+      offset,
+      " for attr::output_layouts with size ",
+      num_output,
+      ")");
+  return n->is(attr::output_layouts)[offset] == OPAQUE_LAYOUT;
 }
 
 } // namespace onednn
diff --git a/torch/csrc/jit/codegen/onednn/graph_helper.h b/torch/csrc/jit/codegen/onednn/graph_helper.h
index 5422a90d9e97b..fbb5eaa84aec7 100644
--- a/torch/csrc/jit/codegen/onednn/graph_helper.h
+++ b/torch/csrc/jit/codegen/onednn/graph_helper.h
@@ -10,6 +10,9 @@ namespace jit {
 namespace fuser {
 namespace onednn {
 
+#define STRIDED_LAYOUT 0
+#define OPAQUE_LAYOUT 1
+
 struct OpPartitionMap {
   void add(uint64_t opId, uint64_t partitionId) {
     opmap_[opId] = partitionId;
@@ -92,8 +95,6 @@ class LlgaNodeWrapper {
   friend class LlgaGraphHelper;
 
  private:
-  void initOutputLayouts();
-
   Node* n;
 };
 
diff --git a/torch/csrc/jit/codegen/onednn/layout_propagation.cpp b/torch/csrc/jit/codegen/onednn/layout_propagation.cpp
index 448e1cf858849..4201282fb083b 100644
--- a/torch/csrc/jit/codegen/onednn/layout_propagation.cpp
+++ b/torch/csrc/jit/codegen/onednn/layout_propagation.cpp
@@ -1,5 +1,6 @@
 #include <torch/csrc/jit/codegen/onednn/graph_helper.h>
 #include <torch/csrc/jit/codegen/onednn/layout_propagation.h>
+#include <torch/csrc/jit/jit_log.h>
 
 namespace torch {
 namespace jit {
@@ -10,6 +11,14 @@ void LayoutPropagation(Node* n) {
   if (!LlgaGraphHelper::isLlgaSubgraph(n))
     return;
 
+  // initial attr::output_layouts if undefined
+  if (!n->hasAttribute(attr::output_layouts)) {
+    const auto num_output = n->outputs().size();
+    GRAPH_DEBUG("Initial output_layouts of size ", num_output);
+    std::vector<int64_t> layouts(num_output, STRIDED_LAYOUT);
+    n->is_(attr::output_layouts, layouts);
+  }
+
   for (auto input : n->inputs()) {
     auto prev = input->node();
     auto offset = input->offset();

From 2f27ea95cb790cb0c94b5baf1fba114c265b09b5 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 15 Nov 2022 09:37:09 +0000
Subject: [PATCH 0898/1922] Revert "Add mem efficient backward (#88856)"

This reverts commit 35e668b5ced25e735b6e523d557ed7fd60267914.

Reverted https://github.com/pytorch/pytorch/pull/88856 on behalf of https://github.com/DanilBaibak due to breaking internal builds
---
 aten/src/ATen/native/native_functions.yaml    |   5 -
 .../native/transformers/cuda/attention.cu     |  16 +-
 .../transformers/cuda/attention_backward.cu   | 261 ------------------
 .../transformers/cuda/flash_attn/fmha_api.cpp |   4 -
 .../attention_backward_generic.cu             | 166 +++++++++++
 .../attention_forward_generic.cu              | 232 ++++++++++++++++
 .../cuda/mem_eff_attention/find_default_mma.h |   7 +-
 .../cuda/mem_eff_attention/kernel_backward.h  | 250 ++++++-----------
 .../ATen/native/transformers/cuda/sdp_utils.h |  12 +-
 test/test_transformers.py                     |  44 +--
 tools/autograd/derivatives.yaml               |   7 +-
 .../_internal/common_methods_invocations.py   |   4 +-
 12 files changed, 507 insertions(+), 501 deletions(-)
 delete mode 100644 aten/src/ATen/native/transformers/cuda/attention_backward.cu
 create mode 100644 aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu
 create mode 100644 aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 9572ccc56653d..de087c0b8a896 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -13287,11 +13287,6 @@
   dispatch:
     CUDA: _efficient_attention_forward
 
-- func: _efficient_attention_backward(Tensor grad, Tensor query, Tensor key, Tensor value, Tensor logsumexp, Tensor out, bool is_causal=False) -> (Tensor, Tensor, Tensor)
-  variants: function
-  dispatch:
-    CUDA: _efficient_attention_backward
-
 - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
   variants: function
   dispatch:
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index 46543d4663fab..f65fedd6d7954 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -746,9 +746,7 @@ std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
 std::tuple<Tensor, Tensor> mem_eff_helper(
     const Tensor& query,
     const Tensor& key,
-    const Tensor& value,
-    bool compute_log_sumexp,
-    bool is_causal) {
+    const Tensor& value){
   // Query -> Query(Batch x Q_seq_len x Num_heads x Dim_per_head)
   // Key   -> Key(Batch x KV_seq_len x Num_heads x Dim_per_head)
   // Value -> Value(Batch x KV_seq_len x  Num_heads x Dim_per_head)
@@ -756,18 +754,16 @@ std::tuple<Tensor, Tensor> mem_eff_helper(
   Tensor k_t = key.transpose(1, 2);
   Tensor v_t = value.transpose(1, 2);
 
-  Tensor attention, log_sumexp;
-  std::tie(attention, log_sumexp) = at::_efficient_attention_forward(
+  Tensor attention = std::get<0>(at::_efficient_attention_forward(
       q_t,
       k_t,
       v_t,
       c10::nullopt,
       c10::nullopt,
       c10::nullopt,
-      compute_log_sumexp,
-      is_causal);
-  attention = attention.transpose(1,2);
-  return std::make_tuple(std::move(attention), Tensor());
+      false,
+      false)).transpose(1,2);
+  return std::make_tuple(attention, Tensor());
 }
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
@@ -780,7 +776,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
       case sdp::SDPBackend::flash_attention:
           return flash_attention_helper_dense_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
       case sdp::SDPBackend::efficient_attention:
-          return mem_eff_helper(query_, key , value, need_attn_weights, is_causal);
+          return mem_eff_helper(query_, key , value);
       case sdp::SDPBackend::math:
         return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
       default:
diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
deleted file mode 100644
index af005b2669b29..0000000000000
--- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu
+++ /dev/null
@@ -1,261 +0,0 @@
-#include <type_traits>
-
-#include <ATen/ATen.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAMathCompat.h>
-
-#include <ATen/native/nested/NestedTensorTransformerFunctions.h>
-#include <ATen/native/nested/NestedTensorUtils.h>
-#include <ATen/native/transformers/attention.h>
-#include <ATen/native/transformers/cuda/sdp_utils.h>
-
-#ifdef USE_FLASH_ATTENTION
-#include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
-#endif
-
-#define ASSIGN_CHECK_OVERFLOW(A, B)                                            \
-  {                                                                            \
-    A = B;                                                                     \
-    TORCH_CHECK(B < std::numeric_limits<decltype(A)>::max(), #B " overflows"); \
-  }
-
-#define DISPATCH_MAXK(func)                                   \
-  {                                                           \
-    const auto maxK = std::max(query.size(3), value.size(3)); \
-    if (maxK <= 64) {                                         \
-      constexpr int kMaxK = 64;                               \
-      func();                                                 \
-    } else if (maxK <= 128) {                                 \
-      constexpr int kMaxK = 128;                              \
-      func();                                                 \
-    } else {                                                  \
-      constexpr int kMaxK = std::numeric_limits<int>::max();  \
-      func();                                                 \
-    }                                                         \
-  }
-
-#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC)                               \
-  {                                                                            \
-    cudaDeviceProp* properties =                                               \
-        at::cuda::getDeviceProperties(QUERY.device().index());                 \
-    const int computeCapability = properties->major * 10 + properties->minor;  \
-    DISPATCH_MAXK(([&] {                                                       \
-      DISPATCH_TYPES(                                                          \
-          QUERY, ([&]() {                                                      \
-            DISPATCH_ARCHTAG(                                                  \
-                computeCapability, ([&]() {                                    \
-                  using AlignedAK =                                            \
-                      AttentionBackwardKernel<ArchTag, scalar_t, true, kMaxK>; \
-                  bool isAligned =                                             \
-                      (QUERY.stride(2) % AlignedAK::kOptimalAlignement == 0 && \
-                       KEY.stride(2) % AlignedAK::kOptimalAlignement == 0 &&   \
-                       VALUE.stride(2) % AlignedAK::kOptimalAlignement == 0);  \
-                  DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {                \
-                                  using Kernel = AttentionBackwardKernel<      \
-                                      ArchTag,                                 \
-                                      scalar_t,                                \
-                                      kIsAligned,                              \
-                                      kMaxK>;                                  \
-                                  FUNC();                                      \
-                                }))                                            \
-                }))                                                            \
-          }))                                                                  \
-    }));                                                                       \
-  }
-
-namespace at {
-
-namespace native {
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
-    const at::Tensor& grad_out_,
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const at::Tensor& logsumexp,
-    const at::Tensor& out,
-    bool causal) {
-  #if defined(USE_FLASH_ATTENTION)
-  if (!grad_out_.defined()) {
-    return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
-  }
-    // ndim
-  TORCH_CHECK(query.dim() == grad_out_.dim());
-  TORCH_CHECK(query.dim() == key.dim());
-  TORCH_CHECK(query.dim() == value.dim());
-  TORCH_CHECK(query.dim() == 4);
-
-  // batch size
-  TORCH_CHECK(query.size(0) == grad_out_.size(0));
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // seqlen
-  TORCH_CHECK(key.size(1) == value.size(1));
-  TORCH_CHECK(query.size(1) == grad_out_.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) == key.size(2));
-  TORCH_CHECK(query.size(2) == value.size(2));
-  TORCH_CHECK(query.size(2) == grad_out_.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-  TORCH_CHECK(value.size(3) == grad_out_.size(3));
-
-  // handle potentially non-contiguous grad_out through a copy
-  auto grad_out = grad_out_.contiguous();
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
-
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  at::cuda::CUDAGuard device_guard(query.device());
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t nH = query.size(2);
-  int64_t K = query.size(3);
-
-  // It does not make sense to use that in practice,
-  // but let's still make sure we are correct
-  // As we iterate through keys first, we skip
-  // keys with no query associated, so they are not
-  // initialized
-  bool grad_kv_needs_init = causal && N > M;
-  at::Tensor grad_q, grad_k, grad_v;
-  if (!grad_kv_needs_init && query.size(1) == key.size(1) &&
-      query.size(3) == value.size(3) &&
-      query.storage().is_alias_of(key.storage()) &&
-      query.storage().is_alias_of(value.storage())) {
-    // Create one big contiguous chunk
-    // This is because q, k and v usually come from a single
-    // output of a linear layer that is chunked.
-    // Creating the gradients with the right layout saves us
-    // a `torch.cat` call in the backward pass
-    at::Tensor chunk = at::empty({B, M, 3, nH, K}, query.options());
-    grad_q = chunk.select(2, 0);
-    grad_k = chunk.select(2, 1);
-    grad_v = chunk.select(2, 2);
-  } else {
-    grad_q = at::empty_like(query);
-    grad_k = grad_kv_needs_init ? at::zeros_like(key) : at::empty_like(key);
-    grad_v = grad_kv_needs_init ? at::zeros_like(value) : at::empty_like(value);
-  }
-
-  auto launchKernel = [&](auto _k, int computeCapability) {
-    using Kernel = decltype(_k);
-    using scalar_t = typename Kernel::scalar_t;
-    (void)_k;
-
-    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
-
-    // TODO: Fuse this into a kernel?
-    // This is a bottleneck for smaller sequences (M <= 128)
-    auto delta = Kernel::kKernelComputesDelta
-        ? at::empty({B, nH, M}, query.options().dtype(at::ScalarType::Float))
-        : (grad_out.to(at::kFloat) * out.to(at::kFloat))
-              .sum(-1)
-              .transpose(-2, -1)
-              .contiguous();
-    TORCH_INTERNAL_ASSERT(delta.size(0) == B);
-    TORCH_INTERNAL_ASSERT(delta.size(1) == nH);
-    TORCH_INTERNAL_ASSERT(delta.size(2) == M);
-
-    typename Kernel::Params p;
-    p.query_ptr = (scalar_t*)query.data_ptr();
-    p.key_ptr = (scalar_t*)key.data_ptr();
-    p.value_ptr = (scalar_t*)value.data_ptr();
-    p.logsumexp_ptr = (typename Kernel::lse_scalar_t*)logsumexp.data_ptr();
-    p.output_ptr = (scalar_t*)out.data_ptr();
-    p.grad_output_ptr = (scalar_t*)grad_out.data_ptr();
-    p.grad_query_ptr = (scalar_t*)grad_q.data_ptr();
-    p.grad_key_ptr = (scalar_t*)grad_k.data_ptr();
-    p.grad_value_ptr = (scalar_t*)grad_v.data_ptr();
-    p.delta_ptr = (float*)delta.data_ptr();
-    p.head_dim = query.size(3);
-    p.head_dim_value = value.size(3);
-    p.num_queries = query.size(1);
-    p.num_keys = key.size(1);
-    p.num_batches = B;
-    p.num_heads = nH;
-    p.causal = causal;
-
-    ASSIGN_CHECK_OVERFLOW(p.gO_strideB, grad_out.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.gO_strideM, grad_out.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.gO_strideH, grad_out.stride(2));
-
-    ASSIGN_CHECK_OVERFLOW(p.o_strideB, out.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.o_strideH, out.stride(2));
-
-    ASSIGN_CHECK_OVERFLOW(p.gQ_strideB, grad_q.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.gK_strideB, grad_k.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.gV_strideB, grad_v.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.gQ_strideH, grad_q.stride(2));
-    ASSIGN_CHECK_OVERFLOW(p.gK_strideH, grad_k.stride(2));
-    ASSIGN_CHECK_OVERFLOW(p.gV_strideH, grad_v.stride(2));
-    p.gQKV_strideM_multiplier = grad_q.is_contiguous() ? 1 : 3;
-    TORCH_INTERNAL_ASSERT(p.gQ_strideM() == grad_q.stride(1));
-    TORCH_INTERNAL_ASSERT(p.gK_strideM() == grad_k.stride(1));
-    TORCH_INTERNAL_ASSERT(p.gV_strideM() == grad_v.stride(1));
-
-    ASSIGN_CHECK_OVERFLOW(p.q_strideB, query.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.k_strideB, key.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.v_strideB, value.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.q_strideM, query.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.k_strideM, key.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.v_strideM, value.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.q_strideH, query.stride(2));
-    ASSIGN_CHECK_OVERFLOW(p.k_strideH, key.stride(2));
-    ASSIGN_CHECK_OVERFLOW(p.v_strideH, value.stride(2));
-
-    Kernel::check_supported(p);
-
-    constexpr auto kernel_fn = attention_kernel_backward_batched<Kernel>;
-
-    if (smem_bytes > 0xc000) {
-      TORCH_INTERNAL_ASSERT(
-          computeCapability >= 70,
-          "This kernel requires too much shared memory on this machine!");
-      cudaFuncSetAttribute(
-          kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
-    }
-
-    // second syntax resulted in the error below on windows
-    // error C3495: 'kernel_fn': a simple capture must be a variable
-    // with automatic storage duration declared
-    // in the reaching scope of the lambda
-#ifdef _WIN32
-    cudaFuncAttributes attr;
-    AT_CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel_fn));
-    TORCH_INTERNAL_ASSERT(
-        attr.binaryVersion >= Kernel::ArchTag::kMinComputeCapability,
-        "Something went wrong in the build process");
-#else
-    auto checkBinaryArchMatches = [&]() {
-      cudaFuncAttributes attr;
-      AT_CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel_fn));
-      return attr.binaryVersion >= Kernel::ArchTag::kMinComputeCapability;
-    };
-    TORCH_INTERNAL_ASSERT(
-        checkBinaryArchMatches(), "Something went wrong in the build process");
-#endif
-
-    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
-  };
-
-  DISPATCH_KERNEL(
-      query, key, value, ([&] { launchKernel(Kernel{}, computeCapability); }));
-  AT_CUDA_CHECK(cudaGetLastError());
-  return std::make_tuple(grad_q, grad_k, grad_v);
-  #endif
-  TORCH_CHECK(false, "USE_FLASH_ATTENTION was not enabled for build.")
-  return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
-}
-
-} // namespace native
-} // namespace at
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
index 6c86e1ff63b01..a8d6110e951d9 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
@@ -29,7 +29,6 @@
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
 #include <ATen/NativeFunctions.h>
 
 #include <ATen/native/transformers/cuda/flash_attn/fmha.h>
@@ -186,9 +185,6 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
     int max_seqlen_q = ((max_seqlen_q_ + 16 - 1) / 16) * 16;
     bool loop = max_seqlen_k > blocksize_c;
 
-    // Otherwise the kernel will be launched from cuda:0 device
-    at::cuda::CUDAGuard device_guard{q.get_device()};
-
     auto opts = q.options();
 
     auto o = at::empty({ total_q, num_heads, head_size }, opts);
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu
new file mode 100644
index 0000000000000..07c14ad8195dd
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu
@@ -0,0 +1,166 @@
+#include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
+
+#define DISPATCH_MAXK(func)                                   \
+  {                                                           \
+    const auto maxK = std::max(query.size(2), value.size(2)); \
+    if (maxK <= 64) {                                         \
+      constexpr int kMaxK = 64;                               \
+      func();                                                 \
+    } else if (maxK <= 128) {                                 \
+      constexpr int kMaxK = 128;                              \
+      func();                                                 \
+    } else {                                                  \
+      constexpr int kMaxK = std::numeric_limits<int>::max();  \
+      func();                                                 \
+    }                                                         \
+  }
+
+#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC)                               \
+  {                                                                            \
+    cudaDeviceProp* properties =                                               \
+        at::cuda::getDeviceProperties(QUERY.device().index());                 \
+    const int computeCapability = properties->major * 10 + properties->minor;  \
+    DISPATCH_MAXK(([&] {                                                       \
+      DISPATCH_TYPES(                                                          \
+          QUERY, ([&]() {                                                      \
+            DISPATCH_ARCHTAG(                                                  \
+                computeCapability, ([&]() {                                    \
+                  using AlignedAK =                                            \
+                      AttentionBackwardKernel<ArchTag, scalar_t, true, kMaxK>; \
+                  bool isAligned =                                             \
+                      (QUERY.stride(1) % AlignedAK::kOptimalAlignement == 0 && \
+                       KEY.stride(1) % AlignedAK::kOptimalAlignement == 0 &&   \
+                       VALUE.stride(1) % AlignedAK::kOptimalAlignement == 0);  \
+                  DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {                \
+                                  using Kernel = AttentionBackwardKernel<      \
+                                      ArchTag,                                 \
+                                      scalar_t,                                \
+                                      kIsAligned,                              \
+                                      kMaxK>;                                  \
+                                  FUNC();                                      \
+                                }))                                            \
+                }))                                                            \
+          }))                                                                  \
+    }));                                                                       \
+  }
+
+namespace {
+std::tuple<at::Tensor, at::Tensor, at::Tensor>
+mem_efficient_attention_backward_cutlass(
+    const at::Tensor& grad_out_,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& logsumexp,
+    const at::Tensor& out,
+    bool causal) {
+  TORCH_CHECK(query.dim() == grad_out_.dim());
+  TORCH_CHECK(query.dim() == key.dim());
+  TORCH_CHECK(query.dim() == 3);
+
+  TORCH_CHECK(query.size(0) == grad_out_.size(0));
+  TORCH_CHECK(query.size(1) == grad_out_.size(1));
+  TORCH_CHECK(value.size(2) == grad_out_.size(2));
+
+  TORCH_CHECK(query.size(2) == key.size(2));
+  TORCH_CHECK(query.size(0) == key.size(0));
+
+  TORCH_CHECK(query.size(0) == value.size(0));
+  TORCH_CHECK(key.size(1) == value.size(1));
+
+  // handle potentially non-contiguous grad_out through a copy
+  auto grad_out = grad_out_.contiguous();
+
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(value);
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
+
+  at::cuda::CUDAGuard device_guard(query.device());
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t K = query.size(2);
+
+  // It does not make sense to use that in practice,
+  // but let's still make sure we are correct
+  // As we iterate through keys first, we skip
+  // keys with no query associated, so they are not
+  // initialized
+  bool grad_kv_needs_init = causal && N > M;
+  at::Tensor grad_q = at::empty_like(query);
+  at::Tensor grad_k =
+      grad_kv_needs_init ? at::zeros_like(key) : at::empty_like(key);
+  at::Tensor grad_v =
+      grad_kv_needs_init ? at::zeros_like(value) : at::empty_like(value);
+
+  auto launchKernel = [&](auto _k, int computeCapability) {
+    using Kernel = decltype(_k);
+    using scalar_t = typename Kernel::scalar_t;
+    (void)_k;
+
+    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
+
+    // TODO: Fuse this into a kernel?
+    // This is a bottleneck for smaller sequences (M <= 128)
+    auto delta = Kernel::kKernelComputesDelta
+        ? at::empty({B, M}, query.options().dtype(at::ScalarType::Float))
+        : (grad_out.to(at::kFloat) * out.to(at::kFloat)).sum(-1);
+    TORCH_INTERNAL_ASSERT(delta.size(0) == B);
+    TORCH_INTERNAL_ASSERT(delta.size(1) == M);
+
+    typename Kernel::Params params;
+    params.query_ptr = (scalar_t*)query.data_ptr();
+    params.key_ptr = (scalar_t*)key.data_ptr();
+    params.value_ptr = (scalar_t*)value.data_ptr();
+    params.logsumexp_ptr = (typename Kernel::lse_scalar_t*)logsumexp.data_ptr();
+    params.output_ptr = (scalar_t*)out.data_ptr();
+    params.grad_output_ptr = (scalar_t*)grad_out.data_ptr();
+    params.grad_query_ptr = (scalar_t*)grad_q.data_ptr();
+    params.grad_key_ptr = (scalar_t*)grad_k.data_ptr();
+    params.grad_value_ptr = (scalar_t*)grad_v.data_ptr();
+    params.delta_ptr = (float*)delta.data_ptr();
+    params.head_dim = query.size(2);
+    params.head_dim_value = value.size(2);
+    params.num_queries = query.size(1);
+    params.num_keys = key.size(1);
+    params.num_batches = B;
+    params.causal = causal;
+    Kernel::check_supported(params);
+
+    constexpr auto kernel_fn = attention_kernel_backward_batched<Kernel>;
+
+    if (smem_bytes > 0xc000) {
+      TORCH_INTERNAL_ASSERT(
+          computeCapability >= 70,
+          "This kernel requires too much shared memory on this machine!");
+      cudaFuncSetAttribute(
+          kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
+    }
+
+    auto checkBinaryArchMatches = [&]() {
+      cudaFuncAttributes attr;
+      AT_CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel_fn));
+      return attr.binaryVersion >= Kernel::ArchTag::kMinComputeCapability;
+    };
+    TORCH_INTERNAL_ASSERT(
+        checkBinaryArchMatches(), "Something went wrong in the build process");
+
+    kernel_fn<<<params.getBlocksGrid(), params.getThreadsGrid(), smem_bytes>>>(
+        params);
+  };
+
+  DISPATCH_KERNEL(
+      query, key, value, ([&] { launchKernel(Kernel{}, computeCapability); }));
+  AT_CUDA_CHECK(cudaGetLastError());
+  return std::make_tuple(grad_q, grad_k, grad_v);
+} // namespace
+
+} // namespace
+
+// TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+//   m.impl(
+//       TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_cutlass"),
+//       TORCH_FN(mem_efficient_attention_backward_cutlass));
+// }
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu
new file mode 100644
index 0000000000000..59b3637c8a438
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu
@@ -0,0 +1,232 @@
+#include <ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h>
+
+
+#define DISPATCH_BLOCKSIZE(VALUE_HEAD_DIM, FN)        \
+  {                                                   \
+    if (VALUE_HEAD_DIM <= 64) {                       \
+      constexpr bool kIs64x64 = true;                 \
+      constexpr bool kSingleValueIteration = true;    \
+      FN();                                           \
+    } else {                                          \
+      constexpr bool kIs64x64 = false;                \
+      if (VALUE_HEAD_DIM <= 128) {                    \
+        constexpr bool kSingleValueIteration = true;  \
+        FN();                                         \
+      } else {                                        \
+        constexpr bool kSingleValueIteration = false; \
+        FN();                                         \
+      }                                               \
+    }                                                 \
+  }
+
+#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC)                              \
+  {                                                                           \
+    cudaDeviceProp* properties =                                              \
+        at::cuda::getDeviceProperties(QUERY.device().index());                \
+    const int computeCapability = properties->major * 10 + properties->minor; \
+    DISPATCH_BLOCKSIZE(                                                       \
+        VALUE.size(-1), ([&]() {                                              \
+          static constexpr int64_t kQueriesPerBlock = kIs64x64 ? 64 : 32;     \
+          static constexpr int64_t kKeysPerBlock = kIs64x64 ? 64 : 128;       \
+          DISPATCH_TYPES(                                                     \
+              QUERY, ([&]() {                                                 \
+                DISPATCH_ARCHTAG(                                             \
+                    computeCapability, ([&]() {                               \
+                      using AlignedAK = AttentionKernel<                      \
+                          scalar_t,                                           \
+                          ArchTag,                                            \
+                          true,                                               \
+                          kQueriesPerBlock,                                   \
+                          kKeysPerBlock,                                      \
+                          kSingleValueIteration>;                             \
+                      /* Run a more efficient kernel (with `isAligned=True`)  \
+                      if memory is correctly aligned*/                        \
+                      bool isAligned =                                        \
+                          (QUERY.stride(2) % AlignedAK::kAlignmentQ == 0 &&   \
+                           KEY.stride(2) % AlignedAK::kAlignmentK == 0 &&     \
+                           VALUE.stride(2) % AlignedAK::kAlignmentV == 0);    \
+                      /* TODO: Should we warn or log somewhere when we use a  \
+                      less efficient kernel due to wrong alignment? */        \
+                      DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {           \
+                                      using Kernel = AttentionKernel<         \
+                                          scalar_t,                           \
+                                          ArchTag,                            \
+                                          kIsAligned,                         \
+                                          kQueriesPerBlock,                   \
+                                          kKeysPerBlock,                      \
+                                          kSingleValueIteration>;             \
+                                      FUNC();                                 \
+                                    }))                                       \
+                    }))                                                       \
+              }));                                                            \
+        }));                                                                  \
+  }
+
+namespace {
+/*
+  There are 2 modes for using this function.
+  (Mode BMHK) With all the heads having the same seqlen
+  (Mode 1MHK) `batch=1` with all tokens across batches concatenated
+*/
+std::tuple<at::Tensor, at::Tensor> efficient_attention_forward_cutlass(
+    const at::Tensor& query, // [b, seqlen, num_heads, K]
+    const at::Tensor& key, // [b, seqlen, num_heads, K]
+    const at::Tensor& value, // [b, seqlen, num_heads, Kv]
+    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
+    // position of the first query token for batch $b
+    const c10::optional<at::Tensor>& cu_seqlens_q,
+    // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
+    // position of the first key token for batch $b
+    const c10::optional<at::Tensor>& cu_seqlens_k,
+    // (Mode 1MHK only) Maximum sequence length across batches
+    const c10::optional<int64_t> max_seqlen_q_,
+    bool compute_logsumexp,
+    bool causal) {
+  TORCH_CHECK(query.dim() == 4);
+  TORCH_CHECK(key.dim() == 4);
+  TORCH_CHECK(value.dim() == 4);
+
+  // Batch sizes
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // Sequence length
+  TORCH_CHECK(key.size(1) == value.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) == key.size(2));
+  TORCH_CHECK(query.size(2) == value.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+
+  int64_t max_seqlen_q, max_seqlen_k;
+  TORCH_CHECK(cu_seqlens_q.has_value() == cu_seqlens_k.has_value());
+  if (cu_seqlens_q.has_value()) {
+    TORCH_CHECK(cu_seqlens_q->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(cu_seqlens_k->scalar_type() == at::ScalarType::Int);
+    TORCH_CHECK(cu_seqlens_q->dim() == 1 && cu_seqlens_k->dim() == 1);
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*cu_seqlens_q));
+    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*cu_seqlens_k));
+    TORCH_CHECK(cu_seqlens_q->size(0) == cu_seqlens_k->size(0));
+    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
+    TORCH_CHECK(max_seqlen_q_.has_value());
+    max_seqlen_q = *max_seqlen_q_;
+    max_seqlen_k = 0; // Will be set inside the kernel
+  } else {
+    max_seqlen_q = query.size(1);
+    max_seqlen_k = key.size(1);
+  }
+
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  at::cuda::CUDAGuard device_guard(query.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t num_heads = query.size(-2);
+  int64_t K = query.size(-1);
+  int64_t Kv = value.size(-1);
+
+  at::Tensor res;
+  at::Tensor logsumexp;
+
+  auto launchKernel = [&](auto _k, int computeCapability) {
+    using Kernel = decltype(_k);
+    using scalar_t = typename Kernel::scalar_t;
+    (void)_k;
+
+    res = at::empty(
+        {B, M, num_heads, Kv},
+        query.options().dtype(
+            TypeTraits<typename Kernel::output_t>::atScalarType()));
+
+    // NOTE: Should be aligned (by padding) in case M is
+    // not a good number for loading during backward
+    constexpr decltype(M) kAlignLSE = Kernel::kAlignLSE;
+    logsumexp = at::empty(
+        {B,
+         num_heads,
+         compute_logsumexp ? ceil_div(max_seqlen_q, kAlignLSE) * kAlignLSE : 0},
+        query.options().dtype(at::ScalarType::Float));
+
+    typename Kernel::Params p;
+    p.query_ptr = (scalar_t*)query.data_ptr();
+    p.key_ptr = (scalar_t*)key.data_ptr();
+    p.value_ptr = (scalar_t*)value.data_ptr();
+    p.logsumexp_ptr = compute_logsumexp
+        ? (typename Kernel::lse_scalar_t*)logsumexp.data_ptr()
+        : nullptr;
+    at::Tensor output_accum;
+    if (Kernel::kNeedsOutputAccumulatorBuffer) {
+      output_accum = at::empty(
+          {B, M, num_heads, Kv},
+          query.options().dtype(
+              TypeTraits<typename Kernel::output_accum_t>::atScalarType()));
+      p.output_accum_ptr =
+          (typename Kernel::output_accum_t*)output_accum.data_ptr();
+    } else {
+      p.output_accum_ptr = nullptr;
+    }
+    p.output_ptr = (typename Kernel::output_t*)res.data_ptr();
+
+    if (cu_seqlens_q.has_value()) {
+      p.cu_seqlens_q_ptr = (int32_t*)cu_seqlens_q->data_ptr();
+      p.cu_seqlens_k_ptr = (int32_t*)cu_seqlens_k->data_ptr();
+    }
+
+#define ASSIGN_CHECK_OVERFLOW(A, B)                                            \
+  {                                                                            \
+    A = B;                                                                     \
+    TORCH_CHECK(B < std::numeric_limits<decltype(A)>::max(), #B " overflows"); \
+  }
+
+    p.num_heads = num_heads;
+    p.head_dim = query.size(3);
+    p.head_dim_value = value.size(3);
+    p.num_queries = max_seqlen_q;
+    p.num_keys = max_seqlen_k;
+    p.num_batches = cu_seqlens_q.has_value() ? cu_seqlens_q->size(0) - 1 : B;
+    p.causal = causal;
+
+    ASSIGN_CHECK_OVERFLOW(p.q_strideB, query.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.k_strideB, key.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.v_strideB, value.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.q_strideM, query.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.k_strideM, key.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.v_strideM, value.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.q_strideH, query.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.k_strideH, key.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.v_strideH, value.stride(2));
+
+    constexpr auto kernel_fn = attention_kernel_batched<Kernel>;
+    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
+    if (smem_bytes > 0xc000) {
+      TORCH_INTERNAL_ASSERT(
+          computeCapability >= 70,
+          "This kernel requires too much shared memory on this machine!");
+      AT_CUDA_CHECK(cudaFuncSetAttribute(
+          kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
+    }
+    Kernel::check_supported(p);
+    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
+  };
+  // Dispatch to the right kernel
+  DISPATCH_KERNEL(query, key, value, ([&]() {
+                    launchKernel(Kernel{}, computeCapability);
+                  }));
+
+  AT_CUDA_CHECK(cudaGetLastError());
+  return std::make_tuple(res, logsumexp);
+}
+} // namespace
+
+// TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
+//   m.impl(
+//       TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_cutlass"),
+//       TORCH_FN(efficient_attention_forward_cutlass));
+// }
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h
index b0e7106f3cfc8..399593fd09573 100644
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h
@@ -1,16 +1,15 @@
 /*! \file
     \brief Cutlass provides helper template functions to figure out the right
-   datastructures to instantiate to run a GEMM with various parameters (see
+   datastructures to instanciate to run a GEMM with various parameters (see
    `cutlass/gemm/threadblock/default_mma.h`). However, due to template
-   instantiation priority rules, it will only create an MmaMultiStage with
+   instanciation priority rules, it will only create an MmaMultiStage with
    kStages=3 (otherwise creates an MmePipelined - which is not compatible with
    FastF32). kStages=3 uses too much shared memory and we want to use kStages=2,
    so we just copy-pasted some code from `default_mma.h` and
-   `default_mma_core.h` files and wrapped this template to allow our use case.
+   `default_mma_core.h` files and wrapped this template to allow our usecase.
 
     This is really only for the FastF32 case - aka using TensorCores with fp32.
 */
-#pragma once
 
 #include <cutlass/gemm/threadblock/default_mma.h>
 #include <cutlass/gemm/threadblock/default_mma_core_simt.h>
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
index c9652c40d38e4..e25701a7588ac 100644
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
@@ -1,5 +1,7 @@
 #pragma once
+
 #include <ATen/ATen.h>
+#include <torch/library.h>
 #include <cmath>
 #include <vector>
 
@@ -73,113 +75,46 @@ struct AttentionBackwardKernel {
 
   struct Params {
     // Input tensors
-    scalar_t* query_ptr; // [Mq, nH, K]
-    scalar_t* key_ptr; // [Mk, nH, K]
-    scalar_t* value_ptr; // [Mk, nH, Kv]
-    lse_scalar_t* logsumexp_ptr; // [nH, Mq]
-    scalar_t* output_ptr; // [Mq, nH, Kv]
-    scalar_t* grad_output_ptr; // [Mq, nH, Kv]
-    accum_t* delta_ptr; // [Mq, nH]
+    scalar_t* query_ptr; // [num_queries, head_dim]
+    scalar_t* key_ptr; // [num_keys, head_dim]
+    scalar_t* value_ptr; // [num_keys, head_dim_value]
+    lse_scalar_t* logsumexp_ptr; // [num_queries]
+    scalar_t* output_ptr; // [num_queries, head_dim_value]
+    scalar_t* grad_output_ptr; // [num_queries, head_dim_value]
+    accum_t* delta_ptr; // [num_queries]
 
     // Output tensors
-    output_t* grad_query_ptr; //  [Mq, nH, K]
-    output_t* grad_key_ptr; //    [Mk, nH, K]
-    output_t* grad_value_ptr; //  [Mk, nH, Kv]
+    scalar_t* grad_query_ptr; // [num_queries, head_dim]
+    scalar_t* grad_key_ptr; // [num_keys, head_dim]
+    scalar_t* grad_value_ptr; // [num_keys, head_dim_value]
 
     // Dimensions/strides
     int32_t head_dim;
     int32_t head_dim_value;
     int32_t num_queries;
     int32_t num_keys;
-    int32_t num_heads;
-    bool causal;
-
-    int32_t q_strideM;
-    int32_t k_strideM;
-    int32_t v_strideM;
-    int32_t gO_strideM;
-    int8_t gQKV_strideM_multiplier; // 3 for packed, 1 otherwise
-
-    CUTLASS_HOST_DEVICE int32_t o_strideM() const {
-      return head_dim_value * num_heads;
-    }
-    CUTLASS_HOST_DEVICE int32_t gQ_strideM() const {
-      return gQKV_strideM_multiplier * num_heads * head_dim;
-    }
-    CUTLASS_HOST_DEVICE int32_t gK_strideM() const {
-      return gQKV_strideM_multiplier * num_heads * head_dim;
-    }
-    CUTLASS_HOST_DEVICE int32_t gV_strideM() const {
-      return gQKV_strideM_multiplier * num_heads * head_dim_value;
-    }
-
-    // Everything below is only used in `advance_to_block`
-    // and shouldn't use registers
-    int64_t o_strideH;
-    int32_t q_strideH;
-    int32_t k_strideH;
-    int32_t v_strideH;
-    int64_t o_strideB;
-    int64_t q_strideB;
-    int64_t k_strideB;
-    int64_t v_strideB;
     int32_t num_batches;
+    bool causal;
 
-    int64_t gO_strideB;
-    int64_t gQ_strideB;
-    int64_t gK_strideB;
-    int64_t gV_strideB;
-    int64_t gO_strideH;
-    int64_t gQ_strideH;
-    int64_t gK_strideH;
-    int64_t gV_strideH;
-
-    CUTLASS_DEVICE void advance_to_block() {
+    __device__ void advance_batches(int32_t batch_id) {
       constexpr int32_t kAlignLSE = 32; // block size of backward
       auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE;
 
-      int32_t batch_id = blockIdx.z;
-      int32_t head_id = blockIdx.y;
-
-      query_ptr += batch_id * q_strideB + head_id * q_strideH;
-      key_ptr += batch_id * k_strideB + head_id * k_strideH;
-      value_ptr += batch_id * v_strideB + head_id * v_strideH;
-      logsumexp_ptr += (batch_id * num_heads + head_id) * lse_dim;
-      output_ptr += batch_id * o_strideB + head_id * o_strideH;
-      grad_output_ptr += batch_id * gO_strideB + head_id * gO_strideH;
-      delta_ptr += (batch_id * num_heads + head_id) * num_queries;
-
-      grad_query_ptr += batch_id * gQ_strideB + head_id * gQ_strideH;
-      grad_key_ptr += batch_id * gK_strideB + head_id * gK_strideH;
-      grad_value_ptr += batch_id * gV_strideB + head_id * gV_strideH;
-
-      head_dim = warp_uniform(head_dim);
-      head_dim_value = warp_uniform(head_dim_value);
-      num_queries = warp_uniform(num_queries);
-      num_keys = warp_uniform(num_keys);
-      num_heads = warp_uniform(num_heads);
-
-      gO_strideM = warp_uniform(gO_strideM);
-      gQKV_strideM_multiplier = warp_uniform(gQKV_strideM_multiplier);
-      q_strideM = warp_uniform(q_strideM);
-      k_strideM = warp_uniform(k_strideM);
-      v_strideM = warp_uniform(v_strideM);
-
-      query_ptr = warp_uniform(query_ptr);
-      key_ptr = warp_uniform(key_ptr);
-      value_ptr = warp_uniform(value_ptr);
-      logsumexp_ptr = warp_uniform(logsumexp_ptr);
-      output_ptr = warp_uniform(output_ptr);
-      grad_output_ptr = warp_uniform(grad_output_ptr);
-      delta_ptr = warp_uniform(delta_ptr);
-
-      grad_query_ptr = warp_uniform(grad_query_ptr);
-      grad_key_ptr = warp_uniform(grad_key_ptr);
-      grad_value_ptr = warp_uniform(grad_value_ptr);
+      query_ptr += batch_id * head_dim * num_queries;
+      key_ptr += batch_id * head_dim * num_keys;
+      value_ptr += batch_id * head_dim_value * num_keys;
+      logsumexp_ptr += batch_id * lse_dim;
+      output_ptr += batch_id * head_dim_value * num_queries;
+      grad_output_ptr += batch_id * head_dim_value * num_queries;
+      delta_ptr += batch_id * num_queries;
+
+      grad_query_ptr += batch_id * head_dim * num_queries;
+      grad_key_ptr += batch_id * head_dim * num_keys;
+      grad_value_ptr += batch_id * head_dim_value * num_keys;
     }
 
     __host__ dim3 getBlocksGrid() const {
-      return dim3(1, num_heads, num_batches);
+      return dim3(1, 1, num_batches);
     }
     __host__ dim3 getThreadsGrid() const {
       return dim3(kWarpSize, kNumWarpsPerBlock, 1);
@@ -244,6 +179,7 @@ struct AttentionBackwardKernel {
     attn_T = k_j @ q_i.transpose(-2, -1) # matmul
     attn_T = (attn_T - logsumexp[i_start:i_end].unsqueeze(1).transpose(-2,
     -1)).exp() # epilogue
+
     with attn_T.shape = (kBlockSizeJ, kBlockSizeI)
     */
     using ThreadblockShape =
@@ -289,6 +225,7 @@ struct AttentionBackwardKernel {
   struct MatmulGradV {
     /*
     grad_v[j_start:j_end] += attn_T @ do_i # matmul
+
     Dimensions: (kBlockSizeJ * kNumWarpsPerBlock, kBlockSizeI, K)
     (we might need to iterate multiple times on K)
     */
@@ -664,7 +601,7 @@ struct AttentionBackwardKernel {
     typename MatmulGradV::Mma::FragmentC gradV;
     typename MatmulGradK::Mma::FragmentC gradK;
 
-    CUTLASS_DEVICE void clear() {
+    __device__ __forceinline__ void clear() {
       gradV.clear();
       gradK.clear();
     }
@@ -677,14 +614,14 @@ struct AttentionBackwardKernel {
     CHECK_ALIGNED_PTR(p.output_ptr, kMinimumAlignment);
     CHECK_ALIGNED_PTR(p.grad_output_ptr, kMinimumAlignment);
     TORCH_CHECK(
-        p.q_strideH % kMinimumAlignment == 0, "query is not correctly aligned");
+        p.head_dim % kMinimumAlignment == 0,
+        "query/key is not correctly aligned");
     TORCH_CHECK(
-        p.k_strideH % kMinimumAlignment == 0, "key is not correctly aligned");
-    TORCH_CHECK(
-        p.v_strideH % kMinimumAlignment == 0, "value is not correctly aligned");
+        p.head_dim_value % kMinimumAlignment == 0,
+        "value is not correctly aligned");
   }
 
-  static CUTLASS_DEVICE void kernel(Params& p_) {
+  static __device__ void kernel(Params& p_) {
     // Hint to nvcc to store points & tensor shapes in registers
     // as we use them a lot
     register const Params p = p_;
@@ -721,7 +658,7 @@ struct AttentionBackwardKernel {
       __syncthreads();
     }
 
-    OutputFragments register output_frags;
+    OutputFragments output_frags;
     int32_t key_start = 0;
     int32_t key_end = p.num_keys / kBlockSizeJ * kBlockSizeJ;
     for (; key_start < key_end; key_start += kBlockSizeJ) {
@@ -758,7 +695,7 @@ struct AttentionBackwardKernel {
     }
   }
 
-  static CUTLASS_DEVICE void loadDi(
+  static __device__ __forceinline__ void loadDi(
       cutlass::Array<accum_t, kBlockSizeI>& di,
       Params const& p,
       int32_t query_start) {
@@ -773,7 +710,7 @@ struct AttentionBackwardKernel {
   }
 
   template <bool skipBoundsChecks>
-  static CUTLASS_DEVICE void processBlockIJ(
+  static __device__ __forceinline__ void processBlockIJ(
       SharedStorage& shared_storage,
       OutputFragments& output_frags,
       Params const& p,
@@ -781,9 +718,9 @@ struct AttentionBackwardKernel {
       int32_t key_start) {
     cutlass::MatrixCoord no_offset{0, 0};
     accum_t scale = accum_t(1.0 / std::sqrt(float(p.head_dim)));
-    int16_t thread_id = threadIdx.x + threadIdx.y * blockDim.x;
-    int8_t warp_id = warp_uniform(threadIdx.y);
-    int8_t lane_id = threadIdx.x;
+    int32_t thread_id = threadIdx.x + threadIdx.y * blockDim.x;
+    int32_t warp_id = threadIdx.y;
+    int32_t lane_id = threadIdx.x;
     __syncthreads();
     loadDi(shared_storage.di(), p, query_start);
 
@@ -797,8 +734,8 @@ struct AttentionBackwardKernel {
 
     auto prologueGradV = [&](int col) {
       typename MatmulGradV::Mma::IteratorB iterator_dO(
-          {int32_t(p.gO_strideM)},
-          p.grad_output_ptr + query_start * p.gO_strideM + col,
+          {int32_t(p.head_dim_value)},
+          p.grad_output_ptr + query_start * p.head_dim_value + col,
           {num_queries_in_block, p.head_dim_value - col},
           thread_id,
           no_offset);
@@ -810,8 +747,8 @@ struct AttentionBackwardKernel {
     };
     auto prologueGradQ = [&](int col) {
       typename MatmulGradQ::Mma::IteratorB iterator_K(
-          {int32_t(p.k_strideM)},
-          p.key_ptr + key_start * p.k_strideM + col,
+          {int32_t(p.head_dim)},
+          p.key_ptr + key_start * p.head_dim + col,
           {num_keys_in_block, p.head_dim - col},
           thread_id,
           no_offset);
@@ -820,8 +757,8 @@ struct AttentionBackwardKernel {
     };
     auto prologueGradK = [&](int col) {
       typename MatmulGradK::Mma::IteratorB iterator_Q(
-          {int32_t(p.q_strideM)},
-          p.query_ptr + query_start * p.q_strideM + col,
+          {int32_t(p.head_dim)},
+          p.query_ptr + query_start * p.head_dim + col,
           {num_queries_in_block, p.head_dim - col},
           thread_id,
           no_offset);
@@ -833,14 +770,14 @@ struct AttentionBackwardKernel {
     };
     auto prologueDOV = [&]() {
       typename MatmulDOIVJ::Mma::IteratorA iterator_A(
-          {int32_t(p.gO_strideM)},
-          p.grad_output_ptr + query_start * p.gO_strideM,
+          {int32_t(p.head_dim_value)},
+          p.grad_output_ptr + query_start * p.head_dim_value,
           {num_queries_in_block, p.head_dim_value},
           thread_id,
           no_offset);
       typename MatmulDOIVJ::Mma::IteratorB iterator_B(
-          {int32_t(p.v_strideM)},
-          p.value_ptr + key_start * p.v_strideM,
+          {int32_t(p.head_dim_value)},
+          p.value_ptr + key_start * p.head_dim_value,
           {p.head_dim_value, num_keys_in_block},
           thread_id,
           no_offset);
@@ -866,16 +803,16 @@ struct AttentionBackwardKernel {
 
       // k_j
       typename Mma::IteratorA iterator_A(
-          {int32_t(p.k_strideM)},
-          p.key_ptr + key_start * p.k_strideM,
+          {int32_t(p.head_dim)},
+          p.key_ptr + key_start * p.head_dim,
           {problem_size.m(), problem_size.k()},
           thread_id,
           no_offset);
 
       // q_i.transpose(-2, -1)
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.q_strideM)},
-          p.query_ptr + query_start * p.q_strideM,
+          {int32_t(p.head_dim)},
+          p.query_ptr + query_start * p.head_dim,
           {problem_size.k(), problem_size.n()},
           thread_id,
           no_offset);
@@ -956,14 +893,14 @@ struct AttentionBackwardKernel {
           num_keys_in_block, p.head_dim_value - col, num_queries_in_block);
       auto createEpilogueIter = [&]() {
         return typename MatmulGradV::OutputTileIterator(
-            typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()},
-            p.grad_value_ptr + key_start * p.gV_strideM() + col,
+            typename MatmulGradV::OutputTileIterator::Params{p.head_dim_value},
+            p.grad_value_ptr + key_start * p.head_dim_value + col,
             {num_keys_in_block, p.head_dim_value - col},
             thread_id);
       };
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.gO_strideM)},
-          p.grad_output_ptr + query_start * p.gO_strideM + col,
+          {int32_t(p.head_dim_value)},
+          p.grad_output_ptr + query_start * p.head_dim_value + col,
           {num_queries_in_block, p.head_dim_value - col},
           thread_id,
           no_offset);
@@ -1014,16 +951,16 @@ struct AttentionBackwardKernel {
       using Mma = typename MatmulDOIVJ::Mma;
       // do_i
       typename Mma::IteratorA iterator_A(
-          {int32_t(p.gO_strideM)},
-          p.grad_output_ptr + query_start * p.gO_strideM,
+          {int32_t(p.head_dim_value)},
+          p.grad_output_ptr + query_start * p.head_dim_value,
           {num_queries_in_block, p.head_dim_value},
           thread_id,
           no_offset);
 
       // v_j.transpose(-2, -1)
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.v_strideM)},
-          p.value_ptr + key_start * p.v_strideM,
+          {int32_t(p.head_dim_value)},
+          p.value_ptr + key_start * p.head_dim_value,
           {p.head_dim_value, num_keys_in_block},
           thread_id,
           no_offset);
@@ -1120,16 +1057,16 @@ struct AttentionBackwardKernel {
           num_keys_in_block);
       auto createEpilogueIter = [&]() {
         return typename MatmulGradQ::OutputTileIterator(
-            typename MatmulGradQ::OutputTileIterator::Params{p.gQ_strideM()},
-            p.grad_query_ptr + query_start * p.gQ_strideM() + col,
+            typename MatmulGradQ::OutputTileIterator::Params{p.head_dim},
+            p.grad_query_ptr + query_start * p.head_dim + col,
             {problem_size.m(), problem_size.n()},
             thread_id);
       };
 
       // k_j
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.k_strideM)},
-          p.key_ptr + key_start * p.k_strideM + col,
+          {int32_t(p.head_dim)},
+          p.key_ptr + key_start * p.head_dim + col,
           {problem_size.k(), problem_size.n()},
           thread_id,
           no_offset);
@@ -1216,8 +1153,8 @@ struct AttentionBackwardKernel {
           num_queries_in_block);
       auto createEpilogueIter = [&]() {
         return typename MatmulGradK::OutputTileIterator(
-            typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()},
-            p.grad_key_ptr + key_start * p.gK_strideM() + col,
+            typename MatmulGradK::OutputTileIterator::Params{p.head_dim},
+            p.grad_key_ptr + key_start * p.head_dim + col,
             {num_keys_in_block,
              false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col},
             thread_id);
@@ -1225,8 +1162,8 @@ struct AttentionBackwardKernel {
 
       // q_i
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.q_strideM)},
-          p.query_ptr + query_start * p.q_strideM + col,
+          {int32_t(p.head_dim)},
+          p.query_ptr + query_start * p.head_dim + col,
           {problem_size.k(), problem_size.n()},
           thread_id,
           no_offset);
@@ -1299,15 +1236,15 @@ struct AttentionBackwardKernel {
         kForceReloadK || !MatmulQK::Mma::kSmemContainsEntireMat;
     auto thread_id = get_thread_id();
     typename MatmulQK::Mma::IteratorA iterator_A(
-        {int32_t(p.k_strideM)},
-        p.key_ptr + key_start * p.k_strideM,
+        {int32_t(p.head_dim)},
+        p.key_ptr + key_start * p.head_dim,
         {p.num_keys - key_start, p.head_dim},
         thread_id,
         cutlass::MatrixCoord{0, 0});
 
     typename MatmulQK::Mma::IteratorB iterator_B(
-        {int32_t(p.q_strideM)},
-        p.query_ptr + query_start * p.q_strideM,
+        {int32_t(p.head_dim)},
+        p.query_ptr + query_start * p.head_dim,
         {p.head_dim, p.num_queries - query_start},
         thread_id,
         cutlass::MatrixCoord{0, 0});
@@ -1322,7 +1259,7 @@ struct AttentionBackwardKernel {
   }
 
   template <bool skipBoundsChecks>
-  static CUTLASS_DEVICE void writeFragsToGmem(
+  static __device__ __forceinline__ void writeFragsToGmem(
       SharedStorage& shared_storage,
       OutputFragments& output_frags,
       Params const& p,
@@ -1331,8 +1268,8 @@ struct AttentionBackwardKernel {
         ? MatmulQK::Mma::Shape::kM
         : std::min((int32_t)MatmulQK::Mma::Shape::kM, p.num_keys - key_start);
     typename MatmulGradV::OutputTileIterator outputV_it(
-        typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()},
-        p.grad_value_ptr + key_start * p.gV_strideM(),
+        typename MatmulGradV::OutputTileIterator::Params{p.head_dim_value},
+        p.grad_value_ptr + key_start * p.head_dim_value,
         {num_keys_in_block, p.head_dim_value},
         get_thread_id());
     accumulateInGmem<MatmulGradV>(
@@ -1342,8 +1279,8 @@ struct AttentionBackwardKernel {
         true);
 
     typename MatmulGradK::OutputTileIterator outputK_it(
-        typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()},
-        p.grad_key_ptr + key_start * p.gK_strideM(),
+        typename MatmulGradK::OutputTileIterator::Params{p.head_dim},
+        p.grad_key_ptr + key_start * p.head_dim,
         {num_keys_in_block,
          false ? MatmulGradK::ThreadblockShape::kN : p.head_dim},
         get_thread_id());
@@ -1355,7 +1292,7 @@ struct AttentionBackwardKernel {
   }
 
   template <typename MatmulT>
-  static CUTLASS_DEVICE void accumulateInGmem(
+  static __device__ __forceinline__ void accumulateInGmem(
       typename MatmulT::DefaultEpilogue::SharedStorage& epilogue_smem,
       typename MatmulT::Mma::FragmentC const& accum,
       typename MatmulT::OutputTileIterator output_it,
@@ -1397,9 +1334,7 @@ struct AttentionBackwardKernel {
   }
 
   template <int kElementsPerAccess>
-  static CUTLASS_DEVICE void computeDelta(
-      Params const& p,
-      int32_t query_start) {
+  static __device__ void computeDelta(Params const& p, int32_t query_start) {
     // Each thread computes one value for Delta
     // Depending on warp configuration, we might have multiple
     // threads of the same warp working on the same row
@@ -1414,15 +1349,13 @@ struct AttentionBackwardKernel {
     bool rowPred = (query_start + laneRow) < p.num_queries;
     bool pred = rowPred;
 
-    // on windows, previous syntax __restrict__ AccessType*
-    // resulted in error: "restrict" is not allowed
-    const AccessType* __restrict__ grad_output_ptr =
-        reinterpret_cast<const AccessType*>(
-            p.grad_output_ptr + (query_start + laneRow) * p.gO_strideM +
+    const __restrict__ AccessType* grad_output_ptr =
+        reinterpret_cast<const __restrict__ AccessType*>(
+            p.grad_output_ptr + (query_start + laneRow) * p.head_dim_value +
             laneFirstCol);
-    const AccessType* __restrict__ output_ptr =
-        reinterpret_cast<const AccessType*>(
-            p.output_ptr + (query_start + laneRow) * p.o_strideM() +
+    const __restrict__ AccessType* output_ptr =
+        reinterpret_cast<const __restrict__ AccessType*>(
+            p.output_ptr + (query_start + laneRow) * p.head_dim_value +
             laneFirstCol);
 
     static constexpr int64_t kMaxIters =
@@ -1497,13 +1430,13 @@ struct AttentionBackwardKernel {
     }
   }
 
-  static CUTLASS_DEVICE int8_t get_lane_id() {
+  static __device__ __forceinline__ int8_t get_lane_id() {
     return threadIdx.x;
   }
-  static CUTLASS_DEVICE int8_t get_warp_id() {
+  static __device__ __forceinline__ int8_t get_warp_id() {
     return threadIdx.y;
   }
-  static CUTLASS_DEVICE int16_t get_thread_id() {
+  static __device__ __forceinline__ int16_t get_thread_id() {
     return threadIdx.x + threadIdx.y * blockDim.x;
   }
 };
@@ -1524,7 +1457,8 @@ __global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
 #define INSTANTIATE_ATTENTION_KERNEL_BACKWARD(ARCH, ...)             \
   _ATTENTION_KERNEL_BACKWARD_BEGIN(                                  \
       AttentionBackwardKernel<cutlass::arch::Sm##ARCH, __VA_ARGS__>) \
-  p.advance_to_block();                                              \
+  auto batch_id = blockIdx.z;                                        \
+  p.advance_batches(batch_id);                                       \
   Kernel::kernel(p);                                                 \
   _ATTENTION_KERNEL_BACKWARD_END();
 
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index e9f3d5029aa86..564adb2d51ea8 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -62,15 +62,6 @@ inline bool check_for_attn_weights(sdp_params params, bool debug) {
   }
   return true;
 }
-
-inline bool check_for_non_zero_dropout(sdp_params params, bool debug) {
-  if (params.dropout != 0.0) {
-    TORCH_CHECK(!debug, "Mem_efficient does not support non_zero dropout. Dropout_p: ", params.dropout);
-    return false;
-  }
-  return true;
-}
-
 inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
   if (!params.query.is_nested()) {
     return true;
@@ -239,8 +230,7 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask,
-      check_for_seq_len_1_nested_tensor,
-      check_for_non_zero_dropout};
+      check_for_seq_len_1_nested_tensor};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
diff --git a/test/test_transformers.py b/test/test_transformers.py
index 93a94a5604c91..939d91e7ee874 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -21,11 +21,8 @@
     TEST_WITH_ROCM,
     IS_WINDOWS,
     slowTest,
-    set_default_dtype,
-    gradcheck
+    set_default_dtype
 )
-
-from torch.testing._internal.common_methods_invocations import wrapper_set_seed
 from torch.testing._internal.common_cuda import TEST_CUDA, SM80OrLater
 
 if TEST_FAIRSEQ:
@@ -863,22 +860,11 @@ def rand_tensor(*shape):
                     actual = torch.ops.aten._scaled_dot_product_attention(
                         query, key, value, attn_mask, dropout_p, need_attn_weights, is_causal)
 
+            # freeze_rng_state() doesn't seem to work outside of CPU, so dropout makes the results incomparable.
+            # TODO: Do this skipping in a nicer way once the granular test skipping logic lands.
+            if dropout_p == 0.0 or device == 'cpu':
                 self.assertEqual(actual, expected)
 
-        if attn_mask_dim is None:
-            q = q.double().clone()
-            k = k.double().clone()
-            v = v.double().clone()
-            q.requires_grad_()
-            k.requires_grad_()
-            v.requires_grad_()
-
-            assert gradcheck(lambda *args, **kwargs: wrapper_set_seed(sdp_ref, *args, **kwargs),
-                             (q, k, v, attn_mask, dropout_p))
-            assert gradcheck(lambda *args, **kwargs:
-                             wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
-                             (q, k, v, attn_mask, dropout_p))
-
     @unittest.skipIf(TEST_WITH_CROSSREF, 'Fastpath not available with crossref')
     @torch.no_grad()
     def test_mask_check_fastpath(self):
@@ -1093,28 +1079,6 @@ def rand_tensor(shape):
         self.assertEqual(math_ref_test, math_ref_lp_test, atol=7e-3, rtol=7e-3)
         self.assertEqual(actual_test, math_ref_test, atol=5e-3, rtol=5e-3)
 
-    @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
-    @parametrize("contiguous_inputs", [True, False])
-    def test_efficient_attention_gradcheck(self, contiguous_inputs: bool):
-
-        batch_size, seq_len, num_heads, head_dim = 8, 8, 4, 64
-        query, key, value = torch.rand((batch_size, seq_len, 3 * num_heads * head_dim),
-                                       device="cuda", dtype=torch.float32, requires_grad=True).chunk(3, -1)
-        query = query.view(batch_size, -1, num_heads, head_dim)
-        key = key.view(batch_size, -1, num_heads, head_dim)
-        value = value.view(batch_size, -1, num_heads, head_dim)
-
-        if contiguous_inputs:
-            query = query.contiguous()
-            key = key.contiguous()
-            value = value.contiguous()
-
-        # Normally we would transpose the inputs but the fused kernels expect
-        # (batch, seq_len, num_heads, head_dim) bump the tolerance since we can only run kernel
-        # in fp32
-        assert gradcheck(lambda *args, **kwargs:
-                         wrapper_set_seed(torch.ops.aten._efficient_attention_forward, *args, **kwargs),
-                         (query, key, value, None, None, None, True, False), fast_mode=True, atol=8e-5, rtol=1e-3)
 
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     def test_sdp_runtime_dispatch(self):
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index a0892b32a8352..8349a308be35a 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2591,7 +2591,7 @@
 - name: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
   self: grad.reshape_symint(self.sym_sizes())
 
-# NestedTensor
+# Nested Tensor
 - name: _nested_tensor_from_tensor_list(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   list: "grad.defined()? at::unbind(grad) : std::vector<Tensor>(list.size())"
 
@@ -2612,11 +2612,6 @@
   nested_size: non_differentiable
   nested_strides: non_differentiable
 
-# Transformers
-- name:  _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
-  output_differentiability: [True, False]
-  query, key, value: _efficient_attention_backward(grad, query, key, value, result1, result0, causal)
-
 # fft
 - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
   self: fft_r2c_backward(grad, dim, normalization, onesided, self.sym_size(dim.back()))
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 3b43b8fb48634..001fd455e82ee 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -11944,8 +11944,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ),
     OpInfo(
         'nn.functional._scaled_dot_product_attention',
-        op=lambda *args, **kwargs:
-               wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
+        op=lambda inp, *args, **kwargs:
+               wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, inp, *args, **kwargs),
         sample_inputs_func=sample_inputs_scaled_dot_product_attention,
         dtypes=floating_types_and(torch.bfloat16),
         dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),

From dbabb9a83f162914c546f6cc8916c1c25baa8990 Mon Sep 17 00:00:00 2001
From: Chen Lai <chenlai@fb.com>
Date: Mon, 14 Nov 2022 20:16:43 -0800
Subject: [PATCH 0899/1922] Add debug handle to xnnpack schema (#89033)

As title, add three things to the schema
1. debug handle for each node
2. file identifier, so we can sanity check we are getting the xnnpack schema flatbuffers file, instead of other random binary
3. extension, so the dumped binary will end up with its own extension like `myschema.xnnpack` (maybe can have a better name) instead of the default extension `.bin`

Differential Revision: [D40906970](https://our.internmc.facebook.com/intern/diff/D40906970/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89033
Approved by: https://github.com/mcr229
---
 torch/csrc/jit/backends/xnnpack/serialization/schema.fbs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
index 3b4b53debd026..6f72e604d0c4d 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
+++ b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
@@ -54,6 +54,8 @@ union ValueUnion {
 
 table Node {
   node:NodeUnion;
+  // An int which can be linked back to the node in the origin graph
+  debug_handle:uint;
 }
 
 table Value {

From cf0fb0d0e21086e5622b2a74dc064b754c294a5d Mon Sep 17 00:00:00 2001
From: Chen Lai <chenlai@fb.com>
Date: Mon, 14 Nov 2022 20:16:45 -0800
Subject: [PATCH 0900/1922] Update xnnpack graph schema to use xnode and xvalue
 (#89036)

There are different nodes definition like [Node in autograd](https://www.internalfb.com/code/fbsource/fbcode/caffe2/torch/csrc/autograd/function.h?lines=108-609&reveal=108-609) and onnxnodes and etc. Understand namespace can be used where nodes from definition are used together, however it's still better to slightly differentiate the name.

Differential Revision: [D41002324](https://our.internmc.facebook.com/intern/diff/D41002324/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89036
Approved by: https://github.com/mcr229
---
 .../backends/xnnpack/compiler/xnn_compiler.cpp   | 16 ++++++++--------
 .../backends/xnnpack/serialization/schema.fbs    | 16 ++++++++--------
 .../xnnpack/serialization/serializer.cpp         |  4 ++--
 .../backends/xnnpack/serialization/serializer.h  |  4 ++--
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
index 3bbff2309904d..4147edf90e85d 100644
--- a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
@@ -46,10 +46,10 @@ XNNExecutor XNNCompiler::compileModel(std::string ser_model) {
   // a new mapping from the old ids to the newly created ones
   std::unordered_map<uint32_t, uint32_t> remapped_ids;
 
-  for (auto value : *flatbuffer_graph->values()) {
-    switch (value->value_type()) {
-      case fb_xnnpack::ValueUnion::XNNTensorValue: {
-        auto tensor_value = value->value_as_XNNTensorValue();
+  for (auto value : *flatbuffer_graph->xvalues()) {
+    switch (value->xvalue_type()) {
+      case fb_xnnpack::XValueUnion::XNNTensorValue: {
+        auto tensor_value = value->xvalue_as_XNNTensorValue();
 
         const void* data_ptr = nullptr;
         auto buffer_idx = tensor_value->constant_buffer_idx();
@@ -85,10 +85,10 @@ XNNExecutor XNNCompiler::compileModel(std::string ser_model) {
     }
   }
 
-  for (auto node : *flatbuffer_graph->nodes()) {
-    switch (node->node_type()) {
-      case fb_xnnpack::NodeUnion::XNNAdd: {
-        auto graph_node = node->node_as_XNNAdd();
+  for (auto node : *flatbuffer_graph->xnodes()) {
+    switch (node->xnode_type()) {
+      case fb_xnnpack::XNodeUnion::XNNAdd: {
+        auto graph_node = node->xnode_as_XNNAdd();
         status = xnn_define_add2(
             subgraph_ptr,
             output_min,
diff --git a/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
index 6f72e604d0c4d..87ebe20a825a6 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
+++ b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
@@ -44,22 +44,22 @@ table XNNTensorValue {
   id_out:uint;
 }
 
-union NodeUnion {
+union XNodeUnion {
   XNNAdd,
 }
 
-union ValueUnion {
+union XValueUnion {
   XNNTensorValue,
 }
 
-table Node {
-  node:NodeUnion;
+table XNode {
+  xnode:XNodeUnion;
   // An int which can be linked back to the node in the origin graph
   debug_handle:uint;
 }
 
-table Value {
-  value:ValueUnion;
+table XValue {
+  xvalue:XValueUnion;
 }
 
 table XNNAdd {
@@ -72,8 +72,8 @@ table XNNAdd {
 table XNNGraph {
   // Schema version.
   version:string;
-  nodes:[Node];
-  values:[Value];
+  xnodes:[XNode];
+  xvalues:[XValue];
 
   // Ids of external inputs
   input_ids:[uint];
diff --git a/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp b/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
index 306884a894568..df1ccc791781a 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
+++ b/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
@@ -20,7 +20,7 @@ void XNNSerializer::serializeAddNode(
   const auto addNode =
       CreateXNNAdd(_builder, input1_id, input2_id, output_id, flags);
   const auto flatbufferNode =
-      CreateNode(_builder, NodeUnion::XNNAdd, addNode.Union());
+      CreateXNode(_builder, XNodeUnion::XNNAdd, addNode.Union());
   _nodes.push_back(flatbufferNode);
 }
 
@@ -61,7 +61,7 @@ void XNNSerializer::serializeTensorValue(
       id_out);
 
   const auto flatbufferValue =
-      CreateValue(_builder, ValueUnion::XNNTensorValue, tensorValue.Union());
+      CreateXValue(_builder, XValueUnion::XNNTensorValue, tensorValue.Union());
   _values.push_back(flatbufferValue);
 }
 
diff --git a/torch/csrc/jit/backends/xnnpack/serialization/serializer.h b/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
index 3d6927f7678b8..6d01571d424d3 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
+++ b/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
@@ -61,10 +61,10 @@ class XNNSerializer {
   flatbuffers_fbsource::FlatBufferBuilder _builder;
 
   // Vector of the serialized xnnpack nodes
-  std::vector<flatbuffers_fbsource::Offset<Node>> _nodes;
+  std::vector<flatbuffers_fbsource::Offset<XNode>> _nodes;
 
   // Vector of the serialized xnnpack values
-  std::vector<flatbuffers_fbsource::Offset<Value>> _values;
+  std::vector<flatbuffers_fbsource::Offset<XValue>> _values;
 
   std::vector<flatbuffers_fbsource::Offset<Buffer>> _constantBuffer;
   std::vector<uint32_t> _bufferSizes;

From 80fa2624ab064c8c8809d295f259eef5bf7c186a Mon Sep 17 00:00:00 2001
From: Masaki Kozuki <mkozuki@nvidia.com>
Date: Tue, 15 Nov 2022 13:21:39 +0000
Subject: [PATCH 0901/1922] [c10d] Implement `__instancecheck__` for
 `c10d::ReduceOp` (#88275)

Summary:
- Customize the metaclass of `torch.distributed.distributed_c10d.ReduceOp` for the sake of custom `__instancecheck__`
- Add `copy.copy`, `copy.deepcopy`, and `pickle` support with tests

Rel:
- #81272
- #84243
- #87191
- #87303
- #87555

Ref:
- https://github.com/pybind/pybind11/issues/2696

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88275
Approved by: https://github.com/wanchaol
---
 test/distributed/test_c10d_common.py          |  32 ++++-
 test/distributed/test_c10d_nccl.py            |  18 ++-
 torch/_C/_distributed_c10d.pyi                |   2 +
 .../distributed/c10d/ProcessGroupNCCL.cpp     |   7 +-
 torch/csrc/distributed/c10d/Types.hpp         |   9 +-
 torch/csrc/distributed/c10d/init.cpp          | 117 ++++++++++++++++--
 torch/distributed/distributed_c10d.py         |  11 --
 7 files changed, 156 insertions(+), 40 deletions(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index a43b1343923c8..c03a68228990a 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -2,6 +2,7 @@
 
 import copy
 import os
+import pickle
 import sys
 import tempfile
 import threading
@@ -1657,15 +1658,44 @@ def comm_fn(tensor, group=None):
 
 class ReduceOpTest(TestCase):
 
+    # Ref: https://github.com/pytorch/pytorch/issues/87191
     def test_op_isinstance_of_reduceop(self):
         for reduce_op in (
             c10d.ReduceOp.SUM, c10d.ReduceOp.AVG, c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX,
             c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR,
         ):
             self.assertTrue(isinstance(reduce_op, c10d.ReduceOp))
-        for scale in ([torch.tensor(1.0)], 2.0):
+        for scale in (torch.tensor(1.0), 2.0):
             self.assertTrue(isinstance(dist._make_nccl_premul_sum(scale), c10d.ReduceOp))
 
+    # Ref: https://github.com/pytorch/pytorch/pull/87303#discussion_r1002879700
+    def test_reduceop_copyable(self):
+        for reduce_op in (
+            c10d.ReduceOp.SUM, c10d.ReduceOp.AVG, c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX,
+            c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR,
+        ):
+            self.assertEqual(copy.copy(reduce_op), reduce_op)
+            self.assertEqual(copy.deepcopy(reduce_op), reduce_op)
+            self.assertEqual(copy.copy(c10d.ReduceOp(reduce_op)), reduce_op)
+            self.assertEqual(copy.deepcopy(c10d.ReduceOp(reduce_op)), reduce_op)
+
+        for scale in (torch.tensor(1.0), 2.0):
+            reduce_op = dist._make_nccl_premul_sum(scale)
+            self.assertEqual(copy.copy(reduce_op), reduce_op)
+            self.assertEqual(copy.deepcopy(reduce_op), reduce_op)
+
+    def test_reduceop_pickle(self):
+        for reduce_op in (
+            c10d.ReduceOp.SUM, c10d.ReduceOp.AVG, c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX,
+            c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR,
+        ):
+            pickle.loads(pickle.dumps(reduce_op))
+            orig = c10d.ReduceOp(reduce_op)
+            self.assertEqual(pickle.loads(pickle.dumps(orig)), orig)
+        for scale in (torch.tensor(1.0), 2.0):
+            reduce_op = dist._make_nccl_premul_sum(scale)
+            self.assertEqual(pickle.loads(pickle.dumps(reduce_op)), reduce_op)
+
 
 if __name__ == "__main__":
     assert (
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index c514ea4ab31fd..cdc167bc4d1ae 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -348,16 +348,14 @@ def allreduce(tensors, op):
         # Premul Sum
         if torch.cuda.nccl.version() >= (2, 11, 1):
             for dtype in torch.half, torch.float, torch.double:
-                for factor in (3.0,
-                               (torch.tensor([5.0], device=local_device_id, dtype=dtype),)):
+                for factor in (3.0, torch.tensor([5.0], device=local_device_id, dtype=dtype)):
                     tensors = [torch.tensor([self.rank + 1]).cuda(local_device_id).to(dtype=dtype)]
 
                     allreduce(tensors, c10d._make_nccl_premul_sum(factor))
 
-                    f = factor if isinstance(factor, float) else factor[0]
                     # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
                     self.assertEqualIgnoreType(
-                        f * torch.tensor([float(self.world_size * (self.world_size + 1) / 2)], device=local_device_id),
+                        factor * torch.tensor([float(self.world_size * (self.world_size + 1) / 2)], device=local_device_id),
                         tensors[0],
                     )
 
@@ -435,9 +433,9 @@ def reduce(xs, rootRank, rootTensor, op=None):
 
             # Premul sum
             if torch.cuda.nccl.version() >= (2, 11, 1):
-                for factor in (3.0, (torch.tensor([5.0], device=local_device_id),)):
-                    if isinstance(factor, tuple):
-                        factor_ref = factor[0].cpu().item()
+                for factor in (3.0, torch.tensor([5.0], device=local_device_id)):
+                    if isinstance(factor, torch.Tensor):
+                        factor_ref = factor.cpu().item()
                     else:
                         factor_ref = factor
                     float_tensors = [
@@ -933,9 +931,9 @@ def perm(n, k):
         self.assertEqualIgnoreType(expected, output_tensor)
 
         if torch.cuda.nccl.version() >= (2, 11, 1):
-            for factor in (3.0, (torch.tensor([5.0], device=self.rank),),):
-                if isinstance(factor, tuple):
-                    factor_ref = factor[0].cpu().item()
+            for factor in (3.0, torch.tensor([5.0], device=self.rank)):
+                if isinstance(factor, torch.Tensor):
+                    factor_ref = factor.cpu().item()
                 else:
                     factor_ref = factor
                 output = [t.float() for t in output]
diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi
index 56b86bd504bf6..f16a8ec362f50 100644
--- a/torch/_C/_distributed_c10d.pyi
+++ b/torch/_C/_distributed_c10d.pyi
@@ -91,6 +91,8 @@ class DebugLevel(Enum):
 
 class ReduceOp:
 
+    def __init__(self, op: "RedOpType"): ...
+
     SUM = ...
     PRODUCT = ...
     MIN = ...
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 1d788a2c2e0c7..387fe5eb4dcc7 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -83,11 +83,10 @@ ncclRedOpRAII unpackPreMulSum(
   const auto* preMulSupplement =
       reinterpret_cast<NCCLPreMulSumSupplement*>(reduceOp.supplement_.get());
   ncclRedOp_t preMulSum;
-  bool has_tensor = !preMulSupplement->tensor_factors.empty();
+  bool has_tensor = preMulSupplement->tensor_factor.defined();
   auto residence = has_tensor ? ncclScalarDevice : ncclScalarHostImmediate;
-  T* ptr_factor = has_tensor
-      ? preMulSupplement->tensor_factors[dev_in_group].data_ptr<T>()
-      : nullptr;
+  T* ptr_factor =
+      has_tensor ? preMulSupplement->tensor_factor.data_ptr<T>() : nullptr;
   T scalar_factor = T(preMulSupplement->double_factor);
   ncclRedOpCreatePreMulSum(
       &preMulSum,
diff --git a/torch/csrc/distributed/c10d/Types.hpp b/torch/csrc/distributed/c10d/Types.hpp
index 64fbc45c6588c..be20fcadba645 100644
--- a/torch/csrc/distributed/c10d/Types.hpp
+++ b/torch/csrc/distributed/c10d/Types.hpp
@@ -8,6 +8,7 @@
 #include <ATen/core/ivalue.h>
 #include <ATen/core/Tensor.h>
 
+#include <c10/macros/Macros.h>
 #include <c10/util/intrusive_ptr.h>
 
 namespace c10d {
@@ -21,9 +22,11 @@ struct TORCH_API _SupplementBase : torch::CustomClassHolder {
 // The point of use in ProcessGroupNCCL knows how to unpack it.
 struct NCCLPreMulSumSupplement : _SupplementBase {
   double double_factor{0.0};
-  std::vector<at::Tensor> tensor_factors;
+  at::Tensor tensor_factor;
   NCCLPreMulSumSupplement(double f) : double_factor{f} {}
-  NCCLPreMulSumSupplement(std::vector<at::Tensor> f) : tensor_factors{std::move(f)} {}
+  NCCLPreMulSumSupplement(at::Tensor t) : tensor_factor{std::move(t)} {
+    TORCH_CHECK_EQ(tensor_factor.numel(), 1);
+  }
 };
 
 // Other ReduceOps that need different supplementary data can also
@@ -60,7 +63,7 @@ struct TORCH_API ReduceOp : torch::CustomClassHolder {
     }
   }
 
-  // The heap resource supplement_, if it exists, is managed by a shared_ptr,
+  // The heap resource supplement_, if it exists, is managed by a c10::intrusive_ptr,
   // so constructors and operator= can be simple
   ReduceOp(const ReduceOp& other) :
     op_(other.op_), supplement_(other.supplement_) {}
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 313aabee7cd98..d39fc322d326b 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1,6 +1,7 @@
 #include <torch/csrc/python_headers.h>
 
 #include <c10/util/intrusive_ptr.h>
+#include <c10/util/string_view.h>
 #include <torch/csrc/distributed/c10d/FileStore.hpp>
 #include <torch/csrc/distributed/c10d/TCPStore.hpp>
 #include <torch/csrc/distributed/c10d/Utils.hpp>
@@ -235,6 +236,61 @@ void _register_builtin_comm_hook(
   reducer.register_builtin_comm_hook(comm_hook_type);
 }
 
+// Customize the metaclass of ::c10d::ReduceOp for the backward compatibility.
+// https://github.com/pytorch/pytorch/pull/84243 changed ::c10d::ReduceOp to
+// struct from enum, sacrificing some of the Python built-in function supports
+// such as `isinstance` (see https://github.com/pytorch/pytorch/issues/87191)
+// and `copy` (see
+// https://github.com/pytorch/pytorch/pull/87303#discussion_r1002879700). Below,
+// we define a custom `isinstance` in CPython/pybind11
+// (`reduceopmeta___instancecheck__`) and modify the default metaclass of
+// pybind11 (`GetReduceOpMetaclass`) so that
+// `isinstance(torch.distributed.ReduceOp.SUM, torch.distributed.ReduceOp)`
+// returns :obj:`True` as if `ReduceOp` is enum.
+// Ref:
+//   - https://docs.python.org/3/extending/newtypes_tutorial.html
+//   - https://docs.python.org/3/c-api/typeobj.html?highlight=tp_methods
+//   - https://github.com/pybind/pybind11/issues/2696
+static PyObject* reduceopmeta___instancecheck__(
+    PyObject* self,
+    PyObject* args) {
+  if (Py_TYPE(self) == Py_TYPE(args)) {
+    Py_RETURN_TRUE;
+  }
+  if (c10::string_view(args->ob_type->tp_name).find("RedOpType") !=
+      c10::string_view::npos) {
+    Py_RETURN_TRUE;
+  }
+  Py_RETURN_FALSE;
+}
+static PyMethodDef reduceopmeta_methods[] = {
+    {"__instancecheck__",
+     (PyCFunction)reduceopmeta___instancecheck__,
+     METH_O,
+     "Custom `__instancecheck__` for ReduceOp"},
+    {NULL, NULL}};
+PyTypeObject* GetReduceOpMetaclass() {
+  static auto* metaclass = [] {
+    PyTypeObject* base_metaclass =
+        pybind11::detail::get_internals().default_metaclass;
+    PyType_Slot slots[] = {
+        {Py_tp_base, base_metaclass},
+        {Py_tp_methods, reduceopmeta_methods},
+        {0},
+    };
+    PyType_Spec spec = {};
+    spec.name = "torch._C._distributed_c10d._ReduceOpMeta";
+    spec.basicsize = base_metaclass->tp_basicsize;
+    spec.flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE;
+    spec.slots = slots;
+    PyTypeObject* metaclass = (PyTypeObject*)PyType_FromSpec(&spec);
+    if (!metaclass)
+      throw py::error_already_set();
+    return metaclass;
+  }();
+  return metaclass;
+}
+
 PyObject* c10d_init(PyObject* _unused, PyObject* noargs) {
   C10_LOG_API_USAGE_ONCE("c10d.python.import");
 
@@ -520,7 +576,8 @@ An enum-like class for built-in communication hooks: ``ALLREDUCE`` and ``FP16_CO
   //    making `PREMUL_SUM` callable, i.e., allowing for
   //    `ReduceOp.PREMUL_SUM(scale)` might be better as per @wanchaol.
   // https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types
-  py::class_<::c10d::ReduceOp> reduce_op(module, "ReduceOp", R"(
+  py::class_<::c10d::ReduceOp> reduce_op(
+      module, "ReduceOp", py::metaclass((PyObject*)GetReduceOpMetaclass()), R"(
 An enum-like class for available reduction operations: ``SUM``, ``PRODUCT``,
 ``MIN``, ``MAX``, ``BAND``, ``BOR``, ``BXOR``, and ``PREMUL_SUM``.
 
@@ -562,14 +619,51 @@ This class does not support ``__members__`` property.)");
           [](const ::c10d::ReduceOp& self, const ::c10d::ReduceOp& other) {
             return self == other.op_;
           })
-      .def("__hash__", [](const ::c10d::ReduceOp& self) {
-        return static_cast<uint8_t>(self.op_);
-      });
-
-  // note(crcrpar): Deliberately skip
-  // [`export_values`](https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types)
-  // here and manually set values in Python side. See note "ReduceOp static
-  // class attributes to support `isinstance`"
+      .def(
+          "__hash__",
+          [](const ::c10d::ReduceOp& self) {
+            return static_cast<uint8_t>(self.op_);
+          })
+      .def(
+          "__copy__",
+          [](const ::c10d::ReduceOp& self) { return ::c10d::ReduceOp(self); })
+      .def(
+          "__deepcopy__",
+          [](const ::c10d::ReduceOp& self, const py::dict& memo) {
+            return ::c10d::ReduceOp(self);
+          })
+      .def(py::pickle(
+          [](const ::c10d::ReduceOp& r) {
+            // __getstate__
+            if (r.op_ != ::c10d::ReduceOp::RedOpType::PREMUL_SUM) {
+              return py::make_tuple(r.op_, py::none());
+            }
+            TORCH_CHECK(r.supplement_.defined(), "Invalid PREMUL_SUM ReduceOp");
+            const auto* preMulSupplement =
+                reinterpret_cast<::c10d::NCCLPreMulSumSupplement*>(
+                    r.supplement_.get());
+            if (!preMulSupplement->tensor_factor.defined()) {
+              return py::make_tuple(r.op_, preMulSupplement->double_factor);
+            } else {
+              return py::make_tuple(r.op_, preMulSupplement->tensor_factor);
+            }
+          },
+          [](const py::tuple t) {
+            // __setstate__
+            TORCH_CHECK(t.size() == 2, "Invalid state");
+            const auto op =
+                static_cast<::c10d::ReduceOp::RedOpType>(t[0].cast<uint8_t>());
+            if (op != ::c10d::ReduceOp::RedOpType::PREMUL_SUM) {
+              return ::c10d::ReduceOp(op);
+            }
+            const auto preMulSupplement_factor = t[1];
+            if (py::isinstance<py::float_>(preMulSupplement_factor)) {
+              return ::c10d::makeNCCLPreMulSum(t[1].cast<double>());
+            } else {
+              return ::c10d::makeNCCLPreMulSum(t[1].cast<at::Tensor>());
+            }
+          }));
+
   py::enum_<::c10d::ReduceOp::RedOpType>(reduce_op, "RedOpType")
       .value("SUM", ::c10d::ReduceOp::RedOpType::SUM)
       .value("AVG", ::c10d::ReduceOp::RedOpType::AVG)
@@ -579,7 +673,8 @@ This class does not support ``__members__`` property.)");
       .value("BAND", ::c10d::ReduceOp::RedOpType::BAND)
       .value("BOR", ::c10d::ReduceOp::RedOpType::BOR)
       .value("BXOR", ::c10d::ReduceOp::RedOpType::BXOR)
-      .value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM);
+      .value("PREMUL_SUM", ::c10d::ReduceOp::RedOpType::PREMUL_SUM)
+      .export_values();
 
   // note(crcrpar): This could be removed because users will not pass
   // `RedOpType` to reduce collective ops Ref: [Implicit
@@ -597,7 +692,7 @@ This class does not support ``__members__`` property.)");
           py::call_guard<py::gil_scoped_release>())
       .def(
           "_make_nccl_premul_sum",
-          &::c10d::makeNCCLPreMulSum<std::vector<at::Tensor>>,
+          &::c10d::makeNCCLPreMulSum<at::Tensor>,
           py::arg("factor").noconvert(),
           py::return_value_policy::copy, // seems safest
           py::call_guard<py::gil_scoped_release>());
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 33569f5169e5d..f46aaaef94ef4 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -237,17 +237,6 @@ def register_backend(cls, name, func, extended_api=False):
 dist_backend = Backend
 
 
-# NOTE(crcrpar): [ReduceOp static class attributes to support `isinstance`]
-#   A ReduceOp instance of `PREMUL_SUM` is supposed to be created via `_make_nccl_premul_sum`
-#   while the other `op`s (meaning RedOpType members) can be directly passed to c10d reduce collectives.
-#   I changed `ReduceOp` to struct from enum class and introduced RedOpType enum class for PREMUL_SUM,
-#   which broke an implicit contract of ReduceOp being enum-like with which users apply isinstance to
-#   `op`, for example, `isinstance(ReduceOp.SUM, ReduceOp)`: https://github.com/pytorch/pytorch/issues/87191
-DENY_LIST = ("PREMUL_SUM", )
-for _red_op_name, _red_op_value in ReduceOp.RedOpType.__members__.items():
-    setattr(ReduceOp, _red_op_name, _red_op_value if _red_op_name in DENY_LIST else ReduceOp(_red_op_value))
-
-
 class _reduce_op(object):
     r"""
     Deprecated enum-like class for reduction operations: ``SUM``, ``PRODUCT``,

From 1b9f46a9f0c7a9bb8a702674b617069ff7af0553 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Tue, 15 Nov 2022 01:06:23 +0000
Subject: [PATCH 0902/1922] Symintify decomps for split and upsample_bilinear;
 Fix decomp for _softmax_backward_data and native_dropout_backward (#88761)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88761
Approved by: https://github.com/ezyang
---
 test/dynamo/test_dynamic_shapes.py            | 12 ---
 test/functorch/test_aotdispatch.py            |  4 -
 test/functorch/test_ops.py                    |  4 +
 test/functorch/test_vmap.py                   |  3 +
 test/inductor/test_torchinductor_opinfo.py    |  1 +
 test/test_decomp.py                           |  3 +
 test/test_proxy_tensor.py                     | 22 +++--
 torch/_decomp/decompositions.py               | 98 +++++++++++++++----
 .../_internal/common_methods_invocations.py   | 94 ++++++++++++++----
 9 files changed, 177 insertions(+), 64 deletions(-)

diff --git a/test/dynamo/test_dynamic_shapes.py b/test/dynamo/test_dynamic_shapes.py
index 294ea9e549522..f3964a777aa82 100644
--- a/test/dynamo/test_dynamic_shapes.py
+++ b/test/dynamo/test_dynamic_shapes.py
@@ -106,23 +106,11 @@ def make_dynamic_cls(cls):
     DynamicShapesUnspecTests.test_unspec_float_precision_dynamic_shapes
 )
 
-# DynamicShapesReproTests
-unittest.expectedFailure(
-    DynamicShapesReproTests.test_reformer_eval_dynamic_shapes
-    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
-)
-
 unittest.expectedFailure(
     DynamicShapesReproTests.test_reformer_sorting_dynamic_shapes
     # Unable to cast Python instance to C++ type
 )
 
-unittest.expectedFailure(
-    DynamicShapesReproTests.test_reformer_train_dynamic_shapes
-    # TypeError: 'torch._C.SymIntNode' object cannot be interpreted as an integer
-)
-
-
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index e31ac58039ec8..eb34a3fb75828 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1005,7 +1005,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cdist', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('cholesky_inverse', ''),  # could not find kernel
     xfail('cholesky_solve', ''),  # could not find kernel
-    xfail('chunk', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('column_stack', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('combinations', ''),  # aten.masked_select.default
     xfail('complex', ''),  # aten.view_as_real.default - couldn't find symbolic meta function/decomposition
@@ -1137,7 +1136,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.zeros_like.default - couldn't find symbolic meta...
     xfail('nn.functional.interpolate', 'area'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'bicubic'),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.interpolate', 'bilinear'),  # Cannot call sizes() on tensor with symbolic sizes/str...
     xfail('nn.functional.interpolate', 'linear'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'nearest'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'trilinear'),  # Cannot call sizes() on tensor with symbolic sizes/st...
@@ -1164,7 +1162,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.rrelu', ''),  # aten.rrelu_with_noise.default - couldn't find symbolic meta function...
     xfail('nn.functional.smooth_l1_loss', ''),  # could not find kernel
     xfail('nn.functional.unfold', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.upsample_bilinear', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.upsample_nearest', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', 'nuc'),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -1197,7 +1194,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('sgn', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('special.i1', ''),  # aten.i0.default - couldn't find symbolic meta function/decomposition
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic ...
-    xfail('split', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('std', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('std_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('stft', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 2e303922dfa1c..643ff0ec862a2 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -1055,6 +1055,7 @@ def test():
         xfail('segment_reduce', 'lengths'),
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
+        xfail("native_dropout_backward"),
     }))
     def test_vmapvjp_has_batch_rule(self, device, dtype, op):
         if not op.supports_autograd:
@@ -1220,6 +1221,8 @@ def get_vjp(cotangents, *primals):
         xfail('segment_reduce', 'offsets'),  # NYI: forward-AD for segment_reduce
         xfail('index_reduce', ''),  # NYI: forward-AD for index_reduce
         xfail('segment_reduce', 'lengths'),  # NYI: forward-AD for segment_reduce
+        xfail('native_dropout_backward'),  # NYI
+
     }))
     @opsToleranceOverride('TestOperators', 'test_jvpvjp', (
         tol1('masked.prod',
@@ -1377,6 +1380,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         # input while the running_mean or running_var, which will be updated in
         # place, were not batched.
         xfail("native_batch_norm"),
+        xfail('native_dropout_backward',)
     }))
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index fb8722b8405bf..0c38c5101cf86 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3239,6 +3239,7 @@ def test():
         xfail('broadcast_shapes', ''),  # test runner can't handle non-Tensor ops
         xfail('sparse.sampled_addmm'),  # sparse
         xfail('cross'),  # The default value of dim in op is *very* weird. No wonder it doesn't work
+        skip('_softmax_backward_data'),
         skip('linalg.eigh', ''),  # not unique, see test_linalg_eigh for manual test
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         # ----------------------------------------------------------------------
@@ -3380,6 +3381,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('bernoulli', ''),
         xfail('linalg.lu_factor', ''),
         xfail('nn.functional.feature_alpha_dropout', 'with_train'),
+        xfail('native_dropout_backward'),
         xfail('nn.functional.kl_div', ''),
         xfail('multinomial', ''),
         xfail('column_stack', ''),
@@ -3453,6 +3455,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('equal', ''),
         xfail('linalg.lu', ''),
         skip('linalg.ldl_solve', ''),
+        skip('_softmax_backward_data'),
     }))
     def test_op_has_batch_rule(self, device, dtype, op):
         # needs to be fixed
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 4e706bab0ea6c..83d8d40e21ecf 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -430,6 +430,7 @@ def wrapper_set_seed(op, *args, **kwargs):
     "randn": {"assert_equal": False},
     ("nn.functional.tanhshrink", "cuda", f16): {"atol": 3e-4, "rtol": 0.001},
     ("cummax", "cuda", f16): {"atol": 5e-4, "rtol": 0.002},
+    ("_softmax_backward_data", "cuda", f16): {"atol": 0.008, "rtol": 0.002},
     "gradient": {"check_gradient": False},  # segfault on check_gradient
     # Following tests failed, and causing subsequent tests failing with unrecoverable CUDA error
     "linalg.solve_triangular": {"check_gradient": False},
diff --git a/test/test_decomp.py b/test/test_decomp.py
index 67e99d5eb8291..a3658792c5e71 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -294,6 +294,9 @@ def normalize_op_input_output(f, sample, requires_grad=True):
     (None, None, "meshgrid"),
     # diag was not decomposed (it just registers a decomp for diag_out, torch.diag is CompImplicit)
     (None, None, "diag"),
+
+    # _softmax_backward_data's CPU kernel for bfloat16 always return the grad_input as float32
+    ("cpu", torch.bfloat16, "_softmax_backward_data"),
 }
 
 CROSS_REF_BACKWARD_EXCLUDE_SET = {
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 42ecc3d376ab8..894b35693430e 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1124,7 +1124,6 @@ def f(a, b, c, d, e):
     xfail('cartesian_prod', ''),  # Tensors of type TensorImpl do not have numel
     xfail('cdist', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('cholesky_solve', ''),  # Could not run 'aten::_cholesky_solve_helper' with arguments from the 'Meta' back...
-    xfail('chunk', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('column_stack', ''),  # Tensors of type TensorImpl do not have numel
     xfail('combinations', ''),
     xfail('count_nonzero', ''),  # Could not run 'aten::count_nonzero.dim_IntList' with arguments from the 'Meta' ba...
@@ -1247,7 +1246,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.empty_like.default - couldn't find symbolic meta function/deco...
     xfail('nn.functional.interpolate', 'area'),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.interpolate', 'bicubic'),  # aten.upsample_bicubic2d.vec - couldn't find symbolic meta function/d...
-    xfail('nn.functional.interpolate', 'bilinear'),  # aten.upsample_bilinear2d.vec - couldn't find symbolic meta function...
     xfail('nn.functional.interpolate', 'linear'),  # aten.upsample_linear1d.vec - couldn't find symbolic meta function/dec...
     xfail('nn.functional.interpolate', 'nearest'),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/d...
     xfail('nn.functional.interpolate', 'trilinear'),  # aten.upsample_trilinear3d.vec - couldn't find symbolic meta functi...
@@ -1267,7 +1265,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.unfold', ''),  # aten.im2col.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.upsample_bilinear', ''),  # aten.upsample_bilinear2d.vec - couldn't find symbolic meta function/de...
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
     xfail('nonzero', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
     xfail('norm', 'nuc'),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -1313,7 +1310,6 @@ def f(a, b, c, d, e):
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic meta function/...
     xfail('special.scaled_modified_bessel_k0', ''),  # aten.special_scaled_modified_bessel_k0.default - couldn't find symbo...
     xfail('special.scaled_modified_bessel_k1', ''),  # aten.special_scaled_modified_bessel_k1.default - couldn't find symbo...
-    xfail('split', ''),  # 'torch._C.SymIntNode' and 'int'
     xfail('stft', ''),  # argument 'size' must be tuple of ints, but found element of type torch._C.SymIntNode at...
     xfail('sum_to_size', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('svd', ''),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -1439,10 +1435,13 @@ def _fn(t, *args, **kwargs):
     return _fn
 
 def _test_make_fx_helper(self, device, dtype, op, tracing_mode, inplace=False):
-    def f(args, kwargs, extra_args):
+    def f(args, kwargs, extra_args, extra_kwargs):
         if extra_args:
             for i, t in extra_args:
                 args[i] = t.size()
+        if extra_kwargs:
+            for k, t in extra_kwargs.items():
+                kwargs[k] = t.size()
 
         fn = _get_safe_inplace(op.get_inplace()) if inplace else op.op
         return fn(*args, **kwargs)
@@ -1463,23 +1462,26 @@ def f(args, kwargs, extra_args):
         # - Unpack the size in the wrapper to get a torch.Size with dynamic shapes (in
         #   symbolic mode, a no-op otherwise)
         extra_args = []
+        extra_kwargs = {}
         for i, arg in enumerate(args):
             if isinstance(arg, torch.Size):
-                extra_args.append((i, torch.empty((), device="cpu").expand(arg)))
-        # TODO: support kwargs
+                extra_args.append((i, torch.empty(arg, device="cpu")))
+        for key, value in kwargs.items():
+            if isinstance(value, torch.Size):
+                extra_kwargs[key] = torch.empty(value, device="cpu")
 
         try:
-            new_f = make_fx(f, tracing_mode=tracing_mode)(args, kwargs, extra_args)
+            new_f = make_fx(f, tracing_mode=tracing_mode)(args, kwargs, extra_args, extra_kwargs)
         except DynamicOutputShapeException as e:
             self.skipTest("Dynamic output shape operation in trace")
         for arg in args:
             if isinstance(arg, torch.Tensor) and arg.dtype == torch.float:
                 arg.uniform_(0, 1)
         try:
-            old_out = f(args, kwargs, extra_args)
+            old_out = f(args, kwargs, extra_args, extra_kwargs)
         except Exception:
             continue
-        new_out = wrapper_set_seed(new_f, args, kwargs, extra_args)
+        new_out = wrapper_set_seed(new_f, args, kwargs, extra_args, extra_kwargs)
         self.assertEqual(new_out, old_out)
 
 class TestProxyTensorOpInfo(TestCase):
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 1a2d332e99fd9..7c84cb7e2ca8b 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -4,7 +4,7 @@
 from enum import Enum
 from functools import partial, reduce
 from itertools import product
-from typing import Callable, cast, Iterable, List, Optional, Tuple
+from typing import Callable, cast, Iterable, List, Optional, Tuple, Union
 
 import torch
 import torch._prims_common as utils
@@ -13,6 +13,7 @@
 from torch._decomp import register_decomposition
 from torch._prims_common import IntLike, NumberType, TensorLike, TensorSequenceType
 from torch._prims_common.wrappers import _maybe_resize_out, _safe_copy_out, out_wrapper
+from torch.fx.experimental.symbolic_shapes import guard_int, sym_float, sym_int
 from torch.utils._pytree import tree_flatten, tree_map
 
 DispatchKey = torch._C.DispatchKey  # type: ignore[attr-defined]
@@ -696,7 +697,12 @@ def _softmax_backward_data(
     grad_input = new_grad_output - output * torch.sum(
         new_grad_output, dim=dim, keepdim=True
     )
-    return _cast_grad_to_input_dtype(grad_output, grad_input, input_dtype)
+
+    # CPU kernel doesn't respect input_dtype, but following check doesn't work for meta tensor
+    # if grad_output.device == torch.device("cpu"):
+    #     return grad_input.contiguous()
+
+    return _cast_grad_to_input_dtype(grad_output, grad_input, input_dtype).contiguous()
 
 
 @register_decomposition(aten._log_softmax_backward_data)
@@ -912,9 +918,17 @@ def check_positive(param, param_name, strict=True):
 
 
 @register_decomposition(aten.native_dropout_backward)
-@pw_cast_for_opmath
 def native_dropout_backward(grad_output: Tensor, mask: Tensor, scale: float):
-    return grad_output * (mask.type_as(grad_output) * scale)
+    # According to the CUDA kernel implementation we should have this test;
+    # but it seems to fail tests!
+    # utils.check(mask.dtype == torch.bool, lambda: f"Mask should be Bool Scalar Type {mask.dtype}")
+
+    # Mimicking CUDA kernel's behavior for output stride: output follow input's memory format
+    # This different from TensorIterator's behavior
+    r = (grad_output * (mask.type_as(grad_output) * scale)).clone(
+        memory_format=utils.suggest_memory_format(grad_output)
+    )
+    return r
 
 
 @register_decomposition(aten.unfold_backward)
@@ -1095,8 +1109,9 @@ def split(self: Tensor, split_size: int, dim: int = 0) -> List[Tensor]:
         assert dim_size == 0
         return [self]
     chunks = (dim_size + split_size - 1) // split_size
+    chunks = guard_int(chunks)
     split_sizes = [split_size for i in range(chunks)]
-    split_sizes[chunks - 1] = split_size - (split_size * chunks - dim_size)
+    split_sizes[-1] = split_size - (split_size * chunks - dim_size)
     return torch.split(self, split_sizes, dim)
 
 
@@ -1786,29 +1801,74 @@ def norm(
     return torch.linalg.vector_norm(self, p, dim, keepdim, dtype=dtype)
 
 
+# aten/src/ATen/native/UpSample.cpp compute_output_size
+def upsample_compute_output_size(input_size, output_size, scale_factors):
+    spatial_dimensions = len(input_size) - 2
+    if output_size is not None:
+        utils.check(
+            scale_factors is None,
+            lambda: "Must specify exactly one of output_size and scale_factors",
+        )
+        utils.check(len(output_size) == spatial_dimensions, lambda: "")
+        return output_size
+    if scale_factors is not None:
+        # NB: this isn't necessary lol
+        utils.check(
+            output_size is None,
+            lambda: "Must specify exactly one of output_size and scale_factors",
+        )
+        utils.check(len(scale_factors) == spatial_dimensions, lambda: "")
+        return [
+            # Returning output_size as float. We cannot convert it to int directly,
+            # as latter computation of scale_factor is relying output size being float
+            sym_float(input_size[i + 2] * scale_factors[i])
+            for i in range(spatial_dimensions)
+        ]
+    utils.check(
+        False, lambda: "Must specify exactly one of output_size and scale_factors"
+    )
+
+
+def get_scale_value(scales, idx):
+    if scales is None:
+        return None
+    return scales[idx]
+
+
 @register_decomposition(torch.ops.aten.upsample_bilinear2d.vec)
-@register_decomposition(torch.ops.aten.upsample_bilinear2d.vec, type="pre_autograd")
+@torch.ops.aten.upsample_bilinear2d.vec.py_impl(DispatchKey.CompositeImplicitAutograd)
+@torch.ops.aten.upsample_bilinear2d.vec.py_impl(DispatchKey.Autograd)
+def upsample_bilinear2d_vec(input, output_size, align_corners, scale_factors):
+    osize = upsample_compute_output_size(input.size(), output_size, scale_factors)
+    scale_h = get_scale_value(scale_factors, 0)
+    scale_w = get_scale_value(scale_factors, 1)
+
+    # NB: osize could be a list of float when scale_factors is float
+    # so we cannot redispatch to aten.upsample_bilinear2d.default here
+    return upsample_bilinear2d(input, osize, align_corners, scale_h, scale_w)
+
+
+@register_decomposition(torch.ops.aten.upsample_bilinear2d.default)
+@torch.ops.aten.upsample_bilinear2d.default.py_impl(DispatchKey.Autograd)
 @pw_cast_for_opmath
-def upsample_bilinear2d_vec(
+def upsample_bilinear2d(
     input: Tensor,
-    output_size: Optional[List[int]],
+    output_size: List[Union[int, float]],
     align_corners: bool,
-    scale_factors: Optional[List[float]],
+    scales_h: Optional[float] = None,
+    scales_w: Optional[float] = None,
 ) -> Tensor:
     # get dimensions of original image
     n_batch, n_channels, in_h, in_w = input.shape
 
-    if output_size is not None:
-        out_h = float(output_size[0])
-        out_w = float(output_size[1])
-    elif scale_factors is not None:
-        out_h = in_h * scale_factors[0]
-        out_w = in_w * scale_factors[1]
+    out_h = sym_float(output_size[0])
+    out_w = sym_float(output_size[1])
 
     # Calculate horizontal and vertical scaling factor
+    # TODO: Figure out if scales_h/scales_w matters here
     if out_h > 1:
         if align_corners:
-            h_scale_factor = (in_h - 1) / (int(out_h) - 1)
+            h_scale_factor = (in_h - 1) / (sym_int(out_h) - 1)
         else:
             h_scale_factor = in_h / out_h
     else:
@@ -1816,14 +1876,14 @@ def upsample_bilinear2d_vec(
 
     if out_w > 1:
         if align_corners:
-            w_scale_factor = (in_w - 1) / (int(out_w) - 1)
+            w_scale_factor = (in_w - 1) / (sym_int(out_w) - 1)
         else:
             w_scale_factor = in_w / out_w
     else:
         w_scale_factor = 0.0
 
-    i = torch.arange(int(out_h), dtype=input.dtype, device=input.device)
-    j = torch.arange(int(out_w), dtype=input.dtype, device=input.device)
+    i = torch.arange(sym_int(out_h), dtype=input.dtype, device=input.device)
+    j = torch.arange(sym_int(out_w), dtype=input.dtype, device=input.device)
 
     if align_corners:
         x = h_scale_factor * i
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 001fd455e82ee..0c59af77736af 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -406,6 +406,21 @@ def sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs):
     # running_mean and running_var are required in evaluation mode (training: False) but not in training mode
     yield SampleInput(make_arg((1, 2, 3)), args=(None, None, None, None), kwargs={'training': True})
 
+def sample_inputs_softmax_backward_data(op_info, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(
+        make_tensor, device=device, dtype=dtype, requires_grad=requires_grad
+    )
+    cases = [
+        ((S,), 0),
+        ((S, S), 0),
+        ((S, M, S), -1),
+    ]
+    input_dtypes = [dtype]
+    if dtype == torch.float and device == 'cuda':
+        input_dtypes += [torch.float16]
+
+    for (shape, dim), input_dtype in product(cases, input_dtypes):
+        yield SampleInput(make_arg(shape), make_arg(shape), dim, input_dtype)
 
 def sample_inputs_native_batch_norm(op_info, device, dtype, requires_grad, **kwargs):
     samples = sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs)
@@ -1173,7 +1188,7 @@ def sample_inputs_zero_(op_info, device, dtype, requires_grad, **kwargs):
     cases = ((), (S, S, S), (S,))
 
     for shape in cases:
-        yield(SampleInput(make_arg(shape)))
+        yield SampleInput(make_arg(shape))
 
 # TODO: add reduction kwargs
 def sample_inputs_multi_margin_loss(op_info, device, dtype, requires_grad, **kwargs):
@@ -3745,8 +3760,8 @@ def sample_inputs_upsample(mode, self, device, dtype, requires_grad, **kwargs):
 
     def shape(size, rank, with_batch_channel=True):
         if with_batch_channel:
-            return tuple([N, C] + ([size] * rank))
-        return tuple([size] * rank)
+            return torch.Size([N, C] + ([size] * rank))
+        return torch.Size([size] * rank)
 
     make_arg = partial(make_tensor, device=device, dtype=dtype,
                        requires_grad=requires_grad, low=-1, high=1)
@@ -5794,9 +5809,9 @@ def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=Fals
 
     if list_args:
         cases = (
-            ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)],)),
-            ((S, S, S), ([int(S / 2), S - int(S / 2) * 2, int(S / 2)], 2),),
-            ((S, S, S), ([int(S / 2), S - int(S / 2) * 2, int(S / 2)], -2),)
+            ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]),)),
+            ((S, S, S), (torch.Size([int(S / 2), S - int(S / 2) * 2, int(S / 2)]), 2),),
+            ((S, S, S), (torch.Size([int(S / 2), S - int(S / 2) * 2, int(S / 2)]), -2),)
         )
     else:
         cases = (  # type: ignore[assignment]
@@ -5811,10 +5826,10 @@ def sample_inputs_split(op_info, device, dtype, requires_grad, *, list_args=Fals
 def sample_inputs_split_with_sizes(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
 
-    cases = (((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)],)),
-             ((S, S, S), ([int(S / 3), S - int(S / 3), 0],)),
-             ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)], 2)),
-             ((S, S, S), ([int(S / 3), S - int(S / 3) * 2, int(S / 3)], -2)),
+    cases = (((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]),)),
+             ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3), 0]),)),
+             ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]), 2)),
+             ((S, S, S), (torch.Size([int(S / 3), S - int(S / 3) * 2, int(S / 3)]), -2)),
              )
 
     for shape, args in cases:
@@ -6190,7 +6205,7 @@ def sample_inputs_resize_ops(op_info, device, dtype, requires_grad, **kwargs):
         else:
             raise ValueError("sample_inputs_resize_ops is being used with incorrect operator")
 
-        yield(SampleInput(make_arg(shape, requires_grad=requires_grad), args=args))
+        yield SampleInput(make_arg(shape, requires_grad=requires_grad), args=args)
 
 def sample_inputs_view_reshape(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -6446,7 +6461,7 @@ def sample_inputs_expand(op_info, device, dtype, requires_grad, **kwargs):
 
     for case in cases:
         shape, args = case
-        yield(SampleInput(make_arg(shape), args=(args, )))
+        yield SampleInput(make_arg(shape), args=(args,))
 
 def sample_inputs_conversion(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -6469,8 +6484,8 @@ def sample_inputs_expand_as(op_info, device, dtype, requires_grad, **kwargs):
              )
 
     for shape, shape_other in cases:
-        yield(SampleInput(make_arg(shape, requires_grad=requires_grad),
-                          args=(make_arg(shape_other, requires_grad=False), )))
+        yield SampleInput(make_arg(shape, requires_grad=requires_grad),
+                          args=(make_arg(shape_other, requires_grad=False),))
 
 
 def sample_inputs_where(op_info, device, dtype, requires_grad, **kwargs):
@@ -6588,8 +6603,8 @@ def sample_inputs_nonzero(op_info, device, dtype, requires_grad, **kwargs):
         inputs.append(mixed)
 
     for input_t, as_tuple in product(inputs, [False, True]):
-        yield(SampleInput(input_t.clone().requires_grad_(requires_grad),
-                          kwargs=dict(as_tuple=as_tuple)))
+        yield SampleInput(input_t.clone().requires_grad_(requires_grad),
+                          kwargs=dict(as_tuple=as_tuple))
 
 def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, requires_grad=requires_grad)
@@ -6600,7 +6615,7 @@ def sample_inputs_chunk(op_info, device, dtype, requires_grad, **kwargs):
 
     for case in cases:
         shape, args = case
-        yield(SampleInput(make_arg(shape), args=args))
+        yield SampleInput(make_arg(shape), args=args)
 
 def reference_inputs_chunk(op, device, dtype, requires_grad, **kwargs):
     yield from sample_inputs_chunk(op, device, dtype, requires_grad, **kwargs)
@@ -6678,6 +6693,15 @@ def sample_inputs_dropout(op_info, device, dtype, requires_grad, *,
         yield SampleInput(make_arg(case), p=p, training=training)
     yield SampleInput(make_arg(case))
 
+def sample_inputs_dropout_backward(op_info, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+    make_mask = partial(make_tensor, device=device, dtype=torch.bool, requires_grad=False)
+
+    cases = ((S, S, S, S), (S,), ())
+    scale_vals = [0.0, 1.0, 2.0]
+
+    for case, scale in product(cases, scale_vals):
+        yield SampleInput(make_arg(case), make_mask(case), scale)
 
 def sample_inputs_embedding_bag(op_info, device, dtype, requires_grad, **kwargs):
     def make_input(shape):
@@ -8095,7 +8119,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     in_shape = input.shape
     in_rank = len(in_shape)
     for d in start_dim, end_dim:
-        if not((in_rank == 0 and d in (-1, 0)) or -in_rank <= d < in_rank):
+        if not ((in_rank == 0 and d in (-1, 0)) or -in_rank <= d < in_rank):
             raise IndexError(f"Dimension out of range (expected to be in range of [{-in_rank}, {in_rank-1}], but got {d}")
     end_dim = end_dim if end_dim >= 0 else in_rank + end_dim
     start_dim = start_dim if start_dim >= 0 else in_rank + start_dim
@@ -8424,7 +8448,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            variant_test_name='decomposed',
            dtypes=all_types_and_complex_and(torch.bfloat16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16,
-                                                       *[torch.bfloat16] if(CUDA11OrLater or TEST_WITH_ROCM) else []),
+                                                       *[torch.bfloat16] if (CUDA11OrLater or TEST_WITH_ROCM) else []),
            assert_autodiffed=True,
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
@@ -10554,6 +10578,22 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
            supports_out=True),
+    OpInfo(
+        '_softmax_backward_data',
+        op=torch.ops.aten._softmax_backward_data,
+        aten_name='_softmax_backward_data',
+        dtypes=floating_types_and(torch.bfloat16),
+        dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16),
+        sample_inputs_func=sample_inputs_softmax_backward_data,
+        assert_autodiffed=True,
+        supports_forward_ad=True,
+        supports_fwgrad_bwgrad=True,
+        supports_out=False,
+        skips=(
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples', device_type='cpu'),
+            DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)),
+        ),
+    ),
     # `softmin` supports different dtypes based on whether `dtype` argument,
     # is passed or not. Hence two OpInfo entries, one with dtype and other without.
     # https://github.com/pytorch/pytorch/issues/68752
@@ -15927,6 +15967,22 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_dropout,
         inplace_variant=lambda input, *args, **kwargs:
             wrapper_set_seed(torch.nn.functional.dropout, input, *args, **kwargs, inplace=True)),
+    OpInfo(
+        "native_dropout_backward",
+        op=torch.ops.aten.native_dropout_backward.default,
+        aten_name="native_dropout_backward",
+        dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool),
+        dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
+        supports_out=False,
+        sample_inputs_func=sample_inputs_dropout_backward,
+        skips=(
+            DecorateInfo(unittest.skip('Skipped!'), 'TestJit', 'test_variant_consistency_jit'),
+            # Lazy tensor failures
+            DecorateInfo(unittest.skip('Skipped!'), 'TestLazyOpInfo', 'test_dispatched_to_lazy'),
+            DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness'),
+            DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness_with_reusing_ir'),
+        ),
+    ),
     OpInfo(
         "nn.functional.dropout2d",
         op=lambda input, *args, **kwargs:

From 75552225298a5af743e19823a7136fb206962e6d Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Fri, 11 Nov 2022 08:55:40 -0800
Subject: [PATCH 0903/1922] [ao] quant_type.py fixing public v private (#87519)

Summary: made _get_quant_type_to_str private

Test Plan: python test/test_public_bindings.py

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D40709282](https://our.internmc.facebook.com/intern/diff/D40709282)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87519
Approved by: https://github.com/jcaip
---
 test/allowlist_for_publicAPI.json                   | 4 ++--
 test/quantization/ao_migration/test_quantization.py | 2 +-
 test/quantization/fx/test_quantize_fx.py            | 4 ++--
 torch/ao/quantization/__init__.py                   | 1 -
 torch/ao/quantization/fx/custom_config.py           | 6 +++---
 torch/ao/quantization/quant_type.py                 | 3 +--
 torch/quantization/__init__.py                      | 2 +-
 torch/quantization/quant_type.py                    | 2 +-
 8 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index ba4a2e96df219..94ff57700af67 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -795,7 +795,7 @@
     "prepare_qat",
     "propagate_qconfig_",
     "qconfig_equals",
-    "quant_type_to_str",
+    "_get_quant_type_to_str",
     "quantize",
     "quantize_dynamic",
     "quantize_dynamic_jit",
@@ -874,7 +874,7 @@
   ],
   "torch.quantization.quant_type": [
     "QuantType",
-    "quant_type_to_str"
+    "_get_quant_type_to_str"
   ],
   "torch.quantization.quantization_mappings": [
     "get_default_compare_output_module_list",
diff --git a/test/quantization/ao_migration/test_quantization.py b/test/quantization/ao_migration/test_quantization.py
index 2617e7a1187d3..9c246e1b7cd89 100644
--- a/test/quantization/ao_migration/test_quantization.py
+++ b/test/quantization/ao_migration/test_quantization.py
@@ -118,7 +118,7 @@ def test_package_import_quant_type(self):
     def test_function_import_quant_type(self):
         function_list = [
             'QuantType',
-            'quant_type_to_str',
+            '_get_quant_type_to_str',
         ]
         self._test_function_import('quant_type', function_list)
 
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 6eb9246c85a7c..6721e397180e2 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -33,8 +33,8 @@
 
 from torch.ao.quantization import (
     QuantType,
-    quant_type_to_str,
 )
+from torch.ao.quantization.quant_type import _get_quant_type_to_str
 
 from torch.ao.quantization import (
     QuantStub,
@@ -2636,7 +2636,7 @@ def forward(self, x):
         }
 
         for quant_type in [QuantType.STATIC, QuantType.DYNAMIC]:
-            key = quant_type_to_str(quant_type)
+            key = _get_quant_type_to_str(quant_type)
             qconfig, quantized_module_class, num_observers = test_configs[key]
             qconfig_dict = {"": qconfig}
             if key == "static":
diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py
index 2e8390c1acc7b..1ba2a60ed3d12 100644
--- a/torch/ao/quantization/__init__.py
+++ b/torch/ao/quantization/__init__.py
@@ -124,7 +124,6 @@
     "prepare_qat",
     "propagate_qconfig_",
     "qconfig_equals",
-    "quant_type_to_str",
     "quantize",
     "quantize_dynamic",
     "quantize_dynamic_jit",
diff --git a/torch/ao/quantization/fx/custom_config.py b/torch/ao/quantization/fx/custom_config.py
index 0f5f5bfe8d158..9d08853a41260 100644
--- a/torch/ao/quantization/fx/custom_config.py
+++ b/torch/ao/quantization/fx/custom_config.py
@@ -4,7 +4,7 @@
 
 from torch.ao.quantization import QConfigMapping
 from torch.ao.quantization.backend_config import BackendConfig
-from torch.ao.quantization.quant_type import QuantType, _quant_type_from_str, quant_type_to_str
+from torch.ao.quantization.quant_type import QuantType, _quant_type_from_str, _get_quant_type_to_str
 
 
 __all__ = [
@@ -263,7 +263,7 @@ def _make_tuple(key: Any, e: StandaloneModuleConfigEntry):
         for quant_type, float_to_observed_mapping in self.float_to_observed_mapping.items():
             if FLOAT_TO_OBSERVED_DICT_KEY not in d:
                 d[FLOAT_TO_OBSERVED_DICT_KEY] = {}
-            d[FLOAT_TO_OBSERVED_DICT_KEY][quant_type_to_str(quant_type)] = float_to_observed_mapping
+            d[FLOAT_TO_OBSERVED_DICT_KEY][_get_quant_type_to_str(quant_type)] = float_to_observed_mapping
         if len(self.non_traceable_module_names) > 0:
             d[NON_TRACEABLE_MODULE_NAME_DICT_KEY] = self.non_traceable_module_names
         if len(self.non_traceable_module_classes) > 0:
@@ -350,7 +350,7 @@ def to_dict(self) -> Dict[str, Any]:
         for quant_type, observed_to_quantized_mapping in self.observed_to_quantized_mapping.items():
             if OBSERVED_TO_QUANTIZED_DICT_KEY not in d:
                 d[OBSERVED_TO_QUANTIZED_DICT_KEY] = {}
-            d[OBSERVED_TO_QUANTIZED_DICT_KEY][quant_type_to_str(quant_type)] = observed_to_quantized_mapping
+            d[OBSERVED_TO_QUANTIZED_DICT_KEY][_get_quant_type_to_str(quant_type)] = observed_to_quantized_mapping
         if len(self.preserved_attributes) > 0:
             d[PRESERVED_ATTRIBUTES_DICT_KEY] = self.preserved_attributes
         return d
diff --git a/torch/ao/quantization/quant_type.py b/torch/ao/quantization/quant_type.py
index 9d2a3a2bdc7b2..d3b1d034a1feb 100644
--- a/torch/ao/quantization/quant_type.py
+++ b/torch/ao/quantization/quant_type.py
@@ -2,7 +2,6 @@
 
 __all__ = [
     "QuantType",
-    "quant_type_to_str",
 ]
 
 # Quantization type (dynamic quantization, static quantization).
@@ -21,7 +20,7 @@ class QuantType(enum.IntEnum):
 }
 
 # TODO: make this private
-def quant_type_to_str(quant_type: QuantType) -> str:
+def _get_quant_type_to_str(quant_type: QuantType) -> str:
     return _quant_type_to_str[quant_type]
 
 def _quant_type_from_str(name: str) -> QuantType:
diff --git a/torch/quantization/__init__.py b/torch/quantization/__init__.py
index df9a75d022643..6e4ede123eb0f 100644
--- a/torch/quantization/__init__.py
+++ b/torch/quantization/__init__.py
@@ -30,7 +30,7 @@ def default_eval_fn(model, calib_data):
     # Top level API for graph mode quantization on GraphModule(torch.fx)
     # 'fuse_fx', 'quantize_fx',  # TODO: add quantize_dynamic_fx
     # 'prepare_fx', 'prepare_dynamic_fx', 'convert_fx',
-    'QuantType', 'quant_type_to_str',  # quantization type
+    'QuantType',  # quantization type
     # custom module APIs
     'get_default_static_quant_module_mappings', 'get_static_quant_module_class',
     'get_default_dynamic_quant_module_mappings',
diff --git a/torch/quantization/quant_type.py b/torch/quantization/quant_type.py
index cd2e5e020a6a3..c7f7cc15dbdd6 100644
--- a/torch/quantization/quant_type.py
+++ b/torch/quantization/quant_type.py
@@ -8,4 +8,4 @@
 """
 
 from torch.ao.quantization.quant_type import QuantType
-from torch.ao.quantization.quant_type import quant_type_to_str
+from torch.ao.quantization.quant_type import _get_quant_type_to_str

From d96ce25c003fb5d4e5f0e8b9ce6e8c4479efe6fe Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Tue, 15 Nov 2022 13:05:30 +0000
Subject: [PATCH 0904/1922] Symintify view_as_complex and view_as_real (#89052)

Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):
* __->__ #89052
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89052
Approved by: https://github.com/ezyang
---
 aten/src/ATen/native/ComplexHelper.h | 31 ++++++++++++++--------------
 test/functorch/test_aotdispatch.py   |  4 ----
 test/test_proxy_tensor.py            |  1 -
 torch/_subclasses/fake_tensor.py     |  2 ++
 4 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h
index 8d69f6292772c..9533115a7066c 100644
--- a/aten/src/ATen/native/ComplexHelper.h
+++ b/aten/src/ATen/native/ComplexHelper.h
@@ -18,19 +18,18 @@ namespace at { namespace native {
 // View tensor with new dtype, storage offset, sizes and strides
 inline Tensor view_tensor(
     const Tensor &tensor, ScalarType dtype,
-    int64_t offset, IntArrayRef sizes, IntArrayRef strides) {
+    c10::SymInt offset, SymIntArrayRef sizes, SymIntArrayRef strides) {
   Storage storage = tensor.storage();
   auto key_set = tensor.key_set().remove(DispatchKey::Conjugate);
   auto new_tensor = detail::make_tensor<TensorImpl>(
       c10::TensorImpl::VIEW, std::move(storage), key_set, scalarTypeToTypeMeta(dtype));
   auto * impl = new_tensor.unsafeGetTensorImpl();
-  impl->set_storage_offset(offset);
-  impl->set_sizes_and_strides(sizes, strides);
+  impl->set_sizes_and_strides(sizes, strides, offset);
   return new_tensor;
 }
 
-inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) {
-  DimVector res(oldstride.size() + 1);
+inline SymDimVector computeStrideForViewAsReal(SymIntArrayRef oldstride) {
+  SymDimVector res(oldstride.size() + 1);
   for (const auto i : c10::irange(oldstride.size())) {
     res[i] = oldstride[i] * 2;
   }
@@ -40,13 +39,13 @@ inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) {
 
 Tensor _view_as_real_physical(const Tensor& self) {
   TORCH_CHECK(self.is_complex(), "view_as_real is only supported for complex tensors");
-  auto old_sizes = self.sizes();
-  DimVector new_sizes(old_sizes.size() + 1);
+  auto old_sizes = self.sym_sizes();
+  SymDimVector new_sizes(old_sizes.size() + 1);
   std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin());
   // last dimension will always have two elements containing the real and imag vals
   new_sizes.back() = 2;
-  auto new_strides = computeStrideForViewAsReal(self.strides());
-  auto new_storage_offset = 2 * self.storage_offset();
+  auto new_strides = computeStrideForViewAsReal(self.sym_strides());
+  auto new_storage_offset = self.sym_storage_offset() * 2;
   const auto float_type = c10::toRealValueType(self.scalar_type());
   auto real_tensor = view_tensor(self, float_type, new_storage_offset, new_sizes, new_strides);
   return real_tensor;
@@ -60,11 +59,11 @@ Tensor view_as_real(const Tensor& self) {
   return _view_as_real_physical(self);
 }
 
-inline DimVector computeStrideForViewAsComplex(IntArrayRef oldstride) {
+inline SymDimVector computeStrideForViewAsComplex(SymIntArrayRef oldstride) {
   const int64_t dim = oldstride.size();
   TORCH_CHECK(oldstride[dim-1] == 1, "Tensor must have a last dimension with stride 1");
 
-  DimVector res(dim - 1);
+  SymDimVector res(dim - 1);
   for (const auto i : c10::irange(res.size())) {
     TORCH_CHECK(oldstride[i] % 2 == 0, "Tensor must have a stride divisible by 2 for all but last dimension");
     res[i] = oldstride[i] / 2;
@@ -79,16 +78,16 @@ Tensor view_as_complex(const Tensor& self) {
     self.scalar_type() == kFloat || self.scalar_type() == kDouble || self.scalar_type() == kHalf,
     "view_as_complex is only supported for half, float and double tensors, but got a tensor of scalar type: ", self.scalar_type());
 
-  auto old_sizes = self.sizes();
+  auto old_sizes = self.sym_sizes();
   TORCH_CHECK(old_sizes.size() != 0, "Input tensor must have one or more dimensions");
   TORCH_CHECK(old_sizes[old_sizes.size()-1] == 2, "Tensor must have a last dimension of size 2");
-  DimVector new_sizes(old_sizes.begin(), old_sizes.end() - 1);
+  SymDimVector new_sizes(old_sizes.begin(), old_sizes.end() - 1);
 
-  const auto new_strides = computeStrideForViewAsComplex(self.strides());
+  const auto new_strides = computeStrideForViewAsComplex(self.sym_strides());
   const auto complex_type = c10::toComplexType(self.scalar_type());
 
-  TORCH_CHECK(self.storage_offset() % 2 == 0, "Tensor must have a storage_offset divisible by 2");
-  const auto new_storage_offset = self.storage_offset() / 2;
+  TORCH_CHECK(self.sym_storage_offset() % 2 == 0, "Tensor must have a storage_offset divisible by 2");
+  const auto new_storage_offset = self.sym_storage_offset() / 2;
 
   return view_tensor(self, complex_type, new_storage_offset, new_sizes, new_strides);
 }
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index eb34a3fb75828..752b03ac9984f 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1000,14 +1000,11 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('baddbmm', ''),  # aten.baddbmm.default - couldn't find symbolic meta function/decomposition
     xfail('block_diag', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('cartesian_prod', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('cdouble'),  # RuntimeError: aten.view_as_real.default - couldn't find symbolic meta function/decomposition
-    xfail('cfloat'),  # RuntimeError: aten.view_as_real.default - couldn't find symbolic meta function/decomposition
     xfail('cdist', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('cholesky_inverse', ''),  # could not find kernel
     xfail('cholesky_solve', ''),  # could not find kernel
     xfail('column_stack', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('combinations', ''),  # aten.masked_select.default
-    xfail('complex', ''),  # aten.view_as_real.default - couldn't find symbolic meta function/decomposition
     xfail('cross', ''),  # aten.linalg_cross.default - couldn't find symbolic meta function/decomposition
     xfail('cummax', ''),  # aten.cummax.default - couldn't find symbolic meta function/decomposition
     xfail('cummin', ''),  # aten.cummin.default - couldn't find symbolic meta function/decomposition
@@ -1211,7 +1208,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('unflatten', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('var', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('var_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
-    xfail('view_as_complex', ''),  # aten.view_as_complex.default - couldn't find symbolic meta function/deco...
     xfail('view_as', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('vsplit', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
 }
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 894b35693430e..24efcab9e5cb6 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1321,7 +1321,6 @@ def f(a, b, c, d, e):
     xfail('trapz', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('trapezoid', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('triangular_solve', ''),  # aten.triangular_solve.default - couldn't find symbolic meta function/decomposition
-    xfail('view_as_complex', ''),  # aten.view_as_complex.default - couldn't find symbolic meta function/decomposition
     xfail('view_as', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('vsplit', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('unique_consecutive', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 65f571f93ec09..8dec2475df15f 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -971,6 +971,8 @@ def cpp_meta_supports_symint(self, func):
             aten.as_strided_.default,
             aten.zeros.default,
             aten.detach.default,
+            aten.view_as_real.default,
+            aten.view_as_complex.default,
             aten.set_.source_Storage_storage_offset,
             aten._sparse_coo_tensor_with_dims_and_tensors.default,
         ]

From e957ba645bb0d0d19b61960ddd7e62da9c91ad39 Mon Sep 17 00:00:00 2001
From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" <tmanlaibaatar@fb.com>
Date: Mon, 14 Nov 2022 23:26:15 -0800
Subject: [PATCH 0905/1922] Rewrite assert statement with torch._assert under
 config (#88246)

This diff rewrites assert statement in python with torch._assert under config. The resulting graph looks something like:
```
SOURCE CODE:
def f(x):
      assert x[0] == 3
      return x.cos()

CAPTURED GRAPH:
graph():
    %arg0 : [#users=2] = placeholder[target=arg0]
    %getitem : [#users=1] = call_function[target=operator.getitem](args = (%arg0, 0), kwargs = {})
    %eq : [#users=1] = call_function[target=operator.eq](args = (%getitem, 3), kwargs = {})
    %_assert : [#users=0] = call_function[target=torch._assert](args = (%eq, "assertion_error"), kwargs = {})
    %cos : [#users=1] = call_method[target=cos](args = (%arg0,), kwargs = {})
    return cos
 ```
Note that this introduces side-effect as it could error out while executing graph, but the assertion can eliminated via DCE if we choose to ignore it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88246
Approved by: https://github.com/jansel
---
 test/dynamo/test_repros.py        | 92 ++++++++++++++++++++++++++++++
 torch/_dynamo/config.py           |  3 +
 torch/_dynamo/symbolic_convert.py | 94 +++++++++++++++++++++++++++++++
 3 files changed, 189 insertions(+)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 503231b4cb120..e30a1275ed135 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1938,6 +1938,98 @@ def fn(x):
         self.assertEqual(cnt.frame_count, 1)
         self.assertEqual(cnt.op_count, 1)
 
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_rewrite_assert_with_msg(self):
+        def f(x):
+            b = x.sin()
+            assert x[0] == 3, "First dim need to be 3"
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        cnt = torch._dynamo.testing.CompileCounter()
+
+        opt_f = torch._dynamo.optimize(cnt, nopython=True)(f)
+        self.assertTrue(same(f(*args), opt_f(*args)))
+        self.assertEqual(cnt.op_count, 6)
+        self.assertEqual(cnt.frame_count, 1)
+
+        exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+        self.assertTrue(same(exported(*args), f(*args)))
+
+        with self.assertRaisesRegex(AssertionError, ""):
+            exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
+
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_not_rewrite_assert_for_other_errors(self):
+        def f(x):
+            b = x.sin()
+            if not x.sum() <= 3:
+                raise ValueError("input sum needs to be 3")
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        opt_fn = torch._dynamo.optimize("eager")(f)
+        with self.assertRaisesRegex(ValueError, "input sum needs to be 3"):
+            opt_fn(*args)
+
+    # TODO (tmanlaibaatar) handle data-dependent fstring in assert statement.
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_rewrite_assert_with_fstring_msg(self):
+        def f(x):
+            b = x.sin()
+            assert x[0] == 3, f"First dim need to be {x[0]}"
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
+            exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_rewrite_assert_without_msg(self):
+        def f(x):
+            b = x.sin()
+            assert x[0] == 3
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+        self.assertTrue(same(exported(*args), f(*args)))
+
+        with self.assertRaisesRegex(AssertionError, ""):
+            exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
+
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_rewrite_assert_noop(self):
+        def f(x):
+            b = x.sin()
+            assert True
+            assert x.dtype == torch.float32
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+        self.assertTrue(same(exported(*args), f(*args)))
+
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_f = torch._dynamo.optimize(cnt, nopython=True)(f)
+        self.assertTrue(same(f(*args), opt_f(*args)))
+        # torch._assert shouldn't be in the graph
+        self.assertEqual(cnt.op_count, 3)
+        self.assertEqual(cnt.frame_count, 1)
+
+        exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
+        self.assertTrue(same(exported(*args), f(*args)))
+
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", False)
+    def test_not_rewrite_assert(self):
+        def f(x):
+            b = x.sin()
+            assert x[0] == 3
+            return x.cos() + b
+
+        with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
+            torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 12088383e741c..39a1a6433419f 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -87,6 +87,9 @@
 # if an exception is encountered
 replay_record_enabled = False
 
+# Rewrite assert statement in python with torch._assert
+rewrite_assert_with_torch_assert = True
+
 # Show a warning on every graph break
 print_graph_breaks = False
 
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index d707bee930ee8..d5c05f76efb0a 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -53,6 +53,7 @@
     fake_tensors_available,
     graph_break_dup_warning_checker,
     istype,
+    proxy_args_kwargs,
 )
 from .variables.base import MutableLocal, typestr, VariableTracker
 from .variables.builder import VariableBuilder, wrap_fx_proxy
@@ -121,10 +122,103 @@ def impl(self: "InstructionTranslatorBase", inst: Instruction):
     return impl
 
 
+def _detect_and_normalize_assert_statement(
+    self: "InstructionTranslatorBase", truth_fn: typing.Callable, push: bool
+):
+    # Detect if this jump instruction is assert and normalize the assert
+    # by pushing dummy error message when nothing is given.
+    #
+    # Python 3.9 assertion is in following format:
+    # 18 POP_JUMP_IF_TRUE       28
+    # 20 LOAD_ASSERTION_ERROR
+    # 22 LOAD_CONST               3 ('Assert message') -> optional instruction
+    # 24 CALL_FUNCTION            1                    -> optional instruction
+    # 26 RAISE_VARARGS
+    #
+    # Python 3.8 assertion is in following format:
+    # 18 POP_JUMP_IF_TRUE       28
+    # 20 LOAD_GLOBAL              0 (Assertion type)
+    # 22 LOAD_CONST               3 ('Assert message') -> optional instruction
+    # 24 CALL_FUNCTION            1                    -> optional instruction
+    # 26 RAISE_VARARGS            1
+
+    if (truth_fn is not operator.truth) or push:
+        return False
+
+    current_instruction_pointer = self.instruction_pointer
+    inst = self.instructions[current_instruction_pointer]
+    # Detect LOAD_ASSERTION_ERROR or LOAD_GLOBAL 0
+    if sys.version_info < (3, 9):
+        if inst.opname != "LOAD_GLOBAL" or inst.argval != "AssertionError":
+            return False
+    else:
+        if inst.opname != "LOAD_ASSERTION_ERROR":
+            return False
+
+    current_instruction_pointer += 1
+
+    if current_instruction_pointer >= len(self.instructions):
+        return False
+
+    inst = self.instructions[current_instruction_pointer]
+    has_error_msg = False
+    # DETECT RAISE_VARARGS or LOAD CONST
+    if inst.opname == "LOAD_CONST":
+        if not isinstance(inst.argval, str):
+            return False
+        self.LOAD_CONST(inst)
+        has_error_msg = True
+
+        # if it is LOAD_CONSTANT, it must be followed by CALL_FUNCTION
+        current_instruction_pointer += 1
+        if current_instruction_pointer >= len(self.instructions):
+            return False
+        inst = self.instructions[current_instruction_pointer]
+        if inst.opname != "CALL_FUNCTION":
+            return False
+
+        # CALL_FUNCTION should be followed by RAISE_VARARGS
+        current_instruction_pointer += 1
+        if current_instruction_pointer >= len(self.instructions):
+            return False
+        inst = self.instructions[current_instruction_pointer]
+
+    if inst.opname != "RAISE_VARARGS":
+        return False
+
+    if not has_error_msg:
+        # Push dummy value instead of error message
+        self.push(ConstantVariable("assertion error"))
+
+    return True
+
+
 def generic_jump(truth_fn: typing.Callable, push: bool):
     def inner(self: "InstructionTranslatorBase", inst: Instruction):
         value: VariableTracker = self.pop()
         self.output.guards.update(value.guards)
+        if (
+            config.rewrite_assert_with_torch_assert
+            and _detect_and_normalize_assert_statement(self, truth_fn, push)
+        ):
+            error_msg: VariableTracker = self.pop()
+            self.output.guards.update(error_msg.guards)
+            # Skip over things like `assert True`
+            if value.is_python_constant() and bool(value.as_python_constant()):
+                self.jump(inst)
+                return
+
+            # Manually insert torch._assert instead of python assert and jump over
+            # assert related instructions as we don't need them anymore.
+            self.output.create_proxy(
+                "call_function",
+                torch._assert,
+                *proxy_args_kwargs((value, error_msg), {}),
+                current_tx=self,
+            )
+            self.jump(inst)
+            return
+
         if value.is_python_constant():
             if truth_fn(value.as_python_constant()):
                 push and self.push(value)

From 8f66b3688d7dd278111ab3cd9763a29b6bc01134 Mon Sep 17 00:00:00 2001
From: Pruthvi Madugundu <pmagundu@amd.com>
Date: Tue, 15 Nov 2022 17:49:00 +0000
Subject: [PATCH 0906/1922] [ROCm] Enable python ref executor UTs for ROCm
 (#88981)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88981
Approved by: https://github.com/mruberry
---
 test/test_ops.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index c688f6521af14..0ef2e4ee6d60e 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -57,7 +57,6 @@
     onlyCPU,
     onlyNativeDeviceTypes,
     OpDTypes,
-    skipCUDAIfRocm,
     skipMeta,
 )
 from torch._subclasses.fake_tensor import (
@@ -393,7 +392,6 @@ def test_python_ref_torch_fallback(self, device, dtype, op):
 
     @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
     @onlyCUDA
-    @skipCUDAIfRocm
     @ops(python_ref_db)
     @parametrize('executor', ['aten', 'nvfuser'])
     @skipIfTorchInductor("Takes too long for inductor")

From 9cfcbca742ced256329d8d827020d6af7ea16081 Mon Sep 17 00:00:00 2001
From: Zain Rizvi <zainr@fb.com>
Date: Tue, 15 Nov 2022 17:55:29 +0000
Subject: [PATCH 0907/1922] Allow ROCm runners to have 2 or more gpus (#89011)

[This run](https://github.com/pytorch/pytorch/actions/runs/3432340660/jobs/5721731207) failed claiming that it couldn't detect GPUs on the runner. Inspecting the rocminfo output (higher up in logs) show that it in fact had three GPUs, but the workflow is currently setup to expect either 2 or 4 gpus.

The workflow files currently have no way of specifying wither it'll get a 2 gpu or a 4 gpu machine, so really 2 is all any test can expect to get. [This old PR](https://github.com/pytorch/pytorch/pull/72142/files) shows that historically ROCm runners only had 4 gpus, then later the logic was extended to expect 2 GPU runners as well.

It's not clear how the ROCm runner ended up with 3 gpus instead of 2 or 4 (something for ROCm folks to look into) but there doesn't seem to be a good reason for ROCm workflows to fail if 3 (or 5) gpus ever show up on a machine. This PR makes the workflows resilient to ROCm having these alternate GPU counts

Also filed https://github.com/pytorch/pytorch/issues/89012 against the ROCm team to explore why the runner only had 3 gpus

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89011
Approved by: https://github.com/huydhn
---
 .github/actions/setup-rocm/action.yml         |  7 ++-
 .github/templates/common.yml.j2               |  7 ++-
 ...inux-binary-libtorch-cxx11-abi-nightly.yml | 28 ++++++++--
 ...inux-binary-libtorch-pre-cxx11-nightly.yml | 28 ++++++++--
 ...nerated-linux-binary-manywheel-nightly.yml | 56 ++++++++++++++++---
 5 files changed, 108 insertions(+), 18 deletions(-)

diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml
index 97dfd22c76ac0..d91762eb9a861 100644
--- a/.github/actions/setup-rocm/action.yml
+++ b/.github/actions/setup-rocm/action.yml
@@ -36,7 +36,12 @@ runs:
       run: |
         ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
         if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-            echo "Failed to detect GPUs on the runner"
+            if [[ $ngpu -eq 0 ]]; then
+              echo "Error: Failed to detect any GPUs on the runner"
+            else
+              echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+            fi
+            echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
             exit 1
         fi
 
diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2
index a2941546abe1c..edb652ff16ce5 100644
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@@ -78,7 +78,12 @@ concurrency:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
index 6a23b85f433a0..f9ab6798787fb 100644
--- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
@@ -845,7 +845,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -988,7 +993,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -1131,7 +1141,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -1274,7 +1289,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
index 27358089ba2df..55e4a19b8e8ab 100644
--- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
@@ -845,7 +845,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -988,7 +993,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -1131,7 +1141,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -1274,7 +1289,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index ac9edc252c28e..efe3e2c0d17c9 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -337,7 +337,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -477,7 +482,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -855,7 +865,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -995,7 +1010,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -1373,7 +1393,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -1513,7 +1538,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -1891,7 +1921,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure
@@ -2031,7 +2066,12 @@ jobs:
         run: |
           ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
           if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then
-              echo "Failed to detect GPUs on the runner"
+              if [[ $ngpu -eq 0 ]]; then
+                echo "Error: Failed to detect any GPUs on the runner"
+              else
+                echo "Error: Detected $ngpu GPUs on the runner, when only 2 or 4 were expected"
+              fi
+              echo "Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
               exit 1
           fi
       - name: Runner health check disconnect on failure

From 1dd4a907f8ff9f3adc13a23e0d263dfbc0d9ecd5 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Fri, 11 Nov 2022 16:30:01 -0800
Subject: [PATCH 0908/1922] [Profiler] Account for caching when assigning IDs
 (#88917)

The python tracer caches information about module and optimizer state. That means that for subsequent calls, the presence of a Tensor in these fields does not imply that the Tensor is still live; just that it was live during the first call. (I should perhaps rename the fields to something like `stale_parameters` to convey this.) Unless we discard subsequent calls ID assignment get tripped up when it see's a Tensor that was already released.

Differential Revision: [D41226827](https://our.internmc.facebook.com/intern/diff/D41226827/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88917
Approved by: https://github.com/chaekit
---
 torch/csrc/profiler/data_flow.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/profiler/data_flow.cpp b/torch/csrc/profiler/data_flow.cpp
index 5432027867887..dcb3eaffd4392 100644
--- a/torch/csrc/profiler/data_flow.cpp
+++ b/torch/csrc/profiler/data_flow.cpp
@@ -69,6 +69,10 @@ void calculateUniqueTensorIDs(
   // --------------------------------------------------------------------------
   {
     RawTensors raw_tensors;
+
+    // The python tracer caches values, so it's only safe to use the first case.
+    ska::flat_hash_set<PyModuleSelf> seen_modules;
+    ska::flat_hash_set<PyOptimizerSelf> seen_optimizers;
     for (auto& result : sorted_results) {
       result->visit(c10::overloaded(
           [&](ExtraFields<EventType::TorchOp>& torch_op) {
@@ -78,7 +82,8 @@ void calculateUniqueTensorIDs(
           },
           [&](ExtraFields<EventType::PyCall>& py_call) {
             // torch.nn.Module
-            if (py_call.module_.has_value()) {
+            if (py_call.module_.has_value() &&
+                seen_modules.insert(py_call.module_->self_).second) {
               for (auto& p : py_call.module_->parameters_) {
                 raw_tensors(p.metadata_);
                 raw_tensors(p.grad_metadata_);
@@ -86,7 +91,8 @@ void calculateUniqueTensorIDs(
             }
 
             // torch.optim.Optimizer
-            if (py_call.optimizer_.has_value()) {
+            if (py_call.optimizer_.has_value() &&
+                seen_optimizers.insert(py_call.optimizer_->self_).second) {
               for (auto& p : py_call.optimizer_->parameters_) {
                 raw_tensors(p.metadata_);
                 raw_tensors(p.grad_metadata_);

From 8d6d31e36c7809cea645b7a300febe00b34cdd92 Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Tue, 15 Nov 2022 19:08:31 +0000
Subject: [PATCH 0909/1922] disable test that fails in fbcode (#88786)

Summary:
caffe2/test:torch_cuda - test_advanced_indexing_assignment_lazy (test_view_ops.TestViewOpsLAZY)
RuntimeError: TorchScript backend not yet supported in FBCODE/OVRSOURCE builds
  File "/usr/local/fbcode/platform010/lib/python3.8/unittest/suite.py", line 163, in _handleClassSetUp
    setUpClass()
  File "/re_cwd/fbcode/buck-out/opt/gen/caffe2/test/torch_cuda#binary,link-tree/torch/testing/_internal/common_device_type.py", line 506, in setUpClass
    torch._lazy.ts_backend.init()
  File "/re_cwd/fbcode/buck-out/opt/gen/caffe2/test/torch_cuda#binary,link-tree/torch/_lazy/ts_backend.py", line 6, in init
    torch._C._lazy_ts_backend._init()

Test Plan: Rely on CI.

Differential Revision: D41170545

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88786
Approved by: https://github.com/zou3519
---
 test/test_view_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_view_ops.py b/test/test_view_ops.py
index c4729557c416b..3c4376b501f91 100644
--- a/test/test_view_ops.py
+++ b/test/test_view_ops.py
@@ -9,7 +9,7 @@
 
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
-    TestCase, run_tests, suppress_warnings, gradcheck, gradgradcheck,
+    IS_FBCODE, TestCase, run_tests, suppress_warnings, gradcheck, gradgradcheck,
     numpy_to_torch_dtype_dict, skipIfTorchDynamo
 )
 from torch.testing._internal.common_device_type import \
@@ -857,6 +857,7 @@ def test_advanced_indexing_nonview(self, device):
         nv[1, 1] = 0
         self.assertNotEqual(t[2, 2], nv[1, 1])
 
+    @unittest.skipIf(IS_FBCODE, "TorchScript backend not yet supported in FBCODE/OVRSOURCE builds")
     def test_advanced_indexing_assignment(self, device):
         t = torch.ones(3, 3, device=device)
         rows = torch.tensor([[0, 0], [2, 2]], device=device)

From edc09d12998d18fd8766d8e4f201594f66de99f1 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Fri, 11 Nov 2022 16:30:03 -0800
Subject: [PATCH 0910/1922] [Profiler] Memory profiler part 2: Config
 validation (#86853)

Memory profiling requires `record_shapes`, `profile_memory`, and `with_stack`. This PR just adds a skeleton endpoint with a good error message if certain flags are missing.

Differential Revision: [D39920801](https://our.internmc.facebook.com/intern/diff/D39920801/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86853
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 22 ++++++++++++++++++++++
 torch/profiler/_memory_profiler.py    | 13 ++++++++++++-
 torch/profiler/profiler.py            | 24 +++++++++++++++++++++---
 3 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index c725f8bec51a4..3fd6b04b8a76c 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -12,6 +12,28 @@
     torch.profiler.profile, record_shapes=True, profile_memory=True, with_stack=True
 )
 
+@skipIfTorchDynamo("TorchDynamo removes profiler altogether.")
+class TestMemoryProfiler(TestCase):
+    def test_config_check(self) -> None:
+        with torch.profiler.profile() as prof:
+            pass
+
+        pattern = r"record_shapes=True, profile_memory=True, with_stack=True"
+        with self.assertRaisesRegex(ValueError, pattern):
+            prof._memory_profile()
+
+        with torch.profiler.profile(record_shapes=True, with_stack=True) as prof:
+            pass
+
+        pattern = r"^profile_memory=True required for memory profiling\.$"
+        with self.assertRaisesRegex(ValueError, pattern):
+            prof._memory_profile()
+
+        with profile() as prof:
+            pass
+
+        self.assertIsInstance(prof._memory_profile(), _memory_profiler.MemoryProfile)
+
 
 class ScaleLayer(torch.nn.Module):
     def __init__(self) -> None:
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index cab771931489c..355d3322a4e00 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -2,7 +2,13 @@
 from typing import Any, Iterator, Optional, Tuple
 
 import torch
-from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata, RecordScope
+from torch._C._autograd import _ProfilerResult
+from torch._C._profiler import (
+    _EventType,
+    _ProfilerEvent,
+    _TensorMetadata,
+    RecordScope,
+)
 
 
 @dataclasses.dataclass
@@ -112,3 +118,8 @@ def extract_gradients(
                 p_grad_key = TensorKey.from_tensor(p_grad)
                 if p_grad_key is not None:
                     yield TensorKey.from_tensor(p), p_grad_key
+
+
+class MemoryProfile:
+    def __init__(self, result: _ProfilerResult) -> None:
+        pass
diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py
index ceca36126dcd1..31b85eb26f0fe 100644
--- a/torch/profiler/profiler.py
+++ b/torch/profiler/profiler.py
@@ -16,10 +16,19 @@
     _ExperimentalConfig,
     _remove_execution_graph_observer,
 )
-from torch.autograd import ProfilerActivity, kineto_available
+from torch.autograd import kineto_available, ProfilerActivity
+from torch.profiler import _memory_profiler
+
+
+__all__ = [
+    "supported_activities",
+    "ProfilerAction",
+    "schedule",
+    "tensorboard_trace_handler",
+    "profile",
+    "ExecutionGraphObserver",
+]
 
-__all__ = ['supported_activities', 'ProfilerAction', 'schedule', 'tensorboard_trace_handler', 'profile',
-           'ExecutionGraphObserver']
 
 def supported_activities():
     """
@@ -208,6 +217,15 @@ def _get_distributed_info(self):
             "world_size": dist.get_world_size()
         }
 
+    def _memory_profile(self) -> _memory_profiler.MemoryProfile:
+        required = ("record_shapes", "profile_memory", "with_stack")
+        missing = [f"{i}=True" for i in required if not getattr(self, i)]
+        if missing:
+            raise ValueError(f"{', '.join(missing)} required for memory profiling.")
+
+        assert self.profiler is not None and self.profiler.kineto_results is not None
+        return _memory_profiler.MemoryProfile(self.profiler.kineto_results)
+
 
 class ProfilerAction(Enum):
     """

From 3127f62269b966376b3813d507fd03e6c26393e6 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Fri, 11 Nov 2022 16:30:05 -0800
Subject: [PATCH 0911/1922] [Profiler] Memory profiler part 3: Schema parsing
 and mutable arguments (#86854)

The appropriate annotation for a block of memory is a function of time: an input can be mutated in-place to become an activation, a clever kernel might steal the memory of a detached input (such as a mask) to use as output memory, etc.

We could pessimistically assume that all ops mutate all of their inputs, however inspection of schema allows us to significantly narrow that assumption with minimal effort. Checking schemas also allows us to distinguish between dispatcher ops (which have load bearing semantics) and user annotations with reasonably high precision.

Differential Revision: [D40220390](https://our.internmc.facebook.com/intern/diff/D40220390/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86854
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 105 +++++++++++++++++++++++-
 torch/_C/_profiler.pyi                |   1 +
 torch/csrc/profiler/python/init.cpp   |   1 +
 torch/profiler/_memory_profiler.py    | 111 +++++++++++++++++++++++++-
 4 files changed, 216 insertions(+), 2 deletions(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 3fd6b04b8a76c..6924cb3556592 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -1,6 +1,6 @@
 # Owner(s): ["oncall: profiler"]
 import functools
-from typing import Iterator, Optional
+from typing import Iterator, List, Optional, Tuple
 
 import torch
 from torch._C._profiler import _EventType
@@ -12,6 +12,7 @@
     torch.profiler.profile, record_shapes=True, profile_memory=True, with_stack=True
 )
 
+
 @skipIfTorchDynamo("TorchDynamo removes profiler altogether.")
 class TestMemoryProfiler(TestCase):
     def test_config_check(self) -> None:
@@ -242,5 +243,107 @@ def test_extract_gradients_from_module_and_optimizer(self) -> None:
         )
 
 
+class TestDataFlow(TestCase):
+    @staticmethod
+    def formatSchemas(
+        prof: torch.profiler.profile, indent: int = 12
+    ) -> Tuple[Tuple[str, Tuple[bool, ...]], ...]:
+        tree = prof.profiler.kineto_results.experimental_event_tree()
+        out: List[Tuple[str, Tuple[bool, ...]]] = []
+        for node in _utils.traverse_dfs(tree):
+            if node.tag == _EventType.TorchOp:
+                e = node.extra_fields
+                schemas = _memory_profiler.SchemaMatcher.match_schemas(e)
+                name = node.name
+                if len(schemas) == 1:
+                    name = f"{name}.{schemas[0].overload_name}"
+                elif len(schemas) > 1:
+                    name = f"{name}.{{{', '.join(s.overload_name for s in schemas)}}}"
+
+                out.append((name, _memory_profiler.SchemaMatcher.inputs_are_mutable(e)))
+        return tuple(out)
+
+    def test_match_schemas(self) -> None:
+        with profile() as prof:
+            x = torch.ones((1,)).mul(2).add_(2)
+            _ = torch.sin(x, out=torch.empty_like(x))
+
+        self.assertEqual(
+            self.formatSchemas(prof),
+            (
+                ("aten::ones.", (False,) * 5),
+                ("aten::empty.memory_format", (False,) * 6),
+                #
+                # fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
+                ("aten::fill_.Scalar", (True, False)),
+                ("aten::mul.Tensor", (False, False)),
+                ("aten::to.dtype", (False,) * 5),
+                ("aten::_to_copy.", (False,) * 7),
+                ("aten::empty_strided.", (False,) * 6),
+                #
+                # copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
+                ("aten::copy_.", (True, False, False)),
+                #
+                # add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+                ("aten::add_.Tensor", (True, False, False)),
+                ("aten::to.dtype", (False,) * 5),
+                ("aten::_to_copy.", (False,) * 7),
+                ("aten::empty_strided.", (False,) * 6),
+                #
+                # copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
+                ("aten::copy_.", (True, False, False)),
+                ("aten::empty_like.", (False,) * 6),
+                ("aten::empty_strided.", (False,) * 6),
+                #
+                # sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+                ("aten::sin.out", (False, True)),
+            ),
+        )
+
+    def test_match_schemas_backward(self) -> None:
+        x = torch.ones((1,))
+        w = torch.ones((1,), requires_grad=True)
+        with profile() as prof:
+            torch.mul(x, w).backward()
+
+        self.assertEqual(
+            self.formatSchemas(prof),
+            (
+                ("aten::mul.Tensor", (False, False)),
+                ("aten::ones_like.", (False,) * 6),
+                ("aten::empty_like.", (False,) * 6),
+                ("aten::empty_strided.", (False,) * 6),
+                #
+                # fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
+                ("aten::fill_.Scalar", (True, False)),
+                ("autograd::engine::evaluate_function: MulBackward0", ()),
+                #
+                # Cannot find schema, all inputs presumed mutable
+                ("MulBackward0", (True,)),
+                ("aten::mul.Tensor", (False, False)),
+                (
+                    "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad",
+                    (),
+                ),
+                #
+                # Cannot find schema, all inputs presumed mutable
+                ("torch::autograd::AccumulateGrad", (True,)),
+                ("aten::detach.", (False,)),
+                ("detach", (True,)),
+            ),
+        )
+
+    def test_match_schemas_tensorlist(self) -> None:
+        x = torch.ones((1,))
+        y = torch.ones((1,))
+        with profile() as prof:
+            torch.cat([x, y], axis=0)
+
+        self.assertEqual(
+            self.formatSchemas(prof),
+            (("aten::cat.", (False, False)),),
+        )
+
+
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/_C/_profiler.pyi b/torch/_C/_profiler.pyi
index da0f191e26b53..4a1fe23cec614 100644
--- a/torch/_C/_profiler.pyi
+++ b/torch/_C/_profiler.pyi
@@ -135,6 +135,7 @@ Scalar = Union[int, float, bool, complex]
 Input = Optional[Union[_TensorMetadata, List[_TensorMetadata], Scalar]]
 
 class _ExtraFields_TorchOp:
+    name: str
     sequence_number: int
     allow_tf32_cublas: bool
 
diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index 2a5839fc6a229..d910afe4234a8 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -163,6 +163,7 @@ void initPythonBindings(PyObject* module) {
 
   using torch_op_t = ExtraFields<EventType::TorchOp>;
   py::class_<torch_op_t>(m, "_ExtraFields_TorchOp")
+      .def_readonly("name", &torch_op_t::name_)
       .def_property_readonly(
           "inputs",
           [](const torch_op_t& op) {
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index 355d3322a4e00..cd652a6a000f9 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -1,10 +1,12 @@
 import dataclasses
-from typing import Any, Iterator, Optional, Tuple
+from typing import Any, Iterator, List, Optional, Tuple, Union
 
 import torch
+from torch._C import FunctionSchema
 from torch._C._autograd import _ProfilerResult
 from torch._C._profiler import (
     _EventType,
+    _ExtraFields_TorchOp,
     _ProfilerEvent,
     _TensorMetadata,
     RecordScope,
@@ -120,6 +122,113 @@ def extract_gradients(
                     yield TensorKey.from_tensor(p), p_grad_key
 
 
+class SchemaMatcher:
+    """Lookup operator schema based on profiled name.
+
+    When profiling we record the operator's name but not the schema. However
+    some analysis requires that information. Fortunately we can look up
+    registered schema from the recorded name. We do not, however, record the
+    overload and so we must compare the profiled arguments with all overloads
+    to determine viable matches.
+
+    Note: Once https://github.com/pytorch/pytorch/issues/78871 is completed
+    this code will be obsolete.
+    """
+
+    @classmethod
+    def inputs_are_mutable(cls, t: _ExtraFields_TorchOp) -> Tuple[bool, ...]:
+        """Determine which inputs may have mutated based on function schema.
+
+        Note that we don't need to resolve down to a single schema to perform
+        this analysis. An input is mutable if it is mutable in any overload. In
+        practice, however, it is overwhelmingly common to match a single
+        overload. If we cannot find any valid schema then we must be
+        conservative and assume all inputs are mutable.
+        """
+        mutable: Optional[List[bool]] = None
+        for schema in cls.match_schemas(t):
+            mutable = mutable or [False for _ in schema.arguments]
+            for i, arg in enumerate(schema.arguments):
+                mutable[i] |= getattr(arg.alias_info, "is_write", False)
+
+        return tuple(mutable or (True for _ in t.inputs))
+
+    @classmethod
+    def match_schemas(cls, t: _ExtraFields_TorchOp) -> Tuple[FunctionSchema, ...]:
+        signature = tuple(
+            # Tensor
+            TensorKey.from_tensor(i) if isinstance(i, _TensorMetadata)
+            #
+            # TensorList
+            else [TensorKey.from_tensor(j) for j in i] if isinstance(i, list)
+            #
+            # Scalar and uncaptured inputs.
+            else i
+            for i in t.inputs
+        )
+
+        def matches(schema) -> bool:
+            return len(schema.arguments) == len(signature) and all(
+                cls._types_match(observed, schema_arg.type)
+                for observed, schema_arg in zip(signature, schema.arguments)
+            )
+
+        return tuple(s for s in cls.lookup_schemas(t.name) or () if matches(s))
+
+    @classmethod
+    def _types_match(cls, observed, schema_type) -> bool:
+        if isinstance(schema_type, torch._C.OptionalType):
+            schema_type = schema_type.getElementType()
+            return observed is None or cls._types_match(observed, schema_type)
+
+        if isinstance(schema_type, torch._C.AnyType):
+            return True
+
+        if schema_type.isSubtypeOf(torch._C.ListType.ofTensors()):
+            return isinstance(observed, list) and all(
+                isinstance(i, TensorKey) for i in observed
+            )
+
+        type_map: Tuple[Tuple[Any, Union[type, Tuple[type, ...]]], ...] = (
+            (torch._C.TensorType, TensorKey),
+            (torch._C.NoneType, type(None)),
+            (torch._C.BoolType, bool),
+            (torch._C.IntType, int),
+            (torch._C.FloatType, float),
+            (torch._C.ComplexType, complex),
+            (torch._C.NumberType, (bool, int, float, complex)),
+        )
+
+        for jit_type, py_types in type_map:
+            if isinstance(schema_type, jit_type):
+                return isinstance(observed, py_types)
+
+        # Profiler only records a subset of possible argument types. If we
+        # reach this point then the schema must call for a type that profiler
+        # does not record. Thus, the schema can only be a match if `observed`
+        # is also None.
+        return observed is None
+
+    @staticmethod
+    def lookup_schemas(name: str) -> Optional[Tuple[FunctionSchema, ...]]:
+        # TODO(robieta):
+        #   _jit_get_schemas_for_operator is quite expensive. (~100us / call)
+        #   Consider adding `functools.lru_cache` if that becomes an issue.
+
+        try:
+            # Schema lookup will throw if `name` is malformed. (For example,
+            # schemas must be namespaced and schema lookup will fail if name
+            # does not include "::".) We simply catch the exception and return
+            # `None` to denote that `name` cannot be an operator name.
+            #
+            # Note that record_function annotations also go through this path,
+            # so it is expected that some names will not correspond to PyTorch
+            # operators.
+            return tuple(torch._C._jit_get_schemas_for_operator(name))
+        except RuntimeError:
+            return None
+
+
 class MemoryProfile:
     def __init__(self, result: _ProfilerResult) -> None:
         pass

From b2b14e7cbc94c886ebe1cf843d48082fa31ea6d9 Mon Sep 17 00:00:00 2001
From: Xiao Wang <24860335+xwang233@users.noreply.github.com>
Date: Tue, 15 Nov 2022 19:25:53 +0000
Subject: [PATCH 0912/1922] Enable channels_last_3d on SyncBatchNorm (#88401)

This PR enabled the use of fast channels_last kernels on SyncBatchNorm with channels_last_3d memory format.

With a small benchmark script here https://github.com/pytorch/pytorch/issues/88021#issuecomment-1299059859, on V100, I got

master:
```
DDP channels_last=False, run_forward_backward, time: 0.8945400714874268 sec
DDP channels_last=True, run_forward_backward, time: 1.4736433029174805 sec
```

This PR:
```
DDP channels_last=False, run_forward_backward, time: 0.8927242755889893 sec
DDP channels_last=True, run_forward_backward, time: 0.48697471618652344 sec
```

This PR is a follow-up of https://github.com/pytorch/pytorch/pull/46906

Close https://github.com/pytorch/pytorch/issues/88021
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88401
Approved by: https://github.com/ngimel
---
 aten/src/ATen/native/cuda/Normalization.cu            |  7 +++++--
 test/test_nn.py                                       | 10 +++++-----
 torch/nn/modules/_functions.py                        | 10 ++++++++--
 .../testing/_internal/distributed/distributed_test.py | 11 ++++++++---
 4 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu
index 3b27ebfc7d922..df460447464b2 100644
--- a/aten/src/ATen/native/cuda/Normalization.cu
+++ b/aten/src/ATen/native/cuda/Normalization.cu
@@ -48,8 +48,11 @@ bool is_mixed_type(const Tensor& input, const Args&... parameters) {
 }
 
 inline bool batch_norm_use_channels_last_kernels(const at::Tensor& self) {
-  return (self.is_contiguous(at::MemoryFormat::ChannelsLast) ||
-          (self.is_contiguous() && self.strides()[1] == 1));
+  return (
+    self.is_contiguous(at::MemoryFormat::ChannelsLast) ||
+    self.is_contiguous(at::MemoryFormat::ChannelsLast3d) ||
+    (self.is_contiguous() && self.strides()[1] == 1)
+  );
 }
 
 enum class Impl {
diff --git a/test/test_nn.py b/test/test_nn.py
index b07793e79f48f..2b96838e36014 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -10283,16 +10283,16 @@ def test_sync_batchnorm_accuracy_cuda(self):
         #   fwd: torch.batch_norm_stats, torch.batch_norm_gather_stats_with_counts, torch.batch_norm_elemt
         #   bwd: torch.batch_norm_backward_reduce, torch.batch_norm_backward_elemt
 
-        def _batch_norm_stats(data):
+        def _batch_norm_stats(data, memory_format, mean_axes):
             mean1, _ = torch.batch_norm_stats(data, 1e-5)
-            mean2, _ = torch.batch_norm_stats(data.to(memory_format=torch.channels_last), 1e-5)
-            mean_ref = torch.mean(data, (0, 2, 3), keepdim=False)
+            mean2, _ = torch.batch_norm_stats(data.to(memory_format=memory_format), 1e-5)
+            mean_ref = torch.mean(data, mean_axes, keepdim=False)
 
             self.assertEqual(mean_ref, mean1)
             self.assertEqual(mean_ref, mean2)
 
-        data = torch.randn(1, 96, 112, 112, dtype=torch.float, device='cuda')
-        _batch_norm_stats(data)
+        _batch_norm_stats(torch.randn(1, 96, 112, 112, dtype=torch.float, device='cuda'), torch.channels_last, (0, 2, 3))
+        _batch_norm_stats(torch.randn(1, 96, 112, 112, 112, dtype=torch.float, device='cuda'), torch.channels_last_3d, (0, 2, 3, 4))
 
     def test_flatten(self):
         tensor_input = torch.randn(2, 1, 2, 3)
diff --git a/torch/nn/modules/_functions.py b/torch/nn/modules/_functions.py
index 66200345cbc23..464c56a548a65 100644
--- a/torch/nn/modules/_functions.py
+++ b/torch/nn/modules/_functions.py
@@ -7,7 +7,10 @@ class SyncBatchNorm(Function):
 
     @staticmethod
     def forward(self, input, weight, bias, running_mean, running_var, eps, momentum, process_group, world_size):
-        if not input.is_contiguous(memory_format=torch.channels_last):
+        if not (
+            input.is_contiguous(memory_format=torch.channels_last) or
+            input.is_contiguous(memory_format=torch.channels_last_3d)
+        ):
             input = input.contiguous()
         if weight is not None:
             weight = weight.contiguous()
@@ -104,7 +107,10 @@ def forward(self, input, weight, bias, running_mean, running_var, eps, momentum,
 
     @staticmethod
     def backward(self, grad_output):
-        if not grad_output.is_contiguous(memory_format=torch.channels_last):
+        if not (
+            grad_output.is_contiguous(memory_format=torch.channels_last) or
+            grad_output.is_contiguous(memory_format=torch.channels_last_3d)
+        ):
             grad_output = grad_output.contiguous()
         saved_input, weight, mean, invstd, count_tensor = self.saved_tensors
         grad_input = grad_weight = grad_bias = None
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 43a49b0489dc9..c67dfc7c40a3d 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -5324,6 +5324,10 @@ def test_post_localSGD_optimizer_step_reload(self):
         )
         @skip_if_no_gpu
         def test_DistributedDataParallel_SyncBatchNorm_Channels_Last(self):
+            self._test_DistributedDataParallel_SyncBatchNorm_with_memory_format(torch.channels_last)
+            self._test_DistributedDataParallel_SyncBatchNorm_with_memory_format(torch.channels_last_3d)
+
+        def _test_DistributedDataParallel_SyncBatchNorm_with_memory_format(self, memory_format):
             group, group_id, rank = self._init_global_test()
             num_processes = dist.get_world_size()
             local_bs = 2
@@ -5336,14 +5340,15 @@ def test_DistributedDataParallel_SyncBatchNorm_Channels_Last(self):
                 model_gpu, device_ids=[rank]
             )
 
-            memory_format = torch.channels_last
+            shapes = [global_bs, 2, 4, 4] + ([] if memory_format is torch.channels_last else [4])
+
             input_gpu = (
-                torch.randn(global_bs, 2, 4, 4, dtype=torch.float)
+                torch.randn(*shapes, dtype=torch.float)
                 .cuda(rank)
                 .to(memory_format=memory_format)
             )
             target_gpu = (
-                torch.randn(global_bs, 2, 4, 4, dtype=torch.float)
+                torch.randn(*shapes, dtype=torch.float)
                 .cuda(rank)
                 .to(memory_format=memory_format)
             )

From f9b2fa4eae03e83f7e85c9e4871af43fc60afe7e Mon Sep 17 00:00:00 2001
From: Jiawen Liu <jiawenl@meta.com>
Date: Tue, 15 Nov 2022 19:34:38 +0000
Subject: [PATCH 0913/1922] [Inductor] Build FX Linear + Permute Vertical
 Fusion in Inductor (#88859)

Summary:
Build fx-based linear/matmul/bmm + permute/transpose vertical fusion in Inductor

For an internal Ads model: **1.15x -> 1.36x speedup**

Differential Revision: D41071665

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88859
Approved by: https://github.com/jianyuh, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 106 +++++++++++++++
 torch/_inductor/config.py           |   4 +
 torch/_inductor/overrides.py        | 199 ++++++++++++++++++++++++++++
 3 files changed, 309 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 23fb2f7712e04..b64f40377995e 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -10,6 +10,7 @@
 import typing
 import unittest
 import weakref
+from typing import Any, Callable
 from unittest.mock import patch
 
 import torch
@@ -18,6 +19,7 @@
 from torch._dynamo.debug_utils import same_two_models
 from torch._dynamo.testing import rand_strided, same
 from torch.fx.experimental.proxy_tensor import make_fx
+from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import (
     TEST_WITH_ASAN,
@@ -39,6 +41,14 @@
     from torch._inductor import codecache, config, metrics
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
+    from torch._inductor.overrides import (
+        linear_permute_fusion,
+        linear_transpose,
+        permute_linear_fusion,
+        permute_matmul_fusion,
+        transpose_linear,
+        transpose_matmul,
+    )
     from torch._inductor.sizevars import SizeVarAllocator
     from torch._inductor.utils import has_torchvision_roi_align, timed
 
@@ -113,6 +123,29 @@ def maybe_test(*args, **kwargs):
     return wrap_test
 
 
+PassFunc = Callable[[torch.fx.GraphModule, Any], torch.fx.GraphModule]
+
+
+def chain_passes(*passes: PassFunc) -> PassFunc:
+    def parent_pass(module: torch.fx.GraphModule, input: Any) -> torch.fx.GraphModule:
+        for pass_ in passes:
+            if isinstance(module, torch.fx.GraphModule):
+                ShapeProp(module).propagate(*input)
+            module = pass_(module)
+        return module
+
+    return parent_pass
+
+
+def count_call_function(module: torch.fx.GraphModule, target_op: Any) -> int:
+    return sum(
+        [
+            1 if (n.op == "call_function" and n.target == target_op) else 0
+            for n in module.graph.nodes
+        ]
+    )
+
+
 class TestCase(TorchTestCase):
     @classmethod
     def setUpClass(cls):
@@ -1586,6 +1619,79 @@ def fn(a, b):
         y = torch.tensor(0)
         self.assertEqual(fn(x, y), x + x)
 
+    def test_linear_permute_fusion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, k: int, n: int):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.randn(n, k))
+                self.bias = torch.nn.Parameter(torch.randn(n))
+
+            def forward(self, input: torch.Tensor):
+                a0 = torch.nn.functional.linear(input, self.weight, self.bias)
+                b0 = a0.permute(0, 2, 1)
+                return b0
+
+        m, k, n = 16, 8, 4
+        trace_func = chain_passes(torch.fx.symbolic_trace, linear_permute_fusion)
+        module = TestModule(k, n).eval()
+        input = torch.randn(6, m, k)
+        traced = trace_func(module, [input])
+        num_linear = count_call_function(traced, torch.nn.functional.linear)
+        num_linear_transpose = count_call_function(traced, linear_transpose)
+        self.assertEqual(num_linear, 0)
+        self.assertEqual(num_linear_transpose, 1)
+
+        self.assertTrue(torch.allclose(module(input), traced(input)))
+
+    def test_permute_linear_fusion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, k: int, n: int):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.randn(n, k))
+                self.bias = torch.nn.Parameter(torch.randn(n))
+
+            def forward(self, input: torch.Tensor):
+                input1 = input.permute(0, 2, 1)
+                output = torch.nn.functional.linear(input1, self.weight, self.bias)
+                return output
+
+        m, k, n = 16, 8, 4
+
+        trace_func = chain_passes(torch.fx.symbolic_trace, permute_linear_fusion)
+        module = TestModule(k, n).eval()
+        input = torch.randn(6, k, m)
+        traced = trace_func(module, [input])
+        num_linear = count_call_function(traced, torch.nn.functional.linear)
+        num_transpose_linear = count_call_function(traced, transpose_linear)
+        self.assertEqual(num_linear, 0)
+        self.assertEqual(num_transpose_linear, 1)
+
+        self.assertTrue(torch.allclose(module(input), traced(input)))
+
+    def test_permute_bmm_fusion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, batch: int, k: int, n: int):
+                super().__init__()
+                self.other = torch.randn(batch, k, n)
+
+            def forward(self, input: torch.Tensor):
+                input1 = input.permute(0, 2, 1)
+                output = torch.bmm(input1, self.other)
+                return output
+
+        batch, m, k, n = 6, 16, 8, 4
+
+        trace_func = chain_passes(torch.fx.symbolic_trace, permute_matmul_fusion)
+        module = TestModule(batch, k, n).eval()
+        input = torch.randn(batch, k, m)
+        traced = trace_func(module, [input])
+        num_bmm = count_call_function(traced, torch.bmm)
+        num_transpose_matmul = count_call_function(traced, transpose_matmul)
+        self.assertEqual(num_bmm, 0)
+        self.assertEqual(num_transpose_matmul, 1)
+
+        self.assertTrue(torch.allclose(module(input), traced(input)))
+
     def test_slice1(self):
         def fn(a):
             return (
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index d376fe3e8bf7f..c552101c1caee 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -75,6 +75,10 @@
 shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "0") == "1"
 alignment_size = 4
 
+# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
+permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"
+
+
 # config specific to codegen/cpp.pp
 class cpp:
     # set to torch.get_num_threads()
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 3a95aa7ce8807..cf2cd5f60f510 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -19,6 +19,8 @@
 from torch.nn.utils.fusion import fuse_conv_bn_eval
 from torch.overrides import TorchFunctionMode
 
+from . import config
+
 log = logging.getLogger(__name__)
 
 
@@ -425,6 +427,14 @@ def check_node_is_add_inplace(node):
 
 
 def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
+    if config.permute_fusion:
+        # For linear permute fusion, we need to check input info to identify
+        # and perform proper permutation/transpose
+        ShapeProp(gm).propagate(*example_inputs)
+        gm = linear_permute_fusion(gm)
+        gm = permute_linear_fusion(gm)
+        gm = permute_matmul_fusion(gm)
+
     # make sure the autograd is disabled.
     if torch.is_grad_enabled():
         return gm
@@ -528,6 +538,195 @@ def _philox_rand_like(input, seed, offset):
     return torch.rand_like(input)
 
 
+class NormalizedLinearNode:
+    def __init__(self, node: torch.fx.Node) -> None:
+        assert node.op == "call_function"
+        assert node.target in [torch.nn.functional.linear]
+        self.node: torch.fx.Node = node
+
+    def get_input(self) -> torch.fx.Node:
+        if len(self.node.args) > 0:
+            return self.node.args[0]
+        else:
+            return self.node.kwargs["input"]
+
+    def get_weight(self) -> torch.fx.Node:
+        if len(self.node.args) > 1:
+            return self.node.args[1]
+        else:
+            return self.node.kwargs["weight"]
+
+    def get_bias(self) -> torch.fx.Node:
+        if len(self.node.args) > 2:
+            return self.node.args[2]
+        else:
+            return self.node.kwargs["bias"]
+
+
+class NormalizedMatmulNode:
+    def __init__(self, node: torch.fx.Node) -> None:
+        assert node.op == "call_function"
+        assert node.target in [torch.bmm, torch.matmul]
+        self.node: torch.fx.Node = node
+
+    def get_input(self) -> torch.fx.Node:
+        if len(self.node.args) > 0:
+            return self.node.args[0]
+        else:
+            return self.node.kwargs["input"]
+
+    def get_other(self) -> torch.fx.Node:
+        if len(self.node.args) > 1:
+            return self.node.args[1]
+        else:
+            return self.node.kwargs["other"]
+
+
+def check_permute(node: torch.fx.Node):
+    ranks = len(node.meta["tensor_meta"].shape)
+    if len(node.args) > 3:
+        permutation = [node.args[i] % ranks for i in range(1, ranks + 1)]
+    elif (
+        "permutation" in node.kwargs
+        and node.kwargs["permutation"] is not None
+        and len(node.kwargs["permutation"]) > 2
+    ):
+        permutation = [i % ranks for i in node.kwargs["permutation"]]
+    else:
+        return False
+    allowed_permutation = list(range(ranks))
+    allowed_permutation[-1] = ranks - 2
+    allowed_permutation[-2] = ranks - 1
+    return permutation == allowed_permutation
+
+
+def linear_permute_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for node in module.graph.nodes:
+        if (
+            node.op == "call_method"
+            and node.target == "permute"
+            and check_permute(node)
+        ):
+            if len(node.args) > 0:
+                input_node = node.args[0]
+            else:
+                input_node = node.kwargs["input"]
+            if (
+                input_node.op == "call_function"
+                and input_node.target == torch.nn.functional.linear
+            ):
+                normalized = NormalizedLinearNode(input_node)
+                input = normalized.get_input()
+                weight = normalized.get_weight()
+                bias = normalized.get_bias()
+                with module.graph.inserting_before(node):
+                    fused_node = module.graph.call_function(
+                        linear_transpose, args=(input, weight, bias)
+                    )
+                    node.replace_all_uses_with(fused_node)
+
+    module.graph.lint()
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+# Y1 = X * W^T + bias
+# Y2 = Y1.permute(0, 2, 1)
+# ---->
+# Y2 = (W * X^T + bias.unsqueeze(-1))^T
+def linear_transpose(
+    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    return torch.matmul(weight, input.transpose(-1, -2)) + bias.unsqueeze(-1)
+
+
+def permute_linear_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for node in module.graph.nodes:
+        if node.op == "call_function" and node.target == torch.nn.functional.linear:
+            if len(node.args) > 0:
+                input_node = node.args[0]
+            else:
+                input_node = node.kwargs["input"]
+            if (
+                input_node.op == "call_method"
+                and input_node.target == "permute"
+                and check_permute(input_node)
+            ):
+                normalized = NormalizedLinearNode(node)
+                if len(input_node.args) > 0:
+                    input = input_node.args[0]
+                else:
+                    input = input_node.kwargs["input"]
+                weight = normalized.get_weight()
+                bias = normalized.get_bias()
+                with module.graph.inserting_before(node):
+                    fused_node = module.graph.call_function(
+                        transpose_linear, args=(input, weight, bias)
+                    )
+                    node.replace_all_uses_with(fused_node)
+
+    module.graph.lint()
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+def permute_matmul_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for node in module.graph.nodes:
+        if node.op == "call_function" and (
+            node.target == torch.bmm or node.target == torch.matmul
+        ):
+            normalized = NormalizedMatmulNode(node)
+            A = normalized.get_input()
+            B = normalized.get_other()
+            Atrans = Btrans = False
+            if A.op == "call_method" and A.target == "permute" and check_permute(A):
+                Atrans = True
+                if len(A.args) > 0:
+                    A = A.args[0]
+                else:
+                    A = A.kwargs["input"]
+
+            if B.op == "call_method" and B.target == "permute" and check_permute(B):
+                Btrans = True
+                if len(B.args) > 0:
+                    B = B.args[0]
+                else:
+                    B = B.kwargs["input"]
+
+            if Atrans or Btrans:
+                with module.graph.inserting_before(node):
+                    fused_node = module.graph.call_function(
+                        transpose_matmul,
+                        args=(A, B, Atrans, Btrans),
+                    )
+                node.replace_all_uses_with(fused_node)
+
+    module.graph.lint()
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+# X1 = X.permute(0, 2, 1)
+# Y1 = X1 * W1^T + bias1
+# ---->
+# Y2 = X1.transpose(-1, -2) * W1^T + bias1
+def transpose_linear(
+    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    return torch.matmul(input.transpose(-1, -2), weight.t()) + bias
+
+
+def transpose_matmul(A: torch.Tensor, B: torch.Tensor, Atrans: bool, Btrans: bool):
+    if Atrans:
+        A = A.transpose(-1, -2)
+    if Btrans:
+        B = B.transpose(-1, -2)
+    return torch.matmul(A, B)
+
+
 def replace_and_fuse_for_binary(
     computation_node, node, fuse_func, attr, modules, index_node, index_pointwise
 ):

From 7593b6fb8489ab0548aaf9ff9c67ade94c823090 Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Tue, 15 Nov 2022 20:22:54 +0000
Subject: [PATCH 0914/1922] Add mem efficient backward (#88856)

# Registers the derivative for mem efficient backward

- Use gradcheck to test correctness. The kernel is not implemented for fp64 so run checks with bumped tolerances in fp32
- I also made updates based off of Xformer main branch and flash-attention cutlass branch.
- This will enable the fused backward to be called for scaled dot product attention

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88856
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml    |   5 +
 .../native/transformers/cuda/attention.cu     |  16 +-
 .../transformers/cuda/attention_backward.cu   | 261 ++++++++++++++++++
 .../transformers/cuda/flash_attn/fmha_api.cpp |   4 +
 .../attention_backward_generic.cu             | 166 -----------
 .../attention_forward_generic.cu              | 232 ----------------
 .../cuda/mem_eff_attention/find_default_mma.h |   7 +-
 .../cuda/mem_eff_attention/kernel_backward.h  | 250 +++++++++++------
 .../ATen/native/transformers/cuda/sdp_utils.h |  12 +-
 test/test_transformers.py                     |  44 ++-
 tools/autograd/derivatives.yaml               |   7 +-
 .../_internal/common_methods_invocations.py   |   4 +-
 12 files changed, 501 insertions(+), 507 deletions(-)
 create mode 100644 aten/src/ATen/native/transformers/cuda/attention_backward.cu
 delete mode 100644 aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu
 delete mode 100644 aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index de087c0b8a896..9572ccc56653d 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -13287,6 +13287,11 @@
   dispatch:
     CUDA: _efficient_attention_forward
 
+- func: _efficient_attention_backward(Tensor grad, Tensor query, Tensor key, Tensor value, Tensor logsumexp, Tensor out, bool is_causal=False) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_backward
+
 - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
   variants: function
   dispatch:
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index f65fedd6d7954..46543d4663fab 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -746,7 +746,9 @@ std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
 std::tuple<Tensor, Tensor> mem_eff_helper(
     const Tensor& query,
     const Tensor& key,
-    const Tensor& value){
+    const Tensor& value,
+    bool compute_log_sumexp,
+    bool is_causal) {
   // Query -> Query(Batch x Q_seq_len x Num_heads x Dim_per_head)
   // Key   -> Key(Batch x KV_seq_len x Num_heads x Dim_per_head)
   // Value -> Value(Batch x KV_seq_len x  Num_heads x Dim_per_head)
@@ -754,16 +756,18 @@ std::tuple<Tensor, Tensor> mem_eff_helper(
   Tensor k_t = key.transpose(1, 2);
   Tensor v_t = value.transpose(1, 2);
 
-  Tensor attention = std::get<0>(at::_efficient_attention_forward(
+  Tensor attention, log_sumexp;
+  std::tie(attention, log_sumexp) = at::_efficient_attention_forward(
       q_t,
       k_t,
       v_t,
       c10::nullopt,
       c10::nullopt,
       c10::nullopt,
-      false,
-      false)).transpose(1,2);
-  return std::make_tuple(attention, Tensor());
+      compute_log_sumexp,
+      is_causal);
+  attention = attention.transpose(1,2);
+  return std::make_tuple(std::move(attention), Tensor());
 }
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
@@ -776,7 +780,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
       case sdp::SDPBackend::flash_attention:
           return flash_attention_helper_dense_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
       case sdp::SDPBackend::efficient_attention:
-          return mem_eff_helper(query_, key , value);
+          return mem_eff_helper(query_, key , value, need_attn_weights, is_causal);
       case sdp::SDPBackend::math:
         return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
       default:
diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
new file mode 100644
index 0000000000000..af005b2669b29
--- /dev/null
+++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
@@ -0,0 +1,261 @@
+#include <type_traits>
+
+#include <ATen/ATen.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAMathCompat.h>
+
+#include <ATen/native/nested/NestedTensorTransformerFunctions.h>
+#include <ATen/native/nested/NestedTensorUtils.h>
+#include <ATen/native/transformers/attention.h>
+#include <ATen/native/transformers/cuda/sdp_utils.h>
+
+#ifdef USE_FLASH_ATTENTION
+#include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
+#endif
+
+#define ASSIGN_CHECK_OVERFLOW(A, B)                                            \
+  {                                                                            \
+    A = B;                                                                     \
+    TORCH_CHECK(B < std::numeric_limits<decltype(A)>::max(), #B " overflows"); \
+  }
+
+#define DISPATCH_MAXK(func)                                   \
+  {                                                           \
+    const auto maxK = std::max(query.size(3), value.size(3)); \
+    if (maxK <= 64) {                                         \
+      constexpr int kMaxK = 64;                               \
+      func();                                                 \
+    } else if (maxK <= 128) {                                 \
+      constexpr int kMaxK = 128;                              \
+      func();                                                 \
+    } else {                                                  \
+      constexpr int kMaxK = std::numeric_limits<int>::max();  \
+      func();                                                 \
+    }                                                         \
+  }
+
+#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC)                               \
+  {                                                                            \
+    cudaDeviceProp* properties =                                               \
+        at::cuda::getDeviceProperties(QUERY.device().index());                 \
+    const int computeCapability = properties->major * 10 + properties->minor;  \
+    DISPATCH_MAXK(([&] {                                                       \
+      DISPATCH_TYPES(                                                          \
+          QUERY, ([&]() {                                                      \
+            DISPATCH_ARCHTAG(                                                  \
+                computeCapability, ([&]() {                                    \
+                  using AlignedAK =                                            \
+                      AttentionBackwardKernel<ArchTag, scalar_t, true, kMaxK>; \
+                  bool isAligned =                                             \
+                      (QUERY.stride(2) % AlignedAK::kOptimalAlignement == 0 && \
+                       KEY.stride(2) % AlignedAK::kOptimalAlignement == 0 &&   \
+                       VALUE.stride(2) % AlignedAK::kOptimalAlignement == 0);  \
+                  DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {                \
+                                  using Kernel = AttentionBackwardKernel<      \
+                                      ArchTag,                                 \
+                                      scalar_t,                                \
+                                      kIsAligned,                              \
+                                      kMaxK>;                                  \
+                                  FUNC();                                      \
+                                }))                                            \
+                }))                                                            \
+          }))                                                                  \
+    }));                                                                       \
+  }
+
+namespace at {
+
+namespace native {
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
+    const at::Tensor& grad_out_,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& logsumexp,
+    const at::Tensor& out,
+    bool causal) {
+  #if defined(USE_FLASH_ATTENTION)
+  if (!grad_out_.defined()) {
+    return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
+  }
+    // ndim
+  TORCH_CHECK(query.dim() == grad_out_.dim());
+  TORCH_CHECK(query.dim() == key.dim());
+  TORCH_CHECK(query.dim() == value.dim());
+  TORCH_CHECK(query.dim() == 4);
+
+  // batch size
+  TORCH_CHECK(query.size(0) == grad_out_.size(0));
+  TORCH_CHECK(query.size(0) == key.size(0));
+  TORCH_CHECK(query.size(0) == value.size(0));
+
+  // seqlen
+  TORCH_CHECK(key.size(1) == value.size(1));
+  TORCH_CHECK(query.size(1) == grad_out_.size(1));
+
+  // Num heads
+  TORCH_CHECK(query.size(2) == key.size(2));
+  TORCH_CHECK(query.size(2) == value.size(2));
+  TORCH_CHECK(query.size(2) == grad_out_.size(2));
+
+  // Embedding per head
+  TORCH_CHECK(query.size(3) == key.size(3));
+  TORCH_CHECK(value.size(3) == grad_out_.size(3));
+
+  // handle potentially non-contiguous grad_out through a copy
+  auto grad_out = grad_out_.contiguous();
+  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
+
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
+  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
+
+  at::cuda::CUDAGuard device_guard(query.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  int64_t B = query.size(0);
+  int64_t M = query.size(1);
+  int64_t N = key.size(1);
+  int64_t nH = query.size(2);
+  int64_t K = query.size(3);
+
+  // It does not make sense to use that in practice,
+  // but let's still make sure we are correct
+  // As we iterate through keys first, we skip
+  // keys with no query associated, so they are not
+  // initialized
+  bool grad_kv_needs_init = causal && N > M;
+  at::Tensor grad_q, grad_k, grad_v;
+  if (!grad_kv_needs_init && query.size(1) == key.size(1) &&
+      query.size(3) == value.size(3) &&
+      query.storage().is_alias_of(key.storage()) &&
+      query.storage().is_alias_of(value.storage())) {
+    // Create one big contiguous chunk
+    // This is because q, k and v usually come from a single
+    // output of a linear layer that is chunked.
+    // Creating the gradients with the right layout saves us
+    // a `torch.cat` call in the backward pass
+    at::Tensor chunk = at::empty({B, M, 3, nH, K}, query.options());
+    grad_q = chunk.select(2, 0);
+    grad_k = chunk.select(2, 1);
+    grad_v = chunk.select(2, 2);
+  } else {
+    grad_q = at::empty_like(query);
+    grad_k = grad_kv_needs_init ? at::zeros_like(key) : at::empty_like(key);
+    grad_v = grad_kv_needs_init ? at::zeros_like(value) : at::empty_like(value);
+  }
+
+  auto launchKernel = [&](auto _k, int computeCapability) {
+    using Kernel = decltype(_k);
+    using scalar_t = typename Kernel::scalar_t;
+    (void)_k;
+
+    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
+
+    // TODO: Fuse this into a kernel?
+    // This is a bottleneck for smaller sequences (M <= 128)
+    auto delta = Kernel::kKernelComputesDelta
+        ? at::empty({B, nH, M}, query.options().dtype(at::ScalarType::Float))
+        : (grad_out.to(at::kFloat) * out.to(at::kFloat))
+              .sum(-1)
+              .transpose(-2, -1)
+              .contiguous();
+    TORCH_INTERNAL_ASSERT(delta.size(0) == B);
+    TORCH_INTERNAL_ASSERT(delta.size(1) == nH);
+    TORCH_INTERNAL_ASSERT(delta.size(2) == M);
+
+    typename Kernel::Params p;
+    p.query_ptr = (scalar_t*)query.data_ptr();
+    p.key_ptr = (scalar_t*)key.data_ptr();
+    p.value_ptr = (scalar_t*)value.data_ptr();
+    p.logsumexp_ptr = (typename Kernel::lse_scalar_t*)logsumexp.data_ptr();
+    p.output_ptr = (scalar_t*)out.data_ptr();
+    p.grad_output_ptr = (scalar_t*)grad_out.data_ptr();
+    p.grad_query_ptr = (scalar_t*)grad_q.data_ptr();
+    p.grad_key_ptr = (scalar_t*)grad_k.data_ptr();
+    p.grad_value_ptr = (scalar_t*)grad_v.data_ptr();
+    p.delta_ptr = (float*)delta.data_ptr();
+    p.head_dim = query.size(3);
+    p.head_dim_value = value.size(3);
+    p.num_queries = query.size(1);
+    p.num_keys = key.size(1);
+    p.num_batches = B;
+    p.num_heads = nH;
+    p.causal = causal;
+
+    ASSIGN_CHECK_OVERFLOW(p.gO_strideB, grad_out.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.gO_strideM, grad_out.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.gO_strideH, grad_out.stride(2));
+
+    ASSIGN_CHECK_OVERFLOW(p.o_strideB, out.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.o_strideH, out.stride(2));
+
+    ASSIGN_CHECK_OVERFLOW(p.gQ_strideB, grad_q.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.gK_strideB, grad_k.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.gV_strideB, grad_v.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.gQ_strideH, grad_q.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.gK_strideH, grad_k.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.gV_strideH, grad_v.stride(2));
+    p.gQKV_strideM_multiplier = grad_q.is_contiguous() ? 1 : 3;
+    TORCH_INTERNAL_ASSERT(p.gQ_strideM() == grad_q.stride(1));
+    TORCH_INTERNAL_ASSERT(p.gK_strideM() == grad_k.stride(1));
+    TORCH_INTERNAL_ASSERT(p.gV_strideM() == grad_v.stride(1));
+
+    ASSIGN_CHECK_OVERFLOW(p.q_strideB, query.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.k_strideB, key.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.v_strideB, value.stride(0));
+    ASSIGN_CHECK_OVERFLOW(p.q_strideM, query.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.k_strideM, key.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.v_strideM, value.stride(1));
+    ASSIGN_CHECK_OVERFLOW(p.q_strideH, query.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.k_strideH, key.stride(2));
+    ASSIGN_CHECK_OVERFLOW(p.v_strideH, value.stride(2));
+
+    Kernel::check_supported(p);
+
+    constexpr auto kernel_fn = attention_kernel_backward_batched<Kernel>;
+
+    if (smem_bytes > 0xc000) {
+      TORCH_INTERNAL_ASSERT(
+          computeCapability >= 70,
+          "This kernel requires too much shared memory on this machine!");
+      cudaFuncSetAttribute(
+          kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
+    }
+
+    // second syntax resulted in the error below on windows
+    // error C3495: 'kernel_fn': a simple capture must be a variable
+    // with automatic storage duration declared
+    // in the reaching scope of the lambda
+#ifdef _WIN32
+    cudaFuncAttributes attr;
+    AT_CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel_fn));
+    TORCH_INTERNAL_ASSERT(
+        attr.binaryVersion >= Kernel::ArchTag::kMinComputeCapability,
+        "Something went wrong in the build process");
+#else
+    auto checkBinaryArchMatches = [&]() {
+      cudaFuncAttributes attr;
+      AT_CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel_fn));
+      return attr.binaryVersion >= Kernel::ArchTag::kMinComputeCapability;
+    };
+    TORCH_INTERNAL_ASSERT(
+        checkBinaryArchMatches(), "Something went wrong in the build process");
+#endif
+
+    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes, stream>>>(p);
+  };
+
+  DISPATCH_KERNEL(
+      query, key, value, ([&] { launchKernel(Kernel{}, computeCapability); }));
+  AT_CUDA_CHECK(cudaGetLastError());
+  return std::make_tuple(grad_q, grad_k, grad_v);
+  #endif
+  TORCH_CHECK(false, "USE_FLASH_ATTENTION was not enabled for build.")
+  return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
index a8d6110e951d9..aaf7d833fe833 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
@@ -29,6 +29,7 @@
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <ATen/NativeFunctions.h>
 
 #include <ATen/native/transformers/cuda/flash_attn/fmha.h>
@@ -185,6 +186,9 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
     int max_seqlen_q = ((max_seqlen_q_ + 16 - 1) / 16) * 16;
     bool loop = max_seqlen_k > blocksize_c;
 
+    // Otherwise the kernel will be launched from cuda:0 device
+    at::cuda::CUDAGuard device_guard{q.device()};
+
     auto opts = q.options();
 
     auto o = at::empty({ total_q, num_heads, head_size }, opts);
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu
deleted file mode 100644
index 07c14ad8195dd..0000000000000
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_backward_generic.cu
+++ /dev/null
@@ -1,166 +0,0 @@
-#include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
-
-#define DISPATCH_MAXK(func)                                   \
-  {                                                           \
-    const auto maxK = std::max(query.size(2), value.size(2)); \
-    if (maxK <= 64) {                                         \
-      constexpr int kMaxK = 64;                               \
-      func();                                                 \
-    } else if (maxK <= 128) {                                 \
-      constexpr int kMaxK = 128;                              \
-      func();                                                 \
-    } else {                                                  \
-      constexpr int kMaxK = std::numeric_limits<int>::max();  \
-      func();                                                 \
-    }                                                         \
-  }
-
-#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC)                               \
-  {                                                                            \
-    cudaDeviceProp* properties =                                               \
-        at::cuda::getDeviceProperties(QUERY.device().index());                 \
-    const int computeCapability = properties->major * 10 + properties->minor;  \
-    DISPATCH_MAXK(([&] {                                                       \
-      DISPATCH_TYPES(                                                          \
-          QUERY, ([&]() {                                                      \
-            DISPATCH_ARCHTAG(                                                  \
-                computeCapability, ([&]() {                                    \
-                  using AlignedAK =                                            \
-                      AttentionBackwardKernel<ArchTag, scalar_t, true, kMaxK>; \
-                  bool isAligned =                                             \
-                      (QUERY.stride(1) % AlignedAK::kOptimalAlignement == 0 && \
-                       KEY.stride(1) % AlignedAK::kOptimalAlignement == 0 &&   \
-                       VALUE.stride(1) % AlignedAK::kOptimalAlignement == 0);  \
-                  DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {                \
-                                  using Kernel = AttentionBackwardKernel<      \
-                                      ArchTag,                                 \
-                                      scalar_t,                                \
-                                      kIsAligned,                              \
-                                      kMaxK>;                                  \
-                                  FUNC();                                      \
-                                }))                                            \
-                }))                                                            \
-          }))                                                                  \
-    }));                                                                       \
-  }
-
-namespace {
-std::tuple<at::Tensor, at::Tensor, at::Tensor>
-mem_efficient_attention_backward_cutlass(
-    const at::Tensor& grad_out_,
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const at::Tensor& logsumexp,
-    const at::Tensor& out,
-    bool causal) {
-  TORCH_CHECK(query.dim() == grad_out_.dim());
-  TORCH_CHECK(query.dim() == key.dim());
-  TORCH_CHECK(query.dim() == 3);
-
-  TORCH_CHECK(query.size(0) == grad_out_.size(0));
-  TORCH_CHECK(query.size(1) == grad_out_.size(1));
-  TORCH_CHECK(value.size(2) == grad_out_.size(2));
-
-  TORCH_CHECK(query.size(2) == key.size(2));
-  TORCH_CHECK(query.size(0) == key.size(0));
-
-  TORCH_CHECK(query.size(0) == value.size(0));
-  TORCH_CHECK(key.size(1) == value.size(1));
-
-  // handle potentially non-contiguous grad_out through a copy
-  auto grad_out = grad_out_.contiguous();
-
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(value);
-  CHECK_NOSPARSE_CONTIGUOUS_CUDA(grad_out);
-
-  at::cuda::CUDAGuard device_guard(query.device());
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t K = query.size(2);
-
-  // It does not make sense to use that in practice,
-  // but let's still make sure we are correct
-  // As we iterate through keys first, we skip
-  // keys with no query associated, so they are not
-  // initialized
-  bool grad_kv_needs_init = causal && N > M;
-  at::Tensor grad_q = at::empty_like(query);
-  at::Tensor grad_k =
-      grad_kv_needs_init ? at::zeros_like(key) : at::empty_like(key);
-  at::Tensor grad_v =
-      grad_kv_needs_init ? at::zeros_like(value) : at::empty_like(value);
-
-  auto launchKernel = [&](auto _k, int computeCapability) {
-    using Kernel = decltype(_k);
-    using scalar_t = typename Kernel::scalar_t;
-    (void)_k;
-
-    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
-
-    // TODO: Fuse this into a kernel?
-    // This is a bottleneck for smaller sequences (M <= 128)
-    auto delta = Kernel::kKernelComputesDelta
-        ? at::empty({B, M}, query.options().dtype(at::ScalarType::Float))
-        : (grad_out.to(at::kFloat) * out.to(at::kFloat)).sum(-1);
-    TORCH_INTERNAL_ASSERT(delta.size(0) == B);
-    TORCH_INTERNAL_ASSERT(delta.size(1) == M);
-
-    typename Kernel::Params params;
-    params.query_ptr = (scalar_t*)query.data_ptr();
-    params.key_ptr = (scalar_t*)key.data_ptr();
-    params.value_ptr = (scalar_t*)value.data_ptr();
-    params.logsumexp_ptr = (typename Kernel::lse_scalar_t*)logsumexp.data_ptr();
-    params.output_ptr = (scalar_t*)out.data_ptr();
-    params.grad_output_ptr = (scalar_t*)grad_out.data_ptr();
-    params.grad_query_ptr = (scalar_t*)grad_q.data_ptr();
-    params.grad_key_ptr = (scalar_t*)grad_k.data_ptr();
-    params.grad_value_ptr = (scalar_t*)grad_v.data_ptr();
-    params.delta_ptr = (float*)delta.data_ptr();
-    params.head_dim = query.size(2);
-    params.head_dim_value = value.size(2);
-    params.num_queries = query.size(1);
-    params.num_keys = key.size(1);
-    params.num_batches = B;
-    params.causal = causal;
-    Kernel::check_supported(params);
-
-    constexpr auto kernel_fn = attention_kernel_backward_batched<Kernel>;
-
-    if (smem_bytes > 0xc000) {
-      TORCH_INTERNAL_ASSERT(
-          computeCapability >= 70,
-          "This kernel requires too much shared memory on this machine!");
-      cudaFuncSetAttribute(
-          kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
-    }
-
-    auto checkBinaryArchMatches = [&]() {
-      cudaFuncAttributes attr;
-      AT_CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel_fn));
-      return attr.binaryVersion >= Kernel::ArchTag::kMinComputeCapability;
-    };
-    TORCH_INTERNAL_ASSERT(
-        checkBinaryArchMatches(), "Something went wrong in the build process");
-
-    kernel_fn<<<params.getBlocksGrid(), params.getThreadsGrid(), smem_bytes>>>(
-        params);
-  };
-
-  DISPATCH_KERNEL(
-      query, key, value, ([&] { launchKernel(Kernel{}, computeCapability); }));
-  AT_CUDA_CHECK(cudaGetLastError());
-  return std::make_tuple(grad_q, grad_k, grad_v);
-} // namespace
-
-} // namespace
-
-// TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-//   m.impl(
-//       TORCH_SELECTIVE_NAME("xformers::efficient_attention_backward_cutlass"),
-//       TORCH_FN(mem_efficient_attention_backward_cutlass));
-// }
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu
deleted file mode 100644
index 59b3637c8a438..0000000000000
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/attention_forward_generic.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-#include <ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h>
-
-
-#define DISPATCH_BLOCKSIZE(VALUE_HEAD_DIM, FN)        \
-  {                                                   \
-    if (VALUE_HEAD_DIM <= 64) {                       \
-      constexpr bool kIs64x64 = true;                 \
-      constexpr bool kSingleValueIteration = true;    \
-      FN();                                           \
-    } else {                                          \
-      constexpr bool kIs64x64 = false;                \
-      if (VALUE_HEAD_DIM <= 128) {                    \
-        constexpr bool kSingleValueIteration = true;  \
-        FN();                                         \
-      } else {                                        \
-        constexpr bool kSingleValueIteration = false; \
-        FN();                                         \
-      }                                               \
-    }                                                 \
-  }
-
-#define DISPATCH_KERNEL(QUERY, KEY, VALUE, FUNC)                              \
-  {                                                                           \
-    cudaDeviceProp* properties =                                              \
-        at::cuda::getDeviceProperties(QUERY.device().index());                \
-    const int computeCapability = properties->major * 10 + properties->minor; \
-    DISPATCH_BLOCKSIZE(                                                       \
-        VALUE.size(-1), ([&]() {                                              \
-          static constexpr int64_t kQueriesPerBlock = kIs64x64 ? 64 : 32;     \
-          static constexpr int64_t kKeysPerBlock = kIs64x64 ? 64 : 128;       \
-          DISPATCH_TYPES(                                                     \
-              QUERY, ([&]() {                                                 \
-                DISPATCH_ARCHTAG(                                             \
-                    computeCapability, ([&]() {                               \
-                      using AlignedAK = AttentionKernel<                      \
-                          scalar_t,                                           \
-                          ArchTag,                                            \
-                          true,                                               \
-                          kQueriesPerBlock,                                   \
-                          kKeysPerBlock,                                      \
-                          kSingleValueIteration>;                             \
-                      /* Run a more efficient kernel (with `isAligned=True`)  \
-                      if memory is correctly aligned*/                        \
-                      bool isAligned =                                        \
-                          (QUERY.stride(2) % AlignedAK::kAlignmentQ == 0 &&   \
-                           KEY.stride(2) % AlignedAK::kAlignmentK == 0 &&     \
-                           VALUE.stride(2) % AlignedAK::kAlignmentV == 0);    \
-                      /* TODO: Should we warn or log somewhere when we use a  \
-                      less efficient kernel due to wrong alignment? */        \
-                      DISPATCH_BOOL(isAligned, kIsAligned, ([&]() {           \
-                                      using Kernel = AttentionKernel<         \
-                                          scalar_t,                           \
-                                          ArchTag,                            \
-                                          kIsAligned,                         \
-                                          kQueriesPerBlock,                   \
-                                          kKeysPerBlock,                      \
-                                          kSingleValueIteration>;             \
-                                      FUNC();                                 \
-                                    }))                                       \
-                    }))                                                       \
-              }));                                                            \
-        }));                                                                  \
-  }
-
-namespace {
-/*
-  There are 2 modes for using this function.
-  (Mode BMHK) With all the heads having the same seqlen
-  (Mode 1MHK) `batch=1` with all tokens across batches concatenated
-*/
-std::tuple<at::Tensor, at::Tensor> efficient_attention_forward_cutlass(
-    const at::Tensor& query, // [b, seqlen, num_heads, K]
-    const at::Tensor& key, // [b, seqlen, num_heads, K]
-    const at::Tensor& value, // [b, seqlen, num_heads, Kv]
-    // (Mode 1MHK only) [b+1]: cu_seqlens_q[b] contains the
-    // position of the first query token for batch $b
-    const c10::optional<at::Tensor>& cu_seqlens_q,
-    // (Mode 1MHK only) [b+1]: cu_seqlens_k[b] contains the
-    // position of the first key token for batch $b
-    const c10::optional<at::Tensor>& cu_seqlens_k,
-    // (Mode 1MHK only) Maximum sequence length across batches
-    const c10::optional<int64_t> max_seqlen_q_,
-    bool compute_logsumexp,
-    bool causal) {
-  TORCH_CHECK(query.dim() == 4);
-  TORCH_CHECK(key.dim() == 4);
-  TORCH_CHECK(value.dim() == 4);
-
-  // Batch sizes
-  TORCH_CHECK(query.size(0) == key.size(0));
-  TORCH_CHECK(query.size(0) == value.size(0));
-
-  // Sequence length
-  TORCH_CHECK(key.size(1) == value.size(1));
-
-  // Num heads
-  TORCH_CHECK(query.size(2) == key.size(2));
-  TORCH_CHECK(query.size(2) == value.size(2));
-
-  // Embedding per head
-  TORCH_CHECK(query.size(3) == key.size(3));
-
-  int64_t max_seqlen_q, max_seqlen_k;
-  TORCH_CHECK(cu_seqlens_q.has_value() == cu_seqlens_k.has_value());
-  if (cu_seqlens_q.has_value()) {
-    TORCH_CHECK(cu_seqlens_q->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(cu_seqlens_k->scalar_type() == at::ScalarType::Int);
-    TORCH_CHECK(cu_seqlens_q->dim() == 1 && cu_seqlens_k->dim() == 1);
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*cu_seqlens_q));
-    CHECK_NOSPARSE_CONTIGUOUS_CUDA((*cu_seqlens_k));
-    TORCH_CHECK(cu_seqlens_q->size(0) == cu_seqlens_k->size(0));
-    TORCH_CHECK(query.size(0) == 1, "cu_seqlen only supports batch_size=1");
-    TORCH_CHECK(max_seqlen_q_.has_value());
-    max_seqlen_q = *max_seqlen_q_;
-    max_seqlen_k = 0; // Will be set inside the kernel
-  } else {
-    max_seqlen_q = query.size(1);
-    max_seqlen_k = key.size(1);
-  }
-
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(query);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(key);
-  CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(value);
-
-  at::cuda::CUDAGuard device_guard(query.device());
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
-  int64_t B = query.size(0);
-  int64_t M = query.size(1);
-  int64_t N = key.size(1);
-  int64_t num_heads = query.size(-2);
-  int64_t K = query.size(-1);
-  int64_t Kv = value.size(-1);
-
-  at::Tensor res;
-  at::Tensor logsumexp;
-
-  auto launchKernel = [&](auto _k, int computeCapability) {
-    using Kernel = decltype(_k);
-    using scalar_t = typename Kernel::scalar_t;
-    (void)_k;
-
-    res = at::empty(
-        {B, M, num_heads, Kv},
-        query.options().dtype(
-            TypeTraits<typename Kernel::output_t>::atScalarType()));
-
-    // NOTE: Should be aligned (by padding) in case M is
-    // not a good number for loading during backward
-    constexpr decltype(M) kAlignLSE = Kernel::kAlignLSE;
-    logsumexp = at::empty(
-        {B,
-         num_heads,
-         compute_logsumexp ? ceil_div(max_seqlen_q, kAlignLSE) * kAlignLSE : 0},
-        query.options().dtype(at::ScalarType::Float));
-
-    typename Kernel::Params p;
-    p.query_ptr = (scalar_t*)query.data_ptr();
-    p.key_ptr = (scalar_t*)key.data_ptr();
-    p.value_ptr = (scalar_t*)value.data_ptr();
-    p.logsumexp_ptr = compute_logsumexp
-        ? (typename Kernel::lse_scalar_t*)logsumexp.data_ptr()
-        : nullptr;
-    at::Tensor output_accum;
-    if (Kernel::kNeedsOutputAccumulatorBuffer) {
-      output_accum = at::empty(
-          {B, M, num_heads, Kv},
-          query.options().dtype(
-              TypeTraits<typename Kernel::output_accum_t>::atScalarType()));
-      p.output_accum_ptr =
-          (typename Kernel::output_accum_t*)output_accum.data_ptr();
-    } else {
-      p.output_accum_ptr = nullptr;
-    }
-    p.output_ptr = (typename Kernel::output_t*)res.data_ptr();
-
-    if (cu_seqlens_q.has_value()) {
-      p.cu_seqlens_q_ptr = (int32_t*)cu_seqlens_q->data_ptr();
-      p.cu_seqlens_k_ptr = (int32_t*)cu_seqlens_k->data_ptr();
-    }
-
-#define ASSIGN_CHECK_OVERFLOW(A, B)                                            \
-  {                                                                            \
-    A = B;                                                                     \
-    TORCH_CHECK(B < std::numeric_limits<decltype(A)>::max(), #B " overflows"); \
-  }
-
-    p.num_heads = num_heads;
-    p.head_dim = query.size(3);
-    p.head_dim_value = value.size(3);
-    p.num_queries = max_seqlen_q;
-    p.num_keys = max_seqlen_k;
-    p.num_batches = cu_seqlens_q.has_value() ? cu_seqlens_q->size(0) - 1 : B;
-    p.causal = causal;
-
-    ASSIGN_CHECK_OVERFLOW(p.q_strideB, query.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.k_strideB, key.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.v_strideB, value.stride(0));
-    ASSIGN_CHECK_OVERFLOW(p.q_strideM, query.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.k_strideM, key.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.v_strideM, value.stride(1));
-    ASSIGN_CHECK_OVERFLOW(p.q_strideH, query.stride(2));
-    ASSIGN_CHECK_OVERFLOW(p.k_strideH, key.stride(2));
-    ASSIGN_CHECK_OVERFLOW(p.v_strideH, value.stride(2));
-
-    constexpr auto kernel_fn = attention_kernel_batched<Kernel>;
-    size_t smem_bytes = sizeof(typename Kernel::SharedStorage);
-    if (smem_bytes > 0xc000) {
-      TORCH_INTERNAL_ASSERT(
-          computeCapability >= 70,
-          "This kernel requires too much shared memory on this machine!");
-      AT_CUDA_CHECK(cudaFuncSetAttribute(
-          kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes));
-    }
-    Kernel::check_supported(p);
-    kernel_fn<<<p.getBlocksGrid(), p.getThreadsGrid(), smem_bytes>>>(p);
-  };
-  // Dispatch to the right kernel
-  DISPATCH_KERNEL(query, key, value, ([&]() {
-                    launchKernel(Kernel{}, computeCapability);
-                  }));
-
-  AT_CUDA_CHECK(cudaGetLastError());
-  return std::make_tuple(res, logsumexp);
-}
-} // namespace
-
-// TORCH_LIBRARY_IMPL(xformers, CUDA, m) {
-//   m.impl(
-//       TORCH_SELECTIVE_NAME("xformers::efficient_attention_forward_cutlass"),
-//       TORCH_FN(efficient_attention_forward_cutlass));
-// }
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h
index 399593fd09573..b0e7106f3cfc8 100644
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/find_default_mma.h
@@ -1,15 +1,16 @@
 /*! \file
     \brief Cutlass provides helper template functions to figure out the right
-   datastructures to instanciate to run a GEMM with various parameters (see
+   datastructures to instantiate to run a GEMM with various parameters (see
    `cutlass/gemm/threadblock/default_mma.h`). However, due to template
-   instanciation priority rules, it will only create an MmaMultiStage with
+   instantiation priority rules, it will only create an MmaMultiStage with
    kStages=3 (otherwise creates an MmePipelined - which is not compatible with
    FastF32). kStages=3 uses too much shared memory and we want to use kStages=2,
    so we just copy-pasted some code from `default_mma.h` and
-   `default_mma_core.h` files and wrapped this template to allow our usecase.
+   `default_mma_core.h` files and wrapped this template to allow our use case.
 
     This is really only for the FastF32 case - aka using TensorCores with fp32.
 */
+#pragma once
 
 #include <cutlass/gemm/threadblock/default_mma.h>
 #include <cutlass/gemm/threadblock/default_mma_core_simt.h>
diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
index e25701a7588ac..c9652c40d38e4 100644
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
@@ -1,7 +1,5 @@
 #pragma once
-
 #include <ATen/ATen.h>
-#include <torch/library.h>
 #include <cmath>
 #include <vector>
 
@@ -75,46 +73,113 @@ struct AttentionBackwardKernel {
 
   struct Params {
     // Input tensors
-    scalar_t* query_ptr; // [num_queries, head_dim]
-    scalar_t* key_ptr; // [num_keys, head_dim]
-    scalar_t* value_ptr; // [num_keys, head_dim_value]
-    lse_scalar_t* logsumexp_ptr; // [num_queries]
-    scalar_t* output_ptr; // [num_queries, head_dim_value]
-    scalar_t* grad_output_ptr; // [num_queries, head_dim_value]
-    accum_t* delta_ptr; // [num_queries]
+    scalar_t* query_ptr; // [Mq, nH, K]
+    scalar_t* key_ptr; // [Mk, nH, K]
+    scalar_t* value_ptr; // [Mk, nH, Kv]
+    lse_scalar_t* logsumexp_ptr; // [nH, Mq]
+    scalar_t* output_ptr; // [Mq, nH, Kv]
+    scalar_t* grad_output_ptr; // [Mq, nH, Kv]
+    accum_t* delta_ptr; // [Mq, nH]
 
     // Output tensors
-    scalar_t* grad_query_ptr; // [num_queries, head_dim]
-    scalar_t* grad_key_ptr; // [num_keys, head_dim]
-    scalar_t* grad_value_ptr; // [num_keys, head_dim_value]
+    output_t* grad_query_ptr; //  [Mq, nH, K]
+    output_t* grad_key_ptr; //    [Mk, nH, K]
+    output_t* grad_value_ptr; //  [Mk, nH, Kv]
 
     // Dimensions/strides
     int32_t head_dim;
     int32_t head_dim_value;
     int32_t num_queries;
     int32_t num_keys;
-    int32_t num_batches;
+    int32_t num_heads;
     bool causal;
 
-    __device__ void advance_batches(int32_t batch_id) {
+    int32_t q_strideM;
+    int32_t k_strideM;
+    int32_t v_strideM;
+    int32_t gO_strideM;
+    int8_t gQKV_strideM_multiplier; // 3 for packed, 1 otherwise
+
+    CUTLASS_HOST_DEVICE int32_t o_strideM() const {
+      return head_dim_value * num_heads;
+    }
+    CUTLASS_HOST_DEVICE int32_t gQ_strideM() const {
+      return gQKV_strideM_multiplier * num_heads * head_dim;
+    }
+    CUTLASS_HOST_DEVICE int32_t gK_strideM() const {
+      return gQKV_strideM_multiplier * num_heads * head_dim;
+    }
+    CUTLASS_HOST_DEVICE int32_t gV_strideM() const {
+      return gQKV_strideM_multiplier * num_heads * head_dim_value;
+    }
+
+    // Everything below is only used in `advance_to_block`
+    // and shouldn't use registers
+    int64_t o_strideH;
+    int32_t q_strideH;
+    int32_t k_strideH;
+    int32_t v_strideH;
+    int64_t o_strideB;
+    int64_t q_strideB;
+    int64_t k_strideB;
+    int64_t v_strideB;
+    int32_t num_batches;
+
+    int64_t gO_strideB;
+    int64_t gQ_strideB;
+    int64_t gK_strideB;
+    int64_t gV_strideB;
+    int64_t gO_strideH;
+    int64_t gQ_strideH;
+    int64_t gK_strideH;
+    int64_t gV_strideH;
+
+    CUTLASS_DEVICE void advance_to_block() {
       constexpr int32_t kAlignLSE = 32; // block size of backward
       auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE;
 
-      query_ptr += batch_id * head_dim * num_queries;
-      key_ptr += batch_id * head_dim * num_keys;
-      value_ptr += batch_id * head_dim_value * num_keys;
-      logsumexp_ptr += batch_id * lse_dim;
-      output_ptr += batch_id * head_dim_value * num_queries;
-      grad_output_ptr += batch_id * head_dim_value * num_queries;
-      delta_ptr += batch_id * num_queries;
-
-      grad_query_ptr += batch_id * head_dim * num_queries;
-      grad_key_ptr += batch_id * head_dim * num_keys;
-      grad_value_ptr += batch_id * head_dim_value * num_keys;
+      int32_t batch_id = blockIdx.z;
+      int32_t head_id = blockIdx.y;
+
+      query_ptr += batch_id * q_strideB + head_id * q_strideH;
+      key_ptr += batch_id * k_strideB + head_id * k_strideH;
+      value_ptr += batch_id * v_strideB + head_id * v_strideH;
+      logsumexp_ptr += (batch_id * num_heads + head_id) * lse_dim;
+      output_ptr += batch_id * o_strideB + head_id * o_strideH;
+      grad_output_ptr += batch_id * gO_strideB + head_id * gO_strideH;
+      delta_ptr += (batch_id * num_heads + head_id) * num_queries;
+
+      grad_query_ptr += batch_id * gQ_strideB + head_id * gQ_strideH;
+      grad_key_ptr += batch_id * gK_strideB + head_id * gK_strideH;
+      grad_value_ptr += batch_id * gV_strideB + head_id * gV_strideH;
+
+      head_dim = warp_uniform(head_dim);
+      head_dim_value = warp_uniform(head_dim_value);
+      num_queries = warp_uniform(num_queries);
+      num_keys = warp_uniform(num_keys);
+      num_heads = warp_uniform(num_heads);
+
+      gO_strideM = warp_uniform(gO_strideM);
+      gQKV_strideM_multiplier = warp_uniform(gQKV_strideM_multiplier);
+      q_strideM = warp_uniform(q_strideM);
+      k_strideM = warp_uniform(k_strideM);
+      v_strideM = warp_uniform(v_strideM);
+
+      query_ptr = warp_uniform(query_ptr);
+      key_ptr = warp_uniform(key_ptr);
+      value_ptr = warp_uniform(value_ptr);
+      logsumexp_ptr = warp_uniform(logsumexp_ptr);
+      output_ptr = warp_uniform(output_ptr);
+      grad_output_ptr = warp_uniform(grad_output_ptr);
+      delta_ptr = warp_uniform(delta_ptr);
+
+      grad_query_ptr = warp_uniform(grad_query_ptr);
+      grad_key_ptr = warp_uniform(grad_key_ptr);
+      grad_value_ptr = warp_uniform(grad_value_ptr);
     }
 
     __host__ dim3 getBlocksGrid() const {
-      return dim3(1, 1, num_batches);
+      return dim3(1, num_heads, num_batches);
     }
     __host__ dim3 getThreadsGrid() const {
       return dim3(kWarpSize, kNumWarpsPerBlock, 1);
@@ -179,7 +244,6 @@ struct AttentionBackwardKernel {
     attn_T = k_j @ q_i.transpose(-2, -1) # matmul
     attn_T = (attn_T - logsumexp[i_start:i_end].unsqueeze(1).transpose(-2,
     -1)).exp() # epilogue
-
     with attn_T.shape = (kBlockSizeJ, kBlockSizeI)
     */
     using ThreadblockShape =
@@ -225,7 +289,6 @@ struct AttentionBackwardKernel {
   struct MatmulGradV {
     /*
     grad_v[j_start:j_end] += attn_T @ do_i # matmul
-
     Dimensions: (kBlockSizeJ * kNumWarpsPerBlock, kBlockSizeI, K)
     (we might need to iterate multiple times on K)
     */
@@ -601,7 +664,7 @@ struct AttentionBackwardKernel {
     typename MatmulGradV::Mma::FragmentC gradV;
     typename MatmulGradK::Mma::FragmentC gradK;
 
-    __device__ __forceinline__ void clear() {
+    CUTLASS_DEVICE void clear() {
       gradV.clear();
       gradK.clear();
     }
@@ -614,14 +677,14 @@ struct AttentionBackwardKernel {
     CHECK_ALIGNED_PTR(p.output_ptr, kMinimumAlignment);
     CHECK_ALIGNED_PTR(p.grad_output_ptr, kMinimumAlignment);
     TORCH_CHECK(
-        p.head_dim % kMinimumAlignment == 0,
-        "query/key is not correctly aligned");
+        p.q_strideH % kMinimumAlignment == 0, "query is not correctly aligned");
     TORCH_CHECK(
-        p.head_dim_value % kMinimumAlignment == 0,
-        "value is not correctly aligned");
+        p.k_strideH % kMinimumAlignment == 0, "key is not correctly aligned");
+    TORCH_CHECK(
+        p.v_strideH % kMinimumAlignment == 0, "value is not correctly aligned");
   }
 
-  static __device__ void kernel(Params& p_) {
+  static CUTLASS_DEVICE void kernel(Params& p_) {
     // Hint to nvcc to store points & tensor shapes in registers
     // as we use them a lot
     register const Params p = p_;
@@ -658,7 +721,7 @@ struct AttentionBackwardKernel {
       __syncthreads();
     }
 
-    OutputFragments output_frags;
+    OutputFragments register output_frags;
     int32_t key_start = 0;
     int32_t key_end = p.num_keys / kBlockSizeJ * kBlockSizeJ;
     for (; key_start < key_end; key_start += kBlockSizeJ) {
@@ -695,7 +758,7 @@ struct AttentionBackwardKernel {
     }
   }
 
-  static __device__ __forceinline__ void loadDi(
+  static CUTLASS_DEVICE void loadDi(
       cutlass::Array<accum_t, kBlockSizeI>& di,
       Params const& p,
       int32_t query_start) {
@@ -710,7 +773,7 @@ struct AttentionBackwardKernel {
   }
 
   template <bool skipBoundsChecks>
-  static __device__ __forceinline__ void processBlockIJ(
+  static CUTLASS_DEVICE void processBlockIJ(
       SharedStorage& shared_storage,
       OutputFragments& output_frags,
       Params const& p,
@@ -718,9 +781,9 @@ struct AttentionBackwardKernel {
       int32_t key_start) {
     cutlass::MatrixCoord no_offset{0, 0};
     accum_t scale = accum_t(1.0 / std::sqrt(float(p.head_dim)));
-    int32_t thread_id = threadIdx.x + threadIdx.y * blockDim.x;
-    int32_t warp_id = threadIdx.y;
-    int32_t lane_id = threadIdx.x;
+    int16_t thread_id = threadIdx.x + threadIdx.y * blockDim.x;
+    int8_t warp_id = warp_uniform(threadIdx.y);
+    int8_t lane_id = threadIdx.x;
     __syncthreads();
     loadDi(shared_storage.di(), p, query_start);
 
@@ -734,8 +797,8 @@ struct AttentionBackwardKernel {
 
     auto prologueGradV = [&](int col) {
       typename MatmulGradV::Mma::IteratorB iterator_dO(
-          {int32_t(p.head_dim_value)},
-          p.grad_output_ptr + query_start * p.head_dim_value + col,
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM + col,
           {num_queries_in_block, p.head_dim_value - col},
           thread_id,
           no_offset);
@@ -747,8 +810,8 @@ struct AttentionBackwardKernel {
     };
     auto prologueGradQ = [&](int col) {
       typename MatmulGradQ::Mma::IteratorB iterator_K(
-          {int32_t(p.head_dim)},
-          p.key_ptr + key_start * p.head_dim + col,
+          {int32_t(p.k_strideM)},
+          p.key_ptr + key_start * p.k_strideM + col,
           {num_keys_in_block, p.head_dim - col},
           thread_id,
           no_offset);
@@ -757,8 +820,8 @@ struct AttentionBackwardKernel {
     };
     auto prologueGradK = [&](int col) {
       typename MatmulGradK::Mma::IteratorB iterator_Q(
-          {int32_t(p.head_dim)},
-          p.query_ptr + query_start * p.head_dim + col,
+          {int32_t(p.q_strideM)},
+          p.query_ptr + query_start * p.q_strideM + col,
           {num_queries_in_block, p.head_dim - col},
           thread_id,
           no_offset);
@@ -770,14 +833,14 @@ struct AttentionBackwardKernel {
     };
     auto prologueDOV = [&]() {
       typename MatmulDOIVJ::Mma::IteratorA iterator_A(
-          {int32_t(p.head_dim_value)},
-          p.grad_output_ptr + query_start * p.head_dim_value,
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM,
           {num_queries_in_block, p.head_dim_value},
           thread_id,
           no_offset);
       typename MatmulDOIVJ::Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim_value)},
-          p.value_ptr + key_start * p.head_dim_value,
+          {int32_t(p.v_strideM)},
+          p.value_ptr + key_start * p.v_strideM,
           {p.head_dim_value, num_keys_in_block},
           thread_id,
           no_offset);
@@ -803,16 +866,16 @@ struct AttentionBackwardKernel {
 
       // k_j
       typename Mma::IteratorA iterator_A(
-          {int32_t(p.head_dim)},
-          p.key_ptr + key_start * p.head_dim,
+          {int32_t(p.k_strideM)},
+          p.key_ptr + key_start * p.k_strideM,
           {problem_size.m(), problem_size.k()},
           thread_id,
           no_offset);
 
       // q_i.transpose(-2, -1)
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim)},
-          p.query_ptr + query_start * p.head_dim,
+          {int32_t(p.q_strideM)},
+          p.query_ptr + query_start * p.q_strideM,
           {problem_size.k(), problem_size.n()},
           thread_id,
           no_offset);
@@ -893,14 +956,14 @@ struct AttentionBackwardKernel {
           num_keys_in_block, p.head_dim_value - col, num_queries_in_block);
       auto createEpilogueIter = [&]() {
         return typename MatmulGradV::OutputTileIterator(
-            typename MatmulGradV::OutputTileIterator::Params{p.head_dim_value},
-            p.grad_value_ptr + key_start * p.head_dim_value + col,
+            typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()},
+            p.grad_value_ptr + key_start * p.gV_strideM() + col,
             {num_keys_in_block, p.head_dim_value - col},
             thread_id);
       };
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim_value)},
-          p.grad_output_ptr + query_start * p.head_dim_value + col,
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM + col,
           {num_queries_in_block, p.head_dim_value - col},
           thread_id,
           no_offset);
@@ -951,16 +1014,16 @@ struct AttentionBackwardKernel {
       using Mma = typename MatmulDOIVJ::Mma;
       // do_i
       typename Mma::IteratorA iterator_A(
-          {int32_t(p.head_dim_value)},
-          p.grad_output_ptr + query_start * p.head_dim_value,
+          {int32_t(p.gO_strideM)},
+          p.grad_output_ptr + query_start * p.gO_strideM,
           {num_queries_in_block, p.head_dim_value},
           thread_id,
           no_offset);
 
       // v_j.transpose(-2, -1)
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim_value)},
-          p.value_ptr + key_start * p.head_dim_value,
+          {int32_t(p.v_strideM)},
+          p.value_ptr + key_start * p.v_strideM,
           {p.head_dim_value, num_keys_in_block},
           thread_id,
           no_offset);
@@ -1057,16 +1120,16 @@ struct AttentionBackwardKernel {
           num_keys_in_block);
       auto createEpilogueIter = [&]() {
         return typename MatmulGradQ::OutputTileIterator(
-            typename MatmulGradQ::OutputTileIterator::Params{p.head_dim},
-            p.grad_query_ptr + query_start * p.head_dim + col,
+            typename MatmulGradQ::OutputTileIterator::Params{p.gQ_strideM()},
+            p.grad_query_ptr + query_start * p.gQ_strideM() + col,
             {problem_size.m(), problem_size.n()},
             thread_id);
       };
 
       // k_j
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim)},
-          p.key_ptr + key_start * p.head_dim + col,
+          {int32_t(p.k_strideM)},
+          p.key_ptr + key_start * p.k_strideM + col,
           {problem_size.k(), problem_size.n()},
           thread_id,
           no_offset);
@@ -1153,8 +1216,8 @@ struct AttentionBackwardKernel {
           num_queries_in_block);
       auto createEpilogueIter = [&]() {
         return typename MatmulGradK::OutputTileIterator(
-            typename MatmulGradK::OutputTileIterator::Params{p.head_dim},
-            p.grad_key_ptr + key_start * p.head_dim + col,
+            typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()},
+            p.grad_key_ptr + key_start * p.gK_strideM() + col,
             {num_keys_in_block,
              false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col},
             thread_id);
@@ -1162,8 +1225,8 @@ struct AttentionBackwardKernel {
 
       // q_i
       typename Mma::IteratorB iterator_B(
-          {int32_t(p.head_dim)},
-          p.query_ptr + query_start * p.head_dim + col,
+          {int32_t(p.q_strideM)},
+          p.query_ptr + query_start * p.q_strideM + col,
           {problem_size.k(), problem_size.n()},
           thread_id,
           no_offset);
@@ -1236,15 +1299,15 @@ struct AttentionBackwardKernel {
         kForceReloadK || !MatmulQK::Mma::kSmemContainsEntireMat;
     auto thread_id = get_thread_id();
     typename MatmulQK::Mma::IteratorA iterator_A(
-        {int32_t(p.head_dim)},
-        p.key_ptr + key_start * p.head_dim,
+        {int32_t(p.k_strideM)},
+        p.key_ptr + key_start * p.k_strideM,
         {p.num_keys - key_start, p.head_dim},
         thread_id,
         cutlass::MatrixCoord{0, 0});
 
     typename MatmulQK::Mma::IteratorB iterator_B(
-        {int32_t(p.head_dim)},
-        p.query_ptr + query_start * p.head_dim,
+        {int32_t(p.q_strideM)},
+        p.query_ptr + query_start * p.q_strideM,
         {p.head_dim, p.num_queries - query_start},
         thread_id,
         cutlass::MatrixCoord{0, 0});
@@ -1259,7 +1322,7 @@ struct AttentionBackwardKernel {
   }
 
   template <bool skipBoundsChecks>
-  static __device__ __forceinline__ void writeFragsToGmem(
+  static CUTLASS_DEVICE void writeFragsToGmem(
       SharedStorage& shared_storage,
       OutputFragments& output_frags,
       Params const& p,
@@ -1268,8 +1331,8 @@ struct AttentionBackwardKernel {
         ? MatmulQK::Mma::Shape::kM
         : std::min((int32_t)MatmulQK::Mma::Shape::kM, p.num_keys - key_start);
     typename MatmulGradV::OutputTileIterator outputV_it(
-        typename MatmulGradV::OutputTileIterator::Params{p.head_dim_value},
-        p.grad_value_ptr + key_start * p.head_dim_value,
+        typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()},
+        p.grad_value_ptr + key_start * p.gV_strideM(),
         {num_keys_in_block, p.head_dim_value},
         get_thread_id());
     accumulateInGmem<MatmulGradV>(
@@ -1279,8 +1342,8 @@ struct AttentionBackwardKernel {
         true);
 
     typename MatmulGradK::OutputTileIterator outputK_it(
-        typename MatmulGradK::OutputTileIterator::Params{p.head_dim},
-        p.grad_key_ptr + key_start * p.head_dim,
+        typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()},
+        p.grad_key_ptr + key_start * p.gK_strideM(),
         {num_keys_in_block,
          false ? MatmulGradK::ThreadblockShape::kN : p.head_dim},
         get_thread_id());
@@ -1292,7 +1355,7 @@ struct AttentionBackwardKernel {
   }
 
   template <typename MatmulT>
-  static __device__ __forceinline__ void accumulateInGmem(
+  static CUTLASS_DEVICE void accumulateInGmem(
       typename MatmulT::DefaultEpilogue::SharedStorage& epilogue_smem,
       typename MatmulT::Mma::FragmentC const& accum,
       typename MatmulT::OutputTileIterator output_it,
@@ -1334,7 +1397,9 @@ struct AttentionBackwardKernel {
   }
 
   template <int kElementsPerAccess>
-  static __device__ void computeDelta(Params const& p, int32_t query_start) {
+  static CUTLASS_DEVICE void computeDelta(
+      Params const& p,
+      int32_t query_start) {
     // Each thread computes one value for Delta
     // Depending on warp configuration, we might have multiple
     // threads of the same warp working on the same row
@@ -1349,13 +1414,15 @@ struct AttentionBackwardKernel {
     bool rowPred = (query_start + laneRow) < p.num_queries;
     bool pred = rowPred;
 
-    const __restrict__ AccessType* grad_output_ptr =
-        reinterpret_cast<const __restrict__ AccessType*>(
-            p.grad_output_ptr + (query_start + laneRow) * p.head_dim_value +
+    // on windows, previous syntax __restrict__ AccessType*
+    // resulted in error: "restrict" is not allowed
+    const AccessType* __restrict__ grad_output_ptr =
+        reinterpret_cast<const AccessType*>(
+            p.grad_output_ptr + (query_start + laneRow) * p.gO_strideM +
             laneFirstCol);
-    const __restrict__ AccessType* output_ptr =
-        reinterpret_cast<const __restrict__ AccessType*>(
-            p.output_ptr + (query_start + laneRow) * p.head_dim_value +
+    const AccessType* __restrict__ output_ptr =
+        reinterpret_cast<const AccessType*>(
+            p.output_ptr + (query_start + laneRow) * p.o_strideM() +
             laneFirstCol);
 
     static constexpr int64_t kMaxIters =
@@ -1430,13 +1497,13 @@ struct AttentionBackwardKernel {
     }
   }
 
-  static __device__ __forceinline__ int8_t get_lane_id() {
+  static CUTLASS_DEVICE int8_t get_lane_id() {
     return threadIdx.x;
   }
-  static __device__ __forceinline__ int8_t get_warp_id() {
+  static CUTLASS_DEVICE int8_t get_warp_id() {
     return threadIdx.y;
   }
-  static __device__ __forceinline__ int16_t get_thread_id() {
+  static CUTLASS_DEVICE int16_t get_thread_id() {
     return threadIdx.x + threadIdx.y * blockDim.x;
   }
 };
@@ -1457,8 +1524,7 @@ __global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
 #define INSTANTIATE_ATTENTION_KERNEL_BACKWARD(ARCH, ...)             \
   _ATTENTION_KERNEL_BACKWARD_BEGIN(                                  \
       AttentionBackwardKernel<cutlass::arch::Sm##ARCH, __VA_ARGS__>) \
-  auto batch_id = blockIdx.z;                                        \
-  p.advance_batches(batch_id);                                       \
+  p.advance_to_block();                                              \
   Kernel::kernel(p);                                                 \
   _ATTENTION_KERNEL_BACKWARD_END();
 
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 564adb2d51ea8..e9f3d5029aa86 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -62,6 +62,15 @@ inline bool check_for_attn_weights(sdp_params params, bool debug) {
   }
   return true;
 }
+
+inline bool check_for_non_zero_dropout(sdp_params params, bool debug) {
+  if (params.dropout != 0.0) {
+    TORCH_CHECK(!debug, "Mem_efficient does not support non_zero dropout. Dropout_p: ", params.dropout);
+    return false;
+  }
+  return true;
+}
+
 inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
   if (!params.query.is_nested()) {
     return true;
@@ -230,7 +239,8 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask,
-      check_for_seq_len_1_nested_tensor};
+      check_for_seq_len_1_nested_tensor,
+      check_for_non_zero_dropout};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
diff --git a/test/test_transformers.py b/test/test_transformers.py
index 939d91e7ee874..93a94a5604c91 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -21,8 +21,11 @@
     TEST_WITH_ROCM,
     IS_WINDOWS,
     slowTest,
-    set_default_dtype
+    set_default_dtype,
+    gradcheck
 )
+
+from torch.testing._internal.common_methods_invocations import wrapper_set_seed
 from torch.testing._internal.common_cuda import TEST_CUDA, SM80OrLater
 
 if TEST_FAIRSEQ:
@@ -860,11 +863,22 @@ def rand_tensor(*shape):
                     actual = torch.ops.aten._scaled_dot_product_attention(
                         query, key, value, attn_mask, dropout_p, need_attn_weights, is_causal)
 
-            # freeze_rng_state() doesn't seem to work outside of CPU, so dropout makes the results incomparable.
-            # TODO: Do this skipping in a nicer way once the granular test skipping logic lands.
-            if dropout_p == 0.0 or device == 'cpu':
                 self.assertEqual(actual, expected)
 
+        if attn_mask_dim is None:
+            q = q.double().clone()
+            k = k.double().clone()
+            v = v.double().clone()
+            q.requires_grad_()
+            k.requires_grad_()
+            v.requires_grad_()
+
+            assert gradcheck(lambda *args, **kwargs: wrapper_set_seed(sdp_ref, *args, **kwargs),
+                             (q, k, v, attn_mask, dropout_p))
+            assert gradcheck(lambda *args, **kwargs:
+                             wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
+                             (q, k, v, attn_mask, dropout_p))
+
     @unittest.skipIf(TEST_WITH_CROSSREF, 'Fastpath not available with crossref')
     @torch.no_grad()
     def test_mask_check_fastpath(self):
@@ -1079,6 +1093,28 @@ def rand_tensor(shape):
         self.assertEqual(math_ref_test, math_ref_lp_test, atol=7e-3, rtol=7e-3)
         self.assertEqual(actual_test, math_ref_test, atol=5e-3, rtol=5e-3)
 
+    @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
+    @parametrize("contiguous_inputs", [True, False])
+    def test_efficient_attention_gradcheck(self, contiguous_inputs: bool):
+
+        batch_size, seq_len, num_heads, head_dim = 8, 8, 4, 64
+        query, key, value = torch.rand((batch_size, seq_len, 3 * num_heads * head_dim),
+                                       device="cuda", dtype=torch.float32, requires_grad=True).chunk(3, -1)
+        query = query.view(batch_size, -1, num_heads, head_dim)
+        key = key.view(batch_size, -1, num_heads, head_dim)
+        value = value.view(batch_size, -1, num_heads, head_dim)
+
+        if contiguous_inputs:
+            query = query.contiguous()
+            key = key.contiguous()
+            value = value.contiguous()
+
+        # Normally we would transpose the inputs but the fused kernels expect
+        # (batch, seq_len, num_heads, head_dim) bump the tolerance since we can only run kernel
+        # in fp32
+        assert gradcheck(lambda *args, **kwargs:
+                         wrapper_set_seed(torch.ops.aten._efficient_attention_forward, *args, **kwargs),
+                         (query, key, value, None, None, None, True, False), fast_mode=True, atol=8e-5, rtol=1e-3)
 
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     def test_sdp_runtime_dispatch(self):
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 8349a308be35a..a0892b32a8352 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2591,7 +2591,7 @@
 - name: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
   self: grad.reshape_symint(self.sym_sizes())
 
-# Nested Tensor
+# NestedTensor
 - name: _nested_tensor_from_tensor_list(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   list: "grad.defined()? at::unbind(grad) : std::vector<Tensor>(list.size())"
 
@@ -2612,6 +2612,11 @@
   nested_size: non_differentiable
   nested_strides: non_differentiable
 
+# Transformers
+- name:  _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
+  output_differentiability: [True, False]
+  query, key, value: _efficient_attention_backward(grad, query, key, value, result1, result0, causal)
+
 # fft
 - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
   self: fft_r2c_backward(grad, dim, normalization, onesided, self.sym_size(dim.back()))
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 0c59af77736af..5e60eff2865e4 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -11984,8 +11984,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ),
     OpInfo(
         'nn.functional._scaled_dot_product_attention',
-        op=lambda inp, *args, **kwargs:
-               wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, inp, *args, **kwargs),
+        op=lambda *args, **kwargs:
+               wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
         sample_inputs_func=sample_inputs_scaled_dot_product_attention,
         dtypes=floating_types_and(torch.bfloat16),
         dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),

From e3b16b326baa3b46495d64fcba4d866a521018be Mon Sep 17 00:00:00 2001
From: Colin Taylor <colin2328@meta.com>
Date: Tue, 15 Nov 2022 20:35:34 +0000
Subject: [PATCH 0915/1922] [torch] [analytics] add pytorch event logger
 callsites to transformers and encoder/decoders (#88896)

Differential Revision: D41227275

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88896
Approved by: https://github.com/mikekgfb
---
 torch/nn/modules/transformer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/nn/modules/transformer.py b/torch/nn/modules/transformer.py
index 37e8823edf2c2..5f1bc7bb2785c 100644
--- a/torch/nn/modules/transformer.py
+++ b/torch/nn/modules/transformer.py
@@ -56,6 +56,7 @@ def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int =
                  device=None, dtype=None) -> None:
         factory_kwargs = {'device': device, 'dtype': dtype}
         super(Transformer, self).__init__()
+        torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
 
         if custom_encoder is not None:
             self.encoder = custom_encoder
@@ -186,6 +187,7 @@ class TransformerEncoder(Module):
 
     def __init__(self, encoder_layer, num_layers, norm=None, enable_nested_tensor=True, mask_check=True):
         super(TransformerEncoder, self).__init__()
+        torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
         self.layers = _get_clones(encoder_layer, num_layers)
         self.num_layers = num_layers
         self.norm = norm
@@ -307,6 +309,7 @@ class TransformerDecoder(Module):
 
     def __init__(self, decoder_layer, num_layers, norm=None):
         super(TransformerDecoder, self).__init__()
+        torch._C._log_api_usage_once(f"torch.nn.modules.{self.__class__.__name__}")
         self.layers = _get_clones(decoder_layer, num_layers)
         self.num_layers = num_layers
         self.norm = norm

From 8d85d21e19f3b3837048e3b042f5e1d1f2f10e8d Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 15 Nov 2022 20:35:48 +0000
Subject: [PATCH 0916/1922] Add range check to multi margin loss target
 (#89008)

Fixes https://github.com/pytorch/pytorch/issues/88724

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89008
Approved by: https://github.com/ngimel
---
 aten/src/ATen/native/cuda/MultiMarginLoss.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aten/src/ATen/native/cuda/MultiMarginLoss.cu b/aten/src/ATen/native/cuda/MultiMarginLoss.cu
index 15e6d1e9dc0c3..26f21cfa59a22 100644
--- a/aten/src/ATen/native/cuda/MultiMarginLoss.cu
+++ b/aten/src/ATen/native/cuda/MultiMarginLoss.cu
@@ -31,6 +31,7 @@ __global__ void MultiMarginLoss_forward_kernel(
   scalar_t *input_k = input + k*dim;
   scalar_t *output_k = output + k;
   int target_k = static_cast<int>(target[k]);
+  CUDA_KERNEL_ASSERT(target_k >= 0 && target_k < dim && "target index is out of bounds");
   scalar_t input_target_k = input_k[target_k];
 
   int i_start = threadIdx.x;

From 9f01ab6d8e3203a70945c8a70e6845389566b534 Mon Sep 17 00:00:00 2001
From: Colin Taylor <colin2328@meta.com>
Date: Tue, 15 Nov 2022 20:36:13 +0000
Subject: [PATCH 0917/1922] [torch] [analytics] add pytorch event logger
 callsites to torch.save and torch.load (#89003)

Summary: as title.

Differential Revision: D41239419

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89003
Approved by: https://github.com/ezyang, https://github.com/dzhulgakov
---
 torch/serialization.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/serialization.py b/torch/serialization.py
index 3078e57587be6..5f9eda67648b4 100644
--- a/torch/serialization.py
+++ b/torch/serialization.py
@@ -427,6 +427,7 @@ def save(
         >>> buffer = io.BytesIO()
         >>> torch.save(x, buffer)
     """
+    torch._C._log_api_usage_once("torch.save")
     _check_dill_version(pickle_module)
     _check_save_filelike(f)
 
@@ -760,6 +761,7 @@ def load(
         # Load a module with 'ascii' encoding for unpickling
         >>> torch.load('module.pt', encoding='ascii')
     """
+    torch._C._log_api_usage_once("torch.load")
     UNSAFE_MESSAGE = (
         "Weights only load failed. Re-running `torch.load` with `weights_only` set to `False`"
         " will likely succeed, but it can result in arbitrary code execution."

From ec5a32f9a3fe1c87c605285cb1469ed69605e6b2 Mon Sep 17 00:00:00 2001
From: Colin Taylor <colin2328@meta.com>
Date: Tue, 15 Nov 2022 20:46:00 +0000
Subject: [PATCH 0918/1922] [torch] [analytics] add dynamo to analytics
 (#88915)

Summary: as title.

Differential Revision: D41237602

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88915
Approved by: https://github.com/jansel
---
 torch/_dynamo/eval_frame.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 8d9e3b7b6aa14..cb3cffaa73d16 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -351,6 +351,7 @@ def optimize(
         def toy_example(a, b):
             ...
     """
+    torch._C._log_api_usage_once("torch._dynamo.optimize")
     if disable or os.environ.get("TORCHDYNAMO_DISABLE", "") == "1":
         return _NullDecorator()
     if sys.platform == "win32":
@@ -451,6 +452,7 @@ def guard_export_print(guards):
 def export(
     f, *args, aten_graph=False, decomposition_table=None, tracing_mode="real", **kwargs
 ):
+    torch._C._log_api_usage_once("torch._dynamo.export")
     if decomposition_table is not None or tracing_mode != "real":
         assert (
             aten_graph

From 027082fff4b7025f9557a743da9fd1bb4b68a92b Mon Sep 17 00:00:00 2001
From: Fabio Rocha <frocha@quansight.com>
Date: Mon, 14 Nov 2022 10:47:32 +0000
Subject: [PATCH 0919/1922] [inductor] Introduce CSEVariable type and use it to
 track if Triton variables are scalar (#88347)

This fixes https://github.com/pytorch/torchdynamo/issues/1515

To fix it, we need to keep track of whether a Triton variable is a scalar (so we can not use a mask when doing indirect loads through them). This requires a way of annotating variable names generated by CSE with properties.

So now CSE will use CSEVariable class to keep track of variables and let backends subclass it so they can annotate them with whatever information they want. TritonCSEVariable is such a subclass that track the `is_scalar` property.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88347
Approved by: https://github.com/jgong5, https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 14 ++++++++++
 torch/_inductor/codegen/common.py   | 41 ++++++++++++++++++++++++-----
 torch/_inductor/codegen/triton.py   | 23 +++++++++++++++-
 3 files changed, 70 insertions(+), 8 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index b64f40377995e..f43a333d1f096 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5011,6 +5011,20 @@ def forward(pred_objectness_logits_3_: torch.Tensor):
             result = forward(*args)
             assert same(result, torch.sort(args[0], descending=True, dim=1)[0])
 
+        @requires_cuda()
+        def test_scalar_triton_index(self):
+            # The indirect indexing via a scalar like below used to lead to
+            # bad triton code that made triton segfault when compiling.
+            # See https://github.com/pytorch/torchdynamo/issues/1515
+            def fn(a):
+                zero = torch.zeros((16,), device=a.device, dtype=torch.int64)
+                return (a[zero],)
+
+            a = torch.randn((8,), dtype=torch.float32, device="cuda")
+
+            fn_optimized = torch._dynamo.optimize("inductor")(fn)
+            assert same(fn(a), fn_optimized(a))
+
     class TritonCodeGenTests(TestCase):
         from torch._inductor.triton_ops.autotune import CachingAutotuner
 
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index 932e8c91bc7da..2803970295ccc 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -34,7 +34,8 @@ class ExprPrinter(Printer):
     @staticmethod
     def paren(string):
         if (
-            re.match(r"^[a-z0-9_.]+$", string, re.I)
+            isinstance(string, CSEVariable)
+            or re.match(r"^[a-z0-9_.]+$", string, re.I)
             or re.match(r"^\([^)]*\)$", string, re.I)
             or string == ""
         ):
@@ -405,6 +406,21 @@ def _is_removed(name, buffers):
         )
 
 
+class CSEVariable:
+    """A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis.
+    The backends can inherit from this class and overload the "create_cse_var" Kernel to do that.
+    The "update_on_args" method gives you a hook for annotations, see example of TritonCSEVariable in triton.py."""
+
+    def __init__(self, name):
+        self.name = name
+
+    def __str__(self):
+        return self.name
+
+    def update_on_args(self, args, kwargs):
+        pass
+
+
 class CSE:
     """Common subexpression elimination"""
 
@@ -425,6 +441,7 @@ def __init__(
         self.reduction_cache = reduction_cache or {}
         self.iter_buffer_ids = iter_buffers or itertools.count()
         self.invalidated_stores = set()
+        self.varname_map = {}
 
     def invalidate(self, keep_vars: typing.Set[str]):
         for name, tmp in list(self.store_cache.items()):
@@ -442,9 +459,11 @@ def clone(self):
             self.store_cache,
         )
 
-    def generate(self, buffer: IndentedBuffer, expr: str, write=True):
-        assert isinstance(expr, str), expr
-        if expr.startswith(self.name_prefix) and re.match(r"^[a-z0-9]+$", expr):
+    def generate(
+        self, buffer: IndentedBuffer, expr: typing.Union[str, CSEVariable], write=True
+    ) -> CSEVariable:
+        assert isinstance(expr, (str, CSEVariable)), type(expr)
+        if isinstance(expr, CSEVariable):
             return expr
         if expr not in self.cache:
             var = self.newvar()
@@ -454,8 +473,11 @@ def generate(self, buffer: IndentedBuffer, expr: str, write=True):
                 buffer.writeline(f"{self.prefix}{var} = {expr}{self.suffix}")
         return self.cache[expr]
 
-    def newvar(self):
-        return f"{self.name_prefix}{next(self.iter_buffer_ids)}"
+    def newvar(self) -> CSEVariable:
+        var_name = f"{self.name_prefix}{next(self.iter_buffer_ids)}"
+        var = V.kernel.create_cse_var(var_name)
+        self.varname_map[var_name] = var
+        return var
 
 
 class CodeGen:
@@ -539,9 +561,11 @@ class CSEProxy:
             @staticmethod
             def __getattr__(name):
                 def inner(*args, **kwargs):
-                    return self.cse.generate(
+                    csevar = self.cse.generate(
                         self.compute, getattr(parent_handler, name)(*args, **kwargs)
                     )
+                    csevar.update_on_args(args, kwargs)
+                    return csevar
 
                 return inner
 
@@ -598,3 +622,6 @@ def rename_indexing(self, index) -> sympy.Expr:
             x: self.args.size(x) for x in sorted_symbols if x.name.startswith("s")
         }
         return sympy_subs(index, replacements)
+
+    def create_cse_var(self, *args, **kwargs):
+        return CSEVariable(*args, **kwargs)
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 88a0ad4977be4..b79b03232a8a5 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -24,6 +24,7 @@
 )
 from ..virtualized import ops, V
 from .common import (
+    CSEVariable,
     DeferredLine,
     ExprPrinter,
     IndentedBuffer,
@@ -109,6 +110,17 @@ def triton_constant(value):
     return repr(value)
 
 
+class TritonCSEVariable(CSEVariable):
+    def __init__(self, name):
+        super().__init__(name)
+        self.is_scalar = False
+
+    def update_on_args(self, args, kwargs):
+        self.is_scalar = all(
+            not (isinstance(arg, TritonCSEVariable)) or arg.is_scalar for arg in args
+        )
+
+
 class TritonOverrides(OpOverrides):
     """Map element-wise ops to Triton"""
 
@@ -752,7 +764,13 @@ def indexing(
             # https://github.com/openai/triton/issues/633
             mask = ["None"]
 
-        return index_str, " & ".join(mask)
+        if (
+            index_str in self.cse.varname_map
+            and self.cse.varname_map[index_str].is_scalar
+        ):
+            mask = ["None"]
+
+        return index_str, " & ".join(map(str, mask))
 
     def var_ranges(self):
         return dict(
@@ -1106,6 +1124,9 @@ def call_kernel(self, code, name: str):
             f"{name}.run({call_args}, grid=grid({', '.join(grid)}), stream={stream_name})"
         )
 
+    def create_cse_var(self, *args, **kwargs):
+        return TritonCSEVariable(*args, **kwargs)
+
 
 class TritonScheduling:
     def __init__(self, scheduler):

From d03bbeb1f772e202c3893ab7bff6ca9d1d5bba31 Mon Sep 17 00:00:00 2001
From: Fabio Rocha <frocha@quansight.com>
Date: Mon, 14 Nov 2022 10:47:34 +0000
Subject: [PATCH 0920/1922] [inductor] Added bucketize to decomp table (#88348)

These are the benchmark results vs eager

```
[--------------------------- bucketize ----------------------------]
                                                 |  eager  |  decomp
32 threads: --------------------------------------------------------
      ((16384, 1024), (16,)), (True, True)       |    600  |    464
      ((16384, 1024), (16,)), (True, False)      |    542  |    464
      ((16384, 1024), (16,)), (False, True)      |    780  |    731
      ((16384, 1024), (16,)), (False, False)     |    777  |    731
      ((16384, 1024), (64,)), (True, True)       |    624  |    515
      ((16384, 1024), (64,)), (True, False)      |    603  |    515
      ((16384, 1024), (64,)), (False, True)      |    789  |    718
      ((16384, 1024), (64,)), (False, False)     |    786  |    718
      ((16384, 1024), (256,)), (True, True)      |    878  |    820
      ((16384, 1024), (256,)), (True, False)     |    891  |    830
      ((16384, 1024), (256,)), (False, True)     |    897  |    900
      ((16384, 1024), (256,)), (False, False)    |    900  |    900
      ((16384, 1024), (1024,)), (True, True)     |   2000  |   1890
      ((16384, 1024), (1024,)), (True, False)    |   1950  |   1892
      ((16384, 1024), (1024,)), (False, True)    |   1990  |   1962
      ((16384, 1024), (1024,)), (False, False)   |   1990  |   2060
      ((16384, 1024), (4096,)), (True, True)     |   3405  |   3155
      ((16384, 1024), (4096,)), (True, False)    |   3244  |   3154
      ((16384, 1024), (4096,)), (False, True)    |   3282  |   3219
      ((16384, 1024), (4096,)), (False, False)   |   3278  |   3220
      ((16384, 1024), (16384,)), (True, True)    |   4626  |   4672
      ((16384, 1024), (16384,)), (True, False)   |   4629  |   4671
      ((16384, 1024), (16384,)), (False, True)   |   4662  |   4829
      ((16384, 1024), (16384,)), (False, False)  |   4665  |   4824
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88348
Approved by: https://github.com/ngimel
---
 torch/_inductor/decomposition.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 0b29dd524cb78..44bfd46505a2e 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -104,6 +104,7 @@
         aten.upsample_nearest2d_backward,
         aten.softplus,
         aten.softplus_backward,
+        aten.bucketize,
     ]
 )
 

From 89470aa57cce0c752edd8383696b80c24f15555c Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 15 Nov 2022 21:05:59 +0000
Subject: [PATCH 0921/1922] Add test for out-of-bounds Tensor access on GPU
 (#39211)

Since CUDA context can not recover safely from on-device assert, use `torch.multiprocessing.spawn` to execute a method in another context and verify that it raises unrecoverable error.

As those types of tests are pretty slow (6 seconds on powerful linux box with one GPU) run it only in the slow shard.

Closes https://github.com/pytorch/pytorch/issues/38944

Pull Request resolved: https://github.com/pytorch/pytorch/pull/39211
Approved by: https://github.com/ezyang
---
 test/test_cuda.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index fada440a72932..59f379487c43b 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -1576,6 +1576,38 @@ def test_multinomial_invalid_probs_cuda(self):
         self._spawn_test_multinomial_invalid_probs_cuda([1., -inf, 1.])
         self._spawn_test_multinomial_invalid_probs_cuda([1., 1., nan])
 
+    @staticmethod
+    def _mute_init():
+        os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stderr.fileno())
+
+    def _spawn_method(self, method, arg):
+        ctx = torch.multiprocessing.get_context("spawn")
+        with ctx.Pool(1, initializer=self._mute_init) as pool:
+            errors = pool.map(method, [arg])
+            for e in errors:
+                if 'device-side assert triggered' not in str(e):
+                    self.fail(e)
+
+    @staticmethod
+    def _test_index_bounds_cuda(idx):
+        x = torch.arange(10, device="cuda")
+        try:
+            y = x[torch.tensor([idx])]
+            return f"x[torch.tensor([{idx})]={y}"
+        except RuntimeError as err:
+            return err
+
+    @slowTest
+    @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \
+                     don't support multiprocessing with spawn start method")
+    @skipIfRocm
+    def test_index_out_of_bounds_exception_cuda(self):
+        test_method = TestCuda._test_index_bounds_cuda
+        # Test in-bound access works fine
+        self.assertEqual(test_method(1), "x[torch.tensor([1)]=tensor([1], device='cuda:0')")
+        # Test that indexing out of bounds causes assert
+        self._spawn_method(test_method, 11)
+
     @slowTest
     @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory")
     def test_huge_index(self):

From dd9c82f158573836e225ece4303f15173c43a774 Mon Sep 17 00:00:00 2001
From: Nikita Vedeneev <nik@quansight.com>
Date: Tue, 15 Nov 2022 21:16:15 +0000
Subject: [PATCH 0922/1922] add `to(dtype)` support for all sparse compressed
 formats (#89055)

Fixes [#88419](https://github.com/pytorch/pytorch/issues/88419)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89055
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/TensorConversions.cpp | 84 +++++++++++-----------
 test/test_sparse_csr.py                    | 14 ++++
 2 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp
index ec699bf1bf7fa..96275bde82994 100644
--- a/aten/src/ATen/native/TensorConversions.cpp
+++ b/aten/src/ATen/native/TensorConversions.cpp
@@ -244,48 +244,52 @@ Tensor _to_copy(
   // memory_format is handled separately due to MemoryFormat::Preserve logic
   options = self.options().merge_in(options).memory_format(c10::nullopt);
   auto memory_format = optional_memory_format.value_or(MemoryFormat::Preserve);
+
   // TODO: Use the dispatcher for this.
   // Currently there are unenumerated extensibility issues preventing this.
-  if (self.is_sparse_csr()) {
-    TORCH_CHECK(
-        memory_format == MemoryFormat::Preserve,
-        "sparse_csr only supports memory format Preserve, but got ",
-        memory_format,
-        " instead.");
-
-    auto new_values = at::native::to(
-        self.values(),
-        dtype,
-        c10::kStrided, // values are strided
-        device,
-        pin_memory,
-        non_blocking,
-        true, // force copy since we're in _to_copy
-        memory_format);
-
-    auto new_crow_indices = at::native::to(
-        self.crow_indices(),
-        self.crow_indices().scalar_type(), // indices are integral
-        c10::kStrided, // indices are strided
-        device,
-        pin_memory,
-        non_blocking,
-        true, // force copy since we're in _to_copy
-        memory_format);
-
-    auto new_col_indices = at::native::to(
-        self.col_indices(),
-        self.col_indices().scalar_type(), // indices are integral
-        c10::kStrided, // indices are strided
-        device,
-        pin_memory,
-        non_blocking,
-        true, // force copy since we're in _to_copy
-        memory_format);
-
-    return at::native::_sparse_csr_tensor_unsafe(
-        new_crow_indices,
-        new_col_indices,
+  if (at::sparse_csr::is_sparse_compressed(self)) {
+      TORCH_CHECK(
+          memory_format == MemoryFormat::Preserve,
+          "to(options): ", at::sparse_csr::layoutToString(self.layout()),
+          " only supports memory format Preserve, but got ", memory_format,
+          " instead.");
+
+      Tensor compressed_indices, plain_indices;
+      std::tie(compressed_indices, plain_indices) = at::sparse_csr::getCompressedPlainIndices(self);
+
+      const auto new_values = at::native::to(
+          self.values(),
+          dtype,
+          c10::kStrided,
+          device,
+          pin_memory,
+          non_blocking,
+          true, // force copy since we are in _to_copy
+          memory_format);
+
+      const auto new_compressed_indices = at::native::to(
+          compressed_indices,
+          compressed_indices.scalar_type(),
+          c10::kStrided,
+          device,
+          pin_memory,
+          non_blocking,
+          true, // force copy since we are in _to_copy
+          memory_format);
+
+      const auto new_plain_indices = at::native::to(
+          plain_indices,
+          plain_indices.scalar_type(),
+          c10::kStrided,
+          device,
+          pin_memory,
+          non_blocking,
+          true, // force copy since we are in _to_copy
+          memory_format);
+
+    return at::native::_sparse_compressed_tensor_unsafe(
+        new_compressed_indices,
+        new_plain_indices,
         new_values,
         self.sizes(),
         new_values.scalar_type(),
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index d2e3c5fc38514..e83616489fc24 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -934,6 +934,20 @@ def test_dim(self, layout):
             self.assertEqual(sparse.dense_dim(), dense_dim)
 
 
+    @skipMeta
+    @all_sparse_compressed_layouts()
+    @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16))
+    def test_to_dtype(self, layout, device, dtype):
+        # to_dense does not support hybrid inputs
+        input_gen = self._generate_small_inputs(layout, device=device, enable_hybrid=False)
+        for compressed_indices, plain_indices, values, size in input_gen:
+            sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size,
+                                                    dtype=dtype, layout=layout, device=device)
+            for to_dtype in all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16):
+                sparse_to_dtype = sparse.to(to_dtype)
+                dense_to_dtype = sparse.to_dense().to(to_dtype)
+                self.assertEqual(sparse_to_dtype.to_dense(), dense_to_dtype)
+
 def _npref_block_addmm_addmv(c, a, b, alpha, beta):
     return alpha * (a @ b) + beta * c
 

From d2f83203cd6d11fd62c91aab20d3cd9118ee9aed Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Tue, 15 Nov 2022 21:27:14 +0000
Subject: [PATCH 0923/1922] print xpass (#89020)

Print unexpected success as XPASS.  I will submit a PR to test-infra so that the log classifier can find these

Ex: https://github.com/pytorch/pytorch/actions/runs/3466368885/jobs/5790424173
```
  test_import_hipify (__main__.TestHipify) ... ok (0.000s)
  test_check_onnx_broadcast (__main__.TestONNXUtils) ... ok (0.000s)
  test_prepare_onnx_paddings (__main__.TestONNXUtils) ... ok (0.000s)
  test_load_standalone (__main__.TestStandaloneCPPJIT) ... ok (16.512s)

======================================================================
XPASS [4.072s]: test_smoke (__main__.TestCollectEnv)
----------------------------------------------------------------------

----------------------------------------------------------------------
Ran 31 tests in 24.594s

FAILED (skipped=7, unexpected successes=1)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89020
Approved by: https://github.com/huydhn, https://github.com/seemethere
---
 torch/testing/_internal/common_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index e0b703046c542..fa3eda3758e4e 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -770,6 +770,9 @@ def addSkip(self, test, reason):
                         # it stands for `verbose_str` captured in the closure
                         c.cell_contents = f"skip: {reason}"
 
+            def printErrors(self) -> None:
+                super().printErrors()
+                self.printErrorList("XPASS", self.unexpectedSuccesses)
         test_report_path = get_report_path()
         verbose = '--verbose' in argv or '-v' in argv
         if verbose:

From 2f1c5a059a9f9bb20612c5ee8286c6f5cc75a9d2 Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Tue, 15 Nov 2022 21:33:38 +0000
Subject: [PATCH 0924/1922] skip test that is broken in head (#88759)

Test Plan: Rely on CI.

Differential Revision: D41156351

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88759
Approved by: https://github.com/zou3519
---
 test/mobile/test_lite_script_type.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/mobile/test_lite_script_type.py b/test/mobile/test_lite_script_type.py
index 8769a4b2363a2..44eb6d4778e8b 100644
--- a/test/mobile/test_lite_script_type.py
+++ b/test/mobile/test_lite_script_type.py
@@ -4,6 +4,7 @@
 import torch.utils.bundled_inputs
 import io
 from typing import Dict, List, NamedTuple
+import unittest
 
 from torch.jit.mobile import _load_for_lite_interpreter
 from torch.testing._internal.common_utils import TestCase, run_tests
@@ -34,6 +35,7 @@ def forward(self, a: torch.Tensor):
         )
 
 
+    @unittest.skip("T137512434")
     def test_typing_dict_with_namedtuple(self):
         class Foo(NamedTuple):
             id: torch.Tensor

From be60ae61daf3618125304769980d53307fd488ae Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Mon, 14 Nov 2022 14:47:15 +0000
Subject: [PATCH 0925/1922] Enable test_ops for inductor (#88994)

Summary: skip several unsupported test cases
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88994
Approved by: https://github.com/Krovatkin
---
 .jenkins/pytorch/test.sh | 2 +-
 test/test_ops.py         | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 6bbda7f4d7071..5fa54f538f35f 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -249,7 +249,7 @@ test_inductor_distributed() {
 }
 
 test_inductor() {
-  python test/run_test.py --include test_modules --verbose
+  python test/run_test.py --include test_modules test_ops --verbose
   # TODO: investigate "RuntimeError: CUDA driver API confirmed a leak"
   # seen intest_ops_gradients.py
   # pytest test/test_ops_gradients.py --verbose -k "not _complex and not test_inplace_grad_acos_cuda_float64"
diff --git a/test/test_ops.py b/test/test_ops.py
index 0ef2e4ee6d60e..11d659e5cd2b8 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1069,6 +1069,7 @@ def _test_inplace_preserve_storage(samples, variants):
     # Reference testing for operations in complex32 against complex64.
     # NOTE: We test against complex64 as NumPy doesn't have a complex32 equivalent dtype.
     @ops(op_db, allowed_dtypes=(torch.complex32,))
+    @skipIfTorchInductor("Inductor does not support complex dtype yet")
     def test_complex_half_reference_testing(self, device, dtype, op):
         if not op.supports_dtype(torch.complex32, device):
             unittest.skip("Does not support complex32")
@@ -1098,6 +1099,7 @@ def test_complex_half_reference_testing(self, device, dtype, op):
 
     @ops(op_db, allowed_dtypes=(torch.bool,))
     @unittest.skipIf(TEST_WITH_UBSAN, "Test uses undefined behavior")
+    @skipIfTorchInductor("Inductor does not support view with dtype yet")
     def test_non_standard_bool_values(self, device, dtype, op):
         # Test boolean values other than 0x00 and 0x01 (gh-54789)
         def convert_boolean_tensors(x):
@@ -1497,6 +1499,7 @@ def clone_and_perform_view(input, **kwargs):
                         self.assertEqual(tensor.grad, cloned1_tensor.grad)
 
     @ops(ops_and_refs, allowed_dtypes=(torch.cfloat,))
+    @skipIfTorchInductor("Inductor does not support complex dtype yet")
     def test_conj_view(self, device, dtype, op):
         if not op.test_conjugated_samples:
             self.skipTest("Operation doesn't support conjugated inputs.")
@@ -1519,6 +1522,7 @@ def test_conj_view(self, device, dtype, op):
         )
 
     @ops(ops_and_refs, allowed_dtypes=(torch.double,))
+    @skipIfTorchInductor("Inductor does not support complex dtype yet")
     def test_neg_view(self, device, dtype, op):
         if not op.test_neg_view:
             self.skipTest("Operation not tested with tensors with negative bit.")
@@ -1538,6 +1542,7 @@ def test_neg_view(self, device, dtype, op):
         )
 
     @ops(ops_and_refs, allowed_dtypes=(torch.cdouble,))
+    @skipIfTorchInductor("Inductor does not support complex dtype yet")
     def test_neg_conj_view(self, device, dtype, op):
         if not op.test_neg_view:
             self.skipTest("Operation not tested with tensors with negative bit.")

From 148d05c4062d3989bd6ef0fe288ca3f6139cbc08 Mon Sep 17 00:00:00 2001
From: Michael Wootton <michael.wootton@amd.com>
Date: Tue, 15 Nov 2022 21:40:43 +0000
Subject: [PATCH 0926/1922] Enable correct supported activities for kineto on
 rocm (#88207)

A compile time guard was preventing ActivityType::CUDA from being available on rocm.  This caused both the GPU_FALLBACK and CUDA modes to be active at the same time.  So operators were being charged gpu time for the hipEventRecord ranges and the actual kernel execution times.  This caused incorrect (and often negative) cuda times, in e.g. table().

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88207
Approved by: https://github.com/malfet, https://github.com/jeffdaily
---
 torch/csrc/autograd/init.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index ee963232d3166..6bfd4bd4bfed6 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -279,8 +279,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
 
   m.def("_supported_activities", []() {
     std::set<ActivityType> activities{ActivityType::CPU};
-#if defined(USE_KINETO) && !defined(LIBKINETO_NOCUPTI)
-    if (at::getNumGPUs() > 0 && !at::hasHIP()) {
+#if defined(USE_KINETO) && \
+    (!defined(LIBKINETO_NOCUPTI) || !defined(LIBKINETO_NOROCTRACER))
+    if (at::getNumGPUs() > 0) {
       activities.insert(ActivityType::CUDA);
     }
 #endif

From efa57be5074ab5dbd8dce6220956b18f9cfcc664 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@fb.com>
Date: Tue, 15 Nov 2022 18:03:53 +0000
Subject: [PATCH 0927/1922] Rebase and re-land thread PG (#88795)

The previous PR (https://github.com/pytorch/pytorch/pull/88627) has been reverted due to a failed check. After rebasing and rerun, all checks passed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88795
Approved by: https://github.com/huydhn, https://github.com/wanchaol
---
 test/distributed/test_multi_threaded_pg.py    |  45 +++
 test/test_testing.py                          |   1 +
 torch/testing/_internal/common_distributed.py | 149 +++++++--
 .../distributed/multi_threaded_pg.py          | 288 ++++++++++++++++++
 4 files changed, 457 insertions(+), 26 deletions(-)
 create mode 100644 test/distributed/test_multi_threaded_pg.py
 create mode 100644 torch/testing/_internal/distributed/multi_threaded_pg.py

diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
new file mode 100644
index 0000000000000..6a0fe33cd8ad6
--- /dev/null
+++ b/test/distributed/test_multi_threaded_pg.py
@@ -0,0 +1,45 @@
+# Owner(s): ["oncall: distributed"]
+
+import sys
+import torch.distributed as dist
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+from torch.testing._internal.common_distributed import (
+    spawn_threads_and_init_comms,
+    MultiThreadedTestCase
+
+)
+from torch.testing._internal.common_utils import TestCase, run_tests
+
+DEFAULT_WORLD_SIZE = 4
+
+class TestObjectCollectivesWithWrapper(TestCase):
+    @spawn_threads_and_init_comms(world_size=4)
+    def test_broadcast_object_list(self):
+        val = 99 if dist.get_rank() == 0 else None
+        object_list = [val] * dist.get_world_size()
+
+        dist.broadcast_object_list(object_list=object_list)
+        self.assertEqual(99, object_list[0])
+
+class TestObjectCollectivesWithBaseClass(MultiThreadedTestCase):
+    @property
+    def world_size(self):
+        return 4
+
+    def test_broadcast_object_list(self):
+        val = 99 if dist.get_rank() == 0 else None
+        object_list = [val] * dist.get_world_size()
+        print(f"{dist.get_rank()} -> {dist.get_world_size()}")
+
+        dist.broadcast_object_list(object_list=object_list)
+        self.assertEqual(99, object_list[0])
+
+    def test_something_else(self):
+        pass
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_testing.py b/test/test_testing.py
index 8fe66043e5a16..f05883919f17c 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -1806,6 +1806,7 @@ def test_circular_dependencies(self) -> None:
             # And these both end up with transitive dependencies on distributed
             ignored_modules.append("torch.nn.parallel._replicated_tensor_ddp_interop")
             ignored_modules.append("torch.testing._internal.common_fsdp")
+            ignored_modules.append("torch.testing._internal.common_distributed")
 
         torch_dir = os.path.dirname(torch.__file__)
         for base, folders, files in os.walk(torch_dir):
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 607211087ddc7..883a48a5a5fef 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -2,10 +2,10 @@
 import logging
 import multiprocessing
 import os
+import subprocess
 import sys
 import tempfile
 import threading
-import subprocess
 import time
 import traceback
 import types
@@ -14,11 +14,7 @@
 from dataclasses import dataclass
 from datetime import timedelta
 from enum import Enum
-from functools import (
-    partial,
-    reduce,
-    wraps
-)
+from functools import partial, reduce, wraps
 from io import StringIO
 from typing import NamedTuple, Optional, Union
 
@@ -26,16 +22,17 @@
 import torch.cuda.nccl
 import torch.distributed as c10d
 from torch.testing._internal.common_utils import (
-    TestCase,
-    TEST_WITH_ROCM,
-    TEST_WITH_TSAN,
     FILE_SCHEMA,
     find_free_port,
-    retry_on_connect_failures,
     IS_SANDCASTLE,
-    sandcastle_skip_if,
+    retry_on_connect_failures,
     sandcastle_skip,
+    sandcastle_skip_if,
+    TEST_WITH_ROCM,
+    TEST_WITH_TSAN,
+    TestCase,
 )
+from torch.testing._internal.distributed.multi_threaded_pg import run_with_threaded_pg
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -67,11 +64,10 @@ class TestSkip(NamedTuple):
     "generic": TestSkip(
         86, "Test skipped at subprocess level, look at subprocess log for skip reason"
     ),
-    "importerror": TestSkip(
-        88, "Test skipped due to missing import"
-    ),
+    "importerror": TestSkip(88, "Test skipped due to missing import"),
 }
 
+
 @dataclass
 class DistTestCases:
     # Backends that do not support a specific collective
@@ -93,6 +89,7 @@ class DistTestCases:
 def skip_if_no_gpu(func):
     """Skips if the world size exceeds the number of GPUs, ensuring that if the
     test is run, each rank has its own GPU via ``torch.cuda.device(rank)``."""
+
     @wraps(func)
     def wrapper(*args, **kwargs):
         if not torch.cuda.is_available():
@@ -116,6 +113,7 @@ def wrapper(*args, **kwargs):
 
     return wrapper
 
+
 def skip_if_odd_worldsize(func):
     @wraps(func)
     def wrapper(*args, **kwargs):
@@ -126,6 +124,7 @@ def wrapper(*args, **kwargs):
 
     return wrapper
 
+
 def require_n_gpus_for_nccl_backend(n, backend):
     def decorator(func):
         @wraps(func)
@@ -139,12 +138,17 @@ def wrapper(*args, **kwargs):
 
     return decorator
 
+
 def import_transformers_or_skip():
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
             try:
-                from transformers import BertConfig, AutoModelForMaskedLM  # noqa: Unused
+                from transformers import (  # noqa: Unused
+                    AutoModelForMaskedLM,
+                    BertConfig,
+                )
+
                 return func(*args, **kwargs)
             except ImportError:
                 sys.exit(TEST_SKIPS["importerror"].exit_code)
@@ -153,6 +157,7 @@ def wrapper(*args, **kwargs):
 
     return decorator
 
+
 def skip_if_lt_x_gpu(x):
     def decorator(func):
         @wraps(func)
@@ -191,10 +196,13 @@ def verify_ddp_error_logged(model_DDP, err_substr):
     logging_err = ddp_logging_data["error"]
     # Remove C++ stacktrace if needed.
     actual = (
-        err_substr if err_substr.find("\nException raised from ") == -1
+        err_substr
+        if err_substr.find("\nException raised from ") == -1
         else err_substr.split("\nException raised from ")[0]
     )
-    assert actual in logging_err, f"Did not find expected {actual} in ddp logging data error: {logging_err}"
+    assert (
+        actual in logging_err
+    ), f"Did not find expected {actual} in ddp logging data error: {logging_err}"
 
 
 def with_nccl_blocking_wait(func):
@@ -319,7 +327,7 @@ def wrapper(*args, **kwargs):
 
 def skip_if_win32():
     return sandcastle_skip_if(
-        sys.platform == 'win32',
+        sys.platform == "win32",
         "This unit test case is not supportted on Windows platform",
     )
 
@@ -352,13 +360,14 @@ def create_tcp_store(
     # TSAN runs much slower.
     TIMEOUT_DEFAULT = 500
 else:
-    TIMEOUT_DEFAULT = int(os.getenv('DISTRIBUTED_TESTS_DEFAULT_TIMEOUT', '300'))
+    TIMEOUT_DEFAULT = int(os.getenv("DISTRIBUTED_TESTS_DEFAULT_TIMEOUT", "300"))
 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400}
 
 # https://github.com/pytorch/pytorch/issues/75665
 if TEST_WITH_ROCM:
     TIMEOUT_OVERRIDE["test_join_kwargs"] = 200
 
+
 def create_device(interface=None):
     if sys.platform == "win32" or interface is None:
         return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1")
@@ -449,9 +458,7 @@ def init_multigpu_helper(world_size: int, backend: str):
     if world_size > nGPUs:
         nGPUs_per_process = nGPUs // world_size
     rank_to_GPU = {
-        i: list(
-            visible_devices[i * nGPUs_per_process : (i + 1) * nGPUs_per_process]
-        )
+        i: list(visible_devices[i * nGPUs_per_process : (i + 1) * nGPUs_per_process])
         for i in range(world_size)
     }
     return rank_to_GPU
@@ -482,6 +489,9 @@ def cleanup_temp_dir() -> None:
         tmp_dir.cleanup()
 
 
+# Most tests operate with this worldsize
+DEFAULT_WORLD_SIZE = 4
+
 # [How does MultiProcessTestCase work?]
 # Each MultiProcessTestCase instance uses 1 + `world_size()` processes, by
 # default `world_size()` returns 4. Let's take `test_rpc_spawn.py` as an
@@ -508,7 +518,7 @@ def _should_stop_test_suite(self) -> bool:
 
     @property
     def world_size(self) -> int:
-        return 4
+        return DEFAULT_WORLD_SIZE
 
     def join_or_run(self, fn):
         @wraps(fn)
@@ -607,7 +617,10 @@ def _event_listener(parent_pipe, signal_pipe, rank: int):
     @classmethod
     def _run(cls, rank: int, test_name: str, file_name: str, parent_pipe) -> None:
         # Enable DDP + ReplicatedTensor
-        from torch.nn.parallel._replicated_tensor_ddp_utils import _set_ddp_with_replicated_tensor
+        from torch.nn.parallel._replicated_tensor_ddp_utils import (
+            _set_ddp_with_replicated_tensor,
+        )
+
         _set_ddp_with_replicated_tensor(True)
 
         self = cls(test_name)
@@ -815,16 +828,20 @@ def _check_return_codes(self, elapsed_time) -> None:
         self.assertEqual(
             first_process.exitcode,
             0,
-            msg="Expected zero exit code but got {} for pid: {}".format(first_process.exitcode, first_process.pid)
+            msg="Expected zero exit code but got {} for pid: {}".format(
+                first_process.exitcode, first_process.pid
+            ),
         )
 
     @property
     def is_master(self) -> bool:
         return self.rank == 0
 
+
 # Cannot use functools.cache as it requires python 3.9
 EFA_PROBE_RESULT = None
 
+
 def has_efa() -> bool:
     """
     If shell command `fi_info -p efa -t FI_EP_RDM` returns exit code 0 then we assume that the machine has
@@ -836,7 +853,9 @@ def has_efa() -> bool:
         return EFA_PROBE_RESULT
 
     try:
-        EFA_PROBE_RESULT = subprocess.run(["fi_info", "-p", "efa", "-t", "FI_EP_RDM"]).returncode == 0
+        EFA_PROBE_RESULT = (
+            subprocess.run(["fi_info", "-p", "efa", "-t", "FI_EP_RDM"]).returncode == 0
+        )
     except FileNotFoundError:
         EFA_PROBE_RESULT = False
     return EFA_PROBE_RESULT
@@ -850,3 +869,81 @@ def tp_transports():
     see https://github.com/pytorch/pytorch/issues/73885 and https://github.com/pytorch/pytorch/issues/65022
     """
     return ["shm", "uv"] if has_efa() else None
+
+
+def _run_test_with_mt_pg(self, timeout, world_size, callback):
+    failed_ranks = run_with_threaded_pg(world_size, timeout, callback)
+    for rank, exc_info in failed_ranks:
+        print(f"Rank {rank} raised:")
+        for line in traceback.format_exception(*exc_info):
+            sys.stdout.write(line)
+    self.assertEqual([], failed_ranks, "Some ranks failed")
+
+
+def spawn_threads_and_init_comms(
+    func=None, timeout=TIMEOUT_DEFAULT, world_size=DEFAULT_WORLD_SIZE
+):
+    """
+    Wrapper to use with a test method
+    """
+    if func is None:
+        return partial(
+            spawn_threads_and_init_comms, timeout=timeout, world_size=world_size
+        )
+
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        _run_test_with_mt_pg(
+            self, timeout, world_size, lambda: func(self, *args, **kwargs)
+        )
+
+    return wrapper
+
+
+class MultiThreadedTestCase(TestCase):
+    """
+    Simple test runner that executes all tests with the in-proc process group.
+
+    A single instance of the TestCase object for all threads.
+
+    Difference from regular test runner:
+    Cannot use setUp / tearDown (must use perThreadSetup / perThreadShutdown)
+        Not sure what these two would be good for though.
+    No global state possible
+        How bad of a limitation is this?
+    """
+
+    def __init__(self, method_name: str = "runTest") -> None:
+        super().__init__(method_name)
+        self._test_method = getattr(self, method_name, None)
+        setattr(self, method_name, self.threaded_run_test)
+        if TestCase.setUp != type(self).setUp:
+            raise RuntimeError(
+                f"Test class {type(self)} overrides disabled method setUp. Use perThreadSetUp instead"
+            )
+        if TestCase.tearDown != type(self).tearDown:
+            raise RuntimeError(
+                f"Test class {type(self)} overrides disabled method tearDown. Use perThreadTearDown instead"
+            )
+
+    def threaded_run_test(self):
+        self.perThreadSetUp()
+        try:
+            _run_test_with_mt_pg(
+                self=self,
+                timeout=TIMEOUT_DEFAULT,
+                world_size=self.world_size,
+                callback=self._test_method,
+            )
+        finally:
+            self.perThreadTearDown()
+
+    def perThreadSetUp(self):
+        pass
+
+    def perThreadTearDown(self):
+        pass
+
+    @property
+    def world_size(self) -> int:
+        raise RuntimeError("world size not implemented")
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
new file mode 100644
index 0000000000000..7e18f870f2e76
--- /dev/null
+++ b/torch/testing/_internal/distributed/multi_threaded_pg.py
@@ -0,0 +1,288 @@
+import queue
+import sys
+import threading
+import time
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+from torch._C._distributed_c10d import _create_work_from_future, Store
+from torch.futures import Future
+from torch.utils._pytree import tree_flatten
+
+"""
+TODO:
+Lots of missing collectives.
+Collectives validation.
+Make timeout robust by making collectives respect the test deadline.
+Make tests robuts by making collectives interruptible.
+We need some synchronization around cleanup to ensure that timedout ranks don't cause spurious failures.
+
+"""
+
+
+def flatten_list(lst):
+    return tree_flatten(lst)[0]
+
+
+def ret_work(ret):
+    fut = Future()
+    fut.set_result(ret)
+    return _create_work_from_future(fut)
+
+
+class AllGather:
+    def work(self, data):
+        for src_rank in range(len(data)):
+            in_tensor_list = data[src_rank][1]
+            # Can't handle all_gather with multiple tensors
+            assert len(in_tensor_list) == 1
+            src_tensor = in_tensor_list[0]
+
+            for dest in data:
+                dest_tensor = dest[0][0][src_rank]
+                with torch.no_grad():
+                    dest_tensor.copy_(src_tensor)
+
+
+class Broadcast:
+    def __init__(self, src):
+        self.src = src
+
+    def work(self, data):
+        in_tensor_list = flatten_list(data[self.src])
+        for i in range(len(data)):
+            out_tensor_list = flatten_list(data[i])
+            for j in range(len(in_tensor_list)):
+                with torch.no_grad():
+                    out_tensor_list[j].copy_(in_tensor_list[j])
+
+
+class Collective:
+    def __init__(self, world_size, collective):
+        self._world_size = world_size
+        self._collective = collective
+
+        self._start_cond = threading.Condition()
+        self._done_cond = threading.Condition()
+
+        self._data = [None] * world_size
+        self._count = 0
+        self._done = False
+
+    def join(self, rank, data):
+        with self._start_cond:
+            self._data[rank] = data
+            self._count += 1
+
+            # notify rank 0
+            if self._count == self._world_size:
+                if rank > 0:
+                    self._start_cond.notify()
+
+            if rank == 0:
+                while self._count < self._world_size:
+                    self._start_cond.wait()
+
+        with self._done_cond:
+            # wait for rank 0 to finish
+            if rank > 0:
+                while not self._done:
+                    self._done_cond.wait()
+            else:
+                # copy data around
+                self._collective.work(self._data)
+                self._done = True
+                self._done_cond.notify_all()
+        return ret_work(data)
+
+
+class ProcessLocalGroup(dist.ProcessGroup):
+    _pg_lock = threading.Lock()
+    _pg_list = []
+    _count = 0
+    _ready = False
+
+    _coll_lock = threading.Lock()
+    _cur_coll = None
+
+    @classmethod
+    def _register(cls, pg):
+        with cls._pg_lock:
+            while len(cls._pg_list) <= pg._rank:
+                cls._pg_list.append(None)
+            cls._pg_list[pg._rank] = pg
+            cls._count += 1
+            if cls._count == pg._world:
+                cls._ready = True
+
+    @classmethod
+    def _start_coll(cls, world_size, collective):
+        with cls._coll_lock:
+            if not cls._ready:
+                raise Exception(
+                    f"world not ready, only {cls._count} PG's registered but world has {world_size} ranks"
+                )
+            if cls._cur_coll is None:
+                cls._cur_coll = Collective(world_size, collective)
+            return cls._cur_coll
+
+    @classmethod
+    def _end_coll(cls, collective):
+        # This is racily called by all ranks, so only one will work
+        with cls._coll_lock:
+            if cls._cur_coll == collective:
+                cls._cur_coll = None
+
+    def allgather(self, output_tensors, input_tensor, options):
+        coll = ProcessLocalGroup._start_coll(self._world, AllGather())
+        res = coll.join(self._rank, (output_tensors, input_tensor))
+        ProcessLocalGroup._end_coll(coll)
+        return res
+
+    def broadcast(self, tensor_list, opts):
+        coll = ProcessLocalGroup._start_coll(self._world, Broadcast(opts.rootRank))
+        res = coll.join(self._rank, tensor_list)
+        ProcessLocalGroup._end_coll(coll)
+        return res
+
+    def __init__(self, rank, world):
+        super(ProcessLocalGroup, self).__init__(rank, world)
+        self._rank = rank
+        self._world = world
+        ProcessLocalGroup._register(self)
+
+    def size(self):
+        return self._world
+
+    def getBackendName(self):
+        return "local"
+
+    def __repr__(self):
+        return f"PLG w:{self._world} r:{self._rank}"
+
+
+def _create_threaded_pg(prefix_store, rank, world_size, timeout):
+    return ProcessLocalGroup(rank, world_size)
+
+
+dist.Backend.register_backend("threaded", _create_threaded_pg)
+
+
+@dataclass
+class WorldData:
+    default_pg: dist.ProcessGroup
+    pg_map: Dict[dist.ProcessGroup, Tuple[str, Optional[Store]]]
+    pg_names: Dict[dist.ProcessGroup, str]
+    pg_group_ranks: Dict[dist.ProcessGroup, Dict[int, int]]
+    group_count: int
+
+
+class ThreadLocalWorld:
+    _world = threading.local()
+
+    def _get_world(self) -> WorldData:
+        if not hasattr(ThreadLocalWorld._world, "world"):
+            ThreadLocalWorld._world.world = WorldData(None, {}, {}, {}, 0)
+        return ThreadLocalWorld._world.world
+
+    @property
+    def default_pg(self):
+        return self._get_world().default_pg
+
+    @default_pg.setter
+    def default_pg(self, value):
+        self._get_world().default_pg = value
+
+    @property
+    def pg_map(self):
+        return self._get_world().pg_map
+
+    @property
+    def pg_names(self):
+        return self._get_world().pg_names
+
+    @property
+    def pg_group_ranks(self):
+        return self._get_world().pg_group_ranks
+
+    @property
+    def group_count(self) -> int:
+        return self._get_world().group_count
+
+    @group_count.setter
+    def group_count(self, value):
+        self._get_world().group_count = value
+
+
+_old_pg_world = None
+
+
+def _install_threaded_pg():
+    global _old_pg_world
+    _old_pg_world = dist.distributed_c10d._world
+    dist.distributed_c10d._world = ThreadLocalWorld()
+    return dist.distributed_c10d._world
+
+
+def _uninstall_threaded_pg():
+    dist.distributed_c10d._world = _old_pg_world
+
+
+def run_with_threaded_pg(world_size, timeout, callback):
+    """
+    Run ``callback`` with ``world_size`` threads using the in-proc process group
+    """
+    world = _install_threaded_pg()
+
+    def world_is_valid():
+        return world == dist.distributed_c10d._world
+
+    global_store = dist.HashStore()
+    exception_queue = queue.Queue()
+
+    def worker(rank):
+        if not world_is_valid():
+            raise TimeoutError("Invalid world")
+        dist.init_process_group(
+            backend="threaded", rank=rank, world_size=world_size, store=global_store
+        )
+        try:
+            callback()
+        except BaseException as ex:
+            exception_queue.put((rank, sys.exc_info()))
+        finally:
+            if world_is_valid():
+                dist.destroy_process_group()
+
+    try:
+        threads = [
+            threading.Thread(target=worker, args=(rank,)) for rank in range(world_size)
+        ]
+        for thread in threads:
+            thread.start()
+
+        deadline = time.time() + timeout
+        for idx, thread in enumerate(threads):
+            thread.join(max(0, deadline - time.time()))
+            if thread.is_alive():
+                exception_queue.put(
+                    (
+                        idx,
+                        (
+                            TimeoutError,
+                            TimeoutError(
+                                f"Rank failed to join in under {timeout} seconds"
+                            ),
+                            None,
+                        ),
+                    )
+                )
+        failed_ranks = []
+        while not exception_queue.empty():
+            failure = exception_queue.get()
+            failed_ranks.append(failure)
+        return failed_ranks
+    finally:
+        _uninstall_threaded_pg()

From b05fe28e062ae821d168061a3413148796e1e93d Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Wed, 16 Nov 2022 00:08:34 +0000
Subject: [PATCH 0928/1922] Don't iterate over graph when adding graph input
 (#89084)

helps with https://github.com/pytorch/torchdynamo/issues/1803

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89084
Approved by: https://github.com/jansel
---
 torch/_dynamo/output_graph.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index ee5079581be76..4578fb98dfcbc 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -110,6 +110,10 @@ def __init__(
         self.tensor_id_to_sym_shape_ref = {}
         self.intermediary_symbols = {}
 
+        # Enables creating unique node names by tracking
+        # all current placeholder node names
+        self.name_to_input = collections.OrderedDict()
+
     @property
     def output(self):
         return self
@@ -147,6 +151,7 @@ def restore_graphstate(self, state):
                     del node.meta["example_value"]
                 self.graph.erase_node(node)
                 self.real_value_cache.pop(node, None)
+                self.name_to_input.pop(node.name, None)
 
     def count_calls(self):
         return count_calls(self.graph)
@@ -162,22 +167,22 @@ def get_submodule(self, keys):
         return obj
 
     def create_graph_input(self, name, type_expr=None):
-        placeholders = [n for n in self.graph.nodes if n.op == "placeholder"]
-
         # unique
-        used_names = {n.target for n in placeholders}
-        if name in used_names:
+        if name in self.name_to_input:
             for i in itertools.count():
-                if f"{name}_{i}" not in used_names:
+                if f"{name}_{i}" not in self.name_to_input:
                     name = f"{name}_{i}"
                     break
 
-        if placeholders:
-            ctx = self.graph.inserting_after(placeholders[-1])
+        if self.name_to_input:
+            prev_name = next(reversed(self.name_to_input))
+            ctx = self.graph.inserting_after(self.name_to_input[prev_name])
         else:
             ctx = self.graph.inserting_before(None)
         with ctx:
-            return self.create_proxy("placeholder", name, (), {}, type_expr=type_expr)
+            proxy = self.create_proxy("placeholder", name, (), {}, type_expr=type_expr)
+            self.name_to_input[name] = proxy.node
+            return proxy
 
     def new_var(self, name="tmp"):
         existing = set(self.code_options["co_varnames"])
@@ -490,6 +495,7 @@ def remove_unused_graphargs(self):
                     del node.meta["example_value"]
                 self.graph.erase_node(node)
                 self.real_value_cache.pop(node, None)
+                self.name_to_input.pop(node.name, None)
 
         self.graphargs = [arg for arg in self.graphargs if arg.uses > 0]
 
@@ -525,6 +531,7 @@ def cleanup(self):
             if "example_value" in node.meta:
                 del node.meta["example_value"]
         self.real_value_cache.clear()
+        self.name_to_input.clear()
 
     def create_proxy(
         self,

From 0e053291b5d06eab45330e4c4f5a911b3a38160f Mon Sep 17 00:00:00 2001
From: Shunting Zhang <shunting@meta.com>
Date: Wed, 16 Nov 2022 00:29:08 +0000
Subject: [PATCH 0929/1922] allow loading model from a path in torchbench
 (#89028)

Sometimes it's really convenient to run simple models thru the torchbench.py script rather than those from pytorch/benchmark. This PR add the ability to run any model from a specified path by overloading the --only argument.

This PR is split out from #88904

Here is the usage:

        Specify the path and class name of the model in format like:
        --only=path:<MODEL_FILE_PATH>,class:<CLASS_NAME>

        Due to the fact that dynamo changes current working directory,
        the path should be an absolute path.

        The class should have a method get_example_inputs to return the inputs
        for the model. An example looks like
        ```
        class LinearModel(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = nn.Linear(10, 10)

            def forward(self, x):
                return self.linear(x)

            def get_example_inputs(self):
                return (torch.randn(2, 10),)
        ```

Test command:
```
# python benchmarks/dynamo/torchbench.py --performance --only=path:/pytorch/myscripts/model_collection.py,class:LinearModel --backend=eager
WARNING:common:torch.cuda.is_available() == False, using CPU
cpu  eval  LinearModel                        0.824x p=0.00
```

Content of model_collection.py
```
from torch import nn
import torch

class LinearModel(nn.Module):
    """
    AotAutogradStrategy.compile_fn ignore graph with at most 1 call nodes.
    Make sure this model calls 2 linear layers to avoid being skipped.
    """
    def __init__(self, nlayer=2):
        super().__init__()
        layers = []
        for _ in range(nlayer):
            layers.append(nn.Linear(10, 10))
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)

    def get_example_inputs(self):
        return (torch.randn(2, 10),)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89028
Approved by: https://github.com/jansel
---
 benchmarks/dynamo/common.py | 93 +++++++++++++++++++++++++++++++------
 1 file changed, 80 insertions(+), 13 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 198877e0313d8..a6e66c4281b60 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -4,6 +4,7 @@
 import copy
 import csv
 import functools
+import importlib
 import io
 import logging
 import os
@@ -164,6 +165,42 @@
 ]
 
 
+def model_specified_by_path(path_and_class_str):
+    return ":" in path_and_class_str
+
+
+def load_model_from_path(path_and_class_str):
+    configs = {}
+    for kvstr in path_and_class_str.split(","):
+        k, v = kvstr.split(":")
+        configs[k] = v
+
+    for name in ["path", "class"]:
+        if name not in configs:
+            raise RuntimeError(
+                "Invalid --only arguments. Check help message for the correct format"
+            )
+
+    path = configs["path"]
+    class_name = configs["class"]
+
+    if path[:1] != "/":
+        raise RuntimeError(
+            "Use absolute path since dynamo may change the current working directory which makes using relative path tricky"
+        )
+
+    spec = importlib.util.spec_from_file_location("module_name", path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    model_class = getattr(module, class_name)
+    assert issubclass(model_class, torch.nn.Module)
+    model = model_class()
+    assert hasattr(model, "get_example_inputs")
+    inputs = model.get_example_inputs()
+    return model, inputs
+
+
 def output_csv(filename, headers, row):
     assert filename
     existed = os.path.exists(filename)
@@ -1393,7 +1430,31 @@ def parse_args(args=None):
     parser.add_argument(
         "--fast", "-f", action="store_true", help="skip slow benchmarks"
     )
-    parser.add_argument("--only", help="Run just one model")
+    parser.add_argument(
+        "--only",
+        help="""Run just one model from torchbench. Or
+        specify the path and class name of the model in format like:
+        --only=path:<MODEL_FILE_PATH>,class:<CLASS_NAME>
+
+        Due to the fact that dynamo changes current working directory,
+        the path should be an absolute path.
+
+        The class should have a method get_example_inputs to return the inputs
+        for the model. An example looks like
+        ```
+        class LinearModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = nn.Linear(10, 10)
+
+            def forward(self, x):
+                return self.linear(x)
+
+            def get_example_inputs(self):
+                return (torch.randn(2, 10),)
+        ```
+    """,
+    )
     parser.add_argument(
         "--training",
         action="store_true",
@@ -1885,19 +1946,25 @@ def run(runner, args, original_dir=None):
                 batch_size = read_batch_size_from_file(
                     args, args.batch_size_file, model_name
                 )
-            try:
-                device, name, model, example_inputs, batch_size = runner.load_model(
-                    device,
-                    model_name,
-                    batch_size=batch_size,
-                )
-            except NotImplementedError as e:
-                print(e)
-                import traceback
+            if model_specified_by_path(args.only):
+                model, example_inputs = load_model_from_path(args.only)
+                name = model.__class__.__name__
+                model = model.to(device=device)
+                example_inputs = tree_map(lambda x: x.to(device=device), example_inputs)
+            else:
+                try:
+                    device, name, model, example_inputs, batch_size = runner.load_model(
+                        device,
+                        model_name,
+                        batch_size=batch_size,
+                    )
+                except NotImplementedError as e:
+                    print(e)
+                    import traceback
 
-                print(traceback.format_exc())
-                logging.warn(f"{args.only} failed to load")
-                continue  # bad benchmark implementation
+                    print(traceback.format_exc())
+                    logging.warn(f"{args.only} failed to load")
+                    continue  # bad benchmark implementation
 
             current_name = name
             current_device = device

From b94e574edc9d8f9a4ef841c7a5e12b980af566b3 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 16 Nov 2022 00:45:41 +0000
Subject: [PATCH 0930/1922] Revert "Enable correct supported activities for
 kineto on rocm (#88207)"

This reverts commit 35093fc1ab9749e6b763acead007e56b54c6375b.

Reverted https://github.com/pytorch/pytorch/pull/88207 on behalf of https://github.com/kit1980 due to Broke test_kineto on trunk / win-vs2019-cuda11.6-py3 / test (default, 4, 5, windows.8xlarge.nvidia.gpu)
---
 torch/csrc/autograd/init.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 6bfd4bd4bfed6..ee963232d3166 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -279,9 +279,8 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
 
   m.def("_supported_activities", []() {
     std::set<ActivityType> activities{ActivityType::CPU};
-#if defined(USE_KINETO) && \
-    (!defined(LIBKINETO_NOCUPTI) || !defined(LIBKINETO_NOROCTRACER))
-    if (at::getNumGPUs() > 0) {
+#if defined(USE_KINETO) && !defined(LIBKINETO_NOCUPTI)
+    if (at::getNumGPUs() > 0 && !at::hasHIP()) {
       activities.insert(ActivityType::CUDA);
     }
 #endif

From 804556e2e2af8f8d7191be2a98804f68d2fe12c6 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Mon, 14 Nov 2022 11:00:15 -0800
Subject: [PATCH 0931/1922] [functorch] add switch to enable autograd.Function
 (#88784)

This is mostly a debug or "if you know what you're doing" switch for
now. It is not public API.

Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88784
Approved by: https://github.com/samdow, https://github.com/soulitzer
---
 aten/src/ATen/functorch/DynamicLayer.cpp | 13 ++++++++++-
 aten/src/ATen/functorch/DynamicLayer.h   |  6 +++++
 test/functorch/test_eager_transforms.py  | 29 ++++++++++++++++++++++++
 torch/csrc/functorch/init.cpp            |  6 +++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp
index 8a2668fe748b1..bea9e6e3a2f4f 100644
--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@@ -101,7 +101,7 @@ class FuncTorchTLS : public FuncTorchTLSBase {
   }
 
   int64_t checkSupportsAutogradFunction() const override {
-    TORCH_CHECK(dynamicLayerStack.size() == 0,
+    TORCH_CHECK(dynamicLayerStack.size() == 0 || getAutogradFunctionAllowed(),
         "functorch functions (vmap, grad, vjp, etc.) currently do not support the use of autograd.Function. ",
         "Please rewrite your function to not use autograd.Function while we work on fixing this");
     return 0;
@@ -128,6 +128,7 @@ class FuncTorchTLS : public FuncTorchTLSBase {
 
   std::vector<DynamicLayer> dynamicLayerStack;
   bool allow_inplace_requires_grad_ = false;
+  bool allow_autograd_function_ = false;
 };
 
 static FuncTorchTLS* getRawFunctorchTLS() {
@@ -151,6 +152,16 @@ bool getInplaceRequiresGradAllowed() {
   return functorch_tls->allow_inplace_requires_grad_;
 }
 
+void setAutogradFunctionAllowed(bool allowed) {
+  auto* functorch_tls = getRawFunctorchTLS();
+  functorch_tls->allow_autograd_function_ = allowed;
+}
+
+bool getAutogradFunctionAllowed() {
+  auto* functorch_tls = getRawFunctorchTLS();
+  return functorch_tls->allow_autograd_function_;
+}
+
 static std::vector<DynamicLayer>& dynamicLayerStackAccessor() {
   return getRawFunctorchTLS()->dynamicLayerStack;
 }
diff --git a/aten/src/ATen/functorch/DynamicLayer.h b/aten/src/ATen/functorch/DynamicLayer.h
index 576a9621651a4..737620e54ae67 100644
--- a/aten/src/ATen/functorch/DynamicLayer.h
+++ b/aten/src/ATen/functorch/DynamicLayer.h
@@ -113,6 +113,12 @@ TORCH_API Tensor unwrapIfDead(const Tensor& tensor);
 TORCH_API std::ostream& operator<<(std::ostream& os, const DynamicLayer& layer);
 TORCH_API std::ostream& operator<<(std::ostream& os, const std::vector<DynamicLayer>& dynamicLayerStack);
 
+// While a functorch transform is active, autograd.Function is disabled
+// by default. The following two APIs are APIs for enabling
+// autograd.Function. These are not user-facing APIs.
+TORCH_API void setAutogradFunctionAllowed(bool allowed);
+TORCH_API bool getAutogradFunctionAllowed();
+
 // While a functorch grad transform is active, Tensor.requires_grad_() gets
 // disabled. These two functions are the mechanism to controlling that.
 TORCH_API void setInplaceRequiresGradAllowed(bool allowed);
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index e88e8007e77ed..2dc52d3af085c 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -2388,6 +2388,35 @@ def f(x):
         with self.assertRaises(RuntimeError):
             grad(f)(x)
 
+    def test_autograd_function_debug_switch(self, device):
+        class MySin(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, x):
+                ctx.save_for_backward(x)
+                return x.sin()
+
+            @staticmethod
+            def backward(ctx, gy):
+                x, = ctx.saved_tensors
+                return gy * x.cos()
+
+        x = torch.randn([])
+
+        # by default, autograd.Function is disabled in a functorch transform
+        with self.assertRaisesRegex(RuntimeError, "autograd.Function"):
+            grad(MySin.apply)(x)
+
+        # we have a debug switch to allow it
+        self.assertFalse(torch._C._functorch.get_autograd_function_allowed())
+        try:
+            torch._C._functorch.set_autograd_function_allowed(True)
+            self.assertTrue(torch._C._functorch.get_autograd_function_allowed())
+            y = grad(MySin.apply)(x)
+        finally:
+            torch._C._functorch.set_autograd_function_allowed(False)
+        self.assertFalse(torch._C._functorch.get_autograd_function_allowed())
+        self.assertEqual(y, x.cos())
+
     @parametrize('transform', [
         'vmap', 'grad', 'jacrev', 'jacfwd', 'grad_and_value', 'hessian', 'functionalize'
     ])
diff --git a/torch/csrc/functorch/init.cpp b/torch/csrc/functorch/init.cpp
index b1f696ee3c7d0..5248da36baa53 100644
--- a/torch/csrc/functorch/init.cpp
+++ b/torch/csrc/functorch/init.cpp
@@ -438,6 +438,12 @@ void initFuncTorchBindings(PyObject* module) {
   m.def(
       "get_inplace_requires_grad_allowed",
       &at::functorch::getInplaceRequiresGradAllowed);
+  m.def(
+      "set_autograd_function_allowed",
+      &at::functorch::setAutogradFunctionAllowed);
+  m.def(
+      "get_autograd_function_allowed",
+      &at::functorch::getAutogradFunctionAllowed);
   m.def("dlevel", &dlevel, "dlevel");
   m.def("dump_tensor", &dump_tensor, "dump_tensor");
   m.def("reshape_dim_into", &at::functorch::reshape_dim_into);

From 5e2e38a81fe5b513cb44d2825475465544187a29 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Tue, 15 Nov 2022 08:12:03 -0800
Subject: [PATCH 0932/1922] PyDispatcher integration with functorch (#88785)

This PR teaches PyDispatcher and PyOperator about functorch transforms.
It is important that PyDispatcher/PyOperator dispatch with functorch
transforms, because this is our plan for higher-order operators
(operators that accept functions as arguments). Examples of these
include:
- functorch transforms over the existing cond operator (control flow)
- autograd.Function support for functorch (which I am working towards),
- AOTDispatcher (should be a higher order operator)

Concretely, the problem with teaching PyDispatcher/PyOperator about
functorch is that the stack-based dispatching logic (DynamicLayerStack)
is hidden inside the fallbacks for two dispatch keys
(DynamicLayer{Front, Back}). PyDispatcher doesn't know about C++ boxed
fallbacks, our plan on record for that is that we need to reimplement
all of them in Python (but can call helper functions in C++ to make our
lives easier).

Instead of exposing all of what DynamicLayer{Front, Back} do to python,
this PR takes the approach of re-implementing part of the stack-based
dispatching in Python. The motivation is that this is more sane and
follows what the "ideal" implementation of functorch would have been:
- each transform should be a "mode"
- there should be no TLS dispatch key set hackery. functorch needs to do
this hackery today to re-use VariableType implementations.

This PR:
- exposes the DynamicLayerStack to Python
- The DynamicLayerStack is a stack of Interpreters.
These get exposed to Python as well.
- Interpreters can run operations (Interpreter.process) or lower them to
the next interpreter in the stack (Interpreter.lower)
- To use a PyOperator with functorch transforms, a developer needs to
register a rule for each transform (vmap, grad, jvp, ...).
- The PyOperator API is NOT user-facing. Things like autograd.Function
support for functorch will end up going through the autograd.Function
API.

Question for reviewers:
- Does this design make sense?
- I'm trying to split up the "functorch support for autograd.Function"
work into logical pieces. Would it be better if I didn't? (the full
thing is a bit long - 1000-2000 LOC).

Test Plan:
- new tests that construct PyOperator and compose them with functorch
transforms
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88785
Approved by: https://github.com/samdow, https://github.com/soulitzer
---
 aten/src/ATen/functorch/ADInterpreters.cpp |  10 +-
 aten/src/ATen/functorch/ADInterpreters.h   |   6 +-
 aten/src/ATen/functorch/DynamicLayer.cpp   |   4 +-
 aten/src/ATen/functorch/DynamicLayer.h     |   3 +
 test/functorch/test_eager_transforms.py    | 124 ++++++++++++++++++
 torch/_C/_functorch.pyi                    |  34 +++++
 torch/_functorch/__init__.py               |   0
 torch/_functorch/pyfunctorch.py            | 142 +++++++++++++++++++++
 torch/_functorch/utils.py                  |  14 ++
 torch/_ops.py                              |  25 +++-
 torch/csrc/functorch/init.cpp              |  35 +++++
 torch/csrc/utils/python_dispatch.cpp       |  20 +--
 torchgen/model.py                          |   2 +
 13 files changed, 398 insertions(+), 21 deletions(-)
 create mode 100644 torch/_functorch/__init__.py
 create mode 100644 torch/_functorch/pyfunctorch.py
 create mode 100644 torch/_functorch/utils.py

diff --git a/aten/src/ATen/functorch/ADInterpreters.cpp b/aten/src/ATen/functorch/ADInterpreters.cpp
index 46c134f59d61b..174949bbc3b48 100644
--- a/aten/src/ATen/functorch/ADInterpreters.cpp
+++ b/aten/src/ATen/functorch/ADInterpreters.cpp
@@ -28,7 +28,7 @@ static void checkForInvalidMutationOnCaptures(
       "as inputs.");
 }
 
-static Tensor materializeGradWrappers(const Tensor& tensor, int64_t current_level) {
+Tensor materializeGradWrappers(const Tensor& tensor, int64_t current_level) {
   if (!tensor.defined()) {
     return tensor;
   }
@@ -44,6 +44,14 @@ static Tensor materializeGradWrappers(const Tensor& tensor, int64_t current_leve
   return makeTensorWrapper(tensor, current_level, /*is_immutable=*/true);
 }
 
+Tensor GradInterpreterPtr::lift(const Tensor& tensor) const {
+  return materializeGradWrappers(tensor, level());
+}
+
+Tensor JvpInterpreterPtr::lift(const Tensor& tensor) const {
+  return materializeGradWrappers(tensor, level());
+}
+
 static void autogradBasedTransformProcess(
     const c10::OperatorHandle& op,
     torch::jit::Stack* stack,
diff --git a/aten/src/ATen/functorch/ADInterpreters.h b/aten/src/ATen/functorch/ADInterpreters.h
index b8ad638c5aee4..6ec1cca065d61 100644
--- a/aten/src/ATen/functorch/ADInterpreters.h
+++ b/aten/src/ATen/functorch/ADInterpreters.h
@@ -7,7 +7,7 @@ namespace at { namespace functorch {
 // (grad, vjp and jvp).
 // See NOTE: [functorch interpreter stack] for more details.
 
-struct GradInterpreterPtr {
+struct TORCH_API GradInterpreterPtr {
   explicit GradInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Grad); }
   TransformType key() const { return base_->key(); }
   int64_t level() const { return base_->level(); }
@@ -16,11 +16,12 @@ struct GradInterpreterPtr {
   bool prevGradMode() const {
     return c10::get<GradInterpreterMeta>(base_->meta()).prevGradMode_;
   }
+  Tensor lift(const Tensor& tensor) const;
  private:
   const Interpreter* base_;
 };
 
-struct JvpInterpreterPtr {
+struct TORCH_API JvpInterpreterPtr {
   explicit JvpInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Jvp); }
   TransformType key() const { return base_->key(); }
   int64_t level() const { return base_->level(); }
@@ -29,6 +30,7 @@ struct JvpInterpreterPtr {
   bool prevFwdGradMode() const {
     return c10::get<JvpInterpreterMeta>(base_->meta()).prevFwdGradMode_;
   }
+  Tensor lift(const Tensor& tensor) const;
  private:
   const Interpreter* base_;
 };
diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp
index bea9e6e3a2f4f..d152f3c08c2d4 100644
--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@@ -214,7 +214,7 @@ bool areTransformsActive() {
   return !data.empty();
 }
 
-static DynamicLayer popDynamicLayer() {
+DynamicLayer popDynamicLayer() {
   auto& dynamicLayerStack = dynamicLayerStackAccessor();
   TORCH_INTERNAL_ASSERT(dynamicLayerStack.size() > 0);
   auto result = dynamicLayerStack.back();
@@ -232,7 +232,7 @@ static DynamicLayer popDynamicLayer() {
   return result;
 }
 
-static int64_t pushDynamicLayer(DynamicLayer&& dynamic_layer) {
+int64_t pushDynamicLayer(DynamicLayer&& dynamic_layer) {
   auto& dynamicLayerStack = dynamicLayerStackAccessor();
   int64_t layerId = 1 + dynamicLayerStack.size();
   TORCH_INTERNAL_ASSERT(layerId == dynamic_layer.layerId());
diff --git a/aten/src/ATen/functorch/DynamicLayer.h b/aten/src/ATen/functorch/DynamicLayer.h
index 737620e54ae67..6c7139f5c01ea 100644
--- a/aten/src/ATen/functorch/DynamicLayer.h
+++ b/aten/src/ATen/functorch/DynamicLayer.h
@@ -124,5 +124,8 @@ TORCH_API bool getAutogradFunctionAllowed();
 TORCH_API void setInplaceRequiresGradAllowed(bool allowed);
 TORCH_API bool getInplaceRequiresGradAllowed();
 
+TORCH_API DynamicLayer popDynamicLayer();
+TORCH_API int64_t pushDynamicLayer(DynamicLayer&& layer);
+
 }
 } // namespace at
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index 2dc52d3af085c..e123da0d9d3c9 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -37,6 +37,8 @@
 )
 from functorch._src.eager_transforms import enable_fwd_grad, _slice_argnums
 from functorch.experimental import functionalize
+from torch._ops import PyOperator
+from torch._functorch.utils import enable_autograd_function
 
 # NB: numpy is a testing dependency!
 import numpy as np
@@ -3543,6 +3545,123 @@ def forward(self, x_1):
     """)
 
 
+def construct_sum_pyop():
+    mysum = PyOperator("mysum")
+
+    @mysum.py_impl(torch._C._functorch.TransformType.Vmap)
+    def mysum_batch_rule(interpreter, x, dim):
+        if not torch._C._functorch.is_batchedtensor(x):
+            with interpreter.lower():
+                x = x.view_as(x)  # unnecessary, just here to test the dispatch
+                return mysum(x, dim)
+
+        bdim = torch._C._functorch.maybe_get_bdim(x)
+        value = torch._C._functorch.get_unwrapped(x)
+
+        with interpreter.lower():
+            value = value.movedim(bdim, 0)
+            result = mysum(value, dim + 1)
+
+        return torch._C._functorch._add_batch_dim(result, 0, interpreter.level())
+
+    @mysum.py_impl(torch._C._functorch.TransformType.Grad)
+    def mysum_grad_rule(interpreter, x, dim):
+        level = interpreter.level()
+
+        class MySum(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, x, dim):
+                ctx.x_shape = x.shape
+                ctx.dim = dim
+                x = torch._C._functorch._unwrap_for_grad(x, level)
+                with torch.enable_grad(), interpreter.lower():
+                    x = x.view_as(x)  # unnecessary, just here to test the dispatch
+                    y = mysum(x, dim)
+
+                y = torch._C._functorch._wrap_for_grad(y, level)
+                return y
+
+            @staticmethod
+            def backward(ctx, gy):
+                return gy.unsqueeze(ctx.dim).expand(ctx.x_shape), None
+
+        with enable_autograd_function():
+            return MySum.apply(x, dim)
+
+    @mysum.py_impl(torch._C.DispatchKey.AutogradCPU)
+    def mysum_autograd_cpu(x, dim):
+        return torch.sum(x, dim)
+
+    @mysum.py_impl(torch._C.DispatchKey.AutogradCUDA)
+    def mysum_autograd_cuda(x, dim):
+        return torch.sum(x, dim)
+
+    return mysum
+
+sum_pyop = construct_sum_pyop()
+
+class TestPyOperatorInteraction(TestCase):
+
+    def test_basic_sum(self, device):
+        x = torch.randn(2, 3, 4, device=device)
+        result = sum_pyop(x, 1)
+        self.assertEqual(result, torch.sum(x, 1))
+
+    def test_vmap_sum(self, device):
+        x = torch.randn(2, 3, 4, device=device)
+        result = vmap(sum_pyop, (0, None))(x, 0)
+        self.assertEqual(result, torch.sum(x, 1))
+
+        result = vmap(vmap(sum_pyop, (0, None)), (0, None))(x, 0)
+        self.assertEqual(result, torch.sum(x, 2))
+
+    def test_grad_sum(self, device):
+        x = torch.randn(3, device=device)
+        gx = grad(sum_pyop)(x, 0)
+        self.assertEqual(gx, torch.ones_like(x))
+
+    def test_grad_grad_sum(self, device):
+        x = torch.randn(3, requires_grad=True, device=device)
+
+        def f(x):
+            # higher order grad. Requires a non-linearity
+            return sum_pyop(x.sin(), 0)
+
+        def grad_f_sum(x):
+            return grad(f)(x).sum()
+
+        ggx = grad(grad_f_sum)(x)
+        self.assertEqual(ggx, -x.sin())
+
+    def test_vmap_grad_sum(self, device):
+        x = torch.randn(2, 3, device=device)
+        gx = vmap(grad(sum_pyop), (0, None))(x, 0)
+        self.assertEqual(gx, torch.ones_like(x))
+
+    def test_no_grad_outside_grad(self, device):
+        x = torch.randn(3, device=device, requires_grad=True)
+        with torch.no_grad():
+            y = grad(sum_pyop)(x, 0)
+        self.assertEqual(y, torch.ones_like(x))
+        self.assertFalse(y.requires_grad)
+
+    def test_no_grad_inside_grad(self, device):
+        def f(x):
+            with torch.no_grad():
+                shift = sum_pyop(x ** 2, 0)
+            return sum_pyop(x ** 2, 0) - shift
+
+        x = torch.randn(3, device=device)
+        y = grad(f)(x)
+        self.assertEqual(y, 2 * x)
+        y = grad(lambda x: grad(f)(x).sum())(x)
+        self.assertEqual(y, torch.full_like(x, 2))
+
+        x = torch.randn(3, device=device, requires_grad=True)
+        y = grad(f)(x)
+        z, = torch.autograd.grad(y.sum(), x)
+        self.assertEqual(z, torch.full_like(x, 2))
+
 
 only_for = ("cpu", "cuda")
 instantiate_device_type_tests(
@@ -3585,6 +3704,11 @@ def forward(self, x_1):
     globals(),
     only_for=only_for,
 )
+instantiate_device_type_tests(
+    TestPyOperatorInteraction,
+    globals(),
+    only_for=only_for,
+)
 instantiate_device_type_tests(
     TestFunctionalize,
     globals(),
diff --git a/torch/_C/_functorch.pyi b/torch/_C/_functorch.pyi
index 6ab5f91b78f1e..bb9649daadcbb 100644
--- a/torch/_C/_functorch.pyi
+++ b/torch/_C/_functorch.pyi
@@ -1,4 +1,5 @@
 from torch import Tensor
+from enum import Enum
 
 # Defined in torch/csrc/functorch/init.cpp
 
@@ -10,3 +11,36 @@ def is_functorch_wrapped_tensor(tensor: Tensor) -> bool: ...
 def is_gradtrackingtensor(tensor: Tensor) -> bool: ...
 def maybe_get_bdim(tensor: Tensor) -> int: ...
 def maybe_get_level(tensor: Tensor) -> int: ...
+
+def set_autograd_function_allowed(allowed: bool) -> None: ...
+def get_autograd_function_allowed() -> bool: ...
+
+# Defined in aten/src/ATen/functorch/Interpreter.h
+class TransformType(Enum):
+    Torch: TransformType = ...
+    Vmap: TransformType = ...
+    Grad: TransformType = ...
+    Jvp: TransformType = ...
+    Functionalize: TransformType = ...
+
+class CInterpreter:
+    def key(self) -> TransformType: ...
+    def level(self) -> int: ...
+
+class CGradInterpreterPtr:
+    def __init__(self, interpreter: CInterpreter): ...
+    def lift(self, Tensor) -> Tensor: ...
+    def prevGradMode(self) -> bool: ...
+
+class CVmapInterpreterPtr:
+    def __init__(self, interpreter: CInterpreter): ...
+    def key(self) -> TransformType: ...
+    def level(self) -> int: ...
+    def batchSize(self) -> int: ...
+
+class DynamicLayer:
+    pass
+
+def peek_interpreter_stack() -> CInterpreter: ...
+def pop_dynamic_layer_stack() -> DynamicLayer: ...
+def push_dynamic_layer_stack(dl: DynamicLayer) -> int: ...
diff --git a/torch/_functorch/__init__.py b/torch/_functorch/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/torch/_functorch/pyfunctorch.py b/torch/_functorch/pyfunctorch.py
new file mode 100644
index 0000000000000..1ada5b4e19771
--- /dev/null
+++ b/torch/_functorch/pyfunctorch.py
@@ -0,0 +1,142 @@
+from abc import ABC, abstractmethod
+import contextlib
+from typing import Any
+import torch
+import torch.utils._pytree as pytree
+from torch._C._functorch import (
+    TransformType,
+    CInterpreter,
+    CGradInterpreterPtr,
+    CVmapInterpreterPtr,
+    pop_dynamic_layer_stack,
+    push_dynamic_layer_stack,
+)
+
+"""
+This file contains the functorch integration with PyDispatcher.
+
+PyDispatcher does not understand functorch's DynamicLayerStack dispatching
+logic because it is entirely implemented in C++ in the fallbacks for two
+dispatch keys, FuncTorchDynamicLayer{Front, Back}Mode (PyDispatcher is unable
+to directly reuse C++ boxed fallbacks).
+
+Instead of trying to hammer PyDispatcher into understanding those fallbacks,
+we re-implement the logic of peeking the top of the stack for an interpreter,
+selecting the interpreter to dispatch on, etc, in Python. This leads to a
+simpler design.
+
+The main difference between C++ functorch and PyDispatcher's functorch logic
+is that:
+- C++ functorch needs to manually tweak dispatch keys to ping-pong between
+  DynamicLayerFrontMode and DynamicLayerBackMode.
+- PyDispatcher's functorch logic pops an Interpreter from the top of the stack
+  and asks it to execute the rule associated with the Interpreter.
+
+In C++ we do the ping-pong because e.g. vmap rules are associated with the
+batched DispatchKey, but in PyDispatcher we are able to avoid this by asking
+the user to register a batching rule directly to a transform that an
+interpreter then invokes.
+"""
+
+
+# FuncTorchInterpreter is the Python version of Interpreter (recall that
+# the DynamicLayerStack is a stack of interpreters).
+# It is a wrapper around the actual C++ Interpreter object.
+#
+# Keep the methods in sync with aten/src/ATen/functorch/Interpreter.h
+class FuncTorchInterpreter(ABC):
+    def __init__(self, cptr: Any):
+        self._cptr = cptr
+
+    # Process an operation. eg for vmap, this is invoking a batching rule.
+    # Conceptually this is analogous to Interpreter::process in C++
+    @abstractmethod
+    def process(self, op, args, kwargs):
+        pass
+
+    # lower an operation from this Interpreter to the next Interpreter on the stack.
+    # Concretely, this involves temporarily popping the current Interpreter.
+    # Conceptually this is analogous to Interpreter::sendToNextInterpreter in C++
+    def lower(self):
+        return temporarily_pop_interpreter_stack()
+
+    def level(self):
+        return self._cptr.level()
+
+    def key(self):
+        return self._cptr.key()
+
+
+@contextlib.contextmanager
+def temporarily_pop_interpreter_stack():
+    try:
+        saved = pop_dynamic_layer_stack()
+        yield
+    finally:
+        push_dynamic_layer_stack(saved)
+
+
+class VmapInterpreter(FuncTorchInterpreter):
+    def __init__(self, cdata: CInterpreter):
+        assert cdata.key() == TransformType.Vmap
+        # NOTE: [Interpreter cdata vs cptr]
+        # cdata is a generic CInterpreter. We wrap it in a CVmapInterpreterPtr
+        # so that we can access methods specific to the vmap interpreter
+        self._cdata = cdata
+        self._cptr = CVmapInterpreterPtr(cdata)
+
+    def process(self, op, args, kwargs):
+        kernel = op.functorch_table[TransformType.Vmap]
+        return kernel(self, *args, **kwargs)
+
+    def batch_size(self):
+        return self._cptr.batchSize()
+
+
+class GradInterpreter(FuncTorchInterpreter):
+    def __init__(self, cdata: CInterpreter):
+        assert cdata.key() == TransformType.Grad
+        # See NOTE: [Interpreter cdata vs cptr]
+        self._cdata = cdata
+        self._cptr = CGradInterpreterPtr(cdata)
+
+    def lift(self, args, kwargs):
+        args, kwargs = pytree.tree_map_only(torch.Tensor, self._cptr.lift, [args, kwargs])
+        return args, kwargs
+
+    def process(self, op, args, kwargs):
+        kernel = op.functorch_table[TransformType.Grad]
+        args, kwargs = self.lift(args, kwargs)
+        return kernel(self, *args, **kwargs)
+
+    # GradInterpreter has custom lower because of the no_grad interaction
+    # See NOTE [grad and vjp interaction with no_grad]
+    # This logic is mirrored from C++ GradInterpreterPtr::sendToNextInterpreter
+    def lower(self):
+        prev_grad_mode = self.prev_grad_mode()
+        if not self.prev_grad_mode:
+            return contextlib.nested(torch.no_grad(), super().lower())
+        return super().lower()
+
+    def prev_grad_mode(self):
+        return self._cptr.prevGradMode()
+
+
+def coerce_cinterpreter(cinterpreter: CInterpreter) -> FuncTorchInterpreter:
+    key = cinterpreter.key()
+    if key == TransformType.Grad:
+        return GradInterpreter(cinterpreter)
+    if key == TransformType.Vmap:
+        return VmapInterpreter(cinterpreter)
+    raise RuntimeError(f"NYI: PyDispatcher has not implemented support for {key}")
+
+
+def retrieve_current_functorch_interpreter():
+    interpreter = torch._C._functorch.peek_interpreter_stack()
+    assert interpreter is not None
+    return coerce_cinterpreter(interpreter)
+
+
+def dispatch_functorch(op, args, kwargs):
+    interpreter = retrieve_current_functorch_interpreter()
+    return interpreter.process(op, args, kwargs)
diff --git a/torch/_functorch/utils.py b/torch/_functorch/utils.py
new file mode 100644
index 0000000000000..c1474ba90fe3e
--- /dev/null
+++ b/torch/_functorch/utils.py
@@ -0,0 +1,14 @@
+import contextlib
+from torch._C._functorch import (
+    set_autograd_function_allowed,
+    get_autograd_function_allowed,
+)
+
+@contextlib.contextmanager
+def enable_autograd_function():
+    try:
+        prev_state = get_autograd_function_allowed()
+        set_autograd_function_allowed(True)
+        yield
+    finally:
+        set_autograd_function_allowed(prev_state)
diff --git a/torch/_ops.py b/torch/_ops.py
index 4c194e9d938bb..9163932144d0d 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -10,6 +10,7 @@
 
 import torch.jit
 from torch import _utils_internal
+from torch._functorch.pyfunctorch import dispatch_functorch
 
 # Query `hasattr` only once.
 
@@ -114,6 +115,7 @@ def __init__(self, name):
         self._name = name
         self.table = {}
         self.python_key_mode_table = {}
+        self.functorch_table = {}
 
         # Make _OPNamespace not scream, this whole name based association needs a good hard look
         self.__name__ = name
@@ -122,18 +124,26 @@ def __init__(self, name):
     def fallthrough(self, dispatch_key):
         self.table[dispatch_key] = self._fallthrough_fn(self, dispatch_key)
 
-    def py_impl(self, dispatch_key_or_mode):
+    def py_impl(self, dispatch_key_or_mode_or_transform):
         def inner(fn):
-            if inspect.isclass(dispatch_key_or_mode) and issubclass(
-                dispatch_key_or_mode, torch.utils._python_dispatch.TorchDispatchMode
+            if inspect.isclass(dispatch_key_or_mode_or_transform) and issubclass(
+                dispatch_key_or_mode_or_transform,
+                torch.utils._python_dispatch.TorchDispatchMode,
             ):
-                mode = dispatch_key_or_mode
+                mode = dispatch_key_or_mode_or_transform
                 assert mode not in self.python_key_mode_table
                 # TODO(voz): Should we replace setting torch._C.DispatchKey.Python entirely with setting mode keys?
                 self.python_key_mode_table[mode] = fn
                 return fn
 
-            dispatch_key = dispatch_key_or_mode
+            if isinstance(
+                dispatch_key_or_mode_or_transform, torch._C._functorch.TransformType
+            ):
+                transform = dispatch_key_or_mode_or_transform
+                self.functorch_table[transform] = fn
+                return fn
+
+            dispatch_key = dispatch_key_or_mode_or_transform
             assert (
                 dispatch_key != torch._C.DispatchKey.Python
             ), "Please register a mode for the torch._C.DispatchKey.Python key instead."
@@ -147,6 +157,9 @@ def inner(fn):
     def dispatch(self, dispatch_key, *args, **kwargs):
         from torch.utils._python_dispatch import _get_current_dispatch_mode
 
+        if dispatch_key == torch._C.DispatchKey.FuncTorchDynamicLayerFrontMode:
+            return dispatch_functorch(self, args, kwargs)
+
         if dispatch_key == torch._C.DispatchKey.Python:
             # TODO(voz): We should walk all the nodes here / turn it into a list, topmode is ok for now.
             curr_mode = type(_get_current_dispatch_mode())
@@ -159,7 +172,7 @@ def dispatch(self, dispatch_key, *args, **kwargs):
             # TODO(voz): The idea behind this is that we do not yet support dispatch by key + mode, only key.
             return self.python_key_mode_table[curr_mode](*args, **kwargs)
 
-        assert dispatch_key in self.table
+        assert dispatch_key in self.table, dispatch_key
         return self.table[dispatch_key](*args, **kwargs)
 
     def __call__(self, *args, **kwargs):
diff --git a/torch/csrc/functorch/init.cpp b/torch/csrc/functorch/init.cpp
index 5248da36baa53..65a3b3415b7e2 100644
--- a/torch/csrc/functorch/init.cpp
+++ b/torch/csrc/functorch/init.cpp
@@ -12,6 +12,7 @@
 #include <ATen/functorch/BatchedFallback.h>
 #include <ATen/functorch/BatchedTensorImpl.h>
 #include <ATen/functorch/DynamicLayer.h>
+#include <ATen/functorch/Interpreter.h>
 #include <ATen/functorch/LegacyVmapTransforms.h>
 #include <ATen/functorch/PlumbingHelper.h>
 #include <ATen/functorch/TensorWrapper.h>
@@ -467,6 +468,40 @@ void initFuncTorchBindings(PyObject* module) {
   m.def("is_functorch_wrapped_tensor", [](const Tensor& tensor) {
     return maybe_get_level(tensor) != -1;
   });
+  m.def("peek_interpreter_stack", []() -> c10::optional<Interpreter> {
+    const auto& stack = getDynamicLayerStack();
+    if (stack.size() == 0) {
+      return c10::nullopt;
+    }
+    auto result = stack.back().interpreter();
+    return result;
+  });
+  m.def("pop_dynamic_layer_stack", &popDynamicLayer);
+  m.def("push_dynamic_layer_stack", [](DynamicLayer layer) -> int64_t {
+    return pushDynamicLayer(std::move(layer));
+  });
+  py::class_<DynamicLayer>(m, "DynamicLayer");
+
+  py::enum_<TransformType>(m, "TransformType")
+      .value("Torch", TransformType::Torch)
+      .value("Grad", TransformType::Grad)
+      .value("Jvp", TransformType::Jvp)
+      .value("Functionalize", TransformType::Functionalize)
+      .value("Vmap", TransformType::Vmap);
+  py::class_<Interpreter>(m, "CInterpreter")
+      .def("key", &Interpreter::key)
+      .def("level", &Interpreter::level);
+  py::class_<GradInterpreterPtr>(m, "CGradInterpreterPtr")
+      .def(py::init<const Interpreter*>())
+      .def("key", &GradInterpreterPtr::key)
+      .def("level", &GradInterpreterPtr::level)
+      .def("lift", &GradInterpreterPtr::lift)
+      .def("prevGradMode", &GradInterpreterPtr::prevGradMode);
+  py::class_<VmapInterpreterPtr>(m, "CVmapInterpreterPtr")
+      .def(py::init<const Interpreter*>())
+      .def("key", &VmapInterpreterPtr::key)
+      .def("level", &VmapInterpreterPtr::level)
+      .def("batchSize", &VmapInterpreterPtr::batchSize);
 }
 
 } // namespace impl
diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp
index 0ff1f575a61aa..662ab9981a1d9 100644
--- a/torch/csrc/utils/python_dispatch.cpp
+++ b/torch/csrc/utils/python_dispatch.cpp
@@ -479,14 +479,14 @@ void initDispatchBindings(PyObject* module) {
 
 #define DEF_ONE(n) .value(#n, c10::DispatchKey::n)
 
-  py::enum_<c10::DispatchKey>(m, "DispatchKey") DEF_ONE(Undefined)
-      DEF_ONE(CompositeExplicitAutogradNonFunctional)
-          DEF_ONE(CompositeExplicitAutograd)
-              DEF_ONE(CompositeImplicitAutogradNestedTensor)
-                  DEF_ONE(CompositeImplicitAutograd) DEF_ONE(AutogradOther)
-                      DEF_ONE(Autograd) DEF_ONE(BackendSelect)
-                          DEF_ONE(ADInplaceOrView) DEF_ONE(PythonTLSSnapshot)
-                              DEF_ONE(Python)
+  py::enum_<c10::DispatchKey>(m, "DispatchKey") DEF_ONE(Undefined) DEF_ONE(
+      CompositeExplicitAutogradNonFunctional) DEF_ONE(CompositeExplicitAutograd)
+      DEF_ONE(CompositeImplicitAutogradNestedTensor)
+          DEF_ONE(CompositeImplicitAutograd) DEF_ONE(AutogradOther)
+              DEF_ONE(Autograd) DEF_ONE(BackendSelect) DEF_ONE(ADInplaceOrView)
+                  DEF_ONE(PythonTLSSnapshot) DEF_ONE(Python)
+                      DEF_ONE(FuncTorchDynamicLayerFrontMode)
+                          DEF_ONE(FuncTorchDynamicLayerBackMode)
 
 #define DEF_SINGLE(n, prefix) .value(#prefix #n, c10::DispatchKey::prefix##n)
 #define DEF_MULTIPLE(fullname, prefix)              \
@@ -495,11 +495,11 @@ void initDispatchBindings(PyObject* module) {
   C10_FORALL_BACKEND_COMPONENTS(DEF_SINGLE, prefix) \
   DEF_SINGLE(, EndOf##fullname##Backends)
 
-                                  C10_FORALL_FUNCTIONALITY_KEYS(DEF_MULTIPLE)
+                              C10_FORALL_FUNCTIONALITY_KEYS(DEF_MULTIPLE)
 
 #undef DEF_MULTIPLE
 #undef DEF_SINGLE
-                                      ;
+                                  ;
 
   py::class_<c10::DispatchKeySet>(m, "DispatchKeySet")
       .def(py::init<c10::DispatchKey>())
diff --git a/torchgen/model.py b/torchgen/model.py
index a2a658d0a59c1..d57d3372a159a 100644
--- a/torchgen/model.py
+++ b/torchgen/model.py
@@ -81,6 +81,7 @@ class DispatchKey(Enum):
     SparseCsrCUDA = auto()
 
     Python = auto()
+    FuncTorchDynamicLayerBackMode = auto()
     ZeroTensor = auto()
     BackendSelect = auto()
     Named = auto()
@@ -91,6 +92,7 @@ class DispatchKey(Enum):
     Autocast = auto()
     Batched = auto()
     VmapMode = auto()
+    FuncTorchDynamicLayerFrontMode = auto()
     TESTING_ONLY_GenericWrapper = auto()
     TESTING_ONLY_GenericMode = auto()
 

From b7c14fec1b3660d415288546ea87af0c2902ea35 Mon Sep 17 00:00:00 2001
From: Salil Desai <salilsdesai@meta.com>
Date: Wed, 16 Nov 2022 00:56:12 +0000
Subject: [PATCH 0933/1922] Fix and Re-enable
 test_quantize_fx_lite_script_module.py (#88897)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary: After D35984526 (https://github.com/pytorch/pytorch/commit/416899d1a9fcb9dbc8bb66ed796b86360f573903), ```torch.ao.quantization.quantize_fx.prepare_fx``` requires passing in  ```example_args```. This diff fixes the calls to ```prepare_fx``` in this test by adding in ```example_args``` as necessary.

Test Plan:
```
buck test caffe2/test:fx_quantization_lite
```

```
  ✓ ListingSuccess: caffe2/test:fx_quantization_lite : 3 tests discovered (39.689)
    ✓ Pass: caffe2/test:fx_quantization_lite - test_conv2d (mobile.test_quantize_fx_lite_script_module.TestLiteFuseFx) (44.451)
    ✓ Pass: caffe2/test:fx_quantization_lite - test_embedding (mobile.test_quantize_fx_lite_script_module.TestLiteFuseFx) (45.462)
    ✓ Pass: caffe2/test:fx_quantization_lite - test_submodule (mobile.test_quantize_fx_lite_script_module.TestLiteFuseFx) (45.933)
Summary
  Pass: 3
  ListingSuccess: 1
Finished test run: https://www.internalfb.com/intern/testinfra/testrun/3096224827259146
```

Differential Revision: D41227335

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88897
Approved by: https://github.com/dagitses
---
 test/mobile/test_quantize_fx_lite_script_module.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/test/mobile/test_quantize_fx_lite_script_module.py b/test/mobile/test_quantize_fx_lite_script_module.py
index 44beeef818c33..ebc96d17697bd 100644
--- a/test/mobile/test_quantize_fx_lite_script_module.py
+++ b/test/mobile/test_quantize_fx_lite_script_module.py
@@ -47,7 +47,11 @@ def forward(self, indices):
 
         for qconfig, node in configs:
             qconfig_dict = {"": qconfig}
-            m = prepare_fx(model, qconfig_dict)
+            m = prepare_fx(
+                model,
+                qconfig_dict,
+                example_inputs=torch.randint(low=0, high=10, size=(20,)),
+            )
             m = convert_fx(m)
             self._compare_script_and_mobile(m, input=indices)
 
@@ -65,7 +69,7 @@ def forward(self, x):
 
         m = M().eval()
         qconfig_dict = {"": default_qconfig, "module_name": [("conv1", None)]}
-        m = prepare_fx(m, qconfig_dict)
+        m = prepare_fx(m, qconfig_dict, example_inputs=torch.randn(1, 1, 1, 1))
         data = torch.randn(1, 1, 1, 1)
         m = convert_fx(m)
         # first conv is quantized, second conv is not quantized
@@ -84,7 +88,11 @@ def test_submodule(self):
                 "": torch.ao.quantization.get_default_qconfig("qnnpack"),
                 **config,
             }
-            model = prepare_fx(model, qconfig_dict)
+            model = prepare_fx(
+                model,
+                qconfig_dict,
+                example_inputs=torch.randn(5, 5),
+            )
             quant = convert_fx(model)
 
             x = torch.randn(5, 5)

From 15c1e1cda8cbca11ac9c55a80bbeb56b09838bf1 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 15 Nov 2022 06:32:36 -0800
Subject: [PATCH 0934/1922] Hide ConvParams struct from ConvUtils.h (#89059)

It isn't actually used outside of Convolution.cpp, so no reason
to publish it.  I intend to turn this into a template, so moving
it with the method definitions is very convenient.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89059
Approved by: https://github.com/SherlockNoMad
---
 aten/src/ATen/native/ConvUtils.h     | 55 ----------------------------
 aten/src/ATen/native/Convolution.cpp | 55 ++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
index 675f701c8582d..b8e2b0842a002 100644
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@@ -80,40 +80,6 @@ static inline bool cudnnv8_use_heur_mode_b() {
   return cudnnv8_heuristic_mode_b;
 }
 
-// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-struct ConvParams {
-  std::vector<int64_t> stride;
-  std::vector<int64_t> padding;
-  std::vector<int64_t> dilation;
-  bool transposed;
-  std::vector<int64_t> output_padding;
-  int groups;
-  bool benchmark;
-  bool deterministic;
-  bool cudnn_enabled;
-  bool allow_tf32;
-
-  bool is_strided() const;
-  bool is_dilated() const;
-  bool is_padded() const;
-  bool is_output_padding_neg() const;
-  bool is_output_padding_big() const;
-  bool is_padding_neg() const;
-  bool is_stride_nonpos() const;
-  void view1d_as_2d();
-  bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias) const;
-  bool needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_cudnn(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_miopen(const at::Tensor& input, const at::Tensor& weight, bool bias_defined) const;
-  bool use_mkldnn(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_nnpack(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight,
-                   const at::OptionalIntArrayRef bias_sizes_opt) const;
-  bool use_mps(const at::Tensor& input, const at::Tensor& weight) const;
-  bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
-};
-
 // Keep in sync with py::enum_ in Module.cpp
 enum class ConvBackend {
   CudaDepthwise2d,
@@ -140,27 +106,6 @@ enum class ConvBackend {
   MpsTranspose,
 };
 
-// Function to select the convolution backend based on the inputs and params.
-// This overload is used within the convolution internals but not exposed to python.
-// NB: The forward pass provides a bias tensor while the backward pass provides
-// a bool indicating whether the bias is defined. This is done to save memory by
-// avoiding saving the full bias tensor for backward.
-TORCH_API ConvBackend _select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const c10::optional<Tensor>& bias_opt,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params);
-
-// For BC reasons, have a copy that does not require bias_opt
-TORCH_API ConvBackend select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params);
-
 // Overload for selecting the convolution backend from the full set of convolution inputs.
 // This overload is exposed to python for testing, etc.
 TORCH_API ConvBackend select_conv_backend(
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 109f0ac059220..e87d98357ca99 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -82,6 +82,61 @@ constexpr int MIOPEN_DIM_MAX = 5;
 
 namespace at { namespace native {
 
+// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+struct ConvParams {
+  std::vector<int64_t> stride;
+  std::vector<int64_t> padding;
+  std::vector<int64_t> dilation;
+  bool transposed;
+  std::vector<int64_t> output_padding;
+  int groups;
+  bool benchmark;
+  bool deterministic;
+  bool cudnn_enabled;
+  bool allow_tf32;
+
+  bool is_strided() const;
+  bool is_dilated() const;
+  bool is_padded() const;
+  bool is_output_padding_neg() const;
+  bool is_output_padding_big() const;
+  bool is_padding_neg() const;
+  bool is_stride_nonpos() const;
+  void view1d_as_2d();
+  bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias) const;
+  bool needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const;
+  bool use_cudnn(const at::Tensor& input, const at::Tensor& weight) const;
+  bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
+  bool use_miopen(const at::Tensor& input, const at::Tensor& weight, bool bias_defined) const;
+  bool use_mkldnn(const at::Tensor& input, const at::Tensor& weight) const;
+  bool use_nnpack(const at::Tensor& input, const at::Tensor& weight) const;
+  bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight,
+                   const at::OptionalIntArrayRef bias_sizes_opt) const;
+  bool use_mps(const at::Tensor& input, const at::Tensor& weight) const;
+  bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
+};
+
+// Function to select the convolution backend based on the inputs and params.
+// This overload is used within the convolution internals but not exposed to python.
+// NB: The forward pass provides a bias tensor while the backward pass provides
+// a bool indicating whether the bias is defined. This is done to save memory by
+// avoiding saving the full bias tensor for backward.
+ConvBackend _select_conv_backend(
+    const Tensor& input,
+    const Tensor& weight,
+    const c10::optional<Tensor>& bias_opt,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const bool need_backward,
+    const ConvParams& params);
+
+// For BC reasons, have a copy that does not require bias_opt
+ConvBackend select_conv_backend(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const bool need_backward,
+    const ConvParams& params);
+
 DEFINE_DISPATCH(conv_depthwise2d_backward_stub);
 DEFINE_DISPATCH(conv_depthwise3d_backward_stub);
 DEFINE_DISPATCH(cudnn_convolution_backward_stub);

From c3908171989327539984a23e62b126b524dd4e1b Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 15 Nov 2022 06:41:53 -0800
Subject: [PATCH 0935/1922] Move ConvParams methods directly on struct (#89062)

This reduces boilerplate.  Also, I plan to add a template
parameter to ConvParams; without moving the methods onto the
struct, I would have to manually template every method.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89062
Approved by: https://github.com/SherlockNoMad
---
 aten/src/ATen/native/Convolution.cpp | 734 +++++++++++++--------------
 1 file changed, 352 insertions(+), 382 deletions(-)

diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index e87d98357ca99..29b2ce804c806 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -82,359 +82,6 @@ constexpr int MIOPEN_DIM_MAX = 5;
 
 namespace at { namespace native {
 
-// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-struct ConvParams {
-  std::vector<int64_t> stride;
-  std::vector<int64_t> padding;
-  std::vector<int64_t> dilation;
-  bool transposed;
-  std::vector<int64_t> output_padding;
-  int groups;
-  bool benchmark;
-  bool deterministic;
-  bool cudnn_enabled;
-  bool allow_tf32;
-
-  bool is_strided() const;
-  bool is_dilated() const;
-  bool is_padded() const;
-  bool is_output_padding_neg() const;
-  bool is_output_padding_big() const;
-  bool is_padding_neg() const;
-  bool is_stride_nonpos() const;
-  void view1d_as_2d();
-  bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias) const;
-  bool needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_cudnn(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_miopen(const at::Tensor& input, const at::Tensor& weight, bool bias_defined) const;
-  bool use_mkldnn(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_nnpack(const at::Tensor& input, const at::Tensor& weight) const;
-  bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight,
-                   const at::OptionalIntArrayRef bias_sizes_opt) const;
-  bool use_mps(const at::Tensor& input, const at::Tensor& weight) const;
-  bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const;
-};
-
-// Function to select the convolution backend based on the inputs and params.
-// This overload is used within the convolution internals but not exposed to python.
-// NB: The forward pass provides a bias tensor while the backward pass provides
-// a bool indicating whether the bias is defined. This is done to save memory by
-// avoiding saving the full bias tensor for backward.
-ConvBackend _select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const c10::optional<Tensor>& bias_opt,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params);
-
-// For BC reasons, have a copy that does not require bias_opt
-ConvBackend select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params);
-
-DEFINE_DISPATCH(conv_depthwise2d_backward_stub);
-DEFINE_DISPATCH(conv_depthwise3d_backward_stub);
-DEFINE_DISPATCH(cudnn_convolution_backward_stub);
-DEFINE_DISPATCH(cudnn_convolution_transpose_backward_stub);
-DEFINE_DISPATCH(slow_conv_transpose3d_backward_stub);
-DEFINE_DISPATCH(convolution_depthwise3x3_winograd_stub);
-DEFINE_DISPATCH(miopen_convolution_backward_stub);
-DEFINE_DISPATCH(miopen_convolution_transpose_backward_stub);
-DEFINE_DISPATCH(miopen_depthwise_convolution_backward_stub);
-DEFINE_DISPATCH(mkldnn_convolution_backward_stub);
-DEFINE_DISPATCH(slow_conv_dilated2d_backward_stub);
-DEFINE_DISPATCH(slow_conv_dilated3d_backward_stub);
-DEFINE_DISPATCH(slow_conv_transpose2d_backward_stub);
-REGISTER_NO_CPU_DISPATCH(conv_depthwise2d_backward_stub);
-REGISTER_NO_CPU_DISPATCH(conv_depthwise3d_backward_stub);
-REGISTER_NO_CPU_DISPATCH(cudnn_convolution_backward_stub);
-REGISTER_NO_CPU_DISPATCH(cudnn_convolution_transpose_backward_stub);
-REGISTER_NO_CPU_DISPATCH(miopen_convolution_backward_stub);
-REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub);
-REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub);
-
-std::ostream& operator<<(std::ostream & out, const ConvParams& params) {
-  out << "ConvParams {"
-      << "  stride = " << IntArrayRef{params.stride}
-      << "  padding = " << IntArrayRef{params.padding}
-      << "  dilation = " << IntArrayRef{params.dilation}
-      << "  transposed = " << params.transposed
-      << "  output_padding = " << IntArrayRef{params.output_padding}
-      << "  groups = " << params.groups
-      << "  benchmark = " << params.benchmark
-      << "  deterministic = " << params.deterministic
-      << "  cudnn_enabled = " << params.cudnn_enabled
-      << "  allow_tf32 = " << params.allow_tf32
-      << "}";
-  return out;
-}
-
-auto ConvParams::is_strided() const -> bool {
-  bool is_strided = false;
-  for (auto s : stride) {
-    is_strided |= (s != 1);
-  }
-  return is_strided;
-}
-
-auto ConvParams::is_dilated() const -> bool {
-  bool is_dilated = false;
-  for (auto d : dilation) {
-    is_dilated |= (d != 1);
-  }
-  return is_dilated;
-}
-
-auto ConvParams::is_padded() const -> bool {
-  bool is_padded = false;
-  for (auto p : padding) {
-    is_padded |= (p != 0);
-  }
-  return is_padded;
-}
-
-auto ConvParams::is_output_padding_neg() const -> bool {
-  bool is_non_neg = false;
-  for (auto p : output_padding) {
-    is_non_neg |= (p < 0);
-  }
-  return is_non_neg;
-}
-
-auto ConvParams::is_output_padding_big() const -> bool {
-  bool is_big = false;
-  for (auto i: c10::irange(output_padding.size())) {
-    is_big |= (output_padding[i] >= stride[i]);
-  }
-  return is_big;
-}
-
-auto ConvParams::is_padding_neg() const -> bool {
-  bool is_non_neg = false;
-  for (auto p : padding) {
-    is_non_neg |= (p < 0);
-  }
-  return is_non_neg;
-}
-
-auto ConvParams::is_stride_nonpos() const -> bool {
-  bool is_nonpos = false;
-  for (auto s : stride) {
-    is_nonpos |= (s <= 0);
-  }
-  return is_nonpos;
-}
-
-auto ConvParams::view1d_as_2d() -> void {
-  if (stride.size() == 1) {
-    stride.insert(stride.begin(), 1);
-    padding.insert(padding.begin(), 0);
-    dilation.insert(dilation.begin(), 1);
-    output_padding.insert(output_padding.begin(), 0);
-  }
-}
-
-auto ConvParams::use_cpu_depthwise3x3_winograd(
-    const at::Tensor& input,
-    const at::Tensor& weight,
-    const c10::optional<at::Tensor>& bias) const -> bool {
-#if defined(__ARM_NEON__)
-  // Currently only 3x3 depthwise convolutions on tensors of float are supported.
-  return (input.ndimension() == 4) &&
-         (input.size(1) == groups) &&
-         (weight.ndimension() == 4 ) &&
-         (weight.size(0) % input.size(1) == 0) &&
-         (weight.size(1) == 1) &&
-         (weight.size(2) == 3) &&
-         (weight.size(3) == 3) &&
-         (input.device().is_cpu()) &&
-         (input.scalar_type() == at::kFloat) &&
-         input.is_contiguous() &&
-         (weight.device().is_cpu()) &&
-         (weight.scalar_type() == at::kFloat) &&
-         weight.is_contiguous() &&
-         (!bias.has_value() || bias->is_contiguous()) &&
-         !is_strided() &&
-         !is_dilated() &&
-         !transposed;
-#else
-  return false;
-#endif
-}
-
-auto ConvParams::needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const -> bool {
-  constexpr int64_t int_max = std::numeric_limits<int>::max();
-  int64_t numel_input = input.numel();
-  // empty input
-  if (numel_input == 0) {
-    return false;
-  }
-  // input size can not be reduced to the range of int by splitting the batch dim
-  int64_t n = input.size(0);
-  if (numel_input / n > int_max) {
-    return true;
-  }
-  // output size can not be reduced to the range of int by splitting the batch dim
-  int64_t outsize = 1;
-  if (transposed) {
-    std::vector<int64_t> o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups);
-    outsize = c10::multiply_integers(o.begin() + 1, o.end());
-  } else {
-    std::vector<int64_t> o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation);
-    outsize = c10::multiply_integers(o.begin() + 1, o.end());
-  }
-  return outsize > int_max;
-}
-
-auto ConvParams::use_cudnn(const at::Tensor& input, const at::Tensor& weight) const -> bool {
-
-// Note [Mobile check segfaults]
-// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
-// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
-#if !defined(C10_MOBILE)
-  if (needs_64bit_indexing_no_split(input, weight)) {
-    return false;
-  }
-  if (!detail::getCUDAHooks().compiledWithCuDNN()) {
-    return false;
-  }
-  if (!input.is_cuda() || !cudnn_enabled) {
-    return false;
-  }
-  if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
-    if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
-      return false;
-    }
-  }
-  if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous) {
-    // bypass dilation checks for channels_last convolution
-    if (deterministic && is_dilated()) {
-      // cudnn doesn't support deterministic dilated convolution fully yet
-      return false;
-    }
-    if (is_dilated()) {
-      return detail::getCUDAHooks().supportsDilatedConvolutionWithCuDNN() && !is_output_padding_big();
-    }
-  }
-  return !is_output_padding_big();
-#else
-  return false;
-#endif
-}
-
-auto ConvParams::use_mps( const at::Tensor& input, const at::Tensor& weight) const -> bool {
-  // These checks need to be expanded. Currently we have very limited set of
-  // checks for MPS.
-#ifdef USE_MPS
-  if (needs_64bit_indexing_no_split(input, weight)) {
-    return false;
-  }
-  if (!input.is_mps()) {
-    return false;
-  }
-  return true;
-#else
-  return false;
-#endif
-}
-
-auto ConvParams::use_miopen(const at::Tensor& input, const at::Tensor& weight, bool bias_defined) const -> bool {
-  if (needs_64bit_indexing_no_split(input, weight)) {
-    return false;
-  }
-  return ((input.scalar_type() == at::kFloat) || (input.scalar_type() == at::kHalf) || (input.scalar_type() == at::kBFloat16))
-         && detail::getCUDAHooks().compiledWithMIOpen()
-         && input.is_cuda()
-         && input.dim() <= MIOPEN_DIM_MAX
-         && !(groups > 1 && is_dilated()) // MIOpen currently does not support dilation with groups of size > 1
-         && !(input.scalar_type() == at::kBFloat16 && bias_defined) // MIOpen currently doesn't support bias with bfloat16
-         && cudnn_enabled
-         ;
-}
-
-auto ConvParams::use_mkldnn(const at::Tensor& input, const at::Tensor& weight) const -> bool {
-#if AT_MKLDNN_ENABLED()
-  if (!at::globalContext().userEnabledMkldnn()) {
-    return false;
-  }
-  if (input.device().is_cpu() && input.scalar_type() == kBFloat16 && mkldnn_bf16_device_check()) {
-    return true;
-  }
-  return (input.is_mkldnn()) || // input is mkldnn Tensor
-    (input.device().is_cpu() &&
-     input.scalar_type() == kFloat && // only on CPU Float Tensors
-     !transposed && // or transposed tensors
-     // For 1x1 filters, MKLDNN is faster than THNN when multi-threaded,
-     // but THNN is faster when single-threaded.
-     (is_strided() || is_dilated() || input.size(0) >= 16 ||
-      weight.size(-1) != 1 || weight.size(-2) != 1 || at::get_num_threads() > 1) &&
-     (groups > 1
-      || (weight.size(-1) > 3 && weight.size(-2) > 3)
-      || input.size(0) > 1
-      || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480) // for some case, native is faster
-      );
-
-#endif
-  return false;
-}
-
-auto ConvParams::use_nnpack(const at::Tensor& input, const at::Tensor& weight) const -> bool {
-#if AT_NNPACK_ENABLED()
-  return at::_nnpack_available() &&
-         input.device().is_cpu() &&
-         input.scalar_type() == kFloat && // only on CPU Float Tensors
-         !is_dilated() && // or dilation
-         !transposed &&   // or transposed tensors
-         input.ndimension() == 4 && // must be in NCHW format
-         weight.ndimension() == 4 &&
-         (weight.size(2) < 17) && (weight.size(3) < 17) // NNPACK only supports kernels up to 16x16
-#if !defined(C10_MOBILE)
-         && input.size(0) >= 16 // ensure large enough batch size to ensure perf, tuneable
-#endif
-     ;
-#endif
-  return false;
-}
-
-auto ConvParams::use_xnnpack(
-    const at::Tensor& input,
-    const at::Tensor& weight,
-    const at::OptionalIntArrayRef bias_sizes_opt) const -> bool {
-#if defined(C10_MOBILE)
-  if (!transposed) {
-    return (input.size(1) == groups) &&
-            xnnpack::use_convolution2d(
-                input,
-                weight,
-                bias_sizes_opt,
-                padding,
-                stride,
-                dilation,
-                groups,
-                transposed);
-  }
-#endif
-  return false;
-}
-
-// We currently only have depthwise support for the case where groups ==
-// nInputPlane and nInputPlane == nOutputPlane (the latter due to the lack of
-// a depthwise multiplier)
-auto ConvParams::is_depthwise(
-        const at::Tensor& input, const at::Tensor& weight) const -> bool {
-  return input.is_cuda() &&
-         !transposed &&
-         (input.ndimension() == 4 || input.ndimension() == 5) &&
-         input.size(1) == groups &&
-         groups > 1 && // no point if there is only a single group
-         weight.size(0) % input.size(1) == 0; // output channels must be a multiple of input channels
-}
-
 // Check workload to activate fast depthwise FP16 cudnn conv kernels
 bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) {
   int w = input.size(3);  // same as h
@@ -592,49 +239,372 @@ bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int str
   return false;
 }
 
-// Use cudnn for FP16 depthwise convolutions
-auto ConvParams::use_cudnn_depthwise(
-        const at::Tensor& input, const at::Tensor& weight) const -> bool {
-  if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
-    // always use cudnn_depthwise for channels_last format
-    return true;
+
+// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+struct ConvParams {
+  std::vector<int64_t> stride;
+  std::vector<int64_t> padding;
+  std::vector<int64_t> dilation;
+  bool transposed;
+  std::vector<int64_t> output_padding;
+  int groups;
+  bool benchmark;
+  bool deterministic;
+  bool cudnn_enabled;
+  bool allow_tf32;
+
+  bool is_strided() const {
+    bool is_strided = false;
+    for (auto s : stride) {
+      is_strided |= (s != 1);
+    }
+    return is_strided;
+  }
+
+  bool is_dilated() const {
+    bool is_dilated = false;
+    for (auto d : dilation) {
+      is_dilated |= (d != 1);
+    }
+    return is_dilated;
+  }
+
+  bool is_padded() const {
+    bool is_padded = false;
+    for (auto p : padding) {
+      is_padded |= (p != 0);
+    }
+    return is_padded;
+  }
+
+  bool is_output_padding_neg() const {
+    bool is_non_neg = false;
+    for (auto p : output_padding) {
+      is_non_neg |= (p < 0);
+    }
+    return is_non_neg;
+  }
+
+  bool is_output_padding_big() const {
+    bool is_big = false;
+    for (auto i: c10::irange(output_padding.size())) {
+      is_big |= (output_padding[i] >= stride[i]);
+    }
+    return is_big;
+  }
+
+  bool is_padding_neg() const {
+    bool is_non_neg = false;
+    for (auto p : padding) {
+      is_non_neg |= (p < 0);
+    }
+    return is_non_neg;
+  }
+
+  bool is_stride_nonpos() const {
+    bool is_nonpos = false;
+    for (auto s : stride) {
+      is_nonpos |= (s <= 0);
+    }
+    return is_nonpos;
+  }
+
+  void view1d_as_2d() {
+    if (stride.size() == 1) {
+      stride.insert(stride.begin(), 1);
+      padding.insert(padding.begin(), 0);
+      dilation.insert(dilation.begin(), 1);
+      output_padding.insert(output_padding.begin(), 0);
+    }
+  }
+
+  bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias) const {
+#if defined(__ARM_NEON__)
+    // Currently only 3x3 depthwise convolutions on tensors of float are supported.
+    return (input.ndimension() == 4) &&
+           (input.size(1) == groups) &&
+           (weight.ndimension() == 4 ) &&
+           (weight.size(0) % input.size(1) == 0) &&
+           (weight.size(1) == 1) &&
+           (weight.size(2) == 3) &&
+           (weight.size(3) == 3) &&
+           (input.device().is_cpu()) &&
+           (input.scalar_type() == at::kFloat) &&
+           input.is_contiguous() &&
+           (weight.device().is_cpu()) &&
+           (weight.scalar_type() == at::kFloat) &&
+           weight.is_contiguous() &&
+           (!bias.has_value() || bias->is_contiguous()) &&
+           !is_strided() &&
+           !is_dilated() &&
+           !transposed;
+#else
+    return false;
+#endif
+  }
+
+  bool needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const {
+    constexpr int64_t int_max = std::numeric_limits<int>::max();
+    int64_t numel_input = input.numel();
+    // empty input
+    if (numel_input == 0) {
+      return false;
+    }
+    // input size can not be reduced to the range of int by splitting the batch dim
+    int64_t n = input.size(0);
+    if (numel_input / n > int_max) {
+      return true;
+    }
+    // output size can not be reduced to the range of int by splitting the batch dim
+    int64_t outsize = 1;
+    if (transposed) {
+      std::vector<int64_t> o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups);
+      outsize = c10::multiply_integers(o.begin() + 1, o.end());
+    } else {
+      std::vector<int64_t> o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation);
+      outsize = c10::multiply_integers(o.begin() + 1, o.end());
+    }
+    return outsize > int_max;
   }
-  if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
-    long cudnn_version = detail::getCUDAHooks().versionCuDNN();
-    if (cudnn_version >= 8200) {
-      bool kernel_cond =  (use_cudnn(input, weight) &&
+
+  bool use_cudnn(const at::Tensor& input, const at::Tensor& weight) const {
+  // Note [Mobile check segfaults]
+  // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
+  // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
+#if !defined(C10_MOBILE)
+    if (needs_64bit_indexing_no_split(input, weight)) {
+      return false;
+    }
+    if (!detail::getCUDAHooks().compiledWithCuDNN()) {
+      return false;
+    }
+    if (!input.is_cuda() || !cudnn_enabled) {
+      return false;
+    }
+    if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
+      if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
+        return false;
+      }
+    }
+    if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous) {
+      // bypass dilation checks for channels_last convolution
+      if (deterministic && is_dilated()) {
+        // cudnn doesn't support deterministic dilated convolution fully yet
+        return false;
+      }
+      if (is_dilated()) {
+        return detail::getCUDAHooks().supportsDilatedConvolutionWithCuDNN() && !is_output_padding_big();
+      }
+    }
+    return !is_output_padding_big();
+#else
+    return false;
+#endif
+  }
+
+  // Use cudnn for FP16 depthwise convolutions
+  bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const  {
+    if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
+      // always use cudnn_depthwise for channels_last format
+      return true;
+    }
+    if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
+      long cudnn_version = detail::getCUDAHooks().versionCuDNN();
+      if (cudnn_version >= 8200) {
+        bool kernel_cond =  (use_cudnn(input, weight) &&
+                             input.scalar_type() == kHalf && // only for FP16
+                             weight.scalar_type() == kHalf &&
+                             is_depthwise(input, weight) &&
+                             input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
+                             !is_dilated() && // no dilation supported
+                             (stride[0] == stride[1] || input.size(2) == 1) && // square or 1d
+                             input.size(1) >= 32); // min 32 channels supported)
+        if (kernel_cond) {
+          return check_cudnn_depthwise_workload_with_filter(input, stride[1], weight);
+        }
+      }
+      // keep (7600 <= cudnn < 8200) code unchanged
+      bool kernel_cond =  (cudnn_version >= 7600 &&
+                           use_cudnn(input, weight) &&
                            input.scalar_type() == kHalf && // only for FP16
                            weight.scalar_type() == kHalf &&
                            is_depthwise(input, weight) &&
                            input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
+                           weight.size(2) == weight.size(3) && // only square kernels
+                           input.size(2) >= 7 && // min width/height 7
                            !is_dilated() && // no dilation supported
-                           (stride[0] == stride[1] || input.size(2) == 1) && // square or 1d
+                           stride[0] == stride[1] && // equal strides
+                           ((weight.size(3) == 3) || (weight.size(3) == 1)) &&
                            input.size(1) >= 32); // min 32 channels supported)
       if (kernel_cond) {
-        return check_cudnn_depthwise_workload_with_filter(input, stride[1], weight);
+        return check_cudnn_depthwise_workload(input, stride[0]);
+      } else {
+        return false;
       }
-    }
-    // keep (7600 <= cudnn < 8200) code unchanged
-    bool kernel_cond =  (cudnn_version >= 7600 &&
-                         use_cudnn(input, weight) &&
-                         input.scalar_type() == kHalf && // only for FP16
-                         weight.scalar_type() == kHalf &&
-                         is_depthwise(input, weight) &&
-                         input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
-                         weight.size(2) == weight.size(3) && // only square kernels
-                         input.size(2) >= 7 && // min width/height 7
-                         !is_dilated() && // no dilation supported
-                         stride[0] == stride[1] && // equal strides
-                         ((weight.size(3) == 3) || (weight.size(3) == 1)) &&
-                         input.size(1) >= 32); // min 32 channels supported)
-    if (kernel_cond) {
-      return check_cudnn_depthwise_workload(input, stride[0]);
     } else {
       return false;
     }
-  } else {
+  }
+
+  bool use_miopen(const at::Tensor& input, const at::Tensor& weight, bool bias_defined) const  {
+    if (needs_64bit_indexing_no_split(input, weight)) {
+      return false;
+    }
+    return ((input.scalar_type() == at::kFloat) || (input.scalar_type() == at::kHalf) || (input.scalar_type() == at::kBFloat16))
+           && detail::getCUDAHooks().compiledWithMIOpen()
+           && input.is_cuda()
+           && input.dim() <= MIOPEN_DIM_MAX
+           && !(groups > 1 && is_dilated()) // MIOpen currently does not support dilation with groups of size > 1
+           && !(input.scalar_type() == at::kBFloat16 && bias_defined) // MIOpen currently doesn't support bias with bfloat16
+           && cudnn_enabled
+           ;
+  }
+  bool use_mkldnn(const at::Tensor& input, const at::Tensor& weight) const  {
+#if AT_MKLDNN_ENABLED()
+    if (!at::globalContext().userEnabledMkldnn()) {
+      return false;
+    }
+    if (input.device().is_cpu() && input.scalar_type() == kBFloat16 && mkldnn_bf16_device_check()) {
+      return true;
+    }
+    return (input.is_mkldnn()) || // input is mkldnn Tensor
+      (input.device().is_cpu() &&
+       input.scalar_type() == kFloat && // only on CPU Float Tensors
+       !transposed && // or transposed tensors
+       // For 1x1 filters, MKLDNN is faster than THNN when multi-threaded,
+       // but THNN is faster when single-threaded.
+       (is_strided() || is_dilated() || input.size(0) >= 16 ||
+        weight.size(-1) != 1 || weight.size(-2) != 1 || at::get_num_threads() > 1) &&
+       (groups > 1
+        || (weight.size(-1) > 3 && weight.size(-2) > 3)
+        || input.size(0) > 1
+        || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480) // for some case, native is faster
+        );
+
+#endif
+    return false;
+  }
+  bool use_nnpack(const at::Tensor& input, const at::Tensor& weight) const  {
+#if AT_NNPACK_ENABLED()
+    return at::_nnpack_available() &&
+           input.device().is_cpu() &&
+           input.scalar_type() == kFloat && // only on CPU Float Tensors
+           !is_dilated() && // or dilation
+           !transposed &&   // or transposed tensors
+           input.ndimension() == 4 && // must be in NCHW format
+           weight.ndimension() == 4 &&
+           (weight.size(2) < 17) && (weight.size(3) < 17) // NNPACK only supports kernels up to 16x16
+#if !defined(C10_MOBILE)
+           && input.size(0) >= 16 // ensure large enough batch size to ensure perf, tuneable
+#endif
+       ;
+#endif
+    return false;
+  }
+  bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight,
+                   const at::OptionalIntArrayRef bias_sizes_opt) const {
+#if defined(C10_MOBILE)
+    if (!transposed) {
+      return (input.size(1) == groups) &&
+              xnnpack::use_convolution2d(
+                  input,
+                  weight,
+                  bias_sizes_opt,
+                  padding,
+                  stride,
+                  dilation,
+                  groups,
+                  transposed);
+    }
+#endif
     return false;
   }
+
+  bool use_mps(const at::Tensor& input, const at::Tensor& weight) const {
+    // These checks need to be expanded. Currently we have very limited set of
+    // checks for MPS.
+#ifdef USE_MPS
+    if (needs_64bit_indexing_no_split(input, weight)) {
+      return false;
+    }
+    if (!input.is_mps()) {
+      return false;
+    }
+    return true;
+#else
+    return false;
+#endif
+  }
+
+  // We currently only have depthwise support for the case where groups ==
+  // nInputPlane and nInputPlane == nOutputPlane (the latter due to the lack of
+  // a depthwise multiplier)
+  bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const  {
+    return input.is_cuda() &&
+           !transposed &&
+           (input.ndimension() == 4 || input.ndimension() == 5) &&
+           input.size(1) == groups &&
+           groups > 1 && // no point if there is only a single group
+           weight.size(0) % input.size(1) == 0; // output channels must be a multiple of input channels
+  }
+};
+
+// Function to select the convolution backend based on the inputs and params.
+// This overload is used within the convolution internals but not exposed to python.
+// NB: The forward pass provides a bias tensor while the backward pass provides
+// a bool indicating whether the bias is defined. This is done to save memory by
+// avoiding saving the full bias tensor for backward.
+ConvBackend _select_conv_backend(
+    const Tensor& input,
+    const Tensor& weight,
+    const c10::optional<Tensor>& bias_opt,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const bool need_backward,
+    const ConvParams& params);
+
+// For BC reasons, have a copy that does not require bias_opt
+ConvBackend select_conv_backend(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const bool need_backward,
+    const ConvParams& params);
+
+DEFINE_DISPATCH(conv_depthwise2d_backward_stub);
+DEFINE_DISPATCH(conv_depthwise3d_backward_stub);
+DEFINE_DISPATCH(cudnn_convolution_backward_stub);
+DEFINE_DISPATCH(cudnn_convolution_transpose_backward_stub);
+DEFINE_DISPATCH(slow_conv_transpose3d_backward_stub);
+DEFINE_DISPATCH(convolution_depthwise3x3_winograd_stub);
+DEFINE_DISPATCH(miopen_convolution_backward_stub);
+DEFINE_DISPATCH(miopen_convolution_transpose_backward_stub);
+DEFINE_DISPATCH(miopen_depthwise_convolution_backward_stub);
+DEFINE_DISPATCH(mkldnn_convolution_backward_stub);
+DEFINE_DISPATCH(slow_conv_dilated2d_backward_stub);
+DEFINE_DISPATCH(slow_conv_dilated3d_backward_stub);
+DEFINE_DISPATCH(slow_conv_transpose2d_backward_stub);
+REGISTER_NO_CPU_DISPATCH(conv_depthwise2d_backward_stub);
+REGISTER_NO_CPU_DISPATCH(conv_depthwise3d_backward_stub);
+REGISTER_NO_CPU_DISPATCH(cudnn_convolution_backward_stub);
+REGISTER_NO_CPU_DISPATCH(cudnn_convolution_transpose_backward_stub);
+REGISTER_NO_CPU_DISPATCH(miopen_convolution_backward_stub);
+REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub);
+REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub);
+
+std::ostream& operator<<(std::ostream & out, const ConvParams& params) {
+  out << "ConvParams {"
+      << "  stride = " << IntArrayRef{params.stride}
+      << "  padding = " << IntArrayRef{params.padding}
+      << "  dilation = " << IntArrayRef{params.dilation}
+      << "  transposed = " << params.transposed
+      << "  output_padding = " << IntArrayRef{params.output_padding}
+      << "  groups = " << params.groups
+      << "  benchmark = " << params.benchmark
+      << "  deterministic = " << params.deterministic
+      << "  cudnn_enabled = " << params.cudnn_enabled
+      << "  allow_tf32 = " << params.allow_tf32
+      << "}";
+  return out;
 }
 
 static void check_shape_forward(const at::Tensor& input,

From b42aef3f883e9309efe7be1c69882bda5cde277c Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 15 Nov 2022 08:05:31 -0800
Subject: [PATCH 0936/1922] Add int64_t, SymInt overloads for all binary
 operators in C++ (#89063)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89063
Approved by: https://github.com/SherlockNoMad
---
 c10/core/SymInt.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h
index 0c7c69fe9553b..9ab72a0776804 100644
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@@ -235,6 +235,40 @@ inline c10::SymInt multiply_integers(const C& container) {
       [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
 }
 
+inline SymInt operator+(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) + b;
+}
+inline SymInt operator-(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) - b;
+}
+inline SymInt operator*(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) * b;
+}
+inline SymInt operator/(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) / b;
+}
+inline SymInt operator%(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) % b;
+}
+inline bool operator==(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) == b;
+}
+inline bool operator!=(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) != b;
+}
+inline bool operator<(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) < b;
+}
+inline bool operator<=(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) <= b;
+}
+inline bool operator>(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) > b;
+}
+inline bool operator>=(int64_t a, const SymInt& b) {
+  return c10::SymInt(a) >= b;
+}
+
 C10_API std::ostream& operator<<(std::ostream& os, const SymInt& s);
 C10_API SymInt operator-(const SymInt& s);
 } // namespace c10

From 1ef52ad9ee62ee1940d91392e778e71cef4facac Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 16 Nov 2022 01:13:00 +0000
Subject: [PATCH 0937/1922] Revert "[Inductor] Build FX Linear + Permute
 Vertical Fusion in Inductor (#88859)"

This reverts commit d60abe4b9521e235c0e9beb00cda0d6c5673f4e0.

Reverted https://github.com/pytorch/pytorch/pull/88859 on behalf of https://github.com/kit1980 due to Broke Mac OS testing, which were clearly shown in CI
---
 test/inductor/test_torchinductor.py | 106 ---------------
 torch/_inductor/config.py           |   4 -
 torch/_inductor/overrides.py        | 199 ----------------------------
 3 files changed, 309 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index f43a333d1f096..dcb01b9ec78c1 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -10,7 +10,6 @@
 import typing
 import unittest
 import weakref
-from typing import Any, Callable
 from unittest.mock import patch
 
 import torch
@@ -19,7 +18,6 @@
 from torch._dynamo.debug_utils import same_two_models
 from torch._dynamo.testing import rand_strided, same
 from torch.fx.experimental.proxy_tensor import make_fx
-from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import (
     TEST_WITH_ASAN,
@@ -41,14 +39,6 @@
     from torch._inductor import codecache, config, metrics
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
-    from torch._inductor.overrides import (
-        linear_permute_fusion,
-        linear_transpose,
-        permute_linear_fusion,
-        permute_matmul_fusion,
-        transpose_linear,
-        transpose_matmul,
-    )
     from torch._inductor.sizevars import SizeVarAllocator
     from torch._inductor.utils import has_torchvision_roi_align, timed
 
@@ -123,29 +113,6 @@ def maybe_test(*args, **kwargs):
     return wrap_test
 
 
-PassFunc = Callable[[torch.fx.GraphModule, Any], torch.fx.GraphModule]
-
-
-def chain_passes(*passes: PassFunc) -> PassFunc:
-    def parent_pass(module: torch.fx.GraphModule, input: Any) -> torch.fx.GraphModule:
-        for pass_ in passes:
-            if isinstance(module, torch.fx.GraphModule):
-                ShapeProp(module).propagate(*input)
-            module = pass_(module)
-        return module
-
-    return parent_pass
-
-
-def count_call_function(module: torch.fx.GraphModule, target_op: Any) -> int:
-    return sum(
-        [
-            1 if (n.op == "call_function" and n.target == target_op) else 0
-            for n in module.graph.nodes
-        ]
-    )
-
-
 class TestCase(TorchTestCase):
     @classmethod
     def setUpClass(cls):
@@ -1619,79 +1586,6 @@ def fn(a, b):
         y = torch.tensor(0)
         self.assertEqual(fn(x, y), x + x)
 
-    def test_linear_permute_fusion(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self, k: int, n: int):
-                super().__init__()
-                self.weight = torch.nn.Parameter(torch.randn(n, k))
-                self.bias = torch.nn.Parameter(torch.randn(n))
-
-            def forward(self, input: torch.Tensor):
-                a0 = torch.nn.functional.linear(input, self.weight, self.bias)
-                b0 = a0.permute(0, 2, 1)
-                return b0
-
-        m, k, n = 16, 8, 4
-        trace_func = chain_passes(torch.fx.symbolic_trace, linear_permute_fusion)
-        module = TestModule(k, n).eval()
-        input = torch.randn(6, m, k)
-        traced = trace_func(module, [input])
-        num_linear = count_call_function(traced, torch.nn.functional.linear)
-        num_linear_transpose = count_call_function(traced, linear_transpose)
-        self.assertEqual(num_linear, 0)
-        self.assertEqual(num_linear_transpose, 1)
-
-        self.assertTrue(torch.allclose(module(input), traced(input)))
-
-    def test_permute_linear_fusion(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self, k: int, n: int):
-                super().__init__()
-                self.weight = torch.nn.Parameter(torch.randn(n, k))
-                self.bias = torch.nn.Parameter(torch.randn(n))
-
-            def forward(self, input: torch.Tensor):
-                input1 = input.permute(0, 2, 1)
-                output = torch.nn.functional.linear(input1, self.weight, self.bias)
-                return output
-
-        m, k, n = 16, 8, 4
-
-        trace_func = chain_passes(torch.fx.symbolic_trace, permute_linear_fusion)
-        module = TestModule(k, n).eval()
-        input = torch.randn(6, k, m)
-        traced = trace_func(module, [input])
-        num_linear = count_call_function(traced, torch.nn.functional.linear)
-        num_transpose_linear = count_call_function(traced, transpose_linear)
-        self.assertEqual(num_linear, 0)
-        self.assertEqual(num_transpose_linear, 1)
-
-        self.assertTrue(torch.allclose(module(input), traced(input)))
-
-    def test_permute_bmm_fusion(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self, batch: int, k: int, n: int):
-                super().__init__()
-                self.other = torch.randn(batch, k, n)
-
-            def forward(self, input: torch.Tensor):
-                input1 = input.permute(0, 2, 1)
-                output = torch.bmm(input1, self.other)
-                return output
-
-        batch, m, k, n = 6, 16, 8, 4
-
-        trace_func = chain_passes(torch.fx.symbolic_trace, permute_matmul_fusion)
-        module = TestModule(batch, k, n).eval()
-        input = torch.randn(batch, k, m)
-        traced = trace_func(module, [input])
-        num_bmm = count_call_function(traced, torch.bmm)
-        num_transpose_matmul = count_call_function(traced, transpose_matmul)
-        self.assertEqual(num_bmm, 0)
-        self.assertEqual(num_transpose_matmul, 1)
-
-        self.assertTrue(torch.allclose(module(input), traced(input)))
-
     def test_slice1(self):
         def fn(a):
             return (
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index c552101c1caee..d376fe3e8bf7f 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -75,10 +75,6 @@
 shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "0") == "1"
 alignment_size = 4
 
-# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
-permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"
-
-
 # config specific to codegen/cpp.pp
 class cpp:
     # set to torch.get_num_threads()
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index cf2cd5f60f510..3a95aa7ce8807 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -19,8 +19,6 @@
 from torch.nn.utils.fusion import fuse_conv_bn_eval
 from torch.overrides import TorchFunctionMode
 
-from . import config
-
 log = logging.getLogger(__name__)
 
 
@@ -427,14 +425,6 @@ def check_node_is_add_inplace(node):
 
 
 def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
-    if config.permute_fusion:
-        # For linear permute fusion, we need to check input info to identify
-        # and perform proper permutation/transpose
-        ShapeProp(gm).propagate(*example_inputs)
-        gm = linear_permute_fusion(gm)
-        gm = permute_linear_fusion(gm)
-        gm = permute_matmul_fusion(gm)
-
     # make sure the autograd is disabled.
     if torch.is_grad_enabled():
         return gm
@@ -538,195 +528,6 @@ def _philox_rand_like(input, seed, offset):
     return torch.rand_like(input)
 
 
-class NormalizedLinearNode:
-    def __init__(self, node: torch.fx.Node) -> None:
-        assert node.op == "call_function"
-        assert node.target in [torch.nn.functional.linear]
-        self.node: torch.fx.Node = node
-
-    def get_input(self) -> torch.fx.Node:
-        if len(self.node.args) > 0:
-            return self.node.args[0]
-        else:
-            return self.node.kwargs["input"]
-
-    def get_weight(self) -> torch.fx.Node:
-        if len(self.node.args) > 1:
-            return self.node.args[1]
-        else:
-            return self.node.kwargs["weight"]
-
-    def get_bias(self) -> torch.fx.Node:
-        if len(self.node.args) > 2:
-            return self.node.args[2]
-        else:
-            return self.node.kwargs["bias"]
-
-
-class NormalizedMatmulNode:
-    def __init__(self, node: torch.fx.Node) -> None:
-        assert node.op == "call_function"
-        assert node.target in [torch.bmm, torch.matmul]
-        self.node: torch.fx.Node = node
-
-    def get_input(self) -> torch.fx.Node:
-        if len(self.node.args) > 0:
-            return self.node.args[0]
-        else:
-            return self.node.kwargs["input"]
-
-    def get_other(self) -> torch.fx.Node:
-        if len(self.node.args) > 1:
-            return self.node.args[1]
-        else:
-            return self.node.kwargs["other"]
-
-
-def check_permute(node: torch.fx.Node):
-    ranks = len(node.meta["tensor_meta"].shape)
-    if len(node.args) > 3:
-        permutation = [node.args[i] % ranks for i in range(1, ranks + 1)]
-    elif (
-        "permutation" in node.kwargs
-        and node.kwargs["permutation"] is not None
-        and len(node.kwargs["permutation"]) > 2
-    ):
-        permutation = [i % ranks for i in node.kwargs["permutation"]]
-    else:
-        return False
-    allowed_permutation = list(range(ranks))
-    allowed_permutation[-1] = ranks - 2
-    allowed_permutation[-2] = ranks - 1
-    return permutation == allowed_permutation
-
-
-def linear_permute_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
-    for node in module.graph.nodes:
-        if (
-            node.op == "call_method"
-            and node.target == "permute"
-            and check_permute(node)
-        ):
-            if len(node.args) > 0:
-                input_node = node.args[0]
-            else:
-                input_node = node.kwargs["input"]
-            if (
-                input_node.op == "call_function"
-                and input_node.target == torch.nn.functional.linear
-            ):
-                normalized = NormalizedLinearNode(input_node)
-                input = normalized.get_input()
-                weight = normalized.get_weight()
-                bias = normalized.get_bias()
-                with module.graph.inserting_before(node):
-                    fused_node = module.graph.call_function(
-                        linear_transpose, args=(input, weight, bias)
-                    )
-                    node.replace_all_uses_with(fused_node)
-
-    module.graph.lint()
-    module.graph.eliminate_dead_code()
-    module.recompile()
-    return module
-
-
-# Y1 = X * W^T + bias
-# Y2 = Y1.permute(0, 2, 1)
-# ---->
-# Y2 = (W * X^T + bias.unsqueeze(-1))^T
-def linear_transpose(
-    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
-) -> torch.Tensor:
-    return torch.matmul(weight, input.transpose(-1, -2)) + bias.unsqueeze(-1)
-
-
-def permute_linear_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
-    for node in module.graph.nodes:
-        if node.op == "call_function" and node.target == torch.nn.functional.linear:
-            if len(node.args) > 0:
-                input_node = node.args[0]
-            else:
-                input_node = node.kwargs["input"]
-            if (
-                input_node.op == "call_method"
-                and input_node.target == "permute"
-                and check_permute(input_node)
-            ):
-                normalized = NormalizedLinearNode(node)
-                if len(input_node.args) > 0:
-                    input = input_node.args[0]
-                else:
-                    input = input_node.kwargs["input"]
-                weight = normalized.get_weight()
-                bias = normalized.get_bias()
-                with module.graph.inserting_before(node):
-                    fused_node = module.graph.call_function(
-                        transpose_linear, args=(input, weight, bias)
-                    )
-                    node.replace_all_uses_with(fused_node)
-
-    module.graph.lint()
-    module.graph.eliminate_dead_code()
-    module.recompile()
-    return module
-
-
-def permute_matmul_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
-    for node in module.graph.nodes:
-        if node.op == "call_function" and (
-            node.target == torch.bmm or node.target == torch.matmul
-        ):
-            normalized = NormalizedMatmulNode(node)
-            A = normalized.get_input()
-            B = normalized.get_other()
-            Atrans = Btrans = False
-            if A.op == "call_method" and A.target == "permute" and check_permute(A):
-                Atrans = True
-                if len(A.args) > 0:
-                    A = A.args[0]
-                else:
-                    A = A.kwargs["input"]
-
-            if B.op == "call_method" and B.target == "permute" and check_permute(B):
-                Btrans = True
-                if len(B.args) > 0:
-                    B = B.args[0]
-                else:
-                    B = B.kwargs["input"]
-
-            if Atrans or Btrans:
-                with module.graph.inserting_before(node):
-                    fused_node = module.graph.call_function(
-                        transpose_matmul,
-                        args=(A, B, Atrans, Btrans),
-                    )
-                node.replace_all_uses_with(fused_node)
-
-    module.graph.lint()
-    module.graph.eliminate_dead_code()
-    module.recompile()
-    return module
-
-
-# X1 = X.permute(0, 2, 1)
-# Y1 = X1 * W1^T + bias1
-# ---->
-# Y2 = X1.transpose(-1, -2) * W1^T + bias1
-def transpose_linear(
-    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
-) -> torch.Tensor:
-    return torch.matmul(input.transpose(-1, -2), weight.t()) + bias
-
-
-def transpose_matmul(A: torch.Tensor, B: torch.Tensor, Atrans: bool, Btrans: bool):
-    if Atrans:
-        A = A.transpose(-1, -2)
-    if Btrans:
-        B = B.transpose(-1, -2)
-    return torch.matmul(A, B)
-
-
 def replace_and_fuse_for_binary(
     computation_node, node, fuse_func, attr, modules, index_node, index_pointwise
 ):

From 4e37c7db6e31a18afb4a6d651112b6d819da4141 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 16 Nov 2022 02:39:22 +0000
Subject: [PATCH 0938/1922] Increase slow grad check timeout (#89079)

Now that periodic jobs are run under `mem_leak_check` mode with parallelization turning off. It's very easy for `linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck / test` to timeout because one of the shards is very close to the 4h mark.

* https://hud.pytorch.org/pytorch/pytorch/commit/2452e3f99a072760fc46d3f9025aaa37ca7ea2ab
* https://hud.pytorch.org/pytorch/pytorch/commit/35e668b5ced25e735b6e523d557ed7fd60267914

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89079
Approved by: https://github.com/clee2000
---
 .github/workflows/_linux-test.yml | 8 +++++++-
 .github/workflows/periodic.yml    | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index 6ad30080fd649..16f25fed91212 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -22,6 +22,12 @@ on:
         description: |
           If this is set, our linter will use this to make sure that every other
           job with the same `sync-tag` is identical.
+      timeout-minutes:
+        required: false
+        type: number
+        default: 240
+        description: |
+          Set the maximum (in minutes) how long the workflow should take to finish
 
 env:
   GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@@ -56,6 +62,7 @@ jobs:
       matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
       fail-fast: false
     runs-on: ${{ matrix.runner }}
+    timeout-minutes: ${{ inputs.timeout-minutes }}
     steps:
       - name: Setup SSH (Click me for login details)
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
@@ -117,7 +124,6 @@ jobs:
           XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
           PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
           PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
-        timeout-minutes: 240
         run: |
           set -x
 
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 6e1722b4b6c01..61302e1a0d61b 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -33,6 +33,7 @@ jobs:
       build-environment: linux-bionic-cuda11.6-py3-gcc7-slow-gradcheck
       docker-image: ${{ needs.linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-bionic-cuda11_6-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
+      timeout-minutes: 300
 
   linux-focal-rocm5_2-py3_8-slow-build:
     name: linux-focal-rocm5.2-py3.8-slow

From 7bab23e8bb801c323e886a589f1b58ad73991bc5 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 15 Nov 2022 19:19:47 +0000
Subject: [PATCH 0939/1922] [FSDP] Test `named_parameters()` in forward
 (`use_orig_params=True`) (#89066)

This adds a unit test following the FSDP change in https://github.com/pytorch/pytorch/pull/88781.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89066
Approved by: https://github.com/fegin
---
 .../fsdp/test_fsdp_use_orig_params.py         | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py
index 0f5ffa564c2d4..e61f2e4d96ded 100644
--- a/test/distributed/fsdp/test_fsdp_use_orig_params.py
+++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py
@@ -1006,6 +1006,47 @@ def forward(self, x):
         fsdp_buffer_names = [n for n, _ in fsdp_model.named_buffers()]
         self.assertEqual(buffer_names, fsdp_buffer_names)
 
+    @skip_if_lt_x_gpu(2)
+    def test_named_parameters_in_forward(self):
+        """
+        Tests that calling ``named_parameters()`` during forward returns FQNs
+        and ``Tensor`` s corresponding to the original parameters.
+        """
+        param_shapes = [None, None]
+        assert_equal_fn = self.assertEqual
+
+        class Model(nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.lin = nn.Linear(5, 5)
+
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                nonlocal param_shapes
+                param_names = [tup[0] for tup in self.named_parameters()]
+                params = [tup[1] for tup in self.named_parameters()]
+                assert (
+                    param_shapes[0] is not None and param_shapes[1] is not None
+                ), "`param_sizes` should be set"
+                assert_equal_fn(
+                    param_names,
+                    [
+                        "lin.weight",
+                        "lin.bias",
+                    ],
+                )
+                assert_equal_fn(params[0].shape, param_shapes[0])
+                assert_equal_fn(params[1].shape, param_shapes[1])
+                return self.lin(x)
+
+        model = Model().cuda()
+        # Save the *unsharded* original parameter shapes and check the shapes
+        # match in the forward pass
+        param_shapes[0] = model.lin.weight.shape
+        param_shapes[1] = model.lin.bias.shape
+        fsdp_model = FSDP(model, use_orig_params=True)
+        inp = torch.randn((2, 5), device=torch.device("cuda"))
+        fsdp_model(inp)
+
 
 instantiate_parametrized_tests(TestFSDPUseOrigParamsMultipleParamGroups)
 instantiate_parametrized_tests(TestFSDPUseOrigParamsUnshardReshard)

From 2bc3224c19b1cf1516116a0c2e06c95c586897b7 Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Wed, 16 Nov 2022 03:07:54 +0000
Subject: [PATCH 0940/1922] Create native function for determining which
 implementation of SDP to call (#89029)

# Summary
Creates a callable native function that can determine which implementation of scaled dot product will get called. This allows to bump re-order the runtime dispatch of SDP to enable autograd.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89029
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml    |   5 +
 .../ATen/native/transformers/attention.cpp    |   6 +
 .../native/transformers/cuda/attention.cu     |  13 ++
 .../ATen/native/transformers/cuda/sdp_utils.h |   3 +-
 .../ATen/native/transformers/sdp_utils_cpp.h  |   9 ++
 docs/source/backends.rst                      |   2 +
 test/test_transformers.py                     | 114 +++++++++++++-----
 torch/backends/cuda/__init__.py               |  17 ++-
 torchgen/native_function_generation.py        |   1 +
 9 files changed, 137 insertions(+), 33 deletions(-)
 create mode 100644 aten/src/ATen/native/transformers/sdp_utils_cpp.h

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 9572ccc56653d..726a54b5e225f 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -13239,6 +13239,11 @@
   variants: function
   autogen: _scaled_dot_product_attention.out
 
+- func: _fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> int
+  dispatch:
+    CPU, NestedTensorCPU, Meta: _fused_sdp_choice_cpp
+    CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
+
 # Register the math kernel for cpu
 - func: _scaled_dot_product_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
   variants: function
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 55c71f9fd0645..89a0e4691018c 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -7,6 +7,7 @@
 #include <ATen/TensorIndexing.h>
 #include <ATen/cpu/vec/vec256/vec256.h>
 #include <ATen/native/transformers/attention.h>
+#include <ATen/native/transformers/sdp_utils_cpp.h>
 
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -685,6 +686,11 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention(
         return at::_scaled_dot_product_attention_forward(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
 }
 
+int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value,
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
+  return static_cast<int64_t>(sdp::SDPBackend::math);
+}
+
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_math(
     const Tensor& query_,
     const Tensor& key,
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index 46543d4663fab..602cf319f74a6 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -789,6 +789,19 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
     }
 }
 
+int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Tensor& value,
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
+  sdp::sdp_params kernel_params{query_, key, value, attn_mask_.has_value(), dropout_p, need_attn_weights, is_causal};
+  auto backend = select_sdp_backend(kernel_params);
+  if (backend == sdp::SDPBackend::error) {
+    TORCH_CHECK(
+        false,
+        "No viable backend for scaled_dot_product_attention was found. ",
+        "This is likely due to turning off both the math kernel and the fused kernels.");
+  }
+  return static_cast<int64_t>(backend);
+}
+
 Tensor flash_scaled_dot_product_attention(
     const Tensor& query,
     const Tensor& key,
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index e9f3d5029aa86..5d62a6cbd0dc5 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -10,6 +10,7 @@
 #include <c10/util/env.h>
 #include <c10/util/irange.h>
 #include <ATen/NestedTensorImpl.h>
+#include <ATen/native/transformers/sdp_utils_cpp.h>
 
 #include <functional>
 #include <unordered_set>
@@ -27,8 +28,6 @@ struct sdp_params {
   bool is_causal;
 };
 
-enum class SDPBackend { flash_attention, efficient_attention, math, error };
-
 template <typename dtype_vector>
 inline bool check_tensor_dtype(
     sdp_params params,
diff --git a/aten/src/ATen/native/transformers/sdp_utils_cpp.h b/aten/src/ATen/native/transformers/sdp_utils_cpp.h
new file mode 100644
index 0000000000000..9641a36b33b2c
--- /dev/null
+++ b/aten/src/ATen/native/transformers/sdp_utils_cpp.h
@@ -0,0 +1,9 @@
+#pragma once
+namespace sdp {
+enum class SDPBackend {
+  error = -1,
+  math = 0,
+  flash_attention = 1,
+  efficient_attention = 2
+};
+} // namespace sdp
\ No newline at end of file
diff --git a/docs/source/backends.rst b/docs/source/backends.rst
index 80e18f7017a01..2a02b325341fb 100644
--- a/docs/source/backends.rst
+++ b/docs/source/backends.rst
@@ -52,6 +52,8 @@ torch.backends.cuda
 
 .. autofunction:: torch.backends.cuda.preferred_linalg_library
 
+.. autoclass:: torch.backends.cuda.SDPBackend
+
 .. autofunction:: torch.backends.cuda.flash_sdp_enabled
 
 .. autofunction:: torch.backends.cuda.enable_mem_efficient_sdp
diff --git a/test/test_transformers.py b/test/test_transformers.py
index 93a94a5604c91..abb4c71ec19ad 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -1,14 +1,16 @@
 # Owner(s): ["module: nn"]
 
 import contextlib
+from functools import partial
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import unittest
 from unittest.mock import patch
 import math
-from torch.backends.cuda import sdp_kernel
+from torch.backends.cuda import sdp_kernel, SDPBackend
 import torch.optim as optim
+from torch.testing._internal.common_dtype import floating_types_and_half
 
 from torch.testing._internal.common_nn import NNTestCase
 from torch.testing._internal.common_utils import (
@@ -936,18 +938,24 @@ def _test_fastpath(model, key_padding_mask, mock_return_value, attn_mask=None, n
         _test_fastpath(model, aligned_key_padding_mask, nested_tensor_return_value, nested_tensors=True)
         _test_fastpath(model, not_aligned_key_padding_mask, nested_tensor_return_value, nested_tensors=True)
 
+    def rand_nt(self, shape, device, dtype, requires_grad=False, packed=False):
+        batch, seq_len, num_heads, head_dim = shape
+        size = (seq_len, num_heads, head_dim) if not packed else (seq_len, 3 * num_heads * head_dim)
+        return torch.nested.nested_tensor([
+            torch.randn(size, device=device, dtype=dtype, requires_grad=requires_grad)
+            for _ in range(batch)])
+
+    def rand_tensor(self, shape, device, dtype, requires_grad=False, packed=False):
+        batch, seq_len, num_heads, head_dim = shape
+        size = (batch, seq_len, num_heads, head_dim) if not packed else (batch, seq_len, 3 * num_heads * head_dim)
+        return torch.randn(size, device=device, dtype=dtype, requires_grad=requires_grad)
+
     @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
     @parametrize("type", ["dense", "nested"])
     @parametrize("is_contiguous", [True, False])
     def test_scaled_dot_product_attention_fused_kernels(self, type: str, is_contiguous: bool):
-        def rand_nt(shape):
-            batch, seq_len, num_heads, head_dim = shape
-            return torch.nested.nested_tensor([torch.randn(seq_len, num_heads, head_dim,
-                                                           device="cuda", dtype=torch.float16) for _ in range(batch)])
-
-        def rand_tensor(shape):
-            batch, seq_len, num_heads, head_dim = shape
-            return torch.randn(batch, seq_len, num_heads, head_dim, device="cuda", dtype=torch.float16)
+        rand_nt = partial(self.rand_nt, device="cuda", dtype=torch.float16)
+        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float16)
 
         batch, seq_len, num_heads, head_dim = 32, 64, 16, 64
         shape = (batch, seq_len, num_heads, head_dim)
@@ -985,14 +993,8 @@ def rand_tensor(shape):
     @parametrize("type", ["dense", "nested"])
     @parametrize("is_contiguous", [True, False])
     def test_scaled_dot_product_attention_fused_kernels_packed(self, type: str, is_contiguous: bool):
-        def rand_nt(shape):
-            batch, seq_len, num_heads, head_dim = shape
-            return torch.nested.nested_tensor([torch.randn(seq_len, 3 * num_heads * head_dim,
-                                                           device="cuda", dtype=torch.float16) for _ in range(batch)])
-
-        def rand_tensor(shape):
-            batch, seq_len, num_heads, head_dim = shape
-            return torch.randn(batch, seq_len, 3 * num_heads * head_dim, device="cuda", dtype=torch.float16)
+        rand_nt = partial(self.rand_nt, device="cuda", dtype=torch.float16, packed=True)
+        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float16, packed=True)
 
         batch_size, seq_len, num_heads, head_dim = 32, 64, 16, 64
         shape = (batch_size, seq_len, num_heads, head_dim)
@@ -1098,8 +1100,10 @@ def rand_tensor(shape):
     def test_efficient_attention_gradcheck(self, contiguous_inputs: bool):
 
         batch_size, seq_len, num_heads, head_dim = 8, 8, 4, 64
-        query, key, value = torch.rand((batch_size, seq_len, 3 * num_heads * head_dim),
-                                       device="cuda", dtype=torch.float32, requires_grad=True).chunk(3, -1)
+        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float16, requires_grad=True, packed=True)
+
+        qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
+        query, key, value = qkv.chunk(3, dim=-1)
         query = query.view(batch_size, -1, num_heads, head_dim)
         key = key.view(batch_size, -1, num_heads, head_dim)
         value = value.view(batch_size, -1, num_heads, head_dim)
@@ -1116,6 +1120,49 @@ def test_efficient_attention_gradcheck(self, contiguous_inputs: bool):
                          wrapper_set_seed(torch.ops.aten._efficient_attention_forward, *args, **kwargs),
                          (query, key, value, None, None, None, True, False), fast_mode=True, atol=8e-5, rtol=1e-3)
 
+    @parametrize("type", ["dense", "nested"])
+    def test_fused_sdp_choice(self, type: str):
+        device = "cpu"
+        # Test that cpu and nestedtensor cpu return MATH backend
+        for dtype in floating_types_and_half():
+            make_tensor = partial(self.rand_tensor, device=device, dtype=dtype)
+            size = (2, 2, 3, 4)
+            q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
+            assert torch._fused_sdp_choice(q, k, v) == SDPBackend.MATH
+
+        if TEST_CUDA and not TEST_WITH_ROCM and not IS_WINDOWS:
+            batch_size, seq_len, num_heads, head_dim = 32, 64, 16, 64
+            shape = (batch_size, seq_len, num_heads, head_dim)
+            device = "cuda"
+            make_tensor = partial(self.rand_tensor, device=device, dtype=torch.float16, packed=True)
+            make_nt = partial(self.rand_nt, device=device, dtype=torch.float16, packed=True)
+
+            qkv = make_tensor(shape) if type == "dense" else make_nt(shape)
+            query, key, value = qkv.chunk(3, dim=-1)
+
+            query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+            if SM80OrLater:
+                assert torch._fused_sdp_choice(query, key, value) == SDPBackend.FLASH_ATTENTION
+            else:
+                assert torch._fused_sdp_choice(query, key, value) == SDPBackend.EFFICIENT_ATTENTION
+
+            # Change dtype to float32 so that efficient attention should get chosen
+            make_tensor = partial(self.rand_tensor, device=device, dtype=torch.float32, packed=True)
+            make_nt = partial(self.rand_nt, device=device, dtype=torch.float32, packed=True)
+
+            qkv = make_tensor(shape) if type == "dense" else make_nt(shape)
+            query, key, value = qkv.chunk(3, dim=-1)
+
+            query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+            assert torch._fused_sdp_choice(query, key, value) == SDPBackend.EFFICIENT_ATTENTION
+
+
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     def test_sdp_runtime_dispatch(self):
         # We will test all the constraints that we know will cause a failure
@@ -1123,12 +1170,15 @@ def test_sdp_runtime_dispatch(self):
         # will fail on CI/CD becuase it is not compiled with the right flags
         device = 'cuda'
         dtype = torch.float16
-
-        def make_tensor(*size, device=device, dtype=dtype):
-            return torch.randn(size, device=device, dtype=dtype)
+        make_tensor = partial(self.rand_tensor, device=device, dtype=dtype)
 
         with sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=False):
-            q, k, v = make_tensor(2, 3, 4), make_tensor(2, 3, 4), make_tensor(2, 3, 4)
+            size = (2, 3, 4)
+            q = torch.randn(size, device=device, dtype=dtype)
+            k = torch.randn(size, device=device, dtype=dtype)
+            v = torch.randn(size, device=device, dtype=dtype)
+            self.assertRaisesRegex(RuntimeError, "No viable backend for scaled_dot_product_attention was found.",
+                                   lambda: torch._fused_sdp_choice(q, k, v))
             self.assertRaisesRegex(RuntimeError, "No viable backend for scaled_dot_product_attention was found.",
                                    lambda: torch.nn.functional._scaled_dot_product_attention(q, k, v))
 
@@ -1136,29 +1186,33 @@ def make_tensor(*size, device=device, dtype=dtype):
             # Failures for invalid input
 
             # Dim is not 4
-            q, k, v = make_tensor(2, 3, 4), make_tensor(2, 3, 4), make_tensor(2, 3, 4)
+            q = torch.randn(size, device=device, dtype=dtype)
+            k = torch.randn(size, device=device, dtype=dtype)
+            v = torch.randn(size, device=device, dtype=dtype)
             self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
                 q, k, v, None, 0.0, False, False))
 
             # Xformers can now cover this case but will add back in next PR
             # Invalid last_dim size
-            q, k, v = make_tensor(2, 2, 3, 4), make_tensor(2, 2, 3, 4), make_tensor(2, 2, 3, 4)
+            size = (2, 2, 3, 4)
+            q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
             self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
                 q, k, v, None, 0.0, False, False))
 
             # Invalid dtype
-            q, k, v = make_tensor(2, 2, 3, 16, dtype=torch.float64), make_tensor(
-                2, 2, 3, 16, dtype=torch.float64), make_tensor(2, 2, 3, 16, dtype=torch.float64)
+            size = (2, 2, 3, 16)
+            make_tensor = partial(self.rand_tensor, device=device, dtype=torch.float64)
+            q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
             self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
                 q, k, v, None, 0.0, False, False))
 
-            q, k, v = make_tensor(2, 2, 3, 16, dtype=torch.float32), make_tensor(
-                2, 2, 3, 16, dtype=torch.float32), make_tensor(2, 2, 3, 16, dtype=torch.float32)
+            make_tensor = partial(self.rand_tensor, device=device, dtype=torch.float32)
+            q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
             self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
                 q, k, v, None, 0.0, False, False))
 
             # Failures for unsupported SDP args
-            q, k, v = make_tensor(2, 2, 3, 16), make_tensor(2, 2, 3, 16), make_tensor(2, 2, 3, 16)
+            q, k, v = make_tensor(size), make_tensor(size), make_tensor(size)
 
             # Needs attention weights
             self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
diff --git a/torch/backends/cuda/__init__.py b/torch/backends/cuda/__init__.py
index dd05535d39359..50735e125ec36 100644
--- a/torch/backends/cuda/__init__.py
+++ b/torch/backends/cuda/__init__.py
@@ -1,11 +1,12 @@
 import sys
 import torch
 import contextlib
+from enum import IntEnum
 
 from typing import Union
 
 __all__ = ["is_built", "cuFFTPlanCacheAttrContextProp", "cuFFTPlanCache", "cuFFTPlanCacheManager",
-           "cuBLASModule", "preferred_linalg_library", "cufft_plan_cache", "matmul", "enable_flash_sdp",
+           "cuBLASModule", "preferred_linalg_library", "cufft_plan_cache", "matmul", "SDPBackend", "enable_flash_sdp",
            "flash_sdp_enabled", "enable_mem_efficient_sdp", "mem_efficient_sdp_enabled",
            "math_sdp_enabled", "enable_math_sdp", "sdp_kernel"]
 
@@ -164,6 +165,20 @@ def preferred_linalg_library(backend: Union[None, str, torch._C._LinalgBackend]
     return torch._C._get_linalg_preferred_backend()
 
 
+class SDPBackend(IntEnum):
+    r"""Enum class for the scaled dot product attention backends.
+
+    .. warning:: This flag is experimental and subject to change.'
+
+    This class needs to stay inline with the enum defined in:
+    pytorch/aten/src/ATen/native/transformers/sdp_utils_cpp.h
+    """
+    ERROR = -1
+    MATH = 0
+    FLASH_ATTENTION = 1
+    EFFICIENT_ATTENTION = 2
+
+
 def flash_sdp_enabled():
     r"""
     .. warning:: This flag is experimental and subject to change.
diff --git a/torchgen/native_function_generation.py b/torchgen/native_function_generation.py
index 89314c1dd18de..657a133c31c71 100644
--- a/torchgen/native_function_generation.py
+++ b/torchgen/native_function_generation.py
@@ -73,6 +73,7 @@
     "record_stream",  # no return
     "sparse_dim",  # returns an int
     "_nested_tensor_offsets",  # returns a vector of ints
+    "_fused_sdp_choice",  # returns an int
 ]
 
 INPLACE_OPS_THAT_DONT_GET_GROUPED_PROPERLY = [

From c5540885bca20ee732583d6954d26c00b65cb5a8 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Tue, 15 Nov 2022 21:02:44 +0000
Subject: [PATCH 0941/1922] Symintify numel(), infer_size,
 prims.elementwise_meta (#88956)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88956
Approved by: https://github.com/ezyang
---
 aten/src/ATen/ExpandUtils.cpp                 | 10 ++++----
 aten/src/ATen/ExpandUtils.h                   |  2 ++
 test/test_proxy_tensor.py                     | 25 ++++++++++++++++---
 torch/_prims/__init__.py                      | 16 +++++++++---
 torch/_refs/__init__.py                       |  4 +--
 torch/_subclasses/fake_tensor.py              |  6 +----
 torch/csrc/autograd/input_metadata.h          |  4 +--
 .../python_torch_functions_manual.cpp         |  2 +-
 torch/fx/experimental/symbolic_shapes.py      |  3 ---
 torch/fx/traceback.py                         |  2 +-
 10 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp
index a44005a2ef815..ee846c9b82e34 100644
--- a/aten/src/ATen/ExpandUtils.cpp
+++ b/aten/src/ATen/ExpandUtils.cpp
@@ -13,8 +13,8 @@ TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size) {
 
 namespace {
 // NOTE: are_expandable did a similar check, please keep them sync if change is needed
-template <typename Container>
-Container infer_size_impl(IntArrayRef a, IntArrayRef b) {
+template <typename Container, typename ArrayType>
+Container infer_size_impl(ArrayType a, ArrayType b) {
   size_t dimsA = a.size();
   size_t dimsB = b.size();
   size_t ndim = dimsA > dimsB ? dimsA : dimsB;
@@ -25,8 +25,8 @@ Container infer_size_impl(IntArrayRef a, IntArrayRef b) {
     ptrdiff_t offset = ndim - 1 - i;
     ptrdiff_t dimA = dimsA - 1 - offset;
     ptrdiff_t dimB = dimsB - 1 - offset;
-    int64_t sizeA = (dimA >= 0) ? a[dimA] : 1;
-    int64_t sizeB = (dimB >= 0) ? b[dimB] : 1;
+    auto sizeA = (dimA >= 0) ? a[dimA] : 1;
+    auto sizeB = (dimB >= 0) ? b[dimB] : 1;
 
     TORCH_CHECK(
         sizeA == sizeB || sizeA == 1 || sizeB == 1,
@@ -35,7 +35,7 @@ Container infer_size_impl(IntArrayRef a, IntArrayRef b) {
         ") at non-singleton dimension ", i);
 
       // 1s map to the other size (even 0).
-      expandedSizes[i] = sizeA == 1 ? sizeB : sizeA;
+      expandedSizes[i] = sizeA == 1 ? std::move(sizeB) : std::move(sizeA);
   }
 
   return expandedSizes;
diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h
index 786cbf132cd77..9e48421e540fe 100644
--- a/aten/src/ATen/ExpandUtils.h
+++ b/aten/src/ATen/ExpandUtils.h
@@ -21,6 +21,8 @@ namespace at {
 
 TORCH_API std::vector<int64_t> infer_size(IntArrayRef a, IntArrayRef b);
 TORCH_API DimVector infer_size_dimvector(IntArrayRef a, IntArrayRef b);
+TORCH_API SymDimVector
+infer_size_symdimvector(SymIntArrayRef a, SymIntArrayRef b);
 
 // Named type instead of a pair/tuple so that we can be sure to
 // construct the vectors in place and get NRVO.
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 24efcab9e5cb6..59b08eea8dce8 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -13,6 +13,7 @@
 from torch._subclasses.fake_tensor import DynamicOutputShapeException
 
 from torch._decomp import decomposition_table
+from torch.fx.experimental.symbolic_shapes import sym_float
 from torch.testing._internal.common_device_type import ops
 from torch._C import _disabled_torch_function_impl
 from torch.fx.experimental.proxy_tensor import make_fx, DecompositionInterpreter, get_isolated_graphmodule, has_proxy
@@ -719,7 +720,6 @@ def deco(cls):
 
 @skipIfNoSympy
 @xfail_inherited_tests([
-    "test_mode_tracing_factory_function",
     "test_make_fx_overloads",
     "test_trace_subclasses",
 ])
@@ -961,8 +961,27 @@ def f(x):
         # happened afterwards
         self.assertTrue(meta_inp.meta['val'].shape[0].get_pyobj().expr == 3)
 
-
-
+    def test_elementwise_meta_with_sym_numbers(self):
+        def f(x, offset, as_sym_float=False):
+            x0 = x.size()[0]
+            if as_sym_float:
+                x0 = sym_float(x0)
+            return torch.add(x0, offset)
+
+        fx_g = make_fx(f, tracing_mode="symbolic")(torch.rand(2, 3), 2.0, False)
+        meta_add = _get_node(fx_g, lambda x: x.target == aten.add.Tensor)
+        self.assertEqual(meta_add.meta['val'].shape, ())
+        self.assertEqual(meta_add.meta['val'].dtype, torch.float32)
+
+        fx_g = make_fx(f, tracing_mode="symbolic")(torch.rand(2, 3), 2, False)
+        meta_add = _get_node(fx_g, lambda x: x.target == aten.add.Tensor)
+        self.assertEqual(meta_add.meta['val'].shape, ())
+        self.assertEqual(meta_add.meta['val'].dtype, torch.int64)
+
+        fx_g = make_fx(f, tracing_mode="symbolic")(torch.rand(2, 3), 2, True)
+        meta_add = _get_node(fx_g, lambda x: x.target == aten.add.Tensor)
+        self.assertEqual(meta_add.meta['val'].shape, ())
+        self.assertEqual(meta_add.meta['val'].dtype, torch.float32)
 
     def test_return_symint(self):
         def f(x):
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index c40960a22445c..da8d9af723acf 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -31,6 +31,7 @@
 )
 from torch._prims_common.wrappers import backwards_not_supported
 from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
+from torch.fx.experimental.symbolic_shapes import sym_float
 from torch.overrides import handle_torch_function, has_torch_function
 from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten
 
@@ -390,11 +391,18 @@ def _elementwise_meta(
         return TensorMeta(device=device, shape=shape, strides=strides, dtype=dtype)
 
     # Number case
-    # NOTE: this case is not currently exercised
     # TODO: fix number type promotion (bool, complex->float)
-    assert not isinstance(number, torch.SymInt), "NYI"
-    assert not isinstance(number, torch.SymFloat), "NYI"
-    return TensorMeta(number)
+
+    # For now for symint/float, just implementing the common / simple cases of (int,float,symint,symfloat)
+    seen_float = False
+    if isinstance(number, (torch.SymInt, torch.SymFloat)):
+        for a in args:
+            assert isinstance(a, (int, float, torch.SymInt, torch.SymFloat)), "NYI"
+            seen_float = seen_float or isinstance(a, (float, torch.SymFloat))
+        if seen_float:
+            number = sym_float(number)
+
+    return TensorMeta(number)  # type: ignore[arg-type]
 
 
 def _complex_only_elementwise_meta(*args, **kwargs):
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index f2817f0331ac5..a0916c3f8268a 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -723,10 +723,10 @@ def nan_to_num(
         nan = 0.0
 
     if posinf is None:
-        posinf = prims.maximum_value(a.dtype)
+        posinf = torch.finfo(a.dtype).max
 
     if neginf is None:
-        neginf = prims.minimum_value(a.dtype)
+        neginf = torch.finfo(a.dtype).min
 
     result = where(isnan(a), nan, a)
 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 8dec2475df15f..5d3d3a0e32fe1 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -861,11 +861,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # and ensure that Meta kernels are dispatched to (see)
         # Fake Tensor Dispatch Keys
         # TODO - we should be use the prim aten impl
-        if (
-            "prims::" in func._schema.name
-            and len(flat_arg_fake_tensors) != 0
-            and hasattr(func, "prim_meta_impl")
-        ):
+        if "prims::" in func._schema.name and hasattr(func, "prim_meta_impl"):
             with self:
                 return func.prim_meta_impl(*args, **kwargs)
 
diff --git a/torch/csrc/autograd/input_metadata.h b/torch/csrc/autograd/input_metadata.h
index 7cb9e8aedb195..8060c11ac4575 100644
--- a/torch/csrc/autograd/input_metadata.h
+++ b/torch/csrc/autograd/input_metadata.h
@@ -125,13 +125,13 @@ struct InputMetadata {
     if (grad.is_nested()) {
       ss << at::native::get_nested_size_tensor(grad);
     } else {
-      ss << grad.sizes();
+      ss << grad.sym_sizes();
     }
     ss << " but expected shape compatible with ";
     if (is_nested_tensor()) {
       ss << shape_as_tensor();
     } else {
-      ss << c10::asIntArrayRefSlow(shape_as_dim_vector());
+      ss << shape_as_dim_vector();
     }
     return ss;
   }
diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp
index 562f5a427d380..2c4999c971eab 100644
--- a/torch/csrc/autograd/python_torch_functions_manual.cpp
+++ b/torch/csrc/autograd/python_torch_functions_manual.cpp
@@ -692,7 +692,7 @@ static PyObject* THPVariable_numel(
   }
 
   if (r.idx == 0) {
-    return wrap(r.tensor(0).numel());
+    return wrap(r.tensor(0).sym_numel());
   }
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index ae4427e2320e9..bd52760502c6b 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -261,9 +261,6 @@ def eval(cls, base, divisor):
     'floordiv': lambda a, b: FloorDiv(a, b),
 }
 
-def _nyi():
-    raise NotImplementedError()
-
 magic_methods = {
     **reflectable_magic_methods,
     'eq': lambda a, b: sympy.Eq(a, b),
diff --git a/torch/fx/traceback.py b/torch/fx/traceback.py
index a07b36b997bdb..cee7626e5c83a 100644
--- a/torch/fx/traceback.py
+++ b/torch/fx/traceback.py
@@ -54,7 +54,7 @@ def format_stack() -> List[str]:
         return current_stack.copy()
     else:
         # fallback to traceback.format_stack()
-        return traceback.format_stack()
+        return traceback.format_list(traceback.extract_stack()[:-1])
 
 
 @compatibility(is_backward_compatible=False)

From 225cf0fe68aa49a773fe31ff98b552ab138cfd8c Mon Sep 17 00:00:00 2001
From: Johannes Pitz <johannes.pitz@tum.de>
Date: Wed, 16 Nov 2022 04:38:30 +0000
Subject: [PATCH 0942/1922] Easier to understand event_dim computation (#81396)

Fixes #81254
Only easier to understand, not a real fix.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/81396
Approved by: https://github.com/fritzo, https://github.com/kit1980
---
 .../distributions/transformed_distribution.py | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/torch/distributions/transformed_distribution.py b/torch/distributions/transformed_distribution.py
index 9d7bd6fbd6907..a3bab3e836a3b 100644
--- a/torch/distributions/transformed_distribution.py
+++ b/torch/distributions/transformed_distribution.py
@@ -57,26 +57,29 @@ def __init__(self, base_distribution, transforms, validate_args=None):
         base_shape = base_distribution.batch_shape + base_distribution.event_shape
         base_event_dim = len(base_distribution.event_shape)
         transform = ComposeTransform(self.transforms)
-        domain_event_dim = transform.domain.event_dim
-        if len(base_shape) < domain_event_dim:
+        if len(base_shape) < transform.domain.event_dim:
             raise ValueError("base_distribution needs to have shape with size at least {}, but got {}."
-                             .format(domain_event_dim, base_shape))
-        shape = transform.forward_shape(base_shape)
-        expanded_base_shape = transform.inverse_shape(shape)
+                             .format(transform.domain.event_dim, base_shape))
+        forward_shape = transform.forward_shape(base_shape)
+        expanded_base_shape = transform.inverse_shape(forward_shape)
         if base_shape != expanded_base_shape:
             base_batch_shape = expanded_base_shape[:len(expanded_base_shape) - base_event_dim]
             base_distribution = base_distribution.expand(base_batch_shape)
-        reinterpreted_batch_ndims = domain_event_dim - base_event_dim
+        reinterpreted_batch_ndims = transform.domain.event_dim - base_event_dim
         if reinterpreted_batch_ndims > 0:
             base_distribution = Independent(base_distribution, reinterpreted_batch_ndims)
         self.base_dist = base_distribution
 
         # Compute shapes.
-        event_dim = transform.codomain.event_dim + max(base_event_dim - domain_event_dim, 0)
-        assert len(shape) >= event_dim
-        cut = len(shape) - event_dim
-        batch_shape = shape[:cut]
-        event_shape = shape[cut:]
+        transform_change_in_event_dim = transform.codomain.event_dim - transform.domain.event_dim
+        event_dim = max(
+            transform.codomain.event_dim,  # the transform is coupled
+            base_event_dim + transform_change_in_event_dim  # the base dist is coupled
+        )
+        assert len(forward_shape) >= event_dim
+        cut = len(forward_shape) - event_dim
+        batch_shape = forward_shape[:cut]
+        event_shape = forward_shape[cut:]
         super(TransformedDistribution, self).__init__(batch_shape, event_shape, validate_args=validate_args)
 
     def expand(self, batch_shape, _instance=None):

From d6b1bf91eb5bb74db04bb42f9d659eaf4ccd5824 Mon Sep 17 00:00:00 2001
From: Kenichi Maehashi <webmaster@kenichimaehashi.com>
Date: Wed, 16 Nov 2022 05:07:51 +0000
Subject: [PATCH 0943/1922] Add an option to include actual license terms to
 the output (#85624)

When building products using PyTorch, it is often required to display license terms for all dependencies.
The feature itself has been implemented in #81500 but it seems there are no options to enable it.
This PR implements the option.

cc/ @mattip @rgommers
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85624
Approved by: https://github.com/rgommers, https://github.com/seemethere
---
 third_party/build_bundled.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/third_party/build_bundled.py b/third_party/build_bundled.py
index 4da1b84a6f32e..d60a2c1354fd2 100644
--- a/third_party/build_bundled.py
+++ b/third_party/build_bundled.py
@@ -181,9 +181,14 @@ def squeeze(t):
         ),
         help="location to output new bundled licenses file",
     )
-
+    parser.add_argument(
+        "--include-files",
+        action="store_true",
+        default=False,
+        help="include actual license terms to the output",
+    )
     args = parser.parse_args()
     fname = args.out_file
     print(f"+ Writing bundled licenses to {args.out_file}")
     with open(fname, 'w') as fid:
-        create_bundled(third_party, fid)
+        create_bundled(third_party, fid, args.include_files)

From e77bbf1746a64bf7f8c3fd8b97d4b8a08ac0a7be Mon Sep 17 00:00:00 2001
From: Jiawen Liu <jiawenl@meta.com>
Date: Wed, 16 Nov 2022 06:27:13 +0000
Subject: [PATCH 0944/1922] [Inductor] Support Shape Padding for aten.mm in
 Inductor (#89086)

Summary: Support shape padding for aten.mm in Inductor (originally from [#88709](https://github.com/pytorch/pytorch/pull/88709))

Differential Revision: D41315078

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89086
Approved by: https://github.com/jianyuh
---
 torch/_inductor/decomposition.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 44bfd46505a2e..3254f174b495b 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -267,6 +267,31 @@ def should_pad_bench(mat1, mat2, op, input=None):
             return ori_time > pad_time * 2
 
 
+@register_decomposition([aten.mm])
+def mm_decomp(mat1, mat2):
+    if (
+        config.shape_padding
+        and check_device_dtype(mat1, mat2)
+        and should_pad_bench(mat1, mat2, torch.ops.aten.mm)
+    ):
+        m_padded_length = get_padded_length(mat1.shape[0])
+        k_padded_length = get_padded_length(mat1.shape[1])
+        n_padded_length = get_padded_length(mat2.shape[1])
+
+        if k_padded_length != 0:
+            mat1 = pad_dim(mat1, k_padded_length, 1)
+            mat2 = pad_dim(mat2, k_padded_length, 0)
+            return torch.ops.aten.mm(mat1, mat2)
+        elif m_padded_length != 0:
+            mat1 = pad_dim(mat1, m_padded_length, 0)
+            return torch.ops.aten.mm(mat1, mat2)[:-m_padded_length, :]
+        elif n_padded_length != 0:
+            mat2 = pad_dim(mat2, n_padded_length, 1)
+            return torch.ops.aten.mm(mat1, mat2)[:, :-n_padded_length]
+
+    return NotImplemented  # go directly to lowering
+
+
 @register_decomposition([aten.bmm])
 def bmm_decomp(mat1, mat2):
     if (

From 2fea6a56820376e557d80fe8f58f3845a002a6ed Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 16 Nov 2022 07:44:41 +0000
Subject: [PATCH 0945/1922] Upload CSV test reports from inductor (#89112)

Inductor test report artifacts are now on HUD but its files are in CSV format instead of the default XML files from pytest or unittest that we expect. So this PR uploads both suffixes

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89112
Approved by: https://github.com/desertfire
---
 .github/actions/upload-test-artifacts/action.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml
index 624c4895155a7..9fd2342601f11 100644
--- a/.github/actions/upload-test-artifacts/action.yml
+++ b/.github/actions/upload-test-artifacts/action.yml
@@ -34,7 +34,7 @@ runs:
       run: |
         # Remove any previous test reports if they exist
         rm -f test-reports-*.zip
-        zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'
+        zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' -i '*.csv'
 
     - name: Zip usage log for upload
       if: runner.os != 'Windows' && !inputs.use-gha
@@ -67,7 +67,7 @@ runs:
         FILE_SUFFIX: ${{ inputs.file-suffix }}
       run: |
         # -ir => recursive include all files in pattern
-        7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml'
+        7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' -ir'!test\*.csv'
 
     - name: Zip usage log for upload
       if: runner.os == 'Windows' && !inputs.use-gha
@@ -127,8 +127,11 @@ runs:
         # Add the run attempt, see [Artifact run attempt]
         name: test-reports-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
         retention-days: 14
-        if-no-files-found: error
-        path: test/**/*.xml
+        # Don't want to fail the workflow here because not all workflows have csv files
+        if-no-files-found: ignore
+        path: |
+          test/**/*.xml
+          test/**/*.csv
 
     - name: Store Usage Logs on Github
       uses: actions/upload-artifact@v3

From 6b29b7754b1f0e9da9c9c621b8a537516e3b7fa7 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 15 Nov 2022 08:04:37 +0000
Subject: [PATCH 0946/1922] [dtensor] PART 1: move DeviceMesh and placement to
 core distributed (#88549)

This PR creates `torch.distributed._tensor` package and moves
DeviceMesh, PlacementTypes to it

part of https://github.com/pytorch/pytorch/issues/88838
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88549
Approved by: https://github.com/fduwjj
---
 torch/distributed/_tensor/__init__.py        |   0
 torch/distributed/_tensor/device_mesh.py     | 506 +++++++++++++++++++
 torch/distributed/_tensor/placement_types.py | 432 ++++++++++++++++
 3 files changed, 938 insertions(+)
 create mode 100644 torch/distributed/_tensor/__init__.py
 create mode 100644 torch/distributed/_tensor/device_mesh.py
 create mode 100644 torch/distributed/_tensor/placement_types.py

diff --git a/torch/distributed/_tensor/__init__.py b/torch/distributed/_tensor/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/torch/distributed/_tensor/device_mesh.py b/torch/distributed/_tensor/device_mesh.py
new file mode 100644
index 0000000000000..5ca3f8c6159b1
--- /dev/null
+++ b/torch/distributed/_tensor/device_mesh.py
@@ -0,0 +1,506 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import warnings
+from typing import List, Optional, Sequence, TypeVar, Union
+import torch
+from torch.distributed.distributed_c10d import (
+    all_gather,
+    all_reduce,
+    broadcast,
+    get_rank,
+    get_world_size,
+    get_global_rank,
+    ReduceOp,
+    GroupMember,
+    scatter,
+    _get_default_group,
+    reduce_scatter,
+    new_group,
+    ProcessGroup,
+    all_to_all,
+    Work,
+)
+
+_global_device_mesh: Optional["DeviceMesh"] = None
+
+
+def get_global_device_mesh() -> "DeviceMesh":
+    global _global_device_mesh
+    assert (
+        _global_device_mesh is not None
+    ), "Could not get a default device mesh!"
+    return _global_device_mesh
+
+
+def set_global_device_mesh(mesh: Optional["DeviceMesh"]) -> None:
+    global _global_device_mesh
+    _global_device_mesh = mesh
+
+
+# We want a type for "can be passed to torch.as_tensor()";
+# this is a recursive sequence type, which isn't fully supported
+# yet in python. This construct simulates that up to depth 7.
+T = TypeVar("T")
+_L = Union[T, Sequence[T]]
+NDIntList = _L[_L[_L[_L[_L[_L[_L[int]]]]]]]
+
+MeshExprT = Union[
+    torch.Tensor,
+    NDIntList,
+]
+
+
+class DeviceMesh(object):
+    """
+    DeviceMesh represents a mesh of devices, where layout of devices could be
+    represented as a n-d dimension array, and each value of the n-d dimensional
+    array is the global id of the default process group ranks.
+
+    DeviceMesh could be used to describe the layout of devices across the cluster,
+    and serves as a proxy for communication among the device lists within the cluster.
+
+    We use the default ProcessGroup in this DeviceMesh class to implement proper
+    communications. Note that we also add collective wrappers in this class. This is
+    used to decouple detailed communication backend with the underlying
+    DTensor implementation.
+
+    DeviceMesh can be used as a context manager.
+    Args:
+        device_type (str): device type of the mesh. Currently supports: cpu, cuda.
+        mesh (ndarray): could be a multi-dimension array or an integer tensor that
+            describes the layout of devices, the ids are global ids of the
+            default process group.
+        dim_groups (List[ProcessGroup], optional): The ProcessGroup used per mesh
+            dimension.
+
+    Returns:
+        A :class:`DeviceMesh` object
+
+    Example (2 host with 4 GPUs each):
+        ```
+        # The following program runs on each process/rank in SPMD manner.
+        # initialized default world
+        torch.distributed.init_process_group(backend="nccl", world_size=8)
+        # initialize device mesh as (2, 4) to represent the topology
+        # of cross-host(dim 0), and within-host (dim 1)
+        mesh = DeviceMesh(device_type="cuda",
+                          mesh=[
+                            [0, 1, 2, 3],
+                            [4, 5, 6, 7]
+                          ])
+        ```
+        A reduction over the first dimension of mesh will reduce across
+        columns (0, 4), .. and (3, 7), a reduction over the second dimension
+        of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7)
+
+    """
+
+    device_type: str
+    mesh: torch.Tensor
+    _backend: str
+
+    def __init__(
+        self,
+        device_type: str,
+        mesh: MeshExprT,
+        dim_groups: Optional[List[ProcessGroup]] = None,
+    ) -> None:
+        self.device_type = device_type
+        self.mesh = (
+            mesh.detach()
+            if isinstance(mesh, torch.Tensor)
+            else torch.tensor(mesh, dtype=torch.int)
+        )
+        default_pg = _get_default_group()
+        self._backend = default_pg._get_backend_name()
+        # TODO: if user want to pass pg_options, offer a way to do it
+        # check default pg backend, should support device_type
+        if device_type == "cpu":
+            assert (
+                self._backend == "gloo"
+            ), f"ProcessGroup backend: {self._backend} not supporting CPU!"
+        elif device_type == "cuda":
+            if self._backend == "gloo":
+                warnings.warn(
+                    "We recommend using nccl backend for cuda device type, gloo backend might only have partial support!"
+                )
+            assert self._backend == "gloo" or self._backend == "nccl"
+        else:
+            raise RuntimeError(
+                f"DeviceMesh only support cpu or cuda device type, but got {device_type}"
+            )
+
+        world_size = get_world_size()
+        if self.mesh.numel() > world_size:
+            raise RuntimeError(
+                f"Mesh should not be bigger than default world size, but found {self.mesh.numel()} ranks!"
+            )
+
+        unique_mesh_values = self.mesh.unique(sorted=True)
+        if unique_mesh_values.numel() != self.mesh.numel():
+            raise RuntimeError(
+                f"DeviceMesh cannot have duplicate values, but found {self.mesh.tolist()}"
+            )
+
+        # coordinates of this rank on the mesh
+        rank_coords = (self.mesh == get_rank()).nonzero()
+        assert rank_coords.size(0) in (0, 1)
+        self._coordinate_on_dim: Optional[List[int]] = (
+            rank_coords[0].tolist() if rank_coords.size(0) > 0 else None
+        )
+
+        # groups created by dimension, each dimension should have exact
+        # one valid process group per rank
+        self._dim_groups: List[ProcessGroup] = []
+        if dim_groups is not None:
+            # if user hand creating dimension based groups
+            # we just take it and use it for communication
+            if not isinstance(dim_groups, list):
+                raise RuntimeError(
+                    "dim_groups expected to be Optional[List[ProcessGroup]]"
+                )
+
+            for group in dim_groups:
+                if not isinstance(group, ProcessGroup):
+                    raise RuntimeError(
+                        f"found object in dim_groups that is not a ProcessGroup: {group}"
+                    )
+
+            if self.get_rank() in self.mesh:
+                if len(dim_groups) != self.mesh.ndim:
+                    raise RuntimeError(
+                        f"length of dim_groups ({len(dim_groups)}) expected to be equal to mesh.ndim ({self.mesh.ndim})"
+                    )
+            else:
+                if len(dim_groups) != 0:
+                    raise RuntimeError(
+                        f"length of dim_groups ({len(dim_groups)}) expected to be equal to 0 on rank {self.get_rank()} "
+                        f"for mesh {self.mesh}"
+                    )
+
+            self._dim_groups = dim_groups
+            return
+
+        if self.mesh.ndim == 1 and unique_mesh_values[-1] == world_size - 1:
+            # if the mesh is the same as world_pg, we just append the default
+            # pg to the first dim goups, as new_group cannot have the exact
+            # same ranks as world
+            self._dim_groups.append(default_pg)
+        else:
+            # create sub pgs base on the mesh argument specified
+            # handle multi-dim mesh, create subgroups by
+            # looping over the pg_ranks_by_dim for each dim
+            for dim in range(self.mesh.ndim):
+                # swap the current dim to the last dim
+                # then reshape to flatten out other dims
+                pg_ranks_by_dim = self.mesh.swapdims(-1, dim).reshape(
+                    -1, self.mesh.size(dim)
+                )
+
+                # multi-dim mesh, create subgroups by
+                # looping over the pg_ranks for each dim
+                # and append the groups
+                for dim_mesh in pg_ranks_by_dim:
+                    subgroup_ranks = dim_mesh.tolist()
+                    # call new_group regardless of the current rank in the
+                    # pg or not, it's required that all ranks participate
+                    # in subgroup construction
+                    new_subgroup = new_group(
+                        ranks=subgroup_ranks, backend=self._backend
+                    )
+                    # only add to dim_groups if the current rank in the subgroup
+                    if self.get_rank() in subgroup_ranks:
+                        if len(self._dim_groups) > dim:
+                            raise RuntimeError(
+                                f"Each device mesh dimension should get only one process group, but got {self.get_rank} "
+                                f"in {subgroup_ranks}!"
+                            )
+                        self._dim_groups.append(new_subgroup)
+
+    def __enter__(self) -> "DeviceMesh":
+        # set global device_mesh to this instance
+        set_global_device_mesh(self)
+        return self
+
+    # pyre-fixme[2]: Parameter must be annotated.
+    def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
+        # unset global device mesh
+        set_global_device_mesh(None)
+
+    def __repr__(self) -> str:
+        return f"DeviceMesh:({self.mesh.tolist()})"
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, DeviceMesh):
+            return False
+        if id(self) == id(other):
+            return True
+        return self.mesh.equal(other.mesh)
+
+    def get_dim_groups(self) -> List[ProcessGroup]:
+        return self._dim_groups
+
+    # pyre-fixme[3]: Return type must be annotated.
+    def size(self, dim: int = 0):
+        return self.mesh.size(dim)
+
+    @property
+    def ndim(self) -> int:
+        return self.mesh.ndim
+
+    def backend(self) -> str:
+        return self._backend
+
+    def get_rank(self) -> int:
+        return get_rank()
+
+    def get_coordinate_on_dim(self, dim: int) -> Optional[int]:
+        """
+        Return the relative index of this rank relative to a given
+        dimension of the mesh. If this rank is not part of the mesh, return None.
+        """
+        return self._coordinate_on_dim[dim] if self._coordinate_on_dim else None
+
+    def scatter(
+        self,
+        output: torch.Tensor,
+        scatter_list: List[torch.Tensor],
+        mesh_dim: int = 0,
+        async_op: bool = False,
+    ) -> Optional[Work]:
+        """
+        scatter a list of tensors to a device mesh dimension. We by default
+        use the first rank of the mesh dimension as the source of truth, i.e
+        for a 2d mesh [[0, 1], [2, 3]], if we scatter on mesh_dim = 1, we will
+        scatter the tensor list on rank 0 to rank 0/1, and tensor lista on rank
+        2 to rank 2/3.
+
+        Args:
+            tensor (torch.Tensor): the tensor to receive the scattered list.
+            scatter_list (List[torch.Tensor]): the tensor list to be scattered.
+            mesh_dim (int, optional): indicate which mesh dimension we want
+                to scatter on, we by default choose the first rank on the
+                mesh dimension as source of truth.
+
+        Returns:
+            A :class:`Work` object
+        """
+        dim_group = self._dim_groups[mesh_dim]
+        # src need to be global rank
+        src_for_dim = 0
+        if dim_group is not GroupMember.WORLD:
+            src_for_dim = get_global_rank(dim_group, 0)
+
+        if src_for_dim == get_rank():
+            fut = scatter(
+                output,
+                scatter_list=scatter_list,
+                src=src_for_dim,
+                group=dim_group,
+                async_op=async_op,
+            )
+        else:
+            fut = scatter(
+                output,
+                scatter_list=None,
+                src=src_for_dim,
+                group=dim_group,
+                async_op=async_op,
+            )
+
+        return fut
+
+    def broadcast(
+        self,
+        tensor: torch.Tensor,
+        mesh_dim: int = 0,
+        async_op: bool = False,
+    ) -> Optional[Work]:
+        """
+        broadcast the tensor to a device mesh dimension. We by default
+        use the first rank of the mesh dimension as the source of truth, i.e
+        for a 2d mesh [[0, 1], [2, 3]], if we broadcast on mesh_dim = 1, we will
+        broadcast the tensor on rank 0 to rank 0/1, and tensor on rank 2
+        to rank 2/3.
+
+        Args:
+            tensor (torch.Tensor): tensor to broadcast.
+            mesh_dim (int, optional): indicate which mesh dimension we want
+                to scatter on, we by default choose the first rank on the
+                mesh dimension as source of truth.
+
+        Returns:
+            A :class:`Work` object
+        """
+        dim_group = self._dim_groups[mesh_dim]
+        # src need to be global rank
+        src_for_dim = 0
+        if dim_group is not GroupMember.WORLD:
+            src_for_dim = get_global_rank(dim_group, 0)
+
+        return broadcast(
+            tensor, src=src_for_dim, group=dim_group, async_op=async_op
+        )
+
+    def all_gather(
+        self,
+        tensor_list: List[torch.Tensor],
+        tensor: torch.Tensor,
+        mesh_dim: int = 0,
+        async_op: bool = False,
+    ) -> Optional[Work]:
+        """
+        all_gather the tensor on each rank to the tensor_list on a
+        device mesh dimension.
+
+        Args:
+            tensor_list (List[torch.Tensor]): The gathered tensor list.
+            tensor (torch.Tensor): tensor to be gathered on each rank.
+            mesh_dim (int, optional): indicate which mesh dimension we want
+                to scatter on, we by default choose the first rank on the
+                mesh dimension as source of truth.
+
+        Returns:
+            A :class:`Work` object
+        """
+        dim_group = self._dim_groups[mesh_dim]
+        return all_gather(
+            tensor_list, tensor, group=dim_group, async_op=async_op
+        )
+
+    def all_reduce(
+        self,
+        tensor: torch.Tensor,
+        op: ReduceOp = ReduceOp.SUM,  # type: ignore[assignment]
+        mesh_dim: int = 0,
+        async_op: bool = False,
+    ) -> Optional[Work]:
+        """
+        all_reduce the tensor on each rank on a device mesh dimension, and
+        return an output tensor on each rank after all_reduce.
+
+        Args:
+            tensor (torch.Tensor): tensor to be all_reduced on each rank.
+            op (:class:`torch.distributed.distributed_c10d.ReduceOp, optional):
+                the reduction op of all_reduce (i.e. ReduceOp.SUM)
+            mesh_dim (int, optional): indicate which mesh dimension we want
+                to reduce on.
+
+        Returns:
+            A :class:`Work` object
+        """
+        dim_group = self._dim_groups[mesh_dim]
+        return all_reduce(tensor, op=op, group=dim_group, async_op=async_op)
+
+    def reduce_scatter(
+        self,
+        output: torch.Tensor,
+        input_list: List[torch.Tensor],
+        op: ReduceOp = ReduceOp.SUM,  # type: ignore[assignment]
+        mesh_dim: int = 0,
+        async_op: bool = False,
+    ) -> Optional[Work]:
+        """
+        reduce the input_list on each rank on a device mesh dimension, and scatter
+        the results to the output tensor on each rank.
+
+        Args:
+            output (torch.Tensor): tensor to receive the scattered result.
+            input_list (List[torch.Tensor]): tensor list to be reduced and scattered
+                and scattered on each rank.
+            op (:class:`torch.distributed.distributed_c10d.ReduceOp, optional):
+                the reduction op of reduce_scatter (i.e. ReduceOp.SUM)
+            mesh_dim (int, optional): indicate which mesh dimension we want
+                to scatter on.
+
+        Returns:
+            A :class:`Work` object
+        """
+        if self._backend == "nccl":
+            dim_group = self._dim_groups[mesh_dim]
+            fut = reduce_scatter(
+                output, input_list, op=op, group=dim_group, async_op=async_op
+            )
+
+        elif self._backend == "gloo":
+            # it's gloo, which does not have reduce_scatter
+            # we have to do all_reduce + scatter
+            warnings.warn(
+                "ProcessGroupGloo does not support reduce_scatter, falling back with all reduce!"
+            )
+            my_coordinate = self.get_coordinate_on_dim(mesh_dim)
+            # TODO: what should happen if rank is not in the mesh?
+            # see issue https://github.com/pytorch/tau/pull/492
+            assert (
+                my_coordinate is not None
+            ), "Rank if not part of mesh"  # TODO: figure out behavior here
+            fut = None
+            flattened_list = []
+            offset_list = []
+
+            offset = 0
+            for input in input_list:
+                offset_list.append(offset)
+                offset += input.numel()
+                flattened_list.append(input.flatten())
+
+            # all reduce since gloo does not support reduce_scatter
+            flat_tensor = torch.cat(flattened_list).clone(
+                memory_format=torch.contiguous_format
+            )
+            fut = self.all_reduce(
+                flat_tensor, op=op, mesh_dim=mesh_dim, async_op=async_op
+            )
+            # scatter the tensor
+            output_offset = offset_list[my_coordinate]
+            output.copy_(
+                flat_tensor[
+                    output_offset : output_offset + output.numel()
+                ].view(output.shape)
+            )
+        else:
+            raise RuntimeError(
+                f"backend {self._backend} does not support reduce_scatter!"
+            )
+        return fut
+
+    # TODO: test uneven split on GLOO and NCCL
+    def all_to_all(
+        self,
+        output_tensor_list: List[torch.Tensor],
+        input_tensor_list: List[torch.Tensor],
+        mesh_dim: int = 0,
+        async_op: bool = False,
+    ) -> Optional[Work]:
+        dim_group = self._dim_groups[mesh_dim]
+
+        work = None
+        # no direct dist.all_to_all support on 'gloo' so we manually do scatters
+        if self.backend() == "gloo":
+            # TODO: pull the handle of uneven case in #492
+            dim_group_size = get_world_size(dim_group)
+            for i in range(dim_group_size):
+                # src need to be global rank
+                src_for_dim = i
+                if dim_group is not GroupMember.WORLD:
+                    src_for_dim = get_global_rank(dim_group, i)
+
+                work = scatter(
+                    output_tensor_list[i],
+                    input_tensor_list if self.get_rank() == src_for_dim else [],
+                    group=dim_group,
+                    src=src_for_dim,
+                    async_op=async_op,
+                )
+
+        elif self.backend() == "nccl":
+            work = all_to_all(
+                output_tensor_list,
+                input_tensor_list,
+                dim_group,
+                async_op=async_op,
+            )
+        else:
+            raise RuntimeError(
+                f"DeviceMesh does not support all-to-all collective operations on {self.backend()} backend."
+            )
+        return work
diff --git a/torch/distributed/_tensor/placement_types.py b/torch/distributed/_tensor/placement_types.py
new file mode 100644
index 0000000000000..f2df183b046db
--- /dev/null
+++ b/torch/distributed/_tensor/placement_types.py
@@ -0,0 +1,432 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+from dataclasses import dataclass
+from typing import Optional, List, Sequence, Tuple, cast
+
+import torch
+import torch.distributed.distributed_c10d as c10d
+from torch.distributed._spmd.comm_tensor import CommTensor
+
+from torch.distributed._tensor.device_mesh import DeviceMesh
+
+
+class Placement(object):
+    # base class Placement type
+
+    # convenient utils to check for placement types
+    def is_shard(self, dim: Optional[int] = None) -> bool:
+        if dim is not None and isinstance(self, Shard):
+            return self.dim == dim
+        else:
+            return isinstance(self, Shard)
+
+    def is_replicate(self) -> bool:
+        return isinstance(self, Replicate)
+
+    def is_partial(self) -> bool:
+        return isinstance(self, _Partial)
+
+
+@dataclass
+class Shard(Placement):
+    # shard placement, shard on a dim
+    dim: int
+
+    def _split_tensor(
+        self,
+        tensor: torch.Tensor,
+        num_chunks: int,
+        *,
+        with_padding: bool = True,
+        contiguous: bool = True,
+    ) -> Tuple[List[torch.Tensor], int]:
+        # NOTE: For with_padding option, we pad the tensor on each rank before calling
+        # the collectives (i.e. scatter/all_gather, etc.). This is because for gloo
+        # backend, it does not support uneven collectives, nccl supports some, but
+        # it might be slow compared to even size collective, we need to pad tensor
+        # before really calling the collective, and unpad/narrow it afterwards
+        # TODO: consider if we should remove this logic once ProcessGroupGloo
+        # support uneven list, and collective perfomance on par
+        assert (
+            self.dim <= tensor.ndim
+        ), f"Sharding dim {self.dim} greater than tensor ndim {tensor.ndim}"
+        assert (
+            tensor.size(self.dim) >= num_chunks
+        ), f"Tensors to be sharded on dim {self.dim} must be at least as large as "
+        f"the number of devices in that dimension {num_chunks}"
+        # split tensor over dimension `dim` into n slices with padding if necessary
+        tensor_list = list(tensor.tensor_split(num_chunks, self.dim))
+        idx_start_to_pad = tensor.size(self.dim) % num_chunks
+        if with_padding or contiguous:
+            shard_list = []
+            for i, shard in enumerate(tensor_list):
+                if (
+                    with_padding
+                    and idx_start_to_pad != 0
+                    and i >= idx_start_to_pad
+                ):
+                    shard = self._pad_tensor(shard)
+                # input tensors are expected to be congtiguous by the collective backend
+                shard = shard.contiguous() if contiguous else shard
+                shard_list.append(shard)
+            return shard_list, idx_start_to_pad
+        else:
+            return tensor_list, idx_start_to_pad
+
+    def _pad_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
+        # pad tensor by 1 on the shard dim
+        pad = [0, 0] * (tensor.ndim - self.dim)
+        pad[-1] = 1
+        return torch.nn.functional.pad(tensor, pad)
+
+    def _unpad_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
+        # unpad tensor by 1 on the shard dim
+        return tensor.narrow(
+            self.dim, start=0, length=tensor.size(self.dim) - 1
+        )
+
+    def _local_shard_size_on_dim(
+        self,
+        size_on_dim: int,
+        num_chunks: int,
+        rank: int,
+        return_offset: bool = False,
+    ) -> Tuple[int, int]:
+        """
+        returns the local shard size and offset on a given tensor dim
+        """
+        assert (
+            size_on_dim >= num_chunks
+        ), f"Size to be sharded on dim {self.dim} must be at least as large as the number of devices in that dimension {num_chunks}"
+        split_size, pad_idx = divmod(size_on_dim, num_chunks)
+        local_shard_size = (
+            split_size + 1 if pad_idx != 0 and rank < pad_idx else split_size
+        )
+        local_offset_on_dim = -1
+        if return_offset:
+            local_offset_on_dim = (
+                rank * split_size + pad_idx if rank >= pad_idx else rank
+            )
+        return (local_shard_size, local_offset_on_dim)
+
+    def _shard_tensor(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        """
+        shard and scatter a tensor on a mesh dimension (use coordinate
+        0 on the mesh dimension as source of truth)
+        """
+        my_coordinate = mesh.get_coordinate_on_dim(mesh_dim)
+        num_chunks = mesh.size(dim=mesh_dim)
+        # TODO: what should happen if rank is not in the mesh?
+        # see issue https://github.com/pytorch/tau/pull/492
+        assert (
+            my_coordinate is not None
+        ), "Rank if not part of mesh"  # TODO: figure out behavior here
+        scatter_list, pad_idx = self._split_tensor(
+            tensor, num_chunks, with_padding=True, contiguous=True
+        )
+        output = torch.empty_like(scatter_list[my_coordinate])
+        mesh.scatter(output, scatter_list, mesh_dim=mesh_dim)
+
+        if pad_idx != 0 and my_coordinate >= pad_idx:
+            output = self._unpad_tensor(output)
+        return output
+
+    def _reduce_shard_tensor(
+        self,
+        tensor: torch.Tensor,
+        mesh: DeviceMesh,
+        reduce_op: c10d.ReduceOp,
+        mesh_dim: int,
+    ) -> torch.Tensor:
+        """
+        reduce and scatter a tensor on a mesh dimension
+        """
+        my_coordinate = mesh.get_coordinate_on_dim(mesh_dim)
+        num_chunks = mesh.size(dim=mesh_dim)
+        # TODO: what should happen if rank is not in the mesh?
+        # see issue https://github.com/pytorch/tau/pull/492
+        assert (
+            my_coordinate is not None
+        ), "Rank if not part of mesh"  # TODO: figure out behavior here
+        scattered_list, pad_idx = self._split_tensor(
+            tensor, num_chunks, with_padding=True, contiguous=True
+        )
+        # wrap with comm tensor
+        scattered_list = [CommTensor(t) for t in scattered_list]
+        output = torch.empty_like(scattered_list[my_coordinate])
+        mesh.reduce_scatter(
+            CommTensor(output),
+            scattered_list,  # pyre-ignore[6]
+            op=reduce_op,
+            mesh_dim=mesh_dim,
+        )
+        if pad_idx != 0 and my_coordinate >= pad_idx:
+            output = self._unpad_tensor(output)
+        return output
+
+    def _to_replicate_tensor(
+        self,
+        local_tensor: torch.Tensor,
+        size: torch.Size,
+        mesh: DeviceMesh,
+        mesh_dim: int,
+    ) -> torch.Tensor:
+        """
+        This function all_gather all shards and return a tensor that
+        is replicated on the previously sharded mesh dimension
+        """
+        my_coordinate = mesh.get_coordinate_on_dim(mesh_dim)
+        num_chunks = mesh.size(dim=mesh_dim)
+        # TODO: what should happen if rank is not in the mesh?
+        # see issue https://github.com/pytorch/tau/pull/492
+        assert (
+            my_coordinate is not None
+        ), "Rank if not part of mesh"  # TODO: figure out behavior here
+        # check if it needs to pad input tensor before all_gather
+        pad_idx = size[self.dim] % num_chunks
+        if pad_idx != 0 and my_coordinate >= pad_idx:
+            local_tensor = self._pad_tensor(local_tensor).contiguous()
+
+        gathered_list = []
+        # N.B. CommTensor does not change eager mode behavior. During tracing, it
+        # makes sure communication result is properly waited before subsequent
+        # read operations.
+        for _ in range(num_chunks):
+            gathered_list.append(
+                CommTensor(
+                    torch.empty_like(
+                        local_tensor,
+                        memory_format=torch.contiguous_format,
+                    )
+                )
+            )
+
+        mesh.all_gather(gathered_list, CommTensor(local_tensor.contiguous()), mesh_dim=mesh_dim)  # type: ignore[arg-type]
+        # unpad the tensor if the input tensor was padded
+        if pad_idx != 0:
+            gathered_list = [
+                self._unpad_tensor(gathered_tensor)  # type: ignore[misc]
+                if i >= pad_idx
+                else gathered_tensor
+                for i, gathered_tensor in enumerate(gathered_list)
+            ]
+        return torch.cat(gathered_list, dim=self.dim)  # type: ignore[arg-type]
+
+
+@dataclass
+class Replicate(Placement):
+    # replicate placement
+    pass
+
+
+@dataclass
+class _Partial(Placement):
+    # This is a default partial placement with element-wise reduce op
+    # when doing reduction it follows the contract of `_to_replicate`
+    # and `_to_shard` to do the reduction and convert the local tensor
+    # to the corresponding state (replicate or shard)
+    #
+    # We can implement custom reductions as needed by subclassing this
+    # class and override those contracts.
+    reduce_op: c10d.ReduceOp.RedOpType = c10d.ReduceOp.RedOpType.SUM  # type: ignore[attr-defined]
+
+    def _to_replicate(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        # out-of-place all_reduce to replicate, since the current partial DTensor
+        # might get used by other ops as well, so we can't inplace modify it
+        cloned_local = CommTensor(
+            tensor.clone(memory_format=torch.contiguous_format)
+        )
+        mesh.all_reduce(
+            cloned_local, c10d.ReduceOp(self.reduce_op), mesh_dim=mesh_dim  # type: ignore[call-arg]
+        )
+        return cloned_local
+
+    def _to_shard(
+        self,
+        tensor: torch.Tensor,
+        mesh: DeviceMesh,
+        mesh_dim: int,
+        shard_spec: Placement,
+    ) -> torch.Tensor:
+        # by default call reduce_shard_tensor of the shard_spec.
+        shard_spec = cast(Shard, shard_spec)
+        return shard_spec._reduce_shard_tensor(
+            tensor, mesh, c10d.ReduceOp(self.reduce_op), mesh_dim  # type: ignore[call-arg]
+        )
+
+
+# used internally to propagate the placements
+@dataclass
+class DTensorSpec(object):
+    mesh: DeviceMesh
+    placements: Sequence[Placement]
+    # shape of the current dist tensor, this will be set upon
+    # construction of the DTensor, prop rule could read it, and
+    # would need to set in output spec when calculate the output
+    # sharding
+    shape: torch.Size
+    # ndim of the current dist tensor, if passed in, this would be
+    # validated with shape, if not passed in, will be generated from
+    # the shape
+    ndim: int = -1
+
+    def __post_init__(self) -> None:
+        if self.ndim == -1:
+            self.ndim = len(self.shape)
+
+    @property
+    def dim_map(self) -> List[int]:
+        """
+        dim_map is a property we derive from `placements` of
+        the distributed tensor. It simply return a list of ints
+        where dim_map[i] denotes the sharding mapping to the mesh
+        dimension, and len(dim_map) == dist_tensor.ndim
+        dim_map[i] = -1: means tensor dim i replicate on mesh
+        dim_map[i] = j: means tensor dim i shard on mesh dim j
+
+        For example, we have a dist tensor that have the shape of
+        [18, 20, 30], and device_mesh([0, 1, 2, 3]), placements:
+        [Shard(1)], the dim_map of this placement would be:
+        [-1, 1, -1]. This representation is pretty helpful during
+        sharding propagation where we could know exactly each
+        tensor dimension is sharded or not.
+
+        Note that if placements contains `_Partial`, we have to
+        explicitly deal with it, so that when we create a DTensorSpec
+        with dim_map, we could properly record the pending sums.
+        """
+        # dims mapping of dist tensor sharding
+        # return size of tensor ndim, -1 represent replicate
+        # and int >=0 represent shard on that device mesh dim
+        r = [-1] * self.ndim
+        for i, placement in enumerate(self.placements):
+            if placement.is_shard():
+                shard_dim = cast(Shard, placement).dim
+                if r[shard_dim] > -1:
+                    raise ValueError(
+                        f"Tensor dim {shard_dim} is already sharded on mesh dim {r[shard_dim]},"
+                        " DTensor operator implementation does not support things like hybrid"
+                        " sharding strategies yet (i.e. [Shard(0), Shard(0)])"
+                    )
+                r[shard_dim] = i
+        return r
+
+    @property
+    def sums(self) -> List[int]:
+        """
+        sums is a property we derive from `placements` of the
+        distributed tensor. It simply return a list of ints where
+        sums[i] denotes the pending sum (partial) on mesh dim i
+        """
+        return [
+            idx
+            for idx, placement in enumerate(self.placements)
+            if placement.is_partial()
+        ]
+
+    @property
+    def local_shape(self) -> Tuple[int, ...]:
+        """
+        Compute the shape of a local shard of the given DTensor on its current
+        coordinate of the mesh.
+        """
+        assert (
+            self.shape is not None
+        ), "DTensorSpec does not contain global shape."
+        local_shape = list(self.shape)  # start with global shape
+        for idx, placement in enumerate(self.placements):
+            mesh_dim_size = self.mesh.size(idx)
+            my_coordinate = self.mesh.get_coordinate_on_dim(idx)
+            assert my_coordinate is not None, "Rank not part of mesh!"
+            if isinstance(placement, Shard):
+                shard_dim = placement.dim
+                assert (
+                    shard_dim < self.ndim
+                ), f"Sharding dim {shard_dim} greater than tensor ndim {self.ndim}"
+                local_shard_size, _ = placement._local_shard_size_on_dim(
+                    local_shape[shard_dim], mesh_dim_size, my_coordinate
+                )
+                assert isinstance(local_shard_size, int)
+                local_shape[shard_dim] = local_shard_size
+        return tuple(local_shape)
+
+    @property
+    def local_offsets(self) -> Tuple[int, ...]:
+        """
+        Compute the offsets of a local shard of the given DTensor on its current
+        global rank. This is mostly used by distributed checkpointing to know the
+        exact offsets of the local shard.
+        """
+        assert (
+            self.shape is not None
+        ), "DTensorSpec does not contain global shape."
+        local_offsets = [0] * self.ndim
+        local_shape = list(self.shape)
+
+        for idx, placement in enumerate(self.placements):
+            mesh_dim_size = self.mesh.size(idx)
+            my_coordinate = self.mesh.get_coordinate_on_dim(idx)
+            assert my_coordinate is not None, "Rank not part of mesh!"
+            if isinstance(placement, Shard):
+                shard_dim = placement.dim
+                assert (
+                    shard_dim < self.ndim
+                ), f"Sharding dim {shard_dim} greater than tensor ndim {self.ndim}"
+                shard_size, shard_offset = placement._local_shard_size_on_dim(
+                    local_shape[shard_dim],
+                    mesh_dim_size,
+                    my_coordinate,
+                    return_offset=True,
+                )
+                local_shape[shard_dim] = shard_size
+                local_offsets[shard_dim] = shard_offset
+        return tuple(local_offsets)
+
+    @classmethod
+    def from_dim_map(
+        cls,
+        mesh: DeviceMesh,
+        dim_map: List[int],
+        sums: List[int],
+        shape: torch.Size,
+    ) -> "DTensorSpec":
+        """
+        Construct a DTensorSpec from dim_map list and pending sum.
+
+        Args:
+            mesh (class:`DeviceMesh`): device mesh to be used in the DTensorSpec
+            dim_map (List[int]): a list of integer that represents sharding on each
+                tensor dimension, see `dim_map` property doc for details
+            sums (List[int]): a list of integer that represents the dist tensor have
+                pending sum on which device mesh dimension.
+            shape (torch.Size): shape of the DTensor associated with this spec.
+
+        Return:
+            a class:`DTensorSpec` object
+        """
+        # by default replicate on device mesh dims
+        placements: List[Placement] = [Replicate() for _ in range(mesh.ndim)]
+
+        # find all mesh dims that need pending reductions
+        for s in sums:
+            placements[s] = _Partial()
+
+        for i, m in enumerate(dim_map):
+            if m >= 0:
+                placement = placements[m]
+                if placement.is_shard():
+                    placement = cast(Shard, placement)
+                    raise RuntimeError(
+                        f"DeviceMesh dimension cann't be mapped to two dimension of the same tensor: {i} and {placement.dim}"
+                    )
+                elif placement.is_partial():
+                    raise RuntimeError(
+                        f"DeviceMesh dimension {m} cannot be both shard and partial!"
+                    )
+                placements[m] = Shard(i)
+
+        return cls(mesh, placements, shape=shape, ndim=len(dim_map))

From 74dcbc234467b98d1f491d3614541ae3d4353442 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 15 Nov 2022 08:04:38 +0000
Subject: [PATCH 0947/1922] [dtensor] PART 2: move DTensor abstraction and APIs
 to core distributed (#88176)

This PR moves the core DTensor abstraction and high level APIs to
torch.distributed._tensor folder, which includes the following:
1. DTensor class
2. high level APIs (distribute_tensor/module)
3. dispatching logic
4. redistribute logic

part of https://github.com/pytorch/pytorch/issues/88838
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88176
Approved by: https://github.com/fduwjj
---
 torch/distributed/_tensor/README.md       |   3 +
 torch/distributed/_tensor/__init__.py     | 189 +++++++++++
 torch/distributed/_tensor/api.py          | 393 ++++++++++++++++++++++
 torch/distributed/_tensor/dispatch.py     | 301 +++++++++++++++++
 torch/distributed/_tensor/redistribute.py | 236 +++++++++++++
 torch/distributed/_tensor/utils.py        |  53 +++
 6 files changed, 1175 insertions(+)
 create mode 100644 torch/distributed/_tensor/README.md
 create mode 100644 torch/distributed/_tensor/api.py
 create mode 100644 torch/distributed/_tensor/dispatch.py
 create mode 100644 torch/distributed/_tensor/redistribute.py
 create mode 100644 torch/distributed/_tensor/utils.py

diff --git a/torch/distributed/_tensor/README.md b/torch/distributed/_tensor/README.md
new file mode 100644
index 0000000000000..9bbd71b764e5f
--- /dev/null
+++ b/torch/distributed/_tensor/README.md
@@ -0,0 +1,3 @@
+# Distributed Tensor
+
+This is a prototype distributed tensor implementation that implements most of the basic parts in the RFC https://docs.google.com/document/d/15R3fmoPbzedlKSjtpQ97HFPidp9QTXLEap6gyIvRrMY/edit#
diff --git a/torch/distributed/_tensor/__init__.py b/torch/distributed/_tensor/__init__.py
index e69de29bb2d1d..ba09f2fbb6907 100644
--- a/torch/distributed/_tensor/__init__.py
+++ b/torch/distributed/_tensor/__init__.py
@@ -0,0 +1,189 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import Optional, Sequence, Callable, cast
+
+import torch
+import torch.nn as nn
+from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.device_mesh import DeviceMesh, get_global_device_mesh
+from torch.distributed._tensor.placement_types import Placement, Shard, Replicate
+
+
+# Import all builtin dist tensor ops
+# import torch.distributed._tensor.ops
+
+
+def distribute_tensor(
+    tensor: torch.Tensor,
+    device_mesh: Optional[DeviceMesh] = None,
+    placements: Optional[Sequence[Placement]] = None,
+) -> DTensor:
+    """
+    Distribute a torch.Tensor to the `device_mesh` according to the `placements`
+    specified. The rank of `device_mesh` and `placements` must be the same.
+
+    Args:
+        tensor (torch.Tensor): torch.Tensor to be distributed. Note that if you
+            want to shard a tensor on a dimension that is not evenly divisible by
+            the number of devices in that mesh dimension, we use `torch.tensor_split`
+            semantic to shard the tensor and scatter the shards.
+        device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to distribute the
+            tensor, if not specified, must be called under a DeviceMesh context
+            manager, default: None
+        placements (List[:class:`Placement`], optional): the placements that
+            describes how to place the tensor on DeviceMesh, must have the same
+            number of elements as `device_mesh.ndim`. If not specified, we will
+            by default replicate the tensor across the `device_mesh` from the
+            first rank of each dimension of the `device_mesh`.
+
+    Returns:
+        A :class:`DTensor` object
+    """
+    # get default device mesh if there's nothing specified
+    device_mesh = (
+        get_global_device_mesh() if device_mesh is None else device_mesh
+    )
+    # convert tensor to the correponding device type if it's not in that device type
+    tensor = tensor.to(device_mesh.device_type)
+    # set default placements to replicated if not specified
+    if placements is None:
+        placements = [Replicate() for _ in range(device_mesh.ndim)]
+
+    if len(placements) != device_mesh.ndim:
+        raise ValueError(
+            f"`placements` must have the same length as `device_mesh.ndim`! "
+            f"Found placements length: {len(placements)}, and device_mesh.ndim: {device_mesh.ndim}."
+        )
+
+    if isinstance(tensor, DTensor):
+        # if the tensor is already a DTensor, we just need to check if the
+        # device mesh and placements are the same
+        if tensor.device_mesh != device_mesh:
+            raise ValueError(
+                f"Cannot distribute a DTensor with device mesh {tensor.device_mesh} "
+                f"to a different device mesh {device_mesh}."
+            )
+        if tensor.placements != placements:
+            raise ValueError(
+                f"Cannot distribute a DTensor with placements {tensor.placements} "
+                f"to a different placements {placements}. do you want to call "
+                f"`redistribute` instead?"
+            )
+        return tensor
+
+    local_tensor = tensor
+
+    # distribute the tensor according to the placements.
+    for idx, placement in enumerate(placements):
+        if placement.is_shard():
+            placement = cast(Shard, placement)
+            output = placement._shard_tensor(local_tensor, device_mesh, idx)
+            # scatter call could not return a tensor with correct requires_grad
+            # field, as ProcessGroupNCCL refuse to take a tensor with requires_grad
+            # to do inplace update! So we manually set it here
+            output.requires_grad_(tensor.requires_grad)
+            local_tensor = output
+        elif placement.is_replicate():
+            local_tensor = local_tensor.contiguous()
+            device_mesh.broadcast(local_tensor, mesh_dim=idx)
+        else:
+            raise RuntimeError(
+                f"Trying to distribute tensor with unsupported placements {placement} on device mesh dimension {idx}!"
+            )
+
+    assert local_tensor is not None, "distributing a tensor should not be None"
+    return DTensor(
+        local_tensor,
+        device_mesh,
+        placements,
+        size=tensor.size(),
+        requires_grad=tensor.requires_grad,
+    )
+
+
+def distribute_module(
+    module: nn.Module,
+    device_mesh: Optional[DeviceMesh] = None,
+    partition_fn: Optional[Callable[[str, nn.Module, DeviceMesh], None]] = None,
+    input_fn: Optional[Callable[..., None]] = None,
+    output_fn: Optional[Callable[..., None]] = None,
+) -> nn.Module:
+    """
+    This function converts all module parameters to :class:`DTensor` parameters
+    according to the `partition_fn` specified. It could also control the input or
+    output of the module by specifying the `input_fn` and `output_fn`. (i.e. convert
+    the input to :class:`DTensor`, convert the output back to torch.Tensor)
+    Args:
+        module (:class:`nn.Module`): user module to be partitioned.
+        device_mesh (:class:`DeviceMesh`): the device mesh to place the module.
+        partition_fn (Callable): the function to partition parameters (i.e. shard certain
+            parameters across the `device_mesh`). If `partition_fn` is not specified,
+            by default we replicate all module parameters of `module` across the mesh.
+        input_fn (Callable): specify the input distribution, i.e. could control how the
+            input of the module is sharded. `input_fn` will be installed as a module
+            `forward_pre_hook` (pre forward hook).
+        output_fn (Callable): specify the output distribution, i.e. could control how the
+            output is sharded, or convert it back to torch.Tensor. output_fn will be
+            installed as a module `forward_hook` (post forward hook).
+
+    Returns:
+        A module that contains parameters/buffers that are all `DTensor`s.
+    """
+
+    if device_mesh is None:
+        device_mesh = get_global_device_mesh()
+
+    def replicate_module_params_buffers(m: nn.Module, mesh: DeviceMesh) -> None:
+        # This function loop over the immediate module parameters and
+        # buffers, replicate all non DTensor params/buffers to DTensor
+        # parameters/buffers, if they have not been partitioned in the
+        # partition_fn, we can't easily use `module._apply` here
+        # because we don't know what happened inside partition_fn as
+        # user could do anything, i.e. install hooks, and we want to
+        # preserve those.
+        full_replicate = [Replicate()] * mesh.ndim
+        for key, param in m._parameters.items():
+            if param is not None and not isinstance(param, DTensor):
+                m.register_parameter(
+                    key,
+                    nn.Parameter(
+                        distribute_tensor(param.data, mesh, full_replicate)
+                    ),
+                )
+        for key, buffer in m._buffers.items():
+            if buffer is not None and not isinstance(buffer, DTensor):
+                m._buffers[key] = distribute_tensor(
+                    buffer, mesh, full_replicate
+                )
+
+    if partition_fn is None:
+        # if partition_fn not specified, we by default replicate
+        # all module params/buffers
+        for name, submod in module.named_modules():
+            replicate_module_params_buffers(submod, device_mesh)
+    else:
+        # apply partition_fun to submodules
+        for name, submod in module.named_modules():
+            partition_fn(name, submod, device_mesh)
+            replicate_module_params_buffers(submod, device_mesh)
+
+    # register input_fn as module forward pre hook
+    if input_fn is not None:
+        module.register_forward_pre_hook(lambda _, inputs: input_fn(inputs, device_mesh))  # type: ignore[misc]
+    # register input_fn as module forward hook
+    if output_fn is not None:
+        module.register_forward_hook(
+            lambda mod, inputs, outputs: output_fn(outputs, device_mesh)  # type: ignore[misc]
+        )
+
+    return module
+
+
+# All public APIs from dtensor package
+__all__ = [
+    "DTensor",
+    "DeviceMesh",
+    "distribute_tensor",
+    "distribute_module",
+    "Shard",
+    "Replicate",
+]
diff --git a/torch/distributed/_tensor/api.py b/torch/distributed/_tensor/api.py
new file mode 100644
index 0000000000000..bf5514cc7d4e4
--- /dev/null
+++ b/torch/distributed/_tensor/api.py
@@ -0,0 +1,393 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import copy
+import warnings
+import torch
+from torch.utils._pytree import tree_flatten
+from typing import Dict, Callable, Optional, Sequence, cast
+from torch.distributed._tensor.device_mesh import get_global_device_mesh, DeviceMesh
+from torch.distributed._tensor.placement_types import (
+    Placement,
+    Shard,
+    Replicate,
+    _Partial,
+    DTensorSpec,
+)
+from torch.distributed._tensor.redistribute import Redistribute
+
+from torch.distributed._tensor.dispatch import operator_dispatch, OpSchema, OutputSharding
+
+# NOTE [Autograd interaction between torch.Tensor]
+#
+# The autograd functions defined below are being used by the public
+# facing APIs (i.e. from_local, to_local) to ensure our DTensor
+# works together with torch.Tensor within autograd engine. This
+# allows DistributedTensor to exist on part of the module hierarchy
+# and still able to calculate gradients across the torch.Tensor and
+# DistributedTensor boundary.
+# As an example, we have the a module that consists of submodules
+# A, B, and C, the execution flow would be like:
+#  input(torch.Tensor) -> Module A -> Module B -> Module C -> output (torch.Tensor)
+#
+# Suppose I only want to make Module B be a sharded module with
+# DistributedTensor params, we would need to make the folloing
+# flow to work:
+#
+#  input(torch.Tensor) -> Module A
+#       -> DTensor input -> Sharded Module B -> DTensor output
+#           -> output (torch.Tensor) -> Module C -> output (torch.Tensor)
+#
+# We need the conversion from Module A to DTensor input, which is
+# `from_local`, and conversion from DTensor output to output, which
+# is `to_local`, thus these two functions must be Autograd functions.
+#
+class ToTorchTensor(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input: "DTensor"):  # type: ignore[override]
+        ctx.dtensor_device_mesh = input.device_mesh
+        ctx.dtensor_placements = input.placements
+        ctx.dtensor_shape = input.shape
+        ctx.dtensor_requires_grad = input.requires_grad
+        return input._local_tensor.detach()
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):  # type: ignore[override]
+        device_mesh = ctx.dtensor_device_mesh
+        placements = ctx.dtensor_placements
+        return DTensor(
+            grad_output,
+            device_mesh,
+            placements,
+            size=ctx.dtensor_shape,
+            requires_grad=grad_output.requires_grad,
+        )
+
+
+class FromTorchTensor(torch.autograd.Function):
+    @staticmethod
+    def forward(  # type: ignore[override]
+        ctx,  # pyre-ignore[2]: Parameter must be annotated.
+        input: torch.Tensor,
+        device_mesh: DeviceMesh,
+        placements: Sequence[Placement],
+        run_check: bool,
+    ) -> "DTensor":
+        ctx.previous_placement = placements
+        ctx.previous_device_mesh = device_mesh
+
+        if run_check:
+            # TODO: by default check tensor metas across rank
+            # TODO: See if we need to make this run_check logic
+            # have a corresponding backward.
+            for idx, placement in enumerate(placements):
+                if placement.is_replicate():
+                    # broadcast rank 0 tensor to all ranks
+                    # only broadcast if run_check is True
+                    input = input.contiguous()
+                    device_mesh.broadcast(input, mesh_dim=idx)
+
+        # if it's not by default run_check, we assume user is certain that each
+        # rank has the same tensor shape, and we just use that to calculate the
+        # global shape
+        tensor_shape = list(input.size())
+        for idx, placement in enumerate(placements):
+            if placement.is_shard():
+                shard_dim = cast(Shard, placement).dim
+                local_dim_size = tensor_shape[shard_dim]
+                tensor_shape[shard_dim] = local_dim_size * device_mesh.size(idx)
+
+        dist_tensor = DTensor(
+            input,
+            device_mesh,
+            placements,
+            size=torch.Size(tensor_shape),
+            # requires_grad of the dist tensor depends on if input
+            # requires_grad or not
+            requires_grad=input.requires_grad,
+        )
+        return dist_tensor
+
+    @staticmethod
+    def backward(ctx, grad_output: "DTensor"):  # type: ignore[override]
+        previous_placement = ctx.previous_placement
+        previous_device_mesh = ctx.previous_device_mesh
+
+        # reshard to the placement when creating DistributedTensor
+        # so that the gradient layout matches, and we could return
+        # local gradients directly
+        if grad_output.placements != previous_placement:
+            # pyre-fixme[16]: `Redistribute` has no attribute `apply`.
+            grad_output = Redistribute.apply(
+                grad_output, previous_device_mesh, previous_placement
+            )
+
+        # TODO: backward is also differentiable now, add a test
+        # to test higher level gradients.
+        return grad_output.to_local(), None, None, None
+
+
+class DTensor(torch.Tensor):  # pyre-ignore[13]: pyre is bad at __new__
+    _local_tensor: torch.Tensor
+    _spec: DTensorSpec
+    __slots__ = ["_local_tensor", "_spec"]
+
+    # class attribute that handles operator placements propagation
+    # rules, keyed by aten op name, value is propagation func
+    _op_to_rules: Dict[str, Callable[[OpSchema], OutputSharding]] = {}
+
+    # class attribute that handles custom registered ops, all handled
+    # custom ops should appear in this table, and overriding the default
+    # operators that's been covered by _op_to_rules or fallbacks.
+    # (custom operator is the highest priority when dispatching).
+    # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
+    _custom_dispatch_ops: Dict[str, Callable] = {}
+
+    @staticmethod
+    def __new__(
+        cls,
+        local_tensor: torch.Tensor,
+        device_mesh: DeviceMesh,
+        placements: Sequence[Placement],
+        *,
+        size: torch.Size,
+        requires_grad: bool = False,
+    ) -> "DTensor":
+        """
+        Construct a DTensor from a local tensor, device mesh, and placement and
+        other tensor properties (i.e. shape, requires_grad, strides, etc).
+        Note: This is not a public API and it's only supposed to be used by the
+            operator implementations and internals. If you want to construct a
+            DTensor from a local tensor, consider using `DTensor.from_local`, if
+            you want to construct a DTensor from a "global" tensor (where you
+            already have tensor initialized and want to shard this tensor),
+            consider using `distribute_tensor`.
+        """
+        # recover tensor strides from local tensor strides and global size info
+        # in the case of sharding
+        # TODO: we should try to use meta tensor for shape and stride calculation
+        tensor_stride = list(local_tensor.stride())
+        local_size = list(local_tensor.size())
+        for placement in placements:
+            if isinstance(placement, Shard):
+                shard_dim = placement.dim
+                # recover tensor stride by modifying the stride that larger than
+                # the current stride on the shard_dim
+                for i in range(len(tensor_stride)):
+                    if (
+                        i != shard_dim
+                        and tensor_stride[i] >= tensor_stride[shard_dim]
+                    ):
+                        # rescale the stride by the shard size
+                        tensor_stride[i] = (
+                            tensor_stride[i] // local_size[shard_dim]
+                        ) * size[shard_dim]
+            elif not isinstance(placement, (Replicate, _Partial)):
+                raise RuntimeError(
+                    f"placement type {type(placement)} not supported!"
+                )
+
+        if requires_grad != local_tensor.requires_grad:
+            warnings.warn(
+                "To construct DTensor from torch.Tensor, it's recommended to "
+                "use local_tensor.detach() and make requires_grad consistent."
+            )
+
+        # new method instruct wrapper tensor from local_tensor and add
+        # placement spec, it does not do actual distribution
+        r = torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
+            cls,
+            size,
+            strides=tensor_stride,
+            dtype=local_tensor.dtype,
+            device=local_tensor.device,
+            layout=local_tensor.layout,
+            requires_grad=requires_grad,
+        )
+        # deepcopy and set spec
+        r._spec = DTensorSpec(
+            device_mesh, copy.deepcopy(placements), shape=r.size()
+        )
+        # detach local tensor from autograd graph as we initialize the
+        # distributed tensor and autograd will be working on top of
+        # the wrapper tensor directly instead of local torch.Tensor
+        r._local_tensor = local_tensor.detach()
+        return r
+
+    # pyre-fixme[14]: `__repr__` overrides method defined in `DTensor` inconsistently.
+    # pyre-fixme[3]: Return type must be annotated.
+    def __repr__(self):
+        # TODO: consider all_gather the local tensors for better debugging
+        return f"DTensor(local_tensor={self._local_tensor}, device_mesh={self._spec.mesh}, placements={self._spec.placements})"
+
+    @classmethod
+    # pyre-fixme[3]: Return type must be annotated.
+    # pyre-fixme[2]: Parameter must be annotated.
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        if kwargs is None:
+            kwargs = {}
+        # if we find nn.functional name in dispatch op, dispatch to it instead,
+        # this allow us to override some python level behaviors that wouldn't be
+        # possible in __torch_dispatch__ level.
+        if func.__name__ in DTensor._custom_dispatch_ops:
+            # dispatch to the same table as the name should be different between
+            # torch_function and torch_dispatch
+            return DTensor._custom_dispatch_ops[func.__name__](*args, **kwargs)
+        else:
+            # if not, just do nothing here
+            return super().__torch_function__(func, types, args, kwargs)
+
+    @classmethod
+    # pyre-fixme[3]: Return type must be annotated.
+    # pyre-fixme[2]: Parameter must be annotated.
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        # check that we are not getting mixed vanilla and Distributed tensors
+        arg_list, _ = tree_flatten(args)
+        for arg in arg_list:
+            if isinstance(arg, torch.Tensor) and not isinstance(arg, DTensor):
+                raise RuntimeError(
+                    f"{func}: got mixed distributed and non-distributed tensors."
+                )
+
+        if kwargs is None:
+            kwargs = {}
+
+        return operator_dispatch(
+            func,
+            args,
+            kwargs,
+            DTensor._op_to_rules,
+            DTensor._custom_dispatch_ops,
+        )
+
+    @classmethod
+    def from_local(
+        cls,
+        local_tensor: torch.Tensor,
+        device_mesh: Optional[DeviceMesh] = None,
+        placements: Optional[Sequence[Placement]] = None,
+        run_check: bool = True,
+    ) -> "DTensor":
+        """
+        Create a :class:`DTensor` from a local torch.Tensor on each rank
+        according to the `device_mesh` and `placements` specified.
+
+        Args:
+            local_tensor (torch.Tensor): local torch.Tensor on each rank.
+            device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to place the
+                tensor, if not specified, must be called under a DeviceMesh
+                context manager, default: None
+            placements (List[:class:`Placement`], optional): the placements that
+                describes how to place the local torch.Tensor on DeviceMesh, must
+                have the same number of elements as `device_mesh.ndim`. If not
+                specified, we will by default replicate the tensor across the
+                `device_mesh` from the first rank of each dimension of the `device_mesh`.
+            run_check (bool, optional): indicate whether to run check across ranks
+                to check meta information and data. if have :class:`Replicate` in
+                `placements`, the data on first rank of the device mesh dimension
+                will be broadcasted to other ranks.
+
+        Returns:
+            A :class:`DTensor` object
+
+        .. note:: `from_local` is differentiable, the `requires_grad` of the created
+            `DTensor` object will depend on if `local_tensor` requires_grad or not.
+        """
+        # if same shape/dtype, no need to run_check, if not, must allgather
+        # the metadatas to check the size/dtype across ranks
+        # There should be no data communication unless there's replication
+        # strategy, where we broadcast the replication from the first rank
+        # in the mesh dimension
+        device_mesh = (
+            get_global_device_mesh() if device_mesh is None else device_mesh
+        )
+        # convert the local tensor to desired device base on device mesh's device_type
+        local_tensor = local_tensor.to(device_mesh.device_type)
+
+        # set default placements to replicated if not specified
+        if placements is None:
+            placements = [Replicate() for _ in range(device_mesh.ndim)]
+
+        # `from_local` is differentiable, and the gradient of the dist tensor this function
+        # created should flow back the gradients to the local_tensor, so we call an autograd
+        # function to construct the dist tensor instead.
+        return FromTorchTensor.apply(  # pyre-ignore[16]: autograd func
+            local_tensor, device_mesh, placements, run_check
+        )
+
+    def to_local(self) -> torch.Tensor:
+        """
+        Get the local tensor of this DTensor on its current rank. For sharding it returns
+        a local shard of the logical tensor view, for replication it returns the replica on
+        its current rank.
+
+        Returns:
+            A :class:`torch.Tensor` object that represents the local tensor of its current rank.
+
+        .. note:: `to_local` is differentiable, the `requires_grad` of the local tensor returned
+            will depend on if the `DTensor` requires_grad or not.
+        """
+        return ToTorchTensor.apply(self)  # pyre-ignore[16]: autograd func
+
+    def redistribute(
+        self,
+        device_mesh: Optional[DeviceMesh] = None,
+        placements: Optional[Sequence[Placement]] = None,
+    ) -> "DTensor":
+        """
+        `redistribute` performs necessary collective operations that redistribute the current
+        DTensor from its current placements to a new placements, or from is current DeviceMesh
+        to a new DeviceMesh. i.e. we can turn a Sharded DTensor to a Replicated DTensor by
+        specifying a Replicate placement for each dimension of the DeviceMesh.
+
+        Args:
+            device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to place the
+                DTensor, if not specified, must be called under a DeviceMesh
+                context manager, default: None
+            placements (List[:class:`Placement`], optional): the new placements that
+                describes how to place the DTensor into the DeviceMesh, must
+                have the same number of elements as `device_mesh.ndim`.
+
+        Returns:
+            A :class:`DTensor` object
+
+        .. note:: `redistribute` is differentiable.
+        """
+        # This API perform necessary transformations and get
+        # a new DTensor with the new spec. i.e. for
+        # sharding it's a reshard behavior.
+        # Note that redistribute currently only supports out
+        # of place redistribution, i.e. it always create a new
+        # DTensor object and leave the original one unchanged.
+        device_mesh = (
+            get_global_device_mesh() if device_mesh is None else device_mesh
+        )
+        # raise error if new placements not specified
+        if placements is None:
+            raise RuntimeError("placements is needed for redistribute!")
+
+        for placement in placements:
+            if placement.is_partial():
+                raise RuntimeError(
+                    "Can not redistribute to _Partial, _Partial is for internal use only!"
+                )
+
+        # pyre-fixme[16]: `Redistribute` has no attribute `apply`.
+        return Redistribute.apply(self, device_mesh, placements)
+
+    @property
+    def device_mesh(self) -> DeviceMesh:
+        """
+        The :class:`DeviceMesh` attribute that associates with this DTensor object.
+
+        .. note:: device_mesh is a read-only property, it can not be set.
+        """
+        return self._spec.mesh
+
+    @property
+    def placements(self) -> Sequence[Placement]:
+        """
+        The placements attribute of this DTensor that describes the layout of this
+        DTensor on the its DeviceMesh.
+
+        .. note:: placements is a read-only property, it can not be set.
+        """
+        return self._spec.placements
diff --git a/torch/distributed/_tensor/dispatch.py b/torch/distributed/_tensor/dispatch.py
new file mode 100644
index 0000000000000..8c9e5a22efb83
--- /dev/null
+++ b/torch/distributed/_tensor/dispatch.py
@@ -0,0 +1,301 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from dataclasses import dataclass
+from typing import List, Callable, Dict, Tuple, Optional, cast
+
+import torch
+from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
+from torchgen.model import FunctionSchema, SchemaKind
+
+import torch.distributed._tensor.api as dtensor
+from torch.distributed._tensor.placement_types import DTensorSpec
+from torch.distributed._tensor.redistribute import redistribute_dtensor
+from torch.distributed._tensor.utils import (
+    ArgKwargsType,
+    OutputSpecType,
+    unwrap_local_tensor,
+    unwrap_schema,
+    wrap,
+)
+
+
+"""
+If _ENABLE_FALLBACK set to False, dispatch will fail when an op doesn't
+have a sharding rule registered.
+"""
+_ENABLE_FALLBACK = False
+
+
+"""
+Print information on ops input shape and sharding for debugging purposes.
+"""
+_DEBUG_VERBOSE = False
+
+
+@dataclass
+class OpSchema(object):
+    """
+    OpSchema is a data class that describes an operator input schemas, it
+    includes DTensor DTensorSpecs and non-tensor args/kwargs (positional order
+    preserved). It is mainly used by the dispatching logic below to run things like
+    sharding propagation.
+
+    Sharding propagation rules registered could utilize this data class and
+    do inplace update some fields (when necessary, i.e shape related ops) to make
+    sure the args/kwargs are legit before passing to the local tensor operator.
+    This is the main reason that we don't freeze this dataclass.
+
+    NOTE: greater access to the operator inputs comes with greater responsibility.
+    Here are some basic rules about what can be used and what can be changed.
+
+    Args:
+        func_schema: the function schema of the operator
+        args_schema: contains args except that the DTensor args have been replaced
+            with its DTensorSpec
+        kwargs_schema: contains kwargs except that the DTensor kwargs have been replaced
+            with its DTensorSpec
+
+    What can be used:
+        - every attribute within this class could be read to conduct
+          sharding propagation.
+    What can be changed:
+        - only the args_schema and kwargs_schema could be changed.
+        - every non-tensor args could be changed to accomodate for local tensor
+          operations (i.e. for ops like view/reshape/...)
+        - every "DTensorSpec" attribute inside `args_schema`, `kwargs_schema` and
+          `args_spec` SHOULD NOT be updated! DTensorSpec are read only and sharding
+          propagation shouldn't inplace update them, otherwise the input DTensor
+          placements will get implicitly changed and it's error-prone.
+    """
+
+    func_schema: FunctionSchema
+    args_schema: Tuple[object, ...]
+    kwargs_schema: Dict[str, object]
+    is_inplace: bool = False
+    is_out_variant: bool = False
+
+    def __post_init__(self) -> None:
+        schema_kind = self.func_schema.kind()
+        self.is_inplace = (
+            schema_kind
+            == SchemaKind.inplace  # pyre-ignore [16] pyre bad at enum
+        )
+        self.is_out_variant = (
+            schema_kind == SchemaKind.out  # pyre-ignore [16] pyre bad at enum
+        )
+
+    @property
+    def args_spec(self) -> Tuple[DTensorSpec, ...]:
+        """
+        args_spec: Tuple[DTensorSpec, ...]: contains a clean list of args spec list
+            with NO non-DTensor positional arguments (i.e. int/float/tuple, etc)
+            mainly used by sharding propagation to propagate the output spec
+        """
+        # filter out non-relavant values from args schema to get a clean spec list
+        # this would mainly be used by sharding propagation rules
+        return tuple(
+            item for item in self.args_schema if isinstance(item, DTensorSpec)
+        )
+
+    def __repr__(self) -> str:
+        return (
+            f"OpSchema(func_schema={self.func_schema},"
+            f" args_schema={self.args_schema},"
+            f" kwargs_schema={self.kwargs_schema})"
+        )
+
+
+@dataclass
+class OutputSharding:
+    """
+    OutputSharding is a data class that is used by the sharding propagation
+    rules, it could set the output_spec upon successful propagation, and if
+    it failed, output_spec would become None and sharding propagation rules
+    could give a list of suggestions for inputs to reshard.
+
+    NOTE: the schema_suggestion generated by sharding propagation should be
+    exactly the same as the operator OpSchema, except the DTensor DTensorSpecs
+    """
+
+    output_spec: OutputSpecType
+    schema_suggestions: Optional[List[OpSchema]] = None
+    failed_reason: Optional[str] = None
+
+
+def pack_args_kwargs_with_local_tensor(
+    args: ArgKwargsType,
+    args_schema: ArgKwargsType,
+    redistribute_with_schema: bool = False,
+) -> ArgKwargsType:
+    flatten_args, args_tree_spec = tree_flatten(args)
+    flatten_args_schema, _ = tree_flatten(args_schema)
+
+    for i, arg in enumerate(flatten_args):
+        if isinstance(arg, dtensor.DTensor):
+            if redistribute_with_schema:
+                target_spec = flatten_args_schema[i]
+                arg = redistribute_dtensor(
+                    arg, target_spec.mesh, target_spec.placements
+                )
+
+            # reuse the schema list and update it with local tensor
+            flatten_args_schema[i] = arg._local_tensor
+
+    return tree_unflatten(flatten_args_schema, args_tree_spec)
+
+
+def _reshape_alias(
+    x: torch.Tensor, shape: Tuple[int, ...], strides: Tuple[int, ...]
+) -> torch.Tensor:
+    return torch.ops.aten.view(x, shape)
+
+
+_CURRENT_DECOMPOSITION_TABLE: Dict[
+    Callable[..., object], Callable[..., object]
+] = {torch.ops.aten._reshape_alias.default: _reshape_alias}
+
+
+def propagate_input_sharding(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+    op_to_rules: Dict[str, Callable[[OpSchema], OutputSharding]],
+) -> Tuple[OpSchema, bool, Optional[OutputSharding]]:
+    # parse the operator schema
+    func_schema = FunctionSchema.parse(str(op_call._schema))
+    # unwrap the args/kwargs schema
+    args_schema = tree_map(unwrap_schema, args)
+    kwargs_schema = tree_map(unwrap_schema, kwargs)
+
+    op_schema = OpSchema(func_schema, args_schema, kwargs_schema)
+
+    if _DEBUG_VERBOSE and torch.distributed.get_rank() == 0:
+        print(f"{op_call}({op_schema})")
+        local_shapes = tree_map(
+            lambda t: t.to_local().shape
+            if isinstance(t, dtensor.DTensor)
+            else None,
+            args,
+        )
+        print(f"    local shapes: {local_shapes}")
+
+    op_key = str(op_call)
+    sharding_prop_func = op_to_rules.get(op_key, None)
+
+    if sharding_prop_func is None:
+        # step 1. If there's not even one sharding rule
+        # implemented for the operator, we fall back to
+        # local tensor compute, this is wront currently
+        # we will change the behavior to reshard to full
+        # replicate and do the computatation
+        if not _ENABLE_FALLBACK:
+            raise NotImplementedError(
+                f"Operator {op_key} does not have a DistributedTensor rule registered."
+            )
+        else:
+            return op_schema, False, None
+
+    # step 2. there's sharding propagation rule, run
+    # sharding propagation to get output sharding
+    try:
+        output_sharding = sharding_prop_func(op_schema)
+    except Exception as e:
+        raise RuntimeError(
+            f"Sharding propagation failed on op {op_key}.\n"
+            f"Input schema: {op_schema}.\n"
+            f"Error: {e}"
+        ) from e
+
+    # step 3. if can't get output_spec from sharding
+    # propagation (i.e. no rules apply for input
+    # placements), we do auto redistribute on inputs
+    # to get an eligble input, which we will pick a
+    # target schema base on the redistribute cost
+    # TODO: implement full auto distribute with a
+    # simple cost estimation model
+    if output_sharding.output_spec is None:
+        # do auto distributed/boxing here
+        if output_sharding.schema_suggestions is not None:
+            # pick the first suggestion for now,
+            target_schema = output_sharding.schema_suggestions[0]
+            # run sharding propagation again with target schema
+            output_sharding = sharding_prop_func(target_schema)
+
+            return target_schema, True, output_sharding
+
+        else:
+            raise RuntimeError(
+                f"Sharding propagation failed on op {op_key}!"
+                f"Input schema: {op_schema}."
+                f"Failed reason: {output_sharding.failed_reason}"
+            )
+    else:
+        return op_schema, False, output_sharding
+
+
+def operator_dispatch(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+    op_to_rules: Dict[str, Callable[[OpSchema], OutputSharding]],
+    custom_dispatch_ops: Dict[str, Callable[..., object]],
+) -> object:
+    # first we need to lift some private aten aliases to public calls
+    if op_call in _CURRENT_DECOMPOSITION_TABLE:
+        return _CURRENT_DECOMPOSITION_TABLE[op_call](*args, **kwargs)
+
+    # STEP 0. See if threre're user defined custom aten operator
+    # implementations. Custom operators take the highest priority
+    if str(op_call) in custom_dispatch_ops:
+        # dispatch to user defined custom distributed tensor ops
+        return custom_dispatch_ops[str(op_call)](*args, **kwargs)
+
+    target_schema, redistribute, output_sharding = propagate_input_sharding(
+        op_call, args, kwargs, op_to_rules
+    )
+
+    if output_sharding is None:
+        # default to local tensor ops, this is wrong
+        # but we use it now to enable more tensor point-wise ops
+        # TODO: delete this and use replicate (all_gather) as
+        # the default fallback.
+        tensor_args = tree_map(unwrap_local_tensor, args)
+        tensor_kwargs = tree_map(unwrap_local_tensor, kwargs)
+        local_results = op_call(*tensor_args, **tensor_kwargs)
+        return wrap(local_results, target_schema.args_spec[0])
+
+    local_tensor_args = pack_args_kwargs_with_local_tensor(
+        args,
+        target_schema.args_schema,
+        redistribute_with_schema=redistribute,
+    )
+    local_tensor_kwargs = pack_args_kwargs_with_local_tensor(
+        kwargs,
+        target_schema.kwargs_schema,
+        redistribute_with_schema=redistribute,
+    )
+
+    # run local op computation with potentially modified args/kwargs
+    local_tensor_args = cast(Tuple[object, ...], local_tensor_args)
+    local_tensor_kwargs = cast(Dict[str, object], local_tensor_kwargs)
+    local_results = op_call(*local_tensor_args, **local_tensor_kwargs)
+
+    if target_schema.is_inplace:
+        # inplace op should return self instead of re-wrapping
+        self = cast(dtensor.DTensor, args[0])
+        self._spec = cast(DTensorSpec, output_sharding.output_spec)
+        return self
+    elif target_schema.is_out_variant:
+        # out variant could possibly have multiple out args (i.e. lu_unpack.out)
+        output_specs = (
+            (output_sharding.output_spec,)
+            if not isinstance(output_sharding.output_spec, tuple)
+            else output_sharding.output_spec
+        )
+        out_dts = []
+        for i, out in enumerate(target_schema.func_schema.arguments.out):
+            out_dt = cast(dtensor.DTensor, kwargs[out.name])
+            out_dt._spec = cast(DTensorSpec, output_specs[i])
+            out_dts.append(out_dt)
+        return tuple(out_dts) if len(out_dts) > 1 else out_dts[0]
+    else:
+        return wrap(local_results, output_sharding.output_spec)
diff --git a/torch/distributed/_tensor/redistribute.py b/torch/distributed/_tensor/redistribute.py
new file mode 100644
index 0000000000000..ab36cd4089036
--- /dev/null
+++ b/torch/distributed/_tensor/redistribute.py
@@ -0,0 +1,236 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import Dict, List, Sequence, Tuple, cast
+
+import torch
+import torch.distributed._tensor.api as dtensor
+from torch.distributed._tensor.placement_types import Placement, _Partial, Shard, Replicate
+from torch.distributed._tensor.device_mesh import DeviceMesh
+
+
+_PlacementItem = Tuple[int, Tuple[Placement, Placement]]
+
+
+def _replicate_then_shard(val: _PlacementItem) -> int:
+    """
+    Replicate from inner to outer dimension.
+    Shard from outer to inner dimension.
+    """
+    i, (current, target) = val
+    if (target.is_replicate() or target.is_partial()) and current.is_shard():
+        return -i
+    elif (current.is_replicate() or current.is_partial()) and target.is_shard():
+        return i
+    else:
+        return 0
+
+
+def _decompose_reshard(val: List[_PlacementItem]) -> List[_PlacementItem]:
+    """
+    Decompose Si -> Sj into Si -> R -> Sj
+    There's 2 ways a shardings can differ within a mesh dimension:
+      1) sharding on different tensor dimensions, e.g. Shard(0) -> Shard(1)
+      2) different sub-shards of a repeated shard ("mis-aligned sharding")
+          (Shard(0), Shard(0)) -> (Replicate(), Shard(0))
+          Here the Shard(0) -> Shard(0) for mesh dimension 2 is actually
+          a reshard, because in the first case it's a sub-sharding of an already tensor dimension 0,
+          and in the second case, it's the first sharding on tensor dimesnion 0.
+    """
+    # detect mis-aligned repeated shardings
+    from collections import defaultdict
+
+    repeat_dim_current: Dict[int, int] = defaultdict(int)
+    repeat_dim_target: Dict[int, int] = defaultdict(int)
+
+    output: List[_PlacementItem] = []
+
+    for i, (current, target) in val:
+        # detect mis-aligned sharding
+        if current.is_shard():
+            repeat_dim_current[cast(Shard, current).dim] += 1
+        if target.is_shard():
+            repeat_dim_target[cast(Shard, target).dim] += 1
+        if (
+            isinstance(current, Shard)
+            and isinstance(target, Shard)
+            and (
+                current.dim != target.dim
+                or repeat_dim_current[current.dim]
+                != repeat_dim_target[target.dim]
+            )
+        ):
+            # decompose Shard(i) -> Shard(j) into Shard(i) -> Replicate() -> Shard(j)
+            output.append((i, (current, Replicate())))
+            output.append((i, (Replicate(), target)))
+        else:
+            output.append((i, (current, target)))
+
+    return output
+
+
+# Intentionally expose this API to trace ops on local tensors
+def _redistribute_with_local_tensor(
+    local_tensor: torch.Tensor,
+    size: torch.Size,
+    device_mesh: DeviceMesh,
+    current_placements: Sequence[Placement],
+    target_placements: Sequence[Placement],
+) -> torch.Tensor:
+    new_local_tensor = None
+
+    sorted_placements = list(
+        enumerate(zip(current_placements, target_placements))
+    )
+    sorted_placements = _decompose_reshard(sorted_placements)
+    sorted_placements.sort(key=_replicate_then_shard)
+
+    for i, (current, target) in sorted_placements:
+        my_coordinate = device_mesh.get_coordinate_on_dim(i)
+        num_chunks = device_mesh.size(dim=i)
+        # TODO: what should happen if rank is not in the mesh?
+        # see issue https://github.com/pytorch/tau/pull/492
+        assert (
+            my_coordinate is not None
+        ), "Rank if not part of mesh"  # TODO: figure out behavior here
+
+        if current == target:
+            # short cut, just use the original local tensor
+            new_local_tensor = local_tensor
+            continue
+
+        if target.is_replicate():
+            # Case 1: target is Replicate
+            if current.is_partial():
+                partial_spec = cast(_Partial, current)
+                new_local_tensor = partial_spec._to_replicate(
+                    local_tensor, device_mesh, i
+                )
+            elif current.is_shard():
+                current_placement = cast(Shard, current)
+                new_local_tensor = current_placement._to_replicate_tensor(
+                    local_tensor, size, device_mesh, i
+                )
+            else:
+                raise RuntimeError(
+                    f"redistribute from {current_placements} to {target_placements} not supported yet"
+                )
+        elif target.is_shard():
+            # Case 2: target is Shard
+            target_placement = cast(Shard, target)
+            if current.is_partial():
+                partial_spec = cast(_Partial, current)
+                new_local_tensor = partial_spec._to_shard(
+                    local_tensor, device_mesh, i, target_placement
+                )
+            elif current.is_replicate():
+                # split the tensor and return the corresponding cloned local shard
+                shards, _ = target_placement._split_tensor(
+                    local_tensor,
+                    num_chunks,
+                    with_padding=False,
+                    contiguous=False,
+                )
+                new_local_tensor = shards[my_coordinate].clone()
+            else:
+                # NOTE: this case shouldn't hit _decompose_sharding, decompose sharding should
+                # decompose Shard(0) -> Shard(1) into Shard(0) -> Replicate -> Shard(1)
+                assert (
+                    current.is_shard()
+                ), f"Current placement should be shard but found {current}"
+                shard_spec = cast(Shard, current)
+                if shard_spec.dim != target_placement.dim:
+                    # TODO: enable this with all_to_all
+                    raise NotImplementedError(
+                        "Changing sharding dim is not supported yet!"
+                    )
+
+        elif target.is_partial():
+            if current.is_replicate():
+                # For replicate -> partial, we zero out all other ranks of the current mesh dim
+                # and leave only 1 rank have the data, to perform a "zero cost" reshard.
+                if my_coordinate is not None and my_coordinate != 0:
+                    new_local_tensor = local_tensor.zero_()
+                else:
+                    new_local_tensor = local_tensor
+            else:
+                raise RuntimeError(
+                    f"redistribute from {current_placements} to {target_placements} not supported yet"
+                )
+
+        assert new_local_tensor is not None
+        local_tensor = new_local_tensor
+
+    assert new_local_tensor is not None, "redistribute failed!"
+
+    return new_local_tensor
+
+
+def redistribute_dtensor(
+    input: "dtensor.DTensor",
+    device_mesh: DeviceMesh,
+    placements: Sequence[Placement],
+) -> "dtensor.DTensor":
+    if input.device_mesh != device_mesh:
+        # TODO: alltoall reshuffling to change device_mesh if they are not the same
+        raise NotImplementedError("Cross device mesh comm not supported yet!")
+
+    local_tensor = input._local_tensor
+    new_local_tensor = _redistribute_with_local_tensor(
+        local_tensor,
+        input.size(),
+        device_mesh,
+        input.placements,
+        placements,
+    )
+
+    return dtensor.DTensor(
+        new_local_tensor,
+        device_mesh,
+        placements,
+        size=input.size(),
+        requires_grad=local_tensor.requires_grad,
+    )
+
+
+class Redistribute(torch.autograd.Function):
+    @staticmethod
+    def forward(  # type: ignore[override]
+        # pyre-fixme[2]: Parameter must be annotated.
+        ctx,
+        input: "dtensor.DTensor",
+        device_mesh: DeviceMesh,
+        placements: List[Placement],
+    ):
+        ctx.previous_placement = input.placements
+        ctx.previous_device_mesh = input.device_mesh
+        return redistribute_dtensor(input, device_mesh, placements)
+
+    @staticmethod
+    def backward(ctx, grad_output: "dtensor.DTensor"):  # type: ignore[override]
+        previous_placement = ctx.previous_placement
+        previous_device_mesh = ctx.previous_device_mesh
+        # When we run backward pass of redistribute (i.e. manual redistribute from
+        # user code instead of torch_dispatch), we scan first and see if we need
+        # to change the target placement for one special case:
+        #   replicate -> partial.
+        # In this case we keep the grad as replicate, this is because we don't
+        # want to convert the replicated gradients back to partial, although
+        # that's logically conform with the same layout, converting the gradients
+        # back to partial is acutally useless as you would have to do reduce later
+        # which would be more expensive than keeping it replicate! For this reason,
+        # we keep the replicate grad here.
+        # TODO: see if this make sense for all cases.
+        target_placements: List[Placement] = []
+        for current, target in zip(grad_output.placements, previous_placement):
+            if current.is_replicate() and target.is_partial():
+                # keep target placement to replicate instead of partial in this case
+                target_placements.append(current)
+            else:
+                target_placements.append(target)
+
+        return (
+            redistribute_dtensor(
+                grad_output, previous_device_mesh, target_placements
+            ),
+            None,
+            None,
+        )
diff --git a/torch/distributed/_tensor/utils.py b/torch/distributed/_tensor/utils.py
new file mode 100644
index 0000000000000..bb56f488d81f9
--- /dev/null
+++ b/torch/distributed/_tensor/utils.py
@@ -0,0 +1,53 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+import torch
+from typing import Union, Dict, Tuple, Optional, Sequence
+
+import torch.distributed._tensor.api as dtensor
+from torch.distributed._tensor.placement_types import DTensorSpec
+
+ArgKwargsType = Union[Tuple[object, ...], Dict[str, object]]
+# ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so output type sould
+# be the same set of possiblities.
+OutputSpecType = Optional[Union[DTensorSpec, Sequence[DTensorSpec]]]
+
+
+def unwrap_local_tensor(e: "dtensor.DTensor") -> torch.Tensor:
+    return e._local_tensor if isinstance(e, dtensor.DTensor) else e
+
+
+def unwrap_schema(e: object) -> object:
+    return e._spec if isinstance(e, dtensor.DTensor) else e
+
+
+def wrap(res: object, spec: OutputSpecType) -> object:
+    if isinstance(res, torch.Tensor):
+        assert spec is not None and isinstance(
+            spec, DTensorSpec
+        ), f"output spec does not match with output! Expected DTensorSpec, got {spec}."
+        return dtensor.DTensor(
+            res,
+            spec.mesh,
+            spec.placements,
+            size=spec.shape,
+            requires_grad=res.requires_grad,
+        )
+    elif isinstance(res, list):
+        assert spec is not None and isinstance(
+            spec, list
+        ), f"output spec does not match with output! Expected list, got {spec}."
+        return list(
+            dtensor.DTensor(e, s.mesh, s.placements, size=s.shape)
+            for e, s in zip(res, spec)
+        )
+    elif isinstance(res, tuple):
+        assert spec is not None and isinstance(
+            spec, tuple
+        ), f"output spec does not match with output! Expected tuple, got {spec}"
+        return tuple(
+            dtensor.DTensor(e, s.mesh, s.placements, size=s.shape)
+            for e, s in zip(res, spec)
+        )
+    else:
+        # if the res contains only non tensor values, we simply return it without rewrapping
+        return res

From 1bdc8be79c5294024f9efa80e8d887d2dd6b54fc Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 15 Nov 2022 08:04:38 +0000
Subject: [PATCH 0948/1922] [dtensor] PART 3: move most DTensor ops to core
 distributed (#88177)

This PR moves most DTensor ops to torch.distributed._tensor. We will
add all tests in the following PRs.

part of https://github.com/pytorch/pytorch/issues/88838
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88177
Approved by: https://github.com/fduwjj
---
 torch/distributed/_tensor/ops/__init__.py     |   7 +
 torch/distributed/_tensor/ops/common_rules.py | 376 ++++++++++++++
 torch/distributed/_tensor/ops/math_ops.py     | 141 +++++
 torch/distributed/_tensor/ops/matrix_ops.py   | 129 +++++
 .../distributed/_tensor/ops/pointwise_ops.py  | 396 ++++++++++++++
 torch/distributed/_tensor/ops/tensor_ops.py   | 481 ++++++++++++++++++
 .../_tensor/ops/tp_sharding_ops.py            |  55 ++
 torch/distributed/_tensor/ops/utils.py        |  81 +++
 8 files changed, 1666 insertions(+)
 create mode 100644 torch/distributed/_tensor/ops/__init__.py
 create mode 100644 torch/distributed/_tensor/ops/common_rules.py
 create mode 100644 torch/distributed/_tensor/ops/math_ops.py
 create mode 100644 torch/distributed/_tensor/ops/matrix_ops.py
 create mode 100644 torch/distributed/_tensor/ops/pointwise_ops.py
 create mode 100644 torch/distributed/_tensor/ops/tensor_ops.py
 create mode 100644 torch/distributed/_tensor/ops/tp_sharding_ops.py
 create mode 100644 torch/distributed/_tensor/ops/utils.py

diff --git a/torch/distributed/_tensor/ops/__init__.py b/torch/distributed/_tensor/ops/__init__.py
new file mode 100644
index 0000000000000..5012768ee0519
--- /dev/null
+++ b/torch/distributed/_tensor/ops/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from .matrix_ops import *  # noqa: F403
+from .math_ops import *  # noqa: F403
+from .tensor_ops import *  # noqa: F403
+from .tp_sharding_ops import *  # noqa: F403
+from .pointwise_ops import *  # noqa: F403
+# from .view_ops import *  # noqa: F403
diff --git a/torch/distributed/_tensor/ops/common_rules.py b/torch/distributed/_tensor/ops/common_rules.py
new file mode 100644
index 0000000000000..29925c8a52c73
--- /dev/null
+++ b/torch/distributed/_tensor/ops/common_rules.py
@@ -0,0 +1,376 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import torch
+from typing import List, Sequence, Dict, Tuple, Optional, cast
+from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+from torch.distributed._tensor.placement_types import DTensorSpec
+from torch.distributed._tensor.ops.utils import prod
+
+
+def _replace_char_in_str(string: str, new_char: str, idx: int) -> str:
+    return string[:idx] + new_char + string[idx + 1 :]
+
+
+def _inplace_rewrap_schema_suggestion(
+    suggestion: OpSchema, input_schema: OpSchema
+) -> None:
+    suggestion_args_spec = suggestion.args_spec
+    new_arg_schema: List[object] = []
+    idx_of_args_spec = 0
+    for arg in input_schema.args_schema:
+        if isinstance(arg, DTensorSpec):
+            new_arg_schema.append(suggestion_args_spec[idx_of_args_spec])
+            idx_of_args_spec += 1
+        else:
+            new_arg_schema.append(arg)
+    suggestion.args_schema = tuple(new_arg_schema)
+    suggestion.kwargs_schema = input_schema.kwargs_schema
+
+
+def _gen_reshard_suggestions(
+    op_schema: OpSchema,
+    input_dims: List[str],
+    input_specs: Tuple[DTensorSpec, ...],
+    dim_to_sharding: Dict[str, int],
+    pending_sum: List[int],
+) -> OutputSharding:
+    suggested_arg_specs: List[DTensorSpec] = []
+    for input_dim, input_spec in zip(input_dims, input_specs):
+        dim_map = [dim_to_sharding[dim] for dim in input_dim]
+        suggested_arg_specs.append(
+            DTensorSpec.from_dim_map(
+                mesh=input_spec.mesh,
+                dim_map=dim_map,
+                sums=pending_sum,
+                shape=input_spec.shape,
+            )
+        )
+    suggested_schema = OpSchema(
+        op_schema.func_schema, tuple(suggested_arg_specs), {}
+    )
+    _inplace_rewrap_schema_suggestion(suggested_schema, op_schema)
+    return OutputSharding(
+        None,
+        schema_suggestions=[suggested_schema],
+        failed_reason="Input placements op sharding propagation failed, need to reshard!",
+    )
+
+
+def einop_rule(
+    equation: str,
+    op_schema: OpSchema,
+    *,
+    linearity: bool = False,
+    enforce_sharding: Optional[Dict[str, int]] = None,
+) -> OutputSharding:
+    """
+    Propagate the sharding of inputs to output for ops whose data
+    moves according to einsum notation. This is mostly borrowed
+    from @zdevito's sharding simulator. Examples:
+        mk,kn->mn - einsum
+        ij,ij->ij - addition
+        ij,j->ij - broadcasted addition
+        ij->i - reduction
+    Other ops could use this propagation algorithm when applied, note
+    that einsum propagation only deal with list of specs (DTensor specs)
+    as it only works on list of tensors!
+
+    linearity in einop_rule means that the calling op `f` follows this rule:
+        f(a + b) = f(a) + f(b)
+
+    In this case we can propagate the partial sum, note that linearity in einop
+    only applies to partial sum, not other operations like min/max (which are
+    associative but not linear).
+    """
+    # parse einop equation and extract arg specs
+    inputs, outputs = equation.split("->")
+    input_dims, output_dims = inputs.split(","), outputs.split(",")
+    input_specs = op_schema.args_spec
+    # NOTE: only support single output unless needed in future
+    output_dim = output_dims[0]
+
+    dim_to_sharding: Dict[str, int] = {}
+    dim_to_size: Dict[str, int] = {}
+    # record pending sum, key is mesh dimension, value is pending sum
+    # counter across input specs
+    pending_sums_counter: Dict[int, int] = {}
+    seen_shardings: Dict[int, str] = {}
+    needs_reshard = False
+
+    def merge_sharding(dim: str, a: int, b: int) -> int:
+        # merge the sharding of inputs if it's able to merge, i.e. we can merge
+        # replicate and shard to shard, but this will trigger an reshard operation
+        if a != b:
+            if a == -1 or b == -1:
+                # reshard the replicate to match the sharded one
+                nonlocal needs_reshard
+                needs_reshard = True
+                return a if a != -1 else b
+            else:
+                # TODO: further merge the sharding properly (i.e. reshard one input to replicate)
+                raise RuntimeError(
+                    f"{equation}: dim {dim} sharded two different ways: {a} and {b}"
+                )
+        else:
+            return a
+
+    for input_dim, input_spec in zip(input_dims, input_specs):
+        # deal with partial sums
+        input_sums = input_spec.sums
+        for sum_dim in input_sums:
+            if sum_dim not in pending_sums_counter:
+                seen_shardings[sum_dim] = "+"
+            # update pending sum counter for pending sum mesh
+            # dimension with the occurance from each input
+            pending_sums_counter[sum_dim] = (
+                pending_sums_counter.get(sum_dim, 0) + 1
+            )
+
+        for idx, (dim, mesh_dim) in enumerate(
+            zip(input_dim, input_spec.dim_map)
+        ):
+            if enforce_sharding and dim in enforce_sharding:
+                if enforce_sharding[dim] != mesh_dim:
+                    needs_reshard = True
+                dim_to_sharding[dim] = enforce_sharding[dim]
+                dim_to_size[dim] = input_spec.shape[idx]
+            elif dim not in dim_to_sharding:
+                dim_to_sharding[dim] = mesh_dim
+                dim_to_size[dim] = input_spec.shape[idx]
+            else:
+                dim_to_sharding[dim] = merge_sharding(
+                    dim, dim_to_sharding[dim], mesh_dim
+                )
+                assert dim_to_size[dim] == input_spec.shape[idx]
+
+            # after merging sharding, we check if there're multiple
+            # sharding on the same mesh dim.
+            merged_sharding_for_dim = dim_to_sharding[dim]
+            if merged_sharding_for_dim != -1:
+                if (
+                    merged_sharding_for_dim in seen_shardings
+                    and dim != seen_shardings[merged_sharding_for_dim]
+                ):
+                    needs_reshard = True
+                    seen_shardings[merged_sharding_for_dim] += dim
+                else:
+                    seen_shardings[merged_sharding_for_dim] = dim
+
+    if pending_sums_counter and not linearity:
+        # return reshard suggestion with no pending sum, because we already properly
+        # merge the sharding, this reshard suggestion is legit to use
+        return _gen_reshard_suggestions(
+            op_schema, input_dims, input_specs, dim_to_sharding, []
+        )
+    else:
+        # It's a op that support linearity, but not all input arguments are partial
+        # we fail the sharding propagation with suggestion to make all inputs be
+        # partial on the corresponding mesh dim (all inputs should be partial for
+        # the mesh dims in order to execute locally and delay the sum reduction)
+        for value in pending_sums_counter.values():
+            if value != len(input_specs):
+                needs_reshard = True
+
+    for mesh_dim, dims in seen_shardings.items():
+        if len(dims) > 1:
+            # we found different input dims are being sharded on the same mesh dim
+            # in order to perform local op computation, we need to reshard inputs
+            # base on some simple heuristics, now we simply pick the one with least comm
+            # volume. (i.e. the input with least size)
+            # TODO: consider a more advanced heuristic to pick the best sharding
+            costs = []
+            for d in dims:
+                cost = 0
+                for input_dim, input_spec in zip(input_dims, input_specs):
+                    if (
+                        d in input_dim
+                        and input_spec.dim_map[input_dim.index(d)] == mesh_dim
+                    ):
+                        cost += prod(
+                            input_spec.local_shape
+                        ) * input_spec.mesh.size(mesh_dim)
+                costs.append(cost)
+            d_to_keep_sharding = dims[costs.index(max(costs))]
+            for d in dims:
+                # update dim_to_sharding to keep the sharding of the dim with
+                # highest comm and make the rest of the dims to replicate
+                if d != d_to_keep_sharding:
+                    dim_to_sharding[d] = -1
+
+    pending_sums = list(pending_sums_counter.keys())
+    if needs_reshard:
+        return _gen_reshard_suggestions(
+            op_schema, input_dims, input_specs, dim_to_sharding, pending_sums
+        )
+
+    # generate output pending sum if a dim is sharded, and it appears in input
+    # but not output
+    for dim, shard_on_mesh in dim_to_sharding.items():
+        if dim not in output_dims[0] and shard_on_mesh != -1:
+            pending_sums.append(shard_on_mesh)
+
+    # if no need to reshard, we directly generate the output sharding
+    output_dim_map = []
+    output_shape = []
+    for dim in output_dim:
+        if dim == "1":
+            # find output dim that is a singleton dimension, mark sharding and shape
+            output_dim_map.append(-1)
+            output_shape.append(1)
+        else:
+            output_dim_map.append(dim_to_sharding[dim])
+            output_shape.append(dim_to_size[dim])
+
+    return OutputSharding(
+        DTensorSpec.from_dim_map(
+            input_specs[0].mesh,
+            output_dim_map,
+            pending_sums,
+            shape=torch.Size(output_shape),
+        )
+    )
+
+
+def pointwise_rule(
+    op_schema: OpSchema, linearity: bool = False
+) -> OutputSharding:
+    """
+    Propagate the sharding for pointwise operations. Examples:
+        ij,ij->ij - addition/mul
+        ij,j->ij - broadcasted addition
+    """
+    alphabet = "abcdefghijklmnopqrstuvwxyz"
+    # find the max_dim first in case we need to broadcasting
+    input_specs = op_schema.args_spec
+    max_dim = max(input.ndim for input in input_specs)
+    dimchars = []
+    singleton_counter: List[int] = [0] * max_dim
+    for input in input_specs:
+        start_dim = max_dim - input.ndim
+        p = alphabet[start_dim:max_dim]
+        # handle the "broadcasting to a common shape case"
+        # see https://pytorch.org/docs/stable/notes/broadcasting.html
+        # If any of the dimensions is singleton dimension (i.e. 1).
+        # we mark the dim char as a special "1" to distinguish with
+        # the non-singleton dimension, so that sharding propagation
+        # should just ignore the singleton dimension.
+        if len(input_specs) > 1:
+            for i in range(max_dim):
+                if i < start_dim:
+                    # treat the leading miss dim chars as singleton
+                    singleton_counter[i] += 1
+                elif input.shape[i - start_dim] == 1:
+                    # mark singleton dim char as a special "1" in einop rule
+                    singleton_counter[i] += 1
+                    p = _replace_char_in_str(p, "1", (i - start_dim))
+
+        dimchars.append(p)
+    out_dimchars = alphabet[:max_dim]
+    # check if we replace the all inputs dim char with singleton dimension,
+    # if we replace all inputs, we also need to replace the output dimension.
+    for output_dim_idx in range(len(out_dimchars)):
+        out_dimchar = out_dimchars[output_dim_idx]
+        if singleton_counter[output_dim_idx] == len(input_specs):
+            out_dimchars = _replace_char_in_str(
+                out_dimchars, "1", output_dim_idx
+            )
+
+    fmt = f"{','.join(p for p in dimchars)}->{out_dimchars}"
+
+    enforce_sharding: Dict[str, int] = {}
+    if op_schema.is_inplace:
+        # inplace op should keep the input sharding it writes to
+        for out_dimchar, mesh_dim in zip(out_dimchars, input_specs[0].dim_map):
+            enforce_sharding[out_dimchar] = mesh_dim
+    elif op_schema.is_out_variant:
+        out_spec = cast(DTensorSpec, op_schema.kwargs_schema["out"])
+        for out_dimchar, mesh_dim in zip(out_dimchars, out_spec.dim_map):
+            enforce_sharding[out_dimchar] = mesh_dim
+
+    return einop_rule(
+        fmt,
+        op_schema,
+        linearity=linearity,
+        enforce_sharding=enforce_sharding,
+    )
+
+
+def linear_pointwise_rule(op_schema: OpSchema) -> OutputSharding:
+    """
+    Linear pointwise operators can propagate pending reductions.
+    For example, c = add(a, b); if a is pending sum, then c will be
+    pending sum as well without any communication overhead.
+    """
+    return pointwise_rule(op_schema, linearity=True)
+
+
+def reduction_rule(
+    op_schema: OpSchema,
+    *,
+    dims: Optional[Sequence[int]] = None,
+    keep_dim: bool = False,
+    reduction_linear: bool = False,
+) -> OutputSharding:
+    """
+    Propagate the sharding for reduction operations. Examples:
+        ij->i - sum on dim
+
+    reduction_linear means that the reduction `f` follows this rule:
+        f([f(a), f(b)]) = f([a, b])
+
+    reduction linear should be super set of linearity.
+    """
+    alphabet = "abcdefghijklmnopqrstuvwxyz"
+    # reduction op usually begin with a single tensor
+    input_spec = cast(DTensorSpec, op_schema.args_schema[0])
+    reduce_dims = range(input_spec.ndim) if dims is None else dims
+
+    if not reduction_linear:
+        # if the reduction is not linear, we need to clear the pending sum
+        # on the input spec, also replicate the reducing dimension if the
+        # reducing dimension is sharded, then suggest a resharding
+        reshard_dim_map = input_spec.dim_map
+        needs_reshard = False
+        for dim in reduce_dims:
+            if input_spec.dim_map[dim] != -1:
+                needs_reshard = True
+                reshard_dim_map[dim] = -1
+        needs_reshard = needs_reshard or len(input_spec.sums) > 0
+
+        if needs_reshard:
+            no_partial_spec = DTensorSpec.from_dim_map(
+                input_spec.mesh, reshard_dim_map, [], input_spec.shape
+            )
+            schema_suggestion = OpSchema(
+                op_schema.func_schema, (no_partial_spec,), {}
+            )
+            _inplace_rewrap_schema_suggestion(schema_suggestion, op_schema)
+            return OutputSharding(
+                output_spec=None, schema_suggestions=[schema_suggestion]
+            )
+
+    input_chars = alphabet[: input_spec.ndim]
+
+    if dims is None and not keep_dim:
+        # reducing to a single scalar tensor, we just mark output as empty
+        out_dimchars = ""
+    else:
+        # if keep the reduction dim, we need to keep the dim char by marking
+        # it as a singleton "1" in the out_dimchars
+        reduce_dim_char = ord("1") if keep_dim else None
+        out_dimchars = input_chars.translate(
+            {ord(alphabet[dim]): reduce_dim_char for dim in reduce_dims}
+        )
+    fmt = f"{input_chars}->{out_dimchars}"
+
+    enforce_sharding: Dict[str, int] = {}
+    if op_schema.is_out_variant:
+        out_spec = cast(DTensorSpec, op_schema.kwargs_schema["out"])
+        for out_dimchar, mesh_dim in zip(out_dimchars, out_spec.dim_map):
+            enforce_sharding[out_dimchar] = mesh_dim
+
+    return einop_rule(
+        fmt,
+        op_schema,
+        linearity=reduction_linear,
+        enforce_sharding=enforce_sharding,
+    )
diff --git a/torch/distributed/_tensor/ops/math_ops.py b/torch/distributed/_tensor/ops/math_ops.py
new file mode 100644
index 0000000000000..eb4cd86ed5c66
--- /dev/null
+++ b/torch/distributed/_tensor/ops/math_ops.py
@@ -0,0 +1,141 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import cast, Optional, Sequence
+
+from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.placement_types import DTensorSpec
+from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.common_rules import reduction_rule, pointwise_rule
+from torch.distributed._tensor.ops.utils import register_prop_rule, as_list, normalize_dims
+
+
+def _infer_reduction_dims(
+    dims_arg: object, ndim: int
+) -> Optional[Sequence[int]]:
+    if dims_arg is None:
+        return None
+    dims = cast(Sequence[int], as_list(dims_arg))
+    dims = normalize_dims(dims, ndim)
+    empty_dims = [[0], [-1], []]
+    if ndim == 0 and dims_arg in empty_dims:
+        return None
+    return dims
+
+
+@register_prop_rule("aten.all.default")
+def default_reduction_rule(op_schema: OpSchema) -> OutputSharding:
+    return reduction_rule(op_schema, reduction_linear=True)
+
+
+def sum_rule(op_schema: OpSchema) -> OutputSharding:
+    args_schema = op_schema.args_schema
+    input_spec = cast(DTensorSpec, args_schema[0])
+    dims = None
+    if len(args_schema) > 1:
+        dims = _infer_reduction_dims(args_schema[1], input_spec.ndim)
+
+    keep_dim = len(args_schema) > 2 and bool(args_schema[2])
+    return reduction_rule(
+        op_schema, dims=dims, keep_dim=keep_dim, reduction_linear=True
+    )
+
+
+sum_ops = [
+    "aten.sum.default",
+    "aten.sum.dim_IntList",
+]
+for sum_op in sum_ops:
+    DTensor._op_to_rules[sum_op] = sum_rule
+
+
+@register_prop_rule("aten._softmax.default")
+def softmax_rule(op_schema: OpSchema) -> OutputSharding:
+    input_spec, softmax_dim, _ = op_schema.args_schema
+    input_spec = cast(DTensorSpec, input_spec)
+    softmax_dim = cast(int, softmax_dim)
+    dim_map = input_spec.dim_map
+    if softmax_dim < len(dim_map) and dim_map[softmax_dim] >= 0:
+        raise RuntimeError("Cannot run softmax on sharding dimension!")
+    return OutputSharding(input_spec)
+
+
+@register_prop_rule("aten._softmax_backward_data.default")
+def softmax_bwd_rule(op_schema: OpSchema) -> OutputSharding:
+    grad_out_spec, out_spec, softmax_dim, _ = op_schema.args_schema
+    grad_out_spec = cast(DTensorSpec, grad_out_spec)
+    out_spec = cast(DTensorSpec, out_spec)
+    softmax_dim = cast(int, softmax_dim)
+    grad_out_dim_map = grad_out_spec.dim_map
+    out_dim_map = out_spec.dim_map
+    if softmax_dim < len(grad_out_dim_map) and (
+        grad_out_dim_map[softmax_dim] >= 0 or out_dim_map[softmax_dim] >= 0
+    ):
+        raise RuntimeError(
+            "Cannot run _softmax_backward_data on sharding dimension!"
+        )
+    return pointwise_rule(op_schema)
+
+
+def mean_rule(op_schema: OpSchema) -> OutputSharding:
+    args_schema = op_schema.args_schema
+    input_spec = cast(DTensorSpec, args_schema[0])
+    dims = None
+    # if length of args > 1, we check args to find dims
+    if len(args_schema) > 1:
+        dims = _infer_reduction_dims(args_schema[1], input_spec.ndim)
+
+    keep_dim = len(args_schema) > 2 and bool(args_schema[2])
+    return reduction_rule(
+        op_schema, dims=dims, keep_dim=keep_dim, reduction_linear=False
+    )
+
+
+mean_ops = [
+    "aten.mean.default",
+    "aten.mean.dim",
+    "aten.mean.out",
+]
+
+for mean_op in mean_ops:
+    DTensor._op_to_rules[mean_op] = mean_rule
+
+
+def var_rule(op_schema: OpSchema) -> OutputSharding:
+    args_schema = op_schema.args_schema
+    input_spec = cast(DTensorSpec, args_schema[0])
+    dims = None
+    # if length of args > 1, we check args to find dims, note that
+    # var.default have unbias arg as the first argument, so we want
+    # to check if it's not bool
+    if len(args_schema) > 1 and not isinstance(args_schema[1], bool):
+        dims = _infer_reduction_dims(args_schema[1], input_spec.ndim)
+
+    keep_dim = len(args_schema) > 3 and bool(args_schema[3])
+    return reduction_rule(
+        op_schema, dims=dims, keep_dim=keep_dim, reduction_linear=False
+    )
+
+
+var_ops = [
+    "aten.var.default",
+    "aten.var.dim",
+    "aten.var.out",
+]
+
+for var_op in var_ops:
+    DTensor._op_to_rules[var_op] = var_rule
+
+
+@register_prop_rule("aten.var.correction")
+@register_prop_rule("aten.var.correction_out")
+def var_correction_rule(op_schema: OpSchema) -> OutputSharding:
+    args_schema = op_schema.args_schema
+    input_spec = cast(DTensorSpec, args_schema[0])
+    dims = None
+    if len(args_schema) > 1:
+        dims = _infer_reduction_dims(args_schema[1], input_spec.ndim)
+
+    # keep_dim is a kwarg instead of arg for var.correction
+    keep_dim = cast(bool, op_schema.kwargs_schema.get("keepdim", False))
+    return reduction_rule(
+        op_schema, dims=dims, keep_dim=keep_dim, reduction_linear=False
+    )
diff --git a/torch/distributed/_tensor/ops/matrix_ops.py b/torch/distributed/_tensor/ops/matrix_ops.py
new file mode 100644
index 0000000000000..47988799282e9
--- /dev/null
+++ b/torch/distributed/_tensor/ops/matrix_ops.py
@@ -0,0 +1,129 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# implement matrix related ops for distributed tensor
+from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.common_rules import einop_rule, pointwise_rule
+from torch.distributed._tensor.ops.utils import register_prop_rule
+
+
+def _update_schema_suggestion_for_addmm(
+    output_sharding: OutputSharding,
+    op_schema: OpSchema,
+    pointwise_add_update: bool = True,
+) -> OutputSharding:
+    # schema suggestion coming from output sharding could be:
+    # 1. pointwise add sharding input suggestion
+    # 2. mm sharding input suggestion
+    # inplace update schema suggestion to return addmm suggestion
+    assert output_sharding.schema_suggestions is not None
+    suggestion = output_sharding.schema_suggestions[0]
+    if pointwise_add_update:
+        # update with pointwise suggestion
+        args_schema = (
+            suggestion.args_schema[0],
+            op_schema.args_schema[1],
+            op_schema.args_schema[2],
+        )
+    else:
+        # update with mm suggestion
+        args_schema = (
+            op_schema.args_schema[0],
+            suggestion.args_schema[0],
+            suggestion.args_schema[1],
+        )
+
+    output_sharding.schema_suggestions = [
+        OpSchema(
+            func_schema=op_schema.func_schema,
+            args_schema=args_schema,
+            kwargs_schema=op_schema.kwargs_schema,
+        )
+    ]
+    return output_sharding
+
+
+@register_prop_rule("aten.mm.default")
+def mm_rules(op_schema: OpSchema) -> OutputSharding:
+    return einop_rule("mk,kn->mn", op_schema, linearity=False)
+
+
+@register_prop_rule("aten.addmm.default")
+def addmm_rules(op_schema: OpSchema) -> OutputSharding:
+    input_spec, mat1_spec, mat2_spec = op_schema.args_spec
+    mm_out_sharding = mm_rules(
+        OpSchema(op_schema.func_schema, (mat1_spec, mat2_spec), {})
+    )
+    if mm_out_sharding.output_spec is None:
+        # non-eligible input, suggest addmm input specs
+        if mm_out_sharding.schema_suggestions is not None:
+            # TODO: add more suggestions for resharding
+            return _update_schema_suggestion_for_addmm(
+                mm_out_sharding,
+                op_schema,
+                pointwise_add_update=False,
+            )
+        else:
+            return OutputSharding(None)
+
+    # run point wise rule on input + (mm_out) with linearity
+    output_sharding = pointwise_rule(
+        OpSchema(
+            op_schema.func_schema, (input_spec, mm_out_sharding.output_spec), {}
+        ),
+        linearity=True,
+    )
+    # if propagation failed, edit the schema suggestion from pointwise rules
+    # to return addmm suggestion instead as it's a chained suggestion.
+    if (
+        output_sharding.output_spec is None
+        and output_sharding.schema_suggestions is not None
+    ):
+        return _update_schema_suggestion_for_addmm(output_sharding, op_schema)
+
+    return output_sharding
+
+
+@register_prop_rule("aten.t.default")
+def transpose_rule(op_schema: OpSchema) -> OutputSharding:
+    return einop_rule("ij->ji", op_schema, linearity=True)
+
+
+@register_prop_rule("aten.bmm.default")
+def bmm_rules(op_schema: OpSchema) -> OutputSharding:
+    return einop_rule("bmk,bkn->bmn", op_schema, linearity=False)
+
+
+@register_prop_rule("aten.baddbmm.default")
+def baddbmm_rules(op_schema: OpSchema) -> OutputSharding:
+    input_spec, mat1_spec, mat2_spec = op_schema.args_spec
+    bmm_output_sharding = bmm_rules(
+        OpSchema(op_schema.func_schema, (mat1_spec, mat2_spec), {})
+    )
+    if bmm_output_sharding.output_spec is None:
+        # TODO: add more suggestions
+        if bmm_output_sharding.schema_suggestions is not None:
+            return _update_schema_suggestion_for_addmm(
+                bmm_output_sharding,
+                op_schema,
+                pointwise_add_update=False,
+            )
+        else:
+            return OutputSharding(None)
+
+    # run point wise rule on input + (bmm_out) with linearity
+    output_sharding = pointwise_rule(
+        OpSchema(
+            op_schema.func_schema,
+            (input_spec, bmm_output_sharding.output_spec),
+            {},
+        ),
+        linearity=True,
+    )
+    # if propagation failed, edit the schema suggestion from pointwise rules
+    # to return baddbmm suggestion instead as it's a chained suggestion.
+    if (
+        output_sharding.output_spec is None
+        and output_sharding.schema_suggestions is not None
+    ):
+        return _update_schema_suggestion_for_addmm(output_sharding, op_schema)
+
+    return output_sharding
diff --git a/torch/distributed/_tensor/ops/pointwise_ops.py b/torch/distributed/_tensor/ops/pointwise_ops.py
new file mode 100644
index 0000000000000..6c92eacd1b8b9
--- /dev/null
+++ b/torch/distributed/_tensor/ops/pointwise_ops.py
@@ -0,0 +1,396 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import cast
+
+from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.ops.common_rules import linear_pointwise_rule, pointwise_rule
+from torch.distributed._tensor.placement_types import DTensorSpec, Replicate, _Partial
+from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.utils import register_prop_rule
+
+# leave the remaining pointwise_ops list here for convenience,
+# Below ops are some pointwise ops that are yet to be supported,
+# they might not be a complete list.
+# pointwise_ops = [
+#     "fake_quantize_per_channel_affine",
+#     "fake_quantize_per_tensor_affine",
+#     "floor_divide",  # floor_divide is deprecated
+#     "frexp",  # multiple output pointwise op, need to add support
+#     "gradient",  #  need investigation on this op
+#     "imag",  # complex data type only
+#     "quantized_batch_norm",
+#     "quantized_max_pool1d",
+#     "quantized_max_pool2d",
+#     "real",  # complex data type only
+# ]
+
+
+linear_pointwise_ops = [
+    "aten.div.Scalar",  # this op is linear on the first argument, and the second argument is scalar, so it fits as a linear op.
+    "aten.to.dtype",
+]
+
+
+pointwise_ops = [
+    # please keep the entries below alphabetically sorted
+    "aten.abs.default",
+    "aten.acos.default",
+    "aten.acos.out",
+    "aten.acos_.default",
+    "aten.acosh.default",
+    "aten.acosh.out",
+    "aten.acosh_.default",
+    "aten.add.Scalar",
+    "aten.add.Tensor",
+    "aten.add.out",
+    "aten.add_.Scalar",
+    "aten.add_.Tensor",
+    "aten.addcdiv.default",
+    "aten.addcdiv.out",
+    "aten.addcdiv_.default",
+    "aten.addcmul.default",
+    "aten.addcmul.out",
+    "aten.addcmul_.default",
+    "aten.angle.default",
+    "aten.angle.out",
+    "aten.asin.default",
+    "aten.asin.out",
+    "aten.asin_.default",
+    "aten.asinh.default",
+    "aten.asinh.out",
+    "aten.asinh_.default",
+    "aten.atan.default",
+    "aten.atan.out",
+    "aten.atan2.default",
+    "aten.atan2.out",
+    "aten.atan2_.default",
+    "aten.atan_.default",
+    "aten.atanh.default",
+    "aten.atanh.out",
+    "aten.atanh_.default",
+    "aten.bitwise_and.Scalar",
+    "aten.bitwise_and.Scalar_Tensor",
+    "aten.bitwise_and.Scalar_out",
+    "aten.bitwise_and.Tensor",
+    "aten.bitwise_and.Tensor_out",
+    "aten.bitwise_and_.Scalar",
+    "aten.bitwise_and_.Tensor",
+    "aten.bitwise_left_shift.Scalar_Tensor",
+    "aten.bitwise_left_shift.Tensor",
+    "aten.bitwise_left_shift.Tensor_Scalar",
+    "aten.bitwise_left_shift.Tensor_Scalar_out",
+    "aten.bitwise_left_shift.Tensor_out",
+    "aten.bitwise_left_shift_.Tensor",
+    "aten.bitwise_left_shift_.Tensor_Scalar",
+    "aten.bitwise_not.default",
+    "aten.bitwise_not.out",
+    "aten.bitwise_not_.default",
+    "aten.bitwise_or.Scalar",
+    "aten.bitwise_or.Scalar_Tensor",
+    "aten.bitwise_or.Scalar_out",
+    "aten.bitwise_or.Tensor",
+    "aten.bitwise_or.Tensor_out",
+    "aten.bitwise_or_.Scalar",
+    "aten.bitwise_or_.Tensor",
+    "aten.bitwise_right_shift.Scalar_Tensor",
+    "aten.bitwise_right_shift.Tensor",
+    "aten.bitwise_right_shift.Tensor_Scalar",
+    "aten.bitwise_right_shift.Tensor_Scalar_out",
+    "aten.bitwise_right_shift.Tensor_out",
+    "aten.bitwise_right_shift_.Tensor",
+    "aten.bitwise_right_shift_.Tensor_Scalar",
+    "aten.bitwise_xor.Scalar",
+    "aten.bitwise_xor.Scalar_Tensor",
+    "aten.bitwise_xor.Scalar_out",
+    "aten.bitwise_xor.Tensor",
+    "aten.bitwise_xor.Tensor_out",
+    "aten.bitwise_xor_.Scalar",
+    "aten.bitwise_xor_.Tensor",
+    "aten.ceil.default",
+    "aten.ceil.out",
+    "aten.ceil_.default",
+    "aten.clamp.default",
+    "aten.clamp.out",
+    "aten.clamp_.default",
+    "aten.clip.default",
+    "aten.clip.out",
+    "aten.clip_.default",
+    "aten.conj_physical.default",
+    "aten.conj_physical.out",
+    "aten.conj_physical_.default",
+    "aten.copy_sign.Scalar",
+    "aten.copy_sign.Scalar_out",
+    "aten.copy_sign.Tensor",
+    "aten.copy_sign.out",
+    "aten.copy_sign_.Scalar",
+    "aten.copy_sign_.Tensor",
+    "aten.cos.default",
+    "aten.cos.out",
+    "aten.cos_.default",
+    "aten.cosh.default",
+    "aten.cosh.out",
+    "aten.cosh_.default",
+    "aten.deg2rad.default",
+    "aten.deg2rad.out",
+    "aten.deg2rad_.default",
+    "aten.digamma.default",
+    "aten.digamma.out",
+    "aten.digamma_.default",
+    "aten.div.Tensor",
+    "aten.div.Tensor_mode",
+    "aten.div.out",
+    "aten.div.out_mode",
+    "aten.div_.Tensor",
+    "aten.div_.Tensor_mode",
+    "aten.eq.Tensor",
+    "aten.eq.Tensor_out",
+    "aten.eq.Scalar",
+    "aten.eq.Scalar_out",
+    "aten.equal.default",
+    "aten.erf.default",
+    "aten.erf.out",
+    "aten.erf_.default",
+    "aten.erfc.default",
+    "aten.erfc.out",
+    "aten.erfc_.default",
+    "aten.erfinv.default",
+    "aten.erfinv.out",
+    "aten.erfinv_.default",
+    "aten.exp.default",
+    "aten.exp.out",
+    "aten.exp2.default",
+    "aten.exp2.out",
+    "aten.exp2_.default",
+    "aten.exp_.default",
+    "aten.expm1.default",
+    "aten.expm1.out",
+    "aten.expm1_.default",
+    "aten.float_power.Scalar",
+    "aten.float_power.Scalar_out",
+    "aten.float_power.Tensor_Scalar",
+    "aten.float_power.Tensor_Scalar_out",
+    "aten.float_power.Tensor_Tensor",
+    "aten.float_power.Tensor_Tensor_out",
+    "aten.float_power_.Scalar",
+    "aten.float_power_.Tensor",
+    "aten.floor.default",
+    "aten.floor.out",
+    "aten.floor_.default",
+    "aten.fmod.Scalar",
+    "aten.fmod.Scalar_out",
+    "aten.fmod.Tensor",
+    "aten.fmod.Tensor_out",
+    "aten.fmod_.Scalar",
+    "aten.fmod_.Tensor",
+    "aten.frac.default",
+    "aten.frac.out",
+    "aten.frac_.default",
+    "aten.ge.Scalar",
+    "aten.ge.Tensor",
+    "aten.gelu.default",
+    "aten.gt.Scalar",
+    "aten.gt.Tensor",
+    "aten.hypot.default",
+    "aten.hypot.out",
+    "aten.hypot_.default",
+    "aten.i0.default",
+    "aten.i0.out",
+    "aten.i0_.default",
+    "aten.igamma.default",
+    "aten.igamma.out",
+    "aten.igamma_.default",
+    "aten.igammac.default",
+    "aten.igammac.out",
+    "aten.igammac_.default",
+    "aten.isnan.default",
+    "aten.ldexp.default",
+    "aten.ldexp.out",
+    "aten.ldexp_.default",
+    "aten.le.Scalar",
+    "aten.le.Tensor",
+    "aten.lerp.Scalar",
+    "aten.lerp.Scalar_out",
+    "aten.lerp.Tensor",
+    "aten.lerp.Tensor_out",
+    "aten.lerp_.Scalar",
+    "aten.lerp_.Tensor",
+    "aten.lgamma.default",
+    "aten.lgamma.out",
+    "aten.lgamma_.default",
+    "aten.log.default",
+    "aten.log.out",
+    "aten.log10.default",
+    "aten.log10.out",
+    "aten.log10_.default",
+    "aten.log1p.default",
+    "aten.log1p.out",
+    "aten.log1p_.default",
+    "aten.log2.default",
+    "aten.log2.out",
+    "aten.log2_.default",
+    "aten.log_.default",
+    "aten.logaddexp.default",
+    "aten.logaddexp.out",
+    "aten.logaddexp2.default",
+    "aten.logaddexp2.out",
+    "aten.logical_and.default",
+    "aten.logical_and.out",
+    "aten.logical_and_.default",
+    "aten.logical_not.default",
+    "aten.logical_not.out",
+    "aten.logical_not_.default",
+    "aten.logical_or.default",
+    "aten.logical_or.out",
+    "aten.logical_or_.default",
+    "aten.logical_xor.default",
+    "aten.logical_xor.out",
+    "aten.logical_xor_.default",
+    "aten.logit.default",
+    "aten.logit.out",
+    "aten.logit_.default",
+    "aten.masked_fill.Scalar",
+    "aten.mul.Scalar",
+    "aten.mul.Tensor",
+    "aten.mul.out",
+    "aten.mul_.Scalar",
+    "aten.mul_.Tensor",
+    "aten.mvlgamma.default",
+    "aten.mvlgamma.out",
+    "aten.mvlgamma_.default",
+    "aten.native_dropout_backward.default",
+    "aten.native_dropout_backward.out",
+    "aten.nan_to_num.default",
+    "aten.nan_to_num.out",
+    "aten.nan_to_num_.default",
+    "aten.ne.Scalar",
+    "aten.neg.default",
+    "aten.neg.out",
+    "aten.neg_.default",
+    "aten.nextafter.default",
+    "aten.nextafter.out",
+    "aten.nextafter_.default",
+    "aten.polygamma.default",
+    "aten.polygamma.out",
+    "aten.polygamma_.default",
+    "aten.positive.default",
+    "aten.pow.Scalar",
+    "aten.pow.Scalar_out",
+    "aten.pow.Tensor_Scalar",
+    "aten.pow.Tensor_Scalar_out",
+    "aten.pow.Tensor_Tensor",
+    "aten.pow.Tensor_Tensor_out",
+    "aten.pow_.Scalar",
+    "aten.pow_.Tensor",
+    "aten.reciprocal.default",
+    "aten.reciprocal.out",
+    "aten.reciprocal_.default",
+    "aten.red2deg.default",
+    "aten.red2deg.out",
+    "aten.red2deg_.default",
+    "aten.relu.default",
+    "aten.relu_.default",
+    "aten.remainder.Scalar",
+    "aten.remainder.Scalar_Tensor",
+    "aten.remainder.Scalar_out",
+    "aten.remainder.Tensor",
+    "aten.remainder.Tensor_out",
+    "aten.remainder_.Scalar",
+    "aten.remainder_.Tensor",
+    "aten.round.decimals",
+    "aten.round.decimals_out",
+    "aten.round.default",
+    "aten.round.out",
+    "aten.round_.decimals",
+    "aten.round_.default",
+    "aten.rsqrt.default",
+    "aten.rsqrt.out",
+    "aten.rsqrt_.default",
+    "aten.rsub.Scalar",
+    "aten.sgn.default",
+    "aten.sgn.out",
+    "aten.sgn_.default",
+    "aten.sigmoid.default",
+    "aten.sigmoid.out",
+    "aten.sigmoid_.default",
+    "aten.sign.default",
+    "aten.sign.out",
+    "aten.sign_.default",
+    "aten.signbit.default",
+    "aten.signbit.out",
+    "aten.sin.default",
+    "aten.sin.out",
+    "aten.sin_.default",
+    "aten.sinc.default",
+    "aten.sinc.out",
+    "aten.sinc_.default",
+    "aten.sinh.default",
+    "aten.sinh.out",
+    "aten.sinh_.default",
+    "aten.sqrt.default",
+    "aten.sqrt.out",
+    "aten.sqrt_.default",
+    "aten.square.default",
+    "aten.square.out",
+    "aten.square_.default",
+    "aten.sub.Scalar",
+    "aten.sub.Tensor",
+    "aten.sub.out",
+    "aten.sub_.Scalar",
+    "aten.sub_.Tensor",
+    "aten.tan.default",
+    "aten.tan.out",
+    "aten.tan_.default",
+    "aten.tanh.default",
+    "aten.tanh.out",
+    "aten.tanh_.default",
+    "aten.true_divide.Tensor",
+    "aten.trunc.default",
+    "aten.trunc.out",
+    "aten.trunc_.default",
+    "aten.where.self",
+    "aten.xlogy.OutScalar_Self",
+    "aten.xlogy.OutTensor",
+    "aten.xlogy.Scalar_other",
+    "aten.xlogy.Scalar_self",
+    "aten.xlogy.Tensor",
+    "aten.xlogy_.OutScalar_Other",
+    "aten.xlogy_.Scalar_other",
+    "aten.xlogy_.Tensor",
+    "prims.convert_element_type.default",
+    # backward point-wise ops
+    # please keep the entries below alphabetically sorted
+    "aten.gelu_backward.default",
+    "aten.sigmoid_backward.default",
+    "aten.tanh_backward.default",
+    "aten.threshold_backward.default",
+]
+
+
+for op in linear_pointwise_ops:
+    DTensor._op_to_rules[op] = linear_pointwise_rule
+
+
+for op in pointwise_ops:
+    DTensor._op_to_rules[op] = pointwise_rule
+
+
+@register_prop_rule("aten.native_dropout.default")
+def dropout_rule(op_schema: OpSchema) -> OutputSharding:
+    self_spec = cast(DTensorSpec, op_schema.args_schema[0])
+
+    # TODO: We are specializing dropout_rule now because it's
+    # a non-deterministic algorithm, and replication does not
+    # not support non-deterministic op yet. We should remove
+    # this rule and make dropout to use pointwise rule instead
+    # once we support non-deterministic op.
+    replicate_or_partial = False
+    for placement in self_spec.placements:
+        if isinstance(placement, (Replicate, _Partial)):
+            replicate_or_partial = True
+            break
+
+    if replicate_or_partial:
+        return OutputSharding(
+            None, failed_reason="Dropout with replication is not supported yet!"
+        )
+    else:
+        return OutputSharding(self_spec)
diff --git a/torch/distributed/_tensor/ops/tensor_ops.py b/torch/distributed/_tensor/ops/tensor_ops.py
new file mode 100644
index 0000000000000..f386e1fdb9fd1
--- /dev/null
+++ b/torch/distributed/_tensor/ops/tensor_ops.py
@@ -0,0 +1,481 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import torch
+from torch.distributed._tensor.api import (
+    DTensor,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+    _Partial,
+)
+from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.common_rules import pointwise_rule
+from torch.distributed._tensor.ops.utils import register_prop_rule
+from typing import List, Optional, Sequence, Tuple, cast
+
+
+# NOTE: the default propagation rule should apply for
+# any operator that does not return a DTensor, i.e.
+# for operators that only returns int/float/bool, we by
+# default still propagate the spec, this is to ensure
+# that we only return None for the case where the sharding
+# propagation failed, and we should do auto-redistribute
+def default_prop_rule(op_schema: OpSchema) -> OutputSharding:
+    # by default prop the first arg spec
+    return OutputSharding(op_schema.args_spec[0])
+
+
+def prop_create_like(op_schema: OpSchema) -> OutputSharding:
+    # For operators that create tensors with same shape as input but
+    # with specific content that does not depend on the input, we
+    # can propagate Sharding, but we have to make sure we move from
+    # partial to replicated.
+    input_spec = op_schema.args_spec[0]
+    output_spec = DTensorSpec(
+        mesh=input_spec.mesh,
+        placements=tuple(
+            Replicate() if isinstance(p, _Partial) else p
+            for p in input_spec.placements
+        ),
+        ndim=input_spec.ndim,
+        shape=input_spec.shape,
+    )
+    return OutputSharding(output_spec=output_spec)
+
+
+# some tensor ops should not support shard, i.e. local_scalar_dense
+# shouldn't work for shard as it requires numel == 1
+def no_shard_prop_rule(op_schema: OpSchema) -> OutputSharding:
+    # by default prop the first arg spec
+    tensor_spec = op_schema.args_spec[0]
+    for placement in tensor_spec.placements:
+        if placement.is_shard():
+            return OutputSharding(
+                None,
+                failed_reason=f"Op does not support input placements "
+                f"with `Shard`, but found placements: "
+                f"{tensor_spec.placements}",
+            )
+    # otherwise default prop the first arg spec
+    return OutputSharding(tensor_spec)
+
+
+def new_factory_rule(op_schema: OpSchema) -> OutputSharding:
+    # this op would benefit from backward sharding propagation!
+    # Since we cannot do that yet, just return replicated
+    input = op_schema.args_schema[0]
+    size = torch.Size(cast(Sequence[int], op_schema.args_schema[1]))
+    assert isinstance(input, DTensorSpec)
+
+    return OutputSharding(
+        output_spec=DTensorSpec(
+            mesh=input.mesh,
+            placements=[Replicate()] * input.mesh.ndim,
+            shape=size,
+            ndim=len(size),
+        )
+    )
+
+
+default_prop_ops = [
+    "aten._to_copy.default",
+    "aten.clone.default",
+    "aten.contiguous.default",
+    "aten.copy_.default",
+    "aten.detach.default",
+    "aten.is_same_size.default",
+    "aten.new_empty_strided.default",
+]
+
+create_like_ops = [
+    "aten.empty_like.default",
+    "aten.fill_.Scalar",
+    "aten.full_like.default",
+    "aten.ones_like.default",
+    "aten.zero_.default",
+    "aten.zeros_like.default",
+]
+
+new_factory_ops = [
+    "aten.new_full.default",
+    "aten.new_ones.default",
+    "aten.new_zeros.default",
+]
+
+no_shard_prop_ops = ["aten._local_scalar_dense.default"]
+
+for op in default_prop_ops:
+    DTensor._op_to_rules[op] = default_prop_rule
+
+for op in create_like_ops:
+    DTensor._op_to_rules[op] = prop_create_like
+
+for op in no_shard_prop_ops:
+    DTensor._op_to_rules[op] = no_shard_prop_rule
+
+for op in new_factory_ops:
+    DTensor._op_to_rules[op] = new_factory_rule
+
+
+@register_prop_rule("aten.bucketize.Tensor")
+def prop_bucketize(op_schema: OpSchema) -> OutputSharding:
+    """
+    Point-wise on the first input (just propagate input sharding).
+    Expect replicated for second input.
+    """
+    input_schema, boundaries = op_schema.args_schema
+    assert isinstance(input_schema, DTensorSpec)
+    assert isinstance(boundaries, DTensorSpec)
+
+    if all(isinstance(p, Replicate) for p in boundaries.placements):
+        return OutputSharding(output_spec=input_schema)
+    else:
+        return OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    func_schema=op_schema.func_schema,
+                    args_schema=(
+                        input_schema,
+                        DTensorSpec(
+                            mesh=boundaries.mesh,
+                            placements=[Replicate()]
+                            * len(boundaries.placements),
+                            ndim=boundaries.ndim,
+                            shape=boundaries.shape,
+                        ),
+                    ),
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
+
+
+def unshard_tensor_dim(
+    placements: Sequence[Placement], dim: int
+) -> Sequence[Placement]:
+    """Disallow the given tensor dimension to be sharded"""
+    return tuple(
+        p if (not isinstance(p, Shard) or p.dim != dim) else Replicate()
+        for p in placements
+    )
+
+
+def _prop_all_but_dim(
+    op_schema: OpSchema, dim: int, out_shape: torch.Size
+) -> OutputSharding:
+    """
+    Considering an op that takes its input as first argument, forwards all shardings
+    except for the given dimension.
+    """
+    input_spec = op_schema.args_schema[0]
+    assert isinstance(input_spec, DTensorSpec)
+
+    output_placements = unshard_tensor_dim(input_spec.placements, dim=dim)
+    output_spec = DTensorSpec(
+        mesh=input_spec.mesh,
+        placements=output_placements,
+        shape=out_shape,
+        ndim=input_spec.ndim,
+    )
+
+    if input_spec.placements == output_placements:
+        out = OutputSharding(output_spec=output_spec)
+    else:
+        suggested_input_spec = DTensorSpec(
+            mesh=input_spec.mesh,
+            placements=output_placements,
+            ndim=input_spec.ndim,
+            shape=input_spec.shape,
+        )
+        out = OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    func_schema=op_schema.func_schema,
+                    args_schema=(suggested_input_spec,)
+                    + op_schema.args_schema[1:],
+                    kwargs_schema=op_schema.kwargs_schema,
+                ),
+            ],
+        )
+    return out
+
+
+@register_prop_rule("aten.slice.Tensor")
+def prop_slice(op_schema: OpSchema) -> OutputSharding:
+    """NOTE: can be further optimized (right now it replicates before slicing on a sharded dimension)"""
+    defaults = (None, 0, None, None, 1)
+    input_spec, dim, start, end, step = (
+        op_schema.args_schema + defaults[len(op_schema.args_schema) :]
+    )
+    assert isinstance(input_spec, DTensorSpec)
+    assert isinstance(dim, int)
+    assert start is None or isinstance(start, int)
+    assert end is None or isinstance(end, int)
+    assert isinstance(step, int)
+
+    # normalize arguments
+    if dim < 0:
+        dim += input_spec.ndim
+    if start is None:
+        start = 0
+    if step is None:
+        step = 1
+    if end is None or end > input_spec.shape[dim]:
+        end = input_spec.shape[dim]
+    if start < 0:
+        start += input_spec.shape[dim]
+    if end < 0:
+        end += input_spec.shape[dim]
+
+    if start == 0 and end == input_spec.shape[dim] and step == 1:
+        return OutputSharding(output_spec=input_spec)
+
+    # shape propagation
+    slice_len = (end - start + step - 1) // step
+    out_shape = torch.Size(
+        tuple(input_spec.shape[0:dim])
+        + (slice_len,)
+        + tuple(input_spec.shape[dim + 1 :])
+    )
+
+    return _prop_all_but_dim(op_schema, dim=dim, out_shape=out_shape)
+
+
+@register_prop_rule("aten.slice_scatter.default")
+def prop_slice_scatter(op_schema: OpSchema) -> OutputSharding:
+    # 1. number of dimensions in input and src need to match.
+    # 2. number of elements on all non-dim need to match between input and src.
+    # 3. numer of elements in src in dim need to match the slice size.
+    # Given the above:
+    # - We suggest for src to follow the sharding of input, except on the scatter dimension,
+    #   where our best bet for now is to make them replicated as a fall-back.
+    #   TODO: Ideally we'd like to make sure the output is re-sharded afterwards to keep input sharding.
+
+    defaults = (None, None, 0, None, None, 1)
+    input, src, dim, start, end, step = (
+        op_schema.args_schema + defaults[len(op_schema.args_schema) :]
+    )
+    assert isinstance(input, DTensorSpec)
+    assert isinstance(src, DTensorSpec)
+    assert isinstance(dim, int)
+
+    if dim < 0:
+        dim += input.ndim
+
+    # first, we keep the input sharding, except for the input dimension
+    # also, we cannot allow partial sum anymore.
+    input_suggestion = tuple(
+        Replicate()
+        if isinstance(p, _Partial) or (isinstance(p, Shard) and p.dim == dim)
+        else p
+        for p in input.placements
+    )
+
+    if input_suggestion == tuple(input.placements) and src.placements == tuple(
+        input.placements
+    ):
+        # if our sharding is correct, the output sharding will be the same as the input.
+        return OutputSharding(
+            output_spec=DTensorSpec(
+                mesh=input.mesh,
+                placements=input.placements,
+                shape=input.shape,
+                ndim=input.ndim,
+            )
+        )
+    else:
+        # otherwise, return the suggestion.
+        return OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    func_schema=op_schema.func_schema,
+                    args_schema=(
+                        DTensorSpec(
+                            mesh=input.mesh,
+                            placements=input_suggestion,
+                            shape=input.shape,
+                            ndim=input.ndim,
+                        ),
+                        DTensorSpec(
+                            mesh=src.mesh,
+                            placements=input_suggestion,
+                            shape=src.shape,
+                            ndim=src.ndim,
+                        ),
+                    )
+                    + op_schema.args_schema[2:],
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
+
+
+@register_prop_rule("aten.index_select.default")
+def prop_index_select(op_schema: OpSchema) -> OutputSharding:
+    values_spec, dim, indices_spec = op_schema.args_schema
+
+    assert isinstance(values_spec, DTensorSpec)
+    assert isinstance(dim, int)
+    assert isinstance(indices_spec, DTensorSpec)
+
+    all_indices_spec: List[Optional[DTensorSpec]] = [
+        indices_spec if dim == i else None for i in range(values_spec.ndim)
+    ]
+
+    result = prop_index(
+        OpSchema(
+            func_schema=op_schema.func_schema,
+            args_schema=(values_spec, all_indices_spec),
+            kwargs_schema=op_schema.kwargs_schema,
+        )
+    )
+    if result.schema_suggestions:
+        result.schema_suggestions = [
+            OpSchema(
+                func_schema=op_schema.func_schema,
+                args_schema=(s.args_schema[0], dim, s.args_schema[1][dim]),
+                kwargs_schema=op_schema.kwargs_schema,
+            )
+            for s in result.schema_suggestions
+        ]
+    return result
+
+
+@register_prop_rule("aten.index.Tensor")
+def prop_index(op_schema: OpSchema) -> OutputSharding:
+    """
+    Expect replicated on the first input; _mostly_ pointwise on the second input.
+    TODO: exception: when the dtype of second input is "bool", then a torch.nonzero needs to be triggered first.
+    """
+    # Current sharding constraints:
+    # For values:
+    #   1. We currently require that the dimension of values_spec be replicated or partial
+    #      if they are being indexed on.
+    #   2. Other dimensions of values_spec can remain sharded if they are so.
+    # For indices:
+    #   Indices can be either sharded or replicated. All index tensors need to be sharded
+    #   in a compatible way, following the pointwise rule (including resolving _Partial
+    #   into either sharded or replicated)
+
+    values_spec, multi_indices_spec = op_schema.args_schema
+    assert isinstance(values_spec, DTensorSpec)
+    assert isinstance(multi_indices_spec, list)
+    multi_indices_spec = cast(List[Optional[DTensorSpec]], multi_indices_spec)
+    valid_indices_spec: List[Tuple[int, DTensorSpec]] = [
+        (i, a) for i, a in enumerate(multi_indices_spec) if a is not None
+    ]
+
+    # 1. All indices have to be sharded equally. Moreover, indices can be broadcast.
+    #    Here, we piggyback on the pointwise sharding rule for indices.
+    indices_out = pointwise_rule(
+        OpSchema(
+            func_schema=op_schema.func_schema,
+            args_schema=tuple(v[1] for v in valid_indices_spec),
+            kwargs_schema={},
+        )
+    )
+    need_reshard_on_indices = indices_out.output_spec is None
+
+    if not need_reshard_on_indices:
+        # this means that our inputs are already sharded properly and we will use that as our indices_spec
+        assert isinstance(indices_out.output_spec, DTensorSpec)
+        indices_spec: DTensorSpec = indices_out.output_spec
+    else:
+        assert indices_out.schema_suggestions is not None
+        valid_indices_suggestion = indices_out.schema_suggestions[0]
+        for i, v in enumerate(valid_indices_suggestion.args_spec):
+            multi_indices_spec[valid_indices_spec[i][0]] = v
+        # we'll need to call pointwise_rule again to see what's our ideal indices_spec and then
+        # use that to compute our ideal values_spec
+        indices_output_spec = pointwise_rule(
+            valid_indices_suggestion
+        ).output_spec
+        assert isinstance(indices_output_spec, DTensorSpec)
+        indices_spec = indices_output_spec
+
+    lookup_dims = set(v[0] for v in valid_indices_spec)
+
+    need_reshard_on_values = tuple(
+        (
+            isinstance(vp, Shard)
+            and (vp.dim in lookup_dims or isinstance(ip, Shard))
+        )
+        for vp, ip in zip(values_spec.placements, indices_spec.placements)
+    )
+
+    if not need_reshard_on_indices and not any(need_reshard_on_values):
+
+        value_placements = values_spec.placements
+        value_shape = values_spec.shape
+
+        all_dims_consecutive = all(
+            b[0] - a[0] == 1
+            for b, a in zip(valid_indices_spec[1:], valid_indices_spec[:-1])
+        )
+        if all_dims_consecutive:
+            # if all index vectors are consecutives, insert at the dimension of the first index
+            insert_dim: int = valid_indices_spec[0][0]
+        else:
+            # else, insert on the first dimension
+            insert_dim = 0
+
+        def place(vp: Placement, ip: Placement) -> Placement:
+            if isinstance(vp, Shard):
+                return Shard(
+                    vp.dim
+                    if vp.dim < insert_dim
+                    # accounts for the offset in output dimensions
+                    else vp.dim
+                    + indices_spec.ndim
+                    - sum(1 if vp.dim > v[0] else 0 for v in valid_indices_spec)
+                )
+            if isinstance(ip, Shard):
+                return Shard(ip.dim + insert_dim)
+            # _Partial or Replicated
+            return vp
+
+        value_placements = tuple(
+            place(vp, ip)
+            for vp, ip in zip(values_spec.placements, indices_spec.placements)
+        )
+        value_shape = torch.Size(
+            tuple(value_shape[:insert_dim])
+            + tuple(indices_spec.shape)
+            + tuple(value_shape[insert_dim + len(valid_indices_spec) :])
+        )
+
+        result = OutputSharding(
+            output_spec=DTensorSpec(
+                mesh=values_spec.mesh,
+                placements=value_placements,
+                shape=value_shape,
+                ndim=len(value_shape),
+            )
+        )
+        return result
+    else:
+        result = OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    func_schema=op_schema.func_schema,
+                    args_schema=(
+                        DTensorSpec(
+                            mesh=values_spec.mesh,
+                            placements=[
+                                Replicate() if need_reshard_on_values[i] else v
+                                for i, v in enumerate(values_spec.placements)
+                            ],
+                            ndim=values_spec.ndim,
+                            shape=values_spec.shape,
+                        ),
+                        multi_indices_spec,
+                    ),
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
+        return result
diff --git a/torch/distributed/_tensor/ops/tp_sharding_ops.py b/torch/distributed/_tensor/ops/tp_sharding_ops.py
new file mode 100644
index 0000000000000..01db8920e6747
--- /dev/null
+++ b/torch/distributed/_tensor/ops/tp_sharding_ops.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# implement matrix related ops for distributed tensor
+import torch
+import torch.utils._pytree as pytree
+from typing import List
+from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.utils import unwrap_local_tensor
+from torch.distributed._tensor.ops.utils import unwrap_single_placement, register_impl
+
+"""
+The ops below were quickly hacked and needed to be polished down the road.
+Although they come with unit tests already, the logic is directly borrowed
+from ShardedTensor. We need to also make it work for all placement types
+of DTensor and all corner cases for sharded distributed tensor.
+"""
+
+
+@register_impl("aten.cat.default")
+def dist_cat(tensor_list: List[DTensor], dim: int = 0) -> DTensor:
+    local_inputs = pytree.tree_map(unwrap_local_tensor, tensor_list)
+    local_tensor = torch.ops.aten.concat(local_inputs, dim=dim)
+    return DTensor.from_local(
+        local_tensor,
+        tensor_list[0].device_mesh,
+        tensor_list[0].placements,
+        run_check=False,
+    )
+
+
+@register_impl("aten.split.Tensor")
+# pyre-fixme[2]: Parameter must be annotated.
+def dist_split(self: DTensor, split_size_or_sections, dim=0) -> List[DTensor]:
+    local_mat = pytree.tree_map(unwrap_local_tensor, self)
+    mat_placement = pytree.tree_map(unwrap_single_placement, self)
+    sharding_dim = mat_placement.dim
+    world_size = self.device_mesh.size(dim=0)
+    if dim < 0:
+        dim = self.dim() + dim
+    if sharding_dim < 0:
+        sharding_dim = self.dim() + sharding_dim
+    if dim == sharding_dim:
+        if type(split_size_or_sections) is list:
+            split_size_or_sections[sharding_dim] //= world_size
+        else:
+            split_size_or_sections //= world_size
+    tensor_list = local_mat.split(split_size_or_sections, dim=dim)
+    return [
+        DTensor.from_local(
+            tensor,
+            self.device_mesh,
+            [mat_placement],
+            run_check=False,
+        )
+        for tensor in tensor_list
+    ]
diff --git a/torch/distributed/_tensor/ops/utils.py b/torch/distributed/_tensor/ops/utils.py
new file mode 100644
index 0000000000000..42db7142638a5
--- /dev/null
+++ b/torch/distributed/_tensor/ops/utils.py
@@ -0,0 +1,81 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import functools
+import operator
+
+import torch
+from typing import List, Union, Sequence, Iterable
+from torch.distributed._tensor.api import DTensor
+
+
+# pyre-fixme[3]: Return type must be annotated.
+# pyre-fixme[2]: Parameter must be annotated.
+def unwrap_single_placement(e):
+    if not isinstance(e, DTensor):
+        return None
+    assert len(e.placements) == 1, "more than one placement!"
+    return e.placements[0]
+
+
+# convenient wrapper to register custom operator impls
+# pyre-fixme[3]: Return type must be annotated.
+# pyre-fixme[2]: Parameter must be annotated.
+def register_impl(func):
+    # pyre-fixme[53]: Captured variable `func` is not annotated.
+    # pyre-fixme[3]: Return type must be annotated.
+    # pyre-fixme[2]: Parameter must be annotated.
+    def wrapper(impl):
+        DTensor._custom_dispatch_ops[func] = impl
+        return impl
+
+    return wrapper
+
+
+# convenient wrapper to register sharding propagation rules
+# pyre-fixme[3]: Return type must be annotated.
+# pyre-fixme[2]: Parameter must be annotated.
+def register_prop_rule(func):
+    # pyre-fixme[53]: Captured variable `func` is not annotated.
+    # pyre-fixme[3]: Return type must be annotated.
+    # pyre-fixme[2]: Parameter must be annotated.
+    def wrapper(impl):
+        DTensor._op_to_rules[func] = impl
+        return impl
+
+    return wrapper
+
+
+def as_list(
+    x: Union[List[object], object]
+    # pyre-fixme[11]: Annotation `immutable_list` is not defined as a type.
+) -> Union[List[object], torch.fx.immutable_collections.immutable_list]:
+    # During tracing, `aten.sum.dim_IntList` uses `immutable_list` for its args,
+    # which is an object but treated as a list by the tracer. Therefore, keep
+    # `immutable_list` intact here as well.
+    if type(x) is list or isinstance(
+        x, torch.fx.immutable_collections.immutable_list
+    ):
+        return x
+    else:
+        return [x]
+
+
+def normalize_dim(dim: int, ndim: int) -> int:
+    return dim if dim >= 0 else dim + ndim
+
+
+def normalize_dims(dims: Union[int, Sequence[int]], ndim: int) -> Sequence[int]:
+    """
+    normalize a dim or a sequence of dims, so that they
+    are all positive.
+    """
+    if isinstance(dims, int):
+        dims = (normalize_dim(dims, ndim),)
+    elif isinstance(dims, list):
+        dims = [normalize_dim(dim, ndim) for dim in dims]
+    elif isinstance(dims, tuple):
+        dims = tuple([normalize_dim(dim, ndim) for dim in dims])
+    return dims
+
+
+def prod(xs: Iterable[int]) -> int:
+    return functools.reduce(operator.mul, xs, 1)

From 873ed7a0a62d2567a245ace192ebe6a8c6134a06 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 15 Nov 2022 08:04:38 +0000
Subject: [PATCH 0949/1922] [dtensor] PART 4: move remaining DTensor ops to
 core distributed (#88550)

This PR moves the view related DTensor ops to core distributed,
tests will be add in follow up PRs

part of https://github.com/pytorch/pytorch/issues/88838
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88550
Approved by: https://github.com/fduwjj
---
 torch/distributed/_tensor/__init__.py     |   2 +-
 torch/distributed/_tensor/ops/__init__.py |   2 +-
 torch/distributed/_tensor/ops/view_ops.py | 707 ++++++++++++++++++++++
 3 files changed, 709 insertions(+), 2 deletions(-)
 create mode 100644 torch/distributed/_tensor/ops/view_ops.py

diff --git a/torch/distributed/_tensor/__init__.py b/torch/distributed/_tensor/__init__.py
index ba09f2fbb6907..32a57146bc939 100644
--- a/torch/distributed/_tensor/__init__.py
+++ b/torch/distributed/_tensor/__init__.py
@@ -9,7 +9,7 @@
 
 
 # Import all builtin dist tensor ops
-# import torch.distributed._tensor.ops
+import torch.distributed._tensor.ops
 
 
 def distribute_tensor(
diff --git a/torch/distributed/_tensor/ops/__init__.py b/torch/distributed/_tensor/ops/__init__.py
index 5012768ee0519..5550b2ffae085 100644
--- a/torch/distributed/_tensor/ops/__init__.py
+++ b/torch/distributed/_tensor/ops/__init__.py
@@ -4,4 +4,4 @@
 from .tensor_ops import *  # noqa: F403
 from .tp_sharding_ops import *  # noqa: F403
 from .pointwise_ops import *  # noqa: F403
-# from .view_ops import *  # noqa: F403
+from .view_ops import *  # noqa: F403
diff --git a/torch/distributed/_tensor/ops/view_ops.py b/torch/distributed/_tensor/ops/view_ops.py
new file mode 100644
index 0000000000000..a8849b2ed14bf
--- /dev/null
+++ b/torch/distributed/_tensor/ops/view_ops.py
@@ -0,0 +1,707 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from dataclasses import dataclass
+from typing import (
+    Callable,
+    Dict,
+    Iterable,
+    Optional,
+    Tuple,
+    Set,
+    Union,
+    Sequence,
+    cast,
+)
+
+import torch
+from torch import Tensor
+
+from torch.distributed._tensor.placement_types import DTensorSpec, Placement, Replicate
+from torch.distributed._tensor.api import Shard
+from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.utils import (
+    normalize_dim,
+    normalize_dims,
+    prod,
+    register_prop_rule,
+)
+
+
+Shape = Tuple[int, ...]
+
+
+@dataclass
+class DimSpec:
+    """Specifies how an output dimension maps to an input dimension."""
+
+    def inputs(self) -> Iterable["DimSpec"]:
+        return ()
+
+
+# Rules that map each dimension of the output to dimensions of the input tensor
+DimMap = Tuple[DimSpec, ...]
+
+
+@dataclass
+class Singleton(DimSpec):
+    """Output dimension is a singleton"""
+
+    pass
+
+
+@dataclass
+class InputDim(DimSpec):
+    """Output dimension maps directly to an input dimension."""
+
+    input_dim: int
+
+
+@dataclass
+class Broadcast(DimSpec):
+    """Output is the broadcast of a singleton input dimension."""
+
+    dim: DimSpec
+    dim_size: int
+
+    @classmethod
+    def new(cls, dim: DimSpec, dim_size: int) -> DimSpec:
+        return Broadcast(dim, dim_size)
+
+    def inputs(self) -> Iterable[DimSpec]:
+        return (self.dim,)
+
+
+@dataclass
+class NewDim(DimSpec):
+    """This is a new dimension created by the op."""
+
+    size: int
+
+    @classmethod
+    def new(cls, size: int) -> DimSpec:
+        return Singleton() if size == 1 else NewDim(size)
+
+
+@dataclass
+class Repeat(DimSpec):
+    """Output dimension is the input dimension repeated n-times."""
+
+    input_dim: DimSpec
+    times: int
+
+    @classmethod
+    def new(cls, dim: DimSpec, times: int) -> DimSpec:
+        if times == 1:
+            return dim
+        elif isinstance(dim, Singleton):
+            # repeating a singleton is the same as broadcasting it
+            return Broadcast(dim, times)
+        else:
+            return Repeat(dim, times)
+
+    def inputs(self) -> Iterable[DimSpec]:
+        return (self.input_dim,)
+
+
+@dataclass
+class Flatten(DimSpec):
+    """
+    Output dimension is a set of input dimensions flattened, keeping
+    right-most adjacent elements adjacent in the output.
+    """
+
+    input_dims: Sequence[DimSpec]
+
+    @classmethod
+    def new(cls, dims: Sequence[DimSpec]) -> DimSpec:
+        if len(dims) == 0:
+            # flattening a scalar leads to a singleton
+            return Singleton()
+        elif len(dims) == 1:
+            # flattening a single dimension is no-op
+            return dims[0]
+        else:
+            return Flatten(dims)
+
+    def inputs(self) -> Iterable[DimSpec]:
+        return self.input_dims
+
+
+@dataclass
+class Split(DimSpec):
+    """
+    This dimension is a member of a decomposition of the input dim.
+    Note that input_dim itself could be a Flattened set of input dims.
+    """
+
+    input_dim: DimSpec
+    group_shape: Shape
+    split_id: int
+
+    @classmethod
+    def new(
+        cls, dim: DimSpec, group_shape: Tuple[int, ...], idx: int
+    ) -> DimSpec:
+        assert len(group_shape) > 0
+        if len(group_shape) == 1:
+            # not really a group, just return the input dim back
+            assert idx == 0
+            return dim
+        elif group_shape[idx] == 1:
+            return Singleton()
+        else:
+            # remove singletons from group
+            # group_mapping = [(new_index, (shape, old_index)) ...]
+            group_mapping = list(
+                enumerate((s, i) for i, s in enumerate(group_shape) if s != 1)
+            )
+            new_group_shape = tuple(m[1][0] for m in group_mapping)
+            new_idx = next(filter(lambda x: x[1][1] == idx, group_mapping))[0]
+            return Split(dim, new_group_shape, new_idx)
+
+    def inputs(self) -> Iterable[DimSpec]:
+        return (self.input_dim,)
+
+
+def dim_pad_left(ndim: int, min_dims: int) -> DimMap:
+    return (Singleton(),) * max(0, min_dims - ndim) + tuple(
+        InputDim(i) for i in range(ndim)
+    )
+
+
+def dim_atleast_3d(ndim: int) -> DimMap:
+    if ndim == 0:
+        return (Singleton(), Singleton(), Singleton())
+    elif ndim == 1:
+        return (Singleton(), InputDim(0), Singleton())
+    elif ndim == 2:
+        return (InputDim(0), InputDim(1), Singleton())
+    else:
+        return tuple(InputDim(i) for i in range(ndim))
+
+
+def expand(input_shape: Shape, shape: Shape) -> DimMap:
+    """Implements broadcast on multiple dimensions"""
+    assert len(shape) >= len(input_shape)
+
+    # 1. create padded input dimensions
+    padded_input = dim_pad_left(len(input_shape), len(shape))
+    # 2. check that input shapes are compatible
+    mapping = []
+    for p, desired_s in zip(padded_input, shape):
+        if isinstance(p, Singleton):
+            actual_s = 1
+            assert desired_s >= 0
+        else:
+            assert isinstance(
+                p, InputDim
+            ), f"DimSpec not supported in expand: {p}"
+            actual_s = input_shape[p.input_dim]
+            assert actual_s == 1 or desired_s == -1 or desired_s == actual_s
+        mapping.append(
+            p
+            if desired_s in (1, -1) or desired_s == actual_s
+            else Broadcast.new(p, desired_s)
+        )
+    return tuple(mapping)
+
+
+def normalize_sizes(sizes: Union[Shape, Tuple[Shape]]) -> Shape:
+    if isinstance(sizes[0], int):
+        return cast(Shape, sizes)
+    elif len(sizes) == 1:
+        return cast(Shape, sizes[0])  # type: ignore[redundant-cast]
+    else:
+        raise RuntimeError("Size must be int... or tuple")
+
+
+def dim_flatten(ndim: int) -> DimMap:
+    if ndim == 0:
+        return (Singleton(),)
+    elif ndim == 1:
+        return (InputDim(0),)
+    else:
+        return (Flatten.new(tuple(InputDim(i) for i in range(ndim))),)
+
+
+def dim_movedim(
+    ndim: int,
+    input: Union[int, Sequence[int]],
+    destination: Union[int, Sequence[int]],
+) -> DimMap:
+    input = normalize_dims(input, ndim)
+    destination = normalize_dims(destination, ndim)
+
+    assert len(input) == len(destination)
+    input_set = set(input)
+    assert len(input_set) == len(input), "Found repeated input dims"
+    assert len(set(destination)) == len(
+        destination
+    ), "Found repeated output dims"
+    assert max(input) < ndim
+    assert max(destination) < ndim
+
+    dest = [-1] * ndim
+    for i, d in zip(input, destination):
+        dest[d] = i
+
+    unused_inputs_iter = iter(i for i in range(ndim) if i not in input_set)
+    for i in range(ndim):
+        if dest[i] == -1:
+            dest[i] = next(unused_inputs_iter)
+
+    return tuple(InputDim(i) for i in dest)
+
+
+def dim_repeat(ndim: int, sizes: Shape) -> DimMap:
+    sizes = normalize_sizes(sizes)
+    assert (
+        len(sizes) >= ndim
+    ), f"Number of dimensions of repeat dims {sizes} can not be smaller than number of dimensions of tensor {ndim}."
+    pad = len(sizes) - ndim
+    return tuple(Repeat.new(Singleton(), s) for s in sizes[:pad]) + tuple(
+        Repeat.new(InputDim(i), s) for i, s in enumerate(sizes[pad:])
+    )
+
+
+def infer_size(total_size: int, sizes: Shape) -> Shape:
+    """
+    One dimension input to view may be "-1".
+    Infer the size of this dimension given the total_size.
+    """
+    infers = [i for i, s in enumerate(sizes) if s == -1]
+    size = prod(sizes)
+    assert len(infers) <= 1, "can only infer one size"
+    if infers:
+        size = -size
+        missing_size = total_size // size
+        assert (
+            total_size % size == 0
+        ), f"size inferred for -1 is not integral {sizes} should have {total_size} elements."
+        return tuple(s if s != -1 else missing_size for s in sizes)
+    assert size == total_size, f"sizes do not match {total_size} vs {size}"
+    return sizes
+
+
+def view_groups(from_size: Shape, to_size: Shape) -> DimMap:
+    """
+    A view or reshape operation can be decomposed into a set of 3 types of smaller operations:
+    1) Forward a dimension from input to output
+    2) Flatten a set of dimensions into a single dimension
+    3) Split one dimension into multiple dimensions
+
+    view_groups identifies these operations and returns, for each output dimension, what
+    is operation was performed in the input dimension. For example:
+
+        view_groups([2, 3, 4], [2, 12]) -> (
+            InputDim(0),
+            Flatten((InputDim(1), InputDim(2)))
+        )
+
+    - ouptut dimension 0 maps to input dimension 0
+    - output dimension 1 maps to a flattened input dimensions 1 and 2
+
+
+        view_groups([2, 3], [3, 2]) -> (
+            Split(Flatten((InputDim(0), InputDim(1))), (3, 2), 0),
+            Split(Flatten((InputDim(0), InputDim(1))), (3, 2), 1),
+        )
+
+    - in the above, input is flattened into a single dimension and then split
+      into two separate dimensions with different sizes from the input.
+    """
+    from_nelem = prod(from_size)
+    to_size = infer_size(from_nelem, normalize_sizes(to_size))
+
+    assert from_nelem == prod(to_size), "Total view shape does not add up"
+
+    from_idx = 0
+    to_idx = 0
+    from_len = len(from_size)
+    to_len = len(to_size)
+
+    result_pp = []
+
+    while from_idx < from_len or to_idx < to_len:
+        from_group_dim, to_group_shape = [], []
+
+        if from_idx >= from_len:
+            f = 1
+        else:
+            f = from_size[from_idx]
+            from_group_dim.append(from_idx)
+            from_idx += 1
+
+        if to_idx >= to_len:
+            t = 1
+        else:
+            t = to_size[to_idx]
+            to_group_shape.append(t)
+            to_idx += 1
+
+        # if any of the groups is singleton, great, we need to backtrack though
+        if f == 1 and t != 1:
+            # produces ([1], [])
+            to_idx -= 1
+            to_group_shape = []
+        elif f != 1 and t == 1:
+            # produces ([], [1])
+            from_idx -= 1
+            from_group_dim = []
+        else:
+            # produces ([1], [1]),  ([2], [2]), ([2,3], [6])
+            while f != t:
+                if f < t:
+                    nf = from_size[from_idx]
+                    from_group_dim.append(from_idx)
+                    from_idx += 1
+                    f *= nf
+                else:
+                    nt = to_size[to_idx]
+                    to_group_shape.append(nt)
+                    to_idx += 1
+                    t *= nt
+
+        if len(to_group_shape) > 0:
+            flattened = Flatten.new(
+                tuple(
+                    InputDim(fi) for fi in from_group_dim if from_size[fi] > 1
+                )
+            )
+            result_pp += [
+                Split.new(flattened, tuple(to_group_shape), i)
+                for i in range(len(to_group_shape))
+            ]
+
+    return tuple(result_pp)
+
+
+def dim_tile(ndim: int, dims: Tuple[int, ...]) -> DimMap:
+    if len(dims) < ndim:
+        dims = (1,) * (ndim - len(dims)) + dims
+    return dim_repeat(ndim, dims)
+
+
+def dim_transpose(ndim: int, dim1: int, dim2: int) -> DimMap:
+    dim1 = normalize_dim(dim1, ndim)
+    dim2 = normalize_dim(dim2, ndim)
+    assert dim1 < ndim
+    assert dim2 < ndim
+    dimmap = list(InputDim(i) for i in range(ndim))
+    swapdim = dimmap[dim1]
+    dimmap[dim1] = dimmap[dim2]
+    dimmap[dim2] = swapdim
+    return tuple(dimmap)
+
+
+def dim_squeeze(shape: Shape, dim: Optional[int] = None) -> DimMap:
+    # FIXME: this is wrong when dim=None and one of the dimensions
+    # equals size of the mesh. For example squeeze(DTensor(tensor(4), Shard[0])) could
+    # end up as squeeze(tensor(1)) if we have 4 devices; this would lead to
+    # removal of a dimension that is not acutally a singleton.
+    return tuple(
+        InputDim(i)
+        for i, s in enumerate(shape)
+        if s > 1 or (dim is not None and i != normalize_dim(dim, len(shape)))
+    )
+
+
+def dim_unsqueeze(ndim: int, dim: int) -> DimMap:
+    dims = tuple(InputDim(i) for i in range(ndim))
+    if dim < 0:
+        dim += ndim + 1
+    return dims[:dim] + (Singleton(),) + dims[dim:]
+
+
+def dim_reduction(
+    ndim: int, dim_or_dims: Optional[Union[int, Sequence[int]]], keepdim: bool
+) -> DimMap:
+    """
+    General fallback for reduction ops where _Partial() does not apply.
+    This will cause incoming tensor to be replicated on the reducing dimensions.
+    """
+    if dim_or_dims is None:
+        dim_or_dims = tuple(range(ndim))
+    if isinstance(dim_or_dims, int):
+        dim_or_dims = (dim_or_dims,)
+    dim_or_dims = tuple(d if d >= 0 else d + ndim for d in dim_or_dims)
+    return tuple(
+        InputDim(i) if i not in dim_or_dims else Singleton()
+        for i in range(ndim)
+        if i not in dim_or_dims or keepdim
+    )
+
+
+@dataclass
+class Op:
+    dim_map: Callable[..., DimMap]
+    shape_argnum: Optional[int] = None
+
+
+ops: Dict[Callable[..., torch.Tensor], Op] = {
+    torch.atleast_1d: Op(dim_map=lambda x: dim_pad_left(x.ndim, 1)),
+    torch.atleast_2d: Op(dim_map=lambda x: dim_pad_left(x.ndim, 2)),
+    torch.atleast_3d: Op(dim_map=lambda x: dim_atleast_3d(x.ndim)),
+    torch.broadcast_to: Op(
+        dim_map=lambda input, shape: expand(input.shape, shape), shape_argnum=1
+    ),
+    Tensor.expand: Op(
+        dim_map=lambda self, *sizes: expand(self.shape, normalize_sizes(sizes)),
+        shape_argnum=1,
+    ),
+    torch.flatten: Op(dim_map=lambda tensor: dim_flatten(tensor.ndim)),
+    torch.movedim: Op(
+        dim_map=lambda input, source, destination: dim_movedim(
+            input.ndim, source, destination
+        )
+    ),
+    torch.permute: Op(
+        dim_map=lambda input, dims: tuple(
+            InputDim(i) for i in normalize_dims(dims, input.ndim)
+        )
+    ),
+    torch.ravel: Op(dim_map=lambda tensor: dim_flatten(tensor.ndim)),
+    Tensor.repeat: Op(
+        dim_map=lambda self, *sizes: dim_repeat(self.ndim, sizes)
+    ),
+    torch.reshape: Op(
+        dim_map=lambda input, shape: view_groups(input.shape, shape),
+        shape_argnum=1,
+    ),
+    torch.squeeze: Op(
+        dim_map=lambda input, dim=None: dim_squeeze(input.shape, dim)
+    ),
+    torch.tile: Op(dim_map=lambda input, dims: dim_tile(input.ndim, dims)),
+    torch.transpose: Op(
+        dim_map=lambda input, dim0, dim1: dim_transpose(input.ndim, dim0, dim1)
+    ),
+    torch.unsqueeze: Op(
+        dim_map=lambda input, dim: dim_unsqueeze(input.ndim, dim)
+    ),
+    Tensor.view: Op(
+        dim_map=lambda input, *shape: view_groups(input.shape, shape),
+        shape_argnum=1,
+    ),
+}
+
+
+def propagate_shape_and_sharding(
+    in_shard: Sequence[Placement],
+    local_in_shape: Shape,
+    rule: DimMap,
+    mesh_sizes: Shape,
+) -> Tuple[Shape, Optional[Sequence[Placement]], torch.Tensor]:
+    """
+    Takes as input the global shape of the tensor, and the input sharding,
+    and produce corresponding output sharding and shape of the output tensor.
+
+    Sharding propagation follows mapped dimensions:
+    - An output dimension that maps directly to an input dimension is sharded equally
+    - An output dimension that is a flattened set of input dimensions can only be
+      sharded if only the leftmost flattened dimension is sharded.
+    - An output dimension that is a split of the input dimension can only be sharded
+      if the leftmost split size is divisible by the mesh dimension
+    """
+    assert len(in_shard) == len(mesh_sizes)
+    sharded_in_dims: Set[int] = set(
+        s.dim for s in in_shard if isinstance(s, Shard)
+    )
+    # for each input dim, for each mesh dim, provides a list of possible shardable dimensions
+    shardable_dims: torch.Tensor = torch.ones(
+        (len(local_in_shape), len(mesh_sizes)), dtype=torch.bool
+    )
+
+    # in case an input dimension disappears (e.g. collapsing, reduction)
+    # we cannot shard in that dimension (we need a replication fall-back rule)
+
+    seen_input_dims: Set[int] = set()
+
+    def collect_used_inputs(cmd: DimSpec) -> None:
+        if isinstance(cmd, InputDim):
+            seen_input_dims.add(cmd.input_dim)
+        for inp in cmd.inputs():
+            collect_used_inputs(inp)
+
+    for cmd in rule:
+        collect_used_inputs(cmd)
+    for dim in range(len(local_in_shape)):
+        shardable_dims[dim, :] = dim in seen_input_dims
+
+    def get_dim_size(cmd: DimSpec) -> Tuple[int, Optional[InputDim]]:
+        if isinstance(cmd, InputDim):
+            seen_input_dims.add(cmd.input_dim)
+            return (
+                local_in_shape[cmd.input_dim],
+                cmd if cmd.input_dim in sharded_in_dims else None,
+            )
+        elif isinstance(cmd, Flatten):
+            for dim in cmd.input_dims[1:]:
+                if isinstance(dim, InputDim):
+                    shardable_dims[dim.input_dim, :] = False
+            dim0 = cmd.input_dims[0]
+            return (
+                prod(get_dim_size(a)[0] for a in cmd.input_dims),
+                dim0
+                if isinstance(dim0, InputDim)
+                and dim0.input_dim in sharded_in_dims
+                else None,
+            )
+        elif isinstance(cmd, Split):
+            _, in_dim = get_dim_size(cmd.input_dim)
+            out_size = cmd.group_shape[cmd.split_id]
+            if cmd.split_id == 0 and in_dim is not None:
+                # we need to check that the input dimension is divisble
+                # by the size of the submesh we're sharding it on
+                # NOTE: it would be possible to shard the same input dimension
+                # on more than one mesh dimension. In that case, the dimension
+                # needs to be divisible by the product of mesh sizes.
+                # In order to keep the problem more tractable, we will not consider
+                # double resharding as a suggestion (e.g. [Shard(0), Shard(0) ])
+                # but we will allow it if that's the input and it's compatible
+
+                # 1. is this dimension shardable on each individual mesh dim?
+                for mesh_dim, mesh_dim_size in enumerate(mesh_sizes):
+                    shardable_dims[in_dim.input_dim, mesh_dim] = (
+                        out_size % mesh_dim_size == 0
+                    )
+
+                # 2. here we special case things like [Shard(0), Shard(0)]
+                submesh_size = 1
+                for size, shard in zip(mesh_sizes, in_shard):
+                    if isinstance(shard, Shard) and shard.dim == in_dim:
+                        submesh_size *= size
+                assert (
+                    out_size % submesh_size == 0
+                ), f"Resulting dimension size {out_size} is not divisible by its mesh dimension {submesh_size}."
+
+            # we will only shard our first component of the split
+            return out_size, in_dim if cmd.split_id == 0 else None
+        elif isinstance(cmd, Singleton):
+            return 1, None
+        elif isinstance(cmd, Broadcast):
+            return cmd.dim_size, None
+        elif isinstance(cmd, NewDim):
+            return cmd.size, None
+        elif isinstance(cmd, Repeat):
+            size, in_dim = get_dim_size(cmd.input_dim)
+            if in_dim is not None:
+                shardable_dims[in_dim.input_dim, :] = False
+            return size * cmd.times, None
+        else:
+            raise RuntimeError(f"cmd not found: {cmd}, in rule: {rule}")
+
+    dim_map = {}
+    out_shape = []
+    for dim, cmd in enumerate(rule):
+        out_size, in_dim = get_dim_size(cmd)
+        out_shape.append(out_size)
+        if in_dim is not None:
+            dim_map[in_dim.input_dim] = dim
+
+    needs_reshard = any(
+        isinstance(placement, Shard)
+        and not shardable_dims[placement.dim][mesh_dim]
+        for mesh_dim, placement in enumerate(in_shard)
+    )
+
+    output_placements = (
+        None
+        if needs_reshard
+        else [
+            Shard(dim_map[s.dim]) if isinstance(s, Shard) else s
+            for s in in_shard
+        ]
+    )
+
+    return (tuple(out_shape), output_placements, shardable_dims)
+
+
+def register_prop_rule_map(
+    aten_op_name: str, local_op_name: Callable[..., torch.Tensor]
+) -> None:
+    spec: Op = ops[local_op_name]
+
+    @register_prop_rule(aten_op_name)
+    def reshape_prop(op_schema: OpSchema) -> OutputSharding:
+        rules = spec.dim_map(*op_schema.args_schema, **op_schema.kwargs_schema)
+        input_dtensor_spec = op_schema.args_schema[0]
+
+        assert isinstance(
+            input_dtensor_spec, DTensorSpec
+        ), "Expected first input to be a DTensorSpec"
+        global_in_shape = input_dtensor_spec.shape
+        assert global_in_shape is not None, "Shape required."
+
+        (
+            global_out_shape,
+            shard_out,
+            shardable_dims,
+        ) = propagate_shape_and_sharding(
+            input_dtensor_spec.placements,
+            tuple(global_in_shape),
+            rules,
+            tuple(input_dtensor_spec.mesh.mesh.shape),
+        )
+
+        if shard_out is not None:
+            # no reshard needed
+            output_dtensor_spec = DTensorSpec(
+                mesh=input_dtensor_spec.mesh,
+                placements=shard_out,
+                shape=torch.Size(global_out_shape),
+                ndim=len(global_out_shape),
+            )
+            local_out_shape = output_dtensor_spec.local_shape
+
+            # We only need the local shape to lower he call into the local op
+            args = op_schema.args_schema
+            shape_argnum = spec.shape_argnum
+            if shape_argnum is not None:
+                op_schema.args_schema = (
+                    args[:shape_argnum]
+                    + (tuple(local_out_shape),)
+                    + args[shape_argnum + 1 :]
+                )
+
+            return OutputSharding(output_spec=output_dtensor_spec)
+
+        else:
+            # TODO: optimize this. we shouldn't simply blindly replicate
+            #       unshardable dims ...
+            # FIXME: this can be wrong for situations where we have
+            #        [Shard(0), Shard(0)]
+            suggested_placements = [
+                p
+                if not isinstance(p, Shard) or shardable_dims[p.dim][mesh_dim]
+                else Replicate()
+                for mesh_dim, p in enumerate(input_dtensor_spec.placements)
+            ]
+            return OutputSharding(
+                output_spec=None,
+                schema_suggestions=[
+                    OpSchema(
+                        func_schema=op_schema.func_schema,
+                        args_schema=(
+                            DTensorSpec(
+                                placements=suggested_placements,
+                                mesh=input_dtensor_spec.mesh,
+                                ndim=input_dtensor_spec.ndim,
+                                shape=input_dtensor_spec.shape,
+                            ),
+                        )
+                        + op_schema.args_schema[1:],
+                        kwargs_schema=op_schema.kwargs_schema,
+                    )
+                ],
+            )
+
+
+register_prop_rule_map("aten.squeeze.default", torch.squeeze)
+register_prop_rule_map("aten.squeeze.dim", torch.squeeze)
+register_prop_rule_map("aten.view.default", Tensor.view)
+register_prop_rule_map("aten.view.SymInt", Tensor.view)
+register_prop_rule_map("aten._unsafe_view.default", Tensor.view)
+register_prop_rule_map("aten.unsqueeze.default", torch.unsqueeze)
+register_prop_rule_map("aten.expand.default", Tensor.expand)
+register_prop_rule_map("aten.permute.default", torch.permute)
+register_prop_rule_map("aten.repeat.default", Tensor.repeat)
+register_prop_rule_map("aten.transpose.int", torch.transpose)

From dd439020c02df8f58f0d8a980657356745c0e7c5 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 15 Nov 2022 22:51:31 +0000
Subject: [PATCH 0950/1922] [dtensor] PART 5: move DTensor basic tests to core
 distributed (#88178)

This PR moves DTensor basic tests to torch.distributed, including
dtensor, device_mesh tests

part of https://github.com/pytorch/pytorch/issues/88838
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88178
Approved by: https://github.com/fduwjj
---
 test/distributed/_tensor/README.md            |  11 +
 test/distributed/_tensor/__init__.py          |   1 +
 test/distributed/_tensor/test_api.py          | 234 ++++++++
 test/distributed/_tensor/test_device_mesh.py  | 518 ++++++++++++++++++
 test/distributed/_tensor/test_dtensor.py      | 359 ++++++++++++
 test/distributed/_tensor/test_redistribute.py | 317 +++++++++++
 .../_internal/distributed/_tensor/__init__.py |   0
 .../distributed/_tensor/common_dtensor.py     | 334 +++++++++++
 8 files changed, 1774 insertions(+)
 create mode 100644 test/distributed/_tensor/README.md
 create mode 100644 test/distributed/_tensor/__init__.py
 create mode 100644 test/distributed/_tensor/test_api.py
 create mode 100644 test/distributed/_tensor/test_device_mesh.py
 create mode 100644 test/distributed/_tensor/test_dtensor.py
 create mode 100644 test/distributed/_tensor/test_redistribute.py
 create mode 100644 torch/testing/_internal/distributed/_tensor/__init__.py
 create mode 100644 torch/testing/_internal/distributed/_tensor/common_dtensor.py

diff --git a/test/distributed/_tensor/README.md b/test/distributed/_tensor/README.md
new file mode 100644
index 0000000000000..6235f9657d5fe
--- /dev/null
+++ b/test/distributed/_tensor/README.md
@@ -0,0 +1,11 @@
+## Run distributed tensor tests:
+
+from root, run (either CPU or GPU)
+
+`pytest test/spmd/tensor/test_tensor.py`
+
+`pytest test/spmd/tensor/test_ddp.py`
+
+run specific test case and print stdout/stderr:
+
+`pytest test/spmd/tensor/test_tensor.py -s -k test_tensor_from_local`
diff --git a/test/distributed/_tensor/__init__.py b/test/distributed/_tensor/__init__.py
new file mode 100644
index 0000000000000..087882b22d1f0
--- /dev/null
+++ b/test/distributed/_tensor/__init__.py
@@ -0,0 +1 @@
+# shut up pylint
diff --git a/test/distributed/_tensor/test_api.py b/test/distributed/_tensor/test_api.py
new file mode 100644
index 0000000000000..a966f30d1cb9f
--- /dev/null
+++ b/test/distributed/_tensor/test_api.py
@@ -0,0 +1,234 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+import torch.nn as nn
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase, with_comms
+from torch.distributed._tensor import (
+    distribute_tensor,
+    distribute_module,
+    DeviceMesh,
+    DTensor,
+    Shard,
+    Replicate,
+)
+
+
+class MyModel(nn.Module):
+    def __init__(self, n_features, n_layers, device):
+        super().__init__()
+        self.seq = nn.Sequential(
+            *[
+                nn.Linear(n_features, n_features, device=device)
+                for _ in range(n_layers)
+            ]
+        )
+
+    def forward(self, x):
+        return self.seq(x)
+
+    def reset_parameters(self):
+        for m in self.seq:
+            m.reset_parameters()
+
+
+class DTensorAPITest(DTensorTestBase):
+    @property
+    def world_size(self) -> int:
+        # hard code world size to 4 as we need to test
+        # at least with 2d mesh
+        return 4
+
+    @with_comms
+    def test_distribute_tensor(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        for requires_grad in [True, False]:
+
+            tensor_to_shard = torch.randn(
+                3 * self.world_size, 3, requires_grad=requires_grad
+            )
+            dist_tensor = distribute_tensor(
+                tensor_to_shard, device_mesh, shard_spec
+            )
+            self.assertEqual(
+                dist_tensor.size(), torch.Size([3 * self.world_size, 3])
+            )
+            local_tensor = dist_tensor.to_local()
+            self.assertEqual(local_tensor.size(), torch.Size([3, 3]))
+            if requires_grad:
+                self.assertTrue(dist_tensor.requires_grad)
+                self.assertTrue(dist_tensor.is_leaf)
+
+    @with_comms
+    def test_distribute_tensor_errors(self):
+        device_mesh = DeviceMesh(
+            self.device_type, torch.arange(self.world_size).reshape(2, 2)
+        )
+        tensor_shape = [3 * self.world_size, 3 * self.world_size]
+        tensor_to_distribute = torch.randn(*tensor_shape)
+
+        with self.assertRaisesRegex(ValueError, "must have the same length"):
+            shard_spec = [Shard(0)]
+            distribute_tensor(tensor_to_distribute, device_mesh, shard_spec)
+
+        spec = [Shard(0), Shard(1)]
+        dtensor = distribute_tensor(tensor_to_distribute, device_mesh, spec)
+
+        with self.assertRaisesRegex(ValueError, "to a different device mesh"):
+            new_mesh = DeviceMesh(
+                self.device_type, torch.arange(self.world_size)
+            )
+            distribute_tensor(dtensor, new_mesh, [Shard(0)])
+
+        with self.assertRaisesRegex(ValueError, "to a different placements"):
+            new_spec = [Shard(0), Replicate()]
+            distribute_tensor(dtensor, device_mesh, new_spec)
+
+    @with_comms
+    def test_distribute_tensor_uneven_sharding(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        input_sizes_and_shard_dims = [
+            ((self.world_size * 3 + 1, 3, 3), 0),
+            ((self.world_size * 3 + 2, 3, 3), 0),
+            ((3, self.world_size * 3 + 1, 3), 1),
+            ((3, self.world_size * 3 + 2, 3), 1),
+            ((3, 3, self.world_size * 3 + 1), 2),
+            ((3, 3, self.world_size * 3 + 2), 2),
+        ]
+        for input_size, shard_dim in input_sizes_and_shard_dims:
+            shard_spec = [Shard(shard_dim)]
+            tensor_to_shard = torch.randn(input_size)
+            splitted_tensor_list = tensor_to_shard.tensor_split(
+                self.world_size, dim=shard_dim
+            )
+            dist_tensor = distribute_tensor(
+                tensor_to_shard, device_mesh, shard_spec
+            )
+            self.assertEqual(dist_tensor.size(), torch.Size(input_size))
+            local_tensor = dist_tensor.to_local()
+            self.assertEqual(local_tensor, splitted_tensor_list[self.rank])
+
+    @with_comms
+    def test_distribute_module(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        # fully shard all linear modules on dim 0
+        module_to_shard = MyModel(
+            5 * self.world_size, 20, device=self.device_type
+        )
+        shard_spec = [Shard(0)]
+
+        def shard_fn(name, module, device_mesh):
+            if isinstance(module, nn.Linear):
+                for name, param in module.named_parameters():
+                    dist_param = torch.nn.Parameter(
+                        distribute_tensor(param, device_mesh, shard_spec)
+                    )
+                    module.register_parameter(name, dist_param)
+
+        sharded_module = distribute_module(
+            module_to_shard, device_mesh, shard_fn
+        )
+        for param in sharded_module.parameters():
+            self.assertIsInstance(param, DTensor)
+            self.assertEqual(param.placements, shard_spec)
+
+        replica_spec = [Replicate()]
+        # fully replicate all modules without passing in partition_fn
+        module_to_replicate = MyModel(5, 20, device=self.device_type)
+        replica_module = distribute_module(module_to_replicate, device_mesh)
+        for param in replica_module.parameters():
+            self.assertIsInstance(param, DTensor)
+            self.assertEqual(param.placements, replica_spec)
+
+        # fully replicate all modules by passing in partition_fn
+        def replicate_fn(name, module, device_mesh):
+            if isinstance(module, nn.Linear):
+                for name, param in module.named_parameters():
+                    dist_param = torch.nn.Parameter(
+                        distribute_tensor(param, device_mesh, replica_spec)
+                    )
+                    module.register_parameter(name, dist_param)
+
+        module_to_replicate = MyModel(5, 20, device=self.device_type)
+        replica_module = distribute_module(
+            module_to_replicate, device_mesh, replicate_fn
+        )
+        for param in replica_module.parameters():
+            self.assertIsInstance(param, DTensor)
+            self.assertEqual(param.placements, replica_spec)
+
+        # only shard part of module, and rest of module should be replicate
+        def shard_fn(name, module, device_mesh):
+            if isinstance(module, nn.Linear) and (
+                name == "seq.0" or name == "seq.8"
+            ):
+                for name, param in module.named_parameters():
+                    dist_param = torch.nn.Parameter(
+                        distribute_tensor(param, device_mesh, shard_spec)
+                    )
+                    module.register_parameter(name, dist_param)
+
+        module_to_distribute = MyModel(
+            5 * self.world_size, 20, device=self.device_type
+        )
+        dist_module = distribute_module(
+            module_to_distribute, device_mesh, shard_fn
+        )
+        for name, param in dist_module.named_parameters():
+            self.assertIsInstance(param, DTensor)
+            if name.startswith("seq.0") or name.startswith("seq.8"):
+                self.assertEqual(param.placements, shard_spec)
+            else:
+                self.assertEqual(param.placements, replica_spec)
+
+    @with_comms
+    def test_distribute_module_input_fn_output_fn(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+
+        # fully replicate all linear modules
+        module_to_replicate = MyModel(20, 1, device=self.device_type)
+
+        # mark input sharding on dim 0
+        def input_fn(inputs, device_mesh):
+            return DTensor.from_local(inputs[0], device_mesh, [Shard(0)])
+
+        def output_fn(outputs, device_mesh):
+            assert isinstance(outputs, DTensor)
+            return outputs.to_local()
+
+        replica_module = distribute_module(
+            module_to_replicate,
+            device_mesh,
+            input_fn=input_fn,
+            output_fn=output_fn,
+        )
+
+        input_tensor = torch.randn(5, 20, device=self.device_type)
+        local_out = replica_module(input_tensor)
+        self.assertIsInstance(local_out, torch.Tensor)
+        self.assertNotIsInstance(local_out, DTensor)
+
+        # full replicate (even on inputs)
+        model = MyModel(10, 10, device=self.device_type)
+
+        def replicate_input_fn(inputs, device_mesh):
+            return DTensor.from_local(inputs[0], device_mesh, [Replicate()])
+
+        replica_model = distribute_module(
+            model,
+            device_mesh,
+            input_fn=replicate_input_fn,
+        )
+        input = torch.randn(10, 10, requires_grad=True)
+        output = replica_model(input)
+        output.sum().backward()
+        param_grad = list(replica_model.parameters())[0].grad
+        self.assertTrue(isinstance(param_grad, DTensor))
+        self.assertTrue(isinstance(param_grad.placements[0], Replicate))
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_device_mesh.py b/test/distributed/_tensor/test_device_mesh.py
new file mode 100644
index 0000000000000..7088f33f42dbe
--- /dev/null
+++ b/test/distributed/_tensor/test_device_mesh.py
@@ -0,0 +1,518 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+
+from torch.distributed.distributed_c10d import (
+    ProcessGroup,
+    new_group,
+    get_global_rank,
+    get_world_size,
+)
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+from torch.distributed._tensor.device_mesh import DeviceMesh
+from torch.distributed._tensor.placement_types import Shard
+
+
+class DeviceMeshTest(DTensorTestBase):
+    @property
+    def world_size(self):
+        return 8
+
+    @with_comms
+    def test_device_mesh_2d(self):
+        mesh_tensor = torch.arange(4).reshape(2, 2)
+        # construct a cuda device mesh
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+
+        expected_ranks_by_dim = [[[0, 2], [1, 3]], [[0, 1], [2, 3]]]
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            self.assertTrue(dim < 2)
+            dim_ranks = expected_ranks_by_dim[dim]
+
+            dim_group_size = get_world_size(dim_group)
+            self.assertIsInstance(dim_group, ProcessGroup)
+            self.assertEqual(dim_group_size, 2)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            current_rank_expected_group_ranks = (
+                dim_ranks[0] if self.rank in dim_ranks[0] else dim_ranks[1]
+            )
+            self.assertEqual(global_ranks, current_rank_expected_group_ranks)
+
+    @with_comms
+    def test_device_mesh_2d_from_dim_groups(self):
+        # construct a two dimension subgroups
+        dim_groups = []
+        expected_ranks_by_dim = [[[0, 2], [1, 3]], [[0, 1], [2, 3]]]
+        for dim_group_ranks in expected_ranks_by_dim:
+            for subgroup_ranks in dim_group_ranks:
+                subgroup = new_group(ranks=subgroup_ranks)
+                if self.rank in subgroup_ranks:
+                    dim_groups.append(subgroup)
+
+        # construct a device mesh from the subgroups
+        mesh = DeviceMesh(
+            self.device_type, [[0, 1], [2, 3]], dim_groups=dim_groups
+        )
+
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            self.assertTrue(dim < 2)
+            dim_ranks = expected_ranks_by_dim[dim]
+
+            dim_group_size = get_world_size(dim_group)
+            self.assertIsInstance(dim_group, ProcessGroup)
+            self.assertEqual(dim_group_size, 2)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            current_rank_expected_group_ranks = (
+                dim_ranks[0] if self.rank in dim_ranks[0] else dim_ranks[1]
+            )
+            self.assertEqual(global_ranks, current_rank_expected_group_ranks)
+
+    @with_comms
+    def test_device_mesh_dim_groups_error(self):
+        # construct a two dimension subgroups
+        dim_groups = []
+        expected_ranks_by_dim = [[[0, 2], [1, 3]], [[0, 1], [2, 3]]]
+        for dim_group_ranks in expected_ranks_by_dim:
+            for subgroup_ranks in dim_group_ranks:
+                subgroup = new_group(ranks=subgroup_ranks)
+                if self.rank in subgroup_ranks:
+                    dim_groups.append(subgroup)
+
+        if len(dim_groups) > 0:
+            # dim_groups is not a list
+            self.assertRaises(
+                RuntimeError,
+                DeviceMesh,
+                self.device_type,
+                [[0, 1], [2, 3]],
+                dim_groups=dim_groups[0],
+            )
+
+            # dim_groups is a list, but not a list of ProcessGroup
+            self.assertRaises(
+                RuntimeError,
+                DeviceMesh,
+                self.device_type,
+                [[0, 1], [2, 3]],
+                dim_groups=[dim_groups[0], "dummy"],
+            )
+
+            # dim_groups has incorrect length
+            self.assertRaises(
+                RuntimeError,
+                DeviceMesh,
+                self.device_type,
+                [[0, 1], [2, 3]],
+                dim_groups=[dim_groups[0]],
+            )
+
+    @with_comms
+    def test_device_mesh_nd(self):
+        # construct a cuda device mesh
+        mesh_tensor = torch.arange(8).reshape(2, 2, 2)
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            self.assertTrue(dim < mesh_tensor.ndim)
+            dim_ranks = mesh_tensor.swapdims(-1, dim).reshape(-1, 2)
+            # print(dim_ranks)
+            # dim_ranks = expected_ranks_by_dim[dim]
+
+            dim_group_size = get_world_size(dim_group)
+            self.assertIsInstance(dim_group, ProcessGroup)
+            self.assertEqual(dim_group_size, 2)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            for ranks in dim_ranks:
+                if self.rank in ranks:
+                    self.assertEqual(global_ranks, ranks.tolist())
+
+
+class DeviceMeshCollectiveTest(DTensorTestBase):
+    @property
+    def world_size(self):
+        return 8
+
+    @with_comms
+    def test_all_reduce_1d(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
+        mesh.all_reduce(local_tensor, mesh_dim=0)
+        res_num = ((0 + self.world_size - 1) * self.world_size) / 2
+        self.assertEqual(local_tensor, torch.ones(3, 3) * res_num)
+
+    @with_comms
+    def test_broadcast_1d(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
+        mesh.broadcast(local_tensor, mesh_dim=0)
+        self.assertEqual(local_tensor, torch.zeros(3, 3))
+
+    @with_comms
+    def test_scatter_1d(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        scatter_tensor_shape = [3, 3, 3]
+        for scatter_dim in range(len(scatter_tensor_shape)):
+            shard_placement = Shard(scatter_dim)
+            scatter_tensor_shape[scatter_dim] *= self.world_size
+            # make the random seed same across rank
+            torch.manual_seed(0)
+            global_tensor = torch.randn(
+                scatter_tensor_shape, device=self.device_type
+            )
+            splitted_list, _ = shard_placement._split_tensor(
+                global_tensor, mesh.size(), with_padding=True, contiguous=True
+            )
+            recv_tensor = torch.empty_like(splitted_list[mesh.get_rank()])
+            # scatter on dim > 0 would generate non-contiguous tensor, verify that works
+            mesh.scatter(recv_tensor, splitted_list, mesh_dim=0)
+            self.assertEqual(recv_tensor, splitted_list[mesh.get_rank()])
+
+    @with_comms
+    def test_scatter_uneven(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        my_rank = device_mesh.get_rank()
+        tensor_to_split = torch.randn(
+            device_mesh.size() + 3, device_mesh.size() + 1
+        )
+
+        for shard_dim in range(tensor_to_split.ndim):
+            shard_placement = Shard(shard_dim)
+            tensor_to_scatter = tensor_to_split.clone()
+            tensor_splitted_list = tensor_to_split.tensor_split(
+                device_mesh.size(), dim=shard_dim
+            )
+            padded_tensor_list, pad_idx = shard_placement._split_tensor(
+                tensor_to_scatter,
+                device_mesh.size(),
+                with_padding=True,
+                contiguous=True,
+            )
+
+            scattered_tensor = torch.empty_like(padded_tensor_list[my_rank])
+            device_mesh.scatter(
+                scattered_tensor, padded_tensor_list, mesh_dim=0
+            )
+            # unpad scattered_tensor
+            if pad_idx != 0 and my_rank >= pad_idx:
+                scattered_tensor = shard_placement._unpad_tensor(
+                    scattered_tensor
+                )
+
+            self.assertEqual(
+                scattered_tensor.size(), tensor_splitted_list[my_rank].size()
+            )
+            self.assertEqual(scattered_tensor, tensor_splitted_list[my_rank])
+
+    @with_comms
+    def test_all_gather_1d(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        dims_to_gather = [0, 1]
+        for dim in dims_to_gather:
+            output_size = [3, 3]
+            output_size[dim] *= self.world_size
+            # each rank have its own tensor, all_gather gives a list
+            local_tensor = torch.ones(3, 3, device=self.device_type)
+            gathered_list = []
+            for _ in range(self.world_size):
+                gathered_list.append(torch.zeros_like(local_tensor))
+            mesh.all_gather(gathered_list, local_tensor, mesh_dim=0)
+            gathered_tensor = torch.cat(gathered_list, dim=dim)
+            self.assertEqual(gathered_tensor, torch.ones(output_size))
+
+    @with_comms
+    def test_all_gather_uneven(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        my_rank = device_mesh.get_rank()
+        tensor_to_split = torch.ones(
+            device_mesh.size() + 3,
+            device_mesh.size() + 1,
+            device=self.device_type,
+        )
+
+        for shard_dim in range(tensor_to_split.ndim):
+            shard_placement = Shard(shard_dim)
+            tensor_padded_list, pad_idx = shard_placement._split_tensor(
+                tensor_to_split,
+                device_mesh.size(),
+                with_padding=True,
+                contiguous=True,
+            )
+            local_tensor = tensor_padded_list[my_rank]
+            gathered_list = []
+            for _ in range(device_mesh.size()):
+                gathered_list.append(torch.empty_like(local_tensor))
+
+            device_mesh.all_gather(
+                gathered_list,
+                local_tensor,
+                mesh_dim=0,
+            )
+            if pad_idx != 0:
+                gathered_list = [
+                    shard_placement._unpad_tensor(gathered_tensor)
+                    if i >= pad_idx
+                    else gathered_tensor
+                    for i, gathered_tensor in enumerate(gathered_list)
+                ]
+            all_gathered_tensor = torch.cat(gathered_list, dim=shard_dim)
+            self.assertEqual(all_gathered_tensor.size(), tensor_to_split.size())
+            self.assertEqual(all_gathered_tensor, tensor_to_split)
+
+    @with_comms
+    def test_reduce_scatter_1d(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        dims_to_scatter = [0, 1]
+        for dim in dims_to_scatter:
+            input_size = [3, 3]
+            scattered_tensor = torch.empty(input_size, device=self.device_type)
+            input_size[dim] *= self.world_size
+
+            input_rs_list = (
+                torch.ones(input_size, device=self.device_type) * self.rank
+            ).tensor_split(self.world_size, dim=dim)
+            res_num = ((0 + self.world_size - 1) * self.world_size) / 2
+            mesh.reduce_scatter(scattered_tensor, input_rs_list, mesh_dim=0)
+            self.assertEqual(scattered_tensor, torch.ones(3, 3) * res_num)
+
+    @with_comms
+    def test_reduce_scatter_uneven(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        my_rank = device_mesh.get_rank()
+        tensor_to_split = (
+            torch.ones(
+                device_mesh.size() + 3,
+                device_mesh.size() + 1,
+                device=self.device_type,
+            )
+            * self.rank
+        )
+
+        for shard_dim in range(tensor_to_split.ndim):
+            shard_placement = Shard(shard_dim)
+            tensor_to_scatter = tensor_to_split.clone()
+            tensor_splitted_list = tensor_to_split.tensor_split(
+                device_mesh.size(), dim=shard_dim
+            )
+            padded_tensor_list, pad_idx = shard_placement._split_tensor(
+                tensor_to_scatter,
+                device_mesh.size(),
+                with_padding=True,
+                contiguous=True,
+            )
+
+            res_num = ((0 + self.world_size - 1) * self.world_size) / 2
+            scattered_tensor = torch.empty_like(padded_tensor_list[my_rank])
+            device_mesh.reduce_scatter(
+                scattered_tensor, padded_tensor_list, mesh_dim=0
+            )
+            # unpad scattered_tensor
+            if pad_idx != 0 and my_rank >= pad_idx:
+                scattered_tensor = shard_placement._unpad_tensor(
+                    scattered_tensor
+                )
+
+            self.assertEqual(
+                scattered_tensor.size(), tensor_splitted_list[my_rank].size()
+            )
+            self.assertEqual(
+                scattered_tensor,
+                torch.ones_like(tensor_splitted_list[my_rank]) * res_num,
+            )
+
+    @with_comms
+    def test_all_gather_nd(self):
+        mesh_tensor = torch.arange(8).reshape(2, 2, 2)
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+        local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
+
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            gathered_tensor_list = list(
+                torch.empty(
+                    (dim_group_size * 3, 3), device=self.device_type
+                ).tensor_split(dim_group_size, dim=0)
+            )
+            mesh.all_gather(gathered_tensor_list, local_tensor, mesh_dim=dim)
+            gathered_tensor = torch.cat(gathered_tensor_list)
+            exp_tensor = torch.ones(3 * dim_group_size, 3)
+            for i in range(len(global_ranks)):
+                exp_tensor[i * 3 : (i + 1) * 3] = (
+                    torch.ones(3, 3) * global_ranks[i]
+                )
+            self.assertEqual(gathered_tensor, exp_tensor)
+
+    @with_comms
+    def test_reduce_scatter_nd(self):
+        mesh_tensor = torch.arange(8).reshape(2, 2, 2)
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            dim_group_size = get_world_size(dim_group)
+            local_rs_list = (
+                torch.ones(dim_group_size * 3, 3, device=self.device_type)
+                * self.rank
+            ).tensor_split(dim_group_size, dim=0)
+            scattered_tensor = torch.empty_like(
+                local_rs_list[mesh.get_coordinate_on_dim(dim)],
+                device=self.device_type,
+            )
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            mesh.reduce_scatter(scattered_tensor, local_rs_list, mesh_dim=dim)
+            res_num = torch.sum(torch.tensor(global_ranks))
+            self.assertEqual(scattered_tensor, torch.ones(3, 3) * res_num)
+
+    @with_comms
+    def test_all_reduce_nd(self):
+        mesh_tensor = torch.arange(8).reshape(2, 2, 2)
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+        local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
+
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            cloned_local_tensor = local_tensor.clone()
+            mesh.all_reduce(cloned_local_tensor, mesh_dim=dim)
+            res_num = sum(global_ranks)
+            self.assertEqual(cloned_local_tensor, torch.ones(3, 3) * res_num)
+
+    @with_comms
+    def test_broadcast_nd(self):
+        mesh_tensor = torch.arange(8).reshape(2, 2, 2)
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+        local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
+
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            cloned_local_tensor = local_tensor.clone()
+            mesh.broadcast(cloned_local_tensor, mesh_dim=dim)
+            res_num = global_ranks[0]
+            self.assertEqual(cloned_local_tensor, torch.ones(3, 3) * res_num)
+
+    @with_comms
+    def test_scatter_nd(self):
+        mesh_tensor = torch.arange(8).reshape(2, 2, 2)
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            scattered_tensors = [
+                torch.ones(3, 3, device=self.device_type) * global_rank
+                for global_rank in global_ranks
+            ]
+            received_tensor = torch.empty_like(
+                scattered_tensors[mesh.get_coordinate_on_dim(dim)]
+            )
+            mesh.scatter(received_tensor, scattered_tensors, mesh_dim=dim)
+            self.assertEqual(received_tensor, torch.ones(3, 3) * self.rank)
+
+    @with_comms
+    def test_all_to_all_1d(self):
+        # transpose on a 2D tensor distributed over N nodes:
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        tensor_shape = [3, 3]
+        input_tensor_list = [
+            torch.ones(*tensor_shape, device=self.device_type)
+            * (rank + self.rank * self.world_size)
+            for rank in range(self.world_size)
+        ]
+        expected_tensor_list = [
+            torch.ones(tensor_shape, device=self.device_type)
+            * (self.rank + rank * self.world_size)  # i.e. transpose
+            for rank in range(self.world_size)
+        ]
+        for scatter_dim in range(len(tensor_shape)):
+            output_tensor_list = [
+                torch.empty_like(input_tensor_list[idx])
+                for idx in range(len(input_tensor_list))
+            ]
+            # scatter on dim > 0 would generate non-contiguous tensor, verify that works
+            mesh.all_to_all(output_tensor_list, input_tensor_list, mesh_dim=0)
+            output_tensor = torch.cat(output_tensor_list, dim=scatter_dim)
+            expected_tensor = torch.cat(expected_tensor_list, dim=scatter_dim)
+
+            self.assertEqual(output_tensor, expected_tensor)
+
+    @with_comms
+    def test_all_to_all_nd(self):
+        mesh_tensor = torch.arange(8).reshape(2, 2, 2)
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+        tensor_shape = [3, 3, 3]
+        # check all dim groups
+        dim_to_subgroups = mesh.get_dim_groups()
+        for dim, dim_group in enumerate(dim_to_subgroups):
+            my_coordinate = mesh.get_coordinate_on_dim(dim)
+            dim_group_size = get_world_size(dim_group)
+            global_ranks = [
+                get_global_rank(dim_group, i) for i in range(dim_group_size)
+            ]
+            input_tensor_list = [
+                torch.ones(*tensor_shape, device=self.device_type)
+                * (i + self.rank * dim_group_size)
+                for i in range(dim_group_size)
+            ]
+            expected_tensor_list = [
+                torch.ones(*tensor_shape, device=self.device_type)
+                * (
+                    my_coordinate + global_rank * dim_group_size
+                )  # i.e. transpose
+                for global_rank in global_ranks
+            ]
+            for scatter_dim in range(len(tensor_shape)):
+                # input_tensor = torch.cat(input_tensor_list, dim=scatter_dim)
+                output_tensor_list = [
+                    torch.empty_like(input_tensor_list[idx])
+                    for idx in range(len(input_tensor_list))
+                ]
+                # scatter on dim > 0 would generate non-contiguous tensor, verify that works
+                mesh.all_to_all(
+                    output_tensor_list, input_tensor_list, mesh_dim=dim
+                )
+                output_tensor = torch.cat(output_tensor_list, dim=scatter_dim)
+                expected_tensor = torch.cat(
+                    expected_tensor_list, dim=scatter_dim
+                )
+                self.assertEqual(output_tensor, expected_tensor)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_dtensor.py b/test/distributed/_tensor/test_dtensor.py
new file mode 100644
index 0000000000000..51ce1bd4ec583
--- /dev/null
+++ b/test/distributed/_tensor/test_dtensor.py
@@ -0,0 +1,359 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+from torch.distributed._tensor import DeviceMesh, DTensor, distribute_tensor
+from torch.distributed._tensor.placement_types import _Partial, Replicate, Shard
+
+
+class DTensorTest(DTensorTestBase):
+    # @with_comms
+    # def test_tensor_constructor(self):
+    #     import torch.distributed._tensor as dist_tensor
+    #     shard_spec = PlacementSpec(device_mesh, strategies=[Shard(0)])
+    #     empty_tensor = dist_tensor.empty((12, 10), placement_spec=shard_spec)
+    #     zero_tensor = dist_tensor.zeros((12, 10), placement_spec=shard_spec)
+    #     one_tensor = dist_tensor.ones((12, 10), placement_spec=shard_spec)
+
+    #     zero_cuda_tensor = dist_tensor.zeros((12, 10), device="cuda", placement_spec=shard_spec)
+
+    #     dist_tensor.empty_like(empty_tensor)
+    #     dist_tensor.zero_like(empty_tensor)
+    #     dist_tensor.one_like(empty_tensor)
+
+    @with_comms
+    def test_dtensor_constructor(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+        local_tensor = torch.randn(3, 3, requires_grad=True)
+        dist_tensor_shape = torch.Size([self.world_size * 3, 3])
+        dist_tensor = DTensor(
+            local_tensor,
+            device_mesh,
+            shard_spec,
+            size=dist_tensor_shape,
+            requires_grad=True,
+        )
+        self.assertEqual(
+            dist_tensor.size(), torch.Size((self.world_size * 3, 3))
+        )
+
+        with self.assertWarnsRegex(UserWarning, "To construct"):
+            DTensor(
+                local_tensor, device_mesh, shard_spec, size=dist_tensor_shape
+            )
+
+        local_tensor = torch.randn(3, 3, requires_grad=False)
+        with self.assertWarnsRegex(UserWarning, "To construct"):
+            dist_tensor = DTensor(
+                local_tensor,
+                device_mesh,
+                shard_spec,
+                size=dist_tensor_shape,
+                requires_grad=True,
+            )
+
+    @with_comms
+    def test_dtensor_stride(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard0_spec = [Shard(0)]
+        local_tensor = torch.randn(4, 8)
+        global_shape = torch.Size([self.world_size * 4, 8])
+        dist_tensor = DTensor(
+            local_tensor, device_mesh, shard0_spec, size=global_shape
+        )
+        # won't affect stride
+        self.assertEqual(dist_tensor.stride(), (8, 1))
+
+        shard1_spec = [Shard(1)]
+        local_tensor = torch.randn(8, 4)
+        global_shape = torch.Size([8, self.world_size * 4])
+        dist_tensor = DTensor(
+            local_tensor, device_mesh, shard1_spec, size=global_shape
+        )
+        # will affect stride after DT initialized
+        self.assertEqual(dist_tensor.stride(), (4 * self.world_size, 1))
+
+        # if initialized from a transposed mat
+        local_tensor = torch.randn(8, 4, 8)
+        local_tensor_t = local_tensor.permute(1, 2, 0)
+        global_shape = torch.Size([4, self.world_size * 8, 8])
+        self.assertEqual(local_tensor_t.stride(), (8, 1, 32))
+        dist_tensor = DTensor(
+            local_tensor_t, device_mesh, shard1_spec, size=global_shape
+        )
+        global_stride = (8 * self.world_size, 1, 32 * self.world_size)
+        self.assertEqual(dist_tensor.stride(), global_stride)
+
+    @with_comms
+    def test_from_local(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+        local_tensor = torch.randn(3, 3)
+        sharded_tensor = DTensor.from_local(
+            local_tensor, device_mesh, shard_spec
+        )
+        self.assertEqual(
+            sharded_tensor.size(), torch.Size([self.world_size * 3, 3])
+        )
+
+        replica_spec = [Replicate()]
+        ddp_tensor = DTensor.from_local(local_tensor, device_mesh, replica_spec)
+        self.assertEqual(ddp_tensor.size(), local_tensor.size())
+
+        partial_spec = [_Partial()]
+        partial_tensor = DTensor.from_local(
+            local_tensor, device_mesh, partial_spec
+        )
+        self.assertEqual(partial_tensor.size(), local_tensor.size())
+
+        # test dist tensor works with torch.Tensor during backwards
+        local_tensor_with_grad = torch.randn(3, 3, requires_grad=True)
+        # do some operations on local tensor
+        local_tensor_temp = local_tensor_with_grad * 3
+        # create the dist tensor with non leaf local tensor, dist tensor created
+        # should also be non leaf node
+        dist_tensor = DTensor.from_local(
+            local_tensor_temp, device_mesh, shard_spec
+        )
+        self.assertFalse(dist_tensor.is_leaf)
+        # do some random operations on dist tensor
+        output = dist_tensor * 3
+        self.assertIsInstance(output, DTensor)
+        # trigger .backward() on dist tensor directly
+        local_grad = torch.ones(3, 3)
+        grad_output = DTensor.from_local(local_grad, device_mesh, shard_spec)
+        # run backward directly on dist tensor
+        output.backward(grad_output)
+        # check it gradients flow back to original torch.Tensor
+        self.assertIsNotNone(local_tensor_with_grad.grad)
+        expected_grad = torch.ones(3, 3) * 9
+        self.assertEqual(local_tensor_with_grad.grad, expected_grad)
+
+    @with_comms
+    def test_to_local(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+        dist_tensor_shape = torch.Size([self.world_size * 3, 3])
+        local_tensor_with_grad = torch.randn(
+            3, 3, device=self.device_type, requires_grad=True
+        )
+
+        sharded_tensor = DTensor(
+            local_tensor_with_grad,
+            device_mesh,
+            shard_spec,
+            size=dist_tensor_shape,
+            requires_grad=True,
+        )
+        self.assertEqual(sharded_tensor.size(), dist_tensor_shape)
+        self.assertEqual(sharded_tensor.to_local(), local_tensor_with_grad)
+
+        # test dist tensor works with torch.Tensor during backwards
+        # dist tensor created is a leaf node, do some operation on dist tensor
+        temp_st = sharded_tensor * 3
+
+        # do some operation on local tensor of the dist tensor
+        new_tensor_with_grad = torch.randn(
+            3, 3, device=self.device_type, requires_grad=True
+        )
+        res = temp_st.to_local() + new_tensor_with_grad
+        # call backward directly on torch.Tensor, and see if it works by
+        # propagating through dist tensor
+        res.sum().backward()
+        self.assertIsNotNone(sharded_tensor.grad)
+
+        self.assertEqual(sharded_tensor.grad.to_local(), torch.ones(3, 3) * 3)
+
+    @with_comms
+    def test_from_local_then_to_local(self):
+        # this test ensure end to end from torch.Tensor -> dist tensor -> torch.Tensor works
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        # step 1. construct from construct local tensor
+        local_tensor_with_grad = torch.randn(
+            3, 3, device=self.device_type, requires_grad=True
+        )
+        # do some operations on local tensor
+        local_tensor_temp = local_tensor_with_grad + 8
+        # step 2. create the dist tensor with non leaf local tensor, dist tensor
+        # created should also be non leaf node
+        dist_tensor = DTensor.from_local(
+            local_tensor_temp, device_mesh, shard_spec
+        )
+        self.assertFalse(dist_tensor.is_leaf)
+        # do some random operations on dist tensor
+        output = dist_tensor * 6
+        self.assertIsInstance(output, DTensor)
+
+        # step 3. do some operation on local tensor of the dist tensor
+        new_tensor_with_grad = torch.randn(
+            3, 3, device=self.device_type, requires_grad=True
+        )
+        res = output.to_local() + new_tensor_with_grad
+        # call backward directly on torch.Tensor, and see if it works by
+        # propagating all the way back to the original torch.Tensor
+        res.sum().backward()
+        self.assertIsNotNone(local_tensor_with_grad.grad)
+
+        expected_grad = torch.ones(3, 3) * 6
+        self.assertEqual(local_tensor_with_grad.grad, expected_grad)
+
+    @with_comms
+    def test_dtensor_spec_read_only_after_set(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+        local_tensor = torch.randn(3, 3)
+        sharded_tensor = DTensor.from_local(
+            local_tensor, device_mesh, shard_spec
+        )
+
+        # modify shard_spec, and dist_tensor's spec should not be changed
+        shard_spec[0] = Replicate()
+        self.assertTrue(sharded_tensor.placements is not shard_spec)
+        self.assertNotEqual(sharded_tensor.placements, shard_spec)
+
+    @with_comms
+    def test_dtensor_properties(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+        local_tensor = torch.randn(3, 3)
+        sharded_tensor = DTensor.from_local(
+            local_tensor, device_mesh, shard_spec
+        )
+        self.assertEqual(sharded_tensor.device.type, self.device_type)
+
+
+class DTensorMeshTest(DTensorTestBase):
+    @property
+    def world_size(self):
+        return 8
+
+    @with_comms
+    def test_dtensor_device_mesh_device_conversion(self):
+        # construct a cuda device mesh
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        # construct from a cpu local tensor with cuda device mesh
+        # should automatically convert the dist tensor to cuda
+        shard_spec = [Shard(0)]
+        local_tensor = torch.randn(3, 3)
+        dist_tensor = DTensor.from_local(local_tensor, mesh, shard_spec)
+        self.assertEqual(dist_tensor.device.type, self.device_type)
+        self.assertEqual(dist_tensor.to_local().device.type, self.device_type)
+
+    @with_comms
+    def test_dtensor_api_device_mesh_context_manager(self):
+        with DeviceMesh(self.device_type, list(range(self.world_size))) as mesh:
+            shard_spec = [Shard(0)]
+            local_tensor = torch.randn(3, 3)
+            sharded_tensor = DTensor.from_local(
+                local_tensor, device_mesh=mesh, placements=shard_spec
+            )
+
+        with DeviceMesh(self.device_type, list(range(self.world_size))):
+            shard_spec = [Shard(0)]
+            local_tensor = torch.randn(3, 3)
+            sharded_tensor = DTensor.from_local(
+                local_tensor, placements=shard_spec
+            )
+            replica_spec = [Replicate()]
+            replica_tensor = sharded_tensor.redistribute(
+                placements=replica_spec
+            )
+            self.assertEqual(
+                replica_tensor.size(), torch.Size([3 * self.world_size, 3])
+            )
+
+    @with_comms
+    def test_dtensor_2d_mesh(self):
+        mesh_tensor = torch.arange(self.world_size).reshape(2, 4)
+        # construct a cuda device mesh
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+
+        # construct a dist tensor on 2d device mesh and test if works
+        shard_spec = [Shard(0), Shard(1)]
+        local_tensor = torch.randn(3, 3)
+        dist_tensor = DTensor.from_local(local_tensor, mesh, shard_spec)
+        self.assertEqual(
+            dist_tensor.size(), torch.Size([3 * mesh.size(0), 3 * mesh.size(1)])
+        )
+        self.assertEqual(dist_tensor.device.type, self.device_type)
+        self.assertEqual(dist_tensor.to_local().device.type, self.device_type)
+
+        # if shard on the same tensor dimension
+        # we should correctly construct the global tensor size
+        shard_same_dim_spec = [Shard(0), Shard(0)]
+        local_tensor = torch.randn(3, 3)
+        dist_tensor = DTensor.from_local(
+            local_tensor, mesh, shard_same_dim_spec
+        )
+        self.assertEqual(
+            dist_tensor.size(), torch.Size([3 * self.world_size, 3])
+        )
+
+    @with_comms
+    def test_device_mesh_nd(self):
+        # construct a cuda device mesh
+        mesh_tensor = torch.arange(self.world_size).reshape(2, 2, 2)
+        mesh = DeviceMesh(self.device_type, mesh_tensor)
+        # construct a dist tensor on 3d device mesh and test if works
+        shard_spec = [Shard(0), Shard(1), Shard(2)]
+        local_tensor = torch.randn(3, 3, 3)
+        dist_tensor = DTensor.from_local(local_tensor, mesh, shard_spec)
+        self.assertEqual(dist_tensor.size(), torch.Size([6, 6, 6]))
+        self.assertEqual(dist_tensor.device.type, self.device_type)
+        self.assertEqual(dist_tensor.to_local().device.type, self.device_type)
+
+        # construct a dist tensor on 3d device mesh with some shards on same dim
+        shard_spec = [Shard(0), Shard(0), Shard(2)]
+        local_tensor = torch.randn(3, 3, 3)
+        dist_tensor = DTensor.from_local(local_tensor, mesh, shard_spec)
+        self.assertEqual(dist_tensor.size(), torch.Size([12, 3, 6]))
+        self.assertEqual(dist_tensor.device.type, self.device_type)
+        self.assertEqual(dist_tensor.to_local().device.type, self.device_type)
+
+    @with_comms
+    def test_dtensor_spec_local_shard_offset(self):
+        device_mesh = DeviceMesh(
+            self.device_type, torch.arange(self.world_size).reshape(2, 4)
+        )
+        tensor_shape = (3 * self.world_size, 3 * self.world_size)
+        # sharding specs and its corresponding local shard offsets
+        shard_spec_and_offsets = [
+            (
+                [Shard(0), Replicate()],
+                (3 * (self.world_size // 2) * (self.rank // 4), 0),
+            ),
+            (
+                [Shard(1), Replicate()],
+                (0, 3 * (self.world_size // 2) * (self.rank // 4)),
+            ),
+            (
+                [Replicate(), Shard(0)],
+                (3 * (self.world_size // 4) * (self.rank % 4), 0),
+            ),
+            (
+                [Replicate(), Shard(1)],
+                (0, 3 * (self.world_size // 4) * (self.rank % 4)),
+            ),
+        ]
+
+        # loop through all sharding specs and check local shard offsets
+        logical_tensor = torch.randn(tensor_shape)
+        for shard_spec, expected_shard_offsets in shard_spec_and_offsets:
+            dtensor = distribute_tensor(logical_tensor, device_mesh, shard_spec)
+            self.assertEqual(
+                expected_shard_offsets, dtensor._spec.local_offsets
+            )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_redistribute.py b/test/distributed/_tensor/test_redistribute.py
new file mode 100644
index 0000000000000..78fc991d615f9
--- /dev/null
+++ b/test/distributed/_tensor/test_redistribute.py
@@ -0,0 +1,317 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import itertools
+import torch
+
+from torch.testing._internal.common_utils import run_tests
+
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+from torch.distributed._tensor import distribute_tensor, DeviceMesh, DTensor
+from torch.distributed._tensor.placement_types import _Partial, Replicate, Shard
+
+
+class RedistributeTest(DTensorTestBase):
+    @with_comms
+    def test_shard_to_replicate_forward_backward(self):
+        # 1) test shard -> replicate forward
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        replica_spec = [Replicate()]
+
+        input_sizes_and_shard_dim = [
+            ((self.world_size * 3, 3), 0),
+            ((self.world_size * 3 + 1, 3), 0),
+            ((self.world_size * 3 + 2, 3), 0),
+            ((3, self.world_size * 3), 1),
+            ((3, self.world_size * 3 + 1), 1),
+            ((3, self.world_size * 3 + 2), 1),
+        ]
+
+        for input_size, shard_dim in input_sizes_and_shard_dim:
+            shard_spec = [Shard(shard_dim)]
+            expected_tensor = torch.randn(
+                input_size, device=self.device_type, requires_grad=True
+            )
+            dtensor = distribute_tensor(
+                expected_tensor.clone(), device_mesh, shard_spec
+            )
+            reshard_dtensor = dtensor.redistribute(device_mesh, replica_spec)
+            self.assertEqual(reshard_dtensor.size(), torch.Size(input_size))
+            self.assertEqual(expected_tensor, reshard_dtensor.to_local())
+
+            # 2) test shard -> replicate backward:
+            # should give gradient as shard
+            grad_output = torch.ones_like(reshard_dtensor)
+            reshard_dtensor.backward(grad_output)
+            grad_input = dtensor.grad
+            self.assertEqual(grad_input.placements, shard_spec)
+            self.assertEqual(
+                grad_input.to_local(), torch.ones(dtensor.to_local().size())
+            )
+
+    @with_comms
+    def test_replicate_to_replicate_forward_backward(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        replica_spec = [Replicate()]
+        local_tensor = torch.randn(
+            12, 3, device=self.device_type, requires_grad=True
+        )
+        # 1) test replicate -> replicate forward
+        replica_tensor = distribute_tensor(
+            local_tensor, device_mesh, replica_spec
+        )
+        reshard_replica_tensor = replica_tensor.redistribute(
+            device_mesh, replica_spec
+        )
+        self.assertEqual(replica_tensor.size(), local_tensor.size())
+        self.assertEqual(replica_tensor, reshard_replica_tensor)
+
+        # 2) test replicate -> replicate backward:
+        # should give gradient as replicate
+        grad_output = torch.ones_like(reshard_replica_tensor)
+        reshard_replica_tensor.backward(grad_output)
+        grad_input = replica_tensor.grad
+        self.assertEqual(grad_input.placements, replica_spec)
+        self.assertEqual(grad_input.to_local(), torch.ones(12, 3))
+
+    @with_comms
+    def test_replicate_to_shard_forward_backward(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        replica_spec = [Replicate()]
+
+        input_sizes_and_shard_dim = [
+            ((self.world_size * 3, 3), 0),
+            ((self.world_size * 3 + 1, 3), 0),
+            ((self.world_size * 3 + 2, 3), 0),
+            ((3, self.world_size * 3), 1),
+            ((3, self.world_size * 3 + 1), 1),
+            ((3, self.world_size * 3 + 2), 1),
+        ]
+        for input_size, shard_dim in input_sizes_and_shard_dim:
+            shard_spec = [Shard(shard_dim)]
+            # 1) test replicate -> shard forward
+            local_replica = torch.randn(
+                input_size, device=self.device_type, requires_grad=True
+            )
+            splitted_list = local_replica.tensor_split(
+                self.world_size, shard_dim
+            )
+            # make local tensor as the element of the corresponding chunked list
+            local_tensor = splitted_list[self.rank]
+            replica_tensor = distribute_tensor(
+                local_replica, device_mesh, replica_spec
+            )
+            reshard_tensor = replica_tensor.redistribute(
+                device_mesh, shard_spec
+            )
+            self.assertEqual(reshard_tensor.size(), replica_tensor.size())
+            self.assertEqual(reshard_tensor.placements, shard_spec)
+            self.assertEqual(reshard_tensor.to_local(), local_tensor)
+
+            # 2) test replicate -> shard backward:
+            # should give gradient as replicate
+            grad_output = torch.ones_like(reshard_tensor)
+            reshard_tensor.backward(grad_output)
+            grad_input = replica_tensor.grad
+            self.assertEqual(grad_input.placements, replica_spec)
+            self.assertEqual(grad_input.to_local(), torch.ones(input_size))
+
+    @with_comms
+    def test_partial_to_replicate_forward_backward(self):
+        # Although we don't allow user to reshard to produce a partial
+        # placement (i.e. user can't reshard to partial), we do allow
+        # replicate to partial internally, and also partial to replicate
+        # backward should work as expected
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        partial_local = torch.randn(
+            12, 3, device=self.device_type, requires_grad=True
+        )
+        partial_spec = [_Partial()]
+        replica_spec = [Replicate()]
+        # test partial -> replicate, which trigger all_reduce
+        partial_tensor = DTensor.from_local(
+            partial_local, device_mesh, partial_spec
+        )
+        global_partial_tensor = partial_tensor.redistribute(
+            device_mesh, replica_spec
+        )
+
+        self.assertEqual(partial_tensor.size(), partial_local.size())
+        self.assertEqual(
+            partial_local * self.world_size, global_partial_tensor.to_local()
+        )
+
+        # test backward to have replicate grad on partial
+        global_partial_tensor.backward(torch.ones_like(global_partial_tensor))
+        self.assertIsNotNone(partial_local.grad)
+        if device_mesh.get_rank() == 0:
+            self.assertEqual(partial_local.grad, torch.ones_like(partial_local))
+
+    @with_comms
+    def test_replicate_to_partial(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        local_tensor = torch.randn(
+            12, 3, device=self.device_type, requires_grad=True
+        )
+        partial_spec = _Partial()
+        replica_spec = Replicate()
+        # 1) test replicate -> partial forward
+        replica_tensor = distribute_tensor(
+            local_tensor, device_mesh, [replica_spec]
+        )
+        with self.assertRaisesRegex(
+            RuntimeError, "Can not redistribute to _Partial"
+        ):
+            partial_tensor = replica_tensor.redistribute(
+                device_mesh, [partial_spec]
+            )
+
+        from torch.distributed._tensor.redistribute import Redistribute
+
+        partial_tensor = Redistribute.apply(
+            replica_tensor, device_mesh, [partial_spec]
+        )
+        self.assertEqual(partial_tensor.size(), local_tensor.size())
+        # test it successfully zero out the contents on other ranks
+        if self.rank == 0:
+            self.assertEqual(
+                replica_tensor.to_local(), partial_tensor.to_local()
+            )
+        else:
+            self.assertEqual(
+                partial_tensor.to_local(), torch.zeros_like(local_tensor)
+            )
+
+        # replicate to partial on sub groups
+        local_tensor = torch.randn(12, 3, device=self.device_type)
+        device_mesh = DeviceMesh(
+            self.device_type,
+            torch.arange(self.world_size).reshape(self.world_size // 2, 2),
+        )
+        # 1) test replicate -> partial on 2d-mesh subgroups
+        replica_tensor = distribute_tensor(
+            local_tensor, device_mesh, [replica_spec, replica_spec]
+        )
+        partial_tensor = Redistribute.apply(
+            replica_tensor, device_mesh, [partial_spec, partial_spec]
+        )
+        self.assertEqual(partial_tensor.size(), local_tensor.size())
+
+        if self.rank != 3:
+            # replicate to partial should only zero out rank 3, and leave
+            # rank 0/2 (rank0 on mesh dim 1) and 0, 1 (rank0 on mesh dim 1) un-touched
+            self.assertEqual(
+                replica_tensor.to_local(), partial_tensor.to_local()
+            )
+        else:
+            self.assertEqual(
+                replica_tensor.to_local(), torch.zeros_like(local_tensor)
+            )
+
+    @with_comms
+    def test_partial_to_shard(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        partial_spec = [_Partial()]
+
+        input_sizes_and_shard_dim = [
+            ((self.world_size * 3, 3), 0),
+            ((self.world_size * 3 + 1, 3), 0),
+            ((self.world_size * 3 + 2, 3), 0),
+            ((3, self.world_size * 3), 1),
+            ((3, self.world_size * 3 + 1), 1),
+            ((3, self.world_size * 3 + 2), 1),
+        ]
+
+        for input_size, shard_dim in input_sizes_and_shard_dim:
+            shard_spec = [Shard(shard_dim)]
+
+            partial_local = torch.ones(input_size, device=self.device_type)
+            partial_tensor = DTensor.from_local(
+                partial_local, device_mesh, partial_spec, run_check=False
+            )
+
+            quot, rem = divmod(input_size[shard_dim], self.world_size)
+            local_shape = list(input_size)
+            local_shape[shard_dim] = quot + (1 if self.rank < rem else 0)
+            # test partial to shard, trigger reduce_scatter
+            scatter_shard_tensor = partial_tensor.redistribute(
+                device_mesh, shard_spec
+            )
+            self.assertEqual(scatter_shard_tensor.size(), partial_tensor.size())
+            self.assertEqual(scatter_shard_tensor.placements, shard_spec)
+            self.assertEqual(
+                scatter_shard_tensor.to_local(),
+                torch.ones(local_shape) * self.world_size,
+            )
+
+
+class MultiDimRedistributeTest(DTensorTestBase):
+    @property
+    def world_size(self) -> int:
+        return 8
+
+    @with_comms
+    def test_multi_dim_mesh(self):
+        devices = torch.arange(self.world_size)
+        for mesh_shape in [devices, devices.view(4, 2), devices.view(2, 2, 2)]:
+            mesh_shape = torch.arange(self.world_size).view(-1, 2)
+            device_mesh = DeviceMesh(self.device_type, mesh_shape)
+            tensor_shape = (16, 24)
+
+            if torch.distributed.get_rank() == 0:
+                full_tensor = torch.randn(*tensor_shape)
+            else:
+                # these should be entirely ignored
+                # because distribute_tensor is expected to override shards in ranks != 0
+                full_tensor = torch.ones(*tensor_shape)
+
+            possibilities = [Replicate()] + [
+                Shard(i) for i in range(full_tensor.ndim)
+            ]
+            all_outputs = list(
+                itertools.product(*(mesh_shape.ndim * [possibilities]))
+            )
+            all_inputs = list(
+                itertools.product(
+                    *(mesh_shape.ndim * [possibilities + [_Partial()]])
+                )
+            )
+
+            for inputs in all_inputs:
+                # if partial, temporarily make it Replicated, then replace replicated with partial afterwards
+                repl_inputs = [
+                    Replicate() if s.is_partial() else s for s in inputs
+                ]
+                dt = distribute_tensor(full_tensor, device_mesh, repl_inputs)
+
+                if repl_inputs != inputs:
+                    # create a new DTensor reinterpreting some of the replicated entires as "Partial"
+                    dt = DTensor.from_local(
+                        dt.to_local(), device_mesh, inputs, run_check=False
+                    )
+
+                for outputs in all_outputs:
+                    # redistribute on target outputs
+                    dt2 = dt.redistribute(device_mesh, outputs)
+
+                    # replicate and then get first shard
+                    local_full = dt2.redistribute(
+                        device_mesh, device_mesh.ndim * [Replicate()]
+                    ).to_local()
+
+                    if torch.distributed.get_rank() == 0:
+                        self.assertEqual(local_full.shape, full_tensor.shape)
+
+                        num_sums = 1
+                        for idx, input in enumerate(inputs):
+                            if input.is_partial():
+                                num_sums *= mesh_shape.size(idx)
+                        expected = num_sums * full_tensor
+                        self.assertEqual(local_full, expected)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/testing/_internal/distributed/_tensor/__init__.py b/torch/testing/_internal/distributed/_tensor/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
new file mode 100644
index 0000000000000..cf2abe0ee8d27
--- /dev/null
+++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
@@ -0,0 +1,334 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+from contextlib import contextmanager
+from dataclasses import dataclass
+import itertools
+import sys
+from functools import wraps
+from typing import (
+    Any,
+    Callable,
+    Generator,
+    Iterator,
+    Tuple,
+    Dict,
+    Optional,
+    List,
+    Sequence,
+    TypeVar,
+    cast,
+)
+
+import torch
+import torch.distributed as dist
+
+from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec
+from torch.testing._internal.common_distributed import (
+    MultiProcessTestCase,
+    TEST_SKIPS,
+    skip_if_lt_x_gpu,
+)
+
+from torch.distributed._tensor import (
+    DeviceMesh,
+    Shard,
+    Replicate,
+    distribute_tensor,
+    redistribute,
+)
+from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.placement_types import Placement
+
+DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
+NUM_DEVICES = 4
+
+# We use this as a proxy for "multiple GPUs exist"
+if torch.cuda.is_available() and torch.cuda.device_count() > 1:
+    # when we actually have multiple GPUs, relax the requirement to smaller counts.
+    NUM_DEVICES = min(NUM_DEVICES, torch.cuda.device_count())
+
+T = TypeVar("T")
+
+
+def skip_unless_torch_gpu(method: T) -> T:
+    """
+    Test decorator which skips the test unless there's a GPU available to torch.
+
+    >>> @skip_unless_torch_gpu
+    >>> def test_some_method(self) -> None:
+    >>>   ...
+    """
+    # The builtin @skip_if_no_gpu relies on os.environ['WORLD_SIZE'] being set.
+    return cast(T, skip_if_lt_x_gpu(NUM_DEVICES)(method))
+
+
+@dataclass
+class RedistributeProfile:
+    num_calls: int
+
+
+@contextmanager
+def redistribute_profiler() -> Generator[RedistributeProfile, None, None]:
+
+    orig_redistribute_dtensor = redistribute.redistribute_dtensor
+    profile: RedistributeProfile = RedistributeProfile(num_calls=0)
+
+    # pyre-ignore[53]
+    def patched_redistribute_dtensor(
+        input: DTensor,
+        device_mesh: DeviceMesh,
+        placements: Sequence[Placement],
+    ) -> DTensor:
+        result = orig_redistribute_dtensor(input, device_mesh, placements)
+        profile.num_calls += 1
+        return result
+
+    try:
+        # pyre-ignore[9]
+        redistribute.redistribute_dtensor = patched_redistribute_dtensor
+        yield profile
+    finally:
+        redistribute.redistribute_dtensor = orig_redistribute_dtensor
+
+
+class DTensorTestBase(MultiProcessTestCase):
+    @property
+    def world_size(self) -> int:
+        return NUM_DEVICES
+
+    def build_device_mesh(self) -> DeviceMesh:
+        return DeviceMesh(DEVICE_TYPE, list(range(NUM_DEVICES)))
+
+    def init_pg(self, backend: str = "nccl") -> None:
+        if backend == "nccl" and torch.cuda.device_count() < self.world_size:
+            sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
+
+        if backend not in ["nccl", "gloo", "mpi"]:
+            raise RuntimeError(f"Backend {backend} not supported!")
+
+        dist.init_process_group(
+            backend=backend,
+            world_size=self.world_size,
+            rank=self.rank,  # pyre-ignore[16]
+            init_method=f"file://{self.file_name}",  # pyre-ignore[16]
+        )
+
+        # set device for nccl pg for collectives
+        if backend == "nccl":
+            torch.cuda.set_device(self.rank)
+
+    def destroy_pg(self) -> None:
+        # Wait for all ranks to reach here before starting shutdown.
+        dist.barrier()
+        dist.destroy_process_group()
+
+    def setUp(self) -> None:
+        super().setUp()
+        self._spawn_processes()
+
+    # pyre-ignore[2]:
+    def _test_op(self, mesh: DeviceMesh, op_call, *args, **kwargs) -> None:
+        with redistribute_profiler() as profile:
+            out = op_call(*args, **kwargs)
+            dtc = DTensorConverter(mesh, args, kwargs)
+            for d_args, d_kwargs in dtc:
+                # pyre can't find assertTrue anymore?
+                self.assertEqual(dtc.successful(), True)
+                d_out = op_call(*d_args, **d_kwargs)
+                self.assertEqual(
+                    d_out.redistribute(
+                        mesh, [Replicate()] * mesh.ndim
+                    ).to_local(),
+                    out,
+                )
+
+
+# wrapper to initialize comms (processgroup)
+def with_comms(
+    func: Optional[  # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
+        Callable
+    ] = None,
+    backend: Optional[str] = None,
+) -> Optional[  # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
+    Callable
+]:
+    assert func is not None
+
+    @wraps(func)  # pyre-ignore[6]
+    def wrapper(
+        self, *args: Tuple[object], **kwargs: Dict[str, Any]  # type: ignore[misc]
+    ) -> None:
+        # if backend not specified, and cuda available, then use nccl, else gloo
+        pg_backend = (
+            "nccl" if backend is None and torch.cuda.is_available() else "gloo"
+        )
+        if pg_backend == "nccl" and torch.cuda.device_count() < self.world_size:
+            sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
+
+        self.device_type = "cuda" if pg_backend == "nccl" else "cpu"
+        self.init_pg(backend=pg_backend)
+        func(self)  # type: ignore[misc]
+        self.destroy_pg()
+
+    return wrapper
+
+
+# This is a class for converting args/kwargs of an op into distributed args/kwargs
+class DTensorConverter(object):
+    def __init__(
+        self,
+        mesh: DeviceMesh,
+        args: Tuple[object, ...],
+        kwargs: Dict[str, object],
+    ) -> None:
+        self.hit = 0
+        self.miss = 0
+        self.mesh = mesh
+        self.args = args
+        self.kwargs = kwargs
+        flatten_args, flatten_args_spec = tree_flatten(args)
+        flatten_kwargs, flatten_kwargs_spec = tree_flatten(kwargs)
+
+        self.flatten_args: List[object] = flatten_args
+        self.flatten_args_spec: TreeSpec = flatten_args_spec
+        self.flatten_kwargs: List[object] = flatten_kwargs
+        self.flatten_kwargs_spec: TreeSpec = flatten_kwargs_spec
+
+        choices_for_args = []
+        for arg in self.flatten_args:
+            if isinstance(arg, torch.Tensor):
+                choices_for_args.append(self.gen_sharding_choices_for_arg(arg))
+
+        for arg in self.flatten_kwargs:
+            if isinstance(arg, torch.Tensor):
+                choices_for_args.append(self.gen_sharding_choices_for_arg(arg))
+
+        self.sharding_combs: Iterator[Sequence[Placement]] = iter(
+            itertools.product(*choices_for_args)
+        )
+
+    def successful(self) -> bool:
+        return self.hit > 0 and self.miss == 0
+
+    def is_supported_tensor(self, t: torch.Tensor) -> bool:
+        # TODO: dist tensor need to support quantized and sparse
+        # tensors, quantized tensor might be relatively easy, but
+        # sparse tensor have special layouts that we need to possibly
+        # deal with, until we are clear about them, we don't officially
+        # support them.
+        return not any(
+            [
+                t.is_sparse_csr,
+                t.is_sparse,
+                t.is_mkldnn,
+                t.is_quantized,
+                t.is_nested,
+                torch._is_functional_tensor(t),
+                t.is_neg(),
+                t.is_conj(),
+                t.device.type in ("lazy", "meta"),
+                # We need a way to test if a tensor is batched but there
+                # is no official APi to do it
+                # torch._C._is_batched(t),
+            ]
+        )
+
+    def gen_sharding_choices_for_arg(
+        self, arg: torch.Tensor
+    ) -> Sequence[Placement]:
+        mesh_size = self.mesh.size()
+        sharding_choices: List[Placement] = [Replicate()]
+        # c10d collective does not support bool tensor
+        # for bool tensor we treat it as replicated
+        if arg.dtype != torch.bool:
+            # only generating choices with: replicate, or sharding
+            # evenly on a dimension that could be sharded
+            sharding_choices = sharding_choices + [
+                Shard(i)
+                for i, s in enumerate(arg.shape)
+                if s > 1 and s % mesh_size == 0
+            ]
+        # TODO: add multi mesh choices
+        # all_choices = itertools.product(
+        #     *(self.mesh.ndim * [sharding_choices])
+        # )
+        return sharding_choices
+
+    def __iter__(self) -> "DTensorConverter":
+        return self
+
+    def __next__(self) -> Tuple[Tuple[object, ...], Dict[str, object]]:
+        try:
+            next_sharding_choices = next(self.sharding_combs)
+            idx = 0
+
+            new_args: List[object] = []
+            for arg in self.flatten_args:
+                if isinstance(arg, torch.Tensor):
+                    new_args.append(
+                        self.to_dist_tensor(
+                            arg, self.mesh, [next_sharding_choices[idx]]
+                        )
+                    )
+                    idx += 1
+                else:
+                    new_args.append(arg)
+
+            new_kwargs: List[object] = []
+            for arg in self.flatten_kwargs:
+                if isinstance(arg, torch.Tensor):
+                    new_kwargs.append(
+                        self.to_dist_tensor(
+                            arg, self.mesh, [next_sharding_choices[idx]]
+                        )
+                    )
+                    idx += 1
+                else:
+                    new_kwargs.append(arg)
+
+            return (
+                tree_unflatten(new_args, self.flatten_args_spec),
+                tree_unflatten(new_kwargs, self.flatten_kwargs_spec),
+            )
+        except StopIteration:
+            raise StopIteration
+
+    def to_dist_tensor(
+        self, t: torch.Tensor, mesh: DeviceMesh, placements: List[Placement]
+    ) -> torch.Tensor:
+        if type(t) is torch.Tensor or type(t) is torch.nn.Parameter:
+            if self.is_supported_tensor(t):
+                self.hit += 1
+                # We cannot use distribute_tensor for bool tensors as c10d
+                # collectives does not support the dtype, we assume op with
+                # bool tensor args the same tensor so we don't need to broadcast
+                # TODO: add bool tensor dtype support in c10d collective
+                if t.dtype == torch.bool:
+                    r = DTensor(
+                        t,
+                        mesh,
+                        placements,
+                        size=t.size(),
+                        requires_grad=t.requires_grad,
+                    )
+                else:
+                    r = distribute_tensor(t, mesh, placements)
+                if type(t) is torch.nn.Parameter:
+                    r = torch.nn.Parameter(  # type: ignore[assignment]
+                        r, requires_grad=r.requires_grad
+                    )
+                return r
+            else:
+                self.miss += 1
+                return t
+        elif torch.overrides.is_tensor_like(t):
+            # Blindly converting tensor subclasses to dist tensor can cause
+            # unpredictable problems, we explicitly disable this conversion
+            # for now (i.e. we don't support DTensor holding tensor subclass
+            # until there's a strong reason later).
+            self.miss += 1
+            return t
+        else:
+            raise RuntimeError(
+                f"Trying to convert to DTensor, but got {type(t)}"
+            )

From 3240ab2a2ef3db5222a20829d465ba6316398fb3 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 15 Nov 2022 22:51:32 +0000
Subject: [PATCH 0951/1922] [dtensor] PART 6: move DTensor op tests to core
 distributed (#88551)

This PR moves DTensor op tests to core distributed, including
prop_rule, pointwise op, matrix op tests, etc.

part of https://github.com/pytorch/pytorch/issues/88838
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88551
Approved by: https://github.com/aazzolini
---
 test/distributed/_tensor/test_common_rules.py | 476 +++++++++++++++++
 test/distributed/_tensor/test_math_ops.py     | 126 +++++
 test/distributed/_tensor/test_matrix_ops.py   | 302 +++++++++++
 .../distributed/_tensor/test_pointwise_ops.py | 285 +++++++++++
 .../_tensor/test_tp_sharding_ops.py           | 101 ++++
 test/distributed/_tensor/test_view_ops.py     | 480 ++++++++++++++++++
 6 files changed, 1770 insertions(+)
 create mode 100644 test/distributed/_tensor/test_common_rules.py
 create mode 100644 test/distributed/_tensor/test_math_ops.py
 create mode 100644 test/distributed/_tensor/test_matrix_ops.py
 create mode 100644 test/distributed/_tensor/test_pointwise_ops.py
 create mode 100644 test/distributed/_tensor/test_tp_sharding_ops.py
 create mode 100644 test/distributed/_tensor/test_view_ops.py

diff --git a/test/distributed/_tensor/test_common_rules.py b/test/distributed/_tensor/test_common_rules.py
new file mode 100644
index 0000000000000..ab9743c1d5e9b
--- /dev/null
+++ b/test/distributed/_tensor/test_common_rules.py
@@ -0,0 +1,476 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+from torchgen.model import FunctionSchema
+from torch.distributed._tensor.dispatch import OpSchema
+
+from torch.distributed._tensor.ops.common_rules import (
+    einop_rule,
+    reduction_rule,
+    pointwise_rule,
+)
+from torch.distributed._tensor.placement_types import DTensorSpec
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+from torch.distributed._tensor import DeviceMesh
+
+
+class CommonRulesTest(DTensorTestBase):
+    def parse_schema(self, schema_str):
+        return FunctionSchema.parse(schema_str)
+
+    @property
+    def world_size(self) -> int:
+        # hard code world size to 4 as we need to test
+        # at least with 2d mesh
+        return 4
+
+    @with_comms
+    def test_einop_basic_propagation(self):
+        # plain einsum, mm
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        func_schema = self.parse_schema(
+            "aten::mm(Tensor self, Tensor mat2) -> Tensor"
+        )
+        # propagate col-wise sharding
+        mat1, mat2 = [-1, -1], [-1, 0]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([8, 4])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([4, 8])
+        )
+        output_sharding = einop_rule(
+            "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNotNone(output_spec)
+        self.assertEqual(output_spec.dim_map, [-1, 0])
+        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
+
+        # propagate row-wise sharding
+        mat1, mat2 = [0, -1], [-1, -1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([8, 4])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([4, 8])
+        )
+        output_sharding = einop_rule(
+            "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNotNone(output_spec)
+        self.assertEqual(output_spec.dim_map, [0, -1])
+        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
+
+        # generate partial
+        mat1, mat2 = [-1, 0], [0, -1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([8, 4])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([4, 8])
+        )
+        output_sharding = einop_rule(
+            "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNotNone(output_spec)
+        self.assertTrue(output_spec.placements[0].is_partial())
+        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
+
+    @with_comms
+    def test_einop_pointwise_propagation(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        func_schema = self.parse_schema(
+            "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
+        )
+        # addition
+        mat1 = [0, -1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([8, 8])
+        )
+        output_sharding = einop_rule(
+            "ij,ij->ij", OpSchema(func_schema, (mat1_spec, mat1_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNotNone(output_spec)
+        self.assertEqual(output_spec.dim_map, [0, -1])
+        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
+
+        # broadcast addition
+        mat1 = [-1, 0, -1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([8, 4, 2])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, [-1], [], shape=torch.Size([2])
+        )
+        output_sharding = einop_rule(
+            "ijk,k->ijk", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNotNone(output_spec)
+        self.assertEqual(output_spec.dim_map, [-1, 0, -1])
+        self.assertEqual(output_spec.shape, torch.Size([8, 4, 2]))
+
+        # broadcast to a common shape
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, [0, -1, -1], [], shape=torch.Size([8, 8, 8])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, [-1, -1], [], shape=torch.Size([1, 8])
+        )
+        output_sharding = einop_rule(
+            "ijk,1k->ijk", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNotNone(output_spec)
+        self.assertEqual(output_spec.dim_map, [0, -1, -1])
+        self.assertEqual(output_spec.shape, torch.Size([8, 8, 8]))
+
+    @with_comms
+    def test_einop_merge_sharding(self):
+        # 2d mesh einop merge sharding
+        mesh_shape = torch.arange(self.world_size).reshape(
+            self.world_size // 2, self.world_size // 2
+        )
+        mesh = DeviceMesh(self.device_type, mesh_shape)
+
+        func_schema = self.parse_schema(
+            "aten::mm(Tensor self, Tensor mat2) -> Tensor"
+        )
+
+        mat1, mat2 = [0, -1], [-1, 1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([8, 4])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([4, 8])
+        )
+        output_sharding = einop_rule(
+            "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNotNone(output_spec)
+        self.assertEqual(output_spec.dim_map, [0, 1])
+        self.assertEqual(output_spec.shape, torch.Size([8, 8]))
+
+    @with_comms
+    def test_einop_linearity(self):
+        mesh_shape = torch.arange(self.world_size).reshape(
+            self.world_size // 2, self.world_size // 2
+        )
+        mesh = DeviceMesh(self.device_type, mesh_shape)
+
+        mm_func_schema = self.parse_schema(
+            "aten::mm(Tensor self, Tensor mat2) -> Tensor"
+        )
+
+        mat1, mat2 = [0, -1], [-1, -1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [1], shape=torch.Size([8, 4])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([4, 8])
+        )
+        # if not turn on linearity, partial sum is not eligible to propagate, we return
+        # suggestion to reshard inputs with no partial sum (i.e. all_reduce one input)
+        output_sharding = einop_rule(
+            "mk,kn->mn", OpSchema(mm_func_schema, (mat1_spec, mat2_spec), {})
+        )
+        self.assertIsNone(output_sharding.output_spec)
+        suggestions = output_sharding.schema_suggestions
+        self.assertIsNotNone(suggestions)
+        suggested_spec = suggestions[0].args_schema[0]
+        self.assertFalse(suggested_spec.placements[1].is_partial())
+
+        # einop prop with linearity on mm, should give back suggestion
+        # on converting placements to partial
+        output_sharding = einop_rule(
+            "mk,kn->mn",
+            OpSchema(mm_func_schema, (mat1_spec, mat2_spec), {}),
+            linearity=True,
+        )
+        self.assertIsNone(output_sharding.output_spec)
+        suggestions = output_sharding.schema_suggestions
+        self.assertIsNotNone(suggestions)
+        mat2_spec = suggestions[0].args_schema[1]
+        # mat2 mesh dim 1 should become partial now!
+        self.assertTrue(mat2_spec.placements[1].is_partial())
+
+        # einop prop with linearity on point-wise, should give back suggestion
+        # on converting placements to partial
+        add_func_schema = self.parse_schema(
+            "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
+        )
+        mat1, mat2 = [0, -1], [0, -1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [1], shape=torch.Size([8, 6])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([8, 6])
+        )
+
+        output_sharding = einop_rule(
+            "ij,ij->ij",
+            OpSchema(add_func_schema, (mat1_spec, mat2_spec), {}),
+            linearity=True,
+        )
+        self.assertIsNone(output_sharding.output_spec)
+        suggestions = output_sharding.schema_suggestions
+        self.assertIsNotNone(suggestions)
+        mat2_spec = suggestions[0].args_schema[1]
+        # mat2 mesh dim 1 should become partial now!
+        self.assertTrue(mat2_spec.placements[1].is_partial())
+
+    @with_comms
+    def test_einop_multi_sharding_on_mesh_dim(self):
+        # einop prop with multi sharding on same mesh dim
+        mesh_shape = torch.arange(self.world_size)
+        mesh = DeviceMesh(self.device_type, mesh_shape)
+
+        func_schema = self.parse_schema(
+            "aten::mm(Tensor self, Tensor mat2) -> Tensor"
+        )
+        mat1, mat2 = [0, -1], [0, -1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([8, 12])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([12, 4])
+        )
+        output_sharding = einop_rule(
+            "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNone(output_spec)
+        self.assertIsNotNone(output_sharding.schema_suggestions)
+
+        # ensure that the suggestion is to reshard the second
+        # arg by all_gather its tensor dim sharding
+        schema_suggestion = output_sharding.schema_suggestions[0]
+        self.assertEqual(schema_suggestion.args_schema[0].dim_map, [0, -1])
+        self.assertEqual(schema_suggestion.args_schema[1].dim_map, [-1, -1])
+
+    @with_comms
+    def test_einop_errors(self):
+        mesh_shape = torch.arange(self.world_size).reshape(
+            self.world_size // 2, self.world_size // 2
+        )
+        mesh = DeviceMesh(self.device_type, mesh_shape)
+
+        func_schema = self.parse_schema(
+            "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
+        )
+        mat1, mat2 = [0, -1], [1, -1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([8, 4])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([8, 4])
+        )
+
+        with self.assertRaisesRegex(
+            RuntimeError, "sharded two different ways:"
+        ):
+            einop_rule(
+                "ij,ij->ij", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+            )
+
+    @with_comms
+    def test_pointwise_rules_broadcasting(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        func_schema = self.parse_schema(
+            "where.self(Tensor condition, Tensor self, Tensor other) -> Tensor"
+        )
+        inp1, inp2, inp3 = [0], [], [-1, -1]
+        condition = DTensorSpec.from_dim_map(
+            mesh, inp1, [], shape=torch.Size([8])
+        )
+        self_tensor = DTensorSpec.from_dim_map(
+            mesh, inp2, [], shape=torch.Size([])
+        )
+        other_tensor = DTensorSpec.from_dim_map(
+            mesh, inp3, [], shape=torch.Size([1, 1])
+        )
+        # propagate point-wise sharding with broadcasting
+        output_sharding = pointwise_rule(
+            OpSchema(func_schema, (condition, self_tensor, other_tensor), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNotNone(output_spec)
+        self.assertEqual(output_spec.dim_map, [-1, 0])
+        self.assertEqual(output_spec.shape, [1, 8])
+
+    @with_comms
+    def test_pointwise_rules_suggestion(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        func_schema = self.parse_schema(
+            "aten::lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor"
+        )
+        # propagate point-wise sharding
+        inp1, inp2 = [-1, -1], [-1, 0]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, inp1, [], shape=torch.Size([8, 4])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, inp2, [], shape=torch.Size([8, 4])
+        )
+        # adding a positional argument -1 to arg schema
+        output_sharding = pointwise_rule(
+            OpSchema(func_schema, (mat1_spec, mat2_spec, -1), {})
+        )
+        self.assertIsNone(output_sharding.output_spec)
+        self.assertIsNotNone(output_sharding.schema_suggestions)
+
+        # ensure that the suggestion from pointwise rules still have
+        # the positional args that are not DTensorSpec
+        schema_suggestion = output_sharding.schema_suggestions[0]
+        self.assertEqual(len(schema_suggestion.args_schema), 3)
+        self.assertEqual(schema_suggestion.args_schema[2], -1)
+
+    @with_comms
+    def test_pointwise_multi_sharding_on_mesh_dim(self):
+        # 2d mesh pointwise sharding
+        mesh_shape = torch.arange(self.world_size).reshape(
+            self.world_size // 2, self.world_size // 2
+        )
+        mesh = DeviceMesh(self.device_type, mesh_shape)
+
+        func_schema = self.parse_schema(
+            "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
+        )
+
+        # basic case to test implicit broadcasting shape alignment
+        mat1, mat2 = [-1, 0], [0]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([20, 6])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([6])
+        )
+        output_sharding = pointwise_rule(
+            OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNotNone(output_spec)
+        self.assertEqual(output_spec.dim_map, [-1, 0])
+
+        # more advanced case that needs reshard one input to align sharding
+        mat1, mat2 = [0, -1, -1, 1], [0, -1, 1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([12, 1, 1, 8])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([12, 4, 8])
+        )
+        output_sharding = pointwise_rule(
+            OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNone(output_spec)
+        self.assertIsNotNone(output_sharding.schema_suggestions)
+
+        # ensure that the suggestion is to reshard the first
+        # arg by all_gather first tensor dim sharding
+        schema_suggestion = output_sharding.schema_suggestions[0]
+        self.assertEqual(
+            schema_suggestion.args_schema[0].dim_map, [-1, -1, -1, 1]
+        )
+        self.assertEqual(schema_suggestion.args_schema[1].dim_map, mat2)
+
+    @with_comms
+    def test_pointwise_enforce_sharding_multi_sharding_on_mesh_dim(self):
+        # 2d mesh pointwise sharding
+        mesh_shape = torch.arange(self.world_size).reshape(
+            self.world_size // 2, self.world_size // 2
+        )
+        mesh = DeviceMesh(self.device_type, mesh_shape)
+
+        func_schema = self.parse_schema(
+            "aten::add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)"
+        )
+
+        # more advanced case that needs reshard one input to align sharding
+        mat1, mat2 = [0, -1, 1], [-1, -1, 0]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([12, 4, 8])
+        )
+        mat2_spec = DTensorSpec.from_dim_map(
+            mesh, mat2, [], shape=torch.Size([12, 1, 8])
+        )
+        output_sharding = pointwise_rule(
+            OpSchema(func_schema, (mat1_spec, mat2_spec), {})
+        )
+        output_spec = output_sharding.output_spec
+        self.assertIsNone(output_spec)
+        self.assertIsNotNone(output_sharding.schema_suggestions)
+
+        # ensure that the suggestion is to reshard the second
+        # arg as we should enforce the sharding of the first arg
+        schema_suggestion = output_sharding.schema_suggestions[0]
+        self.assertEqual(schema_suggestion.args_schema[0].dim_map, mat1)
+        self.assertEqual(schema_suggestion.args_schema[1].dim_map, mat1)
+
+    @with_comms
+    def test_reduction_rule(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+
+        func_schema = self.parse_schema(
+            "aten::sum(Tensor self, *, ScalarType? dtype=None) -> Tensor"
+        )
+        # reduction on a 2d mat
+        mat1 = [0, -1]
+        mat1_spec = DTensorSpec.from_dim_map(
+            mesh, mat1, [], shape=torch.Size([8, 4])
+        )
+        # reduction on dim 0
+        output_sharding_0 = reduction_rule(
+            OpSchema(func_schema, (mat1_spec, 0), {}),
+            dims=[0],
+            reduction_linear=True,
+        )
+        self.assertIsNotNone(output_sharding_0.output_spec)
+        self.assertEqual(output_sharding_0.output_spec.dim_map, [-1])
+        # pending sum on dim 0
+        self.assertEqual(output_sharding_0.output_spec.sums, [0])
+        self.assertEqual(output_sharding_0.output_spec.shape, torch.Size([4]))
+
+        # reduction on dim 1
+        output_sharding_1 = reduction_rule(
+            OpSchema(func_schema, (mat1_spec, 1), {}),
+            dims=[1],
+            reduction_linear=True,
+        )
+        self.assertIsNotNone(output_sharding_1.output_spec)
+        self.assertEqual(output_sharding_1.output_spec.dim_map, [0])
+        self.assertEqual(output_sharding_1.output_spec.sums, [])
+        self.assertEqual(output_sharding_1.output_spec.shape, torch.Size([8]))
+
+        # full reduction if not specify dim
+        output_sharding_all_dim = reduction_rule(
+            OpSchema(func_schema, (mat1_spec,), {}),
+            dims=[0, 1],
+            reduction_linear=True,
+        )
+        self.assertIsNotNone(output_sharding_all_dim.output_spec)
+        self.assertEqual(output_sharding_all_dim.output_spec.dim_map, [])
+        # pending sum on mesh
+        self.assertEqual(output_sharding_all_dim.output_spec.sums, [0])
+        self.assertEqual(
+            output_sharding_all_dim.output_spec.shape, torch.Size([])
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_math_ops.py b/test/distributed/_tensor/test_math_ops.py
new file mode 100644
index 0000000000000..403f22d8325ed
--- /dev/null
+++ b/test/distributed/_tensor/test_math_ops.py
@@ -0,0 +1,126 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+
+from torch.distributed._tensor import distribute_tensor
+from torch.distributed._tensor.placement_types import Shard, Replicate
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+    skip_unless_torch_gpu,
+)
+import itertools
+
+
+class DistMathOpsTest(DTensorTestBase):
+    @with_comms
+    def test_sum(self):
+        device_mesh = self.build_device_mesh()
+
+        shard_spec = [Shard(0)]
+
+        tensor_to_sum = torch.randn(12, 8, 8)
+
+        mat1 = distribute_tensor(tensor_to_sum, device_mesh, shard_spec)
+
+        keep_dim_or_not = [True, False, None]
+        for dim in range(tensor_to_sum.ndim):
+            for keep_dim in keep_dim_or_not:
+                sum_args = (dim, keep_dim) if keep_dim is not None else (dim,)
+                dim_sumed_tensor = tensor_to_sum.sum(*sum_args)
+                dt_dim_sumed_tensor = mat1.sum(*sum_args).redistribute(
+                    device_mesh, [Replicate()] * device_mesh.ndim
+                )
+                self.assertEqual(
+                    dt_dim_sumed_tensor.to_local(), dim_sumed_tensor
+                )
+
+        full_sumed_tensor = tensor_to_sum.sum()
+        dt_sum = mat1.sum().redistribute(
+            device_mesh, [Replicate()] * device_mesh.ndim
+        )
+        self.assertEqual(dt_sum.to_local(), full_sumed_tensor)
+
+    # TODO: forward test can be removed once test_softmax_with_bwd passes on CPU
+    @with_comms
+    def test_softmax_fwd(self):
+        device_mesh = self.build_device_mesh()
+
+        x = torch.rand(8, 12, 16, device=self.device_type)
+        dims = range(3)  # used to convert -1 to the actual dim
+        softmax_dims = [-1, 0, 1, 2]
+        shard_dims = [-1, 0, 1, 2]
+        test_list = list(itertools.product(softmax_dims, shard_dims))
+
+        for softmax_dim, shard_dim in test_list:
+            local_y = torch.nn.functional.softmax(
+                x, dim=softmax_dim, dtype=torch.float32
+            )
+            dist_x = distribute_tensor(x, device_mesh, [Shard(shard_dim)])
+            if dims[shard_dim] == dims[softmax_dim]:
+                with self.assertRaisesRegex(
+                    Exception, "Cannot run .* on sharding dimension!$"
+                ):
+                    dist_y = torch.nn.functional.softmax(
+                        dist_x, dim=softmax_dim, dtype=torch.float32
+                    )
+            else:
+                dist_y = torch.nn.functional.softmax(
+                    dist_x, dim=softmax_dim, dtype=torch.float32
+                )
+                self.assertTrue(dist_y.placements[0].is_shard(dim=shard_dim))
+                dist_y = dist_y.redistribute(device_mesh, [Replicate()])
+                self.assertEqual(dist_y.to_local(), local_y)
+
+    # TODO: get test_softmax_with_bwd pass on CPU
+    # DTensor's _softmax_backward_data produces wrong result on CPU on certain dimension.
+    # fail_on_cpu_list = [(0, -1), (1, -1)]
+    @with_comms
+    @skip_unless_torch_gpu
+    def test_softmax_with_bwd(self):
+        device_mesh = self.build_device_mesh()
+
+        dims = range(3)  # used to convert -1 to the actual dim
+        softmax_dims = [-1, 0, 1, 2]
+        shard_dims = [-1, 0, 1, 2]
+        test_list = list(itertools.product(softmax_dims, shard_dims))
+
+        for params in test_list:
+            softmax_dim, shard_dim = params
+            x = torch.rand(
+                8, 12, 16, device=self.device_type, requires_grad=True
+            )
+            self.assertTrue(x.requires_grad)
+            local_y = torch.nn.functional.softmax(
+                x, dim=softmax_dim, dtype=torch.float32
+            ).sum()
+            local_y.backward()
+
+            dist_x = distribute_tensor(x, device_mesh, [Shard(shard_dim)])
+            self.assertTrue(dist_x.requires_grad)
+            if dims[softmax_dim] == dims[shard_dim]:
+                with self.assertRaisesRegex(
+                    Exception, "Cannot run .* on sharding dimension!$"
+                ):
+                    dist_softmax = dist_x.softmax(dim=softmax_dim)
+            else:
+                dist_softmax = dist_x.softmax(dim=softmax_dim)
+                self.assertTrue(
+                    dist_softmax.placements[0].is_shard(dim=shard_dim)
+                )
+                dist_y = dist_softmax.sum()
+                dist_y = dist_y.redistribute(device_mesh, [Replicate()])
+                self.assertEqual(dist_y.to_local(), local_y)
+                self.assertIsNone(dist_x.grad)
+                dist_y.backward()
+                self.assertIsNotNone(dist_x.grad)
+                dist_x_grad = dist_x.grad.redistribute(
+                    device_mesh, [Replicate()]
+                )
+                self.assertEqual(dist_x_grad.to_local(), x.grad)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_matrix_ops.py b/test/distributed/_tensor/test_matrix_ops.py
new file mode 100644
index 0000000000000..ed2af130ac884
--- /dev/null
+++ b/test/distributed/_tensor/test_matrix_ops.py
@@ -0,0 +1,302 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+from torch.distributed._tensor.api import DTensor
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+    skip_unless_torch_gpu,
+)
+from torch.distributed._tensor import distribute_tensor, DeviceMesh
+from torch.distributed._tensor.placement_types import Placement, Shard, Replicate, _Partial
+from typing import List, Optional, cast
+import itertools
+
+
+class DistMatrixOpsTest(DTensorTestBase):
+    @with_comms
+    def test_addmm(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+        replica_spec = [Replicate()]
+
+        tensor_to_shard = torch.randn(12, 8)
+        mat1 = distribute_tensor(tensor_to_shard, device_mesh, shard_spec)
+        tensor_to_replicate = torch.randn(8, 4)
+        mat2 = distribute_tensor(tensor_to_replicate, device_mesh, replica_spec)
+        input_tensor = torch.randn(4)
+        input = distribute_tensor(input_tensor, device_mesh, replica_spec)
+
+        dist_res = torch.addmm(input, mat1, mat2)
+        local_res = torch.addmm(
+            input_tensor, tensor_to_shard, tensor_to_replicate
+        )
+        self.assertEqual(
+            dist_res.redistribute(device_mesh, replica_spec).to_local(),
+            local_res,
+        )
+
+    @with_comms
+    def test_addmm_auto_redistribute(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard0_spec = [Shard(0)]
+        shard1_spec = [Shard(1)]
+        replica_spec = [Replicate()]
+
+        tensor_to_shard1 = torch.randn(12, 8, requires_grad=True)
+        mat1 = distribute_tensor(tensor_to_shard1, device_mesh, shard1_spec)
+        tensor_to_shard0 = torch.randn(8, 4, requires_grad=True)
+        mat2 = distribute_tensor(tensor_to_shard0, device_mesh, shard0_spec)
+        input_tensor = torch.randn(4, requires_grad=True)
+        input = distribute_tensor(input_tensor, device_mesh, replica_spec)
+
+        local_res = torch.addmm(
+            input_tensor, tensor_to_shard1, tensor_to_shard0
+        )
+        dist_res = torch.addmm(input, mat1, mat2)
+
+        # test if addmm output is a partial
+        self.assertIsInstance(dist_res, DTensor)
+        self.assertIsInstance(dist_res.placements[0], _Partial)
+
+        # test if result is the same as tensor
+        replica_res = dist_res.redistribute(device_mesh, replica_spec)
+        dist_local_res = replica_res.to_local()
+        self.assertEqual(local_res, dist_local_res)
+
+        # backward checks
+        dist_local_res.sum().backward()
+        local_res.sum().backward()
+        self.assertIsNotNone(mat2.grad)
+        mat2_grad = mat2.grad.redistribute(device_mesh, replica_spec)
+        self.assertEqual(mat2_grad.to_local(), tensor_to_shard0.grad)
+
+    @with_comms
+    def test_mm(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard0_spec = Shard(0)
+        shard1_spec = Shard(1)
+        replica_spec = Replicate()
+
+        t1 = torch.randn(12, 8, requires_grad=True)
+        t2 = torch.randn(8, 16, requires_grad=True)
+        local_res = torch.mm(t1, t2)
+
+        def test_placement_comb(
+            placements1: List[Placement], placements2: List[Placement]
+        ) -> None:
+            dt1 = distribute_tensor(t1, device_mesh, placements1)
+            dt2 = distribute_tensor(t2, device_mesh, placements2)
+            dist_res: DTensor = cast(DTensor, torch.mm(dt1, dt2)).redistribute(
+                device_mesh, [replica_spec]
+            )
+            self.assertEqual(dist_res.to_local(), local_res)
+            # backward
+            grad_dist_res = torch.ones_like(dist_res)
+            dist_res.backward(grad_dist_res)
+            self.assertIsNotNone(dt1.grad)
+
+        placement_specs = [shard0_spec, shard1_spec, replica_spec]
+        shard_specs_comb = list(
+            itertools.product(placement_specs, placement_specs)
+        )
+        for spec in shard_specs_comb:
+            test_placement_comb([spec[0]], [spec[1]])
+
+    @with_comms
+    def test_t(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        tensor_to_transpose = torch.randn(12, 8, requires_grad=True)
+        mat = distribute_tensor(tensor_to_transpose, device_mesh, shard_spec)
+        tranposed_mat = mat.t()
+        self.assertEqual(tranposed_mat.size(), torch.Size([8, 12]))
+        self.assertEqual(tranposed_mat.placements, [Shard(1)])
+        tranposed_mat2 = tranposed_mat.t()
+        self.assertEqual(tranposed_mat2.size(), torch.Size([12, 8]))
+        self.assertEqual(tranposed_mat2.placements, shard_spec)
+
+    @with_comms
+    def test_t_partial(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+
+        a = torch.randn(12, 8)
+        b = torch.randn(8, 4)
+        c = torch.mm(a, b).t()
+
+        da = distribute_tensor(a, device_mesh, [Shard(1)])
+        db = distribute_tensor(b, device_mesh, [Shard(0)])
+
+        # mm(da, db) should return a _Partial tensor.
+        # transposing it should keep it _Partial
+        dc = torch.mm(da, db).t()
+
+        self.assertTrue(isinstance(dc.placements[0], _Partial))
+
+        # check that the local and distributed op results match
+        self.assertEqual(
+            c,
+            dc.redistribute(device_mesh, [Replicate()]).to_local(),
+        )
+
+    # baddbmm introduces nan occasionally on CPU: https://github.com/pytorch/pytorch/issues/80588
+    @with_comms
+    @skip_unless_torch_gpu
+    def test_baddbmm(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        tensor = torch.rand(
+            4, 4, 8, device=self.device_type, requires_grad=True
+        )
+        batch_1 = torch.rand(
+            4, 4, 8, device=self.device_type, requires_grad=True
+        )
+        batch_2 = torch.rand(
+            4, 8, 8, device=self.device_type, requires_grad=True
+        )
+
+        def test_placement_comb(
+            tensor_placements: List[Placement],
+            batch_1_placements: List[Placement],
+            batch_2_placements: List[Placement],
+            beta: int,
+            alpha: int,
+            batch_1_grad: Optional[torch.Tensor],
+        ) -> None:
+            tensor_dt = distribute_tensor(
+                tensor, device_mesh, tensor_placements
+            )
+            batch_1_dt = distribute_tensor(
+                batch_1, device_mesh, batch_1_placements
+            )
+            batch_2_dt = distribute_tensor(
+                batch_2, device_mesh, batch_2_placements
+            )
+            dist_res = cast(
+                DTensor,
+                torch.baddbmm(
+                    tensor_dt, batch_1_dt, batch_2_dt, beta=beta, alpha=alpha
+                ),
+            ).redistribute(device_mesh, [Replicate()])
+            dist_local_res = dist_res.to_local()
+            assert not torch.isnan(local_result).any()
+            assert not torch.isnan(dist_local_res).any()
+            self.assertEqual(dist_local_res.detach(), local_result.detach())
+
+            # TODO: add test backward
+            # grad_dist_res = torch.ones_like(dist_res)
+            # dist_res.backward(grad_dist_res)
+            # self.assertIsNotNone(batch_1_dt.grad)
+            # batch_1_grad_local = batch_1_dt.grad.redistribute(
+            #     device_mesh, [Replicate()]
+            # ).to_local()
+            # self.assertEqual(batch_1_grad_local, batch_1_grad)
+
+        shard0_spec = Shard(0)
+        shard1_spec = Shard(1)
+        shard2_spec = Shard(2)
+        replica_spec = Replicate()
+        shard_specs = [shard0_spec, shard1_spec, shard2_spec, replica_spec]
+        shard_specs_comb = list(
+            itertools.product(shard_specs, shard_specs, shard_specs)
+        )
+        passlist = [
+            (shard0_spec, shard0_spec, shard0_spec),
+            (shard0_spec, shard0_spec, replica_spec),
+            (shard0_spec, shard1_spec, shard0_spec),
+            (shard0_spec, shard2_spec, shard0_spec),
+            (shard1_spec, shard1_spec, replica_spec),
+            (shard0_spec, replica_spec, shard0_spec),
+            (shard2_spec, replica_spec, shard2_spec),
+            (shard2_spec, shard0_spec, shard2_spec),
+            (shard2_spec, shard1_spec, shard2_spec),
+            (shard2_spec, shard2_spec, shard2_spec),
+            (replica_spec, shard0_spec, shard0_spec),
+            (replica_spec, shard1_spec, replica_spec),
+            (replica_spec, shard2_spec, shard1_spec),
+            (replica_spec, replica_spec, shard2_spec),
+            (replica_spec, replica_spec, replica_spec),
+        ]
+        # If beta is 0, input tensor will be ignored
+        numeric_params_comb = [
+            (0.0, 0.5),  # zero-beta
+            (0.8, 0.5),  # non-zero-beta
+        ]
+
+        for beta, alpha in numeric_params_comb:
+            local_result = torch.baddbmm(
+                tensor, batch_1, batch_2, beta=beta, alpha=alpha
+            )
+            grad_local_res = torch.ones_like(local_result)
+            local_result.backward(grad_local_res)
+            # tests that currently pass
+            for spec in passlist:
+                test_placement_comb(
+                    [spec[0]], [spec[1]], [spec[2]], beta, alpha, batch_1.grad
+                )
+            # TODO: support these tests
+            shard_specs_comb = [
+                spec for spec in shard_specs_comb if spec not in passlist
+            ]
+            for spec in shard_specs_comb:
+                with self.assertRaises(Exception):
+                    test_placement_comb(
+                        [spec[0]],
+                        [spec[1]],
+                        [spec[2]],
+                        beta,
+                        alpha,
+                        batch_1.grad,
+                    )
+
+    @with_comms
+    def test_bmm(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        mat1 = torch.rand(4, 8, 4, device=self.device_type, requires_grad=True)
+        mat2 = torch.rand(4, 4, 8, device=self.device_type, requires_grad=True)
+        local_result = torch.bmm(mat1, mat2)
+        grad_local_res = torch.ones_like(local_result)
+        local_result.backward(grad_local_res)
+
+        def test_placement_comb(
+            placements1: List[Placement],
+            placements2: List[Placement],
+        ) -> None:
+            mat1_dt = distribute_tensor(mat1, device_mesh, placements1)
+            mat2_dt = distribute_tensor(mat2, device_mesh, placements2)
+            dist_res = cast(DTensor, torch.bmm(mat1_dt, mat2_dt)).redistribute(
+                device_mesh, [Replicate()]
+            )
+            dist_local_res = dist_res.to_local()
+            self.assertEqual(dist_local_res, local_result)
+
+            # test backward
+            # TODO: figure out (replicate, shard1) fail on backward
+            # it generates a different grad shape
+            grad_dist_res = torch.ones_like(dist_res)
+            dist_res.backward(grad_dist_res)
+            self.assertIsNotNone(mat1_dt.grad)
+            mat1_dt_grad = cast(DTensor, mat1_dt.grad)
+            mat1_grad_local = mat1_dt_grad.redistribute(
+                device_mesh, [Replicate()]
+            ).to_local()
+            self.assertEqual(mat1_grad_local, mat1.grad)
+
+        shard0_spec = Shard(0)
+        shard1_spec = Shard(1)
+        shard2_spec = Shard(2)
+        replica_spec = Replicate()
+        placement_specs = [shard0_spec, shard1_spec, shard2_spec, replica_spec]
+        shard_specs_comb = list(
+            itertools.product(placement_specs, placement_specs)
+        )
+
+        # tests that currently pass
+        for spec in shard_specs_comb:
+            test_placement_comb([spec[0]], [spec[1]])
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_pointwise_ops.py b/test/distributed/_tensor/test_pointwise_ops.py
new file mode 100644
index 0000000000000..5069166dee279
--- /dev/null
+++ b/test/distributed/_tensor/test_pointwise_ops.py
@@ -0,0 +1,285 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+from typing import Sequence, Any, Dict, Callable, Optional
+from unittest import skip
+
+import torch
+from torch import Tensor
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+    skip_unless_torch_gpu,
+)
+
+from torch.distributed._tensor import DeviceMesh, DTensor, distribute_tensor
+from torch.distributed._tensor.placement_types import (
+    Shard,
+    Replicate,
+    _Partial,
+    Placement,
+)
+from torch.distributed.distributed_c10d import ReduceOp
+
+import torch.utils._pytree as pytree
+
+
+def no_op():
+    return None
+
+
+def deepcopy_convert_to_dtensor(
+    val: Any,
+    device_mesh: DeviceMesh,
+    placements: Sequence[Placement],
+) -> Any:
+    """
+    Recursively convert (over Sequence and Dict types) Tensors into DTensors.
+
+    :param device_mesh: the DeviceMesh to use.
+    :param placements: the Placement list to use.
+    :return: the transformed structure.
+    """
+
+    def f(x):
+        if isinstance(x, Tensor) and not isinstance(x, DTensor):
+            return distribute_tensor(
+                x,
+                device_mesh=device_mesh,
+                placements=placements,
+            )
+        return x
+
+    return pytree.tree_map(f, [val])[0]
+
+
+def deepcopy_convert_from_dtensor(val: Any) -> Any:
+    """
+    Recursive convert any DTensor to local Tensor.
+
+    :param val: the structure to coerce.
+    :return: the coerced structure.
+    """
+
+    def f(x):
+        if isinstance(x, DTensor):
+            return x.redistribute(
+                device_mesh=x.device_mesh,
+                placements=[Replicate()] * x.device_mesh.ndim,
+            ).to_local()
+        return x
+
+    return pytree.tree_map(f, [val])[0]
+
+
+class DistElementwiseOpsTest(DTensorTestBase):
+    def _compare_pairwise_ops(
+        self,
+        *,
+        device_mesh: DeviceMesh,
+        placements: Sequence[Placement],
+        op: Callable,
+        pre_op_fn: Optional[Callable] = None,
+        args: Sequence[Any] = tuple(),
+        kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        if pre_op_fn is None:
+            pre_op_fn = no_op
+
+        if not kwargs:
+            kwargs = {}
+
+        dargs = deepcopy_convert_to_dtensor(
+            args,
+            device_mesh=device_mesh,
+            placements=placements,
+        )
+        dkwargs = deepcopy_convert_to_dtensor(
+            kwargs,
+            device_mesh=device_mesh,
+            placements=placements,
+        )
+
+        pre_op_fn()
+
+        # run the reference first, in case the call is broken;
+        # it's better to debug an incorrect call at this point.
+        reference_result = op(*args, **kwargs)
+
+        pre_op_fn()
+
+        dist_result = op(*dargs, **dkwargs)
+
+        collected_result = deepcopy_convert_from_dtensor(dist_result)
+
+        self.assertEqual(reference_result, collected_result)
+
+    # TODO: We need to add CPU tests for ops in the future.
+    def _run_sharded_elementwise_ops(
+        self,
+        *,
+        device_mesh: DeviceMesh,
+        placements: Sequence[Placement],
+        pre_op_fn: Optional[Callable] = None,
+        input_size: Sequence[int],
+        op: Callable,
+        **kwargs,
+    ):
+        if pre_op_fn is None:
+            pre_op_fn = no_op
+
+        input_tensor = torch.randn(
+            *input_size,
+            device=self.device_type,
+            requires_grad=True,
+        )
+
+        self._compare_pairwise_ops(
+            device_mesh=device_mesh,
+            placements=placements,
+            pre_op_fn=pre_op_fn,
+            op=op,
+            args=(input_tensor,),
+            kwargs=kwargs,
+        )
+
+    @with_comms
+    def test_activations(self):
+        device_mesh = self.build_device_mesh()
+        self._run_sharded_elementwise_ops(
+            device_mesh=device_mesh,
+            placements=[Shard(0)],
+            input_size=(8, 5),
+            op=torch.nn.functional.gelu,
+        )
+        self._run_sharded_elementwise_ops(
+            device_mesh=device_mesh,
+            placements=[Replicate()],
+            input_size=(8, 5),
+            op=torch.nn.functional.gelu,
+        )
+        self._run_sharded_elementwise_ops(
+            device_mesh=device_mesh,
+            placements=[Shard(1)],
+            input_size=(3, 12),
+            op=torch.nn.functional.relu,
+        )
+        self._run_sharded_elementwise_ops(
+            device_mesh=device_mesh,
+            placements=[Replicate()],
+            input_size=(8, 5),
+            op=torch.nn.functional.relu,
+        )
+        self._run_sharded_elementwise_ops(
+            device_mesh=device_mesh,
+            placements=[Shard(0)],
+            input_size=(8, 5),
+            op=torch.sigmoid,
+        )
+        self._run_sharded_elementwise_ops(
+            device_mesh=device_mesh,
+            placements=[Replicate()],
+            input_size=(8, 5),
+            op=torch.sigmoid,
+        )
+
+    @with_comms
+    @skip(
+        "testing RNG based ops is broken: https://github.com/pytorch/tau/issues/494"
+    )
+    def test_dropout(self):
+        device_mesh = self.build_device_mesh()
+
+        def _reset_random_seed():
+            torch.manual_seed(self.rank + 4)
+
+        self._run_sharded_elementwise_ops(
+            device_mesh=device_mesh,
+            placements=[Shard(0)],
+            input_size=(8, 5),
+            op=torch.nn.functional.dropout,
+            pre_op_fn=_reset_random_seed,
+            p=0.4,
+            training=False,
+        )
+        self._run_sharded_elementwise_ops(
+            device_mesh=device_mesh,
+            placements=[Shard(1)],
+            input_size=(3, 14),
+            op=torch.nn.functional.dropout,
+            pre_op_fn=_reset_random_seed,
+            p=0.5,
+            training=True,
+        )
+
+    @with_comms
+    @skip_unless_torch_gpu
+    def test_dropout_backward(self):
+        device_mesh = self.build_device_mesh()
+        placements = [Shard(0)]
+
+        input_size = (8, 5)
+
+        grad_output = torch.rand(
+            input_size,
+            device=self.device_type,
+            requires_grad=True,
+        )
+        mask = (
+            torch.rand(
+                input_size,
+                device=self.device_type,
+                requires_grad=False,
+            )
+            < 0.8
+        )
+
+        self._compare_pairwise_ops(
+            device_mesh=device_mesh,
+            placements=placements,
+            op=torch.ops.aten.native_dropout_backward,
+            kwargs=dict(
+                grad_output=grad_output,
+                mask=mask,
+                scale=0.3,
+            ),
+        )
+
+    @with_comms
+    def test_dropout_errors(self):
+        device_mesh = self.build_device_mesh()
+        with self.assertRaisesRegex(RuntimeError, "supported"):
+            self._run_sharded_elementwise_ops(
+                device_mesh=device_mesh,
+                placements=[_Partial(ReduceOp.SUM)],
+                input_size=(8, 5),
+                op=torch.nn.functional.dropout,
+            )
+
+    @with_comms
+    def test_mul_out(self):
+        device_mesh = self.build_device_mesh()
+        torch.manual_seed(self.rank)
+        shard_spec = [Shard(0)]
+        input_size = (8, 4)
+        input_tensor = torch.randn(*input_size, device=self.device_type)
+        dtensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+
+        other_tensor = torch.randn(*input_size, device=self.device_type)
+        other_dtensor = DTensor.from_local(
+            other_tensor, device_mesh, shard_spec
+        )
+
+        output_tensor = torch.randn(*input_size, device=self.device_type)
+        output_dtensor = DTensor.from_local(
+            output_tensor, device_mesh, shard_spec
+        )
+        dt = torch.mul(dtensor, other_dtensor, out=output_dtensor)
+        expected = torch.mul(input_tensor, other_tensor, out=output_tensor)
+        self.assertEqual(input_tensor, dtensor.to_local())
+        self.assertEqual(expected, dt.to_local())
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_tp_sharding_ops.py b/test/distributed/_tensor/test_tp_sharding_ops.py
new file mode 100644
index 0000000000000..acd28fe3a3065
--- /dev/null
+++ b/test/distributed/_tensor/test_tp_sharding_ops.py
@@ -0,0 +1,101 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+from torch.distributed._tensor import DeviceMesh, DTensor, Shard, Replicate, distribute_tensor
+
+
+class TPShardingOpsTest(DTensorTestBase):
+    @property
+    def world_size(self) -> int:
+        return 4
+
+    @with_comms
+    def test_sharded_view(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        torch.manual_seed(0)
+        tensor = torch.rand(16, 35, 26)
+        sharding = [Shard(0)]
+        st = distribute_tensor(tensor, device_mesh, sharding).view(8, 4, 35, 13)
+        st_new = distribute_tensor(
+            tensor.view(8, 4, 35, 13), device_mesh, sharding
+        )
+        self.assertEqual(st.to_local(), st_new.to_local())
+        self.assertEqual(st.placements[0], st_new.placements[0])
+
+    @with_comms
+    def test_sharded_transpose(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        torch.manual_seed(self.rank)
+        tensor = torch.rand(3, 5, 6, device=self.device_type)
+        sharding = [Shard(0)]
+        dist_tensor = DTensor.from_local(tensor, device_mesh, sharding)
+        new_dt = dist_tensor.transpose(0, 2)
+        self.assertTrue(new_dt.placements[0].is_shard(dim=2))
+        self.assertEqual(new_dt.to_local(), tensor.transpose(0, 2))
+        new_dt = dist_tensor.transpose(1, 2)
+        self.assertTrue(new_dt.placements[0].is_shard(dim=0))
+        self.assertEqual(new_dt.to_local(), tensor.transpose(1, 2))
+
+    @with_comms
+    def test_sharded_permute(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        torch.manual_seed(self.rank)
+        tensor = torch.rand(3, 5, 6, device=self.device_type)
+        sharding = [Shard(0)]
+        dist_tensor = DTensor.from_local(tensor, device_mesh, sharding)
+        new_dt = dist_tensor.permute(1, 0, 2)
+        self.assertTrue(new_dt.placements[0].is_shard(dim=1))
+        self.assertEqual(new_dt.to_local(), tensor.permute(1, 0, 2))
+
+    @with_comms
+    def test_replicated_permute(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        torch.manual_seed(0)
+        tensor = torch.rand(3, 5, 6, device=self.device_type)
+        sharding = [Replicate()]
+        dist_tensor = DTensor.from_local(tensor, device_mesh, sharding)
+        new_dt = dist_tensor.permute(1, 0, 2)
+        self.assertTrue(new_dt.placements[0].is_replicate())
+        self.assertEqual(new_dt.to_local(), tensor.permute(1, 0, 2))
+        self.assertEqual(new_dt.stride(), tensor.permute(1, 0, 2).stride())
+
+    @with_comms
+    def test_sharded_cat(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        torch.manual_seed(self.rank)
+        tensor_1 = torch.rand(3, 5, 6)
+        tensor_2 = torch.rand(3, 5, 6)
+        tensor_3 = torch.rand(3, 5, 6)
+        sharding = [Shard(0)]
+        dt_1 = DTensor.from_local(tensor_1, device_mesh, sharding)
+        dt_2 = DTensor.from_local(tensor_2, device_mesh, sharding)
+        dt_3 = DTensor.from_local(tensor_3, device_mesh, sharding)
+        new_dt = torch.cat([dt_1, dt_2, dt_3])
+        cat_dt = DTensor.from_local(
+            torch.cat([tensor_1, tensor_2, tensor_3]), device_mesh, sharding
+        )
+        self.assertEqual(new_dt.to_local(), cat_dt.to_local())
+        self.assertEqual(new_dt.size(), cat_dt.size())
+
+    @with_comms
+    def test_sharded_split(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        torch.manual_seed(self.rank)
+        tensor = torch.rand(3, 5, 6, device=self.device_type)
+        sharding = [Shard(2)]
+        dist_tensor = DTensor.from_local(tensor, device_mesh, sharding)
+        dt_list = dist_tensor.split(dist_tensor.size(-1) // 2, dim=-1)
+        local_tensors = tensor.split(3, dim=-1)
+        for idx, dt in enumerate(dt_list):
+            self.assertTrue(dt.placements[0].is_shard(dim=2))
+            self.assertEqual(dt.to_local(), local_tensors[idx])
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_view_ops.py b/test/distributed/_tensor/test_view_ops.py
new file mode 100644
index 0000000000000..c1c5a03b91132
--- /dev/null
+++ b/test/distributed/_tensor/test_view_ops.py
@@ -0,0 +1,480 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+from typing import List, cast
+from torch.distributed._tensor.placement_types import Placement
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    redistribute_profiler,
+    with_comms,
+)
+from torch.distributed._tensor import DeviceMesh, Shard, Replicate, distribute_tensor
+from torch.distributed._tensor.ops.view_ops import (
+    ops,
+    Singleton,
+    Broadcast,
+    Flatten,
+    Repeat,
+    Split,
+    InputDim,
+    view_groups,
+)
+from torch import Tensor, rand, randn
+from torch.testing._internal.common_utils import run_tests
+from torch.utils._pytree import tree_flatten
+
+import itertools
+import torch
+import torch.distributed as dist
+
+
+class TestViewOps(DTensorTestBase):
+    def test_view_groups(self):
+        self.assertEquals(
+            view_groups([2, 3], [3, 2]),
+            (
+                Split(Flatten((InputDim(0), InputDim(1))), (3, 2), 0),
+                Split(Flatten((InputDim(0), InputDim(1))), (3, 2), 1),
+            ),
+        )
+        self.assertEquals(
+            view_groups([3, 4, 5], [12, 5]),
+            (Flatten((InputDim(0), InputDim(1))), InputDim(2)),
+        )
+        self.assertEquals(
+            view_groups([2, 3, 4, 5, 7], [12, 70]),
+            (
+                Split(
+                    Flatten(
+                        (
+                            InputDim(0),
+                            InputDim(1),
+                            InputDim(2),
+                            InputDim(3),
+                            InputDim(4),
+                        )
+                    ),
+                    (12, 70),
+                    0,
+                ),
+                Split(
+                    Flatten(
+                        (
+                            InputDim(0),
+                            InputDim(1),
+                            InputDim(2),
+                            InputDim(3),
+                            InputDim(4),
+                        )
+                    ),
+                    (12, 70),
+                    1,
+                ),
+            ),
+        )
+        self.assertEquals(
+            view_groups([2, 3, 4, 5, 7], [3, 8, 7, 5]),
+            (
+                Split(
+                    Flatten((InputDim(0), InputDim(1), InputDim(2))), (3, 8), 0
+                ),
+                Split(
+                    Flatten((InputDim(0), InputDim(1), InputDim(2))), (3, 8), 1
+                ),
+                Split(Flatten((InputDim(3), InputDim(4))), (7, 5), 0),
+                Split(Flatten((InputDim(3), InputDim(4))), (7, 5), 1),
+            ),
+        )
+        self.assertEquals(
+            view_groups([3, 4, 8, 3], [12, 4, 2, 3]),
+            (
+                Flatten((InputDim(0), InputDim(1))),
+                Split(InputDim(2), (4, 2), 0),
+                Split(InputDim(2), (4, 2), 1),
+                InputDim(3),
+            ),
+        )
+        self.assertEquals(
+            view_groups([3, 24], [1, 3, 2, 4, 1, 3, 1]),
+            (
+                Singleton(),
+                InputDim(0),
+                Split(InputDim(1), (2, 4, 3), 0),
+                Split(InputDim(1), (2, 4, 3), 1),
+                Singleton(),
+                Split(InputDim(1), (2, 4, 3), 2),
+                Singleton(),
+            ),
+        )
+        self.assertEquals(
+            view_groups([1, 1, 3, 2, 1, 1], [6, 1, 1, 1]),
+            (
+                Flatten((InputDim(2), InputDim(3))),
+                Singleton(),
+                Singleton(),
+                Singleton(),
+            ),
+        )
+        self.assertEquals(
+            view_groups([1, 1, 12, 1, 1, 1, 2, 5, 1], [3, 4, 1, 10]),
+            (
+                Split(InputDim(2), (3, 4), 0),
+                Split(InputDim(2), (3, 4), 1),
+                Singleton(),
+                Flatten((InputDim(6), InputDim(7))),
+            ),
+        )
+        self.assertEquals(
+            view_groups([2, 3, 4], [2, -1, 4]),
+            (InputDim(0), InputDim(1), InputDim(2)),
+        )
+
+    @property
+    def world_size(self) -> int:
+        return 6
+
+    def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh):
+        spec = ops[op]
+        rules = spec.dim_map(*args, **kwargs)
+        outputs = op(*args, **kwargs)
+        flat_args, _ = tree_flatten(args)
+        in_shape = flat_args[0].shape
+
+        no_shard_dims = set()
+        for rule in rules:
+            if isinstance(rule, Repeat):
+                if isinstance(rule.input_dim, InputDim):
+                    no_shard_dims.add(rule.input_dim.input_dim)
+            elif isinstance(rule, Flatten):
+                for dim in rule.input_dims[1:]:
+                    if isinstance(dim, InputDim):
+                        no_shard_dims.add(dim.input_dim)
+            elif isinstance(rule, Split):
+                if isinstance(rule.input_dim, Flatten):
+                    for dim in rule.input_dim.input_dims[1:]:
+                        if isinstance(dim, InputDim):
+                            no_shard_dims.add(dim.input_dim)
+
+        if op == torch.unbind:
+            no_shard_dims.add(kwargs.get("dim", 0))
+
+        sharding_choices = cast(List[Placement], [Replicate()]) + [
+            Shard(i)
+            for i, s in enumerate(in_shape)
+            if s > 1 and i not in no_shard_dims
+        ]
+
+        all_sharding_choices = itertools.product(
+            *(device_mesh.ndim * [sharding_choices])
+        )
+
+        for in_shard in all_sharding_choices:
+            # print(f'   |--- {in_shard}')
+            in_dt = distribute_tensor(args[0], device_mesh, in_shard)
+
+            with redistribute_profiler() as profiler:
+                out_dt = op(in_dt, *args[1:], **kwargs)
+
+            self.assertEqual(
+                profiler.num_calls, 0, "Expected no redistribution."
+            )
+
+            full_out = out_dt.redistribute(
+                device_mesh, device_mesh.ndim * [Replicate()]
+            ).to_local()
+
+            if dist.get_rank() == 0:
+                self.assertEqual(outputs, full_out)
+
+    def dimmap_test(self, op, args, expected_rule_output):
+        rules = ops[op].dim_map(*args)
+        self.assertEquals(rules, expected_rule_output)
+        self.call_dt_test(op, args, {}, self.device_mesh)
+
+    @with_comms
+    def test_view_ops(self):
+        self.device_mesh = DeviceMesh(
+            self.device_type, torch.arange(dist.get_world_size()).view(-1, 2)
+        )
+        self.dimmap_test(torch.atleast_1d, (randn(()),), (Singleton(),))
+        self.dimmap_test(torch.atleast_1d, (randn(24),), (InputDim(0),))
+        self.dimmap_test(
+            torch.atleast_1d, (randn(24, 36),), (InputDim(0), InputDim(1))
+        )
+
+        self.dimmap_test(
+            torch.atleast_2d, (randn(()),), (Singleton(), Singleton())
+        )
+        self.dimmap_test(
+            torch.atleast_2d, (randn(24),), (Singleton(), InputDim(0))
+        )
+        self.dimmap_test(
+            torch.atleast_2d, (randn(24, 36),), (InputDim(0), InputDim(1))
+        )
+        self.dimmap_test(
+            torch.atleast_2d,
+            (randn(24, 36, 48),),
+            (InputDim(0), InputDim(1), InputDim(2)),
+        )
+
+        self.dimmap_test(
+            torch.atleast_3d,
+            (randn(()),),
+            (Singleton(), Singleton(), Singleton()),
+        )
+        self.dimmap_test(
+            torch.atleast_3d,
+            (randn(24),),
+            (Singleton(), InputDim(0), Singleton()),
+        )
+        self.dimmap_test(
+            torch.atleast_3d,
+            (randn(24, 36),),
+            (InputDim(0), InputDim(1), Singleton()),
+        )
+        self.dimmap_test(
+            torch.atleast_3d,
+            (randn(24, 36, 42),),
+            (InputDim(0), InputDim(1), InputDim(2)),
+        )
+        self.dimmap_test(
+            torch.atleast_3d,
+            (randn(24, 36, 42, 24),),
+            (InputDim(0), InputDim(1), InputDim(2), InputDim(3)),
+        )
+
+        with self.assertRaises(AssertionError):
+            ops[torch.broadcast_to].dim_map(randn(24, 36), (1, 2, 4))
+
+        self.dimmap_test(
+            torch.broadcast_to,
+            (rand(24, 36), (1, 24, 36)),
+            (Singleton(), InputDim(0), InputDim(1)),
+        )
+        self.dimmap_test(
+            torch.broadcast_to,
+            (rand(24, 36), (42, 24, 36)),
+            (Broadcast(Singleton(), 42), InputDim(0), InputDim(1)),
+        )
+        self.dimmap_test(
+            torch.broadcast_to,
+            (rand(24, 1, 36), (12, 24, 24, 36)),
+            (
+                Broadcast(Singleton(), 12),
+                InputDim(0),
+                Broadcast(InputDim(1), 24),
+                InputDim(2),
+            ),
+        )
+        self.dimmap_test(
+            torch.broadcast_to,
+            (rand(24, 36), (-1, 36)),
+            (InputDim(0), InputDim(1)),
+        )
+        self.dimmap_test(
+            torch.broadcast_to,
+            (rand(24, 1, 36), (-1, 1, 36)),
+            (InputDim(0), InputDim(1), InputDim(2)),
+        )
+
+        self.dimmap_test(
+            torch.broadcast_to,
+            (randn(36, 1, 24), (12, 36, 42, 24)),
+            (
+                Broadcast(Singleton(), 12),
+                InputDim(0),
+                Broadcast(InputDim(1), 42),
+                InputDim(2),
+            ),
+        )
+
+        self.dimmap_test(
+            Tensor.expand,
+            (randn(24, 1, 36, 1), 36, 24, 42, -1, 24),
+            (
+                Broadcast(Singleton(), 36),
+                InputDim(0),
+                Broadcast(InputDim(1), 42),
+                InputDim(2),
+                Broadcast(InputDim(3), 24),
+            ),
+        )
+
+        self.dimmap_test(
+            Tensor.expand,
+            (randn(24, 1, 36, 1), (36, 24, 42, -1, 24)),
+            (
+                Broadcast(Singleton(), 36),
+                InputDim(0),
+                Broadcast(InputDim(1), 42),
+                InputDim(2),
+                Broadcast(InputDim(3), 24),
+            ),
+        )
+
+        self.dimmap_test(
+            torch.flatten,
+            (randn(24, 36),),
+            (Flatten((InputDim(0), InputDim(1))),),
+        )
+        self.dimmap_test(torch.flatten, (randn(42),), (InputDim(0),))
+        self.dimmap_test(torch.flatten, (randn(()),), (Singleton(),))
+
+        self.dimmap_test(
+            torch.movedim,
+            (randn(12, 24, 48, 96), 1, 2),
+            (InputDim(0), InputDim(2), InputDim(1), InputDim(3)),
+        )
+        self.dimmap_test(
+            torch.movedim,
+            (randn(6, 12, 24), 1, 0),
+            (InputDim(1), InputDim(0), InputDim(2)),
+        )
+        self.dimmap_test(
+            torch.movedim,
+            (randn(24, 12, 6), (1, 2), (0, 1)),
+            (InputDim(1), InputDim(2), InputDim(0)),
+        )
+        self.dimmap_test(
+            torch.movedim,
+            (randn(24, 6, 12), (0, 2, 1), (2, 1, 0)),
+            (InputDim(1), InputDim(2), InputDim(0)),
+        )
+        self.dimmap_test(
+            torch.movedim,
+            (randn(24, 12), (1, 0), (0, 1)),
+            (InputDim(1), InputDim(0)),
+        )
+
+        self.dimmap_test(
+            torch.movedim,
+            (randn(36, 24, 12), (1, 2), (0, 1)),
+            (InputDim(1), InputDim(2), InputDim(0)),
+        )
+        self.dimmap_test(
+            torch.movedim,
+            (randn(36, 24, 12), (1, 2), (-3, -2)),
+            (InputDim(1), InputDim(2), InputDim(0)),
+        )
+
+        self.dimmap_test(
+            torch.permute,
+            (randn(24, 36, 42), (2, 0, 1)),
+            (InputDim(2), InputDim(0), InputDim(1)),
+        )
+        self.dimmap_test(
+            torch.permute,
+            (randn(24, 36, 42), (-1, -3, -2)),
+            (InputDim(2), InputDim(0), InputDim(1)),
+        )
+
+        self.dimmap_test(
+            torch.ravel,
+            (randn(24, 36),),
+            (Flatten((InputDim(0), InputDim(1))),),
+        )
+        self.dimmap_test(torch.ravel, (randn(42),), (InputDim(0),))
+        self.dimmap_test(torch.ravel, (randn(()),), (Singleton(),))
+
+        self.dimmap_test(
+            Tensor.repeat,
+            (randn(24, 36), 1, 2, 1, 1, 2),
+            (
+                Singleton(),
+                Broadcast(Singleton(), 2),
+                Singleton(),
+                InputDim(0),
+                Repeat(InputDim(1), 2),
+            ),
+        )
+
+        self.dimmap_test(
+            torch.reshape,
+            (randn(6, 12, 24), (72, 24)),
+            (Flatten((InputDim(0), InputDim(1))), InputDim(2)),
+        )
+
+        self.dimmap_test(
+            torch.tile,
+            (randn(24, 36), (1, 2, 1, 1, 2)),
+            (
+                Singleton(),
+                Broadcast(Singleton(), 2),
+                Singleton(),
+                InputDim(0),
+                Repeat(InputDim(1), 2),
+            ),
+        )
+        self.dimmap_test(
+            torch.tile,
+            (randn(42, 24, 36), (1, 3)),
+            (InputDim(0), InputDim(1), Repeat(InputDim(2), 3)),
+        )
+
+        self.dimmap_test(
+            torch.transpose,
+            (randn(24, 60, 42, 60), 2, 0),
+            (InputDim(2), InputDim(1), InputDim(0), InputDim(3)),
+        )
+        self.dimmap_test(
+            torch.transpose,
+            (randn(24, 60, 42, 60), -1, 0),
+            (InputDim(3), InputDim(1), InputDim(2), InputDim(0)),
+        )
+
+        self.dimmap_test(
+            torch.unsqueeze,
+            (randn(42, 24, 36), 1),
+            (InputDim(0), Singleton(), InputDim(1), InputDim(2)),
+        )
+
+        self.dimmap_test(
+            Tensor.view,
+            (randn(6, 12, 24), 72, 24),
+            (Flatten((InputDim(0), InputDim(1))), InputDim(2)),
+        )
+
+        self.dimmap_test(Tensor.view, (randn(1, 1, 12), -1), (InputDim(2),))
+
+        self.dimmap_test(
+            Tensor.view,
+            (randn(1, 1, 42, 24), -1),
+            (Flatten((InputDim(2), InputDim(3))),),
+        )
+
+        self.dimmap_test(
+            Tensor.view,
+            (randn(1, 1, 42, 1, 24, 1), -1),
+            (Flatten((InputDim(2), InputDim(4))),),
+        )
+
+        self.dimmap_test(
+            Tensor.view,
+            (randn(48, 35, 26), (24, 4, 35, 13)),
+            (
+                Split(
+                    Flatten(input_dims=(InputDim(0), InputDim(1), InputDim(2))),
+                    group_shape=(24, 4, 35, 13),
+                    split_id=0,
+                ),
+                Split(
+                    Flatten(input_dims=(InputDim(0), InputDim(1), InputDim(2))),
+                    group_shape=(24, 4, 35, 13),
+                    split_id=1,
+                ),
+                Split(
+                    Flatten(input_dims=(InputDim(0), InputDim(1), InputDim(2))),
+                    group_shape=(24, 4, 35, 13),
+                    split_id=2,
+                ),
+                Split(
+                    Flatten(input_dims=(InputDim(0), InputDim(1), InputDim(2))),
+                    group_shape=(24, 4, 35, 13),
+                    split_id=3,
+                ),
+            ),
+        )
+
+
+if __name__ == "__main__":
+    run_tests()

From 798a6fef0baf3b6068eb8393ec024fac75cd37fb Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 15 Nov 2022 22:51:33 +0000
Subject: [PATCH 0952/1922] [dtensor] PART 7: move remaining DTensor tests to
 core distributed (#88179)

This PR moves remaining tests, i.e. tensor_ops, op db tests to core distributed

part of https://github.com/pytorch/pytorch/issues/88838
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88179
Approved by: https://github.com/aazzolini
---
 test/distributed/_tensor/test_dtensor_ops.py  | 704 ++++++++++++++++++
 test/distributed/_tensor/test_tensor_ops.py   | 365 +++++++++
 .../_tensor/dtensor_lagging_op_db.py          | 661 ++++++++++++++++
 .../_tensor/gen_dtensor_lagging_op_db.py      |  67 ++
 4 files changed, 1797 insertions(+)
 create mode 100644 test/distributed/_tensor/test_dtensor_ops.py
 create mode 100644 test/distributed/_tensor/test_tensor_ops.py
 create mode 100644 torch/testing/_internal/distributed/_tensor/dtensor_lagging_op_db.py
 create mode 100644 torch/testing/_internal/distributed/_tensor/gen_dtensor_lagging_op_db.py

diff --git a/test/distributed/_tensor/test_dtensor_ops.py b/test/distributed/_tensor/test_dtensor_ops.py
new file mode 100644
index 0000000000000..22ae5807d5f34
--- /dev/null
+++ b/test/distributed/_tensor/test_dtensor_ops.py
@@ -0,0 +1,704 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+import sys
+import unittest
+import warnings
+
+from torch.overrides import resolve_name
+from torch.utils._pytree import tree_flatten, tree_map
+from torch.testing._internal.common_utils import (
+    suppress_warnings,
+    TEST_WITH_ASAN,
+    run_tests,
+)
+import torch.distributed as dist
+from torch.testing._internal.common_device_type import (
+    ops,
+    instantiate_device_type_tests,
+)
+import torch.testing._internal.common_methods_invocations as common_ops
+from torch.testing._internal.common_methods_invocations import DecorateInfo
+
+from torch.distributed._tensor import DTensor, DeviceMesh, Replicate
+from torch.testing._internal.distributed._tensor.dtensor_lagging_op_db import dtensor_lagging_op_db
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    TEST_SKIPS,
+    DTensorConverter,
+    DEVICE_TYPE,
+)
+
+# rewrite common size variables to sth can be sharded evenly
+# we can enable uneven shards later, but need to adjust more on
+# sample inputs (i.e. view/reshape need to adjust shape size as well)
+common_ops.L = 24
+common_ops.M = 12
+common_ops.S = 4
+common_ops.XS = 2
+
+
+def assert_ref_dtensor_equal(test_case, dtensor_rs, rs):
+    flat_dtensor_rs, _ = tree_flatten(dtensor_rs)
+    flat_rs, _ = tree_flatten(rs)
+    test_case.assertEqual(len(flat_dtensor_rs), len(flat_rs))
+    for dtensor_r, r in zip(flat_dtensor_rs, flat_rs):
+
+        if not isinstance(r, torch.Tensor):
+            continue
+
+        test_case.assertIsInstance(dtensor_r, torch.Tensor)
+        test_case.assertEqual(
+            dtensor_r.shape,
+            r.shape,
+            f"Shape mismatch! original shape:{r.shape}, dtensor shape: {dtensor_r.shape}",
+        )
+        test_case.assertEqual(
+            dtensor_r.requires_grad,
+            r.requires_grad,
+            "op result requires_grad mismatch!"
+            f"original requires_grad: {r.requires_grad}, "
+            f"dtensor requires_grad: {dtensor_r.requires_grad}",
+        )
+
+        test_case.assertEqual(dtensor_r.to_local(), r)
+
+
+# Copied from functorch
+def xfail(op_name, variant_name="", *, device_type=None, dtypes=None):
+    return (op_name, variant_name, device_type, dtypes, True)
+
+
+def skip(op_name, variant_name="", *, device_type=None, dtypes=None):
+    return (op_name, variant_name, device_type, dtypes, False)
+
+
+def skipOps(test_case_name, base_test_name, to_skip):
+    all_opinfos = dtensor_lagging_op_db
+    for xfail in to_skip:
+        op_name, variant_name, device_type, dtypes, expected_failure = xfail
+        matching_opinfos = [
+            o
+            for o in all_opinfos
+            if o.name == op_name and o.variant_test_name == variant_name
+        ]
+        assert len(matching_opinfos) >= 1, f"Couldn't find OpInfo for {xfail}"
+        for opinfo in matching_opinfos:
+            decorators = list(opinfo.decorators)
+            if expected_failure:
+                decorator = DecorateInfo(
+                    unittest.expectedFailure,
+                    test_case_name,
+                    base_test_name,
+                    device_type=device_type,
+                    dtypes=dtypes,
+                )
+                decorators.append(decorator)
+            else:
+                decorator = DecorateInfo(
+                    unittest.skip("Skipped!"),
+                    test_case_name,
+                    base_test_name,
+                    device_type=device_type,
+                    dtypes=dtypes,
+                )
+                decorators.append(decorator)
+            opinfo.decorators = tuple(decorators)
+
+    # This decorator doesn't modify fn in any way
+    def wrapped(fn):
+        return fn
+
+    return wrapped
+
+
+# Re-generate this failed list, turn on dry_run of the below func
+# check_dtensor_func(self, test, op, dry_run=True), then run sth
+# like python test/spmd/tensor/test_dtensor_ops.py > failed.expect
+dtensor_fails = {
+    # these sometimes pass and sometimes fail
+    # we need to remove many of them from list once op
+    # get full support with varying sharding specs
+    xfail("__getitem__"),
+    xfail("__rsub__"),
+    xfail("masked.amax"),
+    xfail("masked.amin"),
+    xfail("masked.argmax"),
+    xfail("masked.argmin"),
+    xfail("masked.cumprod"),
+    xfail("masked.cumsum"),
+    xfail("masked.log_softmax"),
+    xfail("masked.logaddexp"),
+    xfail("masked.logsumexp"),
+    xfail("masked.median"),
+    xfail("masked.norm"),
+    xfail("masked.prod"),
+    xfail("masked.softmin"),
+    xfail("masked.softmax"),
+    xfail("masked.sum"),
+    xfail("addbmm"),
+    xfail("addmv"),
+    xfail("addr"),
+    xfail("all"),
+    xfail("allclose"),
+    xfail("amax"),
+    xfail("amin"),
+    xfail("aminmax"),
+    xfail("any"),
+    xfail("arange"),
+    xfail("argmax"),
+    xfail("argmin"),
+    xfail("argsort"),
+    xfail("as_strided"),
+    xfail("as_strided_scatter"),
+    xfail("baddbmm"),
+    xfail("bernoulli"),
+    xfail("block_diag"),
+    xfail("broadcast_shapes"),
+    xfail("cat"),
+    xfail("cartesian_prod"),
+    xfail("cdist"),
+    xfail("cholesky"),
+    xfail("cholesky_inverse"),
+    xfail("cholesky_solve"),
+    xfail("chunk"),
+    xfail("clamp"),
+    xfail("clamp_max"),
+    xfail("clamp_min"),
+    xfail("column_stack"),
+    xfail("combinations"),
+    xfail("complex"),
+    xfail("constant_pad_nd"),
+    xfail("copysign"),
+    xfail("corrcoef"),
+    xfail("count_nonzero"),
+    xfail("cov"),
+    xfail("cross"),
+    xfail("cummax"),
+    xfail("cummin"),
+    xfail("cumsum"),
+    xfail("cumulative_trapezoid"),
+    xfail("diag"),
+    xfail("diag_embed"),
+    xfail("diagflat"),
+    xfail("diagonal"),
+    xfail("diagonal_copy"),
+    xfail("diagonal_scatter"),
+    xfail("diff"),
+    xfail("dist"),
+    xfail("dot"),
+    xfail("dstack"),
+    xfail("einsum"),
+    xfail("empty"),
+    xfail("empty_like"),
+    xfail("eq"),
+    xfail("eye"),
+    xfail("fft.fft2"),
+    xfail("fft.fft"),
+    xfail("fft.fftn"),
+    xfail("fft.fftshift"),
+    xfail("fft.ifft2"),
+    xfail("fft.ifft"),
+    xfail("fft.ifftshift"),
+    xfail("fft.ihfft2"),
+    xfail("fft.ihfft"),
+    xfail("fft.ihfftn"),
+    xfail("fft.irfft2"),
+    xfail("fft.irfftn"),
+    xfail("fft.rfft2"),
+    xfail("fft.rfft"),
+    xfail("fft.rfftn"),
+    xfail("flip"),
+    xfail("fliplr"),
+    xfail("flipud"),
+    xfail("floor_divide"),
+    xfail("fmax"),
+    xfail("fmin"),
+    xfail("frexp"),
+    xfail("full"),
+    xfail("gather"),
+    xfail("geqrf"),
+    xfail("gradient"),
+    xfail("heaviside"),
+    xfail("histc"),
+    xfail("histogram"),
+    xfail("histogramdd"),
+    xfail("hstack"),
+    xfail("index_add"),
+    xfail("index_copy"),
+    xfail("index_fill"),
+    xfail("index_put"),
+    xfail("index_reduce"),
+    xfail("index_select"),
+    xfail("isfinite"),
+    xfail("isin"),
+    xfail("isinf"),
+    xfail("isnan"),
+    xfail("isneginf"),
+    xfail("isposinf"),
+    xfail("kthvalue"),
+    xfail("linalg.cholesky"),
+    xfail("linalg.cholesky_ex"),
+    xfail("linalg.cond"),
+    xfail("linalg.cross"),
+    xfail("linalg.det"),
+    xfail("linalg.det", "singular"),
+    xfail("linalg.eig"),
+    xfail("linalg.eigh"),
+    xfail("linalg.eigvals"),
+    xfail("linalg.eigvalsh"),
+    xfail("linalg.householder_product"),
+    xfail("linalg.inv"),
+    xfail("linalg.inv_ex"),
+    xfail("linalg.ldl_factor"),
+    xfail("linalg.ldl_factor_ex"),
+    xfail("linalg.ldl_solve"),
+    xfail("linalg.lstsq"),
+    xfail("linalg.lstsq", "grad_oriented"),
+    xfail("linalg.lu"),
+    xfail("linalg.lu_factor"),
+    xfail("linalg.lu_factor_ex"),
+    xfail("linalg.lu_solve"),
+    xfail("linalg.matrix_norm"),
+    xfail("linalg.matrix_power"),
+    xfail("linalg.matrix_rank"),
+    xfail("linalg.matrix_rank", "hermitian"),
+    xfail("linalg.multi_dot"),
+    xfail("linalg.norm"),
+    xfail("linalg.norm", "subgradients_at_zero"),
+    xfail("linalg.pinv"),
+    xfail("linalg.pinv", "hermitian"),
+    xfail("linalg.qr"),
+    xfail("linalg.slogdet"),
+    xfail("linalg.solve"),
+    xfail("linalg.solve_ex"),
+    xfail("linalg.solve_triangular"),
+    xfail("linalg.svd"),
+    xfail("linalg.svdvals"),
+    xfail("linalg.tensorinv"),
+    xfail("linalg.tensorsolve"),
+    xfail("linalg.vander"),
+    xfail("linalg.vecdot"),
+    xfail("linalg.vector_norm"),
+    xfail("linspace"),
+    xfail("log_softmax"),
+    xfail("log_softmax", "with_dtype"),
+    xfail("logcumsumexp"),
+    xfail("logdet"),
+    xfail("logical_not"),
+    xfail("logspace"),
+    xfail("logsumexp"),
+    xfail("lt"),
+    xfail("lu"),
+    xfail("lu_solve"),
+    xfail("lu_unpack"),
+    xfail("masked_fill"),
+    xfail("masked_scatter"),
+    xfail("masked_select"),
+    xfail("matrix_exp"),
+    xfail("max", "binary"),
+    xfail("max", "reduction_no_dim"),
+    xfail("max", "reduction_with_dim"),
+    xfail("maximum"),
+    xfail("median"),
+    xfail("min", "binary"),
+    xfail("min", "reduction_no_dim"),
+    xfail("min", "reduction_with_dim"),
+    xfail("minimum"),
+    xfail("mode"),
+    xfail("msort"),
+    xfail("multinomial"),
+    xfail("mv"),
+    xfail("max_pool2d_with_indices_backward", ""),
+    xfail("nanmean"),
+    xfail("nanmedian"),
+    xfail("nanquantile"),
+    xfail("nansum"),
+    xfail("native_batch_norm"),
+    xfail("native_layer_norm"),
+    xfail("narrow_copy"),
+    xfail("ne"),
+    xfail("new_empty"),
+    xfail("new_empty_strided"),
+    xfail("transpose"),
+    xfail("nn.functional.adaptive_avg_pool1d"),
+    xfail("nn.functional.adaptive_avg_pool2d"),
+    xfail("nn.functional.adaptive_avg_pool3d"),
+    xfail("nn.functional.adaptive_max_pool1d"),
+    xfail("nn.functional.adaptive_max_pool2d"),
+    xfail("nn.functional.adaptive_max_pool3d"),
+    xfail("nn.functional.alpha_dropout"),
+    xfail("nn.functional.avg_pool1d"),
+    xfail("nn.functional.avg_pool2d"),
+    xfail("nn.functional.avg_pool3d"),
+    xfail("nn.functional.batch_norm"),
+    xfail("nn.functional.batch_norm", "without_cudnn"),
+    xfail("nn.functional.bilinear"),
+    xfail("nn.functional.binary_cross_entropy"),
+    xfail("nn.functional.binary_cross_entropy_with_logits"),
+    xfail("nn.functional.celu"),
+    xfail("nn.functional.conv1d"),
+    xfail("nn.functional.conv2d"),
+    xfail("nn.functional.conv_transpose1d"),
+    xfail("nn.functional.conv_transpose2d"),
+    xfail("nn.functional.conv_transpose3d"),
+    xfail("nn.functional.cosine_similarity"),
+    xfail("nn.functional.cross_entropy"),
+    xfail("nn.functional.ctc_loss"),
+    xfail("nn.functional.dropout"),
+    xfail("nn.functional.dropout2d"),
+    xfail("nn.functional.dropout3d"),
+    xfail("nn.functional.elu"),
+    xfail("nn.functional.fractional_max_pool2d"),
+    xfail("nn.functional.fractional_max_pool3d"),
+    xfail("nn.functional.gaussian_nll_loss"),
+    xfail("nn.functional.glu"),
+    xfail("nn.functional.grid_sample"),
+    xfail("nn.functional.group_norm"),
+    xfail("nn.functional.hardshrink"),
+    xfail("nn.functional.hardsigmoid"),
+    xfail("nn.functional.hardswish"),
+    xfail("nn.functional.hardtanh"),
+    xfail("nn.functional.huber_loss"),
+    xfail("nn.functional.instance_norm"),
+    xfail("nn.functional.interpolate", "area"),
+    xfail("nn.functional.interpolate", "bicubic"),
+    xfail("nn.functional.interpolate", "bilinear"),
+    xfail("nn.functional.interpolate", "linear"),
+    xfail("nn.functional.interpolate", "nearest"),
+    xfail("nn.functional.interpolate", "trilinear"),
+    xfail("nn.functional.layer_norm"),
+    xfail("nn.functional.leaky_relu"),
+    xfail("nn.functional.linear"),
+    xfail("nn.functional.local_response_norm"),
+    xfail("nn.functional.logsigmoid"),
+    xfail("nn.functional.margin_ranking_loss"),
+    xfail("nn.functional.max_pool1d"),
+    xfail("nn.functional.max_pool2d"),
+    xfail("nn.functional.max_pool3d"),
+    xfail("nn.functional.max_unpool1d"),
+    xfail("nn.functional.max_unpool1d", "grad"),
+    xfail("nn.functional.max_unpool2d"),
+    xfail("nn.functional.max_unpool2d", "grad"),
+    xfail("nn.functional.max_unpool3d"),
+    xfail("nn.functional.max_unpool3d", "grad"),
+    xfail("nn.functional.mish"),
+    xfail("nn.functional.mse_loss"),
+    xfail("nn.functional.multi_margin_loss"),
+    xfail("nn.functional.multilabel_margin_loss"),
+    xfail("nn.functional.multilabel_soft_margin_loss"),
+    xfail("nn.functional.nll_loss"),
+    xfail("nn.functional.normalize"),
+    xfail("nn.functional.pad", "circular"),
+    xfail("nn.functional.pad", "constant"),
+    xfail("nn.functional.pad", "reflect"),
+    xfail("nn.functional.pad", "replicate"),
+    xfail("nn.functional.pairwise_distance"),
+    xfail("nn.functional.pdist"),
+    xfail("nn.functional.pixel_shuffle"),
+    xfail("nn.functional.pixel_unshuffle"),
+    xfail("nn.functional.poisson_nll_loss"),
+    xfail("nn.functional.prelu"),
+    xfail("nn.functional.relu6"),
+    xfail("nn.functional.rrelu"),
+    xfail("nn.functional.selu"),
+    xfail("nn.functional.silu"),
+    xfail("nn.functional.smooth_l1_loss"),
+    xfail("nn.functional.soft_margin_loss"),
+    xfail("nn.functional.softplus"),
+    xfail("nn.functional.softshrink"),
+    xfail("nn.functional.threshold"),
+    xfail("nn.functional.triplet_margin_loss"),
+    xfail("nn.functional.triplet_margin_with_distance_loss"),
+    xfail("nn.functional.unfold"),
+    xfail("nn.functional.upsample_bilinear"),
+    xfail("nn.functional.upsample_nearest"),
+    xfail("nonzero"),
+    xfail("norm"),
+    xfail("norm", "fro"),
+    xfail("norm", "inf"),
+    xfail("norm", "nuc"),
+    xfail("normal"),
+    xfail("normal", "number_mean"),
+    xfail("ormqr"),
+    xfail("ones"),
+    xfail("pca_lowrank"),
+    xfail("pinverse"),
+    xfail("polar"),
+    xfail("put"),
+    xfail("qr"),
+    xfail("quantile"),
+    xfail("rad2deg"),
+    xfail("rand_like"),
+    xfail("randint_like"),
+    xfail("randint"),
+    xfail("randn"),
+    xfail("randn_like"),
+    xfail("renorm"),
+    xfail("repeat_interleave"),
+    xfail("resize_"),
+    xfail("resize_as_"),
+    xfail("roll"),
+    xfail("rot90"),
+    xfail("rsub"),
+    xfail("scalar_tensor"),
+    xfail("scatter_add"),
+    xfail("scatter"),
+    xfail("scatter_reduce", "amax"),
+    xfail("scatter_reduce", "amin"),
+    xfail("scatter_reduce", "mean"),
+    xfail("scatter_reduce", "prod"),
+    xfail("scatter_reduce", "sum"),
+    xfail("searchsorted"),
+    xfail("select"),
+    xfail("select_scatter"),
+    xfail("signbit"),
+    xfail("sort"),
+    xfail("sparse.sampled_addmm"),
+    xfail("special.airy_ai"),
+    xfail("special.bessel_j0"),
+    xfail("special.bessel_j1"),
+    xfail("special.bessel_y0"),
+    xfail("special.bessel_y1"),
+    xfail("special.chebyshev_polynomial_t"),
+    xfail("special.chebyshev_polynomial_u"),
+    xfail("special.entr"),
+    xfail("special.erfcx"),
+    xfail("special.hermite_polynomial_h"),
+    xfail("special.hermite_polynomial_he"),
+    xfail("special.i0e"),
+    xfail("special.i1"),
+    xfail("special.i1e"),
+    xfail("special.laguerre_polynomial_l"),
+    xfail("special.log_ndtr"),
+    xfail("special.modified_bessel_i0"),
+    xfail("special.modified_bessel_i1"),
+    xfail("special.modified_bessel_k0"),
+    xfail("special.modified_bessel_k1"),
+    xfail("special.ndtri"),
+    xfail("special.scaled_modified_bessel_k0"),
+    xfail("special.scaled_modified_bessel_k1"),
+    xfail("special.spherical_bessel_j0"),
+    xfail("special.xlog1py"),
+    xfail("special.zeta"),
+    xfail("split"),
+    xfail("split", "list_args"),
+    xfail("split_with_sizes"),
+    xfail("signal.windows.cosine"),
+    xfail("signal.windows.exponential"),
+    xfail("signal.windows.gaussian"),
+    xfail("signal.windows.kaiser"),
+    xfail("squeeze"),
+    xfail("stack"),
+    xfail("std"),
+    xfail("std_mean"),
+    xfail("stft"),
+    xfail("svd"),
+    xfail("svd_lowrank"),
+    xfail("symeig"),
+    xfail("t"),
+    xfail("take_along_dim"),
+    xfail("take"),
+    xfail("tensor_split"),
+    xfail("to_sparse"),
+    xfail("topk"),
+    xfail("trace"),
+    xfail("trapezoid"),
+    xfail("trapz"),
+    xfail("triangular_solve"),
+    xfail("tril"),
+    xfail("triu"),
+    xfail("unbind"),
+    xfail("unfold"),
+    xfail("unfold_copy"),
+    xfail("uniform"),
+    xfail("unflatten"),
+    xfail("unique_consecutive"),
+    xfail("unique"),
+    xfail("var_mean"),
+    xfail("vdot"),
+    xfail("view_as_complex"),
+    xfail("vstack"),
+    xfail("zeros"),
+    # ops inside this might even fail without dtensor
+    # tests, as we rescale op db common test size factor (i.e. L, M, S)
+    # which triggered the orignal function run failures with input
+    # generation becomes wrong, we skip them for now but should enable later.
+    # TODO: need to clean this list and remove all cases
+    skip("argwhere"),
+    skip("cumprod"),
+    skip("__rmatmul__"),
+    skip("meshgrid", "list_of_tensors"),
+    skip("meshgrid", "variadic_tensors"),
+    skip("nn.functional._scaled_dot_product_attention"),
+    skip("nn.functional.softmin"),
+    skip("nn.functional.embedding"),
+    skip("nn.functional.embedding_bag"),
+    skip("nn.functional.feature_alpha_dropout", "with_train"),
+    skip("nn.functional.feature_alpha_dropout", "without_train"),
+    skip("nn.functional.hinge_embedding_loss"),
+    skip("nn.functional.cosine_embedding_loss"),
+    skip("fft.hfft"),
+    skip("fft.hfft2"),
+    skip("fft.hfft2"),
+    skip("fft.hfftn"),
+    skip("fft.ifftn"),
+    skip("fft.irfft"),
+    skip("istft"),
+    skip("isclose"),
+    skip("isreal"),
+    skip("matmul"),
+    skip("masked.mean"),
+    skip("masked.var"),
+    skip("masked.std"),
+    skip("masked.normalize"),
+    skip("prod"),
+    skip("segment_reduce", "lengths"),
+    skip("segment_reduce", "offsets"),
+}
+
+
+# Add a list of ops that are currently failing BW pass
+skip_bw = [
+    None,  # corresponds to the transpose ops 'H' and 'T'
+    "torch.bucketize",
+    "torch.conj_physical",
+    "torch.eq",
+    "torch.isfinite",
+    "torch.isnan",
+]
+
+
+def run_dtensor_crossref(test_case, func, args, kwargs):
+    to_dtensor = DTensorConverter(test_case.mesh, args, kwargs)
+
+    # TODO: also handle cases where func raise an exception
+    rs = func(*args, **kwargs)
+
+    def to_replicate(e: object) -> object:
+        return (
+            e.redistribute(test_case.mesh, test_case.mesh.ndim * [Replicate()])
+            if isinstance(e, DTensor)
+            else e
+        )
+
+    try:
+        # Suppress warnings, this doesn't matter for test_meta.py
+        # but it does matter if you want to use this decorator
+        # for cross-ref testing, as some tests may be looking at
+        # errors
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            # for every comb of sharding choices, we test if it works
+            for dtensor_args, dtensor_kwargs in to_dtensor:
+                # Only attempt if we managed to convert all tensors to DTensor
+                # (if any of them failed, we're in a mixed tensor situation and
+                # this is not allowed in DTensor)
+                if to_dtensor.successful():
+                    # Handle special cases first if there's any
+                    # Suppress warnings, this doesn't matter for test_meta.py
+                    # but it does matter if you want to use this decorator
+                    # for cross-ref testing, as some tests may be looking at
+                    # errors
+                    dtensor_rs = func(*dtensor_args, **dtensor_kwargs)
+
+                    # we need to skip tests containing tensors of zero elmeents for now.
+                    # see issue: https://github.com/pytorch/tau/issues/470
+                    # TODO remove this once issue above fixed.
+                    flat_args, _ = tree_flatten(dtensor_rs)
+                    if any(
+                        isinstance(e, torch.Tensor) and e.numel() == 0
+                        for e in flat_args
+                    ):
+                        continue
+
+                    # redistribute/all_gather the results to compare with normal output
+                    dtensor_rs = tree_map(to_replicate, dtensor_rs)
+                    try:
+                        if resolve_name(func) not in skip_bw:
+                            if isinstance(dtensor_rs, DTensor):
+                                dtensor_rs.to_local().sum().backward()
+                            elif isinstance(dtensor_rs, tuple):
+                                dtensor_rs[0].to_local().sum().backward()
+
+                    except Exception as e:
+                        # TODO(anj): Remove this guard exception after gaining more confidence.
+                        if torch.distributed.get_rank() == 0:
+                            print(
+                                f"failed to run BW: {resolve_name(func)}, {func}, {str(e)})"
+                            )
+                    assert_ref_dtensor_equal(test_case, dtensor_rs, rs)
+                else:
+                    raise RuntimeError(
+                        f"failed to convert args to DTensor; "
+                        f"originally (*{args}, **{kwargs})"
+                    )
+    except Exception as e:
+        raise RuntimeError(
+            f"failed to run: {resolve_name(func)}, with (*{args}, **{kwargs})"
+        ) from e
+
+    return rs
+
+
+def check_dtensor_func(test_case, test_func, opinfo, dry_run=False):
+    try:
+        test_func()
+    except Exception:
+        test_case.destroy_pg()
+        if not dry_run:
+            raise
+        if dist.get_rank() == 0:
+            if opinfo.variant_test_name:
+                print(f"xfail('{opinfo.name}', '{opinfo.variant_test_name}'),")
+            else:
+                print(f"xfail('{opinfo.name}'),")
+    else:
+        test_case.destroy_pg()
+
+
+class TestDTensorOps(DTensorTestBase):
+    @property
+    def world_size(self) -> int:
+        return 4
+
+    # only allow float dytpe for now, we can relax this constraint
+    # when feel necessary later (i.e when adding quantization support).
+    @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
+    @suppress_warnings
+    @ops(dtensor_lagging_op_db, allowed_dtypes=(torch.float,))
+    @skipOps("TestDTensorOps", "test_dtensor_op_db", dtensor_fails)
+    def test_dtensor_op_db(self, dtype, op):
+        pg_backend = "nccl" if DEVICE_TYPE == "cuda" else "gloo"
+        if pg_backend == "nccl" and torch.cuda.device_count() < self.world_size:
+            sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
+
+        self.init_pg(backend=pg_backend)
+        self.mesh = DeviceMesh(DEVICE_TYPE, torch.arange(self.world_size))
+
+        # test each op with dist tensor inputs and normal inputs
+        def test():
+            samples = op.sample_inputs(DEVICE_TYPE, dtype, requires_grad=True)
+            for sample_input in samples:
+                args = [sample_input.input] + list(sample_input.args)
+                kwargs = sample_input.kwargs
+
+                run_dtensor_crossref(self, op.op, args, kwargs)
+                # we need to figure out a way to test the out variant, out variant testing
+                # is tricky, as we need to pre allocate the dtensor out, some of them rely
+                # on sharding placements to be pre-known (i.e. mm.out)
+                # if isinstance(expected, torch.Tensor) and op.supports_out:
+                #     func(*args, **kwargs, out=expected)
+
+        check_dtensor_func(self, test, op)
+
+
+# only instantiate tests for DEVICE_TYPE alone (i.e. either CPU or GPU)
+instantiate_device_type_tests(
+    TestDTensorOps, globals(), only_for=(DEVICE_TYPE,)
+)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/test_tensor_ops.py b/test/distributed/_tensor/test_tensor_ops.py
new file mode 100644
index 0000000000000..1ba3f6d5f95b6
--- /dev/null
+++ b/test/distributed/_tensor/test_tensor_ops.py
@@ -0,0 +1,365 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorConverter,
+    DTensorTestBase,
+    with_comms,
+)
+from torch.distributed._tensor import distribute_tensor, DeviceMesh, DTensor
+from torch.distributed._tensor.placement_types import Shard, Replicate, _Partial
+
+
+class DistTensorOpsTest(DTensorTestBase):
+    @with_comms
+    def test_aten_contiguous(self):
+        # this op not covered by dtensor_ops
+        mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        self._test_op(
+            mesh,
+            lambda x: torch.ops.aten.contiguous(x),
+            torch.randn(16, 32),
+        )
+
+    @with_comms
+    def test_detach(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        tensor_to_detach = torch.randn(12, 8, requires_grad=True)
+        mat = distribute_tensor(tensor_to_detach, device_mesh, shard_spec)
+        detached_mat = mat.detach()
+        self.assertFalse(detached_mat is mat)
+
+    @with_comms
+    def test_clone(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        specs = [[Replicate()], [Shard(0)]]
+        tensor_to_clone = torch.randn(12, 8, requires_grad=True)
+        for spec in specs:
+            mat = distribute_tensor(tensor_to_clone, device_mesh, spec)
+            cloned_mat = mat.clone()
+            self.assertFalse(cloned_mat is mat)
+            self.assertEqual(cloned_mat.to_local(), mat.to_local())
+
+    @with_comms
+    def test_contiguous(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        tensor = torch.rand(3, 5, 6, requires_grad=True)
+        sharding = [Shard(0)]
+        dist_tensor = DTensor.from_local(tensor, device_mesh, sharding)
+        self.assertTrue(dist_tensor.is_contiguous())
+        # shard on dim 0 should not change stride (30, 6, 1)
+        self.assertEqual(dist_tensor.stride(), tensor.stride())
+
+        new_dt = dist_tensor.transpose(0, 2)
+        self.assertFalse(new_dt.is_contiguous())
+        self.assertFalse(new_dt.to_local().is_contiguous())
+        # check stride
+        self.assertEqual(new_dt.stride(), (1, 6, 30))
+
+        new_dt = new_dt.contiguous()
+        self.assertTrue(new_dt.is_contiguous())
+        self.assertTrue(new_dt.to_local().is_contiguous())
+        # check stride
+        self.assertEqual(dist_tensor.stride(), tensor.stride())
+
+        # check backward
+        new_dt.to_local().sum().backward()
+        self.assertEqual(tensor.grad, torch.ones(3, 5, 6))
+
+    @with_comms
+    def test_inplace_op(self):
+        mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        input_tensor = torch.randn((12, 3), device=self.device_type)
+        dt_to_add = distribute_tensor(input_tensor, mesh, [Shard(0)])
+        dt_to_mul = dt_to_add.clone()
+        expected_add_dt = dt_to_add.clone() + 3
+        add_res = dt_to_add.add_(3)
+        expected_mul_dt = dt_to_mul.clone() * 3
+        mul_res = dt_to_mul.mul_(3)
+        # inplace op should be the same instance before and after
+        self.assertTrue(add_res is dt_to_add)
+        self.assertEqual(add_res.to_local(), expected_add_dt.to_local())
+
+        self.assertTrue(mul_res is dt_to_mul)
+        self.assertEqual(mul_res.to_local(), expected_mul_dt.to_local())
+
+        # test inplace op self and other dtensor with other specs
+        # and make sure out spec not change
+        shard_spec = [Shard(0)]
+        partial_spec = [_Partial()]
+        dt_to_inplace_add = distribute_tensor(input_tensor, mesh, shard_spec)
+        partial_grad = DTensor.from_local(
+            torch.randn(12, 3), mesh, partial_spec
+        )
+        res = dt_to_inplace_add.add_(partial_grad)
+        self.assertTrue(res is dt_to_inplace_add)
+        self.assertTrue(res.placements == shard_spec)
+
+    @with_comms
+    def test_op_out_variant(self):
+        mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        input_tensor = torch.randn((12, 3), device=self.device_type)
+        sharded_dt_input = distribute_tensor(input_tensor, mesh, [Shard(0)])
+        expected_dt = sharded_dt_input.clone() + 3
+        sharded_dt_out = sharded_dt_input.clone()
+        res = torch.add(sharded_dt_input, 3, out=sharded_dt_out)
+        # op out variant should be the same instance before and after
+        self.assertTrue(res is sharded_dt_out)
+        self.assertEqual(sharded_dt_out.to_local(), expected_dt.to_local())
+
+        # test op out variant with other spec and make sure out spec not change
+        replica_spec = [Replicate()]
+        replicate_out = distribute_tensor(input_tensor, mesh, replica_spec)
+        expected_dt = replicate_out.clone() + 3
+        res = torch.add(sharded_dt_input, 3, out=replicate_out)
+        self.assertTrue(res is replicate_out)
+        self.assertTrue(res.placements == replica_spec)
+        self.assertEqual(replicate_out.to_local(), expected_dt.to_local())
+
+    @with_comms
+    def test_empty_like(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        input_tensor = torch.randn(4, 8, requires_grad=True)
+        dist_tensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+        empty_like_dt = torch.empty_like(dist_tensor)
+        # empty is not deterministic, so we only check that the shard propagation worked
+        self.assertEqual((4, 8), empty_like_dt.to_local().shape)
+
+    @with_comms
+    def test_fill_inplace(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        input_tensor = torch.randn(4, 8, requires_grad=True)
+        dist_tensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+        full_like_dt = torch.fill_(dist_tensor, 42.0)
+        full_expected = torch.full((4, 8), 42.0)
+        self.assertEqual(full_expected, full_like_dt.to_local())
+        self.assertEqual(full_expected, dist_tensor.to_local())
+
+    @with_comms
+    def test_full_like(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        input_tensor = torch.randn(4, 8, requires_grad=True)
+        dist_tensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+        full_like_dt = torch.full_like(dist_tensor, 42.0)
+        full_expected = torch.full((4, 8), 42.0)
+        self.assertEqual(full_expected, full_like_dt.to_local())
+
+    @with_comms
+    def test_ones_like(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        input_tensor = torch.randn(4, 8, requires_grad=True)
+        dist_tensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+        ones_like_dt = torch.ones_like(dist_tensor)
+        ones_expected = torch.ones(4, 8)
+        self.assertEqual(ones_expected, ones_like_dt.to_local())
+
+    @with_comms
+    def test_ones_like_partial_sum(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [_Partial()]
+
+        input_tensor = torch.randn(4, 8, requires_grad=True)
+        dist_tensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+        assert dist_tensor.shape == (4, 8)
+
+        ones_like_dt = torch.ones_like(dist_tensor)
+        ones_expected = torch.ones(dist_tensor.shape)
+        self.assertEqual(
+            ones_expected,
+            ones_like_dt.redistribute(device_mesh, [Replicate()]).to_local(),
+        )
+
+    @with_comms
+    def test_fill_inplace_partial_sum(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [_Partial()]
+
+        input_tensor = torch.randn(4, 8, requires_grad=True)
+        dist_tensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+        assert dist_tensor.shape == (4, 8)
+
+        torch.fill_(dist_tensor, 42)
+        fill_expected = torch.full(
+            dist_tensor.shape, 42, dtype=input_tensor.dtype
+        )
+        self.assertEqual(
+            fill_expected,
+            dist_tensor.redistribute(device_mesh, [Replicate()]).to_local(),
+        )
+
+    @with_comms
+    def test_zeros_like_partial_sum(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [_Partial()]
+
+        input_tensor = torch.randn(4, 8, requires_grad=True)
+        dist_tensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+        assert dist_tensor.shape == (4, 8)
+
+        zeros_like_dt = torch.zeros_like(dist_tensor)
+        zeros_expected = torch.zeros(dist_tensor.shape)
+        self.assertEqual(
+            zeros_expected,
+            zeros_like_dt.redistribute(device_mesh, [Replicate()]).to_local(),
+        )
+
+    @with_comms
+    def test_zero_inplace(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        input_tensor = torch.randn(4, 8, requires_grad=True)
+        dist_tensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+        zeros_like_dt = torch.zero_(dist_tensor)
+        zeros_expected = torch.zeros(4, 8)
+        self.assertEqual(zeros_expected, zeros_like_dt.to_local())
+        self.assertEqual(zeros_expected, dist_tensor.to_local())
+
+    @with_comms
+    def test_zeros_like(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        shard_spec = [Shard(0)]
+
+        input_tensor = torch.randn(4, 8, requires_grad=True)
+        dist_tensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
+        zeros_like_dt = torch.zeros_like(dist_tensor)
+        zeros_expected = torch.zeros(4, 8)
+        self.assertEqual(zeros_expected, zeros_like_dt.to_local())
+
+    def _test_op(self, mesh, op_call, *args, **kwargs):
+        out = op_call(*args, **kwargs)
+        dtc = DTensorConverter(mesh, args, kwargs)
+        for d_args, d_kwargs in dtc:
+            self.assertTrue(dtc.successful())
+            d_out = op_call(*d_args, **d_kwargs)
+            self.assertEqual(
+                d_out.redistribute(mesh, [Replicate()] * mesh.ndim).to_local(),
+                out,
+            )
+
+    @with_comms
+    def test_index(self):
+        meshes = [
+            DeviceMesh(
+                self.device_type, list(range(self.world_size))
+            ),  # 1D mesh
+            # TODO(@azzolini): un-comment when DTensorConverter supports N-D mesh
+            # DeviceMesh(self.device_type, torch.arange(self.world_size).reshape(2, -1)), # 2D mesh
+        ]
+        for mesh in meshes:
+            self._test_op(
+                mesh,
+                lambda x, y: x[y],
+                torch.randn(16, 32, 16),
+                torch.randint(5, (4, 8)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y: x.index_select(1, y),
+                torch.randn(16, 32, 16),
+                torch.randint(5, (4,)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y: x.index_select(0, y),
+                torch.randn(16, 32, 16),
+                torch.randint(5, (4,)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y: x[y],
+                torch.randn(16, 32, 16),
+                torch.randint(5, (12,)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y: x[:, y],
+                torch.randn(16, 32, 16),
+                torch.randint(5, (4, 8)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y: x[..., y],
+                torch.randn(16, 32, 16),
+                torch.randint(5, (4, 12)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y: x[..., y],
+                torch.randn(16, 32, 16),
+                torch.randint(5, (4, 8, 16)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y, z: x[z, y],
+                torch.randn(16, 32, 16),
+                torch.randint(5, (12, 8, 12)),
+                torch.randint(2, (12, 8, 12)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y, z: x[z, :, y],
+                torch.randn(16, 32, 16),
+                torch.randint(5, (12, 8, 12)),
+                torch.randint(2, (12, 8, 12)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y, z: x[:, z, :, y],
+                torch.randn(16, 32, 16, 12),
+                torch.randint(5, (12, 8, 12)),
+                torch.randint(2, (12, 8, 12)),
+            )
+            # broadcast in inner dimensions
+            self._test_op(
+                mesh,
+                lambda x, y, z: x[:, z, :, y],
+                torch.randn(16, 32, 16, 12),
+                torch.randint(5, (12, 8, 12)),
+                torch.randint(2, (12, 1, 12)),
+            )
+            # implicit (left-padded) broadcast
+            self._test_op(
+                mesh,
+                lambda x, y, z: x[:, z, :, y],
+                torch.randn(16, 32, 16, 12),
+                torch.randint(5, (12, 8, 12)),
+                torch.randint(2, (8, 12)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y, z: x[z, y, :, :],
+                torch.randn(16, 32, 16, 12),
+                torch.randint(2, (8, 12)),
+                torch.randint(5, (12, 8, 12)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y, z: x[z, :, y, :],
+                torch.randn(16, 32, 16, 12),
+                torch.randint(2, (8, 12)),
+                torch.randint(5, (12, 8, 12)),
+            )
+            self._test_op(
+                mesh,
+                lambda x, y, z: x[z, :, :, y],
+                torch.randn(16, 32, 16, 12),
+                torch.randint(2, (8, 1)),
+                torch.randint(5, (12, 8, 12)),
+            )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/testing/_internal/distributed/_tensor/dtensor_lagging_op_db.py b/torch/testing/_internal/distributed/_tensor/dtensor_lagging_op_db.py
new file mode 100644
index 0000000000000..abd0ccfe0a099
--- /dev/null
+++ b/torch/testing/_internal/distributed/_tensor/dtensor_lagging_op_db.py
@@ -0,0 +1,661 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List
+from torch.testing._internal.common_methods_invocations import op_db, OpInfo
+
+# Generated from test/gen_dtensor_op_db.py via
+# python spmd/testing/gen_dtensor_lagging_op_db.py > spmd/testing/dtensor_lagging_op_db.py
+#
+# This approach is copied from functorch:
+# People add new OpInfos to PyTorch all the time.
+# We want them to be able to add OpInfos without breaking our CI.
+# To achieve this, we keep our OpInfo library behind that of Pytorch's and
+# we periodically update our OpInfo library by regenerating this file
+_dtensor_lagging_meta = {
+    ("H", ""),
+    ("T", ""),
+    ("__getitem__", ""),
+    ("__radd__", ""),
+    ("__rand__", ""),
+    ("__rdiv__", ""),
+    ("__rmatmul__", ""),
+    ("__rmod__", ""),
+    ("__rmul__", ""),
+    ("__ror__", ""),
+    ("__rpow__", ""),
+    ("__rsub__", ""),
+    ("__rxor__", ""),
+    ("abs", ""),
+    ("acos", ""),
+    ("acosh", ""),
+    ("add", ""),
+    ("addbmm", ""),
+    ("addcdiv", ""),
+    ("addcmul", ""),
+    ("addmm", ""),
+    ("addmm", "decomposed"),
+    ("addmv", ""),
+    ("addr", ""),
+    ("all", ""),
+    ("allclose", ""),
+    ("amax", ""),
+    ("amin", ""),
+    ("aminmax", ""),
+    ("angle", ""),
+    ("any", ""),
+    ("arange", ""),
+    ("argmax", ""),
+    ("argmin", ""),
+    ("argsort", ""),
+    ("argwhere", ""),
+    ("as_strided", ""),
+    ("as_strided_scatter", ""),
+    ("asin", ""),
+    ("asinh", ""),
+    ("atan", ""),
+    ("atan2", ""),
+    ("atanh", ""),
+    ("atleast_1d", ""),
+    ("atleast_2d", ""),
+    ("atleast_3d", ""),
+    ("baddbmm", ""),
+    ("bernoulli", ""),
+    ("bfloat16", ""),
+    ("bincount", ""),
+    ("bitwise_and", ""),
+    ("bitwise_left_shift", ""),
+    ("bitwise_not", ""),
+    ("bitwise_or", ""),
+    ("bitwise_right_shift", ""),
+    ("bitwise_xor", ""),
+    ("block_diag", ""),
+    ("bmm", ""),
+    ("bool", ""),
+    ("broadcast_shapes", ""),
+    ("broadcast_tensors", ""),
+    ("broadcast_to", ""),
+    ("bucketize", ""),
+    ("byte", ""),
+    ("cartesian_prod", ""),
+    ("cat", ""),
+    ("cdist", ""),
+    ("cdouble", ""),
+    ("ceil", ""),
+    ("cfloat", ""),
+    ("chalf", ""),
+    ("char", ""),
+    ("cholesky", ""),
+    ("cholesky_inverse", ""),
+    ("cholesky_solve", ""),
+    ("chunk", ""),
+    ("clamp", ""),
+    ("clamp_max", ""),
+    ("clamp_min", ""),
+    ("clone", ""),
+    ("column_stack", ""),
+    ("combinations", ""),
+    ("complex", ""),
+    ("conj", ""),
+    ("conj_physical", ""),
+    ("constant_pad_nd", ""),
+    ("contiguous", ""),
+    ("copysign", ""),
+    ("corrcoef", ""),
+    ("cos", ""),
+    ("cosh", ""),
+    ("count_nonzero", ""),
+    ("cov", ""),
+    ("cross", ""),
+    ("cummax", ""),
+    ("cummin", ""),
+    ("cumprod", ""),
+    ("cumsum", ""),
+    ("cumulative_trapezoid", ""),
+    ("deg2rad", ""),
+    ("diag", ""),
+    ("diag_embed", ""),
+    ("diagflat", ""),
+    ("diagonal", ""),
+    ("diagonal_copy", ""),
+    ("diagonal_scatter", ""),
+    ("diff", ""),
+    ("digamma", ""),
+    ("dist", ""),
+    ("div", "floor_rounding"),
+    ("div", "no_rounding_mode"),
+    ("div", "trunc_rounding"),
+    ("dot", ""),
+    ("double", ""),
+    ("dsplit", ""),
+    ("dstack", ""),
+    ("einsum", ""),
+    ("empty", ""),
+    ("empty_like", ""),
+    ("eq", ""),
+    ("equal", ""),
+    ("erf", ""),
+    ("erfc", ""),
+    ("erfinv", ""),
+    ("exp", ""),
+    ("exp2", ""),
+    ("expand", ""),
+    ("expand_as", ""),
+    ("expm1", ""),
+    ("eye", ""),
+    ("fft.fft", ""),
+    ("fft.fft2", ""),
+    ("fft.fftn", ""),
+    ("fft.fftshift", ""),
+    ("fft.hfft", ""),
+    ("fft.hfft2", ""),
+    ("fft.hfftn", ""),
+    ("fft.ifft", ""),
+    ("fft.ifft2", ""),
+    ("fft.ifftn", ""),
+    ("fft.ifftshift", ""),
+    ("fft.ihfft", ""),
+    ("fft.ihfft2", ""),
+    ("fft.ihfftn", ""),
+    ("fft.irfft", ""),
+    ("fft.irfft2", ""),
+    ("fft.irfftn", ""),
+    ("fft.rfft", ""),
+    ("fft.rfft2", ""),
+    ("fft.rfftn", ""),
+    ("fill", ""),
+    ("flatten", ""),
+    ("flip", ""),
+    ("fliplr", ""),
+    ("flipud", ""),
+    ("float", ""),
+    ("float_power", ""),
+    ("floor", ""),
+    ("floor_divide", ""),
+    ("fmax", ""),
+    ("fmin", ""),
+    ("fmod", ""),
+    ("frac", ""),
+    ("frexp", ""),
+    ("full", ""),
+    ("full_like", ""),
+    ("gather", ""),
+    ("gcd", ""),
+    ("ge", ""),
+    ("geqrf", ""),
+    ("gradient", ""),
+    ("gt", ""),
+    ("half", ""),
+    ("heaviside", ""),
+    ("histc", ""),
+    ("histogram", ""),
+    ("histogramdd", ""),
+    ("hsplit", ""),
+    ("hstack", ""),
+    ("hypot", ""),
+    ("i0", ""),
+    ("igamma", ""),
+    ("igammac", ""),
+    ("imag", ""),
+    ("index_add", ""),
+    ("index_copy", ""),
+    ("index_fill", ""),
+    ("index_put", ""),
+    ("index_reduce", ""),
+    ("index_select", ""),
+    ("inner", ""),
+    ("int", ""),
+    ("isclose", ""),
+    ("isfinite", ""),
+    ("isin", ""),
+    ("isinf", ""),
+    ("isnan", ""),
+    ("isneginf", ""),
+    ("isposinf", ""),
+    ("isreal", ""),
+    ("istft", ""),
+    ("jiterator_2inputs_2outputs", ""),
+    ("jiterator_4inputs_with_extra_args", ""),
+    ("jiterator_binary", ""),
+    ("jiterator_binary_return_by_ref", ""),
+    ("jiterator_unary", ""),
+    ("kron", ""),
+    ("kthvalue", ""),
+    ("lcm", ""),
+    ("ldexp", ""),
+    ("le", ""),
+    ("lerp", ""),
+    ("lgamma", ""),
+    ("linalg.cholesky", ""),
+    ("linalg.cholesky_ex", ""),
+    ("linalg.cond", ""),
+    ("linalg.cross", ""),
+    ("linalg.det", ""),
+    ("linalg.det", "singular"),
+    ("linalg.eig", ""),
+    ("linalg.eigh", ""),
+    ("linalg.eigvals", ""),
+    ("linalg.eigvalsh", ""),
+    ("linalg.householder_product", ""),
+    ("linalg.inv", ""),
+    ("linalg.inv_ex", ""),
+    ("linalg.ldl_factor", ""),
+    ("linalg.ldl_factor_ex", ""),
+    ("linalg.ldl_solve", ""),
+    ("linalg.lstsq", ""),
+    ("linalg.lstsq", "grad_oriented"),
+    ("linalg.lu", ""),
+    ("linalg.lu_factor", ""),
+    ("linalg.lu_factor_ex", ""),
+    ("linalg.lu_solve", ""),
+    ("linalg.matrix_norm", ""),
+    ("linalg.matrix_power", ""),
+    ("linalg.matrix_rank", ""),
+    ("linalg.matrix_rank", "hermitian"),
+    ("linalg.multi_dot", ""),
+    ("linalg.norm", ""),
+    ("linalg.norm", "subgradients_at_zero"),
+    ("linalg.pinv", ""),
+    ("linalg.pinv", "hermitian"),
+    ("linalg.pinv", "singular"),
+    ("linalg.qr", ""),
+    ("linalg.slogdet", ""),
+    ("linalg.solve", ""),
+    ("linalg.solve_ex", ""),
+    ("linalg.solve_triangular", ""),
+    ("linalg.svd", ""),
+    ("linalg.svdvals", ""),
+    ("linalg.tensorinv", ""),
+    ("linalg.tensorsolve", ""),
+    ("linalg.vander", ""),
+    ("linalg.vecdot", ""),
+    ("linalg.vector_norm", ""),
+    ("linspace", ""),
+    ("log", ""),
+    ("log10", ""),
+    ("log1p", ""),
+    ("log2", ""),
+    ("log_softmax", ""),
+    ("log_softmax", "with_dtype"),
+    ("logaddexp", ""),
+    ("logaddexp2", ""),
+    ("logcumsumexp", ""),
+    ("logdet", ""),
+    ("logical_and", ""),
+    ("logical_not", ""),
+    ("logical_or", ""),
+    ("logical_xor", ""),
+    ("logit", ""),
+    ("logspace", ""),
+    ("logsumexp", ""),
+    ("long", ""),
+    ("lt", ""),
+    ("lu", ""),
+    ("lu_solve", ""),
+    ("lu_unpack", ""),
+    ("mH", ""),
+    ("mT", ""),
+    ("masked.amax", ""),
+    ("masked.amin", ""),
+    ("masked.argmax", ""),
+    ("masked.argmin", ""),
+    ("masked.cumprod", ""),
+    ("masked.cumsum", ""),
+    ("masked.log_softmax", ""),
+    ("masked.logaddexp", ""),
+    ("masked.logsumexp", ""),
+    ("masked.mean", ""),
+    ("masked.median", ""),
+    ("masked.norm", ""),
+    ("masked.normalize", ""),
+    ("masked.prod", ""),
+    ("masked.softmax", ""),
+    ("masked.softmin", ""),
+    ("masked.std", ""),
+    ("masked.sum", ""),
+    ("masked.var", ""),
+    ("masked_fill", ""),
+    ("masked_scatter", ""),
+    ("masked_select", ""),
+    ("matmul", ""),
+    ("matrix_exp", ""),
+    ("max", "binary"),
+    ("max", "reduction_no_dim"),
+    ("max", "reduction_with_dim"),
+    ("max_pool2d_with_indices_backward", ""),
+    ("maximum", ""),
+    ("mean", ""),
+    ("median", ""),
+    ("meshgrid", "list_of_tensors"),
+    ("meshgrid", "variadic_tensors"),
+    ("min", "binary"),
+    ("min", "reduction_no_dim"),
+    ("min", "reduction_with_dim"),
+    ("minimum", ""),
+    ("mm", ""),
+    ("mode", ""),
+    ("movedim", ""),
+    ("msort", ""),
+    ("mul", ""),
+    ("multinomial", ""),
+    ("mv", ""),
+    ("mvlgamma", "mvlgamma_p_1"),
+    ("mvlgamma", "mvlgamma_p_3"),
+    ("mvlgamma", "mvlgamma_p_5"),
+    ("nan_to_num", ""),
+    ("nanmean", ""),
+    ("nanmedian", ""),
+    ("nanquantile", ""),
+    ("nansum", ""),
+    ("narrow", ""),
+    ("narrow_copy", ""),
+    ("native_batch_norm", ""),
+    ("native_layer_norm", ""),
+    ("ne", ""),
+    ("neg", ""),
+    ("new_empty", ""),
+    ("new_empty_strided", ""),
+    ("new_full", ""),
+    ("new_ones", ""),
+    ("new_zeros", ""),
+    ("nextafter", ""),
+    ("nn.functional._scaled_dot_product_attention", ""),
+    ("nn.functional.adaptive_avg_pool1d", ""),
+    ("nn.functional.adaptive_avg_pool2d", ""),
+    ("nn.functional.adaptive_avg_pool3d", ""),
+    ("nn.functional.adaptive_max_pool1d", ""),
+    ("nn.functional.adaptive_max_pool2d", ""),
+    ("nn.functional.adaptive_max_pool3d", ""),
+    ("nn.functional.alpha_dropout", ""),
+    ("nn.functional.avg_pool1d", ""),
+    ("nn.functional.avg_pool2d", ""),
+    ("nn.functional.avg_pool3d", ""),
+    ("nn.functional.batch_norm", ""),
+    ("nn.functional.batch_norm", "without_cudnn"),
+    ("nn.functional.bilinear", ""),
+    ("nn.functional.binary_cross_entropy", ""),
+    ("nn.functional.binary_cross_entropy_with_logits", ""),
+    ("nn.functional.celu", ""),
+    ("nn.functional.conv1d", ""),
+    ("nn.functional.conv2d", ""),
+    ("nn.functional.conv_transpose1d", ""),
+    ("nn.functional.conv_transpose2d", ""),
+    ("nn.functional.conv_transpose3d", ""),
+    ("nn.functional.cosine_embedding_loss", ""),
+    ("nn.functional.cosine_similarity", ""),
+    ("nn.functional.cross_entropy", ""),
+    ("nn.functional.ctc_loss", ""),
+    ("nn.functional.dropout", ""),
+    ("nn.functional.dropout2d", ""),
+    ("nn.functional.dropout3d", ""),
+    ("nn.functional.elu", ""),
+    ("nn.functional.embedding", ""),
+    ("nn.functional.embedding_bag", ""),
+    ("nn.functional.feature_alpha_dropout", "with_train"),
+    ("nn.functional.feature_alpha_dropout", "without_train"),
+    ("nn.functional.fractional_max_pool2d", ""),
+    ("nn.functional.fractional_max_pool3d", ""),
+    ("nn.functional.gaussian_nll_loss", ""),
+    ("nn.functional.gelu", ""),
+    ("nn.functional.glu", ""),
+    ("nn.functional.grid_sample", ""),
+    ("nn.functional.group_norm", ""),
+    ("nn.functional.hardshrink", ""),
+    ("nn.functional.hardsigmoid", ""),
+    ("nn.functional.hardswish", ""),
+    ("nn.functional.hardtanh", ""),
+    ("nn.functional.hinge_embedding_loss", ""),
+    ("nn.functional.huber_loss", ""),
+    ("nn.functional.instance_norm", ""),
+    ("nn.functional.interpolate", "area"),
+    ("nn.functional.interpolate", "bicubic"),
+    ("nn.functional.interpolate", "bilinear"),
+    ("nn.functional.interpolate", "linear"),
+    ("nn.functional.interpolate", "nearest"),
+    ("nn.functional.interpolate", "trilinear"),
+    ("nn.functional.kl_div", ""),
+    ("nn.functional.l1_loss", ""),
+    ("nn.functional.layer_norm", ""),
+    ("nn.functional.leaky_relu", ""),
+    ("nn.functional.linear", ""),
+    ("nn.functional.local_response_norm", ""),
+    ("nn.functional.logsigmoid", ""),
+    ("nn.functional.margin_ranking_loss", ""),
+    ("nn.functional.max_pool1d", ""),
+    ("nn.functional.max_pool2d", ""),
+    ("nn.functional.max_pool3d", ""),
+    ("nn.functional.max_unpool1d", ""),
+    ("nn.functional.max_unpool1d", "grad"),
+    ("nn.functional.max_unpool2d", ""),
+    ("nn.functional.max_unpool2d", "grad"),
+    ("nn.functional.max_unpool3d", ""),
+    ("nn.functional.max_unpool3d", "grad"),
+    ("nn.functional.mish", ""),
+    ("nn.functional.mse_loss", ""),
+    ("nn.functional.multi_margin_loss", ""),
+    ("nn.functional.multilabel_margin_loss", ""),
+    ("nn.functional.multilabel_soft_margin_loss", ""),
+    ("nn.functional.nll_loss", ""),
+    ("nn.functional.normalize", ""),
+    ("nn.functional.one_hot", ""),
+    ("nn.functional.pad", "circular"),
+    ("nn.functional.pad", "constant"),
+    ("nn.functional.pad", "reflect"),
+    ("nn.functional.pad", "replicate"),
+    ("nn.functional.pairwise_distance", ""),
+    ("nn.functional.pdist", ""),
+    ("nn.functional.pixel_shuffle", ""),
+    ("nn.functional.pixel_unshuffle", ""),
+    ("nn.functional.poisson_nll_loss", ""),
+    ("nn.functional.prelu", ""),
+    ("nn.functional.relu", ""),
+    ("nn.functional.relu6", ""),
+    ("nn.functional.rrelu", ""),
+    ("nn.functional.selu", ""),
+    ("nn.functional.silu", ""),
+    ("nn.functional.silu", "complex"),
+    ("nn.functional.smooth_l1_loss", ""),
+    ("nn.functional.soft_margin_loss", ""),
+    ("nn.functional.softmin", ""),
+    ("nn.functional.softmin", "with_dtype"),
+    ("nn.functional.softplus", ""),
+    ("nn.functional.softshrink", ""),
+    ("nn.functional.softsign", ""),
+    ("nn.functional.tanhshrink", ""),
+    ("nn.functional.threshold", ""),
+    ("nn.functional.triplet_margin_loss", ""),
+    ("nn.functional.triplet_margin_with_distance_loss", ""),
+    ("nn.functional.unfold", ""),
+    ("nn.functional.upsample_bilinear", ""),
+    ("nn.functional.upsample_nearest", ""),
+    ("nonzero", ""),
+    ("norm", ""),
+    ("norm", "fro"),
+    ("norm", "inf"),
+    ("norm", "nuc"),
+    ("normal", ""),
+    ("normal", "number_mean"),
+    ("ones", ""),
+    ("ones_like", ""),
+    ("ormqr", ""),
+    ("outer", ""),
+    ("pca_lowrank", ""),
+    ("permute", ""),
+    ("pinverse", ""),
+    ("polar", ""),
+    ("polygamma", "polygamma_n_0"),
+    ("polygamma", "polygamma_n_1"),
+    ("polygamma", "polygamma_n_2"),
+    ("polygamma", "polygamma_n_3"),
+    ("polygamma", "polygamma_n_4"),
+    ("positive", ""),
+    ("pow", ""),
+    ("prod", ""),
+    ("put", ""),
+    ("qr", ""),
+    ("quantile", ""),
+    ("rad2deg", ""),
+    ("rand_like", ""),
+    ("randint", ""),
+    ("randint_like", ""),
+    ("randn", ""),
+    ("randn_like", ""),
+    ("ravel", ""),
+    ("real", ""),
+    ("reciprocal", ""),
+    ("remainder", ""),
+    ("renorm", ""),
+    ("repeat", ""),
+    ("repeat_interleave", ""),
+    ("reshape", ""),
+    ("reshape_as", ""),
+    ("resize_", ""),
+    ("resize_as_", ""),
+    ("resolve_conj", ""),
+    ("resolve_neg", ""),
+    ("roll", ""),
+    ("rot90", ""),
+    ("round", ""),
+    ("round", "decimals_0"),
+    ("round", "decimals_3"),
+    ("round", "decimals_neg_3"),
+    ("rsqrt", ""),
+    ("rsub", ""),
+    ("scalar_tensor", ""),
+    ("scatter", ""),
+    ("scatter_add", ""),
+    ("scatter_reduce", "amax"),
+    ("scatter_reduce", "amin"),
+    ("scatter_reduce", "mean"),
+    ("scatter_reduce", "prod"),
+    ("scatter_reduce", "sum"),
+    ("searchsorted", ""),
+    ("segment_reduce", "lengths"),
+    ("segment_reduce", "offsets"),
+    ("select", ""),
+    ("select_scatter", ""),
+    ("sgn", ""),
+    ("short", ""),
+    ("sigmoid", ""),
+    ("sign", ""),
+    ("signal.windows.cosine", ""),
+    ("signal.windows.exponential", ""),
+    ("signal.windows.gaussian", ""),
+    ("signal.windows.kaiser", ""),
+    ("signbit", ""),
+    ("sin", ""),
+    ("sinc", ""),
+    ("sinh", ""),
+    ("slice", ""),
+    ("slice_scatter", ""),
+    ("softmax", ""),
+    ("softmax", "with_dtype"),
+    ("sort", ""),
+    ("sparse.sampled_addmm", ""),
+    ("special.airy_ai", ""),
+    ("special.bessel_j0", ""),
+    ("special.bessel_j1", ""),
+    ("special.bessel_y0", ""),
+    ("special.bessel_y1", ""),
+    ("special.chebyshev_polynomial_t", ""),
+    ("special.chebyshev_polynomial_u", ""),
+    ("special.chebyshev_polynomial_v", ""),
+    ("special.chebyshev_polynomial_w", ""),
+    ("special.entr", ""),
+    ("special.erfcx", ""),
+    ("special.hermite_polynomial_h", ""),
+    ("special.hermite_polynomial_he", ""),
+    ("special.i0e", ""),
+    ("special.i1", ""),
+    ("special.i1e", ""),
+    ("special.laguerre_polynomial_l", ""),
+    ("special.legendre_polynomial_p", ""),
+    ("special.log_ndtr", ""),
+    ("special.modified_bessel_i0", ""),
+    ("special.modified_bessel_i1", ""),
+    ("special.modified_bessel_k0", ""),
+    ("special.modified_bessel_k1", ""),
+    ("special.ndtr", ""),
+    ("special.ndtri", ""),
+    ("special.polygamma", "special_polygamma_n_0"),
+    ("special.scaled_modified_bessel_k0", ""),
+    ("special.scaled_modified_bessel_k1", ""),
+    ("special.shifted_chebyshev_polynomial_t", ""),
+    ("special.shifted_chebyshev_polynomial_u", ""),
+    ("special.shifted_chebyshev_polynomial_v", ""),
+    ("special.shifted_chebyshev_polynomial_w", ""),
+    ("special.spherical_bessel_j0", ""),
+    ("special.xlog1py", ""),
+    ("special.zeta", ""),
+    ("split", ""),
+    ("split", "list_args"),
+    ("split_with_sizes", ""),
+    ("sqrt", ""),
+    ("square", ""),
+    ("squeeze", ""),
+    ("stack", ""),
+    ("std", ""),
+    ("std_mean", ""),
+    ("stft", ""),
+    ("sub", ""),
+    ("sum", ""),
+    ("sum_to_size", ""),
+    ("svd", ""),
+    ("svd_lowrank", ""),
+    ("symeig", ""),
+    ("t", ""),
+    ("take", ""),
+    ("take_along_dim", ""),
+    ("tan", ""),
+    ("tanh", ""),
+    ("tensor_split", ""),
+    ("tensordot", ""),
+    ("tile", ""),
+    ("to", ""),
+    ("to_sparse", ""),
+    ("topk", ""),
+    ("trace", ""),
+    ("transpose", ""),
+    ("trapezoid", ""),
+    ("trapz", ""),
+    ("triangular_solve", ""),
+    ("tril", ""),
+    ("tril_indices", ""),
+    ("triu", ""),
+    ("triu_indices", ""),
+    ("true_divide", ""),
+    ("trunc", ""),
+    ("unbind", ""),
+    ("unflatten", ""),
+    ("unfold", ""),
+    ("unfold_copy", ""),
+    ("uniform", ""),
+    ("unique", ""),
+    ("unique_consecutive", ""),
+    ("unsqueeze", ""),
+    ("var", ""),
+    ("var_mean", ""),
+    ("vdot", ""),
+    ("view", ""),
+    ("view_as", ""),
+    ("view_as_complex", ""),
+    ("view_as_real", ""),
+    ("vsplit", ""),
+    ("vstack", ""),
+    ("where", ""),
+    ("xlogy", ""),
+    ("zero_", ""),
+    ("zeros", ""),
+    ("zeros_like", ""),
+}
+
+
+def in_dtensor_lagging_op_db(opinfo: OpInfo) -> bool:
+    return (opinfo.name, opinfo.variant_test_name) in _dtensor_lagging_meta
+
+
+dtensor_lagging_op_db: List[OpInfo] = [
+    opinfo for opinfo in op_db if in_dtensor_lagging_op_db(opinfo)
+]
diff --git a/torch/testing/_internal/distributed/_tensor/gen_dtensor_lagging_op_db.py b/torch/testing/_internal/distributed/_tensor/gen_dtensor_lagging_op_db.py
new file mode 100644
index 0000000000000..f684f77ed2c4c
--- /dev/null
+++ b/torch/testing/_internal/distributed/_tensor/gen_dtensor_lagging_op_db.py
@@ -0,0 +1,67 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Tuple
+from torch.testing._internal.common_methods_invocations import op_db
+
+
+def num_leading_spaces(line: str) -> int:
+    result = len(line) - len(line.lstrip())
+    # Empty space handling
+    if result == 0:
+        return 999999
+    return result
+
+
+def deindent(code: str) -> str:
+    lines = code.split("\n")
+    min_leading_spaces = min(map(num_leading_spaces, lines))
+    lines = [line[min_leading_spaces:] for line in lines]
+    return "\n".join(lines)
+
+
+if __name__ == "__main__":
+    supported: List[Tuple[str, str]] = [
+        (opinfo.name, opinfo.variant_test_name) for opinfo in op_db
+    ]
+    supported = sorted(supported)
+    print(
+        deindent(
+            """\
+    # Copyright (c) Facebook, Inc. and its affiliates.
+    # All rights reserved.
+    #
+    # This source code is licensed under the BSD-style license found in the
+    # LICENSE file in the root directory of this source tree.
+    from typing import List
+    from torch.testing._internal.common_methods_invocations import op_db, OpInfo
+    # Generated from test/gen_dtensor_op_db.py via
+    # python spmd/testing/gen_dtensor_lagging_op_db.py > spmd/testing/dtensor_lagging_op_db.py
+    #
+    # This approach is copied from functorch:
+    # People add new OpInfos to PyTorch all the time.
+    # We want them to be able to add OpInfos without breaking our CI.
+    # To achieve this, we keep our OpInfo library behind that of Pytorch's and
+    # we periodically update our OpInfo library by regenerating this file"""
+        )
+    )
+
+    print("_dtensor_lagging_meta = {")
+    for name, variant in supported:
+        print(f"    {(name, variant)},")
+    print("}")
+
+    print(
+        deindent(
+            """\
+    def in_dtensor_lagging_op_db(opinfo: OpInfo) -> bool:
+        return (opinfo.name, opinfo.variant_test_name) in _dtensor_lagging_meta
+
+    dtensor_lagging_op_db: List[OpInfo] = [
+        opinfo for opinfo in op_db if in_dtensor_lagging_op_db(opinfo)
+    ]"""
+        )
+    )

From 41e608583e2f21e67e97434a5546e237bf2b8318 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 15 Nov 2022 22:51:33 +0000
Subject: [PATCH 0953/1922] [dtensor] PART 8: move tensor parallel api and
 tests to core distributed (#88180)

This PR moves tensor/parallel folder and tests to torch.distributed.

part of https://github.com/pytorch/pytorch/issues/88838
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88180
Approved by: https://github.com/aazzolini
---
 test/distributed/_tensor/parallel/__init__.py |   0
 .../_tensor/parallel/test_2d_parallel.py      | 223 ++++++++
 .../_tensor/parallel/test_tp_examples.py      | 516 ++++++++++++++++++
 .../parallel/test_view_sharding_dim_change.py |  30 +
 .../distributed/_tensor/parallel/__init__.py  |  10 +
 .../_tensor/parallel/_view_with_dim_change.py | 108 ++++
 torch/distributed/_tensor/parallel/api.py     |  86 +++
 torch/distributed/_tensor/parallel/fsdp.py    | 357 ++++++++++++
 .../parallel/multihead_attention_tp.py        | 273 +++++++++
 9 files changed, 1603 insertions(+)
 create mode 100644 test/distributed/_tensor/parallel/__init__.py
 create mode 100644 test/distributed/_tensor/parallel/test_2d_parallel.py
 create mode 100644 test/distributed/_tensor/parallel/test_tp_examples.py
 create mode 100644 test/distributed/_tensor/parallel/test_view_sharding_dim_change.py
 create mode 100644 torch/distributed/_tensor/parallel/__init__.py
 create mode 100644 torch/distributed/_tensor/parallel/_view_with_dim_change.py
 create mode 100644 torch/distributed/_tensor/parallel/api.py
 create mode 100644 torch/distributed/_tensor/parallel/fsdp.py
 create mode 100644 torch/distributed/_tensor/parallel/multihead_attention_tp.py

diff --git a/test/distributed/_tensor/parallel/__init__.py b/test/distributed/_tensor/parallel/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/test/distributed/_tensor/parallel/test_2d_parallel.py b/test/distributed/_tensor/parallel/test_2d_parallel.py
new file mode 100644
index 0000000000000..7a3779c296c3c
--- /dev/null
+++ b/test/distributed/_tensor/parallel/test_2d_parallel.py
@@ -0,0 +1,223 @@
+# Owner(s): ["oncall: distributed"]
+
+from typing import Any
+
+
+import torch
+import torch.nn.functional as F
+import torch.distributed as dist
+from torch.distributed._shard.sharded_tensor.api import ShardedTensor
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
+from torch.distributed._tensor import (
+    distribute_tensor,
+    DeviceMesh,
+    DTensor as DT,
+    Shard,
+    Replicate,
+)
+
+import torch.distributed.distributed_c10d as distributed_c10d
+
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.distributed._tensor.parallel.fsdp import is_available
+
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+
+# Tensor-Parallel degree
+TP_DEGREE = 2
+LR = 3e-5
+
+OPS_NOT_SHARD = [
+    "net3.weight",
+    "net3.bias",
+]
+
+SHARD_PARAMS = [
+    "net1.weight",
+    "net1.bias",
+    "net2.weight",
+]
+
+
+class SimpleModel(torch.nn.Module):
+    def __init__(self):
+        super(SimpleModel, self).__init__()
+        self.net1 = torch.nn.Linear(5, 8)
+        self.relu = torch.nn.ReLU()
+        self.net2 = torch.nn.Linear(8, 4)
+        self.net3 = torch.nn.Linear(4, 12)
+
+    def forward(self, x):
+        x = F.relu(self.net1(x))
+        x = F.relu(self.net2(x))
+        x = F.relu(self.net3(x))
+        return x
+
+
+def _aggregate_local_tensor(module: torch.nn.Module) -> torch.nn.Module:
+    def hook_func(_module, _input, output):
+        if isinstance(output, DT):
+            replica_placement = [Replicate()]
+            return output.redistribute(
+                output.device_mesh, replica_placement
+            ).to_local()
+
+    module.register_forward_hook(hook_func)
+    return module
+
+
+def _replicate_input_tensor(
+    module: torch.nn.Module, device_mesh, replica_placement
+) -> torch.nn.Module:
+    def hook_func(_, input):
+        if not isinstance(input[0], DT):
+            return DT.from_local(
+                input[0], device_mesh, replica_placement, run_check=False
+            )
+
+    module.register_forward_pre_hook(hook_func)
+    return module
+
+
+def shard_module(m, pg):
+    start_idx = distributed_c10d.get_global_rank(pg, 0)
+    device_mesh = DeviceMesh(
+        "cuda", list(range(start_idx, start_idx + pg.size())), dim_groups=[pg]
+    )
+    col_wise_sharding = [Shard(0)]
+    row_wise_sharding = [Shard(1)]
+    replicate = [Replicate()]
+    m.net1.weight = torch.nn.Parameter(
+        distribute_tensor(m.net1.weight, device_mesh, col_wise_sharding),
+    )
+    m.net2.weight = torch.nn.Parameter(
+        distribute_tensor(m.net2.weight, device_mesh, row_wise_sharding)
+    )
+    m.net1.bias = torch.nn.Parameter(
+        distribute_tensor(m.net1.bias, device_mesh, col_wise_sharding)
+    )
+    m.net2.bias = torch.nn.Parameter(
+        distribute_tensor(m.net2.bias, device_mesh, replicate)
+    )
+    m = _replicate_input_tensor(m, device_mesh, replicate)
+    m.net2 = _aggregate_local_tensor(m.net2)
+
+
+def _shard_wrap_module(module, module_shard, fsdp_wrap, tp_pg, fsdp_pg):
+    if module_shard:
+        # Fetch the module sharding planner.
+        shard_module(module, tp_pg)
+
+    if fsdp_wrap and module_shard:
+        return FSDP(module, process_group=fsdp_pg)
+    if fsdp_wrap:
+        return FSDP(module, process_group=distributed_c10d._get_default_group())
+    return module
+
+
+def init_model(model_parallel_size=TP_DEGREE):
+    rank = dist.get_rank()
+    torch.cuda.set_device(rank)
+    world_size = dist.get_world_size()
+
+    model = SimpleModel().cuda(rank)
+
+    # 2-D mesh is [dp, tp]
+    twod_mesh = DeviceMesh(
+        device_type="cuda",
+        mesh=torch.arange(0, world_size).view(model_parallel_size, -1),
+    )
+
+    fsdp_pg = twod_mesh.get_dim_groups()[0]
+    tp_pg = twod_mesh.get_dim_groups()[1]
+
+    # Create Input
+    model = _shard_wrap_module(model, True, True, tp_pg, fsdp_pg)
+    return model, tp_pg, fsdp_pg
+
+
+def is_nested_tensor(val: Any) -> bool:
+    if isinstance(val, ShardedTensor):
+        if len(val.local_shards()) == 0:
+            return False
+        if isinstance(val.local_shards()[0].tensor, ShardedTensor):
+            return True
+        if isinstance(val.local_shards()[0].tensor, DT):
+            raise ValueError("Cannot handle DT nested insided ST")
+    # Safety valve for when this eventually happen
+    elif isinstance(val, DT) and isinstance(
+        val._local_tensor, (DT, ShardedTensor)
+    ):
+        raise ValueError("Cannot handle nested DT")
+    return False
+
+
+class Test2dParallelIntegration(DTensorTestBase):
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    def test_2d_fsdp_integration_functionality(self) -> None:
+        if not is_available():
+            self.skipTest("FSDP 2d parallel integration not available")
+
+        model_tp = init_model()[0]
+
+        with FSDP.state_dict_type(model_tp, StateDictType.SHARDED_STATE_DICT):
+            state_dict = model_tp.state_dict()
+            # TODO once 2D is out, validate the nesting
+            self.assertTrue(is_nested_tensor(state_dict["net1.weight"]))
+            self.assertFalse(is_nested_tensor(state_dict["net3.bias"]))
+
+        optim = torch.optim.Adam(model_tp.parameters(), lr=0.0001)
+
+        # Create Input
+        input_seed = self.rank
+        torch.manual_seed(input_seed + 1)
+        input = torch.rand(4, 5).cuda(self.rank)
+
+        model_tp(input).sum().backward()
+        optim.step()
+
+        optim_state = FSDP.sharded_optim_state_dict(model_tp, optim)
+        # TODO once 2D is out, validate the nesting
+        self.assertTrue(
+            is_nested_tensor(optim_state["state"]["net1.weight"]["exp_avg"])
+        )
+        self.assertFalse(
+            is_nested_tensor(optim_state["state"]["net3.bias"]["exp_avg"])
+        )
+
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    def test_2d_fsdp_integration_correctness(self) -> None:
+        if not is_available():
+            self.skipTest("FSDP 2d parallel integration not available")
+        torch.manual_seed(0)
+        model = SimpleModel().cuda(self.rank)
+        model = FSDP(model)
+        torch.manual_seed(0)
+        model_2d, _, dp_pg = init_model()
+
+        optim = torch.optim.Adam(model.parameters(), lr=0.0001)
+        optim_2d = torch.optim.Adam(model_2d.parameters(), lr=0.0001)
+
+        for i in range(5):
+            # Ensure all input across TP ranks are same.
+            torch.manual_seed(i + dist.get_rank(dp_pg))
+            input = torch.rand(4, 5).cuda(self.rank)
+            output = model(input)
+            output_2d = model_2d(input)
+            self.assertEqual(output, output_2d)
+            output.sum().backward()
+            output_2d.sum().backward()
+            optim.step()
+            optim_2d.step()
+            self.assertEqual(model(input), model_2d(input))
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/parallel/test_tp_examples.py b/test/distributed/_tensor/parallel/test_tp_examples.py
new file mode 100644
index 0000000000000..582108ea7599a
--- /dev/null
+++ b/test/distributed/_tensor/parallel/test_tp_examples.py
@@ -0,0 +1,516 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+import torch.nn as nn
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+    NUM_DEVICES,
+    skip_unless_torch_gpu,
+)
+from torch.distributed._tensor import (
+    distribute_tensor,
+    distribute_module,
+    DeviceMesh,
+    DTensor,
+    Shard,
+    Replicate,
+)
+from torch.distributed._tensor.parallel import (
+    TensorParallelMultiheadAttention,
+    tp_shard_self_attn,
+    replicate_input,
+    replicate_output,
+)
+
+
+class MLPModule(torch.nn.Module):
+    def __init__(self, device):
+        super(MLPModule, self).__init__()
+        torch.manual_seed(5)
+        self.net1 = torch.nn.Linear(10, 16, device=device)
+        self.relu = torch.nn.ReLU()
+        self.net2 = torch.nn.Linear(16, 12, device=device)
+
+    def forward(self, x):
+        return self.net2(self.relu(self.net1(x)))
+
+
+def _aggregate_local_tensor(module: torch.nn.Module) -> torch.nn.Module:
+    def hook_func(_module, _input, output):
+        if isinstance(output, DTensor):
+            replica_placement = [Replicate()] * device_mesh.ndim
+            return (
+                output.redistribute(output.device_mesh, replica_placement)
+                .contiguous()
+                .to_local()
+            )
+
+    module.register_forward_hook(hook_func)
+    return module
+
+
+def shard_mlp(m, device_type, tp_size):
+    start_idx = 0
+    device_mesh = DeviceMesh(
+        device_type,
+        list(range(start_idx, start_idx + tp_size)),
+    )
+    col_wise_sharding = [Shard(0)]
+    row_wise_sharding = [Shard(1)]
+    replicate = [Replicate()] * device_mesh.ndim
+
+    def shard_params(name, module, device_mesh):
+        if isinstance(module, nn.Linear):
+            if name == "net1":
+                sharded_weight = nn.Parameter(
+                    distribute_tensor(
+                        module.weight, device_mesh, col_wise_sharding
+                    )
+                )
+                sharded_bias = nn.Parameter(
+                    distribute_tensor(
+                        module.bias, device_mesh, col_wise_sharding
+                    )
+                )
+                module.register_parameter("weight", sharded_weight)
+                module.register_parameter("bias", sharded_bias)
+            elif name == "net2":
+                sharded_weight = nn.Parameter(
+                    distribute_tensor(
+                        module.weight, device_mesh, row_wise_sharding
+                    )
+                )
+                replicated_bias = nn.Parameter(
+                    distribute_tensor(module.bias, device_mesh, replicate)
+                )
+                module.register_parameter("weight", sharded_weight)
+                module.register_parameter("bias", replicated_bias)
+
+    def aggregate_output(outputs, device_mesh):
+        assert isinstance(outputs, DTensor)
+        return (
+            outputs.redistribute(device_mesh, replicate).contiguous().to_local()
+        )
+
+    dist_mod = distribute_module(
+        m,
+        device_mesh,
+        partition_fn=shard_params,
+        input_fn=replicate_input,
+        output_fn=aggregate_output,
+    )
+    return dist_mod
+
+
+class MultiheadAttnWrap(nn.Module):
+    def __init__(self, embed_dim, num_heads, add_bias_kv=False, device=None):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(
+            embed_dim, num_heads, add_bias_kv=add_bias_kv, device=device
+        )
+
+    def forward(self, query, key, value):
+        return self.attn(query, key, value)
+
+
+class DistTensorParallelExampleTest(DTensorTestBase):
+    @with_comms
+    def test_mlp_megatron_e2e(self):
+        inp_size = [5, 10]
+        # Ensure all tp ranks have same input.
+        torch.manual_seed(0)
+        inp = torch.rand(*inp_size, device=self.device_type)
+        model = MLPModule(self.device_type)
+        model_tp = MLPModule(self.device_type)
+
+        # Ensure model are initialized the same way.
+        self.assertEqual(model.net1.weight, model_tp.net1.weight)
+        self.assertEqual(model.net1.bias, model_tp.net1.bias)
+        self.assertEqual(model.net2.weight, model_tp.net2.weight)
+        self.assertEqual(model.net2.bias, model_tp.net2.bias)
+
+        # Shard module and initialize optimizer.
+        LR = 0.25
+        shard_mlp(model_tp, self.device_type, NUM_DEVICES)
+        optim = torch.optim.SGD(model.parameters(), lr=LR)
+        optim_tp = torch.optim.SGD(model_tp.parameters(), lr=LR)
+
+        output = model(inp)
+        output_tp = model_tp(inp)
+        self.assertEqual(output, output_tp)
+
+        output.sum().backward()
+        output_tp.sum().backward()
+
+        device_mesh = model_tp.net1.weight.device_mesh
+        replicate = [Replicate()] * device_mesh.ndim
+
+        # Ensure gradients are same.
+        self.assertEqual(
+            model.net1.weight.grad,
+            model_tp.net1.weight.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.net1.bias.grad,
+            model_tp.net1.bias.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.net2.weight.grad,
+            model_tp.net2.weight.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.net2.bias.grad,
+            model_tp.net2.bias.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+
+        optim.step()
+        optim_tp.step()
+
+        # Ensure model weights are still same after update.
+        self.assertEqual(
+            model.net1.weight,
+            model_tp.net1.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.net1.bias,
+            model_tp.net1.bias.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.net2.weight,
+            model_tp.net2.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        # Due to the trick we use for Partial aggregation, we only check the weight when local_rank = 0.
+        if self.rank == 0:
+            self.assertEqual(
+                model.net2.bias,
+                model_tp.net2.bias.redistribute(
+                    device_mesh=device_mesh, placements=replicate
+                ).to_local(),
+            )
+
+        inp = torch.rand(*inp_size, device=self.device_type)
+        output = model(inp)
+        output_tp = model_tp(inp)
+        self.assertEqual(output, output_tp)
+
+    # TensorParallelMultiheadAttention == dist_module(TensorParallelMultiheadAttention)
+    # baddbmm introduces nan occasionally on CPU: https://github.com/pytorch/pytorch/issues/80588
+    @with_comms
+    @skip_unless_torch_gpu
+    def test_self_attn_megatron_e2e(self):
+        inp_size = [8, 12, 16]
+        # Ensure all tp ranks have same input.
+        torch.manual_seed(0)
+        inp = torch.rand(*inp_size, device=self.device_type)
+
+        # Initialize model using same seed.
+        torch.manual_seed(5)
+        model = TensorParallelMultiheadAttention(
+            16,
+            8,
+            tp_size=NUM_DEVICES,
+            add_bias_kv=True,
+            device=self.device_type,
+        )
+        torch.manual_seed(5)
+        model_tp = TensorParallelMultiheadAttention(
+            16,
+            8,
+            tp_size=NUM_DEVICES,
+            add_bias_kv=True,
+            device=self.device_type,
+        )
+
+        # Ensure model are initialized the same way.
+        self.assertEqual(model.qkv.weight, model_tp.qkv.weight)
+        self.assertEqual(model.qkv.bias, model_tp.qkv.bias)
+        self.assertEqual(model.proj.weight, model_tp.proj.weight)
+        self.assertEqual(model.proj.bias, model_tp.proj.bias)
+
+        # Shard module and initialize optimizer.
+        device_mesh = DeviceMesh(self.device_type, list(range(NUM_DEVICES)))
+        distribute_module(
+            model_tp,
+            device_mesh,
+            partition_fn=tp_shard_self_attn,
+            input_fn=replicate_input,
+            output_fn=replicate_output,
+        )
+
+        device_mesh = model_tp.qkv.weight.device_mesh
+        replicate = [Replicate()] * device_mesh.ndim
+        # Ensure model are initialized the same way.
+        self.assertEqual(
+            model.qkv.weight,
+            model_tp.qkv.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.qkv.bias,
+            model_tp.qkv.bias.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.weight,
+            model_tp.proj.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.bias,
+            model_tp.proj.bias.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+
+        LR = 0.25
+        optim = torch.optim.SGD(model.parameters(), lr=LR)
+        optim_tp = torch.optim.SGD(model_tp.parameters(), lr=LR)
+
+        output = model(inp, inp, inp)
+        output_tp = model_tp(inp, inp, inp)
+        self.assertEqual(output, output_tp)
+
+        output.sum().backward()
+        output_tp.sum().backward()
+
+        device_mesh = model_tp.qkv.weight.device_mesh
+        # Ensure gradients are same.
+        self.assertEqual(
+            model.qkv.weight.grad,
+            model_tp.qkv.weight.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.qkv.bias.grad,
+            model_tp.qkv.bias.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.weight.grad,
+            model_tp.proj.weight.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.bias.grad,
+            model_tp.proj.bias.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+
+        optim.step()
+        optim_tp.step()
+
+        # Ensure model weights are still same after update.
+        self.assertEqual(
+            model.qkv.weight,
+            model_tp.qkv.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.qkv.bias,
+            model_tp.qkv.bias.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.weight,
+            model_tp.proj.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.bias,
+            model_tp.proj.bias.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+
+        inp = torch.rand(*inp_size, device=self.device_type)
+        output = model(inp, inp, inp)
+        output_tp = model_tp(inp, inp, inp)
+        self.assertEqual(output, output_tp)
+
+    # TensorParallelMultiheadAttention == dist_module(torch.nn.MultiheadAttention)
+    # baddbmm introduces nan occasionally on CPU: https://github.com/pytorch/pytorch/issues/80588
+    @with_comms
+    @skip_unless_torch_gpu
+    def test_self_attn_replacement_megatron_e2e(self):
+        inp_size = [8, 12, 16]
+        # Ensure all tp ranks have same input.
+        torch.manual_seed(0)
+        inp = torch.rand(*inp_size, device=self.device_type)
+
+        # TODO: our sharding function cannot shard the root node
+        torch.manual_seed(5)
+        model = TensorParallelMultiheadAttention(
+            16,
+            8,
+            tp_size=NUM_DEVICES,
+            add_bias_kv=True,
+            device=self.device_type,
+        )
+        model_tp = MultiheadAttnWrap(
+            16, 8, add_bias_kv=True, device=self.device_type
+        )
+
+        # TODO: somehow using torch.nn.MultiheadAttention's initial params does not work
+        # Use TensorParallelMultiheadAttention parameters instead
+        x = model.qkv.weight.clone().detach().requires_grad_()
+        model_tp.attn.register_parameter(
+            "in_proj_weight", torch.nn.Parameter(x)
+        )
+
+        x = model.qkv.bias.clone().detach().requires_grad_()
+        model_tp.attn.register_parameter("in_proj_bias", torch.nn.Parameter(x))
+
+        x = model.proj.weight.clone().detach().requires_grad_()
+        model_tp.attn.out_proj.register_parameter(
+            "weight", torch.nn.Parameter(x)
+        )
+
+        x = model.proj.bias.clone().detach().requires_grad_()
+        model_tp.attn.out_proj.register_parameter("bias", torch.nn.Parameter(x))
+
+        # check if parameters are same
+        self.assertEqual(model.qkv.weight, model_tp.attn.in_proj_weight)
+        self.assertEqual(model.qkv.bias, model_tp.attn.in_proj_bias)
+        self.assertEqual(model.proj.weight, model_tp.attn.out_proj.weight)
+        self.assertEqual(model.proj.bias, model_tp.attn.out_proj.bias)
+
+        # Shard module and initialize optimizer.
+        device_mesh = DeviceMesh(self.device_type, list(range(NUM_DEVICES)))
+        distribute_module(
+            model_tp,
+            device_mesh,
+            partition_fn=tp_shard_self_attn,
+            input_fn=replicate_input,
+            output_fn=replicate_output,
+        )
+
+        device_mesh = model_tp.attn.qkv.weight.device_mesh
+        replicate = [Replicate()] * device_mesh.ndim
+        # Ensure model are initialized the same way.
+        self.assertEqual(
+            model.qkv.weight,
+            model_tp.attn.qkv.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.qkv.bias,
+            model_tp.attn.qkv.bias.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.weight,
+            model_tp.attn.proj.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.bias,
+            model_tp.attn.proj.bias.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+
+        LR = 0.25
+        optim = torch.optim.SGD(model.parameters(), lr=LR)
+        optim_tp = torch.optim.SGD(model_tp.parameters(), lr=LR)
+
+        output = model(inp, inp, inp)
+        output_tp = model_tp(inp, inp, inp)
+        self.assertEqual(output, output_tp)
+
+        output.sum().backward()
+        output_tp.sum().backward()
+
+        device_mesh = model_tp.attn.qkv.weight.device_mesh
+        # Ensure gradients are same.
+        self.assertEqual(
+            model.qkv.weight.grad,
+            model_tp.attn.qkv.weight.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.qkv.bias.grad,
+            model_tp.attn.qkv.bias.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.weight.grad,
+            model_tp.attn.proj.weight.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.bias.grad,
+            model_tp.attn.proj.bias.grad.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+
+        optim.step()
+        optim_tp.step()
+
+        # Ensure model weights are still same after update.
+        self.assertEqual(
+            model.qkv.weight,
+            model_tp.attn.qkv.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.qkv.bias,
+            model_tp.attn.qkv.bias.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.weight,
+            model_tp.attn.proj.weight.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+        self.assertEqual(
+            model.proj.bias,
+            model_tp.attn.proj.bias.redistribute(
+                device_mesh=device_mesh, placements=replicate
+            ).to_local(),
+        )
+
+        inp = torch.rand(*inp_size, device=self.device_type)
+        output = model(inp, inp, inp)
+        output_tp = model_tp(inp, inp, inp)
+        self.assertEqual(output, output_tp)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/parallel/test_view_sharding_dim_change.py b/test/distributed/_tensor/parallel/test_view_sharding_dim_change.py
new file mode 100644
index 0000000000000..4648d930b9eb6
--- /dev/null
+++ b/test/distributed/_tensor/parallel/test_view_sharding_dim_change.py
@@ -0,0 +1,30 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+from torch.distributed._tensor import DeviceMesh, DTensor, Shard
+from torch.distributed._tensor.parallel._view_with_dim_change import (
+    _view_with_sharding_dim_change,
+)
+
+
+class TPViewShardingDimChangeTest(DTensorTestBase):
+    @with_comms
+    def test_view_with_sharding_dim_change(self):
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        torch.manual_seed(self.rank)
+        tensor = torch.rand(3, 5, 6, device=self.device_type)
+        sharding = [Shard(2)]
+        dt = DTensor.from_local(tensor, device_mesh, sharding)
+        dt = _view_with_sharding_dim_change(dt, 1, (3, -1, 6))
+        self.assertTrue(dt.placements[0].is_shard(dim=1))
+        self.assertEqual(dt.to_local(), tensor.view(3, -1, 6))
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/_tensor/parallel/__init__.py b/torch/distributed/_tensor/parallel/__init__.py
new file mode 100644
index 0000000000000..5725c5077d4bb
--- /dev/null
+++ b/torch/distributed/_tensor/parallel/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from torch.distributed._tensor.parallel.multihead_attention_tp import (
+    TensorParallelMultiheadAttention,
+)
+
+from torch.distributed._tensor.parallel.api import (
+    tp_shard_self_attn,
+    replicate_input,
+    replicate_output,
+)
diff --git a/torch/distributed/_tensor/parallel/_view_with_dim_change.py b/torch/distributed/_tensor/parallel/_view_with_dim_change.py
new file mode 100644
index 0000000000000..7988129318b78
--- /dev/null
+++ b/torch/distributed/_tensor/parallel/_view_with_dim_change.py
@@ -0,0 +1,108 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import Tuple, Union
+
+import torch
+from torch.distributed._tensor import DTensor as DT
+from torch.distributed._tensor.placement_types import Shard
+from torch.distributed._tensor.ops.utils import prod
+
+
+def _view_with_sharding_dim_change(
+    tensor: Union[torch.Tensor, DT], sharding_dim: int, shape: Tuple[int, ...]
+) -> Union[torch.Tensor, DT]:
+    """
+    We change the implicit sharding dim for a distributed tensor without comms.
+    Because if we don't change sharding dim, we will ended up having more comms that are not necessary.
+    Note that this op will produce invalid DTensor, you will need to call this op in pair to recover
+    it back to a valid DTensor.
+
+    This should only be used when implicitly changing sharding dim doesn't have semantic issue.
+    """
+    if isinstance(tensor, DT):
+        # pyre-fixme[16]: Undefined attribute.
+        return _ViewAndRedistribute.apply(tensor, sharding_dim, shape)
+    else:
+        return tensor.view(shape)
+
+
+class _ViewAndRedistribute(torch.autograd.Function):
+    @staticmethod
+    # pyre-fixme[14]: Inconsistent override.
+    def forward(  # type: ignore[override]
+        ctx,  # pyre-ignore[2]: Parameter must be annotated.
+        self: DT,
+        sharding_dim: int,
+        shape: Tuple[int, ...],
+    ) -> DT:
+        ctx.previous_placement = self.placements
+        ctx.previous_device_mesh = self.device_mesh
+        ctx.previous_local_shape = self.to_local().size()
+        ctx.previous_global_shape = self.size()
+        assert (
+            self.device_mesh.ndim == 1
+        ), "Only support 1D Device Mesh for _ViewAndRedistribute."
+        if (
+            self.placements[0].is_shard(dim=sharding_dim)
+            or self.placements[0].is_replicate()
+            or self.placements[0].is_partial()
+        ):
+            # pyre-fixme[7]: Incompatible return type.
+            return self.view(shape)  # type: ignore[return-value]
+        else:
+            if sharding_dim < 0:
+                sharding_dim += self.dim()
+
+            device_mesh = self.device_mesh
+            world_size = device_mesh.size(dim=0)
+            new_sharding_placement = [Shard(sharding_dim)]
+
+            # Fix shape
+            try:
+                infer_idx = shape.index(-1)
+            except ValueError:
+                infer_idx = None  # type: ignore[assignment]
+
+            # Infer the dim which is specified with -1.
+            if infer_idx is not None:
+                st_size = prod(self.size())  # type: ignore[attr-defined]
+                shape_size = -1 * prod(shape)  # type: ignore[attr-defined]
+                # pyre-fixme[60]: Concatenation not yet support for multiple variadic
+                shape = (
+                    *shape[:infer_idx],
+                    st_size // shape_size,
+                    *shape[infer_idx + 1 :],
+                )
+
+            # pyre-fixme[60]: Concatenation not yet support for multiple variadic
+            new_local_tensor_size = (
+                *shape[:sharding_dim],
+                shape[sharding_dim] // world_size,
+                *shape[sharding_dim + 1 :],
+            )
+            new_local_tensor = self.to_local().view(*new_local_tensor_size)
+
+            return DT(
+                new_local_tensor,
+                device_mesh,
+                new_sharding_placement,
+                size=torch.Size(shape),
+                requires_grad=new_local_tensor.requires_grad,
+            )
+
+    @staticmethod
+    def backward(ctx, grad_output: DT) -> Tuple[DT, None, None]:  # type: ignore[override]
+        previous_placement = ctx.previous_placement
+        previous_device_mesh = ctx.previous_device_mesh
+        previous_local_tensor_size = ctx.previous_local_shape
+        previous_global_shape = ctx.previous_global_shape
+        return (
+            DT(
+                grad_output.to_local().view(*previous_local_tensor_size),
+                previous_device_mesh,
+                previous_placement,
+                size=previous_global_shape,
+                requires_grad=grad_output.requires_grad,
+            ),
+            None,
+            None,
+        )
diff --git a/torch/distributed/_tensor/parallel/api.py b/torch/distributed/_tensor/parallel/api.py
new file mode 100644
index 0000000000000..7ab3ad2199f29
--- /dev/null
+++ b/torch/distributed/_tensor/parallel/api.py
@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import torch
+import torch.nn as nn
+from typing import Sequence, Tuple
+from torch.distributed._tensor import (
+    distribute_tensor,
+    DTensor,
+    Shard,
+    Replicate,
+    DeviceMesh,
+    Placement,
+)
+from torch.distributed._tensor.parallel import TensorParallelMultiheadAttention
+
+
+def replicate_input(
+    inputs: Sequence[torch.Tensor], device_mesh: DeviceMesh
+) -> Tuple[DTensor, ...]:
+    replicate = [Replicate()] * device_mesh.ndim
+    return tuple(
+        DTensor.from_local(tensor, device_mesh, replicate) for tensor in inputs
+    )
+
+
+def replicate_output(output: DTensor, device_mesh: DeviceMesh) -> torch.Tensor:
+    if isinstance(output, DTensor):
+        replicate = [Replicate()] * output.device_mesh.ndim
+        # TODO: can the output be left incontiguous?
+        return (
+            output.redistribute(output.device_mesh, replicate)
+            .to_local()
+            .contiguous()
+        )
+
+
+def tp_shard_self_attn(
+    name: str, module: nn.Module, device_mesh: DeviceMesh
+) -> None:
+    col_wise_sharding: Sequence[Placement] = [Shard(0)]
+    row_wise_sharding: Sequence[Placement] = [Shard(1)]
+    replicate: Sequence[Placement] = [Replicate()] * device_mesh.ndim
+
+    def _shard_self_attn_params(name: str, module: nn.Module) -> None:
+        if isinstance(module, nn.Linear):
+            if name == "qkv":
+                sharded_weight = nn.Parameter(
+                    distribute_tensor(
+                        module.weight, device_mesh, col_wise_sharding
+                    )
+                )
+                module.register_parameter("weight", sharded_weight)
+                if module.bias is not None:
+                    sharded_bias = nn.Parameter(
+                        distribute_tensor(
+                            module.bias, device_mesh, col_wise_sharding
+                        )
+                    )
+                    module.register_parameter("bias", sharded_bias)
+            elif name == "proj":
+                sharded_weight = nn.Parameter(
+                    distribute_tensor(
+                        module.weight, device_mesh, row_wise_sharding
+                    )
+                )
+                module.register_parameter("weight", sharded_weight)
+                if module.bias is not None:
+                    replicated_bias = nn.Parameter(
+                        distribute_tensor(module.bias, device_mesh, replicate)
+                    )
+                    module.register_parameter("bias", replicated_bias)
+
+    if isinstance(module, TensorParallelMultiheadAttention):  # shard TPMA
+        for n, m in module.named_children():
+            _shard_self_attn_params(n, m)
+    else:
+        for n, m in module.named_children():  # replace with TPMA
+            if isinstance(m, nn.MultiheadAttention):
+                tp_multi_head_attention = TensorParallelMultiheadAttention(
+                    m.embed_dim,
+                    m.num_heads,
+                    device=torch.device(device_mesh.device_type),
+                    tp_size=device_mesh.size(0),  # group size on dim 0
+                    add_bias_kv=m.bias_k is not None,
+                )
+                tp_multi_head_attention.copy(m)
+                module.register_module(n, tp_multi_head_attention)
diff --git a/torch/distributed/_tensor/parallel/fsdp.py b/torch/distributed/_tensor/parallel/fsdp.py
new file mode 100644
index 0000000000000..1f1123c517756
--- /dev/null
+++ b/torch/distributed/_tensor/parallel/fsdp.py
@@ -0,0 +1,357 @@
+import warnings
+import copy
+from typing import List, NamedTuple, Optional, Tuple, cast
+
+import torch
+import torch.distributed as dist
+import torch.distributed.distributed_c10d as c10d
+
+from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor
+
+import torch.distributed._shard.sharding_spec as shard_spec
+from torch.distributed._shard.sharding_spec.chunk_sharding_spec import (
+    ChunkShardingSpec,
+)
+
+from torch.distributed._shard.sharded_tensor import (
+    Shard,
+    ShardedTensor,
+    ShardedTensorMetadata,
+    TensorProperties,
+)
+
+from torch.distributed._shard.sharding_spec import (
+    ShardMetadata,
+)
+
+from torch.distributed.remote_device import _remote_device
+
+from torch.distributed._tensor import (
+    DTensor as DistributedTensor,
+    DeviceMesh,
+    Shard as DShard,
+)
+from torch.distributed._tensor.placement_types import Placement
+
+__all__ = ["is_available"]
+
+
+class _STShardingInfo(NamedTuple):
+    """:class:`ShardedTensor` sharding information."""
+
+    sharding_spec: Optional[shard_spec.ShardingSpec]
+    global_size: Optional[torch.Size]
+    process_group: Optional[c10d.ProcessGroup]
+    device_mesh: Optional[DeviceMesh]
+    placements: Optional[List[Placement]]
+
+
+def _get_box(tensor: DistributedTensor) -> Tuple[torch.Size, torch.Size]:
+    device_mesh = tensor.device_mesh
+    assert device_mesh.ndim == 1, "Only 1D DeviceMeshes currently handled"
+
+    placement = tensor.placements[0]
+    offsets = [0] * len(tensor.size())
+    num_chunks = device_mesh.size(dim=0)
+
+    if tensor.placements[0].is_shard():
+        shard_dim = cast(DShard, placement).dim
+        chunk_size = tensor.size(shard_dim) // num_chunks
+        offsets[shard_dim] = chunk_size
+
+    return (torch.Size(offsets), tensor._local_tensor.size())
+
+
+def _get_box_for(
+    tensor: DistributedTensor, idx: int
+) -> Tuple[torch.Size, torch.Size]:
+    offsets, size = _get_box(tensor)
+    return (torch.Size([val * idx for val in offsets]), size)
+
+
+def _get_local_box(tensor: DistributedTensor) -> Tuple[torch.Size, torch.Size]:
+    device_mesh = tensor.device_mesh
+    dim_0_coord = device_mesh.get_coordinate_on_dim(0)
+    assert dim_0_coord is not None
+    return _get_box_for(tensor, dim_0_coord)
+
+
+def _create_shard_md_from_dt(
+    dt: DistributedTensor, current_rank: int
+) -> ShardMetadata:
+    mesh = dt.device_mesh
+    assert mesh.ndim == 1, "Only 1D DeviceMeshes currently handled"
+
+    offsets, sizes = _get_local_box(dt)
+    return ShardMetadata(
+        shard_offsets=list(offsets),
+        shard_sizes=list(sizes),
+        placement=f"rank:{current_rank}/{dt._local_tensor.device}",
+    )
+
+
+def _create_sharded_tensor_md_from_dt(
+    dt: DistributedTensor, dt_pg: c10d.ProcessGroup
+) -> ShardedTensorMetadata:
+    # This is where it gets tricky, we have to produce a ShardedTensor that has full coverage
+    # and yet has only one valid shard for the current rank.
+
+    shards_md = []
+    my_rank = dist.get_rank(dt_pg)
+    scapegoat_rank = 0 if my_rank > 0 else 1
+
+    if dt.placements[0].is_shard():
+        shard_count = dt_pg.size()
+    else:
+        shard_count = 1
+
+    for i in range(shard_count):
+        offsets, sizes = _get_box_for(dt, i)
+        shards_md.append(
+            ShardMetadata(
+                shard_offsets=list(offsets),
+                shard_sizes=list(sizes),
+                placement=f"rank:{scapegoat_rank if i > 0 else my_rank}/{dt._local_tensor.device}",
+            )
+        )
+
+    return ShardedTensorMetadata(
+        shards_metadata=shards_md,
+        size=dt.size(),
+        tensor_properties=TensorProperties(
+            dtype=dt.dtype,
+            layout=dt.layout,
+            requires_grad=dt.requires_grad,
+            # ignore memory_format and pin_memory as those are not supported by DT
+        ),
+    )
+
+
+def _get_dt_pg(dt: DistributedTensor) -> c10d.ProcessGroup:
+    mesh = dt.device_mesh
+    assert mesh.ndim == 1, "Only 1D DeviceMeshes currently handled"
+    return mesh.get_dim_groups()[0]
+
+
+def _rewrite_spec_if_needed(
+    spec: shard_spec.ShardingSpec, tensor: torch.Tensor, rank: int
+) -> shard_spec.ShardingSpec:
+    """
+    Rewrite ``spec`` to match the device of ``tensor``.
+
+    FSDP.sharded_optim_state_dict sneakly ships optimizer state to CPU so if the original ShardingSpec
+    produces CUDA metadata, ST construction bombs.
+    """
+    if not isinstance(spec, ChunkShardingSpec):
+        return spec
+
+    # let's see if we need
+    rewrite = False
+    for p in spec.placements:
+        p = cast(_remote_device, p)
+        if p.rank() == rank and p.device() != tensor.device:
+            rewrite = True
+            break
+    if rewrite:
+        spec = copy.deepcopy(spec)
+        for i, placement in enumerate(spec.placements):
+            placement = cast(_remote_device, placement)
+            if placement.rank() == rank and placement.device() != tensor.device:
+                spec.placements[i] = _remote_device(
+                    f"rank:{rank}/{tensor.device}"
+                )
+
+    return spec
+
+
+def _flatten_tensor(
+    tensor: torch.Tensor,
+) -> Tuple[torch.Tensor, Optional[_STShardingInfo]]:
+    if type(tensor) is ShardedTensor:
+        return tensor.local_tensor(), _STShardingInfo(
+            tensor.sharding_spec(),
+            tensor.size(),
+            tensor._process_group,
+            None,
+            None,
+        )
+    elif type(tensor) is DistributedTensor:
+        tensor._local_tensor.requires_grad_()
+        return tensor._local_tensor, _STShardingInfo(
+            None,
+            None,
+            None,
+            tensor.device_mesh,
+            list(tensor.placements),
+        )
+    return tensor, None
+
+
+def _unflatten_tensor(
+    tensor: torch.Tensor, sharding_info: _STShardingInfo
+) -> torch.Tensor:
+    result: torch.Tensor
+
+    if sharding_info.sharding_spec is not None:
+        assert sharding_info.global_size is not None
+        result = ShardedTensor._init_from_local_tensor(
+            tensor,
+            _rewrite_spec_if_needed(
+                sharding_info.sharding_spec,
+                tensor,
+                dist.get_rank(sharding_info.process_group),
+            ),
+            sharding_info.global_size,
+            process_group=cast(dist.ProcessGroup, sharding_info.process_group),
+        )
+    else:
+        result = DistributedTensor.from_local(
+            tensor,
+            device_mesh=sharding_info.device_mesh,
+            placements=sharding_info.placements,
+            run_check=False,
+        )
+
+    _set_fsdp_flattened(result)
+    return result
+
+
+def _chunk_tensor(
+    tensor: torch.Tensor,
+    rank: int,
+    world_size: int,
+    num_devices_per_node: int,
+    pg: dist.ProcessGroup,
+) -> torch.Tensor:
+    if type(tensor) is ShardedTensor:
+        assert len(tensor.local_shards()) == 1
+
+        inner_param = tensor.local_tensor()
+        inner_st = _create_chunk_sharded_tensor(
+            inner_param,
+            rank,
+            world_size,
+            num_devices_per_node,
+            pg,
+        )
+
+        outer_local_shard = tensor.local_shards()[0]
+        shards: List[Shard] = [
+            Shard(inner_st, copy.deepcopy(outer_local_shard.metadata))
+        ]
+        st_meta = copy.deepcopy(tensor.metadata())
+        st_meta.tensor_properties.requires_grad = False
+
+        st_outer = ShardedTensor._init_from_local_shards_and_global_metadata(
+            shards,
+            sharded_tensor_metadata=st_meta,
+            process_group=tensor._process_group,
+            init_rrefs=False,
+        )
+        return st_outer
+    elif type(tensor) is DistributedTensor:
+        device_mesh = tensor.device_mesh
+        assert device_mesh.ndim == 1, "Only 1D DeviceMeshes currently handled"
+
+        inner_param = tensor._local_tensor
+
+        inner_st = _create_chunk_sharded_tensor(
+            inner_param,
+            rank,
+            world_size,
+            torch.cuda.device_count(),
+            pg,
+        )
+
+        dt_pg = _get_dt_pg(tensor)
+        # We do this differently here, we create a ST with no local shards then patch it
+        shards = [
+            Shard(
+                inner_st, _create_shard_md_from_dt(tensor, dist.get_rank(dt_pg))
+            )
+        ]
+
+        st_meta = _create_sharded_tensor_md_from_dt(tensor, dt_pg)
+        st_meta.tensor_properties.requires_grad = False
+
+        st_outer = ShardedTensor._init_from_local_shards_and_global_metadata(
+            shards,
+            sharded_tensor_metadata=st_meta,
+            process_group=dt_pg,
+            init_rrefs=False,
+        )
+
+        return st_outer
+    else:
+        return _create_chunk_sharded_tensor(
+            tensor,
+            rank,
+            world_size,
+            num_devices_per_node,
+            pg,
+        )
+
+
+def _pre_load_state_dict(
+    tensor: torch.Tensor,
+) -> Tuple[torch.Tensor, List[Shard]]:
+    shards = cast(ShardedTensor, tensor).local_shards()
+    if len(shards) == 1 and type(shards[0].tensor) is ShardedTensor:
+        inner_tensor = shards[0].tensor
+        shards = inner_tensor.local_shards()  # pyre-ignore[16]
+        tensor = inner_tensor
+
+    return (tensor, shards if len(shards) > 0 else [])
+
+
+try:
+    from torch.distributed.fsdp._fsdp_extensions import (
+        _set_fsdp_extensions,
+        FSDPExtensions,
+    )
+    from torch.distributed.fsdp._common_utils import _set_fsdp_flattened
+
+    class DTensorExtensions(FSDPExtensions):
+        def pre_flatten_transform(
+            self,
+            tensor: torch.Tensor,
+        ) -> Tuple[torch.Tensor, Optional[_STShardingInfo]]:
+            return _flatten_tensor(tensor)
+
+        def post_unflatten_transform(
+            self, tensor: torch.Tensor, param_extension: _STShardingInfo
+        ) -> torch.Tensor:
+            return _unflatten_tensor(tensor, param_extension)
+
+        def chunk_tensor(
+            self,
+            tensor: torch.Tensor,
+            rank: int,
+            world_size: int,
+            num_devices_per_node: int,
+            pg: dist.ProcessGroup,
+        ) -> torch.Tensor:
+            return _chunk_tensor(
+                tensor, rank, world_size, num_devices_per_node, pg
+            )
+
+        def pre_load_state_dict_transform(
+            self,
+            tensor: torch.Tensor,
+        ) -> Tuple[torch.Tensor, List[Shard]]:
+            return _pre_load_state_dict(tensor)
+
+    _set_fsdp_extensions(DTensorExtensions())
+
+    def is_available() -> bool:
+        return True
+
+except BaseException as e:
+    warnings.warn(
+        "PyTorch doesn't have TensorFlattener extension point available"
+        "2D parallelism won't work with FSDP"
+        f"exception: {e}"
+    )
+
+    def is_available() -> bool:
+        return False
diff --git a/torch/distributed/_tensor/parallel/multihead_attention_tp.py b/torch/distributed/_tensor/parallel/multihead_attention_tp.py
new file mode 100644
index 0000000000000..3071f42632fd5
--- /dev/null
+++ b/torch/distributed/_tensor/parallel/multihead_attention_tp.py
@@ -0,0 +1,273 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# pyre-ignore-all-errors[6]
+
+import math
+
+import torch
+from torch.distributed._tensor import DTensor as DT
+from torch.distributed._tensor.placement_types import Shard
+from torch.distributed._tensor.parallel._view_with_dim_change import (
+    _view_with_sharding_dim_change,
+)
+
+from typing import Optional, Union
+
+
+# TODO: Add a test to test equivalence between our Multihead Attention
+# with other mainstream ones (Megatron-LM or PyTorch).
+def _stride_same_as_shard(
+    tensor: torch.Tensor, tp_size: int, chunk_dim: int, cat_dim: int
+) -> torch.Tensor:
+    """
+    Adjust local tensor's stride same as the sharded situation.
+    So that view result will keeps the same.
+    """
+    if isinstance(tensor, DT):
+        return tensor
+    view_size = list(tensor.size())
+    view_size[chunk_dim] //= tp_size
+    return torch.cat(
+        [t.view(*view_size) for t in tensor.chunk(tp_size, dim=chunk_dim)],
+        dim=cat_dim,
+    ).contiguous()
+
+
+class TensorParallelMultiheadAttention(torch.nn.Module):
+    """
+    Multi-head Attention block from Transformer models.
+    Since we need some customizations for the attention layer,
+    we are writing a customized but mathematically equivalent
+    attention module as defined in torch.nn.
+
+    Note that:
+    We now only support the case when it's self attention with
+    limited input args and we also assume that the input tensor
+    has a dimension of three. Although we do implement the logic
+    for multihead attention, it was not fully tested.
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        bias: bool = True,
+        add_bias_kv: bool = False,
+        add_zero_attn: bool = False,
+        kdim: Optional[int] = None,
+        vdim: Optional[int] = None,
+        batch_first: bool = False,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+        tp_size: int = 1,
+        self_attention: bool = True,
+    ) -> None:
+        super(TensorParallelMultiheadAttention, self).__init__()
+        self.device: torch.device = (
+            torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            if device is None
+            else device
+        )
+        self.num_heads = num_heads
+        self.hidden_size = embed_dim
+        self.hidden_size_per_attention_head: int = self.hidden_size // num_heads
+        self.scale: float = self.hidden_size_per_attention_head**-0.5
+        if self_attention:
+            self.qkv: torch.nn.Module = torch.nn.Linear(
+                embed_dim, embed_dim * 3, bias=add_bias_kv, device=self.device
+            )
+            torch.nn.init.xavier_uniform_(self.qkv.weight)
+            if add_bias_kv:
+                torch.nn.init.zeros_(self.qkv.bias)
+        else:
+            self.query: torch.nn.Module = torch.nn.Linear(
+                embed_dim, embed_dim, bias=add_bias_kv, device=self.device
+            )
+            self.key: torch.nn.Module = torch.nn.Linear(
+                embed_dim, embed_dim, bias=add_bias_kv, device=self.device
+            )
+            self.value: torch.nn.Module = torch.nn.Linear(
+                embed_dim, embed_dim, bias=add_bias_kv, device=self.device
+            )
+            torch.nn.init.xavier_uniform_(self.query.weight)
+            torch.nn.init.xavier_uniform_(self.key.weight)
+            torch.nn.init.xavier_uniform_(self.value.weight)
+            if add_bias_kv:
+                torch.nn.init.zeros_(self.query.bias)
+                torch.nn.init.zeros_(self.key.bias)
+                torch.nn.init.zeros_(self.value.bias)
+        self.proj: torch.nn.Module = torch.nn.Linear(
+            embed_dim, embed_dim, bias=bias, device=self.device
+        )
+        torch.nn.init.kaiming_uniform_(self.proj.weight, a=math.sqrt(5))
+        if bias:
+            torch.nn.init.zeros_(self.proj.bias)
+        self.tp_size = tp_size
+        self.hidden_size = embed_dim
+        self.norm_factor: float = math.sqrt(self.hidden_size_per_attention_head)
+        self.self_attention = self_attention
+
+    def forward(
+        self,
+        query: Union[torch.Tensor, DT],
+        key: Union[torch.Tensor, DT],
+        value: Union[torch.Tensor, DT],
+        key_padding_mask: Optional[torch.Tensor] = None,
+        need_weights: bool = True,
+        attn_mask: Optional[torch.Tensor] = None,
+        average_attn_weights: bool = True,
+    ) -> Union[torch.Tensor, DT]:
+        b, sq, h = query.shape
+        sk = key.size(1)
+        nh = self.num_heads
+        hn = self.hidden_size_per_attention_head
+
+        # x: [b, sq/sk/sv, h]
+        # ===================
+        # Permute. [sq/sk/sv, b, h]
+        # ===================
+        if not self.self_attention:
+            # =====================
+            # Query, Key, and Value
+            # =====================
+            query = query.permute(1, 0, 2).contiguous()
+            key = key.permute(1, 0, 2).contiguous()
+            value = value.permute(1, 0, 2).contiguous()
+
+            # Attention heads [sq/sk/sv, b, h] --> [sq/sk/sv * b, (nh * hn)]
+            query = query.view(-1, h)
+            key = key.view(-1, h)
+            value = value.view(-1, h)
+
+            query_layer = _view_with_sharding_dim_change(
+                self.query(query), 1, (sq, b * nh, hn)
+            )
+            key_layer = _view_with_sharding_dim_change(
+                self.key(key), 1, (sk, b * nh, hn)
+            )
+            value_layer = _view_with_sharding_dim_change(
+                self.value(value), 1, (sk, b * nh, hn)
+            )
+        else:
+            assert torch.equal(query, key) and torch.equal(
+                query, value
+            ), "inputs are different for self-attention."
+            # =====================
+            # Query
+            # =====================
+            query = query.permute(1, 0, 2).contiguous()
+
+            # Attention heads [sq, b, h] --> [sq * b, (nh * 3 * hn)]
+            query = query.view(-1, h)
+            mixed_x_layer = self.qkv(query)
+
+            # [sq * b, 3 * h] --> [sq, b, nh, 3 * hn]
+            mixed_x_layer = _view_with_sharding_dim_change(
+                mixed_x_layer, 2, (sq, b, nh, 3 * hn)
+            )
+
+            # [sq, b, nh, 3 * hn] --> 3 [sq, b, nh, hn]
+            last_dim = mixed_x_layer.dim() - 1
+            last_dim_size = mixed_x_layer.size(last_dim) // 3
+            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
+                last_dim_size, dim=last_dim
+            )
+
+            query_layer = _stride_same_as_shard(query_layer, self.tp_size, 2, 1)
+            key_layer = _stride_same_as_shard(key_layer, self.tp_size, 2, 1)
+            value_layer = _stride_same_as_shard(value_layer, self.tp_size, 2, 1)
+            # [sq, b, nh, hn] -> [sq, b * nh, hn]
+            query_layer = _view_with_sharding_dim_change(
+                query_layer, 1, (sq, b * nh, -1)
+            )
+            key_layer = _view_with_sharding_dim_change(
+                key_layer, 1, (sq, b * nh, -1)
+            )
+            value_layer = _view_with_sharding_dim_change(
+                value_layer, 1, (sq, b * nh, -1)
+            )
+
+        # ===================================
+        # Raw attention scores. [b, nh, s, s]
+        # ===================================
+
+        factor = self.tp_size if isinstance(query_layer, DT) else 1
+        # preallocting result tensor: [b * nh, sq, sk]
+        matmul_result = torch.empty(
+            b * nh // factor,
+            sq,
+            sk,
+            dtype=query_layer.dtype,
+            device=self.device,
+        )
+        if isinstance(query_layer, DT):
+            matmul_result = DT.from_local(
+                matmul_result,
+                query_layer.device_mesh,
+                [Shard(0)],
+                run_check=False,
+            )
+
+        # Raw attention scores. [b * nh, sq, sk]
+        attn = torch.baddbmm(
+            matmul_result,
+            query_layer.transpose(0, 1),  # [b * nh, sq, hn]
+            key_layer.transpose(0, 1).transpose(1, 2),  # [b * nh, hn, sk]
+            beta=0.0,
+            alpha=(1.0 / self.norm_factor),
+        )
+
+        # ===============
+        # Attention probs
+        # ===============
+        attn = attn.softmax(dim=-1)
+
+        # =========================
+        # Context layer. [sq * b, hidden]
+        # =========================
+
+        # bmm: [b * nh, sq, hn]
+        context_layer = torch.bmm(attn, value_layer.transpose(0, 1))
+
+        # change view [nh, b, sq, hn]
+        context_layer = context_layer.view(nh, b, sq, hn)
+
+        # [nh, b, sq, hn] --> [sq, b, nh, hn]
+        context_layer = context_layer.permute(2, 1, 0, 3).contiguous()
+
+        # [sq, b, nh, hn] --> [sq * b, hidden]
+        context_layer = _view_with_sharding_dim_change(
+            context_layer.contiguous(), 1, (-1, self.hidden_size)
+        )
+
+        # =================
+        # Projection. [sq, b, h]
+        # =================
+        output = self.proj(context_layer).view(sq, b, h)
+
+        # ===================
+        # Permute. [b, sq, h]
+        # ===================
+        output = output.permute(1, 0, 2)
+
+        return output
+
+    def copy(self, that: torch.nn.MultiheadAttention) -> None:
+        # TODO: current implementation assume `self` is a self attention module
+        assert (
+            self.hidden_size == that.embed_dim
+        ), "embed_dim must be equal in TensorParallelMultiheadAttention.copy()!"
+
+        if that.in_proj_weight is not None:
+            self.qkv.register_parameter("weight", that.in_proj_weight)
+        if that.in_proj_bias is not None:
+            self.qkv.register_parameter("bias", that.in_proj_bias)
+        if that.out_proj.weight is not None:
+            # TODO: The use of Parameter is to avoid `mypy` issue caused
+            # by the `tensor` type annotation on Linear.weight to which
+            # a Parameter object is actually assigned
+            self.proj.register_parameter(
+                "weight", torch.nn.Parameter(that.out_proj.weight)
+            )
+        if that.out_proj.bias is not None:
+            self.proj.register_parameter("bias", that.out_proj.bias)

From 662414252998280ba39465cdaed6094c6a27a652 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Wed, 16 Nov 2022 08:51:30 +0000
Subject: [PATCH 0954/1922] [dynamo] Support if cond on NNModuleVariable
 (#89095)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89095
Approved by: https://github.com/yanboliang, https://github.com/mlazos
---
 test/dynamo/test_misc.py          | 28 ++++++++++++++++++++++++++++
 torch/_dynamo/symbolic_convert.py |  5 +++++
 2 files changed, 33 insertions(+)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index e27f7bc5198dd..8f79f2476aeea 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2885,6 +2885,34 @@ def func(x, y):
         self.assertTrue(same(ref, res))
         self.assertTrue(same(x, x1))
 
+    def test_if_cond_nn_mod(self):
+        class MockModule(torch.nn.Module):
+            def __init__(self, output_relu=True):
+                super(MockModule, self).__init__()
+                self.relu = torch.nn.ReLU() if output_relu else None
+
+            def forward(self, x):
+                x = torch.sin(x)
+                if self.relu:
+                    x = self.relu(x)
+                return x
+
+        model = MockModule()
+        opt_model = torch._dynamo.optimize("eager", nopython=True)(model)
+
+        x = torch.rand(4)
+        ref = model(x)
+        res = opt_model(x)
+        self.assertTrue(same(ref, res))
+
+        model = MockModule(output_relu=False)
+        opt_model = torch._dynamo.optimize("eager", nopython=True)(model)
+
+        x = torch.rand(4)
+        ref = model(x)
+        res = opt_model(x)
+        self.assertTrue(same(ref, res))
+
 
 class CustomFunc(torch.autograd.Function):
     @staticmethod
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index d5c05f76efb0a..d2bc5332719c5 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -252,6 +252,11 @@ def inner(self: "InstructionTranslatorBase", inst: Instruction):
                 + if_next
                 + if_jump
             )
+        elif isinstance(value, NNModuleVariable):
+            # Equivant of "self.nn_module is not None"
+            if truth_fn(value):
+                push and self.push(value)
+                self.jump(inst)
         elif not isinstance(value, TensorVariable) and value.has_unpack_var_sequence(
             self
         ):

From ba98a14b94b385899551606d1c8b2ae7e24418dc Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 16 Nov 2022 09:45:49 +0000
Subject: [PATCH 0955/1922] Revert "Rewrite assert statement with torch._assert
 under config (#88246)"

This reverts commit 62ba15e10e875ce088dff26e872605ee70c8c04a.

Reverted https://github.com/pytorch/pytorch/pull/88246 on behalf of https://github.com/DanilBaibak due to breaking internal builds
---
 test/dynamo/test_repros.py        | 92 ------------------------------
 torch/_dynamo/config.py           |  3 -
 torch/_dynamo/symbolic_convert.py | 94 -------------------------------
 3 files changed, 189 deletions(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index e30a1275ed135..503231b4cb120 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1938,98 +1938,6 @@ def fn(x):
         self.assertEqual(cnt.frame_count, 1)
         self.assertEqual(cnt.op_count, 1)
 
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
-    def test_rewrite_assert_with_msg(self):
-        def f(x):
-            b = x.sin()
-            assert x[0] == 3, "First dim need to be 3"
-            return x.cos() + b
-
-        args = (torch.Tensor([3, 4, 5]),)
-        cnt = torch._dynamo.testing.CompileCounter()
-
-        opt_f = torch._dynamo.optimize(cnt, nopython=True)(f)
-        self.assertTrue(same(f(*args), opt_f(*args)))
-        self.assertEqual(cnt.op_count, 6)
-        self.assertEqual(cnt.frame_count, 1)
-
-        exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
-        self.assertTrue(same(exported(*args), f(*args)))
-
-        with self.assertRaisesRegex(AssertionError, ""):
-            exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
-
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
-    def test_not_rewrite_assert_for_other_errors(self):
-        def f(x):
-            b = x.sin()
-            if not x.sum() <= 3:
-                raise ValueError("input sum needs to be 3")
-            return x.cos() + b
-
-        args = (torch.Tensor([3, 4, 5]),)
-        opt_fn = torch._dynamo.optimize("eager")(f)
-        with self.assertRaisesRegex(ValueError, "input sum needs to be 3"):
-            opt_fn(*args)
-
-    # TODO (tmanlaibaatar) handle data-dependent fstring in assert statement.
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
-    def test_rewrite_assert_with_fstring_msg(self):
-        def f(x):
-            b = x.sin()
-            assert x[0] == 3, f"First dim need to be {x[0]}"
-            return x.cos() + b
-
-        args = (torch.Tensor([3, 4, 5]),)
-        with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
-            exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
-
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
-    def test_rewrite_assert_without_msg(self):
-        def f(x):
-            b = x.sin()
-            assert x[0] == 3
-            return x.cos() + b
-
-        args = (torch.Tensor([3, 4, 5]),)
-        exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
-        self.assertTrue(same(exported(*args), f(*args)))
-
-        with self.assertRaisesRegex(AssertionError, ""):
-            exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
-
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
-    def test_rewrite_assert_noop(self):
-        def f(x):
-            b = x.sin()
-            assert True
-            assert x.dtype == torch.float32
-            return x.cos() + b
-
-        args = (torch.Tensor([3, 4, 5]),)
-        exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
-        self.assertTrue(same(exported(*args), f(*args)))
-
-        cnt = torch._dynamo.testing.CompileCounter()
-        opt_f = torch._dynamo.optimize(cnt, nopython=True)(f)
-        self.assertTrue(same(f(*args), opt_f(*args)))
-        # torch._assert shouldn't be in the graph
-        self.assertEqual(cnt.op_count, 3)
-        self.assertEqual(cnt.frame_count, 1)
-
-        exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
-        self.assertTrue(same(exported(*args), f(*args)))
-
-    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", False)
-    def test_not_rewrite_assert(self):
-        def f(x):
-            b = x.sin()
-            assert x[0] == 3
-            return x.cos() + b
-
-        with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
-            torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
-
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 39a1a6433419f..12088383e741c 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -87,9 +87,6 @@
 # if an exception is encountered
 replay_record_enabled = False
 
-# Rewrite assert statement in python with torch._assert
-rewrite_assert_with_torch_assert = True
-
 # Show a warning on every graph break
 print_graph_breaks = False
 
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index d2bc5332719c5..e64804cb68b2c 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -53,7 +53,6 @@
     fake_tensors_available,
     graph_break_dup_warning_checker,
     istype,
-    proxy_args_kwargs,
 )
 from .variables.base import MutableLocal, typestr, VariableTracker
 from .variables.builder import VariableBuilder, wrap_fx_proxy
@@ -122,103 +121,10 @@ def impl(self: "InstructionTranslatorBase", inst: Instruction):
     return impl
 
 
-def _detect_and_normalize_assert_statement(
-    self: "InstructionTranslatorBase", truth_fn: typing.Callable, push: bool
-):
-    # Detect if this jump instruction is assert and normalize the assert
-    # by pushing dummy error message when nothing is given.
-    #
-    # Python 3.9 assertion is in following format:
-    # 18 POP_JUMP_IF_TRUE       28
-    # 20 LOAD_ASSERTION_ERROR
-    # 22 LOAD_CONST               3 ('Assert message') -> optional instruction
-    # 24 CALL_FUNCTION            1                    -> optional instruction
-    # 26 RAISE_VARARGS
-    #
-    # Python 3.8 assertion is in following format:
-    # 18 POP_JUMP_IF_TRUE       28
-    # 20 LOAD_GLOBAL              0 (Assertion type)
-    # 22 LOAD_CONST               3 ('Assert message') -> optional instruction
-    # 24 CALL_FUNCTION            1                    -> optional instruction
-    # 26 RAISE_VARARGS            1
-
-    if (truth_fn is not operator.truth) or push:
-        return False
-
-    current_instruction_pointer = self.instruction_pointer
-    inst = self.instructions[current_instruction_pointer]
-    # Detect LOAD_ASSERTION_ERROR or LOAD_GLOBAL 0
-    if sys.version_info < (3, 9):
-        if inst.opname != "LOAD_GLOBAL" or inst.argval != "AssertionError":
-            return False
-    else:
-        if inst.opname != "LOAD_ASSERTION_ERROR":
-            return False
-
-    current_instruction_pointer += 1
-
-    if current_instruction_pointer >= len(self.instructions):
-        return False
-
-    inst = self.instructions[current_instruction_pointer]
-    has_error_msg = False
-    # DETECT RAISE_VARARGS or LOAD CONST
-    if inst.opname == "LOAD_CONST":
-        if not isinstance(inst.argval, str):
-            return False
-        self.LOAD_CONST(inst)
-        has_error_msg = True
-
-        # if it is LOAD_CONSTANT, it must be followed by CALL_FUNCTION
-        current_instruction_pointer += 1
-        if current_instruction_pointer >= len(self.instructions):
-            return False
-        inst = self.instructions[current_instruction_pointer]
-        if inst.opname != "CALL_FUNCTION":
-            return False
-
-        # CALL_FUNCTION should be followed by RAISE_VARARGS
-        current_instruction_pointer += 1
-        if current_instruction_pointer >= len(self.instructions):
-            return False
-        inst = self.instructions[current_instruction_pointer]
-
-    if inst.opname != "RAISE_VARARGS":
-        return False
-
-    if not has_error_msg:
-        # Push dummy value instead of error message
-        self.push(ConstantVariable("assertion error"))
-
-    return True
-
-
 def generic_jump(truth_fn: typing.Callable, push: bool):
     def inner(self: "InstructionTranslatorBase", inst: Instruction):
         value: VariableTracker = self.pop()
         self.output.guards.update(value.guards)
-        if (
-            config.rewrite_assert_with_torch_assert
-            and _detect_and_normalize_assert_statement(self, truth_fn, push)
-        ):
-            error_msg: VariableTracker = self.pop()
-            self.output.guards.update(error_msg.guards)
-            # Skip over things like `assert True`
-            if value.is_python_constant() and bool(value.as_python_constant()):
-                self.jump(inst)
-                return
-
-            # Manually insert torch._assert instead of python assert and jump over
-            # assert related instructions as we don't need them anymore.
-            self.output.create_proxy(
-                "call_function",
-                torch._assert,
-                *proxy_args_kwargs((value, error_msg), {}),
-                current_tx=self,
-            )
-            self.jump(inst)
-            return
-
         if value.is_python_constant():
             if truth_fn(value.as_python_constant()):
                 push and self.push(value)

From 95b541dc8babe203ab2039599fecdd76b48dfe63 Mon Sep 17 00:00:00 2001
From: Jiawen Liu <jiawenl@meta.com>
Date: Wed, 16 Nov 2022 10:37:26 +0000
Subject: [PATCH 0956/1922] [Inductor] Build FX Linear + Permute Vertical
 Fusion in Inductor (#89118)

Summary:
Build fx-based linear/matmul/bmm + permute/transpose vertical fusion in Inductor

For an internal Ads model: **1.15x -> 1.36x speedup**

Test Plan: CI

Reviewed By: bertmaher, jansel, jianyuh

Differential Revision: D41071665

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89118
Approved by: https://github.com/jianyuh
---
 test/inductor/test_torchinductor.py | 109 +++++++++++++++
 torch/_inductor/config.py           |   4 +
 torch/_inductor/overrides.py        | 206 +++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 3 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index dcb01b9ec78c1..1265ca3e78728 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -10,6 +10,7 @@
 import typing
 import unittest
 import weakref
+from typing import Any, Callable
 from unittest.mock import patch
 
 import torch
@@ -18,6 +19,7 @@
 from torch._dynamo.debug_utils import same_two_models
 from torch._dynamo.testing import rand_strided, same
 from torch.fx.experimental.proxy_tensor import make_fx
+from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.testing._internal.common_utils import (
     TEST_WITH_ASAN,
@@ -39,6 +41,14 @@
     from torch._inductor import codecache, config, metrics
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
+    from torch._inductor.overrides import (
+        linear_permute_fusion,
+        linear_transpose,
+        permute_linear_fusion,
+        permute_matmul_fusion,
+        transpose_linear,
+        transpose_matmul,
+    )
     from torch._inductor.sizevars import SizeVarAllocator
     from torch._inductor.utils import has_torchvision_roi_align, timed
 
@@ -113,6 +123,29 @@ def maybe_test(*args, **kwargs):
     return wrap_test
 
 
+PassFunc = Callable[[torch.fx.GraphModule, Any], torch.fx.GraphModule]
+
+
+def chain_passes(*passes: PassFunc) -> PassFunc:
+    def parent_pass(module: torch.fx.GraphModule, input: Any) -> torch.fx.GraphModule:
+        for pass_ in passes:
+            if isinstance(module, torch.fx.GraphModule):
+                ShapeProp(module).propagate(*input)
+            module = pass_(module)
+        return module
+
+    return parent_pass
+
+
+def count_call_function(module: torch.fx.GraphModule, target_op: Any) -> int:
+    return sum(
+        [
+            1 if (n.op == "call_function" and n.target == target_op) else 0
+            for n in module.graph.nodes
+        ]
+    )
+
+
 class TestCase(TorchTestCase):
     @classmethod
     def setUpClass(cls):
@@ -1586,6 +1619,82 @@ def fn(a, b):
         y = torch.tensor(0)
         self.assertEqual(fn(x, y), x + x)
 
+    @unittest.skipIf(HAS_CPU, "Support GPU so far")
+    def test_linear_permute_fusion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, k: int, n: int):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.randn(n, k))
+                self.bias = torch.nn.Parameter(torch.randn(n))
+
+            def forward(self, input: torch.Tensor):
+                a0 = torch.nn.functional.linear(input, self.weight, self.bias)
+                b0 = a0.permute(0, 2, 1)
+                return b0
+
+        m, k, n = 16, 8, 4
+        trace_func = chain_passes(torch.fx.symbolic_trace, linear_permute_fusion)
+        module = TestModule(k, n).eval()
+        input = torch.randn(6, m, k)
+        traced = trace_func(module, [input])
+        num_linear = count_call_function(traced, torch.nn.functional.linear)
+        num_linear_transpose = count_call_function(traced, linear_transpose)
+        self.assertEqual(num_linear, 0)
+        self.assertEqual(num_linear_transpose, 1)
+
+        self.assertTrue(torch.allclose(module(input), traced(input)))
+
+    @unittest.skipIf(HAS_CPU, "Support GPU so far")
+    def test_permute_linear_fusion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, k: int, n: int):
+                super().__init__()
+                self.weight = torch.nn.Parameter(torch.randn(n, k))
+                self.bias = torch.nn.Parameter(torch.randn(n))
+
+            def forward(self, input: torch.Tensor):
+                input1 = input.permute(0, 2, 1)
+                output = torch.nn.functional.linear(input1, self.weight, self.bias)
+                return output
+
+        m, k, n = 16, 8, 4
+
+        trace_func = chain_passes(torch.fx.symbolic_trace, permute_linear_fusion)
+        module = TestModule(k, n).eval()
+        input = torch.randn(6, k, m)
+        traced = trace_func(module, [input])
+        num_linear = count_call_function(traced, torch.nn.functional.linear)
+        num_transpose_linear = count_call_function(traced, transpose_linear)
+        self.assertEqual(num_linear, 0)
+        self.assertEqual(num_transpose_linear, 1)
+
+        self.assertTrue(torch.allclose(module(input), traced(input)))
+
+    @unittest.skipIf(HAS_CPU, "Support GPU so far")
+    def test_permute_bmm_fusion(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self, batch: int, k: int, n: int):
+                super().__init__()
+                self.other = torch.randn(batch, k, n)
+
+            def forward(self, input: torch.Tensor):
+                input1 = input.permute(0, 2, 1)
+                output = torch.bmm(input1, self.other)
+                return output
+
+        batch, m, k, n = 6, 16, 8, 4
+
+        trace_func = chain_passes(torch.fx.symbolic_trace, permute_matmul_fusion)
+        module = TestModule(batch, k, n).eval()
+        input = torch.randn(batch, k, m)
+        traced = trace_func(module, [input])
+        num_bmm = count_call_function(traced, torch.bmm)
+        num_transpose_matmul = count_call_function(traced, transpose_matmul)
+        self.assertEqual(num_bmm, 0)
+        self.assertEqual(num_transpose_matmul, 1)
+
+        self.assertTrue(torch.allclose(module(input), traced(input)))
+
     def test_slice1(self):
         def fn(a):
             return (
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index d376fe3e8bf7f..c552101c1caee 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -75,6 +75,10 @@
 shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "0") == "1"
 alignment_size = 4
 
+# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
+permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"
+
+
 # config specific to codegen/cpp.pp
 class cpp:
     # set to torch.get_num_threads()
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 3a95aa7ce8807..9a8bc6266ac01 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -19,6 +19,8 @@
 from torch.nn.utils.fusion import fuse_conv_bn_eval
 from torch.overrides import TorchFunctionMode
 
+from . import config
+
 log = logging.getLogger(__name__)
 
 
@@ -425,14 +427,23 @@ def check_node_is_add_inplace(node):
 
 
 def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
+    is_cpu = all(
+        example_input.device == torch.device("cpu") for example_input in example_inputs
+    )
+
+    if config.permute_fusion and not is_cpu:
+        # For linear permute fusion, we need to check input info to identify
+        # and perform proper permutation/transpose
+        ShapeProp(gm).propagate(*example_inputs)
+        gm = linear_permute_fusion(gm)
+        gm = permute_linear_fusion(gm)
+        gm = permute_matmul_fusion(gm)
+
     # make sure the autograd is disabled.
     if torch.is_grad_enabled():
         return gm
     if not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()):
         return gm
-    is_cpu = all(
-        example_input.device == torch.device("cpu") for example_input in example_inputs
-    )
     if not is_cpu:
         return gm
     gm = fuse_conv_bn(gm)
@@ -528,6 +539,195 @@ def _philox_rand_like(input, seed, offset):
     return torch.rand_like(input)
 
 
+class NormalizedLinearNode:
+    def __init__(self, node: torch.fx.Node) -> None:
+        assert node.op == "call_function"
+        assert node.target in [torch.nn.functional.linear]
+        self.node: torch.fx.Node = node
+
+    def get_input(self) -> torch.fx.Node:
+        if len(self.node.args) > 0:
+            return self.node.args[0]
+        else:
+            return self.node.kwargs["input"]
+
+    def get_weight(self) -> torch.fx.Node:
+        if len(self.node.args) > 1:
+            return self.node.args[1]
+        else:
+            return self.node.kwargs["weight"]
+
+    def get_bias(self) -> torch.fx.Node:
+        if len(self.node.args) > 2:
+            return self.node.args[2]
+        else:
+            return self.node.kwargs["bias"]
+
+
+class NormalizedMatmulNode:
+    def __init__(self, node: torch.fx.Node) -> None:
+        assert node.op == "call_function"
+        assert node.target in [torch.bmm, torch.matmul]
+        self.node: torch.fx.Node = node
+
+    def get_input(self) -> torch.fx.Node:
+        if len(self.node.args) > 0:
+            return self.node.args[0]
+        else:
+            return self.node.kwargs["input"]
+
+    def get_other(self) -> torch.fx.Node:
+        if len(self.node.args) > 1:
+            return self.node.args[1]
+        else:
+            return self.node.kwargs["other"]
+
+
+def check_permute(node: torch.fx.Node):
+    ranks = len(node.meta["tensor_meta"].shape)
+    if len(node.args) > 3:
+        permutation = [node.args[i] % ranks for i in range(1, ranks + 1)]
+    elif (
+        "permutation" in node.kwargs
+        and node.kwargs["permutation"] is not None
+        and len(node.kwargs["permutation"]) > 2
+    ):
+        permutation = [i % ranks for i in node.kwargs["permutation"]]
+    else:
+        return False
+    allowed_permutation = list(range(ranks))
+    allowed_permutation[-1] = ranks - 2
+    allowed_permutation[-2] = ranks - 1
+    return permutation == allowed_permutation
+
+
+def linear_permute_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for node in module.graph.nodes:
+        if (
+            node.op == "call_method"
+            and node.target == "permute"
+            and check_permute(node)
+        ):
+            if len(node.args) > 0:
+                input_node = node.args[0]
+            else:
+                input_node = node.kwargs["input"]
+            if (
+                input_node.op == "call_function"
+                and input_node.target == torch.nn.functional.linear
+            ):
+                normalized = NormalizedLinearNode(input_node)
+                input = normalized.get_input()
+                weight = normalized.get_weight()
+                bias = normalized.get_bias()
+                with module.graph.inserting_before(node):
+                    fused_node = module.graph.call_function(
+                        linear_transpose, args=(input, weight, bias)
+                    )
+                    node.replace_all_uses_with(fused_node)
+
+    module.graph.lint()
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+# Y1 = X * W^T + bias
+# Y2 = Y1.permute(0, 2, 1)
+# ---->
+# Y2 = (W * X^T + bias.unsqueeze(-1))^T
+def linear_transpose(
+    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    return torch.matmul(weight, input.transpose(-1, -2)) + bias.unsqueeze(-1)
+
+
+def permute_linear_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for node in module.graph.nodes:
+        if node.op == "call_function" and node.target == torch.nn.functional.linear:
+            if len(node.args) > 0:
+                input_node = node.args[0]
+            else:
+                input_node = node.kwargs["input"]
+            if (
+                input_node.op == "call_method"
+                and input_node.target == "permute"
+                and check_permute(input_node)
+            ):
+                normalized = NormalizedLinearNode(node)
+                if len(input_node.args) > 0:
+                    input = input_node.args[0]
+                else:
+                    input = input_node.kwargs["input"]
+                weight = normalized.get_weight()
+                bias = normalized.get_bias()
+                with module.graph.inserting_before(node):
+                    fused_node = module.graph.call_function(
+                        transpose_linear, args=(input, weight, bias)
+                    )
+                    node.replace_all_uses_with(fused_node)
+
+    module.graph.lint()
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+def permute_matmul_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for node in module.graph.nodes:
+        if node.op == "call_function" and (
+            node.target == torch.bmm or node.target == torch.matmul
+        ):
+            normalized = NormalizedMatmulNode(node)
+            A = normalized.get_input()
+            B = normalized.get_other()
+            Atrans = Btrans = False
+            if A.op == "call_method" and A.target == "permute" and check_permute(A):
+                Atrans = True
+                if len(A.args) > 0:
+                    A = A.args[0]
+                else:
+                    A = A.kwargs["input"]
+
+            if B.op == "call_method" and B.target == "permute" and check_permute(B):
+                Btrans = True
+                if len(B.args) > 0:
+                    B = B.args[0]
+                else:
+                    B = B.kwargs["input"]
+
+            if Atrans or Btrans:
+                with module.graph.inserting_before(node):
+                    fused_node = module.graph.call_function(
+                        transpose_matmul,
+                        args=(A, B, Atrans, Btrans),
+                    )
+                node.replace_all_uses_with(fused_node)
+
+    module.graph.lint()
+    module.graph.eliminate_dead_code()
+    module.recompile()
+    return module
+
+
+# X1 = X.permute(0, 2, 1)
+# Y1 = X1 * W1^T + bias1
+# ---->
+# Y2 = X1.transpose(-1, -2) * W1^T + bias1
+def transpose_linear(
+    input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor
+) -> torch.Tensor:
+    return torch.matmul(input.transpose(-1, -2), weight.t()) + bias
+
+
+def transpose_matmul(A: torch.Tensor, B: torch.Tensor, Atrans: bool, Btrans: bool):
+    if Atrans:
+        A = A.transpose(-1, -2)
+    if Btrans:
+        B = B.transpose(-1, -2)
+    return torch.matmul(A, B)
+
+
 def replace_and_fuse_for_binary(
     computation_node, node, fuse_func, attr, modules, index_node, index_pointwise
 ):

From 327618c61480e7439ef5e5ac16de3bb8dec5e82d Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Tue, 15 Nov 2022 19:24:31 +0000
Subject: [PATCH 0957/1922] Add meta impl for grid_sampler_2d_backward (#88745)

TODO: add an OpInfo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88745
Approved by: https://github.com/ezyang
---
 test/functorch/test_aotdispatch.py            |  6 +---
 test/functorch/test_ops.py                    |  2 ++
 test/inductor/test_torchinductor_opinfo.py    |  1 +
 test/test_proxy_tensor.py                     |  8 ++---
 torch/_meta_registrations.py                  | 27 ++++++++++++++++
 .../_internal/common_methods_invocations.py   | 31 +++++++++++++++++++
 6 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 752b03ac9984f..1dc5476158f96 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1128,7 +1128,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.embedding_bag', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.fractional_max_pool2d', ''),  # rand() received an invalid combination of arguments - g...
     xfail('nn.functional.fractional_max_pool3d', ''),  # rand() received an invalid combination of arguments - g...
-    xfail('nn.functional.grid_sample', ''),  # prims::arange() Expected a value of type 'number' for argument...
+    xfail('nn.functional.grid_sample', ''),  # RuntimeError: aten.grid_sampler_3d.default - couldn't find sym ...
     xfail('nn.functional.group_norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.hinge_embedding_loss', ''),  # aten.zeros_like.default - couldn't find symbolic meta...
     xfail('nn.functional.interpolate', 'area'),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1182,10 +1182,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('repeat_interleave', ''),  # aten.repeat_interleave.Te...
     xfail('reshape_as', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('roll', ''),  # narrow() received an invalid combination of arguments - got (FakeTensor, int, torch._C...
-    xfail('round', ''),  # aten.round.default - couldn't find symbolic meta function/decomposition
-    xfail('round', 'decimals_0'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
-    xfail('round', 'decimals_3'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
-    xfail('round', 'decimals_neg_3'),  # aten.round.decimals - couldn't find symbolic meta function/decompos...
     xfail('segment_reduce', 'lengths'),  # aten.segment_reduce.default - couldn't find symbolic meta functio...
     xfail('segment_reduce', 'offsets'),  # aten.segment_reduce.default - couldn't find symbolic meta functio...
     xfail('sgn', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 643ff0ec862a2..91ea2443777b8 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -1204,6 +1204,7 @@ def get_vjp(cotangents, *primals):
         xfail('logcumsumexp', ''),  # NYI: forward-AD for logcumsumexp
         xfail('nn.functional.embedding_bag', ''),  # NYI: forward-AD for _embedding_bag
         xfail('nn.functional.grid_sample', ''),  # NYI: forward AD for grid_sampler_2d
+        xfail('grid_sampler_2d', ''),  # NYI: forward AD for grid_sampler_2d
         xfail('nn.functional.hardsigmoid', ''),  # NYI: forward AD for hardsigmoid_backward
         xfail('nn.functional.huber_loss', ''),  # NYI: forward AD for huber_loss_backward
         xfail('nn.functional.logsigmoid', ''),  # not differentiable w.r.t. buffer
@@ -1343,6 +1344,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         xfail('nn.functional.fractional_max_pool3d'),  # calls random op
         xfail('nn.functional.gaussian_nll_loss'),  # data depenedant flow
         xfail('nn.functional.grid_sample'),  # Forward AD not implemented and no decomposition
+        xfail('grid_sampler_2d'),  # Forward AD not implemented and no decomposition
         xfail('nn.functional.hardsigmoid'),  # Forward AD not implemented and no decomposition
         xfail('nn.functional.hinge_embedding_loss'),  # vmap: inplace into a regular tensor
         xfail('nn.functional.huber_loss'),  # Forward AD not implemented and no decomposition
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 83d8d40e21ecf..7db9d13733b4d 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -333,6 +333,7 @@ def process(device_type):
     "nn.functional.adaptive_avg_pool2d": {f16},
     "nn.functional.ctc_loss": {f32, f64},
     "nn.functional.grid_sample": {f16},
+    "grid_sampler_2d": {f16},
     "nn.functional.gaussian_nll_loss": {f16, f32, f64},
     "nn.functional.one_hot": {i64},
     "nn.functional.rrelu": {f16, f32, f64},
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 59b08eea8dce8..8dc42be7fdfb2 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1308,10 +1308,6 @@ def f(a, b, c, d, e):
     xfail('resize_', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('resize_as_', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('roll', ''),  # Tensors of type TensorImpl do not have numel
-    xfail('round', ''),  # aten.round.default - couldn't find symbolic meta function/decomposition
-    xfail('round', 'decimals_0'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
-    xfail('round', 'decimals_3'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
-    xfail('round', 'decimals_neg_3'),  # aten.round.decimals - couldn't find symbolic meta function/decomposition
     xfail('searchsorted', ''),  # Could not run 'aten::searchsorted.Tensor' with arguments from the 'Meta' backend. ...
     xfail('segment_reduce', 'offsets'),  # aten.segment_reduce.default - couldn't find symbolic meta function/decomposition
     xfail('special.airy_ai', ''),  # aten.special_airy_ai.default - couldn't find symbolic meta function/decomposition
@@ -1441,6 +1437,10 @@ def f(a, b, c, d, e):
     xfail('uniform', ''),  # aten.uniform_.default - couldn't find symbolic meta function/decomposition
     xfail('unique', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
     xfail('xlogy', ''),  # aten.xlogy_.Tensor - couldn't find symbolic meta function/decomposition
+    xfail('round', ''),  # aten.round_.default - couldn't find symbolic meta function/decomposition
+    xfail('round', 'decimals_0'),  # aten.round_.decimals - couldn't find symbolic meta function/decomposition
+    xfail('round', 'decimals_3'),  # aten.round_.decimals - couldn't find symbolic meta function/decomposition
+    xfail('round', 'decimals_neg_3')  # aten.round_.decimals - couldn't find symbolic meta function/decomposition
 }
 
 # Copies inputs to inplace operations to avoid inplace modifications
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index be7370e344f04..4fa3ab09d2755 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -6,6 +6,7 @@
 from torch import Tensor
 from torch._decomp import _add_op_to_registry, global_decomposition_table, meta_table
 from torch._ops import OpOverload
+from torch._prims import _elementwise_meta, ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND
 from torch._prims_common import (
     check,
     corresponding_complex_dtype,
@@ -1166,6 +1167,13 @@ def meta_binop_inplace_alpha(self, other, alpha=1):
     return self
 
 
+@register_meta([aten.round.default, aten.round.decimals])
+def meta_round(self, **kwargs):
+    return _elementwise_meta(
+        self, type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT
+    )
+
+
 @register_meta(aten.zero.default)
 def meta_zero(self):
     return self.new_empty(self.shape)
@@ -1474,6 +1482,25 @@ def meta_max_pool2d_with_indices(
     )
 
 
+@register_meta(aten.grid_sampler_2d_backward.default)
+def grid_sampler_2d_backward_meta(
+    grad_output,
+    input,
+    grid,
+    interpolation_mode,
+    padding_mode,
+    align_corners,
+    output_mask,
+):
+    input_requires_grad = output_mask[0]
+    if input_requires_grad:
+        grad_input = torch.zeros_like(input, memory_format=torch.contiguous_format)
+    else:
+        grad_input = None
+    grad_grid = torch.empty_like(grid, memory_format=torch.contiguous_format)
+    return (grad_input, grad_grid)
+
+
 @register_meta([aten.full.default])
 def full(size, fill_value, *args, **kwargs):
     return torch.empty(size, *args, **kwargs)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 5e60eff2865e4..e498e4f285092 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -6969,6 +6969,28 @@ def sample_inputs_grid_sample(op_info, device, dtype, requires_grad, **kwargs):
                 align_corners=align_corners,
             )
 
+def sample_inputs_grid_sampler_2d(op_info, device, dtype, requires_grad, **kwargs):
+    # We get better tests if we change the range of the values to something like [-2,2]
+    # because for grid (second tensor argument) the "useful" range is [-1,1] and this way
+    # you get a better combination of out-of-range and in-range test cases
+    _make_tensor = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad,
+                           low=-2, high=2)
+
+    batch_size = 2
+    num_channels = 3
+    modes = (0, 1, 2)
+    align_cornerss = (False, True)
+    padding_modes = (0, 1, 2)
+
+    for mode, padding_mode, align_corners in itertools.product(modes, padding_modes, align_cornerss):
+        yield SampleInput(
+            _make_tensor((batch_size, num_channels, S, L)),
+            _make_tensor((batch_size, num_channels, M, 2)),
+            mode,
+            padding_mode,
+            align_corners,
+        )
+
 def sample_inputs_cosine_embedding_loss(op_info, device, dtype, requires_grad, **kwargs):
     make_input = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
 
@@ -16190,6 +16212,15 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         sample_inputs_func=sample_inputs_grid_sample,
         supports_gradgrad=False,
         gradcheck_nondet_tol=1e-15),
+    # TODO: delete this OpInfo once we add meta support for grid_sampler_3d
+    OpInfo(
+        "grid_sampler_2d",
+        dtypes=floating_types(),
+        dtypesIfCUDA=floating_types_and(torch.float16),
+        supports_out=False,
+        sample_inputs_func=sample_inputs_grid_sampler_2d,
+        supports_gradgrad=False,
+        gradcheck_nondet_tol=1e-15),
     OpInfo(
         "argwhere",
         ref=np.argwhere,

From 75dbce33496cd96a1d0d1c1bc05f2a8c3a552be4 Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Wed, 16 Nov 2022 11:25:35 +0100
Subject: [PATCH 0958/1922] Bug fix: make sure `copy_impl` doesn't read out of
 bounds (#88544)

Fixes #88543.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88544
Approved by: https://github.com/lezcano
---
 aten/src/ATen/native/Copy.cpp |  7 +++++-
 test/test_torch.py            | 45 +++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
index c6b82426d3bf6..dc30db8e11001 100644
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@@ -124,12 +124,17 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
   // 1. Memory Format for source and destination tensors is contiguous.
   // 2. Device for both the source and destination tensor is CPU.
   // 3. dtype conversion between FP32->FP16 and FP16->FP32.
+  // This checks that self.sizes() == src.sizes() because this code path doesn't
+  // support broadcasting. This also guards against out of bounds memory access
+  // when copying, see fbgemm::Float16ToFloat_ref.
+  // https://github.com/pytorch/pytorch/issues/88543
   #ifdef USE_FBGEMM
     if (((self.dtype() == at::kFloat && src.dtype() == at::kHalf) ||
          (self.dtype() == at::kHalf && src.dtype() == at::kFloat)) &&
         (self.device().is_cpu() && src.device().is_cpu()) &&
         ((self.is_contiguous() && src.is_contiguous()) ||
-         (self.is_non_overlapping_and_dense() && self.strides() == src.strides()))) {
+         (self.is_non_overlapping_and_dense() && self.strides() == src.strides())) &&
+        (self.sizes() == src.sizes())) {
       if (src.dtype() == at::kFloat && self.dtype() == at::kHalf) {
         auto* output_ptr =
             reinterpret_cast<fbgemm::float16*>(self.data_ptr<at::Half>());
diff --git a/test/test_torch.py b/test/test_torch.py
index 3ebc92676fe02..31759213ecefc 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -7661,6 +7661,51 @@ def test_copy_many_to_one(self):
         # storage to a single storage would cause RuntimeError to be thrown
         self.assertRaises(RuntimeError, lambda: torch.zeros(1, 6).expand(5, 6).copy_(torch.zeros(5, 6)))
 
+    def test_copy_float16(self):
+        # Check that fbgemm code no longer reads memory out of bounds, see
+        # copy_impl and fbgemm::Float16ToFloat_ref.
+        # https://github.com/pytorch/pytorch/issues/88543
+
+        # Types to test different code paths in copy_impl.
+        dtypes = (
+            # out_dtype, src_dtype
+            (torch.float32, torch.float16),  # fbgemm
+            (torch.float16, torch.float32),  # fbgemm
+            (torch.float32, torch.float32),  # TensorIterator
+        )
+
+        cases = (
+            # out_shape, src_shape, is_ok
+            # These cases used to crash with fbgemm, make sure these also raise
+            # exceptions with TensorIterator.
+            ((1, 2, 3), (0, 2, 3), False),  # same strides, not allowed by TI
+            ((1, 5, 6), (4, 5, 6), False),  # same strides, not allowed by TI
+            (1, (0, 2, 3), False),  # different strides
+            ((4, 5, 6), (0, 2, 3), False),  # different strides
+            ((4, 5, 6), (1, 2, 3), False),  # different strides
+            ((4, 5, 6), (6, 5, 4), False),  # same numel
+
+            # These cases should pass with fbgemm and TensorIterator.
+            ((4, 5, 6), (1, 5, 6), True),  # same strides
+            ((4, 5, 6), (4, 5, 6), True),  # same strides
+            ((0, 2, 3), 1, True),  # different strides, allowed by TI
+            ((4, 5, 6), (4, 5, 1), True),  # different strides, allowed by TI
+        )
+
+        for (out_shape, src_shape, is_ok), (out_dtype, src_dtype) in itertools.product(cases, dtypes):
+            out = torch.zeros(out_shape, dtype=out_dtype, device=torch.device('cpu'))
+            src = torch.ones(src_shape, dtype=src_dtype, device=torch.device('cpu'))
+            if is_ok:
+                if torch.cuda.is_available():
+                    out_cuda = out.cuda()
+                    src_cuda = src.cuda()
+                res = out.copy_(src)
+                if torch.cuda.is_available():
+                    res_cuda = out_cuda.copy_(src_cuda)
+                    self.assertEqual(res, res_cuda)
+            else:
+                self.assertRaises(RuntimeError, lambda: out.copy_(src))
+
     # FIXME: Port to a more appropriate test suite
     def _test_to_with_layout(self, layout):
         def test_copy_behavior(t, non_blocking=False):

From c9f45bbee9a639d8c1743ba2ae763c893de09581 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 15 Nov 2022 10:10:27 -0800
Subject: [PATCH 0959/1922] SymIntArrayRef type caster (#89074)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89074
Approved by: https://github.com/SherlockNoMad
---
 torch/csrc/utils.cpp      | 42 +++++++++++++++++++++++++++++++++++++++
 torch/csrc/utils/pybind.h | 16 +++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/torch/csrc/utils.cpp b/torch/csrc/utils.cpp
index b2eac4b54fa1b..5fc91d68dd180 100644
--- a/torch/csrc/utils.cpp
+++ b/torch/csrc/utils.cpp
@@ -4,6 +4,7 @@
 #include <torch/csrc/python_headers.h>
 #include <torch/csrc/utils/invalid_arguments.h>
 #include <torch/csrc/utils/python_strings.h>
+#include <torch/csrc/utils/python_symnode.h>
 #include <torch/csrc/utils/python_tuples.h>
 
 #include <torch/csrc/Export.h>
@@ -348,5 +349,46 @@ handle type_caster<at::IntArrayRef>::cast(
   return handle(THPUtils_packInt64Array(src.size(), src.data()));
 }
 
+bool type_caster<at::SymIntArrayRef>::load(handle src, bool) {
+  PyObject* source = src.ptr();
+
+  auto tuple = PyTuple_Check(source);
+  if (tuple || PyList_Check(source)) {
+    // NOLINTNEXTLINE(bugprone-branch-clone)
+    const auto size =
+        tuple ? PyTuple_GET_SIZE(source) : PyList_GET_SIZE(source);
+    v_value.resize(size);
+    for (const auto idx : c10::irange(size)) {
+      PyObject* obj =
+          tuple ? PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx);
+
+      if (THPVariable_Check(obj)) {
+        // TODO: this is for consistency with IntArrayRef but arguably
+        // we shouldn't really allow this on pybind11 casters
+        v_value[idx] = THPVariable_Unpack(obj).item<int64_t>();
+      } else if (torch::is_symint(py::handle(obj))) {
+        v_value[idx] = py::handle(obj).cast<c10::SymInt>();
+      } else if (PyLong_Check(obj)) {
+        v_value[idx] = c10::SymInt(THPUtils_unpackIndex(obj));
+      } else {
+        return false;
+      }
+    }
+    value = v_value;
+    return true;
+  }
+  return false;
+}
+handle type_caster<at::SymIntArrayRef>::cast(
+    at::SymIntArrayRef src,
+    return_value_policy /* policy */,
+    handle /* parent */) {
+  py::list t(src.size());
+  for (const auto i : c10::irange(src.size())) {
+    t[i] = py::cast(src[i]);
+  }
+  return t.release();
+}
+
 } // namespace detail
 } // namespace pybind11
diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h
index c43cf5e732832..85532a42cee26 100644
--- a/torch/csrc/utils/pybind.h
+++ b/torch/csrc/utils/pybind.h
@@ -109,6 +109,22 @@ struct TORCH_PYTHON_API type_caster<at::IntArrayRef> {
   std::vector<int64_t> v_value;
 };
 
+template <>
+struct TORCH_PYTHON_API type_caster<at::SymIntArrayRef> {
+ public:
+  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
+  PYBIND11_TYPE_CASTER(at::SymIntArrayRef, _("at::SymIntArrayRef"));
+
+  bool load(handle src, bool);
+  static handle cast(
+      at::SymIntArrayRef src,
+      return_value_policy /* policy */,
+      handle /* parent */);
+
+ private:
+  std::vector<c10::SymInt> v_value;
+};
+
 template <>
 struct TORCH_PYTHON_API type_caster<at::MemoryFormat> {
  public:

From bf91cecb7efcdd659e12818380a62f5b2ac8150d Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 15 Nov 2022 10:10:28 -0800
Subject: [PATCH 0960/1922] SymIntify convolution backend calculation (#89069)

We will need this to implement a convolution meta function that
is SymInt aware.  I use templates so that regular convolution code
is not affected by the change.  No tests for symbolic ints directly; that will
come in a subsequent PR which also needs to refactor fake tensors.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89069
Approved by: https://github.com/SherlockNoMad
---
 aten/src/ATen/native/ConvUtils.h        |  79 ++++--
 aten/src/ATen/native/Convolution.cpp    | 319 +++++++++++++-----------
 aten/src/ATen/native/utils/ParamUtils.h |   7 +-
 c10/core/SymInt.h                       |  13 +
 torch/csrc/Module.cpp                   |  12 +-
 5 files changed, 256 insertions(+), 174 deletions(-)

diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
index b8e2b0842a002..880ce0c2af54a 100644
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@@ -110,8 +110,8 @@ enum class ConvBackend {
 // This overload is exposed to python for testing, etc.
 TORCH_API ConvBackend select_conv_backend(
     const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt,
-    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation,
-    bool transposed, IntArrayRef output_padding, int64_t groups, const at::OptionalIntArrayRef bias_sizes_opt);
+    IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation,
+    bool transposed, SymIntArrayRef output_padding, int64_t groups, const at::OptionalSymIntArrayRef bias_sizes_opt);
 
 TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input,
     const Tensor& weight,
@@ -200,15 +200,16 @@ static void convolution_shape_check(
 // as conv_output_size loses information; this is why conv_input_size
 // takes an extra output_padding argument to resolve the ambiguity.
 
-static inline std::vector<int64_t> conv_output_size(
-    IntArrayRef input_size, IntArrayRef weight_size,
-    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
+template <typename T>
+static inline std::vector<T> _conv_output_size(
+    ArrayRef<T> input_size, ArrayRef<T> weight_size,
+    ArrayRef<T> padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
 ) {
   // ASSERT(input_size.size() > 2)
   // ASSERT(input_size.size() == weight_size.size())
   bool has_dilation = dilation.size() > 0;
   auto dim = input_size.size();
-  std::vector<int64_t> output_size(dim);
+  std::vector<T> output_size(dim);
   output_size[0] = input_size[input_batch_size_dim];
   output_size[1] = weight_size[weight_output_channels_dim];
   for (const auto d : c10::irange(2, dim)) {
@@ -219,40 +220,84 @@ static inline std::vector<int64_t> conv_output_size(
   return output_size;
 }
 
-static inline std::vector<int64_t> conv_input_size(
-    IntArrayRef output_size, IntArrayRef weight_size,
-    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+static inline std::vector<int64_t> conv_output_size(
+    IntArrayRef input_size, IntArrayRef weight_size,
+    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
+) {
+  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
+}
+
+static inline std::vector<c10::SymInt> conv_output_size(
+    SymIntArrayRef input_size, SymIntArrayRef weight_size,
+    SymIntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
+) {
+  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
+}
+
+template <typename T>
+std::vector<T> _conv_input_size(
+    ArrayRef<T> output_size, ArrayRef<T> weight_size,
+    ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
 ) {
   // ASSERT(output_size.size() > 2)
   // ASSERT(output_size.size() == weight_size.size())
   auto dim = output_size.size();
-  std::vector<int64_t> input_size(dim);
+  std::vector<T> input_size(dim);
   input_size[0] = output_size[output_batch_size_dim];
   input_size[1] = weight_size[weight_input_channels_dim] * groups;
   for (const auto d : c10::irange(2, dim)) {
-    int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
-    input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) +
+    auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1;
+    input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) +
                      kernel + output_padding[d - 2];
   }
   return input_size;
 }
 
-static inline std::vector<int64_t> conv_weight_size(
-    IntArrayRef input_size, IntArrayRef output_size,
+static inline std::vector<c10::SymInt> conv_input_size(
+    SymIntArrayRef output_size, SymIntArrayRef weight_size,
+    SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
+}
+
+static inline std::vector<int64_t> conv_input_size(
+    IntArrayRef output_size, IntArrayRef weight_size,
     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
+}
+
+template <typename T>
+std::vector<T> _conv_weight_size(
+    ArrayRef<T> input_size, ArrayRef<T> output_size,
+    ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
 ) {
   auto dim = input_size.size();
-  std::vector<int64_t> weight_size(dim);
+  std::vector<T> weight_size(dim);
   weight_size[0] = output_size[1];
   weight_size[1] = input_size[1] / groups;
   for (const auto d : c10::irange(2, dim)) {
-    int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
-               + 2 * padding[d - 2] - output_padding[d - 2];
+    auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
+               + padding[d - 2] * 2 - output_padding[d - 2];
     weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
   }
   return weight_size;
 }
 
+static inline std::vector<c10::SymInt> conv_weight_size(
+    SymIntArrayRef input_size, SymIntArrayRef output_size,
+    SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
+}
+
+static inline std::vector<int64_t> conv_weight_size(
+    IntArrayRef input_size, IntArrayRef output_size,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
+}
+
 static inline Tensor reshape_bias(int64_t dim, const Tensor& bias) {
   std::vector<int64_t> shape(dim, 1);
   shape[1] = -1;
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 29b2ce804c806..bf7017f20a4fd 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -83,10 +83,11 @@ constexpr int MIOPEN_DIM_MAX = 5;
 namespace at { namespace native {
 
 // Check workload to activate fast depthwise FP16 cudnn conv kernels
+template <typename T>
 bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) {
-  int w = input.size(3);  // same as h
-  int ch = input.size(1);
-  int bs = input.size(0);
+  auto w = at::symint::size<T>(input, 3);  // same as h
+  auto ch = at::symint::size<T>(input, 1);
+  auto bs = at::symint::size<T>(input, 0);
   if (stride==1) {
     if (w >= 7) {
       // All batch sizes and nb_channels
@@ -205,27 +206,28 @@ bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) {
 }
 
 // simplified version for cudnn 8.2 and above
+template <typename T>
 bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int stride, const at::Tensor& weight) {
   // 1D conv
-  if(input.size(2) == 1 && stride == 1){
+  if(at::symint::size<T>(input, 2) == 1 && stride == 1){
     return true;
   }
 
   // 2d conv
   // only square filters
-  if (weight.size(2) != weight.size(3)) return false;
-  int filter = weight.size(3);
+  if (at::symint::size<T>(weight, 2) != at::symint::size<T>(weight, 3)) return false;
+  auto filter = at::symint::size<T>(weight, 3);
   // only 1/3/5 filter
   if (filter != 1 && filter != 3 && filter != 5) return false;
   // we don't enforce square input but only check width to reduce heuristic space
-  if (input.size(3) < 7) return false; // min width 7
-  int w = input.size(3);
+  if (at::symint::size<T>(input, 3) < 7) return false; // min width 7
+  auto w = at::symint::size<T>(input, 3);
   // only 1/2 stride, use cudnn for all stride 1
   if (stride == 1) return true;
   if (stride != 2) return false;
 
-  int ch = input.size(1);
-  int bs = input.size(0);
+  auto ch = at::symint::size<T>(input, 1);
+  auto bs = at::symint::size<T>(input, 0);
   // special case since bs1 show good perf in lots of cases
   if (bs == 1) {
     if (filter == 1 && w <= 28) return true;
@@ -240,13 +242,42 @@ bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int str
 }
 
 
+bool xnnpack_use_convolution2d(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const IntArrayRef padding,
+    const IntArrayRef stride,
+    const IntArrayRef dilation,
+    const int64_t groups,
+    const bool transposed) {
+  return xnnpack::use_convolution2d(input, weight, bias_sizes_opt, padding, stride, dilation, groups, transposed);
+}
+
+bool xnnpack_use_convolution2d(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalSymIntArrayRef bias_sizes_opt,
+    const SymIntArrayRef padding,
+    const IntArrayRef stride,
+    const IntArrayRef dilation,
+    const int64_t groups,
+    const bool transposed) {
+  // Never use xnnpack for symbolic tracing
+  return false;
+}
+
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+// This struct is templated so that we can run backend selection in a dynamic
+// shapes context; all of the real kernel selection in eager mode runs with
+// int64_t
+template <typename T>
 struct ConvParams {
   std::vector<int64_t> stride;
-  std::vector<int64_t> padding;
+  std::vector<T> padding;
   std::vector<int64_t> dilation;
   bool transposed;
-  std::vector<int64_t> output_padding;
+  std::vector<T> output_padding;
   int groups;
   bool benchmark;
   bool deterministic;
@@ -322,12 +353,12 @@ struct ConvParams {
 #if defined(__ARM_NEON__)
     // Currently only 3x3 depthwise convolutions on tensors of float are supported.
     return (input.ndimension() == 4) &&
-           (input.size(1) == groups) &&
+           (at::symint::size<T>(input, 1) == groups) &&
            (weight.ndimension() == 4 ) &&
-           (weight.size(0) % input.size(1) == 0) &&
-           (weight.size(1) == 1) &&
-           (weight.size(2) == 3) &&
-           (weight.size(3) == 3) &&
+           (at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0) &&
+           (at::symint::size<T>(weight, 1) == 1) &&
+           (at::symint::size<T>(weight, 2) == 3) &&
+           (at::symint::size<T>(weight, 3) == 3) &&
            (input.device().is_cpu()) &&
            (input.scalar_type() == at::kFloat) &&
            input.is_contiguous() &&
@@ -345,23 +376,23 @@ struct ConvParams {
 
   bool needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const {
     constexpr int64_t int_max = std::numeric_limits<int>::max();
-    int64_t numel_input = input.numel();
+    auto numel_input = at::symint::numel<T>(input);
     // empty input
     if (numel_input == 0) {
       return false;
     }
     // input size can not be reduced to the range of int by splitting the batch dim
-    int64_t n = input.size(0);
+    auto n = at::symint::size<T>(input, 0);
     if (numel_input / n > int_max) {
       return true;
     }
     // output size can not be reduced to the range of int by splitting the batch dim
-    int64_t outsize = 1;
+    T outsize = 1;
     if (transposed) {
-      std::vector<int64_t> o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups);
+      auto o = conv_input_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, output_padding, stride, dilation, groups);
       outsize = c10::multiply_integers(o.begin() + 1, o.end());
     } else {
-      std::vector<int64_t> o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation);
+      auto o = conv_output_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, stride, dilation);
       outsize = c10::multiply_integers(o.begin() + 1, o.end());
     }
     return outsize > int_max;
@@ -417,10 +448,10 @@ struct ConvParams {
                              is_depthwise(input, weight) &&
                              input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
                              !is_dilated() && // no dilation supported
-                             (stride[0] == stride[1] || input.size(2) == 1) && // square or 1d
-                             input.size(1) >= 32); // min 32 channels supported)
+                             (stride[0] == stride[1] || at::symint::size<T>(input, 2) == 1) && // square or 1d
+                             at::symint::size<T>(input, 1) >= 32); // min 32 channels supported)
         if (kernel_cond) {
-          return check_cudnn_depthwise_workload_with_filter(input, stride[1], weight);
+          return check_cudnn_depthwise_workload_with_filter<T>(input, stride[1], weight);
         }
       }
       // keep (7600 <= cudnn < 8200) code unchanged
@@ -430,14 +461,14 @@ struct ConvParams {
                            weight.scalar_type() == kHalf &&
                            is_depthwise(input, weight) &&
                            input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
-                           weight.size(2) == weight.size(3) && // only square kernels
-                           input.size(2) >= 7 && // min width/height 7
+                           at::symint::size<T>(weight, 2) == at::symint::size<T>(weight, 3) && // only square kernels
+                           at::symint::size<T>(input, 2) >= 7 && // min width/height 7
                            !is_dilated() && // no dilation supported
                            stride[0] == stride[1] && // equal strides
-                           ((weight.size(3) == 3) || (weight.size(3) == 1)) &&
-                           input.size(1) >= 32); // min 32 channels supported)
+                           ((at::symint::size<T>(weight, 3) == 3) || (at::symint::size<T>(weight, 3) == 1)) &&
+                           at::symint::size<T>(input, 1) >= 32); // min 32 channels supported)
       if (kernel_cond) {
-        return check_cudnn_depthwise_workload(input, stride[0]);
+        return check_cudnn_depthwise_workload<T>(input, stride[0]);
       } else {
         return false;
       }
@@ -473,12 +504,12 @@ struct ConvParams {
        !transposed && // or transposed tensors
        // For 1x1 filters, MKLDNN is faster than THNN when multi-threaded,
        // but THNN is faster when single-threaded.
-       (is_strided() || is_dilated() || input.size(0) >= 16 ||
-        weight.size(-1) != 1 || weight.size(-2) != 1 || at::get_num_threads() > 1) &&
+       (is_strided() || is_dilated() || at::symint::size<T>(input, 0) >= 16 ||
+        at::symint::size<T>(weight, -1) != 1 || at::symint::size<T>(weight, -2) != 1 || at::get_num_threads() > 1) &&
        (groups > 1
-        || (weight.size(-1) > 3 && weight.size(-2) > 3)
-        || input.size(0) > 1
-        || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480) // for some case, native is faster
+        || (at::symint::size<T>(weight, -1) > 3 && at::symint::size<T>(weight, -2) > 3)
+        || at::symint::size<T>(input, 0) > 1
+        || at::symint::size<T>(input, 0)*at::symint::size<T>(input, 1)*at::symint::size<T>(input, 2)*at::symint::size<T>(input, 3) > 20480) // for some case, native is faster
         );
 
 #endif
@@ -493,20 +524,23 @@ struct ConvParams {
            !transposed &&   // or transposed tensors
            input.ndimension() == 4 && // must be in NCHW format
            weight.ndimension() == 4 &&
-           (weight.size(2) < 17) && (weight.size(3) < 17) // NNPACK only supports kernels up to 16x16
+           (at::symint::size<T>(weight, 2) < 17) && (at::symint::size<T>(weight, 3) < 17) // NNPACK only supports kernels up to 16x16
 #if !defined(C10_MOBILE)
-           && input.size(0) >= 16 // ensure large enough batch size to ensure perf, tuneable
+           && at::symint::size<T>(input, 0) >= 16 // ensure large enough batch size to ensure perf, tuneable
 #endif
        ;
 #endif
     return false;
   }
   bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight,
-                   const at::OptionalIntArrayRef bias_sizes_opt) const {
+                   const at::OptionalArrayRef<T> bias_sizes_opt) const {
 #if defined(C10_MOBILE)
     if (!transposed) {
-      return (input.size(1) == groups) &&
-              xnnpack::use_convolution2d(
+      // NB: for the call here, it MATTERS that we are templated. If you
+      // untemplate this to always use SymInt, the function
+      // xnnpack_use_convolution2d will always return false
+      return (at::symint::size<T>(input, 1) == groups) &&
+              xnnpack_use_convolution2d(
                   input,
                   weight,
                   bias_sizes_opt,
@@ -543,33 +577,12 @@ struct ConvParams {
     return input.is_cuda() &&
            !transposed &&
            (input.ndimension() == 4 || input.ndimension() == 5) &&
-           input.size(1) == groups &&
+           at::symint::size<T>(input, 1) == groups &&
            groups > 1 && // no point if there is only a single group
-           weight.size(0) % input.size(1) == 0; // output channels must be a multiple of input channels
+           at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0; // output channels must be a multiple of input channels
   }
 };
 
-// Function to select the convolution backend based on the inputs and params.
-// This overload is used within the convolution internals but not exposed to python.
-// NB: The forward pass provides a bias tensor while the backward pass provides
-// a bool indicating whether the bias is defined. This is done to save memory by
-// avoiding saving the full bias tensor for backward.
-ConvBackend _select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const c10::optional<Tensor>& bias_opt,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params);
-
-// For BC reasons, have a copy that does not require bias_opt
-ConvBackend select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params);
-
 DEFINE_DISPATCH(conv_depthwise2d_backward_stub);
 DEFINE_DISPATCH(conv_depthwise3d_backward_stub);
 DEFINE_DISPATCH(cudnn_convolution_backward_stub);
@@ -591,13 +604,14 @@ REGISTER_NO_CPU_DISPATCH(miopen_convolution_backward_stub);
 REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub);
 REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub);
 
-std::ostream& operator<<(std::ostream & out, const ConvParams& params) {
+template <typename T>
+std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params) {
   out << "ConvParams {"
       << "  stride = " << IntArrayRef{params.stride}
-      << "  padding = " << IntArrayRef{params.padding}
+      << "  padding = " << ArrayRef<T>{params.padding}
       << "  dilation = " << IntArrayRef{params.dilation}
       << "  transposed = " << params.transposed
-      << "  output_padding = " << IntArrayRef{params.output_padding}
+      << "  output_padding = " << ArrayRef<T>{params.output_padding}
       << "  groups = " << params.groups
       << "  benchmark = " << params.benchmark
       << "  deterministic = " << params.deterministic
@@ -607,9 +621,10 @@ std::ostream& operator<<(std::ostream & out, const ConvParams& params) {
   return out;
 }
 
+template <typename T>
 static void check_shape_forward(const at::Tensor& input,
-                                const c10::IntArrayRef& weight_sizes, const at::Tensor& bias,
-                                const ConvParams& params) {
+                                const c10::ArrayRef<T>& weight_sizes, const at::Tensor& bias,
+                                const ConvParams<T>& params) {
   int64_t k = input.ndimension();
   int64_t weight_dim = weight_sizes.size();
   int64_t groups = params.groups;
@@ -624,7 +639,7 @@ static void check_shape_forward(const at::Tensor& input,
   TORCH_CHECK(weight_dim == k,
            "Expected ", weight_dim, "-dimensional input for ", weight_dim,
            "-dimensional weight ", weight_sizes, ", but got ", k, "-dimensional input of size ",
-           input.sizes(), " instead");
+           at::symint::sizes<T>(input), " instead");
   TORCH_CHECK(weight_sizes[0] >= groups,
            "Given groups=", groups, ", expected weight to be at least ", groups,
            " at dimension 0, but got weight of size ", weight_sizes, " instead");
@@ -634,23 +649,23 @@ static void check_shape_forward(const at::Tensor& input,
            "] instead");
 
   if (!transposed) {
-    std::vector<int64_t> input_shape;
-    std::vector<int64_t> kernel_shape;
+    std::vector<T> input_shape;
+    std::vector<T> kernel_shape;
     bool kernel_size_correct = true;
 
-    TORCH_CHECK(input.size(1) == (weight_sizes[1] * groups),
+    TORCH_CHECK(at::symint::size<T>(input, 1) == (weight_sizes[1] * groups),
                 "Given groups=", groups, ", weight of size ", weight_sizes,
                 ", expected input", input.sizes(), " to have ",
-                (weight_sizes[1] * groups), " channels, but got ", input.size(1),
+                (weight_sizes[1] * groups), " channels, but got ", at::symint::size<T>(input, 1),
                 " channels instead");
 
-    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]),
+    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && at::symint::size<T>(bias, 0) == weight_sizes[0]),
              "Given weight of size ", weight_sizes,
              ", expected bias to be 1-dimensional with ", weight_sizes[0], " elements",
-             ", but got bias of size ", bias.sizes(), " instead");
+             ", but got bias of size ", at::symint::sizes<T>(bias), " instead");
 
     for (const auto i : c10::irange(2, k)) {
-      input_shape.push_back(input.size(i) + 2 * padding[i-2]);
+      input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
       // log new kernel size considering dilation
       kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
       if (input_shape.back() < kernel_shape.back()) {
@@ -676,22 +691,23 @@ static void check_shape_forward(const at::Tensor& input,
                "Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
     }
   } else { // transposed
-    TORCH_CHECK(input.size(1) == weight_sizes[0],
+    TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
              "Given transposed=", transposed, ", weight of size ", weight_sizes,
              ", expected input", input.sizes(), " to have ", weight_sizes[0],
-             " channels, but got ", input.size(1), " channels instead");
-    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[1] * groups),
+             " channels, but got ", at::symint::size<T>(input, 1), " channels instead");
+    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && at::symint::size<T>(bias, 0) == weight_sizes[1] * groups),
              "Given transposed=", transposed, ", weight of size ", weight_sizes,
              ", expected bias to be 1-dimensional with ", weight_sizes[1] * groups, " elements",
              ", but got bias of size ", bias.sizes(), " instead");
   }
 }
 
+template <typename T>
 static void check_shape_backward(
     const at::Tensor& input,
-    const c10::IntArrayRef& weight_sizes,
-    const ConvParams& params) {
-  check_shape_forward(input, weight_sizes, /*bias=*/ Tensor(), params);
+    const c10::ArrayRef<T>& weight_sizes,
+    const ConvParams<T>& params) {
+  check_shape_forward<T>(input, weight_sizes, /*bias=*/ Tensor(), params);
 }
 
 // Given an input tensor and an expected number of spatial dimensions, checks that the
@@ -1149,71 +1165,25 @@ at::Tensor convolution_overrideable(
   TORCH_CHECK_NOT_IMPLEMENTED(false, "convolution_overrideable not implemented. You are likely triggering this with tensor backend other than CPU/CUDA/MKLDNN, if this is intended, please use TORCH_LIBRARY_IMPL to override this function ");
 }
 
-// Selects a backend for convolution based on the inputs and params.
-ConvBackend select_conv_backend(
-    const Tensor& input_r, const Tensor& weight_r, const c10::optional<Tensor>& bias_opt,
-    IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_,
-    bool transposed_, IntArrayRef output_padding_, int64_t groups_, const at::OptionalIntArrayRef bias_sizes_opt) {
-  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
-  const Tensor& bias = *bias_maybe_owned;
-
-  auto& ctx = at::globalContext();
-  auto k = weight_r.ndimension();
-  int64_t dim = k - 2;
-  ConvParams params;
-  params.stride = expand_param_if_needed(stride_, "stride", dim);
-  params.padding = expand_param_if_needed(padding_, "padding", dim);
-  params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
-  params.transposed = transposed_;
-  params.output_padding = expand_param_if_needed(output_padding_, "output_padding", dim);
-  params.groups = groups_;
-  params.benchmark = ctx.benchmarkCuDNN();
-  params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
-  params.cudnn_enabled = ctx.userEnabledCuDNN();
-  params.allow_tf32 = ctx.allowTF32CuDNN();
-
-  auto input = input_r;
-  auto weight = weight_r;
-  check_shape_forward(input, weight.sizes(), bias, params);
-
-  // Expand 1d -> 2d.
-  // This is only done for backends that don't natively support 1d spatial input.
-  if (k == 3 && !input.is_mkldnn() && !input.is_xpu()) {
-    // avoid accidentally going through NHWC for permuted 3d input.
-    input = input.contiguous();
-    params.view1d_as_2d();
-    input = view4d(input);
-    weight = view4d(weight);
-  }
-
-  auto bias_sizes = bias.defined() ? c10::optional<IntArrayRef>(bias.sizes()) : bias_sizes_opt;
-  bool need_backward = GradMode::is_enabled() &&
-      (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
-  return _select_conv_backend(input, weight, bias, bias_sizes, need_backward, params);
-}
-
-ConvBackend select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params) {
-  return _select_conv_backend(input, weight, {}, bias_sizes_opt, need_backward, params);
-}
-
+// Function to select the convolution backend based on the inputs and params.
+// This overload is used within the convolution internals but not exposed to python.
+// NB: The forward pass provides a bias tensor while the backward pass provides
+// a bool indicating whether the bias is defined. This is done to save memory by
+// avoiding saving the full bias tensor for backward.
+template <typename T>
 ConvBackend _select_conv_backend(
     const Tensor& input,
     const Tensor& weight,
     const c10::optional<Tensor>& bias,
-    const at::OptionalIntArrayRef bias_sizes_opt,
+    const at::OptionalArrayRef<T> bias_sizes_opt,
     const bool need_backward,
-    const ConvParams& params) {
+    const ConvParams<T>& params) {
 
   // don't send empty inputs through backends
-  if (input.size(0) == 0 || input.size(1) == 0) {
+  if (at::symint::size<T>(input, 0) == 0 || at::symint::size<T>(input, 1) == 0) {
     return input.is_mkldnn() ? ConvBackend::MkldnnEmpty : ConvBackend::Empty;
-  } else if (input.numel() == 0) {
-    TORCH_CHECK(false, "Only zero batch or zero channel inputs are supported, but got input shape: ", input.sizes());
+  } else if (at::symint::numel<T>(input) == 0) {
+    TORCH_CHECK(false, "Only zero batch or zero channel inputs are supported, but got input shape: ", at::symint::sizes<T>(input));
   }
 
   if (params.is_depthwise(input, weight)) {
@@ -1305,12 +1275,65 @@ ConvBackend _select_conv_backend(
   AT_ERROR("unsupported ConvNd parameters");
 }
 
+// Selects a backend for convolution based on the inputs and params.
+ConvBackend select_conv_backend(
+    const Tensor& input_r, const Tensor& weight_r, const c10::optional<Tensor>& bias_opt,
+    IntArrayRef stride_, SymIntArrayRef padding_, IntArrayRef dilation_,
+    bool transposed_, SymIntArrayRef output_padding_, int64_t groups_, const at::OptionalSymIntArrayRef bias_sizes_opt) {
+  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+
+  auto& ctx = at::globalContext();
+  auto k = weight_r.ndimension();
+  int64_t dim = k - 2;
+  ConvParams<c10::SymInt> params;
+  params.stride = expand_param_if_needed(stride_, "stride", dim);
+  params.padding = expand_param_if_needed(padding_, "padding", dim);
+  params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
+  params.transposed = transposed_;
+  params.output_padding = expand_param_if_needed(output_padding_, "output_padding", dim);
+  params.groups = groups_;
+  params.benchmark = ctx.benchmarkCuDNN();
+  params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
+  params.cudnn_enabled = ctx.userEnabledCuDNN();
+  params.allow_tf32 = ctx.allowTF32CuDNN();
+
+  auto input = input_r;
+  auto weight = weight_r;
+  check_shape_forward(input, weight.sym_sizes(), bias, params);
+
+  // Expand 1d -> 2d.
+  // This is only done for backends that don't natively support 1d spatial input.
+  if (k == 3 && !input.is_mkldnn() && !input.is_xpu()) {
+    // avoid accidentally going through NHWC for permuted 3d input.
+    input = input.contiguous();
+    params.view1d_as_2d();
+    input = view4d(input);
+    weight = view4d(weight);
+  }
+
+  auto bias_sizes = bias.defined() ? c10::optional<SymIntArrayRef>(bias.sym_sizes()) : bias_sizes_opt;
+  bool need_backward = GradMode::is_enabled() &&
+      (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
+  return _select_conv_backend(input, weight, bias, bias_sizes, need_backward, params);
+}
+
+// For BC reasons, have a copy that does not require bias_opt
+ConvBackend select_conv_backend(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const bool need_backward,
+    const ConvParams<int64_t>& params) {
+  return _select_conv_backend(input, weight, {}, bias_sizes_opt, need_backward, params);
+}
+
 at::Tensor _convolution_nogroup_backend(
     const Tensor& input,
     const Tensor& weight,
     const Tensor& bias,
     const ConvBackend backend,
-    const ConvParams& params) {
+    const ConvParams<int64_t>& params) {
   auto kernel_size = weight.sizes().slice(2);
   switch(backend) {
     case ConvBackend::NnpackSpatial:
@@ -1341,7 +1364,7 @@ at::Tensor _convolution_nogroup_backend(
 static inline std::vector<int64_t> calc_output_size(
     const Tensor& input,
     const Tensor& weight,
-    const ConvParams& params) {
+    const ConvParams<int64_t>& params) {
   std::vector<int64_t> output_size = params.transposed ?
     conv_input_size(input.sizes(), weight.sizes(), params.padding, params.output_padding,
         params.stride, params.dilation, params.groups) :
@@ -1422,7 +1445,7 @@ at::Tensor _convolution(
   TORCH_CHECK(dim > 0, "weight should have at least three dimensions");
   TORCH_CHECK(groups_ > 0, "non-positive groups is not supported");
 
-  ConvParams params;
+  ConvParams<int64_t> params;
   params.stride = expand_param_if_needed(stride_, "stride", dim);
   params.padding = expand_param_if_needed(padding_, "padding", dim);
   params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
@@ -1450,7 +1473,7 @@ at::Tensor _convolution(
   auto bias_sizes_opt = bias.defined() ? c10::optional<IntArrayRef>(bias.sizes()) : c10::nullopt;
   bool need_backward = GradMode::is_enabled() &&
       (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
-  ConvBackend backend = _select_conv_backend(input, weight, bias, bias_sizes_opt, need_backward, params);
+  ConvBackend backend = _select_conv_backend(input, weight, bias, c10::OptionalIntArrayRef(bias_sizes_opt), need_backward, params);
   at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight, backend);
 
   // Call the backend.
@@ -1663,7 +1686,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
   auto weight = weight_r;
 
   int64_t dim = weight.ndimension() - 2;
-  ConvParams params;
+  ConvParams<int64_t> params;
   params.stride = expand_param_if_needed(stride_, "stride", dim);
   params.padding = expand_param_if_needed(padding_, "padding", dim);
   params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
@@ -1726,7 +1749,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
   if (ggI.defined()) {
 
     // Modified params with correct padding
-    ConvParams gw_conv_params(params);
+    ConvParams<int64_t> gw_conv_params(params);
 
     // Disable groups as they are handled separately
     auto groups = gw_conv_params.groups;
@@ -1795,7 +1818,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
   Tensor gI;
   if (input.numel() != 0) {
     if (ggW.defined()) {
-      ConvParams gi_conv_params(params);
+      ConvParams<int64_t> gi_conv_params(params);
       gi_conv_params.transposed = !params.transposed;
 
       if (params.transposed) {
@@ -1851,7 +1874,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _convolution_backward_nogroup_bac
     const Tensor& weight,
     const std::array<bool, 3> output_mask,
     const ConvBackend backend,
-    const ConvParams& params) {
+    const ConvParams<int64_t>& params) {
   auto kernel_size = weight.sizes().slice(2);
   switch(backend) {
     case ConvBackend::Slow2d:
@@ -1916,7 +1939,7 @@ std::tuple<Tensor, Tensor, Tensor> convolution_backward(
   TORCH_CHECK(dim > 0, "weight should have at least three dimensions");
 
   auto& ctx = at::globalContext();
-  ConvParams params;
+  ConvParams<int64_t> params;
   params.stride = expand_param_if_needed(stride, "stride", dim);
   params.padding = expand_param_if_needed(padding, "padding", dim);
   params.dilation = expand_param_if_needed(dilation, "dilation", dim);
diff --git a/aten/src/ATen/native/utils/ParamUtils.h b/aten/src/ATen/native/utils/ParamUtils.h
index 376467ff79cf5..7c89a3316cb4b 100644
--- a/aten/src/ATen/native/utils/ParamUtils.h
+++ b/aten/src/ATen/native/utils/ParamUtils.h
@@ -6,12 +6,13 @@
 namespace at {
 namespace native {
 
-inline std::vector<int64_t> expand_param_if_needed(
-    IntArrayRef list_param,
+template <typename T>
+inline std::vector<T> expand_param_if_needed(
+    ArrayRef<T> list_param,
     const char* param_name,
     int64_t expected_dim) {
   if (list_param.size() == 1) {
-    return std::vector<int64_t>(expected_dim, list_param[0]);
+    return std::vector<T>(expected_dim, list_param[0]);
   } else if ((int64_t)list_param.size() != expected_dim) {
     std::ostringstream ss;
     ss << "expected " << param_name << " to be a single integer value or a "
diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h
index 9ab72a0776804..6355f13395053 100644
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@@ -235,6 +235,19 @@ inline c10::SymInt multiply_integers(const C& container) {
       [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
 }
 
+template <
+    typename Iter,
+    typename = std::enable_if_t<std::is_same<
+        typename std::iterator_traits<Iter>::value_type,
+        c10::SymInt>::value>>
+inline c10::SymInt multiply_integers(Iter begin, Iter end) {
+  return std::accumulate(
+      begin,
+      end,
+      c10::SymInt(1),
+      [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
+}
+
 inline SymInt operator+(int64_t a, const SymInt& b) {
   return c10::SymInt(a) + b;
 }
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index b8693a484ed9d..6073736257249 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -1408,10 +1408,10 @@ Call this whenever a new thread is created in order to propagate values from
          const at::Tensor& weight,
          const c10::optional<at::Tensor>& bias_opt,
          at::IntArrayRef stride_,
-         at::IntArrayRef padding_,
+         at::SymIntArrayRef padding_,
          at::IntArrayRef dilation_,
          bool transposed_,
-         at::IntArrayRef output_padding_,
+         at::SymIntArrayRef output_padding_,
          int64_t groups_) {
         return at::native::select_conv_backend(
             input,
@@ -1442,13 +1442,13 @@ Call this whenever a new thread is created in order to propagate values from
          const at::Tensor& weight,
          const c10::optional<at::Tensor>& bias,
          at::IntArrayRef stride_,
-         at::IntArrayRef padding_,
+         at::SymIntArrayRef padding_,
          at::IntArrayRef dilation_,
          bool transposed_,
-         at::IntArrayRef output_padding_,
+         at::SymIntArrayRef output_padding_,
          int64_t groups_,
-         c10::optional<std::vector<int64_t>> bias_sizes_opt) {
-        c10::OptionalArrayRef<int64_t> ref = c10::nullopt;
+         c10::optional<std::vector<c10::SymInt>> bias_sizes_opt) {
+        c10::OptionalArrayRef<c10::SymInt> ref = c10::nullopt;
         if (bias_sizes_opt) {
           ref = (*bias_sizes_opt);
         }

From e63bb38af2000d4652758a7104793ab09afd03a9 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 16 Nov 2022 05:58:02 -0800
Subject: [PATCH 0961/1922] Towards unifying symbolic and non symbolic fake
 tensor (#89038)

Fake tensor behaves pretty differently depending on if you have
symbolic shapes or not.  This leads to bugs; for example, we
weren't getting correct convolution_backward strides because we
bypassed the correct stride logic in fake tensor on symbolic
shapes.

This PR attempts to unify the two codepaths.  I don't manage to
unify everything, but I get most of it.  The algorithm is delicate
and I'm still hosing down test failures.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89038
Approved by: https://github.com/anjali411
---
 aten/src/ATen/native/TensorFactories.cpp |  6 ---
 test/functorch/test_aotdispatch.py       |  1 -
 test/test_proxy_tensor.py                | 21 +++------
 torch/_meta_registrations.py             | 44 +++++++++++++++---
 torch/_ops.py                            |  1 +
 torch/_prims/__init__.py                 |  5 +-
 torch/_prims_common/__init__.py          |  3 ++
 torch/_subclasses/fake_tensor.py         | 58 +++++++++---------------
 8 files changed, 71 insertions(+), 68 deletions(-)

diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 9d1c6d8a36333..7245cb77b1c50 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -325,12 +325,6 @@ Tensor empty_like(
   // See [Note: hacky wrapper removal for TensorOptions]
   TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory);
 
-
-  TORCH_CHECK(
-    !(options_.has_memory_format() && optional_memory_format.has_value()),
-    "Cannot set memory_format both in TensorOptions and explicit argument; please delete "
-    "the redundant setter.");
-
   TensorOptions options =
       self.options()
           .merge_in(options_)
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 1dc5476158f96..ae216f9be4a49 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1011,7 +1011,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.zeros_like.default - couldn't find symbolic meta function/decomposition
     xfail('digamma', ''),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 8dc42be7fdfb2..0a24807af55f0 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1151,9 +1151,7 @@ def f(a, b, c, d, e):
     xfail('cummin', ''),  # aten.cummin.default - couldn't find symbolic meta function/decomposition
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
-    xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
     xfail('dsplit', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
     xfail('fft.fft2', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('fft.fft', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
@@ -1235,8 +1233,6 @@ def f(a, b, c, d, e):
     xfail('lu', ''),  # aten.linalg_lu_factor_ex.default - couldn't find symbolic meta function/decomposition
     xfail('lu_solve', ''),  # aten.linalg_lu_solve.default - couldn't find symbolic meta function/decomposition
     xfail('lu_unpack', ''),  # aten.lu_unpack.default - couldn't find symbolic meta function/decomposition
-    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
-    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decomposition
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decomposition
     xfail('median', ''),  # Could not run 'aten::median' with arguments from the 'Meta' backend. This could be becau...
@@ -1281,7 +1277,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.pdist', ''),  # Could not run 'aten::_pdist_forward' with arguments from the 'Meta' backend...
     xfail('nn.functional.pixel_shuffle', ''),  # aten.pixel_shuffle.default - couldn't find symbolic meta function/decompos...
     xfail('nn.functional.pixel_unshuffle', ''),  # aten.pixel_unshuffle.default - couldn't find symbolic meta function/deco...
-    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.unfold', ''),  # aten.im2col.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
@@ -1298,7 +1293,6 @@ def f(a, b, c, d, e):
     xfail('polygamma', 'polygamma_n_2'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_3'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_4'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
-    xfail('put', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('quantile', ''),  # Could not run 'aten::equal' with arguments from the 'Meta' backend.
     xfail('qr', ''),  # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition
     xfail('rad2deg', ''),  # aten.rad2deg.default - couldn't find symbolic meta function/decomposition
@@ -1347,11 +1341,15 @@ def f(a, b, c, d, e):
 
 symbolic_tensor_failures.update(symbolic_tensor_segfaults)
 
+outplace_symbolic_tensor_failures = {
+    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
+    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
+    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
+}
+
 inplace_symbolic_tensor_failures = {
-    xfail('abs', ''),  # aten.abs_.default - couldn't find symbolic meta function/decomposition
     xfail('acos', ''),  # aten.acos_.default - couldn't find symbolic meta function/decomposition
     xfail('acosh', ''),  # aten.acosh_.default - couldn't find symbolic meta function/decomposition
-    xfail('addbmm', ''),  # aten.addbmm_.default - couldn't find symbolic meta function/decomposition
     xfail('addcdiv', ''),  # aten.addcdiv_.default - couldn't find symbolic meta function/decomposition
     xfail('addcmul', ''),  # aten.addcmul_.default - couldn't find symbolic meta function/decomposition
     xfail('addmm', ''),  # aten.addmm_.default - couldn't find symbolic meta function/decomposition
@@ -1365,7 +1363,6 @@ def f(a, b, c, d, e):
     xfail('clamp', ''),  # aten.clamp_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('clamp_max', ''),  # aten.clamp_max_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('clamp_min', ''),  # aten.clamp_min_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('conj_physical', ''),  # aten.conj_physical_.default - couldn't find symbolic meta function/decomposition
     xfail('copysign', ''),  # aten.copysign_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('cos', ''),  # aten.cos_.default - couldn't find symbolic meta function/decomposition
     xfail('cosh', ''),  # aten.cosh_.default - couldn't find symbolic meta function/decomposition
@@ -1382,7 +1379,6 @@ def f(a, b, c, d, e):
     xfail('expm1', ''),  # aten.expm1_.default - couldn't find symbolic meta function/decomposition
     xfail('float_power', ''),  # the base given to float_power_ has dtype Float but the operation's result requires dtype Double
     xfail('floor', ''),  # aten.floor_.default - couldn't find symbolic meta function/decomposition
-    xfail('floor_divide', ''),  # aten.floor_divide_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('fmod', ''),  # aten.fmod_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('frac', ''),  # aten.frac_.default - couldn't find symbolic meta function/decomposition
     xfail('ge', ''),  # aten.ge_.Tensor - couldn't find symbolic meta function/decomposition
@@ -1398,7 +1394,6 @@ def f(a, b, c, d, e):
     xfail('log1p', ''),  # aten.log1p_.default - couldn't find symbolic meta function/decomposition
     xfail('log2', ''),  # aten.log2_.default - couldn't find symbolic meta function/decomposition
     xfail('log', ''),  # aten.log_.default - couldn't find symbolic meta function/decomposition
-    xfail('logit', ''),  # aten.logit_.default - couldn't find symbolic meta function/decomposition
     xfail('lt', ''),  # aten.lt_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('mvlgamma', 'mvlgamma_p_1'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
     xfail('mvlgamma', 'mvlgamma_p_3'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
@@ -1408,7 +1403,6 @@ def f(a, b, c, d, e):
     xfail('neg', ''),  # aten.neg_.default - couldn't find symbolic meta function/decomposition
     xfail('nextafter', ''),  # aten.nextafter_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.celu', ''),  # aten.celu_.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.dropout3d', ''),  # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.elu', ''),  # aten.elu_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.hardsigmoid', ''),  # aten.hardsigmoid_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.mish', ''),  # aten.mish_.default - couldn't find symbolic meta function/decomposition
@@ -1426,7 +1420,6 @@ def f(a, b, c, d, e):
     xfail('sinh', ''),  # aten.sinh_.default - couldn't find symbolic meta function/decomposition
     xfail('sqrt', ''),  # aten.sqrt_.default - couldn't find symbolic meta function/decomposition
     xfail('square', ''),  # aten.pow_.Scalar - couldn't find symbolic meta function/decomposition
-    xfail('squeeze', ''),  # aten.squeeze_.default - couldn't find symbolic meta function/decomposition
     xfail('t', ''),  # aten.t_.default - couldn't find symbolic meta function/decomposition
     xfail('tan', ''),  # aten.tan_.default - couldn't find symbolic meta function/decomposition
     xfail('tanh', ''),  # aten.tanh_.default - couldn't find symbolic meta function/decomposition
@@ -1516,7 +1509,7 @@ def test_make_fx_fake_exhaustive(self, device, dtype, op):
     @skipIfNoSympy
     @ops(op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive',
-             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures)
+             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures | outplace_symbolic_tensor_failures)
     def test_make_fx_symbolic_exhaustive(self, device, dtype, op):
         _test_make_fx_helper(self, device, dtype, op, "symbolic")
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 4fa3ab09d2755..abcd1ead8b433 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1513,7 +1513,6 @@ def full(size, fill_value, *args, **kwargs):
         aten.randn_like.default,
         aten.rand_like.default,
         aten.full_like.default,
-        aten.zeros_like.default,
         aten.ones_like.default,
     ]
 )
@@ -1521,6 +1520,44 @@ def meta_like(self, *args, **kwargs):
     return aten.empty_like.default(self, **kwargs)
 
 
+# zeros_like is special cased to work for sparse
+@register_meta(aten.zeros_like.default)
+def zeros_like(
+    self, dtype=None, layout=None, device=None, pin_memory=None, memory_format=None
+):
+    if layout == torch.sparse_coo:
+        check(
+            memory_format is None,
+            lambda: "memory format option is only supported by strided tensors",
+        )
+
+        res = torch.empty(
+            0,
+            dtype=self.dtype if dtype is None else dtype,
+            layout=layout,
+            device=self.device if device is None else device,
+            pin_memory=pin_memory,
+        )
+
+        if self.is_sparse:
+            res.sparse_resize_and_clear_(
+                self.size(), self.sparse_dim(), self.dense_dim()
+            )
+        else:
+            res.sparse_resize_and_clear_(self.size(), self.dim(), 0)
+
+        res._coalesced_(True)
+        return res
+    return aten.empty_like.default(
+        self,
+        dtype=dtype,
+        layout=layout,
+        device=device,
+        pin_memory=pin_memory,
+        memory_format=memory_format,
+    )
+
+
 # hacky: Please remove after math.ceil works with arange
 @register_meta(aten.arange.default)
 def arange(end, **kwargs):
@@ -1894,11 +1931,6 @@ def activate_meta():
             # Instead, we should be letting those decompositions run, and writing meta kernels
             # only for the base operators.
             pass
-        elif op_overload.is_view:
-            # Attempting to register a python meta kernel for a view operator.
-            # We shouldn't do this, because the output will report as not having aliased storages.
-            # All view ops have meta kernels in C++ today, so we should use those instead.
-            pass
         elif op_overload.name() in {
             "aten::empty_strided",  # causing infinite recursion, test_meta.py
             "aten::clone",  # causing infinite recursion
diff --git a/torch/_ops.py b/torch/_ops.py
index 9163932144d0d..b20398a7f3ab3 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -365,6 +365,7 @@ def handler(*args, **kwargs):
             return handler
 
         final_key = resolve_key(self, key)
+        # print(self, key, final_key)
         r = self.py_kernels.get(final_key, final_key)
         self._dispatch_cache[key] = r
         return r
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index da8d9af723acf..a4bac68f0ff14 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1150,9 +1150,6 @@ def _minimum_aten(
 
 #
 # View operations
-#
-# TODO: model view relationships
-# TODO: model storage
 def _as_strided_meta(
     a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int
 ) -> TensorLikeType:
@@ -1170,7 +1167,7 @@ def _as_strided_meta(
             a._typed_storage(), size, stride, storage_offset
         )
 
-    return TensorMeta(a, shape=size, strides=stride)
+    return torch.as_strided(a, size, stride, storage_offset)
 
 
 def _as_strided_aten(
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 128796dfa3d07..041448e8102ac 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -291,6 +291,9 @@ def is_non_overlapping_and_dense(a: Tensor) -> bool:
     its dimensions that is contiguous.
     """
 
+    if a.is_sparse:
+        return False
+
     # Short-circuits if the tensor is already contiguous or channels-last contiguous
     if is_contiguous(a) or is_channels_last_contiguous(a):
         return True
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 5d3d3a0e32fe1..9a0ac050e6b94 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1,7 +1,6 @@
 import contextlib
 import functools
 import itertools
-import sys
 import weakref
 from dataclasses import dataclass
 from functools import partial
@@ -297,8 +296,9 @@ def constructors(fake_mode, func, *args, **kwargs):
     out_device = new_kwargs.pop("device", None)
     out_device = out_device if out_device is not None else default_device
     new_kwargs["device"] = torch.device("meta")
-    # Not in_kernel_invocation_manager as no fake tensor inputs
-    with no_dispatch():
+    # _like constructors have fake tensor inputs (maybe this causes the non-like
+    # to fail? hmmm)
+    with in_kernel_invocation_manager(fake_mode):
         r = func(*args, **new_kwargs)
     return FakeTensor(fake_mode, r, out_device)
 
@@ -821,40 +821,30 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # is written to must be invalidated
         self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs)
 
-        from torch._decomp import decomposition_table
-
-        with self:
-            # Decomposes CompositeImplicitAutograd ops
-            r = func.decompose(*args, **kwargs)
-            if r is not NotImplemented:
-                return r
+        # If there's a Python meta, prefer that over the decomposition
+        from torch._decomp import meta_table as meta_table
 
-        # IDK: feels bad man, sym_numel on as_strided infinite loops otherwise
-        if has_symbolic_sizes and not self.cpp_meta_supports_symint(func):
-            from torch._decomp import meta_table as meta_table
+        if func not in meta_table and not self.cpp_meta_supports_symint(func):
+            from torch._decomp import decomposition_table
 
-            if func == aten.size.default:
-                sys.stderr.write(
-                    "Trying to call aten.size on a tensor with symbolic shapes. "
-                    "It's likely that this is from calling tensor.shape in C++"
+            # Prefer Python decompositions over C++ ones
+            if func in decomposition_table and (
+                has_symbolic_sizes
+                or (
+                    # TODO: Remove these exclusions, so that we can remove
+                    # this leg entirely
+                    torch_decomp_decompositions(func)
+                    and all(not e.is_sparse for e in flat_arg_fake_tensors)
                 )
-                # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
-                return None
-
-            with self:
-                if func in meta_table:
-                    r = meta_table[func](*args, **kwargs)
-                    return r
-                if func in decomposition_table:
+            ):
+                with self:
                     return decomposition_table[func](*args, **kwargs)
 
-        if (
-            func in decomposition_table
-            and torch_decomp_decompositions(func)
-            and all(not e.is_sparse for e in flat_arg_fake_tensors)
-        ):
             with self:
-                return decomposition_table[func](*args, **kwargs)
+                # Decomposes CompositeImplicitAutograd ops
+                r = func.decompose(*args, **kwargs)
+                if r is not NotImplemented:
+                    return r
 
         # prims already wrap FakeTensor inputs to FakeTensor outputs
         # and do device logic, we dont need do anything but run them
@@ -865,12 +855,6 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             with self:
                 return func.prim_meta_impl(*args, **kwargs)
 
-        if has_symbolic_sizes:
-            if not self.cpp_meta_supports_symint(func):
-                raise RuntimeError(
-                    f"{func} - couldn't find symbolic meta function/decomposition"
-                )
-
         # special handling for funcs registered through `register_op_impl`,
         # e.g., manipulating args on constructor calls to construct meta tensors
         # and then afterwards wrapping them to a FakeTensor

From f900d5056ec40b47c3641f4d7ea474af8a58d16a Mon Sep 17 00:00:00 2001
From: mindest <linminuser@gmail.com>
Date: Wed, 16 Nov 2022 15:08:41 +0000
Subject: [PATCH 0962/1922] [ONNX] Extra support for bernoulli export (#88655)

* add opset 15 support for `bernoulli`.
* add extra export options for different `bernoulli` cases: `x.bernoulli(p)` where `p` is a tensor or float.

Fixes #88299

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88655
Approved by: https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 17 +++++++++++++++++
 torch/onnx/symbolic_opset15.py             | 16 ++++++++++++++++
 torch/onnx/symbolic_opset9.py              |  9 +++++----
 3 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index e4fc3f83b288d..7ae9d8edaccc6 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -2643,6 +2643,23 @@ def forward(self, x):
         x = torch.empty(2, 3, 3, dtype=torch.double).uniform_(0, 1)
         self.run_test(Bernoulli(), x)
 
+    def test_bernoulli_p(self):
+        class Bernoulli_float(torch.nn.Module):
+            def forward(self, x):
+                return torch.mul(x, torch.bernoulli(x, 0.2).size(0))
+
+        class Bernoulli_tensor(torch.nn.Module):
+            def forward(self, x):
+                return torch.mul(x, torch.rand_like(x).bernoulli_(x).size(0))
+
+        x = torch.rand(3, 3)
+        self.run_test(Bernoulli_float(), x)
+        self.run_test(Bernoulli_tensor(), x)
+
+        x = torch.rand(2, 3, 3, dtype=torch.double)
+        self.run_test(Bernoulli_float(), x)
+        self.run_test(Bernoulli_tensor(), x)
+
     @unittest.skip("Bug in ORT, skip test until rel-1.11.")
     @skipIfUnsupportedMinOpsetVersion(14)
     def test_reshape_allowzero(self):
diff --git a/torch/onnx/symbolic_opset15.py b/torch/onnx/symbolic_opset15.py
index efb96c717fde6..4f316a77f62e6 100644
--- a/torch/onnx/symbolic_opset15.py
+++ b/torch/onnx/symbolic_opset15.py
@@ -54,6 +54,22 @@ def aten__isnot_(g: jit_utils.GraphContext, self, other):
     return aten__is_(g, self, other)
 
 
+@_onnx_symbolic("aten::bernoulli")
+@_beartype.beartype
+def bernoulli(g: jit_utils.GraphContext, input, p=None, generator=None, out=None):
+    if out is not None and not symbolic_helper._is_none(out):
+        symbolic_helper._unimplemented(
+            "Bernoulli", "out parameter is not supported for bernoulli", input
+        )
+    if generator is not None and not symbolic_helper._is_none(generator):
+        symbolic_helper._unimplemented(
+            "Bernoulli", "generator is not supported for bernoulli", input
+        )
+    if p is None or symbolic_helper._is_none(p):
+        return g.op("Bernoulli", input)
+    return opset9.bernoulli(g, input, p, generator, out)
+
+
 @_onnx_symbolic("prim::unchecked_cast")
 @_beartype.beartype
 def prim_unchecked_cast(g: jit_utils.GraphContext, self):
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index c02fb0f200909..9984f602425cd 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -4942,8 +4942,8 @@ def rrelu(g: jit_utils.GraphContext, input, lower, upper, training, generator):
 
 @_onnx_symbolic("aten::bernoulli")
 @_beartype.beartype
-def bernoulli(g: jit_utils.GraphContext, input, generator=None, out=None):
-    if out is not None:
+def bernoulli(g: jit_utils.GraphContext, input, p=None, generator=None, out=None):
+    if out is not None and not symbolic_helper._is_none(out):
         symbolic_helper._unimplemented(
             "Bernoulli", "out parameter is not supported for bernoulli", input
         )
@@ -4960,14 +4960,15 @@ def bernoulli(g: jit_utils.GraphContext, input, generator=None, out=None):
             "Bernoulli", "input dtype not accessible", input
         )
 
-    p = g.op(
+    rands = g.op(
         "RandomUniformLike",
         input,
         high_f=1.0,
         low_f=0.0,
         dtype_i=dtype.onnx_type(),
     )
-    output = g.op("Less", p, input)
+    prob = p if p is not None and not symbolic_helper._is_none(p) else input
+    output = g.op("Less", rands, prob)
     return g.op("Cast", output, to_i=dtype.onnx_type())
 
 
From b5915b7eb15031d641f95d055b6b5953f1132cc3 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Wed, 16 Nov 2022 06:30:03 +0000
Subject: [PATCH 0963/1922] [ONNX] Support custom Op with onnx-script local
 function (#86906)

Extend `register_custom_op` to support onnx-script local function. The FunctionProto from onnx-script is represented by custom op and inserted into ModelProto for op execution.

NOTE: I did experiments on >2GB case of a simple model with large initializers:

```python
import torch

class Net(torch.nn.Module):
    def __init__(self, B, C):
        super().__init__()
        self.layer_norm = torch.nn.LayerNorm((B, C), eps=1e-3)
    def forward(self, x):
        return self.layer_norm(x)

N, B, C = 3, 25000, 25000
model = Net(B, C)
x = torch.randn(N, B, C)

torch.onnx.export(model, x, "large_model.onnx", opset_version=12)
```

And it turns out we won't get model_bytes > 2GB after `_export_onnx` pybind cpp function, as we split initializer in external files in that function, and have serialization before return the model bytes, which protobuf is not allowed to be larger than 2GB at any circumstances.

The test cases can be found in the next PR #86907 .

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86906
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 .jenkins/caffe2/test.sh           |   2 +-
 torch/onnx/_internal/jit_utils.py |  64 ++++++++++-
 torch/onnx/utils.py               | 183 +++++++++++++++++++++++-------
 3 files changed, 205 insertions(+), 44 deletions(-)

diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh
index 2b6f7ec6b246a..42111ea22bdd3 100755
--- a/.jenkins/caffe2/test.sh
+++ b/.jenkins/caffe2/test.sh
@@ -176,7 +176,7 @@ fi
 ##############
 if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
   pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
-  pip install -q --user ninja flatbuffers==2.0 numpy==1.21.5 onnxruntime==1.12.1 beartype==0.10.4
+  pip install -q --user ninja flatbuffers==2.0 numpy==1.21.5 onnxruntime==1.12.1 beartype==0.10.4 onnx==1.12.0
   # numba requires numpy <= 1.20, onnxruntime requires numpy >= 1.21.
   # We don't actually need it for our tests, but it's imported if it's present, so uninstall.
   pip uninstall -q --yes numba
diff --git a/torch/onnx/_internal/jit_utils.py b/torch/onnx/_internal/jit_utils.py
index 6354cea73fc04..a8740a4a2ff68 100644
--- a/torch/onnx/_internal/jit_utils.py
+++ b/torch/onnx/_internal/jit_utils.py
@@ -12,7 +12,8 @@
 from torch import _C
 from torch._C import _onnx as _C_onnx
 from torch.onnx._globals import GLOBALS
-from torch.onnx._internal import _beartype
+from torch.onnx._internal import _beartype, registration
+
 
 _ATTR_PATTERN = re.compile("^(.+)_(([ifstgz])|(ty))$")
 _SKIP_NODE_ATTRIBUTES = {"inplace", "aten"}
@@ -98,6 +99,49 @@ def aten_op(self, operator: str, *args, overload_name: str = "", **kwargs):
             **kwargs,
         )
 
+    @_beartype.beartype
+    def onnxscript_op(
+        self,
+        onnx_fn,  # TODO(titaiwang): annotate this when onnx-script becomes dependency
+        *raw_args: Union[torch.Tensor, _C.Value],
+        outputs: int = 1,
+        **kwargs,
+    ):
+        """Creates an ONNX operator from onnx-script function, taking "raw_args" as inputs and "kwargs" as attributes.
+
+        onnx-script repository: https://github.com/microsoft/onnx-script
+
+        Args:
+            onnx_fn: ONNXFunction from onnx-script; An example can be found at
+                https://github.com/microsoft/onnx-script#example
+            raw_args: The inputs to the operator; usually provided
+                as arguments to the `symbolic` definition.
+            outputs: The number of outputs this operator returns.
+                By default an operator is assumed to return a single output.
+                If `outputs` is greater than one, this functions returns a tuple
+                of output `Value`, representing each output of the ONNX operator
+                in order.
+            kwargs: The attributes of the ONNX operator, whose keys are named
+                according to the following convention: `alpha_f` indicates
+                the `alpha` attribute with type `f`.  The valid type specifiers are
+                `f` (float), `i` (int), `s` (string) or `t` (Tensor).  An attribute
+                specified with type float accepts either a single float, or a
+                list of floats (e.g., you would say `dims_i` for a `dims` attribute
+                that takes a list of integers).
+
+        Returns:
+            The value representing the single output of this operator (see the `outputs`
+            keyword argument for multi-return nodes).
+        """
+        # NOTE(titaiwang): This is using class attributes, and it needs to be updated
+        # if onnx-script makes any change on these.
+        symbolic_name = f"{onnx_fn.opset.domain}::{onnx_fn.opname}"
+        opset_version = onnx_fn.opset.version
+
+        registration.custom_onnx_symbolic(symbolic_name, opset_version)(onnx_fn)
+
+        return _add_op(self, symbolic_name, *raw_args, outputs=outputs, **kwargs)
+
 
 @_beartype.beartype
 def add_op_with_blocks(
@@ -332,3 +376,21 @@ def parse_node_kind(kind: str) -> Tuple[str, str]:
     if "::" in opname:
         raise ValueError(f"Node kind: {kind} is invalid. '::' should only apear once.")
     return domain, opname
+
+
+@_beartype.beartype
+def is_aten(domain: str) -> bool:
+    """Check if the domain is official."""
+    return domain == "aten"
+
+
+@_beartype.beartype
+def is_prim(domain: str) -> bool:
+    """Check if the domain is official."""
+    return domain == "prim"
+
+
+@_beartype.beartype
+def is_onnx(domain: str) -> bool:
+    """Check if the domain is official."""
+    return domain == "onnx"
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index b30b71812aaef..9d6ec0b325232 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -1597,49 +1597,14 @@ def _export(
                     model_file_location,
                     node_attr_to_name,
                 )
+            # insert function_proto into model_proto.
+            proto = _add_onnxscript_fn(
+                proto,
+                custom_opsets,
+            )
             if verbose:
                 torch.onnx.log("Exported graph: ", graph)
-            if export_type == _exporter_states.ExportTypes.PROTOBUF_FILE:
-                assert len(export_map) == 0
-                with torch.serialization._open_file_like(f, "wb") as opened_file:
-                    opened_file.write(proto)
-            elif export_type in [
-                _exporter_states.ExportTypes.ZIP_ARCHIVE,
-                _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE,
-            ]:
-                compression = (
-                    zipfile.ZIP_DEFLATED
-                    if export_type
-                    == _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE
-                    else zipfile.ZIP_STORED
-                )
-                with zipfile.ZipFile(f, "w", compression=compression) as z:
-                    z.writestr(_constants.ONNX_ARCHIVE_MODEL_PROTO_NAME, proto)
-                    for k, v in export_map.items():
-                        z.writestr(k, v)
-            elif export_type == _exporter_states.ExportTypes.DIRECTORY:
-                if os.path.exists(f):
-                    assert os.path.isdir(f)
-                else:
-                    os.makedirs(f)
-
-                model_proto_file = os.path.join(
-                    f, _constants.ONNX_ARCHIVE_MODEL_PROTO_NAME
-                )
-                with torch.serialization._open_file_like(
-                    model_proto_file, "wb"
-                ) as opened_file:
-                    opened_file.write(proto)
-
-                for k, v in export_map.items():
-                    weight_proto_file = os.path.join(f, k)
-                    with torch.serialization._open_file_like(
-                        weight_proto_file, "wb"
-                    ) as opened_file:
-                        opened_file.write(v)
-            else:
-                raise RuntimeError("Unknown export type")
-
+            _export_file(proto, f, export_type, export_map)
             # The ONNX checker only works for ONNX graph. So if the operator_export_type is not ONNX,
             # we can skip this check.
             # If large model format export is enabled, proto will only contain data location instead of
@@ -1660,6 +1625,138 @@ def _export(
     return torch_out
 
 
+@_beartype.beartype
+def _export_file(
+    model_bytes: bytes,
+    f: Union[io.BytesIO, str],
+    export_type: str,
+    export_map: Mapping[str, bytes],
+) -> None:
+    """export/write model bytes into directory/protobuf/zip"""
+    # TODO(titaiwang) MYPY asks for os.PathLike[str] type for parameter: f,
+    # but beartype raises beartype.roar.BeartypeDecorHintNonpepException,
+    # as os.PathLike[str] uncheckable at runtime
+    if export_type == _exporter_states.ExportTypes.PROTOBUF_FILE:
+        assert len(export_map) == 0
+        with torch.serialization._open_file_like(f, "wb") as opened_file:
+            opened_file.write(model_bytes)
+    elif export_type in [
+        _exporter_states.ExportTypes.ZIP_ARCHIVE,
+        _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE,
+    ]:
+        compression = (
+            zipfile.ZIP_DEFLATED
+            if export_type == _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE
+            else zipfile.ZIP_STORED
+        )
+        with zipfile.ZipFile(f, "w", compression=compression) as z:
+            z.writestr(_constants.ONNX_ARCHIVE_MODEL_PROTO_NAME, model_bytes)
+            for k, v in export_map.items():
+                z.writestr(k, v)
+    elif export_type == _exporter_states.ExportTypes.DIRECTORY:
+        if isinstance(f, io.BytesIO) or not os.path.isdir(f):  # type: ignore[arg-type]
+            raise ValueError(
+                f"f should be directory when export_type is set to DIRECTORY, instead get type(f): {type(f)}"
+            )
+        if not os.path.exists(f):  # type: ignore[arg-type]
+            os.makedirs(f)  # type: ignore[arg-type]
+
+        model_proto_file = os.path.join(f, _constants.ONNX_ARCHIVE_MODEL_PROTO_NAME)  # type: ignore[arg-type]
+        with torch.serialization._open_file_like(model_proto_file, "wb") as opened_file:
+            opened_file.write(model_bytes)
+
+        for k, v in export_map.items():
+            weight_proto_file = os.path.join(f, k)  # type: ignore[arg-type]
+            with torch.serialization._open_file_like(
+                weight_proto_file, "wb"
+            ) as opened_file:
+                opened_file.write(v)
+    else:
+        raise RuntimeError("Unknown export type")
+
+
+@_beartype.beartype
+def _add_onnxscript_fn(
+    model_bytes: bytes,
+    custom_opsets: Mapping[str, int],
+) -> bytes:
+    """Insert model-included custom onnx-script function into ModelProto"""
+
+    # TODO(titaiwang): remove this when onnx becomes dependency
+    try:
+        import onnx
+    except ImportError:
+        raise errors.OnnxExporterError("Module onnx is not installed!")
+
+    # For > 2GB model, onnx.load_fromstring would fail. However, because
+    # in _export_onnx, the tensors should be saved separately if the proto
+    # size > 2GB, and if it for some reason did not, the model would fail on
+    # serialization anyway in terms of the protobuf limitation. So we don't
+    # need to worry about > 2GB model getting here.
+    model_proto = onnx.load_from_string(model_bytes)
+
+    # Iterate graph nodes to insert only the included custom
+    # function_proto into model_proto
+    # TODO(titaiwang): Currently, onnxscript doesn't support ONNXFunction
+    # calling other ONNXFunction scenario, neither does it here
+    onnx_function_list = list()  # type: ignore[var-annotated]
+    included_node_func = set()  # type: Set[str]
+    # onnx_function_list and included_node_func are expanded in-place
+    _find_onnxscript_op(
+        model_proto.graph, included_node_func, custom_opsets, onnx_function_list
+    )
+
+    if onnx_function_list:
+        model_proto.functions.extend(onnx_function_list)
+        model_bytes = model_proto.SerializeToString()
+    return model_bytes
+
+
+@_beartype.beartype
+def _find_onnxscript_op(
+    graph_proto,
+    included_node_func: Set[str],
+    custom_opsets: Mapping[str, int],
+    onnx_function_list: List,
+):
+    """Recursively iterate ModelProto to find ONNXFunction op as it may contain control flow Op."""
+    for node in graph_proto.node:
+        node_kind = node.domain + "::" + node.op_type
+        # Recursive is needed for control flow nodes: IF/Loop which has inner graph_proto
+        for attr in node.attribute:
+            if attr.g is not None:
+                _find_onnxscript_op(
+                    attr.g, included_node_func, custom_opsets, onnx_function_list
+                )
+        # Only custom Op with ONNX function and aten with symbolic_fn should be found in registry
+        onnx_function_group = registration.registry.get_function_group(node_kind)
+        # Ruled out corner cases: onnx/prim in registry
+        if (
+            node.domain
+            and not jit_utils.is_aten(node.domain)
+            and not jit_utils.is_prim(node.domain)
+            and not jit_utils.is_onnx(node.domain)
+            and onnx_function_group is not None
+            and node_kind not in included_node_func
+        ):
+            specified_version = custom_opsets.get(node.domain, 1)
+            onnx_fn = onnx_function_group.get(specified_version)
+            if onnx_fn is not None:
+                # TODO(titaiwang): to_function_proto is onnx-script API and can be annotated
+                # after onnx-script is dependency
+                onnx_function_list.append(onnx_fn.to_function_proto())  # type: ignore[attr-defined]
+                included_node_func.add(node_kind)
+                continue
+            raise errors.UnsupportedOperatorError(
+                node_kind,
+                specified_version,
+                onnx_function_group.get_min_supported()
+                if onnx_function_group
+                else None,
+            )
+    return onnx_function_list, included_node_func
+
+
 @_beartype.beartype
 def _apply_friendly_debug_names(graph, params):
     for n in graph.nodes():
@@ -1959,7 +2056,9 @@ def _verify_custom_op_name(symbolic_name: str):
 
 @_beartype.beartype
 def register_custom_op_symbolic(
-    symbolic_name: str, symbolic_fn: Callable, opset_version: int
+    symbolic_name: str,
+    symbolic_fn: Callable,
+    opset_version: int,
 ):
     """Registers a symbolic function for a custom operator.
 

From 8e4a34b0eff1f799e8e3c9f9f3f01f2d5a450864 Mon Sep 17 00:00:00 2001
From: Angel Avila <angel.j.avila@gmail.com>
Date: Wed, 16 Nov 2022 16:30:56 +0000
Subject: [PATCH 0964/1922] Fix python types in pybind function signatures
 (#89115)

Fixes #88958

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89115
Approved by: https://github.com/ezyang
---
 torch/csrc/utils/pybind.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h
index 85532a42cee26..c582dee1d2f64 100644
--- a/torch/csrc/utils/pybind.h
+++ b/torch/csrc/utils/pybind.h
@@ -36,7 +36,7 @@ template <>
 struct TORCH_PYTHON_API type_caster<at::Tensor> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(at::Tensor, _("at::Tensor"));
+  PYBIND11_TYPE_CASTER(at::Tensor, _("torch.Tensor"));
 
   bool load(handle src, bool);
 
@@ -51,7 +51,7 @@ template <>
 struct type_caster<at::Storage> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(at::Storage, _("at::Storage"));
+  PYBIND11_TYPE_CASTER(at::Storage, _("torch.storage._StorageBase"));
 
   bool load(handle src, bool) {
     PyObject* obj = src.ptr();
@@ -74,7 +74,7 @@ template <>
 struct type_caster<at::Generator> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(at::Generator, _("at::Generator"));
+  PYBIND11_TYPE_CASTER(at::Generator, _("torch.Generator"));
 
   bool load(handle src, bool) {
     PyObject* obj = src.ptr();
@@ -97,7 +97,7 @@ template <>
 struct TORCH_PYTHON_API type_caster<at::IntArrayRef> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(at::IntArrayRef, _("at::IntArrayRef"));
+  PYBIND11_TYPE_CASTER(at::IntArrayRef, _("typing.Tuple[int, ...]"));
 
   bool load(handle src, bool);
   static handle cast(
@@ -129,7 +129,7 @@ template <>
 struct TORCH_PYTHON_API type_caster<at::MemoryFormat> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(at::MemoryFormat, _("at::MemoryFormat"));
+  PYBIND11_TYPE_CASTER(at::MemoryFormat, _("torch.memory_format"));
 
   bool load(handle src, bool) {
     PyObject* obj = src.ptr();
@@ -151,7 +151,7 @@ template <>
 struct type_caster<at::Device> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(at::Device, _("at::Device"));
+  PYBIND11_TYPE_CASTER(at::Device, _("torch.device"));
 
   // PYBIND11_TYPE_CASTER defines a member field called value. Since at::Device
   // cannot be default-initialized, we provide this constructor to explicitly
@@ -206,7 +206,7 @@ struct type_caster<c10::DispatchKey>
 template <>
 struct type_caster<c10::SymInt> {
  public:
-  PYBIND11_TYPE_CASTER(c10::SymInt, _("SymInt"));
+  PYBIND11_TYPE_CASTER(c10::SymInt, _("torch._prims_common.IntLike"));
   bool load(py::handle src, bool);
 
   static py::handle cast(
@@ -218,7 +218,7 @@ struct type_caster<c10::SymInt> {
 template <>
 struct type_caster<c10::SymFloat> {
  public:
-  PYBIND11_TYPE_CASTER(c10::SymFloat, _("SymFloat"));
+  PYBIND11_TYPE_CASTER(c10::SymFloat, _("torch._prims_common.FloatLike"));
   bool load(py::handle src, bool);
 
   static py::handle cast(

From c9a0ac4c8179e4e77ded0b6bcf020a93f30523f7 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 16 Nov 2022 16:36:27 +0000
Subject: [PATCH 0965/1922] Revert "SymIntify convolution backend calculation
 (#89069)"

This reverts commit 09ed8b67e24cfe29f3fa7b5dd28eaa7749229f12.

Reverted https://github.com/pytorch/pytorch/pull/89069 on behalf of https://github.com/DanilBaibak due to breaking internal builds
---
 aten/src/ATen/native/ConvUtils.h        |  79 ++----
 aten/src/ATen/native/Convolution.cpp    | 319 +++++++++++-------------
 aten/src/ATen/native/utils/ParamUtils.h |   7 +-
 c10/core/SymInt.h                       |  13 -
 torch/csrc/Module.cpp                   |  12 +-
 5 files changed, 174 insertions(+), 256 deletions(-)

diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
index 880ce0c2af54a..b8e2b0842a002 100644
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@@ -110,8 +110,8 @@ enum class ConvBackend {
 // This overload is exposed to python for testing, etc.
 TORCH_API ConvBackend select_conv_backend(
     const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt,
-    IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation,
-    bool transposed, SymIntArrayRef output_padding, int64_t groups, const at::OptionalSymIntArrayRef bias_sizes_opt);
+    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation,
+    bool transposed, IntArrayRef output_padding, int64_t groups, const at::OptionalIntArrayRef bias_sizes_opt);
 
 TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input,
     const Tensor& weight,
@@ -200,16 +200,15 @@ static void convolution_shape_check(
 // as conv_output_size loses information; this is why conv_input_size
 // takes an extra output_padding argument to resolve the ambiguity.
 
-template <typename T>
-static inline std::vector<T> _conv_output_size(
-    ArrayRef<T> input_size, ArrayRef<T> weight_size,
-    ArrayRef<T> padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
+static inline std::vector<int64_t> conv_output_size(
+    IntArrayRef input_size, IntArrayRef weight_size,
+    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
 ) {
   // ASSERT(input_size.size() > 2)
   // ASSERT(input_size.size() == weight_size.size())
   bool has_dilation = dilation.size() > 0;
   auto dim = input_size.size();
-  std::vector<T> output_size(dim);
+  std::vector<int64_t> output_size(dim);
   output_size[0] = input_size[input_batch_size_dim];
   output_size[1] = weight_size[weight_output_channels_dim];
   for (const auto d : c10::irange(2, dim)) {
@@ -220,84 +219,40 @@ static inline std::vector<T> _conv_output_size(
   return output_size;
 }
 
-static inline std::vector<int64_t> conv_output_size(
-    IntArrayRef input_size, IntArrayRef weight_size,
-    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
-) {
-  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
-}
-
-static inline std::vector<c10::SymInt> conv_output_size(
-    SymIntArrayRef input_size, SymIntArrayRef weight_size,
-    SymIntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
-) {
-  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
-}
-
-template <typename T>
-std::vector<T> _conv_input_size(
-    ArrayRef<T> output_size, ArrayRef<T> weight_size,
-    ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+static inline std::vector<int64_t> conv_input_size(
+    IntArrayRef output_size, IntArrayRef weight_size,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
 ) {
   // ASSERT(output_size.size() > 2)
   // ASSERT(output_size.size() == weight_size.size())
   auto dim = output_size.size();
-  std::vector<T> input_size(dim);
+  std::vector<int64_t> input_size(dim);
   input_size[0] = output_size[output_batch_size_dim];
   input_size[1] = weight_size[weight_input_channels_dim] * groups;
   for (const auto d : c10::irange(2, dim)) {
-    auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1;
-    input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) +
+    int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
+    input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) +
                      kernel + output_padding[d - 2];
   }
   return input_size;
 }
 
-static inline std::vector<c10::SymInt> conv_input_size(
-    SymIntArrayRef output_size, SymIntArrayRef weight_size,
-    SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
-) {
-  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
-}
-
-static inline std::vector<int64_t> conv_input_size(
-    IntArrayRef output_size, IntArrayRef weight_size,
+static inline std::vector<int64_t> conv_weight_size(
+    IntArrayRef input_size, IntArrayRef output_size,
     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
-) {
-  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
-}
-
-template <typename T>
-std::vector<T> _conv_weight_size(
-    ArrayRef<T> input_size, ArrayRef<T> output_size,
-    ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
 ) {
   auto dim = input_size.size();
-  std::vector<T> weight_size(dim);
+  std::vector<int64_t> weight_size(dim);
   weight_size[0] = output_size[1];
   weight_size[1] = input_size[1] / groups;
   for (const auto d : c10::irange(2, dim)) {
-    auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
-               + padding[d - 2] * 2 - output_padding[d - 2];
+    int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
+               + 2 * padding[d - 2] - output_padding[d - 2];
     weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
   }
   return weight_size;
 }
 
-static inline std::vector<c10::SymInt> conv_weight_size(
-    SymIntArrayRef input_size, SymIntArrayRef output_size,
-    SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
-) {
-  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
-}
-
-static inline std::vector<int64_t> conv_weight_size(
-    IntArrayRef input_size, IntArrayRef output_size,
-    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
-) {
-  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
-}
-
 static inline Tensor reshape_bias(int64_t dim, const Tensor& bias) {
   std::vector<int64_t> shape(dim, 1);
   shape[1] = -1;
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index bf7017f20a4fd..29b2ce804c806 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -83,11 +83,10 @@ constexpr int MIOPEN_DIM_MAX = 5;
 namespace at { namespace native {
 
 // Check workload to activate fast depthwise FP16 cudnn conv kernels
-template <typename T>
 bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) {
-  auto w = at::symint::size<T>(input, 3);  // same as h
-  auto ch = at::symint::size<T>(input, 1);
-  auto bs = at::symint::size<T>(input, 0);
+  int w = input.size(3);  // same as h
+  int ch = input.size(1);
+  int bs = input.size(0);
   if (stride==1) {
     if (w >= 7) {
       // All batch sizes and nb_channels
@@ -206,28 +205,27 @@ bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) {
 }
 
 // simplified version for cudnn 8.2 and above
-template <typename T>
 bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int stride, const at::Tensor& weight) {
   // 1D conv
-  if(at::symint::size<T>(input, 2) == 1 && stride == 1){
+  if(input.size(2) == 1 && stride == 1){
     return true;
   }
 
   // 2d conv
   // only square filters
-  if (at::symint::size<T>(weight, 2) != at::symint::size<T>(weight, 3)) return false;
-  auto filter = at::symint::size<T>(weight, 3);
+  if (weight.size(2) != weight.size(3)) return false;
+  int filter = weight.size(3);
   // only 1/3/5 filter
   if (filter != 1 && filter != 3 && filter != 5) return false;
   // we don't enforce square input but only check width to reduce heuristic space
-  if (at::symint::size<T>(input, 3) < 7) return false; // min width 7
-  auto w = at::symint::size<T>(input, 3);
+  if (input.size(3) < 7) return false; // min width 7
+  int w = input.size(3);
   // only 1/2 stride, use cudnn for all stride 1
   if (stride == 1) return true;
   if (stride != 2) return false;
 
-  auto ch = at::symint::size<T>(input, 1);
-  auto bs = at::symint::size<T>(input, 0);
+  int ch = input.size(1);
+  int bs = input.size(0);
   // special case since bs1 show good perf in lots of cases
   if (bs == 1) {
     if (filter == 1 && w <= 28) return true;
@@ -242,42 +240,13 @@ bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int str
 }
 
 
-bool xnnpack_use_convolution2d(
-    const Tensor& input,
-    const Tensor& weight,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const IntArrayRef padding,
-    const IntArrayRef stride,
-    const IntArrayRef dilation,
-    const int64_t groups,
-    const bool transposed) {
-  return xnnpack::use_convolution2d(input, weight, bias_sizes_opt, padding, stride, dilation, groups, transposed);
-}
-
-bool xnnpack_use_convolution2d(
-    const Tensor& input,
-    const Tensor& weight,
-    const at::OptionalSymIntArrayRef bias_sizes_opt,
-    const SymIntArrayRef padding,
-    const IntArrayRef stride,
-    const IntArrayRef dilation,
-    const int64_t groups,
-    const bool transposed) {
-  // Never use xnnpack for symbolic tracing
-  return false;
-}
-
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
-// This struct is templated so that we can run backend selection in a dynamic
-// shapes context; all of the real kernel selection in eager mode runs with
-// int64_t
-template <typename T>
 struct ConvParams {
   std::vector<int64_t> stride;
-  std::vector<T> padding;
+  std::vector<int64_t> padding;
   std::vector<int64_t> dilation;
   bool transposed;
-  std::vector<T> output_padding;
+  std::vector<int64_t> output_padding;
   int groups;
   bool benchmark;
   bool deterministic;
@@ -353,12 +322,12 @@ struct ConvParams {
 #if defined(__ARM_NEON__)
     // Currently only 3x3 depthwise convolutions on tensors of float are supported.
     return (input.ndimension() == 4) &&
-           (at::symint::size<T>(input, 1) == groups) &&
+           (input.size(1) == groups) &&
            (weight.ndimension() == 4 ) &&
-           (at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0) &&
-           (at::symint::size<T>(weight, 1) == 1) &&
-           (at::symint::size<T>(weight, 2) == 3) &&
-           (at::symint::size<T>(weight, 3) == 3) &&
+           (weight.size(0) % input.size(1) == 0) &&
+           (weight.size(1) == 1) &&
+           (weight.size(2) == 3) &&
+           (weight.size(3) == 3) &&
            (input.device().is_cpu()) &&
            (input.scalar_type() == at::kFloat) &&
            input.is_contiguous() &&
@@ -376,23 +345,23 @@ struct ConvParams {
 
   bool needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const {
     constexpr int64_t int_max = std::numeric_limits<int>::max();
-    auto numel_input = at::symint::numel<T>(input);
+    int64_t numel_input = input.numel();
     // empty input
     if (numel_input == 0) {
       return false;
     }
     // input size can not be reduced to the range of int by splitting the batch dim
-    auto n = at::symint::size<T>(input, 0);
+    int64_t n = input.size(0);
     if (numel_input / n > int_max) {
       return true;
     }
     // output size can not be reduced to the range of int by splitting the batch dim
-    T outsize = 1;
+    int64_t outsize = 1;
     if (transposed) {
-      auto o = conv_input_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, output_padding, stride, dilation, groups);
+      std::vector<int64_t> o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups);
       outsize = c10::multiply_integers(o.begin() + 1, o.end());
     } else {
-      auto o = conv_output_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, stride, dilation);
+      std::vector<int64_t> o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation);
       outsize = c10::multiply_integers(o.begin() + 1, o.end());
     }
     return outsize > int_max;
@@ -448,10 +417,10 @@ struct ConvParams {
                              is_depthwise(input, weight) &&
                              input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
                              !is_dilated() && // no dilation supported
-                             (stride[0] == stride[1] || at::symint::size<T>(input, 2) == 1) && // square or 1d
-                             at::symint::size<T>(input, 1) >= 32); // min 32 channels supported)
+                             (stride[0] == stride[1] || input.size(2) == 1) && // square or 1d
+                             input.size(1) >= 32); // min 32 channels supported)
         if (kernel_cond) {
-          return check_cudnn_depthwise_workload_with_filter<T>(input, stride[1], weight);
+          return check_cudnn_depthwise_workload_with_filter(input, stride[1], weight);
         }
       }
       // keep (7600 <= cudnn < 8200) code unchanged
@@ -461,14 +430,14 @@ struct ConvParams {
                            weight.scalar_type() == kHalf &&
                            is_depthwise(input, weight) &&
                            input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
-                           at::symint::size<T>(weight, 2) == at::symint::size<T>(weight, 3) && // only square kernels
-                           at::symint::size<T>(input, 2) >= 7 && // min width/height 7
+                           weight.size(2) == weight.size(3) && // only square kernels
+                           input.size(2) >= 7 && // min width/height 7
                            !is_dilated() && // no dilation supported
                            stride[0] == stride[1] && // equal strides
-                           ((at::symint::size<T>(weight, 3) == 3) || (at::symint::size<T>(weight, 3) == 1)) &&
-                           at::symint::size<T>(input, 1) >= 32); // min 32 channels supported)
+                           ((weight.size(3) == 3) || (weight.size(3) == 1)) &&
+                           input.size(1) >= 32); // min 32 channels supported)
       if (kernel_cond) {
-        return check_cudnn_depthwise_workload<T>(input, stride[0]);
+        return check_cudnn_depthwise_workload(input, stride[0]);
       } else {
         return false;
       }
@@ -504,12 +473,12 @@ struct ConvParams {
        !transposed && // or transposed tensors
        // For 1x1 filters, MKLDNN is faster than THNN when multi-threaded,
        // but THNN is faster when single-threaded.
-       (is_strided() || is_dilated() || at::symint::size<T>(input, 0) >= 16 ||
-        at::symint::size<T>(weight, -1) != 1 || at::symint::size<T>(weight, -2) != 1 || at::get_num_threads() > 1) &&
+       (is_strided() || is_dilated() || input.size(0) >= 16 ||
+        weight.size(-1) != 1 || weight.size(-2) != 1 || at::get_num_threads() > 1) &&
        (groups > 1
-        || (at::symint::size<T>(weight, -1) > 3 && at::symint::size<T>(weight, -2) > 3)
-        || at::symint::size<T>(input, 0) > 1
-        || at::symint::size<T>(input, 0)*at::symint::size<T>(input, 1)*at::symint::size<T>(input, 2)*at::symint::size<T>(input, 3) > 20480) // for some case, native is faster
+        || (weight.size(-1) > 3 && weight.size(-2) > 3)
+        || input.size(0) > 1
+        || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480) // for some case, native is faster
         );
 
 #endif
@@ -524,23 +493,20 @@ struct ConvParams {
            !transposed &&   // or transposed tensors
            input.ndimension() == 4 && // must be in NCHW format
            weight.ndimension() == 4 &&
-           (at::symint::size<T>(weight, 2) < 17) && (at::symint::size<T>(weight, 3) < 17) // NNPACK only supports kernels up to 16x16
+           (weight.size(2) < 17) && (weight.size(3) < 17) // NNPACK only supports kernels up to 16x16
 #if !defined(C10_MOBILE)
-           && at::symint::size<T>(input, 0) >= 16 // ensure large enough batch size to ensure perf, tuneable
+           && input.size(0) >= 16 // ensure large enough batch size to ensure perf, tuneable
 #endif
        ;
 #endif
     return false;
   }
   bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight,
-                   const at::OptionalArrayRef<T> bias_sizes_opt) const {
+                   const at::OptionalIntArrayRef bias_sizes_opt) const {
 #if defined(C10_MOBILE)
     if (!transposed) {
-      // NB: for the call here, it MATTERS that we are templated. If you
-      // untemplate this to always use SymInt, the function
-      // xnnpack_use_convolution2d will always return false
-      return (at::symint::size<T>(input, 1) == groups) &&
-              xnnpack_use_convolution2d(
+      return (input.size(1) == groups) &&
+              xnnpack::use_convolution2d(
                   input,
                   weight,
                   bias_sizes_opt,
@@ -577,12 +543,33 @@ struct ConvParams {
     return input.is_cuda() &&
            !transposed &&
            (input.ndimension() == 4 || input.ndimension() == 5) &&
-           at::symint::size<T>(input, 1) == groups &&
+           input.size(1) == groups &&
            groups > 1 && // no point if there is only a single group
-           at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0; // output channels must be a multiple of input channels
+           weight.size(0) % input.size(1) == 0; // output channels must be a multiple of input channels
   }
 };
 
+// Function to select the convolution backend based on the inputs and params.
+// This overload is used within the convolution internals but not exposed to python.
+// NB: The forward pass provides a bias tensor while the backward pass provides
+// a bool indicating whether the bias is defined. This is done to save memory by
+// avoiding saving the full bias tensor for backward.
+ConvBackend _select_conv_backend(
+    const Tensor& input,
+    const Tensor& weight,
+    const c10::optional<Tensor>& bias_opt,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const bool need_backward,
+    const ConvParams& params);
+
+// For BC reasons, have a copy that does not require bias_opt
+ConvBackend select_conv_backend(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const bool need_backward,
+    const ConvParams& params);
+
 DEFINE_DISPATCH(conv_depthwise2d_backward_stub);
 DEFINE_DISPATCH(conv_depthwise3d_backward_stub);
 DEFINE_DISPATCH(cudnn_convolution_backward_stub);
@@ -604,14 +591,13 @@ REGISTER_NO_CPU_DISPATCH(miopen_convolution_backward_stub);
 REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub);
 REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub);
 
-template <typename T>
-std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params) {
+std::ostream& operator<<(std::ostream & out, const ConvParams& params) {
   out << "ConvParams {"
       << "  stride = " << IntArrayRef{params.stride}
-      << "  padding = " << ArrayRef<T>{params.padding}
+      << "  padding = " << IntArrayRef{params.padding}
       << "  dilation = " << IntArrayRef{params.dilation}
       << "  transposed = " << params.transposed
-      << "  output_padding = " << ArrayRef<T>{params.output_padding}
+      << "  output_padding = " << IntArrayRef{params.output_padding}
       << "  groups = " << params.groups
       << "  benchmark = " << params.benchmark
       << "  deterministic = " << params.deterministic
@@ -621,10 +607,9 @@ std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params) {
   return out;
 }
 
-template <typename T>
 static void check_shape_forward(const at::Tensor& input,
-                                const c10::ArrayRef<T>& weight_sizes, const at::Tensor& bias,
-                                const ConvParams<T>& params) {
+                                const c10::IntArrayRef& weight_sizes, const at::Tensor& bias,
+                                const ConvParams& params) {
   int64_t k = input.ndimension();
   int64_t weight_dim = weight_sizes.size();
   int64_t groups = params.groups;
@@ -639,7 +624,7 @@ static void check_shape_forward(const at::Tensor& input,
   TORCH_CHECK(weight_dim == k,
            "Expected ", weight_dim, "-dimensional input for ", weight_dim,
            "-dimensional weight ", weight_sizes, ", but got ", k, "-dimensional input of size ",
-           at::symint::sizes<T>(input), " instead");
+           input.sizes(), " instead");
   TORCH_CHECK(weight_sizes[0] >= groups,
            "Given groups=", groups, ", expected weight to be at least ", groups,
            " at dimension 0, but got weight of size ", weight_sizes, " instead");
@@ -649,23 +634,23 @@ static void check_shape_forward(const at::Tensor& input,
            "] instead");
 
   if (!transposed) {
-    std::vector<T> input_shape;
-    std::vector<T> kernel_shape;
+    std::vector<int64_t> input_shape;
+    std::vector<int64_t> kernel_shape;
     bool kernel_size_correct = true;
 
-    TORCH_CHECK(at::symint::size<T>(input, 1) == (weight_sizes[1] * groups),
+    TORCH_CHECK(input.size(1) == (weight_sizes[1] * groups),
                 "Given groups=", groups, ", weight of size ", weight_sizes,
                 ", expected input", input.sizes(), " to have ",
-                (weight_sizes[1] * groups), " channels, but got ", at::symint::size<T>(input, 1),
+                (weight_sizes[1] * groups), " channels, but got ", input.size(1),
                 " channels instead");
 
-    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && at::symint::size<T>(bias, 0) == weight_sizes[0]),
+    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]),
              "Given weight of size ", weight_sizes,
              ", expected bias to be 1-dimensional with ", weight_sizes[0], " elements",
-             ", but got bias of size ", at::symint::sizes<T>(bias), " instead");
+             ", but got bias of size ", bias.sizes(), " instead");
 
     for (const auto i : c10::irange(2, k)) {
-      input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
+      input_shape.push_back(input.size(i) + 2 * padding[i-2]);
       // log new kernel size considering dilation
       kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
       if (input_shape.back() < kernel_shape.back()) {
@@ -691,23 +676,22 @@ static void check_shape_forward(const at::Tensor& input,
                "Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
     }
   } else { // transposed
-    TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
+    TORCH_CHECK(input.size(1) == weight_sizes[0],
              "Given transposed=", transposed, ", weight of size ", weight_sizes,
              ", expected input", input.sizes(), " to have ", weight_sizes[0],
-             " channels, but got ", at::symint::size<T>(input, 1), " channels instead");
-    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && at::symint::size<T>(bias, 0) == weight_sizes[1] * groups),
+             " channels, but got ", input.size(1), " channels instead");
+    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[1] * groups),
              "Given transposed=", transposed, ", weight of size ", weight_sizes,
              ", expected bias to be 1-dimensional with ", weight_sizes[1] * groups, " elements",
              ", but got bias of size ", bias.sizes(), " instead");
   }
 }
 
-template <typename T>
 static void check_shape_backward(
     const at::Tensor& input,
-    const c10::ArrayRef<T>& weight_sizes,
-    const ConvParams<T>& params) {
-  check_shape_forward<T>(input, weight_sizes, /*bias=*/ Tensor(), params);
+    const c10::IntArrayRef& weight_sizes,
+    const ConvParams& params) {
+  check_shape_forward(input, weight_sizes, /*bias=*/ Tensor(), params);
 }
 
 // Given an input tensor and an expected number of spatial dimensions, checks that the
@@ -1165,25 +1149,71 @@ at::Tensor convolution_overrideable(
   TORCH_CHECK_NOT_IMPLEMENTED(false, "convolution_overrideable not implemented. You are likely triggering this with tensor backend other than CPU/CUDA/MKLDNN, if this is intended, please use TORCH_LIBRARY_IMPL to override this function ");
 }
 
-// Function to select the convolution backend based on the inputs and params.
-// This overload is used within the convolution internals but not exposed to python.
-// NB: The forward pass provides a bias tensor while the backward pass provides
-// a bool indicating whether the bias is defined. This is done to save memory by
-// avoiding saving the full bias tensor for backward.
-template <typename T>
+// Selects a backend for convolution based on the inputs and params.
+ConvBackend select_conv_backend(
+    const Tensor& input_r, const Tensor& weight_r, const c10::optional<Tensor>& bias_opt,
+    IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_,
+    bool transposed_, IntArrayRef output_padding_, int64_t groups_, const at::OptionalIntArrayRef bias_sizes_opt) {
+  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+
+  auto& ctx = at::globalContext();
+  auto k = weight_r.ndimension();
+  int64_t dim = k - 2;
+  ConvParams params;
+  params.stride = expand_param_if_needed(stride_, "stride", dim);
+  params.padding = expand_param_if_needed(padding_, "padding", dim);
+  params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
+  params.transposed = transposed_;
+  params.output_padding = expand_param_if_needed(output_padding_, "output_padding", dim);
+  params.groups = groups_;
+  params.benchmark = ctx.benchmarkCuDNN();
+  params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
+  params.cudnn_enabled = ctx.userEnabledCuDNN();
+  params.allow_tf32 = ctx.allowTF32CuDNN();
+
+  auto input = input_r;
+  auto weight = weight_r;
+  check_shape_forward(input, weight.sizes(), bias, params);
+
+  // Expand 1d -> 2d.
+  // This is only done for backends that don't natively support 1d spatial input.
+  if (k == 3 && !input.is_mkldnn() && !input.is_xpu()) {
+    // avoid accidentally going through NHWC for permuted 3d input.
+    input = input.contiguous();
+    params.view1d_as_2d();
+    input = view4d(input);
+    weight = view4d(weight);
+  }
+
+  auto bias_sizes = bias.defined() ? c10::optional<IntArrayRef>(bias.sizes()) : bias_sizes_opt;
+  bool need_backward = GradMode::is_enabled() &&
+      (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
+  return _select_conv_backend(input, weight, bias, bias_sizes, need_backward, params);
+}
+
+ConvBackend select_conv_backend(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const bool need_backward,
+    const ConvParams& params) {
+  return _select_conv_backend(input, weight, {}, bias_sizes_opt, need_backward, params);
+}
+
 ConvBackend _select_conv_backend(
     const Tensor& input,
     const Tensor& weight,
     const c10::optional<Tensor>& bias,
-    const at::OptionalArrayRef<T> bias_sizes_opt,
+    const at::OptionalIntArrayRef bias_sizes_opt,
     const bool need_backward,
-    const ConvParams<T>& params) {
+    const ConvParams& params) {
 
   // don't send empty inputs through backends
-  if (at::symint::size<T>(input, 0) == 0 || at::symint::size<T>(input, 1) == 0) {
+  if (input.size(0) == 0 || input.size(1) == 0) {
     return input.is_mkldnn() ? ConvBackend::MkldnnEmpty : ConvBackend::Empty;
-  } else if (at::symint::numel<T>(input) == 0) {
-    TORCH_CHECK(false, "Only zero batch or zero channel inputs are supported, but got input shape: ", at::symint::sizes<T>(input));
+  } else if (input.numel() == 0) {
+    TORCH_CHECK(false, "Only zero batch or zero channel inputs are supported, but got input shape: ", input.sizes());
   }
 
   if (params.is_depthwise(input, weight)) {
@@ -1275,65 +1305,12 @@ ConvBackend _select_conv_backend(
   AT_ERROR("unsupported ConvNd parameters");
 }
 
-// Selects a backend for convolution based on the inputs and params.
-ConvBackend select_conv_backend(
-    const Tensor& input_r, const Tensor& weight_r, const c10::optional<Tensor>& bias_opt,
-    IntArrayRef stride_, SymIntArrayRef padding_, IntArrayRef dilation_,
-    bool transposed_, SymIntArrayRef output_padding_, int64_t groups_, const at::OptionalSymIntArrayRef bias_sizes_opt) {
-  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
-  const Tensor& bias = *bias_maybe_owned;
-
-  auto& ctx = at::globalContext();
-  auto k = weight_r.ndimension();
-  int64_t dim = k - 2;
-  ConvParams<c10::SymInt> params;
-  params.stride = expand_param_if_needed(stride_, "stride", dim);
-  params.padding = expand_param_if_needed(padding_, "padding", dim);
-  params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
-  params.transposed = transposed_;
-  params.output_padding = expand_param_if_needed(output_padding_, "output_padding", dim);
-  params.groups = groups_;
-  params.benchmark = ctx.benchmarkCuDNN();
-  params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
-  params.cudnn_enabled = ctx.userEnabledCuDNN();
-  params.allow_tf32 = ctx.allowTF32CuDNN();
-
-  auto input = input_r;
-  auto weight = weight_r;
-  check_shape_forward(input, weight.sym_sizes(), bias, params);
-
-  // Expand 1d -> 2d.
-  // This is only done for backends that don't natively support 1d spatial input.
-  if (k == 3 && !input.is_mkldnn() && !input.is_xpu()) {
-    // avoid accidentally going through NHWC for permuted 3d input.
-    input = input.contiguous();
-    params.view1d_as_2d();
-    input = view4d(input);
-    weight = view4d(weight);
-  }
-
-  auto bias_sizes = bias.defined() ? c10::optional<SymIntArrayRef>(bias.sym_sizes()) : bias_sizes_opt;
-  bool need_backward = GradMode::is_enabled() &&
-      (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
-  return _select_conv_backend(input, weight, bias, bias_sizes, need_backward, params);
-}
-
-// For BC reasons, have a copy that does not require bias_opt
-ConvBackend select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams<int64_t>& params) {
-  return _select_conv_backend(input, weight, {}, bias_sizes_opt, need_backward, params);
-}
-
 at::Tensor _convolution_nogroup_backend(
     const Tensor& input,
     const Tensor& weight,
     const Tensor& bias,
     const ConvBackend backend,
-    const ConvParams<int64_t>& params) {
+    const ConvParams& params) {
   auto kernel_size = weight.sizes().slice(2);
   switch(backend) {
     case ConvBackend::NnpackSpatial:
@@ -1364,7 +1341,7 @@ at::Tensor _convolution_nogroup_backend(
 static inline std::vector<int64_t> calc_output_size(
     const Tensor& input,
     const Tensor& weight,
-    const ConvParams<int64_t>& params) {
+    const ConvParams& params) {
   std::vector<int64_t> output_size = params.transposed ?
     conv_input_size(input.sizes(), weight.sizes(), params.padding, params.output_padding,
         params.stride, params.dilation, params.groups) :
@@ -1445,7 +1422,7 @@ at::Tensor _convolution(
   TORCH_CHECK(dim > 0, "weight should have at least three dimensions");
   TORCH_CHECK(groups_ > 0, "non-positive groups is not supported");
 
-  ConvParams<int64_t> params;
+  ConvParams params;
   params.stride = expand_param_if_needed(stride_, "stride", dim);
   params.padding = expand_param_if_needed(padding_, "padding", dim);
   params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
@@ -1473,7 +1450,7 @@ at::Tensor _convolution(
   auto bias_sizes_opt = bias.defined() ? c10::optional<IntArrayRef>(bias.sizes()) : c10::nullopt;
   bool need_backward = GradMode::is_enabled() &&
       (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
-  ConvBackend backend = _select_conv_backend(input, weight, bias, c10::OptionalIntArrayRef(bias_sizes_opt), need_backward, params);
+  ConvBackend backend = _select_conv_backend(input, weight, bias, bias_sizes_opt, need_backward, params);
   at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight, backend);
 
   // Call the backend.
@@ -1686,7 +1663,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
   auto weight = weight_r;
 
   int64_t dim = weight.ndimension() - 2;
-  ConvParams<int64_t> params;
+  ConvParams params;
   params.stride = expand_param_if_needed(stride_, "stride", dim);
   params.padding = expand_param_if_needed(padding_, "padding", dim);
   params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
@@ -1749,7 +1726,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
   if (ggI.defined()) {
 
     // Modified params with correct padding
-    ConvParams<int64_t> gw_conv_params(params);
+    ConvParams gw_conv_params(params);
 
     // Disable groups as they are handled separately
     auto groups = gw_conv_params.groups;
@@ -1818,7 +1795,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
   Tensor gI;
   if (input.numel() != 0) {
     if (ggW.defined()) {
-      ConvParams<int64_t> gi_conv_params(params);
+      ConvParams gi_conv_params(params);
       gi_conv_params.transposed = !params.transposed;
 
       if (params.transposed) {
@@ -1874,7 +1851,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _convolution_backward_nogroup_bac
     const Tensor& weight,
     const std::array<bool, 3> output_mask,
     const ConvBackend backend,
-    const ConvParams<int64_t>& params) {
+    const ConvParams& params) {
   auto kernel_size = weight.sizes().slice(2);
   switch(backend) {
     case ConvBackend::Slow2d:
@@ -1939,7 +1916,7 @@ std::tuple<Tensor, Tensor, Tensor> convolution_backward(
   TORCH_CHECK(dim > 0, "weight should have at least three dimensions");
 
   auto& ctx = at::globalContext();
-  ConvParams<int64_t> params;
+  ConvParams params;
   params.stride = expand_param_if_needed(stride, "stride", dim);
   params.padding = expand_param_if_needed(padding, "padding", dim);
   params.dilation = expand_param_if_needed(dilation, "dilation", dim);
diff --git a/aten/src/ATen/native/utils/ParamUtils.h b/aten/src/ATen/native/utils/ParamUtils.h
index 7c89a3316cb4b..376467ff79cf5 100644
--- a/aten/src/ATen/native/utils/ParamUtils.h
+++ b/aten/src/ATen/native/utils/ParamUtils.h
@@ -6,13 +6,12 @@
 namespace at {
 namespace native {
 
-template <typename T>
-inline std::vector<T> expand_param_if_needed(
-    ArrayRef<T> list_param,
+inline std::vector<int64_t> expand_param_if_needed(
+    IntArrayRef list_param,
     const char* param_name,
     int64_t expected_dim) {
   if (list_param.size() == 1) {
-    return std::vector<T>(expected_dim, list_param[0]);
+    return std::vector<int64_t>(expected_dim, list_param[0]);
   } else if ((int64_t)list_param.size() != expected_dim) {
     std::ostringstream ss;
     ss << "expected " << param_name << " to be a single integer value or a "
diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h
index 6355f13395053..9ab72a0776804 100644
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@@ -235,19 +235,6 @@ inline c10::SymInt multiply_integers(const C& container) {
       [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
 }
 
-template <
-    typename Iter,
-    typename = std::enable_if_t<std::is_same<
-        typename std::iterator_traits<Iter>::value_type,
-        c10::SymInt>::value>>
-inline c10::SymInt multiply_integers(Iter begin, Iter end) {
-  return std::accumulate(
-      begin,
-      end,
-      c10::SymInt(1),
-      [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
-}
-
 inline SymInt operator+(int64_t a, const SymInt& b) {
   return c10::SymInt(a) + b;
 }
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 6073736257249..b8693a484ed9d 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -1408,10 +1408,10 @@ Call this whenever a new thread is created in order to propagate values from
          const at::Tensor& weight,
          const c10::optional<at::Tensor>& bias_opt,
          at::IntArrayRef stride_,
-         at::SymIntArrayRef padding_,
+         at::IntArrayRef padding_,
          at::IntArrayRef dilation_,
          bool transposed_,
-         at::SymIntArrayRef output_padding_,
+         at::IntArrayRef output_padding_,
          int64_t groups_) {
         return at::native::select_conv_backend(
             input,
@@ -1442,13 +1442,13 @@ Call this whenever a new thread is created in order to propagate values from
          const at::Tensor& weight,
          const c10::optional<at::Tensor>& bias,
          at::IntArrayRef stride_,
-         at::SymIntArrayRef padding_,
+         at::IntArrayRef padding_,
          at::IntArrayRef dilation_,
          bool transposed_,
-         at::SymIntArrayRef output_padding_,
+         at::IntArrayRef output_padding_,
          int64_t groups_,
-         c10::optional<std::vector<c10::SymInt>> bias_sizes_opt) {
-        c10::OptionalArrayRef<c10::SymInt> ref = c10::nullopt;
+         c10::optional<std::vector<int64_t>> bias_sizes_opt) {
+        c10::OptionalArrayRef<int64_t> ref = c10::nullopt;
         if (bias_sizes_opt) {
           ref = (*bias_sizes_opt);
         }

From 51b7935af8642848b926418e05e0cd2d59872e42 Mon Sep 17 00:00:00 2001
From: Kirtesh Patil <kirtesh@meta.com>
Date: Wed, 16 Nov 2022 16:40:24 +0000
Subject: [PATCH 0966/1922] [UCC] Add pre & post processing for CPU collectives
 (#89030)

Summary: The CPU block in `collective_post` was missing pre & post processing. The reduce-scatter implementaion expects use of pre-processing callback to flatten the input tensors, however, the missing invocation meant grabage values were being passed.

Test Plan: Tested the reduce-scatter collective using PARAM

Reviewed By: eastzone

Differential Revision: D41291592

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89030
Approved by: https://github.com/kingchc, https://github.com/kwen2501
---
 torch/csrc/distributed/c10d/ProcessGroupUCC.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
index 5f286b7a716c5..ad135062a7024 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
@@ -789,7 +789,9 @@ c10::intrusive_ptr<Work> ProcessGroupUCC::collective_post(
         work->future_ = c10::make_intrusive<at::ivalue::Future>(
             c10::ListType::create(c10::TensorType::get()));
       }
+      preproc();
       comm->enqueue_collective(std::move(data), work, coll, team);
+      postproc();
       return work;
     }
 #ifdef USE_CUDA

From 09d479de907172977cc9c7053fe0c45335271815 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 16 Nov 2022 16:52:47 +0000
Subject: [PATCH 0967/1922] Revert "Towards unifying symbolic and non symbolic
 fake tensor (#89038)"

This reverts commit 37d54239c7ea88fd9c98dcac3fcc9b98a6f9e9d1.

Reverted https://github.com/pytorch/pytorch/pull/89038 on behalf of https://github.com/ezyang due to executorch segfaults
---
 aten/src/ATen/native/TensorFactories.cpp |  6 +++
 test/functorch/test_aotdispatch.py       |  1 +
 test/test_proxy_tensor.py                | 21 ++++++---
 torch/_meta_registrations.py             | 44 +++---------------
 torch/_ops.py                            |  1 -
 torch/_prims/__init__.py                 |  5 +-
 torch/_prims_common/__init__.py          |  3 --
 torch/_subclasses/fake_tensor.py         | 58 +++++++++++++++---------
 8 files changed, 68 insertions(+), 71 deletions(-)

diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 7245cb77b1c50..9d1c6d8a36333 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -325,6 +325,12 @@ Tensor empty_like(
   // See [Note: hacky wrapper removal for TensorOptions]
   TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory);
 
+
+  TORCH_CHECK(
+    !(options_.has_memory_format() && optional_memory_format.has_value()),
+    "Cannot set memory_format both in TensorOptions and explicit argument; please delete "
+    "the redundant setter.");
+
   TensorOptions options =
       self.options()
           .merge_in(options_)
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index ae216f9be4a49..1dc5476158f96 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1011,6 +1011,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.zeros_like.default - couldn't find symbolic meta function/decomposition
     xfail('digamma', ''),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 0a24807af55f0..8dc42be7fdfb2 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1151,7 +1151,9 @@ def f(a, b, c, d, e):
     xfail('cummin', ''),  # aten.cummin.default - couldn't find symbolic meta function/decomposition
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
+    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
+    xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
     xfail('dsplit', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
     xfail('fft.fft2', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('fft.fft', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
@@ -1233,6 +1235,8 @@ def f(a, b, c, d, e):
     xfail('lu', ''),  # aten.linalg_lu_factor_ex.default - couldn't find symbolic meta function/decomposition
     xfail('lu_solve', ''),  # aten.linalg_lu_solve.default - couldn't find symbolic meta function/decomposition
     xfail('lu_unpack', ''),  # aten.lu_unpack.default - couldn't find symbolic meta function/decomposition
+    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
+    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decomposition
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decomposition
     xfail('median', ''),  # Could not run 'aten::median' with arguments from the 'Meta' backend. This could be becau...
@@ -1277,6 +1281,7 @@ def f(a, b, c, d, e):
     xfail('nn.functional.pdist', ''),  # Could not run 'aten::_pdist_forward' with arguments from the 'Meta' backend...
     xfail('nn.functional.pixel_shuffle', ''),  # aten.pixel_shuffle.default - couldn't find symbolic meta function/decompos...
     xfail('nn.functional.pixel_unshuffle', ''),  # aten.pixel_unshuffle.default - couldn't find symbolic meta function/deco...
+    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.unfold', ''),  # aten.im2col.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
@@ -1293,6 +1298,7 @@ def f(a, b, c, d, e):
     xfail('polygamma', 'polygamma_n_2'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_3'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_4'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
+    xfail('put', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('quantile', ''),  # Could not run 'aten::equal' with arguments from the 'Meta' backend.
     xfail('qr', ''),  # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition
     xfail('rad2deg', ''),  # aten.rad2deg.default - couldn't find symbolic meta function/decomposition
@@ -1341,15 +1347,11 @@ def f(a, b, c, d, e):
 
 symbolic_tensor_failures.update(symbolic_tensor_segfaults)
 
-outplace_symbolic_tensor_failures = {
-    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
-    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
-}
-
 inplace_symbolic_tensor_failures = {
+    xfail('abs', ''),  # aten.abs_.default - couldn't find symbolic meta function/decomposition
     xfail('acos', ''),  # aten.acos_.default - couldn't find symbolic meta function/decomposition
     xfail('acosh', ''),  # aten.acosh_.default - couldn't find symbolic meta function/decomposition
+    xfail('addbmm', ''),  # aten.addbmm_.default - couldn't find symbolic meta function/decomposition
     xfail('addcdiv', ''),  # aten.addcdiv_.default - couldn't find symbolic meta function/decomposition
     xfail('addcmul', ''),  # aten.addcmul_.default - couldn't find symbolic meta function/decomposition
     xfail('addmm', ''),  # aten.addmm_.default - couldn't find symbolic meta function/decomposition
@@ -1363,6 +1365,7 @@ def f(a, b, c, d, e):
     xfail('clamp', ''),  # aten.clamp_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('clamp_max', ''),  # aten.clamp_max_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('clamp_min', ''),  # aten.clamp_min_.Tensor - couldn't find symbolic meta function/decomposition
+    xfail('conj_physical', ''),  # aten.conj_physical_.default - couldn't find symbolic meta function/decomposition
     xfail('copysign', ''),  # aten.copysign_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('cos', ''),  # aten.cos_.default - couldn't find symbolic meta function/decomposition
     xfail('cosh', ''),  # aten.cosh_.default - couldn't find symbolic meta function/decomposition
@@ -1379,6 +1382,7 @@ def f(a, b, c, d, e):
     xfail('expm1', ''),  # aten.expm1_.default - couldn't find symbolic meta function/decomposition
     xfail('float_power', ''),  # the base given to float_power_ has dtype Float but the operation's result requires dtype Double
     xfail('floor', ''),  # aten.floor_.default - couldn't find symbolic meta function/decomposition
+    xfail('floor_divide', ''),  # aten.floor_divide_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('fmod', ''),  # aten.fmod_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('frac', ''),  # aten.frac_.default - couldn't find symbolic meta function/decomposition
     xfail('ge', ''),  # aten.ge_.Tensor - couldn't find symbolic meta function/decomposition
@@ -1394,6 +1398,7 @@ def f(a, b, c, d, e):
     xfail('log1p', ''),  # aten.log1p_.default - couldn't find symbolic meta function/decomposition
     xfail('log2', ''),  # aten.log2_.default - couldn't find symbolic meta function/decomposition
     xfail('log', ''),  # aten.log_.default - couldn't find symbolic meta function/decomposition
+    xfail('logit', ''),  # aten.logit_.default - couldn't find symbolic meta function/decomposition
     xfail('lt', ''),  # aten.lt_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('mvlgamma', 'mvlgamma_p_1'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
     xfail('mvlgamma', 'mvlgamma_p_3'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
@@ -1403,6 +1408,7 @@ def f(a, b, c, d, e):
     xfail('neg', ''),  # aten.neg_.default - couldn't find symbolic meta function/decomposition
     xfail('nextafter', ''),  # aten.nextafter_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.celu', ''),  # aten.celu_.default - couldn't find symbolic meta function/decomposition
+    xfail('nn.functional.dropout3d', ''),  # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.elu', ''),  # aten.elu_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.hardsigmoid', ''),  # aten.hardsigmoid_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.mish', ''),  # aten.mish_.default - couldn't find symbolic meta function/decomposition
@@ -1420,6 +1426,7 @@ def f(a, b, c, d, e):
     xfail('sinh', ''),  # aten.sinh_.default - couldn't find symbolic meta function/decomposition
     xfail('sqrt', ''),  # aten.sqrt_.default - couldn't find symbolic meta function/decomposition
     xfail('square', ''),  # aten.pow_.Scalar - couldn't find symbolic meta function/decomposition
+    xfail('squeeze', ''),  # aten.squeeze_.default - couldn't find symbolic meta function/decomposition
     xfail('t', ''),  # aten.t_.default - couldn't find symbolic meta function/decomposition
     xfail('tan', ''),  # aten.tan_.default - couldn't find symbolic meta function/decomposition
     xfail('tanh', ''),  # aten.tanh_.default - couldn't find symbolic meta function/decomposition
@@ -1509,7 +1516,7 @@ def test_make_fx_fake_exhaustive(self, device, dtype, op):
     @skipIfNoSympy
     @ops(op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive',
-             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures | outplace_symbolic_tensor_failures)
+             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures)
     def test_make_fx_symbolic_exhaustive(self, device, dtype, op):
         _test_make_fx_helper(self, device, dtype, op, "symbolic")
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index abcd1ead8b433..4fa3ab09d2755 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1513,6 +1513,7 @@ def full(size, fill_value, *args, **kwargs):
         aten.randn_like.default,
         aten.rand_like.default,
         aten.full_like.default,
+        aten.zeros_like.default,
         aten.ones_like.default,
     ]
 )
@@ -1520,44 +1521,6 @@ def meta_like(self, *args, **kwargs):
     return aten.empty_like.default(self, **kwargs)
 
 
-# zeros_like is special cased to work for sparse
-@register_meta(aten.zeros_like.default)
-def zeros_like(
-    self, dtype=None, layout=None, device=None, pin_memory=None, memory_format=None
-):
-    if layout == torch.sparse_coo:
-        check(
-            memory_format is None,
-            lambda: "memory format option is only supported by strided tensors",
-        )
-
-        res = torch.empty(
-            0,
-            dtype=self.dtype if dtype is None else dtype,
-            layout=layout,
-            device=self.device if device is None else device,
-            pin_memory=pin_memory,
-        )
-
-        if self.is_sparse:
-            res.sparse_resize_and_clear_(
-                self.size(), self.sparse_dim(), self.dense_dim()
-            )
-        else:
-            res.sparse_resize_and_clear_(self.size(), self.dim(), 0)
-
-        res._coalesced_(True)
-        return res
-    return aten.empty_like.default(
-        self,
-        dtype=dtype,
-        layout=layout,
-        device=device,
-        pin_memory=pin_memory,
-        memory_format=memory_format,
-    )
-
-
 # hacky: Please remove after math.ceil works with arange
 @register_meta(aten.arange.default)
 def arange(end, **kwargs):
@@ -1931,6 +1894,11 @@ def activate_meta():
             # Instead, we should be letting those decompositions run, and writing meta kernels
             # only for the base operators.
             pass
+        elif op_overload.is_view:
+            # Attempting to register a python meta kernel for a view operator.
+            # We shouldn't do this, because the output will report as not having aliased storages.
+            # All view ops have meta kernels in C++ today, so we should use those instead.
+            pass
         elif op_overload.name() in {
             "aten::empty_strided",  # causing infinite recursion, test_meta.py
             "aten::clone",  # causing infinite recursion
diff --git a/torch/_ops.py b/torch/_ops.py
index b20398a7f3ab3..9163932144d0d 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -365,7 +365,6 @@ def handler(*args, **kwargs):
             return handler
 
         final_key = resolve_key(self, key)
-        # print(self, key, final_key)
         r = self.py_kernels.get(final_key, final_key)
         self._dispatch_cache[key] = r
         return r
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index a4bac68f0ff14..da8d9af723acf 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1150,6 +1150,9 @@ def _minimum_aten(
 
 #
 # View operations
+#
+# TODO: model view relationships
+# TODO: model storage
 def _as_strided_meta(
     a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int
 ) -> TensorLikeType:
@@ -1167,7 +1170,7 @@ def _as_strided_meta(
             a._typed_storage(), size, stride, storage_offset
         )
 
-    return torch.as_strided(a, size, stride, storage_offset)
+    return TensorMeta(a, shape=size, strides=stride)
 
 
 def _as_strided_aten(
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 041448e8102ac..128796dfa3d07 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -291,9 +291,6 @@ def is_non_overlapping_and_dense(a: Tensor) -> bool:
     its dimensions that is contiguous.
     """
 
-    if a.is_sparse:
-        return False
-
     # Short-circuits if the tensor is already contiguous or channels-last contiguous
     if is_contiguous(a) or is_channels_last_contiguous(a):
         return True
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 9a0ac050e6b94..5d3d3a0e32fe1 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1,6 +1,7 @@
 import contextlib
 import functools
 import itertools
+import sys
 import weakref
 from dataclasses import dataclass
 from functools import partial
@@ -296,9 +297,8 @@ def constructors(fake_mode, func, *args, **kwargs):
     out_device = new_kwargs.pop("device", None)
     out_device = out_device if out_device is not None else default_device
     new_kwargs["device"] = torch.device("meta")
-    # _like constructors have fake tensor inputs (maybe this causes the non-like
-    # to fail? hmmm)
-    with in_kernel_invocation_manager(fake_mode):
+    # Not in_kernel_invocation_manager as no fake tensor inputs
+    with no_dispatch():
         r = func(*args, **new_kwargs)
     return FakeTensor(fake_mode, r, out_device)
 
@@ -821,30 +821,40 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # is written to must be invalidated
         self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs)
 
-        # If there's a Python meta, prefer that over the decomposition
-        from torch._decomp import meta_table as meta_table
+        from torch._decomp import decomposition_table
+
+        with self:
+            # Decomposes CompositeImplicitAutograd ops
+            r = func.decompose(*args, **kwargs)
+            if r is not NotImplemented:
+                return r
 
-        if func not in meta_table and not self.cpp_meta_supports_symint(func):
-            from torch._decomp import decomposition_table
+        # IDK: feels bad man, sym_numel on as_strided infinite loops otherwise
+        if has_symbolic_sizes and not self.cpp_meta_supports_symint(func):
+            from torch._decomp import meta_table as meta_table
 
-            # Prefer Python decompositions over C++ ones
-            if func in decomposition_table and (
-                has_symbolic_sizes
-                or (
-                    # TODO: Remove these exclusions, so that we can remove
-                    # this leg entirely
-                    torch_decomp_decompositions(func)
-                    and all(not e.is_sparse for e in flat_arg_fake_tensors)
+            if func == aten.size.default:
+                sys.stderr.write(
+                    "Trying to call aten.size on a tensor with symbolic shapes. "
+                    "It's likely that this is from calling tensor.shape in C++"
                 )
-            ):
-                with self:
-                    return decomposition_table[func](*args, **kwargs)
+                # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
+                return None
 
             with self:
-                # Decomposes CompositeImplicitAutograd ops
-                r = func.decompose(*args, **kwargs)
-                if r is not NotImplemented:
+                if func in meta_table:
+                    r = meta_table[func](*args, **kwargs)
                     return r
+                if func in decomposition_table:
+                    return decomposition_table[func](*args, **kwargs)
+
+        if (
+            func in decomposition_table
+            and torch_decomp_decompositions(func)
+            and all(not e.is_sparse for e in flat_arg_fake_tensors)
+        ):
+            with self:
+                return decomposition_table[func](*args, **kwargs)
 
         # prims already wrap FakeTensor inputs to FakeTensor outputs
         # and do device logic, we dont need do anything but run them
@@ -855,6 +865,12 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             with self:
                 return func.prim_meta_impl(*args, **kwargs)
 
+        if has_symbolic_sizes:
+            if not self.cpp_meta_supports_symint(func):
+                raise RuntimeError(
+                    f"{func} - couldn't find symbolic meta function/decomposition"
+                )
+
         # special handling for funcs registered through `register_op_impl`,
         # e.g., manipulating args on constructor calls to construct meta tensors
         # and then afterwards wrapping them to a FakeTensor

From 63ca11ae42799e6929819962241c97f897d6af84 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@meta.com>
Date: Wed, 16 Nov 2022 16:59:36 +0000
Subject: [PATCH 0968/1922] add quantize_decomposed_dynamic to op lib (#88855)

Summary: Needed for dynamic quant reference pattern graphs.

Test Plan: added unittest

Differential Revision: D41205030

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88855
Approved by: https://github.com/jerryzh168
---
 .../core/test_quantized_tensor.py             | 25 +++++++++++++
 torch/ao/quantization/fx/_decomposed.py       | 36 +++++++++++++++----
 torch/ao/quantization/utils.py                |  1 -
 3 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index a2043509f1f13..dab53de5b1075 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -14,6 +14,7 @@
 from torch.testing._internal.common_utils import TestCase, DeterministicGuard
 import torch.testing._internal.hypothesis_utils as hu
 from torch.testing._internal.common_quantization import get_supported_device_types
+from torch.ao.quantization import MinMaxObserver
 
 hu.assert_deadline_disabled()
 
@@ -1498,6 +1499,30 @@ def test_decomposed_dequantize(self):
         self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
         self.assertEqual(dequantized_X, dequantized_decomposed_X)
 
+    def test_decomposed_quantize_dynamic(self):
+        import torch.ao.quantization.fx._decomposed
+        X = torch.randn(5, 10)
+        dtype = torch.uint8
+        qdtype = torch.quint8
+        scale, zero_point = torch._choose_qparams_per_tensor(X, False)
+        quant_min, quant_max = 0, 255
+
+        quantized_X = torch.quantize_per_tensor(X, scale, zero_point, qdtype)
+        dequantized_X = torch.dequantize(quantized_X)
+
+        quantized_decomposed_X = torch.ops.quantized_decomposed.quantize_per_tensor_dynamic(
+            X, quant_min, quant_max, dtype)
+
+        # observer logic is what quantize_per_tensor_dynamic does internally
+        observer = MinMaxObserver(quant_min=quant_min, quant_max=quant_max)
+        observer(X)
+        scale_decomposed, zero_point_decomposed = observer.calculate_qparams()
+        dequantized_decomposed_X = torch.ops.quantized_decomposed.dequantize_per_tensor(
+            quantized_decomposed_X, scale_decomposed, zero_point_decomposed, quant_min, quant_max, dtype
+        )
+        self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
+        self.assertEqual(dequantized_X, dequantized_decomposed_X)
+
 if __name__ == '__main__':
     raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
                        "\tpython test/test_quantization.py TESTNAME\n\n"
diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py
index 001fa16f8cd3f..3f4d38872e174 100644
--- a/torch/ao/quantization/fx/_decomposed.py
+++ b/torch/ao/quantization/fx/_decomposed.py
@@ -1,16 +1,13 @@
 import torch
-from torch.library import Library, impl
+from torch.library import impl, Library
+from torch.ao.quantization import MinMaxObserver
 
 # Note: decomposed means decomposed quantized tensor, using decomposed so that the
 # name is not too long
 quantized_decomposed_lib = Library("quantized_decomposed", "DEF")
 
-quantized_decomposed_lib.define(
-    "quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
-
-@impl(quantized_decomposed_lib, "quantize_per_tensor", "CompositeExplicitAutograd")
-def quantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
-    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+# Helper to check the passed in quant min and max are valid for the dtype
+def _quant_min_max_bounds_check(quant_min, quant_max, dtype):
     quant_min_lower_bound = 0
     quant_max_upper_bound = 0
     if dtype == torch.uint8:
@@ -30,6 +27,14 @@ def quantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
         "quant_max out of bound for dtype, " \
         f"quant_max_upper_bound: {quant_max_upper_bound} quant_max: {quant_max}"
 
+quantized_decomposed_lib.define(
+    "quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "quantize_per_tensor", "CompositeExplicitAutograd")
+def quantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+
     inv_scale = 1.0 / scale
     return torch.clamp(torch.round(input * inv_scale) + zero_point, quant_min, quant_max).to(dtype)
 
@@ -50,3 +55,20 @@ def dequantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype)
         return (input.to(torch.float32) - zero_point) * scale
     else:
         raise ValueError(f"Unsupported dtype in dequantize_per_tensor: {dtype}")
+
+quantized_decomposed_lib.define(
+    "quantize_per_tensor_dynamic(Tensor input, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "quantize_per_tensor_dynamic", "CompositeExplicitAutograd")
+def quantize_per_tensor_dynamic(input, quant_min, quant_max, dtype):
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+
+    # Its weird to create an observer manually just to calculate qparams. I tried refactoring this functionality out of observer
+    # into a util and then use that util directly, but I kept running into jit typing errors related to torch.qscheme not
+    # being recognized as a type. TODO: properly refactor this out to avoid observer overhead
+    tensor_dtype_to_observer_dtype = {torch.uint8: torch.quint8, torch.int8: torch.qint8}
+    observer = MinMaxObserver(quant_min=quant_min, quant_max=quant_max, dtype=tensor_dtype_to_observer_dtype[dtype])
+    observer(input)
+    scale, zero_point = observer.calculate_qparams()
+    return torch.ops.quantized_decomposed.quantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py
index afa278a795dd0..9f3dc712a9fe6 100644
--- a/torch/ao/quantization/utils.py
+++ b/torch/ao/quantization/utils.py
@@ -546,7 +546,6 @@ def _patched_module_call(self, *args, **kwargs):
         torch.nn.Module.__call__ = orig_module_call
     return fqn_to_example_inputs
 
-
 __all__ = [
     "NodePattern",
     "Pattern",

From 4c916b933e41fdc713b12031061cd69bb221f112 Mon Sep 17 00:00:00 2001
From: bmedishe <bmedishe@amd.com>
Date: Wed, 16 Nov 2022 17:42:26 +0000
Subject: [PATCH 0969/1922] test_unary_ufuncs few tests enabled on rocm which
 are passing (#89007)

This PR is to enable tests which are skip on rocm from test package test_unary_ufuncs.py::TestUnaryUfuncsCUDA

<html>
<body>
<!--StartFragment--><div ccp_infra_version='3' ccp_infra_timestamp='1667423453335' ccp_infra_user_hash='1693798314' ccp_infra_copy_id='81491a4a-67e6-4e87-aa71-47d953d2499a' data-ccp-timestamp='1667423453335'><html><head><meta name=ProgId content=Excel.Sheet><meta name=Generator content="Microsoft Excel 15"></head><body link="#0563C1" vlink="#954F72">

test_file | test_name | test_class
-- | -- | --
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_2_cuda_float16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_2_cuda_float32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_2_cuda_float64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_2_cuda_int16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_2_cuda_int32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_2_cuda_int64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_4_cuda_float16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_4_cuda_float32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_4_cuda_float64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_4_cuda_int16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_4_cuda_int32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_polygamma_polygamma_n_4_cuda_int64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_large_tan_cuda_float64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_atan_cuda_bfloat16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_atan_cuda_float16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_atan_cuda_float32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_atan_cuda_float64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_atan_cuda_int16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_atan_cuda_int32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_atan_cuda_int64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_atan_cuda_int8 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_atan_cuda_uint8 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_2_cuda_float16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_2_cuda_float32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_2_cuda_float64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_2_cuda_int16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_2_cuda_int32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_2_cuda_int64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_2_cuda_int8 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_2_cuda_uint8 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_4_cuda_float16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_4_cuda_float32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_4_cuda_float64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_4_cuda_int16 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_4_cuda_int32 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_4_cuda_int64 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_4_cuda_int8 | (__main__.TestUnaryUfuncsCUDA)
test_unary_ufuncs | test_reference_numerics_small_polygamma_polygamma_n_4_cuda_uint8 | (__main__.TestUnaryUfuncsCUDA)

</body></html></div><!--EndFragment-->
</body>
</html>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89007
Approved by: https://github.com/mruberry
---
 .../_internal/common_methods_invocations.py     | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index e498e4f285092..1b8920cbc8670 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -8740,7 +8740,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    decorators=(precisionOverride({torch.bfloat16: 1e-2}),),
                    skips=(
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_small',
-                                    active_if=TEST_WITH_ROCM, device_type='cuda'),
+                                    active_if=TEST_WITH_ROCM, device_type='cuda', dtypes=[torch.complex64, torch.complex128]),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
                                     active_if=TEST_WITH_ROCM, device_type='cuda', dtypes=[torch.complex128]),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
@@ -13320,9 +13320,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
                                     device_type='cpu', dtypes=[torch.cfloat, torch.cdouble],
                                     active_if=(IS_MACOS or IS_WINDOWS)),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
-                                    device_type='cuda', dtypes=[torch.float64],
-                                    active_if=TEST_WITH_ROCM),
                        DecorateInfo(unittest.skip("Skipped! sparse backward not supported"),
                                     'TestSparseUnaryUfuncs', 'test_sparse_fn_grad'),
                    ),
@@ -13841,11 +13838,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                        DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestCommon'),
                        # Mismatch: https://github.com/pytorch/pytorch/issues/55357
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal'),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
-                                    active_if=TEST_WITH_ROCM),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_small',
-                                    active_if=TEST_WITH_ROCM),),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal'),),
                    sample_kwargs=lambda device, dtype, input: ({'n': 2}, {'n': 2}),
                    # polygamma functions have multiple singularities at x <= 0
                    reference_numerics_filter=NumericsFilter(condition=lambda x: x < 0.1, safe_val=1)),
@@ -13888,11 +13881,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                        DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators'),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestCommon'),
                        # Mismatch: https://github.com/pytorch/pytorch/issues/55357
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal'),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
-                                    active_if=TEST_WITH_ROCM),
-                       DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_small',
-                                    active_if=TEST_WITH_ROCM),),
+                       DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal'),),
                    sample_kwargs=lambda device, dtype, input: ({'n': 4}, {'n': 4}),
                    # polygamma functions have multiple singularities at x <= 0
                    reference_numerics_filter=NumericsFilter(condition=lambda x: x < 0.1, safe_val=1)),

From 02ca947b71dd5ee5b97b30263feffbabcedb27ee Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Wed, 16 Nov 2022 14:09:58 +0000
Subject: [PATCH 0970/1922] Simplify linspace decomp and increase its tolerance
 (#87203)

This is an interesting one

Since this is an operation that's intrinsically defined on the reals,
we should perform the ops on that dtype always, and just cast to
the desired dtype at the end. This simplifies the decomposition.

Now, I started looking at this one when I started seeing failures on a
test that's added in a later PR. What's going on here is that, by doing
an upcast to a higher dtype and then cast down to integers, sometimes
there's an off-by-one error. I think this is fine, as the decomposition
is more accurate than the original function, which goes in line with
the whole PrimTorch effort.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87203
Approved by: https://github.com/mruberry
---
 test/test_decomp.py                           | 11 ++-
 torch/_refs/__init__.py                       | 89 ++++++++-----------
 .../_internal/common_methods_invocations.py   | 19 +---
 3 files changed, 50 insertions(+), 69 deletions(-)

diff --git a/test/test_decomp.py b/test/test_decomp.py
index a3658792c5e71..dc94b6714ccd9 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -201,8 +201,17 @@ def op_assert_equal(test_case, op, test_dtype, orig, decomp, args, kwargs):
         (torch.complex64, torch.ops.aten.mv.default): (5e-5, 5e-5),
         (torch.float64, torch.ops.aten.upsample_bicubic2d.vec) : (1e-5, 5e-4),
         (torch.float64, torch.ops.aten.upsample_bicubic2d.default) : (1e-5, 5e-4),
+        # The decomposition is TOO correct. It computes everything in int64, so sometimes
+        # there's an off-by-one error. See
+        # https://github.com/pytorch/pytorch/issues/81996
+        # https://github.com/pytorch/pytorch/issues/82230
+        (torch.int8, torch.ops.aten.linspace.default) : (0, 1),
+        (torch.uint8, torch.ops.aten.linspace.default) : (0, 1),
+        (torch.int16, torch.ops.aten.linspace.default) : (0, 1),
+        (torch.int32, torch.ops.aten.linspace.default) : (0, 1),
+        (torch.int64, torch.ops.aten.linspace.default) : (0, 1),
     }
-    if (test_dtype, op) in tol_table:
+    if (decomp.dtype, op) in tol_table:
         rtol, atol = tol_table[(decomp.dtype, op)]
     else:
         rtol, atol = _getDefaultRtolAndAtol(orig.dtype, decomp.dtype)
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index a0916c3f8268a..111c5c956f5d6 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -4182,8 +4182,9 @@ def lerp(start: Tensor, end: Tensor, weight: Union[Tensor, NumberType]):
         )
     assert isinstance(weight, Tensor)  # mypy
     # We implement it this way for numerical stability. We assume (in the stability optimisation)
-    # that 0 <= weight <= 1. We take the abs to deal with comples numbers
-    # We want to do operations near zero, which is where floating points are most precise
+    # that 0 <= weight <= 1. We take the abs to deal with complex numbers
+    # We want to perform operations near zero, which is where floating points are most precise
+    # thus, we perform the following optimisation:
     # If weight.abs() >= 0.5:
     #    return (1 - weight) * (start - end) + end
     mask = weight.abs() >= 0.5
@@ -4205,22 +4206,22 @@ def linspace(
     pin_memory: bool = False,
     requires_grad: bool = False,
 ) -> TensorLikeType:
-    if dtype is None:
-        dtype = torch.get_default_dtype()
-
-    # NB: NumPy actually doesn't do this cast, but for this ref, I'd rather have this
-    #     cast than not, because it allows us to always go into the precise path
-    #     if dtype is integral and not worry about whether start/end are float
-    if prims.utils.is_integer_dtype(dtype):
-        if isinstance(start, FloatLike):
-            start = sym_int(start)
-        if isinstance(end, FloatLike):
-            end = sym_int(end)
-
     if py_any(isinstance(arg, complex) for arg in (start, end, steps)):
-        raise NotImplementedError
-    assert not isinstance(start, complex) and not isinstance(end, complex)  # for mypy
+        default_complex_dtype = utils.corresponding_complex_dtype(
+            torch.get_default_dtype()
+        )
+        if dtype is None:
+            dtype = default_complex_dtype
+        else:
+            check(
+                utils.is_complex_dtype(dtype),
+                lambda: f"linspace(): inferred dtype {default_complex_dtype} can't be safely cast to passed dtype {dtype}",
+            )
+    else:
+        dtype = dtype or torch.get_default_dtype()
+    assert isinstance(dtype, torch.dtype)
 
+    # steps does not participate in the computation of the dtype
     check(
         isinstance(steps, IntLike),
         lambda: "steps must be int, not float",
@@ -4236,41 +4237,27 @@ def linspace(
         "requires_grad": requires_grad,
     }
     if steps == 0:
-        ret = torch.full((0,), 0, dtype=dtype, **factory_kwargs)  # type: ignore[call-overload]
-    elif steps == 1:
-        ret = torch.full((1,), start, dtype=dtype, **factory_kwargs)  # type: ignore[call-overload]
-    elif start == end:
-        ret = torch.full((steps,), start, dtype=dtype, **factory_kwargs)  # type: ignore[call-overload]
-    else:
-        if prims.utils.is_integer_dtype(dtype):
-            # We need to cast to int, so to avoid off-by-one issues
-            # do the entire computation with ints when we can
-            assert isinstance(start, IntLike) and isinstance(end, IntLike)
-            step_size_x_denom = end - start
-            eps = 1 if end > start else -1
-            denom = steps - 1
-            ret = prims.to_dtype(
-                torch.arange(
-                    start * denom,
-                    end * denom + eps,
-                    step_size_x_denom,
-                    dtype=torch.int64,
-                    **factory_kwargs,  # type: ignore[arg-type]
-                )
-                / denom,
-                dtype,
-            )
-        else:
-            step_size = (end - start) / (steps - 1)
-            eps = step_size / 2
-            ret = prims.to_dtype(
-                torch.arange(  # type: ignore[call-overload]
-                    start, end + eps, step_size, dtype=torch.float64, **factory_kwargs
-                ),
-                dtype,
-            )
-
-    return ret
+        return torch.full((0,), 0, dtype=dtype, **factory_kwargs)  # type: ignore[arg-type]
+    if steps == 1:
+        return torch.full((1,), start, dtype=dtype, **factory_kwargs)  # type: ignore[arg-type]
+    if start == end:
+        return torch.full((steps,), start, dtype=dtype, **factory_kwargs)  # type: ignore[arg-type]
+
+    # arange returns values in the interval [start, end) so we add an an eps to make it [start, end]
+    # The eps is small enough as to always add just the element end
+    step_size = 1 / (steps - 1)
+    eps = step_size / 2
+    # arange returns a tensor of size divup(end - start, step) and thus, for the arguemnts below
+    # ceil(div(1 + step_size/2,  1/(steps - 1)) = steps - 1  + ceil(1 / 2) = steps
+    # torch.arange is an scan algorithm, so we need a high-precision dtype
+    rg = torch.arange(
+        0, 1 + eps, step_size, dtype=torch.float64, **factory_kwargs  # type: ignore[arg-type]
+    )
+    double_dtype = torch.complex128 if utils.is_complex_dtype(dtype) else torch.float64
+    rg = _maybe_convert_to_dtype(rg, double_dtype)  # type: ignore[assignment]
+    cast = partial(torch.full, (1,), dtype=double_dtype, **factory_kwargs)
+    out = torch.lerp(cast(start), cast(end), rg)
+    return _maybe_convert_to_dtype(out, dtype)  # type: ignore[return-value]
 
 
 @register_decomposition(torch.ops.aten.logspace)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 1b8920cbc8670..f81aa4f5024c2 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -9937,21 +9937,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
 
                # UserWarning not triggered : Resized a non-empty tensor but did not warn about it.
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
-
-               # cpu implementation is wrong on some integral types
-               # https://github.com/pytorch/pytorch/issues/81996
-               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_quick',
-                            dtypes=(torch.int16, torch.int32, torch.int64), device_type="cpu"),
-               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_comprehensive',
-                            dtypes=(torch.int16, torch.int32, torch.int64), device_type="cpu"),
-               # cuda implementation is off-by-one on some inputs due to precision issues
-               # https://github.com/pytorch/pytorch/issues/82230
-               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_quick',
-                            dtypes=(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64),
-                            device_type="cuda"),
-               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_comprehensive',
-                            dtypes=(torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64),
-                            device_type="cuda"),
                # UserWarning: CUDA caching allocator reports a memory leak not verified by the driver API
                # in __main__.TestJitCUDA.test_variant_consistency_jit_logspace_cuda_complex64!
                # Caching allocator allocated memory was 0 and is now reported as 307200 on device 0.
@@ -16965,9 +16950,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             # cpu implementation is wrong on some integral types
             # https://github.com/pytorch/pytorch/issues/81996
             DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_torch_fallback',
-                         dtypes=(torch.int16, torch.int32, torch.int64), device_type="cpu"),
+                         dtypes=(torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64), device_type="cpu"),
             DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref',
-                         dtypes=(torch.int16, torch.int32, torch.int64), device_type="cpu"),
+                         dtypes=(torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64), device_type="cpu"),
 
             # cuda implementation is off-by-one on some inputs due to precision issues
             # https://github.com/pytorch/pytorch/issues/82230

From 6ef05b9d1b600ff700ef43c2510400fc277f8513 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Wed, 16 Nov 2022 14:09:59 +0000
Subject: [PATCH 0971/1922] Fix decomp for embedding_backward and simplify the
 decomposition of embedding_dense and embedding_dense_backward (#87204)

See the title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87204
Approved by: https://github.com/Chillee
---
 test/test_decomp.py             |  2 --
 torch/_decomp/decompositions.py | 54 ++++++++++++++++-----------------
 2 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/test/test_decomp.py b/test/test_decomp.py
index dc94b6714ccd9..ad8cf27ae0f21 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -310,8 +310,6 @@ def normalize_op_input_output(f, sample, requires_grad=True):
 
 CROSS_REF_BACKWARD_EXCLUDE_SET = {
     # Decomposed backward formula is not as precise
-    ("cuda", torch.float16, "nn.functional.embedding"),
-    ("cuda", torch.bfloat16, "nn.functional.embedding"),
     ("cpu", torch.bfloat16, "nn.functional.hardswish"),
     ("cuda", torch.float16, "nn.functional.cross_entropy"),
 }
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 7c84cb7e2ca8b..7e3d31bb97466 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -12,7 +12,12 @@
 from torch import Tensor
 from torch._decomp import register_decomposition
 from torch._prims_common import IntLike, NumberType, TensorLike, TensorSequenceType
-from torch._prims_common.wrappers import _maybe_resize_out, _safe_copy_out, out_wrapper
+from torch._prims_common.wrappers import (
+    _maybe_convert_to_dtype,
+    _maybe_resize_out,
+    _safe_copy_out,
+    out_wrapper,
+)
 from torch.fx.experimental.symbolic_shapes import guard_int, sym_float, sym_int
 from torch.utils._pytree import tree_flatten, tree_map
 
@@ -1039,22 +1044,19 @@ def embedding(
     sparse: bool = False,
 ) -> Tensor:
     assert weight.dim() == 2, "'weight' must be 2-D"
-    # TODO: Assert not ported over yet
-    #   auto indices_arg = TensorArg(indices, "indices", 1);
-    #   checkScalarTypes("embedding", indices_arg, {kLong, kInt});
-
-    if indices.dim() == 1:
-        return weight.index_select(0, indices)
-
-    size = list(indices.shape)
-    for d in weight.shape[1:]:
-        size.append(d)
-
-    return weight.index_select(0, indices.reshape(-1)).view(size)
+    # Nb. scale_grad_by_freq is not used in the forward
+    if indices.ndim <= 1:
+        # We need this one as weight[indices] calls item() in these cases
+        out = weight.index_select(0, indices)
+        if indices.ndim == 0:
+            out = out.squeeze(0)
+        return out
+    else:
+        return weight[indices]
 
 
-# TODO: Correct the type promotion semantics
 @register_decomposition(aten.embedding_dense_backward)
+@pw_cast_for_opmath
 def embedding_dense_backward(
     grad_output: Tensor,
     indices: Tensor,
@@ -1062,22 +1064,20 @@ def embedding_dense_backward(
     padding_idx: int,
     scale_grad_by_freq: bool,
 ):
-    numel = indices.numel()
-    grad = grad_output.reshape(numel, grad_output.size(-1))
-    grad_weight = grad_output.new_zeros((num_weights, grad_output.shape[-1]))
-    indices_rank1 = indices.reshape(numel)
+    indices = _maybe_convert_to_dtype(indices, torch.long)  # type: ignore[assignment]
     if scale_grad_by_freq:
         counts = indices.new_zeros((num_weights,))
-        ones = indices.new_ones((numel,))
-        counts = counts.index_put([indices_rank1], ones, accumulate=True)
-        grad_weights_scale = counts[indices_rank1]
-        grad = grad / grad_weights_scale.unsqueeze(1)
-    skip_padding = (indices_rank1 != padding_idx).unsqueeze(1)
-    skip_padding = skip_padding.expand_as(grad)
-    zero_grad = torch.full_like(grad, 0)
-    return grad_weight.index_put(
-        [indices_rank1], torch.where(skip_padding, grad, zero_grad), accumulate=True
+        ones = torch.ones_like(indices)
+        counts = counts.index_put([indices], ones, accumulate=True)
+        grad_weights_scale = counts[indices]
+        grad_output = grad_output / grad_weights_scale.unsqueeze(1)
+
+    mask = _unsqueeze_to_dim(indices == padding_idx, grad_output.ndim)
+    grad = grad_output.masked_fill(mask, 0)
+    grad_weight = grad_output.new_zeros(
+        (num_weights,) + grad_output.shape[indices.ndim :]
     )
+    return grad_weight.index_put([indices], grad, accumulate=True)
 
 
 def prod(x: List[int]):

From 49ce75d196efd06608b08b304efece76d11883d0 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Wed, 16 Nov 2022 14:09:59 +0000
Subject: [PATCH 0972/1922] Add bfloat16 support to torch.prod to align with
 torch.cumprod (#87205)

As per title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87205
Approved by: https://github.com/mruberry
---
 aten/src/ATen/native/cpu/ReduceOpsKernel.cpp         |  2 +-
 .../testing/_internal/common_methods_invocations.py  |  5 +----
 .../testing/_internal/opinfo/definitions/_masked.py  | 12 ++----------
 3 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
index a4345c3fd5d86..a82f3ed3eaa1d 100644
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -184,7 +184,7 @@ static void prod_kernel_impl(TensorIterator& iter) {
         // NOLINTNEXTLINE(bugprone-argument-comment)
         /*identity=*/1);
   } else {
-    AT_DISPATCH_ALL_TYPES_AND_COMPLEX(iter.dtype(), "prod_cpu", [&] {
+    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, iter.dtype(), "prod_out_cpu", [&] {
       binary_kernel_reduce_vec(
           iter,
           [=](scalar_t a, scalar_t b)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f81aa4f5024c2..af4539ee5fecc 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -9312,9 +9312,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            skips=(
                # cumprod does not handle correctly out= dtypes
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
-               # RuntimeError: "prod_cpu" not implemented for 'BFloat16'
-               DecorateInfo(unittest.expectedFailure, 'TestDecomp', 'test_comprehensive',
-                            dtypes=(torch.bfloat16,), device_type='cpu'),
            ),
            # gradgradcheck fails in fast_mode=True: #56275
            sample_inputs_func=sample_inputs_cumprod,
@@ -16441,7 +16438,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         supports_fwgrad_bwgrad=True,
         promotes_int_to_int64=True,
         gradcheck_nondet_tol=GRADCHECK_NONDET_TOL,
-        dtypes=all_types_and_complex_and(torch.bool),
+        dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
         sample_inputs_func=sample_inputs_prod,
         ref=reference_reduction_numpy(np.prod),
diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py
index 92231229bb5ec..f4b590fe25202 100644
--- a/torch/testing/_internal/opinfo/definitions/_masked.py
+++ b/torch/testing/_internal/opinfo/definitions/_masked.py
@@ -446,8 +446,8 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
         supports_sparse=True,
         supports_sparse_csr=True,
         promotes_int_to_int64=True,
-        # FIXME: "prod_cpu" not implemented for 'Half' or 'BFloat16'
-        dtypes=all_types_and_complex_and(torch.bool),
+        # FIXME: "prod_cpu" not implemented for 'Half'
+        dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
         dtypesIfCUDA=all_types_and_complex_and(
             torch.bool, torch.float16, torch.bfloat16
         ),
@@ -549,14 +549,6 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
             DecorateInfo(
                 unittest.skip("Skipped!"), "TestJit", "test_variant_consistency_jit"
             ),
-            # RuntimeError: "prod_cpu" not implemented for 'BFloat16'
-            DecorateInfo(
-                unittest.expectedFailure,
-                "TestDecomp",
-                "test_comprehensive",
-                dtypes=(torch.bfloat16,),
-                device_type="cpu",
-            ),
             DecorateInfo(
                 toleranceOverride({torch.float32: tol(atol=1e-5, rtol=1e-5)}),
                 "TestCompositeCompliance",

From 76e02469fa41648daaab3625f7138531a9ba0bd7 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Wed, 16 Nov 2022 12:40:27 +0000
Subject: [PATCH 0973/1922] Run tests from test/inductor in inductor CI job
 (#88957)

CUDA inductor tests are currently not run in CI because the only jobs
that have triton installed don't actually run these test.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88957
Approved by: https://github.com/ngimel, https://github.com/seemethere
---
 .jenkins/pytorch/test.sh                   |  1 +
 test/inductor/test_torchinductor_opinfo.py | 36 ++++++----------------
 test/run_test.py                           |  1 +
 3 files changed, 11 insertions(+), 27 deletions(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 5fa54f538f35f..135fb50762d6f 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -250,6 +250,7 @@ test_inductor_distributed() {
 
 test_inductor() {
   python test/run_test.py --include test_modules test_ops --verbose
+  PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include inductor/test_torchinductor --include inductor/test_torchinductor_opinfo --verbose
   # TODO: investigate "RuntimeError: CUDA driver API confirmed a leak"
   # seen intest_ops_gradients.py
   # pytest test/test_ops_gradients.py --verbose -k "not _complex and not test_inplace_grad_acos_cuda_float64"
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 7db9d13733b4d..67b64c73a8ef5 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -127,35 +127,14 @@ def process(device_type):
 }
 
 inductor_skips["cuda"] = {
-    # flaky
-    "__rdiv__": {b8, f16, f32, f64, i32, i64},
-    "masked.prod": {f16, f32, f64},
-    "linalg.vander": {f32, f64},
-    "sparse.sampled_addmm": {f32, f64},
-    "broadcast_tensors": {f16, f32, f64},
-    "dsplit": {f16, f32, f64},
     # Jiterator kernel is not expected to work with inductor
     "jiterator_2inputs_2outputs": {b8, f16, f32, f64, i32, i64},
     "jiterator_4inputs_with_extra_args": {b8, f16, f32, f64, i32, i64},
     "jiterator_binary": {b8, f16, f32, f64, i32, i64},
     "jiterator_binary_return_by_ref": {b8, f16, f32, f64, i32, i64},
     "jiterator_unary": {b8, f16, f32, f64, i32, i64},
-    # Disabled on migration to core
-    "linalg.pinv.singular": {f32, f64},
-    "linalg.householder_product": {f32},
-    # These might be passing now?
-    "__getitem__": {b8, f16, f32, f64, i32, i64},
-    "nn.functional.conv_transpose3d": {f16},
-    "max.reduction_with_dim": {i32, i64},
-    "min.reduction_with_dim": {i32, i64},
-    "linalg.lu": {f32, f64},
-    "lu_unpack": {f32, f64},
+    # flaky
     "native_batch_norm": {f16, f32, f64},
-    "native_layer_norm": {f16, f32, f64},
-    # Issues on sm86 periodic job (complex numbers)
-    "cdouble": {b8, f16, f32, f64, i32, i64},
-    "cfloat": {b8, f16, f32, f64, i32, i64},
-    "randint": {b8, f16, f32, f64, i32, i64},
 }
 
 inductor_expected_failures_single_sample = defaultdict(dict)
@@ -280,6 +259,7 @@ def process(device_type):
     "mH": {b8, f16, f32, f64, i32, i64},
     "mT": {b8, f16, f32, f64, i32, i64},
     "__getitem__": {b8, f16, f32, f64, i32, i64},
+    "__rdiv__": {b8, f16, f32, f64, i32, i64},
     "allclose": {f16, f32, f64},
     "angle": {f32, f64},
     "argwhere": {b8, f16, f32, f64, i32, i64},
@@ -287,6 +267,8 @@ def process(device_type):
     "bernoulli": {f16, f32, f64},
     "bincount": {i32, i64},
     "bucketize": {b8, f16, f32, f64, i32, i64},
+    "cdouble": {b8, f16, f32, f64, i32, i64},
+    "cfloat": {b8, f16, f32, f64, i32, i64},
     "chalf": {b8, f16, f32, f64, i32, i64},
     "cholesky": {f32, f64},
     "combinations": {b8, f16, f32, f64, i32, i64},
@@ -322,13 +304,13 @@ def process(device_type):
     "linalg.lstsq.grad_oriented": {f32, f64},
     "linalg.matrix_rank": {f32, f64},
     "linalg.matrix_rank.hermitian": {f32, f64},
-    "lu_unpack": {f32, f64},
+    "linalg.pinv.singular": {f32, f64},
     "masked.argmax": {f16, f32, f64, i32},
     "masked.argmin": {f16, f32, f64, i32},
     "masked_scatter": {f16, f32, f64},
     "masked_select": {b8, f16, f32, f64, i32, i64},
-    "max.reduction_with_dim": {b8, i32, i64},
-    "min.reduction_with_dim": {b8, i32, i64},
+    "max.reduction_with_dim": {b8},
+    "min.reduction_with_dim": {b8},
     "multinomial": {f16, f32, f64},
     "nn.functional.adaptive_avg_pool2d": {f16},
     "nn.functional.ctc_loss": {f32, f64},
@@ -346,12 +328,14 @@ def process(device_type):
     "pow": {i32, i64},
     "rand_like": {f16, f32, f64},
     "randint_like": {f16, f32, f64, i32, i64},
+    "randint": {f16, f32, f64, i32, i64},
     "randn_like": {f16, f32, f64},
     "repeat_interleave": {b8, f16, f32, f64, i32, i64},
     "round.decimals_3": {f16},
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
     "sgn": {f16, f32, f64},
+    "sparse.sampled_addmm": {f32, f64},
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
     "tensor_split": {b8, f16, f32, f64, i32, i64},
@@ -375,8 +359,6 @@ def process(device_type):
     "linalg.vector_norm": {f64, f64},
     "kron": {f16},
     "nanquantile": {f32, f64},
-    "native_batch_norm": {f16, f32, f64},
-    "native_layer_norm": {f16, f32, f64},
     "nn.functional._scaled_dot_product_attention": {f16},
     "nn.functional.avg_pool2d": {f16, f32, f64},
     "nn.functional.batch_norm.without_cudnn": {f16},
diff --git a/test/run_test.py b/test/run_test.py
index 1273ab45c4fbc..8a25a2e707853 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -792,6 +792,7 @@ def run_test_ops(test_module, test_directory, options):
     "distributed/rpc/test_share_memory": get_run_test_with_subprocess_fn(),
     "distributed/rpc/cuda/test_tensorpipe_agent": get_run_test_with_subprocess_fn(),
     "doctests": run_doctests,
+    "inductor/test_torchinductor_opinfo": run_test_ops,
     "test_ops": run_test_ops,
     "test_ops_gradients": run_test_ops,
     "test_ops_fwd_gradients": run_test_ops,

From 47f30a33c8fdd07f8444e570cef683cb3e39b18e Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 16 Nov 2022 18:25:38 +0000
Subject: [PATCH 0974/1922] Support test-config filter logic for rocm (#89046)

The logic used by `mem_leak_check` https://github.com/pytorch/pytorch/pull/88373 is currently not applied to rocm, i.e. https://hud.pytorch.org/pytorch/pytorch/commit/06486cd0087200e08ebb8a9518e064251c7c5309 because its workflows don't have the test-config filtering logic yet (linux, mac, and windows all have it already). In another work, rocm tests always run with mem leak check disabled at the moment. We want that but also to run the test with mem leak check enabled periodically one per day.  This PR closes that gap

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89046
Approved by: https://github.com/clee2000
---
 .github/workflows/_rocm-test.yml | 28 +++++++++++++++++++++++++---
 .github/workflows/periodic.yml   | 20 +++++++++++---------
 .github/workflows/pull.yml       |  5 +++++
 .github/workflows/trunk.yml      | 11 ++++++-----
 4 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml
index dd1a0830275cd..be4a5c9dcc6cd 100644
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@@ -39,12 +39,34 @@ env:
   GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
 
 jobs:
+  # This needs to be run right before the test starts so that it can gather the
+  # latest labels from the PR
+  filter:
+    runs-on: [self-hosted, linux.large]
+    outputs:
+      test-matrix: ${{ steps.filter.outputs.test-matrix }}
+      is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        with:
+          fetch-depth: 1
+          submodules: false
+
+      - name: Select all requested test configurations
+        id: filter
+        uses: ./.github/actions/filter-test-configs
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          test-matrix: ${{ inputs.test-matrix }}
+
   test:
-    # Don't run on forked repos.
-    if: github.repository_owner == 'pytorch'
+    needs: filter
+    # Don't run on forked repos or empty test matrix
+    if: github.repository_owner == 'pytorch' && needs.filter.outputs.is-test-matrix-empty == 'False'
     timeout-minutes: 300
     strategy:
-      matrix: ${{ fromJSON(inputs.test-matrix) }}
+      matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
       fail-fast: false
     runs-on: ${{ matrix.runner }}
     steps:
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 61302e1a0d61b..b5512b20eaae8 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -41,6 +41,10 @@ jobs:
     with:
       build-environment: linux-focal-rocm5.2-py3.8
       docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
+      test-matrix: |
+        { include: [
+          { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
+        ]}
 
   linux-focal-rocm5_2-py3_8-slow-test:
     name: linux-focal-rocm5.2-py3.8-slow
@@ -49,10 +53,7 @@ jobs:
     with:
       build-environment: linux-focal-rocm5.2-py3.8
       docker-image: ${{ needs.linux-focal-rocm5_2-py3_8-slow-build.outputs.docker-image }}
-      test-matrix: |
-        { include: [
-          { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
-        ]}
+      test-matrix: ${{ needs.linux-focal-rocm5_2-py3_8-slow-build.outputs.test-matrix }}
     secrets:
       AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
       AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
@@ -63,6 +64,11 @@ jobs:
     with:
       build-environment: linux-focal-rocm5.2-py3.8
       docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
+      test-matrix: |
+        { include: [
+          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
+          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
+        ]}
 
   linux-focal-rocm5_2-py3_8-distributed-test:
     name: linux-focal-rocm5.2-py3.8-distributed
@@ -71,11 +77,7 @@ jobs:
     with:
       build-environment: linux-focal-rocm5.2-py3.8
       docker-image: ${{ needs.linux-focal-rocm5_2-py3_8-distributed-build.outputs.docker-image }}
-      test-matrix: |
-        { include: [
-          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
-          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
-        ]}
+      test-matrix: ${{ needs.linux-focal-rocm5_2-py3_8-distributed-build.outputs.test-matrix }}
     secrets:
       AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
       AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index c3d530e3e7189..3208cb198bb41 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -308,3 +308,8 @@ jobs:
       build-environment: linux-focal-rocm5.2-py3.8
       docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
       sync-tag: rocm-build
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
+          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
+        ]}
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index cb5d1291833a2..6779a362209c2 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -298,6 +298,11 @@ jobs:
       build-environment: linux-focal-rocm5.2-py3.8
       docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
       sync-tag: rocm-build
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
+          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
+        ]}
 
   linux-focal-rocm5_2-py3_8-test:
     name: linux-focal-rocm5.2-py3.8
@@ -306,11 +311,7 @@ jobs:
     with:
       build-environment: linux-focal-rocm5.2-py3.8
       docker-image: ${{ needs.linux-focal-rocm5_2-py3_8-build.outputs.docker-image }}
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
-        ]}
+      test-matrix: ${{ needs.linux-focal-rocm5_2-py3_8-build.outputs.test-matrix }}
     secrets:
       AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
       AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}

From 7a7df0a99f39082b7c2e49cf28989142e2232c02 Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Tue, 15 Nov 2022 13:27:57 -0800
Subject: [PATCH 0975/1922] [Quant][bc-breaking] Remove
 overwrite_output_observer (#88620)

Summary: When the BackendConfig was first introduced,
`overwrite_output_observer` and `overwrite_output_fake_quantize`
were added to ensure fixed qparams ops like `torch.nn.Sigmoid`
and `torch.nn.Tanh` used the correct observers and fake quantizes.
However, this is hacky because the BackendConfig should not set
the observer constructors themselves, but should instead specify
only requirements on the observers.

Later, https://github.com/pytorch/pytorch/pull/80184 added the
correct observers to `get_default_qconfig_mapping` along with
validation logic that throws an error if incorrect observers
were specified. With this change, we no longer need to overwrite
the observers from the BackendConfig, since we expect the user to
pass in the correct observers for these ops.

This commit removes these overwrite observer settings in the
BackendConfig. Instead, we represent the observer constraints for
fixed qparams ops through the existing DTypeWithConstraints
mechanism. Note that, however, to be consistent with other
DTypeWithConstraints checks, we no longer throw an error if an
incorrect observer is specified, but simply ignore the offending
QConfig and log a warning instead. This is the BC-breaking part
of the change.

BC-breaking notes:

```
from torch.ao.quantization.qconfig import default_qconfig
from torch.ao.quantization.quantize_fx import prepare_fx

model = ModelWithFixedQParamsOps()
qconfig_mapping = QConfigMapping().set_global(default_qconfig)
example_inputs = ...
prepare_fx(model, qconfig_mapping, example_inputs)
```

Before this commit, running the above leads to an exception
because the wrong observers are used for fixed qparams ops.
After this commit, the above will only encounter a warning,
and the fixed qparams ops will not be quantized. In both cases,
switching to `get_default_qconfig_mapping` will cause the
fixed qparams ops to be quantized.

Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps

Reviewers: jerryzh168, vkuzo

Subscribers: jerryzh168, vkuzo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88620
Approved by: https://github.com/jerryzh168
---
 test/quantization/core/test_backend_config.py | 25 +-----
 test/quantization/fx/test_quantize_fx.py      | 16 ++--
 .../ao/quantization/backend_config/README.md  |  4 +
 .../_common_operator_config_utils.py          | 82 ++++++++++++++++---
 .../backend_config/backend_config.py          | 26 ++----
 torch/ao/quantization/backend_config/utils.py |  8 --
 .../quantization/fx/backend_config_utils.py   | 33 +-------
 torch/ao/quantization/fx/prepare.py           | 61 +-------------
 .../quantization/fx/quantization_patterns.py  | 20 +----
 torch/ao/quantization/fx/utils.py             | 44 +++++++++-
 torch/ao/quantization/qconfig_mapping.py      |  1 +
 11 files changed, 137 insertions(+), 183 deletions(-)

diff --git a/test/quantization/core/test_backend_config.py b/test/quantization/core/test_backend_config.py
index aa9de64824bce..e641e58bb2aac 100644
--- a/test/quantization/core/test_backend_config.py
+++ b/test/quantization/core/test_backend_config.py
@@ -13,10 +13,8 @@
     DTypeWithConstraints,
     ObservationType,
 )
-from torch.ao.quantization.fake_quantize import FixedQParamsFakeQuantize
 from torch.ao.quantization.fuser_method_mappings import _reverse_sequential_wrapper2
 from torch.ao.quantization.fx.quantization_patterns import _default_root_node_getter
-from torch.ao.quantization.observer import default_fixed_qparams_range_0to1_observer
 
 
 class TestBackendConfig(QuantizationTestCase):
@@ -118,7 +116,6 @@ def test_dtype_config_to_dict(self):
         "input": 1,
         "weight": 2,
     }
-    _fake_quantize = FixedQParamsFakeQuantize.with_args(observer=default_fixed_qparams_range_0to1_observer)
 
     def _extra_inputs_getter(self, p):
         return (torch.rand(3, 3),)
@@ -141,9 +138,7 @@ def _get_backend_op_config2(self):
             ._set_extra_inputs_getter(self._extra_inputs_getter) \
             ._set_num_tensor_args_to_observation_type(self._num_tensor_args_to_observation_type) \
             ._set_input_type_to_index(self._input_type_to_index) \
-            ._set_input_output_observed(False) \
-            ._set_overwrite_output_fake_quantize(self._fake_quantize) \
-            ._set_overwrite_output_observer(default_fixed_qparams_range_0to1_observer)
+            ._set_input_output_observed(False)
 
     def _get_backend_pattern_config_dict1(self):
         return {
@@ -167,8 +162,6 @@ def _get_backend_pattern_config_dict2(self):
             "num_tensor_args_to_observation_type": self._num_tensor_args_to_observation_type,
             "input_type_to_index": self._input_type_to_index,
             "input_output_observed": False,
-            "overwrite_output_fake_quantize": self._fake_quantize,
-            "overwrite_output_observer": default_fixed_qparams_range_0to1_observer
         }
 
     def test_backend_op_config_set_observation_type(self):
@@ -246,18 +239,6 @@ def test_backend_op_config_set_input_output_observed(self):
         conf._set_input_output_observed(False)
         self.assertEqual(conf._input_output_observed, False)
 
-    def test_backend_op_config_set_overwrite_output_fake_quantize(self):
-        conf = BackendPatternConfig(torch.sigmoid)
-        self.assertTrue(conf._overwrite_output_fake_quantize is None)
-        conf._set_overwrite_output_fake_quantize(self._fake_quantize)
-        self.assertEqual(conf._overwrite_output_fake_quantize, self._fake_quantize)
-
-    def test_backend_op_config_set_overwrite_output_observer(self):
-        conf = BackendPatternConfig(torch.sigmoid)
-        self.assertTrue(conf._overwrite_output_observer is None)
-        conf._set_overwrite_output_observer(default_fixed_qparams_range_0to1_observer)
-        self.assertEqual(conf._overwrite_output_observer, default_fixed_qparams_range_0to1_observer)
-
     def test_backend_op_config_from_dict(self):
         conf_dict1 = self._get_backend_pattern_config_dict1()
         conf1 = BackendPatternConfig.from_dict(conf_dict1)
@@ -273,8 +254,6 @@ def test_backend_op_config_from_dict(self):
         self.assertEqual(len(conf1._num_tensor_args_to_observation_type), 0)
         self.assertEqual(len(conf1._input_type_to_index), 0)
         self.assertTrue(conf1._input_output_observed is None)
-        self.assertTrue(conf1._overwrite_output_fake_quantize is None)
-        self.assertTrue(conf1._overwrite_output_observer is None)
         # Test temporary/internal keys
         conf_dict2 = self._get_backend_pattern_config_dict2()
         conf2 = BackendPatternConfig.from_dict(conf_dict2)
@@ -290,8 +269,6 @@ def test_backend_op_config_from_dict(self):
         self.assertEqual(conf2._num_tensor_args_to_observation_type, self._num_tensor_args_to_observation_type)
         self.assertEqual(conf2._input_type_to_index, self._input_type_to_index)
         self.assertEqual(conf2._input_output_observed, False)
-        self.assertEqual(conf2._overwrite_output_fake_quantize, self._fake_quantize)
-        self.assertEqual(conf2._overwrite_output_observer, default_fixed_qparams_range_0to1_observer)
 
     def test_backend_op_config_to_dict(self):
         conf1 = self._get_backend_op_config1()
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 6721e397180e2..6c631a24abc60 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -6792,9 +6792,8 @@ def forward(self, x):
             M(), data, quant_type, custom_qconfig_dict=qconfig_mapping,
             expected_node_occurrence=node_occurrence, is_reference=True)
 
-    def test_fixed_qparams_ops_qconfig_error(self):
-        """ Test that a proper error message is shown when user don't specify the correct
-        qconfig for fixed qaprams ops
+    def test_fixed_qparams_ops_wrong_qconfig(self):
+        """ Test that wrong qconfigs for fixed qparams ops results in the ops not being quantized.
         """
         class M(torch.nn.Module):
             def __init__(self):
@@ -6814,8 +6813,15 @@ def forward(self, x):
         data = (torch.randn((2, 2, 2, 2), dtype=torch.float),)
         qconfig_mapping = QConfigMapping().set_global(default_qconfig)
         m = M().eval()
-        with self.assertRaisesRegex(ValueError, "get_default_qconfig_mapping"):
-            m = prepare_fx(m, qconfig_mapping, data)
+        node_occurrence = {
+            ns.call_function(torch.quantize_per_tensor): 0,
+            ns.call_method("dequantize"): 0,
+        }
+        self.checkGraphModeFxOp(
+            m, data, QuantType.STATIC, custom_qconfig_dict=qconfig_mapping,
+            expected_node_occurrence=node_occurrence, is_reference=True)
+        self.assertTrue(isinstance(m.sigmoid, torch.nn.Sigmoid))
+        self.assertTrue(isinstance(m.tanh, torch.nn.Tanh))
 
     @skipIfNoFBGEMM
     def test_general_shape_ops(self):
diff --git a/torch/ao/quantization/backend_config/README.md b/torch/ao/quantization/backend_config/README.md
index b8d8ceb3e38de..985765e6badc4 100644
--- a/torch/ao/quantization/backend_config/README.md
+++ b/torch/ao/quantization/backend_config/README.md
@@ -152,3 +152,7 @@ The user's QConfig may specify `quant_min` and `quant_max`, which are min and ma
 #### Scale range
 
 Similarly, the user's QConfig may specify a minimum value for the quantization scale (currently exposed as `eps` but will change in the future to better reflect the semantics). Here we set the lower bound for the `scale_min` to represent the limits of the backend. If a QConfig's min scale value falls below this limit, the QConfig will be treated as violating this constraint. Note that `scale_max_upper_bound` is currently not used, because there is no corresponding mechanism to enforce this on the observer yet.
+
+#### Fixed quantization parameters
+
+For ops with fixed quantization parameters such as `torch.nn.Sigmoid` or `torch.nn.Tanh`, the BackendConfig can specify the specific scale and zero point values as constraints on the input and output activations. The user's QConfigs for these ops must use `FixedQParamsObserver` or `FixedQParamsFakeQuantize` for their activations with matching scale and zero point values, otherwise these QConfigs will be ignored.
diff --git a/torch/ao/quantization/backend_config/_common_operator_config_utils.py b/torch/ao/quantization/backend_config/_common_operator_config_utils.py
index c2f0f7227b10b..47a0b30242086 100644
--- a/torch/ao/quantization/backend_config/_common_operator_config_utils.py
+++ b/torch/ao/quantization/backend_config/_common_operator_config_utils.py
@@ -1,3 +1,4 @@
+import copy
 import operator
 import torch
 import torch.nn.functional as F
@@ -7,13 +8,13 @@
 import torch.nn.qat as nnqat
 import torch.nn.quantized._reference as nnqr
 from collections import namedtuple
-from typing import List
+from typing import Callable, Dict, List, Union
 from .backend_config import (
     BackendPatternConfig,
     DTypeConfig,
+    DTypeWithConstraints,
     ObservationType,
 )
-from ..fake_quantize import FixedQParamsFakeQuantize
 from ..fuser_method_mappings import (
     _reverse_sequential_wrapper2,
     _reverse2,
@@ -23,7 +24,6 @@
     fuse_linear_bn,
     fuse_convtranspose_bn,
 )
-from ..qconfig_mapping import _FIXED_QPARAMS_OP_TO_OBSERVER
 
 # TODO: rename to be more explict, e.g. qat_conv_relu
 _ConvMetadata = namedtuple(
@@ -48,6 +48,38 @@
     nnqat.Conv3d, nniqat.ConvReLU3d, nniqat.ConvBn3d, nniqat.ConvBnReLU3d,
     F.conv3d)
 
+# Add constraints for fixed qparams ops like sigmoid and tanh to ensure values
+# fall within the proper ranges, e.g. [0, 1] for sigmoid, [-1, 1] for tanh
+_FIXED_QPARAM_OP_0TO1_CONSTRAINTS = DTypeWithConstraints(
+    dtype=torch.quint8,
+    quant_min_lower_bound=0,
+    quant_max_upper_bound=255,
+    scale_exact_match=1.0 / 256.0,
+    zero_point_exact_match=0,
+)
+_FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS = DTypeWithConstraints(
+    dtype=torch.quint8,
+    quant_min_lower_bound=0,
+    quant_max_upper_bound=255,
+    scale_exact_match=2.0 / 256.0,
+    zero_point_exact_match=128,
+)
+_FIXED_QPARAMS_OP_TO_CONSTRAINTS: Dict[Union[Callable, str], DTypeWithConstraints] = {
+    torch.nn.Hardsigmoid: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.nn.functional.hardsigmoid: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    "hardsigmoid": _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    "hardsigmoid_": _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.nn.Sigmoid: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.sigmoid: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    "sigmoid": _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    "sigmoid_": _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.nn.Softmax: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.nn.Tanh: _FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS,
+    torch.tanh: _FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS,
+    "tanh": _FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS,
+    "tanh_": _FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS,
+}
+
 def _get_binary_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
     binary_op_configs: List[BackendPatternConfig] = []
     num_tensor_args_to_observation_type_mapping = {
@@ -393,21 +425,45 @@ def _get_default_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPat
     )
     return configs
 
+def _add_fixed_qparams_to_dtype_configs(
+    dtype_configs: List[DTypeConfig],
+    constraints: DTypeWithConstraints,
+) -> List[DTypeConfig]:
+    """
+    Return a copy of the list of DTypeConfigs where activations are subject to the specified
+    constraints required for fixed qparams ops.
+
+    If the data type doesn't match the one in the constraints, simply leave the corresponding
+    DTypeConfig unchanged.
+
+    If `scale_min_lower_bound` or `scale_max_upper_bound` is specified in the activations,
+    throw an exception since these settings are incompatible with fixed qparams ops.
+    """
+    new_dtype_configs = []
+    for dtype_config in dtype_configs:
+        dc = copy.deepcopy(dtype_config)
+        for orig_constraints in [dc.input_dtype_with_constraints, dc.output_dtype_with_constraints]:
+            if orig_constraints.dtype != constraints.dtype:
+                continue
+            if orig_constraints.scale_min_lower_bound is not None:
+                raise ValueError("scale_min_lower_bound is invalid for fixed qparams ops: %s" % dtype_config)
+            if orig_constraints.scale_max_upper_bound is not None:
+                raise ValueError("scale_max_upper_bound is invalid for fixed qparams ops: %s" % dtype_config)
+            orig_constraints.quant_min_lower_bound = constraints.quant_min_lower_bound
+            orig_constraints.quant_max_upper_bound = constraints.quant_max_upper_bound
+            orig_constraints.scale_exact_match = constraints.scale_exact_match
+            orig_constraints.zero_point_exact_match = constraints.zero_point_exact_match
+        new_dtype_configs.append(dc)
+    return new_dtype_configs
+
 def _get_fixed_qparams_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
     fixed_qparams_op_configs = []
-    for fixed_qparam_op, output_observer in _FIXED_QPARAMS_OP_TO_OBSERVER.items():
+    for fixed_qparam_op, constraints in _FIXED_QPARAMS_OP_TO_CONSTRAINTS.items():
+        new_dtype_configs = _add_fixed_qparams_to_dtype_configs(dtype_configs, constraints)
         fixed_qparams_op_configs.append(
-            # TODO: The _overwrite_output keys are temporary, since we don't want to put observer
-            # in the configs we expect that it's provided by user
-            # What we want to put here is the requirement on observers, in this case dtype,
-            # quant_min, quant_max etc., but we need to first move all configs to
-            # backend_config_dict to do that, we'll remove these keys after we fully migrated
-            # everything to use backend_config_dict
             BackendPatternConfig(fixed_qparam_op)
                 .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
-                .set_dtype_configs(dtype_configs)
-                ._set_overwrite_output_fake_quantize(FixedQParamsFakeQuantize.with_args(observer=output_observer))
-                ._set_overwrite_output_observer(output_observer))
+                .set_dtype_configs(new_dtype_configs))
     return fixed_qparams_op_configs
 
 def _get_share_qparams_op_configs(dtype_configs):
diff --git a/torch/ao/quantization/backend_config/backend_config.py b/torch/ao/quantization/backend_config/backend_config.py
index 1305c32a4ea8f..e8af42ff4b6a9 100644
--- a/torch/ao/quantization/backend_config/backend_config.py
+++ b/torch/ao/quantization/backend_config/backend_config.py
@@ -3,7 +3,6 @@
 from typing import Any, Callable, Dict, List, Optional, Type, Union
 
 import torch
-from torch.ao.quantization.observer import _PartialWrapper
 from torch.ao.quantization.utils import Pattern
 from enum import Enum
 
@@ -42,8 +41,6 @@
 NUM_TENSOR_ARGS_TO_OBSERVATION_TYPE_DICT_KEY = "num_tensor_args_to_observation_type"
 INPUT_TYPE_TO_INDEX_DICT_KEY = "input_type_to_index"
 INPUT_OUTPUT_OBSERVED_DICT_KEY = "input_output_observed"
-OVERWRITE_OUTPUT_FAKE_QUANTIZE_DICT_KEY = "overwrite_output_fake_quantize"
-OVERWRITE_OUTPUT_OBSERVER_DICT_KEY = "overwrite_output_observer"
 
 
 # TODO: maybe rename this to something that's not related to observer
@@ -69,14 +66,17 @@ class ObservationType(Enum):
 @dataclass
 class DTypeWithConstraints:
     """
-    Config for specifying additional constraints for a given dtype, such as quantization value
-    ranges and scale value ranges, to be used in :class:`~torch.ao.quantization.backend_config.DTypeConfig`.
+    Config for specifying additional constraints for a given dtype, such as quantization
+    value ranges, scale value ranges, and fixed quantization params, to be used in
+    :class:`~torch.ao.quantization.backend_config.DTypeConfig`.
     """
     dtype: Optional[torch.dtype] = None
     quant_min_lower_bound: Union[int, float, None] = None
     quant_max_upper_bound: Union[int, float, None] = None
     scale_min_lower_bound: Union[int, float, None] = None
     scale_max_upper_bound: Union[int, float, None] = None
+    scale_exact_match: Optional[float] = None
+    zero_point_exact_match: Optional[int] = None
 
 
 @dataclass
@@ -336,8 +336,6 @@ def __init__(self, pattern: Pattern):
         self._num_tensor_args_to_observation_type: Dict[int, ObservationType] = {}
         self._input_type_to_index: Dict[str, int] = {}
         self._input_output_observed: Optional[bool] = None
-        self._overwrite_output_fake_quantize: Optional[_PartialWrapper] = None
-        self._overwrite_output_observer: Optional[_PartialWrapper] = None
 
     def set_observation_type(self, observation_type: ObservationType) -> BackendPatternConfig:
         """
@@ -433,14 +431,6 @@ def _set_input_output_observed(self, input_output_observed: bool) -> BackendPatt
         self._input_output_observed = input_output_observed
         return self
 
-    def _set_overwrite_output_fake_quantize(self, overwrite_output_fake_quantize: _PartialWrapper) -> BackendPatternConfig:
-        self._overwrite_output_fake_quantize = overwrite_output_fake_quantize
-        return self
-
-    def _set_overwrite_output_observer(self, overwrite_output_observer: _PartialWrapper) -> BackendPatternConfig:
-        self._overwrite_output_observer = overwrite_output_observer
-        return self
-
     @classmethod
     def from_dict(cls, backend_pattern_config_dict: Dict[str, Any]) -> BackendPatternConfig:
         """
@@ -487,8 +477,6 @@ def _get_dtype_config(obj: Any) -> DTypeConfig:
             backend_pattern_config_dict.get(NUM_TENSOR_ARGS_TO_OBSERVATION_TYPE_DICT_KEY, {}))
         conf._set_input_type_to_index(backend_pattern_config_dict.get(INPUT_TYPE_TO_INDEX_DICT_KEY, {}))
         conf._set_input_output_observed(backend_pattern_config_dict.get(INPUT_OUTPUT_OBSERVED_DICT_KEY, None))
-        conf._set_overwrite_output_fake_quantize(backend_pattern_config_dict.get(OVERWRITE_OUTPUT_FAKE_QUANTIZE_DICT_KEY, None))
-        conf._set_overwrite_output_observer(backend_pattern_config_dict.get(OVERWRITE_OUTPUT_OBSERVER_DICT_KEY, None))
         return conf
 
     def to_dict(self) -> Dict[str, Any]:
@@ -521,8 +509,4 @@ def to_dict(self) -> Dict[str, Any]:
             backend_pattern_config_dict[INPUT_TYPE_TO_INDEX_DICT_KEY] = self._input_type_to_index
         if self._input_output_observed is not None:
             backend_pattern_config_dict[INPUT_OUTPUT_OBSERVED_DICT_KEY] = self._input_output_observed
-        if self._overwrite_output_fake_quantize is not None:
-            backend_pattern_config_dict[OVERWRITE_OUTPUT_FAKE_QUANTIZE_DICT_KEY] = self._overwrite_output_fake_quantize
-        if self._overwrite_output_observer is not None:
-            backend_pattern_config_dict[OVERWRITE_OUTPUT_OBSERVER_DICT_KEY] = self._overwrite_output_observer
         return backend_pattern_config_dict
diff --git a/torch/ao/quantization/backend_config/utils.py b/torch/ao/quantization/backend_config/utils.py
index cdc58327fbee1..fc7e9aca9ff69 100644
--- a/torch/ao/quantization/backend_config/utils.py
+++ b/torch/ao/quantization/backend_config/utils.py
@@ -5,7 +5,6 @@
 import torch.nn.functional as F
 from .backend_config import BackendConfig, DTypeConfig
 from ..utils import Pattern
-from ..observer import _PartialWrapper
 
 __all__ = [
     "get_pattern_to_dtype_configs",
@@ -86,13 +85,6 @@ def get_root_node(node_pattern):
             root_node_getter_mapping[pattern] = config._root_node_getter
     return root_node_getter_mapping
 
-def get_fixed_qparams_op_to_overwrite_output_observer(backend_config: BackendConfig) -> Dict[Union[Callable, str], _PartialWrapper]:
-    fixed_qparam_op_to_overwrite_output_observer: Dict[Union[Callable, str], _PartialWrapper] = {}
-    for pattern, config in backend_config.configs.items():
-        if config._overwrite_output_observer is not None:
-            fixed_qparam_op_to_overwrite_output_observer[pattern] = config._overwrite_output_observer  # type: ignore[index]
-    return fixed_qparam_op_to_overwrite_output_observer
-
 def get_fusion_pattern_to_extra_inputs_getter(backend_config: BackendConfig) -> Dict[Pattern, Callable]:
     """ Get a map from fusion pattern to a function that returns extra input nodes
     from the fusion pattern, in the order required by the root node. This is optional,
diff --git a/torch/ao/quantization/fx/backend_config_utils.py b/torch/ao/quantization/fx/backend_config_utils.py
index eef4979a0a064..50c6b6a27ede0 100644
--- a/torch/ao/quantization/fx/backend_config_utils.py
+++ b/torch/ao/quantization/fx/backend_config_utils.py
@@ -5,7 +5,6 @@
     ObservationType,
 )
 from torch.ao.quantization.utils import (
-    activation_dtype,
     get_combined_dict,
     Pattern,
     NodePattern,
@@ -16,14 +15,12 @@
 from .quantization_patterns import QuantizeHandler
 from .fusion_patterns import DefaultFuseHandler
 
-from typing import Dict, Any, Callable, Optional
+from typing import Callable, Dict
 
 def get_quantize_handler_cls(
         observation_type,
         dtype_configs,
         num_tensor_args_to_observation_type,
-        overwrite_output_fake_quantizer,
-        overwrite_output_observer,
         input_output_observed):
 
     class ConfigurableQuantizeHandler(QuantizeHandler):
@@ -41,35 +38,11 @@ def __init__(
             else:
                 self.observation_type = observation_type
             self.dtype_configs = dtype_configs
-            self.overwrite_output_fake_quantizer = overwrite_output_fake_quantizer
-            self.overwrite_output_observer = overwrite_output_observer
             self.input_output_observed_ = input_output_observed
 
         def is_general_tensor_value_op(self) -> bool:
             return self.observation_type == ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
 
-        # TODO: change this to output activation
-        def get_activation_ctr(
-                self,
-                qconfig: Any,
-                pattern: Pattern,
-                is_training: bool,
-        ) -> Optional[Callable]:
-            """
-            Returns the constructor for the activation observer which should be
-            used for the pattern matched to this handler. Some handlers override
-            this to a different value than what is specified in the qconfig.
-            """
-            act_dtype = activation_dtype(qconfig)
-            # TODO: change to is_qat
-            if is_training:
-                if act_dtype == torch.quint8 and self.overwrite_output_fake_quantizer is not None:
-                    return self.overwrite_output_fake_quantizer
-            else:
-                if act_dtype == torch.quint8 and self.overwrite_output_observer is not None:
-                    return self.overwrite_output_observer
-            return qconfig.activation
-
         # This is temporary, and will be removed soon
         def input_output_observed(self):
             return self.input_output_observed_
@@ -89,8 +62,6 @@ def get_pattern_to_quantize_handlers(backend_config: BackendConfig) -> Dict[Patt
         observation_type = config.observation_type
         dtype_configs = config.dtype_configs
         num_tensor_args_to_observation_type = config._num_tensor_args_to_observation_type
-        overwrite_fake_quantizer = config._overwrite_output_fake_quantize
-        overwrite_observer = config._overwrite_output_observer
         input_output_observed = config._input_output_observed
         if input_output_observed is None:
             input_output_observed = True
@@ -99,8 +70,6 @@ def get_pattern_to_quantize_handlers(backend_config: BackendConfig) -> Dict[Patt
                 observation_type,
                 dtype_configs,
                 num_tensor_args_to_observation_type,
-                overwrite_fake_quantizer,
-                overwrite_observer,
                 input_output_observed)
 
     return pattern_to_quantize_handlers
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 281bd960ed7b9..c908e3f3b7644 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -18,9 +18,6 @@
     ObserverBase,
 )
 from ..qconfig import (
-    _obs_or_fq_ctr_equals,
-    float16_dynamic_qconfig,
-    float16_static_qconfig,
     _is_reuse_input_qconfig,
     QConfigAny,
 )
@@ -45,8 +42,6 @@
     NodePattern,
 )
 
-from torch.ao.quantization import FixedQParamsFakeQuantize
-
 from ._equalize import (
     is_equalization_observer,
     node_supports_equalization,
@@ -91,14 +86,12 @@
     get_qconfig_dtypes,
     get_swapped_custom_module_class,
     activation_is_statically_quantized,
-    activation_is_int8_quantized,
 )
 
 from ..backend_config.utils import (
     get_pattern_to_dtype_configs,
     get_module_to_qat_module,
     get_fusion_pattern_to_root_node_getter,
-    get_fixed_qparams_op_to_overwrite_output_observer,
 )
 from ..backend_config import (
     BackendConfig,
@@ -826,13 +819,7 @@ def maybe_insert_output_observer_for_node(
         (not is_standalone_module)
 
     if should_insert_observer:
-        act_post_process_ctr = qconfig.activation
-        if activation_is_int8_quantized(qconfig):
-            act_post_process_ctr = qhandler.get_activation_ctr(
-                qconfig,
-                matched_pattern,
-                is_qat)
-        observer = act_post_process_ctr()
+        observer = qconfig.activation()
         return insert_observer(node, observer, model, modules, graph)
     else:
         return None
@@ -1392,51 +1379,6 @@ def insert_observers_for_model(
 
     return results_node
 
-def _validate_fixed_qparams_qconfigs(
-        model: GraphModule,
-        node_name_to_qconfig: Dict[str, QConfigAny],
-        backend_config: BackendConfig):
-    """
-    Validate whether the correct observers are configured for fixed qparams ops in the model, if any.
-    """
-    # TODO: handle fp16 qconfigs properly
-    allowed_observer_ctrs = [
-        float16_dynamic_qconfig.activation,
-        float16_static_qconfig.activation,
-    ]
-    named_modules = dict(model.named_modules(remove_duplicate=False))
-    fixed_qparams_op_to_overwrite_output_observer = \
-        get_fixed_qparams_op_to_overwrite_output_observer(backend_config)
-    for node in model.graph.nodes:
-        if node.op == "call_function":
-            module_type_or_function_or_method = node.target
-        elif node.op == "call_module":
-            module_type_or_function_or_method = type(named_modules[node.target])
-        else:
-            module_type_or_function_or_method = None
-
-        if module_type_or_function_or_method in fixed_qparams_op_to_overwrite_output_observer:
-            bad_observer = True
-            qconfig = node_name_to_qconfig.get(node.name, None)
-            if qconfig is None:
-                bad_observer = False
-            else:
-                for observer_ctr in allowed_observer_ctrs + [
-                        fixed_qparams_op_to_overwrite_output_observer[module_type_or_function_or_method]]:
-                    if _obs_or_fq_ctr_equals(
-                            qconfig.activation,
-                            FixedQParamsFakeQuantize.with_args(observer=observer_ctr)) or \
-                            _obs_or_fq_ctr_equals(qconfig.activation, observer_ctr):
-                        bad_observer = False
-            if bad_observer:
-                raise ValueError("QConfigMapping must specify fixed qparams observer for fixed qparams op "
-                                 "'%s' type: '%s'. Please use torch.ao.quantization.get_default_qconfig_mapping or "
-                                 "torch.ao.quantization.get_default_qat_qconfig_mapping"
-                                 " instead. Example: \n"
-                                 "    qconfig_mapping = get_default_qconfig_mapping(\"fbgemm\") \n"
-                                 "    model = prepare_fx(model, qconfig_mapping, example_inputs)"
-                                 "" % (node.format_node(), module_type_or_function_or_method))
-
 def run_prepare_fx_on_standalone_modules(
     model: torch.nn.Module,
     is_qat: bool,
@@ -1609,7 +1551,6 @@ def prepare(
     equalization_node_name_to_qconfig = generate_node_name_to_qconfig(
         model, modules, model.graph, _equalization_config, node_name_to_scope)
     node_name_to_qconfig = generate_node_name_to_qconfig(model, modules, model.graph, qconfig_mapping, node_name_to_scope)
-    _validate_fixed_qparams_qconfigs(model, node_name_to_qconfig, backend_config)
 
     # match the patterns that will get quantized
     standalone_module_names = list(prepare_custom_config.standalone_module_names.keys())
diff --git a/torch/ao/quantization/fx/quantization_patterns.py b/torch/ao/quantization/fx/quantization_patterns.py
index c24adb9e11e90..f8d72de9c96ae 100644
--- a/torch/ao/quantization/fx/quantization_patterns.py
+++ b/torch/ao/quantization/fx/quantization_patterns.py
@@ -6,13 +6,10 @@
 from .utils import (
     all_node_args_have_no_tensors,
 )
-from torch.ao.quantization.utils import (
-    Pattern,
-    NodePattern,
-)
+from torch.ao.quantization.utils import NodePattern
 
 from abc import ABC
-from typing import Any, Callable, Dict, Optional
+from typing import Callable, Dict
 
 __all__ = [
     "QuantizeHandler",
@@ -98,19 +95,6 @@ def is_general_tensor_value_op(self) -> bool:
         """
         return False
 
-    def get_activation_ctr(
-        self,
-        qconfig: Any,
-        pattern: Pattern,
-        is_training: bool,
-    ) -> Optional[Callable]:
-        """
-        Returns the constructor for the activation observer which should be
-        used for the pattern matched to this handler. Some handlers override
-        this to a different value than what is specified in the qconfig.
-        """
-        return qconfig.activation
-
     def is_custom_module(self):
         return self.is_custom_module_
 
diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index a5a989ec21480..73fdb0700144d 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -10,8 +10,19 @@
     BackendConfig,
     DTypeWithConstraints,
 )
-from torch.ao.quantization.fake_quantize import FakeQuantizeBase
-from torch.ao.quantization.observer import ObserverBase
+from torch.ao.quantization.fake_quantize import (
+    FakeQuantizeBase,
+    FixedQParamsFakeQuantize,
+)
+from torch.ao.quantization.observer import (
+    FixedQParamsObserver,
+    ObserverBase,
+)
+from torch.ao.quantization.qconfig import (
+    float16_static_qconfig,
+    float16_dynamic_qconfig,
+    qconfig_equals,
+)
 from torch.ao.quantization.stubs import DeQuantStub
 from torch.ao.quantization.utils import (
     activation_is_statically_quantized,
@@ -951,10 +962,13 @@ def _qconfig_satisfies_dtype_config_constraints(
 
         1. QConfig specified a quantization range that falls within the backend's, if any
         2. QConfig specified a min scale value that is >= the backend's, if any
+        3. QConfig specified a FixedQParamsObserver or FixedQParamsFakeQuantize that has
+           scale and zero point that match the backend's, if any
 
     If `is_activation` is True, we check `qconfig.activation`, else we check `qconfig.weight`.
     If `qconfig` or `dtype_with_constraints.dtype` is None, or the dtypes do not match, return True.
     """
+    # TODO: log warnings only when the user enabled a debug flag
     def _activation_post_process_satisfies_dtype_config_constraints(
             activation_post_process: Union[ObserverBase, FakeQuantizeBase],
             dtype_with_constraints: DTypeWithConstraints,
@@ -968,6 +982,8 @@ def _activation_post_process_satisfies_dtype_config_constraints(
         backend_quant_min = dtype_with_constraints.quant_min_lower_bound
         backend_quant_max = dtype_with_constraints.quant_max_upper_bound
         backend_scale_min = dtype_with_constraints.scale_min_lower_bound
+        backend_scale_exact_match = dtype_with_constraints.scale_exact_match
+        backend_zero_point_exact_match = dtype_with_constraints.zero_point_exact_match
         # check quantization ranges
         if backend_quant_min is not None and backend_quant_max is not None:
             if app_quant_min is None or app_quant_max is None:
@@ -990,6 +1006,30 @@ def _activation_post_process_satisfies_dtype_config_constraints(
                               "the backend's min scale value (%s), ignoring %s") %
                               (debug_string, app_scale_min, backend_scale_min, qconfig))
                 return False
+        # check fixed scale and zero point
+        if backend_scale_exact_match is not None and backend_zero_point_exact_match is not None:
+            # For tests only, accept the following qconfigs for now
+            # TODO: handle fp16 qconfigs properly
+            for accepted_qconfig in [float16_static_qconfig, float16_dynamic_qconfig]:
+                if qconfig_equals(qconfig, accepted_qconfig):
+                    return True
+            suggestion_str = (
+                "Please use torch.ao.quantization.get_default_qconfig_mapping or "
+                "torch.ao.quantization.get_default_qat_qconfig_mapping. Example:\n"
+                "    qconfig_mapping = get_default_qconfig_mapping(\"fbgemm\")\n"
+                "    model = prepare_fx(model, qconfig_mapping, example_inputs)"
+            )
+            if not isinstance(activation_post_process, FixedQParamsObserver) and \
+                    not isinstance(activation_post_process, FixedQParamsFakeQuantize):
+                warnings.warn(("QConfig must specify a FixedQParamsObserver or a FixedQParamsFakeQuantize "
+                              "for fixed qparams ops, ignoring %s.\n%s") % (qconfig, suggestion_str))
+                return False
+            if observer.scale != backend_scale_exact_match or observer.zero_point != backend_zero_point_exact_match:
+                warnings.warn(("QConfig fixed scale (%s) and zero point (%s) do not match the backend's "
+                              "(%s and %s), ignoring %s.\n%s") %
+                              (observer.scale, observer.zero_point, backend_scale_exact_match,
+                              backend_zero_point_exact_match, qconfig, suggestion_str))
+                return False
         return True
 
     if qconfig is None or dtype_with_constraints.dtype is None:
diff --git a/torch/ao/quantization/qconfig_mapping.py b/torch/ao/quantization/qconfig_mapping.py
index e3410a52a9d83..65c85d033c5f9 100644
--- a/torch/ao/quantization/qconfig_mapping.py
+++ b/torch/ao/quantization/qconfig_mapping.py
@@ -39,6 +39,7 @@
 _MODULE_NAME_DICT_KEY = "module_name"
 _MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY = "module_name_object_type_order"
 
+# TODO: derive this map from the BackendConfig
 _FIXED_QPARAMS_OP_TO_OBSERVER: Dict[Union[Callable, str], _PartialWrapper] = {
     torch.nn.Hardsigmoid: default_fixed_qparams_range_0to1_observer,
     torch.nn.functional.hardsigmoid: default_fixed_qparams_range_0to1_observer,

From c8f3dd298d766328e2a280a8000c42f66aa876ca Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 16 Nov 2022 19:08:49 +0000
Subject: [PATCH 0976/1922] [Dynamo] Add a dummy profiler to avoid activating
 real profiler (#88930)

See context at https://github.com/pytorch/torchdynamo/issues/1721#issuecomment-1312396059

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88930
Approved by: https://github.com/jansel
---
 test/dynamo/test_misc.py                |  4 ++--
 torch/_dynamo/variables/misc.py         | 22 ++++++++++++++++------
 torch/_dynamo/variables/torch.py        |  4 ++--
 torch/_dynamo/variables/user_defined.py | 10 +++++++---
 4 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 8f79f2476aeea..aef364d769945 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -1388,7 +1388,7 @@ def fn():
         self.assertTrue(result[1] == fn.__code__.co_lnotab)
 
     def test_torch_profiler(self):
-        # wrap torch.profiler.* as ProfilerContextWrapperVariable and do nothing
+        # wrap torch.profiler.* as NullContextVariable and do nothing
         def fn(x):
             y = x**2
             with torch.profiler.profile():
@@ -1408,7 +1408,7 @@ def fn(x):
         self.assertEqual(cnts.frame_count, 2)
 
     def test_autograd_profiler(self):
-        # wrap torch.autograd.profiler.* as ProfilerContextWrapperVariable and do nothing
+        # wrap torch.autograd.profiler.* as NullContextVariable and do nothing
         def fn(x):
             y = x**2
             with torch.autograd.profiler.profile():
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index 5d7336cefeae7..298ddf24862bd 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -116,6 +116,9 @@ def exit(self, tx, *args):
         self._call_func(tx, self.initial_values)
         return variables.ConstantVariable(None, **VariableTracker.propagate(self))
 
+    def module_name(self):
+        return "torch"
+
     def reconstruct(self, codegen, target_inst=None):
         """
         Generate following Python Bytecode, with a `torch._C._set_grad_enable` call
@@ -356,11 +359,15 @@ def exit_functional_autocast(mode):
     mode.__exit__(None, None, None)
 
 
-class ProfilerContextWrapperVariable(ContextWrappingVariable):
+class NullContextVariable(ContextWrappingVariable):
+    """
+    This class represents Python contextlib.nullcontext.
+    It's used as a placeholder for other context managers that Dynamo doesn't
+    support yet, e.g, torch.autograd.profiler.record_function.
+    """
+
     def __init__(self, target_values=None, **kwargs):
-        super(ProfilerContextWrapperVariable, self).__init__(
-            target_values=target_values, **kwargs
-        )
+        super(NullContextVariable, self).__init__(target_values=target_values, **kwargs)
 
     def enter(self, tx):
         return variables.ConstantVariable(None, **VariableTracker.propagate(self))
@@ -368,8 +375,11 @@ def enter(self, tx):
     def exit(self, tx, *args):
         return variables.ConstantVariable(None, **VariableTracker.propagate(self))
 
+    def module_name(self):
+        return "contextlib"
+
     def fn_name(self):
-        return "autograd.profiler.profile"
+        return "nullcontext"
 
 
 class WithExitFunctionVariable(VariableTracker):
@@ -389,7 +399,7 @@ def reconstruct(self, codegen):
         # exit function.  The handler generated by BlockStackEntry
         # will re-enter the context in the resume function.
         output = AttrSource(
-            codegen.tx.import_source("torch"), self.ctx.fn_name()
+            codegen.tx.import_source(self.ctx.module_name()), self.ctx.fn_name()
         ).reconstruct(codegen)
 
         if codegen.tx.output.partial_convert:
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 3b9b552542ac0..56e74503faca0 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -26,7 +26,7 @@
 )
 from .base import VariableTracker
 from .lists import ListVariable, TupleVariable
-from .misc import AutocastModeVariable, ProfilerContextWrapperVariable
+from .misc import AutocastModeVariable, NullContextVariable
 from .nn_module import NNModuleVariable
 from .tensor import TensorWithTFOverrideVariable
 
@@ -300,7 +300,7 @@ def call_function(
             torch.autograd.profiler.record_function,
         ):
             log.warning("Profiler will be ignored")
-            return ProfilerContextWrapperVariable(**options)
+            return NullContextVariable(**options)
         elif self.value is torch.autograd._profiler_enabled:
             unimplemented("torch.autograd._profiler_enabled not supported yet")
         elif self.value is torch.jit.annotate:
diff --git a/torch/_dynamo/variables/user_defined.py b/torch/_dynamo/variables/user_defined.py
index 09d7893bef665..8cc9528ed67c4 100644
--- a/torch/_dynamo/variables/user_defined.py
+++ b/torch/_dynamo/variables/user_defined.py
@@ -1,4 +1,5 @@
 import collections
+import contextlib
 import dataclasses
 import functools
 import importlib
@@ -15,7 +16,7 @@
 from ..source import AttrSource, ODictGetItemSource, RandomValueSource
 from ..utils import is_namedtuple_cls, namedtuple_fields
 from .base import MutableLocal, VariableTracker
-from .misc import ProfilerContextWrapperVariable
+from .misc import NullContextVariable
 
 
 class UserDefinedVariable(VariableTracker):
@@ -77,8 +78,11 @@ def call_function(
 
         options = VariableTracker.propagate(self, args, kwargs.values())
 
-        if self.value is torch.autograd.profiler.profile:
-            return ProfilerContextWrapperVariable()
+        if self.value in (
+            contextlib.nullcontext,
+            torch.autograd.profiler.profile,
+        ):
+            return NullContextVariable(**options)
         elif is_namedtuple_cls(self.value):
             fields = namedtuple_fields(self.value)
             items = list(args)

From 9e2e1aafd93ef684d49fc3cf3f5aaed6a15aa4f8 Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Mon, 14 Nov 2022 13:31:23 -0800
Subject: [PATCH 0977/1922] [ONNX] Document ONNX diagnostics (#88371)

Reference pages:
- Landing page: https://docs-preview.pytorch.org/88371/onnx_diagnostics.html
- Individual rule: https://docs-preview.pytorch.org/88371/generated/onnx_diagnostics_rules/POE0004%3Aoperator-supported-in-newer-opset-version.html

An initial PR to setup the document generation for ONNX diagnostics.
* Add document page for ONNX diagnostics.
* Add document generation for diagnostics rules from `rules.yaml`.
* Add dependency on `myst-parser` for markdown to rst parsing.

More content to be added.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88371
Approved by: https://github.com/abock, https://github.com/justinchuby, https://github.com/malfet, https://github.com/kit1980
---
 docs/Makefile                                 |  7 ++--
 docs/requirements.txt                         |  1 +
 docs/source/conf.py                           |  3 +-
 docs/source/index.rst                         |  1 +
 docs/source/onnx_diagnostics.rst              | 35 ++++++++++++++++++
 .../onnx/build_onnx_diagnostics_rules_md.py   | 37 +++++++++++++++++++
 6 files changed, 80 insertions(+), 4 deletions(-)
 create mode 100644 docs/source/onnx_diagnostics.rst
 create mode 100644 docs/source/scripts/onnx/build_onnx_diagnostics_rules_md.py

diff --git a/docs/Makefile b/docs/Makefile
index 122bda6231e39..c506845fa92bc 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -17,8 +17,9 @@ figures:
 	@$(PYCMD) source/scripts/build_activation_images.py
 	@$(PYCMD) source/scripts/build_quantization_configs.py
 
-onnx_supported_aten_ops:
+onnx:
 	@$(PYCMD) source/scripts/onnx/build_onnx_supported_aten_op_csv_table.py
+	@$(PYCMD) source/scripts/onnx/build_onnx_diagnostics_rules_md.py $(SOURCEDIR)/generated/onnx_diagnostics_rules
 
 docset: html
 	doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url https://pytorch.org/docs/ --force $(BUILDDIR)/html/
@@ -34,11 +35,11 @@ html-stable:
 	# See conf.py for more details.
 	RELEASE=1 make html
 
-.PHONY: help Makefile docset onnx_supported_aten_ops
+.PHONY: help Makefile docset onnx
 
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile figures onnx_supported_aten_ops
+%: Makefile figures onnx
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
 clean:
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 14c93adc22e90..fdbe10778bf98 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -10,3 +10,4 @@ tensorboard==2.10.0
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
 sphinx-panels==0.4.1
+myst-parser==0.18.1
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 807f486ac0d6a..f4d1d8b68eb92 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -58,7 +58,8 @@
     'sphinxcontrib.katex',
     'sphinx.ext.autosectionlabel',
     'sphinx_copybutton',
-    'sphinx_panels'
+    'sphinx_panels',
+    'myst_parser',
 ]
 
 # build the templated autosummary files
diff --git a/docs/source/index.rst b/docs/source/index.rst
index b9d097f551913..00f8e0967b737 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -85,6 +85,7 @@ Features described in this documentation are classified by release status:
    profiler
    nn.init
    onnx
+   onnx_diagnostics
    optim
    complex_numbers
    ddp_comm_hooks
diff --git a/docs/source/onnx_diagnostics.rst b/docs/source/onnx_diagnostics.rst
new file mode 100644
index 0000000000000..ec2edd4cbdbe7
--- /dev/null
+++ b/docs/source/onnx_diagnostics.rst
@@ -0,0 +1,35 @@
+torch.onnx diagnostics
+======================
+
+.. contents:: :local:
+.. automodule:: torch.onnx._internal.diagnostics
+.. currentmodule:: torch.onnx._internal.diagnostics
+
+Overview
+--------
+
+NOTE: This feature is underdevelopment and is subject to change.
+
+The goal is to improve the diagnostics to help users debug and improve their model export to ONNX.
+
+- The diagnostics are emitted in machine parsable `Static Analysis Results Interchange Format (SARIF) <https://docs.oasis-open.org/sarif/sarif/v2.1.0/sarif-v2.1.0.html>`__.
+- A new clearer, structured way to add new and keep track of diagnostic rules.
+- Serve as foundation for more future improvements consuming the diagnostics.
+
+
+Diagnostic Rules
+----------------
+
+.. toctree::
+    :glob:
+
+    generated/onnx_diagnostics_rules/*
+
+API Reference
+-------------
+
+.. autoclass:: torch.onnx._internal.diagnostics.ExportDiagnostic
+    :members:
+
+.. autoclass:: torch.onnx._internal.diagnostics.infra.DiagnosticEngine
+    :members:
diff --git a/docs/source/scripts/onnx/build_onnx_diagnostics_rules_md.py b/docs/source/scripts/onnx/build_onnx_diagnostics_rules_md.py
new file mode 100644
index 0000000000000..3c2895f6fe769
--- /dev/null
+++ b/docs/source/scripts/onnx/build_onnx_diagnostics_rules_md.py
@@ -0,0 +1,37 @@
+import argparse
+import os
+from dataclasses import fields
+
+from torch.onnx._internal import diagnostics
+from torch.onnx._internal.diagnostics import infra
+
+
+def gen_docs(out_dir: str):
+    os.makedirs(out_dir, exist_ok=True)
+    for field in fields(diagnostics.rules):
+        rule = getattr(diagnostics.rules, field.name)
+        if not isinstance(rule, infra.Rule):
+            continue
+        title = f"{rule.id}:{rule.name}"
+        full_description_markdown = rule.full_description_markdown
+        assert (
+            full_description_markdown is not None
+        ), f"Expected {title} to have a full description in markdown"
+        with open(f"{out_dir}/{title}.md", "w") as f:
+            f.write(f"# {title}\n")
+            f.write(full_description_markdown)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Generate ONNX diagnostics rules doc in markdown."
+    )
+    parser.add_argument(
+        "out_dir", metavar="OUT_DIR", help="path to output directory for docs"
+    )
+    args = parser.parse_args()
+    gen_docs(args.out_dir)
+
+
+if __name__ == "__main__":
+    main()

From 5cd6ca9b3556dcd11e5a34c18b35de93a581881a Mon Sep 17 00:00:00 2001
From: soulitzer <soulitzer@gmail.com>
Date: Wed, 16 Nov 2022 11:22:58 -0500
Subject: [PATCH 0978/1922] Prevent module full_backward_hook from erroring in
 double backward (#88357)

Also clarifies documentation to say "execute if and only if gradients wrt outputs are computed" (previously, "execute every time gradients wrt inputs are computed")

See https://docs.google.com/document/d/1tFZKYdsSzRBJ7Di7SWt8X8fSg-E3eiUPwomMF10UyhM/edit for more details regarding the question: 'should module full_backward_hooks be called every time the gradients wrt module inputs are called, or should module full_backward_hooks only be called when the "backward for the module" have been computed?'

Fixes https://github.com/pytorch/pytorch/issues/88312

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88357
Approved by: https://github.com/albanD
---
 test/test_autograd.py      | 19 +++++++++++++++++++
 torch/nn/modules/module.py | 12 ++++++++----
 torch/utils/hooks.py       | 11 ++++-------
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 33cf188af0659..6e26f67f6dc34 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -6638,6 +6638,25 @@ def forward(self, x):
                 gc.collect()
                 self.assertIsNone(ref_())
 
+    def test_full_backward_hook_double_backward(self):
+        x = torch.rand(1, requires_grad=True)
+        y = torch.rand_like(x)
+
+        func = torch.nn.MSELoss()
+        counter = [0]
+
+        def hook(module, grad_input, grad_output):
+            counter[0] += 1
+
+        func.register_full_backward_hook(hook)
+
+        f = func(x, y)
+
+        (gradx_f,) = torch.autograd.grad(f, x, create_graph=True)
+        self.assertEqual(counter[0], 1)
+        _ = torch.autograd.grad(gradx_f, x)
+        # We should not error, and counter should not be incremented
+        self.assertEqual(counter[0], 1)
 
     def test_input_buffer_accum(self):
         leaf = torch.rand(2, 2, requires_grad=True)
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index fea0ca7b8de81..82389074f8a98 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -307,8 +307,10 @@ def register_module_full_backward_hook(
         This adds global state to the `nn.module` module
         and it is only intended for debugging/profiling purposes.
 
-    The hook will be called every time the gradients with respect to module
-    inputs are computed. The hook should have the following signature::
+    The hook will be called every time the gradients with respect to a module
+    are computed, i.e. the hook will execute if and only if the gradients with
+    respect to module outputs are computed. The hook should have the following
+    signature::
 
         hook(module, grad_input, grad_output) -> Tensor or None
 
@@ -1197,8 +1199,10 @@ def register_full_backward_hook(
     ) -> RemovableHandle:
         r"""Registers a backward hook on the module.
 
-        The hook will be called every time the gradients with respect to module
-        inputs are computed. The hook should have the following signature::
+        The hook will be called every time the gradients with respect to a module
+        are computed, i.e. the hook will execute if and only if the gradients with
+        respect to module outputs are computed. The hook should have the following
+        signature::
 
             hook(module, grad_input, grad_output) -> tuple(Tensor) or None
 
diff --git a/torch/utils/hooks.py b/torch/utils/hooks.py
index 327b2143607c7..133d2c0d2ceb5 100644
--- a/torch/utils/hooks.py
+++ b/torch/utils/hooks.py
@@ -99,13 +99,10 @@ def _unpack_none(self, indices, values):
     def _set_user_hook(self, grad_fn):
         def hook(grad_input, _):
             if self.grad_outputs is None:
-                raise RuntimeError("Module backward hook for grad_input is called before "
-                                   "the grad_output one. This happens because the gradient "
-                                   "in your nn.Module flows to the Module's input without "
-                                   "passing through the Module's output. Make sure that the "
-                                   "output depends on the input and that the loss is computed "
-                                   "based on the output.")
-
+                # This happens because the gradient in your nn.Module flows to
+                # the Module's input without " passing through the Module's
+                # output, e.g. when you're doing double backward.
+                return
             res = self._pack_with_none(self.input_tensors_index, grad_input, self.n_inputs)
 
             for hook in self.user_hooks:

From 48cdf875c41f99931064d30d4a745fc5b344b675 Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Wed, 16 Nov 2022 21:06:35 +0000
Subject: [PATCH 0979/1922] [PT-D][Checkpointing] Move distributed
 checkpointing from torch.distributed._shard.checkpoint to
 torch.distributed.checkpoint (#88698)

Context in RFC: https://github.com/pytorch/pytorch/issues/86620

.rst file will be finalized in subsequent PRs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88698
Approved by: https://github.com/wanchaol
---
 docs/source/distributed.checkpoint.rst        |   4 +
 docs/source/index.rst                         |   1 +
 .../checkpoint/test_checkpoint.py             |   6 +-
 .../checkpoint/test_file_system_checkpoint.py |   2 +-
 .../test_file_system_checkpoint_cpu.py        |   2 +-
 .../{_shard => }/checkpoint/test_planner.py   |   6 +-
 .../{_shard => }/checkpoint/test_utils.py     |   4 +-
 .../distributed/_shard/checkpoint/__init__.py |  29 ++---
 torch/distributed/checkpoint/__init__.py      |  21 ++++
 .../{_shard => }/checkpoint/api.py            |  10 +-
 .../checkpoint/default_planner.py             |  82 ++++++++++----
 .../{_shard => }/checkpoint/filesystem.py     |  82 +++++++++-----
 .../{_shard => }/checkpoint/metadata.py       |  23 +++-
 .../{_shard => }/checkpoint/planner.py        |  41 ++++++-
 .../checkpoint/planner_helpers.py             |  94 +++++++++-------
 .../{_shard => }/checkpoint/resharding.py     |   7 +-
 .../checkpoint/state_dict_loader.py           |   6 +-
 .../checkpoint/state_dict_saver.py            |  13 +--
 .../{_shard => }/checkpoint/storage.py        |  14 ++-
 .../{_shard => }/checkpoint/utils.py          | 101 +++++++++++++-----
 20 files changed, 389 insertions(+), 159 deletions(-)
 create mode 100644 docs/source/distributed.checkpoint.rst
 rename test/distributed/{_shard => }/checkpoint/test_checkpoint.py (98%)
 rename test/distributed/{_shard => }/checkpoint/test_file_system_checkpoint.py (99%)
 rename test/distributed/{_shard => }/checkpoint/test_file_system_checkpoint_cpu.py (99%)
 rename test/distributed/{_shard => }/checkpoint/test_planner.py (97%)
 rename test/distributed/{_shard => }/checkpoint/test_utils.py (96%)
 create mode 100644 torch/distributed/checkpoint/__init__.py
 rename torch/distributed/{_shard => }/checkpoint/api.py (90%)
 rename torch/distributed/{_shard => }/checkpoint/default_planner.py (76%)
 rename torch/distributed/{_shard => }/checkpoint/filesystem.py (82%)
 rename torch/distributed/{_shard => }/checkpoint/metadata.py (87%)
 rename torch/distributed/{_shard => }/checkpoint/planner.py (95%)
 rename torch/distributed/{_shard => }/checkpoint/planner_helpers.py (74%)
 rename torch/distributed/{_shard => }/checkpoint/resharding.py (91%)
 rename torch/distributed/{_shard => }/checkpoint/state_dict_loader.py (98%)
 rename torch/distributed/{_shard => }/checkpoint/state_dict_saver.py (96%)
 rename torch/distributed/{_shard => }/checkpoint/storage.py (96%)
 rename torch/distributed/{_shard => }/checkpoint/utils.py (77%)

diff --git a/docs/source/distributed.checkpoint.rst b/docs/source/distributed.checkpoint.rst
new file mode 100644
index 0000000000000..380ec0e6022a4
--- /dev/null
+++ b/docs/source/distributed.checkpoint.rst
@@ -0,0 +1,4 @@
+Distributed Checkpoint
+========================
+
+.. automodule:: torch.distributed.checkpoint
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 00f8e0967b737..20214466328a7 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -70,6 +70,7 @@ Features described in this documentation are classified by release status:
    torch.distributed.elastic <distributed.elastic>
    torch.distributed.fsdp <fsdp>
    torch.distributed.optim <distributed.optim>
+   torch.distributed.checkpoint <distributed.checkpoint>
    torch.distributions <distributions>
    torch.fft <fft>
    futures
diff --git a/test/distributed/_shard/checkpoint/test_checkpoint.py b/test/distributed/checkpoint/test_checkpoint.py
similarity index 98%
rename from test/distributed/_shard/checkpoint/test_checkpoint.py
rename to test/distributed/checkpoint/test_checkpoint.py
index 1b3cf04eb2ccf..167fdc5e7154c 100644
--- a/test/distributed/_shard/checkpoint/test_checkpoint.py
+++ b/test/distributed/checkpoint/test_checkpoint.py
@@ -20,17 +20,17 @@
 
 from torch.distributed._shard import sharded_tensor
 
-from torch.distributed._shard.checkpoint.default_planner import (
+from torch.distributed.checkpoint.default_planner import (
     _create_default_local_metadata,
 )
 
-from torch.distributed._shard.checkpoint.metadata import (
+from torch.distributed.checkpoint.metadata import (
     BytesStorageMetadata,
     Metadata,
     TensorStorageMetadata,
 )
 
-from torch.distributed._shard.checkpoint.planner import (
+from torch.distributed.checkpoint.planner import (
     SavePlan,
     SavePlanner,
     LoadPlan,
diff --git a/test/distributed/_shard/checkpoint/test_file_system_checkpoint.py b/test/distributed/checkpoint/test_file_system_checkpoint.py
similarity index 99%
rename from test/distributed/_shard/checkpoint/test_file_system_checkpoint.py
rename to test/distributed/checkpoint/test_file_system_checkpoint.py
index b5cc38767c962..7ef4e72e4fe0e 100644
--- a/test/distributed/_shard/checkpoint/test_file_system_checkpoint.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint.py
@@ -31,7 +31,7 @@
     run_tests,
 )
 
-from torch.distributed._shard.checkpoint import (
+from torch.distributed.checkpoint import (
     FileSystemReader,
     FileSystemWriter,
     load_state_dict,
diff --git a/test/distributed/_shard/checkpoint/test_file_system_checkpoint_cpu.py b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
similarity index 99%
rename from test/distributed/_shard/checkpoint/test_file_system_checkpoint_cpu.py
rename to test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
index 321dc2f546883..2ff2d9d127919 100644
--- a/test/distributed/_shard/checkpoint/test_file_system_checkpoint_cpu.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
@@ -31,7 +31,7 @@
     run_tests,
 )
 
-from torch.distributed._shard.checkpoint import (
+from torch.distributed.checkpoint import (
     FileSystemReader,
     FileSystemWriter,
     load_state_dict,
diff --git a/test/distributed/_shard/checkpoint/test_planner.py b/test/distributed/checkpoint/test_planner.py
similarity index 97%
rename from test/distributed/_shard/checkpoint/test_planner.py
rename to test/distributed/checkpoint/test_planner.py
index 56373bd67c6d9..334fba237a9ba 100644
--- a/test/distributed/_shard/checkpoint/test_planner.py
+++ b/test/distributed/checkpoint/test_planner.py
@@ -3,7 +3,7 @@
 import sys
 
 import torch
-from torch.distributed._shard.checkpoint.planner import LoadItemType, WriteItemType
+from torch.distributed.checkpoint.planner import LoadItemType, WriteItemType
 
 from torch.distributed._shard.sharded_tensor import (
     Shard,
@@ -18,13 +18,13 @@
     TEST_WITH_DEV_DBG_ASAN,
     run_tests,
 )
-from torch.distributed._shard.checkpoint.metadata import BytesStorageMetadata, MetadataIndex, TensorStorageMetadata
+from torch.distributed.checkpoint.metadata import BytesStorageMetadata, MetadataIndex, TensorStorageMetadata
 from torch.testing._internal.distributed.distributed_utils import (
     with_fake_comms,
     with_dist
 )
 
-from torch.distributed._shard.checkpoint.default_planner import (
+from torch.distributed.checkpoint.default_planner import (
     create_default_global_save_plan,
     create_default_local_save_plan,
     create_default_local_load_plan,
diff --git a/test/distributed/_shard/checkpoint/test_utils.py b/test/distributed/checkpoint/test_utils.py
similarity index 96%
rename from test/distributed/_shard/checkpoint/test_utils.py
rename to test/distributed/checkpoint/test_utils.py
index e99a9cf863e4f..e2b4aac605bf6 100644
--- a/test/distributed/_shard/checkpoint/test_utils.py
+++ b/test/distributed/checkpoint/test_utils.py
@@ -17,8 +17,8 @@
     TEST_WITH_DEV_DBG_ASAN,
     run_tests,
 )
-from torch.distributed._shard.checkpoint.utils import find_state_dict_object
-from torch.distributed._shard.checkpoint.metadata import MetadataIndex
+from torch.distributed.checkpoint.utils import find_state_dict_object
+from torch.distributed.checkpoint.metadata import MetadataIndex
 from torch.testing._internal.distributed.distributed_utils import (
     with_fake_comms
 )
diff --git a/torch/distributed/_shard/checkpoint/__init__.py b/torch/distributed/_shard/checkpoint/__init__.py
index febc953f9b609..166c6f9254cf6 100644
--- a/torch/distributed/_shard/checkpoint/__init__.py
+++ b/torch/distributed/_shard/checkpoint/__init__.py
@@ -1,21 +1,12 @@
-from .metadata import (
-    TensorStorageMetadata,
-    BytesStorageMetadata,
-    ChunkStorageMetadata,
-    Metadata,
-)
-from .state_dict_loader import load_state_dict
-from .state_dict_saver import save_state_dict
-from .storage import StorageReader, StorageWriter
-from .filesystem import FileSystemReader, FileSystemWriter
-from .api import CheckpointException
-
+# Keep old package for BC purposes, this file should be removed once
+# everything moves to the `torch.distributed.checkpoint` package.
+import sys
+import torch
+import warnings
 
-from .planner import (
-    SavePlanner,
-    LoadPlanner,
-    SavePlan,
-    LoadPlan,
-    ReadItem,
-    WriteItem,
+from torch.distributed.checkpoint import *  # noqa: F403
+warnings.warn(
+    "torch.distributed._shard.checkpoint will be deprecated, use torch.distributed.checkpoint instead",
+    DeprecationWarning
 )
+sys.modules['torch.distributed._shard.checkpoint'] = torch.distributed.checkpoint
diff --git a/torch/distributed/checkpoint/__init__.py b/torch/distributed/checkpoint/__init__.py
new file mode 100644
index 0000000000000..febc953f9b609
--- /dev/null
+++ b/torch/distributed/checkpoint/__init__.py
@@ -0,0 +1,21 @@
+from .metadata import (
+    TensorStorageMetadata,
+    BytesStorageMetadata,
+    ChunkStorageMetadata,
+    Metadata,
+)
+from .state_dict_loader import load_state_dict
+from .state_dict_saver import save_state_dict
+from .storage import StorageReader, StorageWriter
+from .filesystem import FileSystemReader, FileSystemWriter
+from .api import CheckpointException
+
+
+from .planner import (
+    SavePlanner,
+    LoadPlanner,
+    SavePlan,
+    LoadPlan,
+    ReadItem,
+    WriteItem,
+)
diff --git a/torch/distributed/_shard/checkpoint/api.py b/torch/distributed/checkpoint/api.py
similarity index 90%
rename from torch/distributed/_shard/checkpoint/api.py
rename to torch/distributed/checkpoint/api.py
index e74b34d9f233f..d7bfa18ecd798 100644
--- a/torch/distributed/_shard/checkpoint/api.py
+++ b/torch/distributed/checkpoint/api.py
@@ -3,20 +3,28 @@
 
 WRAPPED_EXCEPTION = Tuple[BaseException, tb.StackSummary]
 
+__all__ = ["CheckpointException"]
+
+
 def _wrap_exception(exc: BaseException) -> WRAPPED_EXCEPTION:
     return (exc, tb.extract_tb(exc.__traceback__))
 
+
 def _is_wrapped_exception(obj: Any) -> bool:
     if not isinstance(obj, tuple):
         return False
     if len(obj) != 2:
         return False
-    return isinstance(obj[0], BaseException) and isinstance(obj[1], tb.StackSummary)
+    return isinstance(obj[0], BaseException) and isinstance(
+        obj[1], tb.StackSummary
+    )
+
 
 class CheckpointException(BaseException):
     """
     Exception raised if failure was detected as part of a checkpoint load or save.
     """
+
     def __init__(self, msg: str, failures: Dict[int, WRAPPED_EXCEPTION]):
         super().__init__(msg, failures)
         self._failures = failures
diff --git a/torch/distributed/_shard/checkpoint/default_planner.py b/torch/distributed/checkpoint/default_planner.py
similarity index 76%
rename from torch/distributed/_shard/checkpoint/default_planner.py
rename to torch/distributed/checkpoint/default_planner.py
index 8f6a0c2be7ed4..aa531a62d235e 100644
--- a/torch/distributed/_shard/checkpoint/default_planner.py
+++ b/torch/distributed/checkpoint/default_planner.py
@@ -24,18 +24,26 @@
     MetadataIndex,
     Metadata,
     STATE_DICT_TYPE,
-    STORAGE_TYPES
+    STORAGE_TYPES,
 )
 
 from .planner_helpers import (
     _create_read_items,
     _create_write_items,
-    _create_default_metadata_only_plan
+    _create_default_metadata_only_plan,
 )
 
-from .utils import (
-    find_state_dict_object
-)
+from .utils import find_state_dict_object
+
+__all__ = [
+    "DefaultSavePlanner",
+    "DefaultLoadPlanner",
+    "create_default_local_load_plan",
+    "create_default_global_load_plan",
+    "create_default_local_save_plan",
+    "create_default_global_save_plan",
+]
+
 
 class DefaultSavePlanner(SavePlanner):
     def init(self, state_dict: Dict[str, Any], is_coordinator: bool) -> None:
@@ -43,18 +51,26 @@ def init(self, state_dict: Dict[str, Any], is_coordinator: bool) -> None:
         self.is_coordinator = is_coordinator
 
     def create_local_plan(self) -> SavePlan:
-        self.plan = create_default_local_save_plan(self.state_dict, self.is_coordinator)
+        self.plan = create_default_local_save_plan(
+            self.state_dict, self.is_coordinator
+        )
         return self.plan
 
-    def create_global_plan(self, all_plans: List[SavePlan]) -> Tuple[List[SavePlan], Metadata]:
-        self.global_plan, self.metadata = create_default_global_save_plan(all_plans)
+    def create_global_plan(
+        self, all_plans: List[SavePlan]
+    ) -> Tuple[List[SavePlan], Metadata]:
+        self.global_plan, self.metadata = create_default_global_save_plan(
+            all_plans
+        )
         return self.global_plan, self.metadata
 
     def finish_plan(self, new_plan: SavePlan) -> SavePlan:
         self.plan = new_plan
         return new_plan
 
-    def resolve_data(self, write_item: WriteItem) -> Union[torch.Tensor, io.BytesIO]:
+    def resolve_data(
+        self, write_item: WriteItem
+    ) -> Union[torch.Tensor, io.BytesIO]:
         object = self.lookup_object(write_item.index)
         return self.transform_object(write_item, object)
 
@@ -76,7 +92,12 @@ def transform_object(self, write_item: WriteItem, object: Any):
 
 
 class DefaultLoadPlanner(LoadPlanner):
-    def init(self, state_dict: STATE_DICT_TYPE, metadata: Metadata, is_coordinator: bool) -> None:
+    def init(
+        self,
+        state_dict: STATE_DICT_TYPE,
+        metadata: Metadata,
+        is_coordinator: bool,
+    ) -> None:
         self.state_dict = state_dict
         self.metadata = metadata
         self.is_coordinator = is_coordinator
@@ -110,7 +131,9 @@ def transform_tensor(self, read_item: ReadItem, tensor: torch.Tensor):
         """
         This is an extension from the planner interface to make it easy to extend the default planner
         """
-        return narrow_tensor_by_index(tensor, read_item.dest_offsets, read_item.lengths)
+        return narrow_tensor_by_index(
+            tensor, read_item.dest_offsets, read_item.lengths
+        )
 
 
 def create_default_local_load_plan(
@@ -133,7 +156,10 @@ def create_default_local_load_plan(
 
     return LoadPlan(requests)
 
-def create_default_global_load_plan(all_plans: List[LoadPlan]) -> List[LoadPlan]:
+
+def create_default_global_load_plan(
+    all_plans: List[LoadPlan],
+) -> List[LoadPlan]:
     """
     Create global load plan used by DefaultLoadPlanner.
 
@@ -142,7 +168,10 @@ def create_default_global_load_plan(all_plans: List[LoadPlan]) -> List[LoadPlan]
     """
     return all_plans
 
-def create_default_local_save_plan(state_dict: Dict[str, Any], is_coordinator: bool) -> SavePlan:
+
+def create_default_local_save_plan(
+    state_dict: Dict[str, Any], is_coordinator: bool
+) -> SavePlan:
     """
     Create the ``SavePlan`` used by DefaultSavePlanner.
 
@@ -157,7 +186,10 @@ def create_default_local_save_plan(state_dict: Dict[str, Any], is_coordinator: b
             requests += _create_write_items(fqn, obj)
     return SavePlan(requests)
 
-def create_default_global_save_plan(all_plans: List[SavePlan]) -> Tuple[List[SavePlan], Metadata]:
+
+def create_default_global_save_plan(
+    all_plans: List[SavePlan],
+) -> Tuple[List[SavePlan], Metadata]:
     """
     Create the global plan and metadata used by DefaultSavePlanner.
 
@@ -180,21 +212,29 @@ def create_default_global_save_plan(all_plans: List[SavePlan]) -> Tuple[List[Sav
                 assert item.tensor_data is not None
                 tensor_md = cast(
                     TensorStorageMetadata,
-                    md.setdefault(item.index.fqn, TensorStorageMetadata(
-                        properties=item.tensor_data.properties,
-                        size=item.tensor_data.size,
-                        chunks=[],
-                    ))
+                    md.setdefault(
+                        item.index.fqn,
+                        TensorStorageMetadata(
+                            properties=item.tensor_data.properties,
+                            size=item.tensor_data.size,
+                            chunks=[],
+                        ),
+                    ),
+                )
+                new_index = dataclasses.replace(
+                    item.index, index=len(tensor_md.chunks)
                 )
-                new_index = dataclasses.replace(item.index, index=len(tensor_md.chunks))
                 new_item = dataclasses.replace(item, index=new_index)
                 new_items.append(new_item)
 
-                assert item.tensor_data.chunk is not None, f"Cannot create MD for tensor without bounds. FQN: {item.index.fqn}"
+                assert (
+                    item.tensor_data.chunk is not None
+                ), f"Cannot create MD for tensor without bounds. FQN: {item.index.fqn}"
                 tensor_md.chunks.append(item.tensor_data.chunk)
         new_plans.append(dataclasses.replace(plan, items=new_items))
     return (new_plans, Metadata(md))
 
+
 def _create_default_local_metadata(state_dict: STATE_DICT_TYPE) -> Metadata:
     """
     Return the ``Metadata`` if DefaultSavePlanner was used to checkpoint ``state_dict``.
diff --git a/torch/distributed/_shard/checkpoint/filesystem.py b/torch/distributed/checkpoint/filesystem.py
similarity index 82%
rename from torch/distributed/_shard/checkpoint/filesystem.py
rename to torch/distributed/checkpoint/filesystem.py
index 9788853d9aa66..0e679c3039219 100644
--- a/torch/distributed/_shard/checkpoint/filesystem.py
+++ b/torch/distributed/checkpoint/filesystem.py
@@ -34,32 +34,46 @@
 from torch.distributed._shard._utils import narrow_tensor_by_index
 
 
+__all__ = [
+    "FileSystemWriter",
+    "SlicedBufferedReader",
+    "FileSystemReader",
+]
+
+
 @dataclass
 class _StorageInfo:
     """
     This is the per entry storage info
     """
+
     relative_path: str
     offset: int
     length: int
 
+
 @dataclass
 class _StoragePrefix:
     prefix: str
 
+
 DEFAULT_SUFIX = ".distcp"
 
+
 def _trim(tensor: torch.Tensor) -> torch.Tensor:
     tensor = tensor.detach().cpu()
     if tensor._typed_storage()._size() != tensor.numel():
         tensor = tensor.clone()
     return tensor
 
-def _result_from_write_item(item: WriteItem, size_in_bytes, storage_data) -> WriteResult:
+
+def _result_from_write_item(
+    item: WriteItem, size_in_bytes, storage_data
+) -> WriteResult:
     return WriteResult(
-        index=item.index,
-        size_in_bytes=size_in_bytes,
-        storage_data=storage_data)
+        index=item.index, size_in_bytes=size_in_bytes, storage_data=storage_data
+    )
+
 
 def _write_item(stream, data, write_item, storage_key):
     offset = stream.tell()
@@ -74,11 +88,10 @@ def _write_item(stream, data, write_item, storage_key):
     length = stream.tell() - offset
 
     return _result_from_write_item(
-        write_item,
-        length,
-        _StorageInfo(storage_key, offset, length)
+        write_item, length, _StorageInfo(storage_key, offset, length)
     )
 
+
 def _write_files_from_queue(
     file_queue: List,
     planner: SavePlanner,
@@ -87,24 +100,33 @@ def _write_files_from_queue(
     write_results = []
 
     for file_path, file_name, write_items in file_queue:
-        tensor_w = [wi for wi in write_items if wi.type != WriteItemType.BYTE_IO]
+        tensor_w = [
+            wi for wi in write_items if wi.type != WriteItemType.BYTE_IO
+        ]
         bytes_w = [wi for wi in write_items if wi.type == WriteItemType.BYTE_IO]
 
         with open(file_path, "wb") as stream:
             for write_item in bytes_w:
                 data = planner.resolve_data(write_item)
-                write_results.append(_write_item(stream, data, write_item, file_name))
+                write_results.append(
+                    _write_item(stream, data, write_item, file_name)
+                )
 
             for write_item in tensor_w:
-                tensor = _trim(cast(torch.Tensor, planner.resolve_data(write_item)))
+                tensor = _trim(
+                    cast(torch.Tensor, planner.resolve_data(write_item))
+                )
                 assert not tensor.is_cuda
-                write_results.append(_write_item(stream, tensor, write_item, file_name))
+                write_results.append(
+                    _write_item(stream, tensor, write_item, file_name)
+                )
 
             if use_fsync:
                 os.fsync(stream.fileno())
 
     return write_results
 
+
 class FileSystemWriter(StorageWriter):
     """
     Basic implementation of StorageWriter using file IO.
@@ -118,6 +140,7 @@ class FileSystemWriter(StorageWriter):
     a `.metadata` file with the serialized metadata.
 
     """
+
     def __init__(
         self,
         path: Union[str, os.PathLike],
@@ -146,11 +169,14 @@ def prepare_local_plan(self, plan: SavePlan) -> SavePlan:
         # There's no storage input in the local plan
         return plan
 
-    def prepare_global_plan(self, global_plan: List[SavePlan]) -> List[SavePlan]:
+    def prepare_global_plan(
+        self, global_plan: List[SavePlan]
+    ) -> List[SavePlan]:
         self.path.mkdir(parents=True, exist_ok=True)
 
         new_plans = [
-            dataclasses.replace(plan, storage_data=_StoragePrefix(f"__{i}_")) for i, plan in enumerate(global_plan)
+            dataclasses.replace(plan, storage_data=_StoragePrefix(f"__{i}_"))
+            for i, plan in enumerate(global_plan)
         ]
         return new_plans
 
@@ -187,12 +213,12 @@ def gen_file():
         fut.set_result(results)
         return fut
 
-    def finish(self, metadata: Metadata, results: List[List[WriteResult]]) -> None:
+    def finish(
+        self, metadata: Metadata, results: List[List[WriteResult]]
+    ) -> None:
         storage_md = dict()
         for wr_list in results:
-            storage_md.update({
-                wr.index: wr.storage_data for wr in wr_list
-            })
+            storage_md.update({wr.index: wr.storage_data for wr in wr_list})
         metadata.storage_data = storage_md
         with (self.path / ".metadata.tmp").open("wb") as metadata_file:
             pickle.dump(metadata, metadata_file)
@@ -220,6 +246,7 @@ def seek(self, __offset: int, __whence: int = os.SEEK_SET) -> int:
     def tell(self) -> int:
         return super().tell() - self.offset
 
+
 class FileSystemReader(StorageReader):
     def __init__(self, path: Union[str, os.PathLike]) -> None:
         super().__init__()
@@ -228,15 +255,10 @@ def __init__(self, path: Union[str, os.PathLike]) -> None:
 
     def _slice_file(self, file, sinfo: _StorageInfo):
         return SlicedBufferedReader(
-            io.FileIO(file.fileno(), closefd=False),
-            sinfo.offset, sinfo.length
+            io.FileIO(file.fileno(), closefd=False), sinfo.offset, sinfo.length
         )
 
-    def read_data(
-        self,
-        plan: LoadPlan,
-        planner: LoadPlanner
-    ) -> Future[None]:
+    def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
         # group requests by file
         per_file: Dict[str, List[ReadItem]] = dict()
         for read_item in plan.items:
@@ -255,8 +277,12 @@ def read_data(
                         bytes.seek(0)
                         planner.load_bytes(req, bytes)
                     else:
-                        tensor = cast(Tensor, torch.load(file_slice, map_location="cpu"))
-                        tensor = narrow_tensor_by_index(tensor, req.storage_offsets, req.lengths)
+                        tensor = cast(
+                            Tensor, torch.load(file_slice, map_location="cpu")
+                        )
+                        tensor = narrow_tensor_by_index(
+                            tensor, req.storage_offsets, req.lengths
+                        )
                         target_tensor = planner.resolve_tensor(req).detach()
 
                         assert (
@@ -281,5 +307,7 @@ def init(self, metadata: Metadata, is_coordinator: bool) -> None:
     def prepare_local_plan(self, plan: LoadPlan) -> LoadPlan:
         return plan
 
-    def prepare_global_plan(self, global_plan: List[LoadPlan]) -> List[LoadPlan]:
+    def prepare_global_plan(
+        self, global_plan: List[LoadPlan]
+    ) -> List[LoadPlan]:
         return global_plan
diff --git a/torch/distributed/_shard/checkpoint/metadata.py b/torch/distributed/checkpoint/metadata.py
similarity index 87%
rename from torch/distributed/_shard/checkpoint/metadata.py
rename to torch/distributed/checkpoint/metadata.py
index 2321f02766232..1a03f16ff4731 100644
--- a/torch/distributed/_shard/checkpoint/metadata.py
+++ b/torch/distributed/checkpoint/metadata.py
@@ -7,28 +7,42 @@
     ShardedTensor,
 )
 
+__all__ = [
+    "ChunkStorageMetadata",
+    "TensorStorageMetadata",
+    "BytesStorageMetadata",
+    "Metadata",
+    "MetadataIndex",
+]
+
+
 @dataclass
 class ChunkStorageMetadata:
     """
     Each chunk is expected to have the same properties of the TensorStorageMetadata that includes it.
     """
+
     offsets: torch.Size
     sizes: torch.Size
 
+
 @dataclass
 class TensorStorageMetadata:
     properties: TensorProperties
     size: torch.Size
     chunks: List[ChunkStorageMetadata]
 
+
 @dataclass
 class BytesStorageMetadata:
     pass
 
+
 TENSOR_TYPE = Union[torch.Tensor, ShardedTensor]
 STORAGE_TYPES = Union[TensorStorageMetadata, BytesStorageMetadata]
 STATE_DICT_TYPE = Dict[str, Any]
 
+
 @dataclass
 class Metadata:
     # Keys are the same from the `state_dict` used.
@@ -36,11 +50,13 @@ class Metadata:
     planner_data: Any = None
     storage_data: Any = None
 
+
 @dataclass(frozen=True)
 class MetadataIndex:
     """
     This class represents a lookup key for items in a state dict or Metadata.
     """
+
     fqn: str
     """Fully Qualified Name of the object"""
 
@@ -59,7 +75,12 @@ class MetadataIndex:
     the linear search and thus making it significantly faster.
     """
 
-    def __init__(self, fqn: str, offset: Optional[Sequence[int]] = None, index: Optional[int] = None):
+    def __init__(
+        self,
+        fqn: str,
+        offset: Optional[Sequence[int]] = None,
+        index: Optional[int] = None,
+    ):
         # We must use object.__setattr__ due to frozen=True
         object.__setattr__(self, "fqn", fqn)
         object.__setattr__(self, "index", index)
diff --git a/torch/distributed/_shard/checkpoint/planner.py b/torch/distributed/checkpoint/planner.py
similarity index 95%
rename from torch/distributed/_shard/checkpoint/planner.py
rename to torch/distributed/checkpoint/planner.py
index f3692cc113956..cb94a40df7328 100644
--- a/torch/distributed/_shard/checkpoint/planner.py
+++ b/torch/distributed/checkpoint/planner.py
@@ -12,24 +12,41 @@
     ChunkStorageMetadata,
     MetadataIndex,
     Metadata,
-    STATE_DICT_TYPE
+    STATE_DICT_TYPE,
 )
 
+
+__all__ = [
+    "WriteItemType",
+    "LoadItemType",
+    "TensorWriteData",
+    "WriteItem",
+    "ReadItem",
+    "SavePlan",
+    "LoadPlan",
+    "SavePlanner",
+    "LoadPlanner",
+]
+
+
 class WriteItemType(Enum):
     TENSOR = auto()
     SHARD = auto()
     BYTE_IO = auto()
 
+
 class LoadItemType(Enum):
     TENSOR = auto()
     BYTE_IO = auto()
 
+
 @dataclass(frozen=True)
 class TensorWriteData:
     chunk: ChunkStorageMetadata
     properties: TensorProperties
     size: torch.Size
 
+
 @dataclass(frozen=True)
 class WriteItem:
     index: MetadataIndex
@@ -38,6 +55,7 @@ class WriteItem:
     # Value present if it's a tensor write
     tensor_data: Optional[TensorWriteData] = None
 
+
 @dataclass(frozen=True)
 class ReadItem:
     # Read Item
@@ -56,18 +74,21 @@ class ReadItem:
     # Size of the hypercube to copy
     lengths: torch.Size
 
+
 @dataclass(frozen=True)
 class SavePlan:
     items: List[WriteItem]
     storage_data: Any = None
     planner_data: Any = None
 
+
 @dataclass
 class LoadPlan:
     items: List[ReadItem]
     storage_data: Any = None
     planner_data: Any = None
 
+
 class SavePlanner(abc.ABC):
     """
     Abstract class defining the protocol used by save_state_dict to plan the save process.
@@ -156,6 +177,7 @@ class SavePlanner(abc.ABC):
     >>>         metadata = replace(metadata, planner_data=merged_data)
     >>>         return global_plan, metadata
     """
+
     @abc.abstractmethod
     def init(self, state_dict: STATE_DICT_TYPE, is_coordinator: bool) -> None:
         """
@@ -179,7 +201,9 @@ def create_local_plan(self) -> SavePlan:
         pass
 
     @abc.abstractmethod
-    def create_global_plan(self, all_plans: List[SavePlan]) -> Tuple[List[SavePlan], Metadata]:
+    def create_global_plan(
+        self, all_plans: List[SavePlan]
+    ) -> Tuple[List[SavePlan], Metadata]:
         """
         Compute the global checkpoint plan and return the local plan of each rank.
 
@@ -197,7 +221,9 @@ def finish_plan(self, new_plan: SavePlan) -> SavePlan:
         pass
 
     @abc.abstractmethod
-    def resolve_data(self, write_item: WriteItem) -> Union[torch.Tensor, io.BytesIO]:
+    def resolve_data(
+        self, write_item: WriteItem
+    ) -> Union[torch.Tensor, io.BytesIO]:
         """
         Lookup the object associated with ``write_item``in `state_dict` and apply any
         transformation (such as serialization) prior to the storage layer consuming it.
@@ -215,6 +241,7 @@ def resolve_data(self, write_item: WriteItem) -> Union[torch.Tensor, io.BytesIO]
         """
         pass
 
+
 class LoadPlanner:
     """
     Abstract class defining the protocol used by load_state_dict to plan the load process.
@@ -273,8 +300,14 @@ class LoadPlanner:
     >>>     def commit_tensor(self, read_item, tensor):
     >>>         self.state_dict[read_item.dest_index.fqn] = tensor
     """
+
     @abc.abstractmethod
-    def init(self, state_dict: STATE_DICT_TYPE, metadata: Metadata, is_coordinator: bool) -> None:
+    def init(
+        self,
+        state_dict: STATE_DICT_TYPE,
+        metadata: Metadata,
+        is_coordinator: bool,
+    ) -> None:
         """
         Initialize this instance to load data into ``state_dict``
 
diff --git a/torch/distributed/_shard/checkpoint/planner_helpers.py b/torch/distributed/checkpoint/planner_helpers.py
similarity index 74%
rename from torch/distributed/_shard/checkpoint/planner_helpers.py
rename to torch/distributed/checkpoint/planner_helpers.py
index fce7699b953f0..23fbcd0d7e78b 100644
--- a/torch/distributed/_shard/checkpoint/planner_helpers.py
+++ b/torch/distributed/checkpoint/planner_helpers.py
@@ -26,12 +26,13 @@
     TensorStorageMetadata,
     MetadataIndex,
     STATE_DICT_TYPE,
-    STORAGE_TYPES
+    STORAGE_TYPES,
 )
 
-from .resharding import (
-    _shards_get_overlap_region_wrt_saved_tensor
-)
+from .resharding import _shards_get_overlap_region_wrt_saved_tensor
+
+__all__: List[str] = []
+
 
 def _create_shard_metadata(size: torch.Size) -> ShardMetadata:
     return ShardMetadata(
@@ -39,26 +40,31 @@ def _create_shard_metadata(size: torch.Size) -> ShardMetadata:
         shard_sizes=list(size),
     )
 
+
 def _create_shard_from_tensor(tensor: torch.Tensor) -> Shard:
-    return Shard(
-        tensor=tensor,
-        metadata=_create_shard_metadata(tensor.size())
-    )
+    return Shard(tensor=tensor, metadata=_create_shard_metadata(tensor.size()))
+
 
 def _chunk_for_shard(shard_md: ShardMetadata) -> ChunkStorageMetadata:
     return ChunkStorageMetadata(
         offsets=torch.Size(shard_md.shard_offsets),
-        sizes=torch.Size(shard_md.shard_sizes)
+        sizes=torch.Size(shard_md.shard_sizes),
     )
 
-def _sharded_tensor_metadata(sharded_tensor: ShardedTensor, shard_md: ShardMetadata) -> TensorWriteData:
+
+def _sharded_tensor_metadata(
+    sharded_tensor: ShardedTensor, shard_md: ShardMetadata
+) -> TensorWriteData:
     return TensorWriteData(
         chunk=_chunk_for_shard(shard_md),
         properties=sharded_tensor.metadata().tensor_properties,
         size=sharded_tensor.metadata().size,
     )
 
-def _create_write_item_for_shard(fqn: str, sharded_tensor: ShardedTensor, shard_md: ShardMetadata) -> WriteItem:
+
+def _create_write_item_for_shard(
+    fqn: str, sharded_tensor: ShardedTensor, shard_md: ShardMetadata
+) -> WriteItem:
     offsets = torch.Size(shard_md.shard_offsets)
     return WriteItem(
         index=MetadataIndex(fqn, offsets),
@@ -66,28 +72,30 @@ def _create_write_item_for_shard(fqn: str, sharded_tensor: ShardedTensor, shard_
         tensor_data=_sharded_tensor_metadata(sharded_tensor, shard_md),
     )
 
+
 def _create_write_item_for_tensor(fqn: str, tensor: torch.Tensor) -> WriteItem:
     offsets = torch.Size([0] * len(tensor.size()))
     return WriteItem(
         index=MetadataIndex(fqn, offsets),
         type=WriteItemType.TENSOR,
         tensor_data=TensorWriteData(
-            chunk=ChunkStorageMetadata(
-                offsets=offsets,
-                sizes=tensor.size()
-            ),
+            chunk=ChunkStorageMetadata(offsets=offsets, sizes=tensor.size()),
             properties=TensorProperties.create_from_tensor(tensor),
             size=tensor.size(),
-        )
+        ),
     )
 
+
 def _create_write_item_for_bytesio(fqn: str, bytes: Any):
     return WriteItem(
         index=MetadataIndex(fqn),
         type=WriteItemType.BYTE_IO,
     )
 
-def _create_read_item_for_byteio(dest_index, dest_offset, storage_index, storage_offset, length):
+
+def _create_read_item_for_byteio(
+    dest_index, dest_offset, storage_index, storage_offset, length
+):
     return ReadItem(
         type=LoadItemType.BYTE_IO,
         dest_index=dest_index,
@@ -97,7 +105,10 @@ def _create_read_item_for_byteio(dest_index, dest_offset, storage_index, storage
         lengths=torch.Size((length,)),
     )
 
-def _create_read_item_for_tensor(dest_index, dest_offsets, storage_index, storage_offsets, lengths):
+
+def _create_read_item_for_tensor(
+    dest_index, dest_offsets, storage_index, storage_offsets, lengths
+):
     return ReadItem(
         type=LoadItemType.TENSOR,
         dest_index=dest_index,
@@ -107,6 +118,7 @@ def _create_read_item_for_tensor(dest_index, dest_offsets, storage_index, storag
         lengths=torch.Size(lengths),
     )
 
+
 def _create_sharded_read_items(
     fqn: str,
     checkpoint_md: TensorStorageMetadata,
@@ -144,56 +156,66 @@ def _create_sharded_read_items(
 
             read_items.append(
                 _create_read_item_for_tensor(
-                    dest_index=MetadataIndex(fqn, shard.metadata.shard_offsets, idx),
+                    dest_index=MetadataIndex(
+                        fqn, shard.metadata.shard_offsets, idx
+                    ),
                     dest_offsets=dest_offsets,
-                    storage_index=MetadataIndex(fqn, storage_md.offsets, storage_idx),
+                    storage_index=MetadataIndex(
+                        fqn, storage_md.offsets, storage_idx
+                    ),
                     storage_offsets=storage_offsets,
                     lengths=lengths,
                 )
             )
     return read_items
 
+
 def _create_default_metadata_only_plan(state_dict: STATE_DICT_TYPE) -> SavePlan:
     requests = []
     for fqn, obj in state_dict.items():
         if isinstance(obj, ShardedTensor):
             for shard_md in obj.metadata().shards_metadata:
-                requests.append(_create_write_item_for_shard(fqn, obj, shard_md))
+                requests.append(
+                    _create_write_item_for_shard(fqn, obj, shard_md)
+                )
         elif isinstance(obj, torch.Tensor):
             requests.append(_create_write_item_for_tensor(fqn, obj))
         else:
             requests.append(_create_write_item_for_bytesio(fqn, obj))
     return SavePlan(requests)
 
+
 def _create_write_items(fqn: str, object: Any) -> List[WriteItem]:
     if isinstance(object, ShardedTensor):
-        return [_create_write_item_for_shard(fqn, object, shard.metadata) for shard in object.local_shards()]
+        return [
+            _create_write_item_for_shard(fqn, object, shard.metadata)
+            for shard in object.local_shards()
+        ]
     elif isinstance(object, torch.Tensor):
         return [_create_write_item_for_tensor(fqn, object)]
     else:
         return [_create_write_item_for_bytesio(fqn, object)]
 
+
 def _create_read_items(fqn: str, md: STORAGE_TYPES, obj: Any) -> List[ReadItem]:
     if isinstance(md, BytesStorageMetadata):
-        return [_create_read_item_for_byteio(
-            dest_index=MetadataIndex(fqn),
-            dest_offset=0,
-            storage_index=MetadataIndex(fqn),
-            storage_offset=0,
-            length=0
-        )]
+        return [
+            _create_read_item_for_byteio(
+                dest_index=MetadataIndex(fqn),
+                dest_offset=0,
+                storage_index=MetadataIndex(fqn),
+                storage_offset=0,
+                length=0,
+            )
+        ]
     elif isinstance(obj, ShardedTensor):
         local_shards = obj.local_shards()
     elif isinstance(obj, torch.Tensor):
         local_shards = [_create_shard_from_tensor(obj)]
     else:
         raise ValueError(
-            f"Invalid checkpoint metadata for {fqn}, " +
-            f"expected BytesStorageMetadata but found {type(md)}"
+            f"Invalid checkpoint metadata for {fqn}, "
+            + f"expected BytesStorageMetadata but found {type(md)}"
         )
 
-    return _create_sharded_read_items(
-        fqn,
-        md,
-        local_shards
-    )
+    return _create_sharded_read_items(fqn, md, local_shards)
diff --git a/torch/distributed/_shard/checkpoint/resharding.py b/torch/distributed/checkpoint/resharding.py
similarity index 91%
rename from torch/distributed/_shard/checkpoint/resharding.py
rename to torch/distributed/checkpoint/resharding.py
index f98248f5367bf..c00def73b14dc 100644
--- a/torch/distributed/_shard/checkpoint/resharding.py
+++ b/torch/distributed/checkpoint/resharding.py
@@ -4,6 +4,9 @@
     ShardMetadata,
 )
 
+__all__: List[str] = []
+
+
 def _shards_get_overlap_region_wrt_saved_tensor(
     saved_shard: ShardMetadata, current_shard: ShardMetadata
 ) -> List[Tuple[int, int, int, int]]:
@@ -38,7 +41,9 @@ def _shards_get_overlap_region_wrt_saved_tensor(
 
         if saved_shard_offset > current_shard_offset:
             offset_for_saved_tensor = 0
-            offset_for_current_tensor = saved_shard_offset - current_shard_offset
+            offset_for_current_tensor = (
+                saved_shard_offset - current_shard_offset
+            )
         else:
             offset_for_saved_tensor = current_shard_offset - saved_shard_offset
             offset_for_current_tensor = 0
diff --git a/torch/distributed/_shard/checkpoint/state_dict_loader.py b/torch/distributed/checkpoint/state_dict_loader.py
similarity index 98%
rename from torch/distributed/_shard/checkpoint/state_dict_loader.py
rename to torch/distributed/checkpoint/state_dict_loader.py
index b9ea55c180c78..de94ffabf663e 100644
--- a/torch/distributed/_shard/checkpoint/state_dict_loader.py
+++ b/torch/distributed/checkpoint/state_dict_loader.py
@@ -10,13 +10,16 @@
 
 from .utils import _DistWrapper
 
+__all__ = ["load_state_dict"]
+
+
 def load_state_dict(
     state_dict: Dict[str, Any],
     storage_reader: StorageReader,
     process_group: Optional[dist.ProcessGroup] = None,
     coordinator_rank: int = 0,
     no_dist: bool = False,
-    planner: LoadPlanner = None
+    planner: LoadPlanner = None,
 ) -> None:
     """
     Load a distributed state_dict in SPMD style.
@@ -79,7 +82,6 @@ def load_state_dict(
     if planner is None:
         planner = DefaultLoadPlanner()
 
-
     def local_step():
         assert planner is not None
         metadata = storage_reader.read_metadata()
diff --git a/torch/distributed/_shard/checkpoint/state_dict_saver.py b/torch/distributed/checkpoint/state_dict_saver.py
similarity index 96%
rename from torch/distributed/_shard/checkpoint/state_dict_saver.py
rename to torch/distributed/checkpoint/state_dict_saver.py
index c4792e0c42ef9..af18fd0c11dde 100644
--- a/torch/distributed/_shard/checkpoint/state_dict_saver.py
+++ b/torch/distributed/checkpoint/state_dict_saver.py
@@ -9,12 +9,11 @@
     StorageWriter,
 )
 
-from .metadata import (
-    Metadata,
-    STATE_DICT_TYPE
-)
+from .metadata import Metadata, STATE_DICT_TYPE
 from .utils import _DistWrapper
 
+__all__ = ["save_state_dict"]
+
 
 def save_state_dict(
     state_dict: STATE_DICT_TYPE,
@@ -22,7 +21,7 @@ def save_state_dict(
     process_group: Optional[dist.ProcessGroup] = None,
     coordinator_rank: int = 0,
     no_dist: bool = False,
-    planner: SavePlanner = None
+    planner: SavePlanner = None,
 ) -> Metadata:
     """
     Save a distributed model in SPMD style.
@@ -92,7 +91,9 @@ def global_step(all_local_plans):
         nonlocal global_metatadata
 
         assert planner is not None
-        all_local_plans, global_metatadata = planner.create_global_plan(all_local_plans)
+        all_local_plans, global_metatadata = planner.create_global_plan(
+            all_local_plans
+        )
         all_local_plans = storage_writer.prepare_global_plan(all_local_plans)
         return all_local_plans
 
diff --git a/torch/distributed/_shard/checkpoint/storage.py b/torch/distributed/checkpoint/storage.py
similarity index 96%
rename from torch/distributed/_shard/checkpoint/storage.py
rename to torch/distributed/checkpoint/storage.py
index 56bd757765f23..dbc8fda59eacc 100644
--- a/torch/distributed/_shard/checkpoint/storage.py
+++ b/torch/distributed/checkpoint/storage.py
@@ -16,6 +16,9 @@
     LoadPlanner,
 )
 
+__all__ = ["WriteResult", "StorageWriter", "StorageReader"]
+
+
 @dataclass(frozen=True)
 class WriteResult:
     index: MetadataIndex
@@ -23,6 +26,7 @@ class WriteResult:
     size_in_bytes: int
     storage_data: Any
 
+
 class StorageWriter(abc.ABC):
     """
     Interface used by ``save_state_dict`` to write to storage.
@@ -87,9 +91,7 @@ def prepare_global_plan(self, plans: List[SavePlan]) -> List[SavePlan]:
 
     @abc.abstractmethod
     def write_data(
-        self,
-        plan: SavePlan,
-        planner: SavePlanner
+        self, plan: SavePlan, planner: SavePlanner
     ) -> Future[List[WriteResult]]:
         """
         Write all items from ``plan`` using ``planner`` to resolve the data.
@@ -113,7 +115,9 @@ def write_data(
         pass
 
     @abc.abstractmethod
-    def finish(self, metadata: Metadata, results: List[List[WriteResult]]) -> None:
+    def finish(
+        self, metadata: Metadata, results: List[List[WriteResult]]
+    ) -> None:
         """
         Writes the metadata and marks the current checkpoint as sucessful.
 
@@ -130,6 +134,7 @@ def finish(self, metadata: Metadata, results: List[List[WriteResult]]) -> None:
         """
         pass
 
+
 class StorageReader(abc.ABC):
     """
     Interface used by ``load_state_dict`` to read from storage.
@@ -146,6 +151,7 @@ class StorageReader(abc.ABC):
     4) (coordinator) prepare_global_plan
     5) (all ranks) read_data
     """
+
     @abc.abstractmethod
     def read_metadata(self) -> Metadata:
         """
diff --git a/torch/distributed/_shard/checkpoint/utils.py b/torch/distributed/checkpoint/utils.py
similarity index 77%
rename from torch/distributed/_shard/checkpoint/utils.py
rename to torch/distributed/checkpoint/utils.py
index e82855672c226..a8d2a42d0fca6 100644
--- a/torch/distributed/_shard/checkpoint/utils.py
+++ b/torch/distributed/checkpoint/utils.py
@@ -4,7 +4,7 @@
     CheckpointException,
     _wrap_exception,
     _is_wrapped_exception,
-    WRAPPED_EXCEPTION
+    WRAPPED_EXCEPTION,
 )
 
 import torch
@@ -20,12 +20,20 @@
     MetadataIndex,
 )
 
+__all__ = ["find_tensor_shard", "find_state_dict_object"]
 
-T = TypeVar('T')
-R = TypeVar('R')
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+def _get_failure_dict(
+    results: List[Union[T, WRAPPED_EXCEPTION]]
+) -> Dict[int, WRAPPED_EXCEPTION]:
+    return cast(
+        Dict[int, WRAPPED_EXCEPTION],
+        {i: err for i, err in enumerate(results) if _is_wrapped_exception(err)},
+    )
 
-def _get_failure_dict(results: List[Union[T, WRAPPED_EXCEPTION]]) -> Dict[int, WRAPPED_EXCEPTION]:
-    return cast(Dict[int, WRAPPED_EXCEPTION], {i: err for i, err in enumerate(results) if _is_wrapped_exception(err)})
 
 class _DistWrapper:
     """
@@ -36,7 +44,13 @@ class _DistWrapper:
     All variants that take functions are exception robust, meaning that if one or more
     ranks raise errors, all ranks will observe those.
     """
-    def __init__(self, group: Optional[dist.ProcessGroup], use_dist: bool, coordinator_rank: int):
+
+    def __init__(
+        self,
+        group: Optional[dist.ProcessGroup],
+        use_dist: bool,
+        coordinator_rank: int,
+    ):
         self.group = group
         self.use_dist = use_dist
         self.coordinator_rank = coordinator_rank
@@ -64,7 +78,8 @@ def broadcast_object(self, object: Optional[T]) -> T:
             dist.broadcast_object_list(
                 object_list=object_list,
                 group=self.group,
-                src=self.coordinator_rank)
+                src=self.coordinator_rank,
+            )
         return cast(T, object_list[0])
 
     def gather_object(self, object: T) -> Optional[List[T]]:
@@ -72,13 +87,17 @@ def gather_object(self, object: T) -> Optional[List[T]]:
         Same as c10d::gather_object but works without distributed enabled.
         """
         if self.use_dist:
-            gather_objs = cast(List[T], [None] * dist.get_world_size(self.group)) if self.is_coordinator else None
+            gather_objs = (
+                cast(List[T], [None] * dist.get_world_size(self.group))
+                if self.is_coordinator
+                else None
+            )
 
             dist.gather_object(
                 obj=object,
                 object_gather_list=gather_objs if self.is_coordinator else None,
                 dst=self.coordinator_rank,
-                group=self.group
+                group=self.group,
             )
             result = gather_objs
         else:
@@ -90,12 +109,12 @@ def all_gather_object(self, object: T) -> List[T]:
         Same as c10d::all_gather_object but works without distributed enabled.
         """
         if self.use_dist:
-            gather_objs = cast(List[T], [None] * dist.get_world_size(self.group))
+            gather_objs = cast(
+                List[T], [None] * dist.get_world_size(self.group)
+            )
 
             dist.all_gather_object(
-                object_list=gather_objs,
-                obj=object,
-                group=self.group
+                object_list=gather_objs, obj=object, group=self.group
             )
         else:
             gather_objs = [object]
@@ -109,9 +128,11 @@ def scatter_object(self, object_list: Optional[List[T]]) -> T:
             gather_result = cast(List[T], [None])
             dist.scatter_object_list(
                 scatter_object_output_list=gather_result,
-                scatter_object_input_list=object_list if self.is_coordinator else None,
+                scatter_object_input_list=object_list
+                if self.is_coordinator
+                else None,
                 src=self.coordinator_rank,
-                group=self.group
+                group=self.group,
             )
 
             local_reply = gather_result[0]
@@ -124,7 +145,7 @@ def reduce_scatter(
         self,
         step: str,
         map_fun: Callable[[], T],
-        reduce_fun: Callable[[List[T]], List[R]]
+        reduce_fun: Callable[[List[T]], List[R]],
     ) -> R:
         """
         Compute a value on each rank, then do centralized reduce on a single rank, followed by a scatter.
@@ -150,12 +171,17 @@ def reduce_scatter(
             if len(node_failures) == 0:
                 try:
                     # N.B. why can't mypy cast List[R] to List[Union[R, WRAPPED_EXCEPTION]]?
-                    all_results = cast(List[Union[R, CheckpointException]], reduce_fun(cast(List[T], all_data)))
+                    all_results = cast(
+                        List[Union[R, CheckpointException]],
+                        reduce_fun(cast(List[T], all_data)),
+                    )
                 except BaseException as e:
                     node_failures[self.rank] = _wrap_exception(e)
 
             if len(node_failures) > 0:
-                all_results = [CheckpointException(step, node_failures)] * self.get_world_size()
+                all_results = [
+                    CheckpointException(step, node_failures)
+                ] * self.get_world_size()
 
         result = self.scatter_object(all_results)
         if isinstance(result, CheckpointException):
@@ -166,7 +192,7 @@ def all_reduce(
         self,
         step: str,
         map_fun: Callable[[], T],
-        reduce_fun: Callable[[List[T]], R]
+        reduce_fun: Callable[[List[T]], R],
     ) -> R:
         """
         Compute a value on each rank, then do centralized reduce on a single rank, followed by a broadcast.
@@ -244,43 +270,64 @@ def broadcast(
             try:
                 result = map_fun()
             except BaseException as e:
-                result = CheckpointException(step, {self.rank: _wrap_exception(e)})
+                result = CheckpointException(
+                    step, {self.rank: _wrap_exception(e)}
+                )
         final_result = self.broadcast_object(result)
         if isinstance(final_result, CheckpointException):
             raise final_result
         return cast(T, final_result)
 
+
 def _find_shard(tensor: ShardedTensor, index: MetadataIndex) -> Shard:
     if index.offset is None:
-        raise ValueError(f"Cannot lookup {index.fqn} since its a ShardedTensor and no offset was provided")
+        raise ValueError(
+            f"Cannot lookup {index.fqn} since its a ShardedTensor and no offset was provided"
+        )
 
     shards = tensor.local_shards()
     # index fast path
     if index.index is not None:
-        if len(shards) > index.index and torch.Size(shards[index.index].metadata.shard_offsets) == index.offset:
+        if (
+            len(shards) > index.index
+            and torch.Size(shards[index.index].metadata.shard_offsets)
+            == index.offset
+        ):
             return shards[index.index]
 
     for shard in shards:
         if torch.Size(shard.metadata.shard_offsets) == index.offset:
             return shard
-    raise ValueError(f"Could not find shard at '{index.offset}' for FQN: '{index.fqn}'")
+    raise ValueError(
+        f"Could not find shard at '{index.offset}' for FQN: '{index.fqn}'"
+    )
+
 
-def find_tensor_shard(tensor: torch.Tensor, index: MetadataIndex) -> torch.Tensor:
+def find_tensor_shard(
+    tensor: torch.Tensor, index: MetadataIndex
+) -> torch.Tensor:
     if isinstance(tensor, ShardedTensor):
         return _find_shard(tensor, index).tensor
     if index.offset is not None:
         # special case looking up a tensor by origin
         if index.offset == torch.Size([0] * len(tensor.size())):
             return tensor
-        raise ValueError(f"FQN: '{index.fqn}' is not a ShardedTensor, can't find by offset: '{index.offset}'")
+        raise ValueError(
+            f"FQN: '{index.fqn}' is not a ShardedTensor, can't find by offset: '{index.offset}'"
+        )
     return tensor
 
-def find_state_dict_object(state_dict: STATE_DICT_TYPE, index: MetadataIndex) -> Any:
+
+def find_state_dict_object(
+    state_dict: STATE_DICT_TYPE, index: MetadataIndex
+) -> Any:
     if index.fqn not in state_dict:
         raise ValueError(f"Could not find FQN: '{index.fqn}'")
     obj = state_dict[index.fqn]
     if isinstance(obj, torch.Tensor):
         return find_tensor_shard(obj, index)
     elif index.offset is not None:
-        raise ValueError(f"FQN: '{index.fqn}' is not a ShardedTensor, can't find by offset: '{index.offset}'")
+        raise ValueError(
+            f"FQN: '{index.fqn}' is not a ShardedTensor, can't find by offset: '{index.offset}'"
+        )
     return obj

From ade2cc61c89f502d97e373cf626a7f4766c90dd1 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Wed, 16 Nov 2022 10:07:14 -0800
Subject: [PATCH 0980/1922] [ao] making _is_activation_post_process private
 (#87520)

Summary: same function in observer and quantize, consolidated to a
single function. Note the definitions were slightly different, I've
changed the definition to be maximally inclusive so that the name of the
function is more accurate

Test Plan: python test/test_public_bindings.py
python test/test_quantization.py

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D40709276](https://our.internmc.facebook.com/intern/diff/D40709276)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87520
Approved by: https://github.com/jcaip
---
 test/allowlist_for_publicAPI.json                   | 4 ++--
 test/quantization/ao_migration/test_ao_migration.py | 2 +-
 test/quantization/ao_migration/test_quantization.py | 2 +-
 test/quantization/fx/test_quantize_fx.py            | 6 +++---
 torch/ao/ns/fx/graph_passes.py                      | 4 ++--
 torch/ao/ns/fx/utils.py                             | 8 ++++----
 torch/ao/quantization/__init__.py                   | 1 -
 torch/ao/quantization/fx/_model_report/detector.py  | 4 ++--
 torch/ao/quantization/fx/convert.py                 | 6 +++---
 torch/ao/quantization/fx/prepare.py                 | 4 ++--
 torch/ao/quantization/fx/qconfig_mapping_utils.py   | 6 +++---
 torch/ao/quantization/fx/utils.py                   | 6 +++---
 torch/ao/quantization/observer.py                   | 2 +-
 torch/ao/quantization/quantize.py                   | 9 ++-------
 torch/quantization/quantize.py                      | 2 +-
 15 files changed, 30 insertions(+), 36 deletions(-)

diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index 94ff57700af67..2e1394a72e172 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -786,7 +786,7 @@
     "get_quantized_operator",
     "get_static_quant_module_class",
     "get_unique_devices_",
-    "is_activation_post_process",
+    "_is_activation_post_process",
     "load_observer_state_dict",
     "no_observer_set",
     "prepare",
@@ -894,7 +894,7 @@
     "convert",
     "get_observer_dict",
     "get_unique_devices_",
-    "is_activation_post_process",
+    "_is_activation_post_process",
     "prepare",
     "prepare_qat",
     "propagate_qconfig_",
diff --git a/test/quantization/ao_migration/test_ao_migration.py b/test/quantization/ao_migration/test_ao_migration.py
index accb13da0dcbd..260ab32056f61 100644
--- a/test/quantization/ao_migration/test_ao_migration.py
+++ b/test/quantization/ao_migration/test_ao_migration.py
@@ -19,7 +19,7 @@ def test_function_import_quantize(self):
             'convert',
             'get_observer_dict',
             'get_unique_devices_',
-            'is_activation_post_process',
+            '_is_activation_post_process',
             'prepare',
             'prepare_qat',
             'propagate_qconfig_',
diff --git a/test/quantization/ao_migration/test_quantization.py b/test/quantization/ao_migration/test_quantization.py
index 9c246e1b7cd89..95c5c7bd60150 100644
--- a/test/quantization/ao_migration/test_quantization.py
+++ b/test/quantization/ao_migration/test_quantization.py
@@ -22,7 +22,7 @@ def test_function_import_quantize(self):
             'convert',
             'get_observer_dict',
             'get_unique_devices_',
-            'is_activation_post_process',
+            '_is_activation_post_process',
             'prepare',
             'prepare_qat',
             'propagate_qconfig_',
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 6c631a24abc60..6cee5e95f21cd 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -55,7 +55,6 @@
     get_default_qat_qconfig,
     get_default_qconfig_mapping,
     get_default_qat_qconfig_mapping,
-    is_activation_post_process,
     fuse_modules,
     fuse_modules_qat,
     prepare,
@@ -148,6 +147,7 @@
     default_fixed_qparams_range_0to1_observer,
     default_fixed_qparams_range_neg1to1_observer,
     MinMaxObserver,
+    _is_activation_post_process,
 )
 
 # test utils
@@ -3249,7 +3249,7 @@ def _check_node_not_observed(model, arg_node, node):
                     _check_node_not_observed(model, new_node, node)
             elif arg_node.op == "call_module":
                 self.assertTrue(
-                    not is_activation_post_process(getattr(model, arg_node.target)),
+                    not _is_activation_post_process(getattr(model, arg_node.target)),
                     "Arg: {0} of node: {1} is observed but is not a float tensor".format(
                         arg_node, node
                     ),
@@ -4933,7 +4933,7 @@ def forward(self, x):
                 qconfig_dict = func(backend)
                 m = prepare_fx(m, qconfig_dict, example_inputs=(torch.randn(1, 1, 1, 1)))
                 for name, mod in m.named_modules():
-                    if is_activation_post_process(mod) and mod.dtype == torch.quint8:
+                    if _is_activation_post_process(mod) and mod.dtype == torch.quint8:
                         if backend == "fbgemm":
                             lower_bnd = 0
                             upper_bnd = 127
diff --git a/torch/ao/ns/fx/graph_passes.py b/torch/ao/ns/fx/graph_passes.py
index c78b19d2701b1..3f4e156859024 100644
--- a/torch/ao/ns/fx/graph_passes.py
+++ b/torch/ao/ns/fx/graph_passes.py
@@ -24,7 +24,7 @@
 from torch.ao.ns.fx.mappings import (
     get_node_type_to_io_type_map,
 )
-from torch.ao.quantization.quantize import is_activation_post_process
+from torch.ao.quantization.observer import _is_activation_post_process
 
 from typing import Dict, Tuple, Callable, List, Any, Union, Optional, Set
 
@@ -38,7 +38,7 @@ def _maybe_get_fqn(node: Node, gm: GraphModule) -> Optional[str]:
         if node.op == 'call_module':
             assert isinstance(node.target, str)
             module = getattr_from_fqn(gm, node.target)
-            if is_activation_post_process(module):
+            if _is_activation_post_process(module):
                 node_to_use_for_fqn = get_normalized_nth_input(node, gm, 0)
         fqn = gm._node_name_to_scope[node_to_use_for_fqn.name][0]  # type: ignore[index]
     return fqn  # type: ignore[return-value]
diff --git a/torch/ao/ns/fx/utils.py b/torch/ao/ns/fx/utils.py
index 2993764b8a124..90574dc20248d 100644
--- a/torch/ao/ns/fx/utils.py
+++ b/torch/ao/ns/fx/utils.py
@@ -13,10 +13,10 @@
 from torch.fx.graph import Node
 from torch.ao.quantization import (
     ObserverBase,
-    FakeQuantizeBase,
+    FakeQuantizeBase
 )
+from torch.ao.quantization.observer import _is_activation_post_process
 from torch.ao.quantization.utils import getattr_from_fqn
-from torch.ao.quantization.quantize import is_activation_post_process
 
 from .ns_types import NSNodeTargetType, NSResultsType
 
@@ -256,14 +256,14 @@ def return_first_non_observer_node(
     """
     if node.op == "call_module":
         node_obj = getattr_from_fqn(gm, node.target)  # type: ignore[arg-type]
-        if is_activation_post_process(node_obj):
+        if _is_activation_post_process(node_obj):
             assert len(node.args) == 1
             assert isinstance(node.args[0], Node)
             node = node.args[0]
             # code duplication intended, not worth refactoring
             assert isinstance(node.target, str)
             node_obj = getattr_from_fqn(gm, node.target)
-            if is_activation_post_process(node_obj):
+            if _is_activation_post_process(node_obj):
                 assert len(node.args) == 1
                 assert isinstance(node.args[0], Node)
                 node = node.args[0]
diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py
index 1ba2a60ed3d12..bc8403f32af8f 100644
--- a/torch/ao/quantization/__init__.py
+++ b/torch/ao/quantization/__init__.py
@@ -114,7 +114,6 @@
     "get_quantized_operator",
     "get_static_quant_module_class",
     "get_unique_devices_",
-    "is_activation_post_process",
     "load_observer_state_dict",
     "no_observer_set",
     "per_channel_weight_observer_range_neg_127_to_127",
diff --git a/torch/ao/quantization/fx/_model_report/detector.py b/torch/ao/quantization/fx/_model_report/detector.py
index c92733bbc1c32..d398819ddcdd5 100644
--- a/torch/ao/quantization/fx/_model_report/detector.py
+++ b/torch/ao/quantization/fx/_model_report/detector.py
@@ -23,7 +23,7 @@
     default_equalization_qconfig,
     EqualizationQConfig,
 )
-from torch.ao.quantization.quantize import is_activation_post_process
+from torch.ao.quantization.observer import _is_activation_post_process
 
 # Names for observer insert keys
 DETECTOR_TARGET_NODE_KEY = "target_node"
@@ -1273,7 +1273,7 @@ def _supports_insertion(self, module: nn.Module) -> bool:
         # case for insertion of module
         # check if the module has any children and isn't observer
         num_children = len(list(module.children()))
-        return num_children == 0 and not is_activation_post_process(module)
+        return num_children == 0 and not _is_activation_post_process(module)
 
     def get_qconfig_info(self, model) -> Dict[str, DetectorQConfigInfo]:
         r""" Returns the DetectorQConfigInfo for each module_fqn relavent
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index b5e9cf3bbcb34..0c1249b4858d1 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -61,7 +61,6 @@
 )
 from torch.ao.quantization.quantize import (
     _remove_qconfig,
-    is_activation_post_process,
 )
 from torch.ao.quantization.stubs import DeQuantStub
 from .custom_config import (
@@ -71,6 +70,7 @@
 from .lower_to_fbgemm import lower_to_fbgemm
 # importing the lib so that the quantized_decomposed ops are registered
 from ._decomposed import quantized_decomposed_lib  # noqa: F401
+from torch.ao.quantization.observer import _is_activation_post_process
 
 
 # TODO: revisit this list. Many helper methods shouldn't be public
@@ -218,7 +218,7 @@ def maybe_get_observer_for_node(
     for maybe_obs_node, _ in node.users.items():
         if maybe_obs_node.op == 'call_module':
             maybe_obs = modules[str(maybe_obs_node.target)]
-            if is_activation_post_process(maybe_obs):
+            if _is_activation_post_process(maybe_obs):
                 return maybe_obs
     return None
 
@@ -725,7 +725,7 @@ def replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Gra
         elif node.op == "call_module":
             mod = _get_module(node, modules)
             assert mod is not None
-            if is_activation_post_process(mod):
+            if _is_activation_post_process(mod):
                 observed_node = node.args[0]
                 if observed_node in statically_quantized_custom_module_nodes:
                     replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index c908e3f3b7644..005a9cef45e34 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -16,6 +16,7 @@
 )
 from ..observer import (
     ObserverBase,
+    _is_activation_post_process
 )
 from ..qconfig import (
     _is_reuse_input_qconfig,
@@ -78,7 +79,6 @@
 )
 
 from torch.ao.quantization.quantize import (
-    is_activation_post_process,
     convert
 )
 
@@ -148,7 +148,7 @@
 
 def is_activation_post_process_node(node: Node, modules: Dict[str, torch.nn.Module]) -> bool:
     return isinstance(node, torch.fx.Node) and node.op == "call_module" and \
-        is_activation_post_process(modules[str(node.target)])
+        _is_activation_post_process(modules[str(node.target)])
 
 def is_input_arg_dtype_supported_by_backend(
     arg: Argument,
diff --git a/torch/ao/quantization/fx/qconfig_mapping_utils.py b/torch/ao/quantization/fx/qconfig_mapping_utils.py
index 0b0407c0b106e..26c7effd44dbf 100644
--- a/torch/ao/quantization/fx/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -3,8 +3,8 @@
 from typing import Callable, Any, Dict, Tuple, Set, List
 from torch.ao.quantization import QConfig
 from torch.ao.quantization.qconfig import _add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals
-from torch.ao.quantization.quantize import (
-    is_activation_post_process,
+from torch.ao.quantization.observer import (
+    _is_activation_post_process,
 )
 from torch.ao.quantization.backend_config import (
     DTypeConfig,
@@ -158,7 +158,7 @@ def generate_node_name_to_qconfig(
 
         elif node.op == 'call_module':
             # if the node is an observer, just continue - don't add it to the qconfig_map
-            if is_activation_post_process(modules[node.target]):
+            if _is_activation_post_process(modules[node.target]):
                 continue
             qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
                 qconfig_mapping, type(modules[node.target]), node.target, global_qconfig)
diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index 73fdb0700144d..b8bfa4c9d053c 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -30,7 +30,7 @@
     is_per_channel,
     to_underlying_dtype,
 )
-from torch.ao.quantization.quantize import is_activation_post_process
+from torch.ao.quantization.observer import _is_activation_post_process
 
 from torch.fx import GraphModule, map_arg
 
@@ -447,7 +447,7 @@ def all_node_args_have_no_tensors(node: Node, modules: Dict[str, torch.nn.Module
         result = False
     elif node.op == 'call_module':
         assert isinstance(node.target, str)
-        if is_activation_post_process(modules[node.target]):
+        if _is_activation_post_process(modules[node.target]):
             result = all_node_args_have_no_tensors(node.args[0], modules, cache)  # type: ignore[arg-type]
     elif node.op == 'call_module':
         result = False
@@ -1040,7 +1040,7 @@ def _activation_post_process_satisfies_dtype_config_constraints(
     satisfies_constraints = True
     if activation_post_process_ctr is not None:
         activation_post_process = activation_post_process_ctr()
-        assert is_activation_post_process(activation_post_process)
+        assert _is_activation_post_process(activation_post_process)
         # If dtypes don't match, don't check the activation_post_process and return True early
         if activation_post_process.dtype != dtype_with_constraints.dtype:
             return True
diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py
index e704444d0a6dc..26a39c8c2e02f 100644
--- a/torch/ao/quantization/observer.py
+++ b/torch/ao/quantization/observer.py
@@ -1437,7 +1437,7 @@ def _is_observer_script_module(mod, obs_type_name):
 def _is_activation_post_process(module):
     return (
         isinstance(module, torch.ao.quantization.ObserverBase)
-        or isinstance(module, torch.ao.quantization.FakeQuantize)
+        or isinstance(module, torch.ao.quantization.FakeQuantizeBase)
         or _is_observer_script_module(module, "quantization.observer")
     )
 
diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
index 9f5537ec85615..b9ef24e35fdbb 100644
--- a/torch/ao/quantization/quantize.py
+++ b/torch/ao/quantization/quantize.py
@@ -27,10 +27,10 @@
     float_qparams_weight_only_qconfig_4bit,
     _activation_is_memoryless)
 from torch.nn.utils.parametrize import type_before_parametrizations
+from torch.ao.quantization.observer import _is_activation_post_process
 
 __all__ = [
     "get_default_custom_config_dict",
-    "is_activation_post_process",
     "propagate_qconfig_",
     "register_activation_post_process_hook",
     "add_observer_",
@@ -62,11 +62,6 @@ def get_default_custom_config_dict():
     """
     return _DEFAULT_CUSTOM_CONFIG_DICT
 
-def is_activation_post_process(module):
-    return (isinstance(module, torch.ao.quantization.ObserverBase) or
-            isinstance(module, torch.ao.quantization.FakeQuantizeBase))
-
-
 def _propagate_qconfig_helper(module, qconfig_dict,
                               qconfig_parent=None, prefix='', prepare_custom_config_dict=None):
     r"""This is a helper function for `propagate_qconfig_`
@@ -322,7 +317,7 @@ def _remove_activation_post_process(module):
     # TODO: maybe we should change activation_post_process to _activation_post_process
     # to prevent it from being used by user
     if hasattr(module, 'activation_post_process') and \
-       is_activation_post_process(module.activation_post_process):
+       _is_activation_post_process(module.activation_post_process):
         delattr(module, 'activation_post_process')
 
     # remove activation_post_proceess pre and post hooks
diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py
index d9fcf1d04d8ba..24d7049ec50ec 100644
--- a/torch/quantization/quantize.py
+++ b/torch/quantization/quantize.py
@@ -17,7 +17,7 @@
 from torch.ao.quantization.quantize import convert
 from torch.ao.quantization.quantize import get_observer_dict
 from torch.ao.quantization.quantize import get_unique_devices_
-from torch.ao.quantization.quantize import is_activation_post_process
+from torch.ao.quantization.quantize import _is_activation_post_process
 from torch.ao.quantization.quantize import prepare
 from torch.ao.quantization.quantize import prepare_qat
 from torch.ao.quantization.quantize import propagate_qconfig_

From 727894ff5ec28ab5dd89376e227bd400fb557e65 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 16 Nov 2022 11:59:40 -0500
Subject: [PATCH 0981/1922] Reland "SymIntify convolution backend calculation
 (#89069)"" (#89142)

This reverts commit 90db86be108184a6c86c73e1b01012352c72e66b.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89142
Approved by: https://github.com/albanD, https://github.com/malfet
---
 aten/src/ATen/native/ConvUtils.h        |  79 ++++--
 aten/src/ATen/native/Convolution.cpp    | 319 +++++++++++++-----------
 aten/src/ATen/native/utils/ParamUtils.h |  21 +-
 c10/core/SymInt.h                       |  13 +
 torch/csrc/Module.cpp                   |  12 +-
 5 files changed, 270 insertions(+), 174 deletions(-)

diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
index b8e2b0842a002..880ce0c2af54a 100644
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@@ -110,8 +110,8 @@ enum class ConvBackend {
 // This overload is exposed to python for testing, etc.
 TORCH_API ConvBackend select_conv_backend(
     const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt,
-    IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation,
-    bool transposed, IntArrayRef output_padding, int64_t groups, const at::OptionalIntArrayRef bias_sizes_opt);
+    IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation,
+    bool transposed, SymIntArrayRef output_padding, int64_t groups, const at::OptionalSymIntArrayRef bias_sizes_opt);
 
 TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input,
     const Tensor& weight,
@@ -200,15 +200,16 @@ static void convolution_shape_check(
 // as conv_output_size loses information; this is why conv_input_size
 // takes an extra output_padding argument to resolve the ambiguity.
 
-static inline std::vector<int64_t> conv_output_size(
-    IntArrayRef input_size, IntArrayRef weight_size,
-    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
+template <typename T>
+static inline std::vector<T> _conv_output_size(
+    ArrayRef<T> input_size, ArrayRef<T> weight_size,
+    ArrayRef<T> padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
 ) {
   // ASSERT(input_size.size() > 2)
   // ASSERT(input_size.size() == weight_size.size())
   bool has_dilation = dilation.size() > 0;
   auto dim = input_size.size();
-  std::vector<int64_t> output_size(dim);
+  std::vector<T> output_size(dim);
   output_size[0] = input_size[input_batch_size_dim];
   output_size[1] = weight_size[weight_output_channels_dim];
   for (const auto d : c10::irange(2, dim)) {
@@ -219,40 +220,84 @@ static inline std::vector<int64_t> conv_output_size(
   return output_size;
 }
 
-static inline std::vector<int64_t> conv_input_size(
-    IntArrayRef output_size, IntArrayRef weight_size,
-    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+static inline std::vector<int64_t> conv_output_size(
+    IntArrayRef input_size, IntArrayRef weight_size,
+    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
+) {
+  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
+}
+
+static inline std::vector<c10::SymInt> conv_output_size(
+    SymIntArrayRef input_size, SymIntArrayRef weight_size,
+    SymIntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
+) {
+  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
+}
+
+template <typename T>
+std::vector<T> _conv_input_size(
+    ArrayRef<T> output_size, ArrayRef<T> weight_size,
+    ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
 ) {
   // ASSERT(output_size.size() > 2)
   // ASSERT(output_size.size() == weight_size.size())
   auto dim = output_size.size();
-  std::vector<int64_t> input_size(dim);
+  std::vector<T> input_size(dim);
   input_size[0] = output_size[output_batch_size_dim];
   input_size[1] = weight_size[weight_input_channels_dim] * groups;
   for (const auto d : c10::irange(2, dim)) {
-    int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
-    input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) +
+    auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1;
+    input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) +
                      kernel + output_padding[d - 2];
   }
   return input_size;
 }
 
-static inline std::vector<int64_t> conv_weight_size(
-    IntArrayRef input_size, IntArrayRef output_size,
+static inline std::vector<c10::SymInt> conv_input_size(
+    SymIntArrayRef output_size, SymIntArrayRef weight_size,
+    SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
+}
+
+static inline std::vector<int64_t> conv_input_size(
+    IntArrayRef output_size, IntArrayRef weight_size,
     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
+}
+
+template <typename T>
+std::vector<T> _conv_weight_size(
+    ArrayRef<T> input_size, ArrayRef<T> output_size,
+    ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
 ) {
   auto dim = input_size.size();
-  std::vector<int64_t> weight_size(dim);
+  std::vector<T> weight_size(dim);
   weight_size[0] = output_size[1];
   weight_size[1] = input_size[1] / groups;
   for (const auto d : c10::irange(2, dim)) {
-    int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
-               + 2 * padding[d - 2] - output_padding[d - 2];
+    auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
+               + padding[d - 2] * 2 - output_padding[d - 2];
     weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
   }
   return weight_size;
 }
 
+static inline std::vector<c10::SymInt> conv_weight_size(
+    SymIntArrayRef input_size, SymIntArrayRef output_size,
+    SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
+}
+
+static inline std::vector<int64_t> conv_weight_size(
+    IntArrayRef input_size, IntArrayRef output_size,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
+}
+
 static inline Tensor reshape_bias(int64_t dim, const Tensor& bias) {
   std::vector<int64_t> shape(dim, 1);
   shape[1] = -1;
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 29b2ce804c806..bf7017f20a4fd 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -83,10 +83,11 @@ constexpr int MIOPEN_DIM_MAX = 5;
 namespace at { namespace native {
 
 // Check workload to activate fast depthwise FP16 cudnn conv kernels
+template <typename T>
 bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) {
-  int w = input.size(3);  // same as h
-  int ch = input.size(1);
-  int bs = input.size(0);
+  auto w = at::symint::size<T>(input, 3);  // same as h
+  auto ch = at::symint::size<T>(input, 1);
+  auto bs = at::symint::size<T>(input, 0);
   if (stride==1) {
     if (w >= 7) {
       // All batch sizes and nb_channels
@@ -205,27 +206,28 @@ bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) {
 }
 
 // simplified version for cudnn 8.2 and above
+template <typename T>
 bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int stride, const at::Tensor& weight) {
   // 1D conv
-  if(input.size(2) == 1 && stride == 1){
+  if(at::symint::size<T>(input, 2) == 1 && stride == 1){
     return true;
   }
 
   // 2d conv
   // only square filters
-  if (weight.size(2) != weight.size(3)) return false;
-  int filter = weight.size(3);
+  if (at::symint::size<T>(weight, 2) != at::symint::size<T>(weight, 3)) return false;
+  auto filter = at::symint::size<T>(weight, 3);
   // only 1/3/5 filter
   if (filter != 1 && filter != 3 && filter != 5) return false;
   // we don't enforce square input but only check width to reduce heuristic space
-  if (input.size(3) < 7) return false; // min width 7
-  int w = input.size(3);
+  if (at::symint::size<T>(input, 3) < 7) return false; // min width 7
+  auto w = at::symint::size<T>(input, 3);
   // only 1/2 stride, use cudnn for all stride 1
   if (stride == 1) return true;
   if (stride != 2) return false;
 
-  int ch = input.size(1);
-  int bs = input.size(0);
+  auto ch = at::symint::size<T>(input, 1);
+  auto bs = at::symint::size<T>(input, 0);
   // special case since bs1 show good perf in lots of cases
   if (bs == 1) {
     if (filter == 1 && w <= 28) return true;
@@ -240,13 +242,42 @@ bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int str
 }
 
 
+bool xnnpack_use_convolution2d(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const IntArrayRef padding,
+    const IntArrayRef stride,
+    const IntArrayRef dilation,
+    const int64_t groups,
+    const bool transposed) {
+  return xnnpack::use_convolution2d(input, weight, bias_sizes_opt, padding, stride, dilation, groups, transposed);
+}
+
+bool xnnpack_use_convolution2d(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalSymIntArrayRef bias_sizes_opt,
+    const SymIntArrayRef padding,
+    const IntArrayRef stride,
+    const IntArrayRef dilation,
+    const int64_t groups,
+    const bool transposed) {
+  // Never use xnnpack for symbolic tracing
+  return false;
+}
+
 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+// This struct is templated so that we can run backend selection in a dynamic
+// shapes context; all of the real kernel selection in eager mode runs with
+// int64_t
+template <typename T>
 struct ConvParams {
   std::vector<int64_t> stride;
-  std::vector<int64_t> padding;
+  std::vector<T> padding;
   std::vector<int64_t> dilation;
   bool transposed;
-  std::vector<int64_t> output_padding;
+  std::vector<T> output_padding;
   int groups;
   bool benchmark;
   bool deterministic;
@@ -322,12 +353,12 @@ struct ConvParams {
 #if defined(__ARM_NEON__)
     // Currently only 3x3 depthwise convolutions on tensors of float are supported.
     return (input.ndimension() == 4) &&
-           (input.size(1) == groups) &&
+           (at::symint::size<T>(input, 1) == groups) &&
            (weight.ndimension() == 4 ) &&
-           (weight.size(0) % input.size(1) == 0) &&
-           (weight.size(1) == 1) &&
-           (weight.size(2) == 3) &&
-           (weight.size(3) == 3) &&
+           (at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0) &&
+           (at::symint::size<T>(weight, 1) == 1) &&
+           (at::symint::size<T>(weight, 2) == 3) &&
+           (at::symint::size<T>(weight, 3) == 3) &&
            (input.device().is_cpu()) &&
            (input.scalar_type() == at::kFloat) &&
            input.is_contiguous() &&
@@ -345,23 +376,23 @@ struct ConvParams {
 
   bool needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const {
     constexpr int64_t int_max = std::numeric_limits<int>::max();
-    int64_t numel_input = input.numel();
+    auto numel_input = at::symint::numel<T>(input);
     // empty input
     if (numel_input == 0) {
       return false;
     }
     // input size can not be reduced to the range of int by splitting the batch dim
-    int64_t n = input.size(0);
+    auto n = at::symint::size<T>(input, 0);
     if (numel_input / n > int_max) {
       return true;
     }
     // output size can not be reduced to the range of int by splitting the batch dim
-    int64_t outsize = 1;
+    T outsize = 1;
     if (transposed) {
-      std::vector<int64_t> o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups);
+      auto o = conv_input_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, output_padding, stride, dilation, groups);
       outsize = c10::multiply_integers(o.begin() + 1, o.end());
     } else {
-      std::vector<int64_t> o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation);
+      auto o = conv_output_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, stride, dilation);
       outsize = c10::multiply_integers(o.begin() + 1, o.end());
     }
     return outsize > int_max;
@@ -417,10 +448,10 @@ struct ConvParams {
                              is_depthwise(input, weight) &&
                              input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
                              !is_dilated() && // no dilation supported
-                             (stride[0] == stride[1] || input.size(2) == 1) && // square or 1d
-                             input.size(1) >= 32); // min 32 channels supported)
+                             (stride[0] == stride[1] || at::symint::size<T>(input, 2) == 1) && // square or 1d
+                             at::symint::size<T>(input, 1) >= 32); // min 32 channels supported)
         if (kernel_cond) {
-          return check_cudnn_depthwise_workload_with_filter(input, stride[1], weight);
+          return check_cudnn_depthwise_workload_with_filter<T>(input, stride[1], weight);
         }
       }
       // keep (7600 <= cudnn < 8200) code unchanged
@@ -430,14 +461,14 @@ struct ConvParams {
                            weight.scalar_type() == kHalf &&
                            is_depthwise(input, weight) &&
                            input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
-                           weight.size(2) == weight.size(3) && // only square kernels
-                           input.size(2) >= 7 && // min width/height 7
+                           at::symint::size<T>(weight, 2) == at::symint::size<T>(weight, 3) && // only square kernels
+                           at::symint::size<T>(input, 2) >= 7 && // min width/height 7
                            !is_dilated() && // no dilation supported
                            stride[0] == stride[1] && // equal strides
-                           ((weight.size(3) == 3) || (weight.size(3) == 1)) &&
-                           input.size(1) >= 32); // min 32 channels supported)
+                           ((at::symint::size<T>(weight, 3) == 3) || (at::symint::size<T>(weight, 3) == 1)) &&
+                           at::symint::size<T>(input, 1) >= 32); // min 32 channels supported)
       if (kernel_cond) {
-        return check_cudnn_depthwise_workload(input, stride[0]);
+        return check_cudnn_depthwise_workload<T>(input, stride[0]);
       } else {
         return false;
       }
@@ -473,12 +504,12 @@ struct ConvParams {
        !transposed && // or transposed tensors
        // For 1x1 filters, MKLDNN is faster than THNN when multi-threaded,
        // but THNN is faster when single-threaded.
-       (is_strided() || is_dilated() || input.size(0) >= 16 ||
-        weight.size(-1) != 1 || weight.size(-2) != 1 || at::get_num_threads() > 1) &&
+       (is_strided() || is_dilated() || at::symint::size<T>(input, 0) >= 16 ||
+        at::symint::size<T>(weight, -1) != 1 || at::symint::size<T>(weight, -2) != 1 || at::get_num_threads() > 1) &&
        (groups > 1
-        || (weight.size(-1) > 3 && weight.size(-2) > 3)
-        || input.size(0) > 1
-        || input.size(0)*input.size(1)*input.size(2)*input.size(3) > 20480) // for some case, native is faster
+        || (at::symint::size<T>(weight, -1) > 3 && at::symint::size<T>(weight, -2) > 3)
+        || at::symint::size<T>(input, 0) > 1
+        || at::symint::size<T>(input, 0)*at::symint::size<T>(input, 1)*at::symint::size<T>(input, 2)*at::symint::size<T>(input, 3) > 20480) // for some case, native is faster
         );
 
 #endif
@@ -493,20 +524,23 @@ struct ConvParams {
            !transposed &&   // or transposed tensors
            input.ndimension() == 4 && // must be in NCHW format
            weight.ndimension() == 4 &&
-           (weight.size(2) < 17) && (weight.size(3) < 17) // NNPACK only supports kernels up to 16x16
+           (at::symint::size<T>(weight, 2) < 17) && (at::symint::size<T>(weight, 3) < 17) // NNPACK only supports kernels up to 16x16
 #if !defined(C10_MOBILE)
-           && input.size(0) >= 16 // ensure large enough batch size to ensure perf, tuneable
+           && at::symint::size<T>(input, 0) >= 16 // ensure large enough batch size to ensure perf, tuneable
 #endif
        ;
 #endif
     return false;
   }
   bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight,
-                   const at::OptionalIntArrayRef bias_sizes_opt) const {
+                   const at::OptionalArrayRef<T> bias_sizes_opt) const {
 #if defined(C10_MOBILE)
     if (!transposed) {
-      return (input.size(1) == groups) &&
-              xnnpack::use_convolution2d(
+      // NB: for the call here, it MATTERS that we are templated. If you
+      // untemplate this to always use SymInt, the function
+      // xnnpack_use_convolution2d will always return false
+      return (at::symint::size<T>(input, 1) == groups) &&
+              xnnpack_use_convolution2d(
                   input,
                   weight,
                   bias_sizes_opt,
@@ -543,33 +577,12 @@ struct ConvParams {
     return input.is_cuda() &&
            !transposed &&
            (input.ndimension() == 4 || input.ndimension() == 5) &&
-           input.size(1) == groups &&
+           at::symint::size<T>(input, 1) == groups &&
            groups > 1 && // no point if there is only a single group
-           weight.size(0) % input.size(1) == 0; // output channels must be a multiple of input channels
+           at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0; // output channels must be a multiple of input channels
   }
 };
 
-// Function to select the convolution backend based on the inputs and params.
-// This overload is used within the convolution internals but not exposed to python.
-// NB: The forward pass provides a bias tensor while the backward pass provides
-// a bool indicating whether the bias is defined. This is done to save memory by
-// avoiding saving the full bias tensor for backward.
-ConvBackend _select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const c10::optional<Tensor>& bias_opt,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params);
-
-// For BC reasons, have a copy that does not require bias_opt
-ConvBackend select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params);
-
 DEFINE_DISPATCH(conv_depthwise2d_backward_stub);
 DEFINE_DISPATCH(conv_depthwise3d_backward_stub);
 DEFINE_DISPATCH(cudnn_convolution_backward_stub);
@@ -591,13 +604,14 @@ REGISTER_NO_CPU_DISPATCH(miopen_convolution_backward_stub);
 REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub);
 REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub);
 
-std::ostream& operator<<(std::ostream & out, const ConvParams& params) {
+template <typename T>
+std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params) {
   out << "ConvParams {"
       << "  stride = " << IntArrayRef{params.stride}
-      << "  padding = " << IntArrayRef{params.padding}
+      << "  padding = " << ArrayRef<T>{params.padding}
       << "  dilation = " << IntArrayRef{params.dilation}
       << "  transposed = " << params.transposed
-      << "  output_padding = " << IntArrayRef{params.output_padding}
+      << "  output_padding = " << ArrayRef<T>{params.output_padding}
       << "  groups = " << params.groups
       << "  benchmark = " << params.benchmark
       << "  deterministic = " << params.deterministic
@@ -607,9 +621,10 @@ std::ostream& operator<<(std::ostream & out, const ConvParams& params) {
   return out;
 }
 
+template <typename T>
 static void check_shape_forward(const at::Tensor& input,
-                                const c10::IntArrayRef& weight_sizes, const at::Tensor& bias,
-                                const ConvParams& params) {
+                                const c10::ArrayRef<T>& weight_sizes, const at::Tensor& bias,
+                                const ConvParams<T>& params) {
   int64_t k = input.ndimension();
   int64_t weight_dim = weight_sizes.size();
   int64_t groups = params.groups;
@@ -624,7 +639,7 @@ static void check_shape_forward(const at::Tensor& input,
   TORCH_CHECK(weight_dim == k,
            "Expected ", weight_dim, "-dimensional input for ", weight_dim,
            "-dimensional weight ", weight_sizes, ", but got ", k, "-dimensional input of size ",
-           input.sizes(), " instead");
+           at::symint::sizes<T>(input), " instead");
   TORCH_CHECK(weight_sizes[0] >= groups,
            "Given groups=", groups, ", expected weight to be at least ", groups,
            " at dimension 0, but got weight of size ", weight_sizes, " instead");
@@ -634,23 +649,23 @@ static void check_shape_forward(const at::Tensor& input,
            "] instead");
 
   if (!transposed) {
-    std::vector<int64_t> input_shape;
-    std::vector<int64_t> kernel_shape;
+    std::vector<T> input_shape;
+    std::vector<T> kernel_shape;
     bool kernel_size_correct = true;
 
-    TORCH_CHECK(input.size(1) == (weight_sizes[1] * groups),
+    TORCH_CHECK(at::symint::size<T>(input, 1) == (weight_sizes[1] * groups),
                 "Given groups=", groups, ", weight of size ", weight_sizes,
                 ", expected input", input.sizes(), " to have ",
-                (weight_sizes[1] * groups), " channels, but got ", input.size(1),
+                (weight_sizes[1] * groups), " channels, but got ", at::symint::size<T>(input, 1),
                 " channels instead");
 
-    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]),
+    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && at::symint::size<T>(bias, 0) == weight_sizes[0]),
              "Given weight of size ", weight_sizes,
              ", expected bias to be 1-dimensional with ", weight_sizes[0], " elements",
-             ", but got bias of size ", bias.sizes(), " instead");
+             ", but got bias of size ", at::symint::sizes<T>(bias), " instead");
 
     for (const auto i : c10::irange(2, k)) {
-      input_shape.push_back(input.size(i) + 2 * padding[i-2]);
+      input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
       // log new kernel size considering dilation
       kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
       if (input_shape.back() < kernel_shape.back()) {
@@ -676,22 +691,23 @@ static void check_shape_forward(const at::Tensor& input,
                "Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
     }
   } else { // transposed
-    TORCH_CHECK(input.size(1) == weight_sizes[0],
+    TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
              "Given transposed=", transposed, ", weight of size ", weight_sizes,
              ", expected input", input.sizes(), " to have ", weight_sizes[0],
-             " channels, but got ", input.size(1), " channels instead");
-    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && bias.size(0) == weight_sizes[1] * groups),
+             " channels, but got ", at::symint::size<T>(input, 1), " channels instead");
+    TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && at::symint::size<T>(bias, 0) == weight_sizes[1] * groups),
              "Given transposed=", transposed, ", weight of size ", weight_sizes,
              ", expected bias to be 1-dimensional with ", weight_sizes[1] * groups, " elements",
              ", but got bias of size ", bias.sizes(), " instead");
   }
 }
 
+template <typename T>
 static void check_shape_backward(
     const at::Tensor& input,
-    const c10::IntArrayRef& weight_sizes,
-    const ConvParams& params) {
-  check_shape_forward(input, weight_sizes, /*bias=*/ Tensor(), params);
+    const c10::ArrayRef<T>& weight_sizes,
+    const ConvParams<T>& params) {
+  check_shape_forward<T>(input, weight_sizes, /*bias=*/ Tensor(), params);
 }
 
 // Given an input tensor and an expected number of spatial dimensions, checks that the
@@ -1149,71 +1165,25 @@ at::Tensor convolution_overrideable(
   TORCH_CHECK_NOT_IMPLEMENTED(false, "convolution_overrideable not implemented. You are likely triggering this with tensor backend other than CPU/CUDA/MKLDNN, if this is intended, please use TORCH_LIBRARY_IMPL to override this function ");
 }
 
-// Selects a backend for convolution based on the inputs and params.
-ConvBackend select_conv_backend(
-    const Tensor& input_r, const Tensor& weight_r, const c10::optional<Tensor>& bias_opt,
-    IntArrayRef stride_, IntArrayRef padding_, IntArrayRef dilation_,
-    bool transposed_, IntArrayRef output_padding_, int64_t groups_, const at::OptionalIntArrayRef bias_sizes_opt) {
-  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
-  const Tensor& bias = *bias_maybe_owned;
-
-  auto& ctx = at::globalContext();
-  auto k = weight_r.ndimension();
-  int64_t dim = k - 2;
-  ConvParams params;
-  params.stride = expand_param_if_needed(stride_, "stride", dim);
-  params.padding = expand_param_if_needed(padding_, "padding", dim);
-  params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
-  params.transposed = transposed_;
-  params.output_padding = expand_param_if_needed(output_padding_, "output_padding", dim);
-  params.groups = groups_;
-  params.benchmark = ctx.benchmarkCuDNN();
-  params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
-  params.cudnn_enabled = ctx.userEnabledCuDNN();
-  params.allow_tf32 = ctx.allowTF32CuDNN();
-
-  auto input = input_r;
-  auto weight = weight_r;
-  check_shape_forward(input, weight.sizes(), bias, params);
-
-  // Expand 1d -> 2d.
-  // This is only done for backends that don't natively support 1d spatial input.
-  if (k == 3 && !input.is_mkldnn() && !input.is_xpu()) {
-    // avoid accidentally going through NHWC for permuted 3d input.
-    input = input.contiguous();
-    params.view1d_as_2d();
-    input = view4d(input);
-    weight = view4d(weight);
-  }
-
-  auto bias_sizes = bias.defined() ? c10::optional<IntArrayRef>(bias.sizes()) : bias_sizes_opt;
-  bool need_backward = GradMode::is_enabled() &&
-      (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
-  return _select_conv_backend(input, weight, bias, bias_sizes, need_backward, params);
-}
-
-ConvBackend select_conv_backend(
-    const Tensor& input,
-    const Tensor& weight,
-    const at::OptionalIntArrayRef bias_sizes_opt,
-    const bool need_backward,
-    const ConvParams& params) {
-  return _select_conv_backend(input, weight, {}, bias_sizes_opt, need_backward, params);
-}
-
+// Function to select the convolution backend based on the inputs and params.
+// This overload is used within the convolution internals but not exposed to python.
+// NB: The forward pass provides a bias tensor while the backward pass provides
+// a bool indicating whether the bias is defined. This is done to save memory by
+// avoiding saving the full bias tensor for backward.
+template <typename T>
 ConvBackend _select_conv_backend(
     const Tensor& input,
     const Tensor& weight,
     const c10::optional<Tensor>& bias,
-    const at::OptionalIntArrayRef bias_sizes_opt,
+    const at::OptionalArrayRef<T> bias_sizes_opt,
     const bool need_backward,
-    const ConvParams& params) {
+    const ConvParams<T>& params) {
 
   // don't send empty inputs through backends
-  if (input.size(0) == 0 || input.size(1) == 0) {
+  if (at::symint::size<T>(input, 0) == 0 || at::symint::size<T>(input, 1) == 0) {
     return input.is_mkldnn() ? ConvBackend::MkldnnEmpty : ConvBackend::Empty;
-  } else if (input.numel() == 0) {
-    TORCH_CHECK(false, "Only zero batch or zero channel inputs are supported, but got input shape: ", input.sizes());
+  } else if (at::symint::numel<T>(input) == 0) {
+    TORCH_CHECK(false, "Only zero batch or zero channel inputs are supported, but got input shape: ", at::symint::sizes<T>(input));
   }
 
   if (params.is_depthwise(input, weight)) {
@@ -1305,12 +1275,65 @@ ConvBackend _select_conv_backend(
   AT_ERROR("unsupported ConvNd parameters");
 }
 
+// Selects a backend for convolution based on the inputs and params.
+ConvBackend select_conv_backend(
+    const Tensor& input_r, const Tensor& weight_r, const c10::optional<Tensor>& bias_opt,
+    IntArrayRef stride_, SymIntArrayRef padding_, IntArrayRef dilation_,
+    bool transposed_, SymIntArrayRef output_padding_, int64_t groups_, const at::OptionalSymIntArrayRef bias_sizes_opt) {
+  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+
+  auto& ctx = at::globalContext();
+  auto k = weight_r.ndimension();
+  int64_t dim = k - 2;
+  ConvParams<c10::SymInt> params;
+  params.stride = expand_param_if_needed(stride_, "stride", dim);
+  params.padding = expand_param_if_needed(padding_, "padding", dim);
+  params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
+  params.transposed = transposed_;
+  params.output_padding = expand_param_if_needed(output_padding_, "output_padding", dim);
+  params.groups = groups_;
+  params.benchmark = ctx.benchmarkCuDNN();
+  params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms();
+  params.cudnn_enabled = ctx.userEnabledCuDNN();
+  params.allow_tf32 = ctx.allowTF32CuDNN();
+
+  auto input = input_r;
+  auto weight = weight_r;
+  check_shape_forward(input, weight.sym_sizes(), bias, params);
+
+  // Expand 1d -> 2d.
+  // This is only done for backends that don't natively support 1d spatial input.
+  if (k == 3 && !input.is_mkldnn() && !input.is_xpu()) {
+    // avoid accidentally going through NHWC for permuted 3d input.
+    input = input.contiguous();
+    params.view1d_as_2d();
+    input = view4d(input);
+    weight = view4d(weight);
+  }
+
+  auto bias_sizes = bias.defined() ? c10::optional<SymIntArrayRef>(bias.sym_sizes()) : bias_sizes_opt;
+  bool need_backward = GradMode::is_enabled() &&
+      (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
+  return _select_conv_backend(input, weight, bias, bias_sizes, need_backward, params);
+}
+
+// For BC reasons, have a copy that does not require bias_opt
+ConvBackend select_conv_backend(
+    const Tensor& input,
+    const Tensor& weight,
+    const at::OptionalIntArrayRef bias_sizes_opt,
+    const bool need_backward,
+    const ConvParams<int64_t>& params) {
+  return _select_conv_backend(input, weight, {}, bias_sizes_opt, need_backward, params);
+}
+
 at::Tensor _convolution_nogroup_backend(
     const Tensor& input,
     const Tensor& weight,
     const Tensor& bias,
     const ConvBackend backend,
-    const ConvParams& params) {
+    const ConvParams<int64_t>& params) {
   auto kernel_size = weight.sizes().slice(2);
   switch(backend) {
     case ConvBackend::NnpackSpatial:
@@ -1341,7 +1364,7 @@ at::Tensor _convolution_nogroup_backend(
 static inline std::vector<int64_t> calc_output_size(
     const Tensor& input,
     const Tensor& weight,
-    const ConvParams& params) {
+    const ConvParams<int64_t>& params) {
   std::vector<int64_t> output_size = params.transposed ?
     conv_input_size(input.sizes(), weight.sizes(), params.padding, params.output_padding,
         params.stride, params.dilation, params.groups) :
@@ -1422,7 +1445,7 @@ at::Tensor _convolution(
   TORCH_CHECK(dim > 0, "weight should have at least three dimensions");
   TORCH_CHECK(groups_ > 0, "non-positive groups is not supported");
 
-  ConvParams params;
+  ConvParams<int64_t> params;
   params.stride = expand_param_if_needed(stride_, "stride", dim);
   params.padding = expand_param_if_needed(padding_, "padding", dim);
   params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
@@ -1450,7 +1473,7 @@ at::Tensor _convolution(
   auto bias_sizes_opt = bias.defined() ? c10::optional<IntArrayRef>(bias.sizes()) : c10::nullopt;
   bool need_backward = GradMode::is_enabled() &&
       (input.requires_grad() || weight.requires_grad() || (bias.defined() && bias.requires_grad()));
-  ConvBackend backend = _select_conv_backend(input, weight, bias, bias_sizes_opt, need_backward, params);
+  ConvBackend backend = _select_conv_backend(input, weight, bias, c10::OptionalIntArrayRef(bias_sizes_opt), need_backward, params);
   at::MemoryFormat backend_memory_format = determine_backend_memory_format(input, weight, backend);
 
   // Call the backend.
@@ -1663,7 +1686,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
   auto weight = weight_r;
 
   int64_t dim = weight.ndimension() - 2;
-  ConvParams params;
+  ConvParams<int64_t> params;
   params.stride = expand_param_if_needed(stride_, "stride", dim);
   params.padding = expand_param_if_needed(padding_, "padding", dim);
   params.dilation = expand_param_if_needed(dilation_, "dilation", dim);
@@ -1726,7 +1749,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
   if (ggI.defined()) {
 
     // Modified params with correct padding
-    ConvParams gw_conv_params(params);
+    ConvParams<int64_t> gw_conv_params(params);
 
     // Disable groups as they are handled separately
     auto groups = gw_conv_params.groups;
@@ -1795,7 +1818,7 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const c10::option
   Tensor gI;
   if (input.numel() != 0) {
     if (ggW.defined()) {
-      ConvParams gi_conv_params(params);
+      ConvParams<int64_t> gi_conv_params(params);
       gi_conv_params.transposed = !params.transposed;
 
       if (params.transposed) {
@@ -1851,7 +1874,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _convolution_backward_nogroup_bac
     const Tensor& weight,
     const std::array<bool, 3> output_mask,
     const ConvBackend backend,
-    const ConvParams& params) {
+    const ConvParams<int64_t>& params) {
   auto kernel_size = weight.sizes().slice(2);
   switch(backend) {
     case ConvBackend::Slow2d:
@@ -1916,7 +1939,7 @@ std::tuple<Tensor, Tensor, Tensor> convolution_backward(
   TORCH_CHECK(dim > 0, "weight should have at least three dimensions");
 
   auto& ctx = at::globalContext();
-  ConvParams params;
+  ConvParams<int64_t> params;
   params.stride = expand_param_if_needed(stride, "stride", dim);
   params.padding = expand_param_if_needed(padding, "padding", dim);
   params.dilation = expand_param_if_needed(dilation, "dilation", dim);
diff --git a/aten/src/ATen/native/utils/ParamUtils.h b/aten/src/ATen/native/utils/ParamUtils.h
index 376467ff79cf5..adb5f1cfa49f9 100644
--- a/aten/src/ATen/native/utils/ParamUtils.h
+++ b/aten/src/ATen/native/utils/ParamUtils.h
@@ -6,12 +6,13 @@
 namespace at {
 namespace native {
 
-inline std::vector<int64_t> expand_param_if_needed(
-    IntArrayRef list_param,
+template <typename T>
+inline std::vector<T> _expand_param_if_needed(
+    ArrayRef<T> list_param,
     const char* param_name,
     int64_t expected_dim) {
   if (list_param.size() == 1) {
-    return std::vector<int64_t>(expected_dim, list_param[0]);
+    return std::vector<T>(expected_dim, list_param[0]);
   } else if ((int64_t)list_param.size() != expected_dim) {
     std::ostringstream ss;
     ss << "expected " << param_name << " to be a single integer value or a "
@@ -23,5 +24,19 @@ inline std::vector<int64_t> expand_param_if_needed(
   }
 }
 
+inline std::vector<int64_t> expand_param_if_needed(
+    IntArrayRef list_param,
+    const char* param_name,
+    int64_t expected_dim) {
+  return _expand_param_if_needed(list_param, param_name, expected_dim);
+}
+
+inline std::vector<c10::SymInt> expand_param_if_needed(
+    SymIntArrayRef list_param,
+    const char* param_name,
+    int64_t expected_dim) {
+  return _expand_param_if_needed(list_param, param_name, expected_dim);
+}
+
 } // namespace native
 } // namespace at
diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h
index 9ab72a0776804..6355f13395053 100644
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@@ -235,6 +235,19 @@ inline c10::SymInt multiply_integers(const C& container) {
       [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
 }
 
+template <
+    typename Iter,
+    typename = std::enable_if_t<std::is_same<
+        typename std::iterator_traits<Iter>::value_type,
+        c10::SymInt>::value>>
+inline c10::SymInt multiply_integers(Iter begin, Iter end) {
+  return std::accumulate(
+      begin,
+      end,
+      c10::SymInt(1),
+      [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; });
+}
+
 inline SymInt operator+(int64_t a, const SymInt& b) {
   return c10::SymInt(a) + b;
 }
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index b8693a484ed9d..6073736257249 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -1408,10 +1408,10 @@ Call this whenever a new thread is created in order to propagate values from
          const at::Tensor& weight,
          const c10::optional<at::Tensor>& bias_opt,
          at::IntArrayRef stride_,
-         at::IntArrayRef padding_,
+         at::SymIntArrayRef padding_,
          at::IntArrayRef dilation_,
          bool transposed_,
-         at::IntArrayRef output_padding_,
+         at::SymIntArrayRef output_padding_,
          int64_t groups_) {
         return at::native::select_conv_backend(
             input,
@@ -1442,13 +1442,13 @@ Call this whenever a new thread is created in order to propagate values from
          const at::Tensor& weight,
          const c10::optional<at::Tensor>& bias,
          at::IntArrayRef stride_,
-         at::IntArrayRef padding_,
+         at::SymIntArrayRef padding_,
          at::IntArrayRef dilation_,
          bool transposed_,
-         at::IntArrayRef output_padding_,
+         at::SymIntArrayRef output_padding_,
          int64_t groups_,
-         c10::optional<std::vector<int64_t>> bias_sizes_opt) {
-        c10::OptionalArrayRef<int64_t> ref = c10::nullopt;
+         c10::optional<std::vector<c10::SymInt>> bias_sizes_opt) {
+        c10::OptionalArrayRef<c10::SymInt> ref = c10::nullopt;
         if (bias_sizes_opt) {
           ref = (*bias_sizes_opt);
         }

From df27f73bbd6cf71f5fd8f53d3bee36c7878ef2de Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Wed, 16 Nov 2022 21:54:20 +0000
Subject: [PATCH 0982/1922] Fix XLASymNode.str() no str() attribute error
 (#89093)

This fixes https://github.com/pytorch/xla/issues/4199
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89093
Approved by: https://github.com/ezyang
---
 torch/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/__init__.py b/torch/__init__.py
index 6def80d1dc599..02765c4aeee81 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -235,7 +235,7 @@ def __sym_float__(self):
         raise AssertionError("type stub not overridden")
 
     def __repr__(self):
-        return self.node.str()
+        return str(self.node)
 
     # For BC; direct access of node is OK too
     def get_pyobj(self):

From 2369c2f7ddb324d1bc9cc51c1c112c0b3e87a1a1 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Wed, 16 Nov 2022 21:54:24 +0000
Subject: [PATCH 0983/1922] More dynamo dashboard improvements (#89155)

A number of dashboard improvements:
- Add accuracy failures to warnings section
- Add regression detection to all metrics (speedup, compile time, peak memory), not just accuracy
- Add testing flag to update-dashboard to prevent image/comment uploads
- Add section for comparing summary statistics (passrate, speedup) between 2 most recent reports
- Show names of reports for summary stats diff and regression detection sections
- Remove metric graphs from the comment (they can still be found in the generated text file)

Sample comment: https://github.com/pytorch/torchdynamo/issues/1831#issuecomment-1317565972

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89155
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 352 ++++++++++++++++++++++++------------
 1 file changed, 233 insertions(+), 119 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 319ff677db4fb..8012e82607cff 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -36,6 +36,7 @@
 import re
 import shutil
 import subprocess
+import sys
 import tempfile
 from collections import defaultdict
 from datetime import datetime
@@ -133,10 +134,15 @@ def flag_compression_ratio(x):
     return x < 0.9
 
 
+def flag_accuracy(x):
+    return "pass" not in x
+
+
 FLAG_FNS = {
     "speedup": flag_speedup,
     "compilation_latency": flag_compilation_latency,
     "compression_ratio": flag_compression_ratio,
+    "accuracy": flag_accuracy,
 }
 
 
@@ -216,6 +222,12 @@ def parse_args():
         default=False,
         help="Updates to dashboard",
     )
+    parser.add_argument(
+        "--update-dashboard-test",
+        action="store_true",
+        default=False,
+        help="Do not udpate lookup file or upload images/comments when --update-dashboard is specified",
+    )
     parser.add_argument(
         "--dashboard-image-uploader",
         default=DASHBOARD_DEFAULTS["dashboard_image_uploader"],
@@ -412,6 +424,20 @@ def archive(src_dir, dest_dir_prefix, archive_name, dtype):
     print(f"copied contents of {src_dir} to {dest}")
 
 
+def get_metric_title(metric):
+    if metric == "speedup":
+        return "Performance speedup"
+    elif metric == "accuracy":
+        return "Accuracy"
+    elif metric == "compilation_latency":
+        return "Compilation latency (sec)"
+    elif metric == "compression_ratio":
+        return "Peak Memory Compression Ratio"
+    elif metric == "abs_latency":
+        return "Absolute latency (ms)"
+    raise RuntimeError("unknown metric")
+
+
 class Parser:
     def __init__(
         self, suites, devices, dtypes, compilers, flag_compilers, mode, output_dir
@@ -693,28 +719,18 @@ def flag_bad_entries(self, suite, metric, flag_fn):
         df = df.assign(suite=suite)
         return df.reindex(columns=["suite", "name"] + self.flag_compilers)
 
-    def get_metric_title(self, metric):
-        if metric == "speedup":
-            return "Performance speedup"
-        elif metric == "accuracy":
-            return "Accuracy"
-        elif metric == "compilation_latency":
-            return "Compilation latency (sec)"
-        elif metric == "compression_ratio":
-            return "Peak Memory Compression Ratio"
-        elif metric == "abs_latency":
-            return "Absolute latency (ms)"
-        raise RuntimeError("unknown metric")
-
     def generate_warnings(self):
         title = "## Warnings ##"
         body = (
             "We flag models where:\n\n"
-            " - speedup < 0.95x\n"
+            " - accuracy fails\n"
+            " - speedup < 0.95x (NOTE: 0.0 speedup typically signifies a failure in the performance test)\n"
             " - compilation latency > 120 sec.\n"
-            " - compression ratio < 0.9\n\n"
+            " - compression ratio < 0.9\n"
+            "\n"
         )
         for metric in [
+            "accuracy",
             "speedup",
             "compilation_latency",
             "compression_ratio",
@@ -728,7 +744,7 @@ def generate_warnings(self):
             tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never")
             str_io = io.StringIO()
             str_io.write("\n")
-            str_io.write(self.get_metric_title(metric) + " warnings\n")
+            str_io.write(get_metric_title(metric) + " warnings\n")
             str_io.write("~~~\n")
             str_io.write(f"{tabform}\n")
             str_io.write("~~~\n")
@@ -753,7 +769,7 @@ def prepare_message(self, suite):
             tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never")
             str_io = io.StringIO()
             str_io.write("\n")
-            str_io.write(self.get_metric_title(metric) + "\n")
+            str_io.write(get_metric_title(metric) + "\n")
             str_io.write("~~~\n")
             str_io.write(f"{tabform}\n")
             str_io.write("~~~\n")
@@ -779,18 +795,15 @@ def gen_summary_files(self):
 
         with open(f"{self.output_dir}/gh_executive_summary.txt", "w") as gh_fh:
             gh_fh.write(self.executive_summary)
-        print(self.executive_summary)
 
         with open(f"{self.output_dir}/gh_warnings.txt", "w") as gh_fh:
             warnings_body = self.generate_warnings()
             gh_fh.write(warnings_body)
-            print(warnings_body)
 
         str_io = io.StringIO()
         for suite in self.suites:
             str_io.write(self.prepare_message(suite))
         str_io.write("\n")
-        print(str_io.getvalue())
         with open(f"{self.output_dir}/gh_{self.mode}.txt", "w") as gh_fh:
             gh_fh.write(str_io.getvalue())
 
@@ -820,10 +833,86 @@ def get_date(log_info):
     return datetime.strptime(f"{log_info.day}", "%j").strftime("%m-%d")
 
 
-class AccuracyRegressionTracker:
+def find_last_2_with_filenames(lookup_file, dashboard_archive_path, dtype, filenames):
+    df = pd.read_csv(lookup_file, names=("day", "mode", "prec", "path"))
+    df = df[df["mode"] == "performance"]
+    df = df[df["prec"] == dtype]
+    df = df[::-1]
+    last2 = []
+    for path in df["path"]:
+        output_dir = os.path.join(dashboard_archive_path, path)
+        fullpaths = [
+            os.path.join(dashboard_archive_path, path, name) for name in filenames
+        ]
+        if all([os.path.exists(fullpath) for fullpath in fullpaths]):
+            last2.append(output_dir)
+        if len(last2) >= 2:
+            return last2
+    return None
+
+
+class SummaryStatDiffer:
+    def __init__(self, args):
+        self.args = args
+        self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv")
+        assert os.path.exists(self.lookup_file)
+
+    def generate_diff(self, last2, filename, caption):
+        df_cur, df_prev = [pd.read_csv(os.path.join(path, filename)) for path in last2]
+        df_merge = df_cur.merge(df_prev, on="Compiler", suffixes=("_cur", "_prev"))
+        data = {col: [] for col in ("compiler", "suite", "prev_value", "cur_value")}
+        for _, row in df_merge.iterrows():
+            if row["Compiler"] in self.args.flag_compilers:
+                for suite in self.args.suites:
+                    data["compiler"].append(row["Compiler"])
+                    data["suite"].append(suite)
+                    data["prev_value"].append(row[suite + "_prev"])
+                    data["cur_value"].append(row[suite + "_cur"])
+
+        df = pd.DataFrame(data)
+        tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never")
+        str_io = io.StringIO()
+        str_io.write("\n")
+        str_io.write(f"{caption}\n")
+        str_io.write("~~~\n")
+        str_io.write(f"{tabform}\n")
+        str_io.write("~~~\n")
+        return str_io.getvalue()
+
+    def generate_comment(self):
+        title = "## Summary Statistics Diff ##\n"
+        body = (
+            "For each relevant compiler, we compare the summary statistics "
+            "for the most 2 recent reports that actually run the compiler.\n\n"
+        )
+        dtype = self.args.dtypes[0]
+        last2 = find_last_2_with_filenames(
+            self.lookup_file,
+            self.args.dashboard_archive_path,
+            dtype,
+            ["geomean.csv", "passrate.csv"],
+        )
+
+        if last2 is None:
+            body += "Could not find most 2 recent reports.\n\n"
+        else:
+            for state, path in zip(("Current", "Previous"), last2):
+                body += f"{state} report name: {path}\n\n"
+            body += self.generate_diff(last2, "passrate.csv", "Passrate diff")
+            body += self.generate_diff(
+                last2, "geomean.csv", "Geometric mean speedup diff"
+            )
+
+        comment = generate_dropdown_comment(title, body)
+
+        with open(f"{self.args.output_dir}/gh_summary_diff.txt", "w") as gh_fh:
+            gh_fh.write(comment)
+
+
+class RegressionDetector:
     """
-    Compares the most recent 2 accuracy benchmarks to find previously
-    passing models that now fail.
+    Compares the most recent 2 benchmarks to find previously unflagged models
+    that are now flagged.
     """
 
     def __init__(self, args):
@@ -831,97 +920,113 @@ def __init__(self, args):
         self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv")
         assert os.path.exists(self.lookup_file)
 
-    def find_last_2(self, suite, device, dtype, compiler):
-        df = pd.read_csv(self.lookup_file, names=("day", "mode", "prec", "path"))
-        df = df[df["mode"] == "performance"]
-        df = df[df["prec"] == dtype]
-        df = df[::-1]
-        parsers = []
-        for path in df["path"]:
-            output_dir = os.path.join(self.args.dashboard_archive_path, path)
-            if os.path.exists(
-                os.path.join(
-                    output_dir,
-                    generate_csv_name(
-                        self.args, dtype, suite, device, compiler, "accuracy"
-                    ),
-                )
-            ):
-                parsers.append(
-                    ParsePerformanceLogs(
-                        [suite],
-                        [device],
-                        [dtype],
-                        [compiler],
-                        [compiler],
-                        get_mode(self.args),
-                        output_dir,
-                    )
-                )
-            if len(parsers) >= 2:
-                return parsers
-        return None
-
     def generate_comment(self):
-        title = "## Accuracy Regressions ##\n"
+        title = "## Recent Regressions ##\n"
         body = (
             "For each relevant compiler, we compare the most recent 2 reports "
-            "(that actually run the compiler) to find models where previously "
-            "successful accuracy tests now fail.\n\n"
+            "(that actually run the compiler) to find previously unflagged "
+            "models that are now flagged as problematic (according to the "
+            "'Warnings' section).\n\n"
         )
         dtype = self.args.dtypes[0]
         device = self.args.devices[0]
-        regressions_present = False
         for suite in self.args.suites:
-            dfs = []
-            for compiler in self.args.flag_compilers:
-                last2 = self.find_last_2(suite, device, dtype, compiler)
-                if last2 is None:
-                    continue
+            body += f"### Regressions for {suite} ###\n"
+            last2 = {}
 
-                df_cur, df_prev = [
-                    last2[i].untouched_parsed_frames[suite]["accuracy"] for i in (0, 1)
+            for compiler in self.args.flag_compilers:
+                filenames = [
+                    generate_csv_name(
+                        self.args, dtype, suite, device, compiler, testing
+                    )
+                    for testing in ["performance", "accuracy"]
                 ]
-                df_merge = df_cur.merge(df_prev, on="name", suffixes=("_cur", "_prev"))
-                flag = np.logical_and(
-                    df_merge[compiler + "_prev"].apply(lambda x: "pass" in x),
-                    df_merge[compiler + "_cur"].apply(lambda x: "pass" not in x),
+                compiler_last2 = find_last_2_with_filenames(
+                    self.lookup_file, self.args.dashboard_archive_path, dtype, filenames
                 )
-                df_bad = df_merge[flag]
-                dfs.append(
-                    pd.DataFrame(
-                        data={
-                            "compiler": compiler,
-                            "name": df_bad["name"],
-                            "prev_status": df_bad[compiler + "_prev"],
-                            "cur_status": df_bad[compiler + "_cur"],
-                        }
+                if compiler_last2 is not None:
+                    last2[compiler] = [
+                        ParsePerformanceLogs(
+                            [suite],
+                            [device],
+                            [dtype],
+                            [compiler],
+                            [compiler],
+                            get_mode(self.args),
+                            output_dir,
+                        )
+                        for output_dir in compiler_last2
+                    ]
+                    for state, path in zip(("Current", "Previous"), compiler_last2):
+                        body += (
+                            f"{state} report name (compiler: {compiler}, "
+                            f"suite: {suite}): {path}\n\n"
+                        )
+
+            for metric in [
+                "accuracy",
+                "speedup",
+                "compilation_latency",
+                "compression_ratio",
+            ]:
+                regressions_present = False
+                dfs = []
+                for compiler in self.args.flag_compilers:
+                    if last2[compiler] is None:
+                        continue
+
+                    df_cur, df_prev = [
+                        last2[compiler][i].untouched_parsed_frames[suite][metric]
+                        for i in (0, 1)
+                    ]
+                    df_merge = df_cur.merge(
+                        df_prev, on="name", suffixes=("_cur", "_prev")
+                    )
+                    flag_fn = FLAG_FNS[metric]
+                    flag = np.logical_and(
+                        df_merge[compiler + "_prev"].apply(
+                            lambda x: not pd.isna(x) and not flag_fn(x)
+                        ),
+                        df_merge[compiler + "_cur"].apply(
+                            lambda x: not pd.isna(x) and flag_fn(x)
+                        ),
+                    )
+                    df_bad = df_merge[flag]
+                    dfs.append(
+                        pd.DataFrame(
+                            data={
+                                "compiler": compiler,
+                                "name": df_bad["name"],
+                                "prev_status": df_bad[compiler + "_prev"],
+                                "cur_status": df_bad[compiler + "_cur"],
+                            }
+                        )
                     )
-                )
 
-            if not dfs:
-                continue
-            df = pd.concat(dfs, axis=0)
-            if df.empty:
-                continue
-            regressions_present = True
-            tabform = tabulate(df, headers="keys", tablefmt="pretty", showindex="never")
-            str_io = io.StringIO()
-            str_io.write("\n")
-            str_io.write(f"Accuracy regressions for {suite}\n")
-            str_io.write("~~~\n")
-            str_io.write(f"{tabform}\n")
-            str_io.write("~~~\n")
-            body += str_io.getvalue()
+                if not dfs:
+                    continue
+                df = pd.concat(dfs, axis=0)
+                if df.empty:
+                    continue
+                regressions_present = True
+                tabform = tabulate(
+                    df, headers="keys", tablefmt="pretty", showindex="never"
+                )
+                str_io = io.StringIO()
+                str_io.write("\n")
+                str_io.write(f"{get_metric_title(metric)} regressions\n")
+                str_io.write("~~~\n")
+                str_io.write(f"{tabform}\n")
+                str_io.write("~~~\n")
+                body += str_io.getvalue()
 
-        if not regressions_present:
-            body += "No accuracy regressions found.\n"
+            if not regressions_present:
+                body += "No regressions found.\n"
 
         comment = generate_dropdown_comment(title, body)
 
-        with open(f"{self.args.output_dir}/gh_accuracy_regression.txt", "w") as gh_fh:
+        with open(f"{self.args.output_dir}/gh_metric_regression.txt", "w") as gh_fh:
             gh_fh.write(comment)
-            print(comment)
 
 
 class RegressionTracker:
@@ -955,13 +1060,14 @@ def find_last_k(self):
     def generate_comment(self):
         title = "## Metrics over time ##\n"
         str_io = io.StringIO()
-        for name in glob.glob(self.args.output_dir + "/*over_time.png"):
-            output = (
-                subprocess.check_output([self.args.dashboard_image_uploader, name])
-                .decode("ascii")
-                .rstrip()
-            )
-            str_io.write(f"\n{name} : ![]({output})\n")
+        if not self.args.update_dashboard_test:
+            for name in glob.glob(self.args.output_dir + "/*over_time.png"):
+                output = (
+                    subprocess.check_output([self.args.dashboard_image_uploader, name])
+                    .decode("ascii")
+                    .rstrip()
+                )
+                str_io.write(f"\n{name} : ![]({output})\n")
         comment = generate_dropdown_comment(title, str_io.getvalue())
 
         with open(f"{self.args.output_dir}/gh_regression.txt", "w") as gh_fh:
@@ -1032,9 +1138,10 @@ def __init__(self, args):
         self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv")
         assert os.path.exists(self.lookup_file)
         try:
-            self.update_lookup_file()
+            if not self.args.update_dashboard_test:
+                self.update_lookup_file()
         except subprocess.CalledProcessError:
-            print("failed to update lookup file")
+            sys.stderr.write("failed to update lookup file\n")
 
     def update_lookup_file(self):
         dtype = self.args.dtypes[0]
@@ -1063,14 +1170,17 @@ def archive(self):
     def upload_graphs(self):
         title = "## Performance graphs ##\n"
         str_io = io.StringIO()
-        for name in glob.glob(self.output_dir + "/*png"):
-            if "over_time" not in name:
-                output = (
-                    subprocess.check_output([self.args.dashboard_image_uploader, name])
-                    .decode("ascii")
-                    .rstrip()
-                )
-                str_io.write(f"\n{name} : ![]({output})\n")
+        if not self.args.update_dashboard_test:
+            for name in glob.glob(self.output_dir + "/*png"):
+                if "over_time" not in name:
+                    output = (
+                        subprocess.check_output(
+                            [self.args.dashboard_image_uploader, name]
+                        )
+                        .decode("ascii")
+                        .rstrip()
+                    )
+                    str_io.write(f"\n{name} : ![]({output})\n")
         comment = generate_dropdown_comment(title, str_io.getvalue())
 
         with open(f"{self.output_dir}/gh_graphs.txt", "w") as gh_fh:
@@ -1080,9 +1190,10 @@ def gen_comment(self):
         files = [
             "gh_title.txt",
             "gh_executive_summary.txt",
+            "gh_summary_diff.txt",
             "gh_warnings.txt",
-            "gh_regression.txt",
-            "gh_accuracy_regression.txt",
+            # "gh_regression.txt",
+            "gh_metric_regression.txt",
             "gh_training.txt",
             "gh_graphs.txt",
         ]
@@ -1120,7 +1231,8 @@ def comment_on_gh(self, comment):
 
     def update(self):
         self.upload_graphs()
-        AccuracyRegressionTracker(self.args).generate_comment()
+        SummaryStatDiffer(self.args).generate_comment()
+        RegressionDetector(self.args).generate_comment()
         try:
             RegressionTracker(self.args).diff()
         except Exception as e:
@@ -1129,9 +1241,11 @@ def update(self):
                 gh_fh.write("")
 
         comment = self.gen_comment()
-        self.comment_on_gh(comment)
+        print(comment)
 
-        self.archive()
+        if not self.args.update_dashboard_test:
+            self.comment_on_gh(comment)
+            self.archive()
 
 
 if __name__ == "__main__":

From 070b01252b975da2ab0fb466065cc025baf9db95 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 16 Nov 2022 21:59:31 +0000
Subject: [PATCH 0984/1922] [Dynamo] Fix bug in NamedTupleVariable (#89110)

Fixes https://github.com/pytorch/torchdynamo/issues/1866

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89110
Approved by: https://github.com/jansel
---
 test/dynamo/test_misc.py         | 14 ++++++++++++++
 torch/_dynamo/variables/lists.py |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index aef364d769945..b3cddcbf1dff7 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -495,6 +495,20 @@ def fn(packed):
         self.assertEqual(cnts.frame_count, 1)
         self.assertEqual(cnts.op_count, 3)
 
+    def test_namedtuple3(self):
+        def fn(x, packed):
+            if isinstance(packed, mytuple):
+                return x + 1
+            else:
+                return x - 1
+
+        x = torch.rand([2, 3])
+        packed = mytuple(1, 2, 3)
+        ref = fn(x, packed)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        res = opt_fn(x, packed)
+        self.assertTrue(same(ref, res))
+
     def test_range_input(self):
         def fn(a, rng):
             x = a
diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py
index 151619d0e4ab0..70c6da07adb5a 100644
--- a/torch/_dynamo/variables/lists.py
+++ b/torch/_dynamo/variables/lists.py
@@ -378,6 +378,9 @@ def __init__(self, items, tuple_cls, **kwargs):
     def python_type(self):
         return self.tuple_cls
 
+    def as_python_constant(self):
+        return self.python_type()(*[x.as_python_constant() for x in self.items])
+
     def reconstruct(self, codegen):
         create_fn = getattr(self.tuple_cls, "_make", self.tuple_cls)
         codegen.append_output(codegen._create_load_const(create_fn))

From 68954bcf7a94cdf6657b6c30cd468ea9f383ea20 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 16 Nov 2022 18:36:24 +0000
Subject: [PATCH 0985/1922] [FSDP] Add fast path for `NO_SHARD`
 `clip_grad_norm_()` (#89137)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89137
Approved by: https://github.com/rohan-varma
---
 .../fsdp/test_fsdp_clip_grad_norm.py          | 29 +++++++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 14 +++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
index 1a742da889ac3..97b37ff2f185f 100644
--- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
+++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -209,6 +209,35 @@ def _test_ddp_parity(
                 self.assertEqual(n1, n2)
                 self.assertEqual(p1, p2)
 
+        if offload_params:
+            # TODO: Gradient computation on CPU and GPU differ slightly causing
+            # drift unrelated to `clip_grad_norm_()`.
+            # https://github.com/pytorch/pytorch/issues/89133
+            return
+
+        # Run a few more iterations
+        # TODO: We cannot run too many iterations, or else there is drift:
+        # https://github.com/pytorch/pytorch/issues/89136
+        for i in range(3):
+            set_to_none = i % 2 == 0  # exercise both
+            ddp_optim.zero_grad(set_to_none=set_to_none)
+            fsdp_optim.zero_grad(set_to_none=set_to_none)
+            inp = ddp_model.module.get_input(device)
+            for model in (ddp_model, fsdp_model):
+                out = model(*inp)
+                out.sum().backward()
+            ddp_total_norm = torch.nn.utils.clip_grad_norm_(
+                ddp_model.parameters(),
+                max_norm=max_norm,
+                norm_type=norm_type,
+            )
+            fsdp_total_norm = fsdp_model.clip_grad_norm_(
+                max_norm=max_norm, norm_type=norm_type
+            )
+            self.assertEqual(ddp_total_norm, fsdp_total_norm)
+            ddp_optim.step()
+            fsdp_optim.step()
+
 
 instantiate_parametrized_tests(TestClipGradNorm)
 
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 3e84315a4e116..d2d4fbf229b6a 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1161,10 +1161,20 @@ def clip_grad_norm_(
             self._streams["unshard"],
             self._streams["pre_unshard"],
         )
+        # If every FSDP instance uses `NO_SHARD`, then we can directly use
+        # the normal `nn.utils` one targeting local gradients
+        all_no_shard = all(
+            not handle.uses_sharded_strategy
+            for handle in FullyShardedDataParallel._fsdp_handles(self)
+        )
+        if all_no_shard:
+            return torch.nn.utils.clip_grad_norm_(
+                self.parameters(), max_norm, norm_type
+            )
+        # Otherwise, there exists some FSDP instance using a sharded strategy,
+        # where sharded and non-sharded parameters must be handled separately
         max_norm = float(max_norm)
         norm_type = float(norm_type)
-        # Perform local gradient norm computation, where sharded and
-        # non-sharded parameters must be handled separately
         sharded_params = set()
         nonsharded_params = set()  # `NO_SHARD` or not FSDP-managed
         for handle in FullyShardedDataParallel._fsdp_handles(self):

From d165a6e55e9e549c450af461fc6f54127c189770 Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki <mikaylagawarecki@gmail.com>
Date: Wed, 16 Nov 2022 19:00:49 +0000
Subject: [PATCH 0986/1922] Removed unecessary check in `select_nested`
 (#89150)

Implementation in  #88585 should work for all dimensions. Removed unnecessary check that constrained select to dims 0 and 1

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89150
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/nested/NestedTensorMath.cpp | 14 ++++++--------
 docs/source/nested.rst                           |  2 +-
 test/test_nestedtensor.py                        |  5 ++++-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp
index 9a47322644ca2..5842c3b8b2172 100644
--- a/aten/src/ATen/native/nested/NestedTensorMath.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp
@@ -502,11 +502,6 @@ Tensor select_nested(const Tensor& self, int64_t dim, int64_t index) {
   int64_t ntensors = self_ptr->size(0);
   TORCH_CHECK_INDEX(ntensors > 0, "You can only select when the NT is not empty.");
   int64_t ndims = static_cast<long>(sizes[0].size());
-  TORCH_CHECK(
-    positive_dim == 0 || positive_dim == 1,
-    "NestedTensor can only be selected along dimension 0 or 1",
-    "got dimension ", dim, " instead."
-  );
   if (positive_dim == 0) {
     TORCH_CHECK_INDEX(
         index >= -ntensors && index < ntensors,
@@ -534,13 +529,16 @@ Tensor select_nested(const Tensor& self, int64_t dim, int64_t index) {
           size_ptr[dim_idx] = sizes[i][j];
           stride_ptr[dim_idx] = strides[i][j];
           ++dim_idx;
-        }
-        else {
+        } else {
           TORCH_CHECK_INDEX(
               index >= 0 && index < sizes[i][j],
               "index ",
               index,
-              " is out of bounds for irregular dimension 1 with size ",
+              " is out of bounds for dimension ",
+              j,
+              " of the ",
+              i,
+              "th constituent tensor with size ",
               sizes[i][j]);
           new_offsets[i] = offsets[i] + index * strides[i][j];
         }
diff --git a/docs/source/nested.rst b/docs/source/nested.rst
index 07712e0376f16..ac07f8acb5a23 100644
--- a/docs/source/nested.rst
+++ b/docs/source/nested.rst
@@ -201,7 +201,7 @@ NestedTensor and any constraints they have.
    Supports addition of a scalar to a nested tensor."
    :func:`torch.mul`; "Supports elementwise multiplication of two nested tensors.
    Supports multiplication of a nested tensor by a scalar."
-   :func:`torch.select`; "Supports selecting along ``dim=0`` only (analogously ``nt[i]``)."
+   :func:`torch.select`; "Supports selecting along all dimensions."
    :func:`torch.clone`; "Behavior is the same as on regular tensors."
    :func:`torch.detach`; "Behavior is the same as on regular tensors."
    :func:`torch.unbind`; "Supports unbinding along ``dim=0`` only."
diff --git a/test/test_nestedtensor.py b/test/test_nestedtensor.py
index f1f211cdafcac..7107538863158 100644
--- a/test/test_nestedtensor.py
+++ b/test/test_nestedtensor.py
@@ -800,10 +800,13 @@ def test_nested_tensor_indexing(self, device, dtype):
         self.assertEqual(nt[1, ...], x1)
         self.assertRaises(IndexError, lambda: nt[1, 4, 2])
         self.assertRaises(NotImplementedError, lambda: nt[:, 1, 1])
-        # test select on the irregular dimension only
+        # test select on non-batch dimensions
         self.assertEqual(nt.select(1, 0)[0], x0.select(0, 0))
         self.assertEqual(nt.select(1, 0)[1], x1.select(0, 0))
         self.assertRaises(IndexError, lambda: nt.select(1, 3))
+        self.assertEqual(nt.select(2, 0)[0], x0.select(1, 0))
+        self.assertEqual(nt.select(2, 0)[1], x1.select(1, 0))
+        self.assertRaises(IndexError, lambda: nt.select(2, 5))
         # make sure indexing returns a view
         nt[0].fill_(100.0)
         answer = torch.tensor(100.0, device=device, dtype=dtype).expand((2, 5))

From 0e7d7467343172ef32ac186623803ffaf04a8c64 Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki <mikaylagawarecki@gmail.com>
Date: Wed, 16 Nov 2022 19:17:08 +0000
Subject: [PATCH 0987/1922] Fix inaccuracy in nt constructor documentation +
 broken rendering (#89152)

Rendering was broken and docstring seemed to be inaccurate

![Screen Shot 2022-11-16 at 2 16 28 PM](https://user-images.githubusercontent.com/35276741/202273588-a2da5b7b-1a6d-46bb-a74e-c0de9a0fd064.png)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89152
Approved by: https://github.com/cpuhrsch
---
 torch/nested/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/nested/__init__.py b/torch/nested/__init__.py
index 71498187298dd..151d44ab66e1e 100644
--- a/torch/nested/__init__.py
+++ b/torch/nested/__init__.py
@@ -125,8 +125,8 @@ def as_nested_tensor(
 :ref:`Autograd mechanics <autograd-mechanics>`) from :attr:`tensor_list` a list of tensors.
 
 Args:
-    tensor_list (List[array_like]): a list of tensors (or anything that can be passed to torch.tensor)
-    where their first dimension can be of irregular size, but all other dimensions have to be equal.
+    tensor_list (List[array_like]): a list of tensors, or anything that can be passed to torch.tensor,
+    where each element of the list has the same dimensionality.
 
 Keyword arguments:
     dtype (:class:`torch.dtype`, optional): the desired type of returned nested tensor.

From fc9e9d6feeaeab3b8a3653f986cd9e2fe8a4440d Mon Sep 17 00:00:00 2001
From: Fuzzkatt <zonghan2000@gmail.com>
Date: Wed, 16 Nov 2022 22:50:11 +0000
Subject: [PATCH 0988/1922] add test_c10d_spawn_ucc.py (#86508)

Initial PR to create UCC equivalent of https://github.com/pytorch/pytorch/blob/master/test/distributed/test_c10d_spawn_gloo.py and
https://github.com/pytorch/pytorch/blob/master/test/distributed/test_c10d_spawn_nccl.py. Currently only added common ops.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86508
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_spawn_ucc.py       | 110 ++++++++++++++++++
 test/run_test.py                              |   1 +
 torch/testing/_internal/common_distributed.py |   5 +
 3 files changed, 116 insertions(+)
 create mode 100644 test/distributed/test_c10d_spawn_ucc.py

diff --git a/test/distributed/test_c10d_spawn_ucc.py b/test/distributed/test_c10d_spawn_ucc.py
new file mode 100644
index 0000000000000..eabd7e1cf45b5
--- /dev/null
+++ b/test/distributed/test_c10d_spawn_ucc.py
@@ -0,0 +1,110 @@
+# Owner(s): ["oncall: distributed"]
+
+import sys
+import test_c10d_spawn
+import torch
+import torch.distributed as c10d
+from test_c10d_spawn import _torch_dist_nn_available, TestDistributedNNFunctions
+from torch.testing._internal.common_cuda import TEST_MULTIGPU
+from torch.testing._internal.common_distributed import (
+    requires_ucc,
+    skip_if_lt_x_gpu,
+)
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+    sandcastle_skip,
+    sandcastle_skip_if,
+    TEST_WITH_DEV_DBG_ASAN,
+)
+
+NO_UCC = not hasattr(c10d, "ProcessGroupUCC")
+
+# Fails on Python-3.9, see https://github.com/pytorch/pytorch/issues/51619
+if sys.version_info < (3, 9):
+
+    class ProcessGroupShareTensorTest(
+        test_c10d_spawn.AbstractProcessGroupShareTensorTest, TestCase
+    ):
+        @classmethod
+        def _init_pg_ucc(cls, rank, filename, world_size):
+            store = c10d.FileStore(filename, world_size)
+            return c10d.ProcessGroupUCC(store, rank, world_size)
+
+        @sandcastle_skip_if(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+        @sandcastle_skip_if(NO_UCC, "UCC needed")
+        def test_shared_broadcast_ucc(self):
+            self._test_multiprocess(
+                ProcessGroupShareTensorTest._test_broadcast_process,
+                [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
+                ProcessGroupShareTensorTest._init_pg_ucc,
+                1,
+            )
+
+        @sandcastle_skip_if(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+        @sandcastle_skip_if(NO_UCC, "UCC needed")
+        def test_shared_allreduce_ucc(self):
+            self._test_multiprocess(
+                ProcessGroupShareTensorTest._test_allreduce_process,
+                [torch.ones(2, 2).to(i) for i in range(self.world_size)],
+                ProcessGroupShareTensorTest._init_pg_ucc,
+                1,
+            )
+
+        @sandcastle_skip_if(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed")
+        @sandcastle_skip_if(NO_UCC, "UCC needed")
+        def test_shared_allgather_ucc(self):
+            self._test_multiprocess(
+                ProcessGroupShareTensorTest._test_allgather_process,
+                [torch.ones(2, 2).to(i) * i for i in range(self.world_size)],
+                ProcessGroupShareTensorTest._init_pg_ucc,
+                self.world_size,
+            )
+
+
+# Skip dev-asan as torch + multiprocessing spawn have known issues
+if not TEST_WITH_DEV_DBG_ASAN:
+
+    class TestDistributedNNFunctionsUcc(TestDistributedNNFunctions):
+        # Test Common Ops First.
+        @requires_ucc()
+        @skip_if_lt_x_gpu(2)
+        @sandcastle_skip_if(
+            not _torch_dist_nn_available, "torch.distributed.nn is not available"
+        )
+        def test_broadcast(self):
+            self._test_broadcast("ucc")
+
+        @requires_ucc()
+        @skip_if_lt_x_gpu(2)
+        @sandcastle_skip_if(not _torch_dist_nn_available, "torch.distributed.nn is not available")
+        def test_reduce(self):
+            self._test_reduce("ucc")
+
+        @requires_ucc()
+        @skip_if_lt_x_gpu(2)
+        @sandcastle_skip_if(not _torch_dist_nn_available, "torch.distributed.nn is not available")
+        def test_allreduce(self):
+            self._test_allreduce("ucc")
+
+        @requires_ucc()
+        @skip_if_lt_x_gpu(2)
+        @sandcastle_skip_if(not _torch_dist_nn_available, "torch.distributed.nn is not available")
+        @sandcastle_skip("runs into illegal memory access on first assertEqual check when run locally")
+        def test_all_gather(self):
+            self._test_all_gather("ucc")
+
+        @requires_ucc()
+        @skip_if_lt_x_gpu(2)
+        @sandcastle_skip_if(not _torch_dist_nn_available, "torch.distributed.nn is not available")
+        def test_all_to_all(self):
+            self._test_all_to_all("ucc")
+
+        @requires_ucc()
+        @skip_if_lt_x_gpu(2)
+        @sandcastle_skip_if(not _torch_dist_nn_available, "torch.distributed.nn is not available")
+        def test_all_to_all_single(self):
+            self._test_all_to_all_single("ucc")
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/run_test.py b/test/run_test.py
index 8a25a2e707853..6bf98a01a44d1 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -785,6 +785,7 @@ def run_test_ops(test_module, test_directory, options):
     "distributed/test_c10d_common": get_run_test_with_subprocess_fn(),
     "distributed/test_c10d_spawn_gloo": get_run_test_with_subprocess_fn(),
     "distributed/test_c10d_spawn_nccl": get_run_test_with_subprocess_fn(),
+    "distributed/test_c10d_spawn_ucc": get_run_test_with_subprocess_fn(),
     "distributed/test_store": get_run_test_with_subprocess_fn(),
     "distributed/test_pg_wrapper": get_run_test_with_subprocess_fn(),
     "distributed/rpc/test_faulty_agent": get_run_test_with_subprocess_fn(),
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 883a48a5a5fef..9dcb71ae0907f 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -304,6 +304,11 @@ def requires_nccl():
         "c10d was not compiled with the NCCL backend",
     )
 
+def requires_ucc():
+    return sandcastle_skip_if(
+        not c10d.is_ucc_available(),
+        "c10d was not compiled with the UCC backend",
+    )
 
 def requires_mpi():
     return sandcastle_skip_if(

From e86a6d470125ffeb22277000b4cf1eee6e9ba7b3 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Wed, 16 Nov 2022 18:40:41 +0000
Subject: [PATCH 0989/1922] Use torchrun for dynamo/distributed.py (#89149)

Mainly wanted to confirm torchrun works fine with dynamo/ddp,
but it is also a better system than manually launching processes.

Partially addresses issue #1779

New run commands
------------

single process:
python benchmarks/dynamo/distributed.py [args]

multi-gpu (e.g. 2 gpu on one host):
torchrun --nproc_per_node 2 benchmarks/dynamo/distributed.py [args]

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89149
Approved by: https://github.com/aazzolini
---
 benchmarks/dynamo/dist_util.py   | 11 +++++--
 benchmarks/dynamo/distributed.py | 51 ++++++++------------------------
 2 files changed, 21 insertions(+), 41 deletions(-)

diff --git a/benchmarks/dynamo/dist_util.py b/benchmarks/dynamo/dist_util.py
index d0267cbca3073..9957ef6139dfa 100644
--- a/benchmarks/dynamo/dist_util.py
+++ b/benchmarks/dynamo/dist_util.py
@@ -25,9 +25,14 @@
 
 
 def setup(rank, world_size):
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = "12355"
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+    # set defaults in case torchrun isn't used; no idea why the if is needed, but it hangs torchrun otherwise
+    if not os.getenv("MASTER_ADDR"):
+        os.environ["MASTER_ADDR"] = os.getenv("MASTER_ADDR", "localhost")
+    if not os.getenv("MASTER_PORT"):
+        os.environ["MASTER_PORT"] = os.getenv("MASETER_PORT", "12355")
+    os.environ["RANK"] = os.getenv("RANK", "0")
+    os.environ["WORLD_SIZE"] = os.getenv("WORLD_SIZE", "1")
+    dist.init_process_group("nccl")
 
 
 def cleanup():
diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py
index 32e3b544d87dd..360fd846dbe8a 100644
--- a/benchmarks/dynamo/distributed.py
+++ b/benchmarks/dynamo/distributed.py
@@ -1,12 +1,10 @@
 import argparse
+import logging
+import os
 from functools import partial
 
-import numpy as np
-import tabulate
 import torch
-
 import torch._dynamo as dynamo
-import torch.multiprocessing as mp
 import torch.utils._pytree as pytree
 from torch._dynamo.testing import reduce_to_scalar_loss
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -32,7 +30,11 @@ def profile_model(args, model, inputs, rank):
         prof.export_chrome_trace(args.trace_file)
 
 
-def run_model(args, model, inputs, rank, world_size, key, result_q):
+def run_model(args, model, inputs, key):
+    rank = int(os.getenv("RANK", 0))
+    world_size = int(os.getenv("WORLD_SIZE", 1))
+    # result_q = []
+
     setup(rank, world_size)
     if args.device == "cuda":
         # needed for FSDP
@@ -62,8 +64,10 @@ def move_tensor(maybe_tensor):
         print(model)
 
     if args.dynamo:
+        dynamo.reset()
         if args.verbose:
             dynamo.config.verbose = True
+            dynamo.config.log_level = logging.DEBUG
         if args.dynamo_optimize_ddp:
             dynamo.config.optimize_ddp = True
 
@@ -80,40 +84,15 @@ def print_compile(gm, ex):
 
     # warmup
     _ = timed(model, model_iter_fn, inputs, times=3, return_result=False)
-    times = []
     t_total = timed(
         model, model_iter_fn, inputs, times=args.repeat, return_result=False
     )
-    times.append(t_total / args.repeat)
-
-    if rank == 0:
-        result_q.put(times)
 
     if args.profile:
         profile_model(args, model, inputs, rank)
 
     cleanup()
-
-
-def experiment(fn, key, world_size, results):
-    key = f"{key}_{world_size}"
-    dynamo.reset()
-    ctx = mp.get_context("spawn")
-    result_q = ctx.SimpleQueue()
-    f_args = (world_size, key, result_q)
-    if world_size > 1:
-        mp.spawn(
-            fn,
-            args=f_args,
-            nprocs=world_size,
-            join=True,
-        )
-    else:
-        # rank 0
-        fn(0, *f_args)
-    times = result_q.get()
-
-    results.append((key, np.median(times)))
+    return t_total
 
 
 if __name__ == "__main__":
@@ -129,9 +108,6 @@ def experiment(fn, key, world_size, results):
     parser.add_argument("--profile", action="store_true", help="Run the profiler")
     parser.add_argument("--trace_file", default="profile.json", help="Run the profiler")
     parser.add_argument("--repeat", default=10, help="Repeats for timing run")
-    parser.add_argument(
-        "--world_size", type=int, default=2, help="Number of ranks/gpus for experiments"
-    )
     parser.add_argument(
         "--dynamo_optimize_ddp",
         action="store_true",
@@ -168,7 +144,6 @@ def experiment(fn, key, world_size, results):
 
     fn = partial(run_model, args, model, inputs)
 
-    times = []
-    experiment(fn, model_name, args.world_size, times)
-    print("\nExperiment Results:")
-    print(tabulate.tabulate(times, headers=("key", "time")))
+    world_size = os.getenv("WORLD_SIZE", 1)
+    t_total = fn(f"{model_name}_{world_size}")
+    print(f"mean latency {t_total / args.repeat} across {args.repeat} runs")

From 8cb91c98a366f26cb8301de1f5f2b4e8b34dd20d Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Wed, 16 Nov 2022 19:50:02 +0000
Subject: [PATCH 0990/1922] [ONNX] Add onnx-script test cases (#86907)

The test cases for #86906
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86907
Approved by: https://github.com/BowenBao
---
 .jenkins/caffe2/test.sh                 |   2 +
 test/onnx/test_onnxscript_no_runtime.py | 164 ++++++++++++++++++++++++
 test/onnx/test_onnxscript_runtime.py    | 132 +++++++++++++++++++
 3 files changed, 298 insertions(+)
 create mode 100644 test/onnx/test_onnxscript_no_runtime.py
 create mode 100644 test/onnx/test_onnxscript_runtime.py

diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh
index 42111ea22bdd3..d245dabda4daa 100755
--- a/.jenkins/caffe2/test.sh
+++ b/.jenkins/caffe2/test.sh
@@ -177,6 +177,8 @@ fi
 if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
   pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
   pip install -q --user ninja flatbuffers==2.0 numpy==1.21.5 onnxruntime==1.12.1 beartype==0.10.4 onnx==1.12.0
+  # TODO: change this when onnx-script is on testPypi
+  pip install 'onnx-script @ git+https://github.com/microsoft/onnx-script'
   # numba requires numpy <= 1.20, onnxruntime requires numpy >= 1.21.
   # We don't actually need it for our tests, but it's imported if it's present, so uninstall.
   pip uninstall -q --yes numba
diff --git a/test/onnx/test_onnxscript_no_runtime.py b/test/onnx/test_onnxscript_no_runtime.py
new file mode 100644
index 0000000000000..125e899af9449
--- /dev/null
+++ b/test/onnx/test_onnxscript_no_runtime.py
@@ -0,0 +1,164 @@
+# Owner(s): ["module: onnx"]
+
+"""Test the support on onnxscript in PyTorch-ONNX converter."""
+import io
+from typing import List
+
+import onnx
+import onnxscript
+import torch
+from onnxscript.onnx_types import FLOAT
+from torch.onnx._internal import jit_utils
+from torch.testing._internal import common_utils
+
+
+class TestONNXScriptExport(common_utils.TestCase):
+
+    # opset version is
+    # 1. local function is supported after opset 15
+    # 2. onnx-script requires users to determine opset in local function
+    opset_version = 15
+
+    def test_onnxscript_registration_with_multiple_models(self):
+
+        from onnxscript.onnx_opset import opset15 as op
+
+        # 1. Register Selu onnxscript function as custom Op
+        custom_opset = onnxscript.values.Opset(domain="onnx-script", version=1)
+
+        @onnxscript.script(custom_opset)
+        def Selu(X):
+            # TODO: onnx/ort doesn't support default values for now
+            # move this when they do
+            alpha = 1.67326  # auto wrapped as Constants
+            gamma = 1.0507
+            alphaX = op.CastLike(alpha, X)
+            gammaX = op.CastLike(gamma, X)
+            neg = gammaX * (alphaX * op.Exp(X) - alphaX)
+            pos = gammaX * X
+            zero = op.CastLike(0, X)
+            return op.Where(X <= zero, neg, pos)
+
+        def custom_selu(g: jit_utils.GraphContext, X):
+            return g.onnxscript_op(Selu, X).setType(X.type())
+
+        torch.onnx.register_custom_op_symbolic(
+            symbolic_name="aten::selu",
+            symbolic_fn=custom_selu,
+            opset_version=self.opset_version,
+        )
+
+        # 2. Register layer_norm onnxscript function as custom Op
+        @onnxscript.script(custom_opset)
+        def layer_norm(
+            X, axes: List[int], weight: FLOAT[...], bias: FLOAT[...], eps: float
+        ):
+            mean = op.ReduceMean(X, axes=axes)
+            D = X - mean  # op.Sub(X, mean)
+            DD = D * D  # op.Mul(D, D)
+            var = op.ReduceMean(DD, axes=axes)
+            vareps = var + eps  # op.Add(var, eps)
+            stddev = op.Sqrt(vareps)
+            invstddev = op.Reciprocal(stddev)
+            normalized = D * invstddev  # op.Mul(D, invstddev)
+            normalizedw = op.CastLike(
+                normalized, weight
+            )  # Type issue if missing this Op
+            normalizedscaled = normalizedw * weight  # op.Mul(normalized, weight)
+            return normalizedscaled + bias
+
+        @torch.onnx.symbolic_helper.parse_args("v", "is", "v", "v", "f", "none")
+        def custom_layer_norm(
+            g, input, normalized_shape, weight, bias, eps, cudnn_enable
+        ):
+            # TODO: move the comprehension into local function once
+            # it's supported by onnxscript
+            axes = [-i for i in range(len(normalized_shape), 0, -1)]
+            return g.onnxscript_op(
+                layer_norm, input, weight, bias, axes_i=axes, eps_f=eps
+            ).setType(input.type())
+
+        torch.onnx.register_custom_op_symbolic(
+            symbolic_name="aten::layer_norm",
+            symbolic_fn=custom_layer_norm,
+            opset_version=self.opset_version,
+        )
+
+        # 3. export two models
+        x = torch.randn(1, 2, 3, 4, requires_grad=True)
+        model_selu = torch.nn.SELU()
+        selu_onnx = io.BytesIO()
+        torch.onnx.export(model_selu, x, selu_onnx, opset_version=self.opset_version)
+
+        N, C = 3, 4
+        y = torch.randn(N, C)
+        model_layer_norm = torch.nn.LayerNorm(C)
+        layer_norm_onnx = io.BytesIO()
+        torch.onnx.export(
+            model_layer_norm, y, layer_norm_onnx, opset_version=self.opset_version
+        )
+
+        # 4. test on models
+        selu_proto = onnx.load(io.BytesIO(selu_onnx.getvalue()))
+        layer_norm_proto = onnx.load(io.BytesIO(layer_norm_onnx.getvalue()))
+
+        self.assertEqual(len(selu_proto.functions), 1)
+        self.assertEqual(len(layer_norm_proto.functions), 1)
+        self.assertEqual(selu_proto.functions[0].name, "Selu")
+        self.assertEqual(layer_norm_proto.functions[0].name, "layer_norm")
+
+    def test_loop_registration(self):
+        # Control flow is tested for _find_onnxscript_op function in torch/onnx/utils.py,
+        # which has recursive logic to go through every nodes with subgraph in model proto
+        class NestedLoopsModel(torch.jit.ScriptModule):
+            def __init__(self):
+                super().__init__()
+                self.selu = torch.nn.SELU()
+
+            @torch.jit.script_method
+            def forward(self, x):
+                y = x
+                for i in range(x.size(3)):
+                    if i == 0:
+                        y = self.selu(x)
+                    else:
+                        y += i
+                return y
+
+        model = NestedLoopsModel()
+        inputs = torch.zeros(1, 2, 3, 4)
+
+        from onnxscript.onnx_opset import opset15 as op
+
+        custom_opset = onnxscript.values.Opset(domain="onnx-script", version=2)
+
+        @onnxscript.script(custom_opset)
+        def Selu(X):
+            alpha = 1.6732632423543772848170429916717
+            gamma = 1.0507009873554804934193349852946
+            alphaX = op.CastLike(alpha, X)
+            gammaX = op.CastLike(gamma, X)
+            neg = gammaX * (alphaX * op.Exp(X) - alphaX)
+            pos = gammaX * X
+            zero = op.CastLike(0, X)
+            return op.Where(X <= zero, neg, pos)
+
+        def custom_selu(g, X):
+            # domain of the Op should be aligned with onnx-script
+            # setType API is required for custom Op to support
+            # torchscript shape type inference
+            print("custom_selu is used!")
+            return g.onnxscript_op(Selu, X).setType(X.type())
+
+        torch.onnx.register_custom_op_symbolic(
+            symbolic_name="aten::selu",
+            symbolic_fn=custom_selu,
+            opset_version=15,
+        )
+
+        saved_model = io.BytesIO()
+        torch.onnx.export(
+            torch.jit.script(model), inputs, f=saved_model, opset_version=15
+        )
+        loop_selu_proto = onnx.load(io.BytesIO(saved_model.getvalue()))
+        self.assertEqual(len(loop_selu_proto.functions), 1)
diff --git a/test/onnx/test_onnxscript_runtime.py b/test/onnx/test_onnxscript_runtime.py
new file mode 100644
index 0000000000000..2d0d1e3a5357a
--- /dev/null
+++ b/test/onnx/test_onnxscript_runtime.py
@@ -0,0 +1,132 @@
+# Owner(s): ["module: onnx"]
+
+"""Test the support on onnxscript in PyTorch-ONNX converter with onnxruntime."""
+from typing import List
+
+import onnx_test_common
+import onnxscript
+import torch
+from onnxscript.onnx_types import FLOAT
+from torch.onnx._internal import jit_utils
+from torch.testing._internal import common_utils
+
+
+class TestONNXScriptRuntime(onnx_test_common._TestONNXRuntime):
+
+    # opset version is
+    # 1. local function is supported after opset 15
+    # 2. onnx-script requires users to determine opset in local function
+    opset_version = 15
+
+    def test_selu_from_onnxscript_example(self):
+
+        x = torch.randn(1, 2, 3, 4, requires_grad=True)
+        model = torch.nn.SELU()
+
+        from onnxscript.onnx_opset import opset15 as op
+
+        # custom domain is needed for custom Op domain name should be
+        # aligned to the one in symbolic_fn
+        # TODO(titaiwang): make an official domain for onnxscript usage
+        custom_opset = onnxscript.values.Opset(domain="onnx-script", version=1)
+
+        @onnxscript.script(custom_opset)
+        def Selu(X):
+            # TODO: onnx/ort doesn't support default values for now
+            # move this when they do
+            alpha = 1.67326  # auto wrapped as Constants
+            gamma = 1.0507
+            alphaX = op.CastLike(alpha, X)
+            gammaX = op.CastLike(gamma, X)
+            neg = gammaX * (alphaX * op.Exp(X) - alphaX)
+            pos = gammaX * X
+            zero = op.CastLike(0, X)
+            return op.Where(X <= zero, neg, pos)
+
+        def custom_selu(g: jit_utils.GraphContext, X):
+            return g.onnxscript_op(Selu, X).setType(X.type())
+
+        torch.onnx.register_custom_op_symbolic(
+            symbolic_name="aten::selu",
+            symbolic_fn=custom_selu,
+            opset_version=self.opset_version,
+        )
+        self.run_test(model, x)
+
+    def test_layer_norm(self):
+
+        x = torch.randn(2, 3)
+        y = torch.randn(2, 3)
+        z = torch.randn(2, 3)
+
+        class N(torch.nn.Module):
+            def __init__(self, prob):
+                super().__init__()
+                self.dropout = torch.nn.Dropout(prob)
+
+            def forward(self, x):
+                return self.dropout(x)
+
+        class M(torch.nn.Module):
+            def __init__(self, num_layers):
+                super().__init__()
+                self.num_layers = num_layers
+                self.lns = torch.nn.ModuleList(
+                    [torch.nn.LayerNorm(3, eps=i) for i in range(num_layers)]
+                )
+                self.celu1 = torch.nn.CELU(1.0)
+                self.celu2 = torch.nn.CELU(2.0)
+                self.dropout = N(0.5)
+
+            def forward(self, x, y, z):
+                res1 = self.celu1(x)
+                res2 = self.celu2(y)
+                for ln in self.lns:
+                    z = ln(z)
+                return res1 + res2, self.dropout(z)
+
+        model = M(3)
+
+        from onnxscript.onnx_opset import opset15 as op
+
+        custom_opset = onnxscript.values.Opset(domain="onnxscript", version=1)
+
+        @onnxscript.script(custom_opset)
+        def layer_norm(
+            X, axes: List[int], weight: FLOAT[...], bias: FLOAT[...], eps: float
+        ):
+            mean = op.ReduceMean(X, axes=axes)
+            D = X - mean  # op.Sub(X, mean)
+            DD = D * D  # op.Mul(D, D)
+            var = op.ReduceMean(DD, axes=axes)
+            vareps = var + eps  # op.Add(var, eps)
+            stddev = op.Sqrt(vareps)
+            invstddev = op.Reciprocal(stddev)
+            normalized = D * invstddev  # op.Mul(D, invstddev)
+            normalizedw = op.CastLike(
+                normalized, weight
+            )  # Type issue if missing this Op
+            normalizedscaled = normalizedw * weight  # op.Mul(normalized, weight)
+            return normalizedscaled + bias
+
+        @torch.onnx.symbolic_helper.parse_args("v", "is", "v", "v", "f", "none")
+        def custom_layer_norm(
+            g, input, normalized_shape, weight, bias, eps, cudnn_enable
+        ):
+            # TODO: move the comprehension into local function once it's supported by onnxscript
+            axes = [-i for i in range(len(normalized_shape), 0, -1)]
+            return g.onnxscript_op(
+                layer_norm, input, weight, bias, axes_i=axes, eps_f=eps
+            ).setType(input.type())
+
+        torch.onnx.register_custom_op_symbolic(
+            symbolic_name="aten::layer_norm",
+            symbolic_fn=custom_layer_norm,
+            opset_version=self.opset_version,
+        )
+
+        self.run_test(model, (x, y, z))
+
+
+if __name__ == "__main__":
+    common_utils.run_tests()

From 17c05c8ac3a8d48a104d727b411d9c6442ec2567 Mon Sep 17 00:00:00 2001
From: atalman <atalman@fb.com>
Date: Thu, 17 Nov 2022 00:30:12 +0000
Subject: [PATCH 0991/1922] Fix nightly build binary errors (#89153)

This is pretty much self explanatory issues
Two typo's in generate generate binary script caused workflows to be generated with invalid parameters:

1 .generated-linux-binary-libtorch-pre-cxx11-master.yml
2 .generated-macos-arm64-binary-wheel-nightly.yml
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89153
Approved by: https://github.com/malfet
---
 .github/scripts/generate_ci_workflows.py      |   4 +-
 ...linux-binary-libtorch-pre-cxx11-master.yml |  18 +--
 ...rated-macos-arm64-binary-wheel-nightly.yml | 110 ------------------
 3 files changed, 11 insertions(+), 121 deletions(-)

diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 1ef3142286bf3..35680e30ee6a7 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -154,7 +154,7 @@ class OperatingSystem:
         package_type="libtorch",
         abi_version=generate_binary_build_matrix.PRE_CXX11_ABI,
         build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
-            OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI,
+            OperatingSystem.LINUX, generate_binary_build_matrix.PRE_CXX11_ABI,
             arches=["cpu"],
             libtorch_variants=["shared-with-deps"],
         ),
@@ -277,7 +277,7 @@ class OperatingSystem:
     BinaryBuildWorkflow(
         os=OperatingSystem.MACOS_ARM64,
         package_type="wheel",
-        build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.MACOS),
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(OperatingSystem.MACOS_ARM64),
         cross_compile_arm64=True,
         ciflow_config=CIFlowConfig(
             labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml
index edacb2e949b00..39e41e67853ac 100644
--- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-master.yml
@@ -31,7 +31,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  libtorch-cpu-shared-with-deps-cxx11-abi-build:
+  libtorch-cpu-shared-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -42,17 +42,17 @@ jobs:
       #       favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
       LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cpu-shared-with-deps-cxx11-abi
+      DESIRED_DEVTOOLSET: pre-cxx11
+      build_name: libtorch-cpu-shared-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-cpu-shared-with-deps-cxx11-abi-test:  # Testing
+  libtorch-cpu-shared-with-deps-pre-cxx11-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-cpu-shared-with-deps-cxx11-abi-build
+    needs: libtorch-cpu-shared-with-deps-pre-cxx11-build
     uses: ./.github/workflows/_binary-test-linux.yml
     with:
       PYTORCH_ROOT: /pytorch
@@ -62,10 +62,10 @@ jobs:
       #       favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu
+      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
       LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cpu-shared-with-deps-cxx11-abi
+      DESIRED_DEVTOOLSET: pre-cxx11
+      build_name: libtorch-cpu-shared-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
       runs_on: linux.4xlarge
     secrets:
diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
index 6bc3894a00be5..7a7df02efe891 100644
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@@ -34,116 +34,6 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  wheel-py3_7-cpu-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-12-xl
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      BUILDER_ROOT: ${{ github.workspace }}/builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.7"
-      # For sccache access (only on non-forked PRs)
-      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
-    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          # shellcheck disable=SC2129
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          # shellcheck disable=SC2129
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          # shellcheck disable=SC2129
-          echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
-      - name: Install conda and dependencies
-        run: |
-          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
-          chmod +x "${RUNNER_TEMP}/conda.sh"
-          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
-          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-      - name: Checkout PyTorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: Install sccache (only for non-forked PRs, and pushes to trunk)
-        uses: nick-fields/retry@v2.8.2
-        if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
-        with:
-          timeout_minutes: 5
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
-            sudo chmod +x /usr/local/bin/sccache
-            echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
-      - name: Populate binary env
-        run: |
-          # shellcheck disable=SC1091
-          source "${RUNNER_TEMP}/anaconda/bin/activate"
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        run: |
-          # shellcheck disable=SC1091
-          source "${RUNNER_TEMP}/anaconda/bin/activate"
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
-      - uses: actions/upload-artifact@v3
-        if: always()
-        with:
-          name: wheel-py3_7-cpu
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  wheel-py3_7-cpu-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: wheel-py3_7-cpu-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/manylinux-builder:cpu
-      DESIRED_PYTHON: "3.7"
-      build_name: wheel-py3_7-cpu
-      use_s3: False
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
   wheel-py3_8-cpu-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     runs-on: macos-12-xl

From 5a26cb95d39e8f66749dd51f29fe1986f28ca613 Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Thu, 17 Nov 2022 00:38:44 +0000
Subject: [PATCH 0992/1922] handle scatter(Scalar) overload in inductor
 (#88894)

Relanding https://github.com/pytorch/pytorch/pull/88210

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88894
Approved by: https://github.com/desertfire
---
 torch/_inductor/lowering.py | 44 +++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 9924396075f6c..75d4e471e5bb9 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -2029,14 +2029,37 @@ def scatter(x, dim: int, index, src, **kwargs):
     return scatter_(clone(x), dim, index, src, **kwargs)
 
 
+def scatter_fallback(
+    fn, self, dim: int, index, src, *, reduce: str = None, include_self: bool = True
+):
+
+    if reduce not in {None, "sum"} or (
+        reduce == "sum" and self.get_dtype() in {torch.bool, torch.int64}
+    ):
+        self.realize()
+        return fallback_handler(fn)(
+            self, dim, index, src, reduce=reduce, include_self=include_self
+        )
+
+    return None
+
+
 @register_lowering(aten.scatter_, type_promotion_kind=None)
 def scatter_(self, dim: int, index, src, *, reduce: str = None):
+
     if reduce == "add":
         reduce = "sum"
     elif reduce == "multiply":
         reduce = "prod"
     else:
         assert reduce is None
+
+    fallback_result = scatter_fallback(
+        aten.scatter_, self, dim, index, src, reduce=reduce
+    )
+
+    if fallback_result:
+        return fallback_result
     return scatter_reduce_(self, dim, index, src, reduce)
 
 
@@ -2062,15 +2085,18 @@ def scatter_reduce(x, dim: int, index, src, reduction_type, **kwargs):
 def scatter_reduce_(self, dim: int, index, src, reduce, *, include_self: bool = True):
     assert reduce in {None, "sum", "prod", "mean", "amax", "amin"}
 
-    # TODO: Need to support more reduction type
-    # For reduction of "sum", tl.atomic_add doesn't support bool or int64
-    if reduce not in {None, "sum"} or (
-        reduce == "sum" and self.get_dtype() in {torch.bool, torch.int64}
-    ):
-        self.realize()
-        return fallback_scatter_reduce_(
-            self, dim, index, src, reduce, include_self=include_self
-        )
+    fallback_result = scatter_fallback(
+        aten.scatter_reduce_,
+        self,
+        dim,
+        index,
+        src,
+        reduce=reduce,
+        include_self=include_self,
+    )
+
+    if fallback_result:
+        return fallback_result
 
     assert isinstance(self, TensorBox)
     assert "int" in str(index.get_dtype())

From 976a9e6d4796231fd367bb2bd3f5c77889e156e9 Mon Sep 17 00:00:00 2001
From: Charlie West-Taylor <charliew@graphcore.ai>
Date: Thu, 17 Nov 2022 00:59:12 +0000
Subject: [PATCH 0993/1922] Use the Python frame safely in _pythonCallstack
 (#88993)

Currently, the result of `PyEval_GetFrame()` is piped straight to `Py_INCREF`. However, `PyEval_GetFrame` [may return null](https://docs.python.org/3/c-api/reflection.html#c.PyEval_GetFrame), which seems to be the case sometimes, when calling `_pythonCallstack` from another thread. This is handled in the subsequent `while (nullptr != frame)` block, but `Py_INCREF`, called before it, [doesn't handle this case](https://docs.python.org/3/c-api/refcounting.html#c.Py_INCREF), so the program segfaults. The safe form of `Py_INCREF` is `Py_XINCREF`, so use that instead ([docs](https://docs.python.org/3/c-api/refcounting.html#c.Py_XINCREF)).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88993
Approved by: https://github.com/albanD
---
 torch/csrc/jit/python/python_tracer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/jit/python/python_tracer.cpp b/torch/csrc/jit/python/python_tracer.cpp
index 83570c85e9b4c..c89d54872a07b 100644
--- a/torch/csrc/jit/python/python_tracer.cpp
+++ b/torch/csrc/jit/python/python_tracer.cpp
@@ -27,7 +27,7 @@ namespace tracer {
 std::vector<StackEntry> _pythonCallstack() {
   pybind11::gil_scoped_acquire gil;
   PyFrameObject* frame = PyEval_GetFrame();
-  Py_INCREF(frame);
+  Py_XINCREF(frame);
   std::vector<StackEntry> entries;
 
   while (nullptr != frame) {

From 5e5adfcef05a693912649cd218cc9ddf8ccd5b8f Mon Sep 17 00:00:00 2001
From: R Max Espinoza <me@rmax.io>
Date: Thu, 17 Nov 2022 01:03:31 +0000
Subject: [PATCH 0994/1922] doc(typo): memroy -> memory (#89126)

Minor typo in comments.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89126
Approved by: https://github.com/kit1980
---
 torch/csrc/jit/codegen/cuda/executor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
index 25e87c91cd25f..23be5f4232aad 100644
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor.cpp
@@ -238,7 +238,7 @@ void FusionExecutor::compileFusion(
 #ifndef USE_ROCM
   device_smem_limit_ = properties->sharedMemPerBlockOptin;
 #else
-  // don't know if rocm supports opt-in shared memroy reconfiguration
+  // don't know if rocm supports opt-in shared memory reconfiguration
   device_smem_limit_ = properties->sharedMemPerBlock;
 #endif
   warp_size_ = properties->warpSize;

From cc781cf8a5806a2bec290264713b29dd084df783 Mon Sep 17 00:00:00 2001
From: John Detloff <jmdetloff@gmail.com>
Date: Thu, 17 Nov 2022 01:06:12 +0000
Subject: [PATCH 0995/1922] Update README.md (#85534)

Our jenkins builds are gone, so this badge is broken and should be removed

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85534
Approved by: https://github.com/ngimel, https://github.com/kit1980
---
 caffe2/README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/caffe2/README.md b/caffe2/README.md
index 0b69eec8191b8..13171fca23bb7 100644
--- a/caffe2/README.md
+++ b/caffe2/README.md
@@ -1,7 +1,5 @@
 # Caffe2
 
-[![Jenkins Build Status](https://ci.pytorch.org/jenkins/job/caffe2-master/lastCompletedBuild/badge/icon)](https://ci.pytorch.org/jenkins/job/caffe2-master)
-
 Caffe2 is a lightweight, modular, and scalable deep learning framework. Building on the original [Caffe](http://caffe.berkeleyvision.org), Caffe2 is designed with expression, speed, and modularity in mind.
 
 ## Questions and Feedback

From 1a97d6cd2ecd0fb5633a55e88f10aa7645462ee4 Mon Sep 17 00:00:00 2001
From: keineahnung2345 <mimifasosofamire1123@gmail.com>
Date: Thu, 17 Nov 2022 01:09:55 +0000
Subject: [PATCH 0996/1922] Fix typo in Dispatcher.h (#89045)

Fix typo in Dispatcher.h: hamespace -> namespace
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89045
Approved by: https://github.com/bdhirsh, https://github.com/kit1980
---
 aten/src/ATen/core/dispatch/Dispatcher.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h
index 6e1c7d754d723..5af8ef1e52ded 100644
--- a/aten/src/ATen/core/dispatch/Dispatcher.h
+++ b/aten/src/ATen/core/dispatch/Dispatcher.h
@@ -735,4 +735,4 @@ struct hash<c10::OperatorHandle> {
   }
 };
 
-} // hamespace std
+} // namespace std

From 0707ae4765ef9fe05596151bb0969d7580e8a090 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Thu, 17 Nov 2022 01:45:48 +0000
Subject: [PATCH 0997/1922] Add pytest-flakefinder as a test dependency
 (#89103)

This is used to re-run tests multiple times to determine their flakiness status. The way re-run is handled in https://github.com/pytorch/pytorch/pull/88646 only applies to unittest

Per their documentation, `pytest-repeat` doesn't work with `unittest.Testcase` it seems, so trying https://github.com/dropbox/pytest-flakefinder instead
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89103
Approved by: https://github.com/clee2000
---
 .circleci/docker/requirements-ci.txt            | 7 ++++++-
 .github/requirements/pip-requirements-macOS.txt | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.circleci/docker/requirements-ci.txt b/.circleci/docker/requirements-ci.txt
index 018a7f6544fda..e527d29d4989b 100644
--- a/.circleci/docker/requirements-ci.txt
+++ b/.circleci/docker/requirements-ci.txt
@@ -159,8 +159,13 @@ pytest-shard
 #Pinned versions:
 #test that import:
 
+pytest-flakefinder==1.1.0
+#Description: plugin for rerunning tests a fixed number of times in pytest
+#Pinned versions: 1.1.0
+#test that import:
+
 pytest-rerunfailures
-#Description: plugin for rerunning tests in pytest
+#Description: plugin for rerunning failure tests in pytest
 #Pinned versions:
 #test that import:
 
diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt
index 7aa2306b1309f..dfbaea260116e 100644
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@@ -14,6 +14,7 @@ pygments==2.12.0
 pytest==7.2.0
 pytest-xdist==3.0.2
 pytest-rerunfailures==10.2
+pytest-flakefinder==1.1.0
 pytest-shard==0.1.2
 scipy==1.9.0
 sympy==1.11.1

From 4d13d8d84eb30de604a954599bf4783d32e217a3 Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Wed, 16 Nov 2022 19:58:30 +0000
Subject: [PATCH 0998/1922] Added conv constraint that infers layouts (#89031)

The core problem that we often have with contiguous/channels-last layouts and convolutions is that Inductor often doesn't do a great job of "preserving" the eager-mode layouts.

So, for example, we'll often have something like
```
a: channels-last
b = foo(a)
c = convolution(a)
```

In eager-mode, `a` would stay channels-last, and we would avoid two transpose copies (one into NHWC and one back into NCHW) within the convolution kernel.

However, Inductor currently sometimes loses the "correct" layout of `b` (not in this simple example, but others). Then, not only will we do a transpose within `foo`, but we'll then immediately transpose it back to do the convolution (and then again once the convolution is done).

This is particularly egregious in `convnext_base`, where there's a lot of mixing of non-channels last tensors and channels-last tensors.

The solution in this PR is to constrain the inputs to `aten.convolution`/`aten.convolution_backward` to match the layouts from eager-mode. This ensures that we'll never do extra transposes *within* `aten.convolution`, which are particularly bad (since Inductor can't fuse them).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89031
Approved by: https://github.com/ngimel, https://github.com/jansel
---
 test/inductor/test_torchinductor.py   |   4 +-
 torch/_inductor/graph.py              |  29 ++++++-
 torch/_inductor/ir.py                 |   3 +
 torch/_inductor/lowering.py           | 118 +++++++++-----------------
 torch/fx/experimental/proxy_tensor.py |   5 ++
 5 files changed, 78 insertions(+), 81 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 1265ca3e78728..651ef9ec016fd 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -65,7 +65,6 @@
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 aten = torch.ops.aten
-
 requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
 
 torch._inductor.config.triton.autotune = False  # too slow
@@ -5088,6 +5087,8 @@ def get_kernels(self, fn, args) -> typing.List[CachingAutotuner]:
             return kernels
 
         def test_divisibile_by_16_covers_numel_args(self):
+            torch._dynamo.reset()
+
             def fn(a: torch.Tensor) -> torch.Tensor:
                 return torch.sum(a)
 
@@ -5107,6 +5108,7 @@ def fn(a: torch.Tensor) -> torch.Tensor:
                 kernels[1].meta["configs"][0].divisible_by_16
             )
             self.assertEqual(arguments_that_are_divisible_by_16_in_kernel1, (0, 1))
+            torch._dynamo.reset()
 
 
 if __name__ == "__main__":
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index e0e41fd8afa5d..5114ffa761117 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -20,7 +20,12 @@
     MissingOperatorWithoutDecomp,
 )
 from .ir import Constant, FixedLayout, InputBuffer, Pointwise, Reduction, TensorBox
-from .lowering import lowerings, make_fallback, needs_realized_inputs
+from .lowering import (
+    layout_constraints,
+    lowerings,
+    make_fallback,
+    needs_realized_inputs,
+)
 from .sizevars import SizeVarAllocator
 from .utils import dynamo_utils, gather_origins
 from .virtualized import V
@@ -301,7 +306,12 @@ def finalize(self):
 
     def run_node(self, n: torch.fx.Node):
         with ir.IRNode.current_origins({n}):
-            result = super().run_node(n)
+            if n.op == "call_function" and n.target in layout_constraints:
+                args, kwargs = self.fetch_args_kwargs_from_env(n)
+                args, kwargs = layout_constraints[n.target](n, *args, **kwargs)
+                result = self.call_function(n.target, args, kwargs)
+            else:
+                result = super().run_node(n)
 
             # Realize if (1) any user need inputs realized, or (2) there is
             # already too many reads and rematerializing can be bad.
@@ -310,7 +320,20 @@ def run_node(self, n: torch.fx.Node):
                 for user in n.users:
                     if user.target in needs_realized_inputs:
                         result.realize_hint()
-                    elif user.op == "output":
+                        # This inclusion is somewhat controversial (from
+                        # discussion between Horace, Natalia, and Elias).
+                        # Currently, it's not very clear why this is helpful.
+                        # The general idea here is that even though a node may
+                        # have FlexibleLayout, we still often *treat* it as if
+                        # it was contiguous. This appears to sometime result in
+                        # suboptimal behavior.
+                        #
+                        # When we do a better job selecting layout, we should
+                        # revisit this.
+                        result = ir.ExternKernel.require_stride_order(
+                            result, ir.get_stride_order(n.meta["val"].stride())
+                        )
+                    if user.op == "output":
                         if isinstance(result.data.data, (Pointwise, Reduction)):
                             result.realize()
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 8327fe0d7b521..d547246717689 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2478,6 +2478,9 @@ def require_stride1(cls, x):
 
     @classmethod
     def require_stride_order(cls, x, order):
+        if x.get_numel() == 0:  # Layout doesn't matter
+            return x
+
         # require x to have the layout as strided_ordered as order
         if is_storage_and_layout(x):
             if isinstance(
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 75d4e471e5bb9..5168f37cd392c 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -23,7 +23,6 @@
 from .decomposition import decompositions, get_decompositions
 from .ir import (
     ExpandView,
-    get_stride_order,
     IndexingConstant,
     IndexingDiv,
     PermuteView,
@@ -38,6 +37,7 @@
 
 log = logging.getLogger(__name__)
 lowerings = {}
+layout_constraints = {}
 fallbacks = set()
 aten = torch.ops.aten
 prims = torch.ops.prims
@@ -53,6 +53,14 @@ def add_needs_realized_inputs(fn):
             needs_realized_inputs.add(getattr(fn, overload))
 
 
+def add_layout_constraint(fn, constraint):
+    if isinstance(fn, torch._ops.OpOverloadPacket):
+        for overload in fn.overloads():
+            layout_constraints[getattr(fn, overload)] = constraint
+    else:
+        layout_constraints[fn] = constraint
+
+
 add_needs_realized_inputs(
     [
         aten.as_strided,
@@ -1013,12 +1021,10 @@ def linear_binary(x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr):
 register_onednn_fusion_ops()
 
 
-def fallback_handler(kernel, inps_hook=None):
+def fallback_handler(kernel):
     fallbacks.add(kernel)
 
     def handler(*args, **kwargs):
-        if inps_hook is not None:
-            args, kwargs = inps_hook(*args, **kwargs)
         return pytree.tree_map(
             TensorBox.create, ir.FallbackKernel.create(kernel, *args, **kwargs)
         )
@@ -1026,7 +1032,7 @@ def handler(*args, **kwargs):
     return handler
 
 
-def make_fallback(kernel, inps_hook=None):
+def make_fallback(kernel, layout_constraint=None):
     assert (
         kernel not in decompositions
     ), f"both a fallback and a decomp for same kernel: {kernel}"
@@ -1036,9 +1042,9 @@ def make_fallback(kernel, inps_hook=None):
         )
 
     add_needs_realized_inputs(kernel)
-    return register_lowering(kernel, type_promotion_kind=None)(
-        fallback_handler(kernel, inps_hook)
-    )
+    if layout_constraint is not None:
+        add_layout_constraint(kernel, layout_constraint)
+    return register_lowering(kernel, type_promotion_kind=None)(fallback_handler(kernel))
 
 
 @register_lowering(aten.native_dropout, type_promotion_kind=None)
@@ -1189,72 +1195,14 @@ def inner_fn(index):
     )
 
 
-def conv_backward(*args, **kwargs):
-    # output striding complex and has a lot of build dependent options,
-    # take the output strides to determine what to set the inputs
-    with torch._subclasses.FakeTensorMode():
-        args_fake, kwargs_fake = pytree.tree_map_only(
-            ir.IRNode,
-            lambda t: ir.ir_node_to_tensor(t, guard_shape=False),
-            (args, kwargs),
-        )
-        output = aten.convolution_backward(*args_fake, **kwargs_fake)
-
-    def constraints(
-        grad_output,
-        input,
-        weight,
-        bias_sizes,
-        stride,
-        padding,
-        dilation,
-        transposed,
-        output_padding,
-        groups,
-        output_mask,
-    ):
-        out = (
-            output[0]
-            if output[0] is not None
-            else output[1]
-            if output[1] is not None
-            else output[2]
-        )
-        if out is not None:
-            stride_order = get_stride_order(out.stride())
-            grad_output = ir.ExternKernel.require_stride_order(
-                grad_output, stride_order
-            )
-            weight = ir.ExternKernel.require_stride_order(weight, stride_order)
-            # Only make input contiguous when it is necessary for the backwards computation
-            if output_mask[1]:
-                input = ir.ExternKernel.require_stride_order(input, stride_order)
-
-        return (
-            grad_output,
-            input,
-            weight,
-            bias_sizes,
-            stride,
-            padding,
-            dilation,
-            transposed,
-            output_padding,
-            groups,
-            output_mask,
-        ), {}
-
-    return constraints(*args, **kwargs)
-
-
-def require_dense(*args, **kwargs):
+def require_dense(_, *args, **kwargs):
     args, kwargs = pytree.tree_map_only(
         ir.IRNode, lambda t: ir.ExternKernel.require_stride1(t), (args, kwargs)
     )
     return args, kwargs
 
 
-def require_contiguous(*args, **kwargs):
+def require_contiguous(_, *args, **kwargs):
     args, kwargs = pytree.tree_map_only(
         ir.IRNode, lambda t: ir.ExternKernel.require_contiguous(t), (args, kwargs)
     )
@@ -1264,26 +1212,42 @@ def require_contiguous(*args, **kwargs):
 if has_torchvision_roi_align():
     make_fallback(torch.ops.torchvision.roi_align)
 
+
+def constrain_to_fx_strides(fx_node, *args, **kwargs):
+    def apply_constraint(arg, fx_arg):
+        if isinstance(arg, ir.IRNode):
+            stride_order = ir.get_stride_order(fx_arg.meta["val"].stride())
+            return ir.ExternKernel.require_stride_order(arg, stride_order)
+        return arg
+
+    args = [apply_constraint(arg, fx_arg) for arg, fx_arg in zip(args, fx_node.args)]
+    kwargs = {k: apply_constraint(v, fx_node.kwargs[k]) for k, v in kwargs.items()}
+    return args, kwargs
+
+
 # TODO(jansel): we should implement decomps or lowerings for these
 # https://github.com/pytorch/torchdynamo/issues/327
 make_fallback(aten._adaptive_avg_pool2d_backward, require_dense)
-make_fallback(aten.convolution_backward, inps_hook=conv_backward)
+make_fallback(aten.convolution_backward, constrain_to_fx_strides)
 make_fallback(aten._cudnn_rnn, require_dense)
-make_fallback(aten._cudnn_rnn_backward, inps_hook=require_contiguous)
-make_fallback(aten.cumsum, inps_hook=require_dense)
-make_fallback(aten._embedding_bag, inps_hook=require_contiguous)
-make_fallback(aten._embedding_bag_forward_only, inps_hook=require_contiguous)
+make_fallback(aten._cudnn_rnn_backward, require_contiguous)
+make_fallback(aten.cumsum, require_dense)
+make_fallback(aten._embedding_bag, require_contiguous)
+make_fallback(aten._embedding_bag_forward_only, require_contiguous)
 make_fallback(aten._fused_moving_avg_obs_fq_helper)
 make_fallback(aten._fused_moving_avg_obs_fq_helper_functional)
-make_fallback(aten.grid_sampler_2d_backward, inps_hook=require_dense)
+make_fallback(aten.grid_sampler_2d_backward, require_dense)
 make_fallback(aten.randperm)
 make_fallback(aten.sort)
 make_fallback(aten.sort.stable)
 make_fallback(aten._sparse_coo_tensor_with_dims_and_tensors)
-make_fallback(aten._thnn_fused_lstm_cell, inps_hook=require_dense)
+make_fallback(aten._thnn_fused_lstm_cell, require_dense)
 make_fallback(aten.topk)
-make_fallback(aten.upsample_bicubic2d_backward, inps_hook=require_contiguous)
-make_fallback(aten.upsample_bilinear2d_backward, inps_hook=require_dense)
+make_fallback(aten.upsample_bicubic2d_backward, require_contiguous)
+make_fallback(aten.upsample_bilinear2d_backward, require_dense)
+
+
+add_layout_constraint(aten.convolution, constrain_to_fx_strides)
 
 
 @register_lowering(aten.convolution)
diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py
index c835607548900..8a51294c5a8fd 100644
--- a/torch/fx/experimental/proxy_tensor.py
+++ b/torch/fx/experimental/proxy_tensor.py
@@ -118,6 +118,11 @@ def set_meta(proxy, val):
     elif isinstance(val, torch.Tensor):
         if not val.is_sparse:
             proxy.node.meta['tensor_meta'] = _extract_tensor_metadata(val)
+            # NB: Kinda hacky, but we should try to get val as the metadata
+            # everywhere
+            fake_tensor_mode = FakeTensorMode(allow_fallback_kernels=True)
+            with fake_tensor_mode:
+                proxy.node.meta['val'] = torch.empty_strided(val.shape, val.stride(), device=val.device, dtype=val.dtype)
     return proxy
 
 def thunkify(f, *args, **kwargs):

From 900f2bdfe05274eea2712f9154a1032bda88c5f7 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Thu, 17 Nov 2022 01:55:03 +0000
Subject: [PATCH 0999/1922] Fix typos in messages under test (#89121)

This PR fixes typos of messages in `.cpp` and `.py` files under test directory.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89121
Approved by: https://github.com/mruberry, https://github.com/kit1980
---
 test/cpp/c10d/ProcessGroupNCCLTest.cpp           | 2 +-
 test/cpp/jit/test_custom_class_registrations.cpp | 2 +-
 test/inductor/test_torchinductor.py              | 2 +-
 test/jit/test_hooks.py                           | 2 +-
 test/lazy/test_extract_compiled_graph.py         | 2 +-
 test/mobile/test_lite_script_module.py           | 4 ++--
 test/onnx/test_pytorch_onnx_onnxruntime.py       | 2 +-
 test/quantization/fx/test_quantize_fx.py         | 2 +-
 test/scripts/run_cuda_memcheck.py                | 2 +-
 test/test_sparse.py                              | 2 +-
 test/test_type_promotion.py                      | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/test/cpp/c10d/ProcessGroupNCCLTest.cpp b/test/cpp/c10d/ProcessGroupNCCLTest.cpp
index 0d566344f2ced..083c4770e0ae3 100644
--- a/test/cpp/c10d/ProcessGroupNCCLTest.cpp
+++ b/test/cpp/c10d/ProcessGroupNCCLTest.cpp
@@ -355,7 +355,7 @@ void testAllreduce(const std::string& path, int rank, int size) {
     const auto* const data = tensor.data_ptr<float>();
     for (const auto k : c10::irange(tensor.numel())) {
       EXPECT_EQ(data[k], expected)
-          << "Allreduce ouputs do not match expected outputs";
+          << "Allreduce outputs do not match expected outputs";
     }
   }
 }
diff --git a/test/cpp/jit/test_custom_class_registrations.cpp b/test/cpp/jit/test_custom_class_registrations.cpp
index 63c6b70133062..16e690d99d8a1 100644
--- a/test/cpp/jit/test_custom_class_registrations.cpp
+++ b/test/cpp/jit/test_custom_class_registrations.cpp
@@ -222,7 +222,7 @@ struct ElementwiseInterpreter : torch::CustomClassHolder {
     }
 
     if (!output_name_) {
-      throw std::runtime_error("Output name not specififed!");
+      throw std::runtime_error("Output name not specified!");
     }
 
     return environment.at(*output_name_);
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 651ef9ec016fd..fb7ca1fc92b73 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -3298,7 +3298,7 @@ def fn(in_ptr0, in_ptr1, in_ptr2):
             ),
         )
 
-    @unittest.skipIf(not has_torchvision_roi_align(), "requirs torchvision")
+    @unittest.skipIf(not has_torchvision_roi_align(), "requires torchvision")
     def test_roi_align(self):
         def fn(a, b):
             return torch.ops.torchvision.roi_align(a, b, 0.25, 7, 7, 2, False)
diff --git a/test/jit/test_hooks.py b/test/jit/test_hooks.py
index 109a5e3f1b716..2963837a638a6 100644
--- a/test/jit/test_hooks.py
+++ b/test/jit/test_hooks.py
@@ -229,7 +229,7 @@ def pre_hook(self, input: Tuple[str]) -> Tuple[str]:
 
         with self.assertRaisesRegex(
             RuntimeError,
-            "This error occured while scripting the forward pre-hook 'pre_hook'",
+            "This error occurred while scripting the forward pre-hook 'pre_hook'",
         ):
             torch.jit.script(m)
 
diff --git a/test/lazy/test_extract_compiled_graph.py b/test/lazy/test_extract_compiled_graph.py
index f4152d0af68bf..b27a11bf49b61 100644
--- a/test/lazy/test_extract_compiled_graph.py
+++ b/test/lazy/test_extract_compiled_graph.py
@@ -141,7 +141,7 @@ def verify_reusing_compiled_graph(mod, exception_msg_pattern, ncase=10):
             raise e  # reraise the exception
         exception_message = str(e)
         if not re.search(exception_msg_pattern, exception_message):
-            raise RuntimeError(f"Expection message does not match the required pattern: {exception_message}")
+            raise RuntimeError(f"Exception message does not match the required pattern: {exception_message}")
         else:
             # We are done for the test case that expects an exception
             return
diff --git a/test/mobile/test_lite_script_module.py b/test/mobile/test_lite_script_module.py
index 638ac37eb88b3..9089977b77f12 100644
--- a/test/mobile/test_lite_script_module.py
+++ b/test/mobile/test_lite_script_module.py
@@ -241,7 +241,7 @@ def forward(self):
 
         script_module = torch.jit.script(MyTestModuleForListWithModuleClass())
         with self.assertRaisesRegex(RuntimeError,
-                                    r"^Returining a list or dictionary with pytorch class type "
+                                    r"^Returning a list or dictionary with pytorch class type "
                                     r"is not supported in mobile module "
                                     r"\(List\[Foo\] or Dict\[int\, Foo\] for class Foo\(torch\.nn\.Module\)\)\. "
                                     r"Workaround\: instead of using pytorch class as their element type\, "
@@ -264,7 +264,7 @@ def forward(self):
 
         script_module = torch.jit.script(MyTestModuleForDictWithModuleClass())
         with self.assertRaisesRegex(RuntimeError,
-                                    r"^Returining a list or dictionary with pytorch class type "
+                                    r"^Returning a list or dictionary with pytorch class type "
                                     r"is not supported in mobile module "
                                     r"\(List\[Foo\] or Dict\[int\, Foo\] for class Foo\(torch\.nn\.Module\)\)\. "
                                     r"Workaround\: instead of using pytorch class as their element type\, "
diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 7ae9d8edaccc6..16839dded0c40 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -9168,7 +9168,7 @@ def forward(self, x, y, cond):
         )
 
     @skipScriptTest(
-        skip_before_opset_version=11, reason="dynamic split support addded in 11"
+        skip_before_opset_version=11, reason="dynamic split support added in 11"
     )
     def test_split_tensor_scalar(self):
         class SplitModel(torch.nn.Module):
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 6cee5e95f21cd..b03b7fb0cf0e9 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -491,7 +491,7 @@ def forward(self, x):
 
         self.checkGraphModuleNodes(m, expected_node=ns.call_module(torch.nn.intrinsic.modules.fused.LinearReLU))
 
-    @unittest.skip("Temprorarily skipping the test case, will enable after the simple"
+    @unittest.skip("Temporarily skipping the test case, will enable after the simple"
                    "pattern format is supported")
     def test_fuse_addtional_fuser_method(self):
         class MyConvReLU(torch.nn.Module):
diff --git a/test/scripts/run_cuda_memcheck.py b/test/scripts/run_cuda_memcheck.py
index 10202e416d008..7d882b8c1fff4 100755
--- a/test/scripts/run_cuda_memcheck.py
+++ b/test/scripts/run_cuda_memcheck.py
@@ -119,7 +119,7 @@ async def run1(coroutine_id):
         gpuid = coroutine_id % GPUS
     else:
         gpu_assignments = args.gpus.split(':')
-        assert args.nproc == len(gpu_assignments), 'Please specify GPU assignmnent for each process, separated by :'
+        assert args.nproc == len(gpu_assignments), 'Please specify GPU assignment for each process, separated by :'
         gpuid = gpu_assignments[coroutine_id]
 
     while progress < len(ALL_TESTS):
diff --git a/test/test_sparse.py b/test/test_sparse.py
index a2b623e2508eb..4bfccaff0e2c9 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -3708,7 +3708,7 @@ def check_empty(sparse_shape, nnz, dense_shape, coalesce):
             check(self, s, d)
             check_empty(shape, nnz, sub_shape, coalesced)
 
-    @unittest.skipIf(not TEST_NUMPY, "NumPy is not availible")
+    @unittest.skipIf(not TEST_NUMPY, "NumPy is not available")
     @onlyCPU
     @dtypes(*all_types_and_complex_and(torch.bool))
     def test_sparse_spdiags(self, device, dtype):
diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py
index b351f2d6d494a..1d80556a7d48f 100644
--- a/test/test_type_promotion.py
+++ b/test/test_type_promotion.py
@@ -473,7 +473,7 @@ def _get_dtype(x):
             elif isinstance(x, complex):
                 return torch.complex64
             else:
-                raise AssertionError(f"Unkonwn type {x}")
+                raise AssertionError(f"Unknown type {x}")
 
         # tensor against tensor
         a_tensor = torch.tensor((0, 1), device=device, dtype=dtypes[0])

From 82123403c749c1c01983099b8d0c8c8bc7878e97 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Thu, 17 Nov 2022 02:02:26 +0000
Subject: [PATCH 1000/1922] Add comprehensive minifier tests (#88022)

Adds tests for https://github.com/pytorch/torchdynamo/issues/1241.

To run: `pytest test/dynamo/test_minifier.py`.

Actually runs minifier launcher script and repro scripts, rather than just checking for existence of the minifier launcher script.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88022
Approved by: https://github.com/mlazos, https://github.com/anijain2305
---
 test/dynamo/test_minifier.py          | 388 ++++++++++++++++++++------
 test/inductor/test_minifier.py        | 211 ++++++++++++++
 torch/_dynamo/debug_utils.py          |  78 +++++-
 torch/_dynamo/test_minifier_common.py | 131 +++++++++
 4 files changed, 704 insertions(+), 104 deletions(-)
 create mode 100644 test/inductor/test_minifier.py
 create mode 100644 torch/_dynamo/test_minifier_common.py

diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index 0cec7d202a9d4..c1a56f070be5d 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -1,111 +1,315 @@
 # Owner(s): ["module: dynamo"]
-import os
-import shutil
+import functools
+import re
+import textwrap
 import unittest
-from unittest.mock import patch
 
 import torch
 import torch._dynamo
-import torch._dynamo.test_case
-import torch._dynamo.testing
-from torch._dynamo.optimizations.backends import create_backend
+from torch._dynamo.test_minifier_common import MinifierTestBase
 
+requires_cuda = functools.partial(
+    unittest.skipIf, not torch.cuda.is_available(), "requires cuda"
+)
 
-class MockModule(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
+RELU_COMPILE_ERROR_BACKEND = """\
+from torch._dynamo.optimizations.backends import register_backend
 
-    def forward(self, x):
-        for _ in range(10):
-            x = torch.sin(x)
-        x = torch._foobar(x)
-        for _ in range(10):
-            x = torch.cos(x)
-        return x
+class DynamoCompileError(Exception):
+    pass
 
+@register_backend
+def test_relu_compile_error(gm: torch.fx.GraphModule, example_inputs):
+    for node in gm.graph.nodes:
+        if node.target == torch.relu:
+            raise DynamoCompileError("relu found")
+    return gm
+"""
 
-class MinfierTests(torch._dynamo.test_case.TestCase):
+RELU_RUNTIME_ERROR_BACKEND = """\
+import copy
+from torch._dynamo.optimizations.backends import register_backend
+
+@register_backend
+def test_relu_runtime_error(gm: torch.fx.GraphModule, example_inputs):
+    gm = copy.deepcopy(gm)
+    for node in gm.graph.nodes:
+        if node.target == torch.relu:
+            node.target = torch._assert
+            node.args = (False, "DynamoRuntimeError")
+    gm.recompile()
+    return gm
+"""
+
+RELU_ACCURACY_ERROR_BACKEND = """\
+import copy
+from torch._dynamo.optimizations.backends import register_backend
+
+@register_backend
+def test_relu_accuracy_error(gm: torch.fx.GraphModule, example_inputs):
+    gm = copy.deepcopy(gm)
+    for node in gm.graph.nodes:
+        if node.target == torch.relu:
+            node.target = torch.add
+            node.args = (node.args[0], 1)
+    gm.recompile()
+
+    return gm
+"""
+
+RELU_CUSTOM_ERROR_BACKEND = """\
+class CustomError(Exception):
+    pass
+
+def test_relu_custom_error(gm: torch.fx.GraphModule, example_inputs):
+    for node in gm.graph.nodes:
+        if node.target == torch.relu:
+            raise CustomError("relu found")
+    return gm
+"""
+
+
+class MinifierTests(MinifierTestBase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
-        cls._exit_stack.enter_context(
-            unittest.mock.patch.object(
-                torch._dynamo.config,
-                "debug_dir_root",
-                "/tmp/_torchdynamo_debug_/",
-            )
-        )
 
     @classmethod
     def tearDownClass(cls):
-        shutil.rmtree(torch._dynamo.config.debug_dir_root, ignore_errors=True)
-        cls._exit_stack.close()
-
-    def setUp(self):
-        super().setUp()
-
-    def tearDown(self):
-        super().tearDown()
-
-    def test_after_dynamo(self):
-        @create_backend
-        def bad_dynamo_backend(subgraph):
-            import sys
-
-            def f(*args):
-                # Shifted the forced exception to runtime as this is more common
-                # in JIT compilers.
-                for node in subgraph.model.graph.nodes:
-                    if node.op == "call_function" and node.target is torch._foobar:
-                        sys.stdout.write("Dynamo compiled failed\n")
-                        raise NotImplementedError("foobar is not implemented")
-                return subgraph.model(*args)
-
-            return f
-
-        mod = MockModule()
-        opt_mod = torch._dynamo.optimize("bad_dynamo_backend")(mod)
-        repro_file = torch._dynamo.debug_utils.get_minifier_repro_path()
-
-        @patch.object(torch._dynamo.config, "repro_after", "dynamo")
-        def inner():
-            x = torch.randn(4)
-            try:
-                opt_mod(x)
-            except Exception:
-                pass
-
-        inner()
-        self.assertTrue(os.path.exists(repro_file))
-
-    # If error_at_aot is True, an error will be produced when AOTAutograd
-    # attempts to generate the backward graph.
-    # If error_after_aot is False, an error will be produced in inductor.
-    def _test_around_aot(self, error_at_aot):
-        mod = MockModule()
-        opt_mod = torch._dynamo.optimize("inductor")(mod)
-
-        repro_file = torch._dynamo.debug_utils.get_minifier_repro_path()
-        repro_after = "dynamo" if error_at_aot else "aot"
-
-        @patch.object(torch._dynamo.config, "repro_after", repro_after)
-        def inner():
-            x = torch.randn(4)
-            x.requires_grad = error_at_aot
-            try:
-                opt_mod(x)
-            except Exception:
-                pass
-
-        inner()
-
-        self.assertTrue(os.path.exists(repro_file))
-
-    def test_at_aot(self):
-        self._test_around_aot(True)
-
-    def test_after_aot(self):
-        self._test_around_aot(False)
+        super().tearDownClass()
+
+    # Test that compile, runtime, and accuracy errors after dynamo can be repro'd (both CPU and CUDA)
+    def _test_after_dynamo(self, device, repro_level, backend_code, error_name):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("{self._get_fn_name(backend_code)}")
+            def inner(x):
+                for _ in range(10):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(10):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+
+        (test_proc, _, repro_proc), _ = self._run_full_test(
+            run_code, "dynamo", repro_level, backend_code
+        )
+
+        self.assertIn(error_name, test_proc.stderr.decode("utf-8"))
+        self.assertIn(error_name, repro_proc.stderr.decode("utf-8"))
+
+    def test_after_dynamo_cpu_compile_error(self):
+        self._test_after_dynamo(
+            "cpu", 2, RELU_COMPILE_ERROR_BACKEND, "DynamoCompileError"
+        )
+
+    def test_after_dynamo_cpu_runtime_error(self):
+        self._test_after_dynamo(
+            "cpu", 2, RELU_RUNTIME_ERROR_BACKEND, "DynamoRuntimeError"
+        )
+
+    def test_after_dynamo_cpu_accuracy_error(self):
+        self._test_after_dynamo("cpu", 4, RELU_ACCURACY_ERROR_BACKEND, "AccuracyError")
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_compile_error(self):
+        self._test_after_dynamo(
+            "cuda", 2, RELU_COMPILE_ERROR_BACKEND, "DynamoCompileError"
+        )
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_runtime_error(self):
+        self._test_after_dynamo(
+            "cuda", 2, RELU_RUNTIME_ERROR_BACKEND, "DynamoRuntimeError"
+        )
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_accuracy_error(self):
+        self._test_after_dynamo("cuda", 4, RELU_ACCURACY_ERROR_BACKEND, "AccuracyError")
+
+    # Ensure that the testing backends pass when relu is not present.
+    def _test_after_dynamo_backend_passes(self, device, repro_level, backend_code):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("{self._get_fn_name(backend_code)}")
+            def inner(x):
+                for _ in range(10):
+                    x = torch.sin(x)
+                for _ in range(10):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+
+        test_code = self._gen_test_code(run_code, "dynamo", repro_level, backend_code)
+        proc, repro_dir = self._run_test_code(test_code)
+        self.assertEqual(proc.returncode, 0)
+        self.assertIsNone(repro_dir)
+
+    def test_after_dynamo_cpu_compile_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cpu", 2, RELU_COMPILE_ERROR_BACKEND)
+
+    def test_after_dynamo_cpu_runtime_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cpu", 2, RELU_RUNTIME_ERROR_BACKEND)
+
+    def test_after_dynamo_cpu_accuracy_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cpu", 4, RELU_ACCURACY_ERROR_BACKEND)
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_compile_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cuda", 2, RELU_COMPILE_ERROR_BACKEND)
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_runtime_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cuda", 2, RELU_RUNTIME_ERROR_BACKEND)
+
+    @requires_cuda()
+    def test_after_dynamo_cuda_accuracy_backend_passes(self):
+        self._test_after_dynamo_backend_passes("cuda", 4, RELU_ACCURACY_ERROR_BACKEND)
+
+    # Ensure that generated code with a custom backends generates a runnable minifier
+    # launcher script that results in a RuntimeError
+    def test_after_dynamo_custom_backend(self):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize({self._get_fn_name(RELU_CUSTOM_ERROR_BACKEND)})
+            def inner(x):
+                for _ in range(10):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(10):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20))
+        """
+        )
+
+        test_code = self._gen_test_code(
+            run_code, "dynamo", 2, RELU_CUSTOM_ERROR_BACKEND
+        )
+        _, repro_dir = self._run_test_code(test_code)
+        launch_proc, _ = self._run_minifier_launcher("", repro_dir)
+        self.assertIn("RuntimeError", launch_proc.stderr.decode("utf-8"))
+
+    # Test that a module with mixed cpu/cuda parts with an error after dynamo can be repro'd
+    @requires_cuda()
+    def test_cpu_cuda_module_after_dynamo(self):
+        backend_name = self._get_fn_name(RELU_COMPILE_ERROR_BACKEND)
+
+        run_code = textwrap.dedent(
+            f"""\
+            class CpuCudaModule(torch.nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.m_x = torch.nn.Linear(20, 20).cuda()
+                    self.m_y = torch.nn.Linear(20, 20)
+                    self.p_x = torch.nn.Parameter(torch.randn(20, 20).cuda())
+                    self.p_y = torch.nn.Parameter(torch.randn(20, 20))
+                    self.register_buffer("b_x", torch.ones(20, 20).cuda())
+                    self.register_buffer("b_y", torch.ones(20, 20))
+
+                def forward(self, x, y):
+                    return self.m_x(x) + self.p_x + self.b_x, self.m_y(y) + self.p_y + self.b_y
+
+            mod = CpuCudaModule()
+
+            @torch._dynamo.optimize("{backend_name}")
+            def inner(x1, y1):
+                x2 = torch.randn(20, 20).cuda()
+                y2 = torch.randn(20, 20)
+                x3, y3 = mod(x1 + x2, y1 + y2)
+                return torch.relu(x3.cpu() + y3)
+
+            inner(torch.randn(20, 20).cuda(), torch.randn(20, 20))
+        """
+        )
+
+        (test_proc, _, repro_proc), (launch_code, _) = self._run_full_test(
+            run_code, "dynamo", 2, RELU_COMPILE_ERROR_BACKEND
+        )
+
+        tb1 = test_proc.stderr.decode("utf-8")
+        tb2 = repro_proc.stderr.decode("utf-8")
+
+        # Check if generated minifier code covers all cpu/cuda cases
+        self.assertIsNotNone(re.search(r"args.*cuda", launch_code))
+        self.assertIsNotNone(re.search(r"args.*cpu", launch_code))
+        # search for Linear(...).cuda()
+        self.assertIsNotNone(re.search(r"Linear.*cuda", launch_code))
+        # search for Linear(...)
+        self.assertIsNotNone(
+            re.search(r"Linear(?!.*cuda.*$)", launch_code, re.MULTILINE)
+        )
+        self.assertIsNotNone(re.search(r"register_buffer.*cuda", launch_code))
+        self.assertIsNotNone(
+            re.search(r"register_buffer(?!.*cuda.*$)", launch_code, re.MULTILINE)
+        )
+        self.assertIsNotNone(re.search(r"Parameter.*cuda", launch_code))
+        self.assertIsNotNone(
+            re.search(r"Parameter(?!.*cuda.*$)", launch_code, re.MULTILINE)
+        )
+        # search for
+        # <name> = torch.randn(...)
+        # ... = <name>.cuda()
+        self.assertIsNotNone(
+            re.search(r"(\w+) = torch.randn.*\1\.cuda", launch_code, re.DOTALL)
+        )
+        # search for
+        # <name> = torch.randn(...)
+        # no followup call to <name>.cuda()
+        self.assertIsNotNone(
+            re.search(
+                r"(\w+) = torch.randn(?!.*\1\.cuda\(\).*$)", launch_code, re.DOTALL
+            )
+        )
+
+        self.assertIn(backend_name, tb1)
+        self.assertIn(backend_name, tb2)
+
+    # Test if we can actually get a minified graph
+    def test_if_graph_minified(self):
+        backend_name = self._get_fn_name(RELU_COMPILE_ERROR_BACKEND)
+
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("{backend_name}")
+            def inner(x):
+                for _ in range(20):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(20):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20))
+        """
+        )
+
+        (test_proc, _, repro_proc), (launch_code, repro_code) = self._run_full_test(
+            run_code, "dynamo", 2, RELU_COMPILE_ERROR_BACKEND
+        )
+
+        tb1 = test_proc.stderr.decode("utf-8")
+        tb2 = repro_proc.stderr.decode("utf-8")
+
+        self.assertIn(backend_name, tb1)
+        self.assertIn(backend_name, tb2)
+
+        # compare the length of the forward functions
+        match = re.search(r"def forward.*return", launch_code, re.DOTALL)
+        self.assertIsNotNone(match)
+        self.assertGreater(match.group(0).count("\n"), 40)
+
+        match = re.search(r"def forward.*return", repro_code, re.DOTALL)
+        self.assertIsNotNone(match)
+        self.assertLess(match.group(0).count("\n"), 5)
 
 
 if __name__ == "__main__":
diff --git a/test/inductor/test_minifier.py b/test/inductor/test_minifier.py
new file mode 100644
index 0000000000000..55c0a1b6bb05f
--- /dev/null
+++ b/test/inductor/test_minifier.py
@@ -0,0 +1,211 @@
+# Owner(s): ["module: inductor"]
+import functools
+import textwrap
+import unittest
+
+import torch
+import torch._dynamo
+import torch._inductor.utils
+from torch._dynamo.test_minifier_common import MinifierTestBase
+from torch.testing._internal.common_utils import IS_MACOS
+
+_HAS_TRITON = torch._inductor.utils.has_triton()
+requires_cuda = functools.partial(unittest.skipIf, not _HAS_TRITON, "requires cuda")
+
+CPP_COMPILE_ERROR = """\
+def cpp_compile_error(x):
+    return "compile error!"
+"""
+
+CPP_RUNTIME_ERROR = """\
+def cpp_runtime_error(x):
+    return f"{x}; throw 1"
+"""
+
+CPP_ACCURACY_ERROR = """\
+def cpp_accuracy_error(x):
+    return f"{x} + 1"
+"""
+
+TRITON_COMPILE_ERROR = """\
+def triton_compile_error(x):
+    return "compile error!"
+"""
+
+# NOTE: there is currently not an easy way to cause a triton runtime error.
+TRITON_RUNTIME_ERROR = """\
+def triton_runtime_error(x):
+    return f"{x}; assert?"
+"""
+
+TRITON_ACCURACY_ERROR = """\
+def triton_accuracy_error(x):
+    return f"{x} + 1"
+"""
+
+
+class MinifierTests(MinifierTestBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+
+    # Generates code that patches CppOverrides/TritonOverrides.
+    def _gen_codegen_fn_patch_code(self, old_fn_name, new_fn_code, device):
+        new_fn_name = self._get_fn_name(new_fn_code)
+        if new_fn_name is not None:
+            patch_code = f"""\
+import torch._inductor.codegen.{"cpp" if device == "cpu" else "triton"} as codegen
+overrides = codegen.{"CppOverrides" if device == "cpu" else "TritonOverrides"}
+{new_fn_code}
+overrides.{old_fn_name} = staticmethod({new_fn_name})
+"""
+        return f"""\
+{patch_code}
+isolate_fails_code_str = \"\"\"\\
+{patch_code}
+torch._dynamo.config.debug_dir_root = "{self.DEBUG_DIR}"
+\"\"\"
+"""
+
+    # Test that compile and accuracy errors after aot can be repro'd (both CPU and CUDA)
+    def _test_after_aot(self, device, backend_code, repro_level):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("inductor")
+            def inner(x):
+                for _ in range(3):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(3):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+        patch_code = self._gen_codegen_fn_patch_code("relu", backend_code, device)
+        self.assertIsNotNone(patch_code)
+        (test_proc, _, repro_proc), _ = self._run_full_test(
+            run_code, "aot", repro_level, patch_code
+        )
+        return (
+            (test_proc.stderr.decode("utf-8"), repro_proc.stderr.decode("utf-8")),
+            (test_proc.returncode, repro_proc.returncode),
+        )
+
+    def test_after_aot_cpu_compile_error(self):
+        (tb1, tb2), _ = self._test_after_aot("cpu", CPP_COMPILE_ERROR, 2)
+        self.assertIn("CppCompileError", tb1)
+        self.assertIn("CppCompileError", tb2)
+
+    def test_after_aot_cpu_accuracy_error(self):
+        (tb1, tb2), _ = self._test_after_aot("cpu", CPP_ACCURACY_ERROR, 4)
+        self.assertIn("AccuracyError", tb1)
+        self.assertIn("AccuracyError", tb2)
+
+    @requires_cuda()
+    def test_after_aot_cuda_compile_error(self):
+        (tb1, tb2), _ = self._test_after_aot("cuda", TRITON_COMPILE_ERROR, 2)
+        self.assertIn("SyntaxError", tb1)
+        self.assertIn("SyntaxError", tb2)
+
+    @requires_cuda()
+    def test_after_aot_cuda_accuracy_error(self):
+        (tb1, tb2), _ = self._test_after_aot("cuda", TRITON_ACCURACY_ERROR, 4)
+        self.assertIn("AccuracyError", tb1)
+        self.assertIn("AccuracyError", tb2)
+
+    # Test that runtime errors after aot can be repro'd (CPU only for now)
+    def _test_after_aot_runtime_error(self, device, backend_code):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("inductor")
+            def inner(x):
+                for _ in range(3):
+                    x = torch.sin(x)
+                x = torch.relu(x)
+                for _ in range(3):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+        patch_code = self._gen_codegen_fn_patch_code("relu", backend_code, device)
+        self.assertIsNotNone(patch_code)
+
+        (test_proc, _, repro_proc), _ = self._run_full_test(
+            run_code, "aot", 3, patch_code
+        )
+
+        self.assertNotIn("CompilerError", test_proc.stderr.decode("utf-8"))
+
+        self.assertEqual(test_proc.returncode, repro_proc.returncode)
+        self.assertNotEqual(test_proc.returncode, 0)
+
+    def test_after_aot_cpu_runtime_error(self):
+        self._test_after_aot_runtime_error("cpu", CPP_RUNTIME_ERROR)
+
+    # NOTE: there is currently not an easy way to cause a triton runtime error.
+    @unittest.skip
+    @requires_cuda()
+    def test_after_aot_cuda_runtime_error(self):
+        self._test_after_aot_runtime_error("cuda", TRITON_RUNTIME_ERROR)
+
+    # Ensure that inductor codegen patches pass when relu is not present.
+    def _test_after_aot_backend_passes(self, device, repro_level, backend_code):
+        run_code = textwrap.dedent(
+            f"""\
+            @torch._dynamo.optimize("inductor")
+            def inner(x):
+                for _ in range(3):
+                    x = torch.sin(x)
+                for _ in range(3):
+                    x = torch.cos(x)
+                return x
+
+            inner(torch.randn(20, 20).to("{device}"))
+        """
+        )
+        patch_code = self._gen_codegen_fn_patch_code("relu", backend_code, device)
+        self.assertIsNotNone(patch_code)
+
+        test_code = self._gen_test_code(run_code, "aot", repro_level, patch_code)
+        proc, repro_dir = self._run_test_code(test_code)
+        self.assertEqual(proc.returncode, 0)
+        self.assertIsNone(repro_dir)
+
+    def test_after_aot_cpu_compile_backend_passes(self):
+        self._test_after_aot_backend_passes("cpu", 2, CPP_COMPILE_ERROR)
+
+    def test_after_aot_cpu_runtime_backend_passes(self):
+        self._test_after_aot_backend_passes("cpu", 2, CPP_RUNTIME_ERROR)
+
+    def test_after_aot_cpu_accuracy_backend_passes(self):
+        self._test_after_aot_backend_passes("cpu", 4, CPP_ACCURACY_ERROR)
+
+    @requires_cuda()
+    def test_after_aot_cuda_compile_backend_passes(self):
+        self._test_after_aot_backend_passes("cuda", 2, TRITON_COMPILE_ERROR)
+
+    # NOTE: there is currently not an easy way to cause a triton runtime error.
+    @unittest.skip
+    @requires_cuda()
+    def test_after_aot_cuda_runtime_backend_passes(self):
+        self._test_after_aot_backend_passes("cuda", 2, TRITON_RUNTIME_ERROR)
+
+    @requires_cuda()
+    def test_after_aot_cuda_accuracy_backend_passes(self):
+        self._test_after_aot_backend_passes("cuda", 4, TRITON_ACCURACY_ERROR)
+
+
+if __name__ == "__main__":
+    from torch._dynamo.test_case import run_tests
+
+    # skip CI tests on mac since CPU inductor does not seem to work due to C++ compile errors
+    if not IS_MACOS:
+        run_tests()
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index f09991f9bf348..98a269fe8c9eb 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -84,6 +84,11 @@ def __init__(self):
 
         for module_name, module in gm.named_children():
             module_str = f"{module.__repr__()}"
+            # module should be a core torch.nn.Module, so all parameters
+            # should be on the same device.
+            example_param = next(module.parameters(), None)
+            if example_param is not None and example_param.is_cuda:
+                module_str = f"{module_str}.cuda()"
             model_str += f"{tab*2}self.{module_name} = {module_str}\n"
 
         for buffer_name, buffer in gm._buffers.items():
@@ -95,12 +100,16 @@ def __init__(self):
                 tensor_str = (
                     f"torch.randint(1, size={list(buffer.shape)}, dtype={buffer.dtype})"
                 )
+            if buffer.is_cuda:
+                tensor_str = f"{tensor_str}.cuda()"
             model_str += f"{tab*2}self.register_buffer('{buffer_name}', {tensor_str})\n"
 
         for param_name, param in gm._parameters.items():
             if param is None:
                 continue
             tensor_str = f"torch.nn.Parameter(torch.randn({list(param.shape)}, dtype={param.dtype}))"
+            if param.is_cuda:
+                tensor_str = f"{tensor_str}.cuda()"
             model_str += f"{tab*2}self.{param_name} = {tensor_str}\n"
 
         # TODO - Keep this code for now. But, I don't think we will need this.
@@ -145,6 +154,9 @@ def _cuda_system_info_comment():
     return model_str
 
 
+TEST_REPLACEABLE_COMMENT = "# REPLACEABLE COMMENT FOR TESTING PURPOSES"
+
+
 def generate_compiler_repro_string(gm, args):
     model_str = textwrap.dedent(
         f"""
@@ -155,6 +167,8 @@ def generate_compiler_repro_string(gm, args):
         from math import inf
         from torch.fx.experimental.proxy_tensor import make_fx
 
+        {TEST_REPLACEABLE_COMMENT}
+
         """
     )
     model_str += f"# torch version: {torch.version.__version__}\n"
@@ -170,7 +184,7 @@ def generate_compiler_repro_string(gm, args):
     model_str += (
         "args = [rand_strided(sh, st, dt, dev) for (sh, st, dt, dev) in args]\n"
     )
-    model_str += 'mod = make_fx(Repro().to(device="cuda"))(*args)\n'
+    model_str += "mod = make_fx(Repro())(*args)\n"
     return model_str
 
 
@@ -197,7 +211,8 @@ def dump_compiler_graph_state(gm, args, compiler_name):
     log.warning(f"Writing checkpoint with {len(gm.graph.nodes)} nodes to {file_name}")
     with open(file_name, "w") as fd:
         save_graph_repro(fd, gm, args, compiler_name)
-    repro_path = os.path.join(config.base_dir, "repro.py")
+    curdir = os.getcwd()
+    repro_path = os.path.join(curdir, "repro.py")
     try:
         shutil.copyfile(file_name, repro_path)
         log.warning(f"Copying repro file for convenience to {repro_path}")
@@ -216,7 +231,10 @@ def save_graph_repro(fd, gm, args, compiler_name):
             textwrap.dedent(
                 f"""
                 compiled = {COMPILER_REPRO_OPTIONS[compiler_name][1]}(mod, args)
-                assert same_two_models(mod, compiled, args, only_fwd=True), "Accuracy failed"
+                class AccuracyError(Exception):
+                    pass
+                if not same_two_models(mod, compiled, args, only_fwd=True):
+                    raise AccuracyError("Bad accuracy detected")
                 """
             )
         )
@@ -231,7 +249,7 @@ def save_graph_repro(fd, gm, args, compiler_name):
         )
 
 
-def isolate_fails(fx_g, args, compiler_name: str, env=None):
+def isolate_fails(fx_g, args, compiler_name: str, env=None, patch_code=None):
     if env is None:
         env = {}
     subdir = os.path.join(os.getcwd(), "isolate")
@@ -239,7 +257,10 @@ def isolate_fails(fx_g, args, compiler_name: str, env=None):
         os.makedirs(subdir, exist_ok=True)
     file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py")
     with open(file_name, "w") as fd:
-        fd.write(generate_compiler_repro_string(fx_g, args))
+        repro_code = generate_compiler_repro_string(fx_g, args)
+        if patch_code is not None:
+            repro_code = repro_code.replace(TEST_REPLACEABLE_COMMENT, patch_code)
+        fd.write(repro_code)
         fail_fn = COMPILER_REPRO_OPTIONS[compiler_name][2]
         fd.write(
             textwrap.dedent(
@@ -263,6 +284,7 @@ def isolate_fails(fx_g, args, compiler_name: str, env=None):
     stdout, stderr = TemporaryFile(), TemporaryFile()
     p = subprocess.Popen(
         ["python", file_name],
+        cwd=subdir,
         stdout=stdout,
         stderr=stderr,
         env=new_env,
@@ -329,6 +351,8 @@ def dump_to_minify(gm, args, compiler_name: str):
 
     contents = textwrap.dedent(
         f"""
+isolate_fails_code_str = None
+
 {generate_compiler_repro_string(gm, args)}
 
 from functools import partial
@@ -343,7 +367,7 @@ def dump_to_minify(gm, args, compiler_name: str):
 minifier(
     mod,
     args,
-    module_fails=partial(isolate_fails, env=env_variables, compiler_name="{compiler_name}"),
+    module_fails=partial(isolate_fails, env=env_variables, compiler_name="{compiler_name}", patch_code=isolate_fails_code_str),
     dump_state=partial(dump_compiler_graph_state, compiler_name="{compiler_name}"),
 )
         """
@@ -351,6 +375,10 @@ def dump_to_minify(gm, args, compiler_name: str):
     return helper_for_dump_minify(contents)
 
 
+class AccuracyError(Exception):
+    pass
+
+
 def wrap_compiler_debug(compiler_fn, compiler_name: str):
     """
     Minifier for Fx Graph modules after Aot Autograd has finished. We wrap both
@@ -410,7 +438,7 @@ def deferred_for_real_inputs(real_inputs):
                         copy_tensor_attrs,
                         f"{compiler_name}_accuracy",
                     )
-                    raise ValueError("Bad accuracy detected")
+                    raise AccuracyError("Bad accuracy detected")
                 else:
                     # Call the compiled function with real inputs
                     return inner_compiled_fn(real_inputs)
@@ -435,7 +463,8 @@ def deferred_for_real_inputs(real_inputs):
                             copy_tensor_attrs,
                             compiler_name,
                         )
-                    raise e
+                    log.error("CompilerError")
+                    raise
 
         if config.repro_after == "aot":
             compiled_fn = deferred_for_real_inputs
@@ -544,9 +573,14 @@ def generate_dynamo_fx_repro_string(
             f"""
 mod.eval()
 opt_mod.eval()
+
+class AccuracyError(Exception):
+    pass
+
 with torch.cuda.amp.autocast(enabled={torch.is_autocast_enabled()}):
     assert same_two_models(mod, mod, args), "Eager itself failed"
-    assert same_two_models(mod, opt_mod, args), "Dynamo failed"
+    if not same_two_models(mod, opt_mod, args):
+        raise AccuracyError("Dynamo failed")
     """
         )
 
@@ -561,12 +595,14 @@ def generate_dynamo_fx_repro_string(
 from {config.dynamo_import}.debug_utils import run_fwd_maybe_bwd
 from {config.dynamo_import}.debug_utils import same_two_models
 
+{TEST_REPLACEABLE_COMMENT}
+
 args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]}
 args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
 
 {model_str}
 
-mod = Repro().cuda()
+mod = Repro()
 opt_mod = {config.dynamo_import}.optimize("{compiler_name}")(mod)
 
 {run_code}
@@ -705,6 +741,21 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
     if config.repro_level == 4:
         minifier_backend = "dynamo_accuracy_minifier_backend"
 
+    custom_compiler_error = (
+        textwrap.dedent(
+            """\
+        raise RuntimeError(
+            'Compiler name is None - this likely means that a custom compiler '
+            'was called by torchdynamo. Please remove this error, import your '
+            'custom compiler function, and replace the compiler_name="None" '
+            'line below to compiler_name=<my_imported_custom_function>'
+        )
+        """
+        )
+        if compiler_name is None
+        else ""
+    )
+
     contents = textwrap.dedent(
         f"""
 import os
@@ -718,14 +769,17 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
 from {config.dynamo_import}.optimizations.backends import BACKENDS
 from {config.dynamo_import}.testing import rand_strided
 
+{TEST_REPLACEABLE_COMMENT}
+
 args = {[(tuple(a.shape), tuple(a.stride()), a.dtype, a.device.type, a.requires_grad) for a in args]}
 args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
 
 {model_str}
-mod = Repro().cuda()
+mod = Repro()
 
 # Setup debug minifier compiler
 compiler_fn = BACKENDS["{minifier_backend}"]
+{custom_compiler_error}
 dynamo_minifier_backend = functools.partial(
     compiler_fn,
     compiler_name="{compiler_name}",
@@ -769,7 +823,7 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                         example_inputs,
                         compiler_name,
                     )
-                    exc = ValueError("Bad accuracy detected.")
+                    exc = AccuracyError("Bad accuracy detected.")
                     exc.minifier_path = os.path.join(
                         minifier_dir(), "minifier_launcher.py"
                     )
diff --git a/torch/_dynamo/test_minifier_common.py b/torch/_dynamo/test_minifier_common.py
new file mode 100644
index 0000000000000..8fb0688f2c3ed
--- /dev/null
+++ b/torch/_dynamo/test_minifier_common.py
@@ -0,0 +1,131 @@
+import os
+import re
+import subprocess
+import tempfile
+import unittest
+
+import torch
+import torch._dynamo
+import torch._dynamo.test_case
+from torch._dynamo.debug_utils import TEST_REPLACEABLE_COMMENT
+
+
+class MinifierTestBase(torch._dynamo.test_case.TestCase):
+    _debug_dir_obj = tempfile.TemporaryDirectory()
+    DEBUG_DIR = _debug_dir_obj.name
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls._exit_stack.enter_context(
+            unittest.mock.patch.object(
+                torch._dynamo.config,
+                "debug_dir_root",
+                cls.DEBUG_DIR,
+            )
+        )
+        os.makedirs(cls.DEBUG_DIR, exist_ok=True)
+
+    @classmethod
+    def tearDownClass(cls):
+        cls._debug_dir_obj.cleanup()
+        cls._exit_stack.close()
+
+    def setUp(self):
+        super().setUp()
+
+    def tearDown(self):
+        super().tearDown()
+
+    # Search for the name of the first function defined in a code string.
+    def _get_fn_name(self, code):
+        fn_name_match = re.search(r"def (\w+)\(", code)
+        if fn_name_match is not None:
+            return fn_name_match.group(1)
+        return None
+
+    # Run `code` in a separate python process.
+    # Returns the completed process state and the directory containing the
+    # minifier launcher script, if `code` outputted it.
+    def _run_test_code(self, code):
+        proc = subprocess.run(
+            ["python3", "-c", code], capture_output=True, cwd=self.DEBUG_DIR
+        )
+
+        repro_dir_match = re.search(
+            r"(\S+)minifier_launcher.py", proc.stderr.decode("utf-8")
+        )
+        if repro_dir_match is not None:
+            # Print repro directory for debugging generated code.
+            # Make sure to comment out `shutil.rmtree...` above as well.
+            print("repro dir:", repro_dir_match.group(1))
+            return proc, repro_dir_match.group(1)
+        return proc, None
+
+    # Patch generated files with testing patches
+    def _inject_code(self, patch_code, filename):
+        patch_code = f"""\
+{patch_code}
+torch._dynamo.config.debug_dir_root = "{self.DEBUG_DIR}"
+"""
+        with open(filename, "r") as f:
+            code = f.read()
+        code = code.replace(TEST_REPLACEABLE_COMMENT, patch_code)
+        with open(filename, "w") as f:
+            f.write(code)
+        return code
+
+    # Runs the minifier launcher script in `repro_dir`, patched with `patch_code`.
+    def _run_minifier_launcher(self, patch_code, repro_dir):
+        self.assertIsNotNone(repro_dir)
+        launch_file = os.path.join(repro_dir, "minifier_launcher.py")
+        self.assertTrue(os.path.exists(launch_file))
+        launch_code = self._inject_code(patch_code, launch_file)
+
+        launch_proc = subprocess.run(
+            ["python3", launch_file],
+            capture_output=True,
+            cwd=repro_dir,
+        )
+
+        return launch_proc, launch_code
+
+    # Runs the repro script in `repro_dir`, patched with `patch_code`
+    def _run_repro(self, patch_code, repro_dir):
+        self.assertIsNotNone(repro_dir)
+        repro_file = os.path.join(repro_dir, "repro.py")
+        self.assertTrue(os.path.exists(repro_file))
+        repro_code = self._inject_code(patch_code, repro_file)
+
+        repro_proc = subprocess.run(
+            ["python3", repro_file], capture_output=True, cwd=repro_dir
+        )
+
+        return repro_proc, repro_code
+
+    # Template for testing code.
+    # `run_code` is the code to run for the test case.
+    # `patch_code` is the code to be patched in every generated file.
+    def _gen_test_code(self, run_code, repro_after, repro_level, patch_code):
+        return f"""\
+import torch
+import torch._dynamo
+{patch_code}
+torch._dynamo.config.repro_after = "{repro_after}"
+torch._dynamo.config.repro_level = {repro_level}
+torch._dynamo.config.debug_dir_root = "{self.DEBUG_DIR}"
+{run_code}
+"""
+
+    # Runs a full minifier test.
+    # Minifier tests generally consist of 3 stages:
+    # 1. Run the problematic code (in a separate process since it could segfault)
+    # 2. Run the generated minifier launcher script
+    # 3. Run the generated repro script
+    def _run_full_test(self, run_code, repro_after, repro_level, patch_code):
+        test_code = self._gen_test_code(run_code, repro_after, repro_level, patch_code)
+        test_proc, repro_dir = self._run_test_code(test_code)
+        self.assertIsNotNone(repro_dir)
+        launch_proc, launch_code = self._run_minifier_launcher(patch_code, repro_dir)
+        repro_proc, repro_code = self._run_repro(patch_code, repro_dir)
+        return ((test_proc, launch_proc, repro_proc), (launch_code, repro_code))

From 840f2cf2b7c9b454fff96137649a411bc3e1825e Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Thu, 17 Nov 2022 02:03:45 +0000
Subject: [PATCH 1001/1922] [dynamo][reland] API Support for nn.Module (#89113)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89113
Approved by: https://github.com/ezyang
---
 test/dynamo/test_modules.py  | 135 +++++++++++++++++++++++++++++++++++
 torch/_dynamo/__init__.py    |   2 +
 torch/_dynamo/debug_utils.py |   8 +++
 torch/_dynamo/eval_frame.py  |  79 ++++++++++++++------
 torch/_dynamo/testing.py     |  14 ++++
 5 files changed, 218 insertions(+), 20 deletions(-)

diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index 2fb83b3add6cf..ed3b715f72f9d 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -904,6 +904,141 @@ def forward(self, x):
         self.assertTrue(torch._dynamo.testing.same(real, graph(rx)))
 
 
+class MockModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.relu = torch.nn.ReLU()
+        self.linear = torch.nn.Linear(10, 10)
+        self.register_buffer("buf0", torch.randn(10, 10))
+
+    def forward(self, x):
+        return self.relu(self.linear(x) + self.buf0)
+
+
+class OptimizedModuleTest(torch._dynamo.test_case.TestCase):
+    def test_nn_module(self):
+        mod = MockModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt)(mod)
+        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
+
+        x = torch.randn(10, 10)
+        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
+        self.assertEqual(cnt.frame_count, 1)
+
+    def test_to(self):
+        mod = MockModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt)(mod)
+        x = torch.randn(10, 10)
+        self.assertTrue(torch._dynamo.testing.same(mod(x), opt_mod(x)))
+        self.assertEqual(cnt.frame_count, 1)
+
+        # Ensure that there is no recompilation
+        opt_mod(x)
+        self.assertEqual(cnt.frame_count, 1)
+
+        opt_mod = opt_mod.to(device="cpu").to(dtype=torch.float64)
+        self.assertIsInstance(opt_mod, torch._dynamo.OptimizedModule)
+        x = torch.randn(10, 10).to(dtype=torch.float64)
+        opt_mod(x)
+        # Ensure that there is a recompilation
+        self.assertEqual(cnt.frame_count, 2)
+
+        # Ensure that there is no recompilation
+        opt_mod(x)
+        self.assertEqual(cnt.frame_count, 2)
+
+        torch._dynamo.reset()
+        opt_mod(x)
+        self.assertEqual(cnt.frame_count, 3)
+
+    def test_attr(self):
+        class MockModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(10, 10)
+                self.register_buffer("buf0", torch.randn(10, 10))
+
+            def forward(self, x):
+                return self.r(torch.sin(x)) + self.buf0
+
+        mod = MockModule()
+        opt_mod = torch._dynamo.optimize("eager")(mod)
+
+        # Check parameteres and buffers
+        for (p1, p2) in zip(mod.parameters(), opt_mod.parameters()):
+            self.assertTrue(id(p1) == id(p2))
+
+    def test_recursion(self):
+        mod = MockModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_mod = torch._dynamo.optimize(cnt)(mod)
+
+        for _ in range(5):
+            opt_mod = torch._dynamo.optimize(cnt)(opt_mod)
+        opt_mod(torch.randn(10, 10))
+        self.assertEqual(cnt.frame_count, 1)
+
+    def test_composition(self):
+        class InnerModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(torch.sin(x))
+
+        opt_inner_mod = InnerModule()
+
+        class OuterModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = opt_inner_mod
+
+            def forward(self, x):
+                return self.mod(torch.cos(x))
+
+        outer_mod = OuterModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
+
+        x = torch.randn(4)
+        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
+        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
+        self.assertEqual(cnt.frame_count, 1)
+
+    def test_composition_with_opt_mod(self):
+        class InnerModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.relu = torch.nn.ReLU()
+
+            def forward(self, x):
+                return self.relu(torch.sin(x))
+
+        inner_mod = InnerModule()
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_inner_mod = torch._dynamo.optimize(cnt)(inner_mod)
+
+        class OuterModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mod = opt_inner_mod
+
+            def forward(self, x):
+                return self.mod(torch.cos(x))
+
+        outer_mod = OuterModule()
+        opt_outer_mod = torch._dynamo.optimize(cnt)(outer_mod)
+
+        x = torch.randn(4)
+        self.assertIsInstance(opt_outer_mod, torch._dynamo.OptimizedModule)
+        self.assertTrue(torch._dynamo.testing.same(outer_mod(x), opt_outer_mod(x)))
+        # There will be a graph break for the inner mod being OptimizedModule
+        self.assertEqual(cnt.frame_count, 2)
+
+
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py
index 80f927aeef2fa..5eee609b0852a 100644
--- a/torch/_dynamo/__init__.py
+++ b/torch/_dynamo/__init__.py
@@ -7,6 +7,7 @@
     export,
     optimize,
     optimize_assert,
+    OptimizedModule,
     reset_code,
     run,
     skip,
@@ -25,6 +26,7 @@
     "reset",
     "list_backends",
     "skip",
+    "OptimizedModule",
 ]
 
 
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 98a269fe8c9eb..29d830167b109 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -515,8 +515,16 @@ def same_two_models(gm, opt_gm, example_inputs, only_fwd=False):
     """
     Check two models have same accuracy.
     """
+    from .eval_frame import OptimizedModule
+    from .testing import named_parameters_for_optimized_module
     from .utils import same
 
+    if isinstance(gm, OptimizedModule):
+        gm.named_parameters = named_parameters_for_optimized_module(gm)
+
+    if isinstance(opt_gm, OptimizedModule):
+        opt_gm.named_parameters = named_parameters_for_optimized_module(opt_gm)
+
     ref = run_fwd_maybe_bwd(gm, example_inputs, only_fwd)
 
     try:
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index cb3cffaa73d16..1188bfd74fc25 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -5,6 +5,7 @@
 import logging
 import os
 import sys
+import textwrap
 import threading
 import traceback
 import types
@@ -44,6 +45,27 @@
 most_recent_backend = None
 
 
+class OptimizedModule(torch.nn.Module):
+    """
+    Wraps the original nn.Module object and later patches its
+    forward method to optimized self.forward method.
+    """
+
+    def __init__(self, mod, dynamo_ctx):
+        super().__init__()
+        # Installs the params/buffer
+        self._orig_mod = mod
+        self.dynamo_ctx = dynamo_ctx
+
+    def __getattr__(self, name):
+        if name == "_orig_mod":
+            return self._modules["_orig_mod"]
+        return getattr(self._orig_mod, name)
+
+    def forward(self, *args, **kwargs):
+        return self.dynamo_ctx(self._orig_mod.forward)(*args, **kwargs)
+
+
 def remove_from_cache(f):
     """
     Make sure f.__code__ is not cached to force a recompile
@@ -118,31 +140,14 @@ def __call__(self, fn):
         # Optimize the forward method of torch.nn.Module object
         if isinstance(fn, torch.nn.Module):
             mod = fn
-            optimized_forward = self(mod.forward)
-
-            class TorchDynamoNNModuleWrapper:
-                """
-                A wrapper that redirects the forward call to the optimized
-                forward, while for rest it redirects the calls to the original
-                module.
-                """
-
-                def __getattr__(self, name):
-                    return getattr(mod, name)
-
-                def forward(self, *args, **kwargs):
-                    return optimized_forward(*args, **kwargs)
-
-                def __call__(self, *args, **kwargs):
-                    return self.forward(*args, **kwargs)
-
-            new_mod = TorchDynamoNNModuleWrapper()
+            new_mod = OptimizedModule(mod, self)
             # Save the function pointer to find the original callable while nesting
             # of decorators.
-            new_mod._torchdynamo_orig_callable = mod
+            new_mod._torchdynamo_orig_callable = mod.forward
             return new_mod
 
         assert callable(fn)
+
         callback = self.callback
         on_enter = self.on_enter
         backend_ctx_ctor = self.extra_ctx_ctor
@@ -184,6 +189,40 @@ def _fn(*args, **kwargs):
         # If the function is called using torch._dynamo.optimize decorator, we
         # should prevent any type of skipping.
         if callback not in (None, False):
+            if not hasattr(fn, "__code__"):
+                raise RuntimeError(
+                    textwrap.dedent(
+                        """
+
+                        torch._dynamo.optimize is called on a non function object.
+                        If this is a callable class, please wrap the relevant code into a function and optimize the
+                        wrapper function.
+
+                        >> class CallableClass:
+                        >>     def __init__(self):
+                        >>         super().__init__()
+                        >>         self.relu = torch.nn.ReLU()
+                        >>
+                        >>     def __call__(self, x):
+                        >>         return self.relu(torch.sin(x))
+                        >>
+                        >>     def print_hello(self):
+                        >>         print("Hello world")
+                        >>
+                        >> mod = CallableClass()
+
+                        If you want to optimize the __call__ function and other code, wrap that up in a function
+
+                        >> def wrapper_fn(x):
+                        >>     y = mod(x)
+                        >>     return y.sum()
+
+                        and then optimize the wrapper_fn
+
+                        >> opt_wrapper_fn = torch._dynamo.optimize(wrapper_fn)
+                        """
+                    )
+                )
             always_optimize_code_objects[fn.__code__] = True
 
         return _fn
diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py
index d6082ce48acf8..6e0d32d21f978 100644
--- a/torch/_dynamo/testing.py
+++ b/torch/_dynamo/testing.py
@@ -32,6 +32,18 @@ def clone_me(x):
     return x.detach().clone().requires_grad_(x.requires_grad)
 
 
+def named_parameters_for_optimized_module(mod):
+    assert isinstance(mod, eval_frame.OptimizedModule)
+    return mod._orig_mod.named_parameters
+
+
+def remove_optimized_module_prefix(name):
+    prefix = "_orig_mod."
+    assert name.startswith(prefix)
+    name = name[len(prefix) :]
+    return torch.distributed.fsdp._common_utils.clean_tensor_name(name)
+
+
 def collect_results(model, prediction, loss, example_inputs):
     results = []
     results.append(prediction)
@@ -44,6 +56,8 @@ def collect_results(model, prediction, loss, example_inputs):
     grads = dict()
     params = dict()
     for name, param in model.named_parameters():
+        if isinstance(model, eval_frame.OptimizedModule):
+            name = remove_optimized_module_prefix(name)
         param_copy = param
         grad = param.grad
         # Treat None and zero grad as same

From 57d77171b4fed974d05b465129b4bcc4df69b93a Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Wed, 16 Nov 2022 22:28:36 +0000
Subject: [PATCH 1002/1922] [dtensor] disable op db tests for now (#89162)

context: https://github.com/pytorch/pytorch/issues/89160
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89162
Approved by: https://github.com/fduwjj
---
 test/run_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/run_test.py b/test/run_test.py
index 6bf98a01a44d1..94bee60cc24ed 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -113,6 +113,7 @@ def skip_test_p(name: str) -> bool:
         "distributed/launcher/bin/test_script_is_torchelastic_launched",
         "distributed/launcher/bin/test_script_local_rank",
         "distributed/test_c10d_spawn",
+        "distributed/_tensor/test_dtensor_ops",
         'distributions/test_transforms',
         'distributions/test_utils',
     ],

From 19f3dd7ea71fd9b3e4d7c2d27b47c4c0ccd77291 Mon Sep 17 00:00:00 2001
From: Riley Dulin <dulinr@meta.com>
Date: Thu, 17 Nov 2022 02:43:33 +0000
Subject: [PATCH 1003/1922] [torch][fx] Fix PassManager to not use a class
 variable mutable list (#89108)

Summary:
I found a confusing bug in the PassManager that only happens
when you instantiate one multiple times: it will use old passes and
constraints!

This occurs because the class-level declarations initialize it to an empty list,
but the problem is that class initializers only run once, and are creating class
variables. This means the same empty list was being reused every time, except
after the first time it isn't empty.

The empty list has to be created in `__init__` newly each time or else it'll be shared.
Note that this is the same type of bug as using an empty list as a default parameter, where
it'll reuse the same list pointer and not make it empty each time.

The better way to do this is with either:
* An immutable default parameter like an empty tuple, that you create a new list from: `self.passes = list(passes)`
* Use None and then create the empty list inside `__init__`

I chose the latter as it's less likely to cause a behavior change due to the changed default.

Note that for immutable values like `False` and `1` this doesn't apply as you can't mutate that
value for everyone.

Test Plan:
Added a test to ensure that the pass state is not saved.
Without my change, this test would fail as it would run all of the `2 * x` passes first,
then all of the `3 * x` passes.

Differential Revision: D41327056

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89108
Approved by: https://github.com/angelayi
---
 torch/fx/passes/infra/pass_manager.py      | 10 ++++------
 torch/fx/passes/pass_manager.py            | 10 ++++------
 torch/fx/passes/tests/test_pass_manager.py | 22 ++++++++++++++++++++++
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/torch/fx/passes/infra/pass_manager.py b/torch/fx/passes/infra/pass_manager.py
index 265c6263da540..e649acfb28f54 100644
--- a/torch/fx/passes/infra/pass_manager.py
+++ b/torch/fx/passes/infra/pass_manager.py
@@ -165,8 +165,8 @@ class PassManager:
             checks
     """
 
-    passes: List[Callable[[nn.Module], PassResult]] = []
-    constraints: List[Callable[[Callable, Callable], bool]] = []
+    passes: List[Callable[[nn.Module], PassResult]]
+    constraints: List[Callable[[Callable, Callable], bool]]
     _validated: bool = False
     steps: int = 1
 
@@ -178,10 +178,8 @@ def __init__(
         run_checks_after_each_pass: bool = False,
         suppress_check_failures: bool = False,
     ):
-        if passes:
-            self.passes = passes
-        if constraints:
-            self.constraints = constraints
+        self.passes = passes or []
+        self.constraints = constraints or []
         if steps:
             self.steps = steps
 
diff --git a/torch/fx/passes/pass_manager.py b/torch/fx/passes/pass_manager.py
index 5a34c5bca3621..cf002b3611bfc 100644
--- a/torch/fx/passes/pass_manager.py
+++ b/torch/fx/passes/pass_manager.py
@@ -184,8 +184,8 @@ class PassManager:
             `this_before_that_pass_constraint` for example.
     """
 
-    passes: List[Callable] = []
-    constraints: List[Callable] = []
+    passes: List[Callable]
+    constraints: List[Callable]
     _validated: bool = False
 
     def __init__(
@@ -193,10 +193,8 @@ def __init__(
         passes=None,
         constraints=None,
     ):
-        if passes:
-            self.passes = passes
-        if constraints:
-            self.constraints = constraints
+        self.passes = passes or []
+        self.constraints = constraints or []
 
     @classmethod
     def build_from_passlist(cls, passes):
diff --git a/torch/fx/passes/tests/test_pass_manager.py b/torch/fx/passes/tests/test_pass_manager.py
index 4ed0cfce89de8..60ed6671179b2 100644
--- a/torch/fx/passes/tests/test_pass_manager.py
+++ b/torch/fx/passes/tests/test_pass_manager.py
@@ -34,3 +34,25 @@ def test_these_before_those_pass_constraint(self) -> None:
         pm.add_constraint(constraint)
 
         self.assertRaises(RuntimeError, pm.validate)
+
+    def test_two_pass_managers(self) -> None:
+        """Make sure we can construct the PassManager twice and not share any
+        state between them"""
+
+        passes = [lambda x: 2 * x for _ in range(3)]
+        constraint = these_before_those_pass_constraint(passes[0], passes[1])
+        pm1 = PassManager()
+        for p in passes:
+            pm1.add_pass(p)
+        pm1.add_constraint(constraint)
+        output1 = pm1(1)
+        self.assertEqual(output1, 2 ** 3)
+
+        passes = [lambda x: 3 * x for _ in range(3)]
+        constraint = these_before_those_pass_constraint(passes[0], passes[1])
+        pm2 = PassManager()
+        for p in passes:
+            pm2.add_pass(p)
+        pm2.add_constraint(constraint)
+        output2 = pm2(1)
+        self.assertEqual(output2, 3 ** 3)

From 3efeb66046f2e67db2e8cd9072b6dab551f34199 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@fb.com>
Date: Tue, 15 Nov 2022 23:33:05 +0000
Subject: [PATCH 1004/1922] [WIP] Composable API: `replicate` and
 `DistributedState` (#87649)

This PR adds the first version of the `replicate()` composable API. For this prototype version, I try to reuse as much code from existing `DistributedDataParallel` as possible, and iterate on it in later changes. The basic idea of this prototype is:
- create a `ReplicateState` object. It internally uses a `ParameterList` module to hold all parameters of modules marked by `replicate()` API.
- create an internal `_ddp` object, which reuses existing `DistributedDataParallel` implementation, and wraps the `ParameterList` object
- install pre-forward and after-forward hooks on the root module, which calls methods of `_ddp` to run initialization and forward

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87649
Approved by: https://github.com/zhaojuanmao
---
 .../distributed/_composable/test_replicate.py | 100 ++++++++++++++++
 torch/distributed/_composable/__init__.py     |   1 +
 torch/distributed/_composable/_ddp.py         |  20 +++-
 torch/distributed/_composable/contract.py     |  11 +-
 torch/distributed/_composable/replicate.py    | 107 ++++++++++++++++++
 5 files changed, 232 insertions(+), 7 deletions(-)
 create mode 100644 test/distributed/_composable/test_replicate.py
 create mode 100644 torch/distributed/_composable/replicate.py

diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
new file mode 100644
index 0000000000000..831ccc3376aff
--- /dev/null
+++ b/test/distributed/_composable/test_replicate.py
@@ -0,0 +1,100 @@
+# Owner(s): ["oncall: distributed"]
+
+import os
+from copy import deepcopy
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch import nn
+from torch.distributed._composable.replicate import mark_root_module, replicate
+from torch.testing._internal.common_distributed import MultiProcessTestCase
+from torch.testing._internal.common_utils import run_tests
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.fc1 = nn.Linear(2, 10, bias=False)
+        self.fc2 = nn.Linear(10, 50, bias=False)
+        self.fc3 = nn.Linear(50, 4, bias=False)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x = self.relu(self.fc1(x))
+        x = self.relu(self.fc2(x))
+        x = self.fc3(x)
+        return F.softmax(x, dim=1)
+
+
+class ReplicateTest(MultiProcessTestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        self._spawn_processes()
+
+    def tearDown(self):
+        super().tearDown()
+        try:
+            os.remove(self.file_name)
+        except OSError:
+            pass
+
+    def _prepare_module(self, global_batch_size):
+        model = Net()
+        input = torch.randn(global_batch_size, 2)
+        target = torch.randn(global_batch_size, 4)
+        return model, input, target
+
+    def test_replicate(self):
+        dist.init_process_group(
+            backend="gloo",
+            rank=self.rank,
+            world_size=self.world_size,
+            store=dist.FileStore(self.file_name, self.world_size),
+        )
+
+        local_batch_size = 1
+        global_batch_size = self.world_size * local_batch_size
+        model, input, target = self._prepare_module(global_batch_size)
+        replicate_model = mark_root_module(replicate(deepcopy(model)))
+
+        def step_model(model, input, target):
+            model.train()
+            output = model(input)
+            loss = F.mse_loss(output, target.to(output.device))
+            loss.backward()
+            for param in model.parameters():
+                with torch.no_grad():
+                    param -= param.grad
+                param.grad = None
+
+        for iteration in range(2):
+            step_model(model, input, target)
+            step_model(
+                replicate_model,
+                input[
+                    self.rank
+                    * local_batch_size : (self.rank + 1)
+                    * local_batch_size
+                ],
+                target[
+                    self.rank
+                    * local_batch_size : (self.rank + 1)
+                    * local_batch_size
+                ],
+            )
+
+            self.assertEqual(
+                len(list(model.parameters())),
+                len(list(replicate_model.parameters())),
+            )
+            for i, j in zip(model.parameters(), replicate_model.parameters()):
+                self.assertEqual(i, j, rtol=1.3e-06, atol=5e-5)
+
+            # Shuffle the input so that DDP input is different
+            torch.manual_seed(iteration)
+            input = input[torch.randperm(global_batch_size)]
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/_composable/__init__.py b/torch/distributed/_composable/__init__.py
index 5b0d8e77e5ccf..2952426a09fdb 100644
--- a/torch/distributed/_composable/__init__.py
+++ b/torch/distributed/_composable/__init__.py
@@ -1,3 +1,4 @@
 from .checkpoint_activation import checkpoint
 from .contract import contract
 from .fully_shard import fully_shard
+from .replicate import replicate
diff --git a/torch/distributed/_composable/_ddp.py b/torch/distributed/_composable/_ddp.py
index 76a4aa70c4224..9e94ec3d53cde 100644
--- a/torch/distributed/_composable/_ddp.py
+++ b/torch/distributed/_composable/_ddp.py
@@ -1058,9 +1058,9 @@ def _run_ddp_forward(self, *inputs, **kwargs):
             with self._inside_ddp_forward():
                 return module_to_run(*inputs, **kwargs)
 
-    def forward(self, *inputs, **kwargs):
+    def pre_forward(self):
         with torch.autograd.profiler.record_function(
-            "DistributedDataParallel.forward"
+            "DistributedDataParallel.pre_forward"
         ):
             if torch.is_grad_enabled() and self.require_backward_grad_sync:
                 assert self.logger is not None
@@ -1090,7 +1090,6 @@ def forward(self, *inputs, **kwargs):
 
             # sync params according to location (before/after forward) user
             # specified as part of hook, if hook was specified.
-            buffer_hook_registered = hasattr(self, "buffer_hook")
             if self._check_sync_bufs_pre_fwd():
                 self._sync_buffers()
 
@@ -1100,8 +1099,10 @@ def forward(self, *inputs, **kwargs):
                     is_joined_rank=False
                 )
 
-            output = self._run_ddp_forward(*inputs, **kwargs)
-
+    def post_forward(self, output):
+        with torch.autograd.profiler.record_function(
+            "DistributedDataParallel.post_forward"
+        ):
             # sync params according to location (before/after forward) user
             # specified as part of hook, if hook was specified.
             if self._check_sync_bufs_post_fwd():
@@ -1166,6 +1167,15 @@ def forward(self, *inputs, **kwargs):
             )
         return output
 
+    def forward(self, *inputs, **kwargs):
+        self.pre_forward(*inputs, **kwargs)
+        with torch.autograd.profiler.record_function(
+            "DistributedDataParallel.forward"
+        ):
+            output = self._run_ddp_forward(*inputs, **kwargs)
+        output = self.post_forward(output)
+        return output
+
     def scatter(self, inputs, kwargs, device_ids):
         return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
 
diff --git a/torch/distributed/_composable/contract.py b/torch/distributed/_composable/contract.py
index b75604872a592..fca817bcfc1d6 100644
--- a/torch/distributed/_composable/contract.py
+++ b/torch/distributed/_composable/contract.py
@@ -121,7 +121,9 @@ def check_fqn(orig_fqns: List[str], new_fqns: List[str]):
                     f"New FQNs: {new_only}"
                 )
 
-        check_fqn(list(orig_named_params.keys()), list(new_named_params.keys()))
+        check_fqn(
+            list(orig_named_params.keys()), list(new_named_params.keys())
+        )
         check_fqn(
             list(orig_named_buffers.keys()), list(new_named_buffers.keys())
         )
@@ -138,7 +140,12 @@ def check_fqn(orig_fqns: List[str], new_fqns: List[str]):
         return updated
 
     def get_state(module: nn.Module) -> Optional[_State]:
-        return module.__dict__.get(STATE_KEY).get(func)  # type: ignore[call-overload]
+        return module.__dict__.setdefault(  # type: ignore[call-overload]
+            STATE_KEY,
+            {},  # TODO(@yhcharles): this is a temporary fix, need a better way
+        ).get(
+            func
+        )  # type: ignore[call-overload]
 
     wrapper.state = get_state  # type: ignore[attr-defined]
 
diff --git a/torch/distributed/_composable/replicate.py b/torch/distributed/_composable/replicate.py
new file mode 100644
index 0000000000000..0e94427afee88
--- /dev/null
+++ b/torch/distributed/_composable/replicate.py
@@ -0,0 +1,107 @@
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+
+from . import _ddp
+from .contract import contract
+
+
+class DistributedState:
+    ...
+
+
+class ReplicateState(DistributedState):
+    def __init__(self) -> None:
+        self.modules: List[nn.Module] = []
+        self.has_initialized: bool = False
+        self._param_list: nn.ParameterList = nn.ParameterList()
+
+    def mark_modules(self, *modules: nn.Module) -> None:
+        for module in modules:
+            self.modules.append(module)
+            replicate.state(module)._distributed_state = self
+            replicate.state(module)._params_collected = False
+
+    def _recursive_collect_params(self, module: nn.Module) -> None:
+        # TODO: skip if managed by other APIs
+
+        if hasattr(replicate.state(module), "_params_collected"):
+            if replicate.state(module)._params_collected:
+                return
+            replicate.state(module)._params_collected = True
+
+        self._param_list.extend(
+            param
+            for param in module.parameters(recurse=False)
+            # for param in module.parameters()
+            if param.requires_grad
+        )
+        for child in module.children():
+            self._recursive_collect_params(child)
+
+    def init_helper(self):
+        if self.has_initialized:
+            return
+
+        self.has_initialized = True
+        for module in self.modules:
+            self._recursive_collect_params(module)
+
+        self._ddp = _ddp.DistributedDataParallel(self._param_list)
+
+    def root_module_forward_pre_hook(
+        self, module: nn.Module, input: Tuple[torch.Tensor]
+    ) -> None:
+        self.init_helper()
+        self._ddp.pre_forward()
+
+    def root_module_forward_post_hook(
+        self,
+        module: nn.Module,
+        input: Tuple[torch.Tensor],
+        output: torch.Tensor,
+    ) -> torch.Tensor:
+        return self._ddp.post_forward(output)
+
+
+# TODO(@yhcharles): use a per-model instance instead of a global one
+_default_state = ReplicateState()
+
+
+@contract
+def replicate(
+    module: nn.Module,  # NOTE: contract now supports single module only
+    dist_state: ReplicateState = _default_state,
+) -> nn.Module:
+    r"""Replicates module(s)
+
+    Args:
+        modules (torch.nn.Module): modules to replicate
+
+    Example::
+        >>> module = nn.Linear(3, 3)
+        >>> replicate(module)
+    """
+    dist_state.mark_modules(module)
+    return module
+
+
+def mark_root_module(
+    module: nn.Module, dist_state: ReplicateState = _default_state
+) -> nn.Module:
+    r"""Mark the root module. Its sub-modules can be replicated.
+
+    Args:
+        modules (torch.nn.Module): root module
+
+    Example::
+        >>> module = nn.Linear(3, 3)
+        >>> replicate(module)
+    """
+    module.register_forward_pre_hook(dist_state.root_module_forward_pre_hook)
+    # TODO(@yhcharles): fix type error
+    module.register_forward_hook(
+        dist_state.root_module_forward_post_hook  # type: ignore[arg-type]
+    )
+    return module

From a086e4ebb32e64e93a0c25ad56b85d3a194584d7 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Wed, 16 Nov 2022 19:50:02 +0000
Subject: [PATCH 1005/1922] [ONNX] Add Internal Utils: onnx_proto_utils.py for
 onnx/onnx-script/onnx_proto (#88376)

Added `onnx_proto_utils.py` for onnx/onnx-script related process. The idea is like jit_utils.py, and to simplify what we have in `torch/onnx/utils.py`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88376
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 torch/onnx/_internal/onnx_proto_utils.py | 143 ++++++++++++++++++++++
 torch/onnx/utils.py                      | 148 ++---------------------
 2 files changed, 152 insertions(+), 139 deletions(-)
 create mode 100644 torch/onnx/_internal/onnx_proto_utils.py

diff --git a/torch/onnx/_internal/onnx_proto_utils.py b/torch/onnx/_internal/onnx_proto_utils.py
new file mode 100644
index 0000000000000..f557089707b88
--- /dev/null
+++ b/torch/onnx/_internal/onnx_proto_utils.py
@@ -0,0 +1,143 @@
+"""Utilities for manipulating the onnx and onnx-script dependencies and ONNX proto."""
+
+import io
+import os
+import zipfile
+from typing import List, Mapping, Set, Union
+
+import torch
+import torch.jit._trace
+import torch.serialization
+from torch.onnx import _constants, _exporter_states, errors
+from torch.onnx._internal import _beartype, jit_utils, registration
+
+
+@_beartype.beartype
+def _export_file(
+    model_bytes: bytes,
+    f: Union[io.BytesIO, str],
+    export_type: str,
+    export_map: Mapping[str, bytes],
+) -> None:
+    """export/write model bytes into directory/protobuf/zip"""
+    # TODO(titaiwang) MYPY asks for os.PathLike[str] type for parameter: f,
+    # but beartype raises beartype.roar.BeartypeDecorHintNonpepException,
+    # as os.PathLike[str] uncheckable at runtime
+    if export_type == _exporter_states.ExportTypes.PROTOBUF_FILE:
+        assert len(export_map) == 0
+        with torch.serialization._open_file_like(f, "wb") as opened_file:
+            opened_file.write(model_bytes)
+    elif export_type in {
+        _exporter_states.ExportTypes.ZIP_ARCHIVE,
+        _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE,
+    }:
+        compression = (
+            zipfile.ZIP_DEFLATED
+            if export_type == _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE
+            else zipfile.ZIP_STORED
+        )
+        with zipfile.ZipFile(f, "w", compression=compression) as z:
+            z.writestr(_constants.ONNX_ARCHIVE_MODEL_PROTO_NAME, model_bytes)
+            for k, v in export_map.items():
+                z.writestr(k, v)
+    elif export_type == _exporter_states.ExportTypes.DIRECTORY:
+        if isinstance(f, io.BytesIO) or not os.path.isdir(f):  # type: ignore[arg-type]
+            raise ValueError(
+                f"f should be directory when export_type is set to DIRECTORY, instead get type(f): {type(f)}"
+            )
+        if not os.path.exists(f):  # type: ignore[arg-type]
+            os.makedirs(f)  # type: ignore[arg-type]
+
+        model_proto_file = os.path.join(f, _constants.ONNX_ARCHIVE_MODEL_PROTO_NAME)  # type: ignore[arg-type]
+        with torch.serialization._open_file_like(model_proto_file, "wb") as opened_file:
+            opened_file.write(model_bytes)
+
+        for k, v in export_map.items():
+            weight_proto_file = os.path.join(f, k)  # type: ignore[arg-type]
+            with torch.serialization._open_file_like(
+                weight_proto_file, "wb"
+            ) as opened_file:
+                opened_file.write(v)
+    else:
+        raise ValueError("Unknown export type")
+
+
+@_beartype.beartype
+def _add_onnxscript_fn(
+    model_bytes: bytes,
+    custom_opsets: Mapping[str, int],
+) -> bytes:
+    """Insert model-included custom onnx-script function into ModelProto"""
+    # TODO(titaiwang): remove this when onnx becomes dependency
+    try:
+        import onnx
+    except ImportError:
+        raise errors.OnnxExporterError("Module onnx is not installed!")
+
+    # For > 2GB model, onnx.load_fromstring would fail. However, because
+    # in _export_onnx, the tensors should be saved separately if the proto
+    # size > 2GB, and if it for some reason did not, the model would fail on
+    # serialization anyway in terms of the protobuf limitation. So we don't
+    # need to worry about > 2GB model getting here.
+    model_proto = onnx.load_from_string(model_bytes)
+
+    # Iterate graph nodes to insert only the included custom
+    # function_proto into model_proto
+    # TODO(titaiwang): Currently, onnxscript doesn't support ONNXFunction
+    # calling other ONNXFunction scenario, neither does it here
+    onnx_function_list = list()  # type: ignore[var-annotated]
+    included_node_func = set()  # type: Set[str]
+    # onnx_function_list and included_node_func are expanded in-place
+    _find_onnxscript_op(
+        model_proto.graph, included_node_func, custom_opsets, onnx_function_list
+    )
+
+    if onnx_function_list:
+        model_proto.functions.extend(onnx_function_list)
+        model_bytes = model_proto.SerializeToString()
+    return model_bytes
+
+
+@_beartype.beartype
+def _find_onnxscript_op(
+    graph_proto,
+    included_node_func: Set[str],
+    custom_opsets: Mapping[str, int],
+    onnx_function_list: List,
+):
+    """Recursively iterate ModelProto to find ONNXFunction op as it may contain control flow Op."""
+    for node in graph_proto.node:
+        node_kind = node.domain + "::" + node.op_type
+        # Recursive needed for control flow nodes: IF/Loop which has inner graph_proto
+        for attr in node.attribute:
+            if attr.g is not None:
+                _find_onnxscript_op(
+                    attr.g, included_node_func, custom_opsets, onnx_function_list
+                )
+        # Only custom Op with ONNX function and aten with symbolic_fn should be found in registry
+        onnx_function_group = registration.registry.get_function_group(node_kind)
+        # Ruled out corner cases: onnx/prim in registry
+        if (
+            node.domain
+            and not jit_utils.is_aten(node.domain)
+            and not jit_utils.is_prim(node.domain)
+            and not jit_utils.is_onnx(node.domain)
+            and onnx_function_group is not None
+            and node_kind not in included_node_func
+        ):
+            specified_version = custom_opsets.get(node.domain, 1)
+            onnx_fn = onnx_function_group.get(specified_version)
+            if onnx_fn is not None:
+                # TODO(titaiwang): to_function_proto is onnx-script API and can be annotated
+                # after onnx-script is dependency
+                onnx_function_list.append(onnx_fn.to_function_proto())  # type: ignore[attr-defined]
+                included_node_func.add(node_kind)
+                continue
+            raise errors.UnsupportedOperatorError(
+                node_kind,
+                specified_version,
+                onnx_function_group.get_min_supported()
+                if onnx_function_group
+                else None,
+            )
+    return onnx_function_list, included_node_func
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index 9d6ec0b325232..67dd719bae9f6 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -9,12 +9,10 @@
 import copy
 import inspect
 import io
-import os
 import re
 import textwrap
 import typing
 import warnings
-import zipfile
 from typing import (
     Any,
     Callable,
@@ -38,15 +36,19 @@
 from torch import _C
 from torch.onnx import (  # noqa: F401
     _constants,
-    _deprecation,
     _exporter_states,
-    _patch_torch,
     errors,
     symbolic_caffe2,
     symbolic_helper,
 )
 from torch.onnx._globals import GLOBALS
-from torch.onnx._internal import _beartype, diagnostics, jit_utils, registration
+from torch.onnx._internal import (
+    _beartype,
+    diagnostics,
+    jit_utils,
+    onnx_proto_utils,
+    registration,
+)
 
 __all__ = [
     "is_in_onnx_export",
@@ -1598,13 +1600,13 @@ def _export(
                     node_attr_to_name,
                 )
             # insert function_proto into model_proto.
-            proto = _add_onnxscript_fn(
+            proto = onnx_proto_utils._add_onnxscript_fn(
                 proto,
                 custom_opsets,
             )
             if verbose:
                 torch.onnx.log("Exported graph: ", graph)
-            _export_file(proto, f, export_type, export_map)
+            onnx_proto_utils._export_file(proto, f, export_type, export_map)
             # The ONNX checker only works for ONNX graph. So if the operator_export_type is not ONNX,
             # we can skip this check.
             # If large model format export is enabled, proto will only contain data location instead of
@@ -1625,138 +1627,6 @@ def _export(
     return torch_out
 
 
-@_beartype.beartype
-def _export_file(
-    model_bytes: bytes,
-    f: Union[io.BytesIO, str],
-    export_type: str,
-    export_map: Mapping[str, bytes],
-) -> None:
-    """export/write model bytes into directory/protobuf/zip"""
-    # TODO(titaiwang) MYPY asks for os.PathLike[str] type for parameter: f,
-    # but beartype raises beartype.roar.BeartypeDecorHintNonpepException,
-    # as os.PathLike[str] uncheckable at runtime
-    if export_type == _exporter_states.ExportTypes.PROTOBUF_FILE:
-        assert len(export_map) == 0
-        with torch.serialization._open_file_like(f, "wb") as opened_file:
-            opened_file.write(model_bytes)
-    elif export_type in [
-        _exporter_states.ExportTypes.ZIP_ARCHIVE,
-        _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE,
-    ]:
-        compression = (
-            zipfile.ZIP_DEFLATED
-            if export_type == _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE
-            else zipfile.ZIP_STORED
-        )
-        with zipfile.ZipFile(f, "w", compression=compression) as z:
-            z.writestr(_constants.ONNX_ARCHIVE_MODEL_PROTO_NAME, model_bytes)
-            for k, v in export_map.items():
-                z.writestr(k, v)
-    elif export_type == _exporter_states.ExportTypes.DIRECTORY:
-        if isinstance(f, io.BytesIO) or not os.path.isdir(f):  # type: ignore[arg-type]
-            raise ValueError(
-                f"f should be directory when export_type is set to DIRECTORY, instead get type(f): {type(f)}"
-            )
-        if not os.path.exists(f):  # type: ignore[arg-type]
-            os.makedirs(f)  # type: ignore[arg-type]
-
-        model_proto_file = os.path.join(f, _constants.ONNX_ARCHIVE_MODEL_PROTO_NAME)  # type: ignore[arg-type]
-        with torch.serialization._open_file_like(model_proto_file, "wb") as opened_file:
-            opened_file.write(model_bytes)
-
-        for k, v in export_map.items():
-            weight_proto_file = os.path.join(f, k)  # type: ignore[arg-type]
-            with torch.serialization._open_file_like(
-                weight_proto_file, "wb"
-            ) as opened_file:
-                opened_file.write(v)
-    else:
-        raise RuntimeError("Unknown export type")
-
-
-@_beartype.beartype
-def _add_onnxscript_fn(
-    model_bytes: bytes,
-    custom_opsets: Mapping[str, int],
-) -> bytes:
-    """Insert model-included custom onnx-script function into ModelProto"""
-
-    # TODO(titaiwang): remove this when onnx becomes dependency
-    try:
-        import onnx
-    except ImportError:
-        raise errors.OnnxExporterError("Module onnx is not installed!")
-
-    # For > 2GB model, onnx.load_fromstring would fail. However, because
-    # in _export_onnx, the tensors should be saved separately if the proto
-    # size > 2GB, and if it for some reason did not, the model would fail on
-    # serialization anyway in terms of the protobuf limitation. So we don't
-    # need to worry about > 2GB model getting here.
-    model_proto = onnx.load_from_string(model_bytes)
-
-    # Iterate graph nodes to insert only the included custom
-    # function_proto into model_proto
-    # TODO(titaiwang): Currently, onnxscript doesn't support ONNXFunction
-    # calling other ONNXFunction scenario, neither does it here
-    onnx_function_list = list()  # type: ignore[var-annotated]
-    included_node_func = set()  # type: Set[str]
-    # onnx_function_list and included_node_func are expanded in-place
-    _find_onnxscript_op(
-        model_proto.graph, included_node_func, custom_opsets, onnx_function_list
-    )
-
-    if onnx_function_list:
-        model_proto.functions.extend(onnx_function_list)
-        model_bytes = model_proto.SerializeToString()
-    return model_bytes
-
-
-@_beartype.beartype
-def _find_onnxscript_op(
-    graph_proto,
-    included_node_func: Set[str],
-    custom_opsets: Mapping[str, int],
-    onnx_function_list: List,
-):
-    """Recursively iterate ModelProto to find ONNXFunction op as it may contain control flow Op."""
-    for node in graph_proto.node:
-        node_kind = node.domain + "::" + node.op_type
-        # Recursive is needed for control flow nodes: IF/Loop which has inner graph_proto
-        for attr in node.attribute:
-            if attr.g is not None:
-                _find_onnxscript_op(
-                    attr.g, included_node_func, custom_opsets, onnx_function_list
-                )
-        # Only custom Op with ONNX function and aten with symbolic_fn should be found in registry
-        onnx_function_group = registration.registry.get_function_group(node_kind)
-        # Ruled out corner cases: onnx/prim in registry
-        if (
-            node.domain
-            and not jit_utils.is_aten(node.domain)
-            and not jit_utils.is_prim(node.domain)
-            and not jit_utils.is_onnx(node.domain)
-            and onnx_function_group is not None
-            and node_kind not in included_node_func
-        ):
-            specified_version = custom_opsets.get(node.domain, 1)
-            onnx_fn = onnx_function_group.get(specified_version)
-            if onnx_fn is not None:
-                # TODO(titaiwang): to_function_proto is onnx-script API and can be annotated
-                # after onnx-script is dependency
-                onnx_function_list.append(onnx_fn.to_function_proto())  # type: ignore[attr-defined]
-                included_node_func.add(node_kind)
-                continue
-            raise errors.UnsupportedOperatorError(
-                node_kind,
-                specified_version,
-                onnx_function_group.get_min_supported()
-                if onnx_function_group
-                else None,
-            )
-    return onnx_function_list, included_node_func
-
-
 @_beartype.beartype
 def _apply_friendly_debug_names(graph, params):
     for n in graph.nodes():

From bc4a448a74815d7d8ac73357614ef3c02042b4c0 Mon Sep 17 00:00:00 2001
From: "Wang, Eikan" <eikan.wang@intel.com>
Date: Wed, 16 Nov 2022 23:58:11 +0000
Subject: [PATCH 1006/1922] Redefine the simdlen semantic: (#88482)

This PR is targeting to automatically enable vectorization optimization for TorchInductor. It refined the semantics of `config.cpp.simdlen`.

Originally, `None` means to disable vectorization while a specific value means the number of elements to be vectorized once time. But it depends on the data. Regarding 256bit SVE/SIMD ISA for ARM and X86, the `simdlen` should be 16 for Float while 32 for BFloat. Hence, this PR defined the `simdlen` as the bit width. The detailed semantics are as follows.

- **_simdlen = None_**: Automatically determine the SIMD bit width. Detect HW information and pick the proper vectorization ISA. Specific for X86, the priority of AVX512 is higher than AVX2.
- **_simdlen <=1_**: Explicitly disable SIMD
- **_simdlen > 1_**: Explicitly specify the SIMD bit width. It equals the disabled semantic if the bit width does not match the ISA width.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88482
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py |  94 +++++++++++-
 torch/_inductor/codecache.py        | 215 +++++++++++++++++++++-------
 torch/_inductor/codegen/common.py   |   6 +
 torch/_inductor/codegen/cpp.py      |  92 +++++++++---
 4 files changed, 327 insertions(+), 80 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index fb7ca1fc92b73..f9aa93f4a7e63 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4529,7 +4529,11 @@ def fn(x):
 
             v = torch.randn(10)
             result = fn(v)
-            assert same(result, mod(v))
+            # TODO: OMP parallel reduction order is not deterministic.
+            # Hence, the accurarcy might vary up and down. For short term,
+            # we increase the tolerance and will fix it later by using
+            # aten parallel.
+            assert same(result, mod(v), tol=5e-1)
 
         def test_inplace_add_alpha(self):
             def fn(x, y):
@@ -4599,7 +4603,79 @@ def test_complex_memory_overlap(self):
             self.assertFalse(complex_memory_overlap(gathered.t()))
 
         @unittest.skipIf(
-            not codecache.get_cpu_proc_info(), "Does not support vectorization"
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch.object(config, "dynamic_shapes", True)
+        @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+        @patch.object(functorch_config, "use_dynamic_shapes", True)
+        def test_vec_dynamic_shapes(self):
+            def fn(x):
+                return torch.softmax(x, -1)
+
+            value = torch.randn((2, 10))
+            with patch.object(config.cpp, "simdlen", None):
+                torch._dynamo.reset()
+                metrics.reset()
+                opt_fn = torch._dynamo.optimize("inductor")(fn)
+                opt_fn(value)
+
+                real_out = fn(value)
+                compiled_out = opt_fn(value)
+                assert same(real_out, compiled_out, equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count < 1
+
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_auto_simd(self):
+            vec_avx512 = codecache.supported_vec_isa_list[0]
+            vec_avx2 = codecache.supported_vec_isa_list[1]
+            self.assertTrue(vec_avx512.bit_width() == 512)
+            self.assertTrue(vec_avx2.bit_width() == 256)
+            self.assertTrue(vec_avx512.nelements() == 16)
+            self.assertTrue(vec_avx2.nelements() == 8)
+            self.assertTrue(vec_avx512.nelements(torch.bfloat16) == 32)
+            self.assertTrue(vec_avx2.nelements(torch.bfloat16) == 16)
+
+            with patch.object(config.cpp, "simdlen", None):
+                isa = codecache.pick_vec_isa()
+                if vec_avx512 in codecache.valid_vec_isa_list():
+                    self.assertTrue(isa == vec_avx512)
+                else:
+                    self.assertTrue(isa == vec_avx2)
+
+            with patch.object(config.cpp, "simdlen", 0):
+                isa = codecache.pick_vec_isa()
+                self.assertFalse(isa)
+
+            with patch.object(config.cpp, "simdlen", 1):
+                isa = codecache.pick_vec_isa()
+                self.assertFalse(isa)
+
+            with patch.object(config.cpp, "simdlen", 257):
+                isa = codecache.pick_vec_isa()
+                self.assertFalse(isa)
+
+            with patch.object(config.cpp, "simdlen", 513):
+                isa_list = codecache.valid_vec_isa_list()
+                if vec_avx512 in isa_list:
+                    self.assertFalse(isa)
+
+            with patch.object(config.cpp, "simdlen", 512):
+                isa_list = codecache.valid_vec_isa_list()
+                if vec_avx512 in isa_list:
+                    isa = codecache.pick_vec_isa()
+                    self.assertTrue(isa == vec_avx512)
+
+            with patch.object(config.cpp, "simdlen", 256):
+                isa_list = codecache.valid_vec_isa_list()
+                if vec_avx2 in isa_list:
+                    isa = codecache.pick_vec_isa()
+                    self.assertTrue(isa == vec_avx2)
+
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
         @patch("torch.cuda.is_available", lambda: False)
         def test_sign_cpu_only(self):
@@ -4610,7 +4686,7 @@ def fn(x):
             x[0, 0] = torch.nan
             x[1, -1] = torch.nan
 
-            with patch.object(config.cpp, "simdlen", 8):
+            with patch.object(config.cpp, "simdlen", None):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x)
@@ -4623,7 +4699,7 @@ def fn(x):
         # other platforms support, we just need to add the ISA info to the supported_vector_isa
         # and include proper aten vectorization head file.
         @unittest.skipIf(
-            not codecache.get_cpu_proc_info(), "Does not support vectorization"
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
         @patch("torch.cuda.is_available", lambda: False)
         def test_vec_kernel_cpu_only(self):
@@ -4662,7 +4738,15 @@ def fn(x1, x2):
             x1 = torch.randn((10, 20))
             x2 = torch.randn((10, 20))
 
-            with patch.object(config.cpp, "simdlen", 8):
+            with patch.object(config.cpp, "simdlen", 1):
+                torch._dynamo.reset()
+                metrics.reset()
+                traced = make_fx(fn)(x1, x2)
+                compiled = compile_fx_inner(traced, [x1, x2])
+                assert same(fn(x1, x2)[0], compiled([x1, x2])[0], equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count == 0
+
+            with patch.object(config.cpp, "simdlen", None):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x1, x2)
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 2826f35999126..232a611b06c6a 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -1,5 +1,5 @@
 import base64
-import enum
+import dataclasses
 import functools
 import getpass
 import hashlib
@@ -18,7 +18,7 @@
 from ctypes import cdll
 from threading import Thread
 from time import sleep, time
-from typing import Any, Dict
+from typing import Any, Callable, Dict, List
 
 import torch
 from torch.utils import cpp_extension
@@ -147,79 +147,181 @@ def is_gcc():
     return re.search(r"(gcc|g\+\+)", cpp_compiler())
 
 
-class _SupportedVecIsa(enum.Enum):
-    AVX512 = 1
-    AVX2 = 2
-    INVALID = -1
+class VecISA(object):
+    _bit_width: int
+    _macro: str
+    _arch_flags: str
+    _dtype_nelements: Dict[torch.dtype, int]
+
+    # TorchInductor CPU vectorization reuses PyTorch vectorization utility functions
+    # Hence, TorchInductor would depend on Sleef* to accelerate mathematical functions
+    # like exp, pow, sin, cos and etc.
+    # But PyTorch and TorchInductor might use different compilers to build code. If
+    # PyTorch uses gcc-7/g++-7 to build the release package, the libtorch_cpu.so
+    # will not expose the Sleef* AVX512 symbols since gcc-7/g++-7 cannot pass
+    # avx512 check in CMake - FindAVX.cmake. But TorchInductor install the latest
+    # gcc/g++ compiler by default while it could support the AVX512 compilation.
+    # Therefore, there would be a conflict sleef version between PyTorch and
+    # TorchInductor. Hence, we dry-compile the following code to check whether current
+    # HW platform and PyTorch both could support AVX512 or AVX2. And suppose ARM
+    # also needs the logic
+    _avx_code = """
+#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/vec.h>
+#endif
+
+__attribute__((aligned(64))) float in_out_ptr0[16] = {0.0};
+
+extern "C" void __avx_chk_kernel() {
+    auto tmp0 = at::vec::Vectorized<float>(1);
+    auto tmp1 = tmp0.exp();
+    tmp1.store(in_out_ptr0);
+}
+"""
+
+    _avx_py_load = """
+import torch
+from ctypes import cdll
+cdll.LoadLibrary("__lib_path__")
+"""
+
+    def bit_width(self):
+        return self._bit_width
+
+    def nelements(self, dtype: torch.dtype = torch.float):
+        return self._dtype_nelements[dtype]
 
+    def build_macro(self):
+        return self._macro
+
+    def build_arch_flags(self):
+        return self._arch_flags
+
+    def __hash__(self) -> int:
+        return hash(str(self))
+
+    @functools.lru_cache(None)
     def __bool__(self):
-        return self != _SupportedVecIsa.INVALID
+        key, input_path = write(VecISA._avx_code, "cpp", extra="")
+        from filelock import FileLock
+
+        lock_dir = get_lock_dir()
+        lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+        with lock:
+            output_path = input_path[:-3] + "so"
+            build_cmd = cpp_compile_command(
+                input_path, output_path, warning_all=False, vec_isa=self
+            ).split(" ")
+            try:
+                # Check build result
+                subprocess.check_output(build_cmd, stderr=subprocess.STDOUT)
+                subprocess.check_call(
+                    [
+                        "python",
+                        "-c",
+                        VecISA._avx_py_load.replace("__lib_path__", output_path),
+                    ],
+                    stderr=subprocess.DEVNULL,
+                )
+            except Exception as e:
+                return False
 
-    @staticmethod
-    def isa_str(supported_isa: enum.Enum):
-        if supported_isa == _SupportedVecIsa.AVX512:
-            return "avx512"
-        elif supported_isa == _SupportedVecIsa.AVX2:
-            return "avx2"
-        else:
-            return ""
+            return True
+
+
+@dataclasses.dataclass
+class VecAVX512(VecISA):
+    _bit_width = 512
+    _macro = "CPU_CAPABILITY_AVX512"
+    _arch_flags = "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma"
+    _dtype_nelements = {torch.float: 16, torch.bfloat16: 32}
+
+    def __str__(self) -> str:
+        return "avx512"
+
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
 
-    @staticmethod
-    def vec_macro(supported_isa: enum.Enum):
-        if supported_isa == _SupportedVecIsa.AVX512:
-            return "CPU_CAPABILITY_AVX512"
-        elif supported_isa == _SupportedVecIsa.AVX2:
-            return "CPU_CAPABILITY_AVX2"
-        else:
-            return ""
+
+@dataclasses.dataclass
+class VecAVX2(VecISA):
+    _bit_width = 256
+    _macro = "CPU_CAPABILITY_AVX2"
+    _arch_flags = "-mavx2 -mfma"
+    _dtype_nelements = {torch.float: 8, torch.bfloat16: 16}
+
+    def __str__(self) -> str:
+        return "avx2"
+
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+
+
+class InvalidVecISA(VecISA):
+    _bit_width = 0
+    _macro = ""
+    _arch_flags = ""
+    _dtype_nelements = {}
+
+    def __str__(self) -> str:
+        return "INVALID_VEC_ISA"
+
+    def __bool__(self):
+        return False
+
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+
+
+invalid_vec_isa = InvalidVecISA()
+supported_vec_isa_list = [VecAVX512(), VecAVX2()]
 
 
 # Cache the cpuinfo to avoid I/O overhead. Meanwhile, the cpuinfo content
 # might have too much redundant content that is useless for ISA check. Hence,
 # we only cache some key isa information.
-@functools.lru_cache(1)
-def get_cpu_proc_info():
+@functools.lru_cache(None)
+def valid_vec_isa_list():
     if sys.platform != "linux":
         return []
 
-    isa_info = []
+    isa_list = []
     with open("/proc/cpuinfo") as _cpu_info:
         _cpu_info_content = _cpu_info.read()
-        if _SupportedVecIsa.isa_str(_SupportedVecIsa.AVX512) in _cpu_info_content:
-            isa_info.append(_SupportedVecIsa.AVX512)
-
-        if _SupportedVecIsa.isa_str(_SupportedVecIsa.AVX2) in _cpu_info_content:
-            isa_info.append(_SupportedVecIsa.AVX2)
+        for isa in supported_vec_isa_list:
+            if str(isa) in _cpu_info_content and isa:
+                isa_list.append(isa)
+        return isa_list
 
-        return isa_info
 
+def pick_vec_isa():
+    _valid_vec_isa_list: List[VecISA] = valid_vec_isa_list()
+    if not _valid_vec_isa_list:
+        return invalid_vec_isa
 
-def supported_vector_isa():
-    # TODO: Add ARM Vec here.
-    # Dict(k: isa, v: number of float element)
-    vec_isa_info = {
-        _SupportedVecIsa.AVX512: 16,
-        _SupportedVecIsa.AVX2: 8,
-    }
+    # If the simdlen is None, it indicates determin the vectroization length automatically
+    if config.cpp.simdlen is None:
+        assert _valid_vec_isa_list
+        return _valid_vec_isa_list[0]
 
-    if config.cpp.simdlen is None or config.cpp.simdlen <= 1:
-        return _SupportedVecIsa.INVALID
-
-    cpu_info_content = get_cpu_proc_info()
-    for isa in vec_isa_info.keys():
-        if isa in cpu_info_content and config.cpp.simdlen == vec_isa_info[isa]:
+    for isa in _valid_vec_isa_list:
+        if config.cpp.simdlen == isa.bit_width():
             return isa
 
-    return _SupportedVecIsa.INVALID
+    return invalid_vec_isa
 
 
-def cpp_compile_command(input, output, include_pytorch=False):
-    valid_isa = supported_vector_isa()
-    if include_pytorch or valid_isa:
+def cpp_compile_command(
+    input,
+    output,
+    warning_all=True,
+    shared=True,
+    include_pytorch=False,
+    vec_isa: VecISA = invalid_vec_isa,
+):
+    if include_pytorch or vec_isa != invalid_vec_isa:
         ipaths = cpp_extension.include_paths() + [sysconfig.get_path("include")]
         lpaths = cpp_extension.library_paths() + [sysconfig.get_config_var("LIBDIR")]
         libs = ["c10", "torch", "torch_cpu", "torch_python", "gomp"]
-        macros = _SupportedVecIsa.vec_macro(valid_isa)
+        macros = vec_isa.build_macro()
         if macros:
             macros = f"-D{macros}"
     else:
@@ -235,11 +337,13 @@ def cpp_compile_command(input, output, include_pytorch=False):
     lpaths = " ".join(["-L" + p for p in lpaths])
     libs = " ".join(["-l" + p for p in libs])
 
+    shared_lib = "-shared -fPIC" if shared else ""
+    warning_all_flag = "-Wall" if warning_all else ""
     return re.sub(
         r"[ \n]+",
         " ",
         f"""
-            {cpp_compiler()} {input} -shared -fPIC -Wall -std=c++14 -Wno-unused-variable
+            {cpp_compiler()} {input} {shared_lib} {warning_all_flag} -std=c++14 -Wno-unused-variable
             {ipaths} {lpaths} {libs} {macros}
             -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp
             -D C10_USING_CUSTOM_GENERATED_MACROS
@@ -266,7 +370,12 @@ def _load_library(path):
 
     @classmethod
     def load(cls, source_code):
-        key, input_path = write(source_code, "cpp", extra=cpp_compile_command("i", "o"))
+        picked_vec_isa = pick_vec_isa()
+        key, input_path = write(
+            source_code,
+            "cpp",
+            extra=cpp_compile_command("i", "o", vec_isa=picked_vec_isa),
+        )
         if key not in cls.cache:
             from filelock import FileLock
 
@@ -276,7 +385,7 @@ def load(cls, source_code):
                 output_path = input_path[:-3] + "so"
                 if not os.path.exists(output_path):
                     cmd = cpp_compile_command(
-                        input=input_path, output=output_path
+                        input=input_path, output=output_path, vec_isa=picked_vec_isa
                     ).split(" ")
                     try:
                         subprocess.check_output(cmd, stderr=subprocess.STDOUT)
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index 2803970295ccc..cf98833964ca5 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -417,6 +417,12 @@ def __init__(self, name):
     def __str__(self):
         return self.name
 
+    def __hash__(self) -> int:
+        return hash(self.name)
+
+    def __eq__(self, other) -> bool:
+        return type(other) == type(self) and other.name == self.name
+
     def update_on_args(self, args, kwargs):
         pass
 
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 65a9335d6cbfc..38ef2179d5b71 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -616,7 +616,7 @@ def codegen_loops(self, code, worksharing):
         )
         reductions.mark_reduction(self.reduction_vars)
 
-        if config.cpp.simdlen:
+        if codecache.pick_vec_isa():
             # TODO(jansel): detect stride-1 dimension and vectorize that
             if reductions:
                 reductions.loops[-1].simd = True
@@ -707,7 +707,8 @@ class CppVecKernel(CppKernel):
 
     def __init__(self, args, num_threads):
         super(CppVecKernel, self).__init__(args, num_threads)
-        self.simd_len = config.cpp.simdlen
+        assert codecache.pick_vec_isa()
+        self.simd_nelements = codecache.pick_vec_isa().nelements()
         self.reduction_omp_dec: Dict[str, str] = {}
         metrics.generated_cpp_vec_kernel_count += 1
 
@@ -723,10 +724,10 @@ def is_var_irrevelant(self, var: sympy.Symbol, index: sympy.Expr):
 
     def transform_index(self, index: sympy.Expr):
         expanded_index = sympy.expand(index)
-        assert self.simd_len
-        assert self.simd_len > 0
+        assert self.simd_nelements
+        assert self.simd_nelements >= 1
         most_inner_var = self.itervars[-1]
-        replacement = {most_inner_var: most_inner_var * self.simd_len}
+        replacement = {most_inner_var: most_inner_var * self.simd_nelements}
         new_index = sympy_subs(expanded_index, replacement)
         return new_index
 
@@ -947,21 +948,24 @@ def __init__(self, args=None, num_threads=None):
         super(CppKernelProxy, self).__init__(args, num_threads)
         self.simd_vec_kernel: CppVecKernel = None
         self.simd_omp_kernel: CppKernel = None
+        self.picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa()
 
-    def vectorize_most_inner_loop(self, loop_nest):
-        loop_nest.split_most_inner_loop(config.cpp.simdlen)
+    def vectorize_most_inner_loop(self, loop_nest, dtype=torch.float):
+        assert self.picked_vec_isa
+        nelements = self.picked_vec_isa.nelements(dtype)
+        loop_nest.split_most_inner_loop(nelements)
         loop_with_tail = loop_nest.loops[-1]
         assert isinstance(loop_with_tail, LoopLevelWithTail)
 
         loop_with_tail.main_loop.simd_vec = True
 
         loop_with_tail.tail_loop.simd_omp = True
-        # We chope the loop into two cubes by the config.cpp.simdlen - main loop and tail loop.
+        # We chope the loop into two cubes by the nelements - main loop and tail loop.
         # Regarding the main loop, it is straightforward that it could be vectorized with
-        # config.cpp.simdlen. But for the tail loop, it still could be vectorized. For example,
-        # if the config.cpp.simdlen is 8(256bits), then the tail loop still could be vectorized
+        # nelements. But for the tail loop, it still could be vectorized. For example,
+        # if the nelements is 8(256bits), then the tail loop still could be vectorized
         # as 4(128bits).
-        loop_with_tail.tail_loop.simd_len = int(config.cpp.simdlen / 2)
+        loop_with_tail.tail_loop.simd_nelements = int(nelements / 2)
         loop_with_tail.tail_loop.simd_vec = False
 
         loop_with_tail.main_loop_body = self.simd_vec_kernel
@@ -971,7 +975,7 @@ def vectorize_most_inner_loop(self, loop_nest):
     def codegen_loops(self, code, worksharing):
         threads = parallel_num_threads()
 
-        if self.simd_vec_kernel is None:
+        if self.simd_vec_kernel is None or not self.picked_vec_isa:
             assert self.simd_omp_kernel
             return self.simd_omp_kernel.codegen_loops(code, worksharing)
 
@@ -993,12 +997,52 @@ def codegen_loops(self, code, worksharing):
         ), LoopNest(loops[reduction_depth:])
         loops_nest_reduce.mark_reduction(self.simd_vec_kernel.reduction_vars)
 
-        if config.cpp.simdlen:
-            # TODO(jansel): detect stride-1 dimension and vectorize that
-            if loops_nest_reduce:
-                loops_nest_reduce.loops[-1].simd = True
-            elif loops_nest_non_reduce:
-                loops_nest_non_reduce.loops[-1].simd = True
+        assert self.picked_vec_isa
+        # Do not apply vectorization since the range of most inner is too small. Meanwhile,
+        # If the range of the most inner is less then the codecache.pick_vec_isa().nelements(),
+        # the generated code for some reduction will be as follows that leads to incrrect result.
+        #
+        #    LINE01:  float tmp1 = 0;
+        #    LINE02:  auto tmp1_vec = at::vec::Vectorized<float>(tmp1);
+        #    LINE03:  for(long i1=0; i1<2; i1+=1)
+        #    LINE04:  {
+        #    LINE05:      for(long i2=0; i2<0; i2+=1)
+        #    LINE06:      {
+        #    LINE07:          auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (8*i0) + (16*i2) + (32*i1));
+        #    LINE08:          tmp1_vec += tmp0;
+        #    LINE09:      }
+        #    LINE10:      tmp1 = vec_reduce_all<float>([](Vectorized<float>& x, Vectorized<float>&y) {return x + y;}, tmp1_vec);
+        #    LINE11:      #pragma omp simd simdlen(8)  reduction(+:tmp1)
+        #    LINE12:      for(long i2=0; i2<8; i2+=1)
+        #    LINE13:      {
+        #    LINE14:          auto tmp0 = in_ptr0[i2 + (8*i0) + (32*i1)];
+        #    LINE15:          tmp1 += tmp0;
+        #    LINE16:      }
+        #    LINE17:  }
+        #    LINE18:  out_ptr3[i0] = tmp1;
+        #
+        # tmp1_vec(LINE02) will always be zero as it is initialized with tmp1 value and the range(LINE05)
+        # is 0. Hence, the LINE10 will always reset tmp1 to 0. But tmp1(LINE01) is global value. So the result
+        # will be incorrect. We skip thie case.
+        most_inner_loop = (
+            loops_nest_reduce.loops[-1]
+            if loops_nest_reduce
+            else loops_nest_non_reduce.loops[-1]
+        )
+        main_loop_range = ir.IndexingDiv(
+            most_inner_loop.size, self.picked_vec_isa.nelements()
+        )
+        loop_interval = sympy.simplify(main_loop_range)
+        # TODO(Eikan): To support dynamic shape.
+        if not loop_interval.is_integer or loop_interval <= 0:
+            metrics.generated_cpp_vec_kernel_count -= 1
+            return self.simd_omp_kernel.codegen_loops(code, worksharing)
+
+        # TODO(jansel): detect stride-1 dimension and vectorize that
+        if loops_nest_reduce:
+            loops_nest_reduce.loops[-1].simd = True
+        elif loops_nest_non_reduce:
+            loops_nest_non_reduce.loops[-1].simd = True
 
         par_depth = 0
         reduction_par_depth = 0
@@ -1138,8 +1182,7 @@ def can_fuse_vertical(cls, node1, node2):
         return cls.can_fuse_horizontal(node1, node2) and not node1.is_reduction()
 
     def can_vec(self, nodes):
-        # TODO: Query cpu arch and vec length from aten
-        if not codecache.supported_vector_isa():
+        if not codecache.pick_vec_isa():
             return False
 
         _, (group, reduction_group) = max(
@@ -1349,7 +1392,8 @@ class LoopLevel:
     steps: sympy.Expr = sympy.Integer(1)
     parallel: int = 0
     simd_omp: bool = False
-    simd_len: int = config.cpp.simdlen
+    picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa()
+    simd_nelements: int = picked_vec_isa.nelements() if picked_vec_isa else 0
     simd_vec: bool = False
     collapsed: bool = False
     reduction_vars: Dict[str, str] = None
@@ -1363,7 +1407,11 @@ def lines(self):
             )
         else:
             reduction = ""
-        simd = f"simd simdlen({self.simd_len}) " if self.simd_omp else ""
+        simd = (
+            f"simd simdlen({self.simd_nelements}) "
+            if self.simd_omp and self.simd_nelements > 1
+            else ""
+        )
         if self.parallel:
             # TODO(jansel): look into chunk size and other schedules
             line1 = f"#pragma omp for{reduction} "

From 9eca85d204b52781d7cb3b352609e799f1eaec95 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Thu, 17 Nov 2022 03:36:56 +0000
Subject: [PATCH 1007/1922] Analyze and upload disabled tests rerun to S3
 (#89083)

Analyze and upload disabled tests rerun to S3. Note that this only picks up `test-reports` from `rerun_disable_tests` workflows.

### Testing

Running the script manually `python -m tools.stats.check_disabled_tests --workflow-run-id 3473068035 --workflow-run-attempt 1 --repo pytorch/pytorch` and see the files successfully uploaded to s3://ossci-raw-job-status/rerun_disabled_tests/3473068035/1

Rockset collection created https://console.rockset.com/collections/details/commons.rerun_disabled_tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89083
Approved by: https://github.com/clee2000
---
 .github/workflows/upload-test-stats.yml |  11 +
 tools/stats/check_disabled_tests.py     | 290 ++++++++++++++++++++++++
 2 files changed, 301 insertions(+)
 create mode 100644 tools/stats/check_disabled_tests.py

diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml
index 27289983e2707..3f3db80670d8c 100644
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@@ -72,6 +72,17 @@ jobs:
           # anything on GitHub to upload. The command should return right away
           python3 -m tools.stats.upload_artifacts --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"
 
+      - name: Analyze disabled tests rerun
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          WORKFLOW_ARTIFACTS_URL: ${{ github.event.workflow_run.artifacts_url }}
+          WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
+          WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
+          REPO_FULLNAME: ${{ github.event.workflow_run.repository.full_name }}
+        run: |
+          # Analyze the results from disable tests rerun and upload them to S3
+          python3 -m tools.stats.check_disabled_tests --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"
+
   check-api-rate:
     if: ${{ always() }}
     runs-on: [self-hosted, linux.2xlarge]
diff --git a/tools/stats/check_disabled_tests.py b/tools/stats/check_disabled_tests.py
new file mode 100644
index 0000000000000..75c4f236ef216
--- /dev/null
+++ b/tools/stats/check_disabled_tests.py
@@ -0,0 +1,290 @@
+import argparse
+import json
+import os
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Any, Dict, Generator, Tuple
+
+from tools.stats.upload_stats_lib import (
+    download_gha_artifacts,
+    download_s3_artifacts,
+    unzip,
+    upload_to_s3,
+)
+from tools.stats.upload_test_stats import process_xml_element
+
+TESTCASE_TAG = "testcase"
+TARGET_WORKFLOW = "--rerun-disabled-tests"
+SEPARATOR = ";"
+
+
+def is_rerun_disabled_tests(root: ET.ElementTree) -> bool:
+    """
+    Check if the test report is coming from rerun_disabled_tests workflow
+    """
+    skipped = root.find(".//*skipped")
+    # Need to check against None here, if not skipped doesn't work as expected
+    if skipped is None:
+        return False
+
+    message = skipped.attrib.get("message", "")
+    return TARGET_WORKFLOW in message or "num_red" in message
+
+
+def process_report(
+    report: Path,
+) -> Dict[str, Dict[str, int]]:
+    """
+    Return a list of disabled tests that should be re-enabled and those that are still
+    flaky (failed or skipped)
+    """
+    root = ET.parse(report)
+
+    # All rerun tests from a report are grouped here:
+    #
+    # * Success test should be re-enable if it's green after rerunning in all platforms
+    #   where it is currently disabled
+    # * Failures from pytest because pytest-flakefinder is used to run the same test
+    #   multiple times, some could fails
+    # * Skipped tests from unittest
+    #
+    # We want to keep track of how many times the test fails (num_red) or passes (num_green)
+    all_tests: Dict[str, Dict[str, int]] = {}
+
+    if not is_rerun_disabled_tests(root):
+        return all_tests
+
+    for test_case in root.iter(TESTCASE_TAG):
+        parsed_test_case = process_xml_element(test_case)
+
+        # Under --rerun-disabled-tests mode, a test is skipped when:
+        # * it's skipped explicitly inside PyToch code
+        # * it's skipped because it's a normal enabled test
+        # * or it's falky (num_red > 0 and num_green > 0)
+        # * or it's failing (num_red > 0 and num_green == 0)
+        #
+        # We care only about the latter two here
+        skipped = parsed_test_case.get("skipped", None)
+        if skipped and "num_red" not in skipped.get("message", ""):
+            continue
+
+        name = parsed_test_case.get("name", "")
+        classname = parsed_test_case.get("classname", "")
+        filename = parsed_test_case.get("file", "")
+
+        if not name or not classname or not filename:
+            continue
+
+        # Check if the test is a failure
+        failure = parsed_test_case.get("failure", None)
+
+        disabled_test_id = SEPARATOR.join([name, classname, filename])
+        if disabled_test_id not in all_tests:
+            all_tests[disabled_test_id] = {
+                "num_green": 0,
+                "num_red": 0,
+            }
+
+        # Under --rerun-disabled-tests mode, if a test is not skipped or failed, it's
+        # counted as a success. Otherwise, it's still flaky or failing
+        if skipped:
+            try:
+                stats = json.loads(skipped.get("message", ""))
+            except json.JSONDecodeError:
+                stats = {}
+
+            all_tests[disabled_test_id]["num_green"] += stats.get("num_green", 0)
+            all_tests[disabled_test_id]["num_red"] += stats.get("num_red", 0)
+        elif failure:
+            # As a failure, increase the failure count
+            all_tests[disabled_test_id]["num_red"] += 1
+        else:
+            all_tests[disabled_test_id]["num_green"] += 1
+
+    return all_tests
+
+
+def get_test_reports(
+    repo: str, workflow_run_id: int, workflow_run_attempt: int
+) -> Generator[Path, None, None]:
+    """
+    Gather all the test reports from S3 and GHA. It is currently not possible to guess which
+    test reports are from rerun_disabled_tests workflow because the name doesn't include the
+    test config. So, all reports will need to be downloaded and examined
+    """
+    with TemporaryDirectory() as temp_dir:
+        print("Using temporary directory:", temp_dir)
+        os.chdir(temp_dir)
+
+        artifact_paths = download_s3_artifacts(
+            "test-reports", workflow_run_id, workflow_run_attempt
+        )
+        for path in artifact_paths:
+            unzip(path)
+
+        artifact_paths = download_gha_artifacts(
+            "test-report", workflow_run_id, workflow_run_attempt
+        )
+        for path in artifact_paths:
+            unzip(path)
+
+        for report in Path(".").glob("**/*.xml"):
+            yield report
+
+
+def get_disabled_test_name(test_id: str) -> Tuple[str, str, str, str]:
+    """
+    Follow flaky bot convention here, if that changes, this will also need to be updated
+    """
+    name, classname, filename = test_id.split(SEPARATOR)
+    return f"{name} (__main__.{classname})", name, classname, filename
+
+
+def prepare_record(
+    workflow_id: int,
+    workflow_run_attempt: int,
+    name: str,
+    classname: str,
+    filename: str,
+    flaky: bool,
+    num_red: int = 0,
+    num_green: int = 0,
+) -> Tuple[Any, Dict[str, Any]]:
+    """
+    Prepare the record to save onto S3
+    """
+    key = (
+        workflow_id,
+        workflow_run_attempt,
+        name,
+        classname,
+        filename,
+    )
+
+    record = {
+        "workflow_id": workflow_id,
+        "workflow_run_attempt": workflow_run_attempt,
+        "name": name,
+        "classname": classname,
+        "filename": filename,
+        "flaky": flaky,
+        "num_green": num_green,
+        "num_red": num_red,
+    }
+
+    return key, record
+
+
+def save_results(
+    workflow_id: int,
+    workflow_run_attempt: int,
+    all_tests: Dict[str, Dict[str, int]],
+) -> None:
+    """
+    Save the result to S3, so it can go to Rockset
+    """
+    should_be_enabled_tests = {
+        name: stats
+        for name, stats in all_tests.items()
+        if "num_green" in stats
+        and stats["num_green"]
+        and "num_red" in stats
+        and stats["num_red"] == 0
+    }
+    still_flaky_tests = {
+        name: stats
+        for name, stats in all_tests.items()
+        if name not in should_be_enabled_tests
+    }
+
+    records = {}
+    for test_id, stats in all_tests.items():
+        num_green = stats.get("num_green", 0)
+        num_red = stats.get("num_red", 0)
+        disabled_test_name, name, classname, filename = get_disabled_test_name(test_id)
+
+        key, record = prepare_record(
+            workflow_id=workflow_id,
+            workflow_run_attempt=workflow_run_attempt,
+            name=name,
+            classname=classname,
+            filename=filename,
+            flaky=test_id in still_flaky_tests,
+            num_green=num_green,
+            num_red=num_red,
+        )
+        records[key] = record
+
+    # Log the results
+    print(f"The following {len(should_be_enabled_tests)} tests should be re-enabled:")
+    for test_id, stats in should_be_enabled_tests.items():
+        disabled_test_name, name, classname, filename = get_disabled_test_name(test_id)
+        print(f"  {disabled_test_name} from {filename}")
+
+    print(f"The following {len(still_flaky_tests)} are still flaky:")
+    for test_id, stats in still_flaky_tests.items():
+        num_green = stats.get("num_green", 0)
+        num_red = stats.get("num_red", 0)
+
+        disabled_test_name, name, classname, filename = get_disabled_test_name(test_id)
+        print(
+            f"  {disabled_test_name} from {filename}, failing {num_red}/{num_red + num_green}"
+        )
+
+    upload_to_s3(
+        workflow_id,
+        workflow_run_attempt,
+        "rerun_disabled_tests",
+        list(records.values()),
+    )
+
+
+def main(repo: str, workflow_run_id: int, workflow_run_attempt: int) -> None:
+    """
+    Find the list of all disabled tests that should be re-enabled
+    """
+    # Aggregated across all jobs
+    all_tests: Dict[str, Dict[str, int]] = {}
+
+    for report in get_test_reports(
+        args.repo, args.workflow_run_id, args.workflow_run_attempt
+    ):
+        tests = process_report(report)
+        for name, stats in tests.items():
+            if name not in all_tests:
+                all_tests[name] = stats.copy()
+            else:
+                all_tests[name]["num_green"] += stats.get("num_green", 0)
+                all_tests[name]["num_red"] += stats.get("num_red", 0)
+
+    save_results(
+        workflow_run_id,
+        workflow_run_attempt,
+        all_tests,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Upload test artifacts from GHA to S3")
+    parser.add_argument(
+        "--workflow-run-id",
+        type=int,
+        required=True,
+        help="id of the workflow to get artifacts from",
+    )
+    parser.add_argument(
+        "--workflow-run-attempt",
+        type=int,
+        required=True,
+        help="which retry of the workflow this is",
+    )
+    parser.add_argument(
+        "--repo",
+        type=str,
+        required=True,
+        help="which GitHub repo this workflow run belongs to",
+    )
+
+    args = parser.parse_args()
+    main(args.repo, args.workflow_run_id, args.workflow_run_attempt)

From dbc21e78c897959f7a472b47b045dc90cb67a59d Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Thu, 17 Nov 2022 03:36:59 +0000
Subject: [PATCH 1008/1922] Fix typos in .md and .rst files (#88962)

This PR fixes typos `Github` in `.md` and `.rst` files.
`Github` -> `GitHub`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88962
Approved by: https://github.com/kit1980
---
 .github/scripts/README.md                    | 2 +-
 RELEASE.md                                   | 2 +-
 caffe2/contrib/tensorrt/README.md            | 2 +-
 docs/source/community/contribution_guide.rst | 2 +-
 docs/source/masked.rst                       | 2 +-
 docs/source/onnx.rst                         | 2 +-
 docs/source/sparse.rst                       | 4 ++--
 torch/csrc/jit/operator_upgraders/README.md  | 2 +-
 torch/csrc/lazy/tutorial.md                  | 2 +-
 9 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/scripts/README.md b/.github/scripts/README.md
index 73bec509c2c41..cc9e1617b11a7 100644
--- a/.github/scripts/README.md
+++ b/.github/scripts/README.md
@@ -3,7 +3,7 @@
 > NOTE: This README contains information for the `.github` directory but cannot be located there because it will overwrite the
 repo README.
 
-This directory contains workflows and scripts to support our CI infrastructure that runs on Github Actions.
+This directory contains workflows and scripts to support our CI infrastructure that runs on GitHub Actions.
 
 ## Workflows
 
diff --git a/RELEASE.md b/RELEASE.md
index e2b69b5bf82ee..d13ca5d11e100 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -281,7 +281,7 @@ need to support these particular versions of software.
 
 In the event a submodule cannot be fast forwarded and a patch must be applied we can take two different approaches:
 
-* (preferred) Fork the said repository under the pytorch Github organization, apply the patches we need there, and then switch our submodule to accept our fork.
+* (preferred) Fork the said repository under the pytorch GitHub organization, apply the patches we need there, and then switch our submodule to accept our fork.
 * Get the dependencies maintainers to support a release branch for us
 
 Editing submodule remotes can be easily done with: (running from the root of the git repository)
diff --git a/caffe2/contrib/tensorrt/README.md b/caffe2/contrib/tensorrt/README.md
index f1e449e727e94..6ffe1dfb53bc6 100644
--- a/caffe2/contrib/tensorrt/README.md
+++ b/caffe2/contrib/tensorrt/README.md
@@ -15,4 +15,4 @@ For further information please explore `caffe2/python/trt/test_trt.py` test show
 
 ## Questions and Feedback
 
-Please use Github issues (https://github.com/pytorch/pytorch/issues) to ask questions, report bugs, and request new features.
+Please use GitHub issues (https://github.com/pytorch/pytorch/issues) to ask questions, report bugs, and request new features.
diff --git a/docs/source/community/contribution_guide.rst b/docs/source/community/contribution_guide.rst
index a2a89721b64e2..30bd9c6cf9751 100644
--- a/docs/source/community/contribution_guide.rst
+++ b/docs/source/community/contribution_guide.rst
@@ -138,7 +138,7 @@ A great deal of the tutorials on `pytorch.org <https://pytorch.org/>`__
 come from the community itself and we welcome additional contributions.
 To learn more about how to contribute a new tutorial you can learn more
 here: `PyTorch.org Tutorial Contribution Guide on
-Github <https://github.com/pytorch/tutorials/#contributing>`__
+GitHub <https://github.com/pytorch/tutorials/#contributing>`__
 
 Improving Documentation & Tutorials
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/masked.rst b/docs/source/masked.rst
index 10ae8420425f8..3655a6d79fd98 100644
--- a/docs/source/masked.rst
+++ b/docs/source/masked.rst
@@ -157,7 +157,7 @@ Binary Operators
 As you may have seen in the tutorial, :class:`MaskedTensor` also has binary operations implemented with the caveat
 that the masks in the two MaskedTensors must match or else an error will be raised. As noted in the error, if you
 need support for a particular operator or have proposed semantics for how they should behave instead, please open
-an issue on Github. For now, we have decided to go with the most conservative implementation to ensure that users
+an issue on GitHub. For now, we have decided to go with the most conservative implementation to ensure that users
 know exactly what is going on and are being intentional about their decisions with masked semantics.
 
 The available binary operators are:
diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst
index 78ef3cd93663c..fea0b3bc94d29 100644
--- a/docs/source/onnx.rst
+++ b/docs/source/onnx.rst
@@ -594,7 +594,7 @@ all of the unconvertible ops in one go you can::
 The set is approximated because some ops may be removed during the conversion
 process and don't need to be converted. Some other ops may have partial support
 that will fail conversion with particular inputs, but this should give you a
-general idea of what ops are not supported. Please feel free to open Github Issues
+general idea of what ops are not supported. Please feel free to open GitHub Issues
 for op support requests.
 
 Frequently Asked Questions
diff --git a/docs/source/sparse.rst b/docs/source/sparse.rst
index 29790312cb3b8..77e8dabec2744 100644
--- a/docs/source/sparse.rst
+++ b/docs/source/sparse.rst
@@ -10,7 +10,7 @@ torch.sparse
 .. warning::
 
   The PyTorch API of sparse tensors is in beta and may change in the near future.
-  We highly welcome feature requests, bug reports and general suggestions as Github issues.
+  We highly welcome feature requests, bug reports and general suggestions as GitHub issues.
 
 Why and when to use sparsity
 ++++++++++++++++++++++++++++
@@ -40,7 +40,7 @@ Like many other performance optimization sparse storage formats are not
 always advantageous. When trying sparse formats for your use case
 you might find your execution time to decrease rather than increase.
 
-Please feel encouraged to open a Github issue if you analytically
+Please feel encouraged to open a GitHub issue if you analytically
 expected to see a stark increase in performance but measured a
 degradation instead. This helps us prioritize the implementation
 of efficient kernels and wider performance optimizations.
diff --git a/torch/csrc/jit/operator_upgraders/README.md b/torch/csrc/jit/operator_upgraders/README.md
index bf1350aa21f34..75639006e5034 100644
--- a/torch/csrc/jit/operator_upgraders/README.md
+++ b/torch/csrc/jit/operator_upgraders/README.md
@@ -226,7 +226,7 @@ def foo(x, y, z=100):
     return x, y, z
 ```
 
-2. To help understanding the BC/FC breakage changes, here are some FC breaking changes examples. The solution to resolve it is not there yet. If it's desired, please report it in either [PyTorch Forum](https://discuss.pytorch.org/) or [PyTorch Github](https://github.com/pytorch/pytorch). We will prioritize it accordingly.
+2. To help understanding the BC/FC breakage changes, here are some FC breaking changes examples. The solution to resolve it is not there yet. If it's desired, please report it in either [PyTorch Forum](https://discuss.pytorch.org/) or [PyTorch GitHub](https://github.com/pytorch/pytorch). We will prioritize it accordingly.
 
     - Adding new default argument:
     - Adding a new default argument not RIGHT BEFORE the out arguments which can be 0 or more.
diff --git a/torch/csrc/lazy/tutorial.md b/torch/csrc/lazy/tutorial.md
index 6d4e75affc38a..e26c55d2c520e 100644
--- a/torch/csrc/lazy/tutorial.md
+++ b/torch/csrc/lazy/tutorial.md
@@ -283,4 +283,4 @@ This concludes our brief introduction to LT. Hopefully, you'll remember the main
 * It's really tricky to produce such graphs without overburdening a user too much. Think, torch.jit.script, torch.jit.trace! Also, think ifs, fors, "Lions, and Tigers, and Bears, Oh My" We digressed.
 
 
-Please give LT a try and tell us what you think on Github! We are **eager, not lazy** (haha!) to hear from you!
+Please give LT a try and tell us what you think on GitHub! We are **eager, not lazy** (haha!) to hear from you!

From bc17f7d798e78b69a1ba2ab5dc79885bbedef345 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 15 Nov 2022 16:01:29 -0800
Subject: [PATCH 1009/1922] [nn][utils] Preserve requires_grad from original
 weight and bias in fuse conv/linear bn weights (#89100)

Summary:
att, previously we just call nn.Parameter which will have requires_grad=True by default, after
this PR we will preserve the requires_grad

Test Plan:
python test/test_nn.py TestFusionUtils

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D41343694](https://our.internmc.facebook.com/intern/diff/D41343694)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89100
Approved by: https://github.com/ngimel
---
 test/test_nn.py          | 28 ++++++++++++++++++++++++++++
 torch/nn/utils/fusion.py |  8 ++++----
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 2b96838e36014..4231c19ed0dac 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -36,6 +36,8 @@
 import torch.nn.utils.parametrize as parametrize
 import torch.nn.utils.prune as prune
 from torch.nn.utils import parameters_to_vector, vector_to_parameters
+from torch.nn.utils.fusion import fuse_conv_bn_weights
+from torch.nn.utils.fusion import fuse_linear_bn_weights
 from torch.nn import Parameter
 from torch.nn.parallel._functions import Broadcast
 from torch.testing._internal.common_dtype import integral_types, get_all_math_dtypes
@@ -16267,6 +16269,32 @@ def my_post_load_hook(mod, _):
             m.load_state_dict(sd)
             self.assertTrue(called)
 
+class TestFusionUtils(TestCase):
+    def test_fuse_conv_bn_requires_grad(self):
+        conv = torch.nn.Conv2d(3, 3, 3)
+        bn = torch.nn.BatchNorm2d(3)
+        cases = itertools.product([True, False], [True, False])
+        for w_rg, b_rg in cases:
+            conv.weight.requires_grad = w_rg
+            conv.bias.requires_grad = b_rg
+            weight, bias = \
+                fuse_conv_bn_weights(conv.weight, conv.bias,
+                                     bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias)
+            self.assertEqual(weight.requires_grad, w_rg)
+            self.assertEqual(bias.requires_grad, b_rg)
+
+    def test_fuse_linear_bn_requires_grad(self):
+        linear = torch.nn.Linear(3, 3)
+        bn = torch.nn.BatchNorm1d(3)
+        cases = itertools.product([True, False], [True, False])
+        for w_rg, b_rg in cases:
+            linear.weight.requires_grad = w_rg
+            linear.bias.requires_grad = b_rg
+            weight, bias = \
+                fuse_linear_bn_weights(linear.weight, linear.bias,
+                                       bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias)
+            self.assertEqual(weight.requires_grad, w_rg)
+            self.assertEqual(bias.requires_grad, b_rg)
 
 instantiate_device_type_tests(TestNNDeviceType, globals())
 instantiate_parametrized_tests(TestNN)
diff --git a/torch/nn/utils/fusion.py b/torch/nn/utils/fusion.py
index e96c4f7d44260..81b1431c53c9a 100644
--- a/torch/nn/utils/fusion.py
+++ b/torch/nn/utils/fusion.py
@@ -27,10 +27,10 @@ def fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b, trans
     else:
         shape = [-1, 1] + [1] * (len(conv_w.shape) - 2)
 
-    conv_w = conv_w * (bn_w * bn_var_rsqrt).reshape(shape)
-    conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b
+    fused_conv_w = conv_w * (bn_w * bn_var_rsqrt).reshape(shape)
+    fused_conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b
 
-    return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b)
+    return torch.nn.Parameter(fused_conv_w, conv_w.requires_grad), torch.nn.Parameter(fused_conv_b, conv_b.requires_grad)
 
 def fuse_linear_bn_eval(linear, bn):
     assert(not (linear.training or bn.training)), "Fusion only for eval!"
@@ -50,4 +50,4 @@ def fuse_linear_bn_weights(linear_w, linear_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b)
     fused_w = linear_w * bn_scale.unsqueeze(-1)
     fused_b = (linear_b - bn_rm) * bn_scale + bn_b
 
-    return torch.nn.Parameter(fused_w), torch.nn.Parameter(fused_b)
+    return torch.nn.Parameter(fused_w, linear_w.requires_grad), torch.nn.Parameter(fused_b, linear_b.requires_grad)

From 4a0d8746c818689628d449c00bc51b29b2cb736e Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Wed, 16 Nov 2022 10:46:27 -0800
Subject: [PATCH 1010/1922] [xnnpack][lite-int] Freeze/Inline module to remove
 reference to self (#88863)

We need to inline graph before converting from torchscript to xnnpack flatubuffer. Remove graph dependence on self.

This will later help us work with constant data.

Differential Revision: [D41049858](https://our.internmc.facebook.com/intern/diff/D41049858/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88863
Approved by: https://github.com/digantdesai
---
 .../jit/backends/xnnpack/xnnpack_backend_preprocess.cpp  | 9 ++++-----
 .../csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp  | 1 -
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
index f2734a5e529a1..b4b7c912554a5 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_preprocess.cpp
@@ -32,8 +32,9 @@ c10::IValue preprocess(
     const Module& mod,
     const c10::Dict<c10::IValue, c10::IValue>& method_compile_spec,
     const BackendDebugHandleGenerator& generate_debug_handles) {
-  auto output_min = -std::numeric_limits<float>::infinity();
-  auto output_max = std::numeric_limits<float>::infinity();
+  auto eval_mod = mod.clone();
+  eval_mod.eval();
+  eval_mod = torch::jit::freeze(eval_mod);
 
   c10::Dict<IValue, IValue> compiled(StringType::get(), TensorType::get());
 
@@ -62,7 +63,7 @@ c10::IValue preprocess(
       "method_compile_spec does not contain either a Tensor or TensorList, under it's \"outputs\" key.");
 
   // Graph preprocessing
-  const auto& forward_method = mod.get_method("forward");
+  const auto& forward_method = eval_mod.get_method("forward");
 
   auto graph = toGraphFunction(forward_method.function()).graph()->copy();
   graph = tensorexpr::removeUnusedSelfArgument(graph);
@@ -75,7 +76,6 @@ c10::IValue preprocess(
 
     example_inputs.reserve(inp_list.size());
     for (const auto i : c10::irange(inp_list.size())) {
-      graph->inputs()[i]->setType(TensorType::create(inp_list[i]));
       example_inputs.emplace_back(inp_list[i]);
     }
   } else {
@@ -83,7 +83,6 @@ c10::IValue preprocess(
         graph->inputs().size() == 1,
         "method_compile_spec inputs do not match expected number of forward inputs");
 
-    graph->inputs()[0]->setType(TensorType::create(inp.toTensor()));
     example_inputs.emplace_back(inp.toTensor());
   }
 
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
index ec740bd66c509..4eaefea569605 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
@@ -21,7 +21,6 @@ namespace delegate {
 std::shared_ptr<torch::jit::Graph> XNNGraph::optimizeAndTraceGraph(
     std::shared_ptr<torch::jit::Graph> graph,
     std::vector<c10::IValue>& example_inputs) {
-  graph = tensorexpr::removeUnusedSelfArgument(graph);
   OptimizeFrozenGraph(graph, true);
   RemoveListMutation(graph);
   RemoveTensorMutation(graph);

From 16d264f8e8f0d76627adce9fd473c34dbc9e680a Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Wed, 16 Nov 2022 10:46:28 -0800
Subject: [PATCH 1011/1922] [xnnpack][Bug Fix] Pass serialized model by
 reference (#89089)

Two changes
- Remove XNNCompiler Dependence on std::string by passing void*
- Grab ser_model by reference: This bug was causing data pointers given to xnn_runtime to be freed because ser_model was on the stack.

Differential Revision: [D41208380](https://our.internmc.facebook.com/intern/diff/D41208380/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89089
Approved by: https://github.com/digantdesai
---
 torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp | 6 +++---
 torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h   | 2 +-
 torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp   | 5 +++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
index 4147edf90e85d..af9c68df31e80 100644
--- a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
@@ -11,9 +11,9 @@ namespace jit {
 namespace xnnpack {
 namespace delegate {
 
-XNNExecutor XNNCompiler::compileModel(std::string ser_model) {
-  const char* buffer_pointer = ser_model.data();
-
+XNNExecutor XNNCompiler::compileModel(
+    const void* buffer_pointer,
+    size_t num_bytes) {
   auto output_min = -std::numeric_limits<float>::infinity();
   auto output_max = std::numeric_limits<float>::infinity();
 
diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
index 99eecfdcaa45d..625b41e43c141 100644
--- a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
@@ -16,7 +16,7 @@ class XNNCompiler {
   // Takes Flatbuffer Serialized XNNPack Model and rebuilds the xnn-subgraph
   // returns an executor object that holds the xnn runtime object which we
   // can then use to set inputs and run inference using the xnn graph.
-  static XNNExecutor compileModel(std::string ser_model);
+  static XNNExecutor compileModel(const void* buffer_pointer, size_t num_bytes);
 };
 
 } // namespace delegate
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
index a5718820fc198..553e8350ddbd7 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
@@ -40,8 +40,9 @@ class XNNPackBackend : public PyTorchBackendInterface {
     auto dict = processed.toGenericDict();
 
     // Compiling and wrapping exeuction object
-    std::string ser_model = dict.at("ser_model").toStringRef();
-    XNNExecutor executor = XNNCompiler::compileModel(ser_model);
+    const std::string& ser_model = dict.at("ser_model").toStringRef();
+    XNNExecutor executor =
+        XNNCompiler::compileModel(ser_model.data(), ser_model.length());
 
     auto model_ptr = c10::make_intrusive<XNNModelWrapper>(std::move(executor));
     auto runtime_handle = IValue::make_capsule(model_ptr);

From 48eaad19c7c5a42773c67099bca57e02e00d61a8 Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Thu, 17 Nov 2022 04:18:10 +0000
Subject: [PATCH 1012/1922] Fix typos in messages under torch (#89049)

This PR fixes typos of messages in `.py` files under torch directory.
Only in `torch/onnx/symbolic_opset16.py`, fix a typo in comment to make the operator name correct.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89049
Approved by: https://github.com/lezcano
---
 torch/_refs/nn/functional/__init__.py                   | 2 +-
 torch/ao/nn/intrinsic/qat/modules/linear_fused.py       | 2 +-
 torch/ao/quantization/fx/_model_report/model_report.py  | 2 +-
 torch/ao/quantization/observer.py                       | 2 +-
 torch/backends/xeon/run_cpu.py                          | 2 +-
 torch/cuda/memory.py                                    | 2 +-
 torch/distributed/benchmarks/benchmark_ddp_rpc.py       | 2 +-
 torch/distributed/elastic/multiprocessing/api.py        | 4 ++--
 torch/distributed/elastic/rendezvous/etcd_rendezvous.py | 2 +-
 torch/distributions/mixture_same_family.py              | 2 +-
 torch/fx/experimental/accelerator_partitioner.py        | 2 +-
 torch/fx/experimental/graph_gradual_typechecker.py      | 4 ++--
 torch/fx/passes/split_module.py                         | 4 ++--
 torch/jit/frontend.py                                   | 2 +-
 torch/nn/utils/parametrize.py                           | 2 +-
 torch/onnx/symbolic_helper.py                           | 2 +-
 torch/onnx/symbolic_opset16.py                          | 2 +-
 torch/profiler/_pattern_matcher.py                      | 2 +-
 torch/serialization.py                                  | 2 +-
 torch/testing/_internal/common_distributed.py           | 2 +-
 torch/testing/_internal/composite_compliance.py         | 2 +-
 torch/utils/benchmark/examples/fuzzer.py                | 2 +-
 torch/utils/benchmark/examples/sparse/fuzzer.py         | 2 +-
 torch/utils/data/datapipes/dataframe/dataframes.py      | 2 +-
 torch/utils/data/datapipes/iter/callable.py             | 2 +-
 25 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py
index 3848a738d5346..12f44c4092a41 100644
--- a/torch/_refs/nn/functional/__init__.py
+++ b/torch/_refs/nn/functional/__init__.py
@@ -595,7 +595,7 @@ def _nll_loss_nd(
 ) -> TensorLikeType:
     utils.check(
         input.ndim > 0 and input.ndim <= 3,
-        lambda: f"Expected input dimension to be either [1, 2, 3] but recieved {input.ndim}.",
+        lambda: f"Expected input dimension to be either [1, 2, 3] but received {input.ndim}.",
     )
 
     utils.check(
diff --git a/torch/ao/nn/intrinsic/qat/modules/linear_fused.py b/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
index f19dbd9a0f587..7c92c470ba5b9 100644
--- a/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
+++ b/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
@@ -35,7 +35,7 @@ def __init__(self,
                  freeze_bn=False,
                  qconfig=None):
         nn.modules.linear.Linear.__init__(self, in_features, out_features, bias)
-        assert qconfig, 'qconfig must be provded for QAT module'
+        assert qconfig, 'qconfig must be provided for QAT module'
         self.qconfig = qconfig
         self.freeze_bn = freeze_bn if self.training else True
         self.bn = nn.BatchNorm1d(out_features, eps, momentum, True, True)
diff --git a/torch/ao/quantization/fx/_model_report/model_report.py b/torch/ao/quantization/fx/_model_report/model_report.py
index dfe777a540585..ee96dd4bf5a9c 100644
--- a/torch/ao/quantization/fx/_model_report/model_report.py
+++ b/torch/ao/quantization/fx/_model_report/model_report.py
@@ -385,7 +385,7 @@ def _reformat_reports_for_visualizer(self) -> OrderedDict:
                         module_fqns_to_features[module_fqn] = {**new_info, **present_info}
                     else:
                         error_str = "You have the same key with different values across detectors. "
-                        error_str += "Someone incorrectly implemented a detector with conflicting keys to exisiting detectors."
+                        error_str += "Someone incorrectly implemented a detector with conflicting keys to existing detectors."
                         raise ValueError(error_str)
                 else:
                     # we just set it
diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py
index 26a39c8c2e02f..3156b4245a12f 100644
--- a/torch/ao/quantization/observer.py
+++ b/torch/ao/quantization/observer.py
@@ -1019,7 +1019,7 @@ def _non_linear_param_search(self) -> Tuple[torch.Tensor, torch.Tensor]:
         This follows the implementation of NormMinimization::NonlinearQuantizationParamsSearch in
         caffe2/quantization/server/norm_minimization.cc
         """
-        assert self.histogram.size()[0] == self.bins, "bins mistmatch"
+        assert self.histogram.size()[0] == self.bins, "bins mismatch"
         bin_width = (self.max_val - self.min_val) / self.bins
 
         # cumulative sum
diff --git a/torch/backends/xeon/run_cpu.py b/torch/backends/xeon/run_cpu.py
index 69632cb208628..da55a9e605e10 100644
--- a/torch/backends/xeon/run_cpu.py
+++ b/torch/backends/xeon/run_cpu.py
@@ -598,7 +598,7 @@ def create_args(parser=None):
     _add_multi_instance_params(parser)
     # positional
     parser.add_argument("program", type=str,
-                        help="The full path to the proram/script to be launched. "
+                        help="The full path to the program/script to be launched. "
                              "followed by all the arguments for the script")
 
     # rest from the training program
diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py
index 46bdda80bf87c..9f9ae724a15d6 100644
--- a/torch/cuda/memory.py
+++ b/torch/cuda/memory.py
@@ -61,7 +61,7 @@ def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None
     if not isinstance(stream, int):
         raise TypeError('Invalid type for stream argument, must be '
                         '`torch.cuda.Stream` or `int` representing a pointer '
-                        'to a exisiting stream')
+                        'to a existing stream')
     with torch.cuda.device(device):
         return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
 
diff --git a/torch/distributed/benchmarks/benchmark_ddp_rpc.py b/torch/distributed/benchmarks/benchmark_ddp_rpc.py
index e12556f396fb3..6614d3969bfca 100644
--- a/torch/distributed/benchmarks/benchmark_ddp_rpc.py
+++ b/torch/distributed/benchmarks/benchmark_ddp_rpc.py
@@ -335,7 +335,7 @@ def run_worker(rank, world_size):
         "--embedding-dim",
         type=int,
         default=EMBEDDING_DIM,
-        help="Number of embedding dimentions.",
+        help="Number of embedding dimensions.",
     )
     parser.add_argument(
         "--warmup-cycles",
diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py
index 208c9a070e9de..727566fc60390 100644
--- a/torch/distributed/elastic/multiprocessing/api.py
+++ b/torch/distributed/elastic/multiprocessing/api.py
@@ -537,7 +537,7 @@ def _close(self, death_sig: signal.Signals, timeout: int = 30) -> None:
         for proc in self._pc.processes:
             if proc.is_alive():
                 log.warning(
-                    f"Unable to shutdown process {proc.pid} via {death_sig}, forcefully exitting via {_get_kill_signal()}"
+                    f"Unable to shutdown process {proc.pid} via {death_sig}, forcefully exiting via {_get_kill_signal()}"
                 )
                 try:
                     os.kill(proc.pid, _get_kill_signal())
@@ -714,7 +714,7 @@ def _close(self, death_sig: signal.Signals, timeout: int = 30) -> None:
         for handler in self.subprocess_handlers.values():
             if handler.proc.poll() is None:
                 log.warning(
-                    f"Unable to shutdown process {handler.proc.pid} via {death_sig}, forcefully exitting via {_get_kill_signal()}"
+                    f"Unable to shutdown process {handler.proc.pid} via {death_sig}, forcefully exiting via {_get_kill_signal()}"
                 )
                 handler.close(death_sig=_get_kill_signal())
                 handler.proc.wait()
diff --git a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
index 5e11ad1e6d333..a7b682ccc89fa 100644
--- a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
+++ b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
@@ -293,7 +293,7 @@ def rendezvous_barrier(self):
                 time.sleep(1)
 
             except RendezvousTimeoutError:
-                log.info("Rendezvous timeout occured in EtcdRendezvousHandler")
+                log.info("Rendezvous timeout occurred in EtcdRendezvousHandler")
                 raise
 
             except RendezvousClosedError:
diff --git a/torch/distributions/mixture_same_family.py b/torch/distributions/mixture_same_family.py
index 8e0fdce3ada25..dd0beace1917b 100644
--- a/torch/distributions/mixture_same_family.py
+++ b/torch/distributions/mixture_same_family.py
@@ -60,7 +60,7 @@ def __init__(self,
 
         if not isinstance(self._mixture_distribution, Categorical):
             raise ValueError(" The Mixture distribution needs to be an "
-                             " instance of torch.distribtutions.Categorical")
+                             " instance of torch.distributions.Categorical")
 
         if not isinstance(self._component_distribution, Distribution):
             raise ValueError("The Component distribution need to be an "
diff --git a/torch/fx/experimental/accelerator_partitioner.py b/torch/fx/experimental/accelerator_partitioner.py
index 2b17ef2f86c34..5a007314d628b 100644
--- a/torch/fx/experimental/accelerator_partitioner.py
+++ b/torch/fx/experimental/accelerator_partitioner.py
@@ -696,7 +696,7 @@ def find_partition_to_combine_based_on_size(
             return find_combination, partitions
 
         def reset_partition_in_sparse_nn(partition, new_partition=True):
-            """If crossing the boudary between non-embedding nodes and
+            """If crossing the boundary between non-embedding nodes and
             embedding nodes, create a new partition
             """
             if in_embedding_region:
diff --git a/torch/fx/experimental/graph_gradual_typechecker.py b/torch/fx/experimental/graph_gradual_typechecker.py
index 6094952f1695e..7ffabc9c6996b 100644
--- a/torch/fx/experimental/graph_gradual_typechecker.py
+++ b/torch/fx/experimental/graph_gradual_typechecker.py
@@ -184,7 +184,7 @@ def get_attr_inference_rule(n: Node, traced):
     if attr_name == "shape":
         n.type = Dyn
     else:
-        raise TypeError("Not yet implelemted")
+        raise TypeError("Not yet implemented")
 
     # TODO. We leave it like this till we add a type to represent tensor sizes
     return n.type
@@ -507,7 +507,7 @@ def flatten_check(tensor_type, start_dim, end_dim):
         new_type_list = lhs + mid + rhs
         return TensorType(tuple(new_type_list))
     else:
-        raise TypeError(f'Incompatable dimentions {start_dim}, {end_dim - 1} in type {tensor_type}')
+        raise TypeError(f'Incompatable dimensions {start_dim}, {end_dim - 1} in type {tensor_type}')
 
 @register_inference_rule(torch.flatten)
 def flatten_inference_rule(n: Node):
diff --git a/torch/fx/passes/split_module.py b/torch/fx/passes/split_module.py
index 251fdadea7e27..c6954c2cc7177 100644
--- a/torch/fx/passes/split_module.py
+++ b/torch/fx/passes/split_module.py
@@ -28,8 +28,8 @@ def __repr__(self) -> str:
             f" nodes: {self.node_names},\n"
             f" inputs: {self.inputs},\n"
             f" outputs: {self.outputs},\n"
-            f" partitions depenent on: {self.partitions_dependent_on},\n"
-            f" parition dependents: {self.partition_dependents}"
+            f" partitions dependent on: {self.partitions_dependent_on},\n"
+            f" partition dependents: {self.partition_dependents}"
         )
 
 
diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py
index 4b5e3d68f75cd..44a8628f77d5c 100644
--- a/torch/jit/frontend.py
+++ b/torch/jit/frontend.py
@@ -614,7 +614,7 @@ def build_AugAssign(ctx, stmt):
         else:
             raise NotSupportedError(
                 find_before(ctx, rhs.range().start, '=', offsets=(-1, 0)),
-                "unsupported kind of augumented assignment: " + op.__name__)
+                "unsupported kind of augmented assignment: " + op.__name__)
         return AugAssign(lhs, op_token, rhs)
 
     @staticmethod
diff --git a/torch/nn/utils/parametrize.py b/torch/nn/utils/parametrize.py
index 17de23a97a4ac..801a1e80c1aac 100644
--- a/torch/nn/utils/parametrize.py
+++ b/torch/nn/utils/parametrize.py
@@ -242,7 +242,7 @@ def right_inverse(self, value: Tensor) -> None:
                 if len(value) != self.ntensors:
                     raise ValueError(
                         "'right_inverse' must return a sequence of tensors of length "
-                        f"{self.ntensors}. Got a sequence of lenght {len(value)}."
+                        f"{self.ntensors}. Got a sequence of length {len(value)}."
                     )
                 for i, tensor in enumerate(value):
                     original_i = getattr(self, f"original{i}")
diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py
index 84224e88d86e3..a27db1e2a327b 100644
--- a/torch/onnx/symbolic_helper.py
+++ b/torch/onnx/symbolic_helper.py
@@ -1308,7 +1308,7 @@ def _index_fill_reshape_helper(g: jit_utils.GraphContext, self, dim, index):
         from torch.onnx.symbolic_opset11 import scatter  # type: ignore[no-redef]
 
     if self.type().dim() is None:
-        return _unimplemented("index_fill", "input rank not accesible")
+        return _unimplemented("index_fill", "input rank not accessible")
     self_dim = self.type().dim()
     dim_value = _parse_arg(dim, "i")
     unsqueezed_index = _unsqueeze_helper(
diff --git a/torch/onnx/symbolic_opset16.py b/torch/onnx/symbolic_opset16.py
index a2d3505072bac..75cb96890a12f 100644
--- a/torch/onnx/symbolic_opset16.py
+++ b/torch/onnx/symbolic_opset16.py
@@ -15,7 +15,7 @@
     PRelu
     RoiAlign
     Scan
-    ScatterElemenets
+    ScatterElements
     ScatterND
     Where
     GreaterOrEqual
diff --git a/torch/profiler/_pattern_matcher.py b/torch/profiler/_pattern_matcher.py
index 3cec84df219ba..6c06bf2b2861b 100644
--- a/torch/profiler/_pattern_matcher.py
+++ b/torch/profiler/_pattern_matcher.py
@@ -161,7 +161,7 @@ class ExtraCUDACopyPattern(Pattern):
     def __init__(self, prof: profile, should_benchmark: bool = False):
         super().__init__(prof, should_benchmark)
         self.name = "Extra CUDA Copy Pattern"
-        self.description = "Filled a CPU tensor and immediately moved it to GPU. Please initalize it on GPU."
+        self.description = "Filled a CPU tensor and immediately moved it to GPU. Please initialize it on GPU."
         self.url = "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#create-tensors-directly-on-the-target-device"
         self.init_ops = {
             "aten::fill_", "aten::zero_", "aten::normal_", "aten::uniform_"
diff --git a/torch/serialization.py b/torch/serialization.py
index 5f9eda67648b4..b9fc92b5110cf 100644
--- a/torch/serialization.py
+++ b/torch/serialization.py
@@ -773,7 +773,7 @@ def load(
 
     if weights_only:
         if pickle_module is not None:
-            raise RuntimeError("Can not safely load weights when expiclit picke_module is specified")
+            raise RuntimeError("Can not safely load weights when explicit picke_module is specified")
     else:
         if pickle_module is None:
             pickle_module = pickle
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 9dcb71ae0907f..272dd7479ce5e 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -333,7 +333,7 @@ def wrapper(*args, **kwargs):
 def skip_if_win32():
     return sandcastle_skip_if(
         sys.platform == "win32",
-        "This unit test case is not supportted on Windows platform",
+        "This unit test case is not supported on Windows platform",
     )
 
 
diff --git a/torch/testing/_internal/composite_compliance.py b/torch/testing/_internal/composite_compliance.py
index 0eaab2e1796d8..5d7de4e2328ab 100644
--- a/torch/testing/_internal/composite_compliance.py
+++ b/torch/testing/_internal/composite_compliance.py
@@ -311,7 +311,7 @@ def generate_subclass_choices_args_kwargs(args, kwargs, CCT, cct_mode):
 
 def raise_composite_compliance_error(err, additional_info=''):
     raise RuntimeError(
-        "Composite compilance check failed with "
+        "Composite compliance check failed with "
         "the above error.\n"
         f"{additional_info}"
         "If you are adding an OpInfo of an "
diff --git a/torch/utils/benchmark/examples/fuzzer.py b/torch/utils/benchmark/examples/fuzzer.py
index 4446e2d85c0a2..9728bf3d26c9f 100644
--- a/torch/utils/benchmark/examples/fuzzer.py
+++ b/torch/utils/benchmark/examples/fuzzer.py
@@ -65,7 +65,7 @@ def main():
     print()
 
     # More string munging to make pretty output.
-    print(f"Average attemts per valid config: {1. / (1. - add_fuzzer.rejection_rate):.1f}")
+    print(f"Average attempts per valid config: {1. / (1. - add_fuzzer.rejection_rate):.1f}")
 
     def time_fn(m):
         return m.median / m.metadata["numel"]
diff --git a/torch/utils/benchmark/examples/sparse/fuzzer.py b/torch/utils/benchmark/examples/sparse/fuzzer.py
index 8e2bf554c42a5..38421474ccf82 100644
--- a/torch/utils/benchmark/examples/sparse/fuzzer.py
+++ b/torch/utils/benchmark/examples/sparse/fuzzer.py
@@ -80,7 +80,7 @@ def main():
     print()
 
     # More string munging to make pretty output.
-    print(f"Average attemts per valid config: {1. / (1. - add_fuzzer.rejection_rate):.1f}")
+    print(f"Average attempts per valid config: {1. / (1. - add_fuzzer.rejection_rate):.1f}")
 
     def time_fn(m):
         return m.mean / m.metadata["nnz"]
diff --git a/torch/utils/data/datapipes/dataframe/dataframes.py b/torch/utils/data/datapipes/dataframe/dataframes.py
index fcbf15328e43c..3a7cbb44feaf8 100644
--- a/torch/utils/data/datapipes/dataframe/dataframes.py
+++ b/torch/utils/data/datapipes/dataframe/dataframes.py
@@ -408,7 +408,7 @@ def collate(self, *args, **kwargs):
 
     def __getattr__(self, attrname):  # ?
         if attrname in UNIMPLEMENTED_ATTR:
-            raise AttributeError('Attemping to get ', attrname)
+            raise AttributeError('Attempting to get ', attrname)
         if attrname in DATAPIPES_OPS:
             return (self.as_datapipe()).__getattr__(attrname)
         return super().__getattr__(attrname)
diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py
index 30b04885787ac..f0f91dee34b46 100644
--- a/torch/utils/data/datapipes/iter/callable.py
+++ b/torch/utils/data/datapipes/iter/callable.py
@@ -155,7 +155,7 @@ def _collate_helper(conversion, item):
                 import torcharrow.pytorch as tap  # type: ignore[import]
                 collation_fn = tap.rec.Default()
             except Exception:
-                raise Exception("unable to import default collation function from the TorchArrrow")
+                raise Exception("unable to import default collation function from the TorchArrow")
 
         tuple_names.append(str(name))
         value = collation_fn(df[name])

From 38a80c92f3199a95d0c7e718e019e0d4caa3d82b Mon Sep 17 00:00:00 2001
From: Colin Taylor <colin2328@meta.com>
Date: Thu, 17 Nov 2022 04:26:10 +0000
Subject: [PATCH 1013/1922] [torchrec] [composable] update
 ShardedEmbeddingBagCollection to be use registered EBCs with shardedTensors
 as registered modules (#758) (#88026)

Summary:
X-link: https://github.com/pytorch/torchrec/pull/758

This PR fixes a bug in FSDP/DDP, where ShardedTensors are not supported even if passed in as params to ignore.
this is important for composability because TorchRec named_parameters() will return FQN of shardedTensors (as defined in goals)
It defines device of ShardedTensor to be None when local_tensor() does not exist on rank

update ShardedEmbeddingBagCollection to be composable according to https://docs.google.com/document/d/1TBJSd5zgEg6cRcXv3Okuj7bBkqQwGS2IPh4TLWNNzFI/edit

Differential Revision: D40458625

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88026
Approved by: https://github.com/wanchaol, https://github.com/rohan-varma
---
 test/distributed/test_c10d_gloo.py            | 62 +++++++++++++++++--
 .../_shard/sharded_tensor/_ops/tensor_ops.py  | 11 +++-
 .../distributed/_shard/sharded_tensor/api.py  |  8 ++-
 torch/nn/parallel/distributed.py              | 24 ++++---
 4 files changed, 82 insertions(+), 23 deletions(-)

diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index ba214a02696f9..c0a25fff9d822 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -23,28 +23,35 @@
 import torch.nn.functional as F
 import torch.testing._internal.common_utils as common
 from test_c10d_common import (
-    LOOPBACK,
     gpus_for_rank,
-    Task,
+    LOOPBACK,
     ModuleForDdpCommHook,
     SparseGradientModule,
+    Task,
 )
 from torch import nn
+from torch.distributed._shard.sharded_tensor import (
+    init_from_local_shards,
+    Shard,
+    ShardedTensor,
+    ShardMetadata,
+)
 from torch.nn.parallel import DistributedDataParallel
+from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor
 from torch.testing._internal.common_distributed import (
+    create_device,
     MultiProcessTestCase,
     requires_gloo,
-    skip_if_lt_x_gpu,
     simple_sparse_reduce_tests,
+    skip_if_lt_x_gpu,
     skip_if_win32,
-    create_device,
     verify_ddp_error_logged,
 )
 from torch.testing._internal.common_utils import (
-    TestCase,
-    run_tests,
     retry_on_connect_failures,
+    run_tests,
     sandcastle_skip,
+    TestCase,
 )
 
 
@@ -1754,6 +1761,49 @@ def forward(self, x):
             loss = criterion(output, target)
             loss.backward()
 
+    @requires_gloo()
+    @skip_if_lt_x_gpu(2)
+    def test_ignored_sharded_tensor(self):
+        class MyModule(nn.Module):
+            def __init__(self, shard_tensor: ShardedTensor) -> None:
+                super().__init__()
+                self.fc1 = nn.Linear(2, 10, bias=False)
+                self.st = nn.Parameter(shard_tensor)
+                self.relu = nn.ReLU()
+
+            def forward(self, x):
+                x = self.relu(self.fc1(x))
+                return F.softmax(x, dim=1)
+        pg = dist.init_process_group(
+            "gloo",
+            init_method=f"file://{self.file_name}",
+            world_size=self.world_size,
+            rank=self.rank,
+        )
+        device = torch.device(f"cuda:{self.rank}")
+        local_shard_metadata = ShardMetadata(
+            shard_offsets=[(self.rank % 2) * 5, 0],
+            shard_sizes=[5, 10],
+            placement=f"rank:{self.rank}/cuda:{self.rank}"
+        )
+        local_shards = [Shard(torch.randn(5, 10, device=device), local_shard_metadata)]
+        st = init_from_local_shards(local_shards, [10, 10])
+        m = MyModule(st)
+        with _ddp_replicated_tensor(False):
+            DistributedDataParallel._set_params_and_buffers_to_ignore_for_model(
+                module=m,
+                params_and_buffers_to_ignore={'st'}
+            )
+            # test to make DDP constructor will not fail when module includes a ShardedTensor when ignored
+            DistributedDataParallel(
+                m,
+                device_ids=[device] if device.type == "gpu" else None,
+                process_group=pg,
+                gradient_as_bucket_view=True,
+                broadcast_buffers=False,
+                static_graph=True,
+            )
+
     def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model):
         mult = 2
         batch_size = mult * self.world_size
diff --git a/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py b/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
index e52c29238a62b..fbdeb553cc289 100644
--- a/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
+++ b/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
@@ -42,9 +42,14 @@ def tensor_device(types, args=(), kwargs=None, pg=None):
     # Validate types
     if not isinstance(self_st, ShardedTensor):
         raise TypeError("input needs to be a ShardedTensor")
-
-    return self_st.local_shards()[0].tensor.device
-
+    dev: torch.device
+    if self_st._local_shards:
+        dev = self_st._local_shards[0].tensor.device
+    elif pg and pg._get_backend_name() == "gloo":
+        dev = torch.device("cpu")
+    else:
+        dev = torch.device(torch.cuda.current_device())
+    return dev
 
 @_sharded_op_impl(torch.Tensor.is_meta.__get__)  # type: ignore[attr-defined]
 def st_is_meta(types, args=(), kwargs=None, pg=None):
diff --git a/torch/distributed/_shard/sharded_tensor/api.py b/torch/distributed/_shard/sharded_tensor/api.py
index 156423c65c112..36ab5d6969a30 100644
--- a/torch/distributed/_shard/sharded_tensor/api.py
+++ b/torch/distributed/_shard/sharded_tensor/api.py
@@ -630,7 +630,13 @@ def cuda(
         return st_cuda
 
     def to(self, *args, **kwargs) -> ShardedTensor:
-        current_device = self._local_shards[0].tensor.device
+        current_device: torch.device
+        if self._local_shards:
+            current_device = self._local_shards[0].tensor.device
+        elif self._process_group._get_backend_name() == "gloo":
+            current_device = torch.device("cpu")
+        else:
+            current_device = torch.device(torch.cuda.current_device())
         current_dtype = self.dtype
         device_to = current_device
         dtype_to = current_dtype
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index c29a0a7ef46bb..47eb6bb2ebf1c 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -553,11 +553,15 @@ def __init__(
         gradient_as_bucket_view=False,
         static_graph=False,
     ):
-
         super(DistributedDataParallel, self).__init__()
         Joinable.__init__(self)
         self.logger = None
-        if not any((p.requires_grad for p in module.parameters())):
+        if hasattr(module, "_ddp_params_and_buffers_to_ignore"):
+            self.parameters_to_ignore = set(module._ddp_params_and_buffers_to_ignore)
+        else:
+            self.parameters_to_ignore = set()
+        self._module_parameters = [p for n, p in module.named_parameters() if n not in self.parameters_to_ignore]
+        if not any((p.requires_grad for p in self._module_parameters)):
             self._log_and_throw(
                 RuntimeError,
                 "DistributedDataParallel is not needed when a module "
@@ -570,10 +574,8 @@ def __init__(
                 "device_ids can only be None or contain a single element.",
             )
 
-        self.is_multi_device_module = (
-            len({p.device for p in module.parameters()}) > 1
-        )
-        distinct_device_types = {p.device.type for p in module.parameters()}
+        self.is_multi_device_module = len({p.device for p in self._module_parameters}) > 1
+        distinct_device_types = {p.device.type for p in self._module_parameters if p.device is not None}
         if len(distinct_device_types) != 1:
             self._log_and_throw(
                 ValueError,
@@ -599,7 +601,7 @@ def __init__(
                     "but got device_ids {}, output_device {}, and module parameters {}.".format(
                         device_ids,
                         output_device,
-                        {p.device for p in module.parameters()},
+                        {p.device for p in self._module_parameters},
                     ),
                 )
 
@@ -621,16 +623,12 @@ def __init__(
         self.static_graph = False
         self.dim = dim
         self.module = module
-        self.device = list(self.module.parameters())[0].device
+        self.device = list(self._module_parameters)[0].device
         self.broadcast_buffers = broadcast_buffers
         self.find_unused_parameters = find_unused_parameters
         self.require_backward_grad_sync = True
         self.require_forward_param_sync = True
         self.gradient_as_bucket_view = gradient_as_bucket_view
-        if hasattr(module, "_ddp_params_and_buffers_to_ignore"):
-            self.parameters_to_ignore = module._ddp_params_and_buffers_to_ignore
-        else:
-            self.parameters_to_ignore = []
 
         self._use_replicated_tensor_module = (
             _ddp_with_replicated_tensor_enabled()
@@ -647,7 +645,7 @@ def __init__(
             )
 
         # Check that a module does not have Uninitialized parameters
-        for param in module.parameters():
+        for param in self._module_parameters:
             if isinstance(param, torch.nn.parameter.UninitializedParameter):
                 self._log_and_throw(
                     RuntimeError,

From c0641127889cd68fece15174ec89281bcd66fa70 Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Wed, 16 Nov 2022 10:46:30 -0800
Subject: [PATCH 1014/1922] [xnnpack][executorch] Pass xnnexecutor pointer to
 compileModel() (#89090)

Here we pass XNNExecutor* to compile model so that XNNExecutor can be allocated by runtime. This signature change is for executorch:

```
XNNExecutor compileModel(void* buffer) --> void compileModel(void* buffer, XNNExecutor* executor)
```

The intended usecase for allocating Executor and Compiling the serialized flatbuffer:

```
XNNExecutor* executor = runtime_allocator->allocateList<jit::xnnpack::delegate::XNNExecutor>(1);
XNNCompiler::compileModel(processed.buffer, executor);

```

Differential Revision: [D41208387](https://our.internmc.facebook.com/intern/diff/D41208387/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89090
Approved by: https://github.com/digantdesai
---
 .../backends/xnnpack/compiler/xnn_compiler.cpp    | 15 ++++++++-------
 .../jit/backends/xnnpack/compiler/xnn_compiler.h  |  5 ++++-
 .../jit/backends/xnnpack/executor/xnn_executor.h  | 13 +++++++------
 .../jit/backends/xnnpack/xnnpack_backend_lib.cpp  |  4 ++--
 4 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
index af9c68df31e80..49e2804c99a93 100644
--- a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
@@ -11,9 +11,10 @@ namespace jit {
 namespace xnnpack {
 namespace delegate {
 
-XNNExecutor XNNCompiler::compileModel(
+void XNNCompiler::compileModel(
     const void* buffer_pointer,
-    size_t num_bytes) {
+    size_t num_bytes,
+    XNNExecutor* executor) {
   auto output_min = -std::numeric_limits<float>::infinity();
   auto output_max = std::numeric_limits<float>::infinity();
 
@@ -109,17 +110,17 @@ XNNExecutor XNNCompiler::compileModel(
   status = xnn_create_runtime_v2(subgraph_ptr, nullptr, 0, &runtime_ptr);
   TORCH_CHECK(xnn_status_success == status);
 
-  XNNExecutor executor(runtime_ptr);
+  executor->runtime_ =
+      std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>(
+          runtime_ptr, xnn_delete_runtime);
 
   for (auto old_id : *flatbuffer_graph->input_ids()) {
-    executor.input_ids_.push_back(remapped_ids.at(old_id));
+    executor->input_ids_.emplace_back(remapped_ids.at(old_id));
   }
 
   for (auto old_id : *flatbuffer_graph->output_ids()) {
-    executor.output_ids_.push_back(remapped_ids.at(old_id));
+    executor->output_ids_.emplace_back(remapped_ids.at(old_id));
   }
-
-  return executor;
 };
 
 } // namespace delegate
diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
index 625b41e43c141..e87fcbcd063d9 100644
--- a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
@@ -16,7 +16,10 @@ class XNNCompiler {
   // Takes Flatbuffer Serialized XNNPack Model and rebuilds the xnn-subgraph
   // returns an executor object that holds the xnn runtime object which we
   // can then use to set inputs and run inference using the xnn graph.
-  static XNNExecutor compileModel(const void* buffer_pointer, size_t num_bytes);
+  static void compileModel(
+      const void* buffer_pointer,
+      size_t num_bytes,
+      XNNExecutor* executor);
 };
 
 } // namespace delegate
diff --git a/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h b/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h
index f82bde231c90f..2521c0c7749d8 100644
--- a/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h
+++ b/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h
@@ -1,5 +1,5 @@
 // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-
+#pragma once
 #include <xnnpack.h>
 #include <memory>
 #include <vector>
@@ -11,14 +11,15 @@ namespace delegate {
 
 class XNNExecutor {
  private:
-  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> runtime_;
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> runtime_{
+      nullptr,
+      &xnn_delete_runtime};
   std::vector<uint32_t> input_ids_;
   std::vector<uint32_t> output_ids_;
   std::vector<xnn_external_value> externals_;
 
  public:
-  XNNExecutor(xnn_runtime_t runtime_ptr)
-      : runtime_(runtime_ptr, xnn_delete_runtime){};
+  XNNExecutor() = default;
 
   template <typename T>
   bool set_inputs(std::vector<T*>& inputs, std::vector<T*>& outputs) {
@@ -41,7 +42,7 @@ class XNNExecutor {
     }
 
     return true;
-  };
+  }
 
   bool forward() {
     xnn_status status =
@@ -58,7 +59,7 @@ class XNNExecutor {
     }
 
     return true;
-  };
+  }
 
   friend class XNNCompiler;
 };
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
index 553e8350ddbd7..46c7458039d47 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_backend_lib.cpp
@@ -41,8 +41,8 @@ class XNNPackBackend : public PyTorchBackendInterface {
 
     // Compiling and wrapping exeuction object
     const std::string& ser_model = dict.at("ser_model").toStringRef();
-    XNNExecutor executor =
-        XNNCompiler::compileModel(ser_model.data(), ser_model.length());
+    XNNExecutor executor;
+    XNNCompiler::compileModel(ser_model.data(), ser_model.length(), &executor);
 
     auto model_ptr = c10::make_intrusive<XNNModelWrapper>(std::move(executor));
     auto runtime_handle = IValue::make_capsule(model_ptr);

From 57666be98784230f3b7f525f39ebd3598ead8e94 Mon Sep 17 00:00:00 2001
From: ecao <e.cao@intel.com>
Date: Thu, 17 Nov 2022 04:47:45 +0000
Subject: [PATCH 1015/1922] Fix empty input issue of convolution for channels
 last memory format (#86521)

Fixes empty input convolution issue : when input is empty e.g. shape of (0, 3, 3, 4) and weight is channels last format, at::_unsafe_view will raise "view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead."

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86521
Approved by: https://github.com/jgong5, https://github.com/malfet
---
 aten/src/ATen/native/Convolution.cpp | 22 +++++++++++++++++---
 test/test_nn.py                      | 30 ++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index bf7017f20a4fd..8584bae445ad7 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -11,11 +11,15 @@
 #include <ATen/native/xnnpack/Engine.h>
 #include <c10/util/accumulate.h>
 #include <c10/util/irange.h>
-
 #include <c10/macros/Macros.h>
-
 #include <limits>
 
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/permute.h>
+#endif
+
 #if AT_NNPACK_ENABLED()
 #include <nnpack.h>
 #endif
@@ -1508,7 +1512,19 @@ at::Tensor _convolution(
       break;
     case ConvBackend::Empty:
     {
-      auto weight_view = at::_unsafe_view(weight, -1);
+      Tensor weight_view;
+      // Use permute and clone to avoid at::_unsafe_view(weight, -1) failure for non-contiguous cases where
+      // view size is not compatible with input tensor's size and stride.
+      if(weight.is_contiguous()) {
+        weight_view = at::_unsafe_view(weight, -1);
+      } else if (weight.is_contiguous(at::MemoryFormat::ChannelsLast)) {
+        weight_view = at::_unsafe_view(at::permute(weight, {0, 2, 3, 1}), -1);
+      } else if (weight.is_contiguous(at::MemoryFormat::ChannelsLast3d)) {
+        weight_view = at::_unsafe_view(at::permute(weight, {0, 2, 3, 4, 1}), -1);
+      } else {
+        weight_view = at::_unsafe_view(weight.clone(at::MemoryFormat::Contiguous), -1);
+      }
+
       output = (input.size(1) == 0) ? (input.view(-1) * weight_view) : (input * weight_view[0]);
       if (bias.defined()) {
         output.add_(bias[0]);
diff --git a/test/test_nn.py b/test/test_nn.py
index 4231c19ed0dac..7d6a016a6f51a 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -11736,6 +11736,36 @@ def test_batchnorm_large_batch(self, device, dtype):
         data = torch.rand(880801, 1, 1, 1, device=device, dtype=dtype)
         out = bn(data).sum().backward()
 
+    @dtypesIfCUDA(torch.float, torch.double, torch.half, torch.complex128)
+    @dtypes(torch.float, torch.double, torch.bfloat16, torch.complex128)
+    def test_conv_empty_input(self, device, dtype):
+        def help(input, conv, memory_format):
+            ref_out = conv(input)
+            conv_cl = conv.to(memory_format=memory_format)
+            out_cl = conv_cl(input)
+            self.assertEqual(ref_out, out_cl)
+            input_cl = input.to(memory_format=memory_format)
+            out_cl2 = conv(input_cl)
+            self.assertEqual(out_cl, out_cl2)
+            out_cl3 = conv_cl(input_cl)
+            self.assertEqual(out_cl, out_cl3)
+
+        # channels_last case
+        input2d = torch.randn((0, 4, 20, 20)).to(device=device, dtype=dtype)
+        conv2d = torch.nn.Conv2d(4, 4, 3, 1).to(device=device, dtype=dtype)
+        help(input2d, conv2d, torch.channels_last)
+        # channels_last_3d case
+        input3d = torch.randn((0, 4, 20, 20, 20)).to(device=device, dtype=dtype)
+        conv3d = torch.nn.Conv3d(4, 4, 3, 1).to(device=device, dtype=dtype)
+        help(input3d, conv3d, torch.channels_last_3d)
+        # non-contiguous case
+        weight = torch.rand(4, 8, 3, 3)[:, ::2, :, :].to(device=device, dtype=dtype)
+        bias = torch.rand(4).to(device=device, dtype=dtype)
+        out = F.conv2d(input2d, weight, bias, (1, 1), 0, (1, 1), 1)
+        weight = weight.contiguous()
+        out_ref = F.conv2d(input2d, weight, bias, (1, 1), 0, (1, 1), 1)
+        self.assertEqual(out_ref, out)
+
     def test_InstanceNorm1d_general(self, device):
         b = random.randint(3, 5)
         c = random.randint(3, 5)

From 29a9a7042b239fcd5b6b77a764ef564e99a3ef92 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 17 Nov 2022 04:58:53 +0000
Subject: [PATCH 1016/1922] Revert "Redefine the simdlen semantic: (#88482)"

This reverts commit fce6d6b3dcc879720bc45143426b86232106818a.

Reverted https://github.com/pytorch/pytorch/pull/88482 on behalf of https://github.com/kit1980 due to Broke multiple tests in several trunk workflows, for example https://github.com/pytorch/pytorch/actions/runs/3485086792/jobs/5830429554
---
 test/inductor/test_torchinductor.py |  94 +-----------
 torch/_inductor/codecache.py        | 215 +++++++---------------------
 torch/_inductor/codegen/common.py   |   6 -
 torch/_inductor/codegen/cpp.py      |  92 +++---------
 4 files changed, 80 insertions(+), 327 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index f9aa93f4a7e63..fb7ca1fc92b73 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4529,11 +4529,7 @@ def fn(x):
 
             v = torch.randn(10)
             result = fn(v)
-            # TODO: OMP parallel reduction order is not deterministic.
-            # Hence, the accurarcy might vary up and down. For short term,
-            # we increase the tolerance and will fix it later by using
-            # aten parallel.
-            assert same(result, mod(v), tol=5e-1)
+            assert same(result, mod(v))
 
         def test_inplace_add_alpha(self):
             def fn(x, y):
@@ -4603,79 +4599,7 @@ def test_complex_memory_overlap(self):
             self.assertFalse(complex_memory_overlap(gathered.t()))
 
         @unittest.skipIf(
-            not codecache.valid_vec_isa_list(), "Does not support vectorization"
-        )
-        @patch.object(config, "dynamic_shapes", True)
-        @patch.object(torch._dynamo.config, "dynamic_shapes", True)
-        @patch.object(functorch_config, "use_dynamic_shapes", True)
-        def test_vec_dynamic_shapes(self):
-            def fn(x):
-                return torch.softmax(x, -1)
-
-            value = torch.randn((2, 10))
-            with patch.object(config.cpp, "simdlen", None):
-                torch._dynamo.reset()
-                metrics.reset()
-                opt_fn = torch._dynamo.optimize("inductor")(fn)
-                opt_fn(value)
-
-                real_out = fn(value)
-                compiled_out = opt_fn(value)
-                assert same(real_out, compiled_out, equal_nan=True)
-                assert metrics.generated_cpp_vec_kernel_count < 1
-
-        @unittest.skipIf(
-            not codecache.valid_vec_isa_list(), "Does not support vectorization"
-        )
-        @patch("torch.cuda.is_available", lambda: False)
-        def test_auto_simd(self):
-            vec_avx512 = codecache.supported_vec_isa_list[0]
-            vec_avx2 = codecache.supported_vec_isa_list[1]
-            self.assertTrue(vec_avx512.bit_width() == 512)
-            self.assertTrue(vec_avx2.bit_width() == 256)
-            self.assertTrue(vec_avx512.nelements() == 16)
-            self.assertTrue(vec_avx2.nelements() == 8)
-            self.assertTrue(vec_avx512.nelements(torch.bfloat16) == 32)
-            self.assertTrue(vec_avx2.nelements(torch.bfloat16) == 16)
-
-            with patch.object(config.cpp, "simdlen", None):
-                isa = codecache.pick_vec_isa()
-                if vec_avx512 in codecache.valid_vec_isa_list():
-                    self.assertTrue(isa == vec_avx512)
-                else:
-                    self.assertTrue(isa == vec_avx2)
-
-            with patch.object(config.cpp, "simdlen", 0):
-                isa = codecache.pick_vec_isa()
-                self.assertFalse(isa)
-
-            with patch.object(config.cpp, "simdlen", 1):
-                isa = codecache.pick_vec_isa()
-                self.assertFalse(isa)
-
-            with patch.object(config.cpp, "simdlen", 257):
-                isa = codecache.pick_vec_isa()
-                self.assertFalse(isa)
-
-            with patch.object(config.cpp, "simdlen", 513):
-                isa_list = codecache.valid_vec_isa_list()
-                if vec_avx512 in isa_list:
-                    self.assertFalse(isa)
-
-            with patch.object(config.cpp, "simdlen", 512):
-                isa_list = codecache.valid_vec_isa_list()
-                if vec_avx512 in isa_list:
-                    isa = codecache.pick_vec_isa()
-                    self.assertTrue(isa == vec_avx512)
-
-            with patch.object(config.cpp, "simdlen", 256):
-                isa_list = codecache.valid_vec_isa_list()
-                if vec_avx2 in isa_list:
-                    isa = codecache.pick_vec_isa()
-                    self.assertTrue(isa == vec_avx2)
-
-        @unittest.skipIf(
-            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+            not codecache.get_cpu_proc_info(), "Does not support vectorization"
         )
         @patch("torch.cuda.is_available", lambda: False)
         def test_sign_cpu_only(self):
@@ -4686,7 +4610,7 @@ def fn(x):
             x[0, 0] = torch.nan
             x[1, -1] = torch.nan
 
-            with patch.object(config.cpp, "simdlen", None):
+            with patch.object(config.cpp, "simdlen", 8):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x)
@@ -4699,7 +4623,7 @@ def fn(x):
         # other platforms support, we just need to add the ISA info to the supported_vector_isa
         # and include proper aten vectorization head file.
         @unittest.skipIf(
-            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+            not codecache.get_cpu_proc_info(), "Does not support vectorization"
         )
         @patch("torch.cuda.is_available", lambda: False)
         def test_vec_kernel_cpu_only(self):
@@ -4738,15 +4662,7 @@ def fn(x1, x2):
             x1 = torch.randn((10, 20))
             x2 = torch.randn((10, 20))
 
-            with patch.object(config.cpp, "simdlen", 1):
-                torch._dynamo.reset()
-                metrics.reset()
-                traced = make_fx(fn)(x1, x2)
-                compiled = compile_fx_inner(traced, [x1, x2])
-                assert same(fn(x1, x2)[0], compiled([x1, x2])[0], equal_nan=True)
-                assert metrics.generated_cpp_vec_kernel_count == 0
-
-            with patch.object(config.cpp, "simdlen", None):
+            with patch.object(config.cpp, "simdlen", 8):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x1, x2)
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 232a611b06c6a..2826f35999126 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -1,5 +1,5 @@
 import base64
-import dataclasses
+import enum
 import functools
 import getpass
 import hashlib
@@ -18,7 +18,7 @@
 from ctypes import cdll
 from threading import Thread
 from time import sleep, time
-from typing import Any, Callable, Dict, List
+from typing import Any, Dict
 
 import torch
 from torch.utils import cpp_extension
@@ -147,181 +147,79 @@ def is_gcc():
     return re.search(r"(gcc|g\+\+)", cpp_compiler())
 
 
-class VecISA(object):
-    _bit_width: int
-    _macro: str
-    _arch_flags: str
-    _dtype_nelements: Dict[torch.dtype, int]
-
-    # TorchInductor CPU vectorization reuses PyTorch vectorization utility functions
-    # Hence, TorchInductor would depend on Sleef* to accelerate mathematical functions
-    # like exp, pow, sin, cos and etc.
-    # But PyTorch and TorchInductor might use different compilers to build code. If
-    # PyTorch uses gcc-7/g++-7 to build the release package, the libtorch_cpu.so
-    # will not expose the Sleef* AVX512 symbols since gcc-7/g++-7 cannot pass
-    # avx512 check in CMake - FindAVX.cmake. But TorchInductor install the latest
-    # gcc/g++ compiler by default while it could support the AVX512 compilation.
-    # Therefore, there would be a conflict sleef version between PyTorch and
-    # TorchInductor. Hence, we dry-compile the following code to check whether current
-    # HW platform and PyTorch both could support AVX512 or AVX2. And suppose ARM
-    # also needs the logic
-    _avx_code = """
-#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
-#include <ATen/cpu/vec/functional.h>
-#include <ATen/cpu/vec/vec.h>
-#endif
-
-__attribute__((aligned(64))) float in_out_ptr0[16] = {0.0};
-
-extern "C" void __avx_chk_kernel() {
-    auto tmp0 = at::vec::Vectorized<float>(1);
-    auto tmp1 = tmp0.exp();
-    tmp1.store(in_out_ptr0);
-}
-"""
-
-    _avx_py_load = """
-import torch
-from ctypes import cdll
-cdll.LoadLibrary("__lib_path__")
-"""
-
-    def bit_width(self):
-        return self._bit_width
-
-    def nelements(self, dtype: torch.dtype = torch.float):
-        return self._dtype_nelements[dtype]
-
-    def build_macro(self):
-        return self._macro
-
-    def build_arch_flags(self):
-        return self._arch_flags
-
-    def __hash__(self) -> int:
-        return hash(str(self))
-
-    @functools.lru_cache(None)
-    def __bool__(self):
-        key, input_path = write(VecISA._avx_code, "cpp", extra="")
-        from filelock import FileLock
-
-        lock_dir = get_lock_dir()
-        lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
-        with lock:
-            output_path = input_path[:-3] + "so"
-            build_cmd = cpp_compile_command(
-                input_path, output_path, warning_all=False, vec_isa=self
-            ).split(" ")
-            try:
-                # Check build result
-                subprocess.check_output(build_cmd, stderr=subprocess.STDOUT)
-                subprocess.check_call(
-                    [
-                        "python",
-                        "-c",
-                        VecISA._avx_py_load.replace("__lib_path__", output_path),
-                    ],
-                    stderr=subprocess.DEVNULL,
-                )
-            except Exception as e:
-                return False
-
-            return True
-
-
-@dataclasses.dataclass
-class VecAVX512(VecISA):
-    _bit_width = 512
-    _macro = "CPU_CAPABILITY_AVX512"
-    _arch_flags = "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma"
-    _dtype_nelements = {torch.float: 16, torch.bfloat16: 32}
-
-    def __str__(self) -> str:
-        return "avx512"
-
-    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
-
-
-@dataclasses.dataclass
-class VecAVX2(VecISA):
-    _bit_width = 256
-    _macro = "CPU_CAPABILITY_AVX2"
-    _arch_flags = "-mavx2 -mfma"
-    _dtype_nelements = {torch.float: 8, torch.bfloat16: 16}
-
-    def __str__(self) -> str:
-        return "avx2"
-
-    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
-
-
-class InvalidVecISA(VecISA):
-    _bit_width = 0
-    _macro = ""
-    _arch_flags = ""
-    _dtype_nelements = {}
-
-    def __str__(self) -> str:
-        return "INVALID_VEC_ISA"
+class _SupportedVecIsa(enum.Enum):
+    AVX512 = 1
+    AVX2 = 2
+    INVALID = -1
 
     def __bool__(self):
-        return False
-
-    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+        return self != _SupportedVecIsa.INVALID
 
+    @staticmethod
+    def isa_str(supported_isa: enum.Enum):
+        if supported_isa == _SupportedVecIsa.AVX512:
+            return "avx512"
+        elif supported_isa == _SupportedVecIsa.AVX2:
+            return "avx2"
+        else:
+            return ""
 
-invalid_vec_isa = InvalidVecISA()
-supported_vec_isa_list = [VecAVX512(), VecAVX2()]
+    @staticmethod
+    def vec_macro(supported_isa: enum.Enum):
+        if supported_isa == _SupportedVecIsa.AVX512:
+            return "CPU_CAPABILITY_AVX512"
+        elif supported_isa == _SupportedVecIsa.AVX2:
+            return "CPU_CAPABILITY_AVX2"
+        else:
+            return ""
 
 
 # Cache the cpuinfo to avoid I/O overhead. Meanwhile, the cpuinfo content
 # might have too much redundant content that is useless for ISA check. Hence,
 # we only cache some key isa information.
-@functools.lru_cache(None)
-def valid_vec_isa_list():
+@functools.lru_cache(1)
+def get_cpu_proc_info():
     if sys.platform != "linux":
         return []
 
-    isa_list = []
+    isa_info = []
     with open("/proc/cpuinfo") as _cpu_info:
         _cpu_info_content = _cpu_info.read()
-        for isa in supported_vec_isa_list:
-            if str(isa) in _cpu_info_content and isa:
-                isa_list.append(isa)
-        return isa_list
+        if _SupportedVecIsa.isa_str(_SupportedVecIsa.AVX512) in _cpu_info_content:
+            isa_info.append(_SupportedVecIsa.AVX512)
+
+        if _SupportedVecIsa.isa_str(_SupportedVecIsa.AVX2) in _cpu_info_content:
+            isa_info.append(_SupportedVecIsa.AVX2)
 
+        return isa_info
 
-def pick_vec_isa():
-    _valid_vec_isa_list: List[VecISA] = valid_vec_isa_list()
-    if not _valid_vec_isa_list:
-        return invalid_vec_isa
 
-    # If the simdlen is None, it indicates determin the vectroization length automatically
-    if config.cpp.simdlen is None:
-        assert _valid_vec_isa_list
-        return _valid_vec_isa_list[0]
+def supported_vector_isa():
+    # TODO: Add ARM Vec here.
+    # Dict(k: isa, v: number of float element)
+    vec_isa_info = {
+        _SupportedVecIsa.AVX512: 16,
+        _SupportedVecIsa.AVX2: 8,
+    }
 
-    for isa in _valid_vec_isa_list:
-        if config.cpp.simdlen == isa.bit_width():
+    if config.cpp.simdlen is None or config.cpp.simdlen <= 1:
+        return _SupportedVecIsa.INVALID
+
+    cpu_info_content = get_cpu_proc_info()
+    for isa in vec_isa_info.keys():
+        if isa in cpu_info_content and config.cpp.simdlen == vec_isa_info[isa]:
             return isa
 
-    return invalid_vec_isa
+    return _SupportedVecIsa.INVALID
 
 
-def cpp_compile_command(
-    input,
-    output,
-    warning_all=True,
-    shared=True,
-    include_pytorch=False,
-    vec_isa: VecISA = invalid_vec_isa,
-):
-    if include_pytorch or vec_isa != invalid_vec_isa:
+def cpp_compile_command(input, output, include_pytorch=False):
+    valid_isa = supported_vector_isa()
+    if include_pytorch or valid_isa:
         ipaths = cpp_extension.include_paths() + [sysconfig.get_path("include")]
         lpaths = cpp_extension.library_paths() + [sysconfig.get_config_var("LIBDIR")]
         libs = ["c10", "torch", "torch_cpu", "torch_python", "gomp"]
-        macros = vec_isa.build_macro()
+        macros = _SupportedVecIsa.vec_macro(valid_isa)
         if macros:
             macros = f"-D{macros}"
     else:
@@ -337,13 +235,11 @@ def cpp_compile_command(
     lpaths = " ".join(["-L" + p for p in lpaths])
     libs = " ".join(["-l" + p for p in libs])
 
-    shared_lib = "-shared -fPIC" if shared else ""
-    warning_all_flag = "-Wall" if warning_all else ""
     return re.sub(
         r"[ \n]+",
         " ",
         f"""
-            {cpp_compiler()} {input} {shared_lib} {warning_all_flag} -std=c++14 -Wno-unused-variable
+            {cpp_compiler()} {input} -shared -fPIC -Wall -std=c++14 -Wno-unused-variable
             {ipaths} {lpaths} {libs} {macros}
             -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp
             -D C10_USING_CUSTOM_GENERATED_MACROS
@@ -370,12 +266,7 @@ def _load_library(path):
 
     @classmethod
     def load(cls, source_code):
-        picked_vec_isa = pick_vec_isa()
-        key, input_path = write(
-            source_code,
-            "cpp",
-            extra=cpp_compile_command("i", "o", vec_isa=picked_vec_isa),
-        )
+        key, input_path = write(source_code, "cpp", extra=cpp_compile_command("i", "o"))
         if key not in cls.cache:
             from filelock import FileLock
 
@@ -385,7 +276,7 @@ def load(cls, source_code):
                 output_path = input_path[:-3] + "so"
                 if not os.path.exists(output_path):
                     cmd = cpp_compile_command(
-                        input=input_path, output=output_path, vec_isa=picked_vec_isa
+                        input=input_path, output=output_path
                     ).split(" ")
                     try:
                         subprocess.check_output(cmd, stderr=subprocess.STDOUT)
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index cf98833964ca5..2803970295ccc 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -417,12 +417,6 @@ def __init__(self, name):
     def __str__(self):
         return self.name
 
-    def __hash__(self) -> int:
-        return hash(self.name)
-
-    def __eq__(self, other) -> bool:
-        return type(other) == type(self) and other.name == self.name
-
     def update_on_args(self, args, kwargs):
         pass
 
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 38ef2179d5b71..65a9335d6cbfc 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -616,7 +616,7 @@ def codegen_loops(self, code, worksharing):
         )
         reductions.mark_reduction(self.reduction_vars)
 
-        if codecache.pick_vec_isa():
+        if config.cpp.simdlen:
             # TODO(jansel): detect stride-1 dimension and vectorize that
             if reductions:
                 reductions.loops[-1].simd = True
@@ -707,8 +707,7 @@ class CppVecKernel(CppKernel):
 
     def __init__(self, args, num_threads):
         super(CppVecKernel, self).__init__(args, num_threads)
-        assert codecache.pick_vec_isa()
-        self.simd_nelements = codecache.pick_vec_isa().nelements()
+        self.simd_len = config.cpp.simdlen
         self.reduction_omp_dec: Dict[str, str] = {}
         metrics.generated_cpp_vec_kernel_count += 1
 
@@ -724,10 +723,10 @@ def is_var_irrevelant(self, var: sympy.Symbol, index: sympy.Expr):
 
     def transform_index(self, index: sympy.Expr):
         expanded_index = sympy.expand(index)
-        assert self.simd_nelements
-        assert self.simd_nelements >= 1
+        assert self.simd_len
+        assert self.simd_len > 0
         most_inner_var = self.itervars[-1]
-        replacement = {most_inner_var: most_inner_var * self.simd_nelements}
+        replacement = {most_inner_var: most_inner_var * self.simd_len}
         new_index = sympy_subs(expanded_index, replacement)
         return new_index
 
@@ -948,24 +947,21 @@ def __init__(self, args=None, num_threads=None):
         super(CppKernelProxy, self).__init__(args, num_threads)
         self.simd_vec_kernel: CppVecKernel = None
         self.simd_omp_kernel: CppKernel = None
-        self.picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa()
 
-    def vectorize_most_inner_loop(self, loop_nest, dtype=torch.float):
-        assert self.picked_vec_isa
-        nelements = self.picked_vec_isa.nelements(dtype)
-        loop_nest.split_most_inner_loop(nelements)
+    def vectorize_most_inner_loop(self, loop_nest):
+        loop_nest.split_most_inner_loop(config.cpp.simdlen)
         loop_with_tail = loop_nest.loops[-1]
         assert isinstance(loop_with_tail, LoopLevelWithTail)
 
         loop_with_tail.main_loop.simd_vec = True
 
         loop_with_tail.tail_loop.simd_omp = True
-        # We chope the loop into two cubes by the nelements - main loop and tail loop.
+        # We chope the loop into two cubes by the config.cpp.simdlen - main loop and tail loop.
         # Regarding the main loop, it is straightforward that it could be vectorized with
-        # nelements. But for the tail loop, it still could be vectorized. For example,
-        # if the nelements is 8(256bits), then the tail loop still could be vectorized
+        # config.cpp.simdlen. But for the tail loop, it still could be vectorized. For example,
+        # if the config.cpp.simdlen is 8(256bits), then the tail loop still could be vectorized
         # as 4(128bits).
-        loop_with_tail.tail_loop.simd_nelements = int(nelements / 2)
+        loop_with_tail.tail_loop.simd_len = int(config.cpp.simdlen / 2)
         loop_with_tail.tail_loop.simd_vec = False
 
         loop_with_tail.main_loop_body = self.simd_vec_kernel
@@ -975,7 +971,7 @@ def vectorize_most_inner_loop(self, loop_nest, dtype=torch.float):
     def codegen_loops(self, code, worksharing):
         threads = parallel_num_threads()
 
-        if self.simd_vec_kernel is None or not self.picked_vec_isa:
+        if self.simd_vec_kernel is None:
             assert self.simd_omp_kernel
             return self.simd_omp_kernel.codegen_loops(code, worksharing)
 
@@ -997,52 +993,12 @@ def codegen_loops(self, code, worksharing):
         ), LoopNest(loops[reduction_depth:])
         loops_nest_reduce.mark_reduction(self.simd_vec_kernel.reduction_vars)
 
-        assert self.picked_vec_isa
-        # Do not apply vectorization since the range of most inner is too small. Meanwhile,
-        # If the range of the most inner is less then the codecache.pick_vec_isa().nelements(),
-        # the generated code for some reduction will be as follows that leads to incrrect result.
-        #
-        #    LINE01:  float tmp1 = 0;
-        #    LINE02:  auto tmp1_vec = at::vec::Vectorized<float>(tmp1);
-        #    LINE03:  for(long i1=0; i1<2; i1+=1)
-        #    LINE04:  {
-        #    LINE05:      for(long i2=0; i2<0; i2+=1)
-        #    LINE06:      {
-        #    LINE07:          auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (8*i0) + (16*i2) + (32*i1));
-        #    LINE08:          tmp1_vec += tmp0;
-        #    LINE09:      }
-        #    LINE10:      tmp1 = vec_reduce_all<float>([](Vectorized<float>& x, Vectorized<float>&y) {return x + y;}, tmp1_vec);
-        #    LINE11:      #pragma omp simd simdlen(8)  reduction(+:tmp1)
-        #    LINE12:      for(long i2=0; i2<8; i2+=1)
-        #    LINE13:      {
-        #    LINE14:          auto tmp0 = in_ptr0[i2 + (8*i0) + (32*i1)];
-        #    LINE15:          tmp1 += tmp0;
-        #    LINE16:      }
-        #    LINE17:  }
-        #    LINE18:  out_ptr3[i0] = tmp1;
-        #
-        # tmp1_vec(LINE02) will always be zero as it is initialized with tmp1 value and the range(LINE05)
-        # is 0. Hence, the LINE10 will always reset tmp1 to 0. But tmp1(LINE01) is global value. So the result
-        # will be incorrect. We skip thie case.
-        most_inner_loop = (
-            loops_nest_reduce.loops[-1]
-            if loops_nest_reduce
-            else loops_nest_non_reduce.loops[-1]
-        )
-        main_loop_range = ir.IndexingDiv(
-            most_inner_loop.size, self.picked_vec_isa.nelements()
-        )
-        loop_interval = sympy.simplify(main_loop_range)
-        # TODO(Eikan): To support dynamic shape.
-        if not loop_interval.is_integer or loop_interval <= 0:
-            metrics.generated_cpp_vec_kernel_count -= 1
-            return self.simd_omp_kernel.codegen_loops(code, worksharing)
-
-        # TODO(jansel): detect stride-1 dimension and vectorize that
-        if loops_nest_reduce:
-            loops_nest_reduce.loops[-1].simd = True
-        elif loops_nest_non_reduce:
-            loops_nest_non_reduce.loops[-1].simd = True
+        if config.cpp.simdlen:
+            # TODO(jansel): detect stride-1 dimension and vectorize that
+            if loops_nest_reduce:
+                loops_nest_reduce.loops[-1].simd = True
+            elif loops_nest_non_reduce:
+                loops_nest_non_reduce.loops[-1].simd = True
 
         par_depth = 0
         reduction_par_depth = 0
@@ -1182,7 +1138,8 @@ def can_fuse_vertical(cls, node1, node2):
         return cls.can_fuse_horizontal(node1, node2) and not node1.is_reduction()
 
     def can_vec(self, nodes):
-        if not codecache.pick_vec_isa():
+        # TODO: Query cpu arch and vec length from aten
+        if not codecache.supported_vector_isa():
             return False
 
         _, (group, reduction_group) = max(
@@ -1392,8 +1349,7 @@ class LoopLevel:
     steps: sympy.Expr = sympy.Integer(1)
     parallel: int = 0
     simd_omp: bool = False
-    picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa()
-    simd_nelements: int = picked_vec_isa.nelements() if picked_vec_isa else 0
+    simd_len: int = config.cpp.simdlen
     simd_vec: bool = False
     collapsed: bool = False
     reduction_vars: Dict[str, str] = None
@@ -1407,11 +1363,7 @@ def lines(self):
             )
         else:
             reduction = ""
-        simd = (
-            f"simd simdlen({self.simd_nelements}) "
-            if self.simd_omp and self.simd_nelements > 1
-            else ""
-        )
+        simd = f"simd simdlen({self.simd_len}) " if self.simd_omp else ""
         if self.parallel:
             # TODO(jansel): look into chunk size and other schedules
             line1 = f"#pragma omp for{reduction} "

From 81930f607a85d1cc06716700cecf8df12a83e845 Mon Sep 17 00:00:00 2001
From: Aaron Gokaslan <aaronGokaslan@gmail.com>
Date: Thu, 17 Nov 2022 05:01:08 +0000
Subject: [PATCH 1017/1922] Fix: prefer .is_none() over .is(py::none()) for
 pybind11 in caffe2 (#88199)

Follow up to #88051 . I noticed that I missed a few spots in the caffe2 folder. Prefer `.is_none()` over `.is(py::none())` as `.is_none()` is more efficient since it avoid reference counting increments and decrements.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88199
Approved by: https://github.com/albanD, https://github.com/kit1980
---
 caffe2/python/pybind_state.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
index 3103006774df5..5b2c2f71a827a 100644
--- a/caffe2/python/pybind_state.cc
+++ b/caffe2/python/pybind_state.cc
@@ -209,7 +209,7 @@ bool feedBlob(
     const py::object& arg,
     const py::object device_option) {
   DeviceOption option;
-  if (!device_option.is(py::none())) {
+  if (!device_option.is_none()) {
     // If we have a device option passed in, read it.
     CAFFE_ENFORCE(ParseProtoFromLargeString(
         py::bytes(device_option).cast<std::string>(), &option));
@@ -752,7 +752,7 @@ void addObjectMethods(py::module& m) {
       .def(
           "reset",
           [](caffe2::onnx::DummyName& instance, const py::object& args) {
-            if (args.is(py::none())) {
+            if (args.is_none()) {
               instance.Reset(std::unordered_set<std::string>());
             } else {
               instance.Reset(args.cast<std::unordered_set<std::string>>());
@@ -1130,7 +1130,7 @@ void addGlobalMethods(py::module& m) {
   m.def(
       "switch_workspace",
       [](const std::string& name, const py::object create_if_missing) {
-        if (create_if_missing.is(py::none())) {
+        if (create_if_missing.is_none()) {
           return caffe2::python::SwitchWorkspaceInternal(name, false);
         }
         return caffe2::python::SwitchWorkspaceInternal(
@@ -1143,7 +1143,7 @@ void addGlobalMethods(py::module& m) {
       "reset_workspace",
       [](const py::object& root_folder) {
         VLOG(1) << "Resetting workspace.";
-        if (root_folder.is(py::none())) {
+        if (root_folder.is_none()) {
           caffe2::python::ResetWorkspace(new Workspace());
         } else {
           caffe2::python::ResetWorkspace(
@@ -1634,7 +1634,7 @@ void addGlobalMethods(py::module& m) {
       "register_python_op",
       [](py::object func, bool pass_workspace, std::string name) {
         using namespace python_detail;
-        CAFFE_ENFORCE(!func.is(py::none()));
+        CAFFE_ENFORCE(!func.is_none());
         if (!name.empty()) {
           name += ":";
         }
@@ -1650,7 +1650,7 @@ void addGlobalMethods(py::module& m) {
       "register_python_gradient_op",
       [](const std::string& token, py::object func) {
         using namespace python_detail;
-        CAFFE_ENFORCE(!func.is(py::none()));
+        CAFFE_ENFORCE(!func.is_none());
         CAFFE_ENFORCE(gRegistry().find(token) != gRegistry().end());
         // For global sanity gradient ops shouldn't access workspace
         gRegistry()[token + "_gradient"] = Func{func, false};

From fb1f77f2d544cba60dc251c166bac3cab6f4561e Mon Sep 17 00:00:00 2001
From: Rachel030219 <13704467+Rachel030219@users.noreply.github.com>
Date: Thu, 17 Nov 2022 05:55:25 +0000
Subject: [PATCH 1018/1922] Use software approach to catch overflow (
 `c10/utils/safe_numerics.h` ) on ARM devices (#89042)

Fixes #89040

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89042
Approved by: https://github.com/malfet
---
 c10/util/safe_numerics.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/c10/util/safe_numerics.h b/c10/util/safe_numerics.h
index 7eb9ed39395d8..e5c249dd1d2b7 100644
--- a/c10/util/safe_numerics.h
+++ b/c10/util/safe_numerics.h
@@ -22,7 +22,13 @@ C10_ALWAYS_INLINE bool add_overflows(uint64_t a, uint64_t b, uint64_t* out) {
   return __builtin_add_overflow(a, b, out);
 #else
   unsigned long long tmp;
+#if defined(_M_IX86) || defined(_M_X64)
   auto carry = _addcarry_u64(0, a, b, &tmp);
+#else
+  tmp = a + b;
+  unsigned long long vector = (a & b) ^ ((a ^ b) & ~tmp);
+  auto carry = vector >> 63;
+#endif
   *out = tmp;
   return carry;
 #endif

From 7f7618a1f93b50a56ad34b84b17a9dbe9b36c15b Mon Sep 17 00:00:00 2001
From: "Andrew M. James" <andrew.m.james2@gmail.com>
Date: Thu, 17 Nov 2022 02:01:13 +0000
Subject: [PATCH 1019/1922] Round out rad2deg sparse support (#88442)

- Add sparse coo dispatch
- Modify backward to work with sparse compressed layouts
- Enable sparse_compressed autograd testing
- Correct layout support attributes on OpInfo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88442
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml            | 3 +++
 aten/src/ATen/native/sparse/SparseUnaryOps.cpp        | 3 +++
 test/test_sparse_csr.py                               | 3 ++-
 torch/csrc/autograd/FunctionsManual.cpp               | 2 +-
 torch/testing/_internal/common_methods_invocations.py | 7 ++++++-
 5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 726a54b5e225f..8046b4f6ac4b4 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4020,17 +4020,20 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: rad2deg
+    SparseCPU, SparseCUDA: rad2deg_sparse
     SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr
 
 - func: rad2deg_(Tensor(a!) self) -> Tensor(a!)
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: rad2deg_
+    SparseCPU, SparseCUDA: rad2deg_sparse_
     SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_
 
 - func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CompositeExplicitAutograd: rad2deg_out
+    SparseCPU, SparseCUDA: rad2deg_sparse_out
     SparseCsrCPU, SparseCsrCUDA: rad2deg_sparse_csr_out
 
 - func: deg2rad(Tensor self) -> Tensor
diff --git a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
index 084daed4df4e9..9e0503337b5de 100644
--- a/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
+++ b/aten/src/ATen/native/sparse/SparseUnaryOps.cpp
@@ -43,6 +43,8 @@
 #include <ATen/ops/log1p_native.h>
 #include <ATen/ops/nan_to_num.h>
 #include <ATen/ops/nan_to_num_native.h>
+#include <ATen/ops/rad2deg.h>
+#include <ATen/ops/rad2deg_native.h>
 #include <ATen/ops/relu.h>
 #include <ATen/ops/relu_native.h>
 #include <ATen/ops/round.h>
@@ -177,6 +179,7 @@ COALESCED_UNARY_UFUNC(floor);
 COALESCED_UNARY_UFUNC(frac);
 COALESCED_UNARY_UFUNC(log1p);
 COALESCED_UNARY_UFUNC(round);
+COALESCED_UNARY_UFUNC(rad2deg);
 COALESCED_UNARY_UFUNC(sign);
 COALESCED_UNARY_UFUNC(sgn);
 COALESCED_UNARY_UFUNC(sin);
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index e83616489fc24..7ec2d4a79bf9f 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -66,7 +66,8 @@ def _check_cusparse_sddmm_available():
     'positive',
     'frac',
     'nn.functional.relu',
-    'log1p'
+    'log1p',
+    'rad2deg'
 ]
 
 # This should be just an import from test_linalg instead of code duplication
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index c0fbf5f6c0aa6..05fcdea3e6b7c 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -571,7 +571,7 @@ Tensor permute_backwards(const Tensor& grad, IntArrayRef fwd_dims) {
 Tensor rad2deg_backward(const Tensor& grad) {
   constexpr double M_180_PI =
       57.295779513082320876798154814105170332405472466564;
-  return at::mul(grad, at::native::wrapped_scalar_tensor(Scalar(M_180_PI)));
+  return at::mul(grad, Scalar(M_180_PI));
 }
 
 Tensor deg2rad_backward(const Tensor& grad) {
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index af4539ee5fecc..5db917424a2f2 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -12804,7 +12804,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                                     dtypes=[torch.bfloat16]),
                    ),
                    supports_forward_ad=True,
-                   supports_fwgrad_bwgrad=True),
+                   supports_fwgrad_bwgrad=True,
+                   supports_sparse=True,
+                   supports_sparse_csr=True,
+                   supports_sparse_csc=True,
+                   supports_sparse_bsr=True,
+                   supports_sparse_bsc=True),
     UnaryUfuncInfo('real',
                    ref=np.real,
                    dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.half, torch.chalf),

From 2fb6816682786baf387bfdff0c05e4b11bb59693 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Thu, 17 Nov 2022 06:14:21 +0000
Subject: [PATCH 1020/1922] [dynamo][benchmarks] HF - Fix seq len and batch
 sizes (#89165)

Fixes many models in https://github.com/pytorch/torchdynamo/issues/1842
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89165
Approved by: https://github.com/ngimel
---
 benchmarks/dynamo/common.py                   |  4 +-
 benchmarks/dynamo/huggingface.py              | 91 ++++++++++++++-----
 benchmarks/dynamo/huggingface_models_list.txt | 66 +++++++-------
 torch/_dynamo/testing.py                      |  8 +-
 4 files changed, 105 insertions(+), 64 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index a6e66c4281b60..789ebc3683d32 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -144,6 +144,8 @@
     "MT5ForConditionalGeneration",  # OOM
     "PegasusForConditionalGeneration",  # OOM
     "XGLMForCausalLM",  # fp64_OOM
+    "DebertaV2ForMaskedLM",  # OOM
+    "DebertaV2ForQuestionAnswering",  # OOM
     # OOM
     "BigBird",
     "TrOCRForCausalLM",
@@ -1038,7 +1040,7 @@ def decay_batch_exp(self, batch_size, factor=0.5, divisor=2):
             out_batch_size = batch_size - 1
         return max(0, int(out_batch_size))
 
-    def batch_size_finder(self, device, model_name, initial_batch_size=128):
+    def batch_size_finder(self, device, model_name, initial_batch_size=1024):
         batch_size = initial_batch_size
         while batch_size >= 1:
             torch.cuda.empty_cache()
diff --git a/benchmarks/dynamo/huggingface.py b/benchmarks/dynamo/huggingface.py
index c7ecd5f222ec5..489fcd69df944 100755
--- a/benchmarks/dynamo/huggingface.py
+++ b/benchmarks/dynamo/huggingface.py
@@ -89,18 +89,13 @@ def pip_install(package):
 
 
 SKIP = {
-    # Difficult to run and compare
-    "Reformer",
     # Fails deepcopy
-    "BlenderbotForCausalLM",
     "BlenderbotForConditionalGeneration",
-    "GPTJForCausalLM",
-    "GPTJForQuestionAnswering",
     "GPTNeoForCausalLM",
     "GPTNeoForSequenceClassification",
     # Fails with even batch size = 1
-    "DebertaV2ForMaskedLM",
-    "DebertaV2ForQuestionAnswering",
+    "GPTJForCausalLM",
+    "GPTJForQuestionAnswering",
 }
 
 # TODO - Fails even after fake tensors
@@ -108,23 +103,54 @@ def pip_install(package):
     "AlbertForMaskedLM": 2,
     "AlbertForQuestionAnswering": 2,
     "AllenaiLongformerBase": 2,
+    "BartForCausalLM": 2,
     "BartForConditionalGeneration": 2,
     "BertForMaskedLM": 2,
-    "BlenderbotSmallForCausalLM": 2,
+    "BertForQuestionAnswering": 2,
+    "BlenderbotForCausalLM": 8,
+    # "BlenderbotForConditionalGeneration" : 16,
+    "BlenderbotSmallForCausalLM": 4,
     "BlenderbotSmallForConditionalGeneration": 2,
+    "CamemBert": 2,
+    "DebertaForMaskedLM": 8,
+    "DebertaForQuestionAnswering": 4,
+    "DebertaV2ForMaskedLM": 8,
+    "DebertaV2ForQuestionAnswering": 4,
+    "DistilBertForMaskedLM": 2,
+    "DistilBertForQuestionAnswering": 2,
+    "DistillGPT2": 2,
     "ElectraForCausalLM": 2,
     "ElectraForQuestionAnswering": 2,
     "GPT2ForSequenceClassification": 2,
+    # "GPTJForCausalLM" : 2,
+    # "GPTJForQuestionAnswering" : 2,
+    # "GPTNeoForCausalLM" : 2,
+    # "GPTNeoForSequenceClassification" : 2,
+    "GoogleFnet": 2,
     "LayoutLMForMaskedLM": 2,
     "LayoutLMForSequenceClassification": 2,
+    "M2M100ForConditionalGeneration": 4,
+    "MBartForCausalLM": 2,
+    "MBartForConditionalGeneration": 2,
+    "MT5ForConditionalGeneration": 2,
+    "MegatronBertForCausalLM": 4,
+    "MegatronBertForQuestionAnswering": 2,
+    "MobileBertForMaskedLM": 4,
+    "MobileBertForQuestionAnswering": 2,
+    "OPTForCausalLM": 2,
+    "PLBartForCausalLM": 2,
+    "PLBartForConditionalGeneration": 2,
+    "PegasusForCausalLM": 4,
+    "PegasusForConditionalGeneration": 2,
     "RobertaForCausalLM": 2,
+    "RobertaForQuestionAnswering": 2,
+    "Speech2Text2ForCausalLM": 4,
     "T5ForConditionalGeneration": 2,
-    # Large footprint
-    "BartForCausalLM": 4,
-    "DebertaForQuestionAnswering": 4,
-    "XLNetLMHeadModel": 4,
-    # Very large footprint
-    "DebertaForMaskedLM": 8,
+    "T5Small": 2,
+    "TrOCRForCausalLM": 2,
+    "XGLMForCausalLM": 4,
+    "XLNetLMHeadModel": 2,
+    "YituTechConvBert": 2,
 }
 
 
@@ -139,18 +165,33 @@ def get_module_cls_by_model_name(model_cls_name):
 
 
 def get_sequence_length(model_cls, model_name):
-    if model_name.startswith(("Bert", "Roberta", "Blenderbot")):
+    if model_name.startswith(("Blenderbot",)):
         seq_length = 128
-    elif model_name.startswith(("GPT2", "Bart", "T5")):
+    elif model_name.startswith(("GPT2", "Bart", "T5", "PLBart", "MBart")):
         seq_length = 1024
     elif model_name in ("AllenaiLongformerBase", "BigBird"):
         seq_length = 1024
+    elif model_name.startswith("OPT"):
+        seq_length = 2048
     elif "Reformer" in model_name:
         seq_length = 4096
     elif model_name.startswith(
-        ("Albert", "Deberta", "Layout", "Electra", "XLNet")
+        (
+            "Albert",
+            "Deberta",
+            "Layout",
+            "Electra",
+            "XLNet",
+            "MegatronBert",
+            "Bert",
+            "Roberta",
+        )
     ) or model_name in ("DistillGPT2", "GoogleFnet", "YituTechConvBert", "CamemBert"):
         seq_length = 512
+    elif model_name in ("TrOCRForCausalLM"):
+        seq_length = 256
+    elif model_name.startswith("MobileBert"):
+        seq_length = 128
     else:
         log.warning(
             f"Sequence Length not defined for {model_name}. Choosing 128 arbitrarily"
@@ -287,10 +328,10 @@ def rand_int_tensor(device, low, high, shape):
         AutoConfig.from_pretrained("t5-small"),
         AutoModelForSeq2SeqLM,
     ),
-    "BigBird": (
-        BigBirdConfig(attention_type="block_sparse"),
-        AutoModelForMaskedLM,
-    ),
+    # "BigBird": (
+    #     BigBirdConfig(attention_type="block_sparse"),
+    #     AutoModelForMaskedLM,
+    # ),
     "DistillGPT2": (
         AutoConfig.from_pretrained("distilgpt2"),
         AutoModelForCausalLM,
@@ -461,10 +502,10 @@ def refresh_model_names_and_batch_sizes():
         if model_cls in [
             CLIPModel,
             CLIPVisionModel,
-            SwinForImageClassification,
-            SwinForImageClassification,
-            SwinForMaskedImageModeling,
-            SwinModel,
+            # SwinForImageClassification,
+            # SwinForImageClassification,
+            # SwinForMaskedImageModeling,
+            # SwinModel,
             ViTForImageClassification,
             ViTForMaskedImageModeling,
             ViTModel,
diff --git a/benchmarks/dynamo/huggingface_models_list.txt b/benchmarks/dynamo/huggingface_models_list.txt
index 8272c79b12bda..6e3cf19a783d7 100644
--- a/benchmarks/dynamo/huggingface_models_list.txt
+++ b/benchmarks/dynamo/huggingface_models_list.txt
@@ -1,53 +1,51 @@
 AlbertForMaskedLM,8
 AlbertForQuestionAnswering,8
-AllenaiLongformerBase,1
-BartForCausalLM,16
+AllenaiLongformerBase,8
+BartForCausalLM,8
 BartForConditionalGeneration,4
-BertForMaskedLM,128
-BertForQuestionAnswering,128
-BigBird,1
+BertForMaskedLM,32
+BertForQuestionAnswering,32
 BlenderbotForCausalLM,32
-BlenderbotForConditionalGeneration,32
-BlenderbotSmallForCausalLM,128
+BlenderbotForConditionalGeneration,16
+BlenderbotSmallForCausalLM,256
 BlenderbotSmallForConditionalGeneration,128
-CamemBert,1
+CamemBert,32
 DebertaForMaskedLM,32
 DebertaForQuestionAnswering,32
 DebertaV2ForMaskedLM,8
 DebertaV2ForQuestionAnswering,8
-DistilBertForMaskedLM,64
-DistilBertForQuestionAnswering,64
-DistillGPT2,1
+DistilBertForMaskedLM,256
+DistilBertForQuestionAnswering,512
+DistillGPT2,32
 ElectraForCausalLM,64
 ElectraForQuestionAnswering,128
 GPT2ForSequenceClassification,8
 GPTJForCausalLM,1
 GPTJForQuestionAnswering,1
-GPTNeoForCausalLM,8
-GPTNeoForSequenceClassification,8
-GoogleFnet,1
+GPTNeoForCausalLM,32
+GPTNeoForSequenceClassification,32
+GoogleFnet,32
 LayoutLMForMaskedLM,32
 LayoutLMForSequenceClassification,32
-M2M100ForConditionalGeneration,8
-MBartForCausalLM,32
-MBartForConditionalGeneration,16
-MT5ForConditionalGeneration,8
+M2M100ForConditionalGeneration,64
+MBartForCausalLM,8
+MBartForConditionalGeneration,4
+MT5ForConditionalGeneration,32
 MegatronBertForCausalLM,16
 MegatronBertForQuestionAnswering,16
-MobileBertForMaskedLM,32
-MobileBertForQuestionAnswering,64
-OPTForCausalLM,32
-PLBartForCausalLM,32
-PLBartForConditionalGeneration,16
-PegasusForCausalLM,32
-PegasusForConditionalGeneration,16
-Reformer,1
-RobertaForCausalLM,128
-RobertaForQuestionAnswering,128
-Speech2Text2ForCausalLM,128
+MobileBertForMaskedLM,256
+MobileBertForQuestionAnswering,256
+OPTForCausalLM,4
+PLBartForCausalLM,16
+PLBartForConditionalGeneration,8
+PegasusForCausalLM,128
+PegasusForConditionalGeneration,64
+RobertaForCausalLM,32
+RobertaForQuestionAnswering,32
+Speech2Text2ForCausalLM,1024
 T5ForConditionalGeneration,8
-T5Small,1
-TrOCRForCausalLM,32
-XGLMForCausalLM,8
-XLNetLMHeadModel,128
-YituTechConvBert,1
+T5Small,8
+TrOCRForCausalLM,64
+XGLMForCausalLM,32
+XLNetLMHeadModel,16
+YituTechConvBert,32
diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py
index 6e0d32d21f978..eea4c26a171ca 100644
--- a/torch/_dynamo/testing.py
+++ b/torch/_dynamo/testing.py
@@ -48,10 +48,10 @@ def collect_results(model, prediction, loss, example_inputs):
     results = []
     results.append(prediction)
     results.append(loss)
-    if isinstance(loss, torch.Tensor) and loss.item() > 1:
-        log.warning(
-            f"High loss value alert - {loss:.2f}. Can result in unstable gradients."
-        )
+    # if isinstance(loss, torch.Tensor) and loss.item() > 1:
+    #     log.warning(
+    #         f"High loss value alert - {loss:.2f}. Can result in unstable gradients."
+    #     )
 
     grads = dict()
     params = dict()

From db7710b36e47bdf1b2e9c2b2b571ed9fe492184f Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Thu, 17 Nov 2022 03:27:18 +0000
Subject: [PATCH 1021/1922] [ONNX] Add onnx-script into ONNX docs (#89078)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89078
Approved by: https://github.com/BowenBao
---
 docs/source/onnx.rst                 | 70 +++++++++++++++++++++++++++-
 test/onnx/test_onnxscript_runtime.py |  2 -
 2 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst
index fea0b3bc94d29..8f52be124e2ea 100644
--- a/docs/source/onnx.rst
+++ b/docs/source/onnx.rst
@@ -499,6 +499,7 @@ ONNX operators that represent the function's behavior in ONNX. For example::
 
 Inline Autograd Function
 ~~~~~~~~~~~~~~~~~~~~~~~~
+
 In cases where a static symbolic method is not provided for its subsequent :class:`torch.autograd.Function` or
 where a function to register ``prim::PythonOp`` as custom symbolic functions is not provided,
 :func:`torch.onnx.export` tries to inline the graph that corresponds to that :class:`torch.autograd.Function` such that
@@ -526,6 +527,73 @@ If you need to avoid inlining of :class:`torch.autograd.Function`, you should ex
 Custom operators
 ^^^^^^^^^^^^^^^^
 
+You can export your model with custom operators that includes a combination of many standard ONNX ops,
+or are driven by self-defined C++ backend.
+
+ONNX-script functions
+~~~~~~~~~~~~~~~~~~~~~
+
+If an operator is not a standard ONNX op, but can be composed of multiple existing ONNX ops, you can utilize
+`ONNX-script <https://github.com/microsoft/onnx-script>`_ to create an external ONNX function to support the operator.
+You can export it by following this example::
+
+    import onnxscript
+    # There are three opset version needed to be aligned
+    # This is (1) the opset version in ONNX function
+    from onnxscript.onnx_opset import opset15 as op
+    opset_version = 15
+
+    x = torch.randn(1, 2, 3, 4, requires_grad=True)
+    model = torch.nn.SELU()
+
+    custom_opset = onnxscript.values.Opset(domain="onnx-script", version=1)
+
+    @onnxscript.script(custom_opset)
+    def Selu(X):
+        alpha = 1.67326  # auto wrapped as Constants
+        gamma = 1.0507
+        alphaX = op.CastLike(alpha, X)
+        gammaX = op.CastLike(gamma, X)
+        neg = gammaX * (alphaX * op.Exp(X) - alphaX)
+        pos = gammaX * X
+        zero = op.CastLike(0, X)
+        return op.Where(X <= zero, neg, pos)
+
+    # setType API provides shape/type to ONNX shape/type inference
+    def custom_selu(g: jit_utils.GraphContext, X):
+        return g.onnxscript_op(Selu, X).setType(X.type())
+
+    # Register custom symbolic function
+    # There are three opset version needed to be aligned
+    # This is (2) the opset version in registry
+    torch.onnx.register_custom_op_symbolic(
+        symbolic_name="aten::selu",
+        symbolic_fn=custom_selu,
+        opset_version=opset_version,
+    )
+
+    # There are three opset version needed to be aligned
+    # This is (2) the opset version in exporter
+    torch.onnx.export(
+        model,
+        x,
+        "model.onnx",
+        opset_version=opset_version,
+        # only needed if you want to specify an opset version > 1.
+        custom_opsets={"onnx-script": 2}
+    )
+
+The example above exports it as a custom operator in the "onnx-script" opset.
+When exporting a custom operator, you can specify the custom domain version using the
+``custom_opsets`` dictionary at export. If not specified, the custom opset version defaults to 1.
+
+NOTE: Be careful to align the opset version mentioned in the above example, and make sure they are consumed in exporter step.
+The example usage of how to write a onnx-script function is a beta version in terms of the active development on onnx-script.
+Please follow the latest `ONNX-script <https://github.com/microsoft/onnx-script>`_
+
+C++ Operators
+~~~~~~~~~~~~~
+
 If a model uses a custom operator implemented in C++ as described in
 `Extending TorchScript with Custom C++ Operators <https://pytorch.org/tutorials/advanced/torch_script_custom_ops.html>`_,
 you can export it by following this example::
@@ -563,8 +631,6 @@ you can export it by following this example::
         custom_opsets={"custom_domain": 2}
     )
 
-You can export your model as one or a combination of many standard ONNX ops, or as a custom ONNX operator.
-
 The example above exports it as a custom operator in the "custom_domain" opset.
 When exporting a custom operator, you can specify the custom domain version using the
 ``custom_opsets`` dictionary at export. If not specified, the custom opset version defaults to 1.
diff --git a/test/onnx/test_onnxscript_runtime.py b/test/onnx/test_onnxscript_runtime.py
index 2d0d1e3a5357a..e22e76c8315e7 100644
--- a/test/onnx/test_onnxscript_runtime.py
+++ b/test/onnx/test_onnxscript_runtime.py
@@ -25,8 +25,6 @@ def test_selu_from_onnxscript_example(self):
 
         from onnxscript.onnx_opset import opset15 as op
 
-        # custom domain is needed for custom Op domain name should be
-        # aligned to the one in symbolic_fn
         # TODO(titaiwang): make an official domain for onnxscript usage
         custom_opset = onnxscript.values.Opset(domain="onnx-script", version=1)
 

From 7859ed794dc37b71fb854dabb0ee6b9fd7df971c Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 17 Nov 2022 06:57:42 +0000
Subject: [PATCH 1022/1922] [Dynamo] Support typing.Mapping & Support function
 as argument (#88963)

These missing features come from https://github.com/pytorch/benchmark/pull/1302, where we'd like to enable E2E hf_bert dynamo train/eval. The dependent [HuggingFace accelerate library](https://huggingface.co/docs/accelerate/index) requires these improvements.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88963
Approved by: https://github.com/jansel
---
 test/dynamo/test_misc.py             | 36 ++++++++++++++++++++++++++++
 torch/_dynamo/utils.py               |  8 +++++++
 torch/_dynamo/variables/builder.py   |  6 +++--
 torch/_dynamo/variables/functions.py |  2 ++
 torch/_dynamo/variables/misc.py      |  6 +++++
 5 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index b3cddcbf1dff7..2825b157bc68e 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2792,6 +2792,42 @@ def fn(x):
         res = opt_fn(x)
         self.assertTrue(torch.allclose(ref, res))
 
+    def test_user_function_variable_supports_function_argument(self):
+        def add1(x):
+            return x + 1
+
+        def add2(x):
+            return x + 2
+
+        def gn(x, f=add1):
+            if f is add1:
+                return x + 1
+            else:
+                return x + 2
+
+        def fn(x, f):
+            return gn(x, f)
+
+        x = torch.randn(2, 3)
+        ref = fn(x, add2)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(x, add2)
+        self.assertTrue(torch.allclose(ref, res))
+
+    def test_typing_variable_isinstance(self):
+        def fn(x, m):
+            if isinstance(m, typing.Mapping):
+                return x + 1
+            else:
+                return x - 1
+
+        x = torch.randn(2, 3)
+        m = {"x": torch.randn(3)}
+        ref = fn(x, m)
+        opt_fn = torch._dynamo.optimize("eager")(fn)
+        res = opt_fn(x, m)
+        self.assertTrue(torch.allclose(ref, res))
+
     def test_repro_graph_breaks_in__get_item_by_idx(self):
         class Mod(torch.nn.Module):
             def __init__(self):
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 0b87be7393b52..f426ef6913079 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -19,6 +19,7 @@
 import sys
 import time
 import types
+import typing
 import weakref
 from contextlib import contextmanager
 from functools import lru_cache
@@ -275,6 +276,13 @@ def istype(obj, allowed_types):
     return type(obj) is allowed_types
 
 
+def is_typing(value):
+    if sys.version_info < (3, 9):
+        return isinstance(value, typing._GenericAlias)
+    else:
+        return isinstance(value, typing._SpecialGenericAlias)
+
+
 def is_numpy_int_type(value):
     return istype(
         value,
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 67e506b5b435b..b1b691c41fc60 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -9,7 +9,7 @@
 import re
 import types
 from abc import ABCMeta
-from typing import Any, List, Union
+from typing import Any, Union
 
 import numpy as np
 from functorch.experimental.ops import PyOperator
@@ -43,6 +43,7 @@
     global_key_name,
     is_namedtuple,
     is_numpy_int_type,
+    is_typing,
     istensor,
     istype,
     odict_values,
@@ -360,7 +361,8 @@ def index_source(key):
                 value,
                 guards=make_guards(GuardBuilder.FUNCTION_MATCH),
             )
-        elif value is List:
+        elif is_typing(value):
+            # typing.List, typing.Mapping, etc.
             return TypingVariable(
                 value,
                 guards=make_guards(GuardBuilder.ID_MATCH),
diff --git a/torch/_dynamo/variables/functions.py b/torch/_dynamo/variables/functions.py
index 88be730c34236..a8bb8bd84c79e 100644
--- a/torch/_dynamo/variables/functions.py
+++ b/torch/_dynamo/variables/functions.py
@@ -24,6 +24,8 @@ def wrap_bound_arg(val, options):
         return cls([wrap_bound_arg(x, options) for x in val], **options)
     elif variables.ConstantVariable.is_literal(val):
         return variables.ConstantVariable(val, **options)
+    elif isinstance(val, types.FunctionType):
+        return variables.UserFunctionVariable(val, **options)
     elif isinstance(val, enum.Enum):
         return variables.EnumVariable(val, **options)
     elif isinstance(val, (type, abc.ABCMeta)):
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index 298ddf24862bd..952cbd2c64244 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -654,6 +654,12 @@ def call_method(
             )
         unimplemented("typing")
 
+    def python_type(self):
+        return type(self.value)
+
+    def as_python_constant(self):
+        return self.value
+
 
 class NumpyVariable(VariableTracker):
     """

From 4c7c3dd1f25f851eaacf322d026c47b7ee5751f8 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Thu, 17 Nov 2022 07:24:55 +0000
Subject: [PATCH 1023/1922] Add warning if tensor cores are not used (#88844)

Fixes https://github.com/pytorch/torchdynamo/issues/1839

Should I do this for all backends or just inductor?

## Test
On a V100 I got from AWS

```python
from torch._dynamo import optimize
import torch

def fn(x, y):
    a = torch.cos(x)
    b = torch.sin(y)
    return a + b

new_fn = optimize("inductor")(fn)

a = new_fn(torch.Tensor(1),torch.Tensor(1))
print(a)
```

## New logs
```
(sourcetorch) ubuntu@ip-172-31-31-152:~/test$ python test.py
/home/ubuntu/pytorch/torch/_dynamo/eval_frame.py:318: UserWarning: Tensor cores are available but not enabled. Consider setting torch.backends.cuda.matmul.allow_tf32 == True in your python script for speedups
  warnings.warn(
tensor([1.3717])
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88844
Approved by: https://github.com/ngimel, https://github.com/mlazos, https://github.com/anijain2305
---
 torch/_dynamo/eval_frame.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 1188bfd74fc25..6b500a87bc32f 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -350,6 +350,16 @@ def get_compiler_fn(compiler_fn):
 def lookup_backend(compiler_fn):
     """Expand backend strings to functions"""
     if compiler_fn == "inductor":
+        if torch.cuda.is_available():
+            if (
+                torch.backends.cuda.matmul.allow_tf32 is False
+                and torch.cuda.get_device_capability() >= (8, 0)
+            ):
+                warnings.warn(
+                    "TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled."
+                    "Consider setting `torch.set_float32_matmul_precision('high')`"
+                )
+
         compiler_fn = import_module(f"{config.inductor_import}.compile_fx").compile_fx
     elif isinstance(compiler_fn, str):
         from .optimizations import BACKENDS

From ceba6e0032f5ac8de8f05a5fa5988903dfe96e1c Mon Sep 17 00:00:00 2001
From: ecao <e.cao@intel.com>
Date: Thu, 17 Nov 2022 08:15:49 +0000
Subject: [PATCH 1024/1922] Add BFloat16 support and optimization for mish,
 hardtanh backward, and silu on CPU (#82460)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Description
* add BFloat16 support for mish and hardtanh backward on CPU.
* optimize the performance for silu

### Testing

- optimize the performance for silu: bfloat16

single socket (28 cores):
```
before: 1x128x1024  forward 0.090 s  backward  0.218 s
        10x128x1024 forward 0.146 s  backward  0.314 s

after:  1x128x1024   forward  0.064 s backward  0.100 s
        10x128x1024  forward  0.085 s backward  0.133 s
```
single core:
```
before: 1x128x1024   forward 0.300 s  backward  0.606 s
        10x128x1024  forward 2.825 s  backward  5.834 s

after:  1x128x1024   forward 0.156 s backward   0.239 s
        10x128x1024  forward 1.447 s backward   2.165 s
```

- Add BFloat16 support for mish and backward of hardtanh on CPU.

single socket (20 cores):
op | shape | fp32 / s | fp32 / s | bf16 / s |  bf16 / s
-- | -- | -- | -- | -- | --
  |   | forward | backward | forward | backward
silu | [10, 128, 10, 10] | 4.41E-05 | 7.67E-05 | 5.32E-05 | 9.38E-05
  | [10, 128, 80, 80] | 0.0008 | 0.001788 | 0.00067 | 0.001031
mish | [10, 128, 10, 10] | 0.000356 | 0.000427 | 0.000367 | 0.000436
  | [10, 128, 80, 80] | 0.004527 | 0.005807 | 0.004757 | 0.005393
hardtanh | [10, 128, 10, 10] | / | 3.97E-05 | / | 4.45E-05
  | [10, 128, 80, 80] | / | 0.001748 | / | 0.000645

single core:
op | shape | fp32 / s | fp32 / s | bf16 / s |  bf16 / s
-- | -- | -- | -- | -- | --
  |   | forward | backward | forward | backward
silu | [10, 128, 10, 10] | 1.17E-04 | 1.91E-04 | 1.35E-04 | 2.23E-04
  | [10, 128, 80, 80] | 0.007434 | 0.013141 | 0.008464 | 0.013044
mish | [10, 128, 10, 10] | 0.00103 | 0.00122 | 0.00106 | 0.001227
  | [10, 128, 80, 80] | 0.065629 | 0.078418 | 0.067779 | 0.077214
hardtanh | [10, 128, 10, 10] | / | 1.18E-04 | / | 9.30E-05
  | [10, 128, 80, 80] | / | 0.010773 | / | 0.005834

Pull Request resolved: https://github.com/pytorch/pytorch/pull/82460
Approved by: https://github.com/mingfeima, https://github.com/malfet
---
 aten/src/ATen/native/cpu/Activation.cpp       | 114 ++++++++++++++++--
 test/test_nn.py                               |   3 +
 .../_internal/common_methods_invocations.py   |   6 +-
 3 files changed, 113 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
index 6f3eac783ccda..728ea62f1898f 100644
--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@@ -623,7 +623,25 @@ void shrink_backward_kernel(TensorIteratorBase& iter, const Scalar& lambd) {
 }
 
 void hardtanh_backward_kernel(TensorIterator& iter, const Scalar& min, const Scalar& max) {
-  AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardshrink_backward_cpu", [&] {
+  if (iter.dtype() == kBFloat16) {
+    auto min_val = min.to<float>();
+    auto max_val = max.to<float>();
+    cpu_kernel_vec(
+        iter,
+        [=](BFloat16 grad_val, BFloat16 self_val) -> BFloat16 {
+          return (float(self_val) <= min_val || float(self_val) >= max_val) ? BFloat16(0) : grad_val;
+        },
+        [=](Vectorized<BFloat16> grad_val, Vectorized<BFloat16> self_val) -> Vectorized<BFloat16> {
+          Vectorized<float> grad_val0, grad_val1, self_val0, self_val1;
+          std::tie(grad_val0, grad_val1) = convert_bfloat16_float(grad_val);
+          std::tie(self_val0, self_val1) = convert_bfloat16_float(self_val);
+          return convert_float_bfloat16(
+            ((self_val0 > min_val) & (self_val0 < max_val)) & grad_val0,
+            ((self_val1 > min_val) & (self_val1 < max_val)) & grad_val1
+          );
+        });
+  } else {
+    AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "hardshrink_backward_cpu", [&] {
     auto min_val = min.to<scalar_t>();
     auto max_val = max.to<scalar_t>();
     cpu_kernel_vec(
@@ -635,6 +653,7 @@ void hardtanh_backward_kernel(TensorIterator& iter, const Scalar& min, const Sca
           return ((self_val > min_val) & (self_val < max_val)) & grad_val;
         });
   });
+  }
 }
 
 void hardswish_kernel(TensorIterator& iter) {
@@ -1035,8 +1054,23 @@ void glu_backward_kernel(TensorIterator& iter) {
 }
 
 void silu_kernel(TensorIteratorBase& iter) {
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(
-      kBFloat16, iter.dtype(), "silu_cpu", [&]() {
+  if (iter.dtype() == kBFloat16) {
+      const Vectorized<float> kOneVec(1.0f);
+      cpu_kernel_vec(
+          iter,
+          [](BFloat16 x) -> BFloat16 {
+            return float(x) / (1.0f + std::exp(-float(x)));
+          },
+          [kOneVec](Vectorized<BFloat16> x_vec) -> Vectorized<BFloat16> {
+            Vectorized<float> x_vec0, x_vec1;
+            std::tie(x_vec0, x_vec1) = convert_bfloat16_float(x_vec);
+            return convert_float_bfloat16(
+              x_vec0 / (kOneVec + x_vec0.neg().exp()),
+              x_vec1 / (kOneVec + x_vec1.neg().exp()));
+          });
+  } else {
+    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
+      iter.dtype(), "silu_cpu", [&]() {
         const Vectorized<scalar_t> kOneVec(scalar_t(1));
         cpu_kernel_vec(
             iter,
@@ -1047,11 +1081,34 @@ void silu_kernel(TensorIteratorBase& iter) {
               return x_vec / (kOneVec + x_vec.neg().exp());
             });
       });
+    }
 }
 
 void silu_backward_kernel(TensorIteratorBase& iter) {
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(
-      kBFloat16, iter.dtype(), "silu_backward_cpu", [&]() {
+  if (iter.dtype() == kBFloat16) {
+    const Vectorized<float> kOneVec(1.0f);
+    cpu_kernel_vec(
+        iter,
+        [](BFloat16 dy, BFloat16 x) -> BFloat16 {
+          const float sigmoid =
+              1.0f / (1.0f + std::exp(-float(x)));
+          return dy * sigmoid * (1.0f + x * (1.0f - sigmoid));
+        },
+        [kOneVec](Vectorized<BFloat16> dy_vec, Vectorized<BFloat16> x_vec) -> Vectorized<BFloat16> {
+          Vectorized<float> x_vec0, x_vec1, dy_vec0, dy_vec1;
+          std::tie(x_vec0, x_vec1) = convert_bfloat16_float(x_vec);
+          std::tie(dy_vec0, dy_vec1) = convert_bfloat16_float(dy_vec);
+          const Vectorized<float> sigmoid0 =
+              kOneVec / (kOneVec + x_vec0.neg().exp());
+          const Vectorized<float> sigmoid1 =
+              kOneVec / (kOneVec + x_vec1.neg().exp());
+          return convert_float_bfloat16(
+            dy_vec0 * sigmoid0 * (kOneVec + x_vec0 * (kOneVec - sigmoid0)),
+            dy_vec1 * sigmoid1 * (kOneVec + x_vec1 * (kOneVec - sigmoid1)));
+        });
+  } else {
+    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
+      iter.dtype(), "silu_backward_cpu", [&]() {
         const Vectorized<scalar_t> kOneVec(scalar_t(1));
         cpu_kernel_vec(
             iter,
@@ -1066,10 +1123,26 @@ void silu_backward_kernel(TensorIteratorBase& iter) {
               return dy_vec * sigmoid * (kOneVec + x_vec * (kOneVec - sigmoid));
             });
       });
+  }
 }
 
 void mish_kernel(TensorIteratorBase& iter) {
-  AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "mish_cpu", [&]() {
+  if (iter.dtype() == kBFloat16) {
+    cpu_kernel_vec(
+        iter,
+        [](BFloat16 x) -> BFloat16{
+          return static_cast<BFloat16>(float(x) * std::tanh(std::log1p(std::exp(float(x)))));
+        },
+        [](Vectorized<BFloat16> x_vec) -> Vectorized<BFloat16> {
+          Vectorized<float> x_vec0, x_vec1;
+          std::tie(x_vec0, x_vec1) = convert_bfloat16_float(x_vec);
+          return convert_float_bfloat16(
+            x_vec0 * x_vec0.exp().log1p().tanh(),
+            x_vec1 * x_vec1.exp().log1p().tanh()
+          );
+        });
+  } else {
+    AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "mish_cpu", [&]() {
         using Vec = Vectorized<scalar_t>;
         cpu_kernel_vec(
             iter,
@@ -1080,10 +1153,36 @@ void mish_kernel(TensorIteratorBase& iter) {
               return x_vec * x_vec.exp().log1p().tanh();
             });
       });
+  }
 }
 
 void mish_backward_kernel(TensorIterator& iter) {
-  AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "mish_backward_cpu", [&]() {
+  if (iter.dtype() == kBFloat16) {
+    using Vec = Vectorized<float>;
+    const Vec kOneVec(1.0f);
+    cpu_kernel_vec(
+        iter,
+        [](BFloat16 dy, BFloat16 x) -> BFloat16 {
+          const float sigmoid =
+              1.0f / (1.0f + std::exp(-float(x)));
+          const float tanh_softplus = std::tanh(std::log1p(std::exp(float(x))));
+          return dy * (tanh_softplus + x * sigmoid * (1.0f - tanh_softplus * tanh_softplus));
+        },
+        [kOneVec](Vectorized<BFloat16> dy_vec, Vectorized<BFloat16> x_vec) -> Vectorized<BFloat16> {
+          Vectorized<float> x_vec0, x_vec1, dy_vec0, dy_vec1;
+          std::tie(x_vec0, x_vec1) = convert_bfloat16_float(x_vec);
+          std::tie(dy_vec0, dy_vec1) = convert_bfloat16_float(dy_vec);
+          const Vec sigmoid0 = kOneVec / (kOneVec + x_vec0.neg().exp());
+          const Vec sigmoid1 = kOneVec / (kOneVec + x_vec1.neg().exp());
+          const Vec tanh_softplus0 = x_vec0.exp().log1p().tanh();
+          const Vec tanh_softplus1 = x_vec1.exp().log1p().tanh();
+          return convert_float_bfloat16(
+            dy_vec0 * (tanh_softplus0 + x_vec0 * sigmoid0 * (kOneVec - tanh_softplus0 * tanh_softplus0)),
+            dy_vec1 * (tanh_softplus1 + x_vec1 * sigmoid1 * (kOneVec - tanh_softplus1 * tanh_softplus1))
+          );
+        });
+  } else {
+    AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "mish_backward_cpu", [&]() {
         using Vec = Vectorized<scalar_t>;
         const Vec kOneVec(scalar_t(1));
         cpu_kernel_vec(
@@ -1100,6 +1199,7 @@ void mish_backward_kernel(TensorIterator& iter) {
               return dy_vec * (tanh_softplus + x_vec * sigmoid * (kOneVec - tanh_softplus * tanh_softplus));
             });
       });
+  }
 }
 
 void prelu_cpu_kernel(TensorIterator& iter) {
diff --git a/test/test_nn.py b/test/test_nn.py
index 7d6a016a6f51a..25f85c60037b6 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -14640,6 +14640,9 @@ def test_bfloat16(fn, device, inp_dims, prec):
             test_bfloat16(torch.nn.Softshrink(), device, shape, prec=1e-2)
             test_bfloat16(torch.nn.Hardswish(), device, shape, prec=2e-2)
             test_bfloat16(torch.nn.Softplus(), device, shape, prec=1e-2)
+            test_bfloat16(torch.nn.SiLU(), device, shape, prec=1e-2)
+            test_bfloat16(torch.nn.Hardtanh(), device, shape, prec=1e-2)
+            test_bfloat16(torch.nn.Mish(), device, shape, prec=1e-2)
 
     @onlyCUDA
     def test_activations_bfloat16(self, device):
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 5db917424a2f2..8fe70e71614d4 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -12141,7 +12141,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         'nn.functional.mish',
         aten_backward_name='mish_backward',
         ref=lambda x: x * np.tanh(reference_softplus(x)),
-        dtypes=floating_types(),
+        dtypes=floating_types_and(torch.bfloat16),
         dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
         supports_forward_ad=True,
         supports_fwgrad_bwgrad=True,
@@ -12497,7 +12497,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    aten_name="hardtanh",
                    aten_backward_name='hardtanh_backward',
                    dtypes=floating_types_and(torch.int8, torch.int16, torch.int32, torch.int64, torch.bfloat16),
-                   backward_dtypes=all_types(),
+                   backward_dtypes=all_types_and(torch.bfloat16),
                    dtypesIfCUDA=floating_types_and(torch.int8, torch.int16, torch.int32, torch.int64, torch.float16,
                                                    torch.bfloat16),
                    backward_dtypesIfCUDA=floating_types_and(torch.float16),
@@ -12530,7 +12530,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     UnaryUfuncInfo('nn.functional.relu6',
                    aten_name="relu6",
                    dtypes=all_types_and(torch.bfloat16),
-                   backward_dtypes=floating_types(),
+                   backward_dtypes=floating_types_and(torch.bfloat16),
                    dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16),
                    backward_dtypesIfCUDA=floating_types_and(torch.float16),
                    assert_autodiffed=True,

From c881bf968e7242efcce6ccc6443ba334ae911988 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Wed, 16 Nov 2022 23:31:57 +0000
Subject: [PATCH 1025/1922] Fix typo in dist_util.py (#89167)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89167
Approved by: https://github.com/davidberard98
---
 benchmarks/dynamo/dist_util.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/benchmarks/dynamo/dist_util.py b/benchmarks/dynamo/dist_util.py
index 9957ef6139dfa..24625c84e1a10 100644
--- a/benchmarks/dynamo/dist_util.py
+++ b/benchmarks/dynamo/dist_util.py
@@ -25,11 +25,8 @@
 
 
 def setup(rank, world_size):
-    # set defaults in case torchrun isn't used; no idea why the if is needed, but it hangs torchrun otherwise
-    if not os.getenv("MASTER_ADDR"):
-        os.environ["MASTER_ADDR"] = os.getenv("MASTER_ADDR", "localhost")
-    if not os.getenv("MASTER_PORT"):
-        os.environ["MASTER_PORT"] = os.getenv("MASETER_PORT", "12355")
+    os.environ["MASTER_ADDR"] = os.getenv("MASTER_ADDR", "localhost")
+    os.environ["MASTER_PORT"] = os.getenv("MASTER_PORT", "12355")
     os.environ["RANK"] = os.getenv("RANK", "0")
     os.environ["WORLD_SIZE"] = os.getenv("WORLD_SIZE", "1")
     dist.init_process_group("nccl")

From 5177f577d70bebe94c0eadb54bca4e482e22edfe Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 16 Nov 2022 21:31:02 -0800
Subject: [PATCH 1026/1922] Reland "Towards unifying symbolic and non symbolic
 fake tensor (#89038)" (#89143)

This reverts commit cf6003f0469ae1440d4a8585860c2c5f4c738707.

Differential Revision: [D41363992](https://our.internmc.facebook.com/intern/diff/D41363992)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89143
Approved by: https://github.com/albanD
---
 aten/src/ATen/native/TensorFactories.cpp |  6 ---
 test/functorch/test_aotdispatch.py       |  1 -
 test/test_proxy_tensor.py                | 21 +++------
 torch/_meta_registrations.py             | 39 +++++++++++++++-
 torch/_ops.py                            |  1 +
 torch/_prims/__init__.py                 |  5 +-
 torch/_prims_common/__init__.py          |  3 ++
 torch/_subclasses/fake_tensor.py         | 58 +++++++++---------------
 8 files changed, 71 insertions(+), 63 deletions(-)

diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 9d1c6d8a36333..7245cb77b1c50 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -325,12 +325,6 @@ Tensor empty_like(
   // See [Note: hacky wrapper removal for TensorOptions]
   TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory);
 
-
-  TORCH_CHECK(
-    !(options_.has_memory_format() && optional_memory_format.has_value()),
-    "Cannot set memory_format both in TensorOptions and explicit argument; please delete "
-    "the redundant setter.");
-
   TensorOptions options =
       self.options()
           .merge_in(options_)
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 1dc5476158f96..ae216f9be4a49 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1011,7 +1011,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.zeros_like.default - couldn't find symbolic meta function/decomposition
     xfail('digamma', ''),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 8dc42be7fdfb2..0a24807af55f0 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1151,9 +1151,7 @@ def f(a, b, c, d, e):
     xfail('cummin', ''),  # aten.cummin.default - couldn't find symbolic meta function/decomposition
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
-    xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
     xfail('dsplit', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
     xfail('fft.fft2', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('fft.fft', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
@@ -1235,8 +1233,6 @@ def f(a, b, c, d, e):
     xfail('lu', ''),  # aten.linalg_lu_factor_ex.default - couldn't find symbolic meta function/decomposition
     xfail('lu_solve', ''),  # aten.linalg_lu_solve.default - couldn't find symbolic meta function/decomposition
     xfail('lu_unpack', ''),  # aten.lu_unpack.default - couldn't find symbolic meta function/decomposition
-    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
-    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decomposition
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decomposition
     xfail('median', ''),  # Could not run 'aten::median' with arguments from the 'Meta' backend. This could be becau...
@@ -1281,7 +1277,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.pdist', ''),  # Could not run 'aten::_pdist_forward' with arguments from the 'Meta' backend...
     xfail('nn.functional.pixel_shuffle', ''),  # aten.pixel_shuffle.default - couldn't find symbolic meta function/decompos...
     xfail('nn.functional.pixel_unshuffle', ''),  # aten.pixel_unshuffle.default - couldn't find symbolic meta function/deco...
-    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.unfold', ''),  # aten.im2col.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
@@ -1298,7 +1293,6 @@ def f(a, b, c, d, e):
     xfail('polygamma', 'polygamma_n_2'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_3'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_4'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
-    xfail('put', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('quantile', ''),  # Could not run 'aten::equal' with arguments from the 'Meta' backend.
     xfail('qr', ''),  # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition
     xfail('rad2deg', ''),  # aten.rad2deg.default - couldn't find symbolic meta function/decomposition
@@ -1347,11 +1341,15 @@ def f(a, b, c, d, e):
 
 symbolic_tensor_failures.update(symbolic_tensor_segfaults)
 
+outplace_symbolic_tensor_failures = {
+    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
+    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
+    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
+}
+
 inplace_symbolic_tensor_failures = {
-    xfail('abs', ''),  # aten.abs_.default - couldn't find symbolic meta function/decomposition
     xfail('acos', ''),  # aten.acos_.default - couldn't find symbolic meta function/decomposition
     xfail('acosh', ''),  # aten.acosh_.default - couldn't find symbolic meta function/decomposition
-    xfail('addbmm', ''),  # aten.addbmm_.default - couldn't find symbolic meta function/decomposition
     xfail('addcdiv', ''),  # aten.addcdiv_.default - couldn't find symbolic meta function/decomposition
     xfail('addcmul', ''),  # aten.addcmul_.default - couldn't find symbolic meta function/decomposition
     xfail('addmm', ''),  # aten.addmm_.default - couldn't find symbolic meta function/decomposition
@@ -1365,7 +1363,6 @@ def f(a, b, c, d, e):
     xfail('clamp', ''),  # aten.clamp_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('clamp_max', ''),  # aten.clamp_max_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('clamp_min', ''),  # aten.clamp_min_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('conj_physical', ''),  # aten.conj_physical_.default - couldn't find symbolic meta function/decomposition
     xfail('copysign', ''),  # aten.copysign_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('cos', ''),  # aten.cos_.default - couldn't find symbolic meta function/decomposition
     xfail('cosh', ''),  # aten.cosh_.default - couldn't find symbolic meta function/decomposition
@@ -1382,7 +1379,6 @@ def f(a, b, c, d, e):
     xfail('expm1', ''),  # aten.expm1_.default - couldn't find symbolic meta function/decomposition
     xfail('float_power', ''),  # the base given to float_power_ has dtype Float but the operation's result requires dtype Double
     xfail('floor', ''),  # aten.floor_.default - couldn't find symbolic meta function/decomposition
-    xfail('floor_divide', ''),  # aten.floor_divide_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('fmod', ''),  # aten.fmod_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('frac', ''),  # aten.frac_.default - couldn't find symbolic meta function/decomposition
     xfail('ge', ''),  # aten.ge_.Tensor - couldn't find symbolic meta function/decomposition
@@ -1398,7 +1394,6 @@ def f(a, b, c, d, e):
     xfail('log1p', ''),  # aten.log1p_.default - couldn't find symbolic meta function/decomposition
     xfail('log2', ''),  # aten.log2_.default - couldn't find symbolic meta function/decomposition
     xfail('log', ''),  # aten.log_.default - couldn't find symbolic meta function/decomposition
-    xfail('logit', ''),  # aten.logit_.default - couldn't find symbolic meta function/decomposition
     xfail('lt', ''),  # aten.lt_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('mvlgamma', 'mvlgamma_p_1'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
     xfail('mvlgamma', 'mvlgamma_p_3'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
@@ -1408,7 +1403,6 @@ def f(a, b, c, d, e):
     xfail('neg', ''),  # aten.neg_.default - couldn't find symbolic meta function/decomposition
     xfail('nextafter', ''),  # aten.nextafter_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.celu', ''),  # aten.celu_.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.dropout3d', ''),  # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.elu', ''),  # aten.elu_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.hardsigmoid', ''),  # aten.hardsigmoid_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.mish', ''),  # aten.mish_.default - couldn't find symbolic meta function/decomposition
@@ -1426,7 +1420,6 @@ def f(a, b, c, d, e):
     xfail('sinh', ''),  # aten.sinh_.default - couldn't find symbolic meta function/decomposition
     xfail('sqrt', ''),  # aten.sqrt_.default - couldn't find symbolic meta function/decomposition
     xfail('square', ''),  # aten.pow_.Scalar - couldn't find symbolic meta function/decomposition
-    xfail('squeeze', ''),  # aten.squeeze_.default - couldn't find symbolic meta function/decomposition
     xfail('t', ''),  # aten.t_.default - couldn't find symbolic meta function/decomposition
     xfail('tan', ''),  # aten.tan_.default - couldn't find symbolic meta function/decomposition
     xfail('tanh', ''),  # aten.tanh_.default - couldn't find symbolic meta function/decomposition
@@ -1516,7 +1509,7 @@ def test_make_fx_fake_exhaustive(self, device, dtype, op):
     @skipIfNoSympy
     @ops(op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive',
-             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures)
+             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures | outplace_symbolic_tensor_failures)
     def test_make_fx_symbolic_exhaustive(self, device, dtype, op):
         _test_make_fx_helper(self, device, dtype, op, "symbolic")
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 4fa3ab09d2755..9849df0a58af5 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1513,7 +1513,6 @@ def full(size, fill_value, *args, **kwargs):
         aten.randn_like.default,
         aten.rand_like.default,
         aten.full_like.default,
-        aten.zeros_like.default,
         aten.ones_like.default,
     ]
 )
@@ -1521,6 +1520,44 @@ def meta_like(self, *args, **kwargs):
     return aten.empty_like.default(self, **kwargs)
 
 
+# zeros_like is special cased to work for sparse
+@register_meta(aten.zeros_like.default)
+def zeros_like(
+    self, dtype=None, layout=None, device=None, pin_memory=None, memory_format=None
+):
+    if layout == torch.sparse_coo:
+        check(
+            memory_format is None,
+            lambda: "memory format option is only supported by strided tensors",
+        )
+
+        res = torch.empty(
+            0,
+            dtype=self.dtype if dtype is None else dtype,
+            layout=layout,
+            device=self.device if device is None else device,
+            pin_memory=pin_memory,
+        )
+
+        if self.is_sparse:
+            res.sparse_resize_and_clear_(
+                self.size(), self.sparse_dim(), self.dense_dim()
+            )
+        else:
+            res.sparse_resize_and_clear_(self.size(), self.dim(), 0)
+
+        res._coalesced_(True)
+        return res
+    return aten.empty_like.default(
+        self,
+        dtype=dtype,
+        layout=layout,
+        device=device,
+        pin_memory=pin_memory,
+        memory_format=memory_format,
+    )
+
+
 # hacky: Please remove after math.ceil works with arange
 @register_meta(aten.arange.default)
 def arange(end, **kwargs):
diff --git a/torch/_ops.py b/torch/_ops.py
index 9163932144d0d..b20398a7f3ab3 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -365,6 +365,7 @@ def handler(*args, **kwargs):
             return handler
 
         final_key = resolve_key(self, key)
+        # print(self, key, final_key)
         r = self.py_kernels.get(final_key, final_key)
         self._dispatch_cache[key] = r
         return r
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index da8d9af723acf..a4bac68f0ff14 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1150,9 +1150,6 @@ def _minimum_aten(
 
 #
 # View operations
-#
-# TODO: model view relationships
-# TODO: model storage
 def _as_strided_meta(
     a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int
 ) -> TensorLikeType:
@@ -1170,7 +1167,7 @@ def _as_strided_meta(
             a._typed_storage(), size, stride, storage_offset
         )
 
-    return TensorMeta(a, shape=size, strides=stride)
+    return torch.as_strided(a, size, stride, storage_offset)
 
 
 def _as_strided_aten(
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 128796dfa3d07..041448e8102ac 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -291,6 +291,9 @@ def is_non_overlapping_and_dense(a: Tensor) -> bool:
     its dimensions that is contiguous.
     """
 
+    if a.is_sparse:
+        return False
+
     # Short-circuits if the tensor is already contiguous or channels-last contiguous
     if is_contiguous(a) or is_channels_last_contiguous(a):
         return True
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 5d3d3a0e32fe1..9a0ac050e6b94 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1,7 +1,6 @@
 import contextlib
 import functools
 import itertools
-import sys
 import weakref
 from dataclasses import dataclass
 from functools import partial
@@ -297,8 +296,9 @@ def constructors(fake_mode, func, *args, **kwargs):
     out_device = new_kwargs.pop("device", None)
     out_device = out_device if out_device is not None else default_device
     new_kwargs["device"] = torch.device("meta")
-    # Not in_kernel_invocation_manager as no fake tensor inputs
-    with no_dispatch():
+    # _like constructors have fake tensor inputs (maybe this causes the non-like
+    # to fail? hmmm)
+    with in_kernel_invocation_manager(fake_mode):
         r = func(*args, **new_kwargs)
     return FakeTensor(fake_mode, r, out_device)
 
@@ -821,40 +821,30 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # is written to must be invalidated
         self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs)
 
-        from torch._decomp import decomposition_table
-
-        with self:
-            # Decomposes CompositeImplicitAutograd ops
-            r = func.decompose(*args, **kwargs)
-            if r is not NotImplemented:
-                return r
+        # If there's a Python meta, prefer that over the decomposition
+        from torch._decomp import meta_table as meta_table
 
-        # IDK: feels bad man, sym_numel on as_strided infinite loops otherwise
-        if has_symbolic_sizes and not self.cpp_meta_supports_symint(func):
-            from torch._decomp import meta_table as meta_table
+        if func not in meta_table and not self.cpp_meta_supports_symint(func):
+            from torch._decomp import decomposition_table
 
-            if func == aten.size.default:
-                sys.stderr.write(
-                    "Trying to call aten.size on a tensor with symbolic shapes. "
-                    "It's likely that this is from calling tensor.shape in C++"
+            # Prefer Python decompositions over C++ ones
+            if func in decomposition_table and (
+                has_symbolic_sizes
+                or (
+                    # TODO: Remove these exclusions, so that we can remove
+                    # this leg entirely
+                    torch_decomp_decompositions(func)
+                    and all(not e.is_sparse for e in flat_arg_fake_tensors)
                 )
-                # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
-                return None
-
-            with self:
-                if func in meta_table:
-                    r = meta_table[func](*args, **kwargs)
-                    return r
-                if func in decomposition_table:
+            ):
+                with self:
                     return decomposition_table[func](*args, **kwargs)
 
-        if (
-            func in decomposition_table
-            and torch_decomp_decompositions(func)
-            and all(not e.is_sparse for e in flat_arg_fake_tensors)
-        ):
             with self:
-                return decomposition_table[func](*args, **kwargs)
+                # Decomposes CompositeImplicitAutograd ops
+                r = func.decompose(*args, **kwargs)
+                if r is not NotImplemented:
+                    return r
 
         # prims already wrap FakeTensor inputs to FakeTensor outputs
         # and do device logic, we dont need do anything but run them
@@ -865,12 +855,6 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             with self:
                 return func.prim_meta_impl(*args, **kwargs)
 
-        if has_symbolic_sizes:
-            if not self.cpp_meta_supports_symint(func):
-                raise RuntimeError(
-                    f"{func} - couldn't find symbolic meta function/decomposition"
-                )
-
         # special handling for funcs registered through `register_op_impl`,
         # e.g., manipulating args on constructor calls to construct meta tensors
         # and then afterwards wrapping them to a FakeTensor

From 0c28f263c5789ecf24f23d0eb46954ea0ead7fb8 Mon Sep 17 00:00:00 2001
From: "Wang, Eikan" <eikan.wang@intel.com>
Date: Thu, 17 Nov 2022 03:33:32 +0000
Subject: [PATCH 1027/1922] Support masked_fill (#88736)

Support `masked_fill` to address the GPT2 performance issue.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88736
Approved by: https://github.com/jansel, https://github.com/jgong5
---
 test/inductor/test_torchinductor.py  | 23 +++++++++++++
 torch/_inductor/codegen/cpp.py       | 51 ++++++++++++++++++++++++----
 torch/_inductor/codegen/cpp_prefix.h | 12 +++++++
 3 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index fb7ca1fc92b73..efedeca381f36 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4602,6 +4602,29 @@ def test_complex_memory_overlap(self):
             not codecache.get_cpu_proc_info(), "Does not support vectorization"
         )
         @patch("torch.cuda.is_available", lambda: False)
+        def test_masked_fill_softmax(self):
+            def fn(value, mask):
+                mask = mask.to(torch.bool)
+                x = torch.masked_fill(value, mask, -33.0)
+                return torch.softmax(x, -1)
+
+            value = torch.randn((2, 17))
+            mask = torch.randint(0, 1, size=(2, 17), dtype=torch.uint8)
+            with patch.object(config.cpp, "simdlen", None):
+                torch._dynamo.reset()
+                metrics.reset()
+                opt_fn = torch._dynamo.optimize("inductor")(fn)
+                opt_fn(value, mask)
+
+                real_out = fn(value, mask)
+                compiled_out = opt_fn(value, mask)
+                assert same(real_out, compiled_out, equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count >= 1
+
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
         def test_sign_cpu_only(self):
             def fn(x):
                 return (torch.sign(x),)
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 65a9335d6cbfc..9f00563a954e6 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -311,6 +311,10 @@ def maximum(a, b):
     def square(a):
         return f"{a}.pow(2)"
 
+    @staticmethod
+    def where(a, b, c):
+        return f"decltype({b})::blendv({c}, {b}, {a})"
+
     @staticmethod
     def sign(x):
         code = BracesBuffer()
@@ -330,6 +334,11 @@ def sign(x):
         V.kernel.compute.splice(code)
         return result
 
+    @staticmethod
+    def to_dtype(x, dtype):
+        assert dtype in [torch.bool], f"{__name__} does not support {dtype}"
+        return f"({x})"
+
 
 class CppOverrides(OpOverrides):
     """Map element-wise ops to C++"""
@@ -740,7 +749,16 @@ def load(self, name: str, index: sympy.Expr):
         if expanded_index == new_index:
             line = f"at::vec::Vectorized<float>({var}[{cexpr(index)}])"
         else:
-            line = f"at::vec::Vectorized<float>::loadu({var} + {cexpr(new_index)})"
+            if V.graph.get_dtype(name) in [torch.bool, torch.uint8]:
+                g_tmp_buf = f"g_tmp_buffer_{var}"
+                nelements = codecache.pick_vec_isa().nelements()
+                self.loads.writeline(f"float {g_tmp_buf}[{nelements}] = {{0}};")
+                self.loads.writeline(
+                    f"flag_to_float({var} + {cexpr(new_index)}, {g_tmp_buf}, {nelements});"
+                )
+                line = f"at::vec::Vectorized<float>::loadu({g_tmp_buf})"
+            else:
+                line = f"at::vec::Vectorized<float>::loadu({var} + {cexpr(new_index)})"
 
         return self.cse.generate(self.loads, line)
 
@@ -837,9 +855,6 @@ def is_legal_data_access(self, var: sympy.Symbol, index: sympy.Expr):
         return self.is_var_irrevelant(var, index) or self.is_single_step_var(var, index)
 
     def could_vec(self, name: str, index: sympy.Expr):
-        if V.graph.get_dtype(name) is not torch.float:
-            return False
-
         assert self.itervars is not None
         # Not a loop
         if len(self.itervars) == 0:
@@ -849,12 +864,24 @@ def could_vec(self, name: str, index: sympy.Expr):
         return self.is_legal_data_access(most_inner_var, index)
 
     def load(self, name: str, index: sympy.Expr):
-        index = self.rename_indexing(index)
+        if not V.graph.get_dtype(name) in [
+            torch.float,
+            torch.float32,
+            torch.bool,
+            torch.uint8,
+        ]:
+            self.simd_vec = False
+            return self.simd_vec
 
+        index = self.rename_indexing(index)
         self.simd_vec = self.simd_vec and self.could_vec(name, index)
         return self.simd_vec
 
     def store(self, name, index, value, mode=None):
+        if not V.graph.get_dtype(name) in [torch.float, torch.float32]:
+            self.simd_vec = False
+            return self.simd_vec
+
         assert "buf" in name
         index = self.rename_indexing(index)
 
@@ -927,15 +954,24 @@ def constant(val, dtype):
             @staticmethod
             def index_expr(expr, dtype):
                 self.simd_vec = False
-                return self.cse.newvar()
+                tmp_var = self.cse.newvar()
+                return tmp_var
 
             @staticmethod
             def indirect_indexing(index_var):
+                self.simd_vec = False
                 return sympy.Symbol(str(index_var))
 
             @staticmethod
             def masked(mask, body, other):
-                return V.kernel.cse.newvar()
+                tmp_var = self.cse.newvar()
+                return tmp_var
+
+            @staticmethod
+            def to_dtype(x, dtype):
+                if dtype != torch.bool:
+                    self.simd_vec = False
+                return x
 
         self.exit_stack.enter_context(V.set_ops_handler(VecCheckerProxy()))
         self.exit_stack.enter_context(V.set_kernel_handler(self))
@@ -1040,6 +1076,7 @@ def codegen_loops(self, code, worksharing):
         if reduction_par_depth > 0 and reduction_par_depth != len(
             loops_nest_reduce.loops
         ):
+            metrics.generated_cpp_vec_kernel_count -= 1
             return self.simd_omp_kernel.codegen_loops(code, worksharing)
 
         with contextlib.ExitStack() as stack:
diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h
index 1905aefcda5c0..c1c9c3bae112d 100644
--- a/torch/_inductor/codegen/cpp_prefix.h
+++ b/torch/_inductor/codegen/cpp_prefix.h
@@ -57,3 +57,15 @@ template <typename T> void atomic_add(volatile T *addr, T offset) {
   } while (!atomic_addr->compare_exchange_weak(expected, desired,
                                                std::memory_order_relaxed));
 }
+
+// This function is used to convert bool or uint8 to float mask for
+// vectorization. The caller needs to make sure the src represents TRUE/FALSE
+// correctly.
+template <typename T>
+void flag_to_float(const T* src, float* dst, int64_t n) {
+#pragma unroll
+  for (int64_t i = 0; i < n; i++) {
+    uint32_t* dst_u32 = (uint32_t*)dst;
+    dst_u32[i] = *(src + i) ? 0xFFFFFFFF : 0;
+  }
+}

From 0d7a09e91916f8dc3768b8ccde1808b006c9646d Mon Sep 17 00:00:00 2001
From: Jiong Gong <jiong.gong@intel.com>
Date: Thu, 17 Nov 2022 16:43:16 +0000
Subject: [PATCH 1028/1922] Fix buffer overflow from AddressSanitizer checks
 due to inaccurate bfloat16 representation of large integer (#89210)

Fixes #88939

The root cause of the issue is that BF16 cannot accurately represent big integer values. In the test case below, `539` as one of the corner pixel index is wrongly represented as `540` (from https://github.com/jgong5/pytorch/blob/fc60a1865eafc985217eccc0251f82014041e6a7/aten/src/ATen/native/UpSample.h#L271) and then the access out of the range with this index. Thanks to @malfet for the investigation and initial fix. I also reported an issue https://github.com/pytorch/pytorch/issues/89212 to track the issue of inaccurate integer representation of bf16 that need to be addressed in other places of PyTorch.
```python
import torch

def test():
    arg_1 = torch.rand([1, 10, 540, 540], dtype=torch.bfloat16).clone()
    res = torch.nn.functional.interpolate(arg_1,2,mode='bilinear',align_corners=True)

test()
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89210
Approved by: https://github.com/malfet
---
 aten/src/ATen/native/cpu/UpSampleKernel.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
index 7eb7cf5e58bbf..8d418c2645040 100644
--- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp
+++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
@@ -474,9 +474,9 @@ void cpu_upsample_linear_channels_last(
   using opmath_t = at::opmath_type<scalar_t>;
   using Vec = vec::Vectorized<scalar_t>;
   auto loop2d = [&](int64_t begin, int64_t end) {
-    const scalar_t height_scale = area_pixel_compute_scale<scalar_t>(
+    const auto height_scale = area_pixel_compute_scale<opmath_t>(
         input_height, output_height, align_corners, scales[0]);
-    const scalar_t width_scale = area_pixel_compute_scale<scalar_t>(
+    const auto width_scale = area_pixel_compute_scale<opmath_t>(
         input_width, output_width, align_corners, scales[1]);
 
     auto input_indexr = [=](int64_t n, int64_t h, int64_t w) {
@@ -486,7 +486,7 @@ void cpu_upsample_linear_channels_last(
 
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     int64_t ih0, ih1, iw0, iw1;
-    scalar_t h0lambda, h1lambda, w0lambda, w1lambda;
+    opmath_t h0lambda, h1lambda, w0lambda, w1lambda;
     for (const auto n : c10::irange(begin, end)) {
       for (const auto oh : c10::irange(output_height)) {
         compute_source_index_and_lambda(
@@ -521,11 +521,11 @@ void cpu_upsample_linear_channels_last(
   };
 
   auto loop3d = [&](int64_t begin, int64_t end) {
-    const scalar_t depth_scale = area_pixel_compute_scale<scalar_t>(
+    const auto depth_scale = area_pixel_compute_scale<opmath_t>(
         input_depth, output_depth, align_corners, scales[0]);
-    const scalar_t height_scale = area_pixel_compute_scale<scalar_t>(
+    const auto height_scale = area_pixel_compute_scale<opmath_t>(
         input_height, output_height, align_corners, scales[1]);
-    const scalar_t width_scale = area_pixel_compute_scale<scalar_t>(
+    const auto width_scale = area_pixel_compute_scale<opmath_t>(
         input_width, output_width, align_corners, scales[2]);
 
     auto input_indexr = [=](int64_t n, int64_t d, int64_t h, int64_t w) {
@@ -536,7 +536,7 @@ void cpu_upsample_linear_channels_last(
 
     // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
     int64_t id0, id1, ih0, ih1, iw0, iw1;
-    scalar_t d0lambda, d1lambda, h0lambda, h1lambda, w0lambda, w1lambda;
+    opmath_t d0lambda, d1lambda, h0lambda, h1lambda, w0lambda, w1lambda;
     for (const auto n : c10::irange(begin, end)) {
       for (const auto od : c10::irange(output_depth)) {
         compute_source_index_and_lambda(

From df41c101d59aaf098f24bcd802a65f3c5ee9951d Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 17 Nov 2022 17:02:36 +0000
Subject: [PATCH 1029/1922] Revert "Reland "Towards unifying symbolic and non
 symbolic fake tensor (#89038)" (#89143)"

This reverts commit e686b8c3ba93cb7caa314c78bf84dbd2d7df9683.

Reverted https://github.com/pytorch/pytorch/pull/89143 on behalf of https://github.com/ZainRizvi due to This seems to be causing the test_make_fx_symbolic_exhaustive_rad2deg_cpu_float32 and test_make_fx_symbolic_exhaustive_inplace_rad2deg_cpu_float32 test to fail across multiple jobs
---
 aten/src/ATen/native/TensorFactories.cpp |  6 +++
 test/functorch/test_aotdispatch.py       |  1 +
 test/test_proxy_tensor.py                | 21 ++++++---
 torch/_meta_registrations.py             | 39 +---------------
 torch/_ops.py                            |  1 -
 torch/_prims/__init__.py                 |  5 +-
 torch/_prims_common/__init__.py          |  3 --
 torch/_subclasses/fake_tensor.py         | 58 +++++++++++++++---------
 8 files changed, 63 insertions(+), 71 deletions(-)

diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 7245cb77b1c50..9d1c6d8a36333 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -325,6 +325,12 @@ Tensor empty_like(
   // See [Note: hacky wrapper removal for TensorOptions]
   TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory);
 
+
+  TORCH_CHECK(
+    !(options_.has_memory_format() && optional_memory_format.has_value()),
+    "Cannot set memory_format both in TensorOptions and explicit argument; please delete "
+    "the redundant setter.");
+
   TensorOptions options =
       self.options()
           .merge_in(options_)
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index ae216f9be4a49..1dc5476158f96 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1011,6 +1011,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
+    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.zeros_like.default - couldn't find symbolic meta function/decomposition
     xfail('digamma', ''),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 0a24807af55f0..8dc42be7fdfb2 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1151,7 +1151,9 @@ def f(a, b, c, d, e):
     xfail('cummin', ''),  # aten.cummin.default - couldn't find symbolic meta function/decomposition
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
+    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
+    xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
     xfail('dsplit', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
     xfail('fft.fft2', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('fft.fft', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
@@ -1233,6 +1235,8 @@ def f(a, b, c, d, e):
     xfail('lu', ''),  # aten.linalg_lu_factor_ex.default - couldn't find symbolic meta function/decomposition
     xfail('lu_solve', ''),  # aten.linalg_lu_solve.default - couldn't find symbolic meta function/decomposition
     xfail('lu_unpack', ''),  # aten.lu_unpack.default - couldn't find symbolic meta function/decomposition
+    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
+    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decomposition
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decomposition
     xfail('median', ''),  # Could not run 'aten::median' with arguments from the 'Meta' backend. This could be becau...
@@ -1277,6 +1281,7 @@ def f(a, b, c, d, e):
     xfail('nn.functional.pdist', ''),  # Could not run 'aten::_pdist_forward' with arguments from the 'Meta' backend...
     xfail('nn.functional.pixel_shuffle', ''),  # aten.pixel_shuffle.default - couldn't find symbolic meta function/decompos...
     xfail('nn.functional.pixel_unshuffle', ''),  # aten.pixel_unshuffle.default - couldn't find symbolic meta function/deco...
+    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.unfold', ''),  # aten.im2col.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
@@ -1293,6 +1298,7 @@ def f(a, b, c, d, e):
     xfail('polygamma', 'polygamma_n_2'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_3'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_4'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
+    xfail('put', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('quantile', ''),  # Could not run 'aten::equal' with arguments from the 'Meta' backend.
     xfail('qr', ''),  # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition
     xfail('rad2deg', ''),  # aten.rad2deg.default - couldn't find symbolic meta function/decomposition
@@ -1341,15 +1347,11 @@ def f(a, b, c, d, e):
 
 symbolic_tensor_failures.update(symbolic_tensor_segfaults)
 
-outplace_symbolic_tensor_failures = {
-    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
-    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
-}
-
 inplace_symbolic_tensor_failures = {
+    xfail('abs', ''),  # aten.abs_.default - couldn't find symbolic meta function/decomposition
     xfail('acos', ''),  # aten.acos_.default - couldn't find symbolic meta function/decomposition
     xfail('acosh', ''),  # aten.acosh_.default - couldn't find symbolic meta function/decomposition
+    xfail('addbmm', ''),  # aten.addbmm_.default - couldn't find symbolic meta function/decomposition
     xfail('addcdiv', ''),  # aten.addcdiv_.default - couldn't find symbolic meta function/decomposition
     xfail('addcmul', ''),  # aten.addcmul_.default - couldn't find symbolic meta function/decomposition
     xfail('addmm', ''),  # aten.addmm_.default - couldn't find symbolic meta function/decomposition
@@ -1363,6 +1365,7 @@ def f(a, b, c, d, e):
     xfail('clamp', ''),  # aten.clamp_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('clamp_max', ''),  # aten.clamp_max_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('clamp_min', ''),  # aten.clamp_min_.Tensor - couldn't find symbolic meta function/decomposition
+    xfail('conj_physical', ''),  # aten.conj_physical_.default - couldn't find symbolic meta function/decomposition
     xfail('copysign', ''),  # aten.copysign_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('cos', ''),  # aten.cos_.default - couldn't find symbolic meta function/decomposition
     xfail('cosh', ''),  # aten.cosh_.default - couldn't find symbolic meta function/decomposition
@@ -1379,6 +1382,7 @@ def f(a, b, c, d, e):
     xfail('expm1', ''),  # aten.expm1_.default - couldn't find symbolic meta function/decomposition
     xfail('float_power', ''),  # the base given to float_power_ has dtype Float but the operation's result requires dtype Double
     xfail('floor', ''),  # aten.floor_.default - couldn't find symbolic meta function/decomposition
+    xfail('floor_divide', ''),  # aten.floor_divide_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('fmod', ''),  # aten.fmod_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('frac', ''),  # aten.frac_.default - couldn't find symbolic meta function/decomposition
     xfail('ge', ''),  # aten.ge_.Tensor - couldn't find symbolic meta function/decomposition
@@ -1394,6 +1398,7 @@ def f(a, b, c, d, e):
     xfail('log1p', ''),  # aten.log1p_.default - couldn't find symbolic meta function/decomposition
     xfail('log2', ''),  # aten.log2_.default - couldn't find symbolic meta function/decomposition
     xfail('log', ''),  # aten.log_.default - couldn't find symbolic meta function/decomposition
+    xfail('logit', ''),  # aten.logit_.default - couldn't find symbolic meta function/decomposition
     xfail('lt', ''),  # aten.lt_.Tensor - couldn't find symbolic meta function/decomposition
     xfail('mvlgamma', 'mvlgamma_p_1'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
     xfail('mvlgamma', 'mvlgamma_p_3'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
@@ -1403,6 +1408,7 @@ def f(a, b, c, d, e):
     xfail('neg', ''),  # aten.neg_.default - couldn't find symbolic meta function/decomposition
     xfail('nextafter', ''),  # aten.nextafter_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.celu', ''),  # aten.celu_.default - couldn't find symbolic meta function/decomposition
+    xfail('nn.functional.dropout3d', ''),  # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.elu', ''),  # aten.elu_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.hardsigmoid', ''),  # aten.hardsigmoid_.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.mish', ''),  # aten.mish_.default - couldn't find symbolic meta function/decomposition
@@ -1420,6 +1426,7 @@ def f(a, b, c, d, e):
     xfail('sinh', ''),  # aten.sinh_.default - couldn't find symbolic meta function/decomposition
     xfail('sqrt', ''),  # aten.sqrt_.default - couldn't find symbolic meta function/decomposition
     xfail('square', ''),  # aten.pow_.Scalar - couldn't find symbolic meta function/decomposition
+    xfail('squeeze', ''),  # aten.squeeze_.default - couldn't find symbolic meta function/decomposition
     xfail('t', ''),  # aten.t_.default - couldn't find symbolic meta function/decomposition
     xfail('tan', ''),  # aten.tan_.default - couldn't find symbolic meta function/decomposition
     xfail('tanh', ''),  # aten.tanh_.default - couldn't find symbolic meta function/decomposition
@@ -1509,7 +1516,7 @@ def test_make_fx_fake_exhaustive(self, device, dtype, op):
     @skipIfNoSympy
     @ops(op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive',
-             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures | outplace_symbolic_tensor_failures)
+             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures)
     def test_make_fx_symbolic_exhaustive(self, device, dtype, op):
         _test_make_fx_helper(self, device, dtype, op, "symbolic")
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 9849df0a58af5..4fa3ab09d2755 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1513,6 +1513,7 @@ def full(size, fill_value, *args, **kwargs):
         aten.randn_like.default,
         aten.rand_like.default,
         aten.full_like.default,
+        aten.zeros_like.default,
         aten.ones_like.default,
     ]
 )
@@ -1520,44 +1521,6 @@ def meta_like(self, *args, **kwargs):
     return aten.empty_like.default(self, **kwargs)
 
 
-# zeros_like is special cased to work for sparse
-@register_meta(aten.zeros_like.default)
-def zeros_like(
-    self, dtype=None, layout=None, device=None, pin_memory=None, memory_format=None
-):
-    if layout == torch.sparse_coo:
-        check(
-            memory_format is None,
-            lambda: "memory format option is only supported by strided tensors",
-        )
-
-        res = torch.empty(
-            0,
-            dtype=self.dtype if dtype is None else dtype,
-            layout=layout,
-            device=self.device if device is None else device,
-            pin_memory=pin_memory,
-        )
-
-        if self.is_sparse:
-            res.sparse_resize_and_clear_(
-                self.size(), self.sparse_dim(), self.dense_dim()
-            )
-        else:
-            res.sparse_resize_and_clear_(self.size(), self.dim(), 0)
-
-        res._coalesced_(True)
-        return res
-    return aten.empty_like.default(
-        self,
-        dtype=dtype,
-        layout=layout,
-        device=device,
-        pin_memory=pin_memory,
-        memory_format=memory_format,
-    )
-
-
 # hacky: Please remove after math.ceil works with arange
 @register_meta(aten.arange.default)
 def arange(end, **kwargs):
diff --git a/torch/_ops.py b/torch/_ops.py
index b20398a7f3ab3..9163932144d0d 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -365,7 +365,6 @@ def handler(*args, **kwargs):
             return handler
 
         final_key = resolve_key(self, key)
-        # print(self, key, final_key)
         r = self.py_kernels.get(final_key, final_key)
         self._dispatch_cache[key] = r
         return r
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index a4bac68f0ff14..da8d9af723acf 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1150,6 +1150,9 @@ def _minimum_aten(
 
 #
 # View operations
+#
+# TODO: model view relationships
+# TODO: model storage
 def _as_strided_meta(
     a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int
 ) -> TensorLikeType:
@@ -1167,7 +1170,7 @@ def _as_strided_meta(
             a._typed_storage(), size, stride, storage_offset
         )
 
-    return torch.as_strided(a, size, stride, storage_offset)
+    return TensorMeta(a, shape=size, strides=stride)
 
 
 def _as_strided_aten(
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 041448e8102ac..128796dfa3d07 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -291,9 +291,6 @@ def is_non_overlapping_and_dense(a: Tensor) -> bool:
     its dimensions that is contiguous.
     """
 
-    if a.is_sparse:
-        return False
-
     # Short-circuits if the tensor is already contiguous or channels-last contiguous
     if is_contiguous(a) or is_channels_last_contiguous(a):
         return True
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 9a0ac050e6b94..5d3d3a0e32fe1 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1,6 +1,7 @@
 import contextlib
 import functools
 import itertools
+import sys
 import weakref
 from dataclasses import dataclass
 from functools import partial
@@ -296,9 +297,8 @@ def constructors(fake_mode, func, *args, **kwargs):
     out_device = new_kwargs.pop("device", None)
     out_device = out_device if out_device is not None else default_device
     new_kwargs["device"] = torch.device("meta")
-    # _like constructors have fake tensor inputs (maybe this causes the non-like
-    # to fail? hmmm)
-    with in_kernel_invocation_manager(fake_mode):
+    # Not in_kernel_invocation_manager as no fake tensor inputs
+    with no_dispatch():
         r = func(*args, **new_kwargs)
     return FakeTensor(fake_mode, r, out_device)
 
@@ -821,30 +821,40 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # is written to must be invalidated
         self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs)
 
-        # If there's a Python meta, prefer that over the decomposition
-        from torch._decomp import meta_table as meta_table
+        from torch._decomp import decomposition_table
+
+        with self:
+            # Decomposes CompositeImplicitAutograd ops
+            r = func.decompose(*args, **kwargs)
+            if r is not NotImplemented:
+                return r
 
-        if func not in meta_table and not self.cpp_meta_supports_symint(func):
-            from torch._decomp import decomposition_table
+        # IDK: feels bad man, sym_numel on as_strided infinite loops otherwise
+        if has_symbolic_sizes and not self.cpp_meta_supports_symint(func):
+            from torch._decomp import meta_table as meta_table
 
-            # Prefer Python decompositions over C++ ones
-            if func in decomposition_table and (
-                has_symbolic_sizes
-                or (
-                    # TODO: Remove these exclusions, so that we can remove
-                    # this leg entirely
-                    torch_decomp_decompositions(func)
-                    and all(not e.is_sparse for e in flat_arg_fake_tensors)
+            if func == aten.size.default:
+                sys.stderr.write(
+                    "Trying to call aten.size on a tensor with symbolic shapes. "
+                    "It's likely that this is from calling tensor.shape in C++"
                 )
-            ):
-                with self:
-                    return decomposition_table[func](*args, **kwargs)
+                # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
+                return None
 
             with self:
-                # Decomposes CompositeImplicitAutograd ops
-                r = func.decompose(*args, **kwargs)
-                if r is not NotImplemented:
+                if func in meta_table:
+                    r = meta_table[func](*args, **kwargs)
                     return r
+                if func in decomposition_table:
+                    return decomposition_table[func](*args, **kwargs)
+
+        if (
+            func in decomposition_table
+            and torch_decomp_decompositions(func)
+            and all(not e.is_sparse for e in flat_arg_fake_tensors)
+        ):
+            with self:
+                return decomposition_table[func](*args, **kwargs)
 
         # prims already wrap FakeTensor inputs to FakeTensor outputs
         # and do device logic, we dont need do anything but run them
@@ -855,6 +865,12 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             with self:
                 return func.prim_meta_impl(*args, **kwargs)
 
+        if has_symbolic_sizes:
+            if not self.cpp_meta_supports_symint(func):
+                raise RuntimeError(
+                    f"{func} - couldn't find symbolic meta function/decomposition"
+                )
+
         # special handling for funcs registered through `register_op_impl`,
         # e.g., manipulating args on constructor calls to construct meta tensors
         # and then afterwards wrapping them to a FakeTensor

From 5aabf7dc81b3ae86153a00682991bd9f8e6afda5 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 17 Nov 2022 18:27:08 +0000
Subject: [PATCH 1030/1922] Revert "Support masked_fill (#88736)"

This reverts commit 2b131b1d43b10a2a005f3f042f920a62501e4e2d.

Reverted https://github.com/pytorch/pytorch/pull/88736 on behalf of https://github.com/kit1980 due to Inductor tests are failing with AttributeError: module 'torch._inductor.codecache' has no attribute 'valid_vec_isa_list'
---
 test/inductor/test_torchinductor.py  | 23 -------------
 torch/_inductor/codegen/cpp.py       | 51 ++++------------------------
 torch/_inductor/codegen/cpp_prefix.h | 12 -------
 3 files changed, 7 insertions(+), 79 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index efedeca381f36..fb7ca1fc92b73 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4602,29 +4602,6 @@ def test_complex_memory_overlap(self):
             not codecache.get_cpu_proc_info(), "Does not support vectorization"
         )
         @patch("torch.cuda.is_available", lambda: False)
-        def test_masked_fill_softmax(self):
-            def fn(value, mask):
-                mask = mask.to(torch.bool)
-                x = torch.masked_fill(value, mask, -33.0)
-                return torch.softmax(x, -1)
-
-            value = torch.randn((2, 17))
-            mask = torch.randint(0, 1, size=(2, 17), dtype=torch.uint8)
-            with patch.object(config.cpp, "simdlen", None):
-                torch._dynamo.reset()
-                metrics.reset()
-                opt_fn = torch._dynamo.optimize("inductor")(fn)
-                opt_fn(value, mask)
-
-                real_out = fn(value, mask)
-                compiled_out = opt_fn(value, mask)
-                assert same(real_out, compiled_out, equal_nan=True)
-                assert metrics.generated_cpp_vec_kernel_count >= 1
-
-        @unittest.skipIf(
-            not codecache.valid_vec_isa_list(), "Does not support vectorization"
-        )
-        @patch("torch.cuda.is_available", lambda: False)
         def test_sign_cpu_only(self):
             def fn(x):
                 return (torch.sign(x),)
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 9f00563a954e6..65a9335d6cbfc 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -311,10 +311,6 @@ def maximum(a, b):
     def square(a):
         return f"{a}.pow(2)"
 
-    @staticmethod
-    def where(a, b, c):
-        return f"decltype({b})::blendv({c}, {b}, {a})"
-
     @staticmethod
     def sign(x):
         code = BracesBuffer()
@@ -334,11 +330,6 @@ def sign(x):
         V.kernel.compute.splice(code)
         return result
 
-    @staticmethod
-    def to_dtype(x, dtype):
-        assert dtype in [torch.bool], f"{__name__} does not support {dtype}"
-        return f"({x})"
-
 
 class CppOverrides(OpOverrides):
     """Map element-wise ops to C++"""
@@ -749,16 +740,7 @@ def load(self, name: str, index: sympy.Expr):
         if expanded_index == new_index:
             line = f"at::vec::Vectorized<float>({var}[{cexpr(index)}])"
         else:
-            if V.graph.get_dtype(name) in [torch.bool, torch.uint8]:
-                g_tmp_buf = f"g_tmp_buffer_{var}"
-                nelements = codecache.pick_vec_isa().nelements()
-                self.loads.writeline(f"float {g_tmp_buf}[{nelements}] = {{0}};")
-                self.loads.writeline(
-                    f"flag_to_float({var} + {cexpr(new_index)}, {g_tmp_buf}, {nelements});"
-                )
-                line = f"at::vec::Vectorized<float>::loadu({g_tmp_buf})"
-            else:
-                line = f"at::vec::Vectorized<float>::loadu({var} + {cexpr(new_index)})"
+            line = f"at::vec::Vectorized<float>::loadu({var} + {cexpr(new_index)})"
 
         return self.cse.generate(self.loads, line)
 
@@ -855,6 +837,9 @@ def is_legal_data_access(self, var: sympy.Symbol, index: sympy.Expr):
         return self.is_var_irrevelant(var, index) or self.is_single_step_var(var, index)
 
     def could_vec(self, name: str, index: sympy.Expr):
+        if V.graph.get_dtype(name) is not torch.float:
+            return False
+
         assert self.itervars is not None
         # Not a loop
         if len(self.itervars) == 0:
@@ -864,24 +849,12 @@ def could_vec(self, name: str, index: sympy.Expr):
         return self.is_legal_data_access(most_inner_var, index)
 
     def load(self, name: str, index: sympy.Expr):
-        if not V.graph.get_dtype(name) in [
-            torch.float,
-            torch.float32,
-            torch.bool,
-            torch.uint8,
-        ]:
-            self.simd_vec = False
-            return self.simd_vec
-
         index = self.rename_indexing(index)
+
         self.simd_vec = self.simd_vec and self.could_vec(name, index)
         return self.simd_vec
 
     def store(self, name, index, value, mode=None):
-        if not V.graph.get_dtype(name) in [torch.float, torch.float32]:
-            self.simd_vec = False
-            return self.simd_vec
-
         assert "buf" in name
         index = self.rename_indexing(index)
 
@@ -954,24 +927,15 @@ def constant(val, dtype):
             @staticmethod
             def index_expr(expr, dtype):
                 self.simd_vec = False
-                tmp_var = self.cse.newvar()
-                return tmp_var
+                return self.cse.newvar()
 
             @staticmethod
             def indirect_indexing(index_var):
-                self.simd_vec = False
                 return sympy.Symbol(str(index_var))
 
             @staticmethod
             def masked(mask, body, other):
-                tmp_var = self.cse.newvar()
-                return tmp_var
-
-            @staticmethod
-            def to_dtype(x, dtype):
-                if dtype != torch.bool:
-                    self.simd_vec = False
-                return x
+                return V.kernel.cse.newvar()
 
         self.exit_stack.enter_context(V.set_ops_handler(VecCheckerProxy()))
         self.exit_stack.enter_context(V.set_kernel_handler(self))
@@ -1076,7 +1040,6 @@ def codegen_loops(self, code, worksharing):
         if reduction_par_depth > 0 and reduction_par_depth != len(
             loops_nest_reduce.loops
         ):
-            metrics.generated_cpp_vec_kernel_count -= 1
             return self.simd_omp_kernel.codegen_loops(code, worksharing)
 
         with contextlib.ExitStack() as stack:
diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h
index c1c9c3bae112d..1905aefcda5c0 100644
--- a/torch/_inductor/codegen/cpp_prefix.h
+++ b/torch/_inductor/codegen/cpp_prefix.h
@@ -57,15 +57,3 @@ template <typename T> void atomic_add(volatile T *addr, T offset) {
   } while (!atomic_addr->compare_exchange_weak(expected, desired,
                                                std::memory_order_relaxed));
 }
-
-// This function is used to convert bool or uint8 to float mask for
-// vectorization. The caller needs to make sure the src represents TRUE/FALSE
-// correctly.
-template <typename T>
-void flag_to_float(const T* src, float* dst, int64_t n) {
-#pragma unroll
-  for (int64_t i = 0; i < n; i++) {
-    uint32_t* dst_u32 = (uint32_t*)dst;
-    dst_u32[i] = *(src + i) ? 0xFFFFFFFF : 0;
-  }
-}

From d44b4f8fb17a481afc5c6f4b3ac2ea59ac54b066 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Thu, 17 Nov 2022 19:20:49 +0000
Subject: [PATCH 1031/1922] Fix bug in dynamo dashboard summary stats diff
 (#89226)

Fixes issue where a suite may not be present in one of the logs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89226
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 8012e82607cff..843dbd12909aa 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -864,6 +864,8 @@ def generate_diff(self, last2, filename, caption):
         for _, row in df_merge.iterrows():
             if row["Compiler"] in self.args.flag_compilers:
                 for suite in self.args.suites:
+                    if suite + "_prev" not in row or suite + "_cur" not in row:
+                        continue
                     data["compiler"].append(row["Compiler"])
                     data["suite"].append(suite)
                     data["prev_value"].append(row[suite + "_prev"])

From 0c1e1bd3630ceff3a638ac6ea2a34631c973a6f1 Mon Sep 17 00:00:00 2001
From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" <tmanlaibaatar@fb.com>
Date: Mon, 14 Nov 2022 23:26:15 -0800
Subject: [PATCH 1032/1922] Rewrite assert statement with torch._assert under
 config (#88246)

This diff rewrites assert statement in python with torch._assert under config. The resulting graph looks something like:
```
SOURCE CODE:
def f(x):
      assert x[0] == 3
      return x.cos()

CAPTURED GRAPH:
graph():
    %arg0 : [#users=2] = placeholder[target=arg0]
    %getitem : [#users=1] = call_function[target=operator.getitem](args = (%arg0, 0), kwargs = {})
    %eq : [#users=1] = call_function[target=operator.eq](args = (%getitem, 3), kwargs = {})
    %_assert : [#users=0] = call_function[target=torch._assert](args = (%eq, "assertion_error"), kwargs = {})
    %cos : [#users=1] = call_method[target=cos](args = (%arg0,), kwargs = {})
    return cos
 ```
Note that this introduces side-effect as it could error out while executing graph, but the assertion can eliminated via DCE if we choose to ignore it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88246
Approved by: https://github.com/jansel
---
 test/dynamo/test_repros.py        | 92 ++++++++++++++++++++++++++++++
 torch/_dynamo/config.py           |  3 +
 torch/_dynamo/symbolic_convert.py | 94 +++++++++++++++++++++++++++++++
 3 files changed, 189 insertions(+)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 503231b4cb120..e30a1275ed135 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1938,6 +1938,98 @@ def fn(x):
         self.assertEqual(cnt.frame_count, 1)
         self.assertEqual(cnt.op_count, 1)
 
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_rewrite_assert_with_msg(self):
+        def f(x):
+            b = x.sin()
+            assert x[0] == 3, "First dim need to be 3"
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        cnt = torch._dynamo.testing.CompileCounter()
+
+        opt_f = torch._dynamo.optimize(cnt, nopython=True)(f)
+        self.assertTrue(same(f(*args), opt_f(*args)))
+        self.assertEqual(cnt.op_count, 6)
+        self.assertEqual(cnt.frame_count, 1)
+
+        exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+        self.assertTrue(same(exported(*args), f(*args)))
+
+        with self.assertRaisesRegex(AssertionError, ""):
+            exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
+
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_not_rewrite_assert_for_other_errors(self):
+        def f(x):
+            b = x.sin()
+            if not x.sum() <= 3:
+                raise ValueError("input sum needs to be 3")
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        opt_fn = torch._dynamo.optimize("eager")(f)
+        with self.assertRaisesRegex(ValueError, "input sum needs to be 3"):
+            opt_fn(*args)
+
+    # TODO (tmanlaibaatar) handle data-dependent fstring in assert statement.
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_rewrite_assert_with_fstring_msg(self):
+        def f(x):
+            b = x.sin()
+            assert x[0] == 3, f"First dim need to be {x[0]}"
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
+            exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_rewrite_assert_without_msg(self):
+        def f(x):
+            b = x.sin()
+            assert x[0] == 3
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+        self.assertTrue(same(exported(*args), f(*args)))
+
+        with self.assertRaisesRegex(AssertionError, ""):
+            exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
+
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", True)
+    def test_rewrite_assert_noop(self):
+        def f(x):
+            b = x.sin()
+            assert True
+            assert x.dtype == torch.float32
+            return x.cos() + b
+
+        args = (torch.Tensor([3, 4, 5]),)
+        exported, _ = torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+        self.assertTrue(same(exported(*args), f(*args)))
+
+        cnt = torch._dynamo.testing.CompileCounter()
+        opt_f = torch._dynamo.optimize(cnt, nopython=True)(f)
+        self.assertTrue(same(f(*args), opt_f(*args)))
+        # torch._assert shouldn't be in the graph
+        self.assertEqual(cnt.op_count, 3)
+        self.assertEqual(cnt.frame_count, 1)
+
+        exported, _ = torch._dynamo.export(f, torch.Tensor([4, 4, 5]))
+        self.assertTrue(same(exported(*args), f(*args)))
+
+    @patch.object(torch._dynamo.config, "rewrite_assert_with_torch_assert", False)
+    def test_not_rewrite_assert(self):
+        def f(x):
+            b = x.sin()
+            assert x[0] == 3
+            return x.cos() + b
+
+        with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
+            torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 12088383e741c..39a1a6433419f 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -87,6 +87,9 @@
 # if an exception is encountered
 replay_record_enabled = False
 
+# Rewrite assert statement in python with torch._assert
+rewrite_assert_with_torch_assert = True
+
 # Show a warning on every graph break
 print_graph_breaks = False
 
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index e64804cb68b2c..d2bc5332719c5 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -53,6 +53,7 @@
     fake_tensors_available,
     graph_break_dup_warning_checker,
     istype,
+    proxy_args_kwargs,
 )
 from .variables.base import MutableLocal, typestr, VariableTracker
 from .variables.builder import VariableBuilder, wrap_fx_proxy
@@ -121,10 +122,103 @@ def impl(self: "InstructionTranslatorBase", inst: Instruction):
     return impl
 
 
+def _detect_and_normalize_assert_statement(
+    self: "InstructionTranslatorBase", truth_fn: typing.Callable, push: bool
+):
+    # Detect if this jump instruction is assert and normalize the assert
+    # by pushing dummy error message when nothing is given.
+    #
+    # Python 3.9 assertion is in following format:
+    # 18 POP_JUMP_IF_TRUE       28
+    # 20 LOAD_ASSERTION_ERROR
+    # 22 LOAD_CONST               3 ('Assert message') -> optional instruction
+    # 24 CALL_FUNCTION            1                    -> optional instruction
+    # 26 RAISE_VARARGS
+    #
+    # Python 3.8 assertion is in following format:
+    # 18 POP_JUMP_IF_TRUE       28
+    # 20 LOAD_GLOBAL              0 (Assertion type)
+    # 22 LOAD_CONST               3 ('Assert message') -> optional instruction
+    # 24 CALL_FUNCTION            1                    -> optional instruction
+    # 26 RAISE_VARARGS            1
+
+    if (truth_fn is not operator.truth) or push:
+        return False
+
+    current_instruction_pointer = self.instruction_pointer
+    inst = self.instructions[current_instruction_pointer]
+    # Detect LOAD_ASSERTION_ERROR or LOAD_GLOBAL 0
+    if sys.version_info < (3, 9):
+        if inst.opname != "LOAD_GLOBAL" or inst.argval != "AssertionError":
+            return False
+    else:
+        if inst.opname != "LOAD_ASSERTION_ERROR":
+            return False
+
+    current_instruction_pointer += 1
+
+    if current_instruction_pointer >= len(self.instructions):
+        return False
+
+    inst = self.instructions[current_instruction_pointer]
+    has_error_msg = False
+    # DETECT RAISE_VARARGS or LOAD CONST
+    if inst.opname == "LOAD_CONST":
+        if not isinstance(inst.argval, str):
+            return False
+        self.LOAD_CONST(inst)
+        has_error_msg = True
+
+        # if it is LOAD_CONSTANT, it must be followed by CALL_FUNCTION
+        current_instruction_pointer += 1
+        if current_instruction_pointer >= len(self.instructions):
+            return False
+        inst = self.instructions[current_instruction_pointer]
+        if inst.opname != "CALL_FUNCTION":
+            return False
+
+        # CALL_FUNCTION should be followed by RAISE_VARARGS
+        current_instruction_pointer += 1
+        if current_instruction_pointer >= len(self.instructions):
+            return False
+        inst = self.instructions[current_instruction_pointer]
+
+    if inst.opname != "RAISE_VARARGS":
+        return False
+
+    if not has_error_msg:
+        # Push dummy value instead of error message
+        self.push(ConstantVariable("assertion error"))
+
+    return True
+
+
 def generic_jump(truth_fn: typing.Callable, push: bool):
     def inner(self: "InstructionTranslatorBase", inst: Instruction):
         value: VariableTracker = self.pop()
         self.output.guards.update(value.guards)
+        if (
+            config.rewrite_assert_with_torch_assert
+            and _detect_and_normalize_assert_statement(self, truth_fn, push)
+        ):
+            error_msg: VariableTracker = self.pop()
+            self.output.guards.update(error_msg.guards)
+            # Skip over things like `assert True`
+            if value.is_python_constant() and bool(value.as_python_constant()):
+                self.jump(inst)
+                return
+
+            # Manually insert torch._assert instead of python assert and jump over
+            # assert related instructions as we don't need them anymore.
+            self.output.create_proxy(
+                "call_function",
+                torch._assert,
+                *proxy_args_kwargs((value, error_msg), {}),
+                current_tx=self,
+            )
+            self.jump(inst)
+            return
+
         if value.is_python_constant():
             if truth_fn(value.as_python_constant()):
                 push and self.push(value)

From 7909ecb53cd1ed55d8fe6fb0358637cdf3ced5df Mon Sep 17 00:00:00 2001
From: Xiao Wang <24860335+xwang233@users.noreply.github.com>
Date: Thu, 17 Nov 2022 20:10:52 +0000
Subject: [PATCH 1033/1922] Add an env var to skip cudnn version compatibility
 check (#89184)

skip the check by setting `PYTORCH_SKIP_CUDNN_COMPATIBILITY_CHECK=1`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89184
Approved by: https://github.com/ngimel
---
 torch/backends/cudnn/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/backends/cudnn/__init__.py b/torch/backends/cudnn/__init__.py
index e187d6d26aed8..2b63a63796650 100644
--- a/torch/backends/cudnn/__init__.py
+++ b/torch/backends/cudnn/__init__.py
@@ -37,6 +37,8 @@ def _init():
             else:
                 cudnn_compatible = runtime_minor >= compile_minor
             if not cudnn_compatible:
+                if os.environ.get('PYTORCH_SKIP_CUDNN_COMPATIBILITY_CHECK', '0') == '1':
+                    return True
                 base_error_msg = (f'cuDNN version incompatibility: '
                                   f'PyTorch was compiled  against {compile_version} '
                                   f'but found runtime version {runtime_version}. '

From 617941fa508cc0c023a1be066b0b5d4097973ade Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Thu, 17 Nov 2022 06:09:55 -0500
Subject: [PATCH 1034/1922] reland "support running test_mobile_profiler with
 buck1/buck2 and OSS (#89001)" (#89091)

We modify this to no longer use std::experimental::filesystem::path
and use our own custom type instead.

This reverts commit c53a5ac6cca7e2e7d7c47b1a816c7eaa2e7a7704.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89091
Approved by: https://github.com/r-barnes, https://github.com/malfet
---
 test/cpp/lite_interpreter_runtime/resources.h | 41 +++++++++++++++++++
 .../test_mobile_profiler.cpp                  | 34 +++++++--------
 2 files changed, 55 insertions(+), 20 deletions(-)
 create mode 100644 test/cpp/lite_interpreter_runtime/resources.h

diff --git a/test/cpp/lite_interpreter_runtime/resources.h b/test/cpp/lite_interpreter_runtime/resources.h
new file mode 100644
index 0000000000000..0be5928b299ba
--- /dev/null
+++ b/test/cpp/lite_interpreter_runtime/resources.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <string>
+
+namespace torch {
+namespace testing {
+
+namespace detail {
+class Path;
+}
+
+/// Gets the path to the resource identified by name.
+///
+/// @param name identifies a resource, relative path starting from the
+///             repo root
+inline auto getResourcePath(std::string name) -> detail::Path;
+
+// End interface: implementation details follow.
+
+namespace detail {
+
+class Path {
+ public:
+  explicit Path(std::string rep) : rep_(std::move(rep)) {}
+
+  auto string() const -> std::string const& {
+    return rep_;
+  }
+
+ private:
+  std::string rep_;
+};
+
+} // namespace detail
+
+inline auto getResourcePath(std::string name) -> detail::Path {
+  return detail::Path(std::move(name));
+}
+
+} // namespace testing
+} // namespace torch
diff --git a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
index 08cb81ae78763..df9cb9cea28c6 100644
--- a/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
+++ b/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
@@ -11,6 +11,8 @@
 
 #include <torch/csrc/profiler/events.h>
 
+#include "test/cpp/lite_interpreter_runtime/resources.h"
+
 #ifdef EDGE_PROFILER_USE_KINETO
 namespace torch {
 namespace jit {
@@ -42,16 +44,15 @@ bool checkMetaData(
 } // namespace
 
 TEST(MobileProfiler, ModuleHierarchy) {
-  std::string filePath(__FILE__);
-  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
-  testModelFile.append("to_be_profiled_module.ptl");
+  auto testModelFile = torch::testing::getResourcePath(
+      "test/cpp/lite_interpreter_runtime/to_be_profiled_module.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
   inputs.emplace_back(at::rand({64, 64}));
   std::string trace_file_name("/tmp/test_trace.trace");
 
-  mobile::Module bc = _load_for_mobile(testModelFile);
+  mobile::Module bc = _load_for_mobile(testModelFile.string());
   {
     KinetoEdgeCPUProfiler profiler(
         bc,
@@ -95,16 +96,15 @@ TEST(MobileProfiler, ModuleHierarchy) {
 }
 
 TEST(MobileProfiler, Backend) {
-  std::string filePath(__FILE__);
-  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
-  testModelFile.append("test_backend_for_profiling.ptl");
+  auto testModelFile = torch::testing::getResourcePath(
+      "test/cpp/lite_interpreter_runtime/test_backend_for_profiling.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
   inputs.emplace_back(at::rand({64, 64}));
   std::string trace_file_name("/tmp/test_trace_backend.trace");
 
-  mobile::Module bc = _load_for_mobile(testModelFile);
+  mobile::Module bc = _load_for_mobile(testModelFile.string());
   {
     KinetoEdgeCPUProfiler profiler(
         bc,
@@ -130,16 +130,15 @@ TEST(MobileProfiler, Backend) {
 }
 
 TEST(MobileProfiler, BackendMemoryEvents) {
-  std::string filePath(__FILE__);
-  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
-  testModelFile.append("test_backend_for_profiling.ptl");
+  auto testModelFile = torch::testing::getResourcePath(
+      "test/cpp/lite_interpreter_runtime/test_backend_for_profiling.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
   inputs.emplace_back(at::rand({64, 64}));
   std::string trace_file_name("/tmp/test_trace_backend_memory.trace");
 
-  mobile::Module bc = _load_for_mobile(testModelFile);
+  mobile::Module bc = _load_for_mobile(testModelFile.string());
   {
     mobile::KinetoEdgeCPUProfiler profiler(
         bc,
@@ -163,13 +162,8 @@ TEST(MobileProfiler, BackendMemoryEvents) {
 }
 
 TEST(MobileProfiler, ProfilerEvent) {
-  /*
-   * TODO: Using __FILE__ is unreliable e.g. it fails to resolve correctly when
-   * using buck2, works ok with buck1
-   */
-  std::string filePath(__FILE__);
-  auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1);
-  testModelFile.append("test_backend_for_profiling.ptl");
+  auto testModelFile = torch::testing::getResourcePath(
+      "test/cpp/lite_interpreter_runtime/test_backend_for_profiling.ptl");
 
   std::vector<IValue> inputs;
   inputs.emplace_back(at::rand({64, 64}));
@@ -180,7 +174,7 @@ TEST(MobileProfiler, ProfilerEvent) {
       torch::profiler::ProfilerPerfEvents.begin(),
       torch::profiler::ProfilerPerfEvents.end());
 
-  mobile::Module bc = _load_for_mobile(testModelFile);
+  mobile::Module bc = _load_for_mobile(testModelFile.string());
   {
     // Bail if something goes wrong here
     try {

From 90da18919540609329cfc64eb4c655b7b2f89d1a Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Thu, 17 Nov 2022 21:33:59 +0000
Subject: [PATCH 1035/1922] Fix distributed test paths when running periodic
 multigpu job (#89225)

Some distributed tests are moved to a new location after https://github.com/pytorch/pytorch/pull/88698. This is currently failing periodic multigpu job:

* https://github.com/pytorch/pytorch/actions/runs/3484486207/jobs/5829301159
* https://github.com/pytorch/pytorch/actions/runs/3484486207/jobs/5829301093

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89225
Approved by: https://github.com/clee2000
---
 .jenkins/pytorch/multigpu-test.sh | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh
index bbd1c370a638e..9d7efc969823c 100755
--- a/.jenkins/pytorch/multigpu-test.sh
+++ b/.jenkins/pytorch/multigpu-test.sh
@@ -8,11 +8,6 @@
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 
 echo "Testing pytorch"
-if [ -n "${CI}" ]; then
-  # TODO move this to docker
-  # Pin unittest-xml-reporting to freeze printing test summary logic, related: https://github.com/pytorch/pytorch/issues/69014
-  pip_install "unittest-xml-reporting<=3.2.0,>=2.0.0"
-fi
 
 # Disabling tests to see if they solve timeout issues; see https://github.com/pytorch/pytorch/issues/70015
 # python tools/download_mnist.py --quiet -d test/cpp/api/mnist
@@ -28,8 +23,8 @@ time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_a
 # FSDP tests
 for f in test/distributed/fsdp/*.py ; do time python test/run_test.py --verbose -i "${f#*/}" ; done
 # ShardedTensor tests
-time python test/run_test.py --verbose -i distributed/_shard/checkpoint/test_checkpoint
-time python test/run_test.py --verbose -i distributed/_shard/checkpoint/test_file_system_checkpoint
+time python test/run_test.py --verbose -i distributed/checkpoint/test_checkpoint
+time python test/run_test.py --verbose -i distributed/checkpoint/test_file_system_checkpoint
 time python test/run_test.py --verbose -i distributed/_shard/sharding_spec/test_sharding_spec
 time python test/run_test.py --verbose -i distributed/_shard/sharding_plan/test_sharding_plan
 time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test_megatron_prototype

From e9b5d8f74ca2969cab8bb9f6cd65053dde90689b Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Thu, 17 Nov 2022 22:05:27 +0000
Subject: [PATCH 1036/1922] [JIT][Security] Do not blindly eval input string
 (#89189)

Introduce `_eval_no_call` method, that evaluates statement only if it
does not contain any calls(done by examining the bytecode), thus preventing command injection exploit

Added simple unit test to check for that
`torch.jit.annotations.get_signature` would not result in calling random
code.

Although, this code path exists for Python-2 compatibility, and perhaps
should be simply removed.

Fixes https://github.com/pytorch/pytorch/issues/88868

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89189
Approved by: https://github.com/suo
---
 test/test_jit.py                               |  8 ++++++++
 torch/csrc/jit/frontend/script_type_parser.cpp |  2 +-
 torch/jit/annotations.py                       | 14 ++++++++++++--
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/test/test_jit.py b/test/test_jit.py
index 13c27b0efa555..6cbc091d506b5 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -3951,6 +3951,14 @@ def invalid4(a):
                 return a + 2
             torch.jit.script(invalid4)
 
+    def test_calls_in_type_annotations(self):
+        with self.assertRaisesRegex(RuntimeError, "Type annotation should not contain calls"):
+            def spooky(a):
+                # type: print("Hello") -> Tensor # noqa: F723
+                return a + 2
+            print(torch.__file__)
+            torch.jit.annotations.get_signature(spooky, None, 1, True)
+
     def test_is_optional(self):
         ann = Union[List[int], List[float]]
         torch._jit_internal.is_optional(ann)
diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp
index f5d6f640d413d..d05ec95fb9fa2 100644
--- a/torch/csrc/jit/frontend/script_type_parser.cpp
+++ b/torch/csrc/jit/frontend/script_type_parser.cpp
@@ -316,7 +316,7 @@ std::vector<IValue> ScriptTypeParser::evaluateDefaults(
   // We then run constant prop on this graph and check the results are
   // constant. This approach avoids having to have separate handling of
   // default arguments from standard expressions by piecing together existing
-  // machinery for graph generation, constant propgation, and constant
+  // machinery for graph generation, constant propagation, and constant
   // extraction.
   auto tuple_type = Subscript::create(
       r,
diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py
index a4a36ce36a5e8..a6ff2d04d2076 100644
--- a/torch/jit/annotations.py
+++ b/torch/jit/annotations.py
@@ -1,4 +1,5 @@
 import ast
+import dis
 import enum
 import inspect
 import re
@@ -144,6 +145,15 @@ def check_fn(fn, loc):
         raise torch.jit.frontend.FrontendError(loc, "Expected a single top-level function")
 
 
+def _eval_no_call(stmt, glob, loc):
+    """Evaluate statement as long as it does not contain any method/function calls"""
+    bytecode = compile(stmt, "", mode="eval")
+    for insn in dis.get_instructions(bytecode):
+        if "CALL" in insn.opname:
+            raise RuntimeError(f"Type annotation should not contain calls, but '{stmt}' does")
+    return eval(bytecode, glob, loc)  # type: ignore[arg-type] # noqa: P204
+
+
 def parse_type_line(type_line, rcb, loc):
     """Parses a type annotation specified as a comment.
 
@@ -154,7 +164,7 @@ def parse_type_line(type_line, rcb, loc):
     arg_ann_str, ret_ann_str = split_type_line(type_line)
 
     try:
-        arg_ann = eval(arg_ann_str, {}, EvalEnv(rcb))  # type: ignore[arg-type] # noqa: P204
+        arg_ann = _eval_no_call(arg_ann_str, {}, EvalEnv(rcb))
     except (NameError, SyntaxError) as e:
         raise RuntimeError("Failed to parse the argument list of a type annotation") from e
 
@@ -162,7 +172,7 @@ def parse_type_line(type_line, rcb, loc):
         arg_ann = (arg_ann,)
 
     try:
-        ret_ann = eval(ret_ann_str, {}, EvalEnv(rcb))  # type: ignore[arg-type] # noqa: P204
+        ret_ann = _eval_no_call(ret_ann_str, {}, EvalEnv(rcb))
     except (NameError, SyntaxError) as e:
         raise RuntimeError("Failed to parse the return type of a type annotation") from e
 

From fd2bfb49221383436eb8267104e0e3f6fd394456 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@fb.com>
Date: Thu, 17 Nov 2022 19:05:44 +0000
Subject: [PATCH 1037/1922] Add tests for replicate multiple modules (#89099)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89099
Approved by: https://github.com/zhaojuanmao
---
 .../distributed/_composable/test_replicate.py | 35 +++++++++++--------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
index 831ccc3376aff..3e8bf44a1fdea 100644
--- a/test/distributed/_composable/test_replicate.py
+++ b/test/distributed/_composable/test_replicate.py
@@ -39,13 +39,7 @@ def tearDown(self):
         except OSError:
             pass
 
-    def _prepare_module(self, global_batch_size):
-        model = Net()
-        input = torch.randn(global_batch_size, 2)
-        target = torch.randn(global_batch_size, 4)
-        return model, input, target
-
-    def test_replicate(self):
+    def _compare_module(self, mod, replicate_mod):
         dist.init_process_group(
             backend="gloo",
             rank=self.rank,
@@ -55,8 +49,8 @@ def test_replicate(self):
 
         local_batch_size = 1
         global_batch_size = self.world_size * local_batch_size
-        model, input, target = self._prepare_module(global_batch_size)
-        replicate_model = mark_root_module(replicate(deepcopy(model)))
+        input = torch.randn(global_batch_size, 2)
+        target = torch.randn(global_batch_size, 4)
 
         def step_model(model, input, target):
             model.train()
@@ -69,9 +63,9 @@ def step_model(model, input, target):
                 param.grad = None
 
         for iteration in range(2):
-            step_model(model, input, target)
+            step_model(mod, input, target)
             step_model(
-                replicate_model,
+                replicate_mod,
                 input[
                     self.rank
                     * local_batch_size : (self.rank + 1)
@@ -85,16 +79,29 @@ def step_model(model, input, target):
             )
 
             self.assertEqual(
-                len(list(model.parameters())),
-                len(list(replicate_model.parameters())),
+                len(list(mod.parameters())),
+                len(list(replicate_mod.parameters())),
             )
-            for i, j in zip(model.parameters(), replicate_model.parameters()):
+            for i, j in zip(mod.parameters(), replicate_mod.parameters()):
                 self.assertEqual(i, j, rtol=1.3e-06, atol=5e-5)
 
             # Shuffle the input so that DDP input is different
             torch.manual_seed(iteration)
             input = input[torch.randperm(global_batch_size)]
 
+    def test_replicate_single_module(self):
+        model = Net()
+        replicate_model = mark_root_module(replicate(deepcopy(model)))
+        self._compare_module(model, replicate_model)
+
+    def test_replicate_multi_module(self):
+        model = Net()
+        replicate_model = mark_root_module(deepcopy(model))
+        replicate(replicate_model.fc1)
+        replicate(replicate_model.fc2)
+        replicate(replicate_model.fc3)
+        self._compare_module(model, replicate_model)
+
 
 if __name__ == "__main__":
     run_tests()

From 95b0bda80604c5a74da6b7e9ad0c9e152c147a63 Mon Sep 17 00:00:00 2001
From: keineahnung2345 <mimifasosofamire1123@gmail.com>
Date: Thu, 17 Nov 2022 22:28:20 +0000
Subject: [PATCH 1038/1922] Fix typo in aten/src/README.md (#89175)

remove redundant "have to"
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89175
Approved by: https://github.com/kit1980
---
 aten/src/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/README.md b/aten/src/README.md
index add2816926331..3127ed5c8c399 100644
--- a/aten/src/README.md
+++ b/aten/src/README.md
@@ -69,8 +69,8 @@ will `retain` it itself.
 ```
 
 Sometimes, you have a tensor in hand which you'd like to use directly, but
-under some conditions you have to have to call, e.g., `newContiguous`, to get
-it into the correct form:
+under some conditions you have to call, e.g., `newContiguous`, to get it into
+the correct form:
 
 ```
   if (!(k_->stride(3) == 1) || !(k_->stride[2] == k_->size(3))) {

From 34d1d48520ff1f33e5f0b16e9a79fa7b7b474a03 Mon Sep 17 00:00:00 2001
From: erjia <erjia@fb.com>
Date: Thu, 17 Nov 2022 23:06:41 +0000
Subject: [PATCH 1039/1922] [DataPipe] Add container template for _Fork and
 _Demux (#89216)

- This would remove the hard-coded check within `_ChildDataPipe`.
- Add `get_length_by_instance` to parent class to make sure there is a chance that child DataPipe can have different lengths
- Prevent Error when `__del__` executed when the object has already been removed
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89216
Approved by: https://github.com/NivekT
---
 torch/utils/data/datapipes/iter/combining.py | 51 ++++++++++++++++----
 1 file changed, 41 insertions(+), 10 deletions(-)

diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 0c4f34ad51f1b..c874cedbde29c 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -1,5 +1,6 @@
 import warnings
 
+from abc import ABC, abstractmethod
 from collections import deque
 from typing import Any, Callable, Iterator, List, Optional, Sized, Tuple, TypeVar, Deque
 
@@ -96,7 +97,31 @@ def __new__(cls, datapipe: IterDataPipe, num_instances: int, buffer_size: int =
         return [_ChildDataPipe(container, i) for i in range(num_instances)]
 
 
-class _ForkerIterDataPipe(IterDataPipe):
+class _ContainerTemplate(ABC):
+    r"""
+    Abstract class for container ``DataPipes``. The followings are three required
+    methods.
+    """
+    @abstractmethod
+    def get_next_element_by_instance(self, instance_id: int):
+        ...
+
+    @abstractmethod
+    def is_every_instance_exhausted(self) -> bool:
+        ...
+
+    @abstractmethod
+    def reset(self) -> None:
+        ...
+
+    @abstractmethod
+    def get_length_by_instance(self, instance_id: int):
+        r"""
+        Raise TypeError if it's not supposed to be implemented to support `list(datapipe)`
+        """
+
+
+class _ForkerIterDataPipe(IterDataPipe, _ContainerTemplate):
     r"""
     Container to hold instance-specific information on behalf of ForkerIterDataPipe. It tracks
     the state of its child DataPipes, maintains the buffer, and yields the next value
@@ -159,6 +184,9 @@ def is_every_instance_exhausted(self) -> bool:
         return self.end_ptr is not None and\
             all(self.end_ptr == ptr or self.end_ptr - 1 == ptr for ptr in self.child_pointers)
 
+    def get_length_by_instance(self, instance_id: int) -> int:
+        return len(self.main_datapipe)
+
     def reset(self) -> None:
         self._datapipe_iterator = iter(self.main_datapipe)
         self.buffer = deque()
@@ -195,7 +223,8 @@ def __setstate__(self, state):
         self.end_ptr = None
 
     def __del__(self):
-        self.buffer.clear()
+        if self.buffer:
+            self.buffer.clear()
 
 
 class _ChildDataPipe(IterDataPipe):
@@ -229,10 +258,8 @@ class _ChildDataPipe(IterDataPipe):
     _is_child_datapipe: bool = True
 
     def __init__(self, main_datapipe: IterDataPipe, instance_id: int):
-        required_attrs = ["get_next_element_by_instance", "is_every_instance_exhausted", "reset"]
-        required_ops = [getattr(main_datapipe, attr) for attr in required_attrs]
-        if any(not callable(op) for op in required_ops):
-            raise NotImplementedError(f"Main Datapipe must have methods {required_attrs} implemented.")
+        assert isinstance(main_datapipe, _ContainerTemplate)
+
         self.main_datapipe: IterDataPipe = main_datapipe
         self.instance_id = instance_id
 
@@ -242,7 +269,7 @@ def __iter__(self):
         return self.main_datapipe.get_next_element_by_instance(self.instance_id)
 
     def __len__(self):
-        return len(self.main_datapipe)
+        return self.main_datapipe.get_length_by_instance(self.instance_id)
 
     # This method is called by `hook_iterator` in `_typing.py`.
     def _set_main_datapipe_valid_iterator_id(self) -> int:
@@ -324,7 +351,7 @@ def __new__(cls, datapipe: IterDataPipe, num_instances: int,
         return [_ChildDataPipe(container, i) for i in range(num_instances)]
 
 
-class _DemultiplexerIterDataPipe(IterDataPipe):
+class _DemultiplexerIterDataPipe(IterDataPipe, _ContainerTemplate):
     r"""
     Container to hold instance-specific information on behalf of DemultiplexerIterDataPipe. It tracks
     the state of its child DataPipes, maintains the buffer, classifies and yields the next correct value
@@ -393,6 +420,9 @@ def get_next_element_by_instance(self, instance_id: int):
     def is_every_instance_exhausted(self) -> bool:
         return self.main_datapipe_exhausted and all(not child_buffer for child_buffer in self.child_buffers)
 
+    def get_length_by_instance(self, instance_id: int) -> int:
+        raise TypeError
+
     def reset(self) -> None:
         self._datapipe_iterator = None
         self.current_buffer_usage = 0
@@ -429,8 +459,9 @@ def __setstate__(self, state):
         self.main_datapipe_exhausted = False
 
     def __del__(self):
-        for dq in self.child_buffers:
-            dq.clear()
+        if self.child_buffers:
+            for dq in self.child_buffers:
+                dq.clear()
 
 
 @functional_datapipe('mux')

From 2c24ed5bf90500968eb396edc04537279b0cb623 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Thu, 17 Nov 2022 19:43:37 +0000
Subject: [PATCH 1040/1922] Enable inductor CI for TorchBench (#87465)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87465
Approved by: https://github.com/malfet
---
 .github/ci_commit_pins/text.txt        |  1 +
 .github/scripts/filter_test_configs.py |  2 ++
 .github/workflows/inductor.yml         |  5 ++--
 .jenkins/pytorch/common_utils.sh       | 19 ++++--------
 .jenkins/pytorch/test.sh               | 40 ++++++++++++++++----------
 benchmarks/dynamo/common.py            | 17 ++---------
 6 files changed, 40 insertions(+), 44 deletions(-)
 create mode 100644 .github/ci_commit_pins/text.txt

diff --git a/.github/ci_commit_pins/text.txt b/.github/ci_commit_pins/text.txt
new file mode 100644
index 0000000000000..c0e01da17fd08
--- /dev/null
+++ b/.github/ci_commit_pins/text.txt
@@ -0,0 +1 @@
+5b78d074bd303eb230d30567646fcf0358ee2dd4
diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
index bb5314434e077..f5c438c29e902 100755
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@@ -24,7 +24,9 @@
     "functorch",
     "inductor",
     "inductor_distributed",
+    "inductor_huggingface",
     "inductor_timm",
+    "inductor_torchbench",
     "jit_legacy",
     "multigpu",
     "nogpu_AVX512",
diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index e8390681e4abd..eb953ff42321f 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -23,10 +23,11 @@ jobs:
       cuda-arch-list: 8.6
       test-matrix: |
         { include: [
-          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_torchbench", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
         ]}
 
diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh
index 8af2c93a1e504..7fc1dd6c0f1a9 100644
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.jenkins/pytorch/common_utils.sh
@@ -101,20 +101,16 @@ function get_pinned_commit() {
   cat .github/ci_commit_pins/"${1}".txt
 }
 
-function install_torchvision() {
+function install_torchtext() {
   local commit
-  commit=$(get_pinned_commit vision)
-  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${commit}"
+  commit=$(get_pinned_commit text)
+  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/text.git@${commit}"
 }
 
-function checkout_install_torchvision() {
+function install_torchvision() {
   local commit
   commit=$(get_pinned_commit vision)
-  git clone https://github.com/pytorch/vision
-  pushd vision
-  git checkout "${commit}"
-  time python setup.py install
-  popd
+  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${commit}"
 }
 
 function clone_pytorch_xla() {
@@ -194,13 +190,10 @@ function install_timm() {
 }
 
 function checkout_install_torchbench() {
-  local commit
-  commit=$(get_pinned_commit torchbench)
   git clone https://github.com/pytorch/benchmark torchbench
   pushd torchbench
-  git checkout "${commit}"
+  git checkout no_torchaudio
   python install.py
-  pip_install gym==0.25.2  # workaround issue in 0.26.0
   popd
 }
 
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 135fb50762d6f..17437a56ae0e8 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -256,20 +256,15 @@ test_inductor() {
   # pytest test/test_ops_gradients.py --verbose -k "not _complex and not test_inplace_grad_acos_cuda_float64"
 }
 
-test_inductor_huggingface_shard() {
-  if [[ -z "$NUM_TEST_SHARDS" ]]; then
-    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
-    exit 1
-  fi
+test_inductor_huggingface() {
   # Use test-reports directory under test folder will allow the CI to automatically pick up
   # the test reports and upload them to S3. Need to use full path here otherwise the script
   # will bark about file not found later on
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
   python benchmarks/dynamo/huggingface.py --ci --training --accuracy \
-    --device cuda --inductor --float32 --total-partitions 1 --partition-id "$1" \
-    --output "$TEST_REPORTS_DIR"/inductor_huggingface_"$1".csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_huggingface_"$1".csv
+    --device cuda --inductor --float32 --output "$TEST_REPORTS_DIR"/inductor_huggingface.csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_huggingface.csv
 }
 
 test_inductor_timm_shard() {
@@ -288,6 +283,14 @@ test_inductor_timm_shard() {
   python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv
 }
 
+test_inductor_torchbench() {
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+  PYTHONPATH=$(pwd)/torchbench python benchmarks/dynamo/torchbench.py --ci --training --accuracy \
+    --device cuda --inductor --float32 --output "$TEST_REPORTS_DIR"/inductor_torchbench.csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_torchbench.csv
+}
+
 test_python_gloo_with_tls() {
   source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
   assert_git_not_dirty
@@ -742,25 +745,32 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHAR
   install_filelock
   install_triton
   test_dynamo_shard 2
-elif [[ "${TEST_CONFIG}" == *inductor_timm* && $SHARD_NUMBER -lt 3 && $NUM_TEST_SHARDS -gt 1 ]]; then
+elif [[ "${TEST_CONFIG}" == *inductor_huggingface* ]]; then
+  install_torchvision
+  install_filelock
+  install_triton
+  install_huggingface
+  test_inductor_huggingface
+elif [[ "${TEST_CONFIG}" == *inductor_timm* && $NUM_TEST_SHARDS -gt 1 ]]; then
   install_torchvision
   install_filelock
   install_triton
   install_timm
   id=$((SHARD_NUMBER-1))
   test_inductor_timm_shard $id
-elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+elif [[ "${TEST_CONFIG}" == *inductor_torchbench* ]]; then
+  install_torchtext
   install_torchvision
   install_filelock
   install_triton
-  test_inductor
-  test_inductor_distributed
-elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  checkout_install_torchbench
+  test_inductor_torchbench
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
   install_torchvision
   install_filelock
   install_triton
-  install_huggingface
-  test_inductor_huggingface_shard 0
+  test_inductor
+  test_inductor_distributed
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
   test_without_numpy
   install_torchvision
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 789ebc3683d32..cad954f825b2b 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -110,27 +110,16 @@
     # *CI_SKIP_AOT_EAGER_TRAINING,
     # *CI_SKIP_INDCUTOR_INFERENCE,
     # TorchBench
-    "attention_is_all_you_need_pytorch",
-    "drq",
-    "hf_Albert",
-    "hf_Bart",
-    "hf_GPT2",
-    "hf_Reformer",
+    "detectron2",
     "mobilenet_v3_large",
     "moco",
-    "pytorch_struct",
-    "vgg16",
-    "speech_transformer",  # from functionalization
-    "vision_maskrcnn",  # from functionalization
-    "timm_efficientnet",  # from functionalization (only fails for inductor)
-    "hf_Bert",
-    "soft_actor_critic",
     "tacotron2",
-    "yolov3",
+    "vision_maskrcnn",  # from functionalization
     # OOM
     "Background_Matting",
     "fastNLP_Bert",
     "hf_BigBird",
+    "hf_T5_base",  # fp64_OOM
     "mobilenet_v2",
     "mobilenet_v2_quantized_qat",
     "resnet50_quantized_qat",

From 48910cd7798cb18644c4592a3c75dfb706da885b Mon Sep 17 00:00:00 2001
From: Sean Ross-Ross <srossross@gmail.com>
Date: Thu, 17 Nov 2022 23:36:15 +0000
Subject: [PATCH 1041/1922] feat: adding view_copy_batch_rule and opinfo for
 view_copy (#88150)

to add view_copy to vmap dispatch and adding opinfo

part of https://github.com/pytorch/functorch/issues/825

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88150
Approved by: https://github.com/kshitij12345, https://github.com/zou3519
---
 aten/src/ATen/functorch/BatchRulesViews.cpp        | 14 ++++++++++++++
 .../_internal/common_methods_invocations.py        | 14 ++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp
index e4513cf69c184..5eb18f71dd11f 100644
--- a/aten/src/ATen/functorch/BatchRulesViews.cpp
+++ b/aten/src/ATen/functorch/BatchRulesViews.cpp
@@ -438,6 +438,19 @@ std::tuple<Tensor, optional<int64_t>> view_batching_rule(
   return std::make_tuple(self_.view_symint(size_), 0);
 }
 
+std::tuple<Tensor,optional<int64_t>> view_copy_batch_rule(
+    const Tensor& self,
+    optional<int64_t> self_bdim,
+    c10::SymIntArrayRef size) {
+  auto self_ = moveBatchDimToFront(self, self_bdim);
+  SymDimVector view_size(size.size() + 1);
+  view_size[0] = self_.size(0);
+  std::copy(size.cbegin(), size.cend(), view_size.begin() + 1);
+
+  return std::make_tuple(at::view_copy_symint(self_, view_size), 0);
+}
+
+
 template <typename F, F Func>
 std::tuple<Tensor, optional<int64_t>> expand_batch_rule(
     const Tensor &self, optional<int64_t> self_bdim, SymIntArrayRef size, bool implicit)
@@ -544,6 +557,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   VMAP_SUPPORT(select_backward, select_backward_batch_rule);
   VMAP_SUPPORT(slice_backward, slice_backward_batch_rule);
   VMAP_SUPPORT(view, view_batching_rule);
+  VMAP_SUPPORT(view_copy, view_copy_batch_rule);
   VMAP_SUPPORT(expand, SINGLE_ARG(expand_batch_rule<decltype(&ATEN_FN(expand)), &ATEN_FN(expand)>));
   VMAP_SUPPORT(expand_copy, SINGLE_ARG(expand_batch_rule<decltype(&ATEN_FN(expand_copy)), &ATEN_FN(expand_copy)>));
   VMAP_SUPPORT(unfold, unfold_batch_rule);
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 8fe70e71614d4..24ef757b768da 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -12636,6 +12636,20 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness'),
                DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_correctness_with_reusing_ir'),
            )),
+    OpInfo('view_copy',
+           dtypes=all_types_and(torch.bool, torch.bfloat16, torch.float16),
+           ref=lambda x, newshape: np.reshape(x, newshape).copy(),
+           supports_out=True,
+           supports_forward_ad=True,
+           supports_fwgrad_bwgrad=True,
+           supports_autograd=True,
+           sample_inputs_func=sample_inputs_view_reshape,
+           error_inputs_func=error_inputs_view_reshape,
+           skips=(
+               # https://github.com/pytorch/pytorch/issues/89068
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'),
+           )),
     UnaryUfuncInfo('neg',
                    aliases=('negative', ),
                    ref=np.negative,

From 5f3815525a7d8d62d033c22232d5857ebaa3b72f Mon Sep 17 00:00:00 2001
From: Dmitry Tomshin <dmitry.tomshin@gmail.com>
Date: Fri, 18 Nov 2022 00:10:48 +0000
Subject: [PATCH 1042/1922] Issue 68576 prefetch factor (#88972)

Fixes #68576
This PR allows set the `prefetch_factor=None` making it really optional according to the documentation
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88972
Approved by: https://github.com/kit1980
---
 torch/utils/data/dataloader.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py
index 4c141eccc3be7..c836c9fa975f6 100644
--- a/torch/utils/data/dataloader.py
+++ b/torch/utils/data/dataloader.py
@@ -217,7 +217,7 @@ class DataLoader(Generic[T_co]):
     timeout: float
     sampler: Union[Sampler, Iterable]
     pin_memory_device: str
-    prefetch_factor: int
+    prefetch_factor: Optional[int]
     _iterator : Optional['_BaseDataLoaderIter']
     __initialized = False
 
@@ -228,7 +228,7 @@ def __init__(self, dataset: Dataset[T_co], batch_size: Optional[int] = 1,
                  pin_memory: bool = False, drop_last: bool = False,
                  timeout: float = 0, worker_init_fn: Optional[_worker_init_fn_t] = None,
                  multiprocessing_context=None, generator=None,
-                 *, prefetch_factor: int = 2,
+                 *, prefetch_factor: Optional[int] = None,
                  persistent_workers: bool = False,
                  pin_memory_device: str = ""):
         torch._C._log_api_usage_once("python.data_loader")
@@ -240,10 +240,13 @@ def __init__(self, dataset: Dataset[T_co], batch_size: Optional[int] = 1,
         if timeout < 0:
             raise ValueError('timeout option should be non-negative')
 
-        if num_workers == 0 and prefetch_factor != 2:
+        if num_workers == 0 and prefetch_factor is not None:
             raise ValueError('prefetch_factor option could only be specified in multiprocessing.'
-                             'let num_workers > 0 to enable multiprocessing.')
-        assert prefetch_factor > 0
+                             'let num_workers > 0 to enable multiprocessing, otherwise set prefetch_factor to None.')
+        elif num_workers > 0 and prefetch_factor is None:
+            prefetch_factor = 2
+        elif prefetch_factor is not None and prefetch_factor < 0:
+            raise ValueError('prefetch_factor option should be non-negative')
 
         if persistent_workers and num_workers == 0:
             raise ValueError('persistent_workers option needs num_workers > 0')
@@ -581,7 +584,6 @@ def __init__(self, loader: DataLoader) -> None:
         ws, rank = _get_distributed_settings()
         self._world_size = ws
         self._rank = rank
-        self._prefetch_factor = loader.prefetch_factor
         # for other backends, pin_memory_device need to set. if not set
         # default behaviour is CUDA device. if pin_memory_device is selected
         # and pin_memory is not set, the default behaviour false.
@@ -991,6 +993,8 @@ class _MultiProcessingDataLoaderIter(_BaseDataLoaderIter):
     def __init__(self, loader):
         super(_MultiProcessingDataLoaderIter, self).__init__(loader)
 
+        self._prefetch_factor = loader.prefetch_factor
+
         assert self._num_workers > 0
         assert self._prefetch_factor > 0
 

From 2ff9affc9bb94d1e0fff06c5151378bcd484d0bf Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 18 Nov 2022 00:11:42 +0000
Subject: [PATCH 1043/1922] Use pytest-flakefinder to rerun tests multiple
 times (#89106)

Per title. The way re-run is handled in https://github.com/pytorch/pytorch/pull/88646 only applies to unittest.

### Testing

* https://github.com/pytorch/pytorch/actions/runs/3484930558
* https://github.com/pytorch/pytorch/actions/runs/3484930319

Manually download the test report artifacts and verify that that pytest test_ops is called multiple times.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89106
Approved by: https://github.com/clee2000
---
 test/run_test.py                        | 65 ++++++++++++++++++++-----
 torch/testing/_internal/common_utils.py | 12 +++--
 2 files changed, 61 insertions(+), 16 deletions(-)

diff --git a/test/run_test.py b/test/run_test.py
index 94bee60cc24ed..62ce99ae7937a 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -743,33 +743,72 @@ def print_log_file(test: str, file_path: str, failed: bool) -> None:
 
 
 def run_test_ops(test_module, test_directory, options):
+    if os.getenv("PYTORCH_TEST_RERUN_DISABLED_TESTS", "0") == "1":
+        # When under rerun-disabled-tests mode, run the same tests multiple times to determine their
+        # flakiness status. Default to 50 re-runs
+        rerun_options = ["--flake-finder", "--flake-runs=50"]
+    else:
+        # When under the normal mode, retry a failed test 2 more times. -x means stop at the first
+        # failure
+        rerun_options = ["-x", "--reruns=2"]
+
+    default_unittest_args = [
+        "--use-pytest",
+        "-vv",
+        "-rfEX"
+    ]
+    default_unittest_args.extend(rerun_options)
+
     if 'slow-gradcheck' in os.getenv("BUILD_ENVIRONMENT", ""):
+        extra_unittest_args = default_unittest_args.copy()
         # there are a lot of tests that take up a lot of space in slowgrad check, so don't bother parallelizing
         # it's also on periodic so we don't care about TTS as much
-        return run_test(test_module, test_directory, copy.deepcopy(options),
-                        extra_unittest_args=["--use-pytest", '-vv', '-x', '--reruns=2', '-rfEX'],
-                        )
+        return run_test(
+            test_module,
+            test_directory,
+            copy.deepcopy(options),
+            extra_unittest_args=extra_unittest_args,
+        )
+
     return_codes = []
     os.environ["NUM_PARALLEL_PROCS"] = str(NUM_PROCS)
     pool = get_context("spawn").Pool(NUM_PROCS)
     for i in range(NUM_PROCS):
-        return_code = pool.apply_async(run_test, args=(test_module, test_directory, copy.deepcopy(options)),
-                                       kwds={"extra_unittest_args": ["--use-pytest", '-vv', '-x', '--reruns=2', '-rfEX',
-                                                                     f'--shard-id={i}', f'--num-shards={NUM_PROCS}',
-                                                                     "-k=not _linalg_cholesky_"],
-                                             })
+        extra_unittest_args = default_unittest_args.copy()
+        extra_unittest_args.extend([
+            f"--shard-id={i}",
+            f"--num-shards={NUM_PROCS}",
+            "-k=not _linalg_cholesky_",
+        ])
+
+        return_code = pool.apply_async(
+            run_test,
+            args=(test_module, test_directory, copy.deepcopy(options)),
+            kwds={
+                "extra_unittest_args": extra_unittest_args,
+            },
+        )
         return_codes.append(return_code)
+
     pool.close()
     pool.join()
-    del os.environ['NUM_PARALLEL_PROCS']
+    del os.environ["NUM_PARALLEL_PROCS"]
 
     for return_code in return_codes:
         if return_code.get() != 0:
             return return_code.get()
-    return_code = run_test(test_module, test_directory, copy.deepcopy(options),
-                           extra_unittest_args=["--use-pytest", '-vv', '-x', '--reruns=2', '-rfEX',
-                                                "-k=_linalg_cholesky_"],
-                           )
+
+    extra_unittest_args = default_unittest_args.copy()
+    extra_unittest_args.extend([
+        "-k=_linalg_cholesky_",
+    ])
+
+    return_code = run_test(
+        test_module,
+        test_directory,
+        copy.deepcopy(options),
+        extra_unittest_args=extra_unittest_args,
+    )
     return return_code
 
 
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index fa3eda3758e4e..35ec53381c1fb 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -741,9 +741,15 @@ def run_tests(argv=UNITTEST_ARGS):
         if TEST_SAVE_XML:
             sanitize_pytest_xml(test_report_path)
         print("If in CI, skip info is located in the xml test reports, please either go to s3 or the hud to download them")
-        # exitcode of 5 means no tests were found, which happens since some test configs don't
-        # run tests from certain files
-        exit(0 if exit_code == 5 else exit_code)
+
+        if not RERUN_DISABLED_TESTS:
+            # exitcode of 5 means no tests were found, which happens since some test configs don't
+            # run tests from certain files
+            exit(0 if exit_code == 5 else exit_code)
+        else:
+            # Only record the test report and always return a success code when running under rerun
+            # disabled tests mode
+            exit(0)
     elif TEST_SAVE_XML is not None:
         # import here so that non-CI doesn't need xmlrunner installed
         import xmlrunner  # type: ignore[import]

From 8568a33379cf823fe92b689553ab420b57f6ff11 Mon Sep 17 00:00:00 2001
From: David Boetius <cherrywoods@posteo.org>
Date: Fri, 18 Nov 2022 01:57:38 +0000
Subject: [PATCH 1044/1922] Fix torch.nn.functional.gelu docstring formatting
 (#89061)

The docstring of `torch.nn.functional.gelu` is formatted incorrectly, so that part of the math isn't rendered and there are extra blocks when there shouldn't: https://pytorch.org/docs/stable/generated/torch.nn.functional.gelu.html

I didn't build the docs, so I am not 100% sure that I got the formatting right, but I am confident.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89061
Approved by: https://github.com/bdhirsh, https://github.com/kit1980
---
 torch/nn/functional.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 961dd83f57b2c..e3aea9f0acea0 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -1713,8 +1713,10 @@ def rrelu(
 
 where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution.
 
-When the approximate argument is 'tanh', Gelu is estimated with:
-    :math::  \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt(2 / \pi) * (x + 0.044715 * x^3)))
+When the approximate argument is 'tanh', Gelu is estimated with
+
+.. math::
+    \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt(2 / \pi) * (x + 0.044715 * x^3)))
 
 See `Gaussian Error Linear Units (GELUs) <https://arxiv.org/abs/1606.08415>`_.
 """)

From 159f70cb3738d2e15524c3fc9b1a3ec7053c8c3c Mon Sep 17 00:00:00 2001
From: Dmytro Dzhulgakov <dima.v.dzhulgakov@gmail.com>
Date: Fri, 18 Nov 2022 02:42:45 +0000
Subject: [PATCH 1045/1922] Fix the kineto daemon build condition (#89174)

If we're not building the lite interpreter we shouldn't be disabling Kineto. This eliminates a step from https://github.com/facebookincubator/dynolog/blob/main/docs/pytorch_profiler.md
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89174
Approved by: https://github.com/kimishpatel, https://github.com/malfet
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6efd3f2df9366..5ea01f0c0f539 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -795,7 +795,7 @@ if(USE_SOURCE_DEBUG_ON_MOBILE)
   string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
 endif()
 
-if(USE_LITE_INTERPRETER_PROFILER)
+if(BUILD_LITE_INTERPRETER AND USE_LITE_INTERPRETER_PROFILER)
   string(APPEND CMAKE_CXX_FLAGS " -DEDGE_PROFILER_USE_KINETO")
 endif()
 

From 3f4a6728b7e87e232bbc8aabc3666221d855b61b Mon Sep 17 00:00:00 2001
From: Raman kumar <k.raman1998@yahoo.in>
Date: Fri, 18 Nov 2022 02:53:39 +0000
Subject: [PATCH 1046/1922] [MPS] Support for median with dim (#88807)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary ⚡

**Aim**: Add support for aten::median for MPS backend (Fixes #87220)

This is fresh clean PR from the previous [PR](https://github.com/pytorch/pytorch/pull/88554)

- Implementing the new median function in aten/src/ATen/native/mps/operations/ReduceOps.mm
- Adding it to aten/src/ATen/native/native_functions.yaml
- Adding it to existing test_median

### **this will works like this** 🪶
median of entire input tensor on MPS
`torch.median(mps_inputTensor)`
median of along a dim
`torch.median(mps_inputTensor, dim=[int], keepdim=[Bool])`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88807
Approved by: https://github.com/kulinseth
---
 aten/src/ATen/native/mps/MPSGraphVenturaOps.h |   8 +
 .../ATen/native/mps/operations/ReduceOps.mm   | 315 ++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |   2 +
 test/test_mps.py                              |  41 +++
 4 files changed, 366 insertions(+)

diff --git a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
index 86153b58ed87e..b77db66795cf4 100644
--- a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
+++ b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
@@ -6,4 +6,12 @@
 - (MPSGraphTensor *)cumulativeSumWithTensor:(MPSGraphTensor *)tensor
                                        axis:(NSInteger)axis
                                        name:(NSString *)name;
+
+- (MPSGraphTensor *)sortWithTensor:(MPSGraphTensor *)tensor
+                                       axis:(NSInteger)axis
+                                       name:(NSString *)name;
+
+- (MPSGraphTensor *)argSortWithTensor:(MPSGraphTensor *)tensor
+                                       axis:(NSInteger)axis
+                                       name:(NSString *)name;
 @end
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 91aa245b89911..c99f22d89295f 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -9,6 +9,7 @@
 #include <ATen/native/ReduceOpsUtils.h>
 #include <ATen/native/Pool.h>
 #include <torch/library.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
 
 namespace at {
 namespace native {
@@ -1638,5 +1639,319 @@ Tensor min_mps(const Tensor& input_t) {
     return min_max_mps(input_t, dim, keepdim, MPSReductionType::MIN, "min_mps");
 }
 
+// Median of entire tensor into scalar result
+Tensor median_mps(const Tensor& input_t) {
+
+  if(!is_macos_13_or_newer()){
+        TORCH_WARN_ONCE("MPS: median op is supported natively starting from macOS 13.0. ",
+                    "Falling back on CPU. This may have performace implications.");
+        return at::median(input_t.to("cpu"));
+  }
+
+    TORCH_INTERNAL_ASSERT(input_t.scalar_type() != ScalarType::Long, "median not supported for Long dtype on MPS");
+
+    namespace native_mps = at::native::mps;
+    using CachedGraph = native_mps::MPSUnaryCachedGraph;
+
+    native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
+
+    IntArrayRef input_shape = input_t.sizes();
+    int64_t num_input_dims = input_shape.size();
+
+    // calculate total no. of elements in the input tensor to reduce it to one dimension
+    NSMutableArray<NSNumber*> *apparent_input_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:1];
+    int64_t num_in_elements = 1;
+    for(int i = 0; i < num_input_dims; i++) {
+        num_in_elements *= input_shape[i];
+    }
+
+    apparent_input_shape[0] = [NSNumber numberWithInt:num_in_elements];
+
+    Tensor output_t = at::native::empty_mps({}, input_t.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt);
+
+    if (output_t.numel() == 0 || num_in_elements == 0) {
+        return output_t;
+    }
+
+  @autoreleasepool {
+    string key = "median_mps:"+ mps::getMPSTypeString(input_t.scalar_type())  + mps::getTensorsStringKey(input_t);
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
+    // Initialize once if configuration not found in cache
+    if(!cachedGraph) {
+      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+
+        CachedGraph *newCachedGraph = nil;
+
+        @autoreleasepool {
+            MPSGraph* mpsGraph = native_mps::make_mps_graph();
+            newCachedGraph = new CachedGraph(mpsGraph);
+
+            MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
+
+            MPSGraphTensor* outputTensor = nil;
+
+            MPSGraphTensor * reshapedTensor = [mpsGraph reshapeTensor:inputTensor
+                                                            withShape:@[@-1]
+                                                                  name:nil];
+            MPSGraphTensor * sortedTensor = [mpsGraph
+                                                  sortWithTensor:reshapedTensor
+                                                  axis:((NSUInteger) (int)0)
+                                                  name:nil];
+
+            outputTensor = [mpsGraph sliceTensor:sortedTensor
+                                                        dimension:0
+                                                        start:((NSUInteger) (int)((num_in_elements+1)/2 ) - 1)
+                                                        length:1
+                                                        name:nil];
+
+            newCachedGraph->inputTensor_ = inputTensor;
+            newCachedGraph->outputTensor_ = outputTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
+    auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, @[@1]);
+
+    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
+      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
+    };
+
+    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+
+    native_mps::runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
+  }
+
+  return output_t;
+}
+
+
+void median_out_mps
+  (const Tensor& input_t,
+  int64_t dim,
+  bool keepdim,
+  const Tensor& output_t,
+  const Tensor& indices_t,
+  const std::string& func_name) {
+
+    namespace native_mps = at::native::mps;
+
+    if (output_t.numel() == 0) {
+      return;
+    }
+    if (input_t.numel() == 1 && input_t.dim() == 0) {
+      output_t.fill_(input_t);
+      indices_t.fill_(0);
+      return;
+    }
+
+    // Derive from MPSCachedGraph
+    struct CachedGraph : public native_mps::MPSCachedGraph
+    {
+      CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+      MPSGraphTensor *inputTensor_ = nil;
+      MPSGraphTensor *outputTensor_ = nil;
+      MPSGraphTensor *indicesTensor_ = nil;
+    };
+
+    native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
+
+    int64_t dim_ = maybe_wrap_dim(dim, input_t.dim());
+
+    // Calculate the output shape according to keepdim=True
+    // If there is no dim argument, the input shape is flattened
+    IntArrayRef input_shape = input_t.sizes();
+    int64_t num_input_dims = input_shape.size();
+    NSMutableArray<NSNumber*> *apparent_out_shape = nil;
+
+    apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
+    for(int i = 0; i < num_input_dims; i++) {
+        if(dim_ == i)
+            apparent_out_shape[i] = @1;
+        else
+            apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]];
+    }
+    int dim_total_elements = input_shape[dim_];
+
+    auto stream = at::mps::getCurrentMPSStream();
+
+    @autoreleasepool {
+        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type());
+        CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+
+        if(!cachedGraph) {
+          native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+
+            CachedGraph *newCachedGraph = nil;
+
+            @autoreleasepool {
+              MPSGraph* mpsGraph = native_mps::make_mps_graph();
+              newCachedGraph = new CachedGraph(mpsGraph);
+
+              MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
+              MPSGraphTensor* outputTensor = nil;
+              MPSGraphTensor * sortedTensor = [mpsGraph
+                                                  sortWithTensor:inputTensor
+                                                  axis:((NSUInteger) (int)dim_)
+                                                  name:nil];
+
+              outputTensor = [mpsGraph sliceTensor:sortedTensor
+                                                        dimension:dim_
+                                                        start:((NSUInteger) (int)((dim_total_elements+1)/2 ) - 1)
+                                                        length:1
+                                                        name:nil];
+              MPSGraphTensor* argreduceOutTensor = nil;
+                argreduceOutTensor = [mpsGraph argSortWithTensor:inputTensor
+                                                                        axis:(NSInteger)dim_
+                                                                        name:@"argmax_out"];
+              MPSGraphTensor* argOutputTensor = [mpsGraph sliceTensor:argreduceOutTensor
+                                                        dimension:dim_
+                                                        start:((NSUInteger) (int)((dim_total_elements+1)/2 ) - 1)
+                                                        length:1
+                                                        name:nil];
+
+              newCachedGraph->inputTensor_ = inputTensor;
+              newCachedGraph->outputTensor_ = outputTensor;
+              newCachedGraph->indicesTensor_ = argOutputTensor;
+            }
+            return newCachedGraph;
+          });
+          cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+        }
+
+        auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
+        auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape);
+        auto indicesPlaceholder = native_mps::Placeholder(cachedGraph->indicesTensor_, indices_t, apparent_out_shape);
+
+        NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
+          inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
+        };
+
+        NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
+          outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(),
+          indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData()
+        };
+
+        native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+
+    }
+
+}
+
+// in case mps sortWithTensor do not supported on macOS
+std::tuple<Tensor&, Tensor&> median_from_cpu(
+    const Tensor& self,
+    int64_t dim,
+    bool keepdim, Tensor & valuesI, Tensor & indicesI, IntArrayRef vec_out_shape, IntArrayRef vec_apparent_out_shape) {
+      // Tensor a = at::median(self.to("cpu"));
+      Tensor values;
+      Tensor indices;
+    if (!keepdim){
+        values = at::empty({vec_out_shape}, self.options());
+        indices = at::empty({vec_out_shape}, self.options().dtype(kLong));
+
+      }
+      else{
+          values = at::empty({vec_apparent_out_shape}, self.options());
+          indices = at::empty({vec_apparent_out_shape}, self.options().dtype(kLong));
+      }
+      at::median_out(values, indices, self, dim, keepdim);
+
+  valuesI.copy_(values);
+  indicesI.copy_(indices);
+  return std::forward_as_tuple(valuesI, indicesI);
+}
+
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> median_out_mps
+    (const at::Tensor & input_t,
+    int64_t dim,
+    bool keepdim,
+    at::Tensor & values,
+    at::Tensor & indices){
+
+  TORCH_INTERNAL_ASSERT(input_t.scalar_type() != ScalarType::Long, "median not supported for Long dtype on MPS");
+
+  namespace native_mps = at::native::mps;
+    int64_t dim_ = maybe_wrap_dim(dim, input_t.dim());
+    native::zero_numel_check_dims(input_t, dim_, "max()");
+
+    // Calculate the output shape according to keepdim=True
+    // If there is no dim argument, the input shape is flattened
+    IntArrayRef input_shape = input_t.sizes();
+    int64_t num_input_dims = input_shape.size();
+    NSMutableArray<NSNumber*> *apparent_out_shape = nil;
+    // Use this if keepdim is false
+    int64_t num_output_dims = num_input_dims - 1;
+
+    std::vector<int64_t> vec_apparent_out_shape(num_input_dims);
+    std::vector<int64_t> vec_out_shape(num_output_dims);
+
+    apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
+    // Counter for shape when keepdim is false
+    int out_i = 0;
+    for(int i = 0; i < num_input_dims; i++) {
+        if(dim_ == i) {
+            apparent_out_shape[i] = @1;
+            vec_apparent_out_shape[i] = 1;
+        }
+        else {
+            apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]];
+            vec_apparent_out_shape[i] = input_shape[i];
+            vec_out_shape[out_i] = input_shape[i];
+            out_i++;
+        }
+    }
+
+    if(!keepdim) {
+     values = at::native::empty_mps(
+                      IntArrayRef(vec_out_shape),
+                      input_t.scalar_type(),
+                      c10::nullopt,
+                      kMPS,
+                      c10::nullopt,
+                      c10::nullopt);
+     indices = at::native::empty_mps(
+                      IntArrayRef(vec_out_shape),
+                      ScalarType::Long,
+                      c10::nullopt,
+                      kMPS,
+                      c10::nullopt,
+                      c10::nullopt);
+    } else {
+      values = at::native::empty_mps(
+                      IntArrayRef(vec_apparent_out_shape),
+                      input_t.scalar_type(),
+                      c10::nullopt,
+                      kMPS,
+                      c10::nullopt,
+                      c10::nullopt);
+     indices = at::native::empty_mps(
+                      IntArrayRef(vec_apparent_out_shape),
+                      ScalarType::Long,
+                      c10::nullopt,
+                      kMPS,
+                      c10::nullopt,
+                      c10::nullopt);
+    }
+
+    if (values.numel() == 0 || input_t.numel() == 0) {
+        return std::tuple<Tensor&, Tensor&>{values, indices};
+    }
+
+    if(!is_macos_13_or_newer()){
+      TORCH_WARN_ONCE("MPS: median op is supported natively starting from macOS 13.0.",
+                    "Falling back on CPU. This may have performace implications.");
+    return median_from_cpu(input_t.to("cpu"), dim, keepdim, values, indices, IntArrayRef(vec_out_shape),IntArrayRef(vec_apparent_out_shape) );
+  }
+
+    median_out_mps(input_t, dim, keepdim, values, indices, "median_out_mps");
+
+    return std::tuple<Tensor&, Tensor&>{values, indices};
+}
+
 } // native
 } // at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 8046b4f6ac4b4..b1d1094667e17 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -3492,6 +3492,7 @@
   dispatch:
     CPU: median_cpu
     CUDA: median_cuda
+    MPS: median_mps
   autogen: median.out
 
 - func: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
@@ -3503,6 +3504,7 @@
   dispatch:
     CPU: median_out_cpu
     CUDA: median_out_cuda
+    MPS: median_out_mps
 
 - func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
   variants: function, method
diff --git a/test/test_mps.py b/test/test_mps.py
index 266c6a5c2c220..aeddea0d21f3d 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2541,6 +2541,47 @@ def helper(n, c, h, w, dtype=torch.float32):
         helper(2, 8, 4, 5, torch.int32)
         # helper(2, 8, 4, 5, torch.int64)
 
+    def test_median(self):
+        def helper_dtype_int32(n1, n2, n3):
+            cpu_x = torch.randint(50, (n1, n2, n3), device='cpu', dtype=torch.int32)
+            mps_x = cpu_x.detach().clone().to('mps')
+
+            result_cpu = torch.median(cpu_x)
+            result_mps = torch.median(mps_x)
+
+            self.assertEqual(result_cpu, result_mps)
+
+            for dim in [0, 1, 2]:
+                for keepdim in [True, False]:
+                    y, idx = torch.median(cpu_x, dim=dim, keepdim=keepdim)
+                    refy, refidx = torch.median(mps_x, dim=dim, keepdim=keepdim)
+                    self.assertEqual(y, refy)
+                    self.assertEqual(idx, refidx)
+
+        def helper_dtype_float32(n1, n2, n3):
+            cpu_x = torch.randn(n1, n2, n3, device='cpu', dtype=torch.float32)
+            mps_x = cpu_x.detach().clone().to('mps')
+
+            result_cpu = torch.median(cpu_x)
+            result_mps = torch.median(mps_x)
+
+            self.assertEqual(result_cpu, result_mps)
+
+            for dim in [0, 1, 2]:
+                for keepdim in [True, False]:
+                    y, idx = torch.median(cpu_x, dim=dim, keepdim=keepdim)
+                    refy, refidx = torch.median(mps_x, dim=dim, keepdim=keepdim)
+                    self.assertEqual(y, refy)
+                    self.assertEqual(idx, refidx)
+
+        helper_dtype_int32(10, 10, 10)  # median at even place
+        helper_dtype_int32(3, 3, 3)  # median at odd place
+        helper_dtype_int32(1, 1, 1)
+        helper_dtype_int32(1, 2, 3)
+        helper_dtype_float32(10, 10, 10)
+        helper_dtype_float32(3, 3, 3)
+        helper_dtype_float32(1, 1, 1)
+
     def test_any(self):
         def helper(shape):
             input_xs = []

From 8968937af0e87306c72d3aef00ac0618a367c09b Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Thu, 17 Nov 2022 16:20:45 -0500
Subject: [PATCH 1047/1922] add -Wnarrowing as error to cmake builds (#89207)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89207
Approved by: https://github.com/wconstab, https://github.com/malfet
---
 CMakeLists.txt                                       | 2 +-
 aten/src/ATen/native/NNPACK.cpp                      | 4 ++--
 aten/src/ATen/native/mps/operations/Distributions.mm | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ea01f0c0f539..3d70f6ef58161 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -815,7 +815,6 @@ endif()
 # ---[ Build flags
 if(NOT MSVC)
   string(APPEND CMAKE_CXX_FLAGS " -O2 -fPIC")
-  string(APPEND CMAKE_CXX_FLAGS " -Wno-narrowing")
   # Eigen fails to build with some versions, so convert this to a warning
   # Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459
   string(APPEND CMAKE_CXX_FLAGS " -Wall")
@@ -824,6 +823,7 @@ if(NOT MSVC)
   append_cxx_flag_if_supported("-Werror=non-virtual-dtor" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Werror=braced-scalar-init" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Werror=range-loop-construct" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-Wnarrowing" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-missing-field-initializers" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-type-limits" CMAKE_CXX_FLAGS)
   append_cxx_flag_if_supported("-Wno-array-bounds" CMAKE_CXX_FLAGS)
diff --git a/aten/src/ATen/native/NNPACK.cpp b/aten/src/ATen/native/NNPACK.cpp
index 544641f091a35..4fb40a17d0267 100644
--- a/aten/src/ATen/native/NNPACK.cpp
+++ b/aten/src/ATen/native/NNPACK.cpp
@@ -209,8 +209,8 @@ Tensor _nnpack_spatial_convolution(
       .height = (size_t)output.size(2),
   };
   const nnp_size output_subsample = {
-      .width = stride[1],
-      .height = stride[0],
+      .width = static_cast<std::size_t>(stride[1]),
+      .height = static_cast<std::size_t>(stride[0]),
   };
 
   const auto input_ = input.contiguous();
diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index a1a41d11e5b50..1da2457f3a37e 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -438,7 +438,7 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
           MPSGraphTensor *randomTensor = generatorTensors[0];
 
           auto broadcastShape = @[ns_numDist ,ns_n_sample, ns_numCategories];
-          int broadcastShapeVals[3] = {numDist, n_sample, numCategories};
+          int broadcastShapeVals[3] = {numDist, static_cast<int>(n_sample), numCategories};
           MPSGraphTensor *broadcastShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:broadcastShapeVals length:sizeof(int) * broadcastShape.count]
                                                                       shape:@[[NSNumber numberWithUnsignedInteger:broadcastShape.count]]
                                                                    dataType:MPSDataTypeUInt32];

From c660da53c5374298d83f18637cafbb06f2d7d9f3 Mon Sep 17 00:00:00 2001
From: John Detloff <johndetloff@fb.com>
Date: Fri, 18 Nov 2022 03:17:35 +0000
Subject: [PATCH 1048/1922] Add previously deleted circleci readme back to repo
 (#85598)

This readme was deleted here: https://github.com/pytorch/pytorch/pull/73224 I chatted with the author, who doesn't remember exactly why it was deleted but suspects it was due either to out of date contents or because of the upcoming migration to github actions.

With that said, we have references to this readme through our circleci directory, and since we do still have a lot of circleci workflows I feel this readme still adds a lot of value. (I recently did some CI tasks that required me to dig this readme up in order to solve a problem).

I recommend we restore this file with a warning that its contents may be out of date, until our CircleCI workflows are entirely migrated to Github Actions

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85598
Approved by: https://github.com/clee2000, https://github.com/malfet
---
 .circleci/README.md | 468 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 468 insertions(+)
 create mode 100644 .circleci/README.md

diff --git a/.circleci/README.md b/.circleci/README.md
new file mode 100644
index 0000000000000..e2429b4d1f037
--- /dev/null
+++ b/.circleci/README.md
@@ -0,0 +1,468 @@
+Warning
+=======
+
+Contents may be out of date. Our CircleCI workflows are gradually being migrated to Github actions.
+
+Structure of CI
+===============
+
+setup job:
+1. Does a git checkout
+2. Persists CircleCI scripts (everything in `.circleci`) into a workspace.  Why?
+   We don't always do a Git checkout on all subjobs, but we usually
+   still want to be able to call scripts one way or another in a subjob.
+   Persisting files this way lets us have access to them without doing a
+   checkout.  This workspace is conventionally mounted on `~/workspace`
+   (this is distinguished from `~/project`, which is the conventional
+   working directory that CircleCI will default to starting your jobs
+   in.)
+3. Write out the commit message to `.circleci/COMMIT_MSG`.  This is so
+   we can determine in subjobs if we should actually run the jobs or
+   not, even if there isn't a Git checkout.
+
+
+CircleCI configuration generator
+================================
+
+One may no longer make changes to the `.circleci/config.yml` file directly.
+Instead, one must edit these Python scripts or files in the `verbatim-sources/` directory.
+
+
+Usage
+----------
+
+1. Make changes to these scripts.
+2. Run the `regenerate.sh` script in this directory and commit the script changes and the resulting change to `config.yml`.
+
+You'll see a build failure on GitHub if the scripts don't agree with the checked-in version.
+
+
+Motivation
+----------
+
+These scripts establish a single, authoritative source of documentation for the CircleCI configuration matrix.
+The documentation, in the form of diagrams, is automatically generated and cannot drift out of sync with the YAML content.
+
+Furthermore, consistency is enforced within the YAML config itself, by using a single source of data to generate
+multiple parts of the file.
+
+* Facilitates one-off culling/enabling of CI configs for testing PRs on special targets
+
+Also see https://github.com/pytorch/pytorch/issues/17038
+
+
+Future direction
+----------------
+
+### Declaring sparse config subsets
+See comment [here](https://github.com/pytorch/pytorch/pull/17323#pullrequestreview-206945747):
+
+In contrast with a full recursive tree traversal of configuration dimensions,
+> in the future I think we actually want to decrease our matrix somewhat and have only a few mostly-orthogonal builds that taste as many different features as possible on PRs, plus a more complete suite on every PR and maybe an almost full suite nightly/weekly (we don't have this yet). Specifying PR jobs in the future might be easier to read with an explicit list when we come to this.
+----------------
+----------------
+
+# How do the binaries / nightlies / releases work?
+
+### What is a binary?
+
+A binary or package (used interchangeably) is a pre-built collection of c++ libraries, header files, python bits, and other files. We build these and distribute them so that users do not need to install from source.
+
+A **binary configuration** is a collection of
+
+* release or nightly
+    * releases are stable, nightlies are beta and built every night
+* python version
+    * linux: 3.7m (mu is wide unicode or something like that. It usually doesn't matter but you should know that it exists)
+    * macos: 3.7, 3.8
+    * windows: 3.7, 3.8
+* cpu version
+    * cpu, cuda 9.0, cuda 10.0
+    * The supported cuda versions occasionally change
+* operating system
+    * Linux - these are all built on CentOS. There haven't been any problems in the past building on CentOS and using on Ubuntu
+    * MacOS
+    * Windows - these are built on Azure pipelines
+* devtoolset version (gcc compiler version)
+    * This only matters on Linux cause only Linux uses gcc. tldr is gcc made a backwards incompatible change from gcc 4.8 to gcc 5, because it had to change how it implemented std::vector and std::string
+
+### Where are the binaries?
+
+The binaries are built in CircleCI. There are nightly binaries built every night at 9pm PST (midnight EST) and release binaries corresponding to Pytorch releases, usually every few months.
+
+We have 3 types of binary packages
+
+* pip packages - nightlies are stored on s3 (pip install -f \<a s3 url\>). releases are stored in a pip repo (pip install torch) (ask Soumith about this)
+* conda packages - nightlies and releases are both stored in a conda repo. Nighty packages have a '_nightly' suffix
+* libtorch packages - these are zips of all the c++ libraries, header files, and sometimes dependencies. These are c++ only
+    * shared with dependencies (the only supported option for Windows)
+    * static with dependencies
+    * shared without dependencies
+    * static without dependencies
+
+All binaries are built in CircleCI workflows except Windows. There are checked-in workflows (committed into the .circleci/config.yml) to build the nightlies every night. Releases are built by manually pushing a PR that builds the suite of release binaries (overwrite the config.yml to build the release)
+
+# CircleCI structure of the binaries
+
+Some quick vocab:
+
+* A \**workflow** is a CircleCI concept; it is a DAG of '**jobs**'. ctrl-f 'workflows' on https://github.com/pytorch/pytorch/blob/master/.circleci/config.yml to see the workflows.
+* **jobs** are a sequence of '**steps**'
+* **steps** are usually just a bash script or a builtin CircleCI command. *All steps run in new environments, environment variables declared in one script DO NOT persist to following steps*
+* CircleCI has a **workspace**, which is essentially a cache between steps of the *same job* in which you can store artifacts between steps.
+
+## How are the workflows structured?
+
+The nightly binaries have 3 workflows. We have one job (actually 3 jobs:  build, test, and upload) per binary configuration
+
+1. binary_builds
+    1. every day midnight EST
+    2. linux: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/linux-binary-build-defaults.yml
+    3. macos: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/macos-binary-build-defaults.yml
+    4. For each binary configuration, e.g. linux_conda_3.7_cpu there is a
+        1. binary_linux_conda_3.7_cpu_build
+            1. Builds the build. On linux jobs this uses the 'docker executor'.
+            2. Persists the package to the workspace
+        2. binary_linux_conda_3.7_cpu_test
+            1. Loads the package to the workspace
+            2. Spins up a docker image (on Linux), mapping the package and code repos into the docker
+            3. Runs some smoke tests in the docker
+            4. (Actually, for macos this is a step rather than a separate job)
+        3. binary_linux_conda_3.7_cpu_upload
+            1. Logs in to aws/conda
+            2. Uploads the package
+2. update_s3_htmls
+    1. every day 5am EST
+    2. https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/binary_update_htmls.yml
+    3. See below for what these are for and why they're needed
+    4. Three jobs that each examine the current contents of aws and the conda repo and update some html files in s3
+3. binarysmoketests
+    1. every day
+    2. https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml
+    3. For each binary configuration, e.g. linux_conda_3.7_cpu there is a
+        1. smoke_linux_conda_3.7_cpu
+            1. Downloads the package from the cloud, e.g. using the official pip or conda instructions
+            2. Runs the smoke tests
+
+## How are the jobs structured?
+
+The jobs are in https://github.com/pytorch/pytorch/tree/master/.circleci/verbatim-sources. Jobs are made of multiple steps. There are some shared steps used by all the binaries/smokes. Steps of these jobs are all delegated to scripts in https://github.com/pytorch/pytorch/tree/master/.circleci/scripts .
+
+* Linux jobs: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/linux-binary-build-defaults.yml
+    * binary_linux_build.sh
+    * binary_linux_test.sh
+    * binary_linux_upload.sh
+* MacOS jobs: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/macos-binary-build-defaults.yml
+    * binary_macos_build.sh
+    * binary_macos_test.sh
+    * binary_macos_upload.sh
+* Update html jobs: https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/binary_update_htmls.yml
+    * These delegate from the pytorch/builder repo
+    * https://github.com/pytorch/builder/blob/master/cron/update_s3_htmls.sh
+    * https://github.com/pytorch/builder/blob/master/cron/upload_binary_sizes.sh
+* Smoke jobs (both linux and macos): https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml
+    * These delegate from the pytorch/builder repo
+    * https://github.com/pytorch/builder/blob/master/run_tests.sh
+    * https://github.com/pytorch/builder/blob/master/smoke_test.sh
+    * https://github.com/pytorch/builder/blob/master/check_binary.sh
+* Common shared code (shared across linux and macos): https://github.com/pytorch/pytorch/blob/master/.circleci/verbatim-sources/nightly-binary-build-defaults.yml
+    * binary_checkout.sh - checks out pytorch/builder repo. Right now this also checks out pytorch/pytorch, but it shouldn't. pytorch/pytorch should just be shared through the workspace. This can handle being run before binary_populate_env.sh
+    * binary_populate_env.sh - parses BUILD_ENVIRONMENT into the separate env variables that make up a binary configuration. Also sets lots of default values, the date, the version strings, the location of folders in s3, all sorts of things. This generally has to be run before other steps.
+    * binary_install_miniconda.sh - Installs miniconda, cross platform. Also hacks this for the update_binary_sizes job that doesn't have the right env variables
+    * binary_run_in_docker.sh - Takes a bash script file (the actual test code) from a hardcoded location, spins up a docker image, and runs the script inside the docker image
+
+### **Why do the steps all refer to scripts?**
+
+CircleCI creates a  final yaml file by inlining every <<* segment, so if we were to keep all the code in the config.yml itself then the config size would go over 4 MB and cause infra problems.
+
+### **What is binary_run_in_docker for?**
+
+So, CircleCI has several executor types: macos, machine, and docker are the ones we use. The 'machine' executor gives you two cores on some linux vm. The 'docker' executor gives you considerably more cores (nproc was 32 instead of 2 back when I tried in February). Since the dockers are faster, we try to run everything that we can in dockers. Thus
+
+* linux build jobs use the docker executor. Running them on the docker executor was at least 2x faster than running them on the machine executor
+* linux test jobs use the machine executor in order for them to properly interface with GPUs since docker executors cannot execute with attached GPUs
+* linux upload jobs use the machine executor. The upload jobs are so short that it doesn't really matter what they use
+* linux smoke test jobs use the machine executor for the same reason as the linux test jobs
+
+binary_run_in_docker.sh is a way to share the docker start-up code between the binary test jobs and the binary smoke test jobs
+
+### **Why does binary_checkout also checkout pytorch? Why shouldn't it?**
+
+We want all the nightly binary jobs to run on the exact same git commit, so we wrote our own checkout logic to ensure that the same commit was always picked. Later circleci changed that to use a single pytorch checkout and persist it through the workspace (they did this because our config file was too big, so they wanted to take a lot of the setup code into scripts, but the scripts needed the code repo to exist to be called, so they added a prereq step called 'setup' to checkout the code and persist the needed scripts to the workspace). The changes to the binary jobs were not properly tested, so they all broke from missing pytorch code no longer existing. We hotfixed the problem by adding the pytorch checkout back to binary_checkout, so now there's two checkouts of pytorch on the binary jobs. This problem still needs to be fixed, but it takes careful tracing of which code is being called where.
+
+# Code structure of the binaries (circleci agnostic)
+
+## Overview
+
+The code that runs the binaries lives in two places, in the normal [github.com/pytorch/pytorch](http://github.com/pytorch/pytorch), but also in [github.com/pytorch/builder](http://github.com/pytorch/builder), which is a repo that defines how all the binaries are built. The relevant code is
+
+
+```
+# All code needed to set-up environments for build code to run in,
+# but only code that is specific to the current CI system
+pytorch/pytorch
+- .circleci/                # Folder that holds all circleci related stuff
+  - config.yml              # GENERATED file that actually controls all circleci behavior
+  - verbatim-sources        # Used to generate job/workflow sections in ^
+  - scripts/                # Code needed to prepare circleci environments for binary build scripts
+- setup.py                  # Builds pytorch. This is wrapped in pytorch/builder
+- cmake files               # used in normal building of pytorch
+# All code needed to prepare a binary build, given an environment
+# with all the right variables/packages/paths.
+pytorch/builder
+# Given an installed binary and a proper python env, runs some checks
+# to make sure the binary was built the proper way. Checks things like
+# the library dependencies, symbols present, etc.
+- check_binary.sh
+# Given an installed binary, runs python tests to make sure everything
+# is in order. These should be de-duped. Right now they both run smoke
+# tests, but are called from different places. Usually just call some
+# import statements, but also has overlap with check_binary.sh above
+- run_tests.sh
+- smoke_test.sh
+# Folders that govern how packages are built. See paragraphs below
+- conda/
+  - build_pytorch.sh          # Entrypoint. Delegates to proper conda build folder
+  - switch_cuda_version.sh    # Switches activate CUDA installation in Docker
+  - pytorch-nightly/          # Build-folder
+- manywheel/
+  - build_cpu.sh              # Entrypoint for cpu builds
+  - build.sh                  # Entrypoint for CUDA builds
+  - build_common.sh           # Actual build script that ^^ call into
+- wheel/
+  - build_wheel.sh            # Entrypoint for wheel builds
+- windows/
+  - build_pytorch.bat         # Entrypoint for wheel builds on Windows
+```
+
+Every type of package has an entrypoint build script that handles the all the important logic.
+
+## Conda
+
+Linux, MacOS and Windows use the same code flow for the conda builds.
+
+Conda packages are built with conda-build, see https://conda.io/projects/conda-build/en/latest/resources/commands/conda-build.html
+
+Basically, you pass `conda build` a build folder (pytorch-nightly/ above) that contains a build script and a meta.yaml. The meta.yaml specifies in what python environment to build the package in, and what dependencies the resulting package should have, and the build script gets called in the env to build the thing.
+tl;dr on conda-build is
+
+1. Creates a brand new conda environment, based off of deps in the meta.yaml
+    1. Note that environment variables do not get passed into this build env unless they are specified in the meta.yaml
+    2. If the build fails this environment will stick around. You can activate it for much easier debugging. The “General Python” section below explains what exactly a python “environment” is.
+2. Calls build.sh in the environment
+3. Copies the finished package to a new conda env, also specified by the meta.yaml
+4. Runs some simple import tests (if specified in the meta.yaml)
+5. Saves the finished package as a tarball
+
+The build.sh we use is essentially a wrapper around `python setup.py build`, but it also manually copies in some of our dependent libraries into the resulting tarball and messes with some rpaths.
+
+The entrypoint file `builder/conda/build_conda.sh` is complicated because
+
+* It works for Linux, MacOS and Windows
+    * The mac builds used to create their own environments, since they all used to be on the same machine. There’s now a lot of extra logic to handle conda envs. This extra machinery could be removed
+* It used to handle testing too, which adds more logic messing with python environments too. This extra machinery could be removed.
+
+## Manywheels (linux pip and libtorch packages)
+
+Manywheels are pip packages for linux distros. Note that these manywheels are not actually manylinux compliant.
+
+`builder/manywheel/build_cpu.sh` and `builder/manywheel/build.sh` (for CUDA builds) just set different env vars and then call into `builder/manywheel/build_common.sh`
+
+The entrypoint file `builder/manywheel/build_common.sh` is really really complicated because
+
+* This used to handle building for several different python versions at the same time. The loops have been removed, but there's still unnecessary folders and movements here and there.
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This used to handle testing the pip packages too. This is why there’s testing code at the end that messes with python installations and stuff
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This also builds libtorch packages
+    * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file.
+* There is a lot of messing with rpaths. This is necessary, but could be made much much simpler if the above issues were fixed.
+
+## Wheels (MacOS pip and libtorch packages)
+
+The entrypoint file `builder/wheel/build_wheel.sh` is complicated because
+
+* The mac builds used to all run on one machine (we didn’t have autoscaling mac machines till circleci). So this script handled siloing itself by setting-up and tearing-down its build env and siloing itself into its own build directory.
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This also builds libtorch packages
+    * Ditto the comment above. This should definitely be separated out.
+
+Note that the MacOS Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda.
+
+## Windows Wheels (Windows pip and libtorch packages)
+
+The entrypoint file `builder/windows/build_pytorch.bat` is complicated because
+
+* This used to handle building for several different python versions at the same time. This is why there are loops everywhere
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This used to handle testing the pip packages too. This is why there’s testing code at the end that messes with python installations and stuff
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This also builds libtorch packages
+    * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file.
+
+Note that the Windows Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda.
+
+## General notes
+
+### Note on run_tests.sh, smoke_test.sh, and check_binary.sh
+
+* These should all be consolidated
+* These must run on all OS types: MacOS, Linux, and Windows
+* These all run smoke tests at the moment. They inspect the packages some, maybe run a few import statements. They DO NOT run the python tests nor the cpp tests. The idea is that python tests on master and PR merges will catch all breakages. All these tests have to do is make sure the special binary machinery didn’t mess anything up.
+* There are separate run_tests.sh and smoke_test.sh because one used to be called by the smoke jobs and one used to be called by the binary test jobs (see circleci structure section above). This is still true actually, but these could be united into a single script that runs these checks, given an installed pytorch package.
+
+### Note on libtorch
+
+Libtorch packages are built in the wheel build scripts: manywheel/build_*.sh for linux and build_wheel.sh for mac. There are several things wrong with this
+
+* It’s confusing. Most of those scripts deal with python specifics.
+* The extra conditionals everywhere severely complicate the wheel build scripts
+* The process for building libtorch is different from the official instructions (a plain call to cmake, or a call to a script)
+
+### Note on docker images / Dockerfiles
+
+All linux builds occur in docker images. The docker images are
+
+* pytorch/conda-cuda
+    * Has ALL CUDA versions installed. The script pytorch/builder/conda/switch_cuda_version.sh sets /usr/local/cuda to a symlink to e.g. /usr/local/cuda-10.0 to enable different CUDA builds
+    * Also used for cpu builds
+* pytorch/manylinux-cuda90
+* pytorch/manylinux-cuda100
+    * Also used for cpu builds
+
+The Dockerfiles are available in pytorch/builder, but there is no circleci job or script to build these docker images, and they cannot be run locally (unless you have the correct local packages/paths). Only Soumith can build them right now.
+
+### General Python
+
+* This is still a good explanation of python installations https://caffe2.ai/docs/faq.html#why-do-i-get-import-errors-in-python-when-i-try-to-use-caffe2
+
+# How to manually rebuild the binaries
+
+tl;dr make a PR that looks like https://github.com/pytorch/pytorch/pull/21159
+
+Sometimes we want to push a change to master and then rebuild all of today's binaries after that change. As of May 30, 2019 there isn't a way to manually run a workflow in the UI. You can manually re-run a workflow, but it will use the exact same git commits as the first run and will not include any changes. So we have to make a PR and then force circleci to run the binary workflow instead of the normal tests. The above PR is an example of how to do this; essentially you copy-paste the binarybuilds workflow steps into the default workflow steps. If you need to point the builder repo to a different commit then you'd need to change https://github.com/pytorch/pytorch/blob/master/.circleci/scripts/binary_checkout.sh#L42-L45 to checkout what you want.
+
+## How to test changes to the binaries via .circleci
+
+Writing PRs that test the binaries is annoying, since the default circleci jobs that run on PRs are not the jobs that you want to run. Likely, changes to the binaries will touch something under .circleci/ and require that .circleci/config.yml be regenerated (.circleci/config.yml controls all .circleci behavior, and is generated using `.circleci/regenerate.sh` in python 3.7). But you also need to manually hardcode the binary jobs that you want to test into the .circleci/config.yml workflow, so you should actually make at least two commits, one for your changes and one to temporarily hardcode jobs. See https://github.com/pytorch/pytorch/pull/22928 as an example of how to do this.
+
+```sh
+# Make your changes
+touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml
+# Regenerate the yaml, has to be in python 3.7
+.circleci/regenerate.sh
+# Make a commit
+git add .circleci *
+git commit -m "My real changes"
+git push origin my_branch
+# Now hardcode the jobs that you want in the .circleci/config.yml workflows section
+# Also eliminate ensure-consistency and should_run_job checks
+# e.g. https://github.com/pytorch/pytorch/commit/2b3344bfed8772fe86e5210cc4ee915dee42b32d
+# Make a commit you won't keep
+git add .circleci
+git commit -m "[DO NOT LAND] testing binaries for above changes"
+git push origin my_branch
+# Now you need to make some changes to the first commit.
+git rebase -i HEAD~2 # mark the first commit as 'edit'
+# Make the changes
+touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml
+.circleci/regenerate.sh
+# Ammend the commit and recontinue
+git add .circleci
+git commit --amend
+git rebase --continue
+# Update the PR, need to force since the commits are different now
+git push origin my_branch --force
+```
+
+The advantage of this flow is that you can make new changes to the base commit and regenerate the .circleci without having to re-write which binary jobs you want to test on. The downside is that all updates will be force pushes.
+
+## How to build a binary locally
+
+### Linux
+
+You can build Linux binaries locally easily using docker.
+
+```sh
+# Run the docker
+# Use the correct docker image, pytorch/conda-cuda used here as an example
+#
+# -v path/to/foo:path/to/bar makes path/to/foo on your local machine (the
+#    machine that you're running the command on) accessible to the docker
+#    container at path/to/bar. So if you then run `touch path/to/bar/baz`
+#    in the docker container then you will see path/to/foo/baz on your local
+#    machine. You could also clone the pytorch and builder repos in the docker.
+#
+# If you know how, add ccache as a volume too and speed up everything
+docker run \
+    -v your/pytorch/repo:/pytorch \
+    -v your/builder/repo:/builder \
+    -v where/you/want/packages/to/appear:/final_pkgs \
+    -it pytorch/conda-cuda /bin/bash
+# Export whatever variables are important to you. All variables that you'd
+# possibly need are in .circleci/scripts/binary_populate_env.sh
+# You should probably always export at least these 3 variables
+export PACKAGE_TYPE=conda
+export DESIRED_PYTHON=3.7
+export DESIRED_CUDA=cpu
+# Call the entrypoint
+# `|& tee foo.log` just copies all stdout and stderr output to foo.log
+# The builds generate lots of output so you probably need this when
+# building locally.
+/builder/conda/build_pytorch.sh |& tee build_output.log
+```
+
+**Building CUDA binaries on docker**
+
+You can build CUDA binaries on CPU only machines, but you can only run CUDA binaries on CUDA machines. This means that you can build a CUDA binary on a docker on your laptop if you so choose (though it’s gonna take a long time).
+
+For Facebook employees, ask about beefy machines that have docker support and use those instead of your laptop; it will be 5x as fast.
+
+### MacOS
+
+There’s no easy way to generate reproducible hermetic MacOS environments. If you have a Mac laptop then you can try emulating the .circleci environments as much as possible, but you probably have packages in /usr/local/, possibly installed by brew, that will probably interfere with the build. If you’re trying to repro an error on a Mac build in .circleci and you can’t seem to repro locally, then my best advice is actually to iterate on .circleci    :/
+
+But if you want to try, then I’d recommend
+
+```sh
+# Create a new terminal
+# Clear your LD_LIBRARY_PATH and trim as much out of your PATH as you
+# know how to do
+# Install a new miniconda
+# First remove any other python or conda installation from your PATH
+# Always install miniconda 3, even if building for Python <3
+new_conda="~/my_new_conda"
+conda_sh="$new_conda/install_miniconda.sh"
+curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+chmod +x "$conda_sh"
+"$conda_sh" -b -p "$MINICONDA_ROOT"
+rm -f "$conda_sh"
+export PATH="~/my_new_conda/bin:$PATH"
+# Create a clean python env
+# All MacOS builds use conda to manage the python env and dependencies
+# that are built with, even the pip packages
+conda create -yn binary python=2.7
+conda activate binary
+# Export whatever variables are important to you. All variables that you'd
+# possibly need are in .circleci/scripts/binary_populate_env.sh
+# You should probably always export at least these 3 variables
+export PACKAGE_TYPE=conda
+export DESIRED_PYTHON=3.7
+export DESIRED_CUDA=cpu
+# Call the entrypoint you want
+path/to/builder/wheel/build_wheel.sh
+```
+
+N.B. installing a brand new miniconda is important. This has to do with how conda installations work. See the “General Python” section above, but tldr; is that
+
+1. You make the ‘conda’ command accessible by prepending `path/to/conda_root/bin` to your PATH.
+2. You make a new env and activate it, which then also gets prepended to your PATH. Now you have `path/to/conda_root/envs/new_env/bin:path/to/conda_root/bin:$PATH`
+3. Now say you (or some code that you ran) call python executable `foo`
+    1. if you installed `foo` in `new_env`, then `path/to/conda_root/envs/new_env/bin/foo` will get called, as expected.
+    2. But if you forgot to installed `foo` in `new_env` but happened to previously install it in your root conda env (called ‘base’), then unix/linux will still find `path/to/conda_root/bin/foo` . This is dangerous, since `foo` can be a different version than you want; `foo` can even be for an incompatible python version!
+
+Newer conda versions and proper python hygiene can prevent this, but just install a new miniconda to be safe.
+
+### Windows
+
+TODO: fill in

From bbb3a9f6cb9a6a980a13fe7ac516a6b979711006 Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Fri, 18 Nov 2022 03:37:14 +0000
Subject: [PATCH 1049/1922] [LTC] Restore GetPythonFrames (#89122)

Summary:
pytorch/pytorch@936e930 delete the registration of GetPythonFramesFunction. Restore that and add a test case to prevent regression.

Test Plan:
python test/lazy/test_debug_util.py

Fixes pytorch/xla#4206.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89122
Approved by: https://github.com/JackCaoG
---
 test/lazy/test_debug_util.py    | 44 +++++++++++++++++++++++++++++++++
 torch/csrc/lazy/python/init.cpp |  4 +++
 2 files changed, 48 insertions(+)
 create mode 100644 test/lazy/test_debug_util.py

diff --git a/test/lazy/test_debug_util.py b/test/lazy/test_debug_util.py
new file mode 100644
index 0000000000000..df201d54737f1
--- /dev/null
+++ b/test/lazy/test_debug_util.py
@@ -0,0 +1,44 @@
+# Owner(s): ["oncall: jit"]
+
+import os
+import re
+import tempfile
+import torch.nn as nn
+import unittest
+
+import torch._lazy
+import torch._lazy.ts_backend
+from torch.testing._internal.common_utils import IS_WINDOWS, run_tests, TestCase
+
+torch._lazy.ts_backend.init()
+
+
+@unittest.skipIf(IS_WINDOWS, "To be fixed")
+class DebugUtilTest(TestCase):
+    def _run_linear(self):
+        device = "lazy"
+        model = nn.Linear(5, 5).to(device)
+        output = model(torch.randn(1, 5).to(device))
+        torch._lazy.mark_step()
+
+
+    def test_get_python_frames(self):
+        # We only care about the first "Python Stacktrace" part of the saved
+        # graph. However, we cannot save the whole stack for comparison given
+        # it depends on a lot of things.
+        partial_graph = (r"Python Stacktrace:.*"
+                         r"mark_step \(.*/_lazy/__init__.py:[0-9]+\).*"
+                         r"_run_linear \(.*lazy/test_debug_util.py:[0-9]+\).*"
+                         r"test_get_python_frames \(.*lazy/test_debug_util.py:[0-9]+\)")
+
+        with tempfile.NamedTemporaryFile(mode="r+", encoding="utf-8") as graph_file:
+            os.environ["LTC_SAVE_TENSORS_FILE"] = graph_file.name
+            self._run_linear()
+            file = graph_file.read()
+            if re.search(partial_graph, file, re.DOTALL) is None:
+                print(file)
+                self.assertTrue(False)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/csrc/lazy/python/init.cpp b/torch/csrc/lazy/python/init.cpp
index 774df68e26def..0b773788eff95 100644
--- a/torch/csrc/lazy/python/init.cpp
+++ b/torch/csrc/lazy/python/init.cpp
@@ -305,6 +305,10 @@ void initLazyBindings(PyObject* module) {
 #endif // !(defined(FBCODE_CAFFE2) || defined(OVRSOURCE))
         return result;
       });
+
+  // When libtorch_python is loaded, we register the python frame getter
+  // otherwise, debug util simply omits python frames
+  GetPythonFramesFunction() = GetPythonFrames;
 }
 
 } // namespace lazy

From 68cfe3e33795ad0574dc24d82416afca57cf51ff Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 18 Nov 2022 03:45:53 +0000
Subject: [PATCH 1050/1922] [vision hash update] update the pinned vision hash
 (#89102)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89102
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index c9bfe60001af3..cc0724ac842d1 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-b1f6c9e271368cd84837522af39e68dd4b5768a7
+d710f3d1edc06afa244468cb96603ba6dbd4d9d5

From e0138a1b57735d8943e4cd049b2fcb39d2464530 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Thu, 17 Nov 2022 16:45:47 -0800
Subject: [PATCH 1051/1922] [quant][be] Move some helper functions to the top
 level to reduce function length (#89246)

Summary:
att

Test Plan:
python test/test_quantization.py TestQuantizeFx

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89246
Approved by: https://github.com/vkuzo
---
 torch/ao/quantization/fx/convert.py | 162 ++++++++++++++--------------
 1 file changed, 80 insertions(+), 82 deletions(-)

diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index 0c1249b4858d1..ca6ae61a4c97f 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -88,6 +88,83 @@
     "run_weight_observers",
 ]
 
+def _replace_observer_with_quantize_dequantize_node(
+        model: torch.nn.Module,
+        graph: Graph,
+        node: Node,
+        modules: Dict[str, torch.nn.Module],
+        node_name_to_scope: Dict[str, Tuple[str, type]],
+        node_name_to_qconfig: Dict[str, QConfigAny],
+        is_decomposed: bool) -> None:
+    """ Replace activation_post_process module call node with quantize and
+    dequantize node
+
+    Before:
+    ... -> observer_0(x) -> ...
+    After:
+    ... -> torch.quantize_per_tensor(x, ...) -> x.dequantize() -> ...
+    """
+    assert modules is not None
+    assert isinstance(node.target, str)
+    module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
+    observer_module = modules[node.target]
+    maybe_quantize_node_info = get_quantize_node_info(observer_module, is_decomposed)
+    # Skip replacing observers to quant/dequant nodes if the qconfigs of all
+    # consumers and producers of this observer are None
+    skip_replacement = all([
+        has_none_qconfig(n, node_name_to_qconfig) for n in
+        list(node.args) + list(node.users.keys())])
+    if skip_replacement or maybe_quantize_node_info is None:
+        # didn't find correponding quantize op and info for the observer_module
+        # so we just remove the observer
+        with graph.inserting_before(node):
+            node.replace_all_uses_with(node.args[0])
+            graph.erase_node(node)
+    else:
+        # otherwise, we can convert the observer moduel call to quantize/dequantize node
+        node_type, quantize_op, qparams = maybe_quantize_node_info
+        # replace observer node with quant - dequant node
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value in qparams.items():
+                # TODO: we can add the information of whether a value needs to
+                # be registered as an attribute in qparams dict itself
+                if key in ['_scale_', '_zero_point_']:
+                    # For scale and zero_point values we register them as buffers in the root module.
+                    # TODO: maybe need more complex attr name here
+                    qparam_node = create_getattr_from_value(model, graph, module_path + prefix + key, value)
+                    quantize_op_inputs.append(qparam_node)
+                else:
+                    # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
+                    quantize_op_inputs.append(value)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            if is_decomposed:
+                # use the same qparams from quantize op
+                dq_inputs = [quantized_node] + quantize_op_inputs[1:]
+                dequantized_node = graph.call_function(
+                    torch.ops.quantized_decomposed.dequantize_per_tensor,
+                    tuple(dq_inputs),
+                    {}
+                )
+            else:
+                dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
+
+# this is a temporary hack for custom module, we may want to implement
+# this properly after the custom module class design is finalized
+# TODO: DeQuantStubs are currently inserted only after custom module LSTM, while observers are inserted
+# after all other custom modules. In the future, we should simply insert QuantStubs before and DeQuantStubs
+# after custom modules in general, and replace these with "quantize" and "dequantize" nodes respectively.
+def _replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Graph):
+    call_custom_module_node = node.args[0]
+    assert isinstance(call_custom_module_node, Node), \
+        f"Expecting the for call custom module node to be a Node, but got {call_custom_module_node}"
+    node.replace_all_uses_with(call_custom_module_node)
+    graph.erase_node(node)
+    insert_dequantize_node(call_custom_module_node, graph)
 
 def restore_state(
         observed: torch.nn.Module
@@ -599,85 +676,6 @@ def convert(
         if node.op == 'placeholder':
             graph_inputs.append(node.name)
 
-    # TODO: move this outside of this function
-    def replace_observer_with_quantize_dequantize_node(
-            model: torch.nn.Module,
-            graph: Graph,
-            node: Node,
-            modules: Dict[str, torch.nn.Module],
-            node_name_to_scope: Dict[str, Tuple[str, type]],
-            node_name_to_qconfig: Dict[str, QConfigAny],
-            is_decomposed: bool) -> None:
-        """ Replace activation_post_process module call node with quantize and
-        dequantize node
-
-        Before:
-        ... -> observer_0(x) -> ...
-        After:
-        ... -> torch.quantize_per_tensor(x, ...) -> x.dequantize() -> ...
-        """
-        assert modules is not None
-        assert isinstance(node.target, str)
-        module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
-        observer_module = modules[node.target]
-        maybe_quantize_node_info = get_quantize_node_info(observer_module, is_decomposed)
-        # Skip replacing observers to quant/dequant nodes if the qconfigs of all
-        # consumers and producers of this observer are None
-        skip_replacement = all([
-            has_none_qconfig(n, node_name_to_qconfig) for n in
-            list(node.args) + list(node.users.keys())])
-        if skip_replacement or maybe_quantize_node_info is None:
-            # didn't find correponding quantize op and info for the observer_module
-            # so we just remove the observer
-            with graph.inserting_before(node):
-                node.replace_all_uses_with(node.args[0])
-                graph.erase_node(node)
-        else:
-            # otherwise, we can convert the observer moduel call to quantize/dequantize node
-            node_type, quantize_op, qparams = maybe_quantize_node_info
-            # replace observer node with quant - dequant node
-            with graph.inserting_before(node):
-                input_node = node.args[0]
-                quantize_op_inputs = [input_node]
-                for key, value in qparams.items():
-                    # TODO: we can add the information of whether a value needs to
-                    # be registered as an attribute in qparams dict itself
-                    if key in ['_scale_', '_zero_point_']:
-                        # For scale and zero_point values we register them as buffers in the root module.
-                        # TODO: maybe need more complex attr name here
-                        qparam_node = create_getattr_from_value(model, graph, module_path + prefix + key, value)
-                        quantize_op_inputs.append(qparam_node)
-                    else:
-                        # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
-                        quantize_op_inputs.append(value)
-
-                quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
-                if is_decomposed:
-                    # use the same qparams from quantize op
-                    dq_inputs = [quantized_node] + quantize_op_inputs[1:]
-                    dequantized_node = graph.call_function(
-                        torch.ops.quantized_decomposed.dequantize_per_tensor,
-                        tuple(dq_inputs),
-                        {}
-                    )
-                else:
-                    dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
-                node.replace_all_uses_with(dequantized_node)
-                graph.erase_node(node)
-
-    # this is a temporary hack for custom module, we may want to implement
-    # this properly after the custom module class design is finalized
-    # TODO: DeQuantStubs are currently inserted only after custom module LSTM, while observers are inserted
-    # after all other custom modules. In the future, we should simply insert QuantStubs before and DeQuantStubs
-    # after custom modules in general, and replace these with "quantize" and "dequantize" nodes respectively.
-    def replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Graph):
-        call_custom_module_node = node.args[0]
-        assert isinstance(call_custom_module_node, Node), \
-            f"Expecting the for call custom module node to be a Node, but got {call_custom_module_node}"
-        node.replace_all_uses_with(call_custom_module_node)
-        graph.erase_node(node)
-        insert_dequantize_node(call_custom_module_node, graph)
-
     # additional state to override inputs to be quantized, if specified
     # by the user
     placeholder_node_seen_cnt = 0
@@ -728,13 +726,13 @@ def replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Gra
             if _is_activation_post_process(mod):
                 observed_node = node.args[0]
                 if observed_node in statically_quantized_custom_module_nodes:
-                    replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
+                    _replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
                 else:
-                    replace_observer_with_quantize_dequantize_node(
+                    _replace_observer_with_quantize_dequantize_node(
                         model, model.graph, node, modules, node_name_to_scope,
                         node_name_to_qconfig, is_decomposed)
             elif isinstance(mod, DeQuantStub):
-                replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
+                _replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
             elif is_observed_standalone_module(mod):
                 convert_standalone_module(
                     node, modules, model, is_reference, backend_config)

From 9c545f7dd41f32d8188f04ee90775a99412cd558 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka <jakeszwe@meta.com>
Date: Fri, 18 Nov 2022 04:13:03 +0000
Subject: [PATCH 1052/1922] [Executorch] [Quantization] New pattern for dynamic
 dequant (#89236)

Summary: The op exposed should be qparams, and then we have concerns about prims not being supported so make q and dq ops that take in tensors

Test Plan: unit test

Differential Revision: D41382580

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89236
Approved by: https://github.com/jerryzh168
---
 .../core/test_quantized_tensor.py             | 14 ++++----
 torch/ao/quantization/fx/_decomposed.py       | 34 +++++++++++++++----
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index dab53de5b1075..a89c98f4e5ab1 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -14,7 +14,6 @@
 from torch.testing._internal.common_utils import TestCase, DeterministicGuard
 import torch.testing._internal.hypothesis_utils as hu
 from torch.testing._internal.common_quantization import get_supported_device_types
-from torch.ao.quantization import MinMaxObserver
 
 hu.assert_deadline_disabled()
 
@@ -1499,7 +1498,7 @@ def test_decomposed_dequantize(self):
         self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
         self.assertEqual(dequantized_X, dequantized_decomposed_X)
 
-    def test_decomposed_quantize_dynamic(self):
+    def test_decomposed_dynamic_quant_pattern(self):
         import torch.ao.quantization.fx._decomposed
         X = torch.randn(5, 10)
         dtype = torch.uint8
@@ -1510,14 +1509,13 @@ def test_decomposed_quantize_dynamic(self):
         quantized_X = torch.quantize_per_tensor(X, scale, zero_point, qdtype)
         dequantized_X = torch.dequantize(quantized_X)
 
-        quantized_decomposed_X = torch.ops.quantized_decomposed.quantize_per_tensor_dynamic(
+        # Now try decomposed pattern
+        (scale_decomposed, zero_point_decomposed) = torch.ops.quantized_decomposed.choose_qparams.tensor(
             X, quant_min, quant_max, dtype)
+        quantized_decomposed_X = torch.ops.quantized_decomposed.quantize_per_tensor.tensor(
+            X, scale_decomposed, zero_point_decomposed, quant_min, quant_max, dtype)
 
-        # observer logic is what quantize_per_tensor_dynamic does internally
-        observer = MinMaxObserver(quant_min=quant_min, quant_max=quant_max)
-        observer(X)
-        scale_decomposed, zero_point_decomposed = observer.calculate_qparams()
-        dequantized_decomposed_X = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        dequantized_decomposed_X = torch.ops.quantized_decomposed.dequantize_per_tensor.tensor(
             quantized_decomposed_X, scale_decomposed, zero_point_decomposed, quant_min, quant_max, dtype
         )
         self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py
index 3f4d38872e174..fcb4a77a5f499 100644
--- a/torch/ao/quantization/fx/_decomposed.py
+++ b/torch/ao/quantization/fx/_decomposed.py
@@ -1,5 +1,5 @@
 import torch
-from torch.library import impl, Library
+from torch.library import Library, impl
 from torch.ao.quantization import MinMaxObserver
 
 # Note: decomposed means decomposed quantized tensor, using decomposed so that the
@@ -38,6 +38,16 @@ def quantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
     inv_scale = 1.0 / scale
     return torch.clamp(torch.round(input * inv_scale) + zero_point, quant_min, quant_max).to(dtype)
 
+quantized_decomposed_lib.define(
+    "quantize_per_tensor.tensor("
+    "Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "quantize_per_tensor.tensor", "CompositeExplicitAutograd")
+def quantize_per_tensor_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
+    assert zero_point.numel() == 1, f"Exepecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Exepecting scale tensor to be one element, but received : {scale.numel()}"
+    return quantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
+
 # Note: quant_min/quant_max/dtype are not used in the operator, but for now it's kept in
 # the signature as metadata for the input Tensor, this might be useful for pattern
 # matching in the future
@@ -56,13 +66,25 @@ def dequantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype)
     else:
         raise ValueError(f"Unsupported dtype in dequantize_per_tensor: {dtype}")
 
+
 quantized_decomposed_lib.define(
-    "quantize_per_tensor_dynamic(Tensor input, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+    "dequantize_per_tensor.tensor("
+    "Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "dequantize_per_tensor.tensor", "CompositeExplicitAutograd")
+def dequantize_per_tensor_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
+    assert zero_point.numel() == 1, f"Exepecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Exepecting scale tensor to be one element, but received : {scale.numel()}"
+    return dequantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
 
-@impl(quantized_decomposed_lib, "quantize_per_tensor_dynamic", "CompositeExplicitAutograd")
-def quantize_per_tensor_dynamic(input, quant_min, quant_max, dtype):
+
+quantized_decomposed_lib.define(
+    "choose_qparams.tensor(Tensor input, int quant_min, int quant_max, ScalarType dtype) -> (Tensor, Tensor)")
+
+@impl(quantized_decomposed_lib, "choose_qparams.tensor", "CompositeExplicitAutograd")
+def choose_qparams_tensor(input, quant_min, quant_max, dtype):
     assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
-    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    assert quant_min < quant_max, f"Expecting quant_min to be smaller than quant_max but received min: {quant_min} max: {quant_max}"
 
     # Its weird to create an observer manually just to calculate qparams. I tried refactoring this functionality out of observer
     # into a util and then use that util directly, but I kept running into jit typing errors related to torch.qscheme not
@@ -71,4 +93,4 @@ def quantize_per_tensor_dynamic(input, quant_min, quant_max, dtype):
     observer = MinMaxObserver(quant_min=quant_min, quant_max=quant_max, dtype=tensor_dtype_to_observer_dtype[dtype])
     observer(input)
     scale, zero_point = observer.calculate_qparams()
-    return torch.ops.quantized_decomposed.quantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
+    return (scale, zero_point)

From 388219729468cf0ff97d9dd12279760048238ed2 Mon Sep 17 00:00:00 2001
From: Yoni Chechik <chechik.yoni@gmail.com>
Date: Fri, 18 Nov 2022 04:29:00 +0000
Subject: [PATCH 1053/1922] docs: conv2d `padding` attribute- add `int` option
 (#85004)

`padding: int` already exists but isn't mentioned in the genereted docs

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85004
Approved by: https://github.com/albanD, https://github.com/kit1980
---
 torch/nn/modules/conv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py
index 93a97f15e7c82..5c081e64eccae 100644
--- a/torch/nn/modules/conv.py
+++ b/torch/nn/modules/conv.py
@@ -340,7 +340,7 @@ class Conv2d(_ConvNd):
       number or a tuple.
 
     * :attr:`padding` controls the amount of padding applied to the input. It
-      can be either a string {{'valid', 'same'}} or a tuple of ints giving the
+      can be either a string {{'valid', 'same'}} or an int / a tuple of ints giving the
       amount of implicit padding applied on both sides.
 
     * :attr:`dilation` controls the spacing between the kernel points; also

From 5d49cac27ef24b6e6ab474d02c4109f808204417 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Thu, 17 Nov 2022 18:10:40 +0000
Subject: [PATCH 1054/1922] Fix tol for
 test_nvfuser_correctness__softmax_backward_data_cuda (#89178)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89178
Approved by: https://github.com/kit1980
---
 torch/testing/_internal/common_methods_invocations.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 24ef757b768da..50732af6f8578 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -420,7 +420,9 @@ def sample_inputs_softmax_backward_data(op_info, device, dtype, requires_grad, *
         input_dtypes += [torch.float16]
 
     for (shape, dim), input_dtype in product(cases, input_dtypes):
-        yield SampleInput(make_arg(shape), make_arg(shape), dim, input_dtype)
+        input = make_arg(shape)
+        output = torch.nn.functional.softmax(input, dim=dim, dtype=input_dtype)
+        yield SampleInput(make_arg(shape), output, dim, input_dtype)
 
 def sample_inputs_native_batch_norm(op_info, device, dtype, requires_grad, **kwargs):
     samples = sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs)
@@ -10596,6 +10598,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         skips=(
             DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples', device_type='cpu'),
             DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)),
+            DecorateInfo(toleranceOverride({torch.float16: tol(atol=2e-4, rtol=2e-3),
+                                            torch.bfloat16: tol(atol=1e-3, rtol=0.016)}),
+                         'TestCudaFuserOpInfo', 'test_nvfuser_correctness'),
         ),
     ),
     # `softmin` supports different dtypes based on whether `dtype` argument,

From 2127bd4c457f48cd93b1aa0457c7d85111f143b6 Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Thu, 17 Nov 2022 18:50:33 +0000
Subject: [PATCH 1055/1922] Symintify repeat_interleave.self_int (#89111)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89111
Approved by: https://github.com/ezyang
---
 .../functorch/BatchRulesDecompositions.cpp    |  2 +-
 aten/src/ATen/native/Repeat.cpp               | 17 +++++++---
 aten/src/ATen/native/native_functions.yaml    |  4 ++-
 .../cuda/NestedTensorTransformerFunctions.cpp |  4 +--
 .../quantized/FakeQuantPerTensorAffine.cpp    |  6 ++--
 c10/core/SymFloat.cpp                         |  8 +++++
 c10/core/SymFloat.h                           | 10 ++++++
 test/dynamo/test_dynamic_shapes.py            |  5 ---
 test/test_proxy_tensor.py                     |  3 +-
 torch/_prims/__init__.py                      | 10 +++---
 torch/csrc/utils/tensor_new.cpp               | 32 +++++++++++++++----
 11 files changed, 73 insertions(+), 28 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index e31b36d112418..05ee8d07a410e 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -184,7 +184,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE(positive);
   OP_DECOMPOSE(qr);
   OP_DECOMPOSE(ravel);
-  OP_DECOMPOSE2(repeat_interleave, self_int);
+  m.impl("repeat_interleave.self_int", native::repeat_interleave_symint);
   OP_DECOMPOSE2(repeat_interleave, self_Tensor);
   m.impl("reshape", native::reshape_symint);
   OP_DECOMPOSE(resolve_conj);
diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp
index b671a2232044b..c8c4e134929f9 100644
--- a/aten/src/ATen/native/Repeat.cpp
+++ b/aten/src/ATen/native/Repeat.cpp
@@ -75,11 +75,11 @@ Tensor repeat_interleave(
   }
 
   Tensor repeats_ = repeats;
-  if (repeats.dim() == 0 || (repeats.dim() == 1 && repeats.size(0) == 1)) {
-    repeats_ = repeats.reshape({1}).expand({input.size(dim.value())});
+  if (repeats.dim() == 0 || (repeats.dim() == 1 && repeats.sym_size(0) == 1)) {
+    repeats_ = repeats.reshape({1}).expand_symint({input.sym_size(dim.value())});
   } else if (repeats.dim() == 1) {
     TORCH_CHECK(
-        repeats.size(0) == input.size(dim.value()),
+        repeats.sym_size(0) == input.sym_size(dim.value()),
         "repeats must have the same size as input along dim")
   } else {
     AT_ERROR("repeats must be 0-dim or 1-dim tensor");
@@ -102,10 +102,17 @@ Tensor repeat_interleave(
     int64_t repeats,
     c10::optional<int64_t> dim,
     c10::optional<int64_t> output_size) {
-  at::Tensor repeats_ =
-      at::empty(1, self.options().dtype(at::kLong)).fill_(repeats);
+  at::Tensor repeats_ = at::empty(1, self.options().dtype(at::kLong)).fill_(repeats);
   return at::native::repeat_interleave(self, repeats_, dim, output_size);
 }
 
+Tensor repeat_interleave_symint(
+    const Tensor& self,
+    c10::SymInt repeats,
+    c10::optional<int64_t> dim,
+    c10::optional<int64_t> output_size) {
+    return at::native::repeat_interleave(self, repeats.guard_int(__FILE__, __LINE__), dim, output_size);
+  }
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index b1d1094667e17..5cf0e759db1d5 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4320,8 +4320,10 @@
 - func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, int? output_size=None) -> Tensor
   variants: function, method
 
-- func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None, *, int? output_size=None) -> Tensor
+- func: repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, int? output_size=None) -> Tensor
   variants: function, method
+  dispatch:
+    CompositeImplicitAutograd: repeat_interleave_symint
 
 - func: reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a)
   variants: function, method
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index 411ebdb19b5af..c2bf4e08ce042 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -152,8 +152,8 @@ Tensor NestedTensor_to_padded_tensor_cuda(
     if (t_dim == 3 && nt_input->opt_size(2) && (*nt_input->opt_size(2) > 0) &&
         !(output_size.has_value())) {
       Tensor nt_sizes = nt_input->get_nested_size_tensor();
-      Tensor sizes_dim1 = at::native::narrow(nt_sizes, 1, 0, 1);
-      Tensor sizes_dim2 = at::native::narrow(nt_sizes, 1, 1, 1);
+      Tensor sizes_dim1 = at::native::narrow_symint(nt_sizes, 1, 0, 1);
+      Tensor sizes_dim2 = at::native::narrow_symint(nt_sizes, 1, 1, 1);
       Tensor result = at::detail::make_tensor<NestedTensorImpl>(
           nt_input->get_buffer(), sizes_dim1 * sizes_dim2[0]);
       TORCH_INTERNAL_ASSERT_DEBUG_ONLY(result.dim() == 2);
diff --git a/aten/src/ATen/native/quantized/FakeQuantPerTensorAffine.cpp b/aten/src/ATen/native/quantized/FakeQuantPerTensorAffine.cpp
index 700b3b14b180c..aac039f0e03ef 100644
--- a/aten/src/ATen/native/quantized/FakeQuantPerTensorAffine.cpp
+++ b/aten/src/ATen/native/quantized/FakeQuantPerTensorAffine.cpp
@@ -122,10 +122,10 @@ Tensor fake_quantize_per_tensor_affine_cachemask_backward(
     const Tensor& dY,
     const Tensor& mask) {
   TORCH_CHECK(mask.scalar_type() == ScalarType::Bool);
-  TORCH_CHECK(mask.numel() == dY.numel(),
+  TORCH_CHECK(mask.sym_numel() == dY.sym_numel(),
       "`mask` and `dY` are not the same size: ",
-      "`mask` is size ", mask.numel(), " and `dY` is size ", dY.numel());
-  if (dY.numel() <= 0) {
+      "`mask` is size ", mask.sym_numel(), " and `dY` is size ", dY.sym_numel());
+  if (dY.sym_numel() <= 0) {
     return dY;
   }
   // Note: no additional kernels needed, since mask is pre-computed
diff --git a/c10/core/SymFloat.cpp b/c10/core/SymFloat.cpp
index 81e8f25d5bb64..511c50e3398ee 100644
--- a/c10/core/SymFloat.cpp
+++ b/c10/core/SymFloat.cpp
@@ -70,4 +70,12 @@ std::ostream& operator<<(std::ostream& os, const SymFloat& s) {
   return os;
 }
 
+double SymFloat::guard_float(const char* file, int64_t line) const {
+  if (!is_symbolic()) {
+    return data_;
+  }
+  SymNode a = toSymNodeImpl();
+  return a->guard_float(file, line);
+}
+
 } // namespace c10
diff --git a/c10/core/SymFloat.h b/c10/core/SymFloat.h
index 7da364ce127ad..ff9e101e31afb 100644
--- a/c10/core/SymFloat.h
+++ b/c10/core/SymFloat.h
@@ -40,6 +40,16 @@ class C10_API SymFloat {
   SymFloat operator*(const SymFloat&) const;
   SymFloat operator/(const SymFloat&) const;
 
+  // Insert a guard for the float to be its concrete value, and then return
+  // that value.  This operation always works, even if the float is symbolic,
+  // so long as we know what the underlying value is. Don't blindly put this
+  // everywhere; you can cause overspecialization of PyTorch programs with
+  // this method.
+  //
+  // It should be called as guard_float(__FILE__, __LINE__).  The file and line
+  // number can be used to diagnose overspecialization.
+  double guard_float(const char* file, int64_t line) const;
+
   // N.B. It's important to keep this definition in the header
   // as we expect if checks to be folded for mobile builds
   // where `is_symbolic` is always false
diff --git a/test/dynamo/test_dynamic_shapes.py b/test/dynamo/test_dynamic_shapes.py
index f3964a777aa82..2eb16784514d0 100644
--- a/test/dynamo/test_dynamic_shapes.py
+++ b/test/dynamo/test_dynamic_shapes.py
@@ -106,11 +106,6 @@ def make_dynamic_cls(cls):
     DynamicShapesUnspecTests.test_unspec_float_precision_dynamic_shapes
 )
 
-unittest.expectedFailure(
-    DynamicShapesReproTests.test_reformer_sorting_dynamic_shapes
-    # Unable to cast Python instance to C++ type
-)
-
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 8dc42be7fdfb2..21682ac76fc65 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1122,6 +1122,8 @@ def f(a, b, c, d, e):
     xfail('multinomial'),
     xfail('cholesky'),
     xfail('cholesky_inverse'),
+    # cannot do these as they rely on tensor data
+    xfail('repeat_interleave'),
     # ASAN failures due to divide by 0
     skip('nn.functional.nll_loss'),
 }
@@ -1283,7 +1285,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.pixel_unshuffle', ''),  # aten.pixel_unshuffle.default - couldn't find symbolic meta function/deco...
     xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.unfold', ''),  # aten.im2col.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
     xfail('nonzero', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
     xfail('norm', 'nuc'),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index da8d9af723acf..22917ec048eb9 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -2323,10 +2323,12 @@ def _arange_meta(
         step != 0,
         lambda: "step must be nonzero",
     )
-    utils.check(
-        math.isfinite(start) and math.isfinite(end),
-        lambda: f"unsupported range: {start} -> {end}",
-    )
+    # SymInts can't represent inf
+    if not isinstance(start, torch.SymInt) and not isinstance(end, torch.SymInt):
+        utils.check(
+            math.isfinite(start) and math.isfinite(end),
+            lambda: f"unsupported range: {start} -> {end}",
+        )
     utils.check(
         (step > 0 and end >= start) or (step < 0 and end <= start),
         lambda: "upper bound and lower bound inconsistent with step sign",
diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp
index 707ebeb19e846..83506346505e0 100644
--- a/torch/csrc/utils/tensor_new.cpp
+++ b/torch/csrc/utils/tensor_new.cpp
@@ -79,10 +79,10 @@ Tensor new_with_sizes(
     c10::TensorOptions options,
     at::ScalarType scalar_type,
     const optional<Device>& device,
-    IntArrayRef sizes) {
+    c10::SymIntArrayRef sizes) {
   maybe_initialize_cuda(options.device());
   pybind11::gil_scoped_release no_gil;
-  return torch::empty(sizes, build_options(options, scalar_type, device));
+  return at::empty_symint(sizes, build_options(options, scalar_type, device));
 }
 
 Tensor new_with_storage(
@@ -124,6 +124,12 @@ std::vector<int64_t> compute_sizes(PyObject* seq, ScalarType scalar_type) {
 }
 
 ScalarType infer_scalar_type(PyObject* obj) {
+  if (torch::is_symint(obj)) {
+    return ScalarType::Long;
+  }
+  if (torch::is_symfloat(obj)) {
+    return ScalarType::Double;
+  }
 #ifdef USE_NUMPY
   if (is_numpy_available()) {
     if (PyArray_Check(obj)) {
@@ -204,7 +210,21 @@ void recursive_store(
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data != nullptr);
 
   int64_t ndim = sizes.size();
+  bool is_symfloat = torch::is_symfloat(obj);
+  bool is_symint = torch::is_symint(obj);
   if (dim == ndim) {
+    if (is_symfloat) {
+      auto new_obj = py::reinterpret_borrow<py::object>(obj);
+      auto val = new_obj.cast<c10::SymFloat>();
+      *(double*)data = val.guard_float(__FILE__, __LINE__);
+      return;
+    }
+    if (is_symint) {
+      auto new_obj = py::reinterpret_borrow<py::object>(obj);
+      auto val = new_obj.cast<c10::SymInt>();
+      *(int64_t*)data = val.guard_int(__FILE__, __LINE__);
+      return;
+    }
     torch::utils::store_scalar(data, scalarType, obj);
     return;
   }
@@ -531,7 +551,7 @@ Tensor legacy_sparse_tensor_generic_ctor_new(
       "new(*, int64_t cdata)|hidden",
       "new(Tensor indices, Tensor values, *, Device? device=None)",
       "new(Tensor indices, Tensor values, IntArrayRef size, *, Device? device=None)",
-      "new(IntArrayRef size, *, Device? device=None)",
+      "new(SymIntArrayRef size, *, Device? device=None)",
   });
   if (ctor_or_new == CtorOrNew::NEW)
     check_base_legacy_new(dispatch_key, c10::kSparse);
@@ -577,7 +597,7 @@ Tensor legacy_sparse_tensor_generic_ctor_new(
       }
     }
     return new_with_sizes(
-        options, scalar_type, r.deviceOptional(1), r.intlist(0));
+        options, scalar_type, r.deviceOptional(1), r.symintlist(0));
   }
   throw std::runtime_error("new(): invalid arguments");
 }
@@ -615,7 +635,7 @@ Tensor legacy_tensor_generic_ctor_new(
                                                           // matching with
                                                           // IntArrayRef,
                                                           // PyObject*
-      "new(IntArrayRef size, *, Device? device=None)",
+      "new(SymIntArrayRef size, *, Device? device=None)",
       "new(PyObject* data, *, Device? device=None)",
   });
 
@@ -690,7 +710,7 @@ Tensor legacy_tensor_generic_ctor_new(
           options, scalar_type, deviceOptional, r.pyobject(0));
     }
     return new_with_sizes(
-        options, scalar_type, r.deviceOptional(1), r.intlist(0));
+        options, scalar_type, r.deviceOptional(1), r.symintlist(0));
   } else if (r.idx == 6) {
     auto deviceOptional = r.deviceOptional(1);
     check_legacy_ctor_device(dispatch_key, deviceOptional);

From d323e42f41f1e75c60b099eed696d9c90faa28c5 Mon Sep 17 00:00:00 2001
From: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>
Date: Fri, 18 Nov 2022 05:08:45 +0000
Subject: [PATCH 1056/1922] Fix consistentcy of histc on CPU and CUDA (#87832)

Fixes #87657

The main reason why `histc` returns slightly different outputs is the difference on how bin position is calculate.
The CPU calculates it as: https://github.com/pytorch/pytorch/blob/449778a939f2adc8867c5035b08be4e2d88339d8/aten/src/ATen/native/cpu/HistogramKernel.cpp#L168-L170
which is basically `(i - a) / (b - a) * N`, while cuda code https://github.com/pytorch/pytorch/blob/449778a939f2adc8867c5035b08be4e2d88339d8/aten/src/ATen/native/cuda/SummaryOps.cu#L41
 which is `(i - a) * N / (b - a)`.

For some cases like in #87657 the order of arithmetic operations matters due to the floating point round-off.

________________

Not sure where would be the most appropriate place to put the unit test. Hope `test_reductions::test_histc` will do.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87832
Approved by: https://github.com/soumith
---
 aten/src/ATen/native/cpu/HistogramKernel.cpp | 4 ++--
 test/test_reductions.py                      | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp
index 932bf9beb4993..83011aa2e9a79 100644
--- a/aten/src/ATen/native/cpu/HistogramKernel.cpp
+++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp
@@ -166,8 +166,8 @@ void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges,
                      * the appropriate bin via simple division.
                      */
                     pos = static_cast<int64_t>((elt - leftmost_edge[dim])
-                            / (rightmost_edge[dim] - leftmost_edge[dim])
-                            * (num_bin_edges[dim] - 1));
+                            * (num_bin_edges[dim] - 1)
+                            / (rightmost_edge[dim] - leftmost_edge[dim]));
 
                     /* Ensures consistency with bin_edges by checking the bins to the left and right
                      * of the selected position. Necessary for cases in which an element very close
diff --git a/test/test_reductions.py b/test/test_reductions.py
index a4be31cd6f929..8d91f56545f01 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -2843,6 +2843,9 @@ def test_against_np(tensor, bins=100, min=0, max=0):
         expanded = torch.randn(1, 5, 1, 2, device=device).expand(3, 5, 7, 2)
         test_against_np(expanded)
 
+        linear = torch.linspace(0, 0.99 - 5.0e-7, 101).to(device)
+        test_against_np(linear, bins=20, min=0, max=0.99)
+
     @onlyCPU
     def test_histc_bfloat16(self, device):
         actual = torch.histc(

From 60e84c6ff090e32c8c626676acbdc02b0d5b7e9b Mon Sep 17 00:00:00 2001
From: Zain Rizvi <zainr@fb.com>
Date: Fri, 18 Nov 2022 07:03:22 +0000
Subject: [PATCH 1057/1922] Always retry curl downloads (#89157)

Modify our curl commands so that they always retry downloads.

By default, curl only retries what it considers to be "transient" errors, based on the server's response. However, curl's estimate of what's transient is very conservative.  By adding the --retry-all-errors parameter we'll always retry curl commands.

In particular, I'm hoping this mitigates errors where curl fails with the below error ([logs](https://github.com/pytorch/pytorch/actions/runs/3468758110/jobs/5794939941))
`curl: (35) OpenSSL SSL_connect: SSL_ERROR_SYSCALL in connection to ossci-linux.s3.amazonaws.com:443`

Some of the modified downloads didn't even have retries, so I added them in

More details: https://everything.curl.dev/usingcurl/downloads/retry
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89157
Approved by: https://github.com/kit1980, https://github.com/malfet
---
 .circleci/docker/common/install_cudnn.sh         |  4 ++--
 .circleci/docker/common/install_docs_reqs.sh     |  4 ++--
 .circleci/docker/common/install_protobuf.sh      |  2 +-
 .circleci/scripts/binary_install_miniconda.sh    |  4 ++--
 .circleci/scripts/binary_ios_build.sh            |  2 +-
 .circleci/scripts/binary_ios_upload.sh           |  2 +-
 .circleci/scripts/driver_update.bat              |  2 +-
 .circleci/scripts/setup_ci_environment.sh        |  4 ++--
 .../scripts/setup_linux_system_environment.sh    |  2 +-
 .circleci/scripts/vs_install.ps1                 |  2 +-
 .circleci/scripts/vs_install_cmath.ps1           |  2 +-
 .circleci/scripts/windows_cudnn_install.sh       |  2 +-
 .../templates/macos_binary_build_workflow.yml.j2 |  4 ++--
 .github/workflows/_ios-build-test.yml            |  2 +-
 .github/workflows/_mac-build.yml                 |  2 +-
 ...enerated-macos-arm64-binary-conda-nightly.yml | 12 ++++++------
 ...enerated-macos-arm64-binary-wheel-nightly.yml | 12 ++++++------
 .../generated-macos-binary-conda-nightly.yml     | 16 ++++++++--------
 ...d-macos-binary-libtorch-cxx11-abi-nightly.yml | 16 ++++++++--------
 ...d-macos-binary-libtorch-pre-cxx11-nightly.yml | 16 ++++++++--------
 .../generated-macos-binary-wheel-nightly.yml     | 16 ++++++++--------
 .jenkins/pytorch/common_utils.sh                 |  8 ++++++--
 .../installation-helpers/activate_miniconda3.bat |  2 +-
 .../installation-helpers/install_magma.bat       |  2 +-
 .../installation-helpers/install_mkl.bat         |  2 +-
 .../installation-helpers/install_sccache.bat     |  4 ++--
 scripts/buck_setup.sh                            |  6 +++---
 third_party/gloo                                 |  2 +-
 third_party/pybind11                             |  2 +-
 29 files changed, 80 insertions(+), 76 deletions(-)

diff --git a/.circleci/docker/common/install_cudnn.sh b/.circleci/docker/common/install_cudnn.sh
index 4a8829b1cba11..f68fc6946c2eb 100644
--- a/.circleci/docker/common/install_cudnn.sh
+++ b/.circleci/docker/common/install_cudnn.sh
@@ -6,9 +6,9 @@ if [[ ${CUDNN_VERSION} == 8 ]]; then
     CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive"
     if [[ ${CUDA_VERSION:0:4} == "11.7" ]]; then
         CUDNN_NAME="cudnn-linux-x86_64-8.5.0.96_cuda11-archive"
-        curl -OLs https://ossci-linux.s3.amazonaws.com/${CUDNN_NAME}.tar.xz
+        curl --retry 3 -OLs https://ossci-linux.s3.amazonaws.com/${CUDNN_NAME}.tar.xz
     else
-        curl -OLs  https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz
+        curl --retry 3 -OLs  https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz
     fi
 
     tar xf ${CUDNN_NAME}.tar.xz
diff --git a/.circleci/docker/common/install_docs_reqs.sh b/.circleci/docker/common/install_docs_reqs.sh
index 1adc9e8009a02..e60171208ae1a 100644
--- a/.circleci/docker/common/install_docs_reqs.sh
+++ b/.circleci/docker/common/install_docs_reqs.sh
@@ -7,10 +7,10 @@ if [ -n "$KATEX" ]; then
   # Ignore error if gpg-agent doesn't exist (for Ubuntu 16.04)
   apt-get install -y gpg-agent || :
 
-  curl -sL https://deb.nodesource.com/setup_12.x | sudo -E bash -
+  curl --retry 3 -sL https://deb.nodesource.com/setup_12.x | sudo -E bash -
   sudo apt-get install -y nodejs
 
-  curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
+  curl --retry 3 -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
   echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
 
   apt-get update
diff --git a/.circleci/docker/common/install_protobuf.sh b/.circleci/docker/common/install_protobuf.sh
index 9d9f6c40ba0cf..4b7a7a6ac23f7 100755
--- a/.circleci/docker/common/install_protobuf.sh
+++ b/.circleci/docker/common/install_protobuf.sh
@@ -12,7 +12,7 @@ install_protobuf_317() {
   #   g++: error: ./../lib64/crti.o: No such file or directory
   ln -s /usr/lib64 "$pb_dir/lib64"
 
-  curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz"
+  curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
   tar -xvz -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
   # -j6 to balance memory usage and speed.
   # naked `-j` seems to use too much memory.
diff --git a/.circleci/scripts/binary_install_miniconda.sh b/.circleci/scripts/binary_install_miniconda.sh
index 43eb006742aed..3541a32ac6bf9 100755
--- a/.circleci/scripts/binary_install_miniconda.sh
+++ b/.circleci/scripts/binary_install_miniconda.sh
@@ -31,9 +31,9 @@ fi
 
 conda_sh="$workdir/install_miniconda.sh"
 if [[ "$(uname)" == Darwin ]]; then
-  curl --retry 3 -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+  curl --retry 3 --retry-all-errors -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 else
-  curl --retry 3 -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+  curl --retry 3 --retry-all-errors -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
 fi
 chmod +x "$conda_sh"
 "$conda_sh" -b -p "$MINICONDA_ROOT"
diff --git a/.circleci/scripts/binary_ios_build.sh b/.circleci/scripts/binary_ios_build.sh
index 6c7674ed510ee..4bb5ea28af733 100644
--- a/.circleci/scripts/binary_ios_build.sh
+++ b/.circleci/scripts/binary_ios_build.sh
@@ -8,7 +8,7 @@ PROJ_ROOT=/Users/distiller/project
 export TCLLIBPATH="/usr/local/lib"
 
 # Install conda
-curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+curl --retry 3 --retry-all-errors -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 chmod +x ~/conda.sh
 /bin/bash ~/conda.sh -b -p ~/anaconda
 export PATH="~/anaconda/bin:${PATH}"
diff --git a/.circleci/scripts/binary_ios_upload.sh b/.circleci/scripts/binary_ios_upload.sh
index da38065847eff..7949dc9170b0e 100644
--- a/.circleci/scripts/binary_ios_upload.sh
+++ b/.circleci/scripts/binary_ios_upload.sh
@@ -47,7 +47,7 @@ echo "${IOS_NIGHTLY_BUILD_VERSION}" > version.txt
 zip -r ${ZIPFILE} install src version.txt LICENSE
 # upload to aws
 # Install conda then 'conda install' awscli
-curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+curl --retry 3 --retry-all-errors -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 chmod +x ~/conda.sh
 /bin/bash ~/conda.sh -b -p ~/anaconda
 export PATH="~/anaconda/bin:${PATH}"
diff --git a/.circleci/scripts/driver_update.bat b/.circleci/scripts/driver_update.bat
index 46c05475cdba8..fb87743666213 100644
--- a/.circleci/scripts/driver_update.bat
+++ b/.circleci/scripts/driver_update.bat
@@ -1,5 +1,5 @@
 set "DRIVER_DOWNLOAD_LINK=https://s3.amazonaws.com/ossci-windows/452.39-data-center-tesla-desktop-win10-64bit-international.exe"
-curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 452.39-data-center-tesla-desktop-win10-64bit-international.exe
+curl --retry 3 --retry-all-errors -kL %DRIVER_DOWNLOAD_LINK% --output 452.39-data-center-tesla-desktop-win10-64bit-international.exe
 if errorlevel 1 exit /b 1
 
 start /wait 452.39-data-center-tesla-desktop-win10-64bit-international.exe -s -noreboot
diff --git a/.circleci/scripts/setup_ci_environment.sh b/.circleci/scripts/setup_ci_environment.sh
index e8dd9ab7195b9..42a605cd44451 100755
--- a/.circleci/scripts/setup_ci_environment.sh
+++ b/.circleci/scripts/setup_ci_environment.sh
@@ -40,8 +40,8 @@ if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then
   # Taken directly from https://github.com/NVIDIA/nvidia-docker
   # Add the package repositories
   distribution=$(. /etc/os-release;echo "$ID$VERSION_ID")
-  curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
-  curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+  curl -s -L --retry 3 --retry-all-errors https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+  curl -s -L --retry 3 --retry-all-errors "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
 
   retry sudo apt-get update -qq
   # Necessary to get the `--gpus` flag to function within docker
diff --git a/.circleci/scripts/setup_linux_system_environment.sh b/.circleci/scripts/setup_linux_system_environment.sh
index ce64076e2d64b..780f7c1bd3790 100755
--- a/.circleci/scripts/setup_linux_system_environment.sh
+++ b/.circleci/scripts/setup_linux_system_environment.sh
@@ -2,7 +2,7 @@
 set -eux -o pipefail
 
 # Set up CircleCI GPG keys for apt, if needed
-curl --retry 3 -s -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add -
+curl --retry 3 --retry-all-errors -s -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add -
 
 # Stop background apt updates.  Hypothetically, the kill should not
 # be necessary, because stop is supposed to send a kill signal to
diff --git a/.circleci/scripts/vs_install.ps1 b/.circleci/scripts/vs_install.ps1
index a2e373078adb6..4bbbc24bb0437 100644
--- a/.circleci/scripts/vs_install.ps1
+++ b/.circleci/scripts/vs_install.ps1
@@ -29,7 +29,7 @@ if (Test-Path "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswher
 }
 
 echo "Downloading VS installer from S3."
-curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe
+curl.exe --retry 3 --retry-all-errors -kL $VS_DOWNLOAD_LINK --output vs_installer.exe
 if ($LASTEXITCODE -ne 0) {
     echo "Download of the VS 2019 Version ${env:VS_VERSION} installer failed"
     exit 1
diff --git a/.circleci/scripts/vs_install_cmath.ps1 b/.circleci/scripts/vs_install_cmath.ps1
index c2998eba25217..62b637ec21b82 100644
--- a/.circleci/scripts/vs_install_cmath.ps1
+++ b/.circleci/scripts/vs_install_cmath.ps1
@@ -1,5 +1,5 @@
 $CMATH_DOWNLOAD_LINK = "https://raw.githubusercontent.com/microsoft/STL/12c684bba78f9b032050526abdebf14f58ca26a3/stl/inc/cmath"
 $VC14_28_INSTALL_PATH="C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.28.29910\include"
 
-curl.exe --retry 3 -kL $CMATH_DOWNLOAD_LINK --output "$home\cmath"
+curl.exe --retry 3 --retry-all-errors -kL $CMATH_DOWNLOAD_LINK --output "$home\cmath"
 Move-Item -Path "$home\cmath" -Destination "$VC14_28_INSTALL_PATH" -Force
diff --git a/.circleci/scripts/windows_cudnn_install.sh b/.circleci/scripts/windows_cudnn_install.sh
index c279259e83416..bbf45a3290b37 100644
--- a/.circleci/scripts/windows_cudnn_install.sh
+++ b/.circleci/scripts/windows_cudnn_install.sh
@@ -36,7 +36,7 @@ else
     tmp_dir=$(mktemp -d)
     (
         pushd "${tmp_dir}"
-        curl --retry 3 -o "${cudnn_installer_name}" "$cudnn_installer_link"
+        curl --retry 3 --retry-all-errors -o "${cudnn_installer_name}" "$cudnn_installer_link"
         7z x "${cudnn_installer_name}" -ocudnn
         # Use '${var:?}/*' to avoid potentially expanding to '/*'
         # Remove all of the directories before attempting to copy files
diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2
index 95802252a4f98..eb0c2ff4b3734 100644
--- a/.github/templates/macos_binary_build_workflow.yml.j2
+++ b/.github/templates/macos_binary_build_workflow.yml.j2
@@ -69,7 +69,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -84,7 +84,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
diff --git a/.github/workflows/_ios-build-test.yml b/.github/workflows/_ios-build-test.yml
index e9b5461dde7fd..269ad3f153ca4 100644
--- a/.github/workflows/_ios-build-test.yml
+++ b/.github/workflows/_ios-build-test.yml
@@ -68,7 +68,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index faf069e7a7c35..5ee909f02c222 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -116,7 +116,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
             echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}"
diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
index ce32755e32098..c88b107a90a94 100644
--- a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
@@ -67,7 +67,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -103,7 +103,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -177,7 +177,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -213,7 +213,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -287,7 +287,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -323,7 +323,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
index 7a7df02efe891..c8858fd0501bd 100644
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@@ -67,7 +67,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -103,7 +103,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -177,7 +177,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -213,7 +213,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -287,7 +287,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -323,7 +323,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml
index ba3697e3fef91..52cfb3d98f764 100644
--- a/.github/workflows/generated-macos-binary-conda-nightly.yml
+++ b/.github/workflows/generated-macos-binary-conda-nightly.yml
@@ -65,7 +65,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -101,7 +101,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -175,7 +175,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -211,7 +211,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -285,7 +285,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -321,7 +321,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -395,7 +395,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -431,7 +431,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
index 381e0a4c73ad7..cd9ad45ba5610 100644
--- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml
@@ -69,7 +69,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -105,7 +105,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -184,7 +184,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -220,7 +220,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -299,7 +299,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -335,7 +335,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -414,7 +414,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -450,7 +450,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
index 55b28480a7545..4ce5c6f32c36d 100644
--- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
+++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11-nightly.yml
@@ -69,7 +69,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -105,7 +105,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -184,7 +184,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -220,7 +220,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -299,7 +299,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -335,7 +335,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -414,7 +414,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -450,7 +450,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml
index f4baf9129b690..a3839d6e8a142 100644
--- a/.github/workflows/generated-macos-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-binary-wheel-nightly.yml
@@ -65,7 +65,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -101,7 +101,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -175,7 +175,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -211,7 +211,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -285,7 +285,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -321,7 +321,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
@@ -395,7 +395,7 @@ jobs:
       - name: Install conda and dependencies
         run: |
           # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
           chmod +x "${RUNNER_TEMP}/conda.sh"
           /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
           echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
@@ -431,7 +431,7 @@ jobs:
           max_attempts: 3
           retry_wait_seconds: 90
           command: |
-            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
             sudo chmod +x /usr/local/bin/sccache
             echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
       - name: Populate binary env
diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh
index 7fc1dd6c0f1a9..6d3c96b9278f7 100644
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.jenkins/pytorch/common_utils.sh
@@ -9,6 +9,10 @@ log() { printf '%s\n' "$*"; }
 error() { log "ERROR: $*" >&2; }
 fatal() { error "$@"; exit 1; }
 
+retry () {
+    "$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@")
+}
+
 # compositional trap taken from https://stackoverflow.com/a/7287873/23845
 # appends a command to a trap
 #
@@ -78,12 +82,12 @@ function get_exit_code() {
 function get_bazel() {
   if [[ $(uname) == "Darwin" ]]; then
     # download bazel version
-    curl https://github.com/bazelbuild/bazel/releases/download/4.2.1/bazel-4.2.1-darwin-x86_64  -Lo tools/bazel
+    retry curl https://github.com/bazelbuild/bazel/releases/download/4.2.1/bazel-4.2.1-darwin-x86_64  -Lo tools/bazel
     # verify content
     echo '74d93848f0c9d592e341e48341c53c87e3cb304a54a2a1ee9cff3df422f0b23c  tools/bazel' | shasum -a 256 -c >/dev/null
   else
     # download bazel version
-    curl https://ossci-linux.s3.amazonaws.com/bazel-4.2.1-linux-x86_64 -o tools/bazel
+    retry curl https://ossci-linux.s3.amazonaws.com/bazel-4.2.1-linux-x86_64 -o tools/bazel
     # verify content
     echo '1a4f3a3ce292307bceeb44f459883859c793436d564b95319aacb8af1f20557c  tools/bazel' | shasum -a 256 -c >/dev/null
   fi
diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
index e6660a17b3890..0552d85a407a5 100644
--- a/.jenkins/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
+++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
@@ -13,7 +13,7 @@ if not exist %CONDA_PARENT_DIR%\Miniconda3 (
 )
 
 if "%INSTALL_FRESH_CONDA%"=="1" (
-  curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
+  curl --retry 3 --retry-all-errors -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe --output %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe
   if errorlevel 1 exit /b
   if not errorlevel 0 exit /b
 
diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat
index d9f3ab1cf8211..d0fbf5b20d888 100644
--- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat
+++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat
@@ -24,7 +24,7 @@ if "%CUDA_SUFFIX%" == "" (
 
 if "%REBUILD%"=="" (
   if "%BUILD_ENVIRONMENT%"=="" (
-    curl --retry 3 -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z
+    curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z
   ) else (
     aws s3 cp s3://ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet
   )
diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat
index c700a04a1e4af..6c676d1baeded 100644
--- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat
+++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_mkl.bat
@@ -1,6 +1,6 @@
 if "%REBUILD%"=="" (
   if "%BUILD_ENVIRONMENT%"=="" (
-    curl --retry 3 -k https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z --output %TMP_DIR_WIN%\mkl.7z
+    curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z --output %TMP_DIR_WIN%\mkl.7z
   ) else (
     aws s3 cp s3://ossci-windows/mkl_2020.2.254.7z %TMP_DIR_WIN%\mkl.7z --quiet
   )
diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_sccache.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_sccache.bat
index 0165604400ddc..6f8cc15ba8684 100644
--- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_sccache.bat
+++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_sccache.bat
@@ -7,8 +7,8 @@ if "%REBUILD%"=="" (
     del %TMP_DIR_WIN%\bin\sccache.exe || ver > nul
     del %TMP_DIR_WIN%\bin\sccache-cl.exe || ver > nul
     if "%BUILD_ENVIRONMENT%"=="" (
-      curl --retry 3 -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %TMP_DIR_WIN%\bin\sccache.exe
-      curl --retry 3 -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output %TMP_DIR_WIN%\bin\sccache-cl.exe
+      curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %TMP_DIR_WIN%\bin\sccache.exe
+      curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output %TMP_DIR_WIN%\bin\sccache-cl.exe
     ) else (
       aws s3 cp s3://ossci-windows/sccache.exe %TMP_DIR_WIN%\bin\sccache.exe
       aws s3 cp s3://ossci-windows/sccache-cl.exe %TMP_DIR_WIN%\bin\sccache-cl.exe
diff --git a/scripts/buck_setup.sh b/scripts/buck_setup.sh
index 8e60d92a5fd15..331a299564167 100644
--- a/scripts/buck_setup.sh
+++ b/scripts/buck_setup.sh
@@ -22,16 +22,16 @@ python3 generate-xnnpack-wrappers.py
 # bazel-skylib
 printf "\nDownloading bazel-skylib\n"
 rm -rf bazel-skylib; mkdir bazel-skylib
-curl -L $PROXY https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz|tar zx -C bazel-skylib
+curl --retry 3 --retry-all-errors -L $PROXY https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz|tar zx -C bazel-skylib
 
 # glog
 printf "\nDownloading glog\n"
 rm -rf glog; mkdir glog
-curl -L $PROXY https://github.com/google/glog/archive/v0.4.0.tar.gz | tar zx -C glog --strip-components 1
+curl --retry 3 --retry-all-errors -L $PROXY https://github.com/google/glog/archive/v0.4.0.tar.gz | tar zx -C glog --strip-components 1
 
 # ruy
 printf "\nDownloading ruy\n"
-curl -L $PROXY -o /tmp/ruy.zip https://github.com/google/ruy/archive/a09683b8da7164b9c5704f88aef2dc65aa583e5d.zip
+curl --retry 3 --retry-all-errors -L $PROXY -o /tmp/ruy.zip https://github.com/google/ruy/archive/a09683b8da7164b9c5704f88aef2dc65aa583e5d.zip
 unzip -q /tmp/ruy.zip -d /tmp/
 rm -rf ruy/
 mv /tmp/ruy-a09683b8da7164b9c5704f88aef2dc65aa583e5d ruy/
diff --git a/third_party/gloo b/third_party/gloo
index 4a5e339b76426..5b14351326313 160000
--- a/third_party/gloo
+++ b/third_party/gloo
@@ -1 +1 @@
-Subproject commit 4a5e339b764261d20fc409071dc7a8b8989aa195
+Subproject commit 5b143513263133af2b95547e97c07cebeb72bf72
diff --git a/third_party/pybind11 b/third_party/pybind11
index 80dc998efced8..aa304c9c7d725 160000
--- a/third_party/pybind11
+++ b/third_party/pybind11
@@ -1 +1 @@
-Subproject commit 80dc998efced8ceb2be59756668a7e90e8bef917
+Subproject commit aa304c9c7d725ffb9d10af08a3b34cb372307020

From ec7a5e0e94a03ae9ce5535967a53e8d9e3f4ee7a Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Thu, 17 Nov 2022 14:31:43 -0800
Subject: [PATCH 1058/1922] [xnnpack][executorch] remove unordered_set from
 xnn_compiler (#89231)

Removing unrodered_set from xnncompiler for executorch.

While some STL libraries are unavoidable, and I think it should be ok for delegate to pull these libraries, unordered_set wasn't really needed, and we should be serializing the number of external ids anyways

After this, the backend classes should be good to hg copy into executorch

Differential Revision: [D41227391](https://our.internmc.facebook.com/intern/diff/D41227391/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89231
Approved by: https://github.com/salilsdesai, https://github.com/cccclai
---
 .../jit/backends/xnnpack/compiler/xnn_compiler.cpp   | 12 +-----------
 .../jit/backends/xnnpack/compiler/xnn_compiler.h     |  1 -
 .../jit/backends/xnnpack/serialization/schema.fbs    |  3 +++
 .../backends/xnnpack/serialization/serializer.cpp    |  4 +++-
 .../jit/backends/xnnpack/serialization/serializer.h  |  3 ++-
 .../jit/backends/xnnpack/xnnpack_graph_builder.cpp   |  6 +++++-
 6 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
index 49e2804c99a93..0f654dff0ac00 100644
--- a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
@@ -4,7 +4,6 @@
 #include <torch/csrc/jit/backends/xnnpack/serialization/schema_generated.h>
 
 #include <ATen/Utils.h>
-#include <unordered_set>
 
 namespace torch {
 namespace jit {
@@ -25,17 +24,8 @@ void XNNCompiler::compileModel(
 
   // create xnnpack subgraph
   xnn_subgraph_t subgraph_ptr = nullptr;
-
-  // TODO: @maxren serialize extern_ids in flatbuffer schema
-  std::unordered_set<uint32_t> extern_ids;
-  for (auto input_id : *flatbuffer_graph->input_ids()) {
-    extern_ids.insert(input_id);
-  }
-  for (auto output_id : *flatbuffer_graph->output_ids()) {
-    extern_ids.insert(output_id);
-  }
   status = xnn_create_subgraph(
-      /*external_value_ids=*/extern_ids.size(),
+      /*external_value_ids=*/flatbuffer_graph->num_externs(),
       /*flags=*/0,
       &subgraph_ptr);
   TORCH_CHECK(xnn_status_success == status, "Failed to create xnn subgraph");
diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
index e87fcbcd063d9..f74e784111d4f 100644
--- a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.h
@@ -3,7 +3,6 @@
 #include <caffe2/torch/csrc/jit/backends/xnnpack/executor/xnn_executor.h>
 #include <xnnpack.h>
 #include <memory>
-#include <string>
 #include <vector>
 
 namespace torch {
diff --git a/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
index 87ebe20a825a6..cc1290b718fac 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
+++ b/torch/csrc/jit/backends/xnnpack/serialization/schema.fbs
@@ -75,6 +75,9 @@ table XNNGraph {
   xnodes:[XNode];
   xvalues:[XValue];
 
+  // Number of external inputs/outputs
+  num_externs:uint;
+
   // Ids of external inputs
   input_ids:[uint];
 
diff --git a/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp b/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
index df1ccc791781a..63cb62c5698ea 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
+++ b/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
@@ -67,12 +67,14 @@ void XNNSerializer::serializeTensorValue(
 
 std::string XNNSerializer::finishAndSerialize(
     std::vector<uint32_t> input_ids,
-    std::vector<uint32_t> output_ids) {
+    std::vector<uint32_t> output_ids,
+    size_t num_extern_ids) {
   auto xnnGraph = CreateXNNGraphDirect(
       _builder,
       _version_sha1,
       &_nodes,
       &_values,
+      num_extern_ids,
       &input_ids,
       &output_ids,
       &_constantBuffer,
diff --git a/torch/csrc/jit/backends/xnnpack/serialization/serializer.h b/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
index 6d01571d424d3..08a3875d32673 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
+++ b/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
@@ -51,7 +51,8 @@ class XNNSerializer {
   // finish and serialize xnngraph returning serialized data
   std::string finishAndSerialize(
       std::vector<uint32_t> input_ids,
-      std::vector<uint32_t> output_ids);
+      std::vector<uint32_t> output_ids,
+      size_t num_extern_ids);
 
  private:
   // xnnpack version we are serializing
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
index 4eaefea569605..45a4bd2fa7954 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
@@ -129,16 +129,20 @@ void XNNGraph::checkOpsToDelegate(std::shared_ptr<torch::jit::Graph>& graph) {
 std::string XNNGraph::serializedXNNGraph() {
   std::vector<uint32_t> input_ids;
   std::vector<uint32_t> output_ids;
+  std::unordered_set<uint32_t> num_externs;
 
   for (auto val : _inputs) {
     input_ids.push_back(_val_to_ids[val]);
+    num_externs.emplace(_val_to_ids[val]);
   }
 
   for (auto val : _outputs) {
     output_ids.push_back(_val_to_ids[val]);
+    num_externs.emplace(_val_to_ids[val]);
   }
 
-  return _serializer.finishAndSerialize(input_ids, output_ids);
+  return _serializer.finishAndSerialize(
+      input_ids, output_ids, num_externs.size());
 }
 
 std::vector<std::vector<long>> XNNGraph::getGraphOutputShapes() {

From 9673b1631ca2a9120d8d26766e51dbaecd0fe41b Mon Sep 17 00:00:00 2001
From: PumeTu <pumetuchinda@gmail.com>
Date: Fri, 18 Nov 2022 07:24:33 +0000
Subject: [PATCH 1059/1922] Add support trace on MPS backend (#87910)

Fixes [#87221](https://github.com/pytorch/pytorch/issues/87221)
`trace` now supported on MPS

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87910
Approved by: https://github.com/kulinseth, https://github.com/malfet
---
 .../ATen/native/mps/operations/ReduceOps.mm   | 31 ++++++++++++++++++-
 aten/src/ATen/native/native_functions.yaml    |  1 +
 test/test_mps.py                              | 22 ++++++++++---
 3 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index c99f22d89295f..39680240f7f21 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -27,7 +27,8 @@
   SUM,
   PROD,
   MEAN,
-  COUNT_NONZERO
+  COUNT_NONZERO,
+  TRACE
 };
 
 
@@ -239,6 +240,14 @@ void set_axes_and_shapes(const Tensor& input_t,
             castOutputTensor = [mpsGraph reductionMinimumWithTensor:inputTensor
                                                                axes:axes
                                                                name:nil];
+          } else if(reduction_type == MPSReductionType::TRACE) {
+            MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:inputTensor
+                                                                     numLower:0
+                                                                     numUpper:0
+                                                                         name:nil];
+            castOutputTensor = [mpsGraph reductionSumWithTensor:bandPartWithTensor
+                                                           axes:@[@0, @1]
+                                                           name:nil];
           }
 
           MPSGraphTensor* outputTensor = nil;
@@ -287,6 +296,26 @@ void set_axes_and_shapes(const Tensor& input_t,
     reduction_out_mps(input_t, opt_dim, keepdim, dtype, output_t, MPSReductionType::SUM, "sum_out_mps");
 }
 
+Tensor trace_mps_out(const Tensor& self) {
+
+    Tensor output_t = at::native::empty_mps(
+                      {},
+                      self.scalar_type(),
+                      c10::nullopt,
+                      kMPS,
+                      c10::nullopt,
+                      c10::nullopt);
+
+    std::vector<int64_t> dims(self.dim());
+    std::iota(dims.begin(), dims.end(), 0);
+
+    reduction_out_mps(self, IntArrayRef(dims), false, c10::nullopt, const_cast<Tensor&>(output_t), MPSReductionType::TRACE, "trace_mps_out");
+
+  return output_t;
+
+
+}
+
 TORCH_IMPL_FUNC(prod_out_mps)
    (const Tensor& input_t,
     int64_t dim,
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 5cf0e759db1d5..f625c9faff412 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -7790,6 +7790,7 @@
   dispatch:
     CPU: trace_cpu
     CUDA: trace_cuda
+    MPS: trace_mps_out
   autogen: trace.out
 
 - func: trace_backward(Tensor grad, SymInt[] sizes) -> Tensor
diff --git a/test/test_mps.py b/test/test_mps.py
index aeddea0d21f3d..7ec8ac9d6baae 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -315,6 +315,16 @@ def test_bmm(self):
         self.assertEqual(output_cpu, output_mps)
         self.assertEqual(output_cpu.size(), output_mps.size())
 
+    def test_trace(self):
+        M_cpu = torch.randn(3, 3)
+        M_mps = M_cpu.detach().clone().to("mps")
+
+        output_cpu = torch.trace(M_cpu)
+        output_mps = torch.trace(M_mps)
+
+        self.assertEqual(output_cpu, output_mps)
+        self.assertEqual(output_cpu.size(), output_mps.size())
+
     def test_addbmm(self):
         M_cpu = torch.randn(3, 5)
         batch1_cpu = torch.randn(10, 3, 4)
@@ -5058,10 +5068,14 @@ def test_conv_expand(self):
 
     # The test should not crash
     def test_permute(self):
-        X = torch.randn(5, 5).to('mps')
-        torch.log(X)
-        X = X.permute(1, 0)
-        torch.log(X)
+        M_cpu = torch.randn(5, 5)
+        M_mps = M_cpu.to('mps')
+
+        output_cpu = M_cpu.permute(1, 0)
+        output_mps = M_mps.permute(1, 0)
+
+        self.assertEqual(output_cpu, output_mps)
+        self.assertEqual(output_cpu.size(), output_mps.size())
 
     # Printing of non_contiguous should not crash
     def test_print_non_contiguous(self):

From 74de3d9054c362b6fc2a107f51e108eab906ec83 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Fri, 18 Nov 2022 07:31:10 +0000
Subject: [PATCH 1060/1922] [flaky] relax tolerance conv1d_vs_scipy (#89193)

Fixes https://github.com/pytorch/pytorch/issues/89087

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89193
Approved by: https://github.com/kit1980
---
 test/nn/test_convolution.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py
index c94eb5447d5ad..a30a276439754 100644
--- a/test/nn/test_convolution.py
+++ b/test/nn/test_convolution.py
@@ -1412,7 +1412,7 @@ def _test(t, weight, mode):
             if mode == 'same':
                 actual = actual[:feat_dim]
 
-            self.assertEqual(actual, expected)
+            self.assertEqual(actual, expected, atol=2e-5, rtol=2e-5)
 
         # Global dtype for this test suite is torch.double
         # This leads to change in type-promotion

From abf97b22bb3c4a1aef0b9e945c21dc493e1fbc5b Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 18 Nov 2022 07:39:16 +0000
Subject: [PATCH 1061/1922] Gate CUDA-only inductor tests by HAS_CUDA (#89251)

This is to prevent these tests from running on platform where CUDA doesn't exist such as macos. And they are quite flaky https://hud.pytorch.org/failure/test_linear_permute_fusion_cpu there failing the CI from time to time

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89251
Approved by: https://github.com/soumith, https://github.com/desertfire
---
 test/inductor/test_torchinductor.py | 149 ++++++++++++++--------------
 1 file changed, 73 insertions(+), 76 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index fb7ca1fc92b73..f2b1caeb32ea4 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1618,82 +1618,6 @@ def fn(a, b):
         y = torch.tensor(0)
         self.assertEqual(fn(x, y), x + x)
 
-    @unittest.skipIf(HAS_CPU, "Support GPU so far")
-    def test_linear_permute_fusion(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self, k: int, n: int):
-                super().__init__()
-                self.weight = torch.nn.Parameter(torch.randn(n, k))
-                self.bias = torch.nn.Parameter(torch.randn(n))
-
-            def forward(self, input: torch.Tensor):
-                a0 = torch.nn.functional.linear(input, self.weight, self.bias)
-                b0 = a0.permute(0, 2, 1)
-                return b0
-
-        m, k, n = 16, 8, 4
-        trace_func = chain_passes(torch.fx.symbolic_trace, linear_permute_fusion)
-        module = TestModule(k, n).eval()
-        input = torch.randn(6, m, k)
-        traced = trace_func(module, [input])
-        num_linear = count_call_function(traced, torch.nn.functional.linear)
-        num_linear_transpose = count_call_function(traced, linear_transpose)
-        self.assertEqual(num_linear, 0)
-        self.assertEqual(num_linear_transpose, 1)
-
-        self.assertTrue(torch.allclose(module(input), traced(input)))
-
-    @unittest.skipIf(HAS_CPU, "Support GPU so far")
-    def test_permute_linear_fusion(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self, k: int, n: int):
-                super().__init__()
-                self.weight = torch.nn.Parameter(torch.randn(n, k))
-                self.bias = torch.nn.Parameter(torch.randn(n))
-
-            def forward(self, input: torch.Tensor):
-                input1 = input.permute(0, 2, 1)
-                output = torch.nn.functional.linear(input1, self.weight, self.bias)
-                return output
-
-        m, k, n = 16, 8, 4
-
-        trace_func = chain_passes(torch.fx.symbolic_trace, permute_linear_fusion)
-        module = TestModule(k, n).eval()
-        input = torch.randn(6, k, m)
-        traced = trace_func(module, [input])
-        num_linear = count_call_function(traced, torch.nn.functional.linear)
-        num_transpose_linear = count_call_function(traced, transpose_linear)
-        self.assertEqual(num_linear, 0)
-        self.assertEqual(num_transpose_linear, 1)
-
-        self.assertTrue(torch.allclose(module(input), traced(input)))
-
-    @unittest.skipIf(HAS_CPU, "Support GPU so far")
-    def test_permute_bmm_fusion(self):
-        class TestModule(torch.nn.Module):
-            def __init__(self, batch: int, k: int, n: int):
-                super().__init__()
-                self.other = torch.randn(batch, k, n)
-
-            def forward(self, input: torch.Tensor):
-                input1 = input.permute(0, 2, 1)
-                output = torch.bmm(input1, self.other)
-                return output
-
-        batch, m, k, n = 6, 16, 8, 4
-
-        trace_func = chain_passes(torch.fx.symbolic_trace, permute_matmul_fusion)
-        module = TestModule(batch, k, n).eval()
-        input = torch.randn(batch, k, m)
-        traced = trace_func(module, [input])
-        num_bmm = count_call_function(traced, torch.bmm)
-        num_transpose_matmul = count_call_function(traced, transpose_matmul)
-        self.assertEqual(num_bmm, 0)
-        self.assertEqual(num_transpose_matmul, 1)
-
-        self.assertTrue(torch.allclose(module(input), traced(input)))
-
     def test_slice1(self):
         def fn(a):
             return (
@@ -4710,6 +4634,79 @@ def fn(a):
                 fn, (torch.randn(2, 3, 10, 5, 6, device="cuda")[:, :, 2::2, :, :],)
             )
 
+        def test_linear_permute_fusion(self):
+            class TestModule(torch.nn.Module):
+                def __init__(self, k: int, n: int):
+                    super().__init__()
+                    self.weight = torch.nn.Parameter(torch.randn(n, k))
+                    self.bias = torch.nn.Parameter(torch.randn(n))
+
+                def forward(self, input: torch.Tensor):
+                    a0 = torch.nn.functional.linear(input, self.weight, self.bias)
+                    b0 = a0.permute(0, 2, 1)
+                    return b0
+
+            m, k, n = 16, 8, 4
+            trace_func = chain_passes(torch.fx.symbolic_trace, linear_permute_fusion)
+            module = TestModule(k, n).eval()
+            input = torch.randn(6, m, k)
+            traced = trace_func(module, [input])
+            num_linear = count_call_function(traced, torch.nn.functional.linear)
+            num_linear_transpose = count_call_function(traced, linear_transpose)
+            self.assertEqual(num_linear, 0)
+            self.assertEqual(num_linear_transpose, 1)
+
+            self.assertTrue(torch.allclose(module(input), traced(input)))
+
+        def test_permute_linear_fusion(self):
+            class TestModule(torch.nn.Module):
+                def __init__(self, k: int, n: int):
+                    super().__init__()
+                    self.weight = torch.nn.Parameter(torch.randn(n, k))
+                    self.bias = torch.nn.Parameter(torch.randn(n))
+
+                def forward(self, input: torch.Tensor):
+                    input1 = input.permute(0, 2, 1)
+                    output = torch.nn.functional.linear(input1, self.weight, self.bias)
+                    return output
+
+            m, k, n = 16, 8, 4
+
+            trace_func = chain_passes(torch.fx.symbolic_trace, permute_linear_fusion)
+            module = TestModule(k, n).eval()
+            input = torch.randn(6, k, m)
+            traced = trace_func(module, [input])
+            num_linear = count_call_function(traced, torch.nn.functional.linear)
+            num_transpose_linear = count_call_function(traced, transpose_linear)
+            self.assertEqual(num_linear, 0)
+            self.assertEqual(num_transpose_linear, 1)
+
+            self.assertTrue(torch.allclose(module(input), traced(input)))
+
+        def test_permute_bmm_fusion(self):
+            class TestModule(torch.nn.Module):
+                def __init__(self, batch: int, k: int, n: int):
+                    super().__init__()
+                    self.other = torch.randn(batch, k, n)
+
+                def forward(self, input: torch.Tensor):
+                    input1 = input.permute(0, 2, 1)
+                    output = torch.bmm(input1, self.other)
+                    return output
+
+            batch, m, k, n = 6, 16, 8, 4
+
+            trace_func = chain_passes(torch.fx.symbolic_trace, permute_matmul_fusion)
+            module = TestModule(batch, k, n).eval()
+            input = torch.randn(batch, k, m)
+            traced = trace_func(module, [input])
+            num_bmm = count_call_function(traced, torch.bmm)
+            num_transpose_matmul = count_call_function(traced, transpose_matmul)
+            self.assertEqual(num_bmm, 0)
+            self.assertEqual(num_transpose_matmul, 1)
+
+            self.assertTrue(torch.allclose(module(input), traced(input)))
+
     CommonTemplate.install(CudaTests, "cuda")
 
     class CudaReproTests(TestCase):

From 14db7a7e45c654d3b0d9411725ec08bd48f3b80e Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Fri, 18 Nov 2022 07:46:35 +0000
Subject: [PATCH 1062/1922] Disable tracing `zero_grad()` (#88731)

Tracing through zero grad is slow, and doesn't provide any benefits.

Helps https://github.com/pytorch/torchdynamo/issues/1803

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88731
Approved by: https://github.com/anijain2305
---
 test/dynamo/test_optimizers.py | 2 +-
 torch/_dynamo/eval_frame.py    | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py
index 2f204a7a11999..90b8cfaaad7b3 100644
--- a/test/dynamo/test_optimizers.py
+++ b/test/dynamo/test_optimizers.py
@@ -125,7 +125,7 @@ def training_iter_fn(batch, model, optimizer):
         batch = {"x": input1, "y": input2}
         for _ in range(2):
             opt_training_iter_fn(batch, net, optimizer)
-        self.assertEqual(cnts.frame_count, 2)
+        self.assertEqual(cnts.frame_count, 1)
 
 
 if __name__ == "__main__":
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 6b500a87bc32f..538f6131d62b1 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -730,6 +730,7 @@ def patch():
             opt._cuda_graph_capture_health_check = disable(
                 opt._cuda_graph_capture_health_check
             )
+            opt.zero_grad = disable(opt.zero_grad)
             # disable any currently set hooks
             # Note: we only want to disable the profiling hook
             # which is the *last* hook applied, we want to keep the no_grad hook

From 6d9957b63ba8241b0830ee38bd93b80624bf5b86 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Thu, 17 Nov 2022 19:20:22 -0800
Subject: [PATCH 1063/1922] [ao] maintain BC for is_activation_post_process
 (#89260)

Summary: tests are failing due to code packaged with trained models calling now defunct function names (is_activation_post_process).

this diff maintains BC temporarily until the cached code can be refreshed

Test Plan: no functional change

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89260
Approved by: https://github.com/jerryzh168
---
 torch/ao/quantization/quantize.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
index b9ef24e35fdbb..51eb2c1c1ec65 100644
--- a/torch/ao/quantization/quantize.py
+++ b/torch/ao/quantization/quantize.py
@@ -27,7 +27,12 @@
     float_qparams_weight_only_qconfig_4bit,
     _activation_is_memoryless)
 from torch.nn.utils.parametrize import type_before_parametrizations
-from torch.ao.quantization.observer import _is_activation_post_process
+
+from torch.ao.quantization.observer import (  # noqa: F401
+    _is_activation_post_process,
+    _is_activation_post_process as is_activation_post_process,
+    # TODO remove this once problems from name change are resolved
+)
 
 __all__ = [
     "get_default_custom_config_dict",

From f5f76e110c6c643ce043f5dcbe1da346697ef36c Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Fri, 18 Nov 2022 09:28:46 +0000
Subject: [PATCH 1064/1922] [LTC] Upstream short_metrics (#89186)

Summary:
This pull request upstreams pytorch/xla#4148.

Test Plan:
xla CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89186
Approved by: https://github.com/JackCaoG
---
 test/lazy/test_ts_opinfo.py      | 13 +++++++++++--
 torch/csrc/lazy/core/metrics.cpp | 32 +++++++++++++++++++++++++++++++-
 torch/csrc/lazy/core/metrics.h   |  5 +++++
 3 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/test/lazy/test_ts_opinfo.py b/test/lazy/test_ts_opinfo.py
index 2e67035581477..092ba3d0388d0 100644
--- a/test/lazy/test_ts_opinfo.py
+++ b/test/lazy/test_ts_opinfo.py
@@ -71,20 +71,28 @@ def init_lists():
         'linalg_pinv.atol_rtol_tensor',
         'logsumexp',
     ])
+    # For some ops, we don't support all variants. Here we use formatted_name
+    # to uniquely identify the variant.
+    SKIP_VARIANT_LIST = set([
+        'norm_nuc',
+        'min_reduction_with_dim'
+    ])
 
     return (LAZY_OPS_LIST,
             FALLBACK_LIST,
             SKIP_RUNTIME_ERROR_LIST,
             SKIP_INCORRECT_RESULTS_LIST,
             FUNCTIONAL_DECOMPOSE_LIST,
-            HAS_SYMINT_SUFFIX)
+            HAS_SYMINT_SUFFIX,
+            SKIP_VARIANT_LIST)
 
 (LAZY_OPS_LIST,
  FALLBACK_LIST,
  SKIP_RUNTIME_ERROR_LIST,
  SKIP_INCORRECT_RESULTS_LIST,
  FUNCTIONAL_DECOMPOSE_LIST,
- HAS_SYMINT_SUFFIX) = init_lists()
+ HAS_SYMINT_SUFFIX,
+ SKIP_VARIANT_LIST) = init_lists()
 
 torch.manual_seed(42)
 
@@ -166,6 +174,7 @@ class TestLazyOpInfo(TestCase):
           if op.name in LAZY_OPS_LIST
           and op.name not in SKIP_RUNTIME_ERROR_LIST
           and op.name not in FUNCTIONAL_DECOMPOSE_LIST
+          and op.formatted_name not in SKIP_VARIANT_LIST
           ], allowed_dtypes=(torch.float,))
     def test_dispatched_to_lazy(self, device, dtype, op):
         def get_name(op):
diff --git a/torch/csrc/lazy/core/metrics.cpp b/torch/csrc/lazy/core/metrics.cpp
index cb8120c1d45c9..86758edc4dfce 100644
--- a/torch/csrc/lazy/core/metrics.cpp
+++ b/torch/csrc/lazy/core/metrics.cpp
@@ -172,7 +172,9 @@ std::vector<std::string> MetricsArena::GetCounterNames() {
   std::vector<std::string> names;
   std::lock_guard<std::mutex> lock(lock_);
   for (auto& name_data : counters_) {
-    names.push_back(name_data.first);
+    if (name_data.second->Value() > 0) {
+      names.push_back(name_data.first);
+    }
   }
   return names;
 }
@@ -353,6 +355,34 @@ std::string CreateMetricReport() {
   return ss.str();
 }
 
+std::string CreateMetricReport(
+    const std::vector<std::string>& counter_names,
+    const std::vector<std::string>& metric_names) {
+  MetricsArena* arena = MetricsArena::Get();
+  std::stringstream ss;
+  for (const std::string& metric_name : metric_names) {
+    MetricData* data = arena->GetMetric(metric_name);
+    if (data && data->TotalSamples() > 0) {
+      EmitMetricInfo(metric_name, data, &ss);
+    }
+  }
+  for (const std::string& counter_name : counter_names) {
+    CounterData* data = arena->GetCounter(counter_name);
+    if (data && data->Value() > 0) {
+      EmitCounterInfo(counter_name, data, &ss);
+    }
+  }
+  static std::string fall_back_counter_prefix = "aten::";
+  arena->ForEachCounter([&ss](const std::string& name, CounterData* data) {
+    if (name.rfind(fall_back_counter_prefix, 0) == 0 && data->Value() > 0) {
+      // it might emit duplicated counter if user also specified exact aten
+      // counter in the `counter_names` but it should be very rare.
+      EmitCounterInfo(name, data, &ss);
+    }
+  });
+  return ss.str();
+}
+
 std::vector<std::string> GetMetricNames() {
   return MetricsArena::Get()->GetMetricNames();
 }
diff --git a/torch/csrc/lazy/core/metrics.h b/torch/csrc/lazy/core/metrics.h
index 43fb617c1ba16..1d629c4973db8 100644
--- a/torch/csrc/lazy/core/metrics.h
+++ b/torch/csrc/lazy/core/metrics.h
@@ -216,6 +216,11 @@ class TORCH_API Counter {
 // Creates a report with the current metrics statistics.
 TORCH_API std::string CreateMetricReport();
 
+// Creates a report with the selected metrics statistics.
+TORCH_API std::string CreateMetricReport(
+    const std::vector<std::string>& counter_names,
+    const std::vector<std::string>& metric_names);
+
 // Returns the currently registered metric names. Note that the list can grow
 // since metrics are usually function intialized (they are static function
 // variables).

From 03de71e833aabe115dfe21fceae981df50509117 Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Fri, 18 Nov 2022 09:49:36 +0000
Subject: [PATCH 1065/1922] [PT-D][Checkpoint] Update import and update
 docstring for distributed checkpoint (#89256)

Update test import and docstring as we have moved distributed checkpointing from torch.distributed._shard.checkpoint to torch.distributed.checkpoint (https://github.com/pytorch/pytorch/pull/88698).

Test: CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89256
Approved by: https://github.com/fduwjj
---
 .../distributed/checkpoint/test_checkpoint.py | 125 +++++++++---------
 .../fsdp/test_distributed_checkpoint.py       |   2 +-
 .../checkpoint/state_dict_loader.py           |   4 +-
 .../checkpoint/state_dict_saver.py            |   4 +-
 4 files changed, 68 insertions(+), 67 deletions(-)

diff --git a/test/distributed/checkpoint/test_checkpoint.py b/test/distributed/checkpoint/test_checkpoint.py
index 167fdc5e7154c..96c98116328c4 100644
--- a/test/distributed/checkpoint/test_checkpoint.py
+++ b/test/distributed/checkpoint/test_checkpoint.py
@@ -2,9 +2,9 @@
 
 import sys
 from typing import Optional, List, cast
-from torch.distributed._shard.checkpoint.storage import WriteResult
+from torch.distributed.checkpoint.storage import WriteResult
 
-from torch.distributed._shard.checkpoint import (
+from torch.distributed.checkpoint import (
     StorageReader,
     StorageWriter,
     CheckpointException,
@@ -63,6 +63,7 @@
     )
     sys.exit(0)
 
+
 class TestModule(torch.nn.Module):
     def __init__(self) -> None:
         super().__init__()
@@ -121,34 +122,44 @@ def test_default_metadata(self) -> None:
         )
 
         state_dict = {
-            'sharded': sharded_tensor.rand(spec, (10, 10, )),
-            'replicated': torch.rand(4, device=device),
-            'bytes': [1, 2, 3, 4],
+            "sharded": sharded_tensor.rand(
+                spec,
+                (
+                    10,
+                    10,
+                ),
+            ),
+            "replicated": torch.rand(4, device=device),
+            "bytes": [1, 2, 3, 4],
         }
 
         metadata = _create_default_local_metadata(state_dict)
-        self.assertTrue('bytes' in metadata.state_dict_metadata)
-        self.assertIsInstance(metadata.state_dict_metadata['bytes'], BytesStorageMetadata)
+        self.assertTrue("bytes" in metadata.state_dict_metadata)
+        self.assertIsInstance(
+            metadata.state_dict_metadata["bytes"], BytesStorageMetadata
+        )
 
-        self.assertTrue('replicated' in metadata.state_dict_metadata)
-        self.assertIsInstance(metadata.state_dict_metadata['replicated'], TensorStorageMetadata)
-        md = metadata.state_dict_metadata['replicated']
-        self.assertEqual(md.size, state_dict['replicated'].size())
+        self.assertTrue("replicated" in metadata.state_dict_metadata)
+        self.assertIsInstance(
+            metadata.state_dict_metadata["replicated"], TensorStorageMetadata
+        )
+        md = metadata.state_dict_metadata["replicated"]
+        self.assertEqual(md.size, state_dict["replicated"].size())
         self.assertEqual(md.properties.dtype, torch.float32)
         self.assertEqual(1, len(md.chunks))
 
-        self.assertTrue('sharded' in metadata.state_dict_metadata)
-        self.assertIsInstance(metadata.state_dict_metadata['sharded'], TensorStorageMetadata)
-        md = metadata.state_dict_metadata['sharded']
+        self.assertTrue("sharded" in metadata.state_dict_metadata)
+        self.assertIsInstance(
+            metadata.state_dict_metadata["sharded"], TensorStorageMetadata
+        )
+        md = metadata.state_dict_metadata["sharded"]
         self.assertEqual(md.properties.dtype, torch.float32)
-        self.assertEqual(md.size, state_dict['sharded'].size())
+        self.assertEqual(md.size, state_dict["sharded"].size())
         self.assertEqual(2, len(md.chunks))
 
+
 class TestStorageBase:
-    def __init__(
-        self,
-        fail_conf
-    ):
+    def __init__(self, fail_conf):
         self.fail_conf = fail_conf
         self.rank = 0 if not dist.is_initialized() else dist.get_rank()
 
@@ -164,16 +175,16 @@ def _fail_rank_async(self, name, result=None):
         ranks = self._get_ranks(name)
         fut = Future()
         if ranks is not None and self.rank in ranks:
-            fut.set_exception(ValueError(f"async rank fail {self.rank} for {name}"))
+            fut.set_exception(
+                ValueError(f"async rank fail {self.rank} for {name}")
+            )
         else:
             fut.set_result(result)
         return fut
 
+
 class FaultyStorageWriter(TestStorageBase, StorageWriter):
-    def __init__(
-        self,
-        fail_conf
-    ):
+    def __init__(self, fail_conf):
         super(FaultyStorageWriter, self).__init__(fail_conf)
 
     def init(self, is_coordinator: bool) -> None:
@@ -188,23 +199,19 @@ def prepare_global_plan(self, plans: List[SavePlan]) -> List[SavePlan]:
         return plans
 
     def write_data(
-        self,
-        plan: SavePlan,
-        planner: SavePlanner
+        self, plan: SavePlan, planner: SavePlanner
     ) -> Future[List[WriteResult]]:
         self._fail_rank("fail_write_data")
         return self._fail_rank_async("fail_write_data_async", [])
 
-    def finish(self, metadata: Metadata, results: List[List[WriteResult]]) -> None:
+    def finish(
+        self, metadata: Metadata, results: List[List[WriteResult]]
+    ) -> None:
         self._fail_rank("fail_finish")
 
 
 class FaultyStorageReader(TestStorageBase, StorageReader):
-    def __init__(
-        self,
-        metadata,
-        fail_conf
-    ):
+    def __init__(self, metadata, fail_conf):
         super(FaultyStorageReader, self).__init__(fail_conf)
         self.metadata = metadata
 
@@ -219,11 +226,7 @@ def prepare_global_plan(self, plans: List[LoadPlan]) -> List[LoadPlan]:
         self._fail_rank("fail_prepare_global_plan")
         return plans
 
-    def read_data(
-        self,
-        plan: LoadPlan,
-        planner: LoadPlanner
-    ) -> Future[None]:
+    def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
         self._fail_rank("fail_read_data")
         return self._fail_rank_async("fail_read_data_async")
 
@@ -231,13 +234,14 @@ def read_metadata(self) -> Metadata:
         self._fail_rank("fail_read_metadata")
         return self.metadata
 
+
 class TestDistributedFailure(ShardedTensorTestBase):
     def get_spec(self):
         return ChunkShardingSpec(
             dim=0,
             placements=[
                 f"rank:{r}/cuda:{r}" for r in range(dist.get_world_size())
-            ]
+            ],
         )
 
     @with_comms(init_rpc=False)
@@ -245,9 +249,9 @@ def get_spec(self):
     @requires_nccl()
     def test_dummy_writer_works(self) -> None:
         state_dict = {
-            'sharded': sharded_tensor.rand(self.get_spec(), 20, 20),
-            'replicated': torch.rand(10, 10),
-            'bytes': [1, 2, 3, 4]
+            "sharded": sharded_tensor.rand(self.get_spec(), 20, 20),
+            "replicated": torch.rand(10, 10),
+            "bytes": [1, 2, 3, 4],
         }
 
         save_state_dict(state_dict, FaultyStorageWriter({}))
@@ -257,9 +261,9 @@ def test_dummy_writer_works(self) -> None:
     @requires_nccl()
     def test_dummy_reader_works(self) -> None:
         state_dict = {
-            'sharded': sharded_tensor.rand(self.get_spec(), 20, 20),
-            'replicated': torch.rand(10, 10),
-            'bytes': [1, 2, 3, 4]
+            "sharded": sharded_tensor.rand(self.get_spec(), 20, 20),
+            "replicated": torch.rand(10, 10),
+            "bytes": [1, 2, 3, 4],
         }
         metadata = _create_default_local_metadata(state_dict)
 
@@ -283,8 +287,10 @@ def _test_dist_failure(self, callback, kwargs):
 
             failed_ranks = e.failures.keys()
             for rank in bad_ranks:
-                self.assertTrue(rank in failed_ranks, msg=f"{rank} was supposed to fail was fine")
-
+                self.assertTrue(
+                    rank in failed_ranks,
+                    msg=f"{rank} was supposed to fail was fine",
+                )
 
     def _test_save(self, state_dict, coordinator=0, **kwargs):
         no_dist = not dist.is_initialized()
@@ -296,6 +302,7 @@ def _save():
                 coordinator_rank=coordinator,
                 no_dist=no_dist,
             )
+
         self._test_dist_failure(_save, kwargs)
 
     def _test_load(self, state_dict, coordinator=0, **kwargs):
@@ -317,9 +324,9 @@ def _load():
     @requires_nccl()
     def test_save_error_handling(self) -> None:
         state_dict = {
-            'sharded': sharded_tensor.rand(self.get_spec(), 20, 20),
-            'replicated': torch.rand(10, 10),
-            'bytes': [1, 2, 3, 4]
+            "sharded": sharded_tensor.rand(self.get_spec(), 20, 20),
+            "replicated": torch.rand(10, 10),
+            "bytes": [1, 2, 3, 4],
         }
 
         self._test_save(state_dict, fail_init=[0])
@@ -334,10 +341,7 @@ def test_save_error_handling(self) -> None:
         self._test_save(state_dict, coordinator=1, fail_finish=[1])
 
     def test_save_error_handling_no_dist(self) -> None:
-        state_dict = {
-            'replicated': torch.rand(10, 10),
-            'bytes': [1, 2, 3, 4]
-        }
+        state_dict = {"replicated": torch.rand(10, 10), "bytes": [1, 2, 3, 4]}
 
         self.assertFalse(dist.is_initialized())
 
@@ -354,9 +358,9 @@ def test_save_error_handling_no_dist(self) -> None:
     @requires_nccl()
     def test_load_error_handling(self) -> None:
         state_dict = {
-            'sharded': sharded_tensor.rand(self.get_spec(), 20, 20),
-            'replicated': torch.rand(10, 10),
-            'bytes': [1, 2, 3, 4]
+            "sharded": sharded_tensor.rand(self.get_spec(), 20, 20),
+            "replicated": torch.rand(10, 10),
+            "bytes": [1, 2, 3, 4],
         }
 
         self._test_load(state_dict)
@@ -373,12 +377,8 @@ def test_load_error_handling(self) -> None:
         self._test_load(state_dict, coordinator=3, fail_read_data_async=[2])
         self._test_load(state_dict, coordinator=1, fail_prepare_global_plan=[1])
 
-
     def test_load_error_handling_no_dist(self) -> None:
-        state_dict = {
-            'replicated': torch.rand(10, 10),
-            'bytes': [1, 2, 3, 4]
-        }
+        state_dict = {"replicated": torch.rand(10, 10), "bytes": [1, 2, 3, 4]}
         self._test_load(state_dict)
         self._test_load(state_dict, fail_init=[0])
         self._test_load(state_dict, fail_read_metadata=[0])
@@ -387,5 +387,6 @@ def test_load_error_handling_no_dist(self) -> None:
         self._test_load(state_dict, fail_read_data=[0])
         self._test_load(state_dict, fail_read_data_async=[0])
 
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/distributed/fsdp/test_distributed_checkpoint.py b/test/distributed/fsdp/test_distributed_checkpoint.py
index e64fd358a305e..3e9b967e0d114 100644
--- a/test/distributed/fsdp/test_distributed_checkpoint.py
+++ b/test/distributed/fsdp/test_distributed_checkpoint.py
@@ -5,7 +5,7 @@
 
 import torch
 from torch import distributed as dist
-from torch.distributed._shard.checkpoint import (
+from torch.distributed.checkpoint import (
     FileSystemReader,
     FileSystemWriter,
     load_state_dict,
diff --git a/torch/distributed/checkpoint/state_dict_loader.py b/torch/distributed/checkpoint/state_dict_loader.py
index de94ffabf663e..1d085f4d339ed 100644
--- a/torch/distributed/checkpoint/state_dict_loader.py
+++ b/torch/distributed/checkpoint/state_dict_loader.py
@@ -59,9 +59,9 @@ def load_state_dict(
         >>> my_model = MyModule()
         >>> optimizer = Adagrad(my_model.parameters())
         >>> model_state_dict = my_model.state_dict()
-        >>> fs_storage_loader = torch.distributed._shard.checkpoint.FileSystemLoader("/checkpoint/1")
+        >>> fs_storage_loader = torch.distributed.checkpoint.FileSystemLoader("/checkpoint/1")
 
-        >>> torch.distributed._shard.checkpoint.load_state_dict(
+        >>> torch.distributed.checkpoint.load_state_dict(
         >>>     state_dict=model_state_dict,
         >>>     storage_reader=fs_storage_loader,
         >>> )
diff --git a/torch/distributed/checkpoint/state_dict_saver.py b/torch/distributed/checkpoint/state_dict_saver.py
index af18fd0c11dde..5e7fde10324cd 100644
--- a/torch/distributed/checkpoint/state_dict_saver.py
+++ b/torch/distributed/checkpoint/state_dict_saver.py
@@ -59,8 +59,8 @@ def save_state_dict(
 
         >>> model_state_dict = my_model.state_dict()
 
-        >>> fs_storage_writer = torch.distributed._shard.checkpoint.FileSystemWriter("/checkpoint/1")
-        >>> torch.distributed._shard.checkpoint.save_state_dict(
+        >>> fs_storage_writer = torch.distributed.checkpoint.FileSystemWriter("/checkpoint/1")
+        >>> torch.distributed.checkpoint.save_state_dict(
         >>>     state_dict=model_state_dict,
         >>>     storage_writer=fs_stroage_writer,
         >>> )

From b0d8b709f1cd0184b28ba87bc3e2d379324582fb Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Fri, 18 Nov 2022 10:51:07 +0000
Subject: [PATCH 1066/1922] Export c10/[macros|util] headers to be used by
 internal inductor builds (#89249)

Summary: Fixes package boundary violation that existed in previous implementation

Test Plan: CI

Differential Revision: D41391862

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89249
Approved by: https://github.com/izaitsevfb
---
 c10/macros/build.bzl | 9 +++++++++
 c10/util/build.bzl   | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/c10/macros/build.bzl b/c10/macros/build.bzl
index 932d0cabac4cb..50f283560d7e8 100644
--- a/c10/macros/build.bzl
+++ b/c10/macros/build.bzl
@@ -29,3 +29,12 @@ def define_targets(rules):
             "//conditions:default": [],
         }),
     )
+    rules.filegroup(
+        name = "headers",
+        srcs = rules.glob(
+            ["*.h"],
+            exclude = [
+            ],
+        ),
+        visibility = ["//:__pkg__"],
+    )
diff --git a/c10/util/build.bzl b/c10/util/build.bzl
index b981eba677185..8d79a557477f0 100644
--- a/c10/util/build.bzl
+++ b/c10/util/build.bzl
@@ -68,5 +68,5 @@ def define_targets(rules):
             exclude = [
             ],
         ),
-        visibility = ["//c10:__pkg__"],
+        visibility = ["//c10:__pkg__", "//:__pkg__"],
     )

From f24170ca609d21109d71bb021260175a16302a71 Mon Sep 17 00:00:00 2001
From: Jacob Hayes <jacob.r.hayes@gmail.com>
Date: Fri, 18 Nov 2022 14:09:21 +0000
Subject: [PATCH 1067/1922] Add platform markers for linux only
 extra_install_requires (#88826)

Fixes #88049

https://github.com/pytorch/pytorch/pull/85097 added new extra dependencies on `nvidia-*`. They are linux (GPU) only packages, but were not marked as such, causing issues installing pytorch 1.13 via Poetry (and possibly other tools that follow PyPI's metadata API) on non-Linux systems. This "fixes" the issue by adding the `; platform_system = 'Linux'` marker on these dependencies, but the main problem of different metadata for different wheels is a [somewhat larger issue](https://github.com/pytorch/pytorch/issues/88049#issuecomment-1302555269).

https://github.com/pytorch/pytorch/pull/85097 used `;` as a delimiter for splitting the different deps, but that is the delimiter used in markers, so I changed to split on `|`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88826
Approved by: https://github.com/neersighted, https://github.com/lalmei, https://github.com/malfet
---
 .github/scripts/generate_binary_build_matrix.py        |  6 +++---
 .../generated-linux-binary-manywheel-nightly.yml       | 10 +++++-----
 setup.py                                               |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py
index 54949ff27bb1b..4031ee9aacca6 100644
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@@ -219,9 +219,9 @@ def generate_wheels_matrix(os: str,
                         "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                         "package_type": package_type,
                         "pytorch_extra_install_requirements":
-                        "nvidia-cuda-runtime-cu11;"
-                        "nvidia-cudnn-cu11==8.5.0.96;"
-                        "nvidia-cublas-cu11==11.10.3.66",
+                        "nvidia-cuda-runtime-cu11; platform_system == 'Linux' | "
+                        "nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | "
+                        "nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux'",
                         "build_name":
                         f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-with-pypi-cudnn"
                         .replace(
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index efe3e2c0d17c9..ba9401d717a6d 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -169,7 +169,7 @@ jobs:
       DESIRED_PYTHON: "3.7"
       build_name: manywheel-py3_7-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11;nvidia-cudnn-cu11==8.5.0.96;nvidia-cublas-cu11==11.10.3.66
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -697,7 +697,7 @@ jobs:
       DESIRED_PYTHON: "3.8"
       build_name: manywheel-py3_8-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11;nvidia-cudnn-cu11==8.5.0.96;nvidia-cublas-cu11==11.10.3.66
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1225,7 +1225,7 @@ jobs:
       DESIRED_PYTHON: "3.9"
       build_name: manywheel-py3_9-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11;nvidia-cudnn-cu11==8.5.0.96;nvidia-cublas-cu11==11.10.3.66
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -1753,7 +1753,7 @@ jobs:
       DESIRED_PYTHON: "3.10"
       build_name: manywheel-py3_10-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11;nvidia-cudnn-cu11==8.5.0.96;nvidia-cublas-cu11==11.10.3.66
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -2281,7 +2281,7 @@ jobs:
       DESIRED_PYTHON: "3.11"
       build_name: manywheel-py3_11-cuda11_7-with-pypi-cudnn
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11;nvidia-cudnn-cu11==8.5.0.96;nvidia-cublas-cu11==11.10.3.66
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-runtime-cu11; platform_system == 'Linux' | nvidia-cudnn-cu11==8.5.0.96; platform_system == 'Linux' | nvidia-cublas-cu11==11.10.3.66; platform_system == 'Linux'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
diff --git a/setup.py b/setup.py
index bc8badb9b2e46..0aa27bef64d98 100644
--- a/setup.py
+++ b/setup.py
@@ -852,7 +852,7 @@ def configure_extension_build():
     pytorch_extra_install_requirements = os.getenv("PYTORCH_EXTRA_INSTALL_REQUIREMENTS", "")
     if pytorch_extra_install_requirements:
         report(f"pytorch_extra_install_requirements: {pytorch_extra_install_requirements}")
-        extra_install_requires += pytorch_extra_install_requirements.split(";")
+        extra_install_requires += pytorch_extra_install_requirements.split("|")
 
 
     # Cross-compile for M1

From fc3421ad1e6ddfb23618016a1fa9432bbf120b0c Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Fri, 18 Nov 2022 10:35:45 +0000
Subject: [PATCH 1068/1922] Fix names of some reference functions (#88115)

The `__name__` field of some binary reference functions was wrong. We
fix this to be consistent with unary reference functions. In the future,
we should probably make the binary reference wrapper return a wrapper
itself to avoid all those calls to `partial`.

This change helps performing some homogeneous treatment of functions by
their name.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88115
Approved by: https://github.com/mruberry
---
 torch/_prims_common/__init__.py       |  21 ++
 torch/_refs/__init__.py               | 364 +++++++++++++-------------
 torch/_refs/nn/functional/__init__.py |   4 -
 torch/_refs/special/__init__.py       |  16 +-
 4 files changed, 215 insertions(+), 190 deletions(-)

diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 128796dfa3d07..7752f18361411 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -1577,6 +1577,27 @@ def mask_tensor(mask: TensorLikeType, t: TensorLikeType):
         return torch.where(mask, t, 0)
 
 
+def get_aten_op(fn: Callable, name: str):
+    """
+    Given the __module__ of reference and its name, it returns
+    (our best guess of) the ATen name of the associated operation
+
+    Note: In ATen, the __name__ of a function within a module often
+    starts by the module name. E.g. linalg_eigh, or special_zeta
+    """
+    module = fn.__module__
+    prefix = "torch._refs"
+    assert(module.startswith(prefix))
+    module = module[len(prefix):]
+    # We want to go from .special / .nn.functional
+    # to special and special_ / nn_functional_
+    if module:
+        module = module[1:]
+        module = module.replace(".", "_")
+        module = module + "_"
+    return getattr(torch.ops.aten, f"{module}{name}")
+
+
 def dtype_or_default(dtype: Optional[torch.dtype]) -> torch.dtype:
     return dtype if dtype is not None else torch.get_default_dtype()
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 111c5c956f5d6..25b6f2da37c8c 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -394,18 +394,13 @@ def inner(prim: Callable):
             type_promotion_kind=type_promotion_kind,
         )
         def _ref(a: TensorLikeType) -> TensorLikeType:
-            if not isinstance(a, TensorLike):
-                raise RuntimeError(
-                    "Expected a tensor input for an elementwise unary operation!"
-                )
-
             if extra_meta is not None:
                 extra_meta(a)
 
             return prim(a)
 
         if aten_op is infer_aten_op:
-            aten_op = getattr(torch.ops.aten, prim.__name__)
+            aten_op = utils.get_aten_op(prim, prim.__name__)
         if aten_op is not None:
             register_decomposition(aten_op)(_ref)
 
@@ -860,54 +855,59 @@ def trunc(a):
 
 
 def _make_elementwise_binary_reference(
-    prim: Callable,
-    *,
     type_promotion_kind,
     aten_op=infer_aten_op,
+    name=None,
     has_out=True,
     supports_lhs_python_scalar=True,
     supports_rhs_python_scalar=True,
     supports_two_python_scalars=False,
 ) -> Callable:
-    @elementwise_type_promotion_wrapper(
-        type_promoting_args=("a", "b"),
-        type_promotion_kind=type_promotion_kind,
-    )
-    def _ref(
-        a: Union[Tensor, NumberType],
-        b: Union[Tensor, NumberType],
-    ) -> Tensor:
-        if not supports_lhs_python_scalar and isinstance(a, Number):
-            raise ValueError(
-                "Received a lhs Python scalar to an elementwise binary operation that does not accept lhs scalars!"
-            )
+    def inner(prim: Callable):
+        nonlocal aten_op, name
+        if name is None:
+            name = prim.__name__
 
-        if not supports_rhs_python_scalar and isinstance(b, Number):
-            raise ValueError(
-                "Received a rhs Python scalar to an elementwise binary operation that does not accept rhs scalars!"
+        @wraps(prim)
+        @elementwise_type_promotion_wrapper(
+            type_promoting_args=("a", "b"),
+            type_promotion_kind=type_promotion_kind,
+        )
+        def _ref(
+            a: Union[Tensor, NumberType],
+            b: Union[Tensor, NumberType],
+        ) -> Tensor:
+            check(
+                supports_lhs_python_scalar or not isinstance(a, Number),
+                lambda: "{name}: Received a lhs Python scalar to an elementwise binary operation that does not accept lhs scalars!",
+                ValueError,
             )
-
-        if (
-            not supports_two_python_scalars
-            and isinstance(a, Number)
-            and isinstance(b, Number)
-        ):
-            raise ValueError(
-                f"Receive two Number inputs to an elementwise binary operation {prim}!"
+            check(
+                supports_rhs_python_scalar or not isinstance(b, Number),
+                lambda: "{name}: Received a rhs Python scalar to an elementwise binary operation that does not accept rhs scalars!",
+                ValueError,
+            )
+            check(
+                supports_two_python_scalars
+                or not (isinstance(a, Number) and isinstance(b, Number)),
+                lambda: f"{name}: Receive two Number inputs to an elementwise binary operation!",
+                ValueError,
             )
+            a, b = _maybe_broadcast(a, b)
+            return prim(a, b)
 
-        a, b = _maybe_broadcast(a, b)
-        return prim(a, b)
+        if has_out:
+            _ref = out_wrapper()(_ref)
 
-    if has_out:
-        _ref = out_wrapper()(_ref)
+        _ref.__name__ = name
+        if aten_op is infer_aten_op:
+            aten_op = utils.get_aten_op(prim, name)
+        if aten_op is not None:
+            register_decomposition(aten_op)(_ref)
 
-    if aten_op is infer_aten_op:
-        aten_op = getattr(torch.ops.aten, prim.__name__.split(".")[0])
-    if aten_op is not None:
-        register_decomposition(aten_op)(_ref)
+        return _ref
 
-    return _ref
+    return inner
 
 
 # Add has its own implementation because it has an alpha argument
@@ -947,47 +947,61 @@ def add(
 
 
 # TODO: add docstring
-atan2 = _make_elementwise_binary_reference(
-    prims.atan2,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=False,
 )
+def atan2(a, b):
+    return prims.atan2(a, b)
+
 
 # TODO: add docstring
-bitwise_and = _make_elementwise_binary_reference(
-    prims.bitwise_and,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
+def bitwise_and(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.bitwise_and(a, b)
+
 
 # TODO: add docstring
-bitwise_left_shift = _make_elementwise_binary_reference(
-    prims.shift_left,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=torch.ops.aten.bitwise_left_shift,  # prim/aten name mismatch
 )
+def bitwise_left_shift(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.shift_left(a, b)
+
 
 # TODO: add docstring
-bitwise_or = _make_elementwise_binary_reference(
-    prims.bitwise_or,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
+def bitwise_or(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.bitwise_or(a, b)
+
 
 # TODO: add docstring
-bitwise_right_shift = _make_elementwise_binary_reference(
-    prims.shift_right_arithmetic,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=torch.ops.aten.bitwise_right_shift,  # prim/aten name mismatch
 )
+def bitwise_right_shift(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.shift_right_arithmetic(a, b)
+
 
 # TODO: add docstring
-bitwise_xor = _make_elementwise_binary_reference(
-    prims.bitwise_xor,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
+def bitwise_xor(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.bitwise_xor(a, b)
 
 
-def _copysign(
+# TODO: add docstring
+@_make_elementwise_binary_reference(
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+    supports_lhs_python_scalar=False,
+)
+def copysign(
     a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]
 ):
     if isinstance(b, Number) and isinstance(a, Tensor):
@@ -1000,14 +1014,6 @@ def _copysign(
     return where(signbit(b), neg(abs(a)), abs(a))
 
 
-# TODO: add docstring
-copysign = _make_elementwise_binary_reference(
-    _copysign,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    supports_lhs_python_scalar=False,
-    aten_op=torch.ops.aten.copysign,
-)
-
 # TODO: add docstring
 # complex =  _make_elementwise_binary_reference(prims.complex, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT)
 
@@ -1038,14 +1044,19 @@ def div(
 
 
 # TODO: add docstring
-eq = _make_elementwise_binary_reference(
-    prims.eq,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
     supports_lhs_python_scalar=False,
 )
+def eq(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.eq(a, b)
 
 
-def _pow(
+# TODO: add docstring
+@_make_elementwise_binary_reference(
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG,
+)
+def pow(
     a: Union[TensorLikeType, NumberType],
     b: Union[TensorLikeType, NumberType],
 ) -> TensorLikeType:
@@ -1061,13 +1072,6 @@ def _pow(
     return prims.pow(a, b)
 
 
-# TODO: add docstring
-pow = _make_elementwise_binary_reference(
-    _pow,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG,
-    aten_op=torch.ops.aten.pow,
-)
-
 # TODO: add docstring
 # Float power has its own implementation because it has unique type promotion.
 # NB: aten_op not registered because CompositeExplicitAutograd
@@ -1127,7 +1131,13 @@ def float_power(
 #
 # For reference, see CPython's implementation:
 # https://github.com/python/cpython/blob/ace008c531dd685a30c1dd68f9b5ba35f20171cf/Objects/floatobject.c#L636
-def _floor_divide(
+
+# TODO: add docstring
+@_make_elementwise_binary_reference(
+    type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+    supports_two_python_scalars=True,
+)
+def floor_divide(
     a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]
 ):
     # Wrap scalars because some references only accept tensor arguments.
@@ -1194,66 +1204,69 @@ def _floor_divide_float(a: Tensor, b: Tensor) -> Tensor:
 
 
 # TODO: add docstring
-floor_divide = _make_elementwise_binary_reference(
-    _floor_divide,
-    type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=torch.ops.aten.floor_divide,
-    supports_two_python_scalars=True,
-)
-
-
-# TODO: add docstring
-fmax = _make_elementwise_binary_reference(
-    prims.fmax,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=torch.ops.aten.fmax,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=False,
 )
+def fmax(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.fmax(a, b)
+
 
 # TODO: add docstring
-fmin = _make_elementwise_binary_reference(
-    prims.fmin,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=torch.ops.aten.fmin,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=False,
 )
+def fmin(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.fmin(a, b)
+
 
 # TODO: add docstring
-fmod = _make_elementwise_binary_reference(
-    prims.fmod,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=torch.ops.aten.fmod,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=True,
 )
+def fmod(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.fmod(a, b)
+
 
 # TODO: add docstring
-gcd = _make_elementwise_binary_reference(
-    prims.gcd,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=torch.ops.aten.gcd,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=False,
 )
+def gcd(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.gcd(a, b)
+
 
 # TODO: add docstring
-ge = _make_elementwise_binary_reference(
-    prims.ge,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
     supports_lhs_python_scalar=False,
 )
+def ge(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.ge(a, b)
+
 
 # TODO: add docstring
-gt = _make_elementwise_binary_reference(
-    prims.gt,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
     supports_lhs_python_scalar=False,
 )
+def gt(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.gt(a, b)
 
 
-def _heaviside(input: TensorLikeType, values: TensorLikeType) -> TensorLikeType:
+@_make_elementwise_binary_reference(
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH,
+    supports_lhs_python_scalar=False,
+    supports_rhs_python_scalar=False,
+)
+def heaviside(input: TensorLikeType, values: TensorLikeType) -> TensorLikeType:
     input_eq_zero = eq(input, 0)
     input_lt_zero = logical_or(lt(input, 0), isnan(input))
     zeros_and_ones = where(input_lt_zero, 0, 1)
@@ -1261,34 +1274,31 @@ def _heaviside(input: TensorLikeType, values: TensorLikeType) -> TensorLikeType:
     return output
 
 
-heaviside = _make_elementwise_binary_reference(
-    _heaviside,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH,
-    supports_lhs_python_scalar=False,
-    supports_rhs_python_scalar=False,
-    aten_op=torch.ops.aten.heaviside,
-)
-
-hypot = _make_elementwise_binary_reference(
-    prims.hypot,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=False,
 )
+def hypot(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.hypot(a, b)
+
 
-igamma = _make_elementwise_binary_reference(
-    prims.igamma,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=False,
 )
+def igamma(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.igamma(a, b)
 
-igammac = _make_elementwise_binary_reference(
-    prims.igammac,  # type: ignore[has-type]
+
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=False,
 )
+def igammac(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.igammac(a, b)
 
 
 def _check_close_args(
@@ -1363,7 +1373,13 @@ def isclose(
     return result
 
 
-def _lcm(a: TensorLikeType, b: TensorLikeType):
+# TODO: add docstring
+@_make_elementwise_binary_reference(
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+    supports_lhs_python_scalar=False,
+    supports_rhs_python_scalar=False,
+)
+def lcm(a: TensorLikeType, b: TensorLikeType):
     dtype = a.dtype
     # promoting to int32 to maintain 100% consistency with C++ and to
     # prevent overflow in case of int8 and int16
@@ -1380,24 +1396,19 @@ def _lcm(a: TensorLikeType, b: TensorLikeType):
 
 
 # TODO: add docstring
-lcm = _make_elementwise_binary_reference(
-    _lcm,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=torch.ops.aten.lcm,
+@_make_elementwise_binary_reference(
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
     supports_lhs_python_scalar=False,
-    supports_rhs_python_scalar=False,
 )
+def le(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.le(a, b)
 
 
 # TODO: add docstring
-le = _make_elementwise_binary_reference(
-    prims.le,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
-    supports_lhs_python_scalar=False,
 )
-
-
-def _logical_and(a: TensorLikeType, b: TensorLikeType):
+def logical_and(a: TensorLikeType, b: TensorLikeType):
     if not utils.is_boolean_dtype(a.dtype):
         a = a != 0
     if not utils.is_boolean_dtype(b.dtype):
@@ -1405,23 +1416,19 @@ def _logical_and(a: TensorLikeType, b: TensorLikeType):
     return a & b
 
 
-logical_and = _make_elementwise_binary_reference(
-    _logical_and,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
-    aten_op=torch.ops.aten.logical_and,
-)
-
-
-@_make_elementwise_unary_reference(
-    ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, aten_op=torch.ops.aten.logical_not
-)
+# TODO: add docstring
+@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL)
 def logical_not(a: TensorLikeType):
     if not utils.is_boolean_dtype(a.dtype):
         return a == 0
     return ~a
 
 
-def _logical_or(a: TensorLikeType, b: TensorLikeType):
+# TODO: add docstring
+@_make_elementwise_binary_reference(
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
+)
+def logical_or(a: TensorLikeType, b: TensorLikeType):
     if not utils.is_boolean_dtype(a.dtype):
         a = a != 0
     if not utils.is_boolean_dtype(b.dtype):
@@ -1429,14 +1436,12 @@ def _logical_or(a: TensorLikeType, b: TensorLikeType):
     return bitwise_or(a, b)
 
 
-logical_or = _make_elementwise_binary_reference(
-    _logical_or,
+# TODO: add docstring
+# TODO: skip unnecessary conversion of long to float
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
-    aten_op=torch.ops.aten.logical_or,
 )
-
-
-def _logical_xor(a: TensorLikeType, b: TensorLikeType):
+def logical_xor(a: TensorLikeType, b: TensorLikeType):
     if not utils.is_boolean_dtype(a.dtype):
         a = a != 0
     if not utils.is_boolean_dtype(b.dtype):
@@ -1444,61 +1449,66 @@ def _logical_xor(a: TensorLikeType, b: TensorLikeType):
     return a ^ b
 
 
-# TODO: skip unnecessary conversion of long to float
-logical_xor = _make_elementwise_binary_reference(
-    _logical_xor,
-    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
-    aten_op=torch.ops.aten.logical_xor,
-)
-
-
 # TODO: add docstring
-lt = _make_elementwise_binary_reference(
-    prims.lt,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
     supports_lhs_python_scalar=False,
 )
+def lt(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.lt(a, b)
+
 
 # TODO: add docstring
-maximum = _make_elementwise_binary_reference(
-    prims.maximum,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
+def maximum(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.maximum(a, b)
+
 
 # TODO: add docstring
-minimum = _make_elementwise_binary_reference(
-    prims.minimum,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
+def minimum(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.minimum(a, b)
+
 
 # TODO: add docstring
-mul = _make_elementwise_binary_reference(
-    prims.mul,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
     supports_two_python_scalars=True,
 )
+def mul(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.mul(a, b)
+
 
 # TODO: add docstring
-ne = _make_elementwise_binary_reference(
-    prims.ne,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL,
     supports_lhs_python_scalar=False,
 )
+def ne(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.ne(a, b)
+
 
 # TODO: add docstring
-nextafter = _make_elementwise_binary_reference(
-    prims.nextafter,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH,
     supports_lhs_python_scalar=False,
     supports_rhs_python_scalar=False,
 )
+def nextafter(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.nextafter(a, b)
+
 
 # TODO: add docstring
-remainder = _make_elementwise_binary_reference(
-    prims.remainder,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=torch.ops.aten.remainder,
 )
+def remainder(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.remainder(a, b)
+
 
 # reverse sub
 def rsub(
@@ -1550,12 +1560,14 @@ def sub(
 
 
 # TODO: add docstring
-true_divide = _make_elementwise_binary_reference(
-    prims.div,  # type: ignore[has-type]
+@_make_elementwise_binary_reference(
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+    name="true_divide",
     aten_op=None,  # CompositeImplicitAutograd
     supports_two_python_scalars=True,
 )
+def true_divide(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.div(a, b)
 
 
 @register_decomposition(torch.ops.aten.xlogy)
@@ -1583,7 +1595,13 @@ def xlogy(a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberT
     return torch.where(torch.isnan(b), float("nan"), rhs)
 
 
-def _trunc_divide(
+# TODO: add docstring
+@_make_elementwise_binary_reference(
+    type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+    aten_op=None,  # CompositeImplicitAutograd
+    supports_two_python_scalars=True,
+)
+def trunc_divide(
     a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]
 ):
     dtype = utils.get_dtype(a)
@@ -1593,14 +1611,6 @@ def _trunc_divide(
     return trunc(prims.div(a, b))
 
 
-# TODO: add docstring
-trunc_divide = _make_elementwise_binary_reference(
-    _trunc_divide,
-    type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
-    aten_op=None,  # CompositeImplicitAutograd
-    supports_two_python_scalars=True,
-)
-
 #
 # Elementwise Ternary References
 #
diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py
index 12f44c4092a41..ab352c40a93ae 100644
--- a/torch/_refs/nn/functional/__init__.py
+++ b/torch/_refs/nn/functional/__init__.py
@@ -20,10 +20,6 @@
     elementwise_unary_scalar_wrapper,
     out_wrapper,
 )
-from torch._refs import (
-    _make_elementwise_binary_reference,
-    _make_elementwise_unary_reference,
-)
 
 from torch._subclasses.fake_tensor import FakeTensor
 
diff --git a/torch/_refs/special/__init__.py b/torch/_refs/special/__init__.py
index 1227a2631475b..4983823242653 100644
--- a/torch/_refs/special/__init__.py
+++ b/torch/_refs/special/__init__.py
@@ -46,7 +46,6 @@
 
 @_make_elementwise_unary_reference(
     ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    aten_op=torch.ops.aten.special_bessel_j0,
 )
 def bessel_j0(a: TensorLikeType) -> TensorLikeType:
     return prims.bessel_j0(a)
@@ -54,7 +53,6 @@ def bessel_j0(a: TensorLikeType) -> TensorLikeType:
 
 @_make_elementwise_unary_reference(
     ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    aten_op=torch.ops.aten.special_bessel_j1,
 )
 def bessel_j1(a: TensorLikeType) -> TensorLikeType:
     return prims.bessel_j1(a)
@@ -89,21 +87,21 @@ def erfcx(a: TensorLikeType) -> TensorLikeType:
 
 
 @_make_elementwise_unary_reference(
-    ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, aten_op=torch.ops.aten.special_i0e
+    ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
 )
 def i0e(a: TensorLikeType) -> TensorLikeType:
     return prims.bessel_i0e(a)
 
 
 @_make_elementwise_unary_reference(
-    ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, aten_op=torch.ops.aten.special_i1
+    ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
 )
 def i1(a: TensorLikeType) -> TensorLikeType:
     return prims.bessel_i1(a)
 
 
 @_make_elementwise_unary_reference(
-    ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, aten_op=torch.ops.aten.special_i1e
+    ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
 )
 def i1e(a: TensorLikeType) -> TensorLikeType:
     return prims.bessel_i1e(a)
@@ -223,14 +221,14 @@ def softmax(
 
 @_make_elementwise_unary_reference(
     ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    aten_op=torch.ops.aten.special_spherical_bessel_j0,
 )
 def spherical_bessel_j0(a: TensorLikeType) -> TensorLikeType:
     return prims.spherical_bessel_j0(a)
 
 
-zeta = _make_elementwise_binary_reference(
-    prims.zeta,  # type: ignore[has-type]
+# TODO: add docstring
+@_make_elementwise_binary_reference(
     type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
-    aten_op=torch.ops.aten.special_zeta,
 )
+def zeta(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType:
+    return prims.zeta(a, b)

From f788e85c0d30f0a4e5c63ffb95463932a633c416 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Fri, 18 Nov 2022 10:35:46 +0000
Subject: [PATCH 1069/1922] Simplify maybe_resize_out (#88116)

The previous behaviour would call `resize_` on 0-sized elements even
when their size was correct. This would make some test fail, as resize_
may be an in-place operation and it's not supported by some subsystems

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88116
Approved by: https://github.com/mruberry
---
 torch/_prims_common/wrappers.py | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/torch/_prims_common/wrappers.py b/torch/_prims_common/wrappers.py
index 76886f886a726..349e450cf3723 100644
--- a/torch/_prims_common/wrappers.py
+++ b/torch/_prims_common/wrappers.py
@@ -4,6 +4,7 @@
     NumberType,
     TensorLike,
     TensorLikeType,
+    ShapeType,
     ELEMENTWISE_TYPE_PROMOTION_KIND,
 )
 import torch._prims_common as utils
@@ -11,8 +12,7 @@
 
 from typing import Callable, Sequence, Union, Tuple, NamedTuple
 import inspect
-from functools import wraps, reduce
-import operator
+from functools import wraps
 import warnings
 from itertools import chain
 
@@ -129,25 +129,22 @@ def _fn(*args, **kwargs):
 
 
 # TODO: handle tuples of tensors
-def _maybe_resize_out(out: TensorLikeType, shape):
-    if out.numel() == 0:
-        return out.resize_(shape)
-
-    if out.numel() != reduce(operator.mul, shape, 1):
-        msg = (
-            "An output with one or more elements was resized since it had shape {0} "
-            "which does not match the required output shape {1}. "
-            "This behavior is deprecated, and in a future PyTorch release outputs will not "
-            "be resized unless they have zero elements. "
-            "You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0).".format(
-                str(out.shape), str(shape)
+def _maybe_resize_out(out: TensorLikeType, shape: ShapeType):
+    # If the shapes are correct there's nothing to do
+    if utils.same_shape(out.shape, shape):
+        return out
+    else:
+        if out.numel() != 0:
+            msg = (
+                f"An output with one or more elements was resized since it had shape {str(out.shape)} "
+                "which does not match the required output shape {str(shape)}. "
+                "This behavior is deprecated, and in a future PyTorch release outputs will not "
+                "be resized unless they have zero elements. "
+                "You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0)."
             )
-        )
-        warnings.warn(msg)
+            warnings.warn(msg)
         return out.resize_(shape)
 
-    return out
-
 
 def _safe_copy_out(
     *, copy_from: TensorLikeType, copy_to: TensorLikeType, exact_dtype: bool = False

From e2c851974209bc9805783be9e637fa065597edd9 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Fri, 18 Nov 2022 11:25:36 +0000
Subject: [PATCH 1070/1922] Add most in-place references/decompositions
 (#88117)

We add most in-place references in a generic way. We also implement a
wrapper to implement the annoying interface that `nn.functional`
nonlinearities have.

We fix along the way a couple decompositions for some non-linearities by
extending the arguments that the references have.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88117
Approved by: https://github.com/mruberry
---
 test/functorch/test_aotdispatch.py            |   9 --
 test/test_meta.py                             |  11 +-
 test/test_ops.py                              |   7 +-
 test/test_proxy_tensor.py                     | 116 +++---------------
 torch/_decomp/decompositions.py               |  13 --
 torch/_refs/__init__.py                       | 114 ++++++++++++++++-
 torch/_refs/nn/functional/__init__.py         |  95 +++++++++++---
 .../_internal/common_methods_invocations.py   |   9 ++
 8 files changed, 231 insertions(+), 143 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 1dc5476158f96..de6d82960adc8 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1078,7 +1078,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('linalg.tensorinv', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('linalg.tensorsolve', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('linalg.vander', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('linalg.vector_norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('logaddexp2', ''),  # aten.logaddexp2.default - couldn't find symbolic meta function/decomposition
     xfail('logaddexp', ''),  # aten.logaddexp.default - couldn't find symbolic meta function/decomposition
     xfail('logcumsumexp', ''),  # aten.logcumsumexp.default - couldn't find symbolic meta function/decomposition
@@ -1105,9 +1104,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('min', 'reduction_with_dim'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('mode', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('mv', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('mvlgamma', 'mvlgamma_p_1'),  # aten.digamma_.default - couldn't find symbolic meta function/decom...
-    xfail('mvlgamma', 'mvlgamma_p_3'),  # aten.digamma_.default - couldn't find symbolic meta function/decom...
-    xfail('mvlgamma', 'mvlgamma_p_5'),  # aten.digamma_.default - couldn't find symbolic meta function/decom...
 
     # Deleting this in a followup
     xfail('nn.functional.poisson_nll_loss', ''),
@@ -1121,7 +1117,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     skip('nn.functional.batch_norm', ''),  # '0 is not tracked with proxy for <torch.fx.experimental.proxy_te..
     xfail('nn.functional.bilinear', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.binary_cross_entropy', ''),  # aten.fill_.Scalar - couldn't find symbolic meta funct...
-    xfail('nn.functional.cosine_embedding_loss', ''),  # Cannot call sizes() on tensor with symbolic sizes/st...
     xfail('nn.functional.cosine_similarity', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.cross_entropy', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.ctc_loss', ''),  # aten._ctc_loss.Tensor - couldn't find symbolic meta function/deco...
@@ -1130,7 +1125,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.fractional_max_pool3d', ''),  # rand() received an invalid combination of arguments - g...
     xfail('nn.functional.grid_sample', ''),  # RuntimeError: aten.grid_sampler_3d.default - couldn't find sym ...
     xfail('nn.functional.group_norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.hinge_embedding_loss', ''),  # aten.zeros_like.default - couldn't find symbolic meta...
     xfail('nn.functional.interpolate', 'area'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'bicubic'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.interpolate', 'linear'),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1148,10 +1142,8 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.multi_margin_loss', ''),  # could not find kernel
     xfail('nn.functional.multilabel_margin_loss', ''),  # could not find kernel
     xfail('nn.functional.nll_loss', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('nn.functional.normalize', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.pad', 'reflect'),  # aten.reflection_pad1d.default - couldn't find symbolic meta fu...
     xfail('nn.functional.pad', 'replicate'),  # aten.replication_pad1d.default - couldn't find symbolic meta...
-    xfail('nn.functional.pairwise_distance', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.pdist', ''),  # could not find kernel
     xfail('nn.functional.pixel_shuffle', ''),  # aten.pixel_shuffle.default - couldn't find symbolic meta fun...
     xfail('nn.functional.pixel_unshuffle', ''),  # aten.pixel_unshuffle.default - couldn't find symbolic meta...
@@ -1160,7 +1152,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('nn.functional.smooth_l1_loss', ''),  # could not find kernel
     xfail('nn.functional.unfold', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('nn.functional.upsample_nearest', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('norm', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('norm', 'nuc'),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
     xfail('normal', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('normal', 'number_mean'),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/test_meta.py b/test/test_meta.py
index ae248a90cffb7..6d21d5c7bd75a 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -622,7 +622,6 @@ def run_meta_crossref(
     torch.linalg.eig : {f64, f32, c128, c64},
     torch.linalg.eigvals : {f64, f32, c128, c64},
     torch.linalg.lstsq : {f64, f32, c128, c64},
-    torch.Tensor.conj_physical_: {c128, c32, c64},
 }
 
 meta_function_expected_failures_only_outplace = {
@@ -893,7 +892,6 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
     aten.unique_consecutive.default : {i8, f64, i64, bf16, f32, i32, b8, i16, u8},
     aten.unique_dim.default : {i8, f64, i64, bf16, f32, i32, b8, i16, u8},
     aten.upsample_nearest3d.vec : {bf16, f32, f64, u8},
-    aten.conj_physical_.default: {c128, c32, c64},
 }
 
 # these sometimes pass and sometimes fail
@@ -916,6 +914,13 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
 # For CompositeImplicitAutograd functions that fail before hitting the Mode
 meta_dispatch_early_skips = set({
     torch.Tensor.float_power_,
+    # Errors out in one of the tests, while ProxyTensor passes...
+    torch.Tensor.cumsum_,
+})
+
+meta_inplace_skips = set({
+    # Errors out in one of the tests, while ProxyTensor passes...
+    torch.Tensor.cumsum_,
 })
 
 meta_dispatch_device_expected_failures = defaultdict(dict)
@@ -1116,6 +1121,8 @@ def test_meta_inplace(self, device, dtype, op):
         func = op.get_inplace()
         if not func:
             self.skipTest("No inplace variable for this op")
+        if func in meta_inplace_skips:
+            self.skipTest("Skipped")
         func = self._get_safe_inplace(func)
         samples = op.sample_inputs(device, dtype, requires_grad=False)
         for sample_input in samples:
diff --git a/test/test_ops.py b/test/test_ops.py
index 11d659e5cd2b8..5f9ad6ff43176 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1668,7 +1668,6 @@ class TestRefsOpsInfo(TestCase):
 
     not_in_decomp_table = {
         # duplicated in _decomp and _refs
-        '_refs.nn.functional.elu',
         '_refs.nn.functional.group_norm',
         '_refs.nn.functional.mse_loss',
         '_refs.rsub',
@@ -1763,9 +1762,13 @@ class TestRefsOpsInfo(TestCase):
 
     @parametrize("op", ref_ops_names)
     def test_refs_are_in_python_ref_db(self, op):
+        inplace = op[-1] == "_"
         if op in self.skip_ref_ops:
             raise unittest.SkipTest(f"{op} does not have an entry in python_ref_db")
-        self.assertIn(op, self.ref_db_names)
+        elif inplace:
+            self.assertNotIn(op, self.ref_db_names, msg=f"{op} is an in-place operation and should not have an OpInfo")
+        else:
+            self.assertIn(op, self.ref_db_names)
 
     @parametrize("op", ref_ops_names)
     def test_refs_are_in_decomp_table(self, op):
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 21682ac76fc65..aa12b5b74d1c8 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1255,7 +1255,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.avg_pool3d', ''),  # aten.avg_pool3d.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.bilinear', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.binary_cross_entropy', ''),  # aten.new_empty.default - couldn't find symbolic meta function/decom...
-    xfail('nn.functional.cosine_embedding_loss', ''),  # The underlying op of 'aten.stride' has no overload name '_schema'
     xfail('nn.functional.cosine_similarity', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.cross_entropy', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.ctc_loss'),  # aten._ctc_loss.Tensor - couldn't find symbolic meta function/decomposition
@@ -1264,13 +1263,11 @@ def f(a, b, c, d, e):
     xfail('nn.functional.fractional_max_pool2d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.fractional_max_pool3d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.grid_sample', ''),  # aten.grid_sampler_2d.default - couldn't find symbolic meta function/decompos...
-    xfail('nn.functional.hinge_embedding_loss', ''),  # aten.empty_like.default - couldn't find symbolic meta function/deco...
     xfail('nn.functional.interpolate', 'area'),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.interpolate', 'bicubic'),  # aten.upsample_bicubic2d.vec - couldn't find symbolic meta function/d...
     xfail('nn.functional.interpolate', 'linear'),  # aten.upsample_linear1d.vec - couldn't find symbolic meta function/dec...
     xfail('nn.functional.interpolate', 'nearest'),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/d...
     xfail('nn.functional.interpolate', 'trilinear'),  # aten.upsample_trilinear3d.vec - couldn't find symbolic meta functi...
-    xfail('nn.functional.margin_ranking_loss', ''),  # The underlying op of 'aten.stride' has no overload name '_schema'
     xfail('nn.functional.max_pool1d', ''),  # Trying to call aten.size on a tensor with symbolic shapes.
     xfail('nn.functional.max_pool3d', ''),  # aten.max_pool3d_with_indices.default - couldn't find symbolic meta function/d...
     xfail('nn.functional.max_unpool1d', 'grad'),  # aten.max_unpool2d.default - couldn't find symbolic meta function/decom...
@@ -1349,99 +1346,26 @@ def f(a, b, c, d, e):
 symbolic_tensor_failures.update(symbolic_tensor_segfaults)
 
 inplace_symbolic_tensor_failures = {
-    xfail('abs', ''),  # aten.abs_.default - couldn't find symbolic meta function/decomposition
-    xfail('acos', ''),  # aten.acos_.default - couldn't find symbolic meta function/decomposition
-    xfail('acosh', ''),  # aten.acosh_.default - couldn't find symbolic meta function/decomposition
-    xfail('addbmm', ''),  # aten.addbmm_.default - couldn't find symbolic meta function/decomposition
-    xfail('addcdiv', ''),  # aten.addcdiv_.default - couldn't find symbolic meta function/decomposition
-    xfail('addcmul', ''),  # aten.addcmul_.default - couldn't find symbolic meta function/decomposition
-    xfail('addmm', ''),  # aten.addmm_.default - couldn't find symbolic meta function/decomposition
-    xfail('addmm', 'decomposed'),  # aten.addmm_.default - couldn't find symbolic meta function/decomposition
-    xfail('asin', ''),  # aten.asin_.default - couldn't find symbolic meta function/decomposition
-    xfail('asinh', ''),  # aten.asinh_.default - couldn't find symbolic meta function/decomposition
-    xfail('atan2', ''),  # aten.atan2_.default - couldn't find symbolic meta function/decomposition
-    xfail('atan', ''),  # aten.atan_.default - couldn't find symbolic meta function/decomposition
-    xfail('atanh', ''),  # aten.atanh_.default - couldn't find symbolic meta function/decomposition
-    xfail('ceil', ''),  # aten.ceil_.default - couldn't find symbolic meta function/decomposition
-    xfail('clamp', ''),  # aten.clamp_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('clamp_max', ''),  # aten.clamp_max_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('clamp_min', ''),  # aten.clamp_min_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('conj_physical', ''),  # aten.conj_physical_.default - couldn't find symbolic meta function/decomposition
-    xfail('copysign', ''),  # aten.copysign_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('cos', ''),  # aten.cos_.default - couldn't find symbolic meta function/decomposition
-    xfail('cosh', ''),  # aten.cosh_.default - couldn't find symbolic meta function/decomposition
-    xfail('cumsum', ''),  # aten.cumsum_.default - couldn't find symbolic meta function/decomposition
-    xfail('digamma', ''),  # aten.digamma_.default - couldn't find symbolic meta function/decomposition
-    xfail('div', 'floor_rounding'),  # aten.div_.Tensor_mode - couldn't find symbolic meta function/decomposition
-    xfail('div', 'trunc_rounding'),  # aten.div_.Tensor_mode - couldn't find symbolic meta function/decomposition
-    xfail('eq', ''),  # aten.eq_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('erf', ''),  # aten.erf_.default - couldn't find symbolic meta function/decomposition
-    xfail('erfc', ''),  # aten.erfc_.default - couldn't find symbolic meta function/decomposition
-    xfail('erfinv', ''),  # aten.erfinv_.default - couldn't find symbolic meta function/decomposition
-    xfail('exp2', ''),  # aten.exp2_.default - couldn't find symbolic meta function/decomposition
-    xfail('exp', ''),  # aten.exp_.default - couldn't find symbolic meta function/decomposition
-    xfail('expm1', ''),  # aten.expm1_.default - couldn't find symbolic meta function/decomposition
-    xfail('float_power', ''),  # the base given to float_power_ has dtype Float but the operation's result requires dtype Double
-    xfail('floor', ''),  # aten.floor_.default - couldn't find symbolic meta function/decomposition
-    xfail('floor_divide', ''),  # aten.floor_divide_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('fmod', ''),  # aten.fmod_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('frac', ''),  # aten.frac_.default - couldn't find symbolic meta function/decomposition
-    xfail('ge', ''),  # aten.ge_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('gt', ''),  # aten.gt_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('heaviside', ''),  # aten.heaviside_.default - couldn't find symbolic meta function/decomposition
-    xfail('hypot', ''),  # aten.hypot_.default - couldn't find symbolic meta function/decomposition
-    xfail('igamma', ''),  # aten.igamma_.default - couldn't find symbolic meta function/decomposition
-    xfail('igammac', ''),  # aten.igammac_.default - couldn't find symbolic meta function/decomposition
-    xfail('le', ''),  # aten.le_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('lerp', ''),  # aten.lerp_.default - couldn't find symbolic meta function/decomposition
-    xfail('lgamma', ''),  # aten.lgamma_.default - couldn't find symbolic meta function/decomposition
-    xfail('log10', ''),  # aten.log10_.default - couldn't find symbolic meta function/decomposition
-    xfail('log1p', ''),  # aten.log1p_.default - couldn't find symbolic meta function/decomposition
-    xfail('log2', ''),  # aten.log2_.default - couldn't find symbolic meta function/decomposition
-    xfail('log', ''),  # aten.log_.default - couldn't find symbolic meta function/decomposition
-    xfail('logit', ''),  # aten.logit_.default - couldn't find symbolic meta function/decomposition
-    xfail('lt', ''),  # aten.lt_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('mvlgamma', 'mvlgamma_p_1'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
-    xfail('mvlgamma', 'mvlgamma_p_3'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
-    xfail('mvlgamma', 'mvlgamma_p_5'),  # aten.mvlgamma_.default - couldn't find symbolic meta function/decomposition
-    xfail('nan_to_num', ''),  # aten.nan_to_num_.default - couldn't find symbolic meta function/decomposition
-    xfail('ne', ''),  # aten.ne_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('neg', ''),  # aten.neg_.default - couldn't find symbolic meta function/decomposition
-    xfail('nextafter', ''),  # aten.nextafter_.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.celu', ''),  # aten.celu_.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.dropout3d', ''),  # aten.squeeze_.dim - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.elu', ''),  # aten.elu_.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.hardsigmoid', ''),  # aten.hardsigmoid_.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.mish', ''),  # aten.mish_.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.selu', ''),  # aten.elu_.default - couldn't find symbolic meta function/decomposition
-    xfail('nn.functional.threshold', ''),  # aten.threshold_.default - couldn't find symbolic meta function/decomposition
-    xfail('pow', ''),  # aten.pow_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('reciprocal', ''),  # aten.reciprocal_.default - couldn't find symbolic meta function/decomposition
-    xfail('remainder', ''),  # aten.remainder_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('rsqrt', ''),  # aten.rsqrt_.default - couldn't find symbolic meta function/decomposition
-    xfail('sgn', ''),  # aten.sgn_.default - couldn't find symbolic meta function/decomposition
-    xfail('sigmoid', ''),  # aten.sigmoid_.default - couldn't find symbolic meta function/decomposition
-    xfail('sign', ''),  # aten.sign_.default - couldn't find symbolic meta function/decomposition
-    xfail('sin', ''),  # aten.sin_.default - couldn't find symbolic meta function/decomposition
-    xfail('sinc', ''),  # aten.sinc_.default - couldn't find symbolic meta function/decomposition
-    xfail('sinh', ''),  # aten.sinh_.default - couldn't find symbolic meta function/decomposition
-    xfail('sqrt', ''),  # aten.sqrt_.default - couldn't find symbolic meta function/decomposition
-    xfail('square', ''),  # aten.pow_.Scalar - couldn't find symbolic meta function/decomposition
-    xfail('squeeze', ''),  # aten.squeeze_.default - couldn't find symbolic meta function/decomposition
-    xfail('t', ''),  # aten.t_.default - couldn't find symbolic meta function/decomposition
-    xfail('tan', ''),  # aten.tan_.default - couldn't find symbolic meta function/decomposition
-    xfail('tanh', ''),  # aten.tanh_.default - couldn't find symbolic meta function/decomposition
-    xfail('transpose', ''),  # aten.transpose_.default - couldn't find symbolic meta function/decomposition
-    xfail('tril', ''),  # aten.tril_.default - couldn't find symbolic meta function/decomposition
-    xfail('triu', ''),  # aten.triu_.default - couldn't find symbolic meta function/decomposition
-    xfail('trunc', ''),  # aten.trunc_.default - couldn't find symbolic meta function/decomposition
-    xfail('uniform', ''),  # aten.uniform_.default - couldn't find symbolic meta function/decomposition
-    xfail('unique', ''),  # aten.unique_consecutive.default - couldn't find symbolic meta function/decomposition
-    xfail('xlogy', ''),  # aten.xlogy_.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('round', ''),  # aten.round_.default - couldn't find symbolic meta function/decomposition
-    xfail('round', 'decimals_0'),  # aten.round_.decimals - couldn't find symbolic meta function/decomposition
-    xfail('round', 'decimals_3'),  # aten.round_.decimals - couldn't find symbolic meta function/decomposition
-    xfail('round', 'decimals_neg_3')  # aten.round_.decimals - couldn't find symbolic meta function/decomposition
+    # bugs
+    xfail('float_power', ''),  # base given to float_power_ has dtype Float but the operation's result requires dtype Double
+    # decomp not implemented
+    xfail('addbmm', ''),
+    xfail('addmm', ''),
+    xfail('addmm', 'decomposed'),
+    xfail('logit', ''),
+    xfail('nn.functional.hardsigmoid', ''),
+    xfail('round', ''),  # ref missing a kwarg
+    xfail('round', 'decimals_0'),  # ref missing a kwarg
+    xfail('round', 'decimals_3'),  # ref missing a kwarg
+    xfail('round', 'decimals_neg_3'),  # ref missing a kwarg
+    xfail('unique', ''),
+    # in-place has a different signature than out-of-place
+    xfail('uniform', ''),
+    # Views
+    xfail('squeeze', ''),
+    xfail('t', ''),
+    xfail('transpose', ''),
+    xfail('nn.functional.dropout3d', ''),  # calls unsqueeze_
 }
 
 # Copies inputs to inplace operations to avoid inplace modifications
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 7e3d31bb97466..e36abd0457e5b 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -116,19 +116,6 @@ def softplus_backward(out_grad: Tensor, x: Tensor, beta: float, threshold: float
     return torch.where((x * beta) > threshold, out_grad, out_grad * z / (z + 1.0))
 
 
-@register_decomposition(aten.elu)
-@pw_cast_for_opmath
-def elu(
-    self: Tensor, alpha: float = 1, scale: float = 1, input_scale: float = 1
-) -> Tensor:
-    negcoef = alpha * scale
-    poscoef = scale
-    negiptcoef = input_scale
-    return torch.where(
-        self > 0, self * poscoef, (torch.exp(self * negiptcoef) - 1) * negcoef
-    )
-
-
 @register_decomposition(aten.elu_backward)
 @pw_cast_for_opmath
 def elu_backward(
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 25b6f2da37c8c..3355400db43cc 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -122,6 +122,7 @@
     "bitwise_right_shift",
     "bitwise_xor",
     "clamp_min",
+    "clamp_max",
     "copysign",
     "div",
     "eq",
@@ -422,6 +423,31 @@ def _fn(*args, **kwargs):
     return _fn
 
 
+def _make_inplace(fn):
+    """
+    Given a function with out variant (i.e. using `out_wrapper()), it returns its in-place variant
+    See https://github.com/pytorch/pytorch/wiki/Developer-FAQ#how-do-in-place-operations-work-in-pytorch
+    """
+
+    # nb. We use the name of the first argument used in the unary references
+    @wraps(fn)
+    def _fn(a, *args, **kwargs):
+        return fn(a, *args, out=a, **kwargs)
+
+    inplace_name = f"{fn.__name__}_"
+    _fn.__name__ = inplace_name
+    _fn = register_decomposition(getattr(torch.ops.aten, inplace_name))(_fn)
+
+    # We access the __all__ attribute of the module where fn is defined
+    # There may be a cleaner way of doing this...
+    from inspect import getmodule
+
+    _all = getmodule(fn).__all__  # type: ignore[union-attr]
+    if inplace_name not in _all:
+        _all.append(inplace_name)
+    return _fn
+
+
 @_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT)
 def abs(a):
     return prims.abs(a)
@@ -3419,7 +3445,6 @@ def index_select(x: TensorLike, dim: int, index: TensorLike):
     return x[idx]
 
 
-# Note: although squeeze is documented as having the out= kwarg it doesn't
 @register_decomposition(torch.ops.aten.squeeze)
 def squeeze(a: TensorLikeType, dim: Optional[int] = None) -> TensorLikeType:
     if dim is not None:
@@ -3843,6 +3868,7 @@ def cumsum(
     return sum(masked_a, dim=dim, keepdim=keepdim, dtype=dtype, out=out)
 
 
+# Note: although squeeze is documented as having the out= kwarg it doesn't
 @register_decomposition(torch.ops.aten.unsqueeze)
 def unsqueeze(a: TensorLikeType, dim: int) -> TensorLikeType:
     # Note that unsqueeze canonicalizes with rank + 1 because it allows
@@ -5013,6 +5039,92 @@ def bucketize(
     return start.to(dtype=out_dtype)
 
 
+# inplace
+abs_ = _make_inplace(abs)
+acos_ = _make_inplace(acos)
+acosh_ = _make_inplace(acosh)
+addcmul_ = _make_inplace(addcmul)
+addcdiv_ = _make_inplace(addcdiv)
+asin_ = _make_inplace(asin)
+asinh_ = _make_inplace(asinh)
+atan_ = _make_inplace(atan)
+atanh_ = _make_inplace(atanh)
+atan2_ = _make_inplace(atan2)
+ceil_ = _make_inplace(ceil)
+clamp_ = _make_inplace(clamp)
+clamp_min_ = _make_inplace(clamp_min)
+clamp_max_ = _make_inplace(clamp_max)
+conj_physical_ = _make_inplace(conj_physical)
+copysign_ = _make_inplace(copysign)
+cos_ = _make_inplace(cos)
+cosh_ = _make_inplace(cosh)
+cumsum_ = _make_inplace(cumsum)
+digamma_ = _make_inplace(digamma)
+div_ = _make_inplace(div)
+eq_ = _make_inplace(eq)
+erf_ = _make_inplace(erf)
+erfc_ = _make_inplace(erfc)
+erfinv_ = _make_inplace(erfinv)
+exp_ = _make_inplace(exp)
+exp2_ = _make_inplace(exp2)
+expm1_ = _make_inplace(expm1)
+float_power_ = _make_inplace(float_power)
+floor_ = _make_inplace(floor)
+floor_divide_ = _make_inplace(floor_divide)
+fmod_ = _make_inplace(fmod)
+frac_ = _make_inplace(frac)
+ge_ = _make_inplace(ge)
+gt_ = _make_inplace(gt)
+heaviside_ = _make_inplace(heaviside)
+hypot_ = _make_inplace(hypot)
+igamma_ = _make_inplace(igamma)
+igammac_ = _make_inplace(igammac)
+le_ = _make_inplace(le)
+lerp_ = _make_inplace(lerp)
+lgamma_ = _make_inplace(lgamma)
+log10_ = _make_inplace(log10)
+log1p_ = _make_inplace(log1p)
+log2_ = _make_inplace(log2)
+log_ = _make_inplace(log)
+logical_and_ = _make_inplace(logical_and)
+logical_or_ = _make_inplace(logical_or)
+logical_xor_ = _make_inplace(logical_xor)
+lt_ = _make_inplace(lt)
+mvlgamma_ = _make_inplace(mvlgamma)
+nan_to_num_ = _make_inplace(nan_to_num)
+ne_ = _make_inplace(ne)
+neg_ = _make_inplace(neg)
+nextafter_ = _make_inplace(nextafter)
+pow_ = _make_inplace(pow)
+reciprocal_ = _make_inplace(reciprocal)
+remainder_ = _make_inplace(remainder)
+rsqrt_ = _make_inplace(rsqrt)
+sgn_ = _make_inplace(sgn)
+sigmoid_ = _make_inplace(sigmoid)
+sign_ = _make_inplace(sign)
+sin_ = _make_inplace(sin)
+sinc_ = _make_inplace(sinc)
+sinh_ = _make_inplace(sinh)
+sqrt_ = _make_inplace(sqrt)
+square_ = _make_inplace(square)
+tan_ = _make_inplace(tan)
+tanh_ = _make_inplace(tanh)
+tril_ = _make_inplace(tril)
+triu_ = _make_inplace(triu)
+true_divide_ = _make_inplace(true_divide)
+trunc_ = _make_inplace(trunc)
+xlogy_ = _make_inplace(xlogy)
+
+# Views
+# We can't model these as above, as the pattern of doing `op(a, out=a)` does not work for a view function
+# given that it does not reshape the input (it just copies the result into it)
+
+# squeeze_ = _make_inplace(squeeze)
+# t_ = _make_inplace(t)
+# transpose_ = _make_inplace(transpose)
+# unsqueeze_ = _make_inplace(unsqueeze)
+
+
 import torch._refs._conversions
 import torch._refs.fft
 import torch._refs.linalg
diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py
index ab352c40a93ae..4ebe6e2b05d91 100644
--- a/torch/_refs/nn/functional/__init__.py
+++ b/torch/_refs/nn/functional/__init__.py
@@ -1,4 +1,5 @@
 import math
+from functools import wraps
 from typing import Callable, Optional, Union
 
 import torch
@@ -20,6 +21,7 @@
     elementwise_unary_scalar_wrapper,
     out_wrapper,
 )
+from torch._refs import _make_inplace
 
 from torch._subclasses.fake_tensor import FakeTensor
 
@@ -116,9 +118,31 @@ def alpha_dropout(
     return self * dropout_mask + b
 
 
+def inplace_wrapper(fn):
+    """
+    Given a nn.functional non-linearity, implements its `inplace: bool` argument
+    """
+
+    # nb. We use the name of the first argument used in the unary references
+    @wraps(fn)
+    def _fn(a, *args, inplace=False, **kwargs):
+        if inplace:
+            check(
+                "out" not in kwargs,
+                lambda: "Cannot set inplace=True and pass out= at the same time",
+            )
+            return fn(a, *args, inplace=False, out=a, **kwargs)
+        else:
+            return fn(a, *args, inplace=False, **kwargs)
+
+    return _fn
+
+
 # celu is implemented specially because it has an alpha argument
 # celu is very similar to elu
 @register_decomposition(torch.ops.aten.celu)
+@inplace_wrapper
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
@@ -151,6 +175,8 @@ def celu(
 
 
 @register_decomposition(torch.ops.aten.dropout)
+@inplace_wrapper
+@out_wrapper()
 def dropout(
     a: TensorLikeType, p: float = 0.5, training: bool = True, inplace: bool = False
 ) -> TensorLikeType:
@@ -178,14 +204,19 @@ def dropout(
     return a * dropout_mask * scale
 
 
-# elu is implemented specially because it has an alpha argument
-# This cannot be used as a decomposition because the aten op takes in 2 extra kwargs
+@register_decomposition(torch.ops.aten.elu)
+@inplace_wrapper
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
 def elu(
-    a: TensorLikeType, alpha: Optional[NumberType] = None, inplace: bool = False
+    a: TensorLikeType,
+    alpha: NumberType = 1.0,
+    scale: NumberType = 1.0,
+    input_scale: NumberType = 1.0,
+    inplace: bool = False,
 ) -> TensorLikeType:
     """
     Reference implementation of torch.nn.functional.elu
@@ -193,24 +224,27 @@ def elu(
     if inplace:
         raise NotImplementedError
 
-    rhs: TensorLikeType
-    if alpha is not None:
-        python_type = utils.dtype_to_type(a.dtype)
-        if not utils.is_weakly_lesser_type(type(alpha), python_type):
-            msg = (
-                "alpha argument of type {0} cannot be safely cast to type {1}!".format(
-                    type(alpha), python_type
-                )
-            )
-            raise ValueError(msg)
-        rhs = alpha * torch.expm1(a)
-    else:
-        rhs = torch.expm1(a)
+    # nb. This should be factored out into a can_cast aux function
+    python_type = utils.dtype_to_type(a.dtype)
+    check(
+        utils.is_weakly_lesser_type(type(input_scale), python_type),
+        lambda: f"input_scale argument of type {type(input_scale)} cannot be safely cast to type {python_type}!",
+    )
+    check(
+        utils.is_weakly_lesser_type(type(scale), python_type),
+        lambda: f"scale argument of type {type(scale)} cannot be safely cast to type {python_type}!",
+    )
+    check(
+        utils.is_weakly_lesser_type(type(alpha), python_type),
+        lambda: f"alpha argument of type {type(alpha)} cannot be safely cast to type {python_type}!",
+    )
 
-    return torch.where(a > 0, a, rhs)
+    return torch.where(a > 0, scale * a, (alpha * scale) * torch.expm1(a * input_scale))
 
 
 @register_decomposition(torch.ops.aten.relu)
+@inplace_wrapper
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
@@ -280,6 +314,8 @@ def layer_norm(
 
 
 @register_decomposition(torch.ops.aten.leaky_relu)
+@inplace_wrapper
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
@@ -302,6 +338,8 @@ def leaky_relu(
 
 
 @register_decomposition(torch.ops.aten.mish)
+@inplace_wrapper
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
@@ -317,6 +355,8 @@ def mish(a: TensorLikeType, inplace: bool = False) -> TensorLikeType:
 
 
 @register_decomposition(torch.ops.aten.selu)
+@inplace_wrapper
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
@@ -369,6 +409,7 @@ def softmin(
 
 # softplus is implemented specially because it has beta and threshold arguments
 @register_decomposition(torch.ops.aten.softplus)
+@inplace_wrapper
 @out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
@@ -661,11 +702,11 @@ def _nll_loss_nd(
 
 
 @register_decomposition(torch.ops.aten.nll_loss)
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("input",),
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
-@out_wrapper()
 def nll_loss(
     input: TensorLikeType,
     target: TensorLikeType,
@@ -784,6 +825,8 @@ def tanhshrink(a: TensorLikeType) -> TensorLikeType:
 
 
 @register_decomposition(torch.ops.aten.threshold)
+@inplace_wrapper
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
@@ -883,6 +926,8 @@ def _triplet_margin_with_distance_loss(
 
 
 @register_decomposition(torch.ops.aten.hardtanh)
+@inplace_wrapper
+@out_wrapper()
 @elementwise_unary_scalar_wrapper
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a"),
@@ -1022,6 +1067,8 @@ def prelu(a: TensorLikeType, weight: TensorLikeType) -> TensorLikeType:
 
 
 @register_decomposition(torch.ops.aten.relu6)
+@inplace_wrapper
+@out_wrapper()
 def relu6(a: TensorLikeType, inplace: bool = False) -> TensorLikeType:
     """
     Reference implementation of torch.nn.functional.relu6
@@ -1036,11 +1083,11 @@ def relu6(a: TensorLikeType, inplace: bool = False) -> TensorLikeType:
 
 
 @register_decomposition(torch.ops.aten.glu)
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
     type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
-@out_wrapper()
 def glu(a: TensorLikeType, dim: int = -1) -> TensorLikeType:
     dim = utils.canonicalize_dims(a.ndim, dim)
     check(
@@ -1065,11 +1112,11 @@ def pairwise_distance(
 
 
 @register_decomposition(torch.ops.aten.pdist)
+@out_wrapper()
 @elementwise_type_promotion_wrapper(
     type_promoting_args=("a",),
     type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
 )
-@out_wrapper()
 def pdist(a: TensorLikeType, p: float = 2) -> TensorLikeType:
     check(a.ndim == 2, lambda: f"pdist only supports 2D tensors, got: {a.ndim}D")
     check(p >= 0, lambda: "pdist only supports non-negative p values")
@@ -1083,3 +1130,11 @@ def pdist(a: TensorLikeType, p: float = 2) -> TensorLikeType:
         t = torch.linalg.vector_norm(a.unsqueeze(1) - a, ord=p, dim=2)
     i = torch.triu_indices(t.shape[0], t.shape[1], offset=1, device=a.device)
     return t.flatten().index_select(0, i[0] * t.shape[0] + i[1])
+
+
+# Needed as aten.{celu_,elu_...} exist (even if they don't have the in-place kwarg)
+celu_ = _make_inplace(celu)
+elu_ = _make_inplace(elu)
+mish_ = _make_inplace(mish)
+selu_ = _make_inplace(selu)
+threshold_ = _make_inplace(threshold)
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 50732af6f8578..6cff2f6a47491 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -17419,11 +17419,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.celu",
         torch_opinfo_name="nn.functional.celu",
+        supports_out=True,
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.threshold",
         torch_opinfo_name="nn.functional.threshold",
         supports_nvfuser=False,
+        supports_out=True,
     ),
     PythonRefInfo(
         "_refs.nn.functional.dropout",
@@ -17458,11 +17460,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.elu",
         torch_opinfo_name="nn.functional.elu",
+        supports_out=True,
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.hardtanh",
         torch_opinfo_name="nn.functional.hardtanh",
         supports_nvfuser=False,
+        supports_out=True,
     ),
     PythonRefInfo(  # TODO: Port this to an UnaryOpInfo
         "_refs.nn.functional.gelu",
@@ -17501,6 +17505,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     PythonRefInfo(
         "_refs.nn.functional.leaky_relu",
         torch_opinfo_name="nn.functional.leaky_relu",
+        supports_out=True,
     ),
     PythonRefInfo(
         "_refs.nn.functional.log_softmax",
@@ -17526,18 +17531,22 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.nn.functional.relu",
         torch_opinfo_name="nn.functional.relu",
         supports_nvfuser=False,
+        supports_out=True,
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.relu6",
         torch_opinfo_name="nn.functional.relu6",
+        supports_out=True,
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.mish",
         torch_opinfo_name="nn.functional.mish",
+        supports_out=True,
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.selu",
         torch_opinfo_name="nn.functional.selu",
+        supports_out=True,
     ),
     PythonRefInfo(
         "_refs.nn.functional.softmax",

From 035c36ffe5a905f14658a9a70fd7cc25c110462a Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@meta.com>
Date: Fri, 18 Nov 2022 15:27:15 +0000
Subject: [PATCH 1071/1922] Update torch.distributed.DistBackendError type
 (#89235)

Summary: Update torch.distributed.DistBackendError type based on https://fb.workplace.com/groups/pyreqa/posts/5753993921357059

Test Plan:
Pyre tests should pass?

let sandcastle run

Reviewed By: markkm

Differential Revision: D41384130

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89235
Approved by: https://github.com/awgu
---
 test/distributed/test_c10d_nccl.py | 4 +++-
 torch/_C/__init__.pyi.in           | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index cdc167bc4d1ae..fb28e744b5ed9 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -1031,9 +1031,11 @@ def test_nccl_dist_backend_error(self):
         self._create_process_group_nccl(store, self.opts())
 
         # Both rank 0 and 1 will use the same CUDA device resulting in ncclInvalidUsage
-        with self.assertRaises(dist.DistBackendError):
+        with self.assertRaises(dist.DistBackendError) as cm:
             dist.broadcast(torch.tensor([1, 2, 3]).cuda(), 0)
 
+        self.assertIsInstance(cm.exception, RuntimeError)
+
 class DistributedDataParallelTest(
     test_c10d_common.CommonDistributedDataParallelTest, MultiProcessTestCase
 ):
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 5833d7d7f2a41..d69cf1f3477ed 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1508,5 +1508,5 @@ def _current_graph_task_id() -> _int: ...
 class _OutOfMemoryError:
     pass
 
-class _DistBackendError:
+class _DistBackendError(RuntimeError):
     pass

From be7d30b6185a2c00e016e38b8ed7e69bb203c57b Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Fri, 18 Nov 2022 13:41:51 +0000
Subject: [PATCH 1072/1922] [inductor] Temporarily disable functorch_dp_cifar10
 test in TorchBench (#89281)

Summary: The failure wasn't caught because of a land race. Skip the test
for now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89281
Approved by: https://github.com/Krovatkin
---
 benchmarks/dynamo/common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index cad954f825b2b..44b020413c9ca 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -111,6 +111,7 @@
     # *CI_SKIP_INDCUTOR_INFERENCE,
     # TorchBench
     "detectron2",
+    "functorch_dp_cifar10",
     "mobilenet_v3_large",
     "moco",
     "tacotron2",

From 913fb7c687ab452accb6d687138322deca30b0b2 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Fri, 18 Nov 2022 16:15:55 +0000
Subject: [PATCH 1073/1922] [inductor] Skip DALLE2_pytorch in torchbench
 (#89288)

Summary: DALLE2_pytorch fails in eager as well.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89288
Approved by: https://github.com/Krovatkin
---
 benchmarks/dynamo/common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 44b020413c9ca..c4e9d62f0a7c9 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -110,6 +110,7 @@
     # *CI_SKIP_AOT_EAGER_TRAINING,
     # *CI_SKIP_INDCUTOR_INFERENCE,
     # TorchBench
+    "DALLE2_pytorch",
     "detectron2",
     "functorch_dp_cifar10",
     "mobilenet_v3_large",

From d1619e334513cad3d42efb10f8806fb2ac68a971 Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Thu, 17 Nov 2022 12:47:33 -0800
Subject: [PATCH 1074/1922] [Quant] Allow setting fixed qparams for inner LSTM
 ops (#88456)

Summary: In both eager and FX graph mode quantization,
`torch.ao.nn.quantizable.LSTM` is used as an observed custom module,
which is responsible for inserting its own observers. By default,
the user specifies a single QConfig for the custom module (either
through QConfigMapping or by setting the "qconfig" attribute"),
and all inner ops will [inherit this
QConfig](https://github.com/pytorch/pytorch/blob/dc00bb51b8d370bf3891f0edb2c6e0c2914e329a/torch/ao/nn/quantizable/modules/rnn.py#L366-L378)
and use the same observer/fake_quantize constructors.

Today, users who wish to override this behavior must extend
`torch.ao.nn.quantizable.LSTM` and write a lot of custom code
to manually assign the QConfigs to the inner ops. This commit
alleviates this burden on the user by providing a helper function
to assign QConfigs with custom observers. An example use case of
this is providing a reference implementation for a backend kernel
that hardcodes qparams for efficiency.

Example usage:
```
import torch
from torch.ao.quantization import get_default_qconfig_mapping
from torch.ao.quantization.fx.custom_config import (
    PrepareCustomConfig,
    ConvertCustomConfig,
)

class MyModel(torch.nn.Module):
    ...

class UserLSTM(torch.ao.nn.quantizable.LSTM):
    @classmethod
    def from_float(cls, other):
        assert isinstance(other, cls._FLOAT_MODULE)
        linear_output_obs_ctr = FixedQParamsObserver.with_args(
            scale=2 ** -11, zero_point=2 ** 15, dtype=torch.qint32)
        sigmoid_obs_ctr = FixedQParamsObserver.with_args(
            scale=2 ** -16, zero_point=0, dtype=torch.qint32)
        tanh_obs_ctr = FixedQParamsObserver.with_args(
            scale=2 ** -15, zero_point=2 ** 15, dtype=torch.qint32)
        cell_state_obs_ctr = FixedQParamsObserver.with_args(
            scale=2 ** -11, zero_point=0, dtype=torch.qint32)
        hidden_state_obs_ctr = FixedQParamsObserver.with_args(
            scale=2 ** -7, zero_point=2 ** 7, dtype=torch.quint8)
        return torch.ao.quantization.utils._get_lstm_with_individually_observed_parts(
            float_lstm=other,
            linear_output_obs_ctr=linear_output_obs_ctr,
            sigmoid_obs_ctr=sigmoid_obs_ctr,
            tanh_obs_ctr=tanh_obs_ctr,
            cell_state_obs_ctr=cell_state_obs_ctr,
            hidden_state_obs_ctr=hidden_state_obs_ctr,
        )

qconfig_mapping = get_default_qconfig_mapping()
example_inputs = (torch.rand(5, 3, 50), torch.rand(1, 3, 50), torch.randn(1, 3, 50))
prepare_custom_config = PrepareCustomConfig() \
    .set_float_to_observed_mapping(torch.nn.LSTM, UserLSTM)
convert_custom_config = ConvertCustomConfig() \
    .set_observed_to_quantized_mapping(UserLSTM, torch.ao.nn.quantized.LSTM)
model = MyModel()
model = prepare_fx(model, qconfig_mapping, example_inputs, prepare_custom_config=prepare_custom_config)
model(*example_inputs)  # calibrate
model = convert_fx(model, convert_custom_config=convert_custom_config)
model(*example_inputs)
```

Test Plan:
python test/test_quantization.py TestQuantizeFx.test_static_lstm_with_custom_fixed_qparams

Reviewers: jerryzh168, vkuzo

Subscribers: jerryzh168, vkuzo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88456
Approved by: https://github.com/jerryzh168, https://github.com/vkuzo
---
 test/quantization/fx/test_quantize_fx.py | 77 ++++++++++++++++++++-
 torch/ao/nn/quantizable/modules/rnn.py   | 24 +++++--
 torch/ao/quantization/utils.py           | 88 ++++++++++++++++++++++++
 3 files changed, 182 insertions(+), 7 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index b03b7fb0cf0e9..2d91ba80b7e02 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -192,7 +192,7 @@
 import operator
 import unittest
 import io
-from typing import Callable, Optional, List
+from typing import Callable, Optional, List, Tuple
 
 class BinaryOp(torch.nn.Module):
     def __init__(self, binary_op, ibinary_op, is_inplace, is_scalar):
@@ -4217,6 +4217,81 @@ def forward(self, inputs: torch.Tensor, h0: torch.Tensor, c0: torch.Tensor):
         }
         self._test_static_lstm_helper(m2, prepare_node_occurrence, convert_node_occurrence2)
 
+    def test_static_lstm_with_custom_fixed_qparams(self):
+        """
+        Test statically quantized LSTM with custom fixed qparams assigned to each of the
+        inner submodules. This flow requires users to extend `torch.ao.nn.quantizable.LSTM`
+        and use the child class in the custom module mapping.
+        """
+        class MyModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.my_lstm = torch.nn.LSTM(50, 50, 1)
+
+            def forward(self, inputs: torch.Tensor, h0: torch.Tensor, c0: torch.Tensor):
+                x = self.my_lstm(inputs, (h0, c0))
+                return x
+
+        class UserLSTM(torch.ao.nn.quantizable.LSTM):
+            """
+            Example of user provided LSTM implementation that has fixed qparams assigned
+            to the inner submodules.
+            """
+            @classmethod
+            def from_float(cls, other):
+                assert isinstance(other, cls._FLOAT_MODULE)
+                # uint16, [-16, 16)
+                linear_output_obs_ctr = FixedQParamsObserver.with_args(scale=2 ** -11, zero_point=2 ** 15, dtype=torch.qint32)
+                # uint16, [0, 1)
+                sigmoid_obs_ctr = FixedQParamsObserver.with_args(scale=2 ** -16, zero_point=0, dtype=torch.qint32)
+                # uint16, [-1, 1)
+                tanh_obs_ctr = FixedQParamsObserver.with_args(scale=2 ** -15, zero_point=2 ** 15, dtype=torch.qint32)
+                # int16, [-16, 16)
+                cell_state_obs_ctr = FixedQParamsObserver.with_args(scale=2 ** -11, zero_point=0, dtype=torch.qint32)
+                # uint8, [-1, 1)
+                hidden_state_obs_ctr = FixedQParamsObserver.with_args(scale=2 ** -7, zero_point=2 ** 7, dtype=torch.quint8)
+                return torch.ao.quantization.utils._get_lstm_with_individually_observed_parts(
+                    float_lstm=other,
+                    linear_output_obs_ctr=linear_output_obs_ctr,
+                    sigmoid_obs_ctr=sigmoid_obs_ctr,
+                    tanh_obs_ctr=tanh_obs_ctr,
+                    cell_state_obs_ctr=cell_state_obs_ctr,
+                    hidden_state_obs_ctr=hidden_state_obs_ctr,
+                )
+
+        # Prepare model
+        qconfig_mapping = get_default_qconfig_mapping()
+        example_inputs = (torch.rand(5, 3, 50), torch.rand(1, 3, 50), torch.randn(1, 3, 50))
+        prepare_custom_config = PrepareCustomConfig() \
+            .set_float_to_observed_mapping(torch.nn.LSTM, UserLSTM)
+        convert_custom_config = ConvertCustomConfig() \
+            .set_observed_to_quantized_mapping(UserLSTM, torch.ao.nn.quantized.LSTM)
+        model = MyModel()
+        model = prepare_fx(model, qconfig_mapping, example_inputs, prepare_custom_config=prepare_custom_config)
+
+        # Validate that the observers inserted to each inner module has the expected qparams
+        def validate_qparams(inner_module: torch.nn.Module, scale: float, zero_point: int, dtype: torch.dtype):
+            self.assertTrue(hasattr(inner_module, "activation_post_process"))
+            obs = inner_module.activation_post_process
+            self.assertTrue(isinstance(obs, FixedQParamsObserver))
+            self.assertEqual(obs.scale, scale)
+            self.assertEqual(obs.zero_point, zero_point)
+            self.assertEqual(obs.dtype, dtype)
+        cell = model.my_lstm.layers[0].layer_fw.cell
+        validate_qparams(cell.igates, 2 ** -11, 2 ** 15, torch.qint32)
+        validate_qparams(cell.hgates, 2 ** -11, 2 ** 15, torch.qint32)
+        validate_qparams(cell.input_gate, 2 ** -16, 0, torch.qint32)
+        validate_qparams(cell.forget_gate, 2 ** -16, 0, torch.qint32)
+        validate_qparams(cell.cell_gate, 2 ** -15, 2 ** 15, torch.qint32)
+        validate_qparams(cell.output_gate, 2 ** -16, 0, torch.qint32)
+        validate_qparams(cell.fgate_cx_igate_cgate, 2 ** -11, 0, torch.qint32)
+        validate_qparams(cell.ogate_cy, 2 ** -7, 2 ** 7, torch.quint8)
+
+        # Make sure the rest of the flow runs
+        model(*example_inputs)
+        model = convert_fx(model, convert_custom_config=convert_custom_config, _remove_qconfig=False)
+        model(*example_inputs)
+
     def test_reroute_tuple_getitem_patterns(self):
         """
         The following graph should redirect the output to `b`. After the transformation,
diff --git a/torch/ao/nn/quantizable/modules/rnn.py b/torch/ao/nn/quantizable/modules/rnn.py
index 59f23137097ce..72156a7ba5fe1 100644
--- a/torch/ao/nn/quantizable/modules/rnn.py
+++ b/torch/ao/nn/quantizable/modules/rnn.py
@@ -41,12 +41,22 @@ def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True,
         self.hgates = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=bias, **factory_kwargs)
         self.gates = torch.ao.nn.quantized.FloatFunctional()
 
+        self.input_gate = torch.nn.Sigmoid()
+        self.forget_gate = torch.nn.Sigmoid()
+        self.cell_gate = torch.nn.Tanh()
+        self.output_gate = torch.nn.Sigmoid()
+
         self.fgate_cx = torch.ao.nn.quantized.FloatFunctional()
         self.igate_cgate = torch.ao.nn.quantized.FloatFunctional()
         self.fgate_cx_igate_cgate = torch.ao.nn.quantized.FloatFunctional()
 
         self.ogate_cy = torch.ao.nn.quantized.FloatFunctional()
 
+        self.initial_hidden_state_qparams: Tuple[float, int] = (1.0, 0)
+        self.initial_cell_state_qparams: Tuple[float, int] = (1.0, 0)
+        self.hidden_state_dtype: torch.dtype = torch.quint8
+        self.cell_state_dtype: torch.dtype = torch.quint8
+
     def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
         if hidden is None or hidden[0] is None or hidden[1] is None:
             hidden = self.initialize_hidden(x.shape[0], x.is_quantized)
@@ -58,10 +68,10 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) ->
 
         input_gate, forget_gate, cell_gate, out_gate = gates.chunk(4, 1)
 
-        input_gate = torch.sigmoid(input_gate)
-        forget_gate = torch.sigmoid(forget_gate)
-        cell_gate = torch.tanh(cell_gate)
-        out_gate = torch.sigmoid(out_gate)
+        input_gate = self.input_gate(input_gate)
+        forget_gate = self.forget_gate(forget_gate)
+        cell_gate = self.cell_gate(cell_gate)
+        out_gate = self.output_gate(out_gate)
 
         fgate_cx = self.fgate_cx.mul(forget_gate, cx)
         igate_cgate = self.igate_cgate.mul(input_gate, cell_gate)
@@ -75,8 +85,10 @@ def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) ->
     def initialize_hidden(self, batch_size: int, is_quantized: bool = False) -> Tuple[Tensor, Tensor]:
         h, c = torch.zeros((batch_size, self.hidden_size)), torch.zeros((batch_size, self.hidden_size))
         if is_quantized:
-            h = torch.quantize_per_tensor(h, scale=1.0, zero_point=0, dtype=torch.quint8)
-            c = torch.quantize_per_tensor(c, scale=1.0, zero_point=0, dtype=torch.quint8)
+            (h_scale, h_zp) = self.initial_hidden_state_qparams
+            (c_scale, c_zp) = self.initial_cell_state_qparams
+            h = torch.quantize_per_tensor(h, scale=h_scale, zero_point=h_zp, dtype=self.hidden_state_dtype)
+            c = torch.quantize_per_tensor(c, scale=c_scale, zero_point=c_zp, dtype=self.cell_state_dtype)
         return h, c
 
     def _get_name(self):
diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py
index 9f3dc712a9fe6..662d0068fef4f 100644
--- a/torch/ao/quantization/utils.py
+++ b/torch/ao/quantization/utils.py
@@ -546,6 +546,94 @@ def _patched_module_call(self, *args, **kwargs):
         torch.nn.Module.__call__ = orig_module_call
     return fqn_to_example_inputs
 
+def _get_lstm_with_individually_observed_parts(
+    float_lstm: torch.nn.LSTM,
+    # Use Callable instead of _PartialWrapper here to avoid circular dependencies
+    linear_output_obs_ctr: Optional[Callable] = None,
+    sigmoid_obs_ctr: Optional[Callable] = None,
+    tanh_obs_ctr: Optional[Callable] = None,
+    cell_state_obs_ctr: Optional[Callable] = None,
+    hidden_state_obs_ctr: Optional[Callable] = None,
+) -> torch.ao.nn.quantizable.LSTM:
+    """
+    Return an observed `torch.ao.nn.quantizable.LSTM` created from a `torch.nn.LSTM`
+    with specific observers or fake quantizes assigned to the inner ops or submodules.
+
+    In both eager and FX graph mode quantization, `torch.ao.nn.quantizable.LSTM` is
+    used as an observed custom module, which is responsible for inserting its own
+    observers. By default, all inner ops inherit the parent custom module's QConfig.
+    Users who wish to override this behavior may extend `torch.ao.nn.quantizable.LSTM`
+    and use this helper function to customize the observer insertion logic.
+
+    Args:
+        `float_lstm`: The float LSTM module
+        `linear_output_obs_ctr`: observer or fake quantize for linear outputs Wx + b,
+            where W is the weight matrix, b is the bias, and x is either the inputs
+            or the hidden state from the previous layer (if any)
+        `sigmoid_obs_ctr`: observer or fake quantize for sigmoid activations
+        `tanh_obs_ctr`: observer or fake quantize for tanh activations
+        `cell_state_obs_ctr`: observer or fake quantize for the cell state
+        `hidden_state_obs_ctr`: observer or fake quantize for the hidden state and
+            the output
+
+    Return:
+        A `torch.ao.nn.quantizable.LSTM` with the specified observers or fake quantizes
+        attached to the inner submodules.
+    """
+    def make_qconfig(obs_ctr: Callable) -> torch.ao.quantization.QConfig:
+        """
+        Make a QConfig with fixed qparams observers or fake quantizes.
+        """
+        if isinstance(obs_ctr(), torch.ao.quantization.FakeQuantizeBase):
+            weight = torch.ao.quantization.default_weight_fake_quant
+        else:
+            weight = torch.ao.quantization.default_weight_observer
+        return torch.ao.quantization.QConfig(activation=obs_ctr, weight=weight)
+
+    observed_lstm = torch.ao.nn.quantizable.LSTM(
+        float_lstm.input_size, float_lstm.hidden_size, float_lstm.num_layers, float_lstm.bias,
+        float_lstm.batch_first, float_lstm.dropout, float_lstm.bidirectional)
+
+    # Assign QConfigs with fixed qparams to all inner submodules
+    # Module hierarchy: LSTM > _LSTMLayer > _LSTMSingleLayer (forward or backward) > LSTMCell
+    for layer in observed_lstm.layers:
+        inner_layers = [layer.layer_fw]
+        if float_lstm.bidirectional:
+            inner_layers.append(layer.layer_bw)
+        for inner_layer in inner_layers:
+            cell = inner_layer.cell
+            if linear_output_obs_ctr is not None:
+                qconfig = make_qconfig(linear_output_obs_ctr)
+                cell.igates.qconfig = qconfig
+                cell.hgates.qconfig = qconfig
+            if sigmoid_obs_ctr is not None:
+                qconfig = make_qconfig(sigmoid_obs_ctr)
+                cell.input_gate.qconfig = qconfig
+                cell.forget_gate.qconfig = qconfig
+                cell.output_gate.qconfig = qconfig
+            if tanh_obs_ctr is not None:
+                cell.cell_gate.qconfig = make_qconfig(tanh_obs_ctr)
+            if cell_state_obs_ctr is not None:
+                cell.fgate_cx_igate_cgate.qconfig = make_qconfig(cell_state_obs_ctr)
+                obs = cell_state_obs_ctr()
+                if hasattr(obs, "scale") and hasattr(obs, "zero_point"):
+                    cell.initial_cell_state_qparams = (obs.scale, obs.zero_point)
+                cell.cell_state_dtype = obs.dtype
+            if hidden_state_obs_ctr is not None:
+                cell.ogate_cy.qconfig = make_qconfig(hidden_state_obs_ctr)
+                obs = hidden_state_obs_ctr()
+                if hasattr(obs, "scale") and hasattr(obs, "zero_point"):
+                    cell.initial_hidden_state_qparams = (obs.scale, obs.zero_point)
+                cell.hidden_state_dtype = obs.dtype
+
+    # Insert the observers based on the previously attached QConfigs
+    # Pass in non_leaf_module_list to prevent the observers for sigmoid/tanh from being overridden
+    torch.ao.quantization.add_observer_(
+        observed_lstm,
+        non_leaf_module_list=[torch.nn.Sigmoid, torch.nn.Tanh]
+    )
+    return observed_lstm
+
 __all__ = [
     "NodePattern",
     "Pattern",

From e502803351bc12aff9fb99885b124ee0ce643b9c Mon Sep 17 00:00:00 2001
From: Richard Howell <rhow@meta.com>
Date: Fri, 18 Nov 2022 16:30:53 +0000
Subject: [PATCH 1075/1922] [xplat] remove -weak_framework (#89233)

Summary: The `-weak_framework` flag is no longer necessary, Buck will weakly link frameworks depending on the `target_sdk_version` of the binary being linked.

Test Plan:
Compare IG load commands before and after change with P553208168
```
load command difference in Instagram.app/Frameworks/InstagramXplatFramework.framework/InstagramXplatFramework
 --- /tmp/tmpvd97s2v0    2022-11-16 12:13:54.082910598 -0800
+++ /tmp/tmpj20r_4ca    2022-11-16 12:13:54.082910598 -0800
@@ -9,7 +9,7 @@
        /System/Library/Frameworks/CoreHaptics.framework/CoreHaptics (compatibility version 1.0.0, current version 1.0.0, weak)
        /System/Library/Frameworks/CoreImage.framework/CoreImage (compatibility version 1.0.0, current version 5.0.0)
        /System/Library/Frameworks/CoreLocation.framework/CoreLocation (compatibility version 1.0.0, current version 2780.0.17)
-       /System/Library/Frameworks/CoreML.framework/CoreML (compatibility version 1.0.0, current version 1.0.0, weak)
+       /System/Library/Frameworks/CoreML.framework/CoreML (compatibility version 1.0.0, current version 1.0.0)
        /System/Library/Frameworks/CoreMedia.framework/CoreMedia (compatibility version 1.0.0, current version 1.0.0)
        /System/Library/Frameworks/CoreServices.framework/CoreServices (compatibility version 1.0.0, current version 1226.0.0)
        /System/Library/Frameworks/CoreTelephony.framework/CoreTelephony (compatibility version 1.0.0, current version 0.0.0)
@@ -33,9 +33,9 @@
        /System/Library/Frameworks/Security.framework/Security (compatibility version 1.0.0, current version 60420.40.34)
        /System/Library/Frameworks/SystemConfiguration.framework/SystemConfiguration (compatibility version 1.0.0, current version 1241.40.2)
        /System/Library/Frameworks/UIKit.framework/UIKit (compatibility version 1.0.0, current version 6109.1.108)
-       /System/Library/Frameworks/UserNotifications.framework/UserNotifications (compatibility version 1.0.0, current version 1.0.0, weak)
+       /System/Library/Frameworks/UserNotifications.framework/UserNotifications (compatibility version 1.0.0, current version 1.0.0)
        /System/Library/Frameworks/VideoToolbox.framework/VideoToolbox (compatibility version 1.0.0, current version 1.0.0)
-       /System/Library/Frameworks/WebKit.framework/WebKit (compatibility version 1.0.0, current version 614.2.9, weak)
+       /System/Library/Frameworks/WebKit.framework/WebKit (compatibility version 1.0.0, current version 614.2.9)
        /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1319.0.0)
        /usr/lib/libbz2.1.0.dylib (compatibility version 1.0.0, current version 1.0.8)
        /usr/lib/libc++.1.dylib (compatibility version 1.0.0, current version 1300.32.0)
```
Both these changes are correct, WebKit is available from 8.0, UserNotifications from 10.0 and CoreML from 11.0. Instagram has a deployment target of 12.4.

Reviewed By: ebgraham

Differential Revision: D41348639

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89233
Approved by: https://github.com/malfet
---
 c2_defs.bzl | 25 +++----------------------
 1 file changed, 3 insertions(+), 22 deletions(-)

diff --git a/c2_defs.bzl b/c2_defs.bzl
index 0a89bb88093db..fedbb4bca84b7 100644
--- a/c2_defs.bzl
+++ b/c2_defs.bzl
@@ -221,26 +221,13 @@ def get_c2_fbobjc_ios_frameworks():
     frameworks = []
 
     if get_c2_mpscnn():
-        frameworks.append(
+        frameworks.extend([
             "$SDKROOT/System/Library/Frameworks/Metal.framework",
-        )
+            "$SDKROOT/System/Library/Frameworks/MetalPerformanceShaders.framework",
+        ])
 
     return frameworks
 
-def get_c2_fbobjc_linker_flags():
-    flags = []
-
-    if get_c2_mpscnn():
-        # Need linker flags as no platform_frameworks exist, and we can't
-        # use MPSCNN on x86_64.
-        # We use weak_framework as it's iOS 10
-        flags = [
-            "-L$SDKROOT/System/Library/Frameworks/MetalPerformanceShaders.framework",
-            "-weak_framework",
-            "MetalPerformanceShaders",
-        ]
-    return flags
-
 def get_c2_fbobjc_exported_preprocessor_flags():
     flags = []
 
@@ -311,12 +298,6 @@ def get_c2_default_cxx_args():
             STATIC_LIBRARY_IOS_CONFIG,
             extra_target_config = C2_FBOBJC_EXTRA_TARGET_CONFIG,
         ),
-        fbobjc_exported_platform_linker_flags = [
-            (
-                "iphoneos",
-                get_c2_fbobjc_linker_flags(),
-            ),
-        ],
         fbobjc_exported_platform_preprocessor_flags = [
             (
                 "iphoneos",

From 0a7a7670031c500c0fa2627ead222207b0d3f76e Mon Sep 17 00:00:00 2001
From: zhxchen17 <zhxchen17@fb.com>
Date: Thu, 17 Nov 2022 21:35:51 -0800
Subject: [PATCH 1076/1922] Use standard __func__ macro in symbolic shape.
 (#89264)

Summary:
I saw the following issue only on Windows build in PR #88767:
```
RuntimeError: AttributeError: 'SymNode' object has no attribute 'torch::impl::PythonSymNodeImpl::ge'
```
It's only on Windows because we get the attributes of SymNode in C++ with
`__FUNCTION__` macro, which is not in C++ standard, therefore has platform specific behavior.
In this case, MSVC will include a function's namespace and class name, which is not intended here.

Instead we should use `__func__`. see: https://en.cppreference.com/w/cpp/language/function#Function_definition

godbolt example to show the difference: https://godbolt.org/z/PGfvecxPx

Test Plan:
CI

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89264
Approved by: https://github.com/ezyang
---
 torch/csrc/utils/python_symnode.h | 38 +++++++++++++++----------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/torch/csrc/utils/python_symnode.h b/torch/csrc/utils/python_symnode.h
index 3a9fa79d37d6e..00bddfb9e4dc3 100644
--- a/torch/csrc/utils/python_symnode.h
+++ b/torch/csrc/utils/python_symnode.h
@@ -94,78 +94,78 @@ class PythonSymNodeImpl : public c10::SymNodeImpl {
   }
 
   c10::SymNode add(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode sub(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode mul(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode truediv(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode pow(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode floordiv(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode mod(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode eq(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode gt(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode lt(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode le(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode ge(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode min(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
   c10::SymNode max(const c10::SymNode& other) override {
-    return dispatch_common_(__FUNCTION__, other);
+    return dispatch_common_(__func__, other);
   }
 
   c10::SymNode ceil() override {
-    return dispatch_common_(__FUNCTION__);
+    return dispatch_common_(__func__);
   }
 
   c10::SymNode floor() override {
-    return dispatch_common_(__FUNCTION__);
+    return dispatch_common_(__func__);
   }
 
   c10::SymNode neg() override {
-    return dispatch_common_(__FUNCTION__);
+    return dispatch_common_(__func__);
   }
 
   c10::SymNode clone() override {
-    return dispatch_common_(__FUNCTION__);
+    return dispatch_common_(__func__);
   }
 
   c10::SymNode sym_float() override {
-    return dispatch_common_(__FUNCTION__);
+    return dispatch_common_(__func__);
   }
 
   py::handle getPyObj() {

From 41d702c841043b39d4c0c24ded6b856527ee1287 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Fri, 18 Nov 2022 00:15:45 -0800
Subject: [PATCH 1077/1922] [quant][fx][be] Refactor replace observer with q/dq
 op code (#89247)

Summary:
This is a refactor to prepare for future extensions, no functionality changes

Test Plan:
python test/test_quantization.py TestQuantizeFx

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89247
Approved by: https://github.com/vkuzo, https://github.com/andrewor14
---
 torch/ao/quantization/fx/convert.py | 132 +++++++++++++++++++++-------
 1 file changed, 98 insertions(+), 34 deletions(-)

diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index ca6ae61a4c97f..f09785679e37b 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -53,12 +53,15 @@
     _get_module,
     _is_custom_module_lstm,
     get_custom_module_class_keys,
-    get_quantize_node_info,
     create_getattr_from_value,
     collect_producer_nodes,
     graph_module_from_producer_nodes,
     node_arg_is_weight,
 )
+from torch.ao.quantization.utils import (
+    is_per_channel,
+    to_underlying_dtype,
+)
 from torch.ao.quantization.quantize import (
     _remove_qconfig,
 )
@@ -107,51 +110,103 @@ def _replace_observer_with_quantize_dequantize_node(
     assert modules is not None
     assert isinstance(node.target, str)
     module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
-    observer_module = modules[node.target]
-    maybe_quantize_node_info = get_quantize_node_info(observer_module, is_decomposed)
+    activation_post_process = modules[node.target]
     # Skip replacing observers to quant/dequant nodes if the qconfigs of all
     # consumers and producers of this observer are None
     skip_replacement = all([
         has_none_qconfig(n, node_name_to_qconfig) for n in
         list(node.args) + list(node.users.keys())])
-    if skip_replacement or maybe_quantize_node_info is None:
-        # didn't find correponding quantize op and info for the observer_module
+    if skip_replacement or not _is_conversion_supported(activation_post_process):
+        # didn't find correponding quantize op and info for the activation_post_process
         # so we just remove the observer
         with graph.inserting_before(node):
             node.replace_all_uses_with(node.args[0])
             graph.erase_node(node)
-    else:
-        # otherwise, we can convert the observer moduel call to quantize/dequantize node
-        node_type, quantize_op, qparams = maybe_quantize_node_info
-        # replace observer node with quant - dequant node
-        with graph.inserting_before(node):
-            input_node = node.args[0]
-            quantize_op_inputs = [input_node]
-            for key, value in qparams.items():
-                # TODO: we can add the information of whether a value needs to
-                # be registered as an attribute in qparams dict itself
-                if key in ['_scale_', '_zero_point_']:
-                    # For scale and zero_point values we register them as buffers in the root module.
-                    # TODO: maybe need more complex attr name here
-                    qparam_node = create_getattr_from_value(model, graph, module_path + prefix + key, value)
-                    quantize_op_inputs.append(qparam_node)
-                else:
-                    # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
-                    quantize_op_inputs.append(value)
+        return
 
-            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+    # otherwise, we can convert the observer module call to quantize/dequantize node
+    # 1. extract the information from activation_post_process module for generating
+    # the quantize and dequantize operator
+    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
+    compute_dtype = None
+    if hasattr(activation_post_process, "compute_dtype"):
+        compute_dtype = activation_post_process.compute_dtype  # type: ignore[attr-defined]
+    quantize_op : Optional[Union[Callable, str]] = None
+    if dtype in [torch.quint8, torch.qint8, torch.qint32] and \
+            not hasattr(activation_post_process, 'compute_dtype'):
+        node_type = "call_function"
+        scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined]
+        if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
+            ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined]
+            qparams = {"_scale_": scale, "_zero_point_": zero_point, "_axis_": ch_axis, "_dtype_": dtype}
             if is_decomposed:
-                # use the same qparams from quantize op
-                dq_inputs = [quantized_node] + quantize_op_inputs[1:]
-                dequantized_node = graph.call_function(
-                    torch.ops.quantized_decomposed.dequantize_per_tensor,
-                    tuple(dq_inputs),
-                    {}
-                )
+                raise NotImplementedError("decomposed quantize_per_channel op not implemented yet")
             else:
-                dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
-            node.replace_all_uses_with(dequantized_node)
-            graph.erase_node(node)
+                quantize_op = torch.quantize_per_channel
+        else:
+            scale = float(scale)
+            zero_point = int(zero_point)
+            if is_decomposed:
+                quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
+                quant_max = activation_post_process.quant_max  # type: ignore[attr-defined]
+                dtype = to_underlying_dtype(dtype)
+                qparams = {
+                    "_scale_": scale,
+                    "_zero_point_": zero_point,
+                    "_quant_min": quant_min,
+                    "_quant_max": quant_max,
+                    "_dtype_": dtype
+                }
+                quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor
+            else:
+                qparams = {"_scale_": scale, "_zero_point_": zero_point, "_dtype_": dtype}
+                quantize_op = torch.quantize_per_tensor
+    elif compute_dtype in [torch.quint8, torch.qint8, torch.float16]:
+        # TODO(future PR): switch compute_dtype to is_dynamic
+        # dynamic quantization
+        node_type = "call_function"
+        if is_decomposed:
+            raise NotImplementedError("decomposed quantize_per_tensor_dynamic op not implemented yet")
+        else:
+            quantize_op = torch.quantize_per_tensor_dynamic
+        # TODO: get reduce range from observer
+        # reduce_range = activation_post_process.reduce_range
+        reduce_range = torch.backends.quantized.engine in ("fbgemm", "x86")
+        qparams = {"_dtype_": compute_dtype, "_reduce_range_": reduce_range}
+    elif dtype == torch.float16:
+        node_type = "call_method"
+        quantize_op = "to"
+        qparams = {"_dtype_": dtype}
+
+    # 2. replace observer node with quant - dequant node
+    with graph.inserting_before(node):
+        input_node = node.args[0]
+        quantize_op_inputs = [input_node]
+        for key, value in qparams.items():
+            # TODO: we can add the information of whether a value needs to
+            # be registered as an attribute in qparams dict itself
+            if key in ['_scale_', '_zero_point_']:
+                # For scale and zero_point values we register them as buffers in the root module.
+                # TODO: maybe need more complex attr name here
+                qparam_node = create_getattr_from_value(model, graph, module_path + prefix + key, value)
+                quantize_op_inputs.append(qparam_node)
+            else:
+                # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
+                quantize_op_inputs.append(value)
+
+        quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+        if is_decomposed:
+            # use the same qparams from quantize op
+            dq_inputs = [quantized_node] + quantize_op_inputs[1:]
+            dequantized_node = graph.call_function(
+                torch.ops.quantized_decomposed.dequantize_per_tensor,
+                tuple(dq_inputs),
+                {}
+            )
+        else:
+            dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
+        node.replace_all_uses_with(dequantized_node)
+        graph.erase_node(node)
 
 # this is a temporary hack for custom module, we may want to implement
 # this properly after the custom module class design is finalized
@@ -166,6 +221,15 @@ def _replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Gr
     graph.erase_node(node)
     insert_dequantize_node(call_custom_module_node, graph)
 
+def _is_conversion_supported(activation_post_process: torch.nn.Module) -> bool:
+    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
+    compute_dtype = None
+    if hasattr(activation_post_process, "compute_dtype"):
+        compute_dtype = activation_post_process.compute_dtype  # type: ignore[attr-defined]
+    return (dtype in [torch.quint8, torch.qint8, torch.qint32] and compute_dtype is None) or \
+        compute_dtype in [torch.quint8, torch.qint8, torch.float16] or \
+        dtype == torch.float16
+
 def restore_state(
         observed: torch.nn.Module
 ) -> Tuple[Dict[str, Tuple[str, type]],

From 211d182567af82de2081bfad2b2f0420c117b13b Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Fri, 18 Nov 2022 10:56:03 +0000
Subject: [PATCH 1078/1922] Added partial decomposition of conv_backward and
 grad_bias computation (#89128)

`convolution_backward` often just kicks off the `sum` as a separate kernel. Splitting it off in a decomp allows us to fuse it into other ops: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Convolution.cpp#L2150

Improves `convnext_base` from 373 img/s => 383 img/s

Not sure what other models use convolution with bias haha.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89128
Approved by: https://github.com/ezyang
---
 test/inductor/test_torchinductor.py | 72 +++++++++++++++++++++++++++++
 torch/_inductor/decomposition.py    | 33 +++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index f2b1caeb32ea4..fc0ae82a2598f 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4101,6 +4101,78 @@ def fn(x):
             rtol=0.5,
         )
 
+    def test_conv_backward(self):
+        def fn(rank4_inps, rank3_inps, rank5_inps):
+
+            out1 = aten.convolution_backward(
+                *rank4_inps,
+                [C],
+                [1, 1],
+                [0, 0],
+                [1, 1],
+                False,
+                [0, 0],
+                1,
+                [True, True, True],
+            )
+            out2 = aten.convolution_backward(
+                *rank4_inps,
+                [C],
+                [1, 1],
+                [0, 0],
+                [1, 1],
+                False,
+                [0, 0],
+                1,
+                [True, False, False],
+            )
+            out3 = aten.convolution_backward(
+                *rank3_inps,
+                [C],
+                [1],
+                [0],
+                [1],
+                False,
+                [0],
+                1,
+                [True, True, True],
+            )
+            out4 = aten.convolution_backward(
+                *rank5_inps,
+                [C],
+                [1, 1, 1],
+                [0, 0, 0],
+                [1, 1, 1],
+                False,
+                [0, 0, 0],
+                1,
+                [True, True, True],
+            )
+            return (out1, out2, out3, out4)
+
+        B = 3
+        C = 4
+        H = 5
+        grad_out = torch.randn(B, C, H - 2, H - 2, H - 2)
+        inp = torch.randn(B, C, H, H, H)
+        weight = torch.randn(C, C, 3, 3, 3)
+
+        def shrink_rank(x, rank):
+            res = x
+            while res.dim() > rank:
+                res = torch.select(res, -1, 0)
+            return res.contiguous()
+
+        rank4_inps = [shrink_rank(x, 4) for x in [grad_out, inp, weight]]
+        rank3_inps = [shrink_rank(x, 4) for x in [grad_out, inp, weight]]
+        rank5_inps = [shrink_rank(x, 5) for x in [grad_out, inp, weight]]
+
+        with torch.backends.cudnn.flags(allow_tf32=False):
+            self.common(
+                fn,
+                [rank4_inps, rank3_inps, rank5_inps],
+            )
+
     @unittest.skip(
         """
         FIXME: In the case of having equally max/min elements, our implementation returns
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 3254f174b495b..09ee53579345a 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -317,6 +317,39 @@ def bmm_decomp(mat1, mat2):
     return NotImplemented  # go directly to lowering
 
 
+@register_decomposition([aten.convolution_backward])
+def convolution_backward(
+    grad_output,
+    input,
+    weight,
+    bias_sizes,
+    stride,
+    padding,
+    dilation,
+    transposed,
+    output_padding,
+    groups,
+    output_mask,
+):
+    if not output_mask[2] or grad_output.device.type != "cuda":
+        return NotImplemented
+    grad_bias = aten.sum(grad_output, [0] + list(range(2, grad_output.dim())))
+    grad_inp, grad_weight, _ = aten.convolution_backward(
+        grad_output,
+        input,
+        weight,
+        bias_sizes,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+        [output_mask[0], output_mask[1], False],
+    )
+    return (grad_inp, grad_weight, grad_bias)
+
+
 @register_decomposition([aten.rsqrt])
 def rsqrt(x):
     return torch.reciprocal(torch.sqrt(x))

From d19bdc8b43abf123632995b0b7a70da2b25f21cb Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 18 Nov 2022 18:43:15 +0000
Subject: [PATCH 1079/1922] [Dynamo] Support Tensor.nelement &
 torch.cuda.is_available (#89164)

Fix several errors in [7k github models](https://github.com/pytorch/torchdynamo/issues/1198).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89164
Approved by: https://github.com/soumith
---
 test/dynamo/test_misc.py          | 17 +++++++++++++++--
 torch/_dynamo/variables/tensor.py |  2 +-
 torch/_dynamo/variables/torch.py  |  1 +
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 2825b157bc68e..e3274738fc21f 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -419,10 +419,10 @@ def fn(x):
 
     def test_numel(self):
         def fn(a):
-            return a + a.numel() + torch.numel(a)
+            return (a + a.numel() + torch.numel(a), a + a.nelement())
 
         return torch._dynamo.testing.standard_test(
-            self, fn=fn, nargs=1, expected_ops=2, expected_ops_dynamic=4
+            self, fn=fn, nargs=1, expected_ops=3, expected_ops_dynamic=6
         )
 
     def test_pair(self):
@@ -2963,6 +2963,19 @@ def forward(self, x):
         res = opt_model(x)
         self.assertTrue(same(ref, res))
 
+    def test_torch_cuda_is_available(self):
+        def fn(x):
+            if torch.cuda.is_available():
+                return x + 1
+            else:
+                return x - 1
+
+        x = torch.rand(4)
+        ref = fn(x)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(x)
+        self.assertTrue(same(ref, res))
+
 
 class CustomFunc(torch.autograd.Function):
     @staticmethod
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 8867f7e6cc93d..ab94aaf537d2d 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -203,7 +203,7 @@ def call_method(
                 ),
                 **options,
             )
-        elif name == "numel" and self.size is not None:
+        elif name in ("numel", "nelement") and self.size is not None:
             constant_result = ConstantVariable(product(self.size), **options)
         elif name in ("ndimension", "dim") and self.ndim is not None:
             constant_result = ConstantVariable(self.ndim, **options)
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 56e74503faca0..651f80b5d77d4 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -163,6 +163,7 @@ def can_constant_fold_through(self):
             torch.finfo,
             torch.iinfo,
             torch.is_floating_point,
+            torch.cuda.is_available,
         ):
             return True
         return getattr(self.value, "__module__", None) == "math"

From 960276bb9235d8c98a40ba44be14ecfbaf3686e7 Mon Sep 17 00:00:00 2001
From: vfdev-5 <vfdev.5@gmail.com>
Date: Fri, 18 Nov 2022 18:46:50 +0000
Subject: [PATCH 1080/1922] Vectorized horizontal flip implementation (#88989)

When we benchmarked image processing transforms in torchvision : tensor vs pillow we saw that horizontal flip on uint8 data `(3, X, X)` is 2-3x slower.

Due to the fact that output's first stride is negative, implementation does a simple data copy using [`basic_loop`](https://github.com/pytorch/pytorch/blob/8371bb8a3dddbead709bc1e9d26715818a34fa8a/aten/src/ATen/native/cpu/Loops.h#L286). In this PR, a vectorized path is added for horizontal flip op for dtypes: uint8, int, float32, long and double and there is a speed-up that reduces the gap between PIL and tensor ops

```
CPU capability usage: AVX2

[----------------------------------------------------------------- Horizontal flip -----------------------------------------------------------------]
                                                 |  torch (1.14.0a0+git2ed1d29) PR  |    Pillow (9.3.0)   |  torch (1.14.0.dev20221116+cu116) nightly
1 threads: ------------------------------------------------------------------------------------------------------------------------------------------
      channels=3, size=256, dtype=torch.int64    |        101.307 (+-0.904)         |                     |             111.364 (+-0.328)
      channels=3, size=520, dtype=torch.int64    |        462.369 (+-2.184)         |                     |             505.602 (+-0.541)
      channels=3, size=712, dtype=torch.int64    |        1855.441 (+-6.528)        |                     |             1828.370 (+-8.600)

      channels=1, size=256, dtype=torch.int32    |         22.282 (+-0.130)         |   44.218 (+-0.936)  |              34.651 (+-0.162)
      channels=1, size=520, dtype=torch.int32    |         72.180 (+-0.076)         |  166.639 (+-1.180)  |             118.820 (+-0.210)
      channels=1, size=712, dtype=torch.int32    |        129.621 (+-0.649)         |  307.140 (+-2.221)  |             216.104 (+-0.793)

      channels=3, size=256, dtype=torch.uint8    |         51.685 (+-0.200)         |   44.171 (+-0.818)  |             361.611 (+-0.276)
      channels=3, size=520, dtype=torch.uint8    |        223.320 (+-0.726)         |  166.607 (+-2.256)  |             1462.012 (+-4.917)
      channels=3, size=712, dtype=torch.uint8    |        423.298 (+-1.156)         |  307.067 (+-1.999)  |             2738.481 (+-1.715)

      channels=1, size=256, dtype=torch.float32  |         22.281 (+-0.056)         |   44.149 (+-0.808)  |              35.316 (+-0.028)
      channels=1, size=520, dtype=torch.float32  |         72.268 (+-0.106)         |  166.631 (+-1.212)  |             119.504 (+-0.340)
      channels=1, size=712, dtype=torch.float32  |        129.777 (+-0.632)         |  307.078 (+-1.909)  |             216.987 (+-0.185)

      channels=1, size=256, dtype=torch.float16  |         32.789 (+-0.081)         |                     |              34.044 (+-0.039)
      channels=1, size=520, dtype=torch.float16  |        112.693 (+-0.478)         |                     |             117.445 (+-0.125)
      channels=1, size=712, dtype=torch.float16  |        203.644 (+-0.791)         |                     |             213.283 (+-0.397)

      channels=3, size=256, dtype=torch.float64  |        102.058 (+-0.333)         |                     |             108.404 (+-0.346)
      channels=3, size=520, dtype=torch.float64  |        473.139 (+-1.327)         |                     |             503.265 (+-0.365)
      channels=3, size=712, dtype=torch.float64  |        1854.489 (+-9.513)        |                     |             1844.345 (+-1.371)

      channels=1, size=256, dtype=torch.int16    |         11.927 (+-0.056)         |                     |              33.993 (+-0.037)
      channels=1, size=520, dtype=torch.int16    |         39.724 (+-0.148)         |                     |             117.577 (+-0.153)
      channels=1, size=712, dtype=torch.int16    |         68.264 (+-0.133)         |                     |             213.118 (+-0.157)

Times are in microseconds (us).

```

```
CPU capability usage: AVX512

[----------------------------------------------------------------- Horizontal flip ------------------------------------------------------------------]
                                                 |  torch (1.14.0a0+git2ed1d29) PR  |    Pillow (9.3.0)    |  torch (1.14.0.dev20221118+cu116) nightly
1 threads: -------------------------------------------------------------------------------------------------------------------------------------------
      channels=3, size=256, dtype=torch.int64    |        131.244 (+-1.954)         |                      |             135.649 (+-4.066)
      channels=3, size=520, dtype=torch.int64    |        522.032 (+-4.660)         |                      |             539.822 (+-10.420)
      channels=3, size=712, dtype=torch.int64    |       1041.111 (+-53.575)        |                      |            1322.411 (+-80.017)

      channels=1, size=256, dtype=torch.int32    |         10.108 (+-0.414)         |   49.164 (+-1.000)   |              34.606 (+-0.865)
      channels=1, size=520, dtype=torch.int32    |         93.218 (+-1.417)         |  191.985 (+-5.047)   |             133.664 (+-5.372)
      channels=1, size=712, dtype=torch.int32    |        167.919 (+-2.854)         |  353.574 (+-6.568)   |             246.162 (+-5.753)

      channels=3, size=256, dtype=torch.uint8    |         34.710 (+-0.541)         |   49.005 (+-0.923)   |             136.603 (+-2.339)
      channels=3, size=520, dtype=torch.uint8    |        154.873 (+-3.049)         |  191.729 (+-4.997)   |             534.329 (+-10.754)
      channels=3, size=712, dtype=torch.uint8    |        290.319 (+-4.819)         |  351.619 (+-6.978)   |             997.119 (+-33.086)

      channels=1, size=256, dtype=torch.float32  |         10.345 (+-0.338)         |   49.105 (+-0.942)   |              35.478 (+-0.733)
      channels=1, size=520, dtype=torch.float32  |         81.131 (+-5.281)         |  191.697 (+-4.555)   |             133.554 (+-4.193)
      channels=1, size=712, dtype=torch.float32  |        169.581 (+-3.476)         |  352.995 (+-10.792)  |             251.089 (+-7.485)

      channels=1, size=256, dtype=torch.float16  |         35.259 (+-0.612)         |                      |              35.154 (+-0.924)
      channels=1, size=520, dtype=torch.float16  |        132.407 (+-1.980)         |                      |             131.850 (+-5.611)
      channels=1, size=712, dtype=torch.float16  |        240.192 (+-5.479)         |                      |             239.555 (+-7.273)

      channels=3, size=256, dtype=torch.float64  |        129.649 (+-2.349)         |                      |             130.429 (+-6.240)
      channels=3, size=520, dtype=torch.float64  |        548.534 (+-5.179)         |                      |             622.568 (+-25.720)
      channels=3, size=712, dtype=torch.float64  |       1208.091 (+-77.095)        |                      |            1679.204 (+-316.292)

      channels=1, size=256, dtype=torch.int16    |         7.801 (+-0.115)          |                      |              34.517 (+-0.482)
      channels=1, size=520, dtype=torch.int16    |         36.010 (+-0.855)         |                      |             131.001 (+-1.686)
      channels=1, size=712, dtype=torch.int16    |         87.395 (+-1.355)         |                      |             237.731 (+-4.181)

Times are in microseconds (us).
```

[Source](https://gist.github.com/vfdev-5/c0421f54c8aed655b042dd1ce4cb621e)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88989
Approved by: https://github.com/lezcano, https://github.com/datumbox, https://github.com/peterbell10, https://github.com/ngimel
---
 aten/src/ATen/cpu/vec/vec256/vec256.h    | 45 ++++++++++++
 aten/src/ATen/cpu/vec/vec512/vec512.h    | 50 +++++++++++++
 aten/src/ATen/cpu/vec/vec_base.h         | 12 ++++
 aten/src/ATen/native/cpu/IndexKernel.cpp | 92 ++++++++++++++++++++++++
 4 files changed, 199 insertions(+)

diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h
index 98ec588137ce3..d0a8cb03604a9 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
@@ -222,6 +222,51 @@ inline deinterleave2<float>(const Vectorized<float>& a, const Vectorized<float>&
                         _mm256_permute2f128_ps(a_grouped, b_grouped, 0b0110001)); // 1, 3.   4 bits apart
 }
 
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<>
+inline Vectorized<float> flip(const Vectorized<float> & v) {
+  const __m256i mask_float = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm256_permutevar8x32_ps(v, mask_float);
+}
+
+template<>
+inline Vectorized<double> flip(const Vectorized<double> & v) {
+  return _mm256_permute4x64_pd(v, 27);  // 27 == _MM_SHUFFLE(0, 1, 2, 3)
+}
+
+template<>
+inline Vectorized<int64_t> flip(const Vectorized<int64_t> & v) {
+  return _mm256_permute4x64_epi64(v, 27);  // 27 == _MM_SHUFFLE(0, 1, 2, 3)
+}
+
+template<>
+inline Vectorized<int32_t> flip(const Vectorized<int32_t> & v) {
+  const __m256i mask_int32 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm256_permutevar8x32_epi32(v, mask_int32);
+}
+
+template<>
+inline Vectorized<int16_t> flip(const Vectorized<int16_t> & v) {
+  const __m256i mask = _mm256_set_epi8(
+    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
+  );
+  auto reversed = _mm256_shuffle_epi8(v, mask);
+  return _mm256_permute2x128_si256(reversed, reversed, 1);
+}
+
+template<>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  const __m256i mask_int8 = _mm256_set_epi8(
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  );
+  auto reversed = _mm256_shuffle_epi8(v, mask_int8);
+  return _mm256_permute2x128_si256(reversed, reversed, 1);
+}
+
+
 #endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
 
 }}}
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h
index 0c6f33fa08a06..dd1235e82eced 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
@@ -190,6 +190,56 @@ inline deinterleave2<float>(const Vectorized<float>& a, const Vectorized<float>&
                         _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b));
 }
 
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<>
+inline Vectorized<float> flip(const Vectorized<float> & v) {
+  const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7,
+                                        8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm512_permutexvar_ps(mask, v);
+}
+
+template<>
+inline Vectorized<double> flip(const Vectorized<double> & v) {
+  const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm512_permutexvar_pd(mask, v);
+}
+
+template<>
+inline Vectorized<int64_t> flip(const Vectorized<int64_t> & v) {
+  const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm512_permutexvar_epi64(mask, v);
+}
+
+template<>
+inline Vectorized<int32_t> flip(const Vectorized<int32_t> & v) {
+  const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7,
+                                        8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm512_permutexvar_epi32(mask, v);
+}
+
+template<>
+inline Vectorized<int16_t> flip(const Vectorized<int16_t> & v) {
+  const __m512i mask = _mm512_set_epi16(
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+  );
+  return _mm512_permutexvar_epi16(mask, v);
+}
+
+template<>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  const __m512i mask1 = _mm512_set_epi8(
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  );
+  const __m512i mask2 = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
+  auto reversed_vec = _mm512_shuffle_epi8(v, mask1);
+  return _mm512_permutexvar_epi64(mask2, reversed_vec);
+}
+
 #endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
 
 }}}
diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
index f045437ac3689..e9e87fa605f79 100644
--- a/aten/src/ATen/cpu/vec/vec_base.h
+++ b/aten/src/ATen/cpu/vec/vec_base.h
@@ -1001,4 +1001,16 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) {
   }
 }
 
+template <typename T>
+inline Vectorized<T> flip(const Vectorized<T> & data) {
+  static constexpr int size = Vectorized<T>::size();
+  T output[size];
+  T buffer[size];
+  data.store(static_cast<void*>(buffer));
+  for (const auto i : c10::irange(size)) {
+    output[i] = buffer[size - i - 1];
+  }
+  return Vectorized<T>::loadu(static_cast<void*>(output));
+}
+
 }}}
diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp
index be0dc3301a006..81e135d1e7498 100644
--- a/aten/src/ATen/native/cpu/IndexKernel.cpp
+++ b/aten/src/ATen/native/cpu/IndexKernel.cpp
@@ -457,6 +457,75 @@ void masked_select_kernel(TensorIterator& iter, int64_t result_stride) {
     });
 }
 
+
+template <typename scalar_t>
+void cpu_hflip_vec(at::TensorIterator& iter) {
+
+  auto loop2d = [&](char** base, const int64_t *strides, int64_t size0, int64_t size1) {
+
+    static constexpr int ntensors = 3;
+    std::array<char*, ntensors> data_arr;
+    std::copy_n(base, ntensors, data_arr.data());
+    const int64_t *outer_strides = &strides[ntensors];
+
+    using Vec = Vectorized<scalar_t>;
+
+    constexpr auto stride = sizeof(scalar_t);
+    TORCH_INTERNAL_ASSERT(stride == -strides[0] && stride == strides[1]);
+
+    for (const auto j C10_UNUSED : c10::irange(size1)) {
+
+      // vectorized loop with negative stride for output
+      char** C10_RESTRICT data_ = data_arr.data();
+      int64_t n = size0;
+
+      char* C10_RESTRICT data[ntensors];
+      for (const auto arg : c10::irange(ntensors)) {
+        data[arg] = data_[arg];
+      }
+
+      int64_t i = 0;
+
+      // data[0] unaligned pre-pass
+      int64_t offset = (j * n + (n - i - Vec::size())) % 32;
+      offset = (offset >= n) ? n : offset;
+      for (; i < offset; i++) {
+        scalar_t* out_ptr = (scalar_t*)(data[0] - i * stride);
+        *out_ptr = *(scalar_t *)(data[1] + i * stride);
+      }
+      // Empirically found that it is faster to process 3 data items together vs 2 or 4
+      for (; i <= n - 3 * Vec::size(); i += 3 * Vec::size()) {
+        auto out1 = Vec::loadu(data[1] + i * stride);
+        auto out2 = Vec::loadu(data[1] + (i + Vec::size()) * stride);
+        auto out3 = Vec::loadu(data[1] + (i + 2 * Vec::size()) * stride);
+        // flip the vector: 1234 -> 4321
+        out1 = flip(out1);
+        out2 = flip(out2);
+        out3 = flip(out3);
+        out1.store(data[0] - (i + Vec::size() - 1) * stride);
+        out2.store(data[0] - (i + 2 * Vec::size() - 1) * stride);
+        out3.store(data[0] - (i + 3 * Vec::size() - 1) * stride);
+      }
+      if (i < n) {
+        for (; i < n; i++) {
+          scalar_t* out_ptr = (scalar_t*)(data[0] - i * stride);
+          *out_ptr = *(scalar_t *)(data[1] + i * stride);
+        }
+      }
+
+      // advance:
+      for (const auto arg : c10::irange(data_arr.size())) {
+        data_arr[arg] += outer_strides[arg];
+      }
+    }
+  };
+
+  int64_t grain_size = at::internal::GRAIN_SIZE;
+  iter.for_each(loop2d, grain_size);
+  iter.cast_outputs();
+}
+
+
 void flip_kernel(TensorIterator& iter, const bool quantized) {
   if (quantized) {
     AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(iter.dtype(), "flip_quantized_cpu",
@@ -466,6 +535,29 @@ void flip_kernel(TensorIterator& iter, const bool quantized) {
         });
     });
   } else {
+    // Special case: horizontal flip with vectorization and input is contiguous
+    // Context: horizontal flip leads to strides[0] < 0 and
+    // thus is_contiguous condition is not satisfied and non-vectorized code path is taken.
+    auto output_strides = iter.strides(0);
+    auto input_strides = iter.strides(1);
+    if (iter.ndim() > 0 && output_strides[0] < 0 && input_strides[0] == iter.element_size(1)) {
+      auto iter_dtype = iter.dtype();
+      if (iter_dtype == kByte) {
+        return cpu_hflip_vec<uint8_t>(iter);
+      } else if (iter_dtype == kFloat) {
+        return cpu_hflip_vec<float>(iter);
+      } else if (iter_dtype == kInt) {
+        return cpu_hflip_vec<int32_t>(iter);
+      } else if (iter_dtype == kShort) {
+        return cpu_hflip_vec<int16_t>(iter);
+      } else if (iter_dtype == kLong) {
+        return cpu_hflip_vec<int64_t>(iter);
+      } else if (iter_dtype == kDouble) {
+        return cpu_hflip_vec<double>(iter);
+      }
+      // other dtypes are handled below with cpu_kernel_vec
+    }
+
     AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu",
         [&iter] { cpu_kernel_vec(iter,
           [](scalar_t a, scalar_t /*dummy input*/) -> scalar_t {

From ca6a01966cc1f3b95b59344848956d67922cf15e Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Fri, 18 Nov 2022 18:55:33 +0000
Subject: [PATCH 1081/1922] Set make max load when building libtorch (#89237)

The nccl build is still OOM sometimes when using `$(MAKE)`:

```
virtual memory exhausted: Cannot allocate memory
Makefile:73: recipe for target '/var/lib/jenkins/cpp-build/caffe2/build/nccl/obj/collectives/device/devlink.o' failed
make[5]: *** [/var/lib/jenkins/cpp-build/caffe2/build/nccl/obj/collectives/device/devlink.o] Error 1
make[5]: Leaving directory '/var/lib/jenkins/workspace/third_party/nccl/nccl/src/collectives/device'
```

* https://github.com/pytorch/pytorch/actions/runs/3476485191/jobs/5811758058
* https://github.com/pytorch/pytorch/actions/runs/3422228421/jobs/5702153639

So trying to set the same limit here as when building with ninja

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89237
Approved by: https://github.com/malfet
---
 cmake/External/nccl.cmake | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/cmake/External/nccl.cmake b/cmake/External/nccl.cmake
index cb928baf3a595..160d2b648c051 100644
--- a/cmake/External/nccl.cmake
+++ b/cmake/External/nccl.cmake
@@ -15,23 +15,24 @@ if(NOT __NCCL_INCLUDED)
     # this second replacement is needed when there are multiple archs
     string(REPLACE ";-gencode" " -gencode" NVCC_GENCODE "${NVCC_GENCODE}")
 
-    if("${CMAKE_GENERATOR}" MATCHES "Make")
-      # Recursive make with jobserver for parallelism
-      set(MAKE_COMMAND "$(MAKE)")
+    if(DEFINED ENV{MAX_JOBS})
+      set(MAX_JOBS "$ENV{MAX_JOBS}")
     else()
-      if(DEFINED ENV{MAX_JOBS})
-        set(MAX_JOBS "$ENV{MAX_JOBS}")
-      else()
-        include(ProcessorCount)
-        ProcessorCount(NUM_HARDWARE_THREADS)
-        # Assume 2 hardware threads per cpu core
-        math(EXPR MAX_JOBS "${NUM_HARDWARE_THREADS} / 2")
-        # ProcessorCount might return 0, set to a positive number
-        if(MAX_JOBS LESS 2)
-            set(MAX_JOBS 2)
-        endif()
+      include(ProcessorCount)
+      ProcessorCount(NUM_HARDWARE_THREADS)
+      # Assume 2 hardware threads per cpu core
+      math(EXPR MAX_JOBS "${NUM_HARDWARE_THREADS} / 2")
+      # ProcessorCount might return 0, set to a positive number
+      if(MAX_JOBS LESS 2)
+        set(MAX_JOBS 2)
       endif()
+    endif()
 
+    if("${CMAKE_GENERATOR}" MATCHES "Make")
+      # Recursive make with jobserver for parallelism, and also put a load limit
+      # here to avoid flaky OOM, https://www.gnu.org/software/make/manual/html_node/Parallel.html
+      set(MAKE_COMMAND "$(MAKE)" "-l${MAX_JOBS}")
+    else()
       # Parallel build with CPU load limit to avoid oversubscription
       set(MAKE_COMMAND "make" "-j${MAX_JOBS}" "-l${MAX_JOBS}")
     endif()

From 73f727549e5f716c1bf2ae6833bcdb172b7018a1 Mon Sep 17 00:00:00 2001
From: Zain Rizvi <zainr@fb.com>
Date: Fri, 18 Nov 2022 19:36:09 +0000
Subject: [PATCH 1082/1922] Remove --retry-all-errors from environment with old
 curl (#89298)

The version of curl on the `ubuntu-latest` box doesn't support the `--retry-all-errors` param and is breaking periodic builds

Example: https://github.com/pytorch/pytorch/actions/runs/3495466804/jobs/5852265880
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89298
Approved by: https://github.com/huydhn
---
 scripts/buck_setup.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/buck_setup.sh b/scripts/buck_setup.sh
index 331a299564167..f6152537435c2 100644
--- a/scripts/buck_setup.sh
+++ b/scripts/buck_setup.sh
@@ -22,16 +22,16 @@ python3 generate-xnnpack-wrappers.py
 # bazel-skylib
 printf "\nDownloading bazel-skylib\n"
 rm -rf bazel-skylib; mkdir bazel-skylib
-curl --retry 3 --retry-all-errors -L $PROXY https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz|tar zx -C bazel-skylib
+curl --retry 3 -L $PROXY https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz|tar zx -C bazel-skylib
 
 # glog
 printf "\nDownloading glog\n"
 rm -rf glog; mkdir glog
-curl --retry 3 --retry-all-errors -L $PROXY https://github.com/google/glog/archive/v0.4.0.tar.gz | tar zx -C glog --strip-components 1
+curl --retry 3 -L $PROXY https://github.com/google/glog/archive/v0.4.0.tar.gz | tar zx -C glog --strip-components 1
 
 # ruy
 printf "\nDownloading ruy\n"
-curl --retry 3 --retry-all-errors -L $PROXY -o /tmp/ruy.zip https://github.com/google/ruy/archive/a09683b8da7164b9c5704f88aef2dc65aa583e5d.zip
+curl --retry 3 -L $PROXY -o /tmp/ruy.zip https://github.com/google/ruy/archive/a09683b8da7164b9c5704f88aef2dc65aa583e5d.zip
 unzip -q /tmp/ruy.zip -d /tmp/
 rm -rf ruy/
 mv /tmp/ruy-a09683b8da7164b9c5704f88aef2dc65aa583e5d ruy/

From 7774d9a5a8e2f125c1cd146cec0a4fedd2330dda Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 18 Nov 2022 06:59:20 -0800
Subject: [PATCH 1083/1922] Symintify obeys_layout_contract (#89138)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89138
Approved by: https://github.com/bdhirsh
---
 torch/csrc/autograd/utils/grad_layout_contract.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/autograd/utils/grad_layout_contract.h b/torch/csrc/autograd/utils/grad_layout_contract.h
index 2addde79c8ec2..37dda0f9acaac 100644
--- a/torch/csrc/autograd/utils/grad_layout_contract.h
+++ b/torch/csrc/autograd/utils/grad_layout_contract.h
@@ -28,9 +28,9 @@ inline bool obeys_layout_contract(
     return false;
   } else if (variable.is_non_overlapping_and_dense()) {
     // Only look at stride for dimensions that are not of size 1.
-    const auto& grad_sizes = grad.sizes();
-    const auto& grad_strides = grad.strides();
-    const auto& variable_strides = variable.strides();
+    const auto& grad_sizes = grad.sym_sizes();
+    const auto& grad_strides = grad.sym_strides();
+    const auto& variable_strides = variable.sym_strides();
     for (const auto idx : c10::irange(grad_sizes.size())) {
       if (grad_sizes[idx] != 1) {
         if (grad_strides[idx] != variable_strides[idx]) {

From 78d11e0c706dcbdbc75213c94e98002f857c1d1a Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 18 Nov 2022 06:59:21 -0800
Subject: [PATCH 1084/1922] Don't trace when we track_tensor_tree (#89139)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89139
Approved by: https://github.com/bdhirsh
---
 torch/fx/experimental/proxy_tensor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py
index 8a51294c5a8fd..c3a5d706e3cc4 100644
--- a/torch/fx/experimental/proxy_tensor.py
+++ b/torch/fx/experimental/proxy_tensor.py
@@ -426,7 +426,9 @@ def wrap_key(f, tensors, tracer):
     def wrapped(*proxies):
         flat_proxies, proxies_spec = pytree.tree_flatten(proxies)
         assert len(flat_proxies) == len(flat_tensors)
-        track_tensor_tree(flat_tensors, flat_proxies, constant=None, tracer=tracer)
+        assert isinstance(_get_current_dispatch_mode(), ProxyTorchDispatchMode)
+        with _pop_mode_temporarily():
+            track_tensor_tree(flat_tensors, flat_proxies, constant=None, tracer=tracer)
 
         out = f(*tensors)
         out = pytree.tree_map_only(

From 959b2773a7ebfd5fbafab65cce0041ea92d3edc6 Mon Sep 17 00:00:00 2001
From: David Berard <dberard@fb.com>
Date: Thu, 17 Nov 2022 09:58:29 -0800
Subject: [PATCH 1085/1922] Re-enable test_hf_bert_fsdp (#89223)

It looks like this failure was actually caused by https://github.com/pytorch/pytorch/pull/88629, see the revert message on that PR. It probably just looked like a flaky test on CI because of how quickly the PR was reverted.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89223
Approved by: https://github.com/voznesenskym
---
 test/distributed/test_dynamo_distributed.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 21550a0120e46..b6bc16edb941a 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -258,8 +258,6 @@ def test_fsdp_inductor(self):
     # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert
     @patch.object(torch._inductor.config.triton, "cudagraphs", False)
     @patch.object(torch._inductor.config, "fallback_random", True)
-    # TODO(voz): Flaky on CI failure, consistent failure on local master.
-    @unittest.skipIf(True, "Flaky on CI failure, consistent failure on local master")
     def test_hf_bert_fsdp(self):
         from transformers.models.bert.modeling_bert import BertLayer
 

From 9e6816ed4d1a26432423c7ce003e830c9ff40883 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 17 Nov 2022 23:06:09 +0000
Subject: [PATCH 1086/1922] [DDP][Docs] Add warning that `no_sync()` should
 include forward (#89244)

The issue where the user only includes `loss.backward()` inside `no_sync()` but not the forward pass has arisen several times now. I think adding an explicit warning in the docs is worthwhile.

Rendered doc:
<img width="769" alt="Screen Shot 2022-11-17 at 9 21 32 PM" src="https://user-images.githubusercontent.com/31054793/202602005-22c000b7-1093-4eaf-ba66-9c929a66906b.png">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89244
Approved by: https://github.com/zhaojuanmao
---
 torch/nn/parallel/distributed.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index 47eb6bb2ebf1c..b6673874eecca 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -1001,6 +1001,10 @@ def no_sync(self):
             >>>   for input in inputs:
             >>>     ddp(input).backward()  # no synchronization, accumulate grads
             >>> ddp(another_input).backward()  # synchronize grads
+
+        .. warning::
+            The forward pass should be included inside the context manager, or
+            else gradients will still be synchronized.
         """
         old_require_backward_grad_sync = self.require_backward_grad_sync
         self.require_backward_grad_sync = False

From 9ccc61b6769df44110cd63f52b59930bdcfb8a89 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Thu, 17 Nov 2022 13:33:39 -0800
Subject: [PATCH 1087/1922] [Profiler] Don't raise SOFT_ASSERT in debug builds.
 (#89240)

Enough people are hitting this issue that we need to turn off hard failures until the fire rate is zero in steady state. (via scuba logging.)

Differential Revision: [D41382914](https://our.internmc.facebook.com/intern/diff/D41382914/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89240
Approved by: https://github.com/aaronenyeshi
---
 torch/csrc/profiler/util.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp
index f4fb4dd1eee12..6833e8abef70d 100644
--- a/torch/csrc/profiler/util.cpp
+++ b/torch/csrc/profiler/util.cpp
@@ -94,13 +94,7 @@ void setSoftAssertRaises(c10::optional<bool> value) {
 }
 
 bool softAssertRaises() {
-  return soft_assert_raises_.value_or(
-#ifdef NDEBUG
-      false
-#else
-      true
-#endif
-  );
+  return soft_assert_raises_.value_or(false);
 }
 
 // ----------------------------------------------------------------------------

From e7c945ed78229d1aff9484cc882320b40c026ab1 Mon Sep 17 00:00:00 2001
From: Bryce Long <blong@nvidia.com>
Date: Fri, 18 Nov 2022 22:36:05 +0000
Subject: [PATCH 1088/1922] Add NVTX markers that dump additional information
 for nvprim_nvfuser Dynamo graphs (#88259)

dump information on graphs that NVFuser JIT compiles:
- the markers show the list of ops, args, and inputs that make up the graph

also dumps information on FX nodes that are not touched by NVFuser:
- the markers show the op, name, and arg list of the node

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88259
Approved by: https://github.com/IvanYashchuk, https://github.com/jjsjann123, https://github.com/mruberry
---
 torch/_prims/nvfuser_executor.py | 47 ++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py
index 0f4e7b49fa27c..a155433231e11 100644
--- a/torch/_prims/nvfuser_executor.py
+++ b/torch/_prims/nvfuser_executor.py
@@ -28,6 +28,14 @@
 else:
     DataType = None
 
+import os
+
+
+@lru_cache(None)
+def get_nvprim_dump_nvtx():
+    return os.getenv("PYTORCH_NVFUSER_DUMP_NVTX")
+
+
 DEFAULT_NVFUSER_PYTHON_CONFIG = MappingProxyType(
     {
         "use_python_fusion_cache": True,
@@ -247,10 +255,30 @@ def nvfuser_execute(gm: GraphModule, *args, executor_parameters=None):
             arg for arg in flat_args if isinstance(arg, (torch.Tensor, Number))
         )
 
-        return tree_unflatten(
+        if get_nvprim_dump_nvtx():
+            torch.cuda.nvtx.range_push(
+                "fusion: {0}, graph: {1}".format(
+                    fusion.id(),
+                    str(
+                        [
+                            {
+                                "op": n.op,
+                                "name": n.name,
+                                "args": n.args,
+                                "kwargs": n.kwargs,
+                            }
+                            for n in gm.graph.nodes
+                        ]
+                    ),
+                )
+            )
+        result = tree_unflatten(
             fusion.execute(concrete_fusion_inputs),  # type: ignore[has-type]
             unflatten_spec,  # type: ignore[has-type]
         )
+        if get_nvprim_dump_nvtx():
+            torch.cuda.nvtx.range_pop()
+        return result
     else:
         warn(
             "nvfuser_executor is executed with non-cuda args, fallback to aten executor"
@@ -421,6 +449,18 @@ def maybe_partition_graph(
         return gm, any_unsupported
 
 
+class NVTXInterpreter(torch.fx.Interpreter):
+    def run_node(self, n):
+        torch.cuda.nvtx.range_push(
+            "name: {0}, args: {1}, op: {2}, kwargs: {3}".format(
+                n.name, n.args, n.op, n.kwargs
+            )
+        )
+        result = super().run_node(n)
+        torch.cuda.nvtx.range_pop()
+        return result
+
+
 def nvfuser_execute_partitioned(gm: GraphModule, *args, executor_parameters=None):
     executor_parameters = executor_parameters or DEFAULT_NVFUSER_PYTHON_CONFIG
     # maybe_partition_graph function is cached so we can't use non-hashable arguments
@@ -440,6 +480,9 @@ def nvfuser_execute_partitioned(gm: GraphModule, *args, executor_parameters=None
         use_python_fusion_cache=use_python_fusion_cache,
     )
     if is_partitioned:
-        return gm(*args)
+        if get_nvprim_dump_nvtx():
+            return NVTXInterpreter(gm).run(*args)
+        else:
+            return gm(*args)
     else:
         return nvfuser_execute(gm, *args, executor_parameters=executor_parameters)

From f66d67ac5e1ba99f1628a7fe23936b5ca0f99620 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwangmeta@meta.com>
Date: Fri, 18 Nov 2022 23:05:50 +0000
Subject: [PATCH 1089/1922] [Inductor CI] Use string format for cuda-arch-list
 input to prevent 8.0/9.0/10.0 etc from being interpreted as 8/9/10 (#89279)

Currently or in future whenever we change the cuda-arch-list to num.0, github action or some agent would pass just num to TORCH_CUDA_ARCH_LIST

This num is not regex matched during cuda arch analysis phase. (here: https://github.com/pytorch/pytorch/blob/c5fafb4e1694f141d8a1a31142cce4049d9057ed/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake#L229)
Example failure: https://github.com/weiwangmeta/pytorch/actions/runs/3495656108/jobs/5852735299
  Unknown CUDA Architecture Name 8 in CUDA_SELECT_NVCC_ARCH_FLAGS
This change reminds us to use e.g. '8.0', '9.0', '10.0' etc instead of 8.0, 9.0, 10.0 as GHA or some other agent may erroneously truncate it to pure numbers.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89279
Approved by: https://github.com/desertfire, https://github.com/atalman
---
 .github/workflows/inductor.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml
index eb953ff42321f..9179b186e9182 100644
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@@ -20,7 +20,7 @@ jobs:
     with:
       build-environment: linux-bionic-cuda11.6-py3.10-gcc7-sm86
       docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
-      cuda-arch-list: 8.6
+      cuda-arch-list: '8.6'
       test-matrix: |
         { include: [
           { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },

From a5350a6826d26ae8e1eeaf7faa57a49e29d4fbd7 Mon Sep 17 00:00:00 2001
From: Tran Le <quytranle@meta.com>
Date: Fri, 18 Nov 2022 23:19:14 +0000
Subject: [PATCH 1090/1922] [FX] Add type annotation to `getitem` node before
 `split_module` (#88510)

Summary: Some nodes lost the type annotation during `split_module`, causing the submodels to be un-scriptable. This is because compiler always infer Tensor type, which is wrong for non-Tensor types. We attempt to infer type annotation for `getitem` node to improve scriptability.

Test Plan:
```
buck2 test //caffe2/test:fx_experimental
```

Differential Revision: D41037819

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88510
Approved by: https://github.com/xush6528
---
 torch/fx/passes/split_module.py | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/torch/fx/passes/split_module.py b/torch/fx/passes/split_module.py
index c6954c2cc7177..0343bae94c31e 100644
--- a/torch/fx/passes/split_module.py
+++ b/torch/fx/passes/split_module.py
@@ -1,4 +1,5 @@
 import inspect
+import operator
 from typing import Any, Callable, Dict, List, Optional
 
 import torch
@@ -159,6 +160,25 @@ def record_cross_partition_use(
 
     # split nodes into parititons
     for node in m.graph.nodes:
+        # Annotations on local names within function are lost during FX transforms.
+        # Adding back known type annotation for getitem nodes for jit scriptability.
+        if node.target == operator.getitem:
+            sequence_node, index_node = node.args
+            # only support type Tuple for now
+            if (
+                hasattr(sequence_node.type, "_name")
+                and sequence_node.type._name == "Tuple"
+            ):
+                parameterized_types = sequence_node.type.__args__
+                if len(parameterized_types) == 2 and isinstance(
+                    parameterized_types[1], type(...)
+                ):
+                    node.type = parameterized_types[0]
+                else:
+                    assert len(parameterized_types) > index_node
+                    node_type = parameterized_types[index_node]
+                    node.type = node_type
+
         orig_nodes[node.name] = node
 
         # TODO currently placeholders/parameters aren't put into random partitions,
@@ -210,7 +230,10 @@ def record_cross_partition_use(
     for partition_name in sorted_partitions:
         partition = partitions[partition_name]
         for input in partition.inputs:
-            placeholder = partition.graph.placeholder(input)
+            placeholder = partition.graph.placeholder(
+                input,
+                type_expr=orig_nodes[input].type,
+            )
             placeholder.meta = orig_nodes[input].meta.copy()
             partition.environment[orig_nodes[input]] = placeholder
 
@@ -248,7 +271,11 @@ def record_cross_partition_use(
             assert isinstance(gathered_args, tuple)
             assert isinstance(gathered_kwargs, dict)
             new_node = partition.graph.create_node(
-                op=node.op, target=target, args=gathered_args, kwargs=gathered_kwargs
+                op=node.op,
+                target=target,
+                args=gathered_args,
+                kwargs=gathered_kwargs,
+                type_expr=node.type,
             )
             new_node.meta = node.meta.copy()
             partition.environment[node] = new_node

From 11d215d69c7c6ea54d1bda89735200e57da5ded4 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Fri, 18 Nov 2022 23:44:57 +0000
Subject: [PATCH 1091/1922] [BE] Print backtraces from coredumps (#89309)

By simply invoking `gdb python core -ex "bt" -ex "q"`

Test plan:
 See: [linux-focal-py3.7-gcc7 / test (default, 1, 2, linux.2xlarge)](https://github.com/pytorch/pytorch/actions/runs/3500498821/jobs/5863369649#step:14:39)
Not sure why multiprocessing tests SEGFAULT, but they do
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89309
Approved by: https://github.com/clee2000, https://github.com/huydhn
---
 .github/workflows/_linux-test.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index 16f25fed91212..454e558fbee49 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -192,6 +192,7 @@ jobs:
             -w /var/lib/jenkins/workspace \
             "${DOCKER_IMAGE}"
           )
+          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
           docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
 
       - name: Get workflow job id
@@ -216,6 +217,12 @@ jobs:
         with:
           file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
 
+      - name: Collect backtraces from coredumps (if any)
+        if: always()
+        run: |
+          # shellcheck disable=SC2156
+          find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
+
       - name: Store Core dumps on S3
         uses: seemethere/upload-artifact-s3@v5
         if: failure()

From d72c340d16855c73f152d533d218b4a0675e1b89 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 18 Nov 2022 13:14:40 -0800
Subject: [PATCH 1092/1922] Detach fake tensors into val, so they aren't
 affected by metadata mutation (#89140)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89140
Approved by: https://github.com/bdhirsh
---
 test/test_proxy_tensor.py             | 19 +++++++++++++++----
 torch/fx/experimental/proxy_tensor.py | 17 +++++++++++++++--
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index aa12b5b74d1c8..e174a14837919 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -401,6 +401,19 @@ def f(x):
             )
         )
 
+    def test_val_metadata_mutation(self):
+        def f(x):
+            y = x.clone()
+            y.unsqueeze_(0)
+            return y
+
+        traced = make_fx(f, tracing_mode=self.tracing_mode)(torch.randn(3, requires_grad=True))
+        self.assertEqual([
+            tuple(node.meta['val'].shape)
+            for node in traced.graph.nodes
+            if 'val' in node.meta
+        ], [(3,), (3,), (1, 3)])
+
     def test_make_fx_overloads(self):
         def f(x):
             return x.cos() + torch.randn(x.shape)
@@ -847,8 +860,7 @@ def forward(self, a_1):
     sym_size = torch.ops.aten.sym_size(a_1, 0);  a_1 = None
     mul = sym_size * 2;  sym_size = None
     empty = torch.ops.aten.empty.memory_format([mul], device = device(type='cpu'), pin_memory = False);  mul = None
-    detach = torch.ops.aten.detach.default(empty);  empty = None
-    return detach""")
+    return empty""")
 
 
     def test_neg_shape(self):
@@ -862,8 +874,7 @@ def forward(self, a_1):
     neg = -sym_size;  sym_size = None
     add = neg + 10;  neg = None
     empty = torch.ops.aten.empty.memory_format([add], device = device(type='cpu'), pin_memory = False);  add = None
-    detach = torch.ops.aten.detach.default(empty);  empty = None
-    return detach""")
+    return empty""")
 
     def test_sqrt_size(self):
         def f(a):
diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py
index c3a5d706e3cc4..daa17f94b7bb0 100644
--- a/torch/fx/experimental/proxy_tensor.py
+++ b/torch/fx/experimental/proxy_tensor.py
@@ -106,20 +106,33 @@ def get_proxy(obj):
 def has_proxy(obj):
     return get_proxy(obj) is not None
 
+def snapshot_fake(val):
+    return val.detach()
+
+# What invariants do we have for the 'val' set on the FX node?  It has accurate
+# metadata... but only for metadata that exists "below" all other subsystems
+# (most notably autograd, but also vmap, functorch transforms, etc).  This means
+# you can get the dtype, shape, stride, storage, but you CANNOT get requires_grad,
+# grad_fn, _base (_base actually may be set due to recursive call to
+# ADInplaceOrView, but you shouldn't rely on it.)
 def set_meta(proxy, val):
     if isinstance(val, FakeTensor):
-        proxy.node.meta['val'] = val
+        proxy.node.meta['val'] = snapshot_fake(val)
         proxy.node.meta['tensor_meta'] = _extract_tensor_metadata(val)
     elif isinstance(val, py_sym_types):
         proxy.node.meta['val'] = val
     elif isinstance(val, list) or isinstance(val, tuple):
         if all(isinstance(x, FakeTensor) for x in val):
-            proxy.node.meta['val'] = val
+            proxy.node.meta['val'] = [snapshot_fake(x) for x in val]
     elif isinstance(val, torch.Tensor):
         if not val.is_sparse:
             proxy.node.meta['tensor_meta'] = _extract_tensor_metadata(val)
             # NB: Kinda hacky, but we should try to get val as the metadata
             # everywhere
+            # TODO: This doesn't properly track storages.  A more robust
+            # approach would be to maintain a per-trace FakeTensorMode and
+            # from_real_tensor to create fake values (don't forget to
+            # snapshot_fake)
             fake_tensor_mode = FakeTensorMode(allow_fallback_kernels=True)
             with fake_tensor_mode:
                 proxy.node.meta['val'] = torch.empty_strided(val.shape, val.stride(), device=val.device, dtype=val.dtype)

From fc590ecd6b41d726fff503df8c3bbf8488d3e13c Mon Sep 17 00:00:00 2001
From: zhxchen17 <zhxchen17@fb.com>
Date: Sat, 19 Nov 2022 00:19:47 +0000
Subject: [PATCH 1093/1922] [functorch] introduce an experimental map() op.
 (#88767)

Summary:
We want to introduce an experimental control flow op: map() to export some models as FX graphs correctly.

Some calrification on basic requirements we have in mind:
1. This op can nest cond() and other control flow primitives internally.
2. We don't necessarily need loop carried dependencies for the models we've seen.
3. This map() op can handle dynamically shaped tensor as input and return dynamically shaped output based on input shapes.
4. We should be able to pass through additional arguments to the loop body as extra arguments.

In this diff we introduce a new control flow op `map()` which has the following semantics:
```
def map(f: Callable, xs: Tensor, *args):
    # one possible implementation:
    return torch.stack([f(x, *args) for x in xs])
```

Test Plan:
pytest functorch/test_control_flow.py
CI

Differential Revision: D41165796

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88767
Approved by: https://github.com/zou3519
---
 functorch/experimental/__init__.py     |   4 +-
 functorch/experimental/_map.py         | 105 +++++++++++++++++++++
 functorch/experimental/cond.py         |  48 ++++++++--
 functorch/experimental/control_flow.py |   1 +
 test/functorch/test_control_flow.py    | 122 ++++++++++++++++++++++++-
 torch/csrc/utils/python_dispatch.cpp   |  29 ++++--
 6 files changed, 290 insertions(+), 19 deletions(-)
 create mode 100644 functorch/experimental/_map.py
 create mode 100644 functorch/experimental/control_flow.py

diff --git a/functorch/experimental/__init__.py b/functorch/experimental/__init__.py
index ea874acafc425..3a4c92ffbe7a5 100644
--- a/functorch/experimental/__init__.py
+++ b/functorch/experimental/__init__.py
@@ -1,5 +1,5 @@
-from .batch_norm_replacement import replace_all_batch_norm_modules_
 # PyTorch forward-mode is not mature yet
-from .._src.eager_transforms import jvp, jacfwd, hessian
+from .._src.eager_transforms import hessian, jacfwd, jvp
 from .._src.vmap import chunk_vmap
+from .batch_norm_replacement import replace_all_batch_norm_modules_
 from functorch import functionalize
diff --git a/functorch/experimental/_map.py b/functorch/experimental/_map.py
new file mode 100644
index 0000000000000..d681526da4b34
--- /dev/null
+++ b/functorch/experimental/_map.py
@@ -0,0 +1,105 @@
+import torch
+import torch.utils._pytree as pytree
+from torch._C import DispatchKey, DispatchKeySet, ExcludeDispatchKeyGuard
+from torch._ops import PyOperator
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.fx.experimental.proxy_tensor import (
+    disable_proxy_modes_tracing,
+    get_proxy_slot,
+    make_fx,
+    ProxyTorchDispatchMode,
+    track_tensor_tree,
+)
+from torch.utils._python_dispatch import (
+    _get_current_dispatch_mode,
+    _pop_mode_temporarily,
+)
+from torch.utils._pytree import tree_flatten
+
+
+map = PyOperator("map")
+
+
+def trace_map(proxy_mode, func_overload, f, xs, *args):
+    def _unwrap_proxy(e):
+        if not isinstance(e, (torch.Tensor, torch.SymInt, torch.SymFloat)):
+            return e
+        return get_proxy_slot(e, proxy_mode.tracer, e, lambda e: e.proxy)
+
+
+    if not isinstance(xs, torch.Tensor):
+        raise ValueError("map() must loop over a tensor")
+    if len(xs.shape) == 0 or xs.shape[0] == 0:
+        raise ValueError("map() cannot be traced with scalar tensors or zero dimension tensors")
+    if not all(isinstance(o, (torch.Tensor, torch.nn.Module)) for o in args):
+        raise ValueError("map() operands must be a list of tensors or modules")
+
+    with disable_proxy_modes_tracing():
+        body_graph = make_fx(f)(xs[0], *args)
+
+    next_name = None
+    i = 0
+    while not next_name:
+        candidate = f"body_graph_{i}"
+        if hasattr(proxy_mode.tracer.root, candidate):
+            i += 1
+        else:
+            next_name = candidate
+
+    proxy_mode.tracer.root.register_module(next_name, body_graph)
+    node_args = (body_graph, xs, *args)
+    proxy_args = pytree.tree_map(_unwrap_proxy, node_args)
+    out_proxy = proxy_mode.tracer.create_proxy('call_function', func_overload, proxy_args, {},
+                                               name="map")
+    outs = [body_graph(x, *args) for x in xs]
+    # Implementation notes: we need to use new_empty() + copy_() here instead of stack() directly
+    # because stack([...]) takes a fixed size list which will specialize dynamic shape here.
+    # Meanwhile we want to preserve the looped over dimension as symbolic shape, such that:
+    # ys: Tensor[s0, ...] = map(xs: Tensor[s0, ...], *args)
+    out = xs.new_empty([xs.shape[0], *outs[0].shape])
+    out.copy_(torch.stack(outs))
+    return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer)
+
+
+@map.py_impl(DispatchKey.CPU)
+def map_cpu(f, xs, *args):
+    mode = _get_current_dispatch_mode()
+    assert (mode is None), "Mode should never be enabled for CPU key"
+    return torch.stack([f(x, *args) for x in xs])
+
+
+@map.py_impl(DispatchKey.AutogradCPU)
+def map_autograd(f, xs, *args):
+    # TODO: support autograd
+    flat_operands, _ = tree_flatten([f, xs, args])
+    assert all([not f.requires_grad for f in flat_operands
+                if isinstance(f, torch.Tensor)])
+
+    _ = ExcludeDispatchKeyGuard(DispatchKeySet(DispatchKey.AutogradCPU))
+    return map(f, xs, *args)
+
+
+@map.py_impl(ProxyTorchDispatchMode)
+def map_proxy_torch_dispatch_mode(f, xs, *args):
+    mode = _get_current_dispatch_mode()
+    assert (mode is not None), "Mode should always be enabled for python fallback key"
+    with _pop_mode_temporarily() as mode:
+        res = trace_map(mode, map, f, xs, *args)
+    return res
+
+
+@map.py_impl(FakeTensorMode)
+def map_fake_tensor_mode(f, xs, *args):
+    return torch.stack([f(x, *args) for x in xs])
+
+# We cannot directly call fallthrough here due to issue #89037.
+@map.py_impl(DispatchKey.PythonDispatcher)
+def map_python_dispatcher(*args):
+    _ = ExcludeDispatchKeyGuard(DispatchKeySet(DispatchKey.PythonDispatcher))
+    return map(*args)
+
+
+# TODO(voz) Make this automatic for keys, this is very ugly atm
+map.fallthrough(DispatchKey.PythonTLSSnapshot)
+map.fallthrough(DispatchKey.ADInplaceOrView)
+map.fallthrough(DispatchKey.BackendSelect)
diff --git a/functorch/experimental/cond.py b/functorch/experimental/cond.py
index e620dbadeccbc..bc6f776d073f6 100644
--- a/functorch/experimental/cond.py
+++ b/functorch/experimental/cond.py
@@ -1,19 +1,31 @@
+# TODO(zhxchen17) Expose API through functorhc.experimental.control_flow
+#                 and rename this file to _cond.py.
 import torch
+
+import torch.utils._pytree as pytree
+
 from torch._C import DispatchKey, DispatchKeySet, ExcludeDispatchKeyGuard
 from torch._ops import PyOperator
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.fx.experimental.proxy_tensor import (
+    get_isolated_graphmodule,
+    get_proxy_slot,
+    ProxyTorchDispatchMode,
+    track_tensor_tree,
+)
+from torch.fx.passes.shape_prop import _extract_tensor_metadata
+from torch.utils._python_dispatch import (
+    _get_current_dispatch_mode,
+    _pop_mode_temporarily,
+)
 from torch.utils._pytree import tree_flatten
-from torch.fx.experimental.proxy_tensor import get_isolated_graphmodule, get_proxy_slot
-import torch.utils._pytree as pytree
-from torch.utils._python_dispatch import _get_current_dispatch_mode, _pop_mode_temporarily
-from torch.fx.experimental.proxy_tensor import track_tensor_tree
-from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode
 
 
 """
 We're going to define a `cond` operation.
 In order to do this, we need implementations for each of the dispatch keys.
 """
-cond = PyOperator('cond')
+cond = PyOperator("cond")
 
 
 def trace_cond(proxy_mode, func_overload, pred, true_fn, false_fn, operands):
@@ -115,6 +127,30 @@ def inner(pred, true_fn, false_fn, operands):
     return res
 
 
+@cond.py_impl(FakeTensorMode)
+def cond_fake_tensor_mode(pred, true_fn, false_fn, operands):
+    true_outs = true_fn(*operands)
+    flat_true_outs, _ = pytree.tree_flatten(true_outs)
+    flat_false_outs, _ = pytree.tree_flatten(false_fn(*operands))
+    if len(flat_true_outs) != len(flat_false_outs):
+        raise RuntimeError("Unmatched number of outputs from cond() branches.")
+
+    for true_out, false_out in zip(flat_true_outs, flat_false_outs):
+        true_meta = _extract_tensor_metadata(true_out)
+        false_meta = _extract_tensor_metadata(false_out)
+        if true_meta != false_meta:
+            raise RuntimeError(
+                f"Unmatched tensor metadata from cond() branches.\ntrue branch: {true_meta}, false branch: {false_meta}")
+    return true_outs
+
+
+# We cannot directly call fallthrough here due to issue #89037.
+@cond.py_impl(DispatchKey.PythonDispatcher)
+def cond_python_dispatcher(*args):
+    _ = ExcludeDispatchKeyGuard(DispatchKeySet(DispatchKey.PythonDispatcher))
+    return cond(*args)
+
+
 # TODO(voz): Make this automatic for keys, this is very ugly atm
 cond.fallthrough(DispatchKey.PythonTLSSnapshot)
 cond.fallthrough(DispatchKey.ADInplaceOrView)
diff --git a/functorch/experimental/control_flow.py b/functorch/experimental/control_flow.py
new file mode 100644
index 0000000000000..c46c83fd005d9
--- /dev/null
+++ b/functorch/experimental/control_flow.py
@@ -0,0 +1 @@
+from ._map import map  # noqa: F401
diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py
index 5c3cb2dd72ad4..39e1967d1b278 100644
--- a/test/functorch/test_control_flow.py
+++ b/test/functorch/test_control_flow.py
@@ -1,10 +1,11 @@
 # Owner(s): ["module: functorch"]
 import torch
-
-from torch.testing._internal.common_utils import TestCase, run_tests
 from functorch.experimental.cond import cond
+from functorch.experimental import control_flow
 from torch.fx.experimental.proxy_tensor import make_fx
 
+from torch.testing._internal.common_utils import run_tests, TestCase
+
 class TestControlFlow(TestCase):
     def test_cond_no_trace(self):
         def true_fn(x):
@@ -345,5 +346,122 @@ def f(x, y):
         with self.assertRaises(AssertionError):
             make_fx(f, tracing_mode="fake")(x, torch.tensor(False))
 
+    def check_map_graph(self, gm, key):
+        i = 0
+        for node in gm.graph.nodes:
+            if node.op == "call_function" and node.target == torch.ops.map:
+                i += 1
+                self.assertEqual(
+                    node.meta[key].shape[0], node.args[1].meta[key].shape[0]
+                )
+        self.assertEqual(i, 1)
+
+    def test_map_real(self):
+        def f(x, y):
+            return x + y
+
+        def g(xs, y):
+            return control_flow.map(f, xs, y)
+
+        gm = make_fx(g, tracing_mode="real")(torch.ones(3, 2, 2), torch.ones(2))
+        x = torch.randn(3, 2, 2)
+        y = torch.randn(2)
+        res = gm(x, y)
+        self.assertEqual(res, g(x, y))
+        self.check_map_graph(gm, "tensor_meta")
+
+    def test_map_symbolic(self):
+        def f(x, y):
+            return x + y
+
+        def g(xs, y):
+            return control_flow.map(f, xs, y)
+
+        gm = make_fx(g, tracing_mode="symbolic")(torch.ones(3, 2, 4), torch.ones(4))
+        x = torch.randn(3, 2, 2)
+        y = torch.randn(2)
+        res = gm(x, y)
+        self.assertEqual(res, g(x, y))
+        self.check_map_graph(gm, "val")
+
+    def test_nested_map_cond_real(self):
+        def true_fn(x, y):
+            return x * y
+
+        def false_fn(x, y):
+            return x + y
+
+        def f(x, pred, y):
+            return cond(pred, true_fn, false_fn, [x, y])
+
+        def g(pred, xs, y):
+            return control_flow.map(f, xs, pred, y)
+
+        gm = make_fx(g, tracing_mode="real")(
+            torch.tensor(True), torch.ones(3, 2, 4), torch.ones(4)
+        )
+        pred = torch.tensor(False)
+        x = torch.randn(3, 2, 2)
+        y = torch.randn(2)
+        res = gm(pred, x, y)
+        self.assertEqual(res, g(pred, x, y))
+        self.check_map_graph(gm, "tensor_meta")
+
+    def test_nested_map_cond_symbolic(self):
+        def true_fn(x, y):
+            return x * y
+
+        def false_fn(x, y):
+            return x + y
+
+        def f(x, pred, y):
+            return cond(pred, true_fn, false_fn, [x, y])
+
+        def g(pred, xs, y):
+            return control_flow.map(f, xs, pred, y)
+
+        gm = make_fx(g, tracing_mode="symbolic")(
+            torch.tensor(True), torch.ones(3, 2, 4), torch.ones(4)
+        )
+        pred = torch.tensor(False)
+        x = torch.randn(3, 2, 2)
+        y = torch.randn(2)
+        res = gm(pred, x, y)
+        self.assertEqual(res, g(pred, x, y))
+        self.check_map_graph(gm, "val")
+
+    def test_nested_cond_map_cond_symbolic(self):
+
+        def true_fn(x, y):
+            return x * y
+
+        def false_fn(x, y):
+            return x + y
+
+        def f(x, pred, y):
+            return cond(pred, true_fn, false_fn, [x, y])
+
+        def g(pred, xs, y):
+            return control_flow.map(f, xs, pred, y)
+
+        def main_true_fn(pred, xs, y):
+            return g(pred, xs, y) * 2
+
+        def main_false_fn(pred, xs, y):
+            return g(pred, xs, y) + 1
+
+        def main(p, pred, xs, y):
+            return cond(p, main_true_fn, main_false_fn, [pred, xs, y])
+
+        gm = make_fx(main, tracing_mode="symbolic")(
+            torch.tensor(True), torch.tensor(True), torch.ones(3, 2, 4), torch.ones(4)
+        )
+        p = torch.tensor(False)
+        pred = torch.tensor(False)
+        xs = torch.randn(3, 2, 2)
+        y = torch.randn(2)
+        res = gm(p, pred, xs, y)
+        self.assertEqual(res, main(p, pred, xs, y))
+
 if __name__ == '__main__':
     run_tests()
diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp
index 662ab9981a1d9..381e82e1fcdbc 100644
--- a/torch/csrc/utils/python_dispatch.cpp
+++ b/torch/csrc/utils/python_dispatch.cpp
@@ -479,14 +479,23 @@ void initDispatchBindings(PyObject* module) {
 
 #define DEF_ONE(n) .value(#n, c10::DispatchKey::n)
 
-  py::enum_<c10::DispatchKey>(m, "DispatchKey") DEF_ONE(Undefined) DEF_ONE(
-      CompositeExplicitAutogradNonFunctional) DEF_ONE(CompositeExplicitAutograd)
+  py::enum_<c10::DispatchKey>(m, "DispatchKey")
+      // clang-format off
+      DEF_ONE(Undefined)
+      DEF_ONE(CompositeExplicitAutogradNonFunctional)
+      DEF_ONE(CompositeExplicitAutograd)
       DEF_ONE(CompositeImplicitAutogradNestedTensor)
-          DEF_ONE(CompositeImplicitAutograd) DEF_ONE(AutogradOther)
-              DEF_ONE(Autograd) DEF_ONE(BackendSelect) DEF_ONE(ADInplaceOrView)
-                  DEF_ONE(PythonTLSSnapshot) DEF_ONE(Python)
-                      DEF_ONE(FuncTorchDynamicLayerFrontMode)
-                          DEF_ONE(FuncTorchDynamicLayerBackMode)
+      DEF_ONE(CompositeImplicitAutograd)
+      DEF_ONE(AutogradOther)
+      DEF_ONE(Autograd)
+      DEF_ONE(BackendSelect)
+      DEF_ONE(ADInplaceOrView)
+      DEF_ONE(PythonTLSSnapshot)
+      DEF_ONE(Python)
+      DEF_ONE(FuncTorchDynamicLayerFrontMode)
+      DEF_ONE(FuncTorchDynamicLayerBackMode)
+      DEF_ONE(PythonDispatcher)
+  // clang-format on
 
 #define DEF_SINGLE(n, prefix) .value(#prefix #n, c10::DispatchKey::prefix##n)
 #define DEF_MULTIPLE(fullname, prefix)              \
@@ -495,11 +504,13 @@ void initDispatchBindings(PyObject* module) {
   C10_FORALL_BACKEND_COMPONENTS(DEF_SINGLE, prefix) \
   DEF_SINGLE(, EndOf##fullname##Backends)
 
-                              C10_FORALL_FUNCTIONALITY_KEYS(DEF_MULTIPLE)
+      // clang-format off
+  C10_FORALL_FUNCTIONALITY_KEYS(DEF_MULTIPLE)
+  // clang-format on
 
 #undef DEF_MULTIPLE
 #undef DEF_SINGLE
-                                  ;
+          ;
 
   py::class_<c10::DispatchKeySet>(m, "DispatchKeySet")
       .def(py::init<c10::DispatchKey>())

From 6be90629bc2bedf728a41e62c16116c565885d20 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@meta.com>
Date: Sat, 19 Nov 2022 00:21:11 +0000
Subject: [PATCH 1094/1922] [small] Update error message (#89294)

Summary:
`RuntimeError: Invalid function argument. Expected parameter "tensor_list" to be of type List[torch.Tensor].`

to

`RuntimeError: Invalid function argument. Expected parameter "input_tensor_list" to be of type List[torch.Tensor].`

Test Plan: sandcastle

Differential Revision: D41405238

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89294
Approved by: https://github.com/awgu
---
 torch/distributed/distributed_c10d.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index f46aaaef94ef4..4d343bcffec38 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -2517,7 +2517,7 @@ def all_gather_coalesced(
     if _rank_not_in_group(group):
         _warn_not_in_group("all_gather_coalesced")
         return
-    _check_tensor_list(input_tensor_list, "tensor_list")
+    _check_tensor_list(input_tensor_list, "input_tensor_list")
     _ensure_all_tensors_same_dtype(input_tensor_list)
     if not isinstance(output_tensor_lists, list):
         raise RuntimeError(

From 3ecb34aee36eeaaa45b9402f6ff2dc6ac368cc40 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Sat, 19 Nov 2022 00:22:43 +0000
Subject: [PATCH 1095/1922] =?UTF-8?q?[dashboard][huggingface]=20skip=20acc?=
 =?UTF-8?q?uracy=20checks=20for=20really=20large=20models=E2=80=A6=20(#892?=
 =?UTF-8?q?73)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89273
Approved by: https://github.com/desertfire
---
 benchmarks/dynamo/huggingface.py | 17 ++++++++++++++++-
 benchmarks/dynamo/torchbench.py  |  1 +
 torch/_dynamo/testing.py         |  2 +-
 torch/_dynamo/utils.py           |  4 +++-
 4 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dynamo/huggingface.py b/benchmarks/dynamo/huggingface.py
index 489fcd69df944..bf127deaa43ab 100755
--- a/benchmarks/dynamo/huggingface.py
+++ b/benchmarks/dynamo/huggingface.py
@@ -89,6 +89,8 @@ def pip_install(package):
 
 
 SKIP = {
+    # Difficult to setup accuracy test because .eval() not supported
+    "Reformer",
     # Fails deepcopy
     "BlenderbotForConditionalGeneration",
     "GPTNeoForCausalLM",
@@ -124,7 +126,7 @@ def pip_install(package):
     "GPT2ForSequenceClassification": 2,
     # "GPTJForCausalLM" : 2,
     # "GPTJForQuestionAnswering" : 2,
-    # "GPTNeoForCausalLM" : 2,
+    # "GPTNeoForCausalLM" : 32,
     # "GPTNeoForSequenceClassification" : 2,
     "GoogleFnet": 2,
     "LayoutLMForMaskedLM": 2,
@@ -153,6 +155,13 @@ def pip_install(package):
     "YituTechConvBert": 2,
 }
 
+SKIP_ACCURACY_CHECK_MODELS = {
+    # Models too large to have eager, dynamo and fp64_numbers simultaneosuly
+    # even for 40 GB machine.
+    "DebertaV2ForMaskedLM",
+    "BlenderbotForCausalLM",
+}
+
 
 def get_module_cls_by_model_name(model_cls_name):
     _module_by_model_name = {
@@ -445,6 +454,12 @@ def iter_model_names(self, args):
                 continue
             yield model_name
 
+    @property
+    def skip_accuracy_checks_large_models_dashboard(self):
+        if self.args.dashboard or self.args.accuracy:
+            return SKIP_ACCURACY_CHECK_MODELS
+        return set()
+
     def pick_grad(self, name, is_training):
         if is_training:
             return torch.enable_grad()
diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py
index cec284ebcd8ca..b7d4a3be7933d 100755
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@@ -50,6 +50,7 @@ def setup_torchbench_cwd():
 # size to test the accuracy.
 USE_SMALL_BATCH_SIZE = {
     "demucs": 4,
+    "dlrm": 1024,
     "densenet121": 4,
     "hf_Reformer": 4,
     "timm_efficientdet": 1,
diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py
index eea4c26a171ca..55186931988ba 100644
--- a/torch/_dynamo/testing.py
+++ b/torch/_dynamo/testing.py
@@ -236,7 +236,7 @@ def rand_strided(size, stride, dtype=torch.float32, device="cpu"):
     if dtype.is_floating_point:
         buffer = torch.randn(needed_size, dtype=dtype, device=device)
     else:
-        buffer = torch.ones(size=[needed_size], dtype=dtype, device=device)
+        buffer = torch.zeros(size=[needed_size], dtype=dtype, device=device)
     return torch.as_strided(buffer, size, stride)
 
 
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index f426ef6913079..e4b92a73aacfc 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -816,7 +816,9 @@ def same(
             res = res.to_dense()
         assert isinstance(res, torch.Tensor), f"type mismatch {type(ref)} {type(res)}"
         if exact_dtype:
-            assert ref.dtype == res.dtype, f"dtype mismatch {ref.dtype}, {res.dtype}"
+            if ref.dtype != res.dtype:
+                log.error(f"dtype mismatch {ref.dtype}, {res.dtype}")
+                return False
             if ref.dtype == torch.bool:
                 # triton stores bool as int8, so add this for more accurate checking
                 return torch.allclose(

From 40d2c05ed6b4470436085ec8da11abe399bac37b Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Sat, 19 Nov 2022 01:13:08 +0000
Subject: [PATCH 1096/1922] Move bazel to c++17 (#89297)

Splitting out various smaller pieces from https://github.com/pytorch/pytorch/pull/85969
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89297
Approved by: https://github.com/huydhn
---
 .bazelrc               | 2 +-
 third_party/gloo.BUILD | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index ce8406b58aaab..f8ff2215f2d6b 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -1,4 +1,4 @@
-build --cxxopt=--std=c++14
+build --cxxopt=--std=c++17
 build --copt=-I.
 # Bazel does not support including its cc_library targets as system
 # headers. We work around this for generated code
diff --git a/third_party/gloo.BUILD b/third_party/gloo.BUILD
index 3f623e54e6ad4..e9deaa13fc63f 100644
--- a/third_party/gloo.BUILD
+++ b/third_party/gloo.BUILD
@@ -75,8 +75,7 @@ cc_library(
         ]
     ) + if_cuda(glob(["gloo/cuda*.cc"])),
     copts = [
-        "-std=gnu++11",
-        "-std=c++11",
+        "-std=c++17",
     ],
     visibility = ["//visibility:public"],
     deps = [":gloo_headers"] + if_cuda(

From 92f577b152eb0b2b74f91724da8bfd9ef16e2661 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Sat, 19 Nov 2022 01:47:45 +0000
Subject: [PATCH 1097/1922] [dynamo] mutable local caching to make dynamo
 faster at tracing mutation (#89170)

Make mutation faster to speed up tracing optimizers, helps with https://github.com/pytorch/torchdynamo/issues/1803

`replace_all` no longer iterates over the entire variable tracker data structure  every time a mutation is performed

Each variable tracker internally keeps a set of contained mutable variable trackers, to provide a hint to `replace_all`. This is populated with a call to `apply` from `__post_init__` in the base `VariableTracker`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89170
Approved by: https://github.com/jansel
---
 torch/_dynamo/side_effects.py     |  6 +--
 torch/_dynamo/symbolic_convert.py | 13 +++++--
 torch/_dynamo/variables/base.py   | 61 +++++++++++++++++++++++++------
 torch/_dynamo/variables/dicts.py  | 41 +++++++++++++++++----
 torch/_dynamo/variables/lists.py  | 20 +++++++---
 torch/_dynamo/variables/misc.py   | 10 +++--
 torch/_dynamo/variables/torch.py  |  2 +-
 7 files changed, 119 insertions(+), 34 deletions(-)

diff --git a/torch/_dynamo/side_effects.py b/torch/_dynamo/side_effects.py
index 1f8675ae1c9e3..55e6e9f927e8d 100644
--- a/torch/_dynamo/side_effects.py
+++ b/torch/_dynamo/side_effects.py
@@ -82,16 +82,16 @@ def clone(self):
             keepalive=list(self.keepalive),
         )
 
-    def apply(self, fn, cache=None):
+    def apply(self, fn, cache=None, skip_fn=lambda _: False):
         if cache is None:
             cache = dict()
 
         self.id_to_variable = collections.OrderedDict(
-            (k, VariableTracker.apply(fn, v, cache))
+            (k, VariableTracker.apply(fn, v, cache, skip_fn))
             for k, v in self.id_to_variable.items()
         )
         self.store_attr_mutations = collections.OrderedDict(
-            (k, VariableTracker.apply(fn, v, cache))
+            (k, VariableTracker.apply(fn, v, cache, skip_fn))
             for k, v in self.store_attr_mutations.items()
         )
 
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index d2bc5332719c5..7a16b6b982a03 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -400,11 +400,18 @@ def repl(v: VariableTracker):
                 return newvar
             return v
 
+        def skip(v: VariableTracker):
+            return oldvar.mutable_local not in v.recursively_contains
+
         cache = dict()
-        self.output.side_effects.apply(repl, cache)
-        self.stack = [VariableTracker.apply(repl, x, cache) for x in self.stack]
+        self.output.side_effects.apply(repl, cache, skip_fn=skip)
+        self.stack = [
+            VariableTracker.apply(repl, x, cache, skip_fn=skip) for x in self.stack
+        ]
         for k, x in self.symbolic_locals.items():
-            self.symbolic_locals[k] = VariableTracker.apply(repl, x, cache)
+            self.symbolic_locals[k] = VariableTracker.apply(
+                repl, x, cache, skip_fn=skip
+            )
 
     def replace_all(self, oldvar: VariableTracker, newvar: VariableTracker):
         if isinstance(oldvar.mutable_local, side_effects.MutableSideEffects):
diff --git a/torch/_dynamo/variables/base.py b/torch/_dynamo/variables/base.py
index 62cddfff0cb29..4c5aee344061e 100644
--- a/torch/_dynamo/variables/base.py
+++ b/torch/_dynamo/variables/base.py
@@ -21,7 +21,15 @@ def __eq__(self, other):
         return self is other
 
 
-class VariableTracker:
+# metaclass to call post_init
+class HasPostInit(type):
+    def __call__(cls, *args, **kwargs):
+        obj = type.__call__(cls, *args, **kwargs)
+        obj.__post_init__(*args, **kwargs)
+        return obj
+
+
+class VariableTracker(object, metaclass=HasPostInit):
     """
     Base class for tracked locals and stack values
 
@@ -70,7 +78,11 @@ def copy(cls, value):
 
     @classmethod
     def apply(
-        cls, fn: Callable[["VariableTracker"], "VariableTracker"], value, cache=None
+        cls,
+        fn: Callable[["VariableTracker"], "VariableTracker"],
+        value,
+        cache=None,
+        skip_fn=lambda _: False,  # Whether we should skip applying to this var
     ):
         """
         Walk this object and call fn on all the VariableTracker
@@ -84,21 +96,29 @@ def apply(
             return cache[idx][0]
 
         if isinstance(value, VariableTracker):
-            updated_dict = dict(value.__dict__)
-            for key in updated_dict.keys():
-                if key not in value._nonvar_fields:
-                    updated_dict[key] = cls.apply(fn, updated_dict[key], cache)
-            result = fn(value.clone(**updated_dict))
+            if not skip_fn(value):
+                updated_dict = dict(value.__dict__)
+                for key in updated_dict.keys():
+                    if key not in value._nonvar_fields:
+                        updated_dict[key] = cls.apply(
+                            fn, updated_dict[key], cache, skip_fn
+                        )
+                result = fn(value.clone(**updated_dict))
+            else:
+                result = fn(value)
+
         elif istype(value, list):
-            result = [cls.apply(fn, v, cache) for v in value]
+            result = [cls.apply(fn, v, cache, skip_fn) for v in value]
         elif istype(value, tuple):
-            result = tuple(cls.apply(fn, v, cache) for v in value)
+            result = tuple(cls.apply(fn, v, cache, skip_fn) for v in value)
         elif istype(value, collections.OrderedDict):
             result = collections.OrderedDict(
-                cls.apply(fn, v, cache) for v in value.items()
+                cls.apply(fn, v, cache, skip_fn) for v in value.items()
             )
         elif istype(value, dict):
-            result = {k: cls.apply(fn, v, cache) for k, v in list(value.items())}
+            result = {
+                k: cls.apply(fn, v, cache, skip_fn) for k, v in list(value.items())
+            }
         else:
             result = value
 
@@ -244,11 +264,30 @@ def __init__(
         guards: Optional[Set] = None,
         source: Source = None,
         mutable_local: MutableLocal = None,
+        recursively_contains: Optional[Set] = None,
     ):
         super(VariableTracker, self).__init__()
         self.guards = guards or set()
         self.source = source
         self.mutable_local = mutable_local
+        self.recursively_contains = (
+            recursively_contains  # provides hint to replace_all when replacing vars
+        )
+
+    def __post_init__(self, *args, **kwargs):
+        if self.recursively_contains is None:
+            self.recursively_contains = set()
+
+            def aggregate_mutables(var):
+                self.recursively_contains.update(var.recursively_contains)
+                if var.mutable_local is not None:
+                    self.recursively_contains.add(var.mutable_local)
+
+                return var
+
+            VariableTracker.apply(
+                aggregate_mutables, self, skip_fn=lambda var: var is not self
+            )
 
 
 def typestr(*objs):
diff --git a/torch/_dynamo/variables/dicts.py b/torch/_dynamo/variables/dicts.py
index 30df18f6d6e92..f28efc713db4a 100644
--- a/torch/_dynamo/variables/dicts.py
+++ b/torch/_dynamo/variables/dicts.py
@@ -16,8 +16,10 @@
 
 
 class ConstDictVariable(VariableTracker):
-    def __init__(self, items, user_cls, **kwargs):
-        super(ConstDictVariable, self).__init__(**kwargs)
+    def __init__(self, items, user_cls, recursively_contains=None, **kwargs):
+        super(ConstDictVariable, self).__init__(
+            recursively_contains=recursively_contains, **kwargs
+        )
         self.items = items
         self.user_cls = user_cls
 
@@ -112,7 +114,17 @@ def call_method(
                 tx.store_dict_key(global_key_name(k), k)
             newval = collections.OrderedDict(val)
             newval[k] = args[1]
-            return tx.replace_all(self, self.modifed(newval, **options))
+
+            new_rec_contains = self.recursively_contains.union(
+                args[1].recursively_contains
+            )
+            if args[1].mutable_local is not None:
+                new_rec_contains.add(args[1].mutable_local)
+
+            return tx.replace_all(
+                self,
+                self.modifed(newval, new_rec_contains, **options),
+            )
         elif (
             name in ("pop", "get")
             and args
@@ -130,7 +142,7 @@ def call_method(
         ):
             newval = collections.OrderedDict(val)
             result = newval.pop(ConstDictVariable.get_key(args[0]))
-            tx.replace_all(self, self.modifed(newval, **options))
+            tx.replace_all(self, self.modifed(newval, None, **options))
             return result.add_options(options)
         elif (
             name == "update"
@@ -140,7 +152,12 @@ def call_method(
         ):
             newval = collections.OrderedDict(val)
             newval.update(args[0].items)
-            result = self.modifed(newval, **options)
+            new_rec_contains = self.recursively_contains.union(
+                args[0].recursively_contains
+            )
+            result = self.modifed(
+                newval, recursively_contains=new_rec_contains, **options
+            )
             return tx.replace_all(self, result)
         elif (
             name in ("get", "__getattr__")
@@ -159,9 +176,11 @@ def call_method(
         else:
             return super().call_method(tx, name, args, kwargs)
 
-    def modifed(self, items, **options):
+    def modifed(self, items, recursively_contains, **options):
         """a copy of self with different items"""
-        return self.clone(items=items, **options)
+        return self.clone(
+            items=items, recursively_contains=recursively_contains, **options
+        )
 
     def unpack_var_sequence(self, tx):
         options = VariableTracker.propagate([self])
@@ -237,7 +256,13 @@ def call_method(
                             f"defaultdict with default_factory = {self.default_factory}"
                         )
                     new_val[k] = default_var
-                    tx.replace_all(self, self.modifed(new_val, **options))
+                    new_rec_contains = self.recursively_contains.union(
+                        default_var.recursively_contains
+                    )
+                    new_rec_contains.add(default_var.mutable_local)
+                    tx.replace_all(
+                        self, self.modifed(new_val, new_rec_contains, **options)
+                    )
                     return default_var
         else:
             return super().call_method(tx, name, args, kwargs)
diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py
index 70c6da07adb5a..553c9ca1e664d 100644
--- a/torch/_dynamo/variables/lists.py
+++ b/torch/_dynamo/variables/lists.py
@@ -23,8 +23,12 @@ def cls_for(obj):
             tuple: TupleVariable,
         }[obj]
 
-    def __init__(self, items: List[VariableTracker], **kwargs):
-        super(BaseListVariable, self).__init__(**kwargs)
+    def __init__(
+        self, items: List[VariableTracker], recursively_contains=None, **kwargs
+    ):
+        super(BaseListVariable, self).__init__(
+            recursively_contains=recursively_contains, **kwargs
+        )
         assert isinstance(items, list)
         assert all(isinstance(x, VariableTracker) for x in items)
         self.items: List[VariableTracker] = items
@@ -145,9 +149,13 @@ def call_method(
         if name == "append" and self.mutable_local:
             assert not kwargs
             (arg,) = args
+            new_rec_contains = self.recursively_contains.union(arg.recursively_contains)
+            new_rec_contains.add(arg.mutable_local)
             tx.replace_all(
                 self,
-                ListVariable(self.items + [arg], **options),
+                ListVariable(
+                    self.items + [arg], recursively_contains=new_rec_contains, **options
+                ),
             )
             return ConstantVariable(None)
         elif (
@@ -454,8 +462,10 @@ def var_getattr(self, tx, name):
 
 
 class ListIteratorVariable(VariableTracker):
-    def __init__(self, items, index: int = 0, **kwargs):
-        super(ListIteratorVariable, self).__init__(**kwargs)
+    def __init__(self, items, index: int = 0, recursively_contains=None, **kwargs):
+        super(ListIteratorVariable, self).__init__(
+            recursively_contains=recursively_contains, **kwargs
+        )
         assert isinstance(items, list)
         # Removing this check as it slows things down too much
         # https://github.com/pytorch/pytorch/pull/87533#issuecomment-1287574492
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index 952cbd2c64244..f8975f70fcfb3 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -107,6 +107,10 @@ def __init__(self, target_values, initial_values=None, **kwargs):
         super(ContextWrappingVariable, self).__init__(**kwargs)
         self.target_values = target_values
         self.initial_values = initial_values
+        self.recursively_contains = (
+            set()
+        )  # This var doesn't contain any child vars and doesn't support clone() properly,
+        # so don't populate this automatically
 
     def enter(self, tx):
         self._call_func(tx, self.target_values)
@@ -294,7 +298,7 @@ def fn_name(self):
 
 class AutocastModeVariable(ContextWrappingVariable):
     @staticmethod
-    def create(tx, target_values, kwargs):
+    def create(target_values, kwargs):
         values = target_values
         # device_type : str,
         # dtype : Optional[_dtype] = None,
@@ -322,10 +326,10 @@ def create(tx, target_values, kwargs):
         else:
             values.append(variables.ConstantVariable(None))
 
-        var = AutocastModeVariable(tx, target_values, initial_values=None, **kwargs)
+        var = AutocastModeVariable(target_values, initial_values=None, **kwargs)
         return var
 
-    def __init__(self, tx, target_values, initial_values=None, **kwargs):
+    def __init__(self, target_values, initial_values=None, **kwargs):
         super(AutocastModeVariable, self).__init__(
             target_values=target_values, initial_values=initial_values, **kwargs
         )
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 651f80b5d77d4..4c4681b75622b 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -293,7 +293,7 @@ def call_function(
                 tensor_with_tf_override.subclass_type,
             )
         elif self.value is torch.amp.autocast_mode.autocast:
-            return AutocastModeVariable.create(tx, target_values=args, kwargs=kwargs)
+            return AutocastModeVariable.create(target_values=args, kwargs=kwargs)
         elif self.value in (
             torch.profiler.profile,
             torch.profiler.record_function,

From 3573252c6f1ea54558541eb1e1d65db492d46c8b Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Sat, 19 Nov 2022 02:06:24 +0000
Subject: [PATCH 1098/1922] Update sdp dispatch logic to enable fused backward
 (#89154)

# Summary
Reorganizes how the sdp dispatch logic is down in order to enable backwards for fused kernels

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89154
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml    |  52 ++---
 .../cuda/NestedTensorTransformerFunctions.cpp | 100 ++++++---
 .../ATen/native/transformers/attention.cpp    |  65 ++++--
 .../native/transformers/cuda/attention.cu     |  46 ++---
 .../transformers/cuda/attention_backward.cu   |  40 +++-
 .../transformers/cuda/flash_attn/fmha_api.cpp |   7 +-
 .../transformers/cuda/flash_attn/fmha_api.h   |   2 +-
 .../ATen/native/transformers/cuda/sdp_utils.h |  34 +++-
 benchmarks/transformer/sdp_backwards.py       | 189 ++++++++++++++++++
 .../check_forward_backward_compatibility.py   |   3 +
 test/functorch/test_ops.py                    |   8 +-
 test/test_meta.py                             |   1 -
 test/test_transformers.py                     |  74 +++++--
 tools/autograd/derivatives.yaml               |   6 +-
 .../_internal/common_methods_invocations.py   |   5 +
 15 files changed, 497 insertions(+), 135 deletions(-)
 create mode 100644 benchmarks/transformer/sdp_backwards.py

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index f625c9faff412..8c759cd09c486 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -13252,18 +13252,39 @@
     CPU, NestedTensorCPU, Meta: _fused_sdp_choice_cpp
     CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
 
-# Register the math kernel for cpu
-- func: _scaled_dot_product_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
+- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
   variants: function
+
+- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool return_softmax=False, bool is_causal=False) -> (Tensor, Tensor, Tensor)
   dispatch:
-    CUDA: _scaled_dot_product_attention_forward_cuda
-    CPU: _scaled_dot_product_attention_forward_math
-    NestedTensorCUDA: _scaled_dot_product_attention_forward_nested
-    NestedTensorCPU: _scaled_dot_product_attention_forward_math
-    Meta: _scaled_dot_product_attention_forward_math
+    CUDA: _scaled_dot_product_flash_attention_cuda
+    NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
 
-- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
+- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
+  dispatch:
+    CUDA: _scaled_dot_product_efficient_attention_cuda
+    NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
+
+- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: _scaled_dot_product_efficient_attention_backward_cuda
+
+# Returns ouput, softmax_logsumexp, softmax
+- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, bool return_softmax, float dropout_p, bool is_causal) -> (Tensor, Tensor, Tensor)
   variants: function
+  dispatch:
+    CUDA: _flash_attention_forward
+
+# Returns ouput, logsumexp if compute_logsumexp
+- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_forward
+
+- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_backward
 
 - func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
   variants: function
@@ -13290,21 +13311,6 @@
   structured: True
   variants: function
 
-- func: _flash_scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal) -> Tensor
-  variants: function
-  dispatch:
-    CUDA: flash_scaled_dot_product_attention
-
-- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
-  variants: function
-  dispatch:
-    CUDA: _efficient_attention_forward
-
-- func: _efficient_attention_backward(Tensor grad, Tensor query, Tensor key, Tensor value, Tensor logsumexp, Tensor out, bool is_causal=False) -> (Tensor, Tensor, Tensor)
-  variants: function
-  dispatch:
-    CUDA: _efficient_attention_backward
-
 - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
   variants: function
   dispatch:
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index c2bf4e08ce042..9c72454560d38 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -214,26 +214,6 @@ Tensor NestedTensor_to_padded_tensor_cuda(
   return NestedTensor_to_padded_tensor_generic(t, padding, output_size);
 }
 
-std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_nested(
-        const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
-
-    // Determine which efficient kernel to use
-    sdp::sdp_params kernel_params{query_, key, value, attn_mask_.has_value(), dropout_p, need_attn_weights, is_causal};
-    auto backend = select_sdp_backend(kernel_params);
-    switch(backend){
-      case sdp::SDPBackend::flash_attention:
-          // TODO: enable flash attention kernel
-          return mem_efficient_helper_nested_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
-      case sdp::SDPBackend::efficient_attention:
-          return mem_efficient_helper_nested_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
-      case sdp::SDPBackend::math:
-        return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-      default:
-        TORCH_CHECK(false, "Unsupported backend for scaled_dot_product_attention");
-        return std::make_tuple(Tensor(), Tensor());
-    }
-}
 namespace{
 
 /**
@@ -340,19 +320,80 @@ bool is_safe_to_get_storage_as_tensor(const NestedTensorImpl* tensor) {
 }
 
 } // namespace
-std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
+
+std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_nestedtensor_cuda(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
     double dropout_p,
-    bool need_atten_weights,
+    bool return_softmax,
     bool is_causal) {
+  TORCH_CHECK(false, "There are currently cuda memory errors being returned from this path.")
   // Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
   // Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
   // Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
   const int64_t num_heads = query.size(1);
   const int64_t head_dim = query.size(3);
 
+  // Query -> Query (Batch x {Q_seq_len}  x Num_heads x Dim_per_head)
+  // Key   -> Key   (Batch x {KV_seq_len} x Num_heads x Dim_per_head)
+  // Value -> Value (Batch x {KV_seq_len} x Num_heads x Dim_per_head)
+  Tensor q_t = query.transpose(1, 2).contiguous();
+  Tensor k_t = key.transpose(1, 2).contiguous();
+  Tensor v_t = value.transpose(1, 2).contiguous();
+
+  // K and V have to have the same Nnz, should probably torch_check
+  // assume in order to not iterate over v
+
+  auto cumulative_and_max_q = cumulative_and_max_seq_len(q_t);
+  auto cumulative_and_max_k = cumulative_and_max_seq_len(k_t);
+
+  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q);
+  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k);
+
+  const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q);
+  const int64_t max_seqlen_batch_k = std::get<1>(cumulative_and_max_k);
+
+  const int64_t Nnz_q  = cumulative_sequence_length_q[-1].item<int64_t>();
+  const int64_t Nnz_kv = cumulative_sequence_length_k[-1].item<int64_t>();
+
+  auto query_buffer_reshaped =
+      get_buffer(q_t).view({Nnz_q, num_heads, head_dim});
+  auto key_buffer_reshaped =
+      get_buffer(k_t).view({Nnz_kv, num_heads, head_dim});
+  auto value_buffer_reshaped =
+      get_buffer(v_t).view({Nnz_kv, num_heads, head_dim});
+
+  auto attention_and_lse_and_softmax =
+  at::_flash_attention_forward(
+      query_buffer_reshaped,
+      key_buffer_reshaped,
+      value_buffer_reshaped,
+      cumulative_sequence_length_q,
+      cumulative_sequence_length_k,
+      max_seqlen_batch_q,
+      max_seqlen_batch_k,
+      return_softmax,
+      dropout_p,
+      is_causal);
+  // Reshape output to convert nnz to batch_size and seq_len
+  Tensor attention = std::get<0>(attention_and_lse_and_softmax);
+  attention = wrap_buffer(attention.view(-1), get_nested_size_tensor(q_t).clone()).transpose(1,2);
+  return std::tie(attention, std::get<1>(attention_and_lse_and_softmax), std::get<2>(attention_and_lse_and_softmax));
+}
+
+std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_nestedtensor_cuda(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    bool compute_log_sumexp,
+    bool is_causal) {
+   // Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
+  // Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
+  // Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
+  const int64_t num_heads = query.size(1);
+  const int64_t head_dim = query.size(3);
+
   Tensor q_t = query.transpose(1, 2);
   Tensor k_t = key.transpose(1, 2);
   Tensor v_t = value.transpose(1, 2);
@@ -432,7 +473,7 @@ std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
       {Nnz_kv, num_heads, head_dim},
       {nnz_v_stride, head_v_stride, head_dim_stride},
       value_impl->get_storage_offsets()[0]);
-  std::tuple<Tensor, Tensor> attention_and_weights =
+  std::tuple<Tensor, Tensor> attention_and_logsumexp=
       at::_efficient_attention_forward(
           query_buffer_reshaped.unsqueeze(0),
           key_buffer_reshaped.unsqueeze(0),
@@ -440,14 +481,14 @@ std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
           cumulative_sequence_length_q,
           cumulative_sequence_length_k,
           max_seqlen_batch_q,
-          false,
-          false);
+          compute_log_sumexp,
+          is_causal);
   // Reshape output to convert nnz to batch_size and seq_len
-  Tensor attention = std::get<0>(attention_and_weights);
+  Tensor attention = std::get<0>(attention_and_logsumexp);
   attention =
       wrap_buffer(attention.view(-1), get_nested_size_tensor(q_t).clone())
           .transpose(1, 2);
-  return std::tie(attention, std::get<1>(attention_and_weights));
+  return std::tie(attention, std::get<1>(attention_and_logsumexp));
 }
 
 Tensor flash_attention_helper(
@@ -492,7 +533,7 @@ Tensor flash_attention_helper(
   // If we are passing in query, key, value all the same tensors then we have
   // packed them into one tensor and need to slice for flash attention
   Tensor attention =
-      at::_flash_scaled_dot_product_attention(
+      std::get<0>(at::_flash_attention_forward(
           q,
           k,
           v,
@@ -500,8 +541,9 @@ Tensor flash_attention_helper(
           cumulative_sequence_length_q,
           max_seqlen_batch_q,
           max_seqlen_batch_q,
+          false /*return_softmax*/,
           dropout_p,
-          is_causal);
+          is_causal));
   // Output of flash_attention is a regular tensor lets wrap it back up to
   // form a nested tensor
 
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 89a0e4691018c..9c5be12ef24db 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -678,20 +678,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> native_decoder_only_multi_head_attent
 //     L: Target sequence length
 //     E: Embedding dimension
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention(
-        const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
-        if (query_.requires_grad() || key.requires_grad() || value.requires_grad()){
-          return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-        }
-        return at::_scaled_dot_product_attention_forward(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-}
-
-int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
-  return static_cast<int64_t>(sdp::SDPBackend::math);
-}
-
-std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_math(
     const Tensor& query_,
     const Tensor& key,
     const Tensor& value,
@@ -699,14 +685,49 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_math(
     double dropout_p,
     bool need_attn_weights,
     bool is_causal) {
-  return at::_scaled_dot_product_attention_math(
-      query_,
-      key,
-      value,
-      attn_mask_,
-      dropout_p,
-      need_attn_weights,
-      is_causal);
+  // TODO: The second return is the attention weights if the math kernel is
+  // used. The fused kernels do not return this Tensor so for the fused kernels
+  // The second return SHOULD always be an empty Tensor, unless need_attn_weights
+  // is true (in which case the fused kernels would not be called). This blows up
+  // op_info tests.
+  int64_t choice_int = at::_fused_sdp_choice(
+      query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
+  sdp::SDPBackend backend = static_cast<sdp::SDPBackend>(choice_int);
+  switch (backend) {
+    case sdp::SDPBackend::flash_attention: {
+      auto out_lse_softmax = at::_scaled_dot_product_flash_attention(
+          query_, key, value, dropout_p, need_attn_weights, is_causal);
+      return std::make_tuple(
+          std::move(std::get<0>(out_lse_softmax)),
+          std::move(std::get<2>(out_lse_softmax)));
+    }
+    case sdp::SDPBackend::efficient_attention: {
+      bool compute_logsumexp =
+          (query_.requires_grad() || key.requires_grad() ||
+           value.requires_grad());
+      return at::_scaled_dot_product_efficient_attention(
+          query_, key, value, compute_logsumexp, is_causal);
+    }
+    case sdp::SDPBackend::math:
+      return at::_scaled_dot_product_attention_math(
+          query_,
+          key,
+          value,
+          attn_mask_,
+          dropout_p,
+          need_attn_weights,
+          is_causal);
+    default:
+      TORCH_CHECK(
+          false,
+          "No viable backend for scaled_dot_product_attention was found.");
+      return std::make_tuple(Tensor(), Tensor());
+  }
+}
+
+int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value,
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
+  return static_cast<int64_t>(sdp::SDPBackend::math);
 }
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index 602cf319f74a6..8dcb99b3380d9 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -678,12 +678,12 @@ std::tuple<Tensor, Tensor> native_multi_head_attention_cuda(
   return std::make_tuple(std::move(proj), std::move(qkt));
 }
 
-std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
+std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_cuda(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
     double dropout_p,
-    bool need_atten_weights,
+    bool return_softmax,
     bool is_causal) {
   // Query (Batch x Num_heads x Q_seq_len  x Dim_per_head)
   // Key   (Batch x Num_heads x KV_seq_len x Dim_per_head)
@@ -726,8 +726,9 @@ std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
   Tensor key_reshaped = k_t.reshape({Nnz_kv, num_heads, head_dim});
   Tensor value_reshaped = v_t.reshape({Nnz_kv, num_heads, head_dim});
 
-  Tensor attention =
-      at::_flash_scaled_dot_product_attention(
+  Tensor attention, log_sumexp, softmax;
+  std::tie(attention, log_sumexp, softmax) =
+      at::_flash_attention_forward(
           query_reshaped,
           key_reshaped,
           value_reshaped,
@@ -735,15 +736,17 @@ std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
           cumulative_sequence_length_k,
           max_seqlen_batch_q,
           max_seqlen_batch_k,
+          return_softmax,
           dropout_p,
           is_causal);
   // Reshape output to convert nnz to batch_size and seq_len
   attention =
       attention.view({batch_size, max_seqlen_batch_q, num_heads, head_dim}).transpose(1,2);
 
-  return std::tuple<Tensor, Tensor>(attention, Tensor());
+  return std::make_tuple(attention, log_sumexp, softmax);
 }
-std::tuple<Tensor, Tensor> mem_eff_helper(
+
+std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
@@ -767,26 +770,7 @@ std::tuple<Tensor, Tensor> mem_eff_helper(
       compute_log_sumexp,
       is_causal);
   attention = attention.transpose(1,2);
-  return std::make_tuple(std::move(attention), Tensor());
-}
-
-std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
-        const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
-    // Determine which efficient kernel to use
-    sdp::sdp_params kernel_params{query_, key, value, attn_mask_.has_value(), dropout_p, need_attn_weights, is_causal};
-    auto backend = select_sdp_backend(kernel_params);
-    switch(backend){
-      case sdp::SDPBackend::flash_attention:
-          return flash_attention_helper_dense_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
-      case sdp::SDPBackend::efficient_attention:
-          return mem_eff_helper(query_, key , value, need_attn_weights, is_causal);
-      case sdp::SDPBackend::math:
-        return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-      default:
-        TORCH_CHECK(false, "No viable backend for scaled_dot_product_attention was found.");
-        return std::make_tuple(Tensor(), Tensor());
-    }
+  return std::make_tuple(std::move(attention), std::move(log_sumexp));
 }
 
 int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Tensor& value,
@@ -802,7 +786,7 @@ int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Te
   return static_cast<int64_t>(backend);
 }
 
-Tensor flash_scaled_dot_product_attention(
+std::tuple<Tensor, Tensor, Tensor> _flash_attention_forward(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
@@ -810,11 +794,12 @@ Tensor flash_scaled_dot_product_attention(
     const Tensor& cumulative_sequence_length_k,
     const int64_t max_seqlen_batch_q,
     const int64_t max_seqlen_batch_k,
+    bool return_softmax,
     double dropout_p,
     bool is_causal) {
 #if defined(USE_FLASH_ATTENTION)
   auto softmax_scale = std::pow(query.size(-1), -0.5);
-  std::vector<Tensor> output = fmha::mha_fwd(
+  return fmha::mha_fwd(
       query,
       key,
       value,
@@ -826,12 +811,11 @@ Tensor flash_scaled_dot_product_attention(
       softmax_scale,
       false,
       is_causal,
-      false,
+      return_softmax,
       c10::nullopt);
-  return output[0];
 #endif
   TORCH_CHECK(false, "USE_FLASH_ATTENTION was not enabled for build.")
-  return Tensor();
+  return std::make_tuple(Tensor(), Tensor(), Tensor());
 }
 
 std::tuple<at::Tensor, at::Tensor> _efficient_attention_forward(
diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
index af005b2669b29..a063aacb901ee 100644
--- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
@@ -10,6 +10,7 @@
 #include <ATen/native/transformers/attention.h>
 #include <ATen/native/transformers/cuda/sdp_utils.h>
 
+#include <iostream>
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
 #endif
@@ -73,14 +74,14 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     const at::Tensor& query,
     const at::Tensor& key,
     const at::Tensor& value,
-    const at::Tensor& logsumexp,
     const at::Tensor& out,
+    const at::Tensor& logsumexp,
     bool causal) {
   #if defined(USE_FLASH_ATTENTION)
   if (!grad_out_.defined()) {
     return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
   }
-    // ndim
+  // ndim
   TORCH_CHECK(query.dim() == grad_out_.dim());
   TORCH_CHECK(query.dim() == key.dim());
   TORCH_CHECK(query.dim() == value.dim());
@@ -128,6 +129,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
   // initialized
   bool grad_kv_needs_init = causal && N > M;
   at::Tensor grad_q, grad_k, grad_v;
+  int8_t gQKV_strideM_multiplier = 1;
   if (!grad_kv_needs_init && query.size(1) == key.size(1) &&
       query.size(3) == value.size(3) &&
       query.storage().is_alias_of(key.storage()) &&
@@ -141,10 +143,13 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     grad_q = chunk.select(2, 0);
     grad_k = chunk.select(2, 1);
     grad_v = chunk.select(2, 2);
+    gQKV_strideM_multiplier=3;
   } else {
-    grad_q = at::empty_like(query);
-    grad_k = grad_kv_needs_init ? at::zeros_like(key) : at::empty_like(key);
-    grad_v = grad_kv_needs_init ? at::zeros_like(value) : at::empty_like(value);
+    grad_q = at::empty(query.sizes(), query.options());
+    grad_k = grad_kv_needs_init ? at::zeros(key.sizes(), key.options())
+                                : at::empty(key.sizes(), key.options());
+    grad_v = grad_kv_needs_init ? at::zeros(value.sizes(), value.options())
+                                : at::empty(value.sizes(), value.options());
   }
 
   auto launchKernel = [&](auto _k, int computeCapability) {
@@ -198,7 +203,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     ASSIGN_CHECK_OVERFLOW(p.gQ_strideH, grad_q.stride(2));
     ASSIGN_CHECK_OVERFLOW(p.gK_strideH, grad_k.stride(2));
     ASSIGN_CHECK_OVERFLOW(p.gV_strideH, grad_v.stride(2));
-    p.gQKV_strideM_multiplier = grad_q.is_contiguous() ? 1 : 3;
+    p.gQKV_strideM_multiplier = gQKV_strideM_multiplier;
     TORCH_INTERNAL_ASSERT(p.gQ_strideM() == grad_q.stride(1));
     TORCH_INTERNAL_ASSERT(p.gK_strideM() == grad_k.stride(1));
     TORCH_INTERNAL_ASSERT(p.gV_strideM() == grad_v.stride(1));
@@ -257,5 +262,28 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
   return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
 }
 
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_efficient_attention_backward_cuda(
+    const at::Tensor& grad_out_,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& out,
+    const at::Tensor& logsumexp,
+    bool causal){
+  if (!grad_out_.defined()) {
+    return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
+  }
+  auto grad_out = grad_out_.transpose(1, 2);
+  auto out_t = out.transpose(1, 2);
+  auto q_t = query.transpose(1, 2);
+  auto k_t = key.transpose(1, 2);
+  auto v_t = value.transpose(1, 2);
+
+  Tensor grad_q, grad_k, grad_v;
+  std::tie(grad_q, grad_k, grad_v) = at::_efficient_attention_backward(grad_out, q_t, k_t, v_t, out_t, logsumexp, causal);
+  return std::make_tuple(grad_q.transpose(1, 2), grad_k.transpose(1, 2), grad_v.transpose(1, 2));
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
index aaf7d833fe833..7cc0c250664e1 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
@@ -26,6 +26,7 @@
  *
  ******************************************************************************/
 
+#include <tuple>
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -115,7 +116,7 @@ void set_params_fprop(FMHA_fprop_params &params,
     params.is_causal = is_causal;
 }
 
-std::vector<at::Tensor>
+std::tuple<at::Tensor, at::Tensor, at::Tensor>
 mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
         const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
@@ -241,9 +242,7 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
 
     run_fmha_fprop(launch_params, /*configure=*/false);
 
-    std::vector<at::Tensor> result = {o, softmax_lse};
-    if (return_softmax) {result.push_back(s);}
-    return result;
+    return std::make_tuple(o, softmax_lse, s);
 }
 } // namespace fmha
 #endif
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
index 226d4ddd2b551..b0555463be040 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
@@ -7,7 +7,7 @@
 namespace fmha {
 
 TORCH_API
-std::vector<at::Tensor>
+std::tuple<at::Tensor, at::Tensor, at::Tensor>
 mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
         const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 5d62a6cbd0dc5..55e9aeb184a22 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -91,6 +91,31 @@ inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
   return true;
 }
 
+inline bool check_for_nested_inputs(sdp_params params, bool debug){
+  if (params.query.is_nested() || params.key.is_nested() || params.value.is_nested()) {
+    TORCH_CHECK(!debug, "We are not enabling nested Tensors for Flash Attention because of cuda memory errors.");
+    return false;
+  }
+  return true;
+}
+
+inline bool check_requires_grad(sdp_params params, bool debug) {
+  if (params.query.requires_grad() || params.key.requires_grad() || params.value.requires_grad()) {
+    TORCH_CHECK(!debug, "Flash Attention does not currently support training.");
+    return false;
+  }
+  return true;
+}
+
+inline bool check_requires_grad_and_nested(sdp_params params, bool debug) {
+  // If we fail both checks then we return false
+  if (!check_for_nested_inputs(params, false) && !check_requires_grad(params,false)){
+      TORCH_CHECK(!debug, "Memory efficient attention currently doesn't support training with NT inputs.");
+      return false;
+  }
+  return true;
+}
+
 inline bool check_for_attn_mask(sdp_params params, bool debug) {
   if (params.has_attn_mask) {
     TORCH_CHECK(!debug, "Flash Attention does not support attention mask.");
@@ -198,13 +223,15 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
   return false;
 #endif
   //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 7> constraints {{
+  constexpr std::array<bool(*)(sdp_params, bool), 9> constraints {{
       check_runtime_disabled_flash,
+      check_requires_grad,
       check_tensor_shapes,
       check_for_attn_weights,
       check_for_attn_mask,
       check_head_dim_size,
       check_gpu_sm75_or_greater,
+      check_for_nested_inputs,
       check_for_seq_len_1_nested_tensor}};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
@@ -232,14 +259,15 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       at::kHalf, at::kFloat, at::kBFloat16};
 
   //  Define gate functions that determine if a flash kernel can be ran
-  std::vector<std::function<bool(sdp_params, bool)>> constraints{
+  constexpr std::array<bool(*)(sdp_params, bool), 8> constraints{{
       check_gpu_sm50_or_greater,
       check_runtime_disabled_mem_efficient,
+      check_requires_grad_and_nested,
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask,
       check_for_seq_len_1_nested_tensor,
-      check_for_non_zero_dropout};
+      check_for_non_zero_dropout}};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
diff --git a/benchmarks/transformer/sdp_backwards.py b/benchmarks/transformer/sdp_backwards.py
new file mode 100644
index 0000000000000..2f745e157b280
--- /dev/null
+++ b/benchmarks/transformer/sdp_backwards.py
@@ -0,0 +1,189 @@
+import torch
+import numpy as np
+import random
+import torch.utils.benchmark as benchmark
+from torch.profiler import profile, record_function, ProfilerActivity
+
+
+class CompositeMHA(torch.nn.Module):
+    def __init__(self, num_heads, in_proj_weight, in_proj_bias, out_proj):
+        super().__init__()
+        self.in_proj_weight = in_proj_weight
+        self.in_proj_bias = in_proj_bias
+        self.out_proj = out_proj
+        self.num_heads = num_heads
+
+    def forward(self, query, key, value, mask):
+        if not (query is key and key is value):
+            raise NotImplementedError(
+                "query, key and value must be the same Tensor for now."
+            )
+        if mask is not None:
+            raise NotImplementedError("mask is currently not supported.")
+
+        query_projected = torch.nn.functional.linear(
+            query, self.in_proj_weight, self.in_proj_bias
+        )
+
+        batch_size = query_projected.size(0)
+        embed_dim = query_projected.size(2)
+        head_dim = embed_dim // (self.num_heads * 3)
+
+        query, key, value = query_projected.chunk(3, -1)
+
+        query = query.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        attn, _ = torch.nn.functional._scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=None,
+            dropout_p=0.0,
+            need_attn_weights=False,
+            is_causal=False,
+        )
+
+        attn = attn.transpose(1, 2).reshape(batch_size, -1, self.num_heads * head_dim)
+        # Match return signature of nn.MHA
+        return self.out_proj(attn)
+
+
+def build_composite_mha_from_nn_mha(pt):
+    assert pt._qkv_same_embed_dim
+    in_proj_weight = pt.in_proj_weight
+    assert in_proj_weight is not None
+    assert pt.batch_first
+    return CompositeMHA(pt.num_heads, pt.in_proj_weight, pt.in_proj_bias, pt.out_proj)
+
+
+def forw_back(model, input, upward):
+    output = model(*input)
+    output.backward(upward)
+
+
+# Context manger not working in timer
+
+
+def forw_back_fused(model, input, upward):
+    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_mem_efficient=True):
+        output = model(*input)
+        output.backward(upward)
+
+
+def forw_back_eager(model, input, upward):
+    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_mem_efficient=False):
+        output = model(*input)
+        output.backward(upward)
+
+
+def run_timing(
+    min_run_time, batch_size, embed_dimension, num_heads, max_sequence_len, dtype
+):
+    dropout_p = 0.0
+    mask = None
+
+    pt = torch.nn.MultiheadAttention(
+        embed_dim=embed_dimension,
+        num_heads=num_heads,
+        batch_first=True,
+        dropout=dropout_p,
+    )
+    npt = pt.cuda().to(dtype)
+    cpt = build_composite_mha_from_nn_mha(npt)
+    x = torch.randn(
+        batch_size,
+        max_sequence_len,
+        embed_dimension,
+        dtype=dtype,
+        device="cuda",
+        requires_grad=True,
+    )
+
+    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_mem_efficient=True):
+        rand_fused_upward = cpt(x, x, x, mask).clone().detach()
+
+    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_mem_efficient=False):
+        rand_eager_upward = cpt(x, x, x, mask).clone().detach()
+
+    t0 = benchmark.Timer(
+        stmt="forw_back_fused(cpt, (x,x,x,mask), rand_fused_upward)",
+        globals={
+            "forw_back_fused": forw_back_fused,
+            "cpt": cpt,
+            "x": x,
+            "rand_fused_upward": rand_fused_upward,
+            "mask": mask,
+        },
+        label=f"Fused SDP forward and backward batch_size={batch_size} max_sequence_len={max_sequence_len} "
+        f"num_heads={num_heads} embed_dimension={embed_dimension} dtype={dtype}",
+        num_threads=torch.get_num_threads(),
+    )
+
+    t1 = benchmark.Timer(
+        stmt="forw_back_eager(cpt, (x,x,x,mask), rand_eager_upward)",
+        globals={
+            "forw_back_eager": forw_back_eager,
+            "cpt": cpt,
+            "x": x,
+            "rand_eager_upward": rand_eager_upward,
+            "mask": mask,
+        },
+        label=f"Eager SDP forward and backward batch_size={batch_size} max_sequence_len={max_sequence_len} "
+        f"num_heads={num_heads} embed_dimension={embed_dimension} dtype={dtype}",
+        num_threads=torch.get_num_threads(),
+    )
+
+    m0 = t0.blocked_autorange(min_run_time=min_run_time)
+    m1 = t1.blocked_autorange(min_run_time=min_run_time)
+
+    print(m0)
+    print(m1)
+
+    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
+
+    print("Profile for Fused".center(200, "-"))
+    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_mem_efficient=True):
+        with profile(
+            activities=activities, record_shapes=False, with_stack=True
+        ) as prof:
+            with record_function("Fused SDP forward and backward"):
+                for _ in range(20):
+                    forw_back(cpt, (x, x, x, mask), rand_fused_upward)
+    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
+
+    print("Profile for eager".center(200, "-"))
+    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_mem_efficient=False):
+        with profile(
+            activities=activities, record_shapes=False, with_stack=True
+        ) as prof:
+            with record_function("Fused SDP forward and backward"):
+                for _ in range(20):
+                    forw_back(cpt, (x, x, x, mask), rand_eager_upward)
+    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
+
+
+def main():
+    seed = 123
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    random.seed(seed)
+
+    min_run_time = 10
+    batch_size = 64
+    num_heads = 32
+    max_seq_len = 256
+    embed_dim = 1024
+    dtype = torch.bfloat16
+
+    print(
+        f"Running timing for batch_size={batch_size} max_sequence_len={max_seq_len} "
+        f"num_heads={num_heads} embed_dimension={embed_dim} dtype={dtype}"
+    )
+    run_timing(min_run_time, batch_size, embed_dim, num_heads, max_seq_len, dtype)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
index 90080ab0934f4..853f5206969b3 100644
--- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py
+++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
@@ -317,6 +317,9 @@
     ("aten::_upsample_nearest_exact1d_backward", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d_backward", datetime.date(2022, 12, 15)),
+    ("aten::_flash_scaled_dot_product_attention", datetime.date(2022, 12, 15)),
+    ("aten::_scaled_dot_product_attention_forward", datetime.date(2022, 12, 15)),
+    ("aten::_efficient_attention_backward", datetime.date(2022, 12, 15)),
     ("mkldnn::_convolution_pointwise.binary", datetime.date(2022, 12, 15)),
 ]
 
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 91ea2443777b8..f276b739f81da 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -401,6 +401,7 @@ def wrapped_fn(*args, **kwargs):
         skip('nn.functional.max_unpool2d'),  # fails everywhere except on windows
         skip('nn.functional.max_unpool3d'),  # fails everywhere except on mac
         xfail("native_batch_norm"),
+        xfail('nn.functional._scaled_dot_product_attention', device_type='cuda'),
 
         xfail('nn.functional.rrelu')  # in-place test errors out with no formula implemented
     }))
@@ -522,6 +523,7 @@ def f(inp, *args, **kwargs):
         xfail('nn.functional.ctc_loss'),  # Not Implemented
         xfail('native_layer_norm', ''),  # Expected a proper Tensor but got None for argument #1 'other'
         xfail('sparse.sampled_addmm', ''),  # sparse tensors have no strides
+        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
         # AssertionError: Tensor-likes are not close!
         # Mismatched elements: 1 / 15 (6.7%)
         # Greatest absolute difference: 24.0 at index (2, 4) (up to 1e-05 allowed)
@@ -616,7 +618,7 @@ def fn(inp, *args, **kwargs):
         skip("nn.functional.feature_alpha_dropout", "with_train"),  # calls random op
         skip("nn.functional.fractional_max_pool2d"),  # calls random op
         skip("nn.functional.fractional_max_pool3d"),  # calls random op
-        skip('nn.functional._scaled_dot_product_attention'),  # randomness
+        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
         # It looks like you're either (1) calling .item() on a Tensor or
         # (2) attempting to use a Tensor in some data-dependent control flow or
         # (3) encountering this error in PyTorch internals.
@@ -1093,6 +1095,7 @@ def test():
         skip('nn.functional.rrelu'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'with_train'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'without_train'),  # randomness
+        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
         skip('nn.functional.alpha_dropout'),  # randomness
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         skip('to_sparse', ''),  # non-dense output
@@ -1216,6 +1219,7 @@ def get_vjp(cotangents, *primals):
         xfail('nn.functional.soft_margin_loss', ''),  # NYI: forward-AD for log_sigmoid_backward
         xfail('nn.functional.ctc_loss', ''),  # NYI: forward-AD for _ctc_loss
         xfail('nn.functional.pdist', ''),  # NYI: forward-AD with _pdist_forward
+        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
         xfail('nn.functional.multi_margin_loss', ''),  # NYI: forward AD with multi_margin_loss
         skip('linalg.householder_product', '', device_type='cuda'),  # flaky, I'm not sure why
         xfail('sparse.sampled_addmm', ''),  # Sparse tensors have no strides
@@ -1336,7 +1340,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         xfail('nn.functional.dropout2d'),  # calls random op
         xfail('nn.functional.dropout3d'),  # calls random op
         xfail('nn.functional.dropout'),  # calls random op
-        skip('nn.functional._scaled_dot_product_attention'),  # randomness
+        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
         xfail('nn.functional.embedding_bag'),  # Forward AD not implemented and no decomposition
         xfail('nn.functional.alpha_dropout'),  # calls randomn op
         xfail('nn.functional.feature_alpha_dropout', 'with_train'),  # calls random op
diff --git a/test/test_meta.py b/test/test_meta.py
index 6d21d5c7bd75a..0e3cfb6ef1404 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -294,7 +294,6 @@ def test_tensor_outlives_converter(self):
     aten._fft_c2r.default,
     aten._fft_r2c.default,
     aten._linalg_svd.default,
-    aten._scaled_dot_product_attention_forward.default,
     aten.binary_cross_entropy.default,
     aten.complex.default,
     aten.copysign.Tensor,
diff --git a/test/test_transformers.py b/test/test_transformers.py
index abb4c71ec19ad..f6bc0cc2d6639 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -1059,6 +1059,11 @@ def rand_tensor(shape):
 
         if fused_kernel == "flash":
             with sdp_kernel(enable_mem_efficient=False, enable_math=False):
+                # TODO Flash for the nested path is currently not working due to cuda memory issues
+                if type == "nested":
+                    self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
+                        query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False))
+                    return
                 actual = torch.nn.functional._scaled_dot_product_attention(
                     query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
         elif fused_kernel == "mem_efficient":
@@ -1097,28 +1102,73 @@ def rand_tensor(shape):
 
     @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
     @parametrize("contiguous_inputs", [True, False])
-    def test_efficient_attention_gradcheck(self, contiguous_inputs: bool):
+    def test_sdp_math_gradcheck(self, contiguous_inputs: bool):
 
         batch_size, seq_len, num_heads, head_dim = 8, 8, 4, 64
-        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float16, requires_grad=True, packed=True)
+        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float64, requires_grad=True, packed=True)
 
         qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
         query, key, value = qkv.chunk(3, dim=-1)
-        query = query.view(batch_size, -1, num_heads, head_dim)
-        key = key.view(batch_size, -1, num_heads, head_dim)
-        value = value.view(batch_size, -1, num_heads, head_dim)
+
+        query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
 
         if contiguous_inputs:
             query = query.contiguous()
             key = key.contiguous()
             value = value.contiguous()
 
-        # Normally we would transpose the inputs but the fused kernels expect
-        # (batch, seq_len, num_heads, head_dim) bump the tolerance since we can only run kernel
-        # in fp32
-        assert gradcheck(lambda *args, **kwargs:
-                         wrapper_set_seed(torch.ops.aten._efficient_attention_forward, *args, **kwargs),
-                         (query, key, value, None, None, None, True, False), fast_mode=True, atol=8e-5, rtol=1e-3)
+        with sdp_kernel(enable_math=True, enable_mem_efficient=False, enable_flash=False):
+            assert gradcheck(lambda *args, **kwargs:
+                             wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
+                             (query, key, value, None, 0.0, False, False)
+                             )
+
+    @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
+    @parametrize("contiguous_inputs", [True, False])
+    def test_sdp_fused_grad_against_math(self, contiguous_inputs: bool):
+        batch_size, seq_len, num_heads, head_dim = 8, 8, 4, 64
+        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float64, requires_grad=True, packed=True)
+
+        qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
+        qkv_lp = qkv.detach().clone().to(torch.float32).requires_grad_()
+
+        query, key, value = qkv.chunk(3, dim=-1)
+        query_lp, key_lp, value_lp = qkv_lp.chunk(3, dim=-1)
+
+        query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+        query_lp = query_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key_lp = key_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value_lp = value_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+        if contiguous_inputs:
+            query = query.contiguous()
+            key = key.contiguous()
+            value = value.contiguous()
+
+            query_lp = query_lp.contiguous()
+            key_lp = key_lp.contiguous()
+            value_lp = value_lp.contiguous()
+
+        with sdp_kernel(enable_math=True, enable_mem_efficient=False, enable_flash=False):
+            out, atten = torch.nn.functional._scaled_dot_product_attention(query, key, value, None, 0.0, False, False)
+
+        with sdp_kernel(enable_math=False, enable_mem_efficient=True, enable_flash=False):
+            out_lp, atten_lp = torch.nn.functional._scaled_dot_product_attention(
+                query_lp, key_lp, value_lp, None, 0.0, False, False)
+
+        rand_upward = torch.rand_like(out)
+        rand_upward_lp = rand_upward.to(torch.float32)
+
+        out.backward(rand_upward)
+        out_lp.backward(rand_upward_lp)
+
+        # Cast up and compare
+        self.assertEqual(qkv.grad, qkv_lp.grad.to(torch.float64), atol=1e-5, rtol=1e-5)
 
     @parametrize("type", ["dense", "nested"])
     def test_fused_sdp_choice(self, type: str):
@@ -1144,7 +1194,7 @@ def test_fused_sdp_choice(self, type: str):
             value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
             key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
 
-            if SM80OrLater:
+            if SM80OrLater and not type == "nested":
                 assert torch._fused_sdp_choice(query, key, value) == SDPBackend.FLASH_ATTENTION
             else:
                 assert torch._fused_sdp_choice(query, key, value) == SDPBackend.EFFICIENT_ATTENTION
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index a0892b32a8352..52c0f76bf0708 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2613,9 +2613,13 @@
   nested_strides: non_differentiable
 
 # Transformers
+- name: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
+  output_differentiability: [True, False]
+  query, key, value: _scaled_dot_product_efficient_attention_backward(grad, query, key, value, result0, result1, is_causal)
+
 - name:  _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
   output_differentiability: [True, False]
-  query, key, value: _efficient_attention_backward(grad, query, key, value, result1, result0, causal)
+  query, key, value: _efficient_attention_backward(grad, query, key, value, result0, result1, causal)
 
 # fft
 - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 6cff2f6a47491..cf68a68cf629a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -12008,16 +12008,21 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             # This is only failing on Linux Bionic 3.10 Cuda 11.6
             DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes',
                          device_type='cuda', active_if=_get_torch_cuda_version() >= (11, 6)),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_noncontiguous_samples',
+                         device_type='cuda', dtypes=(torch.float32,)),
             # AssertionError: JIT Test does not execute any logic
             DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
             # Doesn't support autocasting
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensorNonErroring', 'test_fake_autocast', device_type='cpu'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake_autocast'),
+            # Forward works for dtype=float64 which is the math path
+            DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
             # No meta function
             DecorateInfo(unittest.skip("Skipped!"), 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
             DecorateInfo(unittest.skip("Skipped"), 'TestDecomp', 'test_comprehensive'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake'),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestMeta', device_type='cuda'),
             DecorateInfo(unittest.skip('output is non-deterministic (when dropout_p > 0)'), 'TestCommon', 'test_compare_cpu'),),
     ),
     UnaryUfuncInfo(

From 964ac213ff5f97c9e65415dfdb40dcfbdcd849a7 Mon Sep 17 00:00:00 2001
From: Nikolay Korovaiko <korovaikon@gmail.com>
Date: Sat, 19 Nov 2022 02:18:03 +0000
Subject: [PATCH 1099/1922] cast C++ py-bound SymNode to SymInt correctly
 (#89295)

Unfortunately, it's a bit hard to test purely on the Pytorch core side, but it passes the XLA tests which are currently disabled.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89295
Approved by: https://github.com/ezyang
---
 torch/csrc/utils/pybind.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/utils/pybind.cpp b/torch/csrc/utils/pybind.cpp
index 4cd148fdfa91c..1b9d1e3a2f735 100644
--- a/torch/csrc/utils/pybind.cpp
+++ b/torch/csrc/utils/pybind.cpp
@@ -7,8 +7,14 @@ namespace detail {
 
 bool type_caster<c10::SymInt>::load(py::handle src, bool) {
   if (torch::is_symint(src)) {
+    auto node = src.attr("node");
+    if (py::isinstance<c10::SymNodeImpl>(node)) {
+      value = c10::SymInt(py::cast<c10::SymNode>(node));
+      return true;
+    }
+
     value = c10::SymInt(static_cast<c10::SymNode>(
-        c10::make_intrusive<torch::impl::PythonSymNodeImpl>(src.attr("node"))));
+        c10::make_intrusive<torch::impl::PythonSymNodeImpl>(node)));
     return true;
   }
 

From c3f7e415e02fb77d65f22f1a91096f65dff111f8 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxxc@gmail.com>
Date: Sat, 19 Nov 2022 02:24:18 +0000
Subject: [PATCH 1100/1922] Don't redefine __STDC_FORMAT_MACROS (#89310)

Similar to https://github.com/pytorch/pytorch/pull/39608 and https://github.com/pytorch/pytorch/pull/6676

This causes a compile error in our internal build.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89310
Approved by: https://github.com/kit1980
---
 torch/csrc/cuda/Tensor.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/csrc/cuda/Tensor.cpp b/torch/csrc/cuda/Tensor.cpp
index beb81f187a6e2..f9486164358d4 100644
--- a/torch/csrc/cuda/Tensor.cpp
+++ b/torch/csrc/cuda/Tensor.cpp
@@ -1,4 +1,6 @@
+#ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
+#endif
 
 // Order of these includes matters, which should be fixed.
 // clang-format off

From c6e1ec735fb82710457a8c327a657c1515acfab4 Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Sat, 19 Nov 2022 03:35:07 +0000
Subject: [PATCH 1101/1922] Add --explain flag to bench (#89316)

TORCHDYNAMO_DYNAMIC_SHAPES=1 AOT_DYNAMIC_SHAPES=1 time python benchmarks/dynamo/torchbench.py  --accuracy --explain  --backend aot_eager --train --only BERT_pytorch

Dynamo produced 76 graphs with 75 graph break and 198 ops

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89316
Approved by: https://github.com/ezyang
---
 benchmarks/dynamo/common.py | 10 ++++++++++
 torch/_dynamo/eval_frame.py | 17 +++++++++++++----
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index c4e9d62f0a7c9..f4d1bfad37d71 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1318,6 +1318,7 @@ def run_one_model(
         experiment,
         diff=False,
         branch=None,
+        explain=False,
     ):
         if diff:
             self.compare_branches(
@@ -1337,6 +1338,8 @@ def run_one_model(
                 name, model, example_inputs, optimize_ctx, experiment
             )
             print(status)
+        if explain:
+            print(torch._dynamo.explain(model, *example_inputs)[0])
 
 
 def help(fn):
@@ -1515,6 +1518,12 @@ def get_example_inputs(self):
         help="Delta this branch against main. In the future, we may add support for picking the branch.",
     )
 
+    parser.add_argument(
+        "--explain",
+        action="store_true",
+        help="run .explain() on the graph at the end of the run.",
+    )
+
     parser.add_argument(
         "--cold_start_latency",
         action="store_true",
@@ -1982,6 +1991,7 @@ def run(runner, args, original_dir=None):
                 optimize_ctx,
                 experiment,
                 diff=args.diff_main,
+                explain=args.explain,
             )
         if args.generate_aot_autograd_stats:
             stats_file = output_filename.split(".csv")[0] + "_stats.csv"
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 538f6131d62b1..31fb479906e1f 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -429,6 +429,7 @@ def toy_example(a, b):
     )
 
 
+# TODO(voz): Consider making "explain" output alongside a run / part of a run
 @patch("torch._dynamo.symbolic_convert.explain", True)
 def explain(f, *args, **kwargs):
     # TODO(voz): Do we want a decorator for this?
@@ -487,15 +488,23 @@ def guard_export_print(guards):
         msg = f"{break_reason.reason}\n{formatted_stack}"
         formatted_list += f"{idx + 1}. {msg} \n"
 
-    explanation = f"Dynamo produced {graph_count} graphs"
+    explanation = f"Dynamo produced {graph_count} graphs "
     explanation += f"with {graph_count - 1} graph break and {op_count} ops"
-    explanation += f"\n Break reasons: \n\n{formatted_list}"
+    explanation_verbose = explanation
+    explanation_verbose += f"\n Break reasons: \n\n{formatted_list}"
 
-    explanation += compile_times()
+    explanation_verbose += compile_times()
 
     # TODO(voz): Do we want a decorator for this?
     reset()
-    return explanation, out_guards, graphs, ops_per_graph, break_reasons
+    return (
+        explanation,
+        out_guards,
+        graphs,
+        ops_per_graph,
+        break_reasons,
+        explanation_verbose,
+    )
 
 
 def export(

From 9e08a248b2e56a8f0176ed90b47bfda96ca67826 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Sat, 19 Nov 2022 04:09:29 +0000
Subject: [PATCH 1102/1922] add jvp test with non-contig inputs (#89131)

Ref: https://github.com/pytorch/functorch/issues/1029

We update `test_jvp` to do contiguous and non-contiguous testing in a single test.

Prev time for `test_jvp` : ~28s
New time for `test_jvp`: ~45s

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89131
Approved by: https://github.com/zou3519
---
 test/functorch/test_ops.py | 53 +++++++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index f276b739f81da..e9451b596b4ac 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -10,7 +10,7 @@
 import unittest
 
 from torch.testing._internal.common_utils import TestCase, run_tests, is_iterable_of_tensors, IS_MACOS, \
-    IS_ARM64, parametrize, TEST_WITH_ASAN
+    IS_ARM64, IS_X86, parametrize, TEST_WITH_ASAN, noncontiguous_like
 import torch
 from torch import Tensor
 import functools
@@ -403,15 +403,31 @@ def wrapped_fn(*args, **kwargs):
         xfail("native_batch_norm"),
         xfail('nn.functional._scaled_dot_product_attention', device_type='cuda'),
 
-        xfail('nn.functional.rrelu')  # in-place test errors out with no formula implemented
+        xfail('nn.functional.rrelu'),  # in-place test errors out with no formula implemented
+
+        # --- Non-Contiguous Failures! ---
+        # This is expected to fail as the operator
+        # expects last dim to have stride=1
+        xfail('view_as_complex'),
+        # BUG
+        # AssertionError: Tensor-likes are not close!
+        xfail('as_strided'),
+        decorate('linalg.det', 'singular',
+                 decorator=unittest.skipIf(IS_MACOS and IS_X86, "Fails on x86 MacOS CI")),
     }))
     @opsToleranceOverride('TestOperators', 'test_jvp', (
         tol1('nn.functional.conv_transpose3d',
              {torch.float32: tol(atol=1e-04, rtol=1.3e-06)}, device_type='cuda'),
+        tol1('linalg.tensorsolve',
+             {torch.float32: tol(atol=1e-04, rtol=1.3e-05)}, device_type='cuda'),
         tol1('nn.functional.binary_cross_entropy_with_logits',
              {torch.float32: tol(atol=4e-04, rtol=4e-04)}),
         tol1('nn.functional.batch_norm',
              {torch.float32: tol(atol=4e-05, rtol=5e-05)}),
+        tol1('nn.functional.conv2d',
+             {torch.float32: tol(atol=4e-05, rtol=5e-05)}),
+        tol1('pca_lowrank',
+             {torch.float32: tol(atol=5e-05, rtol=5e-05)}),
     ))
     def test_jvp(self, device, dtype, op):
         # TODO: get rid of vjp_decomp when we add decomposition support to
@@ -435,28 +451,38 @@ def test_jvp(self, device, dtype, op):
         inplace_variant = op.inplace_variant if op.supports_inplace_autograd else None
 
         for sample in samples:
-            args = (sample.input,) + sample.args
-            kwargs = sample.kwargs
             if outplace_variant:
-                self.jvp_opinfo_test(outplace_variant, args, kwargs,
+                self.jvp_opinfo_test(outplace_variant, sample,
                                      sample.output_process_fn_grad,
                                      clone_inputs=False,
                                      fixme_ref_jvp_local=fixme_ref_jvp_local)
             if is_valid_inplace_sample_input(sample, op, inplace_variant):
-                self.jvp_opinfo_test(inplace_variant, args, kwargs,
+                self.jvp_opinfo_test(inplace_variant, sample,
                                      sample.output_process_fn_grad,
                                      clone_inputs=True,
                                      fixme_ref_jvp_local=fixme_ref_jvp_local)
 
-    def jvp_opinfo_test(self, fn, args, kwargs, output_process_fn,
+
+    def jvp_opinfo_test(self, fn, sample, output_process_fn,
                         clone_inputs, fixme_ref_jvp_local):
         # NB: we used requires_grad=True to determine where the primals are,
         # but don't need that information otherwise
-        fn, primals = normalize_op_input_output2(
+        args = (sample.input,) + sample.args
+        kwargs = sample.kwargs
+        contig_fn, primals = normalize_op_input_output2(
             fn, args, kwargs, output_process_fn, requires_grad=True)
         orig_primals = tree_map(lambda x: x.detach(), primals)
         orig_tangents = tree_map(lambda x: torch.randn_like(x), primals)
 
+        noncontig_sample = sample.noncontiguous()
+        noncontig_args = (noncontig_sample.input,) + noncontig_sample.args
+        noncontig_kwargs = sample.kwargs
+        noncontig_fn, primals = normalize_op_input_output2(
+            fn, noncontig_args, noncontig_kwargs,
+            output_process_fn, requires_grad=True)
+        noncontig_primals = tree_map(lambda x: x.detach(), primals)
+        noncontig_tangents = tree_map(lambda x: noncontiguous_like(x), orig_tangents)
+
         def maybe_clone_inputs():
             if clone_inputs:
                 primals = tree_map(torch.clone, orig_primals)
@@ -466,14 +492,21 @@ def maybe_clone_inputs():
 
         primals, tangents = maybe_clone_inputs()
         expected_primal_outs, expected_tangent_outs = \
-            fixme_ref_jvp_local(fn, primals, tangents)
+            fixme_ref_jvp_local(contig_fn, primals, tangents)
 
         primals, tangents = maybe_clone_inputs()
-        primal_outs, tangent_outs = jvp(fn, primals, tangents)
+        primal_outs, tangent_outs = jvp(contig_fn, primals, tangents)
+
+        noncontig_primal_outs, noncontig_tangent_outs = jvp(noncontig_fn,
+                                                            noncontig_primals,
+                                                            noncontig_tangents)
 
         self.assertEqual(primal_outs, expected_primal_outs)
         self.assertEqual(tangent_outs, expected_tangent_outs)
 
+        self.assertEqual(noncontig_primal_outs, expected_primal_outs)
+        self.assertEqual(noncontig_tangent_outs, expected_tangent_outs)
+
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vjp', vjp_fail.union({
         xfail('sparse.sampled_addmm', ''),

From e8d64b4cde9243e38e32b606d9bf57872f0e4a06 Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Fri, 18 Nov 2022 21:39:11 +0000
Subject: [PATCH 1103/1922] Added utility to count memory reads/written in
 Inductor (#89203)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89203
Approved by: https://github.com/jansel, https://github.com/ngimel
---
 test/inductor/test_perf.py      | 434 ++++++++++++++++++++++++++++++++
 torch/_inductor/compile_fx.py   |  28 ++-
 torch/_inductor/dependencies.py |  39 ++-
 torch/_inductor/graph.py        |  43 +++-
 torch/_inductor/metrics.py      |   5 +
 torch/_inductor/scheduler.py    |   2 +-
 torch/_inductor/utils.py        |   5 +
 torch/_inductor/virtualized.py  |   2 +-
 8 files changed, 545 insertions(+), 13 deletions(-)
 create mode 100644 test/inductor/test_perf.py

diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py
new file mode 100644
index 0000000000000..d473ff4b74495
--- /dev/null
+++ b/test/inductor/test_perf.py
@@ -0,0 +1,434 @@
+# Owner(s): ["module: inductor"]
+import contextlib
+from unittest.mock import patch
+
+import torch._dynamo
+import torch._inductor.config as config
+from torch._dynamo.optimizations.backends import register_backend
+from torch._inductor import metrics
+from torch._inductor.compile_fx import compile_fx, count_bytes_inner
+from torch.testing._internal.common_utils import (
+    TEST_WITH_ROCM,
+    TestCase as TorchTestCase,
+)
+from torch.testing._internal.inductor_utils import HAS_CUDA
+
+aten = torch.ops.aten
+
+
+@register_backend
+def count_bytes_inductor(gm, example_inputs):
+    return compile_fx(gm, example_inputs, inner_compile=count_bytes_inner)
+
+
+@torch._dynamo.optimize("count_bytes_inductor")
+def f(x):
+    return torch.cat([x, x.cos()])
+
+
+def count_numel(f, *args):
+    """
+    Assumes all inputs are fp32
+    """
+    metrics.reset()
+    torch._dynamo.optimize("count_bytes_inductor")(f)(*args)
+    print(metrics.nodes_num_elem)
+    return str(metrics.num_bytes_accessed // 4)
+
+
+DEVICE = "cuda"
+
+
+def T(*size, dtype=torch.float32, device=DEVICE):
+    return torch.randn(size, dtype=dtype, device=device)
+
+
+def TI(*size, mx=10, dtype=torch.int32, device=DEVICE):
+    return torch.randint(0, mx, size, dtype=dtype, device=device)
+
+
+class TestCase(TorchTestCase):
+    device = DEVICE
+    pass
+
+
+class NumBytesMetricTests(TestCase):
+    """
+    Primarily used for sanity testing that the num_bytes_accessed metrics is correct.
+    """
+
+    def test_pointwise(self):
+        def f(x):
+            return x.cos()
+
+        inp = (T(10),)
+        self.assertExpectedInline(count_numel(f, *inp), """20""")
+
+        def f(x, y):
+            return x + y
+
+        inp = (T(10), T(10))
+        self.assertExpectedInline(count_numel(f, *inp), """30""")
+
+        def f(x, y):
+            return x + y
+
+        inp = (T(10, 10), T(10))
+        self.assertExpectedInline(count_numel(f, *inp), """210""")
+
+        def f(x):
+            return x + x
+
+        inp = (T(10),)
+        self.assertExpectedInline(count_numel(f, *inp), """20""")
+
+        def f(x):
+            return x + x.t()
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """200""")
+
+        def f(a, b, c):
+            return a.cos(), b.sin() + c.sin()
+
+        inp = (T(10), T(10), T(10))
+        self.assertExpectedInline(count_numel(f, *inp), """50""")
+
+    def test_reduction(self):
+        def f(x):
+            return x.sum(dim=1)
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """110""")
+
+        def f(x):
+            return x.sum(dim=0)
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """110""")
+
+    def test_extern(self):
+        def f(x):
+            return torch.mm(x, x)
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """200""")
+
+        def f(a, b):
+            return torch.mm(a, b)
+
+        inp = (T(10, 10), T(10, 10))
+        self.assertExpectedInline(count_numel(f, *inp), """300""")
+
+        def f(x):
+            x = x.cos()
+            x = torch.mm(x, x)
+            x = x.cos()
+            return x
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """600""")
+
+        def f(x):
+            a = x.cos()
+            b = x.sin()
+            x = torch.mm(a, b)
+            return x
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """600""")
+
+    def test_cat(self):
+        def f(a, b):
+            return torch.cat([a.sin(), b.sin()])
+
+        inp = (T(10), T(10))
+        self.assertExpectedInline(count_numel(f, *inp), """40""")
+
+        def f(a, b):
+            return torch.cat([a, b])
+
+        inp = (T(10), T(10))
+        self.assertExpectedInline(count_numel(f, *inp), """40""")
+
+        def f(a, b):
+            return torch.cat([a.cos(), b])
+
+        inp = (T(10), T(10))
+        self.assertExpectedInline(count_numel(f, *inp), """40""")
+
+        def f(a):
+            return torch.cat([a.cos(), a.sin()])
+
+        inp = (T(10),)
+        self.assertExpectedInline(count_numel(f, *inp), """30""")
+
+    def test_index(self):
+        def f(a, b):
+            return a[b]
+
+        inp = (T(10), TI(10, mx=10))
+        self.assertExpectedInline(count_numel(f, *inp), """30""")
+
+
+class FusionTests(TestCase):
+    """
+    Tests that things can be fused into a single kernel
+    """
+
+    def test_horizontal_reduction_pointwise(self):
+        def f(a):
+            b = a.sum(dim=1)
+            c = a.cos()
+            return b, c
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """210""")
+
+    def test_horizontal_reduction_reduction(self):
+        def f(a):
+            b = a.sum(dim=1)
+            c = a.amax(dim=1)
+            return b, c
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """120""")
+
+    def test_horizontal_reduction_pointwise2(self):
+        def f(a, b):
+            c = a.sum(dim=1)
+            b = b.cos()
+            return b + c
+
+        inp = (T(10, 10), T(10))
+        self.assertExpectedInline(count_numel(f, *inp), """120""")
+
+    def test_horizontal_reduction_outer_pointwise(self):
+        def f(a, b):
+            c = a.sum(dim=0)
+            b = b.cos()
+            return b + c
+
+        inp = (T(10, 10), T(10))
+        self.assertExpectedInline(count_numel(f, *inp), """120""")
+
+    def test_horizontal_sum_pw_broadcast(self):
+        def f(a, b):
+            a = a.sum(dim=1, keepdim=True)
+            b = b.cos()
+            return a * b
+
+        inp = (T(10, 10), T(10))
+        self.assertExpectedInline(count_numel(f, *inp), """210""")
+
+    def test_vertical_sum_pw(self):
+        def f(a):
+            a = a.cos()
+            a = a.sum(dim=1)
+            return a.cos()
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """110""")
+
+    def test_norm_chain(self):
+        def f(a):
+            b = a.sum(dim=1, keepdim=True)
+            a = a * b
+            b = a.sum(dim=1, keepdim=True)
+            a = a * b
+            b = a.sum(dim=1, keepdim=True)
+            a = a * b
+            return a
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """200""")
+
+    def test_softmax_inner(self):
+        def f(a):
+            return torch.softmax(a, dim=1)
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """200""")
+
+    def test_layer_norm(self):
+        # TODO: Suboptimal! We shouldn't need to save normalization stats.
+        mod = torch.nn.LayerNorm(10, device=self.device)
+
+        def f(x):
+            return mod(x)
+
+        inp = (T(10, 10),)
+        with torch.no_grad():
+            self.assertExpectedInline(count_numel(f, *inp), """220""")
+
+    def test_double_softmax(self):
+        def f(x):
+            x = torch.softmax(x, dim=1)
+            x = torch.softmax(x, dim=1)
+            return x
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """200""")
+
+    def test_softmax_backward(self):
+        def f(grad_out, out):
+            return aten._softmax_backward_data(grad_out, out, 1, torch.float32)
+
+        inp = (T(10, 10), T(10, 10))
+        self.assertExpectedInline(count_numel(f, *inp), """300""")
+
+    def test_neighbor(self):
+        def f(a, b):
+            return ((a - b) ** 2).sum(dim=-1).amax(dim=1)
+
+        inp = (T(10, 1, 4), T(1, 10, 4))
+        self.assertExpectedInline(count_numel(f, *inp), """90""")
+
+    def test_factory_reduction(self):
+        def f():
+            a = torch.ones(10, device=self.device)
+            b = torch.ones(10, 10, device=self.device)
+            return (a + b).sum(dim=-1)
+
+        inp = ()
+        self.assertExpectedInline(count_numel(f, *inp), """10""")
+
+    def test_index_pointwise(self):
+        def f(a, b):
+            return a[b].cos()
+
+        inp = (T(10, 10), TI(20, mx=10))
+        self.assertExpectedInline(count_numel(f, *inp), """320""")
+
+    def test_index_reduction(self):
+        def f(a, b):
+            return a[b].cos().sum(dim=1)
+
+        inp = (T(10, 10), TI(20, mx=10))
+        self.assertExpectedInline(count_numel(f, *inp), """140""")
+
+
+class SchedulerFusionTests(TestCase):
+    """
+    Testing the fusion group creation heuristic (i.e. cases where we can't fuse
+    everything into a single kernel)
+    Disables inductor rematerialization for easier reasoning of tests.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls._stack = contextlib.ExitStack()
+        cls._stack.enter_context(patch.object(config, "realize_bytes_threshold", 0))
+
+    @classmethod
+    def tearDownClass(cls):
+        cls._stack.close()
+        super().tearDownClass()
+
+    def test_fusion_choice1(self):
+        # Doesn't matter where we break fusion group here
+        def f(a):
+            c = a.cos()
+            d = torch.mm(c, c)
+            e = c.cos()
+            return d + e
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """700""")
+
+    def test_fusion_choice2(self):
+        # We should materialize e (it's smaller!)
+        # [c, e]: 210, [f]: 210, [d]: 200
+        def f(a):
+            c = a.cos()
+            d = torch.mm(c, c)
+            e = c.sum(dim=1)
+            f = d + e
+            return f
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """620""")
+
+    def test_fusion_choice3(self):
+        # We should materialize e.
+        # [c, e]: 300, [f]: 300, [d]: 200
+        def f(a):
+            c = a.cos()
+            d = torch.mm(c, c)
+            e = c + a
+            f = d + e
+            return f, e
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """800""")
+
+
+class TilingTests(TestCase):
+    def test_tiling_simple(self):
+        def f(a, b):
+            return a + b.t()
+
+        inp = (T(10, 10), T(10, 10))
+        self.assertExpectedInline(count_numel(f, *inp), """300""")
+
+        def f(a, b):
+            return a.t() + b
+
+        inp = (T(10, 10), T(10, 10))
+        self.assertExpectedInline(count_numel(f, *inp), """300""")
+
+    def test_tiling_three(self):
+        def f(a, b, c):
+            return a + b.permute(1, 2, 0) + c.permute(2, 0, 1)
+
+        inp = (T(10, 10, 10), T(10, 10, 10), T(10, 10, 10))
+        self.assertExpectedInline(count_numel(f, *inp), """4000""")
+
+
+# Test cases where we don't do the right thing yet.
+class WouldBeNiceIfItWorked:
+    def test_horizontal(self):
+        def f(a):
+            b = a.sum(dim=0)
+            c = a.cos()
+            return b, c
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """210""")
+
+    # TODO: We aren't fusing outer dim softmaxes
+    def test_softmax_outer(self):
+        def f(a):
+            return torch.softmax(a, dim=0)
+
+        inp = (T(10, 10),)
+        self.assertExpectedInline(count_numel(f, *inp), """200""")
+
+    # TODO: The greedy fusion strategy results in suboptimal grouping
+    @patch.object(config, "realize_bytes_threshold", 0)
+    def test_fusion_choice4(self):
+        def f(a, b, b2):
+            c = a + b
+            d = torch.mm(c, c)
+            e = c + b + b2
+            f = d + e + b2
+            return f, e
+
+        inp = (T(10, 10), T(10, 10, dtype=torch.float16), T(10, 10))
+        self.assertExpectedInline(count_numel(f, *inp), """1000""")
+
+    # TODO: We materialize the intermediate if we don't unroll the reduction
+    def test_neighbor(self):
+        def f(a, b):
+            return ((a - b) ** 2).sum(dim=-1).amax(dim=1)
+
+        inp = (T(10, 1, 8), T(1, 10, 8))
+        self.assertExpectedInline(count_numel(f, *inp), """170""")
+
+
+if __name__ == "__main__":
+    from torch._dynamo.test_case import run_tests
+
+    if HAS_CUDA and not TEST_WITH_ROCM:
+        run_tests(needs="filelock")
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 813daee1252f1..c482e55a954da 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -12,7 +12,7 @@
 import torch.fx
 from torch._subclasses.fake_tensor import FakeTensor
 
-from . import config, overrides
+from . import config, metrics, overrides
 from .debug import DebugContext
 from .decomposition import select_decomp_table
 from .graph import GraphLowering
@@ -83,6 +83,22 @@ def _step_logger():
     return dynamo_logging.get_step_logger(log)
 
 
+@DebugContext.wrap
+def count_bytes_inner(gm, example_inputs, num_fixed=0, **kwargs):
+    shape_env = None
+    for inp in example_inputs:
+        if isinstance(inp, FakeTensor) and inp.fake_mode.shape_env is not None:
+            shape_env = inp.fake_mode.shape_env
+
+    graph = GraphLowering(gm, shape_env=shape_env, num_static_inputs=num_fixed)
+    with V.set_graph_handler(graph):
+        graph.run(*example_inputs)
+        num_bytes, nodes_num_elem = graph.count_bytes()
+        metrics.num_bytes_accessed += num_bytes
+        metrics.nodes_num_elem += nodes_num_elem
+    return make_boxed_func(gm.forward)
+
+
 @DebugContext.wrap
 @torch.utils._python_dispatch._disable_current_modes()
 def compile_fx_inner(
@@ -326,7 +342,11 @@ def is_not_gradout(x):
 _graph_counter = itertools.count(0)
 
 
-def compile_fx(model_: torch.fx.GraphModule, example_inputs_: List[torch.Tensor]):
+def compile_fx(
+    model_: torch.fx.GraphModule,
+    example_inputs_: List[torch.Tensor],
+    inner_compile=compile_fx_inner,
+):
     """Main entrypoint to a compile given FX graph"""
 
     if not is_aot_autograd_safe_to_run(model_, example_inputs_):
@@ -348,7 +368,7 @@ def compile_fx(model_: torch.fx.GraphModule, example_inputs_: List[torch.Tensor]
     @dynamo_utils.dynamo_timed
     def fw_compiler(model: torch.fx.GraphModule, example_inputs):
         fixed = len(example_inputs) - num_example_inputs
-        return compile_fx_inner(
+        return inner_compile(
             model,
             example_inputs,
             num_fixed=fixed,
@@ -359,7 +379,7 @@ def fw_compiler(model: torch.fx.GraphModule, example_inputs):
     @dynamo_utils.dynamo_timed
     def bw_compiler(model: torch.fx.GraphModule, example_inputs):
         fixed = count_tangents(model)
-        return compile_fx_inner(
+        return inner_compile(
             model,
             example_inputs,
             num_fixed=fixed,
diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py
index 27c92f82c07c9..5434d7addfa9a 100644
--- a/torch/_inductor/dependencies.py
+++ b/torch/_inductor/dependencies.py
@@ -9,7 +9,14 @@
 
 from . import config
 from .codegen.common import index_prevent_reordering
-from .utils import sympy_product, sympy_str, sympy_subs, sympy_symbol, VarRanges
+from .utils import (
+    get_dtype_size,
+    sympy_product,
+    sympy_str,
+    sympy_subs,
+    sympy_symbol,
+    VarRanges,
+)
 from .virtualized import V
 
 log = logging.getLogger(__name__)
@@ -69,11 +76,18 @@ def rename(self, renames: Dict[str, str]) -> "MemoryDep":
             return MemoryDep(renames[self.name], self.index, self.size)
         return self
 
-    def numel_hint(self):
+    def numbytes_hint(self):
         vars = set(self.index.free_symbols)
+        size_vars_used = []
+        for var in vars:
+            if var.name.startswith(canonicalization_prefix()):
+                # Sometimes with indirect indexing we have very weird symbol names
+                assert " " not in var.name
+                size_vars_used.append(int(var.name[len(canonicalization_prefix()) :]))
+
         return V.graph.sizevars.size_hint(
-            sympy_product([s for s in self.size if s in vars])
-        )
+            sympy_product([self.size[i] for i in size_vars_used])
+        ) * get_dtype_size(V.graph.get_dtype(self.name))
 
     def is_contiguous(self) -> bool:
         return isinstance(self.index, (sympy.Symbol, sympy.Integer))
@@ -88,8 +102,21 @@ def rename(self, renames: Dict[str, str]) -> "StarDep":
             return StarDep(renames[self.name])
         return self
 
-    def numel_hint(self):
-        return 1
+    def numbytes_hint(self):
+        from .ir import MultiOutputLayout
+
+        if self.name in V.graph.name_to_buffer:
+            buf = V.graph.name_to_buffer[self.name]
+        elif self.name in V.graph.graph_inputs:
+            buf = V.graph.graph_inputs[self.name]
+        else:
+            return 1
+        if hasattr(buf, "layout") and isinstance(buf.layout, MultiOutputLayout):
+            # NB: Too annoying to acquire, should only be used for instrumentation
+            return 1
+        return V.graph.sizevars.size_hint(
+            sympy_product(buf.get_size())
+        ) * get_dtype_size(buf.get_dtype())
 
     def is_contiguous(self) -> bool:
         return False
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 5114ffa761117..a47d9c1a02e11 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -27,7 +27,7 @@
     needs_realized_inputs,
 )
 from .sizevars import SizeVarAllocator
-from .utils import dynamo_utils, gather_origins
+from .utils import dynamo_utils, gather_origins, get_dtype_size, sympy_product
 from .virtualized import V
 
 log = logging.getLogger(__name__)
@@ -356,6 +356,47 @@ def codegen(self):
         self.scheduler.codegen()
         return self.wrapper_code.generate()
 
+    def count_bytes(self):
+        from .scheduler import FusedSchedulerNode, NopKernelSchedulerNode, Scheduler
+
+        scheduler = Scheduler(self.buffers)
+
+        def get_read_write_buffers_sizes(node):
+            if isinstance(node, NopKernelSchedulerNode):
+                return 0
+            reads = set(dep.name for dep in node.read_writes.reads)
+            writes = set(dep.name for dep in node.read_writes.writes)
+
+            def is_materialized(buf):
+                buf_uses = set(
+                    [user.node for user in scheduler.name_to_node[buf].users]
+                )
+                return len(buf_uses - set(node.snodes)) > 0
+
+            if isinstance(node, FusedSchedulerNode):
+                writes = set([dep for dep in writes if is_materialized(dep)])
+            node_bytes = 0
+            for buf in reads | writes:
+                if buf in self.name_to_buffer:
+                    buf = self.name_to_buffer[buf]
+                elif buf in self.graph_inputs:
+                    buf = self.graph_inputs[buf]
+                else:
+                    continue
+
+                node_bytes += V.graph.sizevars.size_hint(
+                    sympy_product(buf.get_size())
+                ) * get_dtype_size(buf.get_dtype())
+            return node_bytes
+
+        total_bytes = 0
+        node_counts = []
+        for node in scheduler.nodes:
+            num_bytes = get_read_write_buffers_sizes(node)
+            node_counts.append((node, num_bytes // 4))
+            total_bytes += num_bytes
+        return total_bytes, node_counts
+
     @dynamo_utils.dynamo_timed
     def compile_to_module(self):
         from .codecache import PyCodeCache
diff --git a/torch/_inductor/metrics.py b/torch/_inductor/metrics.py
index 582c5aca7f880..f7e05288c9a5e 100644
--- a/torch/_inductor/metrics.py
+++ b/torch/_inductor/metrics.py
@@ -1,12 +1,17 @@
 # counter for tracking how many kernels have been generated
 generated_kernel_count = 0
 generated_cpp_vec_kernel_count = 0
+num_bytes_accessed = 0
+nodes_num_elem = []
 
 
 # reset all counters
 def reset():
     global generated_kernel_count
     global generated_cpp_vec_kernel_count
+    global num_bytes_accessed, nodes_num_elem
 
     generated_kernel_count = 0
     generated_cpp_vec_kernel_count = 0
+    num_bytes_accessed = 0
+    nodes_num_elem.clear()
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index cb71a44438049..8609617897bf5 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -977,7 +977,7 @@ def score_fusion_memory(self, node1, node2):
         common_memory_deps = (node1.read_writes.reads | node1.read_writes.writes) & (
             node2.read_writes.reads | node2.read_writes.writes
         )
-        return sum(dep.numel_hint() for dep in common_memory_deps)
+        return sum(dep.numbytes_hint() for dep in common_memory_deps)
 
     def score_fusion_key(self, nodes):
         """
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index 08e95b9b5cc34..62357be8bcf39 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -375,3 +375,8 @@ def fresh_inductor_cache(cache_entries=None):
 
 def argsort(seq):
     return sorted(range(len(seq)), key=seq.__getitem__)
+
+
+@functools.lru_cache(8)
+def get_dtype_size(dtype):
+    return torch.empty((), dtype=dtype).element_size()
diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py
index 5d40d05f751f9..27e60b1daf1df 100644
--- a/torch/_inductor/virtualized.py
+++ b/torch/_inductor/virtualized.py
@@ -74,7 +74,7 @@ def masked(cls, mask, body, other):
 
     @staticmethod
     def indirect_indexing(index_var):
-        return sympy_symbol(str(index_var))
+        return sympy_symbol(f"({str(index_var)})")
 
     @classmethod
     def _init_cls(cls):

From 92c183b9889f70e0e324b2472410513729917b0b Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Sat, 19 Nov 2022 07:16:29 +0000
Subject: [PATCH 1104/1922] Fix try/except flow where
 DataDependentOutputException is getting wrapped in a RuntimeError (#89314)

Repro fixed

```
def fn(a):
    return a.repeat_interleave(14, dim=0).repeat_interleave(14, dim=1)

x = torch.ones(14, 14).to(dtype=torch.int64)
opt_fn = torch._dynamo.optimize("eager")(fn)
opt_fn(x)
```

Fixes [#1886](https://github.com/pytorch/torchdynamo/issues/1886)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89314
Approved by: https://github.com/anijain2305, https://github.com/eellison
---
 torch/_dynamo/utils.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index e4b92a73aacfc..889bb5683b6b0 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -1055,13 +1055,20 @@ def visit(n: torch.fx.Node):
     except Unsupported:
         raise
     except RuntimeError as e:
-        if isinstance(e, torch._subclasses.fake_tensor.DataDependentOutputException):
+        cause = e
+        if e.__cause__ is not None:
+            cause = e.__cause__
+        if isinstance(
+            cause, torch._subclasses.fake_tensor.DataDependentOutputException
+        ):
             if config.capture_scalar_outputs and node.target == "item":
                 return torch.zeros(size=(), dtype=args[0].dtype).item()
             else:
-                unimplemented(f"data dependent operator: {e.func}")
-        elif isinstance(e, torch._subclasses.fake_tensor.DynamicOutputShapeException):
-            unimplemented(f"dynamic shape operator: {e.func}")
+                unimplemented(f"data dependent operator: {cause.func}")
+        elif isinstance(
+            cause, torch._subclasses.fake_tensor.DynamicOutputShapeException
+        ):
+            unimplemented(f"dynamic shape operator: {cause.func}")
         raise TorchRuntimeError() from e
 
 
From 081f3c325f4d7258c11001341cc9e3147abe795b Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Fri, 18 Nov 2022 22:46:47 -0800
Subject: [PATCH 1105/1922] [quant][fix] Add quant_min/quant_max for default
 dynamic quantization observer (#89267)

Summary:
This is needed for choose qparams, but previously it is not configurable, and in the reference quantization flow
with decomposed Tensor, we are making this explicit

Test Plan:
tested in future PR

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89267
Approved by: https://github.com/vkuzo
---
 torch/ao/quantization/observer.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py
index 3156b4245a12f..42962fe7c29a0 100644
--- a/torch/ao/quantization/observer.py
+++ b/torch/ao/quantization/observer.py
@@ -1316,6 +1316,8 @@ class PlaceholderObserver(ObserverBase):
     Args:
         dtype: dtype argument to the `quantize` node needed to implement the
                reference model spec.
+        quant_min: minimum value in quantized domain (TODO: align behavior with other observers)
+        quant_min: maximum value in quantized domain
         custom_op_name: (temporary) specify this observer for an operator that doesn't require any observation
                         (Can be used in Graph Mode Passes for special case ops).
         compute_dtype: if set, marks the future quantize function to use
@@ -1325,12 +1327,15 @@ class PlaceholderObserver(ObserverBase):
     """
 
     def __init__(
-        self, dtype=torch.float32, custom_op_name="", compute_dtype=None
+        self, dtype=torch.float32, custom_op_name="", compute_dtype=None,
+        quant_min=None, quant_max=None,
     ) -> None:
-        super(PlaceholderObserver, self).__init__(dtype=dtype)
+        super().__init__(dtype=dtype)
         # dtype of input of the target operator, e.g. for dynamic quantization
         # ops, the dtype will be float32
         self.dtype = dtype
+        self.quant_min = quant_min
+        self.quant_max = quant_max
         self.custom_op = custom_op_name
         # used for configuration of computation type for dynamic quantization
         # TODO(future PR): replace this with `is_dynamic`
@@ -1551,7 +1556,7 @@ def load_observer_state_dict(mod, obs_dict):
 """
 
 default_dynamic_quant_observer = PlaceholderObserver.with_args(
-    dtype=torch.quint8, compute_dtype=torch.quint8
+    dtype=torch.quint8, compute_dtype=torch.quint8, quant_min=0, quant_max=255
 )
 """
 Default observer for dynamic quantization.

From 52afb2a8f75b28877c8e5391d7ab8fbb5edb65ef Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Sat, 19 Nov 2022 02:56:14 +0000
Subject: [PATCH 1106/1922] [ONNX] Add setType from user into InferredType and
 Reliable in ConstantValueMap (#88622)

`setType` API is not respected in current exporter because the graph-level shape type inference simply overrides every NOT ONNX Op shape we had from node-level shape type inference. To address this issue, this PR (1) makes custom Op with `setType` **reliable** in ConstantValueMap to secure its shape/type information in pass:  _C._jit_pass_onnx. (2) If an invalid Op with shape/type in pass: _C._jit_pass_onnx_graph_shape_type_inference(graph-level), we recognize it as reliable.

1. In #62856, The refactor in onnx.cpp made regression on custom Op, as that was the step we should update custom Op shape/type information into ConstantValueMap for remaining Ops.

2. Add another condition besides IsValidONNXNode for custom Op setType in shape_type_inference.cpp. If all the node output has shape (not all dynamic), we say it's custom set type.

3. ~However, this PR won't solve the [issue](https://github.com/pytorch/pytorch/issues/87738#issuecomment-1292831219) that in the node-level shape type inference, exporter invokes the warning in terms of the unknow custom Op, since we process its symbolic_fn after this warning, but it would have shape/type if setType is used correctly. And that will be left for another issue to solve. #84661~ Add `no_type_warning` in UpdateReliable() and it only warns if non ONNX node with no given type appears.

Fixes #81693
Fixes #87738

NOTE: not confident of this not breaking anything. Please share your thoughts if there is a robust test on your mind.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88622
Approved by: https://github.com/BowenBao
---
 test/onnx/internal/test_diagnostics.py        |   5 +-
 .../onnx/test_pytorch_onnx_shape_inference.py | 171 +++++++++++++++++-
 torch/csrc/jit/passes/onnx.cpp                |  17 +-
 .../jit/passes/onnx/shape_type_inference.cpp  |  85 ++++++---
 .../jit/passes/onnx/shape_type_inference.h    |   7 +-
 5 files changed, 250 insertions(+), 35 deletions(-)

diff --git a/test/onnx/internal/test_diagnostics.py b/test/onnx/internal/test_diagnostics.py
index 884b7cb1c3880..49402204e9d27 100644
--- a/test/onnx/internal/test_diagnostics.py
+++ b/test/onnx/internal/test_diagnostics.py
@@ -215,10 +215,11 @@ def test_diagnostics_records_cpp_call_stack(self):
         assert stack is not None  # for mypy
         self.assertGreater(len(stack.frames), 0)
         frame_messages = [frame.location.message for frame in stack.frames]
+        # node missing onnx shape inference warning only comes from ToONNX (_jit_pass_onnx)
+        # after node-level shape type inference and processed symbolic_fn output type
         self.assertTrue(
             any(
-                isinstance(message, str)
-                and "torch::jit::ONNXShapeTypeInference" in message
+                isinstance(message, str) and "torch::jit::NodeToONNX" in message
                 for message in frame_messages
             )
         )
diff --git a/test/onnx/test_pytorch_onnx_shape_inference.py b/test/onnx/test_pytorch_onnx_shape_inference.py
index cf9ef2fd893e7..915677279d017 100644
--- a/test/onnx/test_pytorch_onnx_shape_inference.py
+++ b/test/onnx/test_pytorch_onnx_shape_inference.py
@@ -1,8 +1,10 @@
 # Owner(s): ["module: onnx"]
 
+import io
+
 import numpy as np
+import onnx
 import pytorch_test_common
-
 import torch
 from pytorch_test_common import skipIfUnsupportedMinOpsetVersion
 from torch.onnx import _constants, symbolic_helper
@@ -284,5 +286,172 @@ def test_reduce_prod_without_axes(self):
         self.run_test(g, reduce_prod.node(), expect_tensor("Long", shape=(1,)))
 
 
+class TestONNXCustomOpShapeInference(pytorch_test_common.ExportTestCase):
+    def setUp(self):
+        super().setUp()
+        self.opset_version = _constants.ONNX_MAX_OPSET
+
+    def test_setType_maintains_output_shape_for_single_custom_op(self):
+
+        self.addCleanup(torch.onnx.unregister_custom_op_symbolic, "::linalg_inv", 9)
+
+        class CustomInverse(torch.nn.Module):
+            def forward(self, x):
+                return torch.inverse(x) + x
+
+        def linalg_inv_settype(g, self):
+            return g.op("com.microsoft::Inverse", self).setType(self.type())
+
+        torch.onnx.register_custom_op_symbolic("::linalg_inv", linalg_inv_settype, 9)
+        model = CustomInverse()
+        x = torch.randn(2, 3, 3)
+        f = io.BytesIO()
+        torch.onnx.export(
+            model,
+            (x,),
+            f,
+            opset_version=self.opset_version,
+            custom_opsets={"com.microsoft": 1},
+        )
+
+        model_proto = onnx.load(io.BytesIO(f.getvalue()))
+        model_value_info = model_proto.graph.value_info
+        self.assertIsNotNone(model_value_info)
+        assert model_value_info
+        dims = model_value_info[0].type.tensor_type.shape.dim
+        for i in range(len(dims)):
+            # If node output has shape info, it should have dim_value
+            # Otherwise, it has dim_params with dynamic shape
+            self.assertTrue(dims[i].HasField("dim_value"))
+        for dim, rank in zip(dims, x.size()):
+            self.assertEqual(dim.dim_value, rank)
+
+    def test_no_setType_for_single_custom_op(self):
+
+        self.addCleanup(torch.onnx.unregister_custom_op_symbolic, "::linalg_inv", 9)
+
+        class CustomInverse(torch.nn.Module):
+            def forward(self, x):
+                return torch.inverse(x) + x
+
+        def linalg_inv_no_settype(g, self):
+            return g.op("com.microsoft::Inverse", self)
+
+        torch.onnx.register_custom_op_symbolic("::linalg_inv", linalg_inv_no_settype, 9)
+        model = CustomInverse()
+        x = torch.randn(2, 3, 3)
+        f = io.BytesIO()
+        torch.onnx.export(
+            model,
+            (x,),
+            f,
+            opset_version=self.opset_version,
+            custom_opsets={"com.microsoft": 1},
+        )
+
+        model_proto = onnx.load(io.BytesIO(f.getvalue()))
+        model_value_info = model_proto.graph.value_info
+        self.assertIsNotNone(model_value_info)
+        assert model_value_info
+        dims = model_value_info[0].type.tensor_type.shape.dim
+        for i in range(len(dims)):
+            # If node output has shape info, it should have dim_value
+            # Otherwise, it has dim_params with dynamic shape
+            self.assertTrue(dims[i].HasField("dim_param"))
+
+    def test_setType_maintains_output_shape_for_single_custom_op_with_dynamic_axes(
+        self,
+    ):
+
+        self.addCleanup(torch.onnx.unregister_custom_op_symbolic, "::linalg_inv", 9)
+
+        class CustomInverse(torch.nn.Module):
+            def forward(self, x):
+                return torch.inverse(x) + x
+
+        def linalg_inv_settype(g, self):
+            return g.op("com.microsoft::Inverse", self).setType(
+                self.type().with_dtype(torch.float).with_sizes([None, 3, 3])
+            )
+
+        torch.onnx.register_custom_op_symbolic("::linalg_inv", linalg_inv_settype, 9)
+        model = CustomInverse()
+        x = torch.randn(2, 3, 3)
+        f = io.BytesIO()
+        torch.onnx.export(
+            model,
+            (x,),
+            f,
+            opset_version=self.opset_version,
+            custom_opsets={"com.microsoft": 1},
+            input_names=["x"],
+            dynamic_axes={"x": {0: "batch"}},
+        )
+
+        model_proto = onnx.load(io.BytesIO(f.getvalue()))
+        model_value_info = model_proto.graph.value_info
+        self.assertIsNotNone(model_value_info)
+        assert model_value_info
+        dims = model_value_info[0].type.tensor_type.shape.dim
+        # The first axe should be dynamic as we defined when exporting
+        self.assertTrue(dims[0].HasField("dim_param"))
+        for i in range(1, len(dims)):
+            # If node output has shape info, it should have dim_value
+            # Otherwise, it has dim_params with dynamic shape
+            self.assertTrue(dims[i].HasField("dim_value"))
+            self.assertEqual(dims[i].dim_value, x.size()[i])
+
+    def test_setType_maintains_output_shape_for_single_custom_op_with_onnx_ops(self):
+
+        self.addCleanup(torch.onnx.unregister_custom_op_symbolic, "::linalg_inv", 9)
+
+        class CustomInverse(torch.nn.Module):
+            def forward(self, x, y, z):
+                x = torch.inverse(x)
+                return x + y + z
+
+        def linalg_inv_settype(g, self):
+            return g.op("com.microsoft::Inverse", self).setType(
+                self.type().with_dtype(torch.float).with_sizes([2, 3, 10, 10])
+            )
+
+        torch.onnx.register_custom_op_symbolic("::linalg_inv", linalg_inv_settype, 9)
+        model = CustomInverse()
+        x = torch.randn(2, 3, 10, 10)
+        y = torch.randn(2, 3, 10, 10)
+        z = torch.randn(2, 3, 10, 10)
+        f = io.BytesIO()
+        torch.onnx.export(
+            model,
+            (x, y, z),
+            f,
+            opset_version=self.opset_version,
+            custom_opsets={"com.microsoft": 1},
+        )
+
+        model_proto = onnx.load(io.BytesIO(f.getvalue()))
+        # To validate the shape of inverse Op, we need to find inverse output name,
+        # and then use it to identify its value_info for the shape.
+        output_name = ""
+        for node in model_proto.graph.node:
+            if node.op_type == "Inverse":
+                output_name = node.output[0]
+                break
+        assert output_name
+        model_value_info = model_proto.graph.value_info
+        self.assertIsNotNone(model_value_info)
+        assert model_value_info
+        for value_info in model_value_info:
+            assert value_info.name
+            if value_info.name == output_name:
+                dims = value_info.type.tensor_type.shape.dim
+                for i in range(len(dims)):
+                    # If node output has shape info, it should have dim_value
+                    # Otherwise, it has dim_params with dynamic shape
+                    self.assertTrue(dims[i].HasField("dim_value"))
+                for dim, rank in zip(dims, x.size()):
+                    self.assertEqual(dim.dim_value, rank)
+
+
 if __name__ == "__main__":
     common_utils.run_tests()
diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp
index 607f2ce61ada4..75e2d754aa503 100644
--- a/torch/csrc/jit/passes/onnx.cpp
+++ b/torch/csrc/jit/passes/onnx.cpp
@@ -14,7 +14,6 @@
 #include <torch/csrc/utils/pybind.h>
 #include <sstream>
 #include <unordered_map>
-
 namespace torch {
 namespace jit {
 
@@ -326,10 +325,20 @@ void NodeToONNX(
           ONNXShapeTypeInference(const_node, empty_params_dict, opset_version);
           env[old] = const_node->output();
         } else {
-          // ConstantValueMap has been set in shape inference,
-          // set_constant_value_map = false here to avoid redundancy.
+          // An update in ConstantValueMap is also needed here, since
+          // the user setType can be only accessed in this step, and it
+          // should be reliable.
           MergeInferredTypeAndSetMap(
-              outputs[i], old->type(), outputs[i]->type(), false);
+              outputs[i], old->type(), outputs[i]->type());
+          // non ONNX node with no type given will throw out the warnings here.
+          UpdateReliable(
+              outputs[i],
+              AreInputsReliableOrStatic(outputs[i]->node()),
+              /*no_type_warning=*/true);
+          // For the node type that does not have ComputeConstant logic, it may
+          // have reliable shape but its shape is not in ConstantValueMap. So we
+          // need to update ConstantValueMap.
+          UpdateShapeConstantIfReliable(outputs[i]);
 
           // Copy over source location and scope information to all nodes
           // created by the symbolic
diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
index 8baa439bdb58a..a9087508e6ad2 100644
--- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
+++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp
@@ -76,16 +76,13 @@ std::pair<TypePtr, bool> MergeInferredType(
 void MergeInferredTypeAndSetMap(
     Value* dest_v,
     TypePtr existing_type,
-    TypePtr inferred_type,
-    bool set_constant_value_map) {
+    TypePtr inferred_type) {
   TypePtr mergedType;
   bool inferred;
   std::tie(mergedType, inferred) =
       MergeInferredType(existing_type, inferred_type);
   dest_v->setType(mergedType);
-  if (set_constant_value_map) {
-    ConstantValueMap::SetUseInferredType(dest_v->debugName(), inferred);
-  }
+  ConstantValueMap::SetUseInferredType(dest_v->debugName(), inferred);
 }
 
 namespace {
@@ -232,6 +229,28 @@ bool IsValidONNXNode(const Node* n) {
   return true;
 }
 
+bool CustomSettype(Node* node) {
+  // This is a helper function to decide if the non-ONNX node actually has
+  // custom setType from user
+  // Go through every symbolic_sizes and if any one of them is static, we say
+  // this is set by user. On the other hand, if all of them are * (dynamic), we
+  // take this node does not have given type, since unreliable nodes have *
+  // shape anyway.
+  auto all_output_has_type = [](Value* output) {
+    if (auto output_type = output->type()->cast<TensorType>()) {
+      if (auto sizes = output_type->symbolic_sizes().sizes()) {
+        return std::any_of(std::begin(*sizes), std::end(*sizes), [](auto size) {
+          return size.is_static();
+        });
+      }
+    }
+    return false;
+  };
+
+  return std::all_of(
+      node->outputs().begin(), node->outputs().end(), all_output_has_type);
+}
+
 Value* CloneValueFromListConstruct(
     Value* v,
     std::shared_ptr<Graph> n_graph,
@@ -1879,7 +1898,8 @@ static std::unordered_set<std::string> nodeTypeReliableForTracer = {
 
 void UpdateReliable(
     torch::jit::Value* output,
-    const std::pair<bool, bool>& inferred_type_reliable) {
+    const std::pair<bool, bool>& inferred_type_reliable,
+    bool no_type_warning) {
   auto inferred =
       ConstantValueMap::GetUseInferredType(output->debugName()).value_or(false);
   auto isTypeReliableForTracer =
@@ -1887,7 +1907,9 @@ void UpdateReliable(
           output->node()->kind().toDisplayString()) !=
       nodeTypeReliableForTracer.end();
   if (!inferred && !isTypeReliableForTracer &&
-      !output->node()->kind().is_onnx()) {
+      !output->node()->kind().is_onnx() && no_type_warning) {
+    // TODO(84661): This warning comes before setType in symbolic_fn.
+    // tracked in #84661
     TORCH_WARN(
         "The shape inference of ",
         output->node()->kind().toDisplayString(),
@@ -1949,6 +1971,7 @@ void ONNXShapeTypeInference(
   SetGraphInputTypeReliable(n->owningGraph());
   GRAPH_UPDATE(
       "Running ONNX shape inference for node: ", n->kind().toDisplayString());
+
   if (IsValidONNXNode(n)) {
     // Create a Graph containing only the single node n.
     // This graph is later converted to ONNX to run shape inference.
@@ -2041,6 +2064,15 @@ void ONNXShapeTypeInference(
       GRAPH_DEBUG(
           "ONNX graph after shape inference: ", prettyPrint(*model_proto));
     }
+  } else if (CustomSettype(n)) {
+    // If the node is not ONNX standard, go through every output to check if
+    // they all have shape. If they all do, this should be reliable even if the
+    // Op is not from ONNX.
+    for (auto node_output : n->outputs()) {
+      // Custom setType output should get in here if it's set correctly. They
+      // will be updated to inferred for later updatereliable function.
+      ConstantValueMap::SetUseInferredType(node_output->debugName(), true);
+    }
   }
 
   SpecialPostProcess(n);
@@ -2082,20 +2114,7 @@ void ONNXShapeTypeInference(
   // reliable shape but its shape is not in ConstantValueMap. So we need this
   // logic to update ConstantValueMap.
   for (auto node_output : n->outputs()) {
-    if (ConstantValueMap::HasTypeReliable(node_output->debugName())) {
-      auto reliable =
-          ConstantValueMap::GetTypeReliable(node_output->debugName())
-              .value_or(false);
-      if (reliable && !ConstantValueMap::HasShape(node_output->debugName())) {
-        // TODO: ListType case
-        if (auto output_tensor_type = node_output->type()->cast<TensorType>()) {
-          if (output_tensor_type->dim()) {
-            auto symbolic_sizes = output_tensor_type->symbolic_sizes();
-            UpdateShapeConstantValueMap(node_output, symbolic_sizes);
-          }
-        }
-      }
-    }
+    UpdateShapeConstantIfReliable(node_output);
   }
 
   GRAPH_DEBUG(
@@ -2280,10 +2299,10 @@ size_t ONNXAssignOutputShape(
     // Tracing:
     //    Ignore None, since it is not captured in IR graph as output.
     // Scripting:
-    //    Ignore None, if observing a fixed `None` node in IR graph. Because it
-    //    is meaningless to include it as graph output as it carries no
-    //    data/information. Plus that static `None` is not supported in ONNX IR.
-    //    Otherwise, the output should have type `Optional`, and should be
+    //    Ignore None, if observing a fixed `None` node in IR graph. Because
+    //    it is meaningless to include it as graph output as it carries no
+    //    data/information. Plus that static `None` is not supported in ONNX
+    //    IR. Otherwise, the output should have type `Optional`, and should be
     //    converted to ONNX `Optional`.
 
     // More context:
@@ -2343,5 +2362,21 @@ void ONNXShapeTypeInference(
   ConstantValueMap::ClearMaps();
 }
 
+void UpdateShapeConstantIfReliable(torch::jit::Value* node_output) {
+  if (ConstantValueMap::HasTypeReliable(node_output->debugName())) {
+    auto reliable = ConstantValueMap::GetTypeReliable(node_output->debugName())
+                        .value_or(false);
+    if (reliable && !ConstantValueMap::HasShape(node_output->debugName())) {
+      // TODO: ListType case
+      if (auto output_tensor_type = node_output->type()->cast<TensorType>()) {
+        if (output_tensor_type->dim()) {
+          auto symbolic_sizes = output_tensor_type->symbolic_sizes();
+          UpdateShapeConstantValueMap(node_output, symbolic_sizes);
+        }
+      }
+    }
+  }
+}
+
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.h b/torch/csrc/jit/passes/onnx/shape_type_inference.h
index afda5b1765377..39350ed273d48 100644
--- a/torch/csrc/jit/passes/onnx/shape_type_inference.h
+++ b/torch/csrc/jit/passes/onnx/shape_type_inference.h
@@ -34,8 +34,7 @@ std::pair<TypePtr, bool> MergeInferredType(
 void MergeInferredTypeAndSetMap(
     Value* dest_v,
     TypePtr existing_type,
-    TypePtr inferred_type,
-    bool set_constant_value_map = true);
+    TypePtr inferred_type);
 
 // Update graph input types with dynamic axes info.
 // Axes that are marked as dynamic will be assigned as dynamic ShapeSymbol.
@@ -80,9 +79,11 @@ TORCH_API void ONNXShapeTypeInference(
 std::pair<bool, bool> AreInputsReliableOrStatic(Node* n);
 void UpdateReliable(
     torch::jit::Value* output,
-    const std::pair<bool, bool>& input_reliable);
+    const std::pair<bool, bool>& input_reliable,
+    bool no_type_warning = false);
 
 void UpdateReliable(torch::jit::Node* n);
+void UpdateShapeConstantIfReliable(torch::jit::Value* output);
 
 } // namespace jit
 } // namespace torch

From fcb8436d590cf8ce774abe4ae7be25b957abc978 Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Sat, 19 Nov 2022 17:49:39 +0000
Subject: [PATCH 1107/1922] RM expectedFailure
 UnspecReproTests.test_batch_norm_act_unspec (#89340)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89340
Approved by: https://github.com/bertmaher
---
 test/dynamo/test_unspec.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py
index e46d79208de02..fd5396981b740 100644
--- a/test/dynamo/test_unspec.py
+++ b/test/dynamo/test_unspec.py
@@ -50,8 +50,6 @@ class UnspecTest(cls):
 UnspecReproTests = make_unspec_cls(test_repros.ReproTests)
 UnspecNNModuleTests = make_unspec_cls(test_modules.NNModuleTests)
 
-unittest.expectedFailure(UnspecReproTests.test_batch_norm_act_unspec)
-
 
 @patch.object(torch._dynamo.config, "specialize_int_float", False)
 class UnspecTests(torch._dynamo.test_case.TestCase):

From 969c77422dea7dbecb879e9e98e908217936b13e Mon Sep 17 00:00:00 2001
From: fduwjj <fduwjj@fb.com>
Date: Sat, 19 Nov 2022 18:01:25 +0000
Subject: [PATCH 1108/1922] [PT-D][1/N] Sync TP Beta change to prod (#89242)

This is part of TP Beta Release efforts.

ref: https://github.com/pytorch/tau/issues/576

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89242
Approved by: https://github.com/wanchaol
---
 .../_tensor/parallel/test_parallelize_api.py  | 136 ++++++++++++
 .../_tensor/parallel/test_tp_examples.py      |  78 +------
 .../_tensor/parallel/test_tp_style.py         | 192 +++++++++++++++++
 .../distributed/_tensor/parallel/__init__.py  |  12 ++
 torch/distributed/_tensor/parallel/api.py     | 112 ++++++++++
 torch/distributed/_tensor/parallel/style.py   | 197 ++++++++++++++++++
 torch/distributed/_tensor/parallel/utils.py   | 149 +++++++++++++
 7 files changed, 805 insertions(+), 71 deletions(-)
 create mode 100644 test/distributed/_tensor/parallel/test_parallelize_api.py
 create mode 100644 test/distributed/_tensor/parallel/test_tp_style.py
 create mode 100644 torch/distributed/_tensor/parallel/style.py
 create mode 100644 torch/distributed/_tensor/parallel/utils.py

diff --git a/test/distributed/_tensor/parallel/test_parallelize_api.py b/test/distributed/_tensor/parallel/test_parallelize_api.py
new file mode 100644
index 0000000000000..fb3e8f4721c86
--- /dev/null
+++ b/test/distributed/_tensor/parallel/test_parallelize_api.py
@@ -0,0 +1,136 @@
+# Owner(s): ["oncall: distributed"]
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase, with_comms
+from torch.distributed._tensor import distribute_tensor, DeviceMesh, Shard, Replicate
+from torch.distributed._tensor.parallel import PairwiseParallel, ParallelStyle
+from torch.distributed._tensor.parallel.api import _parallelize_mlp
+from torch.distributed._tensor.parallel.utils import _create_1d_device_mesh
+from torch.distributed._tensor.parallel.style import (
+    make_input_replicate_1d,
+    make_output_replicate_1d,
+)
+
+
+class MLPModule(torch.nn.Module):
+    def __init__(self, device):
+        super(MLPModule, self).__init__()
+        torch.manual_seed(5)
+        self.net1 = torch.nn.Linear(10, 16, device=device)
+        self.relu = torch.nn.ReLU()
+        self.net2 = torch.nn.Linear(16, 12, device=device)
+
+    def forward(self, x):
+        return self.net2(self.relu(self.net1(x)))
+
+
+class TensorParallelAPITests(DTensorTestBase):
+    @property
+    def world_size(self):
+        gpu_num = torch.cuda.device_count()
+        return gpu_num if gpu_num % 2 == 0 and gpu_num > 4 else 4
+
+    @with_comms
+    def test_creat_1d_device_mesh(self):
+        dim_one_size = 2
+        mesh_shape = (
+            torch.arange(self.world_size)
+            .reshape(
+                self.world_size // dim_one_size,
+                dim_one_size,
+            )
+            .to(torch.int)
+        )
+        mesh = DeviceMesh(self.device_type, mesh_shape)
+        # When 1D dim is 1.
+        one_dimention_mesh_shape = mesh_shape[self.rank // dim_one_size, :]
+        pg = mesh.get_dim_groups()[1]
+        new_mesh = _create_1d_device_mesh(mesh, 1)
+        expected_mesh = DeviceMesh(
+            self.device_type, one_dimention_mesh_shape, [pg]
+        )
+        self.assertEqual(new_mesh.mesh, expected_mesh.mesh)
+        self.assertEqual(new_mesh.device_type, expected_mesh.device_type)
+        # When 1D dim is 0.
+        one_dimention_mesh_shape = mesh_shape[:, self.rank % dim_one_size]
+        pg = mesh.get_dim_groups()[0]
+        new_mesh = _create_1d_device_mesh(mesh, 0)
+        expected_mesh = DeviceMesh(
+            self.device_type, one_dimention_mesh_shape, [pg]
+        )
+        self.assertEqual(new_mesh.mesh, expected_mesh.mesh)
+        self.assertEqual(new_mesh.device_type, expected_mesh.device_type)
+
+    @with_comms
+    def test_creat_1d_device_mesh_error(self):
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        with self.assertRaisesRegex(
+            AssertionError,
+            "Expect tp_mesh_dim within range \\[-1, 1\\), but found 3.",
+        ):
+            _create_1d_device_mesh(mesh, 3)
+
+    @with_comms
+    def test_parallelize_mlp(self):
+        model = MLPModule(self.device_type)
+        model_tp = MLPModule(self.device_type)
+
+        # Ensure model are initialized the same way.
+        self.assertEqual(model.net1.weight, model_tp.net1.weight)
+        self.assertEqual(model.net1.bias, model_tp.net1.bias)
+        self.assertEqual(model.net2.weight, model_tp.net2.weight)
+        self.assertEqual(model.net2.bias, model_tp.net2.bias)
+
+        # Parallelize module.
+        device_mesh = DeviceMesh(
+            self.device_type, torch.arange(self.world_size)
+        )
+        _parallelize_mlp(model_tp, device_mesh, PairwiseParallel())
+
+        # Ensure the parameter is properly distributed.
+        self.assertEqual(
+            distribute_tensor(model.net1.weight, device_mesh, [Shard(0)]),
+            model_tp.net1.weight,
+        )
+        self.assertEqual(
+            distribute_tensor(model.net1.bias, device_mesh, [Shard(0)]),
+            model_tp.net1.bias,
+        )
+        self.assertEqual(
+            distribute_tensor(model.net2.weight, device_mesh, [Shard(1)]),
+            model_tp.net2.weight,
+        )
+        self.assertEqual(
+            distribute_tensor(model.net2.bias, device_mesh, [Replicate()]),
+            model_tp.net2.bias,
+        )
+
+    @with_comms
+    def test_parallelize_mlp_error(self):
+        class DummyParallel(ParallelStyle):
+            def __init__(self) -> None:
+                super().__init__(
+                    make_input_replicate_1d, make_output_replicate_1d
+                )
+
+        model_tp = MLPModule(self.device_type)
+        device_mesh = DeviceMesh(
+            self.device_type, torch.arange(self.world_size)
+        )
+        with self.assertRaisesRegex(
+            NotImplementedError,
+            "Only support PairwiseParallel for MLP parallelization.",
+        ):
+            _parallelize_mlp(model_tp, device_mesh, DummyParallel())
+
+        with self.assertRaisesRegex(
+            RuntimeError, "We only support even number of Linear for MLP."
+        ):
+            _parallelize_mlp(
+                torch.nn.Linear(10, 5), device_mesh, PairwiseParallel()
+            )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/_tensor/parallel/test_tp_examples.py b/test/distributed/_tensor/parallel/test_tp_examples.py
index 582108ea7599a..696171e4ca883 100644
--- a/test/distributed/_tensor/parallel/test_tp_examples.py
+++ b/test/distributed/_tensor/parallel/test_tp_examples.py
@@ -11,11 +11,8 @@
     skip_unless_torch_gpu,
 )
 from torch.distributed._tensor import (
-    distribute_tensor,
     distribute_module,
     DeviceMesh,
-    DTensor,
-    Shard,
     Replicate,
 )
 from torch.distributed._tensor.parallel import (
@@ -24,6 +21,8 @@
     replicate_input,
     replicate_output,
 )
+from torch.distributed._tensor.parallel import PairwiseParallel
+from torch.distributed._tensor.parallel.api import _parallelize_mlp
 
 
 class MLPModule(torch.nn.Module):
@@ -38,73 +37,6 @@ def forward(self, x):
         return self.net2(self.relu(self.net1(x)))
 
 
-def _aggregate_local_tensor(module: torch.nn.Module) -> torch.nn.Module:
-    def hook_func(_module, _input, output):
-        if isinstance(output, DTensor):
-            replica_placement = [Replicate()] * device_mesh.ndim
-            return (
-                output.redistribute(output.device_mesh, replica_placement)
-                .contiguous()
-                .to_local()
-            )
-
-    module.register_forward_hook(hook_func)
-    return module
-
-
-def shard_mlp(m, device_type, tp_size):
-    start_idx = 0
-    device_mesh = DeviceMesh(
-        device_type,
-        list(range(start_idx, start_idx + tp_size)),
-    )
-    col_wise_sharding = [Shard(0)]
-    row_wise_sharding = [Shard(1)]
-    replicate = [Replicate()] * device_mesh.ndim
-
-    def shard_params(name, module, device_mesh):
-        if isinstance(module, nn.Linear):
-            if name == "net1":
-                sharded_weight = nn.Parameter(
-                    distribute_tensor(
-                        module.weight, device_mesh, col_wise_sharding
-                    )
-                )
-                sharded_bias = nn.Parameter(
-                    distribute_tensor(
-                        module.bias, device_mesh, col_wise_sharding
-                    )
-                )
-                module.register_parameter("weight", sharded_weight)
-                module.register_parameter("bias", sharded_bias)
-            elif name == "net2":
-                sharded_weight = nn.Parameter(
-                    distribute_tensor(
-                        module.weight, device_mesh, row_wise_sharding
-                    )
-                )
-                replicated_bias = nn.Parameter(
-                    distribute_tensor(module.bias, device_mesh, replicate)
-                )
-                module.register_parameter("weight", sharded_weight)
-                module.register_parameter("bias", replicated_bias)
-
-    def aggregate_output(outputs, device_mesh):
-        assert isinstance(outputs, DTensor)
-        return (
-            outputs.redistribute(device_mesh, replicate).contiguous().to_local()
-        )
-
-    dist_mod = distribute_module(
-        m,
-        device_mesh,
-        partition_fn=shard_params,
-        input_fn=replicate_input,
-        output_fn=aggregate_output,
-    )
-    return dist_mod
-
-
 class MultiheadAttnWrap(nn.Module):
     def __init__(self, embed_dim, num_heads, add_bias_kv=False, device=None):
         super().__init__()
@@ -134,7 +66,11 @@ def test_mlp_megatron_e2e(self):
 
         # Shard module and initialize optimizer.
         LR = 0.25
-        shard_mlp(model_tp, self.device_type, NUM_DEVICES)
+        device_mesh = DeviceMesh(
+            self.device_type,
+            torch.arange(0, NUM_DEVICES),
+        )
+        _parallelize_mlp(model_tp, device_mesh, PairwiseParallel())
         optim = torch.optim.SGD(model.parameters(), lr=LR)
         optim_tp = torch.optim.SGD(model_tp.parameters(), lr=LR)
 
diff --git a/test/distributed/_tensor/parallel/test_tp_style.py b/test/distributed/_tensor/parallel/test_tp_style.py
new file mode 100644
index 0000000000000..314fe470955b4
--- /dev/null
+++ b/test/distributed/_tensor/parallel/test_tp_style.py
@@ -0,0 +1,192 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# Owner(s): ["oncall: distributed"]
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase, with_comms
+from torch.distributed._tensor import distribute_tensor, DeviceMesh, Shard, Replicate
+from torch.distributed._tensor.parallel.style import (
+    RowwiseParallel,
+    ColwiseParallel,
+    make_input_shard_1d,
+    make_input_replicate_1d,
+    make_output_shard_1d,
+    make_output_replicate_1d,
+    make_output_tensor,
+)
+
+
+class TensorParallelStyleTest(DTensorTestBase):
+    @property
+    def world_size(self):
+        gpu_num = torch.cuda.device_count()
+        return gpu_num if gpu_num % 2 == 0 and gpu_num > 4 else 4
+
+    def _1d_input_func_check(
+        self, input_local_tensor, expected_local_tensor, func
+    ) -> None:
+        with self.assertRaisesRegex(
+            RuntimeError, "device_mesh is not passed nor can be inferred"
+        ):
+            dtensor = func(input_local_tensor)
+        device_mesh = DeviceMesh(
+            self.device_type,
+            torch.arange(self.world_size).reshape(self.world_size // 2, 2),
+        )
+        with self.assertRaisesRegex(
+            RuntimeError,
+            "device_mesh has dims [0-9]+ but expcted to be 1 for input.",
+        ):
+            dtensor = func(input_local_tensor, device_mesh)
+
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        # test 1: replicate local tensor
+        dtensor = func(input_local_tensor, device_mesh)
+        self.assertEqual(expected_local_tensor, dtensor.to_local())
+        # test 2: replicate DTensor
+        dtensor = func(dtensor)
+        self.assertEqual(expected_local_tensor, dtensor.to_local())
+        # test 3: replicate DTensor with DeviceMesh passed
+        dtensor = func(dtensor, device_mesh)
+        self.assertEqual(expected_local_tensor, dtensor.to_local())
+
+    @with_comms
+    def test_make_input_replicate_1d(self):
+        tensor = torch.rand(8, 16, device=self.device_type)
+        self._1d_input_func_check(tensor, tensor, make_input_replicate_1d)
+
+    @with_comms
+    def test_make_input_shard_1d(self):
+        tensor = torch.rand(8, 16, device=self.device_type)
+        self._1d_input_func_check(tensor, tensor, make_input_shard_1d)
+
+    # Common logic for testing prepare output funcs
+    def _test_prepare_output(
+        self, func, spec, dim=None, device_mesh_input_none=False
+    ):
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        tensor = torch.rand(8, 16, device=self.device_type)
+        dtensor = distribute_tensor(tensor, device_mesh, spec)
+        device_mesh_input = None if device_mesh_input_none else device_mesh
+        if dim is not None:
+            output = func(dtensor, device_mesh_input, dim)
+        else:
+            output = func(dtensor, device_mesh_input)
+        return output, dtensor, device_mesh
+
+    @with_comms
+    def test_make_output_shard_1d(self):
+        # test when output is sharded.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_shard_1d, [Shard(0)], 1
+        )
+        self.assertEqual(output, dtensor.redistribute(device_mesh, [Shard(1)]))
+        #  test when output is replicated.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_shard_1d, [Replicate()], 0
+        )
+        self.assertEqual(output, dtensor.redistribute(device_mesh, [Shard(0)]))
+        # test when input device_mesh is None.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_shard_1d, [Shard(0)], 1, True
+        )
+        self.assertEqual(output, dtensor.redistribute(device_mesh, [Shard(1)]))
+
+    @with_comms
+    def test_make_output_replicate_1d(self):
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_replicate_1d, [Shard(0)]
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Replicate()])
+        )
+        # test when input device_mesh is None.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_replicate_1d, [Shard(0)], None, True
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Replicate()])
+        )
+
+    @with_comms
+    def test_make_output_tensor(self):
+        # test when output is sharded.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_tensor, [Shard(0)]
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Replicate()]).to_local()
+        )
+        #  test when output is replicated.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_tensor, [Replicate()]
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Replicate()]).to_local()
+        )
+        # test when input device_mesh is None.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            make_output_tensor, [Shard(0)], None, True
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Replicate()]).to_local()
+        )
+
+    # Common logic for testing prepare output funcs errors.
+    def _test_prepare_output_error(self, func):
+        tensor = torch.rand(8, 16, device=self.device_type)
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        dtensor = distribute_tensor(tensor, device_mesh, [Shard(0)])
+        output = [dtensor]
+        with self.assertRaisesRegex(
+            AssertionError,
+            f"Expect output of Tensor Parallel to be a DTensor, but found {type(output)}.",
+        ):
+            func(output, device_mesh)
+        device_mesh = DeviceMesh(
+            self.device_type,
+            torch.arange(self.world_size).reshape(self.world_size // 2, 2),
+        )
+        with self.assertRaisesRegex(
+            AssertionError,
+            "device_mesh has dims 2 but expcted to be 1 for output.",
+        ):
+            func(dtensor, device_mesh)
+
+    @with_comms
+    def test_prepare_output_error(self):
+        self._test_prepare_output_error(make_output_shard_1d)
+        self._test_prepare_output_error(make_output_replicate_1d)
+        self._test_prepare_output_error(make_output_tensor)
+
+    @with_comms
+    def test_rowwise_parallel_style(self):
+        tensor = torch.rand(8, 16, device=self.device_type)
+        rs = RowwiseParallel()
+        self._1d_input_func_check(tensor, tensor, rs._prepare_input)
+        # TODO: change output test
+        output, dtensor, device_mesh = self._test_prepare_output(
+            rs._prepare_input, [Shard(0)]
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Replicate()])
+        )
+        # test when input device_mesh is None.
+        output, dtensor, device_mesh = self._test_prepare_output(
+            rs._prepare_input, [Shard(0)], None, True
+        )
+        self.assertEqual(
+            output, dtensor.redistribute(device_mesh, [Replicate()])
+        )
+        self._test_prepare_output_error(rs._prepare_output)
+
+    @with_comms
+    def test_colwise_parallel_style(self):
+        tensor = torch.rand(8, 16, device=self.device_type)
+        cs = ColwiseParallel()
+        self._1d_input_func_check(tensor, tensor, cs._prepare_input)
+        self.assertEqual(None, cs._prepare_output)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/_tensor/parallel/__init__.py b/torch/distributed/_tensor/parallel/__init__.py
index 5725c5077d4bb..0ef0e8ff0b9ef 100644
--- a/torch/distributed/_tensor/parallel/__init__.py
+++ b/torch/distributed/_tensor/parallel/__init__.py
@@ -8,3 +8,15 @@
     replicate_input,
     replicate_output,
 )
+
+from torch.distributed._tensor.parallel.style import (
+    ParallelStyle,
+    PairwiseParallel,
+    RowwiseParallel,
+    ColwiseParallel,
+    make_input_shard_1d,
+    make_input_replicate_1d,
+    make_output_shard_1d,
+    make_output_replicate_1d,
+    make_output_tensor,
+)
diff --git a/torch/distributed/_tensor/parallel/api.py b/torch/distributed/_tensor/parallel/api.py
index 7ab3ad2199f29..68d444882c4c8 100644
--- a/torch/distributed/_tensor/parallel/api.py
+++ b/torch/distributed/_tensor/parallel/api.py
@@ -3,6 +3,7 @@
 import torch.nn as nn
 from typing import Sequence, Tuple
 from torch.distributed._tensor import (
+    distribute_module,
     distribute_tensor,
     DTensor,
     Shard,
@@ -11,6 +12,8 @@
     Placement,
 )
 from torch.distributed._tensor.parallel import TensorParallelMultiheadAttention
+from torch.distributed._tensor.parallel.style import ParallelStyle, PairwiseParallel
+from torch.distributed._tensor.parallel.utils import _create_1d_device_mesh
 
 
 def replicate_input(
@@ -84,3 +87,112 @@ def _shard_self_attn_params(name: str, module: nn.Module) -> None:
                 )
                 tp_multi_head_attention.copy(m)
                 module.register_module(n, tp_multi_head_attention)
+
+
+def _has_even_num_linears(module: nn.Module) -> bool:
+    """
+    We traverse through all the children of the given module and count the
+    number of Linear module. If the number is even, we return True.
+
+    Args:
+        module (nn.Module):
+            :class:``nn.Module`` object to be traversed and counted.
+
+    Return:
+        A boolean object which specifies whether the module contains
+        event-number of Linears in its children.
+
+    .. warning::
+        The traversal is not recursive for now.
+    """
+    linear_submodules = list(
+        filter(lambda x: isinstance(x, nn.Linear), module.children())
+    )
+    return len(linear_submodules) > 0 and len(linear_submodules) % 2 == 0
+
+
+def _parallelize_mlp(
+    module: nn.Module,
+    device_mesh: DeviceMesh,
+    parallel_style: ParallelStyle = PairwiseParallel(),
+    tp_mesh_dim: int = 0,
+) -> None:
+    """
+    This function assumes the input module is a sequence of nn.Linear
+    and we parallelize the module based on the given parallel style.
+    We don't change the FQN of each sub-module and replace each parameter
+    in place.
+
+    Args:
+        module (nn.Module):
+            :class:``nn.Module`` object to be parallelized.
+        device_mesh (DeviceMesh):
+            :class:``DeviceMesh`` object which describes the mesh topology
+            of devices for the DTensor.
+        parallel_style (ParallelStyle):
+            :class:``ParallelStyle`` object which contains how
+            we prepare input/output for Tensor Parallelism.
+        tp_mesh_dim (int):
+            the dimension of ``device_mesh`` where we perform
+            Tensor Parallelism on.
+
+    Return:
+        None
+
+    .. warning::
+        We only support ``PairwiseParallel`` right now.
+    """
+
+    # Define partition functions needed.
+    def _rowwise_parallelize_fn(name, module, device_mesh):  # pyre-ignore[2, 3]
+        for name, param in module.named_parameters():
+            dist_spec = (
+                [Shard(1)] if name == "weight" else [Replicate()]  # type: ignore[list-item]
+            )
+            dist_param = torch.nn.Parameter(
+                distribute_tensor(param, device_mesh, dist_spec)
+            )
+            module.register_parameter(name, dist_param)
+
+    def _colwise_parallelize_fn(name, module, device_mesh):  # pyre-ignore[2, 3]
+        for name, param in module.named_parameters():
+            dist_param = torch.nn.Parameter(
+                distribute_tensor(param, device_mesh, [Shard(0)])
+            )
+            module.register_parameter(name, dist_param)
+
+    if not isinstance(parallel_style, PairwiseParallel):
+        raise NotImplementedError(
+            "Only support PairwiseParallel for MLP parallelization."
+        )
+
+    if not _has_even_num_linears(module):
+        raise RuntimeError("We only support even number of Linear for MLP.")
+
+    if device_mesh.ndim > 1:
+        device_mesh = _create_1d_device_mesh(device_mesh, tp_mesh_dim)
+
+    linear_submodules = list(
+        filter(lambda x: isinstance(x, nn.Linear), module.children())
+    )
+    for i, m in enumerate(linear_submodules):
+        if i % 2 == 0:
+            # Col-wise Parallelize the linear layer
+            distribute_module(
+                m,
+                device_mesh,
+                _colwise_parallelize_fn,
+                input_fn=parallel_style._prepare_input  # type: ignore[arg-type, misc] # pyre-ignore[6]
+                if i == 0
+                else None,
+            )
+        else:
+            # Row-wise Parallelize the linear layer
+            distribute_module(
+                m,
+                device_mesh,
+                _rowwise_parallelize_fn,
+                output_fn=parallel_style._prepare_output  # type: ignore[arg-type, misc] # pyre-ignore[6]
+                if i == (len(linear_submodules) - 1)
+                else None,
+            )
diff --git a/torch/distributed/_tensor/parallel/style.py b/torch/distributed/_tensor/parallel/style.py
new file mode 100644
index 0000000000000..5ea96434118ab
--- /dev/null
+++ b/torch/distributed/_tensor/parallel/style.py
@@ -0,0 +1,197 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from abc import abstractmethod
+import torch
+from abc import ABC
+from typing import Union, Optional
+from torch.distributed._tensor import DTensor, Shard, Replicate, DeviceMesh
+from torch.distributed._tensor.parallel.utils import (
+    _Prepare_Input_Func_Type,
+    _Prepare_Output_Func_Type,
+    _prepare_input_validate,
+    _prepare_output_validate,
+)
+
+
+class ParallelStyle(ABC):
+    """
+    The parallel style user wants the module or submodule to be parallelized.
+    Users can extend this class to build their own parallel style with customized input/output preparations.
+    """
+
+    _prepare_input: _Prepare_Input_Func_Type
+    _prepare_output: _Prepare_Output_Func_Type
+
+    @abstractmethod
+    def __init__(self, _prepare_input, _prepare_output) -> None:
+        self._prepare_input = _prepare_input  # type: ignore[assignment, misc]
+        self._prepare_output = _prepare_output  # type: ignore[assignment, misc]
+
+
+class PairwiseParallel(ParallelStyle):
+    """
+    PairwiseParallel concatenate colwise and rowwise styles as a fixed
+    pair like what Megatron-LM(https://arxiv.org/abs/1909.08053) is doing.
+    We assume both input and output needs to a replicate DTensor.
+
+    .. warning::
+        PairwiseParallel only supports ``nn.Multihead Attention``,
+        ``nn.Transformer`` or even-number-layer MLP for now.
+    """
+
+    def __init__(self) -> None:
+        super().__init__(make_input_replicate_1d, make_output_tensor)
+
+
+class RowwiseParallel(ParallelStyle):
+    """
+    Partitioning the row of a module.
+    We assume the input to be a sharded :class:``DTensor`` and output to be a replicated :class:``DTensor``.
+    """
+
+    def __init__(self) -> None:
+        super().__init__(make_input_shard_1d, make_output_replicate_1d)
+
+
+class ColwiseParallel(ParallelStyle):
+    """
+    Partitioning the column of a tensor or module.
+    We assume the input to be a replicated :class:``DTensor`` and output to be a sharded :class:``DTensor``.
+    """
+
+    def __init__(self) -> None:
+        super().__init__(make_input_replicate_1d, None)
+
+
+@_prepare_input_validate  # type: ignore[arg-type] # pyre-ignore[56]
+def make_input_shard_1d(
+    input: Union[torch.Tensor, DTensor],
+    device_mesh: Optional[DeviceMesh] = None,
+    dim: int = 0,
+) -> DTensor:
+    """
+    Shard input tensor on ``dim`` over an 1-D device mesh. This function will be used in ParallelStyle.
+
+    Args:
+        input (Union[Tensor, DTensor]):
+            This single tensor will be sharded on dimension ``dim``
+            over the 1-D :class:``DeviceMesh``.
+        device_mesh (DeviceMesh, optional):
+            The 1-D device mesh where ``input`` will be sharded.
+            If no :class:``DeviceMesh`` is passed and ``input`` is a :class:``DTensor``,
+            `input.device_mesh` will be used.
+            If :class:``DeviceMesh`` is not 1-D, an exception will be thrown.
+            Default: ``None``
+        dim (int, optional): The sharding dimension of ``input`` tensor.
+            Default: 0
+
+    Returns:
+        A :class:``DTensor`` sharded on dimension ``dim`` over ``device_mesh``.
+    """
+    shard_spec = [Shard(dim)]
+    if isinstance(input, DTensor):
+        return input.redistribute(device_mesh, shard_spec)
+    elif isinstance(input, torch.Tensor):
+        return DTensor.from_local(
+            input, device_mesh, shard_spec, run_check=False
+        )
+    else:
+        raise RuntimeError(
+            f"Tensor parallel module expects torch.Tensor or DTensor input but received {type(input)}!"
+        )
+
+
+@_prepare_input_validate  # type: ignore[arg-type] # pyre-ignore[56]
+def make_input_replicate_1d(
+    input: Union[torch.Tensor, DTensor],
+    device_mesh: Optional[DeviceMesh] = None,
+) -> DTensor:
+    """
+    Replicate input tensor over an 1-D device mesh. This function will be used in ParallelStyle.
+
+    Args:
+        input (Union[Tensor, DTensor]):
+            This single tensor will be replicated over the 1-D :class:``DeviceMesh``.
+        device_mesh (DeviceMesh, optional):
+            The 1-D device mesh where ``input`` will be replicated.
+            If no :class:``DeviceMesh`` is passed and ``input`` is a :class:``DTensor``,
+            ``input.device_mesh`` will be used.
+            If :class:``DeviceMesh`` is not 1-D, an exception will be thrown.
+            Default: ``None``
+
+    Returns:
+        A :class:``DTensor`` replicated over ``device_mesh``.
+    """
+    replicate = [Replicate()]
+    if isinstance(input, DTensor):
+        return input.redistribute(device_mesh, replicate)
+    elif isinstance(input, torch.Tensor):
+        return DTensor.from_local(
+            input, device_mesh, replicate, run_check=False
+        )
+    else:
+        raise RuntimeError(
+            f"Tensor parallel module expects torch.Tensor or DTensor input but received {type(input)}!"
+        )
+
+
+@_prepare_output_validate  # type: ignore[arg-type] # pyre-ignore[56]
+def make_output_shard_1d(
+    output: DTensor, device_mesh: Optional[DeviceMesh] = None, dim: int = 0
+) -> DTensor:
+    """
+    Convert Output DTensor to a sharded DTensor. This will be used in ParallelStyle.
+    Args:
+        output (DTensor): output of module to be converted.
+        device_mesh (Optional[DeviceMesh]): :class:``DeviceMesh`` object needed to
+            shard the output and it needs to be a 1D ``device_mesh`` and we will throw
+            exceptions if a non-1D ``device_mesh`` is passed in. If no ``device_mesh``
+            is passed in, we will reuse the one from output.
+            Default: ``None``
+        dim (int): Sharding dim for output. Default: 0
+    Return:
+        A :class:``DTensor`` object sharded on the given dim.
+    """
+
+    return output.redistribute(device_mesh, [Shard(dim)])
+
+
+@_prepare_output_validate  # type: ignore[arg-type] # pyre-ignore[56]
+def make_output_replicate_1d(
+    output: DTensor, device_mesh: Optional[DeviceMesh] = None
+) -> DTensor:
+    """
+    Convert Output DTensor to a replicated DTensor. This will be used in ParallelStyle.
+    Args:
+        output (DTensor): output of module to be converted.
+        device_mesh (Optional[DeviceMesh]): :class:``DeviceMesh`` object needed to
+            replicate the output and it needs to be a 1D ``device_mesh`` and we will
+            throw exceptions if a non-1D ``device_mesh`` is passed in. If no
+            ``device_mesh`` is passed in, we will reuse the one from output.
+            Default: ``None``
+    Return:
+        A :class:``DTensor`` object made replicate.
+    """
+
+    return output.redistribute(device_mesh, [Replicate()])
+
+
+@_prepare_output_validate  # type: ignore[arg-type] # pyre-ignore[56]
+def make_output_tensor(
+    output: DTensor, device_mesh: Optional[DeviceMesh] = None
+) -> torch.Tensor:
+    """
+    Convert Output DTensor to a replicated DTensor first and then convert it to Tensor.
+    Args:
+        output (DTensor): output of module to be converted.
+        device_mesh (Optional[DeviceMesh]): :class:``DeviceMesh`` object needed to
+            replicate the output and it needs to be a 1D ``device_mesh`` and we will
+            throw exceptions if a non-1D ``device_mesh`` is passed in. If no
+            ``device_mesh`` is passed in, we will reuse the one from output.
+            Default: ``None``
+    Return:
+        A :class:``torch.Tensor`` object converted from output DTensor.
+    """
+
+    return make_output_replicate_1d(  # type: ignore[attr-defined]
+        output, device_mesh
+    ).to_local()  # type: ignore[call-arg]
diff --git a/torch/distributed/_tensor/parallel/utils.py b/torch/distributed/_tensor/parallel/utils.py
new file mode 100644
index 0000000000000..2680ae41ffbe7
--- /dev/null
+++ b/torch/distributed/_tensor/parallel/utils.py
@@ -0,0 +1,149 @@
+import functools
+
+import torch
+from torch.distributed._tensor import DeviceMesh, DTensor
+from typing import Callable, Optional, Union
+
+_Prepare_Input_Func_Type = Callable[
+    [Union[torch.Tensor, DTensor], Optional[DeviceMesh], Optional[int]], DTensor
+]
+
+_Prepare_Output_Func_Type = Callable[
+    [DTensor, Optional[DeviceMesh], Optional[int]], Union[torch.Tensor, DTensor]
+]
+
+
+def _prepare_input_validate(
+    _prepare_input_func: _Prepare_Input_Func_Type,
+) -> _Prepare_Input_Func_Type:
+    """
+    Inject common validation logics for `_prepare_input` funcs via this
+    decorator, including verifying that input needs to be either
+    a :class:`Tensor` or :class:`DTensor` and only 1D :class:`DeviceMesh`
+    is passed in.
+
+    Args:
+        _prepare_input_func (Callable): The func we want to inject the
+            validation into.
+
+    Returns:
+        func (Callable): Same input function with validation logic added.
+
+    Example::
+        >>> @_prepare_input_validate
+        >>> def make_input_shard_1d(args, kwargs):
+        >>>   ...
+        >>>
+        >>> input = torch.rand(...)
+        >>> dtensor = make_input_shard_1d(input, device_mesh, 1)
+        >>> # This will call '_prepare_input_validate' first
+    """
+
+    @functools.wraps(_prepare_input_func)
+    def wrapper(*args, **kwargs):  # pyre-ignore[2, 3]
+        assert len(args) >= 1, "_prepare_input needs at least one arg."
+        input = args[0]
+        if isinstance(input, list) or isinstance(input, tuple):
+            input = input[0]
+            args = (input, *args[1:])
+        device_mesh = None if len(args) < 2 else args[1]
+
+        if device_mesh is None:
+            if isinstance(input, DTensor):
+                device_mesh = input.device_mesh
+                args = (*args[:1], device_mesh, *args[2:])  # pyre-ignore[60]
+            else:
+                raise RuntimeError(
+                    "device_mesh is not passed nor can be inferred"
+                )
+        if device_mesh.ndim != 1:
+            raise RuntimeError(
+                f"device_mesh has dims {device_mesh.ndim} but expcted to be 1 for input."
+            )
+        return _prepare_input_func(*args, **kwargs)
+
+    return wrapper
+
+
+def _prepare_output_validate(
+    _prepare_output_func: _Prepare_Output_Func_Type,
+) -> _Prepare_Output_Func_Type:
+    """
+    Inject common validation logics for _prepare_output funcs via this
+    decorator, including verifying that output needs to be a DTensor
+    and only 1D Device Mesh is passed in.
+    Example::
+        >>> @_prepare_output_validate
+        >>> def make_output_shard_1d(args, kwargs):
+        >>>   ...
+        >>>
+        >>> dt = distribute(tensor, device_mesh, [Shard(0)])
+        >>> make_output_shard_1d(dt, device_mesh, 1)
+        >>> # This will call '_prepare_output_validate' first
+    Args:
+        _prepare_output_func (Callable): The func we want to inject the
+            validation into.
+    Return:
+        func (Callable): Same input func with validation logic added.
+    """
+
+    @functools.wraps(_prepare_output_func)
+    def wrapper(*args, **kwargs):  # pyre-ignore[2, 3]
+        assert len(args) >= 1, "_prepare_output needs at least one arg."
+        output = args[0]
+        assert isinstance(
+            output, DTensor
+        ), f"Expect output of Tensor Parallel to be a DTensor, but found {type(output)}."
+        if len(args) < 2 or args[1] is None:
+            device_mesh = output.device_mesh
+            args = (*args[:1], device_mesh, *args[2:])  # pyre-ignore[60]
+        else:
+            device_mesh = args[1]
+
+        assert (
+            device_mesh.ndim == 1
+        ), f"device_mesh has dims {device_mesh.ndim} but expcted to be 1 for output."
+        return _prepare_output_func(*args, **kwargs)
+
+    return wrapper
+
+
+def _create_1d_device_mesh(
+    device_mesh: DeviceMesh, tp_mesh_dim: int = 0
+) -> DeviceMesh:
+    """
+    This function converts a N-D ``device_mesh`` into a 1D ``device_mesh``
+    for 1D Tensor Parallelism.
+
+    Args:
+        device_mesh (DeviceMesh):
+            :class:``DeviceMesh`` object which describes the mesh topology
+            of devices for the DTensor.
+        tp_mesh_dim (int):
+            the dimension of ``device_mesh`` where we perform
+            Tensor Parallelism on.
+
+    Return:
+        device_mesh (DeviceMesh): 1-D :class:``DeviceMesh`` object that
+            Tensor Parallelism operates on.
+    """
+    assert (
+        tp_mesh_dim < device_mesh.ndim and tp_mesh_dim >= -device_mesh.ndim
+    ), (
+        f"Expect tp_mesh_dim within range [{-device_mesh.ndim}, {device_mesh.ndim})"
+        f", but found {tp_mesh_dim}."
+    )
+
+    if device_mesh.ndim == 1:
+        return device_mesh
+
+    # swap the current dim to the last dim then reshape to flatten out other
+    # dims, so we can just extract the list of ranks which contains cur_rank.
+    cur_rank = device_mesh.get_rank()
+    pg_ranks_by_dim = device_mesh.mesh.swapdims(-1, tp_mesh_dim).reshape(
+        -1, device_mesh.mesh.size(tp_mesh_dim)
+    )
+    dim_mesh_1d = pg_ranks_by_dim[torch.any(pg_ranks_by_dim == cur_rank, 1), :]
+
+    sub_pg = device_mesh.get_dim_groups()[tp_mesh_dim]
+    return DeviceMesh(device_mesh.device_type, dim_mesh_1d.squeeze(), [sub_pg])

From e3f98f11e036a22378c56bcc3a58f2608d359329 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 19 Nov 2022 12:51:53 -0500
Subject: [PATCH 1109/1922] Reland 2 "Towards unifying symbolic and non
 symbolic fake tensor (#89038) (#89143)" (#89346)

This reverts commit 8e4c9828f4c990f439179912159086aaed790493.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89346
Approved by: https://github.com/wconstab
---
 aten/src/ATen/native/TensorFactories.cpp |  6 ---
 test/functorch/test_aotdispatch.py       |  3 --
 test/test_proxy_tensor.py                | 19 +++-----
 torch/_meta_registrations.py             | 39 +++++++++++++++-
 torch/_ops.py                            |  1 +
 torch/_prims/__init__.py                 |  5 +-
 torch/_prims_common/__init__.py          |  3 ++
 torch/_subclasses/fake_tensor.py         | 58 +++++++++---------------
 8 files changed, 71 insertions(+), 63 deletions(-)

diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 9d1c6d8a36333..7245cb77b1c50 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -325,12 +325,6 @@ Tensor empty_like(
   // See [Note: hacky wrapper removal for TensorOptions]
   TensorOptions options_ = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory);
 
-
-  TORCH_CHECK(
-    !(options_.has_memory_format() && optional_memory_format.has_value()),
-    "Cannot set memory_format both in TensorOptions and explicit argument; please delete "
-    "the redundant setter.");
-
   TensorOptions options =
       self.options()
           .merge_in(options_)
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index de6d82960adc8..e03fe1e153851 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1011,10 +1011,8 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumsum', ''),  # aten.cumsum.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.zeros_like.default - couldn't find symbolic meta function/decomposition
     xfail('digamma', ''),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
-    xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
     xfail('dsplit', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('fft.fft2', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('fft.fft', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
@@ -1168,7 +1166,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('prod', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('put', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('qr', ''),  # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition
-    xfail('rad2deg', ''),  # aten.rad2deg.default - couldn't find symbolic meta function/decomposition
     xfail('renorm', ''),  # aten.renorm.default - couldn't find symbolic meta function/decomposition
     xfail('repeat_interleave', ''),  # aten.repeat_interleave.Te...
     xfail('reshape_as', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index e174a14837919..fa04c57d94260 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1164,9 +1164,7 @@ def f(a, b, c, d, e):
     xfail('cummin', ''),  # aten.cummin.default - couldn't find symbolic meta function/decomposition
     xfail('cumprod', ''),  # aten.cumprod.default - couldn't find symbolic meta function/decomposition
     xfail('cumulative_trapezoid', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
-    xfail('deg2rad', ''),  # aten.deg2rad.default - couldn't find symbolic meta function/decomposition
     xfail('diff', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
-    xfail('dist', ''),  # aten.dist.default - couldn't find symbolic meta function/decomposition
     xfail('dsplit', ''),  # aten.slice.Tensor - couldn't find symbolic meta function/decomposition
     xfail('fft.fft2', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('fft.fft', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
@@ -1248,8 +1246,6 @@ def f(a, b, c, d, e):
     xfail('lu', ''),  # aten.linalg_lu_factor_ex.default - couldn't find symbolic meta function/decomposition
     xfail('lu_solve', ''),  # aten.linalg_lu_solve.default - couldn't find symbolic meta function/decomposition
     xfail('lu_unpack', ''),  # aten.lu_unpack.default - couldn't find symbolic meta function/decomposition
-    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
-    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
     xfail('masked_select', ''),  # aten.masked_select.default - couldn't find symbolic meta function/decomposition
     xfail('matrix_exp', ''),  # aten.linalg_matrix_exp.default - couldn't find symbolic meta function/decomposition
     xfail('median', ''),  # Could not run 'aten::median' with arguments from the 'Meta' backend. This could be becau...
@@ -1291,7 +1287,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.pdist', ''),  # Could not run 'aten::_pdist_forward' with arguments from the 'Meta' backend...
     xfail('nn.functional.pixel_shuffle', ''),  # aten.pixel_shuffle.default - couldn't find symbolic meta function/decompos...
     xfail('nn.functional.pixel_unshuffle', ''),  # aten.pixel_unshuffle.default - couldn't find symbolic meta function/deco...
-    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.smooth_l1_loss', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.upsample_nearest', ''),  # aten.upsample_nearest1d.vec - couldn't find symbolic meta function/deco...
     xfail('nonzero', ''),  # aten.nonzero.default - couldn't find symbolic meta function/decomposition
@@ -1307,10 +1302,8 @@ def f(a, b, c, d, e):
     xfail('polygamma', 'polygamma_n_2'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_3'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
     xfail('polygamma', 'polygamma_n_4'),  # aten.polygamma.default - couldn't find symbolic meta function/decomposition
-    xfail('put', ''),  # aten.clone.default - couldn't find symbolic meta function/decomposition
     xfail('quantile', ''),  # Could not run 'aten::equal' with arguments from the 'Meta' backend.
     xfail('qr', ''),  # aten.linalg_qr.default - couldn't find symbolic meta function/decomposition
-    xfail('rad2deg', ''),  # aten.rad2deg.default - couldn't find symbolic meta function/decomposition
     xfail('renorm', ''),  # aten.renorm.default - couldn't find symbolic meta function/decomposition
     xfail('repeat_interleave', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('reshape_as', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
@@ -1356,14 +1349,18 @@ def f(a, b, c, d, e):
 
 symbolic_tensor_failures.update(symbolic_tensor_segfaults)
 
+outplace_symbolic_tensor_failures = {
+    xfail('masked_fill', ''),  # expected predicate to be bool, got torch.float32
+    xfail('masked_scatter', ''),  # aten.masked_scatter.default - couldn't find symbolic meta function/decomposition
+    xfail('nn.functional.rrelu', ''),  # aten.empty_like.default - couldn't find symbolic meta function/decomposition
+}
+
 inplace_symbolic_tensor_failures = {
     # bugs
     xfail('float_power', ''),  # base given to float_power_ has dtype Float but the operation's result requires dtype Double
     # decomp not implemented
-    xfail('addbmm', ''),
     xfail('addmm', ''),
     xfail('addmm', 'decomposed'),
-    xfail('logit', ''),
     xfail('nn.functional.hardsigmoid', ''),
     xfail('round', ''),  # ref missing a kwarg
     xfail('round', 'decimals_0'),  # ref missing a kwarg
@@ -1373,10 +1370,8 @@ def f(a, b, c, d, e):
     # in-place has a different signature than out-of-place
     xfail('uniform', ''),
     # Views
-    xfail('squeeze', ''),
     xfail('t', ''),
     xfail('transpose', ''),
-    xfail('nn.functional.dropout3d', ''),  # calls unsqueeze_
 }
 
 # Copies inputs to inplace operations to avoid inplace modifications
@@ -1452,7 +1447,7 @@ def test_make_fx_fake_exhaustive(self, device, dtype, op):
     @skipIfNoSympy
     @ops(op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive',
-             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures)
+             make_fx_failures | fake_tensor_failures | symbolic_tensor_failures | outplace_symbolic_tensor_failures)
     def test_make_fx_symbolic_exhaustive(self, device, dtype, op):
         _test_make_fx_helper(self, device, dtype, op, "symbolic")
 
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 4fa3ab09d2755..9849df0a58af5 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1513,7 +1513,6 @@ def full(size, fill_value, *args, **kwargs):
         aten.randn_like.default,
         aten.rand_like.default,
         aten.full_like.default,
-        aten.zeros_like.default,
         aten.ones_like.default,
     ]
 )
@@ -1521,6 +1520,44 @@ def meta_like(self, *args, **kwargs):
     return aten.empty_like.default(self, **kwargs)
 
 
+# zeros_like is special cased to work for sparse
+@register_meta(aten.zeros_like.default)
+def zeros_like(
+    self, dtype=None, layout=None, device=None, pin_memory=None, memory_format=None
+):
+    if layout == torch.sparse_coo:
+        check(
+            memory_format is None,
+            lambda: "memory format option is only supported by strided tensors",
+        )
+
+        res = torch.empty(
+            0,
+            dtype=self.dtype if dtype is None else dtype,
+            layout=layout,
+            device=self.device if device is None else device,
+            pin_memory=pin_memory,
+        )
+
+        if self.is_sparse:
+            res.sparse_resize_and_clear_(
+                self.size(), self.sparse_dim(), self.dense_dim()
+            )
+        else:
+            res.sparse_resize_and_clear_(self.size(), self.dim(), 0)
+
+        res._coalesced_(True)
+        return res
+    return aten.empty_like.default(
+        self,
+        dtype=dtype,
+        layout=layout,
+        device=device,
+        pin_memory=pin_memory,
+        memory_format=memory_format,
+    )
+
+
 # hacky: Please remove after math.ceil works with arange
 @register_meta(aten.arange.default)
 def arange(end, **kwargs):
diff --git a/torch/_ops.py b/torch/_ops.py
index 9163932144d0d..b20398a7f3ab3 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -365,6 +365,7 @@ def handler(*args, **kwargs):
             return handler
 
         final_key = resolve_key(self, key)
+        # print(self, key, final_key)
         r = self.py_kernels.get(final_key, final_key)
         self._dispatch_cache[key] = r
         return r
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index 22917ec048eb9..67e16ca102ac1 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1150,9 +1150,6 @@ def _minimum_aten(
 
 #
 # View operations
-#
-# TODO: model view relationships
-# TODO: model storage
 def _as_strided_meta(
     a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int
 ) -> TensorLikeType:
@@ -1170,7 +1167,7 @@ def _as_strided_meta(
             a._typed_storage(), size, stride, storage_offset
         )
 
-    return TensorMeta(a, shape=size, strides=stride)
+    return torch.as_strided(a, size, stride, storage_offset)
 
 
 def _as_strided_aten(
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 7752f18361411..6df72f6c158d4 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -291,6 +291,9 @@ def is_non_overlapping_and_dense(a: Tensor) -> bool:
     its dimensions that is contiguous.
     """
 
+    if a.is_sparse:
+        return False
+
     # Short-circuits if the tensor is already contiguous or channels-last contiguous
     if is_contiguous(a) or is_channels_last_contiguous(a):
         return True
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 5d3d3a0e32fe1..9a0ac050e6b94 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1,7 +1,6 @@
 import contextlib
 import functools
 import itertools
-import sys
 import weakref
 from dataclasses import dataclass
 from functools import partial
@@ -297,8 +296,9 @@ def constructors(fake_mode, func, *args, **kwargs):
     out_device = new_kwargs.pop("device", None)
     out_device = out_device if out_device is not None else default_device
     new_kwargs["device"] = torch.device("meta")
-    # Not in_kernel_invocation_manager as no fake tensor inputs
-    with no_dispatch():
+    # _like constructors have fake tensor inputs (maybe this causes the non-like
+    # to fail? hmmm)
+    with in_kernel_invocation_manager(fake_mode):
         r = func(*args, **new_kwargs)
     return FakeTensor(fake_mode, r, out_device)
 
@@ -821,40 +821,30 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # is written to must be invalidated
         self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs)
 
-        from torch._decomp import decomposition_table
-
-        with self:
-            # Decomposes CompositeImplicitAutograd ops
-            r = func.decompose(*args, **kwargs)
-            if r is not NotImplemented:
-                return r
+        # If there's a Python meta, prefer that over the decomposition
+        from torch._decomp import meta_table as meta_table
 
-        # IDK: feels bad man, sym_numel on as_strided infinite loops otherwise
-        if has_symbolic_sizes and not self.cpp_meta_supports_symint(func):
-            from torch._decomp import meta_table as meta_table
+        if func not in meta_table and not self.cpp_meta_supports_symint(func):
+            from torch._decomp import decomposition_table
 
-            if func == aten.size.default:
-                sys.stderr.write(
-                    "Trying to call aten.size on a tensor with symbolic shapes. "
-                    "It's likely that this is from calling tensor.shape in C++"
+            # Prefer Python decompositions over C++ ones
+            if func in decomposition_table and (
+                has_symbolic_sizes
+                or (
+                    # TODO: Remove these exclusions, so that we can remove
+                    # this leg entirely
+                    torch_decomp_decompositions(func)
+                    and all(not e.is_sparse for e in flat_arg_fake_tensors)
                 )
-                # We do this to allow for better error localization with `TORCH_SHOW_CPP_STACKTRACES=1`
-                return None
-
-            with self:
-                if func in meta_table:
-                    r = meta_table[func](*args, **kwargs)
-                    return r
-                if func in decomposition_table:
+            ):
+                with self:
                     return decomposition_table[func](*args, **kwargs)
 
-        if (
-            func in decomposition_table
-            and torch_decomp_decompositions(func)
-            and all(not e.is_sparse for e in flat_arg_fake_tensors)
-        ):
             with self:
-                return decomposition_table[func](*args, **kwargs)
+                # Decomposes CompositeImplicitAutograd ops
+                r = func.decompose(*args, **kwargs)
+                if r is not NotImplemented:
+                    return r
 
         # prims already wrap FakeTensor inputs to FakeTensor outputs
         # and do device logic, we dont need do anything but run them
@@ -865,12 +855,6 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
             with self:
                 return func.prim_meta_impl(*args, **kwargs)
 
-        if has_symbolic_sizes:
-            if not self.cpp_meta_supports_symint(func):
-                raise RuntimeError(
-                    f"{func} - couldn't find symbolic meta function/decomposition"
-                )
-
         # special handling for funcs registered through `register_op_impl`,
         # e.g., manipulating args on constructor calls to construct meta tensors
         # and then afterwards wrapping them to a FakeTensor

From c508018f339e96042f00e2e3463f1dfa1dd0d658 Mon Sep 17 00:00:00 2001
From: kvathupo <kalinda619@gmail.com>
Date: Sat, 19 Nov 2022 21:40:07 +0000
Subject: [PATCH 1110/1922] Add nullptr_t overload to c10::intrusive_ptr 
 (#89196)

__What?__

Fixes #82413
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89196
Approved by: https://github.com/ezyang
---
 c10/test/util/intrusive_ptr_test.cpp | 5 +++++
 c10/util/intrusive_ptr.h             | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/c10/test/util/intrusive_ptr_test.cpp b/c10/test/util/intrusive_ptr_test.cpp
index 7ed1c292841d5..632fe7fc2f202 100644
--- a/c10/test/util/intrusive_ptr_test.cpp
+++ b/c10/test/util/intrusive_ptr_test.cpp
@@ -146,6 +146,11 @@ TEST(IntrusivePtrTest, givenInvalidPtr_whenCallingGet_thenReturnsNullptr) {
   EXPECT_EQ(nullptr, obj.get());
 }
 
+TEST(IntrusivePtrTest, givenNullptr_whenCallingGet_thenReturnsNullptr) {
+  intrusive_ptr<SomeClass1Parameter> obj(nullptr);
+  EXPECT_EQ(nullptr, obj.get());
+}
+
 TEST(IntrusivePtrTest, givenValidPtr_whenDereferencing_thenReturnsObject) {
   intrusive_ptr<SomeClass1Parameter> obj =
       make_intrusive<SomeClass1Parameter>(5);
diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h
index c87305b08be57..e75c1980fdfa7 100644
--- a/c10/util/intrusive_ptr.h
+++ b/c10/util/intrusive_ptr.h
@@ -326,6 +326,9 @@ class intrusive_ptr final {
   intrusive_ptr() noexcept
       : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
 
+  intrusive_ptr(std::nullptr_t) noexcept
+      : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
+
   // This constructor will not increase the ref counter for you.
   // We use the tagged dispatch mechanism to explicitly mark this constructor
   // to not increase the refcount

From 0acfa84fb82a71343e19e248f5dfa83995c638ef Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sat, 19 Nov 2022 21:47:55 +0000
Subject: [PATCH 1111/1922] Revert "Symintify numel(), infer_size,
 prims.elementwise_meta (#88956)"

This reverts commit ce2f8700bafcf44850402a39188ec121ba8b5486.

Reverted https://github.com/pytorch/pytorch/pull/88956 on behalf of https://github.com/ezyang due to somehow breaks torch.numel
---
 aten/src/ATen/ExpandUtils.cpp                 | 10 ++++----
 aten/src/ATen/ExpandUtils.h                   |  2 --
 test/test_proxy_tensor.py                     | 25 +++----------------
 torch/_prims/__init__.py                      | 16 +++---------
 torch/_refs/__init__.py                       |  4 +--
 torch/_subclasses/fake_tensor.py              |  6 ++++-
 torch/csrc/autograd/input_metadata.h          |  4 +--
 .../python_torch_functions_manual.cpp         |  2 +-
 torch/fx/experimental/symbolic_shapes.py      |  3 +++
 torch/fx/traceback.py                         |  2 +-
 10 files changed, 26 insertions(+), 48 deletions(-)

diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp
index ee846c9b82e34..a44005a2ef815 100644
--- a/aten/src/ATen/ExpandUtils.cpp
+++ b/aten/src/ATen/ExpandUtils.cpp
@@ -13,8 +13,8 @@ TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size) {
 
 namespace {
 // NOTE: are_expandable did a similar check, please keep them sync if change is needed
-template <typename Container, typename ArrayType>
-Container infer_size_impl(ArrayType a, ArrayType b) {
+template <typename Container>
+Container infer_size_impl(IntArrayRef a, IntArrayRef b) {
   size_t dimsA = a.size();
   size_t dimsB = b.size();
   size_t ndim = dimsA > dimsB ? dimsA : dimsB;
@@ -25,8 +25,8 @@ Container infer_size_impl(ArrayType a, ArrayType b) {
     ptrdiff_t offset = ndim - 1 - i;
     ptrdiff_t dimA = dimsA - 1 - offset;
     ptrdiff_t dimB = dimsB - 1 - offset;
-    auto sizeA = (dimA >= 0) ? a[dimA] : 1;
-    auto sizeB = (dimB >= 0) ? b[dimB] : 1;
+    int64_t sizeA = (dimA >= 0) ? a[dimA] : 1;
+    int64_t sizeB = (dimB >= 0) ? b[dimB] : 1;
 
     TORCH_CHECK(
         sizeA == sizeB || sizeA == 1 || sizeB == 1,
@@ -35,7 +35,7 @@ Container infer_size_impl(ArrayType a, ArrayType b) {
         ") at non-singleton dimension ", i);
 
       // 1s map to the other size (even 0).
-      expandedSizes[i] = sizeA == 1 ? std::move(sizeB) : std::move(sizeA);
+      expandedSizes[i] = sizeA == 1 ? sizeB : sizeA;
   }
 
   return expandedSizes;
diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h
index 9e48421e540fe..786cbf132cd77 100644
--- a/aten/src/ATen/ExpandUtils.h
+++ b/aten/src/ATen/ExpandUtils.h
@@ -21,8 +21,6 @@ namespace at {
 
 TORCH_API std::vector<int64_t> infer_size(IntArrayRef a, IntArrayRef b);
 TORCH_API DimVector infer_size_dimvector(IntArrayRef a, IntArrayRef b);
-TORCH_API SymDimVector
-infer_size_symdimvector(SymIntArrayRef a, SymIntArrayRef b);
 
 // Named type instead of a pair/tuple so that we can be sure to
 // construct the vectors in place and get NRVO.
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index fa04c57d94260..34edc5cfac949 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -13,7 +13,6 @@
 from torch._subclasses.fake_tensor import DynamicOutputShapeException
 
 from torch._decomp import decomposition_table
-from torch.fx.experimental.symbolic_shapes import sym_float
 from torch.testing._internal.common_device_type import ops
 from torch._C import _disabled_torch_function_impl
 from torch.fx.experimental.proxy_tensor import make_fx, DecompositionInterpreter, get_isolated_graphmodule, has_proxy
@@ -733,6 +732,7 @@ def deco(cls):
 
 @skipIfNoSympy
 @xfail_inherited_tests([
+    "test_mode_tracing_factory_function",
     "test_make_fx_overloads",
     "test_trace_subclasses",
 ])
@@ -972,27 +972,8 @@ def f(x):
         # happened afterwards
         self.assertTrue(meta_inp.meta['val'].shape[0].get_pyobj().expr == 3)
 
-    def test_elementwise_meta_with_sym_numbers(self):
-        def f(x, offset, as_sym_float=False):
-            x0 = x.size()[0]
-            if as_sym_float:
-                x0 = sym_float(x0)
-            return torch.add(x0, offset)
-
-        fx_g = make_fx(f, tracing_mode="symbolic")(torch.rand(2, 3), 2.0, False)
-        meta_add = _get_node(fx_g, lambda x: x.target == aten.add.Tensor)
-        self.assertEqual(meta_add.meta['val'].shape, ())
-        self.assertEqual(meta_add.meta['val'].dtype, torch.float32)
-
-        fx_g = make_fx(f, tracing_mode="symbolic")(torch.rand(2, 3), 2, False)
-        meta_add = _get_node(fx_g, lambda x: x.target == aten.add.Tensor)
-        self.assertEqual(meta_add.meta['val'].shape, ())
-        self.assertEqual(meta_add.meta['val'].dtype, torch.int64)
-
-        fx_g = make_fx(f, tracing_mode="symbolic")(torch.rand(2, 3), 2, True)
-        meta_add = _get_node(fx_g, lambda x: x.target == aten.add.Tensor)
-        self.assertEqual(meta_add.meta['val'].shape, ())
-        self.assertEqual(meta_add.meta['val'].dtype, torch.float32)
+
+
 
     def test_return_symint(self):
         def f(x):
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index 67e16ca102ac1..a867a44f72e30 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -31,7 +31,6 @@
 )
 from torch._prims_common.wrappers import backwards_not_supported
 from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
-from torch.fx.experimental.symbolic_shapes import sym_float
 from torch.overrides import handle_torch_function, has_torch_function
 from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten
 
@@ -391,18 +390,11 @@ def _elementwise_meta(
         return TensorMeta(device=device, shape=shape, strides=strides, dtype=dtype)
 
     # Number case
+    # NOTE: this case is not currently exercised
     # TODO: fix number type promotion (bool, complex->float)
-
-    # For now for symint/float, just implementing the common / simple cases of (int,float,symint,symfloat)
-    seen_float = False
-    if isinstance(number, (torch.SymInt, torch.SymFloat)):
-        for a in args:
-            assert isinstance(a, (int, float, torch.SymInt, torch.SymFloat)), "NYI"
-            seen_float = seen_float or isinstance(a, (float, torch.SymFloat))
-        if seen_float:
-            number = sym_float(number)
-
-    return TensorMeta(number)  # type: ignore[arg-type]
+    assert not isinstance(number, torch.SymInt), "NYI"
+    assert not isinstance(number, torch.SymFloat), "NYI"
+    return TensorMeta(number)
 
 
 def _complex_only_elementwise_meta(*args, **kwargs):
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 3355400db43cc..8ea1390a4449a 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -744,10 +744,10 @@ def nan_to_num(
         nan = 0.0
 
     if posinf is None:
-        posinf = torch.finfo(a.dtype).max
+        posinf = prims.maximum_value(a.dtype)
 
     if neginf is None:
-        neginf = torch.finfo(a.dtype).min
+        neginf = prims.minimum_value(a.dtype)
 
     result = where(isnan(a), nan, a)
 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 9a0ac050e6b94..f52bec927b113 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -851,7 +851,11 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # and ensure that Meta kernels are dispatched to (see)
         # Fake Tensor Dispatch Keys
         # TODO - we should be use the prim aten impl
-        if "prims::" in func._schema.name and hasattr(func, "prim_meta_impl"):
+        if (
+            "prims::" in func._schema.name
+            and len(flat_arg_fake_tensors) != 0
+            and hasattr(func, "prim_meta_impl")
+        ):
             with self:
                 return func.prim_meta_impl(*args, **kwargs)
 
diff --git a/torch/csrc/autograd/input_metadata.h b/torch/csrc/autograd/input_metadata.h
index 8060c11ac4575..7cb9e8aedb195 100644
--- a/torch/csrc/autograd/input_metadata.h
+++ b/torch/csrc/autograd/input_metadata.h
@@ -125,13 +125,13 @@ struct InputMetadata {
     if (grad.is_nested()) {
       ss << at::native::get_nested_size_tensor(grad);
     } else {
-      ss << grad.sym_sizes();
+      ss << grad.sizes();
     }
     ss << " but expected shape compatible with ";
     if (is_nested_tensor()) {
       ss << shape_as_tensor();
     } else {
-      ss << shape_as_dim_vector();
+      ss << c10::asIntArrayRefSlow(shape_as_dim_vector());
     }
     return ss;
   }
diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp
index 2c4999c971eab..562f5a427d380 100644
--- a/torch/csrc/autograd/python_torch_functions_manual.cpp
+++ b/torch/csrc/autograd/python_torch_functions_manual.cpp
@@ -692,7 +692,7 @@ static PyObject* THPVariable_numel(
   }
 
   if (r.idx == 0) {
-    return wrap(r.tensor(0).sym_numel());
+    return wrap(r.tensor(0).numel());
   }
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index bd52760502c6b..ae4427e2320e9 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -261,6 +261,9 @@ def eval(cls, base, divisor):
     'floordiv': lambda a, b: FloorDiv(a, b),
 }
 
+def _nyi():
+    raise NotImplementedError()
+
 magic_methods = {
     **reflectable_magic_methods,
     'eq': lambda a, b: sympy.Eq(a, b),
diff --git a/torch/fx/traceback.py b/torch/fx/traceback.py
index cee7626e5c83a..a07b36b997bdb 100644
--- a/torch/fx/traceback.py
+++ b/torch/fx/traceback.py
@@ -54,7 +54,7 @@ def format_stack() -> List[str]:
         return current_stack.copy()
     else:
         # fallback to traceback.format_stack()
-        return traceback.format_list(traceback.extract_stack()[:-1])
+        return traceback.format_stack()
 
 
 @compatibility(is_backward_compatible=False)

From f0c1ae6bdcf9af9d790b12d29f00a245e1da0811 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 19 Nov 2022 12:52:39 -0500
Subject: [PATCH 1112/1922] Add support for dynamic kwarg to
 torch._dynamo.optimize (#89290)

This is an easier way to enable dynamic shapes for a region.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89290
Approved by: https://github.com/soumith, https://github.com/jansel, https://github.com/voznesenskym
---
 test/dynamo/test_subgraphs.py | 12 ++++++++++
 torch/_dynamo/eval_frame.py   | 43 +++++++++++++++++++++++++++++++----
 2 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/test/dynamo/test_subgraphs.py b/test/dynamo/test_subgraphs.py
index 3a38561f16d2a..27f73026435cd 100644
--- a/test/dynamo/test_subgraphs.py
+++ b/test/dynamo/test_subgraphs.py
@@ -367,6 +367,18 @@ def fn(a, b):
         # just one graph now rather than 10
         self.assertEqual(cnt_dynamic.frame_count, 1)
 
+    def test_dynamic_kwarg(self):
+        def fn(a, b):
+            return a - b * 10
+
+        torch._dynamo.reset()
+        cnt_dynamic = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnt_dynamic, dynamic=True)(fn)
+        for i in range(10):
+            opt_fn(torch.randn(i), torch.randn(i))
+        # just one graph
+        self.assertEqual(cnt_dynamic.frame_count, 1)
+
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     def test_no_graph_break_on_item(self):
         def fn(a, b):
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 31fb479906e1f..65e8af4883ab3 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -100,6 +100,17 @@ def innermost_fn(fn):
     return unaltered_fn
 
 
+@contextlib.contextmanager
+def enable_dynamic(enable: bool = True):
+    if not enable:
+        yield
+        return
+    with patch("torch._dynamo.config.dynamic_shapes", True), patch(
+        "functorch._src.config.use_dynamic_shapes", True
+    ):
+        yield
+
+
 class _TorchDynamoContext:
     def __init__(
         self,
@@ -108,6 +119,8 @@ def __init__(
         backend_ctx_ctor=null_context,
         patch_fn=nothing,
         first_ctx=False,
+        *,
+        dynamic=False,
     ):
         super().__init__()
         assert callable(callback) or callback is False or callback is None
@@ -116,6 +129,7 @@ def __init__(
         self.on_enter = on_enter
         self.extra_ctx_ctor = backend_ctx_ctor
         self.first_ctx = first_ctx
+        self.dynamic = dynamic
         patch_fn()
 
     def __enter__(self):
@@ -129,10 +143,14 @@ def __enter__(self):
         self.prior = set_eval_frame(self.callback)
         self.backend_ctx = self.extra_ctx_ctor()
         self.backend_ctx.__enter__()
+        self.dynamic_ctx = enable_dynamic(self.dynamic)
+        self.dynamic_ctx.__enter__()
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         set_eval_frame(self.prior)
         self.prior = unset
+        # TODO: This is totally not the right way to chain contexts manually
+        self.dynamic_ctx.__exit__(exc_type, exc_val, exc_tb)
         self.backend_ctx.__exit__(exc_type, exc_val, exc_tb)
 
     def __call__(self, fn):
@@ -170,10 +188,13 @@ def _fn(*args, **kwargs):
             prior = set_eval_frame(callback)
             backend_ctx = backend_ctx_ctor()
             backend_ctx.__enter__()
+            dynamic_ctx = enable_dynamic(self.dynamic)
+            dynamic_ctx.__enter__()
             try:
                 return fn(*args, **kwargs)
             finally:
                 set_eval_frame(prior)
+                dynamic_ctx.__exit__(None, None, None)
                 backend_ctx.__exit__(None, None, None)
 
         # hooks to properly handle inlining
@@ -229,7 +250,7 @@ def _fn(*args, **kwargs):
 
 
 class OptimizeContext(_TorchDynamoContext):
-    def __init__(self, callback, backend_ctx_ctor, first_ctx=False):
+    def __init__(self, callback, backend_ctx_ctor, first_ctx=False, *, dynamic=False):
         def on_enter():
             global most_recent_backend
             if (
@@ -247,6 +268,7 @@ def on_enter():
             backend_ctx_ctor=backend_ctx_ctor,
             patch_fn=TorchPatcher.patch,
             first_ctx=first_ctx,
+            dynamic=dynamic,
         )
 
 
@@ -289,11 +311,12 @@ def catch_errors(frame, cache_size):
     return catch_errors
 
 
-def _optimize_catch_errors(compile_fn, backend_ctx_ctor=null_context):
+def _optimize_catch_errors(compile_fn, backend_ctx_ctor=null_context, dynamic=False):
     return OptimizeContext(
         catch_errors_wrapper(compile_fn),
         backend_ctx_ctor=backend_ctx_ctor,
         first_ctx=True,
+        dynamic=dynamic,
     )
 
 
@@ -375,7 +398,12 @@ def __call__(self, fn):
 
 
 def optimize(
-    backend="inductor", *, nopython=False, guard_export_fn=None, disable=False
+    backend="inductor",
+    *,
+    nopython=False,
+    guard_export_fn=None,
+    disable=False,
+    dynamic=False,
 ):
     """
     The main entrypoint of TorchDynamo.  Do graph capture and call
@@ -393,6 +421,7 @@ def optimize(
         nopython: If True, graph breaks will be errors and there will
             be a single whole-program graph.
         disable: If True, turn this decorator into a no-op
+        dynamic: If True, turn on dynamic shapes support
 
     Example Usage:
 
@@ -422,10 +451,13 @@ def toy_example(a, b):
     backend_ctx_ctor = getattr(backend, "backend_ctx_ctor", null_context)
 
     if nopython:
-        return optimize_assert(backend, guard_export_fn=guard_export_fn)
+        return optimize_assert(
+            backend, guard_export_fn=guard_export_fn, dynamic=dynamic
+        )
     return _optimize_catch_errors(
         convert_frame.convert_frame(backend, guard_export_fn=guard_export_fn),
         backend_ctx_ctor,
+        dynamic=dynamic,
     )
 
 
@@ -655,7 +687,7 @@ def assume_constant_result(fn):
     return fn
 
 
-def optimize_assert(backend, *, guard_export_fn=None, export=False):
+def optimize_assert(backend, *, guard_export_fn=None, export=False, dynamic=False):
     """
     The same as `torch._dynamo.optimize(backend, nopython=True)`
     """
@@ -667,6 +699,7 @@ def optimize_assert(backend, *, guard_export_fn=None, export=False):
     return _optimize_catch_errors(
         convert_frame.convert_frame_assert(backend, guard_export_fn, export=export),
         backend_ctx_ctor,
+        dynamic=dynamic,
     )
 
 
From 10f8883ac7050ac69887eca02872d8be62c7184b Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Sat, 19 Nov 2022 23:10:34 +0000
Subject: [PATCH 1113/1922] Symintify numel(), infer_size,
 prims.elementwise_meta (#88956)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88956
Approved by: https://github.com/ezyang
---
 aten/src/ATen/ExpandUtils.cpp                 | 10 ++++----
 aten/src/ATen/ExpandUtils.h                   |  2 ++
 test/test_dynamic_shapes.py                   | 11 ++++++++
 test/test_proxy_tensor.py                     | 25 ++++++++++++++++---
 torch/_prims/__init__.py                      | 16 +++++++++---
 torch/_refs/__init__.py                       |  4 +--
 torch/_subclasses/fake_tensor.py              |  6 +----
 torch/csrc/autograd/input_metadata.h          |  4 +--
 .../python_torch_functions_manual.cpp         |  2 +-
 torch/fx/experimental/symbolic_shapes.py      |  3 ---
 torch/fx/traceback.py                         |  2 +-
 11 files changed, 59 insertions(+), 26 deletions(-)

diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp
index a44005a2ef815..ee846c9b82e34 100644
--- a/aten/src/ATen/ExpandUtils.cpp
+++ b/aten/src/ATen/ExpandUtils.cpp
@@ -13,8 +13,8 @@ TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size) {
 
 namespace {
 // NOTE: are_expandable did a similar check, please keep them sync if change is needed
-template <typename Container>
-Container infer_size_impl(IntArrayRef a, IntArrayRef b) {
+template <typename Container, typename ArrayType>
+Container infer_size_impl(ArrayType a, ArrayType b) {
   size_t dimsA = a.size();
   size_t dimsB = b.size();
   size_t ndim = dimsA > dimsB ? dimsA : dimsB;
@@ -25,8 +25,8 @@ Container infer_size_impl(IntArrayRef a, IntArrayRef b) {
     ptrdiff_t offset = ndim - 1 - i;
     ptrdiff_t dimA = dimsA - 1 - offset;
     ptrdiff_t dimB = dimsB - 1 - offset;
-    int64_t sizeA = (dimA >= 0) ? a[dimA] : 1;
-    int64_t sizeB = (dimB >= 0) ? b[dimB] : 1;
+    auto sizeA = (dimA >= 0) ? a[dimA] : 1;
+    auto sizeB = (dimB >= 0) ? b[dimB] : 1;
 
     TORCH_CHECK(
         sizeA == sizeB || sizeA == 1 || sizeB == 1,
@@ -35,7 +35,7 @@ Container infer_size_impl(IntArrayRef a, IntArrayRef b) {
         ") at non-singleton dimension ", i);
 
       // 1s map to the other size (even 0).
-      expandedSizes[i] = sizeA == 1 ? sizeB : sizeA;
+      expandedSizes[i] = sizeA == 1 ? std::move(sizeB) : std::move(sizeA);
   }
 
   return expandedSizes;
diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h
index 786cbf132cd77..9e48421e540fe 100644
--- a/aten/src/ATen/ExpandUtils.h
+++ b/aten/src/ATen/ExpandUtils.h
@@ -21,6 +21,8 @@ namespace at {
 
 TORCH_API std::vector<int64_t> infer_size(IntArrayRef a, IntArrayRef b);
 TORCH_API DimVector infer_size_dimvector(IntArrayRef a, IntArrayRef b);
+TORCH_API SymDimVector
+infer_size_symdimvector(SymIntArrayRef a, SymIntArrayRef b);
 
 // Named type instead of a pair/tuple so that we can be sure to
 // construct the vectors in place and get NRVO.
diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index 3a8e31151bf37..953b6d9a53f64 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -291,6 +291,17 @@ def test_size_expressions(self):
         self.assertTrue(str(expand_x.shape[1]), str(x.shape[0]))
         self.assertTrue(str(expand_x.shape[1]), str(result.shape[0]))
 
+    @skipIfNoSympy
+    def test_numel(self):
+        shape_env = ShapeEnv()
+        x = create_symbolic_tensor("x", torch.randn(5), shape_env)
+        self.assertIsInstance(x.numel(), torch.SymInt)
+        self.assertIsInstance(torch.numel(x), torch.SymInt)
+
+        x = torch.rand(3, 3)
+        self.assertIsInstance(x.numel(), int)
+        self.assertIsInstance(torch.numel(x), int)
+
     @skipIfNoSympy
     def test_int_to_float(self):
         shape_env = ShapeEnv()
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 34edc5cfac949..fa04c57d94260 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -13,6 +13,7 @@
 from torch._subclasses.fake_tensor import DynamicOutputShapeException
 
 from torch._decomp import decomposition_table
+from torch.fx.experimental.symbolic_shapes import sym_float
 from torch.testing._internal.common_device_type import ops
 from torch._C import _disabled_torch_function_impl
 from torch.fx.experimental.proxy_tensor import make_fx, DecompositionInterpreter, get_isolated_graphmodule, has_proxy
@@ -732,7 +733,6 @@ def deco(cls):
 
 @skipIfNoSympy
 @xfail_inherited_tests([
-    "test_mode_tracing_factory_function",
     "test_make_fx_overloads",
     "test_trace_subclasses",
 ])
@@ -972,8 +972,27 @@ def f(x):
         # happened afterwards
         self.assertTrue(meta_inp.meta['val'].shape[0].get_pyobj().expr == 3)
 
-
-
+    def test_elementwise_meta_with_sym_numbers(self):
+        def f(x, offset, as_sym_float=False):
+            x0 = x.size()[0]
+            if as_sym_float:
+                x0 = sym_float(x0)
+            return torch.add(x0, offset)
+
+        fx_g = make_fx(f, tracing_mode="symbolic")(torch.rand(2, 3), 2.0, False)
+        meta_add = _get_node(fx_g, lambda x: x.target == aten.add.Tensor)
+        self.assertEqual(meta_add.meta['val'].shape, ())
+        self.assertEqual(meta_add.meta['val'].dtype, torch.float32)
+
+        fx_g = make_fx(f, tracing_mode="symbolic")(torch.rand(2, 3), 2, False)
+        meta_add = _get_node(fx_g, lambda x: x.target == aten.add.Tensor)
+        self.assertEqual(meta_add.meta['val'].shape, ())
+        self.assertEqual(meta_add.meta['val'].dtype, torch.int64)
+
+        fx_g = make_fx(f, tracing_mode="symbolic")(torch.rand(2, 3), 2, True)
+        meta_add = _get_node(fx_g, lambda x: x.target == aten.add.Tensor)
+        self.assertEqual(meta_add.meta['val'].shape, ())
+        self.assertEqual(meta_add.meta['val'].dtype, torch.float32)
 
     def test_return_symint(self):
         def f(x):
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index a867a44f72e30..67e16ca102ac1 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -31,6 +31,7 @@
 )
 from torch._prims_common.wrappers import backwards_not_supported
 from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
+from torch.fx.experimental.symbolic_shapes import sym_float
 from torch.overrides import handle_torch_function, has_torch_function
 from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten
 
@@ -390,11 +391,18 @@ def _elementwise_meta(
         return TensorMeta(device=device, shape=shape, strides=strides, dtype=dtype)
 
     # Number case
-    # NOTE: this case is not currently exercised
     # TODO: fix number type promotion (bool, complex->float)
-    assert not isinstance(number, torch.SymInt), "NYI"
-    assert not isinstance(number, torch.SymFloat), "NYI"
-    return TensorMeta(number)
+
+    # For now for symint/float, just implementing the common / simple cases of (int,float,symint,symfloat)
+    seen_float = False
+    if isinstance(number, (torch.SymInt, torch.SymFloat)):
+        for a in args:
+            assert isinstance(a, (int, float, torch.SymInt, torch.SymFloat)), "NYI"
+            seen_float = seen_float or isinstance(a, (float, torch.SymFloat))
+        if seen_float:
+            number = sym_float(number)
+
+    return TensorMeta(number)  # type: ignore[arg-type]
 
 
 def _complex_only_elementwise_meta(*args, **kwargs):
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 8ea1390a4449a..3355400db43cc 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -744,10 +744,10 @@ def nan_to_num(
         nan = 0.0
 
     if posinf is None:
-        posinf = prims.maximum_value(a.dtype)
+        posinf = torch.finfo(a.dtype).max
 
     if neginf is None:
-        neginf = prims.minimum_value(a.dtype)
+        neginf = torch.finfo(a.dtype).min
 
     result = where(isnan(a), nan, a)
 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index f52bec927b113..9a0ac050e6b94 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -851,11 +851,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         # and ensure that Meta kernels are dispatched to (see)
         # Fake Tensor Dispatch Keys
         # TODO - we should be use the prim aten impl
-        if (
-            "prims::" in func._schema.name
-            and len(flat_arg_fake_tensors) != 0
-            and hasattr(func, "prim_meta_impl")
-        ):
+        if "prims::" in func._schema.name and hasattr(func, "prim_meta_impl"):
             with self:
                 return func.prim_meta_impl(*args, **kwargs)
 
diff --git a/torch/csrc/autograd/input_metadata.h b/torch/csrc/autograd/input_metadata.h
index 7cb9e8aedb195..8060c11ac4575 100644
--- a/torch/csrc/autograd/input_metadata.h
+++ b/torch/csrc/autograd/input_metadata.h
@@ -125,13 +125,13 @@ struct InputMetadata {
     if (grad.is_nested()) {
       ss << at::native::get_nested_size_tensor(grad);
     } else {
-      ss << grad.sizes();
+      ss << grad.sym_sizes();
     }
     ss << " but expected shape compatible with ";
     if (is_nested_tensor()) {
       ss << shape_as_tensor();
     } else {
-      ss << c10::asIntArrayRefSlow(shape_as_dim_vector());
+      ss << shape_as_dim_vector();
     }
     return ss;
   }
diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp
index 562f5a427d380..bd969f6a26fb2 100644
--- a/torch/csrc/autograd/python_torch_functions_manual.cpp
+++ b/torch/csrc/autograd/python_torch_functions_manual.cpp
@@ -692,7 +692,7 @@ static PyObject* THPVariable_numel(
   }
 
   if (r.idx == 0) {
-    return wrap(r.tensor(0).numel());
+    return py::cast(r.tensor(0).sym_numel()).release().ptr();
   }
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index ae4427e2320e9..bd52760502c6b 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -261,9 +261,6 @@ def eval(cls, base, divisor):
     'floordiv': lambda a, b: FloorDiv(a, b),
 }
 
-def _nyi():
-    raise NotImplementedError()
-
 magic_methods = {
     **reflectable_magic_methods,
     'eq': lambda a, b: sympy.Eq(a, b),
diff --git a/torch/fx/traceback.py b/torch/fx/traceback.py
index a07b36b997bdb..cee7626e5c83a 100644
--- a/torch/fx/traceback.py
+++ b/torch/fx/traceback.py
@@ -54,7 +54,7 @@ def format_stack() -> List[str]:
         return current_stack.copy()
     else:
         # fallback to traceback.format_stack()
-        return traceback.format_stack()
+        return traceback.format_list(traceback.extract_stack()[:-1])
 
 
 @compatibility(is_backward_compatible=False)

From 53d30fd71c602ae54e4100c021b486baed1ef4a4 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 19 Nov 2022 19:44:18 -0500
Subject: [PATCH 1114/1922] Fix cat striding in PrimTorch (#89332)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89332
Approved by: https://github.com/ngimel
---
 test/jit/test_symbolic_shape_analysis.py      |  7 +++++-
 test/test_mps.py                              |  2 --
 torch/_refs/__init__.py                       | 24 +++++++++++++++++--
 torch/testing/_creation.py                    |  8 ++++++-
 .../_internal/common_methods_invocations.py   |  5 ++++
 5 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/test/jit/test_symbolic_shape_analysis.py b/test/jit/test_symbolic_shape_analysis.py
index 1c4e359662bda..3e3cb3ffed73a 100644
--- a/test/jit/test_symbolic_shape_analysis.py
+++ b/test/jit/test_symbolic_shape_analysis.py
@@ -319,7 +319,12 @@ def forward(self, x, y):
             mod = torch.jit.script(CatMod(**inp.kwargs).eval())
 
             args = inp.input
-            self.assertTrue(len(args) == 2)
+
+            # This test is hard-coded only to work with two sample inputs
+            # but the OpInfo may have more/less
+            if len(args) != 2:
+                continue
+
             out_size = mod(*args).size()
             inps = list(mod.graph.inputs())
             inps[1].setType(inps[1].type().with_sizes(args[0].size()))
diff --git a/test/test_mps.py b/test/test_mps.py
index 7ec8ac9d6baae..19f70ce35a21b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7243,7 +7243,6 @@ class TestConsistency(TestCase):
         'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'bmm': ['f32'],
         'broadcast_shapes': ['f32'],
-        'cat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'ceil': ['f32', 'int32', 'int64', 'f16'],
         'char': ['b8', 'u8'],
         'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7475,7 +7474,6 @@ class TestConsistency(TestCase):
         'block_diag': ['f16', 'f32'],
         'bmm': ['f32'],
         'broadcast_shapes': ['f32'],
-        'cat': ['f16', 'f32'],
         'ceil': ['f32'],
         'chunk': ['f16', 'f32'],
         'clone': ['f16', 'f32'],
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 3355400db43cc..fda73cf0bc608 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2530,6 +2530,18 @@ def broadcast_to(a: TensorLikeType, size: ShapeType) -> TensorLikeType:
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH,
 )
 def cat(tensors: TensorSequenceType, dim: int = 0) -> TensorLikeType:
+    def cat_compute_output_memory_format(inputs):
+        format = None
+        for t in inputs:
+            f = utils.suggest_memory_format(t)
+            if f == torch.contiguous_format:
+                return f
+            if format is not None and format != f:
+                return torch.contiguous_format
+            format = f
+        assert format is not None
+        return format
+
     if len(tensors) == 0:
         msg = "cat expects at least one tensor, but received zero!"
         raise ValueError(msg)
@@ -2547,6 +2559,8 @@ def cat(tensors: TensorSequenceType, dim: int = 0) -> TensorLikeType:
         utils.validate_idx(t.ndim, dim)
         break
 
+    memory_format = cat_compute_output_memory_format(tensors)
+
     # Filters tensors with one dimension of length zero
     filtered = tuple(x for x in tensors if not (x.ndim == 1 and x.numel() == 0))
     if len(filtered) == 0:
@@ -2558,9 +2572,15 @@ def cat(tensors: TensorSequenceType, dim: int = 0) -> TensorLikeType:
         except Exception:
             requires_grad = False
 
-        return empty((0,), dtype=t.dtype, device=t.device, requires_grad=requires_grad)
+        return empty(
+            (0,),
+            dtype=t.dtype,
+            device=t.device,
+            requires_grad=requires_grad,
+            memory_format=memory_format,
+        )
 
-    return prims.cat(filtered, dim)
+    return prims.cat(filtered, dim).clone(memory_format=memory_format)
 
 
 # CompositeImplicitAutograd - don't register decomp
diff --git a/torch/testing/_creation.py b/torch/testing/_creation.py
index b8f41f04743cb..33b9739a7f360 100644
--- a/torch/testing/_creation.py
+++ b/torch/testing/_creation.py
@@ -31,7 +31,8 @@ def make_tensor(
     high: Optional[float] = None,
     requires_grad: bool = False,
     noncontiguous: bool = False,
-    exclude_zero: bool = False
+    exclude_zero: bool = False,
+    memory_format: Optional[torch.memory_format] = None,
 ) -> torch.Tensor:
     r"""Creates a tensor with the given :attr:`shape`, :attr:`device`, and :attr:`dtype`, and filled with
     values uniformly drawn from ``[low, high)``.
@@ -74,6 +75,8 @@ def make_tensor(
             :attr:`dtype`'s :func:`~torch.finfo` object), and for complex types it is replaced with a complex number
             whose real and imaginary parts are both the smallest positive normal number representable by the complex
             type. Default ``False``.
+        memory_format (Optional[torch.memory_format]): The memory format of the returned tensor.  Incompatible
+            with :attr:`noncontiguous`.
 
     Raises:
         ValueError: if ``requires_grad=True`` is passed for integral `dtype`
@@ -152,9 +155,12 @@ def clamp(a, l, h):
         raise TypeError(f"The requested dtype '{dtype}' is not supported by torch.testing.make_tensor()."
                         " To request support, file an issue at: https://github.com/pytorch/pytorch/issues")
 
+    assert not (noncontiguous and memory_format is not None)
     if noncontiguous and result.numel() > 1:
         result = torch.repeat_interleave(result, 2, dim=-1)
         result = result[..., ::2]
+    elif memory_format is not None:
+        result = result.clone(memory_format=memory_format)
 
     if exclude_zero:
         if dtype in _integral_types or dtype is torch.bool:
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index cf68a68cf629a..4edba78ec4ae1 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -1873,6 +1873,9 @@ def sample_inputs_cat_concat(op_info, device, dtype, requires_grad, **kwargs):
     for input_shape1, input_shape2, kwargs in cases:
         yield SampleInput([make_arg(input_shape1), make_arg(input_shape2)], kwargs=kwargs)
 
+    # from coat_lite_mini
+    yield SampleInput([make_arg((2, 2, 2, 2), memory_format=torch.channels_last)], args=(1,),)
+
 def error_inputs_cat(op_info, device, **kwargs):
 
     make_arg = partial(make_tensor, device=device, dtype=torch.float32)
@@ -15016,6 +15019,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            supports_fwgrad_bwgrad=True,
            assert_autodiffed=True,
            skips=(
+               # https://github.com/pytorch/pytorch/issues/89353
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref_mps'),
                # RuntimeError: Arguments for call not valid.
                #               Expected a value of type 'List[Tensor]' for argument
                #               'tensors' but instead found type 'Tensor (inferred)'.

From ca92b4f2d321bbb78cdfce352de3a5559de822e6 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 19 Nov 2022 22:31:24 -0500
Subject: [PATCH 1115/1922] Set INTERFACE_LINK_DIRECTORIES on caffe2::mkl
 (#89359)

This ensures that subsequent link commands involving mkl libraries
know where to find the libraries if they are in a non-standard
location (which is the case if you installed mkl via conda, which
is what our standard instructions recommend.)

This is kind of a hack, because the MKL libraries are not actually
guaranteed to be in $MKL_ROOT/lib (they are for the conda install
though).  The real fix is to properly use the MKL targets from
FindMKL.cmake but thats its own can of fish.  See
https://github.com/pytorch/pytorch/issues/73008

This fixes https://github.com/pytorch/audio/issues/2784

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89359
Approved by: https://github.com/soumith
---
 cmake/public/mkl.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmake/public/mkl.cmake b/cmake/public/mkl.cmake
index 9515a4ae96813..57c4042991365 100644
--- a/cmake/public/mkl.cmake
+++ b/cmake/public/mkl.cmake
@@ -10,3 +10,8 @@ set_property(
 set_property(
   TARGET caffe2::mkl PROPERTY INTERFACE_LINK_LIBRARIES
   ${MKL_LIBRARIES})
+# TODO: This is a hack, it will not pick up architecture dependent
+# MKL libraries correctly; see https://github.com/pytorch/pytorch/issues/73008
+set_property(
+  TARGET caffe2::mkl PROPERTY INTERFACE_LINK_DIRECTORIES
+  ${MKL_ROOT}/lib)

From 9b90cdb5ec37cf17ff5ec4a3f56ab745234d34f6 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sun, 20 Nov 2022 09:13:30 -0500
Subject: [PATCH 1116/1922] Also include MKL_THREAD_LIB in link libraries for
 caffe2::mkl (#89378)

Actually fixes https://github.com/pytorch/audio/issues/2784 for
real; in my previous testing I didn't check if I could import
torchaudio; now torchaudio successfully imports.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89378
Approved by: https://github.com/soumith
---
 cmake/public/mkl.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/public/mkl.cmake b/cmake/public/mkl.cmake
index 57c4042991365..f4ab1ffa9d0fe 100644
--- a/cmake/public/mkl.cmake
+++ b/cmake/public/mkl.cmake
@@ -9,7 +9,7 @@ set_property(
   ${MKL_INCLUDE_DIR})
 set_property(
   TARGET caffe2::mkl PROPERTY INTERFACE_LINK_LIBRARIES
-  ${MKL_LIBRARIES})
+  ${MKL_LIBRARIES} ${MKL_THREAD_LIB})
 # TODO: This is a hack, it will not pick up architecture dependent
 # MKL libraries correctly; see https://github.com/pytorch/pytorch/issues/73008
 set_property(

From ed56299711d1ed0baf6f7dccccb2ef652d758525 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sun, 20 Nov 2022 22:14:38 +0000
Subject: [PATCH 1117/1922] Revert "Update sdp dispatch logic to enable fused
 backward (#89154)"

This reverts commit 2e72ec79823111e8dd8c5e82c5d1b56197cd52d3.

Reverted https://github.com/pytorch/pytorch/pull/89154 on behalf of https://github.com/huydhn due to Sorry for reverting your PR but the new test_sdp_math_gradcheck test breaks periodic slow gradcheck, i.e. https://hud.pytorch.org/pytorch/pytorch/commit/419ef2cdcfe84442de5232739284c6a51a18632f
---
 aten/src/ATen/native/native_functions.yaml    |  52 +++--
 .../cuda/NestedTensorTransformerFunctions.cpp | 100 +++------
 .../ATen/native/transformers/attention.cpp    |  65 ++----
 .../native/transformers/cuda/attention.cu     |  46 +++--
 .../transformers/cuda/attention_backward.cu   |  40 +---
 .../transformers/cuda/flash_attn/fmha_api.cpp |   7 +-
 .../transformers/cuda/flash_attn/fmha_api.h   |   2 +-
 .../ATen/native/transformers/cuda/sdp_utils.h |  34 +---
 benchmarks/transformer/sdp_backwards.py       | 189 ------------------
 .../check_forward_backward_compatibility.py   |   3 -
 test/functorch/test_ops.py                    |   8 +-
 test/test_meta.py                             |   1 +
 test/test_transformers.py                     |  74 ++-----
 tools/autograd/derivatives.yaml               |   6 +-
 .../_internal/common_methods_invocations.py   |   5 -
 15 files changed, 135 insertions(+), 497 deletions(-)
 delete mode 100644 benchmarks/transformer/sdp_backwards.py

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 8c759cd09c486..f625c9faff412 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -13252,39 +13252,18 @@
     CPU, NestedTensorCPU, Meta: _fused_sdp_choice_cpp
     CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
 
-- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
+# Register the math kernel for cpu
+- func: _scaled_dot_product_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
   variants: function
-
-- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool return_softmax=False, bool is_causal=False) -> (Tensor, Tensor, Tensor)
-  dispatch:
-    CUDA: _scaled_dot_product_flash_attention_cuda
-    NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
-
-- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
   dispatch:
-    CUDA: _scaled_dot_product_efficient_attention_cuda
-    NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
+    CUDA: _scaled_dot_product_attention_forward_cuda
+    CPU: _scaled_dot_product_attention_forward_math
+    NestedTensorCUDA: _scaled_dot_product_attention_forward_nested
+    NestedTensorCPU: _scaled_dot_product_attention_forward_math
+    Meta: _scaled_dot_product_attention_forward_math
 
-- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False) -> (Tensor, Tensor, Tensor)
-  dispatch:
-    CUDA: _scaled_dot_product_efficient_attention_backward_cuda
-
-# Returns ouput, softmax_logsumexp, softmax
-- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, bool return_softmax, float dropout_p, bool is_causal) -> (Tensor, Tensor, Tensor)
-  variants: function
-  dispatch:
-    CUDA: _flash_attention_forward
-
-# Returns ouput, logsumexp if compute_logsumexp
-- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
+- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
   variants: function
-  dispatch:
-    CUDA: _efficient_attention_forward
-
-- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False) -> (Tensor, Tensor, Tensor)
-  variants: function
-  dispatch:
-    CUDA: _efficient_attention_backward
 
 - func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
   variants: function
@@ -13311,6 +13290,21 @@
   structured: True
   variants: function
 
+- func: _flash_scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal) -> Tensor
+  variants: function
+  dispatch:
+    CUDA: flash_scaled_dot_product_attention
+
+- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_forward
+
+- func: _efficient_attention_backward(Tensor grad, Tensor query, Tensor key, Tensor value, Tensor logsumexp, Tensor out, bool is_causal=False) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_backward
+
 - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
   variants: function
   dispatch:
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index 9c72454560d38..c2bf4e08ce042 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -214,6 +214,26 @@ Tensor NestedTensor_to_padded_tensor_cuda(
   return NestedTensor_to_padded_tensor_generic(t, padding, output_size);
 }
 
+std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_nested(
+        const Tensor& query_, const Tensor& key, const Tensor& value,
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
+
+    // Determine which efficient kernel to use
+    sdp::sdp_params kernel_params{query_, key, value, attn_mask_.has_value(), dropout_p, need_attn_weights, is_causal};
+    auto backend = select_sdp_backend(kernel_params);
+    switch(backend){
+      case sdp::SDPBackend::flash_attention:
+          // TODO: enable flash attention kernel
+          return mem_efficient_helper_nested_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
+      case sdp::SDPBackend::efficient_attention:
+          return mem_efficient_helper_nested_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
+      case sdp::SDPBackend::math:
+        return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
+      default:
+        TORCH_CHECK(false, "Unsupported backend for scaled_dot_product_attention");
+        return std::make_tuple(Tensor(), Tensor());
+    }
+}
 namespace{
 
 /**
@@ -320,80 +340,19 @@ bool is_safe_to_get_storage_as_tensor(const NestedTensorImpl* tensor) {
 }
 
 } // namespace
-
-std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_nestedtensor_cuda(
+std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
     double dropout_p,
-    bool return_softmax,
+    bool need_atten_weights,
     bool is_causal) {
-  TORCH_CHECK(false, "There are currently cuda memory errors being returned from this path.")
   // Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
   // Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
   // Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
   const int64_t num_heads = query.size(1);
   const int64_t head_dim = query.size(3);
 
-  // Query -> Query (Batch x {Q_seq_len}  x Num_heads x Dim_per_head)
-  // Key   -> Key   (Batch x {KV_seq_len} x Num_heads x Dim_per_head)
-  // Value -> Value (Batch x {KV_seq_len} x Num_heads x Dim_per_head)
-  Tensor q_t = query.transpose(1, 2).contiguous();
-  Tensor k_t = key.transpose(1, 2).contiguous();
-  Tensor v_t = value.transpose(1, 2).contiguous();
-
-  // K and V have to have the same Nnz, should probably torch_check
-  // assume in order to not iterate over v
-
-  auto cumulative_and_max_q = cumulative_and_max_seq_len(q_t);
-  auto cumulative_and_max_k = cumulative_and_max_seq_len(k_t);
-
-  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q);
-  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k);
-
-  const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q);
-  const int64_t max_seqlen_batch_k = std::get<1>(cumulative_and_max_k);
-
-  const int64_t Nnz_q  = cumulative_sequence_length_q[-1].item<int64_t>();
-  const int64_t Nnz_kv = cumulative_sequence_length_k[-1].item<int64_t>();
-
-  auto query_buffer_reshaped =
-      get_buffer(q_t).view({Nnz_q, num_heads, head_dim});
-  auto key_buffer_reshaped =
-      get_buffer(k_t).view({Nnz_kv, num_heads, head_dim});
-  auto value_buffer_reshaped =
-      get_buffer(v_t).view({Nnz_kv, num_heads, head_dim});
-
-  auto attention_and_lse_and_softmax =
-  at::_flash_attention_forward(
-      query_buffer_reshaped,
-      key_buffer_reshaped,
-      value_buffer_reshaped,
-      cumulative_sequence_length_q,
-      cumulative_sequence_length_k,
-      max_seqlen_batch_q,
-      max_seqlen_batch_k,
-      return_softmax,
-      dropout_p,
-      is_causal);
-  // Reshape output to convert nnz to batch_size and seq_len
-  Tensor attention = std::get<0>(attention_and_lse_and_softmax);
-  attention = wrap_buffer(attention.view(-1), get_nested_size_tensor(q_t).clone()).transpose(1,2);
-  return std::tie(attention, std::get<1>(attention_and_lse_and_softmax), std::get<2>(attention_and_lse_and_softmax));
-}
-
-std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_nestedtensor_cuda(
-    const Tensor& query,
-    const Tensor& key,
-    const Tensor& value,
-    bool compute_log_sumexp,
-    bool is_causal) {
-   // Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
-  // Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
-  // Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
-  const int64_t num_heads = query.size(1);
-  const int64_t head_dim = query.size(3);
-
   Tensor q_t = query.transpose(1, 2);
   Tensor k_t = key.transpose(1, 2);
   Tensor v_t = value.transpose(1, 2);
@@ -473,7 +432,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_nestedtensor_
       {Nnz_kv, num_heads, head_dim},
       {nnz_v_stride, head_v_stride, head_dim_stride},
       value_impl->get_storage_offsets()[0]);
-  std::tuple<Tensor, Tensor> attention_and_logsumexp=
+  std::tuple<Tensor, Tensor> attention_and_weights =
       at::_efficient_attention_forward(
           query_buffer_reshaped.unsqueeze(0),
           key_buffer_reshaped.unsqueeze(0),
@@ -481,14 +440,14 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_nestedtensor_
           cumulative_sequence_length_q,
           cumulative_sequence_length_k,
           max_seqlen_batch_q,
-          compute_log_sumexp,
-          is_causal);
+          false,
+          false);
   // Reshape output to convert nnz to batch_size and seq_len
-  Tensor attention = std::get<0>(attention_and_logsumexp);
+  Tensor attention = std::get<0>(attention_and_weights);
   attention =
       wrap_buffer(attention.view(-1), get_nested_size_tensor(q_t).clone())
           .transpose(1, 2);
-  return std::tie(attention, std::get<1>(attention_and_logsumexp));
+  return std::tie(attention, std::get<1>(attention_and_weights));
 }
 
 Tensor flash_attention_helper(
@@ -533,7 +492,7 @@ Tensor flash_attention_helper(
   // If we are passing in query, key, value all the same tensors then we have
   // packed them into one tensor and need to slice for flash attention
   Tensor attention =
-      std::get<0>(at::_flash_attention_forward(
+      at::_flash_scaled_dot_product_attention(
           q,
           k,
           v,
@@ -541,9 +500,8 @@ Tensor flash_attention_helper(
           cumulative_sequence_length_q,
           max_seqlen_batch_q,
           max_seqlen_batch_q,
-          false /*return_softmax*/,
           dropout_p,
-          is_causal));
+          is_causal);
   // Output of flash_attention is a regular tensor lets wrap it back up to
   // form a nested tensor
 
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 9c5be12ef24db..89a0e4691018c 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -678,6 +678,20 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> native_decoder_only_multi_head_attent
 //     L: Target sequence length
 //     E: Embedding dimension
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention(
+        const Tensor& query_, const Tensor& key, const Tensor& value,
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
+        if (query_.requires_grad() || key.requires_grad() || value.requires_grad()){
+          return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
+        }
+        return at::_scaled_dot_product_attention_forward(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
+}
+
+int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value,
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
+  return static_cast<int64_t>(sdp::SDPBackend::math);
+}
+
+std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_math(
     const Tensor& query_,
     const Tensor& key,
     const Tensor& value,
@@ -685,49 +699,14 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention(
     double dropout_p,
     bool need_attn_weights,
     bool is_causal) {
-  // TODO: The second return is the attention weights if the math kernel is
-  // used. The fused kernels do not return this Tensor so for the fused kernels
-  // The second return SHOULD always be an empty Tensor, unless need_attn_weights
-  // is true (in which case the fused kernels would not be called). This blows up
-  // op_info tests.
-  int64_t choice_int = at::_fused_sdp_choice(
-      query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-  sdp::SDPBackend backend = static_cast<sdp::SDPBackend>(choice_int);
-  switch (backend) {
-    case sdp::SDPBackend::flash_attention: {
-      auto out_lse_softmax = at::_scaled_dot_product_flash_attention(
-          query_, key, value, dropout_p, need_attn_weights, is_causal);
-      return std::make_tuple(
-          std::move(std::get<0>(out_lse_softmax)),
-          std::move(std::get<2>(out_lse_softmax)));
-    }
-    case sdp::SDPBackend::efficient_attention: {
-      bool compute_logsumexp =
-          (query_.requires_grad() || key.requires_grad() ||
-           value.requires_grad());
-      return at::_scaled_dot_product_efficient_attention(
-          query_, key, value, compute_logsumexp, is_causal);
-    }
-    case sdp::SDPBackend::math:
-      return at::_scaled_dot_product_attention_math(
-          query_,
-          key,
-          value,
-          attn_mask_,
-          dropout_p,
-          need_attn_weights,
-          is_causal);
-    default:
-      TORCH_CHECK(
-          false,
-          "No viable backend for scaled_dot_product_attention was found.");
-      return std::make_tuple(Tensor(), Tensor());
-  }
-}
-
-int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
-  return static_cast<int64_t>(sdp::SDPBackend::math);
+  return at::_scaled_dot_product_attention_math(
+      query_,
+      key,
+      value,
+      attn_mask_,
+      dropout_p,
+      need_attn_weights,
+      is_causal);
 }
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index 8dcb99b3380d9..602cf319f74a6 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -678,12 +678,12 @@ std::tuple<Tensor, Tensor> native_multi_head_attention_cuda(
   return std::make_tuple(std::move(proj), std::move(qkt));
 }
 
-std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_cuda(
+std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
     double dropout_p,
-    bool return_softmax,
+    bool need_atten_weights,
     bool is_causal) {
   // Query (Batch x Num_heads x Q_seq_len  x Dim_per_head)
   // Key   (Batch x Num_heads x KV_seq_len x Dim_per_head)
@@ -726,9 +726,8 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_cuda(
   Tensor key_reshaped = k_t.reshape({Nnz_kv, num_heads, head_dim});
   Tensor value_reshaped = v_t.reshape({Nnz_kv, num_heads, head_dim});
 
-  Tensor attention, log_sumexp, softmax;
-  std::tie(attention, log_sumexp, softmax) =
-      at::_flash_attention_forward(
+  Tensor attention =
+      at::_flash_scaled_dot_product_attention(
           query_reshaped,
           key_reshaped,
           value_reshaped,
@@ -736,17 +735,15 @@ std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_cuda(
           cumulative_sequence_length_k,
           max_seqlen_batch_q,
           max_seqlen_batch_k,
-          return_softmax,
           dropout_p,
           is_causal);
   // Reshape output to convert nnz to batch_size and seq_len
   attention =
       attention.view({batch_size, max_seqlen_batch_q, num_heads, head_dim}).transpose(1,2);
 
-  return std::make_tuple(attention, log_sumexp, softmax);
+  return std::tuple<Tensor, Tensor>(attention, Tensor());
 }
-
-std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(
+std::tuple<Tensor, Tensor> mem_eff_helper(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
@@ -770,7 +767,26 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(
       compute_log_sumexp,
       is_causal);
   attention = attention.transpose(1,2);
-  return std::make_tuple(std::move(attention), std::move(log_sumexp));
+  return std::make_tuple(std::move(attention), Tensor());
+}
+
+std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
+        const Tensor& query_, const Tensor& key, const Tensor& value,
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
+    // Determine which efficient kernel to use
+    sdp::sdp_params kernel_params{query_, key, value, attn_mask_.has_value(), dropout_p, need_attn_weights, is_causal};
+    auto backend = select_sdp_backend(kernel_params);
+    switch(backend){
+      case sdp::SDPBackend::flash_attention:
+          return flash_attention_helper_dense_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
+      case sdp::SDPBackend::efficient_attention:
+          return mem_eff_helper(query_, key , value, need_attn_weights, is_causal);
+      case sdp::SDPBackend::math:
+        return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
+      default:
+        TORCH_CHECK(false, "No viable backend for scaled_dot_product_attention was found.");
+        return std::make_tuple(Tensor(), Tensor());
+    }
 }
 
 int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Tensor& value,
@@ -786,7 +802,7 @@ int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Te
   return static_cast<int64_t>(backend);
 }
 
-std::tuple<Tensor, Tensor, Tensor> _flash_attention_forward(
+Tensor flash_scaled_dot_product_attention(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
@@ -794,12 +810,11 @@ std::tuple<Tensor, Tensor, Tensor> _flash_attention_forward(
     const Tensor& cumulative_sequence_length_k,
     const int64_t max_seqlen_batch_q,
     const int64_t max_seqlen_batch_k,
-    bool return_softmax,
     double dropout_p,
     bool is_causal) {
 #if defined(USE_FLASH_ATTENTION)
   auto softmax_scale = std::pow(query.size(-1), -0.5);
-  return fmha::mha_fwd(
+  std::vector<Tensor> output = fmha::mha_fwd(
       query,
       key,
       value,
@@ -811,11 +826,12 @@ std::tuple<Tensor, Tensor, Tensor> _flash_attention_forward(
       softmax_scale,
       false,
       is_causal,
-      return_softmax,
+      false,
       c10::nullopt);
+  return output[0];
 #endif
   TORCH_CHECK(false, "USE_FLASH_ATTENTION was not enabled for build.")
-  return std::make_tuple(Tensor(), Tensor(), Tensor());
+  return Tensor();
 }
 
 std::tuple<at::Tensor, at::Tensor> _efficient_attention_forward(
diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
index a063aacb901ee..af005b2669b29 100644
--- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
@@ -10,7 +10,6 @@
 #include <ATen/native/transformers/attention.h>
 #include <ATen/native/transformers/cuda/sdp_utils.h>
 
-#include <iostream>
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
 #endif
@@ -74,14 +73,14 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     const at::Tensor& query,
     const at::Tensor& key,
     const at::Tensor& value,
-    const at::Tensor& out,
     const at::Tensor& logsumexp,
+    const at::Tensor& out,
     bool causal) {
   #if defined(USE_FLASH_ATTENTION)
   if (!grad_out_.defined()) {
     return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
   }
-  // ndim
+    // ndim
   TORCH_CHECK(query.dim() == grad_out_.dim());
   TORCH_CHECK(query.dim() == key.dim());
   TORCH_CHECK(query.dim() == value.dim());
@@ -129,7 +128,6 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
   // initialized
   bool grad_kv_needs_init = causal && N > M;
   at::Tensor grad_q, grad_k, grad_v;
-  int8_t gQKV_strideM_multiplier = 1;
   if (!grad_kv_needs_init && query.size(1) == key.size(1) &&
       query.size(3) == value.size(3) &&
       query.storage().is_alias_of(key.storage()) &&
@@ -143,13 +141,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     grad_q = chunk.select(2, 0);
     grad_k = chunk.select(2, 1);
     grad_v = chunk.select(2, 2);
-    gQKV_strideM_multiplier=3;
   } else {
-    grad_q = at::empty(query.sizes(), query.options());
-    grad_k = grad_kv_needs_init ? at::zeros(key.sizes(), key.options())
-                                : at::empty(key.sizes(), key.options());
-    grad_v = grad_kv_needs_init ? at::zeros(value.sizes(), value.options())
-                                : at::empty(value.sizes(), value.options());
+    grad_q = at::empty_like(query);
+    grad_k = grad_kv_needs_init ? at::zeros_like(key) : at::empty_like(key);
+    grad_v = grad_kv_needs_init ? at::zeros_like(value) : at::empty_like(value);
   }
 
   auto launchKernel = [&](auto _k, int computeCapability) {
@@ -203,7 +198,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     ASSIGN_CHECK_OVERFLOW(p.gQ_strideH, grad_q.stride(2));
     ASSIGN_CHECK_OVERFLOW(p.gK_strideH, grad_k.stride(2));
     ASSIGN_CHECK_OVERFLOW(p.gV_strideH, grad_v.stride(2));
-    p.gQKV_strideM_multiplier = gQKV_strideM_multiplier;
+    p.gQKV_strideM_multiplier = grad_q.is_contiguous() ? 1 : 3;
     TORCH_INTERNAL_ASSERT(p.gQ_strideM() == grad_q.stride(1));
     TORCH_INTERNAL_ASSERT(p.gK_strideM() == grad_k.stride(1));
     TORCH_INTERNAL_ASSERT(p.gV_strideM() == grad_v.stride(1));
@@ -262,28 +257,5 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
   return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
 }
 
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_efficient_attention_backward_cuda(
-    const at::Tensor& grad_out_,
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const at::Tensor& out,
-    const at::Tensor& logsumexp,
-    bool causal){
-  if (!grad_out_.defined()) {
-    return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
-  }
-  auto grad_out = grad_out_.transpose(1, 2);
-  auto out_t = out.transpose(1, 2);
-  auto q_t = query.transpose(1, 2);
-  auto k_t = key.transpose(1, 2);
-  auto v_t = value.transpose(1, 2);
-
-  Tensor grad_q, grad_k, grad_v;
-  std::tie(grad_q, grad_k, grad_v) = at::_efficient_attention_backward(grad_out, q_t, k_t, v_t, out_t, logsumexp, causal);
-  return std::make_tuple(grad_q.transpose(1, 2), grad_k.transpose(1, 2), grad_v.transpose(1, 2));
-}
-
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
index 7cc0c250664e1..aaf7d833fe833 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
@@ -26,7 +26,6 @@
  *
  ******************************************************************************/
 
-#include <tuple>
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -116,7 +115,7 @@ void set_params_fprop(FMHA_fprop_params &params,
     params.is_causal = is_causal;
 }
 
-std::tuple<at::Tensor, at::Tensor, at::Tensor>
+std::vector<at::Tensor>
 mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
         const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
@@ -242,7 +241,9 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
 
     run_fmha_fprop(launch_params, /*configure=*/false);
 
-    return std::make_tuple(o, softmax_lse, s);
+    std::vector<at::Tensor> result = {o, softmax_lse};
+    if (return_softmax) {result.push_back(s);}
+    return result;
 }
 } // namespace fmha
 #endif
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
index b0555463be040..226d4ddd2b551 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
@@ -7,7 +7,7 @@
 namespace fmha {
 
 TORCH_API
-std::tuple<at::Tensor, at::Tensor, at::Tensor>
+std::vector<at::Tensor>
 mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
         const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 55e9aeb184a22..5d62a6cbd0dc5 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -91,31 +91,6 @@ inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
   return true;
 }
 
-inline bool check_for_nested_inputs(sdp_params params, bool debug){
-  if (params.query.is_nested() || params.key.is_nested() || params.value.is_nested()) {
-    TORCH_CHECK(!debug, "We are not enabling nested Tensors for Flash Attention because of cuda memory errors.");
-    return false;
-  }
-  return true;
-}
-
-inline bool check_requires_grad(sdp_params params, bool debug) {
-  if (params.query.requires_grad() || params.key.requires_grad() || params.value.requires_grad()) {
-    TORCH_CHECK(!debug, "Flash Attention does not currently support training.");
-    return false;
-  }
-  return true;
-}
-
-inline bool check_requires_grad_and_nested(sdp_params params, bool debug) {
-  // If we fail both checks then we return false
-  if (!check_for_nested_inputs(params, false) && !check_requires_grad(params,false)){
-      TORCH_CHECK(!debug, "Memory efficient attention currently doesn't support training with NT inputs.");
-      return false;
-  }
-  return true;
-}
-
 inline bool check_for_attn_mask(sdp_params params, bool debug) {
   if (params.has_attn_mask) {
     TORCH_CHECK(!debug, "Flash Attention does not support attention mask.");
@@ -223,15 +198,13 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
   return false;
 #endif
   //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 9> constraints {{
+  constexpr std::array<bool(*)(sdp_params, bool), 7> constraints {{
       check_runtime_disabled_flash,
-      check_requires_grad,
       check_tensor_shapes,
       check_for_attn_weights,
       check_for_attn_mask,
       check_head_dim_size,
       check_gpu_sm75_or_greater,
-      check_for_nested_inputs,
       check_for_seq_len_1_nested_tensor}};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
@@ -259,15 +232,14 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       at::kHalf, at::kFloat, at::kBFloat16};
 
   //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 8> constraints{{
+  std::vector<std::function<bool(sdp_params, bool)>> constraints{
       check_gpu_sm50_or_greater,
       check_runtime_disabled_mem_efficient,
-      check_requires_grad_and_nested,
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask,
       check_for_seq_len_1_nested_tensor,
-      check_for_non_zero_dropout}};
+      check_for_non_zero_dropout};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
diff --git a/benchmarks/transformer/sdp_backwards.py b/benchmarks/transformer/sdp_backwards.py
deleted file mode 100644
index 2f745e157b280..0000000000000
--- a/benchmarks/transformer/sdp_backwards.py
+++ /dev/null
@@ -1,189 +0,0 @@
-import torch
-import numpy as np
-import random
-import torch.utils.benchmark as benchmark
-from torch.profiler import profile, record_function, ProfilerActivity
-
-
-class CompositeMHA(torch.nn.Module):
-    def __init__(self, num_heads, in_proj_weight, in_proj_bias, out_proj):
-        super().__init__()
-        self.in_proj_weight = in_proj_weight
-        self.in_proj_bias = in_proj_bias
-        self.out_proj = out_proj
-        self.num_heads = num_heads
-
-    def forward(self, query, key, value, mask):
-        if not (query is key and key is value):
-            raise NotImplementedError(
-                "query, key and value must be the same Tensor for now."
-            )
-        if mask is not None:
-            raise NotImplementedError("mask is currently not supported.")
-
-        query_projected = torch.nn.functional.linear(
-            query, self.in_proj_weight, self.in_proj_bias
-        )
-
-        batch_size = query_projected.size(0)
-        embed_dim = query_projected.size(2)
-        head_dim = embed_dim // (self.num_heads * 3)
-
-        query, key, value = query_projected.chunk(3, -1)
-
-        query = query.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
-
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        attn, _ = torch.nn.functional._scaled_dot_product_attention(
-            query,
-            key,
-            value,
-            attn_mask=None,
-            dropout_p=0.0,
-            need_attn_weights=False,
-            is_causal=False,
-        )
-
-        attn = attn.transpose(1, 2).reshape(batch_size, -1, self.num_heads * head_dim)
-        # Match return signature of nn.MHA
-        return self.out_proj(attn)
-
-
-def build_composite_mha_from_nn_mha(pt):
-    assert pt._qkv_same_embed_dim
-    in_proj_weight = pt.in_proj_weight
-    assert in_proj_weight is not None
-    assert pt.batch_first
-    return CompositeMHA(pt.num_heads, pt.in_proj_weight, pt.in_proj_bias, pt.out_proj)
-
-
-def forw_back(model, input, upward):
-    output = model(*input)
-    output.backward(upward)
-
-
-# Context manger not working in timer
-
-
-def forw_back_fused(model, input, upward):
-    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_mem_efficient=True):
-        output = model(*input)
-        output.backward(upward)
-
-
-def forw_back_eager(model, input, upward):
-    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_mem_efficient=False):
-        output = model(*input)
-        output.backward(upward)
-
-
-def run_timing(
-    min_run_time, batch_size, embed_dimension, num_heads, max_sequence_len, dtype
-):
-    dropout_p = 0.0
-    mask = None
-
-    pt = torch.nn.MultiheadAttention(
-        embed_dim=embed_dimension,
-        num_heads=num_heads,
-        batch_first=True,
-        dropout=dropout_p,
-    )
-    npt = pt.cuda().to(dtype)
-    cpt = build_composite_mha_from_nn_mha(npt)
-    x = torch.randn(
-        batch_size,
-        max_sequence_len,
-        embed_dimension,
-        dtype=dtype,
-        device="cuda",
-        requires_grad=True,
-    )
-
-    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_mem_efficient=True):
-        rand_fused_upward = cpt(x, x, x, mask).clone().detach()
-
-    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_mem_efficient=False):
-        rand_eager_upward = cpt(x, x, x, mask).clone().detach()
-
-    t0 = benchmark.Timer(
-        stmt="forw_back_fused(cpt, (x,x,x,mask), rand_fused_upward)",
-        globals={
-            "forw_back_fused": forw_back_fused,
-            "cpt": cpt,
-            "x": x,
-            "rand_fused_upward": rand_fused_upward,
-            "mask": mask,
-        },
-        label=f"Fused SDP forward and backward batch_size={batch_size} max_sequence_len={max_sequence_len} "
-        f"num_heads={num_heads} embed_dimension={embed_dimension} dtype={dtype}",
-        num_threads=torch.get_num_threads(),
-    )
-
-    t1 = benchmark.Timer(
-        stmt="forw_back_eager(cpt, (x,x,x,mask), rand_eager_upward)",
-        globals={
-            "forw_back_eager": forw_back_eager,
-            "cpt": cpt,
-            "x": x,
-            "rand_eager_upward": rand_eager_upward,
-            "mask": mask,
-        },
-        label=f"Eager SDP forward and backward batch_size={batch_size} max_sequence_len={max_sequence_len} "
-        f"num_heads={num_heads} embed_dimension={embed_dimension} dtype={dtype}",
-        num_threads=torch.get_num_threads(),
-    )
-
-    m0 = t0.blocked_autorange(min_run_time=min_run_time)
-    m1 = t1.blocked_autorange(min_run_time=min_run_time)
-
-    print(m0)
-    print(m1)
-
-    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
-
-    print("Profile for Fused".center(200, "-"))
-    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_mem_efficient=True):
-        with profile(
-            activities=activities, record_shapes=False, with_stack=True
-        ) as prof:
-            with record_function("Fused SDP forward and backward"):
-                for _ in range(20):
-                    forw_back(cpt, (x, x, x, mask), rand_fused_upward)
-    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
-
-    print("Profile for eager".center(200, "-"))
-    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_mem_efficient=False):
-        with profile(
-            activities=activities, record_shapes=False, with_stack=True
-        ) as prof:
-            with record_function("Fused SDP forward and backward"):
-                for _ in range(20):
-                    forw_back(cpt, (x, x, x, mask), rand_eager_upward)
-    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
-
-
-def main():
-    seed = 123
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    random.seed(seed)
-
-    min_run_time = 10
-    batch_size = 64
-    num_heads = 32
-    max_seq_len = 256
-    embed_dim = 1024
-    dtype = torch.bfloat16
-
-    print(
-        f"Running timing for batch_size={batch_size} max_sequence_len={max_seq_len} "
-        f"num_heads={num_heads} embed_dimension={embed_dim} dtype={dtype}"
-    )
-    run_timing(min_run_time, batch_size, embed_dim, num_heads, max_seq_len, dtype)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
index 853f5206969b3..90080ab0934f4 100644
--- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py
+++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
@@ -317,9 +317,6 @@
     ("aten::_upsample_nearest_exact1d_backward", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d_backward", datetime.date(2022, 12, 15)),
-    ("aten::_flash_scaled_dot_product_attention", datetime.date(2022, 12, 15)),
-    ("aten::_scaled_dot_product_attention_forward", datetime.date(2022, 12, 15)),
-    ("aten::_efficient_attention_backward", datetime.date(2022, 12, 15)),
     ("mkldnn::_convolution_pointwise.binary", datetime.date(2022, 12, 15)),
 ]
 
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index e9451b596b4ac..5e3aa1ff898f2 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -401,7 +401,6 @@ def wrapped_fn(*args, **kwargs):
         skip('nn.functional.max_unpool2d'),  # fails everywhere except on windows
         skip('nn.functional.max_unpool3d'),  # fails everywhere except on mac
         xfail("native_batch_norm"),
-        xfail('nn.functional._scaled_dot_product_attention', device_type='cuda'),
 
         xfail('nn.functional.rrelu'),  # in-place test errors out with no formula implemented
 
@@ -556,7 +555,6 @@ def f(inp, *args, **kwargs):
         xfail('nn.functional.ctc_loss'),  # Not Implemented
         xfail('native_layer_norm', ''),  # Expected a proper Tensor but got None for argument #1 'other'
         xfail('sparse.sampled_addmm', ''),  # sparse tensors have no strides
-        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
         # AssertionError: Tensor-likes are not close!
         # Mismatched elements: 1 / 15 (6.7%)
         # Greatest absolute difference: 24.0 at index (2, 4) (up to 1e-05 allowed)
@@ -651,7 +649,7 @@ def fn(inp, *args, **kwargs):
         skip("nn.functional.feature_alpha_dropout", "with_train"),  # calls random op
         skip("nn.functional.fractional_max_pool2d"),  # calls random op
         skip("nn.functional.fractional_max_pool3d"),  # calls random op
-        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
+        skip('nn.functional._scaled_dot_product_attention'),  # randomness
         # It looks like you're either (1) calling .item() on a Tensor or
         # (2) attempting to use a Tensor in some data-dependent control flow or
         # (3) encountering this error in PyTorch internals.
@@ -1128,7 +1126,6 @@ def test():
         skip('nn.functional.rrelu'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'with_train'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'without_train'),  # randomness
-        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
         skip('nn.functional.alpha_dropout'),  # randomness
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         skip('to_sparse', ''),  # non-dense output
@@ -1252,7 +1249,6 @@ def get_vjp(cotangents, *primals):
         xfail('nn.functional.soft_margin_loss', ''),  # NYI: forward-AD for log_sigmoid_backward
         xfail('nn.functional.ctc_loss', ''),  # NYI: forward-AD for _ctc_loss
         xfail('nn.functional.pdist', ''),  # NYI: forward-AD with _pdist_forward
-        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
         xfail('nn.functional.multi_margin_loss', ''),  # NYI: forward AD with multi_margin_loss
         skip('linalg.householder_product', '', device_type='cuda'),  # flaky, I'm not sure why
         xfail('sparse.sampled_addmm', ''),  # Sparse tensors have no strides
@@ -1373,7 +1369,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         xfail('nn.functional.dropout2d'),  # calls random op
         xfail('nn.functional.dropout3d'),  # calls random op
         xfail('nn.functional.dropout'),  # calls random op
-        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
+        skip('nn.functional._scaled_dot_product_attention'),  # randomness
         xfail('nn.functional.embedding_bag'),  # Forward AD not implemented and no decomposition
         xfail('nn.functional.alpha_dropout'),  # calls randomn op
         xfail('nn.functional.feature_alpha_dropout', 'with_train'),  # calls random op
diff --git a/test/test_meta.py b/test/test_meta.py
index 0e3cfb6ef1404..6d21d5c7bd75a 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -294,6 +294,7 @@ def test_tensor_outlives_converter(self):
     aten._fft_c2r.default,
     aten._fft_r2c.default,
     aten._linalg_svd.default,
+    aten._scaled_dot_product_attention_forward.default,
     aten.binary_cross_entropy.default,
     aten.complex.default,
     aten.copysign.Tensor,
diff --git a/test/test_transformers.py b/test/test_transformers.py
index f6bc0cc2d6639..abb4c71ec19ad 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -1059,11 +1059,6 @@ def rand_tensor(shape):
 
         if fused_kernel == "flash":
             with sdp_kernel(enable_mem_efficient=False, enable_math=False):
-                # TODO Flash for the nested path is currently not working due to cuda memory issues
-                if type == "nested":
-                    self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
-                        query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False))
-                    return
                 actual = torch.nn.functional._scaled_dot_product_attention(
                     query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
         elif fused_kernel == "mem_efficient":
@@ -1102,73 +1097,28 @@ def rand_tensor(shape):
 
     @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
     @parametrize("contiguous_inputs", [True, False])
-    def test_sdp_math_gradcheck(self, contiguous_inputs: bool):
+    def test_efficient_attention_gradcheck(self, contiguous_inputs: bool):
 
         batch_size, seq_len, num_heads, head_dim = 8, 8, 4, 64
-        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float64, requires_grad=True, packed=True)
+        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float16, requires_grad=True, packed=True)
 
         qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
         query, key, value = qkv.chunk(3, dim=-1)
-
-        query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        query = query.view(batch_size, -1, num_heads, head_dim)
+        key = key.view(batch_size, -1, num_heads, head_dim)
+        value = value.view(batch_size, -1, num_heads, head_dim)
 
         if contiguous_inputs:
             query = query.contiguous()
             key = key.contiguous()
             value = value.contiguous()
 
-        with sdp_kernel(enable_math=True, enable_mem_efficient=False, enable_flash=False):
-            assert gradcheck(lambda *args, **kwargs:
-                             wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
-                             (query, key, value, None, 0.0, False, False)
-                             )
-
-    @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
-    @parametrize("contiguous_inputs", [True, False])
-    def test_sdp_fused_grad_against_math(self, contiguous_inputs: bool):
-        batch_size, seq_len, num_heads, head_dim = 8, 8, 4, 64
-        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float64, requires_grad=True, packed=True)
-
-        qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
-        qkv_lp = qkv.detach().clone().to(torch.float32).requires_grad_()
-
-        query, key, value = qkv.chunk(3, dim=-1)
-        query_lp, key_lp, value_lp = qkv_lp.chunk(3, dim=-1)
-
-        query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
-
-        query_lp = query_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
-        key_lp = key_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
-        value_lp = value_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
-
-        if contiguous_inputs:
-            query = query.contiguous()
-            key = key.contiguous()
-            value = value.contiguous()
-
-            query_lp = query_lp.contiguous()
-            key_lp = key_lp.contiguous()
-            value_lp = value_lp.contiguous()
-
-        with sdp_kernel(enable_math=True, enable_mem_efficient=False, enable_flash=False):
-            out, atten = torch.nn.functional._scaled_dot_product_attention(query, key, value, None, 0.0, False, False)
-
-        with sdp_kernel(enable_math=False, enable_mem_efficient=True, enable_flash=False):
-            out_lp, atten_lp = torch.nn.functional._scaled_dot_product_attention(
-                query_lp, key_lp, value_lp, None, 0.0, False, False)
-
-        rand_upward = torch.rand_like(out)
-        rand_upward_lp = rand_upward.to(torch.float32)
-
-        out.backward(rand_upward)
-        out_lp.backward(rand_upward_lp)
-
-        # Cast up and compare
-        self.assertEqual(qkv.grad, qkv_lp.grad.to(torch.float64), atol=1e-5, rtol=1e-5)
+        # Normally we would transpose the inputs but the fused kernels expect
+        # (batch, seq_len, num_heads, head_dim) bump the tolerance since we can only run kernel
+        # in fp32
+        assert gradcheck(lambda *args, **kwargs:
+                         wrapper_set_seed(torch.ops.aten._efficient_attention_forward, *args, **kwargs),
+                         (query, key, value, None, None, None, True, False), fast_mode=True, atol=8e-5, rtol=1e-3)
 
     @parametrize("type", ["dense", "nested"])
     def test_fused_sdp_choice(self, type: str):
@@ -1194,7 +1144,7 @@ def test_fused_sdp_choice(self, type: str):
             value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
             key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
 
-            if SM80OrLater and not type == "nested":
+            if SM80OrLater:
                 assert torch._fused_sdp_choice(query, key, value) == SDPBackend.FLASH_ATTENTION
             else:
                 assert torch._fused_sdp_choice(query, key, value) == SDPBackend.EFFICIENT_ATTENTION
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 52c0f76bf0708..a0892b32a8352 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2613,13 +2613,9 @@
   nested_strides: non_differentiable
 
 # Transformers
-- name: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
-  output_differentiability: [True, False]
-  query, key, value: _scaled_dot_product_efficient_attention_backward(grad, query, key, value, result0, result1, is_causal)
-
 - name:  _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
   output_differentiability: [True, False]
-  query, key, value: _efficient_attention_backward(grad, query, key, value, result0, result1, causal)
+  query, key, value: _efficient_attention_backward(grad, query, key, value, result1, result0, causal)
 
 # fft
 - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 4edba78ec4ae1..4ccb5ef3840ff 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -12011,21 +12011,16 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             # This is only failing on Linux Bionic 3.10 Cuda 11.6
             DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes',
                          device_type='cuda', active_if=_get_torch_cuda_version() >= (11, 6)),
-            DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_noncontiguous_samples',
-                         device_type='cuda', dtypes=(torch.float32,)),
             # AssertionError: JIT Test does not execute any logic
             DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
             # Doesn't support autocasting
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensorNonErroring', 'test_fake_autocast', device_type='cpu'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake_autocast'),
-            # Forward works for dtype=float64 which is the math path
-            DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
             # No meta function
             DecorateInfo(unittest.skip("Skipped!"), 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
             DecorateInfo(unittest.skip("Skipped"), 'TestDecomp', 'test_comprehensive'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake'),
-            DecorateInfo(unittest.skip("Skipped!"), 'TestMeta', device_type='cuda'),
             DecorateInfo(unittest.skip('output is non-deterministic (when dropout_p > 0)'), 'TestCommon', 'test_compare_cpu'),),
     ),
     UnaryUfuncInfo(

From 6cc215323e1be403c0bb3ecd8394737dfb8f0ac7 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Sun, 20 Nov 2022 23:36:47 +0000
Subject: [PATCH 1118/1922] Temporary increase ASAN shard 5 to 4xlarge (#89387)

ASAN shard 5 also see OOM now https://hud.pytorch.org/pytorch/pytorch/commit/7b0d577c226fae78f377b26feab4122c4203ad59, may be we should increase all 5 of them to 4xlarge until https://github.com/pytorch/pytorch/issues/88309 is resolved
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89387
Approved by: https://github.com/kit1980
---
 .github/workflows/pull.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 3208cb198bb41..3642c7fc17691 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -78,7 +78,7 @@ jobs:
           { config: "default", shard: 2, num_shards: 5, runner: "linux.2xlarge" },
           { config: "default", shard: 3, num_shards: 5, runner: "linux.2xlarge" },
           { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 5, num_shards: 5, runner: "linux.2xlarge" },
+          { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge" },
           { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
         ]}
 

From 03978e6008b1f6608e247ac6294a5eb6c076b018 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Mon, 21 Nov 2022 00:58:03 +0000
Subject: [PATCH 1119/1922] use std/libdevice erf in inductor (#89388)

By itself, libdevice version of erf has the same perf as our decomposition, but in real workloads it leads to better fusion groups (due to fewer ops in the fused kernel).
Bonus: a few fp64 test skips removed, because our decomposition wasn't accurate enough for fp64, but libdevice version is.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89388
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor_opinfo.py |  4 ----
 torch/_inductor/codegen/cpp.py             |  4 ++++
 torch/_inductor/codegen/triton.py          |  4 ++++
 torch/_inductor/decomposition.py           | 24 ----------------------
 torch/_inductor/lowering.py                |  7 +++++++
 5 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 67b64c73a8ef5..188fcd8b67dc7 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -165,7 +165,6 @@ def process(device_type):
     "corrcoef": {f32, f64, i32, i64},
     "cov": {f32, f64, i32, i64},
     "equal": {b8, f16, f32, f64, i32, i64},
-    "erf": {b8, f64},
     "fft.fft": {f32, f64},
     "fft.fft2": {b8, f32, f64, i32, i64},
     "fft.fftn": {b8, f32, f64, i32, i64},
@@ -214,7 +213,6 @@ def process(device_type):
     "nn.functional.adaptive_avg_pool2d": {f16, f64},
     "nn.functional.ctc_loss": {f32, f64},
     "nn.functional.gaussian_nll_loss": {f32, f64},
-    "nn.functional.gelu": {f64},
     "nn.functional.local_response_norm": {i64},
     "nn.functional.one_hot": {i64},
     "nn.functional.pairwise_distance": {f16},
@@ -346,8 +344,6 @@ def process(device_type):
     "unique_consecutive": {b8, f16, f32, f64, i32, i64},
     "view_as_complex": {f16, f32, f64},
     # AssertionError: Tensor-likes are not close!
-    "erf": {b8, f64},
-    "nn.functional.gelu": {f64},
     "nn.functional.triplet_margin_loss": {f16},
 }
 
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 65a9335d6cbfc..cf8e6616d677a 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -356,6 +356,10 @@ def exp(x):
         # return f"Sleef_expf_u10({x})"
         return f"std::exp({x})"
 
+    @staticmethod
+    def erf(x):
+        return f"std::erf({x})"
+
     @staticmethod
     def sqrt(x):
         return f"std::sqrt({x})"
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index b79b03232a8a5..2504bd2dcf8c3 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -218,6 +218,10 @@ def masked(mask, body, other):
     def lgamma(x):
         return f"tl.libdevice.lgamma({x})"
 
+    @staticmethod
+    def erf(x):
+        return f"tl.libdevice.erf({x})"
+
     @staticmethod
     def logical_and(a, b):
         return f"{a} & {b}"
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 09ee53579345a..188072b3d4892 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -366,30 +366,6 @@ def round_dec(x, decimals=0):
     return aten.round(x * ten_pow_decimals) * (1.0 / ten_pow_decimals)
 
 
-@register_decomposition([aten.special_erf, aten.erf])
-def special_erf(x):
-    # TODO(jansel): this might be crazy slow.  Triton doesn't have the
-    #               cuda ::erf() builtin.  I've made a feature request for this,
-    #               so it may be coming soon.
-
-    # from https://www.johndcook.com/blog/2009/01/19/stand-alone-error-function-erf/
-    a1 = 0.254829592
-    a2 = -0.284496736
-    a3 = 1.421413741
-    a4 = -1.453152027
-    a5 = 1.061405429
-    p = 0.3275911
-
-    sign = torch.sign(x)
-    x = torch.abs(x)
-
-    # A & S 7.1.26
-    t = 1.0 / (1.0 + p * x)
-    y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * torch.exp(-x * x)
-
-    return sign * y
-
-
 @register_decomposition([aten.rsub.Tensor, aten.rsub.Scalar])
 def rsub(a, b):
     if isinstance(b, numbers.Number):
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 5168f37cd392c..a76a9baea953d 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -3505,6 +3505,13 @@ def sum_(x, axis=None, keepdims=False, *, dtype=None):
 register_pointwise(
     aten.lgamma, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
 )
+erf = register_pointwise(
+    aten.erf, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
+)
+register_lowering(
+    aten.special_erf, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
+)(erf)
+
 register_pointwise(
     aten.log,
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,

From 86d38cd4bdd33a2e3b23bc2684130f6232e3e662 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 21 Nov 2022 03:08:31 +0000
Subject: [PATCH 1120/1922] [vision hash update] update the pinned vision hash
 (#89287)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89287
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index cc0724ac842d1..80fe47b2cee2c 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-d710f3d1edc06afa244468cb96603ba6dbd4d9d5
+5b4f79d9ba8cbeeb8d6f0fbba3ba5757b718888b

From 2cace722deba8fce73926734764ffc110915be07 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Sun, 20 Nov 2022 22:54:45 +0000
Subject: [PATCH 1121/1922] [Easy] Use prepend arg to register forward hooks in
 quantize.py (#89391)

Differential Revision: [D41431110](https://our.internmc.facebook.com/intern/diff/D41431110)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89391
Approved by: https://github.com/awgu
---
 torch/ao/quantization/quantize.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
index 51eb2c1c1ec65..8b149b44ad3df 100644
--- a/torch/ao/quantization/quantize.py
+++ b/torch/ao/quantization/quantize.py
@@ -143,11 +143,13 @@ def register_activation_post_process_hook(module, pre_hook=False):
     assert hasattr(module, 'activation_post_process'), \
         'Expect activation_post_process attribute already attached to the module'
     if pre_hook:
-        handle = module.register_forward_pre_hook(_observer_forward_pre_hook)
-        module._forward_pre_hooks.move_to_end(handle.id, last=False)
+        handle = module.register_forward_pre_hook(
+            _observer_forward_pre_hook, prepend=True
+        )
     else:
-        handle = module.register_forward_hook(_observer_forward_hook)
-        module._forward_hooks.move_to_end(handle.id, last=False)
+        handle = module.register_forward_hook(
+            _observer_forward_hook, prepend=True
+        )
 
 
 def add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None):

From cfa9d795b3709e778c2bf790516eadb532dce7e4 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Sun, 20 Nov 2022 20:46:02 -0500
Subject: [PATCH 1122/1922] TorchDynamo: enable conv+relu6 fusion (#89265)

This PR is about enabled conv+relu6 which improves mobilenet'e performance.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89265
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 1 +
 torch/_inductor/overrides.py        | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index fc0ae82a2598f..d5c3cd673aca5 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -93,6 +93,7 @@ def has_bf16_support():
     torch.nn.Hardtanh(min_val=-0.5, max_val=4, inplace=False),
     torch.nn.GELU(approximate="none"),
     torch.nn.GELU(approximate="tanh"),
+    torch.nn.ReLU6(),
 ]
 
 
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 9a8bc6266ac01..cff3f6f470230 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -59,6 +59,8 @@ def __init__(self, op_name: str, scalars_attr=None, algorithm_attr=None):
         super(UnaryAttr, self).__init__()
 
     def __call__(self, unary_module: nn.Module):
+        if type(unary_module) is nn.ReLU6:
+            unary_module = nn.Hardtanh(min_val=0, max_val=6)
         assert all(hasattr(unary_module, item) for item in self.scalars_attr)
         scalars = [getattr(unary_module, item) for item in self.scalars_attr]
 
@@ -983,6 +985,7 @@ def rand_like(x, **kwargs):
     nn.LeakyReLU: UnaryAttr("leaky_relu", scalars_attr=["negative_slope"]),
     nn.Hardtanh: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]),
     nn.GELU: UnaryAttr("gelu", algorithm_attr="approximate"),
+    nn.ReLU6: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]),
 }
 
 
From 9078f2dfeacc5e16c5370d9d6fb123fd6aad0b96 Mon Sep 17 00:00:00 2001
From: "Wang, Eikan" <eikan.wang@intel.com>
Date: Mon, 21 Nov 2022 03:31:50 +0000
Subject: [PATCH 1123/1922] Redefine the simdlen semantic (#89263)

This PR is targeting to automatically enable vectorization optimization for TorchInductor. It refined the semantics of `config.cpp.simdlen`.

Originally, `None` means to disable vectorization while a specific value means the number of elements to be vectorized once time. But it depends on the data. Regarding 256bit SVE/SIMD ISA for ARM and X86, the `simdlen` should be 16 for Float while 32 for BFloat. Hence, this PR defined the `simdlen` as the bit width. The detailed semantics are as follows.

- **_simdlen = None_**: Automatically determine the SIMD bit width. Detect HW information and pick the proper vectorization ISA. Specific for X86, the priority of AVX512 is higher than AVX2.
- **_simdlen <=1_**: Explicitly disable SIMD
- **_simdlen > 1_**: Explicitly specify the SIMD bit width. It equals the disabled semantic if the bit width does not match the ISA width.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89263
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_minifier.py      |   4 +-
 test/inductor/test_torchinductor.py |  94 +++++++++++-
 torch/_inductor/codecache.py        | 215 +++++++++++++++++++++-------
 torch/_inductor/codegen/common.py   |   6 +
 torch/_inductor/codegen/cpp.py      |  92 +++++++++---
 5 files changed, 330 insertions(+), 81 deletions(-)

diff --git a/test/inductor/test_minifier.py b/test/inductor/test_minifier.py
index 55c0a1b6bb05f..18c5e5f33cade 100644
--- a/test/inductor/test_minifier.py
+++ b/test/inductor/test_minifier.py
@@ -24,7 +24,7 @@ def cpp_runtime_error(x):
 
 CPP_ACCURACY_ERROR = """\
 def cpp_accuracy_error(x):
-    return f"{x} + 1"
+    return f"{x} + decltype({x})(1)"
 """
 
 TRITON_COMPILE_ERROR = """\
@@ -60,8 +60,10 @@ def _gen_codegen_fn_patch_code(self, old_fn_name, new_fn_code, device):
             patch_code = f"""\
 import torch._inductor.codegen.{"cpp" if device == "cpu" else "triton"} as codegen
 overrides = codegen.{"CppOverrides" if device == "cpu" else "TritonOverrides"}
+vec_overrides = codegen.{"CppVecOverrides" if device == "cpu" else "TritonOverrides"}
 {new_fn_code}
 overrides.{old_fn_name} = staticmethod({new_fn_name})
+vec_overrides.{old_fn_name} = staticmethod({new_fn_name})
 """
         return f"""\
 {patch_code}
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index d5c3cd673aca5..3b47cd867c735 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4526,7 +4526,11 @@ def fn(x):
 
             v = torch.randn(10)
             result = fn(v)
-            assert same(result, mod(v))
+            # TODO: OMP parallel reduction order is not deterministic.
+            # Hence, the accurarcy might vary up and down. For short term,
+            # we increase the tolerance and will fix it later by using
+            # aten parallel.
+            assert same(result, mod(v), tol=5e-1)
 
         def test_inplace_add_alpha(self):
             def fn(x, y):
@@ -4596,7 +4600,79 @@ def test_complex_memory_overlap(self):
             self.assertFalse(complex_memory_overlap(gathered.t()))
 
         @unittest.skipIf(
-            not codecache.get_cpu_proc_info(), "Does not support vectorization"
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch.object(config, "dynamic_shapes", True)
+        @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+        @patch.object(functorch_config, "use_dynamic_shapes", True)
+        def test_vec_dynamic_shapes(self):
+            def fn(x):
+                return torch.softmax(x, -1)
+
+            value = torch.randn((2, 10))
+            with patch.object(config.cpp, "simdlen", None):
+                torch._dynamo.reset()
+                metrics.reset()
+                opt_fn = torch._dynamo.optimize("inductor")(fn)
+                opt_fn(value)
+
+                real_out = fn(value)
+                compiled_out = opt_fn(value)
+                assert same(real_out, compiled_out, equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count < 1
+
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_auto_simd(self):
+            vec_avx512 = codecache.supported_vec_isa_list[0]
+            vec_avx2 = codecache.supported_vec_isa_list[1]
+            self.assertTrue(vec_avx512.bit_width() == 512)
+            self.assertTrue(vec_avx2.bit_width() == 256)
+            self.assertTrue(vec_avx512.nelements() == 16)
+            self.assertTrue(vec_avx2.nelements() == 8)
+            self.assertTrue(vec_avx512.nelements(torch.bfloat16) == 32)
+            self.assertTrue(vec_avx2.nelements(torch.bfloat16) == 16)
+
+            with patch.object(config.cpp, "simdlen", None):
+                isa = codecache.pick_vec_isa()
+                if vec_avx512 in codecache.valid_vec_isa_list():
+                    self.assertTrue(isa == vec_avx512)
+                else:
+                    self.assertTrue(isa == vec_avx2)
+
+            with patch.object(config.cpp, "simdlen", 0):
+                isa = codecache.pick_vec_isa()
+                self.assertFalse(isa)
+
+            with patch.object(config.cpp, "simdlen", 1):
+                isa = codecache.pick_vec_isa()
+                self.assertFalse(isa)
+
+            with patch.object(config.cpp, "simdlen", 257):
+                isa = codecache.pick_vec_isa()
+                self.assertFalse(isa)
+
+            with patch.object(config.cpp, "simdlen", 513):
+                isa_list = codecache.valid_vec_isa_list()
+                if vec_avx512 in isa_list:
+                    self.assertFalse(isa)
+
+            with patch.object(config.cpp, "simdlen", 512):
+                isa_list = codecache.valid_vec_isa_list()
+                if vec_avx512 in isa_list:
+                    isa = codecache.pick_vec_isa()
+                    self.assertTrue(isa == vec_avx512)
+
+            with patch.object(config.cpp, "simdlen", 256):
+                isa_list = codecache.valid_vec_isa_list()
+                if vec_avx2 in isa_list:
+                    isa = codecache.pick_vec_isa()
+                    self.assertTrue(isa == vec_avx2)
+
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
         @patch("torch.cuda.is_available", lambda: False)
         def test_sign_cpu_only(self):
@@ -4607,7 +4683,7 @@ def fn(x):
             x[0, 0] = torch.nan
             x[1, -1] = torch.nan
 
-            with patch.object(config.cpp, "simdlen", 8):
+            with patch.object(config.cpp, "simdlen", None):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x)
@@ -4620,7 +4696,7 @@ def fn(x):
         # other platforms support, we just need to add the ISA info to the supported_vector_isa
         # and include proper aten vectorization head file.
         @unittest.skipIf(
-            not codecache.get_cpu_proc_info(), "Does not support vectorization"
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
         @patch("torch.cuda.is_available", lambda: False)
         def test_vec_kernel_cpu_only(self):
@@ -4659,7 +4735,15 @@ def fn(x1, x2):
             x1 = torch.randn((10, 20))
             x2 = torch.randn((10, 20))
 
-            with patch.object(config.cpp, "simdlen", 8):
+            with patch.object(config.cpp, "simdlen", 1):
+                torch._dynamo.reset()
+                metrics.reset()
+                traced = make_fx(fn)(x1, x2)
+                compiled = compile_fx_inner(traced, [x1, x2])
+                assert same(fn(x1, x2)[0], compiled([x1, x2])[0], equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count == 0
+
+            with patch.object(config.cpp, "simdlen", None):
                 torch._dynamo.reset()
                 metrics.reset()
                 traced = make_fx(fn)(x1, x2)
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 2826f35999126..232a611b06c6a 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -1,5 +1,5 @@
 import base64
-import enum
+import dataclasses
 import functools
 import getpass
 import hashlib
@@ -18,7 +18,7 @@
 from ctypes import cdll
 from threading import Thread
 from time import sleep, time
-from typing import Any, Dict
+from typing import Any, Callable, Dict, List
 
 import torch
 from torch.utils import cpp_extension
@@ -147,79 +147,181 @@ def is_gcc():
     return re.search(r"(gcc|g\+\+)", cpp_compiler())
 
 
-class _SupportedVecIsa(enum.Enum):
-    AVX512 = 1
-    AVX2 = 2
-    INVALID = -1
+class VecISA(object):
+    _bit_width: int
+    _macro: str
+    _arch_flags: str
+    _dtype_nelements: Dict[torch.dtype, int]
+
+    # TorchInductor CPU vectorization reuses PyTorch vectorization utility functions
+    # Hence, TorchInductor would depend on Sleef* to accelerate mathematical functions
+    # like exp, pow, sin, cos and etc.
+    # But PyTorch and TorchInductor might use different compilers to build code. If
+    # PyTorch uses gcc-7/g++-7 to build the release package, the libtorch_cpu.so
+    # will not expose the Sleef* AVX512 symbols since gcc-7/g++-7 cannot pass
+    # avx512 check in CMake - FindAVX.cmake. But TorchInductor install the latest
+    # gcc/g++ compiler by default while it could support the AVX512 compilation.
+    # Therefore, there would be a conflict sleef version between PyTorch and
+    # TorchInductor. Hence, we dry-compile the following code to check whether current
+    # HW platform and PyTorch both could support AVX512 or AVX2. And suppose ARM
+    # also needs the logic
+    _avx_code = """
+#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/vec.h>
+#endif
+
+__attribute__((aligned(64))) float in_out_ptr0[16] = {0.0};
+
+extern "C" void __avx_chk_kernel() {
+    auto tmp0 = at::vec::Vectorized<float>(1);
+    auto tmp1 = tmp0.exp();
+    tmp1.store(in_out_ptr0);
+}
+"""
+
+    _avx_py_load = """
+import torch
+from ctypes import cdll
+cdll.LoadLibrary("__lib_path__")
+"""
+
+    def bit_width(self):
+        return self._bit_width
+
+    def nelements(self, dtype: torch.dtype = torch.float):
+        return self._dtype_nelements[dtype]
 
+    def build_macro(self):
+        return self._macro
+
+    def build_arch_flags(self):
+        return self._arch_flags
+
+    def __hash__(self) -> int:
+        return hash(str(self))
+
+    @functools.lru_cache(None)
     def __bool__(self):
-        return self != _SupportedVecIsa.INVALID
+        key, input_path = write(VecISA._avx_code, "cpp", extra="")
+        from filelock import FileLock
+
+        lock_dir = get_lock_dir()
+        lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+        with lock:
+            output_path = input_path[:-3] + "so"
+            build_cmd = cpp_compile_command(
+                input_path, output_path, warning_all=False, vec_isa=self
+            ).split(" ")
+            try:
+                # Check build result
+                subprocess.check_output(build_cmd, stderr=subprocess.STDOUT)
+                subprocess.check_call(
+                    [
+                        "python",
+                        "-c",
+                        VecISA._avx_py_load.replace("__lib_path__", output_path),
+                    ],
+                    stderr=subprocess.DEVNULL,
+                )
+            except Exception as e:
+                return False
 
-    @staticmethod
-    def isa_str(supported_isa: enum.Enum):
-        if supported_isa == _SupportedVecIsa.AVX512:
-            return "avx512"
-        elif supported_isa == _SupportedVecIsa.AVX2:
-            return "avx2"
-        else:
-            return ""
+            return True
+
+
+@dataclasses.dataclass
+class VecAVX512(VecISA):
+    _bit_width = 512
+    _macro = "CPU_CAPABILITY_AVX512"
+    _arch_flags = "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma"
+    _dtype_nelements = {torch.float: 16, torch.bfloat16: 32}
+
+    def __str__(self) -> str:
+        return "avx512"
+
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
 
-    @staticmethod
-    def vec_macro(supported_isa: enum.Enum):
-        if supported_isa == _SupportedVecIsa.AVX512:
-            return "CPU_CAPABILITY_AVX512"
-        elif supported_isa == _SupportedVecIsa.AVX2:
-            return "CPU_CAPABILITY_AVX2"
-        else:
-            return ""
+
+@dataclasses.dataclass
+class VecAVX2(VecISA):
+    _bit_width = 256
+    _macro = "CPU_CAPABILITY_AVX2"
+    _arch_flags = "-mavx2 -mfma"
+    _dtype_nelements = {torch.float: 8, torch.bfloat16: 16}
+
+    def __str__(self) -> str:
+        return "avx2"
+
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+
+
+class InvalidVecISA(VecISA):
+    _bit_width = 0
+    _macro = ""
+    _arch_flags = ""
+    _dtype_nelements = {}
+
+    def __str__(self) -> str:
+        return "INVALID_VEC_ISA"
+
+    def __bool__(self):
+        return False
+
+    __hash__: Callable[[VecISA], Any] = VecISA.__hash__
+
+
+invalid_vec_isa = InvalidVecISA()
+supported_vec_isa_list = [VecAVX512(), VecAVX2()]
 
 
 # Cache the cpuinfo to avoid I/O overhead. Meanwhile, the cpuinfo content
 # might have too much redundant content that is useless for ISA check. Hence,
 # we only cache some key isa information.
-@functools.lru_cache(1)
-def get_cpu_proc_info():
+@functools.lru_cache(None)
+def valid_vec_isa_list():
     if sys.platform != "linux":
         return []
 
-    isa_info = []
+    isa_list = []
     with open("/proc/cpuinfo") as _cpu_info:
         _cpu_info_content = _cpu_info.read()
-        if _SupportedVecIsa.isa_str(_SupportedVecIsa.AVX512) in _cpu_info_content:
-            isa_info.append(_SupportedVecIsa.AVX512)
-
-        if _SupportedVecIsa.isa_str(_SupportedVecIsa.AVX2) in _cpu_info_content:
-            isa_info.append(_SupportedVecIsa.AVX2)
+        for isa in supported_vec_isa_list:
+            if str(isa) in _cpu_info_content and isa:
+                isa_list.append(isa)
+        return isa_list
 
-        return isa_info
 
+def pick_vec_isa():
+    _valid_vec_isa_list: List[VecISA] = valid_vec_isa_list()
+    if not _valid_vec_isa_list:
+        return invalid_vec_isa
 
-def supported_vector_isa():
-    # TODO: Add ARM Vec here.
-    # Dict(k: isa, v: number of float element)
-    vec_isa_info = {
-        _SupportedVecIsa.AVX512: 16,
-        _SupportedVecIsa.AVX2: 8,
-    }
+    # If the simdlen is None, it indicates determin the vectroization length automatically
+    if config.cpp.simdlen is None:
+        assert _valid_vec_isa_list
+        return _valid_vec_isa_list[0]
 
-    if config.cpp.simdlen is None or config.cpp.simdlen <= 1:
-        return _SupportedVecIsa.INVALID
-
-    cpu_info_content = get_cpu_proc_info()
-    for isa in vec_isa_info.keys():
-        if isa in cpu_info_content and config.cpp.simdlen == vec_isa_info[isa]:
+    for isa in _valid_vec_isa_list:
+        if config.cpp.simdlen == isa.bit_width():
             return isa
 
-    return _SupportedVecIsa.INVALID
+    return invalid_vec_isa
 
 
-def cpp_compile_command(input, output, include_pytorch=False):
-    valid_isa = supported_vector_isa()
-    if include_pytorch or valid_isa:
+def cpp_compile_command(
+    input,
+    output,
+    warning_all=True,
+    shared=True,
+    include_pytorch=False,
+    vec_isa: VecISA = invalid_vec_isa,
+):
+    if include_pytorch or vec_isa != invalid_vec_isa:
         ipaths = cpp_extension.include_paths() + [sysconfig.get_path("include")]
         lpaths = cpp_extension.library_paths() + [sysconfig.get_config_var("LIBDIR")]
         libs = ["c10", "torch", "torch_cpu", "torch_python", "gomp"]
-        macros = _SupportedVecIsa.vec_macro(valid_isa)
+        macros = vec_isa.build_macro()
         if macros:
             macros = f"-D{macros}"
     else:
@@ -235,11 +337,13 @@ def cpp_compile_command(input, output, include_pytorch=False):
     lpaths = " ".join(["-L" + p for p in lpaths])
     libs = " ".join(["-l" + p for p in libs])
 
+    shared_lib = "-shared -fPIC" if shared else ""
+    warning_all_flag = "-Wall" if warning_all else ""
     return re.sub(
         r"[ \n]+",
         " ",
         f"""
-            {cpp_compiler()} {input} -shared -fPIC -Wall -std=c++14 -Wno-unused-variable
+            {cpp_compiler()} {input} {shared_lib} {warning_all_flag} -std=c++14 -Wno-unused-variable
             {ipaths} {lpaths} {libs} {macros}
             -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp
             -D C10_USING_CUSTOM_GENERATED_MACROS
@@ -266,7 +370,12 @@ def _load_library(path):
 
     @classmethod
     def load(cls, source_code):
-        key, input_path = write(source_code, "cpp", extra=cpp_compile_command("i", "o"))
+        picked_vec_isa = pick_vec_isa()
+        key, input_path = write(
+            source_code,
+            "cpp",
+            extra=cpp_compile_command("i", "o", vec_isa=picked_vec_isa),
+        )
         if key not in cls.cache:
             from filelock import FileLock
 
@@ -276,7 +385,7 @@ def load(cls, source_code):
                 output_path = input_path[:-3] + "so"
                 if not os.path.exists(output_path):
                     cmd = cpp_compile_command(
-                        input=input_path, output=output_path
+                        input=input_path, output=output_path, vec_isa=picked_vec_isa
                     ).split(" ")
                     try:
                         subprocess.check_output(cmd, stderr=subprocess.STDOUT)
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index 2803970295ccc..cf98833964ca5 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -417,6 +417,12 @@ def __init__(self, name):
     def __str__(self):
         return self.name
 
+    def __hash__(self) -> int:
+        return hash(self.name)
+
+    def __eq__(self, other) -> bool:
+        return type(other) == type(self) and other.name == self.name
+
     def update_on_args(self, args, kwargs):
         pass
 
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index cf8e6616d677a..f82591ddff36f 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -620,7 +620,7 @@ def codegen_loops(self, code, worksharing):
         )
         reductions.mark_reduction(self.reduction_vars)
 
-        if config.cpp.simdlen:
+        if codecache.pick_vec_isa():
             # TODO(jansel): detect stride-1 dimension and vectorize that
             if reductions:
                 reductions.loops[-1].simd = True
@@ -711,7 +711,8 @@ class CppVecKernel(CppKernel):
 
     def __init__(self, args, num_threads):
         super(CppVecKernel, self).__init__(args, num_threads)
-        self.simd_len = config.cpp.simdlen
+        assert codecache.pick_vec_isa()
+        self.simd_nelements = codecache.pick_vec_isa().nelements()
         self.reduction_omp_dec: Dict[str, str] = {}
         metrics.generated_cpp_vec_kernel_count += 1
 
@@ -727,10 +728,10 @@ def is_var_irrevelant(self, var: sympy.Symbol, index: sympy.Expr):
 
     def transform_index(self, index: sympy.Expr):
         expanded_index = sympy.expand(index)
-        assert self.simd_len
-        assert self.simd_len > 0
+        assert self.simd_nelements
+        assert self.simd_nelements >= 1
         most_inner_var = self.itervars[-1]
-        replacement = {most_inner_var: most_inner_var * self.simd_len}
+        replacement = {most_inner_var: most_inner_var * self.simd_nelements}
         new_index = sympy_subs(expanded_index, replacement)
         return new_index
 
@@ -951,21 +952,24 @@ def __init__(self, args=None, num_threads=None):
         super(CppKernelProxy, self).__init__(args, num_threads)
         self.simd_vec_kernel: CppVecKernel = None
         self.simd_omp_kernel: CppKernel = None
+        self.picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa()
 
-    def vectorize_most_inner_loop(self, loop_nest):
-        loop_nest.split_most_inner_loop(config.cpp.simdlen)
+    def vectorize_most_inner_loop(self, loop_nest, dtype=torch.float):
+        assert self.picked_vec_isa
+        nelements = self.picked_vec_isa.nelements(dtype)
+        loop_nest.split_most_inner_loop(nelements)
         loop_with_tail = loop_nest.loops[-1]
         assert isinstance(loop_with_tail, LoopLevelWithTail)
 
         loop_with_tail.main_loop.simd_vec = True
 
         loop_with_tail.tail_loop.simd_omp = True
-        # We chope the loop into two cubes by the config.cpp.simdlen - main loop and tail loop.
+        # We chope the loop into two cubes by the nelements - main loop and tail loop.
         # Regarding the main loop, it is straightforward that it could be vectorized with
-        # config.cpp.simdlen. But for the tail loop, it still could be vectorized. For example,
-        # if the config.cpp.simdlen is 8(256bits), then the tail loop still could be vectorized
+        # nelements. But for the tail loop, it still could be vectorized. For example,
+        # if the nelements is 8(256bits), then the tail loop still could be vectorized
         # as 4(128bits).
-        loop_with_tail.tail_loop.simd_len = int(config.cpp.simdlen / 2)
+        loop_with_tail.tail_loop.simd_nelements = int(nelements / 2)
         loop_with_tail.tail_loop.simd_vec = False
 
         loop_with_tail.main_loop_body = self.simd_vec_kernel
@@ -975,7 +979,7 @@ def vectorize_most_inner_loop(self, loop_nest):
     def codegen_loops(self, code, worksharing):
         threads = parallel_num_threads()
 
-        if self.simd_vec_kernel is None:
+        if self.simd_vec_kernel is None or not self.picked_vec_isa:
             assert self.simd_omp_kernel
             return self.simd_omp_kernel.codegen_loops(code, worksharing)
 
@@ -997,12 +1001,52 @@ def codegen_loops(self, code, worksharing):
         ), LoopNest(loops[reduction_depth:])
         loops_nest_reduce.mark_reduction(self.simd_vec_kernel.reduction_vars)
 
-        if config.cpp.simdlen:
-            # TODO(jansel): detect stride-1 dimension and vectorize that
-            if loops_nest_reduce:
-                loops_nest_reduce.loops[-1].simd = True
-            elif loops_nest_non_reduce:
-                loops_nest_non_reduce.loops[-1].simd = True
+        assert self.picked_vec_isa
+        # Do not apply vectorization since the range of most inner is too small. Meanwhile,
+        # If the range of the most inner is less then the codecache.pick_vec_isa().nelements(),
+        # the generated code for some reduction will be as follows that leads to incrrect result.
+        #
+        #    LINE01:  float tmp1 = 0;
+        #    LINE02:  auto tmp1_vec = at::vec::Vectorized<float>(tmp1);
+        #    LINE03:  for(long i1=0; i1<2; i1+=1)
+        #    LINE04:  {
+        #    LINE05:      for(long i2=0; i2<0; i2+=1)
+        #    LINE06:      {
+        #    LINE07:          auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + (8*i0) + (16*i2) + (32*i1));
+        #    LINE08:          tmp1_vec += tmp0;
+        #    LINE09:      }
+        #    LINE10:      tmp1 = vec_reduce_all<float>([](Vectorized<float>& x, Vectorized<float>&y) {return x + y;}, tmp1_vec);
+        #    LINE11:      #pragma omp simd simdlen(8)  reduction(+:tmp1)
+        #    LINE12:      for(long i2=0; i2<8; i2+=1)
+        #    LINE13:      {
+        #    LINE14:          auto tmp0 = in_ptr0[i2 + (8*i0) + (32*i1)];
+        #    LINE15:          tmp1 += tmp0;
+        #    LINE16:      }
+        #    LINE17:  }
+        #    LINE18:  out_ptr3[i0] = tmp1;
+        #
+        # tmp1_vec(LINE02) will always be zero as it is initialized with tmp1 value and the range(LINE05)
+        # is 0. Hence, the LINE10 will always reset tmp1 to 0. But tmp1(LINE01) is global value. So the result
+        # will be incorrect. We skip thie case.
+        most_inner_loop = (
+            loops_nest_reduce.loops[-1]
+            if loops_nest_reduce
+            else loops_nest_non_reduce.loops[-1]
+        )
+        main_loop_range = ir.IndexingDiv(
+            most_inner_loop.size, self.picked_vec_isa.nelements()
+        )
+        loop_interval = sympy.simplify(main_loop_range)
+        # TODO(Eikan): To support dynamic shape.
+        if not loop_interval.is_integer or loop_interval <= 0:
+            metrics.generated_cpp_vec_kernel_count -= 1
+            return self.simd_omp_kernel.codegen_loops(code, worksharing)
+
+        # TODO(jansel): detect stride-1 dimension and vectorize that
+        if loops_nest_reduce:
+            loops_nest_reduce.loops[-1].simd = True
+        elif loops_nest_non_reduce:
+            loops_nest_non_reduce.loops[-1].simd = True
 
         par_depth = 0
         reduction_par_depth = 0
@@ -1142,8 +1186,7 @@ def can_fuse_vertical(cls, node1, node2):
         return cls.can_fuse_horizontal(node1, node2) and not node1.is_reduction()
 
     def can_vec(self, nodes):
-        # TODO: Query cpu arch and vec length from aten
-        if not codecache.supported_vector_isa():
+        if not codecache.pick_vec_isa():
             return False
 
         _, (group, reduction_group) = max(
@@ -1353,7 +1396,8 @@ class LoopLevel:
     steps: sympy.Expr = sympy.Integer(1)
     parallel: int = 0
     simd_omp: bool = False
-    simd_len: int = config.cpp.simdlen
+    picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa()
+    simd_nelements: int = picked_vec_isa.nelements() if picked_vec_isa else 0
     simd_vec: bool = False
     collapsed: bool = False
     reduction_vars: Dict[str, str] = None
@@ -1367,7 +1411,11 @@ def lines(self):
             )
         else:
             reduction = ""
-        simd = f"simd simdlen({self.simd_len}) " if self.simd_omp else ""
+        simd = (
+            f"simd simdlen({self.simd_nelements}) "
+            if self.simd_omp and self.simd_nelements > 1
+            else ""
+        )
         if self.parallel:
             # TODO(jansel): look into chunk size and other schedules
             line1 = f"#pragma omp for{reduction} "

From 14753b398e726aad461eb847c86cd2458c472bd8 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Sun, 20 Nov 2022 20:46:03 -0500
Subject: [PATCH 1124/1922] TorchDynamo: enable conv+silu fusion (#89278)

This PR will improve the tf_efficientnet_b0 performance by fusing conv+silu.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89278
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 aten/src/ATen/native/mkldnn/Utils.cpp | 1 +
 test/inductor/test_torchinductor.py   | 1 +
 torch/_inductor/overrides.py          | 1 +
 3 files changed, 3 insertions(+)

diff --git a/aten/src/ATen/native/mkldnn/Utils.cpp b/aten/src/ATen/native/mkldnn/Utils.cpp
index 5db6e0b07ff15..2c626884d8f0f 100644
--- a/aten/src/ATen/native/mkldnn/Utils.cpp
+++ b/aten/src/ATen/native/mkldnn/Utils.cpp
@@ -132,6 +132,7 @@ const std::map<c10::string_view, AttrFunction>& fusion_unary_attr_map() {
       {"relu", ATTR_FUNC(relu)},
       {"sigmoid", ATTR_FUNC(sigmoid)},
       {"tanh", ATTR_FUNC(tanh)},
+      {"swish", ATTR_FUNC(swish)},
       {"hardswish", ATTR_FUNC(hardswish)},
       {"leaky_relu", attr_func_leaky_relu},
       {"hardtanh", attr_func_hardtanh},
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 3b47cd867c735..399032890ca83 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -94,6 +94,7 @@ def has_bf16_support():
     torch.nn.GELU(approximate="none"),
     torch.nn.GELU(approximate="tanh"),
     torch.nn.ReLU6(),
+    torch.nn.SiLU(),
 ]
 
 
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index cff3f6f470230..5bd97cd5009a1 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -986,6 +986,7 @@ def rand_like(x, **kwargs):
     nn.Hardtanh: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]),
     nn.GELU: UnaryAttr("gelu", algorithm_attr="approximate"),
     nn.ReLU6: UnaryAttr("hardtanh", scalars_attr=["min_val", "max_val"]),
+    nn.SiLU: UnaryAttr("swish"),
 }
 
 
From 699eabeef4bfe5de94035e5f8d8bbc38f143b066 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Mon, 21 Nov 2022 09:52:34 +0000
Subject: [PATCH 1125/1922] Update ideep for future performance improvement
 (#87966)

**Summary**
The update includes API changes and optimzations to reduce framework overhead, which will benefit all mkldnn (onednn) ops in JIT mode and inductor CPU backend, etc. These benefits will be seen after switching to new ideep API by future PRs.

**Test plan**
For correctness, all UTs that call mkldnn ops, including test_ops.py, test_mkldnn*.py, test_quantization.py, etc.
For performance, TorchBench has been run and no regression is found. Results are shown below.
- Intel (R) Xeon (R) IceLake with 40 cores
- Use multi-instance
- Using tcmalloc & Intel OMP

![image](https://user-images.githubusercontent.com/12522207/201631004-bb77468d-953b-4757-a001-94d44615b5f6.png)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87966
Approved by: https://github.com/jgong5, https://github.com/XiaobingSuper
---
 third_party/ideep | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/ideep b/third_party/ideep
index ececd0a4f53c3..5ddc65efe0428 160000
--- a/third_party/ideep
+++ b/third_party/ideep
@@ -1 +1 @@
-Subproject commit ececd0a4f53c39f2d91caaddee0de1cd214f5b99
+Subproject commit 5ddc65efe0428bbce2942b3ce5e3ce15239abe2f

From 9566c74b1115d8ced3480c1fd2ddd73d594ea69d Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Mon, 21 Nov 2022 10:54:32 +0000
Subject: [PATCH 1126/1922] Move the OpInfo same-storage error to the autograd
 test (#88306)

This check was previously located at the `non_contiguous` test (quite
and odd location). Even more, at https://github.com/pytorch/pytorch/pull/86378#discussion_r993658395, Kshiteej found that this assert was not doing anything really.

We move it to the autograd test and make it a proper `self.assert`. We also disallow returning 1-tuples from sample_input functions, as they were breaking this assert.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88306
Approved by: https://github.com/mruberry
---
 test/test_ops.py                              |  5 ---
 test/test_testing.py                          | 17 +++-------
 .../_internal/common_methods_invocations.py   | 31 ++++++++-----------
 torch/testing/_internal/common_utils.py       |  9 ++++++
 4 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 5f9ad6ff43176..7e0a9952389ce 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -500,11 +500,6 @@ def test_noncontiguous_samples(self, device, dtype, op):
                 noncontig_sample.kwargs,
             )
 
-            # Verifies sample input tensors should have no grad or history
-            sample_tensor = t_inp if isinstance(t_inp, torch.Tensor) else t_inp[0]
-            assert sample_tensor.grad is None
-            assert sample_tensor.grad_fn is None
-
             # validates forward
             expected = op(t_inp, *t_args, **t_kwargs)
             actual = op(n_inp, *n_args, **n_kwargs)
diff --git a/test/test_testing.py b/test/test_testing.py
index f05883919f17c..6dc06a8a2aeb8 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -12,7 +12,7 @@
 import subprocess
 import sys
 import unittest.mock
-from typing import Any, Callable, Iterator, List, Tuple, Generator, Sequence
+from typing import Any, Callable, Iterator, List, Tuple, Generator
 
 import torch
 
@@ -1923,32 +1923,23 @@ def test_sample_input_metadata(self) -> None:
 # Tests that validate the various sample generating functions on each OpInfo.
 class TestOpInfoSampleFunctions(TestCase):
 
-    def _assert_is_generator_or_singleton(self, item, property_name):
-        if isinstance(item, Sequence):
-            msg = (
-                "{property_name} may only return lists for single items"
-                ", please use a coroutine which yields items instead")
-            self.assertTrue(len(item) <= 1, msg=msg)
-        else:
-            self.assertIsInstance(item, Generator)
-
     @ops(op_db, dtypes=OpDTypes.any_one)
     def test_opinfo_sample_generators(self, device, dtype, op):
         # Test op.sample_inputs doesn't generate multiple samples when called
         samples = op.sample_inputs(device, dtype)
-        self._assert_is_generator_or_singleton(samples, "sample_inputs_func")
+        self.assertIsInstance(samples, Generator)
 
     @ops([op for op in op_db if op.reference_inputs_func is not None], dtypes=OpDTypes.any_one)
     def test_opinfo_reference_generators(self, device, dtype, op):
         # Test op.reference_inputs doesn't generate multiple samples when called
         samples = op.reference_inputs(device, dtype)
-        self._assert_is_generator_or_singleton(samples, "reference_inputs_func")
+        self.assertIsInstance(samples, Generator)
 
     @ops([op for op in op_db if op.error_inputs_func is not None], dtypes=OpDTypes.none)
     def test_opinfo_error_generators(self, device, op):
         # Test op.error_inputs doesn't generate multiple inputs when called
         samples = op.error_inputs(device)
-        self._assert_is_generator_or_singleton(samples, "error_inputs_func")
+        self.assertIsInstance(samples, Generator)
 
 
 instantiate_device_type_tests(TestOpInfoSampleFunctions, globals())
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 4ccb5ef3840ff..0f845f7658298 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -958,11 +958,11 @@ def sample_inputs_sparse_sampled_addmm(op_info, device, dtype, requires_grad, **
 
 def sample_inputs_mv(self, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, low=None, high=None, requires_grad=requires_grad)
-    return (SampleInput(make_arg(S, M), make_arg(M)),)
+    yield SampleInput(make_arg(S, M), make_arg(M))
 
 def sample_inputs_bmm(self, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, dtype=dtype, device=device, low=None, high=None, requires_grad=requires_grad)
-    return (SampleInput(make_arg(M, S, M), make_arg(M, M, S)),)
+    yield SampleInput(make_arg(M, S, M), make_arg(M, M, S))
 
 def sample_inputs_dot_vdot(self, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -1569,9 +1569,9 @@ def sample_inputs_logcumsumexp(self, device, dtype, requires_grad, **kwargs):
             yield SampleInput(t, dim)
 
 def sample_inputs_trace(self, device, dtype, requires_grad, **kwargs):
-    return (SampleInput((make_tensor((S, S), dtype=dtype, device=device,
-                                     low=None, high=None,
-                                     requires_grad=requires_grad))),)
+    yield SampleInput((make_tensor((S, S), dtype=dtype, device=device,
+                                   low=None, high=None,
+                                   requires_grad=requires_grad)))
 
 
 def error_inputs_trace(op, device):
@@ -5281,6 +5281,11 @@ def error_inputs_complex(op_info, device, is_ref=False, **kwargs):
                                  out=make_arg(M, S, dtype=torch.complex64)),
                      error_type=RuntimeError, error_regex=error_out)
 
+def sample_inputs_logaddexp(op_info, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+    shape = (S, S)
+    yield SampleInput(make_arg(shape), make_arg(shape))
+
 def sample_inputs_prod(op_info, device, dtype, requires_grad, **kwargs):
     def make_arg(shape):
         # shrink values to be in the interval [-1, +1] for better precision in gradgradcheck
@@ -5322,7 +5327,7 @@ def error_inputs_neg(op_info, device, **kwargs):
     msg = ("Negation, the `\\-` operator, on a bool tensor is not supported."
            " If you are trying to invert a mask, use the `\\~` or"
            " `logical_not\\(\\)` operator instead.")
-    return (ErrorInput(si, error_regex=msg),)
+    yield ErrorInput(si, error_regex=msg)
 
 def sample_inputs_diag(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad, low=None, high=None)
@@ -8318,7 +8323,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
 
                # Tests that assume input is a tensor or sequence of tensors
-               DecorateInfo(unittest.expectedFailure, "TestCommon", "test_noncontiguous_samples"),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
@@ -9928,7 +9932,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_linspace,
            skips=(
                # Tests that assume input is a tensor or sequence of tensors
-               DecorateInfo(unittest.expectedFailure, "TestCommon", "test_noncontiguous_samples"),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
@@ -9956,7 +9959,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_logpace,
            skips=(
                # Tests that assume input is a tensor or sequence of tensors
-               DecorateInfo(unittest.expectedFailure, "TestCommon", "test_noncontiguous_samples"),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
@@ -10060,18 +10062,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            dtypesIfROCM=floating_types_and(torch.bfloat16),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
-           sample_inputs_func=lambda op_info, device, dtype, requires_grad=False, **kwargs:
-           (SampleInput(make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad),
-                        args=(make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad),)),)),
+           sample_inputs_func=sample_inputs_logaddexp),
     OpInfo('logaddexp2',
            dtypes=floating_types_and(torch.bfloat16),
            dtypesIfCUDA=floating_types_and(torch.bfloat16),
            dtypesIfROCM=floating_types_and(torch.bfloat16),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
-           sample_inputs_func=lambda op_info, device, dtype, requires_grad=False, **kwargs:
-           (SampleInput(make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad),
-                        args=(make_tensor((S, S), dtype=dtype, device=device, requires_grad=requires_grad),)),)),
+           sample_inputs_func=sample_inputs_logaddexp),
     UnaryUfuncInfo('logical_not',
                    ref=np.logical_not,
                    decorators=(precisionOverride({torch.bfloat16: 7e-1,
@@ -14573,7 +14571,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_ones_zeros,
            skips=(
                # Tests that assume input is a tensor or sequence of tensors
-               DecorateInfo(unittest.expectedFailure, "TestCommon", "test_noncontiguous_samples"),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
@@ -14594,7 +14591,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_ones_zeros,
            skips=(
                # Tests that assume input is a tensor or sequence of tensors
-               DecorateInfo(unittest.expectedFailure, "TestCommon", "test_noncontiguous_samples"),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
@@ -14615,7 +14611,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            sample_inputs_func=sample_inputs_full,
            skips=(
                # Tests that assume input is a tensor or sequence of tensors
-               DecorateInfo(unittest.expectedFailure, "TestCommon", "test_noncontiguous_samples"),
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.expectedFailure, 'TestMathBits', 'test_conj_view'),
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 35ec53381c1fb..e53887a5fdbb3 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -3749,6 +3749,15 @@ def is_inplace(variant):
                 all_args = tuple(chain((sample.input,), sample.args, sample.kwargs.values()))
             gradcheck_args = tuple(x for x in all_args if (isinstance(x, torch.Tensor) and x.requires_grad))
 
+            # Verifies sample input tensors should have no grad
+            # This may happen if the same tensor is used in two different SampleInputs
+            for t in gradcheck_args:
+                self.assertIsNone(t.grad,
+                                  "A sampled input has a gradient before running autograd. "
+                                  "This usually means that (at least) one input tensor is reused "
+                                  "across different SampleInputs. "
+                                  "Please create a new tensor for each SampleInput.")
+
             def _input_recomposition_helper(inputs, inp, input_idx):
                 if is_iterable_of_tensors(inp):
                     tensor_list = []

From a2030c28493cd3a840bbdd488ef7ae9bcb7bb49f Mon Sep 17 00:00:00 2001
From: Jiong Gong <jiong.gong@intel.com>
Date: Mon, 21 Nov 2022 14:20:33 +0000
Subject: [PATCH 1127/1922] [Inductor] Limit the number of compile threads to
 the available cpu cores (#89377)

`config.compile_threads` gets the number of compile threads via `min(32,os.cpu_count())` while `os.cpu_count()` is the total number of cpu cores in the system, not the available ones. This would cause compile thread contention when the available cpu cores are less than `min(32,os.cpu_count())`, e.g., available cpu cores are limited with numactl or taskset, making the compilation very slow. This PR tries to use `len(os.sched_getaffinity(0))` if `os.sched_getaffinity` is available which returns the available number of cpu cores.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89377
Approved by: https://github.com/soumith
---
 torch/_inductor/config.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index c552101c1caee..a0062c4fe4e25 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -59,7 +59,16 @@
 
 comment_origin = False
 
-compile_threads = min(32, os.cpu_count()) if sys.platform != "win32" else 1
+compile_threads = (
+    min(
+        32,
+        len(os.sched_getaffinity(0))
+        if hasattr(os, "sched_getaffinity")
+        else os.cpu_count(),
+    )
+    if sys.platform != "win32"
+    else 1
+)
 
 # If kernel is fused, the name is generated from the origin node op names
 # for larger kernels limit this

From 522feab0d16d43535b0613152a1bbc91858c7ba9 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 21 Nov 2022 16:38:20 +0000
Subject: [PATCH 1128/1922] Revert "[ao] maintain BC for
 is_activation_post_process (#89260)"

This reverts commit c5fafb4e1694f141d8a1a31142cce4049d9057ed.

Reverted https://github.com/pytorch/pytorch/pull/89260 on behalf of https://github.com/DanilBaibak due to breaking internal builds
---
 torch/ao/quantization/quantize.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
index 8b149b44ad3df..ae080ccaa2ca2 100644
--- a/torch/ao/quantization/quantize.py
+++ b/torch/ao/quantization/quantize.py
@@ -27,12 +27,7 @@
     float_qparams_weight_only_qconfig_4bit,
     _activation_is_memoryless)
 from torch.nn.utils.parametrize import type_before_parametrizations
-
-from torch.ao.quantization.observer import (  # noqa: F401
-    _is_activation_post_process,
-    _is_activation_post_process as is_activation_post_process,
-    # TODO remove this once problems from name change are resolved
-)
+from torch.ao.quantization.observer import _is_activation_post_process
 
 __all__ = [
     "get_default_custom_config_dict",

From 6eed66e9756608d0a8a1224a196f56d63d6faba3 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 21 Nov 2022 16:48:26 +0000
Subject: [PATCH 1129/1922] Revert "[ao] making _is_activation_post_process
 private (#87520)"

This reverts commit 45c62a337756ff9db97cd64d2d42d9e65dda0a85.

Reverted https://github.com/pytorch/pytorch/pull/87520 on behalf of https://github.com/bigfootjon due to Diff reverted internally
---
 test/allowlist_for_publicAPI.json                   | 4 ++--
 test/quantization/ao_migration/test_ao_migration.py | 2 +-
 test/quantization/ao_migration/test_quantization.py | 2 +-
 test/quantization/fx/test_quantize_fx.py            | 6 +++---
 torch/ao/ns/fx/graph_passes.py                      | 4 ++--
 torch/ao/ns/fx/utils.py                             | 8 ++++----
 torch/ao/quantization/__init__.py                   | 1 +
 torch/ao/quantization/fx/_model_report/detector.py  | 4 ++--
 torch/ao/quantization/fx/convert.py                 | 6 +++---
 torch/ao/quantization/fx/prepare.py                 | 4 ++--
 torch/ao/quantization/fx/qconfig_mapping_utils.py   | 6 +++---
 torch/ao/quantization/fx/utils.py                   | 6 +++---
 torch/ao/quantization/observer.py                   | 2 +-
 torch/ao/quantization/quantize.py                   | 9 +++++++--
 torch/quantization/quantize.py                      | 2 +-
 15 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index 2e1394a72e172..94ff57700af67 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -786,7 +786,7 @@
     "get_quantized_operator",
     "get_static_quant_module_class",
     "get_unique_devices_",
-    "_is_activation_post_process",
+    "is_activation_post_process",
     "load_observer_state_dict",
     "no_observer_set",
     "prepare",
@@ -894,7 +894,7 @@
     "convert",
     "get_observer_dict",
     "get_unique_devices_",
-    "_is_activation_post_process",
+    "is_activation_post_process",
     "prepare",
     "prepare_qat",
     "propagate_qconfig_",
diff --git a/test/quantization/ao_migration/test_ao_migration.py b/test/quantization/ao_migration/test_ao_migration.py
index 260ab32056f61..accb13da0dcbd 100644
--- a/test/quantization/ao_migration/test_ao_migration.py
+++ b/test/quantization/ao_migration/test_ao_migration.py
@@ -19,7 +19,7 @@ def test_function_import_quantize(self):
             'convert',
             'get_observer_dict',
             'get_unique_devices_',
-            '_is_activation_post_process',
+            'is_activation_post_process',
             'prepare',
             'prepare_qat',
             'propagate_qconfig_',
diff --git a/test/quantization/ao_migration/test_quantization.py b/test/quantization/ao_migration/test_quantization.py
index 95c5c7bd60150..9c246e1b7cd89 100644
--- a/test/quantization/ao_migration/test_quantization.py
+++ b/test/quantization/ao_migration/test_quantization.py
@@ -22,7 +22,7 @@ def test_function_import_quantize(self):
             'convert',
             'get_observer_dict',
             'get_unique_devices_',
-            '_is_activation_post_process',
+            'is_activation_post_process',
             'prepare',
             'prepare_qat',
             'propagate_qconfig_',
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 2d91ba80b7e02..bab4467894e20 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -55,6 +55,7 @@
     get_default_qat_qconfig,
     get_default_qconfig_mapping,
     get_default_qat_qconfig_mapping,
+    is_activation_post_process,
     fuse_modules,
     fuse_modules_qat,
     prepare,
@@ -147,7 +148,6 @@
     default_fixed_qparams_range_0to1_observer,
     default_fixed_qparams_range_neg1to1_observer,
     MinMaxObserver,
-    _is_activation_post_process,
 )
 
 # test utils
@@ -3249,7 +3249,7 @@ def _check_node_not_observed(model, arg_node, node):
                     _check_node_not_observed(model, new_node, node)
             elif arg_node.op == "call_module":
                 self.assertTrue(
-                    not _is_activation_post_process(getattr(model, arg_node.target)),
+                    not is_activation_post_process(getattr(model, arg_node.target)),
                     "Arg: {0} of node: {1} is observed but is not a float tensor".format(
                         arg_node, node
                     ),
@@ -5008,7 +5008,7 @@ def forward(self, x):
                 qconfig_dict = func(backend)
                 m = prepare_fx(m, qconfig_dict, example_inputs=(torch.randn(1, 1, 1, 1)))
                 for name, mod in m.named_modules():
-                    if _is_activation_post_process(mod) and mod.dtype == torch.quint8:
+                    if is_activation_post_process(mod) and mod.dtype == torch.quint8:
                         if backend == "fbgemm":
                             lower_bnd = 0
                             upper_bnd = 127
diff --git a/torch/ao/ns/fx/graph_passes.py b/torch/ao/ns/fx/graph_passes.py
index 3f4e156859024..c78b19d2701b1 100644
--- a/torch/ao/ns/fx/graph_passes.py
+++ b/torch/ao/ns/fx/graph_passes.py
@@ -24,7 +24,7 @@
 from torch.ao.ns.fx.mappings import (
     get_node_type_to_io_type_map,
 )
-from torch.ao.quantization.observer import _is_activation_post_process
+from torch.ao.quantization.quantize import is_activation_post_process
 
 from typing import Dict, Tuple, Callable, List, Any, Union, Optional, Set
 
@@ -38,7 +38,7 @@ def _maybe_get_fqn(node: Node, gm: GraphModule) -> Optional[str]:
         if node.op == 'call_module':
             assert isinstance(node.target, str)
             module = getattr_from_fqn(gm, node.target)
-            if _is_activation_post_process(module):
+            if is_activation_post_process(module):
                 node_to_use_for_fqn = get_normalized_nth_input(node, gm, 0)
         fqn = gm._node_name_to_scope[node_to_use_for_fqn.name][0]  # type: ignore[index]
     return fqn  # type: ignore[return-value]
diff --git a/torch/ao/ns/fx/utils.py b/torch/ao/ns/fx/utils.py
index 90574dc20248d..2993764b8a124 100644
--- a/torch/ao/ns/fx/utils.py
+++ b/torch/ao/ns/fx/utils.py
@@ -13,10 +13,10 @@
 from torch.fx.graph import Node
 from torch.ao.quantization import (
     ObserverBase,
-    FakeQuantizeBase
+    FakeQuantizeBase,
 )
-from torch.ao.quantization.observer import _is_activation_post_process
 from torch.ao.quantization.utils import getattr_from_fqn
+from torch.ao.quantization.quantize import is_activation_post_process
 
 from .ns_types import NSNodeTargetType, NSResultsType
 
@@ -256,14 +256,14 @@ def return_first_non_observer_node(
     """
     if node.op == "call_module":
         node_obj = getattr_from_fqn(gm, node.target)  # type: ignore[arg-type]
-        if _is_activation_post_process(node_obj):
+        if is_activation_post_process(node_obj):
             assert len(node.args) == 1
             assert isinstance(node.args[0], Node)
             node = node.args[0]
             # code duplication intended, not worth refactoring
             assert isinstance(node.target, str)
             node_obj = getattr_from_fqn(gm, node.target)
-            if _is_activation_post_process(node_obj):
+            if is_activation_post_process(node_obj):
                 assert len(node.args) == 1
                 assert isinstance(node.args[0], Node)
                 node = node.args[0]
diff --git a/torch/ao/quantization/__init__.py b/torch/ao/quantization/__init__.py
index bc8403f32af8f..1ba2a60ed3d12 100644
--- a/torch/ao/quantization/__init__.py
+++ b/torch/ao/quantization/__init__.py
@@ -114,6 +114,7 @@
     "get_quantized_operator",
     "get_static_quant_module_class",
     "get_unique_devices_",
+    "is_activation_post_process",
     "load_observer_state_dict",
     "no_observer_set",
     "per_channel_weight_observer_range_neg_127_to_127",
diff --git a/torch/ao/quantization/fx/_model_report/detector.py b/torch/ao/quantization/fx/_model_report/detector.py
index d398819ddcdd5..c92733bbc1c32 100644
--- a/torch/ao/quantization/fx/_model_report/detector.py
+++ b/torch/ao/quantization/fx/_model_report/detector.py
@@ -23,7 +23,7 @@
     default_equalization_qconfig,
     EqualizationQConfig,
 )
-from torch.ao.quantization.observer import _is_activation_post_process
+from torch.ao.quantization.quantize import is_activation_post_process
 
 # Names for observer insert keys
 DETECTOR_TARGET_NODE_KEY = "target_node"
@@ -1273,7 +1273,7 @@ def _supports_insertion(self, module: nn.Module) -> bool:
         # case for insertion of module
         # check if the module has any children and isn't observer
         num_children = len(list(module.children()))
-        return num_children == 0 and not _is_activation_post_process(module)
+        return num_children == 0 and not is_activation_post_process(module)
 
     def get_qconfig_info(self, model) -> Dict[str, DetectorQConfigInfo]:
         r""" Returns the DetectorQConfigInfo for each module_fqn relavent
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index f09785679e37b..faa267c492c68 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -64,6 +64,7 @@
 )
 from torch.ao.quantization.quantize import (
     _remove_qconfig,
+    is_activation_post_process,
 )
 from torch.ao.quantization.stubs import DeQuantStub
 from .custom_config import (
@@ -73,7 +74,6 @@
 from .lower_to_fbgemm import lower_to_fbgemm
 # importing the lib so that the quantized_decomposed ops are registered
 from ._decomposed import quantized_decomposed_lib  # noqa: F401
-from torch.ao.quantization.observer import _is_activation_post_process
 
 
 # TODO: revisit this list. Many helper methods shouldn't be public
@@ -359,7 +359,7 @@ def maybe_get_observer_for_node(
     for maybe_obs_node, _ in node.users.items():
         if maybe_obs_node.op == 'call_module':
             maybe_obs = modules[str(maybe_obs_node.target)]
-            if _is_activation_post_process(maybe_obs):
+            if is_activation_post_process(maybe_obs):
                 return maybe_obs
     return None
 
@@ -787,7 +787,7 @@ def convert(
         elif node.op == "call_module":
             mod = _get_module(node, modules)
             assert mod is not None
-            if _is_activation_post_process(mod):
+            if is_activation_post_process(mod):
                 observed_node = node.args[0]
                 if observed_node in statically_quantized_custom_module_nodes:
                     _replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 005a9cef45e34..c908e3f3b7644 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -16,7 +16,6 @@
 )
 from ..observer import (
     ObserverBase,
-    _is_activation_post_process
 )
 from ..qconfig import (
     _is_reuse_input_qconfig,
@@ -79,6 +78,7 @@
 )
 
 from torch.ao.quantization.quantize import (
+    is_activation_post_process,
     convert
 )
 
@@ -148,7 +148,7 @@
 
 def is_activation_post_process_node(node: Node, modules: Dict[str, torch.nn.Module]) -> bool:
     return isinstance(node, torch.fx.Node) and node.op == "call_module" and \
-        _is_activation_post_process(modules[str(node.target)])
+        is_activation_post_process(modules[str(node.target)])
 
 def is_input_arg_dtype_supported_by_backend(
     arg: Argument,
diff --git a/torch/ao/quantization/fx/qconfig_mapping_utils.py b/torch/ao/quantization/fx/qconfig_mapping_utils.py
index 26c7effd44dbf..0b0407c0b106e 100644
--- a/torch/ao/quantization/fx/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -3,8 +3,8 @@
 from typing import Callable, Any, Dict, Tuple, Set, List
 from torch.ao.quantization import QConfig
 from torch.ao.quantization.qconfig import _add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals
-from torch.ao.quantization.observer import (
-    _is_activation_post_process,
+from torch.ao.quantization.quantize import (
+    is_activation_post_process,
 )
 from torch.ao.quantization.backend_config import (
     DTypeConfig,
@@ -158,7 +158,7 @@ def generate_node_name_to_qconfig(
 
         elif node.op == 'call_module':
             # if the node is an observer, just continue - don't add it to the qconfig_map
-            if _is_activation_post_process(modules[node.target]):
+            if is_activation_post_process(modules[node.target]):
                 continue
             qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
                 qconfig_mapping, type(modules[node.target]), node.target, global_qconfig)
diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index b8bfa4c9d053c..73fdb0700144d 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -30,7 +30,7 @@
     is_per_channel,
     to_underlying_dtype,
 )
-from torch.ao.quantization.observer import _is_activation_post_process
+from torch.ao.quantization.quantize import is_activation_post_process
 
 from torch.fx import GraphModule, map_arg
 
@@ -447,7 +447,7 @@ def all_node_args_have_no_tensors(node: Node, modules: Dict[str, torch.nn.Module
         result = False
     elif node.op == 'call_module':
         assert isinstance(node.target, str)
-        if _is_activation_post_process(modules[node.target]):
+        if is_activation_post_process(modules[node.target]):
             result = all_node_args_have_no_tensors(node.args[0], modules, cache)  # type: ignore[arg-type]
     elif node.op == 'call_module':
         result = False
@@ -1040,7 +1040,7 @@ def _activation_post_process_satisfies_dtype_config_constraints(
     satisfies_constraints = True
     if activation_post_process_ctr is not None:
         activation_post_process = activation_post_process_ctr()
-        assert _is_activation_post_process(activation_post_process)
+        assert is_activation_post_process(activation_post_process)
         # If dtypes don't match, don't check the activation_post_process and return True early
         if activation_post_process.dtype != dtype_with_constraints.dtype:
             return True
diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py
index 42962fe7c29a0..ea2a26bf3896d 100644
--- a/torch/ao/quantization/observer.py
+++ b/torch/ao/quantization/observer.py
@@ -1442,7 +1442,7 @@ def _is_observer_script_module(mod, obs_type_name):
 def _is_activation_post_process(module):
     return (
         isinstance(module, torch.ao.quantization.ObserverBase)
-        or isinstance(module, torch.ao.quantization.FakeQuantizeBase)
+        or isinstance(module, torch.ao.quantization.FakeQuantize)
         or _is_observer_script_module(module, "quantization.observer")
     )
 
diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py
index ae080ccaa2ca2..d18f93987465c 100644
--- a/torch/ao/quantization/quantize.py
+++ b/torch/ao/quantization/quantize.py
@@ -27,10 +27,10 @@
     float_qparams_weight_only_qconfig_4bit,
     _activation_is_memoryless)
 from torch.nn.utils.parametrize import type_before_parametrizations
-from torch.ao.quantization.observer import _is_activation_post_process
 
 __all__ = [
     "get_default_custom_config_dict",
+    "is_activation_post_process",
     "propagate_qconfig_",
     "register_activation_post_process_hook",
     "add_observer_",
@@ -62,6 +62,11 @@ def get_default_custom_config_dict():
     """
     return _DEFAULT_CUSTOM_CONFIG_DICT
 
+def is_activation_post_process(module):
+    return (isinstance(module, torch.ao.quantization.ObserverBase) or
+            isinstance(module, torch.ao.quantization.FakeQuantizeBase))
+
+
 def _propagate_qconfig_helper(module, qconfig_dict,
                               qconfig_parent=None, prefix='', prepare_custom_config_dict=None):
     r"""This is a helper function for `propagate_qconfig_`
@@ -319,7 +324,7 @@ def _remove_activation_post_process(module):
     # TODO: maybe we should change activation_post_process to _activation_post_process
     # to prevent it from being used by user
     if hasattr(module, 'activation_post_process') and \
-       _is_activation_post_process(module.activation_post_process):
+       is_activation_post_process(module.activation_post_process):
         delattr(module, 'activation_post_process')
 
     # remove activation_post_proceess pre and post hooks
diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py
index 24d7049ec50ec..d9fcf1d04d8ba 100644
--- a/torch/quantization/quantize.py
+++ b/torch/quantization/quantize.py
@@ -17,7 +17,7 @@
 from torch.ao.quantization.quantize import convert
 from torch.ao.quantization.quantize import get_observer_dict
 from torch.ao.quantization.quantize import get_unique_devices_
-from torch.ao.quantization.quantize import _is_activation_post_process
+from torch.ao.quantization.quantize import is_activation_post_process
 from torch.ao.quantization.quantize import prepare
 from torch.ao.quantization.quantize import prepare_qat
 from torch.ao.quantization.quantize import propagate_qconfig_

From ff972ec8808d99cbe25842683fb9a9cdd4b38595 Mon Sep 17 00:00:00 2001
From: Xu Zhao <xzhao9@meta.com>
Date: Mon, 21 Nov 2022 17:25:28 +0000
Subject: [PATCH 1130/1922] Port torchdynamo's torchbench script to
 userbenchmark (#89239)

Summary:
This Diff ports the torchbench.py script from torchdynamo to torchbench to support the development of internal models.

Currently, only works with the `--only` option, and can only test one model at a time.

Note that the noisy logs are from upstream model code, not the benchmark code.
In the internal environment, `torch._dynamo.config.base_dir` is not writable, so we add an option to specify the output directory.

Test Plan:
```
$ buck2 run mode/opt //caffe2/benchmarks/dynamo:torchbench -- --performance --only ads_dhen_5x --part over --output-directory /tmp/tb-test/
cuda eval  ads_dhen_5x
  1/  1 +0 frames   2s  1 graphs  1 graph calls  412/ 411 = 100% ops 100% time
```

```
$  buck2 run mode/opt //caffe2/benchmarks/dynamo:torchbench -- --performance --only cmf_10x --part over --output-directory /tmp/tb-test/
cuda eval  cmf_10x
  1/  1 +0 frames   1s  1 graphs  1 graph calls  306/ 305 = 100% ops 100% time
```

Reviewed By: jansel

Differential Revision: D41294311

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89239
Approved by: https://github.com/jansel
---
 benchmarks/dynamo/common.py     | 39 ++++++++++++++++++++++++++++-----
 benchmarks/dynamo/torchbench.py | 21 +++++++++++++++---
 2 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index f4d1bfad37d71..8731d545c456a 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1505,6 +1505,15 @@ def get_example_inputs(self):
         "--output",
         help="Overrides the output filename",
     )
+    parser.add_argument(
+        "--output-directory",
+        help="Overrides the directory to place output files.",
+    )
+    parser.add_argument(
+        "--part",
+        default=None,
+        help="Specify the part of the model to run.",
+    )
     parser.add_argument(
         "--export-profiler-trace",
         action="store_true",
@@ -1918,7 +1927,12 @@ def run(runner, args, original_dir=None):
         output_filename = args.output
 
     if output_filename:
-        output_filename = os.path.join(torch._dynamo.config.base_dir, output_filename)
+        if args.output_directory:
+            output_filename = os.path.join(args.output_directory, output_filename)
+        else:
+            output_filename = os.path.join(
+                torch._dynamo.config.base_dir, output_filename
+            )
 
     if args.find_batch_sizes and args.only:
         for device in args.devices:
@@ -1955,11 +1969,24 @@ def run(runner, args, original_dir=None):
                 example_inputs = tree_map(lambda x: x.to(device=device), example_inputs)
             else:
                 try:
-                    device, name, model, example_inputs, batch_size = runner.load_model(
-                        device,
-                        model_name,
-                        batch_size=batch_size,
-                    )
+                    if args.part:
+                        (
+                            device,
+                            name,
+                            model,
+                            example_inputs,
+                            batch_size,
+                        ) = runner.load_model(
+                            device, model_name, batch_size=batch_size, part=args.part
+                        )
+                    else:
+                        (
+                            device,
+                            name,
+                            model,
+                            example_inputs,
+                            batch_size,
+                        ) = runner.load_model(device, model_name, batch_size=batch_size)
                 except NotImplementedError as e:
                     print(e)
                     import traceback
diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py
index b7d4a3be7933d..24a049f14ba2a 100755
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@@ -227,12 +227,16 @@ def load_model(
         device,
         model_name,
         batch_size=None,
+        part=None,
     ):
 
         is_training = self.args.training
         use_eval_mode = self.args.use_eval_mode
         dynamic_shapes = self.args.dynamic_shapes
-        module = importlib.import_module(f"torchbenchmark.models.{model_name}")
+        try:
+            module = importlib.import_module(f"torchbenchmark.models.{model_name}")
+        except ModuleNotFoundError:
+            module = importlib.import_module(f"torchbenchmark.models.fb.{model_name}")
         benchmark_cls = getattr(module, "Model", None)
         if not hasattr(benchmark_cls, "name"):
             benchmark_cls.name = model_name
@@ -248,13 +252,24 @@ def load_model(
 
         # workaround "RuntimeError: not allowed to set torch.backends.cudnn flags"
         torch.backends.__allow_nonbracketed_mutation_flag = True
+        extra_args = []
+        if part:
+            extra_args = ["--part", part]
         if is_training:
             benchmark = benchmark_cls(
-                test="train", device=device, jit=False, batch_size=batch_size
+                test="train",
+                device=device,
+                jit=False,
+                batch_size=batch_size,
+                extra_args=extra_args,
             )
         else:
             benchmark = benchmark_cls(
-                test="eval", device=device, jit=False, batch_size=batch_size
+                test="eval",
+                device=device,
+                jit=False,
+                batch_size=batch_size,
+                extra_args=extra_args,
             )
         if dynamic_shapes:
             if not hasattr(benchmark, "get_dynamic_shapes_module"):

From 5695b1645fe0bd3ed4da9a919c521bf4d4fa3350 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Mon, 21 Nov 2022 09:23:16 -0800
Subject: [PATCH 1131/1922] update kineto pinned commit (#89435)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89435
Approved by: https://github.com/malfet
---
 third_party/kineto | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/kineto b/third_party/kineto
index 0703c78999061..6c1629809068e 160000
--- a/third_party/kineto
+++ b/third_party/kineto
@@ -1 +1 @@
-Subproject commit 0703c78999061b8329dfab7ec5046fc5764a5573
+Subproject commit 6c1629809068efd78a8d56b4aa479c7ec49ae562

From 3c12655a45cb631601daebb1ac30a92a4ef343be Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Mon, 21 Nov 2022 20:02:09 +0000
Subject: [PATCH 1132/1922] Update sdp dispatch logic to enable fused backward
 (#89154)

# Summary
Reorganizes how the sdp dispatch logic is down in order to enable backwards for fused kernels

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89154
Approved by: https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml    |  52 ++---
 .../cuda/NestedTensorTransformerFunctions.cpp | 100 ++++++---
 .../ATen/native/transformers/attention.cpp    |  65 ++++--
 .../native/transformers/cuda/attention.cu     |  46 ++---
 .../transformers/cuda/attention_backward.cu   |  40 +++-
 .../transformers/cuda/flash_attn/fmha_api.cpp |   7 +-
 .../transformers/cuda/flash_attn/fmha_api.h   |   2 +-
 .../ATen/native/transformers/cuda/sdp_utils.h |  34 +++-
 benchmarks/transformer/sdp_backwards.py       | 189 ++++++++++++++++++
 .../check_forward_backward_compatibility.py   |   3 +
 test/functorch/test_ops.py                    |   8 +-
 test/test_meta.py                             |   1 -
 test/test_transformers.py                     |  76 +++++--
 tools/autograd/derivatives.yaml               |   6 +-
 .../_internal/common_methods_invocations.py   |   5 +
 15 files changed, 498 insertions(+), 136 deletions(-)
 create mode 100644 benchmarks/transformer/sdp_backwards.py

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index f625c9faff412..8c759cd09c486 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -13252,18 +13252,39 @@
     CPU, NestedTensorCPU, Meta: _fused_sdp_choice_cpp
     CUDA, NestedTensorCUDA: _fused_sdp_choice_cuda
 
-# Register the math kernel for cpu
-- func: _scaled_dot_product_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
+- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
   variants: function
+
+- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool return_softmax=False, bool is_causal=False) -> (Tensor, Tensor, Tensor)
   dispatch:
-    CUDA: _scaled_dot_product_attention_forward_cuda
-    CPU: _scaled_dot_product_attention_forward_math
-    NestedTensorCUDA: _scaled_dot_product_attention_forward_nested
-    NestedTensorCPU: _scaled_dot_product_attention_forward_math
-    Meta: _scaled_dot_product_attention_forward_math
+    CUDA: _scaled_dot_product_flash_attention_cuda
+    NestedTensorCUDA: _scaled_dot_product_flash_attention_nestedtensor_cuda
 
-- func: _scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool need_attn_weights=False, bool is_causal=False) -> (Tensor, Tensor)
+- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
+  dispatch:
+    CUDA: _scaled_dot_product_efficient_attention_cuda
+    NestedTensorCUDA: _scaled_dot_product_efficient_attention_nestedtensor_cuda
+
+- func: _scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CUDA: _scaled_dot_product_efficient_attention_backward_cuda
+
+# Returns ouput, softmax_logsumexp, softmax
+- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, bool return_softmax, float dropout_p, bool is_causal) -> (Tensor, Tensor, Tensor)
   variants: function
+  dispatch:
+    CUDA: _flash_attention_forward
+
+# Returns ouput, logsumexp if compute_logsumexp
+- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_forward
+
+- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, bool is_causal=False) -> (Tensor, Tensor, Tensor)
+  variants: function
+  dispatch:
+    CUDA: _efficient_attention_backward
 
 - func: _triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
   variants: function
@@ -13290,21 +13311,6 @@
   structured: True
   variants: function
 
-- func: _flash_scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, float dropout_p, bool is_causal) -> Tensor
-  variants: function
-  dispatch:
-    CUDA: flash_scaled_dot_product_attention
-
-- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
-  variants: function
-  dispatch:
-    CUDA: _efficient_attention_forward
-
-- func: _efficient_attention_backward(Tensor grad, Tensor query, Tensor key, Tensor value, Tensor logsumexp, Tensor out, bool is_causal=False) -> (Tensor, Tensor, Tensor)
-  variants: function
-  dispatch:
-    CUDA: _efficient_attention_backward
-
 - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
   variants: function
   dispatch:
diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
index c2bf4e08ce042..9c72454560d38 100644
--- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
+++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp
@@ -214,26 +214,6 @@ Tensor NestedTensor_to_padded_tensor_cuda(
   return NestedTensor_to_padded_tensor_generic(t, padding, output_size);
 }
 
-std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_nested(
-        const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
-
-    // Determine which efficient kernel to use
-    sdp::sdp_params kernel_params{query_, key, value, attn_mask_.has_value(), dropout_p, need_attn_weights, is_causal};
-    auto backend = select_sdp_backend(kernel_params);
-    switch(backend){
-      case sdp::SDPBackend::flash_attention:
-          // TODO: enable flash attention kernel
-          return mem_efficient_helper_nested_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
-      case sdp::SDPBackend::efficient_attention:
-          return mem_efficient_helper_nested_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
-      case sdp::SDPBackend::math:
-        return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-      default:
-        TORCH_CHECK(false, "Unsupported backend for scaled_dot_product_attention");
-        return std::make_tuple(Tensor(), Tensor());
-    }
-}
 namespace{
 
 /**
@@ -340,19 +320,80 @@ bool is_safe_to_get_storage_as_tensor(const NestedTensorImpl* tensor) {
 }
 
 } // namespace
-std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
+
+std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_nestedtensor_cuda(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
     double dropout_p,
-    bool need_atten_weights,
+    bool return_softmax,
     bool is_causal) {
+  TORCH_CHECK(false, "There are currently cuda memory errors being returned from this path.")
   // Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
   // Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
   // Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
   const int64_t num_heads = query.size(1);
   const int64_t head_dim = query.size(3);
 
+  // Query -> Query (Batch x {Q_seq_len}  x Num_heads x Dim_per_head)
+  // Key   -> Key   (Batch x {KV_seq_len} x Num_heads x Dim_per_head)
+  // Value -> Value (Batch x {KV_seq_len} x Num_heads x Dim_per_head)
+  Tensor q_t = query.transpose(1, 2).contiguous();
+  Tensor k_t = key.transpose(1, 2).contiguous();
+  Tensor v_t = value.transpose(1, 2).contiguous();
+
+  // K and V have to have the same Nnz, should probably torch_check
+  // assume in order to not iterate over v
+
+  auto cumulative_and_max_q = cumulative_and_max_seq_len(q_t);
+  auto cumulative_and_max_k = cumulative_and_max_seq_len(k_t);
+
+  Tensor cumulative_sequence_length_q = std::get<0>(cumulative_and_max_q);
+  Tensor cumulative_sequence_length_k = std::get<0>(cumulative_and_max_k);
+
+  const int64_t max_seqlen_batch_q = std::get<1>(cumulative_and_max_q);
+  const int64_t max_seqlen_batch_k = std::get<1>(cumulative_and_max_k);
+
+  const int64_t Nnz_q  = cumulative_sequence_length_q[-1].item<int64_t>();
+  const int64_t Nnz_kv = cumulative_sequence_length_k[-1].item<int64_t>();
+
+  auto query_buffer_reshaped =
+      get_buffer(q_t).view({Nnz_q, num_heads, head_dim});
+  auto key_buffer_reshaped =
+      get_buffer(k_t).view({Nnz_kv, num_heads, head_dim});
+  auto value_buffer_reshaped =
+      get_buffer(v_t).view({Nnz_kv, num_heads, head_dim});
+
+  auto attention_and_lse_and_softmax =
+  at::_flash_attention_forward(
+      query_buffer_reshaped,
+      key_buffer_reshaped,
+      value_buffer_reshaped,
+      cumulative_sequence_length_q,
+      cumulative_sequence_length_k,
+      max_seqlen_batch_q,
+      max_seqlen_batch_k,
+      return_softmax,
+      dropout_p,
+      is_causal);
+  // Reshape output to convert nnz to batch_size and seq_len
+  Tensor attention = std::get<0>(attention_and_lse_and_softmax);
+  attention = wrap_buffer(attention.view(-1), get_nested_size_tensor(q_t).clone()).transpose(1,2);
+  return std::tie(attention, std::get<1>(attention_and_lse_and_softmax), std::get<2>(attention_and_lse_and_softmax));
+}
+
+std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_nestedtensor_cuda(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    bool compute_log_sumexp,
+    bool is_causal) {
+   // Query (Batch x Num_heads x {Q_seq_len}  x Dim_per_head)
+  // Key   (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
+  // Value (Batch x Num_heads x {KV_seq_len} x Dim_per_head)
+  const int64_t num_heads = query.size(1);
+  const int64_t head_dim = query.size(3);
+
   Tensor q_t = query.transpose(1, 2);
   Tensor k_t = key.transpose(1, 2);
   Tensor v_t = value.transpose(1, 2);
@@ -432,7 +473,7 @@ std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
       {Nnz_kv, num_heads, head_dim},
       {nnz_v_stride, head_v_stride, head_dim_stride},
       value_impl->get_storage_offsets()[0]);
-  std::tuple<Tensor, Tensor> attention_and_weights =
+  std::tuple<Tensor, Tensor> attention_and_logsumexp=
       at::_efficient_attention_forward(
           query_buffer_reshaped.unsqueeze(0),
           key_buffer_reshaped.unsqueeze(0),
@@ -440,14 +481,14 @@ std::tuple<Tensor, Tensor> mem_efficient_helper_nested_unpacked(
           cumulative_sequence_length_q,
           cumulative_sequence_length_k,
           max_seqlen_batch_q,
-          false,
-          false);
+          compute_log_sumexp,
+          is_causal);
   // Reshape output to convert nnz to batch_size and seq_len
-  Tensor attention = std::get<0>(attention_and_weights);
+  Tensor attention = std::get<0>(attention_and_logsumexp);
   attention =
       wrap_buffer(attention.view(-1), get_nested_size_tensor(q_t).clone())
           .transpose(1, 2);
-  return std::tie(attention, std::get<1>(attention_and_weights));
+  return std::tie(attention, std::get<1>(attention_and_logsumexp));
 }
 
 Tensor flash_attention_helper(
@@ -492,7 +533,7 @@ Tensor flash_attention_helper(
   // If we are passing in query, key, value all the same tensors then we have
   // packed them into one tensor and need to slice for flash attention
   Tensor attention =
-      at::_flash_scaled_dot_product_attention(
+      std::get<0>(at::_flash_attention_forward(
           q,
           k,
           v,
@@ -500,8 +541,9 @@ Tensor flash_attention_helper(
           cumulative_sequence_length_q,
           max_seqlen_batch_q,
           max_seqlen_batch_q,
+          false /*return_softmax*/,
           dropout_p,
-          is_causal);
+          is_causal));
   // Output of flash_attention is a regular tensor lets wrap it back up to
   // form a nested tensor
 
diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 89a0e4691018c..9c5be12ef24db 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -678,20 +678,6 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> native_decoder_only_multi_head_attent
 //     L: Target sequence length
 //     E: Embedding dimension
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention(
-        const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
-        if (query_.requires_grad() || key.requires_grad() || value.requires_grad()){
-          return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-        }
-        return at::_scaled_dot_product_attention_forward(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-}
-
-int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
-  return static_cast<int64_t>(sdp::SDPBackend::math);
-}
-
-std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_math(
     const Tensor& query_,
     const Tensor& key,
     const Tensor& value,
@@ -699,14 +685,49 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_math(
     double dropout_p,
     bool need_attn_weights,
     bool is_causal) {
-  return at::_scaled_dot_product_attention_math(
-      query_,
-      key,
-      value,
-      attn_mask_,
-      dropout_p,
-      need_attn_weights,
-      is_causal);
+  // TODO: The second return is the attention weights if the math kernel is
+  // used. The fused kernels do not return this Tensor so for the fused kernels
+  // The second return SHOULD always be an empty Tensor, unless need_attn_weights
+  // is true (in which case the fused kernels would not be called). This blows up
+  // op_info tests.
+  int64_t choice_int = at::_fused_sdp_choice(
+      query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
+  sdp::SDPBackend backend = static_cast<sdp::SDPBackend>(choice_int);
+  switch (backend) {
+    case sdp::SDPBackend::flash_attention: {
+      auto out_lse_softmax = at::_scaled_dot_product_flash_attention(
+          query_, key, value, dropout_p, need_attn_weights, is_causal);
+      return std::make_tuple(
+          std::move(std::get<0>(out_lse_softmax)),
+          std::move(std::get<2>(out_lse_softmax)));
+    }
+    case sdp::SDPBackend::efficient_attention: {
+      bool compute_logsumexp =
+          (query_.requires_grad() || key.requires_grad() ||
+           value.requires_grad());
+      return at::_scaled_dot_product_efficient_attention(
+          query_, key, value, compute_logsumexp, is_causal);
+    }
+    case sdp::SDPBackend::math:
+      return at::_scaled_dot_product_attention_math(
+          query_,
+          key,
+          value,
+          attn_mask_,
+          dropout_p,
+          need_attn_weights,
+          is_causal);
+    default:
+      TORCH_CHECK(
+          false,
+          "No viable backend for scaled_dot_product_attention was found.");
+      return std::make_tuple(Tensor(), Tensor());
+  }
+}
+
+int64_t _fused_sdp_choice_cpp(const Tensor& query_, const Tensor& key, const Tensor& value,
+        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal){
+  return static_cast<int64_t>(sdp::SDPBackend::math);
 }
 
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index 602cf319f74a6..8dcb99b3380d9 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -678,12 +678,12 @@ std::tuple<Tensor, Tensor> native_multi_head_attention_cuda(
   return std::make_tuple(std::move(proj), std::move(qkt));
 }
 
-std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
+std::tuple<Tensor, Tensor, Tensor> _scaled_dot_product_flash_attention_cuda(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
     double dropout_p,
-    bool need_atten_weights,
+    bool return_softmax,
     bool is_causal) {
   // Query (Batch x Num_heads x Q_seq_len  x Dim_per_head)
   // Key   (Batch x Num_heads x KV_seq_len x Dim_per_head)
@@ -726,8 +726,9 @@ std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
   Tensor key_reshaped = k_t.reshape({Nnz_kv, num_heads, head_dim});
   Tensor value_reshaped = v_t.reshape({Nnz_kv, num_heads, head_dim});
 
-  Tensor attention =
-      at::_flash_scaled_dot_product_attention(
+  Tensor attention, log_sumexp, softmax;
+  std::tie(attention, log_sumexp, softmax) =
+      at::_flash_attention_forward(
           query_reshaped,
           key_reshaped,
           value_reshaped,
@@ -735,15 +736,17 @@ std::tuple<Tensor, Tensor> flash_attention_helper_dense_unpacked(
           cumulative_sequence_length_k,
           max_seqlen_batch_q,
           max_seqlen_batch_k,
+          return_softmax,
           dropout_p,
           is_causal);
   // Reshape output to convert nnz to batch_size and seq_len
   attention =
       attention.view({batch_size, max_seqlen_batch_q, num_heads, head_dim}).transpose(1,2);
 
-  return std::tuple<Tensor, Tensor>(attention, Tensor());
+  return std::make_tuple(attention, log_sumexp, softmax);
 }
-std::tuple<Tensor, Tensor> mem_eff_helper(
+
+std::tuple<Tensor, Tensor> _scaled_dot_product_efficient_attention_cuda(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
@@ -767,26 +770,7 @@ std::tuple<Tensor, Tensor> mem_eff_helper(
       compute_log_sumexp,
       is_causal);
   attention = attention.transpose(1,2);
-  return std::make_tuple(std::move(attention), Tensor());
-}
-
-std::tuple<Tensor, Tensor> _scaled_dot_product_attention_forward_cuda(
-        const Tensor& query_, const Tensor& key, const Tensor& value,
-        const c10::optional<Tensor>& attn_mask_, double dropout_p, bool need_attn_weights, bool is_causal) {
-    // Determine which efficient kernel to use
-    sdp::sdp_params kernel_params{query_, key, value, attn_mask_.has_value(), dropout_p, need_attn_weights, is_causal};
-    auto backend = select_sdp_backend(kernel_params);
-    switch(backend){
-      case sdp::SDPBackend::flash_attention:
-          return flash_attention_helper_dense_unpacked(query_, key, value, dropout_p, need_attn_weights, is_causal);
-      case sdp::SDPBackend::efficient_attention:
-          return mem_eff_helper(query_, key , value, need_attn_weights, is_causal);
-      case sdp::SDPBackend::math:
-        return at::_scaled_dot_product_attention_math(query_, key, value, attn_mask_, dropout_p, need_attn_weights, is_causal);
-      default:
-        TORCH_CHECK(false, "No viable backend for scaled_dot_product_attention was found.");
-        return std::make_tuple(Tensor(), Tensor());
-    }
+  return std::make_tuple(std::move(attention), std::move(log_sumexp));
 }
 
 int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Tensor& value,
@@ -802,7 +786,7 @@ int64_t _fused_sdp_choice_cuda(const Tensor& query_, const Tensor& key, const Te
   return static_cast<int64_t>(backend);
 }
 
-Tensor flash_scaled_dot_product_attention(
+std::tuple<Tensor, Tensor, Tensor> _flash_attention_forward(
     const Tensor& query,
     const Tensor& key,
     const Tensor& value,
@@ -810,11 +794,12 @@ Tensor flash_scaled_dot_product_attention(
     const Tensor& cumulative_sequence_length_k,
     const int64_t max_seqlen_batch_q,
     const int64_t max_seqlen_batch_k,
+    bool return_softmax,
     double dropout_p,
     bool is_causal) {
 #if defined(USE_FLASH_ATTENTION)
   auto softmax_scale = std::pow(query.size(-1), -0.5);
-  std::vector<Tensor> output = fmha::mha_fwd(
+  return fmha::mha_fwd(
       query,
       key,
       value,
@@ -826,12 +811,11 @@ Tensor flash_scaled_dot_product_attention(
       softmax_scale,
       false,
       is_causal,
-      false,
+      return_softmax,
       c10::nullopt);
-  return output[0];
 #endif
   TORCH_CHECK(false, "USE_FLASH_ATTENTION was not enabled for build.")
-  return Tensor();
+  return std::make_tuple(Tensor(), Tensor(), Tensor());
 }
 
 std::tuple<at::Tensor, at::Tensor> _efficient_attention_forward(
diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
index af005b2669b29..a063aacb901ee 100644
--- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
@@ -10,6 +10,7 @@
 #include <ATen/native/transformers/attention.h>
 #include <ATen/native/transformers/cuda/sdp_utils.h>
 
+#include <iostream>
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h>
 #endif
@@ -73,14 +74,14 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     const at::Tensor& query,
     const at::Tensor& key,
     const at::Tensor& value,
-    const at::Tensor& logsumexp,
     const at::Tensor& out,
+    const at::Tensor& logsumexp,
     bool causal) {
   #if defined(USE_FLASH_ATTENTION)
   if (!grad_out_.defined()) {
     return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
   }
-    // ndim
+  // ndim
   TORCH_CHECK(query.dim() == grad_out_.dim());
   TORCH_CHECK(query.dim() == key.dim());
   TORCH_CHECK(query.dim() == value.dim());
@@ -128,6 +129,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
   // initialized
   bool grad_kv_needs_init = causal && N > M;
   at::Tensor grad_q, grad_k, grad_v;
+  int8_t gQKV_strideM_multiplier = 1;
   if (!grad_kv_needs_init && query.size(1) == key.size(1) &&
       query.size(3) == value.size(3) &&
       query.storage().is_alias_of(key.storage()) &&
@@ -141,10 +143,13 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     grad_q = chunk.select(2, 0);
     grad_k = chunk.select(2, 1);
     grad_v = chunk.select(2, 2);
+    gQKV_strideM_multiplier=3;
   } else {
-    grad_q = at::empty_like(query);
-    grad_k = grad_kv_needs_init ? at::zeros_like(key) : at::empty_like(key);
-    grad_v = grad_kv_needs_init ? at::zeros_like(value) : at::empty_like(value);
+    grad_q = at::empty(query.sizes(), query.options());
+    grad_k = grad_kv_needs_init ? at::zeros(key.sizes(), key.options())
+                                : at::empty(key.sizes(), key.options());
+    grad_v = grad_kv_needs_init ? at::zeros(value.sizes(), value.options())
+                                : at::empty(value.sizes(), value.options());
   }
 
   auto launchKernel = [&](auto _k, int computeCapability) {
@@ -198,7 +203,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
     ASSIGN_CHECK_OVERFLOW(p.gQ_strideH, grad_q.stride(2));
     ASSIGN_CHECK_OVERFLOW(p.gK_strideH, grad_k.stride(2));
     ASSIGN_CHECK_OVERFLOW(p.gV_strideH, grad_v.stride(2));
-    p.gQKV_strideM_multiplier = grad_q.is_contiguous() ? 1 : 3;
+    p.gQKV_strideM_multiplier = gQKV_strideM_multiplier;
     TORCH_INTERNAL_ASSERT(p.gQ_strideM() == grad_q.stride(1));
     TORCH_INTERNAL_ASSERT(p.gK_strideM() == grad_k.stride(1));
     TORCH_INTERNAL_ASSERT(p.gV_strideM() == grad_v.stride(1));
@@ -257,5 +262,28 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> _efficient_attention_backward(
   return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
 }
 
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> _scaled_dot_product_efficient_attention_backward_cuda(
+    const at::Tensor& grad_out_,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& out,
+    const at::Tensor& logsumexp,
+    bool causal){
+  if (!grad_out_.defined()) {
+    return std::make_tuple(Tensor{}, Tensor{}, Tensor{});
+  }
+  auto grad_out = grad_out_.transpose(1, 2);
+  auto out_t = out.transpose(1, 2);
+  auto q_t = query.transpose(1, 2);
+  auto k_t = key.transpose(1, 2);
+  auto v_t = value.transpose(1, 2);
+
+  Tensor grad_q, grad_k, grad_v;
+  std::tie(grad_q, grad_k, grad_v) = at::_efficient_attention_backward(grad_out, q_t, k_t, v_t, out_t, logsumexp, causal);
+  return std::make_tuple(grad_q.transpose(1, 2), grad_k.transpose(1, 2), grad_v.transpose(1, 2));
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
index aaf7d833fe833..7cc0c250664e1 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
@@ -26,6 +26,7 @@
  *
  ******************************************************************************/
 
+#include <tuple>
 #ifdef USE_FLASH_ATTENTION
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -115,7 +116,7 @@ void set_params_fprop(FMHA_fprop_params &params,
     params.is_causal = is_causal;
 }
 
-std::vector<at::Tensor>
+std::tuple<at::Tensor, at::Tensor, at::Tensor>
 mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
         const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
@@ -241,9 +242,7 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
 
     run_fmha_fprop(launch_params, /*configure=*/false);
 
-    std::vector<at::Tensor> result = {o, softmax_lse};
-    if (return_softmax) {result.push_back(s);}
-    return result;
+    return std::make_tuple(o, softmax_lse, s);
 }
 } // namespace fmha
 #endif
diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
index 226d4ddd2b551..b0555463be040 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.h
@@ -7,7 +7,7 @@
 namespace fmha {
 
 TORCH_API
-std::vector<at::Tensor>
+std::tuple<at::Tensor, at::Tensor, at::Tensor>
 mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
         const at::Tensor &v,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 5d62a6cbd0dc5..55e9aeb184a22 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -91,6 +91,31 @@ inline bool check_for_seq_len_1_nested_tensor(sdp_params params, bool debug) {
   return true;
 }
 
+inline bool check_for_nested_inputs(sdp_params params, bool debug){
+  if (params.query.is_nested() || params.key.is_nested() || params.value.is_nested()) {
+    TORCH_CHECK(!debug, "We are not enabling nested Tensors for Flash Attention because of cuda memory errors.");
+    return false;
+  }
+  return true;
+}
+
+inline bool check_requires_grad(sdp_params params, bool debug) {
+  if (params.query.requires_grad() || params.key.requires_grad() || params.value.requires_grad()) {
+    TORCH_CHECK(!debug, "Flash Attention does not currently support training.");
+    return false;
+  }
+  return true;
+}
+
+inline bool check_requires_grad_and_nested(sdp_params params, bool debug) {
+  // If we fail both checks then we return false
+  if (!check_for_nested_inputs(params, false) && !check_requires_grad(params,false)){
+      TORCH_CHECK(!debug, "Memory efficient attention currently doesn't support training with NT inputs.");
+      return false;
+  }
+  return true;
+}
+
 inline bool check_for_attn_mask(sdp_params params, bool debug) {
   if (params.has_attn_mask) {
     TORCH_CHECK(!debug, "Flash Attention does not support attention mask.");
@@ -198,13 +223,15 @@ inline bool use_flash_attention(sdp_params params, bool debug) {
   return false;
 #endif
   //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 7> constraints {{
+  constexpr std::array<bool(*)(sdp_params, bool), 9> constraints {{
       check_runtime_disabled_flash,
+      check_requires_grad,
       check_tensor_shapes,
       check_for_attn_weights,
       check_for_attn_mask,
       check_head_dim_size,
       check_gpu_sm75_or_greater,
+      check_for_nested_inputs,
       check_for_seq_len_1_nested_tensor}};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
@@ -232,14 +259,15 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       at::kHalf, at::kFloat, at::kBFloat16};
 
   //  Define gate functions that determine if a flash kernel can be ran
-  std::vector<std::function<bool(sdp_params, bool)>> constraints{
+  constexpr std::array<bool(*)(sdp_params, bool), 8> constraints{{
       check_gpu_sm50_or_greater,
       check_runtime_disabled_mem_efficient,
+      check_requires_grad_and_nested,
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask,
       check_for_seq_len_1_nested_tensor,
-      check_for_non_zero_dropout};
+      check_for_non_zero_dropout}};
   for (auto& constraint : constraints) {
     if (!constraint(params, debug)) {
       return false;
diff --git a/benchmarks/transformer/sdp_backwards.py b/benchmarks/transformer/sdp_backwards.py
new file mode 100644
index 0000000000000..2f745e157b280
--- /dev/null
+++ b/benchmarks/transformer/sdp_backwards.py
@@ -0,0 +1,189 @@
+import torch
+import numpy as np
+import random
+import torch.utils.benchmark as benchmark
+from torch.profiler import profile, record_function, ProfilerActivity
+
+
+class CompositeMHA(torch.nn.Module):
+    def __init__(self, num_heads, in_proj_weight, in_proj_bias, out_proj):
+        super().__init__()
+        self.in_proj_weight = in_proj_weight
+        self.in_proj_bias = in_proj_bias
+        self.out_proj = out_proj
+        self.num_heads = num_heads
+
+    def forward(self, query, key, value, mask):
+        if not (query is key and key is value):
+            raise NotImplementedError(
+                "query, key and value must be the same Tensor for now."
+            )
+        if mask is not None:
+            raise NotImplementedError("mask is currently not supported.")
+
+        query_projected = torch.nn.functional.linear(
+            query, self.in_proj_weight, self.in_proj_bias
+        )
+
+        batch_size = query_projected.size(0)
+        embed_dim = query_projected.size(2)
+        head_dim = embed_dim // (self.num_heads * 3)
+
+        query, key, value = query_projected.chunk(3, -1)
+
+        query = query.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, self.num_heads, head_dim).transpose(1, 2)
+
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        attn, _ = torch.nn.functional._scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=None,
+            dropout_p=0.0,
+            need_attn_weights=False,
+            is_causal=False,
+        )
+
+        attn = attn.transpose(1, 2).reshape(batch_size, -1, self.num_heads * head_dim)
+        # Match return signature of nn.MHA
+        return self.out_proj(attn)
+
+
+def build_composite_mha_from_nn_mha(pt):
+    assert pt._qkv_same_embed_dim
+    in_proj_weight = pt.in_proj_weight
+    assert in_proj_weight is not None
+    assert pt.batch_first
+    return CompositeMHA(pt.num_heads, pt.in_proj_weight, pt.in_proj_bias, pt.out_proj)
+
+
+def forw_back(model, input, upward):
+    output = model(*input)
+    output.backward(upward)
+
+
+# Context manger not working in timer
+
+
+def forw_back_fused(model, input, upward):
+    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_mem_efficient=True):
+        output = model(*input)
+        output.backward(upward)
+
+
+def forw_back_eager(model, input, upward):
+    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_mem_efficient=False):
+        output = model(*input)
+        output.backward(upward)
+
+
+def run_timing(
+    min_run_time, batch_size, embed_dimension, num_heads, max_sequence_len, dtype
+):
+    dropout_p = 0.0
+    mask = None
+
+    pt = torch.nn.MultiheadAttention(
+        embed_dim=embed_dimension,
+        num_heads=num_heads,
+        batch_first=True,
+        dropout=dropout_p,
+    )
+    npt = pt.cuda().to(dtype)
+    cpt = build_composite_mha_from_nn_mha(npt)
+    x = torch.randn(
+        batch_size,
+        max_sequence_len,
+        embed_dimension,
+        dtype=dtype,
+        device="cuda",
+        requires_grad=True,
+    )
+
+    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_mem_efficient=True):
+        rand_fused_upward = cpt(x, x, x, mask).clone().detach()
+
+    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_mem_efficient=False):
+        rand_eager_upward = cpt(x, x, x, mask).clone().detach()
+
+    t0 = benchmark.Timer(
+        stmt="forw_back_fused(cpt, (x,x,x,mask), rand_fused_upward)",
+        globals={
+            "forw_back_fused": forw_back_fused,
+            "cpt": cpt,
+            "x": x,
+            "rand_fused_upward": rand_fused_upward,
+            "mask": mask,
+        },
+        label=f"Fused SDP forward and backward batch_size={batch_size} max_sequence_len={max_sequence_len} "
+        f"num_heads={num_heads} embed_dimension={embed_dimension} dtype={dtype}",
+        num_threads=torch.get_num_threads(),
+    )
+
+    t1 = benchmark.Timer(
+        stmt="forw_back_eager(cpt, (x,x,x,mask), rand_eager_upward)",
+        globals={
+            "forw_back_eager": forw_back_eager,
+            "cpt": cpt,
+            "x": x,
+            "rand_eager_upward": rand_eager_upward,
+            "mask": mask,
+        },
+        label=f"Eager SDP forward and backward batch_size={batch_size} max_sequence_len={max_sequence_len} "
+        f"num_heads={num_heads} embed_dimension={embed_dimension} dtype={dtype}",
+        num_threads=torch.get_num_threads(),
+    )
+
+    m0 = t0.blocked_autorange(min_run_time=min_run_time)
+    m1 = t1.blocked_autorange(min_run_time=min_run_time)
+
+    print(m0)
+    print(m1)
+
+    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
+
+    print("Profile for Fused".center(200, "-"))
+    with torch.backends.cuda.sdp_kernel(enable_math=False, enable_mem_efficient=True):
+        with profile(
+            activities=activities, record_shapes=False, with_stack=True
+        ) as prof:
+            with record_function("Fused SDP forward and backward"):
+                for _ in range(20):
+                    forw_back(cpt, (x, x, x, mask), rand_fused_upward)
+    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
+
+    print("Profile for eager".center(200, "-"))
+    with torch.backends.cuda.sdp_kernel(enable_math=True, enable_mem_efficient=False):
+        with profile(
+            activities=activities, record_shapes=False, with_stack=True
+        ) as prof:
+            with record_function("Fused SDP forward and backward"):
+                for _ in range(20):
+                    forw_back(cpt, (x, x, x, mask), rand_eager_upward)
+    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
+
+
+def main():
+    seed = 123
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    random.seed(seed)
+
+    min_run_time = 10
+    batch_size = 64
+    num_heads = 32
+    max_seq_len = 256
+    embed_dim = 1024
+    dtype = torch.bfloat16
+
+    print(
+        f"Running timing for batch_size={batch_size} max_sequence_len={max_seq_len} "
+        f"num_heads={num_heads} embed_dimension={embed_dim} dtype={dtype}"
+    )
+    run_timing(min_run_time, batch_size, embed_dim, num_heads, max_seq_len, dtype)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
index 90080ab0934f4..853f5206969b3 100644
--- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py
+++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py
@@ -317,6 +317,9 @@
     ("aten::_upsample_nearest_exact1d_backward", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d", datetime.date(2022, 12, 15)),
     ("aten::_upsample_nearest_exact2d_backward", datetime.date(2022, 12, 15)),
+    ("aten::_flash_scaled_dot_product_attention", datetime.date(2022, 12, 15)),
+    ("aten::_scaled_dot_product_attention_forward", datetime.date(2022, 12, 15)),
+    ("aten::_efficient_attention_backward", datetime.date(2022, 12, 15)),
     ("mkldnn::_convolution_pointwise.binary", datetime.date(2022, 12, 15)),
 ]
 
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 5e3aa1ff898f2..e9451b596b4ac 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -401,6 +401,7 @@ def wrapped_fn(*args, **kwargs):
         skip('nn.functional.max_unpool2d'),  # fails everywhere except on windows
         skip('nn.functional.max_unpool3d'),  # fails everywhere except on mac
         xfail("native_batch_norm"),
+        xfail('nn.functional._scaled_dot_product_attention', device_type='cuda'),
 
         xfail('nn.functional.rrelu'),  # in-place test errors out with no formula implemented
 
@@ -555,6 +556,7 @@ def f(inp, *args, **kwargs):
         xfail('nn.functional.ctc_loss'),  # Not Implemented
         xfail('native_layer_norm', ''),  # Expected a proper Tensor but got None for argument #1 'other'
         xfail('sparse.sampled_addmm', ''),  # sparse tensors have no strides
+        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
         # AssertionError: Tensor-likes are not close!
         # Mismatched elements: 1 / 15 (6.7%)
         # Greatest absolute difference: 24.0 at index (2, 4) (up to 1e-05 allowed)
@@ -649,7 +651,7 @@ def fn(inp, *args, **kwargs):
         skip("nn.functional.feature_alpha_dropout", "with_train"),  # calls random op
         skip("nn.functional.fractional_max_pool2d"),  # calls random op
         skip("nn.functional.fractional_max_pool3d"),  # calls random op
-        skip('nn.functional._scaled_dot_product_attention'),  # randomness
+        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
         # It looks like you're either (1) calling .item() on a Tensor or
         # (2) attempting to use a Tensor in some data-dependent control flow or
         # (3) encountering this error in PyTorch internals.
@@ -1126,6 +1128,7 @@ def test():
         skip('nn.functional.rrelu'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'with_train'),  # randomness
         skip('nn.functional.feature_alpha_dropout', 'without_train'),  # randomness
+        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
         skip('nn.functional.alpha_dropout'),  # randomness
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         skip('to_sparse', ''),  # non-dense output
@@ -1249,6 +1252,7 @@ def get_vjp(cotangents, *primals):
         xfail('nn.functional.soft_margin_loss', ''),  # NYI: forward-AD for log_sigmoid_backward
         xfail('nn.functional.ctc_loss', ''),  # NYI: forward-AD for _ctc_loss
         xfail('nn.functional.pdist', ''),  # NYI: forward-AD with _pdist_forward
+        skip('nn.functional._scaled_dot_product_attention', device_type='cuda'),
         xfail('nn.functional.multi_margin_loss', ''),  # NYI: forward AD with multi_margin_loss
         skip('linalg.householder_product', '', device_type='cuda'),  # flaky, I'm not sure why
         xfail('sparse.sampled_addmm', ''),  # Sparse tensors have no strides
@@ -1369,7 +1373,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         xfail('nn.functional.dropout2d'),  # calls random op
         xfail('nn.functional.dropout3d'),  # calls random op
         xfail('nn.functional.dropout'),  # calls random op
-        skip('nn.functional._scaled_dot_product_attention'),  # randomness
+        xfail('nn.functional._scaled_dot_product_attention'),  # randomness
         xfail('nn.functional.embedding_bag'),  # Forward AD not implemented and no decomposition
         xfail('nn.functional.alpha_dropout'),  # calls randomn op
         xfail('nn.functional.feature_alpha_dropout', 'with_train'),  # calls random op
diff --git a/test/test_meta.py b/test/test_meta.py
index 6d21d5c7bd75a..0e3cfb6ef1404 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -294,7 +294,6 @@ def test_tensor_outlives_converter(self):
     aten._fft_c2r.default,
     aten._fft_r2c.default,
     aten._linalg_svd.default,
-    aten._scaled_dot_product_attention_forward.default,
     aten.binary_cross_entropy.default,
     aten.complex.default,
     aten.copysign.Tensor,
diff --git a/test/test_transformers.py b/test/test_transformers.py
index abb4c71ec19ad..0260c822498d3 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -1059,6 +1059,11 @@ def rand_tensor(shape):
 
         if fused_kernel == "flash":
             with sdp_kernel(enable_mem_efficient=False, enable_math=False):
+                # TODO Flash for the nested path is currently not working due to cuda memory issues
+                if type == "nested":
+                    self.assertRaises(RuntimeError, lambda: torch.nn.functional._scaled_dot_product_attention(
+                        query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False))
+                    return
                 actual = torch.nn.functional._scaled_dot_product_attention(
                     query_lp, key_lp, value_lp, attn_mask=None, dropout_p=0.0, need_attn_weights=False, is_causal=False)
         elif fused_kernel == "mem_efficient":
@@ -1097,28 +1102,73 @@ def rand_tensor(shape):
 
     @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
     @parametrize("contiguous_inputs", [True, False])
-    def test_efficient_attention_gradcheck(self, contiguous_inputs: bool):
+    def test_sdp_math_gradcheck(self, contiguous_inputs: bool):
 
-        batch_size, seq_len, num_heads, head_dim = 8, 8, 4, 64
-        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float16, requires_grad=True, packed=True)
+        batch_size, seq_len, num_heads, head_dim = 4, 4, 2, 16
+        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float64, requires_grad=True, packed=True)
 
         qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
         query, key, value = qkv.chunk(3, dim=-1)
-        query = query.view(batch_size, -1, num_heads, head_dim)
-        key = key.view(batch_size, -1, num_heads, head_dim)
-        value = value.view(batch_size, -1, num_heads, head_dim)
+
+        query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+        if contiguous_inputs:
+            query = query.contiguous()
+            key = key.contiguous()
+            value = value.contiguous()
+
+        with sdp_kernel(enable_math=True, enable_mem_efficient=False, enable_flash=False):
+            assert gradcheck(lambda *args, **kwargs:
+                             wrapper_set_seed(torch.nn.functional._scaled_dot_product_attention, *args, **kwargs),
+                             (query, key, value, None, 0.0, False, False)
+                             )
+
+    @unittest.skipIf(not TEST_CUDA or TEST_WITH_ROCM or IS_WINDOWS, "Flash Attention was not built for this system")
+    @parametrize("contiguous_inputs", [True, False])
+    def test_sdp_fused_grad_against_math(self, contiguous_inputs: bool):
+        batch_size, seq_len, num_heads, head_dim = 4, 4, 2, 16
+        rand_tensor = partial(self.rand_tensor, device="cuda", dtype=torch.float64, requires_grad=True, packed=True)
+
+        qkv = rand_tensor((batch_size, seq_len, num_heads, head_dim))
+        qkv_lp = qkv.detach().clone().to(torch.float32).requires_grad_()
+
+        query, key, value = qkv.chunk(3, dim=-1)
+        query_lp, key_lp, value_lp = qkv_lp.chunk(3, dim=-1)
+
+        query = query.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+
+        query_lp = query_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        key_lp = key_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
+        value_lp = value_lp.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
 
         if contiguous_inputs:
             query = query.contiguous()
             key = key.contiguous()
             value = value.contiguous()
 
-        # Normally we would transpose the inputs but the fused kernels expect
-        # (batch, seq_len, num_heads, head_dim) bump the tolerance since we can only run kernel
-        # in fp32
-        assert gradcheck(lambda *args, **kwargs:
-                         wrapper_set_seed(torch.ops.aten._efficient_attention_forward, *args, **kwargs),
-                         (query, key, value, None, None, None, True, False), fast_mode=True, atol=8e-5, rtol=1e-3)
+            query_lp = query_lp.contiguous()
+            key_lp = key_lp.contiguous()
+            value_lp = value_lp.contiguous()
+
+        with sdp_kernel(enable_math=True, enable_mem_efficient=False, enable_flash=False):
+            out, atten = torch.nn.functional._scaled_dot_product_attention(query, key, value, None, 0.0, False, False)
+
+        with sdp_kernel(enable_math=False, enable_mem_efficient=True, enable_flash=False):
+            out_lp, atten_lp = torch.nn.functional._scaled_dot_product_attention(
+                query_lp, key_lp, value_lp, None, 0.0, False, False)
+
+        rand_upward = torch.rand_like(out)
+        rand_upward_lp = rand_upward.to(torch.float32)
+
+        out.backward(rand_upward)
+        out_lp.backward(rand_upward_lp)
+
+        # Cast up and compare
+        self.assertEqual(qkv.grad, qkv_lp.grad.to(torch.float64), atol=1e-5, rtol=1e-5)
 
     @parametrize("type", ["dense", "nested"])
     def test_fused_sdp_choice(self, type: str):
@@ -1144,7 +1194,7 @@ def test_fused_sdp_choice(self, type: str):
             value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
             key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
 
-            if SM80OrLater:
+            if SM80OrLater and not type == "nested":
                 assert torch._fused_sdp_choice(query, key, value) == SDPBackend.FLASH_ATTENTION
             else:
                 assert torch._fused_sdp_choice(query, key, value) == SDPBackend.EFFICIENT_ATTENTION
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index a0892b32a8352..52c0f76bf0708 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2613,9 +2613,13 @@
   nested_strides: non_differentiable
 
 # Transformers
+- name: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, bool compute_log_sumexp, bool is_causal=False) -> (Tensor, Tensor)
+  output_differentiability: [True, False]
+  query, key, value: _scaled_dot_product_efficient_attention_backward(grad, query, key, value, result0, result1, is_causal)
+
 - name:  _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, bool compute_log_sumexp=False, bool causal=False) -> (Tensor, Tensor)
   output_differentiability: [True, False]
-  query, key, value: _efficient_attention_backward(grad, query, key, value, result1, result0, causal)
+  query, key, value: _efficient_attention_backward(grad, query, key, value, result0, result1, causal)
 
 # fft
 - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 0f845f7658298..998f1cde65f7d 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -12009,16 +12009,21 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             # This is only failing on Linux Bionic 3.10 Cuda 11.6
             DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes',
                          device_type='cuda', active_if=_get_torch_cuda_version() >= (11, 6)),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_noncontiguous_samples',
+                         device_type='cuda', dtypes=(torch.float32,)),
             # AssertionError: JIT Test does not execute any logic
             DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit'),
             # Doesn't support autocasting
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensorNonErroring', 'test_fake_autocast', device_type='cpu'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake_autocast'),
+            # Forward works for dtype=float64 which is the math path
+            DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
             # No meta function
             DecorateInfo(unittest.skip("Skipped!"), 'TestProxyTensorOpInfo', 'test_make_fx_symbolic_exhaustive'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'),
             DecorateInfo(unittest.skip("Skipped"), 'TestDecomp', 'test_comprehensive'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', 'test_fake'),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestMeta', device_type='cuda'),
             DecorateInfo(unittest.skip('output is non-deterministic (when dropout_p > 0)'), 'TestCommon', 'test_compare_cpu'),),
     ),
     UnaryUfuncInfo(

From 311efb1492f8d174f16fbf6c1df4516e5ca97f12 Mon Sep 17 00:00:00 2001
From: Keval Morabia <kevalmorabia97@gmail.com>
Date: Mon, 21 Nov 2022 20:40:04 +0000
Subject: [PATCH 1133/1922] Fix unconvertible_ops as per #89261 (#89299)

Fixes #89261

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89299
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
---
 test/onnx/test_utility_funs.py | 12 ++++++++++++
 torch/onnx/utils.py            |  4 +++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py
index 51adaef317af2..5d1cdc5e8ea58 100644
--- a/test/onnx/test_utility_funs.py
+++ b/test/onnx/test_utility_funs.py
@@ -124,6 +124,18 @@ def test_it_returns_empty_list_when_all_ops_convertible(
         _, unconvertible_ops = utils.unconvertible_ops(module, (x,), opset_version=12)
         self.assertEqual(unconvertible_ops, [])
 
+    def test_it_returns_empty_list_when_model_contains_supported_inplace_ops(self):
+        class SkipConnectionModule(torch.nn.Module):
+            def forward(self, x):
+                out = x
+                out += x
+                out = torch.nn.functional.relu(out, inplace=True)
+
+        module = SkipConnectionModule()
+        x = torch.randn(4, 4)
+        _, unconvertible_ops = utils.unconvertible_ops(module, (x,), opset_version=13)
+        self.assertEqual(unconvertible_ops, [])
+
 
 @parameterized.parameterized_class(
     [
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index 67dd719bae9f6..36d7fdb75762c 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -1333,7 +1333,9 @@ def unconvertible_ops(
             # eliminated in the conversion passes. Users may still see errors caused
             # by prim ops even though they don't show up in the list.
             continue
-        if not registration.registry.is_registered_op(domain_op, opset_version):
+        if not registration.registry.is_registered_op(
+            domain_op.rstrip("_"), opset_version
+        ):
             # We consider all registered ops supported, even though some of them are
             # only partially supported, because there is not yet a good way to check
             # if an op is fully supported.

From dac45f96efb8056d5d935a2bf0a0d208f712ae80 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Sun, 20 Nov 2022 23:19:24 +0000
Subject: [PATCH 1134/1922] [inductor] Fix nan handling for aten.sign (#88937)

ATen gives `sign(nan) == 0` but inductor's cuda codegen would give
`sign(nan) == 1`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88937
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 8 ++++++++
 torch/_inductor/codegen/common.py   | 4 +++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 399032890ca83..ec024c67b81cc 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -13,6 +13,8 @@
 from typing import Any, Callable
 from unittest.mock import patch
 
+import numpy as np
+
 import torch
 
 import torch._dynamo
@@ -668,6 +670,12 @@ def fn(a):
 
         self.common(fn, [torch.linspace(-10, 10, 41)])
 
+    def test_sgn_extremal(self):
+        def fn(a):
+            return (torch.sgn(a),)
+
+        self.common(fn, [torch.tensor([np.nan, np.inf, -np.inf, 0])])
+
     def test_max_min(self):
         def fn(a, b):
             return (torch.maximum(a, b), torch.minimum(a, b))
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index cf98833964ca5..da64f3e63584e 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -90,7 +90,9 @@ def square(x):
 
     @staticmethod
     def sign(x):
-        return ops.where(f"{x} == 0", "0", ops.where(f"{x} < 0", "-1", "1"))
+        left = ops.where(ops.lt("0", x), "1", "0")
+        right = ops.where(ops.lt(x, "0"), "1", "0")
+        return ops.sub(left, right)
 
     @staticmethod
     def bitwise_not(x):

From b11457b27b19b8382d88a869aa1db314123e3aad Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Sun, 20 Nov 2022 23:36:41 +0000
Subject: [PATCH 1135/1922] [inductor] Misc division lowering fixes (#88603)

1. `aten.div.Tensor_mode` should allow broadcasting
2. `div` can use `ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT`
3. `prims.div` on integers should be truncating division
4. Add lowering for `true_divide` which is aliased to `div`
5. register lowering for inplace version of `div_mode`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88603
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 59 +++++++++++++++++++++++++++++
 torch/_inductor/lowering.py         | 46 ++++++++++------------
 2 files changed, 78 insertions(+), 27 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index ec024c67b81cc..2196f4f8a0264 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -23,6 +23,7 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
@@ -1166,6 +1167,45 @@ def fn(a, b):
 
         self.common(fn, (1024, 100))
 
+    def test_div_zero_dim(self):
+        def fn(a, b):
+            return (
+                aten.div(a, b, rounding_mode=None),
+                aten.div(a, b, rounding_mode="floor"),
+                aten.div(a, b, rounding_mode="trunc"),
+                a / b,
+                a // b,
+            )
+
+        for dtype in (torch.float32, torch.int64):
+            self.common(
+                fn,
+                (
+                    make_tensor(10, device="cpu", dtype=dtype),
+                    make_tensor((), device="cpu", dtype=dtype, exclude_zero=True),
+                ),
+            )
+            self.common(
+                fn,
+                (
+                    make_tensor((), device="cpu", dtype=dtype),
+                    make_tensor(10, device="cpu", dtype=dtype, exclude_zero=True),
+                ),
+            )
+
+    def test_div_prim(self):
+        def fn(a, b):
+            return (torch.ops.prims.div(a, b),)
+
+        for dtype in (torch.float32, torch.int64):
+            self.common(
+                fn,
+                (
+                    make_tensor(100, device="cpu", dtype=dtype),
+                    make_tensor(100, device="cpu", dtype=dtype, exclude_zero=True),
+                ),
+            )
+
     def test_both_scalars(self):
         def fn(a, b):
             return (
@@ -2589,6 +2629,25 @@ def fn(a, b):
         shape = [1, 2, 6, 6]
         self.common(fn, (torch.randn(shape), torch.randn(shape)))
 
+    def test_fmod_zero_dim(self):
+        def fn(a, b):
+            return (torch.fmod(a, b),)
+
+        self.common(
+            fn,
+            (
+                make_tensor(10, device="cpu", dtype=torch.float32),
+                make_tensor((), device="cpu", dtype=torch.float32),
+            ),
+        )
+        self.common(
+            fn,
+            (
+                make_tensor((), device="cpu", dtype=torch.float32),
+                make_tensor(10, device="cpu", dtype=torch.float32),
+            ),
+        )
+
     def test_log2(self):
         def fn(x):
             return torch.log2(x), torch.log2(x + 1) - 2
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index a76a9baea953d..0bd92007c9864 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -3354,7 +3354,7 @@ def truncdiv(a, b):
     return ops.truncdiv(a, b)
 
 
-@register_lowering(aten.div.Tensor_mode)
+@register_lowering(aten.div, broadcast=True)
 def div_mode(a, b, rounding_mode=None):
     both_integer = is_integer_type(a) and is_integer_type(b)
     both_boolean = is_boolean_type(a) and is_boolean_type(b)
@@ -3370,23 +3370,6 @@ def div_mode(a, b, rounding_mode=None):
     return div(a, b)
 
 
-@register_lowering([aten.div], broadcast=True)
-def div(a, b):
-    def fn(*args):
-        return ops.div(*args)
-
-    dtype = get_promoted_dtype(
-        a, b, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
-    )
-    # truediv produces a float tensor even if both operands are integer types
-    if is_integer_type(a) and is_integer_type(b):
-        dtype = torch.get_default_dtype()
-    return make_pointwise(fn, override_return_dtype=dtype)(
-        a if isinstance(a, Number) else to_dtype(a, dtype),
-        b if isinstance(b, Number) else to_dtype(b, dtype),
-    )
-
-
 @register_lowering([aten.mul], broadcast=True)
 def mul(a, b):
     both_bool = is_boolean_type(a) and is_boolean_type(b)
@@ -3397,21 +3380,29 @@ def mul(a, b):
         return make_pointwise(fn)(a, b)
 
 
-# TODO(lezcano) I believe the casting behaviour of prims.div is wrong
-# https://github.com/pytorch/pytorch/issues/84412
-# div prim performs truncation division on integer inputs
-#   and true division for floating and complex inputs
+# NOTE: prims.div maps to a / b in C, so performs truncation division on
+#   integer inputs and true division for floating and complex inputs.
 @register_lowering([prims.div], broadcast=True)
 def div_prim(a, b):
     is_integral = is_boolean_type(a) or is_integer_type(a)
 
     if is_integral:
-        return div_mode(a, b, rounding_mode="floor")
-    else:
-        return div(a, b)
+        return truncdiv(a, b)
+
+    def fn(*args):
+        return ops.div(*args)
+
+    return make_pointwise(fn)(a, b)
+
+
+div = register_lowering(
+    [aten.true_divide, aten.div.Tensor],
+    broadcast=True,
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+)(div_prim)
 
 
-@register_lowering([aten.fmod, prims.fmod])
+@register_lowering([aten.fmod, prims.fmod], broadcast=True)
 def fmod(a, b):
     is_integral = is_boolean_type(a) or is_integer_type(a)
 
@@ -3564,7 +3555,8 @@ def fn(*args, **kwargs):
 
 register_inplace(aten.add_, add)
 register_inplace(aten.mul_, mul)
-register_inplace(aten.div_, div)
+register_inplace(aten.div_.Tensor, div)
+register_inplace(aten.div_.Tensor_mode, div_mode)
 register_inplace(aten.sub_, sub)
 register_inplace(aten.relu_, relu)
 register_inplace(aten.sigmoid_, sigmoid)

From bf54b8367abb833bca9025c697dc80a06e7f0b4e Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Mon, 21 Nov 2022 21:08:13 +0000
Subject: [PATCH 1136/1922] [tools] expose selective build library (#89351)

Change the base module and visibility of `tools:gen_oplist_lib` so that it can be reused.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89351
Approved by: https://github.com/cccclai
---
 tools/BUCK.bzl                    | 5 +++--
 tools/code_analyzer/gen_oplist.py | 4 ++--
 tools/test/gen_oplist_test.py     | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/BUCK.bzl b/tools/BUCK.bzl
index 6d16e8fe3ff8d..58a49fded0eec 100644
--- a/tools/BUCK.bzl
+++ b/tools/BUCK.bzl
@@ -62,10 +62,11 @@ def define_tools_targets(
             ("code_analyzer", "gen_oplist.py"),
             ("code_analyzer", "gen_op_registration_allowlist.py"),
         ]),
-        base_module = "",
+        base_module = "tools.code_analyzer",
         tests = [
             ":gen_oplist_test",
         ],
+        visibility = ["PUBLIC"],
         deps = [
             ":gen_selected_mobile_ops_header",
             torchgen_deps,
@@ -75,7 +76,7 @@ def define_tools_targets(
 
     python_binary(
         name = "gen_oplist",
-        main_module = "gen_oplist",
+        main_module = "tools.code_analyzer.gen_oplist",
         visibility = ["PUBLIC"],
         deps = [
             ":gen_oplist_lib",
diff --git a/tools/code_analyzer/gen_oplist.py b/tools/code_analyzer/gen_oplist.py
index 1e5d1277afcdf..18104ab30cb6c 100644
--- a/tools/code_analyzer/gen_oplist.py
+++ b/tools/code_analyzer/gen_oplist.py
@@ -127,7 +127,7 @@ def main(argv: List[Any]) -> None:
         default=False,
         required=False,
     )
-    options = parser.parse_args()
+    options = parser.parse_args(argv)
 
     if os.path.isfile(options.model_file_list_path):
         print("Processing model file: ", options.model_file_list_path)
@@ -186,4 +186,4 @@ def main(argv: List[Any]) -> None:
 
 
 if __name__ == "__main__":
-    main(sys.argv)
+    main(sys.argv[1:])
diff --git a/tools/test/gen_oplist_test.py b/tools/test/gen_oplist_test.py
index d58e2ccc90671..33f9fb293edc4 100644
--- a/tools/test/gen_oplist_test.py
+++ b/tools/test/gen_oplist_test.py
@@ -4,7 +4,7 @@
 import unittest
 from unittest.mock import MagicMock
 
-from gen_oplist import throw_if_any_op_includes_overloads
+from tools.code_analyzer.gen_oplist import throw_if_any_op_includes_overloads
 
 
 class GenOplistTest(unittest.TestCase):

From 32fdc6ef06ddd683ac27a98531f931e65cf7dce3 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Mon, 21 Nov 2022 19:19:29 +0000
Subject: [PATCH 1137/1922] [1/n] Thread PG: add test for allgather (#89439)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89439
Approved by: https://github.com/XilunWu, https://github.com/yhcharles, https://github.com/fduwjj
---
 test/distributed/test_multi_threaded_pg.py           | 12 ++++++++++--
 .../_internal/distributed/multi_threaded_pg.py       |  8 ++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
index 6a0fe33cd8ad6..dc4713b504390 100644
--- a/test/distributed/test_multi_threaded_pg.py
+++ b/test/distributed/test_multi_threaded_pg.py
@@ -1,6 +1,7 @@
 # Owner(s): ["oncall: distributed"]
 
 import sys
+import torch
 import torch.distributed as dist
 
 if not dist.is_available():
@@ -16,7 +17,7 @@
 
 DEFAULT_WORLD_SIZE = 4
 
-class TestObjectCollectivesWithWrapper(TestCase):
+class TestCollectivesWithWrapper(TestCase):
     @spawn_threads_and_init_comms(world_size=4)
     def test_broadcast_object_list(self):
         val = 99 if dist.get_rank() == 0 else None
@@ -25,11 +26,18 @@ def test_broadcast_object_list(self):
         dist.broadcast_object_list(object_list=object_list)
         self.assertEqual(99, object_list[0])
 
-class TestObjectCollectivesWithBaseClass(MultiThreadedTestCase):
+class TestCollectivesWithBaseClass(MultiThreadedTestCase):
     @property
     def world_size(self):
         return 4
 
+    def test_allgather(self):
+        input_tensor = torch.ones(3, 3) * dist.get_rank()
+        output_tensors = [torch.empty_like(input_tensor) for _ in range(self.world_size)]
+        dist.all_gather(output_tensors, input_tensor)
+        for rank, out_tensor in enumerate(output_tensors):
+            self.assertEqual(out_tensor, torch.ones(3, 3) * rank)
+
     def test_broadcast_object_list(self):
         val = 99 if dist.get_rank() == 0 else None
         object_list = [val] * dist.get_world_size()
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
index 7e18f870f2e76..7ad4bfa4cddbc 100644
--- a/torch/testing/_internal/distributed/multi_threaded_pg.py
+++ b/torch/testing/_internal/distributed/multi_threaded_pg.py
@@ -7,7 +7,11 @@
 
 import torch
 import torch.distributed as dist
-from torch._C._distributed_c10d import _create_work_from_future, Store
+from torch._C._distributed_c10d import (
+    _create_work_from_future,
+    AllgatherOptions,
+    Store,
+)
 from torch.futures import Future
 from torch.utils._pytree import tree_flatten
 
@@ -135,7 +139,7 @@ def _end_coll(cls, collective):
             if cls._cur_coll == collective:
                 cls._cur_coll = None
 
-    def allgather(self, output_tensors, input_tensor, options):
+    def allgather(self, output_tensors, input_tensor, opts=AllgatherOptions()):
         coll = ProcessLocalGroup._start_coll(self._world, AllGather())
         res = coll.join(self._rank, (output_tensors, input_tensor))
         ProcessLocalGroup._end_coll(coll)

From 78a1a792a9e57da38e4d04d2724be116a0aca788 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Mon, 21 Nov 2022 19:19:29 +0000
Subject: [PATCH 1138/1922] [2/n] Thread PG: add test for broadcast (#89440)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89440
Approved by: https://github.com/XilunWu, https://github.com/yhcharles, https://github.com/fduwjj
---
 test/distributed/test_multi_threaded_pg.py               | 7 +++++++
 torch/testing/_internal/distributed/multi_threaded_pg.py | 3 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
index dc4713b504390..3e1e765eef517 100644
--- a/test/distributed/test_multi_threaded_pg.py
+++ b/test/distributed/test_multi_threaded_pg.py
@@ -38,6 +38,13 @@ def test_allgather(self):
         for rank, out_tensor in enumerate(output_tensors):
             self.assertEqual(out_tensor, torch.ones(3, 3) * rank)
 
+    def test_broadcast(self):
+        input_tensor = torch.ones(3, 3) * dist.get_rank()
+        for rank in range(self.world_size):
+            cloned_input = input_tensor.clone()
+            dist.broadcast(cloned_input, src=rank)
+            self.assertEqual(cloned_input, torch.ones(3, 3) * rank)
+
     def test_broadcast_object_list(self):
         val = 99 if dist.get_rank() == 0 else None
         object_list = [val] * dist.get_world_size()
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
index 7ad4bfa4cddbc..ae465b95641ba 100644
--- a/torch/testing/_internal/distributed/multi_threaded_pg.py
+++ b/torch/testing/_internal/distributed/multi_threaded_pg.py
@@ -10,6 +10,7 @@
 from torch._C._distributed_c10d import (
     _create_work_from_future,
     AllgatherOptions,
+    BroadcastOptions,
     Store,
 )
 from torch.futures import Future
@@ -145,7 +146,7 @@ def allgather(self, output_tensors, input_tensor, opts=AllgatherOptions()):
         ProcessLocalGroup._end_coll(coll)
         return res
 
-    def broadcast(self, tensor_list, opts):
+    def broadcast(self, tensor_list, opts=BroadcastOptions()):
         coll = ProcessLocalGroup._start_coll(self._world, Broadcast(opts.rootRank))
         res = coll.join(self._rank, tensor_list)
         ProcessLocalGroup._end_coll(coll)

From 8758604cbab056d9c001228c2059bccabd820926 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Mon, 21 Nov 2022 19:19:29 +0000
Subject: [PATCH 1139/1922] [3/n] Thread PG: add scatter to threaded pg
 (#89441)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89441
Approved by: https://github.com/XilunWu, https://github.com/yhcharles, https://github.com/fduwjj
---
 test/distributed/test_multi_threaded_pg.py    | 12 ++++++++--
 .../distributed/multi_threaded_pg.py          | 24 +++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
index 3e1e765eef517..1e16f5d03a8cb 100644
--- a/test/distributed/test_multi_threaded_pg.py
+++ b/test/distributed/test_multi_threaded_pg.py
@@ -45,6 +45,16 @@ def test_broadcast(self):
             dist.broadcast(cloned_input, src=rank)
             self.assertEqual(cloned_input, torch.ones(3, 3) * rank)
 
+    def test_scatter(self):
+        if dist.get_rank() == 0:
+            scatter_list = [torch.ones(3, 3) * rank for rank in range(self.world_size)]
+        else:
+            scatter_list = None
+        output_tensor = torch.empty(3, 3)
+
+        dist.scatter(output_tensor, scatter_list)
+        self.assertEqual(output_tensor, torch.ones(3, 3) * dist.get_rank())
+
     def test_broadcast_object_list(self):
         val = 99 if dist.get_rank() == 0 else None
         object_list = [val] * dist.get_world_size()
@@ -53,8 +63,6 @@ def test_broadcast_object_list(self):
         dist.broadcast_object_list(object_list=object_list)
         self.assertEqual(99, object_list[0])
 
-    def test_something_else(self):
-        pass
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
index ae465b95641ba..321c61d993cfc 100644
--- a/torch/testing/_internal/distributed/multi_threaded_pg.py
+++ b/torch/testing/_internal/distributed/multi_threaded_pg.py
@@ -11,6 +11,7 @@
     _create_work_from_future,
     AllgatherOptions,
     BroadcastOptions,
+    ScatterOptions,
     Store,
 )
 from torch.futures import Future
@@ -50,6 +51,23 @@ def work(self, data):
                 with torch.no_grad():
                     dest_tensor.copy_(src_tensor)
 
+class Scatter:
+    def __init__(self, src):
+        self.src = src
+
+    def work(self, data):
+        src_in_tensor_list = data[self.src][1]
+        # Can't handle scatter with multiple input tensor list
+        assert len(src_in_tensor_list) == 1
+        src_in_tensors = src_in_tensor_list[0]
+
+        for rank, each_rank_data in enumerate(data):
+            out_tensor_list = each_rank_data[0]
+            # Can't handle scatter with multiple output tensor
+            assert len(out_tensor_list) == 1
+            dest_tensor = out_tensor_list[0]
+            with torch.no_grad():
+                dest_tensor.copy_(src_in_tensors[rank])
 
 class Broadcast:
     def __init__(self, src):
@@ -152,6 +170,12 @@ def broadcast(self, tensor_list, opts=BroadcastOptions()):
         ProcessLocalGroup._end_coll(coll)
         return res
 
+    def scatter(self, output_tensors, input_tensors, opts=ScatterOptions()):
+        coll = ProcessLocalGroup._start_coll(self._world, Scatter(opts.rootRank))
+        res = coll.join(self._rank, (output_tensors, input_tensors))
+        ProcessLocalGroup._end_coll(coll)
+        return res
+
     def __init__(self, rank, world):
         super(ProcessLocalGroup, self).__init__(rank, world)
         self._rank = rank

From ebdb23dc2845d50de69bf0ec1c4886d6a450eb23 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Mon, 21 Nov 2022 19:19:29 +0000
Subject: [PATCH 1140/1922] [4/n] Thread PG: add reduce_scatter to threaded pg
 (#89442)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89442
Approved by: https://github.com/yhcharles, https://github.com/fduwjj
---
 test/distributed/test_multi_threaded_pg.py    |  8 +++++
 .../distributed/multi_threaded_pg.py          | 31 +++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
index 1e16f5d03a8cb..f520698258ed9 100644
--- a/test/distributed/test_multi_threaded_pg.py
+++ b/test/distributed/test_multi_threaded_pg.py
@@ -55,6 +55,14 @@ def test_scatter(self):
         dist.scatter(output_tensor, scatter_list)
         self.assertEqual(output_tensor, torch.ones(3, 3) * dist.get_rank())
 
+    def test_reduce_scatter(self):
+        to_reduce_scatter = [torch.ones(3, 3) * rank for rank in range(self.world_size)]
+        output_tensor = torch.empty(3, 3)
+
+        dist.reduce_scatter(output_tensor, to_reduce_scatter)
+        expected_tensor = torch.ones(3, 3) * dist.get_rank() * self.world_size
+        self.assertEqual(output_tensor, expected_tensor)
+
     def test_broadcast_object_list(self):
         val = 99 if dist.get_rank() == 0 else None
         object_list = [val] * dist.get_world_size()
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
index 321c61d993cfc..df45748ee6c6f 100644
--- a/torch/testing/_internal/distributed/multi_threaded_pg.py
+++ b/torch/testing/_internal/distributed/multi_threaded_pg.py
@@ -11,6 +11,7 @@
     _create_work_from_future,
     AllgatherOptions,
     BroadcastOptions,
+    ReduceScatterOptions,
     ScatterOptions,
     Store,
 )
@@ -69,6 +70,30 @@ def work(self, data):
             with torch.no_grad():
                 dest_tensor.copy_(src_in_tensors[rank])
 
+class ReduceScatter:
+    def __init__(self, op):
+        if op != dist.ReduceOp.SUM:
+            raise NotImplementedError("ReduceScatter only supports SUM on threaded pg for now.")
+        self.op = op
+
+    def work(self, data):
+        start_reduction = [False for _ in range(len(data))]
+        for each_rank_data in data:
+            # Can't handle reduce_scatter with multiple scatter list
+            assert len(each_rank_data[1]) == 1
+            to_scatter = each_rank_data[1][0]
+            for i in range(len(to_scatter)):
+                dest_tensor_on_rank_i = data[i][0]
+                # Can't handle reduce_scatter with multiple output tensor
+                assert len(dest_tensor_on_rank_i) == 1
+                if not start_reduction[i]:
+                    with torch.no_grad():
+                        dest_tensor_on_rank_i[0].copy_(to_scatter[i])
+                    start_reduction[i] = True
+                else:
+                    with torch.no_grad():
+                        dest_tensor_on_rank_i[0].add_(to_scatter[i])
+
 class Broadcast:
     def __init__(self, src):
         self.src = src
@@ -176,6 +201,12 @@ def scatter(self, output_tensors, input_tensors, opts=ScatterOptions()):
         ProcessLocalGroup._end_coll(coll)
         return res
 
+    def reduce_scatter(self, output_tensor, scatter_list, opts=ReduceScatterOptions()):
+        coll = ProcessLocalGroup._start_coll(self._world, ReduceScatter(opts.reduceOp))
+        res = coll.join(self._rank, (output_tensor, scatter_list))
+        ProcessLocalGroup._end_coll(coll)
+        return res
+
     def __init__(self, rank, world):
         super(ProcessLocalGroup, self).__init__(rank, world)
         self._rank = rank

From 1921100f2e4e4287a97afff044c51ed659adc539 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Mon, 21 Nov 2022 22:43:58 +0000
Subject: [PATCH 1141/1922] [Dynamo] Fix bugs when calling tensor.data and
 tensor.layout (#89257)

Fix bugs in [7k github models](https://github.com/pytorch/torchdynamo/issues/1884).
* Legacy code still use ```tensor.data```, I think we can use ```tensor.detach``` to rewrite, not sure if there is anything I didn't anticipate.
* Support ```tensor.layout```.

The root cause of these issues are: dynamo wraps unimplemented ```tensor.x``` call into ```GetAttrVariable(TensorVariable, x)```, but this op was not inserted into FX graph. Hence, during the fake tensor propagation, it throws ```KeyError: 'example_value` ```.

For these two popular attributes, Dynamo should support them anyway. However, if dynamo should support ___all___ ```tensor.x``` call and not fallback to ```GetAttrVariable```, I think it's debatable.
If I turn off fake tensor propagation, it works well even not including this fix. So I'm curious if we should improve the fake propagation to cover similar cases. cc @mlazos @soumith @voznesenskym @penguinwu @anijain2305 @EikanWang @jgong5 @Guobing-Chen @chunyuan-w @XiaobingSuper @zhuhaozhe @blzheng @Xia-Weiwen @wenzhe-nrv @jiayisunx @desertfire @jansel @eellison

```
Traceback (most recent call last):
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/convert_frame.py", line 404, in _compile
    out_code = transform_code_object(code, transform)
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/bytecode_transformation.py", line 341, in transform_code_object
    transformations(instructions, code_options)
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/convert_frame.py", line 392, in transform
    tracer.run()
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/symbolic_convert.py", line 1523, in run
    super().run()
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/symbolic_convert.py", line 389, in run
    and self.step()
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/symbolic_convert.py", line 359, in step
    getattr(self, inst.opname)(inst)
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/symbolic_convert.py", line 193, in wrapper
    return inner_fn(self, inst)
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/symbolic_convert.py", line 865, in CALL_FUNCTION_KW
    self.call_function(fn, args, kwargs)
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/symbolic_convert.py", line 301, in call_function
    self.push(fn.call_function(self, args, kwargs))
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/variables/torch.py", line 407, in call_function
    tensor_variable = wrap_fx_proxy(
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/variables/builder.py", line 636, in wrap_fx_proxy
    return wrap_fx_proxy_cls(
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/variables/builder.py", line 676, in wrap_fx_proxy_cls
    example_value = get_fake_value(proxy.node, tx)
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/utils.py", line 1024, in get_fake_value
    args, kwargs = torch.fx.node.map_arg((node.args, node.kwargs), visit)
  File "/scratch/ybliang/work/repos/pytorch/torch/fx/node.py", line 613, in map_arg
    return map_aggregate(a, lambda x: fn(x) if isinstance(x, Node) else x)
  File "/scratch/ybliang/work/repos/pytorch/torch/fx/node.py", line 621, in map_aggregate
    t = tuple(map_aggregate(elem, fn) for elem in a)
  File "/scratch/ybliang/work/repos/pytorch/torch/fx/node.py", line 621, in <genexpr>
    t = tuple(map_aggregate(elem, fn) for elem in a)
  File "/scratch/ybliang/work/repos/pytorch/torch/fx/node.py", line 627, in map_aggregate
    return immutable_dict((k, map_aggregate(v, fn)) for k, v in a.items())
  File "/scratch/ybliang/work/repos/pytorch/torch/fx/node.py", line 627, in <genexpr>
    return immutable_dict((k, map_aggregate(v, fn)) for k, v in a.items())
  File "/scratch/ybliang/work/repos/pytorch/torch/fx/node.py", line 631, in map_aggregate
    return fn(a)
  File "/scratch/ybliang/work/repos/pytorch/torch/fx/node.py", line 613, in <lambda>
    return map_aggregate(a, lambda x: fn(x) if isinstance(x, Node) else x)
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/utils.py", line 1022, in visit
    return n.meta["example_value"]
KeyError: 'example_value\n\nfrom user code:\n   File "./generated/test_BayesWatch_pytorch_prunes.py", line 108, in forward\n    return torch.zeros([x.size()[0], self.channels, x.size()[2] // self.spatial, x.size()[3] // self.spatial], dtype=x.dtype, layout=x.layout, device=x.device)\n\nSet torch._dynamo.config.verbose=True for more information\n\n\nYou can suppress this exception and fall back to eager by setting:\n    torch._dynamo.config.suppress_errors = True\n'

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89257
Approved by: https://github.com/jansel
---
 test/dynamo/test_misc.py          | 26 ++++++++++++++++++++++++++
 torch/_dynamo/variables/tensor.py | 16 ++++++++++++----
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index e3274738fc21f..1a04f25e74043 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -1301,6 +1301,32 @@ def fn(x):
         self.assertTrue(same(ref0, res0))
         self.assertTrue(same(ref1, res1))
 
+    def test_tensor_data(self):
+        def fn(x, y):
+            return x[y.data]
+
+        x = torch.rand(8)
+        y = torch.ones(8).to(torch.int)
+        ref = fn(x, y)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(x, y)
+        self.assertTrue(same(ref, res))
+
+    def test_tensor_layout(self):
+        def fn(x):
+            return torch.zeros(
+                [x.size()[0], x.size()[1]],
+                dtype=x.dtype,
+                layout=x.layout,
+                device=x.device,
+            )
+
+        x = torch.rand(2, 3)
+        ref = fn(x)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(x)
+        self.assertTrue(same(ref, res))
+
     def test_version_ci(self):
         # temporary test to check that the ci torch version is set correctly
         self.assertTrue(hasattr(torch, "_subclasses"))
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index ab94aaf537d2d..84de57c0f2955 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -30,6 +30,7 @@ class TensorVariable(VariableTracker):
         "proxy",
         "dtype",
         "device",
+        "layout",
         "ndim",
         "size",
         "stride",
@@ -52,6 +53,7 @@ def __init__(
         proxy: torch.fx.Proxy,
         dtype=None,
         device=None,
+        layout=None,
         ndim=None,
         size=None,
         stride=None,
@@ -67,6 +69,7 @@ def __init__(
         self.proxy = proxy
         self.dtype = dtype
         self.device = device
+        self.layout = layout
         self.ndim = ndim
         self.size = size
         self.stride = stride
@@ -101,6 +104,7 @@ def specialize(value: torch.Tensor):
         props = {
             "dtype": value.dtype,
             "device": value.device,
+            "layout": value.layout,
             "ndim": int(value.ndim),
             "requires_grad": value.requires_grad,
             "is_quantized": value.is_quantized,
@@ -130,6 +134,8 @@ def var_getattr(self, tx, name):
             result = TorchVariable(self.dtype, **options)
         elif name == "device" and self.device is not None:
             result = TorchVariable(self.device, **options)
+        elif name == "layout" and self.layout is not None:
+            result = TorchVariable(self.layout, **options)
         elif name == "is_cuda" and self.device is not None:
             result = ConstantVariable(self.device.type == "cuda", **options)
         elif name == "shape" and self.size is not None:
@@ -145,6 +151,8 @@ def var_getattr(self, tx, name):
             result = self.call_method(tx, "size", [], {})
         elif name == "ndim" and self.ndim is None:
             result = self.call_method(tx, "dim", [], {})
+        elif name == "data":
+            result = self.call_method(tx, "detach", [], {})
         elif name == "T":
             args = [variables.ConstantVariable(i) for i in range(self.ndim - 1, -1, -1)]
             result = self.call_method(tx, "permute", args, {})
@@ -198,7 +206,7 @@ def call_method(
                 tx.output.create_proxy(
                     "call_method",
                     name,
-                    *proxy_args_kwargs([self] + args, kwargs),
+                    *proxy_args_kwargs([self] + list(args), kwargs),
                     current_tx=tx,
                 ),
                 **options,
@@ -277,7 +285,7 @@ def call_method(
             tx.output.create_proxy(
                 "call_function",
                 operator.setitem,
-                *proxy_args_kwargs([self] + args, kwargs),
+                *proxy_args_kwargs([self] + list(args), kwargs),
                 current_tx=tx,
             )
             return ConstantVariable(None, **options)
@@ -309,7 +317,7 @@ def call_method(
                 tx.output.create_proxy(
                     "call_method",
                     name,
-                    *proxy_args_kwargs([self] + args, kwargs),
+                    *proxy_args_kwargs([self] + list(args), kwargs),
                     current_tx=tx,
                 ),
                 **options,
@@ -329,7 +337,7 @@ def call_method(
                 tx.output.create_proxy(
                     "call_method",
                     name,
-                    *proxy_args_kwargs([self] + args, kwargs),
+                    *proxy_args_kwargs([self] + list(args), kwargs),
                     current_tx=tx,
                 ),
                 **options,

From ef5574bc724303a92b93b4048fa3d623071d2306 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Mon, 21 Nov 2022 22:56:13 +0000
Subject: [PATCH 1142/1922] Add commit hash to dynamo dashboard (#89462)

Title - also fix a small bug with dashboard outputs.

Sample: https://github.com/pytorch/torchdynamo/issues/1831#issuecomment-1322732698

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89462
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 843dbd12909aa..963dcf493705a 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -336,7 +336,7 @@ def generate_dropdown_comment(title, body):
     return str_io.getvalue()
 
 
-def build_summary():
+def build_summary(args):
     import git
 
     out_io = io.StringIO()
@@ -352,31 +352,36 @@ def print_commit_hash(path, name):
     def env_var(name):
         out_io.write(f"{name} = {os.environ[name]}\n")
 
-    out_io.write("## Commit hashes ##\n")
-    print_commit_hash(".", "torch._dynamo")
+    out_io.write("\n")
+    out_io.write("### Run name ###\n")
+    out_io.write(get_archive_name(args, args.dtypes[0]))
+    out_io.write("\n")
+
+    out_io.write("\n")
+    out_io.write("### Commit hashes ###\n")
     print_commit_hash("../pytorch", "pytorch")
     print_commit_hash("../functorch", "functorch")
     print_commit_hash("../torchbenchmark", "torchbench")
 
     out_io.write("\n")
-    out_io.write("## TorchDynamo config flags ##\n")
+    out_io.write("### TorchDynamo config flags ###\n")
     for key in dir(torch._dynamo.config):
         val = getattr(torch._dynamo.config, key)
         if not key.startswith("__") and isinstance(val, bool):
             out_io.write(f"torch._dynamo.config.{key} = {val}\n")
 
     out_io.write("\n")
-    out_io.write("## Torch version ##\n")
+    out_io.write("### Torch version ###\n")
     out_io.write(f"torch: {torch.__version__}\n")
 
     out_io.write("\n")
-    out_io.write("## Environment variables ##\n")
+    out_io.write("### Environment variables ###\n")
     env_var("TORCH_CUDA_ARCH_LIST")
     env_var("CUDA_HOME")
     env_var("USE_LLVM")
 
     out_io.write("\n")
-    out_io.write("## GPU details ##\n")
+    out_io.write("### GPU details ###\n")
     out_io.write(f"CUDNN VERSION: {torch.backends.cudnn.version()}\n")
     out_io.write(f"Number CUDA Devices: {torch.cuda.device_count()}\n")
     out_io.write(f"Device Name: {torch.cuda.get_device_name(0)}\n")
@@ -415,6 +420,12 @@ def default_archive_name(dtype):
     return f"{prefix}_performance_{dtype}_{randint(100, 999)}"
 
 
+def get_archive_name(args, dtype):
+    return (
+        default_archive_name(dtype) if args.archive_name is None else args.archive_name
+    )
+
+
 def archive(src_dir, dest_dir_prefix, archive_name, dtype):
     if archive_name is None:
         archive_name = default_archive_name(dtype)
@@ -810,7 +821,7 @@ def gen_summary_files(self):
 
 def parse_logs(args, dtypes, suites, devices, compilers, flag_compilers, output_dir):
     mode = get_mode(args)
-    build_summary()
+    build_summary(args)
 
     parser_class = ParsePerformanceLogs
     parser = parser_class(
@@ -965,13 +976,13 @@ def generate_comment(self):
                             f"suite: {suite}): {path}\n\n"
                         )
 
+            regressions_present = False
             for metric in [
                 "accuracy",
                 "speedup",
                 "compilation_latency",
                 "compression_ratio",
             ]:
-                regressions_present = False
                 dfs = []
                 for compiler in self.args.flag_compilers:
                     if last2[compiler] is None:
@@ -1148,11 +1159,7 @@ def __init__(self, args):
     def update_lookup_file(self):
         dtype = self.args.dtypes[0]
         day, _ = archive_data(self.args.archive_name)
-        target_dir = (
-            default_archive_name(dtype)
-            if self.args.archive_name is None
-            else self.args.archive_name
-        )
+        target_dir = get_archive_name(self.args, dtype)
         # Update lookup csv the folder to arhived logs
         subprocess.check_call(
             f'echo "{day},performance,{dtype},{target_dir}" >> {self.lookup_file}',
@@ -1198,6 +1205,7 @@ def gen_comment(self):
             "gh_metric_regression.txt",
             "gh_training.txt",
             "gh_graphs.txt",
+            "gh_build_summary.txt",
         ]
         all_lines = []
         for f in files:

From f6cf408994e6fc868e6caf7105107284db5a8446 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 21 Nov 2022 18:12:21 -0500
Subject: [PATCH 1143/1922] Suppress guards when creating fake tensors (#89349)

When we create fake tensors, we may call operators that introduce
guards, to accurately reconstruct views.  But these guards are spurious:
if a user is able to present a tensor that "looks the same", they have
implicitly fulfilled the contract that the view is creatable.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89349
Approved by: https://github.com/voznesenskym
---
 torch/_subclasses/fake_tensor.py         |  6 ++++-
 torch/fx/experimental/symbolic_shapes.py | 29 ++++++++++++++++++------
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 9a0ac050e6b94..758f4431f688e 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -230,7 +230,11 @@ def mk_fake_tensor(make_meta_t):
                     constant=t if make_constant else None,
                 )
 
-        out = self.meta_converter(t, shape_env=shape_env, callback=mk_fake_tensor)
+        ctx = contextlib.nullcontext()
+        if shape_env is not None:
+            ctx = shape_env.suppress_guards()
+        with ctx:
+            out = self.meta_converter(t, shape_env=shape_env, callback=mk_fake_tensor)
         if out is NotImplemented:
             raise UnsupportedFakeTensorException("meta converter nyi")
         if make_constant:
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index bd52760502c6b..f25302a883978 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -6,6 +6,8 @@
 import builtins
 import math
 import functools
+import threading
+from contextlib import contextmanager
 from functools import lru_cache, partial
 import traceback
 import collections
@@ -439,6 +441,18 @@ def __init__(self):
         # Duck-shaping says that if two input tensors have the same size,
         # they get assigned the same symbolic variable
         self.val_to_var: Dict[int, "sympy.Expr"] = {0: sympy.Integer(0), 1: sympy.Integer(1)}
+        self.tls = threading.local()
+
+    def _suppress_guards_tls(self):
+        return getattr(self.tls, "suppress_guards", False)
+
+    @contextmanager
+    def suppress_guards(self):
+        self.tls.suppress_guards = True
+        try:
+            yield
+        finally:
+            self.tls.suppress_guards = False
 
     def _get_key(self):
         """
@@ -673,11 +687,12 @@ def evaluate_expr(self, expr: "sympy.Expr"):
         # TODO: optimize this; avoid formatting traces until we need them
         # NB: drop two frames; evaluate_expr and the Sym* function that
         # actually called us
-        stack = ''.join(traceback.format_list(traceback.extract_stack()[:-2]))
-        if concrete_val is sympy.true:
-            self.guards.append((expr, stack))
-        elif concrete_val is sympy.false:
-            self.guards.append((sympy.Not(expr), stack))
-        else:
-            self.guards.append((sympy.Eq(expr, concrete_val), stack))
+        if not self._suppress_guards_tls():
+            stack = ''.join(traceback.format_list(traceback.extract_stack()[:-2]))
+            if concrete_val is sympy.true:
+                self.guards.append((expr, stack))
+            elif concrete_val is sympy.false:
+                self.guards.append((sympy.Not(expr), stack))
+            else:
+                self.guards.append((sympy.Eq(expr, concrete_val), stack))
         return concrete_val

From f1ff16aafa2f7fc1143ea082e75df3d4c530475d Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 22 Nov 2022 00:13:38 +0000
Subject: [PATCH 1144/1922] Mitigate flaky test_ops_fwd_gradients on macOS
 (#89410)

This has been flaky on macOS for a while ([hud](https://hud.pytorch.org/failure/RuntimeError%3A%20test_ops_fwd_gradients%20failed)) and I can reproduce this locally. The issue was raised by https://github.com/pytorch/pytorch/issues/66033 and it seems to point to macos itself https://github.com/graphia-app/graphia/issues/33.  So switching to single thread when running `test_ops_fwd_gradients` on macOS as a mitigation for the flaky tests.

### Testing

`pytest test_ops_fwd_gradients.py -k test_fn_fwgrad_bwgrad -vv --flake-finder` to run all `test_fn_fwgrad_bwgrad` tests 50 times to make sure they all pass (no flaky anymore)

https://hud.pytorch.org/tests shows that `test_ops_fwd_gradients` on macOS takes about 15m to finish or 8 minute if using 2 shards like in the test.  There is no obvious difference in the test duration:

```
2022-11-21T21:34:18.6078080Z Running test_ops_fwd_gradients ... [2022-11-21 21:34:18.600663]
2022-11-21T21:34:21.6805770Z Executing ['/Users/runner/work/_temp/conda_environment_3517515737/bin/python', '-bb', 'test_ops_fwd_gradients.py', '-v', '--use-pytest', '-vv', '-rfEX', '-x', '--reruns=2', '--shard-id=0', '--num-shards=2', '-k=not _linalg_cholesky_', '--import-slow-tests', '--import-disabled-tests'] ... [2022-11-21 21:34:21.680156]
2022-11-21T21:34:21.6806380Z Ignoring disabled issues:  []
2022-11-21T21:34:21.6815250Z Executing ['/Users/runner/work/_temp/conda_environment_3517515737/bin/python', '-bb', 'test_ops_fwd_gradients.py', '-v', '--use-pytest', '-vv', '-rfEX', '-x', '--reruns=2', '--shard-id=1', '--num-shards=2', '-k=not _linalg_cholesky_', '--import-slow-tests', '--import-disabled-tests'] ... [2022-11-21 21:34:21.681174]
2022-11-21T21:34:21.6815830Z Ignoring disabled issues:  []
.....
2022-11-21T21:40:42.2422700Z =============================== warnings summary ===============================
.....
2022-11-21T21:40:42.2424670Z - generated xml file: /Users/runner/work/pytorch/pytorch/test/test-reports/python-pytest/test_ops_fwd_gradients/test_ops_fwd_gradients-47b619449ea7db1f.xml -
2022-11-21T21:40:42.2424850Z = 831 passed, 596 skipped, 5 deselected, 17 xfailed, 1 warning in 374.54s (0:06:14) =
.....
2022-11-21T21:42:00.1923310Z =============================== warnings summary ===============================
.....
2022-11-21T21:42:00.1925370Z - generated xml file: /Users/runner/work/pytorch/pytorch/test/test-reports/python-pytest/test_ops_fwd_gradients/test_ops_fwd_gradients-d24ee6419a602a6e.xml -
2022-11-21T21:42:00.1925540Z = 828 passed, 603 skipped, 7 deselected, 20 xfailed, 1 warning in 452.94s (0:07:32) =
....
2022-11-21T21:42:09.9035670Z FINISHED PRINTING LOG FILE of test_ops_fwd_gradients (/Users/runner/work/pytorch/pytorch/test/test-reports/test_ops_fwd_gradients_ha_3rfhb)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89410
Approved by: https://github.com/soulitzer
---
 test/test_ops_fwd_gradients.py                        | 8 +++++++-
 torch/testing/_internal/common_methods_invocations.py | 9 ++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/test/test_ops_fwd_gradients.py b/test/test_ops_fwd_gradients.py
index c3fca7235461f..4b7b1c785d5f0 100644
--- a/test/test_ops_fwd_gradients.py
+++ b/test/test_ops_fwd_gradients.py
@@ -4,7 +4,7 @@
 import torch
 
 from torch.testing._internal.common_utils import (
-    TestGradients, run_tests, skipIfTorchInductor)
+    TestGradients, run_tests, skipIfTorchInductor, IS_MACOS)
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_device_type import \
     (instantiate_device_type_tests, ops, OpDTypes)
@@ -12,6 +12,12 @@
 # TODO: fixme https://github.com/pytorch/pytorch/issues/68972
 torch.set_default_dtype(torch.float32)
 
+# TODO: mitigate flaky issue on macOS https://github.com/pytorch/pytorch/issues/66033
+# AFAIK, c10::ThreadPool looks correct in the way it uses condition_variable wait. The
+# issue seems to point to macOS itself https://github.com/graphia-app/graphia/issues/33
+if IS_MACOS:
+    torch.set_num_threads(1)
+
 # gradcheck requires double precision
 _gradcheck_ops = partial(ops, dtypes=OpDTypes.supported,
                          allowed_dtypes=[torch.double, torch.cdouble])
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 998f1cde65f7d..c0c53efa503e3 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -11711,7 +11711,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # (see sample_inputs_max_unpool_grad to find out more).
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
-               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD',
+                            active_if=(not IS_MACOS)),
                DecorateInfo(unittest.skip("Skipped!"), 'TestCompositeCompliance', 'test_forward_ad',
                             device_type='cpu'),
            )),
@@ -11743,7 +11744,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # and if there are several indices pointing to the same memory,
                # gradcheck is oblivious about that and cannot perturb them all at once
                # (see sample_inputs_max_unpool_grad to find out more).
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD',
+                            active_if=(not IS_MACOS)),
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),
                DecorateInfo(unittest.skip("Skipped!"), 'TestCompositeCompliance', 'test_forward_ad'),
@@ -11780,7 +11782,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # and if there are several indices pointing to the same memory,
                # gradcheck is oblivious about that and cannot perturb them all at once
                # (see sample_inputs_max_unpool_grad to find out more).
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD',
+                            active_if=(not IS_MACOS)),
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),
                DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad'),

From 38fe673a0a491b070147ba81cbde3d846bdcd639 Mon Sep 17 00:00:00 2001
From: Khushi <khushiagrawal411@gmail.com>
Date: Tue, 22 Nov 2022 00:15:30 +0000
Subject: [PATCH 1145/1922] [fix] tril & tril : out of bound check (#89384)

Fixes #83326

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89384
Approved by: https://github.com/ngimel
---
 aten/src/ATen/native/TriangularOps.cpp                | 2 ++
 test/functorch/test_vmap.py                           | 4 ++++
 torch/testing/_internal/common_methods_invocations.py | 8 ++++++++
 3 files changed, 14 insertions(+)

diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp
index fbdd204f64307..59d2b8a0d224b 100644
--- a/aten/src/ATen/native/TriangularOps.cpp
+++ b/aten/src/ATen/native/TriangularOps.cpp
@@ -23,10 +23,12 @@ namespace at {
 namespace meta {
 
 TORCH_META_FUNC(tril)(const Tensor& self, int64_t k) {
+  TORCH_CHECK(self.dim() >= 2, "tril: input tensor must have at least 2 dimensions")
   set_output_raw_strided(0, self.sizes(), {}, self.options());
 }
 
 TORCH_META_FUNC(triu)(const Tensor& self, int64_t k) {
+  TORCH_CHECK(self.dim() >= 2, "triu: input tensor must have at least 2 dimensions")
   set_output_raw_strided(0, self.sizes(), {}, self.options());
 }
 
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 0c38c5101cf86..4c2c680ca6371 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3295,6 +3295,8 @@ def test():
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps('TestVmapOperatorsOpInfo', 'test_vmap_exhaustive', vmap_fail.union({
         xfail('native_batch_norm'),
+        xfail('tril'),  # Exception not raised on error input
+        xfail('triu'),  # Exception not raised on error input
         # The error inputs are vectors, that pass when batched as they are treated as a matrix
         xfail('trace'),
     }))
@@ -3342,6 +3344,8 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('tensor_split'),
         xfail('to_sparse'),
         xfail('vdot'),
+        xfail('tril'),  # Exception not raised on error input
+        xfail('triu'),  # Exception not raised on error input
         xfail('__getitem__', ''),
         xfail('all'),
         xfail('any'),
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index c0c53efa503e3..3d3c13bb7208e 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -6087,6 +6087,12 @@ def sample_inputs_tril_triu(op_info, device, dtype, requires_grad, **kwargs):
     for shape, args in cases:
         yield SampleInput(make_arg(shape), args=args)
 
+def error_inputs_tril_triu(opinfo, device, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=torch.float32)
+
+    # error inputs for input.ndim <= 2
+    yield ErrorInput(SampleInput(make_arg((4,))), error_regex="input tensor must have at least 2 dimensions")
+
 def sample_inputs_trilu_indices(op_info, device, dtype, requires_grad, **kwargs):
     # (row, col, offset)
     args_list = ((0, 0),
@@ -15371,12 +15377,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
+           error_inputs_func=error_inputs_tril_triu,
            sample_inputs_func=sample_inputs_tril_triu),
     OpInfo('triu',
            dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
            dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half),
            supports_forward_ad=True,
            supports_fwgrad_bwgrad=True,
+           error_inputs_func=error_inputs_tril_triu,
            sample_inputs_func=sample_inputs_tril_triu),
     OpInfo('triu_indices',
            dtypes=_dispatch_dtypes((torch.int32, torch.int64)),

From b957f44afae1462eddc3dd9c3a871ea4aea123a0 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 21 Nov 2022 16:04:46 -0500
Subject: [PATCH 1146/1922] Bind DispatchKey.Functionalonalize in pybind11
 (#89452)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89452
Approved by: https://github.com/albanD, https://github.com/bdhirsh
---
 torch/csrc/utils/python_dispatch.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp
index 381e82e1fcdbc..e4ce9ccf52175 100644
--- a/torch/csrc/utils/python_dispatch.cpp
+++ b/torch/csrc/utils/python_dispatch.cpp
@@ -495,6 +495,7 @@ void initDispatchBindings(PyObject* module) {
       DEF_ONE(FuncTorchDynamicLayerFrontMode)
       DEF_ONE(FuncTorchDynamicLayerBackMode)
       DEF_ONE(PythonDispatcher)
+      DEF_ONE(Functionalize)
   // clang-format on
 
 #define DEF_SINGLE(n, prefix) .value(#prefix #n, c10::DispatchKey::prefix##n)

From 87a9129b971320e887c10689113f0968b482561e Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Mon, 21 Nov 2022 21:37:32 +0000
Subject: [PATCH 1147/1922] Add torchvis support to dist bench (#89324)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89324
Approved by: https://github.com/davidberard98, https://github.com/albanD
---
 benchmarks/dynamo/distributed.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py
index 360fd846dbe8a..dee44210e93c8 100644
--- a/benchmarks/dynamo/distributed.py
+++ b/benchmarks/dynamo/distributed.py
@@ -18,6 +18,17 @@
     from dist_util import apply_fsdp, cleanup, get_model, model_iter_fn, setup
 
 
+def torchviz_model(args, model, inputs, rank):
+    from torchviz import make_dot
+
+    outputs = model(*inputs)
+    loss = reduce_to_scalar_loss(outputs)
+    parameter_names = dict(model.named_parameters())
+    dot = make_dot(loss, params=parameter_names, show_attrs=True, show_saved=True)
+    if rank == 0:
+        dot.render("torchviz.dot")
+
+
 def profile_model(args, model, inputs, rank):
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
         for i in range(args.repeat):
@@ -87,7 +98,8 @@ def print_compile(gm, ex):
     t_total = timed(
         model, model_iter_fn, inputs, times=args.repeat, return_result=False
     )
-
+    if args.torchviz:
+        torchviz_model(args, model, inputs, rank)
     if args.profile:
         profile_model(args, model, inputs, rank)
 
@@ -105,6 +117,9 @@ def print_compile(gm, ex):
     )
     parser.add_argument("--verbose", action="store_true")
     parser.add_argument("--batch_size", default=None)
+    parser.add_argument(
+        "--torchviz", action="store_true", help="Dump autograd graph with torchviz"
+    )
     parser.add_argument("--profile", action="store_true", help="Run the profiler")
     parser.add_argument("--trace_file", default="profile.json", help="Run the profiler")
     parser.add_argument("--repeat", default=10, help="Repeats for timing run")

From a3d3cdfed404f342dfd523bd25b329bbdc27957f Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Mon, 21 Nov 2022 11:05:38 -0800
Subject: [PATCH 1148/1922] [17/N] Add _reduce_scatter_base custom op with
 CPU/CUDA implementation (#88903)

Differential Revision: [D41415325](https://our.internmc.facebook.com/intern/diff/D41415325)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88903
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_nccl.py      | 15 ++++++++++
 torch/csrc/distributed/c10d/Ops.cpp     | 38 +++++++++++++++++++++++++
 torch/csrc/distributed/c10d/Ops.hpp     |  6 ++++
 torch/csrc/distributed/c10d/OpsImpl.cpp | 34 ++++++++++++++++++++++
 torch/csrc/distributed/c10d/init.cpp    |  8 +++++-
 5 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index fb28e744b5ed9..85ebb6b75bc5f 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -2974,6 +2974,21 @@ def test_allgather_base(self):
         dist.all_gather_into_tensor(output_tensor, tensor)
         self.assertEqual(output_tensor, tensor)
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(1)
+    def test_reduce_scatter_base(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "nccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        device = "cuda"
+        tensor = torch.ones(10, 10, device=torch.device(device))
+        output_tensor = torch.zeros(10, 10, device=torch.device(device))
+        dist.reduce_scatter_tensor(output_tensor, tensor)
+        self.assertEqual(output_tensor, tensor)
 
 if __name__ == "__main__":
     assert (
diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp
index f825afca2a1d9..5d343c344ec81 100644
--- a/torch/csrc/distributed/c10d/Ops.cpp
+++ b/torch/csrc/distributed/c10d/Ops.cpp
@@ -111,6 +111,19 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> reduce_scatter_(
       output_tensors, work);
 }
 
+c10::intrusive_ptr<Work> _reduce_scatter_base_(
+    at::Tensor& output_tensor,
+    at::Tensor& input_tensor,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  return process_group->_reduce_scatter_base(
+      output_tensor,
+      input_tensor,
+      ReduceScatterOptions{
+          *reduce_op.get(), std::chrono::milliseconds(timeout)});
+}
+
 c10::intrusive_ptr<Work> gather_(
     const std::vector<std::vector<at::Tensor>>& output_tensors,
     const std::vector<at::Tensor>& input_tensors,
@@ -210,6 +223,10 @@ TORCH_LIBRARY(c10d, m) {
   m.def(
       "reduce_scatter_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, reduce_scatter_));
+  m.def(
+      "_reduce_scatter_base_",
+      dispatch(
+          c10::DispatchKey::CompositeExplicitAutograd, _reduce_scatter_base_));
   m.def(
       "reduce_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, reduce_));
@@ -350,6 +367,27 @@ c10::intrusive_ptr<Work> reduce_scatter(
       opts.timeout.count()));
 }
 
+c10::intrusive_ptr<Work> _reduce_scatter_base(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::Tensor& output_tensor,
+    at::Tensor& input_tensor,
+    const ReduceScatterOptions& opts) {
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("c10d::_reduce_scatter_base_", "")
+                       .typed<c10::intrusive_ptr<Work>(
+                           at::Tensor&,
+                           at::Tensor&,
+                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
+                           const c10::intrusive_ptr<::c10d::ReduceOp>&,
+                           int64_t)>();
+  return op.call(
+      output_tensor,
+      input_tensor,
+      process_group,
+      c10::make_intrusive<::c10d::ReduceOp>(opts.reduceOp),
+      opts.timeout.count());
+}
+
 c10::intrusive_ptr<Work> reduce(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     at::TensorList tensors,
diff --git a/torch/csrc/distributed/c10d/Ops.hpp b/torch/csrc/distributed/c10d/Ops.hpp
index 72f09e341d7df..f6425e0ea3504 100644
--- a/torch/csrc/distributed/c10d/Ops.hpp
+++ b/torch/csrc/distributed/c10d/Ops.hpp
@@ -44,6 +44,12 @@ TORCH_API c10::intrusive_ptr<Work> reduce_scatter(
     const std::vector<std::vector<at::Tensor>>& input_tensors,
     const ReduceScatterOptions& opts = {});
 
+TORCH_API c10::intrusive_ptr<Work> _reduce_scatter_base(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+     at::Tensor& output_tensor,
+     at::Tensor& input_tensor,
+    const ReduceScatterOptions& opts = {});
+
 TORCH_API c10::intrusive_ptr<Work> reduce(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     at::TensorList tensors,
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index 78e26c9656d8d..c3db5c438124a 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -259,6 +259,32 @@ reduce_scatter_cuda_(
       output_tensors, work);
 }
 
+c10::intrusive_ptr<Work> _reduce_scatter_base_cpu_(
+    at::Tensor& output_tensor,
+    at::Tensor& input_tensor,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  return process_group->_reduce_scatter_base(
+      output_tensor,
+      input_tensor,
+      ReduceScatterOptions{
+          *reduce_op.get(), std::chrono::milliseconds(timeout)});
+}
+
+c10::intrusive_ptr<Work> _reduce_scatter_base_cuda_(
+    at::Tensor& output_tensor,
+    at::Tensor& input_tensor,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const c10::intrusive_ptr<ReduceOp>& reduce_op,
+    int64_t timeout) {
+  return process_group->_reduce_scatter_base(
+      output_tensor,
+      input_tensor,
+      ReduceScatterOptions{
+          *reduce_op.get(), std::chrono::milliseconds(timeout)});
+}
+
 c10::intrusive_ptr<Work> gather_cpu_(
     const std::vector<std::vector<at::Tensor>>& output_tensors,
     const std::vector<at::Tensor>& input_tensors,
@@ -439,6 +465,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("reduce_scatter_", reduce_scatter_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("_reduce_scatter_base_", _reduce_scatter_base_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("_reduce_scatter_base_", _reduce_scatter_base_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("gather_", gather_cpu_);
 }
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index d39fc322d326b..ae98000112fc5 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1411,7 +1411,13 @@ that adds a prefix to each key inserted to the store.
 
           .def(
               "_reduce_scatter_base",
-              &::c10d::ProcessGroup::_reduce_scatter_base,
+              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
+                 at::Tensor& output_tensor,
+                 at::Tensor& input_tensor,
+                 const ::c10d::ReduceScatterOptions& opts) {
+                return ::c10d::ops::_reduce_scatter_base(
+                    self, output_tensor, input_tensor, opts);
+              },
               py::arg("outputTensor"),
               py::arg("inputTensor"),
               py::arg("opts") = ::c10d::ReduceScatterOptions(),

From 589df80f4a08d35d0fc99203c20d3579f74a54aa Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 21 Nov 2022 17:54:25 -0500
Subject: [PATCH 1149/1922] dont clone symints, dont clobber symint proxies
 (#88230)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88230
Approved by: https://github.com/albanD
---
 test/functorch/test_aotdispatch.py       |  7 +------
 torch/fx/experimental/proxy_tensor.py    | 13 ++++++++++++-
 torch/fx/experimental/symbolic_shapes.py |  2 +-
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index e03fe1e153851..84b1ba893cce0 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -694,7 +694,7 @@ def f(a, b, c, d):
             #
             # TODO(whc)- are the saved-tensors/saved-symints correct here?
             # i just made the test pass based on what default partition did
-            [False, True, True, False, False] + [False] * 5 + [True] * 3,
+            [False, True, True, False, False] + [False] * 4 + [True] * 4,
             [is_sym_node(n) for n in fw_graph_out_nodes]
         )
 
@@ -996,7 +996,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('addr', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('amax', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('amin', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('as_strided', ''),  # Tensor-likes are not close!
     xfail('baddbmm', ''),  # aten.baddbmm.default - couldn't find symbolic meta function/decomposition
     xfail('block_diag', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('cartesian_prod', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
@@ -1102,10 +1101,6 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('min', 'reduction_with_dim'),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('mode', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('mv', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-
-    # Deleting this in a followup
-    xfail('nn.functional.poisson_nll_loss', ''),
-
     xfail('nn.functional._scaled_dot_product_attention', ''),  # Cannot call sizes() on tensor with symbolic ...
     xfail('nn.functional.adaptive_avg_pool3d', ''),  # aten._adaptive_avg_pool3d_backward.default - couldn't ...
     xfail('nn.functional.adaptive_max_pool1d', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py
index daa17f94b7bb0..012984ebe6f05 100644
--- a/torch/fx/experimental/proxy_tensor.py
+++ b/torch/fx/experimental/proxy_tensor.py
@@ -65,7 +65,18 @@ def set_proxy_slot(obj, tracer, proxy):
     assert isinstance(obj, (torch.Tensor, SymNode)), type(obj)
     d = obj.__dict__.setdefault(proxy_slot, weakref.WeakKeyDictionary())  # type: ignore[call-overload]
     assert isinstance(d, weakref.WeakKeyDictionary)
-    d[tracer] = proxy
+    # NB: Never clobber pre-existing proxy.  Although the proxies
+    # are in principle equivalent, when we do graph partitioning
+    # we need there not to be spurious dependencies on tangent inputs.
+    # This works because primals get their SymInts set first, and
+    # THEN later we allocate tangent inputs.  Make sure if a SymInt
+    # is derivable from a primal that we use that.
+    #
+    # However, we DO want to clobber proxies whenever we run an inplace operation
+    # on a tensor, and it affects the metadata on the proxy.
+    # This doesn't really apply to SymInts/SymFloats though, which are immutable.
+    if tracer not in d or isinstance(obj, torch.Tensor):
+        d[tracer] = proxy
 
 def has_proxy_slot(obj, tracer):
     assert isinstance(obj, (torch.Tensor, SymNode)), type(obj)
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index f25302a883978..41121808e24e9 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -177,7 +177,7 @@ def wrap_float(self, num):
         return SymNode(sympy.Float(num), self.shape_env, float, constant=num)
 
     def clone(self):
-        return SymNode(self.expr, self.shape_env, self.pytype, constant=self.constant)
+        return self
 
     def str(self):
         return f"{self.expr}"

From 7dffba3eec5e85ef788e94d0b21a6bfda2544d4b Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 22 Nov 2022 02:20:45 +0000
Subject: [PATCH 1150/1922] Revert "Added conv constraint that infers layouts
 (#89031)" (#89451)

This reverts commit 716f70f19a4b63268da2a753afdbe9b385a831ab.

Fixes performance regression and compilation latency increase.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89451
Approved by: https://github.com/soumith, https://github.com/jansel
---
 test/inductor/test_torchinductor.py |   4 +-
 torch/_inductor/graph.py            |  29 +------
 torch/_inductor/ir.py               |   3 -
 torch/_inductor/lowering.py         | 118 ++++++++++++++++++----------
 4 files changed, 81 insertions(+), 73 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 2196f4f8a0264..0d28f156ecc08 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -68,6 +68,7 @@
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 aten = torch.ops.aten
+
 requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
 
 torch._inductor.config.triton.autotune = False  # too slow
@@ -5309,8 +5310,6 @@ def get_kernels(self, fn, args) -> typing.List[CachingAutotuner]:
             return kernels
 
         def test_divisibile_by_16_covers_numel_args(self):
-            torch._dynamo.reset()
-
             def fn(a: torch.Tensor) -> torch.Tensor:
                 return torch.sum(a)
 
@@ -5330,7 +5329,6 @@ def fn(a: torch.Tensor) -> torch.Tensor:
                 kernels[1].meta["configs"][0].divisible_by_16
             )
             self.assertEqual(arguments_that_are_divisible_by_16_in_kernel1, (0, 1))
-            torch._dynamo.reset()
 
 
 if __name__ == "__main__":
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index a47d9c1a02e11..7a5791de8a388 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -20,12 +20,7 @@
     MissingOperatorWithoutDecomp,
 )
 from .ir import Constant, FixedLayout, InputBuffer, Pointwise, Reduction, TensorBox
-from .lowering import (
-    layout_constraints,
-    lowerings,
-    make_fallback,
-    needs_realized_inputs,
-)
+from .lowering import lowerings, make_fallback, needs_realized_inputs
 from .sizevars import SizeVarAllocator
 from .utils import dynamo_utils, gather_origins, get_dtype_size, sympy_product
 from .virtualized import V
@@ -306,12 +301,7 @@ def finalize(self):
 
     def run_node(self, n: torch.fx.Node):
         with ir.IRNode.current_origins({n}):
-            if n.op == "call_function" and n.target in layout_constraints:
-                args, kwargs = self.fetch_args_kwargs_from_env(n)
-                args, kwargs = layout_constraints[n.target](n, *args, **kwargs)
-                result = self.call_function(n.target, args, kwargs)
-            else:
-                result = super().run_node(n)
+            result = super().run_node(n)
 
             # Realize if (1) any user need inputs realized, or (2) there is
             # already too many reads and rematerializing can be bad.
@@ -320,20 +310,7 @@ def run_node(self, n: torch.fx.Node):
                 for user in n.users:
                     if user.target in needs_realized_inputs:
                         result.realize_hint()
-                        # This inclusion is somewhat controversial (from
-                        # discussion between Horace, Natalia, and Elias).
-                        # Currently, it's not very clear why this is helpful.
-                        # The general idea here is that even though a node may
-                        # have FlexibleLayout, we still often *treat* it as if
-                        # it was contiguous. This appears to sometime result in
-                        # suboptimal behavior.
-                        #
-                        # When we do a better job selecting layout, we should
-                        # revisit this.
-                        result = ir.ExternKernel.require_stride_order(
-                            result, ir.get_stride_order(n.meta["val"].stride())
-                        )
-                    if user.op == "output":
+                    elif user.op == "output":
                         if isinstance(result.data.data, (Pointwise, Reduction)):
                             result.realize()
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index d547246717689..8327fe0d7b521 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2478,9 +2478,6 @@ def require_stride1(cls, x):
 
     @classmethod
     def require_stride_order(cls, x, order):
-        if x.get_numel() == 0:  # Layout doesn't matter
-            return x
-
         # require x to have the layout as strided_ordered as order
         if is_storage_and_layout(x):
             if isinstance(
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 0bd92007c9864..80743a563e731 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -23,6 +23,7 @@
 from .decomposition import decompositions, get_decompositions
 from .ir import (
     ExpandView,
+    get_stride_order,
     IndexingConstant,
     IndexingDiv,
     PermuteView,
@@ -37,7 +38,6 @@
 
 log = logging.getLogger(__name__)
 lowerings = {}
-layout_constraints = {}
 fallbacks = set()
 aten = torch.ops.aten
 prims = torch.ops.prims
@@ -53,14 +53,6 @@ def add_needs_realized_inputs(fn):
             needs_realized_inputs.add(getattr(fn, overload))
 
 
-def add_layout_constraint(fn, constraint):
-    if isinstance(fn, torch._ops.OpOverloadPacket):
-        for overload in fn.overloads():
-            layout_constraints[getattr(fn, overload)] = constraint
-    else:
-        layout_constraints[fn] = constraint
-
-
 add_needs_realized_inputs(
     [
         aten.as_strided,
@@ -1021,10 +1013,12 @@ def linear_binary(x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr):
 register_onednn_fusion_ops()
 
 
-def fallback_handler(kernel):
+def fallback_handler(kernel, inps_hook=None):
     fallbacks.add(kernel)
 
     def handler(*args, **kwargs):
+        if inps_hook is not None:
+            args, kwargs = inps_hook(*args, **kwargs)
         return pytree.tree_map(
             TensorBox.create, ir.FallbackKernel.create(kernel, *args, **kwargs)
         )
@@ -1032,7 +1026,7 @@ def handler(*args, **kwargs):
     return handler
 
 
-def make_fallback(kernel, layout_constraint=None):
+def make_fallback(kernel, inps_hook=None):
     assert (
         kernel not in decompositions
     ), f"both a fallback and a decomp for same kernel: {kernel}"
@@ -1042,9 +1036,9 @@ def make_fallback(kernel, layout_constraint=None):
         )
 
     add_needs_realized_inputs(kernel)
-    if layout_constraint is not None:
-        add_layout_constraint(kernel, layout_constraint)
-    return register_lowering(kernel, type_promotion_kind=None)(fallback_handler(kernel))
+    return register_lowering(kernel, type_promotion_kind=None)(
+        fallback_handler(kernel, inps_hook)
+    )
 
 
 @register_lowering(aten.native_dropout, type_promotion_kind=None)
@@ -1195,14 +1189,72 @@ def inner_fn(index):
     )
 
 
-def require_dense(_, *args, **kwargs):
+def conv_backward(*args, **kwargs):
+    # output striding complex and has a lot of build dependent options,
+    # take the output strides to determine what to set the inputs
+    with torch._subclasses.FakeTensorMode():
+        args_fake, kwargs_fake = pytree.tree_map_only(
+            ir.IRNode,
+            lambda t: ir.ir_node_to_tensor(t, guard_shape=False),
+            (args, kwargs),
+        )
+        output = aten.convolution_backward(*args_fake, **kwargs_fake)
+
+    def constraints(
+        grad_output,
+        input,
+        weight,
+        bias_sizes,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+        output_mask,
+    ):
+        out = (
+            output[0]
+            if output[0] is not None
+            else output[1]
+            if output[1] is not None
+            else output[2]
+        )
+        if out is not None:
+            stride_order = get_stride_order(out.stride())
+            grad_output = ir.ExternKernel.require_stride_order(
+                grad_output, stride_order
+            )
+            weight = ir.ExternKernel.require_stride_order(weight, stride_order)
+            # Only make input contiguous when it is necessary for the backwards computation
+            if output_mask[1]:
+                input = ir.ExternKernel.require_stride_order(input, stride_order)
+
+        return (
+            grad_output,
+            input,
+            weight,
+            bias_sizes,
+            stride,
+            padding,
+            dilation,
+            transposed,
+            output_padding,
+            groups,
+            output_mask,
+        ), {}
+
+    return constraints(*args, **kwargs)
+
+
+def require_dense(*args, **kwargs):
     args, kwargs = pytree.tree_map_only(
         ir.IRNode, lambda t: ir.ExternKernel.require_stride1(t), (args, kwargs)
     )
     return args, kwargs
 
 
-def require_contiguous(_, *args, **kwargs):
+def require_contiguous(*args, **kwargs):
     args, kwargs = pytree.tree_map_only(
         ir.IRNode, lambda t: ir.ExternKernel.require_contiguous(t), (args, kwargs)
     )
@@ -1212,42 +1264,26 @@ def require_contiguous(_, *args, **kwargs):
 if has_torchvision_roi_align():
     make_fallback(torch.ops.torchvision.roi_align)
 
-
-def constrain_to_fx_strides(fx_node, *args, **kwargs):
-    def apply_constraint(arg, fx_arg):
-        if isinstance(arg, ir.IRNode):
-            stride_order = ir.get_stride_order(fx_arg.meta["val"].stride())
-            return ir.ExternKernel.require_stride_order(arg, stride_order)
-        return arg
-
-    args = [apply_constraint(arg, fx_arg) for arg, fx_arg in zip(args, fx_node.args)]
-    kwargs = {k: apply_constraint(v, fx_node.kwargs[k]) for k, v in kwargs.items()}
-    return args, kwargs
-
-
 # TODO(jansel): we should implement decomps or lowerings for these
 # https://github.com/pytorch/torchdynamo/issues/327
 make_fallback(aten._adaptive_avg_pool2d_backward, require_dense)
-make_fallback(aten.convolution_backward, constrain_to_fx_strides)
+make_fallback(aten.convolution_backward, inps_hook=conv_backward)
 make_fallback(aten._cudnn_rnn, require_dense)
-make_fallback(aten._cudnn_rnn_backward, require_contiguous)
-make_fallback(aten.cumsum, require_dense)
-make_fallback(aten._embedding_bag, require_contiguous)
-make_fallback(aten._embedding_bag_forward_only, require_contiguous)
+make_fallback(aten._cudnn_rnn_backward, inps_hook=require_contiguous)
+make_fallback(aten.cumsum, inps_hook=require_dense)
+make_fallback(aten._embedding_bag, inps_hook=require_contiguous)
+make_fallback(aten._embedding_bag_forward_only, inps_hook=require_contiguous)
 make_fallback(aten._fused_moving_avg_obs_fq_helper)
 make_fallback(aten._fused_moving_avg_obs_fq_helper_functional)
-make_fallback(aten.grid_sampler_2d_backward, require_dense)
+make_fallback(aten.grid_sampler_2d_backward, inps_hook=require_dense)
 make_fallback(aten.randperm)
 make_fallback(aten.sort)
 make_fallback(aten.sort.stable)
 make_fallback(aten._sparse_coo_tensor_with_dims_and_tensors)
-make_fallback(aten._thnn_fused_lstm_cell, require_dense)
+make_fallback(aten._thnn_fused_lstm_cell, inps_hook=require_dense)
 make_fallback(aten.topk)
-make_fallback(aten.upsample_bicubic2d_backward, require_contiguous)
-make_fallback(aten.upsample_bilinear2d_backward, require_dense)
-
-
-add_layout_constraint(aten.convolution, constrain_to_fx_strides)
+make_fallback(aten.upsample_bicubic2d_backward, inps_hook=require_contiguous)
+make_fallback(aten.upsample_bilinear2d_backward, inps_hook=require_dense)
 
 
 @register_lowering(aten.convolution)

From baa31ae6719aa5e84d6afafe8bf40cfb90da4861 Mon Sep 17 00:00:00 2001
From: maxren <maxren@meta.com>
Date: Mon, 21 Nov 2022 10:58:05 -0800
Subject: [PATCH 1151/1922] [xnnpack][lite-int] Handle Constant Data (#89445)

Handling constant data for xnnpack delegation. This allows us to handle new modules like such:

```
class Module(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self._constant = torch.ones(4, 4, 4)

            def forward(self, x):
                return x + self._constant
```

this is the precursor work to handling convolution, as we need to serialize constant data(weights)

Differential Revision: [D41050349](https://our.internmc.facebook.com/intern/diff/D41050349/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89445
Approved by: https://github.com/digantdesai
---
 test/jit/xnnpack/test_xnnpack_delegate.py     | 28 +++++++++++++++
 .../xnnpack/compiler/xnn_compiler.cpp         | 13 ++++---
 .../xnnpack/serialization/serializer.cpp      | 35 +++++++++++--------
 .../xnnpack/serialization/serializer.h        | 17 ++++++---
 .../xnnpack/xnnpack_graph_builder.cpp         | 26 +++++++++++---
 5 files changed, 89 insertions(+), 30 deletions(-)

diff --git a/test/jit/xnnpack/test_xnnpack_delegate.py b/test/jit/xnnpack/test_xnnpack_delegate.py
index 997cc757e629d..c54d9ba1b0881 100644
--- a/test/jit/xnnpack/test_xnnpack_delegate.py
+++ b/test/jit/xnnpack/test_xnnpack_delegate.py
@@ -8,6 +8,34 @@
 torch.ops.load_library("//caffe2:xnnpack_backend")
 
 class TestXNNPackBackend(unittest.TestCase):
+    def test_xnnpack_constant_data(self):
+        class Module(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self._constant = torch.ones(4, 4, 4)
+
+            def forward(self, x):
+                return x + self._constant
+
+        scripted_module = torch.jit.script(Module())
+
+        lowered_module = torch._C._jit_to_backend(
+            "xnnpack",
+            scripted_module,
+            {
+                "forward": {
+                    "inputs" : [torch.randn(4, 4, 4)],
+                    "outputs": [torch.randn(4, 4, 4)]
+                }
+            }
+        )
+
+        for i in range(0, 20):
+            sample_input = torch.randn(4, 4, 4)
+            actual_output = scripted_module(sample_input)
+            expected_output = lowered_module(sample_input)
+            self.assertTrue(torch.allclose(actual_output, expected_output, atol=1e-03, rtol=1e-03))
+
     def test_xnnpack_lowering(self):
         class Module(torch.nn.Module):
             def __init__(self):
diff --git a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
index 0f654dff0ac00..a64bf35431fdd 100644
--- a/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
+++ b/torch/csrc/jit/backends/xnnpack/compiler/xnn_compiler.cpp
@@ -42,24 +42,23 @@ void XNNCompiler::compileModel(
       case fb_xnnpack::XValueUnion::XNNTensorValue: {
         auto tensor_value = value->xvalue_as_XNNTensorValue();
 
-        const void* data_ptr = nullptr;
-        auto buffer_idx = tensor_value->constant_buffer_idx();
-        if (buffer_idx != 0) {
-          // TODO: @maxren implement data handling
-          TORCH_CHECK(false, "Constant data handling not yet implemented")
-        }
         std::vector<size_t> dims_data;
         for (auto dim : *tensor_value->dims()) {
           dims_data.push_back(static_cast<size_t>(dim));
         }
 
         uint32_t id = XNN_INVALID_VALUE_ID;
+        const auto& constant_buffer = *flatbuffer_graph->constant_buffer();
+        auto buffer_idx = tensor_value->constant_buffer_idx();
+        const auto buffer_ptr = buffer_idx == 0
+            ? nullptr
+            : constant_buffer[buffer_idx]->storage()->data();
         status = xnn_define_tensor_value(
             /*subgraph=*/subgraph_ptr,
             /*datatype=*/xnn_datatype_fp32,
             /*num_dims=*/tensor_value->num_dims(),
             /*dims=*/dims_data.data(),
-            /*data=*/data_ptr,
+            /*data=*/buffer_ptr,
             /*external_id=*/tensor_value->external_id(),
             /*flags=*/tensor_value->flags(),
             /*id_out=*/&id);
diff --git a/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp b/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
index 63cb62c5698ea..637f7cdf4c521 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
+++ b/torch/csrc/jit/backends/xnnpack/serialization/serializer.cpp
@@ -24,26 +24,33 @@ void XNNSerializer::serializeAddNode(
   _nodes.push_back(flatbufferNode);
 }
 
+size_t XNNSerializer::serializeData(const uint8_t* data_ptr, size_t num_bytes) {
+  size_t constant_buffer_idx = 0;
+  // Handling the tensor _values with data
+  if (data_ptr != nullptr) {
+    // steps:
+    // 1. creating flatbuffer byte-vector for tensor data
+    auto storage = _builder.CreateVector(data_ptr, num_bytes);
+
+    // 2. put it in the common buffer
+    constant_buffer_idx = _constantBuffer.size();
+    _constantBuffer.emplace_back(CreateBuffer(_builder, storage));
+
+    // 3. record size into bufferSizes
+    _bufferSizes.push_back(num_bytes);
+    assert(_bufferSizes.size() == _constantBuffer.size());
+  }
+  return constant_buffer_idx;
+}
+
 void XNNSerializer::serializeTensorValue(
     uint32_t xnn_datatype,
     size_t num_dims,
     std::vector<size_t> dims,
-    void* data,
+    size_t data_buffer_idx,
     uint32_t external_id,
     uint32_t flags,
     uint32_t id_out) {
-  // we will reserve buffers without data to index 0
-  int constant_buffer_idx = 0;
-  // Handling the tensor _values with data
-  // TODO @maxren fill out when handling tensors with data
-  if (data != nullptr) {
-    assert(false); // not supported yet
-    // steps:
-    // 1. creating buffer to store the 16 bit aligned data
-    // 2. increment buffer_idx, to reflect no buffer being added
-    // 3. record size into bufferSizes
-  }
-
   std::vector<uint32_t> serialized_dims;
   serialized_dims.reserve(dims.size());
   for (auto dim : dims) {
@@ -55,7 +62,7 @@ void XNNSerializer::serializeTensorValue(
       XNNDatatype(xnn_datatype),
       num_dims,
       &serialized_dims,
-      constant_buffer_idx,
+      data_buffer_idx,
       external_id,
       flags,
       id_out);
diff --git a/torch/csrc/jit/backends/xnnpack/serialization/serializer.h b/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
index 08a3875d32673..5a683c3dc3233 100644
--- a/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
+++ b/torch/csrc/jit/backends/xnnpack/serialization/serializer.h
@@ -17,15 +17,18 @@ class XNNSerializer {
  public:
   // Constructors
   // initial buffersize of 1024 which will grow
-  // automatically
+  // automatically, constant buffer and buffer sizes initialized with dummy
+  // values as 0 index is reserved for non-constant tensors
   XNNSerializer() : XNNSerializer(1024) {}
 
   explicit XNNSerializer(size_t bufferSize)
       : _builder(bufferSize),
         _nodes(),
         _values(),
-        _constantBuffer(),
-        _bufferSizes() {}
+        _constantBuffer({CreateBuffer(
+            _builder,
+            {})}), // index 0 is reserved for non-const data
+        _bufferSizes({0}) {}
 
   // Serializing Nodes
 
@@ -43,7 +46,7 @@ class XNNSerializer {
       uint32_t xnn_datatype,
       size_t num_dims,
       std::vector<size_t> dims,
-      void* data,
+      size_t buffer_data_idx,
       uint32_t external_id,
       uint32_t flags,
       uint32_t id_out);
@@ -54,6 +57,12 @@ class XNNSerializer {
       std::vector<uint32_t> output_ids,
       size_t num_extern_ids);
 
+  // decoupled data serialization with tensor values. This way constant tensor
+  // data can be referenced by multiple intermediate tensors. This call
+  // serializes the num_bytes of the data_ptr and returns the index it was
+  // placed in.
+  size_t serializeData(const uint8_t* data_ptr, size_t num_bytes);
+
  private:
   // xnnpack version we are serializing
   const char* _version_sha1 = "ae108ef49aa5623b896fc93d4298c49d1750d9ba";
diff --git a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
index 45a4bd2fa7954..7c7bb2d02e4c2 100644
--- a/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
+++ b/torch/csrc/jit/backends/xnnpack/xnnpack_graph_builder.cpp
@@ -225,6 +225,22 @@ void XNNGraph::defineAllTensorValues() {
       // update flag for if tensor is either graph input/output
       uint32_t flags = 0;
 
+      // Check if value was produced by prim::Constant
+      void* value_data = nullptr;
+      size_t buffer_idx = 0;
+      size_t num_bytes = 0;
+      if (val->node()->kind() == prim::Constant) {
+        c10::optional<IValue> constant = val->node()->t(attr::value);
+        auto const_val = constant->toIValue().toTensor();
+        // Need tensor data to be contiguous for serialization
+        auto cont_const_val = const_val.contiguous();
+        value_data = cont_const_val.data_ptr();
+
+        num_bytes = const_val.storage().nbytes();
+        buffer_idx = _serializer.serializeData(
+            static_cast<const uint8_t*>(value_data), num_bytes);
+      }
+
       if (isGraphInput(val) || isGraphOutput(val)) {
         if (isGraphInput(val)) {
           flags |= XNN_VALUE_FLAG_EXTERNAL_INPUT;
@@ -239,21 +255,21 @@ void XNNGraph::defineAllTensorValues() {
           /*datatype=*/xnn_datatype_fp32,
           /*num_dims=*/num_dims,
           /*dims=*/tensor_shape.data(),
-          /*data=*/nullptr, // currently no constant data
+          /*data=*/value_data,
           /*external_id=*/ext_id,
           /*flags=*/flags,
           /*id_out=*/&id);
+      TORCH_CHECK(
+          status == xnn_status_success,
+          "failed to define xnn_tensor_id for: " + val->debugName());
       _serializer.serializeTensorValue(
           xnn_datatype_fp32,
           num_dims,
           tensor_shape,
-          nullptr,
+          buffer_idx,
           ext_id,
           flags,
           id);
-      TORCH_CHECK(
-          status == xnn_status_success,
-          "failed to define xnn_tensor_id for: " + val->debugName());
       _val_to_ids.insert({val, id});
     }
   }

From 4f885d1ba66b06765e6e42d9fe7556c1ca400b84 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 22 Nov 2022 02:23:21 +0000
Subject: [PATCH 1152/1922] [inductor][compilation time] Fallback when kernel
 size for avg/max pool is large (#89448)

This fixes compilation time for yolov3 from 400 seconds to 48 seconds. yolov3 has a 13x13 max_pool2d kernel, which was creating really large Triton code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89448
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 93 +++++++++++++++++++++++++++++
 torch/_dynamo/utils.py              |  4 +-
 torch/_inductor/codecache.py        | 14 ++++-
 torch/_inductor/lowering.py         | 64 ++++++++++++++++++++
 4 files changed, 173 insertions(+), 2 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 0d28f156ecc08..0aaf74886c7cf 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1920,6 +1920,7 @@ def fn(x):
         self.common(
             fn,
             (torch.randn(2, 4, 16, 16),),
+            check_lowp=False,
         )
 
         # lowering to avg_pool2d case
@@ -1934,6 +1935,19 @@ def fn(x):
             (torch.randn(2, 4, 6, 6),),
         )
 
+    def test_adaptive_avg_pool2d2(self):
+        # Big kernel size, use fallback
+        def fn(x):
+            return aten._adaptive_avg_pool2d(x, (4, 4))
+
+        torch._inductor.metrics.generated_kernel_count = 0
+        self.common(
+            fn,
+            (torch.randn(2, 4, 21, 21),),
+            check_lowp=False,
+        )
+        self.assertEqual(torch._inductor.metrics.generated_kernel_count, 0)
+
     def test_max_pool2d1(self):
         def fn(x):
             return aten.max_pool2d_with_indices(x, [3, 3], [2, 2])
@@ -1981,6 +1995,18 @@ def fn(x):
             (torch.randn([16, 64, 55, 55]),),
         )
 
+    def test_max_pool2d6(self):
+        # Too big kernel size, use fallback
+        def fn(x):
+            return aten.max_pool2d_with_indices(x, [13, 13], [])
+
+        torch._inductor.metrics.generated_kernel_count = 0
+        self.common(
+            fn,
+            (torch.randn([16, 64, 55, 55]),),
+        )
+        self.assertEqual(torch._inductor.metrics.generated_kernel_count, 0)
+
     def test_avg_pool2d1(self):
         def fn(x):
             return aten.avg_pool2d(x, [3, 3], [2, 2])
@@ -2035,6 +2061,18 @@ def fn(x):
             (-torch.arange(1 * 8 * 8, dtype=torch.float32).view(1, 1, 8, 8),),
         )
 
+    def test_avg_pool2d7(self):
+        # Large kernel size, use fallback
+        def fn(x):
+            return aten.avg_pool2d(x, [13, 13], [1, 1], [0, 0])
+
+        torch._inductor.metrics.generated_kernel_count = 0
+        self.common(
+            fn,
+            (-torch.arange(1 * 24 * 24, dtype=torch.float32).view(1, 1, 24, 24),),
+        )
+        self.assertEqual(torch._inductor.metrics.generated_kernel_count, 0)
+
     def test_alexnet_prefix(self):
         def forward(arg6, arg7, arg16):
             convolution = torch.ops.aten.convolution(
@@ -3936,6 +3974,7 @@ def fn(a, b, c):
                 a, b, [5, 5], [1, 1], [2, 2], [1, 1], False, c
             )
 
+        torch._inductor.metrics.generated_kernel_count = 0
         x = torch.randn([2, 64, 3, 4])
         result, indices = aten.max_pool2d_with_indices(
             x,
@@ -3953,6 +3992,34 @@ def fn(a, b, c):
                 indices,
             ],
         )
+        self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
+
+    def test_max_pool2d_with_indices_backward5(self):
+        # Window size is too big. Should fallback
+        def fn(a, b, c):
+            return aten.max_pool2d_with_indices_backward(
+                a, b, [13, 13], [1, 1], [2, 2], [1, 1], False, c
+            )
+
+        torch._inductor.metrics.generated_kernel_count = 0
+        x = torch.randn([2, 64, 20, 20])
+        result, indices = aten.max_pool2d_with_indices(
+            x,
+            [13, 13],
+            [1, 1],
+            2,
+            1,
+            False,
+        )
+        self.common(
+            fn,
+            [
+                torch.randn_like(result),
+                x,
+                indices,
+            ],
+        )
+        self.assertEqual(torch._inductor.metrics.generated_kernel_count, 0)
 
     def test_avg_pool2d_backward(self):
         def fn(a, b):
@@ -4009,6 +4076,7 @@ def fn(a, b):
                 None,
             )
 
+        torch._inductor.metrics.generated_kernel_count = 0
         self.common(
             fn,
             [
@@ -4016,6 +4084,31 @@ def fn(a, b):
                 torch.randn([1, 2016, 21, 21]),
             ],
         )
+        self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
+
+    def test_avg_pool2d_backward4(self):
+        def fn(a, b):
+            return aten.avg_pool2d_backward(
+                a,
+                b,
+                [13, 13],
+                [1, 1],
+                [0, 0],
+                True,
+                False,
+                None,
+            )
+
+        torch._inductor.metrics.generated_kernel_count = 0
+        self.common(
+            fn,
+            [
+                torch.randn([1, 16, 12, 12]),
+                torch.randn([1, 16, 24, 24]),
+            ],
+            check_lowp=False,
+        )
+        self.assertEqual(torch._inductor.metrics.generated_kernel_count, 0)
 
     def test_mm_views(self):
         def fn(a, b):
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 889bb5683b6b0..cbf5a0b46148d 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -88,7 +88,9 @@ def time_wrapper(*args, **kwargs):
             compilation_metrics[key] = []
         t0 = time.time()
         r = func(*args, **kwargs)
-        compilation_metrics[key].append(time.time() - t0)
+        latency = time.time() - t0
+        # print(f"Dynamo timer: key={key}, latency={latency:.2f} sec")
+        compilation_metrics[key].append(latency)
         return r
 
     return time_wrapper
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 232a611b06c6a..c020ff52f3af0 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -22,7 +22,6 @@
 
 import torch
 from torch.utils import cpp_extension
-
 from . import config, cuda_properties, exc
 
 LOCK_TIMEOUT = 600
@@ -449,17 +448,30 @@ def _load_kernel(source_code):
     return kernel
 
 
+def _load_kernel_name(source_code):
+    return TritonCodeCache.get_name(PyCodeCache.load(source_code))
+
+
 class TritonFuture:
     def __init__(self, source_code, future):
         self.source_code = source_code
         self.future = future
 
+    # @dynamo_utils.dynamo_timed
     def result(self):
+        t0 = time()
         if hasattr(self, "kernel"):
             return self.kernel
         # If the worker failed this will throw an exception.
         self.future.result()
         kernel = self.kernel = _load_kernel(self.source_code)
+        latency = time() - t0
+        if latency > 50:
+            name = _load_kernel_name(self.source_code)
+            log.warning(
+                f"Detected long compilation time of {latency} seconds for kernel name {name}"
+            )
+            log.warning(self.source_code)
         del self.source_code, self.future
         return kernel
 
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 80743a563e731..221f064e2e731 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -2580,6 +2580,9 @@ def pooling_size(x, i, kernel_size, stride, padding, ceil_mode):
     return x_out, ceil_mode
 
 
+fallback_max_pool2d_with_indices = fallback_handler(aten.max_pool2d_with_indices)
+
+
 @register_lowering(aten.max_pool2d_with_indices, type_promotion_kind=None)
 def max_pool2d_with_indices(
     x, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False
@@ -2608,6 +2611,13 @@ def max_pool2d_with_indices(
         x_loader = x.make_loader()
 
     new_size = list(batch) + [h_out, w_out]
+    window_size = kernel_size[0] * kernel_size[1]
+
+    if window_size > 25:
+        # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback.
+        return fallback_max_pool2d_with_indices(
+            x, kernel_size, stride, padding, dilation, ceil_mode
+        )
 
     def fn(idx, return_index):
         *prefix, bh, bw = idx
@@ -2645,6 +2655,11 @@ def fn(idx, return_index):
     return r1, r2
 
 
+fallback_max_pool2d_with_indices_backward = fallback_handler(
+    aten.max_pool2d_with_indices_backward
+)
+
+
 @register_lowering(aten.max_pool2d_with_indices_backward, type_promotion_kind=None)
 def max_pool2d_with_indices_backward(
     grad_output, x, kernel_size, stride, padding, dilation, ceil_mode, indices
@@ -2685,6 +2700,14 @@ def max_pool2d_with_indices_backward(
         ]
     )
 
+    window_size = h_window_size * w_window_size
+
+    if window_size > 25:
+        # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback.
+        return fallback_max_pool2d_with_indices_backward(
+            grad_output, x, kernel_size, stride, padding, dilation, ceil_mode, indices
+        )
+
     def fn(idx):
         *prefix, h, w = idx
         index_test = ops.index_expr(h * width + w, torch.int32)
@@ -2807,6 +2830,9 @@ def fn_sum(idx, loader):
     return fn_sum
 
 
+fallback_adaptive_avg_pool2d = fallback_handler(aten._adaptive_avg_pool2d)
+
+
 @register_lowering(aten._adaptive_avg_pool2d)
 def _adaptive_avg_pool2d(x, output_size):
     assert isinstance(x, TensorBox)
@@ -2846,6 +2872,11 @@ def end_index(index, out_dim, inp_dim):
     w_start_index = functools.partial(start_index, out_dim=w_out, inp_dim=w_in)
     w_end_index = functools.partial(end_index, out_dim=w_out, inp_dim=w_in)
 
+    window_size = h_kernel_max * w_kernel_max
+    if window_size > 25:
+        # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback.
+        return fallback_adaptive_avg_pool2d(x, output_size)
+
     fn_sum = _adaptive_pooling_idx_sum(
         [h_kernel_max, w_kernel_max],
         [h_start_index, w_start_index],
@@ -2916,6 +2947,9 @@ def fn(idx):
     return rv
 
 
+fallback_avg_pool2d = fallback_handler(aten.avg_pool2d)
+
+
 @register_lowering(aten.avg_pool2d, type_promotion_kind=None)
 def avg_pool2d(
     x,
@@ -2953,6 +2987,19 @@ def avg_pool2d(
     new_size = list(batch) + [h_out, w_out]
     dtype = x.get_dtype()
 
+    window_size = kernel_size[0] * kernel_size[1]
+    if window_size > 25:
+        # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback.
+        return fallback_avg_pool2d(
+            x,
+            kernel_size,
+            stride,
+            padding,
+            ceil_mode,
+            count_include_pad,
+            divisor_override,
+        )
+
     def fn_sum(idx, loader):
         *prefix, bh, bw = idx
         total = None
@@ -2992,6 +3039,9 @@ def fn(idx):
     return rv
 
 
+fallback_avg_pool2d_backward = fallback_handler(aten.avg_pool2d_backward)
+
+
 @register_lowering(aten.avg_pool2d_backward, type_promotion_kind=None)
 def avg_pool2d_backward(
     grad_output,
@@ -3045,6 +3095,20 @@ def avg_pool2d_backward(
         ]
     )
 
+    window_size = h_window_size * w_window_size
+    if window_size > 25:
+        # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback.
+        return fallback_avg_pool2d_backward(
+            grad_output,
+            x,
+            kernel_size,
+            stride,
+            padding,
+            ceil_mode,
+            count_include_pad,
+            divisor_override,
+        )
+
     def compute_pool_size_without_padding(ph, pw):
         """
         This computes the scaling factor that we will divide an element

From 23981de771e8f2bb15d01de45ef37612bec8dfa6 Mon Sep 17 00:00:00 2001
From: fduwjj <fduwjj@fb.com>
Date: Tue, 22 Nov 2022 03:05:50 +0000
Subject: [PATCH 1153/1922] [PT-D][Tensor Parallelism][2/N] Sync TP API change
 to PT prod (#89467)

This is part of TP Beta Release efforts.
ref: https://github.com/pytorch/tau/issues/576
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89467
Approved by: https://github.com/wanchaol
---
 .../_tensor/parallel/test_2d_parallel.py      |  27 +-
 .../_tensor/parallel/test_parallelize_api.py  |   4 +-
 .../_tensor/parallel/test_tp_examples.py      |  26 +-
 .../_tensor/parallel/test_tp_style.py         |   8 +-
 .../distributed/_tensor/parallel/__init__.py  |  30 +-
 torch/distributed/_tensor/parallel/api.py     | 326 ++++++++++++------
 torch/distributed/_tensor/parallel/style.py   |   8 +-
 torch/distributed/_tensor/parallel/utils.py   |  12 +-
 8 files changed, 280 insertions(+), 161 deletions(-)

diff --git a/test/distributed/_tensor/parallel/test_2d_parallel.py b/test/distributed/_tensor/parallel/test_2d_parallel.py
index 7a3779c296c3c..ea41d5388660e 100644
--- a/test/distributed/_tensor/parallel/test_2d_parallel.py
+++ b/test/distributed/_tensor/parallel/test_2d_parallel.py
@@ -16,6 +16,10 @@
     Shard,
     Replicate,
 )
+from torch.distributed._tensor.parallel import (
+    PairwiseParallel,
+    parallelize_module,
+)
 
 import torch.distributed.distributed_c10d as distributed_c10d
 
@@ -32,17 +36,6 @@
 TP_DEGREE = 2
 LR = 3e-5
 
-OPS_NOT_SHARD = [
-    "net3.weight",
-    "net3.bias",
-]
-
-SHARD_PARAMS = [
-    "net1.weight",
-    "net1.bias",
-    "net2.weight",
-]
-
 
 class SimpleModel(torch.nn.Module):
     def __init__(self):
@@ -108,10 +101,9 @@ def shard_module(m, pg):
     m.net2 = _aggregate_local_tensor(m.net2)
 
 
-def _shard_wrap_module(module, module_shard, fsdp_wrap, tp_pg, fsdp_pg):
+def _shard_wrap_module(module, module_shard, fsdp_wrap, mesh_2d, fsdp_pg):
     if module_shard:
-        # Fetch the module sharding planner.
-        shard_module(module, tp_pg)
+        parallelize_module(module, mesh_2d, PairwiseParallel(), tp_mesh_dim=1)
 
     if fsdp_wrap and module_shard:
         return FSDP(module, process_group=fsdp_pg)
@@ -134,11 +126,10 @@ def init_model(model_parallel_size=TP_DEGREE):
     )
 
     fsdp_pg = twod_mesh.get_dim_groups()[0]
-    tp_pg = twod_mesh.get_dim_groups()[1]
 
     # Create Input
-    model = _shard_wrap_module(model, True, True, tp_pg, fsdp_pg)
-    return model, tp_pg, fsdp_pg
+    model = _shard_wrap_module(model, True, True, twod_mesh, fsdp_pg)
+    return model, fsdp_pg
 
 
 def is_nested_tensor(val: Any) -> bool:
@@ -200,7 +191,7 @@ def test_2d_fsdp_integration_correctness(self) -> None:
         model = SimpleModel().cuda(self.rank)
         model = FSDP(model)
         torch.manual_seed(0)
-        model_2d, _, dp_pg = init_model()
+        model_2d, dp_pg = init_model()
 
         optim = torch.optim.Adam(model.parameters(), lr=0.0001)
         optim_2d = torch.optim.Adam(model_2d.parameters(), lr=0.0001)
diff --git a/test/distributed/_tensor/parallel/test_parallelize_api.py b/test/distributed/_tensor/parallel/test_parallelize_api.py
index fb3e8f4721c86..036f4ef79a491 100644
--- a/test/distributed/_tensor/parallel/test_parallelize_api.py
+++ b/test/distributed/_tensor/parallel/test_parallelize_api.py
@@ -86,7 +86,7 @@ def test_parallelize_mlp(self):
         device_mesh = DeviceMesh(
             self.device_type, torch.arange(self.world_size)
         )
-        _parallelize_mlp(model_tp, device_mesh, PairwiseParallel())
+        model_tp = _parallelize_mlp(model_tp, device_mesh, PairwiseParallel())
 
         # Ensure the parameter is properly distributed.
         self.assertEqual(
@@ -125,7 +125,7 @@ def __init__(self) -> None:
             _parallelize_mlp(model_tp, device_mesh, DummyParallel())
 
         with self.assertRaisesRegex(
-            RuntimeError, "We only support even number of Linear for MLP."
+            RuntimeError, "More than one nn.Linear needed for a MLP."
         ):
             _parallelize_mlp(
                 torch.nn.Linear(10, 5), device_mesh, PairwiseParallel()
diff --git a/test/distributed/_tensor/parallel/test_tp_examples.py b/test/distributed/_tensor/parallel/test_tp_examples.py
index 696171e4ca883..74cd44dfd57d8 100644
--- a/test/distributed/_tensor/parallel/test_tp_examples.py
+++ b/test/distributed/_tensor/parallel/test_tp_examples.py
@@ -11,18 +11,14 @@
     skip_unless_torch_gpu,
 )
 from torch.distributed._tensor import (
-    distribute_module,
     DeviceMesh,
     Replicate,
 )
 from torch.distributed._tensor.parallel import (
+    PairwiseParallel,
     TensorParallelMultiheadAttention,
-    tp_shard_self_attn,
-    replicate_input,
-    replicate_output,
+    parallelize_module,
 )
-from torch.distributed._tensor.parallel import PairwiseParallel
-from torch.distributed._tensor.parallel.api import _parallelize_mlp
 
 
 class MLPModule(torch.nn.Module):
@@ -70,7 +66,7 @@ def test_mlp_megatron_e2e(self):
             self.device_type,
             torch.arange(0, NUM_DEVICES),
         )
-        _parallelize_mlp(model_tp, device_mesh, PairwiseParallel())
+        model_tp = parallelize_module(model_tp, device_mesh, PairwiseParallel())
         optim = torch.optim.SGD(model.parameters(), lr=LR)
         optim_tp = torch.optim.SGD(model_tp.parameters(), lr=LR)
 
@@ -182,13 +178,7 @@ def test_self_attn_megatron_e2e(self):
 
         # Shard module and initialize optimizer.
         device_mesh = DeviceMesh(self.device_type, list(range(NUM_DEVICES)))
-        distribute_module(
-            model_tp,
-            device_mesh,
-            partition_fn=tp_shard_self_attn,
-            input_fn=replicate_input,
-            output_fn=replicate_output,
-        )
+        parallelize_module(model_tp, device_mesh, PairwiseParallel())
 
         device_mesh = model_tp.qkv.weight.device_mesh
         replicate = [Replicate()] * device_mesh.ndim
@@ -339,13 +329,7 @@ def test_self_attn_replacement_megatron_e2e(self):
 
         # Shard module and initialize optimizer.
         device_mesh = DeviceMesh(self.device_type, list(range(NUM_DEVICES)))
-        distribute_module(
-            model_tp,
-            device_mesh,
-            partition_fn=tp_shard_self_attn,
-            input_fn=replicate_input,
-            output_fn=replicate_output,
-        )
+        parallelize_module(model_tp, device_mesh, PairwiseParallel())
 
         device_mesh = model_tp.attn.qkv.weight.device_mesh
         replicate = [Replicate()] * device_mesh.ndim
diff --git a/test/distributed/_tensor/parallel/test_tp_style.py b/test/distributed/_tensor/parallel/test_tp_style.py
index 314fe470955b4..e52aef1a6f3f6 100644
--- a/test/distributed/_tensor/parallel/test_tp_style.py
+++ b/test/distributed/_tensor/parallel/test_tp_style.py
@@ -64,7 +64,9 @@ def test_make_input_shard_1d(self):
     def _test_prepare_output(
         self, func, spec, dim=None, device_mesh_input_none=False
     ):
-        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        device_mesh = DeviceMesh(
+            self.device_type, torch.arange(self.world_size)
+        )
         tensor = torch.rand(8, 16, device=self.device_type)
         dtensor = distribute_tensor(tensor, device_mesh, spec)
         device_mesh_input = None if device_mesh_input_none else device_mesh
@@ -135,7 +137,9 @@ def test_make_output_tensor(self):
     # Common logic for testing prepare output funcs errors.
     def _test_prepare_output_error(self, func):
         tensor = torch.rand(8, 16, device=self.device_type)
-        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        device_mesh = DeviceMesh(
+            self.device_type, torch.arange(self.world_size)
+        )
         dtensor = distribute_tensor(tensor, device_mesh, [Shard(0)])
         output = [dtensor]
         with self.assertRaisesRegex(
diff --git a/torch/distributed/_tensor/parallel/__init__.py b/torch/distributed/_tensor/parallel/__init__.py
index 0ef0e8ff0b9ef..3c72143f345f3 100644
--- a/torch/distributed/_tensor/parallel/__init__.py
+++ b/torch/distributed/_tensor/parallel/__init__.py
@@ -3,20 +3,32 @@
     TensorParallelMultiheadAttention,
 )
 
-from torch.distributed._tensor.parallel.api import (
-    tp_shard_self_attn,
-    replicate_input,
-    replicate_output,
-)
-
 from torch.distributed._tensor.parallel.style import (
+    ColwiseParallel,
     ParallelStyle,
     PairwiseParallel,
     RowwiseParallel,
-    ColwiseParallel,
-    make_input_shard_1d,
     make_input_replicate_1d,
-    make_output_shard_1d,
+    make_input_shard_1d,
     make_output_replicate_1d,
+    make_output_shard_1d,
     make_output_tensor,
 )
+
+from torch.distributed._tensor.parallel.api import (
+    parallelize_module,
+)
+
+__all__ = [
+    "ColwiseParallel",
+    "TensorParallelMultiheadAttention",
+    "ParallelStyle",
+    "PairwiseParallel",
+    "RowwiseParallel",
+    "make_input_replicate_1d",
+    "make_input_shard_1d",
+    "make_output_replicate_1d",
+    "make_output_tensor",
+    "make_output_shard_1d",
+    "parallelize_module",
+]
diff --git a/torch/distributed/_tensor/parallel/api.py b/torch/distributed/_tensor/parallel/api.py
index 68d444882c4c8..a7a896ebf8598 100644
--- a/torch/distributed/_tensor/parallel/api.py
+++ b/torch/distributed/_tensor/parallel/api.py
@@ -1,106 +1,127 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 import torch
 import torch.nn as nn
-from typing import Sequence, Tuple
+from typing import Union, Dict
 from torch.distributed._tensor import (
     distribute_module,
     distribute_tensor,
-    DTensor,
     Shard,
     Replicate,
     DeviceMesh,
-    Placement,
 )
 from torch.distributed._tensor.parallel import TensorParallelMultiheadAttention
-from torch.distributed._tensor.parallel.style import ParallelStyle, PairwiseParallel
+from torch.distributed._tensor.parallel.style import PairwiseParallel, ParallelStyle
 from torch.distributed._tensor.parallel.utils import _create_1d_device_mesh
 
 
-def replicate_input(
-    inputs: Sequence[torch.Tensor], device_mesh: DeviceMesh
-) -> Tuple[DTensor, ...]:
-    replicate = [Replicate()] * device_mesh.ndim
-    return tuple(
-        DTensor.from_local(tensor, device_mesh, replicate) for tensor in inputs
-    )
+__all__ = [
+    "parallelize_module",
+]
 
 
-def replicate_output(output: DTensor, device_mesh: DeviceMesh) -> torch.Tensor:
-    if isinstance(output, DTensor):
-        replicate = [Replicate()] * output.device_mesh.ndim
-        # TODO: can the output be left incontiguous?
-        return (
-            output.redistribute(output.device_mesh, replicate)
-            .to_local()
-            .contiguous()
-        )
+def parallelize_module(  # type: ignore[return]
+    module: nn.Module,
+    device_mesh: DeviceMesh,
+    parallelize_plan: Union[ParallelStyle, Dict[str, ParallelStyle]],
+    tp_mesh_dim: int = 0,
+) -> nn.Module:
+    """
+    The API to apply Tensor Parallelism (TP) in PyTorch. We parallelize module
+    or sub_modules based on a parallelize_plan which contains the parallel_style
+    which indicates how user want the module or sub_module to be parallelized.
+    User can also specify different parallel_style per module fully qualifed name (FQN).
+    The API supports 2D parallelism natively by accepting an n-dimension device_mesh
+    and users just need to specify the dimension where we perform tensor parallelism on.
 
+    Args:
+        module (nn.Module):
+            :class:`nn.Module` object to be parallelized.
+        device_mesh (DeviceMesh):
+            :class:`DeviceMesh` object which describes the mesh topology
+            of devices for the DTensor.
+        parallelize_plan (Union[ParallelStyle, Dict[str, ParallelStyle]]):
+            The plan used to parallelize the module. It can be either a
+            :class:`ParallelStyle` object which contains how
+            we prepare input/output for Tensor Parallelism or it can be a
+            dict of module FQN and its corresponding :class:`ParallelStyle` object.
+        tp_mesh_dim (int):
+            the dimension of ``device_mesh`` where we perform
+            Tensor Parallelism on.
 
-def tp_shard_self_attn(
-    name: str, module: nn.Module, device_mesh: DeviceMesh
-) -> None:
-    col_wise_sharding: Sequence[Placement] = [Shard(0)]
-    row_wise_sharding: Sequence[Placement] = [Shard(1)]
-    replicate: Sequence[Placement] = [Replicate()] * device_mesh.ndim
-
-    def _shard_self_attn_params(name: str, module: nn.Module) -> None:
-        if isinstance(module, nn.Linear):
-            if name == "qkv":
-                sharded_weight = nn.Parameter(
-                    distribute_tensor(
-                        module.weight, device_mesh, col_wise_sharding
-                    )
+    Return:
+        A :class:`nn.Module` object parallelized.
+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> from from torch.distributed._tensor.parallel import parallelize_module, PairwiseParallel
+        >>>
+        >>> # Define the module.
+        >>> m = Model(...)
+        >>> m = parallelize_module(m, PairwiseParallel())
+        >>>
+
+    .. warning::
+        ``PairwiseParallel`` comes with constraints for now. If you need finer
+        granularity, you need to pass in a dict of module FQN and parallel style instead.
+    """
+
+    if device_mesh.ndim > 1:
+        device_mesh = _create_1d_device_mesh(device_mesh, tp_mesh_dim)
+
+    if isinstance(parallelize_plan, ParallelStyle):
+        if _is_mha_for_pairwise_parallel(module):
+            return _parallelize_multihead_attn(module, device_mesh)
+        elif _is_mlp_for_pairwise_parallel(module):
+            return _parallelize_mlp(module, device_mesh)
+        else:
+            for n, m in module.named_children():
+                module.register_module(
+                    n, parallelize_module(m, device_mesh, parallelize_plan)
                 )
-                module.register_parameter("weight", sharded_weight)
-                if module.bias is not None:
-                    sharded_bias = nn.Parameter(
-                        distribute_tensor(
-                            module.bias, device_mesh, col_wise_sharding
-                        )
-                    )
-                    module.register_parameter("bias", sharded_bias)
-            elif name == "proj":
-                sharded_weight = nn.Parameter(
-                    distribute_tensor(
-                        module.weight, device_mesh, row_wise_sharding
-                    )
+            return module
+    # TODO: Add parallelize linear logic when https://github.com/pytorch/tau/pull/624/ merged.
+    elif isinstance(parallelize_plan, dict):
+        for module_path, parallelize_style in parallelize_plan.items():
+            sub_module = module.get_submodule(module_path)
+            module.register_module(  # type: ignore[call-arg] # pyre-ignore[20]
+                parallelize_module(  # type: ignore[arg-type]
+                    module_path, sub_module, device_mesh, parallelize_style  # type: ignore[arg-type] # pyre-ignore[6]
                 )
-                module.register_parameter("weight", sharded_weight)
-                if module.bias is not None:
-                    replicated_bias = nn.Parameter(
-                        distribute_tensor(module.bias, device_mesh, replicate)
-                    )
-                    module.register_parameter("bias", replicated_bias)
-
-    if isinstance(module, TensorParallelMultiheadAttention):  # shard TPMA
-        for n, m in module.named_children():
-            _shard_self_attn_params(n, m)
+            )
+            return module
     else:
-        for n, m in module.named_children():  # replace with TPMA
-            if isinstance(m, nn.MultiheadAttention):
-                tp_multi_head_attention = TensorParallelMultiheadAttention(
-                    m.embed_dim,
-                    m.num_heads,
-                    device=torch.device(device_mesh.device_type),
-                    tp_size=device_mesh.size(0),  # group size on dim 0
-                    add_bias_kv=m.bias_k is not None,
-                )
-                tp_multi_head_attention.copy(m)
-                module.register_module(n, tp_multi_head_attention)
+        raise RuntimeError(  # pyre-ignore[7]
+            f"Expect Union[ParallelStyle, Dict[str, ParallelStyle]] for parallelize_plan, {type(parallelize_plan)} found!"
+        )
 
 
-def _has_even_num_linears(module: nn.Module) -> bool:
+def _is_mha_for_pairwise_parallel(module: nn.Module) -> bool:
     """
-    We traverse through all the children of the given module and count the
-    number of Linear module. If the number is even, we return True.
+    Check whether the mha module is the one can be handled for Pairwise parallel.
+
+    Args:
+        module (nn.Module):
+            :class:``nn.Module`` object to be checked.
+
+    Return:
+        A boolean object which specifies whether the module is MHA supported by Pairwise parallel or not.
+    """
+    return isinstance(module, TensorParallelMultiheadAttention) or isinstance(
+        module, nn.MultiheadAttention
+    )
+
+
+def _is_mlp_for_pairwise_parallel(module: nn.Module) -> bool:
+    """
+    Traverse through all the immediate children of the given module and count the
+    number of Linear module. If the number is more than one, we return True.
 
     Args:
         module (nn.Module):
             :class:``nn.Module`` object to be traversed and counted.
 
     Return:
-        A boolean object which specifies whether the module contains
-        event-number of Linears in its children.
+        A boolean object which specifies whether the module is MLP or not.
 
     .. warning::
         The traversal is not recursive for now.
@@ -108,15 +129,66 @@ def _has_even_num_linears(module: nn.Module) -> bool:
     linear_submodules = list(
         filter(lambda x: isinstance(x, nn.Linear), module.children())
     )
-    return len(linear_submodules) > 0 and len(linear_submodules) % 2 == 0
+    return len(linear_submodules) > 1
 
 
-def _parallelize_mlp(
+def _rowwise_parallelize_linear_fn(
+    name: str,
+    module: nn.Module,
+    device_mesh: DeviceMesh,
+) -> None:
+    """
+    This function parallelizes the input :class:``nn.Linear`` module in :class:``RowwiseParallel`` style.
+
+    Args:
+        name (str): name of the input module.
+        module (nn.Module): the :class:``nn.Linear`` object to be parallelized.
+        device_mesh (DeviceMesh): :class:``DeviceMesh`` object which describes the mesh topology
+            of devices for the DTensor.
+
+    Return:
+        None
+    """
+    for name, param in module.named_parameters():
+        dist_spec = (
+            [Shard(1)] if name == "weight" else [Replicate()]  # type: ignore[list-item]
+        )
+        dist_param = torch.nn.Parameter(
+            distribute_tensor(param, device_mesh, dist_spec)
+        )
+        module.register_parameter(name, dist_param)
+
+
+def _colwise_parallelize_linear_fn(
+    name: str,
+    module: nn.Module,
+    device_mesh: DeviceMesh,
+) -> None:
+    """
+    This function parallelizes the input :class:``nn.Linear`` module in :class:``ColwiseParallel`` style.
+
+    Args:
+        name (str): name of the input module.
+        module (nn.Module): the :class:``nn.Linear`` object to be parallelized.
+        device_mesh (DeviceMesh): :class:``DeviceMesh`` object which describes the mesh topology
+            of devices for the DTensor.
+
+    Return:
+        None
+    """
+    for name, param in module.named_parameters():
+        dist_param = torch.nn.Parameter(
+            distribute_tensor(param, device_mesh, [Shard(0)])
+        )
+        module.register_parameter(name, dist_param)
+
+
+def _parallelize_multihead_attn(
     module: nn.Module,
     device_mesh: DeviceMesh,
     parallel_style: ParallelStyle = PairwiseParallel(),
     tp_mesh_dim: int = 0,
-) -> None:
+) -> nn.Module:
     """
     This function assumes the input module is a sequence of nn.Linear
     and we parallelize the module based on the given parallel style.
@@ -137,37 +209,90 @@ def _parallelize_mlp(
             Tensor Parallelism on.
 
     Return:
-        None
+        A :class:``nn.Module`` object parallelized.
 
     .. warning::
         We only support ``PairwiseParallel`` right now.
     """
 
-    # Define partition functions needed.
-    def _rowwise_parallelize_fn(name, module, device_mesh):  # pyre-ignore[2, 3]
-        for name, param in module.named_parameters():
-            dist_spec = (
-                [Shard(1)] if name == "weight" else [Replicate()]  # type: ignore[list-item]
-            )
-            dist_param = torch.nn.Parameter(
-                distribute_tensor(param, device_mesh, dist_spec)
-            )
-            module.register_parameter(name, dist_param)
+    if not isinstance(parallel_style, PairwiseParallel):
+        raise NotImplementedError(
+            "Only support PairwiseParallel for Multihead Attention parallelization."
+        )
+
+    if device_mesh.ndim > 1:
+        device_mesh = _create_1d_device_mesh(device_mesh, tp_mesh_dim)
+
+    if isinstance(module, nn.MultiheadAttention):
+        tp_multi_head_attention = TensorParallelMultiheadAttention(
+            module.embed_dim,
+            module.num_heads,
+            device=torch.device(device_mesh.device_type),
+            tp_size=device_mesh.size(tp_mesh_dim),
+            add_bias_kv=module.bias_k is not None,
+        )
+        tp_multi_head_attention.copy(module)
+        module = tp_multi_head_attention
+
+    if isinstance(module, TensorParallelMultiheadAttention):  # shard TPMA
+        for n, m in module.named_children():
+            if n == "qkv":
+                # Col-wise Parallelize the qkv layer.
+                distribute_module(
+                    m,
+                    device_mesh,
+                    _colwise_parallelize_linear_fn,
+                    input_fn=parallel_style._prepare_input,  # type: ignore[arg-type, misc] # pyre-ignore[6]
+                )
+            elif n == "proj":
+                # Row-wise Parallelize the proj layer
+                distribute_module(
+                    m,
+                    device_mesh,
+                    _rowwise_parallelize_linear_fn,
+                    output_fn=parallel_style._prepare_output,  # type: ignore[arg-type, misc] # pyre-ignore[6]
+                )
+    return module
 
-    def _colwise_parallelize_fn(name, module, device_mesh):  # pyre-ignore[2, 3]
-        for name, param in module.named_parameters():
-            dist_param = torch.nn.Parameter(
-                distribute_tensor(param, device_mesh, [Shard(0)])
-            )
-            module.register_parameter(name, dist_param)
 
+def _parallelize_mlp(
+    module: nn.Module,
+    device_mesh: DeviceMesh,
+    parallel_style: ParallelStyle = PairwiseParallel(),
+    tp_mesh_dim: int = 0,
+) -> nn.Module:
+    """
+    This function assumes the input module is a sequence of nn.Linear
+    and we parallelize the module based on the given parallel style.
+    We don't change the FQN of each sub-module and replace each parameter
+    in place.
+
+    Args:
+        module (nn.Module):
+            :class:``nn.Module`` object to be parallelized.
+        device_mesh (DeviceMesh):
+            :class:``DeviceMesh`` object which describes the mesh topology
+            of devices for the DTensor.
+        parallel_style (ParallelStyle):
+            :class:``ParallelStyle`` object which contains how
+            we prepare input/output for Tensor Parallelism.
+        tp_mesh_dim (int):
+            the dimension of ``device_mesh`` where we perform
+            Tensor Parallelism on.
+
+    Return:
+        A :class:``nn.Module`` object parallelized.
+
+    .. warning::
+        We only support ``PairwiseParallel`` right now.
+    """
     if not isinstance(parallel_style, PairwiseParallel):
         raise NotImplementedError(
             "Only support PairwiseParallel for MLP parallelization."
         )
 
-    if not _has_even_num_linears(module):
-        raise RuntimeError("We only support even number of Linear for MLP.")
+    if not _is_mlp_for_pairwise_parallel(module):
+        raise RuntimeError("More than one nn.Linear needed for a MLP.")
 
     if device_mesh.ndim > 1:
         device_mesh = _create_1d_device_mesh(device_mesh, tp_mesh_dim)
@@ -175,13 +300,15 @@ def _colwise_parallelize_fn(name, module, device_mesh):  # pyre-ignore[2, 3]
     linear_submodules = list(
         filter(lambda x: isinstance(x, nn.Linear), module.children())
     )
-    for i, m in enumerate(linear_submodules):
+    mlp_last_even_layer = (len(linear_submodules) // 2) * 2
+    for i in range(mlp_last_even_layer):
+        m = linear_submodules[i]
         if i % 2 == 0:
             # Col-wise Parallelize the linear layer
             distribute_module(
                 m,
                 device_mesh,
-                _colwise_parallelize_fn,
+                _colwise_parallelize_linear_fn,
                 input_fn=parallel_style._prepare_input  # type: ignore[arg-type, misc] # pyre-ignore[6]
                 if i == 0
                 else None,
@@ -191,8 +318,9 @@ def _colwise_parallelize_fn(name, module, device_mesh):  # pyre-ignore[2, 3]
             distribute_module(
                 m,
                 device_mesh,
-                _rowwise_parallelize_fn,
+                _rowwise_parallelize_linear_fn,
                 output_fn=parallel_style._prepare_output  # type: ignore[arg-type, misc] # pyre-ignore[6]
-                if i == (len(linear_submodules) - 1)
+                if i == (mlp_last_even_layer - 1)
                 else None,
             )
+    return module
diff --git a/torch/distributed/_tensor/parallel/style.py b/torch/distributed/_tensor/parallel/style.py
index 5ea96434118ab..60b6a1c88dfd3 100644
--- a/torch/distributed/_tensor/parallel/style.py
+++ b/torch/distributed/_tensor/parallel/style.py
@@ -5,8 +5,8 @@
 from typing import Union, Optional
 from torch.distributed._tensor import DTensor, Shard, Replicate, DeviceMesh
 from torch.distributed._tensor.parallel.utils import (
-    _Prepare_Input_Func_Type,
-    _Prepare_Output_Func_Type,
+    _PrepareInputType,
+    _PrepareOutputType,
     _prepare_input_validate,
     _prepare_output_validate,
 )
@@ -18,8 +18,8 @@ class ParallelStyle(ABC):
     Users can extend this class to build their own parallel style with customized input/output preparations.
     """
 
-    _prepare_input: _Prepare_Input_Func_Type
-    _prepare_output: _Prepare_Output_Func_Type
+    _prepare_input: _PrepareInputType
+    _prepare_output: _PrepareOutputType
 
     @abstractmethod
     def __init__(self, _prepare_input, _prepare_output) -> None:
diff --git a/torch/distributed/_tensor/parallel/utils.py b/torch/distributed/_tensor/parallel/utils.py
index 2680ae41ffbe7..c4cca5c88eda1 100644
--- a/torch/distributed/_tensor/parallel/utils.py
+++ b/torch/distributed/_tensor/parallel/utils.py
@@ -4,18 +4,18 @@
 from torch.distributed._tensor import DeviceMesh, DTensor
 from typing import Callable, Optional, Union
 
-_Prepare_Input_Func_Type = Callable[
+_PrepareInputType = Callable[
     [Union[torch.Tensor, DTensor], Optional[DeviceMesh], Optional[int]], DTensor
 ]
 
-_Prepare_Output_Func_Type = Callable[
+_PrepareOutputType = Callable[
     [DTensor, Optional[DeviceMesh], Optional[int]], Union[torch.Tensor, DTensor]
 ]
 
 
 def _prepare_input_validate(
-    _prepare_input_func: _Prepare_Input_Func_Type,
-) -> _Prepare_Input_Func_Type:
+    _prepare_input_func: _PrepareInputType,
+) -> _PrepareInputType:
     """
     Inject common validation logics for `_prepare_input` funcs via this
     decorator, including verifying that input needs to be either
@@ -66,8 +66,8 @@ def wrapper(*args, **kwargs):  # pyre-ignore[2, 3]
 
 
 def _prepare_output_validate(
-    _prepare_output_func: _Prepare_Output_Func_Type,
-) -> _Prepare_Output_Func_Type:
+    _prepare_output_func: _PrepareOutputType,
+) -> _PrepareOutputType:
     """
     Inject common validation logics for _prepare_output funcs via this
     decorator, including verifying that output needs to be a DTensor

From c2213222c9efdbe81bc426f809e787af6c866184 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 22 Nov 2022 03:38:53 +0000
Subject: [PATCH 1154/1922] [vision hash update] update the pinned vision hash
 (#89471)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89471
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 80fe47b2cee2c..30711c5bbfd9b 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-5b4f79d9ba8cbeeb8d6f0fbba3ba5757b718888b
+4a310f26049371959617921d0eb9b001f4d262c6

From 6daa5856d249edd4631378f867910830d5bb0b8b Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 22 Nov 2022 03:39:15 +0000
Subject: [PATCH 1155/1922] Fix retrying logic for successful unittest tests
 under --rerun-disabled-tests mode (#89454)

When looking into Rockset data for disabled test unittest, for example `testAdd`, I see that it's re-run only 3 times instead of 50+ times as expected under rerun-disabled -test mode

```
[
  {
    "name": "testAdd",
    "classname": "TestLazyReuseIr",
    "filename": "lazy/test_reuse_ir.py",
    "flaky": false,
    "num_green": 3,
    "num_red": 0
  }
]
```

It turns out that I made a mistake mixing `RERUN_DISABLED_TESTS` and `report_only` into `(RERUN_DISABLED_TESTS or report_only) and num_retries_left < MAX_NUM_RETRIES` in https://github.com/pytorch/pytorch/pull/88646.  The retrying logic for successful tests under rerun-disabled-tests mode is never executed because num_retries_left would be equal to MAX_NUM_RETRIES (not smaller) if the very first run successes. Thus, the sample test `testAdd` finishes right away (1 success count)

* `report_only` and `RERUN_DISABLED_TESTS` are 2 different things and shouldn't be mixed together. RERUN_DISABLED_TESTS has the higher priority.
* We also don't want to retry skipped tests under rerun-disabled-tests mode because they are only skipped due to `check_if_enable` check `Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run`

### Testing

* CI https://github.com/pytorch/pytorch/actions/runs/3518228784 generates https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/3518228784/1/artifact/test-reports-test-default-4-4-linux.4xlarge.nvidia.gpu_9627285587.zip in which `testAdd` is correctly called multiple times and `TestLazyReuseIr` is skipped correctly
* Locally

```
# export CI=1
# export PYTORCH_RETRY_TEST_CASES=1
# export PYTORCH_OVERRIDE_FLAKY_SIGNAL=1
# export PYTORCH_TEST_RERUN_DISABLED_TESTS=1
$ python test/run_test.py --verbose -i lazy/test_reuse_ir
Ignoring disabled issues:  []
Selected tests:
 lazy/test_reuse_ir
Prioritized test from test file changes.
reordering tests for PR:
prioritized: []
the rest: ['lazy/test_reuse_ir']

Downloading https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/slow-tests.json to /Users/huydo/Storage/mine/pytorch/test/.pytorch-slow-tests.json
Downloading https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/disabled-tests-condensed.json to /Users/huydo/Storage/mine/pytorch/test/.pytorch-disabled-tests.json
parallel (file granularity) tests:
 lazy/test_reuse_ir
serial (file granularity) tests:

Ignoring disabled issues:  []
Ignoring disabled issues:  []
Running lazy/test_reuse_ir ... [2022-11-21 13:21:07.165877]
Executing ['/Users/huydo/miniconda3/envs/py3.9/bin/python', '-bb', 'lazy/test_reuse_ir.py', '-v', '--import-slow-tests', '--import-disabled-tests', '--rerun-disabled-tests'] ... [2022-11-21 13:21:07.166279]

Expand the folded group to see the log file of lazy/test_reuse_ir
##[group]PRINTING LOG FILE of lazy/test_reuse_ir (/Users/huydo/Storage/mine/pytorch/test/test-reports/lazy-test_reuse_ir_6cf_dxa1)

Running tests...
----------------------------------------------------------------------
Test results will be stored in test-reports/python-unittest/lazy.test_reuse_ir
  testAdd (__main__.TestLazyReuseIr) ... ok (1.215s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 50
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 49
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 48
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 47
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 46
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 45
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 44
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 43
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 42
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 41
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 40
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 39
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 38
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 37
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 36
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 35
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 34
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 33
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 32
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 31
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 30
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 29
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 28
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 27
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 26
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 25
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 24
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 23
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 22
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 21
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 20
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 19
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 18
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 17
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 16
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 15
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 14
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 13
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 12
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 11
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 10
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 9
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 8
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 7
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 6
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 5
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 4
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 3
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 2
ok (0.001s)
  testAdd (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 1
ok (0.001s)
  testAddSub (__main__.TestLazyReuseIr) ...     testAdd succeeded - num_retries_left: 0
skip: Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run (0.001s)
  testAddSubFallback (__main__.TestLazyReuseIr) ... skip: Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run (0.001s)
  testBatchNorm (__main__.TestLazyReuseIr) ... skip: Test is enabled but --rerun-disabled-tests verification mode is set, so only disabled tests are run (0.001s)

----------------------------------------------------------------------
Ran 54 tests in 1.264s

OK (skipped=3)
```

Here is the sample rockset query

```
WITH added_row_number AS (
  SELECT
    *,
    ROW_NUMBER() OVER(PARTITION BY name, classname, filename ORDER BY _event_time DESC) AS row_number
  FROM
    commons.rerun_disabled_tests
)
SELECT
  name,
  classname,
  filename,
  flaky,
  num_green,
  num_red
FROM
  added_row_number
WHERE
  row_number = 1
  AND name = 'testAdd'
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89454
Approved by: https://github.com/clee2000
---
 torch/testing/_internal/common_utils.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index e53887a5fdbb3..2c72296d1e30f 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -2165,13 +2165,22 @@ def _run_with_retry(self, result=None, num_runs_left=0, report_only=True, num_re
                 result.addExpectedFailure(self, err)
             self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
                                  num_red=num_red + 1, num_green=num_green)
-        elif (RERUN_DISABLED_TESTS or report_only) and num_retries_left < MAX_NUM_RETRIES:
-            # Always re-run up to MAX_NUM_RETRIES when running under report only or rerun disabled tests modes
+        elif RERUN_DISABLED_TESTS and num_retries_left <= MAX_NUM_RETRIES and not result.skipped:
+            # Always re-run up to MAX_NUM_RETRIES when running under rerun disabled tests modes if the test successes.
+            # The parameter num_retries_left can be equal to MAX_NUM_RETRIES here because num_runs_left is initially
+            # set to MAX_NUM_RETRIES + 1, i.e. the first run successes
+            #
+            # Also if the result is skipped, this is due to check_if_enable skipping non-disabled tests, thus we
+            # want to ignore them, not retrying and skipping multiple times
             print(f"    {self._testMethodName} succeeded - num_retries_left: {num_retries_left}")
-            if RERUN_DISABLED_TESTS:
-                result.addSuccess(self)
-            else:
-                result.addUnexpectedSuccess(self)
+            result.addSuccess(self)
+            self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
+                                 num_red=num_red, num_green=num_green + 1)
+        elif report_only and num_retries_left < MAX_NUM_RETRIES:
+            # The original logic here is that num_retries_left must be smaller than MAX_NUM_RETRIES indicating
+            # that at least one retry has been spent
+            print(f"    {self._testMethodName} succeeded - num_retries_left: {num_retries_left}")
+            result.addUnexpectedSuccess(self)
             self._run_with_retry(result=result, num_runs_left=num_retries_left, report_only=report_only,
                                  num_red=num_red, num_green=num_green + 1)
         elif not report_only and num_retries_left < MAX_NUM_RETRIES:

From 6b655eaaede4b7aad7c58b8b626461787753dfe5 Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Tue, 22 Nov 2022 03:52:32 +0000
Subject: [PATCH 1156/1922] [Checkpoint][2D][1/N] Add dedup_tensors for
 distributed checkpoint to core distributed (#89399)

This PR moves dedup_tensors and its test to torch.distributed.checkpoint. This is a pre-req for enabling 2D checkpoint.

This removes duplicated shards in list of SavePlan. It is used when saving DT with replicated placement.

Docstring and comments will be added in the following PRs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89399
Approved by: https://github.com/wanchaol
---
 .../checkpoint/test_dedup_tensors.py          | 45 +++++++++++++++++++
 torch/distributed/checkpoint/dedup_tensors.py | 38 ++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 test/distributed/checkpoint/test_dedup_tensors.py
 create mode 100644 torch/distributed/checkpoint/dedup_tensors.py

diff --git a/test/distributed/checkpoint/test_dedup_tensors.py b/test/distributed/checkpoint/test_dedup_tensors.py
new file mode 100644
index 0000000000000..a0d72147efeba
--- /dev/null
+++ b/test/distributed/checkpoint/test_dedup_tensors.py
@@ -0,0 +1,45 @@
+# Owner(s): ["oncall: distributed"]
+
+import dataclasses
+import torch
+from torch.distributed.checkpoint.dedup_tensors import dedup_tensors
+from torch.distributed.checkpoint.planner import SavePlan, WriteItemType
+from torch.distributed.checkpoint.planner_helpers import (
+    _create_write_item_for_tensor,
+)
+from torch.testing._internal.common_utils import run_tests, TestCase
+
+
+# TODO: add comments for create_plan
+def create_plan(second_fqn) -> SavePlan:
+    # the first write item is for a duplicated shard (that covers the whole tensor)
+    write_item_1 = _create_write_item_for_tensor("tensor_0", torch.rand(4))
+    write_item_1 = dataclasses.replace(write_item_1, type=WriteItemType.SHARD)
+
+    # the second write item has different keys
+    write_item_2 = _create_write_item_for_tensor(second_fqn, torch.rand(10))
+
+    return SavePlan([write_item_1, write_item_2])
+
+
+# TODO: add comments for TestDedupTensor
+class TestDedupTensor(TestCase):
+    def test_dedup_shards(self):
+        rank0 = create_plan("r0")
+        rank1 = create_plan("r1")
+
+        dedup_plans = dedup_tensors([rank0, rank1])
+
+        self.assertEqual(2, len(dedup_plans[0].items))
+        self.assertEqual(1, len(dedup_plans[1].items))
+
+        self.assertIn(
+            "tensor_0", (item.index.fqn for item in dedup_plans[0].items)
+        )
+        self.assertIn("r0", (item.index.fqn for item in dedup_plans[0].items))
+
+        self.assertIn("r1", (item.index.fqn for item in dedup_plans[1].items))
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/checkpoint/dedup_tensors.py b/torch/distributed/checkpoint/dedup_tensors.py
new file mode 100644
index 0000000000000..4b60e49d31053
--- /dev/null
+++ b/torch/distributed/checkpoint/dedup_tensors.py
@@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+from typing import Dict, List
+import dataclasses
+
+from torch.distributed.checkpoint.metadata import MetadataIndex
+from torch.distributed.checkpoint.planner import SavePlan
+
+__all__ = ["dedup_tensors"]
+
+# TODO add docstring for dedup_tensors
+def dedup_tensors(all_plans: List[SavePlan]) -> List[SavePlan]:
+    all_plans = list(all_plans)
+    key_to_plan: Dict[MetadataIndex, List[int]] = {}
+    for plan_idx, plan in enumerate(all_plans):
+        for wi in plan.items:
+            key_to_plan.setdefault(wi.index, []).append(plan_idx)
+
+    replicated_items = {k: v for k, v in key_to_plan.items() if len(v) > 1}
+
+    # Remove deplicates by always keeping the first entry.
+    # Compute the per-rank remove set.
+    plan_to_keys: Dict[int, List[MetadataIndex]] = {}
+    for key, plans in replicated_items.items():
+        for plan_idx in plans[1:]:
+            plan_to_keys.setdefault(plan_idx, []).append(key)
+
+    for plan_idx, keys in plan_to_keys.items():
+        key_set = set(keys)
+        # rewrite items and remove elements
+        new_items = [
+            wi for wi in all_plans[plan_idx].items if wi.index not in key_set
+        ]
+        all_plans[plan_idx] = dataclasses.replace(
+            all_plans[plan_idx], items=new_items
+        )
+
+    return all_plans

From 5a491a783a17c912dc39ea69cdbec2a0197d64b6 Mon Sep 17 00:00:00 2001
From: Shunting Zhang <shunting@meta.com>
Date: Tue, 22 Nov 2022 03:57:01 +0000
Subject: [PATCH 1157/1922] dynamo/torchxla integration: trace on xla rather
 than eager (#88904)

In #87741 we added the inference support for dynamo/torchxla integration. Later on in #88449 we attempt to add the training support. That attempt is not smooth because
- we try 2 things together
   1. let dynamo trace the model on xla rather than eager
   2. enable training
- It turns out neither of these two tasks are trivial enough.

Furthermore, item 2 (enable training) depends on item 1 (tracing on xla). We enable training via AOTAutograd. AOTAutograd lift all model parameters/buffers as graph inputs. Without item 1 being done, we would need copy all graph inputs (including model parameters/buffers) from eager device to xla devices. That hurts performance a lot. Have a cache to map eager parameter to XLA parameter does not solve the problem since the update on either will not sync automatically to the other. They will easily go out of sync.

This PR let dynamo trace the model on XLA rather than eager. This is a preparation step to enabling training.

Also, tracing on XLA makes the data movement more efficient. We see 1.5x geomean speedup compared to previous 1.38x.
```
+-------------------------+--------------------+-------------------------+
| Model                   |   XLA (trace once) |   XLA (trace everytime) |
+=========================+====================+=========================+
| resnet18                |            1.38    |                 1.008   |
+-------------------------+--------------------+-------------------------+
| resnet50                |            1.227   |                 0.998   |
+-------------------------+--------------------+-------------------------+
| resnext50_32x4d         |            1.544   |                 1.008   |
+-------------------------+--------------------+-------------------------+
| alexnet                 |            1.085   |                 1.045   |
+-------------------------+--------------------+-------------------------+
| mobilenet_v2            |            2.028   |                 1.013   |
+-------------------------+--------------------+-------------------------+
| mnasnet1_0              |            1.516   |                 0.995   |
+-------------------------+--------------------+-------------------------+
| squeezenet1_1           |            0.868   |                 1.01    |
+-------------------------+--------------------+-------------------------+
| vgg16                   |            1.099   |                 1.008   |
+-------------------------+--------------------+-------------------------+
| BERT_pytorch            |            3.26    |                 1.027   |
+-------------------------+--------------------+-------------------------+
| timm_vision_transformer |            2.182   |                 1.015   |
+-------------------------+--------------------+-------------------------+
| geomean                 |            1.50389 |                 1.01261 |
+-------------------------+--------------------+-------------------------+
```

Example command
```
GPU_NUM_DEVICES=1 python benchmarks/dynamo/torchbench.py --randomize-input --performance --trace-on-xla --only resnet18 --backend=torchxla_trace_once
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88904
Approved by: https://github.com/wconstab, https://github.com/JackCaoG, https://github.com/jansel
---
 .github/ci_commit_pins/xla.txt                |   2 +-
 benchmarks/dynamo/common.py                   | 106 +++++++++---------
 torch/_dynamo/optimizations/backends.py       |  26 +----
 .../optimizations/torchxla_integration.py     |  62 +++++-----
 torch/_dynamo/utils.py                        |  20 +++-
 5 files changed, 103 insertions(+), 113 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 6e29f8ee3c313..f680f0ddccb20 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-dd9b67ff0d6ba4da6a46ca1b22e35c98dbed0d77
+50855d7babfa7970cba18528c659989b91c83824
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 8731d545c456a..3fad203c5d871 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -266,13 +266,35 @@ def print_summary(filename):
             pass
 
 
+def tensor_is_on_xla(tensors):
+    if not isinstance(tensors, (tuple, list)):
+        tensors = [tensors]
+    return any(map(lambda x: x.device.type == "xla", tensors))
+
+
 def timed(model, model_iter_fn, example_inputs, times=1, return_result=False):
     synchronize()
+    if tensor_is_on_xla(example_inputs):
+        import torch_xla.core.xla_model as xm
+
+        xm.mark_step()
+
     reset_rng_state()
     t0 = time.perf_counter()
     # Dont collect outputs to correctly measure timing
     for _ in range(times):
         result = model_iter_fn(model, example_inputs, collect_outputs=False)
+        if tensor_is_on_xla(result):
+            # If the model is on XLA device, it's possible that after running
+            # the model, the computation is accumulated but not performed yet.
+            # Flush all the accumulated computations to make the time measurement
+            # accurate.
+            import torch_xla
+
+            result_list = result
+            if not isinstance(result, (tuple, list)):
+                result_list = [result]
+            torch_xla._XLAC._xla_sync_multi(result_list, [])
         synchronize()
     t1 = time.perf_counter()
     return (t1 - t0, result) if return_result else t1 - t0
@@ -384,6 +406,13 @@ def randomize_input(inputs):
         )
 
 
+def maybe_mark_step(args):
+    if args.trace_on_xla:
+        import torch_xla.core.xla_model as xm
+
+        xm.mark_step()
+
+
 def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
     """
     Measure speedups over eager.
@@ -398,9 +427,6 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
     should_check_result = should_randomize_input = args.randomize_input
     is_correct = True
 
-    baseline_model_iter_fn = get_baseline_model_iter_fn(args, model_iter_fn)
-    baseline_model = get_baseline_model(args, model)
-
     import contextlib
 
     @contextlib.contextmanager
@@ -419,16 +445,25 @@ def maybe_profile(*args, **kwargs):
                 if should_randomize_input
                 else example_inputs
             )
+            # need call mark_step to perform the computation
+            # on randomize_input. Otherwise the first call using the
+            # inputs will incur high penalty then the next one.
+            maybe_mark_step(args)
 
             # interleave the runs to handle frequency scaling and load changes
             timings[rep, 0], expected_output = timed(
-                baseline_model, baseline_model_iter_fn, inputs, return_result=True
+                model, model_iter_fn, inputs, return_result=True
             )
+
+            # call mark_step between the 2 calls to make the comparison fair.
+            maybe_mark_step(args)
+
             timings[rep, 1], actual_output = timed(
                 model, frozen_model_iter_fn, inputs, return_result=True
             )
             if should_check_result:
                 is_correct = is_correct and same(expected_output, actual_output)
+
     if args.export_profiler_trace:
         name = args.profiler_trace_name + "_" + model.name + ".json"
         name = os.path.join(torch._dynamo.config.base_dir, name)
@@ -843,56 +878,6 @@ def maybe_init_distributed(should_init_distributed, port="6789", rank=0, world_s
             torch.distributed.destroy_process_group()
 
 
-def xla_wrapper(model_iter_fn):
-    """
-    Wrap the model_iter_fn to run the model on XLA devices.
-    """
-
-    def wrapper(xla_mod, inputs, collect_outputs=True):
-        import torch_xla.core.xla_model as xm
-
-        # Make sure the model is already moved to the xla device. Moving
-        # the model to xla device can be very expensive since model parameters
-        # need to be copied. We should not do that inside the wrapper since
-        # the wrapper will be calles for each set of inputs.
-        assert (
-            next(xla_mod.parameters()).device.type == "xla"
-        ), "The model should be already on xla device"
-
-        xla_dev = xm.xla_device()
-        eager_dev = inputs[0].device
-        xla_inputs = tree_map(lambda x: x.to(device=xla_dev), inputs)
-        xla_out = model_iter_fn(xla_mod, xla_inputs, collect_outputs)
-        if isinstance(xla_out, torch.Tensor):
-            return xla_out.to(device=eager_dev)
-        elif hasattr(xla_out, "__dict__"):
-            for k in xla_out.__dict__.keys():
-                if xla_out.__dict__[k] is None:
-                    continue
-                xla_out.__dict__[k] = tree_map(
-                    lambda x: x.to(device=eager_dev), xla_out.__dict__[k]
-                )
-            return xla_out
-        else:
-            raise RuntimeError(f"Can not handle type {type(xla_out)}")
-
-    return wrapper
-
-
-def get_baseline_model_iter_fn(args, model_iter_fn):
-    return xla_wrapper(model_iter_fn) if args.use_xla_baseline else model_iter_fn
-
-
-def get_baseline_model(args, model):
-    if args.use_xla_baseline:
-        import torch_xla.core.xla_model as xm
-
-        xla_dev = xm.xla_device()
-        return copy.deepcopy(model).to(device=xla_dev)
-    else:
-        return model
-
-
 class BenchmarkRunner:
     def __init__(self):
         self.model_iter_fn = None
@@ -1544,9 +1529,9 @@ def get_example_inputs(self):
         help="Disables cudagraphs for Inductor",
     )
     parser.add_argument(
-        "--use-xla-baseline",
+        "--trace-on-xla",
         action="store_true",
-        help="Whether to run baseline on XLA devices or eager devices",
+        help="Whether to trace the model on XLA or on eager device",
     )
 
     group_fuser = parser.add_mutually_exclusive_group()
@@ -1995,6 +1980,15 @@ def run(runner, args, original_dir=None):
                     logging.warn(f"{args.only} failed to load")
                     continue  # bad benchmark implementation
 
+            if args.trace_on_xla:
+                import torch_xla.core.xla_model as xm
+
+                xla_dev = xm.xla_device()
+                model = model.to(device=xla_dev)
+                example_inputs = tree_map(
+                    lambda x: x.to(device=xla_dev), example_inputs
+                )
+
             current_name = name
             current_device = device
             current_batch_size = batch_size
diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 55974c69d76e3..e97940b7311f7 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -785,33 +785,9 @@ def ltc_model(*inputs):
     return ltc_model
 
 
-@functools.lru_cache(None)
-def _init_torchxla():
-    global xm
-    try:
-        import torch_xla.core.xla_model as xm
-    except ModuleNotFoundError as e:
-        print(f"torchxla backend fails. Can not import {e.name}")
-        raise
-
-
 @create_backend
 def torchxla_trivial(subgraph):
-    _init_torchxla()
-
-    xla_dev = xm.xla_device()
-
-    xla_model = copy.deepcopy(subgraph.model).to(device=xla_dev)
-
-    def xla_model_wrapper(*inputs):
-        orig_device = inputs[0].device if len(inputs) > 0 else "cpu"
-        xla_inputs = tuple(inp.to(device=xla_dev) for inp in inputs)
-
-        xla_out = xla_model(*xla_inputs)
-        result = tuple(out.to(device=orig_device) for out in xla_out)
-        return result
-
-    return xla_model_wrapper
+    return subgraph.model
 
 
 @create_backend
diff --git a/torch/_dynamo/optimizations/torchxla_integration.py b/torch/_dynamo/optimizations/torchxla_integration.py
index d3cac23e7c4b4..f93e4d385ad82 100644
--- a/torch/_dynamo/optimizations/torchxla_integration.py
+++ b/torch/_dynamo/optimizations/torchxla_integration.py
@@ -1,7 +1,7 @@
-import copy
 import dataclasses
 
 import functools
+import itertools
 import os
 import time
 from typing import Any, Dict, List
@@ -19,7 +19,7 @@ class GraphInputMatcher:
     arguments for the current call.
 
     tensor_id_to_arg_idx maps the tensor id to the parameter index.
-    graph_input_tensor_ids, graph_input_ivalues list the tensor_id and ivalue for each of the
+    graph_input_tensor_ids, graph_input_xla_values list the tensor_id and ivalue for each of the
     TS/XLA graph inputs.
     """
 
@@ -30,17 +30,17 @@ class GraphInputMatcher:
     # most likely const tensors and we can get its content from graph_input_tensors
     # Category 2: those whose id are found in tensor_id_to_arg_idx. We should get
     #  the tensor from method arguments
-    graph_input_ivalues: List[Any]
+    graph_input_xla_values: List[Any]
 
     # get the real graph input tensors
     def __call__(self, args):
         real_input = []
-        for tensor_id, traced_ivalue in zip(
-            self.graph_input_tensor_ids, self.graph_input_ivalues
+        for tensor_id, traced_xla_value in zip(
+            self.graph_input_tensor_ids, self.graph_input_xla_values
         ):
             arg_idx = self.tensor_id_to_arg_idx.get(tensor_id, None)
             if arg_idx is None:
-                inp = traced_ivalue
+                inp = traced_xla_value
             else:
                 inp = args[arg_idx]
             real_input.append(inp)
@@ -73,12 +73,25 @@ def import_torchxla():
     import torch_xla.debug.metrics as metrics
 
 
-def extract_compiled_graph(model: torch.fx.GraphModule, example_inputs):
+def is_xla_tensor(tensor: torch.Tensor) -> bool:
+    return tensor.device.type == "xla"
+
+
+def extract_compiled_graph(xla_model: torch.fx.GraphModule, xla_args):
     import_torchxla()
-    orig_device = example_inputs[0].device
-    xla_dev = xm.xla_device()
-    xla_model = copy.deepcopy(model).to(device=xla_dev)
-    xla_args = [arg.to(device=xla_dev) for arg in example_inputs]
+
+    assert all(
+        map(
+            is_xla_tensor,
+            filter(
+                lambda x: isinstance(x, torch.Tensor),
+                itertools.chain(xla_model.parameters(), xla_args),
+            ),
+        )
+    ), "All tensors should be on xla"
+
+    # This call is critical to make sure xla_args' tensor id show up in graph_input_tensor_ids
+    xm.mark_step()
     args_tensor_ids = [
         torch_xla._XLAC._xla_get_tensor_id(xla_arg) for xla_arg in xla_args
     ]
@@ -88,6 +101,7 @@ def extract_compiled_graph(model: torch.fx.GraphModule, example_inputs):
 
     tensor_id_to_arg_idx = {tensor_id: i for i, tensor_id in enumerate(args_tensor_ids)}
     xla_out = xla_model(*xla_args)
+
     fallback_ops = get_fallback_ops()
     if len(fallback_ops) > 0:
         raise RuntimeError(
@@ -121,28 +135,29 @@ def extract_compiled_graph(model: torch.fx.GraphModule, example_inputs):
 
     (
         graph_input_tensor_ids,
-        graph_input_ivalues,
+        graph_input_xla_values,
     ) = torch_xla._XLAC._get_tensors_xla_device_data_node(args_and_out)
     if debug:
         print(f"graph_input_tensor_ids {graph_input_tensor_ids}")
     assert len(graph_input_tensor_ids) == len(
-        graph_input_ivalues
-    ), f"{len(graph_input_tensor_ids)} v.s. {len(graph_input_ivalues)}"
+        graph_input_xla_values
+    ), f"{len(graph_input_tensor_ids)} v.s. {len(graph_input_xla_values)}"
     graph_input_matcher = GraphInputMatcher(
-        tensor_id_to_arg_idx, graph_input_tensor_ids, graph_input_ivalues
+        tensor_id_to_arg_idx, graph_input_tensor_ids, graph_input_xla_values
     )
 
     # compiles+runs graph rooted at tensors in 'args_and_out'
     torch_xla._XLAC._xla_sync_multi(args_and_out, [])
+    torch_xla._XLAC._clear_pending_irs(str(xm.xla_device()))
 
     # input all cpu tensors
     def optimized_mod(*args):
+        torch_xla._XLAC._xla_sync_multi(args, [])
         enter_ts = time.time()
         if len(args_and_out) == 0:
             return ()
 
         assert len(args) > 0  # can not handle no args case for now
-        eager_device = args[0].device
         graph_input = graph_input_matcher(args)
         start_ts = time.time()
         res = torch_xla._XLAC._run_cached_graph(graph_hash, graph_input)
@@ -151,9 +166,7 @@ def optimized_mod(*args):
                 f"torchxla reuse compiled graph run_cached_graph takes {time.time() - start_ts} seconds"
             )
 
-        prepare_output_ts = time.time()
-
-        copy_args_ts = time.time()
+        args_inplace_update_ts = time.time()
         assert len(res) == len(args_and_out)
         ncopy = 0
 
@@ -161,17 +174,16 @@ def optimized_mod(*args):
             args[arg_index].copy_(res[res_index])
 
         if debug:
-            print(f"Copy {ncopy} args takes {time.time() - copy_args_ts} seconds")
+            print(
+                f"Copy {ncopy} args takes {time.time() - args_inplace_update_ts} seconds"
+            )
 
-        # need to convert xla tensor back to eager tensor
-        copy_res_ts = time.time()
         # First few elements might be xla_args that needs to be in place updated
-        result = [x.to(device=eager_device) for x in res[len(xla_args_need_update) :]]
+        result = res[len(xla_args_need_update) :]
         if debug:
-            print(f"Copy results takes {time.time() - copy_res_ts} seconds")
-            print(f"prepare output takes {time.time() - prepare_output_ts} seconds")
             print(f"optimized_mod takes {time.time() - enter_ts} seconds overall")
 
+        xm.mark_step()
         return result
 
     return optimized_mod
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index cbf5a0b46148d..481794707efdc 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -399,7 +399,20 @@ def clone_tensor(x):
 
 def clone_input(x):
     """copy while preserving strides"""
+
+    def torch_clone(x):
+        y = torch.clone(x)
+        if x.is_leaf:
+            y.requires_grad_(x.requires_grad)
+        if x.is_leaf and x.grad is not None:
+            y.grad = clone_input(x.grad)
+        return y
+
     with torch.no_grad():
+        if x.device.type == "xla":
+            # Access data_ptr() for a xla tensor will cause crash
+            return torch_clone(x)
+
         needed_size = sum(
             (shape - 1) * stride for shape, stride in zip(x.size(), x.stride())
         )
@@ -421,12 +434,7 @@ def clone_input(x):
             # RuntimeError: unsupported operation: more than one element of the written-to
             # tensor refers to a single memory location. Please clone() the tensor before
             # performing the operation.
-            y = torch.clone(x)
-            if x.is_leaf:
-                y.requires_grad_(x.requires_grad)
-            if x.is_leaf and x.grad is not None:
-                y.grad = clone_input(x.grad)
-            return y
+            return torch_clone(x)
         return result
 
 
From 6f9df74531d3f84600a33950fab7792f4dd646cb Mon Sep 17 00:00:00 2001
From: "Wang, Eikan" <eikan.wang@intel.com>
Date: Mon, 21 Nov 2022 03:31:51 +0000
Subject: [PATCH 1158/1922] Support masked_fill to address the GPT2 performance
 issue (#89274)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89274
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py  | 23 +++++++++++++
 torch/_inductor/codegen/cpp.py       | 51 ++++++++++++++++++++++++----
 torch/_inductor/codegen/cpp_prefix.h | 12 +++++++
 3 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 0aaf74886c7cf..ed68c28442367 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4833,6 +4833,29 @@ def test_auto_simd(self):
                     isa = codecache.pick_vec_isa()
                     self.assertTrue(isa == vec_avx2)
 
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_masked_fill_softmax(self):
+            def fn(value, mask):
+                mask = mask.to(torch.bool)
+                x = torch.masked_fill(value, mask, -33.0)
+                return torch.softmax(x, -1)
+
+            value = torch.randn((2, 17))
+            mask = torch.randint(0, 1, size=(2, 17), dtype=torch.uint8)
+            with patch.object(config.cpp, "simdlen", None):
+                torch._dynamo.reset()
+                metrics.reset()
+                opt_fn = torch._dynamo.optimize("inductor")(fn)
+                opt_fn(value, mask)
+
+                real_out = fn(value, mask)
+                compiled_out = opt_fn(value, mask)
+                assert same(real_out, compiled_out, equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count >= 1
+
         @unittest.skipIf(
             not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index f82591ddff36f..3568cfdc08ef3 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -311,6 +311,10 @@ def maximum(a, b):
     def square(a):
         return f"{a}.pow(2)"
 
+    @staticmethod
+    def where(a, b, c):
+        return f"decltype({b})::blendv({c}, {b}, {a})"
+
     @staticmethod
     def sign(x):
         code = BracesBuffer()
@@ -330,6 +334,11 @@ def sign(x):
         V.kernel.compute.splice(code)
         return result
 
+    @staticmethod
+    def to_dtype(x, dtype):
+        assert dtype in [torch.bool], f"{__name__} does not support {dtype}"
+        return f"({x})"
+
 
 class CppOverrides(OpOverrides):
     """Map element-wise ops to C++"""
@@ -745,7 +754,16 @@ def load(self, name: str, index: sympy.Expr):
         if expanded_index == new_index:
             line = f"at::vec::Vectorized<float>({var}[{cexpr(index)}])"
         else:
-            line = f"at::vec::Vectorized<float>::loadu({var} + {cexpr(new_index)})"
+            if V.graph.get_dtype(name) in [torch.bool, torch.uint8]:
+                g_tmp_buf = f"g_tmp_buffer_{var}"
+                nelements = codecache.pick_vec_isa().nelements()
+                self.loads.writeline(f"float {g_tmp_buf}[{nelements}] = {{0}};")
+                self.loads.writeline(
+                    f"flag_to_float({var} + {cexpr(new_index)}, {g_tmp_buf}, {nelements});"
+                )
+                line = f"at::vec::Vectorized<float>::loadu({g_tmp_buf})"
+            else:
+                line = f"at::vec::Vectorized<float>::loadu({var} + {cexpr(new_index)})"
 
         return self.cse.generate(self.loads, line)
 
@@ -842,9 +860,6 @@ def is_legal_data_access(self, var: sympy.Symbol, index: sympy.Expr):
         return self.is_var_irrevelant(var, index) or self.is_single_step_var(var, index)
 
     def could_vec(self, name: str, index: sympy.Expr):
-        if V.graph.get_dtype(name) is not torch.float:
-            return False
-
         assert self.itervars is not None
         # Not a loop
         if len(self.itervars) == 0:
@@ -854,12 +869,24 @@ def could_vec(self, name: str, index: sympy.Expr):
         return self.is_legal_data_access(most_inner_var, index)
 
     def load(self, name: str, index: sympy.Expr):
-        index = self.rename_indexing(index)
+        if not V.graph.get_dtype(name) in [
+            torch.float,
+            torch.float32,
+            torch.bool,
+            torch.uint8,
+        ]:
+            self.simd_vec = False
+            return self.simd_vec
 
+        index = self.rename_indexing(index)
         self.simd_vec = self.simd_vec and self.could_vec(name, index)
         return self.simd_vec
 
     def store(self, name, index, value, mode=None):
+        if not V.graph.get_dtype(name) in [torch.float, torch.float32]:
+            self.simd_vec = False
+            return self.simd_vec
+
         assert "buf" in name
         index = self.rename_indexing(index)
 
@@ -932,15 +959,24 @@ def constant(val, dtype):
             @staticmethod
             def index_expr(expr, dtype):
                 self.simd_vec = False
-                return self.cse.newvar()
+                tmp_var = self.cse.newvar()
+                return tmp_var
 
             @staticmethod
             def indirect_indexing(index_var):
+                self.simd_vec = False
                 return sympy.Symbol(str(index_var))
 
             @staticmethod
             def masked(mask, body, other):
-                return V.kernel.cse.newvar()
+                tmp_var = self.cse.newvar()
+                return tmp_var
+
+            @staticmethod
+            def to_dtype(x, dtype):
+                if dtype != torch.bool:
+                    self.simd_vec = False
+                return x
 
         self.exit_stack.enter_context(V.set_ops_handler(VecCheckerProxy()))
         self.exit_stack.enter_context(V.set_kernel_handler(self))
@@ -1088,6 +1124,7 @@ def codegen_loops(self, code, worksharing):
         if reduction_par_depth > 0 and reduction_par_depth != len(
             loops_nest_reduce.loops
         ):
+            metrics.generated_cpp_vec_kernel_count -= 1
             return self.simd_omp_kernel.codegen_loops(code, worksharing)
 
         with contextlib.ExitStack() as stack:
diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h
index 1905aefcda5c0..c1c9c3bae112d 100644
--- a/torch/_inductor/codegen/cpp_prefix.h
+++ b/torch/_inductor/codegen/cpp_prefix.h
@@ -57,3 +57,15 @@ template <typename T> void atomic_add(volatile T *addr, T offset) {
   } while (!atomic_addr->compare_exchange_weak(expected, desired,
                                                std::memory_order_relaxed));
 }
+
+// This function is used to convert bool or uint8 to float mask for
+// vectorization. The caller needs to make sure the src represents TRUE/FALSE
+// correctly.
+template <typename T>
+void flag_to_float(const T* src, float* dst, int64_t n) {
+#pragma unroll
+  for (int64_t i = 0; i < n; i++) {
+    uint32_t* dst_u32 = (uint32_t*)dst;
+    dst_u32[i] = *(src + i) ? 0xFFFFFFFF : 0;
+  }
+}

From a130e907974f470ba7ba7f284d49a7d46d1f07c2 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 22 Nov 2022 05:48:43 +0000
Subject: [PATCH 1159/1922] Revert submodule updates introduced by #89157
 (#89449)

Reverts updates that were introduced by https://github.com/pytorch/pytorch/pull/89157
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89449
Approved by: https://github.com/kit1980, https://github.com/huydhn, https://github.com/clee2000
---
 third_party/gloo     | 2 +-
 third_party/pybind11 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/gloo b/third_party/gloo
index 5b14351326313..4a5e339b76426 160000
--- a/third_party/gloo
+++ b/third_party/gloo
@@ -1 +1 @@
-Subproject commit 5b143513263133af2b95547e97c07cebeb72bf72
+Subproject commit 4a5e339b764261d20fc409071dc7a8b8989aa195
diff --git a/third_party/pybind11 b/third_party/pybind11
index aa304c9c7d725..80dc998efced8 160000
--- a/third_party/pybind11
+++ b/third_party/pybind11
@@ -1 +1 @@
-Subproject commit aa304c9c7d725ffb9d10af08a3b34cb372307020
+Subproject commit 80dc998efced8ceb2be59756668a7e90e8bef917

From 7316be96447c0d3ab47fe3693f1ed9deed51afb1 Mon Sep 17 00:00:00 2001
From: Mike Iovine <mikeiovine@meta.com>
Date: Tue, 22 Nov 2022 06:26:10 +0000
Subject: [PATCH 1160/1922] Back out "[static-runtime] change the backend for
 permute_copy" (#89463)

Summary: This permute copy change seems to be causing huge regressions on machines without AVX512. Revert to mitigate. This shouldn't be problematic since the improvement from changing it was super small anyways.

Differential Revision: D41450088

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89463
Approved by: https://github.com/hlu1
---
 .../static_runtime/test_static_runtime.cc     |  5 --
 torch/csrc/jit/runtime/static/ops.cpp         | 53 -------------------
 2 files changed, 58 deletions(-)

diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc
index dc4ce01df72cf..ef3bc75f921b2 100644
--- a/benchmarks/static_runtime/test_static_runtime.cc
+++ b/benchmarks/static_runtime/test_static_runtime.cc
@@ -2164,12 +2164,7 @@ TEST(StaticRuntime, Permute) {
   c10::List<int64_t> dims_b{0, 2, 1};
   std::vector<IValue> args_b{b, dims_b};
 
-  auto c = at::randn({3, 3, 3});
-  c10::List<int64_t> dims_c{0, -1, 1};
-  std::vector<IValue> args_c{c, dims_c};
-
   testStaticRuntime(permute_script, args_a);
-  testStaticRuntime(permute_script, args_c);
   testStaticRuntime(permute_script, args_a, args_b);
 
   permute_script = R"JIT(
diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp
index 834a71b081614..e2a154ad069e9 100644
--- a/torch/csrc/jit/runtime/static/ops.cpp
+++ b/torch/csrc/jit/runtime/static/ops.cpp
@@ -1675,36 +1675,6 @@ REGISTER_OPERATOR_FUNCTOR(
       };
     });
 
-namespace {
-
-std::vector<std::int64_t> permute_output_sizes(
-    c10::IntArrayRef self_sizes,
-    c10::IntArrayRef dims) {
-  const auto nDim = dims.size();
-  TORCH_CHECK(
-      self_sizes.size() == nDim,
-      "permute input and output tensors must have the same rank, got input rank=",
-      self_sizes.size(),
-      "; output rank=",
-      nDim);
-  std::vector<bool> dims_seen(nDim, false);
-  std::vector<std::int64_t> output_sizes;
-  output_sizes.reserve(nDim);
-  for (size_t i = 0; i < nDim; ++i) {
-    auto dim = c10::maybe_wrap_dim(dims[i], nDim);
-    TORCH_CHECK(
-        !dims_seen[dim],
-        "permute dims must be unique, found duplicate dim=",
-        dim);
-
-    output_sizes.push_back(self_sizes[dim]);
-    dims_seen[dim] = true;
-  }
-  return output_sizes;
-}
-
-} // namespace
-
 // Out variants for view ops are registered to a separate registry because
 // their outputs (views) can't participate in memory reuse.
 REGISTER_OPERATOR_FUNCTOR(
@@ -1729,29 +1699,6 @@ REGISTER_OPERATOR_FUNCTOR(
       };
     });
 
-REGISTER_OPERATOR_FUNCTOR(
-    static_runtime::permute_copy,
-    sr_permute_copy,
-    [](Node* n) -> SROperator {
-      if (!n->matches(torch::schema(
-              "static_runtime::permute_copy(Tensor self, int[] dims) -> Tensor"))) {
-        LogAndDumpSchema(n);
-        return nullptr;
-      }
-      return [](ProcessedNode* p_node) {
-        const auto& self = p_node->Input(0).toTensor();
-        const auto dims = p_node->Input(1).toDimVector();
-
-        if (p_node->Output(0).isNone()) {
-          p_node->Output(0) = create_empty_from(self);
-        }
-        auto& output = p_node->Output(0).toTensor();
-        at::native::resize_(
-            output, permute_output_sizes(self.sizes(), dims), c10::nullopt);
-        at::native::permute_copy_out(self, dims, output);
-      };
-    });
-
 REGISTER_OPERATOR_FUNCTOR(
     static_runtime::flatten_copy,
     aten_flatten,

From bb22c57d7846bf919d55b723742b870377e19987 Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Tue, 22 Nov 2022 07:49:06 +0000
Subject: [PATCH 1161/1922] [Checkpoint][2D][2/N] Add traverse for distributed
 checkpoint to core distributed (#89398)

This PR moves traverse and its test to torch.distributed.checkpoint. This is a pre-req for enabling 2D checkpoint.

This is used when flatten nested dict and flatten sharded tensors.

Docstring and comments will be added in the following PRs.

Test:
```
python3 test/distributed/_tensor/parallel/test_2d_parallel.py
```
and CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89398
Approved by: https://github.com/wanchaol
---
 test/distributed/checkpoint/test_traverse.py | 176 +++++++++++++++++++
 torch/distributed/checkpoint/traverse.py     | 170 ++++++++++++++++++
 2 files changed, 346 insertions(+)
 create mode 100644 test/distributed/checkpoint/test_traverse.py
 create mode 100644 torch/distributed/checkpoint/traverse.py

diff --git a/test/distributed/checkpoint/test_traverse.py b/test/distributed/checkpoint/test_traverse.py
new file mode 100644
index 0000000000000..a73cb89befba5
--- /dev/null
+++ b/test/distributed/checkpoint/test_traverse.py
@@ -0,0 +1,176 @@
+# Owner(s): ["oncall: distributed"]
+
+from collections import OrderedDict
+import torch
+
+import torch.distributed.checkpoint.traverse as traverse
+from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
+from torch.testing._internal.common_utils import run_tests, TestCase
+
+
+# TODO: add comments for TestTraverse
+class TestTraverse(TestCase):
+    def test_traverse_shallow(self) -> None:
+        state_dict = {
+            "key0": 1,
+            "key1": [1, 2],
+            "key2": {1: 2, 2: 3},
+            "key3": torch.tensor([1]),
+        }
+
+        data = {}
+
+        def collect_data(path, value):
+            nonlocal data
+            data[path] = value
+
+        traverse.traverse_state_dict(state_dict, collect_data)
+
+        self.assertIn(("key0",), data)
+        self.assertEqual(data[("key0",)], 1)
+
+        self.assertIn(("key1",), data)
+        self.assertEqual(data[("key1",)], [1, 2])
+
+        self.assertIn(("key2",), data)
+        self.assertEqual(data[("key2",)], {1: 2, 2: 3})
+
+        self.assertIn(("key3",), data)
+        self.assertEqual(data[("key3",)], torch.tensor([1]))
+
+    def test_traverse_nested_list(self) -> None:
+        state_dict = {
+            "key1": [
+                torch.tensor([1]),
+                [33, torch.tensor([2]), [44, 55]],
+                [66, 77],
+            ],
+        }
+
+        data = {}
+
+        def collect_data(path, value):
+            nonlocal data
+            data[path] = value
+
+        traverse.traverse_state_dict(state_dict, collect_data)
+
+        self.assertNotIn(("key1"), data)
+
+        self.assertIn(("key1", 0), data)
+        self.assertEqual(data[("key1", 0)], torch.tensor([1]))
+
+        self.assertIn(("key1", 1, 0), data)
+        self.assertEqual(data[("key1", 1, 0)], 33)
+
+        self.assertIn(("key1", 1, 1), data)
+        self.assertEqual(data[("key1", 1, 1)], torch.tensor([2]))
+
+        self.assertIn(("key1", 1, 2), data)
+        self.assertEqual(data[("key1", 1, 2)], [44, 55])
+        self.assertNotIn(("key1", 1, 2, 0), data)
+
+        self.assertIn(("key1", 2), data)
+        self.assertEqual(data[("key1", 2)], [66, 77])
+
+    def test_traverse_nested_dict(self) -> None:
+        state_dict = {
+            "key0": {"key1": 99, "key2": torch.tensor([1])},
+        }
+
+        data = {}
+
+        def collect_data(path, value):
+            nonlocal data
+            data[path] = value
+
+        traverse.traverse_state_dict(state_dict, collect_data)
+
+        self.assertNotIn(("key0",), data)
+
+        self.assertIn(("key0", "key1"), data)
+        self.assertEqual(data[("key0", "key1")], 99)
+
+        self.assertIn(("key0", "key2"), data)
+        self.assertEqual(data[("key0", "key2")], torch.tensor([1]))
+
+    def test_traverse_doesnt_ignore_intermediate_collections(self) -> None:
+        state_dict: STATE_DICT_TYPE = {
+            "key0": [{"key1": {"key2": torch.tensor([1])}}]
+        }
+
+        data = {}
+
+        def collect_data(path, value):
+            nonlocal data
+            data[path] = value
+
+        traverse.traverse_state_dict(state_dict, collect_data)
+
+        self.assertIn(("key0", 0, "key1", "key2"), data)
+        self.assertEqual(
+            data[("key0", 0, "key1", "key2")],
+            torch.tensor([1]),
+        )
+
+    def test_traverse_with_ordered_dict(self) -> None:
+        state_dict = OrderedDict(
+            {
+                "key0": [
+                    99,
+                    torch.tensor([3]),
+                ]
+            }
+        )
+
+        data = {}
+
+        def collect_data(path, value):
+            nonlocal data
+            data[path] = value
+
+        traverse.traverse_state_dict(state_dict, collect_data)
+
+        self.assertIn(("key0", 0), data)
+        self.assertEqual(data[("key0", 0)], 99)
+
+        self.assertIn(("key0", 1), data)
+        self.assertEqual(data[("key0", 1)], torch.tensor([3]))
+
+    def test_set_element(self) -> None:
+        state_dict: STATE_DICT_TYPE = {}
+
+        traverse.set_element(state_dict, ("k",), 10)
+        self.assertEqual(state_dict["k"], 10)
+
+        traverse.set_element(state_dict, ("k1", 2), 1)
+        self.assertEqual(state_dict["k1"], [None, None, 1])
+
+        traverse.set_element(state_dict, ("k1", 1), 99)
+        self.assertEqual(state_dict["k1"], [None, 99, 1])
+
+        traverse.set_element(state_dict, ("k1", 3), 88)
+        self.assertEqual(state_dict["k1"], [None, 99, 1, 88])
+
+        traverse.set_element(state_dict, ("k2", "k3"), 3)
+        self.assertEqual(state_dict["k2"], {"k3": 3})
+
+        traverse.set_element(state_dict, ("k2", "k4", 0, 0), 99)
+        self.assertEqual(state_dict["k2"]["k4"][0], [99])
+
+    def test_get_element(self) -> None:
+        state_dict = {"a": [0, 1], "b": [2, {"c": "d"}]}
+        self.assertEqual(traverse.get_element(state_dict, ("a",)), [0, 1])
+        self.assertEqual(traverse.get_element(state_dict, ("b", 0)), 2)
+        self.assertEqual(traverse.get_element(state_dict, ("b", 1, "c")), "d")
+
+        self.assertIsNone(traverse.get_element(state_dict, ("c",)))
+        self.assertIsNone(traverse.get_element(state_dict, ("a", 33)))
+        self.assertIsNone(traverse.get_element(state_dict, ("b", 88)))
+        self.assertIsNone(traverse.get_element(state_dict, ("b", 0, 2)))
+        self.assertIsNone(traverse.get_element(state_dict, ("b", 1, 2)))
+        self.assertIsNone(traverse.get_element(state_dict, ("b", 1, "d")))
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/checkpoint/traverse.py b/torch/distributed/checkpoint/traverse.py
new file mode 100644
index 0000000000000..75dc42453348f
--- /dev/null
+++ b/torch/distributed/checkpoint/traverse.py
@@ -0,0 +1,170 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import torch
+
+from typing import (
+    Callable,
+    Collection,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+)
+from torch.distributed.checkpoint.metadata import (
+    STATE_DICT_TYPE,
+)
+from torch.distributed._shard.sharded_tensor.api import ShardedTensor
+from torch.distributed._tensor import DTensor
+
+PATH_ITEM = Union[str, int]
+OBJ_PATH = Tuple[PATH_ITEM, ...]
+T = TypeVar("T")
+
+STATE_DICT_ITEM = object
+CONTAINER_TYPE = MutableMapping[PATH_ITEM, STATE_DICT_ITEM]
+
+__all__ = ["traverse_state_dict", "set_element", "get_element", "print_tensor"]
+
+
+def _keep_visiting_tensors(value: STATE_DICT_ITEM) -> bool:
+    return isinstance(value, torch.Tensor)
+
+
+# TODO: update docstring for traverse.py
+def traverse_state_dict(
+    state_dict: STATE_DICT_TYPE,
+    visitor: Callable[[OBJ_PATH, STATE_DICT_ITEM], None],
+    keep_traversing: Callable[[STATE_DICT_ITEM], bool] = _keep_visiting_tensors,
+) -> None:
+    """
+    Invoke ``visitor`` for each value recursively in ``state_dict``.
+    Traversal is short-circuited when if finds a collection for which ``keep_visiting_tensors`` evaluates
+    to false for all elements.
+    By default, all collections with at least one ``torch.Tensor`` element are traversed.
+    Visitor takes a path argument that is a tuple of the keys used to reach it.
+    """
+    # a value is terminal if it has no other containers values inside it
+    def _is_terminal(value: STATE_DICT_ITEM) -> bool:
+        values: Collection[STATE_DICT_ITEM]
+        if isinstance(value, Mapping):
+            values = value.values()
+        elif isinstance(value, list):
+            values = value
+        else:
+            return True
+
+        for entry in values:
+            if isinstance(entry, (Mapping, list)) and not _is_terminal(entry):
+                return False
+            if keep_traversing is not None and keep_traversing(entry):
+                return False
+        return True
+
+    def _traverse_obj(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
+        if _is_terminal(value):
+            visitor(path, value)
+        elif isinstance(value, Mapping):
+            for k, v in value.items():
+                _traverse_obj(path + (str(k),), v)
+        elif isinstance(value, list):
+            for i, v in enumerate(value):
+                _traverse_obj(path + (i,), v)
+
+    for key, value in state_dict.items():
+        _traverse_obj((str(key),), value)
+
+
+def set_element(
+    root_dict: STATE_DICT_TYPE, path: OBJ_PATH, value: STATE_DICT_ITEM
+) -> None:
+    """
+    Set ``value`` in ``root_dict`` along the ``path`` object path.
+    """
+    cur_container = cast(CONTAINER_TYPE, root_dict)
+
+    def extend_list(lst: List[STATE_DICT_ITEM], idx: int) -> None:
+        while len(lst) <= idx:
+            lst.append(None)
+
+    for i in range(1, len(path)):
+        prev_key = path[i - 1]
+        key = path[i]
+        def_val = cast(STATE_DICT_ITEM, {} if type(key) == str else [])
+
+        if isinstance(cur_container, Mapping):
+            cur_container = cast(
+                CONTAINER_TYPE, cur_container.setdefault(prev_key, def_val)
+            )
+        else:
+            extend_list(cur_container, prev_key)
+            if cur_container[prev_key] is None:
+                cur_container[prev_key] = def_val
+            cur_container = cur_container[prev_key]
+
+    key = path[-1]
+    if type(key) == int:
+        extend_list(cast(List[STATE_DICT_ITEM], cur_container), key)
+
+    cur_container[key] = value
+
+
+def get_element(
+    root_dict: STATE_DICT_TYPE,
+    path: OBJ_PATH,
+    default_value: Optional[T] = None,
+) -> Optional[T]:
+    """
+    Retrieve the value at ``path``from ``root_dict``, returning ``default_value`` if not found.
+    """
+    cur_value = cast(CONTAINER_TYPE, root_dict)
+    for part in path:
+        if type(part) is int:
+            if not isinstance(cur_value, list) or len(cur_value) < part:
+                return default_value
+        elif not isinstance(cur_value, Mapping) or part not in cur_value:
+            return default_value
+
+        cur_value = cast(CONTAINER_TYPE, cur_value[part])
+    return cast(Optional[T], cur_value)
+
+
+def _print_nested(
+    value: STATE_DICT_ITEM,
+    prefix: str = "",
+    print_fun: Callable[[str], None] = print,
+) -> None:
+    if type(value) is ShardedTensor:
+        print_fun(f"{prefix} ShardedTensor size: {value.size()}")
+        for shard in value.local_shards():
+            _print_nested(
+                shard.tensor,
+                f"{shard.metadata.shard_offsets} ",
+                print_fun=print_fun,
+            )
+    elif type(value) is (DTensor):
+        print_fun(f"{prefix} DistributedTensor size: {value.size()}")
+        # TODO: add local offset for _local_tensor in print_nested.
+        _print_nested(
+            value._local_tensor,
+            print_fun=print_fun,
+        )
+    elif isinstance(value, torch.Tensor):
+        print_fun(f"{prefix} Tensor size: {value.size()}")
+    else:
+        print_fun(f"{prefix} Type: {type(value)}")
+
+
+def print_tensor(
+    path: OBJ_PATH,
+    value: STATE_DICT_ITEM,
+    print_fun: Callable[[str], None] = print,
+) -> None:
+    """
+    Callback that can be used with travese_state_dict to print its content.
+    By default the content is printed using the builtin ``print`` but this can
+    be change by passing a different ``print_fun` callable.
+    """
+    _print_nested(value, prefix=str(path), print_fun=print_fun)

From 13b656aa8858722b54f74db1d6f713c11a0c28f4 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Sat, 19 Nov 2022 01:00:03 +0000
Subject: [PATCH 1162/1922] Reland Dispatch torch.norm to linalg.vector_norm
 and linalg.matrix_norm (#81761) (#84624)

Reland https://github.com/pytorch/pytorch/pull/81761

Differential Revision: [D39332292](https://our.internmc.facebook.com/intern/diff/D39332292)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84624
Approved by: https://github.com/kit1980
---
 aten/src/ATen/autocast_mode.cpp               |  5 +-
 .../functorch/BatchRulesDecompositions.cpp    |  1 -
 .../ATen/functorch/BatchRulesReduceOps.cpp    |  2 +-
 aten/src/ATen/native/LinearAlgebra.cpp        |  4 +-
 test/functorch/test_vmap.py                   |  5 ++
 test/onnx/test_operators.py                   |  6 +++
 test/onnx/test_pytorch_onnx_onnxruntime.py    |  8 +++
 test/onnx/test_utility_funs.py                |  2 -
 test/test_decomp.py                           |  6 +--
 test/test_linalg.py                           | 15 +++---
 test/test_reductions.py                       |  8 ++-
 torch/functional.py                           | 49 +++++++++++++++---
 .../_internal/common_methods_invocations.py   | 51 +++++++------------
 13 files changed, 100 insertions(+), 62 deletions(-)

diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp
index ca75c38258ff9..ee8b4b30b1520 100644
--- a/aten/src/ATen/autocast_mode.cpp
+++ b/aten/src/ATen/autocast_mode.cpp
@@ -450,6 +450,9 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
   KERNEL2(cumprod, dimname, fp32_set_opt_dtype)
   KERNEL(cumsum, fp32_set_opt_dtype)
   KERNEL2(cumsum, dimname, fp32_set_opt_dtype)
+  KERNEL(linalg_vector_norm, fp32_set_opt_dtype)
+  KERNEL(linalg_matrix_norm, fp32_set_opt_dtype)
+  KERNEL2(linalg_matrix_norm, str_ord, fp32_set_opt_dtype)
   // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even
   // when autocasting.
   // KERNEL2(norm, ScalarOpt_dtype, fp32_set_opt_dtype)
@@ -576,8 +579,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
   KERNEL_CPU(fft_irfftn, fp32)
   KERNEL_CPU(fft_hfft, fp32)
   KERNEL_CPU(fft_ihfft, fp32)
-  KERNEL_CPU(linalg_matrix_norm, fp32)
-  KERNEL_CPU2(linalg_matrix_norm, str_ord, fp32)
   KERNEL_CPU(linalg_cond, fp32)
   KERNEL_CPU2(linalg_cond, p_str, fp32)
   KERNEL_CPU(linalg_matrix_rank, fp32)
diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index 05ee8d07a410e..d5a38e9804dd4 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -253,7 +253,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE2(conv2d, padding);
   OP_DECOMPOSE2(conv3d, padding);
   OP_DECOMPOSE(_convolution_mode);
-  OP_DECOMPOSE(frobenius_norm);
   OP_DECOMPOSE(type_as);
   OP_DECOMPOSE(linalg_diagonal);
   OP_DECOMPOSE(diagonal_copy);
diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
index 8654b78455014..9126507e73be0 100644
--- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
@@ -168,7 +168,7 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack
 #define REDUCTION_BOXED_ARGS(op, dim_pos) \
   m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_reduction_batch_rule<dim_pos>>());
 
-// Skipping frobenius/nuclear/all/any since they don't have opinfo tests right now :P
+// Skipping all/any since they don't have opinfo tests right now :P
 
 Tensor dist_decomp(const Tensor& self, const Tensor& other, const Scalar& p) {
   return at::norm((self - other), p);
diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp
index c21bc4b475312..7e47170cd72ee 100644
--- a/aten/src/ATen/native/LinearAlgebra.cpp
+++ b/aten/src/ATen/native/LinearAlgebra.cpp
@@ -2770,7 +2770,7 @@ Tensor& linalg_norm_out(const Tensor& X, c10::string_view ord, OptionalIntArrayR
 
 ////////////////////////////////////////////////////////////////////////////////
 //                              Frobenius Norm                                //
-//             Just used in linalg.norm. It should not be removed.            //
+//             Just used in torch..norm. It should not be removed.            //
 ////////////////////////////////////////////////////////////////////////////////
 
 Tensor frobenius_norm(const Tensor& self) {
@@ -2816,7 +2816,7 @@ Tensor &frobenius_norm_out(const Tensor& self,
 
 ////////////////////////////////////////////////////////////////////////////////
 //                                Nuclear Norm                                //
-//             Just used in linalg.norm. It should not be removed.            //
+//              Just used in torch.norm. It should not be removed.            //
 ////////////////////////////////////////////////////////////////////////////////
 
 Tensor nuclear_norm(const Tensor& self, bool keepdim) {
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 4c2c680ca6371..9b3293a7db752 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3878,6 +3878,11 @@ def f(e_):
         skip('linalg.multi_dot'),  # accepts list of tensor inputs, has its own special test
         xfail('linalg.vander'),
         xfail('linalg.vecdot'),
+        # throws in vmap on CUDA
+        # IndexError: Dimension out of range (expected to be in range of [-1, 0], but got -2)
+        # https://github.com/pytorch/pytorch/runs/8110653462?check_suite_focus=true
+        # but it passes locally
+        skip('linalg.matrix_norm', ''),
         skip('linalg.ldl_solve', ''),
     })
     def test_vmap_linalg_failure_1D_input(self, device, dtype, op):
diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py
index cfb36732af4d8..7375cf3fe4d7a 100644
--- a/test/onnx/test_operators.py
+++ b/test/onnx/test_operators.py
@@ -654,10 +654,14 @@ def test_repeat_dim_overflow(self):
         x = torch.randn(1, 2, requires_grad=True)
         self.assertONNX(lambda x: x.repeat(1, 2, 3, 4), x)
 
+    @unittest.skip("It started failing after #81761")
+    # TODO(#83661): Fix and enable the test
     def test_norm_p1(self):
         x = torch.randn(1, 2, 3, 4, requires_grad=True)
         self.assertONNX(lambda x: x.norm(p=1, dim=2), (x))
 
+    @unittest.skip("It started failing after #81761")
+    # TODO(#83661): Fix and enable the test
     def test_norm_p2(self):
         x = torch.randn(1, 2, 3, 4, requires_grad=True)
         self.assertONNX(lambda x: x.norm(p=2, dim=2), (x))
@@ -957,6 +961,8 @@ def test_pixel_shuffle(self):
             lambda x: torch.pixel_shuffle(x, upscale_factor=2), x, opset_version=11
         )
 
+    @unittest.skip("It started failing after #81761")
+    # TODO(#83661): Fix and enable the test
     def test_frobenius_norm(self):
         x = torch.randn(2, 3, 4).float()
         self.assertONNX(lambda x: torch.norm(x, p="fro", dim=(0, 1), keepdim=True), x)
diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 16839dded0c40..184cc5f4ae672 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -6701,6 +6701,8 @@ def forward(self, x, y):
         y = torch.tensor(2)
         self.run_test(FullLikeModel(), (x, y))
 
+    @unittest.skip("It started failing after #81761")
+    # TODO(#83661): Fix and enable the test
     def test_l1_norm(self):
         class NormModel(torch.nn.Module):
             def forward(self, x):
@@ -6709,6 +6711,8 @@ def forward(self, x):
         x = torch.randn(4, 2, 3, requires_grad=True)
         self.run_test(NormModel(), x)
 
+    @unittest.skip("It started failing after #81761")
+    # TODO(#83661): Fix and enable the test
     def test_l2_norm(self):
         class NormModel(torch.nn.Module):
             def forward(self, x):
@@ -6717,6 +6721,8 @@ def forward(self, x):
         x = torch.randn(4, 2, 3, requires_grad=True)
         self.run_test(NormModel(), x)
 
+    @unittest.skip("It started failing after #81761")
+    # TODO(#83661): Fix and enable the test
     def test_frobenius_norm(self):
         class NormModel(torch.nn.Module):
             def forward(self, x):
@@ -6725,6 +6731,8 @@ def forward(self, x):
         x = torch.randn(4, 2, 3, requires_grad=True)
         self.run_test(NormModel(), x)
 
+    @unittest.skip("It started failing after #81761")
+    # TODO(#83661): Fix and enable the test
     def test_frobenius_norm_keepdim(self):
         class NormModel(torch.nn.Module):
             def forward(self, x):
diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py
index 5d1cdc5e8ea58..7e23b06e55413 100644
--- a/test/onnx/test_utility_funs.py
+++ b/test/onnx/test_utility_funs.py
@@ -240,7 +240,6 @@ def forward(self, x):
 
         for node in graph.nodes():
             self.assertNotEqual(node.kind(), "onnx::ReduceL2")
-        self.assertEqual(len(list(graph.nodes())), 2)
 
     def test_constant_fold_reduceL1(self):
         class NormModule(torch.nn.Module):
@@ -258,7 +257,6 @@ def forward(self, x):
 
         for node in graph.nodes():
             self.assertNotEqual(node.kind(), "onnx::ReduceL1")
-        self.assertEqual(len(list(graph.nodes())), 2)
 
     def test_constant_fold_slice(self):
         class NarrowModule(torch.nn.Module):
diff --git a/test/test_decomp.py b/test/test_decomp.py
index ad8cf27ae0f21..d69d72753e470 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -159,8 +159,8 @@ def op_assert_ref(test_case, op, test_dtype, i, orig, decomp, ref, args, kwargs)
         (torch.bfloat16, torch.ops.aten.native_layer_norm_backward.default): 2e-2,
         (torch.bfloat16, torch.ops.aten.native_batch_norm.default): 1e-5,
         (torch.float16, torch.ops.aten.native_batch_norm.default): 1e-5,
-        (torch.bfloat16, torch.ops.aten.linalg_vector_norm.default): 1e-6,
-        (torch.float16, torch.ops.aten.linalg_vector_norm.default): 1e-6,
+        (torch.bfloat16, torch.ops.aten.linalg_vector_norm.default): 1e-5,
+        (torch.float16, torch.ops.aten.linalg_vector_norm.default): 1e-5,
         (torch.float16, torch.ops.aten.nll_loss_forward.default): 1e-2,
         (torch.bfloat16, torch.ops.aten.nll_loss_forward.default): 1e-1,
     }
@@ -303,9 +303,9 @@ def normalize_op_input_output(f, sample, requires_grad=True):
     (None, None, "meshgrid"),
     # diag was not decomposed (it just registers a decomp for diag_out, torch.diag is CompImplicit)
     (None, None, "diag"),
-
     # _softmax_backward_data's CPU kernel for bfloat16 always return the grad_input as float32
     ("cpu", torch.bfloat16, "_softmax_backward_data"),
+    (None, None, "norm"),
 }
 
 CROSS_REF_BACKWARD_EXCLUDE_SET = {
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 273c74d4e6146..41c3e8a2d9ba2 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -1357,17 +1357,16 @@ def run_test_case(input, ord, dim, keepdim):
     def test_norm_fused_type_promotion(self, device, dtype):
         x = torch.randn(10, device=device, dtype=dtype)
 
-        def profile_and_check(fn, x, kwargs, fn_name):
+        def profile_and_check(fn, x, kwargs):
             with torch.profiler.profile(activities=(torch.profiler.ProfilerActivity.CPU,)) as p:
                 fn(x, **kwargs, dtype=torch.float)
             # smoke check that profiler returned some events
-            self.assertTrue(fn_name in map(lambda e: e.name, p.events()))
+            self.assertTrue("aten::linalg_vector_norm" in (e.name for e in p.events()))
             # test that there was no explicit copy
-            self.assertFalse("aten::to" in map(lambda e: e.name, p.events()))
+            self.assertFalse("aten::to" in (e.name for e in p.events()))
 
-        for f, kwargs, fn_name in zip((torch.norm, torch.linalg.vector_norm), ({"p" : 2}, {}),
-                                      ("aten::norm", "aten::linalg_vector_norm")):
-            profile_and_check(f, x, kwargs, fn_name)
+        for f, kwargs, in zip((torch.linalg.vector_norm, torch.norm), ({}, {"p" : 2})):
+            profile_and_check(f, x, kwargs)
 
     @skipMeta  # https://github.com/pytorch/pytorch/issues/53739
     @skipCPUIfNoLapack
@@ -2310,10 +2309,10 @@ def test_nuclear_norm_exceptions_old(self, device):
             x = torch.tensor(lst, dtype=torch.double, device=device)
             for axes in (), (0,):
                 self.assertRaises(RuntimeError, torch.norm, x, "nuc", axes)
-            self.assertRaises(IndexError, torch.norm, x, "nuc", (0, 1))
+            self.assertRaises(RuntimeError, torch.norm, x, "nuc", (0, 1))
 
         x = torch.tensor([[0, 1, 2], [3, 4, 5]], dtype=torch.double, device=device)
-        self.assertRaisesRegex(RuntimeError, "duplicate or invalid", torch.norm, x, "nuc", (0, 0))
+        self.assertRaisesRegex(RuntimeError, "must be different", torch.norm, x, "nuc", (0, 0))
         self.assertRaisesRegex(IndexError, "Dimension out of range", torch.norm, x, "nuc", (0, 2))
 
     @skipCUDAIfNoCusolver
diff --git a/test/test_reductions.py b/test/test_reductions.py
index 8d91f56545f01..7a360888e6592 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -464,9 +464,9 @@ def test_dim_reduction_less_than_64(self, device):
                torch.norm]
         for op in ops:
             with self.assertRaisesRegex(RuntimeError, "only tensors with up to 64 dims are supported"):
-                op(x, 64)
+                op(x, dim=64)
             with self.assertRaisesRegex(RuntimeError, "only tensors with up to 64 dims are supported"):
-                op(x, -1)
+                op(x, dim=-1)
 
     @onlyCPU
     @dtypes(torch.float, torch.bfloat16)
@@ -1793,11 +1793,9 @@ def test_repeated_dim(self, device):
         x = torch.randn(3, 3, 3, 3, device=device)
 
         error_msg = r'appears multiple times in the list of dims'
-        norm_error_msg = r'Expected dims to be different, got'
         for op in ops:
             for dim in [(0, 0), (0, -4)]:
-                e_msg = norm_error_msg if op == torch.norm else error_msg
-                with self.assertRaisesRegex(RuntimeError, e_msg):
+                with self.assertRaisesRegex(RuntimeError, error_msg):
                     op(x, dim=dim)
 
     # TODO: update this test to comapre against NumPy
diff --git a/torch/functional.py b/torch/functional.py
index 7e96d42fde30c..ee04cb250c2ce 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -1393,10 +1393,11 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa
         Its documentation and behavior may be incorrect, and it is no longer
         actively maintained.
 
-        Use :func:`torch.linalg.norm`, instead, or :func:`torch.linalg.vector_norm`
-        when computing vector norms and :func:`torch.linalg.matrix_norm` when
-        computing matrix norms. Note, however, the signature for these functions
-        is slightly different than the signature for torch.norm.
+        Use :func:`torch.linalg.vector_norm` when computing vector norms and
+        :func:`torch.linalg.matrix_norm` when computing matrix norms.
+        For a function with a similar behavior as this one see :func:`torch.linalg.norm`.
+        Note, however, the signature for these functions is slightly different than the
+        signature for ``torch.norm``.
 
     Args:
         input (Tensor): The input tensor. Its data type must be either a floating
@@ -1446,8 +1447,8 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa
     .. note::
         Even though ``p='fro'`` supports any number of dimensions, the true
         mathematical definition of Frobenius norm only applies to tensors with
-        exactly two dimensions. :func:`torch.linalg.norm` with ``ord='fro'`` aligns
-        with the mathematical definition, since it can only be applied across
+        exactly two dimensions. :func:`torch.linalg.matrix_norm` with ``ord='fro'``
+        aligns with the mathematical definition, since it can only be applied across
         exactly two dimensions.
 
     Example::
@@ -1481,6 +1482,42 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa
         return handle_torch_function(
             norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype)
 
+    # NB. All the repeated code and weird python is to please TorchScript.
+    #     For a more compact implementation see the relevant function in `_refs/__init__.py`
+
+    # We don't do this for MPS or sparse tensors
+    if input.layout == torch.strided and input.device.type in ("cpu", "cuda", "meta"):
+        if dim is not None:
+            if isinstance(dim, int):
+                _dim = [dim]
+            else:
+                _dim = dim
+        else:
+            _dim = None  # type: ignore[assignment]
+
+        if isinstance(p, str):
+            if p == "fro" and (dim is None or isinstance(dim, int) or len(dim) <= 2):
+                if out is None:
+                    return torch.linalg.vector_norm(input, 2, _dim, keepdim, dtype=dtype)
+                else:
+                    return torch.linalg.vector_norm(input, 2, _dim, keepdim, dtype=dtype, out=out)
+
+            # Here we either call the nuclear norm, or we call matrix_norm with some arguments
+            # that will throw an error
+            if _dim is None:
+                _dim = list(range(input.ndim))
+            if out is None:
+                return torch.linalg.matrix_norm(input, p, _dim, keepdim, dtype=dtype)
+            else:
+                return torch.linalg.matrix_norm(input, p, _dim, keepdim, dtype=dtype, out=out)
+        else:
+            # NB. p should be Union[str, number], not Optional!
+            _p = 2.0 if p is None else p
+            if out is None:
+                return torch.linalg.vector_norm(input, _p, _dim, keepdim, dtype=dtype)
+            else:
+                return torch.linalg.vector_norm(input, _p, _dim, keepdim, dtype=dtype, out=out)
+
     ndim = input.dim()
 
     # catch default case
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 3d3c13bb7208e..4b2d0ebabc46b 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -15878,24 +15878,20 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "norm",
         sample_inputs_func=sample_inputs_norm,
         dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
+        # TODO Benchmark again with the new implementation
         # Runs very slowly on slow gradcheck - alternatively reduce input sizes
         gradcheck_fast_mode=True,
+        check_batched_forward_grad=False,
         supports_forward_ad=True,
         supports_fwgrad_bwgrad=True,
         skips=(
-            # AssertionError: RuntimeError not raised : Expected RuntimeError when doing an unsafe cast from a result
-            # of dtype torch.float32 into an out= with dtype torch.long
-            DecorateInfo(
-                unittest.expectedFailure,
-                "TestCommon",
-                "test_out",
-                device_type="meta",
-            ),
-        ),
+            # Dispatches in Python to vector_norm. Not sure how to make this test happy
+            # Happens to pass on complex64. Also a mystery
+            DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit',
+                         dtypes=(torch.float32,)),)
     ),
     OpInfo('norm',
            variant_test_name='nuc',
-           aten_name='nuclear_norm',
            sample_inputs_func=sample_inputs_norm_nuc,
            decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack],
            check_batched_gradgrad=False,
@@ -15907,19 +15903,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            dtypes=floating_and_complex_types(),
            dtypesIfCUDA=floating_and_complex_types(),
            skips=(
-               # RuntimeError not raised :
-               # Expected RuntimeError when calling with input.device=cpu and out.device=cuda
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
-               # RuntimeError:
-               # Arguments for call are not valid.
-               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.complex64, torch.float32,)),  # noqa: B950
-           )
+               # Dispatches in Python to matrix_norm. Not sure how to make this test happy
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit',
+                            dtypes=(torch.complex64, torch.float32,)),)
            ),
     OpInfo('norm',
            variant_test_name='fro',
-           aten_name='frobenius_norm',
            sample_inputs_func=sample_inputs_norm_fro,
-           dtypes=floating_and_complex_types_and(torch.bfloat16),
+           dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16),
            dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16),
            supports_forward_ad=True,
            # torch.autograd.gradcheck.GradcheckError: While computing batched gradients
@@ -15933,33 +15924,29 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    'TestSchemaCheckModeOpInfo',
                    'test_schema_correctness',
                    dtypes=(torch.complex64, torch.complex128)),
-               # Expected RuntimeError when calling with input.device=cpu and out.device=cuda
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'),
-               # Arguments for call are not valid.
-               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.complex64, torch.float32,)),  # noqa: B950
-           )),
+               # Dispatches in Python to vector_norm. Not sure how to make this test happy
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit',
+                            dtypes=(torch.complex64, torch.float32,)),)
+           ),
     OpInfo(
         "norm",
         variant_test_name="inf",
         sample_inputs_func=sample_inputs_norm_inf,
         dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16),
         supports_forward_ad=True,
+        check_batched_forward_grad=False,
         supports_fwgrad_bwgrad=True,
         # fast gradcheck produces NaNs
         gradcheck_fast_mode=False,
         skips=(
-            # AssertionError: RuntimeError not raised : Expected RuntimeError when doing an unsafe cast from a result
-            # of dtype torch.float32 into an out= with dtype torch.long
-            DecorateInfo(
-                unittest.expectedFailure,
-                "TestCommon",
-                "test_out",
-                device_type="meta",
-            ),
             DecorateInfo(
                 toleranceOverride({torch.float16: tol(atol=2e-3, rtol=1e-3)}),
                 'TestInductorOpInfo', 'test_comprehensive', device_type='cuda',
             ),
+            # Dispatches in Python to vector_norm. Not sure how to make this test happy
+            # Happens to pass on complex64. Also a mystery
+            DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit',
+                         dtypes=(torch.float32,))
         ),
     ),
     OpInfo('t',

From 63fdf028c85905f7499fca59815767e14b394aeb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleksandar=20Samard=C5=BEi=C4=87?=
 <asamardzic@quansight.com>
Date: Mon, 21 Nov 2022 04:22:00 +0000
Subject: [PATCH 1163/1922] Vectorized CPU code implementing right shift
 operator. (#88990)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88990
Approved by: https://github.com/lezcano, https://github.com/peterbell10
---
 aten/src/ATen/cpu/vec/vec256/vec256_int.h    | 44 ++++++++++++++++++++
 aten/src/ATen/cpu/vec/vec512/vec512_int.h    | 24 +++++++++++
 aten/src/ATen/cpu/vec/vec_base.h             | 14 +++++++
 aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 11 +++--
 4 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
index 7737f4a0037cd..f17cdc5bc156a 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
@@ -1181,6 +1181,8 @@ Vectorized<int16_t> inline shift_256_16(const Vectorized<int16_t>& a, const Vect
   __m256i c0;
   if (left_shift)
     c0 = _mm256_sllv_epi32(a0, b0);
+  else
+    c0 = _mm256_srav_epi32(a0, b0);
   c0 = _mm256_shuffle_epi8(c0, ctl_1_0);
 
   // Peform shifting the same way for input array elements with
@@ -1190,6 +1192,8 @@ Vectorized<int16_t> inline shift_256_16(const Vectorized<int16_t>& a, const Vect
   __m256i c1;
   if (left_shift)
     c1 = _mm256_sllv_epi32(a1, b1);
+  else
+    c1 = _mm256_srav_epi32(a1, b1);
   c1 = _mm256_and_si256(c1, keep_1);
 
   // Merge partial results into the final result.
@@ -1271,6 +1275,8 @@ Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectori
   __m256i c0;
   if (left_shift)
     c0 = _mm256_sllv_epi32(a0, b0);
+  else
+    c0 = _mm256_srav_epi32(a0, b0);
   c0 = _mm256_shuffle_epi8(c0, ctl_3_0);
 
   // Peform shifting the same way for input array elements with
@@ -1280,6 +1286,8 @@ Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectori
   __m256i c1;
   if (left_shift)
     c1 = _mm256_sllv_epi32(a1, b1);
+  else
+    c1 = _mm256_srav_epi32(a1, b1);
   c1 = _mm256_shuffle_epi8(c1, ctl_3_1);
 
   // Peform shifting the same way for input array elements with
@@ -1289,6 +1297,8 @@ Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectori
   __m256i c2;
   if (left_shift)
     c2 = _mm256_sllv_epi32(a2, b2);
+  else
+    c2 = _mm256_srav_epi32(a2, b2);
   c2 = _mm256_shuffle_epi8(c2, ctl_3_2);
 
   // Peform shifting the same way for input array elements with
@@ -1298,6 +1308,8 @@ Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectori
   __m256i c3;
   if (left_shift)
     c3 = _mm256_sllv_epi32(a3, b3);
+  else
+    c3 = _mm256_srav_epi32(a3, b3);
   c3 = _mm256_and_si256(c3, keep_3);
 
   // Merge partial results into the final result.
@@ -1328,6 +1340,38 @@ Vectorized<int8_t> inline operator<<(const Vectorized<int8_t>& a, const Vectoriz
   return shift_256_8<true>(a, b);
 }
 
+template <>
+Vectorized<int64_t> inline operator>>(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  // No vector instruction for right shifting int64_t, so emulating it
+  // instead.
+
+  // Shift the number logically to the right, thus filling the most
+  // significant bits with 0s.  Then, replace these bits with the sign
+  // bit.
+  __m256i sign_bits = _mm256_cmpgt_epi64(_mm256_set1_epi64x(0), a);
+  __m256i b_inv_mod_64 = _mm256_sub_epi64(_mm256_set1_epi64x(64), b);
+  __m256i sign_ext = _mm256_sllv_epi64(sign_bits, b_inv_mod_64);
+  __m256i c = _mm256_srlv_epi64(a, b);
+  c = _mm256_or_si256(c, sign_ext);
+
+  return c;
+}
+
+template <>
+Vectorized<int32_t> inline operator>>(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return _mm256_srav_epi32(a, b);
+}
+
+template <>
+Vectorized<int16_t> inline operator>>(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return shift_256_16<false>(a, b);
+}
+
+template <>
+Vectorized<int8_t> inline operator>>(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  return shift_256_8<false>(a, b);
+}
+
 #endif
 
 }}}
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
index 590c3254e3790..bf03f8e290b60 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
@@ -1219,6 +1219,8 @@ Vectorized<int8_t> inline shift_512_8(const Vectorized<int8_t>& a, const Vectori
   __m512i c0;
   if (left_shift)
     c0 = _mm512_sllv_epi16(a0, b0);
+  else
+    c0 = _mm512_srav_epi16(a0, b0);
   c0 = _mm512_shuffle_epi8(c0, ctl_1_0);
 
   // Peform shifting the same way for input array elements with
@@ -1228,6 +1230,8 @@ Vectorized<int8_t> inline shift_512_8(const Vectorized<int8_t>& a, const Vectori
   __m512i c1;
   if (left_shift)
     c1 = _mm512_sllv_epi16(a1, b1);
+  else
+    c1 = _mm512_srav_epi16(a1, b1);
   c1 = _mm512_and_si512(c1, keep_1);
 
   // Merge partial results into the final result.
@@ -1256,6 +1260,26 @@ Vectorized<int8_t> inline operator<<(const Vectorized<int8_t>& a, const Vectoriz
   return shift_512_8<true>(a, b);
 }
 
+template <>
+Vectorized<int64_t> inline operator>>(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  return _mm512_srav_epi64(a, b);
+}
+
+template <>
+Vectorized<int32_t> inline operator>>(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  return _mm512_srav_epi32(a, b);
+}
+
+template <>
+Vectorized<int16_t> inline operator>>(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  return _mm512_srav_epi16(a, b);
+}
+
+template <>
+Vectorized<int8_t> inline operator>>(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
+  return shift_512_8<false>(a, b);
+}
+
 #endif
 
 }}}
diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
index e9e87fa605f79..abf106e8d5b36 100644
--- a/aten/src/ATen/cpu/vec/vec_base.h
+++ b/aten/src/ATen/cpu/vec/vec_base.h
@@ -807,6 +807,14 @@ template <class T> Vectorized<T> inline operator<<(const Vectorized<T> &a, const
   return c;
 }
 
+template <class T> Vectorized<T> inline operator>>(const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = a[i] >> b[i];
+  }
+  return c;
+}
+
 template <typename T>
 inline Vectorized<T>& operator += (Vectorized<T>& a, const Vectorized<T>& b) {
   a = a + b;
@@ -839,6 +847,12 @@ inline Vectorized<T>& operator <<= (Vectorized<T>& a, const Vectorized<T>& b) {
   return a;
 }
 
+template <typename T>
+inline Vectorized<T>& operator >>= (Vectorized<T>& a, const Vectorized<T>& b) {
+  a = a >> b;
+  return a;
+}
+
 template <typename T>
 inline Vectorized<T> fmadd(const Vectorized<T>& a, const Vectorized<T>& b, const Vectorized<T>& c) {
   return a * b + c;
diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
index c2497a6949f12..9b5f442ef02cc 100644
--- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp
@@ -383,10 +383,13 @@ void logical_xor_kernel(TensorIterator& iter) {
 
 void rshift_kernel(TensorIteratorBase& iter) {
   AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "rshift_cpu", [&]() {
-    cpu_kernel(iter,
-      [](scalar_t a, scalar_t b) -> scalar_t {
-        return a >> b;
-      });
+    cpu_kernel_vec(iter,
+        [](scalar_t a, scalar_t b) -> scalar_t {
+          return a >> b;
+        },
+        [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) {
+          return a >> b;
+        });
   });
 }
 

From 7fee22da06e469eff4913c5ff47baa299f13b9cd Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Tue, 22 Nov 2022 11:05:58 +0000
Subject: [PATCH 1164/1922] [Vulkan][TCC] Fix quantized shaders (#89456)

Summary: Fix rounding issue in quantized shaders

Test Plan:
On Mac
```
cd ~/fbsource
buck1 run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck1 build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_quantized_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_quantized_api_test
adb shell "/data/local/tmp/vulkan_quantized_api_test"
```

Reviewed By: salilsdesai

Differential Revision: D41047095

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89456
Approved by: https://github.com/kirklandsign, https://github.com/digantdesai
---
 aten/src/ATen/native/vulkan/glsl/quantize_per_tensor.glsl | 8 +++++---
 aten/src/ATen/native/vulkan/glsl/quantized_add.glsl       | 4 ++--
 aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl    | 2 +-
 aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl | 2 +-
 .../ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl  | 2 +-
 aten/src/ATen/native/vulkan/glsl/quantized_div.glsl       | 4 ++--
 aten/src/ATen/native/vulkan/glsl/quantized_mul.glsl       | 4 ++--
 aten/src/ATen/native/vulkan/glsl/quantized_sub.glsl       | 4 ++--
 .../native/vulkan/glsl/quantized_upsample_nearest2d.glsl  | 3 +--
 9 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor.glsl b/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor.glsl
index 910603aa29f26..f67954ad48c14 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor.glsl
@@ -19,11 +19,13 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
   if (all(lessThan(pos, uBlock.size.xyz))) {
-    vec4 ret = texelFetch(uInput, pos, 0) / uBlock.scale.x + uBlock.zero_point.x;
-    uvec4 texel = uvec4(int(ret.x), int(ret.y), int(ret.z), int(ret.w));
+    vec4 q_res = roundEven(texelFetch(uInput, pos, 0) / uBlock.scale.x) + uBlock.zero_point.x;
+
+    uvec4 ret = uvec4(q_res);
+
     imageStore(
         uOutput,
         pos,
-        texel);
+        ret);
   }
 }
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_add.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_add.glsl
index 8f6e51397d1c1..a526dc2121bf7 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_add.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_add.glsl
@@ -34,9 +34,9 @@ void main() {
     vec4 deq_in_1 = uBlock.in_scale.y * (texel1 - uBlock.in_zero_point.y);
 
     vec4 res = deq_in_0 + deq_in_1;
-    vec4 q_res = res / uBlock.out_scale.x + uBlock.out_zero_point.x;
+    vec4 q_res = roundEven(res / uBlock.out_scale.x) + uBlock.out_zero_point.x;
 
-    uvec4 ret = uvec4(int(q_res.x), int(q_res.y), int(q_res.z), int(q_res.w));
+    uvec4 ret = uvec4(q_res);
 
     imageStore(
         uOutput,
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl
index bb139d914f07a..63bf055761cc9 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl
@@ -64,7 +64,7 @@ vec4 dequantize(vec4 tex, float scale, int zero_point) {
  * Quantizes a float texel based on a scale and zero point.
  */
 uvec4 quantize(vec4 tex, float scale, int zero_point) {
-  return uvec4(tex / scale + zero_point);
+  return uvec4(roundEven(tex / scale) + zero_point);
 }
 
 /*
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl
index c2ccee79d56ad..0d823620a517f 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl
@@ -65,7 +65,7 @@ vec4 dequantize(vec4 tex, float scale, int zero_point) {
  * Quantizes a float texel based on a scale and zero point.
  */
 uvec4 quantize(vec4 tex, float scale, int zero_point) {
-  return uvec4(tex / scale + zero_point);
+  return uvec4(roundEven(tex / scale) + zero_point);
 }
 
 void main() {
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl
index c8a2a98f9ef0b..2ef6d3d60f324 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl
@@ -60,7 +60,7 @@ vec4 dequantize(vec4 tex, float scale, int zero_point) {
  * Quantizes a float texel based on a scale and zero point.
  */
 uvec4 quantize(vec4 tex, float scale, int zero_point) {
-  return uvec4(tex / scale + zero_point);
+  return uvec4(roundEven(tex / scale) + zero_point);
 }
 
 /*
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_div.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_div.glsl
index aa961eb349934..1998c5abbca38 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_div.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_div.glsl
@@ -34,9 +34,9 @@ void main() {
     vec4 deq_in_1 = uBlock.in_scale.y * (texel1 - uBlock.in_zero_point.y);
 
     vec4 res = deq_in_0 / deq_in_1;
-    vec4 q_res = res / uBlock.out_scale.x + uBlock.out_zero_point.x;
+    vec4 q_res = roundEven(res / uBlock.out_scale.x) + uBlock.out_zero_point.x;
 
-    uvec4 ret = uvec4(int(q_res.x), int(q_res.y), int(q_res.z), int(q_res.w));
+    uvec4 ret = uvec4(q_res);
 
     imageStore(
         uOutput,
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_mul.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_mul.glsl
index 459f56915d774..c1ce18dbb38c1 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_mul.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_mul.glsl
@@ -34,9 +34,9 @@ void main() {
     vec4 deq_in_1 = uBlock.in_scale.y * (texel1 - uBlock.in_zero_point.y);
 
     vec4 res = deq_in_0 * deq_in_1;
-    vec4 q_res = res / uBlock.out_scale.x + uBlock.out_zero_point.x;
+    vec4 q_res = roundEven(res / uBlock.out_scale.x) + uBlock.out_zero_point.x;
 
-    uvec4 ret = uvec4(int(q_res.x), int(q_res.y), int(q_res.z), int(q_res.w));
+    uvec4 ret = uvec4(q_res);
 
     imageStore(
         uOutput,
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_sub.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_sub.glsl
index 6bd00f33a89c0..767181f080fdd 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_sub.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_sub.glsl
@@ -34,9 +34,9 @@ void main() {
     vec4 deq_in_1 = uBlock.in_scale.y * (texel1 - uBlock.in_zero_point.y);
 
     vec4 res = deq_in_0 - deq_in_1;
-    vec4 q_res = res / uBlock.out_scale.x + uBlock.out_zero_point.x;
+    vec4 q_res = roundEven(res / uBlock.out_scale.x) + uBlock.out_zero_point.x;
 
-    uvec4 ret = uvec4(int(q_res.x), int(q_res.y), int(q_res.z), int(q_res.w));
+    uvec4 ret = uvec4(q_res);
 
     imageStore(
         uOutput,
diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_upsample_nearest2d.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_upsample_nearest2d.glsl
index 28c167515405e..46abbb1a8d768 100644
--- a/aten/src/ATen/native/vulkan/glsl/quantized_upsample_nearest2d.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/quantized_upsample_nearest2d.glsl
@@ -25,8 +25,7 @@ void main() {
         ivec2(0),
         uBlock.isize);
 
-    vec4 texel = texelFetch(uInput, ivec3(ipos, pos.z), 0);
-    uvec4 ret = uvec4(int(texel.r), int(texel.g), int(texel.b), int(texel.a));
+    uvec4 ret = texelFetch(uInput, ivec3(ipos, pos.z), 0);
 
     imageStore(
         uOutput,

From c79c66f52f7ade94107bbded4244cee5988f6671 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 22 Nov 2022 05:02:45 -0800
Subject: [PATCH 1165/1922] Make aten.copy preserve strides (hf_Longformer)
 (#89464)

Fixes https://github.com/pytorch/torchdynamo/issues/1888

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Differential Revision: [D41460986](https://our.internmc.facebook.com/intern/diff/D41460986)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89464
Approved by: https://github.com/bdhirsh
---
 aten/src/ATen/native/Copy.cpp              | 46 +++++++++------
 aten/src/ATen/native/native_functions.yaml |  2 +
 test/test_functionalization.py             | 65 +++++++++++++---------
 test/test_fx_reinplace_pass.py             |  3 +-
 torch/_inductor/decomposition.py           | 11 ++++
 5 files changed, 81 insertions(+), 46 deletions(-)

diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
index dc30db8e11001..0c99943eb0cb0 100644
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@@ -278,27 +278,39 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
   return self;
 }
 
+// NB: cribbed from https://github.com/pytorch/pytorch/pull/88198
+at::Tensor clone_preserve_strides(const at::Tensor& self) {
+  TORCH_INTERNAL_ASSERT(self.has_storage());
+  // In cases where the input tensor has internal memory overlap, we cannot actually
+  // preserve the strides/storage_offset of the input tensor, because
+  // *_scatter ops will try to copy_() into the cloned tensor.
+  // However, this should **never** show up in functionalized user code;
+  // most aten ops that try to mutate a tensor with internal memory overlap would error anyway.
+  //
+  // The one place that this does come up is in autograd - if there's a select_scatter
+  // in the forward, then autograd will generate one for the backward.
+  // If the input to the select_scatter is grad_output, then this could be an expanded tensor
+  // with internal overlap.
+  //if (at::has_internal_overlap(self) == at::MemOverlap::Yes) {
+  //  return self.clone();
+  //}
+  auto dtype_size = self.dtype().itemsize();
+  auto nbytes = self.storage().sym_nbytes();
+  TORCH_INTERNAL_ASSERT(nbytes % dtype_size == 0);
+  auto numel = nbytes / dtype_size;
+  auto self_full_size = self.as_strided_symint({numel}, {1}, 0);
+  auto clone = self_full_size.clone();
+  auto out = clone.as_strided_symint(self.sym_sizes(), self.sym_strides(), self.sym_storage_offset());
+  return out;
+}
+
 Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) {
   // copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but:
   // (1) It isn't exposed to the frontend (no python bindings)
   // (2) It isn't exposed to the backend (it's a composite, that decomposes into to() and expand_as() calls.
-  // Note: This implementation doesn't currently preserve the strides of `self`.
-  // That might be fine for functorch (which already doesn't preserve strides in vmap),
-  // but it's worth looking into whether or not this implementation will be problematic for LazyTensor/XLA.
-  auto intermediate = src.to(self, non_blocking);
-  // We can't use expand() here. Why?
-  // The contract for copy_() is that the output tensor has the same amount of storage as the original tensor.
-  // e.g. This should work:
-  //   a = torch.ones(4, 4)
-  //   b = torch.ones(1, 4)
-  //   c = torch.ones(4, 4)
-  //   torch.ops.aten.copy(a, b).add_(c)
-  // We don't want to emit an extra copy every time though, so we only do it if the shapes are different.
-  if (self.sym_sizes() != intermediate.sym_sizes()) {
-    return at::expand_copy_symint(intermediate, self.sym_sizes());
-  } else {
-    return intermediate;
-  }
+  auto r = clone_preserve_strides(self);
+  r.copy_(src, non_blocking);
+  return r;
 }
 
 Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 8c759cd09c486..730032528661d 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1535,6 +1535,8 @@
 
 - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
   variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: copy
 
 - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
   variants: method
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index c5330664d1e83..0ab552d0d04a5 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -114,6 +114,15 @@ def f(x):
 
         _functionalize(f, reapply_views=True)(torch.ones(3, 3))
 
+    def test_copy_stride_mismatch(self):
+        def f(x):
+            y = torch.empty_strided((2, 2), (5, 1))
+            y.copy_(x)
+            return y
+
+        r = _functionalize(f, reapply_views=True)(torch.ones(2, 2))
+        self.assertEqual(r.stride(), (5, 1))
+
     def test_view_clone_view_inplace(self):
         def f(input):
             shape = [1, 1024, 128, 128]
@@ -149,13 +158,15 @@ def forward(self, a_1):
     expand_copy = torch.ops.aten.expand_copy.default(ones_like, [16, 64, 128, 128]);  ones_like = None
     view_copy_3 = torch.ops.aten.view_copy.default(expand_copy, [1, 1024, 128, 128]);  expand_copy = None
     new_empty_strided = torch.ops.aten.new_empty_strided.default(view_copy_3, [1, 1024, 128, 128], [16777216, 16384, 128, 1])
-    view_copy_4 = torch.ops.aten.view_copy.default(view_copy_3, [16, 64, 128, 128])
-    view_copy_5 = torch.ops.aten.view_copy.default(view_copy_3, [16, 64, 128, 128])
-    clone_1 = torch.ops.aten.clone.default(view_copy_5, memory_format = torch.contiguous_format);  view_copy_5 = None
+    copy = torch.ops.aten.copy.default(new_empty_strided, view_copy_3);  new_empty_strided = view_copy_3 = None
+    view_copy_4 = torch.ops.aten.view_copy.default(copy, [16, 64, 128, 128])
+    view_copy_5 = torch.ops.aten.view_copy.default(copy, [16, 64, 128, 128])
+    clone_1 = torch.ops.aten.clone.default(view_copy_5, memory_format = torch.contiguous_format)
     threshold_backward = torch.ops.aten.threshold_backward.default(clone_1, relu, 0);  clone_1 = relu = None
-    view_copy_6 = torch.ops.aten.view_copy.default(view_copy_3, [16, 64, 128, 128]);  view_copy_3 = None
+    copy_1 = torch.ops.aten.copy.default(view_copy_5, threshold_backward);  view_copy_5 = threshold_backward = None
+    view_copy_6 = torch.ops.aten.view_copy.default(copy, [16, 64, 128, 128]);  copy = None
     detach_copy = torch.ops.aten.detach_copy.default(view_copy_6);  view_copy_6 = None
-    view_copy_7 = torch.ops.aten.view_copy.default(threshold_backward, [1, 1024, 128, 128]);  threshold_backward = None
+    view_copy_7 = torch.ops.aten.view_copy.default(copy_1, [1, 1024, 128, 128]);  copy_1 = None
     view_copy_8 = torch.ops.aten.view_copy.default(view_copy_7, [16, 64, 128, 128]);  view_copy_7 = None
     detach_copy_1 = torch.ops.aten.detach_copy.default(view_copy_8);  view_copy_8 = None
     return detach_copy_1
@@ -829,8 +840,8 @@ def f(x):
         _z = torch._from_functional_tensor(z)
         self.assertTrue(are_aliased(_y, _z))
 
-    # copy_() gets its own test, because it is special cased in functionalization.
-    # self.copy_(src) decomposes into src.to(self).expand_as(self).
+    # copy_() gets its own test, because it used to be special cased in functionalization.
+    # However, now it works pretty similar to other functional ops
     def test_copy_(self):
         def f(x):
             tmp = torch.zeros(2, 2)
@@ -850,7 +861,8 @@ def f(x):
 def forward(self, a_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal_copy = torch.ops.aten.diagonal_copy.default(zeros);  zeros = None
-    add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
+    copy = torch.ops.aten.copy.default(diagonal_copy, a_1);  diagonal_copy = None
+    add = torch.ops.aten.add.Tensor(copy, a_1);  copy = a_1 = None
     return add
     """)
 
@@ -862,8 +874,9 @@ def forward(self, a_1):
 def forward(self, a_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal = torch.ops.aten.diagonal.default(zeros);  zeros = None
-    add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
-    return add
+    copy = torch.ops.aten.copy_.default(diagonal, a_1)
+    add = torch.ops.aten.add_.Tensor(diagonal, a_1);  a_1 = None
+    return diagonal
     """)
 
         # Test 2: copy_() with same dtype, different shape
@@ -876,8 +889,8 @@ def forward(self, a_1):
 def forward(self, a_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal_copy = torch.ops.aten.diagonal_copy.default(zeros);  zeros = None
-    expand_copy = torch.ops.aten.expand_copy.default(a_1, [2])
-    add = torch.ops.aten.add.Tensor(expand_copy, a_1);  expand_copy = a_1 = None
+    copy = torch.ops.aten.copy.default(diagonal_copy, a_1);  diagonal_copy = None
+    add = torch.ops.aten.add.Tensor(copy, a_1);  copy = a_1 = None
     return add
     """)
 
@@ -889,9 +902,9 @@ def forward(self, a_1):
 def forward(self, a_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal = torch.ops.aten.diagonal.default(zeros);  zeros = None
-    expand_copy = torch.ops.aten.expand_copy.default(a_1, [2])
-    add = torch.ops.aten.add_.Tensor(expand_copy, a_1);  a_1 = None
-    return expand_copy
+    copy = torch.ops.aten.copy_.default(diagonal, a_1)
+    add = torch.ops.aten.add_.Tensor(diagonal, a_1);  a_1 = None
+    return diagonal
     """)
 
         # Test 3: copy_() with different dtype, same shape
@@ -904,8 +917,8 @@ def forward(self, a_1):
 def forward(self, a_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal_copy = torch.ops.aten.diagonal_copy.default(zeros);  zeros = None
-    _to_copy = torch.ops.aten._to_copy.default(a_1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
-    add = torch.ops.aten.add.Tensor(_to_copy, a_1);  _to_copy = a_1 = None
+    copy = torch.ops.aten.copy.default(diagonal_copy, a_1);  diagonal_copy = None
+    add = torch.ops.aten.add.Tensor(copy, a_1);  copy = a_1 = None
     return add
     """)  # noqa: B950
 
@@ -917,9 +930,9 @@ def forward(self, a_1):
 def forward(self, a_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal = torch.ops.aten.diagonal.default(zeros);  zeros = None
-    _to_copy = torch.ops.aten._to_copy.default(a_1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
-    add = torch.ops.aten.add_.Tensor(_to_copy, a_1);  a_1 = None
-    return _to_copy
+    copy = torch.ops.aten.copy_.default(diagonal, a_1)
+    add = torch.ops.aten.add_.Tensor(diagonal, a_1);  a_1 = None
+    return diagonal
     """)  # noqa: B950
 
         # Test 4: copy_() with different dtype, different shape
@@ -932,9 +945,8 @@ def forward(self, a_1):
 def forward(self, a_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal_copy = torch.ops.aten.diagonal_copy.default(zeros);  zeros = None
-    _to_copy = torch.ops.aten._to_copy.default(a_1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
-    expand_copy = torch.ops.aten.expand_copy.default(_to_copy, [2]);  _to_copy = None
-    add = torch.ops.aten.add.Tensor(expand_copy, a_1);  expand_copy = a_1 = None
+    copy = torch.ops.aten.copy.default(diagonal_copy, a_1);  diagonal_copy = None
+    add = torch.ops.aten.add.Tensor(copy, a_1);  copy = a_1 = None
     return add
     """)  # noqa: B950
 
@@ -946,10 +958,9 @@ def forward(self, a_1):
 def forward(self, a_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal = torch.ops.aten.diagonal.default(zeros);  zeros = None
-    _to_copy = torch.ops.aten._to_copy.default(a_1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
-    expand_copy = torch.ops.aten.expand_copy.default(_to_copy, [2]);  _to_copy = None
-    add = torch.ops.aten.add_.Tensor(expand_copy, a_1);  a_1 = None
-    return expand_copy
+    copy = torch.ops.aten.copy_.default(diagonal, a_1)
+    add = torch.ops.aten.add_.Tensor(diagonal, a_1);  a_1 = None
+    return diagonal
     """)  # noqa: B950
 
     def test_expand_symint(self):
diff --git a/test/test_fx_reinplace_pass.py b/test/test_fx_reinplace_pass.py
index abb9696225c44..dc512cadea69e 100644
--- a/test/test_fx_reinplace_pass.py
+++ b/test/test_fx_reinplace_pass.py
@@ -345,9 +345,8 @@ def forward(self):
     ones = torch.ops.aten.ones.default([4, 2, 4], device = device(type='cpu'), pin_memory = False)
     slice_1 = torch.ops.aten.slice.Tensor(zeros, 0, 0, 9223372036854775807)
     slice_2 = torch.ops.aten.slice.Tensor(slice_1, 1, 2, 9223372036854775807);  slice_1 = None
+    copy = torch.ops.aten.copy_.default(slice_2, ones);  slice_2 = ones = None
     slice_3 = torch.ops.aten.slice.Tensor(zeros, 0, 0, 9223372036854775807)
-    slice_tensor = torch.ops.aten.slice.Tensor(slice_3, 1, 2, 9223372036854775807);  slice_3 = None
-    copy__default = torch.ops.aten.copy_.default(slice_tensor, ones);  slice_tensor = ones = None
     return zeros
     """)
 
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 188072b3d4892..6cddc0f489c55 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -416,6 +416,17 @@ def all_dim(input, dim, keeepdim=False):
     return torch.logical_not(torch.any(torch.logical_not(input), dim, keeepdim))
 
 
+# NB: this decomposition is not stride accurate, do not put it in the main
+# library
+@register_decomposition(aten.copy)
+def copy(self, src, non_blocking=False):
+    intermediate = src.to(self, non_blocking)
+    if self.size() != intermediate.size():
+        return aten.expand_copy.default(intermediate, self.size())
+    else:
+        return intermediate
+
+
 @register_decomposition(aten.hardswish_)
 def hardswish_(x):
     return x.copy_(aten.hardswish(x))

From ae1a7e9a61307841eaa55ab7174a7122d5ba8350 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Mon, 21 Nov 2022 11:05:38 -0800
Subject: [PATCH 1166/1922] [18/N] Add allgather_coalesced custom op with
 CPU/CUDA implementations (#89317)

Differential Revision: [D41415321](https://our.internmc.facebook.com/intern/diff/D41415321)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89317
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_gloo.py      | 14 +++++++++++++
 torch/csrc/distributed/c10d/Ops.cpp     | 28 +++++++++++++++++++++++++
 torch/csrc/distributed/c10d/Ops.hpp     |  6 ++++++
 torch/csrc/distributed/c10d/OpsImpl.cpp | 26 +++++++++++++++++++++++
 torch/csrc/distributed/c10d/init.cpp    |  8 ++++++-
 5 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index c0a25fff9d822..545f125527af0 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -2417,6 +2417,20 @@ def test_collectives(self):
     def test_allreduce_coalesced(self):
         self._test_allreduce_coalesced(backend="gloo")
 
+    @requires_gloo()
+    def test_allgather_coalesced(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "gloo",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        input_tensor = torch.ones(10, 10, dtype=torch.float32)
+        output_tensor_list = [torch.zeros_like(input_tensor)]
+        dist.all_gather_coalesced([output_tensor_list], [input_tensor])
+        self.assertEqual(output_tensor_list, [input_tensor])
+
 class CompilerTest(test_c10d_common.CompilerTest):
 
     @property
diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp
index 5d343c344ec81..4edb70c413bf9 100644
--- a/torch/csrc/distributed/c10d/Ops.cpp
+++ b/torch/csrc/distributed/c10d/Ops.cpp
@@ -95,6 +95,15 @@ c10::intrusive_ptr<Work> _allgather_base_(
   return process_group->_allgather_base(output_tensor, input_tensor);
 }
 
+c10::intrusive_ptr<Work> allgather_coalesced_(
+    const std::vector<std::vector<at::Tensor>>& output_lists,
+    const std::vector<at::Tensor>& input_list,
+    const c10::intrusive_ptr<ProcessGroup>& process_group) {
+  return process_group->allgather_coalesced(
+      const_cast<std::vector<std::vector<at::Tensor>>&>(output_lists),
+      const_cast<std::vector<at::Tensor>&>(input_list));
+}
+
 std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> reduce_scatter_(
     const std::vector<at::Tensor>& output_tensors,
     const std::vector<std::vector<at::Tensor>>& input_tensors,
@@ -220,6 +229,10 @@ TORCH_LIBRARY(c10d, m) {
   m.def(
       "_allgather_base_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, _allgather_base_));
+  m.def(
+      "allgather_coalesced_",
+      dispatch(
+          c10::DispatchKey::CompositeExplicitAutograd, allgather_coalesced_));
   m.def(
       "reduce_scatter_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, reduce_scatter_));
@@ -345,6 +358,21 @@ c10::intrusive_ptr<Work> _allgather_base(
   return op.call(output_tensor, input_tensor, process_group);
 }
 
+c10::intrusive_ptr<Work> allgather_coalesced(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const std::vector<std::vector<at::Tensor>>& output_lists,
+    const std::vector<at::Tensor>& input_list,
+    const AllgatherOptions& opts) {
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("c10d::allgather_coalesced_", "")
+                       .typed<c10::intrusive_ptr<Work>(
+                           const std::vector<std::vector<at::Tensor>>&,
+                           const std::vector<at::Tensor>&,
+                           const c10::intrusive_ptr<::c10d::ProcessGroup>&)>();
+
+  return op.call(output_lists, input_list, process_group);
+}
+
 c10::intrusive_ptr<Work> reduce_scatter(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<at::Tensor>& output_tensors,
diff --git a/torch/csrc/distributed/c10d/Ops.hpp b/torch/csrc/distributed/c10d/Ops.hpp
index f6425e0ea3504..ad6e2d3573eeb 100644
--- a/torch/csrc/distributed/c10d/Ops.hpp
+++ b/torch/csrc/distributed/c10d/Ops.hpp
@@ -38,6 +38,12 @@ TORCH_API c10::intrusive_ptr<Work> _allgather_base(
     at::Tensor& inputTensor,
     const AllgatherOptions& opts = {});
 
+TORCH_API c10::intrusive_ptr<Work> allgather_coalesced(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const std::vector<std::vector<at::Tensor>>& output_lists,
+    const std::vector<at::Tensor>& input_list,
+    const AllgatherOptions& opts = {});
+
 TORCH_API c10::intrusive_ptr<Work> reduce_scatter(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<at::Tensor>& output_tensors,
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index c3db5c438124a..66269db1eae8b 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -225,6 +225,24 @@ c10::intrusive_ptr<Work> _allgather_base_cuda_(
   return process_group->_allgather_base(output_tensor, input_tensor);
 }
 
+c10::intrusive_ptr<Work> allgather_coalesced_cpu_(
+    const std::vector<std::vector<at::Tensor>>& output_lists,
+    const std::vector<at::Tensor>& input_list,
+    const c10::intrusive_ptr<ProcessGroup>& process_group) {
+  return process_group->allgather_coalesced(
+      const_cast<std::vector<std::vector<at::Tensor>>&>(output_lists),
+      const_cast<std::vector<at::Tensor>&>(input_list));
+}
+
+c10::intrusive_ptr<Work> allgather_coalesced_cuda_(
+    const std::vector<std::vector<at::Tensor>>& output_lists,
+    const std::vector<at::Tensor>& input_list,
+    const c10::intrusive_ptr<ProcessGroup>& process_group) {
+  return process_group->allgather_coalesced(
+      const_cast<std::vector<std::vector<at::Tensor>>&>(output_lists),
+      const_cast<std::vector<at::Tensor>&>(input_list));
+}
+
 std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>
 reduce_scatter_cpu_(
     const std::vector<at::Tensor>& output_tensors,
@@ -457,6 +475,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("_allgather_base_", _allgather_base_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("allgather_coalesced_", allgather_coalesced_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("allgather_coalesced_", allgather_coalesced_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("reduce_scatter_", reduce_scatter_cpu_);
 }
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index ae98000112fc5..f65354d97f976 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1310,7 +1310,13 @@ that adds a prefix to each key inserted to the store.
 
           .def(
               "allgather_coalesced",
-              &::c10d::ProcessGroup::allgather_coalesced,
+              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
+                 const std::vector<std::vector<at::Tensor>>& output_lists,
+                 const std::vector<at::Tensor>& input_list,
+                 const ::c10d::AllgatherOptions& opts) {
+                return ::c10d::ops::allgather_coalesced(
+                    self, output_lists, input_list, opts);
+              },
               py::arg("output_lists"),
               py::arg("input_list"),
               py::arg("opts") = ::c10d::AllgatherOptions(),

From 12d58e0dca868deb36bccd6434e055fc00b6586c Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Mon, 21 Nov 2022 11:05:39 -0800
Subject: [PATCH 1167/1922] [19/N] Add monitored_barrier custom op with CPU
 implementation (#89318)

Differential Revision: [D41415324](https://our.internmc.facebook.com/intern/diff/D41415324)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89318
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_gloo.py      | 11 ++++++++
 torch/csrc/distributed/c10d/Ops.cpp     | 37 +++++++++++++++++++++++++
 torch/csrc/distributed/c10d/Ops.hpp     |  5 ++++
 torch/csrc/distributed/c10d/OpsImpl.cpp | 15 ++++++++++
 torch/csrc/distributed/c10d/init.cpp    |  2 +-
 5 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index 545f125527af0..bee76e788d192 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -2431,6 +2431,17 @@ def test_allgather_coalesced(self):
         dist.all_gather_coalesced([output_tensor_list], [input_tensor])
         self.assertEqual(output_tensor_list, [input_tensor])
 
+    @requires_gloo()
+    def test_monitored_barrier(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "gloo",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        dist.monitored_barrier()
+
 class CompilerTest(test_c10d_common.CompilerTest):
 
     @property
diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp
index 4edb70c413bf9..6b4717a8e1d11 100644
--- a/torch/csrc/distributed/c10d/Ops.cpp
+++ b/torch/csrc/distributed/c10d/Ops.cpp
@@ -181,6 +181,17 @@ c10::intrusive_ptr<Work> barrier(
       BarrierOptions{device_ids, std::chrono::milliseconds(timeout)});
 }
 
+void monitored_barrier_(
+    at::Tensor /* unused */,
+    const c10::intrusive_ptr<::c10d::ProcessGroup>& process_group,
+    const std::vector<int64_t>& device_ids,
+    int64_t timeout,
+    bool wait_all_ranks) {
+  process_group->monitoredBarrier(
+      BarrierOptions{device_ids, std::chrono::milliseconds(timeout)},
+      wait_all_ranks);
+}
+
 c10::intrusive_ptr<Work> send(
     at::TensorList tensors,
     const c10::intrusive_ptr<ProcessGroup>& process_group,
@@ -255,6 +266,10 @@ TORCH_LIBRARY(c10d, m) {
   m.def(
       "barrier",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, barrier));
+  m.def(
+      "monitored_barrier_",
+      dispatch(
+          c10::DispatchKey::CompositeExplicitAutograd, monitored_barrier_));
   m.def("send", dispatch(c10::DispatchKey::CompositeExplicitAutograd, send));
   m.def("recv_", dispatch(c10::DispatchKey::CompositeExplicitAutograd, recv_));
 }
@@ -497,6 +512,28 @@ c10::intrusive_ptr<Work> alltoall(
       output_tensors, input_tensors, process_group, opts.timeout.count());
 }
 
+void monitored_barrier(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const BarrierOptions& opts,
+    bool wait_all_ranks) {
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("c10d::monitored_barrier_", "")
+                       .typed<void(
+                           at::Tensor,
+                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
+                           const std::vector<int64_t>&,
+                           int64_t,
+                           bool)>();
+  // Default to using cpu implementation, monitored barrier is only for GLOO
+  at::Tensor tensor = at::empty({0}, at::TensorOptions().device(at::kCPU));
+  op.call(
+      tensor,
+      process_group,
+      opts.device_ids,
+      opts.timeout.count(),
+      wait_all_ranks);
+}
+
 c10::intrusive_ptr<Work> barrier(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const BarrierOptions& opts) {
diff --git a/torch/csrc/distributed/c10d/Ops.hpp b/torch/csrc/distributed/c10d/Ops.hpp
index ad6e2d3573eeb..b5426039f01e4 100644
--- a/torch/csrc/distributed/c10d/Ops.hpp
+++ b/torch/csrc/distributed/c10d/Ops.hpp
@@ -83,6 +83,11 @@ TORCH_API c10::intrusive_ptr<Work> barrier(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const BarrierOptions& opts = {});
 
+TORCH_API void monitored_barrier(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    const BarrierOptions& opts,
+    bool waitAllRanks);
+
 TORCH_API c10::intrusive_ptr<Work> send(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     at::TensorList tensors,
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index 66269db1eae8b..31386695a132e 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -399,6 +399,17 @@ c10::intrusive_ptr<Work> barrier_cuda(
       BarrierOptions{device_ids, std::chrono::milliseconds(timeout)});
 }
 
+void monitored_barrier_cpu_(
+    at::Tensor /* unused */,
+    const c10::intrusive_ptr<::c10d::ProcessGroup>& process_group,
+    const std::vector<int64_t>& device_ids,
+    int64_t timeout,
+    bool wait_all_ranks) {
+  process_group->monitoredBarrier(
+      BarrierOptions{device_ids, std::chrono::milliseconds(timeout)},
+      wait_all_ranks);
+}
+
 // register functions to dispatcher
 namespace {
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
@@ -531,6 +542,10 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("barrier", barrier_cuda);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("monitored_barrier_", monitored_barrier_cpu_);
+}
+
 } // namespace
 
 } // namespace ops
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index f65354d97f976..9a9699c5e12f1 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1539,7 +1539,7 @@ that adds a prefix to each key inserted to the store.
                  bool waitAllRanks) {
                 ::c10d::BarrierOptions opts;
                 opts.timeout = timeout;
-                return self->monitoredBarrier(opts, waitAllRanks);
+                return ::c10d::ops::monitored_barrier(self, opts, waitAllRanks);
               },
               py::arg("timeout") = ::c10d::kUnsetTimeout,
               py::arg("wait_all_ranks") = false,

From 34d870a3712d075a000ea0cd4951d5aa4189c57b Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Tue, 22 Nov 2022 00:30:12 +0000
Subject: [PATCH 1168/1922] [inductor] generate nan in the cpp backend (#89289)

Summary: Fixes https://github.com/pytorch/torchdynamo/issues/1797

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89289
Approved by: https://github.com/ngimel, https://github.com/jansel, https://github.com/jgong5
---
 test/inductor/test_torchinductor.py        | 11 +++++++++++
 test/inductor/test_torchinductor_opinfo.py |  1 -
 torch/_inductor/codegen/cpp.py             |  5 +++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index ed68c28442367..4f672afff80a8 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -3367,6 +3367,17 @@ def fn(x):
             ],
         )
 
+    def test_isinf2(self):
+        def fn(x):
+            y = torch.tensor(
+                [1, float("inf"), 2, float("-inf"), float("nan")], device=self.device
+            )
+            return x == y
+
+        self.common(
+            fn, (torch.tensor([1, float("inf"), 2, float("-inf"), float("nan")]),)
+        )
+
     def test_any(self):
         def fn(x):
             return (
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 188fcd8b67dc7..8d2ac24afb7e2 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -195,7 +195,6 @@ def process(device_type):
     "linalg.matrix_rank": {f32, f64},
     "linalg.matrix_rank.hermitian": {f32, f64},
     "linalg.pinv.singular": {f32, f64},
-    "logdet": {f32, f64},
     "masked.norm": {f16},
     "masked.normalize": {f16},
     "masked_fill": {f16},
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 3568cfdc08ef3..c7e40899c86f3 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -1,6 +1,7 @@
 import contextlib
 import dataclasses
 import functools
+import math
 from copy import deepcopy
 from pathlib import Path
 from typing import Dict, List
@@ -268,6 +269,8 @@ def constant(val, dtype):
             quote = f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
         elif val == float("-inf"):
             quote = f"-std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
+        elif math.isnan(val):
+            quote = f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::quiet_NaN()"
         elif val is True or val is False:
             quote = f"static_cast<{DTYPE_TO_CPP[dtype]}>({str(val).lower()})"
         else:
@@ -459,6 +462,8 @@ def constant(val, dtype):
             return f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
         elif val == float("-inf"):
             return f"-std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
+        elif math.isnan(val):
+            return f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::quiet_NaN()"
         elif val is True or val is False:
             return ops.to_dtype(str(val).lower(), dtype)
         return ops.to_dtype(repr(val), dtype)

From d07c33c9d793a9252189494e673daf3588cb32ef Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Mon, 21 Nov 2022 14:19:02 -0800
Subject: [PATCH 1169/1922] [quant][fx] Support producing reference quantized
 patterns for dynamic quantization (#89248)

Summary:
split the is_decomposed logic for `_replace_observer_with_quantize_dequantize_node` in a separate function and added support for dynamic quantization in the decomposed version of this function.

In case of dynamic quantization, we'll produce the following reference quantized pattern in decomposed mode:
```
x -> choose_qparams -> quantize_per_tensor -> dequantize_per_tensor -> linear
```

Test Plan:
python test/test_quantization.py -k test__convert_to_reference_decomposed_fx_dynamic_quant

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89248
Approved by: https://github.com/vkuzo
---
 test/quantization/fx/test_quantize_fx.py |  35 +++
 torch/ao/quantization/fx/convert.py      | 338 ++++++++++++++++++-----
 2 files changed, 310 insertions(+), 63 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index bab4467894e20..d31641ec2ae31 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -159,6 +159,7 @@
     LinearReluModel,
     QuantizationTestCase,
     skipIfNoFBGEMM,
+    skipIfNoQNNPACK,
     skip_if_no_torchvision,
     train_one_epoch,
     run_ddp,
@@ -5342,6 +5343,40 @@ def forward(self, x):
         res = m(*example_inputs)
         self.assertEqual(res, res_ref)
 
+    @skipIfNoQNNPACK
+    def test__convert_to_reference_decomposed_fx_dynamic_quant(self):
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 10)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        # to avoid reduce_range
+        with override_quantized_engine("qnnpack"):
+            m = M().eval()
+            qconfig_mapping = get_default_qconfig_mapping("fbgemm") \
+                .set_object_type(torch.nn.Linear, default_dynamic_qconfig)
+            example_inputs = (torch.randn(1, 5),)
+            m = prepare_fx(m, qconfig_mapping, example_inputs)
+            m(*example_inputs)
+            m_ref = copy.deepcopy(m)
+            m_ref = convert_to_reference_fx(m_ref)
+            m = _convert_to_reference_decomposed_fx(m)
+            expected_occurrence = {
+                ns.call_function(torch.ops.quantized_decomposed.choose_qparams.tensor): 1,
+                ns.call_function(torch.ops.quantized_decomposed.quantize_per_tensor.tensor): 1,
+                ns.call_function(torch.ops.quantized_decomposed.dequantize_per_tensor.tensor): 1,
+            }
+            self.checkGraphModuleNodes(
+                m,
+                expected_node_occurrence=expected_occurrence)
+            # make sure it runs
+            res_ref = m_ref(*example_inputs)
+            res = m(*example_inputs)
+            self.assertEqual(res, res_ref)
+
     def test_change_backend_config_for_fixed_qparam_ops(self):
         """ Making sure we can skip validation of qconfigs for fixedqparam ops based
         on BackendConfig
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index faa267c492c68..e7e0b482356a8 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -74,7 +74,7 @@
 from .lower_to_fbgemm import lower_to_fbgemm
 # importing the lib so that the quantized_decomposed ops are registered
 from ._decomposed import quantized_decomposed_lib  # noqa: F401
-
+import operator
 
 # TODO: revisit this list. Many helper methods shouldn't be public
 __all__ = [
@@ -91,27 +91,29 @@
     "run_weight_observers",
 ]
 
-def _replace_observer_with_quantize_dequantize_node(
+def _replace_observer_with_quantize_dequantize_node_decomposed(
         model: torch.nn.Module,
         graph: Graph,
         node: Node,
         modules: Dict[str, torch.nn.Module],
         node_name_to_scope: Dict[str, Tuple[str, type]],
-        node_name_to_qconfig: Dict[str, QConfigAny],
-        is_decomposed: bool) -> None:
+        node_name_to_qconfig: Dict[str, QConfigAny]) -> None:
     """ Replace activation_post_process module call node with quantize and
-    dequantize node
+    dequantize node working with decomposed Tensor
 
     Before:
     ... -> observer_0(x) -> ...
     After:
-    ... -> torch.quantize_per_tensor(x, ...) -> x.dequantize() -> ...
+    ... -> torch.ops.quantized_decomposed.quantize_per_tensor(x, ...) ->
+    torch.ops.quantized_decomposed.dequantize_per_tensor() -> ...
+
+    or quantize_per_channel and dequantize_per_channel
     """
     assert modules is not None
     assert isinstance(node.target, str)
     module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
     activation_post_process = modules[node.target]
-    # Skip replacing observers to quant/dequant nodes if the qconfigs of all
+    # skip replacing observers to quant/dequant nodes if the qconfigs of all
     # consumers and producers of this observer are None
     skip_replacement = all([
         has_none_qconfig(n, node_name_to_qconfig) for n in
@@ -124,89 +126,294 @@ def _replace_observer_with_quantize_dequantize_node(
             graph.erase_node(node)
         return
 
-    # otherwise, we can convert the observer module call to quantize/dequantize node
+    # otherwise, we can convert the activation_post_process module call to quantize/dequantize node
+
     # 1. extract the information from activation_post_process module for generating
     # the quantize and dequantize operator
     dtype = activation_post_process.dtype  # type: ignore[attr-defined]
     compute_dtype = None
     if hasattr(activation_post_process, "compute_dtype"):
         compute_dtype = activation_post_process.compute_dtype  # type: ignore[attr-defined]
-    quantize_op : Optional[Union[Callable, str]] = None
     if dtype in [torch.quint8, torch.qint8, torch.qint32] and \
             not hasattr(activation_post_process, 'compute_dtype'):
+        # TODO: probably should cleanup this condition check, it's hard
+        # to reason about this if and the following elif
+
+        # uint8/int8/int32 static quantization branch
+
+        # 1. extract information for inserting q/dq node from activation_post_process
         node_type = "call_function"
+        quantize_op : Optional[Callable] = None
+        scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined]
+        if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
+            raise NotImplementedError("decomposed quantize_per_channel op not implemented yet")
+        else:
+            scale = float(scale)
+            zero_point = int(zero_point)
+            quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
+            quant_max = activation_post_process.quant_max  # type: ignore[attr-defined]
+            dtype_ = to_underlying_dtype(dtype)
+            qparams = {
+                "_scale_": scale,
+                "_zero_point_": zero_point,
+                "_quant_min_": quant_min,
+                "_quant_max_": quant_max,
+                "_dtype_": dtype_
+            }
+            quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor
+
+        # 2. replace activation_post_process node with quantize and dequantize
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value_or_node in qparams.items():
+                # TODO: we can add the information of whether a value needs to
+                # be registered as an attribute in qparams dict itself
+                if key in ['_scale_', '_zero_point_']:
+                    # For scale and zero_point values we register them as buffers in the root module.
+                    # TODO: maybe need more complex attr name here
+                    qparam_node = create_getattr_from_value(
+                        model, graph, module_path + prefix + key, value_or_node)
+                    quantize_op_inputs.append(qparam_node)
+                else:
+                    # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
+                    quantize_op_inputs.append(value_or_node)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            # use the same qparams from quantize op
+            dq_inputs = [quantized_node] + quantize_op_inputs[1:]
+            dequantize_op = torch.ops.quantized_decomposed.dequantize_per_tensor
+            dequantized_node = graph.call_function(
+                dequantize_op,
+                tuple(dq_inputs),
+                {}
+            )
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
+    elif compute_dtype in [torch.quint8, torch.qint8, torch.float16]:
+        # TODO(future PR): switch compute_dtype to is_dynamic
+
+        # uint8/int8/fp16 dynamic quantization
+
+        # 1. extract information for inserting q/dq node from activation_post_process
+        node_type = "call_function"
+        quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor.tensor
+        # we only use choose_qparams for is_decomposed now,
+        # but we should probably align the non-decomposed path with this as well,
+        # and that can be done after we remove reduce_range flag
+        # 1. extract qparams from activation_post_process module
+        dtype_ = to_underlying_dtype(dtype)
+        assert dtype_ in [torch.uint8, torch.int8], \
+            "only uint8 and int8 are supported in reference flow for " \
+            "dynamic quantization right now"
+        quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
+        quant_max = activation_post_process.quant_max  # type: ignore[attr-defined]
+        # note: scale and zero_point are missing for quantize_per_tensor op
+        # we'll need to get this from choose_qparams op, which we'll add after
+        # this step
+        qparams = {
+            "_quant_min_": quant_min,
+            "_quant_max_": quant_max,
+            "_dtype_": dtype_
+        }
+
+        # 2. insert choose_qparams op and update the qparams list
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            choose_qparams_op_inputs = [node.args[0]]
+            for key, value in qparams.items():
+                # we have quant_min, quant_max and dtype, all should be stored
+                # as literals
+                choose_qparams_op_inputs.append(value)
+            choose_qparams_node = graph.create_node(
+                "call_function",
+                torch.ops.quantized_decomposed.choose_qparams.tensor,
+                tuple(choose_qparams_op_inputs),
+                {}
+            )
+            # choose_qparms returns (scale, zero_point)
+            scale_node = graph.create_node(
+                "call_function",
+                operator.getitem,
+                (choose_qparams_node, 0),
+                {}
+            )
+            zero_point_node = graph.create_node(
+                "call_function",
+                operator.getitem,
+                (choose_qparams_node, 1),
+                {}
+            )
+            quant_min = qparams["_quant_min_"]
+            quant_max = qparams["_quant_max_"]
+            dtype = qparams["_dtype_"]
+            qparams = {
+                "_scale_": scale_node,
+                "_zero_point_": zero_point_node,
+                "_quant_min_": quant_min,
+                "_quant_max_": quant_max,
+                "_dtype_": dtype
+            }
+
+        # 3. replace activation_post_process node to quantize and dequantize node
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value_or_node in qparams.items():
+                # TODO: we can add the information of whether a value needs to
+                # be registered as an attribute in qparams dict itself
+                if key in ['_scale_', '_zero_point_']:
+                    # in this case we have a node in the graph since it's dynamically
+                    # computed from the input, with choose_qparams op
+                    qparam_node = value_or_node
+                    quantize_op_inputs.append(qparam_node)
+                else:
+                    # for qparams that are not scale/zero_point (like axis, dtype) we
+                    # store them as literals in the graph.
+                    quantize_op_inputs.append(value_or_node)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            # use the same qparams from quantize op
+            dq_inputs = [quantized_node] + quantize_op_inputs[1:]
+            # need to use the tensor variant of this op, since scale and zero_point
+            # from choose_qparam are Tensors, instead of float/int, this is to
+            # prevent these nodes being traced away by downstream systems
+            dequantize_op = torch.ops.quantized_decomposed.dequantize_per_tensor.tensor
+            dequantized_node = graph.call_function(
+                dequantize_op,
+                tuple(dq_inputs),
+                {}
+            )
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
+    elif dtype == torch.float16:
+        raise NotImplementedError("decomposed to float16 op not implemented yet")
+
+    # should not reach since we have checks in the begining to make sure the
+    # activation_post_process is supported
+
+def _replace_observer_with_quantize_dequantize_node(
+        model: torch.nn.Module,
+        graph: Graph,
+        node: Node,
+        modules: Dict[str, torch.nn.Module],
+        node_name_to_scope: Dict[str, Tuple[str, type]],
+        node_name_to_qconfig: Dict[str, QConfigAny]) -> None:
+    """ Replace activation_post_process module call node with quantize and
+    dequantize node
+
+    Before:
+    ... -> observer_0(x) -> ...
+    After:
+    ... -> torch.quantize_per_tensor(x, ...) -> x.dequantize() -> ...
+    """
+    assert modules is not None
+    assert isinstance(node.target, str)
+    module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
+    activation_post_process = modules[node.target]
+    # skip replacing observers to quant/dequant nodes if the qconfigs of all
+    # consumers and producers of this observer are None
+    skip_replacement = all([
+        has_none_qconfig(n, node_name_to_qconfig) for n in
+        list(node.args) + list(node.users.keys())])
+    if skip_replacement or not _is_conversion_supported(activation_post_process):
+        # didn't find correponding quantize op and info for the activation_post_process
+        # so we just remove the observer
+        with graph.inserting_before(node):
+            node.replace_all_uses_with(node.args[0])
+            graph.erase_node(node)
+        return
+
+    # otherwise, we can convert the activation_post_process module call to quantize/dequantize node
+    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
+    compute_dtype = None
+    if hasattr(activation_post_process, "compute_dtype"):
+        compute_dtype = activation_post_process.compute_dtype  # type: ignore[attr-defined]
+
+    if dtype in [torch.quint8, torch.qint8, torch.qint32] and \
+            not hasattr(activation_post_process, "compute_dtype"):
+        # TODO: probably should cleanup this condition check, it's hard
+        # to reason about this if and the following elif
+
+        # uint8/int8/int32 static quantization branch
+
+        # 1. extract the information from activation_post_process module for generating
+        # the quantize and dequantize operator
+        node_type = "call_function"
+        quantize_op : Optional[Callable] = None
         scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined]
         if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
             ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined]
             qparams = {"_scale_": scale, "_zero_point_": zero_point, "_axis_": ch_axis, "_dtype_": dtype}
-            if is_decomposed:
-                raise NotImplementedError("decomposed quantize_per_channel op not implemented yet")
-            else:
-                quantize_op = torch.quantize_per_channel
+            quantize_op = torch.quantize_per_channel
         else:
             scale = float(scale)
             zero_point = int(zero_point)
-            if is_decomposed:
-                quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
-                quant_max = activation_post_process.quant_max  # type: ignore[attr-defined]
-                dtype = to_underlying_dtype(dtype)
-                qparams = {
-                    "_scale_": scale,
-                    "_zero_point_": zero_point,
-                    "_quant_min": quant_min,
-                    "_quant_max": quant_max,
-                    "_dtype_": dtype
-                }
-                quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor
-            else:
-                qparams = {"_scale_": scale, "_zero_point_": zero_point, "_dtype_": dtype}
-                quantize_op = torch.quantize_per_tensor
+            qparams = {"_scale_": scale, "_zero_point_": zero_point, "_dtype_": dtype}
+            quantize_op = torch.quantize_per_tensor
+
+        # 2. replace activation_post_process node with quantize and dequantize
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value_or_node in qparams.items():
+                # TODO: we can add the information of whether a value needs to
+                # be registered as an attribute in qparams dict itself
+                if key in ['_scale_', '_zero_point_']:
+                    # For scale and zero_point values we register them as buffers in the root module.
+                    # TODO: maybe need more complex attr name here
+                    qparam_node = create_getattr_from_value(
+                        model, graph, module_path + prefix + key, value_or_node)
+                    quantize_op_inputs.append(qparam_node)
+                else:
+                    # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
+                    quantize_op_inputs.append(value_or_node)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
     elif compute_dtype in [torch.quint8, torch.qint8, torch.float16]:
         # TODO(future PR): switch compute_dtype to is_dynamic
-        # dynamic quantization
+
+        # uint8/int8/fp16 dynamic quantization branch
+
         node_type = "call_function"
-        if is_decomposed:
-            raise NotImplementedError("decomposed quantize_per_tensor_dynamic op not implemented yet")
-        else:
-            quantize_op = torch.quantize_per_tensor_dynamic
+        quantize_op = torch.quantize_per_tensor_dynamic
         # TODO: get reduce range from observer
         # reduce_range = activation_post_process.reduce_range
         reduce_range = torch.backends.quantized.engine in ("fbgemm", "x86")
         qparams = {"_dtype_": compute_dtype, "_reduce_range_": reduce_range}
+
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value in qparams.items():
+                quantize_op_inputs.append(value)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
     elif dtype == torch.float16:
         node_type = "call_method"
         quantize_op = "to"
         qparams = {"_dtype_": dtype}
-
-    # 2. replace observer node with quant - dequant node
-    with graph.inserting_before(node):
-        input_node = node.args[0]
-        quantize_op_inputs = [input_node]
-        for key, value in qparams.items():
-            # TODO: we can add the information of whether a value needs to
-            # be registered as an attribute in qparams dict itself
-            if key in ['_scale_', '_zero_point_']:
-                # For scale and zero_point values we register them as buffers in the root module.
-                # TODO: maybe need more complex attr name here
-                qparam_node = create_getattr_from_value(model, graph, module_path + prefix + key, value)
-                quantize_op_inputs.append(qparam_node)
-            else:
-                # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value in qparams.items():
+                # TODO: we can add the information of whether a value needs to
+                # be registered as an attribute in qparams dict itself
                 quantize_op_inputs.append(value)
 
-        quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
-        if is_decomposed:
-            # use the same qparams from quantize op
-            dq_inputs = [quantized_node] + quantize_op_inputs[1:]
-            dequantized_node = graph.call_function(
-                torch.ops.quantized_decomposed.dequantize_per_tensor,
-                tuple(dq_inputs),
-                {}
-            )
-        else:
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
             dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
-        node.replace_all_uses_with(dequantized_node)
-        graph.erase_node(node)
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
+
+    # should not reach since we have checks in the begining to make sure the
+    # activation_post_process is supported
 
 # this is a temporary hack for custom module, we may want to implement
 # this properly after the custom module class design is finalized
@@ -792,9 +999,14 @@ def convert(
                 if observed_node in statically_quantized_custom_module_nodes:
                     _replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
                 else:
-                    _replace_observer_with_quantize_dequantize_node(
-                        model, model.graph, node, modules, node_name_to_scope,
-                        node_name_to_qconfig, is_decomposed)
+                    if is_decomposed:
+                        _replace_observer_with_quantize_dequantize_node_decomposed(
+                            model, model.graph, node, modules, node_name_to_scope,
+                            node_name_to_qconfig)
+                    else:
+                        _replace_observer_with_quantize_dequantize_node(
+                            model, model.graph, node, modules, node_name_to_scope,
+                            node_name_to_qconfig)
             elif isinstance(mod, DeQuantStub):
                 _replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
             elif is_observed_standalone_module(mod):

From 5e1023bb4273b8acddaa557be223d9e80899c661 Mon Sep 17 00:00:00 2001
From: anjali411 <chourdiaanjali123@gmail.com>
Date: Tue, 22 Nov 2022 13:33:55 +0000
Subject: [PATCH 1170/1922] Meta impl for linalg_cholesky and
 linalg_cholesky_ex (#89430)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89430
Approved by: https://github.com/ezyang
---
 test/functorch/test_aotdispatch.py |  2 +-
 test/test_proxy_tensor.py          |  2 --
 torch/_meta_registrations.py       | 48 +++++++++++++++++++++++++++++-
 torch/_prims_common/__init__.py    |  7 +++--
 4 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 84b1ba893cce0..648dc04dc5226 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1041,7 +1041,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
     xfail('inner', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('kron', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('kthvalue', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
-    xfail('linalg.cholesky_ex', ''),  # aten.linalg_cholesky_ex.default - couldn't find symbolic meta functio...
+    xfail('linalg.cholesky_ex', ''),  # could not find kernel for aten.linalg_solve_triangular.default
     xfail('linalg.cond', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('linalg.cross', ''),  # aten.linalg_cross.default - couldn't find symbolic meta function/decomposition
     xfail('linalg.det', ''),  # aten._linalg_det.default - couldn't find symbolic meta function/decomposition
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index fa04c57d94260..21142f56e7296 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1201,8 +1201,6 @@ def f(a, b, c, d, e):
     xfail('isin', ''),  # aten.isin.Tensor_Tensor - couldn't find symbolic meta function/decomposition
     xfail('kron', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('kthvalue', ''),  # aten.kthvalue.default - couldn't find symbolic meta function/decomposition
-    xfail('linalg.cholesky', ''),  # aten.linalg_cholesky_ex.default - couldn't find symbolic meta function/decomposition
-    xfail('linalg.cholesky_ex', ''),  # aten.linalg_cholesky_ex.default - couldn't find symbolic meta function/decomposition
     xfail('linalg.cond', ''),  # Tensors of type TensorImpl do not have numel
     xfail('linalg.cross', ''),  # aten.linalg_cross.default - couldn't find symbolic meta function/decomposition
     xfail('linalg.det', ''),  # aten._linalg_det.default - couldn't find symbolic meta function/decomposition
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 9849df0a58af5..6232462ede216 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -15,6 +15,7 @@
     ELEMENTWISE_TYPE_PROMOTION_KIND,
     FloatLike,
     IntLike,
+    make_contiguous_strides_for,
 )
 
 from torch._prims_common.wrappers import out_wrapper
@@ -178,7 +179,8 @@ def meta_angle_out(self, out):
     return out.copy_(torch.angle(self))
 
 
-def squareCheckInputs(self, f_name):
+# From aten/src/ATen/native/LinearAlgebraUtils.h
+def squareCheckInputs(self: Tensor, f_name: str):
     assert (
         self.dim() >= 2
     ), f"{f_name}: The input tensor must have at least 2 dimensions."
@@ -187,6 +189,22 @@ def squareCheckInputs(self, f_name):
     ), f"{f_name}: A must be batches of square matrices, but they are {self.size(-2)} by {self.size(-1)} matrices"
 
 
+# From aten/src/ATen/native/LinearAlgebraUtils.h
+def checkFloatingOrComplex(
+    t: Tensor, f_name: str, allow_low_precision_dtypes: bool = True
+):
+    dtype = t.dtype
+    check(
+        t.is_floating_point() or t.is_complex(),
+        lambda: f"{f_name}, : Expected a floating point or complex tensor as input. Got , {dtype}",
+    )
+    if allow_low_precision_dtypes:
+        check(
+            dtype in (torch.float, torch.double, torch.cfloat, torch.cdouble),
+            lambda: f"{f_name} : Low precision dtypes not supported. Got {dtype}",
+        )
+
+
 def checkUplo(uplo: str):
     uplo_uppercase = uplo.upper()
     assert (
@@ -206,6 +224,34 @@ def meta_linalg_eigh(self, uplo="L"):
     return (values, vectors)
 
 
+# From aten/src/ATen/native/BatchLinearAlgebra.cpp
+@register_meta(aten.linalg_cholesky_ex.default)
+def linalg_cholesky_ex(A: Tensor, upper: bool = False, check_errors: bool = False):
+    squareCheckInputs(A, "linalg.cholesky")
+    checkFloatingOrComplex(A, "linalg.cholesky")
+
+    A_shape = A.shape
+    ndim = len(A_shape)
+
+    # L
+    L_strides = make_contiguous_strides_for(A_shape, False)
+    L = A.new_empty(A_shape)
+    L.as_strided_(A_shape, L_strides)
+
+    # infos
+    infos = A.new_empty(A_shape[0 : ndim - 2], dtype=torch.int32)
+    return L, infos
+
+
+# From aten/src/ATen/native/BatchLinearAlgebra.cpp
+@register_meta(aten.linalg_cholesky.default)
+def meta_linalg_cholesky(A: Tensor, upper=False):
+    # All the checks done on info in the corresponding C++ function
+    # are data dependent, so we skip info computation
+    L, infos = linalg_cholesky_ex(A, upper, False)
+    return L, infos
+
+
 # From aten/src/ATen/native/ReflectionPad.cpp
 @register_meta(
     [aten.reflection_pad2d_backward.default, aten.replication_pad2d_backward.default]
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 6df72f6c158d4..a17dad4f2a92b 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -1341,15 +1341,17 @@ def reduction_dtypes(
         result_dtype = torch.bool
     return computation_dtype, result_dtype
 
-
+# This function's logic is borrowed from the following functions defined in C++:
+# batched_matrix_contiguous_strides and contiguous_strides
 def make_contiguous_strides_for(
     shape: ShapeType, row_major: bool = True
 ) -> Tuple[int, ...]:
     """
-    Returns the strides of a contriguous tensor if row_major
+    Returns the strides of a contiguous tensor if row_major
     If row_major=True, it returns the strides of a contiguous batch of Fortran-contiguous matrices
     This is often used when calling external libraries like BLAS/LAPACK/cuSolver...
     """
+    # contiguous_strides from c10/util/strides.h
     validate_shape(shape)
     if not shape:
         return ()
@@ -1363,6 +1365,7 @@ def make_contiguous_strides_for(
 
     result = tuple(reversed(strides))
 
+    # batched_matrix_contiguous_strides from aten/src/ATen/native/LinearAlgebraUtils.h
     if row_major:
         return result
     else:

From a45f8899777f1ce24dec7e5a4602c0f00572bf08 Mon Sep 17 00:00:00 2001
From: PratsBhatt <pbhatt110@gmail.com>
Date: Tue, 22 Nov 2022 18:00:01 +0000
Subject: [PATCH 1171/1922] Add cached conda env file for Buck CI workflow
 (#89422)

Fixes - T137631262

Caching conda dependencies for build workflows.
Conda dependencies have been gathered from the workflow https://github.com/pytorch/pytorch/blob/master/.github/workflows/_buck-build-test.yml

The pull request updates the action from `conda-incubator/setup-miniconda@v2` to `pytorch/test-infra/.github/actions/setup-miniconda@main` as it supports caching.

Test Plan:

Running the `ciflow/periodic` which runs the ci builds `buck-build-test` workflow. Expected output is to have all the conda dependencies cached.

<img width="1227" alt="Screenshot 2022-11-22 at 15 44 20" src="https://user-images.githubusercontent.com/15447437/203343298-e55c384b-01ad-45c3-a5e9-ba5c53149be4.png">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89422
Approved by: https://github.com/huydhn
---
 .github/requirements/README.md           |  2 ++
 .github/requirements/conda-env-Linux-X64 | 10 ++++++++++
 .github/workflows/_buck-build-test.yml   | 23 ++---------------------
 3 files changed, 14 insertions(+), 21 deletions(-)
 create mode 100644 .github/requirements/conda-env-Linux-X64

diff --git a/.github/requirements/README.md b/.github/requirements/README.md
index a4f3cb75d9a76..7300eee145629 100644
--- a/.github/requirements/README.md
+++ b/.github/requirements/README.md
@@ -17,6 +17,8 @@ The list of support files are as follows:
     test jobs to setup the conda environment
   * conda-env-macOS-X64. This is use by MacOS (x86-64) build and test
     jobs to setup the conda environment
+  * conda-env-Linux-X64. This is used by Linux buck build and test jobs
+    to setup the conda environment
 * Pip:
   * pip-requirements-macOS.txt. This is used by MacOS build and test jobs to
     setup the pip environment
diff --git a/.github/requirements/conda-env-Linux-X64 b/.github/requirements/conda-env-Linux-X64
new file mode 100644
index 0000000000000..f2b3811263e59
--- /dev/null
+++ b/.github/requirements/conda-env-Linux-X64
@@ -0,0 +1,10 @@
+cffi=1.15.1
+cmake=3.22.1
+mkl=2022.1.0
+mkl-include=2022.1.0
+ninja=1.10.2
+numpy=1.23.3
+pyyaml=6.0
+requests=2.28.1
+setuptools=65.5.0
+typing_extensions=4.3.0
diff --git a/.github/workflows/_buck-build-test.yml b/.github/workflows/_buck-build-test.yml
index f52bb6017c587..07f41299c711b 100644
--- a/.github/workflows/_buck-build-test.yml
+++ b/.github/workflows/_buck-build-test.yml
@@ -21,29 +21,10 @@ jobs:
           distribution: 'temurin'
 
       - name: Setup miniconda
-        uses: conda-incubator/setup-miniconda@v2
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
         with:
-          auto-update-conda: true
           python-version: 3.8
-          activate-environment: build
-
-      - name: Install dependencies
-        uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
-        with:
-          timeout_minutes: 10
-          max_attempts: 5
-          command: |
-            conda install -y \
-              cffi=1.15.1 \
-              cmake=3.22.1 \
-              mkl=2022.1.0 \
-              mkl-include=2022.1.0 \
-              ninja=1.10.2 \
-              numpy=1.23.3 \
-              pyyaml=6.0 \
-              requests=2.28.1 \
-              setuptools=65.5.0 \
-              typing_extensions=4.3.0
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
 
       - name: Install Buck
         uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482

From 7b682fc96fc63bc944b72d2c1a2b36117982ac96 Mon Sep 17 00:00:00 2001
From: mantaionut <ionut@janeasystems.com>
Date: Tue, 22 Nov 2022 18:37:14 +0000
Subject: [PATCH 1172/1922] Force numpy prod to use 64 bit integers on Windows
 in some tests (#88089)

This fixes some prod and masked.prod tests on Windows.

np.prod uses int32 on Windows so it overflows.

On Linux it uses by default int64.

Fixes #77305
Fixes #77320
Fixes #77334
Fixes #77335
Fixes #77336
Fixes #77337

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88089
Approved by: https://github.com/mruberry
---
 .../_internal/common_methods_invocations.py   |  3 ++-
 .../_internal/opinfo/definitions/_masked.py   |  5 ++---
 torch/testing/_internal/opinfo/utils.py       | 20 +++++++++++++++++++
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 4b2d0ebabc46b..4116d967fd8a3 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -105,6 +105,7 @@
 from torch.testing._internal.opinfo.utils import (
     np_unary_ufunc_integer_promotion_wrapper,
     reference_reduction_numpy,
+    prod_numpy
 )
 from torch.testing._internal import opinfo
 from torch.testing._internal.opinfo.definitions.linalg import (
@@ -16468,7 +16469,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
         dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
         sample_inputs_func=sample_inputs_prod,
-        ref=reference_reduction_numpy(np.prod),
+        ref=prod_numpy,
         skips=(
             # FIXME: prod does not support passing keepdim without passing dim
             DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_dim_default_keepdim'),
diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py
index f4b590fe25202..5a5ce8bc7e164 100644
--- a/torch/testing/_internal/opinfo/definitions/_masked.py
+++ b/torch/testing/_internal/opinfo/definitions/_masked.py
@@ -26,8 +26,7 @@
     sample_inputs_reduction,
     SampleInput,
 )
-from torch.testing._internal.opinfo.utils import reference_reduction_numpy
-
+from torch.testing._internal.opinfo.utils import prod_numpy, reference_reduction_numpy
 
 # Used for log_softmax, softmax, softmin
 def sample_inputs_softmax_variant(
@@ -434,7 +433,7 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
     ),
     ReductionOpInfo(
         "masked.prod",
-        ref=reference_reduction_numpy(np.prod),
+        ref=prod_numpy,
         method_variant=None,
         identity=1,
         nan_policy="propagate",
diff --git a/torch/testing/_internal/opinfo/utils.py b/torch/testing/_internal/opinfo/utils.py
index a19da98cd1d0c..0bbba7c769d84 100644
--- a/torch/testing/_internal/opinfo/utils.py
+++ b/torch/testing/_internal/opinfo/utils.py
@@ -258,3 +258,23 @@ def wrapper(x: np.ndarray, *args, **kwargs):
         return result
 
     return wrapper
+
+
+def prod_numpy(a, *args, **kwargs):
+    """
+    The function will call np.prod with type as np.int64 if the input type
+    is int or uint64 if is uint. This is necessary because windows np.prod uses by default
+    int32 while on linux it uses int64.
+    This is for fixing integer overflow https://github.com/pytorch/pytorch/issues/77320
+
+    Returns:
+        np.prod of input
+    """
+    if "dtype" not in kwargs:
+        if np.issubdtype(a.dtype, np.signedinteger):
+            a = a.astype(np.int64)
+        elif np.issubdtype(a.dtype, np.unsignedinteger):
+            a = a.astype(np.uint64)
+
+    fn = reference_reduction_numpy(np.prod)
+    return fn(a, *args, **kwargs)

From 06283aad1541c9df22e5d34b4d5354ad2e8a29c3 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 22 Nov 2022 18:42:13 +0000
Subject: [PATCH 1173/1922] Fix benchmarks - xla tensor test (#89509)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89509
Approved by: https://github.com/ngimel, https://github.com/shunting314
---
 benchmarks/dynamo/common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 3fad203c5d871..a167ab75b53f7 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -269,6 +269,7 @@ def print_summary(filename):
 def tensor_is_on_xla(tensors):
     if not isinstance(tensors, (tuple, list)):
         tensors = [tensors]
+    tensors = [x for x in tensors if isinstance(x, torch.Tensor)]
     return any(map(lambda x: x.device.type == "xla", tensors))
 
 
From 49de267bb1c16ef65cd673bd61a9d5c6aac9c892 Mon Sep 17 00:00:00 2001
From: Fuzzkatt <zonghan2000@gmail.com>
Date: Tue, 22 Nov 2022 19:05:56 +0000
Subject: [PATCH 1174/1922] enable previously failing UCC distributed_test.py
 tests (#89023)

Enables previously failing UCC distributed_test.py tests that are now fixed due to either ProcessGroupUCC barrier blocking fix (https://github.com/pytorch/pytorch/pull/86961) or UCC-side timeout error handling fix:  (https://github.com/openucx/ucc/pull/679/files). Bump upstream UCC version to build UCC with timeout error handling fix merged in.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89023
Approved by: https://github.com/kwen2501, https://github.com/malfet
---
 .circleci/docker/build.sh                     |  2 +-
 .../_internal/distributed/distributed_test.py | 20 +++++++++----------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh
index 61d9b73d73dfb..b41d5fe2c8c1b 100755
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@@ -81,7 +81,7 @@ fi
 
 TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/ubuntu/14.04/x86_64"
 _UCX_COMMIT=31e74cac7bee0ef66bef2af72e7d86d9c282e5ab
-_UCC_COMMIT=12944da33f911daf505d9bbc51411233d0ed85e1
+_UCC_COMMIT=1c7a7127186e7836f73aafbd7697bbc274a77eee
 
 # It's annoying to rename jobs every time you want to rewrite a
 # configuration, so we hardcode everything here rather than do it
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index c67dfc7c40a3d..814dd3d5ad5f8 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -73,6 +73,7 @@
     FILE_SCHEMA,
     IS_FBCODE,
     NO_MULTIPROCESSING_SPAWN,
+    IS_SANDCASTLE,
     parametrize,
     sandcastle_skip,
     sandcastle_skip_if,
@@ -3748,7 +3749,7 @@ def _test_barrier_helper(
 
         @skip_if_no_gpu
         @sandcastle_skip_if(BACKEND == "mpi", "MPI doesn't supports GPU barrier")
-        @sandcastle_skip_if(BACKEND == "ucc", "flaky on PyTorch CI with timeout")
+        @sandcastle_skip_if(BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally")
         def test_barrier_cuda(self):
             group, group_id, rank = self._init_global_test()
             rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
@@ -3842,7 +3843,7 @@ def _test_all_reduce_multigpu_helper(
 
         @sandcastle_skip_if(BACKEND == "mpi", "MPI doesn't support broadcast multigpu")
         @sandcastle_skip_if(BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL")
-        @sandcastle_skip_if(BACKEND == "ucc", "UCC all_reduce multigpu skipped")
+        @sandcastle_skip_if(BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally")
         @skip_if_no_gpu
         def test_all_reduce_multigpu(self):
             group, group_id, rank = self._init_global_test()
@@ -3860,7 +3861,7 @@ def test_all_reduce_multigpu(self):
 
         @sandcastle_skip_if(BACKEND == "mpi", "MPI doesn't support broadcast multigpu")
         @sandcastle_skip_if(BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL")
-        @sandcastle_skip_if(BACKEND == "ucc", "UCC all_reduce multigpu skipped")
+        @sandcastle_skip_if(BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally")
         @skip_if_no_gpu
         def test_all_reduce_multigpu_complex(self):
             group, group_id, rank = self._init_global_test()
@@ -7717,14 +7718,14 @@ def _test_verify_model_across_rank(self, use_logger):
 
         @require_backend(DistTestCases.backend_feature["gpu"])
         @require_backends_available(DistTestCases.backend_feature["gpu"])
-        @sandcastle_skip_if(BACKEND == "ucc", "test timing out locally with ucc")
+        @sandcastle_skip_if(BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally")
         @skip_if_lt_x_gpu(2)
         def test_verify_model_across_rank_with_logger(self):
             self._test_verify_model_across_rank(use_logger=True)
 
         @require_backend(DistTestCases.backend_feature["gpu"])
         @require_backends_available(DistTestCases.backend_feature["gpu"])
-        @sandcastle_skip_if(BACKEND == "ucc", "test timing out locally with ucc")
+        @sandcastle_skip_if(BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally")
         @skip_if_lt_x_gpu(2)
         def test_verify_model_across_rank_without_logger(self):
             self._test_verify_model_across_rank(use_logger=False)
@@ -7748,7 +7749,7 @@ def _run_test_ddp_model_with_diff_params(self, ctx, net, ddp_group, group_gloo):
 
         @require_backend(DistTestCases.backend_feature["gpu"])
         @require_backends_available(DistTestCases.backend_feature["gpu"])
-        @sandcastle_skip_if(BACKEND == "ucc", "test failing locally with UCC")
+        @sandcastle_skip_if(BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally")
         @skip_if_lt_x_gpu(2)
         def test_ddp_model_diff_shape_across_ranks(self):
             group_gloo = dist.new_group(
@@ -7771,7 +7772,7 @@ def test_ddp_model_diff_shape_across_ranks(self):
 
         @require_backend(DistTestCases.backend_feature["gpu"])
         @require_backends_available(DistTestCases.backend_feature["gpu"])
-        @sandcastle_skip_if(BACKEND == "ucc", "test failing locally with UCC")
+        @sandcastle_skip_if(BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally")
         @skip_if_lt_x_gpu(2)
         def test_ddp_model_diff_num_params_across_ranks(self):
             group_gloo = dist.new_group(
@@ -9185,11 +9186,8 @@ def _test_hook_pickling(self, hook, hook_state):
             BACKEND not in DistTestCases.backend_feature["cuda"],
             f"The {BACKEND} backend does not support DDP communication hook on CUDA devices"
         )
-        @sandcastle_skip_if(
-            BACKEND == "ucc",
-            "flaky on PyTorch CI: No such file or directory: '/tmp/checkpoint.pt'"
-        )
         @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
+        @sandcastle_skip_if(BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally")
         def test_ddp_hook_pickling_powerSGD(self):
 
             hook = powerSGD.powerSGD_hook

From f40df9b51a99fca7bf20c4ac4b2f759da2a78916 Mon Sep 17 00:00:00 2001
From: "Li-Huai (Allan) Lin" <qqaatw@gmail.com>
Date: Tue, 22 Nov 2022 19:33:21 +0000
Subject: [PATCH 1175/1922] Fix dev-discuss link in the maintainer docs
 (#89493)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89493
Approved by: https://github.com/H-Huang
---
 docs/source/community/persons_of_interest.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst
index d011250d490d0..02224696c61b0 100644
--- a/docs/source/community/persons_of_interest.rst
+++ b/docs/source/community/persons_of_interest.rst
@@ -7,7 +7,7 @@ Responsibilities
 * Triage and fix high priority issues assigned to the module or library
 * Triage, review, and land high priority pull requests assigned to the module or library
 * Answer module or library questions on `discuss.pytorch.org <https://discuss.pytorch.org/>`__
-  and `dev-discuss.pytorch.org <dev-discuss.pytorch.org>`__
+  and `dev-discuss.pytorch.org <https://dev-discuss.pytorch.org/>`__
 * Maintain public user and development documentation
 * Run meetings and share minutes plus roadmap on a half or quarterly basis
 

From bc36fa0acafd5db06955ffcf7d7ffd936ae8cfbd Mon Sep 17 00:00:00 2001
From: Alexander Grund <Flamefire@users.noreply.github.com>
Date: Tue, 22 Nov 2022 20:25:38 +0000
Subject: [PATCH 1176/1922] (Further) limit world size in test_fsdp_pure_fp16
 (#86280)

Test still fails when run on 5 A100 GPUs, although it works with 5 V100s. Using 4 GPUs seems to be fine.

Followup to #85957

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86280
Approved by: https://github.com/awgu, https://github.com/kit1980
---
 test/distributed/fsdp/test_fsdp_pure_fp16.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_pure_fp16.py b/test/distributed/fsdp/test_fsdp_pure_fp16.py
index 1c663f8263354..e0033ef3d4b72 100644
--- a/test/distributed/fsdp/test_fsdp_pure_fp16.py
+++ b/test/distributed/fsdp/test_fsdp_pure_fp16.py
@@ -33,8 +33,8 @@
 class TestPureFP16(FSDPTest):
     @property
     def world_size(self):
-        # Test fails due to inaccuracies when using more than 5 GPUs
-        return min(5, super().world_size)
+        # Test fails due to inaccuracies when using more than 4 GPUs
+        return min(4, super().world_size)
 
     @skip_if_lt_x_gpu(2)
     @parametrize(

From 571e6dc7440d75ab84728f1ddac761c88a01b1a3 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 22 Nov 2022 20:27:27 +0000
Subject: [PATCH 1177/1922] Relax tolerance for test_out_addbmm_cpu_float32
 (#86365)

The test may fail due to slightly different values caused by different order of matrizes in SGEMM:

> Mismatched elements: 1 / 50 (2.0%)
> Greatest absolute difference: 1.430511474609375e-05 at index (4, 5) (up to 1e-05 allowed)
> Greatest relative difference: 4.65393206065873e-06 at index (4, 5) (up to 1.3e-06 allowed)

Observed on POWER (ppc64le)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86365
Approved by: https://github.com/mruberry, https://github.com/kit1980
---
 torch/testing/_internal/common_methods_invocations.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 4116d967fd8a3..177dc669469e7 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -8539,6 +8539,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    'TestConsistency',
                    'test_output_match',
                ),
+               DecorateInfo(
+                   toleranceOverride({torch.float32: tol(atol=1.5e-05, rtol=1e-05)}),
+                   'TestCommon', 'test_out'),
            ],
            skips=(
                # NVIDIA only assures that bfloat16 is supported by bmm if SM >= 5.3

From dbce750ff0ac919636ebd2d9ae95f61f3b49119b Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Tue, 22 Nov 2022 20:29:07 +0000
Subject: [PATCH 1178/1922] Fix vectorized trigonometric functions for VSX
 (#86453)

Replace the remaining hand-written code in vec256_float_vsx.h by calls to Sleef functions similar to what was done in #59382 & #82646 after #41541

This fixes wrong results for e.g. `sin(1e20)`.
Fixes #85978

To fix #85978 I only needed to do the sin/cos functions to make the test pass but to not encounter the same issue again and again (see the previous PRs and issues) I checked the whole file for similar functions where a Sleef function could be used and changed those too. In the diff I've noticed the faulty whitespace so to make this complete I fixed that too, so it should now be done.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86453
Approved by: https://github.com/malfet
---
 .../cpu/vec/vec256/vsx/vec256_float_vsx.h     | 224 ++----------------
 1 file changed, 21 insertions(+), 203 deletions(-)

diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
index 77cf3695ab912..8fe6cc25f0ee9 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
@@ -256,29 +256,29 @@ class Vectorized<float> {
   }
 
   Vectorized<float> C10_ALWAYS_INLINE acos() const {
-     return {Sleef_acosf4_u10vsx(_vec0), Sleef_acosf4_u10vsx(_vec1)};
+    return {Sleef_acosf4_u10vsx(_vec0), Sleef_acosf4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE asin() const {
-     return {Sleef_asinf4_u10vsx(_vec0), Sleef_asinf4_u10vsx(_vec1)};
+    return {Sleef_asinf4_u10vsx(_vec0), Sleef_asinf4_u10vsx(_vec1)};
   }
   Vectorized<float> atan() const {
-     return {Sleef_atanf4_u10vsx(_vec0), Sleef_atanf4_u10vsx(_vec1)};
+    return {Sleef_atanf4_u10vsx(_vec0), Sleef_atanf4_u10vsx(_vec1)};
   }
   Vectorized<float> atan2(const Vectorized<float>& b) const {
-     return {Sleef_atan2f4_u10vsx(_vec0, b._vec0), Sleef_atan2f4_u10vsx(_vec1, b._vec1)};
+    return {Sleef_atan2f4_u10vsx(_vec0, b._vec0), Sleef_atan2f4_u10vsx(_vec1, b._vec1)};
   }
   Vectorized<float> copysign(const Vectorized<float> &sign) const {
     return {Sleef_copysignf4_vsx(_vec0, sign._vec0), Sleef_copysignf4_vsx(_vec1, sign._vec1)};
   }
   Vectorized<float> lgamma() const {
-     return {Sleef_lgammaf4_u10vsx(_vec0), Sleef_lgammaf4_u10vsx(_vec1)};
+    return {Sleef_lgammaf4_u10vsx(_vec0), Sleef_lgammaf4_u10vsx(_vec1)};
   }
   Vectorized<float> erf() const {
-     return {Sleef_erff4_u10vsx(_vec0), Sleef_erff4_u10vsx(_vec1)};
+    return {Sleef_erff4_u10vsx(_vec0), Sleef_erff4_u10vsx(_vec1)};
   }
 
   Vectorized<float> erfc() const {
-     return {Sleef_erfcf4_u15vsx(_vec0), Sleef_erfcf4_u15vsx(_vec1)};
+    return {Sleef_erfcf4_u15vsx(_vec0), Sleef_erfcf4_u15vsx(_vec1)};
   }
 
   Vectorized<float> erfinv() const {
@@ -301,133 +301,32 @@ class Vectorized<float> {
   }
 
   Vectorized<float> C10_ALWAYS_INLINE exp() const {
-    // implementation logic from avx_mathfun with some modifications from sleef
-    // Express e**x = e**g 2**n
-    ///   = e**g e**( n loge(2) )
-    ///   = e**( g + n loge(2) )
-    //
-    auto tmp_x = *this;
-    auto fx = (tmp_x * log2e_inv).round();
-
-    auto x = fx.madd(negln2f_hi, tmp_x);
-    x = fx.madd(negln2f_lo, x);
-    auto z = x * x;
-    auto y = x.madd(exp_p0, exp_p1);
-    y = y.madd(x, exp_p2);
-    y = y.madd(x, exp_p3);
-    y = y.madd(x, exp_p4);
-    y = y.madd(x, exp_p5);
-    y = y.madd(z, x) + one;
-
-    // vm_pow2n 2^n
-    vint32 imm0 = vec_signed(fx._vec0);
-    vint32 imm1 = vec_signed(fx._vec1);
-    // this pow2n logic is  from Sleef code
-    vint32 imm00 = imm0 >> 1; //>>1
-    vint32 imm01 = imm1 >> 1;
-    vint32 imm10 = imm0 - imm00;
-    vint32 imm11 = imm1 - imm01;
-    imm00 = (imm00 + v0x7f) << vu_23;
-    imm01 = (imm01 + v0x7f) << vu_23;
-    imm10 = (imm10 + v0x7f) << vu_23;
-    imm11 = (imm11 + v0x7f) << vu_23;
-    // treat imm as float vector without conversion
-
-    y._vec0 = (y._vec0 * (vfloat32)imm00) * (vfloat32)imm10;
-    y._vec1 = (y._vec1 * (vfloat32)imm01) * (vfloat32)imm11;
-    // boundary check
-    auto tmp = blendv(y, v_inf, (Vectorized<float>(exp_hi) <= tmp_x));
-    y = blendv(tmp, zero, (tmp_x < Vectorized<float>(exp_lo)));
-
-    return y;
+    return {Sleef_expf4_u10vsx(_vec0), Sleef_expf4_u10vsx(_vec1)};
   }
   Vectorized<float> expm1() const {
-    return exp() - one;
+    return {Sleef_expm1f4_u10vsx(_vec0), Sleef_expm1f4_u10vsx(_vec1)};
   }
 
   Vectorized<float> C10_ALWAYS_INLINE log() const {
     return {Sleef_logf4_u10vsx(_vec0), Sleef_logf4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE log10() const {
-     return {Sleef_log10f4_u10vsx(_vec0), Sleef_log10f4_u10vsx(_vec1)};
+    return {Sleef_log10f4_u10vsx(_vec0), Sleef_log10f4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE log1p() const {
-     return {Sleef_log1pf4_u10vsx(_vec0), Sleef_log1pf4_u10vsx(_vec1)};
+    return {Sleef_log1pf4_u10vsx(_vec0), Sleef_log1pf4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE log2() const {
-     return {Sleef_log2f4_u10vsx(_vec0), Sleef_log2f4_u10vsx(_vec1)};
+    return {Sleef_log2f4_u10vsx(_vec0), Sleef_log2f4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE ceil() const {
     return {vec_ceil(_vec0), vec_ceil(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE cos() const {
-    // take the absolute value
-    auto x = abs();
-    // extract the sign bit (upper one)
-    auto sign_bit = (*this) & sign_mask;
-    // scale by 4/Pi
-    auto y = x * _4div_pi;
-    // store the integer part of y in mm0
-    // j=(j+1) & (~1) (see the cephes sources)
-    vint32 imm0 = (vec_signed(y._vec0) + vi_1) & vi_inv1;
-    vint32 imm1 = (vec_signed(y._vec1) + vi_1) & vi_inv1;
-    y._vec0 = vec_float(imm0);
-    y._vec1 = vec_float(imm1);
-
-    imm0 = imm0 - vi_2;
-    imm1 = imm1 - vi_2;
-    Vectorized<float> poly_mask;
-    // get the swap sign flag
-    vint32 tmp0 = vec_and(vec_nand(imm0, imm0), vi_4);
-    vint32 tmp1 = vec_and(vec_nand(imm1, imm1), vi_4);
-    sign_bit._vecb0 = (vbool32)vec_sl(tmp0, vu_29);
-    sign_bit._vecb1 = (vbool32)vec_sl(tmp1, vu_29);
-    // get the polynom selection mask
-    // there is one polynom for 0 <= x <= Pi / 4
-    // and another one for Pi / 4 < x <= Pi / 2
-    // Both branches will be computed.
-
-    poly_mask._vecb0 = (vbool32)vec_cmpeq((imm0 & vi_2), vi_0);
-    poly_mask._vecb1 = (vbool32)vec_cmpeq((imm1 & vi_2), vi_0);
-
-    // The magic pass: "Extended precision modular arithmetic"
-    //  x = ((x - y * DP1) - y * DP2) - y * DP3;
-    x = y.madd(minus_cephes_dp1, x);
-    x = y.madd(minus_cephes_dp2, x);
-    x = y.madd(minus_cephes_dp3, x);
-
-    // Evaluate the first polynom  (0 <= x <= Pi/4)
-    auto z = x * x;
-    y = z.madd(coscof_p0, coscof_p1);
-    y = y.madd(z, coscof_p2);
-    y = y * z * z;
-    y = y - z * half + one;
-
-    // Evaluate the second polynom  (Pi/4 <= x <= 0)
-    auto y_2 = z.madd(sincof_p0, sincof_p1);
-    y_2 = y_2.madd(z, sincof_p2);
-    y_2 = y_2 * z;
-    y_2 = y_2.madd(x, x);
-
-    // select the correct result from the two polynoms
-    y = blendv(y, y_2, poly_mask);
-    // update the sign
-    y = y ^ sign_bit;
-
-    return y;
+    return {Sleef_cosf4_u10vsx(_vec0), Sleef_cosf4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE cosh() const {
-    // cosh = 1/2 * (e^x + e^-x)
-    auto x = abs();
-    auto e_x = x.exp();
-    auto ret = (e_x + Vectorized<float>(one) / e_x) * half;
-    // inf and nan checks
-#if 0
-                    ret = blendv(ret, v_inf, x >= vf_89);
-                    ret = blendv(ret, v_inf, ret.isnan());
-                    ret = blendv(ret, v_nan, this->isnan());
-#endif
-    return ret;
+    return {Sleef_coshf4_u10vsx(_vec0), Sleef_coshf4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE floor() const {
     return {vec_floor(_vec0), vec_floor(_vec1)};
@@ -440,97 +339,16 @@ class Vectorized<float> {
     return {vec_round(_vec0), vec_round(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE sin() const {
-    // take the absolute value and xtract sign
-    auto x = abs();
-    auto sign_bit = (*this) & sign_mask;
-
-    // scale by 4/Pi
-    auto y = x * _4div_pi;
-    // store the integer part of y in mm0
-
-    // j=(j+1) & (~1) (see the cephes sources)
-    vint32 imm0 = (vec_signed(y._vec0) + vi_1) & vi_inv1;
-    vint32 imm1 = (vec_signed(y._vec1) + vi_1) & vi_inv1;
-    y._vec0 = vec_float(imm0);
-    y._vec1 = vec_float(imm1);
-    // get the swap sign flag
-    Vectorized<float> swap_sign_bit, poly_mask;
-    swap_sign_bit._vecb0 = (vbool32)vec_sl(imm0 & vi_4, vu_29);
-    swap_sign_bit._vecb1 = (vbool32)vec_sl(imm1 & vi_4, vu_29);
-    // get the polynom selection mask
-    // there is one polynom for 0 <= x <= Pi/4
-    // and another one for Pi/4<x<=Pi/2
-    // Both branches will be computed.
-
-    poly_mask._vecb0 = vec_cmpeq((imm0 & vi_2), vi_0);
-    poly_mask._vecb1 = vec_cmpeq((imm1 & vi_2), vi_0);
-    sign_bit = sign_bit ^ swap_sign_bit; // xor operation
-
-    // The magic pass: "Extended precision modular arithmetic"
-    //  x = ((x - y * DP1) - y * DP2) - y * DP3;
-    x = y.madd(minus_cephes_dp1, x);
-    x = y.madd(minus_cephes_dp2, x);
-    x = y.madd(minus_cephes_dp3, x);
-
-    // Evaluate the first polynom  (0 <= x <= Pi/4)
-    auto z = x * x;
-    y = z.madd(coscof_p0, coscof_p1);
-    y = y.madd(z, coscof_p2);
-    y = y * z * z;
-    y = y - z * half + one;
-
-    // Evaluate the second polynom  (Pi/4 <= x <= 0)
-    auto y2 = z.madd(sincof_p0, sincof_p1);
-    y2 = y2.madd(z, sincof_p2);
-    y2 = y2 * z;
-    y2 = y2.madd(x, x);
-    // select the correct result from the two polynoms
-    y = blendv(y, y2, poly_mask);
-    y = y ^ sign_bit;
-
-    return y;
+    return {Sleef_sinf4_u10vsx(_vec0), Sleef_sinf4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE sinh() const {
-    auto temp_abs = abs();
-    // get exponent
-    auto ret = temp_abs.exp();
-    auto recp = Vectorized<float>(half) / ret;
-    auto v = ret * half - recp;
-    // extract the sign bit (upper one)
-    auto sign_bit = (*this) & sign_mask;
-    auto z = temp_abs * temp_abs;
-    auto y = z.madd(p0, p1);
-    y = y.madd(z, p2);
-    y = (y * z).madd(temp_abs, temp_abs);
-    // check and select
-    auto result = blendv(y, v, temp_abs > one);
-    return result | sign_bit;
+    return {Sleef_sinhf4_u10vsx(_vec0), Sleef_sinhf4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE tan() const {
-     return {Sleef_tanf4_u10vsx(_vec0), Sleef_tanf4_u10vsx(_vec1)};
+    return {Sleef_tanf4_u10vsx(_vec0), Sleef_tanf4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE tanh() const {
-    auto x = *this;
-    auto vabs = abs();
-    // get exponent
-    auto exp2x = (vabs + vabs).exp();
-    auto vv = Vectorized<float>(one) - Vectorized<float>(two) / (exp2x + one);
-    // extract the sign bit (upper one)
-    auto sign_bit = (*this) & sign_mask;
-    auto z = vabs * vabs;
-    auto y = z.madd(tanh_p0, tanh_p1);
-    auto tmp = y.madd(z, tanh_p2);
-    y = z.madd(tmp, tanh_p3);
-    tmp = y.madd(z, tanh_p4);
-    y = tmp * z;
-    tmp = y.madd(x, x);
-    // add sign
-    vv = vv | sign_bit;
-    // check and select
-    auto sel_mask = vabs >= tanh_0p625;
-    auto max_mask = vabs > tanh_half_max;
-    auto max_ret = sign_bit ^ one;
-    return blendv(blendv(tmp, vv, sel_mask), max_ret, max_mask);
+    return {Sleef_tanhf4_u10vsx(_vec0), Sleef_tanhf4_u10vsx(_vec1)};
   }
   Vectorized<float> C10_ALWAYS_INLINE trunc() const {
     return {vec_trunc(_vec0), vec_trunc(_vec1)};
@@ -555,15 +373,15 @@ class Vectorized<float> {
   }
 
   Vectorized<float> fmod(const Vectorized<float>& b) const {
-     return {Sleef_fmodf4_vsx(_vec0, b._vec0),Sleef_fmodf4_vsx(_vec1, b._vec1)};
+    return {Sleef_fmodf4_vsx(_vec0, b._vec0),Sleef_fmodf4_vsx(_vec1, b._vec1)};
   }
 
   Vectorized<float> hypot(const Vectorized<float>& b) const {
-     return {Sleef_hypotf4_u05vsx(_vec0, b._vec0), Sleef_hypotf4_u05vsx(_vec1, b._vec1)};
+    return {Sleef_hypotf4_u05vsx(_vec0, b._vec0), Sleef_hypotf4_u05vsx(_vec1, b._vec1)};
   }
 
   Vectorized<float> nextafter(const Vectorized<float>& b) const {
-     return {Sleef_nextafterf4_vsx(_vec0, b._vec0), Sleef_nextafterf4_vsx(_vec1, b._vec1)};
+    return {Sleef_nextafterf4_vsx(_vec0, b._vec0), Sleef_nextafterf4_vsx(_vec1, b._vec1)};
   }
 
   Vectorized<float> igamma(const Vectorized<float>& x) const {

From b6579e9eae521cd84477021c9d3880e2fdd93560 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Tue, 22 Nov 2022 21:17:36 +0000
Subject: [PATCH 1179/1922] [dashboard] Add commit date & fix date related
 issues (#89517)

Add commit date to build summary of dashboard. Make the date of the run reflective of when the run started, not when the run ended. Use PST (UTC -8) to determine day, rather than GMT (UTC +0).

Test comment: https://github.com/pytorch/torchdynamo/issues/1831#issuecomment-1324176119

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89517
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 963dcf493705a..f39d8dbab05f5 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -39,7 +39,7 @@
 import sys
 import tempfile
 from collections import defaultdict
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 from os.path import abspath, exists
 from random import randint
 
@@ -345,7 +345,9 @@ def print_commit_hash(path, name):
         if exists(path):
             repo = git.Repo(path, search_parent_directories=True)
             sha = repo.head.object.hexsha
+            date = repo.head.object.committed_datetime
             out_io.write(f"{name} commit: {sha}\n")
+            out_io.write(f"{name} commit date: {date}\n")
         else:
             out_io.write(f"{name} Absent\n")
 
@@ -409,8 +411,9 @@ def archive_data(archive_name):
         else:
             day = "000"
     else:
-        day = datetime.today().strftime("%j")
-        prefix = datetime.today().strftime(f"day_{day}_%d_%m_%y")
+        now = datetime.now(tz=timezone(timedelta(hours=-8)))
+        day = now.strftime("%j")
+        prefix = now.strftime(f"day_{day}_%d_%m_%y")
     return day, prefix
 
 
@@ -1297,6 +1300,9 @@ def extract(key):
         parse_logs(args, dtypes, suites, devices, compilers, flag_compilers, output_dir)
     elif args.run:
         generate_commands(args, dtypes, suites, devices, compilers, output_dir)
+        # generate memoized archive name now so that the date is reflective
+        # of when the run started
+        get_archive_name(args, dtypes[0])
         # TODO - Do we need to worry about segfaults
         try:
             os.system("bash run.sh")

From aef10f0d4024f462e1a1ee9b67da473b8d5b7902 Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Tue, 22 Nov 2022 21:52:50 +0000
Subject: [PATCH 1180/1922] Shard windows periodic job more (#89455)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89455
Approved by: https://github.com/huydhn
---
 .github/workflows/periodic.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index b5512b20eaae8..80ad04c9be321 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -167,8 +167,9 @@ jobs:
       cuda-version: "11.7"
       test-matrix: |
         { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 2, runner: "windows.8xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 3, runner: "windows.8xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 3, runner: "windows.8xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 3, runner: "windows.8xlarge.nvidia.gpu" },
           { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
         ]}
 

From 5eba9a396958ef90e2cc3cbb483079e8e53c65ef Mon Sep 17 00:00:00 2001
From: Hubert Lu <55214931+hubertlu-tw@users.noreply.github.com>
Date: Tue, 22 Nov 2022 22:15:38 +0000
Subject: [PATCH 1181/1922] [ROCm] Optimize layer norm backward kernel for ROCm
 (#87635)

We observed that the native PyTorch LayerNormBackwardKernelImplInternal has suboptimal performance for certain input sizes on AMD GPUs especially when `fs`  (=`config_m` in our benchmark script) is large and `bs`  (=`config_n` in our benchmark script) is small (commonly seen in [the CvT model](https://arxiv.org/abs/2103.15808)) in the benchmark script of [PR #68238](https://github.com/pytorch/pytorch/pull/68238#issue-1051621716) on AMD GPUs.

This PR is to replace `GammaBetaBackwardCUDAKernel` with the Apex layernorm backward kernel with some ROCm-specific parameter tuning when `fs`  (=`config_m`) is larger than 512 on AMD GPUs.

There are a few PRs for LayerNorm kernel:
- https://github.com/pytorch/pytorch/pull/26201
- https://github.com/pytorch/pytorch/pull/27634
- https://github.com/pytorch/pytorch/pull/68238

Therefore, we have tested and compared the kernel before and at this PR with the input shapes in the last two PRs along with those commonly used in the CvT model on AMD MI100.

---
**Current**
<html xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:x="urn:schemas-microsoft-com:office:excel"
xmlns="http://www.w3.org/TR/REC-html40">

<head>

<meta name=ProgId content=Excel.Sheet>
<meta name=Generator content="Microsoft Excel 15">
<link id=Main-File rel=Main-File
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip.htm">
<link rel=File-List
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip_filelist.xml">
<!--table
	{mso-displayed-decimal-separator:"\.";
	mso-displayed-thousand-separator:"\,";}
@page
	{mso-header-data:"&L&\0022Arial\0022&10&K0000FF \[AMD Official Use Only - General\]&1\#\000D";
	margin:.75in .7in .75in .7in;
	mso-header-margin:.3in;
	mso-footer-margin:.3in;}
tr
	{mso-height-source:auto;}
col
	{mso-width-source:auto;}
br
	{mso-data-placement:same-cell;}
td
	{padding-top:1px;
	padding-right:1px;
	padding-left:1px;
	mso-ignore:padding;
	color:black;
	font-size:11.0pt;
	font-weight:400;
	font-style:normal;
	text-decoration:none;
	font-family:Calibri, sans-serif;
	mso-font-charset:0;
	mso-number-format:General;
	text-align:general;
	vertical-align:bottom;
	border:none;
	mso-background-source:auto;
	mso-pattern:auto;
	mso-protection:locked visible;
	white-space:nowrap;
	mso-rotate:0;}
-->
</head>

<body link="#0563C1" vlink="#954F72">

M | N | fwd (half) | fwdbwd (half) | fwd (float) | fwdbwd (float)
-- | -- | -- | -- | -- | --
50432 | 384 | 0.387256 | 1.372758 | 0.378975 | 1.47892
50176 | 384 | 0.38231 | 1.362416 | 0.378084 | 1.473886
200704 | 192 | 0.997859 | 4.315875 | 0.989306 | 4.560827
802816 | 64 | 3.671828 | 16.68013 | 3.613515 | 16.827946
200 | 256 | 0.066503 | 0.332096 | 0.071422 | 0.325349
1000 | 256 | 0.071848 | 0.333355 | 0.073038 | 0.334753
6000 | 256 | 0.086334 | 0.345139 | 0.086834 | 0.347429
6272 | 256 | 0.088601 | 0.347906 | 0.087855 | 0.351245
200 | 512 | 0.071626 | 0.329726 | 0.073798 | 0.326878
1000 | 512 | 0.073975 | 0.330226 | 0.074166 | 0.332751
6000 | 512 | 0.099617 | 0.362367 | 0.100095 | 0.378313
6272 | 512 | 0.100378 | 0.358066 | 0.099857 | 0.395982
200 | 1024 | 0.072954 | 0.326382 | 0.073899 | 0.333007
1000 | 1024 | 0.0743 | 0.325532 | 0.071126 | 0.330991
6000 | 1024 | 0.127025 | 0.390084 | 0.128692 | 0.471504
6272 | 1024 | 0.130704 | 0.403536 | 0.135244 | 0.487133
200 | 1536 | 0.070331 | 0.339169 | 0.070086 | 0.331015
1000 | 1536 | 0.075085 | 0.330042 | 0.076295 | 0.328778
6000 | 1536 | 0.148889 | 0.44949 | 0.155781 | 0.659987
6272 | 1536 | 0.154939 | 0.478871 | 0.17673 | 0.716025
200 | 2048 | 0.070269 | 0.335585 | 0.072804 | 0.334655
1000 | 2048 | 0.080094 | 0.326991 | 0.080426 | 0.32685
6000 | 2048 | 0.187888 | 0.623023 | 0.245762 | 0.981635
6272 | 2048 | 0.195431 | 0.65244 | 0.262574 | 1.008141
200 | 3072 | 0.068205 | 0.339428 | 0.073068 | 0.344034
1000 | 3072 | 0.087554 | 0.328899 | 0.09218 | 0.346433
6000 | 3072 | 0.240352 | 0.905058 | 0.368135 | 1.280462
6272 | 3072 | 0.26179 | 0.959387 | 0.387782 | 1.476524
128 | 2097152 | 5.905976 | 22.724793 | 10.287974 | 30.242092
256 | 1048576 | 4.561596 | 19.554308 | 10.223171 | 29.42371
512 | 524288 | 4.146751 | 22.7247 | 11.404285 | 39.175902
1024 | 262144 | 5.193135 | 23.403325 | 11.334512 | 38.947192
2048 | 131072 | 4.992907 | 23.377801 | 11.400286 | 40.889191
4096 | 65536 | 5.429488 | 24.275701 | 11.196778 | 41.4751
8192 | 32768 | 5.35758 | 21.360312 | 10.535418 | 42.875646
16384 | 16384 | 5.44947 | 20.852605 | 10.357685 | 34.603408
32768 | 8192 | 4.688925 | 17.379392 | 9.635596 | 31.188271

</body>

</html>

---------
**At this PR**
<html xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:x="urn:schemas-microsoft-com:office:excel"
xmlns="http://www.w3.org/TR/REC-html40">

<head>

<meta name=ProgId content=Excel.Sheet>
<meta name=Generator content="Microsoft Excel 15">
<link id=Main-File rel=Main-File
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip.htm">
<link rel=File-List
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip_filelist.xml">

<!--table
	{mso-displayed-decimal-separator:"\.";
	mso-displayed-thousand-separator:"\,";}
@page
	{mso-header-data:"&L&\0022Arial\0022&10&K0000FF \[AMD Official Use Only - General\]&1\#\000D";
	margin:.75in .7in .75in .7in;
	mso-header-margin:.3in;
	mso-footer-margin:.3in;}
tr
	{mso-height-source:auto;}
col
	{mso-width-source:auto;}
br
	{mso-data-placement:same-cell;}
td
	{padding-top:1px;
	padding-right:1px;
	padding-left:1px;
	mso-ignore:padding;
	color:black;
	font-size:11.0pt;
	font-weight:400;
	font-style:normal;
	text-decoration:none;
	font-family:Calibri, sans-serif;
	mso-font-charset:0;
	mso-number-format:General;
	text-align:general;
	vertical-align:bottom;
	border:none;
	mso-background-source:auto;
	mso-pattern:auto;
	mso-protection:locked visible;
	white-space:nowrap;
	mso-rotate:0;}
.xl63
	{color:windowtext;}
-->
</head>

<body link="#0563C1" vlink="#954F72">

M | N | fwd (half) | fwdbwd (half) | fwd (float) | fwdbwd (float)
-- | -- | -- | -- | -- | --
50432 | 384 | 0.38797 | 0.93103 | 0.37966 | 1.15283
50176 | 384 | 0.3874 | 0.96417 | 0.38462 | 1.18595
200704 | 192 | 1.00002 | 2.40876 | 0.99224 | 2.55579
802816 | 64 | 3.67348 | 7.98658 | 3.61871 | 7.72404
200 | 256 | 0.07292 | 0.35119 | 0.07195 | 0.32602
1000 | 256 | 0.07354 | 0.33325 | 0.07237 | 0.33742
6000 | 256 | 0.08819 | 0.33283 | 0.08453 | 0.3279
6272 | 256 | 0.0886 | 0.33446 | 0.08774 | 0.33426
200 | 512 | 0.0701 | 0.33505 | 0.07072 | 0.33018
1000 | 512 | 0.07042 | 0.33442 | 0.074 | 0.33206
6000 | 512 | 0.09931 | 0.34956 | 0.09895 | 0.3572
6272 | 512 | 0.10103 | 0.32976 | 0.10041 | 0.36635
200 | 1024 | 0.07144 | 0.33579 | 0.07209 | 0.33216
1000 | 1024 | 0.0736 | 0.32803 | 0.07286 | 0.32936
6000 | 1024 | 0.12584 | 0.38916 | 0.12852 | 0.48273
6272 | 1024 | 0.13053 | 0.38804 | 0.13464 | 0.49545
200 | 1536 | 0.07159 | 0.3396 | 0.07062 | 0.33545
1000 | 1536 | 0.07443 | 0.33239 | 0.07366 | 0.33204
6000 | 1536 | 0.14959 | 0.45043 | 0.15826 | 0.69119
6272 | 1536 | 0.1542 | 0.47644 | 0.18249 | 0.72208
200 | 2048 | 0.07258 | 0.33982 | 0.07412 | 0.33859
1000 | 2048 | 0.0793 | 0.32816 | 0.07864 | 0.32583
6000 | 2048 | 0.18973 | 0.571 | 0.25506 | 0.91796
6272 | 2048 | 0.19719 | 0.64208 | 0.26445 | 0.95055
200 | 3072 | 0.07092 | 0.33867 | 0.07104 | 0.34695
1000 | 3072 | 0.08727 | 0.33144 | 0.09144 | 0.36633
6000 | 3072 | 0.24683 | 0.87275 | 0.37761 | 1.3289
6272 | 3072 | 0.26437 | 0.91178 | 0.38496 | 1.53694
128 | 2097152 | 6.27936 | 23.69425 | 10.40004 | 30.13699
256 | 1048576 | 4.5404 | 19.47675 | 10.28494 | 29.36936
512 | 524288 | 4.13951 | 18.78771 | 10.09557 | 32.67083
1024 | 262144 | 4.47576 | 18.00411 | 9.56488 | 31.47117
2048 | 131072 | 4.28026 | 16.95619 | 9.40297 | 30.82845
4096 | 65536 | 4.2653 | 16.5018 | 9.03315 | 30.08392
8192 | 32768 | 4.25613 | 16.13583 | 8.9258 | 30.75296
16384 | 16384 | 4.20256 | 16.38207 | 9.52587 | 31.31113
32768 | 8192 | 4.20231 | 16.19452 | 9.31478 | 31.03514

</body>

</html>

---------

**Performance Improvement (%)**
<html xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882"
xmlns="http://www.w3.org/TR/REC-html40">

<head>

<meta name=ProgId content=OneNote.File>
<meta name=Generator content="Microsoft OneNote 15">
</head>

<body lang=en-US style='font-family:Calibri;font-size:11.0pt'>
<!--StartFragment-->

<div style='direction:ltr'>

M | N | fwdbwd,   torch.float16 | fwdbwd,   torch.float32
-- | -- | -- | --
50432 | 384 | 32.178 | 22.049
50176 | 384 | 29.231 | 19.536
200704 | 192 | 44.188 | 43.962
802816 | 64 | 52.119 | 54.100
200 | 256 | -5.750 | -0.206
1000 | 256 | 0.031 | -0.797
6000 | 256 | 3.566 | 5.621
6272 | 256 | 3.865 | 4.836
200 | 512 | -1.615 | -1.010
1000 | 512 | -1.270 | 0.208
6000 | 512 | 3.534 | 5.581
6272 | 512 | 7.905 | 7.483
200 | 1024 | -2.883 | 0.254
1000 | 1024 | -0.767 | 0.493
6000 | 1024 | 0.237 | -2.381
6272 | 1024 | 3.840 | -1.707
200 | 1536 | -0.127 | -1.340
1000 | 1536 | -0.711 | -0.992
6000 | 1536 | -0.209 | -4.728
6272 | 1536 | 0.508 | -0.846
200 | 2048 | -1.262 | -1.176
1000 | 2048 | -0.358 | 0.312
6000 | 2048 | 8.350 | 6.487
6272 | 2048 | 1.588 | 5.713
200 | 3072 | 0.223 | -0.848
1000 | 3072 | -0.773 | -5.743
6000 | 3072 | 3.570 | -3.783
6272 | 3072 | 4.962 | -4.092
128 | 2097152 | -4.266 | 0.348
256 | 1048576 | 0.397 | 0.185
512 | 524288 | 17.325 | 16.605
1024 | 262144 | 23.070 | 19.195
2048 | 131072 | 27.469 | 24.605
4096 | 65536 | 32.023 | 27.465
8192 | 32768 | 24.459 | 28.274
16384 | 16384 | 21.439 | 9.514
32768 | 8192 | 6.818 | 0.491

</div>

<!--EndFragment-->
</body>

</html>

---------
**Benchmark script of this PR**
```
# Ref:
#       1. https://github.com/pytorch/pytorch/pull/26201
#       2. https://github.com/pytorch/pytorch/pull/68238

from distutils.command.config import config
import torch
from torch.nn import LayerNorm
import timeit

number_runs = 1000  # TODO: Modify this to save time!
def test_forward(layer_norm_cuda, input_cuda):
    layer_norm_cuda(input_cuda); torch.cuda.synchronize()

def test_backward(out_cuda, layer_norm_grad_cuda, create_graph):
    out_cuda.backward(layer_norm_grad_cuda, retain_graph=True, create_graph=create_graph); torch.cuda.synchronize()

def test_fwdbwd(input_cuda, layer_norm_cuda, gO):
    input_cuda.grad = None
    layer_norm_cuda.zero_grad(set_to_none=True)
    out = layer_norm_cuda(input_cuda)
    out.backward(gO)
    torch.cuda.synchronize()

def benchmark(config_m, config_n):

    print("M | N | fwd (half) | fwdbwd (half) | fwd (float) | fwdbwd (float)")
    if len(config_m) != len(config_n):
        print("Please make sure the lengths of config_m and config_m are the same.")

    for i in range(len(config_m)):
        normalized_shape = config_n[i]
        results = [config_m[i], config_n[i]]
        for dtype in (torch.half, torch.float):
            if dtype == torch.half:
                layer_norm_cuda = LayerNorm(normalized_shape).half().cuda()
            else:
                layer_norm_cuda = LayerNorm(normalized_shape).cuda()

            input_cuda = torch.randn(config_m[i], config_n[i], device='cuda', dtype=dtype, requires_grad=True)

            # print("cuda forward:")
            result_fwd = timeit.timeit(lambda: test_forward(layer_norm_cuda, input_cuda), number=number_runs)
            results.append(result_fwd / number_runs * 1000)

            gO = torch.rand_like(input_cuda)

            result_fwdbwd = timeit.timeit(lambda: test_fwdbwd(input_cuda, layer_norm_cuda, gO), number=number_runs)
            results.append(result_fwdbwd / number_runs * 1000)

        print('{:09d}|{:09d}|{:9.5f}|{:9.5f}|{:9.5f}|{:9.5f}'.format(results[0], results[1], results[2], results[3], results[4], results[5]))

    print("Times are in microseconds (us).")

# CVT
config_m_cvt = [50432, 50176, 200704, 802816]
config_n_cvt = [384, 384, 192, 64]

# https://github.com/pytorch/pytorch/pull/68238#issue-1051621716
config_m_68238 = [200, 1000, 6000, 6272, 200, 1000, 6000, 6272, 200, 1000, 6000, 6272, 200, 1000, 6000, 6272, 200, 1000, 6000, 6272, 200, 1000, 6000, 6272]
config_n_68238 = [256,256,256,256,512,512,512,512,1024,1024,1024,1024,1536,1536,1536,1536,2048,2048,2048,2048,3072,3072,3072,3072]

# https://github.com/pytorch/pytorch/pull/27634
config_m_27634 = [128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]
config_n_27634 = [2097152, 1048576, 524288, 262144, 131072, 65536, 32768, 16384, 8192]

config_m = config_m_cvt + config_m_68238 + config_m_27634
config_n = config_n_cvt + config_n_68238 + config_n_27634

benchmark(config_m, config_n)
```

CC: @jeffdaily

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87635
Approved by: https://github.com/jataylo, https://github.com/jeffdaily, https://github.com/ezyang
---
 .../src/ATen/native/cuda/layer_norm_kernel.cu | 234 +++++++++++++++++-
 1 file changed, 233 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
index fa70f075d4fa7..693524818fb43 100644
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@@ -25,6 +25,8 @@
 #endif
 
 #include <c10/cuda/CUDAMathCompat.h>
+#include <c10/util/env.h>
+
 
 namespace at {
 namespace native {
@@ -832,6 +834,201 @@ void LayerNormKernelImpl(
       });
 }
 
+template<typename T, typename T_ACC> __device__
+void cuLoadWriteStridedInputs(
+    const int i1_block,
+    const int thr_load_row_off,
+    const int thr_load_col_off,
+    const int i2_off,
+    const int row_stride,
+    T_ACC* warp_buf1,
+    T_ACC* warp_buf2,
+    const T* input,
+    const T* dout,
+    const int i1_end,
+    const int64_t N,
+    const T_ACC* __restrict__ mean,
+    const T_ACC* __restrict__ rstd)
+{
+  int i1 = i1_block+thr_load_row_off;
+  if (i1 < i1_end) {
+    T curr_mean = mean[i1];
+    T curr_rstd = rstd[i1];
+    for (int k = 0;  k < blockDim.y;  ++k) {
+      int i2 = i2_off + k;
+      int load_idx = i1*N+i2;
+      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
+      if (i2<N) {
+        T curr_input = static_cast<T>(input[load_idx]);
+        T curr_dout = static_cast<T>(dout[load_idx]);
+        warp_buf1[write_idx] = curr_dout;
+        warp_buf2[write_idx] = curr_dout * (curr_input - curr_mean) * curr_rstd;
+      } else {
+        warp_buf1[write_idx] = T(0);
+        warp_buf2[write_idx] = T(0);
+      }
+    }
+  } else {
+    for (int k = 0;  k < blockDim.y;  ++k) {
+      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
+      warp_buf1[write_idx] = T(0);
+      warp_buf2[write_idx] = T(0);
+    }
+  }
+}
+
+template<typename T, typename T_ACC> __device__
+void cuLoadAddStridedInputs(
+    const int i1_block,
+    const int thr_load_row_off,
+    const int thr_load_col_off,
+    const int i2_off,
+    const int row_stride,
+    T_ACC* warp_buf1,
+    T_ACC* warp_buf2,
+    const T* input,
+    const T* dout,
+    const int i1_end,
+    const int64_t N,
+    const T_ACC* __restrict__ mean,
+    const T_ACC* __restrict__ rstd)
+{
+  int i1 = i1_block+thr_load_row_off;
+  if (i1 < i1_end) {
+    T_ACC curr_mean = mean[i1];
+    T_ACC curr_rstd = rstd[i1];
+    for (int k = 0;  k < blockDim.y;  ++k) {
+      int i2 = i2_off + k;
+      int load_idx = i1*N+i2;
+      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
+      if (i2<N) {
+        T_ACC curr_input = static_cast<T_ACC>(input[load_idx]);
+        T_ACC curr_dout = static_cast<T_ACC>(dout[load_idx]);
+        warp_buf1[write_idx] += curr_dout;
+        warp_buf2[write_idx] += curr_dout * (curr_input - curr_mean) * curr_rstd;
+      }
+    }
+  }
+}
+
+template<typename T, typename T_ACC> __global__
+void cuComputePartGradGammaBeta(
+    const T* __restrict__ dout,
+    const T* __restrict__ input,
+    const int64_t M,
+    const int64_t N,
+    const T_ACC* __restrict__ mean,
+    const T_ACC* __restrict__ rstd,
+    T_ACC* part_grad_gamma,
+    T_ACC* part_grad_beta)
+{
+    const int numsegs_M = (M+blockDim.y*blockDim.y-1) / (blockDim.y*blockDim.y);
+    const int segs_per_block = (numsegs_M + gridDim.y - 1) / gridDim.y;
+    const int i1_beg = blockIdx.y * segs_per_block * blockDim.y*blockDim.y;
+    const int i1_beg_plus_one = (blockIdx.y+1) * segs_per_block * blockDim.y*blockDim.y;
+    const int i1_end = i1_beg_plus_one < M ? i1_beg_plus_one : M;
+    const int row_stride = blockDim.x+1;
+    const int thr_load_col_off = (threadIdx.x*blockDim.y)&(blockDim.x-1);
+    const int thr_load_row_off = (threadIdx.x*blockDim.y)/blockDim.x + threadIdx.y*blockDim.y;
+    const int i2_off = blockIdx.x * blockDim.x + thr_load_col_off;
+    alignas(sizeof(double)) extern __shared__ char shared[];
+    T_ACC * buf = reinterpret_cast<T_ACC*>(&shared); // buf has at least blockDim.x * blockDim.y * blockDim.y + (blockDim.y - 1)*(blockDim.x/blockDim.y) elements
+    T_ACC* warp_buf1 = (T_ACC*)buf;
+    T_ACC* warp_buf2 = warp_buf1 + blockDim.y * blockDim.y * row_stride;
+    // compute partial sums from strided inputs
+    // do this to increase number of loads in flight
+    cuLoadWriteStridedInputs(i1_beg,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,N,mean,rstd);
+    for (int i1_block = i1_beg+blockDim.y*blockDim.y;  i1_block < i1_end;  i1_block+=blockDim.y*blockDim.y) {
+      cuLoadAddStridedInputs(i1_block,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,N,mean,rstd);
+    }
+    __syncthreads();
+    // inter-warp reductions
+    // sum within each warp
+    T_ACC acc1 = T_ACC(0);
+    T_ACC acc2 = T_ACC(0);
+    for (int k = 0;  k < blockDim.y;  ++k) {
+      int row1 = threadIdx.y + k*blockDim.y;
+      int idx1 = row1*row_stride + threadIdx.x;
+      acc1 += warp_buf1[idx1];
+      acc2 += warp_buf2[idx1];
+    }
+    warp_buf1[threadIdx.y*row_stride+threadIdx.x] = acc1;
+    warp_buf2[threadIdx.y*row_stride+threadIdx.x] = acc2;
+    __syncthreads();
+    // sum all warps
+    for (int offset = blockDim.y/2;  offset > 1;  offset /= 2) {
+      if (threadIdx.y < offset) {
+        int row1 = threadIdx.y;
+        int row2 = threadIdx.y + offset;
+        int idx1 = row1*row_stride + threadIdx.x;
+        int idx2 = row2*row_stride + threadIdx.x;
+        warp_buf1[idx1] += warp_buf1[idx2];
+        warp_buf2[idx1] += warp_buf2[idx2];
+      }
+      __syncthreads();
+    }
+    int i2 = blockIdx.x * blockDim.x + threadIdx.x;
+    if (threadIdx.y == 0 && i2 < N) {
+      int row1 = threadIdx.y;
+      int row2 = threadIdx.y + 1;
+      int idx1 = row1*row_stride + threadIdx.x;
+      int idx2 = row2*row_stride + threadIdx.x;
+      part_grad_beta[blockIdx.y*N+i2] = warp_buf1[idx1] + warp_buf1[idx2];
+      part_grad_gamma[blockIdx.y*N+i2] = warp_buf2[idx1] + warp_buf2[idx2];
+    }
+}
+
+template<typename T, typename T_ACC> __global__
+void cuComputeGradGammaBeta(
+    const T_ACC* part_grad_gamma,
+    const T_ACC* part_grad_beta,
+    const int part_size,
+    const int64_t M,
+    const int64_t N,
+    T* grad_gamma,
+    T* grad_beta)
+{
+    // sum partial gradients for gamma and beta
+    alignas(sizeof(double)) extern __shared__ char shared[];
+    T_ACC * buf = reinterpret_cast<T_ACC*>(&shared);
+    int i2 = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i2 < N) {
+      // each warp does sequential reductions until reduced part_size is num_warps
+      int num_warp_reductions = part_size / blockDim.y;
+      T_ACC sum_gamma = T_ACC(0);
+      T_ACC sum_beta = T_ACC(0);
+      const T_ACC* part_grad_gamma_ptr = part_grad_gamma + threadIdx.y * num_warp_reductions * N + i2;
+      const T_ACC* part_grad_beta_ptr = part_grad_beta + threadIdx.y * num_warp_reductions * N + i2;
+      for (int warp_offset = 0;  warp_offset < num_warp_reductions;  ++warp_offset) {
+        sum_gamma += part_grad_gamma_ptr[warp_offset*N];
+        sum_beta += part_grad_beta_ptr[warp_offset*N];
+      }
+      // inter-warp reductions
+      const int nbsize3 = blockDim.x * blockDim.y / 2;
+      for (int offset = blockDim.y/2;  offset >= 1;  offset /= 2) {
+        // top half write to shared memory
+        if (threadIdx.y >= offset && threadIdx.y < 2*offset) {
+          const int write_idx = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
+          buf[write_idx] = sum_gamma;
+          buf[write_idx+nbsize3] = sum_beta;
+        }
+        __syncthreads();
+        // bottom half sums
+        if (threadIdx.y < offset) {
+          const int read_idx = threadIdx.y * blockDim.x + threadIdx.x;
+          sum_gamma += buf[read_idx];
+          sum_beta += buf[read_idx+nbsize3];
+        }
+        __syncthreads();
+      }
+      // write out fully summed gradients
+      if (threadIdx.y == 0) {
+        grad_gamma[i2] = sum_gamma;
+        grad_beta[i2] = sum_beta;
+      }
+    }
+}
+
 template <typename T>
 void LayerNormBackwardKernelImplInternal(
     const Tensor& dY,
@@ -860,8 +1057,8 @@ void LayerNormBackwardKernelImplInternal(
       gamma.defined() ? gamma.template data_ptr<T>() : nullptr;
   T* dX_data = dX->defined() ? dX->template data_ptr<T>() : nullptr;
   cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
+  const int warp_size = at::cuda::warp_size();
   if (dX_data != nullptr) {
-    const int warp_size = at::cuda::warp_size();
     const dim3 blocks(M);
     int nshared = (num_threads()/warp_size) * sizeof(T_ACC);
     layer_norm_grad_input_kernel<<<blocks, num_threads(), nshared, cuda_stream>>>(dY_data,
@@ -889,6 +1086,40 @@ void LayerNormBackwardKernelImplInternal(
               dbeta_data);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else {
+#if defined(USE_ROCM)
+      // For small batch size, do colwise reduce directly.
+      const int part_size = warp_size;
+      const dim3 threads2(warp_size, 4, 1);
+      const dim3 blocks2((N + threads2.x - 1) / threads2.x, part_size, 1);
+      const int nshared2_a = 2 * sizeof(T_ACC) * threads2.y * threads2.y * (threads2.x + 1);
+      const int nshared2_b = threads2.x * threads2.y * sizeof(T_ACC);
+      const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b;
+
+      const auto part_grad_dtype = at::toAccumulateType(X.scalar_type(), true);
+      Tensor part_grad_gamma = at::empty({part_size,N}, gamma.options().dtype(part_grad_dtype));
+      Tensor part_grad_beta = at::native::empty_like(part_grad_gamma);
+      cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, cuda_stream>>>(
+                      dY_data,
+                      X_data,
+                      M,N,
+                      mean_data,
+                      rstd_data,
+                      part_grad_gamma.template data_ptr<T_ACC>(),
+                      part_grad_beta.template data_ptr<T_ACC>());
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+      const dim3 threads3(warp_size, 8, 1); // Optimization for ROCm
+      const dim3 blocks3((N + threads2.x - 1) / threads2.x, 1, 1);
+      const int nshared3 = threads3.x * threads3.y * sizeof(T);
+      cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, cuda_stream>>>(
+                      part_grad_gamma.template data_ptr<T_ACC>(),
+                      part_grad_beta.template data_ptr<T_ACC>(),
+                      part_size,
+                      M,N,
+                      dgamma_data,
+                      dbeta_data);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+#else
       if ((M % kWarpSize == 0) && (N % kWarpSize == 0)) {
         // This implementation relies on warp primitives and requires that M and N divide
         // exactly to warp size.
@@ -925,6 +1156,7 @@ void LayerNormBackwardKernelImplInternal(
                 dbeta_data);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       }
+#endif
     }
   }
 }

From 7407f617d39982ae2501ce93f467ea0543d109ee Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Tue, 22 Nov 2022 22:15:54 +0000
Subject: [PATCH 1182/1922] [Vulkan][TCC] Add tests for quantize_per_tensor and
 dequantize (#89496)

Summary: Add tests for quantize per tensor and dequantize

Test Plan:
On Mac
```
cd ~/fbsource
buck1 run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck1 build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_quantized_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_quantized_api_test
adb shell "/data/local/tmp/vulkan_quantized_api_test"
```

Reviewed By: salilsdesai

Differential Revision: D41047097

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89496
Approved by: https://github.com/digantdesai
---
 .../ATen/test/vulkan_quantized_api_test.cpp   | 78 ++++++++++++++++++-
 1 file changed, 74 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
index 3372417e29f40..c30fac431d7bd 100644
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@@ -21,7 +21,10 @@
 
 namespace {
 
-bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor>& inputs) {
+bool checkRtol(
+    const at::Tensor& diff,
+    const std::vector<at::Tensor>& inputs,
+    const float tolerated_error = 0) {
   float maxValue = 0.0f;
 
   for (const auto& tensor : inputs) {
@@ -34,11 +37,11 @@ bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor>& inputs) {
   constexpr float tolerance = 1e-5;
 #endif
 
-  return diff.abs().max().item<float>() <= (tolerance * maxValue);
+  return diff.abs().max().item<float>() <= (tolerance * maxValue + tolerated_error);
 }
 
-bool almostEqual(const at::Tensor& a, const at::Tensor& b) {
-  return checkRtol(a - b, {a, b});
+bool almostEqual(const at::Tensor& a, const at::Tensor& b, const float tolerated_error = 0) {
+  return checkRtol(a - b, {a, b}, tolerated_error);
 }
 
 /* Unused function
@@ -354,6 +357,73 @@ TEST_F(VulkanAPITest, quantize_dequantize) {
   ASSERT_TRUE(check_two);
 }
 
+void test_quantize_per_tensor_and_dequantize(
+    const at::IntArrayRef input_shape,
+    const double input_scale,
+    const int input_zero_point,
+    const float tolerance = 0) {
+  at::Tensor input = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
+
+  // quantize tensors
+  at::Tensor out_q_cpu = at::quantize_per_tensor(
+    input, input_scale, input_zero_point, c10::ScalarType::QUInt8);
+  at::Tensor out_q_vk = at::quantize_per_tensor(
+    input.vulkan(), input_scale, input_zero_point, c10::ScalarType::QUInt8);
+
+  // dequantize tensors
+  const auto out_cpu_deq = at::dequantize(out_q_cpu);
+  const auto out_vk_deq = at::dequantize(out_q_vk);
+
+  // check dequantized tensor are equal
+  const auto check = almostEqual(out_cpu_deq, out_vk_deq.cpu(), tolerance);
+
+  if (!check) {
+    std::cout
+      << "Quantize and Dequantize failed with input shape: " << input_shape
+      << " scale: " << input_scale << " and zero point: " << input_zero_point
+    << std::endl;
+  }
+  ASSERT_TRUE(check);
+}
+
+void test_quantize_per_tensor_and_dequantize_random() {
+  const double scale = 0.0001 + (double)rand() / (double)RAND_MAX;
+  const int zero_point = int((double)rand() / (double)RAND_MAX * 255);
+  const int n = 1 + int((double)rand() / (double)RAND_MAX * 30);
+  const int c = 1 + int((double)rand() / (double)RAND_MAX * 30);
+  const int h = 1 + int((double)rand() / (double)RAND_MAX * 100);
+  const int w = 1 + int((double)rand() / (double)RAND_MAX * 100);
+  // tolerated error = scale, to allow for precision differences after dividing
+  // by random scale, which could result on a difference of 1 unit in the
+  // quantized result.
+  test_quantize_per_tensor_and_dequantize({n, c, h, w}, scale, zero_point, scale);
+}
+
+TEST_F(VulkanAPITest, quantize_per_tensor_and_dequantize) {
+  test_quantize_per_tensor_and_dequantize({1, 1, 1, 1}, 0.13, 21);
+  test_quantize_per_tensor_and_dequantize({1, 1, 1, 4}, 0.3, 87);
+  test_quantize_per_tensor_and_dequantize({1, 1, 4, 1}, 0.2, 120);
+  test_quantize_per_tensor_and_dequantize({1, 1, 7, 7}, 0.3, 87);
+  test_quantize_per_tensor_and_dequantize({1, 1, 8, 8}, 0.1, 10);
+  test_quantize_per_tensor_and_dequantize({3, 5, 8, 8}, 0.04, 97);
+  test_quantize_per_tensor_and_dequantize({1, 1, 11, 17}, 0.07, 15);
+  test_quantize_per_tensor_and_dequantize({1, 1, 12, 17}, 0.1, 10);
+  test_quantize_per_tensor_and_dequantize({3, 5, 12, 17}, 0.1, 10);
+  test_quantize_per_tensor_and_dequantize({1, 1, 17, 12}, 0.1, 10);
+  test_quantize_per_tensor_and_dequantize({2, 4, 17, 12}, 0.1, 10);
+  test_quantize_per_tensor_and_dequantize({1, 1, 10, 14}, 0.0001, 101);
+  test_quantize_per_tensor_and_dequantize({3, 5, 10, 14}, 0.009, 43);
+  test_quantize_per_tensor_and_dequantize({3, 5, 10, 15}, 0.1, 19);
+  test_quantize_per_tensor_and_dequantize({4, 4, 9, 17}, 0.1, 19);
+  test_quantize_per_tensor_and_dequantize({3, 5, 25, 29}, 0.1, 19);
+  test_quantize_per_tensor_and_dequantize({4, 4, 25, 29}, 0.1, 19);
+  test_quantize_per_tensor_and_dequantize({11, 17, 25, 29}, 0.027, 89);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_quantize_per_tensor_and_dequantize_random();
+  }
+}
+
 TEST_F(VulkanAPITest, quantized_add) {
   const auto in_cpu =
       at::rand({2, 13, 32, 27}, at::device(at::kCPU).dtype(at::kFloat)) * 6;

From a44c8eefc4b80c245e3a86a1403029a4e19e9cd3 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 22 Nov 2022 22:25:30 +0000
Subject: [PATCH 1183/1922] [dashboard] Remove aot_cudagraphs and
 nvprims_nvfuser (#89514)

Helps speeding up Dashboard runs

We will bring these back when the backends are ready to be tested on full model suite.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89514
Approved by: https://github.com/SherlockNoMad
---
 benchmarks/dynamo/runner.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index f39d8dbab05f5..8fc44be8c817a 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -90,8 +90,6 @@
     "training": [
         "eager",
         "aot_eager",
-        "aot_cudagraphs",
-        "nvprims_nvfuser",
         "inductor",
         "inductor_no_cudagraphs",
     ],

From 4243ab7628fb1fc3655c13e8648a6362bacefed9 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Mon, 21 Nov 2022 14:19:03 -0800
Subject: [PATCH 1184/1922] [quant] Add quantize_per_channel in
 quantized_decomposed op library (#89268)

Summary:
att

Test Plan:
python test/test_quantization.py -k test_decomposed_quantize_per_channel

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89268
Approved by: https://github.com/vkuzo
---
 .../core/test_quantized_tensor.py             | 22 ++++++++++++--
 torch/ao/quantization/fx/_decomposed.py       | 29 +++++++++++++++++++
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index a89c98f4e5ab1..3c19b957a0e7c 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -1463,7 +1463,7 @@ def test_bfp16_quantize(self):
         dedequantized_X = quantized_X.to(torch.float32)
         torch.testing.assert_close(X, dedequantized_X, rtol=1e-4, atol=5e-3)
 
-    def test_decomposed_quantize(self):
+    def test_decomposed_quantize_per_tensor(self):
         # register the ops
         import torch.ao.quantization.fx._decomposed
         X = torch.randn(5, 10)
@@ -1479,7 +1479,7 @@ def test_decomposed_quantize(self):
         self.assertEqual(quantized_decomposed_X.dtype, dtype)
         self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
 
-    def test_decomposed_dequantize(self):
+    def test_decomposed_dequantize_per_tensor(self):
         import torch.ao.quantization.fx._decomposed
         X = torch.randn(5, 10)
         dtype = torch.uint8
@@ -1521,6 +1521,24 @@ def test_decomposed_dynamic_quant_pattern(self):
         self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
         self.assertEqual(dequantized_X, dequantized_decomposed_X)
 
+    def test_decomposed_quantize_per_channel(self):
+        # register the ops
+        import torch.ao.quantization.fx._decomposed
+        X = torch.randn(5, 10)
+        qdtype = torch.quint8
+        dtype = torch.uint8
+        scales = torch.randn(5,)
+        zero_points = torch.randint(0, 100, (5,))
+        quant_min, quant_max = 0, 255
+        axis = 0
+
+        quantized_X = torch.quantize_per_channel(X, scales, zero_points, axis, qdtype)
+        quantized_decomposed_X = \
+            torch.ops.quantized_decomposed.quantize_per_channel(
+                X, scales, zero_points, axis, quant_min, quant_max, dtype)
+        self.assertEqual(quantized_decomposed_X.dtype, dtype)
+        self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
+
 if __name__ == '__main__':
     raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
                        "\tpython test/test_quantization.py TESTNAME\n\n"
diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py
index fcb4a77a5f499..da793c799a7ae 100644
--- a/torch/ao/quantization/fx/_decomposed.py
+++ b/torch/ao/quantization/fx/_decomposed.py
@@ -94,3 +94,32 @@ def choose_qparams_tensor(input, quant_min, quant_max, dtype):
     observer(input)
     scale, zero_point = observer.calculate_qparams()
     return (scale, zero_point)
+
+# Helper function used to implement per-channel quantization against any axis
+def _permute_to_axis_zero(x, axis):
+    new_axis_list = list(range(x.dim()))
+    new_axis_list[axis] = 0
+    new_axis_list[0] = axis
+    y = x.permute(tuple(new_axis_list))
+    return y, new_axis_list
+
+quantized_decomposed_lib.define(
+    "quantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "quantize_per_channel", "CompositeExplicitAutograd")
+def quantize_per_channel(input, scales, zero_points, axis, quant_min, quant_max, dtype):
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    input, permute_axis_list = _permute_to_axis_zero(input, axis)
+    res = torch.zeros_like(input)
+
+    for i in range(input.size(0)):
+        res[i] = torch.clamp(
+            torch.round(input[i] * (1.0 / scales[i])) + zero_points[i],
+            quant_min,
+            quant_max
+        )
+
+    out = res.permute(tuple(permute_axis_list))
+    return out.to(dtype)

From e2d97ff4f0f321c277aeb44cd156696030b06318 Mon Sep 17 00:00:00 2001
From: Kshiteej K <kshitijkalambarkar@gmail.com>
Date: Tue, 22 Nov 2022 22:55:41 +0000
Subject: [PATCH 1185/1922] [test_nn] fix missing class attributes for
 NNTestCase (#89200)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Missed setting these class variable 😓
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89200
Approved by: https://github.com/albanD
---
 test/nn/test_dropout.py | 3 +++
 test/nn/test_pooling.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/test/nn/test_dropout.py b/test/nn/test_dropout.py
index fa2b0baea5549..150e5f57df7c6 100644
--- a/test/nn/test_dropout.py
+++ b/test/nn/test_dropout.py
@@ -15,6 +15,9 @@
 import torch.nn as nn
 
 class TestDropoutNN(NNTestCase):
+    _do_cuda_memory_leak_check = True
+    _do_cuda_non_default_stream = True
+
     def _test_alpha_dropout(self, cls, input):
         mean = input.mean()
         std = input.std()
diff --git a/test/nn/test_pooling.py b/test/nn/test_pooling.py
index 073269a7c5539..35579e643a464 100644
--- a/test/nn/test_pooling.py
+++ b/test/nn/test_pooling.py
@@ -130,6 +130,9 @@ def test_avg_pool3d_ceil_mode(self):
 
 
 class TestPoolingNN(NNTestCase):
+    _do_cuda_memory_leak_check = True
+    _do_cuda_non_default_stream = True
+
     def test_adaptive_pooling_input_size(self):
         for numel in (2, 3):
             for pool_type in ('Max', 'Avg'):

From 9b019abb56e45b4026232b62e2735c10b9a404cf Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Tue, 22 Nov 2022 23:42:09 +0000
Subject: [PATCH 1186/1922] [dashboard] Add metric graphs back to dashboard
 (#89531)

Title.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89531
Approved by: https://github.com/davidberard98
---
 benchmarks/dynamo/runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 8fc44be8c817a..29605f946bbd9 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -1202,7 +1202,7 @@ def gen_comment(self):
             "gh_executive_summary.txt",
             "gh_summary_diff.txt",
             "gh_warnings.txt",
-            # "gh_regression.txt",
+            "gh_regression.txt",
             "gh_metric_regression.txt",
             "gh_training.txt",
             "gh_graphs.txt",

From 3537f99822c07627d0c5ff7cb60b71c5982c282a Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Tue, 22 Nov 2022 19:24:00 +0000
Subject: [PATCH 1187/1922] Fix inductor fallback_random for dropout/rand_like
 (#89515)

- Avoid fx graph rewrite that replaces certain ops with ones using
  triton random
- Keep track of replacement ops using triton random, so it is possible
  to not disable all replacements when using fallback_random

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89515
Approved by: https://github.com/ngimel
---
 torch/_inductor/overrides.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 5bd97cd5009a1..ed523d356c39a 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -40,6 +40,11 @@ def replace_fx(gm: torch.fx.GraphModule):
     # Sometimes patch_functions() misses things already in the graph
     for node in reversed(list(gm.graph.nodes)):
         if node.op == "call_function" and node.target in replacements:
+            if (
+                config.fallback_random
+                and replacements[node.target] in replacements_using_triton_random
+            ):
+                continue
             with gm.graph.inserting_before(node):
                 node.replace_all_uses_with(
                     gm.graph.call_function(
@@ -967,7 +972,9 @@ def rand_like(x, **kwargs):
 
 
 replacements = {torch.nn.functional.dropout: lowmem_dropout, torch.rand_like: rand_like}
-
+# Keep track of any replacement functions that use triton random,
+# so they can be avoided when fallback_random is set
+replacements_using_triton_random = {lowmem_dropout, rand_like}
 
 computation_op_unary_op_fusion_map = {
     nn.Conv2d: fused_conv_unary_eval,

From d13f23d724d2d9ce9ac6371115fdc5530cda0f93 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Wed, 23 Nov 2022 00:07:59 +0000
Subject: [PATCH 1188/1922] [Inductor] Limit g++12 installation to Linux
 (#89472)

According to https://anaconda.org/conda-forge/gxx/ its only available on Linux

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89472
Approved by: https://github.com/soumith, https://github.com/jgong5
---
 torch/_inductor/codecache.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index c020ff52f3af0..9fada4d0d8dcf 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -101,6 +101,10 @@ def cpp_compiler_search(search):
     for cxx in search:
         try:
             if cxx is None:
+                # gxx package is only available for Linux
+                # according to https://anaconda.org/conda-forge/gxx/
+                if sys.platform != "linux":
+                    continue
                 from filelock import FileLock
 
                 lock_dir = get_lock_dir()

From 6d2fd48aa5288686b29ef1abe04669278a44cbe6 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Tue, 22 Nov 2022 19:24:00 +0000
Subject: [PATCH 1189/1922] Add limited FSDP correctness to torchdynamo
 benchmark (#89469)

- Does not do recursive wrapping
- Only supports accuracy bench
- Mainly useful for sweeping over models for correctness, in part
  to evaluate whether dynamo support for FSDP is breaking anywhere

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89469
Approved by: https://github.com/davidberard98, https://github.com/aazzolini
---
 benchmarks/dynamo/common.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index a167ab75b53f7..6d77326c8f2cb 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -33,6 +33,7 @@
 from torch._inductor import config as inductor_config
 from torch._inductor.utils import fresh_inductor_cache
 from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils._pytree import tree_map
 
@@ -1091,6 +1092,10 @@ def deepcopy_and_maybe_ddp(model):
             model = copy.deepcopy(model)
             if self.args.ddp:
                 model = DDP(model, find_unused_parameters=True)
+            elif self.args.fsdp:
+                model = FSDP(model)
+                torch._inductor.config.triton.cudagraphs = False
+                log.warn("Disabling cudagraphs for FSDP compatibility")
             return model
 
         # Collect the fp64 reference outputs to be used later for accuracy checking.
@@ -1447,6 +1452,13 @@ def get_example_inputs(self):
         action="store_true",
         help="Wraps model in DDP before running it, and uses dynamo DDPOptmizer (graph breaks) by default.",
     )
+    parser.add_argument(
+        "--fsdp",
+        action="store_true",
+        help="""Wraps model in FSDP before running it. Disables cudagraphs by default.
+        Doesn't recursively wrap, mainly useful for checking dynamo UnspecNNModule compatibility
+    """,
+    )
     parser.add_argument(
         "--no-optimize-ddp",
         action="store_true",
@@ -1638,7 +1650,7 @@ def get_example_inputs(self):
 def main(runner, original_dir=None):
     args = parse_args()
     with maybe_init_distributed(
-        args.ddp and args.only, port=args.distributed_master_port
+        (args.ddp or args.fsdp) and args.only, port=args.distributed_master_port
     ):
         return maybe_fresh_cache(run, args.cold_start_latency and args.only)(
             runner, args, original_dir

From 18f2affc97b4be192795357b0f3db0942f6e2ddd Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Wed, 23 Nov 2022 00:48:00 +0000
Subject: [PATCH 1190/1922] [inductor] Use dense masks for indirect indexing
 (#89524)

Fixes https://github.com/pytorch/torchdynamo/issues/1654

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89524
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 18 ++++++++++++++++++
 torch/_inductor/codegen/triton.py   |  6 ++++++
 2 files changed, 24 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 4f672afff80a8..89b94cec0ae1f 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5377,6 +5377,24 @@ def fn(a):
             fn_optimized = torch._dynamo.optimize("inductor")(fn)
             assert same(fn(a), fn_optimized(a))
 
+        @requires_cuda()
+        def test_indirect_indexing_dense_mask(self):
+            def fn(x, y):
+                ne = torch.ops.aten.ne.Scalar(x, 1)
+                sum_1 = torch.ops.aten.sum.dim_IntList(ne, [1])
+                sub = torch.ops.aten.sub.Tensor(sum_1, 1)
+                unsqueeze = torch.ops.aten.unsqueeze.default(sub, -1)
+                gather = torch.ops.aten.gather.default(x, 1, unsqueeze)
+                squeeze = torch.ops.aten.squeeze.default(gather)
+                out = torch.ops.aten.multiply(y, squeeze)
+                return (out,)
+
+            a = torch.zeros((1, 128), dtype=torch.int64, device="cuda")
+            b = torch.zeros((1, 128), dtype=torch.int64, device="cuda")
+
+            fn_optimized = torch._dynamo.optimize("inductor")(fn)
+            assert same(fn(a, b), fn_optimized(a, b))
+
     class TritonCodeGenTests(TestCase):
         from torch._inductor.triton_ops.autotune import CachingAutotuner
 
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 2504bd2dcf8c3..e14b417c173f8 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -756,6 +756,12 @@ def indexing(
             mask = dense_mask
             index_str = f"{index_str} + tl.zeros({copy_shape}.shape, tl.int32)"
         elif indirect_indexing:
+            # Use dense mask for indirect_indexing
+            # See https://github.com/pytorch/torchdynamo/issues/1654
+            # TODO - An optimization could be to hoist this load outside of
+            # reduction loop, if it is independent of rmask. Such example can be found in
+            # https://github.com/pytorch/torchdynamo/issues/1654
+            index_str = f"{index_str} + tl.zeros({self.dense_size_str()}, tl.int32)"
             mask = dense_mask
 
         if self._load_mask:

From 43f1040cbbe72a5b61e77c583c9f92c5b455abc7 Mon Sep 17 00:00:00 2001
From: Thomas <37830237+thomaslin2020@users.noreply.github.com>
Date: Wed, 23 Nov 2022 02:18:03 +0000
Subject: [PATCH 1191/1922] Added implementation and tests for MPS Hardswish
 (#87952)

## What?
Fixes issue #86807 by adding MPS backend support for aten::hardswish.

## How?
Registered mps hardswish functions in native_functions.yaml, and added the code implementation to Activations.mm.

Added functions:
- hardswish_mps
- hardswish_mps_
- hardswish_backward_mps
- hardswish_out_mps

## Testing
Added test in test/test_mps.py and tested code using the command `python3 test/test_mps.py -k test_hardswish`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87952
Approved by: https://github.com/kulinseth, https://github.com/kit1980
---
 .../ATen/native/mps/operations/Activation.mm  | 252 ++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |   4 +
 test/test_mps.py                              | 156 ++++++-----
 3 files changed, 350 insertions(+), 62 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index 3837e407a76bf..618a00f337876 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -2202,5 +2202,257 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) {
   return grad_input;
 }
 
+Tensor& hardswish_out_mps(const Tensor& self, Tensor& output) {
+  using namespace mps;
+  using CachedGraph = MPSUnaryCachedGraph;
+
+  TORCH_CHECK(self.is_mps());
+
+  if (output.numel() == 0) {
+    return output;
+  }
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  MPSStream* stream = at::mps::getCurrentMPSStream();
+
+  @autoreleasepool {
+    string key = "hardswish_out_mps" + getTensorsStringKey({self});
+    CachedGraph* cachedGraph = static_cast<CachedGraph*>(cache_->LookUp(key));
+    if (!cachedGraph) {
+      MPSCachedGraph* tmpCachedGraph =
+          cache_->CreateCachedGraph(key, ^MPSCachedGraph*() {
+            CachedGraph* newCachedGraph = nil;
+            @autoreleasepool {
+              MPSGraph* mpsGraph = make_mps_graph();
+              newCachedGraph = new CachedGraph(mpsGraph);
+              MPSGraphTensor* inputTensor =
+                  mpsGraphRankedPlaceHolder(mpsGraph, self);
+
+              MPSGraphTensor* zeroTensor = [mpsGraph
+                  constantWithScalar:0.0f
+                               shape:@[ @1 ]
+                            dataType:getMPSDataType(self.scalar_type())];
+
+              MPSGraphTensor* threeTensor = [mpsGraph
+                  constantWithScalar:3.0f
+                               shape:@[ @1 ]
+                            dataType:getMPSDataType(self.scalar_type())];
+
+              MPSGraphTensor* negativeThreeTensor = [mpsGraph
+                  constantWithScalar:-3.0f
+                               shape:@[ @1 ]
+                            dataType:getMPSDataType(self.scalar_type())];
+
+              MPSGraphTensor* sixTensor = [mpsGraph
+                  constantWithScalar:6.0f
+                               shape:@[ @1 ]
+                            dataType:getMPSDataType(self.scalar_type())];
+
+              MPSGraphTensor* lessThanMinPredicateTensor = [mpsGraph
+                  lessThanOrEqualToWithPrimaryTensor:inputTensor
+                                     secondaryTensor:negativeThreeTensor
+                                                name:nil];
+
+              MPSGraphTensor* lessThanMaxPredicateTensor =
+                  [mpsGraph lessThanWithPrimaryTensor:inputTensor
+                                      secondaryTensor:threeTensor
+                                                 name:nil];
+
+              MPSGraphTensor* inputPlusThreeTensor =
+                  [mpsGraph additionWithPrimaryTensor:inputTensor
+                                      secondaryTensor:threeTensor
+                                                 name:nil];
+
+              MPSGraphTensor* inputDivSixTensor =
+                  [mpsGraph divisionWithPrimaryTensor:inputPlusThreeTensor
+                                      secondaryTensor:sixTensor
+                                                 name:nil];
+
+              MPSGraphTensor* weightedTensor =
+                  [mpsGraph multiplicationWithPrimaryTensor:inputTensor
+                                            secondaryTensor:inputDivSixTensor
+                                                       name:nil];
+
+              MPSGraphTensor* tempTensor =
+                  [mpsGraph selectWithPredicateTensor:lessThanMaxPredicateTensor
+                                  truePredicateTensor:weightedTensor
+                                 falsePredicateTensor:inputTensor
+                                                 name:nil];
+
+              MPSGraphTensor* outputTensor =
+                  [mpsGraph selectWithPredicateTensor:lessThanMinPredicateTensor
+                                  truePredicateTensor:zeroTensor
+                                 falsePredicateTensor:tempTensor
+                                                 name:nil];
+              newCachedGraph->inputTensor_ = inputTensor;
+              newCachedGraph->outputTensor_ = outputTensor;
+            }
+            return newCachedGraph;
+          });
+      cachedGraph = static_cast<CachedGraph*>(tmpCachedGraph);
+    }
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
+    Placeholder outputPlaceholder =
+        Placeholder(cachedGraph->outputTensor_, output);
+
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      selfPlaceholder.getMPSGraphTensor() :
+          selfPlaceholder.getMPSGraphTensorData()
+    };
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() :
+          outputPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+  return output;
+}
+
+Tensor hardswish_mps(const Tensor& self) {
+  using namespace mps;
+  Tensor output = at::empty_like(self, self.suggest_memory_format());
+
+  return hardswish_out_mps(self, output);
+}
+
+Tensor& hardswish_mps_(Tensor& self) {
+  using namespace mps;
+  Tensor& output = self;
+
+  return hardswish_out_mps(self, output);
+}
+
+Tensor hardswish_backward_mps(const Tensor& grad_output, const Tensor& self) {
+  using namespace mps;
+
+  if (grad_output.numel() == 0) {
+    return grad_output;
+  }
+
+  Tensor grad_input = at::empty_like(self, self.suggest_memory_format());
+
+  struct CachedGraph : public MPSCachedGraph {
+    CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor* gradOutputTensor_ = nil;
+    MPSGraphTensor* inputTensor_ = nil;
+    MPSGraphTensor* gradInputTensor_ = nil;
+  };
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  MPSStream* stream = at::mps::getCurrentMPSStream();
+
+  @autoreleasepool {
+    string key = "hardswish_backward_mps" + getTensorsStringKey({self});
+    CachedGraph* cachedGraph = static_cast<CachedGraph*>(cache_->LookUp(key));
+    if (!cachedGraph) {
+      MPSCachedGraph* tmpCachedGraph =
+          cache_->CreateCachedGraph(key, ^MPSCachedGraph*() {
+            CachedGraph* newCachedGraph = nil;
+            @autoreleasepool {
+              MPSGraph* mpsGraph = make_mps_graph();
+              newCachedGraph = new CachedGraph(mpsGraph);
+              MPSGraphTensor* gradOutputTensor =
+                  mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
+              MPSGraphTensor* inputTensor =
+                  mpsGraphRankedPlaceHolder(mpsGraph, self);
+
+              MPSGraphTensor* zeroTensor = [mpsGraph
+                  constantWithScalar:0.0f
+                               shape:@[ @1 ]
+                            dataType:getMPSDataType(grad_output.scalar_type())];
+
+              MPSGraphTensor* unitTensor = [mpsGraph
+                  constantWithScalar:1.0f
+                               shape:@[ @1 ]
+                            dataType:getMPSDataType(grad_output.scalar_type())];
+
+              MPSGraphTensor* threeTensor = [mpsGraph
+                  constantWithScalar:3.0f
+                               shape:@[ @1 ]
+                            dataType:getMPSDataType(grad_output.scalar_type())];
+
+              MPSGraphTensor* negativeThreeTensor = [mpsGraph
+                  constantWithScalar:-3.0f
+                               shape:@[ @1 ]
+                            dataType:getMPSDataType(grad_output.scalar_type())];
+
+              MPSGraphTensor* halfTensor = [mpsGraph
+                  constantWithScalar:0.5f
+                               shape:@[ @1 ]
+                            dataType:getMPSDataType(grad_output.scalar_type())];
+
+              MPSGraphTensor* tempTensor =
+                  [mpsGraph divisionWithPrimaryTensor:inputTensor
+                                      secondaryTensor:threeTensor
+                                                 name:nil];
+
+              MPSGraphTensor* weightedTensor =
+                  [mpsGraph additionWithPrimaryTensor:tempTensor
+                                      secondaryTensor:halfTensor
+                                                 name:nil];
+
+              MPSGraphTensor* lessThanMinPredicateTensor = [mpsGraph
+                  lessThanOrEqualToWithPrimaryTensor:inputTensor
+                                     secondaryTensor:negativeThreeTensor
+                                                name:nil];
+
+              MPSGraphTensor* lessThanMaxPredicateTensor =
+                  [mpsGraph lessThanWithPrimaryTensor:inputTensor
+                                      secondaryTensor:threeTensor
+                                                 name:nil];
+
+              MPSGraphTensor* lessThanMaxGradTensor =
+                  [mpsGraph selectWithPredicateTensor:lessThanMaxPredicateTensor
+                                  truePredicateTensor:weightedTensor
+                                 falsePredicateTensor:unitTensor
+                                                 name:nil];
+
+              MPSGraphTensor* gradTensor =
+                  [mpsGraph selectWithPredicateTensor:lessThanMinPredicateTensor
+                                  truePredicateTensor:zeroTensor
+                                 falsePredicateTensor:lessThanMaxGradTensor
+                                                 name:nil];
+              MPSGraphTensor* gradInputTensor =
+                  [mpsGraph multiplicationWithPrimaryTensor:gradTensor
+                                            secondaryTensor:gradOutputTensor
+                                                       name:nil];
+
+              newCachedGraph->gradOutputTensor_ = gradOutputTensor;
+              newCachedGraph->inputTensor_ = inputTensor;
+              newCachedGraph->gradInputTensor_ = gradInputTensor;
+            }
+            return newCachedGraph;
+          });
+      cachedGraph = static_cast<CachedGraph*>(tmpCachedGraph);
+    }
+
+    Placeholder gradOutputPlaceholder =
+        Placeholder(cachedGraph->gradOutputTensor_, grad_output);
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
+    Placeholder gradInputPlaceholder =
+        Placeholder(cachedGraph->gradInputTensor_, grad_input);
+
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      gradOutputPlaceholder.getMPSGraphTensor() :
+          gradOutputPlaceholder.getMPSGraphTensorData(),
+      selfPlaceholder.getMPSGraphTensor() :
+          selfPlaceholder.getMPSGraphTensorData()
+    };
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      gradInputPlaceholder.getMPSGraphTensor() :
+          gradInputPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+  return grad_input;
+}
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 730032528661d..6c9a96937a3f1 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -10393,23 +10393,27 @@
   python_module: nn
   dispatch:
     CPU, CUDA: hardswish_out
+    MPS: hardswish_out_mps
 
 - func: hardswish(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   python_module: nn
   dispatch:
     CPU, CUDA: hardswish
+    MPS: hardswish_mps
 
 - func: hardswish_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   python_module: nn
   dispatch:
     CPU, CUDA: hardswish_
+    MPS: hardswish_mps_
 
 - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor
   python_module: nn
   dispatch:
     CPU, CUDA: hardswish_backward
+    MPS: hardswish_backward_mps
   autogen: hardswish_backward.out
 
 - func: leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
diff --git a/test/test_mps.py b/test/test_mps.py
index 19f70ce35a21b..fc923daa57940 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -493,7 +493,7 @@ def test_max_pool2d(self):
         def helper(shape, ks, padding=0, dilation=1, ceil_mode=False, return_indices=False, test_ties=False):
 
             cpu_x = None
-            if(test_ties):
+            if (test_ties):
                 cpu_x = torch.ones(shape, device='cpu', dtype=torch.float, requires_grad=True)
             else:
                 cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
@@ -502,7 +502,7 @@ def helper(shape, ks, padding=0, dilation=1, ceil_mode=False, return_indices=Fal
             pool = torch.nn.MaxPool2d(kernel_size=ks, padding=padding, dilation=dilation,
                                       ceil_mode=ceil_mode, return_indices=return_indices)
 
-            if(return_indices is False):
+            if (return_indices is False):
                 y = pool(x)
                 ref_y = pool(cpu_x)
 
@@ -637,7 +637,7 @@ def helper(shape, channels_last=False):
             np.random.seed(332)
             arr = (256 - 128) * np.random.random_sample(size=shape) + 128
             cpu_x = torch.tensor(arr, device='cpu', dtype=torch.float, requires_grad=True)
-            if(channels_last):
+            if (channels_last):
                 cpu_x = cpu_x.to(memory_format=torch.channels_last)
                 cpu_x.retain_grad()
             x = cpu_x.detach().clone().to('mps').requires_grad_()
@@ -656,7 +656,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
             np.random.seed(332)
             arr = (256 - 128) * np.random.random_sample(size=shape) + 128
             cpu_x = torch.tensor(arr, device='cpu', dtype=torch.float, requires_grad=True)
-            if(channels_last):
+            if (channels_last):
                 cpu_x = cpu_x.to(memory_format=torch.channels_last)
                 cpu_x.retain_grad()
             x = cpu_x.detach().clone().to('mps').requires_grad_()
@@ -666,7 +666,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
             cpu_running_var = None
             running_mean = None
             running_var = None
-            if(track_running_stats):
+            if (track_running_stats):
                 mean_arr = (240 - 140) * np.random.random_sample(size=mean_shape) + 140
                 cpu_running_mean = torch.tensor(mean_arr, device='cpu', dtype=torch.float)
                 var_arr = 32 * np.random.random_sample(size=mean_shape)
@@ -678,7 +678,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
             cpu_weight = None
             bias = None
             cpu_bias = None
-            if(wts):
+            if (wts):
                 cpu_weight = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True)
                 weight = cpu_weight.detach().clone().to('mps').requires_grad_()
                 cpu_bias = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True)
@@ -687,7 +687,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
             y = None
             ref_y = None
 
-            if(not test_module):
+            if (not test_module):
                 y = torch.nn.functional.batch_norm(x, running_mean, running_var,
                                                    weight=weight,
                                                    bias=bias,
@@ -704,7 +704,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
                 batchnorm_op = None
                 mps_batchnorm_op = None
 
-                if(len(shape) == 3):
+                if (len(shape) == 3):
                     batchnorm_op = torch.nn.BatchNorm1d(shape[1],
                                                         eps=eps,
                                                         momentum=momentum,
@@ -717,7 +717,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
                                                             affine=wts,
                                                             track_running_stats=track_running_stats,
                                                             device='mps')
-                elif(len(shape) == 4):
+                elif (len(shape) == 4):
                     batchnorm_op = torch.nn.BatchNorm2d(shape[1],
                                                         eps=eps,
                                                         momentum=momentum,
@@ -730,7 +730,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
                                                             affine=wts,
                                                             track_running_stats=track_running_stats,
                                                             device='mps')
-                elif(len(shape) == 5):
+                elif (len(shape) == 5):
                     batchnorm_op = torch.nn.BatchNorm3d(shape[1],
                                                         eps=eps,
                                                         momentum=momentum,
@@ -744,12 +744,12 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
                                                             track_running_stats=track_running_stats,
                                                             device='mps')
 
-                if(track_running_stats):
+                if (track_running_stats):
                     batchnorm_op.running_mean = cpu_running_mean
                     batchnorm_op.running_var = cpu_running_var
                     mps_batchnorm_op.running_mean = running_mean
                     mps_batchnorm_op.running_var = running_var
-                if(wts):
+                if (wts):
                     batchnorm_op.weight = torch.nn.Parameter(cpu_weight)
                     batchnorm_op.bias = torch.nn.Parameter(cpu_bias)
                     mps_batchnorm_op.weight = torch.nn.Parameter(weight)
@@ -759,7 +759,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
                 y = mps_batchnorm_op(x)
 
             self.assertEqual(y, ref_y)
-            if(not test_module):
+            if (not test_module):
                 self.assertEqual(running_mean, cpu_running_mean)
                 self.assertEqual(running_var, cpu_running_var)
             else:
@@ -772,8 +772,8 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
             y.backward(gradient=grad)
 
             self.assertEqual(x.grad, cpu_x.grad)
-            if(wts):
-                if(not test_module):
+            if (wts):
+                if (not test_module):
                     self.assertEqual(weight.grad, cpu_weight.grad)
                     self.assertEqual(bias.grad, cpu_bias.grad)
                 else:
@@ -784,10 +784,10 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
             for test_module in [False, True]:
                 for track_running_stats in [True, False]:
                     for channels_last in [False]:
-                        if(channels_last and len(shape) != 4):
+                        if (channels_last and len(shape) != 4):
                             continue
                         # Running stats must be tracked in eval mode
-                        if(track_running_stats):
+                        if (track_running_stats):
                             helper(shape, eps=0, momentum=1, channels_last=channels_last,
                                    track_running_stats=track_running_stats, test_module=test_module)
                             helper(shape, channels_last=channels_last,
@@ -822,7 +822,7 @@ def helper(input_shape, normalized_shape, eps=1e-05, elementwise_affine=True, dt
             cpu_bias = torch.randn(normalized_shape, device='cpu', dtype=dtype, requires_grad=True)
             bias = cpu_bias.detach().clone().to('mps').requires_grad_()
 
-            if(elementwise_affine):
+            if (elementwise_affine):
                 cpu_op.weight = torch.nn.Parameter(cpu_wt)
                 mps_op.weight = torch.nn.Parameter(wt)
                 cpu_op.bias = torch.nn.Parameter(cpu_bias)
@@ -839,7 +839,7 @@ def helper(input_shape, normalized_shape, eps=1e-05, elementwise_affine=True, dt
 
             self.assertEqual(result, cpu_result)
             self.assertEqual(x.grad, cpu_x.grad)
-            if(elementwise_affine):
+            if (elementwise_affine):
                 self.assertEqual(mps_op.weight.grad, cpu_op.weight.grad)
                 self.assertEqual(mps_op.bias.grad, cpu_op.bias.grad)
 
@@ -855,7 +855,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
             np.random.seed(332)
             arr = (256 - 128) * np.random.random_sample(size=shape) + 128
             cpu_x = torch.tensor(arr, device='cpu', dtype=torch.float, requires_grad=True)
-            if(channels_last):
+            if (channels_last):
                 cpu_x = cpu_x.to(memory_format=torch.channels_last)
                 cpu_x.retain_grad()
             x = cpu_x.detach().clone().to('mps').requires_grad_()
@@ -865,7 +865,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
             cpu_running_var = None
             running_mean = None
             running_var = None
-            if(track_running_stats):
+            if (track_running_stats):
                 mean_arr = (240 - 140) * np.random.random_sample(size=mean_shape) + 140
                 cpu_running_mean = torch.tensor(mean_arr, device='cpu', dtype=torch.float)
                 var_arr = 32 * np.random.random_sample(size=mean_shape)
@@ -877,7 +877,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
             cpu_weight = None
             bias = None
             cpu_bias = None
-            if(wts):
+            if (wts):
                 cpu_weight = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True)
                 weight = cpu_weight.detach().clone().to('mps').requires_grad_()
                 cpu_bias = torch.randn(mean_shape, device='cpu', dtype=torch.float, requires_grad=True)
@@ -886,7 +886,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
             y = None
             ref_y = None
 
-            if(not test_module):
+            if (not test_module):
                 ref_y = torch.nn.functional.instance_norm(cpu_x, cpu_running_mean, cpu_running_var,
                                                           weight=cpu_weight,
                                                           bias=cpu_bias,
@@ -901,7 +901,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
                 instancenorm_op = None
                 mps_instancenorm_op = None
 
-                if(len(shape) == 3):
+                if (len(shape) == 3):
                     instancenorm_op = torch.nn.InstanceNorm1d(shape[1],
                                                               eps=eps,
                                                               momentum=momentum,
@@ -914,7 +914,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
                                                                   affine=wts,
                                                                   track_running_stats=track_running_stats,
                                                                   device='mps')
-                elif(len(shape) == 4):
+                elif (len(shape) == 4):
                     instancenorm_op = torch.nn.InstanceNorm2d(shape[1],
                                                               eps=eps,
                                                               momentum=momentum,
@@ -927,7 +927,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
                                                                   affine=wts,
                                                                   track_running_stats=track_running_stats,
                                                                   device='mps')
-                elif(len(shape) == 5):
+                elif (len(shape) == 5):
                     instancenorm_op = torch.nn.InstanceNorm3d(shape[1],
                                                               eps=eps,
                                                               momentum=momentum,
@@ -941,12 +941,12 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
                                                                   track_running_stats=track_running_stats,
                                                                   device='mps')
 
-                if(track_running_stats):
+                if (track_running_stats):
                     instancenorm_op.running_mean = cpu_running_mean
                     instancenorm_op.running_var = cpu_running_var
                     mps_instancenorm_op.running_mean = running_mean
                     mps_instancenorm_op.running_var = running_var
-                if(wts):
+                if (wts):
                     instancenorm_op.weight = torch.nn.Parameter(cpu_weight)
                     instancenorm_op.bias = torch.nn.Parameter(cpu_bias)
                     mps_instancenorm_op.weight = torch.nn.Parameter(weight)
@@ -956,7 +956,7 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
                 y = mps_instancenorm_op(x)
 
             self.assertEqual(y, ref_y)
-            if(not test_module):
+            if (not test_module):
                 self.assertEqual(running_mean, cpu_running_mean)
                 self.assertEqual(running_var, cpu_running_var)
             else:
@@ -969,8 +969,8 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
             y.backward(gradient=grad)
 
             self.assertEqual(x.grad, cpu_x.grad)
-            if(wts):
-                if(not test_module):
+            if (wts):
+                if (not test_module):
                     self.assertEqual(weight.grad, cpu_weight.grad)
                     self.assertEqual(bias.grad, cpu_bias.grad)
                 else:
@@ -981,10 +981,10 @@ def helper(shape, eps=1, momentum=0.1, wts=False, channels_last=False, track_run
             for test_module in [False, True]:
                 for track_running_stats in [True, False]:
                     for channels_last in [False]:
-                        if(channels_last and len(shape) != 4):
+                        if (channels_last and len(shape) != 4):
                             continue
                         # Running stats must be tracked in eval mode
-                        if(track_running_stats):
+                        if (track_running_stats):
                             helper(shape, eps=0, momentum=1, channels_last=channels_last,
                                    track_running_stats=track_running_stats, test_module=test_module)
                             helper(shape, channels_last=channels_last,
@@ -1022,7 +1022,7 @@ def helper(input_shape, wt_shape,
             cpu_bias = None
             bias = None
 
-            if(bias_shape is not None):
+            if (bias_shape is not None):
                 cpu_bias = torch.randn(bias_shape, device='cpu', dtype=torch.float, requires_grad=True)
                 bias = cpu_bias.detach().clone().to('mps').requires_grad_()
 
@@ -1040,7 +1040,7 @@ def helper(input_shape, wt_shape,
             self.assertEqual(y, ref_y, rtol=2.6e-05, atol=2e-04)
             self.assertEqual(x.grad, cpu_x.grad, rtol=2.6e-06, atol=2e-05)
             self.assertEqual(wt.grad, cpu_wt.grad, atol=8e-04, rtol=10.4e-05)
-            if(bias_shape is not None):
+            if (bias_shape is not None):
                 self.assertEqual(bias.grad, cpu_bias.grad, atol=8e-04, rtol=10.4e-05)
 
         N = 1
@@ -1096,7 +1096,7 @@ def helper(input_shape, wt_shape,
             cpu_bias = None
             bias = None
 
-            if(bias_shape is not None):
+            if (bias_shape is not None):
                 cpu_bias = torch.randn(bias_shape, device='cpu', dtype=torch.float, requires_grad=True)
                 bias = cpu_bias.detach().clone().to('mps').requires_grad_()
 
@@ -1116,7 +1116,7 @@ def helper(input_shape, wt_shape,
             self.assertEqual(x.grad, cpu_x.grad, rtol=2.6e-06, atol=2e-05)
             self.assertEqual(wt.grad, cpu_wt.grad, atol=8e-04, rtol=10.4e-05)
 
-            # if(bias_shape is not None):
+            # if (bias_shape is not None):
             #  print(cpu_bias.grad)
             #  print(bias.grad.to('cpu'))
             #  self.assertEqual(bias.grad, cpu_bias.grad)
@@ -1135,7 +1135,7 @@ def helper(input_shape, wt_shape,
             for padding in [0, 1, 2]:
                 for output_padding in [0, 1, 2]:
                     for dilation in [1, 2]:
-                        if(output_padding >= stride or output_padding >= dilation):
+                        if (output_padding >= stride or output_padding >= dilation):
                             continue
                         helper((N, C_out, H, W), (C_out, C_in, kH, kW), stride=stride,
                                padding=padding, output_padding=output_padding, dilation=dilation)
@@ -2410,7 +2410,7 @@ def helper(n, c, h, w, reduction_type, dtype=torch.float32):
 
             cpu_x = None
             x = None
-            if(dtype not in [torch.float32, torch.bool]):
+            if (dtype not in [torch.float32, torch.bool]):
                 cpu_x = torch.randint(50, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False)
                 x = cpu_x.detach().clone().to('mps')
             elif (dtype == torch.bool):
@@ -2470,7 +2470,7 @@ def helper(n, c, h, w, reduction_type, dtype=torch.float32):
     def test_max_el(self):
         def helper(n, c, h, w, dtype=torch.float32):
 
-            if(dtype not in [torch.float32, torch.bool]):
+            if (dtype not in [torch.float32, torch.bool]):
                 cpu_x = torch.randint(50, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False)
                 x = cpu_x.detach().clone().to('mps')
             elif (dtype == torch.bool):
@@ -2835,7 +2835,7 @@ def test_sum(self):
         def helper(n, c, h, w, dtype=torch.float32):
             cpu_x = None
             x = None
-            if(dtype not in [torch.float32, torch.bool]):
+            if (dtype not in [torch.float32, torch.bool]):
                 cpu_x = torch.randint(50, (n, c, h, w), device='cpu', dtype=dtype, requires_grad=False)
                 x = cpu_x.detach().clone().to('mps')
             elif (dtype == torch.bool):
@@ -2900,7 +2900,7 @@ def test_prod(self):
         def helper(shape, dtype=torch.float32):
             cpu_x = None
             x = None
-            if(dtype not in [torch.float32, torch.bool]):
+            if (dtype not in [torch.float32, torch.bool]):
                 cpu_x = torch.randint(1, 6, shape, device='cpu', dtype=dtype, requires_grad=False)
                 x = cpu_x.detach().clone().to('mps')
             elif (dtype == torch.bool):
@@ -3284,7 +3284,7 @@ def helper(shape, rounding_mode):
             for dtype in [torch.float32, torch.float16, torch.int32, torch.int64]:
                 cpu_x = None
                 cpu_y = None
-                if(dtype in [torch.float32, torch.float16]):
+                if (dtype in [torch.float32, torch.float16]):
                     cpu_x = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
                     cpu_y = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
                 else:
@@ -3611,7 +3611,7 @@ def helper(shape, dtype=torch.float32):
             y, cpu_y = None, None
             z, cpu_z = None, None
 
-            if(dtype not in [torch.float32, torch.bool]):
+            if (dtype not in [torch.float32, torch.bool]):
                 cpu_x = torch.randint(50, shape, device='cpu', dtype=dtype, requires_grad=False)
                 x = cpu_x.detach().clone().to('mps')
                 cpu_y = torch.randint(50, shape, device='cpu', dtype=dtype, requires_grad=False)
@@ -3956,7 +3956,7 @@ def helper(src_dtype, dst_dtype):
     def test_adaptive_avg_pool2d_simple(self):
         def helper(input_shape, out_shape, channels_last):
             cpu_x = torch.randn(input_shape, device='cpu', dtype=torch.float, requires_grad=True)
-            if(channels_last):
+            if (channels_last):
                 cpu_x = cpu_x.to(memory_format=torch.channels_last)
                 cpu_x.retain_grad()
             x = cpu_x.detach().clone().to('mps').requires_grad_()
@@ -4001,11 +4001,11 @@ def helper(input_shape, out_shape, channels_last):
     def test_adaptive_max_pool2d_simple(self):
         def helper(input_shape, out_shape, return_indices, dtype, channels_last=False):
             cpu_x = None
-            if(dtype in [torch.float16, torch.float32]):
+            if (dtype in [torch.float16, torch.float32]):
                 cpu_x = torch.randn(input_shape, device='cpu', dtype=dtype, requires_grad=True)
             else:
                 cpu_x = torch.randint(50, input_shape, device='cpu', dtype=dtype, requires_grad=True)
-            if(channels_last):
+            if (channels_last):
                 cpu_x = cpu_x.to(memory_format=torch.channels_last)
                 cpu_x.retain_grad()
             x = cpu_x.detach().clone().to('mps').requires_grad_()
@@ -4013,7 +4013,7 @@ def helper(input_shape, out_shape, return_indices, dtype, channels_last=False):
             max_result, max_indices = None, None
             max_result_cpu, max_indices_cpu = None, None
 
-            if(return_indices):
+            if (return_indices):
                 max_result, max_indices = torch.nn.AdaptiveMaxPool2d(out_shape, return_indices)(x)
                 max_result_cpu, max_indices_cpu = torch.nn.AdaptiveMaxPool2d(out_shape, return_indices)(cpu_x)
             else:
@@ -4027,7 +4027,7 @@ def helper(input_shape, out_shape, return_indices, dtype, channels_last=False):
             max_result_cpu.backward(gradient=cpu_grad)
 
             self.assertEqual(max_result, max_result_cpu)
-            if(return_indices):
+            if (return_indices):
                 self.assertEqual(max_indices, max_indices_cpu)
             self.assertEqual(x.grad, cpu_x.grad)
 
@@ -4104,7 +4104,7 @@ def helper(shape, min_val, max_val, inplace=False):
             cpu_x = None
             x = None
 
-            if(not inplace):
+            if (not inplace):
                 cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
                 x = cpu_x.detach().clone().to('mps').requires_grad_()
             else:
@@ -4116,7 +4116,7 @@ def helper(shape, min_val, max_val, inplace=False):
 
             self.assertEqual(hardtanh_result, hardtanh_result_cpu)
 
-            if(not inplace):
+            if (not inplace):
                 cpu_grad = torch.randn(hardtanh_result_cpu.shape)
                 grad = cpu_grad.to('mps')
                 hardtanh_result.backward(gradient=grad)
@@ -4129,6 +4129,38 @@ def helper(shape, min_val, max_val, inplace=False):
                 helper(shape, min_val, max_val)
                 helper(shape, min_val, max_val, inplace=True)
 
+    def test_hardswish(self):
+        def helper(shape, inplace=False, requires_grad=True):
+            m = nn.Hardswish(inplace=inplace)
+
+            input_cpu = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=requires_grad)
+            input_mps = input_cpu.detach().clone().to('mps').requires_grad_(requires_grad)
+
+            if inplace and requires_grad:  # check that both raise runtime error
+                self.assertRaises(RuntimeError, lambda: m(input_cpu))
+                self.assertRaises(RuntimeError, lambda: m(input_mps))
+                return
+
+            output_cpu = m(input_cpu)
+            output_mps = m(input_mps)
+
+            cpu_grad = torch.ones_like(output_cpu)
+            mps_grad = cpu_grad.to('mps')
+
+            self.assertEqual(output_cpu, output_mps)
+
+            if requires_grad:
+                output_cpu.backward(gradient=cpu_grad)
+                output_mps.backward(gradient=mps_grad)
+
+                self.assertEqual(input_cpu.grad, input_mps.grad)
+
+        for shape in [(0, 3), [], (2, 3), (2, 8, 4, 5)]:
+            helper(shape, inplace=False, requires_grad=False)
+            helper(shape, inplace=True, requires_grad=False)
+            helper(shape, inplace=False, requires_grad=True)
+            helper(shape, inplace=True, requires_grad=True)
+
     def test_transpose_2D(self):
         values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
         values1 = [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
@@ -4406,7 +4438,7 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, do_add=True)
 
             # Indices should be taken from range of axis along which gathering is done
             idx_np = None
-            if(do_add):
+            if (do_add):
                 idx_np = np.random.randint(0, shape[dim], idx_shape)
             else:
                 idx_np = np.array([[0, 1, 2],
@@ -4421,7 +4453,7 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, do_add=True)
             scatter_result = None
             scatter_result_cpu = None
 
-            if(do_add):
+            if (do_add):
                 scatter_result = torch.scatter_add(x, dim=dim, index=idx, src=src)
                 scatter_result_cpu = torch.scatter_add(cpu_x, dim=dim, index=cpu_idx, src=cpu_src)
             else:
@@ -4431,14 +4463,14 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, do_add=True)
             cpu_grad = None
             grad = None
 
-            if(idx_shape == src_shape):
+            if (idx_shape == src_shape):
                 cpu_grad = torch.randn(shape, device='cpu', dtype=torch.float)
                 grad = cpu_grad.to('mps')
                 scatter_result.backward(gradient=grad)
                 scatter_result_cpu.backward(gradient=cpu_grad)
 
             self.assertEqual(scatter_result, scatter_result_cpu)
-            if(idx_shape == src_shape):
+            if (idx_shape == src_shape):
                 self.assertEqual(cpu_x.grad, x.grad)
                 self.assertEqual(cpu_src.grad, src.grad)
 
@@ -4480,7 +4512,7 @@ def helper(idx_dtype=torch.int64, do_add=True):
             scatter_result = None
             scatter_result_cpu = None
 
-            if(do_add):
+            if (do_add):
                 scatter_result = torch.scatter_add(x, dim=0, index=idx, src=src)
                 scatter_result_cpu = torch.scatter_add(cpu_x, dim=0, index=cpu_idx, src=cpu_src)
             else:
@@ -4604,7 +4636,7 @@ def helper(n, m, dtype):
             cpu_result = None
             result = None
 
-            if(n == m):
+            if (n == m):
                 cpu_result = torch.eye(n, dtype=dtype, device='cpu')
                 result = torch.eye(n, dtype=dtype, device='mps')
             else:
@@ -4667,7 +4699,7 @@ def test_arange(self):
     def test_softmax(self):
         def helper(shape, dim, channels_last=False):
             cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
-            if(channels_last):
+            if (channels_last):
                 cpu_x = cpu_x.to(memory_format=torch.channels_last)
                 cpu_x.retain_grad()
             x = cpu_x.detach().clone().to('mps').requires_grad_()
@@ -4679,7 +4711,7 @@ def helper(shape, dim, channels_last=False):
             cpu_grad = None
             grad = None
 
-            if(not channels_last):
+            if (not channels_last):
                 cpu_grad = torch.randn(shape, device='cpu', dtype=torch.float)
                 grad = cpu_grad.to('mps')
 
@@ -4687,7 +4719,7 @@ def helper(shape, dim, channels_last=False):
                 softmax_result_cpu.backward(gradient=cpu_grad)
 
             self.assertEqual(softmax_result, softmax_result_cpu)
-            if(not channels_last):
+            if (not channels_last):
                 self.assertEqual(x.grad, cpu_x.grad)
 
         def helper2(dim):
@@ -4710,7 +4742,7 @@ def helper2(dim):
 
         for channels_last in [False]:
             for shape in [(2, 4, 8, 5), (3, 4, 6, 7, 2)]:
-                if(len(shape) != 4 and channels_last):
+                if (len(shape) != 4 and channels_last):
                     continue
                 for dim in [0, 1, 2, 3, -1, -2, -3]:
                     helper(shape, dim, channels_last)
@@ -4984,7 +5016,7 @@ def helper(probs, compare_mean, compare_var, num_samples=5, replacement=True):
             prob_tensor = cpu_prob_tensor.detach().clone().to('mps')
 
             mps_out = torch.multinomial(prob_tensor, num_samples, replacement=replacement)
-            if(not replacement):
+            if (not replacement):
                 print(mps_out.to('cpu'))
             else:
                 # Compare "real" with theoretical values

From 69485cf4f37c0bb314fae3da9d3858f8aa2be635 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Tue, 22 Nov 2022 22:32:49 +0000
Subject: [PATCH 1192/1922] Allow Module forward-pre and forward hooks to take
 kwargs (#89389)

closes #35643

This PR is mostly borrowed from #82042. Thanks @Padarn for implementing
the first version and debugging into the errors.

Based on the discussion in #82042 this PR adds a with_kwargs
argument to register_forward_pre_hook and register_forward_hook
methods. When the arg is set to true, the provided hook must accept
kwargs args. Under the hook, this PR adds a
`_forward_pre_hooks_with_kwargs` and a `_forward_hook_with_kwargs`
set to keep track of which hooks accept kwargs.

Differential Revision: [D41431111](https://our.internmc.facebook.com/intern/diff/D41431111)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89389
Approved by: https://github.com/soulitzer
---
 test/nn/test_module_hooks.py              | 195 ++++++++++++++++++++--
 torch/distributed/nn/api/remote_module.py |   4 +
 torch/jit/_recursive.py                   |   2 +
 torch/nn/modules/module.py                | 145 ++++++++++++----
 torch/utils/hooks.py                      |  36 +++-
 5 files changed, 335 insertions(+), 47 deletions(-)

diff --git a/test/nn/test_module_hooks.py b/test/nn/test_module_hooks.py
index 5fe984c2bd6a7..27b3fd5c224ff 100644
--- a/test/nn/test_module_hooks.py
+++ b/test/nn/test_module_hooks.py
@@ -9,7 +9,7 @@
 import torch.nn as nn
 
 from functools import partial
-from typing import List, Tuple
+from typing import Any, Dict, List, Tuple
 
 
 class Net(nn.Module):
@@ -87,8 +87,64 @@ def full_backward_pre_hook(
     self.assertEqual(len(grad_input), 1)
 
 
-class TestModuleHooks(TestCase):
+class KwargModel(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.net1 = Net()
+        self.net2 = Net()
+
+    def forward(
+        self, x: torch.Tensor, bias: torch.Tensor = None
+    ) -> torch.Tensor:
+        if bias is not None:
+            x = x + bias
+        return x
+
+    def internal_forward_hook(
+        self,
+        module: nn.Module,
+        args: Tuple[torch.Tensor],
+        kwargs: Dict[str, Any],
+        out: torch.Tensor,
+    ):
+        return out + kwargs["bias"]
+
+
+def kwarg_forward_pre_hook(
+    self: TestCase,
+    fired_hooks: List[int],
+    expected_module: nn.Module,
+    hook_id: int,
+    module: nn.Module,
+    args: Tuple[torch.Tensor],
+    kwargs: Dict[str, Any],
+) -> Tuple[Any, Any]:
+    fired_hooks.append(hook_id)
+    self.assertEqual(id(module), id(expected_module))
+    self.assertEqual(len(args), 1)
+    kwargs["bias"] = 2 * kwargs["bias"]
+    return args, kwargs
+
+
+def kwarg_forward_hook(
+    self: TestCase,
+    fired_hooks: List[int],
+    expected_module: nn.Module,
+    hook_id: int,
+    module: nn.Module,
+    args: Tuple[torch.Tensor],
+    kwargs: Dict[str, Any],
+    out: torch.Tensor,
+) -> Any:
+    fired_hooks.append(hook_id)
+    self.assertEqual(id(module), id(expected_module))
+    self.assertEqual(len(args), 1)
+
+    out = out + kwargs["bias"]
+    return out
 
+
+class TestModuleHooks(TestCase):
     @skipIfTorchDynamo("Dynamo does not yet capture hooks")
     def test_forward_hooks(self):
         fired_hooks: List[int] = []
@@ -116,11 +172,15 @@ def test_forward_pre_hooks(self):
         model = ToyModel()
         x = torch.randn(10, 10)
         hook = partial(forward_pre_hook, self, fired_hooks, model.net2.seq1)
-        model.net2.seq1.register_forward_pre_hook(partial(hook, 0), prepend=True)
+        model.net2.seq1.register_forward_pre_hook(
+            partial(hook, 0), prepend=True
+        )
         model.net2.seq1.register_forward_pre_hook(partial(hook, 1))
         model.net2.seq1.register_forward_pre_hook(partial(hook, 2))
         model.net2.seq1.register_forward_pre_hook(partial(hook, 3))
-        model.net2.seq1.register_forward_pre_hook(partial(hook, 4), prepend=True)
+        model.net2.seq1.register_forward_pre_hook(
+            partial(hook, 4), prepend=True
+        )
         expected = [4, 0, 1, 2, 3]
 
         self.assertEqual(fired_hooks, [])
@@ -158,8 +218,12 @@ def test_full_backward_pre_hooks(self):
         model = ToyModel()
         x = torch.randn(10, 10)
         hook = partial(full_backward_pre_hook, self, fired_hooks, model.net1)
-        model.net1.register_full_backward_pre_hook(partial(hook, 0), prepend=True)
-        model.net1.register_full_backward_pre_hook(partial(hook, 1), prepend=True)
+        model.net1.register_full_backward_pre_hook(
+            partial(hook, 0), prepend=True
+        )
+        model.net1.register_full_backward_pre_hook(
+            partial(hook, 1), prepend=True
+        )
         model.net1.register_full_backward_pre_hook(partial(hook, 2))
         model.net1.register_full_backward_pre_hook(partial(hook, 3))
         model.net1.register_full_backward_pre_hook(partial(hook, 4))
@@ -178,10 +242,18 @@ def test_mixed_hooks(self):
         fired_hooks: List[int] = []
         model = ToyModel()
         x = torch.randn(10, 10)
-        model.register_forward_pre_hook(partial(forward_pre_hook, self, fired_hooks, model, 0))
-        model.register_forward_hook(partial(forward_hook, self, fired_hooks, model, 1))
-        model.register_full_backward_pre_hook(partial(full_backward_pre_hook, self, fired_hooks, model, 2))
-        model.register_full_backward_hook(partial(full_backward_hook, self, fired_hooks, model, 3))
+        model.register_forward_pre_hook(
+            partial(forward_pre_hook, self, fired_hooks, model, 0)
+        )
+        model.register_forward_hook(
+            partial(forward_hook, self, fired_hooks, model, 1)
+        )
+        model.register_full_backward_pre_hook(
+            partial(full_backward_pre_hook, self, fired_hooks, model, 2)
+        )
+        model.register_full_backward_hook(
+            partial(full_backward_hook, self, fired_hooks, model, 3)
+        )
 
         self.assertEqual(fired_hooks, [])
         out = model(x)
@@ -191,6 +263,109 @@ def test_mixed_hooks(self):
         model(x).sum().backward()
         self.assertEqual(fired_hooks, [0, 1, 2, 3, 0, 1, 2, 3])
 
+    @skipIfTorchDynamo("Dynamo does not yet capture hooks")
+    def test_kwarg_hooks(self):
+        # 1. test forward pre hook
+        fired_hooks: List[int] = []
+        x: torch.Tensor = torch.ones(10, 10)
+        bias: torch.Tensor = torch.ones(10, 10)
+        model = KwargModel()
+        model.register_forward_pre_hook(
+            partial(kwarg_forward_pre_hook, self, fired_hooks, model, 0),
+            with_kwargs=True,
+        )
+
+        # forward-pre: bias' = bias * 2
+        # So, out = x + bias * 2
+        self.assertEqual(fired_hooks, [])
+        out = model(x, bias=bias)
+        self.assertEqual(fired_hooks, [0])
+        self.assertEqual(out, x + 2 * bias, rtol=0, atol=1e-5)
+
+        # 2. test forward pre and forward hooks
+        fired_hooks: List[int] = []
+        x: torch.Tensor = torch.ones(10, 10)
+        bias: torch.Tensor = torch.ones(10, 10)
+        model = KwargModel()
+        model.register_forward_hook(
+            partial(kwarg_forward_hook, self, fired_hooks, model, 1),
+            with_kwargs=True,
+        )
+        model.register_forward_pre_hook(
+            partial(kwarg_forward_pre_hook, self, fired_hooks, model, 0),
+            with_kwargs=True,
+        )
+
+        # forward-pre: bias' = bias * 2
+        # forward: out = x + bias'
+        # forward-post: out = out + bias'
+        # So, out = x + bias * 4
+        self.assertEqual(fired_hooks, [])
+        out = model(x, bias=bias)
+        self.assertEqual(fired_hooks, [0, 1])
+        self.assertEqual(out, x + 4 * bias, rtol=0, atol=1e-5)
+
+        # 3. test nn.Module member method as forward-post hook
+        x: torch.Tensor = torch.ones(10, 10)
+        bias: torch.Tensor = torch.ones(10, 10)
+        model = KwargModel()
+        model.register_forward_hook(
+            model.internal_forward_hook, with_kwargs=True
+        )
+
+        # forward: out = x + bias
+        # forward-post: out = out + bias
+        # So, out = x + bias * 2
+        out = model(x, bias=bias)
+        self.assertEqual(out, x + 2 * bias, rtol=0, atol=1e-5)
+
+
+    @skipIfTorchDynamo("Dynamo does not yet capture hooks")
+    def test_remove_kwarg_hooks(self):
+        # test forward pre and forward hooks
+        fired_hooks: List[int] = []
+        x: torch.Tensor = torch.ones(10, 10)
+        bias: torch.Tensor = torch.ones(10, 10)
+        model = KwargModel()
+        forward_hook_handle = model.register_forward_hook(
+            partial(kwarg_forward_hook, self, fired_hooks, model, 1),
+            with_kwargs=True,
+        )
+        forward_pre_hook_handle = model.register_forward_pre_hook(
+            partial(kwarg_forward_pre_hook, self, fired_hooks, model, 0),
+            with_kwargs=True,
+        )
+
+        # forward-pre: bias' = bias * 2
+        # forward: out = x + bias'
+        # forward-post: out = out + bias'
+        # So, out = x + bias * 4
+        self.assertEqual(fired_hooks, [])
+        out = model(x, bias=bias)
+        self.assertEqual(fired_hooks, [0, 1])
+        self.assertEqual(out, x + 4 * bias, rtol=0, atol=1e-5)
+
+        # forward-pre: bias' = bias * 2
+        # forward: out = x + bias'
+        # So, out = x + bias * 2
+        forward_hook_handle.remove()
+        out = model(x, bias=bias)
+        self.assertEqual(fired_hooks, [0, 1, 0])
+        self.assertEqual(out, x + 2 * bias, rtol=0, atol=1e-5)
+        self.assertFalse(
+            forward_hook_handle.id in model._forward_hooks_with_kwargs
+        )
+
+        # forward: out = x + bias
+        # So, out = x + bias
+        forward_pre_hook_handle.remove()
+        out = model(x, bias=bias)
+        self.assertEqual(fired_hooks, [0, 1, 0])
+        self.assertEqual(out, x + bias, rtol=0, atol=1e-5)
+        self.assertFalse(
+            forward_pre_hook_handle.id in model._forward_pre_hooks_with_kwargs
+        )
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py
index 3fd8539d529dc..e9f9d4d3d3bb8 100644
--- a/torch/distributed/nn/api/remote_module.py
+++ b/torch/distributed/nn/api/remote_module.py
@@ -64,7 +64,9 @@
     "_backward_pre_hooks",
     "_is_full_backward_hook",
     "_forward_hooks",
+    "_forward_hooks_with_kwargs",
     "_forward_pre_hooks",
+    "_forward_pre_hooks_with_kwargs",
     "_state_dict_hooks",
     "_load_state_dict_pre_hooks",
     "_load_state_dict_post_hooks",
@@ -365,6 +367,7 @@ def register_forward_pre_hook(  # type: ignore[return]
         self,
         hook: Callable[..., None],
         prepend: bool = False,
+        with_kwargs: bool = False,
     ) -> RemovableHandle:
         _raise_not_supported(self.register_forward_pre_hook.__name__)
 
@@ -372,6 +375,7 @@ def register_forward_hook(  # type: ignore[return]
         self,
         hook: Callable[..., None],
         prepend: bool = False,
+        with_kwargs: bool = False,
     ) -> RemovableHandle:
         _raise_not_supported(self.register_forward_hook.__name__)
 
diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py
index 226bfe4589be7..e2717c78ab7e1 100644
--- a/torch/jit/_recursive.py
+++ b/torch/jit/_recursive.py
@@ -30,7 +30,9 @@
     "_backward_hooks",
     "_backward_pre_hooks",
     "_forward_hooks",
+    "_forward_hooks_with_kwargs",
     "_forward_pre_hooks",
+    "_forward_pre_hooks_with_kwargs",
     "_state_dict_hooks",
     "_load_state_dict_pre_hooks",
     "_load_state_dict_post_hooks",
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index 82389074f8a98..e57a7b26d1e5d 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -418,7 +418,15 @@ def forward(self, x):
     _backward_hooks: Dict[int, Callable]
     _is_full_backward_hook: Optional[bool]
     _forward_hooks: Dict[int, Callable]
+    # Marks whether the corresponding _forward_hooks accept kwargs or not.
+    # As JIT does not support Set[int], this dict is used as a set, where all
+    # hooks represented in this dict accept kwargs.
+    _forward_hooks_with_kwargs: Dict[int, bool]
     _forward_pre_hooks: Dict[int, Callable]
+    # Marks whether the corresponding _forward_hooks accept kwargs or not.
+    # As JIT does not support Set[int], this dict is used as a set, where all
+    # hooks represented in this dict accept kwargs.
+    _forward_pre_hooks_with_kwargs: Dict[int, bool]
     _state_dict_hooks: Dict[int, Callable]
     _load_state_dict_pre_hooks: Dict[int, Callable]
     _load_state_dict_post_hooks: Dict[int, Callable]
@@ -444,7 +452,9 @@ def __init__(self) -> None:
         super().__setattr__('_backward_hooks', OrderedDict())
         super().__setattr__('_is_full_backward_hook', None)
         super().__setattr__('_forward_hooks', OrderedDict())
+        super().__setattr__('_forward_hooks_with_kwargs', OrderedDict())
         super().__setattr__('_forward_pre_hooks', OrderedDict())
+        super().__setattr__('_forward_pre_hooks_with_kwargs', OrderedDict())
         super().__setattr__('_state_dict_hooks', OrderedDict())
         super().__setattr__('_load_state_dict_pre_hooks', OrderedDict())
         super().__setattr__('_load_state_dict_post_hooks', OrderedDict())
@@ -1322,20 +1332,33 @@ def _maybe_warn_non_full_backward_hook(self, inputs, result, grad_fn):
                               "behavior.")
 
     def register_forward_pre_hook(
-        self, hook: Callable[..., None], prepend: bool = False
+        self,
+        hook: Callable[..., None],
+        *,
+        prepend: bool = False,
+        with_kwargs: bool = False,
     ) -> RemovableHandle:
         r"""Registers a forward pre-hook on the module.
 
         The hook will be called every time before :func:`forward` is invoked.
-        It should have the following signature::
 
-            hook(module, input) -> None or modified input
 
-        The input contains only the positional arguments given to the module.
-        Keyword arguments won't be passed to the hooks and only to the ``forward``.
-        The hook can modify the input. User can either return a tuple or a
-        single modified value in the hook. We will wrap the value into a tuple
-        if a single value is returned(unless that value is already a tuple).
+        If ``with_kwargs`` is false or not specified, the input contains only
+        the positional arguments given to the module. Keyword arguments won't be
+        passed to the hooks and only to the ``forward``. The hook can modify the
+        input. User can either return a tuple or a single modified value in the
+        hook. We will wrap the value into a tuple if a single value is returned
+        (unless that value is already a tuple). The hook should have the
+        following signature::
+
+            hook(module, args) -> None or modified input
+
+        If ``with_kwargs`` is true, the forward pre-hook will be passed the
+        kwargs given to the forward function. And if the hook modifies the
+        input, both the args and kwargs should be returned. The hook should have
+        the following signature::
+
+            hook(module, args, kwargs) -> None or a tuple of modified input and kwargs
 
         Args:
             hook (Callable): The user defined hook to be registered.
@@ -1347,52 +1370,82 @@ def register_forward_pre_hook(
                 ``forward_pre`` hooks registered with
                 :func:`register_module_forward_pre_hook` will fire before all
                 hooks registered by this method.
+                Default: ``False``
+            with_kwargs (bool): If true, the ``hook`` will be passed the kwargs
+                given to the forward function.
+                Default: ``False``
 
         Returns:
             :class:`torch.utils.hooks.RemovableHandle`:
                 a handle that can be used to remove the added hook by calling
                 ``handle.remove()``
         """
-        handle = hooks.RemovableHandle(self._forward_pre_hooks)
+        handle = hooks.RemovableHandle(
+            self._forward_pre_hooks,
+            extra_dict=self._forward_pre_hooks_with_kwargs
+        )
         self._forward_pre_hooks[handle.id] = hook
+        if with_kwargs:
+            self._forward_pre_hooks_with_kwargs[handle.id] = True
+
         if prepend:
             self._forward_pre_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
         return handle
 
     def register_forward_hook(
-        self, hook: Callable[..., None], prepend: bool = False
+        self,
+        hook: Callable[..., None],
+        *,
+        prepend: bool = False,
+        with_kwargs: bool = False,
     ) -> RemovableHandle:
         r"""Registers a forward hook on the module.
 
         The hook will be called every time after :func:`forward` has computed an output.
-        It should have the following signature::
 
-            hook(module, input, output) -> None or modified output
+        If ``with_kwargs`` is ``False`` or not specified, the input contains only
+        the positional arguments given to the module. Keyword arguments won't be
+        passed to the hooks and only to the ``forward``. The hook can modify the
+        output. It can modify the input inplace but it will not have effect on
+        forward since this is called after :func:`forward` is called. The hook
+        should have the following signature::
+
+            hook(module, args, output) -> None or modified output
+
+        If ``with_kwargs`` is ``True``, the forward hook will be passed the
+        ``kwargs`` given to the forward function and be expected to return the
+        output possibly modified. The hook should have the following signature::
 
-        The input contains only the positional arguments given to the module.
-        Keyword arguments won't be passed to the hooks and only to the ``forward``.
-        The hook can modify the output. It can modify the input inplace but
-        it will not have effect on forward since this is called after
-        :func:`forward` is called.
+            hook(module, args, kwargs, output) -> None or modified output
 
         Args:
             hook (Callable): The user defined hook to be registered.
-            prepend (bool): If true, the provided ``hook`` will be fired before
-                all existing ``forward`` hooks on this
+            prepend (bool): If ``True``, the provided ``hook`` will be fired
+                before all existing ``forward`` hooks on this
                 :class:`torch.nn.modules.Module`. Otherwise, the provided
                 ``hook`` will be fired after all existing ``forward`` hooks on
                 this :class:`torch.nn.modules.Module`. Note that global
                 ``forward`` hooks registered with
                 :func:`register_module_forward_hook` will fire before all hooks
                 registered by this method.
+                Default: ``False``
+            with_kwargs (bool): If ``True``, the ``hook`` will be passed the
+                kwargs given to the forward function.
+                Default: ``False``
 
         Returns:
             :class:`torch.utils.hooks.RemovableHandle`:
                 a handle that can be used to remove the added hook by calling
                 ``handle.remove()``
         """
-        handle = hooks.RemovableHandle(self._forward_hooks)
+        handle = hooks.RemovableHandle(
+            self._forward_hooks,
+            extra_dict=self._forward_hooks_with_kwargs
+        )
         self._forward_hooks[handle.id] = hook
+        if with_kwargs:
+            self._forward_hooks_with_kwargs[handle.id] = True
+
         if prepend:
             self._forward_hooks.move_to_end(handle.id, last=False)  # type: ignore[attr-defined]
         return handle
@@ -1417,14 +1470,14 @@ def _slow_forward(self, *input, **kwargs):
                 tracing_state.pop_scope()
         return result
 
-    def _call_impl(self, *input, **kwargs):
+    def _call_impl(self, *args, **kwargs):
         forward_call = (self._slow_forward if torch._C._get_tracing_state() else self.forward)
         # If we don't have any hooks, we want to skip the rest of the logic in
         # this function, and just call forward.
         if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
                 or _global_backward_pre_hooks or _global_backward_hooks
                 or _global_forward_hooks or _global_forward_pre_hooks):
-            return forward_call(*input, **kwargs)
+            return forward_call(*args, **kwargs)
         # Do not call functions when jit is used
         full_backward_hooks, non_full_backward_hooks = [], []
         backward_pre_hooks = []
@@ -1433,23 +1486,45 @@ def _call_impl(self, *input, **kwargs):
 
         if self._backward_hooks or _global_backward_hooks:
             full_backward_hooks, non_full_backward_hooks = self._get_backward_hooks()
+
         if _global_forward_pre_hooks or self._forward_pre_hooks:
-            for hook in (*_global_forward_pre_hooks.values(), *self._forward_pre_hooks.values()):
-                result = hook(self, input)
-                if result is not None:
-                    if not isinstance(result, tuple):
-                        result = (result,)
-                    input = result
+            for hook_id, hook in (
+                *_global_forward_pre_hooks.items(),
+                *self._forward_pre_hooks.items(),
+            ):
+                if hook_id in self._forward_pre_hooks_with_kwargs:
+                    result = hook(self, args, kwargs)  # type: ignore[misc]
+                    if result is not None:
+                        if isinstance(result, tuple) and len(result) == 2:
+                            args, kwargs = result
+                        else:
+                            raise RuntimeError(
+                                "forward pre-hook must return None or a tuple "
+                                f"of (new_args, new_kwargs), but got {result}."
+                            )
+                else:
+                    result = hook(self, args)
+                    if result is not None:
+                        if not isinstance(result, tuple):
+                            result = (result,)
+                        args = result
 
         bw_hook = None
         if full_backward_hooks or backward_pre_hooks:
             bw_hook = hooks.BackwardHook(self, full_backward_hooks, backward_pre_hooks)
-            input = bw_hook.setup_input_hook(input)
+            args = bw_hook.setup_input_hook(args)
 
-        result = forward_call(*input, **kwargs)
+        result = forward_call(*args, **kwargs)
         if _global_forward_hooks or self._forward_hooks:
-            for hook in (*_global_forward_hooks.values(), *self._forward_hooks.values()):
-                hook_result = hook(self, input, result)
+            for hook_id, hook in (
+                *_global_forward_hooks.items(),
+                *self._forward_hooks.items(),
+            ):
+                if hook_id in self._forward_hooks_with_kwargs:
+                    hook_result = hook(self, args, kwargs, result)
+                else:
+                    hook_result = hook(self, args, result)
+
                 if hook_result is not None:
                     result = hook_result
 
@@ -1468,7 +1543,7 @@ def _call_impl(self, *input, **kwargs):
             if grad_fn is not None:
                 for hook in non_full_backward_hooks:
                     grad_fn.register_hook(_WrappedHook(hook, self))
-                self._maybe_warn_non_full_backward_hook(input, result, grad_fn)
+                self._maybe_warn_non_full_backward_hook(args, result, grad_fn)
 
         return result
 
@@ -1479,6 +1554,10 @@ def __setstate__(self, state):
         # Support loading old checkpoints that don't have the following attrs:
         if '_forward_pre_hooks' not in self.__dict__:
             self._forward_pre_hooks = OrderedDict()
+        if '_forward_pre_hooks_with_kwargs' not in self.__dict__:
+            self._forward_pre_hooks_with_kwargs = OrderedDict()
+        if '_forward_hooks_with_kwargs' not in self.__dict__:
+            self._forward_hooks_with_kwargs = OrderedDict()
         if '_state_dict_hooks' not in self.__dict__:
             self._state_dict_hooks = OrderedDict()
         if '_load_state_dict_pre_hooks' not in self.__dict__:
diff --git a/torch/utils/hooks.py b/torch/utils/hooks.py
index 133d2c0d2ceb5..be9a4c1f0a653 100644
--- a/torch/utils/hooks.py
+++ b/torch/utils/hooks.py
@@ -7,23 +7,45 @@
 __all__ = ["RemovableHandle", "unserializable_hook", "warn_if_has_hooks", "BackwardHook"]
 
 class RemovableHandle(object):
-    """A handle which provides the capability to remove a hook."""
+    r"""
+    A handle which provides the capability to remove a hook.
+
+    Args:
+        hooks_dict (dict): A dictionary of hooks, indexed by hook ``id``.
+        extra_dict (dict): An additional dictionary whose keys will be deleted
+            when the same keys are removed from ``hooks_dict``.
+    """
 
     id: int
     next_id: int = 0
 
-    def __init__(self, hooks_dict: Any) -> None:
+    def __init__(self, hooks_dict: Any, *, extra_dict: Any = None) -> None:
         self.hooks_dict_ref = weakref.ref(hooks_dict)
         self.id = RemovableHandle.next_id
         RemovableHandle.next_id += 1
 
+        self.extra_dict_ref = (
+            weakref.ref(extra_dict)
+            if extra_dict is not None
+            else None
+        )
+
     def remove(self) -> None:
         hooks_dict = self.hooks_dict_ref()
         if hooks_dict is not None and self.id in hooks_dict:
             del hooks_dict[self.id]
 
+        if self.extra_dict_ref is not None:
+            extra_dict = self.extra_dict_ref()
+            if extra_dict is not None and self.id in extra_dict:
+                del extra_dict[self.id]
+
     def __getstate__(self):
-        return (self.hooks_dict_ref(), self.id)
+        return (
+            (self.hooks_dict_ref(), self.id)
+            if self.extra_dict_ref is None
+            else (self.hooks_dict_ref(), self.id, self.extra_dict_ref())
+        )
 
     def __setstate__(self, state) -> None:
         if state[0] is None:
@@ -34,7 +56,13 @@ def __setstate__(self, state) -> None:
         self.id = state[1]
         RemovableHandle.next_id = max(RemovableHandle.next_id, self.id + 1)
 
-    def __enter__(self) -> 'RemovableHandle':
+        self.extra_dict_ref = (
+            None
+            if len(state) < 3
+            else weakref.ref(OrderedDict() if state[2] is None else state[2])
+        )
+
+    def __enter__(self) -> "RemovableHandle":
         return self
 
     def __exit__(self, type: Any, value: Any, tb: Any) -> None:

From a0f272961320f393e27f6182ed3cd16986712b79 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 22 Nov 2022 12:02:59 -0800
Subject: [PATCH 1193/1922] Don't use explain() for --explain; instead read it
 off the counters (#89518)

Fixes huggingface problem where example_inputs is not actually the
args.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89518
Approved by: https://github.com/albanD
---
 benchmarks/dynamo/common.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 6d77326c8f2cb..98a326d77191f 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1319,6 +1319,8 @@ def run_one_model(
             print("RUNNING ON BRANCH:", branch)
         mode = "train" if self.args.training else "eval"
         print(f"{current_device:4} {mode:5} {current_name:34} ", end="", flush=True)
+        start_calls_captured = torch._dynamo.utils.counters["stats"]["calls_captured"]
+        start_unique_graphs = torch._dynamo.utils.counters["stats"]["unique_graphs"]
         if self.args.accuracy:
             status = self.check_accuracy(
                 name, model, example_inputs, optimize_ctx, experiment
@@ -1329,8 +1331,13 @@ def run_one_model(
                 name, model, example_inputs, optimize_ctx, experiment
             )
             print(status)
+        end_calls_captured = torch._dynamo.utils.counters["stats"]["calls_captured"]
+        end_unique_graphs = torch._dynamo.utils.counters["stats"]["unique_graphs"]
         if explain:
-            print(torch._dynamo.explain(model, *example_inputs)[0])
+            print(
+                f"Dynamo produced {end_unique_graphs-start_unique_graphs} graph(s) "
+                f"covering {end_calls_captured-start_calls_captured} ops"
+            )
 
 
 def help(fn):
@@ -1528,7 +1535,7 @@ def get_example_inputs(self):
     parser.add_argument(
         "--explain",
         action="store_true",
-        help="run .explain() on the graph at the end of the run.",
+        help="print some graph/op statistics during the run, similar to .explain()",
     )
 
     parser.add_argument(

From 065c23d41a85c72bdf37092a724f198bf3751327 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Wed, 23 Nov 2022 02:59:25 +0000
Subject: [PATCH 1194/1922] Delete unused variable assignment in
 _refs/__init__.py (#89538)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89538
Approved by: https://github.com/huydhn
---
 torch/_refs/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index fda73cf0bc608..fac2509afd414 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -4637,7 +4637,6 @@ def randn(
 
     dtype = utils.dtype_or_default(dtype)
     device = utils.device_or_default(device)
-    layout = utils.layout_or_default(layout)
 
     return prims.normal(
         shape_,

From d44d0c67595afaf1b3864502eb9c9cb18d3d6f17 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 23 Nov 2022 03:07:22 +0000
Subject: [PATCH 1195/1922] Update CUDA compiler matrix (#86360)

Switch GCC/Clang max versions to be exclusive as the `include/crt/host_config.h` checks the major version only for the upper bound. This allows to be less restrictive and match the checks in the aforementioned header.
Also update the versions using that header in the CUDA SDKs.

Follow up to #82860

I noticed this as PyTorch 1.12.1 with CUDA 11.3.1 and GCC 10.3 was failing in the `test_cpp_extensions*` tests.

Example for CUDA 11.3.1 from the SDK header:

```
#if __GNUC__ > 11
// Error out
...
#if (__clang_major__ >= 12) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3))
// Error out
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86360
Approved by: https://github.com/ezyang
---
 torch/utils/cpp_extension.py | 61 +++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 720935296504f..d74ef9a38372d 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -18,7 +18,7 @@
 from ._cpp_extension_versioner import ExtensionVersioner
 from .hipify import hipify_python
 from .hipify.hipify_python import GeneratedFileCleaner
-from typing import List, Optional, Union, Tuple
+from typing import Dict, List, Optional, Union, Tuple
 from torch.torch_version import TorchVersion
 
 from setuptools.command.build_ext import build_ext
@@ -42,29 +42,35 @@
 MINIMUM_GCC_VERSION = (5, 0, 0)
 MINIMUM_MSVC_VERSION = (19, 0, 24215)
 
+VersionRange = Tuple[Tuple[int, ...], Tuple[int, ...]]
+VersionMap = Dict[str, VersionRange]
 # The following values were taken from the following GitHub gist that
 # summarizes the minimum valid major versions of g++/clang++ for each supported
 # CUDA version: https://gist.github.com/ax3l/9489132
-CUDA_GCC_VERSIONS = {
-    '10.2': (MINIMUM_GCC_VERSION, (8, 0, 0)),
-    '11.1': (MINIMUM_GCC_VERSION, (10, 0, 0)),
-    '11.2': (MINIMUM_GCC_VERSION, (10, 2, 1)),
-    '11.3': (MINIMUM_GCC_VERSION, (10, 2, 1)),
-    '11.4': ((6, 0, 0), (11, 5, 0)),
-    '11.5': ((6, 0, 0), (11, 5, 0)),
-    '11.6': ((6, 0, 0), (11, 5, 0)),
-    '11.7': ((6, 0, 0), (11, 5, 0)),
+# Or from include/crt/host_config.h in the CUDA SDK
+# The second value is the exclusive(!) upper bound, i.e. min <= version < max
+CUDA_GCC_VERSIONS: VersionMap = {
+    '10.2': (MINIMUM_GCC_VERSION, (9, 0)),
+    '11.0': (MINIMUM_GCC_VERSION, (10, 0)),
+    '11.1': (MINIMUM_GCC_VERSION, (11, 0)),
+    '11.2': (MINIMUM_GCC_VERSION, (11, 0)),
+    '11.3': (MINIMUM_GCC_VERSION, (11, 0)),
+    '11.4': ((6, 0, 0), (12, 0)),
+    '11.5': ((6, 0, 0), (12, 0)),
+    '11.6': ((6, 0, 0), (12, 0)),
+    '11.7': ((6, 0, 0), (12, 0)),
 }
 
-CUDA_CLANG_VERSIONS = {
-    '10.2': ((3, 3, 0), (8, 0, 0)),
-    '11.1': ((6, 0, 0), (9, 0, 0)),
-    '11.2': ((6, 0, 0), (9, 0, 0)),
-    '11.3': ((6, 0, 0), (11, 0, 0)),
-    '11.4': ((6, 0, 0), (11, 0, 0)),
-    '11.5': ((6, 0, 0), (12, 0, 0)),
-    '11.6': ((6, 0, 0), (12, 0, 0)),
-    '11.7': ((6, 0, 0), (13, 0, 0)),
+MINIMUM_CLANG_VERSION = (3, 3, 0)
+CUDA_CLANG_VERSIONS: VersionMap = {
+    '10.2': (MINIMUM_CLANG_VERSION, (9, 0)),
+    '11.1': (MINIMUM_CLANG_VERSION, (11, 0)),
+    '11.2': (MINIMUM_CLANG_VERSION, (12, 0)),
+    '11.3': (MINIMUM_CLANG_VERSION, (12, 0)),
+    '11.4': (MINIMUM_CLANG_VERSION, (13, 0)),
+    '11.5': (MINIMUM_CLANG_VERSION, (13, 0)),
+    '11.6': (MINIMUM_CLANG_VERSION, (14, 0)),
+    '11.7': (MINIMUM_CLANG_VERSION, (14, 0)),
 }
 
 __all__ = ["get_default_build_root", "check_compiler_ok_for_platform", "get_compiler_abi_compatibility_and_version", "BuildExtension",
@@ -388,20 +394,19 @@ def _check_cuda_version(compiler_name: str, compiler_version: TorchVersion) -> N
             _is_binary_build()):
         return
 
-    cuda_compiler_bounds = CUDA_CLANG_VERSIONS if compiler_name.startswith('clang') else CUDA_GCC_VERSIONS
+    cuda_compiler_bounds: VersionMap = CUDA_CLANG_VERSIONS if compiler_name.startswith('clang') else CUDA_GCC_VERSIONS
 
     if cuda_str_version not in cuda_compiler_bounds:
         warnings.warn(f'There are no {compiler_name} version bounds defined for CUDA version {cuda_str_version}')
     else:
-        min_compiler_version, max_compiler_version = cuda_compiler_bounds[cuda_str_version]
-        # Special case for 11.4.0, which has lower compiler bounds that 11.4.1
+        min_compiler_version, max_excl_compiler_version = cuda_compiler_bounds[cuda_str_version]
+        # Special case for 11.4.0, which has lower compiler bounds than 11.4.1
         if "V11.4.48" in cuda_version_str and cuda_compiler_bounds == CUDA_GCC_VERSIONS:
-            max_compiler_version = (10, 0, 0)
+            max_excl_compiler_version = (11, 0)
         min_compiler_version_str = '.'.join(map(str, min_compiler_version))
-        max_compiler_version_str = '.'.join(map(str, max_compiler_version))
+        max_excl_compiler_version_str = '.'.join(map(str, max_excl_compiler_version))
 
-        version_bound_str = f'>={min_compiler_version_str}'
-        version_bound_str = f'{version_bound_str}, <={max_compiler_version_str}'
+        version_bound_str = f'>={min_compiler_version_str}, <{max_excl_compiler_version_str}'
 
         if compiler_version < TorchVersion(min_compiler_version_str):
             raise RuntimeError(
@@ -409,10 +414,10 @@ def _check_cuda_version(compiler_name: str, compiler_version: TorchVersion) -> N
                 f'than the minimum required version by CUDA {cuda_str_version} ({min_compiler_version_str}). '
                 f'Please make sure to use an adequate version of {compiler_name} ({version_bound_str}).'
             )
-        if compiler_version > TorchVersion(max_compiler_version_str):
+        if compiler_version >= TorchVersion(max_excl_compiler_version_str):
             raise RuntimeError(
                 f'The current installed version of {compiler_name} ({compiler_version}) is greater '
-                f'than the maximum required version by CUDA {cuda_str_version} ({max_compiler_version_str}). '
+                f'than the maximum required version by CUDA {cuda_str_version}. '
                 f'Please make sure to use an adequate version of {compiler_name} ({version_bound_str}).'
             )
 

From 713f4dfc73d0c87b9e639f3c052d9acfd8ae5c1c Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 23 Nov 2022 03:31:17 +0000
Subject: [PATCH 1196/1922] [BE] Add more `ssh` instructions (#89516)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89516
Approved by: https://github.com/huydhn
---
 .github/workflows/_docs.yml       | 8 ++++++++
 .github/workflows/_linux-test.yml | 3 +++
 .github/workflows/_win-build.yml  | 9 +++++++++
 3 files changed, 20 insertions(+)

diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml
index cb4dc71c8c897..318471e7c7860 100644
--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@@ -67,6 +67,11 @@ jobs:
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
+          instructions: |
+            All builds are done inside the container, to start an interactive session run:
+              docker exec -it $(docker container ps --format '{{.ID}}') bash
+            To start Python docs build type:
+              cd docs && make html && make coverage
 
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
@@ -170,3 +175,6 @@ jobs:
           if-no-files-found: error
           path: functorch_ghpages/nightly/
           s3-prefix: pytorch/${{ github.event.pull_request.number }}/functorchdocs
+      - name: Teardown Linux
+        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        if: always()
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index 454e558fbee49..a444a5fc530a8 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -68,6 +68,9 @@ jobs:
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
+          instructions: |
+            All testing is done inside the container, to start an interactive session run:
+              docker exec -it $(docker container ps --format '{{.ID}}') bash
 
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml
index 3ce41afaf7cc1..8baaca498d176 100644
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@@ -50,6 +50,15 @@ jobs:
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
+          instructions: |
+            To forward remote desktop on your local machine ssh as follows:
+              ssh -L 3389:localhost:3389 %%username%%@%%hostname%%
+            And then change password using `passwd` command.
+
+            To start build locally, change working folder to \actions-runner\_work\pytorch\pytorch,
+            Activate miniconda and Visual Studio environment, but running:
+              call C:\Jenkins\Miniconda3\Scripts\activate.bat C:\Jenkins\Miniconda3
+              call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64
 
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch

From a2660f8461630a17d40c3c7f1de702117e7843dd Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 22 Nov 2022 07:47:48 -0800
Subject: [PATCH 1197/1922] Add crossref debug mode for functionalization,
 catches stride errors (#89498)

The idea is to add a custom handler to Functionalize key in Python
dispatcher that runs the functionalized version along side a non
functionalized version, and checks that their outputs agree in the
end.  (Technically, for metadata mutation we should also check the
inputs, but for now we're relying on those functions returning self.)
I turned this on for test_functionalize.py (new TestCrossRefFunctionalize)
and found a bunch of failures that look legit.

This probably doesn't interact that nicely if you're also tracing at
the same time, probably need more special logic for that (directly,
just disabling tracing for when we create the nested fake tensor mode,
but IDK if there's a more principled way to organize this.)

There are some misc fixups which I can split if people really want.

- xfail_inherited_tests moved to test common_utils
- Bindings for _dispatch_tls_set_dispatch_key_included,
  _dispatch_tls_is_dispatch_key_included and _functionalization_reapply_views_tls
- Type stubs for _enable_functionalization, _disable_functionalization
- all_known_overloads utility to let you iterate over all OpOverloads
  in all namespaces.  Iterator support on all torch._ops objects to let
  you iterate over their members.
- suspend_functionalization lets you temporarily disable functionalization mode
  in a context
- check_metadata_matches for easily comparing outputs of functions and see
  if they match (TODO: there are a few copies of this logic, consolidate!)
- _fmt for easily printing the metadata of a tensor without its data
- _uncache_dispatch for removing a particular dispatch key from the cache,
  so that we force it to regenerate
- check_significant_strides new kwarg only_cuda to let you also do stride
  test even when inputs are not CUDA
- Functionalize in torch._C.DispatchKey

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89498
Approved by: https://github.com/malfet
---
 test/test_functionalization.py          |  82 +++++++++++-----
 test/test_proxy_tensor.py               |  29 +-----
 tools/pyi/gen_pyi.py                    |   4 +
 torch/_C/__init__.pyi.in                |   3 +
 torch/_dispatch/python.py               | 122 ++++++++++++++++++++++++
 torch/_ops.py                           |  40 +++++++-
 torch/_prims_common/__init__.py         |   4 +-
 torch/csrc/utils/python_dispatch.cpp    |  14 +++
 torch/testing/_internal/common_utils.py |  26 +++++
 torchgen/model.py                       |   1 +
 10 files changed, 268 insertions(+), 57 deletions(-)

diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index 0ab552d0d04a5..0731cae285b03 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -1,11 +1,16 @@
 # Owner(s): ["module: codegen"]
 
 import torch
-from torch.testing._internal.common_utils import TestCase, run_tests, skipIfTorchDynamo, TEST_WITH_TORCHDYNAMO
+from contextlib import nullcontext
+from torch.testing._internal.common_utils import (
+    TestCase, run_tests, skipIfTorchDynamo, TEST_WITH_TORCHDYNAMO,
+    xfail_inherited_tests
+)
 from torch.testing._internal.logging_tensor import LoggingTensor, capture_logs
 from torch.utils._pytree import tree_map
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.passes.reinplace import reinplace
+from torch._dispatch.python import enable_crossref_functionalize
 
 import unittest
 
@@ -21,34 +26,40 @@ def are_aliased(x, y):
 # We can unify testing and use functionalize() here instead
 # if/when functorch moves into core.
 # This is basically a crappy version of `functionalize()` for single-tensor-arg inputs.
-def _functionalize(f, *, reapply_views: bool):
+def _functionalize(f, *, reapply_views: bool, crossref: bool):
     def wrapped(a):
-        input_functional = torch._to_functional_tensor(a)
-        input_functional.requires_grad = a.requires_grad
-        torch._enable_functionalization(reapply_views=reapply_views)
-        try:
-            out = f(input_functional)
-        finally:
-            torch._disable_functionalization()
-        torch._sync(input_functional)
-        inpt_new = torch._from_functional_tensor(input_functional)
-        if inpt_new is not a:
-            # Existing deficiency in functionalize():
-            # we don't correctly mutate input metadata (yet?)
-            if inpt_new.shape == a.shape:
-                a.copy_(inpt_new)
-        tree_map(torch._sync, out)
-        out_unwrapped = tree_map(torch._from_functional_tensor, out)
-        return out_unwrapped
+        ctx = nullcontext()
+        if crossref:
+            ctx = enable_crossref_functionalize()
+        with ctx:
+            input_functional = torch._to_functional_tensor(a)
+            input_functional.requires_grad = a.requires_grad
+            torch._enable_functionalization(reapply_views=reapply_views)
+            try:
+                out = f(input_functional)
+            finally:
+                torch._disable_functionalization()
+            torch._sync(input_functional)
+            inpt_new = torch._from_functional_tensor(input_functional)
+            if inpt_new is not a:
+                # Existing deficiency in functionalize():
+                # we don't correctly mutate input metadata (yet?)
+                if inpt_new.shape == a.shape:
+                    a.copy_(inpt_new)
+            tree_map(torch._sync, out)
+            out_unwrapped = tree_map(torch._from_functional_tensor, out)
+            return out_unwrapped
 
     return wrapped
 
 @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "https://github.com/pytorch/pytorch/issues/81457")
 class TestFunctionalization(TestCase):
 
+    crossref = False
+
     def get_logs(self, func, inpt, *, reapply_views=False, run_reinplace=False):
         inpt_clone = inpt.clone()
-        traced_f = make_fx(_functionalize(func, reapply_views=reapply_views))(inpt)
+        traced_f = make_fx(_functionalize(func, reapply_views=reapply_views, crossref=self.crossref))(inpt)
         if run_reinplace:
             traced_f = reinplace(traced_f, inpt_clone)
         return traced_f.code
@@ -60,10 +71,15 @@ def assert_functionalization(self, func, inpt, *, reapply_views=False, mutated_i
 
         # Compare outputs (and mutated inputs), with and without functionalization.
         out_ref = func(inpt)
-        out_functional = _functionalize(func, reapply_views=reapply_views)(input_clone)
+        out_functional = _functionalize(func, reapply_views=reapply_views, crossref=self.crossref)(input_clone)
         # The reinplacing pass is only valid to run with reapply_views=True.
-        functional_func = make_fx(_functionalize(func, reapply_views=True))(input_clone2)
-        reinplace_func = reinplace(make_fx(_functionalize(func, reapply_views=True))(input_clone2), input_clone2)
+        functional_func = make_fx(_functionalize(func, reapply_views=True, crossref=self.crossref))(input_clone2)
+        reinplace_func = reinplace(
+            make_fx(
+                _functionalize(func, reapply_views=True, crossref=self.crossref)
+            )(input_clone2),
+            input_clone2
+        )
 
         # NOTE: for now, need to pass in fresh inputs here, because make_fx
         # will directly mutate the inputs that you trace with.
@@ -112,7 +128,7 @@ def f(x):
             self.assertRaises(RuntimeError, lambda: z.add_(1))
             return z
 
-        _functionalize(f, reapply_views=True)(torch.ones(3, 3))
+        _functionalize(f, reapply_views=True, crossref=self.crossref)(torch.ones(3, 3))
 
     def test_copy_stride_mismatch(self):
         def f(x):
@@ -120,7 +136,7 @@ def f(x):
             y.copy_(x)
             return y
 
-        r = _functionalize(f, reapply_views=True)(torch.ones(2, 2))
+        r = _functionalize(f, reapply_views=True, crossref=self.crossref)(torch.ones(2, 2))
         self.assertEqual(r.stride(), (5, 1))
 
     def test_view_clone_view_inplace(self):
@@ -334,7 +350,7 @@ def f(x):
             out = x[functional_tensor, nonfunctional_tensor]
             return out
         out = f(torch.ones(2, 2))
-        out_functional = _functionalize(f, reapply_views=True)(torch.ones(2, 2))
+        out_functional = _functionalize(f, reapply_views=True, crossref=self.crossref)(torch.ones(2, 2))
         self.assertEqual(out, out_functional)
 
     def test_inplace_on_non_view(self):
@@ -1212,5 +1228,19 @@ def forward(self, a_1):
     return zeros
     """)
 
+@xfail_inherited_tests([
+    "test_as_strided",
+    "test_copy_",
+    "test_diagonal",
+    "test_diagonal_mutated_input",
+    "test_everything",
+    "test_fill_",
+    "test_split",
+    "test_view_clone_view_inplace",
+    "test_view_inplace",
+])
+class TestCrossRefFunctionalization(TestFunctionalization):
+    crossref = True
+
 if __name__ == '__main__':
     run_tests()
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 21142f56e7296..60fc4f07c847e 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1,6 +1,6 @@
 # Owner(s): ["module: ProxyTensor"]
 
-from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS
+from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS, xfail_inherited_tests
 import torch
 import unittest
 import warnings
@@ -21,7 +21,6 @@
 from torch import nn
 import re
 
-import types
 import functools
 import itertools
 
@@ -71,16 +70,6 @@ def create_normalized_name(op):
     print("}")
 
 
-def copy_func(f):
-    """Based on http://stackoverflow.com/a/6528148/190597 (Glenn Maynard)"""
-    g = types.FunctionType(f.__code__, f.__globals__, name=f.__name__,
-                           argdefs=f.__defaults__,
-                           closure=f.__closure__)
-    g = functools.update_wrapper(g, f)
-    g.__kwdefaults__ = f.__kwdefaults__
-    return g
-
-
 # Copied from functorch
 def xfail(op_name, variant_name='', *, device_type=None, dtypes=None):
     return (op_name, variant_name, device_type, dtypes, True)
@@ -715,22 +704,6 @@ class TestGenericProxyTensorFake(TestGenericProxyTensor):
     tracing_mode = "fake"
 
 
-def xfail_inherited_tests(tests):
-    """
-    Given a list of test names which are defined by a superclass of the
-    class this decorates, mark them as expected failure.  This is useful
-    if you are doing poor man's parameterized tests by subclassing a generic
-    test class.
-    """
-    def deco(cls):
-        for t in tests:
-            # NB: expectedFailure operates by mutating the method in question,
-            # which is why you have to copy the function first
-            setattr(cls, t, unittest.expectedFailure(copy_func(getattr(cls, t))))
-        return cls
-    return deco
-
-
 @skipIfNoSympy
 @xfail_inherited_tests([
     "test_make_fx_overloads",
diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py
index 0d1cdcb4ad06d..43118edb98bd5 100644
--- a/tools/pyi/gen_pyi.py
+++ b/tools/pyi/gen_pyi.py
@@ -443,6 +443,10 @@ def gen_pyi(
             "_to_functional_tensor": [
                 "def _to_functional_tensor(t: Tensor) -> Tensor: ..."
             ],
+            "_enable_functionalization": [
+                "def _enable_functionalization(*, reapply_views: _bool = False): ..."
+            ],
+            "_disable_functionalization": ["def _disable_functionalization(): ..."],
             "range": [
                 "def range(start: Number, end: Number,"
                 " step: Number=1, *, out: Optional[Tensor]=None, {}) -> Tensor: ...".format(
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index d69cf1f3477ed..04553be1d44cd 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1057,10 +1057,13 @@ def _dispatch_find_dangling_impls() -> List[str]: ...
 def _dispatch_get_all_op_names() -> List[str]: ...
 def _dispatch_tls_set_dispatch_key_excluded(dispatch: _dispatchkey, val: _bool) -> None: ...
 def _dispatch_tls_is_dispatch_key_excluded(dispatch: _dispatchkey) -> _bool: ...
+def _dispatch_tls_set_dispatch_key_included(dispatch: _dispatchkey, val: _bool) -> None: ...
+def _dispatch_tls_is_dispatch_key_included(dispatch: _dispatchkey) -> _bool: ...
 def _dispatch_isTensorSubclassLike(tensor: Tensor) -> _bool: ...
 def _dispatch_key_name(dispatch: _dispatchkey) -> str: ...
 def _dispatch_key_parse(dispatch: _dispatchkey) -> DispatchKey: ...
 def _dispatch_num_backends() -> _int: ...
+def _functionalization_reapply_views_tls() -> _bool: ...
 
 class DispatchKey(Enum):
     ${dispatch_key_hints}
diff --git a/torch/_dispatch/python.py b/torch/_dispatch/python.py
index 95b7fa05bfe2c..f0814889ba2d2 100644
--- a/torch/_dispatch/python.py
+++ b/torch/_dispatch/python.py
@@ -1,5 +1,9 @@
 import torch._C
 from contextlib import contextmanager
+import unittest.mock
+import torch
+import torch.utils._pytree as pytree
+import itertools
 
 __all__ = ['enable_python_dispatcher', 'no_python_dispatcher']
 
@@ -18,3 +22,121 @@ def enable_python_dispatcher():
         yield
     finally:
         del g
+
+CROSSREF_FUNCTIONALIZE = False
+
+def all_known_overloads():
+    for ns in torch.ops:
+        packets = getattr(torch.ops, ns)
+        for op_name in packets:
+            packet = getattr(packets, op_name)
+            for overload in packet:
+                yield getattr(packet, overload)
+
+@contextmanager
+def suspend_functionalization():
+    f_tls = torch._C._dispatch_tls_is_dispatch_key_included(torch._C.DispatchKey.Functionalize)
+    f_rv = torch._C._functionalization_reapply_views_tls()
+    if f_tls:
+        torch._disable_functionalization()
+    try:
+        yield
+    finally:
+        if f_tls:
+            torch._enable_functionalization(reapply_views=f_rv)
+
+def check_tensor_metadata_matches(nv, rv, desc):
+    assert callable(desc)
+    assert nv.size() == rv.size(), f"{desc()}: sizes {nv.size()} != {rv.size()}"
+    assert nv.dtype == rv.dtype, f"{desc()}: dtype {nv.dtype} != {rv.dtype}"
+    same_strides, idx = torch._prims_common.check_significant_strides(nv, rv, only_cuda=False)
+    assert same_strides, f"{desc()}: strides {nv.stride()} != {rv.stride()} (mismatch at index {idx})"
+
+def check_metadata_matches(n, r, desc):
+    assert callable(desc)
+    n_vals, n_spec = pytree.tree_flatten(n)
+    r_vals, r_spec = pytree.tree_flatten(r)
+    # TODO: test the specs match; empirically  sometimes we have a tuple
+    # on one side and a list on the other
+    assert len(n_vals) == len(r_vals), f"{len(n_vals)} != {len(r_vals)}"
+    for i, nv, rv in zip(range(len(n_vals)), n_vals, r_vals):
+        if not isinstance(rv, torch.Tensor):
+            continue
+        check_tensor_metadata_matches(nv, rv, lambda: f"{desc()} output {i}")
+
+class Lit:
+    def __init__(self, s):
+        self.s = s
+
+    def __repr__(self):
+        return self.s
+
+def _fmt(a: object) -> object:
+    if isinstance(a, torch.Tensor):
+        return Lit(f"torch.empty_strided({tuple(a.size())}, {a.stride()}, dtype={a.dtype})")
+    else:
+        return a
+
+def make_crossref_functionalize(op, final_key):
+    from torch._subclasses.fake_tensor import FakeTensorMode
+    # This case is pretty weird, suppress it for now
+    if op == torch.ops.aten.lift_fresh.default:
+        return final_key
+
+    def handler(*args, **kwargs):
+        fake_mode = FakeTensorMode()
+
+        def fakeify_defun(t):
+            if isinstance(t, torch.Tensor):
+                if torch._is_functional_tensor(t):
+                    r = torch._from_functional_tensor(t)
+                    # NB: This assumes that the inner tensor sizes/strides match
+                    # the outer tensor sizes/strides.  This doesn't necessarily have to
+                    # be the case, see discussion at
+                    # https://github.com/pytorch/pytorch/pull/87610/files/401ddeda1d769bedc88a12de332c7357b60e51a4#r1007264456
+                    assert t.size() == r.size()
+                    assert t.stride() == r.stride()
+                else:
+                    r = t
+                # TODO: suppress guards
+                return fake_mode.from_tensor(r)
+            return t
+
+        def maybe_detach(t):
+            if isinstance(t, torch.Tensor):
+                return t.detach()
+            else:
+                return t
+
+        with suspend_functionalization():
+            f_args, f_kwargs = pytree.tree_map(fakeify_defun, (args, kwargs))
+            orig_f_args, orig_f_kwargs = pytree.tree_map(maybe_detach, (f_args, f_kwargs))
+            with fake_mode:
+                f_r = op(*f_args, **f_kwargs)
+        r = op._op_dk(final_key, *args, **kwargs)
+
+        def desc():
+            fmt_args = ", ".join(
+                itertools.chain(
+                    (repr(pytree.tree_map(_fmt, a)) for a in orig_f_args),
+                    (f"{k}={pytree.tree_map(_fmt, v)}" for k, v in orig_f_kwargs.items()),
+                )
+            )
+            return f"{op}({fmt_args})"
+        check_metadata_matches(f_r, r, desc)
+        return r
+    return handler
+
+# NB: enabling this is slow, don't do it in a hot loop.  This is purely
+# for debugging purposes.
+@contextmanager
+def enable_crossref_functionalize():
+    for op in all_known_overloads():
+        op._uncache_dispatch(torch._C.DispatchKey.Functionalize)
+    try:
+        with enable_python_dispatcher(), unittest.mock.patch(
+                'torch._dispatch.python.CROSSREF_FUNCTIONALIZE', True):
+            yield
+    finally:
+        for op in all_known_overloads():
+            op._uncache_dispatch(torch._C.DispatchKey.Functionalize)
diff --git a/torch/_ops.py b/torch/_ops.py
index b20398a7f3ab3..033d8f361eed7 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -335,10 +335,21 @@ def inner(fn):
 
         return inner
 
+    # Remove a dispatch key from the dispatch cache.  This will force it to get
+    # recomputed the next time.  Does nothing
+    # WARNING: if you register a dispatch key to py_kernels of an OpOverload,
+    # calling _del_dispatch on that key is NOT sufficient to apply your change,
+    # because a single registration may affect MULTIPLE dispatch keys (e.g.,
+    # registering Autograd affects AutogradCPU).  del_dispatch is to be used
+    # only if you are specifically modifying how get_dispatch handles a
+    # particular input 'key'.
+    def _uncache_dispatch(self, key):
+        self._dispatch_cache.pop(key, None)
+
     # This implements the pre-computation logic for the Python dispatcher.
     def _get_dispatch(self, key):
         # This is only called upon a cache miss
-        assert key not in self._dispatch_cache
+        assert key not in self._dispatch_cache, f"{self} {key}"
 
         if key == torch._C.DispatchKey.Python:
             if not self.python_key_mode_table:
@@ -365,6 +376,18 @@ def handler(*args, **kwargs):
             return handler
 
         final_key = resolve_key(self, key)
+
+        # TODO: We could potentially have lots of debugging wrappers against
+        # dispatch keys; design some general registration mechanism instead of
+        # having if statement for each of them
+        if key == torch._C.DispatchKey.Functionalize:
+            import torch._dispatch.python as pydispatch
+
+            if pydispatch.CROSSREF_FUNCTIONALIZE:
+                handler = pydispatch.make_crossref_functionalize(self, final_key)
+                self._dispatch_cache[key] = handler
+                return handler
+
         # print(self, key, final_key)
         r = self.py_kernels.get(final_key, final_key)
         self._dispatch_cache[key] = r
@@ -398,6 +421,7 @@ def __init__(self, qualified_op_name, op_name, op, overload_names):
         self.__name__ = op_name
         self._op = op
         self._overload_names = overload_names
+        self._dir = []
 
     # it's a no-op since OpOverloadPacket object is immutable and must be unique for a given op.
     def __deepcopy__(self, memo=None):
@@ -456,6 +480,7 @@ def __getattr__(self, key):
             overload = OpOverload(self, op_, op_dk_, schema, tags)
             # cache the overload object
             setattr(self, key, overload)
+            self._dir.append(key)
             return overload
         except RuntimeError:
             raise AttributeError(
@@ -464,6 +489,9 @@ def __getattr__(self, key):
                 )
             ) from None
 
+    def __iter__(self):
+        return iter(self._dir)
+
     def __call__(self, *args, **kwargs):
         # overloading __call__ to ensure torch.ops.foo.bar()
         # is still callable from JIT
@@ -515,6 +543,10 @@ class _OpNamespace(types.ModuleType):
     def __init__(self, name):
         super(_OpNamespace, self).__init__("torch.ops." + name)
         self.name = name
+        self._dir = []
+
+    def __iter__(self):
+        return iter(self._dir)
 
     def __getattr__(self, op_name):
         # It is not a valid op_name when __file__ is passed in
@@ -547,6 +579,7 @@ def __getattr__(self, op_name):
         # cache the opoverloadpacket to ensure that each op corresponds to
         # a unique OpOverloadPacket object
         setattr(self, op_name, opoverloadpacket)
+        self._dir.append(op_name)
         return opoverloadpacket
 
 
@@ -563,6 +596,7 @@ def __init__(self):
         super(_Ops, self).__init__("torch.ops")
         self.loaded_libraries = set()
         self.pyops = _PyOpNamespace()
+        self._dir = []
 
     def __getattr__(self, name):
         # Check if the name is a pyop
@@ -572,8 +606,12 @@ def __getattr__(self, name):
         # Here we are creating `torch.ops.my_namespace`
         namespace = _OpNamespace(name)
         setattr(self, name, namespace)
+        self._dir.append(name)
         return namespace
 
+    def __iter__(self):
+        return iter(self._dir)
+
     def load_library(self, path):
         """
         Loads a shared library from the given path into the current process.
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index a17dad4f2a92b..1fa4087d06f0e 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -150,13 +150,13 @@ def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=Fals
 
 
 def check_significant_strides(
-    a: TensorLikeType, b: TensorLikeType
+    a: TensorLikeType, b: TensorLikeType, *, only_cuda=True
 ) -> Tuple[bool, Optional[int]]:
     # NOTE: only on CUDA because CPU elementwise strides are incorrect in PyTorch
     # See https://github.com/pytorch/pytorch/issues/77553
     # Only compares strides that are "meaningful" -- strides for dimensions with length > 1
     # and for tensors with more than one element
-    if (a.device.type == "cuda" or b.device.type == "cuda") and a.numel() > 0:
+    if (not only_cuda or a.device.type == "cuda" or b.device.type == "cuda") and a.numel() > 0:
         for idx in range(a.ndim):
             if a.stride()[idx] != b.stride()[idx] and a.shape[idx] > 1:
                 return False, idx
diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp
index e4ce9ccf52175..a44cb3277352d 100644
--- a/torch/csrc/utils/python_dispatch.cpp
+++ b/torch/csrc/utils/python_dispatch.cpp
@@ -3,6 +3,7 @@
 
 #include <ATen/ATen.h>
 #include <ATen/FuncTorchTLS.h>
+#include <ATen/FunctionalTensorWrapper.h>
 #include <ATen/TensorSubclassLikeUtils.h>
 #include <ATen/core/PythonOpRegistrationTrampoline.h>
 #include <ATen/core/dispatch/Dispatcher.h>
@@ -466,6 +467,16 @@ void initDispatchBindings(PyObject* module) {
       [](c10::DispatchKey dispatch_key) {
         return c10::impl::tls_is_dispatch_key_excluded(dispatch_key);
       });
+  m.def(
+      "_dispatch_tls_set_dispatch_key_included",
+      [](c10::DispatchKey dispatch_key, bool desired_state) {
+        c10::impl::tls_set_dispatch_key_included(dispatch_key, desired_state);
+      });
+  m.def(
+      "_dispatch_tls_is_dispatch_key_included",
+      [](c10::DispatchKey dispatch_key) {
+        return c10::impl::tls_is_dispatch_key_included(dispatch_key);
+      });
 
   m.def("_dispatch_isTensorSubclassLike", [](const at::Tensor& tensor) {
     return at::isTensorSubclassLike(tensor);
@@ -551,6 +562,9 @@ void initDispatchBindings(PyObject* module) {
   m.def("_dispatch_tls_local_exclude_set", []() {
     return c10::impl::tls_local_dispatch_key_set().excluded_;
   });
+  m.def("_functionalization_reapply_views_tls", []() {
+    return at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
+  });
   m.def(
       "_dispatch_is_included_in_alias",
       [](c10::DispatchKey a, c10::DispatchKey b) {
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 2c72296d1e30f..fc8dff3809c6f 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -3583,6 +3583,32 @@ def check_bytes(byte_list):
     return torch.tensor(res, device=device, dtype=dtype)
 
 
+def copy_func(f):
+    """Based on http://stackoverflow.com/a/6528148/190597 (Glenn Maynard)"""
+    g = types.FunctionType(f.__code__, f.__globals__, name=f.__name__,
+                           argdefs=f.__defaults__,
+                           closure=f.__closure__)
+    g = functools.update_wrapper(g, f)
+    g.__kwdefaults__ = f.__kwdefaults__
+    return g
+
+
+def xfail_inherited_tests(tests):
+    """
+    Given a list of test names which are defined by a superclass of the
+    class this decorates, mark them as expected failure.  This is useful
+    if you are doing poor man's parameterized tests by subclassing a generic
+    test class.
+    """
+    def deco(cls):
+        for t in tests:
+            # NB: expectedFailure operates by mutating the method in question,
+            # which is why you have to copy the function first
+            setattr(cls, t, unittest.expectedFailure(copy_func(getattr(cls, t))))
+        return cls
+    return deco
+
+
 def sandcastle_skip_if(condition, reason):
     """
     Similar to unittest.skipIf, however in the sandcastle environment it just
diff --git a/torchgen/model.py b/torchgen/model.py
index d57d3372a159a..f511b8ffcb3b9 100644
--- a/torchgen/model.py
+++ b/torchgen/model.py
@@ -93,6 +93,7 @@ class DispatchKey(Enum):
     Batched = auto()
     VmapMode = auto()
     FuncTorchDynamicLayerFrontMode = auto()
+    Functionalize = auto()
     TESTING_ONLY_GenericWrapper = auto()
     TESTING_ONLY_GenericMode = auto()
 

From 302e8d295774abc94e446819d49315463d9956fe Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Mon, 21 Nov 2022 14:19:04 -0800
Subject: [PATCH 1198/1922] [quant] Add dequantize_per_channel in
 quantized_decomposed op library (#89269)

Summary:
att

Test Plan:
python test/test_quantization.py -k test_decomposed_dequantize_per_channel

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89269
Approved by: https://github.com/vkuzo
---
 .../core/test_quantized_tensor.py             | 24 +++++++++++++++++
 torch/ao/quantization/fx/_decomposed.py       | 26 +++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 3c19b957a0e7c..4e7ba6409b5b9 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -1539,6 +1539,30 @@ def test_decomposed_quantize_per_channel(self):
         self.assertEqual(quantized_decomposed_X.dtype, dtype)
         self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
 
+    def test_decomposed_dequantize_per_channel(self):
+        # register the ops
+        import torch.ao.quantization.fx._decomposed
+        X = torch.randn(5, 10)
+        qdtype = torch.quint8
+        dtype = torch.uint8
+        scales = torch.randn(5,)
+        zero_points = torch.randint(0, 100, (5,))
+        quant_min, quant_max = 0, 255
+        axis = 0
+
+        quantized_X = torch.quantize_per_channel(X, scales, zero_points, axis, qdtype)
+        dequantized_X = torch.dequantize(quantized_X)
+
+        quantized_decomposed_X = \
+            torch.ops.quantized_decomposed.quantize_per_channel(
+                X, scales, zero_points, axis, quant_min, quant_max, dtype)
+        dequantized_decomposed_X = \
+            torch.ops.quantized_decomposed.dequantize_per_channel(
+                quantized_decomposed_X, scales, zero_points, axis, quant_min, quant_max, dtype)
+
+        self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
+        self.assertEqual(dequantized_X, dequantized_decomposed_X)
+
 if __name__ == '__main__':
     raise RuntimeError("This test file is not meant to be run directly, use:\n\n"
                        "\tpython test/test_quantization.py TESTNAME\n\n"
diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py
index da793c799a7ae..3b80dfdf2a64d 100644
--- a/torch/ao/quantization/fx/_decomposed.py
+++ b/torch/ao/quantization/fx/_decomposed.py
@@ -110,6 +110,7 @@ def _permute_to_axis_zero(x, axis):
 @impl(quantized_decomposed_lib, "quantize_per_channel", "CompositeExplicitAutograd")
 def quantize_per_channel(input, scales, zero_points, axis, quant_min, quant_max, dtype):
     assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    assert axis < input.dim(), f"Expecting axis to be < {input.dim()}"
     _quant_min_max_bounds_check(quant_min, quant_max, dtype)
     input, permute_axis_list = _permute_to_axis_zero(input, axis)
     res = torch.zeros_like(input)
@@ -123,3 +124,28 @@ def quantize_per_channel(input, scales, zero_points, axis, quant_min, quant_max,
 
     out = res.permute(tuple(permute_axis_list))
     return out.to(dtype)
+
+# Note: quant_min/quant_max/dtype are not used in the operator, but for now it's kept in
+# the signature as metadata for the input Tensor, this might be useful for pattern
+# matching in the future
+# We will revisit this later if we found there are no use cases for it
+quantized_decomposed_lib.define(
+    "dequantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "dequantize_per_channel", "CompositeExplicitAutograd")
+def dequantize_per_channel(input, scales, zero_points, axis, quant_min, quant_max, dtype):
+    assert input.dtype == dtype, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    assert axis < input.dim(), f"Expecting axis to be < {input.dim()}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    input, permute_axis_list = _permute_to_axis_zero(input, axis)
+    res = torch.zeros_like(input, dtype=torch.float32)
+
+    for i in range(input.size(0)):
+        # TODO: investigate why
+        # (input[i] - zero_points[i]).to(torch.float32) * scales[i]
+        # failed the test
+        res[i] = (input[i].to(torch.float32) - zero_points[i]) * scales[i]
+
+    out = res.permute(tuple(permute_axis_list))
+    return out

From 6c42c906353f58aef864d3512cbd34782b3a2732 Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Wed, 23 Nov 2022 00:49:43 +0100
Subject: [PATCH 1199/1922] Symintify `select` (#89326)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89326
Approved by: https://github.com/ezyang
---
 .github/ci_commit_pins/xla.txt                |  2 +-
 aten/src/ATen/FunctionalInverses.cpp          |  5 +--
 aten/src/ATen/functorch/BatchRulesViews.cpp   |  8 ++---
 aten/src/ATen/native/NonSymbolicBC.h          |  1 +
 aten/src/ATen/native/TensorShape.cpp          | 35 ++++++++++---------
 aten/src/ATen/native/native_functions.yaml    | 24 ++++++-------
 .../native/nested/NestedTensorBackward.cpp    |  6 ++--
 tools/autograd/derivatives.yaml               | 14 ++++----
 .../lazy/ts_backend/ts_native_functions.cpp   |  2 +-
 9 files changed, 51 insertions(+), 46 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index f680f0ddccb20..41b0981f86c67 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-50855d7babfa7970cba18528c659989b91c83824
+8c2a3c41592aee25dffcf48933e7cbdc5c3fc91c
diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp
index ed1026152e32c..2bdc76c7764af 100644
--- a/aten/src/ATen/FunctionalInverses.cpp
+++ b/aten/src/ATen/FunctionalInverses.cpp
@@ -159,10 +159,11 @@ Tensor FunctionalInverses::_reshape_alias_copy_inverse(const Tensor& base, const
     }
 }
 
-Tensor FunctionalInverses::select_copy_int_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim, int64_t index) {
+Tensor FunctionalInverses::select_copy_int_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views, int64_t dim, c10::SymInt index) {
     // Pessimism: we can't reapply views for slice_scatter.
-    return base.select_scatter(mutated_view, dim, index);
+    return base.select_scatter_symint(mutated_view, dim, index);
 }
+
 Tensor FunctionalInverses::detach_copy_inverse(const Tensor& base, const Tensor& mutated_view, bool reapply_views) {
     // the functionalization pass doesn't care about autograd metadata - as a view, I think detach() is just an identity function
     return mutated_view;
diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp
index 5eb18f71dd11f..98eaf0f387a6e 100644
--- a/aten/src/ATen/functorch/BatchRulesViews.cpp
+++ b/aten/src/ATen/functorch/BatchRulesViews.cpp
@@ -275,14 +275,14 @@ std::tuple<std::vector<Tensor>, optional<int64_t>> chunk_batching_rule(const Ten
   return std::make_tuple(at::chunk(self_, chunks, new_dim), 0);
 }
 
-std::tuple<Tensor, optional<int64_t>> select_batching_rule(const Tensor& self, optional<int64_t> bdim, int64_t dim, int64_t index) {
+std::tuple<Tensor, optional<int64_t>> select_batching_rule(const Tensor& self, optional<int64_t> bdim, int64_t dim, c10::SymInt index) {
   if (!bdim) {
-    return std::make_tuple(self.select(dim, index), nullopt);
+    return std::make_tuple(self.select_symint(dim, index), nullopt);
   }
 
   auto _self = moveBatchDimToFront(self, bdim);
   auto dim_physical = getPhysicalDim(_self, true, dim);
-  auto result = _self.select(dim_physical, index);
+  auto result = _self.select_symint(dim_physical, index);
   return std::make_tuple(result, 0);
 }
 
@@ -402,7 +402,7 @@ std::tuple<Tensor, optional<int64_t>> permute_batching_rule(
 
 std::tuple<Tensor,optional<int64_t>> select_backward_batch_rule(
     const Tensor& grad_input, optional<int64_t> grad_input_bdim,
-    SymIntArrayRef input_sizes, int64_t dim, int64_t index) {
+    c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) {
   auto logical_rank = rankWithoutBatchDim(grad_input, grad_input_bdim);
   auto grad_input_ = moveBatchDimToFront(grad_input, grad_input_bdim);
   dim = maybe_wrap_dim(dim, logical_rank + 1) + 1;
diff --git a/aten/src/ATen/native/NonSymbolicBC.h b/aten/src/ATen/native/NonSymbolicBC.h
index f57c868f345f1..0b942efb52c3b 100644
--- a/aten/src/ATen/native/NonSymbolicBC.h
+++ b/aten/src/ATen/native/NonSymbolicBC.h
@@ -22,5 +22,6 @@ TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, con
 TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim);
 TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes);
 TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index);
+TORCH_API at::Tensor select(const at::Tensor& self, int64_t dim, int64_t index);
 TORCH_API std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim);
 }}
diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp
index c44f3a921afc1..f2ee31fe0bcdb 100644
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@@ -1728,22 +1728,29 @@ QuantizerPtr create_subtensor_quantizer(const Tensor& self, bool is_select, int6
   return quantizer;
 }
 
-Tensor select(const Tensor& self, int64_t dim, int64_t index_) {
+Tensor select(const Tensor& self, int64_t dim, int64_t index) {
+  return at::select_symint(self, dim, c10::SymInt{index});
+}
+
+Tensor select(const Tensor& self, Dimname dim, int64_t index) {
+  return at::select_symint(self, dimname_to_position(self, dim), c10::SymInt{index});
+}
+
+Tensor select_symint(const Tensor& self, int64_t dim, c10::SymInt index) {
   int64_t ndim = self.dim();
   if (ndim == 0) {
     TORCH_CHECK_INDEX(false, "select() cannot be applied to a 0-dim tensor.");
   }
   dim = maybe_wrap_dim(dim, ndim);
   auto size = self.sym_sizes()[dim];
-  if (size < -index_ || size <= index_) {
+  if (size < -index || size <= index) {
     if (self.has_names() && self.names()[dim] != Dimname::wildcard()) {
-      TORCH_CHECK_INDEX(false, "select(): index ", index_, " out of range for tensor of size ",
+      TORCH_CHECK_INDEX(false, "select(): index ", index, " out of range for tensor of size ",
                      self.sizes(), " at dimension ", self.names()[dim]);
     }
-    TORCH_CHECK_INDEX(false, "select(): index ", index_, " out of range for tensor of size ",
+    TORCH_CHECK_INDEX(false, "select(): index ", index, " out of range for tensor of size ",
                    self.sizes(), " at dimension ", dim);
   }
-  SymInt index = index_;
   if (index < 0) {
     index += size;
   }
@@ -1776,13 +1783,9 @@ Tensor select(const Tensor& self, int64_t dim, int64_t index_) {
   return result;
 }
 
-Tensor select(const Tensor& self, Dimname dim, int64_t index) {
-  return at::select(self, dimname_to_position(self, dim), index);
-}
-
-Tensor select_backward(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, int64_t index) {
-  auto grad_input = at::zeros(input_sizes, grad.options());
-  grad_input.select(dim, index).copy_(grad);
+Tensor select_backward_symint(const Tensor& grad, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) {
+  auto grad_input = at::zeros_symint(input_sizes, grad.options());
+  grad_input.select_symint(dim, index).copy_(grad);
   return grad_input;
 }
 
@@ -3789,9 +3792,9 @@ at::Tensor slice_scatter(const at::Tensor& self, const at::Tensor& src, int64_t
     slice.copy_(src);
     return output;
 }
-at::Tensor select_scatter(const at::Tensor& self, const at::Tensor& src, int64_t dim, int64_t index) {
+at::Tensor select_scatter_symint(const at::Tensor& self, const at::Tensor& src, int64_t dim, c10::SymInt index) {
     auto output = self.clone();
-    auto slice = output.select(dim, index);
+    auto slice = output.select_symint(dim, index);
     TORCH_CHECK(slice.sizes() == src.sizes(), "expected src to have a size equal to the slice of self. src size = ", src.sizes(), ", slice size = ", slice.sizes());
     slice.copy_(src);
     return output;
@@ -3931,8 +3934,8 @@ at::Tensor& _reshape_alias_copy_out(const at::Tensor & self, at::IntArrayRef siz
 }
 
 
-at::Tensor& select_copy_int_out(const at::Tensor & self, int64_t dim, int64_t index, at::Tensor & out) {
-  auto tmp = self.select(dim, index);
+at::Tensor& select_copy_symint_out(const at::Tensor & self, int64_t dim, c10::SymInt index, at::Tensor & out) {
+  auto tmp = self.select_symint(dim, index);
   out.copy_(tmp);
   return out;
 }
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 6c9a96937a3f1..3917be0014b44 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4567,29 +4567,29 @@
   device_check: NoCheck
   device_guard: False
 
-- func: select.int(Tensor(a) self, int dim, int index) -> Tensor(a)
+- func: select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a)
   variants: function, method
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CompositeExplicitAutograd: select
+    CompositeExplicitAutograd: select_symint
     SparseCsrCPU, SparseCsrCUDA: select_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: select_nested
 
-- func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, int index) -> Tensor
+- func: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
   variants: function
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CompositeExplicitAutogradNonFunctional: select_backward
+    CompositeExplicitAutogradNonFunctional: select_backward_symint
   autogen: select_backward.out
 
-- func: _nested_select_backward(Tensor grad_output, Tensor self, int dim, int index) -> Tensor
+- func: _nested_select_backward(Tensor grad_output, Tensor self, int dim, SymInt index) -> Tensor
   variants: function
   device_check: NoCheck
   device_guard: False
   dispatch:
-    NestedTensorCPU, NestedTensorCUDA: _nested_select_backward
+    NestedTensorCPU, NestedTensorCUDA: _nested_select_backward_symint
 
 - func: selu(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -4828,12 +4828,12 @@
   autogen: slice_scatter.out
   tags: canonical
 
-- func: select_scatter(Tensor self, Tensor src, int dim, int index) -> Tensor
+- func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
   variants: function, method
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CompositeExplicitAutograd: select_scatter
+    CompositeExplicitAutograd: select_scatter_symint
   autogen: select_scatter.out
 
 - func: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
@@ -12885,10 +12885,10 @@
     CompositeExplicitAutogradNonFunctional: _reshape_alias_copy_symint
   tags: view_copy
 
-- func: select_copy.int(Tensor self, int dim, int index) -> Tensor
+- func: select_copy.int(Tensor self, int dim, SymInt index) -> Tensor
   variants: function
   dispatch:
-    CompositeExplicitAutogradNonFunctional: select_copy_int
+    CompositeExplicitAutogradNonFunctional: select_copy_symint
   tags: view_copy
 
 - func: detach_copy(Tensor self) -> Tensor
@@ -13097,10 +13097,10 @@
     CompositeExplicitAutograd: _reshape_alias_copy_out
 
 
-- func: select_copy.int_out(Tensor self, int dim, int index, *, Tensor(a!) out) -> Tensor(a!)
+- func: select_copy.int_out(Tensor self, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   dispatch:
-    CompositeExplicitAutograd: select_copy_int_out
+    CompositeExplicitAutograd: select_copy_symint_out
 
 
 - func: detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
diff --git a/aten/src/ATen/native/nested/NestedTensorBackward.cpp b/aten/src/ATen/native/nested/NestedTensorBackward.cpp
index 0807e39e952d3..51a4210a56ae5 100644
--- a/aten/src/ATen/native/nested/NestedTensorBackward.cpp
+++ b/aten/src/ATen/native/nested/NestedTensorBackward.cpp
@@ -154,18 +154,18 @@ Tensor _nested_sum_backward_cpu(
 }
 
 
-Tensor _nested_select_backward(
+Tensor _nested_select_backward_symint(
   const Tensor& grad,
   const Tensor& nested_self,
   int64_t dim,
-  int64_t index) {
+  c10::SymInt index) {
   auto nt_self = get_nested_tensor_impl(nested_self);
   const Tensor& self_buffer = nt_self->get_buffer();
   const auto self_sizes = nt_self->get_nested_size_tensor();
   const Tensor& self_grad_buffer = self_buffer.new_zeros(self_buffer.sizes());
 
   auto nt_grad = wrap_buffer(self_grad_buffer, self_sizes);
-  nt_grad.select(dim, index).copy_(grad);
+  nt_grad.select_symint(dim, index).copy_(grad);
 
   return nt_grad;
 }
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 52c0f76bf0708..6e1b456316826 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1379,16 +1379,16 @@
   src: grad.gather(dim, index)
   result: scatter_add(self_t, dim, index, src_t)
 
-- name: select.int(Tensor(a) self, int dim, int index) -> Tensor(a)
+- name: select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a)
   dispatch:
     Default:
       self: select_backward_symint(grad, self.sym_sizes(), dim, index)
       result: auto_linear
     AutogradNestedTensor:
-      self: _nested_select_backward(grad, self, dim, index)
+      self: _nested_select_backward_symint(grad, self, dim, index)
 
-- name: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, int index) -> Tensor
-  grad_output: grad.select(dim, index)
+- name: select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
+  grad_output: grad.select_symint(dim, index)
   result: auto_linear
 
 - name: sigmoid(Tensor self) -> Tensor
@@ -1435,9 +1435,9 @@
   src: grad.slice_symint(dim, start, end, step)
   result: auto_linear
 
-- name: select_scatter(Tensor self, Tensor src, int dim, int index) -> Tensor
-  self: select_scatter(grad, zeros_like(src), dim, index)
-  src: grad.select(dim, index)
+- name: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
+  self: select_scatter_symint(grad, zeros_like(src), dim, index)
+  src: grad.select_symint(dim, index)
   result: auto_linear
 
 - name: diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
diff --git a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp
index 1bdc0aca8d9af..95eb0c9be455d 100644
--- a/torch/csrc/lazy/ts_backend/ts_native_functions.cpp
+++ b/torch/csrc/lazy/ts_backend/ts_native_functions.cpp
@@ -470,7 +470,7 @@ at::Tensor LazyNativeFunctions::select_backward_symint(
     const at::Tensor& grad_output,
     c10::SymIntArrayRef input_sizes,
     int64_t dim,
-    int64_t index) {
+    c10::SymInt index) {
   return at::functionalization::functionalize_aten_op_symint<ATEN_OP(
       select_backward)>::call(grad_output, input_sizes, dim, index);
 }

From ef97a253a1d33f252b0059489f61e37079984168 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 23 Nov 2022 06:50:05 +0000
Subject: [PATCH 1200/1922] Fix Use-after-Free in
 qembeddingbag_byte_prepack_out (#84750)

When FBGEMM is not used (either manually disabled or on platforms such as POWER where it isn't supported at all) the fallback code requests a `data_ptr<float>` on a `Tensor` object returned by `to(ScalarType::Float)` in the same line. This object will be destroyed at the end of the line leading to a dangling pointer.

On some platforms this manifests in wrong results being returned as the memory gets overwritten. On other platforms anything may happen due to this being undefined behavior, although most likely it will just crash or continue to return semi-random results which may even happen to be correct (when the memory is not reused yet)

Fix this by binding the temporary object (or initial object) to a const value reference which extents its lifetime and getting the `data_ptr` from that.

Fixes #84748

This bug was introduced by a seemingly unrelated change in #64081 hence ccing @d1jang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84750
Approved by: https://github.com/kimishpatel
---
 .../ATen/native/quantized/cpu/qembeddingbag_prepack.cpp    | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
index 00a3a9b10e96a..dab19e0908e35 100644
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
@@ -266,9 +266,10 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
   }
 
 #else
-  const auto weight_data = weight_contig->scalar_type() == at::ScalarType::Half
-      ? weight_contig->to(at::ScalarType::Float).data_ptr<float>()
-      : weight_contig->data_ptr<float>();
+  const Tensor& float_weight = weight_contig->scalar_type() == at::ScalarType::Half
+    ? weight_contig->to(at::ScalarType::Float)
+    : *weight_contig;
+  const auto weight_data = float_weight.data_ptr<float>();
   constexpr float kEpsilon = 1e-8f;
   for (auto row : c10::irange(embedding_rows)) {
     const float* input_row = weight_data + row * embedding_cols;

From ef17c288360125b6e974038770ff952c9d208add Mon Sep 17 00:00:00 2001
From: Kshiteej K <kshitijkalambarkar@gmail.com>
Date: Wed, 23 Nov 2022 08:30:51 +0000
Subject: [PATCH 1201/1922] [test_nn] move init tests from test_nn (#89202)

Ref: https://github.com/pytorch/pytorch/issues/63085

Note: Doesn't need corresponding XLA PR as the migrated tests were not run on XLA (as they weren't in TestNNDeviceType).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89202
Approved by: https://github.com/albanD
---
 test/nn/test_init.py | 420 +++++++++++++++++++++++++++++++++++++++++++
 test/test_nn.py      | 410 +-----------------------------------------
 2 files changed, 421 insertions(+), 409 deletions(-)
 create mode 100644 test/nn/test_init.py

diff --git a/test/nn/test_init.py b/test/nn/test_init.py
new file mode 100644
index 0000000000000..9e72c1040a55a
--- /dev/null
+++ b/test/nn/test_init.py
@@ -0,0 +1,420 @@
+# Owner(s): ["module: nn"]
+import random
+import unittest
+import math
+import string
+from functools import reduce
+from operator import mul
+
+from torch.testing._internal.common_utils import TestCase, TEST_SCIPY, skipIfNoLapack
+import torch
+import torch.nn.init as init
+import torch.nn.functional as F
+
+if TEST_SCIPY:
+    from scipy import stats
+
+class TestNNInit(TestCase):
+    def setUp(self):
+        super(TestNNInit, self).setUp()
+        random.seed(123)
+
+    def _is_normal(self, tensor, mean, std):
+        samples = tensor.view(-1).tolist()
+        p_value = stats.kstest(samples, 'norm', args=(mean, std))[1]
+        return p_value > 0.0001
+
+    def _is_trunc_normal(self, tensor, mean, std, a, b):
+        # scipy's trunc norm is suited for data drawn from N(0, 1),
+        # so we need to transform our data to test it using scipy.
+        z_samples = (tensor.view(-1) - mean) / std
+        z_samples = z_samples.tolist()
+        a0 = (a - mean) / std
+        b0 = (b - mean) / std
+        p_value = stats.kstest(z_samples, 'truncnorm', args=(a0, b0))[1]
+        return p_value > 0.0001
+
+    def _is_uniform(self, tensor, a, b):
+        samples = tensor.view(-1).tolist()
+        p_value = stats.kstest(samples, 'uniform', args=(a, (b - a)))[1]
+        return p_value > 0.0001
+
+    def _create_random_nd_tensor(self, dims, size_min, size_max):
+        size = [random.randint(size_min, size_max) for _ in range(dims)]
+        tensor = torch.zeros(size)
+        return tensor
+
+    def _random_float(self, a, b):
+        return (b - a) * random.random() + a
+
+    def test_calculate_gain_linear(self):
+        for fn in ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose2d', 'conv_transpose2d', 'conv_transpose3d']:
+            gain = init.calculate_gain(fn)
+            self.assertEqual(gain, 1)
+
+    def test_calculate_gain_nonlinear(self):
+        for fn in ['sigmoid', 'tanh', 'relu', 'leaky_relu']:
+            gain = init.calculate_gain(fn)
+            if fn == 'sigmoid':
+                self.assertEqual(gain, 1)
+            elif fn == 'tanh':  # 5 / 3
+                self.assertEqual(gain, 1.6666666666666667)
+            elif fn == 'relu':  # sqrt(2)
+                self.assertEqual(gain, 1.4142135623730951)
+            elif fn == 'leaky_relu':  # sqrt(2 / 1 + slope^2))
+                self.assertEqual(gain, 1.4141428569978354)
+            elif fn == 'selu':
+                self.assertEqual(gain, 0.75)
+
+    def test_calculate_gain_leaky_relu(self):
+        for param in [None, 0, 0.01, 10]:
+            gain = init.calculate_gain('leaky_relu', param)
+            if param is None:  # Default slope is 0.01
+                self.assertEqual(gain, 1.4141428569978354)
+            elif param == 0:  # No slope = same gain as normal ReLU
+                self.assertEqual(gain, 1.4142135623730951)
+            elif param == 0.01:
+                self.assertEqual(gain, 1.4141428569978354)
+            elif param == 10:
+                self.assertEqual(gain, 0.14071950894605836)
+
+    def test_calculate_gain_leaky_relu_only_accepts_numbers(self):
+        for param in [True, [1], {'a': 'b'}]:
+            with self.assertRaises(ValueError):
+                init.calculate_gain('leaky_relu', param)
+
+    def test_calculate_gain_only_accepts_valid_nonlinearities(self):
+        for n in [2, 5, 25]:
+            # Generate random strings of lengths that definitely aren't supported
+            random_string = ''.join([random.choice(string.ascii_lowercase) for i in range(n)])
+            with self.assertRaises(ValueError):
+                init.calculate_gain(random_string)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
+    def test_uniform(self):
+        for dims in [1, 2, 4]:
+            input_tensor = self._create_random_nd_tensor(dims, size_min=30, size_max=50)
+            a = self._random_float(-3, 3)
+            b = a + self._random_float(1, 5)
+            init.uniform_(input_tensor, a=a, b=b)
+            assert self._is_uniform(input_tensor, a, b)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
+    def test_normal(self):
+        for dims in [1, 2, 4]:
+            input_tensor = self._create_random_nd_tensor(dims, size_min=30, size_max=50)
+            mean = self._random_float(-3, 3)
+            std = self._random_float(1, 5)
+            init.normal_(input_tensor, mean=mean, std=std)
+
+            assert self._is_normal(input_tensor, mean, std)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
+    def test_trunc_normal(self):
+        for dims in [1, 2, 4]:
+            input_tensor = self._create_random_nd_tensor(dims, size_min=30, size_max=50)
+            mean = self._random_float(-3, 3)
+            std = self._random_float(.01, 1)
+            a = self._random_float(mean - 2 * std, mean)
+            b = self._random_float(mean, mean + 2 * std)
+            init.trunc_normal_(input_tensor, mean=mean, std=std, a=a, b=b)
+
+            assert self._is_trunc_normal(input_tensor, mean, std, a, b)
+
+    def test_constant(self):
+        for dims in [1, 2, 4]:
+            input_tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=5)
+            val = self._random_float(1, 10)
+            init.constant_(input_tensor, val)
+
+            self.assertEqual(input_tensor, input_tensor.clone().fill_(val))
+
+    def test_ones_and_zeros(self):
+        for init_fn_, val in zip([init.ones_, init.zeros_], [1, 0]):
+            for dims in [1, 2, 4]:
+                input_tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=5)
+                init_fn_(input_tensor)
+
+                self.assertEqual(input_tensor, input_tensor.clone().fill_(val))
+
+    def test_eye(self):
+        input_tensor = self._create_random_nd_tensor(2, size_min=1, size_max=5)
+        init.eye_(input_tensor)
+
+        # Check every single element
+        for i in range(input_tensor.size(0)):
+            for j in range(input_tensor.size(1)):
+                if i == j:
+                    assert input_tensor[i][j] == 1
+                else:
+                    assert input_tensor[i][j] == 0
+
+    def test_eye_only_works_on_2d_inputs(self):
+        for dims in [1, 3]:
+            with self.assertRaises(ValueError):
+                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=3)
+                init.eye_(tensor)
+
+    def test_dirac_properties(self):
+        for dims in [3, 4, 5]:
+            for groups in [1, 2, 3]:
+                # prepare random tensor with random sizes, but fits groups
+                a, c, d, e = (random.randint(1, 5) for _ in range(4))
+                b = random.randint(1, 5 * groups)  # same range as a*groups but all range allowed
+                # make sure first dim divides by groups
+                input_tensor = torch.randn((a * groups, b, c, d, e)[:dims])
+
+                init.dirac_(input_tensor, groups)
+
+                c_out, c_in = input_tensor.size(0) // groups, input_tensor.size(1)
+                min_d = min(c_out, c_in)
+                # Check number of nonzeros is equivalent to smallest dim (for each group)
+                assert torch.nonzero(input_tensor).size(0) == min_d * groups
+                # Check sum of values (can have precision issues, hence assertEqual) is also equivalent
+                self.assertEqual(input_tensor.sum(), min_d * groups)
+
+
+    def test_dirac_identity(self):
+        for groups in [1, 3]:
+            batch, in_c, out_c, size, kernel_size = 8, 3, 9, 5, 3  # in_c, out_c must divide by groups
+            eff_out_c = out_c // groups
+
+            # Test 1D
+            input_var = torch.randn(batch, in_c, size)
+            filter_var = torch.zeros(eff_out_c, in_c, kernel_size)
+            filter_var = torch.cat([filter_var] * groups)
+            init.dirac_(filter_var, groups)
+            output_var = F.conv1d(input_var, filter_var)
+            input_tensor, output_tensor = input_var.data, output_var.data  # Variables do not support nonzero
+            for g in range(groups):
+                # Assert in_c outputs are preserved (per each group)
+                self.assertEqual(input_tensor[:, :, 1:-1],
+                                 output_tensor[:, eff_out_c * g:eff_out_c * g + in_c, :])
+                # Assert extra outputs are 0
+                assert torch.nonzero(output_tensor[:, eff_out_c * g + in_c:eff_out_c * (g + 1), :]).numel() == 0
+
+            # Test 2D
+            input_var = torch.randn(batch, in_c, size, size)
+            filter_var = torch.zeros(eff_out_c, in_c, kernel_size, kernel_size)
+            filter_var = torch.cat([filter_var] * groups)
+            init.dirac_(filter_var, groups)
+            output_var = F.conv2d(input_var, filter_var)
+            input_tensor, output_tensor = input_var.data, output_var.data  # Variables do not support nonzero
+            for g in range(groups):
+                # Assert in_c outputs are preserved (per each group)
+                self.assertEqual(input_tensor[:, :, 1:-1, 1:-1],
+                                 output_tensor[:, eff_out_c * g:eff_out_c * g + in_c, :, :])
+                # Assert extra outputs are 0
+                assert torch.nonzero(output_tensor[:, eff_out_c * g + in_c:eff_out_c * (g + 1), :, :]).numel() == 0
+
+            # Test 3D
+            input_var = torch.randn(batch, in_c, size, size, size)
+            filter_var = torch.zeros(eff_out_c, in_c, kernel_size, kernel_size, kernel_size)
+            filter_var = torch.cat([filter_var] * groups)
+            init.dirac_(filter_var, groups)
+            output_var = F.conv3d(input_var, filter_var)
+            input_tensor, output_tensor = input_var.data, output_var.data
+            for g in range(groups):
+                # Assert in_c outputs are preserved (per each group)
+                self.assertEqual(input_tensor[:, :, 1:-1, 1:-1, 1:-1],
+                                 output_tensor[:, eff_out_c * g:eff_out_c * g + in_c, :, :, :])
+                # Assert extra outputs are 0
+                assert torch.nonzero(output_tensor[:, eff_out_c * g + in_c:eff_out_c * (g + 1), :, :, :]).numel() == 0
+
+    def test_dirac_only_works_on_3_4_5d_inputs(self):
+        for dims in [1, 2, 6]:
+            with self.assertRaises(ValueError):
+                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=3)
+                init.dirac_(tensor)
+
+    def test_xavier_uniform_errors_on_inputs_smaller_than_2d(self):
+        for dims in [0, 1]:
+            tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=1)
+            with self.assertRaises(ValueError):
+                init.xavier_uniform_(tensor)
+
+    def test_xavier_normal_errors_on_inputs_smaller_than_2d(self):
+        for dims in [0, 1]:
+            tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=1)
+            with self.assertRaises(ValueError):
+                init.xavier_normal_(tensor)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
+    def test_xavier_uniform(self):
+        for use_gain in [True, False]:
+            for dims in [2, 4]:
+                input_tensor = self._create_random_nd_tensor(dims, size_min=20, size_max=25)
+                gain = 1
+
+                if use_gain:
+                    gain = self._random_float(0.1, 2)
+                    init.xavier_uniform_(input_tensor, gain=gain)
+                else:
+                    init.xavier_uniform_(input_tensor)
+
+                fan_in = input_tensor.size(1)
+                fan_out = input_tensor.size(0)
+                if input_tensor.dim() > 2:
+                    fan_in *= input_tensor[0, 0].numel()
+                    fan_out *= input_tensor[0, 0].numel()
+
+                expected_std = gain * math.sqrt(2.0 / (fan_in + fan_out))
+                bounds = expected_std * math.sqrt(3)
+                assert self._is_uniform(input_tensor, -bounds, bounds)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
+    def test_xavier_normal(self):
+        for use_gain in [True, False]:
+            for dims in [2, 4]:
+                input_tensor = self._create_random_nd_tensor(dims, size_min=20, size_max=25)
+                gain = 1
+
+                if use_gain:
+                    gain = self._random_float(0.1, 2)
+                    init.xavier_normal_(input_tensor, gain=gain)
+                else:
+                    init.xavier_normal_(input_tensor)
+
+                fan_in = input_tensor.size(1)
+                fan_out = input_tensor.size(0)
+                if input_tensor.dim() > 2:
+                    fan_in *= input_tensor[0, 0].numel()
+                    fan_out *= input_tensor[0, 0].numel()
+
+                expected_std = gain * math.sqrt(2.0 / (fan_in + fan_out))
+                assert self._is_normal(input_tensor, 0, expected_std)
+
+    def test_kaiming_uniform_errors_on_inputs_smaller_than_2d(self):
+        for dims in [0, 1]:
+            with self.assertRaises(ValueError):
+                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=1)
+                init.kaiming_uniform_(tensor)
+
+    def test_kaiming_normal_errors_on_inputs_smaller_than_2d(self):
+        for dims in [0, 1]:
+            with self.assertRaises(ValueError):
+                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=1)
+                init.kaiming_normal_(tensor)
+
+    def test_kaiming_uniform_warning_on_0element_tensor(self):
+        tensor = torch.empty(0, 1)
+        with self.assertWarnsRegex(UserWarning, "Initializing zero-element tensors is a no-op"):
+            _ = init.kaiming_uniform_(tensor)
+
+    def test_kaiming_normal_warning_on_0element_tensor(self):
+        tensor = torch.empty(0, 1)
+        with self.assertWarnsRegex(UserWarning, "Initializing zero-element tensors is a no-op"):
+            _ = init.kaiming_normal_(tensor)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
+    def test_kaiming_uniform(self):
+        for use_a in [True, False]:
+            for dims in [2, 4]:
+                for mode in ['fan_in', 'fan_out']:
+                    input_tensor = self._create_random_nd_tensor(dims, size_min=20, size_max=25)
+                    if use_a:
+                        a = self._random_float(0.1, 2)
+                        init.kaiming_uniform_(input_tensor, a=a, mode=mode)
+                    else:
+                        a = 0
+                        init.kaiming_uniform_(input_tensor, mode=mode)
+
+                    fan_in = input_tensor.size(1)
+                    fan_out = input_tensor.size(0)
+                    if input_tensor.dim() > 2:
+                        fan_in *= input_tensor[0, 0].numel()
+                        fan_out *= input_tensor[0, 0].numel()
+
+                    if mode == 'fan_in':
+                        n = fan_in
+                    else:
+                        n = fan_out
+
+                    expected_std = math.sqrt(2.0 / ((1 + a**2) * n))
+                    bounds = expected_std * math.sqrt(3.0)
+                    assert self._is_uniform(input_tensor, -bounds, bounds)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
+    def test_kaiming_normal(self):
+        for use_a in [True, False]:
+            for dims in [2, 4]:
+                for mode in ['fan_in', 'fan_out']:
+                    input_tensor = self._create_random_nd_tensor(dims, size_min=20, size_max=25)
+                    if use_a:
+                        a = self._random_float(0.1, 2)
+                        init.kaiming_normal_(input_tensor, a=a, mode=mode)
+                    else:
+                        a = 0
+                        init.kaiming_normal_(input_tensor, mode=mode)
+
+                    fan_in = input_tensor.size(1)
+                    fan_out = input_tensor.size(0)
+                    if input_tensor.dim() > 2:
+                        fan_in *= input_tensor[0, 0].numel()
+                        fan_out *= input_tensor[0, 0].numel()
+
+                    if mode == 'fan_in':
+                        n = fan_in
+                    else:
+                        n = fan_out
+
+                    expected_std = math.sqrt(2.0 / ((1 + a**2) * n))
+                    assert self._is_normal(input_tensor, 0, expected_std)
+
+    def test_sparse_only_works_on_2d_inputs(self):
+        for dims in [1, 3]:
+            with self.assertRaises(ValueError):
+                sparsity = self._random_float(0.1, 0.9)
+                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=3)
+                init.sparse_(tensor, sparsity)
+
+    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
+    def test_sparse_default_std(self):
+        for use_random_std in [True, False]:
+            input_tensor = self._create_random_nd_tensor(2, size_min=30, size_max=35)
+            rows, cols = input_tensor.size(0), input_tensor.size(1)
+            sparsity = self._random_float(0.1, 0.2)
+
+            std = 0.01  # default std
+            if use_random_std:
+                std = self._random_float(0.01, 0.2)
+                init.sparse_(input_tensor, sparsity=sparsity, std=std)
+            else:
+                init.sparse_(input_tensor, sparsity=sparsity)
+
+            for col_idx in range(input_tensor.size(1)):
+                column = input_tensor[:, col_idx]
+                assert column[column == 0].nelement() >= math.ceil(sparsity * rows)
+
+            assert self._is_normal(input_tensor[input_tensor != 0], 0, std)
+
+    @skipIfNoLapack
+    def test_orthogonal(self):
+        for use_gain in [True, False]:
+            for tensor_size in [[3, 4], [4, 3], [20, 2, 3, 4], [2, 3, 4, 5]]:
+                input_tensor = torch.zeros(tensor_size)
+                gain = 1.0
+
+                if use_gain:
+                    gain = self._random_float(0.1, 2)
+                    init.orthogonal_(input_tensor, gain=gain)
+                else:
+                    init.orthogonal_(input_tensor)
+
+                rows, cols = tensor_size[0], reduce(mul, tensor_size[1:])
+                flattened_tensor = input_tensor.view(rows, cols)
+                if rows > cols:
+                    self.assertEqual(torch.mm(flattened_tensor.t(), flattened_tensor),
+                                     torch.eye(cols) * gain ** 2, atol=1e-6, rtol=0)
+                else:
+                    self.assertEqual(torch.mm(flattened_tensor, flattened_tensor.t()),
+                                     torch.eye(rows) * gain ** 2, atol=1e-6, rtol=0)
+
+    def test_deprecation(self):
+        x = torch.randn(3, 3)
+
+        def fn():
+            init.normal(x)
+
+        with self.assertWarnsRegex(UserWarning, 'deprecated', msg='methods not suffixed with underscore should be deprecated'):
+            fn()
diff --git a/test/test_nn.py b/test/test_nn.py
index 25f85c60037b6..71cdb219040e9 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -3,7 +3,6 @@
 import contextlib
 import math
 import random
-import string
 import unittest
 import io
 import unittest.mock as mock
@@ -12,8 +11,7 @@
 import pickle
 from copy import deepcopy
 from itertools import product
-from functools import reduce, partial
-from operator import mul
+from functools import partial
 from collections import OrderedDict
 from tempfile import NamedTemporaryFile
 import weakref
@@ -73,7 +71,6 @@
 load_tests = load_tests
 
 if TEST_SCIPY:
-    from scipy import stats
     import scipy.signal
     import scipy.ndimage
 
@@ -10384,411 +10381,6 @@ def test_padding_list(self):
         y = net(x)
 
 
-class TestNNInit(TestCase):
-    def setUp(self):
-        super(TestNNInit, self).setUp()
-        random.seed(123)
-
-    def _is_normal(self, tensor, mean, std):
-        samples = tensor.view(-1).tolist()
-        p_value = stats.kstest(samples, 'norm', args=(mean, std))[1]
-        return p_value > 0.0001
-
-    def _is_trunc_normal(self, tensor, mean, std, a, b):
-        # scipy's trunc norm is suited for data drawn from N(0, 1),
-        # so we need to transform our data to test it using scipy.
-        z_samples = (tensor.view(-1) - mean) / std
-        z_samples = z_samples.tolist()
-        a0 = (a - mean) / std
-        b0 = (b - mean) / std
-        p_value = stats.kstest(z_samples, 'truncnorm', args=(a0, b0))[1]
-        return p_value > 0.0001
-
-    def _is_uniform(self, tensor, a, b):
-        samples = tensor.view(-1).tolist()
-        p_value = stats.kstest(samples, 'uniform', args=(a, (b - a)))[1]
-        return p_value > 0.0001
-
-    def _create_random_nd_tensor(self, dims, size_min, size_max):
-        size = [random.randint(size_min, size_max) for _ in range(dims)]
-        tensor = torch.zeros(size)
-        return tensor
-
-    def _random_float(self, a, b):
-        return (b - a) * random.random() + a
-
-    def test_calculate_gain_linear(self):
-        for fn in ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose2d', 'conv_transpose2d', 'conv_transpose3d']:
-            gain = init.calculate_gain(fn)
-            self.assertEqual(gain, 1)
-
-    def test_calculate_gain_nonlinear(self):
-        for fn in ['sigmoid', 'tanh', 'relu', 'leaky_relu']:
-            gain = init.calculate_gain(fn)
-            if fn == 'sigmoid':
-                self.assertEqual(gain, 1)
-            elif fn == 'tanh':  # 5 / 3
-                self.assertEqual(gain, 1.6666666666666667)
-            elif fn == 'relu':  # sqrt(2)
-                self.assertEqual(gain, 1.4142135623730951)
-            elif fn == 'leaky_relu':  # sqrt(2 / 1 + slope^2))
-                self.assertEqual(gain, 1.4141428569978354)
-            elif fn == 'selu':
-                self.assertEqual(gain, 0.75)
-
-    def test_calculate_gain_leaky_relu(self):
-        for param in [None, 0, 0.01, 10]:
-            gain = init.calculate_gain('leaky_relu', param)
-            if param is None:  # Default slope is 0.01
-                self.assertEqual(gain, 1.4141428569978354)
-            elif param == 0:  # No slope = same gain as normal ReLU
-                self.assertEqual(gain, 1.4142135623730951)
-            elif param == 0.01:
-                self.assertEqual(gain, 1.4141428569978354)
-            elif param == 10:
-                self.assertEqual(gain, 0.14071950894605836)
-
-    def test_calculate_gain_leaky_relu_only_accepts_numbers(self):
-        for param in [True, [1], {'a': 'b'}]:
-            with self.assertRaises(ValueError):
-                init.calculate_gain('leaky_relu', param)
-
-    def test_calculate_gain_only_accepts_valid_nonlinearities(self):
-        for n in [2, 5, 25]:
-            # Generate random strings of lengths that definitely aren't supported
-            random_string = ''.join([random.choice(string.ascii_lowercase) for i in range(n)])
-            with self.assertRaises(ValueError):
-                init.calculate_gain(random_string)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
-    def test_uniform(self):
-        for dims in [1, 2, 4]:
-            input_tensor = self._create_random_nd_tensor(dims, size_min=30, size_max=50)
-            a = self._random_float(-3, 3)
-            b = a + self._random_float(1, 5)
-            init.uniform_(input_tensor, a=a, b=b)
-            assert self._is_uniform(input_tensor, a, b)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
-    def test_normal(self):
-        for dims in [1, 2, 4]:
-            input_tensor = self._create_random_nd_tensor(dims, size_min=30, size_max=50)
-            mean = self._random_float(-3, 3)
-            std = self._random_float(1, 5)
-            init.normal_(input_tensor, mean=mean, std=std)
-
-            assert self._is_normal(input_tensor, mean, std)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
-    def test_trunc_normal(self):
-        for dims in [1, 2, 4]:
-            input_tensor = self._create_random_nd_tensor(dims, size_min=30, size_max=50)
-            mean = self._random_float(-3, 3)
-            std = self._random_float(.01, 1)
-            a = self._random_float(mean - 2 * std, mean)
-            b = self._random_float(mean, mean + 2 * std)
-            init.trunc_normal_(input_tensor, mean=mean, std=std, a=a, b=b)
-
-            assert self._is_trunc_normal(input_tensor, mean, std, a, b)
-
-    def test_constant(self):
-        for dims in [1, 2, 4]:
-            input_tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=5)
-            val = self._random_float(1, 10)
-            init.constant_(input_tensor, val)
-
-            self.assertEqual(input_tensor, input_tensor.clone().fill_(val))
-
-    def test_ones_and_zeros(self):
-        for init_fn_, val in zip([init.ones_, init.zeros_], [1, 0]):
-            for dims in [1, 2, 4]:
-                input_tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=5)
-                init_fn_(input_tensor)
-
-                self.assertEqual(input_tensor, input_tensor.clone().fill_(val))
-
-    def test_eye(self):
-        input_tensor = self._create_random_nd_tensor(2, size_min=1, size_max=5)
-        init.eye_(input_tensor)
-
-        # Check every single element
-        for i in range(input_tensor.size(0)):
-            for j in range(input_tensor.size(1)):
-                if i == j:
-                    assert input_tensor[i][j] == 1
-                else:
-                    assert input_tensor[i][j] == 0
-
-    def test_eye_only_works_on_2d_inputs(self):
-        for dims in [1, 3]:
-            with self.assertRaises(ValueError):
-                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=3)
-                init.eye_(tensor)
-
-    def test_dirac_properties(self):
-        for dims in [3, 4, 5]:
-            for groups in [1, 2, 3]:
-                # prepare random tensor with random sizes, but fits groups
-                a, c, d, e = (random.randint(1, 5) for _ in range(4))
-                b = random.randint(1, 5 * groups)  # same range as a*groups but all range allowed
-                # make sure first dim divides by groups
-                input_tensor = torch.randn((a * groups, b, c, d, e)[:dims])
-
-                init.dirac_(input_tensor, groups)
-
-                c_out, c_in = input_tensor.size(0) // groups, input_tensor.size(1)
-                min_d = min(c_out, c_in)
-                # Check number of nonzeros is equivalent to smallest dim (for each group)
-                assert torch.nonzero(input_tensor).size(0) == min_d * groups
-                # Check sum of values (can have precision issues, hence assertEqual) is also equivalent
-                self.assertEqual(input_tensor.sum(), min_d * groups)
-
-
-    def test_dirac_identity(self):
-        for groups in [1, 3]:
-            batch, in_c, out_c, size, kernel_size = 8, 3, 9, 5, 3  # in_c, out_c must divide by groups
-            eff_out_c = out_c // groups
-
-            # Test 1D
-            input_var = torch.randn(batch, in_c, size)
-            filter_var = torch.zeros(eff_out_c, in_c, kernel_size)
-            filter_var = torch.cat([filter_var] * groups)
-            init.dirac_(filter_var, groups)
-            output_var = F.conv1d(input_var, filter_var)
-            input_tensor, output_tensor = input_var.data, output_var.data  # Variables do not support nonzero
-            for g in range(groups):
-                # Assert in_c outputs are preserved (per each group)
-                self.assertEqual(input_tensor[:, :, 1:-1],
-                                 output_tensor[:, eff_out_c * g:eff_out_c * g + in_c, :])
-                # Assert extra outputs are 0
-                assert torch.nonzero(output_tensor[:, eff_out_c * g + in_c:eff_out_c * (g + 1), :]).numel() == 0
-
-            # Test 2D
-            input_var = torch.randn(batch, in_c, size, size)
-            filter_var = torch.zeros(eff_out_c, in_c, kernel_size, kernel_size)
-            filter_var = torch.cat([filter_var] * groups)
-            init.dirac_(filter_var, groups)
-            output_var = F.conv2d(input_var, filter_var)
-            input_tensor, output_tensor = input_var.data, output_var.data  # Variables do not support nonzero
-            for g in range(groups):
-                # Assert in_c outputs are preserved (per each group)
-                self.assertEqual(input_tensor[:, :, 1:-1, 1:-1],
-                                 output_tensor[:, eff_out_c * g:eff_out_c * g + in_c, :, :])
-                # Assert extra outputs are 0
-                assert torch.nonzero(output_tensor[:, eff_out_c * g + in_c:eff_out_c * (g + 1), :, :]).numel() == 0
-
-            # Test 3D
-            input_var = torch.randn(batch, in_c, size, size, size)
-            filter_var = torch.zeros(eff_out_c, in_c, kernel_size, kernel_size, kernel_size)
-            filter_var = torch.cat([filter_var] * groups)
-            init.dirac_(filter_var, groups)
-            output_var = F.conv3d(input_var, filter_var)
-            input_tensor, output_tensor = input_var.data, output_var.data
-            for g in range(groups):
-                # Assert in_c outputs are preserved (per each group)
-                self.assertEqual(input_tensor[:, :, 1:-1, 1:-1, 1:-1],
-                                 output_tensor[:, eff_out_c * g:eff_out_c * g + in_c, :, :, :])
-                # Assert extra outputs are 0
-                assert torch.nonzero(output_tensor[:, eff_out_c * g + in_c:eff_out_c * (g + 1), :, :, :]).numel() == 0
-
-    def test_dirac_only_works_on_3_4_5d_inputs(self):
-        for dims in [1, 2, 6]:
-            with self.assertRaises(ValueError):
-                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=3)
-                init.dirac_(tensor)
-
-    def test_xavier_uniform_errors_on_inputs_smaller_than_2d(self):
-        for dims in [0, 1]:
-            tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=1)
-            with self.assertRaises(ValueError):
-                init.xavier_uniform_(tensor)
-
-    def test_xavier_normal_errors_on_inputs_smaller_than_2d(self):
-        for dims in [0, 1]:
-            tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=1)
-            with self.assertRaises(ValueError):
-                init.xavier_normal_(tensor)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
-    def test_xavier_uniform(self):
-        for use_gain in [True, False]:
-            for dims in [2, 4]:
-                input_tensor = self._create_random_nd_tensor(dims, size_min=20, size_max=25)
-                gain = 1
-
-                if use_gain:
-                    gain = self._random_float(0.1, 2)
-                    init.xavier_uniform_(input_tensor, gain=gain)
-                else:
-                    init.xavier_uniform_(input_tensor)
-
-                fan_in = input_tensor.size(1)
-                fan_out = input_tensor.size(0)
-                if input_tensor.dim() > 2:
-                    fan_in *= input_tensor[0, 0].numel()
-                    fan_out *= input_tensor[0, 0].numel()
-
-                expected_std = gain * math.sqrt(2.0 / (fan_in + fan_out))
-                bounds = expected_std * math.sqrt(3)
-                assert self._is_uniform(input_tensor, -bounds, bounds)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
-    def test_xavier_normal(self):
-        for use_gain in [True, False]:
-            for dims in [2, 4]:
-                input_tensor = self._create_random_nd_tensor(dims, size_min=20, size_max=25)
-                gain = 1
-
-                if use_gain:
-                    gain = self._random_float(0.1, 2)
-                    init.xavier_normal_(input_tensor, gain=gain)
-                else:
-                    init.xavier_normal_(input_tensor)
-
-                fan_in = input_tensor.size(1)
-                fan_out = input_tensor.size(0)
-                if input_tensor.dim() > 2:
-                    fan_in *= input_tensor[0, 0].numel()
-                    fan_out *= input_tensor[0, 0].numel()
-
-                expected_std = gain * math.sqrt(2.0 / (fan_in + fan_out))
-                assert self._is_normal(input_tensor, 0, expected_std)
-
-    def test_kaiming_uniform_errors_on_inputs_smaller_than_2d(self):
-        for dims in [0, 1]:
-            with self.assertRaises(ValueError):
-                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=1)
-                init.kaiming_uniform_(tensor)
-
-    def test_kaiming_normal_errors_on_inputs_smaller_than_2d(self):
-        for dims in [0, 1]:
-            with self.assertRaises(ValueError):
-                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=1)
-                init.kaiming_normal_(tensor)
-
-    def test_kaiming_uniform_warning_on_0element_tensor(self):
-        tensor = torch.empty(0, 1)
-        with self.assertWarnsRegex(UserWarning, "Initializing zero-element tensors is a no-op"):
-            _ = init.kaiming_uniform_(tensor)
-
-    def test_kaiming_normal_warning_on_0element_tensor(self):
-        tensor = torch.empty(0, 1)
-        with self.assertWarnsRegex(UserWarning, "Initializing zero-element tensors is a no-op"):
-            _ = init.kaiming_normal_(tensor)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
-    def test_kaiming_uniform(self):
-        for use_a in [True, False]:
-            for dims in [2, 4]:
-                for mode in ['fan_in', 'fan_out']:
-                    input_tensor = self._create_random_nd_tensor(dims, size_min=20, size_max=25)
-                    if use_a:
-                        a = self._random_float(0.1, 2)
-                        init.kaiming_uniform_(input_tensor, a=a, mode=mode)
-                    else:
-                        a = 0
-                        init.kaiming_uniform_(input_tensor, mode=mode)
-
-                    fan_in = input_tensor.size(1)
-                    fan_out = input_tensor.size(0)
-                    if input_tensor.dim() > 2:
-                        fan_in *= input_tensor[0, 0].numel()
-                        fan_out *= input_tensor[0, 0].numel()
-
-                    if mode == 'fan_in':
-                        n = fan_in
-                    else:
-                        n = fan_out
-
-                    expected_std = math.sqrt(2.0 / ((1 + a**2) * n))
-                    bounds = expected_std * math.sqrt(3.0)
-                    assert self._is_uniform(input_tensor, -bounds, bounds)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
-    def test_kaiming_normal(self):
-        for use_a in [True, False]:
-            for dims in [2, 4]:
-                for mode in ['fan_in', 'fan_out']:
-                    input_tensor = self._create_random_nd_tensor(dims, size_min=20, size_max=25)
-                    if use_a:
-                        a = self._random_float(0.1, 2)
-                        init.kaiming_normal_(input_tensor, a=a, mode=mode)
-                    else:
-                        a = 0
-                        init.kaiming_normal_(input_tensor, mode=mode)
-
-                    fan_in = input_tensor.size(1)
-                    fan_out = input_tensor.size(0)
-                    if input_tensor.dim() > 2:
-                        fan_in *= input_tensor[0, 0].numel()
-                        fan_out *= input_tensor[0, 0].numel()
-
-                    if mode == 'fan_in':
-                        n = fan_in
-                    else:
-                        n = fan_out
-
-                    expected_std = math.sqrt(2.0 / ((1 + a**2) * n))
-                    assert self._is_normal(input_tensor, 0, expected_std)
-
-    def test_sparse_only_works_on_2d_inputs(self):
-        for dims in [1, 3]:
-            with self.assertRaises(ValueError):
-                sparsity = self._random_float(0.1, 0.9)
-                tensor = self._create_random_nd_tensor(dims, size_min=1, size_max=3)
-                init.sparse_(tensor, sparsity)
-
-    @unittest.skipIf(not TEST_SCIPY, "Scipy not found.")
-    def test_sparse_default_std(self):
-        for use_random_std in [True, False]:
-            input_tensor = self._create_random_nd_tensor(2, size_min=30, size_max=35)
-            rows, cols = input_tensor.size(0), input_tensor.size(1)
-            sparsity = self._random_float(0.1, 0.2)
-
-            std = 0.01  # default std
-            if use_random_std:
-                std = self._random_float(0.01, 0.2)
-                init.sparse_(input_tensor, sparsity=sparsity, std=std)
-            else:
-                init.sparse_(input_tensor, sparsity=sparsity)
-
-            for col_idx in range(input_tensor.size(1)):
-                column = input_tensor[:, col_idx]
-                assert column[column == 0].nelement() >= math.ceil(sparsity * rows)
-
-            assert self._is_normal(input_tensor[input_tensor != 0], 0, std)
-
-    @skipIfNoLapack
-    def test_orthogonal(self):
-        for use_gain in [True, False]:
-            for tensor_size in [[3, 4], [4, 3], [20, 2, 3, 4], [2, 3, 4, 5]]:
-                input_tensor = torch.zeros(tensor_size)
-                gain = 1.0
-
-                if use_gain:
-                    gain = self._random_float(0.1, 2)
-                    init.orthogonal_(input_tensor, gain=gain)
-                else:
-                    init.orthogonal_(input_tensor)
-
-                rows, cols = tensor_size[0], reduce(mul, tensor_size[1:])
-                flattened_tensor = input_tensor.view(rows, cols)
-                if rows > cols:
-                    self.assertEqual(torch.mm(flattened_tensor.t(), flattened_tensor),
-                                     torch.eye(cols) * gain ** 2, atol=1e-6, rtol=0)
-                else:
-                    self.assertEqual(torch.mm(flattened_tensor, flattened_tensor.t()),
-                                     torch.eye(rows) * gain ** 2, atol=1e-6, rtol=0)
-
-    def test_deprecation(self):
-        x = torch.randn(3, 3)
-
-        def fn():
-            init.normal(x)
-
-        with self.assertWarnsRegex(UserWarning, 'deprecated', msg='methods not suffixed with underscore should be deprecated'):
-            fn()
-
 class TestFusionEval(TestCase):
     @given(X=hu.tensor(shapes=((5, 3, 5, 5),)),
            running_mean=hu.tensor(shapes=(6,)),

From 84e5bee5b7b626dc220f62b304c15ffb0af3a3dd Mon Sep 17 00:00:00 2001
From: Kshiteej K <kshitijkalambarkar@gmail.com>
Date: Wed, 23 Nov 2022 08:39:45 +0000
Subject: [PATCH 1202/1922] [test_nn] split hooks test from test_nn (#89201)

Ref: https://github.com/pytorch/pytorch/issues/63085

Note: Doesn't need corresponding XLA PR as the migrated tests were not run on XLA (as they weren't in TestNNDeviceType).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89201
Approved by: https://github.com/albanD
---
 test/nn/test_module_hooks.py         | 963 ++++++++++++++++++++++++++
 test/test_nn.py                      | 979 +--------------------------
 torch/testing/_internal/common_nn.py |  21 +
 3 files changed, 989 insertions(+), 974 deletions(-)

diff --git a/test/nn/test_module_hooks.py b/test/nn/test_module_hooks.py
index 27b3fd5c224ff..889966e006c1a 100644
--- a/test/nn/test_module_hooks.py
+++ b/test/nn/test_module_hooks.py
@@ -3,13 +3,23 @@
     TestCase,
     run_tests,
     skipIfTorchDynamo,
+    IS_WINDOWS
 )
+from torch.testing._internal.common_nn import NNTestCase, _create_basic_net
 
 import torch
 import torch.nn as nn
 
 from functools import partial
 from typing import Any, Dict, List, Tuple
+import gc
+import unittest
+from copy import deepcopy
+from tempfile import NamedTemporaryFile
+import weakref
+import pickle
+from collections import OrderedDict
+import math
 
 
 class Net(nn.Module):
@@ -367,5 +377,958 @@ def test_remove_kwarg_hooks(self):
         )
 
 
+def _hook_to_pickle(*args, **kwargs):
+    pass
+
+class TestStateDictHooks(TestCase):
+
+    def test_load_state_dict_pre_hook(self):
+
+        m = nn.Linear(10, 10)
+        m_state_dict = m.state_dict()
+
+        m_load = nn.Linear(10, 10)
+
+        hook_called = 0
+
+        def hook_without_module(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+            self.assertEqual(m_state_dict, state_dict)
+            nonlocal hook_called
+            hook_called += 1
+
+        def hook_with_module(module, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+            self.assertEqual(m_state_dict, state_dict)
+            self.assertTrue(m_load is module)
+            nonlocal hook_called
+            hook_called += 1
+
+        hook_called = 0
+        m_load._register_load_state_dict_pre_hook(hook_without_module)
+        m_load.load_state_dict(m_state_dict)
+        self.assertEqual(1, hook_called)
+
+        hook_called = 0
+        m_load._register_load_state_dict_pre_hook(hook_with_module, True)
+        m_load.load_state_dict(m_state_dict)
+        self.assertEqual(2, hook_called)
+
+    def test_no_extra_ref_to_module(self):
+        try:
+            gc.disable()
+            m = nn.Linear(10, 10)
+
+            m._register_load_state_dict_pre_hook(_hook_to_pickle, True)
+            weak_m = weakref.ref(m)
+            del m
+
+            self.assertEqual(weak_m(), None)
+        finally:
+            gc.enable()
+
+    def test_pickled_hook(self):
+        m = nn.Linear(10, 10)
+        m._register_load_state_dict_pre_hook(_hook_to_pickle, True)
+        pickle.loads(pickle.dumps(m))
+
+    def test_load_state_dict_module_pre_hook(self):
+        hook_called = 0
+
+        # Test with module instance method as hook
+        class MyModule(nn.Module):
+            def __init__(self):
+                super(MyModule, self).__init__()
+                self.foo = torch.nn.Parameter(torch.rand(10))
+
+            def my_pre_load_hook(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+                assert [] == error_msgs
+                assert [] == unexpected_keys
+                assert [] == missing_keys
+                assert strict
+                nonlocal hook_called
+                hook_called += 1
+
+            def my_pre_load_hook_with_module(
+                self,
+                module,
+                state_dict,
+                prefix,
+                local_metadata,
+                strict,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            ):
+                assert [] == error_msgs
+                assert [] == unexpected_keys
+                assert [] == missing_keys
+                assert strict
+                assert self is module
+                nonlocal hook_called
+                hook_called += 1
+
+        # Test that hooks registered on a submodule are also called
+        # appropriately, i.e. with the submodule as module argument in
+        # my_pre_load_hook_with_module.
+        class MyModuleContainer(nn.Module):
+            def __init__(self, mod):
+                super().__init__()
+                self.mod = mod
+
+        for ctor in [MyModuleContainer, lambda x: x]:
+            m = ctor(MyModule())
+            state_dict = m.state_dict()
+            if isinstance(m, MyModuleContainer):
+                mod = m.mod
+            else:
+                mod = m
+
+            hook_called = 0
+            mod._register_load_state_dict_pre_hook(
+                mod.my_pre_load_hook
+            )
+            m.load_state_dict(state_dict)
+            self.assertEqual(1, hook_called)
+
+            hook_called = 0
+            mod._register_load_state_dict_pre_hook(
+                mod.my_pre_load_hook_with_module, True
+            )
+            m.load_state_dict(state_dict)
+            self.assertEqual(2, hook_called)
+
+    def test_load_state_dict_post_hook(self):
+        hook_called = 0
+
+        class MyModule(nn.Module):
+            def __init__(self):
+                super(MyModule, self).__init__()
+                self.foo = torch.nn.Parameter(torch.rand(10))
+
+            def my_post_load_hook(self, module, incompatible_keys):
+                assert module is self
+                nonlocal hook_called
+                incompatible_keys.missing_keys.append("foo")
+                incompatible_keys.unexpected_keys.append("bar")
+                hook_called += 1
+
+        nested = MyModule()
+        wrapped = nn.ModuleList([nested])
+        handle = nested.register_load_state_dict_post_hook(
+            nested.my_post_load_hook,
+        )
+        # Hook must be called even if it is wrapped
+        ret = wrapped.load_state_dict(wrapped.state_dict(), strict=False)
+        self.assertEqual(hook_called, 1)
+        # Ensure that the hook modified missing_keys and unexpected_keys
+        missing = ret.missing_keys
+        unexpected = ret.unexpected_keys
+        self.assertEqual(missing, ["foo"])
+        self.assertEqual(unexpected, ["bar"])
+        # When called with strict=True, the error raised should mention the
+        # missing and unexpected keys the hook added.
+        with self.assertRaisesRegex(RuntimeError, "foo.*\n.*bar"):
+            wrapped.load_state_dict(wrapped.state_dict(), strict=True)
+        self.assertEqual(hook_called, 2)
+        # Removing the hook via handle.remove() should cause it not to
+        # fire anymore.
+        handle.remove()
+        # Hook did not run so it should not have added any keys
+        ret = wrapped.load_state_dict(wrapped.state_dict(), strict=False)
+        self.assertEqual(ret.missing_keys, [])
+        self.assertEqual(ret.unexpected_keys, [])
+        # hook_called should not have been incremented
+        self.assertEqual(hook_called, 2)
+
+        def load_hook_clear_incompatible(module, incompatible_keys):
+            incompatible_keys.missing_keys.clear()
+            incompatible_keys.unexpected_keys.clear()
+
+        nested.register_load_state_dict_post_hook(load_hook_clear_incompatible)
+        state_dict = wrapped.state_dict()
+        state_dict["extra"] = torch.ones(1)
+        # load state_dict with strict=True should not throw.
+        ret = wrapped.load_state_dict(state_dict, strict=True)
+        # explicitly ensure that the post hook clearned out incompatible_keys
+        self.assertEqual([], ret.missing_keys)
+        self.assertEqual([], ret.unexpected_keys)
+
+    @unittest.skipIf(IS_WINDOWS, "Tempfile permission issue on windows")
+    def test_load_state_dict_post_hook_backward_compatibility(self):
+        def my_post_load_hook(mod, _):
+            nonlocal called
+            called = True
+
+        for m in [nn.Softmin(10), nn.Softmax(10), nn.LogSoftmax(10)]:
+            called = False
+            sd = deepcopy(m.state_dict())
+            self.assertTrue(hasattr(m, '_load_state_dict_post_hooks'))
+            # Simulate an older model that did not have this attr
+            delattr(m, '_load_state_dict_post_hooks')
+            # Save and load, and ensure that load_state_dict works (without proper
+            # BC we would run into errors because this attribute would be expected).
+            # In particular, Softmax runs into the issue described here:
+            # https://github.com/pytorch/pytorch/issues/77280
+            with NamedTemporaryFile() as f:
+                # Note that torch.save / torch.load is not recommended to save/load
+                # modules.
+                torch.save(m, f.name)
+                m = torch.load(f.name)
+                m.load_state_dict(sd)
+                self.assertFalse(called)
+
+            # Ensure hooks can be registered and called.
+            m.register_load_state_dict_post_hook(my_post_load_hook)
+            m.load_state_dict(sd)
+            self.assertTrue(called)
+
+
+class TestModuleGlobalHooks(TestCase):
+
+    def tearDown(self):
+        nn.modules.module._global_backward_hooks = OrderedDict()
+        nn.modules.module._global_forward_hooks = OrderedDict()
+        nn.modules.module._global_forward_pre_hooks = OrderedDict()
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_module_global_hooks(self):
+        module = nn.Sigmoid
+
+        module_1 = module()
+        module_2 = module()
+        module_3 = module()
+
+        input = torch.ones(5, 5, requires_grad=True)
+
+        counter = {
+            'forwards': 0,
+            'backwards': 0
+        }
+
+        def fw_hook(inc, h_module, input, output):
+            self.assertIsInstance(input, tuple)
+            self.assertTrue(isinstance(output, torch.Tensor))
+            self.assertTrue(isinstance(h_module, module))
+            self.assertEqual(input[0], torch.ones(5, 5))
+            self.assertEqual(output, torch.empty(5, 5).fill_(1 / (1 + 1 / math.e)))
+            counter['forwards'] += inc
+
+        def bw_hook(inc, h_module, grad_input, grad_output):
+            self.assertIsInstance(grad_input, tuple)
+            self.assertIsInstance(grad_output, tuple)
+            self.assertTrue(isinstance(h_module, module))
+            self.assertEqual(grad_output[0], torch.ones(5, 5) * 2)
+            counter['backwards'] += inc
+
+        test_fwd = nn.modules.module.register_module_forward_hook(lambda *args: fw_hook(1, *args))
+
+        module_1(input)
+        module_2(input)
+        module_3(input)
+        self.assertEqual(counter['forwards'], 3)
+        self.assertEqual(counter['backwards'], 0)
+
+        test_bwd = nn.modules.module.register_module_backward_hook(
+            lambda *args: bw_hook(1, *args))
+
+        output_1 = module_1(input)
+        output_2 = module_2(input)
+        output_3 = module_3(input)
+        self.assertEqual(counter['forwards'], 6)
+        self.assertEqual(counter['backwards'], 0)
+
+        output_1.backward(torch.ones(5, 5) * 2, retain_graph=True)
+        output_2.backward(torch.ones(5, 5) * 2, retain_graph=False)
+        output_3.backward(torch.ones(5, 5) * 2, retain_graph=False)
+        self.assertEqual(counter['forwards'], 6)
+        self.assertEqual(counter['backwards'], 3)
+
+        output_1.backward(torch.ones(5, 5) * 2, retain_graph=True)
+        self.assertEqual(counter['forwards'], 6)
+        self.assertEqual(counter['backwards'], 4)
+
+        test2_fwd = nn.modules.module.register_module_forward_hook(lambda *args: fw_hook(2, *args))
+
+        output = module_1(input)
+        output = module_2(input)
+        output = module_3(input)
+        self.assertEqual(counter['forwards'], 15)
+        self.assertEqual(counter['backwards'], 4)
+
+        test2_bwd = nn.modules.module.register_module_backward_hook(lambda *args: bw_hook(2, *args))
+
+        module_1(input).backward(torch.ones(5, 5) * 2)
+        self.assertEqual(counter['forwards'], 18)
+        self.assertEqual(counter['backwards'], 7)
+
+        test2_bwd.remove()
+
+        module_2(input).backward(torch.ones(5, 5) * 2)
+        self.assertEqual(counter['forwards'], 21)
+        self.assertEqual(counter['backwards'], 8)
+
+        test2_fwd.remove()
+
+        module_3(input).backward(torch.ones(5, 5) * 2)
+        self.assertEqual(counter['forwards'], 22)
+        self.assertEqual(counter['backwards'], 9)
+
+        test_fwd.remove()
+        test_bwd.remove()
+
+    def test_module_global_hook_invalid_outputs(self):
+        module = nn.Sigmoid()
+        input = torch.randn(5, 5, requires_grad=True)
+
+        def bw_fail1(self, grad_input, grad_output):
+            return grad_input[:-1]
+
+        def bw_fail2(self, grad_input, grad_output):
+            return grad_input + (torch.randn(2, 2),)
+
+        with nn.modules.module.register_module_backward_hook(bw_fail1):
+            with self.assertRaisesRegex(RuntimeError, 'got 0, but expected 1'):
+                module(input).sum().backward()
+
+        with nn.modules.module.register_module_backward_hook(bw_fail2):
+            with self.assertRaisesRegex(RuntimeError, 'got 2, but expected 1'):
+                module(input).sum().backward()
+
+    @skipIfTorchDynamo("https://github.com/pytorch/torchdynamo/issues/847")
+    def test_module_backward_global_hook_writeable(self):
+        module = nn.Sigmoid()
+        input = torch.randn(5, 5, requires_grad=True)
+        sig_x = torch.sigmoid(input)
+
+        def bw_hook(module, grad_input, grad_output):
+            for grad in grad_input:
+                self.assertTrue(isinstance(grad, torch.Tensor))
+            for grad in grad_output:
+                self.assertTrue(isinstance(grad, torch.Tensor))
+            return tuple(gi * 2 for gi in grad_input)
+
+        nn.modules.module.register_module_backward_hook(bw_hook)
+        module(input).backward(torch.ones(5, 5))
+        expected_grad = sig_x * (1 - sig_x) * 2
+        self.assertEqual(input.grad, expected_grad)
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_module_global_forward_preforward_hook_writeable(self):
+        module = nn.Sigmoid()
+        input = torch.randn(5, 5, requires_grad=True)
+        sig_x = torch.sigmoid(input)
+
+        def forward_pre_hook(m, input):
+            return torch.nn.functional.relu(input[0])
+
+        def forward_hook(m, input, output):
+            return -output
+
+        nn.modules.module.register_module_forward_pre_hook(forward_pre_hook)
+        nn.modules.module.register_module_forward_hook(forward_hook)
+        output = module(input)
+        expected_res = -torch.sigmoid(torch.nn.functional.relu(input))
+        self.assertEqual(output, expected_res)
+        output.backward(torch.ones(5, 5) * 2, retain_graph=True)
+        mask = (input > 0)
+        expected_grad = -sig_x * (1 - sig_x) * 2 * mask
+        self.assertEqual(input.grad, expected_grad)
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_module_forward_preforward_hook_removable(self):
+        """
+        This test is to test when multiple pre-forward hook functions can be
+        registered successfully and used correctly, if the handle can be removable
+        during the pre-forward hook function call.
+        """
+        module = nn.Sigmoid()
+
+        def removable_hook(m, input):
+            nonlocal handle
+            handle.remove()
+            return input
+
+        def removable_hook_2(m, input):
+            nonlocal handle_2
+            handle_2.remove()
+            return input
+
+        handle = module.register_forward_pre_hook(removable_hook)
+        handle_2 = module.register_forward_pre_hook(removable_hook_2)
+
+        # make sure hook register is successful
+        self.assertEqual(len(handle.hooks_dict_ref()), 2)
+        self.assertEqual(len(handle_2.hooks_dict_ref()), 2)
+
+        input = torch.randn(2, 2)
+        output = module(input)
+        self.assertEqual(torch.sigmoid(input), output)
+
+        # make sure hook removal is successful
+        self.assertFalse(handle.id in handle.hooks_dict_ref())
+        self.assertFalse(handle_2.id in handle.hooks_dict_ref())
+        self.assertEqual(len(handle.hooks_dict_ref()), 0)
+        self.assertEqual(len(handle_2.hooks_dict_ref()), 0)
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_module_forward_forward_hook_removable(self):
+        """
+        This test is to test when multiple forward hook functions can be registered
+        successfully and used correctly, if the handle can be removable during the
+        forward hook function call.
+        """
+        module = nn.Sigmoid()
+
+        def removable_hook(m, input, output):
+            nonlocal handle
+            handle.remove()
+            return output
+
+        def removable_hook_2(m, input, output):
+            nonlocal handle_2
+            handle_2.remove()
+            return output
+
+        handle = module.register_forward_hook(removable_hook)
+        handle_2 = module.register_forward_hook(removable_hook_2)
+
+        # make sure hook register is successful
+        self.assertEqual(len(handle.hooks_dict_ref()), 2)
+        self.assertEqual(len(handle_2.hooks_dict_ref()), 2)
+
+        input = torch.randn(2, 2)
+        output = module(input)
+        self.assertEqual(torch.sigmoid(input), output)
+
+        # make sure hook removal is successful
+        self.assertFalse(handle.id in handle.hooks_dict_ref())
+        self.assertFalse(handle_2.id in handle.hooks_dict_ref())
+        self.assertEqual(len(handle.hooks_dict_ref()), 0)
+        self.assertEqual(len(handle_2.hooks_dict_ref()), 0)
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_global_and_local_hooks_order(self):
+        module = nn.Sigmoid()
+
+        global_forward_pre_called = False
+        local_forward_pre_called = False
+        global_forward_called = False
+        local_forward_called = False
+        global_backward_called = False
+        local_backward_called = False
+
+        def global_forward_pre_hook(m, input):
+            nonlocal global_forward_pre_called
+            self.assertTrue(not local_forward_pre_called)
+            global_forward_pre_called = True
+            return input
+
+        def local_forward_pre_hook(m, input):
+            nonlocal local_forward_pre_called
+            self.assertTrue(global_forward_pre_called)
+            local_forward_pre_called = True
+            return input
+
+        def global_forward_hook(m, input, output):
+            nonlocal global_forward_called
+            self.assertTrue(not local_forward_called)
+            global_forward_called = True
+            return output
+
+        def local_forward_hook(m, input, output):
+            nonlocal local_forward_called
+            self.assertTrue(global_forward_called)
+            local_forward_called = True
+            return output
+
+        def global_backward_hook(m, input, output):
+            nonlocal global_backward_called
+            self.assertTrue(not local_backward_called)
+            global_backward_called = True
+            return input
+
+        def local_backward_hook(m, input, output):
+            nonlocal local_backward_called
+            self.assertTrue(global_backward_called)
+            local_backward_called = True
+            return input
+
+        input = torch.randn(5, 5, requires_grad=True)
+        nn.modules.module.register_module_forward_pre_hook(global_forward_pre_hook)
+        module.register_forward_pre_hook(local_forward_pre_hook)
+        nn.modules.module.register_module_forward_hook(global_forward_hook)
+        module.register_forward_hook(local_forward_hook)
+        nn.modules.module.register_module_backward_hook(global_backward_hook)
+        module.register_backward_hook(local_backward_hook)
+
+        output = module(input)
+        self.assertTrue(local_forward_called and local_forward_pre_called and global_forward_called and global_forward_pre_called)
+
+        output.backward(torch.ones(5, 5), retain_graph=True)
+        self.assertTrue(local_backward_called and global_backward_called)
+
+
+class TestModuleHookNN(NNTestCase):
+    _do_cuda_memory_leak_check = True
+    _do_cuda_non_default_stream = True
+
+    def _test_hooks(self, backward_register_fn):
+        module = nn.Sigmoid()
+        input = torch.ones(5, 5, requires_grad=True)
+
+        counter = {
+            'forwards': 0,
+            'backwards': 0
+        }
+
+        def fw_hook(inc, h_module, input, output):
+            self.assertIsInstance(input, tuple)
+            self.assertTrue(isinstance(output, torch.Tensor))
+            self.assertTrue(h_module is module)
+            self.assertEqual(input[0], torch.ones(5, 5))
+            self.assertEqual(output, torch.empty(5, 5).fill_(1 / (1 + 1 / math.e)))
+            counter['forwards'] += inc
+
+        def bw_hook(inc, h_module, grad_input, grad_output):
+            self.assertIsInstance(grad_input, tuple)
+            self.assertIsInstance(grad_output, tuple)
+            self.assertTrue(h_module is module)
+            self.assertEqual(grad_output[0], torch.ones(5, 5) * 2)
+            counter['backwards'] += inc
+
+        # backward_pre_hook expects callback with only `module` and `grad_output`
+        # as arguments.
+        def bw_pre_hook(inc, h_module, grad_output):
+            self.assertIsInstance(grad_output, tuple)
+            self.assertTrue(h_module is module)
+            self.assertEqual(grad_output[0], torch.ones(5, 5) * 2)
+            counter['backwards'] += inc
+
+        test_fwd = module.register_forward_hook(lambda *args: fw_hook(1, *args))
+
+        module(input)
+        module(input)
+        self.assertEqual(counter['forwards'], 2)
+        self.assertEqual(counter['backwards'], 0)
+
+        bw_hook_fn = bw_pre_hook if backward_register_fn == 'register_full_backward_pre_hook' else bw_hook
+        test_bwd = getattr(module, backward_register_fn)(
+            lambda *args: bw_hook_fn(1, *args))
+
+        output = module(input)
+        self.assertEqual(counter['forwards'], 3)
+        self.assertEqual(counter['backwards'], 0)
+
+        output.backward(torch.ones(5, 5) * 2, retain_graph=True)
+        self.assertEqual(counter['forwards'], 3)
+        self.assertEqual(counter['backwards'], 1)
+
+        output.backward(torch.ones(5, 5) * 2, retain_graph=True)
+        self.assertEqual(counter['forwards'], 3)
+        self.assertEqual(counter['backwards'], 2)
+
+        test2_fwd = module.register_forward_hook(lambda *args: fw_hook(2, *args))
+
+        output = module(input)
+        self.assertEqual(counter['forwards'], 6)
+        self.assertEqual(counter['backwards'], 2)
+
+        test2_bwd = getattr(module, backward_register_fn)(lambda *args: bw_hook_fn(2, *args))
+
+        module(input).backward(torch.ones(5, 5) * 2)
+        self.assertEqual(counter['forwards'], 9)
+        self.assertEqual(counter['backwards'], 5)
+
+        test2_bwd.remove()
+
+        module(input).backward(torch.ones(5, 5) * 2)
+        self.assertEqual(counter['forwards'], 12)
+        self.assertEqual(counter['backwards'], 6)
+
+        test2_fwd.remove()
+
+        module(input).backward(torch.ones(5, 5) * 2)
+        self.assertEqual(counter['forwards'], 13)
+        self.assertEqual(counter['backwards'], 7)
+
+        test_fwd.remove()
+        test_bwd.remove()
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_hooks(self):
+        self._test_hooks("register_backward_hook")
+        self._test_hooks("register_full_backward_hook")
+        self._test_hooks("register_full_backward_pre_hook")
+
+    def test_hook_cpp(self):
+        bn = nn.BatchNorm1d(5)
+
+        def hook(module, grad_inputs, grad_outputs):
+            self.assertEqual(len(grad_inputs), 1)
+            self.assertEqual(len(grad_outputs), 1)
+            self.assertEqual(module, bn)
+
+        bn.register_full_backward_hook(hook)
+        output = bn(torch.randn(5, 5, requires_grad=True))
+        output.sum().backward()
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_backward_hooks_interaction(self):
+        # Test to make sure that the grad_outputs
+        # updated by full_backward_pre_hook are received by
+        # the full_backward_hook
+        module = torch.nn.Sigmoid()
+
+        cnt = {'backward_cnt': 0}
+
+        def bw_pre_hook(m, grad_output):
+            cnt['backward_cnt'] += 1
+            return (grad_output[0] * 0.5, )
+
+        def bw_hook(m, grad_in, grad_output):
+            self.assertEqual(torch.full_like(grad_output[0], 0.5), grad_output[0])
+            cnt['backward_cnt'] += 1
+            return grad_output
+
+        module.register_full_backward_pre_hook(bw_pre_hook)
+        module.register_full_backward_hook(bw_hook)
+
+        t = torch.ones(1, 2, requires_grad=True)
+        module(t).sum().backward()
+        self.assertEqual(cnt['backward_cnt'], 2)
+
+    def test_hook_invalid_outputs(self):
+        module = nn.Sigmoid()
+        input = torch.randn(5, 5, requires_grad=True)
+
+        def bw_fail1(self, grad_input, grad_output):
+            return grad_input[:-1]
+
+        def bw_fail2(self, grad_input, grad_output):
+            return grad_input + (torch.randn(2, 2),)
+
+        with module.register_backward_hook(bw_fail1):
+            with self.assertRaisesRegex(RuntimeError, 'got 0, but expected 1'):
+                module(input).sum().backward()
+
+        with module.register_backward_hook(bw_fail2):
+            with self.assertRaisesRegex(RuntimeError, 'got 2, but expected 1'):
+                module(input).sum().backward()
+
+        def bw_pre_fail1(self, grad_output):
+            return ()
+
+        def bw_pre_fail2(self, grad_output):
+            return grad_output + (torch.randn(2, 2),)
+
+        with module.register_full_backward_pre_hook(bw_pre_fail1):
+            with self.assertRaisesRegex(RuntimeError, 'got 0, but expected 1'):
+                module(input).sum().backward()
+
+        with module.register_full_backward_pre_hook(bw_pre_fail2):
+            with self.assertRaisesRegex(RuntimeError, 'got 2, but expected 1'):
+                module(input).sum().backward()
+
+    def test_hook_requires_grad(self):
+        test_self = self
+
+        class MyModule(nn.Module):
+            def forward(self, arg1, arg2, arg3):
+                test_self.assertTrue(arg1.requires_grad)
+                test_self.assertFalse(arg2.requires_grad)
+                test_self.assertTrue(arg3.requires_grad)
+                return arg1.sum() + arg2.sum() + arg3.sum()
+
+        inp = torch.rand(2, requires_grad=True)
+        mod = MyModule()
+
+        mod(inp, inp.detach(), inp)
+        # Ensure that requires grad is properly propagated
+        mod.register_full_backward_hook(lambda mod, gI, gO: None)
+        mod(inp, inp.detach(), inp)
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_hook_no_requires_grad(self):
+        mod = nn.Linear(2, 3)
+
+        inp = torch.rand(1, 2)
+
+        return_val = "None"
+        hook_called = [0]
+
+        def hook(mod, grad_input, grad_output):
+            hook_called[0] += 1
+            for gI in grad_input:
+                self.assertIsNone(gI)
+            for gO in grad_output:
+                self.assertEqual(gO.size(), (1, 3))
+
+            if return_val == "grad_input":
+                return grad_input
+            elif return_val == "invalid":
+                # If the inputs were requiring gradients, this would be
+                # a valid return
+                return inp
+            elif return_val == "None":
+                return None
+            else:
+                raise RuntimeError("Invalid return_val string")
+
+        mod.register_full_backward_hook(hook)
+
+        # This should run and trigger the hook properly
+        mod(inp).sum().backward()
+        self.assertEqual(hook_called[0], 1)
+
+        return_val = "grad_input"
+
+        mod(inp).sum().backward()
+        self.assertEqual(hook_called[0], 2)
+
+        return_val = "invalid"
+        with self.assertRaisesRegex(RuntimeError, "where no input requires gradient"):
+            mod(inp).sum().backward()
+
+    def test_hook_last_arg_requires_grad(self):
+        mod = nn.L1Loss()
+        inp = torch.rand(1, requires_grad=True)
+        mod.register_full_backward_hook(lambda m, gI, gO: None)
+
+        try:
+            mod(inp.detach(), inp)
+        except Exception as ex:
+            self.fail("Unexpected exception: %s" % ex)
+
+    def test_hook_extra_input(self):
+        class MyModule(nn.Module):
+            def forward(self, non_tensor, tensor):
+                return tensor.clone(), non_tensor
+
+        inp = torch.rand(2, requires_grad=True)
+        mod = MyModule()
+
+        def hook(mod, grad_input, grad_output):
+            self.assertIsNone(grad_input[0])
+            self.assertIsInstance(grad_input[1], torch.Tensor)
+
+            self.assertIsInstance(grad_output[0], torch.Tensor)
+            self.assertIsNone(grad_output[1])
+
+        mod.register_full_backward_hook(hook)
+        out, _ = mod(True, inp)
+        out.sum().backward()
+
+    def test_hook_inplace(self):
+        class MyModule(nn.Module):
+            def forward(self, inp, do_inplace):
+                self.inp = inp
+                if do_inplace:
+                    inp += 1
+                return inp.clone()
+
+        hook_called = [0]
+
+        def hook(mod, grad_input, grad_output):
+            hook_called[0] += 1
+
+        def hook_pre(mod, grad_output):
+            hook_called[0] += 1
+
+        inp = torch.rand(10, requires_grad=True)
+        mod = MyModule()
+        for hook_fn, register_fn in [(hook, mod.register_full_backward_hook),
+                                     (hook_pre, mod.register_full_backward_pre_hook)]:
+            hook_called[0] = 0
+            with register_fn(hook_fn):
+                # No inplace should work
+                mod(inp, False).sum().backward()
+                self.assertEqual(hook_called[0], 1)
+
+                # Input inplace error should throw an error
+                with self.assertRaisesRegex(RuntimeError, "Output 0 of BackwardHookFunctionBackward is "
+                                            "a view and is being modified inplace."):
+                    mod(inp.clone(), True)
+
+                # Input inplace error should throw an error if we try to re-use the view after they have
+                # been modified
+                local_inp = inp.clone()
+                out = mod(local_inp, False)
+                local_inp[0] *= 1
+                with self.assertRaisesRegex(RuntimeError, "Output 0 of BackwardHookFunctionBackward is "
+                                            "a view and its base or another view"):
+                    # Any operation involving the view will fail here
+                    mod.inp + 2
+
+                # Output inplace error should throw an error
+                out = mod(inp, False)
+                with self.assertRaisesRegex(RuntimeError, "BackwardHookFunctionBackward is a view "
+                                            "and is being modified inplace."):
+                    out += 1
+
+    def test_hook_non_full_warning(self):
+        def noop(*args):
+            pass
+
+        a = torch.rand(2, requires_grad=True)
+        b = torch.rand(2, requires_grad=True)
+
+        # Check invalid input container
+        class MyModule(nn.Module):
+            def forward(self, l):
+                return l[0].clone(), l[1].clone()
+
+        m = MyModule()
+        m.register_backward_hook(noop)
+
+        with self.assertWarnsRegex(UserWarning, "does not take as input a single Tensor or a tuple of Tensors"):
+            m([a, b])
+
+        # Check invalid output container
+        class MyModule(nn.Module):
+            def forward(self, a, b):
+                return [a.clone(), b.clone()]
+
+        m = MyModule()
+        m.register_backward_hook(noop)
+
+        with self.assertWarnsRegex(UserWarning, "does not return a single Tensor or a tuple of Tensors"):
+            m(a, b)
+
+        # Check invalid output from different Nodes
+        class MyModule(nn.Module):
+            def forward(self, a, b):
+                return a.clone(), b.clone()
+
+        m = MyModule()
+        m.register_backward_hook(noop)
+
+        with self.assertWarnsRegex(UserWarning, "outputs are generated by different autograd Nodes"):
+            m(a, b)
+
+        # Check invalid forward with multiple Nodes
+        class MyModule(nn.Module):
+            def forward(self, a):
+                return a.clone().clone()
+
+        m = MyModule()
+        m.register_backward_hook(noop)
+
+        with self.assertWarnsRegex(UserWarning, "the forward contains multiple autograd Nodes"):
+            m(a)
+
+    def test_hook_backward_size(self):
+        # Make module with multiple operations in forward
+        # And different size for input and outputs
+        class MyModule(nn.Module):
+            def forward(self, arg1, arg2):
+                tmp = arg1.sum() * arg2
+                tmp = tmp + arg2.sum() * arg1.sum()
+                tmp = tmp.sum().view(1)
+                tmp = tmp.expand(8).contiguous()
+                return tmp
+
+        module = MyModule()
+        inp1 = torch.randn(5, 5, requires_grad=True)
+        inp2 = torch.randn(10, 10, requires_grad=True)
+
+        def bw_hook(module, grad_input, grad_output):
+            self.assertEqual(len(grad_input), 2)
+            self.assertEqual(grad_input[0].size(), torch.Size([5, 5]))
+            self.assertEqual(grad_input[1].size(), torch.Size([10, 10]))
+            self.assertEqual(len(grad_output), 1)
+            self.assertEqual(grad_output[0].size(), torch.Size([8]))
+
+        with module.register_full_backward_hook(bw_hook):
+            module(inp1, inp2).sum().backward()
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_hook_backward_writeable(self):
+        module = nn.Sigmoid()
+        input = torch.randn(5, 5, requires_grad=True)
+        sig_x = torch.nn.functional.sigmoid(input)
+
+        def bw_hook(module, grad_input, grad_output):
+            for grad in grad_input:
+                self.assertTrue(isinstance(grad, torch.Tensor))
+            for grad in grad_output:
+                self.assertTrue(isinstance(grad, torch.Tensor))
+            return tuple(gi * 2 for gi in grad_input)
+
+        module.register_backward_hook(bw_hook)
+        module(input).backward(torch.ones(5, 5))
+        expected_grad = sig_x * (1 - sig_x) * 2
+        self.assertEqual(input.grad, expected_grad)
+
+    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
+    def test_hook_forward_preforward_writable(self):
+        module = nn.Sigmoid()
+        input = torch.randn(5, 5, requires_grad=True)
+        sig_x = torch.nn.functional.sigmoid(input)
+
+        def forward_pre_hook(m, input):
+            return torch.nn.functional.relu(input[0])
+
+        def forward_hook(m, input, output):
+            return -output
+
+        module.register_forward_pre_hook(forward_pre_hook)
+        module.register_forward_hook(forward_hook)
+        output = module(input)
+        expected_res = -torch.nn.functional.sigmoid(torch.nn.functional.relu(input))
+        self.assertEqual(output, expected_res)
+        output.backward(torch.ones(5, 5) * 2, retain_graph=True)
+        mask = (input > 0)
+        expected_grad = -sig_x * (1 - sig_x) * 2 * mask
+        self.assertEqual(input.grad, expected_grad)
+
+    def test_hook_buffer_registration(self):
+        for return_buffer in (True, False):
+            def buffer_registration_hook(module, name, buffer):
+                buffer.registered = True
+                if return_buffer:
+                    return buffer
+            handle = torch.nn.modules.module.register_module_buffer_registration_hook(
+                buffer_registration_hook
+            )
+            try:
+                l, n, s = _create_basic_net()
+                for b in s.buffers():
+                    self.assertTrue(getattr(b, "registered", False))
+            finally:
+                handle.remove()
+
+    def test_hook_submodule_registration(self):
+        for return_submodule in (True, False):
+            def module_registration_hook(module, name, submodule):
+                module.registered = True
+                submodule.registered = True
+                if return_submodule:
+                    return submodule
+            handle = torch.nn.modules.module.register_module_module_registration_hook(
+                module_registration_hook
+            )
+            try:
+                l, n, s = _create_basic_net()
+                for m in s.modules():
+                    self.assertTrue(getattr(m, "registered", False))
+            finally:
+                handle.remove()
+
+    def test_hook_parameter_registration(self):
+        for return_parameter in (True, False):
+            def parameter_registration_hook(module, name, parameter):
+                parameter.registered = True
+                if return_parameter:
+                    return parameter
+            handle = torch.nn.modules.module.register_module_parameter_registration_hook(
+                parameter_registration_hook
+            )
+            try:
+                l, n, s = _create_basic_net()
+                for p in s.parameters():
+                    self.assertTrue(getattr(p, "registered", False))
+            finally:
+                handle.remove()
+
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/test_nn.py b/test/test_nn.py
index 71cdb219040e9..dc9ce0707dc1f 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -13,9 +13,6 @@
 from itertools import product
 from functools import partial
 from collections import OrderedDict
-from tempfile import NamedTemporaryFile
-import weakref
-import gc
 
 import torch
 
@@ -43,11 +40,11 @@
     TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \
     download_file, get_function_arglist, load_tests, skipIfMps,\
     TemporaryFileName, TEST_WITH_UBSAN, IS_PPC, \
-    parametrize as parametrize_test, subtest, instantiate_parametrized_tests, IS_WINDOWS, \
+    parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
     skipIfTorchDynamo
 from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION
 from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
-    module_tests, criterion_tests, loss_reference_fns, \
+    module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
     ctcloss_reference, new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
     dtypesIfCUDA, precisionOverride, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
@@ -142,26 +139,6 @@ def _get_parameters(self, module):
             d_params.append(p.grad)
         return params, d_params
 
-    def _create_basic_net(self):
-        class Layer(nn.Module):
-            def __init__(self):
-                super(Layer, self).__init__()
-                self.layer_dummy_param = Parameter(torch.empty(3, 5))
-                self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7))
-
-        class Net(nn.Module):
-            def __init__(self):
-                super(Net, self).__init__()
-                self.l1 = Layer()
-                self.dummy_param = Parameter(torch.empty(3, 5))
-                self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1))
-
-        l = Layer()
-        n = Net()
-        s = nn.Sequential(n, n)
-
-        return l, n, s
-
     def test_parse_to(self):
         # Test for buggy use of THPMemoryFormat_New
         self.assertEqual(
@@ -170,7 +147,7 @@ def test_parse_to(self):
         )
 
     def test_requires_grad_(self):
-        m = self._create_basic_net()[-1]
+        m = _create_basic_net()[-1]
         assert len(list(m.buffers())) > 0, 'invalid test'
         assert all(not b.requires_grad for b in m.buffers()) > 0, 'invalid test'
         assert len(list(m.parameters())) > 0, 'invalid test'
@@ -214,464 +191,6 @@ def forward(self, inp):
         for b in net.buffers():
             self.assertTrue(b.storage().is_shared())
 
-    def _test_hooks(self, backward_register_fn):
-        module = nn.Sigmoid()
-        input = torch.ones(5, 5, requires_grad=True)
-
-        counter = {
-            'forwards': 0,
-            'backwards': 0
-        }
-
-        def fw_hook(inc, h_module, input, output):
-            self.assertIsInstance(input, tuple)
-            self.assertTrue(isinstance(output, torch.Tensor))
-            self.assertTrue(h_module is module)
-            self.assertEqual(input[0], torch.ones(5, 5))
-            self.assertEqual(output, torch.empty(5, 5).fill_(1 / (1 + 1 / math.e)))
-            counter['forwards'] += inc
-
-        def bw_hook(inc, h_module, grad_input, grad_output):
-            self.assertIsInstance(grad_input, tuple)
-            self.assertIsInstance(grad_output, tuple)
-            self.assertTrue(h_module is module)
-            self.assertEqual(grad_output[0], torch.ones(5, 5) * 2)
-            counter['backwards'] += inc
-
-        # backward_pre_hook expects callback with only `module` and `grad_output`
-        # as arguments.
-        def bw_pre_hook(inc, h_module, grad_output):
-            self.assertIsInstance(grad_output, tuple)
-            self.assertTrue(h_module is module)
-            self.assertEqual(grad_output[0], torch.ones(5, 5) * 2)
-            counter['backwards'] += inc
-
-        test_fwd = module.register_forward_hook(lambda *args: fw_hook(1, *args))
-
-        module(input)
-        module(input)
-        self.assertEqual(counter['forwards'], 2)
-        self.assertEqual(counter['backwards'], 0)
-
-        bw_hook_fn = bw_pre_hook if backward_register_fn == 'register_full_backward_pre_hook' else bw_hook
-        test_bwd = getattr(module, backward_register_fn)(
-            lambda *args: bw_hook_fn(1, *args))
-
-        output = module(input)
-        self.assertEqual(counter['forwards'], 3)
-        self.assertEqual(counter['backwards'], 0)
-
-        output.backward(torch.ones(5, 5) * 2, retain_graph=True)
-        self.assertEqual(counter['forwards'], 3)
-        self.assertEqual(counter['backwards'], 1)
-
-        output.backward(torch.ones(5, 5) * 2, retain_graph=True)
-        self.assertEqual(counter['forwards'], 3)
-        self.assertEqual(counter['backwards'], 2)
-
-        test2_fwd = module.register_forward_hook(lambda *args: fw_hook(2, *args))
-
-        output = module(input)
-        self.assertEqual(counter['forwards'], 6)
-        self.assertEqual(counter['backwards'], 2)
-
-        test2_bwd = getattr(module, backward_register_fn)(lambda *args: bw_hook_fn(2, *args))
-
-        module(input).backward(torch.ones(5, 5) * 2)
-        self.assertEqual(counter['forwards'], 9)
-        self.assertEqual(counter['backwards'], 5)
-
-        test2_bwd.remove()
-
-        module(input).backward(torch.ones(5, 5) * 2)
-        self.assertEqual(counter['forwards'], 12)
-        self.assertEqual(counter['backwards'], 6)
-
-        test2_fwd.remove()
-
-        module(input).backward(torch.ones(5, 5) * 2)
-        self.assertEqual(counter['forwards'], 13)
-        self.assertEqual(counter['backwards'], 7)
-
-        test_fwd.remove()
-        test_bwd.remove()
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_hooks(self):
-        self._test_hooks("register_backward_hook")
-        self._test_hooks("register_full_backward_hook")
-        self._test_hooks("register_full_backward_pre_hook")
-
-    def test_hook_cpp(self):
-        bn = nn.BatchNorm1d(5)
-
-        def hook(module, grad_inputs, grad_outputs):
-            self.assertEqual(len(grad_inputs), 1)
-            self.assertEqual(len(grad_outputs), 1)
-            self.assertEqual(module, bn)
-
-        bn.register_full_backward_hook(hook)
-        output = bn(torch.randn(5, 5, requires_grad=True))
-        output.sum().backward()
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_backward_hooks_interaction(self):
-        # Test to make sure that the grad_outputs
-        # updated by full_backward_pre_hook are received by
-        # the full_backward_hook
-        module = torch.nn.Sigmoid()
-
-        cnt = {'backward_cnt': 0}
-
-        def bw_pre_hook(m, grad_output):
-            cnt['backward_cnt'] += 1
-            return (grad_output[0] * 0.5, )
-
-        def bw_hook(m, grad_in, grad_output):
-            self.assertEqual(torch.full_like(grad_output[0], 0.5), grad_output[0])
-            cnt['backward_cnt'] += 1
-            return grad_output
-
-        module.register_full_backward_pre_hook(bw_pre_hook)
-        module.register_full_backward_hook(bw_hook)
-
-        t = torch.ones(1, 2, requires_grad=True)
-        module(t).sum().backward()
-        self.assertEqual(cnt['backward_cnt'], 2)
-
-    def test_hook_invalid_outputs(self):
-        module = nn.Sigmoid()
-        input = torch.randn(5, 5, requires_grad=True)
-
-        def bw_fail1(self, grad_input, grad_output):
-            return grad_input[:-1]
-
-        def bw_fail2(self, grad_input, grad_output):
-            return grad_input + (torch.randn(2, 2),)
-
-        with module.register_backward_hook(bw_fail1):
-            with self.assertRaisesRegex(RuntimeError, 'got 0, but expected 1'):
-                module(input).sum().backward()
-
-        with module.register_backward_hook(bw_fail2):
-            with self.assertRaisesRegex(RuntimeError, 'got 2, but expected 1'):
-                module(input).sum().backward()
-
-        def bw_pre_fail1(self, grad_output):
-            return ()
-
-        def bw_pre_fail2(self, grad_output):
-            return grad_output + (torch.randn(2, 2),)
-
-        with module.register_full_backward_pre_hook(bw_pre_fail1):
-            with self.assertRaisesRegex(RuntimeError, 'got 0, but expected 1'):
-                module(input).sum().backward()
-
-        with module.register_full_backward_pre_hook(bw_pre_fail2):
-            with self.assertRaisesRegex(RuntimeError, 'got 2, but expected 1'):
-                module(input).sum().backward()
-
-    def test_hook_requires_grad(self):
-        test_self = self
-
-        class MyModule(nn.Module):
-            def forward(self, arg1, arg2, arg3):
-                test_self.assertTrue(arg1.requires_grad)
-                test_self.assertFalse(arg2.requires_grad)
-                test_self.assertTrue(arg3.requires_grad)
-                return arg1.sum() + arg2.sum() + arg3.sum()
-
-        inp = torch.rand(2, requires_grad=True)
-        mod = MyModule()
-
-        mod(inp, inp.detach(), inp)
-        # Ensure that requires grad is properly propagated
-        mod.register_full_backward_hook(lambda mod, gI, gO: None)
-        mod(inp, inp.detach(), inp)
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_hook_no_requires_grad(self):
-        mod = nn.Linear(2, 3)
-
-        inp = torch.rand(1, 2)
-
-        return_val = "None"
-        hook_called = [0]
-
-        def hook(mod, grad_input, grad_output):
-            hook_called[0] += 1
-            for gI in grad_input:
-                self.assertIsNone(gI)
-            for gO in grad_output:
-                self.assertEqual(gO.size(), (1, 3))
-
-            if return_val == "grad_input":
-                return grad_input
-            elif return_val == "invalid":
-                # If the inputs were requiring gradients, this would be
-                # a valid return
-                return inp
-            elif return_val == "None":
-                return None
-            else:
-                raise RuntimeError("Invalid return_val string")
-
-        mod.register_full_backward_hook(hook)
-
-        # This should run and trigger the hook properly
-        mod(inp).sum().backward()
-        self.assertEqual(hook_called[0], 1)
-
-        return_val = "grad_input"
-
-        mod(inp).sum().backward()
-        self.assertEqual(hook_called[0], 2)
-
-        return_val = "invalid"
-        with self.assertRaisesRegex(RuntimeError, "where no input requires gradient"):
-            mod(inp).sum().backward()
-
-    def test_hook_last_arg_requires_grad(self):
-        mod = nn.L1Loss()
-        inp = torch.rand(1, requires_grad=True)
-        mod.register_full_backward_hook(lambda m, gI, gO: None)
-
-        try:
-            mod(inp.detach(), inp)
-        except Exception as ex:
-            self.fail("Unexpected exception: %s" % ex)
-
-    def test_hook_extra_input(self):
-        class MyModule(nn.Module):
-            def forward(self, non_tensor, tensor):
-                return tensor.clone(), non_tensor
-
-        inp = torch.rand(2, requires_grad=True)
-        mod = MyModule()
-
-        def hook(mod, grad_input, grad_output):
-            self.assertIsNone(grad_input[0])
-            self.assertIsInstance(grad_input[1], torch.Tensor)
-
-            self.assertIsInstance(grad_output[0], torch.Tensor)
-            self.assertIsNone(grad_output[1])
-
-        mod.register_full_backward_hook(hook)
-        out, _ = mod(True, inp)
-        out.sum().backward()
-
-    def test_hook_inplace(self):
-        class MyModule(nn.Module):
-            def forward(self, inp, do_inplace):
-                self.inp = inp
-                if do_inplace:
-                    inp += 1
-                return inp.clone()
-
-        hook_called = [0]
-
-        def hook(mod, grad_input, grad_output):
-            hook_called[0] += 1
-
-        def hook_pre(mod, grad_output):
-            hook_called[0] += 1
-
-        inp = torch.rand(10, requires_grad=True)
-        mod = MyModule()
-        for hook_fn, register_fn in [(hook, mod.register_full_backward_hook),
-                                     (hook_pre, mod.register_full_backward_pre_hook)]:
-            hook_called[0] = 0
-            with register_fn(hook_fn):
-                # No inplace should work
-                mod(inp, False).sum().backward()
-                self.assertEqual(hook_called[0], 1)
-
-                # Input inplace error should throw an error
-                with self.assertRaisesRegex(RuntimeError, "Output 0 of BackwardHookFunctionBackward is "
-                                            "a view and is being modified inplace."):
-                    mod(inp.clone(), True)
-
-                # Input inplace error should throw an error if we try to re-use the view after they have
-                # been modified
-                local_inp = inp.clone()
-                out = mod(local_inp, False)
-                local_inp[0] *= 1
-                with self.assertRaisesRegex(RuntimeError, "Output 0 of BackwardHookFunctionBackward is "
-                                            "a view and its base or another view"):
-                    # Any operation involving the view will fail here
-                    mod.inp + 2
-
-                # Output inplace error should throw an error
-                out = mod(inp, False)
-                with self.assertRaisesRegex(RuntimeError, "BackwardHookFunctionBackward is a view "
-                                            "and is being modified inplace."):
-                    out += 1
-
-    def test_hook_non_full_warning(self):
-        def noop(*args):
-            pass
-
-        a = torch.rand(2, requires_grad=True)
-        b = torch.rand(2, requires_grad=True)
-
-        # Check invalid input container
-        class MyModule(nn.Module):
-            def forward(self, l):
-                return l[0].clone(), l[1].clone()
-
-        m = MyModule()
-        m.register_backward_hook(noop)
-
-        with self.assertWarnsRegex(UserWarning, "does not take as input a single Tensor or a tuple of Tensors"):
-            m([a, b])
-
-        # Check invalid output container
-        class MyModule(nn.Module):
-            def forward(self, a, b):
-                return [a.clone(), b.clone()]
-
-        m = MyModule()
-        m.register_backward_hook(noop)
-
-        with self.assertWarnsRegex(UserWarning, "does not return a single Tensor or a tuple of Tensors"):
-            m(a, b)
-
-        # Check invalid output from different Nodes
-        class MyModule(nn.Module):
-            def forward(self, a, b):
-                return a.clone(), b.clone()
-
-        m = MyModule()
-        m.register_backward_hook(noop)
-
-        with self.assertWarnsRegex(UserWarning, "outputs are generated by different autograd Nodes"):
-            m(a, b)
-
-        # Check invalid forward with multiple Nodes
-        class MyModule(nn.Module):
-            def forward(self, a):
-                return a.clone().clone()
-
-        m = MyModule()
-        m.register_backward_hook(noop)
-
-        with self.assertWarnsRegex(UserWarning, "the forward contains multiple autograd Nodes"):
-            m(a)
-
-    def test_hook_backward_size(self):
-        # Make module with multiple operations in forward
-        # And different size for input and outputs
-        class MyModule(nn.Module):
-            def forward(self, arg1, arg2):
-                tmp = arg1.sum() * arg2
-                tmp = tmp + arg2.sum() * arg1.sum()
-                tmp = tmp.sum().view(1)
-                tmp = tmp.expand(8).contiguous()
-                return tmp
-
-        module = MyModule()
-        inp1 = torch.randn(5, 5, requires_grad=True)
-        inp2 = torch.randn(10, 10, requires_grad=True)
-
-        def bw_hook(module, grad_input, grad_output):
-            self.assertEqual(len(grad_input), 2)
-            self.assertEqual(grad_input[0].size(), torch.Size([5, 5]))
-            self.assertEqual(grad_input[1].size(), torch.Size([10, 10]))
-            self.assertEqual(len(grad_output), 1)
-            self.assertEqual(grad_output[0].size(), torch.Size([8]))
-
-        with module.register_full_backward_hook(bw_hook):
-            module(inp1, inp2).sum().backward()
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_hook_backward_writeable(self):
-        module = nn.Sigmoid()
-        input = torch.randn(5, 5, requires_grad=True)
-        sig_x = torch.nn.functional.sigmoid(input)
-
-        def bw_hook(module, grad_input, grad_output):
-            for grad in grad_input:
-                self.assertTrue(isinstance(grad, torch.Tensor))
-            for grad in grad_output:
-                self.assertTrue(isinstance(grad, torch.Tensor))
-            return tuple(gi * 2 for gi in grad_input)
-
-        module.register_backward_hook(bw_hook)
-        module(input).backward(torch.ones(5, 5))
-        expected_grad = sig_x * (1 - sig_x) * 2
-        self.assertEqual(input.grad, expected_grad)
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_hook_forward_preforward_writable(self):
-        module = nn.Sigmoid()
-        input = torch.randn(5, 5, requires_grad=True)
-        sig_x = torch.nn.functional.sigmoid(input)
-
-        def forward_pre_hook(m, input):
-            return torch.nn.functional.relu(input[0])
-
-        def forward_hook(m, input, output):
-            return -output
-
-        module.register_forward_pre_hook(forward_pre_hook)
-        module.register_forward_hook(forward_hook)
-        output = module(input)
-        expected_res = -torch.nn.functional.sigmoid(torch.nn.functional.relu(input))
-        self.assertEqual(output, expected_res)
-        output.backward(torch.ones(5, 5) * 2, retain_graph=True)
-        mask = (input > 0).double()
-        expected_grad = -sig_x * (1 - sig_x) * 2 * mask
-        self.assertEqual(input.grad, expected_grad)
-
-    def test_hook_buffer_registration(self):
-        for return_buffer in (True, False):
-            def buffer_registration_hook(module, name, buffer):
-                buffer.registered = True
-                if return_buffer:
-                    return buffer
-            handle = torch.nn.modules.module.register_module_buffer_registration_hook(
-                buffer_registration_hook
-            )
-            try:
-                l, n, s = self._create_basic_net()
-                for b in s.buffers():
-                    self.assertTrue(getattr(b, "registered", False))
-            finally:
-                handle.remove()
-
-    def test_hook_submodule_registration(self):
-        for return_submodule in (True, False):
-            def module_registration_hook(module, name, submodule):
-                module.registered = True
-                submodule.registered = True
-                if return_submodule:
-                    return submodule
-            handle = torch.nn.modules.module.register_module_module_registration_hook(
-                module_registration_hook
-            )
-            try:
-                l, n, s = self._create_basic_net()
-                for m in s.modules():
-                    self.assertTrue(getattr(m, "registered", False))
-            finally:
-                handle.remove()
-
-    def test_hook_parameter_registration(self):
-        for return_parameter in (True, False):
-            def parameter_registration_hook(module, name, parameter):
-                parameter.registered = True
-                if return_parameter:
-                    return parameter
-            handle = torch.nn.modules.module.register_module_parameter_registration_hook(
-                parameter_registration_hook
-            )
-            try:
-                l, n, s = self._create_basic_net()
-                for p in s.parameters():
-                    self.assertTrue(getattr(p, "registered", False))
-            finally:
-                handle.remove()
-
     def test_to(self):
         m = nn.Linear(3, 5)
         self.assertIs(m, m.to('cpu'))
@@ -743,7 +262,7 @@ def test_parameters_and_named_parameters(self):
         def names(named_parameters):
             return [k for k, _ in named_parameters]
 
-        l, n, s = self._create_basic_net()
+        l, n, s = _create_basic_net()
 
         self.assertEqual(len(list(l.parameters())), 1)
         self.assertEqual(
@@ -797,7 +316,7 @@ def test_buffers_and_named_buffers(self):
         def names(named_buffers):
             return [k for k, _ in named_buffers]
 
-        l, n, s = self._create_basic_net()
+        l, n, s = _create_basic_net()
 
         self.assertEqual(len(list(l.buffers())), 1)
         self.assertEqual(
@@ -15398,291 +14917,6 @@ def perm_fn(x):
                 _test(activation=activation, batch_first=batch_first, training=training)
 
 
-class TestModuleGlobalHooks(TestCase):
-
-    def tearDown(self):
-        nn.modules.module._global_backward_hooks = OrderedDict()
-        nn.modules.module._global_forward_hooks = OrderedDict()
-        nn.modules.module._global_forward_pre_hooks = OrderedDict()
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_module_global_hooks(self):
-        module = nn.Sigmoid
-
-        module_1 = module()
-        module_2 = module()
-        module_3 = module()
-
-        input = torch.ones(5, 5, requires_grad=True)
-
-        counter = {
-            'forwards': 0,
-            'backwards': 0
-        }
-
-        def fw_hook(inc, h_module, input, output):
-            self.assertIsInstance(input, tuple)
-            self.assertTrue(isinstance(output, torch.Tensor))
-            self.assertTrue(isinstance(h_module, module))
-            self.assertEqual(input[0], torch.ones(5, 5))
-            self.assertEqual(output, torch.empty(5, 5).fill_(1 / (1 + 1 / math.e)))
-            counter['forwards'] += inc
-
-        def bw_hook(inc, h_module, grad_input, grad_output):
-            self.assertIsInstance(grad_input, tuple)
-            self.assertIsInstance(grad_output, tuple)
-            self.assertTrue(isinstance(h_module, module))
-            self.assertEqual(grad_output[0], torch.ones(5, 5) * 2)
-            counter['backwards'] += inc
-
-        test_fwd = nn.modules.module.register_module_forward_hook(lambda *args: fw_hook(1, *args))
-
-        module_1(input)
-        module_2(input)
-        module_3(input)
-        self.assertEqual(counter['forwards'], 3)
-        self.assertEqual(counter['backwards'], 0)
-
-        test_bwd = nn.modules.module.register_module_backward_hook(
-            lambda *args: bw_hook(1, *args))
-
-        output_1 = module_1(input)
-        output_2 = module_2(input)
-        output_3 = module_3(input)
-        self.assertEqual(counter['forwards'], 6)
-        self.assertEqual(counter['backwards'], 0)
-
-        output_1.backward(torch.ones(5, 5) * 2, retain_graph=True)
-        output_2.backward(torch.ones(5, 5) * 2, retain_graph=False)
-        output_3.backward(torch.ones(5, 5) * 2, retain_graph=False)
-        self.assertEqual(counter['forwards'], 6)
-        self.assertEqual(counter['backwards'], 3)
-
-        output_1.backward(torch.ones(5, 5) * 2, retain_graph=True)
-        self.assertEqual(counter['forwards'], 6)
-        self.assertEqual(counter['backwards'], 4)
-
-        test2_fwd = nn.modules.module.register_module_forward_hook(lambda *args: fw_hook(2, *args))
-
-        output = module_1(input)
-        output = module_2(input)
-        output = module_3(input)
-        self.assertEqual(counter['forwards'], 15)
-        self.assertEqual(counter['backwards'], 4)
-
-        test2_bwd = nn.modules.module.register_module_backward_hook(lambda *args: bw_hook(2, *args))
-
-        module_1(input).backward(torch.ones(5, 5) * 2)
-        self.assertEqual(counter['forwards'], 18)
-        self.assertEqual(counter['backwards'], 7)
-
-        test2_bwd.remove()
-
-        module_2(input).backward(torch.ones(5, 5) * 2)
-        self.assertEqual(counter['forwards'], 21)
-        self.assertEqual(counter['backwards'], 8)
-
-        test2_fwd.remove()
-
-        module_3(input).backward(torch.ones(5, 5) * 2)
-        self.assertEqual(counter['forwards'], 22)
-        self.assertEqual(counter['backwards'], 9)
-
-        test_fwd.remove()
-        test_bwd.remove()
-
-    def test_module_global_hook_invalid_outputs(self):
-        module = nn.Sigmoid()
-        input = torch.randn(5, 5, requires_grad=True)
-
-        def bw_fail1(self, grad_input, grad_output):
-            return grad_input[:-1]
-
-        def bw_fail2(self, grad_input, grad_output):
-            return grad_input + (torch.randn(2, 2),)
-
-        with nn.modules.module.register_module_backward_hook(bw_fail1):
-            with self.assertRaisesRegex(RuntimeError, 'got 0, but expected 1'):
-                module(input).sum().backward()
-
-        with nn.modules.module.register_module_backward_hook(bw_fail2):
-            with self.assertRaisesRegex(RuntimeError, 'got 2, but expected 1'):
-                module(input).sum().backward()
-
-    @skipIfTorchDynamo("https://github.com/pytorch/torchdynamo/issues/847")
-    def test_module_backward_global_hook_writeable(self):
-        module = nn.Sigmoid()
-        input = torch.randn(5, 5, requires_grad=True)
-        sig_x = torch.sigmoid(input)
-
-        def bw_hook(module, grad_input, grad_output):
-            for grad in grad_input:
-                self.assertTrue(isinstance(grad, torch.Tensor))
-            for grad in grad_output:
-                self.assertTrue(isinstance(grad, torch.Tensor))
-            return tuple(gi * 2 for gi in grad_input)
-
-        nn.modules.module.register_module_backward_hook(bw_hook)
-        module(input).backward(torch.ones(5, 5))
-        expected_grad = sig_x * (1 - sig_x) * 2
-        self.assertEqual(input.grad, expected_grad)
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_module_global_forward_preforward_hook_writeable(self):
-        module = nn.Sigmoid()
-        input = torch.randn(5, 5, requires_grad=True)
-        sig_x = torch.sigmoid(input)
-
-        def forward_pre_hook(m, input):
-            return torch.nn.functional.relu(input[0])
-
-        def forward_hook(m, input, output):
-            return -output
-
-        nn.modules.module.register_module_forward_pre_hook(forward_pre_hook)
-        nn.modules.module.register_module_forward_hook(forward_hook)
-        output = module(input)
-        expected_res = -torch.sigmoid(torch.nn.functional.relu(input))
-        self.assertEqual(output, expected_res)
-        output.backward(torch.ones(5, 5) * 2, retain_graph=True)
-        mask = (input > 0).double()
-        expected_grad = -sig_x * (1 - sig_x) * 2 * mask
-        self.assertEqual(input.grad, expected_grad)
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_module_forward_preforward_hook_removable(self):
-        """
-        This test is to test when multiple pre-forward hook functions can be
-        registered successfully and used correctly, if the handle can be removable
-        during the pre-forward hook function call.
-        """
-        module = nn.Sigmoid()
-
-        def removable_hook(m, input):
-            nonlocal handle
-            handle.remove()
-            return input
-
-        def removable_hook_2(m, input):
-            nonlocal handle_2
-            handle_2.remove()
-            return input
-
-        handle = module.register_forward_pre_hook(removable_hook)
-        handle_2 = module.register_forward_pre_hook(removable_hook_2)
-
-        # make sure hook register is successful
-        self.assertEqual(len(handle.hooks_dict_ref()), 2)
-        self.assertEqual(len(handle_2.hooks_dict_ref()), 2)
-
-        input = torch.randn(2, 2)
-        output = module(input)
-        self.assertEqual(torch.sigmoid(input), output)
-
-        # make sure hook removal is successful
-        self.assertFalse(handle.id in handle.hooks_dict_ref())
-        self.assertFalse(handle_2.id in handle.hooks_dict_ref())
-        self.assertEqual(len(handle.hooks_dict_ref()), 0)
-        self.assertEqual(len(handle_2.hooks_dict_ref()), 0)
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_module_forward_forward_hook_removable(self):
-        """
-        This test is to test when multiple forward hook functions can be registered
-        successfully and used correctly, if the handle can be removable during the
-        forward hook function call.
-        """
-        module = nn.Sigmoid()
-
-        def removable_hook(m, input, output):
-            nonlocal handle
-            handle.remove()
-            return output
-
-        def removable_hook_2(m, input, output):
-            nonlocal handle_2
-            handle_2.remove()
-            return output
-
-        handle = module.register_forward_hook(removable_hook)
-        handle_2 = module.register_forward_hook(removable_hook_2)
-
-        # make sure hook register is successful
-        self.assertEqual(len(handle.hooks_dict_ref()), 2)
-        self.assertEqual(len(handle_2.hooks_dict_ref()), 2)
-
-        input = torch.randn(2, 2)
-        output = module(input)
-        self.assertEqual(torch.sigmoid(input), output)
-
-        # make sure hook removal is successful
-        self.assertFalse(handle.id in handle.hooks_dict_ref())
-        self.assertFalse(handle_2.id in handle.hooks_dict_ref())
-        self.assertEqual(len(handle.hooks_dict_ref()), 0)
-        self.assertEqual(len(handle_2.hooks_dict_ref()), 0)
-
-    @skipIfTorchDynamo("TorchDynamo does not work well with hooks")
-    def test_global_and_local_hooks_order(self):
-        module = nn.Sigmoid()
-
-        global_forward_pre_called = False
-        local_forward_pre_called = False
-        global_forward_called = False
-        local_forward_called = False
-        global_backward_called = False
-        local_backward_called = False
-
-        def global_forward_pre_hook(m, input):
-            nonlocal global_forward_pre_called
-            self.assertTrue(not local_forward_pre_called)
-            global_forward_pre_called = True
-            return input
-
-        def local_forward_pre_hook(m, input):
-            nonlocal local_forward_pre_called
-            self.assertTrue(global_forward_pre_called)
-            local_forward_pre_called = True
-            return input
-
-        def global_forward_hook(m, input, output):
-            nonlocal global_forward_called
-            self.assertTrue(not local_forward_called)
-            global_forward_called = True
-            return output
-
-        def local_forward_hook(m, input, output):
-            nonlocal local_forward_called
-            self.assertTrue(global_forward_called)
-            local_forward_called = True
-            return output
-
-        def global_backward_hook(m, input, output):
-            nonlocal global_backward_called
-            self.assertTrue(not local_backward_called)
-            global_backward_called = True
-            return input
-
-        def local_backward_hook(m, input, output):
-            nonlocal local_backward_called
-            self.assertTrue(global_backward_called)
-            local_backward_called = True
-            return input
-
-        input = torch.randn(5, 5, requires_grad=True)
-        nn.modules.module.register_module_forward_pre_hook(global_forward_pre_hook)
-        module.register_forward_pre_hook(local_forward_pre_hook)
-        nn.modules.module.register_module_forward_hook(global_forward_hook)
-        module.register_forward_hook(local_forward_hook)
-        nn.modules.module.register_module_backward_hook(global_backward_hook)
-        module.register_backward_hook(local_backward_hook)
-
-        output = module(input)
-        self.assertTrue(local_forward_called and local_forward_pre_called and global_forward_called and global_forward_pre_called)
-
-        output.backward(torch.ones(5, 5), retain_graph=True)
-        self.assertTrue(local_backward_called and global_backward_called)
-
-
 class TestFunctionalPickle(TestCase):
 
     # issue gh-38137
@@ -15690,209 +14924,6 @@ def test_pickle_softsign(self):
         # Make sure it does not throw an exception
         s = pickle.dumps(F.softsign)
 
-def _hook_to_pickle(*args, **kwargs):
-    pass
-
-class TestStateDictHooks(TestCase):
-
-    def test_load_state_dict_pre_hook(self):
-
-        m = nn.Linear(10, 10)
-        m_state_dict = m.state_dict()
-
-        m_load = nn.Linear(10, 10)
-
-        hook_called = 0
-
-        def hook_without_module(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
-            self.assertEqual(m_state_dict, state_dict)
-            nonlocal hook_called
-            hook_called += 1
-
-        def hook_with_module(module, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
-            self.assertEqual(m_state_dict, state_dict)
-            self.assertTrue(m_load is module)
-            nonlocal hook_called
-            hook_called += 1
-
-        hook_called = 0
-        m_load._register_load_state_dict_pre_hook(hook_without_module)
-        m_load.load_state_dict(m_state_dict)
-        self.assertEqual(1, hook_called)
-
-        hook_called = 0
-        m_load._register_load_state_dict_pre_hook(hook_with_module, True)
-        m_load.load_state_dict(m_state_dict)
-        self.assertEqual(2, hook_called)
-
-    def test_no_extra_ref_to_module(self):
-        try:
-            gc.disable()
-            m = nn.Linear(10, 10)
-
-            m._register_load_state_dict_pre_hook(_hook_to_pickle, True)
-            weak_m = weakref.ref(m)
-            del m
-
-            self.assertEqual(weak_m(), None)
-        finally:
-            gc.enable()
-
-    def test_pickled_hook(self):
-        m = nn.Linear(10, 10)
-        m._register_load_state_dict_pre_hook(_hook_to_pickle, True)
-        pickle.loads(pickle.dumps(m))
-
-    def test_load_state_dict_module_pre_hook(self):
-        hook_called = 0
-
-        # Test with module instance method as hook
-        class MyModule(nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-                self.foo = torch.nn.Parameter(torch.rand(10))
-
-            def my_pre_load_hook(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
-                assert [] == error_msgs
-                assert [] == unexpected_keys
-                assert [] == missing_keys
-                assert strict
-                nonlocal hook_called
-                hook_called += 1
-
-            def my_pre_load_hook_with_module(
-                self,
-                module,
-                state_dict,
-                prefix,
-                local_metadata,
-                strict,
-                missing_keys,
-                unexpected_keys,
-                error_msgs,
-            ):
-                assert [] == error_msgs
-                assert [] == unexpected_keys
-                assert [] == missing_keys
-                assert strict
-                assert self is module
-                nonlocal hook_called
-                hook_called += 1
-
-        # Test that hooks registered on a submodule are also called
-        # appropriately, i.e. with the submodule as module argument in
-        # my_pre_load_hook_with_module.
-        class MyModuleContainer(nn.Module):
-            def __init__(self, mod):
-                super().__init__()
-                self.mod = mod
-
-        for ctor in [MyModuleContainer, lambda x: x]:
-            m = ctor(MyModule())
-            state_dict = m.state_dict()
-            if isinstance(m, MyModuleContainer):
-                mod = m.mod
-            else:
-                mod = m
-
-            hook_called = 0
-            mod._register_load_state_dict_pre_hook(
-                mod.my_pre_load_hook
-            )
-            m.load_state_dict(state_dict)
-            self.assertEqual(1, hook_called)
-
-            hook_called = 0
-            mod._register_load_state_dict_pre_hook(
-                mod.my_pre_load_hook_with_module, True
-            )
-            m.load_state_dict(state_dict)
-            self.assertEqual(2, hook_called)
-
-    def test_load_state_dict_post_hook(self):
-        hook_called = 0
-
-        class MyModule(nn.Module):
-            def __init__(self):
-                super(MyModule, self).__init__()
-                self.foo = torch.nn.Parameter(torch.rand(10))
-
-            def my_post_load_hook(self, module, incompatible_keys):
-                assert module is self
-                nonlocal hook_called
-                incompatible_keys.missing_keys.append("foo")
-                incompatible_keys.unexpected_keys.append("bar")
-                hook_called += 1
-
-        nested = MyModule()
-        wrapped = nn.ModuleList([nested])
-        handle = nested.register_load_state_dict_post_hook(
-            nested.my_post_load_hook,
-        )
-        # Hook must be called even if it is wrapped
-        ret = wrapped.load_state_dict(wrapped.state_dict(), strict=False)
-        self.assertEqual(hook_called, 1)
-        # Ensure that the hook modified missing_keys and unexpected_keys
-        missing = ret.missing_keys
-        unexpected = ret.unexpected_keys
-        self.assertEqual(missing, ["foo"])
-        self.assertEqual(unexpected, ["bar"])
-        # When called with strict=True, the error raised should mention the
-        # missing and unexpected keys the hook added.
-        with self.assertRaisesRegex(RuntimeError, "foo.*\n.*bar"):
-            wrapped.load_state_dict(wrapped.state_dict(), strict=True)
-        self.assertEqual(hook_called, 2)
-        # Removing the hook via handle.remove() should cause it not to
-        # fire anymore.
-        handle.remove()
-        # Hook did not run so it should not have added any keys
-        ret = wrapped.load_state_dict(wrapped.state_dict(), strict=False)
-        self.assertEqual(ret.missing_keys, [])
-        self.assertEqual(ret.unexpected_keys, [])
-        # hook_called should not have been incremented
-        self.assertEqual(hook_called, 2)
-
-        def load_hook_clear_incompatible(module, incompatible_keys):
-            incompatible_keys.missing_keys.clear()
-            incompatible_keys.unexpected_keys.clear()
-
-        nested.register_load_state_dict_post_hook(load_hook_clear_incompatible)
-        state_dict = wrapped.state_dict()
-        state_dict["extra"] = torch.ones(1)
-        # load state_dict with strict=True should not throw.
-        ret = wrapped.load_state_dict(state_dict, strict=True)
-        # explicitly ensure that the post hook clearned out incompatible_keys
-        self.assertEqual([], ret.missing_keys)
-        self.assertEqual([], ret.unexpected_keys)
-
-    @unittest.skipIf(IS_WINDOWS, "Tempfile permission issue on windows")
-    def test_load_state_dict_post_hook_backward_compatibility(self):
-        def my_post_load_hook(mod, _):
-            nonlocal called
-            called = True
-
-        for m in [nn.Softmin(10), nn.Softmax(10), nn.LogSoftmax(10)]:
-            called = False
-            sd = deepcopy(m.state_dict())
-            self.assertTrue(hasattr(m, '_load_state_dict_post_hooks'))
-            # Simulate an older model that did not have this attr
-            delattr(m, '_load_state_dict_post_hooks')
-            # Save and load, and ensure that load_state_dict works (without proper
-            # BC we would run into errors because this attribute would be expected).
-            # In particular, Softmax runs into the issue described here:
-            # https://github.com/pytorch/pytorch/issues/77280
-            with NamedTemporaryFile() as f:
-                # Note that torch.save / torch.load is not recommended to save/load
-                # modules.
-                torch.save(m, f.name)
-                m = torch.load(f.name)
-                m.load_state_dict(sd)
-                self.assertFalse(called)
-
-            # Ensure hooks can be registered and called.
-            m.register_load_state_dict_post_hook(my_post_load_hook)
-            m.load_state_dict(sd)
-            self.assertTrue(called)
 
 class TestFusionUtils(TestCase):
     def test_fuse_conv_bn_requires_grad(self):
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index 2d10ba4ba0ab4..d582b5a2b4580 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -6471,3 +6471,24 @@ def _test_module_empty_input(test_case, module, inp, check_size=True, inference=
             if p.requires_grad:
                 test_case.assertEqual(p.grad, torch.zeros_like(p.grad))
         test_case.assertEqual(inp.grad, torch.zeros_like(inp))
+
+
+def _create_basic_net():
+    class Layer(nn.Module):
+        def __init__(self):
+            super(Layer, self).__init__()
+            self.layer_dummy_param = nn.Parameter(torch.empty(3, 5))
+            self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7))
+
+    class Net(nn.Module):
+        def __init__(self):
+            super(Net, self).__init__()
+            self.l1 = Layer()
+            self.dummy_param = nn.Parameter(torch.empty(3, 5))
+            self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1))
+
+    l = Layer()
+    n = Net()
+    s = nn.Sequential(n, n)
+
+    return l, n, s

From 89649a05edba4d9778a30eea89d9075eb8a1d57e Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 22 Nov 2022 20:29:25 -0800
Subject: [PATCH 1203/1922] [quant] Add support for quantize_per_channel in the
 reference flow with decomposed tensor (#89270)

Summary:
att, after this PR we can produce quantize_per_channel and dequantize_per_channel ops (typically used for quantizing weights)
in the reference flow using decomposed tensor

Test Plan:
python test/test_quantization.py -k test__convert_to_reference_decomposed_fx_per_channel_quant

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89270
Approved by: https://github.com/vkuzo
---
 test/quantization/fx/test_quantize_fx.py | 31 ++++++++++++++++++++++++
 torch/ao/quantization/fx/convert.py      | 19 ++++++++++++---
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index d31641ec2ae31..107e6eb589f2e 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -42,6 +42,7 @@
     QuantWrapper,
     default_qconfig,
     default_dynamic_qconfig,
+    default_per_channel_qconfig,
     default_qat_qconfig,
     default_reuse_input_qconfig,
     default_symmetric_qnnpack_qconfig,
@@ -5377,6 +5378,36 @@ def forward(self, x):
             res = m(*example_inputs)
             self.assertEqual(res, res_ref)
 
+    def test__convert_to_reference_decomposed_fx_per_channel_quant(self):
+        class M(torch.nn.Module):
+            def forward(self, x, weight, bias):
+                return F.linear(x, weight, bias)
+
+        m = M().eval()
+        qconfig_mapping = get_default_qconfig_mapping("fbgemm") \
+            .set_object_type(F.linear, default_per_channel_qconfig)
+        example_inputs = (torch.randn(1, 5), torch.randn(10, 5), torch.randn(10,))
+        m = prepare_fx(m, qconfig_mapping, example_inputs)
+        m(*example_inputs)
+        m_ref = copy.deepcopy(m)
+        m_ref = convert_to_reference_fx(m_ref)
+        m = _convert_to_reference_decomposed_fx(m)
+        expected_occurrence = {
+            # for input and output activations
+            ns.call_function(torch.ops.quantized_decomposed.quantize_per_tensor): 2,
+            ns.call_function(torch.ops.quantized_decomposed.dequantize_per_tensor): 2,
+            # for weight
+            ns.call_function(torch.ops.quantized_decomposed.quantize_per_channel): 1,
+            ns.call_function(torch.ops.quantized_decomposed.dequantize_per_channel): 1,
+        }
+        self.checkGraphModuleNodes(
+            m,
+            expected_node_occurrence=expected_occurrence)
+        # make sure it runs
+        res_ref = m_ref(*example_inputs)
+        res = m(*example_inputs)
+        self.assertEqual(res, res_ref)
+
     def test_change_backend_config_for_fixed_qparam_ops(self):
         """ Making sure we can skip validation of qconfigs for fixedqparam ops based
         on BackendConfig
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index e7e0b482356a8..58846d116ff65 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -146,8 +146,23 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
         quantize_op : Optional[Callable] = None
         scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined]
         if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
-            raise NotImplementedError("decomposed quantize_per_channel op not implemented yet")
+            ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined]
+            quantize_op = torch.ops.quantized_decomposed.quantize_per_channel
+            dequantize_op = torch.ops.quantized_decomposed.dequantize_per_channel
+            quant_min = activation_post_process.quant_min
+            quant_max = activation_post_process.quant_max
+            dtype_ = to_underlying_dtype(dtype)
+            qparams = {
+                "_scale_": scale,
+                "_zero_point_": zero_point,
+                "_axis_": ch_axis,
+                "_quant_min_": quant_min,
+                "_quant_max_": quant_max,
+                "_dtype_": dtype_
+            }
         else:
+            quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor
+            dequantize_op = torch.ops.quantized_decomposed.dequantize_per_tensor
             scale = float(scale)
             zero_point = int(zero_point)
             quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
@@ -160,7 +175,6 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
                 "_quant_max_": quant_max,
                 "_dtype_": dtype_
             }
-            quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor
 
         # 2. replace activation_post_process node with quantize and dequantize
         with graph.inserting_before(node):
@@ -182,7 +196,6 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
             quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
             # use the same qparams from quantize op
             dq_inputs = [quantized_node] + quantize_op_inputs[1:]
-            dequantize_op = torch.ops.quantized_decomposed.dequantize_per_tensor
             dequantized_node = graph.call_function(
                 dequantize_op,
                 tuple(dq_inputs),

From 170971187a01cd42b95153d305a9ac705e023571 Mon Sep 17 00:00:00 2001
From: Yukio Siraichi <yukio.siraichi@gmail.com>
Date: Wed, 23 Nov 2022 15:56:54 +0000
Subject: [PATCH 1204/1922] Call `symint::sizes()` instead of `sizes()` on
 convolution error messages. (#89549)

This PR fixes convolution when using `torchdynamo` with dynamic shapes.

**Problem:** there are some `tensor.sizes()` calls in a few error messages. As a result, an uninformative error message was being displayed.

```python
@torch._dynamo.optimize("eager")
def foo(inp, w):
    return F.conv2d(inp, w)

inp = torch.rand((1, 1, 32, 32))
w = torch.rand((1, 2, 3, 3))
#                  |
#                  |--------- incorrect shape!

foo(inp, w)
```

-----
**Before this PR:**
```python
Traceback (most recent call last):
  File "torch/_dynamo/utils.py", line 1076, in run_node
    return node.target(*args, **kwargs)
  File "torch/_subclasses/fake_tensor.py", line 867, in __torch_dispatch__
    op_impl_out = op_impl(self, func, *args, **kwargs)
  File "torch/_subclasses/fake_tensor.py", line 445, in conv
    conv_backend = torch._C._select_conv_backend(**kwargs)
RuntimeError: Cannot call sizes() on tensor with symbolic sizes/strides
```

**After this PR:**
```python
Traceback (most recent call last):
  File "torch/_dynamo/utils.py", line 1076, in run_node
    return node.target(*args, **kwargs)
  File "torch/_subclasses/fake_tensor.py", line 867, in __torch_dispatch__
    op_impl_out = op_impl(self, func, *args, **kwargs)
  File "torch/_subclasses/fake_tensor.py", line 445, in conv
    conv_backend = torch._C._select_conv_backend(**kwargs)
RuntimeError: Given groups=1, weight of size [1, s1, s2, s2], expected input[1, 1, s0, s0] to have s1 channels, but got 1 channels instead
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89549
Approved by: https://github.com/ezyang
---
 aten/src/ATen/native/Convolution.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index 8584bae445ad7..edb51a5c837d8 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -659,7 +659,7 @@ static void check_shape_forward(const at::Tensor& input,
 
     TORCH_CHECK(at::symint::size<T>(input, 1) == (weight_sizes[1] * groups),
                 "Given groups=", groups, ", weight of size ", weight_sizes,
-                ", expected input", input.sizes(), " to have ",
+                ", expected input", at::symint::sizes<T>(input), " to have ",
                 (weight_sizes[1] * groups), " channels, but got ", at::symint::size<T>(input, 1),
                 " channels instead");
 
@@ -697,12 +697,12 @@ static void check_shape_forward(const at::Tensor& input,
   } else { // transposed
     TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
              "Given transposed=", transposed, ", weight of size ", weight_sizes,
-             ", expected input", input.sizes(), " to have ", weight_sizes[0],
+             ", expected input", at::symint::sizes<T>(input), " to have ", weight_sizes[0],
              " channels, but got ", at::symint::size<T>(input, 1), " channels instead");
     TORCH_CHECK(!bias.defined() || (bias.ndimension() == 1 && at::symint::size<T>(bias, 0) == weight_sizes[1] * groups),
              "Given transposed=", transposed, ", weight of size ", weight_sizes,
              ", expected bias to be 1-dimensional with ", weight_sizes[1] * groups, " elements",
-             ", but got bias of size ", bias.sizes(), " instead");
+             ", but got bias of size ", at::symint::sizes<T>(bias), " instead");
   }
 }
 

From 3155cf556a0170b1b4845e8f8d8187c02667ecd1 Mon Sep 17 00:00:00 2001
From: fduwjj <fduwjj@fb.com>
Date: Wed, 23 Nov 2022 05:29:53 +0000
Subject: [PATCH 1205/1922] [PT-D][3/N] Sync TP API change to Pytorch (#89535)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89535
Approved by: https://github.com/wanchaol
---
 .../_tensor/parallel/test_parallelize_api.py  | 125 ++++++++++--
 .../_tensor/parallel/test_tp_examples.py      |   1 +
 .../_tensor/parallel/test_tp_style.py         |   5 +-
 .../distributed/_tensor/parallel/__init__.py  |   8 +-
 torch/distributed/_tensor/parallel/api.py     | 187 +++++++++++++-----
 torch/distributed/_tensor/parallel/fsdp.py    |   4 +-
 torch/distributed/_tensor/parallel/style.py   | 110 +++++++----
 torch/distributed/_tensor/parallel/utils.py   |  21 +-
 8 files changed, 339 insertions(+), 122 deletions(-)

diff --git a/test/distributed/_tensor/parallel/test_parallelize_api.py b/test/distributed/_tensor/parallel/test_parallelize_api.py
index 036f4ef79a491..82ba0b0032c6a 100644
--- a/test/distributed/_tensor/parallel/test_parallelize_api.py
+++ b/test/distributed/_tensor/parallel/test_parallelize_api.py
@@ -3,9 +3,17 @@
 import torch
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase, with_comms
-from torch.distributed._tensor import distribute_tensor, DeviceMesh, Shard, Replicate
-from torch.distributed._tensor.parallel import PairwiseParallel, ParallelStyle
-from torch.distributed._tensor.parallel.api import _parallelize_mlp
+from torch.distributed._tensor import DeviceMesh, Replicate, DTensor
+from torch.distributed._tensor.parallel.style import (
+    ColwiseParallel,
+    PairwiseParallel,
+    ParallelStyle,
+    RowwiseParallel,
+)
+from torch.distributed._tensor.parallel.api import (
+    _parallelize_linear,
+    _parallelize_mlp,
+)
 from torch.distributed._tensor.parallel.utils import _create_1d_device_mesh
 from torch.distributed._tensor.parallel.style import (
     make_input_replicate_1d,
@@ -71,8 +79,64 @@ def test_creat_1d_device_mesh_error(self):
         ):
             _create_1d_device_mesh(mesh, 3)
 
+    def _compare_params(
+        self,
+        local_module,
+        dist_module,
+        skip_rowwise_bias=False,
+        compare_grad=False,
+    ):
+        replicate = [Replicate()]
+        for name, param in local_module.named_parameters():
+            dist_param = dist_module.get_parameter(name)
+            param = param.grad if compare_grad else param
+            dist_param = dist_param.grad if compare_grad else dist_param
+            if self.rank == 0 or (
+                name not in ["net2.bias"]
+                and not skip_rowwise_bias
+                or name not in ["bias", "net2.bias"]
+            ):
+                self.assertEqual(
+                    param,
+                    dist_param.redistribute(
+                        device_mesh=dist_param.device_mesh, placements=replicate
+                    ).to_local(),
+                )
+
+    def _compare_module(
+        self, local_module, dist_module, inp_size, rowwise=False
+    ):
+        LR = 0.25  # the learning rate we use for testing
+        local_optim = torch.optim.SGD(local_module.parameters(), lr=LR)
+        dist_optim = torch.optim.SGD(dist_module.parameters(), lr=LR)
+        torch.manual_seed(0)
+        inp = torch.rand(*inp_size, device=self.device_type)
+        self._compare_params(local_module, dist_module)
+
+        # check forward correctness
+        local_output = local_module(inp)
+        inp = inp.chunk(self.world_size, dim=-1)[self.rank] if rowwise else inp
+        dist_output = dist_module(inp)
+        dist_output = (
+            dist_output.to_local()
+            if isinstance(dist_output, DTensor)
+            else dist_output
+        )
+        self.assertEqual(local_output, dist_output)
+
+        local_output.sum().backward()
+        dist_output.sum().backward()
+
+        # check backward and ensure gradients are same
+        self._compare_params(local_module, dist_module, rowwise, True)
+
+        local_optim.step()
+        dist_optim.step()
+        self._compare_params(local_module, dist_module, rowwise)
+
     @with_comms
     def test_parallelize_mlp(self):
+        inp_size = [12, 10]
         model = MLPModule(self.device_type)
         model_tp = MLPModule(self.device_type)
 
@@ -87,24 +151,7 @@ def test_parallelize_mlp(self):
             self.device_type, torch.arange(self.world_size)
         )
         model_tp = _parallelize_mlp(model_tp, device_mesh, PairwiseParallel())
-
-        # Ensure the parameter is properly distributed.
-        self.assertEqual(
-            distribute_tensor(model.net1.weight, device_mesh, [Shard(0)]),
-            model_tp.net1.weight,
-        )
-        self.assertEqual(
-            distribute_tensor(model.net1.bias, device_mesh, [Shard(0)]),
-            model_tp.net1.bias,
-        )
-        self.assertEqual(
-            distribute_tensor(model.net2.weight, device_mesh, [Shard(1)]),
-            model_tp.net2.weight,
-        )
-        self.assertEqual(
-            distribute_tensor(model.net2.bias, device_mesh, [Replicate()]),
-            model_tp.net2.bias,
-        )
+        self._compare_module(model, model_tp, inp_size)
 
     @with_comms
     def test_parallelize_mlp_error(self):
@@ -131,6 +178,42 @@ def __init__(self) -> None:
                 torch.nn.Linear(10, 5), device_mesh, PairwiseParallel()
             )
 
+    @with_comms
+    def test_linear_row_wise_parallel(self):
+        # test RowwiseParallel
+        inp_size = [9, 16]
+        rowwise = RowwiseParallel()
+
+        torch.manual_seed(5)
+        model = torch.nn.Linear(16, 10, device=self.device_type)
+        torch.manual_seed(5)
+        model_tp = torch.nn.Linear(16, 10, device=self.device_type)
+
+        # parallelize model_tp
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        model_tp = _parallelize_linear(model_tp, device_mesh, rowwise)
+
+        # let each rank generate unique local input
+        torch.manual_seed(self.rank)
+        self._compare_module(model, model_tp, inp_size, True)
+
+    @with_comms
+    def test_linear_col_wise_parallel(self):
+        # test ColwiseParallel
+        inp_size = [8, 10]
+        colwise = ColwiseParallel()
+
+        torch.manual_seed(5)
+        model = torch.nn.Linear(10, 16, device=self.device_type)
+        torch.manual_seed(5)
+        model_tp = torch.nn.Linear(10, 16, device=self.device_type)
+
+        # parallelize model_tp
+        device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
+        model_tp = _parallelize_linear(model_tp, device_mesh, colwise)
+
+        self._compare_module(model, model_tp, inp_size)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/test/distributed/_tensor/parallel/test_tp_examples.py b/test/distributed/_tensor/parallel/test_tp_examples.py
index 74cd44dfd57d8..73cf9e05b223e 100644
--- a/test/distributed/_tensor/parallel/test_tp_examples.py
+++ b/test/distributed/_tensor/parallel/test_tp_examples.py
@@ -44,6 +44,7 @@ def forward(self, query, key, value):
         return self.attn(query, key, value)
 
 
+# TODO: replace repeated test code with _check_module
 class DistTensorParallelExampleTest(DTensorTestBase):
     @with_comms
     def test_mlp_megatron_e2e(self):
diff --git a/test/distributed/_tensor/parallel/test_tp_style.py b/test/distributed/_tensor/parallel/test_tp_style.py
index e52aef1a6f3f6..0562c6713da46 100644
--- a/test/distributed/_tensor/parallel/test_tp_style.py
+++ b/test/distributed/_tensor/parallel/test_tp_style.py
@@ -144,7 +144,8 @@ def _test_prepare_output_error(self, func):
         output = [dtensor]
         with self.assertRaisesRegex(
             AssertionError,
-            f"Expect output of Tensor Parallel to be a DTensor, but found {type(output)}.",
+            "Expect output of Tensor Parallel to be a DTensor, but found"
+            f" {type(output)}.",
         ):
             func(output, device_mesh)
         device_mesh = DeviceMesh(
@@ -189,7 +190,7 @@ def test_colwise_parallel_style(self):
         tensor = torch.rand(8, 16, device=self.device_type)
         cs = ColwiseParallel()
         self._1d_input_func_check(tensor, tensor, cs._prepare_input)
-        self.assertEqual(None, cs._prepare_output)
+        self.assertEqual(make_output_replicate_1d, cs._prepare_output)
 
 
 if __name__ == "__main__":
diff --git a/torch/distributed/_tensor/parallel/__init__.py b/torch/distributed/_tensor/parallel/__init__.py
index 3c72143f345f3..bf050d57d1697 100644
--- a/torch/distributed/_tensor/parallel/__init__.py
+++ b/torch/distributed/_tensor/parallel/__init__.py
@@ -5,11 +5,12 @@
 
 from torch.distributed._tensor.parallel.style import (
     ColwiseParallel,
-    ParallelStyle,
     PairwiseParallel,
+    ParallelStyle,
     RowwiseParallel,
     make_input_replicate_1d,
     make_input_shard_1d,
+    make_input_shard_1d_dim_last,
     make_output_replicate_1d,
     make_output_shard_1d,
     make_output_tensor,
@@ -21,12 +22,13 @@
 
 __all__ = [
     "ColwiseParallel",
-    "TensorParallelMultiheadAttention",
-    "ParallelStyle",
     "PairwiseParallel",
+    "ParallelStyle",
     "RowwiseParallel",
+    "TensorParallelMultiheadAttention",
     "make_input_replicate_1d",
     "make_input_shard_1d",
+    "make_input_shard_1d_dim_last",
     "make_output_replicate_1d",
     "make_output_tensor",
     "make_output_shard_1d",
diff --git a/torch/distributed/_tensor/parallel/api.py b/torch/distributed/_tensor/parallel/api.py
index a7a896ebf8598..a1c513078b95a 100644
--- a/torch/distributed/_tensor/parallel/api.py
+++ b/torch/distributed/_tensor/parallel/api.py
@@ -10,7 +10,12 @@
     DeviceMesh,
 )
 from torch.distributed._tensor.parallel import TensorParallelMultiheadAttention
-from torch.distributed._tensor.parallel.style import PairwiseParallel, ParallelStyle
+from torch.distributed._tensor.parallel.style import (
+    ColwiseParallel,
+    PairwiseParallel,
+    ParallelStyle,
+    RowwiseParallel,
+)
 from torch.distributed._tensor.parallel.utils import _create_1d_device_mesh
 
 
@@ -34,18 +39,18 @@ def parallelize_module(  # type: ignore[return]
     and users just need to specify the dimension where we perform tensor parallelism on.
 
     Args:
-        module (nn.Module):
-            :class:`nn.Module` object to be parallelized.
-        device_mesh (DeviceMesh):
-            :class:`DeviceMesh` object which describes the mesh topology
+        module (:class:`nn.Module`):
+            Module to be parallelized.
+        device_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
             of devices for the DTensor.
-        parallelize_plan (Union[ParallelStyle, Dict[str, ParallelStyle]]):
+        parallelize_plan (Union[:class:`ParallelStyle`, Dict[str, :class:`ParallelStyle`]]):
             The plan used to parallelize the module. It can be either a
             :class:`ParallelStyle` object which contains how
             we prepare input/output for Tensor Parallelism or it can be a
             dict of module FQN and its corresponding :class:`ParallelStyle` object.
         tp_mesh_dim (int):
-            the dimension of ``device_mesh`` where we perform
+            The dimension of ``device_mesh`` where we perform
             Tensor Parallelism on.
 
     Return:
@@ -69,6 +74,12 @@ def parallelize_module(  # type: ignore[return]
         device_mesh = _create_1d_device_mesh(device_mesh, tp_mesh_dim)
 
     if isinstance(parallelize_plan, ParallelStyle):
+        # RowwiseParallel or ColwiseParallel
+        if isinstance(parallelize_plan, ColwiseParallel) or isinstance(
+            parallelize_plan, RowwiseParallel
+        ):
+            return _parallelize_linear(module, device_mesh, parallelize_plan)
+        # PairwiseParallel
         if _is_mha_for_pairwise_parallel(module):
             return _parallelize_multihead_attn(module, device_mesh)
         elif _is_mlp_for_pairwise_parallel(module):
@@ -79,7 +90,6 @@ def parallelize_module(  # type: ignore[return]
                     n, parallelize_module(m, device_mesh, parallelize_plan)
                 )
             return module
-    # TODO: Add parallelize linear logic when https://github.com/pytorch/tau/pull/624/ merged.
     elif isinstance(parallelize_plan, dict):
         for module_path, parallelize_style in parallelize_plan.items():
             sub_module = module.get_submodule(module_path)
@@ -91,7 +101,8 @@ def parallelize_module(  # type: ignore[return]
             return module
     else:
         raise RuntimeError(  # pyre-ignore[7]
-            f"Expect Union[ParallelStyle, Dict[str, ParallelStyle]] for parallelize_plan, {type(parallelize_plan)} found!"
+            "Expect Union[ParallelStyle, Dict[str, ParallelStyle]] for"
+            f" parallelize_plan, {type(parallelize_plan)} found!"
         )
 
 
@@ -100,8 +111,8 @@ def _is_mha_for_pairwise_parallel(module: nn.Module) -> bool:
     Check whether the mha module is the one can be handled for Pairwise parallel.
 
     Args:
-        module (nn.Module):
-            :class:``nn.Module`` object to be checked.
+        module (:class:`nn.Module`):
+            Module to be checked.
 
     Return:
         A boolean object which specifies whether the module is MHA supported by Pairwise parallel or not.
@@ -117,11 +128,11 @@ def _is_mlp_for_pairwise_parallel(module: nn.Module) -> bool:
     number of Linear module. If the number is more than one, we return True.
 
     Args:
-        module (nn.Module):
-            :class:``nn.Module`` object to be traversed and counted.
+        module (:class:`nn.Module`):
+            Module to be traversed and counted.
 
     Return:
-        A boolean object which specifies whether the module is MLP or not.
+        A bool which specifies whether the module is MLP supported or not.
 
     .. warning::
         The traversal is not recursive for now.
@@ -138,17 +149,21 @@ def _rowwise_parallelize_linear_fn(
     device_mesh: DeviceMesh,
 ) -> None:
     """
-    This function parallelizes the input :class:``nn.Linear`` module in :class:``RowwiseParallel`` style.
+    This function parallelizes the input :class:`nn.Linear` module in
+    :class:`RowwiseParallel` style.
 
     Args:
-        name (str): name of the input module.
-        module (nn.Module): the :class:``nn.Linear`` object to be parallelized.
-        device_mesh (DeviceMesh): :class:``DeviceMesh`` object which describes the mesh topology
-            of devices for the DTensor.
-
-    Return:
+        name (str):
+            Name of the input module.
+        module (:class:`nn.Module`):
+            The :class:`nn.Linear` module to be parallelized.
+        device_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology of devices.
+
+    Returns:
         None
     """
+
     for name, param in module.named_parameters():
         dist_spec = (
             [Shard(1)] if name == "weight" else [Replicate()]  # type: ignore[list-item]
@@ -165,17 +180,21 @@ def _colwise_parallelize_linear_fn(
     device_mesh: DeviceMesh,
 ) -> None:
     """
-    This function parallelizes the input :class:``nn.Linear`` module in :class:``ColwiseParallel`` style.
+    This function parallelizes the input :class:`nn.Linear` module in
+    :class:`ColwiseParallel` style.
 
     Args:
-        name (str): name of the input module.
-        module (nn.Module): the :class:``nn.Linear`` object to be parallelized.
-        device_mesh (DeviceMesh): :class:``DeviceMesh`` object which describes the mesh topology
-            of devices for the DTensor.
-
-    Return:
+        name (str):
+            Name of the input module.
+        module (:class:`nn.Module`):
+            The :class:`nn.Linear` module to be parallelized.
+        device_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology of devices.
+
+    Returns:
         None
     """
+
     for name, param in module.named_parameters():
         dist_param = torch.nn.Parameter(
             distribute_tensor(param, device_mesh, [Shard(0)])
@@ -183,6 +202,77 @@ def _colwise_parallelize_linear_fn(
         module.register_parameter(name, dist_param)
 
 
+def _parallelize_linear(
+    module: nn.Module,
+    device_mesh: DeviceMesh,
+    parallel_style: ParallelStyle = ColwiseParallel(),
+    tp_mesh_dim: int = 0,
+) -> nn.Module:
+    """
+    This function requires that the input module be an object
+    of :class:`nn.Linear`.
+    The module will be parallelized over a 1-d :class:`DeviceMesh`
+    based on the :class:`ParallelStyle`.
+
+    Args:
+        module (:class:`nn.Module`):
+            The module to be parallelized.
+        device_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology of devices for the :class:`DTensor`.
+            If the mesh is more than 1-dimensional, we will use the mesh dim of
+            `device_mesh` specified by `tp_mesh_dim`.
+        parallel_style (:class:`ParallelStyle`, optional):
+            The object which describes how the :class:`nn.Linear` module
+            should be distributed over :class:`DeviceMesh` and how the input
+            and output should be prepared for Tensor Parallelism.
+            :class:`RowwiseStyle`: weight is sharded on dim 1 and bias is
+            replicate.
+            :class:`ColwiseStyle`: weight and bias are both sharded on dim 0.
+            Default: :class:`ColwiseParallel`
+        tp_mesh_dim (int):
+            The dimension of :class:`DeviceMesh` on which we
+            perform Tensor Parallelism.
+            Default: 0
+
+    Return:
+        A :class:`nn.Module` object parallelized.
+    """
+
+    if not isinstance(module, nn.Linear):
+        raise RuntimeError(
+            f"Expect a torch.nn.Linear module but received {type(module)}!"
+        )
+
+    if not isinstance(parallel_style, ParallelStyle):
+        raise RuntimeError(
+            "Expect a ParallelStyle object but received"
+            f" {type(parallel_style)}!"
+        )
+
+    if device_mesh.ndim > 1:
+        device_mesh = _create_1d_device_mesh(device_mesh, tp_mesh_dim)
+
+    if isinstance(parallel_style, RowwiseParallel):
+        distribute_module(
+            module,
+            device_mesh,
+            _rowwise_parallelize_linear_fn,
+            input_fn=parallel_style._prepare_input,  # type: ignore[arg-type, misc] # pyre-ignore[6]
+            output_fn=parallel_style._prepare_output,  # type: ignore[arg-type, misc] # pyre-ignore[6]
+        )
+    elif isinstance(parallel_style, ColwiseParallel):
+        distribute_module(
+            module,
+            device_mesh,
+            _colwise_parallelize_linear_fn,
+            input_fn=parallel_style._prepare_input,  # type: ignore[arg-type, misc] # pyre-ignore[6]
+            output_fn=parallel_style._prepare_output,  # type: ignore[arg-type, misc] # pyre-ignore[6]
+        )
+    else:
+        raise RuntimeError(f"{type(parallel_style)} is not supported!")
+    return module
+
+
 def _parallelize_multihead_attn(
     module: nn.Module,
     device_mesh: DeviceMesh,
@@ -196,20 +286,19 @@ def _parallelize_multihead_attn(
     in place.
 
     Args:
-        module (nn.Module):
-            :class:``nn.Module`` object to be parallelized.
-        device_mesh (DeviceMesh):
-            :class:``DeviceMesh`` object which describes the mesh topology
-            of devices for the DTensor.
-        parallel_style (ParallelStyle):
-            :class:``ParallelStyle`` object which contains how
-            we prepare input/output for Tensor Parallelism.
+        module (:class:`nn.Module`):
+            Module to be parallelized.
+        device_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology of devices.
+        parallel_style (:class:`ParallelStyle`):
+            Object which contains how we prepare input/output
+            for Tensor Parallelism.
         tp_mesh_dim (int):
-            the dimension of ``device_mesh`` where we perform
+            The dimension of `device_mesh` where we perform
             Tensor Parallelism on.
 
     Return:
-        A :class:``nn.Module`` object parallelized.
+        A :class:`nn.Module` object parallelized.
 
     .. warning::
         We only support ``PairwiseParallel`` right now.
@@ -217,7 +306,8 @@ def _parallelize_multihead_attn(
 
     if not isinstance(parallel_style, PairwiseParallel):
         raise NotImplementedError(
-            "Only support PairwiseParallel for Multihead Attention parallelization."
+            "Only support PairwiseParallel for Multihead Attention"
+            " parallelization."
         )
 
     if device_mesh.ndim > 1:
@@ -268,20 +358,19 @@ def _parallelize_mlp(
     in place.
 
     Args:
-        module (nn.Module):
-            :class:``nn.Module`` object to be parallelized.
-        device_mesh (DeviceMesh):
-            :class:``DeviceMesh`` object which describes the mesh topology
-            of devices for the DTensor.
-        parallel_style (ParallelStyle):
-            :class:``ParallelStyle`` object which contains how
-            we prepare input/output for Tensor Parallelism.
+        module (:class:`nn.Module`):
+            Module to be parallelized.
+        device_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology of devices.
+        parallel_style (:class:`ParallelStyle`):
+            Object which contains how we prepare input/output
+            for Tensor Parallelism.
         tp_mesh_dim (int):
-            the dimension of ``device_mesh`` where we perform
+            The dimension of `device_mesh` where we perform
             Tensor Parallelism on.
 
     Return:
-        A :class:``nn.Module`` object parallelized.
+        A :class:`nn.Module` object parallelized.
 
     .. warning::
         We only support ``PairwiseParallel`` right now.
diff --git a/torch/distributed/_tensor/parallel/fsdp.py b/torch/distributed/_tensor/parallel/fsdp.py
index 1f1123c517756..55b34f10d7437 100644
--- a/torch/distributed/_tensor/parallel/fsdp.py
+++ b/torch/distributed/_tensor/parallel/fsdp.py
@@ -111,7 +111,9 @@ def _create_sharded_tensor_md_from_dt(
             ShardMetadata(
                 shard_offsets=list(offsets),
                 shard_sizes=list(sizes),
-                placement=f"rank:{scapegoat_rank if i > 0 else my_rank}/{dt._local_tensor.device}",
+                placement=(
+                    f"rank:{scapegoat_rank if i > 0 else my_rank}/{dt._local_tensor.device}"
+                ),
             )
         )
 
diff --git a/torch/distributed/_tensor/parallel/style.py b/torch/distributed/_tensor/parallel/style.py
index 60b6a1c88dfd3..e414cb0dc09d5 100644
--- a/torch/distributed/_tensor/parallel/style.py
+++ b/torch/distributed/_tensor/parallel/style.py
@@ -45,21 +45,21 @@ def __init__(self) -> None:
 class RowwiseParallel(ParallelStyle):
     """
     Partitioning the row of a module.
-    We assume the input to be a sharded :class:``DTensor`` and output to be a replicated :class:``DTensor``.
+    We assume the input to be a sharded :class:`DTensor` and output to be a replicated :class:`DTensor`.
     """
 
     def __init__(self) -> None:
-        super().__init__(make_input_shard_1d, make_output_replicate_1d)
+        super().__init__(make_input_shard_1d_dim_last, make_output_replicate_1d)
 
 
 class ColwiseParallel(ParallelStyle):
     """
     Partitioning the column of a tensor or module.
-    We assume the input to be a replicated :class:``DTensor`` and output to be a sharded :class:``DTensor``.
+    We assume the input to be a replicated :class:`DTensor` and output to be a sharded :class:`DTensor`.
     """
 
     def __init__(self) -> None:
-        super().__init__(make_input_replicate_1d, None)
+        super().__init__(make_input_replicate_1d, make_output_replicate_1d)
 
 
 @_prepare_input_validate  # type: ignore[arg-type] # pyre-ignore[56]
@@ -72,20 +72,20 @@ def make_input_shard_1d(
     Shard input tensor on ``dim`` over an 1-D device mesh. This function will be used in ParallelStyle.
 
     Args:
-        input (Union[Tensor, DTensor]):
-            This single tensor will be sharded on dimension ``dim``
-            over the 1-D :class:``DeviceMesh``.
-        device_mesh (DeviceMesh, optional):
+        input (Union[:class:`torch.Tensor`, :class:`DTensor`]):
+            Single tensor will be sharded on dimension ``dim``
+            over the 1-D :class:`DeviceMesh`.
+        device_mesh (:class:`DeviceMesh`, optional):
             The 1-D device mesh where ``input`` will be sharded.
-            If no :class:``DeviceMesh`` is passed and ``input`` is a :class:``DTensor``,
+            If no :class:`DeviceMesh` is passed and ``input`` is a :class:`DTensor`,
             `input.device_mesh` will be used.
-            If :class:``DeviceMesh`` is not 1-D, an exception will be thrown.
+            If :class:`DeviceMesh` is not 1-D, an exception will be thrown.
             Default: ``None``
         dim (int, optional): The sharding dimension of ``input`` tensor.
             Default: 0
 
     Returns:
-        A :class:``DTensor`` sharded on dimension ``dim`` over ``device_mesh``.
+        A :class:`DTensor` sharded on dimension ``dim`` over ``device_mesh``.
     """
     shard_spec = [Shard(dim)]
     if isinstance(input, DTensor):
@@ -96,10 +96,35 @@ def make_input_shard_1d(
         )
     else:
         raise RuntimeError(
-            f"Tensor parallel module expects torch.Tensor or DTensor input but received {type(input)}!"
+            "Tensor parallel module expects torch.Tensor or DTensor input but"
+            f" received {type(input)}!"
         )
 
 
+def make_input_shard_1d_dim_last(
+    input: Union[torch.Tensor, DTensor],
+    device_mesh: Optional[DeviceMesh] = None,
+) -> DTensor:
+    """
+    Wrapper func of ``make_input_shard_1d`` with ``dim`` = -1.
+
+    Args:
+        input (Union[:class:`torch.Tensor`, :class:`DTensor`]):
+            This single tensor will be sharded on dimension ``dim``
+            over the 1-D :class:`DeviceMesh`.
+        device_mesh (:class:`DeviceMesh`, optional):
+            The 1-D device mesh where ``input`` will be sharded.
+            If no :class:`DeviceMesh` is passed and ``input`` is a :class:`DTensor`,
+            `input.device_mesh` will be used.
+            If :class:`DeviceMesh` is not 1-D, an exception will be thrown.
+            Default: ``None``
+
+    Returns:
+        A :class:`DTensor` sharded on dimension ``dim`` over ``device_mesh``.
+    """
+    return make_input_shard_1d(input, device_mesh, dim=-1)  # type: ignore[call-arg]
+
+
 @_prepare_input_validate  # type: ignore[arg-type] # pyre-ignore[56]
 def make_input_replicate_1d(
     input: Union[torch.Tensor, DTensor],
@@ -109,17 +134,17 @@ def make_input_replicate_1d(
     Replicate input tensor over an 1-D device mesh. This function will be used in ParallelStyle.
 
     Args:
-        input (Union[Tensor, DTensor]):
-            This single tensor will be replicated over the 1-D :class:``DeviceMesh``.
-        device_mesh (DeviceMesh, optional):
+        input (Union[:class:`torch.Tensor`, :class:`DTensor`]):
+            This input tensor will be replicated over the 1-D :class:`DeviceMesh`.
+        device_mesh (:class:`DeviceMesh`, optional):
             The 1-D device mesh where ``input`` will be replicated.
-            If no :class:``DeviceMesh`` is passed and ``input`` is a :class:``DTensor``,
+            If no :class:`DeviceMesh` is passed and ``input`` is a :class:`DTensor`,
             ``input.device_mesh`` will be used.
-            If :class:``DeviceMesh`` is not 1-D, an exception will be thrown.
+            If :class:`DeviceMesh` is not 1-D, an exception will be thrown.
             Default: ``None``
 
     Returns:
-        A :class:``DTensor`` replicated over ``device_mesh``.
+        A :class:`DTensor` replicated over ``device_mesh``.
     """
     replicate = [Replicate()]
     if isinstance(input, DTensor):
@@ -130,7 +155,8 @@ def make_input_replicate_1d(
         )
     else:
         raise RuntimeError(
-            f"Tensor parallel module expects torch.Tensor or DTensor input but received {type(input)}!"
+            "Tensor parallel module expects torch.Tensor or DTensor input but"
+            f" received {type(input)}!"
         )
 
 
@@ -140,16 +166,19 @@ def make_output_shard_1d(
 ) -> DTensor:
     """
     Convert Output DTensor to a sharded DTensor. This will be used in ParallelStyle.
+
     Args:
-        output (DTensor): output of module to be converted.
-        device_mesh (Optional[DeviceMesh]): :class:``DeviceMesh`` object needed to
-            shard the output and it needs to be a 1D ``device_mesh`` and we will throw
-            exceptions if a non-1D ``device_mesh`` is passed in. If no ``device_mesh``
-            is passed in, we will reuse the one from output.
+        output (:class:`DTensor`):
+            Output of module to be converted.
+        device_mesh (:class:`DeviceMesh`, optional):
+            Object needed to shard the output and it needs to be a 1D ``device_mesh``
+            and we will throw exceptions if a non-1D ``device_mesh`` is passed in.
+            If no ``device_mesh`` is passed in, we will reuse the one from output.
             Default: ``None``
         dim (int): Sharding dim for output. Default: 0
+
     Return:
-        A :class:``DTensor`` object sharded on the given dim.
+        A :class:`DTensor` object sharded on the given dim.
     """
 
     return output.redistribute(device_mesh, [Shard(dim)])
@@ -161,15 +190,18 @@ def make_output_replicate_1d(
 ) -> DTensor:
     """
     Convert Output DTensor to a replicated DTensor. This will be used in ParallelStyle.
+
     Args:
-        output (DTensor): output of module to be converted.
-        device_mesh (Optional[DeviceMesh]): :class:``DeviceMesh`` object needed to
-            replicate the output and it needs to be a 1D ``device_mesh`` and we will
-            throw exceptions if a non-1D ``device_mesh`` is passed in. If no
-            ``device_mesh`` is passed in, we will reuse the one from output.
+        output (:class:`DTensor`):
+            Output of module to be converted.
+        device_mesh (:class:`DeviceMesh`, optional):
+            Object needed to replicate the output and it needs to be a 1D ``device_mesh``
+            and we will throw exceptions if a non-1D ``device_mesh`` is passed in.
+            If no ``device_mesh`` is passed in, we will reuse the one from output.
             Default: ``None``
+
     Return:
-        A :class:``DTensor`` object made replicate.
+        A :class:`DTensor` object made replicate.
     """
 
     return output.redistribute(device_mesh, [Replicate()])
@@ -181,15 +213,19 @@ def make_output_tensor(
 ) -> torch.Tensor:
     """
     Convert Output DTensor to a replicated DTensor first and then convert it to Tensor.
+
     Args:
-        output (DTensor): output of module to be converted.
-        device_mesh (Optional[DeviceMesh]): :class:``DeviceMesh`` object needed to
-            replicate the output and it needs to be a 1D ``device_mesh`` and we will
-            throw exceptions if a non-1D ``device_mesh`` is passed in. If no
-            ``device_mesh`` is passed in, we will reuse the one from output.
+        output (:class:`DTensor`):
+            Output of module to be converted.
+        device_mesh (:class:`DeviceMesh`, optional):
+            Object which is needed to replicate the output and it needs to be
+            a 1D ``device_mesh`` and we will throw exceptions if a non-1D
+            ``device_mesh`` is passed in. If no ``device_mesh`` is passed in,
+            we will reuse the one from output.
             Default: ``None``
+
     Return:
-        A :class:``torch.Tensor`` object converted from output DTensor.
+        A :class:`torch.Tensor` object converted from output DTensor.
     """
 
     return make_output_replicate_1d(  # type: ignore[attr-defined]
diff --git a/torch/distributed/_tensor/parallel/utils.py b/torch/distributed/_tensor/parallel/utils.py
index c4cca5c88eda1..c63fe638b351b 100644
--- a/torch/distributed/_tensor/parallel/utils.py
+++ b/torch/distributed/_tensor/parallel/utils.py
@@ -58,7 +58,8 @@ def wrapper(*args, **kwargs):  # pyre-ignore[2, 3]
                 )
         if device_mesh.ndim != 1:
             raise RuntimeError(
-                f"device_mesh has dims {device_mesh.ndim} but expcted to be 1 for input."
+                f"device_mesh has dims {device_mesh.ndim} but expcted to be 1"
+                " for input."
             )
         return _prepare_input_func(*args, **kwargs)
 
@@ -91,18 +92,20 @@ def _prepare_output_validate(
     def wrapper(*args, **kwargs):  # pyre-ignore[2, 3]
         assert len(args) >= 1, "_prepare_output needs at least one arg."
         output = args[0]
-        assert isinstance(
-            output, DTensor
-        ), f"Expect output of Tensor Parallel to be a DTensor, but found {type(output)}."
+        assert isinstance(output, DTensor), (
+            "Expect output of Tensor Parallel to be a DTensor, but found"
+            f" {type(output)}."
+        )
         if len(args) < 2 or args[1] is None:
             device_mesh = output.device_mesh
             args = (*args[:1], device_mesh, *args[2:])  # pyre-ignore[60]
         else:
             device_mesh = args[1]
 
-        assert (
-            device_mesh.ndim == 1
-        ), f"device_mesh has dims {device_mesh.ndim} but expcted to be 1 for output."
+        assert device_mesh.ndim == 1, (
+            f"device_mesh has dims {device_mesh.ndim} but expcted to be 1 for"
+            " output."
+        )
         return _prepare_output_func(*args, **kwargs)
 
     return wrapper
@@ -130,8 +133,8 @@ def _create_1d_device_mesh(
     assert (
         tp_mesh_dim < device_mesh.ndim and tp_mesh_dim >= -device_mesh.ndim
     ), (
-        f"Expect tp_mesh_dim within range [{-device_mesh.ndim}, {device_mesh.ndim})"
-        f", but found {tp_mesh_dim}."
+        f"Expect tp_mesh_dim within range [{-device_mesh.ndim},"
+        f" {device_mesh.ndim}), but found {tp_mesh_dim}."
     )
 
     if device_mesh.ndim == 1:

From 3fc64817d66f4783fc71d3b79354ca1522ecd191 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 23 Nov 2022 16:33:13 +0000
Subject: [PATCH 1206/1922] Add `torch._dynamo` to docs (#89510)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89510
Approved by: https://github.com/msaroufim
---
 docs/source/_dynamo.rst     | 13 +++++++++++++
 docs/source/index.rst       |  1 +
 torch/_dynamo/__init__.py   | 16 +++++++++++-----
 torch/_dynamo/eval_frame.py |  2 +-
 4 files changed, 26 insertions(+), 6 deletions(-)
 create mode 100644 docs/source/_dynamo.rst

diff --git a/docs/source/_dynamo.rst b/docs/source/_dynamo.rst
new file mode 100644
index 0000000000000..5e16dcf52ddee
--- /dev/null
+++ b/docs/source/_dynamo.rst
@@ -0,0 +1,13 @@
+.. _torch_dynamo:
+
+torch._dynamo
+--------------------------
+
+.. warning ::
+     This module is an early prototype and is subject to change.
+
+.. currentmodule:: torch._dynamo
+
+.. automodule:: torch._dynamo
+    :members:
+    :member-order: bysource
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 20214466328a7..e4b6a124d6bdc 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -72,6 +72,7 @@ Features described in this documentation are classified by release status:
    torch.distributed.optim <distributed.optim>
    torch.distributed.checkpoint <distributed.checkpoint>
    torch.distributions <distributions>
+   torch._dynamo <_dynamo>
    torch.fft <fft>
    futures
    fx
diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py
index 5eee609b0852a..57df92d75f4fa 100644
--- a/torch/_dynamo/__init__.py
+++ b/torch/_dynamo/__init__.py
@@ -15,7 +15,10 @@
 from .utils import compilation_metrics, guard_failures, orig_code_map
 
 __all__ = [
+    "allow_in_graph",
     "assume_constant_result",
+    "disallow_in_graph",
+    "graph_break",
     "optimize",
     "optimize_assert",
     "export",
@@ -47,7 +50,8 @@ def reset():
 
 def list_backends():
     """
-    Return valid strings that can be passed to:
+    Return valid strings that can be passed to::
+
         @torch._dynamo.optimize(<backend>)
         def foo(...):
            ....
@@ -60,7 +64,8 @@ def foo(...):
 def allow_in_graph(fn):
     """
     Customize which functions TorchDynamo will include in the generated
-    graph.  Similar to torch.fx.wrap().
+    graph. Similar to `torch.fx.wrap()`.
+    ::
 
         torch._dynamo.allow_in_graph(my_custom_function)
 
@@ -73,7 +78,7 @@ def fn(a):
 
         fn(...)
 
-    Will capture a single graph containing my_custom_function().
+    Will capture a single graph containing `my_custom_function()`.
     """
     if isinstance(fn, (list, tuple)):
         return [allow_in_graph(x) for x in fn]
@@ -87,6 +92,7 @@ def disallow_in_graph(fn):
     """
     Customize which functions TorchDynamo will exclude in the generated
     graph and force a graph break on.
+    ::
 
         torch._dynamo.disallow_in_graph(torch.sub)
 
@@ -99,8 +105,8 @@ def fn(a):
 
         fn(...)
 
-    Will break the graph on torch.sub, and give two graphs each with a
-    single torch.add() op.
+    Will break the graph on `torch.sub`, and give two graphs each with a
+    single `torch.add()` op.
     """
     if isinstance(fn, (list, tuple)):
         return [disallow_in_graph(x) for x in fn]
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 65e8af4883ab3..38c291c441feb 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -423,7 +423,7 @@ def optimize(
         disable: If True, turn this decorator into a no-op
         dynamic: If True, turn on dynamic shapes support
 
-    Example Usage:
+    Example Usage::
 
         @torch._dynamo.optimize()
         def toy_example(a, b):

From 0ce0d23dc4ecec926b52dc0b2d4a3c69c4befd6c Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Wed, 23 Nov 2022 16:51:42 +0000
Subject: [PATCH 1207/1922] Deprecate decorating classes with torch.no_grad and
 similar (#89522)

Fixes https://github.com/pytorch/pytorch/issues/89450

I would have completely removed it but I don't think this is particularly urgent and there are some use of it in the wild: https://github.com/search?q=%2Ftorch%5C.no_grad%5C%28%5C%29%5Cnclass%2F&type=code
So we might as well take one release to do it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89522
Approved by: https://github.com/lezcano, https://github.com/soulitzer, https://github.com/janeyx99
---
 test/test_autograd.py       | 24 ++++++++++++++++++++++++
 torch/autograd/grad_mode.py |  7 +++++++
 2 files changed, 31 insertions(+)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 6e26f67f6dc34..777b790da6559 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -63,6 +63,30 @@ def graph_desc(fn):
 
 class TestAutograd(TestCase):
 
+    def test_grad_mode_class_decoration(self):
+        # Decorating class is deprecated and should not be used
+        with self.assertWarnsRegex(UserWarning, "Decorating classes is deprecated"):
+            @torch.no_grad()
+            class Foo():
+                pass
+
+        # Decorating functions or methods is fine though
+        with warnings.catch_warnings(record=True) as w:
+            @torch.no_grad()
+            def foo():
+                pass
+
+            class Foo2():
+                @torch.no_grad()
+                def __init__(self):
+                    pass
+
+                @torch.no_grad()
+                def foo(self):
+                    pass
+
+        self.assertEqual(len(w), 0)
+
     def test_tensor_grad_warnings(self):
         dummy = torch.empty(1)
 
diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py
index 94ca204fc9ab4..e5e410eeb42ee 100644
--- a/torch/autograd/grad_mode.py
+++ b/torch/autograd/grad_mode.py
@@ -2,6 +2,7 @@
 import torch
 import functools
 import inspect
+import warnings
 from typing import Any, Callable, TypeVar, cast
 
 __all__ = ['no_grad', 'enable_grad', 'set_grad_enabled',
@@ -18,6 +19,12 @@ class _DecoratorContextManager:
     """Allow a context manager to be used as a decorator"""
 
     def __call__(self, func: F) -> F:
+        if inspect.isclass(func):
+            warnings.warn("Decorating classes is deprecated and will be disabled in "
+                          "future versions. You should only decorate functions or methods. "
+                          "To preserve the current behavior of class decoration, you can "
+                          "directly decorate the `__init__` method and nothing else.")
+
         if inspect.isgeneratorfunction(func):
             return self._wrap_generator(func)
 

From 14063115338f2c9481edbdfeaf4c635634422a6e Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Wed, 23 Nov 2022 17:27:40 +0000
Subject: [PATCH 1208/1922] [test_nn] split parametrization test from test_nn
 (#89552)

Ref: https://github.com/pytorch/pytorch/issues/63085

Note: Doesn't need corresponding XLA PR as the migrated tests were not run on XLA (as they weren't in TestNNDeviceType).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89552
Approved by: https://github.com/albanD
---
 test/nn/test_parametrization.py | 1525 +++++++++++++++++++++++++++++++
 test/test_nn.py                 | 1502 ------------------------------
 2 files changed, 1525 insertions(+), 1502 deletions(-)
 create mode 100644 test/nn/test_parametrization.py

diff --git a/test/nn/test_parametrization.py b/test/nn/test_parametrization.py
new file mode 100644
index 0000000000000..0ba361d310d3f
--- /dev/null
+++ b/test/nn/test_parametrization.py
@@ -0,0 +1,1525 @@
+# Owner(s): ["module: nn"]
+from copy import deepcopy
+from itertools import product
+
+import torch
+
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.nn.init as init
+import torch.nn.utils.parametrize as parametrize
+from torch.nn import Parameter
+from torch.testing._internal.common_utils import run_tests, skipIfNoLapack, \
+    TemporaryFileName, instantiate_parametrized_tests, set_default_dtype
+from torch.testing._internal.common_cuda import TEST_MULTIGPU
+from torch.testing._internal.common_nn import NNTestCase
+from torch.testing._internal.common_utils import gradcheck
+
+
+class TestNNParametrization(NNTestCase):
+    _do_cuda_memory_leak_check = True
+    _do_cuda_non_default_stream = True
+
+    # FIXME: Rewrite this test using functions not depending on LAPACK
+    #        and remove the `@skipIfNoLapack` (see #70995)
+    # torch/nn/utils/parametrize
+    @skipIfNoLapack
+    def test_register_and_remove_parametrization(self):
+        r"""Test that it is possible to add a few parametrizations
+        on a parameter or a buffer and that removing them restores the initial state
+        It also tests that backpropagating through them works as expected
+        """
+        # Define a couple matrix parametrizations
+        class Skew(nn.Module):
+            def forward(self, X):
+                X = X.tril(-1)
+                return X - X.T
+
+        class Orthogonal(nn.Module):
+            def forward(self, X):
+                # Cayley map
+                # If X is skew-symmetric it returns an orthogonal matrix
+                Id = torch.eye(X.size(0), device=X.device)
+                # We call contiguous because solve returns a tensor with strides that are Fortran-contiguous
+                # and autograd raises a performance warning.
+                # This happens when we remove the parametrization with leave_parametrized=True,
+                # which does a set_ with a non-contiguous tensor while the gradient is contiguous
+                return torch.linalg.solve(Id + X, Id - X).contiguous()
+
+        class Resize(nn.Module):
+            def forward(self, X):
+                return X[[0]]
+
+        class NoResize(nn.Module):
+            def forward(self, X):
+                return X
+
+        # Define a couple vector parametrizations
+        class FirstZero(nn.Module):
+            def forward(self, x):
+                return torch.cat([x.new_zeros(1), x[1:]])
+
+        class LastZero(nn.Module):
+            def forward(self, x):
+                return torch.cat([x[:-1], x.new_zeros(1)])
+
+        model = nn.Linear(8, 8)
+        initial_weight_id = id(model.weight)
+        initial_bias_id = id(model.bias)
+        initial_model = deepcopy(model)
+
+        # Test unsafe flag
+        with self.assertRaisesRegex(ValueError, "Registering a parametrization may not change the shape of the tensor"):
+            parametrize.register_parametrization(model, "weight", Resize())  # default unsafe = False
+            model(torch.ones(8, 8))
+
+        # One parametrization with unsafe=True
+        parametrize.register_parametrization(model, "weight", Resize(), unsafe=True)
+        self.assertTrue(hasattr(model, "parametrizations"))
+        self.assertTrue(parametrize.is_parametrized(model))
+        self.assertTrue(parametrize.is_parametrized(model, "weight"))
+        self.assertFalse(parametrize.is_parametrized(model, "bias"))
+        self.assertNotIn("weight", model._parameters)
+        A = model.weight
+        self.assertTrue(A.shape[0] == 1)
+        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
+        self.assertFalse(hasattr(model, "parametrizations"))
+        self.assertEqual(model.weight, initial_model.weight)
+        self.assertEqual(id(model.weight), initial_weight_id)
+        self.assertEqual(model.__class__, nn.Linear)
+
+        # Two parametrizations with unsafe=True
+        parametrize.register_parametrization(model, "weight", Resize(), unsafe=True)
+        parametrize.register_parametrization(model, "weight", NoResize(), unsafe=False)
+        self.assertTrue(hasattr(model, "parametrizations"))
+        self.assertTrue(parametrize.is_parametrized(model))
+        self.assertTrue(parametrize.is_parametrized(model, "weight"))
+        self.assertFalse(parametrize.is_parametrized(model, "bias"))
+        self.assertNotIn("weight", model._parameters)
+        A = model.weight
+        self.assertTrue(A.shape[0] == 1)
+        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
+        self.assertFalse(hasattr(model, "parametrizations"))
+        self.assertEqual(model.weight, initial_model.weight)
+        self.assertEqual(id(model.weight), initial_weight_id)
+        self.assertEqual(model.__class__, nn.Linear)
+
+        # Test unsafe flag doesn't change expected behavior
+        parametrize.register_parametrization(model, "weight", Skew(), unsafe=True)
+        self.assertTrue(hasattr(model, "parametrizations"))
+        self.assertTrue(parametrize.is_parametrized(model))
+        self.assertTrue(parametrize.is_parametrized(model, "weight"))
+        self.assertFalse(parametrize.is_parametrized(model, "bias"))
+        self.assertNotIn("weight", model._parameters)
+        # Result should be skew-symmetric
+        A = model.weight
+        self.assertEqual(A, -A.T)
+        # Remove and check consistency
+        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
+        self.assertFalse(hasattr(model, "parametrizations"))
+        self.assertEqual(model.weight, initial_model.weight)
+        self.assertEqual(id(model.weight), initial_weight_id)
+        self.assertEqual(model.__class__, nn.Linear)
+
+        # Test one parametrization
+        parametrize.register_parametrization(model, "weight", Skew())
+        self.assertTrue(hasattr(model, "parametrizations"))
+        self.assertTrue(parametrize.is_parametrized(model))
+        self.assertTrue(parametrize.is_parametrized(model, "weight"))
+        self.assertFalse(parametrize.is_parametrized(model, "bias"))
+        self.assertNotIn("weight", model._parameters)
+        # Result should be skew-symmetric
+        A = model.weight
+        self.assertEqual(A, -A.T)
+        # Remove and check consistency
+        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
+        self.assertFalse(hasattr(model, "parametrizations"))
+        self.assertEqual(model.weight, initial_model.weight)
+        self.assertEqual(id(model.weight), initial_weight_id)
+        self.assertEqual(model.__class__, nn.Linear)
+
+        # Test two parametrizations at the same time and removing them
+        parametrize.register_parametrization(model, "weight", Skew())
+        parametrize.register_parametrization(model, "weight", Orthogonal())
+        # Result should be orthogonal
+        X = model.weight
+        Id = torch.eye(X.size(0), device=X.device)
+        self.assertEqual(X.T @ X, Id)
+        # Structure tests
+        self.assertTrue(hasattr(model, "parametrizations"))
+        self.assertTrue(parametrize.is_parametrized(model))
+        self.assertTrue(parametrize.is_parametrized(model, "weight"))
+        self.assertFalse(parametrize.is_parametrized(model, "bias"))
+        self.assertIn("weight", model.parametrizations)
+        self.assertNotIn("weight", model._parameters)
+        # Remove
+        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
+        self.assertEqual(model.weight, initial_model.weight)
+        self.assertEqual(id(model.weight), initial_weight_id)
+        self.assertFalse(hasattr(model, "parametrizations"))
+        self.assertEqual(model.__class__, nn.Linear)
+
+        # Add everything
+        parametrize.register_parametrization(model, "weight", Skew())
+        parametrize.register_parametrization(model, "weight", Orthogonal())
+        parametrize.register_parametrization(model, "bias", FirstZero())
+        parametrize.register_parametrization(model, "bias", LastZero())
+
+        # Basic tests
+        self.assertTrue(parametrize.is_parametrized(model))
+        self.assertTrue(parametrize.is_parametrized(model, "weight"))
+        self.assertTrue(parametrize.is_parametrized(model, "bias"))
+        self.assertEqual(model.bias[0].item(), 0.)
+        self.assertEqual(model.bias[-1].item(), 0.)
+        self.assertEqual(len(list(model.parameters())), 2)  # Nothing weird has happpened
+        # Should not throw
+
+        sgd = torch.optim.SGD(model.parameters(), lr=0.01)
+
+        weight_copy = model.weight.clone()
+        bias_copy = model.bias.clone()
+        sgd.zero_grad()
+        (model.weight.T @ model.bias).sum().backward()
+        sgd.step()
+        self.assertNotEqual(model.weight, weight_copy)
+        self.assertNotEqual(model.bias, bias_copy)
+
+        # Remove first parametrization.
+        # Check that the model is still parametrized and so is the second parameter
+        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
+        self.assertTrue(parametrize.is_parametrized(model))             # Still parametrized
+        self.assertFalse(parametrize.is_parametrized(model, "weight"))  # Parametrization removed
+        self.assertTrue(parametrize.is_parametrized(model, "bias"))     # Still parametrized
+        self.assertEqual(model.bias[0].item(), 0.)                      # Still parametrized
+        self.assertEqual(model.bias[-1].item(), 0.)                     # Still parametrized
+        self.assertNotEqual(model.weight, initial_model.weight)         # Has been updated
+        self.assertEqual(id(model.weight), initial_weight_id)           # Keeps the same id
+        self.assertEqual(len(list(model.parameters())), 2)              # Nothing weird has happened
+        # Should not throw
+        weight_copy = model.weight.clone()
+        bias_copy = model.bias.clone()
+        sgd.zero_grad()
+        (model.weight.T @ model.bias).sum().backward()
+        sgd.step()
+        self.assertNotEqual(model.weight, weight_copy)
+        self.assertNotEqual(model.bias, bias_copy)
+
+        # Remove the second parametrization.
+        # Check that the module is not parametrized
+        parametrize.remove_parametrizations(model, "bias", leave_parametrized=False)
+        self.assertFalse(parametrize.is_parametrized(model))  # Not parametrized
+        self.assertNotEqual(model.bias, initial_model.bias)   # Has been updated
+        self.assertNotEqual(model.bias[0].item(), 0.)         # Not parametrized
+        self.assertNotEqual(model.bias[-1].item(), 0.)        # Not parametrized
+        self.assertEqual(id(model.bias), initial_bias_id)     # Keeps the same id
+        self.assertFalse(hasattr(model, "parametrizations"))  # Not parametrized the module
+        self.assertEqual(model.__class__, nn.Linear)          # Resores the previous class
+        self.assertEqual(len(list(model.parameters())), 2)    # Nothing weird has happeed
+
+        # Should not throw things are updated
+        weight_copy = model.weight.clone()
+        bias_copy = model.bias.clone()
+        sgd.zero_grad()
+        (model.weight.T @ model.bias).sum().backward()
+        sgd.step()
+        self.assertNotEqual(model.weight, weight_copy)
+        self.assertNotEqual(model.bias, bias_copy)
+
+        # Test leave_parametrized=True
+        for _ in range(2):
+            parametrize.register_parametrization(model, "weight", Skew())
+            parametrize.register_parametrization(model, "weight", Orthogonal())
+            parametrize.remove_parametrizations(model, "weight", leave_parametrized=True)
+            # We didn't change the dtype nor had multiple inputs, so the id should be the same
+            self.assertEqual(id(model.weight), initial_weight_id)
+            self.assertEqual(id(model.bias), initial_bias_id)
+
+            # Should not throw. Things are updated
+            weight_copy = model.weight.clone()
+            bias_copy = model.bias.clone()
+            sgd.zero_grad()
+            (model.weight.T @ model.bias).sum().backward()
+            sgd.step()
+            self.assertNotEqual(model.weight, weight_copy)
+            self.assertNotEqual(model.bias, bias_copy)
+
+    def test_register_and_remove_nested_parametrization(self):
+        r"""Test that it is possible to nest the parametrizations
+        meaning that the original param is parametrized again
+        """
+        class Skew(nn.Module):
+            def forward(self, X):
+                X = X.tril(-1)
+                return X - X.T
+
+        model = nn.Linear(8, 8)
+        # Add top level parametrization
+        parametrize.register_parametrization(model, "weight", Skew())
+        self.assertTrue(hasattr(model, "parametrizations"))
+        self.assertTrue(parametrize.is_parametrized(model))
+        self.assertTrue(parametrize.is_parametrized(model, "weight"))
+        self.assertFalse(parametrize.is_parametrized(model, "bias"))
+        self.assertNotIn("weight", model._parameters)
+        # Result should be skew-symmetric
+        A = model.weight
+        self.assertEqual(A, -A.T)
+
+        # Add nested parametrization
+        param_mod = model.parametrizations.weight
+        self.assertFalse(hasattr(param_mod, "parametrizations"))
+        self.assertFalse(parametrize.is_parametrized(param_mod))
+        self.assertFalse(parametrize.is_parametrized(param_mod, "original"))
+
+        parametrize.register_parametrization(param_mod, "original", Skew())
+        self.assertTrue(hasattr(param_mod, "parametrizations"))
+        self.assertTrue(parametrize.is_parametrized(param_mod))
+        self.assertTrue(parametrize.is_parametrized(param_mod, "original"))
+        self.assertNotIn("original", param_mod._parameters)
+        # Result should be skew-symmetric
+        A = param_mod.original
+        self.assertEqual(A, -A.T)
+
+        # Remove nested param and check consistency
+        parametrize.remove_parametrizations(param_mod, "original", leave_parametrized=False)
+        self.assertFalse(hasattr(param_mod, "parametrizations"))
+        self.assertEqual(param_mod.__class__, parametrize.ParametrizationList)
+
+        # Remove top level and check consistency
+        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
+        self.assertFalse(hasattr(model, "parametrizations"))
+        self.assertEqual(model.__class__, nn.Linear)
+
+    def test_register_and_remove_buffer_parametrization(self):
+        r"""Test that it is possible to add and remove parametrizations on buffers"""
+        # Define a couple vector parametrizations
+        class FirstZero(nn.Module):
+            def forward(self, x):
+                return torch.cat([x.new_zeros(1), x[1:]])
+
+        class LastZero(nn.Module):
+            def forward(self, x):
+                return torch.cat([x[:-1], x.new_zeros(1)])
+
+        model = nn.Linear(8, 8)
+
+        # Instantiate parametrizations on buffers. It should work as expected
+        delattr(model, "bias")
+        model.register_buffer("bias", torch.ones(8))
+        parametrize.register_parametrization(model, "bias", FirstZero())
+        parametrize.register_parametrization(model, "bias", LastZero())
+        self.assertTrue(parametrize.is_parametrized(model))
+        self.assertTrue(parametrize.is_parametrized(model, "bias"))
+        self.assertEqual(model.bias[0].item(), 0.)
+        self.assertEqual(model.bias[-1].item(), 0.)
+        self.assertTrue((model.bias[1:-1] == torch.ones(6)).all())
+        self.assertEqual(len(list(model.parameters())), 1)
+
+        # Remove parametrizations on buffers. It should work as expected
+        parametrize.remove_parametrizations(model, "bias", leave_parametrized=True)
+        self.assertFalse(parametrize.is_parametrized(model))
+        self.assertFalse(parametrize.is_parametrized(model, "bias"))
+        self.assertEqual(model.bias[0].item(), 0.)
+        self.assertEqual(model.bias[-1].item(), 0.)
+        self.assertTrue((model.bias[1:-1] == torch.ones(6)).all())
+        self.assertEqual(len(list(model.parameters())), 1)
+
+    # FIXME: Rewrite this test using functions not depending on LAPACK
+    #        and remove the `@skipIfNoLapack` (see #70995)
+    @skipIfNoLapack
+    def test_serialization_parametrization(self):
+        r"""Test that it is possible to serialize a parametrized model via state_dict"""
+        # A stateful parametrization
+        class Orthogonal(nn.Module):
+            def __init__(self, n):
+                super().__init__()
+                self.register_buffer("id", torch.eye(n))
+                self.register_buffer("B", torch.empty(n, n))
+                init.orthogonal_(self.B)
+
+            def forward(self, X):
+                A = X.triu(1)
+                A = A - A.T
+                return self.B @ torch.linalg.solve(self.id + A, self.id - A)
+
+        def get_model():
+            model = torch.nn.Sequential(
+                torch.nn.Linear(5, 5),
+                torch.nn.ReLU(),
+                torch.nn.Linear(5, 1),
+            )
+
+            parametrize.register_parametrization(model[0], "weight", Orthogonal(5))
+            return model
+
+        model = get_model()
+
+        prev_weight = model[0].weight
+        prev_B = model[0].parametrizations.weight[0].B
+
+        new_model = get_model()
+        with TemporaryFileName() as fname:
+            torch.save(model.state_dict(), fname)
+            new_model.load_state_dict(torch.load(fname))
+
+        # Integrity tests
+        self.assertTrue(parametrize.is_parametrized(new_model[0], "weight"))
+        self.assertEqual(prev_weight, new_model[0].weight)
+        self.assertEqual(prev_B, new_model[0].parametrizations.weight[0].B)
+
+        # Trying to save the whole parametrized model raises
+        with self.assertRaisesRegex(RuntimeError, "state_dict"):
+            with TemporaryFileName() as fname:
+                torch.save(model, fname)
+
+    # FIXME: Rewrite this test using functions not depending on LAPACK
+    #        and remove the `@skipIfNoLapack` (see #70995)
+    @skipIfNoLapack
+    def test_initialization_parametrization(self):
+        r"""Test that it is possible to initialize a parametrization when it
+            implements a `right_inverse` method
+        """
+        class Skew(nn.Module):
+            def forward(self, X):
+                A = X.triu(1)
+                return A - A.T
+
+            def is_skew(self, A):
+                return torch.allclose(A, -A.T, atol=1e-6)
+
+            def right_inverse(self, X):
+                if not self.is_skew(X):
+                    raise ValueError("The matrix is not skew-symmetric.")
+                return X.triu(1)
+
+        # Implements a Cayley map where right_inverse is not quite the inverse of forward
+        class Orthogonal(nn.Module):
+            def __init__(self, n):
+                super().__init__()
+                self.register_buffer("B", torch.eye(n))
+
+            def forward(self, X):
+                Id = torch.eye(X.size(0))
+                return self.B @ torch.linalg.solve(Id + X, Id - X)
+
+            def is_orthogonal(self, X):
+                Id = torch.eye(X.size(0))
+                return torch.allclose(X.T @ X, Id, atol=1e-4)
+
+            def right_inverse(self, X):
+                if not self.is_orthogonal(X):
+                    raise ValueError("The input is not orthogonal.")
+                # cayley(0) == Id, so B @ cayley(0) == B
+                self.B = X
+                return torch.zeros_like(X)
+
+        N = 5
+        model = nn.Linear(N, N)
+        # Register the skew-symmetric constraint. The result is now skew-symmetric
+        skew = Skew()
+        # Make the weight skew-symmetric before registering the parametrization
+        with torch.no_grad():
+            model.weight.set_(skew(model.weight))
+        parametrize.register_parametrization(model, "weight", skew)
+        X = torch.rand(N, N)
+        # X is not skew-symmetric, so it throws an error
+        with self.assertRaises(ValueError):
+            model.weight = X
+        # Make X skew-symmetric
+        X = X - X.T
+        model.weight = X
+        self.assertEqual(model.parametrizations.weight.original, X.triu(1))
+        self.assertEqual(model.weight, X)
+
+        # Having several parametrizations registered should work in the same way
+        parametrize.register_parametrization(model, "weight", Orthogonal(N))
+        # Register now the Cayley map. The result is now orthogonal
+        X = torch.rand(N, N)
+        # X is not orthogonal, so it throws an error
+        with self.assertRaises(ValueError):
+            model.weight = X
+        init.orthogonal_(X)
+        model.weight = X
+        self.assertEqual(model.weight, X)
+        self.assertEqual(model.parametrizations.weight.original, torch.zeros_like(X))
+
+    def test_errors_unparametrized_tensor_parametrization(self):
+        # Test errors when registering a parametrization on an unparametrized tensor
+        module = nn.Linear(3, 4)
+        weight_init = module.weight.clone()
+
+        class Identity(nn.Module):
+            def forward(self, x):
+                return x
+
+        # Register a parametrization on a non-existing parameter throws
+        with self.assertRaisesRegex(ValueError, "does not have a parameter"):
+            parametrize.register_parametrization(module, "foo", Identity())
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # Removing parametrizations from an unparametrized tensor throws
+        with self.assertRaisesRegex(ValueError, "does not have a parametrization"):
+            parametrize.remove_parametrizations(module, "bias")
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # A correct parametrization with several outputs
+        class Sum(nn.Module):
+            def forward(self, x, y):
+                return x + y
+
+            def right_inverse(self, z):
+                return z, torch.zeros_like(z)
+
+        parametrize.register_parametrization(module, "weight", Sum())
+        # Cannot remove a parametrization with several outputs with `leave_parametrized=False`
+        with self.assertRaisesRegex(ValueError, "leave_parametrized=False"):
+            parametrize.remove_parametrizations(module, "weight", leave_parametrized=False)
+        parametrize.remove_parametrizations(module, "weight", leave_parametrized=True)
+
+        # A parametrization with an incorrect number of outputs
+        class WrongNumberParams(nn.Module):
+            def forward(self, x, y, z):
+                return x + y + z
+
+            def right_inverse(self, w):
+                return w, torch.zeros_like(w)
+
+        # Makes param(*param.right_inverse(X)) fail
+        with self.assertRaisesRegex(TypeError, "positional argument"):
+            parametrize.register_parametrization(module, "weight", WrongNumberParams())
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # A parametrization with a right_inverse that does not return a Tensor or Sequence[Tensor]
+        class WrongRightInverse(Identity):
+            def right_inverse(self, z):
+                return None
+
+        # right_inverse should return a Tensor or a Sequence[Tensor]
+        with self.assertRaisesRegex(ValueError, "Tensor or a Sequence of"):
+            parametrize.register_parametrization(module, "weight", WrongRightInverse())
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # If it's a sequence, it must to be a sequence of tensors
+        class WrongRightInverseSequence(nn.Module):
+            def forward(self, x, y):
+                return x
+
+            def right_inverse(self, z):
+                return None, z
+
+        with self.assertRaisesRegex(ValueError, "of the sequence with type"):
+            parametrize.register_parametrization(module, "weight", WrongRightInverseSequence())
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # A parametrization from one tensor to one tensor that changes the dtype
+        class ChangeDtypeInverse(nn.Module):
+            def forward(self, x):
+                return x.float()
+
+            def right_inverse(self, w):
+                return w.bool()
+
+        # For parametrizations that return one tensor, right_inverse may not change the dtype
+        with self.assertRaisesRegex(ValueError, "outputs one tensor, it may not change the dtype"):
+            parametrize.register_parametrization(module, "weight", ChangeDtypeInverse())
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # Doesn't return a tensor
+        class NotTensor(nn.Module):
+            def forward(self, x):
+                return 2
+
+        # Forward must return a tensor
+        with self.assertRaisesRegex(ValueError, "must return a tensor"):
+            parametrize.register_parametrization(module, "weight", NotTensor())
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # A parametrization from one tensor to one tensor that changes the dtype
+        class ChangeDtype(nn.Module):
+            def forward(self, x):
+                return x.bool()
+
+        # forward should not change the initial dtype
+        with self.assertRaisesRegex(ValueError, "may not change the dtype"):
+            parametrize.register_parametrization(module, "weight", ChangeDtype())
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # Change shape
+        class ChangeShape(nn.Module):
+            def forward(self, x):
+                return x[:-1]
+
+        # forward should not change the original shape
+        with self.assertRaisesRegex(ValueError, "may not change the shape"):
+            parametrize.register_parametrization(module, "weight", ChangeShape())
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # Many to one that changes dtype
+        class ChangeDtypeMulti(nn.Module):
+            def forward(self, x, y):
+                return (x + y).bool()
+
+            def right_inverse(self, w):
+                return w, w + 1
+
+        # forward should not change the original shape even for parametrizations with many inputs
+        with self.assertRaisesRegex(ValueError, "may not change the dtype"):
+            parametrize.register_parametrization(module, "weight", ChangeDtypeMulti())
+        self.assertFalse(parametrize.is_parametrized(module))
+
+        # Returning a sequence of size one, although weird, it's correct
+        class SequenceLen1(nn.Module):
+            def forward(self, x):
+                return x
+
+            def right_inverse(self, w):
+                return (w,)
+
+        parametrize.register_parametrization(module, "weight", SequenceLen1())
+        self.assertTrue(hasattr(module.parametrizations.weight, "original0"))
+        self.assertFalse(hasattr(module.parametrizations.weight, "original1"))
+        _ = module.weight   # Does not throw
+        self.assertTrue(parametrize.is_parametrized(module))
+        parametrize.remove_parametrizations(module, "weight", leave_parametrized=True)
+
+        # None of the operations above should have altered the weight
+        self.assertFalse(parametrize.is_parametrized(module))
+        self.assertEqual(module.weight, weight_init)
+
+    def test_errors_parametrized_tensor_parametrization(self):
+        # Test errors when registering a parametrization on a parametrized tensor
+
+        class Identity(nn.Module):
+            def forward(self, x):
+                return x
+
+        module = nn.Linear(3, 4)
+        parametrize.register_parametrization(module, "weight", Identity())
+
+        # Has to return a tensor
+        class WrongReturn(nn.Module):
+            def forward(self, x):
+                return x, x
+
+        with self.assertRaisesRegex(ValueError, "must return a tensor"):
+            parametrize.register_parametrization(module, "weight", WrongReturn())
+        self.assertTrue(parametrize.is_parametrized(module))
+        self.assertEqual(len(module.parametrizations.weight), 1)
+        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
+
+        # Cannot change dtype
+        class ChangeDtype(nn.Module):
+            def forward(self, x):
+                return x.bool()
+
+        with self.assertRaisesRegex(ValueError, "may not change the dtype"):
+            parametrize.register_parametrization(module, "weight", ChangeDtype())
+        self.assertTrue(parametrize.is_parametrized(module))
+        self.assertEqual(len(module.parametrizations.weight), 1)
+        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
+
+        # Cannot change shape
+        class ChangeShape(nn.Module):
+            def forward(self, x):
+                return x[:-1]
+
+        with self.assertRaisesRegex(ValueError, "may not change the shape"):
+            parametrize.register_parametrization(module, "weight", ChangeShape())
+        self.assertTrue(parametrize.is_parametrized(module))
+        self.assertEqual(len(module.parametrizations.weight), 1)
+        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
+
+        # The following checks are mostly due to bugs in the code of the parametrization
+
+        # right_inverse has to return a tensor
+        class WrongReturnInverse(Identity):
+            def right_inverse(self, x):
+                return x, x
+
+        with self.assertRaisesRegex(ValueError, "right_inverse must return a tensor"):
+            parametrize.register_parametrization(module, "weight", WrongReturnInverse())
+        self.assertTrue(parametrize.is_parametrized(module))
+        self.assertEqual(len(module.parametrizations.weight), 1)
+        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
+
+        # Cannot change dtype
+        class ChangeDtypeInverse(Identity):
+            def right_inverse(self, x):
+                return x.bool()
+
+        with self.assertRaisesRegex(ValueError, "must have the same dtype"):
+            parametrize.register_parametrization(module, "weight", ChangeDtypeInverse())
+        self.assertTrue(parametrize.is_parametrized(module))
+        self.assertEqual(len(module.parametrizations.weight), 1)
+        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
+
+        # Cannot change shape
+        class ChangeShapeInverse(Identity):
+            def right_inverse(self, x):
+                return x[:-1]
+
+        with self.assertRaisesRegex(ValueError, "must have the same shape"):
+            parametrize.register_parametrization(module, "weight", ChangeShapeInverse())
+        self.assertTrue(parametrize.is_parametrized(module))
+        self.assertEqual(len(module.parametrizations.weight), 1)
+        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
+
+    # FIXME: Rewrite this test using functions not depending on LAPACK
+    #        and remove the `@skipIfNoLapack` (see #70995)
+    @skipIfNoLapack
+    def test_multiple_inputs_parametrization(self):
+        # A parametrization with several outputs
+        class RankOne(nn.Module):
+            def forward(self, x, y):
+                # Form a rank-1 matrix from a pair of vectors
+                return x.unsqueeze(-1) @ y.unsqueeze(-2)
+
+            def right_inverse(self, Y):
+                # We project the given matrix onto the rank 1 matrices
+                U, S, Vh = torch.linalg.svd(Y, full_matrices=False)
+                # S is ordered in a decreasing way.
+                s0_sqrt = S[0].sqrt().unsqueeze(-1)
+                return U[..., :, 0] * s0_sqrt, Vh[..., 0, :] * s0_sqrt
+
+        # Simple parametrisation
+        class Double(nn.Module):
+            def forward(self, x):
+                return 2.0 * x
+
+            def right_inverse(self, w):
+                return 0.5 * w
+
+        model = nn.Linear(3, 3)
+        # Test one parametrization
+        parametrize.register_parametrization(model, "weight", RankOne())
+        self.assertTrue(hasattr(model, "parametrizations"))
+        self.assertTrue(parametrize.is_parametrized(model))
+        self.assertTrue(parametrize.is_parametrized(model, "weight"))
+        self.assertTrue(hasattr(model.parametrizations.weight, "original0"))
+        self.assertIn("original0", model.parametrizations.weight._parameters)
+        self.assertTrue(hasattr(model.parametrizations.weight, "original1"))
+        self.assertIn("original1", model.parametrizations.weight._parameters)
+        self.assertFalse(parametrize.is_parametrized(model, "bias"))
+        self.assertNotIn("weight", model._parameters)
+        # Result should be rank 1
+        self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1)
+
+        with self.assertRaisesRegex(ValueError, "leave_parametrized=False"):
+            # Cannot remove a parametrization with multiple inputs and not leave it parametrized
+            parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
+        # Remove parametrization and check consistency
+        parametrize.remove_parametrizations(model, "weight", leave_parametrized=True)
+        self.assertFalse(hasattr(model, "parametrizations"))
+        self.assertEqual(model.__class__, nn.Linear)
+        self.assertFalse(parametrize.is_parametrized(model))
+        self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1)
+        self.assertIn("weight", model._parameters)
+
+        # Registering parametrizations with one input on top of one with multiple inputs should work
+        init_weight = model.weight.clone()
+        parametrize.register_parametrization(model, "weight", RankOne())
+        # Projecting a rank 1 matrix onto the matrices of rank one does not change the matrix
+        self.assertEqual(init_weight, model.weight)
+        parametrize.register_parametrization(model, "weight", Double())
+        # The matrix now is twice the initial matrix
+        self.assertEqual(2.0 * init_weight, model.weight)
+        # Multiplying by a scalar does not change the rank
+        self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1)
+
+        # The model has now three parameters
+        self.assertEqual(len(list(model.parameters())), 3)
+
+        sgd = torch.optim.SGD(model.parameters(), lr=0.1)
+
+        # Test backward. Should not throw
+        for _ in range(2):
+            sgd.zero_grad()
+            loss = (model.weight.T @ model.bias).sum()
+            loss.backward()
+            sgd.step()
+
+        # Same drill as before, removing should work as expected
+        with self.assertRaisesRegex(ValueError, "leave_parametrized=False"):
+            # Cannot remove a parametrization with multiple inputs and not leave it parametrized
+            parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
+        # Remove parametrization and check consistency
+        parametrize.remove_parametrizations(model, "weight", leave_parametrized=True)
+        self.assertFalse(hasattr(model, "parametrizations"))
+        self.assertEqual(model.__class__, nn.Linear)
+        self.assertFalse(parametrize.is_parametrized(model))
+        self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1)
+        self.assertIn("weight", model._parameters)
+
+        # The model has now two parameters
+        self.assertEqual(len(list(model.parameters())), 2)
+
+        # Test backward. Should not throw
+        sgd = torch.optim.SGD(model.parameters(), lr=0.1)
+        for _ in range(2):
+            sgd.zero_grad()
+            loss = (model.weight.T @ model.bias).sum()
+            loss.backward()
+            sgd.step()
+
+    # FIXME: Rewrite this test using functions not depending on LAPACK
+    #        and remove the `@skipIfNoLapack` (see #70995)
+    @skipIfNoLapack
+    def test_caching_parametrization(self):
+        r"""Test the caching system of a parametrization"""
+        # Define a couple matrix parametrizations
+        class Skew(nn.Module):
+            def forward(self, X):
+                X = X.tril(-1)
+                return X - X.T
+
+        class Orthogonal(nn.Module):
+            def forward(self, X):
+                Id = torch.eye(X.size(0), device=X.device)
+                return torch.linalg.solve(Id + X, Id - X)
+
+        model = nn.Linear(5, 5)
+        parametrize.register_parametrization(model, "weight", Skew())
+        parametrize.register_parametrization(model, "weight", Orthogonal())
+
+        # Test that the caching system works
+        with parametrize.cached():
+            X = model.weight
+            Y = model.weight
+            self.assertEqual(id(X), id(Y))
+
+    # FIXME: Rewrite this test using functions not depending on LAPACK
+    #        and remove the `@skipIfNoLapack` (see #70995)
+    @skipIfNoLapack
+    def test_caching_parametrization_with_transfer_parametrizations_and_params(self):
+        r"""Test that transferring parametrizations doesn't cause issues with caching"""
+        class Skew(nn.Module):
+            def forward(self, X):
+                X = X.tril(-1)
+                return X - X.T
+
+        class Orthogonal(nn.Module):
+            def forward(self, X):
+                Id = torch.eye(X.size(0), device=X.device)
+                return torch.linalg.solve(Id + X, Id - X)
+
+        model = nn.Linear(5, 5)
+        parametrize.register_parametrization(model, "weight", Skew())
+        parametrize.register_parametrization(model, "weight", Orthogonal())
+
+        to_model = nn.Linear(5, 5)
+        parametrize.transfer_parametrizations_and_params(model, to_model)
+
+        with parametrize.cached():
+            X = model.weight
+            Y = model.weight
+            self.assertEqual(id(X), id(Y))
+
+            A = to_model.weight
+            B = to_model.weight
+            self.assertEqual(id(A), id(B))
+
+            # test that the results are distinct objects for each module
+            self.assertNotEqual(id(A), id(X))
+
+    def test_parametrization_same_training_mode(self):
+        r"""Test training mode updated on parametrization registration"""
+        class Identity(nn.Module):
+            def forward(self, X):
+                return X
+
+        module = nn.Linear(4, 4)
+        module.eval()
+        parametrize.register_parametrization(module, "weight", Identity())
+        self.assertFalse(module.parametrizations.weight[0].training)
+        module.train()
+        parametrize.register_parametrization(module, "weight", Identity().eval())
+        self.assertTrue(module.parametrizations.weight[0].training)
+        self.assertTrue(module.parametrizations.weight[1].training)
+
+    def test_type_before_parametrizations(self):
+        r"""Test that type_before_parametrizations always retrieves original type"""
+
+        class Identity(nn.Module):
+            def forward(self, X):
+                return X
+
+        model = nn.Linear(5, 5)
+        original_type = type(model)
+        self.assertTrue(
+            parametrize.type_before_parametrizations(model) == original_type
+        )
+        parametrize.register_parametrization(model, "weight", Identity())
+        self.assertTrue(
+            parametrize.type_before_parametrizations(model) == original_type
+        )
+
+    def test_deepcopy_after_parametrization(self):
+        r"""Test that we are able to create a deepcopy of the module when it's parametrized."""
+
+        class AddOne(nn.Module):
+            def forward(self, x):
+                return x + 1.0
+
+        class ModelWithoutDeepcopy(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.weight = nn.Parameter(torch.tensor([1., 1., 1., 1.]), requires_grad=True)
+                self.bias = nn.Parameter(torch.tensor([0., 0., 0., 0.]), requires_grad=True)
+                self.attr = [1.0, 2.0, 3.0, 4.0]
+
+        class ActualModel(ModelWithoutDeepcopy):
+            # Emulate custom implementation of the deepcopying.
+            def __deepcopy__(self, memo):
+                result = self.__new__(self.__class__)
+                memo[id(self)] = result
+                result.__dict__ = deepcopy(self.__dict__, memo)
+                return result
+
+        def check_deepcopy(m1: nn.Module, m2: nn.Module):
+            w1 = m1.parametrizations.weight.original
+            w2 = m2.parametrizations.weight.original
+            b1 = m1.parametrizations.bias.original if parametrize.is_parametrized(m1, "bias") else m1.bias
+            b2 = m2.parametrizations.bias.original if parametrize.is_parametrized(m2, "bias") else m2.bias
+            # Weights, biases and attributes should be equal but they must be different objects.
+            self.assertEqual(m1.__dict__.keys(), m2.__dict__.keys())
+            self.assertIsNot(m1, m2)
+            self.assertEqual(w1, w2)
+            self.assertIsNot(w1, w2)
+            self.assertEqual(b1, b2)
+            self.assertIsNot(b1, b2)
+            self.assertEqual(m1.attr, m2.attr)
+            self.assertIsNot(m1.attr, m2.attr)
+
+        for model in (ModelWithoutDeepcopy(), ActualModel()):
+            # General check that we are able to create deepcopy.
+            parametrize.register_parametrization(model, "weight", AddOne())
+            check_deepcopy(model, deepcopy(model))
+            # Check that this works on models with several parametrized tensors.
+            parametrize.register_parametrization(model, "bias", AddOne())
+            check_deepcopy(model, deepcopy(model))
+            # Check that this works on models where tensors have more than one parametrization.
+            parametrize.register_parametrization(model, "weight", AddOne())
+            check_deepcopy(model, deepcopy(model))
+
+    def test_transfer_parametrizations_and_params(self):
+        r"""Test that all parametrizations and their associated parameters are transferred."""
+
+        class AddOne(nn.Module):
+            def forward(self, x):
+                return x + 1.0
+
+        class Double(nn.Module):
+            def forward(self, x):
+                return 2.0 * x
+
+            def right_inverse(self, x):
+                return 0.5 * x
+
+        class MinusOne(nn.Module):
+            def forward(self, x):
+                return x - 1.0
+
+        model = nn.Linear(5, 5)
+        parametrize.register_parametrization(model, "weight", AddOne())
+        parametrize.register_parametrization(model, "weight", Double())
+        parametrize.register_parametrization(model, "weight", MinusOne())
+        hold_weight = model.weight
+
+        to_model = torch.ao.nn.qat.Linear(
+            5, 5, qconfig=torch.ao.quantization.get_default_qconfig()
+        )
+        parametrize.transfer_parametrizations_and_params(model, to_model)
+
+        # checks that final and original value are correct and the to_model is parametrized
+        self.assertTrue(torch.nn.utils.parametrize.is_parametrized(to_model, "weight"))
+        self.assertEqual(model.weight, to_model.weight)
+        self.assertEqual(
+            model.parametrizations.weight.original,
+            to_model.parametrizations.weight.original,
+        )
+
+        # check that the transfer didn't affect the original value
+        self.assertEqual(hold_weight, model.weight)
+
+        # testing that changes to one set of parametrizations do not affect the other
+        parametrize.remove_parametrizations(to_model, "weight")
+        self.assertFalse(torch.nn.utils.parametrize.is_parametrized(to_model, "weight"))
+        self.assertTrue(torch.nn.utils.parametrize.is_parametrized(model, "weight"))
+
+        # also test that parameters that don't exist in to_model get transferred
+        model.test_param = Parameter(torch.randn(5, 5))
+
+        self.assertTrue(not hasattr(to_model, "test_param"))
+        parametrize.register_parametrization(model, "test_param", Double())
+        hold_test_param = model.test_param
+        parametrize.transfer_parametrizations_and_params(model, to_model, "test_param")
+
+        # check that previously missing params got transferred correctly
+        self.assertEqual(model.test_param, to_model.test_param)
+        self.assertEqual(
+            model.parametrizations.test_param.original,
+            to_model.parametrizations.test_param.original,
+        )
+
+        # check that the new transfer didn't change the value for the from_module
+        self.assertEqual(hold_test_param, model.test_param)
+
+    def test_transfer_parametrizations_and_params_right_inverse(self):
+        r"""Test that all parametrizations and their associated parameters are transferred."""
+
+        class Double(nn.Module):
+            def forward(self, x):
+                return 2.0 * x
+
+            def right_inverse(self, x):
+                return 0.5 * x
+
+        model = nn.Linear(5, 5)
+        parametrize.register_parametrization(model, "weight", Double())
+        hold_weight = model.weight
+
+        to_model = torch.ao.nn.qat.Linear(
+            5, 5, qconfig=torch.ao.quantization.get_default_qconfig()
+        )
+        parametrize.transfer_parametrizations_and_params(model, to_model)
+
+        # check that transfer occurs successfully
+        self.assertEqual(model.weight, to_model.weight)
+        self.assertEqual(
+            model.parametrizations.weight.original,
+            to_model.parametrizations.weight.original,
+        )
+
+        # check that transfer doesn't affect the from_model weight
+        self.assertEqual(hold_weight, model.weight)
+
+    def test_transfer_parametrizations_and_params_single_param(self):
+        r"""Test that all parametrizations and their associated parameters are transferred."""
+
+        class AddOne(nn.Module):
+            def forward(self, x):
+                return x + 1.0
+
+        class Double(nn.Module):
+            def forward(self, x):
+                return 2.0 * x
+
+        class MinusOne(nn.Module):
+            def forward(self, x):
+                return x - 1.0
+
+        model = nn.Linear(5, 5, bias=True)
+        parametrize.register_parametrization(model, "weight", AddOne())
+        parametrize.register_parametrization(model, "weight", Double())
+        parametrize.register_parametrization(model, "weight", MinusOne())
+        parametrize.register_parametrization(model, "bias", AddOne())
+        parametrize.register_parametrization(model, "bias", Double())
+        parametrize.register_parametrization(model, "bias", MinusOne())
+
+        to_model = torch.ao.nn.qat.Linear(
+            5, 5, bias=True, qconfig=torch.ao.quantization.get_default_qconfig()
+        )
+        parametrize.transfer_parametrizations_and_params(model, to_model, "weight")
+
+        # check that weight and only weight was transferred
+        self.assertEqual(model.weight, to_model.weight)
+        self.assertEqual(
+            model.parametrizations.weight.original,
+            to_model.parametrizations.weight.original,
+        )
+        self.assertTrue("bias" not in to_model.parametrizations)
+
+    # FIXME: Rewrite this test using functions not depending on LAPACK
+    # and remove the `@skipIfNoLapack` (see #70995)
+    @skipIfNoLapack
+    def test_transfer_parametrizations_and_params_many_to_one(self):
+        # A parametrization with several outputs
+        class RankOne(nn.Module):
+            def forward(self, x, y):
+                # Form a rank-1 matrix from a pair of vectors
+                return x.unsqueeze(-1) @ y.unsqueeze(-2)
+
+            def right_inverse(self, Y):
+                # We project the given matrix onto the rank 1 matrices
+                U, S, Vh = torch.linalg.svd(Y, full_matrices=False)
+                # S is ordered in a decreasing way.
+                s0_sqrt = S[0].sqrt().unsqueeze(-1)
+                return U[..., :, 0] * s0_sqrt, Vh[..., 0, :] * s0_sqrt
+
+        class Double(nn.Module):
+            def forward(self, x):
+                return 2.0 * x
+
+        model = nn.Linear(3, 3)
+        parametrize.register_parametrization(model, "weight", RankOne())
+        parametrize.register_parametrization(model, "weight", Double())
+        hold_weight = model.weight
+
+        to_model = torch.ao.nn.qat.Linear(
+            3, 3, qconfig=torch.ao.quantization.get_default_qconfig()
+        )
+
+        parametrize.transfer_parametrizations_and_params(model, to_model)
+
+        # checks that final and original value are correct and the to_model is parametrized
+        self.assertTrue(torch.nn.utils.parametrize.is_parametrized(to_model, "weight"))
+        self.assertEqual(model.weight, to_model.weight)
+        self.assertEqual(
+            model.parametrizations.weight.original0,
+            to_model.parametrizations.weight.original0,
+        )
+        self.assertEqual(
+            model.parametrizations.weight.original1,
+            to_model.parametrizations.weight.original1,
+        )
+
+        # check that the transfer didn't affect the original value
+        self.assertEqual(hold_weight, model.weight)
+
+        # testing that changes to one set of parametrizations do not affect the other
+        model.test_param = Parameter(torch.randn(3, 3))
+
+        self.assertTrue(not hasattr(to_model, "test_param"))
+        parametrize.register_parametrization(model, "test_param", RankOne())
+        hold_test_param = model.test_param
+        parametrize.transfer_parametrizations_and_params(model, to_model, "test_param")
+
+        # also check that previously missing params got transferred correctly
+        self.assertEqual(model.test_param, to_model.test_param)
+        self.assertEqual(
+            model.parametrizations.test_param.original0,
+            to_model.parametrizations.test_param.original0,
+        )
+        self.assertEqual(
+            model.parametrizations.test_param.original1,
+            to_model.parametrizations.test_param.original1,
+        )
+
+        # check that the new transfer didn't change the value for the from_module
+        self.assertEqual(hold_test_param, model.test_param)
+
+    def test_new_spectral_norm(self):
+        with set_default_dtype(torch.double):
+            input = torch.randn(3, 5)
+            m = nn.Linear(5, 7)
+            m = torch.nn.utils.parametrizations.spectral_norm(m)
+            spectral_norm_m = m.parametrizations.weight[0]
+
+            self.assertEqual(spectral_norm_m._u.size(), torch.Size([m.weight.size(0)]))
+
+            # .parametrizations.weight.original should be trainable
+            self.assertTrue(hasattr(m.parametrizations.weight, 'original'))
+            self.assertTrue('original' in m.parametrizations.weight._parameters)
+
+            # u should be just a reused buffer
+            self.assertTrue(hasattr(spectral_norm_m, '_u'))
+            self.assertTrue('_u' in spectral_norm_m._buffers)
+            self.assertTrue('_v' in spectral_norm_m._buffers)
+
+            # weight should be a plain attribute, not counted as a buffer or a param
+            self.assertIsNotNone(m.weight)
+            self.assertFalse('weight' in m._buffers)
+            self.assertFalse('weight' in m._parameters)
+
+            # it should also be sharing storage as `weight_orig`
+            # self.assertEqual(m.parametrizations.weight.original.storage(), m.weight.storage())
+            self.assertEqual(m.parametrizations.weight.original.size(), m.weight.size())
+            self.assertEqual(m.parametrizations.weight.original.stride(), m.weight.stride())
+
+            m = torch.nn.utils.parametrize.remove_parametrizations(m, 'weight')
+
+            # spectral_norm is the only parametrization
+            self.assertFalse(hasattr(m, 'parametrizations'))
+            self.assertTrue('weight' in m._parameters)
+
+            # We can register spectral_norm multiple times on the same parameter
+            # and on multiple parameters in the same module
+            m = torch.nn.utils.parametrizations.spectral_norm(m, 'weight')
+            m = torch.nn.utils.parametrizations.spectral_norm(m, 'weight')
+            m = torch.nn.utils.parametrizations.spectral_norm(m, 'bias')
+
+            # If we remove the parametrization on bias, weight is still parametrized
+            # Removing a parametrization runs forward in eval mode if leave_parametrized=True
+            m = torch.nn.utils.parametrize.remove_parametrizations(m, 'bias')
+            self.assertTrue('bias' in m._parameters)
+            self.assertTrue(hasattr(m, 'parametrizations'))
+            self.assertFalse('weight' in m._parameters)
+
+            m = torch.nn.utils.parametrize.remove_parametrizations(m, 'weight')
+            # Neither weight and bias are parametrized
+            self.assertFalse(hasattr(m, 'parametrizations'))
+            self.assertTrue('weight' in m._parameters)
+            self.assertFalse(torch.nn.utils.parametrize.is_parametrized(m))
+
+            # test correctness in training/eval modes and cpu/multi-gpu settings
+            for apply_dp in (True, False):
+                if apply_dp:
+                    if not TEST_MULTIGPU:
+                        continue
+                    device = torch.device('cuda:0')
+
+                    def maybe_wrap(m):
+                        return torch.nn.DataParallel(m, [0, 1])
+                else:
+                    device = torch.device('cpu')
+
+                    def maybe_wrap(m):
+                        return m
+
+                for requires_grad in (True, False):
+                    def get_modules():
+                        m = nn.Linear(3, 4).to(device)
+                        m.weight.requires_grad_(requires_grad)
+                        m = torch.nn.utils.parametrizations.spectral_norm(m)
+                        wrapped_m = maybe_wrap(m)
+                        spectral_norm_m = m.parametrizations.weight[0]
+                        return m, wrapped_m, spectral_norm_m
+
+                    input = torch.randn(2, 3, device=device)
+
+                    m, wrapped_m, spectral_norm_m = get_modules()
+
+                    self.assertTrue(hasattr(spectral_norm_m, '_u'))
+                    u0 = spectral_norm_m._u.clone()
+                    v0 = spectral_norm_m._v.clone()
+
+                    # TEST TRAINING BEHAVIOR
+
+                    # We perform GD first to modify the initial matrix
+                    opt = torch.optim.SGD(wrapped_m.parameters(), lr=0.1)
+
+                    opt.zero_grad()
+                    wrapped_m(input).sum().backward()
+                    opt.step()
+
+                    out = wrapped_m(input)
+                    if requires_grad:
+                        # run forward again and assert that u and v are updated
+                        self.assertNotEqual(u0, spectral_norm_m._u)
+                        self.assertNotEqual(v0, spectral_norm_m._v)
+
+                    # assert that backprop reaches original weight
+                    # can't use gradcheck because the function changes as we
+                    # activate through it in training mode
+                    if requires_grad:
+                        torch.autograd.grad(out.sum(), m.parametrizations.weight.original)
+
+                    # test backward works with multiple forwards
+                    # it uses training mode so we need to reset `u` and `v` vectors
+                    # to same value at beginning for finite difference test to pass
+                    saved_u = spectral_norm_m._u.clone()
+                    saved_v = spectral_norm_m._v.clone()
+
+                    def fn(input):
+                        spectral_norm_m._u.data.copy_(saved_u)
+                        spectral_norm_m._v.data.copy_(saved_v)
+                        out0 = wrapped_m(input)
+                        out1 = wrapped_m(input)
+                        return out0 + out1
+
+                    # Make sure we can compute gradients wrt to all the parameters in the case
+                    # of double forward
+                    fn(input.clone().requires_grad_()).sum().backward()
+                    gradcheck(fn, (input.clone().requires_grad_(),), check_batched_grad=False)
+
+                    # test removing
+                    # spectral norm module needs to be in eval mode if we'd like to
+                    # avoid doing another power iteration
+                    m, wrapped_m, _ = get_modules()
+                    pre_remove_out = wrapped_m(input)
+                    m.eval()
+                    m = torch.nn.utils.parametrize.remove_parametrizations(m, 'weight')
+                    self.assertEqual(wrapped_m(input), pre_remove_out)
+
+                    torch.nn.utils.parametrizations.spectral_norm(m)
+                    for _ in range(3):
+                        pre_remove_out = wrapped_m(input)
+                    m.eval()
+                    m = torch.nn.utils.parametrize.remove_parametrizations(m, 'weight')
+                    self.assertEqual(wrapped_m(input), pre_remove_out)
+
+                    # TEST EVAL BEHAVIOR
+                    m, wrapped_m, spectral_norm_m = get_modules()
+                    wrapped_m(input)
+                    last_train_out = wrapped_m(input)
+                    last_train_u = spectral_norm_m._u.clone()
+                    last_train_v = spectral_norm_m._v.clone()
+                    wrapped_m.zero_grad()
+                    wrapped_m.eval()
+
+                    eval_out0 = wrapped_m(input)
+                    # assert eval gives same result as last training iteration
+                    self.assertEqual(eval_out0, last_train_out)
+                    # assert doing more iteartion in eval don't change things
+                    self.assertEqual(eval_out0, wrapped_m(input))
+                    self.assertEqual(last_train_u, spectral_norm_m._u)
+                    self.assertEqual(last_train_v, spectral_norm_m._v)
+
+                    # FIXME: the code below is flaky when executed with DataParallel
+                    # see https://github.com/pytorch/pytorch/issues/13818
+                    if apply_dp:
+                        continue
+
+                    # test backward works with multiple forwards in mixed training
+                    # and eval modes
+                    # it uses training mode so we need to reset `u` and `v` vectors
+                    # to same value at beginning for finite difference test to pass
+                    saved_u = spectral_norm_m._u.clone()
+                    saved_v = spectral_norm_m._v.clone()
+
+                    def fn(input):
+                        spectral_norm_m._u.data.copy_(saved_u)
+                        spectral_norm_m._v.data.copy_(saved_v)
+                        wrapped_m.train()
+                        out0 = wrapped_m(input)
+                        wrapped_m.eval()
+                        out1 = wrapped_m(input)
+                        wrapped_m.train()
+                        out2 = wrapped_m(input)
+                        wrapped_m.eval()
+                        out3 = wrapped_m(input)
+                        return out0 + out1 + out2 + out3
+
+                    gradcheck(fn, (input.clone().requires_grad_(),))
+
+                    # assert that backprop reaches weight_orig in eval
+                    if requires_grad:
+                        def fn(weight):
+                            return wrapped_m(input)
+
+                        gradcheck(fn, (m.parametrizations.weight.original,))
+
+    def test_new_spectral_norm_load_state_dict(self):
+        for activate_times in (0, 3):
+            inp = torch.randn(2, 3)
+            m = nn.Linear(3, 5)
+            snm = torch.nn.utils.parametrizations.spectral_norm(m)
+            snm.train()
+
+            for _ in range(activate_times):
+                snm(inp)
+
+            state_dict = deepcopy(snm.state_dict())
+            self.assertEqual({
+                'parametrizations.weight.original',
+                'bias',
+                'parametrizations.weight.0._v',
+                'parametrizations.weight.0._u'
+            }, set(state_dict.keys()))
+
+            # test that non-strict loading works
+            non_strict_state_dict = deepcopy(state_dict)
+            non_strict_state_dict['nonsense'] = 'nonsense'
+            with self.assertRaisesRegex(RuntimeError, r'Unexpected key\(s\) in state_dict: "nonsense"'):
+                snm.load_state_dict(non_strict_state_dict, strict=True)
+            snm.load_state_dict(non_strict_state_dict, strict=False)
+            del non_strict_state_dict['parametrizations.weight.original']
+            snm.load_state_dict(non_strict_state_dict, strict=False)
+            del non_strict_state_dict['parametrizations.weight.0._u']
+            snm.load_state_dict(non_strict_state_dict, strict=False)
+            del non_strict_state_dict['parametrizations.weight.0._v']
+            snm.load_state_dict(non_strict_state_dict, strict=False)
+            non_strict_state_dict['weight'] = snm.weight.detach().clone()     # set W as a buffer
+            snm.load_state_dict(non_strict_state_dict, strict=False)
+            del non_strict_state_dict._metadata['parametrizations.weight.0']  # remove metadata info
+            snm.load_state_dict(non_strict_state_dict, strict=False)
+            del non_strict_state_dict['weight']                               # remove W buffer
+            snm.load_state_dict(non_strict_state_dict, strict=False)
+            del non_strict_state_dict['bias']
+            snm.load_state_dict(non_strict_state_dict, strict=False)
+
+            # normal state_dict
+
+            # test that re-wrapping does not matter
+            m = torch.nn.utils.parametrize.remove_parametrizations(snm, 'weight')
+            snm = torch.nn.utils.parametrizations.spectral_norm(m)
+
+            snm.load_state_dict(state_dict)
+            with torch.no_grad():
+                snm.eval()
+                out0_eval = snm(inp)
+                snm.train()
+                out1_train = snm(inp)
+                out2_train = snm(inp)
+                snm.eval()
+                out3_eval = snm(inp)
+
+            # test that re-wrapping does not matter
+            m = torch.nn.utils.parametrize.remove_parametrizations(snm, 'weight')
+            snm = torch.nn.utils.parametrizations.spectral_norm(m)
+
+            # Test normal loading
+            snm.load_state_dict(state_dict)
+            with torch.no_grad():
+                snm.eval()
+                self.assertEqual(out0_eval, snm(inp))
+                snm.train()
+                self.assertEqual(out1_train, snm(inp))
+                self.assertEqual(out2_train, snm(inp))
+                snm.eval()
+                self.assertEqual(out3_eval, snm(inp))
+
+    def test_new_spectral_norm_dim(self):
+        inp = torch.randn(2, 3, 10, 12)
+        m = nn.ConvTranspose2d(3, 4, (5, 6))
+        m = torch.nn.utils.parametrizations.spectral_norm(m)
+        snm = m.parametrizations.weight[0]
+        # this should not run into incompatible shapes
+        x = m(inp)
+        # check that u refers to the same dimension
+        self.assertEqual(snm._u.shape, m.parametrizations.weight.original[0, :, 0, 0].shape)
+
+    def test_new_spectral_norm_forward(self):
+        input = torch.randn(3, 5)
+        m = nn.Linear(5, 7)
+        m = torch.nn.utils.parametrizations.spectral_norm(m)
+        snm = m.parametrizations.weight[0]
+        # naive forward
+        _weight = m.parametrizations.weight.original
+        _bias, _v = m.bias, snm._v
+        _weight_mat = _weight.view(_weight.size(0), -1)
+        _u = torch.mv(_weight_mat, _v)
+        _u = F.normalize(_u, dim=0, eps=1e-12)
+        _v = torch.mv(_weight_mat.t(), _u)
+        _v = F.normalize(_v, dim=0, eps=1e-12)
+        _weight.data /= torch.dot(_u, torch.matmul(_weight_mat, _v))
+        out_hat = torch.nn.functional.linear(input, _weight, _bias)
+        expect_out = m(input)
+        self.assertEqual(expect_out, out_hat)
+
+    @skipIfNoLapack
+    def test_orthogonal_parametrization(self):
+        # Orthogonal implements 6 algorithms (3x parametrizations times 2 options of use_trivialization)
+
+        def assert_is_orthogonal(X):
+            n, k = X.size(-2), X.size(-1)
+            if n < k:
+                X = X.mT
+                n, k = k, n
+            Id = torch.eye(k, dtype=X.dtype, device=X.device).expand(*(X.size()[:-2]), k, k)
+            eps = 10 * n * torch.finfo(X.dtype).eps
+            torch.testing.assert_close(X.mH @ X, Id, atol=eps, rtol=0.)
+
+        def assert_weight_allclose_Q(weight, W):
+            # Test that weight is equal to the Q part of the QR decomposition of W
+            # (or of its transpose if the matrix is wide)
+            wide_matrix = W.size(-2) < W.size(-1)
+            if wide_matrix:
+                W = W.mT
+            Q, R = torch.linalg.qr(W)
+            Q *= R.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2)
+            if wide_matrix:
+                Q = Q.mT
+            torch.testing.assert_close(Q, weight, atol=1e-5, rtol=0.)
+
+        for shape, dtype, use_linear in product(((4, 4), (5, 3), (3, 5)),  # square/ tall / wide
+                                                (torch.float32, torch.complex64),
+                                                (True, False)):
+            # Conv2d does not support complex yet
+            if not use_linear:
+                continue
+
+            if use_linear:
+                input = torch.randn(3, shape[0], dtype=dtype)
+            else:
+                input = torch.randn(2, 2, shape[0] + 2, shape[1] + 1, dtype=dtype)
+
+            for parametrization, use_trivialization in product(("matrix_exp", "cayley", "householder"),
+                                                               (False, True)):
+                # right_inverse for Cayley and matrix_exp not implemented for use_trivialization=False
+                # See Note [right_inverse expm cayley]
+                can_initialize = use_trivialization or parametrization == "householder"
+
+                # We generate them every time to always start with fresh weights
+                if use_linear:
+                    m = nn.Linear(*shape, dtype=dtype)
+                else:
+                    m = nn.Conv2d(2, 3, shape, dtype=dtype)
+
+                # We do not support householder for complex inputs
+                # See Note [Householder complex]
+                w_init = m.weight.clone()
+                if parametrization == "householder" and m.weight.is_complex():
+                    msg = "householder parametrization does not support complex tensors"
+                    with self.assertRaisesRegex(ValueError, msg):
+                        torch.nn.utils.parametrizations.orthogonal(m,
+                                                                   "weight",
+                                                                   parametrization,
+                                                                   use_trivialization=use_trivialization)
+                    continue
+
+                wide_matrix = w_init.size(-2) < w_init.size(-1)
+                torch.nn.utils.parametrizations.orthogonal(m,
+                                                           "weight",
+                                                           parametrization,
+                                                           use_trivialization=use_trivialization)
+                # Forwards works as expected
+                self.assertEqual(w_init.shape, m.weight.shape)
+                assert_is_orthogonal(m.weight)
+                if can_initialize:
+                    assert_weight_allclose_Q(m.weight, w_init)
+
+                # Intializing with a given orthogonal matrix works
+                X = torch.randn_like(m.weight)
+                if wide_matrix:
+                    X = X.mT
+                w_new = torch.linalg.qr(X).Q
+                if wide_matrix:
+                    w_new = w_new.mT
+                if can_initialize:
+                    m.weight = w_new
+                    torch.testing.assert_close(w_new, m.weight, atol=1e-5, rtol=0.)
+                else:
+                    msg = "assign to the matrix exponential or the Cayley parametrization"
+                    with self.assertRaisesRegex(NotImplementedError, msg):
+                        m.weight = w_new
+
+                # Intializing with a non-orthogonal matrix makes m.weight be the Q part of the given matrix
+                w_new = torch.randn_like(m.weight)
+                if can_initialize:
+                    m.weight = w_new
+                    assert_weight_allclose_Q(m.weight, w_new)
+                else:
+                    msg = "assign to the matrix exponential or the Cayley parametrization"
+                    with self.assertRaisesRegex(NotImplementedError, msg):
+                        m.weight = w_new
+
+                opt = torch.optim.SGD(m.parameters(), lr=0.1)
+                for _ in range(2):
+                    opt.zero_grad()
+                    m(input).norm().backward()
+                    grad = m.parametrizations.weight.original.grad
+                    self.assertIsNotNone(grad)
+                    # We do not update the upper triangular part of the matrix if tall tril if wide
+                    if grad.size(-2) >= grad.size(-1):
+                        zeros_grad = grad.triu(1)
+                    else:
+                        zeros_grad = grad.tril(-1)
+                    self.assertEqual(zeros_grad, torch.zeros_like(zeros_grad))
+                    # The gradient in the diagonal can only be imaginary because a skew-Hermitian
+                    # matrix has imaginary diagonal
+                    diag_grad = grad.diagonal(dim1=-2, dim2=-1)
+                    if grad.is_complex():
+                        diag_grad = diag_grad.real
+                    self.assertEqual(diag_grad, torch.zeros_like(diag_grad))
+                    opt.step()
+                    assert_is_orthogonal(m.weight)
+
+    @skipIfNoLapack
+    def test_orthogonal_errors(self):
+        m = nn.Linear(3, 4)
+        with self.assertRaisesRegex(ValueError, "has to be one of"):
+            torch.nn.utils.parametrizations.orthogonal(m, "weight", "foo")
+
+        with self.assertRaisesRegex(ValueError, "Expected a matrix"):
+            torch.nn.utils.parametrizations.orthogonal(m, "bias")
+
+        torch.nn.utils.parametrizations.orthogonal(m, "weight")
+        with self.assertRaisesRegex(ValueError, "matrices of shape"):
+            m.weight = torch.randn(5, 5)
+        torch.nn.utils.parametrize.remove_parametrizations(m, "weight")
+
+
+instantiate_parametrized_tests(TestNNParametrization)
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_nn.py b/test/test_nn.py
index dc9ce0707dc1f..c50d9cdc7bd64 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -25,10 +25,8 @@
 import torch.backends.cudnn as cudnn
 import torch.nn as nn
 import torch.nn.functional as F
-import torch.nn.init as init
 import torch.nn.utils.rnn as rnn_utils
 from torch.nn.utils import clip_grad_norm_, clip_grad_value_
-import torch.nn.utils.parametrize as parametrize
 import torch.nn.utils.prune as prune
 from torch.nn.utils import parameters_to_vector, vector_to_parameters
 from torch.nn.utils.fusion import fuse_conv_bn_weights
@@ -1756,1083 +1754,6 @@ def test_vector_to_parameters(self):
         sample = next(model.parameters())[0, 0, 0]
         self.assertTrue(torch.equal(sample.data, vec.data[:5]))
 
-    # FIXME: Rewrite this test using functions not depending on LAPACK
-    #        and remove the `@skipIfNoLapack` (see #70995)
-    # torch/nn/utils/parametrize
-    @skipIfNoLapack
-    def test_register_and_remove_parametrization(self):
-        r"""Test that it is possible to add a few parametrizations
-        on a parameter or a buffer and that removing them restores the initial state
-        It also tests that backpropagating through them works as expected
-        """
-        # Define a couple matrix parametrizations
-        class Skew(nn.Module):
-            def forward(self, X):
-                X = X.tril(-1)
-                return X - X.T
-
-        class Orthogonal(nn.Module):
-            def forward(self, X):
-                # Cayley map
-                # If X is skew-symmetric it returns an orthogonal matrix
-                Id = torch.eye(X.size(0), device=X.device)
-                # We call contiguous because solve returns a tensor with strides that are Fortran-contiguous
-                # and autograd raises a performance warning.
-                # This happens when we remove the parametrization with leave_parametrized=True,
-                # which does a set_ with a non-contiguous tensor while the gradient is contiguous
-                return torch.linalg.solve(Id + X, Id - X).contiguous()
-
-        class Resize(nn.Module):
-            def forward(self, X):
-                return X[[0]]
-
-        class NoResize(nn.Module):
-            def forward(self, X):
-                return X
-
-        # Define a couple vector parametrizations
-        class FirstZero(nn.Module):
-            def forward(self, x):
-                return torch.cat([x.new_zeros(1), x[1:]])
-
-        class LastZero(nn.Module):
-            def forward(self, x):
-                return torch.cat([x[:-1], x.new_zeros(1)])
-
-        model = nn.Linear(8, 8)
-        initial_weight_id = id(model.weight)
-        initial_bias_id = id(model.bias)
-        initial_model = deepcopy(model)
-
-        # Test unsafe flag
-        with self.assertRaisesRegex(ValueError, "Registering a parametrization may not change the shape of the tensor"):
-            parametrize.register_parametrization(model, "weight", Resize())  # default unsafe = False
-            model(torch.ones(8, 8))
-
-        # One parametrization with unsafe=True
-        parametrize.register_parametrization(model, "weight", Resize(), unsafe=True)
-        self.assertTrue(hasattr(model, "parametrizations"))
-        self.assertTrue(parametrize.is_parametrized(model))
-        self.assertTrue(parametrize.is_parametrized(model, "weight"))
-        self.assertFalse(parametrize.is_parametrized(model, "bias"))
-        self.assertNotIn("weight", model._parameters)
-        A = model.weight
-        self.assertTrue(A.shape[0] == 1)
-        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
-        self.assertFalse(hasattr(model, "parametrizations"))
-        self.assertEqual(model.weight, initial_model.weight)
-        self.assertEqual(id(model.weight), initial_weight_id)
-        self.assertEqual(model.__class__, nn.Linear)
-
-        # Two parametrizations with unsafe=True
-        parametrize.register_parametrization(model, "weight", Resize(), unsafe=True)
-        parametrize.register_parametrization(model, "weight", NoResize(), unsafe=False)
-        self.assertTrue(hasattr(model, "parametrizations"))
-        self.assertTrue(parametrize.is_parametrized(model))
-        self.assertTrue(parametrize.is_parametrized(model, "weight"))
-        self.assertFalse(parametrize.is_parametrized(model, "bias"))
-        self.assertNotIn("weight", model._parameters)
-        A = model.weight
-        self.assertTrue(A.shape[0] == 1)
-        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
-        self.assertFalse(hasattr(model, "parametrizations"))
-        self.assertEqual(model.weight, initial_model.weight)
-        self.assertEqual(id(model.weight), initial_weight_id)
-        self.assertEqual(model.__class__, nn.Linear)
-
-        # Test unsafe flag doesn't change expected behavior
-        parametrize.register_parametrization(model, "weight", Skew(), unsafe=True)
-        self.assertTrue(hasattr(model, "parametrizations"))
-        self.assertTrue(parametrize.is_parametrized(model))
-        self.assertTrue(parametrize.is_parametrized(model, "weight"))
-        self.assertFalse(parametrize.is_parametrized(model, "bias"))
-        self.assertNotIn("weight", model._parameters)
-        # Result should be skew-symmetric
-        A = model.weight
-        self.assertEqual(A, -A.T)
-        # Remove and check consistency
-        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
-        self.assertFalse(hasattr(model, "parametrizations"))
-        self.assertEqual(model.weight, initial_model.weight)
-        self.assertEqual(id(model.weight), initial_weight_id)
-        self.assertEqual(model.__class__, nn.Linear)
-
-        # Test one parametrization
-        parametrize.register_parametrization(model, "weight", Skew())
-        self.assertTrue(hasattr(model, "parametrizations"))
-        self.assertTrue(parametrize.is_parametrized(model))
-        self.assertTrue(parametrize.is_parametrized(model, "weight"))
-        self.assertFalse(parametrize.is_parametrized(model, "bias"))
-        self.assertNotIn("weight", model._parameters)
-        # Result should be skew-symmetric
-        A = model.weight
-        self.assertEqual(A, -A.T)
-        # Remove and check consistency
-        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
-        self.assertFalse(hasattr(model, "parametrizations"))
-        self.assertEqual(model.weight, initial_model.weight)
-        self.assertEqual(id(model.weight), initial_weight_id)
-        self.assertEqual(model.__class__, nn.Linear)
-
-        # Test two parametrizations at the same time and removing them
-        parametrize.register_parametrization(model, "weight", Skew())
-        parametrize.register_parametrization(model, "weight", Orthogonal())
-        # Result should be orthogonal
-        X = model.weight
-        Id = torch.eye(X.size(0), device=X.device)
-        self.assertEqual(X.T @ X, Id)
-        # Structure tests
-        self.assertTrue(hasattr(model, "parametrizations"))
-        self.assertTrue(parametrize.is_parametrized(model))
-        self.assertTrue(parametrize.is_parametrized(model, "weight"))
-        self.assertFalse(parametrize.is_parametrized(model, "bias"))
-        self.assertIn("weight", model.parametrizations)
-        self.assertNotIn("weight", model._parameters)
-        # Remove
-        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
-        self.assertEqual(model.weight, initial_model.weight)
-        self.assertEqual(id(model.weight), initial_weight_id)
-        self.assertFalse(hasattr(model, "parametrizations"))
-        self.assertEqual(model.__class__, nn.Linear)
-
-        # Add everything
-        parametrize.register_parametrization(model, "weight", Skew())
-        parametrize.register_parametrization(model, "weight", Orthogonal())
-        parametrize.register_parametrization(model, "bias", FirstZero())
-        parametrize.register_parametrization(model, "bias", LastZero())
-
-        # Basic tests
-        self.assertTrue(parametrize.is_parametrized(model))
-        self.assertTrue(parametrize.is_parametrized(model, "weight"))
-        self.assertTrue(parametrize.is_parametrized(model, "bias"))
-        self.assertEqual(model.bias[0].item(), 0.)
-        self.assertEqual(model.bias[-1].item(), 0.)
-        self.assertEqual(len(list(model.parameters())), 2)  # Nothing weird has happpened
-        # Should not throw
-
-        sgd = torch.optim.SGD(model.parameters(), lr=0.01)
-
-        weight_copy = model.weight.clone()
-        bias_copy = model.bias.clone()
-        sgd.zero_grad()
-        (model.weight.T @ model.bias).sum().backward()
-        sgd.step()
-        self.assertNotEqual(model.weight, weight_copy)
-        self.assertNotEqual(model.bias, bias_copy)
-
-        # Remove first parametrization.
-        # Check that the model is still parametrized and so is the second parameter
-        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
-        self.assertTrue(parametrize.is_parametrized(model))             # Still parametrized
-        self.assertFalse(parametrize.is_parametrized(model, "weight"))  # Parametrization removed
-        self.assertTrue(parametrize.is_parametrized(model, "bias"))     # Still parametrized
-        self.assertEqual(model.bias[0].item(), 0.)                      # Still parametrized
-        self.assertEqual(model.bias[-1].item(), 0.)                     # Still parametrized
-        self.assertNotEqual(model.weight, initial_model.weight)         # Has been updated
-        self.assertEqual(id(model.weight), initial_weight_id)           # Keeps the same id
-        self.assertEqual(len(list(model.parameters())), 2)              # Nothing weird has happened
-        # Should not throw
-        weight_copy = model.weight.clone()
-        bias_copy = model.bias.clone()
-        sgd.zero_grad()
-        (model.weight.T @ model.bias).sum().backward()
-        sgd.step()
-        self.assertNotEqual(model.weight, weight_copy)
-        self.assertNotEqual(model.bias, bias_copy)
-
-        # Remove the second parametrization.
-        # Check that the module is not parametrized
-        parametrize.remove_parametrizations(model, "bias", leave_parametrized=False)
-        self.assertFalse(parametrize.is_parametrized(model))  # Not parametrized
-        self.assertNotEqual(model.bias, initial_model.bias)   # Has been updated
-        self.assertNotEqual(model.bias[0].item(), 0.)         # Not parametrized
-        self.assertNotEqual(model.bias[-1].item(), 0.)        # Not parametrized
-        self.assertEqual(id(model.bias), initial_bias_id)     # Keeps the same id
-        self.assertFalse(hasattr(model, "parametrizations"))  # Not parametrized the module
-        self.assertEqual(model.__class__, nn.Linear)          # Resores the previous class
-        self.assertEqual(len(list(model.parameters())), 2)    # Nothing weird has happeed
-
-        # Should not throw things are updated
-        weight_copy = model.weight.clone()
-        bias_copy = model.bias.clone()
-        sgd.zero_grad()
-        (model.weight.T @ model.bias).sum().backward()
-        sgd.step()
-        self.assertNotEqual(model.weight, weight_copy)
-        self.assertNotEqual(model.bias, bias_copy)
-
-        # Test leave_parametrized=True
-        for _ in range(2):
-            parametrize.register_parametrization(model, "weight", Skew())
-            parametrize.register_parametrization(model, "weight", Orthogonal())
-            parametrize.remove_parametrizations(model, "weight", leave_parametrized=True)
-            # We didn't change the dtype nor had multiple inputs, so the id should be the same
-            self.assertEqual(id(model.weight), initial_weight_id)
-            self.assertEqual(id(model.bias), initial_bias_id)
-
-            # Should not throw. Things are updated
-            weight_copy = model.weight.clone()
-            bias_copy = model.bias.clone()
-            sgd.zero_grad()
-            (model.weight.T @ model.bias).sum().backward()
-            sgd.step()
-            self.assertNotEqual(model.weight, weight_copy)
-            self.assertNotEqual(model.bias, bias_copy)
-
-    def test_register_and_remove_nested_parametrization(self):
-        r"""Test that it is possible to nest the parametrizations
-        meaning that the original param is parametrized again
-        """
-        class Skew(nn.Module):
-            def forward(self, X):
-                X = X.tril(-1)
-                return X - X.T
-
-        model = nn.Linear(8, 8)
-        # Add top level parametrization
-        parametrize.register_parametrization(model, "weight", Skew())
-        self.assertTrue(hasattr(model, "parametrizations"))
-        self.assertTrue(parametrize.is_parametrized(model))
-        self.assertTrue(parametrize.is_parametrized(model, "weight"))
-        self.assertFalse(parametrize.is_parametrized(model, "bias"))
-        self.assertNotIn("weight", model._parameters)
-        # Result should be skew-symmetric
-        A = model.weight
-        self.assertEqual(A, -A.T)
-
-        # Add nested parametrization
-        param_mod = model.parametrizations.weight
-        self.assertFalse(hasattr(param_mod, "parametrizations"))
-        self.assertFalse(parametrize.is_parametrized(param_mod))
-        self.assertFalse(parametrize.is_parametrized(param_mod, "original"))
-
-        parametrize.register_parametrization(param_mod, "original", Skew())
-        self.assertTrue(hasattr(param_mod, "parametrizations"))
-        self.assertTrue(parametrize.is_parametrized(param_mod))
-        self.assertTrue(parametrize.is_parametrized(param_mod, "original"))
-        self.assertNotIn("original", param_mod._parameters)
-        # Result should be skew-symmetric
-        A = param_mod.original
-        self.assertEqual(A, -A.T)
-
-        # Remove nested param and check consistency
-        parametrize.remove_parametrizations(param_mod, "original", leave_parametrized=False)
-        self.assertFalse(hasattr(param_mod, "parametrizations"))
-        self.assertEqual(param_mod.__class__, parametrize.ParametrizationList)
-
-        # Remove top level and check consistency
-        parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
-        self.assertFalse(hasattr(model, "parametrizations"))
-        self.assertEqual(model.__class__, nn.Linear)
-
-    def test_register_and_remove_buffer_parametrization(self):
-        r"""Test that it is possible to add and remove parametrizations on buffers"""
-        # Define a couple vector parametrizations
-        class FirstZero(nn.Module):
-            def forward(self, x):
-                return torch.cat([x.new_zeros(1), x[1:]])
-
-        class LastZero(nn.Module):
-            def forward(self, x):
-                return torch.cat([x[:-1], x.new_zeros(1)])
-
-        model = nn.Linear(8, 8)
-
-        # Instantiate parametrizations on buffers. It should work as expected
-        delattr(model, "bias")
-        model.register_buffer("bias", torch.ones(8))
-        parametrize.register_parametrization(model, "bias", FirstZero())
-        parametrize.register_parametrization(model, "bias", LastZero())
-        self.assertTrue(parametrize.is_parametrized(model))
-        self.assertTrue(parametrize.is_parametrized(model, "bias"))
-        self.assertEqual(model.bias[0].item(), 0.)
-        self.assertEqual(model.bias[-1].item(), 0.)
-        self.assertTrue((model.bias[1:-1] == torch.ones(6)).all())
-        self.assertEqual(len(list(model.parameters())), 1)
-
-        # Remove parametrizations on buffers. It should work as expected
-        parametrize.remove_parametrizations(model, "bias", leave_parametrized=True)
-        self.assertFalse(parametrize.is_parametrized(model))
-        self.assertFalse(parametrize.is_parametrized(model, "bias"))
-        self.assertEqual(model.bias[0].item(), 0.)
-        self.assertEqual(model.bias[-1].item(), 0.)
-        self.assertTrue((model.bias[1:-1] == torch.ones(6)).all())
-        self.assertEqual(len(list(model.parameters())), 1)
-
-    # FIXME: Rewrite this test using functions not depending on LAPACK
-    #        and remove the `@skipIfNoLapack` (see #70995)
-    @skipIfNoLapack
-    def test_serialization_parametrization(self):
-        r"""Test that it is possible to serialize a parametrized model via state_dict"""
-        # A stateful parametrization
-        class Orthogonal(nn.Module):
-            def __init__(self, n):
-                super().__init__()
-                self.register_buffer("id", torch.eye(n))
-                self.register_buffer("B", torch.empty(n, n))
-                init.orthogonal_(self.B)
-
-            def forward(self, X):
-                A = X.triu(1)
-                A = A - A.T
-                return self.B @ torch.linalg.solve(self.id + A, self.id - A)
-
-        def get_model():
-            model = torch.nn.Sequential(
-                torch.nn.Linear(5, 5),
-                torch.nn.ReLU(),
-                torch.nn.Linear(5, 1),
-            )
-
-            parametrize.register_parametrization(model[0], "weight", Orthogonal(5))
-            return model
-
-        model = get_model()
-
-        prev_weight = model[0].weight
-        prev_B = model[0].parametrizations.weight[0].B
-
-        new_model = get_model()
-        with TemporaryFileName() as fname:
-            torch.save(model.state_dict(), fname)
-            new_model.load_state_dict(torch.load(fname))
-
-        # Integrity tests
-        self.assertTrue(parametrize.is_parametrized(new_model[0], "weight"))
-        self.assertEqual(prev_weight, new_model[0].weight)
-        self.assertEqual(prev_B, new_model[0].parametrizations.weight[0].B)
-
-        # Trying to save the whole parametrized model raises
-        with self.assertRaisesRegex(RuntimeError, "state_dict"):
-            with TemporaryFileName() as fname:
-                torch.save(model, fname)
-
-    # FIXME: Rewrite this test using functions not depending on LAPACK
-    #        and remove the `@skipIfNoLapack` (see #70995)
-    @skipIfNoLapack
-    def test_initialization_parametrization(self):
-        r"""Test that it is possible to initialize a parametrization when it
-            implements a `right_inverse` method
-        """
-        class Skew(nn.Module):
-            def forward(self, X):
-                A = X.triu(1)
-                return A - A.T
-
-            def is_skew(self, A):
-                return torch.allclose(A, -A.T, atol=1e-6)
-
-            def right_inverse(self, X):
-                if not self.is_skew(X):
-                    raise ValueError("The matrix is not skew-symmetric.")
-                return X.triu(1)
-
-        # Implements a Cayley map where right_inverse is not quite the inverse of forward
-        class Orthogonal(nn.Module):
-            def __init__(self, n):
-                super().__init__()
-                self.register_buffer("B", torch.eye(n))
-
-            def forward(self, X):
-                Id = torch.eye(X.size(0))
-                return self.B @ torch.linalg.solve(Id + X, Id - X)
-
-            def is_orthogonal(self, X):
-                Id = torch.eye(X.size(0))
-                return torch.allclose(X.T @ X, Id, atol=1e-4)
-
-            def right_inverse(self, X):
-                if not self.is_orthogonal(X):
-                    raise ValueError("The input is not orthogonal.")
-                # cayley(0) == Id, so B @ cayley(0) == B
-                self.B = X
-                return torch.zeros_like(X)
-
-        N = 5
-        model = nn.Linear(N, N)
-        # Register the skew-symmetric constraint. The result is now skew-symmetric
-        skew = Skew()
-        # Make the weight skew-symmetric before registering the parametrization
-        with torch.no_grad():
-            model.weight.set_(skew(model.weight))
-        parametrize.register_parametrization(model, "weight", skew)
-        X = torch.rand(N, N)
-        # X is not skew-symmetric, so it throws an error
-        with self.assertRaises(ValueError):
-            model.weight = X
-        # Make X skew-symmetric
-        X = X - X.T
-        model.weight = X
-        self.assertEqual(model.parametrizations.weight.original, X.triu(1))
-        self.assertEqual(model.weight, X)
-
-        # Having several parametrizations registered should work in the same way
-        parametrize.register_parametrization(model, "weight", Orthogonal(N))
-        # Register now the Cayley map. The result is now orthogonal
-        X = torch.rand(N, N)
-        # X is not orthogonal, so it throws an error
-        with self.assertRaises(ValueError):
-            model.weight = X
-        init.orthogonal_(X)
-        model.weight = X
-        self.assertEqual(model.weight, X)
-        self.assertEqual(model.parametrizations.weight.original, torch.zeros_like(X))
-
-    def test_errors_unparametrized_tensor_parametrization(self):
-        # Test errors when registering a parametrization on an unparametrized tensor
-        module = nn.Linear(3, 4)
-        weight_init = module.weight.clone()
-
-        class Identity(nn.Module):
-            def forward(self, x):
-                return x
-
-        # Register a parametrization on a non-existing parameter throws
-        with self.assertRaisesRegex(ValueError, "does not have a parameter"):
-            parametrize.register_parametrization(module, "foo", Identity())
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # Removing parametrizations from an unparametrized tensor throws
-        with self.assertRaisesRegex(ValueError, "does not have a parametrization"):
-            parametrize.remove_parametrizations(module, "bias")
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # A correct parametrization with several outputs
-        class Sum(nn.Module):
-            def forward(self, x, y):
-                return x + y
-
-            def right_inverse(self, z):
-                return z, torch.zeros_like(z)
-
-        parametrize.register_parametrization(module, "weight", Sum())
-        # Cannot remove a parametrization with several outputs with `leave_parametrized=False`
-        with self.assertRaisesRegex(ValueError, "leave_parametrized=False"):
-            parametrize.remove_parametrizations(module, "weight", leave_parametrized=False)
-        parametrize.remove_parametrizations(module, "weight", leave_parametrized=True)
-
-        # A parametrization with an incorrect number of outputs
-        class WrongNumberParams(nn.Module):
-            def forward(self, x, y, z):
-                return x + y + z
-
-            def right_inverse(self, w):
-                return w, torch.zeros_like(w)
-
-        # Makes param(*param.right_inverse(X)) fail
-        with self.assertRaisesRegex(TypeError, "positional argument"):
-            parametrize.register_parametrization(module, "weight", WrongNumberParams())
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # A parametrization with a right_inverse that does not return a Tensor or Sequence[Tensor]
-        class WrongRightInverse(Identity):
-            def right_inverse(self, z):
-                return None
-
-        # right_inverse should return a Tensor or a Sequence[Tensor]
-        with self.assertRaisesRegex(ValueError, "Tensor or a Sequence of"):
-            parametrize.register_parametrization(module, "weight", WrongRightInverse())
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # If it's a sequence, it must to be a sequence of tensors
-        class WrongRightInverseSequence(nn.Module):
-            def forward(self, x, y):
-                return x
-
-            def right_inverse(self, z):
-                return None, z
-
-        with self.assertRaisesRegex(ValueError, "of the sequence with type"):
-            parametrize.register_parametrization(module, "weight", WrongRightInverseSequence())
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # A parametrization from one tensor to one tensor that changes the dtype
-        class ChangeDtypeInverse(nn.Module):
-            def forward(self, x):
-                return x.float()
-
-            def right_inverse(self, w):
-                return w.bool()
-
-        # For parametrizations that return one tensor, right_inverse may not change the dtype
-        with self.assertRaisesRegex(ValueError, "outputs one tensor, it may not change the dtype"):
-            parametrize.register_parametrization(module, "weight", ChangeDtypeInverse())
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # Doesn't return a tensor
-        class NotTensor(nn.Module):
-            def forward(self, x):
-                return 2
-
-        # Forward must return a tensor
-        with self.assertRaisesRegex(ValueError, "must return a tensor"):
-            parametrize.register_parametrization(module, "weight", NotTensor())
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # A parametrization from one tensor to one tensor that changes the dtype
-        class ChangeDtype(nn.Module):
-            def forward(self, x):
-                return x.bool()
-
-        # forward should not change the initial dtype
-        with self.assertRaisesRegex(ValueError, "may not change the dtype"):
-            parametrize.register_parametrization(module, "weight", ChangeDtype())
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # Change shape
-        class ChangeShape(nn.Module):
-            def forward(self, x):
-                return x[:-1]
-
-        # forward should not change the original shape
-        with self.assertRaisesRegex(ValueError, "may not change the shape"):
-            parametrize.register_parametrization(module, "weight", ChangeShape())
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # Many to one that changes dtype
-        class ChangeDtypeMulti(nn.Module):
-            def forward(self, x, y):
-                return (x + y).bool()
-
-            def right_inverse(self, w):
-                return w, w + 1
-
-        # forward should not change the original shape even for parametrizations with many inputs
-        with self.assertRaisesRegex(ValueError, "may not change the dtype"):
-            parametrize.register_parametrization(module, "weight", ChangeDtypeMulti())
-        self.assertFalse(parametrize.is_parametrized(module))
-
-        # Returning a sequence of size one, although weird, it's correct
-        class SequenceLen1(nn.Module):
-            def forward(self, x):
-                return x
-
-            def right_inverse(self, w):
-                return (w,)
-
-        parametrize.register_parametrization(module, "weight", SequenceLen1())
-        self.assertTrue(hasattr(module.parametrizations.weight, "original0"))
-        self.assertFalse(hasattr(module.parametrizations.weight, "original1"))
-        _ = module.weight   # Does not throw
-        self.assertTrue(parametrize.is_parametrized(module))
-        parametrize.remove_parametrizations(module, "weight", leave_parametrized=True)
-
-        # None of the operations above should have altered the weight
-        self.assertFalse(parametrize.is_parametrized(module))
-        self.assertEqual(module.weight, weight_init)
-
-    def test_errors_parametrized_tensor_parametrization(self):
-        # Test errors when registering a parametrization on a parametrized tensor
-
-        class Identity(nn.Module):
-            def forward(self, x):
-                return x
-
-        module = nn.Linear(3, 4)
-        parametrize.register_parametrization(module, "weight", Identity())
-
-        # Has to return a tensor
-        class WrongReturn(nn.Module):
-            def forward(self, x):
-                return x, x
-
-        with self.assertRaisesRegex(ValueError, "must return a tensor"):
-            parametrize.register_parametrization(module, "weight", WrongReturn())
-        self.assertTrue(parametrize.is_parametrized(module))
-        self.assertEqual(len(module.parametrizations.weight), 1)
-        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
-
-        # Cannot change dtype
-        class ChangeDtype(nn.Module):
-            def forward(self, x):
-                return x.bool()
-
-        with self.assertRaisesRegex(ValueError, "may not change the dtype"):
-            parametrize.register_parametrization(module, "weight", ChangeDtype())
-        self.assertTrue(parametrize.is_parametrized(module))
-        self.assertEqual(len(module.parametrizations.weight), 1)
-        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
-
-        # Cannot change shape
-        class ChangeShape(nn.Module):
-            def forward(self, x):
-                return x[:-1]
-
-        with self.assertRaisesRegex(ValueError, "may not change the shape"):
-            parametrize.register_parametrization(module, "weight", ChangeShape())
-        self.assertTrue(parametrize.is_parametrized(module))
-        self.assertEqual(len(module.parametrizations.weight), 1)
-        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
-
-        # The following checks are mostly due to bugs in the code of the parametrization
-
-        # right_inverse has to return a tensor
-        class WrongReturnInverse(Identity):
-            def right_inverse(self, x):
-                return x, x
-
-        with self.assertRaisesRegex(ValueError, "right_inverse must return a tensor"):
-            parametrize.register_parametrization(module, "weight", WrongReturnInverse())
-        self.assertTrue(parametrize.is_parametrized(module))
-        self.assertEqual(len(module.parametrizations.weight), 1)
-        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
-
-        # Cannot change dtype
-        class ChangeDtypeInverse(Identity):
-            def right_inverse(self, x):
-                return x.bool()
-
-        with self.assertRaisesRegex(ValueError, "must have the same dtype"):
-            parametrize.register_parametrization(module, "weight", ChangeDtypeInverse())
-        self.assertTrue(parametrize.is_parametrized(module))
-        self.assertEqual(len(module.parametrizations.weight), 1)
-        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
-
-        # Cannot change shape
-        class ChangeShapeInverse(Identity):
-            def right_inverse(self, x):
-                return x[:-1]
-
-        with self.assertRaisesRegex(ValueError, "must have the same shape"):
-            parametrize.register_parametrization(module, "weight", ChangeShapeInverse())
-        self.assertTrue(parametrize.is_parametrized(module))
-        self.assertEqual(len(module.parametrizations.weight), 1)
-        self.assertTrue(isinstance(module.parametrizations.weight[0], Identity))
-
-    # FIXME: Rewrite this test using functions not depending on LAPACK
-    #        and remove the `@skipIfNoLapack` (see #70995)
-    @skipIfNoLapack
-    def test_multiple_inputs_parametrization(self):
-        # A parametrization with several outputs
-        class RankOne(nn.Module):
-            def forward(self, x, y):
-                # Form a rank-1 matrix from a pair of vectors
-                return x.unsqueeze(-1) @ y.unsqueeze(-2)
-
-            def right_inverse(self, Y):
-                # We project the given matrix onto the rank 1 matrices
-                U, S, Vh = torch.linalg.svd(Y, full_matrices=False)
-                # S is ordered in a decreasing way.
-                s0_sqrt = S[0].sqrt().unsqueeze(-1)
-                return U[..., :, 0] * s0_sqrt, Vh[..., 0, :] * s0_sqrt
-
-        # Simple parametrisation
-        class Double(nn.Module):
-            def forward(self, x):
-                return 2.0 * x
-
-            def right_inverse(self, w):
-                return 0.5 * w
-
-        model = nn.Linear(3, 3)
-        # Test one parametrization
-        parametrize.register_parametrization(model, "weight", RankOne())
-        self.assertTrue(hasattr(model, "parametrizations"))
-        self.assertTrue(parametrize.is_parametrized(model))
-        self.assertTrue(parametrize.is_parametrized(model, "weight"))
-        self.assertTrue(hasattr(model.parametrizations.weight, "original0"))
-        self.assertIn("original0", model.parametrizations.weight._parameters)
-        self.assertTrue(hasattr(model.parametrizations.weight, "original1"))
-        self.assertIn("original1", model.parametrizations.weight._parameters)
-        self.assertFalse(parametrize.is_parametrized(model, "bias"))
-        self.assertNotIn("weight", model._parameters)
-        # Result should be rank 1
-        self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1)
-
-        with self.assertRaisesRegex(ValueError, "leave_parametrized=False"):
-            # Cannot remove a parametrization with multiple inputs and not leave it parametrized
-            parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
-        # Remove parametrization and check consistency
-        parametrize.remove_parametrizations(model, "weight", leave_parametrized=True)
-        self.assertFalse(hasattr(model, "parametrizations"))
-        self.assertEqual(model.__class__, nn.Linear)
-        self.assertFalse(parametrize.is_parametrized(model))
-        self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1)
-        self.assertIn("weight", model._parameters)
-
-        # Registering parametrizations with one input on top of one with multiple inputs should work
-        init_weight = model.weight.clone()
-        parametrize.register_parametrization(model, "weight", RankOne())
-        # Projecting a rank 1 matrix onto the matrices of rank one does not change the matrix
-        self.assertEqual(init_weight, model.weight)
-        parametrize.register_parametrization(model, "weight", Double())
-        # The matrix now is twice the initial matrix
-        self.assertEqual(2.0 * init_weight, model.weight)
-        # Multiplying by a scalar does not change the rank
-        self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1)
-
-        # The model has now three parameters
-        self.assertEqual(len(list(model.parameters())), 3)
-
-        sgd = torch.optim.SGD(model.parameters(), lr=0.1)
-
-        # Test backward. Should not throw
-        for _ in range(2):
-            sgd.zero_grad()
-            loss = (model.weight.T @ model.bias).sum()
-            loss.backward()
-            sgd.step()
-
-        # Same drill as before, removing should work as expected
-        with self.assertRaisesRegex(ValueError, "leave_parametrized=False"):
-            # Cannot remove a parametrization with multiple inputs and not leave it parametrized
-            parametrize.remove_parametrizations(model, "weight", leave_parametrized=False)
-        # Remove parametrization and check consistency
-        parametrize.remove_parametrizations(model, "weight", leave_parametrized=True)
-        self.assertFalse(hasattr(model, "parametrizations"))
-        self.assertEqual(model.__class__, nn.Linear)
-        self.assertFalse(parametrize.is_parametrized(model))
-        self.assertEqual(torch.linalg.matrix_rank(model.weight).item(), 1)
-        self.assertIn("weight", model._parameters)
-
-        # The model has now two parameters
-        self.assertEqual(len(list(model.parameters())), 2)
-
-        # Test backward. Should not throw
-        sgd = torch.optim.SGD(model.parameters(), lr=0.1)
-        for _ in range(2):
-            sgd.zero_grad()
-            loss = (model.weight.T @ model.bias).sum()
-            loss.backward()
-            sgd.step()
-
-    # FIXME: Rewrite this test using functions not depending on LAPACK
-    #        and remove the `@skipIfNoLapack` (see #70995)
-    @skipIfNoLapack
-    def test_caching_parametrization(self):
-        r"""Test the caching system of a parametrization"""
-        # Define a couple matrix parametrizations
-        class Skew(nn.Module):
-            def forward(self, X):
-                X = X.tril(-1)
-                return X - X.T
-
-        class Orthogonal(nn.Module):
-            def forward(self, X):
-                Id = torch.eye(X.size(0), device=X.device)
-                return torch.linalg.solve(Id + X, Id - X)
-
-        model = nn.Linear(5, 5)
-        parametrize.register_parametrization(model, "weight", Skew())
-        parametrize.register_parametrization(model, "weight", Orthogonal())
-
-        # Test that the caching system works
-        with parametrize.cached():
-            X = model.weight
-            Y = model.weight
-            self.assertEqual(id(X), id(Y))
-
-    # FIXME: Rewrite this test using functions not depending on LAPACK
-    #        and remove the `@skipIfNoLapack` (see #70995)
-    @skipIfNoLapack
-    def test_caching_parametrization_with_transfer_parametrizations_and_params(self):
-        r"""Test that transferring parametrizations doesn't cause issues with caching"""
-        class Skew(nn.Module):
-            def forward(self, X):
-                X = X.tril(-1)
-                return X - X.T
-
-        class Orthogonal(nn.Module):
-            def forward(self, X):
-                Id = torch.eye(X.size(0), device=X.device)
-                return torch.linalg.solve(Id + X, Id - X)
-
-        model = nn.Linear(5, 5)
-        parametrize.register_parametrization(model, "weight", Skew())
-        parametrize.register_parametrization(model, "weight", Orthogonal())
-
-        to_model = nn.Linear(5, 5)
-        parametrize.transfer_parametrizations_and_params(model, to_model)
-
-        with parametrize.cached():
-            X = model.weight
-            Y = model.weight
-            self.assertEqual(id(X), id(Y))
-
-            A = to_model.weight
-            B = to_model.weight
-            self.assertEqual(id(A), id(B))
-
-            # test that the results are distinct objects for each module
-            self.assertNotEqual(id(A), id(X))
-
-    def test_parametrization_same_training_mode(self):
-        r"""Test training mode updated on parametrization registration"""
-        class Identity(nn.Module):
-            def forward(self, X):
-                return X
-
-        module = nn.Linear(4, 4)
-        module.eval()
-        parametrize.register_parametrization(module, "weight", Identity())
-        self.assertFalse(module.parametrizations.weight[0].training)
-        module.train()
-        parametrize.register_parametrization(module, "weight", Identity().eval())
-        self.assertTrue(module.parametrizations.weight[0].training)
-        self.assertTrue(module.parametrizations.weight[1].training)
-
-    def test_type_before_parametrizations(self):
-        r"""Test that type_before_parametrizations always retrieves original type"""
-
-        class Identity(nn.Module):
-            def forward(self, X):
-                return X
-
-        model = nn.Linear(5, 5)
-        original_type = type(model)
-        self.assertTrue(
-            parametrize.type_before_parametrizations(model) == original_type
-        )
-        parametrize.register_parametrization(model, "weight", Identity())
-        self.assertTrue(
-            parametrize.type_before_parametrizations(model) == original_type
-        )
-
-    def test_deepcopy_after_parametrization(self):
-        r"""Test that we are able to create a deepcopy of the module when it's parametrized."""
-
-        class AddOne(nn.Module):
-            def forward(self, x):
-                return x + 1.0
-
-        class ModelWithoutDeepcopy(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.weight = nn.Parameter(torch.tensor([1., 1., 1., 1.]), requires_grad=True)
-                self.bias = nn.Parameter(torch.tensor([0., 0., 0., 0.]), requires_grad=True)
-                self.attr = [1.0, 2.0, 3.0, 4.0]
-
-        class ActualModel(ModelWithoutDeepcopy):
-            # Emulate custom implementation of the deepcopying.
-            def __deepcopy__(self, memo):
-                result = self.__new__(self.__class__)
-                memo[id(self)] = result
-                result.__dict__ = deepcopy(self.__dict__, memo)
-                return result
-
-        def check_deepcopy(m1: nn.Module, m2: nn.Module):
-            w1 = m1.parametrizations.weight.original
-            w2 = m2.parametrizations.weight.original
-            b1 = m1.parametrizations.bias.original if parametrize.is_parametrized(m1, "bias") else m1.bias
-            b2 = m2.parametrizations.bias.original if parametrize.is_parametrized(m2, "bias") else m2.bias
-            # Weights, biases and attributes should be equal but they must be different objects.
-            self.assertEqual(m1.__dict__.keys(), m2.__dict__.keys())
-            self.assertIsNot(m1, m2)
-            self.assertEqual(w1, w2)
-            self.assertIsNot(w1, w2)
-            self.assertEqual(b1, b2)
-            self.assertIsNot(b1, b2)
-            self.assertEqual(m1.attr, m2.attr)
-            self.assertIsNot(m1.attr, m2.attr)
-
-        for model in (ModelWithoutDeepcopy(), ActualModel()):
-            # General check that we are able to create deepcopy.
-            parametrize.register_parametrization(model, "weight", AddOne())
-            check_deepcopy(model, deepcopy(model))
-            # Check that this works on models with several parametrized tensors.
-            parametrize.register_parametrization(model, "bias", AddOne())
-            check_deepcopy(model, deepcopy(model))
-            # Check that this works on models where tensors have more than one parametrization.
-            parametrize.register_parametrization(model, "weight", AddOne())
-            check_deepcopy(model, deepcopy(model))
-
-    def test_transfer_parametrizations_and_params(self):
-        r"""Test that all parametrizations and their associated parameters are transferred."""
-
-        class AddOne(nn.Module):
-            def forward(self, x):
-                return x + 1.0
-
-        class Double(nn.Module):
-            def forward(self, x):
-                return 2.0 * x
-
-            def right_inverse(self, x):
-                return 0.5 * x
-
-        class MinusOne(nn.Module):
-            def forward(self, x):
-                return x - 1.0
-
-        model = nn.Linear(5, 5)
-        parametrize.register_parametrization(model, "weight", AddOne())
-        parametrize.register_parametrization(model, "weight", Double())
-        parametrize.register_parametrization(model, "weight", MinusOne())
-        hold_weight = model.weight
-
-        to_model = torch.ao.nn.qat.Linear(
-            5, 5, qconfig=torch.ao.quantization.get_default_qconfig()
-        )
-        parametrize.transfer_parametrizations_and_params(model, to_model)
-
-        # checks that final and original value are correct and the to_model is parametrized
-        self.assertTrue(torch.nn.utils.parametrize.is_parametrized(to_model, "weight"))
-        self.assertEqual(model.weight, to_model.weight)
-        self.assertEqual(
-            model.parametrizations.weight.original,
-            to_model.parametrizations.weight.original,
-        )
-
-        # check that the transfer didn't affect the original value
-        self.assertEqual(hold_weight, model.weight)
-
-        # testing that changes to one set of parametrizations do not affect the other
-        parametrize.remove_parametrizations(to_model, "weight")
-        self.assertFalse(torch.nn.utils.parametrize.is_parametrized(to_model, "weight"))
-        self.assertTrue(torch.nn.utils.parametrize.is_parametrized(model, "weight"))
-
-        # also test that parameters that don't exist in to_model get transferred
-        model.test_param = Parameter(torch.randn(5, 5))
-
-        self.assertTrue(not hasattr(to_model, "test_param"))
-        parametrize.register_parametrization(model, "test_param", Double())
-        hold_test_param = model.test_param
-        parametrize.transfer_parametrizations_and_params(model, to_model, "test_param")
-
-        # check that previously missing params got transferred correctly
-        self.assertEqual(model.test_param, to_model.test_param)
-        self.assertEqual(
-            model.parametrizations.test_param.original,
-            to_model.parametrizations.test_param.original,
-        )
-
-        # check that the new transfer didn't change the value for the from_module
-        self.assertEqual(hold_test_param, model.test_param)
-
-    def test_transfer_parametrizations_and_params_right_inverse(self):
-        r"""Test that all parametrizations and their associated parameters are transferred."""
-
-        class Double(nn.Module):
-            def forward(self, x):
-                return 2.0 * x
-
-            def right_inverse(self, x):
-                return 0.5 * x
-
-        model = nn.Linear(5, 5)
-        parametrize.register_parametrization(model, "weight", Double())
-        hold_weight = model.weight
-
-        to_model = torch.ao.nn.qat.Linear(
-            5, 5, qconfig=torch.ao.quantization.get_default_qconfig()
-        )
-        parametrize.transfer_parametrizations_and_params(model, to_model)
-
-        # check that transfer occurs successfully
-        self.assertEqual(model.weight, to_model.weight)
-        self.assertEqual(
-            model.parametrizations.weight.original,
-            to_model.parametrizations.weight.original,
-        )
-
-        # check that transfer doesn't affect the from_model weight
-        self.assertEqual(hold_weight, model.weight)
-
-    def test_transfer_parametrizations_and_params_single_param(self):
-        r"""Test that all parametrizations and their associated parameters are transferred."""
-
-        class AddOne(nn.Module):
-            def forward(self, x):
-                return x + 1.0
-
-        class Double(nn.Module):
-            def forward(self, x):
-                return 2.0 * x
-
-        class MinusOne(nn.Module):
-            def forward(self, x):
-                return x - 1.0
-
-        model = nn.Linear(5, 5, bias=True)
-        parametrize.register_parametrization(model, "weight", AddOne())
-        parametrize.register_parametrization(model, "weight", Double())
-        parametrize.register_parametrization(model, "weight", MinusOne())
-        parametrize.register_parametrization(model, "bias", AddOne())
-        parametrize.register_parametrization(model, "bias", Double())
-        parametrize.register_parametrization(model, "bias", MinusOne())
-
-        to_model = torch.ao.nn.qat.Linear(
-            5, 5, bias=True, qconfig=torch.ao.quantization.get_default_qconfig()
-        )
-        parametrize.transfer_parametrizations_and_params(model, to_model, "weight")
-
-        # check that weight and only weight was transferred
-        self.assertEqual(model.weight, to_model.weight)
-        self.assertEqual(
-            model.parametrizations.weight.original,
-            to_model.parametrizations.weight.original,
-        )
-        self.assertTrue("bias" not in to_model.parametrizations)
-
-    # FIXME: Rewrite this test using functions not depending on LAPACK
-    # and remove the `@skipIfNoLapack` (see #70995)
-    @skipIfNoLapack
-    def test_transfer_parametrizations_and_params_many_to_one(self):
-        # A parametrization with several outputs
-        class RankOne(nn.Module):
-            def forward(self, x, y):
-                # Form a rank-1 matrix from a pair of vectors
-                return x.unsqueeze(-1) @ y.unsqueeze(-2)
-
-            def right_inverse(self, Y):
-                # We project the given matrix onto the rank 1 matrices
-                U, S, Vh = torch.linalg.svd(Y, full_matrices=False)
-                # S is ordered in a decreasing way.
-                s0_sqrt = S[0].sqrt().unsqueeze(-1)
-                return U[..., :, 0] * s0_sqrt, Vh[..., 0, :] * s0_sqrt
-
-        class Double(nn.Module):
-            def forward(self, x):
-                return 2.0 * x
-
-        model = nn.Linear(3, 3)
-        parametrize.register_parametrization(model, "weight", RankOne())
-        parametrize.register_parametrization(model, "weight", Double())
-        hold_weight = model.weight
-
-        to_model = torch.ao.nn.qat.Linear(
-            3, 3, qconfig=torch.ao.quantization.get_default_qconfig()
-        )
-
-        parametrize.transfer_parametrizations_and_params(model, to_model)
-
-        # checks that final and original value are correct and the to_model is parametrized
-        self.assertTrue(torch.nn.utils.parametrize.is_parametrized(to_model, "weight"))
-        self.assertEqual(model.weight, to_model.weight)
-        self.assertEqual(
-            model.parametrizations.weight.original0,
-            to_model.parametrizations.weight.original0,
-        )
-        self.assertEqual(
-            model.parametrizations.weight.original1,
-            to_model.parametrizations.weight.original1,
-        )
-
-        # check that the transfer didn't affect the original value
-        self.assertEqual(hold_weight, model.weight)
-
-        # testing that changes to one set of parametrizations do not affect the other
-        model.test_param = Parameter(torch.randn(3, 3))
-
-        self.assertTrue(not hasattr(to_model, "test_param"))
-        parametrize.register_parametrization(model, "test_param", RankOne())
-        hold_test_param = model.test_param
-        parametrize.transfer_parametrizations_and_params(model, to_model, "test_param")
-
-        # also check that previously missing params got transferred correctly
-        self.assertEqual(model.test_param, to_model.test_param)
-        self.assertEqual(
-            model.parametrizations.test_param.original0,
-            to_model.parametrizations.test_param.original0,
-        )
-        self.assertEqual(
-            model.parametrizations.test_param.original1,
-            to_model.parametrizations.test_param.original1,
-        )
-
-        # check that the new transfer didn't change the value for the from_module
-        self.assertEqual(hold_test_param, model.test_param)
-
     # torch/nn/utils/prune.py
     @unittest.skipIf(not TEST_NUMPY, "numpy not found")
     def test_validate_pruning_amount_init(self):
@@ -4026,266 +2947,6 @@ def fn(weight):
 
                     gradcheck(fn, (m.weight_orig,))
 
-    def test_new_spectral_norm(self):
-        input = torch.randn(3, 5)
-        m = nn.Linear(5, 7)
-        m = torch.nn.utils.parametrizations.spectral_norm(m)
-        spectral_norm_m = m.parametrizations.weight[0]
-
-        self.assertEqual(spectral_norm_m._u.size(), torch.Size([m.weight.size(0)]))
-
-        # .parametrizations.weight.original should be trainable
-        self.assertTrue(hasattr(m.parametrizations.weight, 'original'))
-        self.assertTrue('original' in m.parametrizations.weight._parameters)
-
-        # u should be just a reused buffer
-        self.assertTrue(hasattr(spectral_norm_m, '_u'))
-        self.assertTrue('_u' in spectral_norm_m._buffers)
-        self.assertTrue('_v' in spectral_norm_m._buffers)
-
-        # weight should be a plain attribute, not counted as a buffer or a param
-        self.assertIsNotNone(m.weight)
-        self.assertFalse('weight' in m._buffers)
-        self.assertFalse('weight' in m._parameters)
-
-        # it should also be sharing storage as `weight_orig`
-        # self.assertEqual(m.parametrizations.weight.original.storage(), m.weight.storage())
-        self.assertEqual(m.parametrizations.weight.original.size(), m.weight.size())
-        self.assertEqual(m.parametrizations.weight.original.stride(), m.weight.stride())
-
-        m = torch.nn.utils.parametrize.remove_parametrizations(m, 'weight')
-
-        # spectral_norm is the only parametrization
-        self.assertFalse(hasattr(m, 'parametrizations'))
-        self.assertTrue('weight' in m._parameters)
-
-        # We can register spectral_norm multiple times on the same parameter
-        # and on multiple parameters in the same module
-        m = torch.nn.utils.parametrizations.spectral_norm(m, 'weight')
-        m = torch.nn.utils.parametrizations.spectral_norm(m, 'weight')
-        m = torch.nn.utils.parametrizations.spectral_norm(m, 'bias')
-
-        # If we remove the parametrization on bias, weight is still parametrized
-        # Removing a parametrization runs forward in eval mode if leave_parametrized=True
-        m = torch.nn.utils.parametrize.remove_parametrizations(m, 'bias')
-        self.assertTrue('bias' in m._parameters)
-        self.assertTrue(hasattr(m, 'parametrizations'))
-        self.assertFalse('weight' in m._parameters)
-
-        m = torch.nn.utils.parametrize.remove_parametrizations(m, 'weight')
-        # Neither weight and bias are parametrized
-        self.assertFalse(hasattr(m, 'parametrizations'))
-        self.assertTrue('weight' in m._parameters)
-        self.assertFalse(torch.nn.utils.parametrize.is_parametrized(m))
-
-        # test correctness in training/eval modes and cpu/multi-gpu settings
-        for apply_dp in (True, False):
-            if apply_dp:
-                if not TEST_MULTIGPU:
-                    continue
-                device = torch.device('cuda:0')
-
-                def maybe_wrap(m):
-                    return torch.nn.DataParallel(m, [0, 1])
-            else:
-                device = torch.device('cpu')
-
-                def maybe_wrap(m):
-                    return m
-
-            for requires_grad in (True, False):
-                def get_modules():
-                    m = nn.Linear(3, 4).to(device)
-                    m.weight.requires_grad_(requires_grad)
-                    m = torch.nn.utils.parametrizations.spectral_norm(m)
-                    wrapped_m = maybe_wrap(m)
-                    spectral_norm_m = m.parametrizations.weight[0]
-                    return m, wrapped_m, spectral_norm_m
-
-                input = torch.randn(2, 3, device=device)
-
-                m, wrapped_m, spectral_norm_m = get_modules()
-
-                self.assertTrue(hasattr(spectral_norm_m, '_u'))
-                u0 = spectral_norm_m._u.clone()
-                v0 = spectral_norm_m._v.clone()
-
-                # TEST TRAINING BEHAVIOR
-
-                # We perform GD first to modify the initial matrix
-                opt = torch.optim.SGD(wrapped_m.parameters(), lr=0.1)
-
-                opt.zero_grad()
-                wrapped_m(input).sum().backward()
-                opt.step()
-
-                out = wrapped_m(input)
-                if requires_grad:
-                    # run forward again and assert that u and v are updated
-                    self.assertNotEqual(u0, spectral_norm_m._u)
-                    self.assertNotEqual(v0, spectral_norm_m._v)
-
-                # assert that backprop reaches original weight
-                # can't use gradcheck because the function changes as we
-                # activate through it in training mode
-                if requires_grad:
-                    torch.autograd.grad(out.sum(), m.parametrizations.weight.original)
-
-                # test backward works with multiple forwards
-                # it uses training mode so we need to reset `u` and `v` vectors
-                # to same value at beginning for finite difference test to pass
-                saved_u = spectral_norm_m._u.clone()
-                saved_v = spectral_norm_m._v.clone()
-
-                def fn(input):
-                    spectral_norm_m._u.data.copy_(saved_u)
-                    spectral_norm_m._v.data.copy_(saved_v)
-                    out0 = wrapped_m(input)
-                    out1 = wrapped_m(input)
-                    return out0 + out1
-
-                # Make sure we can compute gradients wrt to all the parameters in the case
-                # of double forward
-                fn(input.clone().requires_grad_()).sum().backward()
-                gradcheck(fn, (input.clone().requires_grad_(),), check_batched_grad=False)
-
-                # test removing
-                # spectral norm module needs to be in eval mode if we'd like to
-                # avoid doing another power iteration
-                m, wrapped_m, _ = get_modules()
-                pre_remove_out = wrapped_m(input)
-                m.eval()
-                m = torch.nn.utils.parametrize.remove_parametrizations(m, 'weight')
-                self.assertEqual(wrapped_m(input), pre_remove_out)
-
-                torch.nn.utils.parametrizations.spectral_norm(m)
-                for _ in range(3):
-                    pre_remove_out = wrapped_m(input)
-                m.eval()
-                m = torch.nn.utils.parametrize.remove_parametrizations(m, 'weight')
-                self.assertEqual(wrapped_m(input), pre_remove_out)
-
-                # TEST EVAL BEHAVIOR
-                m, wrapped_m, spectral_norm_m = get_modules()
-                wrapped_m(input)
-                last_train_out = wrapped_m(input)
-                last_train_u = spectral_norm_m._u.clone()
-                last_train_v = spectral_norm_m._v.clone()
-                wrapped_m.zero_grad()
-                wrapped_m.eval()
-
-                eval_out0 = wrapped_m(input)
-                # assert eval gives same result as last training iteration
-                self.assertEqual(eval_out0, last_train_out)
-                # assert doing more iteartion in eval don't change things
-                self.assertEqual(eval_out0, wrapped_m(input))
-                self.assertEqual(last_train_u, spectral_norm_m._u)
-                self.assertEqual(last_train_v, spectral_norm_m._v)
-
-                # FIXME: the code below is flaky when executed with DataParallel
-                # see https://github.com/pytorch/pytorch/issues/13818
-                if apply_dp:
-                    continue
-
-                # test backward works with multiple forwards in mixed training
-                # and eval modes
-                # it uses training mode so we need to reset `u` and `v` vectors
-                # to same value at beginning for finite difference test to pass
-                saved_u = spectral_norm_m._u.clone()
-                saved_v = spectral_norm_m._v.clone()
-
-                def fn(input):
-                    spectral_norm_m._u.data.copy_(saved_u)
-                    spectral_norm_m._v.data.copy_(saved_v)
-                    wrapped_m.train()
-                    out0 = wrapped_m(input)
-                    wrapped_m.eval()
-                    out1 = wrapped_m(input)
-                    wrapped_m.train()
-                    out2 = wrapped_m(input)
-                    wrapped_m.eval()
-                    out3 = wrapped_m(input)
-                    return out0 + out1 + out2 + out3
-
-                gradcheck(fn, (input.clone().requires_grad_(),))
-
-                # assert that backprop reaches weight_orig in eval
-                if requires_grad:
-                    def fn(weight):
-                        return wrapped_m(input)
-
-                    gradcheck(fn, (m.parametrizations.weight.original,))
-
-    def test_new_spectral_norm_load_state_dict(self):
-        for activate_times in (0, 3):
-            inp = torch.randn(2, 3)
-            m = nn.Linear(3, 5)
-            snm = torch.nn.utils.parametrizations.spectral_norm(m)
-            snm.train()
-
-            for _ in range(activate_times):
-                snm(inp)
-
-            state_dict = deepcopy(snm.state_dict())
-            self.assertEqual({
-                'parametrizations.weight.original',
-                'bias',
-                'parametrizations.weight.0._v',
-                'parametrizations.weight.0._u'
-            }, set(state_dict.keys()))
-
-            # test that non-strict loading works
-            non_strict_state_dict = deepcopy(state_dict)
-            non_strict_state_dict['nonsense'] = 'nonsense'
-            with self.assertRaisesRegex(RuntimeError, r'Unexpected key\(s\) in state_dict: "nonsense"'):
-                snm.load_state_dict(non_strict_state_dict, strict=True)
-            snm.load_state_dict(non_strict_state_dict, strict=False)
-            del non_strict_state_dict['parametrizations.weight.original']
-            snm.load_state_dict(non_strict_state_dict, strict=False)
-            del non_strict_state_dict['parametrizations.weight.0._u']
-            snm.load_state_dict(non_strict_state_dict, strict=False)
-            del non_strict_state_dict['parametrizations.weight.0._v']
-            snm.load_state_dict(non_strict_state_dict, strict=False)
-            non_strict_state_dict['weight'] = snm.weight.detach().clone()     # set W as a buffer
-            snm.load_state_dict(non_strict_state_dict, strict=False)
-            del non_strict_state_dict._metadata['parametrizations.weight.0']  # remove metadata info
-            snm.load_state_dict(non_strict_state_dict, strict=False)
-            del non_strict_state_dict['weight']                               # remove W buffer
-            snm.load_state_dict(non_strict_state_dict, strict=False)
-            del non_strict_state_dict['bias']
-            snm.load_state_dict(non_strict_state_dict, strict=False)
-
-            # normal state_dict
-
-            # test that re-wrapping does not matter
-            m = torch.nn.utils.parametrize.remove_parametrizations(snm, 'weight')
-            snm = torch.nn.utils.parametrizations.spectral_norm(m)
-
-            snm.load_state_dict(state_dict)
-            with torch.no_grad():
-                snm.eval()
-                out0_eval = snm(inp)
-                snm.train()
-                out1_train = snm(inp)
-                out2_train = snm(inp)
-                snm.eval()
-                out3_eval = snm(inp)
-
-            # test that re-wrapping does not matter
-            m = torch.nn.utils.parametrize.remove_parametrizations(snm, 'weight')
-            snm = torch.nn.utils.parametrizations.spectral_norm(m)
-
-            # Test normal loading
-            snm.load_state_dict(state_dict)
-            with torch.no_grad():
-                snm.eval()
-                self.assertEqual(out0_eval, snm(inp))
-                snm.train()
-                self.assertEqual(out1_train, snm(inp))
-                self.assertEqual(out2_train, snm(inp))
-                snm.eval()
-                self.assertEqual(out3_eval, snm(inp))
-
     @skipIfNoLapack
     def test_spectral_norm_load_state_dict(self):
         inp = torch.randn(2, 3)
@@ -4393,16 +3054,6 @@ def test_spectral_norm_dim(self):
         # check that u refers to the same dimension
         self.assertEqual(m.weight_u.shape, m.weight_orig[0, :, 0, 0].shape)
 
-    def test_new_spectral_norm_dim(self):
-        inp = torch.randn(2, 3, 10, 12)
-        m = nn.ConvTranspose2d(3, 4, (5, 6))
-        m = torch.nn.utils.parametrizations.spectral_norm(m)
-        snm = m.parametrizations.weight[0]
-        # this should not run into incompatible shapes
-        x = m(inp)
-        # check that u refers to the same dimension
-        self.assertEqual(snm._u.shape, m.parametrizations.weight.original[0, :, 0, 0].shape)
-
     def test_spectral_norm_forward(self):
         input = torch.randn(3, 5)
         m = nn.Linear(5, 7)
@@ -4419,164 +3070,11 @@ def test_spectral_norm_forward(self):
         expect_out = m(input)
         self.assertEqual(expect_out, out_hat)
 
-    def test_new_spectral_norm_forward(self):
-        input = torch.randn(3, 5)
-        m = nn.Linear(5, 7)
-        m = torch.nn.utils.parametrizations.spectral_norm(m)
-        snm = m.parametrizations.weight[0]
-        # naive forward
-        _weight = m.parametrizations.weight.original
-        _bias, _v = m.bias, snm._v
-        _weight_mat = _weight.view(_weight.size(0), -1)
-        _u = torch.mv(_weight_mat, _v)
-        _u = F.normalize(_u, dim=0, eps=1e-12)
-        _v = torch.mv(_weight_mat.t(), _u)
-        _v = F.normalize(_v, dim=0, eps=1e-12)
-        _weight.data /= torch.dot(_u, torch.matmul(_weight_mat, _v))
-        out_hat = torch.nn.functional.linear(input, _weight, _bias)
-        expect_out = m(input)
-        self.assertEqual(expect_out, out_hat)
-
     def test_spectral_norm_pickle(self):
         m = torch.nn.utils.spectral_norm(nn.Linear(5, 7))
         m = pickle.loads(pickle.dumps(m))
         self.assertIsInstance(m, nn.Linear)
 
-    @skipIfNoLapack
-    def test_orthogonal_parametrization(self):
-        # Orthogonal implements 6 algorithms (3x parametrizations times 2 options of use_trivialization)
-
-        def assert_is_orthogonal(X):
-            n, k = X.size(-2), X.size(-1)
-            if n < k:
-                X = X.mT
-                n, k = k, n
-            Id = torch.eye(k, dtype=X.dtype, device=X.device).expand(*(X.size()[:-2]), k, k)
-            eps = 10 * n * torch.finfo(X.dtype).eps
-            torch.testing.assert_close(X.mH @ X, Id, atol=eps, rtol=0.)
-
-
-        def assert_weight_allclose_Q(weight, W):
-            # Test that weight is equal to the Q part of the QR decomposition of W
-            # (or of its transpose if the matrix is wide)
-            wide_matrix = W.size(-2) < W.size(-1)
-            if wide_matrix:
-                W = W.mT
-            Q, R = torch.linalg.qr(W)
-            Q *= R.diagonal(dim1=-2, dim2=-1).sgn().unsqueeze(-2)
-            if wide_matrix:
-                Q = Q.mT
-            torch.testing.assert_close(Q, weight, atol=1e-5, rtol=0.)
-
-
-        for shape, dtype, use_linear in product(((4, 4), (5, 3), (3, 5)),  # square/ tall / wide
-                                                (torch.float32, torch.complex64),
-                                                (True, False)):
-            # Conv2d does not support complex yet
-            if not use_linear:
-                continue
-
-            if use_linear:
-                input = torch.randn(3, shape[0], dtype=dtype)
-            else:
-                input = torch.randn(2, 2, shape[0] + 2, shape[1] + 1, dtype=dtype)
-
-            for parametrization, use_trivialization in product(("matrix_exp", "cayley", "householder"),
-                                                               (False, True)):
-                # right_inverse for Cayley and matrix_exp not implemented for use_trivialization=False
-                # See Note [right_inverse expm cayley]
-                can_initialize = use_trivialization or parametrization == "householder"
-
-                # We generate them every time to always start with fresh weights
-                if use_linear:
-                    m = nn.Linear(*shape, dtype=dtype)
-                else:
-                    m = nn.Conv2d(2, 3, shape, dtype=dtype)
-
-                # We do not support householder for complex inputs
-                # See Note [Householder complex]
-                w_init = m.weight.clone()
-                if parametrization == "householder" and m.weight.is_complex():
-                    msg = "householder parametrization does not support complex tensors"
-                    with self.assertRaisesRegex(ValueError, msg):
-                        torch.nn.utils.parametrizations.orthogonal(m,
-                                                                   "weight",
-                                                                   parametrization,
-                                                                   use_trivialization=use_trivialization)
-                    continue
-
-                wide_matrix = w_init.size(-2) < w_init.size(-1)
-                torch.nn.utils.parametrizations.orthogonal(m,
-                                                           "weight",
-                                                           parametrization,
-                                                           use_trivialization=use_trivialization)
-                # Forwards works as expected
-                self.assertEqual(w_init.shape, m.weight.shape)
-                assert_is_orthogonal(m.weight)
-                if can_initialize:
-                    assert_weight_allclose_Q(m.weight, w_init)
-
-                # Intializing with a given orthogonal matrix works
-                X = torch.randn_like(m.weight)
-                if wide_matrix:
-                    X = X.mT
-                w_new = torch.linalg.qr(X).Q
-                if wide_matrix:
-                    w_new = w_new.mT
-                if can_initialize:
-                    m.weight = w_new
-                    torch.testing.assert_close(w_new, m.weight, atol=1e-5, rtol=0.)
-                else:
-                    msg = "assign to the matrix exponential or the Cayley parametrization"
-                    with self.assertRaisesRegex(NotImplementedError, msg):
-                        m.weight = w_new
-
-                # Intializing with a non-orthogonal matrix makes m.weight be the Q part of the given matrix
-                w_new = torch.randn_like(m.weight)
-                if can_initialize:
-                    m.weight = w_new
-                    assert_weight_allclose_Q(m.weight, w_new)
-                else:
-                    msg = "assign to the matrix exponential or the Cayley parametrization"
-                    with self.assertRaisesRegex(NotImplementedError, msg):
-                        m.weight = w_new
-
-                opt = torch.optim.SGD(m.parameters(), lr=0.1)
-                for _ in range(2):
-                    opt.zero_grad()
-                    m(input).norm().backward()
-                    grad = m.parametrizations.weight.original.grad
-                    self.assertIsNotNone(grad)
-                    # We do not update the upper triangular part of the matrix if tall tril if wide
-                    if grad.size(-2) >= grad.size(-1):
-                        zeros_grad = grad.triu(1)
-                    else:
-                        zeros_grad = grad.tril(-1)
-                    self.assertEqual(zeros_grad, torch.zeros_like(zeros_grad))
-                    # The gradient in the diagonal can only be imaginary because a skew-Hermitian
-                    # matrix has imaginary diagonal
-                    diag_grad = grad.diagonal(dim1=-2, dim2=-1)
-                    if grad.is_complex():
-                        diag_grad = diag_grad.real
-                    self.assertEqual(diag_grad, torch.zeros_like(diag_grad))
-                    opt.step()
-                    assert_is_orthogonal(m.weight)
-
-    @skipIfNoLapack
-    def test_orthogonal_errors(self):
-        m = nn.Linear(3, 4)
-        with self.assertRaisesRegex(ValueError, "has to be one of"):
-            torch.nn.utils.parametrizations.orthogonal(m, "weight", "foo")
-
-        with self.assertRaisesRegex(ValueError, "Expected a matrix"):
-            torch.nn.utils.parametrizations.orthogonal(m, "bias")
-
-        torch.nn.utils.parametrizations.orthogonal(m, "weight")
-        with self.assertRaisesRegex(ValueError, "matrices of shape"):
-            m.weight = torch.randn(5, 5)
-        torch.nn.utils.parametrize.remove_parametrizations(m, "weight")
-
-
     def test_threshold_int(self):
         x = torch.tensor([-3, -2, -1, 0, 1, 2, 3])
         expected = torch.tensor([99, 99, 99, 99, 1, 2, 3])

From 46a4b29ecb36060288d1ca0e1163322b6747524d Mon Sep 17 00:00:00 2001
From: Emilio Castillo <ecastill@preferred.jp>
Date: Wed, 23 Nov 2022 17:54:33 +0000
Subject: [PATCH 1209/1922] Add Pluggable CUDA allocator backend (#86786)

Fixes #43144

This uses the Backend system added by [82682](https://github.com/pytorch/pytorch/pull/82682) to change allocators dynamically during the code execution. This will allow us to use RMM, use CUDA managed memory for some portions of the code that do not fit in GPU memory. Write static memory allocators to reduce fragmentation while training models and improve interoperability with external DL compilers/libraries.

For example, we could have the following allocator in c++

```c++
#include <sys/types.h>
#include <cuda_runtime_api.h>
#include <iostream>

extern "C" {
void* my_malloc(ssize_t size, int device, cudaStream_t stream) {
   void *ptr;
   std::cout<<"alloc "<< size<<std::endl;
   cudaMalloc(&ptr, size);
   return ptr;
}

void my_free(void* ptr) {
   std::cout<<"free "<<std::endl;
   cudaFree(ptr);
}
}
```

Compile it as a shared library
```
nvcc allocator.cc -o alloc.so -shared --compiler-options '-fPIC'
```

And use it from PyTorch as follows

```python
import torch

# Init caching
# b = torch.zeros(10, device='cuda')
new_alloc = torch.cuda.memory.CUDAPluggableAllocator('alloc.so', 'my_malloc', 'my_free')
old = torch.cuda.memory.get_current_allocator()
torch.cuda.memory.change_current_allocator(new_alloc)
b = torch.zeros(10, device='cuda')
# This will error since the current allocator was already instantiated
torch.cuda.memory.change_current_allocator(old)
```

Things to discuss
- How to test this, needs compiling external code ...

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86786
Approved by: https://github.com/albanD
---
 build_variables.bzl                        |   1 +
 c10/cuda/CUDACachingAllocator.cpp          |   4 +
 c10/cuda/CUDACachingAllocator.h            |   1 +
 c10/cuda/CUDAMallocAsyncAllocator.cpp      |   4 +
 docs/source/cuda.rst                       |   2 +
 docs/source/notes/cuda.rst                 |  60 ++++
 torch/_C/__init__.pyi.in                   |   7 +
 torch/csrc/cuda/CUDAPluggableAllocator.cpp | 317 +++++++++++++++++++++
 torch/csrc/cuda/CUDAPluggableAllocator.h   | 135 +++++++++
 torch/csrc/cuda/Module.cpp                 | 120 ++++++++
 torch/cuda/__init__.py                     |  21 +-
 torch/cuda/memory.py                       |  73 ++++-
 torch/utils/hipify/cuda_to_hip_mappings.py |  17 ++
 13 files changed, 751 insertions(+), 11 deletions(-)
 create mode 100644 torch/csrc/cuda/CUDAPluggableAllocator.cpp
 create mode 100644 torch/csrc/cuda/CUDAPluggableAllocator.h

diff --git a/build_variables.bzl b/build_variables.bzl
index 473ed1c1de1b1..2faeed6e52d9e 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -862,6 +862,7 @@ libtorch_python_cuda_core_sources = [
     "torch/csrc/cuda/shared/cudart.cpp",
     "torch/csrc/cuda/shared/nvtx.cpp",
     "torch/csrc/cuda/utils.cpp",
+    "torch/csrc/cuda/CUDAPluggableAllocator.cpp",
 ]
 
 libtorch_python_cuda_sources = libtorch_python_cuda_core_sources + [
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 9876259522721..aaa647502a897 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -2008,6 +2008,10 @@ class NativeCachingAllocator : public CUDAAllocator {
     }
   }
 
+  bool initialized() override {
+    return device_allocator.size() > 0;
+  }
+
   /** allocates a block which is safe to use from the provided stream */
   void malloc(void** devPtr, int device, size_t size, cudaStream_t stream) {
     TORCH_INTERNAL_ASSERT(
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
index 74854b5a25fd3..41e082933d55d 100644
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -183,6 +183,7 @@ class CUDAAllocator : public Allocator {
   virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
   virtual void raw_delete(void* ptr) = 0;
   virtual void init(int device_count) = 0;
+  virtual bool initialized() = 0;
   virtual void setMemoryFraction(double fraction, int device) = 0;
   virtual void emptyCache() = 0;
   virtual void cacheInfo(int dev_id, size_t* largestBlock) = 0;
diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp
index 610342ac836bf..9edc4f87ccf31 100644
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@@ -430,6 +430,10 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     (void)called;
   }
 
+  bool initialized() {
+    return devs_initialized_flags.size() > 0;
+  }
+
   static inline void assertValidDevice(int device) {
     TORCH_CHECK(
         0 <= device && device < device_count, "Invalid device argument.");
diff --git a/docs/source/cuda.rst b/docs/source/cuda.rst
index 601bb078752fd..b14e5cec360db 100644
--- a/docs/source/cuda.rst
+++ b/docs/source/cuda.rst
@@ -114,6 +114,8 @@ Memory management
      caching_allocator_alloc
      caching_allocator_delete
      get_allocator_backend
+     CUDAPluggableAllocator
+     change_current_allocator
 .. FIXME The following doesn't seem to exist. Is it supposed to?
    https://github.com/pytorch/pytorch/issues/27785
    .. autofunction:: reset_max_memory_reserved
diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst
index 8eed57cfbd964..4a1538900a88c 100644
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@@ -430,6 +430,66 @@ Available options:
 .. _CUDA's built-in asynchronous allocator:
     https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-1/
 
+.. _cuda-memory-custom-allocator:
+
+Using custom memory allocators for CUDA
+---------------------------------------
+
+It is possible to define allocators as simple functions in C/C++ and compile
+them as a shared library, the code below shows a basic allocator that just
+traces all the memory operations.
+
+.. code:: C++
+
+   #include <sys/types.h>
+   #include <cuda_runtime_api.h>
+   #include <iostream>
+   // Compile with g++ alloc.cc -o alloc.so -I/usr/local/cuda/include -shared -fPIC
+   extern "C" {
+   void* my_malloc(ssize_t size, int device, cudaStream_t stream) {
+      void *ptr;
+      cudaMalloc(&ptr, size);
+      std::cout<<"alloc "<<ptr<<size<<std::endl;
+      return ptr;
+   }
+
+   void my_free(void* ptr, ssize_t size, cudaStream_t stream) {
+      std::cout<<"free "<<ptr<< " "<<stream<<std::endl;
+      cudaFree(ptr);
+   }
+   }
+
+
+This can be used in python through the :class:`torch.cuda.memory.CUDAPluggableAllocator`.
+The user is responsible for supplying the path to the `.so` file and the name
+of the alloc/free functions that match the signatures specified above.
+
+.. code:: python
+
+   import torch
+
+   # Load the allocator
+   new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+       'alloc.so', 'my_malloc', 'my_free')
+   # Swap the current allocator
+   torch.cuda.memory.change_current_allocator(new_alloc)
+   # This will allocate memory in the device using the new allocator
+   b = torch.zeros(10, device='cuda')
+
+
+.. code:: python
+
+   import torch
+
+   # Do an initial memory allocator
+   b = torch.zeros(10, device='cuda')
+   # Load the allocator
+   new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+       'alloc.so', 'my_malloc', 'my_free')
+   # This will error since the current allocator was already instantiated
+   torch.cuda.memory.change_current_allocator(new_alloc)
+
+
 .. _cufft-plan-cache:
 
 cuFFT plan cache
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 04553be1d44cd..cc1f1ed66714a 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1199,6 +1199,13 @@ def _cuda_resetPeakMemoryStats(device: _int) -> None: ...
 def _cuda_memorySnapshot() -> Dict[str, Any]: ...
 def _cuda_recordMemoryHistory(enabled: _bool, record_context: _bool, record_context_cpp: _bool, alloc_trace_max_entries: _int, alloc_trace_record_context: _bool) -> None: ...
 def _cuda_getAllocatorBackend() -> str: ...
+
+class _cuda_CUDAAllocator:
+    ...
+
+def _cuda_customAllocator(alloc_fn: _int, free_fn: _int) -> _cuda_CUDAAllocator: ...
+def _cuda_changeCurrentAllocator(allocator: _cuda_CUDAAllocator) -> None: ...
+def _cuda_getAllocator() -> _cuda_CUDAAllocator: ...
 def _cuda_lock_mutex() -> None: ...
 def _cuda_unlock_mutex() -> None: ...
 def _cuda_canDeviceAccessPeer(device: _int, peer_device: _int) -> _bool: ...
diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.cpp b/torch/csrc/cuda/CUDAPluggableAllocator.cpp
new file mode 100644
index 0000000000000..a64290b8a16e4
--- /dev/null
+++ b/torch/csrc/cuda/CUDAPluggableAllocator.cpp
@@ -0,0 +1,317 @@
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+
+#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
+
+namespace torch {
+namespace cuda {
+namespace CUDAPluggableAllocator {
+
+int device_count = 0;
+
+void custom_raw_deleter(void* ptr);
+
+// This is a fast API to just register allocators
+// based on function pointers (ie. external .so libraries)
+// This avoids having to link against libtorch for C++ based custom allocators
+// And also use this from python
+CUDAPluggableAllocator::CUDAPluggableAllocator(
+    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
+    std::function<void(void*, size_t, cudaStream_t)> free_fn)
+    : alloc_fn_(alloc_fn), free_fn_(free_fn) {}
+
+CUDAPluggableAllocator::CUDAPluggableAllocator(CUDAPluggableAllocator& other)
+    : alloc_fn_(other.alloc_fn_),
+      free_fn_(other.free_fn_),
+      init_fn_(other.init_fn_),
+      reset_fn_(other.reset_fn_),
+      memory_fraction_fn_(other.memory_fraction_fn_),
+      base_alloc_fn_(other.base_alloc_fn_),
+      record_stream_fn_(other.record_stream_fn_),
+      capture_begin_fn_(other.capture_begin_fn_),
+      capture_about_to_end_fn_(other.capture_about_to_end_fn_),
+      capture_ended_fn_(other.capture_ended_fn_),
+      capture_destroy_fn_(other.capture_destroy_fn_) {}
+
+void CUDAPluggableAllocator::set_init_fn(std::function<void(int)> init_fn) {
+  init_fn_ = init_fn;
+}
+
+void CUDAPluggableAllocator::set_reset_fn(std::function<void()> reset_fn) {
+  reset_fn_ = reset_fn;
+}
+
+void CUDAPluggableAllocator::set_memory_fraction_fn(
+    std::function<void(double, int)> memory_fraction_fn) {
+  memory_fraction_fn_ = memory_fraction_fn;
+}
+
+void CUDAPluggableAllocator::set_base_alloc_fn(
+    std::function<void*(void*, size_t*)> base_alloc_fn) {
+  base_alloc_fn_ = base_alloc_fn;
+}
+
+void CUDAPluggableAllocator::set_record_stream_fn(
+    std::function<void(void* ptr, cudaStream_t stream)> record_stream_fn) {
+  record_stream_fn_ = record_stream_fn;
+}
+
+void CUDAPluggableAllocator::set_capture_begin_fn(
+    std::function<void(int, c10::cuda::CaptureId_t, c10::cuda::MempoolId_t)>
+        capture_begin_fn) {
+  capture_begin_fn_ = capture_begin_fn;
+}
+
+void CUDAPluggableAllocator::set_capture_about_to_end_fn(
+    std::function<void(int, c10::cuda::CaptureId_t)> capture_about_to_end_fn) {
+  capture_about_to_end_fn_ = capture_about_to_end_fn;
+}
+
+void CUDAPluggableAllocator::set_capture_ended_fn(
+    std::function<void(int, c10::cuda::CaptureId_t)> capture_ended_fn) {
+  capture_ended_fn_ = capture_ended_fn;
+}
+
+void CUDAPluggableAllocator::set_capture_destroy_fn(
+    std::function<void(int, c10::cuda::MempoolId_t)> capture_destroy_fn) {
+  capture_destroy_fn_ = capture_destroy_fn;
+}
+
+void* CUDAPluggableAllocator::malloc(
+    size_t size,
+    int device,
+    cudaStream_t stream) {
+  void* r = alloc_fn_(size, device, stream);
+  {
+    const std::lock_guard<std::mutex> lock(allocator_mutex_);
+    allocation_metadata_.emplace(r, std::make_pair(size, stream));
+  }
+  return r;
+}
+
+c10::DataPtr CUDAPluggableAllocator::allocate(size_t size) const {
+  int device;
+  C10_CUDA_CHECK(cudaGetDevice(&device));
+  cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device);
+  void* r =
+      const_cast<CUDAPluggableAllocator*>(this)->malloc(size, device, stream);
+  c10::DataPtr data_ptr = {
+      r, r, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
+  return data_ptr;
+}
+
+c10::DeleterFnPtr CUDAPluggableAllocator::raw_deleter() const {
+  return &custom_raw_deleter;
+}
+
+void* CUDAPluggableAllocator::raw_alloc(size_t nbytes) {
+  int device;
+  C10_CUDA_CHECK(cudaGetDevice(&device));
+  cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device);
+  return malloc(nbytes, device, stream);
+}
+
+void* CUDAPluggableAllocator::raw_alloc_with_stream(
+    size_t nbytes,
+    cudaStream_t stream) {
+  int device;
+  C10_CUDA_CHECK(cudaGetDevice(&device));
+  return malloc(nbytes, device, stream);
+}
+
+void CUDAPluggableAllocator::raw_delete(void* ptr) {
+  cudaStream_t stream;
+  size_t size;
+  {
+    const std::lock_guard<std::mutex> lock(allocator_mutex_);
+    TORCH_CHECK(
+        allocation_metadata_.count(ptr),
+        "Trying to free a pointer not allocated here");
+    auto pair = allocation_metadata_[ptr];
+    size = pair.first;
+    stream = pair.second;
+    allocation_metadata_.erase(ptr);
+  }
+  free_fn_(ptr, size, stream);
+}
+
+void CUDAPluggableAllocator::init(int device_count) {
+  if (init_fn_) {
+    init_fn_(device_count);
+  }
+  initialized_ = true;
+}
+
+bool CUDAPluggableAllocator::initialized() {
+  return initialized_;
+}
+
+void CUDAPluggableAllocator::setMemoryFraction(double fraction, int device) {
+  if (memory_fraction_fn_) {
+    memory_fraction_fn_(fraction, device);
+  }
+}
+
+void CUDAPluggableAllocator::emptyCache(void) {
+  if (reset_fn_) {
+    return reset_fn_();
+  }
+}
+
+void CUDAPluggableAllocator::cacheInfo(int dev_id, size_t* largestBlock) {
+  TORCH_CHECK(
+      false,
+      "CUDAPluggableAllocator does not yet support cacheInfo. "
+      "If you need it, please file an issue describing your use case.");
+}
+
+void* CUDAPluggableAllocator::getBaseAllocation(void* ptr, size_t* size) {
+  if (base_alloc_fn_) {
+    return base_alloc_fn_(ptr, size);
+  } else {
+    return ptr;
+  }
+}
+
+void CUDAPluggableAllocator::recordStream(
+    const c10::DataPtr& ptr,
+    streamType stream) {
+  if (record_stream_fn_) {
+    record_stream_fn_(ptr.get(), stream);
+  }
+}
+
+c10::cuda::CUDACachingAllocator::DeviceStats CUDAPluggableAllocator::
+    getDeviceStats(int device) {
+  TORCH_CHECK(
+      false,
+      "CUDAPluggableAllocator does not yet support getDeviceStats. "
+      "If you need it, please file an issue describing your use case.");
+}
+
+void CUDAPluggableAllocator::resetAccumulatedStats(int device) {
+  TORCH_CHECK(
+      false,
+      "CUDAPluggableAllocator does not yet support resetAccumulatedStats. "
+      "If you need it, please file an issue describing your use case.");
+}
+
+void CUDAPluggableAllocator::resetPeakStats(int device) {
+  TORCH_CHECK(
+      false,
+      "CUDAPluggableAllocator does not yet support resetPeakStats. "
+      "If you need it, please file an issue describing your use case.");
+}
+
+c10::cuda::CUDACachingAllocator::SnapshotInfo CUDAPluggableAllocator::
+    snapshot() {
+  TORCH_CHECK(
+      false,
+      "CUDAPluggableAllocator does not yet support snapshot. "
+      "If you need it, please file an issue describing your use case.");
+}
+
+std::shared_ptr<void> CUDAPluggableAllocator::getIpcDevPtr(std::string handle) {
+  TORCH_CHECK(
+      false,
+      "CUDAPluggableAllocator does not yet support getIpcDevPtr. "
+      "If you need it, please file an issue describing your use case.");
+}
+
+// CUDAGraph interactions
+void CUDAPluggableAllocator::notifyCaptureBegin(
+    int device,
+    c10::cuda::CaptureId_t graph_id,
+    c10::cuda::MempoolId_t mempool_id) {
+  if (capture_begin_fn_) {
+    capture_begin_fn_(device, graph_id, mempool_id);
+  }
+}
+
+void CUDAPluggableAllocator::notifyCaptureAboutToEnd(
+    int device,
+    c10::cuda::CaptureId_t graph_id) {
+  if (capture_about_to_end_fn_) {
+    capture_about_to_end_fn_(device, graph_id);
+  }
+}
+
+void CUDAPluggableAllocator::notifyCaptureEnded(
+    int device,
+    c10::cuda::CaptureId_t graph_id) {
+  if (capture_ended_fn_) {
+    capture_ended_fn_(device, graph_id);
+  }
+}
+
+void CUDAPluggableAllocator::notifyCaptureDestroy(
+    int device,
+    c10::cuda::MempoolId_t mempool_id) {
+  if (capture_destroy_fn_) {
+    capture_destroy_fn_(device, mempool_id);
+  }
+}
+
+void CUDAPluggableAllocator::recordHistory(
+    bool enabled,
+    c10::cuda::CUDACachingAllocator::CreateContextFn context_recorder,
+    size_t alloc_trace_max_entries,
+    bool alloc_trace_record_context) {
+  TORCH_CHECK(
+      false,
+      "CUDAPluggableAllocator does not yet support recordHistory. "
+      "If you need it, please file an issue describing your use case.");
+}
+
+void CUDAPluggableAllocator::attachOutOfMemoryObserver(
+    c10::cuda::CUDACachingAllocator::OutOfMemoryObserver observer) {
+  TORCH_CHECK(
+      false,
+      "CUDAPluggableAllocator does not yet support attachOutOfMemoryObserver. "
+      "If you need it, please file an issue describing your use case.");
+}
+
+bool CUDAPluggableAllocator::needsPoolSpecificPeerAccess() {
+  return false;
+}
+
+std::string CUDAPluggableAllocator::name() {
+  return "pluggable";
+}
+
+std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+    current_custom_allocator;
+
+std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+getCurrentAllocator() {
+  return current_custom_allocator;
+}
+
+// TODO: add more functions in the argument
+std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+createCustomAllocator(
+    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
+    std::function<void(void*, size_t, cudaStream_t)> free_fn) {
+  std::shared_ptr<CUDAPluggableAllocator> allocator(
+      new CUDAPluggableAllocator(alloc_fn, free_fn));
+  allocator->init(device_count);
+  return allocator;
+}
+
+void changeCurrentAllocator(
+    std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator> allocator) {
+  TORCH_CHECK(
+      !getCurrentAllocator()->initialized(),
+      "Can't swap an already initialized allocator");
+  c10::cuda::CUDACachingAllocator::allocator.store(allocator.get());
+  current_custom_allocator = allocator;
+}
+
+void custom_raw_deleter(void* ptr) {
+  current_custom_allocator->raw_delete(ptr);
+}
+
+} // namespace CUDAPluggableAllocator
+} // namespace cuda
+} // namespace torch
diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.h b/torch/csrc/cuda/CUDAPluggableAllocator.h
new file mode 100644
index 0000000000000..a02acabe3cd85
--- /dev/null
+++ b/torch/csrc/cuda/CUDAPluggableAllocator.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/cuda/CUDAGraphsC10Utils.h>
+#include <c10/cuda/CUDAMacros.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <c10/cuda/CUDACachingAllocator.h>
+
+#include <array>
+#include <mutex>
+
+namespace torch {
+
+namespace cuda {
+
+namespace CUDAPluggableAllocator {
+
+#if defined(TORCH_HIP_VERSION)
+using streamType = c10::hip::HIPStream;
+#else
+using streamType = c10::cuda::CUDAStream;
+#endif
+
+std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+getCurrentAllocator();
+std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+createCustomAllocator(
+    std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
+    std::function<void(void*, size_t, cudaStream_t)> free_fn);
+void changeCurrentAllocator(
+    std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator> allocator);
+
+struct CUDAPluggableAllocator
+    : public c10::cuda::CUDACachingAllocator::CUDAAllocator {
+  CUDAPluggableAllocator(
+      std::function<void*(size_t, int, cudaStream_t)> alloc_fn,
+      std::function<void(void*, size_t, cudaStream_t)> free_fn);
+
+  CUDAPluggableAllocator(CUDAPluggableAllocator& other);
+
+  void set_init_fn(std::function<void(int)> init_fn);
+
+  void set_reset_fn(std::function<void()> reset_fn);
+
+  void set_memory_fraction_fn(
+      std::function<void(double, int)> memory_fraction_fn);
+
+  void set_base_alloc_fn(std::function<void*(void*, size_t*)> base_alloc_fn);
+
+  void set_record_stream_fn(
+      std::function<void(void* ptr, cudaStream_t stream)> record_stream_fn);
+
+  void set_capture_begin_fn(
+      std::function<void(int, c10::cuda::CaptureId_t, c10::cuda::MempoolId_t)>
+          capture_begin_fn);
+
+  void set_capture_about_to_end_fn(
+      std::function<void(int, c10::cuda::CaptureId_t)> capture_about_to_end_fn);
+
+  void set_capture_ended_fn(
+      std::function<void(int, c10::cuda::CaptureId_t)> capture_ended_fn);
+
+  void set_capture_destroy_fn(
+      std::function<void(int, c10::cuda::MempoolId_t)> capture_destroy_fn);
+
+  void* malloc(size_t size, int device, cudaStream_t stream);
+
+  c10::DataPtr allocate(size_t size) const;
+  c10::DeleterFnPtr raw_deleter() const;
+
+  virtual void* raw_alloc(size_t nbytes) override;
+  virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream)
+      override;
+  virtual void raw_delete(void* ptr) override;
+  virtual void init(int device_count) override;
+  virtual bool initialized() override;
+  virtual void setMemoryFraction(double fraction, int device) override;
+  virtual void emptyCache() override;
+  virtual void cacheInfo(int dev_id, size_t* largestBlock) override;
+  virtual void* getBaseAllocation(void* ptr, size_t* size) override;
+
+  virtual void recordStream(const c10::DataPtr&, streamType stream) override;
+
+  virtual c10::cuda::CUDACachingAllocator::DeviceStats getDeviceStats(
+      int device) override;
+  virtual void resetAccumulatedStats(int device) override;
+  virtual void resetPeakStats(int device) override;
+  virtual c10::cuda::CUDACachingAllocator::SnapshotInfo snapshot() override;
+  virtual void notifyCaptureBegin(
+      int device,
+      c10::cuda::CaptureId_t graph_id,
+      c10::cuda::MempoolId_t mempool_id) override;
+  virtual void notifyCaptureAboutToEnd(
+      int device,
+      c10::cuda::CaptureId_t graph_id) override;
+  virtual void notifyCaptureEnded(int device, c10::cuda::CaptureId_t graph_id)
+      override;
+  virtual void notifyCaptureDestroy(
+      int device,
+      c10::cuda::MempoolId_t mempool_id) override;
+  virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) override;
+  virtual void recordHistory(
+      bool enabled,
+      c10::cuda::CUDACachingAllocator::CreateContextFn context_recorder,
+      size_t alloc_trace_max_entries,
+      bool alloc_trace_record_context) override;
+  virtual void attachOutOfMemoryObserver(
+      c10::cuda::CUDACachingAllocator::OutOfMemoryObserver observer) override;
+  virtual bool needsPoolSpecificPeerAccess() override;
+  virtual std::string name() override;
+
+ protected:
+  std::function<void*(size_t, int, cudaStream_t)> alloc_fn_;
+  std::function<void(void*, size_t, cudaStream_t)> free_fn_;
+  std::function<void(int)> init_fn_;
+  std::function<void()> reset_fn_;
+  std::function<void(double, int)> memory_fraction_fn_;
+  std::function<void*(void*, size_t*)> base_alloc_fn_;
+  std::function<void(void* ptr, cudaStream_t stream)> record_stream_fn_;
+  std::function<void(int, c10::cuda::CaptureId_t, c10::cuda::MempoolId_t)>
+      capture_begin_fn_;
+  std::function<void(int, c10::cuda::CaptureId_t)> capture_about_to_end_fn_;
+  std::function<void(int, c10::cuda::CaptureId_t)> capture_ended_fn_;
+  std::function<void(int, c10::cuda::MempoolId_t)> capture_destroy_fn_;
+  std::mutex allocator_mutex_;
+  // We do the bookeeping here in order to simplify custom allocators
+  std::unordered_map<void*, std::pair<size_t, cudaStream_t>>
+      allocation_metadata_;
+
+  bool initialized_ = false;
+};
+} // namespace CUDAPluggableAllocator
+} // namespace cuda
+} // namespace torch
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp
index 10dac0e0d0f7b..b526f87edd75d 100644
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@@ -11,6 +11,7 @@
 #include <ATen/cuda/Sleep.h>
 #include <ATen/cuda/detail/CUDAHooks.h>
 #include <ATen/cuda/jiterator.h>
+#include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAFunctions.h>
 #include <ATen/cuda/CUDAGraphsUtils.cuh>
 #ifdef USE_NCCL
@@ -21,6 +22,7 @@
 
 #include <torch/csrc/CudaIPCTypes.h>
 #include <torch/csrc/Generator.h>
+#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
 #include <torch/csrc/cuda/THCP.h>
 #include <torch/csrc/cuda/python_comm.h>
 #include <torch/csrc/python_headers.h>
@@ -851,6 +853,123 @@ static void registerCudaDeviceProperties(PyObject* module) {
       });
 }
 
+static void registerCudaPluggableAllocator(PyObject* module) {
+  auto m = py::handle(module).cast<py::module>();
+
+  py::class_<
+      c10::cuda::CUDACachingAllocator::CUDAAllocator,
+      std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>>(
+      m, "_cuda_CUDAAllocator");
+  m.def("_cuda_getAllocator", []() {
+    return py::cast(torch::cuda::CUDAPluggableAllocator::getCurrentAllocator());
+  });
+
+  m.def(
+      "_cuda_changeCurrentAllocator",
+      [](std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator>
+             allocator) {
+        torch::cuda::CUDAPluggableAllocator::changeCurrentAllocator(allocator);
+      });
+  py::class_<
+      torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator,
+      c10::cuda::CUDACachingAllocator::CUDAAllocator,
+      std::shared_ptr<
+          torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator>>(
+      m, "_CUDAPluggableAllocator")
+      .def(
+          "set_init_fn",
+          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
+             uint64_t func_ptr) {
+            using FuncType = void(int);
+            std::function<FuncType> func =
+                reinterpret_cast<FuncType*>(func_ptr);
+            self.set_init_fn(func);
+          })
+      .def(
+          "set_reset_fn",
+          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
+             uint64_t func_ptr) {
+            using FuncType = void();
+            std::function<FuncType> func =
+                reinterpret_cast<FuncType*>(func_ptr);
+            self.set_reset_fn(func);
+          })
+      .def(
+          "set_memory_fraction_fn",
+          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
+             uint64_t func_ptr) {
+            using FuncType = void(double, int);
+            std::function<FuncType> func =
+                reinterpret_cast<FuncType*>(func_ptr);
+            self.set_memory_fraction_fn(func);
+          })
+      .def(
+          "set_base_alloc_fn",
+          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
+             uint64_t func_ptr) {
+            using FuncType = void*(void*, size_t*);
+            std::function<FuncType> func =
+                reinterpret_cast<FuncType*>(func_ptr);
+            self.set_base_alloc_fn(func);
+          })
+      .def(
+          "set_record_stream_fn",
+          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
+             uint64_t func_ptr) {
+            using FuncType = void(void*, cudaStream_t);
+            std::function<FuncType> func =
+                reinterpret_cast<FuncType*>(func_ptr);
+            self.set_record_stream_fn(func);
+          })
+      .def(
+          "set_capture_begin_fn",
+          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
+             uint64_t func_ptr) {
+            using FuncType =
+                void(int, c10::cuda::CaptureId_t, c10::cuda::MempoolId_t);
+            std::function<FuncType> func =
+                reinterpret_cast<FuncType*>(func_ptr);
+            self.set_capture_begin_fn(func);
+          })
+      .def(
+          "set_capture_about_to_end_fn",
+          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
+             uint64_t func_ptr) {
+            using FuncType = void(int, c10::cuda::CaptureId_t);
+            std::function<FuncType> func =
+                reinterpret_cast<FuncType*>(func_ptr);
+            self.set_capture_about_to_end_fn(func);
+          })
+      .def(
+          "set_capture_ended_fn",
+          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
+             uint64_t func_ptr) {
+            using FuncType = void(int, c10::cuda::CaptureId_t);
+            std::function<FuncType> func =
+                reinterpret_cast<FuncType*>(func_ptr);
+            self.set_capture_ended_fn(func);
+          })
+      .def(
+          "set_capture_destroy_fn",
+          [](torch::cuda::CUDAPluggableAllocator::CUDAPluggableAllocator& self,
+             uint64_t func_ptr) {
+            using FuncType = void(int, c10::cuda::MempoolId_t);
+            std::function<FuncType> func =
+                reinterpret_cast<FuncType*>(func_ptr);
+            self.set_capture_destroy_fn(func);
+          });
+  m.def("_cuda_customAllocator", [](uint64_t malloc_ptr, uint64_t free_ptr) {
+    using MallocFuncType = void*(size_t, int, cudaStream_t);
+    using FreeFuncType = void(void*, size_t, cudaStream_t);
+    std::function<MallocFuncType> malloc_fn =
+        reinterpret_cast<MallocFuncType*>(malloc_ptr);
+    std::function<FreeFuncType> free_fn =
+        reinterpret_cast<FreeFuncType*>(free_ptr);
+    return torch::cuda::CUDAPluggableAllocator::createCustomAllocator(
+        malloc_fn, free_fn);
+  });
+}
+
 static void bindGetDeviceProperties(PyObject* module) {
   // Add method to torch.cuda
   auto m = py::handle(module).cast<py::module>();
@@ -1141,6 +1260,7 @@ void initModule(PyObject* module) {
   shared::initCudnnBindings(module);
 #endif
   registerCudaDeviceProperties(module);
+  registerCudaPluggableAllocator(module);
 }
 
 } // namespace cuda
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index a684f2291de25..1eb6c70ab7b80 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -911,13 +911,14 @@ def _dtype(self):
     'CUDAGraph', 'CudaError', 'DeferredCudaCallError', 'Event', 'ExternalStream', 'OutOfMemoryError',
     'Stream', 'StreamContext', 'amp', 'caching_allocator_alloc', 'caching_allocator_delete', 'can_device_access_peer',
     'check_error', 'cudaStatus', 'cudart', 'current_blas_handle', 'current_device', 'current_stream', 'default_generators',
-    'default_stream', 'device', 'device_count', 'device_of', 'empty_cache', 'get_allocator_backend', 'get_arch_list',
-    'get_device_capability', 'get_device_name', 'get_device_properties', 'get_gencode_flags', 'get_rng_state', 'get_rng_state_all',
-    'get_sync_debug_mode', 'graph', 'graph_pool_handle', 'graphs', 'has_half', 'has_magma', 'init', 'initial_seed', 'ipc_collect',
-    'is_available', 'is_bf16_supported', 'is_current_stream_capturing', 'is_initialized', 'jiterator', 'list_gpu_processes',
-    'make_graphed_callables', 'manual_seed', 'manual_seed_all', 'max_memory_allocated', 'max_memory_cached', 'max_memory_reserved',
-    'mem_get_info', 'memory', 'memory_allocated', 'memory_cached', 'memory_reserved', 'memory_snapshot', 'memory_stats',
-    'memory_stats_as_nested_dict', 'memory_summary', 'memory_usage', 'nccl', 'nvtx', 'profiler', 'random',
-    'reset_accumulated_memory_stats', 'reset_max_memory_allocated', 'reset_max_memory_cached', 'reset_peak_memory_stats',
-    'seed', 'seed_all', 'set_device', 'set_per_process_memory_fraction', 'set_rng_state', 'set_rng_state_all', 'set_stream',
-    'set_sync_debug_mode', 'sparse', 'stream', 'streams', 'synchronize', 'utilization']
+    'default_stream', 'device', 'device_count', 'device_of', 'empty_cache', 'get_allocator_backend', 'CUDAPluggableAllocator',
+    'change_current_allocator', 'get_arch_list', 'get_device_capability', 'get_device_name', 'get_device_properties',
+    'get_gencode_flags', 'get_rng_state', 'get_rng_state_all', 'get_sync_debug_mode', 'graph', 'graph_pool_handle', 'graphs',
+    'has_half', 'has_magma', 'init', 'initial_seed', 'ipc_collect', 'is_available', 'is_bf16_supported',
+    'is_current_stream_capturing', 'is_initialized', 'jiterator', 'list_gpu_processes', 'make_graphed_callables',
+    'manual_seed', 'manual_seed_all', 'max_memory_allocated', 'max_memory_cached', 'max_memory_reserved',
+    'mem_get_info', 'memory', 'memory_allocated', 'memory_cached', 'memory_reserved', 'memory_snapshot',
+    'memory_stats', 'memory_stats_as_nested_dict', 'memory_summary', 'memory_usage', 'nccl', 'nvtx', 'profiler',
+    'random', 'reset_accumulated_memory_stats', 'reset_max_memory_allocated', 'reset_max_memory_cached',
+    'reset_peak_memory_stats', 'seed', 'seed_all', 'set_device', 'set_per_process_memory_fraction', 'set_rng_state',
+    'set_rng_state_all', 'set_stream', 'set_sync_debug_mode', 'sparse', 'stream', 'streams', 'synchronize', 'utilization']
diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py
index 9f9ae724a15d6..c40d9de580406 100644
--- a/torch/cuda/memory.py
+++ b/torch/cuda/memory.py
@@ -1,10 +1,12 @@
 import collections
 import contextlib
+import ctypes
 import warnings
 from typing import Any, Dict, Union, Tuple
 
 import torch
 from . import is_initialized, _get_device_index, _lazy_init
+from ._utils import _dummy_type
 
 from ._memory_viz import segments as _segments, memory as _memory
 
@@ -16,7 +18,13 @@
            "reset_peak_memory_stats", "reset_max_memory_allocated", "reset_max_memory_cached",
            "memory_allocated", "max_memory_allocated", "memory_reserved", "max_memory_reserved",
            "memory_cached", "max_memory_cached", "memory_snapshot", "memory_summary", "list_gpu_processes",
-           "mem_get_info", "get_allocator_backend"]
+           "mem_get_info", "get_allocator_backend", "CUDAPluggableAllocator", "change_current_allocator"]
+
+
+if not hasattr(torch._C, '_cuda_CUDAAllocator'):
+    # Define dummy base classes
+    torch._C.__dict__['_cuda_CUDAAllocator'] = _dummy_type('_cuda_CUDAAllocator')
+
 
 def _host_allocator():
     _lazy_init()
@@ -651,3 +659,66 @@ def get_allocator_backend() -> str:
         See :ref:`cuda-memory-management` for details on choosing the allocator backend.
     """
     return torch._C._cuda_getAllocatorBackend()
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators.
+    """
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a so file.
+
+    Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+    To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator`
+    function.
+
+    Args:
+        path_to_so_file(str): Path in the filesystem to the `.so` file containing
+            the allocator functions
+        alloc_fn_name(str): Name of the function to perform the memory allocation
+            in the so file. The signature must be:
+            void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+        free_fn_name(str): Name of the function to perform the memory release
+            in the so file. The signature must be:
+            void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+    .. warning::
+        This is currently supported only in unix OSs
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Changes the currently used memory allocator to be the one provided.
+    If the current allocator has already been used/initialized, this function will error.
+
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Returns the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 33e14e9e0572e..9a3065c675a28 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -8085,6 +8085,20 @@
                 API_PYTORCH,
             ),
         ),
+        (
+            "cuda::CUDAAllocator::recordStream",
+            (
+                "hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA",
+                API_PYTORCH,
+            ),
+        ),
+        (
+            "CUDAAllocator::recordStream",
+            (
+                "HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA",
+                API_PYTORCH,
+            ),
+        ),
         ("cuda::CUDAStream", ("hip::HIPStreamMasqueradingAsCUDA", API_PYTORCH)),
         ("CUDAStream", ("HIPStreamMasqueradingAsCUDA", API_PYTORCH)),
         (
@@ -8295,6 +8309,9 @@
         ("setCurrentCUDAStream", ("setCurrentHIPStream", API_C10)),
         ("cuda::CUDACachingAllocator", ("hip::HIPCachingAllocator", API_C10)),
         ("CUDACachingAllocator", ("HIPCachingAllocator", API_C10)),
+        ("c10::cuda::CUDAAllocator", ("c10::hip::HIPAllocator", API_C10)),
+        ("cuda::CUDAAllocator", ("hip::HIPAllocator", API_C10)),
+        ("CUDAAllocator", ("HIPAllocator", API_C10)),
         ("C10_CUDA_KERNEL_LAUNCH_CHECK", ("C10_HIP_KERNEL_LAUNCH_CHECK", API_C10))
     ]
 )

From d31b3b3e5911da89feb90ed7903d7da3f17b7ee7 Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Wed, 23 Nov 2022 18:27:37 +0000
Subject: [PATCH 1210/1922] Gate leak check and reruns on schedule (#89504)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89504
Approved by: https://github.com/huydhn
---
 .github/actions/filter-test-configs/action.yml | 3 ++-
 .github/scripts/filter_test_configs.py         | 5 ++++-
 .github/workflows/periodic.yml                 | 1 +
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml
index 76399e325ef21..0253577134c8a 100644
--- a/.github/actions/filter-test-configs/action.yml
+++ b/.github/actions/filter-test-configs/action.yml
@@ -53,7 +53,8 @@ runs:
           --test-matrix "${{ inputs.test-matrix }}" \
           --pr-number "${{ github.event.pull_request.number }}" \
           --tag "${{ steps.parse-ref.outputs.tag }}" \
-          --event-name "${{ github.event_name }}"
+          --event-name "${{ github.event_name }}" \
+          --schedule "${{ github.event.schedule }}"
 
     - name: Print the filtered test matrix
       shell: bash
diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py
index f5c438c29e902..eab32401ad97f 100755
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@@ -50,6 +50,7 @@ def parse_args() -> Any:
     parser.add_argument("--pr-number", type=str, help="the pull request number")
     parser.add_argument("--tag", type=str, help="the associated tag if it exists")
     parser.add_argument("--event-name", type=str, help="name of the event that triggered the job (pull, schedule, etc)")
+    parser.add_argument("--schedule", type=str, help="cron schedule that triggered the job")
     return parser.parse_args()
 
 
@@ -188,7 +189,9 @@ def main() -> None:
         # No PR number, no tag, we can just return the test matrix as it is
         filtered_test_matrix = test_matrix
 
-    if args.event_name == "schedule":
+    if args.event_name == "schedule" and args.schedule == '29 8 * * *':
+        # we don't want to run the mem leack check or disabled tests on normal
+        # periodically scheduled jobs, only the ones at this time
         filtered_test_matrix = set_periodic_modes(filtered_test_matrix)
 
     # Set the filtered test matrix as the output
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 80ad04c9be321..9a188345899dc 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -3,6 +3,7 @@ name: periodic
 on:
   schedule:
     - cron: 45 0,4,8,12,16,20 * * *
+    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
   push:
     tags:
       - ciflow/periodic/*

From 227d7758eb91a7940d5d5ee816f7dcd185a1fbbd Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 22 Nov 2022 20:29:26 -0800
Subject: [PATCH 1211/1922] [quant][be] Refactor the error checking code for
 quantize_per_channel op (#89271)

Summary:
at

Test Plan:
make sure it compiles

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89271
Approved by: https://github.com/andrewor14
---
 .../ATen/native/quantized/AffineQuantizer.cpp | 47 ++++++++-----------
 1 file changed, 19 insertions(+), 28 deletions(-)

diff --git a/aten/src/ATen/native/quantized/AffineQuantizer.cpp b/aten/src/ATen/native/quantized/AffineQuantizer.cpp
index e2fa8f65adc60..dbda6ebd5f902 100644
--- a/aten/src/ATen/native/quantized/AffineQuantizer.cpp
+++ b/aten/src/ATen/native/quantized/AffineQuantizer.cpp
@@ -97,6 +97,21 @@ void checkSameSize(
       " only works with Tensors with the same shape");
 }
 
+void checkPerChannelParamsSize(
+    const Tensor& rtensor,
+    int64_t axis,
+    const Tensor& scales,
+    const Tensor& zero_points
+) {
+  int64_t channel = rtensor.size(axis);
+  TORCH_CHECK(
+      channel == int64_t(scales.numel()),
+      "length of scales must equal to channel, expected ", channel, " got, ", scales.numel());
+  TORCH_CHECK(
+      channel == int64_t(zero_points.numel()),
+      "length of zero_points must equal to channel expected ", channel, " got, ", zero_points.numel());
+}
+
 } // anonymous namespace
 
 Tensor& quantize_tensor_per_tensor_affine(
@@ -156,13 +171,7 @@ Tensor& quantize_tensor_per_channel_affine(
       "Expected: [0, ",
       rtensor.dim(),
       ")");
-  int64_t channel = rtensor.size(axis);
-  TORCH_CHECK(
-      channel == int64_t(scales.numel()),
-      "length of scales must equal to channel");
-  TORCH_CHECK(
-      channel == int64_t(zero_points.numel()),
-      "length of zero_points must equal to channel");
+  checkPerChannelParamsSize(rtensor, axis, scales, zero_points);
 
   quantize_tensor_per_channel_affine_stub(
       rtensor.device().type(), rtensor, qtensor, scales, zero_points, axis);
@@ -195,13 +204,7 @@ Tensor& quantize_tensor_per_channel_float_qparams(
       "Expected: [0, ",
       rtensor.dim(),
       ")");
-  int64_t channel = rtensor.size(axis);
-  TORCH_CHECK(
-      channel == int64_t(scales.numel()),
-      "length of scales must equal to channel");
-  TORCH_CHECK(
-      channel == int64_t(zero_points.numel()),
-      "length of zero_points must equal to channel");
+  checkPerChannelParamsSize(rtensor, axis, scales, zero_points);
 
   quantize_tensor_per_channel_float_qparams_stub(
       rtensor.device().type(), rtensor, qtensor, scales, zero_points, axis);
@@ -260,13 +263,7 @@ Tensor& dequantize_tensor_per_channel_affine(
       " Expected: [0, ",
       qtensor.dim(),
       ")");
-  int64_t channel = qtensor.size(axis);
-  TORCH_CHECK(
-      channel == int64_t(scales.numel()),
-      "length of scales must equal to channel");
-  TORCH_CHECK(
-      channel == int64_t(zero_points.numel()),
-      "length of zero_points must equal to channel");
+  checkPerChannelParamsSize(rtensor, axis, scales, zero_points);
 
   dequantize_tensor_per_channel_affine_stub(
       qtensor.device().type(), qtensor, rtensor, scales, zero_points, axis);
@@ -297,13 +294,7 @@ Tensor& dequantize_tensor_per_channel_float_qparams(
       " Expected: [0, ",
       qtensor.dim(),
       ")");
-  int64_t channel = qtensor.size(axis);
-  TORCH_CHECK(
-      channel == int64_t(scales.numel()),
-      "length of scales must equal to channel");
-  TORCH_CHECK(
-      channel == int64_t(zero_points.numel()),
-      "length of zero_points must equal to channel");
+  checkPerChannelParamsSize(rtensor, axis, scales, zero_points);
 
   dequantize_tensor_per_channel_float_qparams_stub(
       qtensor.device().type(), qtensor, rtensor, scales, zero_points, axis);

From 33504e1520708e6bc1cac3588797abbe3bc06345 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 22 Nov 2022 20:29:26 -0800
Subject: [PATCH 1212/1922] [quant][be] Remove unused util code (#89272)

Summary:
att

Test Plan:
python test/test_quantization.py TestQuantizeFx

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89272
Approved by: https://github.com/andrewor14
---
 torch/ao/quantization/fx/utils.py | 74 -------------------------------
 1 file changed, 74 deletions(-)

diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index 73fdb0700144d..edf440de28e12 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -27,8 +27,6 @@
 from torch.ao.quantization.utils import (
     activation_is_statically_quantized,
     is_per_tensor,
-    is_per_channel,
-    to_underlying_dtype,
 )
 from torch.ao.quantization.quantize import is_activation_post_process
 
@@ -64,7 +62,6 @@
     "get_per_tensor_qparams",
     "get_qconv_op",
     "get_qconv_prepack_op",
-    "get_quantize_node_info",
     "get_skipped_module_name_and_classes",
     "graph_module_from_producer_nodes",
     "graph_pretty_str",
@@ -173,77 +170,6 @@ def get_per_tensor_qparams(activation_post_process):
     dtype = activation_post_process.dtype
     return scale, zero_point, dtype
 
-def get_quantize_node_info(
-    activation_post_process: Callable,
-    is_decomposed: bool
-) -> Optional[Tuple[str, Union[Callable[..., Any], str], Dict[str, Any]]]:
-    """ Extract information about quantize op from activation_post_process module
-    Args:
-      * `activation_post_process`: observer module instance or fake quant module instance
-        after calibration/QAT
-      * `is_decomposed`: a boolean flag to indicate whether we want to use the
-        quantize operator for decomposed quantized tensor (torch.ops.quantized_decomposed.quantize_per_tensor) or default/standalone
-        quantized tensor (torch.quantize_per_tensor)
-
-    Returns
-        node_type(e.g. call_function), quantize op(e.g. quantize_per_tensor) and a dictionary
-        of extracted qparams from the module
-    """
-    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
-    compute_dtype = None
-    if hasattr(activation_post_process, "compute_dtype"):
-        compute_dtype = activation_post_process.compute_dtype  # type: ignore[attr-defined]
-    quantize_op : Optional[Union[Callable, str]] = None
-    if dtype in [torch.quint8, torch.qint8, torch.qint32] and \
-            not hasattr(activation_post_process, 'compute_dtype'):
-        node_type = "call_function"
-        scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined]
-        if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
-            ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined]
-            qparams = {"_scale_": scale, "_zero_point_": zero_point, "_axis_": ch_axis, "_dtype_": dtype}
-            if is_decomposed:
-                raise NotImplementedError("decomposed quantize_per_channel op not implemented yet")
-            else:
-                quantize_op = torch.quantize_per_channel
-        else:
-            scale = float(scale)
-            zero_point = int(zero_point)
-            if is_decomposed:
-                quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
-                quant_max = activation_post_process.quant_max  # type: ignore[attr-defined]
-                dtype = to_underlying_dtype(dtype)
-                qparams = {
-                    "_scale_": scale,
-                    "_zero_point_": zero_point,
-                    "_quant_min": quant_min,
-                    "_quant_max": quant_max,
-                    "_dtype_": dtype
-                }
-                quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor
-            else:
-                qparams = {"_scale_": scale, "_zero_point_": zero_point, "_dtype_": dtype}
-                quantize_op = torch.quantize_per_tensor
-    elif compute_dtype in [torch.quint8, torch.qint8, torch.float16]:
-        # TODO(future PR): switch compute_dtype to is_dynamic
-        # dynamic quantization
-        node_type = "call_function"
-        if is_decomposed:
-            raise NotImplementedError("decomposed quantize_per_tensor_dynamic op not implemented yet")
-        else:
-            quantize_op = torch.quantize_per_tensor_dynamic
-        # TODO: get reduce range from observer
-        # reduce_range = activation_post_process.reduce_range
-        reduce_range = torch.backends.quantized.engine in ("fbgemm", "x86")
-        qparams = {"_dtype_": compute_dtype, "_reduce_range_": reduce_range}
-    elif dtype == torch.float16:
-        node_type = "call_method"
-        quantize_op = "to"
-        qparams = {"_dtype_": dtype}
-    else:
-        warnings.warn(f"Unsupported activation_post_process in get_quantize_node_info: {activation_post_process}")
-        return None
-    return node_type, quantize_op, qparams  # type: ignore[return-value]
-
 # Keep it here for BC in torch.quantization namespace, we can remove it after
 # we deprecate the torch.quantization namespace
 quantize_node = NotImplemented

From bdb03536771cd89372afc4f86f4a97f43dbc044b Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 23 Nov 2022 02:00:44 +0000
Subject: [PATCH 1213/1922] [inductor] Update CI model tests (#89499)

Summary:
1) Add model inference test
2) Switch model training test to use AMP

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89499
Approved by: https://github.com/bertmaher
---
 .jenkins/pytorch/test.sh    | 30 ++++++++++++----
 benchmarks/dynamo/common.py | 71 ++++++++++++-------------------------
 2 files changed, 45 insertions(+), 56 deletions(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 17437a56ae0e8..ca50a31beb60b 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -262,9 +262,14 @@ test_inductor_huggingface() {
   # will bark about file not found later on
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
+  # Check inference with --float32
+  python benchmarks/dynamo/huggingface.py --ci --accuracy \
+    --device cuda --inductor --float32 --output "$TEST_REPORTS_DIR"/inductor_inference_huggingface.csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_inference_huggingface.csv
+  # Check training with --amp
   python benchmarks/dynamo/huggingface.py --ci --training --accuracy \
-    --device cuda --inductor --float32 --output "$TEST_REPORTS_DIR"/inductor_huggingface.csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_huggingface.csv
+    --device cuda --inductor --amp --output "$TEST_REPORTS_DIR"/inductor_training_huggingface.csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_training_huggingface.csv
 }
 
 test_inductor_timm_shard() {
@@ -277,18 +282,29 @@ test_inductor_timm_shard() {
   # will bark about file not found later on
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
-  python benchmarks/dynamo/timm_models.py --ci --training --accuracy \
+  # Check inference with --float32
+  python benchmarks/dynamo/timm_models.py --ci --accuracy \
     --device cuda --inductor --float32 --total-partitions 2 --partition-id "$1" \
-    --output "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_timm_"$1".csv
+    --output "$TEST_REPORTS_DIR"/inductor_inference_timm_"$1".csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_inference_timm_"$1".csv
+  # Check training with --amp
+  python benchmarks/dynamo/timm_models.py --ci --training --accuracy \
+    --device cuda --inductor --amp --total-partitions 2 --partition-id "$1" \
+    --output "$TEST_REPORTS_DIR"/inductor_training_timm_"$1".csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_training_timm_"$1".csv
 }
 
 test_inductor_torchbench() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
+  # Check inference with --float32
+  PYTHONPATH=$(pwd)/torchbench python benchmarks/dynamo/torchbench.py --ci --accuracy \
+    --device cuda --inductor --float32 --output "$TEST_REPORTS_DIR"/inductor_inference_torchbench.csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_inference_torchbench.csv
+  # Check training with --amp
   PYTHONPATH=$(pwd)/torchbench python benchmarks/dynamo/torchbench.py --ci --training --accuracy \
-    --device cuda --inductor --float32 --output "$TEST_REPORTS_DIR"/inductor_torchbench.csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_torchbench.csv
+    --device cuda --inductor --amp --output "$TEST_REPORTS_DIR"/inductor_training_torchbench.csv
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_training_torchbench.csv
 }
 
 test_python_gloo_with_tls() {
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 98a326d77191f..95cd0cd4ca17e 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -87,74 +87,47 @@
 CI_SKIP_INDCUTOR_INFERENCE = [
     *CI_SKIP_AOT_EAGER_INFERENCE,
     # TorchBench
+    "DALLE2_pytorch",
     "detectron2",
-    "hf_Reformer",
+    "hf_T5",  # accuracy
+    "hf_BigBird",  # accuracy
+    "hf_GPT2_large",  # OOM
+    "maml",  # accuracy
+    "mobilenet_v2_quantized_qat",  # The eval test only supports CPU
     "moco",  # accuracy
+    "pytorch_struct",  # Test eval is not implemented
     "pyhpc_equation_of_state",  # Accuracy
     "pyhpc_turbulent_kinetic_energy",  # Accuracy
     "tacotron2",
     "vision_maskrcnn",  # accuracy
-    "yolov3",  # Accuracy
     # Huggingface
-    "BigBird",
-    "YituTechConvBert",
+    "DebertaV2ForQuestionAnswering",  # OOM
     # TIMM
     "cait_m36_384",  # Accuracy
     "ghostnet_100",  # Accuracy
-    "swin_base_patch4_window7_224",  # Accuracy
-    # Trying to get CI working - https://github.com/pytorch/pytorch/pull/87588
-    "visformer_small",  # fails accuracy on CI but passes locally
 ]
 
 CI_SKIP_INDUCTOR_TRAINING = [
-    # CI does not check accuracy for inductor training yet
-    # *CI_SKIP_AOT_EAGER_TRAINING,
-    # *CI_SKIP_INDCUTOR_INFERENCE,
+    *CI_SKIP_INDCUTOR_INFERENCE,
     # TorchBench
-    "DALLE2_pytorch",
-    "detectron2",
-    "functorch_dp_cifar10",
-    "mobilenet_v3_large",
-    "moco",
-    "tacotron2",
-    "vision_maskrcnn",  # from functionalization
-    # OOM
-    "Background_Matting",
-    "fastNLP_Bert",
-    "hf_BigBird",
-    "hf_T5_base",  # fp64_OOM
-    "mobilenet_v2",
-    "mobilenet_v2_quantized_qat",
-    "resnet50_quantized_qat",
-    "timm_regnet",
+    "Background_Matting",  # fp64_OOM
+    "mobilenet_v3_large",  # accuracy
+    "resnet50_quantized_qat",  # Eager model failed to run
     # Huggingface
-    "AllenaiLongformerBase",
-    "AlbertForMaskedLM",  # OOM
-    "BartForConditionalGeneration",  # OOM
+    "BlenderbotForCausalLM",  # OOM
+    "GoogleFnet",  # Eager model failed to run
     "M2M100ForConditionalGeneration",  # OOM
-    "MBartForConditionalGeneration",  # OOM
-    "MT5ForConditionalGeneration",  # OOM
-    "PegasusForConditionalGeneration",  # OOM
-    "XGLMForCausalLM",  # fp64_OOM
-    "DebertaV2ForMaskedLM",  # OOM
-    "DebertaV2ForQuestionAnswering",  # OOM
-    # OOM
-    "BigBird",
-    "TrOCRForCausalLM",
-    "AlbertForQuestionAnswering",
+    "XGLMForCausalLM",  # OOM
     # TIMM
-    "cait_m36_384",  # fp64_OOM
-    "coat_lite_mini",  # time out
     "convit_base",  # fp64_OOM
-    "gernet_l",  # accuracy
-    "gluon_xception65",
-    "hrnet_w18",  # accuracy
-    "lcnet_0500",  # accuracy
-    "levit_128",  # levit_128
-    "poolformer_m36",
+    "eca_halonext26ts",  # accuracy
+    "fbnetv3_b",  # accuracy
+    "levit_128",  # fp64_OOM
+    "res2net101_26w_4s",  # accuracy
+    "resnest101e",  # accuracy
     "rexnet_100",  # accuracy
-    "swin_base_patch4_window7_224",
-    "twins_pcpvt_base",  # time out
+    "spnasnet_100",  # accuracy
+    "swin_base_patch4_window7_224",  # accuracy
     "xcit_large_24_p8_224",  # fp64_OOM
 ]
 

From 4459af68b5abd5481bb15f6ee1cfd3ff870d1df2 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 23 Nov 2022 19:05:13 +0000
Subject: [PATCH 1214/1922] Revert "Fix the kineto daemon build condition
 (#89174)"

This reverts commit 9fd00f194ae4e28948a9a03a6382c20dde04e4fd.

Reverted https://github.com/pytorch/pytorch/pull/89174 on behalf of https://github.com/robieta due to For some reason this is interacting badly with NVFuser. I think it is instability in kineto, but until we figure out what's going on reverting is a necessary evil.
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d70f6ef58161..6f31baa687d52 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -795,7 +795,7 @@ if(USE_SOURCE_DEBUG_ON_MOBILE)
   string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
 endif()
 
-if(BUILD_LITE_INTERPRETER AND USE_LITE_INTERPRETER_PROFILER)
+if(USE_LITE_INTERPRETER_PROFILER)
   string(APPEND CMAKE_CXX_FLAGS " -DEDGE_PROFILER_USE_KINETO")
 endif()
 

From 15343e4462d04909b69fcb5f44d58e24e2a0f36a Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Wed, 23 Nov 2022 08:29:08 -0800
Subject: [PATCH 1215/1922] first draft of input mutation handling for aot
 autograd (#88817)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88817
Approved by: https://github.com/ezyang, https://github.com/wconstab
---
 functorch/_src/aot_autograd.py                | 1078 +++++++++++++++--
 functorch/_src/partitioners.py                |   40 +-
 test/dynamo/test_aot_autograd.py              |    6 +-
 test/dynamo/test_aot_cudagraphs.py            |    1 -
 test/functorch/test_aotdispatch.py            |  626 +++++++++-
 test/inductor/test_torchinductor.py           |   33 +-
 test/inductor/test_torchinductor_opinfo.py    |    3 +-
 torch/_dynamo/optimizations/training.py       |   13 +-
 torch/_dynamo/variables/torch.py              |   25 +
 torch/_inductor/graph.py                      |   10 +-
 torch/_inductor/ir.py                         |    2 +-
 torch/csrc/DynamicTypes.cpp                   |    7 +-
 .../python_torch_functions_manual.cpp         |   19 +-
 13 files changed, 1711 insertions(+), 152 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index d682d8b4b71b9..e3bb1e0303bcf 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -2,8 +2,10 @@
 import dataclasses
 import warnings
 from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass
+from enum import Enum
 from functools import wraps
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from torch.fx.experimental.proxy_tensor import is_sym_node
 
 import torch
@@ -15,6 +17,7 @@
 from torch._subclasses import FakeTensorMode, CrossRefFakeMode
 from torch.fx import immutable_collections, Interpreter
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
+from torch.multiprocessing.reductions import StorageWeakRef
 from torch.nn.utils import stateless
 
 from functorch import make_fx
@@ -30,7 +33,6 @@
     def disable_torchdynamo(x):
         return x
 
-
 try:
     from torchdynamo.utils import dynamo_timed
 except ImportError:
@@ -38,6 +40,8 @@ def disable_torchdynamo(x):
     def dynamo_timed(x):
         return x
 
+MutationType = Enum("MutationType", ("none", "metadata_only", "data"))
+OutputType = Enum("OutputType", ("non_alias", "alias_of_input", "alias_of_intermediate"))
 
 pytree._register_pytree_node(
     immutable_collections.immutable_list,
@@ -55,6 +59,8 @@ def dynamo_timed(x):
 aten = torch.ops.aten
 
 
+KNOWN_TYPES = [torch.Tensor, int, str, float, bool, torch.SymInt, torch.SymFloat]
+
 @contextmanager
 def preserve_rng_state():
     rng_state = torch.clone(torch.random.get_rng_state())
@@ -131,29 +137,91 @@ def posthook(grad_input, grad_output):
         )
         node.register_hook(get_posthook(special_stack))
 
+# This class tells us about a user's forward output that is an alias.
+# It can be an alias of either a user forward input, of of a graph intermediate.
+@dataclass(frozen=True)
+class OutputAliasInfo:
+    # Tells us if this output is:
+    # (1) a regular (non-aliased) output
+    # (2) an alias of a forward input
+    # (2) an alias of an intermediate (aka an alias of an output of the inner traced forward)
+    output_type: OutputType
+    # If (1) above, then
+    # - Tells us that the base of this alias is user_fwd_input[base_idx]
+    #   (This is an index into the inputs *before* we make synthetic bases)
+    # If (2) above, then
+    # - Tells us that the base of this alias is traced_fwd_outputs[base_idx]
+    #   here, this refers to the index of the *direct* traced
+    base_idx: int
+    # sizes, strides and storage offset of the aliased output are all returned as actual (sym)ints
+    # in the compiled forward. These indices tell us where in the forward outputs to grab them.
+    sizes_idx: Optional[int]
+    strides_idx: Optional[int]
+    storage_offset_idx: Optional[int]
+    # We store the actual output alias that we traced in the forward (should be a fake tensor)
+    # to grab any other non-symbolic properties on the output alias, like requires_grad.
+    # It's optional here, for cases where the user directly returns an input as an output.
+    # If output_type == non_alias, then these fields are also always None.
+    tensor_meta: Optional[Tensor]
+
+# This class tells us about how to perform a metadata mutation on forward inputs.
+# it only applies to forward inputs that experience metadata-only mutations
+@dataclass(frozen=True)
+class InputAliasInfo:
+    # This object gives us information about how to perform a metadata-mutation
+    # on original_fwd_inputs[base_idx]
+    #   (This is an index into the inputs *before* we make synthetic bases)
+    base_idx: int
+    # sizes, strides and storage offset of the aliased output are all returned as actual (sym)ints
+    # in the compiled forward. These indices tell us where in the forward outputs to grab them.
+    sizes_idx: int
+    strides_idx: int
+    storage_offset_idx: int
+    # We store the actual output alias that we traced in the forward (should be a fake tensor)
+    # to grab any other non-symbolic properties on the output alias, like requires_grad.
+    tensor_meta: Tensor
+
+# This class encapsulates all aliasing + mutation info we need about the forward graph
+# See a more detailed overview of the edge case handling at
+# https://docs.google.com/document/d/19UoIh_SVrMy_b2Sx5ZaeOJttm6P0Qmyss2rdBuyfoic/edit
+@dataclass(frozen=True)
+class ViewAndMutationMeta:
+    # length: # user forward inputs
+    # For every input, tells us whether the input:
+    # (a) is not mutated
+    # (b) only metadata is mutated
+    # (c) data (and maybe metadta) is mutated
+    mutated_input_info: List[MutationType]
+    # length: (# inputs of the user forward)
+    # metadata_mutation_input_info[i] is not None <====> mutated_input_info[i] == MutationType.metadata_only
+    # We stash the updated FakeTensor that we traced with in the forward in here,
+    # that way we can use it to replay the metadata mutation
+    metadata_mutation_input_info: List[Optional[InputAliasInfo]]
+    # length: # outputs in the compiled forward (not including output alias symints). Equal to:
+    # length: (# inputs w data mutations) + (# outputs that don't alias inputs)
+    # For every output *and* mutated input returned from the forward,
+    # tells us whether or not the output should require gradients or not
+    requires_grad_out_info: List[bool]
+    # length: # fw outputs
+    aliased_output_info: List[OutputAliasInfo]
+
+def gen_alias_from_base(aliased_base_tensor, size, stride, storage_offset, target_meta_tensor):
+    # handle R2C and C2R
+    if aliased_base_tensor.is_complex() and not target_meta_tensor.is_complex():
+        aliased_out = torch.view_as_real(aliased_base_tensor).as_strided(size, stride, storage_offset)
+    elif not aliased_base_tensor.is_complex() and target_meta_tensor.is_complex():
+        aliased_out = torch.view_as_complex(aliased_base_tensor).as_strided(size, stride, storage_offset)
+    else:
+        aliased_out = aliased_base_tensor.as_strided(size, stride, storage_offset)
+    # For outputs aliasing inputs, we need to check if the requires-gradness has changed.
+    if aliased_base_tensor.requires_grad and not target_meta_tensor.requires_grad:
+        aliased_out = aliased_out.detach()
+    elif not aliased_base_tensor.requires_grad and target_meta_tensor.requires_grad:
+        aliased_out.requires_grad_(True)
+    return aliased_out
 
 # This is a version of functionalization that is specifically designed
-# for the AOTAutograd use case.  It might be generally applicable though
-# (if so, move it out of this file), so I've tried to give it a name that
-# describes what it does.
-#
-# Given a function f, it produces a new function g that:
-#
-#   - Detaches all inputs before running f; the inner function
-#     does not directly participate in any pre-existing autograd.
-#     preserve_requires_grad is provided as a convenience to set the
-#     requires_grad on the new detached leaves in sync with the originals.
-#     (NB: In principle, you could backward through the pure operations
-#     produced by functionalization; this is not used for AOTAutograd
-#     and we have not tested it.)
-#
-#   - Functionalizes all operations on f, under the assumption that the passed
-#     in function f must be "observationally pure"; that is, it cannot perform any
-#     mutations (inplace data or view operations) on the passed in inputs, nor is
-#     it allowed to directly close over tensors that aren't passed via its
-#     arguments.  See
-#     https://docs.google.com/document/d/19UoIh_SVrMy_b2Sx5ZaeOJttm6P0Qmyss2rdBuyfoic/edit
-#     for discussion how how to implement the more complicated case.
+# for the AOTAutograd use case.
 #
 # Unlike functorch's variant, this doesn't use the functorch level system,
 # instead it directly uses PyTorch's conventional dispatcher to hit the
@@ -167,72 +235,450 @@ def posthook(grad_input, grad_output):
 #
 # TODO: Provide a faster version of this that assumes flat arguments
 # (so no pytree necessary)
-def detach_and_functionalize_pure(f, preserve_requires_grad=True):
+def run_functionalized_fw_and_collect_metadata(f):
+    def to_fun(t):
+        if isinstance(t, Tensor):
+            return torch._to_functional_tensor(t, mirror_autograd_meta=True)
+        else:
+            return t
+
+    def from_fun(t):
+        if not isinstance(t, Tensor) or not torch._is_functional_tensor(t):
+            return t
+        torch._sync(t)
+        return torch._from_functional_tensor(t)
+
     @wraps(f)
-    def inner(*args, **kwargs):
-        def to_fun(t):
-            if isinstance(t, Tensor):
-                r = torch._to_functional_tensor(t)
-                # NB: r is a leaf; it has no grad_fn relating
-                # it to t.  If t has autograd metadata, that
-                # metadata was preserved *inside* the r wrapper
-                if preserve_requires_grad:
-                    r.requires_grad = t.requires_grad
-                return r
-            else:
-                return t
+    def inner(*args):
+        # This function is meant to be run with the forward, which expects a flat list of tensor/symint/other args.
+        assert all(isinstance(a, torch.Tensor) or type(a) in KNOWN_TYPES for a in args)
+
+        collect_mutated_input_info: List[MutationType] = []
+        collect_requires_grad_out_info: List[bool] = []
+        collect_aliased_output_info: List[OutputAliasInfo] = []
+        collect_metadata_mutation_input_info: List[Optional[InputAliasInfo]] = []
 
-        f_args, f_kwargs = pytree.tree_map(to_fun, (args, kwargs))
+        f_args = pytree.tree_map(to_fun, args)
 
         torch._enable_functionalization(reapply_views=True)
         try:
-            outs = f(*f_args, **f_kwargs)
+            outs = f(*f_args)
         finally:
             torch._disable_functionalization()
 
-        # Detect input mutation and error if found
-        flat_args, _ = pytree.tree_flatten((args, kwargs))
-        flat_f_args, _ = pytree.tree_flatten((f_args, f_kwargs))
+        flat_args, _ = pytree.tree_flatten(args)
+        flat_f_args, _ = pytree.tree_flatten(f_args)
+        flat_outs, _ = pytree.tree_flatten(outs)
 
-        # This is just for sanity checking, can be skipped
-        for arg, f_arg in zip(flat_args, flat_f_args):
+        # Inspect the state of the input tensor functional wrapper to detect input mutation info
+        inputs_with_mutated_data = []
+        # If inp[i] has a metadata-only mutation, then maybe_inputs_with_mutated_metadata[i] contains the updated version
+        maybe_inputs_with_mutated_metadata: List[Optional[torch.Tensor]] = []
+        for (i, (arg, f_arg)) in enumerate(zip(flat_args, flat_f_args)):
             if not isinstance(arg, Tensor):
                 continue
             torch._sync(f_arg)
             new_arg = torch._from_functional_tensor(f_arg)
-            # I want to do this assert, but it is annoying because
-            # we have operator tests that have mutating inputs.  So
-            # I do something unsound instead
-            # assert arg is new_arg, "input argument was mutated, this is not valid"
             if arg is not new_arg:
-                assert arg.shape == new_arg.shape
-                arg.copy_(new_arg)
+                # Note [Input mutation handling in aot autograd]
+                # We use functionalization to detect two types in input mutations:
+                # (1) metadata-only input mutations, like input.t_()
+                # (2) data input mutations, like input.add_(1)
+                #     inputs that have both data and metadata mutated get lumped into (2).
+                #
+                # Why do we distinguish these two cases? aot autograd needs to handle them very differently.
+                # For data mutations, we return the updated inputs *directly* in the compiled forward graph.
+                # e.g.
+                # def f(x):
+                #     x.mul_(2)
+                #     out = x.mul(3)
+                #     return out
+                #
+                # // This function gets compiled and dumped inside of an autograd.Function.forward()
+                # def traced_forward(x):
+                #     x_updated = x.mul(2)
+                #     out = x_updated.mul(3)
+                #     return x_updated, out
+                #
+                # // The returned function will call the compiled forward, and apply input mutations afterwards
+                # def compiled_fn(x):
+                #    x_updated, out = traced_forward(x)
+                #    x.copy_(x_updated)
+                #    return out
+                #
+                # For input metadata mutations, though, we cannot return the "updated input" in the forward graph,
+                # Because it is an alias of an input. autograd.Function.forward can't handle arbitrary outputs that alias inputs.
+                # Instead, we stash the "updated input metadata" during tracing
+                # e.g.
+                # def f(x):
+                #     x.t_()
+                #     out = x.mul(3)
+                #     return out
+                #
+                # // This function gets compiled and dumped inside of an autograd.Function.forward()
+                # // (We don't return x_updated. Just return the original fw out)
+                # def traced_forward(x):
+                #     x_updated = x.t()
+                #     out = x_updated.mul(3)
+                #     return out
+                #
+                # // The returned function will call the compiled forward, and apply input mutations afterwards
+                # def compiled_fn(x):
+                #    out = traced_forward(x)
+                #    _x_updated_metadata = CompiledFunction.fw_metadata.metadata_mutation_input_info[0]
+                #    x.as_strided_(_x_updated_metadata.size(), _x_updated_metadata.stride(), _x_updated_metadata.storage_offset())
+                #    return out
+                if StorageWeakRef(arg.storage()) == StorageWeakRef(new_arg.storage()):
+                    # We can use the storage aliasing of the inputs and updated inputs
+                    # to detect when an input was actually updated, or just inplace-viewed.
+                    collect_mutated_input_info.append(MutationType.metadata_only)
+                else:
+                    collect_mutated_input_info.append(MutationType.data)
+                    # Only return mutated inputs that mutate *data*, not metadata
+                    # Note [Input mutation handling in aot autograd]
+                    inputs_with_mutated_data.append(new_arg)
+                    # For every mutated input, we ALSO need to return info on
+                    # whether than mutated input requires gradients. Why?
+                    # Our custom autograd.Function.forward returns updated inputs as outputs,
+                    collect_requires_grad_out_info.append(f_arg.requires_grad)
+            else:
+                collect_mutated_input_info.append(MutationType.none)
+
+            maybe_inputs_with_mutated_metadata.append(
+                new_arg if collect_mutated_input_info[-1] == MutationType.metadata_only else None)
+
+        def collect_grad_info(t):
+            # Collect info on which output tensors require gradients,
+            # so we can mark them properly in the returned autograd.Function.
+            # We only collect requires_grad info on real forward outputs, and not on inputs.
+            collect_requires_grad_out_info.append(isinstance(t, torch.Tensor) and t.requires_grad)
+
+        # Note [output alias handling in aot autograd]
+        # Given a function to compile where one of its outputs aliases an input,
+        # we need to remove that output from the compiled graph and generate it off to the side.
+        # e.g.
+        # def f(x):
+        #     return x.view(-1)
+        #
+        # Why? Two reasons:
+        # (1) If your autograd.Function returns a view on an input in the forward, autograd.Function
+        #     will not allow you to mutate it (This original came from arbitrary user code where the user might want to mutate)
+        # (2) There's no reason to compile views anyway. We can just regenerate the view of the input off to the side,
+        #
+        # Another interesting case is when you have both mutation and aliasing:
+        # def f(x):
+        #     x.mul_(2)
+        #     return x.view(-1)
+        #
+        # You could imagine that this output is now *safe* to compile and return in the autograd.Function,
+        # because after functionalization runs, it will technically not alias an input:
+        # def f_functionalized(x):
+        #     x_updated = x.mul(2)
+        #     return x_updated, x_updated.view(-1)
+        #
+        # However, this is still wrong: we can't return x_updated.view(-1) to the user. We are on the hook to return:
+        # def traced_forward(x):
+        #     x_updated = x.mul(2)
+        #     return x_updated
+        #
+        # def compiled_fn(x)
+        #     x_updated = traced_forward(x)
+        #     x.copy_(x_updated)
+        #     return x.view(-1)
+        #
+        # Why can't we return x_updated.view(-1) to the user?
+        # It can have different metadata from x.view(-1)! Specifically, the input x could be a non-memory-dense tensor,
+        # But the intermediate created by our graph, x_updated, will always be memory-dense.
+        def filter_and_record_aliased_outs(outputs):
+            # NOTE: this dict will clobber keys if we have multiple inputs that alias.
+            # Let's say inpA and inpB alias, and the user generated an output using out = inpA.view(...)
+            # For now, since we're not handling the case with multiple _base's sharing a storage,
+            # it is actually fine to arbitrarily pick which input to regenerate the aliased output from.
+            # e.g. out_new = inpB.as_strided(out.size(), out.stride(), out.storage_offset())
+            #
+            # This will be more complicated when you have multiple _base tensors aliasing the same
+            # underlying storage, when we eventually handle that.
+            # We'll need to ensure that we generate the view off of the right base.
+            inp_storage_refs = {StorageWeakRef(inpt.storage()): idx for idx, inpt in enumerate(flat_f_args)}
+            inp_tensor_ids = {id(inpt) for inpt in flat_f_args if isinstance(inpt, torch.Tensor)}
+            inp_storage_refs_set = set(inp_storage_refs)
+
+            non_aliased_input_outs = []
+            # For a given output tensor that alias an input, tells us:
+            # (1) the index of the input that we alias
+            # (2) Whether or not the output is a view of the input, or if `output is input`
+            #     (so we don't need to generate a view, and can return the input directly)
+            # Note: if the function returns an output that *is* an input, we still cannot return it in the graph.
+            # e.g.
+            #   def f(x):
+            #       x.add_(1)
+            #       return x
+            # Our compiled fw will return an "x_updated", but it is *not* ok to return that to the user.
+            # We need to manually do x.copy_(x_updated), and return the original x to the user.
+            # Why? for example, the metadata between x and x_updated might be different (e.g. _is_leaf())
+            aliased_out_idx: Dict[torch.Tensor, Tuple[int, bool]] = {}
+
+            for o in outputs:
+                # Note: When detecting input/output aliasing, we NEED to do it using the outer FunctionalTensorWrapper objects.
+                # In the case where we mutate an input *and* return a view of it, the outer wrappers will still alias,
+                # but the inner tensors no longer alias.
+                if isinstance(o, torch.Tensor) and StorageWeakRef(o.storage()) in inp_storage_refs:
+                    aliased_inp_idx = inp_storage_refs[StorageWeakRef(o.storage())]
+                    is_exact_input = id(o) in inp_tensor_ids
+                    aliases_intermediate_and_not_input = False
+                    aliased_out_idx[o] = (aliased_inp_idx, aliases_intermediate_and_not_input, is_exact_input)
+                else:
+                    # Only return outputs that are not aliases of inputs.
+                    non_aliased_input_outs.append(o)
+            # If a function involves creating a tensor, and returning a view of it, such that its _base is the intermediiate,
+            # We need to make sure our graph returns the _base as a graph output, and we manually recreate the view
+            # to return to the user. Why? The backend compiler is free to (incorrectly) not set requires_grad
+            # on the base tensor, but we are obligated to properly set requires-gradness on the real output.
+            non_aliased_outs = []
+            for i, o in enumerate(non_aliased_input_outs):
+                non_aliased_outs.append(o)
+
+            return non_aliased_outs, aliased_out_idx
+
+        non_aliased_outs, aliased_out_to_inp_idx = filter_and_record_aliased_outs(outs)
+
+        pytree.tree_map(collect_grad_info, non_aliased_outs)
+
+        # Calling convention: the output is (mutated_input_values, original_outs)
+        # We return all mutated inputs + outputs here, **except** for any mutated inputs or outputs
+        # that alias original inputs.
+        # See Note [Input mutation handling in aot autograd]
+        mutated_inps_and_outs = inputs_with_mutated_data + list(non_aliased_outs)
+
+        # Our compiled forward function will return:
+        # (1) non-aliased updated inputs
+        # (2) non-aliased fw outputs
+        # (3) size/stride/storage_offset metadata for updated aliased inputs
+        # (4) size/stride/storage_offset metadata for aliased outputs
+
+        start_idx_for_aliased_output_metadata = 0
+
+        # First, gather the metadata info on mutated inputs (this only applies to inputs with metadata-only mutations))
+        for i, maybe_aliased_updated_inp in enumerate(maybe_inputs_with_mutated_metadata):
+            if maybe_aliased_updated_inp is None:
+                collect_metadata_mutation_input_info.append(None)
+                continue
+            # Figure out where the sizes/strides/storage_offset are in the compiled fw output.
+            sizes_idx = start_idx_for_aliased_output_metadata
+            strides_idx = sizes_idx + len(maybe_aliased_updated_inp.size())
+            storage_offset_idx = strides_idx + len(maybe_aliased_updated_inp.stride())
+            # update our offset for the next tensor
+            start_idx_for_aliased_output_metadata = storage_offset_idx + 1
+            inp_info = InputAliasInfo(
+                base_idx=i,
+                sizes_idx=sizes_idx,
+                strides_idx=strides_idx,
+                storage_offset_idx=storage_offset_idx,
+                tensor_meta=maybe_aliased_updated_inp,
+            )
+            collect_metadata_mutation_input_info.append(inp_info)
+
+        # Next, gather the metadata info on the user's outputs that alias (either inputs or graph outputs)
+        num_non_input_aliased_outputs = 0
+        for o in outs:
+            maybe_alias_info = aliased_out_to_inp_idx.get(o, None) if isinstance(o, torch.Tensor) else None
+            if maybe_alias_info is None:
+                output_type = OutputType.non_alias
+                # Here, alias_idx will tell us which output from the inner forward this corresponds to.
+                alias_idx = num_non_input_aliased_outputs
+                sizes_idx = None
+                strides_idx = None
+                storage_offset_idx = None
+                tensor_meta = None
+            else:
+                input_alias_idx, is_alias_of_intermediate_not_input, is_exact_input = maybe_alias_info
+                if is_exact_input:
+                    assert not is_alias_of_intermediate_not_input
+                    output_type = OutputType.alias_of_input
+                    alias_idx = input_alias_idx
+                    sizes_idx = None
+                    strides_idx = None
+                    storage_offset_idx = None
+                    tensor_meta = None
+                else:
+                    if is_alias_of_intermediate_not_input:
+                        output_type = OutputType.alias_of_intermediate
+                        alias_idx = num_non_input_aliased_outputs
+                    else:
+                        output_type = OutputType.alias_of_input
+                        alias_idx = input_alias_idx
+                    tensor_meta = o
+                    # Figure out where the sizes/strides/storage_offset are in the compiled fw output.
+                    sizes_idx = start_idx_for_aliased_output_metadata
+                    strides_idx = sizes_idx + len(tensor_meta.size())
+                    storage_offset_idx = strides_idx + len(tensor_meta.stride())
+                    # update our offset for the next tensor
+                    start_idx_for_aliased_output_metadata = storage_offset_idx + 1
+
+            if output_type != OutputType.alias_of_input:
+                num_non_input_aliased_outputs += 1
+
+            inp_info = OutputAliasInfo(
+                output_type=output_type,
+                base_idx=alias_idx,
+                sizes_idx=sizes_idx,
+                strides_idx=strides_idx,
+                storage_offset_idx=storage_offset_idx,
+                tensor_meta=tensor_meta
+            )
+            collect_aliased_output_info.append(inp_info)
 
-        def from_fun(t):
-            if not isinstance(t, Tensor) or not torch._is_functional_tensor(t):
-                return t
-            torch._sync(t)
-            return torch._from_functional_tensor(t)
+        # This is the total number of size/stride/storage_offset metadata outputs that we return in the forward,
+        # used for regenerating aliases later.
+        num_aliasing_metadata_outs = start_idx_for_aliased_output_metadata
 
-        return pytree.tree_map(from_fun, outs)
+        assert len(collect_metadata_mutation_input_info) == len(collect_mutated_input_info)
+
+        assert len([x for x in collect_metadata_mutation_input_info if x is not None]) == len([
+            x for x in collect_mutated_input_info if x == MutationType.metadata_only
+        ])
+        assert len(collect_aliased_output_info) == len(outs)
+        assert len([x for x in collect_aliased_output_info if x.output_type != OutputType.alias_of_input]) == len(non_aliased_outs)
+
+
+        # Our autograd.Function.forward returns both mutated inputs and outputs,
+        # so we need grad info on all of them.
+        assert len(collect_requires_grad_out_info) == len(mutated_inps_and_outs)
+
+        metadata = ViewAndMutationMeta(
+            mutated_input_info=collect_mutated_input_info,
+            metadata_mutation_input_info=collect_metadata_mutation_input_info,
+            requires_grad_out_info=collect_requires_grad_out_info,
+            aliased_output_info=collect_aliased_output_info,
+        )
+        return metadata, pytree.tree_map(from_fun, mutated_inps_and_outs), num_aliasing_metadata_outs
     return inner
 
 
-# This creates a joint forwards-backwards function given both
+# This creates a functionalized joint forwards-backwards function given both
 # the primals (to run forwards) and tangents (to run backwards).
 #
-# It has a precondition which is that the passed in function
-# must be observationally pure; it is not permitted to mutate
-# the primals or tangents.
-def create_joint_forward_backward_pure(fn):
+# It uses the metadata that was created earlier to figure out what all of the outputs to the autograd.Function.forward are:
+# (1) Which inputs received data mutations (and need to be passed as outputs into autograd.grad())
+# (2) Which outputs are aliases of inputs (and should *not* be passed as outputs into autograd.grad())
+def create_joint_forward_backward_functionalized(
+    fn,
+    *,
+    meta: ViewAndMutationMeta,
+    synthetic_base_info: Optional[List[Union[int, Tuple[int, List[Any]]]]],
+):
+    # NOTE: when we have synthetic base inputs, we need to clone them *before* creating views off of them.
+    # This means that "idx" here represents the index of the (potentially) synthetic base.
+    # What we need to do is:
+    # (1) map the current (post-synthetic-base calling convention) input argument index
+    #     to int index pre-synthetic-base-calling-convention.
+    # (2) There could be multiple, if this index corresponds to a synthetic base
+    #     that has multiple input aliases.
+    # (3) If any of those corresponding inputs get metadata mutations, then we clone the base.
+    def maybe_to_fresh_input(idx, t):
+        if not isinstance(t, Tensor):
+            return t
+
+        if synthetic_base_info is None:
+            outer_aliased_indices_of_current_base_arg = [idx]
+        else:
+            outer_aliased_indices_of_current_base_arg = [
+                # For every argument index in the outer calling convention (before synthetic bases)
+                # find its index in the inner calling convention.
+                # if it matches the index of our current arg (idx), track the outer argument's index (i)
+                i for i, outer_idx_or_lambda in enumerate(synthetic_base_info)
+                if (isinstance(outer_idx_or_lambda, int) and outer_idx_or_lambda == idx)
+                or (isinstance(outer_idx_or_lambda, tuple) and outer_idx_or_lambda[0] == idx)
+            ]
+        if any(meta.mutated_input_info[i] == MutationType.data for i in outer_aliased_indices_of_current_base_arg):
+            # Make sure the primal we pass to autograd.grad()
+            # seees the tensor before the mutation
+            out = t.clone()
+        elif any(meta.mutated_input_info[i] == MutationType.metadata_only for i in outer_aliased_indices_of_current_base_arg):
+            # Make sure the primal we pass to autograd.grad()
+            # seees the tensor before the metadata mutation
+            out = t.view(t.shape)
+        else:
+            out = t
+        return out
+
+    def unpack_synthetic_bases(primals: List[Any]) -> List[Any]:
+        # This is only not None if our graph mutates a graph input that aliases another graph input.
+        if synthetic_base_info is None:
+            return primals
+
+        f_args_inner = []
+        for outer_idx_or_lambda in synthetic_base_info:
+            if isinstance(outer_idx_or_lambda, int):
+                f_args_inner.append(primals[outer_idx_or_lambda])
+            else:
+                outer_base_idx, strided_args = outer_idx_or_lambda
+                outer_base = primals[outer_base_idx]
+                # TODO: we could consider storing and executing view replay logic here,
+                # instead of a general as_strided() call.
+                # This could also improve perf, since today this will cause
+                # more as_strided_scatter() ops in the graph.
+                view_arg = outer_base.as_strided(*strided_args)
+                f_args_inner.append(view_arg)
+        return f_args_inner
+
     def joint_forward_backward(
         primals: List[Any], tangents: List[Any]
     ) -> Tuple[List[Any], List[Any]]:
-        # Call the forward pass
-        outs = fn(*primals)
+        # Call the forward pass, making sure to clone any inputs that are mutated first.
+        # We need to ensure that the inputs we pass to autograd.grad() are the *original*
+        # inputs, and not their mutated values.
+        primals_no_input_mutations = [maybe_to_fresh_input(i, t) for i, t in enumerate(primals)]
+        # This is also where we handle the calling convention around synthetic bases.
+        # We need to make sure that we convert any synthetic base arguments into views
+        # *after* we do the cloning above, to preserve the view relationship.
+        primals_ = unpack_synthetic_bases(primals_no_input_mutations)
+        assert len(meta.mutated_input_info) == len(primals_)
+        all_outs = fn(*primals_)
+        assert len(meta.aliased_output_info) == len(all_outs)
+
+        # Pass any (non-aliased) outputs in as tangents, since they'll be returned as outputs in the fw
+        # For outputs that are aliases of intermediates, we will have returned the output's _base as an output in the graph instead,
+        # which we *should* send to grad()
+        outputs_for_grad = [
+            x
+            # TODO: support ._base
+            # x._base if meta.aliased_output_info[i].output_type == OutputType.alias_of_intermediate else x
+            for (i, x) in enumerate(all_outs) if meta.aliased_output_info[i].output_type != OutputType.alias_of_input
+        ]
+        # Pass any (non-aliased) mutated inputs in as tangents, since they'll be returned as outputs in the fw
+        # Important: the traced joint fw/bw will return updated inputs with data mutations,
+        # but *not* with metadata mutations.
+        # Instead, we shunt the updated metadata around externally
+        # and update the input's metadata outside of the autograd.Function
+        mutated_inputs_for_grad = [x for (i, x) in enumerate(primals_) if meta.mutated_input_info[i] == MutationType.data]
+        mutated_inputs_and_outs_to_grad = mutated_inputs_for_grad + outputs_for_grad
+
+        metadata_mutated_inps = [x for (i, x) in enumerate(primals_) if meta.mutated_input_info[i] == MutationType.metadata_only]
+        # for user outputs that are aliases (either of inputs, or of graph intermediates)
+        # figure out what metadata to return in the forward, which is needed to regenerate the output aliases
+        aliased_outs = [x for (i, x) in enumerate(all_outs) if meta.aliased_output_info[i].output_type != OutputType.non_alias
+                        and meta.aliased_output_info[i].tensor_meta is not None]
+        output_metadata_for_fw = []
+        for curr_alias in metadata_mutated_inps + aliased_outs:
+            size_ = curr_alias.size()
+            stride_ = curr_alias.stride()
+            storage_offset_ = curr_alias.storage_offset()
+            # FX IR doesn't know about tuples, so we flatten the metadata into individual ints/symints,
+            # and index into the final output list later.
+            output_metadata_for_fw += (size_ + stride_ + (storage_offset_,))
+
+        # Take care to grab and sync the updated inputs from primals_ (the inputs we actually mutate!)
+        # and not primals (the preserved inputs, pre-mutation, that we pass to grad())
+        for i, arg in enumerate(primals_):
+            if not isinstance(arg, Tensor):
+                continue
+            torch._sync(arg)
+
         # Get the inputs that need gradients
         grad_primals = []
         inputs_needs_grads = []
+        # Note that we're not using primals_ here, being carefully not to pass any mutated inputs into autograd.grad()
         for p in primals:
             is_grad_tensor = isinstance(p, Tensor) and p.requires_grad
             inputs_needs_grads.append(is_grad_tensor)
@@ -240,13 +686,16 @@ def joint_forward_backward(
                 grad_primals.append(p)
 
         # Get the outputs that need gradients
-        assert len(tangents) == len(outs)
+        assert len(tangents) == len(mutated_inputs_and_outs_to_grad)
         needed_outs = []
         needed_tangents = []
-        for out, tangent in zip(outs, tangents):
+        for out, tangent in zip(mutated_inputs_and_outs_to_grad, tangents):
             if isinstance(out, Tensor) and out.requires_grad:
-                needed_outs.append(out)
-                needed_tangents.append(tangent)
+                # A bit sketchy, but fixes e.g. test_aot_autograd_exhaustive_matmul_cpu_float32
+                # The issue is that we are sensitive to decomps that don't accurately maintain
+                # their output's _base.shape compared to eager mode, and this helps mitigate a bit.
+                needed_outs.append(out if out.shape == tangent.shape else out.view(tangent.shape))
+                needed_tangents.append(tangent.requires_grad_(True))
 
         setup_stacktrace_preservation_hooks([out.grad_fn for out in needed_outs])
 
@@ -261,11 +710,40 @@ def joint_forward_backward(
                     allow_unused=True,
                 )
         backward_out_iter = iter(backward_out)
-        return outs, [
+        all_fw_outs = mutated_inputs_and_outs_to_grad + output_metadata_for_fw
+        return all_fw_outs, [
             next(backward_out_iter) if i else None for i in inputs_needs_grads
         ]
 
-    return joint_forward_backward
+    def to_fun(t):
+        if isinstance(t, Tensor):
+            return torch._to_functional_tensor(t, mirror_autograd_meta=True)
+        else:
+            return t
+
+    def from_fun(t):
+        if not isinstance(t, Tensor) or not torch._is_functional_tensor(t):
+            return t
+        torch._sync(t)
+        return torch._from_functional_tensor(t)
+
+    def functionalized_joint(
+        primals: List[Any], tangents: List[Any]
+    ) -> Tuple[List[Any], List[Any]]:
+
+        # Wrap inputs into functional wrappers
+        f_primals, f_tangents = pytree.tree_map(to_fun, (primals, tangents))
+        torch._enable_functionalization(reapply_views=True)
+        try:
+            # Run the joint
+            outs = joint_forward_backward(f_primals, f_tangents)
+        finally:
+            torch._disable_functionalization()
+
+        # Syncing of inputs/outputs was already done directly in the joint call
+        return pytree.tree_map(from_fun, outs)
+
+    return functionalized_joint
 
 
 def normalize_as_list(x):
@@ -401,6 +879,179 @@ def disable_autocast_manager():
     finally:
         del guard
 
+def are_differentiable_views(view1, view2):
+    if view1 is view2:
+        return True
+    if view1._base is None and view2._base is None:
+        return False
+    if view1._base is view2._base or view1._base is view2 or view1 is view2._base:
+        return True
+    return False
+
+def same_dtype_views(view1, view2):
+    if view1.dtype != view2.dtype:
+        return False
+    if view1._base is not None and view1.dtype != view1._base.dtype:
+        return False
+    if view2._base is not None and view2.dtype != view2._base.dtype:
+        return False
+    return True
+
+# Note [Handling mutations on an input that aliases other inputs]
+# The easiest example to show-case this edge case is here:
+#
+# def f(a, b):
+#     a.mul_(2)
+#     out = a + b
+#     return out
+#
+# In this situation, if a and b happened to be aliased, we need to trace something different!
+# Suppose we had b = a.view(-1)
+# (In this case, that means that `a._base is b`)
+#
+# We need to ensure that the aliasing relationship between a and b is preserved.
+# We do that detecting the specific situation above (mutate an input that aliases another input),
+# and when we do that, we create a synthetic base argument. Then inside of the traced forward,
+# we regenerate a and b off of that base.
+# The complete example of the transformed function looks like this:
+#
+# // The traced forward takes in a synthetic base, and regenerates the aliased inputs as views
+# // We could consider getting view-replay support here to minimize as_strided_scatter ops in the graph
+# def traced_forward(base):
+#     a = base.as_strided(...)
+#     b = base.as_strided(...)
+#     a_updated = a.mul(2)
+#     base_updated = torch.as_strided_scatter(base, a_updated, ...)
+#     b_updated = base_updated.as_strided(...)
+#     out = a_updated + b_updated
+#     return a_updated, out
+#
+# def compiled_fn(a, b):
+#     // we detect that a is the "differentiable base" here
+#     base = a
+#     // In other situations, we might do either:
+#     // (1) a and b are both views off of some larger differentiable base
+#     //     assert a._base is b._base and a._base is not None
+#     //     base = a._base
+#     // (2) a and b both don't require gradients. Create a base from the storage
+#     //     assert a._base is None and b._base is None
+#     //     base = torch.Tensor(a.storage())
+#     a_updated, out = traced_forward(base)
+#     a.copy_(a_updated)
+#     return out
+#
+# This function:
+# (1) Merges input views into a synthetic base argument, when any of those input views are mutated
+# (2) Returns metadata telling the autograd.Function how to modify their arguments properly,
+#     to respect the new calling convention.
+#
+# The calling convention is as follows.
+# Any inputs that were originally views of one another get yanked, and replaced with a synthetic base.
+# The argument list ordering goes [base1, ..., baseN], [arg1, ..., argN],
+# Where the ordering of the bases is determined from the ordering of the original view args.
+# baseA will come before baseB if the earliest original argument coming from baseA
+# showed up earlier in the argument list than the earliest original argument coming from baseB.
+#
+# Example, given some tensors a, b, c, d
+# call site:
+#   f(a, c.view(-1), b.view(-1), b, c, d)
+# Modified argument list:
+#   c_base comes first because the first c view came earlier in arg list than the first b view
+#   b_base = torch.Tensor(b.storage())
+#   c_base = torch.Tensor(c.storage())
+#   f(c_base, b_base, a, d)
+def merge_view_inputs(
+    fwd_inputs: List[Any],
+    mutated_input_info: List[MutationType]
+) -> Tuple[List[Any], Optional[List[Union[int, Tuple[int, Tuple[Any]]]]]]:
+    assert len(fwd_inputs) == len(mutated_input_info)
+    storage_ref_to_idx: Dict[StorageWeakRef, List[int]] = collections.defaultdict(list)
+    for i, inpt in enumerate(fwd_inputs):
+        if isinstance(inpt, Tensor):
+            storage_ref = StorageWeakRef(inpt.storage())
+            storage_ref_to_idx[storage_ref].append(i)
+    base_args = []
+    other_args = []
+    # This list contains metadata that tells you what the i'th argument in the inner calling convention should be.
+    # It's either:
+    # - another int (corresponding to the index in the argument list of the element from the outer calling convention)
+    # - idx, *args, where we can generate the new output with old_args[idx].as_strided(*args)
+    #   idx corresponds to which synthetic base from the outer calling context to view
+    inner_calling_convention_meta: Dict[int, Union[int, Tuple[int, List[Any]]]] = {}
+    for aliased_input_indices in storage_ref_to_idx.values():
+        if len(aliased_input_indices) > 1 and any(
+            # We only care about mutations that affect all aliases,
+            # so metadata mutations on an input doesn't require us to do synthetic base handling.
+            mutated_input_info[inpt_idx] == MutationType.data for inpt_idx in aliased_input_indices
+        ):
+            # We detected an input that was mutated, AND aliases with another input.
+            # we need to replace this set of aliased inputs with a single synthetic base.
+            # For now, I'm banning a bunch of cases. We expect dynamo to properly detect these cases
+            # and error out. We can fix them later.
+            for idx1, idx2 in zip(aliased_input_indices, aliased_input_indices[1:]):
+                view1 = fwd_inputs[idx1]
+                view2 = fwd_inputs[idx2]
+                # The "inputs that are aliased but have different differentiable bases" case
+                # is more complicated and hopefully pretty rare. Not currently handled.
+                assert are_differentiable_views(view1, view2), \
+                    "aot_autograd() does not yet handle non-differentiable view input mutations."
+                # Regenerating views when reinterpreting complex / real tensors seems non-trivial,
+                # not handling for now
+                assert same_dtype_views(view1, view2), \
+                    "aot_autograd() does not yet handle input mutations on views with different dtypes."
+            non_none_bases = [fwd_inputs[i]._base for i in aliased_input_indices if fwd_inputs[i]._base is not None]
+            aliases_with_none_bases = [fwd_inputs[i] for i in aliased_input_indices if fwd_inputs[i]._base is None]
+            if len(non_none_bases) == 0:
+                # Case where none of the aliases require gradients
+                example_idx = aliased_input_indices[0]
+                synthetic_base = torch.Tensor(fwd_inputs[example_idx].storage())
+            else:
+                # Case where all of the aliases require gradients, and have the same _base.
+                synthetic_base = non_none_bases[0]
+                for other_base in non_none_bases[1:]:
+                    assert other_base is synthetic_base, \
+                        "aot_autograd() does not yet handle non-differentiable view input mutations."
+                for alias in aliases_with_none_bases:
+                    assert alias is synthetic_base, "aot_autograd() does not yet handle non-differentiable view input mutations."
+            base_args.append(synthetic_base)
+            for curr_view_idx in aliased_input_indices:
+                curr_view = fwd_inputs[curr_view_idx]
+                base_idx = len(base_args) - 1
+                size_ = curr_view.size()
+                stride_ = curr_view.stride()
+                storage_offset_ = curr_view.storage_offset()
+                # We store just enough info here so that we can regenerate the view later.
+                # Regeneration: args[base_idx].as_strided(size_, stride_, storage_offset_)
+                # If we want view replay instead of as_strided() calls, this will need to change.
+                inner_calling_convention_meta[curr_view_idx] = (base_idx, (size_, stride_, storage_offset_))
+        else:
+            for curr_idx in aliased_input_indices:
+                other_args.append(fwd_inputs[curr_idx])
+    if len(base_args) == 0:
+        assert len(other_args) == len(fwd_inputs)
+        # If no synthetic bases are necessary, just return the original inputs.
+        return fwd_inputs, None
+    else:
+        # Otherwise, return:
+        # (1) The new args according to the updated calling convention: (synthetic_bases, other_args)
+        # (2) Metadata telling functionalization how to generate the inner argument list given the outer calling convention.
+        #     We post-process it into a list, where meta[i] tells you info about the i'th argument in the inner calling convention.
+        args_to_functionalization = base_args + other_args
+        arg_to_old_idx_map = {arg: i for (i, arg) in enumerate(fwd_inputs)}
+        for i, other_arg in enumerate(other_args):
+            new_idx = len(base_args) + i
+            old_idx = arg_to_old_idx_map[other_arg]
+            inner_calling_convention_meta[old_idx] = new_idx
+        # post process into a list
+        post_processed_calling_convention_meta: List[Union[int, Callable]] = [-1 for _ in range(len(inner_calling_convention_meta))]
+        for k, v in inner_calling_convention_meta.items():
+            post_processed_calling_convention_meta[k] = v
+        # Quick assert: every argument in the inner calling convention should be accounted for.
+        for x in post_processed_calling_convention_meta:
+            assert x != -1
+        return args_to_functionalization, post_processed_calling_convention_meta
+
+
 
 def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
     # Deduplicate inputs.  Suppose you have:
@@ -457,40 +1108,63 @@ def add_dupe_args(args):
 
     deduped_flat_args = remove_dupe_args(flat_args)
 
-    joint_forward_backward = create_joint_forward_backward_pure(lambda *args: flat_fn(*add_dupe_args(args)))
+    _fw_metadata, out, _num_aliasing_metadata_outs = run_functionalized_fw_and_collect_metadata(
+        lambda *args: flat_fn(*(add_dupe_args(args))),
+    )(*deduped_flat_args)
 
-    out = flat_fn(*flat_args)
-    # Collect info on which output tensors require gradients,
-    # so we can mark them properly in the returned autograd.Function
-    _flat_outs_not_requiring_grad, _ = pytree.tree_flatten(
-        pytree.tree_map(
-            lambda x: isinstance(x, Tensor) and not x.requires_grad, out
-        )
-    )
+    # pre-compute, so we can bail out quickly in the hotpath
+    _num_outputs_aliased_to_inputs = len([
+        x for x in _fw_metadata.aliased_output_info if x.output_type == OutputType.alias_of_input])
+    _num_outputs_aliased_to_intermediates = len([
+        x for x in _fw_metadata.aliased_output_info if x.output_type == OutputType.alias_of_intermediate])
+    _num_mutated_data_inputs = len([x for x in _fw_metadata.mutated_input_info if x == MutationType.data])
+    _num_mutated_metadata_only_inputs = len([x for x in _fw_metadata.metadata_mutation_input_info if x is not None])
+    _num_mutated_inputs = _num_mutated_data_inputs + _num_mutated_metadata_only_inputs
+
+    if isinstance(out, (list, tuple)):
+        _num_non_aliased_outs = len(out[_num_mutated_data_inputs:])
+    else:
+        _num_non_aliased_outs = 1
+    assert len(_fw_metadata.requires_grad_out_info) == _num_mutated_data_inputs + _num_non_aliased_outs
+
+    # out here corresponds to the set of outputs that should be returned by the traced forward call.
+    # It includes outputs of the original forward, *and* any updated inputs due to input mutations.
+    # However, it does *not* include any outputs that are aliases of inputs, or any metadata-only input mutations.
     out = pytree.tree_map(
         lambda x: x.detach().contiguous() if isinstance(x, Tensor) else x,
         out,
     )
 
-    if isinstance(out, (list, tuple)):
-        _num_outs = len(out)
-    else:
-        _num_outs = 1
+    # This code only executes if we have graph inputs that alias each other, and one of those inputs
+    # gets its data mutated.
+    # When that happens, we replace the aliased inputs with a synthetic base, and in the traced forward
+    # we later generate the input views
+    deduped_flat_args_with_views_handled, _synthetic_base_info = merge_view_inputs(
+        deduped_flat_args, _fw_metadata.mutated_input_info)
+
+    joint_forward_backward = create_joint_forward_backward_functionalized(
+        lambda *args: flat_fn(*add_dupe_args(args)),
+        meta=_fw_metadata,
+        synthetic_base_info=_synthetic_base_info,
+    )
 
-    joint_inputs = (deduped_flat_args, out)
+    joint_inputs = (deduped_flat_args_with_views_handled, out)
 
     disable_amp = torch._C._is_any_autocast_enabled()
 
     if config.use_functionalize:
         with enable_python_dispatcher():
+            flattened_joints, _ = pytree.tree_flatten(joint_inputs)
             fx_g = make_fx(
-                detach_and_functionalize_pure(joint_forward_backward), aot_config.decompositions
+                joint_forward_backward, aot_config.decompositions
             )(*joint_inputs)
         fx_g.graph.eliminate_dead_code()
         fx_g.recompile()
     else:
-        warnings.warn("graph partitioning without functionalization is not sound, we may introduce errors")
-        fx_g = make_fx(joint_forward_backward, aot_config.decompositions)(*joint_inputs)
+        # joint_forward_backward() now always runs with functionalization, and factoring it out
+        # to make that toggleable is a bit painful.
+        # aot autograd without functionalization is wrong anyway, so we error.
+        raise AssertionError("Graph partitioning without functionalization is not sound, we may introduce errors")
 
     if config.debug_joint:
         print("====== Joint graph ======")
@@ -498,13 +1172,15 @@ def add_dupe_args(args):
 
     with torch.no_grad():
         with track_graph_compiling("joint"):
-            fw_module, bw_module = aot_config.partition_fn(fx_g, joint_inputs)
+            num_inner_fwd_outputs = _num_mutated_data_inputs + _num_non_aliased_outs + _num_aliasing_metadata_outs
+            fw_module, bw_module = aot_config.partition_fn(
+                fx_g, joint_inputs, num_fwd_outputs=num_inner_fwd_outputs)
             fw_outs = [n for n in fw_module.graph.nodes if n.op == "output"][0].args[0]
             # we only need to bookkeep the symints that are saved for bw, not any symints
             # the user forward might have returned in its own output
-            fw_outs = fw_outs[_num_outs:]
-            symint_outs = [n for n in fw_outs if is_sym_node(n)]
-            _num_symints = len(symint_outs)
+            fw_outs_saved_for_bw = fw_outs[num_inner_fwd_outputs:]
+            symint_outs_saved_for_bw = [n for n in fw_outs_saved_for_bw if is_sym_node(n)]
+            _num_symints_saved_for_bw = len(symint_outs_saved_for_bw)
 
         if config.debug_graphs:
             print("====== Forward graph ======")
@@ -513,41 +1189,106 @@ def add_dupe_args(args):
             bw_module.print_readable()
 
         with track_graph_compiling("forward"):
-            compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args)
+            compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args_with_views_handled)
 
     class CompiledFunction(torch.autograd.Function):
         compiled_fw = compiled_fw_func
         compiled_bw = None
-        num_outs = _num_outs
-        num_symints = _num_symints
-        flat_outs_not_requiring_grad = _flat_outs_not_requiring_grad
+        # Corresponds to number of outs (not including updated inputs returns as outs),
+        # *and* not including outs that are aliases of inputs
+        num_non_aliased_outs = _num_non_aliased_outs
+        num_symints_saved_for_bw = _num_symints_saved_for_bw
+        # Corresponds to number of inputs that are mutated (both metadata only, and data)
+        num_mutated_inputs = _num_mutated_inputs
+        # Corresponds to number of inputs that only have their metadata mutated
+        num_mutated_data_inputs = _num_mutated_data_inputs
+        # Corresponds to number of inputs that get their metadata (but not data) mutated
+        # We don't return these in the compiled fw, and instead we stash enough info
+        # to replay the metadata mutations later.
+        num_mutated_metadata_only_inputs = _num_mutated_metadata_only_inputs
+        # Corresponds to number of outputs in the original fw that are aliases of inputs
+        # (These are all not returned by the compiled forward, and instead they are manually
+        # created in the epilogue)
+        num_outputs_aliased_to_inputs = _num_outputs_aliased_to_inputs
+        # Corresponds to the number of user outputs that alias intermediates (aka graph outputs).
+        num_outputs_aliased_to_intermediates = _num_outputs_aliased_to_intermediates
+        # For every output that aliases and input, and every input that gets only its metadata mutated,
+        # we return that tensor's size/stride/storage_offset directly at the end of the compiled forward,
+        # as a big list of ints.
+        # The number is tracked here.
+        num_aliasing_metadata_outs = _num_aliasing_metadata_outs
+        synthetic_base_info = _synthetic_base_info
+        fw_metadata = _fw_metadata
 
         @staticmethod
         @disable_torchdynamo
         def forward(ctx, *deduped_flat_tensor_args):
+
+            # There is a pretty complicated calling convention around what the compiled fw returns.
+            # The full list of outputs and their relative order is:
+            # (*mutated_data_inputs, *non_aliased_fw_outs, *saved_tensors, *saved_symints)
+            # - Note that in the synthetic bases case, mutated_inputs will correspond to an updated version
+            #   of the original view, and not the synthetic base
             fw_outs = call_func_with_args(
                 CompiledFunction.compiled_fw, deduped_flat_tensor_args, disable_amp=disable_amp
             )
-            num_outs = CompiledFunction.num_outs
-            num_symints = CompiledFunction.num_symints
+
+            num_non_aliased_outs = CompiledFunction.num_non_aliased_outs
+            num_aliasing_metadata_outs = CompiledFunction.num_aliasing_metadata_outs
+            num_symints_saved_for_bw = CompiledFunction.num_symints_saved_for_bw
+            num_mutated_data_inputs = CompiledFunction.num_mutated_data_inputs
+            # Our forward() returns both (mutated_inputs, outputs, output_alias_meta, saved_tensors, saved_symints)
+            num_forward_returns = num_mutated_data_inputs + num_non_aliased_outs + num_aliasing_metadata_outs
+            num_forward_returns_not_including_alias_meta = num_mutated_data_inputs + num_non_aliased_outs
+
             # Partitioners must put symint arguments at the end separate from tensor arguments
-            if num_symints > 0:
-                ctx.save_for_backward(*fw_outs[num_outs:-num_symints])
-                ctx.symints = fw_outs[-num_symints:]
+            if num_symints_saved_for_bw > 0:
+                tensors_saved_for_backwards = fw_outs[num_forward_returns:-num_symints_saved_for_bw]
+                assert all([isinstance(x, torch.Tensor) for x in tensors_saved_for_backwards])
+                ctx.save_for_backward(*tensors_saved_for_backwards)
+                symint_outs = fw_outs[-num_symints_saved_for_bw:]
+                assert all([isinstance(x, (int, float, torch.SymInt, torch.SymFloat)) for x in symint_outs])
+                ctx.symints = symint_outs
             else:
-                ctx.save_for_backward(*fw_outs[num_outs:])
+                ctx.save_for_backward(*fw_outs[num_forward_returns:])
                 ctx.symints = []
 
             fw_outs_not_requiring_grad = [
-                x for (i, x) in enumerate(fw_outs[0:num_outs]) if CompiledFunction.flat_outs_not_requiring_grad[i]
+                x for (i, x) in enumerate(fw_outs[:num_forward_returns_not_including_alias_meta])
+                if isinstance(x, torch.Tensor) and not CompiledFunction.fw_metadata.requires_grad_out_info[i]
             ]
+            fw_out_ids_requiring_grad = [
+                id(x) for (i, x) in enumerate(fw_outs[:num_forward_returns_not_including_alias_meta])
+                if isinstance(x, torch.Tensor) and CompiledFunction.fw_metadata.requires_grad_out_info[i]
+            ]
+
             ctx.mark_non_differentiable(*fw_outs_not_requiring_grad)
 
-            return tuple(fw_outs[0:num_outs])
+            return tuple(fw_outs[0:num_forward_returns])
 
         @staticmethod
         @disable_torchdynamo
-        def backward(ctx, *flat_args):
+        def backward(ctx, *all_flat_args):
+            # Calling convention: we expect a grad_out passed to the backward:
+            # - for every output of the fw that does *not* alias an input
+            # - for every updated_input generated by the fw that does *not* alias an input
+            # - for every size/stride metadata value for aliased outputs.
+            #   These are returned by the forward, but we just drop them in the backward.
+            #   We need to return them in the forward, but unfortunately there's no way to specify
+            #   in autograd.Function that certain non-tensor forward outputs shouldn't show up in the backward.
+            expected_grad_outs = CompiledFunction.num_non_aliased_outs + CompiledFunction.num_mutated_data_inputs
+            if CompiledFunction.num_aliasing_metadata_outs > 0:
+                flat_args = all_flat_args[:-CompiledFunction.num_aliasing_metadata_outs]
+                metadata_args = all_flat_args[-CompiledFunction.num_aliasing_metadata_outs:]
+                # metadata args are all ints/symints, which autograd will send Nones for as grad_outputs in the bw
+                assert all([x is None for x in metadata_args])
+                # delete
+                # for out_idx, (base_sizes, base_strides, base_storage_offset) in CompiledFunctions.fw_out_base_metadata.items():
+
+            else:
+                flat_args = all_flat_args
+
+            assert len(flat_args) == expected_grad_outs
             contiguous_args = [t.contiguous() if torch.is_tensor(t) else t for t in flat_args]
             all_args = list(ctx.symints) + list(ctx.saved_tensors) + list(contiguous_args)
             del contiguous_args
@@ -567,7 +1308,136 @@ def backward(ctx, *flat_args):
 
     @wraps(CompiledFunction.apply)
     def compiled_function(*args):
-        return CompiledFunction.apply(*remove_dupe_args(args))
+        # Step 1: remove dupe args
+        no_dupe_args = remove_dupe_args(args)
+
+        # Step 2: remove aliased inputs that are mutated, replace with synthetic bases
+        # Only happens if our graph mutates an input that aliases another input.
+        if CompiledFunction.synthetic_base_info is not None:
+            # Given: the original args, including at least one pair of inputs that are aliased
+            # and get subsequently mutated.
+            # Generate: the updated args, including (potentially multiple) synthetic bases
+            # that replace the views. The input views are regenerated manually in the compiled function.
+            # TODO: think harder about what happens if (a view of) one of these mutated input views is ALSO returned
+            new_inputs, metadata = merge_view_inputs(no_dupe_args, CompiledFunction.fw_metadata.mutated_input_info)
+            # We're just re-running the original-args-to-synthetic-base transformation
+            # that we ran during compilation.
+            # This returns metadata that we use during tracing to recover the input views,
+            # which we don't actually need at runtime.
+            assert metadata is not None
+            no_dupe_args_with_synthetic_bases = new_inputs
+        else:
+            no_dupe_args_with_synthetic_bases = no_dupe_args
+
+        all_outs = CompiledFunction.apply(*no_dupe_args_with_synthetic_bases)
+        if CompiledFunction.num_aliasing_metadata_outs > 0:
+            outs = all_outs[:-CompiledFunction.num_aliasing_metadata_outs]
+            aliasing_metadata_outs = all_outs[-CompiledFunction.num_aliasing_metadata_outs:]
+        else:
+            outs = all_outs
+            aliasing_metadata_outs = []
+
+        assert len(all_outs) == CompiledFunction.num_mutated_data_inputs + CompiledFunction.num_non_aliased_outs \
+            + CompiledFunction.num_aliasing_metadata_outs
+
+        # Step 3: After running the compiled fw, apply updates to mutated inputs
+        if CompiledFunction.num_mutated_inputs > 0:
+            # Calling convention: (mutated_inputs, real_outs, aliasing_metadata)
+
+            if CompiledFunction.num_mutated_data_inputs > 0:
+                updated_inputs = outs[:CompiledFunction.num_mutated_data_inputs]
+                fw_outs = outs[CompiledFunction.num_mutated_data_inputs:]
+            else:
+                updated_inputs = []
+                fw_outs = outs
+
+            curr_mutated_inpt_idx = 0
+            for inpt_idx, (mutation_type, metadata_mutation_info) in enumerate(zip(
+                # TODO: I should merge these two pieces of state
+                CompiledFunction.fw_metadata.mutated_input_info,
+                CompiledFunction.fw_metadata.metadata_mutation_input_info,
+            )):
+                if mutation_type == MutationType.none:
+                    continue
+                original_inpt = no_dupe_args[inpt_idx]
+                if mutation_type == MutationType.metadata_only:
+                    # We need to grab the size/stride/storage_offset from the compiled forward,
+                    # and use that to mutate the metadata of the input
+                    expected_meta = CompiledFunction.fw_metadata.metadata_mutation_input_info[inpt_idx]
+                    assert expected_meta is not None
+                    fake_meta = expected_meta.tensor_meta
+                    size_len = len(fake_meta.size())
+                    stride_len = len(fake_meta.stride())
+                    size_ = aliasing_metadata_outs[expected_meta.sizes_idx:expected_meta.sizes_idx + size_len]
+                    stride_ = aliasing_metadata_outs[expected_meta.strides_idx:expected_meta.strides_idx + stride_len]
+                    storage_offset_ = aliasing_metadata_outs[expected_meta.storage_offset_idx]
+                    original_inpt.as_strided_(size_, stride_, storage_offset_)
+                else:
+                    updated_inpt = updated_inputs[curr_mutated_inpt_idx]
+                    curr_mutated_inpt_idx += 1
+                    # TODO: handle resize_() on inputs to a larger size.
+                    # This is actually non-trivial to detect, so we should probably just handle it
+                    # (or make dynamo detect).
+                    # We can't just check of original_inpt.storage_size != updated_inpt.storage_size,
+                    # Because the original_inpt might be a view of some larger tensor,
+                    # and updated_inpt is always densely packed.
+                    if original_inpt.size() != updated_inpt.size() \
+                            or original_inpt.stride() != updated_inpt.stride() \
+                            or original_inpt.storage_offset() != updated_inpt.storage_offset():
+                        # Functionalization can't easily tell us if an input had BOTH its metadata actual data mutated.
+                        # So we check if metadata needs to be mutated here manually.
+                        original_inpt.as_strided_(updated_inpt.size(), updated_inpt.stride(), updated_inpt.storage_offset())
+                    original_inpt.copy_(updated_inpt)
+        else:
+            fw_outs = outs
+
+        # Step 4: Manually regenerate any outputs that are aliased to inputs, instead of
+        # compiling them.
+        if CompiledFunction.num_outputs_aliased_to_inputs > 0 or CompiledFunction.num_outputs_aliased_to_intermediates > 0:
+            assert CompiledFunction.num_outputs_aliased_to_inputs + len(fw_outs) == \
+                len(CompiledFunction.fw_metadata.aliased_output_info)
+            fw_outs_including_aliases = []
+            for aliased_out_metadata in CompiledFunction.fw_metadata.aliased_output_info:
+                if aliased_out_metadata.output_type == OutputType.non_alias:
+                    fw_outs_including_aliases.append(fw_outs[aliased_out_metadata.base_idx])
+                else:
+                    if aliased_out_metadata.output_type == OutputType.alias_of_input:
+                        aliased_base_tensor = args[aliased_out_metadata.base_idx]
+                    else:
+                        assert aliased_out_metadata.output_type == OutputType.alias_of_intermediate
+                        aliased_base_tensor = fw_outs[aliased_out_metadata.base_idx]
+                    # Note: here, we manually regenerate the output, using an as_strided() call,
+                    # OR if the aliased output came from a custom autograd.function, we replay it.
+                    # The as_strided() in the normal case is good for perf (this is hot-path code,
+                    # and we're consolidating potential chains of views into a single view op).
+                    # But we might need to figure out view replaying for e.g. XLA.
+                    # TODO: handle the custom autograd function case here.
+                    # We need a way to check whether a tensor came from a custom autograd fn from python,
+                    # AND a way to replay that custom view fn.
+                    fake_meta = aliased_out_metadata.tensor_meta
+                    if fake_meta is None:
+                        # This handles the specific case where the user returns an output that *was* an input. Don't create a view.
+                        fw_outs_including_aliases.append(aliased_base_tensor)
+                    else:
+                        # We need to grab the size/stride/storage_offset from the compiled forward,
+                        # and use that to create a view off of the right input
+                        fake_meta = aliased_out_metadata.tensor_meta
+                        size_len = len(fake_meta.size())
+                        stride_len = len(fake_meta.stride())
+                        size_ = aliasing_metadata_outs[aliased_out_metadata.sizes_idx:aliased_out_metadata.sizes_idx + size_len]
+                        stride_ = aliasing_metadata_outs[
+                            aliased_out_metadata.strides_idx:aliased_out_metadata.strides_idx + stride_len]
+                        storage_offset_ = aliasing_metadata_outs[aliased_out_metadata.storage_offset_idx]
+                        # Create the output alias
+                        aliased_out = gen_alias_from_base(aliased_base_tensor, size_, stride_, storage_offset_, fake_meta)
+                        fw_outs_including_aliases.append(aliased_out)
+
+            for inner_out, user_out in zip(fw_outs, fw_outs_including_aliases):
+                # Sanity check assert
+                assert type(inner_out) == type(user_out)
+            return fw_outs_including_aliases
+        else:
+            return fw_outs
 
     return compiled_function
 
@@ -686,8 +1556,6 @@ def unflatten(self, x):
             return x
         return pytree.tree_unflatten(x, self.spec)
 
-KNOWN_TYPES = [torch.Tensor, int, str, float, bool, torch.SymInt, torch.SymFloat]
-
 
 def aot_function(
     fn: Callable,
diff --git a/functorch/_src/partitioners.py b/functorch/_src/partitioners.py
index c82afe65787bd..e12840f696b73 100644
--- a/functorch/_src/partitioners.py
+++ b/functorch/_src/partitioners.py
@@ -84,16 +84,15 @@ def _is_tangent(node):
     return node.op == "placeholder" and "tangents" in node.target
 
 
-def _extract_fwd_bwd_outputs(joint_module: fx.GraphModule):
-    num_fwd_outputs = joint_module._out_spec.children_specs[0].num_leaves
+def _extract_fwd_bwd_outputs(joint_module: fx.GraphModule, *, num_fwd_outputs):
     outputs = pytree.tree_flatten([node.args for node in joint_module.graph.nodes if node.op == 'output'])[0]
     fwd_outputs = outputs[:num_fwd_outputs]
     bwd_outputs = outputs[num_fwd_outputs:]
     return fwd_outputs, bwd_outputs
 
 
-def _extract_fwd_bwd_modules(joint_module: fx.GraphModule, saved_values, saved_sym_nodes=()):
-    fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module)
+def _extract_fwd_bwd_modules(joint_module: fx.GraphModule, saved_values, saved_sym_nodes=(), *, num_fwd_outputs):
+    fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs)
     primal_inputs = list(filter(_is_primal, joint_module.graph.nodes))
     tangent_inputs = list(filter(_is_tangent, joint_module.graph.nodes))
     # Construct the forward module
@@ -125,7 +124,7 @@ def _extract_fwd_bwd_modules(joint_module: fx.GraphModule, saved_values, saved_s
 
 
 def default_partition(
-    joint_module: fx.GraphModule, _joint_inputs
+    joint_module: fx.GraphModule, _joint_inputs, *, num_fwd_outputs
 ) -> Tuple[fx.GraphModule, fx.GraphModule]:
     """
     Partitions the :attr:`joint_module` in a manner that closely resembles the
@@ -151,7 +150,7 @@ def default_partition(
         Returns the generated forward and backward Fx graph modules.
     """
     primal_inputs = list(filter(_is_primal, joint_module.graph.nodes))
-    fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module)
+    fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs)
     forward_only_graph = _extract_graph_with_inputs_outputs(joint_module.graph, primal_inputs, fwd_outputs)
     forward_node_names = {node.name for node in forward_only_graph.nodes if node.op != 'output'}
     saved_values = []
@@ -178,7 +177,7 @@ def default_partition(
     saved_values = list(set(saved_values))
     saved_sym_nodes = list(set(saved_sym_nodes))
 
-    return _extract_fwd_bwd_modules(joint_module, saved_values, saved_sym_nodes=saved_sym_nodes)
+    return _extract_fwd_bwd_modules(joint_module, saved_values, saved_sym_nodes=saved_sym_nodes, num_fwd_outputs=num_fwd_outputs)
 
 
 def _prod(x):
@@ -249,6 +248,7 @@ def _count_ops(graph):
 
 def min_cut_rematerialization_partition(
     joint_module: fx.GraphModule, _joint_inputs, compiler="nvfuser", recomputable_ops=None,
+    *, num_fwd_outputs
 ) -> Tuple[fx.GraphModule, fx.GraphModule]:
     """
     Partitions the joint graph such that the backward recomputes the forward.
@@ -270,6 +270,7 @@ def min_cut_rematerialization_partition(
         recomputable_ops: This is an optional set of recomputable ops. If this
             is not None, then this set of ops will be used instead of the
             default set of ops.
+        num_fwd_outputs: The number of outputs from the forward graph.
 
     Returns:
         Returns the generated forward and backward Fx graph modules.
@@ -281,6 +282,7 @@ def min_cut_rematerialization_partition(
 
     joint_module.graph.eliminate_dead_code()
     joint_module.recompile()
+
     fx_g = joint_module.graph
 
     #  add the CSE pass
@@ -302,15 +304,30 @@ def classify_nodes(joint_module):
                     required_bw_nodes.add(user)
 
         primal_inputs = list(filter(_is_primal, joint_module.graph.nodes))
-        fwd_outputs, _ = _extract_fwd_bwd_outputs(joint_module)
+        fwd_outputs, _ = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs)
         forward_only_graph = _extract_graph_with_inputs_outputs(joint_module.graph, primal_inputs, fwd_outputs)
         required_fw_nodes = {name_to_node[node.name] for node in forward_only_graph.nodes
                              if node.op != 'output'}
         unclaimed_nodes = {node for node in joint_module.graph.nodes
                            if node not in required_fw_nodes and node not in required_bw_nodes}
-        return required_fw_nodes, required_bw_nodes, unclaimed_nodes
+        return fwd_outputs, required_fw_nodes, required_bw_nodes, unclaimed_nodes
+
+    orig_fw_outputs, required_fw_nodes, required_bw_nodes, unclaimed_nodes = classify_nodes(joint_module)
+
+    def is_tensor_node(x):
+        # When dynamic shapes are not enabled, fw outputs can be raw ints and not fx nodes
+        if not isinstance(x, fx.Node):
+            return False
+        # It would be nice if we could guarantee that all fx nodes from make_fx get a 'val'
+        # key in their meta dict, but that isn't always true today (see proxy_tensor.py)
+        return 'tensor_meta' in x.meta or ('val' in x.meta and isinstance(x.meta['val'], torch.Tensor))
+
+    # networkx blows up on graphs with no tensor outputs.
+    # Since there's nothing to partition anyway, and the default partitioner can "handle"
+    # this case, send our graph over to the default partitioner.
+    if not any(is_tensor_node(x) for x in orig_fw_outputs):
+        return default_partition(joint_module, _joint_inputs, num_fwd_outputs=num_fwd_outputs)
 
-    required_fw_nodes, required_bw_nodes, unclaimed_nodes = classify_nodes(joint_module)
     for node in reversed(joint_module.graph.nodes):
         if node not in required_fw_nodes:
             node.dist_from_bw = 0
@@ -443,7 +460,8 @@ def get_node_weight(node) -> int:
     # save_for_backward on tensors and stashes symints in autograd .ctx
     saved_sym_nodes = list(filter(lambda n: is_sym_node(n), saved_values))
     saved_values = list(filter(lambda n: not is_sym_node(n), saved_values))
-    fw_module, bw_module = _extract_fwd_bwd_modules(joint_module, saved_values, saved_sym_nodes=saved_sym_nodes)
+    fw_module, bw_module = _extract_fwd_bwd_modules(
+        joint_module, saved_values, saved_sym_nodes=saved_sym_nodes, num_fwd_outputs=num_fwd_outputs)
     if AOT_PARTITIONER_DEBUG:
         print("Theoretical Activations Stored: ", sum([_size_of(i) for i in saved_values]) / 1e9)
         fw_module_nodes = set([node.name for node in fw_module.graph.nodes if node.op == 'call_function'])
diff --git a/test/dynamo/test_aot_autograd.py b/test/dynamo/test_aot_autograd.py
index 1532267a043d7..fe81a23cc3399 100644
--- a/test/dynamo/test_aot_autograd.py
+++ b/test/dynamo/test_aot_autograd.py
@@ -59,7 +59,7 @@ def fn(param, y):
         compiler_fn = functools.partial(compiler_safe_fn, is_safe=is_safe)
         aot_fn = torch._dynamo.optimize(compiler_fn)(fn)
         aot_fn(x, y)
-        self.assertTrue(not is_safe[0])
+        self.assertTrue(is_safe[0])
 
     def test_mutation1(self):
         def fn(_stack0: torch.Tensor, diagonal_chunked_attention_scores: torch.Tensor):
@@ -88,7 +88,7 @@ def fn(_stack0: torch.Tensor, diagonal_chunked_attention_scores: torch.Tensor):
         compiler_fn = functools.partial(compiler_safe_fn, is_safe=is_safe)
         aot_fn = torch._dynamo.optimize(compiler_fn)(fn)
         aot_fn(x, y)
-        self.assertTrue(not is_safe[0])
+        self.assertTrue(is_safe[0])
 
     def test_negative_testing_mutation(self):
         def fn(_stack0: torch.Tensor, diagonal_chunked_attention_scores: torch.Tensor):
@@ -202,7 +202,7 @@ def forward(self, x, y):
         compiler_fn = functools.partial(compiler_safe_fn, is_safe=is_safe)
         aot_fn = torch._dynamo.optimize(compiler_fn)(graph)
         aot_fn(x, y)
-        self.assertTrue(not is_safe[0])
+        self.assertTrue(is_safe[0])
 
     def test_call_fn_with_non_const_inputs_aot_unsafe_control_flow(self):
         class ModuleSpecialFwd(torch.nn.Module):
diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_aot_cudagraphs.py
index fdb7c88762b8b..5b2e6eb2f9eac 100644
--- a/test/dynamo/test_aot_cudagraphs.py
+++ b/test/dynamo/test_aot_cudagraphs.py
@@ -105,7 +105,6 @@ def fn(x, y):
         fn(x, y)
 
     @patch("functorch._src.config.use_functionalize", True)
-    @patch_all(ok=False)  # input mutation not supported yet
     def test_mutate_input(self):
         def model(x, y):
             y.add_(3)
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 648dc04dc5226..fe6fe461ed672 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -6,6 +6,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from typing import Union, Callable, List, Any
 from unittest.mock import patch
 from torch.testing._internal.common_utils import TestCase, run_tests, IS_ARM64, IS_WINDOWS
 import torch
@@ -231,8 +232,8 @@ def f(x):
         self.assertEqual(grads, grads2)
 
 
-def _outs_and_grads(fn, inps):
-    outs = fn(*inps)
+def _outs_and_grads(fn, graph_inps, inps):
+    outs = fn(*graph_inps)
     for out in pytree.tree_flatten(outs)[0]:
         if isinstance(out, torch.Tensor) and out.requires_grad:
             out.sum().backward(retain_graph=True)
@@ -243,14 +244,58 @@ def _outs_and_grads(fn, inps):
 
 
 class TestAOTAutograd(AOTTestCase):
-    def verify_aot_autograd(self, f, inp):
+    # test_mutation will:
+    # - Ensure that inputs are non-leaves, so our graphs can mutate them
+    # - try to mutate outputs of the graph (to ensure that autograd meta is set properly on outputs)
+    def verify_aot_autograd(
+        self,
+        f,
+        inp: Union[Callable, List[Any]],
+        *,
+        test_mutation: bool = False,
+        return_fw_graph: bool = False,
+    ):
+        # Some tests pass in a callable for inp, to generate the inputs
+        # (useful if we want to generate complicated aliasing inputs)
+        if isinstance(inp, Callable):
+            inp_callable = inp
+            # The callable should return a tuple of f_inputs, f_graph_inputs
+            # (The idea is that we might want to compile a function with the graph inputs,
+            # but test autograd backprop all the way through the actual inputs)
+            inp_copy, graph_inps_copy = inp_callable()
+            inp, graph_inps = inp_callable()
+        else:
+            inp_copy = []
+            # Our input clones need to mimic when inputs are duplicates of one another
+            dupes_map = {}
+            for i, x in enumerate(inp):
+                if x in dupes_map:
+                    x_dupe_idx = dupes_map[x]
+                    inp_copy.append(inp_copy[x_dupe_idx])
+                else:
+                    dupes_map[x] = i
+                    x_copy = x.clone().detach().requires_grad_(x.requires_grad)
+                    if x.requires_grad and not x.is_leaf:
+                        x_copy = x_copy.clone()
+                    inp_copy.append(x_copy)
+
+            if test_mutation:
+                # For graphs where we mutate inputs, need our test to make sure inputs aren't leaves
+                graph_inps = [x.add(1) for x in inp]
+                graph_inps_copy = [x.add(1) for x in inp_copy]
+            else:
+                graph_inps = inp
+                graph_inps_copy = inp_copy
+
+        # Create a copy of inputs, so we can test input mutation correctness.
+
+        fw_graph_cell = [None]
         if isinstance(f, nn.Module):
-            compiled_f = aot_module(f, nop)
+            compiled_f = aot_module(f, fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell), bw_compiler=nop)
         else:
-            compiled_f = aot_function(f, nop)
-        ref_out, ref_grad = _outs_and_grads(f, inp)
-        test_out, test_grad = _outs_and_grads(compiled_f, inp)
-        self.assertEqual(ref_out, test_out)
+            compiled_f = aot_function(f, fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell), bw_compiler=nop)
+        ref_out, ref_grad = _outs_and_grads(f, graph_inps, inp)
+        test_out, test_grad = _outs_and_grads(compiled_f, graph_inps_copy, inp_copy)
         self.assertEqual(ref_grad, test_grad)
 
         if isinstance(ref_out, torch.Tensor):
@@ -259,6 +304,21 @@ def verify_aot_autograd(self, f, inp):
         for ref_o, test_o in zip(ref_out, test_out):
             if isinstance(ref_o, torch.Tensor):
                 self.assertEqual(ref_o.requires_grad, test_o.requires_grad)
+                self.assertEqual(ref_o.is_leaf, test_o.is_leaf)
+                self.assertEqual(ref_o._is_view(), test_o._is_view())
+                self.assertEqual(ref_o, test_o)
+                if test_mutation:
+                    # This tests that autograd meta is set properly on the output we can
+                    # mutate it.
+                    ref_o.mul_(2)
+                    test_o.mul_(2)
+                    self.assertEqual(ref_o, test_o)
+        for ref_i, test_i in zip(inp, inp_copy):
+            if isinstance(ref_i, torch.Tensor):
+                self.assertEqual(ref_i.requires_grad, test_i.requires_grad)
+                self.assertEqual(ref_i, test_i)
+        if return_fw_graph:
+            return fw_graph_cell[0]
 
     def test_single_output(self):
         def f(a, b):
@@ -278,6 +338,515 @@ def f(a, b):
         inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3)]
         self.verify_aot_autograd(f, inp)
 
+    def test_input_mutation_simple(self):
+        def f(a):
+            a.mul_(2)
+            return a * 3
+        inp = [torch.ones(3, 3, requires_grad=True)]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        # Things to note:
+        # - the extra clone is because we need to pass the pre-mutated input to grad(),
+        #   but autograd operates above functionalization so we need to manually clone.
+        #   Hopefully backends can optimize this easily.
+        # - The extra return arg is because the compiled forward returns (mutated inputs + outputs)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    mul = torch.ops.aten.mul.Tensor(clone, 2);  clone = None
+    mul_1 = torch.ops.aten.mul.Tensor(mul, 3)
+    return [mul, mul_1]""")
+
+    def test_input_mutation_is_output(self):
+        def f(a):
+            a.mul_(2)
+            return a
+        inp = [torch.ones(3, 3, requires_grad=True)]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    mul = torch.ops.aten.mul.Tensor(clone, 2);  clone = None
+    return [mul]""")
+
+    def test_input_mutation_multiple(self):
+        def f(a, b, c):
+            a.mul_(2)
+            c.mul_(2)
+            return a + b + c
+
+        inp = [
+            torch.ones(3, 3, requires_grad=True),
+            torch.ones(3, 3, requires_grad=True),
+            torch.ones(3, 3, requires_grad=True),
+        ]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2, primals_3):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    clone_1 = torch.ops.aten.clone.default(primals_3);  primals_3 = None
+    mul = torch.ops.aten.mul.Tensor(clone, 2);  clone = None
+    mul_1 = torch.ops.aten.mul.Tensor(clone_1, 2);  clone_1 = None
+    add = torch.ops.aten.add.Tensor(mul, primals_2);  primals_2 = None
+    add_1 = torch.ops.aten.add.Tensor(add, mul_1);  add = None
+    return [mul, mul_1, add_1]""")
+
+    def test_input_mutation_metadata(self):
+        def f(a, b):
+            a.transpose_(1, 0)
+            return a + b
+        inp = [
+            torch.ones(3, 3, requires_grad=True),
+            torch.ones(3, 3, requires_grad=True),
+        ]
+
+        self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+
+    def test_input_mutation_metadata2(self):
+        def f(a):
+            a.transpose_(1, 0)
+            a.mul_(2)
+            return a + 1
+        inp = [torch.ones(3, 3, requires_grad=True)]
+
+        self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+
+    def test_input_mutation_resize_smaller(self):
+        def f(a, b):
+            a.resize_(2, 2)
+            return a + b
+        # tenors that require gradients cannot be resized, so only test requires_grad=False case
+        inp = [
+            torch.ones(3, 3),
+            torch.ones(2, 2, requires_grad=True),
+        ]
+
+        self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+
+    def test_input_output_view_simple(self):
+        def f(a):
+            return a.view(-1)
+        inp = [
+            torch.ones(2, 2, requires_grad=True).add(1),
+        ]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        # Outputs that alias inputs are pulled out of the graph entirely, so we don't compile anything here
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1):
+    return [4, 1, 0]""")
+
+    def test_input_output_view_mutate_multiple(self):
+        def f(a, b, c):
+            a.mul_(2)
+            c.mul_(3)
+            return b.view(2, 2), c.view(2, 2)
+        inp = [
+            torch.ones(2, 2, requires_grad=True).add(1),
+            torch.ones(2, 2, requires_grad=True).add(1),
+            torch.ones(2, 2, requires_grad=True).add(1),
+        ]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        # The original function returned two outputs, both of which aliased inputs.
+        # We expect two outputs in the functional graph, a_updated and c_updated.
+        # The actual aliased outputs themselves aren't in the compiled forward graph;
+        # Instead, they're generated outside of  the graph.
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2, primals_3):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    clone_1 = torch.ops.aten.clone.default(primals_3);  primals_3 = None
+    mul = torch.ops.aten.mul.Tensor(clone, 2);  clone = None
+    mul_1 = torch.ops.aten.mul.Tensor(clone_1, 3);  clone_1 = None
+    return [mul, mul_1, 2, 2, 2, 1, 0, 2, 2, 2, 1, 0]""")
+
+    def test_input_output_view_metadata_mutate_multiple(self):
+        def f(a, b, c):
+            b.mul_(3)
+            c.t_()
+            return a.view(2, 2), b.view(2, 2), c.view(2, 2)
+        inp = [
+            torch.ones(2, 2, requires_grad=True).add(1),
+            torch.ones(2, 2, requires_grad=True).add(1),
+            torch.ones(2, 2, requires_grad=True).add(1),
+        ]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        # Important thing to check here: of the three inputs:
+        # Only the b.mul_(3) should show up in the graph (we functionalize it and return it).
+        # Everything else that does not show up in the graph includes:
+        # - The metadata mutation on c (we do it outside the graph)
+        # - All 3 original fw outputs, which are aliases of inputs (we regenerate them outside of the graph)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2, primals_3):
+    clone = torch.ops.aten.clone.default(primals_2);  primals_2 = None
+    mul = torch.ops.aten.mul.Tensor(clone, 3);  clone = None
+    return [mul, 2, 2, 1, 2, 0, 2, 2, 2, 1, 0, 2, 2, 2, 1, 0, 2, 2, 1, 2, 0]""")
+
+    def test_input_mutation_and_output_view(self):
+        def f(a):
+            a.add_(1)
+            return a.view(-1)
+        inp = [
+            torch.ones(2, 2, requires_grad=True).add(1),
+        ]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        # Here, total # of outputs is 1 because:
+        # - num_mutated_inps = 1 (a_updated)
+        # - num_fw_outputs = 0 (the output is an alias of the input, so we move it outside the compiled fw)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    add = torch.ops.aten.add.Tensor(clone, 1);  clone = None
+    return [add, 4, 1, 0]""")
+
+
+    def test_input_mutation_output_view_multiple(self):
+        def f(a, b, c, d):
+            b.transpose_(1, 0)
+            c.add_(1)
+            return d + 1, b.diagonal(), a + c
+        inp = [
+            torch.arange(4, requires_grad=True, dtype=torch.float32).view(2, 2).add(1),
+            torch.arange(4, requires_grad=True, dtype=torch.float32).view(2, 2).add(1),
+            torch.ones(2, 2, requires_grad=True).add(1),
+            torch.ones(2, 2, requires_grad=True).add(1),
+        ]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2, primals_3, primals_4):
+    clone = torch.ops.aten.clone.default(primals_3);  primals_3 = None
+    add = torch.ops.aten.add.Tensor(clone, 1);  clone = None
+    add_1 = torch.ops.aten.add.Tensor(primals_4, 1);  primals_4 = None
+    add_2 = torch.ops.aten.add.Tensor(primals_1, add);  primals_1 = None
+    return [add, add_1, add_2, 2, 2, 1, 2, 0, 2, 3, 0]""")
+
+    def test_input_data_and_metadata_mutation(self):
+        def f(a):
+            a.t_()
+            a[0].mul_(2)
+            return a.view(a.shape)
+        inp = [torch.ones(3, 3, requires_grad=True)]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    t = torch.ops.aten.t.default(clone)
+    select = torch.ops.aten.select.int(t, 0, 0);  t = None
+    mul = torch.ops.aten.mul.Tensor(select, 2);  select = None
+    t_1 = torch.ops.aten.t.default(clone);  clone = None
+    select_scatter = torch.ops.aten.select_scatter.default(t_1, mul, 0, 0);  t_1 = mul = None
+    t_2 = torch.ops.aten.t.default(select_scatter);  select_scatter = None
+    t_3 = torch.ops.aten.t.default(t_2);  t_2 = None
+    return [t_3, 3, 3, 1, 3, 0]""")
+
+    def test_view_and_inplace_view(self):
+        def f(a, b):
+            a.t_()
+            return b.view(b.shape), a.view(a.shape)
+        inp = [
+            torch.ones(3, 3, requires_grad=True),
+            torch.ones(3, 3, requires_grad=True)
+        ]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2):
+    return [3, 3, 1, 3, 0, 3, 3, 3, 1, 0, 3, 3, 1, 3, 0]""")
+
+    def test_view_detach(self):
+        def f(a):
+            tmp = a.detach()
+            a.mul_(2)
+            return a, tmp
+        inp = [
+            torch.ones(3, 3, requires_grad=True),
+        ]
+
+        self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+
+    def test_input_inplace_requires_grad_true(self):
+        def f(a, b):
+            a.requires_grad_(True)
+            return a.mul(3), b.mul(4)
+        inp = [
+            # First inp doesnt require grad, but we switch it on
+            torch.ones(3, 3, requires_grad=False),
+            torch.ones(3, 3, requires_grad=True),
+        ]
+
+        fw_graph = self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2):
+    mul = torch.ops.aten.mul.Tensor(primals_1, 3);  primals_1 = None
+    mul_1 = torch.ops.aten.mul.Tensor(primals_2, 4);  primals_2 = None
+    return [mul, mul_1]""")
+
+    def test_input_data_and_metadata_mutation_aliases_other_input(self):
+        # a and b are aliased
+        def f(a, b):
+            a.t_()
+            b.mul_(2)
+            return a.mul(3)
+
+        def inp_callable():
+            base = torch.ones(2, 2, requires_grad=True)
+            # Note: in our test, the add() is important because we need the graph inputs to be non-leaves so we can mutate them.
+            x = base.add(1)
+            inp1 = x.view(-1)
+            inp2 = x.view(-1)
+            return [base], [inp1, inp2]
+
+        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True, return_fw_graph=True)
+        # Important parts of the graph:
+        # - the compiled graph takes in a base, and we generate a and b (the views) off of the base
+        # - clone() is still in the graph, because we need to call grad() on the original (non-mutated) inputs
+        # - We re-generate the views *after* the clone, to preserve view relationships.
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    as_strided_1 = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
+    mul = torch.ops.aten.mul.Tensor(as_strided_1, 2);  as_strided_1 = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0);  clone = None
+    as_strided_5 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0);  as_strided_scatter = None
+    t_1 = torch.ops.aten.t.default(as_strided_5);  as_strided_5 = None
+    mul_1 = torch.ops.aten.mul.Tensor(t_1, 3);  t_1 = None
+    return [mul, mul_1, 4, 1, 0]""")
+
+    def test_input_mutation_aliases_other_input(self):
+        def f(a, b):
+            a.add_(1)
+            return a + b
+
+        def inp_callable():
+            base = torch.ones(2, 2, requires_grad=True)
+            # Note: in our test, the add() is important because we need the graph inputs to be non-leaves so we can mutate them.
+            x = base.add(1)
+            inp1 = x[0]
+            inp2 = x[1]
+            return [base], [inp1, inp2]
+
+        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True, return_fw_graph=True)
+        # Important parts of the graph:
+        # - the compiled graph takes in a base, and we generate a and b (the views) off of the base
+        # - clone() is still in the graph, because we need to call grad() on the original (non-mutated) inputs
+        # - We re-generate the views *after* the clone, to preserve view relationships.
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    as_strided = torch.ops.aten.as_strided.default(clone, [2], [1], 0)
+    add = torch.ops.aten.add.Tensor(as_strided, 1);  as_strided = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [2], [1], 0);  clone = None
+    as_strided_4 = torch.ops.aten.as_strided.default(as_strided_scatter, [2], [1], 2);  as_strided_scatter = None
+    add_1 = torch.ops.aten.add.Tensor(add, as_strided_4);  as_strided_4 = None
+    return [add, add_1]""")
+
+    def test_input_mutation_aliases_other_input2(self):
+        def f(a, b):
+            a.add_(1)
+            return a + b
+
+        def inp_callable():
+            base = torch.ones(2, 2, requires_grad=True)
+            x = base.add(1)
+            inp1 = x[0]
+            # Here, one of the aliased inputs is the base itself
+            inp2 = x
+            return [base], [inp1, inp2]
+
+        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    as_strided = torch.ops.aten.as_strided.default(clone, [2], [1], 0)
+    add = torch.ops.aten.add.Tensor(as_strided, 1);  as_strided = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [2], [1], 0);  clone = None
+    as_strided_4 = torch.ops.aten.as_strided.default(as_strided_scatter, [2, 2], [2, 1], 0);  as_strided_scatter = None
+    add_1 = torch.ops.aten.add.Tensor(add, as_strided_4);  as_strided_4 = None
+    return [add, add_1]""")
+
+    def test_input_mutation_aliases_and_output_alias(self):
+        def f(a, b):
+            # Here, we need to take care:that because and b are aliased
+            # (1) since a and b are aliased, we generate a view off of "updated b"
+            # (2) We're returning a view, which doesn't show up in the graph
+            a.add_(1)
+            return b.view(b.shape)
+
+        def inp_callable():
+            base = torch.ones(2, 2, requires_grad=True)
+            x = base.add(1)
+            return [base], [x.view(-1), x.view(-1)]
+
+        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    as_strided = torch.ops.aten.as_strided.default(clone, [4], [1], 0);  clone = None
+    add = torch.ops.aten.add.Tensor(as_strided, 1);  as_strided = None
+    return [add, 4, 1, 0]""")
+
+    def test_input_aliased_with_mutation_output_alias(self):
+        def f(a, b, c):
+            # a and c alias
+            c.mul_(2)
+            # The main thing we're testing here is that
+            # (1) We need to reconstruct c.view(-1) from the 3rd input to the forward
+            # (2) But we need to be careful to do this *before* converting aliased inputs into synthetic bases.
+            #     The original fw takes in 3 args, but the compiled fw takes in only 2 args.
+            return b.add(1), c.view(-1)
+
+        def inp_callable():
+            base1 = torch.ones(2, 2, requires_grad=True)
+            base2 = torch.ones(2, 2, requires_grad=True)
+            x = base1.add(1)
+            y = base2.add(1)
+            return [base1, base2], [x.view(-1), y, x.view(-1)]
+
+        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    as_strided_1 = torch.ops.aten.as_strided.default(clone, [4], [1], 0);  clone = None
+    mul = torch.ops.aten.mul.Tensor(as_strided_1, 2);  as_strided_1 = None
+    add = torch.ops.aten.add.Tensor(primals_2, 1);  primals_2 = None
+    return [mul, add, 4, 1, 0]""")
+
+    def test_input_metadata_mutation_aliases(self):
+        def f(a, b):
+            # a and b alias, and we do a metadata mutation on a
+            # Since we're not mutating data, then b isn't affected at all.
+            # We expect aot autograd to not bother with constructing a synthetic base.
+            a.t_()
+            return a + b
+
+        def inp_callable():
+            base = torch.ones(2, 2, requires_grad=True)
+            x = base.add(1)
+            return [base], [x.view(-1), x.view(-1)]
+
+        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True, return_fw_graph=True)
+        # Expectation: fwd() takes in 2 args, and we don't construct a synthetic base.
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2):
+    view = torch.ops.aten.view.default(primals_1, [4]);  primals_1 = None
+    t = torch.ops.aten.t.default(view);  view = None
+    add = torch.ops.aten.add.Tensor(t, primals_2);  t = primals_2 = None
+    return [add, 4, 1, 0]""")
+
+    def test_input_mutation_aliases_and_none_require_gradients(self):
+        def f(a, b, c):
+            # a and b alias, but neither require gradients (so they don't have a _base)
+            # aot autograd should construct the synthetic base from `torch.Tensor(a.storage())`
+            a.mul_(2)
+            return b + 1, c + 1
+
+        def inp_callable():
+            base = torch.ones(2, 2)
+            c_arg = torch.ones(2, 2, requires_grad=True)
+            x = base.add(1)
+            return [base, c_arg], [x.view(-1), x.view(-1), c_arg]
+
+        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True, return_fw_graph=True)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    as_strided = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
+    mul = torch.ops.aten.mul.Tensor(as_strided, 2);  as_strided = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0);  clone = None
+    as_strided_2 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0);  as_strided_scatter = None
+    add = torch.ops.aten.add.Tensor(as_strided_2, 1);  as_strided_2 = None
+    add_1 = torch.ops.aten.add.Tensor(primals_2, 1);  primals_2 = None
+    return [mul, add, add_1]""")
+
+    def test_input_mutation_aliases_bases_out_of_order(self):
+        # This tests our calling convention: if b and d are aliased, then the outer calling convention
+        # that we send to the compiled forward becomes:
+        # (b_d_base, a, c)
+        # Importantly, even though a and c alias in our test, neither inputs are mutated,
+        # So we don't need to do the base construction / deconstruction
+        def f(a, b, c, d):
+            b.add_(1)
+            d.t_()
+            return a + c + d, b.view(-1)
+
+        def inp_callable():
+            base1 = torch.ones(2, 2, requires_grad=True)
+            base2 = torch.ones(2, 2, requires_grad=True)
+            x1 = base1.add(1)
+            x2 = base2.add(1)
+            # a and c alias, b and d alias
+            return [base1, base2], [x1.view(-1), x2.view(-1), x1.view(-1), x2.view(-1)]
+
+        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True, return_fw_graph=True)
+        # 3 graph inputs: (b_d_base, a, c)
+        # 2 returns: (b_updated, a+c+d)
+        # (there are 2 original fw outs, but one is a view of b so it's not part of the graph)
+        # (there are also 2 input mutations, but one is a metadata-only mutation so the compiled forward doesn't return it)
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2, primals_3):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    as_strided = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
+    add = torch.ops.aten.add.Tensor(as_strided, 1);  as_strided = None
+    add_1 = torch.ops.aten.add.Tensor(primals_2, primals_3);  primals_2 = primals_3 = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, add, [4], [1], 0);  clone = None
+    as_strided_4 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0);  as_strided_scatter = None
+    t_1 = torch.ops.aten.t.default(as_strided_4);  as_strided_4 = None
+    add_2 = torch.ops.aten.add.Tensor(add_1, t_1);  add_1 = t_1 = None
+    return [add, add_2, 4, 1, 0, 4, 1, 0]""")
+
+    # Mondo test that tests a combination of:
+    # input is mutated, that aliases another input (so we make a synthetic base)
+    # an output is an alias of another output
+    # an output is an alias of an intermediate
+    def test_input_mutation_alias_everything(self):
+        # a and c are aliased
+        def f(a, b, c):
+            c.mul_(2)  # mutates c
+            b.t_()  # metadata mutate b
+            tmp = a + c
+            # TODO: this test doesn't test "alias of an intermediate" yet,
+            # delete this line later and get that to be tested
+            return tmp, b.t(), a
+            out1 = tmp.view(-1)
+            out2 = b.t()
+            out3 = out1.unsqueeze(0)
+            # out1 and out3 are aliases of an intermediate, and alias each other!
+            # out2 aliases an input, so we don't return it
+            return out1, out2, out3
+
+        def inp_callable():
+            base1 = torch.ones(2, 2, requires_grad=True)
+            base2 = torch.ones(2, 2, requires_grad=True)
+            # Note: in our test, the add() is important because we need the graph inputs to be non-leaves so we can mutate them.
+            base1_ = base1.add(1)
+            base2_ = base2.add(1)
+            a = base1_.view(-1)
+            b = base2_
+            c = base1_.view(-1)
+            return [base1, base2], [a, b, c]
+
+        fw_graph = self.verify_aot_autograd(f, inp_callable, test_mutation=True, return_fw_graph=True)
+        # Expected:
+        # - 2 inputs in the forward: synthetic_base_a_c, b
+        # - 1 output in the forward: "tmp"
+        #   out2 is an alias of an input, and will be generated off of b outside of the compiled fn
+        #   out1 and out3 are aliases of tmp, that we generate outside of the compiled function
+        self.assertExpectedInline(fw_graph.code.strip(), """\
+def forward(self, primals_1, primals_2):
+    clone = torch.ops.aten.clone.default(primals_1);  primals_1 = None
+    as_strided_1 = torch.ops.aten.as_strided.default(clone, [4], [1], 0)
+    mul = torch.ops.aten.mul.Tensor(as_strided_1, 2);  as_strided_1 = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(clone, mul, [4], [1], 0);  clone = None
+    as_strided_4 = torch.ops.aten.as_strided.default(as_strided_scatter, [4], [1], 0);  as_strided_scatter = None
+    add = torch.ops.aten.add.Tensor(as_strided_4, mul);  as_strided_4 = None
+    return [mul, add, 2, 2, 1, 2, 0, 2, 2, 2, 1, 0]""")
+
     def test_no_grad_input_output(self):
         def f(a, b):
             return a.cos(), b.cos(), a * b
@@ -287,12 +856,26 @@ def f(a, b):
             inps = [i() for i in inps]
             self.verify_aot_autograd(f, inps)
 
-    def test_some_outputs_dont_require_grad(self):
+    def test_some_output_requires_grad_input_doesnt(self):
+        def f(a, b):
+            a_view = a.view(-1)
+            a_view.requires_grad_(True)
+            return a_view
+        inp = [torch.randn(3, 3), torch.randn(3, 3, requires_grad=True)]
+        self.verify_aot_autograd(f, inp)
+
+    def test_some_outputs_dont_require_grad_view(self):
         def f(a, b):
             return a.detach(), b
         inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3, requires_grad=True)]
         self.verify_aot_autograd(f, inp)
 
+    def test_some_outputs_dont_require_grad_non_view(self):
+        def f(a, b):
+            return a.add(1).detach(), b
+        inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3, requires_grad=True)]
+        self.verify_aot_autograd(f, inp)
+
     def test_inner_grad(self):
         def foo(x):
             y = torch.exp(x)
@@ -340,8 +923,12 @@ def f(x):
             for k in x:
                 new_d[k] = x[k] * 2
             return new_d
-        inp = [{'a': torch.randn(3, requires_grad=True), 'b': torch.randn(3, requires_grad=True)}]
-        self.verify_aot_autograd(f, inp)
+
+        def inp_callable():
+            inps = [{'a': torch.randn(3, requires_grad=True), 'b': torch.randn(3, requires_grad=True)}]
+            return inps, inps
+
+        self.verify_aot_autograd(f, inp_callable)
 
     def test_module(self):
         mod = nn.Sequential(nn.Linear(32, 32), nn.ReLU())
@@ -686,15 +1273,20 @@ def f(a, b, c, d):
         (compiled_outs[0].sum() + compiled_outs[2].sum()).backward()
         bw_graph = bw_graph_cell[0]
 
-        self.assertEqual(get_num_ins_outs(fw_graph), (4, 13))
-        self.assertEqual(get_num_ins_outs(bw_graph), (13, 4))
+        # 12 outs because:
+        # - 5 original outputs -> 4 graph outputs (the 3rd output is an input alias, gets moved outside)
+        # - 8 saved outputs for backward: 5 tensors, 3 symints
+        self.assertEqual(get_num_ins_outs(fw_graph), (4, 12))
+        self.assertEqual(get_num_ins_outs(bw_graph), (12, 4))
         _, fw_graph_out_nodes = get_ins_outs(fw_graph)
         self.assertEqual(
             # fw outputs include b.size() which expands to 2 symints,
             #
             # TODO(whc)- are the saved-tensors/saved-symints correct here?
             # i just made the test pass based on what default partition did
-            [False, True, True, False, False] + [False] * 4 + [True] * 4,
+            # Of the 5 original forward outputs, the 4th (c) is an input,
+            # which won't show up in the compiled forward graph
+            [False, True, True, False] + [False] * 4 + [True] * 4,
             [is_sym_node(n) for n in fw_graph_out_nodes]
         )
 
@@ -742,14 +1334,14 @@ def f(a, b, c, d):
         (compiled_outs[0].sum() + compiled_outs[2].sum()).backward()
         bw_graph = bw_graph_cell[0]
 
-        self.assertEqual(get_num_ins_outs(fw_graph), (4, 13))
-        self.assertEqual(get_num_ins_outs(bw_graph), (13, 4))
+        self.assertEqual(get_num_ins_outs(fw_graph), (4, 12))
+        self.assertEqual(get_num_ins_outs(bw_graph), (12, 4))
         _, fw_graph_out_nodes = get_ins_outs(fw_graph)
         self.assertEqual(
             # fw outputs include b.size() which expands to 2 symints,
             # then 4 tensors (transposes of matricies used for mm) are saved
             # finally 4 symints are saved
-            [False, True, True, False, False] + [False] * 4 + [True] * 4,
+            [False, True, True, False] + [False] * 4 + [True] * 4,
             [is_sym_node(n) for n in fw_graph_out_nodes]
         )
 
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 89b94cec0ae1f..2287316d788e4 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1497,8 +1497,11 @@ def test_conv2d_unary(self):
             ).eval()
 
             # TODO: add bf16 test for cpu path?
-            v = torch.randn(x_shape, dtype=torch.float32).to(
-                memory_format=memory_format
+            # TODO: this test fails when requires_grad=False
+            v = (
+                torch.randn(x_shape, dtype=torch.float32, requires_grad=True)
+                .add(1)
+                .to(memory_format=memory_format)
             )
             with torch.no_grad():
                 self.common(
@@ -3241,7 +3244,9 @@ def fn(a):
             c = a + 2
             return b, c
 
-        arg1 = torch.randn([1, 64], device=self.device)
+        # NOTE: this test fails when none of the inputs require grad.
+        # That seems like an inductor bug.
+        arg1 = torch.randn([1, 64], device=self.device).requires_grad_(True).add(1)
         arg2 = arg1.clone()
         correct1 = fn(arg1)
         opt_fn = torch._dynamo.optimize_assert(compile_fx)(fn)
@@ -4445,7 +4450,10 @@ def forward(arg38_1, arg81_1, getitem_17, new_zeros_default_4):
             ((1, 88, 40, 40), (140800, 1600, 40, 1), torch.float32),
             ((3,), (1,), torch.float32),
         ]
-        args = [rand_strided(shape, stride, dtype) for shape, stride, dtype in args]
+        args = [
+            rand_strided(shape, stride, dtype).requires_grad_(True).add(1)
+            for shape, stride, dtype in args
+        ]
         self.common(forward, args)
 
     def test_misaligned_address_issue1(self):
@@ -5249,6 +5257,23 @@ def forward(self, x):
                     for param in model_opt.parameters():
                         param.add_(1.0)
 
+        # https://github.com/pytorch/torchdynamo/issues/1850
+        def test_inductor_output_aliases_intermediate(self):
+            def foo(x):
+                out = x + x
+                return out.t()
+
+            foo_opt = torch._dynamo.optimize("inductor")(foo)
+
+            inpt = torch.randn(10, 10, device="cuda", requires_grad=True)
+            # TODO: this is broken, fix later
+            # out = foo_opt(inpt)
+            # out.add_(2)
+
+            out_ref = foo(inpt)
+            out_ref.add_(2)
+            # self.assertEqual(out_ref, out)
+
         def test_accuracy_issue1(self):
             class Repro(torch.nn.Module):
                 def __init__(self):
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 8d2ac24afb7e2..2a791075706e8 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -246,7 +246,7 @@ def process(device_type):
     "unique_consecutive": {b8, f32, f64, i32, i64},
     "var": {f16},
     "var_mean": {f16},
-    "view_as_complex": {f16, f32, f64},
+    "view_as_complex": {f16},
 }
 
 
@@ -341,7 +341,6 @@ def process(device_type):
     "uniform": {f16, f32, f64},
     "unique": {b8, f16, f32, f64, i32, i64},
     "unique_consecutive": {b8, f16, f32, f64, i32, i64},
-    "view_as_complex": {f16, f32, f64},
     # AssertionError: Tensor-likes are not close!
     "nn.functional.triplet_margin_loss": {f16},
 }
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index a56a74ad5aeae..06b66e282a9ec 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -78,8 +78,10 @@ def raise_or_warn(reason):
             raise e
         pass
 
-    if mutated:
-        return raise_or_warn("mutation")
+    # TODO: delete the logic for this later.
+    # Now that aot autograd supports aliasing and mutation, we don't need it.
+    # if mutated:
+    # return raise_or_warn("mutation")
 
     return True
 
@@ -289,7 +291,7 @@ def prims_executor(gm, inputs, *, executor):
     return make_boxed_func(partial(execute, gm, executor=executor))
 
 
-def nvprims_fw_bw_partition_fn(joint_module, joint_inputs):
+def nvprims_fw_bw_partition_fn(joint_module, joint_inputs, *, num_fwd_outputs):
     # This function is called once per forward+backward pass of a graph in AOT
     # Autograd. We use it to set up the nvFuser-specific FX graph that is later
     # passed to the executor.
@@ -317,7 +319,10 @@ def func(primals, tangents):
     }
 
     fw_gm, bw_gm = min_cut_rematerialization_partition(
-        prim_gm, joint_inputs, recomputable_ops=recomputable_ops
+        prim_gm,
+        joint_inputs,
+        recomputable_ops=recomputable_ops,
+        num_fwd_outputs=num_fwd_outputs,
     )
     # AOT Autograd might not use the partitioner, so we need to make sure that
     # the graph is marked as already transformed to use nvFuser-compatible nodes
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 4c4681b75622b..f69550cb34f2c 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -382,6 +382,31 @@ def get_state_from_generator():
                 **options,
             )
         else:
+            any_symints_or_symfloats = any(
+                [isinstance(x, DynamicShapeVariable) for x in args]
+            )
+            all_ints_or_floats = all(
+                [
+                    isinstance(
+                        x, (variables.ConstantVariable, variables.DynamicShapeVariable)
+                    )
+                    for x in args
+                ]
+            )
+            bin_ops = set(["add", "sub", "mul", "div", "sqrt"])
+            if (
+                self.value.__module__ == "torch"
+                and self.value.__name__ in bin_ops
+                and any_symints_or_symfloats
+                and all_ints_or_floats
+            ):
+                msg = f"""\
+Calling {str(self.value)} on only torch.SymInt arguments is not yet supported.
+To support this behavior, we need to allow const-propping tensors that store symint data.
+For now, dynamo will explicitly graph break when it encounters user code with this behavior.
+"""
+                log.warning(msg)
+                raise unimplemented(msg)
             # Handle sth like torch.LongTensor(list(np.int64, np.int64, ...)),
             # as FX symbolic trace doesn't support numpy int/float as base types.
             if (
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 7a5791de8a388..5aef1c548e2df 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -271,7 +271,15 @@ def output(self, target, args, kwargs):
         assert isinstance(result, (tuple, list)), type(result)
         assert all(
             isinstance(
-                x, (TensorBox, ir.Constant, type(None), ir.ConstantBuffer, sympy.Expr)
+                x,
+                (
+                    TensorBox,
+                    ir.Constant,
+                    type(None),
+                    ir.ConstantBuffer,
+                    sympy.Expr,
+                    int,
+                ),
             )
             for x in result
         ), result
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 8327fe0d7b521..f612000edf4ce 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2439,7 +2439,7 @@ def convert_to_reinterpret_view(cls, x):
     def realize_input(cls, x):
         if x is None:
             return NoneAsConstantBuffer()
-        if isinstance(x, sympy.Expr):
+        if isinstance(x, (sympy.Expr, int)):
             return ShapeAsConstantBuffer(x)
         if isinstance(x, Constant):
             return V.graph.add_tensor_constant(
diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp
index 93bb37017ce0b..00674cf81229e 100644
--- a/torch/csrc/DynamicTypes.cpp
+++ b/torch/csrc/DynamicTypes.cpp
@@ -13,6 +13,7 @@
 #include <torch/csrc/utils/object_ptr.h>
 
 #include <ATen/ATen.h>
+#include <ATen/FunctionalStorageImpl.h>
 
 #include <array>
 #include <memory>
@@ -77,7 +78,11 @@ THPLayout* getTHPLayout(at::Layout layout) {
 
 PyObject* createPyObject(const at::Storage& storage) {
   if (storage.device_type() != at::DeviceType::Meta &&
-      storage.data() == nullptr && storage.nbytes() != 0) {
+      storage.data() == nullptr && storage.sym_nbytes() != 0 &&
+      // Grabbing storage() from FunctionalTensorWrapper is allowed.
+      // This is useful for checking aliasing info from python
+      dynamic_cast<at::functionalization::FunctionalStorageImpl*>(
+          storage.unsafeGetStorageImpl()) == nullptr) {
     TORCH_CHECK_NOT_IMPLEMENTED(
         false,
         "python bindings to nullptr storage (e.g., from torch.Tensor._make_wrapper_subclass) are currently unsafe and thus disabled.  See https://github.com/pytorch/pytorch/issues/61669 for more details");
diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp
index bd969f6a26fb2..0a9a71a01a6c6 100644
--- a/torch/csrc/autograd/python_torch_functions_manual.cpp
+++ b/torch/csrc/autograd/python_torch_functions_manual.cpp
@@ -405,12 +405,27 @@ static PyObject* THPVariable__to_functional_tensor(
     PyObject* kwargs) {
   HANDLE_TH_ERRORS
   static PythonArgParser parser(
-      {"_to_functional_tensor(Tensor t)"}, /*traceable=*/true);
+      {"_to_functional_tensor(Tensor t, *, bool mirror_autograd_meta=False)"},
+      /*traceable=*/true);
 
-  ParsedArgs<1> parsed_args;
+  ParsedArgs<2> parsed_args;
   auto r = parser.parse(args, kwargs, parsed_args);
   auto self_ = r.tensor(0);
+  auto mirror_autograd_meta = r.toBool(1);
   auto wrapped = at::functionalization::impl::to_functional_tensor(self_);
+  if (mirror_autograd_meta) {
+    // Here, we unsafely set the grad function on the wrapper to be the same as
+    // the inner. We expect this grad_fn to NEVER be used. It's needed so that
+    // .is_leaf metadata is accurate on the wrapper
+    auto inner_autograd_meta = impl::get_autograd_meta(self_);
+    if (inner_autograd_meta) {
+      wrapped.set_requires_grad(self_.requires_grad());
+      if (wrapped.requires_grad()) {
+        impl::get_autograd_meta(wrapped)->grad_fn_ =
+            inner_autograd_meta->grad_fn_;
+      }
+    }
+  }
   return wrap(wrapped);
   END_HANDLE_TH_ERRORS
 }

From b26e00a03854e33e14dc7580587166b14bc50b4d Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Wed, 23 Nov 2022 19:36:01 +0000
Subject: [PATCH 1216/1922] [Checkpoint] Add a logger to dedup_tensors (#89503)

Add a logger to dedup_tensors to log the duplicate keys to remove in global plan (List of SavePlan).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89503
Approved by: https://github.com/fduwjj
---
 torch/distributed/checkpoint/dedup_tensors.py | 24 ++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/torch/distributed/checkpoint/dedup_tensors.py b/torch/distributed/checkpoint/dedup_tensors.py
index 4b60e49d31053..53938c3e0e159 100644
--- a/torch/distributed/checkpoint/dedup_tensors.py
+++ b/torch/distributed/checkpoint/dedup_tensors.py
@@ -1,13 +1,30 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
-
-from typing import Dict, List
 import dataclasses
+import logging
+from typing import Dict, List
 
 from torch.distributed.checkpoint.metadata import MetadataIndex
 from torch.distributed.checkpoint.planner import SavePlan
 
 __all__ = ["dedup_tensors"]
 
+
+def init_logger() -> logging.Logger:
+    logger = logging.getLogger(__name__)
+    level = logging.INFO
+    logger.setLevel(level)
+    console = logging.StreamHandler()
+    formatter = logging.Formatter(
+        "%(asctime)s %(filename)s:%(lineno)s %(levelname)s p:%(processName)s t:%(threadName)s: %(message)s"
+    )
+    console.setFormatter(formatter)
+    console.setLevel(level)
+    logger.addHandler(console)
+    logger.propagate = False
+    return logger
+
+logger = init_logger()
+
 # TODO add docstring for dedup_tensors
 def dedup_tensors(all_plans: List[SavePlan]) -> List[SavePlan]:
     all_plans = list(all_plans)
@@ -18,12 +35,13 @@ def dedup_tensors(all_plans: List[SavePlan]) -> List[SavePlan]:
 
     replicated_items = {k: v for k, v in key_to_plan.items() if len(v) > 1}
 
-    # Remove deplicates by always keeping the first entry.
+    # Remove duplicates by always keeping the first entry.
     # Compute the per-rank remove set.
     plan_to_keys: Dict[int, List[MetadataIndex]] = {}
     for key, plans in replicated_items.items():
         for plan_idx in plans[1:]:
             plan_to_keys.setdefault(plan_idx, []).append(key)
+    logger.info(f"Duplicate keys to remove: {plan_to_keys}")
 
     for plan_idx, keys in plan_to_keys.items():
         key_set = set(keys)

From 976edcd36b46f3427023f867419f8d854d89006a Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Wed, 23 Nov 2022 19:39:01 +0000
Subject: [PATCH 1217/1922] [Checkpoint][2D] Minor update for dedup_tensors.py
 (#89542)

Rename variables for better readability.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89542
Approved by: https://github.com/H-Huang
---
 torch/distributed/checkpoint/dedup_tensors.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/torch/distributed/checkpoint/dedup_tensors.py b/torch/distributed/checkpoint/dedup_tensors.py
index 53938c3e0e159..ea425a6ad9a44 100644
--- a/torch/distributed/checkpoint/dedup_tensors.py
+++ b/torch/distributed/checkpoint/dedup_tensors.py
@@ -30,8 +30,8 @@ def dedup_tensors(all_plans: List[SavePlan]) -> List[SavePlan]:
     all_plans = list(all_plans)
     key_to_plan: Dict[MetadataIndex, List[int]] = {}
     for plan_idx, plan in enumerate(all_plans):
-        for wi in plan.items:
-            key_to_plan.setdefault(wi.index, []).append(plan_idx)
+        for write_item in plan.items:
+            key_to_plan.setdefault(write_item.index, []).append(plan_idx)
 
     replicated_items = {k: v for k, v in key_to_plan.items() if len(v) > 1}
 
@@ -47,7 +47,9 @@ def dedup_tensors(all_plans: List[SavePlan]) -> List[SavePlan]:
         key_set = set(keys)
         # rewrite items and remove elements
         new_items = [
-            wi for wi in all_plans[plan_idx].items if wi.index not in key_set
+            write_item
+            for write_item in all_plans[plan_idx].items
+            if write_item.index not in key_set
         ]
         all_plans[plan_idx] = dataclasses.replace(
             all_plans[plan_idx], items=new_items

From 1b026084cab22d81ccd88ca3ba8166a4f430be4f Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 23 Nov 2022 19:39:43 +0000
Subject: [PATCH 1218/1922] [Dynamo] Several fixes on TensorVariable &
 TorchVariable (#89486)

This is a group of bug fixes for [7k github models](https://github.com/pytorch/torchdynamo/issues/1884), it would fix 30+ model tests.
* Support ```tensor.type()```.
* Support ```tensor.get_device()```.
* Support ```torch.nn.functional._Reduction.get_enum```.
* Support ```torch._utils._get_device_index()```.
* Fallback ```tensor.data_ptr()```.
  * ```FakeTensor``` always returns 0
  * For no fake tensor propagation, we ```clone``` the input tensor, which makes no sense to track the original ```data_ptr```. And I don't think this is a very popular API.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89486
Approved by: https://github.com/jansel
---
 test/dynamo/test_functions.py     |  5 ++++
 test/dynamo/test_misc.py          | 47 ++++++++++++++++++++++++++++++-
 torch/_dynamo/variables/tensor.py | 12 +++++++-
 torch/_dynamo/variables/torch.py  |  2 ++
 4 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py
index 685393bc6766c..7b510fef07547 100644
--- a/test/dynamo/test_functions.py
+++ b/test/dynamo/test_functions.py
@@ -324,6 +324,11 @@ def test_device(x):
         if not x.is_cuda:
             return x + 1
 
+    @make_test
+    def test_tensor_type(a, b):
+        m = a.to(torch.float16)
+        return b.type(m.type())
+
     @make_test
     def test_ndim(x):
         if x.ndim == 2 and x.ndimension() == 2 and x.dim() == 2:
diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 1a04f25e74043..109118c68b6f6 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -27,6 +27,7 @@
     same,
     unsupported,
 )
+from torch.nn import functional as F
 from torch.testing._internal.common_utils import freeze_rng_state
 from torch.testing._internal.jit_utils import JitTestCase
 
@@ -2071,6 +2072,23 @@ def test_cross_entropy_loss_simple_ctor(self):
 
         self.assertTrue(torch.allclose(dynamo_output, output))
 
+    def test_nn_functional_reduction(self):
+        def fn(loss, reduction):
+            reduction_enum = F._Reduction.get_enum(reduction)
+            if reduction_enum == 0:
+                return loss
+            elif reduction_enum == 1:
+                return loss.mean()
+            elif reduction_enum == 2:
+                return loss.sum()
+
+        x = torch.rand([3, 5])
+        y = "mean"
+        ref = fn(x, y)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(x, y)
+        self.assertTrue(torch.allclose(ref, res))
+
     def test_large_reduction_list(self):
         dtype = torch.float32
         device = "cpu"
@@ -2176,7 +2194,6 @@ def __init__(self):
                 self.names = []
 
             def forward(self, idx, targets=None):
-                from torch.nn import functional as F
 
                 b, t = idx.size()
                 assert (
@@ -3002,6 +3019,34 @@ def fn(x):
         res = opt_fn(x)
         self.assertTrue(same(ref, res))
 
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    def test_get_device(self):
+        def fn(x, y):
+            x = x + 1
+            y = y + 1
+            return x.get_device(), y.get_device()
+
+        x = torch.rand(4, device="cuda")
+        y = torch.rand(4, device="cpu")
+        ref = fn(x, y)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(x, y)
+        self.assertTrue(same(ref, res))
+
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    def test_get_device_index(self):
+        def fn(x):
+            x = x + 1
+            a = torch._utils._get_device_index(x.device)
+            b = torch._utils._get_device_index(1)
+            return a, b
+
+        x = torch.rand(4, device="cuda")
+        ref = fn(x)
+        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
+        res = opt_fn(x)
+        self.assertTrue(same(ref, res))
+
 
 class CustomFunc(torch.autograd.Function):
     @staticmethod
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 84de57c0f2955..4ef9bf2fab11d 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -225,6 +225,16 @@ def call_method(
             constant_result = ConstantVariable(
                 memory_format in self.is_contiguous, **options
             )
+        elif name == "type" and self.dtype is not None and len(args) == 0:
+            tensortype = [k for k, v in tensortype_to_dtype.items() if self.dtype in v][
+                0
+            ]
+            constant_result = ConstantVariable(
+                f"torch.{tensortype.__name__}", **options
+            )
+        elif name == "get_device" and isinstance(self.device, torch.device):
+            index = self.device.index if self.device.type != "cpu" else -1
+            constant_result = ConstantVariable(index, **options)
         else:
             constant_result = None
 
@@ -245,7 +255,7 @@ def call_method(
             and not config.dynamic_shapes
         ):
             unimplemented("dynamic Tensor.repeat")
-        elif name in ("tolist", "numpy", "backward"):
+        elif name in ("tolist", "numpy", "backward", "data_ptr"):
             unimplemented(f"Tensor.{name}")
         elif name == "nonzero" and not config.dynamic_shapes:
             unimplemented(f"Tensor.{name}")
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index f69550cb34f2c..6e60f2be4ce03 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -164,6 +164,8 @@ def can_constant_fold_through(self):
             torch.iinfo,
             torch.is_floating_point,
             torch.cuda.is_available,
+            torch.nn.functional._Reduction.get_enum,
+            torch._utils._get_device_index,
         ):
             return True
         return getattr(self.value, "__module__", None) == "math"

From e8a573ae118df980f7cabe082457d9551e0b706a Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Wed, 23 Nov 2022 19:39:47 +0000
Subject: [PATCH 1219/1922] Remove BaseException TODO (#89540)

After discussion in https://github.com/pytorch/pytorch/pull/88461#issuecomment-1318965664
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89540
Approved by: https://github.com/H-Huang
---
 torch/distributed/rpc/internal.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/torch/distributed/rpc/internal.py b/torch/distributed/rpc/internal.py
index fb9f4520df01a..435fd29557d0a 100644
--- a/torch/distributed/rpc/internal.py
+++ b/torch/distributed/rpc/internal.py
@@ -205,8 +205,6 @@ def _run_function(python_udf):
         if isinstance(python_udf, AttributeError):
             raise python_udf
         result = python_udf.func(*python_udf.args, **python_udf.kwargs)
-    # TODO (rohan-varma): This should probably be BaseException, but change can
-    # cause BC issues.
     except Exception as e:
         # except str = exception info + traceback string
         except_str = (

From a2651b39feae71ac6d140c65bb0373b68031d5dd Mon Sep 17 00:00:00 2001
From: Charlie West-Taylor <charliew@graphcore.ai>
Date: Wed, 23 Nov 2022 19:41:07 +0000
Subject: [PATCH 1220/1922] Handle Tensor.__deepcopy__ via clone(), on IPU
 (#89129)

Currently it falls through to a call to `storage()`, which the IPU doesn't support.

I've made the minimal change here for ease of merging (this'd help us if it was in for 1.13.1), however...

**QUESTION**: Is there any reason why `not torch._C._has_storage(self)` needs to *also* be guarded on `self.device.type == privateuseone`? in other words, could the condition for using `clone` not be this?

```python
self.is_sparse
or self.device.type
in ["lazy", "xla", "mps", "ort", "meta", "hpu", "ipu"]
or not torch._C._has_storage(self)
or (type(self) is not Tensor and self.data_ptr() == 0)
```

If the condition fails, the very next thing is a call to `self._typed_storage()` which will fail, so it feels to me like *any* case without storage shouldn't fall through to the `storage()` call.

The original PR for adding the 'no storage and device is `PrivateUse1`' condition ([86557](https://github.com/pytorch/pytorch/pull/86557)) doesn't discuss whether this could be broadened.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89129
Approved by: https://github.com/albanD
---
 torch/_tensor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/_tensor.py b/torch/_tensor.py
index 39fc56452f5a4..6c610cbadb586 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -97,7 +97,8 @@ def __deepcopy__(self, memo):
             # Update the test in test_serialization if you remove 'meta' from here
             if (
                 self.is_sparse
-                or self.device.type in ["lazy", "xla", "mps", "ort", "meta", "hpu"]
+                or self.device.type
+                in ["lazy", "xla", "mps", "ort", "meta", "hpu", "ipu"]
                 or (
                     not torch._C._has_storage(self)
                     and self.device.type == "privateuseone"

From 5b78127233d041b22be1fa63bab956c97b847890 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Wed, 23 Nov 2022 19:43:28 +0000
Subject: [PATCH 1221/1922] Thread PG: add allreduce to threaded pg (#89043)

Summary:
Goal
Add `all_reduce` collective  to multi-threaded ProcessGroup added in D40236769 (https://github.com/pytorch/pytorch/commit/6663ae5537f3c61030ba4d425bd57a097c51430a).

Code Motion
Added `allreduce` collective to ProcessLocalGroup (a subclass of c10d ProcessGroup).

What's Next
Add a DDP test utilizing the new allreduce op.
Generalize `allreduce` to allow other `ReduceOp`s besides `SUM`.

Test Plan:
cd fbcode/caffe2
buck2 test mode/dev //caffe2/test/distributed:multi_threaded

Differential Revision: D41046606

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89043
Approved by: https://github.com/wanchaol
---
 test/distributed/test_multi_threaded_pg.py    | 11 ++++++++
 .../distributed/multi_threaded_pg.py          | 27 +++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
index f520698258ed9..875e3f066384d 100644
--- a/test/distributed/test_multi_threaded_pg.py
+++ b/test/distributed/test_multi_threaded_pg.py
@@ -3,6 +3,7 @@
 import sys
 import torch
 import torch.distributed as dist
+from torch._C._distributed_c10d import ReduceOp
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
@@ -71,6 +72,16 @@ def test_broadcast_object_list(self):
         dist.broadcast_object_list(object_list=object_list)
         self.assertEqual(99, object_list[0])
 
+    def test_all_reduce(self):
+        output = torch.ones(3, 3) * dist.get_rank()
+        dist.all_reduce(output)
+        res_num = ((0 + self.world_size - 1) * self.world_size) / 2
+        self.assertEqual(output, torch.ones(3, 3) * res_num)
+
+        # Test unimplemented error
+        with self.assertRaisesRegex(NotImplementedError, "only supports SUM on threaded pg for now"):
+            dist.all_reduce(output, op=ReduceOp.MAX)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
index df45748ee6c6f..b66ca14731659 100644
--- a/torch/testing/_internal/distributed/multi_threaded_pg.py
+++ b/torch/testing/_internal/distributed/multi_threaded_pg.py
@@ -10,10 +10,12 @@
 from torch._C._distributed_c10d import (
     _create_work_from_future,
     AllgatherOptions,
+    AllreduceOptions,
     BroadcastOptions,
     ReduceScatterOptions,
     ScatterOptions,
     Store,
+    ReduceOp,
 )
 from torch.futures import Future
 from torch.utils._pytree import tree_flatten
@@ -39,6 +41,25 @@ def ret_work(ret):
     return _create_work_from_future(fut)
 
 
+class AllReduce:
+    def __init__(self, op):
+        if op != ReduceOp.SUM:
+            raise NotImplementedError(
+                "AllReduce only supports SUM on threaded pg for now."
+            )
+        self.op = op
+
+    def work(self, data):
+        # data: List[List[Tensor]]
+        res = data[0][0]
+        for src_rank in range(1, len(data)):
+            in_tensor_list = data[src_rank]
+            res.add_(in_tensor_list[0])  # Hardcoded
+        with torch.no_grad():
+            for src_rank in range(len(data)):
+                data[src_rank][0].copy_(res)
+
+
 class AllGather:
     def work(self, data):
         for src_rank in range(len(data)):
@@ -183,6 +204,12 @@ def _end_coll(cls, collective):
             if cls._cur_coll == collective:
                 cls._cur_coll = None
 
+    def allreduce(self, tensor_list, opts=AllreduceOptions()):
+        coll = ProcessLocalGroup._start_coll(self._world, AllReduce(opts.reduceOp))
+        res = coll.join(self._rank, tensor_list)
+        ProcessLocalGroup._end_coll(coll)
+        return res
+
     def allgather(self, output_tensors, input_tensor, opts=AllgatherOptions()):
         coll = ProcessLocalGroup._start_coll(self._world, AllGather())
         res = coll.join(self._rank, (output_tensors, input_tensor))

From 9081bd328689b01457559654dd3a82806b1313fe Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 23 Nov 2022 19:44:46 +0000
Subject: [PATCH 1222/1922] [Dynamo] Fix several bugs & code refactor in
 RangeVariable (#89322)

Fix bug in [7k github models](https://github.com/pytorch/torchdynamo/issues/1884): https://github.com/jansel/pytorch-jit-paritybench/blob/master/generated/test_clovaai_stargan_v2.py
```
E       TypeError: 'list' object cannot be interpreted as an integer
E
E       from user code:
E          File "/scratch/ybliang/work/repos/pytorch-jit-paritybench/generated/test_clovaai_stargan_v2.py", line 335, in forward
E           idx = torch.LongTensor(range(y.size(0)))
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89322
Approved by: https://github.com/jansel
---
 test/dynamo/test_functions.py      | 11 ++++++
 torch/_dynamo/variables/builder.py | 24 ++++++------
 torch/_dynamo/variables/builtin.py | 23 ++++-------
 torch/_dynamo/variables/lists.py   | 63 +++++++++++++++++-------------
 torch/fx/node.py                   |  1 +
 torch/fx/proxy.py                  |  3 ++
 6 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py
index 7b510fef07547..327fa64f1209f 100644
--- a/test/dynamo/test_functions.py
+++ b/test/dynamo/test_functions.py
@@ -217,6 +217,17 @@ def test_slice5(a):
     def test_slice6(a):
         return torch.unsqueeze(a, 0)[:, 2:]
 
+    @make_test
+    def test_range1(a):
+        return torch.tensor(range(a.size(0)))
+
+    @make_test
+    def test_range2(x, y):
+        r = x + y
+        for i in range(x.size(0) + 2):
+            r = r / y
+        return r
+
     @make_test
     def test_unpack1(a):
         a, b = a[:5], a[5:]
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index b1b691c41fc60..6db7cbf87820d 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -241,9 +241,19 @@ def _wrap(self, value):
             return ListIteratorVariable(
                 output, mutable_local=MutableLocal(), guards=guards
             )
-        elif istype(value, range):
-            guards = self.make_guards(GuardBuilder.EQUALS_MATCH)
-            return RangeVariable(value=value, guards=guards)
+        elif istype(value, (slice, range)):
+            items = [
+                VariableBuilder(self.tx, AttrSource(self.get_source(), k))(
+                    getattr(value, k)
+                )
+                for k in ("start", "stop", "step")
+            ]
+            if isinstance(value, slice):
+                return SliceVariable(items, guards=make_guards(GuardBuilder.TYPE_MATCH))
+            else:
+                return RangeVariable(
+                    items, guards=make_guards(GuardBuilder.EQUALS_MATCH)
+                )
         elif istype(
             value, (dict, collections.defaultdict, collections.OrderedDict)
         ) and all(
@@ -448,14 +458,6 @@ def index_source(key):
             return HFPretrainedConfigVariable(
                 value, guards=make_guards(GuardBuilder.TYPE_MATCH)
             )
-        elif isinstance(value, slice):
-            items = [
-                VariableBuilder(self.tx, AttrSource(self.get_source(), k))(
-                    getattr(value, k)
-                )
-                for k in ("start", "stop", "step")
-            ]
-            return SliceVariable(items, guards=make_guards(GuardBuilder.TYPE_MATCH))
         elif isinstance(value, PyOperator):
             return TorchPyOperator(
                 value,
diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py
index 904ed8a49f81c..369b9364a4163 100644
--- a/torch/_dynamo/variables/builtin.py
+++ b/torch/_dynamo/variables/builtin.py
@@ -468,28 +468,19 @@ def _call_min_max(self, tx, a, b):
     call_min = _call_min_max
     call_max = _call_min_max
 
-    def call_range(self, tx, *args, **kwargs):
-        if self.unspec_python_args(*args, **kwargs) or self.constant_args(
-            *args, **kwargs
-        ):
-            args, kwargs = specialize_args_kwargs(tx, args, kwargs)
-            return variables.RangeVariable(
-                value=range(
-                    *[x.value for x in args],
-                    **{k: v.value for k, v in kwargs.items()},
-                ),
-            )
-        elif self._dynamic_args(*args, **kwargs):
-            assert len(kwargs) == 0
+    def call_range(self, tx, *args):
+        if self.unspec_python_args(*args) or self.constant_args(*args):
+            args, _ = specialize_args_kwargs(tx, args, {})
+            return variables.RangeVariable(args)
+        elif self._dynamic_args(*args):
 
             def guard_if_dyn(arg):
                 if isinstance(arg, DynamicShapeVariable):
                     return arg.evaluate_expr(tx.output)
                 return arg
 
-            args = [guard_if_dyn(arg) for arg in args]
-            value = self.fn(*args)
-            return variables.RangeVariable(value=value)
+            args = [variables.ConstantVariable(guard_if_dyn(arg)) for arg in args]
+            return variables.RangeVariable(args)
         # None no-ops this handler and lets the driving function proceed
         return None
 
diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py
index 553c9ca1e664d..8214edcc4c9de 100644
--- a/torch/_dynamo/variables/lists.py
+++ b/torch/_dynamo/variables/lists.py
@@ -93,42 +93,51 @@ def call_method(
 
 
 class RangeVariable(BaseListVariable):
-    def __init__(self, value, items=None, guards=None, **kwargs):
-        if items is None:
-            items = [variables.ConstantVariable(x, guards=guards) for x in value]
-        super().__init__(items, guards=guards, **kwargs)
-        self.value = value
+    def __init__(self, items, **kwargs):
+        items_to_map = items
+        start = variables.ConstantVariable(0)
+        stop = None
+        step = variables.ConstantVariable(1)
+
+        if len(items_to_map) == 1:
+            (stop,) = items_to_map
+        elif len(items_to_map) == 2:
+            start, stop = items_to_map
+        elif len(items_to_map) == 3:
+            start, stop, step = items_to_map
+        else:
+            raise AssertionError()
+
+        assert stop is not None
+        super().__init__([start, stop, step], **kwargs)
 
     def python_type(self):
         return range
 
     def as_python_constant(self):
-        return self.value
+        return range(*[x.as_python_constant() for x in self.items])
 
-    def reconstruct(self, codegen):
-        assert "range" not in codegen.tx.f_globals
-        range_fn = codegen.create_load_global("range", add=True)
-        if self.value.step == 1:
-            if self.value.start == 0:
-                return [
-                    range_fn,
-                    codegen.create_load_const(self.value.stop),
-                    create_instruction("CALL_FUNCTION", 1),
-                ]
-            return [
-                range_fn,
-                codegen.create_load_const(self.value.start),
-                codegen.create_load_const(self.value.stop),
-                create_instruction("CALL_FUNCTION", 2),
-            ]
+    def as_proxy(self):
+        return self.python_type()(*self._as_proxy())
+
+    def unpack_var_sequence(self, tx):
         return [
-            range_fn,
-            codegen.create_load_const(self.value.start),
-            codegen.create_load_const(self.value.stop),
-            codegen.create_load_const(self.value.step),
-            create_instruction("CALL_FUNCTION", 3),
+            variables.ConstantVariable(x).add_options(self)
+            for x in self.as_python_constant()
         ]
 
+    def reconstruct(self, codegen):
+        assert "range" not in codegen.tx.f_globals
+        codegen.append_output(codegen.create_load_python_module(range))
+        codegen.foreach(self.items)
+        return [create_instruction("CALL_FUNCTION", 3)]
+
+    def var_getattr(self, tx, name):
+        fields = ["start", "stop", "step"]
+        if name not in fields:
+            unimplemented(f"range.{name}")
+        return self.items[fields.index(name)].add_options(self)
+
 
 class ListVariable(BaseListVariable):
     def python_type(self):
diff --git a/torch/fx/node.py b/torch/fx/node.py
index 0505b39565cd7..2003feb6db339 100644
--- a/torch/fx/node.py
+++ b/torch/fx/node.py
@@ -24,6 +24,7 @@
     List[Any],  # actually Argument
     Dict[str, Any],  # actually Argument
     slice,  # Slice[Argument, Argument, Argument], but slice is not a templated type in typing
+    range,
     'Node',
     BaseArgumentTypes
 ]]
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index 064717eaabfc1..6f9535b117370 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -157,6 +157,9 @@ def no_node(arg):
         elif isinstance(a, slice):
             return slice(self.create_arg(a.start), self.create_arg(a.stop), self.create_arg(a.step))
 
+        elif isinstance(a, range):
+            return range(self.create_arg(a.start), self.create_arg(a.stop), self.create_arg(a.step))
+
         if isinstance(a, Proxy):
             # base case: we unwrap the Proxy object
             return a.node

From cac78dd7af7ca0a8fdf9738c78fda9a1bce8964d Mon Sep 17 00:00:00 2001
From: Charlie West-Taylor <charliew@graphcore.ai>
Date: Wed, 23 Nov 2022 19:51:50 +0000
Subject: [PATCH 1223/1922] Mark IPU device as not supports_as_strided (#89130)

Currently causes issues in calls to `.to`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89130
Approved by: https://github.com/albanD
---
 c10/core/Device.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/c10/core/Device.h b/c10/core/Device.h
index cea7cfec119e9..d53ab38ff9cb9 100644
--- a/c10/core/Device.h
+++ b/c10/core/Device.h
@@ -148,7 +148,8 @@ struct C10_API Device final {
 
   /// Return true if the device supports arbirtary strides.
   bool supports_as_strided() const noexcept {
-    return type_ != DeviceType::XLA && type_ != DeviceType::Lazy;
+    return type_ != DeviceType::IPU && type_ != DeviceType::XLA &&
+        type_ != DeviceType::Lazy;
   }
 
   /// Same string as returned from operator<<.

From d6d79d89ca702b48912b601403ae872a34e663e0 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Wed, 23 Nov 2022 20:11:39 +0000
Subject: [PATCH 1224/1922] [dashboard] Add graphs for all summary metrics, add
 additional testing flags (#89580)

Title. Test post: https://github.com/pytorch/torchdynamo/issues/1831#issuecomment-1325572179

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89580
Approved by: https://github.com/davidberard98
---
 benchmarks/dynamo/runner.py | 46 ++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 29605f946bbd9..38bfa3160625d 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -220,11 +220,29 @@ def parse_args():
         default=False,
         help="Updates to dashboard",
     )
+    parser.add_argument(
+        "--no-graphs",
+        action="store_true",
+        default=False,
+        help="Do not genenerate and upload metric graphs",
+    )
+    parser.add_argument(
+        "--no-update-archive",
+        action="store_true",
+        default=False,
+        help="Do not update lookup.csv or the log archive",
+    )
+    parser.add_argument(
+        "--no-gh-comment",
+        action="store_true",
+        default=False,
+        help="Do not write a comment to github",
+    )
     parser.add_argument(
         "--update-dashboard-test",
         action="store_true",
         default=False,
-        help="Do not udpate lookup file or upload images/comments when --update-dashboard is specified",
+        help="does all of --no-graphs, --no-update-lookup, and --no-gh-comment",
     )
     parser.add_argument(
         "--dashboard-image-uploader",
@@ -1074,7 +1092,7 @@ def find_last_k(self):
     def generate_comment(self):
         title = "## Metrics over time ##\n"
         str_io = io.StringIO()
-        if not self.args.update_dashboard_test:
+        if not self.args.update_dashboard_test and not self.args.no_graphs:
             for name in glob.glob(self.args.output_dir + "/*over_time.png"):
                 output = (
                     subprocess.check_output([self.args.dashboard_image_uploader, name])
@@ -1090,7 +1108,7 @@ def generate_comment(self):
     def diff(self):
         log_infos = self.find_last_k()
 
-        for metric in ["geomean", "passrate"]:
+        for metric in ["geomean", "passrate", "comp_time", "memory"]:
             fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
             for idx, suite in enumerate(self.suites):
                 dfs = []
@@ -1105,7 +1123,7 @@ def diff(self):
                     df = pd.read_csv(gmean_filename)
                     if suite not in df:
                         continue
-                    if metric == "geomean":
+                    if metric == "geomean" or metric == "memory":
                         df[suite] = df[suite].str.replace("x", "").astype(float)
                     elif metric == "passrate":
                         df[suite] = df[suite].str.split("%").str[0].astype(float)
@@ -1152,7 +1170,7 @@ def __init__(self, args):
         self.lookup_file = os.path.join(self.args.dashboard_archive_path, "lookup.csv")
         assert os.path.exists(self.lookup_file)
         try:
-            if not self.args.update_dashboard_test:
+            if not self.args.update_dashboard_test and not self.args.no_update_archive:
                 self.update_lookup_file()
         except subprocess.CalledProcessError:
             sys.stderr.write("failed to update lookup file\n")
@@ -1180,7 +1198,7 @@ def archive(self):
     def upload_graphs(self):
         title = "## Performance graphs ##\n"
         str_io = io.StringIO()
-        if not self.args.update_dashboard_test:
+        if not self.args.update_dashboard_test and not self.args.no_graphs:
             for name in glob.glob(self.output_dir + "/*png"):
                 if "over_time" not in name:
                     output = (
@@ -1255,8 +1273,10 @@ def update(self):
         print(comment)
 
         if not self.args.update_dashboard_test:
-            self.comment_on_gh(comment)
-            self.archive()
+            if not self.args.no_gh_comment:
+                self.comment_on_gh(comment)
+            if not self.args.no_update_archive:
+                self.archive()
 
 
 if __name__ == "__main__":
@@ -1310,9 +1330,13 @@ def extract(key):
             )
             raise e
         if not args.log_operator_inputs:
-            archive(
-                output_dir, args.dashboard_archive_path, args.archive_name, dtypes[0]
-            )
+            if not args.no_update_archive:
+                archive(
+                    output_dir,
+                    args.dashboard_archive_path,
+                    args.archive_name,
+                    dtypes[0],
+                )
             parse_logs(
                 args, dtypes, suites, devices, compilers, flag_compilers, output_dir
             )

From 339457d6511ac37886db0af4aa5379a785fe7060 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Wed, 23 Nov 2022 20:18:54 +0000
Subject: [PATCH 1225/1922] Reland #89031 Added conv constraint that infers
 layouts (#89530)

Relands #89031
Per title. We now set strides from fx graph only for convolutions and mm, which is a hack, but bmm in some cases caused extra copy, and there is no obvious way to fix that, we should rethink the strides anyway.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89530
Approved by: https://github.com/Chillee
---
 test/inductor/test_torchinductor.py |   4 +-
 torch/_inductor/graph.py            |  34 +++++++-
 torch/_inductor/ir.py               |   3 +
 torch/_inductor/lowering.py         | 118 ++++++++++------------------
 torch/_inductor/utils.py            |   3 +-
 5 files changed, 80 insertions(+), 82 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 2287316d788e4..743655d07fd73 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -68,7 +68,6 @@
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 aten = torch.ops.aten
-
 requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
 
 torch._inductor.config.triton.autotune = False  # too slow
@@ -5480,6 +5479,8 @@ def get_kernels(self, fn, args) -> typing.List[CachingAutotuner]:
             return kernels
 
         def test_divisibile_by_16_covers_numel_args(self):
+            torch._dynamo.reset()
+
             def fn(a: torch.Tensor) -> torch.Tensor:
                 return torch.sum(a)
 
@@ -5499,6 +5500,7 @@ def fn(a: torch.Tensor) -> torch.Tensor:
                 kernels[1].meta["configs"][0].divisible_by_16
             )
             self.assertEqual(arguments_that_are_divisible_by_16_in_kernel1, (0, 1))
+            torch._dynamo.reset()
 
 
 if __name__ == "__main__":
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 5aef1c548e2df..44f136b356a71 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -20,7 +20,12 @@
     MissingOperatorWithoutDecomp,
 )
 from .ir import Constant, FixedLayout, InputBuffer, Pointwise, Reduction, TensorBox
-from .lowering import lowerings, make_fallback, needs_realized_inputs
+from .lowering import (
+    layout_constraints,
+    lowerings,
+    make_fallback,
+    needs_realized_inputs,
+)
 from .sizevars import SizeVarAllocator
 from .utils import dynamo_utils, gather_origins, get_dtype_size, sympy_product
 from .virtualized import V
@@ -309,7 +314,12 @@ def finalize(self):
 
     def run_node(self, n: torch.fx.Node):
         with ir.IRNode.current_origins({n}):
-            result = super().run_node(n)
+            if n.op == "call_function" and n.target in layout_constraints:
+                args, kwargs = self.fetch_args_kwargs_from_env(n)
+                args, kwargs = layout_constraints[n.target](n, *args, **kwargs)
+                result = self.call_function(n.target, args, kwargs)
+            else:
+                result = super().run_node(n)
 
             # Realize if (1) any user need inputs realized, or (2) there is
             # already too many reads and rematerializing can be bad.
@@ -318,7 +328,25 @@ def run_node(self, n: torch.fx.Node):
                 for user in n.users:
                     if user.target in needs_realized_inputs:
                         result.realize_hint()
-                    elif user.op == "output":
+                        # This inclusion is somewhat controversial (from
+                        # discussion between Horace, Natalia, and Elias).
+                        # Currently, it's not very clear why this is helpful.
+                        # The general idea here is that even though a node may
+                        # have FlexibleLayout, we still often *treat* it as if
+                        # it was contiguous. This appears to sometime result in
+                        # suboptimal behavior.
+                        #
+                        # When we do a better job selecting layout, we should
+                        # revisit this.
+                        if user.target in (
+                            torch.ops.aten.convolution.default,
+                            torch.ops.aten.convolution_backward.default,
+                            torch.ops.aten.mm.default,
+                        ):
+                            result = ir.ExternKernel.require_stride_order(
+                                result, ir.get_stride_order(n.meta["val"].stride())
+                            )
+                    if user.op == "output":
                         if isinstance(result.data.data, (Pointwise, Reduction)):
                             result.realize()
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index f612000edf4ce..033badc781f11 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2478,6 +2478,9 @@ def require_stride1(cls, x):
 
     @classmethod
     def require_stride_order(cls, x, order):
+        if x.get_numel() == 0:  # Layout doesn't matter
+            return x
+
         # require x to have the layout as strided_ordered as order
         if is_storage_and_layout(x):
             if isinstance(
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 221f064e2e731..9b65431c6a175 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -23,7 +23,6 @@
 from .decomposition import decompositions, get_decompositions
 from .ir import (
     ExpandView,
-    get_stride_order,
     IndexingConstant,
     IndexingDiv,
     PermuteView,
@@ -38,6 +37,7 @@
 
 log = logging.getLogger(__name__)
 lowerings = {}
+layout_constraints = {}
 fallbacks = set()
 aten = torch.ops.aten
 prims = torch.ops.prims
@@ -53,6 +53,14 @@ def add_needs_realized_inputs(fn):
             needs_realized_inputs.add(getattr(fn, overload))
 
 
+def add_layout_constraint(fn, constraint):
+    if isinstance(fn, torch._ops.OpOverloadPacket):
+        for overload in fn.overloads():
+            layout_constraints[getattr(fn, overload)] = constraint
+    else:
+        layout_constraints[fn] = constraint
+
+
 add_needs_realized_inputs(
     [
         aten.as_strided,
@@ -1013,12 +1021,10 @@ def linear_binary(x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr):
 register_onednn_fusion_ops()
 
 
-def fallback_handler(kernel, inps_hook=None):
+def fallback_handler(kernel):
     fallbacks.add(kernel)
 
     def handler(*args, **kwargs):
-        if inps_hook is not None:
-            args, kwargs = inps_hook(*args, **kwargs)
         return pytree.tree_map(
             TensorBox.create, ir.FallbackKernel.create(kernel, *args, **kwargs)
         )
@@ -1026,7 +1032,7 @@ def handler(*args, **kwargs):
     return handler
 
 
-def make_fallback(kernel, inps_hook=None):
+def make_fallback(kernel, layout_constraint=None):
     assert (
         kernel not in decompositions
     ), f"both a fallback and a decomp for same kernel: {kernel}"
@@ -1036,9 +1042,9 @@ def make_fallback(kernel, inps_hook=None):
         )
 
     add_needs_realized_inputs(kernel)
-    return register_lowering(kernel, type_promotion_kind=None)(
-        fallback_handler(kernel, inps_hook)
-    )
+    if layout_constraint is not None:
+        add_layout_constraint(kernel, layout_constraint)
+    return register_lowering(kernel, type_promotion_kind=None)(fallback_handler(kernel))
 
 
 @register_lowering(aten.native_dropout, type_promotion_kind=None)
@@ -1189,72 +1195,14 @@ def inner_fn(index):
     )
 
 
-def conv_backward(*args, **kwargs):
-    # output striding complex and has a lot of build dependent options,
-    # take the output strides to determine what to set the inputs
-    with torch._subclasses.FakeTensorMode():
-        args_fake, kwargs_fake = pytree.tree_map_only(
-            ir.IRNode,
-            lambda t: ir.ir_node_to_tensor(t, guard_shape=False),
-            (args, kwargs),
-        )
-        output = aten.convolution_backward(*args_fake, **kwargs_fake)
-
-    def constraints(
-        grad_output,
-        input,
-        weight,
-        bias_sizes,
-        stride,
-        padding,
-        dilation,
-        transposed,
-        output_padding,
-        groups,
-        output_mask,
-    ):
-        out = (
-            output[0]
-            if output[0] is not None
-            else output[1]
-            if output[1] is not None
-            else output[2]
-        )
-        if out is not None:
-            stride_order = get_stride_order(out.stride())
-            grad_output = ir.ExternKernel.require_stride_order(
-                grad_output, stride_order
-            )
-            weight = ir.ExternKernel.require_stride_order(weight, stride_order)
-            # Only make input contiguous when it is necessary for the backwards computation
-            if output_mask[1]:
-                input = ir.ExternKernel.require_stride_order(input, stride_order)
-
-        return (
-            grad_output,
-            input,
-            weight,
-            bias_sizes,
-            stride,
-            padding,
-            dilation,
-            transposed,
-            output_padding,
-            groups,
-            output_mask,
-        ), {}
-
-    return constraints(*args, **kwargs)
-
-
-def require_dense(*args, **kwargs):
+def require_dense(_, *args, **kwargs):
     args, kwargs = pytree.tree_map_only(
         ir.IRNode, lambda t: ir.ExternKernel.require_stride1(t), (args, kwargs)
     )
     return args, kwargs
 
 
-def require_contiguous(*args, **kwargs):
+def require_contiguous(_, *args, **kwargs):
     args, kwargs = pytree.tree_map_only(
         ir.IRNode, lambda t: ir.ExternKernel.require_contiguous(t), (args, kwargs)
     )
@@ -1264,26 +1212,42 @@ def require_contiguous(*args, **kwargs):
 if has_torchvision_roi_align():
     make_fallback(torch.ops.torchvision.roi_align)
 
+
+def constrain_to_fx_strides(fx_node, *args, **kwargs):
+    def apply_constraint(arg, fx_arg):
+        if isinstance(arg, ir.IRNode):
+            stride_order = ir.get_stride_order(fx_arg.meta["val"].stride())
+            return ir.ExternKernel.require_stride_order(arg, stride_order)
+        return arg
+
+    args = [apply_constraint(arg, fx_arg) for arg, fx_arg in zip(args, fx_node.args)]
+    kwargs = {k: apply_constraint(v, fx_node.kwargs[k]) for k, v in kwargs.items()}
+    return args, kwargs
+
+
 # TODO(jansel): we should implement decomps or lowerings for these
 # https://github.com/pytorch/torchdynamo/issues/327
 make_fallback(aten._adaptive_avg_pool2d_backward, require_dense)
-make_fallback(aten.convolution_backward, inps_hook=conv_backward)
+make_fallback(aten.convolution_backward, constrain_to_fx_strides)
 make_fallback(aten._cudnn_rnn, require_dense)
-make_fallback(aten._cudnn_rnn_backward, inps_hook=require_contiguous)
-make_fallback(aten.cumsum, inps_hook=require_dense)
-make_fallback(aten._embedding_bag, inps_hook=require_contiguous)
-make_fallback(aten._embedding_bag_forward_only, inps_hook=require_contiguous)
+make_fallback(aten._cudnn_rnn_backward, require_contiguous)
+make_fallback(aten.cumsum, require_dense)
+make_fallback(aten._embedding_bag, require_contiguous)
+make_fallback(aten._embedding_bag_forward_only, require_contiguous)
 make_fallback(aten._fused_moving_avg_obs_fq_helper)
 make_fallback(aten._fused_moving_avg_obs_fq_helper_functional)
-make_fallback(aten.grid_sampler_2d_backward, inps_hook=require_dense)
+make_fallback(aten.grid_sampler_2d_backward, require_dense)
 make_fallback(aten.randperm)
 make_fallback(aten.sort)
 make_fallback(aten.sort.stable)
 make_fallback(aten._sparse_coo_tensor_with_dims_and_tensors)
-make_fallback(aten._thnn_fused_lstm_cell, inps_hook=require_dense)
+make_fallback(aten._thnn_fused_lstm_cell, require_dense)
 make_fallback(aten.topk)
-make_fallback(aten.upsample_bicubic2d_backward, inps_hook=require_contiguous)
-make_fallback(aten.upsample_bilinear2d_backward, inps_hook=require_dense)
+make_fallback(aten.upsample_bicubic2d_backward, require_contiguous)
+make_fallback(aten.upsample_bilinear2d_backward, require_dense)
+
+
+add_layout_constraint(aten.convolution, constrain_to_fx_strides)
 
 
 @register_lowering(aten.convolution)
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index 62357be8bcf39..36a645c99a97b 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -374,7 +374,8 @@ def fresh_inductor_cache(cache_entries=None):
 
 
 def argsort(seq):
-    return sorted(range(len(seq)), key=seq.__getitem__)
+    # preserve original order for equal strides
+    return list(reversed(sorted(range(len(seq)), key=seq.__getitem__, reverse=True)))
 
 
 @functools.lru_cache(8)

From 8dc4499d4e1464b936aa873359e90379862cb0f4 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 23 Nov 2022 11:03:45 -0800
Subject: [PATCH 1226/1922] [quant][docs] Add docstrings for operators defined
 in torch.ops.quantized_decomposed namespace (#89547)

Summary:
no functionality changes

Test Plan:
NA

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89547
Approved by: https://github.com/vkuzo
---
 torch/ao/quantization/fx/_decomposed.py | 186 ++++++++++++++++++++++--
 1 file changed, 172 insertions(+), 14 deletions(-)

diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py
index 3b80dfdf2a64d..ec814d6a17bb3 100644
--- a/torch/ao/quantization/fx/_decomposed.py
+++ b/torch/ao/quantization/fx/_decomposed.py
@@ -1,6 +1,7 @@
 import torch
 from torch.library import Library, impl
 from torch.ao.quantization import MinMaxObserver
+from typing import Tuple
 
 # Note: decomposed means decomposed quantized tensor, using decomposed so that the
 # name is not too long
@@ -28,10 +29,33 @@ def _quant_min_max_bounds_check(quant_min, quant_max, dtype):
         f"quant_max_upper_bound: {quant_max_upper_bound} quant_max: {quant_max}"
 
 quantized_decomposed_lib.define(
-    "quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+    "quantize_per_tensor(Tensor input, float scale, int zero_point, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
 
 @impl(quantized_decomposed_lib, "quantize_per_tensor", "CompositeExplicitAutograd")
-def quantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
+def quantize_per_tensor(
+        input: torch.Tensor,
+        scale: float,
+        zero_point: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine quantization for the Tensor using the same quantization parameters to map
+    from floating point to quantized values
+
+    Args:
+       input (torch.Tensor): original float32 Tensor
+       scale (float): quantization parameter for affine quantization
+       zero_point (int): quantization parameter for affine quantization
+       quant_min (int): minimum quantized value for output Tensor
+       quant_max (int): maximum quantized value for output Tensor
+       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor
+
+    Returns:
+       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
+       are not stored in the Tensor, we are storing them in function arguments instead
+    """
     assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
     _quant_min_max_bounds_check(quant_min, quant_max, dtype)
 
@@ -39,11 +63,23 @@ def quantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
     return torch.clamp(torch.round(input * inv_scale) + zero_point, quant_min, quant_max).to(dtype)
 
 quantized_decomposed_lib.define(
-    "quantize_per_tensor.tensor("
-    "Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+    "quantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
 
 @impl(quantized_decomposed_lib, "quantize_per_tensor.tensor", "CompositeExplicitAutograd")
-def quantize_per_tensor_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
+def quantize_per_tensor_tensor(
+        input: torch.Tensor,
+        scale: torch.Tensor,
+        zero_point: torch.Tensor,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine quantization for the Tensor using the same quantization parameters to map
+    from floating point to quantized values
+    Same as `quantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
+    scalar values
+    """
     assert zero_point.numel() == 1, f"Exepecting zero_point tensor to be one element, but received : {zero_point.numel()}"
     assert scale.numel() == 1, f"Exepecting scale tensor to be one element, but received : {scale.numel()}"
     return quantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
@@ -53,10 +89,42 @@ def quantize_per_tensor_tensor(input, scale, zero_point, quant_min, quant_max, d
 # matching in the future
 # We will revisit this later if we found there are no use cases for it
 quantized_decomposed_lib.define(
-    "dequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+    "dequantize_per_tensor(Tensor input, float scale, int zero_point, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
 
 @impl(quantized_decomposed_lib, "dequantize_per_tensor", "CompositeExplicitAutograd")
-def dequantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
+def dequantize_per_tensor(
+        input: torch.Tensor,
+        scale: float,
+        zero_point: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine dequantization for the Tensor using the same quantization parameters to map
+    from quantized values to floating point values
+
+    Args:
+       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
+       e.g. (`torch.uint8`), it is a per tensor quantized Tensor if combined with
+       quantization parameters in the argument of this function (scale/zero_point)
+
+       scale (float): quantization parameter for affine quantization
+
+       zero_point (int): quantization parameter for affine quantization
+
+       quant_min (int): minimum quantized value for input Tensor (not used in computation,
+       reserved for pattern matching)
+
+       quant_max (int): maximum quantized value for input Tensor (not used in computation,
+       reserved for pattern matching)
+
+       dtype (torch.dtype): dtype for input Tensor (not used in computation,
+       reserved for pattern matching)
+
+    Returns:
+       dequantized float32 Tensor
+    """
     assert input.dtype == dtype, f"Expecting input to have dtype: {dtype}"
     if dtype in [torch.uint8, torch.int8]:
         # TODO: investigate why
@@ -68,21 +136,52 @@ def dequantize_per_tensor(input, scale, zero_point, quant_min, quant_max, dtype)
 
 
 quantized_decomposed_lib.define(
-    "dequantize_per_tensor.tensor("
-    "Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+    "dequantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
 
 @impl(quantized_decomposed_lib, "dequantize_per_tensor.tensor", "CompositeExplicitAutograd")
-def dequantize_per_tensor_tensor(input, scale, zero_point, quant_min, quant_max, dtype):
+def dequantize_per_tensor_tensor(
+        input: torch.Tensor,
+        scale: torch.Tensor,
+        zero_point: torch.Tensor,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine dequantization for the Tensor using the same quantization parameters to map
+    from quantized values to floating point values
+    Same as `dequantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
+    scalar values
+    """
     assert zero_point.numel() == 1, f"Exepecting zero_point tensor to be one element, but received : {zero_point.numel()}"
     assert scale.numel() == 1, f"Exepecting scale tensor to be one element, but received : {scale.numel()}"
     return dequantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
 
 
 quantized_decomposed_lib.define(
-    "choose_qparams.tensor(Tensor input, int quant_min, int quant_max, ScalarType dtype) -> (Tensor, Tensor)")
+    "choose_qparams.tensor(Tensor input, int quant_min, int quant_max, "
+    "ScalarType dtype) -> (Tensor, Tensor)")
 
 @impl(quantized_decomposed_lib, "choose_qparams.tensor", "CompositeExplicitAutograd")
-def choose_qparams_tensor(input, quant_min, quant_max, dtype):
+def choose_qparams_tensor(
+        input: torch.Tensor,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> Tuple[float, int]:
+    """ Given an input Tensor, derive the per tensor affine quantization parameter
+    (scale and zero_point) for target quantized Tensor from the Tensor
+
+    Args:
+       input (torch.Tensor): floating point input Tensor
+       quant_min (int): minimum quantized value for target quantized Tensor
+       quant_max (int): maximum quantized value for target quantized Tensor
+       dtype (torch.dtype): dtype for target quantized Tensor
+
+    Returns:
+       scale (float): quantization parameter for the target quantized Tensor
+       zero_point (int): quantization parameter for the target quantized Tensor
+    """
     assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
     assert quant_min < quant_max, f"Expecting quant_min to be smaller than quant_max but received min: {quant_min} max: {quant_max}"
 
@@ -108,7 +207,32 @@ def _permute_to_axis_zero(x, axis):
     "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
 
 @impl(quantized_decomposed_lib, "quantize_per_channel", "CompositeExplicitAutograd")
-def quantize_per_channel(input, scales, zero_points, axis, quant_min, quant_max, dtype):
+def quantize_per_channel(
+        input: torch.Tensor,
+        scales: torch.Tensor,
+        zero_points: torch.Tensor,
+        axis: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine per channel quantization for the Tensor using the same quantization
+    parameters for each channel/axis to map from floating point to quantized values
+
+    Args:
+       input (torch.Tensor): original float32 Tensor
+       scales (torch.Tensor): a list of scale quantization parameter for
+       affine quantization, one per channel
+       zero_point (torch.Tensor): a list of zero_point quantization parameter for
+       affine quantization, one per channel
+       quant_min (int): minimum quantized value for output Tensor
+       quant_max (int): maximum quantized value for output Tensor
+       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor
+
+    Returns:
+       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
+       are not stored in the Tensor, we are storing them in function arguments instead
+    """
     assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
     assert axis < input.dim(), f"Expecting axis to be < {input.dim()}"
     _quant_min_max_bounds_check(quant_min, quant_max, dtype)
@@ -134,7 +258,41 @@ def quantize_per_channel(input, scales, zero_points, axis, quant_min, quant_max,
     "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
 
 @impl(quantized_decomposed_lib, "dequantize_per_channel", "CompositeExplicitAutograd")
-def dequantize_per_channel(input, scales, zero_points, axis, quant_min, quant_max, dtype):
+def dequantize_per_channel(
+        input: torch.Tensor,
+        scales: torch.Tensor,
+        zero_points: torch.Tensor,
+        axis: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine per channel dequantization for the Tensor using the same quantization
+    parameters for each channel/axis to map from quantized values to floating point values
+
+    Args:
+       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
+       e.g. (`torch.uint8`), it is a per channel quantized Tensor if combined with
+       quantization parameter in the argument of this function (scales/zero_points/axis)
+
+       scales (torch.Tensor): a list of scale quantization parameter for
+       affine quantization, one per channel
+
+       zero_points (torch.Tensor): a list of zero_point quantization parameter for
+       affine quantization, one per channel
+
+       quant_min (int): minimum quantized value for output Tensor (not used in computation,
+       reserved for pattern matching)
+
+       quant_max (int): maximum quantized value for output Tensor (not used in computation,
+       reserved for pattern matching)
+
+       dtype (torch.dtype): requested dtype for output Tensor (not used in computation,
+       reserved for pattern matching)
+
+    Returns:
+       dquantized float32 Tensor
+    """
     assert input.dtype == dtype, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
     assert axis < input.dim(), f"Expecting axis to be < {input.dim()}"
     _quant_min_max_bounds_check(quant_min, quant_max, dtype)

From 8b898f09c8f1cded0a16dceca6498dd9cdbbefa0 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Wed, 23 Nov 2022 17:03:09 +0000
Subject: [PATCH 1227/1922] Fix Upsample Decomp Striding For Small Channels
 (#89528)

Fix for https://github.com/pytorch/torchdynamo/issues/623.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89528
Approved by: https://github.com/ngimel, https://github.com/anijain2305
---
 test/test_fake_tensor.py        | 15 ++++++++++++++-
 torch/_decomp/decompositions.py |  9 +++++++--
 torch/_inductor/ir.py           |  1 +
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index 3d47cc8ea0e51..1a213bb767b48 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -203,6 +203,19 @@ def test_print_in_fake_mode(self):
             out = str(x)
         assert "FakeTensor" not in out
 
+    @unittest.skipIf(not RUN_CUDA, "requires cuda")
+    def test_upsample_bilinear_small_channels(self):
+        out = []
+        mode = FakeTensorMode()
+        for i, context in enumerate([contextlib.nullcontext, lambda: mode]):
+            with context():
+                arg0_1 = torch.empty_strided((3, 427, 640), (1, 1920, 3), dtype=torch.float32, device='cuda')
+                unsqueeze = torch.ops.aten.unsqueeze.default(arg0_1, 0)
+                out.append(torch.ops.aten.upsample_bilinear2d.default(unsqueeze, [800, 1199], False))
+
+        self.assertTrue(out[1].is_contiguous())
+        self.checkMetaProps(out[0], out[1])
+
     @unittest.skipIf(not RUN_CUDA, "requires cuda")
     def test_cpu_fallback(self):
         with FakeTensorMode(allow_fallback_kernels=False):
@@ -360,7 +373,7 @@ def test_data_dependent_operator(self):
             self.assertRaises(DynamicOutputShapeException, lambda: torch.nonzero(x))
 
     def checkMetaProps(self, t1, t2):
-        prims.utils.compare_tensor_meta(t1, t2)
+        prims.utils.compare_tensor_meta(t1, t2, check_strides=True)
 
     @skipIfCrossRef
     def test_deepcopy(self):
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index e36abd0457e5b..47ecabcb3b4ab 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1904,8 +1904,13 @@ def upsample_bilinear2d(
     result = torch.mul(q1, yscale1) + torch.mul(q2, yscale2)
 
     # convert output to correct memory format, if necessary
-    input_memory_format = utils.suggest_memory_format(input)
-    result = result.contiguous(memory_format=input_memory_format)
+    memory_format = utils.suggest_memory_format(input)
+
+    # following "heuristic: only use channels_last path when it's faster than the contiguous path"
+    if input.device.type == "cuda" and n_channels < 16:
+        memory_format = torch.contiguous_format
+
+    result = result.contiguous(memory_format=memory_format)
 
     return result
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 033badc781f11..254a87c364238 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2985,6 +2985,7 @@ def create(cls, kernel, *args, **kwargs):
             aten._fft_c2c.out,
             aten._linalg_svd.default,
             aten._linalg_svd.U,
+            aten.upsample_bilinear2d.default,
         )
         context = (
             FakeTensorMode if kernel not in fake_incorrect_kernels else nullcontext

From f3be08b154e0313958d2d1f2f0e29fe6f4cc907b Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Wed, 23 Nov 2022 16:47:43 +0000
Subject: [PATCH 1228/1922] Fix norm decomp when dtype is passed in (#89508)

Fix for https://github.com/pytorch/torchdynamo/issues/1889. The wrapper was doing a downcast even when the dtype was explicitly passed in.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89508
Approved by: https://github.com/anijain2305
---
 test/dynamo/test_repros.py      | 25 +++++++++++++++++++++++++
 torch/_decomp/decompositions.py | 14 ++++++++++----
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index e30a1275ed135..cbdf69a337aa2 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -19,6 +19,8 @@
 import torch._dynamo.test_case
 import torch._dynamo.testing
 import torch._dynamo.utils
+
+from test_minifier import requires_cuda
 from torch import nn
 from torch._dynamo.debug_utils import same_two_models
 from torch._dynamo.testing import rand_strided, requires_static_shapes, same
@@ -1883,6 +1885,29 @@ def forward(self, inp):
         self.assertEqual(cnt.op_count, 5)
         self.assertEqual(cnt.frame_count, 1)
 
+    @requires_cuda()
+    def test_norm_dtype(self):
+        def foo(_stack0):
+            getitem = _stack0[(slice(None, None, None), -1)]
+            _stack0 = None
+            normalize = torch.nn.functional.normalize(getitem, p=2, dim=1)
+            getitem = None
+            return (normalize,)
+
+        args = [((2, 50, 256), (1, 256, 1), torch.float16, "cuda", False)]
+        args = [
+            rand_strided(sh, st, dt, dev).requires_grad_(rg)
+            for (sh, st, dt, dev, rg) in args
+        ]
+
+        opt_foo = torch._dynamo.optimize("aot_inductor_debug")(foo)
+        with torch.cuda.amp.autocast(enabled=True):
+            ref = foo(*args)[0]
+            res = foo(*args)[0]
+            self.assertEqual(ref.dtype, res.dtype)
+
+            self.assertTrue(same(res, ref))
+
     def test_for_loop_graph_break(self):
         def inner(x):
             return torch.sin(x)
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 47ecabcb3b4ab..57e068dcc159a 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1775,7 +1775,6 @@ def log_sigmoid_forward(self: Tensor) -> Tuple[Tensor, Tensor]:
 
 @register_decomposition(aten.norm)
 @out_wrapper()
-@reduction_complex_to_real
 def norm(
     self: Tensor,
     p: Optional[float] = None,
@@ -1783,9 +1782,16 @@ def norm(
     keepdim: bool = False,
     dtype: Optional[torch.dtype] = None,
 ):
-    if p is None:
-        p = 2.0
-    return torch.linalg.vector_norm(self, p, dim, keepdim, dtype=dtype)
+    p = p if p is not None else 2.0
+    if dtype:
+        return torch.linalg.vector_norm(self.to(dtype), p, dim, keepdim, dtype=dtype)
+
+    computation_dtype, result_dtype = utils.elementwise_dtypes(
+        self, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT
+    )
+    return torch.linalg.vector_norm(
+        self.to(computation_dtype), p, dim, keepdim, dtype=dtype
+    ).to(result_dtype)
 
 
 # aten/src/ATen/native/UpSample.cpp compute_output_size

From 93444aa773df644c4ae7919247381dc916f25545 Mon Sep 17 00:00:00 2001
From: Pearu Peterson <pearu.peterson@gmail.com>
Date: Wed, 23 Nov 2022 12:05:37 +0200
Subject: [PATCH 1229/1922] Sparse CSC/BSR/BSC serialization and pickle support
 (#89553)

Fixes https://github.com/pytorch/pytorch/issues/89497

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89553
Approved by: https://github.com/cpuhrsch
---
 test/test_serialization.py       | 41 ++++++++++++++++++++++++++------
 test/test_sparse_csr.py          | 16 +++++++++++++
 torch/_tensor.py                 | 36 ++++++++++++++++++----------
 torch/_utils.py                  | 40 +++++++++++++++++++++----------
 torch/_weights_only_unpickler.py |  1 -
 5 files changed, 101 insertions(+), 33 deletions(-)

diff --git a/test/test_serialization.py b/test/test_serialization.py
index dca926be60e70..b97c35c46762a 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -297,6 +297,9 @@ def _test_serialization(conversion):
                 self.assertEqual(x, y["tensor"])
         _test_serialization(lambda x: x.to_sparse())
         _test_serialization(lambda x: x.to_sparse_csr())
+        _test_serialization(lambda x: x.to_sparse_csc())
+        _test_serialization(lambda x: x.to_sparse_bsr(1, 1))
+        _test_serialization(lambda x: x.to_sparse_bsc(1, 1))
 
     def test_serialization_sparse(self):
         self._test_serialization(False)
@@ -333,36 +336,60 @@ def __reduce_ex__(self, proto):
                     "size is inconsistent with indices"):
                 y = torch.load(f)
 
-    def test_serialization_sparse_csr_invalid(self):
+    def _test_serialization_sparse_compressed_invalid(self,
+                                                      conversion,
+                                                      get_compressed_indices,
+                                                      get_plain_indices):
         x = torch.zeros(3, 3)
         x[1][1] = 1
-        x = x.to_sparse_csr()
+        x = conversion(x)
 
         class TensorSerializationSpoofer(object):
             def __init__(self, tensor):
                 self.tensor = tensor
 
             def __reduce_ex__(self, proto):
-                invalid_crow_indices = self.tensor.crow_indices().clone()
-                invalid_crow_indices[0] = 3
+                invalid_compressed_indices = get_compressed_indices(self.tensor).clone()
+                invalid_compressed_indices[0] = 3
                 return (
                     torch._utils._rebuild_sparse_tensor,
                     (
                         self.tensor.layout,
                         (
-                            invalid_crow_indices,
-                            self.tensor.col_indices(),
+                            invalid_compressed_indices,
+                            get_plain_indices(self.tensor),
                             self.tensor.values(),
                             self.tensor.size())))
 
+        if x.layout in {torch.sparse_csr, torch.sparse_bsr}:
+            compressed_indices_name = 'crow_indices'
+        else:
+            compressed_indices_name = 'ccol_indices'
+
         with tempfile.NamedTemporaryFile() as f:
             torch.save({"spoofed": TensorSerializationSpoofer(x)}, f)
             f.seek(0)
             with self.assertRaisesRegex(
                     RuntimeError,
-                    "rebuilding sparse tensor for layout torch.sparse_csr"):
+                    f"`{compressed_indices_name}[[]..., 0[]] == 0` is not satisfied."):
                 y = torch.load(f)
 
+    def test_serialization_sparse_csr_invalid(self):
+        self._test_serialization_sparse_compressed_invalid(
+            torch.Tensor.to_sparse_csr, torch.Tensor.crow_indices, torch.Tensor.col_indices)
+
+    def test_serialization_sparse_csc_invalid(self):
+        self._test_serialization_sparse_compressed_invalid(
+            torch.Tensor.to_sparse_csc, torch.Tensor.ccol_indices, torch.Tensor.row_indices)
+
+    def test_serialization_sparse_bsr_invalid(self):
+        self._test_serialization_sparse_compressed_invalid(
+            lambda x: x.to_sparse_bsr(1, 1), torch.Tensor.crow_indices, torch.Tensor.col_indices)
+
+    def test_serialization_sparse_bsc_invalid(self):
+        self._test_serialization_sparse_compressed_invalid(
+            lambda x: x.to_sparse_bsc(1, 1), torch.Tensor.ccol_indices, torch.Tensor.row_indices)
+
     def test_serialize_device(self):
         device_str = ['cpu', 'cpu:0', 'cuda', 'cuda:0']
         device_obj = [torch.device(d) for d in device_str]
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index 7ec2d4a79bf9f..6eee8a9d5b8bd 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -949,6 +949,22 @@ def test_to_dtype(self, layout, device, dtype):
                 dense_to_dtype = sparse.to_dense().to(to_dtype)
                 self.assertEqual(sparse_to_dtype.to_dense(), dense_to_dtype)
 
+    @skipMeta
+    @all_sparse_compressed_layouts()
+    @dtypes(torch.double)
+    def test_pickle(self, layout, dtype, device):
+        import pickle
+
+        input_gen = self._generate_small_inputs(layout)
+        for compressed_indices, plain_indices, values, size in input_gen:
+            sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size,
+                                                    dtype=dtype, device=device, layout=layout)
+            serialized = pickle.dumps(sparse)
+            sparse_loaded = pickle.loads(serialized)
+
+            self.assertEqual(sparse, sparse_loaded)
+
+
 def _npref_block_addmm_addmv(c, a, b, alpha, beta):
     return alpha * (a @ b) + beta * c
 
diff --git a/torch/_tensor.py b/torch/_tensor.py
index 6c610cbadb586..bf94639e2dbb0 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -332,22 +332,32 @@ def _reduce_ex_internal(self, proto):
                     "sparse tensor __reduce_ex__ for layout `%s`" % (self.layout)
                 )
             return (torch._utils._rebuild_sparse_tensor, args_sparse)
-        elif self.is_sparse_csr:
-            if self.layout == torch.sparse_csr:
-                args_sparse_csr = (
-                    self.layout,
-                    (
-                        self.crow_indices(),
-                        self.col_indices(),
-                        self.values(),
-                        self.size(),
-                    ),
+        elif self.layout in {
+            torch.sparse_csr,
+            torch.sparse_csc,
+            torch.sparse_bsr,
+            torch.sparse_bsc,
+        }:
+            if self.layout in {torch.sparse_csr, torch.sparse_bsr}:
+                compressed_indices, plain_indices = (
+                    self.crow_indices(),
+                    self.col_indices(),
                 )
             else:
-                raise NotImplementedError(
-                    "sparse csr tensor __reduce_ex__ for layout `%s`" % (self.layout)
+                compressed_indices, plain_indices = (
+                    self.ccol_indices(),
+                    self.row_indices(),
                 )
-            return (torch._utils._rebuild_sparse_csr_tensor, args_sparse_csr)
+            args_sparse_compressed = (
+                self.layout,
+                (
+                    compressed_indices,
+                    plain_indices,
+                    self.values(),
+                    self.size(),
+                ),
+            )
+            return (torch._utils._rebuild_sparse_tensor, args_sparse_compressed)
         elif (
             self.data_ptr() == 0
             and type(self) is not torch.Tensor
diff --git a/torch/_utils.py b/torch/_utils.py
index 9c646a2f85e0c..1bf3cf96ad1ce 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -193,15 +193,30 @@ def _rebuild_tensor_v2(
 def _validate_loaded_sparse_tensors():
     try:
         for t in _sparse_tensors_to_validate:
-            if t.is_sparse:
+            if t.layout is torch.sparse_coo:
                 torch._validate_sparse_coo_tensor_args(
                     t._indices(), t._values(), t.size()
                 )
-            elif t.is_sparse_csr:
+            elif t.layout in {
+                torch.sparse_csr,
+                torch.sparse_csc,
+                torch.sparse_bsr,
+                torch.sparse_bsc,
+            }:
                 # TODO: Validation currently involves an expensive traversal
                 # on CPU, which may include a device transfer.
-                torch._validate_sparse_csr_tensor_args(
-                    t.crow_indices(), t.col_indices(), t.values(), t.size()
+                if t.layout in {torch.sparse_csr, torch.sparse_bsr}:
+                    compressed_indices, plain_indices = (
+                        t.crow_indices(),
+                        t.col_indices(),
+                    )
+                else:
+                    compressed_indices, plain_indices = (
+                        t.ccol_indices(),
+                        t.row_indices(),
+                    )
+                torch._validate_sparse_compressed_tensor_args(
+                    compressed_indices, plain_indices, t.values(), t.size(), t.layout
                 )
             else:
                 raise NotImplementedError(
@@ -226,14 +241,15 @@ def _rebuild_sparse_tensor(layout, data):
         _sparse_tensors_to_validate.append(result)
         return result
 
-    raise NotImplementedError("rebuilding sparse tensor for layout %s" % (layout))
-
-
-def _rebuild_sparse_csr_tensor(layout, data):
-    if layout == torch.sparse_csr:
-        crow_indices, col_indices, values, size = data
-        result = torch._sparse_csr_tensor_unsafe(
-            crow_indices, col_indices, values, size
+    elif layout in {
+        torch.sparse_csr,
+        torch.sparse_csc,
+        torch.sparse_bsr,
+        torch.sparse_bsc,
+    }:
+        compressed_indices, plain_indices, values, size = data
+        result = torch._sparse_compressed_tensor_unsafe(
+            compressed_indices, plain_indices, values, size, layout=layout
         )
         _sparse_tensors_to_validate.append(result)
         return result
diff --git a/torch/_weights_only_unpickler.py b/torch/_weights_only_unpickler.py
index acc3554768b0b..30e10409184f7 100644
--- a/torch/_weights_only_unpickler.py
+++ b/torch/_weights_only_unpickler.py
@@ -100,7 +100,6 @@ def _get_allowed_globals():
         torch._utils._rebuild_tensor_v2,
         torch._utils._rebuild_sparse_tensor,
         torch._utils._rebuild_meta_tensor_no_storage,
-        torch._utils._rebuild_sparse_csr_tensor,
     ]:
         rc[f"torch._utils.{f.__name__}"] = f
 

From c5f94de12fc885a2ef8b39ed204cec32f31b1f51 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Tue, 22 Nov 2022 22:26:21 +0000
Subject: [PATCH 1230/1922] FFT: disable dimension wrapping for scalar tensors
 (#89234)

Fixes #88985

By default, `maybe_wrap_dim` allows through `dim=0` or `dim=-1`
for scalar tensors which leads to an invalid dimension being used to
index into `tensor.sizes()` as in the code sample from the issue.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89234
Approved by: https://github.com/mruberry
---
 aten/src/ATen/WrapDimUtils.h                  | 40 +++++++++++++++----
 aten/src/ATen/native/SpectralOps.cpp          | 12 +++---
 torch/_prims_common/__init__.py               | 15 +++----
 torch/_refs/fft.py                            |  8 ++--
 .../_internal/opinfo/definitions/fft.py       | 40 +++++++++++++++++++
 5 files changed, 90 insertions(+), 25 deletions(-)

diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h
index e942245703287..b0bc583b90c2e 100644
--- a/aten/src/ATen/WrapDimUtils.h
+++ b/aten/src/ATen/WrapDimUtils.h
@@ -38,14 +38,29 @@ inline int64_t maybe_wrap_dim(
   return maybe_wrap_dim(dim, tensor_sizes[0].size());
 }
 
-// wrap each dim in the dims array, taking dim_post_expr as the true number of
-// dimensions
+// Given an array of dimensions `dims` of length `ndims`, this function "Wraps"
+// each dim in-place for a tensor of rank `dim_post_expr`, allowing dims to be
+// specified using negative indices.
+//
+// Additionally, if `wrap_scalar` is true then scalar tensors with rank 0, will
+// allow dimensions in the range [-1, 0]. Otherwise, an IndexError is raised for
+// dimensions not in the range [-dim_post_expr, dim_post_expr).
 inline void maybe_wrap_dims_n(
     int64_t* dims,
     int64_t ndims,
-    int64_t dim_post_expr) {
+    int64_t dim_post_expr,
+    bool wrap_scalars = true) {
   if (dim_post_expr <= 0) {
-    dim_post_expr = 1; // this will make range [-1, 0]
+    if (wrap_scalars) {
+      dim_post_expr = 1; // this will make range [-1, 0]
+    } else {
+      TORCH_CHECK_INDEX(
+          ndims == 0,
+          "Dimension specified as ",
+          dims[0],
+          " but tensor has no dimensions");
+      return;
+    }
   }
   int64_t min = -dim_post_expr;
   int64_t max = dim_post_expr - 1;
@@ -67,11 +82,20 @@ inline void maybe_wrap_dims_n(
   }
 }
 
-// Wrap each dim in a contiguous container, taking dim_post_expr as the true
-// number of dimensions E.g. could also be std::array or c10::SmallVector
+// Given a contiguous container of dimensions `dims`, this function "Wraps"
+// each dim in-place for a tensor of rank `dim_post_expr`, allowing dims to be
+// specified using negative indices.
+//
+// Additionally, if `wrap_scalar` is true then scalar tensors with rank 0, will
+// allow dimensions in the range [-1, 0]. Otherwise, an IndexError is raised for
+// dimensions not in the range [-dim_post_expr, dim_post_expr).
 template <typename Container>
-inline void maybe_wrap_dims(Container& dims, int64_t dim_post_expr) {
-  return maybe_wrap_dims_n(dims.data(), dims.size(), dim_post_expr);
+inline void maybe_wrap_dims(
+    Container& dims,
+    int64_t dim_post_expr,
+    bool wrap_scalars = true) {
+  return maybe_wrap_dims_n(
+      dims.data(), dims.size(), dim_post_expr, wrap_scalars);
 }
 
 // previously, size [0] tensors were the only possible empty tensors; thus, it
diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp
index e08e17af4d087..124c2d06d9e83 100644
--- a/aten/src/ATen/native/SpectralOps.cpp
+++ b/aten/src/ATen/native/SpectralOps.cpp
@@ -199,7 +199,7 @@ Tensor fft_c2r(c10::string_view function_name,
               " expects a floating point output tensor, but got ", out.scalar_type());
   input = promote_tensor_fft(input, /*require_complex=*/true);
   const auto input_dim = input.dim();
-  const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim);
+  const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim, /*wrap_scalar=*/false);
   const auto n = n_opt.value_or(2*(input.sizes()[dim] - 1));
   TORCH_CHECK(n >= 1, "Invalid number of data points (", n, ") specified");
   if (n_opt) {
@@ -225,7 +225,7 @@ Tensor fft_r2c(c10::string_view function_name,
               " expects a complex output tensor, but got ", out.scalar_type());
   input = promote_tensor_fft(input);
   const auto input_dim = input.dim();
-  const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim);
+  const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim, /*wrap_scalar=*/false);
   const auto n = n_opt.value_or(input.sizes()[dim]);
   TORCH_CHECK(n >= 1, "Invalid number of data points (", n, ") specified");
   if (n_opt) {
@@ -257,7 +257,7 @@ Tensor fft_c2c(c10::string_view function_name,
   TORCH_CHECK(input.is_complex(), function_name,
               " expects a complex input tensor, but got ", input.scalar_type());
   const auto input_dim = input.dim();
-  const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim);
+  const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim, /*wrap_scalar=*/false);
   const auto n = n_opt.value_or(input.sizes()[dim]);
   TORCH_CHECK(n >= 1, "Invalid number of data points (", n, ") specified");
   if (n_opt) {
@@ -284,7 +284,7 @@ ShapeAndDims canonicalize_fft_shape_and_dim_args(
   if (dim) {
     ret.dim.resize(dim->size());
     std::copy(dim->begin(), dim->end(), ret.dim.begin());
-    maybe_wrap_dims(ret.dim, input_dim);
+    maybe_wrap_dims(ret.dim, input_dim, /*wrap_scalars=*/false);
 
     // Check dims are unique
     DimVector copy = ret.dim;
@@ -750,7 +750,7 @@ DimVector default_alldims(const Tensor& self, at::OptionalIntArrayRef dim_opt) {
     IntArrayRef dim_unwrapped = *dim_opt;
     dim.resize(dim_unwrapped.size());
     for (const auto i : c10::irange(dim.size())) {
-      dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim());
+      dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim(), /*wrap_scalars=*/false);
     }
   } else {
     dim.resize(self.dim());
@@ -1182,7 +1182,7 @@ void _fft_fill_with_conjugate_symmetry_(const Tensor& input, IntArrayRef dim_) {
   const auto input_strides = input.strides();
   TORCH_CHECK(dim_.size() > 0);
   DimVector dim(dim_.begin(), dim_.end());
-  at::maybe_wrap_dims(dim, input_strides.size());
+  at::maybe_wrap_dims(dim, input_strides.size(), /*wrap_scalars=*/false);
 
   if (input.numel() == 0 || input_sizes[dim.back()] <= 2) {
     return;  // No elements need writing
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 1fa4087d06f0e..08e24c4037749 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -473,8 +473,9 @@ def validate_exclusive_idx(rank: int, ex_idx: int):
 
 
 # "Wraps" a dim (up to one time) for the given rank, allowing dims to be
-# specified using negative indices. For scalar tensors with rank 0, then idx
-# must be in the range [-1, 0]. Otherwise, idx should be in the range [-rank, rank-1].
+# specified using negative indices. If `wrap_scalar` is true then scalar
+# tensors of rank 0 will allow dimensions in the range [-1, 0]. Otherwise,
+# idx should be in the range [-rank, rank-1].
 def canonicalize_dim(rank: int, idx: int, wrap_scalar: bool = True) -> int:
     if rank < 0:
         msg = f"Rank cannot be negative but got {rank}"
@@ -507,20 +508,20 @@ def canonicalize_dim(rank: int, idx: int, wrap_scalar: bool = True) -> int:
 # Takes a dimension or sequence of dimensions and "wraps" them,
 # mapping negative offsets to positive ones
 @overload
-def canonicalize_dims(rank: int, indices: Sequence[int]) -> Tuple[int, ...]:
+def canonicalize_dims(rank: int, indices: Sequence[int], wrap_scalar: bool = True) -> Tuple[int, ...]:
     pass
 
 
 @overload
-def canonicalize_dims(rank: int, indices: int) -> int:
+def canonicalize_dims(rank: int, indices: int, wrap_scalar: bool = True) -> int:
     pass
 
 
-def canonicalize_dims(rank, indices):
+def canonicalize_dims(rank, indices, wrap_scalar=True):
     if isinstance(indices, Dim):
-        return canonicalize_dim(rank, indices)
+        return canonicalize_dim(rank, indices, wrap_scalar)
 
-    return tuple(canonicalize_dim(rank, x) for x in indices)
+    return tuple(canonicalize_dim(rank, x, wrap_scalar) for x in indices)
 
 
 def is_valid_permutation(rank: int, perm: DimsSequenceType) -> bool:
diff --git a/torch/_refs/fft.py b/torch/_refs/fft.py
index 28df8dafc1fdf..738a33fde038b 100644
--- a/torch/_refs/fft.py
+++ b/torch/_refs/fft.py
@@ -115,7 +115,7 @@ def _fft_c2r(
 ) -> TensorLikeType:
     """Common code for performing any complex to real FFT (irfft or hfft)"""
     input = _maybe_promote_tensor_fft(input, require_complex=True)
-    dims = (utils.canonicalize_dim(input.ndim, dim),)
+    dims = (utils.canonicalize_dim(input.ndim, dim, wrap_scalar=False),)
     last_dim_size = n if n is not None else 2 * (input.shape[dim] - 1)
     check(last_dim_size >= 1, lambda: f"Invalid number of data points ({n}) specified")
 
@@ -144,7 +144,7 @@ def _fft_r2c(
         lambda: f"{func_name} expects a floating point input tensor, but got {input.dtype}",
     )
     input = _maybe_promote_tensor_fft(input)
-    dims = (utils.canonicalize_dim(input.ndim, dim),)
+    dims = (utils.canonicalize_dim(input.ndim, dim, wrap_scalar=False),)
 
     if n is not None:
         input = _resize_fft_input(input, dims, (n,))
@@ -167,7 +167,7 @@ def _fft_c2c(
         input.dtype.is_complex,
         lambda: f"{func_name} expects a complex input tensor, but got {input.dtype}",
     )
-    dims = (utils.canonicalize_dim(input.ndim, dim),)
+    dims = (utils.canonicalize_dim(input.ndim, dim, wrap_scalar=False),)
 
     if n is not None:
         input = _resize_fft_input(input, dims, (n,))
@@ -263,7 +263,7 @@ def _canonicalize_fft_shape_and_dim_args(
     if dim is not None:
         if not isinstance(dim, Sequence):
             dim = (dim,)
-        ret_dims = utils.canonicalize_dims(input_dim, dim)
+        ret_dims = utils.canonicalize_dims(input_dim, dim, wrap_scalar=False)
 
         # Check dims are unique
         check(len(set(dim)) == len(dim), lambda: "FFT dims must be unique")
diff --git a/torch/testing/_internal/opinfo/definitions/fft.py b/torch/testing/_internal/opinfo/definitions/fft.py
index 061718ec2b533..341e183319954 100644
--- a/torch/testing/_internal/opinfo/definitions/fft.py
+++ b/torch/testing/_internal/opinfo/definitions/fft.py
@@ -1,4 +1,5 @@
 import unittest
+from functools import partial
 from typing import List
 
 import numpy as np
@@ -15,6 +16,7 @@
 from torch.testing._internal.common_utils import TEST_SCIPY, TEST_WITH_ROCM
 from torch.testing._internal.opinfo.core import (
     DecorateInfo,
+    ErrorInput,
     OpInfo,
     SampleInput,
     SpectralFuncInfo,
@@ -65,6 +67,26 @@ def __init__(
         super().__init__(**ukwargs)
 
 
+def error_inputs_fft(op_info, device, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=torch.float32)
+    # Zero-dimensional tensor has no dimension to take FFT of
+    yield ErrorInput(
+        SampleInput(make_arg()),
+        error_type=IndexError,
+        error_regex="Dimension specified as -1 but tensor has no dimensions",
+    )
+
+
+def error_inputs_fftn(op_info, device, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=torch.float32)
+    # Specifying a dimension on a zero-dimensional tensor
+    yield ErrorInput(
+        SampleInput(make_arg(), dim=(0,)),
+        error_type=IndexError,
+        error_regex="Dimension specified as 0 but tensor has no dimensions",
+    )
+
+
 def sample_inputs_fftshift(op_info, device, dtype, requires_grad, **kwargs):
     def mt(shape, **kwargs):
         return make_tensor(
@@ -97,6 +119,7 @@ def mt(shape, **kwargs):
                 else (torch.half, torch.complex32)
             ),
         ),
+        error_inputs_func=error_inputs_fft,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -121,6 +144,7 @@ def mt(shape, **kwargs):
                 else (torch.half, torch.complex32)
             ),
         ),
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -146,6 +170,7 @@ def mt(shape, **kwargs):
                 else (torch.half, torch.complex32)
             ),
         ),
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -171,6 +196,7 @@ def mt(shape, **kwargs):
                 else (torch.half, torch.complex32)
             ),
         ),
+        error_inputs_func=error_inputs_fft,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -205,6 +231,7 @@ def mt(shape, **kwargs):
                 else (torch.half, torch.complex32)
             ),
         ),
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -245,6 +272,7 @@ def mt(shape, **kwargs):
                 else (torch.half, torch.complex32)
             ),
         ),
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -280,6 +308,7 @@ def mt(shape, **kwargs):
         dtypesIfCUDA=all_types_and(
             torch.bool, *(() if (TEST_WITH_ROCM or not SM53OrLater) else (torch.half,))
         ),
+        error_inputs_func=error_inputs_fft,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -300,6 +329,7 @@ def mt(shape, **kwargs):
         dtypesIfCUDA=all_types_and(
             torch.bool, *(() if (TEST_WITH_ROCM or not SM53OrLater) else (torch.half,))
         ),
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -322,6 +352,7 @@ def mt(shape, **kwargs):
         dtypesIfCUDA=all_types_and(
             torch.bool, *(() if (TEST_WITH_ROCM or not SM53OrLater) else (torch.half,))
         ),
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -338,6 +369,7 @@ def mt(shape, **kwargs):
         decomp_aten_name="_fft_c2c",
         ref=np.fft.ifft,
         ndimensional=SpectralFuncType.OneD,
+        error_inputs_func=error_inputs_fft,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -362,6 +394,7 @@ def mt(shape, **kwargs):
         decomp_aten_name="_fft_c2c",
         ref=np.fft.ifft2,
         ndimensional=SpectralFuncType.TwoD,
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -393,6 +426,7 @@ def mt(shape, **kwargs):
         decomp_aten_name="_fft_c2c",
         ref=np.fft.ifftn,
         ndimensional=SpectralFuncType.ND,
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -424,6 +458,7 @@ def mt(shape, **kwargs):
         decomp_aten_name="_fft_r2c",
         ref=np.fft.ihfft,
         ndimensional=SpectralFuncType.OneD,
+        error_inputs_func=error_inputs_fft,
         supports_forward_ad=True,
         supports_fwgrad_bwgrad=True,
         # See https://github.com/pytorch/pytorch/pull/78358
@@ -443,6 +478,7 @@ def mt(shape, **kwargs):
         decomp_aten_name="_fft_r2c",
         ref=scipy.fft.ihfftn if has_scipy_fft else None,
         ndimensional=SpectralFuncType.TwoD,
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -474,6 +510,7 @@ def mt(shape, **kwargs):
         decomp_aten_name="_fft_r2c",
         ref=scipy.fft.ihfftn if has_scipy_fft else None,
         ndimensional=SpectralFuncType.ND,
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -504,6 +541,7 @@ def mt(shape, **kwargs):
         decomp_aten_name="_fft_c2r",
         ref=np.fft.irfft,
         ndimensional=SpectralFuncType.OneD,
+        error_inputs_func=error_inputs_fft,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -529,6 +567,7 @@ def mt(shape, **kwargs):
         decomp_aten_name="_fft_c2r",
         ref=np.fft.irfft2,
         ndimensional=SpectralFuncType.TwoD,
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,
@@ -561,6 +600,7 @@ def mt(shape, **kwargs):
         decomp_aten_name="_fft_c2r",
         ref=np.fft.irfftn,
         ndimensional=SpectralFuncType.ND,
+        error_inputs_func=error_inputs_fftn,
         # https://github.com/pytorch/pytorch/issues/80411
         gradcheck_fast_mode=True,
         supports_forward_ad=True,

From 315d5cc74429b87cefb80e8a56211bf6659054b2 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Wed, 23 Nov 2022 19:02:51 +0000
Subject: [PATCH 1231/1922] Dont clone unmutated args in triton autotuning
 (#89519)

Improves first memory compression on pytorch struct from .55 -> .73. However, it doesn't totally eliminate the overhead from autotuning. Any other pointers on where the overhead is coming from in autotuning would be great.

Edit: i think it's just the triton cache clearing https://github.com/openai/triton/blob/44f577984d28ee979f704e2c28a1dcbac9639840/python/triton/testing.py#L159

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89519
Approved by: https://github.com/ngimel, https://github.com/jansel
---
 test/inductor/test_torchinductor.py    | 19 ++++++++++++++++
 torch/_inductor/codegen/triton.py      | 30 ++++++++++++++++++++++++--
 torch/_inductor/triton_ops/autotune.py | 28 +++++++++++++++++-------
 3 files changed, 67 insertions(+), 10 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 743655d07fd73..126fd86d5b81f 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5018,6 +5018,24 @@ def forward(self, input: torch.Tensor):
 
             self.assertTrue(torch.allclose(module(input), traced(input)))
 
+        @patch.object(config.triton, "autotune", True)
+        def test_inplace_add_alpha_autotune(self):
+            def fn(x, y):
+                aten.add_.Tensor(x, y, alpha=0.55)
+                return (x,)
+
+            x1 = torch.zeros(2, 3, 4, 10, device="cuda")
+            x2 = torch.zeros(2, 3, 4, 10, device="cuda")
+            x3 = torch.zeros(2, 3, 4, 10, device="cuda")
+            y = torch.randn(2, 3, 4, 10, device="cuda").to(
+                memory_format=torch.channels_last
+            )
+            fn_fx = make_fx(fn)(x1, y)
+            fn_compiled = compile_fx_inner(fn_fx, [x1, y])
+            fn(x2, y)
+            fn_compiled([x3, y])
+            assert same(x2, x3)
+
         def test_permute_linear_fusion(self):
             class TestModule(torch.nn.Module):
                 def __init__(self, k: int, n: int):
@@ -5327,6 +5345,7 @@ def decorator(fn):
                         meta=meta,
                         configs=configs,
                         save_cache_hook=False,
+                        mutated_arg_names=["in_out_ptr0"],
                     )
 
                 return decorator
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index e14b417c173f8..16cff0770a072 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -515,11 +515,18 @@ class TritonKernel(Kernel):
     overrides = TritonOverrides
     sexpr = texpr
 
-    def __init__(self, *groups, pid_cache=None, reduction_hint=ReductionHint.DEFAULT):
+    def __init__(
+        self,
+        *groups,
+        mutations=None,
+        pid_cache=None,
+        reduction_hint=ReductionHint.DEFAULT,
+    ):
         if pid_cache is None:
             pid_cache = {}
         super(TritonKernel, self).__init__()
         self.numels = [V.graph.sizevars.simplify(s) for s in groups]
+        self.mutations = mutations
         self.range_trees = []
         self.range_tree_nodes = {}
         self.iter_vars_count = itertools.count()
@@ -1017,10 +1024,21 @@ def codegen_kernel(self, name=None):
             )
 
         argdefs, _, signature = self.args.python_argdefs()
+
+        mutated_args = []
+        for mutation in self.mutations:
+            if mutation in self.args.input_buffers:
+                mutated_args.append(self.args.input_buffers[mutation])
+            if mutation in self.args.inplace_buffers:
+                mutated_args.append(self.args.inplace_buffers[mutation])
+            if mutation in self.args.output_buffers:
+                mutated_args.append(self.args.output_buffers[mutation])
+
         triton_meta = {
             "signature": dict(enumerate(map(signature_of, signature))),
             "device": V.graph.scheduler.current_device.index,
             "constants": {},
+            "mutated_arg_names": mutated_args,
         }
 
         for tree in self.range_trees:
@@ -1295,7 +1313,15 @@ def codegen_node_schedule(self, node_schedule, numel, reduction_numel):
                 reduction_hint_val = ReductionHint.DEFAULT
         else:
             reduction_hint_val = ReductionHint.DEFAULT
-        with TritonKernel(*tiled_groups, reduction_hint=reduction_hint_val) as kernel:
+
+        mutations = set()
+        for node in node_schedule:
+            if hasattr(node, "get_mutations"):
+                mutations.update(node.get_mutations())
+
+        with TritonKernel(
+            *tiled_groups, reduction_hint=reduction_hint_val, mutations=mutations
+        ) as kernel:
             stack = contextlib.ExitStack()
             for node in node_schedule:
                 if node not in (EnableReduction, DisableReduction):
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index 808241cd02a2f..285995c6254fa 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -42,11 +42,12 @@ class CachingAutotuner(KernelInterface):
     configs, and does not rely on the Triton JIT.
     """
 
-    def __init__(self, fn, meta, configs, save_cache_hook):
+    def __init__(self, fn, meta, configs, save_cache_hook, mutated_arg_names):
         super().__init__()
         self.fn = fn
         self.meta = meta
         self.save_cache_hook = save_cache_hook
+        self.mutated_arg_names = mutated_arg_names
         self.configs = configs
         self.launchers = []
         self.lock = threading.Lock()
@@ -141,12 +142,17 @@ def autotune_to_one_config(self, *args, **kwargs):
         """Do the actual autotuning"""
         from ..compile_fx import clone_preserve_strides
 
-        # clone the input args to avoid autotune contaminating them if
-        # the kernel does in-place stores
-        cloned_args = [
-            clone_preserve_strides(arg) if isinstance(arg, torch.Tensor) else arg
-            for arg in args
-        ]
+        # clone inplace buffers to avoid autotune contaminating them if
+        # the kernel does in-place stores. avoid cloning other buffers because
+        # it leads to increase memory use
+        cloned_args = []
+        for i, arg in enumerate(args):
+            if self.fn.arg_names[i] in self.mutated_arg_names:
+                assert isinstance(arg, torch.Tensor)
+                cloned_args.append(clone_preserve_strides(arg))
+            else:
+                cloned_args.append(arg)
+
         timings = {
             launcher: self.bench(launcher, *cloned_args, **kwargs)
             for launcher in self.launchers
@@ -251,9 +257,15 @@ def save_cache_hook(cfg):
     else:
         save_cache_hook = None
 
+    mutated_arg_names = meta.pop("mutated_arg_names", ())
+
     def decorator(fn):
         return CachingAutotuner(
-            fn, meta=meta, configs=configs, save_cache_hook=save_cache_hook
+            fn,
+            meta=meta,
+            configs=configs,
+            save_cache_hook=save_cache_hook,
+            mutated_arg_names=mutated_arg_names,
         )
 
     return decorator

From ac56490db09b0d98d04eea4b380e9989427a8993 Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Wed, 23 Nov 2022 22:39:36 +0000
Subject: [PATCH 1232/1922] Skip upload test stats for test reports from rerun
 disabled tests workflow (#89548)

I have found the reason why uploading tests stats fails for rerun disabled workflow, for example https://github.com/pytorch/pytorch/actions/runs/3522896778/jobs/5917765699.  The problem is that the pytest XML file is now too big to be processed quickly (x50 bigger). Unlike unittest, `pytest-flakefinder` used by rerun disabled tests for test_ops includes skipped messages multiple times (50 times by default, retrying and skipping).  This slows down the upload test stats script too much (O(n)) because it tries to gather all the stats. On the other hand, `check_disabled_tests` doesn't suffer from the same issue because it ignores all these skipped messages.

This is a quick fix to skip test reports from rerun disabled tests workflow when trying to upload test stats.

I'll try to fix this properly later in the way we use pytest-flakefinder. From what I see, a zipped test report from rerun disabled test is only few MB ([example](https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/3521687954/1/artifact/test-reports-test-default-1-2-linux.2xlarge_9636028803.zip)), but will balloon up to a much bigger XML file after extracting from a dozen to a few hundred MB (text).  The size of the zipped file is not a big immediate problem

### Testing

[3521687954](https://github.com/pytorch/pytorch/actions/runs/3521687954) is an example workflow with rerun disabled tests and mem leak check.  The script can now finish when running locally:

* `upload_test_stats` finishes around 3+ minutes
```
time python -m tools.stats.upload_test_stats --workflow-run-id 3521687954 --workflow-run-attempt 1 --head-branch master
...
Writing 8925 documents to S3
Done!
Writing 1760 documents to S3
Done!
Writing 1675249 documents to S3
Done!
python3 -m tools.stats.upload_test_stats --workflow-run-id 3521687954  1    185.69s user 12.89s system 75% cpu 4:22.82 total
```

* `check_disabled_tests` finishes within 3 minutes
```
time python -m tools.stats.check_disabled_tests --workflow-run-id 3521687954 --workflow-run-attempt 1 --repo pytorch/pytorch
...
python -m tools.stats.check_disabled_tests --workflow-run-id 3521687954  1    154.19s user 4.17s system 97% cpu 2:42.50 total
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89548
Approved by: https://github.com/clee2000
---
 tools/stats/check_disabled_tests.py | 15 +--------------
 tools/stats/upload_stats_lib.py     | 15 +++++++++++++++
 tools/stats/upload_test_stats.py    | 20 ++++++++++++++++++--
 3 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/tools/stats/check_disabled_tests.py b/tools/stats/check_disabled_tests.py
index 75c4f236ef216..636af668a13d3 100644
--- a/tools/stats/check_disabled_tests.py
+++ b/tools/stats/check_disabled_tests.py
@@ -9,29 +9,16 @@
 from tools.stats.upload_stats_lib import (
     download_gha_artifacts,
     download_s3_artifacts,
+    is_rerun_disabled_tests,
     unzip,
     upload_to_s3,
 )
 from tools.stats.upload_test_stats import process_xml_element
 
 TESTCASE_TAG = "testcase"
-TARGET_WORKFLOW = "--rerun-disabled-tests"
 SEPARATOR = ";"
 
 
-def is_rerun_disabled_tests(root: ET.ElementTree) -> bool:
-    """
-    Check if the test report is coming from rerun_disabled_tests workflow
-    """
-    skipped = root.find(".//*skipped")
-    # Need to check against None here, if not skipped doesn't work as expected
-    if skipped is None:
-        return False
-
-    message = skipped.attrib.get("message", "")
-    return TARGET_WORKFLOW in message or "num_red" in message
-
-
 def process_report(
     report: Path,
 ) -> Dict[str, Dict[str, int]]:
diff --git a/tools/stats/upload_stats_lib.py b/tools/stats/upload_stats_lib.py
index da7402fce276e..c91075225a628 100644
--- a/tools/stats/upload_stats_lib.py
+++ b/tools/stats/upload_stats_lib.py
@@ -2,6 +2,7 @@
 import io
 import json
 import os
+import xml.etree.ElementTree as ET
 import zipfile
 from pathlib import Path
 from typing import Any, Dict, List
@@ -12,6 +13,7 @@
 
 PYTORCH_REPO = "https://api.github.com/repos/pytorch/pytorch"
 S3_RESOURCE = boto3.resource("s3")
+TARGET_WORKFLOW = "--rerun-disabled-tests"
 
 
 def _get_request_headers() -> Dict[str, str]:
@@ -165,3 +167,16 @@ def unzip(p: Path) -> None:
 
     with zipfile.ZipFile(p, "r") as zip:
         zip.extractall(unzipped_dir)
+
+
+def is_rerun_disabled_tests(root: ET.ElementTree) -> bool:
+    """
+    Check if the test report is coming from rerun_disabled_tests workflow
+    """
+    skipped = root.find(".//*skipped")
+    # Need to check against None here, if not skipped doesn't work as expected
+    if skipped is None:
+        return False
+
+    message = skipped.attrib.get("message", "")
+    return TARGET_WORKFLOW in message or "num_red" in message
diff --git a/tools/stats/upload_test_stats.py b/tools/stats/upload_test_stats.py
index 01647264705bb..23695933c704b 100644
--- a/tools/stats/upload_test_stats.py
+++ b/tools/stats/upload_test_stats.py
@@ -9,6 +9,7 @@
 from tools.stats.upload_stats_lib import (
     download_gha_artifacts,
     download_s3_artifacts,
+    is_rerun_disabled_tests,
     unzip,
     upload_to_s3,
 )
@@ -35,9 +36,18 @@ def parse_xml_report(
     job_id = get_job_id(report)
     print(f"Found job id: {job_id}")
 
+    test_cases: List[Dict[str, Any]] = []
+
     root = ET.parse(report)
+    # TODO: unlike unittest, pytest-flakefinder used by rerun disabled tests for test_ops
+    # includes skipped messages multiple times (50 times by default). This slows down
+    # this script too much (O(n)) because it tries to gather all the stats. This should
+    # be fixed later in the way we use pytest-flakefinder. A zipped test report from rerun
+    # disabled test is only few MB, but will balloon up to a much bigger XML file after
+    # extracting from a dozen to few hundred MB
+    if is_rerun_disabled_tests(root):
+        return test_cases
 
-    test_cases = []
     for test_case in root.iter(tag):
         case = process_xml_element(test_case)
         case["workflow_id"] = workflow_id
@@ -118,10 +128,16 @@ def process_xml_element(element: ET.Element) -> Dict[str, Any]:
 
 
 def get_pytest_parallel_times() -> Dict[Any, Any]:
-    pytest_parallel_times = {}
+    pytest_parallel_times: Dict[Any, Any] = {}
     for report in Path(".").glob("**/python-pytest/**/*.xml"):
         invoking_file = report.parent.name
+
         root = ET.parse(report)
+        # TODO: Skip test reports from rerun disabled tests, same reason as mentioned
+        # above
+        if is_rerun_disabled_tests(root):
+            continue
+
         assert len(list(root.iter("testsuite"))) == 1
         for test_suite in root.iter("testsuite"):
             pytest_parallel_times[

From d84c6957f51c0c951aa66cf85c85743fba4c067f Mon Sep 17 00:00:00 2001
From: Everton Constantino <everton.constantino@linaro.org>
Date: Wed, 23 Nov 2022 22:46:29 +0000
Subject: [PATCH 1233/1922] Fix CheckOutputStreamSetting on JitLoggingTest as
 it failed if logging wasn't enabled. (#82722)

`JIT_LOG` checks if logging was enabled for that particular file and when it isn't it doesn't output anything. Since the test checks for the size of `test_stream` it fails. I believe forcing the file to have logging enabled to see if the stream is being correctly set during test makes no sense so this patches just forcibly outputs and checks if it worked.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/82722
Approved by: https://github.com/davidberard98
---
 test/cpp/jit/test_jit_logging_levels.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/test/cpp/jit/test_jit_logging_levels.cpp b/test/cpp/jit/test_jit_logging_levels.cpp
index ca2e8c5156e6d..6b92bf7d270ce 100644
--- a/test/cpp/jit/test_jit_logging_levels.cpp
+++ b/test/cpp/jit/test_jit_logging_levels.cpp
@@ -41,7 +41,15 @@ TEST(JitLoggingTest, CheckOutputStreamSetting) {
   ::torch::jit::set_jit_logging_levels("test_jit_logging_levels");
   std::ostringstream test_stream;
   ::torch::jit::set_jit_logging_output_stream(test_stream);
-  JIT_LOG(::torch::jit::JitLoggingLevels::GRAPH_DUMP, "Message");
+  /* Using JIT_LOG checks if this file has logging enabled with
+    is_enabled(__FILE__, level) making the test fail. since we are only testing
+    the OutputStreamSetting we can forcefully output to it directly.
+  */
+  ::torch::jit::get_jit_logging_output_stream() << ::torch::jit::jit_log_prefix(
+      ::torch::jit::JitLoggingLevels::GRAPH_DUMP,
+      __FILE__,
+      __LINE__,
+      ::c10::str("Message"));
   ASSERT_TRUE(test_stream.str().size() > 0);
 }
 

From 6e39c4d64bea7d169786ec8fb4af86eed5bcc574 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@meta.com>
Date: Wed, 23 Nov 2022 23:23:17 +0000
Subject: [PATCH 1234/1922] Rectify `native_batch_norm` schema by splitting it
 into two legit schemas (#88697)

Using the same repro from the issue (but with BatchNorm2D)

Rectifies native_batch_norm schema by splitting the schema into 2:
1. one will have NON-optional alias-able running_mean and running_var inputs
2. the other will just not have those parameters at all (no_stats variation)

**Calling for name suggestions!**

## test plan
I've added tests in test_functionalization.py as well as an entry in common_method_invocations.py for `native_batch_norm_legit`
CI should pass.

## next steps
Because of bc/fc reasons, we reroute native_batch_norm to call our new schemas ONLY through the python dispatcher, but in 2 weeks or so, we should make `native_batch_norm_legit` the official batch_norm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88697
Approved by: https://github.com/albanD
---
 .gitignore                                    |   1 +
 aten/src/ATen/functorch/BatchRulesNorm.cpp    |  18 ++
 aten/src/ATen/native/Normalization.cpp        |  24 +++
 aten/src/ATen/native/cuda/Normalization.cu    |  16 ++
 aten/src/ATen/native/mkldnn/Normalization.cpp |  36 ++++
 .../native/mps/operations/Normalization.mm    |  48 +++++
 aten/src/ATen/native/native_functions.yaml    |  29 +++
 functorch/_src/partitioners.py                |   2 +-
 test/functorch/test_ops.py                    |  10 +-
 test/functorch/test_vmap.py                   |   6 +
 test/inductor/test_torchinductor_opinfo.py    |   1 +
 test/lazy/test_reuse_ir.py                    |   4 +
 test/test_decomp.py                           |   6 +
 test/test_functionalization.py                | 203 +++++++++++++++++-
 test/test_jit_cuda_fuser.py                   |   2 +-
 test/test_meta.py                             |   6 +
 test/test_ops.py                              |   2 +-
 tools/autograd/derivatives.yaml               |   8 +
 torch/_decomp/decompositions.py               | 140 +++++++++++-
 torch/jit/_shape_functions.py                 |   2 +
 torch/overrides.py                            |   1 +
 .../_internal/common_methods_invocations.py   |  45 ++++
 22 files changed, 598 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5ca188577e16b..597ae390abe9c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,6 +46,7 @@ docs/source/generated/
 log
 usage_log.txt
 test-reports/
+test/*.bak
 test/.coverage
 test/.hypothesis/
 test/cpp/api/mnist
diff --git a/aten/src/ATen/functorch/BatchRulesNorm.cpp b/aten/src/ATen/functorch/BatchRulesNorm.cpp
index 5e6f85510163d..d53d4f6a2e972 100644
--- a/aten/src/ATen/functorch/BatchRulesNorm.cpp
+++ b/aten/src/ATen/functorch/BatchRulesNorm.cpp
@@ -875,10 +875,28 @@ std::tuple<at::Tensor,at::Tensor,at::Tensor> cudnn_batch_norm_backward_wrapper(
     return at::miopen_batch_norm_backward(input, grad_out, weight_opt, running_mean_opt, running_var_opt, save_mean_opt, save_rstd_opt, eps);
   }
 
+// NB: This is NOT good. In the ideal world, we do NOT want to convert the new legit op back into native_batch_norm
+// as native_batch_norm has a problematic schema--it promises it is functional when it is not. However, vmap doesn't
+// work with dynamo anyway so we gain some buffer room to do wrong things here. The (reasonable) hope is that we will
+// make native_batch_norm composite implicit within a few weeks and we can fix this before vmap works with dynamo.
+std::tuple<at::Tensor,at::Tensor,at::Tensor> _native_batch_norm_legit_batch(
+  const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt,
+  Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) {
+    return at::native_batch_norm(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps);
+}
+
+std::tuple<at::Tensor,at::Tensor,at::Tensor> _native_batch_norm_legit_no_stats_batch(
+  const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt,
+  bool train, double momentum, double eps) {
+    return at::native_batch_norm(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps);
+}
+
 TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   VMAP_SUPPORT(native_batch_norm, NATIVE_BATCH_NORM_BATCH_RULE(native_batch_norm));
   VMAP_SUPPORT(cudnn_batch_norm, CUDNN_BATCH_NORM_BATCH_RULE(cudnn_batch_norm));
   VMAP_SUPPORT(miopen_batch_norm, MIOPEN_BATCH_NORM_BATCH_RULE(miopen_batch_norm));
+  m.impl("_native_batch_norm_legit", _native_batch_norm_legit_batch);
+  m.impl("_native_batch_norm_legit.no_stats", _native_batch_norm_legit_no_stats_batch);
   m.impl("native_batch_norm_backward", NATIVE_BATCH_NORM_BACKWARD_BATCH_RULE(native_batch_norm_backward));
   m.impl("cudnn_batch_norm_backward", CUDNN_BATCH_NORM_BACKWARD_BATCH_RULE(at::functorch::cudnn_batch_norm_backward_wrapper));
   m.impl("miopen_batch_norm_backward", MIOPEN_BATCH_NORM_BACKWARD_BATCH_RULE(at::functorch::miopen_batch_norm_backward_wrapper));
diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp
index 69196a3cd8210..ab9094d9b5981 100644
--- a/aten/src/ATen/native/Normalization.cpp
+++ b/aten/src/ATen/native/Normalization.cpp
@@ -787,6 +787,30 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cpu(const Tensor& self, const c10:
   return batch_norm_cpu_out(self, weight_opt, bias_opt, running_mean_opt, running_var_opt, train, momentum, eps, output, save_mean, save_var);
 }
 
+
+std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_cpu(
+    const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt,
+    Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps) {
+  return batch_norm_cpu(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps);
+}
+
+std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_stats_cpu(
+    const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt,
+    bool train, double momentum, double eps) {
+  return batch_norm_cpu(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps);
+}
+
+
+std::tuple<Tensor&, Tensor&, Tensor&> _batch_norm_legit_cpu_out(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) {
+  return batch_norm_cpu_out(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps, out, save_mean, save_var);
+}
+
+
+std::tuple<Tensor&, Tensor&, Tensor&> _batch_norm_legit_no_stats_cpu_out(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, bool train, double momentum, double eps, Tensor& out, Tensor& save_mean, Tensor& save_var) {
+  return batch_norm_cpu_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps, out, save_mean, save_var);
+}
+
+
 std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& running_mean_opt, const c10::optional<Tensor>& running_var_opt, const c10::optional<Tensor>& save_mean_opt, const c10::optional<Tensor>& save_invstd_opt,
                                                            bool train, double eps, std::array<bool,3> grad_input_mask) {
   // See [Note: hacky wrapper removal for optional tensor]
diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu
index df460447464b2..a8eff154c3505 100644
--- a/aten/src/ATen/native/cuda/Normalization.cu
+++ b/aten/src/ATen/native/cuda/Normalization.cu
@@ -473,6 +473,22 @@ std::tuple<Tensor, Tensor, Tensor> batch_norm_cuda(const Tensor& self, const c10
   return std::make_tuple(output, save_mean, save_invstd);
 }
 
+std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_cuda(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon) {
+  return batch_norm_cuda(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon);
+}
+
+std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_stats_cuda(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, bool train, double momentum, double epsilon) {
+  return batch_norm_cuda(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon);
+}
+
+std::tuple<Tensor&, Tensor&, Tensor&> _batch_norm_legit_cuda_out(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) {
+  return batch_norm_cuda_out(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon, output, save_mean, save_invstd);
+}
+
+std::tuple<Tensor&, Tensor&, Tensor&> _batch_norm_legit_no_stats_cuda_out(const Tensor& self, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, bool train, double momentum, double epsilon, Tensor& output, Tensor& save_mean, Tensor& save_invstd) {
+  return batch_norm_cuda_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon, output, save_mean, save_invstd);
+}
+
 std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cuda(const Tensor& grad_out, const Tensor& input, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& running_mean_opt, const c10::optional<Tensor>& running_var_opt, const c10::optional<Tensor>& save_mean_opt, const c10::optional<Tensor>& save_invstd_opt, bool train, double epsilon, std::array<bool,3> grad_input_mask) {
   // See [Note: hacky wrapper removal for optional tensor]
   c10::MaybeOwned<Tensor> weight = at::borrow_from_optional_tensor(weight_opt);
diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp
index 1be6224a23c42..d0171865fac61 100644
--- a/aten/src/ATen/native/mkldnn/Normalization.cpp
+++ b/aten/src/ATen/native/mkldnn/Normalization.cpp
@@ -41,6 +41,23 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_layer_norm_last_index_weight_bias_f32(
   TORCH_CHECK(false, "mkldnn_layer_norm_last_index_weight_bias_f32: ATen not compiled with MKLDNN support");
 }
 
+std::tuple<Tensor, Tensor, Tensor> _mkldnn_batch_norm_legit(
+    const Tensor& input, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, Tensor& running_mean, Tensor& running_var,
+    bool train,
+    double momentum,
+    double eps) {
+  TORCH_CHECK(false, "_mkldnn_batch_norm_legit: ATen not compiled with MKLDNN support");
+}
+
+
+std::tuple<Tensor, Tensor, Tensor> _mkldnn_batch_norm_legit_no_stats(
+    const Tensor& input, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt,
+    bool train,
+    double momentum,
+    double eps) {
+  TORCH_CHECK(false, "_mkldnn_batch_norm_legit_no_stats: ATen not compiled with MKLDNN support");
+}
+
 } // namespace native
 } // namespace at
 
@@ -173,6 +190,25 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_batch_norm(
   }
 }
 
+
+std::tuple<Tensor, Tensor, Tensor> _mkldnn_batch_norm_legit(
+    const Tensor& input, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt, Tensor& running_mean, Tensor& running_var,
+    bool train,
+    double momentum,
+    double eps) {
+  return mkldnn_batch_norm(input, weight_opt, bias_opt, running_mean, running_var, train, momentum, eps);
+}
+
+
+std::tuple<Tensor, Tensor, Tensor> _mkldnn_batch_norm_legit_no_stats(
+    const Tensor& input, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& bias_opt,
+    bool train,
+    double momentum,
+    double eps) {
+  return mkldnn_batch_norm(input, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps);
+}
+
+
 std::tuple<Tensor, Tensor, Tensor> mkldnn_batch_norm_backward(const Tensor& grad_output,
     const Tensor& input, const c10::optional<Tensor>& weight_opt, const c10::optional<Tensor>& running_mean_opt, const c10::optional<Tensor>& running_var_opt, const c10::optional<Tensor>& save_mean_opt, const c10::optional<Tensor>& save_invstd_opt,
     bool train,
diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm
index 5384ee666fead..49f1e0538463f 100644
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@@ -411,6 +411,54 @@ Check if running mean exists (maybe do this check before making graph)
   return std::make_tuple(output, save_mean, save_var);
 }
 
+std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_mps
+                  (const Tensor& self,
+                   const c10::optional<Tensor>& weight_opt,
+                   const c10::optional<Tensor>& bias_opt,
+                   Tensor& running_mean,
+                   Tensor& running_var,
+                   bool train,
+                   double momentum,
+                   double epsilon) {
+
+  return batch_norm_mps(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon);
+}
+
+std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_stats_mps
+                  (const Tensor& self,
+                   const c10::optional<Tensor>& weight_opt,
+                   const c10::optional<Tensor>& bias_opt,
+                   bool train,
+                   double momentum,
+                   double epsilon) {
+
+  return batch_norm_mps(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon);
+}
+
+std::tuple<Tensor&, Tensor&, Tensor&> _batch_norm_legit_mps_out
+                   (const Tensor& self,
+                    const c10::optional<Tensor>& weight_opt,
+                    const c10::optional<Tensor>& bias_opt,
+                    Tensor& running_mean,
+                    Tensor& running_var,
+                    bool train, double momentum, double epsilon,
+                    Tensor& output,
+                    Tensor& save_mean,
+                    Tensor& save_var) {
+  return batch_norm_mps_out(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon, output, save_mean, save_var);
+}
+
+std::tuple<Tensor&, Tensor&, Tensor&> _batch_norm_legit_no_stats_mps_out
+                   (const Tensor& self,
+                    const c10::optional<Tensor>& weight_opt,
+                    const c10::optional<Tensor>& bias_opt,
+                    bool train, double momentum, double epsilon,
+                    Tensor& output,
+                    Tensor& save_mean,
+                    Tensor& save_var) {
+  return batch_norm_mps_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon, output, save_mean, save_var);
+}
+
 string get_mem_string(c10::MemoryFormat memory_format) {
   string mem_format_key;
   switch(memory_format) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 3917be0014b44..9aa3a2cceb941 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -3811,6 +3811,35 @@
     MPS: batch_norm_mps_out
     CPU: batch_norm_cpu_out
 
+# TODO: In 2 weeks, we should make native_batch_norm composite implicit so that this correct schema percolates correctly through our dispatching
+- func: _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: _batch_norm_legit_cpu
+    CUDA: _batch_norm_legit_cuda
+    MPS: _batch_norm_legit_mps
+    MkldnnCPU: _mkldnn_batch_norm_legit
+  autogen: _native_batch_norm_legit_functional
+
+- func: _native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!))
+  dispatch:
+    CPU: _batch_norm_legit_cpu_out
+    CUDA: _batch_norm_legit_cuda_out
+    MPS: _batch_norm_legit_mps_out
+
+- func: _native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  dispatch:
+    CPU: _batch_norm_legit_no_stats_cpu
+    CUDA: _batch_norm_legit_no_stats_cuda
+    MPS: _batch_norm_legit_no_stats_mps
+    MkldnnCPU: _mkldnn_batch_norm_legit_no_stats
+  tags: canonical
+
+- func: _native_batch_norm_legit.no_stats_out(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+  dispatch:
+    CPU: _batch_norm_legit_no_stats_cpu_out
+    CUDA: _batch_norm_legit_no_stats_cuda_out
+    MPS: _batch_norm_legit_no_stats_mps_out
+
 - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)
   dispatch:
     CUDA: batch_norm_stats_cuda
diff --git a/functorch/_src/partitioners.py b/functorch/_src/partitioners.py
index e12840f696b73..712c9a063eaf6 100644
--- a/functorch/_src/partitioners.py
+++ b/functorch/_src/partitioners.py
@@ -349,7 +349,7 @@ def is_tensor_node(x):
     recomputable_ops = set(recomputable_ops) if recomputable_ops is not None else set(default_recomputable_ops)
 
     random_ops = [aten.native_dropout, aten.rand_like, aten.randn_like]
-    compute_intensive_ops = [aten.mm, aten.convolution, aten.convolution_backward, aten.bmm, aten.addmm, aten.upsample_bilinear2d, aten._softmax, aten._softmax_backward_data, aten.native_layer_norm, aten.native_layer_norm_backward, aten.native_batch_norm, aten.native_batch_norm_backward]  # noqa: E501
+    compute_intensive_ops = [aten.mm, aten.convolution, aten.convolution_backward, aten.bmm, aten.addmm, aten.upsample_bilinear2d, aten._softmax, aten._softmax_backward_data, aten.native_layer_norm, aten.native_layer_norm_backward, aten.native_batch_norm, aten.native_batch_norm_backward, aten._native_batch_norm_legit]  # noqa: E501
 
     unrecomputable_ops = random_ops + compute_intensive_ops
 
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index e9451b596b4ac..c0ae683cdfbf7 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -400,7 +400,9 @@ def wrapped_fn(*args, **kwargs):
         skip('nn.functional.max_unpool1d'),  # fails everywhere except on mac
         skip('nn.functional.max_unpool2d'),  # fails everywhere except on windows
         skip('nn.functional.max_unpool3d'),  # fails everywhere except on mac
-        xfail("native_batch_norm"),
+        xfail("native_batch_norm"),          # TODO: fails comparing None to tensor of 0s for saved_mean/var tangents
+        xfail("_native_batch_norm_legit"),    # TODO: fails comparing None to tensor of 0s for saved_mean/var tangents
+
         xfail('nn.functional._scaled_dot_product_attention', device_type='cuda'),
 
         xfail('nn.functional.rrelu'),  # in-place test errors out with no formula implemented
@@ -689,6 +691,7 @@ def fn(inp, *args, **kwargs):
         # view doesn't work on sparse
         xfail("to_sparse"),
         xfail("native_batch_norm"),
+        xfail("_native_batch_norm_legit"),
     }))
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
@@ -773,6 +776,7 @@ def vjp_of_vjp(*args_and_cotangents):
         # All of the following are bugs and need to be fixed
         skip('linalg.svdvals'),  # # really annoying thing where it passes correctness check but not has_batch_rule
         skip("native_batch_norm"),
+        skip("_native_batch_norm_legit"),
         xfail('__getitem__', ''),  # dynamic error
         xfail('linalg.eig'),  # Uses aten::allclose
         xfail('nanquantile', device_type='cpu'),  # checks q via a .item() call
@@ -888,6 +892,7 @@ def test_vmapvjp(self, device, dtype, op):
         xfail('nn.functional.batch_norm'),
         xfail('nn.functional.batch_norm', 'without_cudnn'),
         xfail("native_batch_norm"),
+        xfail("_native_batch_norm_legit"),
         # ----------------------------------------------------------------------
     }
 
@@ -1090,6 +1095,7 @@ def test():
         xfail('segment_reduce', 'lengths'),
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
+        xfail("_native_batch_norm_legit"),
         xfail("native_dropout_backward"),
     }))
     def test_vmapvjp_has_batch_rule(self, device, dtype, op):
@@ -1162,6 +1168,7 @@ def test():
         xfail('as_strided_scatter', ''),
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
+        xfail("_native_batch_norm_legit"),
     }))
     def test_vjpvmap(self, device, dtype, op):
         # NB: there is no vjpvmap_has_batch_rule test because that is almost
@@ -1419,6 +1426,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         # input while the running_mean or running_var, which will be updated in
         # place, were not batched.
         xfail("native_batch_norm"),
+        xfail("_native_batch_norm_legit"),
         xfail('native_dropout_backward',)
     }))
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 9b3293a7db752..4b460560d8a90 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3294,7 +3294,10 @@ def test():
     ))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps('TestVmapOperatorsOpInfo', 'test_vmap_exhaustive', vmap_fail.union({
+        # RuntimeError: Batch norm got a batched tensor as input while the running_mean or running_var,
+        # which will be updated in place, were not batched.
         xfail('native_batch_norm'),
+        xfail('_native_batch_norm_legit'),
         xfail('tril'),  # Exception not raised on error input
         xfail('triu'),  # Exception not raised on error input
         # The error inputs are vectors, that pass when batched as they are treated as a matrix
@@ -3317,7 +3320,10 @@ def test_vmap_exhaustive(self, device, dtype, op):
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         xfail('complex'),
         xfail('copysign'),
+        # Batch norm got a batched tensor as input while the running_mean or running_var,
+        # which will be updated in place, were not batched.
         xfail('native_batch_norm'),
+        xfail('_native_batch_norm_legit'),
         xfail('histogram'),
         xfail('index_fill'),
         xfail('nansum'),
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 2a791075706e8..c9a9147830e66 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -135,6 +135,7 @@ def process(device_type):
     "jiterator_unary": {b8, f16, f32, f64, i32, i64},
     # flaky
     "native_batch_norm": {f16, f32, f64},
+    "_native_batch_norm_legit": {f16, f32, f64},
 }
 
 inductor_expected_failures_single_sample = defaultdict(dict)
diff --git a/test/lazy/test_reuse_ir.py b/test/lazy/test_reuse_ir.py
index 2d19fe1a5b539..f7024e9519cca 100644
--- a/test/lazy/test_reuse_ir.py
+++ b/test/lazy/test_reuse_ir.py
@@ -111,6 +111,7 @@ def testBatchNorm(self):
             # BatchNorm2d does extra checks on dimensions which SymInts don't support yet
             # so we call `torch.ops.aten.native_batch_norm` to bypass the checks.
             z, _, _ = torch.ops.aten.native_batch_norm(x, weight, bias, None, None, True, 0.1, 1e-5)
+            z_legit, _, _ = torch.ops.aten._native_batch_norm_legit(x, weight, bias, True, 0.1, 1e-5)
 
         device = "lazy"
         x_lazy = x.detach().clone().to(device=device)
@@ -118,12 +119,15 @@ def testBatchNorm(self):
         bias_lazy = bias.detach().clone().to(device=device)
         for i in range(10):
             z_lazy, _, _ = torch.ops.aten.native_batch_norm(x_lazy, weight_lazy, bias_lazy, None, None, True, 0.1, 1e-5)
+            z_legit_lazy, _, _ = torch.ops.aten._native_batch_norm_legit(x_lazy, weight_lazy, bias_lazy, True, 0.1, 1e-5)
             torch._lazy.mark_step()
 
         torch.testing.assert_close(z.cpu(), z_lazy.cpu())
+        torch.testing.assert_close(z_legit.cpu(), z_legit_lazy.cpu())
         assert metrics.counter_value("IrNodeReused_torch::lazy::NativeBatchNorm") >= 7
         metrics.reset()
         torch._lazy.ir_cache.reset()
 
+
 if __name__ == '__main__':
     run_tests()
diff --git a/test/test_decomp.py b/test/test_decomp.py
index d69d72753e470..73f8c7a126ea9 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -159,6 +159,10 @@ def op_assert_ref(test_case, op, test_dtype, i, orig, decomp, ref, args, kwargs)
         (torch.bfloat16, torch.ops.aten.native_layer_norm_backward.default): 2e-2,
         (torch.bfloat16, torch.ops.aten.native_batch_norm.default): 1e-5,
         (torch.float16, torch.ops.aten.native_batch_norm.default): 1e-5,
+        (torch.bfloat16, torch.ops.aten._native_batch_norm_legit.default): 1e-5,
+        (torch.bfloat16, torch.ops.aten._native_batch_norm_legit.no_stats): 1e-5,
+        (torch.float16, torch.ops.aten._native_batch_norm_legit.default): 1e-5,
+        (torch.float16, torch.ops.aten._native_batch_norm_legit.no_stats): 1e-5,
         (torch.bfloat16, torch.ops.aten.linalg_vector_norm.default): 1e-5,
         (torch.float16, torch.ops.aten.linalg_vector_norm.default): 1e-5,
         (torch.float16, torch.ops.aten.nll_loss_forward.default): 1e-2,
@@ -306,6 +310,8 @@ def normalize_op_input_output(f, sample, requires_grad=True):
     # _softmax_backward_data's CPU kernel for bfloat16 always return the grad_input as float32
     ("cpu", torch.bfloat16, "_softmax_backward_data"),
     (None, None, "norm"),
+    # native_batch_norm is only implicit when python dispatcher is on (and noncomposite otherwise)
+    (None, None, "native_batch_norm"),
 }
 
 CROSS_REF_BACKWARD_EXCLUDE_SET = {
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index 0731cae285b03..d699c03ed4173 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -3,14 +3,14 @@
 import torch
 from contextlib import nullcontext
 from torch.testing._internal.common_utils import (
-    TestCase, run_tests, skipIfTorchDynamo, TEST_WITH_TORCHDYNAMO,
+    TestCase, run_tests, skipIfTorchDynamo, TEST_WITH_TORCHDYNAMO, IS_WINDOWS,
     xfail_inherited_tests
 )
 from torch.testing._internal.logging_tensor import LoggingTensor, capture_logs
 from torch.utils._pytree import tree_map
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.passes.reinplace import reinplace
-from torch._dispatch.python import enable_crossref_functionalize
+from torch._dispatch.python import enable_crossref_functionalize, enable_python_dispatcher
 
 import unittest
 
@@ -1228,6 +1228,205 @@ def forward(self, a_1):
     return zeros
     """)
 
+
+    def test_instance_norm(self):
+        def f(x):
+            with enable_python_dispatcher():
+                return torch.instance_norm(x, None, None, running_mean=torch.zeros(100), running_var=torch.ones(100),
+                                           use_input_stats=True, momentum=0.1, eps=1e-5, cudnn_enabled=False)
+        self.assert_functionalization(f, torch.randn(20, 100, 35, 45))
+        # On Windows, for instance_norm, the alias_copy's are reordered to come right before they need to be used
+        # whereas on other platforms, the alias_copy's are before the view_copy's.
+        # e.g., the alias_copy after the getitem_4 assignment would be moved to be right before the copy assignment.
+        if not IS_WINDOWS:
+            logs = self.get_logs(f, torch.randn(20, 100, 35, 45))
+            self.assertExpectedInline(logs, """\
+
+
+
+def forward(self, a_1):
+    zeros = torch.ops.aten.zeros.default([100], device = device(type='cpu'), pin_memory = False)
+    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
+    repeat = torch.ops.aten.repeat.default(zeros, [20])
+    repeat_1 = torch.ops.aten.repeat.default(ones, [20])
+    view_copy = torch.ops.aten.view_copy.default(a_1, [1, 2000, 35, 45]);  a_1 = None
+    empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view_copy, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view_copy = repeat = repeat_1 = None
+    getitem = _native_batch_norm_legit_functional[0]
+    getitem_1 = _native_batch_norm_legit_functional[1]
+    getitem_2 = _native_batch_norm_legit_functional[2]
+    getitem_3 = _native_batch_norm_legit_functional[3]
+    getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
+    alias_copy = torch.ops.aten.alias_copy.default(zeros);  zeros = None
+    view_copy_1 = torch.ops.aten.view_copy.default(getitem_3, [20, 100])
+    view_copy_2 = torch.ops.aten.view_copy.default(getitem_3, [20, 100]);  getitem_3 = None
+    mean = torch.ops.aten.mean.dim(view_copy_2, [0]);  view_copy_2 = None
+    copy = torch.ops.aten.copy.default(alias_copy, mean);  alias_copy = mean = None
+    alias_copy_1 = torch.ops.aten.alias_copy.default(ones);  ones = None
+    view_copy_3 = torch.ops.aten.view_copy.default(getitem_4, [20, 100])
+    view_copy_4 = torch.ops.aten.view_copy.default(getitem_4, [20, 100]);  getitem_4 = None
+    mean_1 = torch.ops.aten.mean.dim(view_copy_4, [0]);  view_copy_4 = None
+    copy_1 = torch.ops.aten.copy.default(alias_copy_1, mean_1);  alias_copy_1 = mean_1 = None
+    view_copy_5 = torch.ops.aten.view_copy.default(getitem, [20, 100, 35, 45]);  getitem = None
+    return view_copy_5
+    """)  # noqa: B950
+
+            reinplaced_logs = self.get_logs(f, torch.randn(20, 100, 35, 45), reapply_views=True, run_reinplace=True)
+            self.assertExpectedInline(reinplaced_logs, """\
+
+
+
+def forward(self, a_1):
+    zeros = torch.ops.aten.zeros.default([100], device = device(type='cpu'), pin_memory = False)
+    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
+    repeat = torch.ops.aten.repeat.default(zeros, [20])
+    repeat_1 = torch.ops.aten.repeat.default(ones, [20])
+    view = torch.ops.aten.view.default(a_1, [1, 2000, 35, 45]);  a_1 = None
+    empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view = repeat = repeat_1 = None
+    getitem = _native_batch_norm_legit_functional[0]
+    getitem_1 = _native_batch_norm_legit_functional[1]
+    getitem_2 = _native_batch_norm_legit_functional[2]
+    getitem_3 = _native_batch_norm_legit_functional[3]
+    getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
+    alias = torch.ops.aten.alias.default(zeros);  zeros = None
+    view_1 = torch.ops.aten.view.default(getitem_3, [20, 100])
+    view_2 = torch.ops.aten.view.default(getitem_3, [20, 100]);  getitem_3 = None
+    mean = torch.ops.aten.mean.dim(view_2, [0]);  view_2 = None
+    copy = torch.ops.aten.copy_.default(alias, mean);  alias = mean = None
+    alias_1 = torch.ops.aten.alias.default(ones);  ones = None
+    view_3 = torch.ops.aten.view.default(getitem_4, [20, 100])
+    view_4 = torch.ops.aten.view.default(getitem_4, [20, 100]);  getitem_4 = None
+    mean_1 = torch.ops.aten.mean.dim(view_4, [0]);  view_4 = None
+    copy_1 = torch.ops.aten.copy_.default(alias_1, mean_1);  alias_1 = mean_1 = None
+    view_5 = torch.ops.aten.view.default(getitem, [20, 100, 35, 45]);  getitem = None
+    return view_5
+    """)  # noqa: B950
+
+
+    def test_instance_norm_running_mean_is_x(self):
+        def f(x):
+            with enable_python_dispatcher():
+                return torch.instance_norm(torch.randn(20, 100, 35, 45), None, None, running_mean=x, running_var=torch.ones(100),
+                                           use_input_stats=True, momentum=0.1, eps=1e-5, cudnn_enabled=False)
+        # TODO: uncomment following line after functionalization can handle input mutations
+        # self.assert_functionalization(f, torch.zeros(100))
+        logs = self.get_logs(f, torch.zeros(100))
+        # On Windows, for instance_norm, the alias_copy's are reordered to come right before they need to be used
+        # whereas on other platforms, the alias_copy's are before the view_copy's.
+        # e.g., the alias_copy after the getitem_4 assignment would be moved to be right before the copy assignment.
+        if not IS_WINDOWS:
+            self.assertExpectedInline(logs, """\
+
+
+
+def forward(self, a_1):
+    randn = torch.ops.aten.randn.default([20, 100, 35, 45], device = device(type='cpu'), pin_memory = False)
+    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
+    repeat = torch.ops.aten.repeat.default(a_1, [20])
+    repeat_1 = torch.ops.aten.repeat.default(ones, [20])
+    view_copy = torch.ops.aten.view_copy.default(randn, [1, 2000, 35, 45]);  randn = None
+    empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view_copy, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view_copy = repeat = repeat_1 = None
+    getitem = _native_batch_norm_legit_functional[0]
+    getitem_1 = _native_batch_norm_legit_functional[1]
+    getitem_2 = _native_batch_norm_legit_functional[2]
+    getitem_3 = _native_batch_norm_legit_functional[3]
+    getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
+    alias_copy = torch.ops.aten.alias_copy.default(a_1)
+    view_copy_1 = torch.ops.aten.view_copy.default(getitem_3, [20, 100])
+    view_copy_2 = torch.ops.aten.view_copy.default(getitem_3, [20, 100]);  getitem_3 = None
+    mean = torch.ops.aten.mean.dim(view_copy_2, [0]);  view_copy_2 = None
+    copy = torch.ops.aten.copy.default(alias_copy, mean);  alias_copy = mean = None
+    alias_copy_1 = torch.ops.aten.alias_copy.default(ones);  ones = None
+    view_copy_3 = torch.ops.aten.view_copy.default(getitem_4, [20, 100])
+    view_copy_4 = torch.ops.aten.view_copy.default(getitem_4, [20, 100]);  getitem_4 = None
+    mean_1 = torch.ops.aten.mean.dim(view_copy_4, [0]);  view_copy_4 = None
+    copy_1 = torch.ops.aten.copy.default(alias_copy_1, mean_1);  alias_copy_1 = mean_1 = None
+    view_copy_5 = torch.ops.aten.view_copy.default(getitem, [20, 100, 35, 45]);  getitem = None
+    alias_copy_2 = torch.ops.aten.alias_copy.default(copy);  copy = None
+    copy_ = torch.ops.aten.copy_.default(a_1, alias_copy_2);  a_1 = alias_copy_2 = None
+    return view_copy_5
+    """)  # noqa: B950
+
+            reinplaced_logs = self.get_logs(f, torch.zeros(100), reapply_views=True, run_reinplace=True)
+            self.assertExpectedInline(reinplaced_logs, """\
+
+
+
+def forward(self, a_1):
+    randn = torch.ops.aten.randn.default([20, 100, 35, 45], device = device(type='cpu'), pin_memory = False)
+    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
+    repeat = torch.ops.aten.repeat.default(a_1, [20])
+    repeat_1 = torch.ops.aten.repeat.default(ones, [20])
+    view = torch.ops.aten.view.default(randn, [1, 2000, 35, 45]);  randn = None
+    empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view = repeat = repeat_1 = None
+    getitem = _native_batch_norm_legit_functional[0]
+    getitem_1 = _native_batch_norm_legit_functional[1]
+    getitem_2 = _native_batch_norm_legit_functional[2]
+    getitem_3 = _native_batch_norm_legit_functional[3]
+    getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
+    alias = torch.ops.aten.alias.default(a_1)
+    view_1 = torch.ops.aten.view.default(getitem_3, [20, 100])
+    view_2 = torch.ops.aten.view.default(getitem_3, [20, 100]);  getitem_3 = None
+    mean = torch.ops.aten.mean.dim(view_2, [0]);  view_2 = None
+    copy = torch.ops.aten.copy.default(alias, mean);  alias = mean = None
+    alias_1 = torch.ops.aten.alias.default(ones);  ones = None
+    view_3 = torch.ops.aten.view.default(getitem_4, [20, 100])
+    view_4 = torch.ops.aten.view.default(getitem_4, [20, 100]);  getitem_4 = None
+    mean_1 = torch.ops.aten.mean.dim(view_4, [0]);  view_4 = None
+    copy_1 = torch.ops.aten.copy_.default(alias_1, mean_1);  alias_1 = mean_1 = None
+    view_5 = torch.ops.aten.view.default(getitem, [20, 100, 35, 45]);  getitem = None
+    alias_2 = torch.ops.aten.alias.default(copy);  copy = None
+    copy_ = torch.ops.aten.copy_.default(a_1, alias_2);  a_1 = alias_2 = None
+    return view_5
+    """)  # noqa: B950
+
+
+    def test_batch_norm(self):
+        def f(x):
+            with enable_python_dispatcher():
+                return torch.batch_norm(x, None, None, torch.zeros(100), torch.ones(100), False, 0.1, 1e-5, False)
+
+        self.assert_functionalization(f, torch.randn(20, 100, 35, 45))
+        logs = self.get_logs(f, torch.randn(20, 100, 35, 45))
+        self.assertExpectedInline(logs, """\
+
+
+
+def forward(self, a_1):
+    zeros = torch.ops.aten.zeros.default([100], device = device(type='cpu'), pin_memory = False)
+    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
+    empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(a_1, None, None, zeros, ones, False, 0.1, 1e-05);  a_1 = zeros = ones = None
+    getitem = _native_batch_norm_legit_functional[0]
+    getitem_1 = _native_batch_norm_legit_functional[1]
+    getitem_2 = _native_batch_norm_legit_functional[2]
+    getitem_3 = _native_batch_norm_legit_functional[3]
+    getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
+    return getitem
+    """)  # noqa: B950
+
+        reinplaced_logs = self.get_logs(f, torch.randn(20, 100, 35, 45), reapply_views=True, run_reinplace=True)
+        self.assertExpectedInline(reinplaced_logs, """\
+
+
+
+def forward(self, a_1):
+    zeros = torch.ops.aten.zeros.default([100], device = device(type='cpu'), pin_memory = False)
+    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
+    empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(a_1, None, None, zeros, ones, False, 0.1, 1e-05);  a_1 = zeros = ones = None
+    getitem = _native_batch_norm_legit_functional[0]
+    getitem_1 = _native_batch_norm_legit_functional[1]
+    getitem_2 = _native_batch_norm_legit_functional[2]
+    getitem_3 = _native_batch_norm_legit_functional[3]
+    getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
+    return getitem
+    """)  # noqa: B950
+
+
 @xfail_inherited_tests([
     "test_as_strided",
     "test_copy_",
diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
index 7de6f17716323..0a13fdb20a823 100644
--- a/test/test_jit_cuda_fuser.py
+++ b/test/test_jit_cuda_fuser.py
@@ -143,7 +143,7 @@ def setUp(self):
         disabled_ops = ("aten::batch_norm",
                         "aten::_batch_norm_impl_index",
                         "aten::_batch_norm_impl_index_backward",
-                        "aten::native_batch_norm_backward")
+                        "aten::native_batch_norm_backward",)
         for op in disabled_ops:
             disabled_flag = torch._C._jit_set_nvfuser_skip_node_kind(op, False)
             if disabled_flag:
diff --git a/test/test_meta.py b/test/test_meta.py
index 0e3cfb6ef1404..af81d14e37d5d 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -710,6 +710,7 @@ def run_meta_crossref(
 
 meta_function_device_expected_failures['cpu'] = {
     torch.native_batch_norm: {bf16},
+    torch._native_batch_norm_legit: {bf16},
     torch.native_layer_norm: {bf16},
 }
 
@@ -744,6 +745,7 @@ def run_meta_crossref(
 
 meta_function_device_skips['cpu'] = {
     torch.native_batch_norm: {f32, f64},
+    torch._native_batch_norm_legit: {f32, f64},
 }
 
 meta_function_device_skips['cuda'] = {
@@ -927,6 +929,8 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
 
 meta_dispatch_device_expected_failures['cpu'] = {
     aten.native_batch_norm.default: {bf16},
+    aten._native_batch_norm_legit.default: {bf16},
+    aten._native_batch_norm_legit.no_stats: {bf16},
     aten.native_layer_norm.default: {bf16},
 }
 
@@ -972,6 +976,8 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
 meta_dispatch_device_skips['cpu'] = {
     aten._embedding_bag_forward_only.default: {f16, f32, f64},
     aten.native_batch_norm.default: {f32, f64},
+    aten._native_batch_norm_legit.default: {f32, f64},
+    aten._native_batch_norm_legit.no_stats: {f32, f64},
 }
 
 meta_dispatch_device_skips['cuda'] = {
diff --git a/test/test_ops.py b/test/test_ops.py
index 7e0a9952389ce..62d44030cbff0 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -1574,7 +1574,7 @@ def is_bit_set(x):
 def check_inplace_view(func, input, rs, input_size, input_strides):
     if func is None:
         return
-    # TODO: extend this test to test ops with multiple outputs and ops like native_batch_norm.out
+    # TODO: extend this test to test ops with multiple outputs and ops like native_batch_norm(_legit).out
     # which mutate not necessarily the first input.
     if isinstance(rs, torch.Tensor) and rs is input:
         unequal_size = rs.size() != input_size
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 6e1b456316826..3500bd28ca7d0 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1144,6 +1144,14 @@
   input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
   result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, eps)
 
+- name: _native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+  result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, eps)
+
+- name: _native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+  input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, Tensor(), Tensor(), result1, result2, training, eps, grad_input_mask) : std::tuple<Tensor, Tensor, Tensor>()"
+  result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, Tensor(), Tensor(), result1, result2, training, eps)
+
 - name: native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   input, weight, grad_out: batchnorm_double_backward(input, weight, grads[0], grads[1], grads[2], grad_out, running_mean, running_var, train, eps, save_mean, save_invstd, grad_input_mask)
   save_mean: not_implemented("native_batch_norm_backward save_mean")
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 57e068dcc159a..b9c9225871362 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1312,8 +1312,7 @@ def native_layer_norm_backward(
     )
 
 
-@register_decomposition(aten.native_batch_norm)
-def native_batch_norm(
+def native_batch_norm_helper(
     input: Tensor,
     weight: Optional[Tensor],
     bias: Optional[Tensor],
@@ -1322,16 +1321,21 @@ def native_batch_norm(
     training: bool,
     momentum: float,
     eps: float,
-) -> Tuple[Tensor, Tensor, Tensor]:
+    functional: bool,
+) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
     reduction_dims = [0] + list(range(2, input.dim()))
     computation_dtype = utils.get_computation_dtype(input.dtype)
+    new_running_mean = running_mean
+    new_running_var = running_var
     if training:
         output, mean, rstd = normalize(input, reduction_dims, eps)
 
         save_mean = _squeeze_multiple(mean, reduction_dims)
         save_rstd = _squeeze_multiple(rstd, reduction_dims)
         if running_mean is not None:
-            running_mean.copy_(momentum * save_mean + (1 - momentum) * running_mean)
+            new_running_mean = momentum * save_mean + (1 - momentum) * running_mean
+            if not functional:
+                running_mean.copy_(new_running_mean)
         if running_var is not None:
             n = input.numel() / input.shape[1]
             # This doesn't strictly match eager's numerics, which accumulates var sum and then directly applies the correction
@@ -1340,11 +1344,15 @@ def native_batch_norm(
             unbiased_var = torch.var(input, reduction_dims, unbiased=False) * (
                 n / (n - 1)
             )
-            running_var.copy_(momentum * unbiased_var + (1 - momentum) * running_var)
+            new_running_var = momentum * unbiased_var + (1 - momentum) * running_var
+            if not functional:
+                running_var.copy_(new_running_var)
     else:
         assert running_mean is not None and running_var is not None
         running_mean = running_mean.to(dtype=computation_dtype, copy=True)
+        new_running_mean = running_mean
         running_var = running_var.to(dtype=computation_dtype, copy=True)
+        new_running_var = running_var
         mean = running_mean
         invstd = 1 / (torch.sqrt(running_var + eps))
         # Very annoying inconsistency where CPU and CUDA give different shapes
@@ -1370,7 +1378,127 @@ def native_batch_norm(
     if input.device.type == "cpu":
         save_mean = save_mean.to(dtype=input.dtype)
         save_rstd = save_rstd.to(dtype=input.dtype)
-    return output.to(dtype=input.dtype), save_mean, save_rstd
+    return (
+        output.to(dtype=input.dtype),
+        save_mean,
+        save_rstd,
+        new_running_mean,
+        new_running_var,
+    )
+
+
+@register_decomposition(aten.native_batch_norm)
+def native_batch_norm(
+    input: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    running_mean: Optional[Tensor],
+    running_var: Optional[Tensor],
+    training: bool,
+    momentum: float,
+    eps: float,
+) -> Tuple[Tensor, Tensor, Tensor]:
+    output, save_mean, save_rstd, _, _ = native_batch_norm_helper(
+        input, weight, bias, running_mean, running_var, training, momentum, eps, False
+    )
+    return output, save_mean, save_rstd
+
+
+# TODO: this decomposition is NOT here to stay. We would much prefer replacing native_batch_norm
+# with our new correctly schema'd _native_batch_norm_legit and its variants, but
+# we cannot do that immediately in the C++ because it would be forwards incompatible
+# with some mobile use cases.
+#
+# Since this change is most impactful for aot autograd/functionalization, we simply
+# register this decomposition on the Autograd key for the python dispatcher (which is
+# currently only used by aot autograd/functionalization and no one else, really).
+# In two weeks or so, we should remove this decomposition and phase out the current native_batch_norm
+# to be _native_batch_norm_legit and have the right schema (stating that there are input mutations).
+@torch.ops.aten.native_batch_norm.default.py_impl(DispatchKey.Autograd)
+def native_batch_norm_decomposition(
+    input: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    running_mean: Optional[Tensor],
+    running_var: Optional[Tensor],
+    training: bool,
+    momentum: float,
+    eps: float,
+) -> Tuple[Tensor, Tensor, Tensor]:
+    if running_mean is None and running_var is None:
+        return aten._native_batch_norm_legit(
+            input, weight, bias, training, momentum, eps
+        )
+    if running_mean is None:
+        raise RuntimeError(
+            "running_mean is None, but running_var is provided. "
+            "They should both be None or both be provided."
+        )
+    if running_var is None:
+        raise RuntimeError(
+            "running_var is None, but running_mean is provided. "
+            "They should both be None or both be provided."
+        )
+    return aten._native_batch_norm_legit(
+        input, weight, bias, running_mean, running_var, training, momentum, eps
+    )
+
+
+@register_decomposition(aten._native_batch_norm_legit.default)
+def _native_batch_norm_legit(
+    input: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    running_mean: Tensor,
+    running_var: Tensor,
+    training: bool,
+    momentum: float,
+    eps: float,
+) -> Tuple[Tensor, Tensor, Tensor]:
+    output, save_mean, save_rstd, _, _ = native_batch_norm_helper(
+        input, weight, bias, running_mean, running_var, training, momentum, eps, False
+    )
+    return output, save_mean, save_rstd
+
+
+@register_decomposition(aten._native_batch_norm_legit.no_stats)
+def _native_batch_norm_legit_no_stats(
+    input: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    training: bool,
+    momentum: float,
+    eps: float,
+) -> Tuple[Tensor, Tensor, Tensor]:
+    output, save_mean, save_rstd, _, _ = native_batch_norm_helper(
+        input, weight, bias, None, None, training, momentum, eps, False
+    )
+    return output, save_mean, save_rstd
+
+
+@register_decomposition(aten._native_batch_norm_legit_functional.default)
+def _native_batch_norm_legit_functional(
+    input: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    running_mean: Tensor,
+    running_var: Tensor,
+    training: bool,
+    momentum: float,
+    eps: float,
+) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
+    (
+        output,
+        save_mean,
+        save_rstd,
+        new_running_mean,
+        new_running_var,
+    ) = native_batch_norm_helper(
+        input, weight, bias, running_mean, running_var, training, momentum, eps, True
+    )
+    assert new_running_mean is not None, "new_running_mean should not be None"
+    assert new_running_var is not None, "new_running_var should not be None"
+    return output, save_mean, save_rstd, new_running_mean, new_running_var
 
 
 @register_decomposition(aten._fused_dropout)
diff --git a/torch/jit/_shape_functions.py b/torch/jit/_shape_functions.py
index ffb4463577ef5..5eace9515b59c 100644
--- a/torch/jit/_shape_functions.py
+++ b/torch/jit/_shape_functions.py
@@ -1091,6 +1091,8 @@ def add_bounded_compute_mapping(operator_schema: str, lower_bound_func: Callable
 add_shape_compute_mapping("aten::nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, int ignore_index) -> (Tensor output, Tensor total_weight)", nll_loss_forward)
 add_shape_compute_mapping("aten::native_layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)", native_layer_norm)
 add_shape_compute_mapping("aten::native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)", native_batch_norm)
+add_shape_compute_mapping("aten::_native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)", native_batch_norm)
+add_shape_compute_mapping("aten::_native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)", native_batch_norm)
 # add_shape_compute_mapping("aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor", index_Tensor)
 
 # TODO: migrate over all of symbolic_shape_registry_util.cpp
diff --git a/torch/overrides.py b/torch/overrides.py
index cb4402235e1a2..21cfe2477bd6f 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -716,6 +716,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]:
         torch.narrow_copy: lambda input, dim, start, length: -1,
         torch.nan_to_num: lambda input, nan=0.0, posinf=None, neginf=None, out=None: -1,
         torch.native_batch_norm: lambda input, weight, bias, running_mean, running_var, training, momentum, eps: -1,
+        torch._native_batch_norm_legit: lambda input, weight, bias, training, momentum, eps: -1,
         torch.native_dropout: lambda input, p, train: -1,
         torch.native_layer_norm: lambda input, normalized_shape, weight=None, bias=None, eps=1e-05: -1,
         torch.native_group_norm: lambda input, weight, bias, N, C, HxW, group, eps: -1,
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 177dc669469e7..f20f3a644e36b 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -439,6 +439,23 @@ def sample_inputs_native_batch_norm(op_info, device, dtype, requires_grad, **kwa
         yield SampleInput(sample.input, args=(args[2], args[3], args[0], args[1], training, momentum, eps))
 
 
+def sample_inputs__native_batch_norm_legit(op_info, device, dtype, requires_grad, **kwargs):
+    samples = sample_inputs_batch_norm(op_info, device, dtype, requires_grad, **kwargs)
+    for sample in samples:
+        # torch.native_batch_norm does not support 0 numel tensors
+        # IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
+        if sample.input.numel() == 0:
+            continue
+        args = sample.args
+        training = sample.kwargs.get('training', True)
+        momentum = sample.kwargs.get('momentum', 0.5)
+        eps = sample.kwargs.get('eps', 1e-5)
+        if args[0] is not None and args[1] is not None:
+            yield SampleInput(sample.input, args=(args[2], args[3], args[0], args[1], training, momentum, eps))
+        else:
+            yield SampleInput(sample.input, args=(args[2], args[3], training, momentum, eps))
+
+
 def sample_inputs_nn_activation_relu(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
 
@@ -10779,6 +10796,34 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.skip("Skipped!"), 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'),
            )
            ),
+    OpInfo('_native_batch_norm_legit',
+           aten_name='_native_batch_norm_legit',
+           dtypes=floating_types_and(torch.bfloat16),
+           dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16),
+           supports_forward_ad=True,
+           supports_fwgrad_bwgrad=True,
+           assert_jit_shape_analysis=True,
+           sample_inputs_func=sample_inputs__native_batch_norm_legit,
+           skips=(
+               # NotImplementedError: Could not run
+               # 'aten::native_batch_norm.out' with arguments from the 'CPU' backend.
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning', device_type="cpu"),
+               # RuntimeError: out_invstd.dim() == 1 && out_invstd.is_contiguous() && out_invstd.sizes()[0]
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type="cuda"),
+               # Problem with _get_numerical_jacobian
+               # IndexError: tuple index out of range
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD'),
+               # RuntimeError: deepEquals(input.iValue, deepCopiedInput) INTERNAL ASSERT FAILED
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+               # https://github.com/pytorch/pytorch/issues/85960
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
+               DecorateInfo(toleranceOverride({torch.float32: tol(atol=5e-5, rtol=5e-5)}),
+                            "TestCompositeCompliance", "test_forward_ad"),
+               # Extremal value issue on aten::native_batch_norm, which returns 'nan' for mean on 'inf' inputs
+               # possibly because of the welford implementation.
+               DecorateInfo(unittest.skip("Skipped!"), 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'),
+           )
+           ),
     OpInfo('nn.functional.cosine_similarity',
            aten_name="cosine_similarity",
            dtypes=floating_types_and(torch.bfloat16),

From 71cbf1607511c8576b67cf179b62ddb5bd05688f Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 23 Nov 2022 23:23:24 +0000
Subject: [PATCH 1235/1922] Update default cmake to 3.18 (#89570)

Set `cmake.dir` to `/usr/local` in `.circleci/scripts/build_android_gradle.sh `
Prep change for raising compiler standard to C++17: cmake-3.18 is the first one to support CUDA17 language

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89570
Approved by: https://github.com/atalman
---
 .circleci/docker/build.sh                 |  6 ++----
 .circleci/scripts/build_android_gradle.sh |  2 +-
 CMakeLists.txt                            | 10 +++-------
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh
index b41d5fe2c8c1b..ebea9eda85a6a 100755
--- a/.circleci/docker/build.sh
+++ b/.circleci/docker/build.sh
@@ -75,9 +75,8 @@ elif [[ "$image" == *rocm* ]]; then
   DOCKERFILE="${OS}-rocm/Dockerfile"
 fi
 
-if [[ "$image" == *bionic* ]]; then
-  CMAKE_VERSION=3.13.5
-fi
+# CMake 3.18 is needed to support CUDA17 language variant
+CMAKE_VERSION=3.18.5
 
 TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/ubuntu/14.04/x86_64"
 _UCX_COMMIT=31e74cac7bee0ef66bef2af72e7d86d9c282e5ab
@@ -209,7 +208,6 @@ case "$image" in
     ;;
   pytorch-linux-focal-py3.7-gcc7)
     ANACONDA_PYTHON_VERSION=3.7
-    CMAKE_VERSION=3.16.9  # Required for precompiled header support
     GCC_VERSION=7
     PROTOBUF=yes
     DB=yes
diff --git a/.circleci/scripts/build_android_gradle.sh b/.circleci/scripts/build_android_gradle.sh
index 2007c91fe395a..598e9cd0a6bd2 100755
--- a/.circleci/scripts/build_android_gradle.sh
+++ b/.circleci/scripts/build_android_gradle.sh
@@ -24,7 +24,7 @@ export GRADLE_LOCAL_PROPERTIES=~/workspace/android/local.properties
 rm -f $GRADLE_LOCAL_PROPERTIES
 echo "sdk.dir=/opt/android/sdk" >> $GRADLE_LOCAL_PROPERTIES
 echo "ndk.dir=/opt/ndk" >> $GRADLE_LOCAL_PROPERTIES
-echo "cmake.dir=/usr" >> $GRADLE_LOCAL_PROPERTIES
+echo "cmake.dir=/usr/local" >> $GRADLE_LOCAL_PROPERTIES
 
 retry () {
   $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f31baa687d52..784b528417041 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 #cmake_policy(SET CMP0022 NEW)
 #cmake_policy(SET CMP0023 NEW)
 
@@ -11,13 +11,9 @@ cmake_policy(SET CMP0025 NEW)
 # Suppress warning flags in default MSVC configuration.  It's not
 # mandatory that we do this (and we don't if cmake is old), but it's
 # nice when it's possible, and it's possible on our Windows configs.
-if(NOT CMAKE_VERSION VERSION_LESS 3.15.0)
-  cmake_policy(SET CMP0092 NEW)
-endif()
+cmake_policy(SET CMP0092 NEW)
 
-if(NOT CMAKE_VERSION VERSION_LESS 3.10)
-  set(FIND_CUDA_MODULE_DEPRECATED ON)
-endif()
+set(FIND_CUDA_MODULE_DEPRECATED ON)
 
 # ---[ Project and semantic versioning.
 project(Torch CXX C)

From 405ff635e850e472d94c6ac852cdc320a713c38b Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Wed, 23 Nov 2022 20:42:55 +0000
Subject: [PATCH 1236/1922] [primTorch] Enable regex error testing for some
 refs (#87765)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87765
Approved by: https://github.com/mruberry
---
 aten/src/ATen/native/Activation.cpp           |  2 +-
 aten/src/ATen/native/cuda/Activation.cpp      |  2 +-
 test/test_nn.py                               | 13 ---
 torch/_prims_common/__init__.py               |  8 +-
 torch/_refs/__init__.py                       | 57 ++++++++-----
 torch/nn/functional.py                        |  2 +-
 .../_internal/common_methods_invocations.py   | 84 ++++++++-----------
 7 files changed, 81 insertions(+), 87 deletions(-)

diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp
index 37e832d1e457b..bef09e81a5ea5 100644
--- a/aten/src/ATen/native/Activation.cpp
+++ b/aten/src/ATen/native/Activation.cpp
@@ -699,7 +699,7 @@ Tensor prelu_cpu(const Tensor& self, const Tensor& weight_) {
   auto as_nd = [&](const Tensor& t) {
     TORCH_CHECK(
       t.dim() == 1 || t.dim() == 0,
-      "prelu: Expected `weight` to be a scalar or 1D tensor, but got ndim = ", t.dim());
+      "prelu: Expected `weight` to be a scalar or 1D tensor, but got: ndim = ", t.dim());
     if (ndim >= 2) {
       sizes[1] = t.dim() == 1 ? t.size(0) : 1;
       strides[1] = t.dim() == 1 ? t.stride(0) : 0;
diff --git a/aten/src/ATen/native/cuda/Activation.cpp b/aten/src/ATen/native/cuda/Activation.cpp
index 4360f8b5c3efc..31926b353b4a3 100644
--- a/aten/src/ATen/native/cuda/Activation.cpp
+++ b/aten/src/ATen/native/cuda/Activation.cpp
@@ -114,7 +114,7 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) {
   Tensor result = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
 
   TORCH_CHECK(weight_dim == 0 || weight_dim == 1,
-      "prelu: Expected `weight` to be a scalar or 1D tensor, but got ndim = ",
+      "prelu: Expected `weight` to be a scalar or 1D tensor, but got: ndim = ",
       weight_dim);
 
   // case1: shared weight for all channels
diff --git a/test/test_nn.py b/test/test_nn.py
index c50d9cdc7bd64..ceabcb28ac84b 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4175,19 +4175,6 @@ def test_mse_loss_size_warning(self):
             self.assertEqual(len(w), 1)
             self.assertIn('Please ensure they have the same size.', str(w[0]))
 
-    def test_poisson_nll_loss_reduction_modes(self):
-        input = torch.tensor([0.5, 1.5, 2.5])
-        target = torch.tensor([1., 2., 3.])
-        component_wise_loss = torch.exp(input) - target * input
-        self.assertEqual(component_wise_loss,
-                         F.poisson_nll_loss(input, target, reduction='none'))
-        self.assertEqual(torch.sum(component_wise_loss),
-                         F.poisson_nll_loss(input, target, reduction='sum'))
-        self.assertEqual(torch.mean(component_wise_loss),
-                         F.poisson_nll_loss(input, target, reduction='mean'))
-        with self.assertRaisesRegex(ValueError, 'is not valid'):
-            F.poisson_nll_loss(input, target, reduction='total')
-
     def test_gaussian_nll_loss_broadcasting(self):
         input = torch.tensor([[0.5, 1.5, 2.5], [2., 4., 6.]])
         target_full = torch.tensor([[1., 2., 3.], [1., 2., 3.]])
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 08e24c4037749..647b0e66729e2 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -721,12 +721,14 @@ def infer_size(shape: ShapeType, numel: int) -> Tuple[int, ...]:
         lambda: f"shape '{list(shape)}' is invalid for input of size {numel}",
     )
     if dim is not None:
+        # Convert to list to produce a compatible error message with core
+        # PyTorch, which prints sequences in square brackets.
+        shape = list(shape)
         check(
             newsize != 0,
-            lambda: f"cannot reshape tensor fo 0 elements into shape {shape} because the "
-            f"unspecified dimension size -1 can be any value and is ambiguous",
+            lambda: (f"cannot reshape tensor of 0 elements into shape {shape} because the "
+                     f"unspecified dimension size -1 can be any value and is ambiguous"),
         )
-        shape = list(shape)
         shape[dim] = numel // newsize
     return tuple(shape)
 
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index fac2509afd414..04bf9e12927fa 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -761,9 +761,14 @@ def nan_to_num(
 
 
 def _neg_meta(a: TensorLikeType):
-    if a.dtype is torch.bool:
-        msg = "neg is not supported on bool tensors."
-        raise RuntimeError(msg)
+    check(
+        a.dtype is not torch.bool,
+        lambda: (
+            "Negation, the `-` operator, on a bool tensor is not supported. "
+            "If you are trying to invert a mask, use the `~` or `logical_not()` "
+            "operator instead."
+        ),
+    )
 
 
 @_make_elementwise_unary_reference(
@@ -2328,11 +2333,14 @@ def mean(
     # reduces over all dimensions if dim=() is passed
     if dim == () or dim == []:
         dim = None
+    orig_dtype = dtype
     if dtype is None:
         dtype = a.dtype
     # can't use out wrapper because of this argument
-    if out is not None and out.dtype != dtype:
-        raise RuntimeError("expected out dtype and dtype to match")
+    check(
+        out is None or out.dtype == dtype,
+        lambda: f"Expected out tensor to have dtype {dtype}, but got {out.dtype} instead",
+    )
     result = _reduction(
         a,
         prims.sum,
@@ -2342,8 +2350,14 @@ def mean(
         out=None,
         output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.KEEP_PROMOTED_TYPE,
     )
-    if utils.is_integer_dtype(dtype):
-        raise RuntimeError("result type should be floating point or complex")
+    check(
+        utils.is_float_dtype(dtype) or utils.is_complex_dtype(dtype),
+        lambda: (
+            f"mean(): could not infer output dtype. "
+            f"{'Input' if orig_dtype is None else 'Optional'} dtype must be either "
+            f"a floating point or complex dtype. Got: {dtype}"
+        ),
+    )
     if isinstance(dim, Dim):
         dim = (dim,)  # type: ignore[assignment]
     dims = utils.reduction_dims(a.shape, dim)  # type: ignore[arg-type]
@@ -3371,7 +3385,7 @@ def unbind(t: TensorLikeType, dim: int = 0) -> TensorSequenceType:
     dim = utils.canonicalize_dim(t.ndim, dim)
     check(
         len(t.shape) > 0,
-        lambda: "dimension specified as 0 but tensor has no dimensions",
+        lambda: "Dimension specified as 0 but tensor has no dimensions",
         IndexError,
     )
     return tuple(
@@ -3621,12 +3635,12 @@ def vsplit(
         check(
             (split_size != 0 and a.shape[0] % split_size == 0),
             lambda: (
-                "torch.vsplit attempted to split along dimension 0 "
-                + ", but the size of the dimension "
-                + str(a.shape[0])
-                + " is not divisible by the split_size "
-                + str(split_size)
-                + "!"
+                f"torch.vsplit attempted to split along dimension 0"
+                f", but the size of the dimension "
+                f"{a.shape[0]}"
+                f" is not divisible by the split_size "
+                f"{split_size}"
+                f"!"
             ),
         )
         return tensor_split(a, split_size, 0)
@@ -3792,7 +3806,7 @@ def dsplit(a: TensorLikeType, sections: DimsType) -> TensorSequenceType:
         )
     if isinstance(sections, IntLike) and (sections == 0 or a.shape[2] % sections != 0):
         raise RuntimeError(
-            "torch._refs.dsplit attempted to split along dimension 2, "
+            "torch.dsplit attempted to split along dimension 2, "
             + f"but the size of the dimension {a.shape[2]} is not divisible by the split_size {sections}!"
         )
     return tensor_split(a, sections, 2)
@@ -4446,12 +4460,14 @@ def movedim(
     if type(destination) is int:
         destination = (destination,)
 
+    # Converts to list to produce a compatible error message with core PyTorch,
+    # which prints sequences in square brackets.
     utils.check(
         len(source) == len(destination),  # type: ignore[arg-type]
         lambda: (
-            "movedim: Invalid source or destination dims: source "
-            f"({source} dims) should contain the same number of dims as "
-            f"destination ({destination} dims)"
+            "movedim: Invalid source or destination dims: source "  # type: ignore[arg-type]
+            f"({list(source)} dims) should contain the same number of dims as "
+            f"destination ({list(destination)} dims)"
         ),
     )
 
@@ -4462,13 +4478,14 @@ def movedim(
     sss = set(ss)
     dss = set(ds)
 
+    # See above on why this converts to list in error messages.
     utils.check(
         len(ss) == len(sss),
-        lambda: f"movedim: repeated dim in `source` {source}",
+        lambda: f"movedim: repeated dim in `source` ({list(source)})",  # type: ignore[arg-type]
     )
     utils.check(
         len(ds) == len(dss),
-        lambda: f"movedim: repeated dim in `destination` {destination}",
+        lambda: f"movedim: repeated dim in `destination` ({list(destination)})",  # type: ignore[arg-type]
     )
 
     m = dict(zip(ds, ss))
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index e3aea9f0acea0..66fbcc5dbaeb2 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -2764,7 +2764,7 @@ def poisson_nll_loss(
         reduction = _Reduction.legacy_get_string(size_average, reduce)
     if reduction != "none" and reduction != "mean" and reduction != "sum":
         ret = input
-        raise ValueError(reduction + " is not valid")
+        raise ValueError(reduction + " is not a valid value for reduction")
 
     ret = torch.poisson_nll_loss(input, target, log_input, full, eps, _Reduction.get_enum(reduction))
     return ret
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index f20f3a644e36b..b3247976161fd 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -527,7 +527,7 @@ def error_inputs_prelu(op, device):
     inp = make_tensor((2, 8, 3), device=device, dtype=torch.float32)
     weight = make_tensor((2, 4), device=device, dtype=torch.float32)
     yield ErrorInput(SampleInput(inp, kwargs={'weight': weight}),
-                     error_regex="prelu: Expected `weight` to be a scalar or 1D tensor, but got ndim = 2")
+                     error_regex="prelu: Expected `weight` to be a scalar or 1D tensor, but got: ndim = 2")
 
     # src and index tensors must have the same # of dimensions
 def sample_inputs_norm(op_info, device, dtype, requires_grad, **kwargs):
@@ -2428,7 +2428,7 @@ def sample_inputs_take_along_dim(op_info, device, dtype, requires_grad, **kwargs
         make_arg((S, S)), gather_variable((S, S // 2), 0, S, True, device=device))
 
 
-def error_inputs_aminmax_amax_amin(op_info, device, **kwargs):
+def error_inputs_aminmax_amax_amin(op_info, device, is_ref=False, **kwargs):
 
     # Error Inputs for zero-dim tensors, when 'dim' arg is not provided.
     shape = (S, 0, S)
@@ -2461,7 +2461,15 @@ def error_inputs_aminmax_amax_amin(op_info, device, **kwargs):
     min_values = torch.empty(L, dtype=torch.double, device=device)
     illegal_values = torch.empty(L, dtype=torch.int, device=device)
 
-    err_msg_amax_amin2 = "Expected the dtype for input and out to match"
+    # Unlike regular PyTorch, amax and amin refs don't require input and out
+    # dtypes to match exactly:
+    # https://github.com/pytorch/pytorch/pull/87765#pullrequestreview-1162023824
+    if is_ref:
+        err_msg_amax_amin2 = ("Attempting to cast from torch.float32 to out tensor with dtype "
+                              "torch.int32, but this can't be cast because it is not safe!")
+    else:
+        err_msg_amax_amin2 = ("Expected the dtype for input and out to match, but got Float "
+                              "for input's dtype and Int for out's dtype.")
     err_msg_aminmax2 = "Expected out tensor to have dtype float, but got double instead"
 
     if op_info.name in ['amax', 'amin', '_refs.amax', '_refs.amin']:
@@ -7336,7 +7344,7 @@ def error_inputs_poisson_nll_loss(op_info, device, **kwargs):
     yield ErrorInput(SampleInput(make(5, 4), args=(make(5, 4),),
                      kwargs={'reduction': 'abc'}),
                      error_type=ValueError,
-                     error_regex='abc is not valid')
+                     error_regex='abc is not a valid value for reduction')
     # invalid input shapes
     yield ErrorInput(SampleInput(make(5, 4), args=(make(5,),)),
                      error_regex=(r'(Attempting to broadcast a dimension of length|'
@@ -8138,18 +8146,28 @@ def generate_std_var_kwargs(t: torch.Tensor, **kwargs):
         numel = torch.tensor(t.shape)[kwargs.get('dim')].prod()
         yield ((), {'correction': numel // 2})
 
-def error_inputs_mean(op_info, device, **kwargs):
-    err_msg1 = (r"mean\(\): could not infer output dtype. "
-                r"Input dtype must be either a floating point or complex dtype. "
-                r"Got: Long")
+def error_inputs_mean(op_info, device, is_ref=False, **kwargs):
+    if is_ref:
+        err_msg1 = (r"mean\(\): could not infer output dtype. "
+                    r"Input dtype must be either a floating point or complex dtype. "
+                    r"Got: torch.int64")
+    else:
+        err_msg1 = (r"mean\(\): could not infer output dtype. "
+                    r"Input dtype must be either a floating point or complex dtype. "
+                    r"Got: Long")
     yield ErrorInput(
         SampleInput(make_tensor((3, 4, 5), dtype=torch.int64, device=device), []),
         error_regex=err_msg1,
     )
 
-    err_msg2 = (r"mean\(\): could not infer output dtype. "
-                r"Optional dtype must be either a floating point or complex dtype. "
-                r"Got: Long")
+    if is_ref:
+        err_msg2 = (r"mean\(\): could not infer output dtype. "
+                    r"Optional dtype must be either a floating point or complex dtype. "
+                    r"Got: torch.int64")
+    else:
+        err_msg2 = (r"mean\(\): could not infer output dtype. "
+                    r"Optional dtype must be either a floating point or complex dtype. "
+                    r"Got: Long")
     yield ErrorInput(
         SampleInput(
             make_tensor((3, 4, 5), dtype=torch.float32, device=device),
@@ -8158,7 +8176,10 @@ def error_inputs_mean(op_info, device, **kwargs):
         error_regex=err_msg2
     )
 
-    err_msg3 = "Expected out tensor to have dtype double, but got float instead"
+    if is_ref:
+        err_msg3 = "Expected out tensor to have dtype torch.float64, but got torch.float32 instead"
+    else:
+        err_msg3 = "Expected out tensor to have dtype double, but got float instead"
     yield ErrorInput(
         SampleInput(
             make_tensor((3, 4, 5), dtype=torch.int64, device=device),
@@ -17125,9 +17146,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         aliases=('moveaxis',),
         torch_opinfo_name="movedim",
         supports_nvfuser=False,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
     ),
     PythonRefInfo(
         "_refs.bucketize",
@@ -17323,9 +17341,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ElementwiseUnaryPythonRefInfo(
         "_refs.neg",
         torch_opinfo_name="neg",
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.positive",
@@ -17568,16 +17583,10 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     PythonRefInfo(
         "_refs.nn.functional.poisson_nll_loss",
         torch_opinfo_name="nn.functional.poisson_nll_loss",
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.prelu",
         torch_opinfo_name="nn.functional.prelu",
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
     ),
     ElementwiseUnaryPythonRefInfo(
         "_refs.nn.functional.relu",
@@ -18339,9 +18348,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.dsplit",
         torch_opinfo_name="dsplit",
         supports_nvfuser=False,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
     ),
     PythonRefInfo(
         "_refs.diag",
@@ -18465,9 +18471,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.reshape",
         torch_opinfo_name="reshape",
         supports_nvfuser=False,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
     ),
     PythonRefInfo(
         "_refs.reshape_as",
@@ -18516,9 +18519,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.vsplit",
         torch_opinfo_name="vsplit",
         supports_nvfuser=False,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
     ),
     PythonRefInfo(
         "_refs.transpose",
@@ -18552,9 +18552,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.view",
         torch_opinfo_name="view",
         supports_nvfuser=False,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
     ),
     PythonRefInfo(
         "_refs.view_as",
@@ -18579,9 +18576,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.unbind",
         torch_opinfo_name="unbind",
         supports_nvfuser=False,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
     ),
     #
     # Reduction Reference OpInfos
@@ -18593,16 +18587,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ReductionPythonRefInfo(
         "_refs.amax",
         torch_opinfo_name="amax",
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
+        error_inputs_func=partial(error_inputs_aminmax_amax_amin, is_ref=True),
     ),
     ReductionPythonRefInfo(
         "_refs.amin",
         torch_opinfo_name="amin",
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
+        error_inputs_func=partial(error_inputs_aminmax_amax_amin, is_ref=True),
     ),
     ReductionPythonRefInfo(
         "_refs.any",
@@ -18612,9 +18602,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         "_refs.mean",
         torch_opinfo_name="mean",
         supports_out=True,
-        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors'),
-        ),
+        error_inputs_func=partial(error_inputs_mean, is_ref=True),
     ),
     ReductionPythonRefInfo(
         "_refs.std",

From 02a340a0dacd3d9233b4751d9f34ba9d44b3da94 Mon Sep 17 00:00:00 2001
From: clee2000 <44682903+clee2000@users.noreply.github.com>
Date: Wed, 23 Nov 2022 23:48:32 +0000
Subject: [PATCH 1237/1922] Don't run auto request review on forked PRs
 (#89583)

tested on https://github.com/pytorch/pytorch/pull/89581
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89583
Approved by: https://github.com/albanD, https://github.com/malfet
---
 .github/workflows/auto_request_review.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml
index 352b1cf773b71..7c98c2990fba7 100644
--- a/.github/workflows/auto_request_review.yml
+++ b/.github/workflows/auto_request_review.yml
@@ -7,7 +7,7 @@ on:
 jobs:
   auto-request-review:
     # Don't run on forked repos
-    if: github.repository_owner == 'pytorch'
+    if: ${{ !github.event.pull_request.head.repo.fork }}
     name: Auto Request Review
     runs-on: ubuntu-latest
     steps:

From 4b1f9bebfece06879a163f5128488589b5356d21 Mon Sep 17 00:00:00 2001
From: foram-chandra <96388449+foram-chandra@users.noreply.github.com>
Date: Thu, 24 Nov 2022 00:34:26 +0000
Subject: [PATCH 1238/1922] [nn] Remove deprecation warning from
 nn.functional.{tanh, sigmoid} (#86905)

Fixes #65909

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86905
Approved by: https://github.com/albanD, https://github.com/kit1980
---
 torch/nn/functional.py                                | 2 --
 torch/testing/_internal/common_methods_invocations.py | 8 +-------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 66fbcc5dbaeb2..a1a102d786f16 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -1955,7 +1955,6 @@ def tanh(input):
 
     See :class:`~torch.nn.Tanh` for more details.
     """
-    warnings.warn("nn.functional.tanh is deprecated. Use torch.tanh instead.")
     return input.tanh()
 
 
@@ -1966,7 +1965,6 @@ def sigmoid(input):
 
     See :class:`~torch.nn.Sigmoid` for more details.
     """
-    warnings.warn("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
     return input.sigmoid()
 
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index b3247976161fd..bf6f15f825f34 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -13440,9 +13440,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
                                     device_type='cpu', dtypes=[torch.cfloat, torch.cdouble],
                                     active_if=(IS_MACOS or IS_WINDOWS)),
-                       # alias, nn.functional.tanh, will produce (because of warning string saved):
-                       # "RuntimeError: Expected to not find "tanh" but found it"
-                       DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_jit_alias_remapping'),
                        DecorateInfo(unittest.skip("Skipped! sparse backward not supported"),
                                     'TestSparseUnaryUfuncs', 'test_sparse_fn_grad'),
                    ),
@@ -15568,10 +15565,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
                                     dtypes=[torch.complex64, torch.cdouble]),
                        DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
-                                    dtypes=[torch.chalf, torch.complex64, torch.cdouble]),
-                       # alias, nn.functional.sigmoid, will produce (because of warning string saved):
-                       # "RuntimeError: Expected to not find "sigmoid" but found it"
-                       DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_jit_alias_remapping')),
+                                    dtypes=[torch.chalf, torch.complex64, torch.cdouble])),
                    dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
                    dtypesIfCUDA=all_types_and_complex_and(torch.complex32, torch.bool, torch.half, torch.bfloat16),
                    supports_forward_ad=True,

From f088d9573e0b5cd7a0bdd1a7cd24b980d25ddec3 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Thu, 24 Nov 2022 00:57:17 +0000
Subject: [PATCH 1239/1922] Add `c10::` namespace in front of `optional`
 (#89605)

Prep change for moving the codebase to C++17 standard
Was part of https://github.com/pytorch/pytorch/pull/85969

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89605
Approved by: https://github.com/weiwangmeta, https://github.com/kit1980
---
 aten/src/ATen/mps/MPSGuardImpl.h                     | 8 ++++----
 aten/src/ATen/native/ReduceOpsUtils.h                | 2 +-
 aten/src/ATen/native/mps/operations/Distributions.mm | 2 +-
 aten/src/ATen/native/mps/operations/ReduceOps.mm     | 2 +-
 aten/src/ATen/native/mps/operations/View.mm          | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/aten/src/ATen/mps/MPSGuardImpl.h b/aten/src/ATen/mps/MPSGuardImpl.h
index 27d32bf652e7a..b6002497d223d 100644
--- a/aten/src/ATen/mps/MPSGuardImpl.h
+++ b/aten/src/ATen/mps/MPSGuardImpl.h
@@ -109,12 +109,12 @@ struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface
 struct OptionalMPSGuard {
   explicit OptionalMPSGuard() : guard_() {}
 
-  explicit OptionalMPSGuard(optional<Device> device_opt)
+  explicit OptionalMPSGuard(c10::optional<Device> device_opt)
       : guard_(device_opt) {}
 
   /// Set the current MPS device to the passed device index, if it is not
   /// nullopt
-  explicit OptionalMPSGuard(optional<DeviceIndex> device_index_opt)
+  explicit OptionalMPSGuard(c10::optional<DeviceIndex> device_index_opt)
       : guard_(device_index_opt) {}
 
   // Copy is not allowed
@@ -144,14 +144,14 @@ struct OptionalMPSGuard {
 
   /// Returns the device that was set immediately prior to initialization of the
   /// guard, or nullopt if the guard is uninitialized.
-  optional<Device> original_device() const {
+  c10::optional<Device> original_device() const {
     return guard_.original_device();
   }
 
   /// Returns the most recent device that was set using this device guard,
   /// either from construction, or via set_device, if the guard is initialized,
   /// or nullopt if the guard is uninitialized.
-  optional<Device> current_device() const {
+  c10::optional<Device> current_device() const {
     return guard_.current_device();
   }
 
diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h
index 9db9802ea788b..2b46eb683f1c9 100644
--- a/aten/src/ATen/native/ReduceOpsUtils.h
+++ b/aten/src/ATen/native/ReduceOpsUtils.h
@@ -102,7 +102,7 @@ static inline void check_scalar_type_device_layout_equal(const Tensor& out, cons
   OPTION_TYPE_EQUALITY_CHECK(layout, out.options(), self.options());
 }
 
-static inline Tensor integer_upcast(const Tensor& self, optional<ScalarType> dtype) {
+static inline Tensor integer_upcast(const Tensor& self, c10::optional<ScalarType> dtype) {
   ScalarType scalarType = self.scalar_type();
   ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType, /*includeBool=*/true) ? ScalarType::Long : scalarType);
   return self.toType(upcast_scalarType);
diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index 1da2457f3a37e..99d01c6825b35 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -273,7 +273,7 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
 }
 
 // random_.from
-Tensor& random_mps_(Tensor& self, int64_t from, optional<int64_t> to_opt, c10::optional<Generator> gen) {
+Tensor& random_mps_(Tensor& self, int64_t from, c10::optional<int64_t> to_opt, c10::optional<Generator> gen) {
   auto input_dtype = self.scalar_type();
   int64_t to = 0;
 
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 39680240f7f21..d905107b8ffd4 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -331,7 +331,7 @@ Tensor trace_mps_out(const Tensor& self) {
 // Taken from ReduceOps.cpp
 inline ScalarType get_dtype_from_self(
     const Tensor& self,
-    const optional<ScalarType>& dtype,
+    const c10::optional<ScalarType>& dtype,
     bool promote_integers) {
   if (dtype.has_value()) {
     return dtype.value();
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 70c7d50b730f3..0e35c7b2f642d 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -274,7 +274,7 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst)
 } // namespace mps
 
 // implementation of as_strided() op
-Tensor as_strided_tensorimpl_mps(const Tensor& self, IntArrayRef size, IntArrayRef stride, optional<int64_t> storage_offset_)
+Tensor as_strided_tensorimpl_mps(const Tensor& self, IntArrayRef size, IntArrayRef stride, c10::optional<int64_t> storage_offset_)
 {
   auto storage_offset = storage_offset_.value_or(self.storage_offset());
   auto result = detail::make_tensor<TensorImpl>(c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype());

From feafcead573edf7397c108f50c655acd24e0e423 Mon Sep 17 00:00:00 2001
From: Shunting Zhang <shunting@meta.com>
Date: Thu, 24 Nov 2022 01:28:10 +0000
Subject: [PATCH 1240/1922] verify the number of outputs of xla graph (#89536)

This PR add tests to verify the behavior of number of outputs returns by an XLA graph. The understanding from this PR will help us fix https://github.com/pytorch/torchdynamo/issues/1908 and enable training for dynamo/torchxla integration eventually. Send this PR separately so Jack could help verify if the behavior is expected and play with it.

List some code snippets here since their behavior is not straightforward at a first glance:
```
    def forward(self, a, b, c):
        """
        The XLA graph will only return the first 2 items
        """
        return a + b, a + c, b
```

```
    def forward(self, a, b, c):
        """
        Inplace update on b cause it to be returned in XLA graph
        """
        b.zero_()
        return a + b, a + c, b
```

```
    def forward(self, a, b, c):
        """
        Even if we return b twice, the XLA graph only return b once.
        """
        b.zero_()
        return a + b, a + c, b, b
```

Here are what observed by the added tests:

1. XLA does not return outputs that are also inputs -- if the tensor is not inplace updated. At first glance people may feel curious why should we consider this kind of 'non-realistic' corner case. But this kind of graphs indeed shows up in AOTAutograd. The main reason is AOTAutograd lift all model parameters/buffers as graph input and may return some of them.  Check ***test_direct_return***
2. if a tensor is inplace updated, XLA will still return it as graph output even if it's also an input.  The only difference compared to item 1 is, the inplace updating on the tensor cause it being returned. This happens for BatchNorm2d since the running_mean/variance tensors will be inplace updated during training. Check ***test_direct_return_with_inplace_update***

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89536
Approved by: https://github.com/jansel
---
 test/dynamo/test_torchxla_integration.py |  60 ++++--------
 test/dynamo/test_torchxla_num_output.py  | 120 +++++++++++++++++++++++
 test/dynamo/test_torchxla_util.py        |  26 +++++
 3 files changed, 167 insertions(+), 39 deletions(-)
 create mode 100644 test/dynamo/test_torchxla_num_output.py
 create mode 100644 test/dynamo/test_torchxla_util.py

diff --git a/test/dynamo/test_torchxla_integration.py b/test/dynamo/test_torchxla_integration.py
index 00a92e3799553..70be4d8e87dcc 100644
--- a/test/dynamo/test_torchxla_integration.py
+++ b/test/dynamo/test_torchxla_integration.py
@@ -1,18 +1,23 @@
 # Owner(s): ["module: dynamo"]
 import copy
-import functools
-import os
 import unittest
 
 import torch
 
-has_torch_xla = True
+try:
+    from .test_torchxla_util import maybe_skip_torchxla_test
+except ImportError:
+    from test_torchxla_util import maybe_skip_torchxla_test
+
 try:
     import torch._dynamo.optimizations.torchxla_integration as integration
+    import torch_xla.core.xla_model as xm
+    import torch_xla.debug.metrics as metrics
 except ImportError:
-    has_torch_xla = False
+    # tests using torch_xla will be skipped. It's fine to ignore the
+    # importing error here.
+    pass
 
-import torch.utils._pytree as pytree
 from torch import fx, nn
 
 
@@ -81,49 +86,26 @@ def unwrap(cont):
         raise RuntimeError("Unexpected types")
 
 
-@functools.lru_cache(None)
-def should_run_torchxla_tests():
-    """
-    Run the tests if torch_xla is available and number of gpu devices is specified.
-    """
-    gpu_device_specified = int(os.environ.get("GPU_NUM_DEVICES", "0")) > 0
-    return has_torch_xla and gpu_device_specified
-
-
 def make_reuse_graph_test(module_class, niter=100):
-    @unittest.skipIf(
-        not should_run_torchxla_tests(),
-        "Skip the tests since torch_xla is not available or XLA devices are not specified",
-    )
+    @maybe_skip_torchxla_test
     def test_wrapper(self):
-        import torch_xla.core.xla_model as xm
-
         xla_dev = xm.xla_device()
-        mod = module_class()
-        xla_module = copy.deepcopy(mod).to(device=xla_dev)
-        inputs = mod.get_random_inputs()
+        xla_module = module_class().to(device=xla_dev)
+        inputs = tuple(x.to(device=xla_dev) for x in xla_module.get_random_inputs())
+        metrics.clear_counters()
         optimized_mod = integration.extract_compiled_graph(
-            fx.symbolic_trace(mod), inputs
+            fx.symbolic_trace(xla_module), inputs
         )
 
         for i in range(niter):
-            rand_args = mod.get_random_inputs()
-            orig_dev = rand_args[0].device
-            rand_args_copy = copy.deepcopy(rand_args)
-
-            # Can not simply call
-            #   expected = mod(*rand_args)
-            # Since we need use xla to calculate expected results
             xla_inputs = tuple(
-                copy.deepcopy(inp).to(device=xla_dev) for inp in rand_args
+                inp.to(device=xla_dev) for inp in xla_module.get_random_inputs()
             )
-            xla_out = xla_module(*xla_inputs)
-            # copy xla_inputs back to rand_args since the model may inplace update
-            # the arguments
-            rand_args = tuple(inp.to(device=orig_dev) for inp in xla_inputs)
-            expected = pytree.tree_map(lambda o: o.to(device=orig_dev), xla_out)
+            xla_inputs_copy = copy.deepcopy(xla_inputs)
+
+            expected = xla_module(*xla_inputs)
 
-            actual = optimized_mod(*rand_args_copy)
+            actual = optimized_mod(*xla_inputs_copy)
 
             if not allclose(expected, actual):
                 print(
@@ -133,7 +115,7 @@ def test_wrapper(self):
 
             # make sure arguments match after calling the model forward method
             # to handle inplace updates.
-            if not allclose(rand_args, rand_args_copy):
+            if not allclose(xla_inputs, xla_inputs_copy):
                 print(
                     f"Incorrect updated arguments at iter {i}. expected\n{rand_args}, actual\n{rand_args_copy}"
                 )
diff --git a/test/dynamo/test_torchxla_num_output.py b/test/dynamo/test_torchxla_num_output.py
new file mode 100644
index 0000000000000..0e91a358d4690
--- /dev/null
+++ b/test/dynamo/test_torchxla_num_output.py
@@ -0,0 +1,120 @@
+# Owner(s): ["module: dynamo"]
+import unittest
+
+import torch
+from torch import nn
+from torch._dynamo.optimizations.torchxla_integration import GraphInputMatcher
+from torch.utils._pytree import tree_map_only
+
+try:
+    from .test_torchxla_util import maybe_skip_torchxla_test
+except ImportError:
+    from test_torchxla_util import maybe_skip_torchxla_test
+
+try:
+    import torch_xla
+    import torch_xla.core.xla_model as xm
+except ImportError:
+    # tests using torch_xla will be skipped. It's fine to ignore the
+    # importing error here.
+    pass
+
+
+class DirectReturnModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, a, b, c):
+        """
+        The XLA graph will only return the first 2 items
+        """
+        return a + b, a + c, b
+
+    def get_example_inputs(self):
+        return (torch.rand(2), torch.rand(2), torch.rand(2))
+
+
+class DirectReturnWithInplaceUpdateModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, a, b, c):
+        """
+        Inplace update on b cause it to be returned in XLA graph
+        """
+        b.zero_()
+        return a + b, a + c, b
+
+    def get_example_inputs(self):
+        return (torch.rand(2), torch.rand(2), torch.rand(2))
+
+
+class DirectReturnWithDuplicatedInplaceUpdateModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, a, b, c):
+        """
+        Even if we return b twice, the XLA graph only return b once.
+        """
+        b.zero_()
+        return a + b, a + c, b, b
+
+    def get_example_inputs(self):
+        return (torch.rand(2), torch.rand(2), torch.rand(2))
+
+
+class TestNumOutput(unittest.TestCase):
+    def do_test(self, model_class, expected_num_output):
+        xla_dev = xm.xla_device()
+        model = model_class().to(device=xla_dev)
+        inputs = tree_map_only(
+            torch.Tensor, lambda x: x.to(device=xla_dev), model.get_example_inputs()
+        )
+
+        xm.mark_step()
+        args_tensor_ids = [
+            torch_xla._XLAC._xla_get_tensor_id(xla_arg) for xla_arg in inputs
+        ]
+        tensor_id_to_arg_idx = {
+            tensor_id: i for i, tensor_id in enumerate(args_tensor_ids)
+        }
+        outputs = model(*inputs)
+        xla_graph_hash = torch_xla._XLAC._get_graph_hash(outputs)
+
+        (
+            graph_input_tensor_ids,
+            graph_input_xla_values,
+        ) = torch_xla._XLAC._get_tensors_xla_device_data_node(outputs)
+
+        graph_input_matcher = GraphInputMatcher(
+            tensor_id_to_arg_idx, graph_input_tensor_ids, graph_input_xla_values
+        )
+        torch_xla._XLAC._xla_sync_multi(outputs, [])
+
+        def run_cached_graph(*inputs):
+            torch_xla._XLAC._xla_sync_multi(inputs, [])
+            xla_graph_inputs = graph_input_matcher(inputs)
+            xla_graph_outputs = torch_xla._XLAC._run_cached_graph(
+                xla_graph_hash, xla_graph_inputs
+            )
+            return xla_graph_outputs
+
+        test_inputs = tree_map_only(
+            torch.Tensor, lambda x: x.to(device=xla_dev), model.get_example_inputs()
+        )
+        self.assertEqual(expected_num_output, len(run_cached_graph(*test_inputs)))
+
+    @maybe_skip_torchxla_test
+    def test_direct_return(self):
+        self.do_test(DirectReturnModule, expected_num_output=2)
+
+    @maybe_skip_torchxla_test
+    def test_direct_return_with_inplace_update(self):
+        self.do_test(DirectReturnWithInplaceUpdateModule, expected_num_output=3)
+
+    @maybe_skip_torchxla_test
+    def test_direct_return_with_duplicated_inplace_update(self):
+        self.do_test(
+            DirectReturnWithDuplicatedInplaceUpdateModule, expected_num_output=3
+        )
diff --git a/test/dynamo/test_torchxla_util.py b/test/dynamo/test_torchxla_util.py
new file mode 100644
index 0000000000000..5c54af34678a6
--- /dev/null
+++ b/test/dynamo/test_torchxla_util.py
@@ -0,0 +1,26 @@
+# Owner(s): ["module: dynamo"]
+import functools
+import os
+import unittest
+
+
+@functools.lru_cache(None)
+def should_run_torchxla_tests():
+    """
+    Run the tests if torch_xla is available and number of gpu devices is specified.
+    """
+    has_torch_xla = True
+    try:
+        import torch_xla  # noqa: F401
+    except ImportError:
+        has_torch_xla = False
+
+    gpu_device_specified = int(os.environ.get("GPU_NUM_DEVICES", "0")) > 0
+    return has_torch_xla and gpu_device_specified
+
+
+def maybe_skip_torchxla_test(test_case):
+    return unittest.skipIf(
+        not should_run_torchxla_tests(),
+        "Skip the tests since torch_xla is not available or XLA devices are not specified",
+    )(test_case)

From f82736a3c02bb86842b47a16886dce9c97cdb6ef Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin <wschin@outlook.com>
Date: Thu, 24 Nov 2022 01:30:09 +0000
Subject: [PATCH 1241/1922] [ONNX] Move two headers from .h to .cc (#86852)

As title. Header dependency should be as small as possible.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86852
Approved by: https://github.com/titaiwangms, https://github.com/BowenBao
---
 torch/csrc/jit/passes/onnx.cpp | 2 ++
 torch/csrc/jit/passes/onnx.h   | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp
index 75e2d754aa503..d4e7aa6c7f98f 100644
--- a/torch/csrc/jit/passes/onnx.cpp
+++ b/torch/csrc/jit/passes/onnx.cpp
@@ -8,6 +8,8 @@
 #include <torch/csrc/jit/ir/constants.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
+#include <torch/csrc/jit/passes/onnx/constant_map.h>
+#include <torch/csrc/jit/passes/onnx/helper.h>
 #include <torch/csrc/jit/passes/onnx/onnx_log.h>
 #include <torch/csrc/jit/passes/onnx/shape_type_inference.h>
 #include <torch/csrc/jit/python/python_ir.h>
diff --git a/torch/csrc/jit/passes/onnx.h b/torch/csrc/jit/passes/onnx.h
index e3c6cd23ecc3e..11bee67916404 100644
--- a/torch/csrc/jit/passes/onnx.h
+++ b/torch/csrc/jit/passes/onnx.h
@@ -1,8 +1,6 @@
 #pragma once
 
 #include <torch/csrc/jit/ir/ir.h>
-#include <torch/csrc/jit/passes/onnx/constant_map.h>
-#include <torch/csrc/jit/passes/onnx/helper.h>
 #include <torch/csrc/onnx/onnx.h>
 #include <unordered_map>
 

From d55869cf88f17f35db16891a7ba61c056ec6d2a3 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 24 Nov 2022 01:52:11 +0000
Subject: [PATCH 1242/1922] Install missing VSX headers (POWER) (#85547)

E.g. `test_cpp_extensions_aot_ninja` fails as it includes `vec.h` which requires the vec/vsx/* headers and `sleef.h`. The latter is also required for AVX512 builds on non MSVC compilers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85547
Approved by: https://github.com/kit1980
---
 aten/src/ATen/CMakeLists.txt | 2 +-
 setup.py                     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index 629db87dc15d3..613c6a6834e33 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -56,7 +56,7 @@ if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER)
   EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS})
 endif()
 
-file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h")
+file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h")
 file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp")
 file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh")
 file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp")
diff --git a/setup.py b/setup.py
index 0aa27bef64d98..e18eb16869a3f 100644
--- a/setup.py
+++ b/setup.py
@@ -1040,6 +1040,7 @@ def main():
         'include/ATen/*.h',
         'include/ATen/cpu/*.h',
         'include/ATen/cpu/vec/vec256/*.h',
+        'include/ATen/cpu/vec/vec256/vsx/*.h',
         'include/ATen/cpu/vec/vec512/*.h',
         'include/ATen/cpu/vec/*.h',
         'include/ATen/core/*.h',
@@ -1148,6 +1149,7 @@ def main():
         'include/THH/*.cuh',
         'include/THH/*.h*',
         'include/THH/generic/*.h',
+        'include/sleef.h',
         "_inductor/codegen/*.h",
         "_inductor/codegen/*.j2",
         'share/cmake/ATen/*.cmake',

From aa8abfc7e9e0fbe1595fa91eb3590cafb5899575 Mon Sep 17 00:00:00 2001
From: "Zhijing Li (Accelerator Enablement)" <tissue030@meta.com>
Date: Thu, 24 Nov 2022 02:18:32 +0000
Subject: [PATCH 1243/1922] Symintified layer_norm (#89466)

Summary: As titled.

Test Plan:
```
buck2 run mode/opt scripts/wwei6:test_executorch
```

Differential Revision: D41451390

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89466
Approved by: https://github.com/frank-wei, https://github.com/ezyang
---
 aten/src/ATen/functorch/BatchRulesDecompositions.cpp | 2 +-
 aten/src/ATen/native/layer_norm.cpp                  | 7 +++----
 aten/src/ATen/native/native_functions.yaml           | 4 +++-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index d5a38e9804dd4..13dedcfb879ac 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -132,7 +132,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   OP_DECOMPOSE(instance_norm);
   OP_DECOMPOSE(kron);
   OP_DECOMPOSE(l1_loss);
-  OP_DECOMPOSE(layer_norm);
+  m.impl("layer_norm", native::layer_norm_symint);
   OP_DECOMPOSE2(ldexp, Tensor);
   OP_DECOMPOSE2(less_equal, Tensor );
   OP_DECOMPOSE2(less, Tensor );
diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp
index 71dc42da380b2..8269a4d3af9e1 100644
--- a/aten/src/ATen/native/layer_norm.cpp
+++ b/aten/src/ATen/native/layer_norm.cpp
@@ -175,9 +175,9 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_cpu(
   return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta));
 }
 
-Tensor layer_norm(
+Tensor layer_norm_symint(
     const Tensor& input,
-    IntArrayRef normalized_shape, const c10::optional<Tensor>& weight_opt /* optional */, const c10::optional<Tensor>& bias_opt /* optional */,
+    c10::SymIntArrayRef normalized_shape, const c10::optional<Tensor>& weight_opt /* optional */, const c10::optional<Tensor>& bias_opt /* optional */,
     double eps,
     bool /* cudnn_enable, deprecated */) {
   // See [Note: hacky wrapper removal for optional tensor]
@@ -186,8 +186,7 @@ Tensor layer_norm(
   c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
   const Tensor& bias = *bias_maybe_owned;
 
-
-  return std::get<0>(at::native_layer_norm(input, normalized_shape, weight, bias, eps));
+  return std::get<0>(at::native_layer_norm_symint(input, normalized_shape, weight, bias, eps));
 }
 
 DEFINE_DISPATCH(LayerNormKernel);
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 9aa3a2cceb941..273d5cea85fbf 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -2938,7 +2938,9 @@
 
 - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
 
-- func: layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor
+- func: layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor
+  dispatch:
+    CompositeImplicitAutograd: layer_norm_symint
 
 - func: native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
   dispatch:

From 9605fd2d5a1362d471715aeb741cf22412aae92a Mon Sep 17 00:00:00 2001
From: "Wu, Chunyuan" <chunyuan.wu@intel.com>
Date: Wed, 23 Nov 2022 20:10:41 +0000
Subject: [PATCH 1244/1922] nnc: fix Store if value is fp32 while buf is bf16
 (#86788)

Fixes https://github.com/pytorch/pytorch/issues/86533.
For the below graph:
```bash
[DUMP kernel.cpp:1690] TensorExprKernel graph:
[DUMP kernel.cpp:1690] graph(%x.1 : BFloat16(10, strides=[1], requires_grad=0, device=cpu)):
[DUMP kernel.cpp:1690]   %1 : int = prim::Constant[value=0]()
[DUMP kernel.cpp:1690]   %2 : BFloat16(10, strides=[1], requires_grad=0, device=cpu) = aten::pow(%x.1, %1) # test/test_tensorexpr.py:1330:29
[DUMP kernel.cpp:1690]   %3 : BFloat16(10, strides=[1], requires_grad=0, device=cpu) = aten::sin(%2) # test/test_tensorexpr.py:1330:19
[DUMP kernel.cpp:1690]   return (%3)
```

**Loop stmt before the fix:**
The store value `0.8414709568023682f` is float while the scalar_type of the store buf `aten_sin` is bf16.
```bash
[DEBUG llvm_codegen.cpp:489] After HalfRewriter {
[DEBUG llvm_codegen.cpp:489]   aten_sin[Ramp(0ll, 1ll, 8)] = Broadcast(0.8414709568023682f, 8);
[DEBUG llvm_codegen.cpp:489]   for (int64_t i_1_tail_tail = 0ll; i_1_tail_tail < 2ll; i_1_tail_tail++) {
[DEBUG llvm_codegen.cpp:489]     aten_sin[i_1_tail_tail + 8ll] = 0.8414709568023682f;
[DEBUG llvm_codegen.cpp:489]   }
[DEBUG llvm_codegen.cpp:489] }
```

**Loop stmt after the fix:**
```bash
[DEBUG llvm_codegen.cpp:489] After HalfRewriter {
[DEBUG llvm_codegen.cpp:489]   aten_sin[Ramp(0ll, 1ll, 8)] = bfloat16(Broadcast(0.8414709568023682f, 8));
[DEBUG llvm_codegen.cpp:489]   for (int64_t i_1_tail_tail = 0ll; i_1_tail_tail < 2ll; i_1_tail_tail++) {
[DEBUG llvm_codegen.cpp:489]     aten_sin[i_1_tail_tail + 8ll] = bfloat16(0.8414709568023682f);
[DEBUG llvm_codegen.cpp:489]   }
[DEBUG llvm_codegen.cpp:489] }
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86788
Approved by: https://github.com/EikanWang, https://github.com/kit1980
---
 test/test_tensorexpr.py                  | 11 +++++++++++
 torch/csrc/jit/tensorexpr/half_support.h |  8 ++++++++
 2 files changed, 19 insertions(+)

diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py
index 20deb0a43c429..cf894f3749eb9 100644
--- a/test/test_tensorexpr.py
+++ b/test/test_tensorexpr.py
@@ -1325,6 +1325,17 @@ def do_exp(x, y, z):
             x = warmup_and_run_forward(traced, x, y, z)
             self.assertLastGraphAllFused()
 
+    def test_sin_pow(self):
+        def test(x):
+            return torch.sin(torch.pow(x, 0))
+
+        for data_type, shape in itertools.product(self.dtypes, [[3], [5], [10]]):
+            x = torch.rand(shape, dtype=data_type)
+            scripted = torch.jit.script(test)
+            out = warmup_and_run_forward(scripted, x)
+            self.assertLastGraphAllFused()
+            self.assertEqual(out, test(x))
+
     def test_transpose(self):
         @torch.jit.script
         def test(x, y, z):
diff --git a/torch/csrc/jit/tensorexpr/half_support.h b/torch/csrc/jit/tensorexpr/half_support.h
index af146a62baa09..f095c79fbb5a1 100644
--- a/torch/csrc/jit/tensorexpr/half_support.h
+++ b/torch/csrc/jit/tensorexpr/half_support.h
@@ -77,12 +77,20 @@ class HalfRewriter : public IRMutator {
     // get the dtype of the `value()` before that is mutated.
     auto newType = v->value()->dtype();
     ExprPtr new_val = v->value()->accept_mutator(this);
+    auto bufType = v->buf()->dtype();
 
     if (isHalf(newType.scalar_type())) {
       new_val = alloc<Cast>(newType, new_val);
       inserted_half_casts_.insert(new_val);
     }
 
+    // The scalar_type of value is not Half while the buf is Half
+    if (!isHalf(newType.scalar_type()) && isHalf(bufType.scalar_type())) {
+      new_val = alloc<Cast>(
+          newType.cloneWithScalarType(bufType.scalar_type()), new_val);
+      inserted_half_casts_.insert(new_val);
+    }
+
     v->set_value(new_val);
     return v;
   }

From 4c5abfb0ba542f06ba8ffadde4402dd2732a9fed Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Thu, 24 Nov 2022 01:02:28 +0100
Subject: [PATCH 1245/1922] Symintify `embedding` (#89327)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89327
Approved by: https://github.com/ezyang
---
 .github/ci_commit_pins/xla.txt                |  2 +-
 aten/src/ATen/functorch/BatchRulesModules.cpp | 12 ++++++------
 aten/src/ATen/native/Embedding.cpp            | 15 +++++++++------
 aten/src/ATen/native/native_functions.yaml    |  8 ++++----
 test/test_proxy_tensor.py                     |  1 -
 tools/autograd/derivatives.yaml               |  6 +++---
 torch/csrc/autograd/FunctionsManual.cpp       |  4 ++--
 torch/csrc/autograd/FunctionsManual.h         |  4 ++--
 8 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 41b0981f86c67..3905f11038841 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-8c2a3c41592aee25dffcf48933e7cbdc5c3fc91c
+d687b0a84269f476866638afd37db893a146387c
diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp
index 2419fde8a7a96..f51d63feaa8e0 100644
--- a/aten/src/ATen/functorch/BatchRulesModules.cpp
+++ b/aten/src/ATen/functorch/BatchRulesModules.cpp
@@ -21,16 +21,16 @@ static Tensor getStepTensor(const Tensor& indices, c10::SymInt bdim_size, c10::S
 std::tuple<Tensor,optional<int64_t>> embedding_batch_rule(
     const Tensor& weight, optional<int64_t> weight_bdim,
     const Tensor& indices, optional<int64_t> indices_bdim,
-    int64_t padding_idx, bool scale_grad_by_freq, bool sparse) {
+    c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
   if (!weight_bdim && indices_bdim) {
     // B*, ED -> B*D
-    const auto result = at::embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse);
+    const auto result = at::embedding_symint(weight, indices, padding_idx, scale_grad_by_freq, sparse);
     return std::make_tuple(result, indices_bdim);
   } else if (weight_bdim && !indices_bdim) {
     // *, BED -> *, E(BD) -> *(BD) -> *BD
     const auto batch_size = weight.size(*weight_bdim);
     const auto weight_ = reshape_dim_into(*weight_bdim, /*embedding_dim*/1, weight);
-    auto result = at::embedding(weight_, indices, padding_idx, scale_grad_by_freq, sparse);
+    auto result = at::embedding_symint(weight_, indices, padding_idx, scale_grad_by_freq, sparse);
     result = reshape_dim_outof(-1, batch_size, result);
     return std::make_tuple(result, result.dim() - 2);
   }
@@ -44,7 +44,7 @@ std::tuple<Tensor,optional<int64_t>> embedding_batch_rule(
 
   const auto range = getStepTensor(indices, batch_size, num_embeddings);
   indices_ = indices_ + range;
-  const auto result = at::embedding(weight_, indices_, padding_idx, scale_grad_by_freq, sparse);
+  const auto result = at::embedding_symint(weight_, indices_, padding_idx, scale_grad_by_freq, sparse);
   return std::make_tuple(result, 0);
 }
 
@@ -52,7 +52,7 @@ std::tuple<Tensor,optional<int64_t>>
 embedding_dense_backward_batch_rule(
     const Tensor& grad_, optional<int64_t> grad_bdim,
     const Tensor& indices_, optional<int64_t> indices_bdim,
-    c10::SymInt num_weights, int64_t padding_idx, bool scale_grad_by_freq) {
+    c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq) {
   Tensor grad = grad_;
   Tensor indices = indices_;
   if (!indices_bdim && grad_bdim) {
@@ -74,7 +74,7 @@ embedding_dense_backward_batch_rule(
   // Fill in the padding. We can't do it in the embedding_dense_backward call
   // because we need to fill in multiple rows!
   if (padding_idx >= 0) {
-    result.select(1, padding_idx).fill_(0);
+    result.select_symint(1, padding_idx).fill_(0);
   }
   return std::make_tuple(result, 0);
 }
diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp
index 5972ce0d2404c..4c37325c48171 100644
--- a/aten/src/ATen/native/Embedding.cpp
+++ b/aten/src/ATen/native/Embedding.cpp
@@ -33,8 +33,8 @@
 
 namespace at { namespace native {
 
-Tensor embedding(const Tensor & weight, const Tensor & indices,
-                 int64_t padding_idx, bool scale_grad_by_freq, bool sparse) {
+Tensor embedding_symint(const Tensor & weight, const Tensor & indices,
+                        c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
   TORCH_CHECK(weight.dim() == 2,  "'weight' must be 2-D");
   auto indices_arg = TensorArg(indices, "indices", 1);
   checkScalarTypes("embedding", indices_arg, {kLong, kInt});
@@ -53,18 +53,21 @@ Tensor embedding(const Tensor & weight, const Tensor & indices,
 }
 
 Tensor embedding_backward_symint(
-    const Tensor & grad, const Tensor & indices, SymInt num_weights,
-    int64_t padding_idx, bool scale_grad_by_freq, bool sparse) {
+    const Tensor & grad, const Tensor & indices, c10::SymInt num_weights,
+    c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
   if (sparse) {
     // TODO: if we teach sparse tensor how to propagate symints, the guard
     // here is not strictly necessary.  However, we think it is fine as is
     // because num weights is derived from a parameter and therefore
     // typically not varying.
     return at::embedding_sparse_backward(
-    grad, indices, num_weights.guard_int(__FILE__, __LINE__), padding_idx, scale_grad_by_freq);
+      grad, indices,
+      num_weights.guard_int(__FILE__, __LINE__),
+      padding_idx.guard_int(__FILE__, __LINE__),
+      scale_grad_by_freq);
   } else {
     return at::embedding_dense_backward_symint(
-        grad, indices, num_weights, padding_idx, scale_grad_by_freq);
+      grad, indices, num_weights, padding_idx, scale_grad_by_freq);
   }
 }
 
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 273d5cea85fbf..701b88e0254d2 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -2031,17 +2031,17 @@
 
 - func: einsum(str equation, Tensor[] tensors, *, int[]? path=None) -> Tensor
 
-- func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
+- func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
   dispatch:
-    CompositeExplicitAutograd: embedding
+    CompositeExplicitAutograd: embedding_symint
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_embedding
   autogen: embedding.out
 
-- func: embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
+- func: embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
   dispatch:
     CompositeImplicitAutograd: embedding_backward_symint
 
-- func: embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
+- func: embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq) -> Tensor
   dispatch:
     CPU: embedding_dense_backward_cpu
     CUDA: embedding_dense_backward_cuda
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index 60fc4f07c847e..f82848dfb1076 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -1237,7 +1237,6 @@ def f(a, b, c, d, e):
     xfail('nn.functional.cross_entropy', ''),  # aten.size.default - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.ctc_loss'),  # aten._ctc_loss.Tensor - couldn't find symbolic meta function/decomposition
     xfail('nn.functional.embedding_bag', ''),  # aten._embedding_bag_forward_only.default - couldn't find symbolic meta fun...
-    xfail('nn.functional.embedding', ''),  # argument 'size' must be tuple of ints, but found element of type tor...
     xfail('nn.functional.fractional_max_pool2d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.fractional_max_pool3d', ''),  # argument 'size' must be tuple of ints, but found element of t...
     xfail('nn.functional.grid_sample', ''),  # aten.grid_sampler_2d.default - couldn't find symbolic meta function/decompos...
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index 3500bd28ca7d0..b2627fedafc92 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1833,13 +1833,13 @@
              + binary_cross_entropy_with_logits_target_backward(target_t, self_p, target_p, weight, pos_weight, at::Reduction::None),
            reduction)"
 
-- name: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
+- name: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
   indices: non_differentiable
   weight: embedding_backward_symint(grad, indices, weight.sym_size(0), padding_idx, scale_grad_by_freq, sparse)
   result: auto_linear
 
-- name: embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
-  grad_output: embedding_dense_double_backward(grad, indices, padding_idx)
+- name: embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq) -> Tensor
+  grad_output: embedding_dense_double_backward_symint(grad, indices, padding_idx)
   indices: non_differentiable
   result: auto_linear
 
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 05fcdea3e6b7c..d1c59302b3926 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -4891,10 +4891,10 @@ Tensor constant_pad_nd_backward(const Tensor& grad, c10::SymIntArrayRef pad) {
   return at::constant_pad_nd_symint(grad, negated_pad, 0);
 }
 
-Tensor embedding_dense_double_backward(
+Tensor embedding_dense_double_backward_symint(
     const Tensor& grad,
     const Tensor& indices,
-    int64_t padding_idx) {
+    c10::SymInt padding_idx) {
   // since first backward takes care of scaling by frequency,
   // we don't need to worry about it here.
   auto gg_weight = grad.index_select(0, indices.reshape(-1));
diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h
index 4da8aa074a534..edc7dcd140f7b 100644
--- a/torch/csrc/autograd/FunctionsManual.h
+++ b/torch/csrc/autograd/FunctionsManual.h
@@ -517,10 +517,10 @@ at::Tensor sinc_backward(const at::Tensor& grad, const at::Tensor& self);
 at::Tensor sparse_constructor_values_backward(
     const at::Tensor& sparse_grad_out,
     const at::Tensor& indices);
-at::Tensor embedding_dense_double_backward(
+at::Tensor embedding_dense_double_backward_symint(
     const at::Tensor& grad,
     const at::Tensor& indices,
-    int64_t padding_idx);
+    c10::SymInt padding_idx);
 at::Tensor index_backward(
     at::Tensor zeros_like_self,
     const torch::List<c10::optional<Tensor>>& indices,

From 89ba78457fc76bfc229ab5f659e39a82ebcdfdb5 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Thu, 24 Nov 2022 03:39:55 +0000
Subject: [PATCH 1246/1922] Expose to python the backward AD view_func (#89586)

This will be useful for other systems (AOTAutograd) that want to replay autograd views.

FYI @bdhirsh
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89586
Approved by: https://github.com/soulitzer
---
 test/test_autograd.py                   | 22 ++++++++++++++++++
 torch/csrc/autograd/autograd_meta.cpp   | 10 ++++----
 torch/csrc/autograd/python_variable.cpp | 31 +++++++++++++++++++++++++
 torch/csrc/autograd/variable.h          |  5 ++++
 torch/csrc/cuda/comm.cpp                | 16 ++++++-------
 torch/overrides.py                      |  1 +
 6 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 777b790da6559..4b1e97cb3b2b5 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -7236,6 +7236,28 @@ def get_out():
             err_msg = "RuntimeError: one of the variables needed for gradient computation"
             self.assertTrue(err_msg in e.output.decode("utf-8"))
 
+    def test_view_func_replay(self):
+        def _assert_match_metadata(a, b):
+            self.assertEqual(a.size(), b.size())
+            self.assertEqual(a.stride(), b.stride())
+            self.assertEqual(a.storage_offset(), b.storage_offset())
+
+        def _test_op(fn, inp, args):
+            out = fn(inp, *args)
+            self.assertTrue(out._is_view)
+            self.assertTrue(out._base is inp)
+
+            new_inp = inp.clone()
+            _assert_match_metadata(new_inp, inp)
+            new_out = out._view_func(new_inp)
+            _assert_match_metadata(new_out, out)
+
+        _test_op(torch.select, torch.rand(2, 2), (0, 0))
+        _test_op(torch.as_strided, torch.rand(2, 2), ((4,), (1,)))
+        _test_op(torch.view_as_complex, torch.rand(2, 2), ())
+        _test_op(torch.view_as_real, torch.rand(2, 2, dtype=torch.cfloat), ())
+
+
 def index_perm_variable(shape, max_indices):
     if not isinstance(shape, tuple):
         shape = (shape,)
diff --git a/torch/csrc/autograd/autograd_meta.cpp b/torch/csrc/autograd/autograd_meta.cpp
index db00d67576d3b..d11cd68e1800a 100644
--- a/torch/csrc/autograd/autograd_meta.cpp
+++ b/torch/csrc/autograd/autograd_meta.cpp
@@ -82,7 +82,7 @@ using at::Tensor;
 // base if needed. Case 5 is handled in fw_grad by reading the forward grad from
 // the base if needed.
 
-namespace {
+namespace utils {
 
 // Enforcing that the metadata between the primal and tangent are same has two
 // goals:
@@ -139,7 +139,8 @@ bool has_same_meta(const Variable& base, const Variable& other) {
   }
   return true;
 }
-} // anonymous namespace
+
+} // namespace utils
 
 // This function is will ensure that the fw_grad_ is properly a view of the base
 // for inplace ops on Tensors that do not have forward grad originally.
@@ -219,7 +220,8 @@ void AutogradMeta::set_fw_grad(
           // Enforce same meta here to make sure that the view op below is
           // always valid
           Tensor new_base_fw_grad;
-          if (has_same_meta(new_grad, base) && has_same_meta(new_grad, self)) {
+          if (utils::has_same_meta(new_grad, base) &&
+              utils::has_same_meta(new_grad, self)) {
             // TODO extend this special case to when the underlying storage of
             // new_grad can be re-used.
             new_base_fw_grad = new_grad;
@@ -248,7 +250,7 @@ void AutogradMeta::set_fw_grad(
     }
 
     // Enforce the basic layout constraint
-    if (!has_same_meta(new_grad, self)) {
+    if (!utils::has_same_meta(new_grad, self)) {
       if (is_view_) {
         auto this_view_meta = static_cast<DifferentiableViewMeta*>(this);
         TORCH_INTERNAL_ASSERT(
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index e3ab10c7499ca..a08d6f7761fd2 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -684,6 +684,36 @@ static PyObject* THPVariable_fix_weakref(PyObject* self, PyObject* noargs) {
   Py_RETURN_NONE;
 }
 
+static PyObject* THPVariable_view_func(PyObject* self_, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  const auto& self = THPVariable_Unpack(self_);
+  TORCH_CHECK(
+      THPVariable_Check(arg),
+      "_view_func expect a single argument that is a Tensor");
+  const auto& new_base = THPVariable_Unpack(arg);
+
+  // Ensure that self is indeed a backward differentiable view
+  auto diff_view_meta = torch::autograd::impl::get_view_autograd_meta(self);
+  TORCH_CHECK(
+      diff_view_meta && diff_view_meta->has_bw_view(),
+      "_view_func can only be called on "
+      "a Tensor that is a backward differentiable view.");
+  const auto& view_info = diff_view_meta->get_backward_view();
+  // Ensure that the newly provided base is similar to the original base
+  TORCH_CHECK(
+      torch::autograd::utils::has_same_meta(new_base, view_info.base_),
+      "The new base passed to _view_func must have the same metadata as the Tensors's base");
+
+  // Do the actual view replay
+  if (view_info.has_view_fn()) {
+    return THPVariable_Wrap(view_info.view_fn()(new_base));
+  } else {
+    return THPVariable_Wrap(new_base.as_strided(
+        self.sizes(), self.strides(), self.storage_offset()));
+  }
+  END_HANDLE_TH_ERRORS
+}
+
 // Instantiates a subclass of self with the same data.
 static PyObject* THPVariable_as_subclass(
     PyObject* _self,
@@ -1645,6 +1675,7 @@ static PyMethodDef extra_methods[] = {
      METH_STATIC | METH_VARARGS | METH_KEYWORDS,
      nullptr},
     {"_fix_weakref", THPVariable_fix_weakref, METH_NOARGS, nullptr},
+    {"_view_func", THPVariable_view_func, METH_O, nullptr},
     {nullptr}};
 
 /* From https://github.com/python/cpython/blob/v3.7.0/Modules/xxsubtype.c
diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h
index 49905fe803f46..52ce34ec394d0 100644
--- a/torch/csrc/autograd/variable.h
+++ b/torch/csrc/autograd/variable.h
@@ -791,6 +791,11 @@ inline Variable make_variable(
   return Variable();
 }
 
+namespace utils {
+
+TORCH_API bool has_same_meta(const Variable& base, const Variable& other);
+
+} // namespace utils
 } // namespace autograd
 } // namespace torch
 
diff --git a/torch/csrc/cuda/comm.cpp b/torch/csrc/cuda/comm.cpp
index 117f6b571792b..e215ce0e3ed67 100644
--- a/torch/csrc/cuda/comm.cpp
+++ b/torch/csrc/cuda/comm.cpp
@@ -180,12 +180,12 @@ tensor_list2d broadcast_coalesced(
 
   unique_type_checker type_checker;
   at::cuda::CUDAGuard device_guard(devices[0]);
-  for (auto& chunk : utils::take_tensors(tensors, buffer_size)) {
+  for (auto& chunk : torch::utils::take_tensors(tensors, buffer_size)) {
     auto type_id = chunk.type_id();
     type_checker.show(type_id);
     std::vector<at::Tensor> results;
     if (chunk.options().is_sparse()) {
-      auto flat_tuple = utils::flatten_sparse_tensors(chunk.tensors);
+      auto flat_tuple = torch::utils::flatten_sparse_tensors(chunk.tensors);
       auto broadcast_indices = broadcast(flat_tuple.first, devices);
       auto broadcast_values = broadcast(flat_tuple.second, devices);
       results.reserve(devices.size());
@@ -194,20 +194,20 @@ tensor_list2d broadcast_coalesced(
         auto& device_outputs = outputs[i];
         auto& inds = broadcast_indices[i];
         auto& vals = broadcast_values[i];
-        for (const auto& var :
-             utils::unflatten_sparse_tensors(inds, vals, chunk.tensors)) {
+        for (const auto& var : torch::utils::unflatten_sparse_tensors(
+                 inds, vals, chunk.tensors)) {
           // See NOTE [ Version Counter in comm.*_coalesced ]
           device_outputs.push_back(make_variable(var.tensor_data(), false));
         }
       }
     } else {
-      auto results =
-          broadcast(utils::flatten_dense_tensors(chunk.tensors), devices);
+      auto results = broadcast(
+          torch::utils::flatten_dense_tensors(chunk.tensors), devices);
       for (size_t i = 1, num_devices = devices.size(); i < num_devices; ++i) {
         device_guard.set_index(devices[i]);
         auto& device_outputs = outputs[i];
         for (auto& var :
-             utils::unflatten_dense_tensors(results[i], chunk.tensors)) {
+             torch::utils::unflatten_dense_tensors(results[i], chunk.tensors)) {
           // See NOTE [ Version Counter in comm.*_coalesced ]
           device_outputs.push_back(make_variable(var.tensor_data(), false));
         }
@@ -218,7 +218,7 @@ tensor_list2d broadcast_coalesced(
   // If we only saw a single tensor type, then we can skip expensive reordering
   if (!type_checker.unique) {
     for (auto& o : outputs)
-      utils::reorder_tensors_like(o, tensors);
+      torch::utils::reorder_tensors_like(o, tensors);
   }
   return outputs;
 }
diff --git a/torch/overrides.py b/torch/overrides.py
index 21cfe2477bd6f..ae2b23e17d30b 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -276,6 +276,7 @@ def get_ignored_functions() -> Set[Callable]:
         Tensor._typed_storage,
         Tensor._reduce_ex_internal,
         Tensor._fix_weakref,
+        Tensor._view_func,
         Tensor._make_wrapper_subclass,
         Tensor._python_dispatch.__get__,
         Tensor._has_symbolic_sizes_strides.__get__,

From dec99e5f85546d81ce446f97ba0e1d57f662e660 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Thu, 24 Nov 2022 04:15:34 +0000
Subject: [PATCH 1247/1922] Disable optimizer tracing, enable for tests only
 (#89500)

Disabling optimizer tracing before launch until it can be added to the benchmark suites without increasing compile times

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89500
Approved by: https://github.com/anijain2305
---
 test/dynamo/test_optimizers.py |   33 +
 test/test_optim.py             | 2343 +++++++++++++++++++++++---------
 torch/_dynamo/skipfiles.py     |   11 -
 3 files changed, 1753 insertions(+), 634 deletions(-)

diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py
index 90b8cfaaad7b3..86d5d7ba6ce97 100644
--- a/test/dynamo/test_optimizers.py
+++ b/test/dynamo/test_optimizers.py
@@ -1,5 +1,6 @@
 # Owner(s): ["module: dynamo"]
 
+import contextlib
 import inspect
 import unittest
 
@@ -9,11 +10,25 @@
 import torch._dynamo.test_case
 import torch._dynamo.testing
 
+
 input = torch.ones([10, 10])
 model = torch.nn.Sequential(*[torch.nn.Linear(10, 10) for _ in range(2)])
 model(input).sum().backward()
 
 
+# Include optimizer code for tracing
+optim_filenames = set(
+    [
+        inspect.getfile(obj)
+        for obj in torch.optim.__dict__.values()
+        if inspect.isclass(obj)
+    ]
+)
+
+
+optim_filenames |= {torch.optim._functional.__file__}
+
+
 def make_test(optim_cls, exp_frame_cnt=1, closure=None, **kwargs):
     opt = optim_cls(model.parameters(), **kwargs)
 
@@ -38,10 +53,23 @@ def fn():
     return test_fn
 
 
+@contextlib.contextmanager
+def enable_optimizer_tracing():
+    try:
+        old = set(torch._dynamo.skipfiles.FILENAME_ALLOWLIST)
+
+        torch._dynamo.skipfiles.FILENAME_ALLOWLIST.update(optim_filenames)
+        yield
+    finally:
+        torch._dynamo.skipfiles.FILENAME_ALLOWLIST.clear()
+        torch._dynamo.skipfiles.FILENAME_ALLOWLIST.update(old)
+
+
 class OptimizerTests(torch._dynamo.test_case.TestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
+
         # needed until pytorch assertion is changed to enable Adam
         # to be called with capturable=True
         cls._exit_stack.enter_context(
@@ -54,6 +82,7 @@ def setUpClass(cls):
                 torch._dynamo.config, "fake_tensor_propagation", False
             )
         )
+        cls._exit_stack.enter_context(enable_optimizer_tracing())
 
     test_sgd = make_test(torch.optim.SGD, lr=0.01)
     # lgbfs has data-dependent control and internally iterates
@@ -94,6 +123,10 @@ def setUpClass(cls):
 
 
 class End2EndTests(torch._dynamo.test_case.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls._exit_stack.enter_context(enable_optimizer_tracing())
 
     # https://github.com/pytorch/torchdynamo/issues/1604
     def test_optimizing_over_tensor_with_requires_grad(self):
diff --git a/test/test_optim.py b/test/test_optim.py
index 31c5add46d9d2..b5d2d43c86ce2 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -13,13 +13,38 @@
 from torch.nn import Parameter
 from torch.optim import SGD
 from torch import sparse
-from torch.optim.lr_scheduler import LambdaLR, MultiplicativeLR, SequentialLR, StepLR, \
-    MultiStepLR, ConstantLR, LinearLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \
-    LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR, ChainedScheduler, PolynomialLR, \
-    EPOCH_DEPRECATION_WARNING
+from torch.optim.lr_scheduler import (
+    LambdaLR,
+    MultiplicativeLR,
+    SequentialLR,
+    StepLR,
+    MultiStepLR,
+    ConstantLR,
+    LinearLR,
+    ExponentialLR,
+    CosineAnnealingLR,
+    ReduceLROnPlateau,
+    LRScheduler,
+    CyclicLR,
+    CosineAnnealingWarmRestarts,
+    OneCycleLR,
+    ChainedScheduler,
+    PolynomialLR,
+    EPOCH_DEPRECATION_WARNING,
+)
 from torch.optim.swa_utils import AveragedModel, SWALR, update_bn
-from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests, \
-    parametrize, instantiate_parametrized_tests, gradcheck, skipIfRocm
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+    TEST_WITH_UBSAN,
+    load_tests,
+    parametrize,
+    instantiate_parametrized_tests,
+    gradcheck,
+    skipIfRocm,
+    skipIfTorchDynamo
+)
+
 # load_tests from common_utils is used to automatically filter tests for
 # sharding on sandcastle. This line silences flake warnings
 load_tests = load_tests
@@ -27,19 +52,24 @@
 
 def rosenbrock(tensor):
     x, y = tensor
-    return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2
+    return (1 - x) ** 2 + 100 * (y - x**2) ** 2
 
 
 def drosenbrock(tensor):
     x, y = tensor
-    return torch.tensor((-400 * x * (y - x ** 2) - 2 * (1 - x), 200 * (y - x ** 2)))
+    return torch.tensor((-400 * x * (y - x**2) - 2 * (1 - x), 200 * (y - x**2)))
 
 
 class TestOptim(TestCase):
     exact_dtype = True
 
-    def _test_rosenbrock_sparse(self, constructor, scheduler_constructors=None,
-                                sparse_only=False, maximize=False):
+    def _test_rosenbrock_sparse(
+        self,
+        constructor,
+        scheduler_constructors=None,
+        sparse_only=False,
+        maximize=False,
+    ):
         if scheduler_constructors is None:
             scheduler_constructors = []
         params_t = torch.tensor([1.5, 1.5])
@@ -69,11 +99,11 @@ def eval(params, sparse_grad, w):
             if w:
                 i = torch.LongTensor([[0, 0]])
                 x = grad[0]
-                v = torch.tensor([x / 4., x - x / 4.])
+                v = torch.tensor([x / 4.0, x - x / 4.0])
             else:
                 i = torch.LongTensor([[1, 1]])
                 y = grad[1]
-                v = torch.tensor([y - y / 4., y / 4.])
+                v = torch.tensor([y - y / 4.0, y / 4.0])
             x = sparse.DoubleTensor(i, v, torch.Size([2])).to(dtype=v.dtype)
             with torch.no_grad():
                 if sparse_grad:
@@ -100,8 +130,16 @@ def eval(params, sparse_grad, w):
         else:
             self.assertGreaterEqual(rosenbrock(params), rosenbrock(params_t))
 
-    def _test_basic_cases_template(self, weight_tensor, bias_tensor, input_tensor, constructor,
-                                   scheduler_constructors, constructor_accepts_maximize=True, constructor_accepts_foreach=False):
+    def _test_basic_cases_template(
+        self,
+        weight_tensor,
+        bias_tensor,
+        input_tensor,
+        constructor,
+        scheduler_constructors,
+        constructor_accepts_maximize=True,
+        constructor_accepts_foreach=False,
+    ):
         maximize_options = set([False, constructor_accepts_maximize])
         foreach_options = set([False, constructor_accepts_foreach])
 
@@ -109,14 +147,19 @@ def _test_basic_cases_template(self, weight_tensor, bias_tensor, input_tensor, c
         if constructor_accepts_maximize and constructor_accepts_foreach:
             pass
         elif constructor_accepts_maximize:
+
             def four_arg_constructor(weight, bias, maximize, foreach):
                 self.assertFalse(foreach)
                 return constructor(weight, bias, maximize)
+
         elif constructor_accepts_foreach:
+
             def four_arg_constructor(weight, bias, maximize, foreach):
                 self.assertFalse(maximize)
                 return constructor(weight, bias, foreach)
+
         else:
+
             def four_arg_constructor(weight, bias, maximize, foreach):
                 self.assertFalse(maximize or foreach)
                 return constructor(weight, bias)
@@ -198,31 +241,35 @@ def fn_base(optimizer, weight, bias):
         self.assertEqual(optimizer.state_dict(), optimizer_c.state_dict())
         # Make sure repeated parameters have identical representation in state dict
         optimizer_c.param_groups.extend(optimizer_c.param_groups)
-        self.assertEqual(optimizer.state_dict()['param_groups'][-1],
-                         optimizer_c.state_dict()['param_groups'][-1])
+        self.assertEqual(
+            optimizer.state_dict()["param_groups"][-1],
+            optimizer_c.state_dict()["param_groups"][-1],
+        )
 
         # Make sure that optimizers that support maximize can load older models
         state_dict = optimizer.state_dict()
-        if 'maximize' in state_dict['param_groups'][0]:
-            for group in state_dict['param_groups']:
-                del group['maximize']
+        if "maximize" in state_dict["param_groups"][0]:
+            for group in state_dict["param_groups"]:
+                del group["maximize"]
             optimizer.load_state_dict(state_dict)
             # Make sure we can still step
             optimizer.step()
         # Make sure that optimizers that support foreach can load older models
         state_dict = optimizer.state_dict()
-        if 'foreach' in state_dict['param_groups'][0]:
-            for group in state_dict['param_groups']:
-                del group['foreach']
+        if "foreach" in state_dict["param_groups"][0]:
+            for group in state_dict["param_groups"]:
+                del group["foreach"]
             optimizer.load_state_dict(state_dict)
             # Make sure we can still step
             optimizer.step()
 
         # Make sure that loading optimizers with step not wrapped in tensor can work
         state_dict = optimizer.state_dict()
-        if 'step' in state_dict['state'][0] and torch.is_tensor(state_dict['state'][0]['step']):
-            for state in state_dict['state'].values():
-                state['step'] = state['step'].item()
+        if "step" in state_dict["state"][0] and torch.is_tensor(
+            state_dict["state"][0]["step"]
+        ):
+            for state in state_dict["state"].values():
+                state["step"] = state["step"].item()
             optimizer.load_state_dict(state_dict)
             optimizer.step()
 
@@ -233,8 +280,12 @@ def fn_base(optimizer, weight, bias):
 
         with torch.no_grad():
             input_cuda = input.clone().detach().to(dtype=torch.float32, device="cuda")
-            weight_cuda = Parameter(weight.clone().detach().to(dtype=torch.float32, device="cuda"))
-            bias_cuda = Parameter(bias.clone().detach().to(dtype=torch.float32, device="cuda"))
+            weight_cuda = Parameter(
+                weight.clone().detach().to(dtype=torch.float32, device="cuda")
+            )
+            bias_cuda = Parameter(
+                bias.clone().detach().to(dtype=torch.float32, device="cuda")
+            )
         optimizer_cuda = constructor(weight_cuda, bias_cuda)
         fn_cuda = functools.partial(fn_base, optimizer_cuda, weight_cuda, bias_cuda)
 
@@ -247,9 +298,11 @@ def fn_base(optimizer, weight, bias):
 
         # Make sure that device of state['step'] is still CPU
         new_state_dict = optimizer_cuda.state_dict()
-        if 'step' in state_dict['state'][0] and torch.is_tensor(state_dict['state'][0]['step']):
-            for state in new_state_dict['state'].values():
-                self.assertEqual(state['step'].device.type, 'cpu')
+        if "step" in state_dict["state"][0] and torch.is_tensor(
+            state_dict["state"][0]["step"]
+        ):
+            for state in new_state_dict["state"].values():
+                self.assertEqual(state["step"].device.type, "cpu")
 
         for _i in range(20):
             optimizer.step(fn)
@@ -259,16 +312,26 @@ def fn_base(optimizer, weight, bias):
 
         # validate deepcopy() copies all public attributes
         def getPublicAttr(obj):
-            return set(k for k in obj.__dict__ if not k.startswith('_'))
+            return set(k for k in obj.__dict__ if not k.startswith("_"))
+
         self.assertEqual(getPublicAttr(optimizer), getPublicAttr(deepcopy(optimizer)))
 
-    def _test_basic_cases(self, constructor, scheduler_constructors=None,
-                          ignore_multidevice=False, constructor_accepts_maximize=False, constructor_accepts_foreach=False,
-                          atol=None, rtol=None):
+    def _test_basic_cases(
+        self,
+        constructor,
+        scheduler_constructors=None,
+        ignore_multidevice=False,
+        constructor_accepts_maximize=False,
+        constructor_accepts_foreach=False,
+        atol=None,
+        rtol=None,
+    ):
         if scheduler_constructors is None:
             scheduler_constructors = []
 
-        def make_two_arg_constructor(constructor, maximize: bool = False, foreach: bool = False):
+        def make_two_arg_constructor(
+            constructor, maximize: bool = False, foreach: bool = False
+        ):
             if constructor_accepts_maximize and constructor_accepts_foreach:
                 return lambda weight, bias: constructor(weight, bias, maximize, foreach)
             if constructor_accepts_maximize:
@@ -286,7 +349,8 @@ def make_two_arg_constructor(constructor, maximize: bool = False, foreach: bool
                 torch.randn(10),
                 torch.randn(5),
                 make_two_arg_constructor(constructor, maximize, foreach),
-                atol=atol, rtol=rtol
+                atol=atol,
+                rtol=rtol,
             )
         self._test_basic_cases_template(
             torch.randn(10, 5),
@@ -373,90 +437,162 @@ def _test_complex_2d(self, optimizer_constructor, f=None):
             self.assertEqual(a1.imag, a1_imag)
 
     def _build_params_dict(self, weight, bias, **kwargs):
-        return [{'params': [weight]}, dict(params=[bias], **kwargs)]
+        return [{"params": [weight]}, dict(params=[bias], **kwargs)]
 
     def _build_params_dict_single(self, weight, bias, **kwargs):
         return [dict(params=bias, **kwargs)]
 
     def test_sgd(self):
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.SGD([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.SGD([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.SGD(
                 self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3, maximize=maximize, foreach=foreach),
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+                lr=1e-3,
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.SGD(
                 self._build_params_dict_single(weight, bias, lr=1e-2),
-                lr=1e-3, maximize=maximize, foreach=foreach),
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+                lr=1e-3,
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.SGD(
-                self._build_params_dict_single(weight, bias, lr=1e-2), maximize=maximize, foreach=foreach),
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+                self._build_params_dict_single(weight, bias, lr=1e-2),
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.SGD([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
             [lambda opt: StepLR(opt, gamma=0.9, step_size=10)],
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.SGD([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
-            [lambda opt: LinearLR(opt, start_factor=0.4, end_factor=0.8, total_iters=4)],
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
+            [
+                lambda opt: LinearLR(
+                    opt, start_factor=0.4, end_factor=0.8, total_iters=4
+                )
+            ],
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.SGD([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
             [lambda opt: ConstantLR(opt, factor=0.4, total_iters=4)],
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.SGD([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
-            [lambda opt: StepLR(opt, gamma=0.9, step_size=10),
-                lambda opt: LinearLR(opt, start_factor=0.4, end_factor=0.6, total_iters=4)],
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
+            [
+                lambda opt: StepLR(opt, gamma=0.9, step_size=10),
+                lambda opt: LinearLR(
+                    opt, start_factor=0.4, end_factor=0.6, total_iters=4
+                ),
+            ],
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.SGD([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
-            [lambda opt: StepLR(opt, gamma=0.9, step_size=10),
-                lambda opt: ReduceLROnPlateau(opt)],
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
+            [
+                lambda opt: StepLR(opt, gamma=0.9, step_size=10),
+                lambda opt: ReduceLROnPlateau(opt),
+            ],
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.SGD([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
-            [lambda opt: StepLR(opt, gamma=0.99, step_size=10),
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
+            [
+                lambda opt: StepLR(opt, gamma=0.99, step_size=10),
                 lambda opt: ExponentialLR(opt, gamma=0.99),
-                lambda opt: ReduceLROnPlateau(opt)],
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+                lambda opt: ReduceLROnPlateau(opt),
+            ],
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach:
-            optim.SGD([weight, bias], lr=1e-3, momentum=0.5, maximize=maximize, foreach=foreach),
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias],
+                lr=1e-3,
+                momentum=0.5,
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach:
-            optim.SGD([weight, bias], lr=1e-3, momentum=0.5, weight_decay=1, maximize=maximize, foreach=foreach),
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias],
+                lr=1e-3,
+                momentum=0.5,
+                weight_decay=1,
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach:
-            optim.SGD([weight, bias], nesterov=True, lr=1e-3, momentum=0.5, weight_decay=1, maximize=maximize, foreach=foreach),
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias],
+                nesterov=True,
+                lr=1e-3,
+                momentum=0.5,
+                weight_decay=1,
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.SGD([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
+            lambda weight, bias, maximize, foreach: optim.SGD(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
             [lambda opt: PolynomialLR(opt, power=0.9, total_iters=4)],
-            constructor_accepts_maximize=True, constructor_accepts_foreach=True,
+            constructor_accepts_maximize=True,
+            constructor_accepts_foreach=True,
         )
         with self.assertRaisesRegex(ValueError, "Invalid momentum value: -0.5"):
             optim.SGD(None, lr=1e-2, momentum=-0.5)
@@ -468,7 +604,7 @@ def test_sgd_sparse(self):
             )
             self._test_rosenbrock_sparse(
                 lambda params: optim.SGD(params, lr=0.0048, foreach=foreach),
-                [lambda opt: StepLR(opt, gamma=0.99999, step_size=300)]
+                [lambda opt: StepLR(opt, gamma=0.99999, step_size=300)],
             )
 
     def test_sgd_complex(self):
@@ -480,13 +616,29 @@ def test_sgd_complex(self):
                 lambda param: optim.SGD([param], lr=0.001, momentum=1, foreach=foreach)
             )
             self._test_complex_optimizer(
-                lambda param: optim.SGD([param], lr=0.001, momentum=1, weight_decay=1, foreach=foreach)
+                lambda param: optim.SGD(
+                    [param], lr=0.001, momentum=1, weight_decay=1, foreach=foreach
+                )
             )
             self._test_complex_optimizer(
-                lambda param: optim.SGD([param], lr=0.001, nesterov=True, momentum=1, weight_decay=1, foreach=foreach)
+                lambda param: optim.SGD(
+                    [param],
+                    lr=0.001,
+                    nesterov=True,
+                    momentum=1,
+                    weight_decay=1,
+                    foreach=foreach,
+                )
             )
             self._test_complex_optimizer(
-                lambda param: optim.SGD([param], lr=0.001, momentum=1, dampening=0.5, weight_decay=1, foreach=foreach)
+                lambda param: optim.SGD(
+                    [param],
+                    lr=0.001,
+                    momentum=1,
+                    dampening=0.5,
+                    weight_decay=1,
+                    foreach=foreach,
+                )
             )
 
     def _test_derived_optimizers(self, optimizer_pairs_with_flags, flag):
@@ -495,21 +647,27 @@ def _test_derived_optimizers(self, optimizer_pairs_with_flags, flag):
         assert flag in ("foreach", "fused")
 
         kIterations = 4
-        device = 'cuda'
+        device = "cuda"
         for optimizer_constructor, params in optimizer_pairs_with_flags:
             res, state = [], []
             for foreach in (False, True):
-                input = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=torch.float64, device=device).reshape(3, 2)
+                input = torch.tensor(
+                    [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=torch.float64, device=device
+                ).reshape(3, 2)
 
                 torch.manual_seed(1)
-                model = torch.nn.Sequential(torch.nn.Linear(2, 3),
-                                            torch.nn.Sigmoid(),
-                                            torch.nn.Linear(3, 1),
-                                            torch.nn.Sigmoid())
+                model = torch.nn.Sequential(
+                    torch.nn.Linear(2, 3),
+                    torch.nn.Sigmoid(),
+                    torch.nn.Linear(3, 1),
+                    torch.nn.Sigmoid(),
+                )
                 model.to(dtype=torch.float64, device=device)
                 params_with_foreach = deepcopy(params)
                 params_with_foreach["foreach"] = foreach
-                optimizer = optimizer_constructor(model.parameters(), **params_with_foreach)
+                optimizer = optimizer_constructor(
+                    model.parameters(), **params_with_foreach
+                )
 
                 for _ in range(kIterations):
                     optimizer.zero_grad()
@@ -539,26 +697,36 @@ def _test_derived_optimizers(self, optimizer_pairs_with_flags, flag):
                     actual = mt_p_state[k]
                     # If `torch.optim.Adam` is `__init__`ed with either `fused=True` or `capturable=True`,
                     # `step` Tensor is 1D while usually it's 0D.
-                    if k == "step" and isinstance(actual, torch.Tensor) and actual.ndim == 1:
+                    if (
+                        k == "step"
+                        and isinstance(actual, torch.Tensor)
+                        and actual.ndim == 1
+                    ):
                         actual = actual[0]
                     self.assertEqual(st_p_state[k], actual, atol=5e-5, rtol=0)
 
     def test_multi_tensor_optimizers(self):
         optimizer_pairs_with_flags = [
-            (optim.Adam, dict(weight_decay=1., amsgrad=True)),
-            (optim.Adam, dict(weight_decay=1., amsgrad=False)),
-            (optim.Adam, dict(weight_decay=0., amsgrad=True)),
-            (optim.Adam, dict(weight_decay=0., amsgrad=False)),
-            (optim.AdamW, dict(weight_decay=1., amsgrad=True)),
-            (optim.AdamW, dict(weight_decay=1., amsgrad=False)),
-            (optim.AdamW, dict(weight_decay=0., amsgrad=True)),
-            (optim.AdamW, dict(weight_decay=0., amsgrad=False)),
-            (optim.NAdam, dict(weight_decay=0., momentum_decay=6e-3)),
-            (optim.NAdam, dict(weight_decay=1., momentum_decay=6e-3)),
-            (optim.NAdam, dict(weight_decay=0., momentum_decay=4e-3)),
+            (optim.Adam, dict(weight_decay=1.0, amsgrad=True)),
+            (optim.Adam, dict(weight_decay=1.0, amsgrad=False)),
+            (optim.Adam, dict(weight_decay=0.0, amsgrad=True)),
+            (optim.Adam, dict(weight_decay=0.0, amsgrad=False)),
+            (optim.AdamW, dict(weight_decay=1.0, amsgrad=True)),
+            (optim.AdamW, dict(weight_decay=1.0, amsgrad=False)),
+            (optim.AdamW, dict(weight_decay=0.0, amsgrad=True)),
+            (optim.AdamW, dict(weight_decay=0.0, amsgrad=False)),
+            (optim.NAdam, dict(weight_decay=0.0, momentum_decay=6e-3)),
+            (optim.NAdam, dict(weight_decay=1.0, momentum_decay=6e-3)),
+            (optim.NAdam, dict(weight_decay=0.0, momentum_decay=4e-3)),
             (optim.NAdam, dict(weight_decay=0.01, momentum_decay=4e-3)),
-            (optim.SGD, dict(lr=0.2, momentum=1, dampening=0, weight_decay=1, nesterov=True)),
-            (optim.SGD, dict(lr=0.2, momentum=1, dampening=0.5, weight_decay=1, nesterov=False)),
+            (
+                optim.SGD,
+                dict(lr=0.2, momentum=1, dampening=0, weight_decay=1, nesterov=True),
+            ),
+            (
+                optim.SGD,
+                dict(lr=0.2, momentum=1, dampening=0.5, weight_decay=1, nesterov=False),
+            ),
             (optim.RAdam, dict(weight_decay=0)),
             (optim.RAdam, dict(weight_decay=1)),
             (optim.RMSprop, dict(weight_decay=1, momentum=1, centered=True)),
@@ -579,48 +747,71 @@ def test_multi_tensor_optimizers(self):
 
     def test_fused_optimizers(self):
         optimizer_pairs_with_flags = [
-            (optim.Adam, dict(weight_decay=1., amsgrad=False)),
-            (optim.Adam, dict(weight_decay=1., amsgrad=True)),
-            (optim.Adam, dict(weight_decay=0., amsgrad=False)),
-            (optim.Adam, dict(weight_decay=0., amsgrad=True)),
+            (optim.Adam, dict(weight_decay=1.0, amsgrad=False)),
+            (optim.Adam, dict(weight_decay=1.0, amsgrad=True)),
+            (optim.Adam, dict(weight_decay=0.0, amsgrad=False)),
+            (optim.Adam, dict(weight_decay=0.0, amsgrad=True)),
         ]
         self._test_derived_optimizers(optimizer_pairs_with_flags, "fused")
 
     def test_adam(self):
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.Adam([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
+            lambda weight, bias, maximize, foreach: optim.Adam(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
-                self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3, maximize=maximize, foreach=foreach),
+                self._build_params_dict(weight, bias, lr=1e-2),
+                lr=1e-3,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
-                [weight, bias], lr=1e-3, amsgrad=True, maximize=maximize, foreach=foreach),
+                [weight, bias],
+                lr=1e-3,
+                amsgrad=True,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
-                [weight, bias], lr=1e-3, weight_decay=0.1, maximize=maximize, foreach=foreach),
+                [weight, bias],
+                lr=1e-3,
+                weight_decay=0.1,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
                 self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3, amsgrad=True, maximize=maximize, foreach=foreach),
+                lr=1e-3,
+                amsgrad=True,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
                 self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3, maximize=maximize, foreach=foreach),
+                lr=1e-3,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             [lambda opt: ExponentialLR(opt, gamma=0.9)],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
@@ -628,7 +819,10 @@ def test_adam(self):
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
                 self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3, maximize=maximize, foreach=foreach),
+                lr=1e-3,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             [lambda opt: LinearLR(opt, start_factor=0.4, total_iters=4)],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
@@ -636,33 +830,56 @@ def test_adam(self):
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
                 self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3, maximize=maximize, foreach=foreach),
+                lr=1e-3,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             [lambda opt: ConstantLR(opt, factor=0.4, total_iters=4)],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
-                [weight, bias], lr=1e-3, amsgrad=True, maximize=maximize, foreach=foreach),
-            [lambda opt: ConstantLR(opt, factor=0.4, total_iters=4),
-                lambda opt: ExponentialLR(opt, gamma=0.9)],
+                [weight, bias],
+                lr=1e-3,
+                amsgrad=True,
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            [
+                lambda opt: ConstantLR(opt, factor=0.4, total_iters=4),
+                lambda opt: ExponentialLR(opt, gamma=0.9),
+            ],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
-                [weight, bias], lr=1e-3, amsgrad=True, maximize=maximize, foreach=foreach),
-            [lambda opt: ExponentialLR(opt, gamma=0.9),
-                lambda opt: ReduceLROnPlateau(opt)],
+                [weight, bias],
+                lr=1e-3,
+                amsgrad=True,
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            [
+                lambda opt: ExponentialLR(opt, gamma=0.9),
+                lambda opt: ReduceLROnPlateau(opt),
+            ],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
                 self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3, amsgrad=True, maximize=maximize, foreach=foreach),
-            [lambda opt: StepLR(opt, gamma=0.9, step_size=10),
-                lambda opt: ReduceLROnPlateau(opt)],
+                lr=1e-3,
+                amsgrad=True,
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            [
+                lambda opt: StepLR(opt, gamma=0.9, step_size=10),
+                lambda opt: ReduceLROnPlateau(opt),
+            ],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
@@ -670,7 +887,10 @@ def test_adam(self):
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adam(
                 self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3, maximize=maximize, foreach=foreach),
+                lr=1e-3,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             [lambda opt: PolynomialLR(opt, total_iters=4, power=0.9)],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
@@ -678,7 +898,9 @@ def test_adam(self):
         self._test_complex_2d(optim.Adam)
         self._test_complex_2d(functools.partial(optim.Adam, foreach=True))
 
-        with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
+        with self.assertRaisesRegex(
+            ValueError, "Invalid beta parameter at index 0: 1.0"
+        ):
             optim.Adam(None, lr=1e-2, betas=(1.0, 0.0))
 
         with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"):
@@ -686,25 +908,42 @@ def test_adam(self):
 
     def test_adamw(self):
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.AdamW([weight, bias], lr=1e-3, maximize=maximize, foreach=foreach),
+            lambda weight, bias, maximize, foreach: optim.AdamW(
+                [weight, bias], lr=1e-3, maximize=maximize, foreach=foreach
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.AdamW(
-                self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3, maximize=maximize, foreach=foreach),
+                self._build_params_dict(weight, bias, lr=1e-2),
+                lr=1e-3,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.AdamW(
-                [weight, bias], lr=1e-3, weight_decay=1, maximize=maximize, foreach=foreach),
+                [weight, bias],
+                lr=1e-3,
+                weight_decay=1,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.AdamW(
-                [weight, bias], lr=1e-3, weight_decay=1, amsgrad=True, maximize=maximize, foreach=foreach),
+                [weight, bias],
+                lr=1e-3,
+                weight_decay=1,
+                amsgrad=True,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
@@ -715,21 +954,25 @@ def test_adamw(self):
 
     def test_sparse_adam(self):
         self._test_rosenbrock_sparse(
-            lambda params: optim.SparseAdam(params, lr=4e-2),
-            [],
-            True
+            lambda params: optim.SparseAdam(params, lr=4e-2), [], True
         )
         self._test_rosenbrock_sparse(
             lambda params: optim.SparseAdam(params, lr=4e-2, maximize=True),
             [],
             True,
-            True
+            True,
         )
-        with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
+        with self.assertRaisesRegex(
+            ValueError, "Invalid beta parameter at index 0: 1.0"
+        ):
             optim.SparseAdam(None, lr=1e-2, betas=(1.0, 0.0))
-        with self.assertRaisesRegex(ValueError, "SparseAdam requires dense parameter tensors"):
+        with self.assertRaisesRegex(
+            ValueError, "SparseAdam requires dense parameter tensors"
+        ):
             optim.SparseAdam([torch.zeros(3, layout=torch.sparse_coo)])
-        with self.assertRaisesRegex(ValueError, "SparseAdam requires dense parameter tensors"):
+        with self.assertRaisesRegex(
+            ValueError, "SparseAdam requires dense parameter tensors"
+        ):
             optim.SparseAdam([{"params": [torch.zeros(3, layout=torch.sparse_coo)]}])
 
     # ROCm precision is too low to pass this test
@@ -737,27 +980,38 @@ def test_adadelta(self):
         # Handles https://github.com/pytorch/pytorch/issues/69698
         self.rel_tol = 4e-3
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.Adadelta([weight, bias], maximize=maximize, foreach=foreach),
+            lambda weight, bias, maximize, foreach: optim.Adadelta(
+                [weight, bias], maximize=maximize, foreach=foreach
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adadelta(
-                self._build_params_dict(weight, bias, rho=0.95), maximize=maximize, foreach=foreach),
+                self._build_params_dict(weight, bias, rho=0.95),
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adadelta(
-                self._build_params_dict(weight, bias, rho=0.95), maximize=maximize, foreach=foreach),
-            [lambda opt: StepLR(opt, gamma=0.9, step_size=10),
-                lambda opt: ReduceLROnPlateau(opt)],
+                self._build_params_dict(weight, bias, rho=0.95),
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            [
+                lambda opt: StepLR(opt, gamma=0.9, step_size=10),
+                lambda opt: ReduceLROnPlateau(opt),
+            ],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adadelta(
-                [weight, bias], weight_decay=1, maximize=maximize, foreach=foreach),
+                [weight, bias], weight_decay=1, maximize=maximize, foreach=foreach
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
@@ -768,52 +1022,68 @@ def test_adadelta_complex(self):
         # Handles https://github.com/pytorch/pytorch/issues/69698
         self.rel_tol = 2e-2
         for optimizer in [optim.Adadelta]:
-            self._test_complex_optimizer(
-                lambda weight: optimizer([weight])
-            )
-            self._test_complex_optimizer(
-                lambda weight: optimizer([weight], rho=0.95)
-            )
+            self._test_complex_optimizer(lambda weight: optimizer([weight]))
+            self._test_complex_optimizer(lambda weight: optimizer([weight], rho=0.95))
             self._test_complex_optimizer(
                 lambda weight: optimizer([weight], rho=0.95, weight_decay=1)
             )
 
     def test_nadam(self):
         self._test_basic_cases(
-            lambda weight, bias, foreach: optim.NAdam([weight, bias], lr=1e-3, foreach=foreach),
+            lambda weight, bias, foreach: optim.NAdam(
+                [weight, bias], lr=1e-3, foreach=foreach
+            ),
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, foreach: optim.NAdam(
-                self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3, foreach=foreach),
+                self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3, foreach=foreach
+            ),
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, foreach: optim.NAdam(
-                [weight, bias], lr=1e-3, weight_decay=0.1, momentum_decay=6e-3, foreach=foreach),
+                [weight, bias],
+                lr=1e-3,
+                weight_decay=0.1,
+                momentum_decay=6e-3,
+                foreach=foreach,
+            ),
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, foreach: optim.NAdam(
-                [weight, bias], lr=1e-3, weight_decay=0.1, momentum_decay=6e-3, foreach=foreach),
+                [weight, bias],
+                lr=1e-3,
+                weight_decay=0.1,
+                momentum_decay=6e-3,
+                foreach=foreach,
+            ),
             [lambda opt: ExponentialLR(opt, gamma=0.9)],
             constructor_accepts_foreach=True,
         )
-        with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
+        with self.assertRaisesRegex(
+            ValueError, "Invalid beta parameter at index 0: 1.0"
+        ):
             optim.NAdam(None, lr=1e-2, betas=(1.0, 0.0))
         with self.assertRaisesRegex(ValueError, "Invalid momentum_decay value: -0.2"):
             optim.NAdam(None, lr=1e-2, momentum_decay=-0.2)
 
     def test_adagrad(self):
         self._test_basic_cases(
-            lambda weight, bias, maximize, foreach: optim.Adagrad([weight, bias], lr=1e-1, maximize=maximize, foreach=foreach),
+            lambda weight, bias, maximize, foreach: optim.Adagrad(
+                [weight, bias], lr=1e-1, maximize=maximize, foreach=foreach
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adagrad(
-                [weight, bias], lr=1e-1, initial_accumulator_value=0.1, maximize=maximize, foreach=foreach,
+                [weight, bias],
+                lr=1e-1,
+                initial_accumulator_value=0.1,
+                maximize=maximize,
+                foreach=foreach,
             ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
@@ -822,7 +1092,9 @@ def test_adagrad(self):
             lambda weight, bias, maximize, foreach: optim.Adagrad(
                 self._build_params_dict(weight, bias, lr=1e-2),
                 lr=1e-1,
-                maximize=maximize, foreach=foreach),
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
@@ -830,7 +1102,9 @@ def test_adagrad(self):
             lambda weight, bias, maximize, foreach: optim.Adagrad(
                 self._build_params_dict(weight, bias, lr=1e-2),
                 lr=1e-1,
-                maximize=maximize, foreach=foreach),
+                maximize=maximize,
+                foreach=foreach,
+            ),
             [lambda opt: ReduceLROnPlateau(opt)],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
@@ -839,9 +1113,13 @@ def test_adagrad(self):
             lambda weight, bias, maximize, foreach: optim.Adagrad(
                 self._build_params_dict(weight, bias, lr=1e-2),
                 lr=1e-1,
-                maximize=maximize, foreach=foreach),
-            [lambda opt: ReduceLROnPlateau(opt),
-                lambda opt: ExponentialLR(opt, gamma=0.99)],
+                maximize=maximize,
+                foreach=foreach,
+            ),
+            [
+                lambda opt: ReduceLROnPlateau(opt),
+                lambda opt: ExponentialLR(opt, gamma=0.99),
+            ],
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
@@ -855,8 +1133,10 @@ def test_adagrad_sparse(self):
             )
             self._test_rosenbrock_sparse(
                 lambda params: optim.Adagrad(params, lr=0.1, foreach=foreach),
-                [lambda opt: StepLR(opt, gamma=1 - 1e-5, step_size=500),
-                 lambda opt: ReduceLROnPlateau(opt, threshold=1e-4)]
+                [
+                    lambda opt: StepLR(opt, gamma=1 - 1e-5, step_size=500),
+                    lambda opt: ReduceLROnPlateau(opt, threshold=1e-4),
+                ],
             )
 
     def test_adagrad_complex(self):
@@ -866,55 +1146,81 @@ def test_adagrad_complex(self):
             )
             self._test_complex_optimizer(
                 lambda param: optim.Adagrad(
-                    [param], lr=1e-1, initial_accumulator_value=0.1, foreach=foreach,
+                    [param],
+                    lr=1e-1,
+                    initial_accumulator_value=0.1,
+                    foreach=foreach,
                 )
             )
 
     def test_adamax(self):
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adamax(
-                [weight, bias], lr=1e-1, maximize=maximize, foreach=foreach),
+                [weight, bias], lr=1e-1, maximize=maximize, foreach=foreach
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adamax(
                 self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-1, maximize=maximize, foreach=foreach),
+                lr=1e-1,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, maximize, foreach: optim.Adamax(
-                [weight, bias], lr=1e-1, weight_decay=1, maximize=maximize, foreach=foreach),
+                [weight, bias],
+                lr=1e-1,
+                weight_decay=1,
+                maximize=maximize,
+                foreach=foreach,
+            ),
             constructor_accepts_maximize=True,
             constructor_accepts_foreach=True,
         )
         self._test_complex_2d(optim.Adamax)
         self._test_complex_2d(functools.partial(optim.Adamax, foreach=True))
-        with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"):
+        with self.assertRaisesRegex(
+            ValueError, "Invalid beta parameter at index 1: 1.0"
+        ):
             optim.Adamax(None, lr=1e-2, betas=(0.0, 1.0))
 
     def test_radam(self):
         self._test_basic_cases(
-            lambda weight, bias, foreach: optim.RAdam([weight, bias], lr=1e-3, foreach=foreach),
+            lambda weight, bias, foreach: optim.RAdam(
+                [weight, bias], lr=1e-3, foreach=foreach
+            ),
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
             lambda weight, bias, foreach: optim.RAdam(
-                self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3, foreach=foreach),
+                self._build_params_dict(weight, bias, lr=1e-2), lr=1e-3, foreach=foreach
+            ),
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, foreach: optim.RAdam([weight, bias], lr=1e-3, weight_decay=0.1, foreach=foreach),
+            lambda weight, bias, foreach: optim.RAdam(
+                [weight, bias], lr=1e-3, weight_decay=0.1, foreach=foreach
+            ),
             constructor_accepts_foreach=True,
         )
         self._test_basic_cases(
-            lambda weight, bias, foreach: optim.RAdam([weight, bias], lr=1e-3, foreach=foreach),
-            [lambda opt: ExponentialLR(opt, gamma=0.9), lambda opt: ReduceLROnPlateau(opt)],
+            lambda weight, bias, foreach: optim.RAdam(
+                [weight, bias], lr=1e-3, foreach=foreach
+            ),
+            [
+                lambda opt: ExponentialLR(opt, gamma=0.9),
+                lambda opt: ReduceLROnPlateau(opt),
+            ],
             constructor_accepts_foreach=True,
         )
-        with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
+        with self.assertRaisesRegex(
+            ValueError, "Invalid beta parameter at index 0: 1.0"
+        ):
             optim.RAdam(None, lr=1e-2, betas=(1.0, 0.0))
 
         with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"):
@@ -924,53 +1230,89 @@ def test_rmsprop(self):
         for foreach in (False, True):
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.RMSprop(
-                    [weight, bias], lr=1e-2, maximize=maximize, foreach=foreach),
+                    [weight, bias], lr=1e-2, maximize=maximize, foreach=foreach
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.RMSprop(
                     self._build_params_dict(weight, bias, lr=1e-3),
-                    lr=1e-2, maximize=maximize, foreach=foreach),
+                    lr=1e-2,
+                    maximize=maximize,
+                    foreach=foreach,
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.RMSprop(
                     self._build_params_dict(weight, bias, lr=1e-3),
-                    lr=1e-2, centered=True, maximize=maximize, foreach=foreach),
+                    lr=1e-2,
+                    centered=True,
+                    maximize=maximize,
+                    foreach=foreach,
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.RMSprop(
                     self._build_params_dict(weight, bias, lr=1e-3),
-                    lr=1e-2, centered=True, momentum=0.1, maximize=maximize, foreach=foreach),
+                    lr=1e-2,
+                    centered=True,
+                    momentum=0.1,
+                    maximize=maximize,
+                    foreach=foreach,
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.RMSprop(
                     self._build_params_dict(weight, bias, lr=1e-3),
-                    lr=1e-2, momentum=0.1, maximize=maximize, foreach=foreach),
+                    lr=1e-2,
+                    momentum=0.1,
+                    maximize=maximize,
+                    foreach=foreach,
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.RMSprop(
                     self._build_params_dict(weight, bias, lr=1e-3),
-                    lr=1e-2, momentum=0.1, weight_decay=1, maximize=maximize, foreach=foreach),
+                    lr=1e-2,
+                    momentum=0.1,
+                    weight_decay=1,
+                    maximize=maximize,
+                    foreach=foreach,
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             self._test_complex_2d(lambda param: optim.RMSprop(param, foreach=foreach))
-            self._test_complex_2d(lambda param: optim.RMSprop(param, centered=True, foreach=foreach))
-            self._test_complex_2d(lambda param: optim.RMSprop(param, momentum=0.1, foreach=foreach))
-            self._test_complex_2d(lambda param: optim.RMSprop(param, maximize=True, foreach=foreach))
-            self._test_complex_optimizer(lambda param: optim.RMSprop([param], foreach=foreach))
-            self._test_complex_optimizer(lambda param: optim.RMSprop([param], centered=True, foreach=foreach))
-            self._test_complex_optimizer(lambda param: optim.RMSprop([param], momentum=0.1, foreach=foreach))
-            self._test_complex_optimizer(lambda param: optim.RMSprop([param], maximize=True, foreach=foreach))
+            self._test_complex_2d(
+                lambda param: optim.RMSprop(param, centered=True, foreach=foreach)
+            )
+            self._test_complex_2d(
+                lambda param: optim.RMSprop(param, momentum=0.1, foreach=foreach)
+            )
+            self._test_complex_2d(
+                lambda param: optim.RMSprop(param, maximize=True, foreach=foreach)
+            )
+            self._test_complex_optimizer(
+                lambda param: optim.RMSprop([param], foreach=foreach)
+            )
+            self._test_complex_optimizer(
+                lambda param: optim.RMSprop([param], centered=True, foreach=foreach)
+            )
+            self._test_complex_optimizer(
+                lambda param: optim.RMSprop([param], momentum=0.1, foreach=foreach)
+            )
+            self._test_complex_optimizer(
+                lambda param: optim.RMSprop([param], maximize=True, foreach=foreach)
+            )
             with self.assertRaisesRegex(ValueError, "Invalid momentum value: -1.0"):
                 optim.RMSprop(None, lr=1e-2, momentum=-1.0, foreach=foreach)
 
@@ -978,69 +1320,103 @@ def test_asgd(self):
         for foreach in (False, True):
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.ASGD(
-                    [weight, bias], lr=1e-3, t0=100, maximize=maximize, foreach=foreach),
+                    [weight, bias], lr=1e-3, t0=100, maximize=maximize, foreach=foreach
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.ASGD(
                     self._build_params_dict(weight, bias, lr=1e-2),
-                    lr=1e-3, t0=100, maximize=maximize, foreach=foreach),
+                    lr=1e-3,
+                    t0=100,
+                    maximize=maximize,
+                    foreach=foreach,
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.ASGD(
                     self._build_params_dict(weight, bias, lr=1e-2),
-                    lr=1e-3, weight_decay=1, maximize=maximize, foreach=foreach),
+                    lr=1e-3,
+                    weight_decay=1,
+                    maximize=maximize,
+                    foreach=foreach,
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             # Ref: https://github.com/pytorch/pytorch/issues/84560
             # self._test_complex_2d(optimizer)
-            self._test_complex_optimizer(lambda params: optim.ASGD([params], foreach=foreach))
-            self._test_complex_optimizer(lambda params: optim.ASGD([params], maximize=True, foreach=foreach))
-            self._test_complex_optimizer(lambda params: optim.ASGD([params], maximize=True, weight_decay=0.9, foreach=foreach))
-            self._test_complex_optimizer(lambda params: optim.ASGD([params], maximize=False, weight_decay=0.9, foreach=foreach))
-            self._test_complex_optimizer(lambda params: optim.ASGD([params], weight_decay=0.9, foreach=foreach))
+            self._test_complex_optimizer(
+                lambda params: optim.ASGD([params], foreach=foreach)
+            )
+            self._test_complex_optimizer(
+                lambda params: optim.ASGD([params], maximize=True, foreach=foreach)
+            )
+            self._test_complex_optimizer(
+                lambda params: optim.ASGD(
+                    [params], maximize=True, weight_decay=0.9, foreach=foreach
+                )
+            )
+            self._test_complex_optimizer(
+                lambda params: optim.ASGD(
+                    [params], maximize=False, weight_decay=0.9, foreach=foreach
+                )
+            )
+            self._test_complex_optimizer(
+                lambda params: optim.ASGD([params], weight_decay=0.9, foreach=foreach)
+            )
             with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -0.5"):
                 optim.ASGD(None, lr=1e-2, weight_decay=-0.5, foreach=foreach)
 
     @skipIfRocm
     def test_rprop(self):
-        is_cuda_sm86 = torch.cuda.is_available() and torch.cuda.get_device_capability(0) == (8, 6)
+        is_cuda_sm86 = torch.cuda.is_available() and torch.cuda.get_device_capability(
+            0
+        ) == (8, 6)
         for foreach in (False, True):
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.Rprop(
-                    [weight, bias], lr=2e-4, maximize=maximize, foreach=foreach),
+                    [weight, bias], lr=2e-4, maximize=maximize, foreach=foreach
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
             )
             self._test_basic_cases(
                 lambda weight, bias, maximize, foreach: optim.Rprop(
-                    self._build_params_dict(weight, bias, lr=1e-2), lr=2e-4, maximize=maximize, foreach=foreach),
+                    self._build_params_dict(weight, bias, lr=1e-2),
+                    lr=2e-4,
+                    maximize=maximize,
+                    foreach=foreach,
+                ),
                 constructor_accepts_maximize=True,
                 constructor_accepts_foreach=True,
-                atol=4e-5 if is_cuda_sm86 else None, rtol=3e-5 if is_cuda_sm86 else None
+                atol=4e-5 if is_cuda_sm86 else None,
+                rtol=3e-5 if is_cuda_sm86 else None,
             )
             self._test_complex_2d(lambda param: optim.Rprop(param, foreach=foreach))
             self._test_complex_optimizer(
                 lambda param: optim.Rprop([param], lr=0.001, foreach=foreach)
             )
             self._test_complex_optimizer(
-                lambda param: optim.Rprop([param], lr=0.001, maximize=True, foreach=foreach)
+                lambda param: optim.Rprop(
+                    [param], lr=0.001, maximize=True, foreach=foreach
+                )
             )
             with self.assertRaisesRegex(ValueError, "Invalid eta values: 1.0, 0.5"):
                 optim.Rprop(None, lr=1e-2, etas=(1.0, 0.5), foreach=foreach)
 
     def test_lbfgs(self):
         self._test_basic_cases(
-            lambda weight, bias: optim.LBFGS([weight, bias]),
-            ignore_multidevice=True
+            lambda weight, bias: optim.LBFGS([weight, bias]), ignore_multidevice=True
         )
         self._test_basic_cases(
-            lambda weight, bias: optim.LBFGS([weight, bias], line_search_fn="strong_wolfe"),
-            ignore_multidevice=True
+            lambda weight, bias: optim.LBFGS(
+                [weight, bias], line_search_fn="strong_wolfe"
+            ),
+            ignore_multidevice=True,
         )
 
     @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN")
@@ -1066,7 +1442,9 @@ def test_duplicate_params_in_param_group(self):
             warnings.simplefilter("always")
             optim.SGD([param, param], lr=0.1)
             self.assertEqual(len(w), 1)
-            self.assertIn('a parameter group with duplicate parameters', str(w[0].message))
+            self.assertIn(
+                "a parameter group with duplicate parameters", str(w[0].message)
+            )
 
     def test_no_grad_for_all_params(self):
         params = [torch.randn(5, 5, requires_grad=False) for _ in range(2)]
@@ -1097,13 +1475,25 @@ def test_functional_fused_adam_with_foundinf(self):
 
         num_tensors = 5
         for amsgrad in (False, True):
-            params, grads, exp_avgs, exp_avg_sqs = [[torch.ones((1,), device="cuda") for _ in range(num_tensors)] for _ in range(4)]
-            max_exp_avg_sqs = [torch.ones((1,), device="cuda") for _ in range(num_tensors)] if amsgrad else []
-            state_steps = [torch.ones((1,), dtype=torch.float32, device="cuda") for _ in range(num_tensors)]
+            params, grads, exp_avgs, exp_avg_sqs = [
+                [torch.ones((1,), device="cuda") for _ in range(num_tensors)]
+                for _ in range(4)
+            ]
+            max_exp_avg_sqs = (
+                [torch.ones((1,), device="cuda") for _ in range(num_tensors)]
+                if amsgrad
+                else []
+            )
+            state_steps = [
+                torch.ones((1,), dtype=torch.float32, device="cuda")
+                for _ in range(num_tensors)
+            ]
             grad_scale = torch.cuda.amp.grad_scaler._MultiDeviceReplicator(
-                torch.ones((1,), dtype=torch.float32, device="cuda"))
+                torch.ones((1,), dtype=torch.float32, device="cuda")
+            )
             found_inf = torch.cuda.amp.grad_scaler._MultiDeviceReplicator(
-                torch.ones((1,), dtype=torch.float32, device="cuda"))
+                torch.ones((1,), dtype=torch.float32, device="cuda")
+            )
 
             adam.adam(
                 params,
@@ -1119,7 +1509,7 @@ def test_functional_fused_adam_with_foundinf(self):
                 beta1=0.9,
                 beta2=0.99,
                 lr=1e-2,
-                weight_decay=.0,
+                weight_decay=0.0,
                 eps=1e-8,
                 maximize=False,
                 grad_scale=grad_scale,
@@ -1128,16 +1518,32 @@ def test_functional_fused_adam_with_foundinf(self):
 
             self.assertEqual(
                 state_steps,
-                [torch.ones((1,), dtype=torch.float32, device="cuda") for _ in range(num_tensors)],
+                [
+                    torch.ones((1,), dtype=torch.float32, device="cuda")
+                    for _ in range(num_tensors)
+                ],
             )
 
     def test_empty_grad(self):
-        optimizers = [torch.optim.Adadelta, torch.optim.Adagrad, torch.optim.Adam, torch.optim.AdamW,
-                      torch.optim.Adamax, torch.optim.ASGD, torch.optim.NAdam, torch.optim.RAdam,
-                      torch.optim.RMSprop, torch.optim.Rprop, torch.optim.SGD, torch.optim.SparseAdam]
+        optimizers = [
+            torch.optim.Adadelta,
+            torch.optim.Adagrad,
+            torch.optim.Adam,
+            torch.optim.AdamW,
+            torch.optim.Adamax,
+            torch.optim.ASGD,
+            torch.optim.NAdam,
+            torch.optim.RAdam,
+            torch.optim.RMSprop,
+            torch.optim.Rprop,
+            torch.optim.SGD,
+            torch.optim.SparseAdam,
+        ]
 
         for optimizer in optimizers:
-            net = torch.nn.Embedding(5, 1, padding_idx=0, sparse=optimizer is torch.optim.SparseAdam)
+            net = torch.nn.Embedding(
+                5, 1, padding_idx=0, sparse=optimizer is torch.optim.SparseAdam
+            )
             original_params = (param.detach().clone() for param in net.parameters())
             # Simulate a batch that only indexes the embedding at padding_idx
             x = torch.tensor([[0, 0]]).int()
@@ -1152,7 +1558,6 @@ def test_empty_grad(self):
                 self.assertEqual(original_param, param)
 
 
-
 class SchedulerTestNet(torch.nn.Module):
     def __init__(self):
         super(SchedulerTestNet, self).__init__()
@@ -1184,8 +1589,12 @@ def setUp(self):
         super(TestLRScheduler, self).setUp()
         self.net = SchedulerTestNet()
         self.opt = SGD(
-            [{'params': self.net.conv1.parameters()}, {'params': self.net.conv2.parameters(), 'lr': 0.5}],
-            lr=0.05)
+            [
+                {"params": self.net.conv1.parameters()},
+                {"params": self.net.conv2.parameters(), "lr": 0.5},
+            ],
+            lr=0.05,
+        )
 
     def _check_warning_is_epoch_deprecation_warning(self, w, *, num_warnings: int = 1):
         """This function swallows the epoch deprecation warning which is produced when we
@@ -1201,23 +1610,30 @@ def _check_warning_is_epoch_deprecation_warning(self, w, *, num_warnings: int =
     def test_error_when_getlr_has_epoch(self):
         class MultiStepLR(torch.optim.lr_scheduler.LRScheduler):
             def __init__(self, optimizer, gamma, milestones, last_epoch=-1):
-                self.init_lr = [group['lr'] for group in optimizer.param_groups]
+                self.init_lr = [group["lr"] for group in optimizer.param_groups]
                 self.gamma = gamma
                 self.milestones = milestones
                 super().__init__(optimizer, last_epoch)
 
             def get_lr(self, step):
                 global_step = self.last_epoch
-                gamma_power = ([0] + [i + 1 for i, m in enumerate(self.milestones) if global_step >= m])[-1]
-                return [init_lr * (self.gamma ** gamma_power) for init_lr in self.init_lr]
+                gamma_power = (
+                    [0]
+                    + [i + 1 for i, m in enumerate(self.milestones) if global_step >= m]
+                )[-1]
+                return [
+                    init_lr * (self.gamma**gamma_power) for init_lr in self.init_lr
+                ]
 
         optimizer = torch.optim.SGD([torch.rand(1)], lr=1)
 
         with self.assertRaises(TypeError):
             scheduler = MultiStepLR(optimizer, gamma=1, milestones=[10, 20])
 
+    @skipIfTorchDynamo("Torchdynamo keeps references to optim in the guards and the stack of the graph break frames")
     def test_no_cyclic_references(self):
         import gc
+
         param = Parameter(torch.empty(10))
         optim = SGD([param], lr=0.5)
         scheduler = LambdaLR(optim, lambda epoch: 1.0)
@@ -1225,23 +1641,29 @@ def test_no_cyclic_references(self):
 
         # Prior to Python 3.7, local variables in a function will be referred by the current frame.
         import sys
+
         if sys.version_info < (3, 7):
             import inspect
+
             referrers = gc.get_referrers(optim)
             self.assertTrue(
                 len(referrers) == 1 and referrers[0] is inspect.currentframe(),
-                "Optimizer should contain no cyclic references (except current frame)")
+                "Optimizer should contain no cyclic references (except current frame)",
+            )
             del referrers
         else:
             self.assertTrue(
                 len(gc.get_referrers(optim)) == 0,
-                "Optimizer should contain no cyclic references")
+                "Optimizer should contain no cyclic references",
+            )
 
         gc.collect()
         del optim
         self.assertEqual(
-            gc.collect(), 0, msg="Optimizer should be garbage-collected on __del__")
+            gc.collect(), 0, msg="Optimizer should be garbage-collected on __del__"
+        )
 
+    @skipIfTorchDynamo("Torchdynamo keeps references to optim in the guards and the stack of the graph break frames")
     def test_no_cyclic_references_in_step(self):
         import gc
         import weakref
@@ -1261,6 +1683,7 @@ def run():
         # automatically collect unreachable objects.
         gc.disable()
         ref = run()
+
         assert ref() is None
         gc.enable()  # restore
 
@@ -1276,7 +1699,7 @@ def old_pattern():
                 scheduler.step()
                 self.opt.step()
 
-        self.assertWarnsRegex(UserWarning, r'how-to-adjust-learning-rate', old_pattern)
+        self.assertWarnsRegex(UserWarning, r"how-to-adjust-learning-rate", old_pattern)
 
     def test_old_pattern_warning_with_arg(self):
         epochs = 35
@@ -1290,12 +1713,12 @@ def old_pattern2():
                 scheduler.step()
                 self.opt.step()
 
-        self.assertWarnsRegex(UserWarning, r'how-to-adjust-learning-rate', old_pattern2)
+        self.assertWarnsRegex(UserWarning, r"how-to-adjust-learning-rate", old_pattern2)
 
     def test_old_pattern_warning_resuming(self):
         epochs = 35
         for i, group in enumerate(self.opt.param_groups):
-            group['initial_lr'] = 0.01
+            group["initial_lr"] = 0.01
 
         with warnings.catch_warnings(record=True) as ws:
             warnings.simplefilter("always")  # allow any warning to be raised
@@ -1307,12 +1730,12 @@ def old_pattern():
                 scheduler.step()
                 self.opt.step()
 
-        self.assertWarnsRegex(UserWarning, r'how-to-adjust-learning-rate', old_pattern)
+        self.assertWarnsRegex(UserWarning, r"how-to-adjust-learning-rate", old_pattern)
 
     def test_old_pattern_warning_resuming_with_arg(self):
         epochs = 35
         for i, group in enumerate(self.opt.param_groups):
-            group['initial_lr'] = 0.01
+            group["initial_lr"] = 0.01
 
         with warnings.catch_warnings(record=True) as ws:
             warnings.simplefilter("always")  # allow any warning to be raised
@@ -1324,12 +1747,12 @@ def old_pattern2():
                 scheduler.step()
                 self.opt.step()
 
-        self.assertWarnsRegex(UserWarning, r'how-to-adjust-learning-rate', old_pattern2)
+        self.assertWarnsRegex(UserWarning, r"how-to-adjust-learning-rate", old_pattern2)
 
     def test_old_pattern_warning_with_overridden_optim_step(self):
         epochs = 35
         for i, group in enumerate(self.opt.param_groups):
-            group['initial_lr'] = 0.01
+            group["initial_lr"] = 0.01
 
         with warnings.catch_warnings(record=True) as ws:
             warnings.simplefilter("always")  # allow any warning to be raised
@@ -1352,7 +1775,7 @@ def old_pattern2():
                 scheduler.step()
                 self.opt.step()
 
-        self.assertWarnsRegex(UserWarning, r'how-to-adjust-learning-rate', old_pattern2)
+        self.assertWarnsRegex(UserWarning, r"how-to-adjust-learning-rate", old_pattern2)
 
     def test_new_pattern_no_warning(self):
         epochs = 35
@@ -1405,7 +1828,9 @@ def new_pattern():
                 self.opt.step()
                 scheduler.step()
 
-        self.assertWarnsRegex(UserWarning, r'`optimizer.step\(\)` has been overridden', new_pattern)
+        self.assertWarnsRegex(
+            UserWarning, r"`optimizer.step\(\)` has been overridden", new_pattern
+        )
 
     def _test_lr_is_constant_for_constant_epoch(self, scheduler):
         l = []
@@ -1416,7 +1841,7 @@ def _test_lr_is_constant_for_constant_epoch(self, scheduler):
                 scheduler.step(2)
                 self._check_warning_is_epoch_deprecation_warning(w)
 
-            l.append(self.opt.param_groups[0]['lr'])
+            l.append(self.opt.param_groups[0]["lr"])
         self.assertEqual(min(l), max(l))
 
     def test_step_lr_is_constant_for_constant_epoch(self):
@@ -1451,8 +1876,11 @@ def test_step_lr(self):
 
     def test_get_last_lr_step_lr(self):
         from torch.nn import Parameter
+
         epochs = 10
-        optimizer = torch.optim.SGD([Parameter(torch.randn(2, 2, requires_grad=True))], 0.1)
+        optimizer = torch.optim.SGD(
+            [Parameter(torch.randn(2, 2, requires_grad=True))], 0.1
+        )
         targets = [[0.1] * 3 + [0.01] * 3 + [0.001] * 3 + [0.0001]]
         scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3, gamma=0.1)
         self._test_get_last_lr(scheduler, targets, epochs)
@@ -1507,12 +1935,21 @@ def test_get_last_lr_linearlr(self):
         # lr = 0.005     if 4 <= epoch
         epochs = 10
         start_factor = 1.0 / 4
-        end_factor = 3. / 5
+        end_factor = 3.0 / 5
         iters = 4
-        interpolation = [start_factor + i * (end_factor - start_factor) / iters for i in range(iters)]
-        single_targets = [x * 0.05 for x in interpolation] + [0.05 * end_factor] * (epochs - iters)
+        interpolation = [
+            start_factor + i * (end_factor - start_factor) / iters for i in range(iters)
+        ]
+        single_targets = [x * 0.05 for x in interpolation] + [0.05 * end_factor] * (
+            epochs - iters
+        )
         targets = [single_targets, [x * epochs for x in single_targets]]
-        scheduler = LinearLR(self.opt, start_factor=start_factor, end_factor=end_factor, total_iters=iters)
+        scheduler = LinearLR(
+            self.opt,
+            start_factor=start_factor,
+            end_factor=end_factor,
+            total_iters=iters,
+        )
         self._test_get_last_lr(scheduler, targets, epochs)
 
     def test_constantlr(self):
@@ -1533,14 +1970,16 @@ def test_linearlr(self):
         epochs = 10
         start_factor = 1.0 / 2
         iters = 4
-        interpolation = [start_factor + i * (1 - start_factor) / iters for i in range(iters)]
+        interpolation = [
+            start_factor + i * (1 - start_factor) / iters for i in range(iters)
+        ]
         single_targets = [x * 0.05 for x in interpolation] + [0.05] * (epochs - iters)
         targets = [single_targets, [x * epochs for x in single_targets]]
         scheduler = LinearLR(self.opt, start_factor=start_factor, total_iters=iters)
         self._test(scheduler, targets, epochs)
 
     def test_linearlr_start_factor_limits1(self):
-        start_factor = 0.
+        start_factor = 0.0
         iters = 4
         with self.assertRaises(ValueError):
             LinearLR(self.opt, start_factor=start_factor, total_iters=iters)
@@ -1568,9 +2007,11 @@ def test_linearlr_with_epoch(self):
         # lr = 0.005     if 4 <= epoch
         epochs = 10
         start_factor = 1.0 / 2
-        end_factor = 1.
+        end_factor = 1.0
         iters = 4
-        interpolation = [start_factor + i * (end_factor - start_factor) / iters for i in range(iters)]
+        interpolation = [
+            start_factor + i * (end_factor - start_factor) / iters for i in range(iters)
+        ]
         single_targets = [x * 0.05 for x in interpolation] + [0.05] * (epochs - iters)
         targets = [single_targets, [x * epochs for x in single_targets]]
         scheduler = LinearLR(self.opt, start_factor=start_factor, total_iters=iters)
@@ -1578,7 +2019,7 @@ def test_linearlr_with_epoch(self):
 
     def test_exp_lr(self):
         epochs = 10
-        single_targets = [0.05 * (0.9 ** x) for x in range(epochs)]
+        single_targets = [0.05 * (0.9**x) for x in range(epochs)]
         targets = [single_targets, [x * epochs for x in single_targets]]
         scheduler = ExponentialLR(self.opt, gamma=0.9)
         self._test(scheduler, targets, epochs)
@@ -1587,7 +2028,9 @@ def test_poly_lr(self):
         epochs = 10
         power = 0.9
         total_iters = 5
-        single_targets = [(1.0 - x / total_iters) ** power * 0.05 for x in range(total_iters)] + [0.0] * (epochs - total_iters)
+        single_targets = [
+            (1.0 - x / total_iters) ** power * 0.05 for x in range(total_iters)
+        ] + [0.0] * (epochs - total_iters)
         targets = [single_targets, [x * epochs for x in single_targets]]
         scheduler = PolynomialLR(self.opt, power=power, total_iters=total_iters)
         self._test(scheduler, targets, epochs)
@@ -1595,9 +2038,10 @@ def test_poly_lr(self):
     def test_cos_anneal_lr(self):
         epochs = 10
         eta_min = 1e-10
-        single_targets = [eta_min + (0.05 - eta_min) *
-                          (1 + math.cos(math.pi * x / epochs)) / 2
-                          for x in range(epochs)]
+        single_targets = [
+            eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * x / epochs)) / 2
+            for x in range(epochs)
+        ]
         targets = [single_targets, [x * epochs for x in single_targets]]
         scheduler = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
         self._test(scheduler, targets, epochs)
@@ -1608,8 +2052,12 @@ def test_closed_form_step_lr(self):
         self._test_against_closed_form(scheduler, closed_form_scheduler, 20)
 
     def test_closed_form_linearlr(self):
-        scheduler = LinearLR(self.opt, start_factor=1.0 / 3, end_factor=0.7, total_iters=4)
-        closed_form_scheduler = LinearLR(self.opt, start_factor=1.0 / 3, end_factor=0.7, total_iters=4)
+        scheduler = LinearLR(
+            self.opt, start_factor=1.0 / 3, end_factor=0.7, total_iters=4
+        )
+        closed_form_scheduler = LinearLR(
+            self.opt, start_factor=1.0 / 3, end_factor=0.7, total_iters=4
+        )
         self._test_against_closed_form(scheduler, closed_form_scheduler, 20)
 
     def test_closed_form_constantlr(self):
@@ -1637,7 +2085,9 @@ def test_closed_form_cos_anneal_lr(self):
         epochs = 20
         T_max = 5
         scheduler = CosineAnnealingLR(self.opt, T_max=T_max, eta_min=eta_min)
-        closed_form_scheduler = CosineAnnealingLR(self.opt, T_max=T_max, eta_min=eta_min)
+        closed_form_scheduler = CosineAnnealingLR(
+            self.opt, T_max=T_max, eta_min=eta_min
+        )
         self._test_against_closed_form(scheduler, closed_form_scheduler, epochs)
 
     def test_cos_anneal_lr_continue(self):
@@ -1648,97 +2098,135 @@ def test_cos_anneal_lr_continue(self):
         scheduler.step()
         original_lrs = scheduler._last_lr
         new_scheduler = CosineAnnealingLR(
-            self.opt, T_max=T_max, eta_min=eta_min, last_epoch=0)
+            self.opt, T_max=T_max, eta_min=eta_min, last_epoch=0
+        )
         new_lrs = new_scheduler._last_lr
         torch.testing.assert_close(original_lrs, new_lrs, rtol=1e-4, atol=1e-5)
 
     def test_reduce_lr_on_plateau1(self):
         epochs = 10
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         targets = [[0.5] * 20]
         metrics = [10 - i * 0.0167 for i in range(20)]
-        scheduler = ReduceLROnPlateau(self.opt, threshold_mode='abs', mode='min',
-                                      threshold=0.01, patience=5, cooldown=5)
+        scheduler = ReduceLROnPlateau(
+            self.opt,
+            threshold_mode="abs",
+            mode="min",
+            threshold=0.01,
+            patience=5,
+            cooldown=5,
+        )
         self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
 
     def test_reduce_lr_on_plateau2(self):
         epochs = 22
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         targets = [[0.5] * 6 + [0.05] * 7 + [0.005] * 7 + [0.0005] * 2]
         metrics = [10 - i * 0.0165 for i in range(22)]
-        scheduler = ReduceLROnPlateau(self.opt, patience=5, cooldown=0, threshold_mode='abs',
-                                      mode='min', threshold=0.1)
+        scheduler = ReduceLROnPlateau(
+            self.opt,
+            patience=5,
+            cooldown=0,
+            threshold_mode="abs",
+            mode="min",
+            threshold=0.1,
+        )
         self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
 
     def test_reduce_lr_on_plateau3(self):
         epochs = 22
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         targets = [[0.5] * (2 + 6) + [0.05] * (5 + 6) + [0.005] * 4]
         metrics = [-0.8] * 2 + [-0.234] * 20
-        scheduler = ReduceLROnPlateau(self.opt, mode='max', patience=5, cooldown=5,
-                                      threshold_mode='abs')
+        scheduler = ReduceLROnPlateau(
+            self.opt, mode="max", patience=5, cooldown=5, threshold_mode="abs"
+        )
         self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
 
     def test_reduce_lr_on_plateau4(self):
         epochs = 20
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         targets = [[0.5] * 20]
-        metrics = [1.5 * (1.025 ** i) for i in range(20)]  # 1.025 > 1.1**0.25
-        scheduler = ReduceLROnPlateau(self.opt, mode='max', patience=3,
-                                      threshold_mode='rel', threshold=0.1)
+        metrics = [1.5 * (1.025**i) for i in range(20)]  # 1.025 > 1.1**0.25
+        scheduler = ReduceLROnPlateau(
+            self.opt, mode="max", patience=3, threshold_mode="rel", threshold=0.1
+        )
         self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
 
     def test_reduce_lr_on_plateau5(self):
         epochs = 20
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         targets = [[0.5] * 6 + [0.05] * (5 + 6) + [0.005] * 4]
-        metrics = [1.5 * (1.005 ** i) for i in range(20)]
-        scheduler = ReduceLROnPlateau(self.opt, mode='max', threshold_mode='rel',
-                                      threshold=0.1, patience=5, cooldown=5)
+        metrics = [1.5 * (1.005**i) for i in range(20)]
+        scheduler = ReduceLROnPlateau(
+            self.opt,
+            mode="max",
+            threshold_mode="rel",
+            threshold=0.1,
+            patience=5,
+            cooldown=5,
+        )
         self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
 
     def test_reduce_lr_on_plateau6(self):
         epochs = 20
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         targets = [[0.5] * 20]
-        metrics = [1.5 * (0.85 ** i) for i in range(20)]
-        scheduler = ReduceLROnPlateau(self.opt, mode='min', threshold_mode='rel',
-                                      threshold=0.1)
+        metrics = [1.5 * (0.85**i) for i in range(20)]
+        scheduler = ReduceLROnPlateau(
+            self.opt, mode="min", threshold_mode="rel", threshold=0.1
+        )
         self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
 
     def test_reduce_lr_on_plateau7(self):
         epochs = 20
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         targets = [[0.5] * 6 + [0.05] * (5 + 6) + [0.005] * 4]
         metrics = [1] * 7 + [0.6] + [0.5] * 12
-        scheduler = ReduceLROnPlateau(self.opt, mode='min', threshold_mode='rel',
-                                      threshold=0.1, patience=5, cooldown=5)
+        scheduler = ReduceLROnPlateau(
+            self.opt,
+            mode="min",
+            threshold_mode="rel",
+            threshold=0.1,
+            patience=5,
+            cooldown=5,
+        )
         self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
 
     def test_reduce_lr_on_plateau8(self):
         epochs = 20
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         targets = [[0.5] * 6 + [0.4] * 14, [0.5] * 6 + [0.3] * 14]
-        metrics = [1.5 * (1.005 ** i) for i in range(20)]
-        scheduler = ReduceLROnPlateau(self.opt, mode='max', threshold_mode='rel', min_lr=[0.4, 0.3],
-                                      threshold=0.1, patience=5, cooldown=5)
+        metrics = [1.5 * (1.005**i) for i in range(20)]
+        scheduler = ReduceLROnPlateau(
+            self.opt,
+            mode="max",
+            threshold_mode="rel",
+            min_lr=[0.4, 0.3],
+            threshold=0.1,
+            patience=5,
+            cooldown=5,
+        )
         self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
 
     def test_sequentiallr1(self):
         epochs = 19
         schedulers = [None] * 2
-        targets = [[0.05, 0.04, 0.032] + [0.05 for x in range(4)]
-                                       + [0.05 * 0.1 for x in range(4)]
-                                       + [0.05 * 0.01 for x in range(4)]
-                                       + [0.05 * 0.001 for x in range(4)]]
+        targets = [
+            [0.05, 0.04, 0.032]
+            + [0.05 for x in range(4)]
+            + [0.05 * 0.1 for x in range(4)]
+            + [0.05 * 0.01 for x in range(4)]
+            + [0.05 * 0.001 for x in range(4)]
+        ]
         milestones = [3]
         schedulers[0] = ExponentialLR(self.opt, gamma=0.8)
         schedulers[1] = StepLR(self.opt, gamma=0.1, step_size=4)
@@ -1748,7 +2236,7 @@ def test_sequentiallr1(self):
     def test_sequentiallr2(self):
         epochs = 13
         schedulers = [None] * 2
-        targets = [[0.005, 0.005, 0.005] + [0.05 * 0.9 ** x for x in range(10)]]
+        targets = [[0.005, 0.005, 0.005] + [0.05 * 0.9**x for x in range(10)]]
         milestones = [3]
         schedulers[0] = ConstantLR(self.opt, factor=0.1, total_iters=3)
         schedulers[1] = ExponentialLR(self.opt, gamma=0.9)
@@ -1758,8 +2246,11 @@ def test_sequentiallr2(self):
     def test_sequentiallr3(self):
         epochs = 12
         schedulers = [None] * 3
-        targets = [[0.005, 0.005, 0.005] + [0.05, 0.04, 0.032]
-                                         + [0.05, 0.05, 0.005, 0.005, 0.0005, 0.0005]]
+        targets = [
+            [0.005, 0.005, 0.005]
+            + [0.05, 0.04, 0.032]
+            + [0.05, 0.05, 0.005, 0.005, 0.0005, 0.0005]
+        ]
         milestones = [3, 6]
         schedulers[0] = ConstantLR(self.opt, factor=0.1, total_iters=3)
         schedulers[1] = ExponentialLR(self.opt, gamma=0.8)
@@ -1773,9 +2264,11 @@ def test_sequentiallr4(self):
 
         schedulers = [
             torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1),
-            torch.optim.lr_scheduler.ConstantLR(optimizer, factor=0.1)
+            torch.optim.lr_scheduler.ConstantLR(optimizer, factor=0.1),
         ]
-        scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers, milestones=[10])
+        scheduler = torch.optim.lr_scheduler.SequentialLR(
+            optimizer, schedulers, milestones=[10]
+        )
 
         new_lr = optimizer.param_groups[0]["lr"]
 
@@ -1800,7 +2293,7 @@ def test_get_last_lr_sequentiallr(self):
     def test_chained_lr2_get_last_lr_before_step(self):
         schedulers = [
             LinearLR(self.opt, start_factor=0.4, total_iters=3),
-            MultiStepLR(self.opt, milestones=[4, 8, 10], gamma=0.1)
+            MultiStepLR(self.opt, milestones=[4, 8, 10], gamma=0.1),
         ]
         scheduler = ChainedScheduler(schedulers)
         self.assertEqual(scheduler.get_last_lr(), schedulers[-1].get_last_lr())
@@ -1826,7 +2319,9 @@ def test_chained_lr2(self):
     def test_chained_lr3(self):
         epochs = 10
         schedulers = [None] * 2
-        targets = [[0.02, 0.03, 0.04, 0.05] + [0.005] * 4 + [0.0005] * 3 + [0.00005] * 3]
+        targets = [
+            [0.02, 0.03, 0.04, 0.05] + [0.005] * 4 + [0.0005] * 3 + [0.00005] * 3
+        ]
         schedulers[0] = LinearLR(self.opt, start_factor=0.4, total_iters=3)
         schedulers[1] = MultiStepLR(self.opt, milestones=[4, 8, 10], gamma=0.1)
         scheduler = ChainedScheduler(schedulers)
@@ -1836,10 +2331,12 @@ def test_chained_lr3(self):
     def test_chained_lr4(self):
         epochs = 9
         schedulers = [None] * 3
-        targets = [[0.05 * 0.2 * 0.9 ** x for x in range(3)]
-                   + [0.05 * 0.2 * 0.9 ** 3 * 0.1]
-                   + [0.05 * 0.9 ** x * 0.1 for x in range(4, 6)]
-                   + [0.05 * 0.9 ** x * 0.01 for x in range(6, 9)]]
+        targets = [
+            [0.05 * 0.2 * 0.9**x for x in range(3)]
+            + [0.05 * 0.2 * 0.9**3 * 0.1]
+            + [0.05 * 0.9**x * 0.1 for x in range(4, 6)]
+            + [0.05 * 0.9**x * 0.01 for x in range(6, 9)]
+        ]
         schedulers[0] = ExponentialLR(self.opt, gamma=0.9)
         schedulers[1] = ConstantLR(self.opt, factor=0.2, total_iters=4)
         schedulers[2] = StepLR(self.opt, gamma=0.1, step_size=3)
@@ -1877,10 +2374,10 @@ def test_compound_step_and_multistep_lr(self):
     def test_compound_step_and_exp_lr(self):
         epochs = 10
         schedulers = [None] * 2
-        single_targets = [0.05 * (0.9 ** x) for x in range(3)]
-        single_targets += [0.005 * (0.9 ** x) for x in range(3, 6)]
-        single_targets += [0.0005 * (0.9 ** x) for x in range(6, 9)]
-        single_targets += [0.00005 * (0.9 ** x) for x in range(9, 12)]
+        single_targets = [0.05 * (0.9**x) for x in range(3)]
+        single_targets += [0.005 * (0.9**x) for x in range(3, 6)]
+        single_targets += [0.0005 * (0.9**x) for x in range(6, 9)]
+        single_targets += [0.00005 * (0.9**x) for x in range(9, 12)]
         targets = [single_targets, [x * epochs for x in single_targets]]
         schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3)
         schedulers[1] = ExponentialLR(self.opt, gamma=0.9)
@@ -1889,10 +2386,10 @@ def test_compound_step_and_exp_lr(self):
     def test_compound_exp_and_multistep_lr(self):
         epochs = 10
         schedulers = [None] * 2
-        single_targets = [0.05 * (0.9 ** x) for x in range(2)]
-        single_targets += [0.005 * (0.9 ** x) for x in range(2, 5)]
-        single_targets += [0.0005 * (0.9 ** x) for x in range(5, 9)]
-        single_targets += [0.00005 * (0.9 ** x) for x in range(9, 11)]
+        single_targets = [0.05 * (0.9**x) for x in range(2)]
+        single_targets += [0.005 * (0.9**x) for x in range(2, 5)]
+        single_targets += [0.0005 * (0.9**x) for x in range(5, 9)]
+        single_targets += [0.00005 * (0.9**x) for x in range(9, 11)]
         targets = [single_targets, [x * epochs for x in single_targets]]
         schedulers[0] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
         schedulers[1] = ExponentialLR(self.opt, gamma=0.9)
@@ -1904,13 +2401,18 @@ def test_compound_exp_and_linearlr(self):
         start_factor = 0.4
         end_factor = 0.9
         schedulers = [None] * 2
-        single_targets = [0.05 * (0.9 ** x) for x in range(11)]
+        single_targets = [0.05 * (0.9**x) for x in range(11)]
         for i in range(iters):
             single_targets[i] *= start_factor + i / iters * (end_factor - start_factor)
         for i in range(iters, 11):
             single_targets[i] *= end_factor
         targets = [single_targets, [x * epochs for x in single_targets]]
-        schedulers[0] = LinearLR(self.opt, start_factor=start_factor, end_factor=end_factor, total_iters=iters)
+        schedulers[0] = LinearLR(
+            self.opt,
+            start_factor=start_factor,
+            end_factor=end_factor,
+            total_iters=iters,
+        )
         schedulers[1] = ExponentialLR(self.opt, gamma=0.9)
         self._test(schedulers, targets, epochs)
 
@@ -1919,7 +2421,13 @@ def test_compound_step_and_constantlr(self):
         iters = 4
         factor = 0.4
         schedulers = [None] * 2
-        single_targets = [0.05 * 0.4] * 3 + [0.005 * 0.4] + [0.005] * 2 + [0.0005] * 3 + [0.00005] * 3
+        single_targets = (
+            [0.05 * 0.4] * 3
+            + [0.005 * 0.4]
+            + [0.005] * 2
+            + [0.0005] * 3
+            + [0.00005] * 3
+        )
         targets = [single_targets, [x * epochs for x in single_targets]]
         schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3)
         schedulers[1] = ConstantLR(self.opt, factor=0.4, total_iters=4)
@@ -1941,9 +2449,10 @@ def test_compound_linearlr_and_multistep_lr(self):
     def test_compound_cosanneal_and_step_lr(self):
         epochs = 10
         eta_min = 1e-10
-        single_targets = [eta_min + (0.05 - eta_min) *
-                          (1 + math.cos(math.pi * x / epochs)) / 2
-                          for x in range(epochs)]
+        single_targets = [
+            eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * x / epochs)) / 2
+            for x in range(epochs)
+        ]
         single_targets = [x * 0.1 ** (i // 3) for i, x in enumerate(single_targets)]
         targets = [single_targets, [x * epochs for x in single_targets]]
         schedulers = [None] * 2
@@ -1954,9 +2463,10 @@ def test_compound_cosanneal_and_step_lr(self):
     def test_compound_cosanneal_and_multistep_lr(self):
         epochs = 10
         eta_min = 1e-10
-        single_targets = [eta_min + (0.05 - eta_min) *
-                          (1 + math.cos(math.pi * x / epochs)) / 2
-                          for x in range(epochs)]
+        single_targets = [
+            eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * x / epochs)) / 2
+            for x in range(epochs)
+        ]
         multipliers = [1] * 2 + [0.1] * 3 + [0.01] * 4 + [0.001]
         single_targets = [x * y for x, y in zip(single_targets, multipliers)]
         targets = [single_targets, [x * epochs for x in single_targets]]
@@ -1971,9 +2481,10 @@ def test_compound_cosanneal_and_linearlr(self):
         start_factor = 0.4
         eta_min = 1e-10
         schedulers = [None] * 2
-        single_targets = [eta_min + (0.05 - eta_min) *
-                          (1 + math.cos(math.pi * x / epochs)) / 2
-                          for x in range(epochs)]
+        single_targets = [
+            eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * x / epochs)) / 2
+            for x in range(epochs)
+        ]
         for i in range(iters):
             single_targets[i] *= start_factor + i / iters * (1 - start_factor)
         targets = [single_targets, [x * epochs for x in single_targets]]
@@ -1984,10 +2495,11 @@ def test_compound_cosanneal_and_linearlr(self):
     def test_compound_cosanneal_and_exp_lr(self):
         epochs = 10
         eta_min = 1e-10
-        single_targets = [eta_min + (0.05 - eta_min) *
-                          (1 + math.cos(math.pi * x / epochs)) / 2
-                          for x in range(epochs)]
-        multipliers = [0.1 ** i for i in range(epochs)]
+        single_targets = [
+            eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * x / epochs)) / 2
+            for x in range(epochs)
+        ]
+        multipliers = [0.1**i for i in range(epochs)]
         single_targets = [x * y for x, y in zip(single_targets, multipliers)]
         targets = [single_targets, [x * epochs for x in single_targets]]
         schedulers = [None] * 2
@@ -1998,7 +2510,7 @@ def test_compound_cosanneal_and_exp_lr(self):
     def test_compound_reduce_lr_on_plateau1(self):
         epochs = 10
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         single_targets = [0.5] * 20
         multipliers = [0.1 ** (i // 3) for i in range(20)]
         single_targets = [x * y for x, y in zip(multipliers, single_targets)]
@@ -2006,15 +2518,21 @@ def test_compound_reduce_lr_on_plateau1(self):
         targets = targets[1:]  # test runs step before checking lr
         metrics = [10 - i * 0.0167 for i in range(20)]
         schedulers = [None, None]
-        schedulers[0] = ReduceLROnPlateau(self.opt, threshold_mode='abs', mode='min',
-                                          threshold=0.01, patience=5, cooldown=5)
+        schedulers[0] = ReduceLROnPlateau(
+            self.opt,
+            threshold_mode="abs",
+            mode="min",
+            threshold=0.01,
+            patience=5,
+            cooldown=5,
+        )
         schedulers[1] = StepLR(self.opt, gamma=0.1, step_size=3)
         self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
 
     def test_compound_reduce_lr_on_plateau2(self):
         epochs = 22
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         single_targets = [0.5] * 6 + [0.05] * 7 + [0.005] * 7 + [0.0005] * 2
         multipliers = [1] * 3 + [0.1] * 5 + [0.01] * 4 + [0.001] * 10
         single_targets = [x * y for x, y in zip(single_targets, multipliers)]
@@ -2022,42 +2540,51 @@ def test_compound_reduce_lr_on_plateau2(self):
         targets = targets[1:]  # test runs step before checking lr
         metrics = [10 - i * 0.0165 for i in range(22)]
         schedulers = [None] * 2
-        schedulers[0] = ReduceLROnPlateau(self.opt, patience=5, cooldown=0, threshold_mode='abs',
-                                          mode='min', threshold=0.1)
+        schedulers[0] = ReduceLROnPlateau(
+            self.opt,
+            patience=5,
+            cooldown=0,
+            threshold_mode="abs",
+            mode="min",
+            threshold=0.1,
+        )
         schedulers[1] = MultiStepLR(self.opt, gamma=0.1, milestones=[3, 8, 12])
         self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
 
     def test_compound_reduce_lr_on_plateau3(self):
         epochs = 22
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         single_targets = [0.5] * (2 + 6) + [0.05] * (5 + 6) + [0.005] * 4
-        multipliers = [0.1 ** i for i in range(epochs)]
+        multipliers = [0.1**i for i in range(epochs)]
         single_targets = [x * y for x, y in zip(multipliers, single_targets)]
         targets = [single_targets]
         targets = targets[1:]  # test runs step before checking lr
         metrics = [-0.8] * 2 + [-0.234] * 20
         schedulers = [None, None]
-        schedulers[0] = ReduceLROnPlateau(self.opt, mode='max', patience=5, cooldown=5,
-                                          threshold_mode='abs')
+        schedulers[0] = ReduceLROnPlateau(
+            self.opt, mode="max", patience=5, cooldown=5, threshold_mode="abs"
+        )
         schedulers[1] = ExponentialLR(self.opt, gamma=0.1)
         self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
 
     def test_compound_reduce_lr_on_plateau4(self):
         epochs = 20
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.05
+            param_group["lr"] = 0.05
         epochs = 10
         eta_min = 1e-10
-        single_targets = [eta_min + (0.05 - eta_min) *
-                          (1 + math.cos(math.pi * x / epochs)) / 2
-                          for x in range(epochs)]
+        single_targets = [
+            eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * x / epochs)) / 2
+            for x in range(epochs)
+        ]
         targets = [single_targets]
         targets = targets[1:]  # test runs step before checking lr
-        metrics = [1.5 * (1.025 ** i) for i in range(20)]  # 1.025 > 1.1**0.25
+        metrics = [1.5 * (1.025**i) for i in range(20)]  # 1.025 > 1.1**0.25
         schedulers = [None, None]
-        schedulers[0] = ReduceLROnPlateau(self.opt, mode='max', patience=3,
-                                          threshold_mode='rel', threshold=0.1)
+        schedulers[0] = ReduceLROnPlateau(
+            self.opt, mode="max", patience=3, threshold_mode="rel", threshold=0.1
+        )
         schedulers[1] = CosineAnnealingLR(self.opt, epochs, eta_min)
         self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
 
@@ -2066,7 +2593,7 @@ def test_compound_reduce_lr_on_plateau5(self):
         start_factor = 0.4
         epochs = 22
         for param_group in self.opt.param_groups:
-            param_group['lr'] = 0.5
+            param_group["lr"] = 0.5
         single_targets = [0.5] * 6 + [0.05] * 7 + [0.005] * 7 + [0.0005] * 2
         multipliers = [1] * 22
         for i in range(iters):
@@ -2076,8 +2603,14 @@ def test_compound_reduce_lr_on_plateau5(self):
         targets = targets[1:]  # test runs step before checking lr
         metrics = [10 - i * 0.0165 for i in range(22)]
         schedulers = [None] * 2
-        schedulers[0] = ReduceLROnPlateau(self.opt, patience=5, cooldown=0, threshold_mode='abs',
-                                          mode='min', threshold=0.1)
+        schedulers[0] = ReduceLROnPlateau(
+            self.opt,
+            patience=5,
+            cooldown=0,
+            threshold_mode="abs",
+            mode="min",
+            threshold=0.1,
+        )
         schedulers[1] = LinearLR(self.opt, start_factor=start_factor, total_iters=iters)
         self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
 
@@ -2090,30 +2623,94 @@ def test_cycle_lr_triangular_mode_one_lr(self):
         momentum_target = [5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3]
         lr_targets = [lr_target, lr_target]
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = CyclicLR(self.opt, base_lr=1, max_lr=5, step_size_up=4,
-                             cycle_momentum=True, base_momentum=1, max_momentum=5,
-                             mode='triangular')
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=1,
+            max_lr=5,
+            step_size_up=4,
+            cycle_momentum=True,
+            base_momentum=1,
+            max_momentum=5,
+            mode="triangular",
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target))
 
     def test_cycle_lr_triangular_mode_one_lr_no_momentum(self):
         lr_target = [1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3]
         lr_targets = [lr_target, lr_target]
-        momentum_target = [self.opt.defaults['momentum']] * len(lr_target)
+        momentum_target = [self.opt.defaults["momentum"]] * len(lr_target)
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = CyclicLR(self.opt, base_lr=1, max_lr=5, step_size_up=4,
-                             cycle_momentum=False, mode='triangular')
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=1,
+            max_lr=5,
+            step_size_up=4,
+            cycle_momentum=False,
+            mode="triangular",
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target))
 
     def test_cycle_lr_triangular2_mode_one_lr(self):
-        lr_target = [1, 2, 3, 4, 5, 4, 3, 2, 1, 1.5, 2.0, 2.5, 3.0, 2.5, 2.0, 1.5,
-                     1, 1.25, 1.50, 1.75, 2.00, 1.75]
-        momentum_target = [5.0, 4.0, 3.0, 2.0, 1.0, 2.0, 3.0, 4.0, 5.0, 4.5, 4.0,
-                           3.5, 3.0, 3.5, 4.0, 4.5, 5.0, 4.75, 4.5, 4.25, 4.0, 4.25]
+        lr_target = [
+            1,
+            2,
+            3,
+            4,
+            5,
+            4,
+            3,
+            2,
+            1,
+            1.5,
+            2.0,
+            2.5,
+            3.0,
+            2.5,
+            2.0,
+            1.5,
+            1,
+            1.25,
+            1.50,
+            1.75,
+            2.00,
+            1.75,
+        ]
+        momentum_target = [
+            5.0,
+            4.0,
+            3.0,
+            2.0,
+            1.0,
+            2.0,
+            3.0,
+            4.0,
+            5.0,
+            4.5,
+            4.0,
+            3.5,
+            3.0,
+            3.5,
+            4.0,
+            4.5,
+            5.0,
+            4.75,
+            4.5,
+            4.25,
+            4.0,
+            4.25,
+        ]
         lr_targets = [lr_target, lr_target]
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = CyclicLR(self.opt, base_lr=1, max_lr=5, step_size_up=4,
-                             cycle_momentum=True, base_momentum=1, max_momentum=5,
-                             mode='triangular2')
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=1,
+            max_lr=5,
+            step_size_up=4,
+            cycle_momentum=True,
+            base_momentum=1,
+            max_momentum=5,
+            mode="triangular2",
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target))
 
     def test_cycle_lr_exp_range_mode_one_lr(self):
@@ -2125,10 +2722,17 @@ def test_cycle_lr_exp_range_mode_one_lr(self):
         momentum_target = [max_lr - x * diff_lr * gamma**i for i, x in enumerate(xs)]
         lr_targets = [lr_target, lr_target]
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = CyclicLR(self.opt, base_lr=base_lr,
-                             max_lr=max_lr, step_size_up=4,
-                             cycle_momentum=True, base_momentum=base_lr, max_momentum=max_lr,
-                             mode='exp_range', gamma=gamma)
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=base_lr,
+            max_lr=max_lr,
+            step_size_up=4,
+            cycle_momentum=True,
+            base_momentum=base_lr,
+            max_momentum=max_lr,
+            mode="exp_range",
+            gamma=gamma,
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target))
 
     def test_cycle_lr_triangular_mode(self):
@@ -2138,23 +2742,81 @@ def test_cycle_lr_triangular_mode(self):
         momentum_target_1 = [5, 4, 3, 2, 1, 2, 3, 4, 5, 4, 3]
         momentum_target_2 = [x + 1 for x in momentum_target_1]
         momentum_targets = [momentum_target_1, momentum_target_2]
-        scheduler = CyclicLR(self.opt, base_lr=[1, 2], max_lr=[5, 6], step_size_up=4,
-                             cycle_momentum=True, base_momentum=[1, 2], max_momentum=[5, 6],
-                             mode='triangular')
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=[1, 2],
+            max_lr=[5, 6],
+            step_size_up=4,
+            cycle_momentum=True,
+            base_momentum=[1, 2],
+            max_momentum=[5, 6],
+            mode="triangular",
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target_1))
 
     def test_cycle_lr_triangular2_mode(self):
-        lr_target_1 = [1, 2, 3, 4, 5, 4, 3, 2, 1, 1.5, 2.0, 2.5, 3.0, 2.5, 2.0, 1.5, 1,
-                       1.25, 1.50, 1.75, 2.00, 1.75]
+        lr_target_1 = [
+            1,
+            2,
+            3,
+            4,
+            5,
+            4,
+            3,
+            2,
+            1,
+            1.5,
+            2.0,
+            2.5,
+            3.0,
+            2.5,
+            2.0,
+            1.5,
+            1,
+            1.25,
+            1.50,
+            1.75,
+            2.00,
+            1.75,
+        ]
         lr_target_2 = [x + 2 for x in lr_target_1]
         lr_targets = [lr_target_1, lr_target_2]
-        momentum_target_1 = [5.0, 4.0, 3.0, 2.0, 1.0, 2.0, 3.0, 4.0, 5.0, 4.5, 4.0, 3.5,
-                             3.0, 3.5, 4.0, 4.5, 5.0, 4.75, 4.5, 4.25, 4.0, 4.25]
+        momentum_target_1 = [
+            5.0,
+            4.0,
+            3.0,
+            2.0,
+            1.0,
+            2.0,
+            3.0,
+            4.0,
+            5.0,
+            4.5,
+            4.0,
+            3.5,
+            3.0,
+            3.5,
+            4.0,
+            4.5,
+            5.0,
+            4.75,
+            4.5,
+            4.25,
+            4.0,
+            4.25,
+        ]
         momentum_target_2 = [x + 2 for x in momentum_target_1]
         momentum_targets = [momentum_target_1, momentum_target_2]
-        scheduler = CyclicLR(self.opt, base_lr=[1, 3], max_lr=[5, 7], step_size_up=4,
-                             cycle_momentum=True, base_momentum=[1, 3], max_momentum=[5, 7],
-                             mode='triangular2')
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=[1, 3],
+            max_lr=[5, 7],
+            step_size_up=4,
+            cycle_momentum=True,
+            base_momentum=[1, 3],
+            max_momentum=[5, 7],
+            mode="triangular2",
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target_1))
 
     def test_cycle_lr_exp_range_mode(self):
@@ -2169,46 +2831,129 @@ def test_cycle_lr_exp_range_mode(self):
         lr_target_1 = [base_lr_1 + x * diff_lr_1 * gamma**i for i, x in enumerate(xs)]
         lr_target_2 = [base_lr_2 + x * diff_lr_2 * gamma**i for i, x in enumerate(xs)]
         lr_targets = [lr_target_1, lr_target_2]
-        momentum_target_1 = [max_lr_1 - x * diff_lr_1 * gamma**i for i, x in enumerate(xs)]
-        momentum_target_2 = [max_lr_2 - x * diff_lr_2 * gamma**i for i, x in enumerate(xs)]
+        momentum_target_1 = [
+            max_lr_1 - x * diff_lr_1 * gamma**i for i, x in enumerate(xs)
+        ]
+        momentum_target_2 = [
+            max_lr_2 - x * diff_lr_2 * gamma**i for i, x in enumerate(xs)
+        ]
         momentum_targets = [momentum_target_1, momentum_target_2]
-        scheduler = CyclicLR(self.opt, base_lr=[base_lr_1, base_lr_2],
-                             max_lr=[max_lr_1, max_lr_2], step_size_up=4,
-                             cycle_momentum=True, base_momentum=[base_lr_1, base_lr_2],
-                             max_momentum=[max_lr_1, max_lr_2],
-                             mode='exp_range', gamma=gamma)
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=[base_lr_1, base_lr_2],
+            max_lr=[max_lr_1, max_lr_2],
+            step_size_up=4,
+            cycle_momentum=True,
+            base_momentum=[base_lr_1, base_lr_2],
+            max_momentum=[max_lr_1, max_lr_2],
+            mode="exp_range",
+            gamma=gamma,
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target_1))
 
     def test_cycle_lr_triangular_mode_step_size_up_down(self):
-        lr_target = [1.0, 2.0, 3.0, 4.0, 5.0, 13.0 / 3, 11.0 / 3, 9.0 / 3, 7.0 / 3, 5.0 / 3, 1.0]
+        lr_target = [
+            1.0,
+            2.0,
+            3.0,
+            4.0,
+            5.0,
+            13.0 / 3,
+            11.0 / 3,
+            9.0 / 3,
+            7.0 / 3,
+            5.0 / 3,
+            1.0,
+        ]
         lr_targets = [lr_target, lr_target]
-        momentum_target = [5.0, 4.0, 3.0, 2.0, 1.0, 5.0 / 3, 7.0 / 3, 3.0, 11.0 / 3, 13.0 / 3, 5.0]
+        momentum_target = [
+            5.0,
+            4.0,
+            3.0,
+            2.0,
+            1.0,
+            5.0 / 3,
+            7.0 / 3,
+            3.0,
+            11.0 / 3,
+            13.0 / 3,
+            5.0,
+        ]
         momentum_targets = [momentum_target, momentum_target]
 
-        scheduler = CyclicLR(self.opt, base_lr=1, max_lr=5,
-                             step_size_up=4,
-                             step_size_down=6,
-                             cycle_momentum=True,
-                             base_momentum=1, max_momentum=5,
-                             mode='triangular')
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=1,
+            max_lr=5,
+            step_size_up=4,
+            step_size_down=6,
+            cycle_momentum=True,
+            base_momentum=1,
+            max_momentum=5,
+            mode="triangular",
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target))
 
     def test_cycle_lr_triangular2_mode_step_size_up_down(self):
-        lr_base_target = ([
-            1.0, 3.0, 5.0, 13.0 / 3, 11.0 / 3, 9.0 / 3, 7.0 / 3, 5.0 / 3, 1.0, 2.0, 3.0, 8.0 / 3,
-            7.0 / 3, 6.0 / 3, 5.0 / 3, 4.0 / 3, 1.0, 3.0 / 2, 2.0, 11.0 / 6, 10.0 / 6, 9.0 / 6,
-            8.0 / 6, 7.0 / 6
-        ])
-        momentum_base_target = ([
-            5.0, 3.0, 1.0, 5.0 / 3, 7.0 / 3, 3.0, 11.0 / 3, 13.0 / 3, 5.0, 4.0, 3.0, 10.0 / 3,
-            11.0 / 3, 4.0, 13.0 / 3, 14.0 / 3, 5.0, 4.5, 4.0, 25.0 / 6, 13.0 / 3, 4.5, 14.0 / 3,
-            29.0 / 6
-        ])
+        lr_base_target = [
+            1.0,
+            3.0,
+            5.0,
+            13.0 / 3,
+            11.0 / 3,
+            9.0 / 3,
+            7.0 / 3,
+            5.0 / 3,
+            1.0,
+            2.0,
+            3.0,
+            8.0 / 3,
+            7.0 / 3,
+            6.0 / 3,
+            5.0 / 3,
+            4.0 / 3,
+            1.0,
+            3.0 / 2,
+            2.0,
+            11.0 / 6,
+            10.0 / 6,
+            9.0 / 6,
+            8.0 / 6,
+            7.0 / 6,
+        ]
+        momentum_base_target = [
+            5.0,
+            3.0,
+            1.0,
+            5.0 / 3,
+            7.0 / 3,
+            3.0,
+            11.0 / 3,
+            13.0 / 3,
+            5.0,
+            4.0,
+            3.0,
+            10.0 / 3,
+            11.0 / 3,
+            4.0,
+            13.0 / 3,
+            14.0 / 3,
+            5.0,
+            4.5,
+            4.0,
+            25.0 / 6,
+            13.0 / 3,
+            4.5,
+            14.0 / 3,
+            29.0 / 6,
+        ]
         deltas = [2 * i for i in range(0, 2)]
         base_lrs = [1 + delta for delta in deltas]
         max_lrs = [5 + delta for delta in deltas]
         lr_targets = [[x + delta for x in lr_base_target] for delta in deltas]
-        momentum_targets = [[x + delta for x in momentum_base_target] for delta in deltas]
+        momentum_targets = [
+            [x + delta for x in momentum_base_target] for delta in deltas
+        ]
         scheduler = CyclicLR(
             self.opt,
             base_lr=base_lrs,
@@ -2218,26 +2963,47 @@ def test_cycle_lr_triangular2_mode_step_size_up_down(self):
             cycle_momentum=True,
             base_momentum=base_lrs,
             max_momentum=max_lrs,
-            mode='triangular2')
-        self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_base_target))
+            mode="triangular2",
+        )
+        self._test_cycle_lr(
+            scheduler, lr_targets, momentum_targets, len(lr_base_target)
+        )
 
     def test_cycle_lr_exp_range_mode_step_size_up_down(self):
         base_lr, max_lr = 1, 5
         diff_lr = max_lr - base_lr
         gamma = 0.9
-        xs = ([
-            0.0, 0.5, 1.0, 5.0 / 6, 4.0 / 6, 3.0 / 6, 2.0 / 6, 1.0 / 6, 0.0, 0.5, 1.0, 5.0 / 6,
-            4.0 / 6
-        ])
+        xs = [
+            0.0,
+            0.5,
+            1.0,
+            5.0 / 6,
+            4.0 / 6,
+            3.0 / 6,
+            2.0 / 6,
+            1.0 / 6,
+            0.0,
+            0.5,
+            1.0,
+            5.0 / 6,
+            4.0 / 6,
+        ]
         lr_target = [base_lr + x * diff_lr * gamma**i for i, x in enumerate(xs)]
         lr_targets = [lr_target, lr_target]
         momentum_target = [max_lr - x * diff_lr * gamma**i for i, x in enumerate(xs)]
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = CyclicLR(self.opt, base_lr=base_lr, max_lr=max_lr,
-                             step_size_up=2, step_size_down=6,
-                             cycle_momentum=True, base_momentum=base_lr,
-                             max_momentum=max_lr,
-                             mode='exp_range', gamma=gamma)
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=base_lr,
+            max_lr=max_lr,
+            step_size_up=2,
+            step_size_down=6,
+            cycle_momentum=True,
+            base_momentum=base_lr,
+            max_momentum=max_lr,
+            mode="exp_range",
+            gamma=gamma,
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target))
 
     def test_cycle_lr_with_momentumless_optimizer(self):
@@ -2250,15 +3016,25 @@ def test_cycle_lr_with_momentumless_optimizer(self):
         # in more detail in https://github.com/pytorch/pytorch/issues/19003 ).
         old_opt = self.opt
         self.opt = optim.Adam(
-            [{'params': self.net.conv1.parameters()}, {'params': self.net.conv2.parameters(), 'lr': 0.5}],
-            lr=0.05)
+            [
+                {"params": self.net.conv1.parameters()},
+                {"params": self.net.conv2.parameters(), "lr": 0.5},
+            ],
+            lr=0.05,
+        )
 
         lr_target = [1, 2, 3, 4, 5, 4, 3, 2, 1, 2, 3]
         lr_targets = [lr_target, lr_target]
         momentum_target = [None] * len(lr_target)
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = CyclicLR(self.opt, base_lr=1, max_lr=5, step_size_up=4,
-                             cycle_momentum=False, mode='triangular')
+        scheduler = CyclicLR(
+            self.opt,
+            base_lr=1,
+            max_lr=5,
+            step_size_up=4,
+            cycle_momentum=False,
+            mode="triangular",
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, len(lr_target))
 
         self.opt = old_opt  # set optimizer back to SGD
@@ -2271,6 +3047,7 @@ def test_cycle_lr_cycle_momentum_fail_with_momentumless_optimizer(self):
     def test_cycle_lr_removed_after_out_of_scope(self):
         import gc
         import weakref
+
         gc.disable()
 
         def test():
@@ -2284,7 +3061,9 @@ def test():
 
     def test_onecycle_lr_invalid_anneal_strategy(self):
         with self.assertRaises(ValueError):
-            scheduler = OneCycleLR(self.opt, max_lr=1e-3, total_steps=10, anneal_strategy="CATS")
+            scheduler = OneCycleLR(
+                self.opt, max_lr=1e-3, total_steps=10, anneal_strategy="CATS"
+            )
 
     def test_onecycle_lr_invalid_pct_start(self):
         with self.assertRaises(ValueError):
@@ -2299,8 +3078,15 @@ def test_onecycle_lr_linear_annealing(self):
         momentum_target = [22, 11.5, 1, 4, 7, 10, 13, 16, 19, 22]
         lr_targets = [lr_target, lr_target]
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = OneCycleLR(self.opt, max_lr=25, final_div_factor=2, base_momentum=1, max_momentum=22,
-                               total_steps=10, anneal_strategy='linear')
+        scheduler = OneCycleLR(
+            self.opt,
+            max_lr=25,
+            final_div_factor=2,
+            base_momentum=1,
+            max_momentum=22,
+            total_steps=10,
+            anneal_strategy="linear",
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, 10)
 
     def test_onecycle_lr_linear_annealing_three_phases(self):
@@ -2308,59 +3094,111 @@ def test_onecycle_lr_linear_annealing_three_phases(self):
         momentum_target = [22, 15, 8, 1, 8, 15, 22, 22, 22, 22]
         lr_targets = [lr_target, lr_target]
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = OneCycleLR(self.opt, max_lr=25, div_factor=25,
-                               base_momentum=1, max_momentum=22,
-                               total_steps=10, anneal_strategy='linear',
-                               pct_start=0.4, final_div_factor=4,
-                               three_phase=True)
+        scheduler = OneCycleLR(
+            self.opt,
+            max_lr=25,
+            div_factor=25,
+            base_momentum=1,
+            max_momentum=22,
+            total_steps=10,
+            anneal_strategy="linear",
+            pct_start=0.4,
+            final_div_factor=4,
+            three_phase=True,
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, 10)
 
     def test_onecycle_lr_cosine_annealing(self):
         def annealing_cos(start, end, pct):
             cos_out = math.cos(math.pi * pct) + 1
             return end + (start - end) / 2.0 * cos_out
-        lr_target = [1, 13, 25, annealing_cos(25, 0.5, 1 / 7.0), annealing_cos(25, 0.5, 2 / 7.0),
-                     annealing_cos(25, 0.5, 3 / 7.0), annealing_cos(25, 0.5, 4 / 7.0), annealing_cos(25, 0.5, 5 / 7.0),
-                     annealing_cos(25, 0.5, 6 / 7.0), 0.5]
-        momentum_target = [22, 11.5, 1, annealing_cos(1, 22, 1 / 7.0), annealing_cos(1, 22, 2 / 7.0),
-                           annealing_cos(1, 22, 3 / 7.0), annealing_cos(1, 22, 4 / 7.0), annealing_cos(1, 22, 5 / 7.0),
-                           annealing_cos(1, 22, 6 / 7.0), 22]
+
+        lr_target = [
+            1,
+            13,
+            25,
+            annealing_cos(25, 0.5, 1 / 7.0),
+            annealing_cos(25, 0.5, 2 / 7.0),
+            annealing_cos(25, 0.5, 3 / 7.0),
+            annealing_cos(25, 0.5, 4 / 7.0),
+            annealing_cos(25, 0.5, 5 / 7.0),
+            annealing_cos(25, 0.5, 6 / 7.0),
+            0.5,
+        ]
+        momentum_target = [
+            22,
+            11.5,
+            1,
+            annealing_cos(1, 22, 1 / 7.0),
+            annealing_cos(1, 22, 2 / 7.0),
+            annealing_cos(1, 22, 3 / 7.0),
+            annealing_cos(1, 22, 4 / 7.0),
+            annealing_cos(1, 22, 5 / 7.0),
+            annealing_cos(1, 22, 6 / 7.0),
+            22,
+        ]
         lr_targets = [lr_target, lr_target]
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = OneCycleLR(self.opt, max_lr=25, final_div_factor=2, base_momentum=1, max_momentum=22,
-                               total_steps=10)
+        scheduler = OneCycleLR(
+            self.opt,
+            max_lr=25,
+            final_div_factor=2,
+            base_momentum=1,
+            max_momentum=22,
+            total_steps=10,
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, 10)
 
     def test_cycle_lr_with_adam(self):
         old_opt = self.opt
         self.opt = optim.Adam(
-            [{'params': self.net.conv1.parameters()}, {'params': self.net.conv2.parameters(), 'lr': 0.5}],
-            lr=0.05)
+            [
+                {"params": self.net.conv1.parameters()},
+                {"params": self.net.conv2.parameters(), "lr": 0.5},
+            ],
+            lr=0.05,
+        )
 
         lr_target = [1, 13, 25, 21.5, 18, 14.5, 11, 7.5, 4, 0.5]
         momentum_target = [22, 11.5, 1, 4, 7, 10, 13, 16, 19, 22]
         lr_targets = [lr_target, lr_target]
         momentum_targets = [momentum_target, momentum_target]
-        scheduler = OneCycleLR(self.opt, max_lr=25, final_div_factor=2, base_momentum=1, max_momentum=22,
-                               total_steps=10, anneal_strategy='linear')
+        scheduler = OneCycleLR(
+            self.opt,
+            max_lr=25,
+            final_div_factor=2,
+            base_momentum=1,
+            max_momentum=22,
+            total_steps=10,
+            anneal_strategy="linear",
+        )
         self._test_cycle_lr(scheduler, lr_targets, momentum_targets, 10, use_beta1=True)
         self.opt = old_opt  # set optimizer back to SGD
 
     def test_lambda_lr(self):
         epochs = 10
-        self.opt.param_groups[0]['lr'] = 0.05
-        self.opt.param_groups[1]['lr'] = 0.4
-        targets = [[0.05 * (0.9 ** x) for x in range(epochs)], [0.4 * (0.8 ** x) for x in range(epochs)]]
-        scheduler = LambdaLR(self.opt,
-                             lr_lambda=[lambda x1: 0.9 ** x1, lambda x2: 0.8 ** x2])
+        self.opt.param_groups[0]["lr"] = 0.05
+        self.opt.param_groups[1]["lr"] = 0.4
+        targets = [
+            [0.05 * (0.9**x) for x in range(epochs)],
+            [0.4 * (0.8**x) for x in range(epochs)],
+        ]
+        scheduler = LambdaLR(
+            self.opt, lr_lambda=[lambda x1: 0.9**x1, lambda x2: 0.8**x2]
+        )
         self._test(scheduler, targets, epochs)
 
     def test_multiplicative_lr(self):
         epochs = 10
-        self.opt.param_groups[0]['lr'] = 0.05
-        self.opt.param_groups[1]['lr'] = 0.4
-        targets = [[0.05 * (0.9 ** x) for x in range(epochs)], [0.4 * (0.8 ** x) for x in range(epochs)]]
-        scheduler = MultiplicativeLR(self.opt, lr_lambda=[lambda x1: 0.9, lambda x2: 0.8])
+        self.opt.param_groups[0]["lr"] = 0.05
+        self.opt.param_groups[1]["lr"] = 0.4
+        targets = [
+            [0.05 * (0.9**x) for x in range(epochs)],
+            [0.4 * (0.8**x) for x in range(epochs)],
+        ]
+        scheduler = MultiplicativeLR(
+            self.opt, lr_lambda=[lambda x1: 0.9, lambda x2: 0.8]
+        )
         self._test(scheduler, targets, epochs)
 
     @parametrize("T_mult", [1, 2, 4])
@@ -2370,14 +3208,20 @@ def test_CosineAnnealingWarmRestarts_lr1(self, T_mult):
         T_i = 10
         T_cur = 0
         targets = [[0.05], [0.5]]
-        scheduler = CosineAnnealingWarmRestarts(self.opt, T_0=T_i, T_mult=T_mult, eta_min=eta_min)
+        scheduler = CosineAnnealingWarmRestarts(
+            self.opt, T_0=T_i, T_mult=T_mult, eta_min=eta_min
+        )
         for _ in range(1, iters, 1):
             T_cur += 1
             if T_cur >= T_i:
                 T_cur = T_cur - T_i
                 T_i = int(T_mult) * T_i
-            targets[0] += [eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2]
-            targets[1] += [eta_min + (0.5 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2]
+            targets[0] += [
+                eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2
+            ]
+            targets[1] += [
+                eta_min + (0.5 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2
+            ]
         self._test(scheduler, targets, iters)
 
     def test_CosineAnnealingWarmRestarts_lr2(self):
@@ -2388,41 +3232,69 @@ def test_CosineAnnealingWarmRestarts_lr2(self):
             T_i = 10
             T_cur = 0
             targets = [[0.05], [0.5]]
-            scheduler = CosineAnnealingWarmRestarts(self.opt, T_0=T_i, T_mult=T_mult, eta_min=eta_min)
+            scheduler = CosineAnnealingWarmRestarts(
+                self.opt, T_0=T_i, T_mult=T_mult, eta_min=eta_min
+            )
             for _ in torch.arange(0.1, iters, 0.1):
                 T_cur = round(T_cur + 0.1, 1)
                 if T_cur >= T_i:
                     T_cur = T_cur - T_i
                     T_i = int(T_mult) * T_i
-                targets[0] += [eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2]
-                targets[1] += [eta_min + (0.5 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2]
+                targets[0] += [
+                    eta_min
+                    + (0.05 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2
+                ]
+                targets[1] += [
+                    eta_min
+                    + (0.5 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2
+                ]
             self._test_CosineAnnealingWarmRestarts(scheduler, targets, iters)
 
     def test_CosineAnnealingWarmRestarts_lr3(self):
-        epochs_for_T_mults = [[0, 1, 2, 3, 4, 5, 12, 27, 3, 4, 5, 6, 13],
-                              [0, 1, 2, 3, 4, 5, 25, 32, 33, 34, 80, 81, 3],
-                              [0, 0.1, 0.2, 0.3, 1.3, 2.3, 17.5, 18.5, 19.5, 29.5, 30.5, 31.5, 50]]
-        T_curs_for_T_mults = [[1, 2, 3, 4, 5, 2, 7, 3, 4, 5, 6, 3],
-                              [1, 2, 3, 4, 5, 15, 2, 3, 4, 10, 11, 3],
-                              [0.1, 0.2, 0.3, 1.3, 2.3, 7.5, 8.5, 9.5, 19.5, 20.5, 21.5, 10]]
-        T_is_for_T_mults = [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],
-                            [10, 10, 10, 10, 10, 20, 40, 40, 40, 80, 80, 10],
-                            [10, 10, 10, 10, 10, 30, 30, 30, 30, 30, 30, 90]]
+        epochs_for_T_mults = [
+            [0, 1, 2, 3, 4, 5, 12, 27, 3, 4, 5, 6, 13],
+            [0, 1, 2, 3, 4, 5, 25, 32, 33, 34, 80, 81, 3],
+            [0, 0.1, 0.2, 0.3, 1.3, 2.3, 17.5, 18.5, 19.5, 29.5, 30.5, 31.5, 50],
+        ]
+        T_curs_for_T_mults = [
+            [1, 2, 3, 4, 5, 2, 7, 3, 4, 5, 6, 3],
+            [1, 2, 3, 4, 5, 15, 2, 3, 4, 10, 11, 3],
+            [0.1, 0.2, 0.3, 1.3, 2.3, 7.5, 8.5, 9.5, 19.5, 20.5, 21.5, 10],
+        ]
+        T_is_for_T_mults = [
+            [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],
+            [10, 10, 10, 10, 10, 20, 40, 40, 40, 80, 80, 10],
+            [10, 10, 10, 10, 10, 30, 30, 30, 30, 30, 30, 90],
+        ]
         eta_min = 1e-10
         T_mults = [1, 2, 3]
-        for epochs, T_mult, T_curs, T_is in zip(epochs_for_T_mults, T_mults, T_curs_for_T_mults, T_is_for_T_mults):
+        for epochs, T_mult, T_curs, T_is in zip(
+            epochs_for_T_mults, T_mults, T_curs_for_T_mults, T_is_for_T_mults
+        ):
             targets = [[0.05], [0.5]]
-            scheduler = CosineAnnealingWarmRestarts(self.opt, T_0=10, T_mult=T_mult, eta_min=eta_min)
+            scheduler = CosineAnnealingWarmRestarts(
+                self.opt, T_0=10, T_mult=T_mult, eta_min=eta_min
+            )
             for T_cur, T_i in zip(T_curs, T_is):
-                targets[0] += [eta_min + (0.05 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2]
-                targets[1] += [eta_min + (0.5 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2]
-            self._test_interleaved_CosineAnnealingWarmRestarts(scheduler, targets, epochs)
+                targets[0] += [
+                    eta_min
+                    + (0.05 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2
+                ]
+                targets[1] += [
+                    eta_min
+                    + (0.5 - eta_min) * (1 + math.cos(math.pi * T_cur / T_i)) / 2
+                ]
+            self._test_interleaved_CosineAnnealingWarmRestarts(
+                scheduler, targets, epochs
+            )
 
     def test_swalr_no_anneal(self):
         epochs, swa_start, swa_lr = 10, 5, 0.01
-        initial_lrs = [group['lr'] for group in self.opt.param_groups]
-        targets = [[lr] * (swa_start + 1) + [swa_lr] * (epochs - swa_start - 1)
-                   for lr in initial_lrs]
+        initial_lrs = [group["lr"] for group in self.opt.param_groups]
+        targets = [
+            [lr] * (swa_start + 1) + [swa_lr] * (epochs - swa_start - 1)
+            for lr in initial_lrs
+        ]
         swa_scheduler = SWALR(self.opt, anneal_epochs=1, swa_lr=swa_lr)
         self._test_swalr(swa_scheduler, None, targets, swa_start, epochs)
 
@@ -2435,15 +3307,22 @@ def test_swalr_cosine_anneal_after_multiplicative(self):
 
         def anneal_coef(t):
             if t + 1 >= anneal_epochs:
-                return 0.
+                return 0.0
             return (1 + math.cos(math.pi * (t + 1) / anneal_epochs)) / 2
 
-        initial_lrs = [group['lr'] for group in self.opt.param_groups]
-        targets_before_swa = [[lr * mult_factor**i for i in range(swa_start + 1)]
-                              for lr in initial_lrs]
+        initial_lrs = [group["lr"] for group in self.opt.param_groups]
+        targets_before_swa = [
+            [lr * mult_factor**i for i in range(swa_start + 1)] for lr in initial_lrs
+        ]
         swa_epochs = epochs - swa_start - 1
-        targets = [lrs + [lrs[-1] * anneal_coef(t) + swa_lr * (1 - anneal_coef(t)) for t in range(swa_epochs)]
-                   for lrs in targets_before_swa]
+        targets = [
+            lrs
+            + [
+                lrs[-1] * anneal_coef(t) + swa_lr * (1 - anneal_coef(t))
+                for t in range(swa_epochs)
+            ]
+            for lrs in targets_before_swa
+        ]
 
         self._test_swalr(swa_scheduler, scheduler, targets, swa_start, epochs)
 
@@ -2452,29 +3331,46 @@ def test_swalr_linear_anneal_after_multiplicative(self):
         epochs, swa_start, swa_lrs, anneal_epochs = 15, 5, [0.01, 0.02], 4
         mult_factor = 0.9
         scheduler = MultiplicativeLR(self.opt, lr_lambda=lambda epoch: mult_factor)
-        swa_scheduler = SWALR(self.opt, anneal_epochs=anneal_epochs,
-                              anneal_strategy="linear", swa_lr=swa_lrs)
+        swa_scheduler = SWALR(
+            self.opt,
+            anneal_epochs=anneal_epochs,
+            anneal_strategy="linear",
+            swa_lr=swa_lrs,
+        )
 
         def anneal_coef(t):
             if t + 1 >= anneal_epochs:
-                return 0.
+                return 0.0
             return 1 - (t + 1) / anneal_epochs
 
-        initial_lrs = [group['lr'] for group in self.opt.param_groups]
-        targets_before_swa = [[lr * mult_factor**i for i in range(swa_start + 1)]
-                              for lr in initial_lrs]
+        initial_lrs = [group["lr"] for group in self.opt.param_groups]
+        targets_before_swa = [
+            [lr * mult_factor**i for i in range(swa_start + 1)] for lr in initial_lrs
+        ]
         swa_epochs = epochs - swa_start - 1
-        targets = [lrs + [lrs[-1] * anneal_coef(t) + swa_lr * (1 - anneal_coef(t)) for t in range(swa_epochs)]
-                   for lrs, swa_lr in zip(targets_before_swa, swa_lrs)]
+        targets = [
+            lrs
+            + [
+                lrs[-1] * anneal_coef(t) + swa_lr * (1 - anneal_coef(t))
+                for t in range(swa_epochs)
+            ]
+            for lrs, swa_lr in zip(targets_before_swa, swa_lrs)
+        ]
 
         self._test_swalr(swa_scheduler, scheduler, targets, swa_start, epochs)
 
     def _test_swalr(self, swa_scheduler, scheduler, targets, swa_start, epochs):
         for epoch in range(epochs):
             for param_group, target in zip(self.opt.param_groups, targets):
-                self.assertEqual(target[epoch], param_group['lr'],
-                                 msg='LR is wrong in epoch {}: expected {}, got {}'.format(
-                                     epoch, target[epoch], param_group['lr']), atol=1e-5, rtol=0)
+                self.assertEqual(
+                    target[epoch],
+                    param_group["lr"],
+                    msg="LR is wrong in epoch {}: expected {}, got {}".format(
+                        epoch, target[epoch], param_group["lr"]
+                    ),
+                    atol=1e-5,
+                    rtol=0,
+                )
             if epoch >= swa_start:
                 self.opt.step()
                 swa_scheduler.step()
@@ -2485,29 +3381,32 @@ def _test_swalr(self, swa_scheduler, scheduler, targets, swa_start, epochs):
     def test_swalr_hypers(self):
         # Test that SWALR raises errors for incorrect hyper-parameters
         with self.assertRaisesRegex(ValueError, "anneal_strategy must"):
-            swa_scheduler = SWALR(self.opt, anneal_strategy="exponential", swa_lr=1.)
+            swa_scheduler = SWALR(self.opt, anneal_strategy="exponential", swa_lr=1.0)
 
         with self.assertRaisesRegex(ValueError, "anneal_epochs must"):
-            swa_scheduler = SWALR(self.opt, anneal_epochs=-1, swa_lr=1.)
+            swa_scheduler = SWALR(self.opt, anneal_epochs=-1, swa_lr=1.0)
         with self.assertRaisesRegex(ValueError, "anneal_epochs must"):
-            swa_scheduler = SWALR(self.opt, anneal_epochs=1.7, swa_lr=1.)
+            swa_scheduler = SWALR(self.opt, anneal_epochs=1.7, swa_lr=1.0)
         with self.assertRaisesRegex(ValueError, "swa_lr must"):
-            swa_scheduler = SWALR(self.opt, swa_lr=[1., 0.1, 0.01])
+            swa_scheduler = SWALR(self.opt, swa_lr=[1.0, 0.1, 0.01])
 
     def test_step_lr_state_dict(self):
         self._check_scheduler_state_dict(
             lambda: StepLR(self.opt, gamma=0.1, step_size=3),
-            lambda: StepLR(self.opt, gamma=0.01 / 2, step_size=1))
+            lambda: StepLR(self.opt, gamma=0.01 / 2, step_size=1),
+        )
 
     def test_multi_step_lr_state_dict(self):
         self._check_scheduler_state_dict(
             lambda: MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]),
-            lambda: MultiStepLR(self.opt, gamma=0.01, milestones=[1, 4, 6]))
+            lambda: MultiStepLR(self.opt, gamma=0.01, milestones=[1, 4, 6]),
+        )
 
     def test_exp_step_lr_state_dict(self):
         self._check_scheduler_state_dict(
             lambda: ExponentialLR(self.opt, gamma=0.1),
-            lambda: ExponentialLR(self.opt, gamma=0.01))
+            lambda: ExponentialLR(self.opt, gamma=0.01),
+        )
 
     def test_cosine_lr_state_dict(self):
         epochs = 10
@@ -2515,49 +3414,56 @@ def test_cosine_lr_state_dict(self):
         self._check_scheduler_state_dict(
             lambda: CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min),
             lambda: CosineAnnealingLR(self.opt, T_max=epochs // 2, eta_min=eta_min / 2),
-            epochs=epochs)
+            epochs=epochs,
+        )
 
     def test_reduce_lr_on_plateau_state_dict(self):
-        scheduler = ReduceLROnPlateau(self.opt, mode='min', factor=0.1, patience=2)
+        scheduler = ReduceLROnPlateau(self.opt, mode="min", factor=0.1, patience=2)
         for score in [1.0, 2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 3.0, 2.0, 1.0]:
             scheduler.step(score)
-        scheduler_copy = ReduceLROnPlateau(self.opt, mode='max', factor=0.5, patience=10)
+        scheduler_copy = ReduceLROnPlateau(
+            self.opt, mode="max", factor=0.5, patience=10
+        )
         scheduler_copy.load_state_dict(scheduler.state_dict())
         for key in scheduler.__dict__.keys():
-            if key not in {'optimizer', 'is_better'}:
+            if key not in {"optimizer", "is_better"}:
                 self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key])
 
     def test_lambda_lr_state_dict_fn(self):
         scheduler = LambdaLR(self.opt, lr_lambda=lambda x: x)
         state = scheduler.state_dict()
-        self.assertIsNone(state['lr_lambdas'][0])
+        self.assertIsNone(state["lr_lambdas"][0])
 
         scheduler_copy = LambdaLR(self.opt, lr_lambda=lambda x: x)
         scheduler_copy.load_state_dict(state)
         for key in scheduler.__dict__.keys():
-            if key not in {'optimizer', 'lr_lambdas'}:
+            if key not in {"optimizer", "lr_lambdas"}:
                 self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key])
 
     def test_lambda_lr_state_dict_obj(self):
         scheduler = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(10))
         state = scheduler.state_dict()
-        self.assertIsNotNone(state['lr_lambdas'][0])
+        self.assertIsNotNone(state["lr_lambdas"][0])
 
         scheduler_copy = LambdaLR(self.opt, lr_lambda=LambdaLRTestObject(-1))
         scheduler_copy.load_state_dict(state)
         for key in scheduler.__dict__.keys():
-            if key not in {'optimizer'}:
+            if key not in {"optimizer"}:
                 self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key])
 
     def test_CosineAnnealingWarmRestarts_lr_state_dict(self):
         self._check_scheduler_state_dict(
             lambda: CosineAnnealingWarmRestarts(self.opt, T_0=10, T_mult=2),
-            lambda: CosineAnnealingWarmRestarts(self.opt, T_0=100))
+            lambda: CosineAnnealingWarmRestarts(self.opt, T_0=100),
+        )
 
     def test_swa_lr_state_dict(self):
         self._check_scheduler_state_dict(
             lambda: SWALR(self.opt, anneal_epochs=3, swa_lr=0.5),
-            lambda: SWALR(self.opt, anneal_epochs=10, anneal_strategy="linear", swa_lr=5.))
+            lambda: SWALR(
+                self.opt, anneal_epochs=10, anneal_strategy="linear", swa_lr=5.0
+            ),
+        )
 
     def _check_scheduler_state_dict(self, constr, constr2, epochs=10):
         scheduler = constr()
@@ -2567,7 +3473,7 @@ def _check_scheduler_state_dict(self, constr, constr2, epochs=10):
         scheduler_copy = constr2()
         scheduler_copy.load_state_dict(scheduler.state_dict())
         for key in scheduler.__dict__.keys():
-            if key != 'optimizer':
+            if key != "optimizer":
                 self.assertEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key])
         self.assertEqual(scheduler.get_last_lr(), scheduler_copy.get_last_lr())
 
@@ -2581,9 +3487,15 @@ def _test_get_last_lr(self, schedulers, targets, epochs=10):
             [scheduler.step() for scheduler in schedulers]
             target = [[t[epoch] for t in targets]] * len(schedulers)
             for t, r in zip(target, result):
-                self.assertEqual(target, result,
-                                 msg='LR is wrong in epoch {}: expected {}, got {}'.format(
-                                     epoch, t, r), atol=1e-5, rtol=0)
+                self.assertEqual(
+                    target,
+                    result,
+                    msg="LR is wrong in epoch {}: expected {}, got {}".format(
+                        epoch, t, r
+                    ),
+                    atol=1e-5,
+                    rtol=0,
+                )
 
     def _test_with_epoch(self, schedulers, targets, epochs=10):
         if isinstance(schedulers, LRScheduler):
@@ -2592,21 +3504,37 @@ def _test_with_epoch(self, schedulers, targets, epochs=10):
         for epoch in range(epochs):
             [optimizer.step() for optimizer in optimizers]
             with warnings.catch_warnings(record=True) as w:
-                [scheduler.step(epoch) for scheduler in schedulers]  # step before assert: skip initial lr
-                self._check_warning_is_epoch_deprecation_warning(w, num_warnings=len(schedulers))
+                [
+                    scheduler.step(epoch) for scheduler in schedulers
+                ]  # step before assert: skip initial lr
+                self._check_warning_is_epoch_deprecation_warning(
+                    w, num_warnings=len(schedulers)
+                )
             for param_group, target in zip(self.opt.param_groups, targets):
-                self.assertEqual(target[epoch], param_group['lr'],
-                                 msg='LR is wrong in epoch {}: expected {}, got {}'.format(
-                                     epoch, target[epoch], param_group['lr']), atol=1e-5, rtol=0)
+                self.assertEqual(
+                    target[epoch],
+                    param_group["lr"],
+                    msg="LR is wrong in epoch {}: expected {}, got {}".format(
+                        epoch, target[epoch], param_group["lr"]
+                    ),
+                    atol=1e-5,
+                    rtol=0,
+                )
 
     def _test(self, schedulers, targets, epochs=10):
         if isinstance(schedulers, LRScheduler):
             schedulers = [schedulers]
         for epoch in range(epochs):
             for param_group, target in zip(self.opt.param_groups, targets):
-                self.assertEqual(target[epoch], param_group['lr'],
-                                 msg='LR is wrong in epoch {}: expected {}, got {}'.format(
-                                     epoch, target[epoch], param_group['lr']), atol=1e-5, rtol=0)
+                self.assertEqual(
+                    target[epoch],
+                    param_group["lr"],
+                    msg="LR is wrong in epoch {}: expected {}, got {}".format(
+                        epoch, target[epoch], param_group["lr"]
+                    ),
+                    atol=1e-5,
+                    rtol=0,
+                )
             [scheduler.step() for scheduler in schedulers]
 
     def _test_CosineAnnealingWarmRestarts(self, scheduler, targets, epochs=10):
@@ -2614,17 +3542,29 @@ def _test_CosineAnnealingWarmRestarts(self, scheduler, targets, epochs=10):
             epoch = round(epoch.item(), 1)
             scheduler.step(epoch)
             for param_group, target in zip(self.opt.param_groups, targets):
-                self.assertEqual(target[index], param_group['lr'],
-                                 msg='LR is wrong in epoch {}: expected {}, got {}'.format(
-                                     epoch, target[index], param_group['lr']), atol=1e-5, rtol=0)
+                self.assertEqual(
+                    target[index],
+                    param_group["lr"],
+                    msg="LR is wrong in epoch {}: expected {}, got {}".format(
+                        epoch, target[index], param_group["lr"]
+                    ),
+                    atol=1e-5,
+                    rtol=0,
+                )
 
     def _test_interleaved_CosineAnnealingWarmRestarts(self, scheduler, targets, epochs):
         for index, epoch in enumerate(epochs):
             scheduler.step(epoch)
             for param_group, target in zip(self.opt.param_groups, targets):
-                self.assertEqual(target[index], param_group['lr'],
-                                 msg='LR is wrong in epoch {}: expected {}, got {}'.format(
-                                     epoch, target[index], param_group['lr']), atol=1e-5, rtol=0)
+                self.assertEqual(
+                    target[index],
+                    param_group["lr"],
+                    msg="LR is wrong in epoch {}: expected {}, got {}".format(
+                        epoch, target[index], param_group["lr"]
+                    ),
+                    atol=1e-5,
+                    rtol=0,
+                )
 
     def _test_against_closed_form(self, scheduler, closed_form_scheduler, epochs=10):
         self.setUp()
@@ -2634,18 +3574,28 @@ def _test_against_closed_form(self, scheduler, closed_form_scheduler, epochs=10)
             with warnings.catch_warnings(record=True) as w:
                 closed_form_scheduler.step(epoch)
                 self._check_warning_is_epoch_deprecation_warning(w)
-            targets.append([group['lr'] for group in self.opt.param_groups])
+            targets.append([group["lr"] for group in self.opt.param_groups])
         self.setUp()
         for epoch in range(epochs):
             self.opt.step()
             scheduler.step()
             for i, param_group in enumerate(self.opt.param_groups):
-                self.assertEqual(targets[epoch][i], param_group['lr'],
-                                 msg='LR is wrong in epoch {}: expected {}, got {}'.format(
-                                     epoch, targets[epoch][i], param_group['lr']), atol=1e-5, rtol=0)
+                self.assertEqual(
+                    targets[epoch][i],
+                    param_group["lr"],
+                    msg="LR is wrong in epoch {}: expected {}, got {}".format(
+                        epoch, targets[epoch][i], param_group["lr"]
+                    ),
+                    atol=1e-5,
+                    rtol=0,
+                )
 
-    def _test_reduce_lr_on_plateau(self, schedulers, targets, metrics, epochs=10, verbose=False):
-        if isinstance(schedulers, LRScheduler) or isinstance(schedulers, ReduceLROnPlateau):
+    def _test_reduce_lr_on_plateau(
+        self, schedulers, targets, metrics, epochs=10, verbose=False
+    ):
+        if isinstance(schedulers, LRScheduler) or isinstance(
+            schedulers, ReduceLROnPlateau
+        ):
             schedulers = [schedulers]
         for epoch in range(epochs):
             self.opt.step()
@@ -2655,40 +3605,89 @@ def _test_reduce_lr_on_plateau(self, schedulers, targets, metrics, epochs=10, ve
                 else:
                     scheduler.step()
             if verbose:
-                print('epoch{}:\tlr={}'.format(epoch, self.opt.param_groups[0]['lr']))
+                print("epoch{}:\tlr={}".format(epoch, self.opt.param_groups[0]["lr"]))
             for param_group, target in zip(self.opt.param_groups, targets):
-                self.assertEqual(target[epoch], param_group['lr'],
-                                 msg='LR is wrong in epoch {}: expected {}, got {}'.format(
-                                     epoch, target[epoch], param_group['lr']), atol=1e-5, rtol=0)
+                self.assertEqual(
+                    target[epoch],
+                    param_group["lr"],
+                    msg="LR is wrong in epoch {}: expected {}, got {}".format(
+                        epoch, target[epoch], param_group["lr"]
+                    ),
+                    atol=1e-5,
+                    rtol=0,
+                )
 
-    def _test_cycle_lr(self, scheduler, lr_targets, momentum_targets, batch_iterations, verbose=False, use_beta1=False):
+    def _test_cycle_lr(
+        self,
+        scheduler,
+        lr_targets,
+        momentum_targets,
+        batch_iterations,
+        verbose=False,
+        use_beta1=False,
+    ):
         for batch_num in range(batch_iterations):
             if verbose:
-                if 'momentum' in self.opt.param_groups[0].keys():
-                    print('batch{}:\tlr={},momentum={}'.format(batch_num, self.opt.param_groups[0]['lr'],
-                                                               self.opt.param_groups[0]['momentum']))
-                elif use_beta1 and 'betas' in self.opt.param_groups[0].keys():
-                    print('batch{}:\tlr={},beta1={}'.format(batch_num, self.opt.param_groups[0]['lr'],
-                                                            self.opt.param_groups[0]['betas'][0]))
+                if "momentum" in self.opt.param_groups[0].keys():
+                    print(
+                        "batch{}:\tlr={},momentum={}".format(
+                            batch_num,
+                            self.opt.param_groups[0]["lr"],
+                            self.opt.param_groups[0]["momentum"],
+                        )
+                    )
+                elif use_beta1 and "betas" in self.opt.param_groups[0].keys():
+                    print(
+                        "batch{}:\tlr={},beta1={}".format(
+                            batch_num,
+                            self.opt.param_groups[0]["lr"],
+                            self.opt.param_groups[0]["betas"][0],
+                        )
+                    )
                 else:
-                    print('batch{}:\tlr={}'.format(batch_num, self.opt.param_groups[0]['lr']))
-
-            for param_group, lr_target, momentum_target in zip(self.opt.param_groups, lr_targets, momentum_targets):
+                    print(
+                        "batch{}:\tlr={}".format(
+                            batch_num, self.opt.param_groups[0]["lr"]
+                        )
+                    )
+
+            for param_group, lr_target, momentum_target in zip(
+                self.opt.param_groups, lr_targets, momentum_targets
+            ):
                 self.assertEqual(
-                    lr_target[batch_num], param_group['lr'],
-                    msg='LR is wrong in batch_num {}: expected {}, got {}'.format(
-                        batch_num, lr_target[batch_num], param_group['lr']), atol=1e-5, rtol=0)
+                    lr_target[batch_num],
+                    param_group["lr"],
+                    msg="LR is wrong in batch_num {}: expected {}, got {}".format(
+                        batch_num, lr_target[batch_num], param_group["lr"]
+                    ),
+                    atol=1e-5,
+                    rtol=0,
+                )
 
-                if use_beta1 and 'betas' in param_group.keys():
+                if use_beta1 and "betas" in param_group.keys():
                     self.assertEqual(
-                        momentum_target[batch_num], param_group['betas'][0],
-                        msg='Beta1 is wrong in batch_num {}: expected {}, got {}'.format(
-                            batch_num, momentum_target[batch_num], param_group['betas'][0]), atol=1e-5, rtol=0)
-                elif 'momentum' in param_group.keys():
+                        momentum_target[batch_num],
+                        param_group["betas"][0],
+                        msg="Beta1 is wrong in batch_num {}: expected {}, got {}".format(
+                            batch_num,
+                            momentum_target[batch_num],
+                            param_group["betas"][0],
+                        ),
+                        atol=1e-5,
+                        rtol=0,
+                    )
+                elif "momentum" in param_group.keys():
                     self.assertEqual(
-                        momentum_target[batch_num], param_group['momentum'],
-                        msg='Momentum is wrong in batch_num {}: expected {}, got {}'.format(
-                            batch_num, momentum_target[batch_num], param_group['momentum']), atol=1e-5, rtol=0)
+                        momentum_target[batch_num],
+                        param_group["momentum"],
+                        msg="Momentum is wrong in batch_num {}: expected {}, got {}".format(
+                            batch_num,
+                            momentum_target[batch_num],
+                            param_group["momentum"],
+                        ),
+                        atol=1e-5,
+                        rtol=0,
+                    )
             self.opt.step()
             scheduler.step()
 
@@ -2701,7 +3700,9 @@ def test_cosine_then_cyclic(self):
 
         model = torch.nn.Linear(2, 1)
         optimizer = torch.optim.SGD(model.parameters(), lr=optim_lr)
-        lr_scheduler_1 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=20, eta_min=0.1)
+        lr_scheduler_1 = torch.optim.lr_scheduler.CosineAnnealingLR(
+            optimizer, T_max=20, eta_min=0.1
+        )
         lr_scheduler_2 = torch.optim.lr_scheduler.CyclicLR(
             optimizer, base_lr=base_lr, max_lr=max_lr, step_size_up=1, step_size_down=3
         )
@@ -2737,7 +3738,9 @@ class SWATestCNN(torch.nn.Module):
     def __init__(self, input_channels):
         super(SWATestCNN, self).__init__()
         self.n_features = 10
-        self.conv1 = torch.nn.Conv2d(input_channels, self.n_features, kernel_size=3, padding=1)
+        self.conv1 = torch.nn.Conv2d(
+            input_channels, self.n_features, kernel_size=3, padding=1
+        )
         self.bn = torch.nn.BatchNorm2d(self.n_features, momentum=0.3)
 
     def compute_preactivation(self, x):
@@ -2750,7 +3753,6 @@ def forward(self, x):
 
 
 class TestSWAUtils(TestCase):
-
     def _test_averaged_model(self, net_device, swa_device):
         dnn = torch.nn.Sequential(
             torch.nn.Conv2d(1, 5, kernel_size=3),
@@ -2761,7 +3763,7 @@ def _test_averaged_model(self, net_device, swa_device):
             torch.nn.ReLU(),
             torch.nn.Linear(5, 5),
             torch.nn.ReLU(),
-            torch.nn.Linear(5, 10)
+            torch.nn.Linear(5, 10),
         ).to(net_device)
 
         averaged_dnn = AveragedModel(dnn, device=swa_device)
@@ -2793,8 +3795,7 @@ def test_averaged_model_mixed_device(self):
         if not torch.cuda.is_available():
             return
         dnn = torch.nn.Sequential(
-            torch.nn.Conv2d(1, 5, kernel_size=3),
-            torch.nn.Linear(5, 10)
+            torch.nn.Conv2d(1, 5, kernel_size=3), torch.nn.Linear(5, 10)
         )
         dnn[0].cuda()
         dnn[1].cpu()
@@ -2814,8 +3815,7 @@ def test_averaged_model_mixed_device(self):
 
     def test_averaged_model_state_dict(self):
         dnn = torch.nn.Sequential(
-            torch.nn.Conv2d(1, 5, kernel_size=3),
-            torch.nn.Linear(5, 10)
+            torch.nn.Conv2d(1, 5, kernel_size=3), torch.nn.Linear(5, 10)
         )
         averaged_dnn = AveragedModel(dnn)
         averaged_dnn2 = AveragedModel(dnn)
@@ -2834,12 +3834,13 @@ def test_averaged_model_exponential(self):
         dnn = torch.nn.Sequential(
             torch.nn.Conv2d(1, 5, kernel_size=3),
             torch.nn.BatchNorm2d(5, momentum=0.3),
-            torch.nn.Linear(5, 10)
+            torch.nn.Linear(5, 10),
         )
         alpha = 0.9
 
         def avg_fn(p_avg, p, n_avg):
             return alpha * p_avg + (1 - alpha) * p
+
         averaged_dnn = AveragedModel(dnn, avg_fn=avg_fn)
         averaged_params = [torch.zeros_like(param) for param in dnn.parameters()]
         n_updates = 10
@@ -2850,8 +3851,9 @@ def avg_fn(p_avg, p, n_avg):
                 if i == 0:
                     updated_averaged_params.append(p.clone())
                 else:
-                    updated_averaged_params.append((p_avg * alpha +
-                                                   p * (1 - alpha)).clone())
+                    updated_averaged_params.append(
+                        (p_avg * alpha + p * (1 - alpha)).clone()
+                    )
             for b in dnn.buffers():
                 if b.size() != torch.Size([]):
                     b.detach_().add_(torch.randn_like(b))
@@ -2869,16 +3871,20 @@ def test_averaged_model_exponential_buffers(self):
         dnn = torch.nn.Sequential(
             torch.nn.Conv2d(1, 5, kernel_size=3),
             torch.nn.BatchNorm2d(5, momentum=0.3),
-            torch.nn.Linear(5, 10)
+            torch.nn.Linear(5, 10),
         )
         alpha = 0.9
 
         def avg_fn(p_avg, p, n_avg):
             return alpha * p_avg + (1 - alpha) * p
+
         averaged_dnn = AveragedModel(dnn, avg_fn=avg_fn, use_buffers=True)
         dnn_params = itertools.chain(dnn.parameters(), dnn.buffers())
-        averaged_params = [torch.zeros_like(param) for param in dnn_params
-                           if param.size() != torch.Size([])]
+        averaged_params = [
+            torch.zeros_like(param)
+            for param in dnn_params
+            if param.size() != torch.Size([])
+        ]
         n_updates = 10
         for i in range(n_updates):
             updated_averaged_params = []
@@ -2889,13 +3895,18 @@ def avg_fn(p_avg, p, n_avg):
                 if i == 0:
                     updated_averaged_params.append(p.clone())
                 else:
-                    updated_averaged_params.append((p_avg * alpha +
-                                                   p * (1 - alpha)).clone())
+                    updated_averaged_params.append(
+                        (p_avg * alpha + p * (1 - alpha)).clone()
+                    )
             averaged_dnn.update_parameters(dnn)
             averaged_params = updated_averaged_params
 
         for p_avg, p_swa in zip(
-                averaged_params, itertools.chain(averaged_dnn.module.parameters(), averaged_dnn.module.buffers())):
+            averaged_params,
+            itertools.chain(
+                averaged_dnn.module.parameters(), averaged_dnn.module.buffers()
+            ),
+        ):
             self.assertEqual(p_avg, p_swa)
 
     def _test_update_bn(self, dnn, dl_x, dl_xy, cuda):
@@ -2930,10 +3941,10 @@ def _test_update_bn(self, dnn, dl_x, dl_xy, cuda):
         self.assertEqual(preactivation_var, dnn.bn.running_var, atol=1e-1, rtol=0)
 
         def _reset_bn(module):
-            if issubclass(module.__class__,
-                          torch.nn.modules.batchnorm._BatchNorm):
+            if issubclass(module.__class__, torch.nn.modules.batchnorm._BatchNorm):
                 module.running_mean = torch.zeros_like(module.running_mean)
                 module.running_var = torch.ones_like(module.running_var)
+
         # reset batch norm and run update_bn again
         dnn.apply(_reset_bn)
         update_bn(dl_xy, dnn, device=x.device)
@@ -3018,17 +4029,29 @@ def _diff_fn(p, grad, opt_differentiable_state, opt_class, kwargs, *ignored):
     opt.state[p].update(opt_differentiable_state)
     opt.step()
     return (p,) + tuple(
-        v for v in opt.state[p].values() if isinstance(v, torch.Tensor) and v.requires_grad)
+        v
+        for v in opt.state[p].values()
+        if isinstance(v, torch.Tensor) and v.requires_grad
+    )
 
 
 class TestDifferentiableOptimizer(TestCase):
-
     def test_sgd(self):
         p = torch.rand(10, requires_grad=True, dtype=torch.float64)
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         mbuff = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state = {'momentum_buffer': mbuff}
-        gradcheck(_diff_fn, (p, grad, state, torch.optim.SGD, {'lr': 0.9, 'differentiable': True}, *state.values()))
+        state = {"momentum_buffer": mbuff}
+        gradcheck(
+            _diff_fn,
+            (
+                p,
+                grad,
+                state,
+                torch.optim.SGD,
+                {"lr": 0.9, "differentiable": True},
+                *state.values(),
+            ),
+        )
 
     def test_adam(self):
         state = {}
@@ -3036,31 +4059,56 @@ def test_adam(self):
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         # `step` is not a continuous variable (even though we define it as a float)
         # and so it shouldn't require gradients.
-        state['step'] = torch.tensor(10., requires_grad=False, dtype=torch.float64)
-        state['exp_avg'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['exp_avg_sq'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['max_exp_avg_sq'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64)
+        state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["max_exp_avg_sq"] = torch.rand(
+            10, requires_grad=True, dtype=torch.float64
+        )
 
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.Adam,
-             {'lr': 0.9, 'differentiable': True, 'amsgrad': True}, *state.values())
+            (
+                p,
+                grad,
+                state,
+                torch.optim.Adam,
+                {"lr": 0.9, "differentiable": True, "amsgrad": True},
+                *state.values(),
+            ),
         )
 
     def test_rmsprop(self):
         state = {}
         p = torch.rand(10, requires_grad=True, dtype=torch.float64)
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['step'] = 0
-        state['square_avg'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['momentum_buffer'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step"] = 0
+        state["square_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["momentum_buffer"] = torch.rand(
+            10, requires_grad=True, dtype=torch.float64
+        )
         # This can cause issues with large values and nan due to sqrt ops
-        state['grad_avg'] = 1e-2 * torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["grad_avg"] = 1e-2 * torch.rand(
+            10, requires_grad=True, dtype=torch.float64
+        )
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.RMSprop,
-             {'lr': 0.9, 'maximize': True, 'momentum': 0.9, 'differentiable': True, 'centered': True, 'weight_decay': 0.1},
-             *state.values()))
+            (
+                p,
+                grad,
+                state,
+                torch.optim.RMSprop,
+                {
+                    "lr": 0.9,
+                    "maximize": True,
+                    "momentum": 0.9,
+                    "differentiable": True,
+                    "centered": True,
+                    "weight_decay": 0.1,
+                },
+                *state.values(),
+            ),
+        )
 
     def test_adadelta(self):
         state = {}
@@ -3068,13 +4116,19 @@ def test_adadelta(self):
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         # `step` is not a continuous variable (even though we define it as a float)
         # and so it shouldn't require gradients.
-        state['step'] = torch.tensor(10., requires_grad=False, dtype=torch.float64)
-        state['square_avg'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['acc_delta'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64)
+        state["square_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["acc_delta"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.Adadelta,
-             {'lr': 0.9, 'weight_decay': 0.1, 'differentiable': True}, *state.values())
+            (
+                p,
+                grad,
+                state,
+                torch.optim.Adadelta,
+                {"lr": 0.9, "weight_decay": 0.1, "differentiable": True},
+                *state.values(),
+            ),
         )
 
     def test_adagrad(self):
@@ -3083,12 +4137,18 @@ def test_adagrad(self):
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         # `step` is not a continuous variable (even though we define it as a float)
         # and so it shouldn't require gradients.
-        state['step'] = torch.tensor(10., requires_grad=False, dtype=torch.float64)
-        state['sum'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64)
+        state["sum"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.Adagrad,
-             {'lr': 0.9, 'weight_decay': 0.1, 'differentiable': True}, *state.values())
+            (
+                p,
+                grad,
+                state,
+                torch.optim.Adagrad,
+                {"lr": 0.9, "weight_decay": 0.1, "differentiable": True},
+                *state.values(),
+            ),
         )
 
     def test_adamax(self):
@@ -3097,13 +4157,19 @@ def test_adamax(self):
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         # `step` is not a continuous variable (even though we define it as a float)
         # and so it shouldn't require gradients.
-        state['step'] = torch.tensor(10., requires_grad=False, dtype=torch.float64)
-        state['exp_avg'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['exp_inf'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64)
+        state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["exp_inf"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.Adamax,
-             {'lr': 0.9, 'weight_decay': 0.1, 'differentiable': True}, *state.values())
+            (
+                p,
+                grad,
+                state,
+                torch.optim.Adamax,
+                {"lr": 0.9, "weight_decay": 0.1, "differentiable": True},
+                *state.values(),
+            ),
         )
 
     def test_asgd(self):
@@ -3112,15 +4178,21 @@ def test_asgd(self):
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         # `step` `eta` & `mu` are not continuous variables (even though we define them as a float)
         # and so it shouldn't require gradients.
-        state['step'] = torch.tensor(10., requires_grad=False, dtype=torch.float64)
-        state['eta'] = torch.tensor(0.9, requires_grad=False, dtype=torch.float64)
-        state['mu'] = torch.tensor(1.0, requires_grad=False, dtype=torch.float64)
-        state['ax'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64)
+        state["eta"] = torch.tensor(0.9, requires_grad=False, dtype=torch.float64)
+        state["mu"] = torch.tensor(1.0, requires_grad=False, dtype=torch.float64)
+        state["ax"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
 
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.ASGD,
-             {'lr': 0.9, 'differentiable': True}, *state.values())
+            (
+                p,
+                grad,
+                state,
+                torch.optim.ASGD,
+                {"lr": 0.9, "differentiable": True},
+                *state.values(),
+            ),
         )
 
     def test_rprop(self):
@@ -3129,32 +4201,45 @@ def test_rprop(self):
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         # `step` is not a continuous variable (even though we define it as a float)
         # and so it shouldn't require gradients.
-        state['step'] = torch.tensor(10., requires_grad=False, dtype=torch.float64)
-        state['prev'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['step_size'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64)
+        state["prev"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step_size"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
 
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.Rprop,
-             {'lr': 0.9, 'differentiable': True}, *state.values())
+            (
+                p,
+                grad,
+                state,
+                torch.optim.Rprop,
+                {"lr": 0.9, "differentiable": True},
+                *state.values(),
+            ),
         )
 
-
     def test_adamw(self):
         state = {}
         p = torch.rand(10, requires_grad=True, dtype=torch.float64)
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         # `step` is not a continuous variable (even though we define it as a float)
         # and so it shouldn't require gradients.
-        state['step'] = torch.tensor(10., requires_grad=False, dtype=torch.float64)
-        state['exp_avg'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['exp_avg_sq'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['max_exp_avg_sq'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64)
+        state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["max_exp_avg_sq"] = torch.rand(
+            10, requires_grad=True, dtype=torch.float64
+        )
 
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.AdamW,
-             {'lr': 0.9, 'differentiable': True, 'amsgrad': True}, *state.values())
+            (
+                p,
+                grad,
+                state,
+                torch.optim.AdamW,
+                {"lr": 0.9, "differentiable": True, "amsgrad": True},
+                *state.values(),
+            ),
         )
 
     def test_nadam(self):
@@ -3163,15 +4248,21 @@ def test_nadam(self):
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         # `step` is not a continuous variable (even though we define it as a float)
         # and so it shouldn't require gradients.
-        state['step'] = torch.tensor(10., requires_grad=False, dtype=torch.float64)
-        state['exp_avg'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['exp_avg_sq'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['mu_product'] = torch.tensor(1.0, requires_grad=True, dtype=torch.float64)
+        state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64)
+        state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["mu_product"] = torch.tensor(1.0, requires_grad=True, dtype=torch.float64)
 
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.NAdam,
-             {'lr': 0.9, 'differentiable': True}, *state.values())
+            (
+                p,
+                grad,
+                state,
+                torch.optim.NAdam,
+                {"lr": 0.9, "differentiable": True},
+                *state.values(),
+            ),
         )
 
     def test_radam(self):
@@ -3180,16 +4271,22 @@ def test_radam(self):
         grad = torch.rand(10, requires_grad=True, dtype=torch.float64)
         # `step` is not a continuous variable (even though we define it as a float)
         # and so it shouldn't require gradients.
-        state['step'] = torch.tensor(10., requires_grad=False, dtype=torch.float64)
-        state['exp_avg'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
-        state['exp_avg_sq'] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["step"] = torch.tensor(10.0, requires_grad=False, dtype=torch.float64)
+        state["exp_avg"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
+        state["exp_avg_sq"] = torch.rand(10, requires_grad=True, dtype=torch.float64)
 
         gradcheck(
             _diff_fn,
-            (p, grad, state, torch.optim.RAdam,
-             {'lr': 0.9, 'differentiable': True}, *state.values())
+            (
+                p,
+                grad,
+                state,
+                torch.optim.RAdam,
+                {"lr": 0.9, "differentiable": True},
+                *state.values(),
+            ),
         )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     run_tests()
diff --git a/torch/_dynamo/skipfiles.py b/torch/_dynamo/skipfiles.py
index ee2ad3f9395ff..41a04626756d2 100644
--- a/torch/_dynamo/skipfiles.py
+++ b/torch/_dynamo/skipfiles.py
@@ -122,16 +122,6 @@ def _module_dir(m: types.ModuleType):
     torch.set_rng_state.__code__.co_filename,
 }
 
-# Include optimizer code for tracing
-FILENAME_ALLOWLIST |= set(
-    [
-        inspect.getfile(obj)
-        for obj in torch.optim.__dict__.values()
-        if inspect.isclass(obj)
-    ]
-)
-
-FILENAME_ALLOWLIST |= {torch.optim._functional.__file__}
 
 if HAS_PRIMS_REFS:
     FILENAME_ALLOWLIST |= {
@@ -143,7 +133,6 @@ def _module_dir(m: types.ModuleType):
         torch._refs.nn.functional.__file__,
     }
 
-FILENAME_ALLOWLIST |= {torch.optim._functional.__file__}
 
 SKIP_DIRS_RE = None
 

From 0286965ecfb842611140d5a5a2cb3d30ffa02335 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Thu, 24 Nov 2022 05:28:58 +0000
Subject: [PATCH 1248/1922] [Dynamo] Fix bug of using customized
 torch.autograd.Function (#89397)

Fixes https://github.com/pytorch/torchdynamo/issues/1899

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89397
Approved by: https://github.com/jansel
---
 test/dynamo/test_misc.py        | 55 ++++++++++++++++++++++-----------
 torch/_dynamo/variables/misc.py | 16 ++++++++--
 2 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 109118c68b6f6..782c166b15bfd 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2747,21 +2747,12 @@ def fn():
         self.assertTrue(same(ref, res))
 
     def test_autograd_function_equivalence(self):
-        m1 = Module1()
-
-        @torch._dynamo.optimize("eager", nopython=True)
-        def f1():
-            return m1(torch.ones(2, 3))
-
-        self.assertTrue(torch.allclose(f1(), torch.tensor([2.0])))
-
-        m2 = Module2()
-
-        @torch._dynamo.optimize("eager", nopython=True)
-        def f2():
-            return m2(torch.ones(2, 3))
-
-        self.assertTrue(torch.allclose(f2(), torch.tensor([2.0])))
+        for i in range(1, 5):
+            model = globals()[f"Module{i}"]()
+            opt_model = torch._dynamo.optimize("eager", nopython=True)(model)
+            self.assertTrue(
+                torch.allclose(opt_model(torch.ones(2, 3)), torch.tensor([2.0]))
+            )
 
     def test_object_classmethod(self):
         class C:
@@ -3048,7 +3039,7 @@ def fn(x):
         self.assertTrue(same(ref, res))
 
 
-class CustomFunc(torch.autograd.Function):
+class CustomFunc1(torch.autograd.Function):
     @staticmethod
     def forward(ctx, foo):
         return foo + foo
@@ -3058,18 +3049,46 @@ def backward(ctx, grad_output):
         return grad_output
 
 
+class CustomFunc2(torch.autograd.Function):
+    # the forward function can be staticmethod or classmethod
+    @classmethod
+    def forward(cls, ctx, foo):
+        return foo + foo
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output
+
+
 class Module1(torch.nn.Module):
     def __init__(self):
         super().__init__()
 
     def forward(self, foo):
-        return CustomFunc().apply(foo)
+        return CustomFunc1().apply(foo)
 
 
 class Module2(torch.nn.Module):
     def __init__(self):
         super().__init__()
-        self.fn = CustomFunc.apply
+        self.fn = CustomFunc1.apply
+
+    def forward(self, foo):
+        return self.fn(foo)
+
+
+class Module3(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, foo):
+        return CustomFunc2().apply(foo)
+
+
+class Module4(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.fn = CustomFunc2.apply
 
     def forward(self, foo):
         return self.fn(foo)
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index f8975f70fcfb3..7e1c91b68c41f 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -459,9 +459,19 @@ def visit(node):
 
         args = [BlackHoleVariable()] + list(args)
         options = VariableTracker.propagate(self, args, kwargs.values())
-        return variables.UserFunctionVariable(
-            self.fn_cls.forward, **options
-        ).call_function(tx, args, kwargs)
+        fn = self.fn_cls.forward
+        if isinstance(fn, types.FunctionType):
+            return variables.UserFunctionVariable(fn, **options).call_function(
+                tx, args, kwargs
+            )
+        elif isinstance(fn, types.MethodType):
+            return variables.UserMethodVariable(
+                fn.__func__, variables.UserDefinedClassVariable(self.fn_cls), **options
+            ).call_function(tx, args, kwargs)
+        else:
+            unimplemented(
+                f"non-function or method in subclass of torch.autograd.Function: {fn}"
+            )
 
     def call_function(self, tx, args, kwargs):
         options = VariableTracker.propagate(self, args, kwargs.values())

From 786c7faf491e24b5ede1970f55b0b98b68d9f8bb Mon Sep 17 00:00:00 2001
From: Vasiliy Kuznetsov <vasiliy@fb.com>
Date: Wed, 23 Nov 2022 13:01:15 -0800
Subject: [PATCH 1249/1922] quantization: deprecate observer compute_dtype and
 replace with is_dynamic (#85431)

Summary:

This PR deprecates the `compute_dtype` field on observers, and replaces
it with the `is_dynamic` field on observers.  This is better aligned
with the reference model spec.

Test plan:

```
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85431
Approved by: https://github.com/jerryzh168
---
 .../fx/_lower_to_native_backend.py            | 16 +++---
 torch/ao/quantization/fx/convert.py           | 53 ++++++++++---------
 torch/ao/quantization/fx/prepare.py           |  7 ++-
 .../quantization/fx/qconfig_mapping_utils.py  |  5 +-
 torch/ao/quantization/observer.py             | 21 +++++---
 torch/ao/quantization/qconfig.py              |  2 +-
 torch/ao/quantization/utils.py                | 19 ++++---
 .../quantization/insert_quant_dequant.cpp     | 24 ++-------
 8 files changed, 71 insertions(+), 76 deletions(-)

diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py
index f08f5c6073b7a..93c3d07e18805 100644
--- a/torch/ao/quantization/fx/_lower_to_native_backend.py
+++ b/torch/ao/quantization/fx/_lower_to_native_backend.py
@@ -284,7 +284,7 @@ def should_skip_lowering(op: torch.fx.node.Node, qconfig_map: Dict[str, QConfigA
 }
 
 # Mapping from a functional to a dictionary, where the key is a 2-tuple of
-# (activation_compute_dtype, weight_dtype) and the value is a 2-tuple of
+# (input_activation_dtype, weight_dtype) and the value is a 2-tuple of
 #   1) The dynamically quantized version of the op
 #   2) The dynamically quantized version of the op fused with relu, if it exists, else None
 DYNAMIC_LOWER_FUNCTIONAL_MAP: Dict[Callable, Dict[Tuple[torch.dtype, torch.dtype], Tuple[Callable, Optional[Callable]]]] = {
@@ -537,9 +537,9 @@ def _lower_dynamic_weighted_ref_module(model: QuantizedGraphModule):
            input_dynamic_q_node.target != torch.quantize_per_tensor_dynamic:
             continue
 
-        activation_compute_dtype = input_dynamic_q_node.args[1]
-        is_fp16 = activation_compute_dtype == torch.float16
-        is_int8 = activation_compute_dtype in [torch.quint8, torch.qint8]
+        activation_dtype = input_dynamic_q_node.args[1]
+        is_fp16 = activation_dtype == torch.float16
+        is_int8 = activation_dtype in [torch.quint8, torch.qint8]
         if not is_int8 and not is_fp16:
             continue
 
@@ -692,9 +692,9 @@ def _lower_dynamic_weighted_ref_functional(
             continue
 
         reduce_range_node = None
-        (pattern_input, activation_compute_dtype, reduce_range_node) = input_dynamic_q_node.args
-        is_fp16 = activation_compute_dtype == torch.float16
-        is_int8 = activation_compute_dtype in [torch.quint8, torch.qint8]
+        (pattern_input, activation_dtype, reduce_range_node) = input_dynamic_q_node.args
+        is_fp16 = activation_dtype == torch.float16
+        is_int8 = activation_dtype in [torch.quint8, torch.qint8]
         if not is_int8 and not is_fp16:
             continue
 
@@ -702,7 +702,7 @@ def _lower_dynamic_weighted_ref_functional(
         weight_dtype = quantized_weight.args[-1]
 
         # Step 1: Try to select reference pattern with the corresponding quantized op
-        dynamic_quant_dtype_key = (activation_compute_dtype, weight_dtype)
+        dynamic_quant_dtype_key = (activation_dtype, weight_dtype)
         if dynamic_quant_dtype_key not in DYNAMIC_LOWER_FUNCTIONAL_MAP[func_node.target]:
             print(f"Didn't find dtype combination {dynamic_quant_dtype_key} during "
                   f"dynamic quantized op lowering for {func_node.target}")
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index 58846d116ff65..f677e0eedc666 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Set, Tuple, Union, Type
+from typing import Any, Dict, List, Optional, Set, Tuple, Union, Type, Callable
 from torch.ao.quantization.quant_type import QuantType
 import torch
 import copy
@@ -131,11 +131,13 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
     # 1. extract the information from activation_post_process module for generating
     # the quantize and dequantize operator
     dtype = activation_post_process.dtype  # type: ignore[attr-defined]
-    compute_dtype = None
-    if hasattr(activation_post_process, "compute_dtype"):
-        compute_dtype = activation_post_process.compute_dtype  # type: ignore[attr-defined]
+
+    is_dynamic = False
+    if hasattr(activation_post_process, "is_dynamic"):
+        is_dynamic = activation_post_process.is_dynamic  # type: ignore[assignment]
+
     if dtype in [torch.quint8, torch.qint8, torch.qint32] and \
-            not hasattr(activation_post_process, 'compute_dtype'):
+            (not is_dynamic):
         # TODO: probably should cleanup this condition check, it's hard
         # to reason about this if and the following elif
 
@@ -144,9 +146,9 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
         # 1. extract information for inserting q/dq node from activation_post_process
         node_type = "call_function"
         quantize_op : Optional[Callable] = None
-        scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined]
+        scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined, operator]
         if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
-            ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined]
+            ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined, arg-type]
             quantize_op = torch.ops.quantized_decomposed.quantize_per_channel
             dequantize_op = torch.ops.quantized_decomposed.dequantize_per_channel
             quant_min = activation_post_process.quant_min
@@ -203,8 +205,7 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
             )
             node.replace_all_uses_with(dequantized_node)
             graph.erase_node(node)
-    elif compute_dtype in [torch.quint8, torch.qint8, torch.float16]:
-        # TODO(future PR): switch compute_dtype to is_dynamic
+    elif is_dynamic:
 
         # uint8/int8/fp16 dynamic quantization
 
@@ -339,12 +340,13 @@ def _replace_observer_with_quantize_dequantize_node(
 
     # otherwise, we can convert the activation_post_process module call to quantize/dequantize node
     dtype = activation_post_process.dtype  # type: ignore[attr-defined]
-    compute_dtype = None
-    if hasattr(activation_post_process, "compute_dtype"):
-        compute_dtype = activation_post_process.compute_dtype  # type: ignore[attr-defined]
+
+    is_dynamic = False
+    if hasattr(activation_post_process, "is_dynamic"):
+        is_dynamic = activation_post_process.is_dynamic  # type: ignore[attr-defined, assignment]
 
     if dtype in [torch.quint8, torch.qint8, torch.qint32] and \
-            not hasattr(activation_post_process, "compute_dtype"):
+            (not is_dynamic):
         # TODO: probably should cleanup this condition check, it's hard
         # to reason about this if and the following elif
 
@@ -354,9 +356,9 @@ def _replace_observer_with_quantize_dequantize_node(
         # the quantize and dequantize operator
         node_type = "call_function"
         quantize_op : Optional[Callable] = None
-        scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined]
+        scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined, operator]
         if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
-            ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined]
+            ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined, arg-type]
             qparams = {"_scale_": scale, "_zero_point_": zero_point, "_axis_": ch_axis, "_dtype_": dtype}
             quantize_op = torch.quantize_per_channel
         else:
@@ -386,8 +388,7 @@ def _replace_observer_with_quantize_dequantize_node(
             dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
             node.replace_all_uses_with(dequantized_node)
             graph.erase_node(node)
-    elif compute_dtype in [torch.quint8, torch.qint8, torch.float16]:
-        # TODO(future PR): switch compute_dtype to is_dynamic
+    elif is_dynamic:
 
         # uint8/int8/fp16 dynamic quantization branch
 
@@ -396,7 +397,7 @@ def _replace_observer_with_quantize_dequantize_node(
         # TODO: get reduce range from observer
         # reduce_range = activation_post_process.reduce_range
         reduce_range = torch.backends.quantized.engine in ("fbgemm", "x86")
-        qparams = {"_dtype_": compute_dtype, "_reduce_range_": reduce_range}
+        qparams = {"_dtype_": dtype, "_reduce_range_": reduce_range}
 
         with graph.inserting_before(node):
             input_node = node.args[0]
@@ -410,7 +411,7 @@ def _replace_observer_with_quantize_dequantize_node(
             graph.erase_node(node)
     elif dtype == torch.float16:
         node_type = "call_method"
-        quantize_op = "to"
+        quantize_op = "to"  # type: ignore[assignment]
         qparams = {"_dtype_": dtype}
         with graph.inserting_before(node):
             input_node = node.args[0]
@@ -443,12 +444,16 @@ def _replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Gr
 
 def _is_conversion_supported(activation_post_process: torch.nn.Module) -> bool:
     dtype = activation_post_process.dtype  # type: ignore[attr-defined]
-    compute_dtype = None
-    if hasattr(activation_post_process, "compute_dtype"):
-        compute_dtype = activation_post_process.compute_dtype  # type: ignore[attr-defined]
-    return (dtype in [torch.quint8, torch.qint8, torch.qint32] and compute_dtype is None) or \
-        compute_dtype in [torch.quint8, torch.qint8, torch.float16] or \
+
+    is_dynamic = False
+    if hasattr(activation_post_process, "is_dynamic"):
+        is_dynamic = activation_post_process.is_dynamic  # type: ignore[attr-defined, assignment]
+
+    return (
+        (dtype in [torch.quint8, torch.qint8, torch.qint32] and (not is_dynamic)) or  # type: ignore[return-value]
+        is_dynamic or
         dtype == torch.float16
+    )
 
 def restore_state(
         observed: torch.nn.Module
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index c908e3f3b7644..932b40e03e0f7 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -407,9 +407,8 @@ def get_target_activation_dtype_for_node(
         # get qconfig to determine the eventual dtype of this node
         if qconfig is not None:
             if qhandler is not None and qhandler.input_output_observed():
-                act_dtype, weight_dtype, act_compute_dtype = \
+                act_dtype, weight_dtype, input_act_is_dynamic = \
                     get_qconfig_dtypes(qconfig)
-                input_act_is_dynamic = act_compute_dtype is not None
 
                 # Currently `QConfig` only has one `activation` field.
                 # For static quantization, it is reused for both input
@@ -419,13 +418,13 @@ def get_target_activation_dtype_for_node(
                 # In the future this may change as we add more fields
                 # to the `QConfig` object.
                 output_act_dtype = act_dtype \
-                    if input_act_is_dynamic is not True else torch.float
+                    if (not input_act_is_dynamic) else torch.float
 
                 bias_dtype = torch.float16 \
                     if (
                         act_dtype == torch.float16
                         and weight_dtype == torch.float16
-                        and act_compute_dtype is None
+                        and (not input_act_is_dynamic)
                     ) else torch.float
                 return {
                     "input_activation_dtype": (act_dtype, input_act_is_dynamic),
diff --git a/torch/ao/quantization/fx/qconfig_mapping_utils.py b/torch/ao/quantization/fx/qconfig_mapping_utils.py
index 0b0407c0b106e..6ccc8d07f64e0 100644
--- a/torch/ao/quantization/fx/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -242,7 +242,7 @@ def is_qconfig_supported_by_dtype_configs(qconfig: QConfig, dtype_configs: List[
         weight_dtype = dtype_config.weight_dtype or torch.float
         bias_dtype = dtype_config.bias_dtype or torch.float
         output_dtype = dtype_config.output_dtype or torch.float
-        qconfig_activation_dtype, qconfig_weight_dtype, qconfig_compute_dtype = \
+        qconfig_activation_dtype, qconfig_weight_dtype, qconfig_input_act_is_dynamic = \
             get_qconfig_dtypes(qconfig)
         qconfig_bias_dtype = torch.float16 \
             if (
@@ -252,7 +252,8 @@ def is_qconfig_supported_by_dtype_configs(qconfig: QConfig, dtype_configs: List[
             ) else torch.float
 
         if is_dynamic:
-            is_match = input_dtype == qconfig_compute_dtype and \
+            is_match = qconfig_input_act_is_dynamic and \
+                input_dtype == qconfig_activation_dtype and \
                 output_dtype == torch.float and \
                 weight_dtype == qconfig_weight_dtype
         else:
diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py
index ea2a26bf3896d..f8683024cee52 100644
--- a/torch/ao/quantization/observer.py
+++ b/torch/ao/quantization/observer.py
@@ -1320,15 +1320,17 @@ class PlaceholderObserver(ObserverBase):
         quant_min: maximum value in quantized domain
         custom_op_name: (temporary) specify this observer for an operator that doesn't require any observation
                         (Can be used in Graph Mode Passes for special case ops).
-        compute_dtype: if set, marks the future quantize function to use
+        compute_dtype (deprecated): if set, marks the future quantize function to use
                        dynamic quantization instead of static quantization.
-                       Note: this field will be removed in the near future and
-                       replaced with `is_dynamic`.
+                       This field is deprecated, use `is_dynamic=True` instead.
+        is_dynamic: if True, the `quantize` function in the reference model
+                    representation taking stats from this observer instance will
+                    use dynamic quantization.
     """
 
     def __init__(
         self, dtype=torch.float32, custom_op_name="", compute_dtype=None,
-        quant_min=None, quant_max=None,
+        quant_min=None, quant_max=None, is_dynamic=False,
     ) -> None:
         super().__init__(dtype=dtype)
         # dtype of input of the target operator, e.g. for dynamic quantization
@@ -1338,9 +1340,14 @@ def __init__(
         self.quant_max = quant_max
         self.custom_op = custom_op_name
         # used for configuration of computation type for dynamic quantization
-        # TODO(future PR): replace this with `is_dynamic`
         if compute_dtype:
-            self.compute_dtype = compute_dtype
+            is_dynamic = True
+            warnings.warn(
+                "Please use `is_dynamic` instead of `compute_dtype`. \
+                    `compute_dtype` will be deprecated in a future release \
+                    of PyTorch."
+            )
+        self.is_dynamic = is_dynamic
 
     def forward(self, x):
         return x
@@ -1556,7 +1563,7 @@ def load_observer_state_dict(mod, obs_dict):
 """
 
 default_dynamic_quant_observer = PlaceholderObserver.with_args(
-    dtype=torch.quint8, compute_dtype=torch.quint8, quant_min=0, quant_max=255
+    dtype=torch.quint8, quant_min=0, quant_max=255, is_dynamic=True,
 )
 """
 Default observer for dynamic quantization.
diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py
index f52bf713c6f9b..09fa02ff3ddb2 100644
--- a/torch/ao/quantization/qconfig.py
+++ b/torch/ao/quantization/qconfig.py
@@ -152,7 +152,7 @@ def __new__(cls, activation=torch.nn.Identity, weight=torch.nn.Identity):
 Default dynamic qconfig.
 """
 
-float16_dynamic_qconfig = QConfig(activation=PlaceholderObserver.with_args(dtype=torch.float16, compute_dtype=torch.float16),
+float16_dynamic_qconfig = QConfig(activation=PlaceholderObserver.with_args(dtype=torch.float16, is_dynamic=True),
                                   weight=PlaceholderObserver.with_args(dtype=torch.float16))
 """
 Dynamic qconfig with weights quantized to `torch.float16`.
diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py
index 662d0068fef4f..984386d205042 100644
--- a/torch/ao/quantization/utils.py
+++ b/torch/ao/quantization/utils.py
@@ -221,9 +221,9 @@ def activation_is_dynamically_quantized(qconfig):
     dynamically quantized or not, this includes dynamically quantizing to
     quint8, qint8 and float16
     """
-    activation_dtype, _, activation_compute_dtype = \
+    activation_dtype, _, activation_is_dynamic = \
         get_qconfig_dtypes(qconfig)
-    return activation_compute_dtype in [torch.quint8, torch.qint8, torch.float16]
+    return activation_is_dynamic
 
 def activation_is_int8_quantized(qconfig):
     """ Given a qconfig, decide if the activation needs to be
@@ -253,25 +253,24 @@ def op_is_int8_dynamically_quantized(qconfig) -> bool:
     """ Given a qconfig, returns True if this op is using int8 dynamic
     quantization
     """
-    activation_dtype, weight_dtype, activation_compute_dtype = \
+    activation_dtype, weight_dtype, activation_is_dynamic = \
         get_qconfig_dtypes(qconfig)
     return (
         activation_dtype is torch.quint8 and
         # for now, the lines below assume fbgemm or qnnpack
         weight_dtype is torch.qint8 and
-        activation_compute_dtype is torch.quint8
-        # TODO(future PR): add is_dynamic
+        activation_is_dynamic
     )
 
 def get_qconfig_dtypes(qconfig):
     r""" returns the qconfig tuple for qconfig:
-    (activation_dtype, weight_dtype, activation_compute_dtype)
+    (activation_dtype, weight_dtype, activation_is_dynamic)
     """
     assert qconfig is not None
     activation = qconfig.activation()
     weight = qconfig.weight()
-    compute_dtype = activation.compute_dtype if hasattr(activation, 'compute_dtype') else None
-    return (activation.dtype, weight.dtype, compute_dtype)
+    act_is_dynamic = activation.is_dynamic if hasattr(activation, 'is_dynamic') else False
+    return (activation.dtype, weight.dtype, act_is_dynamic)
 
 def get_quant_type(qconfig):
     assert qconfig is not None
@@ -279,7 +278,7 @@ def get_quant_type(qconfig):
     weight = qconfig.weight()
     static_dtypes = [torch.quint8, torch.qint8, torch.quint4x2, torch.qint32]
     if weight.dtype in static_dtypes:
-        if hasattr(activation, 'compute_dtype') and activation.compute_dtype in static_dtypes:
+        if hasattr(activation, 'is_dynamic') and activation.is_dynamic:
             return QuantType.DYNAMIC
         elif activation.dtype in static_dtypes:
             return QuantType.STATIC
@@ -287,7 +286,7 @@ def get_quant_type(qconfig):
             return QuantType.WEIGHT_ONLY
 
     if weight.dtype == torch.float16:
-        if hasattr(activation, 'compute_dtype') and activation.compute_dtype in static_dtypes:
+        if hasattr(activation, 'is_dynamic') and activation.is_dynamic:
             return QuantType.DYNAMIC
         elif activation.dtype == torch.float16:
             return QuantType.STATIC
diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
index 3270ef4ced82e..c852696c62d78 100644
--- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
+++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
@@ -257,19 +257,6 @@ at::ScalarType getObserverDtype(Module& module, Value* v) {
   return at::ScalarType::Undefined;
 }
 
-at::ScalarType getObserverComputeDtype(Module& module, Value* v) {
-  auto observer_name = findObserverName(v);
-  if (observer_name.has_value()) {
-    auto observer_module = module.attr(observer_name.value()).toModule();
-    if (observer_module.hasattr("compute_dtype")) {
-      at::ScalarType scalar_type =
-          observer_module.attr("compute_dtype").toScalarType();
-      return scalar_type;
-    }
-  }
-  return at::ScalarType::Undefined;
-}
-
 c10::optional<std::string> getEmbeddingBagObsName(
     script::Module& module,
     Node* n) {
@@ -480,12 +467,8 @@ void insertQuantizationOps(
       dequant = insertFP16CastOps(g, observer_out);
     } else if (!isWeight(module, observer_out)) {
       auto observer_dtype = getObserverDtype(module, observer_out);
-      auto observer_compute_dtype =
-          getObserverComputeDtype(module, observer_out);
       if (observer_dtype == at::ScalarType::QUInt8 ||
-          observer_dtype == at::ScalarType::QInt8 ||
-          observer_compute_dtype == at::ScalarType::QUInt8 ||
-          observer_compute_dtype == at::ScalarType::QInt8) {
+          observer_dtype == at::ScalarType::QInt8) {
         // For activation tensors we insert choose_qparams, quant, dequant ops.
         Value* dtype = g->insertGetAttr(self, qparams.back());
         std::tie(choose_qparams, quant, dequant) =
@@ -1092,9 +1075,10 @@ std::tuple<c10::QScheme, QParamVector> InsertQuantDeQuantHelper::
   auto scalar_type = observer_module.attr("dtype");
   if (isPlaceholderObserver(n->input(0))) {
     // get compute_dtype for dynamic quantization
-    if (observer_module.hasattr("compute_dtype")) {
+    if (observer_module.hasattr("is_dynamic") &&
+        observer_module.attr("is_dynamic").toBool()) {
       qparams.push_back(
-          std::make_pair(kScalarType, observer_module.attr("compute_dtype")));
+          std::make_pair(kScalarType, observer_module.attr("dtype")));
     }
     return std::make_tuple(qscheme, qparams);
   } else if (scalar_type == at::ScalarType::Half) {

From 76eecd5ec51feb44f92dd60dba1618896a54e775 Mon Sep 17 00:00:00 2001
From: Hao Guan <10684225+hguandl@users.noreply.github.com>
Date: Thu, 24 Nov 2022 08:14:24 +0000
Subject: [PATCH 1250/1922] [QAT] Check the value of numel to avoid segfault
 (#81547)

Fixes #78123

### Original Result

Segmentation fault

### Result after fix

RuntimeError: numel is out of the bound of input tensor
Pull Request resolved: https://github.com/pytorch/pytorch/pull/81547
Approved by: https://github.com/kit1980
---
 aten/src/ATen/native/quantized/QTensor.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp
index 5a9bbfb387e43..b3ff8bd8b3274 100644
--- a/aten/src/ATen/native/quantized/QTensor.cpp
+++ b/aten/src/ATen/native/quantized/QTensor.cpp
@@ -330,6 +330,10 @@ std::tuple<Tensor, Tensor> choose_qparams_optimized(
     const double ratio,
     int64_t bit_width) {
 
+  if (numel < 0 || numel > input_tensor.numel()) {
+    TORCH_CHECK(false, "numel is out of the bound of input tensor");
+  }
+
   TORCH_CHECK(numel <= input_tensor.numel(), "numel ", numel,
       " greater than input_tensor.numel() ", input_tensor.numel());
   const float* input_row = input_tensor.data_ptr<float>();

From ba78b552c90b32af2dcc3f420c9974d2e7215fbc Mon Sep 17 00:00:00 2001
From: Ivan Yashchuk <ivan.yashchuk@aalto.fi>
Date: Thu, 24 Nov 2022 09:37:10 +0000
Subject: [PATCH 1251/1922] Use fx.replace_pattern for removing empty_like+fill
 in nvFuser+PrimTorch execution (#89132)

I learned about `torch.fx.replace_pattern` and it's a cleaner way of removing unnecessary tensor materialization from the graph coming from tracing  C++ code `1 - tensor`.

Test:
```
python -m pytest test/test_prims.py -k "test_silu_backward_no_filled_tensor"
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89132
Approved by: https://github.com/mruberry, https://github.com/jjsjann123
---
 torch/_prims/nvfuser_executor.py | 36 +++++++++++++++-----------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py
index a155433231e11..b44f7653ee81d 100644
--- a/torch/_prims/nvfuser_executor.py
+++ b/torch/_prims/nvfuser_executor.py
@@ -352,25 +352,23 @@ def _remove_empty_like_fill(gm: GraphModule):
     # This is a workaround for nonoptimal traces of C++ code `(1 - tensor)`
     # https://github.com/pytorch/pytorch/issues/86612
 
-    # Here when we see a `sub` node, we check if the first input is a result of
-    # filling a tensor with a scalar
-    # If so, we replace the first argument of the `sub` node with a scalar
-    for node in gm.graph.nodes:
-        if node.op == "call_function":
-            if node.target == torch.ops.nvprims.sub.default:
-                # check if the first argument is a fill
-                if (
-                    isinstance(node.args[0], torch.fx.Node)
-                    and node.args[0].op == "call_function"
-                    and node.args[0].target == torch.ops.aten.fill.Scalar
-                ):
-                    # Replace the first argument with the second argument of fill
-                    # aten.fill.Scalar(tensor, scalar)
-                    fill_node = node.args[0]
-                    scalar = fill_node.args[1]
-                    node.args = (scalar, *node.args[1:])
-    gm.graph.eliminate_dead_code()
-    gm.recompile()
+    def pattern(scalar, tensor):
+        # pattern for C++ trace of `scalar - tensor`. We are looking for the
+        # pattern of aten and nvprims.sub specifically because we want to remove
+        # the empty_like + fill nodes after lowering of AOT Autograd trace to
+        # nvprims In the future, nvFuser might support fill, and empty_like and
+        # this workaround can be removed.
+        empty_like = torch.ops.aten.empty_like.default(
+            tensor, memory_format=torch.preserve_format
+        )
+        fill = torch.ops.aten.fill.Scalar(empty_like, scalar)
+        sub = torch.ops.nvprims.sub.default(fill, tensor)
+        return sub
+
+    def replacement(scalar, tensor):
+        return torch.ops.nvprims.sub.default(scalar, tensor)
+
+    torch.fx.replace_pattern(gm, pattern, replacement)
     return gm
 
 
From 4d7a4d4ce613fbf7f5d7910ddc946506f0cc2e0e Mon Sep 17 00:00:00 2001
From: Jithun Nair <jithun.nair@amd.com>
Date: Thu, 24 Nov 2022 10:53:20 +0000
Subject: [PATCH 1252/1922] Upgrade nightly wheels to ROCm5.3 (#89101)

Dependent on PR https://github.com/pytorch/builder/pull/1193

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89101
Approved by: https://github.com/kit1980
---
 .../scripts/generate_binary_build_matrix.py   |   2 +-
 ...inux-binary-libtorch-cxx11-abi-nightly.yml | 144 ++++-----
 ...inux-binary-libtorch-pre-cxx11-nightly.yml | 144 ++++-----
 ...nerated-linux-binary-manywheel-nightly.yml | 288 +++++++++---------
 4 files changed, 289 insertions(+), 289 deletions(-)

diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py
index 4031ee9aacca6..deb225287b3f5 100644
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@@ -16,7 +16,7 @@
 CUDA_ARCHES = ["11.6", "11.7"]
 
 
-ROCM_ARCHES = ["5.1.1", "5.2"]
+ROCM_ARCHES = ["5.2", "5.3"]
 
 
 def arch_type(arch_version: str) -> str:
diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
index f9ab6798787fb..6b1765b9a405d 100644
--- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
@@ -780,7 +780,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-build:
+  libtorch-rocm5_2-shared-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -789,20 +789,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_2-shared-with-deps-cxx11-abi
       build_environment: linux-binary-libtorch-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-test:  # Testing
+  libtorch-rocm5_2-shared-with-deps-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-build
+    needs: libtorch-rocm5_2-shared-with-deps-cxx11-abi-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -811,11 +811,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
     steps:
@@ -864,7 +864,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi
+          name: libtorch-rocm5_2-shared-with-deps-cxx11-abi
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -895,7 +895,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm5.1.1
+          docker-image: pytorch/libtorch-cxx11-builder:rocm5.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -906,29 +906,29 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-upload:  # Uploading
+  libtorch-rocm5_2-shared-with-deps-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi-test
+    needs: libtorch-rocm5_2-shared-with-deps-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_1_1-shared-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_2-shared-with-deps-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_1_1-static-with-deps-cxx11-abi-build:
+  libtorch-rocm5_2-static-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -937,20 +937,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_1_1-static-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_2-static-with-deps-cxx11-abi
       build_environment: linux-binary-libtorch-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_1_1-static-with-deps-cxx11-abi-test:  # Testing
+  libtorch-rocm5_2-static-with-deps-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_1_1-static-with-deps-cxx11-abi-build
+    needs: libtorch-rocm5_2-static-with-deps-cxx11-abi-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -959,11 +959,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
     steps:
@@ -1012,7 +1012,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_1_1-static-with-deps-cxx11-abi
+          name: libtorch-rocm5_2-static-with-deps-cxx11-abi
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1043,7 +1043,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm5.1.1
+          docker-image: pytorch/libtorch-cxx11-builder:rocm5.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1054,29 +1054,29 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  libtorch-rocm5_1_1-static-with-deps-cxx11-abi-upload:  # Uploading
+  libtorch-rocm5_2-static-with-deps-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_1_1-static-with-deps-cxx11-abi-test
+    needs: libtorch-rocm5_2-static-with-deps-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_1_1-static-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_2-static-with-deps-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_2-shared-with-deps-cxx11-abi-build:
+  libtorch-rocm5_3-shared-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1085,20 +1085,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_2-shared-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_3-shared-with-deps-cxx11-abi
       build_environment: linux-binary-libtorch-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_2-shared-with-deps-cxx11-abi-test:  # Testing
+  libtorch-rocm5_3-shared-with-deps-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-shared-with-deps-cxx11-abi-build
+    needs: libtorch-rocm5_3-shared-with-deps-cxx11-abi-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1107,11 +1107,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
     steps:
@@ -1160,7 +1160,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_2-shared-with-deps-cxx11-abi
+          name: libtorch-rocm5_3-shared-with-deps-cxx11-abi
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1191,7 +1191,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm5.2
+          docker-image: pytorch/libtorch-cxx11-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1202,29 +1202,29 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  libtorch-rocm5_2-shared-with-deps-cxx11-abi-upload:  # Uploading
+  libtorch-rocm5_3-shared-with-deps-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-shared-with-deps-cxx11-abi-test
+    needs: libtorch-rocm5_3-shared-with-deps-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_2-shared-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_3-shared-with-deps-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_2-static-with-deps-cxx11-abi-build:
+  libtorch-rocm5_3-static-with-deps-cxx11-abi-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1233,20 +1233,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_2-static-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_3-static-with-deps-cxx11-abi
       build_environment: linux-binary-libtorch-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_2-static-with-deps-cxx11-abi-test:  # Testing
+  libtorch-rocm5_3-static-with-deps-cxx11-abi-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-static-with-deps-cxx11-abi-build
+    needs: libtorch-rocm5_3-static-with-deps-cxx11-abi-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1255,11 +1255,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
     steps:
@@ -1308,7 +1308,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_2-static-with-deps-cxx11-abi
+          name: libtorch-rocm5_3-static-with-deps-cxx11-abi
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1339,7 +1339,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm5.2
+          docker-image: pytorch/libtorch-cxx11-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1350,22 +1350,22 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  libtorch-rocm5_2-static-with-deps-cxx11-abi-upload:  # Uploading
+  libtorch-rocm5_3-static-with-deps-cxx11-abi-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-static-with-deps-cxx11-abi-test
+    needs: libtorch-rocm5_3-static-with-deps-cxx11-abi-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-rocm5_2-static-with-deps-cxx11-abi
+      build_name: libtorch-rocm5_3-static-with-deps-cxx11-abi
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
index 55e4a19b8e8ab..eaa928f3e09a9 100644
--- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
@@ -780,7 +780,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-build:
+  libtorch-rocm5_2-shared-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -789,20 +789,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_2-shared-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-test:  # Testing
+  libtorch-rocm5_2-shared-with-deps-pre-cxx11-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-build
+    needs: libtorch-rocm5_2-shared-with-deps-pre-cxx11-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -811,11 +811,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
     steps:
@@ -864,7 +864,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11
+          name: libtorch-rocm5_2-shared-with-deps-pre-cxx11
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -895,7 +895,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.1.1
+          docker-image: pytorch/manylinux-builder:rocm5.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -906,29 +906,29 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-upload:  # Uploading
+  libtorch-rocm5_2-shared-with-deps-pre-cxx11-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11-test
+    needs: libtorch-rocm5_2-shared-with-deps-pre-cxx11-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_1_1-shared-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_2-shared-with-deps-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_1_1-static-with-deps-pre-cxx11-build:
+  libtorch-rocm5_2-static-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -937,20 +937,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_1_1-static-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_2-static-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_1_1-static-with-deps-pre-cxx11-test:  # Testing
+  libtorch-rocm5_2-static-with-deps-pre-cxx11-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_1_1-static-with-deps-pre-cxx11-build
+    needs: libtorch-rocm5_2-static-with-deps-pre-cxx11-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -959,11 +959,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
     steps:
@@ -1012,7 +1012,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_1_1-static-with-deps-pre-cxx11
+          name: libtorch-rocm5_2-static-with-deps-pre-cxx11
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1043,7 +1043,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.1.1
+          docker-image: pytorch/manylinux-builder:rocm5.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1054,29 +1054,29 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  libtorch-rocm5_1_1-static-with-deps-pre-cxx11-upload:  # Uploading
+  libtorch-rocm5_2-static-with-deps-pre-cxx11-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_1_1-static-with-deps-pre-cxx11-test
+    needs: libtorch-rocm5_2-static-with-deps-pre-cxx11-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_1_1-static-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_2-static-with-deps-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_2-shared-with-deps-pre-cxx11-build:
+  libtorch-rocm5_3-shared-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1085,20 +1085,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_2-shared-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_3-shared-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_2-shared-with-deps-pre-cxx11-test:  # Testing
+  libtorch-rocm5_3-shared-with-deps-pre-cxx11-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-shared-with-deps-pre-cxx11-build
+    needs: libtorch-rocm5_3-shared-with-deps-pre-cxx11-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1107,11 +1107,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
     steps:
@@ -1160,7 +1160,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_2-shared-with-deps-pre-cxx11
+          name: libtorch-rocm5_3-shared-with-deps-pre-cxx11
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1191,7 +1191,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1202,29 +1202,29 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  libtorch-rocm5_2-shared-with-deps-pre-cxx11-upload:  # Uploading
+  libtorch-rocm5_3-shared-with-deps-pre-cxx11-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-shared-with-deps-pre-cxx11-test
+    needs: libtorch-rocm5_3-shared-with-deps-pre-cxx11-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: shared-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_2-shared-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_3-shared-with-deps-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm5_2-static-with-deps-pre-cxx11-build:
+  libtorch-rocm5_3-static-with-deps-pre-cxx11-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1233,20 +1233,20 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_2-static-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_3-static-with-deps-pre-cxx11
       build_environment: linux-binary-libtorch-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  libtorch-rocm5_2-static-with-deps-pre-cxx11-test:  # Testing
+  libtorch-rocm5_3-static-with-deps-pre-cxx11-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-static-with-deps-pre-cxx11-build
+    needs: libtorch-rocm5_3-static-with-deps-pre-cxx11-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1255,11 +1255,11 @@ jobs:
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
     steps:
@@ -1308,7 +1308,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: libtorch-rocm5_2-static-with-deps-pre-cxx11
+          name: libtorch-rocm5_3-static-with-deps-pre-cxx11
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1339,7 +1339,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1350,22 +1350,22 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  libtorch-rocm5_2-static-with-deps-pre-cxx11-upload:  # Uploading
+  libtorch-rocm5_3-static-with-deps-pre-cxx11-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: libtorch-rocm5_2-static-with-deps-pre-cxx11-test
+    needs: libtorch-rocm5_3-static-with-deps-pre-cxx11-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: libtorch
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       LIBTORCH_VARIANT: static-with-deps
       DESIRED_DEVTOOLSET: pre-cxx11
-      build_name: libtorch-rocm5_2-static-with-deps-pre-cxx11
+      build_name: libtorch-rocm5_3-static-with-deps-pre-cxx11
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
index ba9401d717a6d..b93f797d7e01c 100644
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@@ -274,7 +274,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_7-rocm5_1_1-build:
+  manywheel-py3_7-rocm5_2-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -283,19 +283,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-rocm5_1_1
+      build_name: manywheel-py3_7-rocm5_2
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_7-rocm5_1_1-test:  # Testing
+  manywheel-py3_7-rocm5_2-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-rocm5_1_1-build
+    needs: manywheel-py3_7-rocm5_2-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -304,11 +304,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.7"
     steps:
       - name: Clean workspace
@@ -356,7 +356,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_7-rocm5_1_1
+          name: manywheel-py3_7-rocm5_2
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -387,7 +387,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.1.1
+          docker-image: pytorch/manylinux-builder:rocm5.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -398,28 +398,28 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  manywheel-py3_7-rocm5_1_1-upload:  # Uploading
+  manywheel-py3_7-rocm5_2-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-rocm5_1_1-test
+    needs: manywheel-py3_7-rocm5_2-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-rocm5_1_1
+      build_name: manywheel-py3_7-rocm5_2
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_7-rocm5_2-build:
+  manywheel-py3_7-rocm5_3-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -428,19 +428,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-rocm5_2
+      build_name: manywheel-py3_7-rocm5_3
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_7-rocm5_2-test:  # Testing
+  manywheel-py3_7-rocm5_3-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-rocm5_2-build
+    needs: manywheel-py3_7-rocm5_3-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -449,11 +449,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.7"
     steps:
       - name: Clean workspace
@@ -501,7 +501,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_7-rocm5_2
+          name: manywheel-py3_7-rocm5_3
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -532,7 +532,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -543,21 +543,21 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  manywheel-py3_7-rocm5_2-upload:  # Uploading
+  manywheel-py3_7-rocm5_3-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_7-rocm5_2-test
+    needs: manywheel-py3_7-rocm5_3-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.7"
-      build_name: manywheel-py3_7-rocm5_2
+      build_name: manywheel-py3_7-rocm5_3
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -802,7 +802,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_8-rocm5_1_1-build:
+  manywheel-py3_8-rocm5_2-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -811,19 +811,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-rocm5_1_1
+      build_name: manywheel-py3_8-rocm5_2
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_8-rocm5_1_1-test:  # Testing
+  manywheel-py3_8-rocm5_2-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-rocm5_1_1-build
+    needs: manywheel-py3_8-rocm5_2-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -832,11 +832,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.8"
     steps:
       - name: Clean workspace
@@ -884,7 +884,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_8-rocm5_1_1
+          name: manywheel-py3_8-rocm5_2
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -915,7 +915,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.1.1
+          docker-image: pytorch/manylinux-builder:rocm5.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -926,28 +926,28 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  manywheel-py3_8-rocm5_1_1-upload:  # Uploading
+  manywheel-py3_8-rocm5_2-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-rocm5_1_1-test
+    needs: manywheel-py3_8-rocm5_2-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-rocm5_1_1
+      build_name: manywheel-py3_8-rocm5_2
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_8-rocm5_2-build:
+  manywheel-py3_8-rocm5_3-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -956,19 +956,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-rocm5_2
+      build_name: manywheel-py3_8-rocm5_3
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_8-rocm5_2-test:  # Testing
+  manywheel-py3_8-rocm5_3-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-rocm5_2-build
+    needs: manywheel-py3_8-rocm5_3-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -977,11 +977,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.8"
     steps:
       - name: Clean workspace
@@ -1029,7 +1029,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_8-rocm5_2
+          name: manywheel-py3_8-rocm5_3
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1060,7 +1060,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1071,21 +1071,21 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  manywheel-py3_8-rocm5_2-upload:  # Uploading
+  manywheel-py3_8-rocm5_3-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_8-rocm5_2-test
+    needs: manywheel-py3_8-rocm5_3-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.8"
-      build_name: manywheel-py3_8-rocm5_2
+      build_name: manywheel-py3_8-rocm5_3
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -1330,7 +1330,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_9-rocm5_1_1-build:
+  manywheel-py3_9-rocm5_2-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1339,19 +1339,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-rocm5_1_1
+      build_name: manywheel-py3_9-rocm5_2
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_9-rocm5_1_1-test:  # Testing
+  manywheel-py3_9-rocm5_2-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-rocm5_1_1-build
+    needs: manywheel-py3_9-rocm5_2-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1360,11 +1360,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.9"
     steps:
       - name: Clean workspace
@@ -1412,7 +1412,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_9-rocm5_1_1
+          name: manywheel-py3_9-rocm5_2
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1443,7 +1443,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.1.1
+          docker-image: pytorch/manylinux-builder:rocm5.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1454,28 +1454,28 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  manywheel-py3_9-rocm5_1_1-upload:  # Uploading
+  manywheel-py3_9-rocm5_2-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-rocm5_1_1-test
+    needs: manywheel-py3_9-rocm5_2-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-rocm5_1_1
+      build_name: manywheel-py3_9-rocm5_2
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_9-rocm5_2-build:
+  manywheel-py3_9-rocm5_3-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1484,19 +1484,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-rocm5_2
+      build_name: manywheel-py3_9-rocm5_3
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_9-rocm5_2-test:  # Testing
+  manywheel-py3_9-rocm5_3-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-rocm5_2-build
+    needs: manywheel-py3_9-rocm5_3-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1505,11 +1505,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.9"
     steps:
       - name: Clean workspace
@@ -1557,7 +1557,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_9-rocm5_2
+          name: manywheel-py3_9-rocm5_3
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1588,7 +1588,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1599,21 +1599,21 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  manywheel-py3_9-rocm5_2-upload:  # Uploading
+  manywheel-py3_9-rocm5_3-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_9-rocm5_2-test
+    needs: manywheel-py3_9-rocm5_3-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-rocm5_2
+      build_name: manywheel-py3_9-rocm5_3
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
@@ -1858,7 +1858,7 @@ jobs:
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_10-rocm5_1_1-build:
+  manywheel-py3_10-rocm5_2-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -1867,19 +1867,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-rocm5_1_1
+      build_name: manywheel-py3_10-rocm5_2
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_10-rocm5_1_1-test:  # Testing
+  manywheel-py3_10-rocm5_2-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-rocm5_1_1-build
+    needs: manywheel-py3_10-rocm5_2-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -1888,11 +1888,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.10"
     steps:
       - name: Clean workspace
@@ -1940,7 +1940,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_10-rocm5_1_1
+          name: manywheel-py3_10-rocm5_2
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -1971,7 +1971,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.1.1
+          docker-image: pytorch/manylinux-builder:rocm5.2
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -1982,28 +1982,28 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  manywheel-py3_10-rocm5_1_1-upload:  # Uploading
+  manywheel-py3_10-rocm5_2-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-rocm5_1_1-test
+    needs: manywheel-py3_10-rocm5_2-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.1.1
-      GPU_ARCH_VERSION: 5.1.1
+      DESIRED_CUDA: rocm5.2
+      GPU_ARCH_VERSION: 5.2
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.1.1
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-rocm5_1_1
+      build_name: manywheel-py3_10-rocm5_2
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
       aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
       conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  manywheel-py3_10-rocm5_2-build:
+  manywheel-py3_10-rocm5_3-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
     with:
@@ -2012,19 +2012,19 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-rocm5_2
+      build_name: manywheel-py3_10-rocm5_3
       build_environment: linux-binary-manywheel
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
 
-  manywheel-py3_10-rocm5_2-test:  # Testing
+  manywheel-py3_10-rocm5_3-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-rocm5_2-build
+    needs: manywheel-py3_10-rocm5_3-build
     runs-on: linux.rocm.gpu
     timeout-minutes: 240
     env:
@@ -2033,11 +2033,11 @@ jobs:
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
       SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.10"
     steps:
       - name: Clean workspace
@@ -2085,7 +2085,7 @@ jobs:
       - uses: actions/download-artifact@v3
         name: Download Build Artifacts
         with:
-          name: manywheel-py3_10-rocm5_2
+          name: manywheel-py3_10-rocm5_3
           path: "${{ runner.temp }}/artifacts/"
       - name: Checkout PyTorch
         uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
@@ -2116,7 +2116,7 @@ jobs:
       - name: Pull Docker image
         uses: pytorch/test-infra/.github/actions/pull-docker-image@main
         with:
-          docker-image: pytorch/manylinux-builder:rocm5.2
+          docker-image: pytorch/manylinux-builder:rocm5.3
       - name: Test Pytorch binary
         uses: ./pytorch/.github/actions/test-pytorch-binary
       - name: Kill containers, clean up images
@@ -2127,21 +2127,21 @@ jobs:
           docker stop $(docker ps -q) || true
           # Prune all of the docker images
           docker system prune -af
-  manywheel-py3_10-rocm5_2-upload:  # Uploading
+  manywheel-py3_10-rocm5_3-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_10-rocm5_2-test
+    needs: manywheel-py3_10-rocm5_3-test
     with:
       PYTORCH_ROOT: /pytorch
       BUILDER_ROOT: /builder
       PACKAGE_TYPE: manywheel
       # TODO: This is a legacy variable that we eventually want to get rid of in
       #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm5.2
-      GPU_ARCH_VERSION: 5.2
+      DESIRED_CUDA: rocm5.3
+      GPU_ARCH_VERSION: 5.3
       GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.2
+      DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.3
       DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-rocm5_2
+      build_name: manywheel-py3_10-rocm5_3
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
       aws-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}

From e66245b2b2f590e9428296ef2d0a24893749133e Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Thu, 24 Nov 2022 10:57:01 +0000
Subject: [PATCH 1253/1922] [LTC] Refine MetricsArena::Reset (#89608)

Summary:
After counters are reset, getters' behaviors are inconsistent. To improve that, here I 1) move the validation of CounterData into CounterData::IsValid such that it's better encapsulated, 2) divide getters into two groups: a) MetricsArena::GetCounter() and b) MetricsArena::ForEachCounter(), and route MetricsArena::GetCounterNames() and CreateMetricReport() to use b.

This is paired with pytorch/xla#4217.

Test Plan:
PJRT_DEVICE=CPU python xla/test/test_metrics.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89608
Approved by: https://github.com/JackCaoG
---
 .github/ci_commit_pins/xla.txt   |  2 +-
 torch/csrc/lazy/core/metrics.cpp | 33 ++++++++++++++++++--------------
 torch/csrc/lazy/core/metrics.h   |  4 ++++
 3 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 3905f11038841..5650a48e646bb 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-d687b0a84269f476866638afd37db893a146387c
+216d221f4d75ddfe9d0bd3ff2e8b92b39c67d381
diff --git a/torch/csrc/lazy/core/metrics.cpp b/torch/csrc/lazy/core/metrics.cpp
index 86758edc4dfce..78aa7f15a5260 100644
--- a/torch/csrc/lazy/core/metrics.cpp
+++ b/torch/csrc/lazy/core/metrics.cpp
@@ -149,6 +149,8 @@ void MetricsArena::ForEachCounter(
     const std::function<void(const std::string&, CounterData*)>& counter_func) {
   std::lock_guard<std::mutex> lock(lock_);
   for (auto& name_data : counters_) {
+    if (!name_data.second->IsValid())
+      continue;
     counter_func(name_data.first, name_data.second.get());
   }
 }
@@ -170,19 +172,19 @@ MetricData* MetricsArena::GetMetric(const std::string& name) {
 
 std::vector<std::string> MetricsArena::GetCounterNames() {
   std::vector<std::string> names;
-  std::lock_guard<std::mutex> lock(lock_);
-  for (auto& name_data : counters_) {
-    if (name_data.second->Value() > 0) {
-      names.push_back(name_data.first);
-    }
-  }
+  ForEachCounter([&names](const std::string& name, CounterData* data) {
+    names.push_back(name);
+  });
   return names;
 }
 
 CounterData* MetricsArena::GetCounter(const std::string& name) {
   std::lock_guard<std::mutex> lock(lock_);
   auto it = counters_.find(name);
-  return it != counters_.end() ? it->second.get() : nullptr;
+  if (it == counters_.end()) {
+    return nullptr;
+  }
+  return it->second->IsValid() ? it->second.get() : nullptr;
 }
 
 MetricData::MetricData(MetricReprFn repr_fn, size_t max_samples)
@@ -366,15 +368,18 @@ std::string CreateMetricReport(
       EmitMetricInfo(metric_name, data, &ss);
     }
   }
-  for (const std::string& counter_name : counter_names) {
-    CounterData* data = arena->GetCounter(counter_name);
-    if (data && data->Value() > 0) {
-      EmitCounterInfo(counter_name, data, &ss);
-    }
-  }
+  std::set<std::string> counter_name_set(
+      counter_names.begin(), counter_names.end());
+  arena->ForEachCounter(
+      [&ss, &counter_name_set](const std::string& name, CounterData* data) {
+        if (counter_name_set.find(name) != counter_name_set.end()) {
+          EmitCounterInfo(name, data, &ss);
+        }
+      });
+
   static std::string fall_back_counter_prefix = "aten::";
   arena->ForEachCounter([&ss](const std::string& name, CounterData* data) {
-    if (name.rfind(fall_back_counter_prefix, 0) == 0 && data->Value() > 0) {
+    if (name.rfind(fall_back_counter_prefix, 0) == 0) {
       // it might emit duplicated counter if user also specified exact aten
       // counter in the `counter_names` but it should be very rare.
       EmitCounterInfo(name, data, &ss);
diff --git a/torch/csrc/lazy/core/metrics.h b/torch/csrc/lazy/core/metrics.h
index 1d629c4973db8..2e263cc7f00b3 100644
--- a/torch/csrc/lazy/core/metrics.h
+++ b/torch/csrc/lazy/core/metrics.h
@@ -81,6 +81,10 @@ class TORCH_API CounterData {
     value_ = 0;
   }
 
+  bool IsValid() const {
+    return value_ > 0;
+  }
+
  private:
   std::atomic<int64_t> value_;
 };

From 307ccb16e870303740005921703fd008b9636f9b Mon Sep 17 00:00:00 2001
From: mfkasim1 <firman.kasim@gmail.com>
Date: Thu, 24 Nov 2022 11:11:51 +0000
Subject: [PATCH 1254/1922] Added log1p for complex in c10 (#89214)

One PR towards #89205.
The content is mostly from PR #38465, but slightly changed the expression to make it faster.

Here are some benchmarking code:
```c++
#include <complex>
#include <iostream>
#include <chrono>

// main.cc

template<typename T> inline std::complex<T> log1p_v0(const std::complex<T> &z) {
    // this PR
    T x = z.real();
    T y = z.imag();
    T theta = std::atan2(y, x + T(1));
    T r = x * (x + T(2)) + y * y;
    return {T(0.5) * std::log1p(r), theta};
}

template<typename T> inline std::complex<T> log1p_v1(const std::complex<T> &z) {
    // PR #38465
    T x = z.real();
    T y = z.imag();
    std::complex<T> p1 = z + T(1);
    T r = std::abs(p1);
    T a = std::arg(p1);
    T rm1 = (x * x + y * y + x * T(2)) / (r + 1);
    return {std::log1p(rm1), a};
}

template<typename T>
inline std::complex<T> log1p_v2(const std::complex<T> &z) {
    // naive, but numerically inaccurate
    return std::log(T(1) + z);
}

int main() {
    int n = 1000000;
    std::complex<float> res(0.0, 0.0);
    std::complex<float> input(0.5, 2.0);
    auto start = std::chrono::system_clock::now();
    for (int i = 0; i < n; i++) {
        res += log1p_v0(input);
    }
    auto end = std::chrono::system_clock::now();
    auto elapsed = end - start;
    std::cout << "time for v0: " << elapsed.count() << '\n';

    start = std::chrono::system_clock::now();
    for (int i = 0; i < n; i++) {
        res += log1p_v1(input);
    }
    end = std::chrono::system_clock::now();
    elapsed = end - start;
    std::cout << "time for v1: " << elapsed.count() << '\n';

    start = std::chrono::system_clock::now();
    for (int i = 0; i < n; i++) {
        res += log1p_v2(input);
    }
    end = std::chrono::system_clock::now();
    elapsed = end - start;
    std::cout << "time for v2: " << elapsed.count() << '\n';
    std::cout << res << '\n';
}
```

Compiling the script with command `g++ main.cc` produces the following results:
```
time for v0: 237812271
time for v1: 414524941
time for v2: 360585994
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89214
Approved by: https://github.com/lezcano
---
 c10/test/util/complex_math_test_common.h | 128 +++++++++++++++++++++++
 c10/util/complex_math.h                  |  31 ++++++
 2 files changed, 159 insertions(+)

diff --git a/c10/test/util/complex_math_test_common.h b/c10/test/util/complex_math_test_common.h
index 15addf687856f..ce1be7b38d84d 100644
--- a/c10/test/util/complex_math_test_common.h
+++ b/c10/test/util/complex_math_test_common.h
@@ -166,6 +166,134 @@ C10_DEFINE_TEST(TestLog2, Rev) {
   }
 }
 
+C10_DEFINE_TEST(TestLog1p, Normal) {
+  // log1p(x) = log(1 + x)
+  {
+    c10::complex<float> x(0.1, 1.2);
+    c10::complex<float> l1 = std::log1p(x);
+    c10::complex<float> l2 = std::log(1.0f + x);
+    C10_ASSERT_NEAR(l1.real(), l2.real(), tol);
+    C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol);
+  }
+  {
+    c10::complex<double> x(0.1, 1.2);
+    c10::complex<double> l1 = std::log1p(x);
+    c10::complex<double> l2 = std::log(1.0 + x);
+    C10_ASSERT_NEAR(l1.real(), l2.real(), tol);
+    C10_ASSERT_NEAR(l1.imag(), l2.imag(), tol);
+  }
+}
+
+C10_DEFINE_TEST(TestLog1p, Small) {
+  // log(1 + x) ~ x for |x| << 1
+  {
+    c10::complex<float> x(1e-9, 2e-9);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real() / x.real(), 1, tol);
+    C10_ASSERT_NEAR(l.imag() / x.imag(), 1, tol);
+  }
+  {
+    c10::complex<double> x(1e-100, 2e-100);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real() / x.real(), 1, tol);
+    C10_ASSERT_NEAR(l.imag() / x.imag(), 1, tol);
+  }
+}
+
+C10_DEFINE_TEST(TestLog1p, Extreme) {
+  // log(1 + x) ~ x for |x| << 1 and in the brink of overflow / underflow
+  {
+    c10::complex<float> x(-1, 1e-30);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), -69.07755278982137, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol);
+  }
+  {
+    c10::complex<float> x(-1, 1e30);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 69.07755278982137, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol);
+  }
+  {
+    c10::complex<float> x(1e30, 1);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 69.07755278982137, tol);
+    C10_ASSERT_NEAR(l.imag(), 1e-30, tol);
+  }
+  {
+    c10::complex<float> x(1e-30, 1);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 0.34657359027997264, tol);
+    C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol);
+  }
+  {
+    c10::complex<float> x(1e30, 1e30);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 69.42412638010134, tol);
+    C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol);
+  }
+  {
+    c10::complex<float> x(1e-38, 1e-38);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 1e-38, tol);
+    C10_ASSERT_NEAR(l.imag(), 1e-38, tol);
+  }
+  {
+    c10::complex<float> x(1e-38, 2e-30);
+    c10::complex<float> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 1e-30, tol);
+    C10_ASSERT_NEAR(l.imag(), 2e-30, tol);
+  }
+  {
+    c10::complex<double> x(-1, 1e-250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), -575.6462732485114, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol);
+  }
+  {
+    c10::complex<double> x(-1, 1e250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 575.6462732485114, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5707963267948966, tol);
+  }
+  {
+    c10::complex<double> x(1e250, 1);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 575.6462732485114, tol);
+    C10_ASSERT_NEAR(l.imag(), 1e-250, tol);
+  }
+  {
+    c10::complex<double> x(1e-250, 1);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 0.34657359027997264, tol);
+    C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol);
+  }
+  {
+    c10::complex<double> x(1e250, 1e250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 575.9928468387914, tol);
+    C10_ASSERT_NEAR(l.imag(), 0.7853981633974483, tol);
+  }
+  {
+    c10::complex<double> x(1e-250, 1e-250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 1e-250, tol);
+    C10_ASSERT_NEAR(l.imag(), 1e-250, tol);
+  }
+  {
+    c10::complex<double> x(1e-250, 2e-250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 1e-250, tol);
+    C10_ASSERT_NEAR(l.imag(), 2e-250, tol);
+  }
+  {
+    c10::complex<double> x(2e-308, 1.5e-250);
+    c10::complex<double> l = std::log1p(x);
+    C10_ASSERT_NEAR(l.real(), 2e-308, tol);
+    C10_ASSERT_NEAR(l.imag(), 1.5e-308, tol);
+  }
+}
+
 // Power functions
 
 C10_DEFINE_TEST(TestPowSqrt, Equal) {
diff --git a/c10/util/complex_math.h b/c10/util/complex_math.h
index ecfd0442b751b..8709fe4a0eb55 100644
--- a/c10/util/complex_math.h
+++ b/c10/util/complex_math.h
@@ -291,6 +291,35 @@ C10_HOST_DEVICE inline c10::complex<T> atanh(const c10::complex<T>& x) {
 #endif
 }
 
+template <typename T>
+C10_HOST_DEVICE inline c10::complex<T> log1p(const c10::complex<T>& z) {
+  // log1p(z) = log(1 + z)
+  // Let's define 1 + z = r * e ^ (i * a), then we have
+  // log(r * e ^ (i * a)) = log(r) + i * a
+  // With z = x + iy, the term r can be written as
+  // r = ((1 + x) ^ 2 + y ^ 2) ^ 0.5
+  //   = (1 + x ^ 2 + 2 * x + y ^ 2) ^ 0.5
+  // So, log(r) is
+  // log(r) = 0.5 * log(1 + x ^ 2 + 2 * x + y ^ 2)
+  //        = 0.5 * log1p(x * (x + 2) + y ^ 2)
+  // we need to use the expression only on certain condition to avoid overflow
+  // and underflow from `(x * (x + 2) + y ^ 2)`
+  T x = z.real();
+  T y = z.imag();
+  T zabs = std::abs(z);
+  T theta = std::atan2(y, x + T(1));
+  if (zabs < 0.5) {
+    T r = x * (T(2) + x) + y * y;
+    if (r == 0) { // handle underflow
+      return {x, theta};
+    }
+    return {T(0.5) * std::log1p(r), theta};
+  } else {
+    T z0 = std::hypot(x + 1, y);
+    return {std::log(z0), theta};
+  }
+}
+
 } // namespace c10_complex_math
 
 using c10_complex_math::acos;
@@ -304,6 +333,7 @@ using c10_complex_math::cosh;
 using c10_complex_math::exp;
 using c10_complex_math::log;
 using c10_complex_math::log10;
+using c10_complex_math::log1p;
 using c10_complex_math::log2;
 using c10_complex_math::pow;
 using c10_complex_math::sin;
@@ -325,6 +355,7 @@ using c10_complex_math::cosh;
 using c10_complex_math::exp;
 using c10_complex_math::log;
 using c10_complex_math::log10;
+using c10_complex_math::log1p;
 using c10_complex_math::log2;
 using c10_complex_math::pow;
 using c10_complex_math::sin;

From 3fea0d074c605fbe1afdb8c5d7c4fcaef233da90 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 23 Nov 2022 08:04:31 -0800
Subject: [PATCH 1255/1922] Suppress guards on as_strided call only. (#89569)

See comment in meta_utils.py for the whole story.

This doesn't have a substantive impact yet, but will in the next
PR on the stack.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89569
Approved by: https://github.com/albanD
---
 torch/_subclasses/fake_tensor.py |  6 +-----
 torch/_subclasses/meta_utils.py  | 34 +++++++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 758f4431f688e..9a0ac050e6b94 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -230,11 +230,7 @@ def mk_fake_tensor(make_meta_t):
                     constant=t if make_constant else None,
                 )
 
-        ctx = contextlib.nullcontext()
-        if shape_env is not None:
-            ctx = shape_env.suppress_guards()
-        with ctx:
-            out = self.meta_converter(t, shape_env=shape_env, callback=mk_fake_tensor)
+        out = self.meta_converter(t, shape_env=shape_env, callback=mk_fake_tensor)
         if out is NotImplemented:
             raise UnsupportedFakeTensorException("meta converter nyi")
         if make_constant:
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 081f7aa632f91..6e5586bde0a79 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -196,6 +196,34 @@ def meta_tensor(self, t, shape_env=None, callback=lambda t: t()):
         arg_cnt = self.arg_cnt
         self.arg_cnt += 1
 
+        # When we make as_strided calls, we end up generating a guard
+        # that the new as_strided tensor is in bounds for the old storage
+        # for the base (since as_strided calls can "bust" out of their
+        # bounding box.)  This guard is unnecessary: if a user is able
+        # to provide us a tensor with the view base setup this way, we
+        # don't need to produce a guard, because the fact that they
+        # were able to produce the view base means its in bounds.
+        #
+        # Now, ordinarily, this guard would be harmless.  However, the
+        # generated guard refers to variables bound on the base variable.
+        # At the moment, Dynamo doesn't actually guard on x._base, because
+        # according to Voz this results in a lot of spurious invalidations,
+        # and also if the user doesn't directly make use of _base, its
+        # pointless anyway (because programs should be parametric over
+        # whether or not the input tensor is a view or not--unless you're
+        # mutating the input, but that's a whole 'nother ballgame).  So
+        # for expediency, we suppress these guards so we don't have to
+        # deal with this (yet, anyway.)
+        #
+        # NB: An old version of this code suppressed guards for ALL operations
+        # happening during meta conversion, not just as_strided calls.
+        # This is too aggressive: we do duck sizing and 0/1 simplification
+        # as we allocate variables, and we do need to register guards for
+        # these cases.
+        maybe_suppress = contextlib.nullcontext()
+        if shape_env is not None:
+            maybe_suppress = shape_env.suppress_guards()
+
         make_symbolic = shape_env is not None
 
         def sym(x):
@@ -308,7 +336,7 @@ def is_c_of_r(complex_dtype, real_dtype):
                         if safe_is_leaf(t):
                             # Leaf views that track view metadata are created by
                             # creating a view inside a no_grad block
-                            with torch.no_grad():
+                            with torch.no_grad(), maybe_suppress:
                                 r = base.as_strided(
                                     sizes, strides, sym(t.storage_offset())
                                 )
@@ -317,7 +345,7 @@ def is_c_of_r(complex_dtype, real_dtype):
                         else:
                             if t._base.requires_grad == t.requires_grad:
                                 # Easy case, just run the view op
-                                with torch.enable_grad():
+                                with torch.enable_grad(), maybe_suppress:
                                     r = base.as_strided(
                                         sizes, strides, sym(t.storage_offset())
                                     )
@@ -329,7 +357,7 @@ def is_c_of_r(complex_dtype, real_dtype):
                                 with torch.no_grad():
                                     mid = base.view(base.shape)
                                 mid.requires_grad = t.requires_grad
-                                with torch.enable_grad():
+                                with torch.enable_grad(), maybe_suppress:
                                     r = mid.as_strided(
                                         sizes, strides, sym(t.storage_offset())
                                     )

From 279b6566074e3bf2d7a2bf75aef800d5a345db53 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Thu, 24 Nov 2022 17:11:42 +0000
Subject: [PATCH 1256/1922] Mention discrepency between original impl and our
 impl of RAdam (#89575)

Fixes https://github.com/pytorch/pytorch/issues/88836

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89575
Approved by: https://github.com/mruberry
---
 torch/optim/radam.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/torch/optim/radam.py b/torch/optim/radam.py
index 3cea370ade83a..c389e48ccf3fd 100644
--- a/torch/optim/radam.py
+++ b/torch/optim/radam.py
@@ -44,6 +44,10 @@ class RAdam(Optimizer):
 
     For further details regarding the algorithm we refer to `On the variance of the adaptive learning rate and beyond`_.
 
+    This implementation uses the same weight_decay implementation as Adam (were the weight_decay is applied
+    to the gradient) and not the one from AdamW (were weight_decay is applied to the update). This
+    is different from the `author's implementation`_.
+
     Args:
         params (iterable): iterable of parameters to optimize or dicts defining
             parameter groups
@@ -58,6 +62,8 @@ class RAdam(Optimizer):
 
     .. _On the variance of the adaptive learning rate and beyond:
         https://arxiv.org/abs/1908.03265
+    .. _author's implementation:
+        https://github.com/LiyuanLucasLiu/RAdam
     """
 
     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,

From 21e235fcac62b96b83bc964f27fbea9a65b8b3f8 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 09:23:05 -0500
Subject: [PATCH 1257/1922] Make pytest work again on test/dynamo (#89631)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89631
Approved by: https://github.com/anjali411
---
 test/dynamo/test_repros.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index cbdf69a337aa2..b38f904feb31d 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -20,7 +20,11 @@
 import torch._dynamo.testing
 import torch._dynamo.utils
 
-from test_minifier import requires_cuda
+try:
+    from test_minifier import requires_cuda
+except ImportError:
+    from .test_minifier import requires_cuda
+
 from torch import nn
 from torch._dynamo.debug_utils import same_two_models
 from torch._dynamo.testing import rand_strided, requires_static_shapes, same

From f6e1f8082bfdd9c1670aaeb3c157bbaedd754b43 Mon Sep 17 00:00:00 2001
From: Emilio Castillo <ecastill@preferred.jp>
Date: Thu, 24 Nov 2022 18:25:26 +0000
Subject: [PATCH 1258/1922] Fix segfault when swapping custom allocator
 (#89613)

Just screwed it before merging ...

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89613
Approved by: https://github.com/albanD
---
 torch/csrc/cuda/CUDAPluggableAllocator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.cpp b/torch/csrc/cuda/CUDAPluggableAllocator.cpp
index a64290b8a16e4..56927c16a0de8 100644
--- a/torch/csrc/cuda/CUDAPluggableAllocator.cpp
+++ b/torch/csrc/cuda/CUDAPluggableAllocator.cpp
@@ -302,7 +302,7 @@ createCustomAllocator(
 void changeCurrentAllocator(
     std::shared_ptr<c10::cuda::CUDACachingAllocator::CUDAAllocator> allocator) {
   TORCH_CHECK(
-      !getCurrentAllocator()->initialized(),
+      !c10::cuda::CUDACachingAllocator::allocator.load()->initialized(),
       "Can't swap an already initialized allocator");
   c10::cuda::CUDACachingAllocator::allocator.store(allocator.get());
   current_custom_allocator = allocator;

From 473b9f11cb2568432abc373e3d08692f907633a2 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 08:11:46 -0800
Subject: [PATCH 1259/1922] Remove fake_tensors_available (#89637)

As we are one repo now, they are always available.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89637
Approved by: https://github.com/anjali411
---
 torch/_dynamo/optimizations/analysis.py |  10 +-
 torch/_dynamo/output_graph.py           |  11 +-
 torch/_dynamo/symbolic_convert.py       |  19 +---
 torch/_dynamo/utils.py                  | 137 ++++++++++++------------
 torch/_dynamo/variables/builder.py      |   5 +-
 torch/_dynamo/variables/tensor.py       |   5 +-
 6 files changed, 81 insertions(+), 106 deletions(-)

diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index c4ed04ca8c39d..6fb4ff82e5a37 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -4,18 +4,16 @@
 import operator
 
 import torch
+
+from torch._subclasses import FakeTensorMode  # noqa: F401
 from torch.fx.node import map_aggregate
 from torch.fx.passes.shape_prop import _extract_tensor_metadata, ShapeProp
 from torch.multiprocessing.reductions import StorageWeakRef
 from torch.utils._pytree import tree_map
 
 from .. import config
-from ..utils import clone_inputs, fake_tensors_available
-
-if fake_tensors_available:
-    from torch._subclasses import FakeTensorMode  # noqa: F401
 
-    from ..utils import deepcopy_to_fake_tensor
+from ..utils import clone_inputs, deepcopy_to_fake_tensor
 
 
 class ShapeAliasingAndMutationProp(ShapeProp):
@@ -121,7 +119,7 @@ def has_mutation(gm, example_inputs, inputs_only=False):
     true, we only check for mutation of inputs"""
     # TODO - moco gives bad accuracy with Aliasing. gm is getting mutated in a bad way.
 
-    if fake_tensors_available and config.fake_tensor_propagation:
+    if config.fake_tensor_propagation:
 
         def _wrap_to_fake_tensor(t, *, f_mode):
             if type(t) in (torch.Tensor, torch.nn.Parameter):
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 4578fb98dfcbc..0c1ddabdc1980 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -20,13 +20,7 @@
 from .mutation_guard import is_dynamic_nn_module
 from .side_effects import SideEffects
 from .source import ConstantSource, LocalSource, Source
-from .utils import (
-    CleanupHook,
-    count_calls,
-    counters,
-    fake_tensors_available,
-    format_graph_tabular,
-)
+from .utils import CleanupHook, count_calls, counters, format_graph_tabular
 from .variables.builder import VariableBuilder, wrap_fx_proxy
 from .variables.nn_module import NNModuleVariable
 from .variables.tensor import (
@@ -515,8 +509,7 @@ def cleanup(self):
         # some of the tensor objects to be held alive for longer than necessary.
 
         # Clear cache for conversion of real -> fake tensors
-        if fake_tensors_available:
-            self.root_tx.fake_mode.fake_tensor_converter = None
+        self.root_tx.fake_mode.fake_tensor_converter = None
         self.root_tx = None
 
         # Note: generated fx graph will hold a reference to the nn_module,
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 7a16b6b982a03..9b2a39ef3384e 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -48,13 +48,7 @@
     GlobalWeakRefSource,
     LocalSource,
 )
-from .utils import (
-    counters,
-    fake_tensors_available,
-    graph_break_dup_warning_checker,
-    istype,
-    proxy_args_kwargs,
-)
+from .utils import counters, graph_break_dup_warning_checker, istype, proxy_args_kwargs
 from .variables.base import MutableLocal, typestr, VariableTracker
 from .variables.builder import VariableBuilder, wrap_fx_proxy
 from .variables.builtin import BuiltinVariable
@@ -1513,13 +1507,10 @@ def __init__(
         # Flag to indicate whether tracing is used for export.
         self.export = export
 
-        if fake_tensors_available:
-            with torch._subclasses.FakeTensorMode(
-                throw_on_data_dependent_ops=True,
-                shape_env=output.shape_env,
-            ) as fake_mode:
-                pass
-            self._fake_mode = fake_mode
+        self._fake_mode = torch._subclasses.FakeTensorMode(
+            throw_on_data_dependent_ops=True,
+            shape_env=output.shape_env,
+        )
 
         self.checkpoint = None
         self.random_calls: List[tuple] = []
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 481794707efdc..ce020b74c41da 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -319,8 +319,7 @@ def istensor(obj):
         torch.nn.Parameter,
         *config.traceable_tensor_subclasses,
     )
-    if fake_tensors_available:
-        tensor_list = tensor_list + (torch._subclasses.FakeTensor,)
+    tensor_list = tensor_list + (torch._subclasses.FakeTensor,)
     return istype(obj, tensor_list)
 
 
@@ -690,88 +689,86 @@ def rename_implicit(v):
     return v
 
 
-# FakeTensors were introduced after pytorch 1.12, so gate their use
-# to allow pytorch 1.12 to work
-fake_tensors_available = True
-try:
-    from torch._subclasses import (  # noqa: F401
-        FakeTensorMode,
-        UnsupportedFakeTensorException,
-    )
+from torch._subclasses import (  # noqa: F401
+    FakeTensorMode,
+    UnsupportedFakeTensorException,
+)
 
-    def make_fake_tensor(e, fake_mode, static_shapes=False, tx=None):
-        fake_tensor = fake_mode.from_tensor(e, static_shapes=static_shapes)
-        if tx is not None:
-            from torch._dynamo.guards import TensorReference
 
-            def _record(tensor_ref):
-                if tensor_ref.ref_id not in tx.output.tensor_id_to_sym_shape_ref:
-                    tx.output.tensor_id_to_sym_shape_ref[tensor_ref.ref_id] = set()
-                tx.output.tensor_id_to_sym_shape_ref[tensor_ref.ref_id].add(tensor_ref)
+def make_fake_tensor(e, fake_mode, static_shapes=False, tx=None):
+    fake_tensor = fake_mode.from_tensor(e, static_shapes=static_shapes)
+    if tx is not None:
+        from torch._dynamo.guards import TensorReference
 
-            def _extract(symbol):
-                if isinstance(symbol, int):
-                    return None
-                sym_expr = symbol.get_pyobj().expr
-                if not isinstance(sym_expr, sympy.Symbol):
-                    return None
-                return sym_expr
+        def _record(tensor_ref):
+            if tensor_ref.ref_id not in tx.output.tensor_id_to_sym_shape_ref:
+                tx.output.tensor_id_to_sym_shape_ref[tensor_ref.ref_id] = set()
+            tx.output.tensor_id_to_sym_shape_ref[tensor_ref.ref_id].add(tensor_ref)
 
-            def _record_ref(e, index, symbol, kind):
-                sym_expr = _extract(symbol)
-                if sym_expr:
-                    tensor_ref = TensorReference(id(e), kind, index, sym_expr)
-                    _record(tensor_ref)
+        def _extract(symbol):
+            if isinstance(symbol, int):
+                return None
+            sym_expr = symbol.get_pyobj().expr
+            if not isinstance(sym_expr, sympy.Symbol):
+                return None
+            return sym_expr
 
-            for index, symbol in enumerate(fake_tensor.size()):
-                _record_ref(e, index, symbol, "size")
+        def _record_ref(e, index, symbol, kind):
+            sym_expr = _extract(symbol)
+            if sym_expr:
+                tensor_ref = TensorReference(id(e), kind, index, sym_expr)
+                _record(tensor_ref)
 
-            for index, symbol in enumerate(fake_tensor.stride()):
-                _record_ref(e, index, symbol, "stride")
+        for index, symbol in enumerate(fake_tensor.size()):
+            _record_ref(e, index, symbol, "size")
 
-            offset = fake_tensor.storage_offset()
-            _record_ref(e, None, offset, "storage_offset")
+        for index, symbol in enumerate(fake_tensor.stride()):
+            _record_ref(e, index, symbol, "stride")
 
-        return fake_tensor
+        offset = fake_tensor.storage_offset()
+        _record_ref(e, None, offset, "storage_offset")
 
-    def wrap_fake_exception(fn):
-        try:
-            return fn()
-        except UnsupportedFakeTensorException as e:
-            from .exc import unimplemented
+    return fake_tensor
 
-            msg = f"Unsupported: {e.reason} with fake tensor propagation. Run with config.fake_tensor_propagation=False"
-            log.warning(msg)
-            raise unimplemented(msg)
 
-    def wrap_to_fake_tensor(e, fake_mode):
-        if type(e) in (torch.Tensor, torch.nn.Parameter):
-            return wrap_fake_exception(
-                lambda: make_fake_tensor(
-                    e, fake_mode, static_shapes=config.dynamic_shapes is False
-                )
-            )
-        else:
-            return e
-
-    def wrap_to_fake_tensor_and_record(e, tx):
-        if type(e) in (torch.Tensor, torch.nn.Parameter):
-            static_shapes = config.dynamic_shapes is False
-            if type(e) is torch.nn.Parameter:
-                # Always static for params
-                static_shapes = True
-            return wrap_fake_exception(
-                lambda: make_fake_tensor(e, tx.fake_mode, static_shapes, tx)
+def wrap_fake_exception(fn):
+    try:
+        return fn()
+    except UnsupportedFakeTensorException as e:
+        from .exc import unimplemented
+
+        msg = f"Unsupported: {e.reason} with fake tensor propagation. Run with config.fake_tensor_propagation=False"
+        log.warning(msg)
+        raise unimplemented(msg)
+
+
+def wrap_to_fake_tensor(e, fake_mode):
+    if type(e) in (torch.Tensor, torch.nn.Parameter):
+        return wrap_fake_exception(
+            lambda: make_fake_tensor(
+                e, fake_mode, static_shapes=config.dynamic_shapes is False
             )
-        else:
-            return e
+        )
+    else:
+        return e
+
+
+def wrap_to_fake_tensor_and_record(e, tx):
+    if type(e) in (torch.Tensor, torch.nn.Parameter):
+        static_shapes = config.dynamic_shapes is False
+        if type(e) is torch.nn.Parameter:
+            # Always static for params
+            static_shapes = True
+        return wrap_fake_exception(
+            lambda: make_fake_tensor(e, tx.fake_mode, static_shapes, tx)
+        )
+    else:
+        return e
 
-    def deepcopy_to_fake_tensor(obj, fake_mode):
-        with torch._subclasses.fake_tensor.FakeCopyMode(fake_mode):
-            return wrap_fake_exception(lambda: copy.deepcopy(obj))
 
-except ImportError:
-    fake_tensors_available = False
+def deepcopy_to_fake_tensor(obj, fake_mode):
+    with torch._subclasses.fake_tensor.FakeCopyMode(fake_mode):
+        return wrap_fake_exception(lambda: copy.deepcopy(obj))
 
 
 def rmse(ref, res):
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 6db7cbf87820d..333be76598748 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -36,7 +36,6 @@
 )
 from ..utils import (
     clone_input,
-    fake_tensors_available,
     get_fake_value,
     get_real_value,
     getfile,
@@ -658,13 +657,13 @@ def wrap_fx_proxy_cls(target_cls, tx, proxy, example_value=None, **options):
             options.update(target_cls.specialize(example_value))
         return target_cls(proxy, **options)
 
-    use_fake_tensors = fake_tensors_available and config.fake_tensor_propagation
+    use_fake_tensors = config.fake_tensor_propagation
 
     initial_example_value = example_value
 
     def _clone_input(value):
         if isinstance(value, torch.Tensor):
-            use_fake_tensors = fake_tensors_available and config.fake_tensor_propagation
+            use_fake_tensors = config.fake_tensor_propagation
             # tensor subclasses will not be converted to FakeTensors and need to be cloned
             if not use_fake_tensors or not isinstance(
                 value, torch._subclasses.fake_tensor.FakeTensor
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 4ef9bf2fab11d..282f8cce0c368 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -11,7 +11,6 @@
 from ..source import AttrSource
 
 from ..utils import (
-    fake_tensors_available,
     get_fake_value,
     get_real_value,
     product,
@@ -261,9 +260,7 @@ def call_method(
             unimplemented(f"Tensor.{name}")
         elif name == "item":
             if config.capture_scalar_outputs:
-                use_fake_tensors = (
-                    fake_tensors_available and config.fake_tensor_propagation
-                )
+                use_fake_tensors = config.fake_tensor_propagation
                 if use_fake_tensors:
                     example_value = get_fake_value(self.proxy.node, tx)
                 else:

From 67497a8fad6f83dc7f5a8df3d3a5863218c7604c Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@meta.com>
Date: Thu, 24 Nov 2022 19:41:17 +0000
Subject: [PATCH 1260/1922] Update SyncBatchNorm _all_gather_base to
 all_gather_into_tensor (#89521)

Summary: Fixes https://github.com/pytorch/pytorch/issues/88568

`_all_gather_base` is deprecated. So replacing its usage with `all_gather_into_tensor`

Test Plan: CI

Differential Revision: D41479983

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89521
Approved by: https://github.com/wz337
---
 torch/nn/modules/_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/nn/modules/_functions.py b/torch/nn/modules/_functions.py
index 464c56a548a65..099c2b164daa6 100644
--- a/torch/nn/modules/_functions.py
+++ b/torch/nn/modules/_functions.py
@@ -56,7 +56,7 @@ def forward(self, input, weight, bias, running_mean, running_var, eps, momentum,
                                         combined_size * world_size,
                                         dtype=combined.dtype,
                                         device=combined.device)
-            dist._all_gather_base(combined_flat, combined, process_group, async_op=False)
+            dist.all_gather_into_tensor(combined_flat, combined, process_group, async_op=False)
             combined = torch.reshape(combined_flat, (world_size, combined_size))
             # world_size * (2C + 1) -> world_size * C, world_size * C, world_size * 1
             mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)

From 5bc2187a1b54a6c211fb5f567226f17479e644bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleksandar=20Samard=C5=BEi=C4=87?=
 <asamardzic@quansight.com>
Date: Thu, 24 Nov 2022 14:44:12 +0000
Subject: [PATCH 1261/1922] Added vectorized CPU code for uint8_t datatype.
 (#89284)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89284
Approved by: https://github.com/lezcano, https://github.com/peterbell10
---
 aten/src/ATen/cpu/vec/vec256/vec256_int.h    | 362 +++++++++++++++++-
 aten/src/ATen/cpu/vec/vec512/vec512_int.h    | 374 ++++++++++++++++++-
 aten/src/ATen/native/cpu/ReduceOpsKernel.cpp |  22 --
 3 files changed, 724 insertions(+), 34 deletions(-)

diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
index f17cdc5bc156a..391baeb8b6a32 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
@@ -745,6 +745,257 @@ class Vectorized<int8_t> : public Vectorizedi {
   Vectorized<int8_t> le(const Vectorized<int8_t>& other) const;
 };
 
+template <>
+class Vectorized<uint8_t> : public Vectorizedi {
+private:
+  static const Vectorized<uint8_t> ones;
+public:
+  using value_type = uint8_t;
+  static constexpr int size() {
+    return 32;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); }
+  Vectorized(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
+         uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
+         uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
+         uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16,
+         uint8_t val17, uint8_t val18, uint8_t val19, uint8_t val20,
+         uint8_t val21, uint8_t val22, uint8_t val23, uint8_t val24,
+         uint8_t val25, uint8_t val26, uint8_t val27, uint8_t val28,
+         uint8_t val29, uint8_t val30, uint8_t val31, uint8_t val32) {
+    values = _mm256_setr_epi8(val1, val2, val3, val4, val5, val6, val7, val8,
+                              val9, val10, val11, val12, val13, val14, val15, val16,
+                              val17, val18, val19, val20, val21, val22, val23, val24,
+                              val25, val26, val27, val28, val29, val30, val31, val32);
+  }
+  template <int64_t mask>
+  static Vectorized<uint8_t> blend(Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
+    __at_align__ uint8_t tmp_values[size()];
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = _mm256_extract_epi8(b.values, 0);
+    if (mask & 0x02)
+      tmp_values[1] = _mm256_extract_epi8(b.values, 1);
+    if (mask & 0x04)
+      tmp_values[2] = _mm256_extract_epi8(b.values, 2);
+    if (mask & 0x08)
+      tmp_values[3] = _mm256_extract_epi8(b.values, 3);
+    if (mask & 0x10)
+      tmp_values[4] = _mm256_extract_epi8(b.values, 4);
+    if (mask & 0x20)
+      tmp_values[5] = _mm256_extract_epi8(b.values, 5);
+    if (mask & 0x40)
+      tmp_values[6] = _mm256_extract_epi8(b.values, 6);
+    if (mask & 0x80)
+      tmp_values[7] = _mm256_extract_epi8(b.values, 7);
+    if (mask & 0x100)
+      tmp_values[8] = _mm256_extract_epi8(b.values, 8);
+    if (mask & 0x200)
+      tmp_values[9] = _mm256_extract_epi8(b.values, 9);
+    if (mask & 0x400)
+      tmp_values[10] = _mm256_extract_epi8(b.values, 10);
+    if (mask & 0x800)
+      tmp_values[11] = _mm256_extract_epi8(b.values, 11);
+    if (mask & 0x1000)
+      tmp_values[12] = _mm256_extract_epi8(b.values, 12);
+    if (mask & 0x2000)
+      tmp_values[13] = _mm256_extract_epi8(b.values, 13);
+    if (mask & 0x4000)
+      tmp_values[14] = _mm256_extract_epi8(b.values, 14);
+    if (mask & 0x8000)
+      tmp_values[15] = _mm256_extract_epi8(b.values, 15);
+    if (mask & 0x010000)
+      tmp_values[16] = _mm256_extract_epi8(b.values, 16);
+    if (mask & 0x020000)
+      tmp_values[17] = _mm256_extract_epi8(b.values, 17);
+    if (mask & 0x040000)
+      tmp_values[18] = _mm256_extract_epi8(b.values, 18);
+    if (mask & 0x080000)
+      tmp_values[19] = _mm256_extract_epi8(b.values, 19);
+    if (mask & 0x100000)
+      tmp_values[20] = _mm256_extract_epi8(b.values, 20);
+    if (mask & 0x200000)
+      tmp_values[21] = _mm256_extract_epi8(b.values, 21);
+    if (mask & 0x400000)
+      tmp_values[22] = _mm256_extract_epi8(b.values, 22);
+    if (mask & 0x800000)
+      tmp_values[23] = _mm256_extract_epi8(b.values, 23);
+    if (mask & 0x1000000)
+      tmp_values[24] = _mm256_extract_epi8(b.values, 24);
+    if (mask & 0x2000000)
+      tmp_values[25] = _mm256_extract_epi8(b.values, 25);
+    if (mask & 0x4000000)
+      tmp_values[26] = _mm256_extract_epi8(b.values, 26);
+    if (mask & 0x8000000)
+      tmp_values[27] = _mm256_extract_epi8(b.values, 27);
+    if (mask & 0x10000000)
+      tmp_values[28] = _mm256_extract_epi8(b.values, 28);
+    if (mask & 0x20000000)
+      tmp_values[29] = _mm256_extract_epi8(b.values, 29);
+    if (mask & 0x40000000)
+      tmp_values[30] = _mm256_extract_epi8(b.values, 30);
+    if (mask & 0x80000000)
+      tmp_values[31] = _mm256_extract_epi8(b.values, 31);
+    return loadu(tmp_values);
+  }
+  static Vectorized<uint8_t> blendv(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b,
+                               const Vectorized<uint8_t>& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  template <typename step_t>
+  static Vectorized<uint8_t> arange(uint8_t base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized<uint8_t>(
+      base,             base +      step, base +  2 * step, base +  3 * step,
+      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
+      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step,
+      base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step,
+      base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step,
+      base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
+      base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step);
+  }
+  static Vectorized<uint8_t>
+  set(Vectorized<uint8_t> a, Vectorized<uint8_t> b, uint8_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<0x1>(a, b);
+      case 2:
+        return blend<0x3>(a, b);
+      case 3:
+        return blend<0x7>(a, b);
+      case 4:
+        return blend<0xF>(a, b);
+      case 5:
+        return blend<0x1F>(a, b);
+      case 6:
+        return blend<0x3F>(a, b);
+      case 7:
+        return blend<0x7F>(a, b);
+      case 8:
+        return blend<0xFF>(a, b);
+      case 9:
+        return blend<0x1FF>(a, b);
+      case 10:
+        return blend<0x3FF>(a, b);
+      case 11:
+        return blend<0x7FF>(a, b);
+      case 12:
+        return blend<0xFFF>(a, b);
+      case 13:
+        return blend<0x1FFF>(a, b);
+      case 14:
+        return blend<0x3FFF>(a, b);
+      case 15:
+        return blend<0x7FFF>(a, b);
+      case 16:
+        return blend<0xFFFF>(a, b);
+      case 17:
+        return blend<0x1FFFF>(a, b);
+      case 18:
+        return blend<0x3FFFF>(a, b);
+      case 19:
+        return blend<0x7FFFF>(a, b);
+      case 20:
+        return blend<0xFFFFF>(a, b);
+      case 21:
+        return blend<0x1FFFFF>(a, b);
+      case 22:
+        return blend<0x3FFFFF>(a, b);
+      case 23:
+        return blend<0x7FFFFF>(a, b);
+      case 24:
+        return blend<0xFFFFFF>(a, b);
+      case 25:
+        return blend<0x1FFFFFF>(a, b);
+      case 26:
+        return blend<0x3FFFFFF>(a, b);
+      case 27:
+        return blend<0x7FFFFFF>(a, b);
+      case 28:
+        return blend<0xFFFFFFF>(a, b);
+      case 29:
+        return blend<0x1FFFFFFF>(a, b);
+      case 30:
+        return blend<0x3FFFFFFF>(a, b);
+      case 31:
+        return blend<0x7FFFFFFF>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<uint8_t> loadu(const void* ptr) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  static Vectorized<uint8_t> loadu(const void* ptr, uint8_t count) {
+    __at_align__ uint8_t tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(uint8_t));
+    return loadu(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+    } else if (count > 0) {
+      __at_align__ uint8_t tmp_values[size()];
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(uint8_t));
+    }
+  }
+  const uint8_t& operator[](int idx) const  = delete;
+  uint8_t& operator[](int idx)  = delete;
+  Vectorized<uint8_t> abs() const {
+    return values;
+  }
+  Vectorized<uint8_t> real() const {
+    return *this;
+  }
+  Vectorized<uint8_t> imag() const {
+    return _mm256_set1_epi8(0);
+  }
+  Vectorized<uint8_t> conj() const {
+    return *this;
+  }
+  Vectorized<uint8_t> frac() const;
+  Vectorized<uint8_t> neg() const;
+  Vectorized<uint8_t> operator==(const Vectorized<uint8_t>& other) const {
+    return _mm256_cmpeq_epi8(values, other.values);
+  }
+  Vectorized<uint8_t> operator!=(const Vectorized<uint8_t>& other) const {
+    return invert(_mm256_cmpeq_epi8(values, other.values));
+  }
+  Vectorized<uint8_t> operator<(const Vectorized<uint8_t>& other) const {
+    __m256i max = _mm256_max_epu8(values, other.values);
+    return invert(_mm256_cmpeq_epi8(max, values));
+  }
+  Vectorized<uint8_t> operator<=(const Vectorized<uint8_t>& other) const {
+    __m256i max = _mm256_max_epu8(values, other.values);
+    return _mm256_cmpeq_epi8(max, other.values);
+  }
+  Vectorized<uint8_t> operator>(const Vectorized<uint8_t>& other) const {
+    return other < *this;
+  }
+  Vectorized<uint8_t> operator>=(const Vectorized<uint8_t>& other) const {
+    return other <= *this;
+  }
+
+  Vectorized<uint8_t> eq(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> ne(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> gt(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> ge(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> lt(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> le(const Vectorized<uint8_t>& other) const;
+};
+
 template <>
 Vectorized<int64_t> inline operator+(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return _mm256_add_epi64(a, b);
@@ -765,6 +1016,12 @@ Vectorized<int8_t> inline operator+(const Vectorized<int8_t>& a, const Vectorize
   return _mm256_add_epi8(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline operator+(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return _mm256_add_epi8(a, b);
+}
+
+
 template <>
 Vectorized<int64_t> inline operator-(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return _mm256_sub_epi64(a, b);
@@ -785,6 +1042,11 @@ Vectorized<int8_t> inline operator-(const Vectorized<int8_t>& a, const Vectorize
   return _mm256_sub_epi8(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline operator-(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return _mm256_sub_epi8(a, b);
+}
+
 // Negation. Defined here so we can utilize operator-
 inline Vectorized<int64_t> Vectorized<int64_t>::neg() const {
   return Vectorized<int64_t>(0) - *this;
@@ -802,6 +1064,10 @@ inline Vectorized<int8_t> Vectorized<int8_t>::neg() const {
   return Vectorized<int8_t>(0) - *this;
 }
 
+inline Vectorized<uint8_t> Vectorized<uint8_t>::neg() const {
+  return Vectorized<uint8_t>(0) - *this;
+}
+
 // Emulate operations with no native 64-bit support in avx,
 // by extracting each element, performing the operation pointwise,
 // then combining the results into a vector.
@@ -888,6 +1154,12 @@ Vectorized<int8_t> inline operator*(const Vectorized<int8_t>& a, const Vectorize
   return int_elementwise_binary_256(a, b, std::multiplies<int8_t>());
 }
 
+template <>
+Vectorized<uint8_t> inline operator*(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  // We don't have an instruction for multiplying uint8_t
+  return int_elementwise_binary_256(a, b, std::multiplies<uint8_t>());
+}
+
 template <>
 Vectorized<int64_t> inline minimum(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::min(a_point, b_point);});
@@ -908,6 +1180,11 @@ Vectorized<int8_t> inline minimum(const Vectorized<int8_t>& a, const Vectorized<
   return _mm256_min_epi8(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline minimum(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return _mm256_min_epu8(a, b);
+}
+
 template <>
 Vectorized<int64_t> inline maximum(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::max(a_point, b_point);});
@@ -928,6 +1205,11 @@ Vectorized<int8_t> inline maximum(const Vectorized<int8_t>& a, const Vectorized<
   return _mm256_max_epi8(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline maximum(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return _mm256_max_epu8(a, b);
+}
+
 template <>
 Vectorized<int64_t> inline clamp(const Vectorized<int64_t>& a, const Vectorized<int64_t>& min_val, const Vectorized<int64_t>& max_val) {
   return emulate(a, min_val, max_val, [](int64_t a_point, int64_t min_point, int64_t max_point) {return std::min(max_point, std::max(a_point, min_point));});
@@ -948,6 +1230,11 @@ Vectorized<int8_t> inline clamp(const Vectorized<int8_t>& a, const Vectorized<in
   return _mm256_min_epi8(max_val, _mm256_max_epi8(a, min_val));
 }
 
+template <>
+Vectorized<uint8_t> inline clamp(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& min_val, const Vectorized<uint8_t>& max_val) {
+  return _mm256_min_epu8(max_val, _mm256_max_epu8(a, min_val));
+}
+
 template <>
 Vectorized<int64_t> inline clamp_max(const Vectorized<int64_t>& a, const Vectorized<int64_t>& max_val) {
   return emulate(a, max_val, [](int64_t a_point, int64_t max_point) {return std::min(max_point, a_point);});
@@ -968,6 +1255,11 @@ Vectorized<int8_t> inline clamp_max(const Vectorized<int8_t>& a, const Vectorize
   return _mm256_min_epi8(max_val, a);
 }
 
+template <>
+Vectorized<uint8_t> inline clamp_max(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& max_val) {
+  return _mm256_min_epu8(max_val, a);
+}
+
 template <>
 Vectorized<int64_t> inline clamp_min(const Vectorized<int64_t>& a, const Vectorized<int64_t>& min_val) {
   return emulate(a, min_val, [](int64_t a_point, int64_t min_point) {return std::max(min_point, a_point);});
@@ -988,6 +1280,11 @@ Vectorized<int8_t> inline clamp_min(const Vectorized<int8_t>& a, const Vectorize
   return _mm256_max_epi8(min_val, a);
 }
 
+template <>
+Vectorized<uint8_t> inline clamp_min(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& min_val) {
+  return _mm256_max_epu8(min_val, a);
+}
+
 template<typename T>
 Vectorized<int32_t> inline convert_to_int32(const T* ptr) {
   return Vectorized<int32_t>::loadu(ptr);
@@ -1019,6 +1316,10 @@ template <>
 Vectorized<int8_t> inline operator/(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
   return int_elementwise_binary_256(a, b, std::divides<int8_t>());
 }
+template <>
+Vectorized<uint8_t> inline operator/(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return int_elementwise_binary_256(a, b, std::divides<uint8_t>());
+}
 
 template<class T, typename std::enable_if_t<std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
 inline Vectorized<T> operator&(const Vectorized<T>& a, const Vectorized<T>& b) {
@@ -1133,6 +1434,30 @@ inline Vectorized<int8_t> Vectorized<int8_t>::le(const Vectorized<int8_t>& other
   return (*this <= other) & Vectorized<int8_t>(1);
 }
 
+inline Vectorized<uint8_t> Vectorized<uint8_t>::eq(const Vectorized<uint8_t>& other) const {
+  return (*this == other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::ne(const Vectorized<uint8_t>& other) const {
+  return (*this != other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::gt(const Vectorized<uint8_t>& other) const {
+  return (*this > other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::ge(const Vectorized<uint8_t>& other) const {
+  return (*this >= other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::lt(const Vectorized<uint8_t>& other) const {
+  return (*this < other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::le(const Vectorized<uint8_t>& other) const {
+  return (*this <= other) & Vectorized<uint8_t>(1);
+}
+
 template <bool left_shift>
 Vectorized<int16_t> inline shift_256_16(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
   // No vector instruction for shifting int16_t, so emulating it instead.
@@ -1202,9 +1527,10 @@ Vectorized<int16_t> inline shift_256_16(const Vectorized<int16_t>& a, const Vect
   return c;
 }
 
-template <bool left_shift>
-Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
-  // No vector instruction for shifting int8_t, so emulating it instead.
+template <bool left_shift, typename T, typename std::enable_if_t<std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value, int> = 0>
+Vectorized<T> inline shift_256_8(const Vectorized<T>& a, const Vectorized<T>& b) {
+  // No vector instruction for shifting int8_t/uint8_t, so emulating
+  // it instead.
 
   // Control masks for shuffle operation, treating 256 bits as an
   // array of 8-bit elements, and considering quadruples of
@@ -1276,7 +1602,10 @@ Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectori
   if (left_shift)
     c0 = _mm256_sllv_epi32(a0, b0);
   else
-    c0 = _mm256_srav_epi32(a0, b0);
+    if (std::is_same<T, int8_t>::value)
+      c0 = _mm256_srav_epi32(a0, b0);
+    else
+      c0 = _mm256_srlv_epi32(a0, b0);
   c0 = _mm256_shuffle_epi8(c0, ctl_3_0);
 
   // Peform shifting the same way for input array elements with
@@ -1287,7 +1616,10 @@ Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectori
   if (left_shift)
     c1 = _mm256_sllv_epi32(a1, b1);
   else
-    c1 = _mm256_srav_epi32(a1, b1);
+    if (std::is_same<T, int8_t>::value)
+      c1 = _mm256_srav_epi32(a1, b1);
+    else
+      c1 = _mm256_srlv_epi32(a1, b1);
   c1 = _mm256_shuffle_epi8(c1, ctl_3_1);
 
   // Peform shifting the same way for input array elements with
@@ -1298,7 +1630,10 @@ Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectori
   if (left_shift)
     c2 = _mm256_sllv_epi32(a2, b2);
   else
-    c2 = _mm256_srav_epi32(a2, b2);
+    if (std::is_same<T, int8_t>::value)
+      c2 = _mm256_srav_epi32(a2, b2);
+    else
+      c2 = _mm256_srlv_epi32(a2, b2);
   c2 = _mm256_shuffle_epi8(c2, ctl_3_2);
 
   // Peform shifting the same way for input array elements with
@@ -1309,7 +1644,10 @@ Vectorized<int8_t> inline shift_256_8(const Vectorized<int8_t>& a, const Vectori
   if (left_shift)
     c3 = _mm256_sllv_epi32(a3, b3);
   else
-    c3 = _mm256_srav_epi32(a3, b3);
+    if (std::is_same<T, int8_t>::value)
+      c3 = _mm256_srav_epi32(a3, b3);
+    else
+      c3 = _mm256_srlv_epi32(a3, b3);
   c3 = _mm256_and_si256(c3, keep_3);
 
   // Merge partial results into the final result.
@@ -1340,6 +1678,11 @@ Vectorized<int8_t> inline operator<<(const Vectorized<int8_t>& a, const Vectoriz
   return shift_256_8<true>(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline operator<<(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return shift_256_8<true>(a, b);
+}
+
 template <>
 Vectorized<int64_t> inline operator>>(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   // No vector instruction for right shifting int64_t, so emulating it
@@ -1372,6 +1715,11 @@ Vectorized<int8_t> inline operator>>(const Vectorized<int8_t>& a, const Vectoriz
   return shift_256_8<false>(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline operator>>(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return shift_256_8<false>(a, b);
+}
+
 #endif
 
 }}}
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
index bf03f8e290b60..a2550fbfc1dfa 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
@@ -828,6 +828,280 @@ class Vectorized<int8_t> : public Vectorizedi {
   Vectorized<int8_t> le(const Vectorized<int8_t>& other) const;
 };
 
+template <>
+class Vectorized<uint8_t> : public Vectorizedi {
+private:
+  static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
+  static const Vectorized<uint8_t> ones;
+public:
+  using value_type = uint8_t;
+  static constexpr int size() {
+    return 64;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  Vectorized(uint8_t v) { values = _mm512_set1_epi8(v); }
+  Vectorized(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
+         uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
+         uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
+         uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16,
+         uint8_t val17, uint8_t val18, uint8_t val19, uint8_t val20,
+         uint8_t val21, uint8_t val22, uint8_t val23, uint8_t val24,
+         uint8_t val25, uint8_t val26, uint8_t val27, uint8_t val28,
+         uint8_t val29, uint8_t val30, uint8_t val31, uint8_t val32,
+         uint8_t val33, uint8_t val34, uint8_t val35, uint8_t val36,
+         uint8_t val37, uint8_t val38, uint8_t val39, uint8_t val40,
+         uint8_t val41, uint8_t val42, uint8_t val43, uint8_t val44,
+         uint8_t val45, uint8_t val46, uint8_t val47, uint8_t val48,
+         uint8_t val49, uint8_t val50, uint8_t val51, uint8_t val52,
+         uint8_t val53, uint8_t val54, uint8_t val55, uint8_t val56,
+         uint8_t val57, uint8_t val58, uint8_t val59, uint8_t val60,
+         uint8_t val61, uint8_t val62, uint8_t val63, uint8_t val64){
+    values = _mm512_set_epi8(val64, val63, val62, val61, val60, val59, val58, val57,
+                              val56, val55, val54, val53,val52, val51, val50, val49,
+                              val48, val47, val46, val45, val44, val43, val42, val41,
+                              val40, val39, val38, val37, val36, val35, val34, val33,
+                              val32, val31, val30, val29, val28, val27, val26, val25,
+                              val24, val23, val22, val21, val20, val19, val18, val17,
+                              val16, val15, val14, val13, val12, val11, val10, val9,
+                              val8, val7, val6, val5, val4, val3, val2, val1);
+  }
+  template <int64_t mask>
+  static Vectorized<uint8_t> blend(Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
+    return _mm512_mask_blend_epi8(mask, a.values, b.values);
+  }
+  static Vectorized<uint8_t> blendv(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b,
+                               const Vectorized<uint8_t>& mask) {
+    auto msb_one = _mm512_set1_epi8(0xFF);
+    auto mask_ = _mm512_cmp_epu8_mask(mask, msb_one, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi8(mask_, a.values, b.values);
+  }
+  template <typename step_t>
+  static Vectorized<uint8_t> arange(uint8_t base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized<uint8_t>(
+      base,             base +      step, base +  2 * step, base +  3 * step,
+      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
+      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step,
+      base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step,
+      base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step,
+      base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
+      base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step,
+      base + 32 * step, base + 33 * step, base + 34 * step, base + 35 * step,
+      base + 36 * step, base + 37 * step, base + 38 * step, base + 39 * step,
+      base + 40 * step, base + 41 * step, base + 42 * step, base + 43 * step,
+      base + 44 * step, base + 45 * step, base + 46 * step, base + 47 * step,
+      base + 48 * step, base + 49 * step, base + 50 * step, base + 51 * step,
+      base + 52 * step, base + 53 * step, base + 54 * step, base + 55 * step,
+      base + 56 * step, base + 57 * step, base + 58 * step, base + 59 * step,
+      base + 60 * step, base + 61 * step, base + 62 * step, base + 63 * step);
+  }
+  static Vectorized<uint8_t>
+  set(Vectorized<uint8_t> a, Vectorized<uint8_t> b, uint8_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<0x1>(a, b);
+      case 2:
+        return blend<0x3>(a, b);
+      case 3:
+        return blend<0x7>(a, b);
+      case 4:
+        return blend<0xF>(a, b);
+      case 5:
+        return blend<0x1F>(a, b);
+      case 6:
+        return blend<0x3F>(a, b);
+      case 7:
+        return blend<0x7F>(a, b);
+      case 8:
+        return blend<0xFF>(a, b);
+      case 9:
+        return blend<0x1FF>(a, b);
+      case 10:
+        return blend<0x3FF>(a, b);
+      case 11:
+        return blend<0x7FF>(a, b);
+      case 12:
+        return blend<0xFFF>(a, b);
+      case 13:
+        return blend<0x1FFF>(a, b);
+      case 14:
+        return blend<0x3FFF>(a, b);
+      case 15:
+        return blend<0x7FFF>(a, b);
+      case 16:
+        return blend<0xFFFF>(a, b);
+      case 17:
+        return blend<0x1FFFF>(a, b);
+      case 18:
+        return blend<0x3FFFF>(a, b);
+      case 19:
+        return blend<0x7FFFF>(a, b);
+      case 20:
+        return blend<0xFFFFF>(a, b);
+      case 21:
+        return blend<0x1FFFFF>(a, b);
+      case 22:
+        return blend<0x3FFFFF>(a, b);
+      case 23:
+        return blend<0x7FFFFF>(a, b);
+      case 24:
+        return blend<0xFFFFFF>(a, b);
+      case 25:
+        return blend<0x1FFFFFF>(a, b);
+      case 26:
+        return blend<0x3FFFFFF>(a, b);
+      case 27:
+        return blend<0x7FFFFFF>(a, b);
+      case 28:
+        return blend<0xFFFFFFF>(a, b);
+      case 29:
+        return blend<0x1FFFFFFF>(a, b);
+      case 30:
+        return blend<0x3FFFFFFF>(a, b);
+      case 31:
+        return blend<0x7FFFFFFF>(a, b);
+      case 32:
+        return blend<0xFFFFFFFF>(a, b);
+      case 33:
+        return blend<0x1FFFFFFFF>(a, b);
+      case 34:
+        return blend<0x3FFFFFFFF>(a, b);
+      case 35:
+        return blend<0x7FFFFFFFF>(a, b);
+      case 36:
+        return blend<0xFFFFFFFFF>(a, b);
+      case 37:
+        return blend<0x1FFFFFFFFF>(a, b);
+      case 38:
+        return blend<0x3FFFFFFFFF>(a, b);
+      case 39:
+        return blend<0x7FFFFFFFFF>(a, b);
+      case 40:
+        return blend<0xFFFFFFFFFF>(a, b);
+      case 41:
+        return blend<0x1FFFFFFFFFF>(a, b);
+      case 42:
+        return blend<0x3FFFFFFFFFF>(a, b);
+      case 43:
+        return blend<0x7FFFFFFFFFF>(a, b);
+      case 44:
+        return blend<0xFFFFFFFFFFF>(a, b);
+      case 45:
+        return blend<0x1FFFFFFFFFFF>(a, b);
+      case 46:
+        return blend<0x3FFFFFFFFFFF>(a, b);
+      case 47:
+        return blend<0x7FFFFFFFFFFF>(a, b);
+      case 48:
+        return blend<0xFFFFFFFFFFFF>(a, b);
+      case 49:
+        return blend<0x1FFFFFFFFFFFF>(a, b);
+      case 50:
+        return blend<0x3FFFFFFFFFFFF>(a, b);
+      case 51:
+        return blend<0x7FFFFFFFFFFFF>(a, b);
+      case 52:
+        return blend<0xFFFFFFFFFFFFF>(a, b);
+      case 53:
+        return blend<0x1FFFFFFFFFFFFF>(a, b);
+      case 54:
+        return blend<0x3FFFFFFFFFFFFF>(a, b);
+      case 55:
+        return blend<0x7FFFFFFFFFFFFF>(a, b);
+      case 56:
+        return blend<0xFFFFFFFFFFFFFF>(a, b);
+      case 57:
+        return blend<0x1FFFFFFFFFFFFFF>(a, b);
+      case 58:
+        return blend<0x3FFFFFFFFFFFFFF>(a, b);
+      case 59:
+        return blend<0x7FFFFFFFFFFFFFF>(a, b);
+      case 60:
+        return blend<0xFFFFFFFFFFFFFFF>(a, b);
+      case 61:
+        return blend<0x1FFFFFFFFFFFFFFF>(a, b);
+      case 62:
+        return blend<0x3FFFFFFFFFFFFFFF>(a, b);
+      case 63:
+        return blend<0x7FFFFFFFFFFFFFFF>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<uint8_t> loadu(const void* ptr) {
+    return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+  }
+  static Vectorized<uint8_t> loadu(const void* ptr, uint8_t count) {
+    __at_align__ uint8_t tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(uint8_t));
+    return loadu(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+    } else if (count > 0) {
+      __at_align__ uint8_t tmp_values[size()];
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(uint8_t));
+    }
+  }
+  const uint8_t& operator[](int idx) const  = delete;
+  uint8_t& operator[](int idx)  = delete;
+  Vectorized<uint8_t> abs() const {
+    return values;
+  }
+  Vectorized<uint8_t> real() const {
+    return *this;
+  }
+  Vectorized<uint8_t> imag() const {
+    return _mm512_set1_epi8(0);
+  }
+  Vectorized<uint8_t> conj() const {
+    return *this;
+  }
+  Vectorized<uint8_t> frac() const;
+  Vectorized<uint8_t> neg() const;
+  Vectorized<uint8_t> operator==(const Vectorized<uint8_t>& other) const {
+    auto mask = _mm512_cmpeq_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized<uint8_t> operator!=(const Vectorized<uint8_t>& other) const {
+    auto mask = _mm512_cmpneq_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized<uint8_t> operator<(const Vectorized<uint8_t>& other) const {
+    auto mask = _mm512_cmplt_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized<uint8_t> operator<=(const Vectorized<uint8_t>& other) const {
+    auto mask = _mm512_cmple_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized<uint8_t> operator>(const Vectorized<uint8_t>& other) const {
+    return other < *this;
+  }
+  Vectorized<uint8_t> operator>=(const Vectorized<uint8_t>& other) const {
+    return other <= *this;
+  }
+
+  Vectorized<uint8_t> eq(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> ne(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> gt(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> ge(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> lt(const Vectorized<uint8_t>& other) const;
+  Vectorized<uint8_t> le(const Vectorized<uint8_t>& other) const;
+};
+
 template <>
 Vectorized<int64_t> inline operator+(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return _mm512_add_epi64(a, b);
@@ -848,6 +1122,11 @@ Vectorized<int8_t> inline operator+(const Vectorized<int8_t>& a, const Vectorize
   return _mm512_add_epi8(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline operator+(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return _mm512_add_epi8(a, b);
+}
+
 template <>
 Vectorized<int64_t> inline operator-(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return _mm512_sub_epi64(a, b);
@@ -868,6 +1147,11 @@ Vectorized<int8_t> inline operator-(const Vectorized<int8_t>& a, const Vectorize
   return _mm512_sub_epi8(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline operator-(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return _mm512_sub_epi8(a, b);
+}
+
 // Negation. Defined here so we can utilize operator-
 inline Vectorized<int64_t> Vectorized<int64_t>::neg() const {
   return Vectorized<int64_t>(0) - *this;
@@ -885,6 +1169,10 @@ inline Vectorized<int8_t> Vectorized<int8_t>::neg() const {
   return Vectorized<int8_t>(0) - *this;
 }
 
+inline Vectorized<uint8_t> Vectorized<uint8_t>::neg() const {
+  return Vectorized<uint8_t>(0) - *this;
+}
+
 template <>
 Vectorized<int64_t> inline operator*(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return _mm512_mullo_epi64(a, b);
@@ -918,6 +1206,12 @@ Vectorized<int8_t> inline operator*(const Vectorized<int8_t>& a, const Vectorize
   return int_elementwise_binary_512(a, b, std::multiplies<int8_t>());
 }
 
+template <>
+Vectorized<uint8_t> inline operator*(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  // We don't have an instruction for multiplying uint8_t
+  return int_elementwise_binary_512(a, b, std::multiplies<uint8_t>());
+}
+
 template <>
 Vectorized<int64_t> inline minimum(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return _mm512_min_epi64(a, b);
@@ -938,6 +1232,11 @@ Vectorized<int8_t> inline minimum(const Vectorized<int8_t>& a, const Vectorized<
   return _mm512_min_epi8(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline minimum(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return _mm512_min_epu8(a, b);
+}
+
 template <>
 Vectorized<int64_t> inline maximum(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return _mm512_max_epi64(a, b);
@@ -958,6 +1257,11 @@ Vectorized<int8_t> inline maximum(const Vectorized<int8_t>& a, const Vectorized<
   return _mm512_max_epi8(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline maximum(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return _mm512_max_epi8(a, b);
+}
+
 template <>
 Vectorized<int64_t> inline clamp(const Vectorized<int64_t>& a, const Vectorized<int64_t>& min_val, const Vectorized<int64_t>& max_val) {
   return _mm512_min_epi64(max_val, _mm512_max_epi64(a, min_val));
@@ -978,6 +1282,11 @@ Vectorized<int8_t> inline clamp(const Vectorized<int8_t>& a, const Vectorized<in
   return _mm512_min_epi8(max_val, _mm512_max_epi8(a, min_val));
 }
 
+template <>
+Vectorized<uint8_t> inline clamp(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& min_val, const Vectorized<uint8_t>& max_val) {
+  return _mm512_min_epu8(max_val, _mm512_max_epu8(a, min_val));
+}
+
 template <>
 Vectorized<int64_t> inline clamp_max(const Vectorized<int64_t>& a, const Vectorized<int64_t>& max_val) {
   return _mm512_min_epi64(max_val, a);
@@ -998,6 +1307,11 @@ Vectorized<int8_t> inline clamp_max(const Vectorized<int8_t>& a, const Vectorize
   return _mm512_min_epi8(max_val, a);
 }
 
+template <>
+Vectorized<uint8_t> inline clamp_max(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& max_val) {
+  return _mm512_min_epu8(max_val, a);
+}
+
 template <>
 Vectorized<int64_t> inline clamp_min(const Vectorized<int64_t>& a, const Vectorized<int64_t>& min_val) {
   return _mm512_max_epi64(min_val, a);
@@ -1018,6 +1332,11 @@ Vectorized<int8_t> inline clamp_min(const Vectorized<int8_t>& a, const Vectorize
   return _mm512_max_epi8(min_val, a);
 }
 
+template <>
+Vectorized<uint8_t> inline clamp_min(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& min_val) {
+  return _mm512_max_epu8(min_val, a);
+}
+
 template<typename T>
 Vectorized<int32_t> inline convert_to_int32(const T* ptr) {
   return Vectorized<int32_t>::loadu(ptr);
@@ -1049,6 +1368,10 @@ template <>
 Vectorized<int8_t> inline operator/(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
   return int_elementwise_binary_512(a, b, std::divides<int8_t>());
 }
+template <>
+Vectorized<uint8_t> inline operator/(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return int_elementwise_binary_512(a, b, std::divides<uint8_t>());
+}
 
 template<class T, typename std::enable_if_t<std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
 inline Vectorized<T> operator&(const Vectorized<T>& a, const Vectorized<T>& b) {
@@ -1163,9 +1486,34 @@ inline Vectorized<int8_t> Vectorized<int8_t>::le(const Vectorized<int8_t>& other
   return (*this <= other) & Vectorized<int8_t>(1);
 }
 
-template <bool left_shift>
-Vectorized<int8_t> inline shift_512_8(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b) {
-  // No vector instruction for shifting int8_t, so emulating it instead.
+inline Vectorized<uint8_t> Vectorized<uint8_t>::eq(const Vectorized<uint8_t>& other) const {
+  return (*this == other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::ne(const Vectorized<uint8_t>& other) const {
+  return (*this != other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::gt(const Vectorized<uint8_t>& other) const {
+  return (*this > other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::ge(const Vectorized<uint8_t>& other) const {
+  return (*this >= other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::lt(const Vectorized<uint8_t>& other) const {
+  return (*this < other) & Vectorized<uint8_t>(1);
+}
+
+inline Vectorized<uint8_t> Vectorized<uint8_t>::le(const Vectorized<uint8_t>& other) const {
+  return (*this <= other) & Vectorized<uint8_t>(1);
+}
+
+template <bool left_shift, typename T, typename std::enable_if_t<std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value, int> = 0>
+Vectorized<T> inline shift_512_8(const Vectorized<T>& a, const Vectorized<T>& b) {
+  // No vector instruction for shifting int8_t/uint8_t, so emulating
+  // it instead.
 
   // Control masks for shuffle operation, treating 512 bits as an
   // array of 8-bit elements, and considering pairs of neighboring
@@ -1220,7 +1568,10 @@ Vectorized<int8_t> inline shift_512_8(const Vectorized<int8_t>& a, const Vectori
   if (left_shift)
     c0 = _mm512_sllv_epi16(a0, b0);
   else
-    c0 = _mm512_srav_epi16(a0, b0);
+    if (std::is_same<T, int8_t>::value)
+      c0 = _mm512_srav_epi16(a0, b0);
+    else
+      c0 = _mm512_srlv_epi16(a0, b0);
   c0 = _mm512_shuffle_epi8(c0, ctl_1_0);
 
   // Peform shifting the same way for input array elements with
@@ -1231,7 +1582,10 @@ Vectorized<int8_t> inline shift_512_8(const Vectorized<int8_t>& a, const Vectori
   if (left_shift)
     c1 = _mm512_sllv_epi16(a1, b1);
   else
-    c1 = _mm512_srav_epi16(a1, b1);
+    if (std::is_same<T, int8_t>::value)
+      c1 = _mm512_srav_epi16(a1, b1);
+    else
+      c1 = _mm512_srlv_epi16(a1, b1);
   c1 = _mm512_and_si512(c1, keep_1);
 
   // Merge partial results into the final result.
@@ -1260,6 +1614,11 @@ Vectorized<int8_t> inline operator<<(const Vectorized<int8_t>& a, const Vectoriz
   return shift_512_8<true>(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline operator<<(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return shift_512_8<true>(a, b);
+}
+
 template <>
 Vectorized<int64_t> inline operator>>(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return _mm512_srav_epi64(a, b);
@@ -1280,6 +1639,11 @@ Vectorized<int8_t> inline operator>>(const Vectorized<int8_t>& a, const Vectoriz
   return shift_512_8<false>(a, b);
 }
 
+template <>
+Vectorized<uint8_t> inline operator>>(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b) {
+  return shift_512_8<false>(a, b);
+}
+
 #endif
 
 }}}
diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
index a82f3ed3eaa1d..bbf45ba2ecd0b 100644
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -333,20 +333,9 @@ static void and_kernel_impl(TensorIterator& iter) {
     binary_kernel_reduce_vec(
         iter,
         [=](uint8_t a, uint8_t b) -> uint8_t { return (a && b) ? 1 : 0; },
-#if defined(CPU_CAPABILITY_ZVECTOR)
         [=](Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
           return a & b;
         },
-#else
-        [=](Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
-          Vectorized<uint8_t> c = Vectorized<uint8_t>();
-
-          for (decltype(c.size()) i = 0; i != Vectorized<uint8_t>::size(); i++) {
-            c[i] = (a[i] && b[i]) ? 1 : 0;
-          }
-          return c;
-        },
-#endif
         /*ident=*/true);
   } else {
     binary_kernel_reduce_vec(
@@ -380,20 +369,9 @@ static void or_kernel_impl(TensorIterator& iter) {
     binary_kernel_reduce_vec(
         iter,
         [=](uint8_t a, uint8_t b) -> uint8_t { return (a || b) ? 1 : 0; },
-#if defined(CPU_CAPABILITY_ZVECTOR)
         [=](Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
           return a | b;
         },
-#else
-        [=](Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
-          Vectorized<uint8_t> c = Vectorized<uint8_t>();
-
-          for (decltype(c.size()) i = 0; i != Vectorized<uint8_t>::size(); i++) {
-            c[i] = (a[i] || b[i]) ? 1 : 0;
-          }
-          return c;
-        },
-#endif
         /*ident=*/false);
   } else {
     binary_kernel_reduce_vec(

From 73f9fc57410a95288a87abfd346d5753c504c12d Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Thu, 24 Nov 2022 21:41:20 +0000
Subject: [PATCH 1262/1922] [test_nn] split pruning tests from test_nn (#89590)

Ref: https://github.com/pytorch/pytorch/issues/63085

Note: Doesn't need corresponding XLA PR as the migrated tests were not run on XLA (as they weren't in TestNNDeviceType).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89590
Approved by: https://github.com/albanD
---
 test/nn/test_pruning.py | 939 ++++++++++++++++++++++++++++++++++++++++
 test/test_nn.py         | 921 +--------------------------------------
 2 files changed, 940 insertions(+), 920 deletions(-)
 create mode 100644 test/nn/test_pruning.py

diff --git a/test/nn/test_pruning.py b/test/nn/test_pruning.py
new file mode 100644
index 0000000000000..bd2db02d056fc
--- /dev/null
+++ b/test/nn/test_pruning.py
@@ -0,0 +1,939 @@
+# Owner(s): ["module: nn"]
+import unittest
+import unittest.mock as mock
+import pickle
+
+import torch
+
+import torch.nn as nn
+import torch.nn.utils.prune as prune
+from torch.testing._internal.common_utils import TEST_NUMPY, TemporaryFileName, \
+    instantiate_parametrized_tests, run_tests
+from torch.testing._internal.common_nn import NNTestCase
+
+class TestPruningNN(NNTestCase):
+    _do_cuda_memory_leak_check = True
+    _do_cuda_non_default_stream = True
+
+    # torch/nn/utils/prune.py
+    @unittest.skipIf(not TEST_NUMPY, "numpy not found")
+    def test_validate_pruning_amount_init(self):
+        r"""Test the first util function that validates the pruning
+        amount requested by the user the moment the pruning method
+        is initialized. This test checks that the expected errors are
+        raised whenever the amount is invalid.
+        The original function runs basic type checking + value range checks.
+        It doesn't check the validity of the pruning amount with
+        respect to the size of the tensor to prune. That's left to
+        `_validate_pruning_amount`, tested below.
+        """
+        # neither float not int should raise TypeError
+        with self.assertRaises(TypeError):
+            prune._validate_pruning_amount_init(amount="I'm a string")
+
+        # float not in [0, 1] should raise ValueError
+        with self.assertRaises(ValueError):
+            prune._validate_pruning_amount_init(amount=1.1)
+        with self.assertRaises(ValueError):
+            prune._validate_pruning_amount_init(amount=20.)
+
+        # negative int should raise ValueError
+        with self.assertRaises(ValueError):
+            prune._validate_pruning_amount_init(amount=-10)
+
+        # all these should pass without errors because they're valid amounts
+        prune._validate_pruning_amount_init(amount=0.34)
+        prune._validate_pruning_amount_init(amount=1500)
+        prune._validate_pruning_amount_init(amount=0)
+        prune._validate_pruning_amount_init(amount=0.)
+        prune._validate_pruning_amount_init(amount=1)
+        prune._validate_pruning_amount_init(amount=1.)
+        self.assertTrue(True)
+
+    @unittest.skipIf(not TEST_NUMPY, "numpy not found")
+    def test_validate_pruning_amount(self):
+        r"""Tests the second util function that validates the pruning
+        amount requested by the user, this time with respect to the size
+        of the tensor to prune. The rationale is that if the pruning amount,
+        converted to absolute value of units to prune, is larger than
+        the number of units in the tensor, then we expect the util function
+        to raise a value error.
+        """
+        # if amount is int and amount > tensor_size, raise ValueError
+        with self.assertRaises(ValueError):
+            prune._validate_pruning_amount(amount=20, tensor_size=19)
+
+        # amount is a float so this should not raise an error
+        prune._validate_pruning_amount(amount=0.3, tensor_size=0)
+
+        # this is okay
+        prune._validate_pruning_amount(amount=19, tensor_size=20)
+        prune._validate_pruning_amount(amount=0, tensor_size=0)
+        prune._validate_pruning_amount(amount=1, tensor_size=1)
+        self.assertTrue(True)
+
+    @unittest.skipIf(not TEST_NUMPY, "numpy not found")
+    def test_compute_nparams_to_prune(self):
+        r"""Test that requested pruning `amount` gets translated into the
+        correct absolute number of units to prune.
+        """
+        self.assertEqual(
+            prune._compute_nparams_toprune(amount=0, tensor_size=15),
+            0
+        )
+        self.assertEqual(
+            prune._compute_nparams_toprune(amount=10, tensor_size=15),
+            10
+        )
+        # if 1 is int, means 1 unit
+        self.assertEqual(
+            prune._compute_nparams_toprune(amount=1, tensor_size=15),
+            1
+        )
+        # if 1. is float, means 100% of units
+        self.assertEqual(
+            prune._compute_nparams_toprune(amount=1., tensor_size=15),
+            15
+        )
+        self.assertEqual(
+            prune._compute_nparams_toprune(amount=0.4, tensor_size=17),
+            7
+        )
+
+    def test_random_pruning_sizes(self):
+        r"""Test that the new parameters and buffers created by the pruning
+        method have the same size as the input tensor to prune. These, in
+        fact, correspond to the pruned version of the tensor itself, its
+        mask, and its original copy, so the size must match.
+        """
+        # fixturize test
+        # TODO: add other modules
+        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
+        names = ['weight', 'bias']
+
+        for m in modules:
+            for name in names:
+                with self.subTest(m=m, name=name):
+                    original_tensor = getattr(m, name)
+
+                    prune.random_unstructured(m, name=name, amount=0.1)
+                    # mask has the same size as tensor being pruned
+                    self.assertEqual(
+                        original_tensor.size(),
+                        getattr(m, name + '_mask').size()
+                    )
+                    # 'orig' tensor has the same size as the original tensor
+                    self.assertEqual(
+                        original_tensor.size(),
+                        getattr(m, name + '_orig').size()
+                    )
+                    # new tensor has the same size as the original tensor
+                    self.assertEqual(
+                        original_tensor.size(),
+                        getattr(m, name).size()
+                    )
+
+    def test_random_pruning_orig(self):
+        r"""Test that original tensor is correctly stored in 'orig'
+        after pruning is applied. Important to make sure we don't
+        lose info about the original unpruned parameter.
+        """
+        # fixturize test
+        # TODO: add other modules
+        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
+        names = ['weight', 'bias']
+
+        for m in modules:
+            for name in names:
+                with self.subTest(m=m, name=name):
+
+                    # tensor prior to pruning
+                    original_tensor = getattr(m, name)
+                    prune.random_unstructured(m, name=name, amount=0.1)
+                    self.assertEqual(
+                        original_tensor,
+                        getattr(m, name + '_orig')
+                    )
+
+    def test_random_pruning_new_weight(self):
+        r"""Test that module.name now contains a pruned version of
+        the original tensor obtained from multiplying it by the mask.
+        """
+        # fixturize test
+        # TODO: add other modules
+        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
+        names = ['weight', 'bias']
+
+        for m in modules:
+            for name in names:
+                with self.subTest(m=m, name=name):
+                    # tensor prior to pruning
+                    original_tensor = getattr(m, name)
+                    prune.random_unstructured(m, name=name, amount=0.1)
+                    # weight = weight_orig * weight_mask
+                    self.assertEqual(
+                        getattr(m, name),
+                        getattr(m, name + '_orig')
+                        * getattr(m, name + '_mask').to(
+                            dtype=original_tensor.dtype
+                        ),
+                    )
+
+    def test_identity_pruning(self):
+        r"""Test that a mask of 1s does not change forward or backward.
+        """
+        input_ = torch.ones(1, 5)
+        m = nn.Linear(5, 2)
+        y_prepruning = m(input_)  # output prior to pruning
+
+        # compute grad pre-pruning and check it's equal to all ones
+        y_prepruning.sum().backward()
+        old_grad_weight = m.weight.grad.clone()  # don't grab pointer!
+        self.assertEqual(old_grad_weight, torch.ones_like(m.weight))
+        old_grad_bias = m.bias.grad.clone()
+        self.assertEqual(old_grad_bias, torch.ones_like(m.bias))
+
+        # remove grads
+        m.zero_grad()
+
+        # force the mask to be made of all 1s
+        prune.identity(m, name="weight")
+
+        # with mask of 1s, output should be identical to no mask
+        y_postpruning = m(input_)
+        self.assertEqual(y_prepruning, y_postpruning)
+
+        # with mask of 1s, grad should be identical to no mask
+        y_postpruning.sum().backward()
+        self.assertEqual(old_grad_weight, m.weight_orig.grad)
+        self.assertEqual(old_grad_bias, m.bias.grad)
+
+        # calling forward twice in a row shouldn't change output
+        y1 = m(input_)
+        y2 = m(input_)
+        self.assertEqual(y1, y2)
+
+    def test_random_pruning_0perc(self):
+        r"""Test that a mask of 1s does not change forward or backward.
+        """
+        input_ = torch.ones(1, 5)
+        m = nn.Linear(5, 2)
+        y_prepruning = m(input_)  # output prior to pruning
+
+        # compute grad pre-pruning and check it's equal to all ones
+        y_prepruning.sum().backward()
+        old_grad_weight = m.weight.grad.clone()  # don't grab pointer!
+        self.assertEqual(old_grad_weight, torch.ones_like(m.weight))
+        old_grad_bias = m.bias.grad.clone()
+        self.assertEqual(old_grad_bias, torch.ones_like(m.bias))
+
+        # remove grads
+        m.zero_grad()
+
+        # force the mask to be made of all 1s
+        with mock.patch(
+            "torch.nn.utils.prune.RandomUnstructured.compute_mask"
+        ) as compute_mask:
+            compute_mask.return_value = torch.ones_like(m.weight)
+            prune.random_unstructured(m, name='weight', amount=0.9)  # amount won't count
+
+        # with mask of 1s, output should be identical to no mask
+        y_postpruning = m(input_)
+        self.assertEqual(y_prepruning, y_postpruning)
+
+        # with mask of 1s, grad should be identical to no mask
+        y_postpruning.sum().backward()
+        self.assertEqual(old_grad_weight, m.weight_orig.grad)
+        self.assertEqual(old_grad_bias, m.bias.grad)
+
+        # calling forward twice in a row shouldn't change output
+        y1 = m(input_)
+        y2 = m(input_)
+        self.assertEqual(y1, y2)
+
+    def test_random_pruning(self):
+        input_ = torch.ones(1, 5)
+        m = nn.Linear(5, 2)
+
+        # define custom mask to assign with mock
+        mask = torch.ones_like(m.weight)
+        mask[1, 0] = 0
+        mask[0, 3] = 0
+
+        # check grad is zero for masked weights
+        with mock.patch(
+            "torch.nn.utils.prune.RandomUnstructured.compute_mask"
+        ) as compute_mask:
+            compute_mask.return_value = mask
+            prune.random_unstructured(m, name='weight', amount=0.9)
+
+        y_postpruning = m(input_)
+        y_postpruning.sum().backward()
+        # weight_orig is the parameter, so it's the tensor that will accumulate the grad
+        self.assertEqual(m.weight_orig.grad, mask)  # all 1s, except for masked units
+        self.assertEqual(m.bias.grad, torch.ones_like(m.bias))
+
+        # make sure that weight_orig update doesn't modify [1, 0] and [0, 3]
+        old_weight_orig = m.weight_orig.clone()
+        # update weights
+        learning_rate = 1.
+        for p in m.parameters():
+            p.data.sub_(p.grad.data * learning_rate)
+        # since these are pruned, they should not be updated
+        self.assertEqual(old_weight_orig[1, 0], m.weight_orig[1, 0])
+        self.assertEqual(old_weight_orig[0, 3], m.weight_orig[0, 3])
+
+    def test_random_pruning_forward(self):
+        r"""check forward with mask (by hand).
+        """
+        input_ = torch.ones(1, 5)
+        m = nn.Linear(5, 2)
+
+        # define custom mask to assign with mock
+        mask = torch.zeros_like(m.weight)
+        mask[1, 0] = 1
+        mask[0, 3] = 1
+
+        with mock.patch(
+            "torch.nn.utils.prune.RandomUnstructured.compute_mask"
+        ) as compute_mask:
+            compute_mask.return_value = mask
+            prune.random_unstructured(m, name='weight', amount=0.9)
+
+        yhat = m(input_)
+        self.assertEqual(yhat[0, 0], m.weight_orig[0, 3] + m.bias[0])
+        self.assertEqual(yhat[0, 1], m.weight_orig[1, 0] + m.bias[1])
+
+    def test_remove_pruning_forward(self):
+        r"""Remove pruning and check forward is unchanged from previous
+        pruned state.
+        """
+        input_ = torch.ones(1, 5)
+        m = nn.Linear(5, 2)
+
+        # define custom mask to assign with mock
+        mask = torch.ones_like(m.weight)
+        mask[1, 0] = 0
+        mask[0, 3] = 0
+
+        # check grad is zero for masked weights
+        with mock.patch(
+            "torch.nn.utils.prune.RandomUnstructured.compute_mask"
+        ) as compute_mask:
+            compute_mask.return_value = mask
+            prune.random_unstructured(m, name='weight', amount=0.9)
+
+        y_postpruning = m(input_)
+
+        prune.remove(m, 'weight')
+
+        y_postremoval = m(input_)
+        self.assertEqual(y_postpruning, y_postremoval)
+
+    def test_pruning_id_consistency(self):
+        r"""Test that pruning doesn't change the id of the parameters, which
+        would otherwise introduce issues with pre-existing optimizers that
+        point to old parameters.
+        """
+        m = nn.Linear(5, 2, bias=False)
+
+        tensor_id = id(list(m.parameters())[0])
+
+        prune.random_unstructured(m, name="weight", amount=0.9)
+        self.assertEqual(tensor_id, id(list(m.parameters())[0]))
+
+        prune.remove(m, "weight")
+        self.assertEqual(tensor_id, id(list(m.parameters())[0]))
+
+    def test_random_pruning_pickle(self):
+        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
+        names = ['weight', 'bias']
+
+        for m in modules:
+            for name in names:
+                with self.subTest(m=m, name=name):
+                    prune.random_unstructured(m, name=name, amount=0.1)
+                    m_new = pickle.loads(pickle.dumps(m))
+                    self.assertIsInstance(m_new, type(m))
+
+    def test_multiple_pruning_calls(self):
+        # if you call pruning twice, the hook becomes a PruningContainer
+        m = nn.Conv3d(2, 2, 2)
+        prune.l1_unstructured(m, name='weight', amount=0.1)
+        weight_mask0 = m.weight_mask  # save it for later sanity check
+
+        # prune again
+        prune.ln_structured(m, name='weight', amount=0.3, n=2, dim=0)
+        hook = next(iter(m._forward_pre_hooks.values()))
+        self.assertIsInstance(
+            hook,
+            torch.nn.utils.prune.PruningContainer
+        )
+        # check that container._tensor_name is correctly set no matter how
+        # many pruning methods are in the container
+        self.assertEqual(hook._tensor_name, 'weight')
+
+        # check that the pruning container has the right length
+        # equal to the number of pruning iters
+        self.assertEqual(len(hook), 2)  # m.weight has been pruned twice
+
+        # check that the entries of the pruning container are of the expected
+        # type and in the expected order
+        self.assertIsInstance(hook[0], torch.nn.utils.prune.L1Unstructured)
+        self.assertIsInstance(hook[1], torch.nn.utils.prune.LnStructured)
+
+        # check that all entries that are 0 in the 1st mask are 0 in the
+        # 2nd mask too
+        self.assertTrue(torch.all(m.weight_mask[weight_mask0 == 0] == 0))
+
+        # prune again
+        prune.ln_structured(m, name='weight', amount=0.1, n=float('inf'), dim=1)
+        # check that container._tensor_name is correctly set no matter how
+        # many pruning methods are in the container
+        hook = next(iter(m._forward_pre_hooks.values()))
+        self.assertEqual(hook._tensor_name, 'weight')
+
+    def test_pruning_container(self):
+        # create an empty container
+        container = prune.PruningContainer()
+        container._tensor_name = 'test'
+        self.assertEqual(len(container), 0)
+
+        p = prune.L1Unstructured(amount=2)
+        p._tensor_name = 'test'
+
+        # test adding a pruning method to a container
+        container.add_pruning_method(p)
+
+        # test error raised if tensor name is different
+        q = prune.L1Unstructured(amount=2)
+        q._tensor_name = 'another_test'
+        with self.assertRaises(ValueError):
+            container.add_pruning_method(q)
+
+        # test that adding a non-pruning method object to a pruning container
+        # raises a TypeError
+        with self.assertRaises(TypeError):
+            container.add_pruning_method(10)
+        with self.assertRaises(TypeError):
+            container.add_pruning_method('ugh')
+
+    def test_pruning_container_compute_mask(self):
+        r"""Test `compute_mask` of pruning container with a known `t` and
+        `default_mask`. Indirectly checks that Ln structured pruning is
+        acting on the right axis.
+        """
+        # create an empty container
+        container = prune.PruningContainer()
+        container._tensor_name = 'test'
+
+        # 1) test unstructured pruning
+        # create a new pruning method
+        p = prune.L1Unstructured(amount=2)
+        p._tensor_name = 'test'
+        # add the pruning method to the container
+        container.add_pruning_method(p)
+
+        # create tensor to be pruned
+        t = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]).to(dtype=torch.float32)
+        # create prior mask by hand
+        default_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 1]])
+        # since we are pruning the two lowest magnitude units, the outcome of
+        # the calculation should be this:
+        expected_mask = torch.tensor([[0, 0, 1, 0], [1, 1, 0, 1]], dtype=torch.float32)
+        computed_mask = container.compute_mask(t, default_mask)
+        self.assertEqual(expected_mask, computed_mask)
+
+        # 2) test structured pruning
+        q = prune.LnStructured(amount=1, n=2, dim=0)
+        q._tensor_name = 'test'
+        container.add_pruning_method(q)
+        # since we are pruning the lowest magnitude one of the two rows, the
+        # outcome of the calculation should be this:
+        expected_mask = torch.tensor([[0, 0, 0, 0], [1, 1, 0, 1]], dtype=torch.float32)
+        computed_mask = container.compute_mask(t, default_mask)
+        self.assertEqual(expected_mask, computed_mask)
+
+        # 2) test structured pruning, along another axis
+        r = prune.LnStructured(amount=1, n=2, dim=1)
+        r._tensor_name = 'test'
+        container.add_pruning_method(r)
+        # since we are pruning the lowest magnitude of the four columns, the
+        # outcome of the calculation should be this:
+        expected_mask = torch.tensor([[0, 1, 1, 0], [0, 1, 0, 1]], dtype=torch.float32)
+        computed_mask = container.compute_mask(t, default_mask)
+        self.assertEqual(expected_mask, computed_mask)
+
+    def test_l1_unstructured_pruning(self):
+        r"""Test that l1 unstructured pruning actually removes the lowest
+        entries by l1 norm (by hand). It also checks that applying l1
+        unstructured pruning more than once respects the previous mask.
+        """
+        m = nn.Linear(4, 2)
+        # modify its weight matrix by hand
+        m.weight = torch.nn.Parameter(
+            torch.tensor(
+                [[1, 2, 3, 4], [-4, -3, -2, -1]], dtype=torch.float32
+            )
+        )
+
+        prune.l1_unstructured(m, 'weight', amount=2)
+        expected_weight = torch.tensor([[0, 2, 3, 4], [-4, -3, -2, 0]],
+                                       dtype=m.weight.dtype)
+        self.assertEqual(expected_weight, m.weight)
+
+        # check that pruning again removes the next two smallest entries
+        prune.l1_unstructured(m, 'weight', amount=2)
+        expected_weight = torch.tensor([[0, 0, 3, 4], [-4, -3, 0, 0]],
+                                       dtype=m.weight.dtype)
+        self.assertEqual(expected_weight, m.weight)
+
+    def test_l1_unstructured_pruning_with_importance_scores(self):
+        r"""Test that l1 unstructured pruning actually removes the lowest
+        entries of importance scores and not the parameter by l1 norm (by hand).
+        It also checks that applying l1 unstructured pruning more than once
+        respects the previous mask.
+        """
+        m = nn.Linear(4, 2)
+        # modify its weight matrix by hand
+        m.weight = torch.nn.Parameter(
+            torch.tensor(
+                [[1, 2, 3, 4], [-4, -3, -2, -1]], dtype=torch.float32
+            )
+        )
+        importance_scores = torch.tensor(
+            [[4, 2, 1, 3], [-3, -1, -2, -4]], dtype=torch.float32
+        )
+
+        prune.l1_unstructured(m, 'weight', amount=2, importance_scores=importance_scores)
+        expected_weight = torch.tensor([[1, 2, 0, 4], [-4, 0, -2, -1]],
+                                       dtype=m.weight.dtype)
+        self.assertEqual(expected_weight, m.weight)
+
+        # check that pruning again removes two entries of m.weight that are colocated with
+        # the next two smallest absolute values of importance scores.
+        prune.l1_unstructured(m, 'weight', amount=2, importance_scores=importance_scores)
+        expected_weight = torch.tensor([[1, 0, 0, 4], [-4, 0, 0, -1]],
+                                       dtype=m.weight.dtype)
+        self.assertEqual(expected_weight, m.weight)
+
+    def test_unstructured_pruning_same_magnitude(self):
+        r"""Since it may happen that the tensor to prune has entries with the
+        same exact magnitude, it is important to check that pruning happens
+        consistenly based on the bottom % of weights, and not by threshold,
+        which would instead kill off *all* units with magnitude = threshold.
+        """
+        AMOUNT = 0.2
+        p = prune.L1Unstructured(amount=AMOUNT)
+        # create a random tensors with entries in {-2, 0, 2}
+        t = 2 * torch.randint(low=-1, high=2, size=(10, 7))
+        nparams_toprune = prune._compute_nparams_toprune(AMOUNT, t.nelement())
+
+        computed_mask = p.compute_mask(t, default_mask=torch.ones_like(t))
+        nparams_pruned = torch.sum(computed_mask == 0)
+        self.assertEqual(nparams_toprune, nparams_pruned)
+
+    def test_random_structured_pruning_amount(self):
+        AMOUNT = 0.6
+        AXIS = 2
+        p = prune.RandomStructured(amount=AMOUNT, dim=AXIS)
+        t = 2 * torch.randint(low=-1, high=2, size=(5, 4, 2)).to(
+            dtype=torch.float32
+        )
+        nparams_toprune = prune._compute_nparams_toprune(AMOUNT, t.shape[AXIS])
+
+        computed_mask = p.compute_mask(t, default_mask=torch.ones_like(t))
+        # check that 1 column is fully prune, the others are left untouched
+        remaining_axes = [_ for _ in range(len(t.shape)) if _ != AXIS]
+        per_column_sums = sorted(
+            torch.sum(computed_mask == 0, axis=remaining_axes)
+        )
+        assert per_column_sums == [0, 20]
+
+    def test_ln_structured_pruning(self):
+        r"""Check Ln structured pruning by hand.
+        """
+        m = nn.Conv2d(3, 1, 2)
+        m.weight.data = torch.tensor(
+            [[[[1., 2.], [1., 2.5]],
+             [[0.5, 1.], [0.1, 0.1]],
+             [[-3., -5.], [0.1, -1.]]]]
+        )
+        # expected effect of pruning 1 of the 3 channels by L2-norm
+        expected_mask_axis1 = torch.ones_like(m.weight)
+        expected_mask_axis1[:, 1] = 0.
+
+        prune.ln_structured(m, 'weight', amount=1, n=2, dim=1)
+        self.assertEqual(expected_mask_axis1, m.weight_mask)
+
+        # expected effect of pruning 1 of the 2 columns along axis -1 by L1-norm
+        expected_mask_axis3 = expected_mask_axis1
+        expected_mask_axis3[:, :, :, 0] = 0.
+
+        prune.ln_structured(m, 'weight', amount=1, n=1, dim=-1)
+        self.assertEqual(expected_mask_axis3, m.weight_mask)
+
+    def test_ln_structured_pruning_importance_scores(self):
+        r"""Check Ln structured pruning by hand.
+        """
+        m = nn.Conv2d(3, 1, 2)
+        m.weight.data = torch.tensor(
+            [[[[1., 2.], [1., 2.5]],
+             [[0.5, 1.], [0.1, 0.1]],
+             [[-3., -5.], [0.1, -1.]]]]
+        )
+        importance_scores = torch.tensor(
+            [[[[10., 1.], [10., 1.]],
+             [[30., 3.], [30., 3.]],
+             [[-20., -2.], [-20., -2.]]]]
+        )
+        # expected effect of pruning 1 of the 3 channels by L2-norm
+        expected_mask_axis1 = torch.ones_like(m.weight)
+        expected_mask_axis1[:, 0] = 0.
+
+        prune.ln_structured(m, 'weight', amount=1, n=2, dim=1, importance_scores=importance_scores)
+        self.assertEqual(expected_mask_axis1, m.weight_mask)
+
+        # expected effect of pruning 1 of the 2 columns along axis -1 by L1-norm
+        expected_mask_axis3 = expected_mask_axis1
+        expected_mask_axis3[:, :, :, 1] = 0.
+
+        prune.ln_structured(m, 'weight', amount=1, n=1, dim=-1, importance_scores=importance_scores)
+        self.assertEqual(expected_mask_axis3, m.weight_mask)
+
+    def test_remove_pruning(self):
+        r"""`prune.remove` removes the hook and the reparametrization
+        and makes the pruning final in the original parameter.
+        """
+        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
+        names = ['weight', 'bias']
+
+        for m in modules:
+            for name in names:
+                with self.subTest(m=m, name=name):
+                    # first prune
+                    prune.random_unstructured(m, name, amount=0.5)
+                    self.assertIn(name + "_orig", dict(m.named_parameters()))
+                    self.assertIn(name + "_mask", dict(m.named_buffers()))
+                    self.assertNotIn(name, dict(m.named_parameters()))
+                    self.assertTrue(hasattr(m, name))
+                    pruned_t = getattr(m, name)
+
+                    # then remove pruning
+                    prune.remove(m, name)
+                    self.assertIn(name, dict(m.named_parameters()))
+                    self.assertNotIn(name + "_orig", dict(m.named_parameters()))
+                    self.assertNotIn(name + "_mask", dict(m.named_buffers()))
+                    final_t = getattr(m, name)
+
+                    self.assertEqual(pruned_t, final_t)
+
+    def test_remove_pruning_exception(self):
+        r"""Removing from an unpruned tensor throws an assertion error
+        """
+        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
+        names = ['weight', 'bias']
+
+        for m in modules:
+            for name in names:
+                with self.subTest(m=m, name=name):
+                    # check that the module isn't pruned
+                    self.assertFalse(prune.is_pruned(m))
+                    # since it isn't pruned, pruning can't be removed from it
+                    with self.assertRaises(ValueError):
+                        prune.remove(m, name)
+
+
+    def test_global_pruning(self):
+        r"""Test that global l1 unstructured pruning over 2 parameters removes
+        the `amount=4` smallest global weights across the 2 parameters.
+        """
+        m = nn.Linear(4, 2)
+        n = nn.Linear(3, 1)
+        # modify the weight matrices by hand
+        m.weight = torch.nn.Parameter(
+            torch.tensor([[1, 2, 3, 4], [-4, -3, -2, -1]]).to(
+                dtype=torch.float32)
+        )
+        n.weight = torch.nn.Parameter(
+            torch.tensor([[0, 0.1, -2]]).to(
+                dtype=torch.float32)
+        )
+
+        params_to_prune = (
+            (m, 'weight'),
+            (n, 'weight'),
+        )
+
+        # prune the 4 smallest weights globally by L1 magnitude
+        prune.global_unstructured(
+            params_to_prune,
+            pruning_method=prune.L1Unstructured,
+            amount=4
+        )
+
+        expected_mweight = torch.tensor([[0, 2, 3, 4], [-4, -3, -2, 0]],
+                                        dtype=m.weight.dtype)
+        self.assertEqual(expected_mweight, m.weight)
+
+        expected_nweight = torch.tensor([[0, 0, -2]]).to(dtype=n.weight.dtype)
+        self.assertEqual(expected_nweight, n.weight)
+
+    def test_global_pruning_importance_scores(self):
+        r"""Test that global l1 unstructured pruning over 2 parameters removes
+        the `amount=4` smallest global weights across the 2 parameters.
+        """
+        m = nn.Linear(4, 2)
+        n = nn.Linear(3, 1)
+        # modify the weight matrices by hand
+        m.weight = torch.nn.Parameter(
+            torch.tensor([[1, 2, 3, 4], [-4, -3, -2, -1]]).to(
+                dtype=torch.float32)
+        )
+        m_importance_scores = torch.tensor(
+            [[4, 2, 1, 3], [-3, -1, -2, -4]], dtype=torch.float32
+        )
+        n.weight = torch.nn.Parameter(
+            torch.tensor([[0, 0.1, -2]]).to(
+                dtype=torch.float32)
+        )
+        n_importance_scores = torch.tensor([[0, 10., -0.2]]).to(dtype=torch.float32)
+
+        params_to_prune = (
+            (m, 'weight'),
+            (n, 'weight'),
+        )
+        importance_scores = {
+            (m, 'weight'): m_importance_scores,
+            (n, 'weight'): n_importance_scores,
+        }
+
+        # prune the 4 smallest weights globally by L1 magnitude
+        prune.global_unstructured(
+            params_to_prune,
+            pruning_method=prune.L1Unstructured,
+            amount=4,
+            importance_scores=importance_scores,
+        )
+
+        expected_m_weight = torch.tensor([[1, 2, 0, 4], [-4, 0, -2, -1]],
+                                         dtype=m.weight.dtype)
+        self.assertEqual(expected_m_weight, m.weight)
+
+        expected_n_weight = torch.tensor([[0, 0.1, 0]]).to(dtype=n.weight.dtype)
+        self.assertEqual(expected_n_weight, n.weight)
+
+    def test_custom_from_mask_pruning(self):
+        r"""Test that the CustomFromMask is capable of receiving
+        as input at instantiation time a custom mask, and combining it with
+        the previous default mask to generate the correct final mask.
+        """
+        # new mask
+        mask = torch.tensor([[0, 1, 1, 0], [0, 0, 1, 1]])
+        # old mask
+        default_mask = torch.tensor([[0, 0, 0, 0], [1, 1, 1, 1]])
+
+        # some tensor (not actually used)
+        t = torch.rand_like(mask.to(dtype=torch.float32))
+
+        p = prune.CustomFromMask(mask=mask)
+
+        computed_mask = p.compute_mask(t, default_mask)
+        expected_mask = torch.tensor([[0, 0, 0, 0], [0, 0, 1, 1]], dtype=computed_mask.dtype)
+
+        self.assertEqual(computed_mask, expected_mask)
+
+    def test_pruning_rollback(self):
+        r"""Test that if something fails when the we try to compute the mask,
+        then the model isn't left in some intermediate half-pruned state.
+        The try/except statement in `apply` should handle rolling back
+        to the previous state before pruning began.
+        """
+        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
+        names = ['weight', 'bias']
+
+        for m in modules:
+            for name in names:
+                with self.subTest(m=m, name=name):
+
+                    with mock.patch(
+                        "torch.nn.utils.prune.L1Unstructured.compute_mask"
+                    ) as compute_mask:
+                        compute_mask.side_effect = Exception('HA!')
+                        with self.assertRaises(Exception):
+                            prune.l1_unstructured(m, name=name, amount=0.9)
+
+                        self.assertTrue(
+                            name in dict(m.named_parameters())
+                        )
+                        self.assertFalse(
+                            name + '_mask' in dict(m.named_buffers())
+                        )
+                        self.assertFalse(
+                            name + '_orig' in dict(m.named_parameters())
+                        )
+
+    def test_pruning_serialization_model(self):
+        # create a model
+        model = torch.nn.Sequential(
+            torch.nn.Linear(10, 10),
+            torch.nn.ReLU(),
+            torch.nn.Linear(10, 1),
+        )
+        # check that everything looks normal before pruning
+        self.assertNotIn('0.weight_orig', model.state_dict())
+        self.assertNotIn('0.weight_mask', model.state_dict())
+        self.assertIn('0.weight', model.state_dict())
+
+        # prune one of its parameters
+        prune.l1_unstructured(module=model[0], name='weight', amount=0.9)
+
+        # check that the original weight and the new mask are present
+        self.assertIn('0.weight_orig', model.state_dict())
+        self.assertIn('0.weight_mask', model.state_dict())
+        self.assertNotIn('0.weight', model.state_dict())
+        self.assertTrue(hasattr(model[0], 'weight'))
+
+        pruned_weight = model[0].weight
+
+        with TemporaryFileName() as fname:
+            torch.save(model, fname)
+            new_model = torch.load(fname)
+
+        # check that the original weight and the new mask are present
+        self.assertIn('0.weight_orig', new_model.state_dict())
+        self.assertIn('0.weight_mask', new_model.state_dict())
+        self.assertNotIn('0.weight', new_model.state_dict())
+        self.assertTrue(hasattr(new_model[0], 'weight'))
+
+        self.assertEqual(pruned_weight, new_model[0].weight)
+
+    def test_pruning_serialization_state_dict(self):
+        # create a model
+        model = torch.nn.Sequential(
+            torch.nn.Linear(10, 10),
+            torch.nn.ReLU(),
+            torch.nn.Linear(10, 1),
+        )
+        # check that everything looks normal before pruning
+        self.assertNotIn('0.weight_orig', model.state_dict())
+        self.assertNotIn('0.weight_mask', model.state_dict())
+        self.assertIn('0.weight', model.state_dict())
+
+        # prune one of its parameters
+        prune.l1_unstructured(module=model[0], name='weight', amount=0.9)
+
+        # check that the original weight and the new mask are present
+        self.assertIn('0.weight_orig', model.state_dict())
+        self.assertIn('0.weight_mask', model.state_dict())
+        self.assertNotIn('0.weight', model.state_dict())
+        self.assertTrue(hasattr(model[0], 'weight'))
+
+        pruned_weight = model[0].weight
+
+        # make pruning permanent and restore parameter names as in base
+        # architecture
+        prune.remove(module=model[0], name='weight')
+
+        # check that the original weight and the new mask are no longer present
+        self.assertNotIn('0.weight_orig', model.state_dict())
+        self.assertNotIn('0.weight_mask', model.state_dict())
+        self.assertIn('0.weight', model.state_dict())
+
+        # save the state dict of model and reload it into new_model
+        new_model = torch.nn.Sequential(
+            torch.nn.Linear(10, 10),
+            torch.nn.ReLU(),
+            torch.nn.Linear(10, 1),
+        )
+        with TemporaryFileName() as fname:
+            torch.save(model.state_dict(), fname)
+            new_model.load_state_dict(torch.load(fname))
+
+        # check that the original weight and the new mask are not present in
+        # new_model either.
+        self.assertNotIn('0.weight_orig', new_model.state_dict())
+        self.assertNotIn('0.weight_mask', new_model.state_dict())
+        self.assertIn('0.weight', new_model.state_dict())
+
+        self.assertEqual(pruned_weight, new_model[0].weight)
+
+    def test_prune(self):
+        # create a new pruning method
+        p = prune.L1Unstructured(amount=2)
+        # create tensor to be pruned
+        t = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]).to(dtype=torch.float32)
+        # create prior mask by hand
+        default_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 1]])
+        # since we are pruning the two lowest magnitude units, the outcome of
+        # the calculation should be this:
+        expected_mask = torch.tensor([[0, 0, 1, 0], [1, 1, 0, 1]])
+        pruned_tensor = p.prune(t, default_mask)
+        self.assertEqual(t * expected_mask, pruned_tensor)
+
+    def test_prune_importance_scores(self):
+        # create a new pruning method
+        p = prune.L1Unstructured(amount=2)
+        # create tensor to be pruned
+        t = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]).to(dtype=torch.float32)
+        importance_scores = torch.tensor(
+            [[1, 2, 3, 4], [1.5, 1.6, 1.7, 1.8]]
+        ).to(dtype=torch.float32)
+        # create prior mask by hand
+        default_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 1]])
+        # since we are pruning the two lowest magnitude units, the outcome of
+        # the calculation should be this:
+        expected_mask = torch.tensor([[0, 1, 1, 0], [0, 1, 0, 1]])
+        pruned_tensor = p.prune(t, default_mask, importance_scores=importance_scores)
+        self.assertEqual(t * expected_mask, pruned_tensor)
+
+    def test_prune_importance_scores_mimic_default(self):
+        # create a new pruning method
+        p = prune.L1Unstructured(amount=2)
+        # create tensor to be pruned
+        t = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]).to(dtype=torch.float32)
+        # create prior mask by hand
+        default_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 1]])
+        # since we are pruning the two lowest magnitude units, the outcome of
+        # the calculation should be this:
+        expected_mask = torch.tensor([[0, 0, 1, 0], [1, 1, 0, 1]])
+        pruned_tensor_without_importance_scores = p.prune(t, default_mask)
+        pruned_tensor_with_importance_scores = p.prune(t, default_mask, importance_scores=t)
+        self.assertEqual(pruned_tensor_without_importance_scores, pruned_tensor_with_importance_scores)
+        self.assertEqual(t * expected_mask, pruned_tensor_without_importance_scores)
+
+    def test_rnn_pruning(self):
+        l = torch.nn.LSTM(32, 32)
+        # This Module has 4 parameters called:
+        # 'weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0'
+
+        # Pruning one of them causes one of the weights to become a tensor
+        prune.l1_unstructured(l, 'weight_ih_l0', 0.5)
+        assert (
+            sum([isinstance(p, torch.nn.Parameter) for p in l._flat_weights])
+            == 3
+        )
+
+        # Removing the pruning reparametrization restores the Parameter
+        prune.remove(l, 'weight_ih_l0')
+        assert (
+            sum([isinstance(p, torch.nn.Parameter) for p in l._flat_weights])
+            == 4
+        )
+
+        # Make sure that, upon removal of the reparametrization, the
+        # `._parameters` and `.named_parameters` contain the right params.
+        # Specifically, the original weight ('weight_ih_l0') should be placed
+        # back in the parameters, while the reparametrization component
+        # ('weight_ih_l0_orig') should be removed.
+        assert 'weight_ih_l0' in l._parameters
+        assert l._parameters['weight_ih_l0'] is not None
+        assert 'weight_ih_l0_orig' not in l._parameters
+        assert 'weight_ih_l0' in dict(l.named_parameters())
+        assert dict(l.named_parameters())['weight_ih_l0'] is not None
+        assert 'weight_ih_l0_orig' not in dict(l.named_parameters())
+
+instantiate_parametrized_tests(TestPruningNN)
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_nn.py b/test/test_nn.py
index ceabcb28ac84b..8a05dc68bb7cf 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -27,7 +27,6 @@
 import torch.nn.functional as F
 import torch.nn.utils.rnn as rnn_utils
 from torch.nn.utils import clip_grad_norm_, clip_grad_value_
-import torch.nn.utils.prune as prune
 from torch.nn.utils import parameters_to_vector, vector_to_parameters
 from torch.nn.utils.fusion import fuse_conv_bn_weights
 from torch.nn.utils.fusion import fuse_linear_bn_weights
@@ -37,7 +36,7 @@
 from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \
     TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \
     download_file, get_function_arglist, load_tests, skipIfMps,\
-    TemporaryFileName, TEST_WITH_UBSAN, IS_PPC, \
+    TEST_WITH_UBSAN, IS_PPC, \
     parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
     skipIfTorchDynamo
 from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION
@@ -1754,924 +1753,6 @@ def test_vector_to_parameters(self):
         sample = next(model.parameters())[0, 0, 0]
         self.assertTrue(torch.equal(sample.data, vec.data[:5]))
 
-    # torch/nn/utils/prune.py
-    @unittest.skipIf(not TEST_NUMPY, "numpy not found")
-    def test_validate_pruning_amount_init(self):
-        r"""Test the first util function that validates the pruning
-        amount requested by the user the moment the pruning method
-        is initialized. This test checks that the expected errors are
-        raised whenever the amount is invalid.
-        The original function runs basic type checking + value range checks.
-        It doesn't check the validity of the pruning amount with
-        respect to the size of the tensor to prune. That's left to
-        `_validate_pruning_amount`, tested below.
-        """
-        # neither float not int should raise TypeError
-        with self.assertRaises(TypeError):
-            prune._validate_pruning_amount_init(amount="I'm a string")
-
-        # float not in [0, 1] should raise ValueError
-        with self.assertRaises(ValueError):
-            prune._validate_pruning_amount_init(amount=1.1)
-        with self.assertRaises(ValueError):
-            prune._validate_pruning_amount_init(amount=20.)
-
-        # negative int should raise ValueError
-        with self.assertRaises(ValueError):
-            prune._validate_pruning_amount_init(amount=-10)
-
-        # all these should pass without errors because they're valid amounts
-        prune._validate_pruning_amount_init(amount=0.34)
-        prune._validate_pruning_amount_init(amount=1500)
-        prune._validate_pruning_amount_init(amount=0)
-        prune._validate_pruning_amount_init(amount=0.)
-        prune._validate_pruning_amount_init(amount=1)
-        prune._validate_pruning_amount_init(amount=1.)
-        self.assertTrue(True)
-
-    @unittest.skipIf(not TEST_NUMPY, "numpy not found")
-    def test_validate_pruning_amount(self):
-        r"""Tests the second util function that validates the pruning
-        amount requested by the user, this time with respect to the size
-        of the tensor to prune. The rationale is that if the pruning amount,
-        converted to absolute value of units to prune, is larger than
-        the number of units in the tensor, then we expect the util function
-        to raise a value error.
-        """
-        # if amount is int and amount > tensor_size, raise ValueError
-        with self.assertRaises(ValueError):
-            prune._validate_pruning_amount(amount=20, tensor_size=19)
-
-        # amount is a float so this should not raise an error
-        prune._validate_pruning_amount(amount=0.3, tensor_size=0)
-
-        # this is okay
-        prune._validate_pruning_amount(amount=19, tensor_size=20)
-        prune._validate_pruning_amount(amount=0, tensor_size=0)
-        prune._validate_pruning_amount(amount=1, tensor_size=1)
-        self.assertTrue(True)
-
-    @unittest.skipIf(not TEST_NUMPY, "numpy not found")
-    def test_compute_nparams_to_prune(self):
-        r"""Test that requested pruning `amount` gets translated into the
-        correct absolute number of units to prune.
-        """
-        self.assertEqual(
-            prune._compute_nparams_toprune(amount=0, tensor_size=15),
-            0
-        )
-        self.assertEqual(
-            prune._compute_nparams_toprune(amount=10, tensor_size=15),
-            10
-        )
-        # if 1 is int, means 1 unit
-        self.assertEqual(
-            prune._compute_nparams_toprune(amount=1, tensor_size=15),
-            1
-        )
-        # if 1. is float, means 100% of units
-        self.assertEqual(
-            prune._compute_nparams_toprune(amount=1., tensor_size=15),
-            15
-        )
-        self.assertEqual(
-            prune._compute_nparams_toprune(amount=0.4, tensor_size=17),
-            7
-        )
-
-    def test_random_pruning_sizes(self):
-        r"""Test that the new parameters and buffers created by the pruning
-        method have the same size as the input tensor to prune. These, in
-        fact, correspond to the pruned version of the tensor itself, its
-        mask, and its original copy, so the size must match.
-        """
-        # fixturize test
-        # TODO: add other modules
-        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
-        names = ['weight', 'bias']
-
-        for m in modules:
-            for name in names:
-                with self.subTest(m=m, name=name):
-                    original_tensor = getattr(m, name)
-
-                    prune.random_unstructured(m, name=name, amount=0.1)
-                    # mask has the same size as tensor being pruned
-                    self.assertEqual(
-                        original_tensor.size(),
-                        getattr(m, name + '_mask').size()
-                    )
-                    # 'orig' tensor has the same size as the original tensor
-                    self.assertEqual(
-                        original_tensor.size(),
-                        getattr(m, name + '_orig').size()
-                    )
-                    # new tensor has the same size as the original tensor
-                    self.assertEqual(
-                        original_tensor.size(),
-                        getattr(m, name).size()
-                    )
-
-    def test_random_pruning_orig(self):
-        r"""Test that original tensor is correctly stored in 'orig'
-        after pruning is applied. Important to make sure we don't
-        lose info about the original unpruned parameter.
-        """
-        # fixturize test
-        # TODO: add other modules
-        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
-        names = ['weight', 'bias']
-
-        for m in modules:
-            for name in names:
-                with self.subTest(m=m, name=name):
-
-                    # tensor prior to pruning
-                    original_tensor = getattr(m, name)
-                    prune.random_unstructured(m, name=name, amount=0.1)
-                    self.assertEqual(
-                        original_tensor,
-                        getattr(m, name + '_orig')
-                    )
-
-    def test_random_pruning_new_weight(self):
-        r"""Test that module.name now contains a pruned version of
-        the original tensor obtained from multiplying it by the mask.
-        """
-        # fixturize test
-        # TODO: add other modules
-        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
-        names = ['weight', 'bias']
-
-        for m in modules:
-            for name in names:
-                with self.subTest(m=m, name=name):
-                    # tensor prior to pruning
-                    original_tensor = getattr(m, name)
-                    prune.random_unstructured(m, name=name, amount=0.1)
-                    # weight = weight_orig * weight_mask
-                    self.assertEqual(
-                        getattr(m, name),
-                        getattr(m, name + '_orig')
-                        * getattr(m, name + '_mask').to(
-                            dtype=original_tensor.dtype
-                        ),
-                    )
-
-    def test_identity_pruning(self):
-        r"""Test that a mask of 1s does not change forward or backward.
-        """
-        input_ = torch.ones(1, 5)
-        m = nn.Linear(5, 2)
-        y_prepruning = m(input_)  # output prior to pruning
-
-        # compute grad pre-pruning and check it's equal to all ones
-        y_prepruning.sum().backward()
-        old_grad_weight = m.weight.grad.clone()  # don't grab pointer!
-        self.assertEqual(old_grad_weight, torch.ones_like(m.weight))
-        old_grad_bias = m.bias.grad.clone()
-        self.assertEqual(old_grad_bias, torch.ones_like(m.bias))
-
-        # remove grads
-        m.zero_grad()
-
-        # force the mask to be made of all 1s
-        prune.identity(m, name="weight")
-
-        # with mask of 1s, output should be identical to no mask
-        y_postpruning = m(input_)
-        self.assertEqual(y_prepruning, y_postpruning)
-
-        # with mask of 1s, grad should be identical to no mask
-        y_postpruning.sum().backward()
-        self.assertEqual(old_grad_weight, m.weight_orig.grad)
-        self.assertEqual(old_grad_bias, m.bias.grad)
-
-        # calling forward twice in a row shouldn't change output
-        y1 = m(input_)
-        y2 = m(input_)
-        self.assertEqual(y1, y2)
-
-    def test_random_pruning_0perc(self):
-        r"""Test that a mask of 1s does not change forward or backward.
-        """
-        input_ = torch.ones(1, 5)
-        m = nn.Linear(5, 2)
-        y_prepruning = m(input_)  # output prior to pruning
-
-        # compute grad pre-pruning and check it's equal to all ones
-        y_prepruning.sum().backward()
-        old_grad_weight = m.weight.grad.clone()  # don't grab pointer!
-        self.assertEqual(old_grad_weight, torch.ones_like(m.weight))
-        old_grad_bias = m.bias.grad.clone()
-        self.assertEqual(old_grad_bias, torch.ones_like(m.bias))
-
-        # remove grads
-        m.zero_grad()
-
-        # force the mask to be made of all 1s
-        with mock.patch(
-            "torch.nn.utils.prune.RandomUnstructured.compute_mask"
-        ) as compute_mask:
-            compute_mask.return_value = torch.ones_like(m.weight)
-            prune.random_unstructured(m, name='weight', amount=0.9)  # amount won't count
-
-        # with mask of 1s, output should be identical to no mask
-        y_postpruning = m(input_)
-        self.assertEqual(y_prepruning, y_postpruning)
-
-        # with mask of 1s, grad should be identical to no mask
-        y_postpruning.sum().backward()
-        self.assertEqual(old_grad_weight, m.weight_orig.grad)
-        self.assertEqual(old_grad_bias, m.bias.grad)
-
-        # calling forward twice in a row shouldn't change output
-        y1 = m(input_)
-        y2 = m(input_)
-        self.assertEqual(y1, y2)
-
-    def test_random_pruning(self):
-        input_ = torch.ones(1, 5)
-        m = nn.Linear(5, 2)
-
-        # define custom mask to assign with mock
-        mask = torch.ones_like(m.weight)
-        mask[1, 0] = 0
-        mask[0, 3] = 0
-
-        # check grad is zero for masked weights
-        with mock.patch(
-            "torch.nn.utils.prune.RandomUnstructured.compute_mask"
-        ) as compute_mask:
-            compute_mask.return_value = mask
-            prune.random_unstructured(m, name='weight', amount=0.9)
-
-        y_postpruning = m(input_)
-        y_postpruning.sum().backward()
-        # weight_orig is the parameter, so it's the tensor that will accumulate the grad
-        self.assertEqual(m.weight_orig.grad, mask)  # all 1s, except for masked units
-        self.assertEqual(m.bias.grad, torch.ones_like(m.bias))
-
-        # make sure that weight_orig update doesn't modify [1, 0] and [0, 3]
-        old_weight_orig = m.weight_orig.clone()
-        # update weights
-        learning_rate = 1.
-        for p in m.parameters():
-            p.data.sub_(p.grad.data * learning_rate)
-        # since these are pruned, they should not be updated
-        self.assertEqual(old_weight_orig[1, 0], m.weight_orig[1, 0])
-        self.assertEqual(old_weight_orig[0, 3], m.weight_orig[0, 3])
-
-    def test_random_pruning_forward(self):
-        r"""check forward with mask (by hand).
-        """
-        input_ = torch.ones(1, 5)
-        m = nn.Linear(5, 2)
-
-        # define custom mask to assign with mock
-        mask = torch.zeros_like(m.weight)
-        mask[1, 0] = 1
-        mask[0, 3] = 1
-
-        with mock.patch(
-            "torch.nn.utils.prune.RandomUnstructured.compute_mask"
-        ) as compute_mask:
-            compute_mask.return_value = mask
-            prune.random_unstructured(m, name='weight', amount=0.9)
-
-        yhat = m(input_)
-        self.assertEqual(yhat[0, 0], m.weight_orig[0, 3] + m.bias[0])
-        self.assertEqual(yhat[0, 1], m.weight_orig[1, 0] + m.bias[1])
-
-    def test_remove_pruning_forward(self):
-        r"""Remove pruning and check forward is unchanged from previous
-        pruned state.
-        """
-        input_ = torch.ones(1, 5)
-        m = nn.Linear(5, 2)
-
-        # define custom mask to assign with mock
-        mask = torch.ones_like(m.weight)
-        mask[1, 0] = 0
-        mask[0, 3] = 0
-
-        # check grad is zero for masked weights
-        with mock.patch(
-            "torch.nn.utils.prune.RandomUnstructured.compute_mask"
-        ) as compute_mask:
-            compute_mask.return_value = mask
-            prune.random_unstructured(m, name='weight', amount=0.9)
-
-        y_postpruning = m(input_)
-
-        prune.remove(m, 'weight')
-
-        y_postremoval = m(input_)
-        self.assertEqual(y_postpruning, y_postremoval)
-
-    def test_pruning_id_consistency(self):
-        r"""Test that pruning doesn't change the id of the parameters, which
-        would otherwise introduce issues with pre-existing optimizers that
-        point to old parameters.
-        """
-        m = nn.Linear(5, 2, bias=False)
-
-        tensor_id = id(list(m.parameters())[0])
-
-        prune.random_unstructured(m, name="weight", amount=0.9)
-        self.assertEqual(tensor_id, id(list(m.parameters())[0]))
-
-        prune.remove(m, "weight")
-        self.assertEqual(tensor_id, id(list(m.parameters())[0]))
-
-    def test_random_pruning_pickle(self):
-        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
-        names = ['weight', 'bias']
-
-        for m in modules:
-            for name in names:
-                with self.subTest(m=m, name=name):
-                    prune.random_unstructured(m, name=name, amount=0.1)
-                    m_new = pickle.loads(pickle.dumps(m))
-                    self.assertIsInstance(m_new, type(m))
-
-    def test_multiple_pruning_calls(self):
-        # if you call pruning twice, the hook becomes a PruningContainer
-        m = nn.Conv3d(2, 2, 2)
-        prune.l1_unstructured(m, name='weight', amount=0.1)
-        weight_mask0 = m.weight_mask  # save it for later sanity check
-
-        # prune again
-        prune.ln_structured(m, name='weight', amount=0.3, n=2, dim=0)
-        hook = next(iter(m._forward_pre_hooks.values()))
-        self.assertIsInstance(
-            hook,
-            torch.nn.utils.prune.PruningContainer
-        )
-        # check that container._tensor_name is correctly set no matter how
-        # many pruning methods are in the container
-        self.assertEqual(hook._tensor_name, 'weight')
-
-        # check that the pruning container has the right length
-        # equal to the number of pruning iters
-        self.assertEqual(len(hook), 2)  # m.weight has been pruned twice
-
-        # check that the entries of the pruning container are of the expected
-        # type and in the expected order
-        self.assertIsInstance(hook[0], torch.nn.utils.prune.L1Unstructured)
-        self.assertIsInstance(hook[1], torch.nn.utils.prune.LnStructured)
-
-        # check that all entries that are 0 in the 1st mask are 0 in the
-        # 2nd mask too
-        self.assertTrue(torch.all(m.weight_mask[weight_mask0 == 0] == 0))
-
-        # prune again
-        prune.ln_structured(m, name='weight', amount=0.1, n=float('inf'), dim=1)
-        # check that container._tensor_name is correctly set no matter how
-        # many pruning methods are in the container
-        hook = next(iter(m._forward_pre_hooks.values()))
-        self.assertEqual(hook._tensor_name, 'weight')
-
-    def test_pruning_container(self):
-        # create an empty container
-        container = prune.PruningContainer()
-        container._tensor_name = 'test'
-        self.assertEqual(len(container), 0)
-
-        p = prune.L1Unstructured(amount=2)
-        p._tensor_name = 'test'
-
-        # test adding a pruning method to a container
-        container.add_pruning_method(p)
-
-        # test error raised if tensor name is different
-        q = prune.L1Unstructured(amount=2)
-        q._tensor_name = 'another_test'
-        with self.assertRaises(ValueError):
-            container.add_pruning_method(q)
-
-        # test that adding a non-pruning method object to a pruning container
-        # raises a TypeError
-        with self.assertRaises(TypeError):
-            container.add_pruning_method(10)
-        with self.assertRaises(TypeError):
-            container.add_pruning_method('ugh')
-
-    def test_pruning_container_compute_mask(self):
-        r"""Test `compute_mask` of pruning container with a known `t` and
-        `default_mask`. Indirectly checks that Ln structured pruning is
-        acting on the right axis.
-        """
-        # create an empty container
-        container = prune.PruningContainer()
-        container._tensor_name = 'test'
-
-        # 1) test unstructured pruning
-        # create a new pruning method
-        p = prune.L1Unstructured(amount=2)
-        p._tensor_name = 'test'
-        # add the pruning method to the container
-        container.add_pruning_method(p)
-
-        # create tensor to be pruned
-        t = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]).to(dtype=torch.float32)
-        # create prior mask by hand
-        default_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 1]])
-        # since we are pruning the two lowest magnitude units, the outcome of
-        # the calculation should be this:
-        expected_mask = torch.tensor([[0, 0, 1, 0], [1, 1, 0, 1]], dtype=torch.float32)
-        computed_mask = container.compute_mask(t, default_mask)
-        self.assertEqual(expected_mask, computed_mask)
-
-        # 2) test structured pruning
-        q = prune.LnStructured(amount=1, n=2, dim=0)
-        q._tensor_name = 'test'
-        container.add_pruning_method(q)
-        # since we are pruning the lowest magnitude one of the two rows, the
-        # outcome of the calculation should be this:
-        expected_mask = torch.tensor([[0, 0, 0, 0], [1, 1, 0, 1]], dtype=torch.float32)
-        computed_mask = container.compute_mask(t, default_mask)
-        self.assertEqual(expected_mask, computed_mask)
-
-        # 2) test structured pruning, along another axis
-        r = prune.LnStructured(amount=1, n=2, dim=1)
-        r._tensor_name = 'test'
-        container.add_pruning_method(r)
-        # since we are pruning the lowest magnitude of the four columns, the
-        # outcome of the calculation should be this:
-        expected_mask = torch.tensor([[0, 1, 1, 0], [0, 1, 0, 1]], dtype=torch.float32)
-        computed_mask = container.compute_mask(t, default_mask)
-        self.assertEqual(expected_mask, computed_mask)
-
-    def test_l1_unstructured_pruning(self):
-        r"""Test that l1 unstructured pruning actually removes the lowest
-        entries by l1 norm (by hand). It also checks that applying l1
-        unstructured pruning more than once respects the previous mask.
-        """
-        m = nn.Linear(4, 2)
-        # modify its weight matrix by hand
-        m.weight = torch.nn.Parameter(
-            torch.tensor(
-                [[1, 2, 3, 4], [-4, -3, -2, -1]], dtype=torch.float32
-            )
-        )
-
-        prune.l1_unstructured(m, 'weight', amount=2)
-        expected_weight = torch.tensor([[0, 2, 3, 4], [-4, -3, -2, 0]],
-                                       dtype=m.weight.dtype)
-        self.assertEqual(expected_weight, m.weight)
-
-        # check that pruning again removes the next two smallest entries
-        prune.l1_unstructured(m, 'weight', amount=2)
-        expected_weight = torch.tensor([[0, 0, 3, 4], [-4, -3, 0, 0]],
-                                       dtype=m.weight.dtype)
-        self.assertEqual(expected_weight, m.weight)
-
-    def test_l1_unstructured_pruning_with_importance_scores(self):
-        r"""Test that l1 unstructured pruning actually removes the lowest
-        entries of importance scores and not the parameter by l1 norm (by hand).
-        It also checks that applying l1 unstructured pruning more than once
-        respects the previous mask.
-        """
-        m = nn.Linear(4, 2)
-        # modify its weight matrix by hand
-        m.weight = torch.nn.Parameter(
-            torch.tensor(
-                [[1, 2, 3, 4], [-4, -3, -2, -1]], dtype=torch.float32
-            )
-        )
-        importance_scores = torch.tensor(
-            [[4, 2, 1, 3], [-3, -1, -2, -4]], dtype=torch.float32
-        )
-
-        prune.l1_unstructured(m, 'weight', amount=2, importance_scores=importance_scores)
-        expected_weight = torch.tensor([[1, 2, 0, 4], [-4, 0, -2, -1]],
-                                       dtype=m.weight.dtype)
-        self.assertEqual(expected_weight, m.weight)
-
-        # check that pruning again removes two entries of m.weight that are colocated with
-        # the next two smallest absolute values of importance scores.
-        prune.l1_unstructured(m, 'weight', amount=2, importance_scores=importance_scores)
-        expected_weight = torch.tensor([[1, 0, 0, 4], [-4, 0, 0, -1]],
-                                       dtype=m.weight.dtype)
-        self.assertEqual(expected_weight, m.weight)
-
-    def test_unstructured_pruning_same_magnitude(self):
-        r"""Since it may happen that the tensor to prune has entries with the
-        same exact magnitude, it is important to check that pruning happens
-        consistenly based on the bottom % of weights, and not by threshold,
-        which would instead kill off *all* units with magnitude = threshold.
-        """
-        AMOUNT = 0.2
-        p = prune.L1Unstructured(amount=AMOUNT)
-        # create a random tensors with entries in {-2, 0, 2}
-        t = 2 * torch.randint(low=-1, high=2, size=(10, 7))
-        nparams_toprune = prune._compute_nparams_toprune(AMOUNT, t.nelement())
-
-        computed_mask = p.compute_mask(t, default_mask=torch.ones_like(t))
-        nparams_pruned = torch.sum(computed_mask == 0)
-        self.assertEqual(nparams_toprune, nparams_pruned)
-
-    def test_random_structured_pruning_amount(self):
-        AMOUNT = 0.6
-        AXIS = 2
-        p = prune.RandomStructured(amount=AMOUNT, dim=AXIS)
-        t = 2 * torch.randint(low=-1, high=2, size=(5, 4, 2)).to(
-            dtype=torch.float32
-        )
-        nparams_toprune = prune._compute_nparams_toprune(AMOUNT, t.shape[AXIS])
-
-        computed_mask = p.compute_mask(t, default_mask=torch.ones_like(t))
-        # check that 1 column is fully prune, the others are left untouched
-        remaining_axes = [_ for _ in range(len(t.shape)) if _ != AXIS]
-        per_column_sums = sorted(
-            torch.sum(computed_mask == 0, axis=remaining_axes)
-        )
-        assert per_column_sums == [0, 20]
-
-    def test_ln_structured_pruning(self):
-        r"""Check Ln structured pruning by hand.
-        """
-        m = nn.Conv2d(3, 1, 2)
-        m.weight.data = torch.tensor(
-            [[[[1., 2.], [1., 2.5]],
-             [[0.5, 1.], [0.1, 0.1]],
-             [[-3., -5.], [0.1, -1.]]]]
-        )
-        # expected effect of pruning 1 of the 3 channels by L2-norm
-        expected_mask_axis1 = torch.ones_like(m.weight)
-        expected_mask_axis1[:, 1] = 0.
-
-        prune.ln_structured(m, 'weight', amount=1, n=2, dim=1)
-        self.assertEqual(expected_mask_axis1, m.weight_mask)
-
-        # expected effect of pruning 1 of the 2 columns along axis -1 by L1-norm
-        expected_mask_axis3 = expected_mask_axis1
-        expected_mask_axis3[:, :, :, 0] = 0.
-
-        prune.ln_structured(m, 'weight', amount=1, n=1, dim=-1)
-        self.assertEqual(expected_mask_axis3, m.weight_mask)
-
-    def test_ln_structured_pruning_importance_scores(self):
-        r"""Check Ln structured pruning by hand.
-        """
-        m = nn.Conv2d(3, 1, 2)
-        m.weight.data = torch.tensor(
-            [[[[1., 2.], [1., 2.5]],
-             [[0.5, 1.], [0.1, 0.1]],
-             [[-3., -5.], [0.1, -1.]]]]
-        )
-        importance_scores = torch.tensor(
-            [[[[10., 1.], [10., 1.]],
-             [[30., 3.], [30., 3.]],
-             [[-20., -2.], [-20., -2.]]]]
-        )
-        # expected effect of pruning 1 of the 3 channels by L2-norm
-        expected_mask_axis1 = torch.ones_like(m.weight)
-        expected_mask_axis1[:, 0] = 0.
-
-        prune.ln_structured(m, 'weight', amount=1, n=2, dim=1, importance_scores=importance_scores)
-        self.assertEqual(expected_mask_axis1, m.weight_mask)
-
-        # expected effect of pruning 1 of the 2 columns along axis -1 by L1-norm
-        expected_mask_axis3 = expected_mask_axis1
-        expected_mask_axis3[:, :, :, 1] = 0.
-
-        prune.ln_structured(m, 'weight', amount=1, n=1, dim=-1, importance_scores=importance_scores)
-        self.assertEqual(expected_mask_axis3, m.weight_mask)
-
-    def test_remove_pruning(self):
-        r"""`prune.remove` removes the hook and the reparametrization
-        and makes the pruning final in the original parameter.
-        """
-        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
-        names = ['weight', 'bias']
-
-        for m in modules:
-            for name in names:
-                with self.subTest(m=m, name=name):
-                    # first prune
-                    prune.random_unstructured(m, name, amount=0.5)
-                    self.assertIn(name + "_orig", dict(m.named_parameters()))
-                    self.assertIn(name + "_mask", dict(m.named_buffers()))
-                    self.assertNotIn(name, dict(m.named_parameters()))
-                    self.assertTrue(hasattr(m, name))
-                    pruned_t = getattr(m, name)
-
-                    # then remove pruning
-                    prune.remove(m, name)
-                    self.assertIn(name, dict(m.named_parameters()))
-                    self.assertNotIn(name + "_orig", dict(m.named_parameters()))
-                    self.assertNotIn(name + "_mask", dict(m.named_buffers()))
-                    final_t = getattr(m, name)
-
-                    self.assertEqual(pruned_t, final_t)
-
-    def test_remove_pruning_exception(self):
-        r"""Removing from an unpruned tensor throws an assertion error
-        """
-        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
-        names = ['weight', 'bias']
-
-        for m in modules:
-            for name in names:
-                with self.subTest(m=m, name=name):
-                    # check that the module isn't pruned
-                    self.assertFalse(prune.is_pruned(m))
-                    # since it isn't pruned, pruning can't be removed from it
-                    with self.assertRaises(ValueError):
-                        prune.remove(m, name)
-
-
-    def test_global_pruning(self):
-        r"""Test that global l1 unstructured pruning over 2 parameters removes
-        the `amount=4` smallest global weights across the 2 parameters.
-        """
-        m = nn.Linear(4, 2)
-        n = nn.Linear(3, 1)
-        # modify the weight matrices by hand
-        m.weight = torch.nn.Parameter(
-            torch.tensor([[1, 2, 3, 4], [-4, -3, -2, -1]]).to(
-                dtype=torch.float32)
-        )
-        n.weight = torch.nn.Parameter(
-            torch.tensor([[0, 0.1, -2]]).to(
-                dtype=torch.float32)
-        )
-
-        params_to_prune = (
-            (m, 'weight'),
-            (n, 'weight'),
-        )
-
-        # prune the 4 smallest weights globally by L1 magnitude
-        prune.global_unstructured(
-            params_to_prune,
-            pruning_method=prune.L1Unstructured,
-            amount=4
-        )
-
-        expected_mweight = torch.tensor([[0, 2, 3, 4], [-4, -3, -2, 0]],
-                                        dtype=m.weight.dtype)
-        self.assertEqual(expected_mweight, m.weight)
-
-        expected_nweight = torch.tensor([[0, 0, -2]]).to(dtype=n.weight.dtype)
-        self.assertEqual(expected_nweight, n.weight)
-
-    def test_global_pruning_importance_scores(self):
-        r"""Test that global l1 unstructured pruning over 2 parameters removes
-        the `amount=4` smallest global weights across the 2 parameters.
-        """
-        m = nn.Linear(4, 2)
-        n = nn.Linear(3, 1)
-        # modify the weight matrices by hand
-        m.weight = torch.nn.Parameter(
-            torch.tensor([[1, 2, 3, 4], [-4, -3, -2, -1]]).to(
-                dtype=torch.float32)
-        )
-        m_importance_scores = torch.tensor(
-            [[4, 2, 1, 3], [-3, -1, -2, -4]], dtype=torch.float32
-        )
-        n.weight = torch.nn.Parameter(
-            torch.tensor([[0, 0.1, -2]]).to(
-                dtype=torch.float32)
-        )
-        n_importance_scores = torch.tensor([[0, 10., -0.2]]).to(dtype=torch.float32)
-
-        params_to_prune = (
-            (m, 'weight'),
-            (n, 'weight'),
-        )
-        importance_scores = {
-            (m, 'weight'): m_importance_scores,
-            (n, 'weight'): n_importance_scores,
-        }
-
-        # prune the 4 smallest weights globally by L1 magnitude
-        prune.global_unstructured(
-            params_to_prune,
-            pruning_method=prune.L1Unstructured,
-            amount=4,
-            importance_scores=importance_scores,
-        )
-
-        expected_m_weight = torch.tensor([[1, 2, 0, 4], [-4, 0, -2, -1]],
-                                         dtype=m.weight.dtype)
-        self.assertEqual(expected_m_weight, m.weight)
-
-        expected_n_weight = torch.tensor([[0, 0.1, 0]]).to(dtype=n.weight.dtype)
-        self.assertEqual(expected_n_weight, n.weight)
-
-    def test_custom_from_mask_pruning(self):
-        r"""Test that the CustomFromMask is capable of receiving
-        as input at instantiation time a custom mask, and combining it with
-        the previous default mask to generate the correct final mask.
-        """
-        # new mask
-        mask = torch.tensor([[0, 1, 1, 0], [0, 0, 1, 1]])
-        # old mask
-        default_mask = torch.tensor([[0, 0, 0, 0], [1, 1, 1, 1]])
-
-        # some tensor (not actually used)
-        t = torch.rand_like(mask.to(dtype=torch.float32))
-
-        p = prune.CustomFromMask(mask=mask)
-
-        computed_mask = p.compute_mask(t, default_mask)
-        expected_mask = torch.tensor([[0, 0, 0, 0], [0, 0, 1, 1]], dtype=computed_mask.dtype)
-
-        self.assertEqual(computed_mask, expected_mask)
-
-    def test_pruning_rollback(self):
-        r"""Test that if something fails when the we try to compute the mask,
-        then the model isn't left in some intermediate half-pruned state.
-        The try/except statement in `apply` should handle rolling back
-        to the previous state before pruning began.
-        """
-        modules = [nn.Linear(5, 7), nn.Conv3d(2, 2, 2)]
-        names = ['weight', 'bias']
-
-        for m in modules:
-            for name in names:
-                with self.subTest(m=m, name=name):
-
-                    with mock.patch(
-                        "torch.nn.utils.prune.L1Unstructured.compute_mask"
-                    ) as compute_mask:
-                        compute_mask.side_effect = Exception('HA!')
-                        with self.assertRaises(Exception):
-                            prune.l1_unstructured(m, name=name, amount=0.9)
-
-                        self.assertTrue(
-                            name in dict(m.named_parameters())
-                        )
-                        self.assertFalse(
-                            name + '_mask' in dict(m.named_buffers())
-                        )
-                        self.assertFalse(
-                            name + '_orig' in dict(m.named_parameters())
-                        )
-
-    def test_pruning_serialization_model(self):
-        # create a model
-        model = torch.nn.Sequential(
-            torch.nn.Linear(10, 10),
-            torch.nn.ReLU(),
-            torch.nn.Linear(10, 1),
-        )
-        # check that everything looks normal before pruning
-        self.assertNotIn('0.weight_orig', model.state_dict())
-        self.assertNotIn('0.weight_mask', model.state_dict())
-        self.assertIn('0.weight', model.state_dict())
-
-        # prune one of its parameters
-        prune.l1_unstructured(module=model[0], name='weight', amount=0.9)
-
-        # check that the original weight and the new mask are present
-        self.assertIn('0.weight_orig', model.state_dict())
-        self.assertIn('0.weight_mask', model.state_dict())
-        self.assertNotIn('0.weight', model.state_dict())
-        self.assertTrue(hasattr(model[0], 'weight'))
-
-        pruned_weight = model[0].weight
-
-        with TemporaryFileName() as fname:
-            torch.save(model, fname)
-            new_model = torch.load(fname)
-
-        # check that the original weight and the new mask are present
-        self.assertIn('0.weight_orig', new_model.state_dict())
-        self.assertIn('0.weight_mask', new_model.state_dict())
-        self.assertNotIn('0.weight', new_model.state_dict())
-        self.assertTrue(hasattr(new_model[0], 'weight'))
-
-        self.assertEqual(pruned_weight, new_model[0].weight)
-
-    def test_pruning_serialization_state_dict(self):
-        # create a model
-        model = torch.nn.Sequential(
-            torch.nn.Linear(10, 10),
-            torch.nn.ReLU(),
-            torch.nn.Linear(10, 1),
-        )
-        # check that everything looks normal before pruning
-        self.assertNotIn('0.weight_orig', model.state_dict())
-        self.assertNotIn('0.weight_mask', model.state_dict())
-        self.assertIn('0.weight', model.state_dict())
-
-        # prune one of its parameters
-        prune.l1_unstructured(module=model[0], name='weight', amount=0.9)
-
-        # check that the original weight and the new mask are present
-        self.assertIn('0.weight_orig', model.state_dict())
-        self.assertIn('0.weight_mask', model.state_dict())
-        self.assertNotIn('0.weight', model.state_dict())
-        self.assertTrue(hasattr(model[0], 'weight'))
-
-        pruned_weight = model[0].weight
-
-        # make pruning permanent and restore parameter names as in base
-        # architecture
-        prune.remove(module=model[0], name='weight')
-
-        # check that the original weight and the new mask are no longer present
-        self.assertNotIn('0.weight_orig', model.state_dict())
-        self.assertNotIn('0.weight_mask', model.state_dict())
-        self.assertIn('0.weight', model.state_dict())
-
-        # save the state dict of model and reload it into new_model
-        new_model = torch.nn.Sequential(
-            torch.nn.Linear(10, 10),
-            torch.nn.ReLU(),
-            torch.nn.Linear(10, 1),
-        )
-        with TemporaryFileName() as fname:
-            torch.save(model.state_dict(), fname)
-            new_model.load_state_dict(torch.load(fname))
-
-        # check that the original weight and the new mask are not present in
-        # new_model either.
-        self.assertNotIn('0.weight_orig', new_model.state_dict())
-        self.assertNotIn('0.weight_mask', new_model.state_dict())
-        self.assertIn('0.weight', new_model.state_dict())
-
-        self.assertEqual(pruned_weight, new_model[0].weight)
-
-    def test_prune(self):
-        # create a new pruning method
-        p = prune.L1Unstructured(amount=2)
-        # create tensor to be pruned
-        t = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]).to(dtype=torch.float32)
-        # create prior mask by hand
-        default_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 1]])
-        # since we are pruning the two lowest magnitude units, the outcome of
-        # the calculation should be this:
-        expected_mask = torch.tensor([[0, 0, 1, 0], [1, 1, 0, 1]])
-        pruned_tensor = p.prune(t, default_mask)
-        self.assertEqual(t * expected_mask, pruned_tensor)
-
-    def test_prune_importance_scores(self):
-        # create a new pruning method
-        p = prune.L1Unstructured(amount=2)
-        # create tensor to be pruned
-        t = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]).to(dtype=torch.float32)
-        importance_scores = torch.tensor(
-            [[1, 2, 3, 4], [1.5, 1.6, 1.7, 1.8]]
-        ).to(dtype=torch.float32)
-        # create prior mask by hand
-        default_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 1]])
-        # since we are pruning the two lowest magnitude units, the outcome of
-        # the calculation should be this:
-        expected_mask = torch.tensor([[0, 1, 1, 0], [0, 1, 0, 1]])
-        pruned_tensor = p.prune(t, default_mask, importance_scores=importance_scores)
-        self.assertEqual(t * expected_mask, pruned_tensor)
-
-    def test_prune_importance_scores_mimic_default(self):
-        # create a new pruning method
-        p = prune.L1Unstructured(amount=2)
-        # create tensor to be pruned
-        t = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]).to(dtype=torch.float32)
-        # create prior mask by hand
-        default_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 1]])
-        # since we are pruning the two lowest magnitude units, the outcome of
-        # the calculation should be this:
-        expected_mask = torch.tensor([[0, 0, 1, 0], [1, 1, 0, 1]])
-        pruned_tensor_without_importance_scores = p.prune(t, default_mask)
-        pruned_tensor_with_importance_scores = p.prune(t, default_mask, importance_scores=t)
-        self.assertEqual(pruned_tensor_without_importance_scores, pruned_tensor_with_importance_scores)
-        self.assertEqual(t * expected_mask, pruned_tensor_without_importance_scores)
-
-    def test_rnn_pruning(self):
-        l = torch.nn.LSTM(32, 32)
-        # This Module has 4 parameters called:
-        # 'weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0'
-
-        # Pruning one of them causes one of the weights to become a tensor
-        prune.l1_unstructured(l, 'weight_ih_l0', 0.5)
-        assert (
-            sum([isinstance(p, torch.nn.Parameter) for p in l._flat_weights])
-            == 3
-        )
-
-        # Removing the pruning reparametrization restores the Parameter
-        prune.remove(l, 'weight_ih_l0')
-        assert (
-            sum([isinstance(p, torch.nn.Parameter) for p in l._flat_weights])
-            == 4
-        )
-
-        # Make sure that, upon removal of the reparametrization, the
-        # `._parameters` and `.named_parameters` contain the right params.
-        # Specifically, the original weight ('weight_ih_l0') should be placed
-        # back in the parameters, while the reparametrization component
-        # ('weight_ih_l0_orig') should be removed.
-        assert 'weight_ih_l0' in l._parameters
-        assert l._parameters['weight_ih_l0'] is not None
-        assert 'weight_ih_l0_orig' not in l._parameters
-        assert 'weight_ih_l0' in dict(l.named_parameters())
-        assert dict(l.named_parameters())['weight_ih_l0'] is not None
-        assert 'weight_ih_l0_orig' not in dict(l.named_parameters())
-
     def test_rnn_weight_norm(self):
         def check_weight_norm(l, name, num_params):
             # This Module has 4 or 5 parameters called:

From e8abc62921b076b19bf45d5ebf880684358aae98 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 08:11:47 -0800
Subject: [PATCH 1263/1922] Cond capture with fake tensors actually works;
 don't raise in this case (#89638)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89638
Approved by: https://github.com/anjali411
---
 test/dynamo/test_export.py       |  1 -
 test/dynamo/test_misc.py         | 67 --------------------------------
 torch/_dynamo/variables/torch.py |  3 --
 3 files changed, 71 deletions(-)

diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index 21c0d2004bb9e..fb630f06d29f5 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -1433,7 +1433,6 @@ def nop(x):
             )
 
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_export_with_module_layer(self):
         from functorch.experimental.cond import cond
 
diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 782c166b15bfd..bd551fb36a51b 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -1700,25 +1700,6 @@ def fn(x, func):
         opt_fn(x, torch.mul)
         self.assertEqual(cnts.op_count, 1)
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
-    @patch.object(torch._dynamo.config, "suppress_errors", True)
-    def test_unsupported_fake_tensor(self):
-        def f(x):
-            return torch.quantize_per_tensor(x, 0.1, 10, torch.quint8)
-
-        x = torch.randn(2, 2)
-        cnts = torch._dynamo.testing.CompileCounter()
-        opt_f = torch._dynamo.optimize(cnts)(f)
-        opt_f(x)
-        self.assertEqual(cnts.op_count, 0)
-
-        torch._dynamo.reset()
-        with patch.object(torch._dynamo.config, "fake_tensor_propagation", False):
-            opt_f = torch._dynamo.optimize_assert(
-                torch._dynamo.testing.CompileCounter()
-            )(f)
-            opt_f(x)
-
     def test_inline_list_mutation(self):
         def f1(x):
             x.append(torch.ones(8))
@@ -2415,53 +2396,6 @@ def f(pred, pred2, x):
         )  # * -1 then add x
         self.assertTrue(cc.frame_count, 2)
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
-    def test_cond_nested_fake_tensor_off(self):
-        from functorch.experimental.cond import cond
-
-        def true_fn_nested(x):
-            return x * 10
-
-        def false_fn_nested(x):
-            return x * -1
-
-        def true_fn(pred2, x):
-            return x.sin()
-
-        def false_fn(pred2, x):
-            return x + cond(pred2, true_fn_nested, false_fn_nested, [x])
-
-        def f(pred, pred2, x):
-            return cond(pred, true_fn, false_fn, [pred2, x])
-
-        cc = torch._dynamo.testing.CompileCounter()
-        opt_fn = torch._dynamo.optimize(cc)(f)
-        true_true_sin = opt_fn(
-            torch.tensor(True), torch.tensor(True), torch.tensor([0.25, 0.25])
-        )
-        self.assertTrue(same(torch.sin(torch.tensor([0.25, 0.25])), true_true_sin))
-
-        true_false_sin = opt_fn(
-            torch.tensor(True), torch.tensor(False), torch.tensor([0.25, 0.25])
-        )
-        self.assertTrue(same(torch.sin(torch.tensor([0.25, 0.25])), true_false_sin))
-
-        false_true_sum_mult = opt_fn(
-            torch.tensor(False), torch.tensor(True), torch.tensor([0.25, 0.25])
-        )
-        self.assertTrue(
-            same(torch.tensor([2.75, 2.75]), false_true_sum_mult)
-        )  # * 10 then add x
-
-        false_false_sum_neg = opt_fn(
-            torch.tensor(False), torch.tensor(False), torch.tensor([0.25, 0.25])
-        )
-        self.assertTrue(
-            same(torch.tensor([0.0, 0.0]), false_false_sum_neg)
-        )  # * -1 then add x
-        self.assertTrue(cc.frame_count, 1)
-
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_cond_export(self):
         from functorch.experimental.cond import cond
 
@@ -2507,7 +2441,6 @@ def f(pred, pred2, x):
             same(torch.tensor([0.0, 0.0]), false_false_sum_neg)
         )  # * -1 then add x
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_cond_export_single_arg(self):
         from functorch.experimental.cond import cond
 
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 6e60f2be4ce03..979a948fbe8f6 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -715,9 +715,6 @@ def register_as_subgraph(fn, name, args):
             # ops - see torch/dispatch/_dispatcher.py
             from .. import config
 
-            if config.fake_tensor_propagation:
-                unimplemented("Fake tensor mode not yet supported for cond")
-
             assert len(p_args) == 4
             assert type(args[0]) is TensorVariable  # predicate
             assert type(p_args[1]) is UserFunctionVariable  # true_fn

From 76c81e18ff61d8496fd1bf008110412b5ebeaea1 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 08:11:48 -0800
Subject: [PATCH 1264/1922] Support unspecialized integers with dynamic shapes
 (#89639)

Previously, we hackily wrapped unspecialized integers into
tensors and treated them as tensor inputs.  Sometimes, downstream
operations would not be able to deal with the tensor input.  Now,
we wrap them into SymInt, so more correct overload selection occurs.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89639
Approved by: https://github.com/anjali411
---
 test/dynamo/test_unspec.py         |  2 +-
 torch/_dynamo/variables/builder.py | 17 +++++++++++++++--
 torch/_meta_registrations.py       |  7 +++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py
index fd5396981b740..7ffed902fd9dc 100644
--- a/test/dynamo/test_unspec.py
+++ b/test/dynamo/test_unspec.py
@@ -137,7 +137,7 @@ def fn(x):
         res2 = opt_fn(x)
         self.assertTrue(same(res1, res2))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
     def test_multiple_consecutive_random_calls_before_graph(self):
         def fn(x):
             dim1 = random.randrange(start=0, stop=5)
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 333be76598748..0ba7a53b6cbca 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -580,7 +580,19 @@ def wrap_unspecialized_primitive(self, value):
         if self.name in self.tx.output.unspec_variable_map:
             return self.tx.output.unspec_variable_map[self.name]
         else:
-            wrapped_value = torch.tensor(value)
+            if (
+                config.dynamic_shapes
+                and config.fake_tensor_propagation
+                and isinstance(value, int)
+            ):
+                shape_env = self.tx.output.shape_env
+                wrapped_value = shape_env.create_symintnode(
+                    shape_env.create_symbol(value)
+                )
+                # TODO: Do float
+            else:
+                # TODO: Eliminate this case entirely
+                wrapped_value = torch.tensor(value)
             if not is_constant_source(self.get_source()):
                 self.tx.output.graphargs.append(
                     GraphArg(self.get_source(), wrapped_value, True)
@@ -591,7 +603,8 @@ def wrap_unspecialized_primitive(self, value):
             else:
                 options = {}
             options.update({"source": self.get_source()})
-            options.update({"raw_value": value})
+            if isinstance(wrapped_value, torch.Tensor):
+                options.update({"raw_value": value})
 
             proxy = self.tx.output.create_graph_input(
                 re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(wrapped_value)
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index 6232462ede216..a0fe373ea6e54 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -95,6 +95,13 @@ def meta_randint_low(
     )
 
 
+@register_meta(aten.rand.default)
+def meta_rand_default(size, *, dtype=None, layout=None, device=None, pin_memory=None):
+    return torch.empty(
+        size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory
+    )
+
+
 @register_meta([aten._fft_c2r.default, aten._fft_c2r.out])
 @out_wrapper()
 def meta_fft_c2r(self, dim, normalization, lastdim):

From d993041a784e3fbe92a5d537ef855eadae9c8ab5 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 09:00:08 -0800
Subject: [PATCH 1265/1922] Easy: These tests work with fake_tensor_propagation
 on (#89640)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89640
Approved by: https://github.com/anjali411, https://github.com/albanD
---
 test/dynamo/test_optimizations.py | 2 --
 test/dynamo/test_repros.py        | 2 --
 2 files changed, 4 deletions(-)

diff --git a/test/dynamo/test_optimizations.py b/test/dynamo/test_optimizations.py
index d9f25c5954995..1f69a8fd79062 100644
--- a/test/dynamo/test_optimizations.py
+++ b/test/dynamo/test_optimizations.py
@@ -3,7 +3,6 @@
 import json
 import os
 import unittest
-from unittest.mock import patch
 
 import torch
 
@@ -125,7 +124,6 @@ def compiler_fn(graph, example_inputs):
         self.assertTrue(same(r1, r2))
         self.assertTrue(same(r1, r3))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     @unittest.skipIf(not has_functorch(), "requires functorch")
     def test_log_conv_args(self):
         model = Conv_Bn_Relu(3, 32, kernel_size=3, stride=1)
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index b38f904feb31d..52afd7db6bbbf 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -971,7 +971,6 @@ def test_maml_item_capture(self):
         self.assertIn(cnt.op_count, (36, 35, 34, 29, 28, 27))
 
     # see: https://github.com/pytorch/pytorch/issues/80067
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", False)
     def test_maml_no_item_capture(self):
         a = torch.randn(5, 1, 28, 28)
@@ -1590,7 +1589,6 @@ def __init__(self):
 
         self.assertTrue(same(ref, res))
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_specialized_stride(self):
         def f():
             e = torch.empty(4)

From 72ccce3bafc51d797545111df71c9f9cca10cd64 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 09:00:08 -0800
Subject: [PATCH 1266/1922] Force test_rng_state to run with fake tensor prop
 (#89641)

I'm not really sure what desertfire's intended follow up was
on https://github.com/pytorch/pytorch/pull/87490 because when I remove
the unsupported() call, dynamo tests pass.  But the change here is
conservative and I think strictly better than the current situation.
The idea is to force fake tensor pop on for the test, and then just
observe that we are doing a graph break.  Clearly, export doesn't work,
so I manually xfail it.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89641
Approved by: https://github.com/anjali411
---
 test/dynamo/test_repros.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 52afd7db6bbbf..05780536bdf61 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1032,8 +1032,7 @@ def test_create_rand_mask_from_inputs(self):
         self.assertEqual(cnt.frame_count, 1)
         self.assertEqual(cnt.op_count, 8)
 
-    # TODO: make set_rng_state work with FakeTensor/aot_autograd
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
+    @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
     def test_rng_state(self):
         def fn():
             state = torch.get_rng_state()
@@ -1047,9 +1046,14 @@ def fn():
 
         before, after = opt_fn()
         self.assertTrue(same(before, after))
-        self.assertEqual(cnt.frame_count, 1)
-        self.assertEqual(cnt.op_count, 4)  # rand, rand
-        graph, _ = torch._dynamo.export(fn)
+        self.assertEqual(cnt.frame_count, 2)
+        self.assertEqual(cnt.op_count, 3)  # rand, rand
+        try:
+            graph, _ = torch._dynamo.export(fn)
+            # See https://github.com/pytorch/pytorch/pull/87490
+            self.fail("unexpected export success")
+        except torch._dynamo.exc.Unsupported:
+            pass
 
     def test_seq_append_list(self):
         x = torch.randn(4, 10)

From 6759bc63741f789419ef1b77e523df6b01483449 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 09:00:09 -0800
Subject: [PATCH 1267/1922] Run optimizer tests with fake tensors (#89643)

This is a slight regression: RAdam and Adagrad don't appear to
trace at all under fake tensors.  But I think this is a more accurate
reflection of the current state of affairs.

Along the way fix some problems on the fake tensor path.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89643
Approved by: https://github.com/anjali411
---
 test/dynamo/test_optimizers.py     | 13 +++++++++----
 torch/_dynamo/variables/builder.py |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py
index 86d5d7ba6ce97..8bd0e2250abac 100644
--- a/test/dynamo/test_optimizers.py
+++ b/test/dynamo/test_optimizers.py
@@ -79,7 +79,7 @@ def setUpClass(cls):
         )
         cls._exit_stack.enter_context(
             unittest.mock.patch.object(
-                torch._dynamo.config, "fake_tensor_propagation", False
+                torch._dynamo.config, "fake_tensor_propagation", True
             )
         )
         cls._exit_stack.enter_context(enable_optimizer_tracing())
@@ -91,8 +91,13 @@ def setUpClass(cls):
     # test_lbfgs = make_test(
     #    torch.optim.LBFGS, exp_frame_cnt=3, closure=lambda: model(input).sum()
     # )
-    # RAdam has data-dependent control which breaks the graph
-    test_radam = make_test(torch.optim.RAdam, exp_frame_cnt=1)
+
+    # RAdam and Adagrad have data-dependent control which breaks the graph;
+    # furthermore, the break is inside a for loop, so we bail on the frame
+    # entirely.  This is basically an xfail; if the frame count goes up
+    # you done good
+    test_radam = make_test(torch.optim.RAdam, exp_frame_cnt=0)
+    test_adagrad = make_test(torch.optim.Adagrad, exp_frame_cnt=0)
 
     # ASGD has a small optimization that avoids averaging
     # This will fully capture the graph once that optimization is removed
@@ -108,7 +113,7 @@ def setUpClass(cls):
 
 # exclude SparseAdam because other areas of the stack don't support it yet
 # the others are handled specially above
-exclude = set(["SGD", "Optimizer", "SparseAdam", "LBFGS", "RAdam", "ASGD"])
+exclude = set(["SGD", "Optimizer", "SparseAdam", "LBFGS", "RAdam", "Adagrad", "ASGD"])
 optimizers = [
     opt
     for opt in torch.optim.__dict__.values()
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 0ba7a53b6cbca..dec8bad644329 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -85,6 +85,7 @@
 from .nn_module import UnspecializedNNModuleVariable
 from .tensor import (
     DynamicShapeVariable,
+    FakeItemVariable,
     TensorVariable,
     TensorWithTFOverrideVariable,
     UnspecializedNumpyVariable,

From 4bc56d552700713092e59f8f95222ca569cc5dad Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 09:00:09 -0800
Subject: [PATCH 1268/1922] Reenable fake_tensor_propagation on test_cudnn_rnn
 (#89644)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89644
Approved by: https://github.com/anjali411
---
 test/inductor/test_torchinductor.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 126fd86d5b81f..ee2eba24ff368 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -2863,8 +2863,6 @@ def fn(a, b):
                 ),
             )
 
-    # https://github.com/pytorch/torchdynamo/issues/467
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
     def test_cudnn_rnn(self):
         if self.device == "cpu":
             raise unittest.SkipTest("requires CUDA")

From 5b15f132722d191e497e76b4db51d8d085e7ce12 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 14:31:00 -0500
Subject: [PATCH 1269/1922] Revert "Dont clone unmutated args in triton
 autotuning (#89519)" (#89652)

This reverts commit f18f0c70ab10c400947e71be30794e04dcc22acf.

Testing to see if this fixes gmixer_24_224 mixer_b16_224

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89652
Approved by: https://github.com/eellison
---
 test/inductor/test_torchinductor.py    | 19 ----------------
 torch/_inductor/codegen/triton.py      | 30 ++------------------------
 torch/_inductor/triton_ops/autotune.py | 28 +++++++-----------------
 3 files changed, 10 insertions(+), 67 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index ee2eba24ff368..b24fde54b9984 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5016,24 +5016,6 @@ def forward(self, input: torch.Tensor):
 
             self.assertTrue(torch.allclose(module(input), traced(input)))
 
-        @patch.object(config.triton, "autotune", True)
-        def test_inplace_add_alpha_autotune(self):
-            def fn(x, y):
-                aten.add_.Tensor(x, y, alpha=0.55)
-                return (x,)
-
-            x1 = torch.zeros(2, 3, 4, 10, device="cuda")
-            x2 = torch.zeros(2, 3, 4, 10, device="cuda")
-            x3 = torch.zeros(2, 3, 4, 10, device="cuda")
-            y = torch.randn(2, 3, 4, 10, device="cuda").to(
-                memory_format=torch.channels_last
-            )
-            fn_fx = make_fx(fn)(x1, y)
-            fn_compiled = compile_fx_inner(fn_fx, [x1, y])
-            fn(x2, y)
-            fn_compiled([x3, y])
-            assert same(x2, x3)
-
         def test_permute_linear_fusion(self):
             class TestModule(torch.nn.Module):
                 def __init__(self, k: int, n: int):
@@ -5343,7 +5325,6 @@ def decorator(fn):
                         meta=meta,
                         configs=configs,
                         save_cache_hook=False,
-                        mutated_arg_names=["in_out_ptr0"],
                     )
 
                 return decorator
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 16cff0770a072..e14b417c173f8 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -515,18 +515,11 @@ class TritonKernel(Kernel):
     overrides = TritonOverrides
     sexpr = texpr
 
-    def __init__(
-        self,
-        *groups,
-        mutations=None,
-        pid_cache=None,
-        reduction_hint=ReductionHint.DEFAULT,
-    ):
+    def __init__(self, *groups, pid_cache=None, reduction_hint=ReductionHint.DEFAULT):
         if pid_cache is None:
             pid_cache = {}
         super(TritonKernel, self).__init__()
         self.numels = [V.graph.sizevars.simplify(s) for s in groups]
-        self.mutations = mutations
         self.range_trees = []
         self.range_tree_nodes = {}
         self.iter_vars_count = itertools.count()
@@ -1024,21 +1017,10 @@ def codegen_kernel(self, name=None):
             )
 
         argdefs, _, signature = self.args.python_argdefs()
-
-        mutated_args = []
-        for mutation in self.mutations:
-            if mutation in self.args.input_buffers:
-                mutated_args.append(self.args.input_buffers[mutation])
-            if mutation in self.args.inplace_buffers:
-                mutated_args.append(self.args.inplace_buffers[mutation])
-            if mutation in self.args.output_buffers:
-                mutated_args.append(self.args.output_buffers[mutation])
-
         triton_meta = {
             "signature": dict(enumerate(map(signature_of, signature))),
             "device": V.graph.scheduler.current_device.index,
             "constants": {},
-            "mutated_arg_names": mutated_args,
         }
 
         for tree in self.range_trees:
@@ -1313,15 +1295,7 @@ def codegen_node_schedule(self, node_schedule, numel, reduction_numel):
                 reduction_hint_val = ReductionHint.DEFAULT
         else:
             reduction_hint_val = ReductionHint.DEFAULT
-
-        mutations = set()
-        for node in node_schedule:
-            if hasattr(node, "get_mutations"):
-                mutations.update(node.get_mutations())
-
-        with TritonKernel(
-            *tiled_groups, reduction_hint=reduction_hint_val, mutations=mutations
-        ) as kernel:
+        with TritonKernel(*tiled_groups, reduction_hint=reduction_hint_val) as kernel:
             stack = contextlib.ExitStack()
             for node in node_schedule:
                 if node not in (EnableReduction, DisableReduction):
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index 285995c6254fa..808241cd02a2f 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -42,12 +42,11 @@ class CachingAutotuner(KernelInterface):
     configs, and does not rely on the Triton JIT.
     """
 
-    def __init__(self, fn, meta, configs, save_cache_hook, mutated_arg_names):
+    def __init__(self, fn, meta, configs, save_cache_hook):
         super().__init__()
         self.fn = fn
         self.meta = meta
         self.save_cache_hook = save_cache_hook
-        self.mutated_arg_names = mutated_arg_names
         self.configs = configs
         self.launchers = []
         self.lock = threading.Lock()
@@ -142,17 +141,12 @@ def autotune_to_one_config(self, *args, **kwargs):
         """Do the actual autotuning"""
         from ..compile_fx import clone_preserve_strides
 
-        # clone inplace buffers to avoid autotune contaminating them if
-        # the kernel does in-place stores. avoid cloning other buffers because
-        # it leads to increase memory use
-        cloned_args = []
-        for i, arg in enumerate(args):
-            if self.fn.arg_names[i] in self.mutated_arg_names:
-                assert isinstance(arg, torch.Tensor)
-                cloned_args.append(clone_preserve_strides(arg))
-            else:
-                cloned_args.append(arg)
-
+        # clone the input args to avoid autotune contaminating them if
+        # the kernel does in-place stores
+        cloned_args = [
+            clone_preserve_strides(arg) if isinstance(arg, torch.Tensor) else arg
+            for arg in args
+        ]
         timings = {
             launcher: self.bench(launcher, *cloned_args, **kwargs)
             for launcher in self.launchers
@@ -257,15 +251,9 @@ def save_cache_hook(cfg):
     else:
         save_cache_hook = None
 
-    mutated_arg_names = meta.pop("mutated_arg_names", ())
-
     def decorator(fn):
         return CachingAutotuner(
-            fn,
-            meta=meta,
-            configs=configs,
-            save_cache_hook=save_cache_hook,
-            mutated_arg_names=mutated_arg_names,
+            fn, meta=meta, configs=configs, save_cache_hook=save_cache_hook
         )
 
     return decorator

From 5252199056ab67e87808b29660279a4608f3cf5b Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Thu, 24 Nov 2022 02:32:55 -0500
Subject: [PATCH 1270/1922] TorchDynamo: weight prepack for onednn convolution
 external call (#88988)

This PR is about enabled weight prepack using the MKLDNN tensor:
1.  enable fake tensor mode for MKLDNN tensor input.
2.  make convolution fusion kernel support MKLDNN tensor input.
3. do the weight prepack at FX fusion step.

For better performance, we always use channels_last for CPU convolution path. because we test that the channels_last path can get a better performance than block input path, and also avoid the activation's layout conversion(plain to block, block to plain), currently, there only need plain to plain format conversion.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88988
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 aten/src/ATen/native/mkldnn/Conv.cpp          | 90 +++++++++++++------
 .../ATen/native/mkldnn/MKLDNNConversions.cpp  | 38 +++++---
 aten/src/ATen/native/mkldnn/Utils.cpp         | 24 +++--
 aten/src/ATen/native/native_functions.yaml    |  2 +-
 torch/_inductor/ir.py                         |  8 +-
 torch/_inductor/overrides.py                  | 83 +++++++++++++----
 torch/_subclasses/meta_utils.py               | 18 +++-
 7 files changed, 191 insertions(+), 72 deletions(-)

diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp
index ec62715129f4d..3d8188c003e1d 100644
--- a/aten/src/ATen/native/mkldnn/Conv.cpp
+++ b/aten/src/ATen/native/mkldnn/Conv.cpp
@@ -8,10 +8,9 @@
 #include <ATen/NativeFunctions.h>
 #include <ATen/Functions.h>
 #else
-#include <ATen/ops/_to_dense_native.h>
 #include <ATen/ops/_add_relu_native.h>
-#include <ATen/ops/conv2d.h>
-#include <ATen/ops/conv3d.h>
+#include <ATen/ops/_to_dense_native.h>
+#include <ATen/ops/convolution.h>
 #include <ATen/ops/empty.h>
 #include <ATen/ops/empty_like.h>
 #include <ATen/ops/mkldnn_convolution_native.h>
@@ -232,6 +231,7 @@ Tensor _mkldnn_convolution(
     IntArrayRef stride,
     IntArrayRef dilation,
     int64_t groups,
+    bool use_channels_last,
     c10::string_view attr = "none",
     torch::List<c10::optional<at::Scalar>> scalars =
         torch::List<c10::optional<at::Scalar>>(),
@@ -254,22 +254,32 @@ Tensor _mkldnn_convolution(
 
   check_shape_forward(input_t, weight_t, bias, padding, stride, dilation, groups);
 
-  bool is_channels_last = mkldnn_conv_use_channels_last(input_t, weight_t);
-  auto memory_format = mkldnn_convolution_memory_format(input_t.ndimension(), is_channels_last);
+  auto memory_format =
+      mkldnn_convolution_memory_format(input_t.ndimension(), use_channels_last);
 
   auto output_sizes = conv_output_size(input_t.sizes(), weight_t.sizes(), padding, stride, dilation);
   auto output = at::empty({0}, input_t.options());
   ideep::tensor y;
-  if (is_channels_last) {
+  if (use_channels_last) {
     output.resize_(output_sizes, memory_format);
     y = itensor_from_tensor(output);
   }
   _mkldnn_convolution_out(
-    input_t, weight_t, bias, output_sizes, y, stride, dilation, padding, groups, is_channels_last, op_attr);
+      input_t,
+      weight_t,
+      bias,
+      output_sizes,
+      y,
+      stride,
+      dilation,
+      padding,
+      groups,
+      use_channels_last,
+      op_attr);
 
   if (input_t.is_mkldnn()) {
     return MKLDNNTensor(y, input_t.options());
-  } else if (!is_channels_last) {
+  } else if (!use_channels_last) {
     return mkldnn_to_dense(MKLDNNTensor(y, input_t.options()));
   } else {
     TORCH_INTERNAL_ASSERT(y.get_desc().is_nhwc());
@@ -285,8 +295,16 @@ Tensor mkldnn_convolution(
     IntArrayRef stride,
     IntArrayRef dilation,
     int64_t groups) {
+  bool use_channels_last = mkldnn_conv_use_channels_last(input_t, weight_t);
   return _mkldnn_convolution(
-      input_t, weight_t, bias_opt, padding, stride, dilation, groups);
+      input_t,
+      weight_t,
+      bias_opt,
+      padding,
+      stride,
+      dilation,
+      groups,
+      use_channels_last);
 }
 
 Tensor mkldnn_convolution_pointwise(
@@ -301,6 +319,8 @@ Tensor mkldnn_convolution_pointwise(
     torch::List<c10::optional<at::Scalar>> scalars,
     c10::optional<c10::string_view> algorithm) {
   c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
+  bool use_channels_last =
+      weight_t.is_mkldnn() || mkldnn_conv_use_channels_last(input_t, weight_t);
   return _mkldnn_convolution(
       input_t,
       weight_t,
@@ -309,6 +329,7 @@ Tensor mkldnn_convolution_pointwise(
       stride,
       dilation,
       groups,
+      use_channels_last,
       attr,
       scalars,
       algorithm);
@@ -363,8 +384,9 @@ Tensor mkldnn_convolution_pointwise_binary(
   // Only calling fusion path for channels_last path.
   // TODO: OneDNN doesn't optimize well for groups > 1 case, it will be enabled
   // at next OneDNN release.
-  bool can_be_fused =
-      groups == 1 && mkldnn_conv_use_channels_last(input_t, weight_t);
+  bool use_channels_last =
+      weight_t.is_mkldnn() || mkldnn_conv_use_channels_last(input_t, weight_t);
+  bool can_be_fused = groups == 1 && use_channels_last;
 
   c10::string_view unary_attr_value = "none";
   ideep::algorithm unary_alg;
@@ -381,13 +403,13 @@ Tensor mkldnn_convolution_pointwise_binary(
   TORCH_CHECK(
       it_binary != fusion_binary_alg_map().end(),
       "Binary Fusion behavior undefined.");
-
+  c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
   if (can_be_fused) {
-    c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
     auto memory_format =
         mkldnn_convolution_memory_format(input_t.ndimension(), true);
     auto input = input_t.contiguous(memory_format);
-    auto weight = weight_t.contiguous(memory_format);
+    auto weight =
+        weight_t.is_mkldnn() ? weight_t : weight_t.contiguous(memory_format);
     auto other = other_t.contiguous(memory_format);
     auto output = at::empty_like(other);
     const ideep::tensor x = itensor_from_tensor(input);
@@ -446,12 +468,12 @@ Tensor mkldnn_convolution_pointwise_binary(
     // Fallback case, if inputs are not channels last or have different dtype,
     // OneDNN fusion may have performance regression.
     Tensor output;
-    if (input_t.ndimension() == 4) {
-      output = at::conv2d(
-          input_t, weight_t, bias_opt, stride, padding, dilation, groups);
+    if (weight_t.is_mkldnn()) {
+      output = _mkldnn_convolution(
+          input_t, weight_t, bias, padding, stride, dilation, groups, true);
     } else {
-      output = at::conv3d(
-          input_t, weight_t, bias_opt, stride, padding, dilation, groups);
+      output = at::convolution(
+          input_t, weight_t, bias, stride, padding, dilation, false, 0, groups);
     }
     if (binary_attr == "add" && unary_attr_value != "none") {
       output = at::native::add_relu_(output, other_t);
@@ -526,11 +548,12 @@ Tensor& mkldnn_convolution_pointwise_binary_(
       output_sizes == other_t.sizes(),
       "Add Fusion's inputs should have same shape");
   // Only calling fusion path for channels_last path and the output is contiguous tensor(channels_last).
-  bool can_be_fused = mkldnn_conv_use_channels_last(input_t, weight_t)
-                      && (other_t.is_contiguous(at::MemoryFormat::ChannelsLast)
-                          || other_t.is_contiguous(at::MemoryFormat::ChannelsLast3d));
+  bool can_be_fused = (weight_t.is_mkldnn() ||
+                       mkldnn_conv_use_channels_last(input_t, weight_t)) &&
+      (other_t.is_contiguous(at::MemoryFormat::ChannelsLast) ||
+       other_t.is_contiguous(at::MemoryFormat::ChannelsLast3d));
+  c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
   if (can_be_fused) {
-    c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
     ideep::tensor y = itensor_from_tensor(other_t);
     ideep::attr_t op_attr;
     if (unary_attr.has_value()) {
@@ -554,12 +577,12 @@ Tensor& mkldnn_convolution_pointwise_binary_(
     // Fallback case, if inputs are not channels last or have different dtype,
     // OneDNN fusion may have performance regression.
     Tensor output;
-    if (input_t.ndimension() == 4) {
-      output = at::conv2d(
-          input_t, weight_t, bias_opt, stride, padding, dilation, groups);
+    if (weight_t.is_mkldnn()) {
+      output = _mkldnn_convolution(
+          input_t, weight_t, bias, padding, stride, dilation, groups, true);
     } else {
-      output = at::conv3d(
-          input_t, weight_t, bias_opt, stride, padding, dilation, groups);
+      output = at::convolution(
+          input_t, weight_t, bias, stride, padding, dilation, false, 0, groups);
     }
     if (unary_attr.has_value()) {
       other_t = at::native::add_relu_(other_t, output);
@@ -702,6 +725,17 @@ TORCH_LIBRARY_IMPL(mkldnn, CPU, m) {
       TORCH_FN(mkldnn_convolution_pointwise_binary_));
 }
 
+TORCH_LIBRARY_IMPL(mkldnn, MkldnnCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("mkldnn::_convolution_pointwise"),
+      TORCH_FN(mkldnn_convolution_pointwise));
+  m.impl(
+      TORCH_SELECTIVE_NAME("mkldnn::_convolution_pointwise.binary"),
+      TORCH_FN(mkldnn_convolution_pointwise_binary));
+  m.impl(
+      TORCH_SELECTIVE_NAME("mkldnn::_convolution_pointwise_.binary"),
+      TORCH_FN(mkldnn_convolution_pointwise_binary_));
+}
 }}  // namespace at::native
 
 #endif
diff --git a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
index f1ac8f9d53830..9188184b50e3f 100644
--- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
+++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
@@ -88,7 +88,8 @@ Tensor mkldnn_reorder_conv2d_weight(
     IntArrayRef padding,
     IntArrayRef stride,
     IntArrayRef dilation,
-    int64_t groups) {
+    int64_t groups,
+    c10::OptionalArrayRef<int64_t> input_size) {
   if (self.scalar_type() == ScalarType::BFloat16) {
     TORCH_CHECK(mkldnn_bf16_device_check(),
         "mkldnn_reorder_conv2d_weight: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq");
@@ -106,16 +107,28 @@ Tensor mkldnn_reorder_conv2d_weight(
     w.reshape({wdims[0] * wdims[1], wdims[2], wdims[3], wdims[4]});
   }
 
-  auto desc =
-      ideep::convolution_forward::expected_weights_desc(
-          w.get_dims(),
-          w.get_data_type(),
-          {stride.begin(), stride.end()},
-          {padding.begin(), padding.end()},
-          {padding.begin(), padding.end()},
-          {dilation.begin(), dilation.end()},
-          groups,
-          ideep::algorithm::convolution_direct);
+  ideep::dims src_dims = ideep::dims();
+  bool is_channels_last = false;
+  if (input_size.has_value()) {
+    src_dims = input_size.value().vec();
+    // if has input size, we always use channels last.
+    is_channels_last = true;
+  }
+
+  auto desc = ideep::convolution_forward::expected_weights_desc(
+      w.get_dims(),
+      w.get_data_type(),
+      {stride.begin(), stride.end()},
+      {padding.begin(), padding.end()},
+      {padding.begin(), padding.end()},
+      {dilation.begin(), dilation.end()},
+      groups,
+      ideep::algorithm::convolution_direct,
+      ideep::prop_kind::forward,
+      w.get_data_type(),
+      src_dims,
+      ideep::attr_t(),
+      is_channels_last);
   ideep::tensor result;
   result.init(desc);
   result.feed_from(w);
@@ -169,7 +182,8 @@ Tensor mkldnn_reorder_conv2d_weight(
     IntArrayRef padding,
     IntArrayRef stride,
     IntArrayRef dilation,
-    int64_t groups) {
+    int64_t groups,
+    c10::OptionalArrayRef<int64_t> input_size) {
   TORCH_CHECK(false, "mkldnn_reorder_conv2d_weight: MKL-DNN build is disabled");
 }
 
diff --git a/aten/src/ATen/native/mkldnn/Utils.cpp b/aten/src/ATen/native/mkldnn/Utils.cpp
index 2c626884d8f0f..2c9bcc016e47d 100644
--- a/aten/src/ATen/native/mkldnn/Utils.cpp
+++ b/aten/src/ATen/native/mkldnn/Utils.cpp
@@ -38,13 +38,19 @@ void check_mkldnn_binary_fusion_inputs(
     const Tensor& other,
     const Tensor& weight,
     const Tensor& bias) {
-  TORCH_CHECK(
-      input.options().type_equal(weight.options()),
-      "Input type (",
-      input.toString(),
-      ") and weight type (",
-      weight.toString(),
-      ") should be the same");
+  if (!weight.is_mkldnn()) {
+    TORCH_CHECK(
+        input.options().type_equal(weight.options()),
+        "Input type (",
+        input.toString(),
+        ") and weight type (",
+        weight.toString(),
+        ") should be the same");
+  } else {
+    TORCH_CHECK(
+        input.scalar_type() == input.scalar_type(),
+        "mkldnn pointwise binary: input dtype and weight dtype should be the same");
+  }
   TORCH_CHECK(
       input.options().type_equal(other.options()),
       "Input type (",
@@ -61,11 +67,11 @@ void check_mkldnn_binary_fusion_inputs(
       ") should be the same");
   TORCH_CHECK(
       input.device().is_cpu(),
-      "mkldnn pointwise binary fusion: inputs' device should be CPU")
+      "mkldnn pointwise binary fusion: input's device should be CPU");
   TORCH_CHECK(
       input.scalar_type() == ScalarType::Float ||
           input.scalar_type() == ScalarType::BFloat16,
-      "mkldnn pointwise binary: inputs' dtypoe should be float or bfloat16")
+      "mkldnn pointwise binary: input's dtype should be float or bfloat16");
   if (input.scalar_type() == ScalarType::BFloat16) {
     TORCH_CHECK(
         mkldnn_bf16_device_check(),
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 701b88e0254d2..2c32062e9a8c3 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -6618,7 +6618,7 @@
     CPU: dense_to_mkldnn
   autogen: to_mkldnn.out
 
-- func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1) -> Tensor
+- func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1, int[]? input_size=None) -> Tensor
   variants: function
   python_module: nn
   dispatch:
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 254a87c364238..65d6266a55327 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3375,10 +3375,12 @@ def _prepare_convolution_fusion_create(
             [0, 0],
             groups,
         )
-        req_stride_order = get_stride_order(output.stride())
+        output_size = output.size()
+        req_stride_order = [0] + list(reversed(range(1, len(stride) + 1)))
+        req_stride_order = [len(req_stride_order)] + req_stride_order
+        output_stride = make_channels_last_strides_for(output_size)
 
     x = cls.require_stride_order(x, req_stride_order)
-    weight = cls.require_stride1(cls.realize_input(weight))
     assert x.get_device().type == "cpu" and weight.get_device().type == "cpu"
     inputs = [x, weight]
 
@@ -3386,7 +3388,7 @@ def _prepare_convolution_fusion_create(
         x.get_device(),
         x.get_dtype(),
         output.size(),
-        output.stride(),
+        output_stride,
     )
     constant_args = [padding, stride, dilation, groups]
 
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index ed523d356c39a..ce3a603e399c8 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -82,6 +82,7 @@ def __init__(
         self,
         conv: nn.Module,
         unary: nn.Module,
+        input_size: list,
     ):
         super(ConvUnary2d, self).__init__(
             conv.in_channels,
@@ -96,13 +97,24 @@ def __init__(
             conv.weight.device,
             conv.weight.dtype,
         )
-        self._update_module_params(conv, unary)
+        self._update_module_params(conv, unary, input_size)
 
-    def _update_module_params(self, conv, unary):
+    def _update_module_params(self, conv, unary, input_size):
         self.__dict__ = copy.deepcopy(conv.__dict__)
         self.attr, self.scalars, self.algorithm = unary_modules_map[unary.__class__](
             unary
         )
+        self.weight = torch.nn.Parameter(
+            torch._C._nn.mkldnn_reorder_conv2d_weight(
+                self.weight.to_mkldnn(),
+                self.padding,
+                self.stride,
+                self.dilation,
+                self.groups,
+                input_size,
+            ),
+            requires_grad=self.weight.requires_grad,
+        )
 
     def _conv_forward(self, input, weight, bias):
         if self.padding_mode != "zeros":
@@ -142,6 +154,7 @@ def __init__(
         self,
         conv: nn.Module,
         binary_op_name: str,
+        input_size: list,
     ):
         super(ConvBinary2d, self).__init__(
             conv.in_channels,
@@ -156,15 +169,26 @@ def __init__(
             conv.weight.device,
             conv.weight.dtype,
         )
-        self._update_module_params(conv, binary_op_name)
+        self._update_module_params(conv, binary_op_name, input_size)
 
-    def _update_module_params(self, conv, binary_op_name):
+    def _update_module_params(self, conv, binary_op_name, input_size):
         self.__dict__ = copy.deepcopy(conv.__dict__)
         self.binary_attr = binary_op_name
         self.binary_alpha = None
         self.unary_attr = None
         self.unary_scalars = []
         self.unary_algorithm = None
+        self.weight = torch.nn.Parameter(
+            torch._C._nn.mkldnn_reorder_conv2d_weight(
+                self.weight.to_mkldnn(),
+                self.padding,
+                self.stride,
+                self.dilation,
+                self.groups,
+                input_size,
+            ),
+            requires_grad=self.weight.requires_grad,
+        )
 
     def _update_unary_params(self, unary):
         self.unary_attr, self.unary_scalars, self.unary_algorithm = unary_modules_map[
@@ -215,6 +239,7 @@ def __init__(
         self,
         conv: nn.Module,
         binary_op_name: str,
+        input_size: list,
     ):
         super(ConvBinaryInplace2d, self).__init__(
             conv.in_channels,
@@ -229,15 +254,26 @@ def __init__(
             conv.weight.device,
             conv.weight.dtype,
         )
-        self._update_module_params(conv, binary_op_name)
+        self._update_module_params(conv, binary_op_name, input_size)
 
-    def _update_module_params(self, conv, binary_op_name):
+    def _update_module_params(self, conv, binary_op_name, input_size):
         self.__dict__ = copy.deepcopy(conv.__dict__)
         self.binary_attr = binary_op_name
         self.binary_alpha = None
         self.unary_attr = None
         self.unary_scalars = []
         self.unary_algorithm = None
+        self.weight = torch.nn.Parameter(
+            torch._C._nn.mkldnn_reorder_conv2d_weight(
+                self.weight.to_mkldnn(),
+                self.padding,
+                self.stride,
+                self.dilation,
+                self.groups,
+                input_size,
+            ),
+            requires_grad=self.weight.requires_grad,
+        )
 
     def _update_unary_params(self, unary):
         self.unary_attr, self.unary_scalars, self.unary_algorithm = unary_modules_map[
@@ -334,31 +370,38 @@ def forward(self, input, other):
         return y
 
 
-def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module):
+def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module, input_size: list):
     assert not (conv.training), "Fusion only for eval!"
     return ConvUnary2d(
         conv,
         unary,
+        input_size,
     )
 
 
-def fused_conv_binary_eval(conv: nn.Module, binary_op_name: str):
+def fused_conv_binary_eval(conv: nn.Module, binary_op_name: str, input_size: list):
     assert not (conv.training), "Fusion only for eval!"
     return ConvBinary2d(
         conv,
         binary_op_name,
+        input_size,
     )
 
 
-def fused_conv_binary_inplace_eval(conv: nn.Module, binary_op_name: str):
+def fused_conv_binary_inplace_eval(
+    conv: nn.Module, binary_op_name: str, input_size: list
+):
     assert not (conv.training), "Fusion only for eval!"
     return ConvBinaryInplace2d(
         conv,
         binary_op_name,
+        input_size,
     )
 
 
-def fused_binary_unary_eval(conv_binary: nn.Module, unary: nn.Module):
+def fused_conv_binary_unary_eval(
+    conv_binary: nn.Module, unary: nn.Module, input_size: list
+):
     assert not (conv_binary.training), "Fusion only for eval!"
     # reuse origin conv module, and just update its' unary attr.
     conv_binary._update_unary_params(unary)
@@ -371,7 +414,7 @@ def is_bfloat16_module(m):
     return weight_is_bf16 and bias_is_bf16
 
 
-def fused_linear_unary_eval(linear: nn.Module, unary: nn.Module):
+def fused_linear_unary_eval(linear: nn.Module, unary: nn.Module, input_size: list):
     assert not (linear.training), "Fusion only for eval!"
     return LinearUnary(
         linear,
@@ -379,7 +422,7 @@ def fused_linear_unary_eval(linear: nn.Module, unary: nn.Module):
     )
 
 
-def fused_linear_binary_eval(linear: nn.Module, attr: str):
+def fused_linear_binary_eval(linear: nn.Module, attr: str, input_size: list):
     assert not (linear.training), "Fusion only for eval!"
     linear_binary = LinearBinary(
         linear,
@@ -527,7 +570,12 @@ def fuse_unary(gm: torch.fx.GraphModule):
                     computation_node
                 ):
                     continue
-                fused_module = fuse_func(computation_node, unary_node)
+                computation_node_input_size = (
+                    node.args[0].args[0].meta.get("tensor_meta").shape
+                )
+                fused_module = fuse_func(
+                    computation_node, unary_node, computation_node_input_size
+                )
                 replace_node_module(node.args[0], modules, fused_module)
 
                 node.replace_all_uses_with(node.args[0])
@@ -738,7 +786,10 @@ def transpose_matmul(A: torch.Tensor, B: torch.Tensor, Atrans: bool, Btrans: boo
 def replace_and_fuse_for_binary(
     computation_node, node, fuse_func, attr, modules, index_node, index_pointwise
 ):
-    fused_module = fuse_func(computation_node, attr)
+    computation_node_input_size = (
+        node.args[index_node].args[0].meta.get("tensor_meta").shape
+    )
+    fused_module = fuse_func(computation_node, attr, computation_node_input_size)
     replace_node_module(node.args[index_node], modules, fused_module)
     node.args[index_node].args = node.args[index_node].args + (
         node.args[index_pointwise],
@@ -979,8 +1030,8 @@ def rand_like(x, **kwargs):
 computation_op_unary_op_fusion_map = {
     nn.Conv2d: fused_conv_unary_eval,
     nn.Linear: fused_linear_unary_eval,
-    ConvBinary2d: fused_binary_unary_eval,
-    ConvBinaryInplace2d: fused_binary_unary_eval,
+    ConvBinary2d: fused_conv_binary_unary_eval,
+    ConvBinaryInplace2d: fused_conv_binary_unary_eval,
 }
 
 
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 6e5586bde0a79..8adca0335b971 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -140,7 +140,7 @@ def set_tensor_memo(self, t, v):
         # hold a weak ref to self, otherwise it will be kept alive
         # by the del_ten closure
         self_weak_ref = weakref.ref(self)
-        if t.is_sparse:
+        if t.is_sparse or t.is_mkldnn:
             weak_st = None
         else:
             weak_st = StorageWeakRef(t._typed_storage())
@@ -271,7 +271,20 @@ def sym_sizes_strides(t):
                         with torch.enable_grad():
                             r = r.clone()
                             r._coalesced_(t.is_coalesced())
-
+                elif t.is_mkldnn:
+                    is_leaf = safe_is_leaf(t)
+                    sizes, strides = sym_sizes_strides(t)
+                    r = callback(
+                        lambda: torch.empty_strided(
+                            sizes, strides, dtype=t.dtype, device="meta"
+                        )
+                    )
+                    assert safe_is_leaf(r), "the callback you passed in doesn't detach"
+                    if t.requires_grad:
+                        r.requires_grad = True
+                    if t.requires_grad and not is_leaf:
+                        with torch.enable_grad():
+                            r = r.clone()
                 elif t._is_view():
                     # Construct views in two steps: recursively meta-fy their
                     # base, and then create view(s) off that.  NB: doing it
@@ -459,7 +472,6 @@ def __call__(self, t, shape_env=None, *, callback=lambda t: t()):
                 [
                     t.is_sparse_csr,
                     t.layout in [torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc],
-                    t.is_mkldnn,
                     t.is_quantized,
                     t.is_nested,
                     t._is_view() and t._base is not None and t._base.is_sparse,

From d4fcc1577233ac1ef7129d70bd0ab51dbe4aef87 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Thu, 24 Nov 2022 02:32:59 -0500
Subject: [PATCH 1271/1922] TorchDynamo: weight prepack for mkl linear (#89109)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89109
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 aten/src/ATen/native/mkldnn/Linear.cpp        | 98 +++++++++++++++++++
 .../ATen/native/mkldnn/MKLDNNConversions.cpp  | 47 ++++++++-
 .../mkldnn/RegisterMkldnnOpContextClass.cpp   | 19 ++++
 test/inductor/test_torchinductor.py           | 14 +++
 torch/_inductor/ir.py                         | 54 ++++++++++
 torch/_inductor/lowering.py                   | 14 +++
 torch/_inductor/overrides.py                  | 66 ++++++++++++-
 7 files changed, 310 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mkldnn/Linear.cpp b/aten/src/ATen/native/mkldnn/Linear.cpp
index 24bf1282bfd6b..894e54eefb1c1 100644
--- a/aten/src/ATen/native/mkldnn/Linear.cpp
+++ b/aten/src/ATen/native/mkldnn/Linear.cpp
@@ -10,6 +10,7 @@
 #else
 #include <ATen/ops/_to_dense_native.h>
 #include <ATen/ops/empty.h>
+#include <ATen/ops/linear.h>
 #include <ATen/ops/mkldnn_linear_backward_input.h>
 #include <ATen/ops/mkldnn_linear_backward_input_native.h>
 #include <ATen/ops/mkldnn_linear_backward_native.h>
@@ -336,3 +337,100 @@ TORCH_LIBRARY_IMPL(mkldnn, CPU, m) {
 } // namespace at
 
 #endif // AT_MKLDNN_ENABLED
+
+#if AT_MKL_ENABLED() && AT_MKLDNN_ENABLED()
+#include <mkl.h>
+
+namespace at {
+namespace native {
+
+Tensor mkl_linear(
+    const Tensor& self,
+    const Tensor& mkl_weight_t,
+    const Tensor& origin_weight_t,
+    const c10::optional<Tensor>& bias_opt,
+    const int64_t prepack_batch_size) {
+  c10::MaybeOwned<Tensor> bias_maybe_owned =
+      at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+  TORCH_CHECK(
+      self.options().type_equal(origin_weight_t.options()),
+      "Input type (",
+      self.toString(),
+      ") and weight type (",
+      origin_weight_t.toString(),
+      ") should be the same");
+  TORCH_CHECK(
+      !bias.defined() || (self.options().type_equal(bias.options())),
+      "Input type (",
+      self.toString(),
+      ") and bias type (",
+      bias.toString(),
+      ") should be the same");
+  TORCH_CHECK(
+      mkl_weight_t.scalar_type() == origin_weight_t.scalar_type() &&
+          origin_weight_t.scalar_type() == kFloat,
+      "mkl_linear: weight dtype should be float");
+
+  c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
+  auto input_size = self.sizes();
+  std::vector<int64_t> output_size(input_size.begin(), input_size.end() - 1);
+  output_size.push_back(origin_weight_t.size(0));
+  auto output = at::empty(output_size, self.options());
+  int64_t M = self.numel() / self.size(self.dim() - 1);
+  if (M == prepack_batch_size && mkl_weight_t.is_mkldnn()) {
+    auto self_ = self.is_contiguous() ? self : self.contiguous();
+    auto K = origin_weight_t.size(1);
+    auto N = origin_weight_t.size(0);
+    const ideep::tensor& w = itensor_from_mkldnn(mkl_weight_t);
+    auto in_ptr = self_.data_ptr<float>();
+    auto weight_ptr = (float*)(w.get_data_handle());
+    auto out_ptr = output.data_ptr<float>();
+    if (bias.defined()) {
+      auto bias_ = bias.is_contiguous() ? bias : bias.contiguous();
+      auto bias_ptr = bias_.data_ptr<float>();
+#ifdef _OPENMP
+#if (_OPENMP >= 201307)
+#pragma omp parallel for simd schedule( \
+    static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
+#else
+#pragma omp parallel for schedule( \
+    static) if (omp_get_max_threads() > 1 && !omp_in_parallel())
+#endif
+#endif
+      for (int64_t i = 0; i < M; ++i) {
+        memcpy(out_ptr + i * N, bias_ptr, sizeof(float) * N);
+      }
+    }
+    cblas_sgemm_compute(
+        CblasRowMajor,
+        CblasNoTrans,
+        CblasPacked,
+        M,
+        N,
+        K,
+        in_ptr,
+        K,
+        weight_ptr,
+        K,
+        bias.defined() ? 1.f : 0.f,
+        out_ptr,
+        N);
+  } else {
+    output = at::linear_out(output, self, origin_weight_t, bias_opt);
+  }
+  return output;
+}
+
+TORCH_LIBRARY_IMPL(mkl, CPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("mkl::_mkl_linear"), TORCH_FN(mkl_linear));
+}
+
+TORCH_LIBRARY_IMPL(mkl, MkldnnCPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("mkl::_mkl_linear"), TORCH_FN(mkl_linear));
+}
+
+} // namespace native
+} // namespace at
+
+#endif // AT_MKL_ENABLED && AT_MKLDNN_ENABLED
diff --git a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
index 9188184b50e3f..d643fae22ca26 100644
--- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
+++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
@@ -1,9 +1,10 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-#include <ATen/core/Tensor.h>
 #include <ATen/Config.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/native/mkldnn/MKLDNNCommon.h>
 #include <ATen/native/mkldnn/Utils.h>
 #include <ATen/native/utils/ParamUtils.h>
+#include <torch/library.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -198,4 +199,48 @@ Tensor mkldnn_reorder_conv3d_weight(
 
 #endif // AT_MKLDNN_ENABLED()
 
+#if AT_MKL_ENABLED() && AT_MKLDNN_ENABLED()
+#include <mkl.h>
+
+Tensor mkl_reorder_linear_weight(
+    const Tensor& weight,
+    const int64_t batch_size) {
+  TORCH_CHECK(
+      weight.scalar_type() == ScalarType::Float,
+      "reorder_linear_weight: weight's dtype should be float");
+  c10::impl::ExcludeDispatchKeyGuard edkg(c10::autograd_dispatch_keyset);
+  auto M = batch_size;
+  auto N = weight.size(0);
+  auto K = weight.size(1);
+  int64_t pack_size =
+      (int64_t)(cblas_sgemm_pack_get_size(CblasBMatrix, M, N, K) / sizeof(float) + 1);
+  auto packed_weight = empty_mkldnn(
+      {pack_size, 1},
+      weight.scalar_type(),
+      weight.options().layout_opt(),
+      weight.options().device_opt(),
+      weight.options().pinned_memory_opt());
+  ideep::tensor& mkl_weight = itensor_from_mkldnn(packed_weight);
+  ideep::tensor& orig_w = itensor_from_mkldnn(weight);
+  cblas_sgemm_pack(
+      CblasRowMajor,
+      CblasBMatrix,
+      CblasTrans,
+      M,
+      N,
+      K,
+      1.0f,
+      (float*)(orig_w.get_data_handle()),
+      K,
+      (float*)(mkl_weight.get_data_handle()));
+  return packed_weight;
+}
+
+TORCH_LIBRARY_IMPL(mkl, MkldnnCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("mkl::_mkl_reorder_linear_weight"),
+      TORCH_FN(mkl_reorder_linear_weight));
+}
+
+#endif // AT_MKL_ENABLED && AT_MKLDNN_ENABLED
 }}
diff --git a/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp b/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp
index 08230827b58e5..8841d65a2e782 100644
--- a/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp
+++ b/aten/src/ATen/native/mkldnn/RegisterMkldnnOpContextClass.cpp
@@ -69,3 +69,22 @@ TORCH_LIBRARY_IMPL(mkldnn_prepacked, CPU, m) {
 } // namespace at
 
 #endif // AT_MKLDNN_ENABLED()
+
+#if AT_MKL_ENABLED() && AT_MKLDNN_ENABLED()
+
+namespace at {
+namespace native {
+namespace mkl {
+
+TORCH_LIBRARY(mkl, m) {
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "mkl::_mkl_reorder_linear_weight(Tensor X, int batch_size) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "mkl::_mkl_linear(Tensor X, Tensor MKL_W, Tensor ORI_W, Tensor? B, int batch_size) -> Tensor"));
+}
+
+} // namespace mkl
+} // namespace native
+} // namespace at
+
+#endif // AT_MKL_ENABLED && AT_MKLDNN_ENABLED
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index b24fde54b9984..34c84e69280ba 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1601,6 +1601,20 @@ def forward(self, x):
                     (v,),
                 )
 
+    def test_linear_packed(self):
+        options = itertools.product([[2, 3, 10], [2, 10]], [True, False])
+        for input_shape, bias in options:
+            mod = torch.nn.Sequential(
+                torch.nn.Linear(input_shape[-1], 30, bias=bias)
+            ).eval()
+
+            v = torch.randn(input_shape)
+            with torch.no_grad():
+                self.common(
+                    mod,
+                    (v,),
+                )
+
     def test_linear_unary(self):
         options = itertools.product(unary_list, [[2, 3, 10], [2, 10]], [True, False])
         dtype = torch.bfloat16
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 65d6266a55327..4c7d94ce9875a 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3570,6 +3570,60 @@ def create(
         )
 
 
+class MKLPackedLinear(ExternKernelAlloc):
+    kernel = "torch.ops.mkl._mkl_linear"
+
+    def __init__(
+        self,
+        layout,
+        inputs,
+        constant_args=(),
+        kernel="torch.ops.mkl._mkl_linear",
+    ):
+        super().__init__(layout, inputs, constant_args)
+        self.kernel = kernel
+
+    def codegen(self, wrapper):
+        wrapper.writeline(
+            f"{self.get_name()} = {self.kernel}({', '.join(self.codegen_args())})"
+        )
+
+    @classmethod
+    def create(cls, x, packed_w, orig_w, bias, batch_size):
+        kernel = "torch.ops.mkl._mkl_linear"
+
+        with torch._subclasses.FakeTensorMode():
+            x_fake = ir_node_to_tensor(x, guard_shape=True)
+            weight_fake = ir_node_to_tensor(orig_w, guard_shape=True)
+            bias_fake = (
+                ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias
+            )
+            output = torch.ops.aten.linear(
+                x_fake,
+                weight_fake,
+                bias_fake,
+            )
+            output_size = output.size()
+            req_stride_order = list(reversed(range(len(output_size))))
+            output_stride = output.stride()
+        x = cls.require_stride_order(x, req_stride_order)
+        inputs = [x, packed_w, orig_w]
+        constant_args = [batch_size]
+        if bias is not None:
+            inputs.append(bias)
+        else:
+            constant_args.insert(0, bias)
+
+        return MKLPackedLinear(
+            layout=FixedLayout(
+                x.get_device(), x.get_dtype(), output_size, output_stride
+            ),
+            inputs=inputs,
+            constant_args=constant_args,
+            kernel=kernel,
+        )
+
+
 class LinearUnary(ExternKernelAlloc):
     kernel = "torch.ops.mkldnn._linear_pointwise"
 
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 9b65431c6a175..f65c3eab3b3f9 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -1014,6 +1014,20 @@ def linear_unary(
         def linear_binary(x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr):
             return TensorBox.create(ir.LinearBinary.create(x, y, w, b, attr))
 
+        if torch._C.has_mkl:
+
+            @register_lowering(torch.ops.mkl._mkl_linear)
+            def mkl_packed_linear(
+                x: TensorBox,
+                packed_w: TensorBox,
+                orig_w: TensorBox,
+                b: TensorBox,
+                batch_size,
+            ):
+                return TensorBox.create(
+                    ir.MKLPackedLinear.create(x, packed_w, orig_w, b, batch_size)
+                )
+
     else:
         pass
 
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index ce3a603e399c8..8f8c747fb2494 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -5,6 +5,8 @@
 import random
 import weakref
 
+import numpy
+
 import torch
 import torch.nn as nn
 from torch import _prims
@@ -319,6 +321,34 @@ def forward(self, input, other):
         return self._conv_forward(input, other, self.weight, self.bias)
 
 
+class PackedLinear(nn.Linear):
+    def __init__(self, linear: nn.Module, input_size: list):
+        super(PackedLinear, self).__init__(
+            linear.in_features,
+            linear.out_features,
+            linear.bias is not None,
+            linear.weight.device,
+            linear.weight.dtype,
+        )
+        self._update_module_params(linear, input_size)
+
+    def _update_module_params(self, linear, input_size):
+        self.__dict__ = copy.deepcopy(linear.__dict__)
+        self.batch_size = int(numpy.prod(input_size) / input_size[-1])
+        self.packed_weight = torch.nn.Parameter(
+            torch.ops.mkl._mkl_reorder_linear_weight(
+                self.weight.to_mkldnn(), self.batch_size
+            ),
+            requires_grad=self.weight.requires_grad,
+        )
+
+    def forward(self, input):
+        y = torch.ops.mkl._mkl_linear(
+            input, self.packed_weight, self.weight, self.bias, self.batch_size
+        )
+        return y
+
+
 class LinearUnary(nn.Linear):
     def __init__(
         self,
@@ -414,6 +444,11 @@ def is_bfloat16_module(m):
     return weight_is_bf16 and bias_is_bf16
 
 
+def packed_linear_eval(linear: nn.Module, input_size: list):
+    assert not (linear.training), "Fusion only for eval!"
+    return PackedLinear(linear, input_size)
+
+
 def fused_linear_unary_eval(linear: nn.Module, unary: nn.Module, input_size: list):
     assert not (linear.training), "Fusion only for eval!"
     return LinearUnary(
@@ -506,7 +541,7 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     # why re-run fuse_unary? we want to enable conv+binary+unary fusion,
     # such as conv+add+relu for vision model.
     gm = fuse_unary(gm)
-
+    gm = pack_module(gm)
     return gm
 
 
@@ -906,6 +941,29 @@ def fuse_binary_inplace(gm: torch.fx.GraphModule):
     return gm
 
 
+def pack_module(gm: torch.fx.GraphModule):
+    modules = dict(gm.named_modules())
+    for node in gm.graph.nodes:
+        if node.op == "call_module":
+            assert isinstance(node.target, str)
+            cur_module = modules[node.target]
+            if type(cur_module) in computation_op_packed_map:
+                computation_node_input_meta = node.args[0].meta.get("tensor_meta")
+                if computation_node_input_meta.dtype != torch.float32:
+                    continue
+                if type(cur_module) in [torch.nn.Linear] and not torch._C.has_mkl:
+                    continue
+                computation_node_input_size = computation_node_input_meta.shape
+                new_module = computation_op_packed_map[type(cur_module)](
+                    cur_module, computation_node_input_size
+                )
+                assert isinstance(new_module, nn.Module)
+                replace_node_module(node, modules, new_module)
+                gm.graph.lint()
+    gm.recompile()
+    return gm
+
+
 philox_rand_like = _prims._make_prim(
     schema="philox_rand_like(Tensor input, Tensor seed, int offset) -> Tensor",
     return_type=_prims.RETURN_TYPE.NEW,
@@ -1072,6 +1130,12 @@ def rand_like(x, **kwargs):
     nn.Conv2d: fused_conv_binary_inplace_eval,
 }
 
+
+computation_op_packed_map = {
+    nn.Linear: packed_linear_eval,
+}
+
+
 # For add: we support conv/linear + other and other + conv
 # For sub/add_/sub_, we only support conv/linear - other
 # or conv/linear +(-)= other

From 472aac9c23353f8e460ae8e9d5362eab0b566fcc Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Thu, 24 Nov 2022 02:33:01 -0500
Subject: [PATCH 1272/1922]  TorchDynamo: weight prepack for single conv
 (#89209)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89209
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 11 +++++++++++
 torch/_inductor/overrides.py        | 27 +++++++++++++++++++++++----
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 34c84e69280ba..589d43f204188 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1455,6 +1455,17 @@ def test_conv_bn_fuse(self):
                         (v,),
                     )
 
+    @unittest.skipIf(HAS_CUDA, "only support cpu conv2d unary test")
+    def test_conv2d_packed(self):
+        x_shape = (1, 3, 56, 56)
+        mod = torch.nn.Sequential(torch.nn.Conv2d(3, 64, 3, 3)).eval()
+        v = torch.randn(x_shape, dtype=torch.float32)
+        with torch.no_grad():
+            self.common(
+                mod,
+                (v,),
+            )
+
     # For gpu path, there has a accurcy issue,
     # see https://github.com/pytorch/pytorch/issues/87745.
     @unittest.skipIf(HAS_CUDA, "only support cpu conv2d unary test")
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 8f8c747fb2494..e1dfdea9c40ab 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -4,6 +4,7 @@
 import operator
 import random
 import weakref
+from typing import Optional
 
 import numpy
 
@@ -83,7 +84,7 @@ class ConvUnary2d(nn.Conv2d):
     def __init__(
         self,
         conv: nn.Module,
-        unary: nn.Module,
+        unary: Optional[nn.Module],
         input_size: list,
     ):
         super(ConvUnary2d, self).__init__(
@@ -103,9 +104,13 @@ def __init__(
 
     def _update_module_params(self, conv, unary, input_size):
         self.__dict__ = copy.deepcopy(conv.__dict__)
-        self.attr, self.scalars, self.algorithm = unary_modules_map[unary.__class__](
-            unary
-        )
+        self.attr = "none"
+        self.scalars = []
+        self.algorithm = ""
+        if unary is not None:
+            self.attr, self.scalars, self.algorithm = unary_modules_map[
+                unary.__class__
+            ](unary)
         self.weight = torch.nn.Parameter(
             torch._C._nn.mkldnn_reorder_conv2d_weight(
                 self.weight.to_mkldnn(),
@@ -400,6 +405,15 @@ def forward(self, input, other):
         return y
 
 
+def packed_conv_eval(conv: nn.Module, input_size: list):
+    assert not (conv.training), "Fusion only for eval!"
+    return ConvUnary2d(
+        conv,
+        None,
+        input_size,
+    )
+
+
 def fused_conv_unary_eval(conv: nn.Module, unary: nn.Module, input_size: list):
     assert not (conv.training), "Fusion only for eval!"
     return ConvUnary2d(
@@ -954,6 +968,10 @@ def pack_module(gm: torch.fx.GraphModule):
                 if type(cur_module) in [torch.nn.Linear] and not torch._C.has_mkl:
                     continue
                 computation_node_input_size = computation_node_input_meta.shape
+                if type(cur_module) in [nn.Conv2d] and isinstance(
+                    cur_module.padding, str
+                ):
+                    continue
                 new_module = computation_op_packed_map[type(cur_module)](
                     cur_module, computation_node_input_size
                 )
@@ -1133,6 +1151,7 @@ def rand_like(x, **kwargs):
 
 computation_op_packed_map = {
     nn.Linear: packed_linear_eval,
+    nn.Conv2d: packed_conv_eval,
 }
 
 
From 23ce0a55828999e3b107877bab5879da77437152 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 25 Nov 2022 03:03:41 +0000
Subject: [PATCH 1273/1922] [vision hash update] update the pinned vision hash
 (#89667)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89667
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 30711c5bbfd9b..96d764c7b3202 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-4a310f26049371959617921d0eb9b001f4d262c6
+bfb474b9d3ffffec5c3a040c16bc77006f35a94e

From 042b3a98e0aae84f19f74d3a9925d4ecd33e607f Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 12:00:12 -0800
Subject: [PATCH 1274/1922] xfail maml test, instead of running it without fake
 tensor prop (#89645)

A previous version of this patch graph breaks when torch.tensor fails, but that causes

```
PYTORCH_TEST_WITH_DYNAMO=1 python test/nn/test_embedding.py -k test_embedding_bag_1D_padding_idx_cpu_float32
```

to start failing. Probably another latent bug that needs investigating.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89645
Approved by: https://github.com/albanD
---
 test/dynamo/test_repros.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 05780536bdf61..f545acddd54da 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -952,7 +952,10 @@ def test_chunk_reformer_ff(self):
         self.assertEqual(cnt.op_count, 4)
 
     # see: https://github.com/pytorch/pytorch/issues/80067
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", False)
+    # NB: When you remove the expectedFailure, don't forget to
+    # uncomment/adjust the assertEqual below
+    @unittest.expectedFailure
+    @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     def test_maml_item_capture(self):
         a = torch.randn(5, 1, 28, 28)
@@ -966,7 +969,7 @@ def test_maml_item_capture(self):
         for _ in range(10):
             self.assertTrue(same(opt_model(a, b, c, d), correct))
 
-        self.assertEqual(cnt.frame_count, ifdyn(3, 2))
+        # self.assertEqual(cnt.frame_count, ifdyn(3, 2))
         # TODO(jansel): figure out why op count depends on imports
         self.assertIn(cnt.op_count, (36, 35, 34, 29, 28, 27))
 

From f1342a660e5c315bb0f1ae369ecf7138a6737f87 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 12:00:13 -0800
Subject: [PATCH 1275/1922] Remove fake_tensor_propagation (#89646)

You always have to run dynamo with fake tensors.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89646
Approved by: https://github.com/soumith
---
 test/dynamo/test_no_fake_tensors.py     | 36 ----------------
 test/dynamo/test_optimizers.py          |  5 ---
 test/dynamo/test_repros.py              |  5 ---
 torch/_dynamo/config.py                 |  3 --
 torch/_dynamo/optimizations/analysis.py | 55 ++++++++++---------------
 torch/_dynamo/utils.py                  |  2 +-
 torch/_dynamo/variables/builder.py      | 55 +++++++------------------
 torch/_dynamo/variables/tensor.py       |  6 +--
 torch/_dynamo/variables/torch.py        | 25 +++++------
 9 files changed, 48 insertions(+), 144 deletions(-)
 delete mode 100644 test/dynamo/test_no_fake_tensors.py

diff --git a/test/dynamo/test_no_fake_tensors.py b/test/dynamo/test_no_fake_tensors.py
deleted file mode 100644
index f7943c1d7ab90..0000000000000
--- a/test/dynamo/test_no_fake_tensors.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Owner(s): ["module: dynamo"]
-from torch._dynamo.testing import make_test_cls_with_patches
-
-try:
-    from . import test_functions, test_misc, test_modules, test_repros, test_unspec
-except ImportError:
-    import test_functions
-    import test_misc
-    import test_modules
-    import test_repros
-    import test_unspec
-
-
-def make_no_fake_cls(cls):
-    return make_test_cls_with_patches(
-        cls, "NoFakeTensors", "_no_fake_tensors", ("fake_tensor_propagation", False)
-    )
-
-
-NoFakeTensorsFunctionTests = make_no_fake_cls(test_functions.FunctionTests)
-NoFakeTensorsMiscTests = make_no_fake_cls(test_misc.MiscTests)
-NoFakeTensorsReproTests = make_no_fake_cls(test_repros.ReproTests)
-NoFakeTensorsNNModuleTests = make_no_fake_cls(test_modules.NNModuleTests)
-NoFakeTensorsUnspecTests = make_no_fake_cls(test_unspec.UnspecTests)
-
-NoFakeTensorsReproTests.test_numpy_list_no_fake_tensors.__unittest_expecting_failure__ = (
-    False
-)
-NoFakeTensorsUnspecTests.test_builtin_getitem_no_fake_tensors.__unittest_expecting_failure__ = (
-    False
-)
-
-if __name__ == "__main__":
-    from torch._dynamo.test_case import run_tests
-
-    run_tests()
diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py
index 8bd0e2250abac..a4607a8d3db7e 100644
--- a/test/dynamo/test_optimizers.py
+++ b/test/dynamo/test_optimizers.py
@@ -77,11 +77,6 @@ def setUpClass(cls):
                 torch._dynamo.config, "capture_scalar_outputs", True
             )
         )
-        cls._exit_stack.enter_context(
-            unittest.mock.patch.object(
-                torch._dynamo.config, "fake_tensor_propagation", True
-            )
-        )
         cls._exit_stack.enter_context(enable_optimizer_tracing())
 
     test_sgd = make_test(torch.optim.SGD, lr=0.01)
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index f545acddd54da..7bd258cbb3c8d 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -816,7 +816,6 @@ def test_do_paste_mask(self):
             torch._dynamo.utils.counters["frames"]["ok"] + 1,
         )
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
     def test_convert_boxes_to_pooler_format(self):
         boxes1 = [
             Boxes(torch.arange(0, 8).reshape((2, 4))),
@@ -1035,7 +1034,6 @@ def test_create_rand_mask_from_inputs(self):
         self.assertEqual(cnt.frame_count, 1)
         self.assertEqual(cnt.op_count, 8)
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
     def test_rng_state(self):
         def fn():
             state = torch.get_rng_state()
@@ -1110,7 +1108,6 @@ def fn(model, x):
         self.assertEqual(cnt.frame_count, 1)
         self.assertEqual(cnt.op_count, 2)
 
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
     def test_nn_parameter(self):
         def test_fn():
             a = torch.nn.Parameter(torch.randn(5, 5))
@@ -1699,8 +1696,6 @@ def fn(reshape_2):
         res = opt_fn(x)
         self.assertTrue(same(ref, res))
 
-    # This doesn't work without fake tensors but I don't care
-    @patch.object(torch._dynamo.config, "fake_tensor_propagation", True)
     def test_issue1466_size_aot_autograd(self):
         def fn(x):
             # do a tensor op and a size compute
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 39a1a6433419f..26efff205389a 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -67,9 +67,6 @@
 # Run the FX graph as it is created to get better type information
 dynamic_propagation = True
 
-# Run the FX graph with FakeTensors
-fake_tensor_propagation = True
-
 # run FX normalization passes in optimizer
 normalize_ir = False
 
diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index 6fb4ff82e5a37..5de5743bd5e22 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -1,4 +1,3 @@
-import copy
 import functools
 import itertools
 import operator
@@ -13,7 +12,7 @@
 
 from .. import config
 
-from ..utils import clone_inputs, deepcopy_to_fake_tensor
+from ..utils import deepcopy_to_fake_tensor
 
 
 class ShapeAliasingAndMutationProp(ShapeProp):
@@ -119,38 +118,26 @@ def has_mutation(gm, example_inputs, inputs_only=False):
     true, we only check for mutation of inputs"""
     # TODO - moco gives bad accuracy with Aliasing. gm is getting mutated in a bad way.
 
-    if config.fake_tensor_propagation:
-
-        def _wrap_to_fake_tensor(t, *, f_mode):
-            if type(t) in (torch.Tensor, torch.nn.Parameter):
-                static_shapes_ = config.dynamic_shapes is False
-                return fake_mode.from_tensor(
-                    t, static_shapes=config.dynamic_shapes is not False
-                )
-            else:
-                return t
-
-        # Our analysis pass should use dynamic shape tensor inputs
-        # when dynamic shapes are enabled.
-        # We don't actually care about the guards that are created
-        # on those shapes though, so just create a fresh ShapeEnv here.
-        from torch.fx.experimental.symbolic_shapes import ShapeEnv
-
-        fake_mode = FakeTensorMode(
-            shape_env=ShapeEnv() if config.dynamic_shapes else None
-        )
-        fake_wrapper = functools.partial(_wrap_to_fake_tensor, f_mode=fake_mode)
-        example_inputs = tree_map(fake_wrapper, example_inputs)
-        new_gm = deepcopy_to_fake_tensor(gm, fake_mode)
-        with fake_mode.restore() if hasattr(fake_mode, "restore") else fake_mode:
-            ShapeAliasingAndMutationProp(new_gm).run(*example_inputs)
-    else:
-        # Clone the inputs such that intermediate tensors (not leaf tensors) with
-        # requires_grad to True are now converted to False to avoid Runtime Error
-        # like "leaf variable that requires grad is inplace modified"
-        example_inputs = clone_inputs(example_inputs)
-        new_gm = copy.deepcopy(gm)
-        example_inputs = copy.deepcopy(example_inputs)
+    def _wrap_to_fake_tensor(t, *, f_mode):
+        if type(t) in (torch.Tensor, torch.nn.Parameter):
+            static_shapes_ = config.dynamic_shapes is False
+            return fake_mode.from_tensor(
+                t, static_shapes=config.dynamic_shapes is not False
+            )
+        else:
+            return t
+
+    # Our analysis pass should use dynamic shape tensor inputs
+    # when dynamic shapes are enabled.
+    # We don't actually care about the guards that are created
+    # on those shapes though, so just create a fresh ShapeEnv here.
+    from torch.fx.experimental.symbolic_shapes import ShapeEnv
+
+    fake_mode = FakeTensorMode(shape_env=ShapeEnv() if config.dynamic_shapes else None)
+    fake_wrapper = functools.partial(_wrap_to_fake_tensor, f_mode=fake_mode)
+    example_inputs = tree_map(fake_wrapper, example_inputs)
+    new_gm = deepcopy_to_fake_tensor(gm, fake_mode)
+    with fake_mode.restore() if hasattr(fake_mode, "restore") else fake_mode:
         ShapeAliasingAndMutationProp(new_gm).run(*example_inputs)
 
     for node in new_gm.graph.nodes:
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index ce020b74c41da..ffea261979b14 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -737,7 +737,7 @@ def wrap_fake_exception(fn):
     except UnsupportedFakeTensorException as e:
         from .exc import unimplemented
 
-        msg = f"Unsupported: {e.reason} with fake tensor propagation. Run with config.fake_tensor_propagation=False"
+        msg = f"Unsupported: {e.reason} with fake tensor propagation."
         log.warning(msg)
         raise unimplemented(msg)
 
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index dec8bad644329..843e3d1edbbb4 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -37,7 +37,6 @@
 from ..utils import (
     clone_input,
     get_fake_value,
-    get_real_value,
     getfile,
     global_key_name,
     is_namedtuple,
@@ -581,11 +580,7 @@ def wrap_unspecialized_primitive(self, value):
         if self.name in self.tx.output.unspec_variable_map:
             return self.tx.output.unspec_variable_map[self.name]
         else:
-            if (
-                config.dynamic_shapes
-                and config.fake_tensor_propagation
-                and isinstance(value, int)
-            ):
+            if config.dynamic_shapes and isinstance(value, int):
                 shape_env = self.tx.output.shape_env
                 wrapped_value = shape_env.create_symintnode(
                     shape_env.create_symbol(value)
@@ -671,17 +666,12 @@ def wrap_fx_proxy_cls(target_cls, tx, proxy, example_value=None, **options):
             options.update(target_cls.specialize(example_value))
         return target_cls(proxy, **options)
 
-    use_fake_tensors = config.fake_tensor_propagation
-
     initial_example_value = example_value
 
     def _clone_input(value):
         if isinstance(value, torch.Tensor):
-            use_fake_tensors = config.fake_tensor_propagation
             # tensor subclasses will not be converted to FakeTensors and need to be cloned
-            if not use_fake_tensors or not isinstance(
-                value, torch._subclasses.fake_tensor.FakeTensor
-            ):
+            if not isinstance(value, torch._subclasses.fake_tensor.FakeTensor):
                 # NB: ensure strides are preserved
                 value = clone_input(value)
 
@@ -689,16 +679,12 @@ def _clone_input(value):
 
     with preserve_rng_state():
         if example_value is None:
-            if use_fake_tensors:
-                example_value = get_fake_value(proxy.node, tx)
-            else:
-                example_value = get_real_value(proxy.node, tx.output)
+            example_value = get_fake_value(proxy.node, tx)
 
         else:
             proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
-            if use_fake_tensors:
-                fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
-                example_value = fake_wrapper(example_value)
+            fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
+            example_value = fake_wrapper(example_value)
 
     if isinstance(example_value, torch.Tensor):
         is_parameter = isinstance(example_value, torch.nn.Parameter)
@@ -711,9 +697,7 @@ def _clone_input(value):
         example_value = _clone_input(example_value)
         proxy.node.meta["example_value"] = example_value
         specialized_props = target_cls.specialize(example_value)
-        if use_fake_tensors and isinstance(
-            example_value, torch._subclasses.fake_tensor.FakeTensor
-        ):
+        if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
             specialized_props["class_type"] = (
                 torch.nn.Parameter if is_parameter else torch.Tensor
             )
@@ -807,25 +791,14 @@ def _clone_input(value):
         and (proxy.node.target == "item" or proxy.node.target in {math.sqrt, math.pow})
         and config.capture_scalar_outputs
     ):
-        if use_fake_tensors:
-            # item raw value should not be accessed
-            return wrap_fx_proxy_cls(
-                FakeItemVariable,
-                tx=tx,
-                proxy=proxy,
-                example_value=torch.tensor(example_value),
-                **options,
-            )
-        else:
-            return wrap_fx_proxy_cls(
-                UnspecializedPythonVariable,
-                tx=tx,
-                proxy=proxy,
-                example_value=torch.tensor(example_value),
-                raw_value=None if use_fake_tensors else example_value,
-                need_unwrap=False,
-                **options,
-            )
+        # item raw value should not be accessed
+        return wrap_fx_proxy_cls(
+            FakeItemVariable,
+            tx=tx,
+            proxy=proxy,
+            example_value=torch.tensor(example_value),
+            **options,
+        )
     elif isinstance(example_value, (torch.SymInt, torch.SymFloat)):
         proxy.node.meta["example_value"] = example_value
         return DynamicShapeVariable(proxy, example_value, **options)
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 282f8cce0c368..9626ab8ae082d 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -260,11 +260,7 @@ def call_method(
             unimplemented(f"Tensor.{name}")
         elif name == "item":
             if config.capture_scalar_outputs:
-                use_fake_tensors = config.fake_tensor_propagation
-                if use_fake_tensors:
-                    example_value = get_fake_value(self.proxy.node, tx)
-                else:
-                    example_value = get_real_value(self.proxy.node, tx.output).item()
+                example_value = get_fake_value(self.proxy.node, tx)
                 return wrap_fx_proxy(
                     tx,
                     tx.output.create_proxy(
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 979a948fbe8f6..d737e460304ff 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -337,21 +337,18 @@ def get_state_from_generator():
             assert len(args) == 1
             assert isinstance(args[0], TensorVariable)
 
-            if config.fake_tensor_propagation:
-                unimplemented(
-                    "TODO: make torch.random.set_rng_state work with FakeTensor/aot_autograd"
-                )
-                # In fake tensor case, this state doesn't matter, but
-                # it needs to be valid to not segfault. Pull a real tensor out.
-                # The value won't matter since we are running with fake tensors anyway, so rng doesn't matter.
-                # However, it is imperative to record the call_function in the graph with the true args
-                # (Not the fake example_value) - for the sake of graph correctness.
-                if self.value == torch.random.set_rng_state:
-                    example_value = torch.random.get_rng_state()
-                else:
-                    example_value = self.value.__self__.get_state()
+            unimplemented(
+                "TODO: make torch.random.set_rng_state work with FakeTensor/aot_autograd"
+            )
+            # In fake tensor case, this state doesn't matter, but
+            # it needs to be valid to not segfault. Pull a real tensor out.
+            # The value won't matter since we are running with fake tensors anyway, so rng doesn't matter.
+            # However, it is imperative to record the call_function in the graph with the true args
+            # (Not the fake example_value) - for the sake of graph correctness.
+            if self.value == torch.random.set_rng_state:
+                example_value = torch.random.get_rng_state()
             else:
-                example_value = args[0].proxy.node.meta["example_value"]
+                example_value = self.value.__self__.get_state()
 
             self.value.__module__ = self.__module__
             return wrap_fx_proxy(

From 58c1a9f634cfcdcd79748bc5974e0af04f83f42d Mon Sep 17 00:00:00 2001
From: Jason Ansel <jansel@meta.com>
Date: Fri, 25 Nov 2022 04:28:36 +0000
Subject: [PATCH 1276/1922] torchdynamo to torch._dynamo in aot_autograd.py
 (#89385)

Test Plan: Run torchbench models

Differential Revision: D41429573

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89385
Approved by: https://github.com/soumith, https://github.com/malfet
---
 functorch/_src/aot_autograd.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index e3bb1e0303bcf..5acbcd41369a7 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -14,6 +14,8 @@
 import torch.utils._pytree as pytree
 import torch.utils.dlpack
 from torch import Tensor
+from torch._dynamo import disable as disable_torchdynamo
+from torch._dynamo.utils import dynamo_timed
 from torch._subclasses import FakeTensorMode, CrossRefFakeMode
 from torch.fx import immutable_collections, Interpreter
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
@@ -26,20 +28,6 @@
 from .named_members_polyfill import _named_buffers, _named_parameters
 from .partitioners import default_partition
 
-try:
-    from torchdynamo import disable as disable_torchdynamo
-except ImportError:
-
-    def disable_torchdynamo(x):
-        return x
-
-try:
-    from torchdynamo.utils import dynamo_timed
-except ImportError:
-
-    def dynamo_timed(x):
-        return x
-
 MutationType = Enum("MutationType", ("none", "metadata_only", "data"))
 OutputType = Enum("OutputType", ("non_alias", "alias_of_input", "alias_of_intermediate"))
 

From cba0e896daad60708c82a32cc5000d818ee0c7f7 Mon Sep 17 00:00:00 2001
From: Alvaro Gaona <alvgaona@gmail.com>
Date: Fri, 25 Nov 2022 11:09:28 +0000
Subject: [PATCH 1277/1922] Implement old windows in Python (#87082)

Relates to #85366

- Bartlett, Blackman, Hamming, Hann.
- Except Kaiser which will be in a different PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87082
Approved by: https://github.com/mruberry, https://github.com/lezcano
---
 docs/source/signal.rst                        |   6 +
 torch/_torch_docs.py                          |   2 +-
 torch/signal/windows/__init__.py              |  19 +-
 torch/signal/windows/windows.py               | 449 ++++++++++++++--
 .../_internal/opinfo/definitions/signal.py    | 478 +++++++++++++++++-
 5 files changed, 903 insertions(+), 51 deletions(-)

diff --git a/docs/source/signal.rst b/docs/source/signal.rst
index 57a1ad6b0e55f..a450c92727f35 100644
--- a/docs/source/signal.rst
+++ b/docs/source/signal.rst
@@ -18,7 +18,13 @@ torch.signal.windows
     :toctree: generated
     :nosignatures:
 
+    bartlett
+    blackman
     cosine
     exponential
     gaussian
+    general_cosine
+    general_hamming
+    hamming
+    hann
     kaiser
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 2ff2e9be315de..34195f938b40a 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -12626,7 +12626,7 @@ def merge_dicts(*dicts):
     {requires_grad}
 
 Returns:
-    Tensor: A 1-D tensor of size :math:`(\text{{window\_length}},)` containing the window
+    Tensor: A 1-D tensor of size :math:`(\text{{window\_length}},)` containing the window.
 
 """.format(
         **factory_common_args
diff --git a/torch/signal/windows/__init__.py b/torch/signal/windows/__init__.py
index 8bd0395cad3ad..aebd89f1c2867 100644
--- a/torch/signal/windows/__init__.py
+++ b/torch/signal/windows/__init__.py
@@ -1,9 +1,26 @@
-from .windows import cosine, exponential, gaussian, kaiser
+from .windows import (
+    bartlett,
+    blackman,
+    cosine,
+    exponential,
+    gaussian,
+    general_cosine,
+    general_hamming,
+    hamming,
+    hann,
+    kaiser,
+)
 
 
 __all__ = [
+    'bartlett',
+    'blackman',
     'cosine',
     'exponential',
     'gaussian',
+    'general_cosine',
+    'general_hamming',
+    'hamming',
+    'hann',
     'kaiser',
 ]
diff --git a/torch/signal/windows/windows.py b/torch/signal/windows/windows.py
index 564caf2b77179..83ae60e7ca069 100644
--- a/torch/signal/windows/windows.py
+++ b/torch/signal/windows/windows.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Iterable
 
 import torch
 from math import sqrt
@@ -8,9 +8,15 @@
 from torch._torch_docs import factory_common_args, parse_kwargs, merge_dicts
 
 __all__ = [
+    'bartlett',
+    'blackman',
     'cosine',
     'exponential',
     'gaussian',
+    'general_cosine',
+    'general_hamming',
+    'hamming',
+    'hann',
     'kaiser',
 ]
 
@@ -24,8 +30,11 @@
 """
     ),
     factory_common_args,
-    {"normalization": "The window is normalized to 1 (maximum value is 1). However, the 1 doesn't appear if "
-                      ":attr:`M` is even and :attr:`sym` is `True`."}
+    {
+        "normalization": "The window is normalized to 1 (maximum value is 1). However, the 1 doesn't appear if "
+                         ":attr:`M` is even and :attr:`sym` is `True`.",
+        "return": "Tensor: A 1-D tensor of size ``M`` containing the window."
+    }
 )
 
 
@@ -74,9 +83,9 @@ def _window_function_checks(function_name: str, M: int, dtype: torch.dtype, layo
 The exponential window is defined as follows:
 
 .. math::
-    w(n) = \exp{\left(-\frac{|n - c|}{\tau}\right)}
+    w_n = \exp{\left(-\frac{|n - c|}{\tau}\right)}
 
-where `c` is the center of the window.
+where `c` is the ``center`` of the window.
     """,
     r"""
 
@@ -98,15 +107,18 @@ def _window_function_checks(function_name: str, M: int, dtype: torch.dtype, layo
     {device}
     {requires_grad}
 
+Returns:
+    {return}
+
 Examples::
 
-    >>> # Generate a symmetric exponential window of size 10 and with a decay value of 1.0.
+    >>> # Generates a symmetric exponential window of size 10 and with a decay value of 1.0.
     >>> # The center will be at (M - 1) / 2, where M is 10.
     >>> torch.signal.windows.exponential(10)
     tensor([0.0111, 0.0302, 0.0821, 0.2231, 0.6065, 0.6065, 0.2231, 0.0821, 0.0302, 0.0111])
 
-    >>> # Generate a periodic exponential window and decay factor equal to .5
-    >>> torch.signal.windows.exponential(10,sym=False,tau=.5)
+    >>> # Generates a periodic exponential window and decay factor equal to .5
+    >>> torch.signal.windows.exponential(10, sym=False,tau=.5)
     tensor([4.5400e-05, 3.3546e-04, 2.4788e-03, 1.8316e-02, 1.3534e-01, 1.0000e+00, 1.3534e-01, 1.8316e-02, 2.4788e-03, 3.3546e-04])
     """.format(
         **window_common_args
@@ -128,24 +140,20 @@ def exponential(
 
     _window_function_checks('exponential', M, dtype, layout)
 
-    if M == 0:
-        return torch.empty((0,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
-
     if tau <= 0:
         raise ValueError(f'Tau must be positive, got: {tau} instead.')
 
     if sym and center is not None:
         raise ValueError('Center must be None for symmetric windows')
 
+    if M == 0:
+        return torch.empty((0,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
+
     if center is None:
         center = (M if not sym and M > 1 else M - 1) / 2.0
 
     constant = 1 / tau
 
-    """
-    Note that non-integer step is subject to floating point rounding errors when comparing against end;
-    thus, to avoid inconsistency, we added an epsilon equal to `step / 2` to `end`.
-    """
     k = torch.linspace(start=-center * constant,
                        end=(-center + (M - 1)) * constant,
                        steps=M,
@@ -165,7 +173,7 @@ def exponential(
 The cosine window is defined as follows:
 
 .. math::
-    w(n) = \cos{\left(\frac{\pi n}{M} - \frac{\pi}{2}\right)} = \sin{\left(\frac{\pi n}{M}\right)}
+    w_n = \cos{\left(\frac{\pi n}{M} - \frac{\pi}{2}\right)} = \sin{\left(\frac{\pi n}{M}\right)}
     """,
     r"""
 
@@ -181,14 +189,17 @@ def exponential(
     {device}
     {requires_grad}
 
+Returns:
+    {return}
+
 Examples::
 
-    >>> # Generate a symmetric cosine window.
+    >>> # Generates a symmetric cosine window.
     >>> torch.signal.windows.cosine(10)
     tensor([0.1564, 0.4540, 0.7071, 0.8910, 0.9877, 0.9877, 0.8910, 0.7071, 0.4540, 0.1564])
 
-    >>> # Generate a periodic cosine window.
-    >>> torch.signal.windows.cosine(10,sym=False)
+    >>> # Generates a periodic cosine window.
+    >>> torch.signal.windows.cosine(10, sym=False)
     tensor([0.1423, 0.4154, 0.6549, 0.8413, 0.9595, 1.0000, 0.9595, 0.8413, 0.6549, 0.4154])
 """.format(
         **window_common_args,
@@ -232,7 +243,7 @@ def cosine(
 The gaussian window is defined as follows:
 
 .. math::
-    w(n) = \exp{\left(-\left(\frac{n}{2\sigma}\right)^2\right)}
+    w_n = \exp{\left(-\left(\frac{n}{2\sigma}\right)^2\right)}
     """,
     r"""
 
@@ -250,14 +261,17 @@ def cosine(
     {device}
     {requires_grad}
 
+Returns:
+    {return}
+
 Examples::
 
-    >>> # Generate a symmetric gaussian window with a standard deviation of 1.0.
+    >>> # Generates a symmetric gaussian window with a standard deviation of 1.0.
     >>> torch.signal.windows.gaussian(10)
     tensor([4.0065e-05, 2.1875e-03, 4.3937e-02, 3.2465e-01, 8.8250e-01, 8.8250e-01, 3.2465e-01, 4.3937e-02, 2.1875e-03, 4.0065e-05])
 
-    >>> # Generate a periodic gaussian window and standard deviation equal to 0.9.
-    >>> torch.signal.windows.gaussian(10,sym=False,std=0.9)
+    >>> # Generates a periodic gaussian window and standard deviation equal to 0.9.
+    >>> torch.signal.windows.gaussian(10, sym=False,std=0.9)
     tensor([1.9858e-07, 5.1365e-05, 3.8659e-03, 8.4658e-02, 5.3941e-01, 1.0000e+00, 5.3941e-01, 8.4658e-02, 3.8659e-03, 5.1365e-05])
 """.format(
         **window_common_args,
@@ -278,12 +292,12 @@ def gaussian(
 
     _window_function_checks('gaussian', M, dtype, layout)
 
-    if M == 0:
-        return torch.empty((0,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
-
     if std <= 0:
         raise ValueError(f'Standard deviation must be positive, got: {std} instead.')
 
+    if M == 0:
+        return torch.empty((0,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
+
     start = -(M if not sym and M > 1 else M - 1) / 2.0
 
     constant = 1 / (std * sqrt(2))
@@ -306,13 +320,10 @@ def gaussian(
 The Kaiser window is defined as follows:
 
 .. math::
-    out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta )
+    w_n = I_0 \left( \beta \sqrt{1 - \left( {\frac{n - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta )
 
 where ``I_0`` is the zeroth order modified Bessel function of the first kind (see :func:`torch.special.i0`), and
 ``N = M - 1 if sym else M``.
-
-``M`` is the window length.
-
     """,
     r"""
 
@@ -331,12 +342,11 @@ def gaussian(
 
 Examples::
 
-    >>> # Generate a symmetric gaussian window with a standard deviation of 1.0.
+    >>> # Generates a symmetric gaussian window with a standard deviation of 1.0.
     >>> torch.signal.windows.kaiser(5)
     tensor([4.0065e-05, 2.1875e-03, 4.3937e-02, 3.2465e-01, 8.8250e-01, 8.8250e-01, 3.2465e-01, 4.3937e-02, 2.1875e-03, 4.0065e-05])
-
-    >>> # Generate a periodic gaussian window and standard deviation equal to 0.9.
-    >>> torch.signal.windows.kaiser(5,sym=False,std=0.9)
+    >>> # Generates a periodic gaussian window and standard deviation equal to 0.9.
+    >>> torch.signal.windows.kaiser(5, sym=False,std=0.9)
     tensor([1.9858e-07, 5.1365e-05, 3.8659e-03, 8.4658e-02, 5.3941e-01, 1.0000e+00, 5.3941e-01, 8.4658e-02, 3.8659e-03, 5.1365e-05])
 """.format(
         **window_common_args,
@@ -378,3 +388,374 @@ def kaiser(
                        requires_grad=requires_grad)
 
     return torch.i0(torch.sqrt(beta * beta - torch.pow(k, 2))) / torch.i0(torch.tensor(beta))
+
+
+@_add_docstr(
+    r"""
+Computes the Hamming window.
+
+The Hamming window is defined as follows:
+
+.. math::
+    w_n = \alpha - \beta\ \cos \left( \frac{2 \pi n}{M - 1} \right)
+    """,
+    r"""
+
+{normalization}
+
+Arguments:
+    {M}
+
+Keyword args:
+    {sym}
+    alpha (float, optional): The coefficient :math:`\alpha` in the equation above.
+    beta (float, optional): The coefficient :math:`\beta` in the equation above.
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+
+Returns:
+    {return}
+
+Examples::
+
+    >>> # Generates a symmetric Hamming window.
+    >>> torch.signal.windows.hamming(10)
+    tensor([0.0800, 0.1876, 0.4601, 0.7700, 0.9723, 0.9723, 0.7700, 0.4601, 0.1876, 0.0800])
+
+    >>> # Generates a periodic Hamming window.
+    >>> torch.signal.windows.hamming(10, sym=False)
+    tensor([0.0800, 0.1679, 0.3979, 0.6821, 0.9121, 1.0000, 0.9121, 0.6821, 0.3979, 0.1679])
+""".format(
+        **window_common_args
+    ),
+)
+def hamming(M: int,
+            *,
+            sym: bool = True,
+            dtype: torch.dtype = None,
+            layout: torch.layout = torch.strided,
+            device: torch.device = None,
+            requires_grad: bool = False) -> Tensor:
+    return general_hamming(M, sym=sym, dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
+
+
+@_add_docstr(
+    r"""
+Computes the Hann window.
+
+The Hann window is defined as follows:
+
+.. math::
+    w_n = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{M - 1} \right)\right] =
+    \sin^2 \left( \frac{\pi n}{M - 1} \right)
+    """,
+    r"""
+
+{normalization}
+
+Arguments:
+    {M}
+
+Keyword args:
+    {sym}
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+
+Returns:
+    {return}
+
+Examples::
+
+    >>> # Generates a symmetric Hann window.
+    >>> torch.signal.windows.hann(10)
+    tensor([0.0000, 0.1170, 0.4132, 0.7500, 0.9698, 0.9698, 0.7500, 0.4132, 0.1170, 0.0000])
+
+    >>> # Generates a periodic Hann window.
+    >>> torch.signal.windows.hann(10, sym=False)
+    tensor([0.0000, 0.0955, 0.3455, 0.6545, 0.9045, 1.0000, 0.9045, 0.6545, 0.3455, 0.0955])
+""".format(
+        **window_common_args
+    ),
+)
+def hann(M: int,
+         *,
+         sym: bool = True,
+         dtype: torch.dtype = None,
+         layout: torch.layout = torch.strided,
+         device: torch.device = None,
+         requires_grad: bool = False) -> Tensor:
+    return general_hamming(M,
+                           alpha=0.5,
+                           sym=sym,
+                           dtype=dtype,
+                           layout=layout,
+                           device=device,
+                           requires_grad=requires_grad)
+
+
+@_add_docstr(
+    r"""
+Computes the Blackman window.
+
+The Blackman window is defined as follows:
+
+.. math::
+    w_n = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{M - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{M - 1} \right)
+    """,
+    r"""
+
+{normalization}
+
+Arguments:
+    {M}
+
+Keyword args:
+    {sym}
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+
+Returns:
+    {return}
+
+Examples::
+
+    >>> # Generates a symmetric Blackman window.
+    >>> torch.signal.windows.blackman(5)
+    tensor([-1.4901e-08,  3.4000e-01,  1.0000e+00,  3.4000e-01, -1.4901e-08])
+
+    >>> # Generates a periodic Blackman window.
+    >>> torch.signal.windows.blackman(5, sym=False)
+    tensor([-1.4901e-08,  2.0077e-01,  8.4923e-01,  8.4923e-01,  2.0077e-01])
+""".format(
+        **window_common_args
+    ),
+)
+def blackman(M: int,
+             *,
+             sym: bool = True,
+             dtype: torch.dtype = None,
+             layout: torch.layout = torch.strided,
+             device: torch.device = None,
+             requires_grad: bool = False) -> Tensor:
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+
+    _window_function_checks('blackman', M, dtype, layout)
+
+    return general_cosine(M, a=[0.42, 0.5, 0.08], sym=sym, dtype=dtype, layout=layout, device=device,
+                          requires_grad=requires_grad)
+
+
+@_add_docstr(
+    r"""
+Computes the Bartlett window.
+
+The Bartlett window is defined as follows:
+
+.. math::
+    w_n = 1 - \left| \frac{2n}{M - 1} - 1 \right| = \begin{cases}
+        \frac{2n}{M - 1} & \text{if } 0 \leq n \leq \frac{M - 1}{2} \\
+        2 - \frac{2n}{M - 1} & \text{if } \frac{M - 1}{2} < n < M \\ \end{cases}
+    """,
+    r"""
+
+{normalization}
+
+Arguments:
+    {M}
+
+Keyword args:
+    {sym}
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+
+Returns:
+    {return}
+
+Examples::
+
+    >>> # Generates a symmetric Bartlett window.
+    >>> torch.signal.windows.bartlett(10)
+    tensor([0.0000, 0.2222, 0.4444, 0.6667, 0.8889, 0.8889, 0.6667, 0.4444, 0.2222, 0.0000])
+
+    >>> # Generates a periodic Bartlett window.
+    >>> torch.signal.windows.bartlett(10, sym=False)
+    tensor([0.0000, 0.2000, 0.4000, 0.6000, 0.8000, 1.0000, 0.8000, 0.6000, 0.4000, 0.2000])
+""".format(
+        **window_common_args
+    ),
+)
+def bartlett(M: int,
+             *,
+             sym: bool = True,
+             dtype: torch.dtype = None,
+             layout: torch.layout = torch.strided,
+             device: torch.device = None,
+             requires_grad: bool = False) -> Tensor:
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+
+    _window_function_checks('bartlett', M, dtype, layout)
+
+    if M == 0:
+        return torch.empty((0,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
+
+    if M == 1:
+        return torch.ones((1,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
+
+    start = -1
+    constant = 2 / (M if not sym else M - 1)
+
+    k = torch.linspace(start=start,
+                       end=start + (M - 1) * constant,
+                       steps=M,
+                       dtype=dtype,
+                       layout=layout,
+                       device=device,
+                       requires_grad=requires_grad)
+
+    return 1 - torch.abs(k)
+
+
+@_add_docstr(
+    r"""
+Computes the general cosine window.
+
+The general cosine window is defined as follows:
+
+.. math::
+    w_n = \sum^{M-1}_{i=0} (-1)^i a_i \cos{ \left( \frac{2 \pi i n}{M - 1}\right)}
+    """,
+    r"""
+
+{normalization}
+
+Arguments:
+    {M}
+
+Keyword args:
+    a (Iterable): the coefficients associated to each of the cosine functions.
+    {sym}
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+
+Returns:
+    {return}
+
+Examples::
+
+    >>> # Generates a symmetric general cosine window with 3 coefficients.
+    >>> torch.signal.windows.general_cosine(10, a=[0.46, 0.23, 0.31], sym=True)
+    tensor([0.5400, 0.3376, 0.1288, 0.4200, 0.9136, 0.9136, 0.4200, 0.1288, 0.3376, 0.5400])
+
+    >>> # Generates a periodic general cosine window wit 2 coefficients.
+    >>> torch.signal.windows.general_cosine(10, a=[0.5, 1 - 0.5], sym=False)
+    tensor([0.0000, 0.0955, 0.3455, 0.6545, 0.9045, 1.0000, 0.9045, 0.6545, 0.3455, 0.0955])
+""".format(
+        **window_common_args
+    ),
+)
+def general_cosine(M, *,
+                   a: Iterable,
+                   sym: bool = True,
+                   dtype: torch.dtype = None,
+                   layout: torch.layout = torch.strided,
+                   device: torch.device = None,
+                   requires_grad: bool = False) -> Tensor:
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+
+    _window_function_checks('general_cosine', M, dtype, layout)
+
+    if M == 0:
+        return torch.empty((0,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
+
+    if M == 1:
+        return torch.ones((1,), dtype=dtype, layout=layout, device=device, requires_grad=requires_grad)
+
+    if not isinstance(a, Iterable):
+        raise TypeError("Coefficients must be a list/tuple")
+
+    if not a:
+        raise ValueError("Coefficients cannot be empty")
+
+    constant = 2 * torch.pi / (M if not sym else M - 1)
+
+    k = torch.linspace(start=0,
+                       end=(M - 1) * constant,
+                       steps=M,
+                       dtype=dtype,
+                       layout=layout,
+                       device=device,
+                       requires_grad=requires_grad)
+
+    a_i = torch.tensor([(-1) ** i * w for i, w in enumerate(a)], device=device, dtype=dtype, requires_grad=requires_grad)
+    i = torch.arange(a_i.shape[0], dtype=a_i.dtype, device=a_i.device, requires_grad=a_i.requires_grad)
+    return (a_i.unsqueeze(-1) * torch.cos(i.unsqueeze(-1) * k)).sum(0)
+
+
+@_add_docstr(
+    r"""
+Computes the general Hamming window.
+
+The general Hamming window is defined as follows:
+
+.. math::
+    w_n = \alpha - (1 - \alpha) \cos{ \left( \frac{2 \pi n}{M-1} \right)}
+    """,
+    r"""
+
+{normalization}
+
+Arguments:
+    {M}
+
+Keyword args:
+    alpha (float, optional): the window coefficient. Default: 0.54.
+    {sym}
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+
+Returns:
+    {return}
+
+Examples::
+
+    >>> # Generates a symmetric Hamming window with the general Hamming window.
+    >>> torch.signal.windows.general_hamming(10, sym=True)
+    tensor([0.0800, 0.1876, 0.4601, 0.7700, 0.9723, 0.9723, 0.7700, 0.4601, 0.1876, 0.0800])
+
+    >>> # Generates a periodic Hann window with the general Hamming window.
+    >>> torch.signal.windows.general_hamming(10, alpha=0.5, sym=False)
+    tensor([0.0000, 0.0955, 0.3455, 0.6545, 0.9045, 1.0000, 0.9045, 0.6545, 0.3455, 0.0955])
+""".format(
+        **window_common_args
+    ),
+)
+def general_hamming(M,
+                    *,
+                    alpha: float = 0.54,
+                    sym: bool = True,
+                    dtype: torch.dtype = None,
+                    layout: torch.layout = torch.strided,
+                    device: torch.device = None,
+                    requires_grad: bool = False) -> Tensor:
+    return general_cosine(M,
+                          a=[alpha, 1. - alpha],
+                          sym=sym,
+                          dtype=dtype,
+                          layout=layout,
+                          device=device,
+                          requires_grad=requires_grad)
diff --git a/torch/testing/_internal/opinfo/definitions/signal.py b/torch/testing/_internal/opinfo/definitions/signal.py
index 0bab0006e80c2..19559a8c59afd 100644
--- a/torch/testing/_internal/opinfo/definitions/signal.py
+++ b/torch/testing/_internal/opinfo/definitions/signal.py
@@ -112,6 +112,44 @@ def reference_inputs_kaiser_window(op_info, device, dtype, requires_grad, **kwar
         yield SampleInput(size, sym=True, **kw)
 
 
+def reference_inputs_general_cosine_window(
+    op_info, device, dtype, requires_grad, **kwargs
+):
+    yield from sample_inputs_window(op_info, device, dtype, requires_grad, **kwargs)
+
+    cases = (
+        (8, {"a": [0.5, 0.5]}),
+        (16, {"a": [0.46, 0.54]}),
+        (32, {"a": [0.46, 0.23, 0.31]}),
+        (64, {"a": [0.5]}),
+        (128, {"a": [0.1, 0.8, 0.05, 0.05]}),
+        (256, {"a": [0.2, 0.2, 0.2, 0.2, 0.2]}),
+    )
+
+    for size, kw in cases:
+        yield SampleInput(size, sym=False, **kw)
+        yield SampleInput(size, sym=True, **kw)
+
+
+def reference_inputs_general_hamming_window(
+    op_info, device, dtype, requires_grad, **kwargs
+):
+    yield from sample_inputs_window(op_info, device, dtype, requires_grad, **kwargs)
+
+    cases = (
+        (8, {"alpha": 0.54}),
+        (16, {"alpha": 0.5}),
+        (32, {"alpha": 0.23}),
+        (64, {"alpha": 0.8}),
+        (128, {"alpha": 0.9}),
+        (256, {"alpha": 0.05}),
+    )
+
+    for size, kw in cases:
+        yield SampleInput(size, sym=False, **kw)
+        yield SampleInput(size, sym=True, **kw)
+
+
 def error_inputs_window(op_info, device, *args, **kwargs):
     # Tests for windows that have a negative size
     yield ErrorInput(
@@ -185,6 +223,24 @@ def error_inputs_kaiser_window(op_info, device, **kwargs):
     )
 
 
+def error_inputs_general_cosine_window(op_info, device, **kwargs):
+    # Yield common error inputs
+    yield from error_inputs_window(op_info, device, a=[0.54, 0.46], **kwargs)
+
+    # Tests for negative beta
+    yield ErrorInput(
+        SampleInput(3, a=None, dtype=torch.float32, device=device, **kwargs),
+        error_type=TypeError,
+        error_regex="Coefficients must be a list/tuple",
+    )
+
+    yield ErrorInput(
+        SampleInput(3, a=[], dtype=torch.float32, device=device, **kwargs),
+        error_type=ValueError,
+        error_regex="Coefficients cannot be empty",
+    )
+
+
 def reference_signal_window(fn: Callable):
     r"""Wrapper for scipy signal window references.
 
@@ -213,7 +269,7 @@ def make_signal_windows_opinfo(
     reference_inputs_func: Callable,
     error_inputs_func: Callable,
     *,
-    skips: Tuple[DecorateInfo] = (),
+    skips: Tuple[DecorateInfo, ...] = (),
 ):
     r"""Helper function to create OpInfo objects related to different windows."""
     return OpInfo(
@@ -264,6 +320,54 @@ def make_signal_windows_opinfo(
                 "TestVmapOperatorsOpInfo",
                 "test_op_has_batch_rule",
             ),
+            DecorateInfo(
+                unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"),
+                "TestCommon",
+                "test_numpy_ref_mps",
+            ),
+            *skips,
+        ),
+    )
+
+
+op_db: List[OpInfo] = [
+    make_signal_windows_opinfo(
+        name="signal.windows.hamming",
+        ref=reference_signal_window(scipy.signal.windows.hamming)
+        if TEST_SCIPY
+        else None,
+        sample_inputs_func=sample_inputs_window,
+        reference_inputs_func=reference_inputs_window,
+        error_inputs_func=error_inputs_window,
+        skips=(
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestDecomp",
+                "test_comprehensive",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_symbolic_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
             DecorateInfo(
                 unittest.expectedFailure,
                 "TestSchemaCheckModeOpInfo",
@@ -271,6 +375,22 @@ def make_signal_windows_opinfo(
                 dtypes=[torch.float16],
                 device_type="cpu",
             ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestNNCOpInfo",
+                "test_nnc_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+        ),
+    ),
+    make_signal_windows_opinfo(
+        name="signal.windows.hann",
+        ref=reference_signal_window(scipy.signal.windows.hann) if TEST_SCIPY else None,
+        sample_inputs_func=sample_inputs_window,
+        reference_inputs_func=reference_inputs_window,
+        error_inputs_func=error_inputs_window,
+        skips=(
             DecorateInfo(
                 unittest.expectedFailure,
                 "TestDecomp",
@@ -299,6 +419,13 @@ def make_signal_windows_opinfo(
                 dtypes=[torch.float16],
                 device_type="cpu",
             ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestSchemaCheckModeOpInfo",
+                "test_schema_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
             DecorateInfo(
                 unittest.expectedFailure,
                 "TestNNCOpInfo",
@@ -306,12 +433,70 @@ def make_signal_windows_opinfo(
                 dtypes=[torch.float16],
                 device_type="cpu",
             ),
-            *skips,
         ),
-    )
-
-
-op_db: List[OpInfo] = [
+    ),
+    make_signal_windows_opinfo(
+        name="signal.windows.bartlett",
+        ref=reference_signal_window(scipy.signal.windows.bartlett)
+        if TEST_SCIPY
+        else None,
+        sample_inputs_func=sample_inputs_window,
+        reference_inputs_func=reference_inputs_window,
+        error_inputs_func=error_inputs_window,
+    ),
+    make_signal_windows_opinfo(
+        name="signal.windows.blackman",
+        ref=reference_signal_window(scipy.signal.windows.blackman)
+        if TEST_SCIPY
+        else None,
+        sample_inputs_func=sample_inputs_window,
+        reference_inputs_func=reference_inputs_window,
+        error_inputs_func=error_inputs_window,
+        skips=(
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestDecomp",
+                "test_comprehensive",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_symbolic_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestSchemaCheckModeOpInfo",
+                "test_schema_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestNNCOpInfo",
+                "test_nnc_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+        ),
+    ),
     make_signal_windows_opinfo(
         name="signal.windows.cosine",
         ref=reference_signal_window(scipy.signal.windows.cosine)
@@ -322,9 +507,46 @@ def make_signal_windows_opinfo(
         error_inputs_func=error_inputs_window,
         skips=(
             DecorateInfo(
-                unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"),
-                "TestCommon",
-                "test_numpy_ref_mps",
+                unittest.expectedFailure,
+                "TestDecomp",
+                "test_comprehensive",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_symbolic_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestSchemaCheckModeOpInfo",
+                "test_schema_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestNNCOpInfo",
+                "test_nnc_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
             ),
         ),
     ),
@@ -338,9 +560,46 @@ def make_signal_windows_opinfo(
         error_inputs_func=error_inputs_exponential_window,
         skips=(
             DecorateInfo(
-                unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"),
-                "TestCommon",
-                "test_numpy_ref_mps",
+                unittest.expectedFailure,
+                "TestDecomp",
+                "test_comprehensive",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_symbolic_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestSchemaCheckModeOpInfo",
+                "test_schema_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestNNCOpInfo",
+                "test_nnc_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
             ),
         ),
     ),
@@ -353,11 +612,53 @@ def make_signal_windows_opinfo(
         reference_inputs_func=partial(reference_inputs_gaussian_window, std=1.92),
         error_inputs_func=error_inputs_gaussian_window,
         skips=(
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestDecomp",
+                "test_comprehensive",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_symbolic_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestSchemaCheckModeOpInfo",
+                "test_schema_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
             DecorateInfo(
                 unittest.skip("Buggy on MPS for now (mistakenly promotes to float64)"),
                 "TestCommon",
                 "test_numpy_ref_mps",
             ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestNNCOpInfo",
+                "test_nnc_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
         ),
     ),
     make_signal_windows_opinfo(
@@ -370,9 +671,156 @@ def make_signal_windows_opinfo(
         error_inputs_func=error_inputs_kaiser_window,
         skips=(
             DecorateInfo(
-                unittest.skip("Unsupported on MPS for now pending aten::i0 support"),
-                "TestCommon",
-                "test_numpy_ref_mps",
+                unittest.expectedFailure,
+                "TestDecomp",
+                "test_comprehensive",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_symbolic_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestSchemaCheckModeOpInfo",
+                "test_schema_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestNNCOpInfo",
+                "test_nnc_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+        ),
+    ),
+    make_signal_windows_opinfo(
+        name="signal.windows.general_cosine",
+        ref=reference_signal_window(scipy.signal.windows.general_cosine)
+        if TEST_SCIPY
+        else None,
+        sample_inputs_func=partial(sample_inputs_window, a=[0.54, 0.46]),
+        reference_inputs_func=partial(
+            reference_inputs_general_cosine_window, a=[0.54, 0.46]
+        ),
+        error_inputs_func=error_inputs_general_cosine_window,
+        skips=(
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestDecomp",
+                "test_comprehensive",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_symbolic_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestSchemaCheckModeOpInfo",
+                "test_schema_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestNNCOpInfo",
+                "test_nnc_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+        ),
+    ),
+    make_signal_windows_opinfo(
+        name="signal.windows.general_hamming",
+        ref=reference_signal_window(scipy.signal.windows.general_hamming)
+        if TEST_SCIPY
+        else None,
+        sample_inputs_func=partial(sample_inputs_window, alpha=0.54),
+        reference_inputs_func=partial(
+            reference_inputs_general_hamming_window, alpha=0.54
+        ),
+        error_inputs_func=error_inputs_window,
+        skips=(
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestDecomp",
+                "test_comprehensive",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestMeta",
+                "test_dispatch_symbolic_meta",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestSchemaCheckModeOpInfo",
+                "test_schema_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
+            ),
+            DecorateInfo(
+                unittest.expectedFailure,
+                "TestNNCOpInfo",
+                "test_nnc_correctness",
+                dtypes=[torch.float16],
+                device_type="cpu",
             ),
         ),
     ),

From ab9bee59c9f81094320ffdd8a4267cd3e8111d26 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Fri, 25 Nov 2022 14:53:57 +0000
Subject: [PATCH 1278/1922] complex: register c10::complex with py::cast
 (#89680)

Fixes #77134

TODO:
* [x] Add test (tested locally with script below) (Are there similar tests in the test-suite?)

```c++
#include <torch/torch.h>
#include <torch/csrc/utils/pybind.h>
#include <iostream>
#include <vector>
#include <pybind11/pybind11.h>
#include <pybind11/embed.h>
#include <cassert>

namespace py = pybind11;

int main() {
    py::scoped_interpreter guard{}; // start the interpreter
    auto casted_cdouble = py::cast(c10::complex<double>(1.0, 2.0));
    assert(
        (c10::complex<double>(1.0, 2.0) ==
         py::cast<c10::complex<double>>(casted_cdouble)));

    auto casted_cfloat = py::cast(c10::complex<float>(1.0, 2.0));
    assert(
        (c10::complex<double>(1.0, 2.0) ==
         py::cast<c10::complex<double>>(casted_cfloat)));

    auto casted_chalf = py::cast(c10::complex<at::Half>(1.0, 2.0));
    assert(
        (c10::complex<double>(1.0, 2.0) ==
         py::cast<c10::complex<double>>(casted_chalf)));
}

```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89680
Approved by: https://github.com/ezyang
---
 torch/csrc/utils/pybind.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h
index c582dee1d2f64..bf1553814cef9 100644
--- a/torch/csrc/utils/pybind.h
+++ b/torch/csrc/utils/pybind.h
@@ -227,6 +227,35 @@ struct type_caster<c10::SymFloat> {
       handle /* parent */);
 };
 
+template <typename T>
+struct type_caster<c10::complex<T>> {
+ public:
+  // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
+  PYBIND11_TYPE_CASTER(c10::complex<T>, _("torch._complex.complex"));
+
+  bool load(handle src, bool) {
+    PyObject* obj = src.ptr();
+
+    // Refered from `THPUtils_unpackComplexDouble`
+    Py_complex py_complex = PyComplex_AsCComplex(obj);
+    if (py_complex.real == -1.0 && PyErr_Occurred()) {
+      return false;
+    }
+
+    // Python's Complex is always double precision.
+    value = c10::complex<double>(py_complex.real, py_complex.imag);
+    return true;
+  }
+
+  static handle cast(
+      const c10::complex<T>& complex,
+      return_value_policy /* policy */,
+      handle /* parent */) {
+    // Python only knows double precision complex.
+    return handle(PyComplex_FromDoubles(complex.real(), complex.imag()));
+  }
+};
+
 // Pybind11 bindings for our optional and variant types.
 // http://pybind11.readthedocs.io/en/stable/advanced/cast/stl.html#c-17-library-containers
 template <typename T>

From 7670c48790fb32245a2716c4bdc3ac9cdda59090 Mon Sep 17 00:00:00 2001
From: Ikko Ashimine <eltociear@gmail.com>
Date: Fri, 25 Nov 2022 19:26:18 +0000
Subject: [PATCH 1279/1922] Fix typo in segment_reduction_op_gpu.cu (#89647)

menber -> member

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89647
Approved by: https://github.com/kit1980
---
 caffe2/operators/segment_reduction_op_gpu.cu | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/caffe2/operators/segment_reduction_op_gpu.cu b/caffe2/operators/segment_reduction_op_gpu.cu
index 7253df677025b..6985c3c3378b4 100644
--- a/caffe2/operators/segment_reduction_op_gpu.cu
+++ b/caffe2/operators/segment_reduction_op_gpu.cu
@@ -493,7 +493,7 @@ class CUDASparseLengthsSumOp : public Operator<CUDAContext> {
   enum { DATA = 0, INDICES = 1, LENGTHS = 1 + (SparseFused ? 1 : 0) };
 
  private:
-  // menber field to manage memory
+  // member field to manage memory
   Tensor inclusive_scan_buffer_{CUDA};
   Tensor inclusive_scan_length_buffer_{CUDA};
 };
@@ -632,7 +632,7 @@ class CUDASparseLengthsMeanOp : public Operator<CUDAContext> {
   enum { DATA = 0, INDICES = 1, LENGTHS = 1 + (SparseFused ? 1 : 0) };
 
  private:
-  // menber field to manage memory
+  // member field to manage memory
   Tensor inclusive_scan_buffer_{CUDA};
   Tensor inclusive_scan_length_buffer_{CUDA};
 };
@@ -765,7 +765,7 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
   enum { INDICES = 1, LENGTHS = 1 + (SparseFused ? 1 : 0) };
 
  private:
-  // menber field to manage memory
+  // member field to manage memory
   Tensor inclusive_scan_buffer_{CUDA};
   Tensor inclusive_scan_length_buffer_{CUDA};
 };
@@ -861,7 +861,7 @@ class CUDASparseLengthsWeightedSumOp : public Operator<CUDAContext> {
   enum { DATA = 0, WEIGHTS = 1, INDICES = 2, LENGTHS = 3 };
 
  private:
-  // menber field to manage memory
+  // member field to manage memory
   Tensor inclusive_scan_buffer_{CUDA};
   Tensor inclusive_scan_length_buffer_{CUDA};
 };
@@ -1356,7 +1356,7 @@ class CUDASparseLengthsSumGradientWithIndicesOp : public Operator<CUDAContext> {
   }
 
  private:
-  // menber field to manage memory
+  // member field to manage memory
   Tensor inclusive_scan_buffer_{CUDA};
   Tensor inclusive_scan_length_buffer_{CUDA};
 };
@@ -1437,7 +1437,7 @@ class CUDASparseLengthsMeanGradientWithIndicesOp
   }
 
  private:
-  // menber field to manage memory
+  // member field to manage memory
   Tensor inclusive_scan_buffer_{CUDA};
   Tensor inclusive_scan_length_buffer_{CUDA};
 };
@@ -1526,7 +1526,7 @@ class CUDASparseLengthsWeightedSumGradientWithIndicesOp
   }
 
  private:
-  // menber field to manage memory
+  // member field to manage memory
   Tensor inclusive_scan_buffer_{CUDA};
   Tensor inclusive_scan_length_buffer_{CUDA};
 };
@@ -1666,7 +1666,7 @@ class CUDALengthsMaxWithMainInputAndForwardOutputGradientOp
   }
 
  private:
-  // menber field to manage memory
+  // member field to manage memory
   Tensor inclusive_scan_buffer_{CUDA};
   Tensor inclusive_scan_length_buffer_{CUDA};
 };
@@ -1793,7 +1793,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
   }
 
  private:
-  // menber field to manage memory
+  // member field to manage memory
   Tensor inclusive_scan_buffer_{CUDA};
   Tensor inclusive_scan_length_buffer_{CUDA};
 };

From 9009e32147d00398861c54daaface2141d78de75 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Fri, 25 Nov 2022 19:42:38 +0000
Subject: [PATCH 1280/1922] make inductor correctly propagate nans for maximum
 and minimum (#89612)

Partially fixes https://github.com/pytorch/torchdynamo/issues/594
Also, small cleanup for `where` codegen

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89612
Approved by: https://github.com/soumith, https://github.com/jansel
---
 test/inductor/test_torchinductor.py |  5 +++++
 torch/_inductor/codegen/cpp.py      |  4 ++--
 torch/_inductor/codegen/triton.py   | 16 ++--------------
 torch/_inductor/config.py           |  2 --
 4 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 589d43f204188..d1d818e34527a 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -682,6 +682,11 @@ def fn(a, b):
             return (torch.maximum(a, b), torch.minimum(a, b))
 
         self.common(fn, (torch.randn(8), torch.randn(8)))
+        t1 = torch.randn(8)
+        t1[0] = float("nan")
+        t2 = torch.randn(8)
+        t2[1] = float("nan")
+        self.common(fn, (t1, t2))
 
     def test_horizonal_fusion1(self):
         def fn(a, b, c):
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index c7e40899c86f3..b320b45980d0c 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -442,11 +442,11 @@ def relu(x):
 
     @staticmethod
     def minimum(a, b):
-        return f"std::min({a}, {b})"
+        return f"({b} != {b}) ? {b} : std::min({a}, {b})"
 
     @staticmethod
     def maximum(a, b):
-        return f"std::max({a}, {b})"
+        return f"({b} != {b}) ? {b} : std::max({a}, {b})"
 
     @staticmethod
     def where(a, b, c):
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index e14b417c173f8..c302ffe3e79ea 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -164,26 +164,14 @@ def relu(x):
 
     @staticmethod
     def minimum(a, b):
-        return f"tl.minimum({a}, {b})"
+        return f"tl.where({a} != {a}, {a}, tl.where({a} < {b}, {a}, {b}))"
 
     @staticmethod
     def maximum(a, b):
-        return f"tl.maximum({a}, {b})"
+        return f"tl.where({a} != {a}, {a}, tl.where({a} > {b}, {a}, {b}))"
 
     @staticmethod
     def where(a, b, c):
-        if not config.triton.simple_where:
-            # wonkyness to work around https://github.com/openai/triton/issues/532
-            # identity calls to force new triton variables (and get access to .shape/.dtype/.numel
-            a = ops.identity(a)
-            b = ops.identity(b)
-            c = ops.identity(c)
-            a = ops.identity(
-                f"{a} | tl.zeros({b}.shape, {a}.dtype) if {b}.numel > 1 else {a}"
-            )
-            a = ops.identity(
-                f"{a} | tl.zeros({c}.shape, {a}.dtype) if {c}.numel > 1 else {a}"
-            )
         return f"tl.where({a}, {b}, {c})"
 
     @staticmethod
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index a0062c4fe4e25..cd32414bc1dfd 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -144,8 +144,6 @@ class triton:
     tiling_prevents_reduction_fusion = True
     # should we give different names to kernels
     ordered_kernel_names = False
-    # should we use natural codegen for where, needs newer triton version
-    simple_where = True
 
 
 # create a directory containing lots of debug information

From 6c48d4e9b33a930648fde7077cc3fe5ec899f0c6 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 25 Nov 2022 03:31:19 +0000
Subject: [PATCH 1281/1922] Delay verify correctness wrapping to call site.
 (#89662)

There is only one call site for compiler_fn, so we can safely delay
wrapping verify correctness to here.  This will help later when we
change the backend compiler calling convention to pass fake tensors
(but I need to pass real tensors here.)

This is adapted from voz's changes at https://github.com/pytorch/pytorch/pull/89392
but with less changes to the substantive logic.  I only moved the relevant
inner implementation; there are no changes otherwise.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89662
Approved by: https://github.com/voznesenskym
---
 torch/_dynamo/convert_frame.py | 21 +----------
 torch/_dynamo/eval_frame.py    | 44 +---------------------
 torch/_dynamo/output_graph.py  | 69 +++++++++++++++++++++++++++++++++-
 3 files changed, 69 insertions(+), 65 deletions(-)

diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index c612fe3c167d4..1f3138ec6cdcb 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -15,12 +15,7 @@
 from .allowed_functions import is_allowed
 from .bytecode_analysis import remove_dead_code, remove_pointless_jumps
 from .bytecode_transformation import is_generator, transform_code_object
-from .eval_frame import (
-    always_optimize_code_objects,
-    skip_code,
-    TorchPatcher,
-    WrapperBackend,
-)
+from .eval_frame import always_optimize_code_objects, skip_code, TorchPatcher
 from .exc import (
     BackendCompilerFailed,
     InternalTorchDynamoError,
@@ -86,18 +81,6 @@ def fx_forward_from_src_skip_result(*args, **kwargs):
     return result
 
 
-def wrap_compiler_fn(compiler_fn):
-    """WrapperBackend if config.verify_correctness is True"""
-    if config.verify_correctness:
-        # wrap backend if verify_correctness is True
-        wrapper_backend_compiler_fn = WrapperBackend(compiler_fn)
-
-        wrapper_backend_compiler_fn._torchdynamo_orig_callable = compiler_fn
-        return wrapper_backend_compiler_fn
-
-    return compiler_fn
-
-
 def wrap_convert_context(fn):
     """
     Context manager to:
@@ -281,8 +264,6 @@ def convert_frame_assert(
     """Fully convert a frame into an FX graph"""
     init_logging()
 
-    compiler_fn = wrap_compiler_fn(compiler_fn)
-
     @dynamo_timed
     def _convert_frame_assert(frame: types.FrameType, cache_size: int):
         code = frame.f_code
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 38c291c441feb..f8e2bd28439c7 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -1,5 +1,4 @@
 import contextlib
-import copy
 import functools
 import inspect
 import logging
@@ -22,7 +21,7 @@
 from .exc import ResetRequired
 from .mutation_guard import install_generation_tagging_init
 from .optimizations.distributed import DDPOptimizer
-from .utils import checkpoint_params, clone_inputs, compile_times, same
+from .utils import compile_times
 
 log = logging.getLogger(__name__)
 
@@ -320,47 +319,6 @@ def _optimize_catch_errors(compile_fn, backend_ctx_ctor=null_context, dynamic=Fa
     )
 
 
-class WrapperBackend:
-    def __init__(self, backend=None):
-        self.backend = backend
-
-    @property
-    def example_inputs(self):
-        return clone_inputs(self.original_example_inputs)
-
-    def __call__(self, gm: torch.fx.GraphModule, example_inputs):
-
-        self.restore = checkpoint_params(gm)
-        self.original_example_inputs = clone_inputs(example_inputs)
-        self.gm = gm
-        copy_gm = copy.deepcopy(self.gm)
-        self.candidate = self.backend(copy_gm, self.original_example_inputs)
-
-        if self.candidate is None or self.candidate is self.gm.forward:
-            return self.gm.forward
-
-        if not config.verify_correctness:
-            return self.candidate
-
-        # if verify_correctness=True
-        try:
-            correct = self.gm.forward(*self.example_inputs)
-            result = self.candidate(*self.example_inputs)
-
-            # TODO: replace `same` function with the one in testing
-            if same(correct, result):
-                return self.candidate
-
-            raise RuntimeError(f"incorrect results of backend {self}")
-            return self.gm.forward
-
-        except Exception:
-            log.exception("error in verify_correctness")
-            raise
-        finally:
-            self.restore()
-
-
 def get_compiler_fn(compiler_fn):
     from .debug_utils import wrap_backend_debug
 
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 0c1ddabdc1980..495a4381c6b7f 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -1,4 +1,5 @@
 import collections
+import copy
 import functools
 import itertools
 import logging
@@ -20,7 +21,15 @@
 from .mutation_guard import is_dynamic_nn_module
 from .side_effects import SideEffects
 from .source import ConstantSource, LocalSource, Source
-from .utils import CleanupHook, count_calls, counters, format_graph_tabular
+from .utils import (
+    checkpoint_params,
+    CleanupHook,
+    clone_inputs,
+    count_calls,
+    counters,
+    format_graph_tabular,
+    same,
+)
 from .variables.builder import VariableBuilder, wrap_fx_proxy
 from .variables.nn_module import NNModuleVariable
 from .variables.tensor import (
@@ -65,6 +74,59 @@ def __repr__(self):
         return "FakeRootModule(...)"
 
 
+def wrap_compiler_fn(compiler_fn):
+    """WrapperBackend if config.verify_correctness is True"""
+    if config.verify_correctness:
+        # wrap backend if verify_correctness is True
+        wrapper_backend_compiler_fn = WrapperBackend(compiler_fn)
+
+        wrapper_backend_compiler_fn._torchdynamo_orig_callable = compiler_fn
+        return wrapper_backend_compiler_fn
+
+    return compiler_fn
+
+
+class WrapperBackend:
+    def __init__(self, backend=None):
+        self.backend = backend
+
+    @property
+    def example_inputs(self):
+        return clone_inputs(self.original_example_inputs)
+
+    def __call__(self, gm: torch.fx.GraphModule, example_inputs):
+
+        self.restore = checkpoint_params(gm)
+        self.original_example_inputs = clone_inputs(example_inputs)
+        self.gm = gm
+        copy_gm = copy.deepcopy(self.gm)
+        self.candidate = self.backend(copy_gm, self.original_example_inputs)
+
+        if self.candidate is None or self.candidate is self.gm.forward:
+            return self.gm.forward
+
+        if not config.verify_correctness:
+            return self.candidate
+
+        # if verify_correctness=True
+        try:
+            correct = self.gm.forward(*self.example_inputs)
+            result = self.candidate(*self.example_inputs)
+
+            # TODO: replace `same` function with the one in testing
+            if same(correct, result):
+                return self.candidate
+
+            raise RuntimeError(f"incorrect results of backend {self}")
+            return self.gm.forward
+
+        except Exception:
+            log.exception("error in verify_correctness")
+            raise
+        finally:
+            self.restore()
+
+
 class OutputGraph(fx.Tracer):
     """
     Wrapper class to hold outputs of InstructionTranslator.  Mainly the
@@ -452,7 +514,10 @@ def call_user_compiler(self, gm):
                 else ""
             )
             _step_logger()(logging.INFO, f"calling compiler function {name}")
-            compiled_fn = self.compiler_fn(gm, self.example_inputs())
+            compiler_fn = self.compiler_fn
+            if config.verify_correctness:
+                compiler_fn = wrap_compiler_fn(compiler_fn)
+            compiled_fn = compiler_fn(gm, self.example_inputs())
             _step_logger()(logging.INFO, f"done compiler function {name}")
             assert callable(compiled_fn), "compiler_fn did not return callable"
         except Exception as e:

From b66a53264484efad87d0abc21d0ca01853c072e6 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 25 Nov 2022 03:31:19 +0000
Subject: [PATCH 1282/1922] Don't support kwargs at runtime in
 aot_module_simplified (#89664)

The preexisting logic here added in
https://github.com/pytorch/functorch/pull/970 was very peculiar: if top_kwargs
was non-empty, then the inner compiled function supports kwargs.  Naively, this
would leave you to expect that there is some sort of correlation between
top_kwargs and kwargs.  But in fact, they're completely unrelated!  top_kwargs
is the AOTAutograd configuration knobs (e.g., fw_compiler/bw_compiler), but
kwargs is the RUNTIME kwargs that are to be passed to the compiled function.
But (1) we don't support this (the function to be compiled only takes a list
of tensors) and (2) even if we did support it, conditioning on whether or not
you had passed AOTAutograd configuration kwargs to support kwargs at runtime
is bonkers.

So delete it.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89664
Approved by: https://github.com/voznesenskym
---
 functorch/_src/aot_autograd.py | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index 5acbcd41369a7..f567e96cc657b 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -1801,22 +1801,11 @@ def new_func(*args):
 
     compiled_f = aot_function_simplified(functional_call, *top_args, **top_kwargs)
 
-    if top_kwargs:
-
-        def forward(*args, **kwargs):
-            return compiled_f(
-                *params_flat,
-                *args,
-                **kwargs,
-            )
-
-    else:
-
-        def forward(*args):
-            return compiled_f(
-                *params_flat,
-                *args,
-            )
+    def forward(*args):
+        return compiled_f(
+            *params_flat,
+            *args,
+        )
 
     forward.zero_grad = mod.zero_grad
     forward.named_parameters = mod.named_parameters

From fdd7f73c8bff0c0f4262db7565052ab68fbd7f52 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 25 Nov 2022 03:31:20 +0000
Subject: [PATCH 1283/1922] Simplify aot_module_simplified by removing
 top_args/top_kwargs (#89666)

This makes good on Chillee's CR comment at
https://github.com/pytorch/functorch/pull/660/files/af30d351cc93dfafb5a94dbcb32983c5ef65fd6a#r843315222
which was never done in the original PR.

There is no logic change, just unpack the args/kwargs at the top
level and remove the inner function indirection.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89666
Approved by: https://github.com/voznesenskym
---
 functorch/_src/aot_autograd.py | 65 ++++++++++++++++------------------
 torch/cuda/_dynamo_graphs.py   |  4 +--
 2 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index f567e96cc657b..9dfb62f30faa6 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -1721,7 +1721,15 @@ def forward(self, *args, **kwargs):
     return AOTModule()
 
 
-def aot_module_simplified(mod: nn.Module, *top_args, **top_kwargs) -> nn.Module:
+def aot_module_simplified(
+    mod: nn.Module,
+    fw_compiler: Callable,
+    bw_compiler: Optional[Callable] = None,
+    partition_fn: Callable = default_partition,
+    decompositions: Optional[Dict] = None,
+    hasher_type=None,
+    static_argnums=None
+) -> nn.Module:
     """
     This is the simplified or low overhead version of aot_module. For frontends
     like TorchDynamo, the input functions/modules to AOT are static and have
@@ -1764,42 +1772,29 @@ def functional_call(*args, **kwargs):
             )
         return out
 
-    def aot_function_simplified(
-        fn: Callable,
-        fw_compiler: Callable,
-        bw_compiler: Optional[Callable] = None,
-        partition_fn: Callable = default_partition,
-        decompositions: Optional[Dict] = None,
-        hasher_type=None,
-        static_argnums=None,
-    ) -> Callable:
-        assert static_argnums is None
-        if bw_compiler is None:
-            bw_compiler = fw_compiler
-        aot_config = AOTConfig(
-            fw_compiler=fw_compiler,
-            bw_compiler=bw_compiler,
-            partition_fn=partition_fn,
-            decompositions=decompositions,
-            num_params_buffers=params_len,
-        )
-
-        compiled_fn = None
-
-        @wraps(fn)
-        def new_func(*args):
-            nonlocal compiled_fn
-            if compiled_fn is None:
-                compiled_fn = create_aot_dispatcher_function(
-                    fn,
-                    args,
-                    aot_config,
-                )
-            return compiled_fn(args)
+    assert static_argnums is None
+    if bw_compiler is None:
+        bw_compiler = fw_compiler
+    aot_config = AOTConfig(
+        fw_compiler=fw_compiler,
+        bw_compiler=bw_compiler,
+        partition_fn=partition_fn,
+        decompositions=decompositions,
+        num_params_buffers=params_len,
+    )
 
-        return new_func
+    compiled_fn = None
 
-    compiled_f = aot_function_simplified(functional_call, *top_args, **top_kwargs)
+    @wraps(functional_call)
+    def compiled_f(*args):
+        nonlocal compiled_fn
+        if compiled_fn is None:
+            compiled_fn = create_aot_dispatcher_function(
+                functional_call,
+                args,
+                aot_config,
+            )
+        return compiled_fn(args)
 
     def forward(*args):
         return compiled_f(
diff --git a/torch/cuda/_dynamo_graphs.py b/torch/cuda/_dynamo_graphs.py
index 6c577c3177762..1b2211ed32b23 100644
--- a/torch/cuda/_dynamo_graphs.py
+++ b/torch/cuda/_dynamo_graphs.py
@@ -9,7 +9,7 @@
 
 import operator
 from collections import defaultdict
-from typing import Set
+from typing import Set, Dict, Any
 
 # TODO: maybe this should live in torch._dynamo instead
 
@@ -133,7 +133,7 @@ def cudagraphs(model, inputs):
 
 
 def raw_aot_autograd_cudagraphs(model, inputs):
-    kwargs = {
+    kwargs: Dict[str, Any] = {
         # these are taken from memory_efficient_fusion()
         "fw_compiler": cudagraphs,
         "bw_compiler": cudagraphs,

From 7c1c9c9efe229f83653ecdc824adabb515d33865 Mon Sep 17 00:00:00 2001
From: jlukehubbard <58089207+jlukehubbard@users.noreply.github.com>
Date: Fri, 25 Nov 2022 21:31:53 +0000
Subject: [PATCH 1284/1922] update docstring for torch.linalg.lstsq (#89383)

Previous documentation lacked details about the handling of over- and underdetermined systems, and made incorrect mention of MAGMA.

Fixes #85021

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89383
Approved by: https://github.com/lezcano
---
 torch/linalg/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py
index 3ec9a383546bf..4b551b0d40e90 100644
--- a/torch/linalg/__init__.py
+++ b/torch/linalg/__init__.py
@@ -1015,9 +1015,8 @@
 Also supports batches of matrices, and if the inputs are batches of matrices then
 the output has the same batch dimensions.
 
-:attr:`driver` chooses the LAPACK/MAGMA function that will be used.
+:attr:`driver` chooses the backend function that will be used.
 For CPU inputs the valid values are `'gels'`, `'gelsy'`, `'gelsd`, `'gelss'`.
-For CUDA input, the only valid driver is `'gels'`, which assumes that :attr:`A` is full-rank.
 To choose the best driver on CPU consider:
 
 - If :attr:`A` is well-conditioned (its `condition number`_ is not too large), or you do not mind some precision loss.
@@ -1030,6 +1029,8 @@
   - `'gelsd'` (tridiagonal reduction and SVD)
   - But if you run into memory issues: `'gelss'` (full SVD).
 
+For CUDA input, the only valid driver is `'gels'`, which assumes that :attr:`A` is full-rank.
+
 See also the `full description of these drivers`_
 
 :attr:`rcond` is used to determine the effective rank of the matrices in :attr:`A`

From ed540361e7b47901b8800615774ad68d84e1be48 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Sat, 26 Nov 2022 03:08:23 +0000
Subject: [PATCH 1285/1922] put descriptive kernel names behind config (#89697)

Per title, generated kernel names are often long and confusing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89697
Approved by: https://github.com/Chillee
---
 test/inductor/test_torchinductor.py | 10 ++++++++++
 torch/_inductor/codecache.py        |  2 +-
 torch/_inductor/codegen/triton.py   | 11 ++++++-----
 torch/_inductor/config.py           |  2 ++
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index d1d818e34527a..f4de42d3e64a6 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -3227,6 +3227,16 @@ def fn(x, y):
         out_eager = (inputs[0] + inputs[1].float()).add_(inputs[1]).mul_(inputs[1])
         self.assertTrue(same(out, out_eager))
 
+    @patch.object(config.triton, "ordered_kernel_names", True)
+    @patch.object(config.triton, "descriptive_kernel_names", False)
+    def test_kernel_names(self):
+        @torch._dynamo.optimize("inductor")
+        def fn(x):
+            return 2 * x
+
+        inputs = (rand_strided((8,), (1,), device=self.device),)
+        self.assertTrue(same(fn(*inputs), 2 * inputs[0]))
+
     @patch.object(config.triton, "cudagraphs", True)
     def test_strided_inputs(self):
         @torch._dynamo.optimize("inductor")
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 9fada4d0d8dcf..825ba54234bd4 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -430,7 +430,7 @@ def patch_triton_dir():
 class TritonCodeCache:
     @staticmethod
     def get_name(mod):
-        (name,) = [n for n in dir(mod) if n.startswith("kernel")]
+        (name,) = [n for n in dir(mod) if n.startswith("triton_")]
         return name
 
     @classmethod
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index c302ffe3e79ea..de84d4ddbeff5 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -1301,13 +1301,14 @@ def codegen_node_schedule(self, node_schedule, numel, reduction_numel):
         if src_code in wrapper.kernels:
             kernel_name = wrapper.kernels[src_code]
         else:
-            kernel_name = (
-                "triton_"
-                + get_fused_kernel_name(node_schedule)
-                + wrapper.next_kernel_suffix()
+            fused_name = (
+                get_fused_kernel_name(node_schedule)
+                if config.triton.descriptive_kernel_names
+                else ""
             )
+            kernel_name = "triton_" + fused_name + wrapper.next_kernel_suffix()
             wrapper.kernels[src_code] = kernel_name
-            subs_name = kernel_name if config.triton.ordered_kernel_names else "kernel"
+            subs_name = kernel_name if config.triton.ordered_kernel_names else "triton_"
             src_code = src_code.replace("KERNEL_NAME", subs_name)
             # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does
             # not use BracesBuffer, so we have no good indicator of a C++ buffer atm.
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index cd32414bc1dfd..05f0fdffdf99e 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -144,6 +144,8 @@ class triton:
     tiling_prevents_reduction_fusion = True
     # should we give different names to kernels
     ordered_kernel_names = False
+    # should we put op names in kernel names
+    descriptive_kernel_names = True
 
 
 # create a directory containing lots of debug information

From 6c2ceb3871e5502a29c993d3ad174428d51c3877 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 25 Nov 2022 13:48:35 -0800
Subject: [PATCH 1286/1922] Don't suppress exceptions from backends (#89656)

Taken from voz's https://github.com/pytorch/pytorch/pull/89392

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89656
Approved by: https://github.com/voznesenskym
---
 torch/_dynamo/optimizations/backends.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index e97940b7311f7..c5011096c32f3 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -53,9 +53,6 @@ def inner(model, example_inputs=None, **kwargs):
             return fn(model, **kwargs)
         except KeyboardInterrupt:
             raise
-        except Exception:
-            log.exception(f"{fn.__name__} error")
-            return None
 
     BACKENDS[fn.__name__] = inner
     return inner

From c003df59b7632594816ea9784427f5cbec985f42 Mon Sep 17 00:00:00 2001
From: Jiong Gong <jiong.gong@intel.com>
Date: Sat, 26 Nov 2022 14:06:44 +0000
Subject: [PATCH 1287/1922] [Inductor] Record cpp kernel in PyTorch Profiler
 (#89367)

Add an option `config.cpp.enable_kernel_profile` to record individual cpp kernel time in PyTorch Profiler.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89367
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 18 ++++++++++++++++++
 torch/_inductor/codecache.py        |  9 ++++++++-
 torch/_inductor/codegen/cpp.py      | 15 ++++++++++++++-
 torch/_inductor/config.py           |  2 ++
 4 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index f4de42d3e64a6..ea112162c8a92 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5010,6 +5010,24 @@ def fn(x1, x2):
                 assert same(fn(x1, x2)[0], compiled([x1, x2])[0], equal_nan=True)
                 assert metrics.generated_cpp_vec_kernel_count == 1
 
+        @unittest.skipIf(
+            sys.platform != "linux", "cpp kernel profile only support linux now"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        @patch.object(config.cpp, "enable_kernel_profile", True)
+        def test_cpp_kernel_profile(self):
+            from torch.profiler import profile
+
+            @torch._dynamo.optimize("inductor", nopython=True)
+            def fn(a, b):
+                return a + b
+
+            a = torch.rand((100,))
+            b = torch.rand((100,))
+            with profile() as prof:
+                fn(a, b)
+            assert "kernel_cpp_0" in (e.name for e in prof.profiler.function_events)
+
 
 if HAS_CUDA:
     import triton
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 825ba54234bd4..bca8c59830be8 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -320,7 +320,14 @@ def cpp_compile_command(
     include_pytorch=False,
     vec_isa: VecISA = invalid_vec_isa,
 ):
-    if include_pytorch or vec_isa != invalid_vec_isa:
+    if sys.platform == "linux" and (
+        include_pytorch
+        or vec_isa != invalid_vec_isa
+        or config.cpp.enable_kernel_profile
+    ):
+        # Note - We include pytorch only on linux right now. There is more work
+        # to do to enable OMP build on darwin where PyTorch is built with IOMP
+        # and we need a way to link to what PyTorch links.
         ipaths = cpp_extension.include_paths() + [sysconfig.get_path("include")]
         lpaths = cpp_extension.library_paths() + [sysconfig.get_config_var("LIBDIR")]
         libs = ["c10", "torch", "torch_cpu", "torch_python", "gomp"]
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index b320b45980d0c..309b8cf9c8e7a 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -2,6 +2,7 @@
 import dataclasses
 import functools
 import math
+import sys
 from copy import deepcopy
 from pathlib import Path
 from typing import Dict, List
@@ -1366,11 +1367,24 @@ def codegen_define_and_call(self, wrapper):
         if self.count == 0:
             return
 
+        kernel_name = "kernel_cpp_" + wrapper.next_kernel_suffix()
         arg_defs, call_args = self.args.cpp_argdefs()
         arg_defs = ",\n".ljust(25).join(arg_defs)
         code = BracesBuffer()
+        # TODO: support kernel profile on other platforms
+        enable_kernel_profile = (
+            config.cpp.enable_kernel_profile and sys.platform == "linux"
+        )
+        if enable_kernel_profile:
+            code.writelines(["#include <ATen/record_function.h>"])
         code.writelines([cpp_prefix(), "" f'extern "C" void kernel({arg_defs})'])
         with code.indent():
+            if enable_kernel_profile:
+                code.writelines(
+                    [
+                        f'RECORD_FUNCTION("{kernel_name}", c10::ArrayRef<c10::IValue>({{}}));'
+                    ]
+                )
             for old, new in self.args.aliases():
                 code.writeline(f"auto {old} = {new};")
             code.splice(self.loops_code)
@@ -1380,7 +1394,6 @@ def codegen_define_and_call(self, wrapper):
         codecache_def.splice(code)
         codecache_def.writeline("''')")
 
-        kernel_name = "kernel_cpp_" + wrapper.next_kernel_suffix()
         codecache_str = codecache_def.getvalue()
         # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does
         # not use BracesBuffer, so we have no good indicator of a C++ buffer atm.
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 05f0fdffdf99e..ac97ddf563f19 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -110,6 +110,8 @@ class cpp:
         "g++",
         "g++.par",
     )
+    # Allow kernel performance profiling via PyTorch profiler
+    enable_kernel_profile = False
 
 
 # config specific to codegen/triton.py

From 031af7987a68f9e14ed05b960b45339e5ac8b975 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 26 Nov 2022 10:33:08 -0800
Subject: [PATCH 1288/1922] [Profiler] Memory profiler part 4: Select top level
 torch ops (#86880)

In a later PR we will walk the children of these nodes and formulate a node from the entire bundle to build a data flow graph. This PR simply defines what a "top level" op is.

Differential Revision: [D40220387](https://our.internmc.facebook.com/intern/diff/D40220387/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/86880
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 70 +++++++++++++++++++++++
 torch/profiler/_memory_profiler.py    | 80 ++++++++++++++++++++++++++-
 2 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 6924cb3556592..0cfa04c6a226e 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -1,5 +1,8 @@
 # Owner(s): ["oncall: profiler"]
 import functools
+import gc
+import re
+import textwrap
 from typing import Iterator, List, Optional, Tuple
 
 import torch
@@ -344,6 +347,73 @@ def test_match_schemas_tensorlist(self) -> None:
             (("aten::cat.", (False, False)),),
         )
 
+    def test_data_flow_leaf(self) -> None:
+        x = torch.ones((1,))
+        y = torch.ones((1,))
+        with profile() as prof, torch.no_grad():
+            # torch._C._jit_get_schemas_for_operator will reject any name that
+            # is missing a namespace. (denoted by the presence of "::") We want
+            # to check that we skip both annotations which have no schema
+            # (return empty tuple from SchemaMatcher.lookup_schemas) and
+            # annotations which cannot have schema (return None from
+            # SchemaMatcher.lookup_schemas).
+            with torch.profiler.record_function("Namespaced::Annotation"):
+                with torch.profiler.record_function("My Annotation"):
+                    x.zero_()
+                    y.zero_()
+                    x0 = torch.ones_like(x)
+                    y0 = torch.zeros_like(y)
+
+        leaf_events = prof._memory_profile()._data_flow_graph.leaf_events
+        leaf_names = " ".join(node.name for node in leaf_events)
+
+        # `record_function` makes a Tensor to hold its handle which is not
+        # relevant for this test.
+        record_fn_pattern = r"aten::zeros aten::empty \[memory\] \[memory\] "
+
+        self.assertExpectedInline(
+            re.sub(record_fn_pattern, "", leaf_names),
+            """aten::zero_ aten::zero_ aten::ones_like aten::zeros_like""",
+        )
+
+    def test_data_flow_leaf_non_op_allocations(self) -> None:
+        x = torch.ones((1,))
+        with profile() as prof, torch.no_grad():
+            x.mul(2)
+            gc.collect()
+
+        # The python arg parser will convert the python scalar `2` to a Tensor
+        # to pass to `aten::mul`. As a result there is no op that "owns" the
+        # allocation. The Tensor deletions also do not happen in an op; they
+        # are collected as a result of the Python objects going out of scope.
+        leaf_events = prof._memory_profile()._data_flow_graph.leaf_events
+        self.assertExpectedInline(
+            " ".join(node.name for node in leaf_events),
+            """[memory] aten::mul [memory] [memory]""",
+        )
+
+    def test_data_flow_leaf_backward(self) -> None:
+        x = torch.ones((1,))
+        w = torch.ones((1,), requires_grad=True)
+        with profile() as prof:
+            (x * w).sin().backward()
+
+        leaf_events = prof._memory_profile()._data_flow_graph.leaf_events
+        self.assertExpectedInline(
+            textwrap.indent("\n".join(node.name for node in leaf_events), " " * 12),
+            """\
+            aten::mul
+            aten::sin
+            aten::ones_like
+            SinBackward0
+            [memory]
+            MulBackward0
+            [memory]
+            torch::autograd::AccumulateGrad
+            [memory]
+            [memory]""",
+        )
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index cd652a6a000f9..1317c93de061f 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -11,6 +11,7 @@
     _TensorMetadata,
     RecordScope,
 )
+from torch.profiler import _utils
 
 
 @dataclasses.dataclass
@@ -229,6 +230,83 @@ def lookup_schemas(name: str) -> Optional[Tuple[FunctionSchema, ...]]:
             return None
 
 
+class OpTree:
+    def __init__(self, result: _ProfilerResult) -> None:
+        self._root_nodes = result.experimental_event_tree()
+        self._sorted_nodes = tuple(sorted(self.dfs(), key=lambda x: x.start_time_ns))
+
+    def dfs(self, *args, **kwargs) -> Iterator[_ProfilerEvent]:
+        yield from _utils.traverse_dfs(self._root_nodes, *args, **kwargs)
+
+    @property
+    def sorted_nodes(self) -> Tuple[_ProfilerEvent, ...]:
+        return self._sorted_nodes
+
+
+class DataFlowGraph:
+    def __init__(self, tree: OpTree) -> None:
+        self._tree = tree
+        self._leaf_events = self._extract_leaf_events(tree)
+
+    @property
+    def leaf_events(self) -> Tuple[_ProfilerEvent, ...]:
+        return self._leaf_events
+
+    @staticmethod
+    def _extract_leaf_events(tree: OpTree) -> Tuple[_ProfilerEvent, ...]:
+        """Partially traverse the op tree and extract top level ops.
+
+        Consider the following code:
+        ```
+        with record_function("My annotation"):
+            x.zero_()
+            y.zero_()
+        ```
+
+        The op tree (assuming no Autograd) will look like:
+          <Python context>
+            TorchOp: "My annotation"
+              TorchOp: zero_
+                TorchOp: fill_
+              TorchOp: zero_
+                TorchOp: fill_
+
+        The recursive structure of operator calls makes data flow unwieldy.
+        In order to simplify analysis we would like to select the highest level
+        ops to represent in the graph. In this case those are the `zero_` ops;
+        the fact that `fill_` is called is an implementation detail. We also
+        do not want to group everything under "My annotation" as this could
+        create overly coarse bundles and lose critical semantics.
+
+        To address this issue we walk over the graph and select the topmost
+        torch ops ** which match at least one operator schema **. These form
+        the leaves of the first pass through the op tree. (As well as any
+        allocations or frees which do are not part of a kernel.) These events
+        form the logical nodes in our data flow graph.
+        """
+
+        leaf_events: List[_ProfilerEvent] = []
+
+        def leaf_op(e: _ProfilerEvent) -> bool:
+            return e.typed[0] == _EventType.TorchOp and (
+                e.typed[1].scope == RecordScope.BACKWARD_FUNCTION
+                or bool(SchemaMatcher.match_schemas(e.typed[1]))
+            )
+
+        def children_fn(e: _ProfilerEvent):
+            if leaf_op(e) or e.tag == _EventType.Allocation:
+                leaf_events.append(e)
+                return []
+
+            return e.children
+
+        for _ in tree.dfs(children_fn=children_fn):
+            pass
+
+        return tuple(sorted(leaf_events, key=lambda x: x.start_time_ns))
+
+
 class MemoryProfile:
     def __init__(self, result: _ProfilerResult) -> None:
-        pass
+        self._op_tree = OpTree(result)
+        self._data_flow_graph = DataFlowGraph(self._op_tree)

From 5d66e57de37e309e8aa01f37ca85e23a71ed7f3d Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 26 Nov 2022 10:33:09 -0800
Subject: [PATCH 1289/1922] [Profiler] Memory profiler part 5: Data flow graph
 (#87006)

The semantic meaning of a Tensor is tightly coupled to its lineage. The data flow graph allows us to identify temporary Tensors, masks, inputs, activations, and more. However one important nuance is that Tensors must be versioned; operations which mutate their inputs can also change the semantic meaning of said inputs.

It is challenging to assemble a complete picture of the data flow in a PyTorch model because ops can, and often do, recursively call into other ops. For the purpose of memory profiling this is an implementation detail, so instead we traverse the op tree to identify top level ops and allocations and then coalesce their children, folding inputs and outputs into the top level Node.

Differential Revision: [D40220391](https://our.internmc.facebook.com/intern/diff/D40220391/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87006
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 452 +++++++++++++++++++++++---
 torch/profiler/_memory_profiler.py    | 191 ++++++++++-
 2 files changed, 588 insertions(+), 55 deletions(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 0cfa04c6a226e..ad65d6941023a 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -1,9 +1,9 @@
 # Owner(s): ["oncall: profiler"]
 import functools
 import gc
-import re
+import itertools as it
 import textwrap
-from typing import Iterator, List, Optional, Tuple
+from typing import Callable, Dict, Iterator, List, Optional, Tuple
 
 import torch
 from torch._C._profiler import _EventType
@@ -246,7 +246,12 @@ def test_extract_gradients_from_module_and_optimizer(self) -> None:
         )
 
 
+@skipIfTorchDynamo("TorchDynamo removes profiler altogether.")
 class TestDataFlow(TestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        self.maxDiff = None
+
     @staticmethod
     def formatSchemas(
         prof: torch.profiler.profile, indent: int = 12
@@ -266,6 +271,46 @@ def formatSchemas(
                 out.append((name, _memory_profiler.SchemaMatcher.inputs_are_mutable(e)))
         return tuple(out)
 
+    @staticmethod
+    def _run_and_format_data_flow(
+        inputs: Dict[str, torch.Tensor],
+        f: Callable[..., Optional[Dict[str, torch.Tensor]]],
+        indent: int = 12,
+    ) -> str:
+        with profile() as prof:
+            outputs = f(**inputs) or {}
+            gc.collect()
+
+        memory_profile = prof._memory_profile()
+        graph = memory_profile._data_flow_graph
+        storage_to_id = {key.storage.ptr: key.id for key in graph._active_version}
+
+        lines: List[str] = []
+        for name, t in it.chain(inputs.items(), outputs.items()):
+            lines.append(f"{name + ':':<8} T{storage_to_id[t.storage().data_ptr()]}")
+            if t.grad is not None:
+                grad_id = storage_to_id[t.grad.storage().data_ptr()]
+                lines.append(f"{name + '.grad:':<9} T{grad_id}")
+
+        if lines:
+            lines.append("")
+
+        for node in graph.flow_nodes:
+            destroyed = {k for k, v in node._edges.items() if v.is_deletion}
+
+            inputs: List[str] = []
+            for key, (_, v) in node.inputs.items():
+                inputs.append(f"T{key.id}(v{v}{'*' if key in destroyed else ''})")
+
+            outputs = [f"T{key.id}(v{v})" for key, v in node.outputs.items()]
+            if inputs or outputs:
+                event_name = node._event.name.replace("torch::autograd::", "")
+                lines.append(
+                    f"{event_name:<25} {', '.join(inputs):<15}  ->  {', '.join(outputs)}"
+                )
+
+        return textwrap.indent("\n".join([l.rstrip() for l in lines]), " " * indent)
+
     def test_match_schemas(self) -> None:
         with profile() as prof:
             x = torch.ones((1,)).mul(2).add_(2)
@@ -320,19 +365,15 @@ def test_match_schemas_backward(self) -> None:
                 # fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
                 ("aten::fill_.Scalar", (True, False)),
                 ("autograd::engine::evaluate_function: MulBackward0", ()),
-                #
-                # Cannot find schema, all inputs presumed mutable
-                ("MulBackward0", (True,)),
+                ("MulBackward0", (None,)),
                 ("aten::mul.Tensor", (False, False)),
                 (
                     "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad",
                     (),
                 ),
-                #
-                # Cannot find schema, all inputs presumed mutable
-                ("torch::autograd::AccumulateGrad", (True,)),
+                ("torch::autograd::AccumulateGrad", (None,)),
                 ("aten::detach.", (False,)),
-                ("detach", (True,)),
+                ("detach", (None,)),
             ),
         )
 
@@ -347,10 +388,8 @@ def test_match_schemas_tensorlist(self) -> None:
             (("aten::cat.", (False, False)),),
         )
 
-    def test_data_flow_leaf(self) -> None:
-        x = torch.ones((1,))
-        y = torch.ones((1,))
-        with profile() as prof, torch.no_grad():
+    def test_data_flow_graph_with_annotations(self) -> None:
+        def f(x, y):
             # torch._C._jit_get_schemas_for_operator will reject any name that
             # is missing a namespace. (denoted by the presence of "::") We want
             # to check that we skip both annotations which have no schema
@@ -361,57 +400,376 @@ def test_data_flow_leaf(self) -> None:
                 with torch.profiler.record_function("My Annotation"):
                     x.zero_()
                     y.zero_()
-                    x0 = torch.ones_like(x)
-                    y0 = torch.zeros_like(y)
-
-        leaf_events = prof._memory_profile()._data_flow_graph.leaf_events
-        leaf_names = " ".join(node.name for node in leaf_events)
-
-        # `record_function` makes a Tensor to hold its handle which is not
-        # relevant for this test.
-        record_fn_pattern = r"aten::zeros aten::empty \[memory\] \[memory\] "
+                    return {"x0": torch.ones_like(x), "y0": torch.zeros_like(y)}
 
+        inputs = {"x": torch.ones((1,)), "y": torch.ones((1,))}
         self.assertExpectedInline(
-            re.sub(record_fn_pattern, "", leaf_names),
-            """aten::zero_ aten::zero_ aten::ones_like aten::zeros_like""",
+            self._run_and_format_data_flow(inputs, f),
+            """\
+            x:       T0
+            y:       T1
+            x0:      T2
+            y0:      T3
+
+            aten::zero_               T0(v0)           ->  T0(v1)
+            aten::zero_               T1(v0)           ->  T1(v1)
+            aten::ones_like           T0(v1)           ->  T2(v0)
+            aten::zeros_like          T1(v1)           ->  T3(v0)""",
         )
 
-    def test_data_flow_leaf_non_op_allocations(self) -> None:
-        x = torch.ones((1,))
-        with profile() as prof, torch.no_grad():
+    def test_data_flow_graph_non_op_allocations(self) -> None:
+        def f(x):
             x.mul(2)
-            gc.collect()
 
         # The python arg parser will convert the python scalar `2` to a Tensor
         # to pass to `aten::mul`. As a result there is no op that "owns" the
         # allocation. The Tensor deletions also do not happen in an op; they
         # are collected as a result of the Python objects going out of scope.
-        leaf_events = prof._memory_profile()._data_flow_graph.leaf_events
         self.assertExpectedInline(
-            " ".join(node.name for node in leaf_events),
-            """[memory] aten::mul [memory] [memory]""",
+            self._run_and_format_data_flow({"x": torch.ones((1,))}, f),
+            """\
+            x:       T1
+
+            [memory]                                   ->  T0(v0)
+            aten::mul                 T0(v0), T1(v0)   ->
+            [memory]                  T0(v0*)          ->""",
+        )
+
+    def test_data_flow_graph_simple(self) -> None:
+        inputs = {"x": torch.ones((25,)), "y": torch.ones((25,), requires_grad=True)}
+
+        def f0(x, y):
+            z = x.mul(y)
+            return {"z": z.view_as(z)}
+
+        def f1(x, y):
+            with torch.no_grad():
+                return f0(x, y)
+
+        self.assertExpectedInline(
+            self._run_and_format_data_flow(inputs, f0),
+            """\
+            x:       T0
+            y:       T1
+            z:       T2
+
+            aten::mul                 T0(v0), T1(v0)   ->  T2(v0)
+            aten::view_as             T2(v0)           ->""",
+        )
+
+        # Out of place is identical regardless of Autograd.
+        self.assertExpectedInline(
+            self._run_and_format_data_flow(inputs, f0),
+            """\
+            x:       T0
+            y:       T1
+            z:       T2
+
+            aten::mul                 T0(v0), T1(v0)   ->  T2(v0)
+            aten::view_as             T2(v0)           ->""",
+        )
+
+    def test_data_flow_graph_simple_inplace(self) -> None:
+        inputs = {"x": torch.ones((25,)), "y": torch.ones((25,), requires_grad=True)}
+
+        def f0(x, y):
+            x.mul_(y)
+
+        def f1(x, y):
+            with torch.no_grad():
+                return f0(x, y)
+
+        # When Autograd is enabled a second Tensor `T2` is created to store
+        # the values of T0(v0) which are needed for backwards.
+        self.assertExpectedInline(
+            self._run_and_format_data_flow(inputs, f0),
+            """\
+            x:       T0
+            y:       T1
+
+            aten::mul_                T0(v0), T1(v0)   ->  T0(v1), T2(v0)""",
+        )
+
+        self.assertExpectedInline(
+            self._run_and_format_data_flow(inputs, f1),
+            """\
+            x:       T0
+            y:       T1
+
+            aten::mul_                T0(v0), T1(v0)   ->  T0(v1)""",
+        )
+
+    def test_data_flow_graph_simple_backward(self) -> None:
+        inputs = {
+            "x": torch.ones((1,)),
+            "w": torch.ones((1,), requires_grad=True),
+        }
+        self.assertExpectedInline(
+            self._run_and_format_data_flow(
+                inputs, lambda x, w: (x * w).sin().backward()
+            ),
+            """\
+            x:       T0
+            w:       T1
+            w.grad:   T7
+
+            aten::mul                 T0(v0), T1(v0)   ->  T2(v0)
+            aten::sin                 T2(v0)           ->  T3(v0)
+            aten::ones_like           T3(v0)           ->  T4(v0)
+            SinBackward0              T2(v0), T4(v0)   ->  T6(v0)
+            [memory]                  T2(v0*)          ->
+            MulBackward0              T0(v0), T6(v0)   ->  T7(v0)
+            [memory]                  T6(v0*)          ->
+            AccumulateGrad            T7(v0)           ->
+            [memory]                  T4(v0*)          ->
+            [memory]                  T3(v0*)          ->""",
+        )
+
+    def test_data_flow_graph_complicated(self) -> None:
+        def f():
+            x = torch.ones((25,))
+            y = x.mul(2).add_(2)
+            z = torch.sin(y, out=torch.empty_like(y))
+            return {"x": x, "y": y, "z": z}
+
+        # T1 is the `2` in `.mul(2)`. The Python arg parser automatically
+        # converts Scalar arguments to Tensors. The same is true for `T4`
+        # and `.add_(2)`.
+        self.assertExpectedInline(
+            self._run_and_format_data_flow({}, f),
+            """\
+            x:       T0
+            y:       T3
+            z:       T6
+
+            aten::ones                                 ->  T0(v0)
+            [memory]                                   ->  T1(v0)
+            aten::mul                 T0(v0), T1(v0)   ->  T3(v0)
+            [memory]                  T1(v0*)          ->
+            [memory]                                   ->  T4(v0)
+            aten::add_                T3(v0), T4(v0)   ->  T3(v1)
+            [memory]                  T4(v0*)          ->
+            aten::empty_like          T3(v1)           ->  T6(v0)
+            aten::sin                 T3(v1), T6(v0)   ->  T6(v1)""",
         )
 
-    def test_data_flow_leaf_backward(self) -> None:
-        x = torch.ones((1,))
-        w = torch.ones((1,), requires_grad=True)
         with profile() as prof:
-            (x * w).sin().backward()
+            f()
+
+        # `aten::mul` creates a temporary Tensor (T2), which is why the output
+        # is has ID three rather than two.
+        mul_node = prof._memory_profile()._data_flow_graph.flow_nodes[2]
+        self.assertEqual(mul_node._event.name, "aten::mul")
+        self.assertEqual(len(mul_node.intermediates), 1)
+        self.assertEqual(mul_node.intermediates[0].id, 2)
+
+    def test_data_flow_graph_stacked(self) -> None:
+        inputs = {
+            "x": torch.ones((25,)),
+            "w0": torch.ones((1,), requires_grad=True),
+            "w1": torch.ones((1,), requires_grad=True),
+        }
+
+        def f(x, w0, w1):
+            return x.mul(w0).relu().mul(w1).relu().sum()
+
+        def f_fwd(**kwargs):
+            with torch.no_grad():
+                return {"loss": f(**kwargs)}
+
+        def f_fwd_bwd(**kwargs):
+            loss = f(**kwargs)
+            loss.backward()
+            return {"loss": loss}
+
+        self.assertExpectedInline(
+            self._run_and_format_data_flow(inputs, f_fwd),
+            """\
+            x:       T0
+            w0:      T1
+            w1:      T4
+            loss:    T7
+
+            aten::mul                 T0(v0), T1(v0)   ->  T2(v0)
+            aten::relu                T2(v0)           ->  T3(v0)
+            [memory]                  T2(v0*)          ->
+            aten::mul                 T3(v0), T4(v0)   ->  T5(v0)
+            [memory]                  T3(v0*)          ->
+            aten::relu                T5(v0)           ->  T6(v0)
+            [memory]                  T5(v0*)          ->
+            aten::sum                 T6(v0)           ->  T7(v0)
+            [memory]                  T6(v0*)          ->""",
+        )
+
+        self.assertExpectedInline(
+            self._run_and_format_data_flow(inputs, f_fwd_bwd),
+            """\
+            x:       T0
+            w0:      T1
+            w0.grad:  T15
+            w1:      T4
+            w1.grad:  T12
+            loss:    T7
+
+            aten::mul                 T0(v0), T1(v0)   ->  T2(v0)
+            aten::relu                T2(v0)           ->  T3(v0)
+            [memory]                  T2(v0*)          ->
+            aten::mul                 T3(v0), T4(v0)   ->  T5(v0)
+            aten::relu                T5(v0)           ->  T6(v0)
+            [memory]                  T5(v0*)          ->
+            aten::sum                 T6(v0)           ->  T7(v0)
+            aten::ones_like           T7(v0)           ->  T8(v0)
+            SumBackward0              T8(v0)           ->
+            ReluBackward0             T6(v0), T8(v0)   ->  T9(v0)
+            [memory]                  T6(v0*)          ->
+            MulBackward0              T3(v0), T4(v0), T9(v0)  ->  T10(v0), T11(v0)
+            aten::sum                 T10(v0)          ->  T12(v0)
+            [memory]                  T10(v0*)         ->
+            [memory]                  T9(v0*)          ->
+            AccumulateGrad            T12(v0)          ->
+            ReluBackward0             T3(v0), T11(v0)  ->  T13(v0)
+            [memory]                  T11(v0*)         ->
+            [memory]                  T3(v0*)          ->
+            MulBackward0              T0(v0), T13(v0)  ->  T14(v0)
+            aten::sum                 T14(v0)          ->  T15(v0)
+            [memory]                  T14(v0*)         ->
+            [memory]                  T13(v0*)         ->
+            AccumulateGrad            T15(v0)          ->
+            [memory]                  T8(v0*)          ->""",
+        )
+
+        # Second time grads are already initialized.
+        self.assertExpectedInline(
+            self._run_and_format_data_flow(inputs, f_fwd_bwd),
+            """\
+            x:       T0
+            w0:      T1
+            w0.grad:  T17
+            w1:      T4
+            w1.grad:  T13
+            loss:    T7
+
+            aten::mul                 T0(v0), T1(v0)   ->  T2(v0)
+            aten::relu                T2(v0)           ->  T3(v0)
+            [memory]                  T2(v0*)          ->
+            aten::mul                 T3(v0), T4(v0)   ->  T5(v0)
+            aten::relu                T5(v0)           ->  T6(v0)
+            [memory]                  T5(v0*)          ->
+            aten::sum                 T6(v0)           ->  T7(v0)
+            aten::ones_like           T7(v0)           ->  T8(v0)
+            SumBackward0              T8(v0)           ->
+            ReluBackward0             T6(v0), T8(v0)   ->  T9(v0)
+            [memory]                  T6(v0*)          ->
+            MulBackward0              T3(v0), T4(v0), T9(v0)  ->  T10(v0), T11(v0)
+            aten::sum                 T10(v0)          ->  T12(v0)
+            [memory]                  T10(v0*)         ->
+            [memory]                  T9(v0*)          ->
+            AccumulateGrad            T12(v0*), T13(v0)  ->  T13(v1)
+            ReluBackward0             T3(v0), T11(v0)  ->  T14(v0)
+            [memory]                  T11(v0*)         ->
+            [memory]                  T3(v0*)          ->
+            MulBackward0              T0(v0), T14(v0)  ->  T15(v0)
+            aten::sum                 T15(v0)          ->  T16(v0)
+            [memory]                  T15(v0*)         ->
+            [memory]                  T14(v0*)         ->
+            AccumulateGrad            T16(v0*), T17(v0)  ->  T17(v1)
+            [memory]                  T8(v0*)          ->""",
+        )
+
+        return
+
+        x = torch.ones((25,))
+        w0 = torch.ones((1,), requires_grad=True)
+        w1 = torch.ones((1,), requires_grad=True)
+
+        with profile() as prof_no_grad:
+            with torch.no_grad():
+                x.mul(w0).relu().mul(w1).relu().sum()
+
+        # TODO: one with `.logsumexp(dim=0)`
+
+        self.assertExpectedInline(
+            self._format_graph(prof_no_grad),
+            """\
+            aten::mul                 T0(v0), T1(v0)   ->  T2(v0)
+            aten::relu                T2(v0)           ->  T3(v0)
+            [memory]                  T2(v0*)          ->
+            aten::mul                 T3(v0), T4(v0)   ->  T5(v0)
+            [memory]                  T3(v0*)          ->
+            aten::relu                T5(v0)           ->  T6(v0)
+            [memory]                  T5(v0*)          ->
+            aten::sum                 T6(v0)           ->  T7(v0)
+            [memory]                  T6(v0*)          ->
+            [memory]                  T7(v0*)          ->""",
+        )
+
+        with profile() as prof_grad:
+            loss = x.mul(w0).relu().mul(w1).relu().sum()
+            loss.backward()
+
+        self.assertExpectedInline(
+            self._format_graph(prof_grad),
+            """\
+            aten::mul                 T0(v0), T1(v0)   ->  T2(v0)
+            aten::relu                T2(v0)           ->  T3(v0)
+            [memory]                  T2(v0*)          ->
+            aten::mul                 T3(v0), T4(v0)   ->  T5(v0)
+            aten::relu                T5(v0)           ->  T6(v0)
+            [memory]                  T5(v0*)          ->
+            aten::sum                 T6(v0)           ->  T7(v0)
+            aten::ones_like           T7(v0)           ->  T8(v0)
+            SumBackward0              T8(v0)           ->  T8(v1)
+            ReluBackward0             T6(v0), T8(v1)   ->  T8(v2), T9(v0)
+            [memory]                  T6(v0*)          ->
+            MulBackward0              T3(v0), T4(v0), T9(v0)  ->  T9(v1), T10(v0), T11(v0)
+            aten::sum                 T10(v0)          ->  T12(v0)
+            [memory]                  T10(v0*)         ->
+            [memory]                  T9(v1*)          ->
+            AccumulateGrad            T12(v0)          ->  T12(v1)
+            ReluBackward0             T3(v0), T11(v0)  ->  T11(v1), T13(v0)
+            [memory]                  T11(v1*)         ->
+            [memory]                  T3(v0*)          ->
+            MulBackward0              T0(v0), T13(v0)  ->  T13(v1), T14(v0)
+            aten::sum                 T14(v0)          ->  T15(v0)
+            [memory]                  T14(v0*)         ->
+            [memory]                  T13(v1*)         ->
+            AccumulateGrad            T15(v0)          ->  T15(v1)
+            [memory]                  T8(v2*)          ->""",
+        )
+
+        # Second time grads are already initialized.
+        with profile() as prof_grad:
+            loss = x.mul(w0).relu().mul(w1).relu().sum()
+            loss.backward()
 
-        leaf_events = prof._memory_profile()._data_flow_graph.leaf_events
         self.assertExpectedInline(
-            textwrap.indent("\n".join(node.name for node in leaf_events), " " * 12),
+            self._format_graph(prof_grad),
             """\
-            aten::mul
-            aten::sin
-            aten::ones_like
-            SinBackward0
-            [memory]
-            MulBackward0
-            [memory]
-            torch::autograd::AccumulateGrad
-            [memory]
-            [memory]""",
+            aten::mul                 T0(v0), T1(v0)   ->  T2(v0)
+            aten::relu                T2(v0)           ->  T3(v0)
+            [memory]                  T2(v0*)          ->
+            aten::mul                 T3(v0), T4(v0)   ->  T5(v0)
+            aten::relu                T5(v0)           ->  T6(v0)
+            [memory]                  T5(v0*)          ->
+            aten::sum                 T6(v0)           ->  T7(v0)
+            aten::ones_like           T7(v0)           ->  T8(v0)
+            SumBackward0              T8(v0)           ->  T8(v1)
+            ReluBackward0             T6(v0), T8(v1)   ->  T8(v2), T9(v0)
+            [memory]                  T6(v0*)          ->
+            MulBackward0              T3(v0), T4(v0), T9(v0)  ->  T9(v1), T10(v0), T11(v0)
+            aten::sum                 T10(v0)          ->  T12(v0)
+            [memory]                  T10(v0*)         ->
+            [memory]                  T9(v1*)          ->
+            AccumulateGrad            T12(v0*), T13(v0)  ->  T13(v1)
+            ReluBackward0             T3(v0), T11(v0)  ->  T11(v1), T14(v0)
+            [memory]                  T11(v1*)         ->
+            [memory]                  T3(v0*)          ->
+            MulBackward0              T0(v0), T14(v0)  ->  T14(v1), T15(v0)
+            aten::sum                 T15(v0)          ->  T16(v0)
+            [memory]                  T15(v0*)         ->
+            [memory]                  T14(v1*)         ->
+            AccumulateGrad            T16(v0*), T17(v0)  ->  T17(v1)
+            [memory]                  T8(v2*)          ->""",
         )
 
 
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index 1317c93de061f..f6e423950c3da 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -1,11 +1,24 @@
+import collections
 import dataclasses
-from typing import Any, Iterator, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    cast,
+    DefaultDict,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
 
 import torch
 from torch._C import FunctionSchema
 from torch._C._autograd import _ProfilerResult
 from torch._C._profiler import (
     _EventType,
+    _ExtraFields_Allocation,
     _ExtraFields_TorchOp,
     _ProfilerEvent,
     _TensorMetadata,
@@ -53,6 +66,9 @@ class TensorKey:
     def __repr__(self) -> str:
         return f"id={self.id}: {repr(self.storage):<24} ({self.device})"
 
+    def __lt__(self, other: "TensorKey") -> bool:
+        return self._as_sortable < other._as_sortable
+
     @staticmethod
     def _make(
         tensor_id: Optional[int],
@@ -68,12 +84,20 @@ def _make(
             return TensorKey(tensor_id, _Storage(storage_ptr, allocation_id), device)
         return None
 
+    @classmethod
+    def from_allocation(cls, alloc: _ExtraFields_Allocation) -> Optional["TensorKey"]:
+        return cls._make(alloc.id, alloc.ptr, alloc.allocation_id, alloc.device)
+
     @classmethod
     def from_tensor(cls, t: Optional[_TensorMetadata]) -> Optional["TensorKey"]:
         if t is not None:
             return cls._make(t.id, t.storage_data_ptr, t.allocation_id, t.device)
         return None
 
+    @property
+    def _as_sortable(self) -> Tuple[int, int, str, int]:
+        return self.id, self.storage.allocation_id, self.device.type, self.device.index
+
 
 def extract_gradients(
     node: _ProfilerEvent,
@@ -137,7 +161,7 @@ class SchemaMatcher:
     """
 
     @classmethod
-    def inputs_are_mutable(cls, t: _ExtraFields_TorchOp) -> Tuple[bool, ...]:
+    def inputs_are_mutable(cls, t: _ExtraFields_TorchOp) -> Tuple[Optional[bool], ...]:
         """Determine which inputs may have mutated based on function schema.
 
         Note that we don't need to resolve down to a single schema to perform
@@ -152,7 +176,7 @@ def inputs_are_mutable(cls, t: _ExtraFields_TorchOp) -> Tuple[bool, ...]:
             for i, arg in enumerate(schema.arguments):
                 mutable[i] |= getattr(arg.alias_info, "is_write", False)
 
-        return tuple(mutable or (True for _ in t.inputs))
+        return tuple(mutable or (None for _ in t.inputs))
 
     @classmethod
     def match_schemas(cls, t: _ExtraFields_TorchOp) -> Tuple[FunctionSchema, ...]:
@@ -243,17 +267,154 @@ def sorted_nodes(self) -> Tuple[_ProfilerEvent, ...]:
         return self._sorted_nodes
 
 
+@dataclasses.dataclass()
+class DataFlowEdge:
+    input_version: Optional[int] = None
+    mutated: Optional[bool] = False
+
+    @property
+    def is_allocation(self) -> bool:
+        return self.input_version is None
+
+    @property
+    def is_deletion(self) -> bool:
+        return self.mutated is None
+
+
+class DataFlowNode:
+    def __init__(self, event: _ProfilerEvent, graph: "DataFlowGraph") -> None:
+        self._event = event
+        self._graph = graph
+        self._edges: Dict[TensorKey, DataFlowEdge] = self._determine_edges()
+
+        for key, edge in self._edges.items():
+            if edge.mutated and not edge.is_allocation:
+                self._graph.bump(key)
+
+        # Make sure the version bumping behavior matches what we expect.
+        versions = {k: (v, self._graph.lookup(k)) for k, v in self.outputs.items()}
+        assert all(i == j for i, j in versions.values()), f"{versions}, {self._edges}"
+
+    def _determine_edges(self) -> Dict[TensorKey, DataFlowEdge]:
+        subtree = tuple(_utils.traverse_dfs([self._event]))
+
+        # Start by populating edges from op inputs and outputs.
+        mutable_by_key: Dict[Optional[TensorKey], Set[Optional[bool]]] = {}
+        for op in (i.typed[1] for i in subtree if i.typed[0] == _EventType.TorchOp):
+            for op_input, mutable in zip(
+                op.inputs, SchemaMatcher.inputs_are_mutable(op)
+            ):
+                # Tensor
+                if isinstance(op_input, _TensorMetadata):
+                    key = TensorKey.from_tensor(op_input)
+                    mutable_by_key.setdefault(key, set()).add(mutable)
+
+                # TensorList
+                elif isinstance(op_input, list):
+                    for op_input_i in op_input:
+                        key = TensorKey.from_tensor(op_input_i)
+                        mutable_by_key.setdefault(key, set()).add(mutable)
+
+        edges: DefaultDict[Optional[TensorKey], DataFlowEdge]
+        edges = collections.defaultdict(DataFlowEdge)
+        for key, mutable_set in mutable_by_key.items():
+            if key is not None:
+                edges[key].input_version = self._graph.lookup(key) if key else -1
+
+                # We consider an op to be mutated if we encounter a schema where it
+                # is a mutable argument OR if it is ambiguous. (We never explicitly
+                # see it in any schema.)
+                mutated = (True in mutable_set) or (tuple(mutable_set) == (None,))
+                edges[key].mutated = mutated
+
+        # Then handle deletions. Note that deleting a Tensor implicitly adds
+        # it as an input edge.
+        for i in subtree:
+            if i.typed[0] == _EventType.Allocation and i.typed[1].alloc_size < 0:
+                key = TensorKey.from_allocation(i.typed[1])
+                edge = edges[key]
+                assert key is None or edge.mutated is not None, f"Double delete: {key}"
+                edge.mutated = None
+                edge.input_version = self._graph.lookup(key) if key else -1
+
+        # And finally handle allocations. This step must be last, because the
+        # previous two steps optimistically add input edges.
+        for i in subtree:
+            if i.typed[0] == _EventType.Allocation and i.typed[1].alloc_size > 0:
+                edges[TensorKey.from_allocation(i.typed[1])].input_version = None
+
+        # We don't need to sort the inputs, but it makes debugging and unit tests nicer.
+        return dict(sorted((k, v) for k, v in edges.items() if k is not None))
+
+    @property
+    def inputs(self) -> Dict[TensorKey, Tuple[bool, int]]:
+        return {
+            # MyPy can't see through `is_allocation` to know that
+            # `v.input_version` is not None.
+            k: (bool(v.mutated), cast(int, v.input_version))
+            for k, v in self._edges.items()
+            if not v.is_allocation
+        }
+
+    @property
+    def outputs(self) -> Dict[TensorKey, int]:
+        return {
+            k: 0 if v.input_version is None else v.input_version + 1
+            for k, v in self._edges.items()
+            if (v.is_allocation and not v.is_deletion) or v.mutated
+        }
+
+    @property
+    def intermediates(self) -> Tuple[TensorKey, ...]:
+        return tuple(
+            k for k, v in self._edges.items() if v.is_allocation and v.is_deletion
+        )
+
+    @property
+    def start_time(self) -> int:
+        return self._event.start_time_ns
+
+
 class DataFlowGraph:
-    def __init__(self, tree: OpTree) -> None:
-        self._tree = tree
-        self._leaf_events = self._extract_leaf_events(tree)
+    def __init__(self, op_tree: OpTree) -> None:
+        self._op_tree = op_tree
+        self._leaf_events = self._extract_leaf_events(op_tree)
+        self._active_version: Dict[TensorKey, Optional[int]] = {}
+        self._flow_nodes = [DataFlowNode(e, self) for e in self.leaf_events]
+        self._flow_nodes.sort(key=lambda x: x.start_time)
+        self.validate()
+
+    @property
+    def flow_nodes(self) -> Tuple[DataFlowNode, ...]:
+        return tuple(self._flow_nodes)
+
+    def validate(self):
+        # Check that each (Tensor, version) pair has a unique creation node
+        outputs: Set[Tuple[TensorKey, int]] = set()
+        for node in self.flow_nodes:
+            node_outputs = set(node.outputs.items())
+            duplicates = outputs & node_outputs
+            assert not duplicates, f"{node._event.name} {node._edges} {duplicates}"
+            outputs |= node_outputs
+
+        # And check that `self._nodes` forms a valid topologically sorted DAG.
+        tensor_versions: Dict[TensorKey, int] = {}
+        for node in self.flow_nodes:
+            for key, (_, version) in node.inputs.items():
+                expected = tensor_versions.get(key, 0)
+                assert expected == version, (expected, version)
+
+            for key, version in node.outputs.items():
+                prior_version = tensor_versions.get(key, version)
+                assert version >= prior_version, (version, prior_version)
+                tensor_versions[key] = version
 
     @property
     def leaf_events(self) -> Tuple[_ProfilerEvent, ...]:
         return self._leaf_events
 
     @staticmethod
-    def _extract_leaf_events(tree: OpTree) -> Tuple[_ProfilerEvent, ...]:
+    def _extract_leaf_events(op_tree: OpTree) -> Tuple[_ProfilerEvent, ...]:
         """Partially traverse the op tree and extract top level ops.
 
         Consider the following code:
@@ -300,11 +461,25 @@ def children_fn(e: _ProfilerEvent):
 
             return e.children
 
-        for _ in tree.dfs(children_fn=children_fn):
+        for _ in op_tree.dfs(children_fn=children_fn):
             pass
 
         return tuple(sorted(leaf_events, key=lambda x: x.start_time_ns))
 
+    def lookup(self, key: TensorKey) -> int:
+        version = self._active_version.setdefault(key, 0)
+        assert version is not None
+        return version
+
+    def bump(self, key: TensorKey) -> None:
+        prior_version = self._active_version.get(key, None)
+        assert prior_version is not None
+        self._active_version[key] = prior_version + 1
+
+    def delete(self, key: TensorKey) -> None:
+        assert self._active_version.setdefault(key, 0) is not None
+        self._active_version[key] = None
+
 
 class MemoryProfile:
     def __init__(self, result: _ProfilerResult) -> None:

From 314fa8e977ecf419a781e5964f45521e2193950d Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 26 Nov 2022 10:33:11 -0800
Subject: [PATCH 1290/1922] [Profiler] Memory profiler part 6: Mark gradients
 and temporary intermediates. (#87566)

Semantic assignment will be built up as a series of passes which gradually pin down the regions of a trace. For this reason it is important to be very meticulous in the assignment of categories.

We begin with gradients as they are both straightforward to identify and foundational to subsequent analysis. There are two mechanisms that the profiler can use to tag gradients, each with their own advantages and limitations. The first is direct inspection of the op graph which is generic but predicated on certain features of the Autograd engine. (And therefore not necessarily exhaustive.) The second approach is direct instrumentation via the python tracer. This method relies requires that gradients be attached to an nn.Module parameter and can miss corner cases such as `set_to_none=True` due to the cache structure of the python tracer. Combined these two approaches provide very high coverage.

Temporaries are more straightforward; we can easily add them by trivial local inspection of a data flow node.

Because this is the first PR in the end-to-end section most of the code is building the scaffolding for category bookkeeping and unit testing. (The actual gradient extraction was covered in an earlier PR.)

Differential Revision: [D40220389](https://our.internmc.facebook.com/intern/diff/D40220389/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87566
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 100 ++++++++++++++++++++++++++
 torch/profiler/_memory_profiler.py    |  85 ++++++++++++++++++++++
 2 files changed, 185 insertions(+)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index ad65d6941023a..26c6f7a398fa3 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -773,5 +773,105 @@ def f_fwd_bwd(**kwargs):
         )
 
 
+@skipIfTorchDynamo("TorchDynamo changes Python calls that memory profiling relies on.")
+class TestMemoryProfilerE2E(TestCase):
+    @staticmethod
+    def _lookup_tensor_categories(
+        t: torch.Tensor, memory_profile: _memory_profiler.MemoryProfile
+    ) -> Dict[_memory_profiler.TensorAndID, Optional[_memory_profiler.Category]]:
+        storage = t.storage()
+        if storage is None:
+            raise ValueError("Cannot look up uninitialized Tensor.")
+
+        snapshot = memory_profile._category_snapshot()
+        ids = {
+            key.storage.allocation_id
+            for key, _ in snapshot
+            if key.storage.ptr == storage.data_ptr() and key.device == storage.device
+        }
+
+        return {
+            (key, version): category
+            for (key, version), category in memory_profile._category_snapshot().items()
+            #
+            # If a Tensor is live we want the most recent ID
+            if key.storage.allocation_id == max(ids | {-1})
+        }
+
+    def _run_and_check_gradients(self, inner_fn, model):
+
+        with profile() as prof:
+            inner_fn()
+
+        memory_profile = prof._memory_profile()
+        for p in model.parameters():
+            self.assertIsNotNone(p.grad)
+
+            p_grad_categories = self._lookup_tensor_categories(p.grad, memory_profile)
+            self.assertGreater(len(p_grad_categories), 0)
+            self.assertTrue(
+                all(
+                    category == _memory_profiler.Category.GRADIENT
+                    for category in p_grad_categories.values()
+                )
+            )
+
+    def test_gradients(self):
+        model = torch.nn.Sequential(
+            torch.nn.Linear(2, 2), ScaleLayer(), torch.nn.Linear(2, 1), ScaleLayer()
+        )
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+
+        def fwd_only():
+            _ = model(torch.ones((2, 2)))
+
+        def fwd_bwd_step():
+            y = model(torch.ones((2, 2)))
+            torch.nn.functional.mse_loss(y, torch.rand((2, 1))).backward()
+            optimizer.step()
+            optimizer.zero_grad()
+
+        # If we profile the first step then gradients will not have been
+        # created when we call `model.forward`, so if we don't call `.backward`
+        # then gradients are never created.
+        with self.assertRaises(AssertionError):
+            self._run_and_check_gradients(inner_fn=fwd_only, model=model)
+
+        # On the first step we must rely on `AccumulateGrad`, since gradients
+        # did not exist when `model.forward` was called.
+        self.assertTrue(all(p.grad is None for p in model.parameters()))
+        self._run_and_check_gradients(inner_fn=fwd_bwd_step, model=model)
+
+        # After one step the python tracer will also flag gradients.
+        self.assertTrue(not any(p.grad is None for p in model.parameters()))
+        self._run_and_check_gradients(inner_fn=fwd_bwd_step, model=model)
+
+        # The parameter gradients are not used but we still detect them with
+        # the python tracer.
+        self._run_and_check_gradients(inner_fn=fwd_only, model=model)
+
+    def test_gradients_set_to_none(self):
+        model = torch.nn.Sequential(torch.nn.Linear(2, 2), torch.nn.Linear(2, 1))
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+
+        def fwd_bwd_step():
+            for _ in range(3):
+                # zero grads at the start so gradients are still live to be
+                # checked.
+                optimizer.zero_grad(set_to_none=True)
+
+                y = model(torch.ones((2, 2)))
+                torch.nn.functional.mse_loss(y, torch.rand((2, 1))).backward()
+                optimizer.step()
+
+        fwd_bwd_step()
+        self.assertTrue(not any(p.grad is None for p in model.parameters()))
+        self._run_and_check_gradients(inner_fn=fwd_bwd_step, model=model)
+
+        optimizer.zero_grad(set_to_none=True)
+        self.assertTrue(all(p.grad is None for p in model.parameters()))
+        self._run_and_check_gradients(inner_fn=fwd_bwd_step, model=model)
+
+
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index f6e423950c3da..3d69a1e0f1bb4 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -1,5 +1,6 @@
 import collections
 import dataclasses
+import enum
 from typing import (
     Any,
     cast,
@@ -26,6 +27,13 @@
 )
 from torch.profiler import _utils
 
+TensorAndID = Tuple["TensorKey", int]
+
+
+class Category(enum.Enum):
+    TEMPORARY = enum.auto()
+    GRADIENT = enum.auto()
+
 
 @dataclasses.dataclass
 class _Storage:
@@ -481,7 +489,84 @@ def delete(self, key: TensorKey) -> None:
         self._active_version[key] = None
 
 
+@dataclasses.dataclass
+class CategoryElement:
+    by_id: Optional[Category] = None
+    by_key: Dict[TensorKey, Category] = dataclasses.field(default_factory=dict)
+    by_version: Dict[TensorAndID, Category] = dataclasses.field(default_factory=dict)
+
+    # Used by unit tests to check internals. (And consequently by
+    # MemoryProfile.lookup) This should not be used in any other capacity.
+    _by_id_keyset: Set[TensorKey] = dataclasses.field(default_factory=set)
+
+
+@dataclasses.dataclass
+class CategoryDict:
+    _values: DefaultDict[int, CategoryElement] = dataclasses.field(
+        default_factory=lambda: collections.defaultdict(CategoryElement)
+    )
+
+    def set_by_id(self, key: TensorKey, category: Category) -> None:
+        self._values[key.id].by_id = category
+        self._values[key.id]._by_id_keyset.add(key)
+
+    def set_by_key(self, key: TensorKey, category: Category) -> None:
+        self._values[key.id].by_key[key] = category
+
+    def set_by_version(self, key: TensorKey, version: int, category: Category) -> None:
+        self._values[key.id].by_version[(key, version)] = category
+
+    def setdefault_by_version(
+        self, key: TensorKey, version: int, category: Category
+    ) -> None:
+        self._values[key.id].by_version.setdefault((key, version), category)
+
+    def get(self, key: TensorKey, version: int) -> Optional[Category]:
+        element = self._values[key.id]
+        return (
+            element.by_id
+            or element.by_key.get(key, None)
+            or element.by_version.get((key, version), None)
+        )
+
+
 class MemoryProfile:
     def __init__(self, result: _ProfilerResult) -> None:
         self._op_tree = OpTree(result)
         self._data_flow_graph = DataFlowGraph(self._op_tree)
+        self._categories = CategoryDict()
+
+        self._set_gradients_and_temporaries()
+
+    def _category_snapshot(self) -> Dict[TensorAndID, Optional[Category]]:
+        all_tensor_versions: Set[TensorAndID] = set()
+
+        for node in self._data_flow_graph.flow_nodes:
+            all_tensor_versions.update(((k, v) for k, (_, v) in node.inputs.items()))
+            all_tensor_versions.update(((key, 0) for key in node.intermediates))
+            all_tensor_versions.update(node.outputs.items())
+
+        for i in self._categories._values.values():
+            all_tensor_versions.update(((key, 0) for key in i._by_id_keyset))
+
+        return {
+            (key, version): self._categories.get(key, version)
+            for key, version in sorted(all_tensor_versions)
+        }
+
+    def _set_gradients_and_temporaries(self) -> None:
+        """Mark Tensors which are unambiguous and simple to reason about."""
+
+        # Gradients are straightforward to detect. We directly check the
+        # `.grad` property in the Python tracer, and we can detect any new
+        # gradient Tensors from `AccumulateGrad` ops.
+        for event in self._op_tree.dfs():
+            for _, p_grad in extract_gradients(event):
+                self._categories.set_by_id(p_grad, Category.GRADIENT)
+
+        # Similarly, temporary Tensors are easy to identify and are useful to
+        # flag since they can make memory use "spikier" than one would
+        # otherwise expect.
+        for node in self._data_flow_graph.flow_nodes:
+            for i in node.intermediates:
+                self._categories.set_by_key(i, Category.TEMPORARY)

From b9f4e0bd886939de2b12304e3bf65dc5b25c40dd Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 26 Nov 2022 10:33:13 -0800
Subject: [PATCH 1291/1922] [Profiler] Memory profiler part 7: Mark inputs
 (#87567)

It is surprisingly difficult to identify the leaves of the data flow graph. The issue is that inputs and pre-existing parameters look identical until parameter identification takes place. It's not too bad for training since Autograd lets us differentiate between them however I still want the tool to do something reasonable in inference.

Some of this will be ameliorated when a later PR pulls in parameters from python tracing. The current approach is passable, but I will continue to mull over refinements.

Differential Revision: [D40220388](https://our.internmc.facebook.com/intern/diff/D40220388/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87567
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 64 ++++++++++++++++++++
 torch/profiler/_memory_profiler.py    | 87 +++++++++++++++++++++++++++
 2 files changed, 151 insertions(+)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 26c6f7a398fa3..ddb0e68d610bc 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -775,6 +775,7 @@ def f_fwd_bwd(**kwargs):
 
 @skipIfTorchDynamo("TorchDynamo changes Python calls that memory profiling relies on.")
 class TestMemoryProfilerE2E(TestCase):
+
     @staticmethod
     def _lookup_tensor_categories(
         t: torch.Tensor, memory_profile: _memory_profiler.MemoryProfile
@@ -872,6 +873,69 @@ def fwd_bwd_step():
         self.assertTrue(all(p.grad is None for p in model.parameters()))
         self._run_and_check_gradients(inner_fn=fwd_bwd_step, model=model)
 
+    def test_inputs_fwd(self):
+        model = torch.nn.Sequential(torch.nn.Linear(2, 2), torch.nn.Linear(2, 1))
+        inputs = [torch.ones((2, 2)) for _ in range(2)]
+
+        with profile() as prof:
+            # Inputs which were allocated before profiling began
+            for x in inputs:
+                _ = model(x)
+
+            # Inputs which were allocated after profiling began
+            for _ in range(2):
+                x = torch.ones((2, 2))
+                inputs.append(x)
+                _ = model(x)
+
+        # For now we can't make any meaningful statements without a backward
+        # pass. Here we simply ensure that passes don't generate false positive
+        # category classifications.
+        memory_profile = prof._memory_profile()
+        for x in inputs:
+            categories = self._lookup_tensor_categories(x, memory_profile)
+            self.assertGreater(len(categories), 0)
+            self.assertTrue(all(i is None for i in categories.values()))
+
+        snapshot = memory_profile._category_snapshot()
+        self.assertFalse({k: v for k, v in snapshot.items() if v})
+
+    def test_inputs_fwd_bwd(self):
+        model = torch.nn.Sequential(torch.nn.Linear(2, 2), torch.nn.Linear(2, 1))
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+        inputs_targets = [(torch.ones((2, 2)), torch.rand((2, 1))) for _ in range(2)]
+
+        def fwd_bwd_step(x, targets):
+            y = model(x)
+            torch.nn.functional.mse_loss(y, targets).backward()
+            optimizer.step()
+            optimizer.zero_grad()
+
+        with profile() as prof:
+            # Inputs which were allocated before profiling began
+            for x, targets in inputs_targets:
+                fwd_bwd_step(x, targets)
+
+            # Inputs which were allocated after profiling began
+            for _ in range(2):
+                x = torch.ones((2, 2))
+                targets = torch.rand((2, 1))
+                inputs_targets.append((x, targets))
+                fwd_bwd_step(x, targets)
+
+        memory_profile = prof._memory_profile()
+
+        def check(t):
+            categories = self._lookup_tensor_categories(t, memory_profile)
+            self.assertGreater(len(categories), 0)
+            self.assertTrue(
+                all(i == _memory_profiler.Category.INPUT for i in categories.values())
+            )
+
+        for x, targets in inputs_targets:
+            check(x)
+            check(targets)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index 3d69a1e0f1bb4..e13a83ecd756b 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -31,6 +31,7 @@
 
 
 class Category(enum.Enum):
+    INPUT = enum.auto()
     TEMPORARY = enum.auto()
     GRADIENT = enum.auto()
 
@@ -155,6 +156,15 @@ def extract_gradients(
                     yield TensorKey.from_tensor(p), p_grad_key
 
 
+def get_scopes(event: Optional[_ProfilerEvent]) -> Tuple[RecordScope, ...]:
+    scopes = []
+    while event:
+        if event.typed[0] == _EventType.TorchOp:
+            scopes.append(event.typed[1].scope)
+        event = event.parent
+    return tuple(scopes)
+
+
 class SchemaMatcher:
     """Lookup operator schema based on profiled name.
 
@@ -537,6 +547,10 @@ def __init__(self, result: _ProfilerResult) -> None:
         self._categories = CategoryDict()
 
         self._set_gradients_and_temporaries()
+        self._set_inputs()
+
+    def _is_gradient(self, *args, **kwargs) -> bool:
+        return self._categories.get(*args, **kwargs) == Category.GRADIENT
 
     def _category_snapshot(self) -> Dict[TensorAndID, Optional[Category]]:
         all_tensor_versions: Set[TensorAndID] = set()
@@ -570,3 +584,76 @@ def _set_gradients_and_temporaries(self) -> None:
         for node in self._data_flow_graph.flow_nodes:
             for i in node.intermediates:
                 self._categories.set_by_key(i, Category.TEMPORARY)
+
+    def _set_inputs(self) -> None:
+        """Mark inputs based on which Tensors are updated using gradients.
+
+        The process for differentiating between inputs and activations is more
+        involved. Most Tensors in a training loop depend on at least one
+        gradient: parameters depend on them through updates, and activations
+        and optimizer state depend on them transitively through parameters.
+        Critically, we do not need to know which Tensors are parameters to
+        apply this method; we can simply walk the data flow graph to build the
+        set of all values which depend on a gradient and then obtain the set
+        of inputs from the conjugate set.
+
+        There is, however, one hiccup. The first time we see a parameter is
+        generally on the forward pass of the first step. We know from
+        inspection of the data flow graph that v1 of that Tensor depends on
+        a gradient (provided we profile an optimizer step), but not v0. To
+        address this problem we weaken the definition of "depends on a
+        gradient" to "any version of this Tensor depends on a gradient",
+        which in turn strengthens the criteria for the input set enough to
+        filter the activations in the forward pass of the first step.
+
+        Note that this weakened definition requires us to loop over the data
+        flow graph multiple times because it allows dependency information to
+        flow backward through edges and removes the guarantee that nodes are
+        topologically sorted. (Or indeed, even that a valid topological order
+        exists.) Put another way, we have converted an acyclic data flow graph
+        into a cyclic graph and we are attempting to partition cycles involving
+        a gradient from the rest of the graph.
+        """
+
+        # Partition the graph based on gradient updates.
+        depends_on_gradient: Set[int] = set()
+        while True:
+            start_size = len(depends_on_gradient)
+            for node in self._data_flow_graph.flow_nodes:
+                ids = tuple(
+                    key.id
+                    for key, (_, version) in node.inputs.items()
+                    if self._is_gradient(key, version) or key.id in depends_on_gradient
+                )
+
+                if ids:
+                    depends_on_gradient.update(ids)
+                    depends_on_gradient.update(key.id for key in node.outputs)
+
+            # We are guaranteed to exit because there is a finite set of
+            # TensorAndID pairs. In practice we do not expect to loop more than
+            # three times: once to identify the core parameter update loop,
+            # once to fold the first step into that loop, and a third time
+            # where no new elements are added.
+            if len(depends_on_gradient) == start_size:
+                break
+
+        # We only want to annotate Tensors which actually contribute to the
+        # model calculation.
+        produces_gradient: Set[TensorAndID] = set()
+        for node in reversed(self._data_flow_graph.flow_nodes):
+            tensors = {(key, version) for key, (_, version) in node.inputs.items()}
+            tensors |= node.outputs.items()
+            if any(self._is_gradient(*i) or i in produces_gradient for i in tensors):
+                produces_gradient |= tensors
+
+        # Don't include Tensors created in the backward pass, as these are
+        # generally Autograd implementation details rather than proper inputs.
+        input_candidates = produces_gradient.copy()
+        for node in self._data_flow_graph.flow_nodes:
+            if RecordScope.BACKWARD_FUNCTION in get_scopes(node._event):
+                input_candidates -= set(node.outputs.items())
+
+        for key, version in input_candidates:
+            if key.id not in depends_on_gradient:
+                self._categories.setdefault_by_version(key, version, Category.INPUT)

From 701da84df791afe1500b9683385553c2dc7c6392 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 26 Nov 2022 10:33:14 -0800
Subject: [PATCH 1292/1922] [Profiler] Memory profiler part 8: Mark parameters.
 (#87568)

Following the pattern of earlier PRs, we use two methods to extract parameters. The primary one is the Python tracer; both nn.Module and optim.Optimizer collect parameters and in most cases that is sufficient. As a fallback we can analyze the data flow graph and deduce likely parameters based on gradient computation and updates.

Parameter identification has a circular interaction with input identification. Inputs are defined as "not part of the core forward-backward-update loop", but we need inputs for the parameter identification fallback to give us a proxy for the forward pass. Thus, we mark parameters from the python tracer which limits which Tensors get marked as inputs. While not necessary, it adds a bit of robustness. (As shown by the strengthening of the input unit tests.)

Differential Revision: [D40238619](https://our.internmc.facebook.com/intern/diff/D40238619/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87568
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 121 +++++++++++++---
 torch/profiler/_memory_profiler.py    | 191 ++++++++++++++++++++------
 2 files changed, 246 insertions(+), 66 deletions(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index ddb0e68d610bc..a8de29614c27b 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -48,6 +48,22 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x * self.scale
 
 
+class LazyLinear(torch.nn.Module):
+    def __init__(self, in_features: int, out_features: int):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+
+    def forward(self, x) -> torch.Tensor:
+        if getattr(self, "weight", None) is None:
+            self.weight = torch.nn.Parameter(
+                torch.empty((self.out_features, self.in_features))
+            )
+            self.bias = torch.nn.Parameter(torch.empty(self.out_features))
+
+        return torch.nn.functional.linear(x, self.weight, self.bias)
+
+
 @skipIfTorchDynamo("TorchDynamo changes Python calls that memory profiling relies on.")
 class TestIdentifyGradients(TestCase):
     def gradient_detected(
@@ -775,7 +791,6 @@ def f_fwd_bwd(**kwargs):
 
 @skipIfTorchDynamo("TorchDynamo changes Python calls that memory profiling relies on.")
 class TestMemoryProfilerE2E(TestCase):
-
     @staticmethod
     def _lookup_tensor_categories(
         t: torch.Tensor, memory_profile: _memory_profiler.MemoryProfile
@@ -799,25 +814,24 @@ def _lookup_tensor_categories(
             if key.storage.allocation_id == max(ids | {-1})
         }
 
-    def _run_and_check_gradients(self, inner_fn, model):
+    def _run_and_check_parameters_and_gradients(self, inner_fn, model):
 
         with profile() as prof:
             inner_fn()
 
         memory_profile = prof._memory_profile()
-        for p in model.parameters():
-            self.assertIsNotNone(p.grad)
 
-            p_grad_categories = self._lookup_tensor_categories(p.grad, memory_profile)
-            self.assertGreater(len(p_grad_categories), 0)
-            self.assertTrue(
-                all(
-                    category == _memory_profiler.Category.GRADIENT
-                    for category in p_grad_categories.values()
-                )
-            )
+        def assert_category(t: torch.Tensor, category: _memory_profiler.Category):
+            self.assertIsNotNone(t)
+            categories = self._lookup_tensor_categories(t, memory_profile)
+            self.assertGreater(len(categories), 0)
+            self.assertTrue(all(c == category for c in categories.values()), categories)
+
+        for p in model.parameters():
+            assert_category(p, _memory_profiler.Category.PARAMETER)
+            assert_category(p.grad, _memory_profiler.Category.GRADIENT)
 
-    def test_gradients(self):
+    def test_parameters_and_gradients(self):
         model = torch.nn.Sequential(
             torch.nn.Linear(2, 2), ScaleLayer(), torch.nn.Linear(2, 1), ScaleLayer()
         )
@@ -836,22 +850,22 @@ def fwd_bwd_step():
         # created when we call `model.forward`, so if we don't call `.backward`
         # then gradients are never created.
         with self.assertRaises(AssertionError):
-            self._run_and_check_gradients(inner_fn=fwd_only, model=model)
+            self._run_and_check_parameters_and_gradients(inner_fn=fwd_only, model=model)
 
         # On the first step we must rely on `AccumulateGrad`, since gradients
         # did not exist when `model.forward` was called.
         self.assertTrue(all(p.grad is None for p in model.parameters()))
-        self._run_and_check_gradients(inner_fn=fwd_bwd_step, model=model)
+        self._run_and_check_parameters_and_gradients(inner_fn=fwd_bwd_step, model=model)
 
         # After one step the python tracer will also flag gradients.
         self.assertTrue(not any(p.grad is None for p in model.parameters()))
-        self._run_and_check_gradients(inner_fn=fwd_bwd_step, model=model)
+        self._run_and_check_parameters_and_gradients(inner_fn=fwd_bwd_step, model=model)
 
         # The parameter gradients are not used but we still detect them with
         # the python tracer.
-        self._run_and_check_gradients(inner_fn=fwd_only, model=model)
+        self._run_and_check_parameters_and_gradients(inner_fn=fwd_only, model=model)
 
-    def test_gradients_set_to_none(self):
+    def test_parameters_and_gradients_set_to_none(self):
         model = torch.nn.Sequential(torch.nn.Linear(2, 2), torch.nn.Linear(2, 1))
         optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
 
@@ -867,16 +881,43 @@ def fwd_bwd_step():
 
         fwd_bwd_step()
         self.assertTrue(not any(p.grad is None for p in model.parameters()))
-        self._run_and_check_gradients(inner_fn=fwd_bwd_step, model=model)
+        self._run_and_check_parameters_and_gradients(inner_fn=fwd_bwd_step, model=model)
 
         optimizer.zero_grad(set_to_none=True)
         self.assertTrue(all(p.grad is None for p in model.parameters()))
-        self._run_and_check_gradients(inner_fn=fwd_bwd_step, model=model)
+        self._run_and_check_parameters_and_gradients(inner_fn=fwd_bwd_step, model=model)
 
     def test_inputs_fwd(self):
         model = torch.nn.Sequential(torch.nn.Linear(2, 2), torch.nn.Linear(2, 1))
         inputs = [torch.ones((2, 2)) for _ in range(2)]
 
+        with profile() as prof:
+            # Inputs which were allocated before profiling began
+            for x in inputs:
+                _ = model(x)
+
+            # Inputs which were allocated after profiling began
+            for _ in range(2):
+                x = torch.ones((2, 2))
+                inputs.append(x)
+                _ = model(x)
+
+        memory_profile = prof._memory_profile()
+        for x in inputs:
+            categories = self._lookup_tensor_categories(x, memory_profile)
+            self.assertGreater(len(categories), 0)
+            self.assertTrue(
+                all(i == _memory_profiler.Category.INPUT for i in categories.values()),
+                categories,
+            )
+
+        snapshot = memory_profile._category_snapshot()
+        self.assertTrue(_memory_profiler.Category.INPUT in snapshot.values())
+
+    def test_inputs_fwd_lazy(self):
+        model = torch.nn.Sequential(LazyLinear(2, 2), LazyLinear(2, 1))
+        inputs = [torch.ones((2, 2)) for _ in range(2)]
+
         with profile() as prof:
             # Inputs which were allocated before profiling began
             for x in inputs:
@@ -895,10 +936,10 @@ def test_inputs_fwd(self):
         for x in inputs:
             categories = self._lookup_tensor_categories(x, memory_profile)
             self.assertGreater(len(categories), 0)
-            self.assertTrue(all(i is None for i in categories.values()))
+            self.assertTrue(all(i is None for i in categories.values()), categories)
 
         snapshot = memory_profile._category_snapshot()
-        self.assertFalse({k: v for k, v in snapshot.items() if v})
+        self.assertFalse(_memory_profiler.Category.INPUT in snapshot.values())
 
     def test_inputs_fwd_bwd(self):
         model = torch.nn.Sequential(torch.nn.Linear(2, 2), torch.nn.Linear(2, 1))
@@ -936,6 +977,42 @@ def check(t):
             check(x)
             check(targets)
 
+    def test_lazily_initialized(self) -> None:
+        model = torch.nn.Sequential(
+            torch.nn.Linear(2, 2),
+            torch.nn.ReLU(),
+            LazyLinear(2, 2),
+            torch.nn.ReLU(),
+            torch.nn.Linear(2, 1),
+        )
+
+        self.assertEqual(len(list(model.parameters())), 4)
+
+        def inner_fn():
+            y = model(torch.ones((2, 2)))
+            torch.nn.functional.mse_loss(y, torch.rand((2, 1))).backward()
+            optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+            optimizer.step()
+            optimizer.zero_grad()
+
+        self._run_and_check_parameters_and_gradients(inner_fn=inner_fn, model=model)
+        self.assertEqual(len(list(model.parameters())), 6)
+
+    def test_manual_optimizer_step(self) -> None:
+        model = torch.nn.Sequential(torch.nn.Linear(2, 2), torch.nn.Linear(2, 1))
+
+        def inner_fn():
+            y = model(torch.ones((2, 2)))
+            torch.nn.functional.mse_loss(y, torch.rand((2, 1))).backward()
+
+            with torch.no_grad():
+                for p in model.parameters():
+                    grad = p.grad
+                    self.assertIsNotNone(grad)
+                    p.add_(grad, alpha=-0.1)
+
+        self._run_and_check_parameters_and_gradients(inner_fn=inner_fn, model=model)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index e13a83ecd756b..25107c5ae5dc4 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -34,6 +34,7 @@ class Category(enum.Enum):
     INPUT = enum.auto()
     TEMPORARY = enum.auto()
     GRADIENT = enum.auto()
+    PARAMETER = enum.auto()
 
 
 @dataclasses.dataclass
@@ -108,9 +109,9 @@ def _as_sortable(self) -> Tuple[int, int, str, int]:
         return self.id, self.storage.allocation_id, self.device.type, self.device.index
 
 
-def extract_gradients(
+def _extract_parameters_and_gradients(
     node: _ProfilerEvent,
-) -> Iterator[Tuple[Optional[TensorKey], TensorKey]]:
+) -> Iterator[Tuple[Optional[TensorKey], Optional[TensorKey]]]:
     children = node.children
 
     # AccumulateGrad is used in the Autograd engine to handle gradient updates.
@@ -132,9 +133,7 @@ def extract_gradients(
         and children[0].typed[1].inputs
         and isinstance(children[0].typed[1].inputs[0], _TensorMetadata)
     ):
-        key = TensorKey.from_tensor(children[0].typed[1].inputs[0])
-        if key:
-            yield None, key
+        yield None, TensorKey.from_tensor(children[0].typed[1].inputs[0])
 
     # We directly instrument `torch.nn.Module` and `torch.optim.Optimizer`
     # NOTE: The values captured by the python tracer are cached; they can be
@@ -145,15 +144,25 @@ def extract_gradients(
         assert typed_fields.module is None or typed_fields.optimizer is None
         if typed_fields.module is not None:
             for _, p, p_grad in typed_fields.module.parameters:
-                p_grad_key = TensorKey.from_tensor(p_grad)
-                if p_grad_key is not None:
-                    yield TensorKey.from_tensor(p), p_grad_key
+                yield TensorKey.from_tensor(p), TensorKey.from_tensor(p_grad)
 
         if typed_fields.optimizer is not None:
             for p, p_grad, _ in typed_fields.optimizer.parameters:
-                p_grad_key = TensorKey.from_tensor(p_grad)
-                if p_grad_key is not None:
-                    yield TensorKey.from_tensor(p), p_grad_key
+                yield TensorKey.from_tensor(p), TensorKey.from_tensor(p_grad)
+
+
+def extract_parameters(node: _ProfilerEvent) -> Iterator[TensorKey]:
+    for p, p_grad in _extract_parameters_and_gradients(node):
+        if p is not None:
+            yield p
+
+
+def extract_gradients(
+    node: _ProfilerEvent,
+) -> Iterator[Tuple[Optional[TensorKey], TensorKey]]:
+    for p, p_grad in _extract_parameters_and_gradients(node):
+        if p_grad is not None:
+            yield p, p_grad
 
 
 def get_scopes(event: Optional[_ProfilerEvent]) -> Tuple[RecordScope, ...]:
@@ -547,7 +556,9 @@ def __init__(self, result: _ProfilerResult) -> None:
         self._categories = CategoryDict()
 
         self._set_gradients_and_temporaries()
+        self._set_parameters_using_python_tracer()
         self._set_inputs()
+        self._set_parameters_using_data_flow()
 
     def _is_gradient(self, *args, **kwargs) -> bool:
         return self._categories.get(*args, **kwargs) == Category.GRADIENT
@@ -568,6 +579,41 @@ def _category_snapshot(self) -> Dict[TensorAndID, Optional[Category]]:
             for key, version in sorted(all_tensor_versions)
         }
 
+    def _any_version_depends_on_gradient(self) -> Set[int]:
+        """Extract IDs of Tensors which depend or will depend on a gradient.
+
+        Note that this weakened definition of "depends" requires us to loop
+        over the data flow graph multiple times because it allows dependency
+        information to flow backward through edges and removes the guarantee
+        that nodes are topologically sorted. (Or indeed, even that a valid
+        topological order exists.) Put another way, we have converted an
+        acyclic data flow graph into a cyclic graph and we are attempting to
+        partition cycles involving a gradient from the rest of the graph.
+        """
+        depends_on_gradient: Set[int] = set()
+        while True:
+            start_size = len(depends_on_gradient)
+            for node in self._data_flow_graph.flow_nodes:
+                ids = tuple(
+                    key.id
+                    for key, (_, version) in node.inputs.items()
+                    if self._categories.get(key, version)
+                    in (Category.GRADIENT, Category.PARAMETER)
+                    or key.id in depends_on_gradient
+                )
+
+                if ids:
+                    depends_on_gradient.update(ids)
+                    depends_on_gradient.update(key.id for key in node.outputs)
+
+            # We are guaranteed to exit because there is a finite set of
+            # TensorAndID pairs. In practice we do not expect to loop more than
+            # three times: once to identify the core parameter update loop,
+            # once to fold the first step into that loop, and a third time
+            # where no new elements are added.
+            if len(depends_on_gradient) == start_size:
+                return depends_on_gradient
+
     def _set_gradients_and_temporaries(self) -> None:
         """Mark Tensors which are unambiguous and simple to reason about."""
 
@@ -585,6 +631,12 @@ def _set_gradients_and_temporaries(self) -> None:
             for i in node.intermediates:
                 self._categories.set_by_key(i, Category.TEMPORARY)
 
+    def _set_parameters_using_python_tracer(self) -> None:
+        for event in self._op_tree.dfs():
+            for p in extract_parameters(event):
+                if p is not None:
+                    self._categories.set_by_id(p, Category.PARAMETER)
+
     def _set_inputs(self) -> None:
         """Mark inputs based on which Tensors are updated using gradients.
 
@@ -604,39 +656,13 @@ def _set_inputs(self) -> None:
         address this problem we weaken the definition of "depends on a
         gradient" to "any version of this Tensor depends on a gradient",
         which in turn strengthens the criteria for the input set enough to
-        filter the activations in the forward pass of the first step.
-
-        Note that this weakened definition requires us to loop over the data
-        flow graph multiple times because it allows dependency information to
-        flow backward through edges and removes the guarantee that nodes are
-        topologically sorted. (Or indeed, even that a valid topological order
-        exists.) Put another way, we have converted an acyclic data flow graph
-        into a cyclic graph and we are attempting to partition cycles involving
-        a gradient from the rest of the graph.
-        """
-
-        # Partition the graph based on gradient updates.
-        depends_on_gradient: Set[int] = set()
-        while True:
-            start_size = len(depends_on_gradient)
-            for node in self._data_flow_graph.flow_nodes:
-                ids = tuple(
-                    key.id
-                    for key, (_, version) in node.inputs.items()
-                    if self._is_gradient(key, version) or key.id in depends_on_gradient
-                )
-
-                if ids:
-                    depends_on_gradient.update(ids)
-                    depends_on_gradient.update(key.id for key in node.outputs)
+        filter the activations in the forward pass of the first step."""
 
-            # We are guaranteed to exit because there is a finite set of
-            # TensorAndID pairs. In practice we do not expect to loop more than
-            # three times: once to identify the core parameter update loop,
-            # once to fold the first step into that loop, and a third time
-            # where no new elements are added.
-            if len(depends_on_gradient) == start_size:
-                break
+        # All of this analysis is predicated on using at least one training
+        # step (or parameters from the python tracer) to partition the graph.
+        # Absent that we cannot determine which Tensors are inputs and which
+        # ones are part of the model.
+        depends_on_gradient = self._any_version_depends_on_gradient()
 
         # We only want to annotate Tensors which actually contribute to the
         # model calculation.
@@ -644,7 +670,11 @@ def _set_inputs(self) -> None:
         for node in reversed(self._data_flow_graph.flow_nodes):
             tensors = {(key, version) for key, (_, version) in node.inputs.items()}
             tensors |= node.outputs.items()
-            if any(self._is_gradient(*i) or i in produces_gradient for i in tensors):
+            if any(
+                self._categories.get(*i) in (Category.GRADIENT, Category.PARAMETER)
+                or i in produces_gradient
+                for i in tensors
+            ):
                 produces_gradient |= tensors
 
         # Don't include Tensors created in the backward pass, as these are
@@ -657,3 +687,76 @@ def _set_inputs(self) -> None:
         for key, version in input_candidates:
             if key.id not in depends_on_gradient:
                 self._categories.setdefault_by_version(key, version, Category.INPUT)
+
+    def _set_parameters_using_data_flow(self) -> None:
+        """Deduce which Tensors are parameters.
+
+        Consider the following code for the step of SGD with momentum
+        (nesterov=False), where `d_p` is the gradient of `param` and `buf` is
+        the momentum buffer.
+        ```
+          buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
+          d_p = buf
+          param.add_(d_p, alpha=-lr)
+        ```
+        Both `param` and `buf` take a gradient and perform an in-place update.
+
+        The python tracer will inspect calls to `nn.Module.forward` and
+        `optim.Optimizer.step` to extract parameter and optimizer state
+        respectively (including parameters), so this is generally a non-issue.
+
+        However as a fallback we can also exploit several properties of
+        parameters to distinguish them from other model state.
+
+        First, they are directly used in the forward pass. (At this point we
+        haven't established which parts of the graph correspond to the forward
+        pass but we can deduce enough to suffice.) Some mutable state such as
+        batch norm moving averages also contribute to the forward pass, but
+        optimizer state does not.
+
+        Second, a parameter is by definition used to compute at least one
+        gradient and depends on at least one gradient.
+        """
+        snapshot = self._category_snapshot()
+
+        # Determine which Tensors might be parameters based on forward pass
+        # data flow. Note this these are only candidates; we filter nodes that
+        # we know are part of the backward pass but that doesn't guarantee that
+        # they are part of the forward pass.
+        candidate_parameters: Set[TensorAndID] = set()
+        candidate_fwd_tensors: Set[TensorAndID] = {
+            i for i, category in snapshot.items() if category == Category.INPUT
+        }
+
+        for node in self._data_flow_graph.flow_nodes:
+            inputs = {(key, value) for key, (_, value) in node.inputs.items()}
+            if (
+                # Don't check nodes in the backward pass.
+                RecordScope.BACKWARD_FUNCTION not in get_scopes(node._event)
+                and not any(self._is_gradient(*i) for i in inputs)
+                and not any(self._is_gradient(*i) for i in node.outputs.items())
+                #
+                # and only check nodes which depend on an input.
+                and candidate_fwd_tensors.intersection(inputs)
+            ):
+                candidate_fwd_tensors |= node.outputs.items()
+                candidate_parameters |= inputs.difference(candidate_fwd_tensors)
+
+        # Require that each parameter eventually contributes to the value of a gradient
+        used_for_gradient: Set[TensorAndID] = set()
+        for node in reversed(self._data_flow_graph.flow_nodes):
+            if any(
+                self._is_gradient(*i) or i in used_for_gradient
+                for i in node.outputs.items()
+            ):
+                for key, (_, version) in node.inputs.items():
+                    used_for_gradient.add((key, version))
+        candidate_parameters.intersection_update(used_for_gradient)
+
+        # and depends on a gradient.
+        parameter_keys = {key.id for key, _ in candidate_parameters}
+        parameter_keys &= self._any_version_depends_on_gradient()
+
+        for key, _ in snapshot.keys():
+            if key.id in parameter_keys:
+                self._categories.set_by_id(key, Category.PARAMETER)

From 94f75898ba660f356ed19d72afc3851bd592bcaf Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 26 Nov 2022 10:33:16 -0800
Subject: [PATCH 1293/1922] [Profiler] E2E expecttests for category assignment
 (#88653)

Up until now the unit tests for category assignment have been narrowly scoped to specific checks on specific Tensors. However as we start to reach reasonable levels of category assignment it's useful to supplement those tests with higher level summary tests to inspect the larger graph and confirm that it makes sense. (It will also be necessary for some categories like activations where it is tedious to record all relevant Tensors.)

The general structure of these tests is to capture a model invocation with `__torch_dispatch__` and then cross reference those inputs and outputs with the categories assigned by the memory profiler.

Differential Revision: [D40868659](https://our.internmc.facebook.com/intern/diff/D40868659/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88653
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 402 +++++++++++++++++++++++++-
 1 file changed, 401 insertions(+), 1 deletion(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index a8de29614c27b..6819109acef42 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -6,9 +6,10 @@
 from typing import Callable, Dict, Iterator, List, Optional, Tuple
 
 import torch
-from torch._C._profiler import _EventType
+from torch._C._profiler import _EventType, _TensorMetadata
 from torch.profiler import _memory_profiler, _utils
 from torch.testing._internal.common_utils import run_tests, skipIfTorchDynamo, TestCase
+from torch.utils._pytree import tree_flatten
 
 
 profile = functools.partial(
@@ -64,6 +65,33 @@ def forward(self, x) -> torch.Tensor:
         return torch.nn.functional.linear(x, self.weight, self.bias)
 
 
+class RecordInputOutputDispatchMode(torch.utils._python_dispatch.TorchDispatchMode):
+    def __init__(self):
+        self.results = []
+
+    def mark_region(self, name: str):
+        self.results.append((name, (), ()))
+
+    @staticmethod
+    def flat_ids(args):
+        flat_args = tree_flatten(args)[0]
+        return tuple(
+            (t._cdata, t.storage().data_ptr())
+            for t in flat_args
+            if isinstance(t, torch.Tensor) and t.storage()
+        )
+
+    def __torch_dispatch__(self, func, types, args=..., kwargs=None):
+        args = args or []
+        kwargs = kwargs or {}
+        flat_inputs = self.flat_ids(args) + self.flat_ids(kwargs)
+        out = func(*args, **kwargs)
+        flat_outputs = self.flat_ids(out)
+        if (flat_inputs or flat_outputs) and "_record_function_enter" not in func.name():
+            self.results.append((func.name(), flat_inputs, flat_outputs))
+        return out
+
+
 @skipIfTorchDynamo("TorchDynamo changes Python calls that memory profiling relies on.")
 class TestIdentifyGradients(TestCase):
     def gradient_detected(
@@ -831,6 +859,61 @@ def assert_category(t: torch.Tensor, category: _memory_profiler.Category):
             assert_category(p, _memory_profiler.Category.PARAMETER)
             assert_category(p.grad, _memory_profiler.Category.GRADIENT)
 
+    def _run_and_format_categories(self, fn, indent=12):
+        """Generate summary of assigned categories for expecttest."""
+
+        # Use `__torch_dispatch__` to collect ground truth.
+        with RecordInputOutputDispatchMode() as record_ops, profile() as prof:
+            fn(lambda name: record_ops.mark_region(f"-- {name} ".ljust(105, "-")))
+
+        memory_profile = prof._memory_profile()
+        ptr_pair_to_key: Dict[Tuple[int, int], _memory_profiler.TensorKey] = {}
+        snapshot = memory_profile._category_snapshot()
+
+        # Build map from observed live Tensors to the memory profiler's
+        # TensorKey representation.
+        for op in memory_profile._op_tree.dfs():
+            if op.typed[0] == _EventType.TorchOp:
+                inputs = tree_flatten(op.typed[1].inputs)[0]
+                for t in (i for i in inputs if isinstance(i, _TensorMetadata)):
+                    key = _memory_profiler.TensorKey.from_tensor(t)
+                    if key:
+                        ptr_pair_to_key[(t.impl_ptr, t.storage_data_ptr)] = key
+
+        def format_categories(ptr_pair: int):
+            target_key = ptr_pair_to_key.get(ptr_pair, None)
+            if target_key is None:
+                return "???"
+
+            matches = tuple(
+                (version, category.name if category else "???")
+                for (key, version), category in snapshot.items()
+                if key == target_key
+            )
+            assert matches, "Failed to lookup Tensor"
+
+            # Deduplicate version bumps which don't change the category.
+            categories = [matches[0][1]]
+            for _, category in matches:
+                if category != categories[-1]:
+                    categories.append(category)
+
+            return f"{target_key.storage.allocation_id} ({','.join(categories)})"
+
+        out: List[str] = []
+        for name, inputs, outputs in record_ops.results:
+            if inputs or outputs:
+                # PyTorch ops
+                inputs_str = ", ".join(format_categories(i) for i in inputs)
+                outputs_str = ", ".join(format_categories(i) for i in outputs)
+                out.append(f"{name:<40} {inputs_str:<45} -> {outputs_str}")
+
+            else:
+                # Marked regions.
+                out.append(f"\n{name}")
+
+        return textwrap.indent("\n".join(out), " " * indent)
+
     def test_parameters_and_gradients(self):
         model = torch.nn.Sequential(
             torch.nn.Linear(2, 2), ScaleLayer(), torch.nn.Linear(2, 1), ScaleLayer()
@@ -1013,6 +1096,323 @@ def inner_fn():
 
         self._run_and_check_parameters_and_gradients(inner_fn=inner_fn, model=model)
 
+    def test_categories_e2e_simple_fwd(self) -> None:
+        w0 = torch.ones((1,), requires_grad=True)
+        w1 = torch.ones((1,), requires_grad=True)
+
+        def step_fn(_):
+            x = torch.ones((2, 2))
+            y = torch.cat([x * w0, x * w1], dim=1)
+
+        # NOTE: We expect that all unknown categories. This is simply a sanity
+        #       check to ensure that we do not over-label.
+        self.assertExpectedInline(
+            self._run_and_format_categories(step_fn),
+            """\
+            aten::ones                                                                             -> 1 (???)
+            aten::mul.Tensor                         1 (???), 2 (???)                              -> 3 (???)
+            aten::mul.Tensor                         1 (???), 4 (???)                              -> 5 (???)
+            aten::cat                                3 (???), 5 (???)                              -> ???""",
+        )
+
+    def test_categories_e2e_simple_fwd_bwd(self) -> None:
+        w0 = torch.ones((1,), requires_grad=True)
+        w1 = torch.ones((1,), requires_grad=True)
+
+        def step_fn(mark_region):
+            x = torch.ones((2, 2))
+            targets = torch.ones((2, 4))
+
+            mark_region("Forward & loss")
+            y = torch.cat([x * w0, x * w1], dim=1)
+            loss = torch.nn.functional.binary_cross_entropy_with_logits(y, targets)
+
+            mark_region("Backward")
+            loss.backward()
+
+        self.assertExpectedInline(
+            self._run_and_format_categories(step_fn),
+            """\
+            aten::ones                                                                             -> 1 (INPUT)
+            aten::ones                                                                             -> 2 (INPUT)
+
+            -- Forward & loss ---------------------------------------------------------------------------------------
+            aten::mul.Tensor                         1 (INPUT), 3 (INPUT)                          -> 4 (INPUT)
+            aten::mul.Tensor                         1 (INPUT), 5 (INPUT)                          -> 6 (INPUT)
+            aten::cat                                4 (INPUT), 6 (INPUT)                          -> 7 (INPUT)
+            aten::binary_cross_entropy_with_logits   7 (INPUT), 2 (INPUT)                          -> 13 (INPUT)
+
+            -- Backward ---------------------------------------------------------------------------------------------
+            aten::ones_like                          13 (INPUT)                                    -> 16 (INPUT)
+            aten::sigmoid                            7 (INPUT)                                     -> 17 (TEMPORARY)
+            aten::sub.Tensor                         17 (TEMPORARY), 2 (INPUT)                     -> 18 (TEMPORARY)
+            aten::mul.Tensor                         18 (TEMPORARY), 16 (INPUT)                    -> 19 (???)
+            aten::div_.Scalar                        19 (???)                                      -> 19 (???)
+            aten::slice.Tensor                       19 (???)                                      -> 19 (???)
+            aten::slice.Tensor                       19 (???)                                      -> 19 (???)
+            aten::mul.Tensor                         19 (???), 1 (INPUT)                           -> 22 (???)
+            aten::sum.dim_IntList                    22 (???)                                      -> 23 (GRADIENT)
+            aten::view                               23 (GRADIENT)                                 -> 23 (GRADIENT)
+            aten::detach                             23 (GRADIENT)                                 -> 23 (GRADIENT)
+            aten::detach                             23 (GRADIENT)                                 -> ???
+            aten::mul.Tensor                         19 (???), 1 (INPUT)                           -> 24 (???)
+            aten::sum.dim_IntList                    24 (???)                                      -> 25 (GRADIENT)
+            aten::view                               25 (GRADIENT)                                 -> 25 (GRADIENT)
+            aten::detach                             25 (GRADIENT)                                 -> 25 (GRADIENT)
+            aten::detach                             25 (GRADIENT)                                 -> ???""",
+        )
+
+    def test_categories_e2e_simple_fwd_bwd_step(self) -> None:
+        w0 = torch.ones((1,), requires_grad=True)
+        w1 = torch.ones((1,), requires_grad=True)
+        optimizer = torch.optim.SGD([w0, w1], lr=0.1)
+
+        def step_fn(mark_region):
+            x = torch.ones((2, 2))
+            targets = torch.ones((2, 4))
+
+            mark_region("Forward & loss")
+            y = torch.cat([x * w0, x * w1], dim=1)
+            loss = torch.nn.functional.binary_cross_entropy_with_logits(y, targets)
+
+            mark_region("Backward")
+            loss.backward()
+
+            mark_region("Optimizer")
+            optimizer.step()
+            optimizer.zero_grad()
+
+        self.assertExpectedInline(
+            self._run_and_format_categories(step_fn),
+            """\
+            aten::ones                                                                             -> 1 (INPUT)
+            aten::ones                                                                             -> 2 (INPUT)
+
+            -- Forward & loss ---------------------------------------------------------------------------------------
+            aten::mul.Tensor                         1 (INPUT), 3 (PARAMETER)                      -> 4 (???)
+            aten::mul.Tensor                         1 (INPUT), 5 (PARAMETER)                      -> 6 (???)
+            aten::cat                                4 (???), 6 (???)                              -> 7 (???)
+            aten::binary_cross_entropy_with_logits   7 (???), 2 (INPUT)                            -> 13 (???)
+
+            -- Backward ---------------------------------------------------------------------------------------------
+            aten::ones_like                          13 (???)                                      -> 16 (???)
+            aten::sigmoid                            7 (???)                                       -> 17 (TEMPORARY)
+            aten::sub.Tensor                         17 (TEMPORARY), 2 (INPUT)                     -> 18 (TEMPORARY)
+            aten::mul.Tensor                         18 (TEMPORARY), 16 (???)                      -> 19 (???)
+            aten::div_.Scalar                        19 (???)                                      -> 19 (???)
+            aten::slice.Tensor                       19 (???)                                      -> 19 (???)
+            aten::slice.Tensor                       19 (???)                                      -> 19 (???)
+            aten::mul.Tensor                         19 (???), 1 (INPUT)                           -> 22 (???)
+            aten::sum.dim_IntList                    22 (???)                                      -> 23 (GRADIENT)
+            aten::view                               23 (GRADIENT)                                 -> 23 (GRADIENT)
+            aten::detach                             23 (GRADIENT)                                 -> 23 (GRADIENT)
+            aten::detach                             23 (GRADIENT)                                 -> 23 (GRADIENT)
+            aten::mul.Tensor                         19 (???), 1 (INPUT)                           -> 24 (???)
+            aten::sum.dim_IntList                    24 (???)                                      -> 25 (GRADIENT)
+            aten::view                               25 (GRADIENT)                                 -> 25 (GRADIENT)
+            aten::detach                             25 (GRADIENT)                                 -> 25 (GRADIENT)
+            aten::detach                             25 (GRADIENT)                                 -> 25 (GRADIENT)
+
+            -- Optimizer --------------------------------------------------------------------------------------------
+            aten::add_.Tensor                        3 (PARAMETER), 25 (GRADIENT)                  -> 3 (PARAMETER)
+            aten::add_.Tensor                        5 (PARAMETER), 23 (GRADIENT)                  -> 5 (PARAMETER)
+            aten::zero_                              25 (GRADIENT)                                 -> 25 (GRADIENT)
+            aten::zero_                              23 (GRADIENT)                                 -> 23 (GRADIENT)""",
+        )
+
+    def test_categories_e2e_simple_module_fwd(self) -> None:
+        model = torch.nn.Linear(2, 4, bias=True)
+        self.assertExpectedInline(
+            self._run_and_format_categories(lambda _: model(torch.ones((2, 2)))),
+            """\
+            aten::ones                                                                             -> 1 (INPUT)
+            aten::t                                  2 (PARAMETER)                                 -> 2 (PARAMETER)
+            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (???)""",
+        )
+
+    def test_categories_e2e_simple_module_fwd_bwd(self) -> None:
+        model = torch.nn.Linear(2, 1, bias=True)
+
+        def step_fn(mark_region):
+            mark_region("Forward & loss")
+            loss = model(torch.ones((2, 2))).sum()
+
+            mark_region("Backward")
+            loss.backward()
+
+        self.assertExpectedInline(
+            self._run_and_format_categories(step_fn),
+            """\
+
+            -- Forward & loss ---------------------------------------------------------------------------------------
+            aten::ones                                                                             -> 1 (INPUT)
+            aten::t                                  2 (PARAMETER)                                 -> 2 (PARAMETER)
+            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (???)
+            aten::sum                                4 (???)                                       -> 5 (???)
+
+            -- Backward ---------------------------------------------------------------------------------------------
+            aten::ones_like                          5 (???)                                       -> 6 (???)
+            aten::expand                             6 (???)                                       -> 6 (???)
+            aten::t                                  6 (???)                                       -> 6 (???)
+            aten::mm                                 6 (???), 1 (INPUT)                            -> 7 (GRADIENT)
+            aten::t                                  7 (GRADIENT)                                  -> 7 (GRADIENT)
+            aten::sum.dim_IntList                    6 (???)                                       -> 9 (GRADIENT)
+            aten::view                               9 (GRADIENT)                                  -> 9 (GRADIENT)
+            aten::detach                             9 (GRADIENT)                                  -> 9 (GRADIENT)
+            aten::detach                             9 (GRADIENT)                                  -> ???
+            aten::t                                  7 (GRADIENT)                                  -> 7 (GRADIENT)
+            aten::detach                             7 (GRADIENT)                                  -> 7 (GRADIENT)
+            aten::detach                             7 (GRADIENT)                                  -> ???""",
+        )
+
+    def test_categories_e2e_simple_module_fwd_bwd_step(self) -> None:
+        model = torch.nn.Linear(2, 1, bias=True)
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
+
+        def step_fn(mark_region):
+            mark_region("Forward & loss")
+            loss = model(torch.ones((2, 2))).sum()
+
+            mark_region("Backward")
+            loss.backward()
+
+            mark_region("Optimizer")
+            optimizer.step()
+            optimizer.zero_grad()
+
+        self.assertExpectedInline(
+            self._run_and_format_categories(step_fn),
+            """\
+
+            -- Forward & loss ---------------------------------------------------------------------------------------
+            aten::ones                                                                             -> 1 (INPUT)
+            aten::t                                  2 (PARAMETER)                                 -> 2 (PARAMETER)
+            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (???)
+            aten::sum                                4 (???)                                       -> 5 (???)
+
+            -- Backward ---------------------------------------------------------------------------------------------
+            aten::ones_like                          5 (???)                                       -> 6 (???)
+            aten::expand                             6 (???)                                       -> 6 (???)
+            aten::t                                  6 (???)                                       -> 6 (???)
+            aten::mm                                 6 (???), 1 (INPUT)                            -> 7 (GRADIENT)
+            aten::t                                  7 (GRADIENT)                                  -> 7 (GRADIENT)
+            aten::sum.dim_IntList                    6 (???)                                       -> 9 (GRADIENT)
+            aten::view                               9 (GRADIENT)                                  -> 9 (GRADIENT)
+            aten::detach                             9 (GRADIENT)                                  -> 9 (GRADIENT)
+            aten::detach                             9 (GRADIENT)                                  -> 9 (GRADIENT)
+            aten::t                                  7 (GRADIENT)                                  -> 7 (GRADIENT)
+            aten::detach                             7 (GRADIENT)                                  -> 7 (GRADIENT)
+            aten::detach                             7 (GRADIENT)                                  -> 7 (GRADIENT)
+
+            -- Optimizer --------------------------------------------------------------------------------------------
+            aten::clone                              7 (GRADIENT)                                  -> 10 (???)
+            aten::detach                             10 (???)                                      -> 10 (???)
+            aten::detach                             10 (???)                                      -> 10 (???)
+            aten::add_.Tensor                        2 (PARAMETER), 10 (???)                       -> 2 (PARAMETER)
+            aten::clone                              9 (GRADIENT)                                  -> 11 (???)
+            aten::detach                             11 (???)                                      -> 11 (???)
+            aten::detach                             11 (???)                                      -> 11 (???)
+            aten::add_.Tensor                        3 (PARAMETER), 11 (???)                       -> 3 (PARAMETER)
+            aten::zero_                              7 (GRADIENT)                                  -> 7 (GRADIENT)
+            aten::zero_                              9 (GRADIENT)                                  -> 9 (GRADIENT)""",
+        )
+
+    def test_categories_e2e_sequential_fwd(self) -> None:
+        model = torch.nn.Sequential(
+            torch.nn.Linear(2, 4, bias=True),
+            torch.nn.ReLU(),
+            torch.nn.Linear(4, 4, bias=False),
+            torch.nn.Softmax(dim=1),
+        )
+        self.assertExpectedInline(
+            self._run_and_format_categories(lambda _: model(torch.ones((2, 2)))),
+            """\
+            aten::ones                                                                             -> 1 (INPUT)
+            aten::t                                  2 (PARAMETER)                                 -> 2 (PARAMETER)
+            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (???)
+            aten::relu                               4 (???)                                       -> 5 (???)
+            aten::detach                             5 (???)                                       -> ???
+            aten::t                                  6 (PARAMETER)                                 -> 6 (PARAMETER)
+            aten::mm                                 5 (???), 6 (PARAMETER)                        -> 7 (???)
+            aten::_softmax                           7 (???)                                       -> 8 (???)
+            aten::detach                             8 (???)                                       -> ???""",
+        )
+
+    def test_categories_e2e_sequential_fwd_bwd(self) -> None:
+        model = torch.nn.Sequential(
+            torch.nn.Linear(2, 4, bias=True),
+            torch.nn.ReLU(),
+            torch.nn.Linear(4, 4, bias=False),
+            torch.nn.Softmax(dim=1),
+        )
+
+        def step_fn(mark_region):
+            x = torch.ones((2, 2))
+            targets = torch.ones((2, 4))
+
+            mark_region("Forward")
+            y = model(x)
+
+            mark_region("Loss")
+            loss = torch.sum((y - targets) ** 2).mean()
+
+            mark_region("Backward")
+            loss.backward()
+
+        self.assertExpectedInline(
+            self._run_and_format_categories(step_fn),
+            """\
+            aten::ones                                                                             -> 1 (INPUT)
+            aten::ones                                                                             -> 2 (INPUT)
+
+            -- Forward ----------------------------------------------------------------------------------------------
+            aten::t                                  3 (PARAMETER)                                 -> 3 (PARAMETER)
+            aten::addmm                              4 (PARAMETER), 1 (INPUT), 3 (PARAMETER)       -> 5 (???)
+            aten::relu                               5 (???)                                       -> 6 (???)
+            aten::detach                             6 (???)                                       -> 6 (???)
+            aten::t                                  7 (PARAMETER)                                 -> 7 (PARAMETER)
+            aten::mm                                 6 (???), 7 (PARAMETER)                        -> 8 (???)
+            aten::_softmax                           8 (???)                                       -> 9 (???)
+            aten::detach                             9 (???)                                       -> 9 (???)
+
+            -- Loss -------------------------------------------------------------------------------------------------
+            aten::sub.Tensor                         9 (???), 2 (INPUT)                            -> 10 (???)
+            aten::pow.Tensor_Scalar                  10 (???)                                      -> 11 (???)
+            aten::sum                                11 (???)                                      -> 12 (???)
+            aten::mean                               12 (???)                                      -> 13 (???)
+
+            -- Backward ---------------------------------------------------------------------------------------------
+            aten::ones_like                          13 (???)                                      -> 16 (???)
+            aten::expand                             16 (???)                                      -> 16 (???)
+            aten::div.Scalar                         16 (???)                                      -> 19 (???)
+            aten::expand                             19 (???)                                      -> 19 (???)
+            aten::pow.Tensor_Scalar                  10 (???)                                      -> 20 (TEMPORARY)
+            aten::mul.Scalar                         20 (TEMPORARY)                                -> 23 (TEMPORARY)
+            aten::mul.Tensor                         19 (???), 23 (TEMPORARY)                      -> 24 (???)
+            aten::detach                             9 (???)                                       -> 9 (???)
+            aten::_softmax_backward_data             24 (???), 9 (???)                             -> 25 (???)
+            aten::t                                  25 (???)                                      -> 25 (???)
+            aten::mm                                 25 (???), 6 (???)                             -> 26 (GRADIENT)
+            aten::t                                  26 (GRADIENT)                                 -> 26 (GRADIENT)
+            aten::t                                  7 (PARAMETER)                                 -> 7 (PARAMETER)
+            aten::mm                                 25 (???), 7 (PARAMETER)                       -> 27 (???)
+            aten::t                                  26 (GRADIENT)                                 -> 26 (GRADIENT)
+            aten::detach                             26 (GRADIENT)                                 -> 26 (GRADIENT)
+            aten::detach                             26 (GRADIENT)                                 -> ???
+            aten::detach                             6 (???)                                       -> 6 (???)
+            aten::threshold_backward                 27 (???), 6 (???)                             -> 28 (???)
+            aten::t                                  28 (???)                                      -> 28 (???)
+            aten::mm                                 28 (???), 1 (INPUT)                           -> 29 (GRADIENT)
+            aten::t                                  29 (GRADIENT)                                 -> 29 (GRADIENT)
+            aten::sum.dim_IntList                    28 (???)                                      -> 30 (GRADIENT)
+            aten::view                               30 (GRADIENT)                                 -> 30 (GRADIENT)
+            aten::detach                             30 (GRADIENT)                                 -> 30 (GRADIENT)
+            aten::detach                             30 (GRADIENT)                                 -> ???
+            aten::t                                  29 (GRADIENT)                                 -> 29 (GRADIENT)
+            aten::detach                             29 (GRADIENT)                                 -> 29 (GRADIENT)
+            aten::detach                             29 (GRADIENT)                                 -> ???""",
+        )
+
 
 if __name__ == "__main__":
     run_tests()

From bfdf69a0fd432c692b977d8e5c330c6b89bff919 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sun, 27 Nov 2022 02:59:04 +0000
Subject: [PATCH 1294/1922] [vision hash update] update the pinned vision hash
 (#89692)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89692
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 96d764c7b3202..6874c288beca3 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-bfb474b9d3ffffec5c3a040c16bc77006f35a94e
+72686211e2a8b78e5a5dc8c28be34eb9cfcdad4c

From 552f030cd80c17d2648987c51066eb5bcc01b1b5 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxx@users.noreply.github.com>
Date: Sun, 27 Nov 2022 05:55:24 +0000
Subject: [PATCH 1295/1922] Let SyncBatchNorm fallback to BN if not using
 distributed training (#89706)

Fixes #63662
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89706
Approved by: https://github.com/soumith
---
 test/test_nn.py               |  7 ++++---
 torch/nn/modules/batchnorm.py | 11 ++++++-----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 8a05dc68bb7cf..552f299e4b8f4 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -5435,11 +5435,12 @@ def helper(self, size, dtype, mixed_dtype=False):
             helper(self, shape, torch.bfloat16, False)
             helper(self, shape, torch.bfloat16, True)
 
-    def test_batchnorm_non_contig_cpu(self):
+    @parametrize_test('bn_module', [torch.nn.BatchNorm2d, torch.nn.SyncBatchNorm])
+    def test_batchnorm_non_contig_cpu(self, bn_module):
         input = torch.arange(6, dtype=torch.float).reshape(1, 3, 2, 1).cpu()
         input = input.permute(0, 2, 1, 3)
 
-        bn = torch.nn.BatchNorm2d(2).cpu().float().eval()
+        bn = bn_module(2).cpu().float().eval()
         bn.weight.data.uniform_()
         bn.bias.data.uniform_()
 
@@ -5457,7 +5458,7 @@ def test_batchnorm_non_contig_cpu(self):
         input_bf = torch.arange(24, dtype=torch.bfloat16).reshape(1, 3, 2, 4)
         input_bf = input_bf.permute(0, 2, 1, 3)
         input_f = input_bf.float()
-        bn_mix = torch.nn.BatchNorm2d(2).float().eval()
+        bn_mix = bn_module(2).float().eval()
         ref_bn_f = deepcopy(bn_mix)
         out_bf = bn_mix(input_bf)
         ref_out_bf = ref_bn_f(input_f)
diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py
index 382accfef560b..6f078a8f592cd 100644
--- a/torch/nn/modules/batchnorm.py
+++ b/torch/nn/modules/batchnorm.py
@@ -681,10 +681,6 @@ def _check_non_zero_input_channels(self, input):
             )
 
     def forward(self, input: Tensor) -> Tensor:
-        # currently only GPU input is supported
-        if not input.is_cuda:
-            raise ValueError("SyncBatchNorm expected input tensor to be on GPU")
-
         self._check_input_dim(input)
         self._check_non_zero_input_channels(input)
 
@@ -727,8 +723,13 @@ def forward(self, input: Tensor) -> Tensor:
         )
 
         # Don't sync batchnorm stats in inference mode (model.eval()).
-        need_sync = (bn_training and self.training)
+        need_sync = (bn_training and self.training and
+                     torch.distributed.is_available() and torch.distributed.is_initialized())
         if need_sync:
+            # currently only GPU input is supported
+            if not input.is_cuda:
+                raise ValueError("SyncBatchNorm expected input tensor to be on GPU")
+
             process_group = torch.distributed.group.WORLD
             if self.process_group:
                 process_group = self.process_group

From 1def43b96f946d271fb1ae6baa991997413f700b Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 26 Nov 2022 10:33:18 -0800
Subject: [PATCH 1296/1922] [Profiler] Memory profiler part 9: Mark activations
 (#88924)

This is a fairly straightforward pass: start at inputs and flood fill until we reach the backward pass.

Differential Revision: [D40868662](https://our.internmc.facebook.com/intern/diff/D40868662/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88924
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 94 +++++++++++++--------------
 torch/profiler/_memory_profiler.py    | 21 ++++++
 2 files changed, 68 insertions(+), 47 deletions(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 6819109acef42..6e42f33d16db8 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -1189,16 +1189,16 @@ def step_fn(mark_region):
             aten::ones                                                                             -> 2 (INPUT)
 
             -- Forward & loss ---------------------------------------------------------------------------------------
-            aten::mul.Tensor                         1 (INPUT), 3 (PARAMETER)                      -> 4 (???)
-            aten::mul.Tensor                         1 (INPUT), 5 (PARAMETER)                      -> 6 (???)
-            aten::cat                                4 (???), 6 (???)                              -> 7 (???)
-            aten::binary_cross_entropy_with_logits   7 (???), 2 (INPUT)                            -> 13 (???)
+            aten::mul.Tensor                         1 (INPUT), 3 (PARAMETER)                      -> 4 (ACTIVATION)
+            aten::mul.Tensor                         1 (INPUT), 5 (PARAMETER)                      -> 6 (ACTIVATION)
+            aten::cat                                4 (ACTIVATION), 6 (ACTIVATION)                -> 7 (ACTIVATION)
+            aten::binary_cross_entropy_with_logits   7 (ACTIVATION), 2 (INPUT)                     -> 13 (ACTIVATION)
 
             -- Backward ---------------------------------------------------------------------------------------------
-            aten::ones_like                          13 (???)                                      -> 16 (???)
-            aten::sigmoid                            7 (???)                                       -> 17 (TEMPORARY)
+            aten::ones_like                          13 (ACTIVATION)                               -> 16 (ACTIVATION)
+            aten::sigmoid                            7 (ACTIVATION)                                -> 17 (TEMPORARY)
             aten::sub.Tensor                         17 (TEMPORARY), 2 (INPUT)                     -> 18 (TEMPORARY)
-            aten::mul.Tensor                         18 (TEMPORARY), 16 (???)                      -> 19 (???)
+            aten::mul.Tensor                         18 (TEMPORARY), 16 (ACTIVATION)               -> 19 (???)
             aten::div_.Scalar                        19 (???)                                      -> 19 (???)
             aten::slice.Tensor                       19 (???)                                      -> 19 (???)
             aten::slice.Tensor                       19 (???)                                      -> 19 (???)
@@ -1227,7 +1227,7 @@ def test_categories_e2e_simple_module_fwd(self) -> None:
             """\
             aten::ones                                                                             -> 1 (INPUT)
             aten::t                                  2 (PARAMETER)                                 -> 2 (PARAMETER)
-            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (???)""",
+            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (ACTIVATION)""",
         )
 
     def test_categories_e2e_simple_module_fwd_bwd(self) -> None:
@@ -1247,16 +1247,16 @@ def step_fn(mark_region):
             -- Forward & loss ---------------------------------------------------------------------------------------
             aten::ones                                                                             -> 1 (INPUT)
             aten::t                                  2 (PARAMETER)                                 -> 2 (PARAMETER)
-            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (???)
-            aten::sum                                4 (???)                                       -> 5 (???)
+            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (ACTIVATION)
+            aten::sum                                4 (ACTIVATION)                                -> 5 (ACTIVATION)
 
             -- Backward ---------------------------------------------------------------------------------------------
-            aten::ones_like                          5 (???)                                       -> 6 (???)
-            aten::expand                             6 (???)                                       -> 6 (???)
-            aten::t                                  6 (???)                                       -> 6 (???)
-            aten::mm                                 6 (???), 1 (INPUT)                            -> 7 (GRADIENT)
+            aten::ones_like                          5 (ACTIVATION)                                -> 6 (ACTIVATION)
+            aten::expand                             6 (ACTIVATION)                                -> 6 (ACTIVATION)
+            aten::t                                  6 (ACTIVATION)                                -> 6 (ACTIVATION)
+            aten::mm                                 6 (ACTIVATION), 1 (INPUT)                     -> 7 (GRADIENT)
             aten::t                                  7 (GRADIENT)                                  -> 7 (GRADIENT)
-            aten::sum.dim_IntList                    6 (???)                                       -> 9 (GRADIENT)
+            aten::sum.dim_IntList                    6 (ACTIVATION)                                -> 9 (GRADIENT)
             aten::view                               9 (GRADIENT)                                  -> 9 (GRADIENT)
             aten::detach                             9 (GRADIENT)                                  -> 9 (GRADIENT)
             aten::detach                             9 (GRADIENT)                                  -> ???
@@ -1287,16 +1287,16 @@ def step_fn(mark_region):
             -- Forward & loss ---------------------------------------------------------------------------------------
             aten::ones                                                                             -> 1 (INPUT)
             aten::t                                  2 (PARAMETER)                                 -> 2 (PARAMETER)
-            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (???)
-            aten::sum                                4 (???)                                       -> 5 (???)
+            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (ACTIVATION)
+            aten::sum                                4 (ACTIVATION)                                -> 5 (ACTIVATION)
 
             -- Backward ---------------------------------------------------------------------------------------------
-            aten::ones_like                          5 (???)                                       -> 6 (???)
-            aten::expand                             6 (???)                                       -> 6 (???)
-            aten::t                                  6 (???)                                       -> 6 (???)
-            aten::mm                                 6 (???), 1 (INPUT)                            -> 7 (GRADIENT)
+            aten::ones_like                          5 (ACTIVATION)                                -> 6 (ACTIVATION)
+            aten::expand                             6 (ACTIVATION)                                -> 6 (ACTIVATION)
+            aten::t                                  6 (ACTIVATION)                                -> 6 (ACTIVATION)
+            aten::mm                                 6 (ACTIVATION), 1 (INPUT)                     -> 7 (GRADIENT)
             aten::t                                  7 (GRADIENT)                                  -> 7 (GRADIENT)
-            aten::sum.dim_IntList                    6 (???)                                       -> 9 (GRADIENT)
+            aten::sum.dim_IntList                    6 (ACTIVATION)                                -> 9 (GRADIENT)
             aten::view                               9 (GRADIENT)                                  -> 9 (GRADIENT)
             aten::detach                             9 (GRADIENT)                                  -> 9 (GRADIENT)
             aten::detach                             9 (GRADIENT)                                  -> 9 (GRADIENT)
@@ -1329,13 +1329,13 @@ def test_categories_e2e_sequential_fwd(self) -> None:
             """\
             aten::ones                                                                             -> 1 (INPUT)
             aten::t                                  2 (PARAMETER)                                 -> 2 (PARAMETER)
-            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (???)
-            aten::relu                               4 (???)                                       -> 5 (???)
-            aten::detach                             5 (???)                                       -> ???
+            aten::addmm                              3 (PARAMETER), 1 (INPUT), 2 (PARAMETER)       -> 4 (ACTIVATION)
+            aten::relu                               4 (ACTIVATION)                                -> 5 (ACTIVATION)
+            aten::detach                             5 (ACTIVATION)                                -> ???
             aten::t                                  6 (PARAMETER)                                 -> 6 (PARAMETER)
-            aten::mm                                 5 (???), 6 (PARAMETER)                        -> 7 (???)
-            aten::_softmax                           7 (???)                                       -> 8 (???)
-            aten::detach                             8 (???)                                       -> ???""",
+            aten::mm                                 5 (ACTIVATION), 6 (PARAMETER)                 -> 7 (ACTIVATION)
+            aten::_softmax                           7 (ACTIVATION)                                -> 8 (ACTIVATION)
+            aten::detach                             8 (ACTIVATION)                                -> ???""",
         )
 
     def test_categories_e2e_sequential_fwd_bwd(self) -> None:
@@ -1367,40 +1367,40 @@ def step_fn(mark_region):
 
             -- Forward ----------------------------------------------------------------------------------------------
             aten::t                                  3 (PARAMETER)                                 -> 3 (PARAMETER)
-            aten::addmm                              4 (PARAMETER), 1 (INPUT), 3 (PARAMETER)       -> 5 (???)
-            aten::relu                               5 (???)                                       -> 6 (???)
-            aten::detach                             6 (???)                                       -> 6 (???)
+            aten::addmm                              4 (PARAMETER), 1 (INPUT), 3 (PARAMETER)       -> 5 (ACTIVATION)
+            aten::relu                               5 (ACTIVATION)                                -> 6 (ACTIVATION)
+            aten::detach                             6 (ACTIVATION)                                -> 6 (ACTIVATION)
             aten::t                                  7 (PARAMETER)                                 -> 7 (PARAMETER)
-            aten::mm                                 6 (???), 7 (PARAMETER)                        -> 8 (???)
-            aten::_softmax                           8 (???)                                       -> 9 (???)
-            aten::detach                             9 (???)                                       -> 9 (???)
+            aten::mm                                 6 (ACTIVATION), 7 (PARAMETER)                 -> 8 (ACTIVATION)
+            aten::_softmax                           8 (ACTIVATION)                                -> 9 (ACTIVATION)
+            aten::detach                             9 (ACTIVATION)                                -> 9 (ACTIVATION)
 
             -- Loss -------------------------------------------------------------------------------------------------
-            aten::sub.Tensor                         9 (???), 2 (INPUT)                            -> 10 (???)
-            aten::pow.Tensor_Scalar                  10 (???)                                      -> 11 (???)
-            aten::sum                                11 (???)                                      -> 12 (???)
-            aten::mean                               12 (???)                                      -> 13 (???)
+            aten::sub.Tensor                         9 (ACTIVATION), 2 (INPUT)                     -> 10 (ACTIVATION)
+            aten::pow.Tensor_Scalar                  10 (ACTIVATION)                               -> 11 (ACTIVATION)
+            aten::sum                                11 (ACTIVATION)                               -> 12 (ACTIVATION)
+            aten::mean                               12 (ACTIVATION)                               -> 13 (ACTIVATION)
 
             -- Backward ---------------------------------------------------------------------------------------------
-            aten::ones_like                          13 (???)                                      -> 16 (???)
-            aten::expand                             16 (???)                                      -> 16 (???)
-            aten::div.Scalar                         16 (???)                                      -> 19 (???)
+            aten::ones_like                          13 (ACTIVATION)                               -> 16 (ACTIVATION)
+            aten::expand                             16 (ACTIVATION)                               -> 16 (ACTIVATION)
+            aten::div.Scalar                         16 (ACTIVATION)                               -> 19 (???)
             aten::expand                             19 (???)                                      -> 19 (???)
-            aten::pow.Tensor_Scalar                  10 (???)                                      -> 20 (TEMPORARY)
+            aten::pow.Tensor_Scalar                  10 (ACTIVATION)                               -> 20 (TEMPORARY)
             aten::mul.Scalar                         20 (TEMPORARY)                                -> 23 (TEMPORARY)
             aten::mul.Tensor                         19 (???), 23 (TEMPORARY)                      -> 24 (???)
-            aten::detach                             9 (???)                                       -> 9 (???)
-            aten::_softmax_backward_data             24 (???), 9 (???)                             -> 25 (???)
+            aten::detach                             9 (ACTIVATION)                                -> 9 (ACTIVATION)
+            aten::_softmax_backward_data             24 (???), 9 (ACTIVATION)                      -> 25 (???)
             aten::t                                  25 (???)                                      -> 25 (???)
-            aten::mm                                 25 (???), 6 (???)                             -> 26 (GRADIENT)
+            aten::mm                                 25 (???), 6 (ACTIVATION)                      -> 26 (GRADIENT)
             aten::t                                  26 (GRADIENT)                                 -> 26 (GRADIENT)
             aten::t                                  7 (PARAMETER)                                 -> 7 (PARAMETER)
             aten::mm                                 25 (???), 7 (PARAMETER)                       -> 27 (???)
             aten::t                                  26 (GRADIENT)                                 -> 26 (GRADIENT)
             aten::detach                             26 (GRADIENT)                                 -> 26 (GRADIENT)
             aten::detach                             26 (GRADIENT)                                 -> ???
-            aten::detach                             6 (???)                                       -> 6 (???)
-            aten::threshold_backward                 27 (???), 6 (???)                             -> 28 (???)
+            aten::detach                             6 (ACTIVATION)                                -> 6 (ACTIVATION)
+            aten::threshold_backward                 27 (???), 6 (ACTIVATION)                      -> 28 (???)
             aten::t                                  28 (???)                                      -> 28 (???)
             aten::mm                                 28 (???), 1 (INPUT)                           -> 29 (GRADIENT)
             aten::t                                  29 (GRADIENT)                                 -> 29 (GRADIENT)
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index 25107c5ae5dc4..cf06345a9aa63 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -33,6 +33,7 @@
 class Category(enum.Enum):
     INPUT = enum.auto()
     TEMPORARY = enum.auto()
+    ACTIVATION = enum.auto()
     GRADIENT = enum.auto()
     PARAMETER = enum.auto()
 
@@ -559,6 +560,7 @@ def __init__(self, result: _ProfilerResult) -> None:
         self._set_parameters_using_python_tracer()
         self._set_inputs()
         self._set_parameters_using_data_flow()
+        self._set_activations()
 
     def _is_gradient(self, *args, **kwargs) -> bool:
         return self._categories.get(*args, **kwargs) == Category.GRADIENT
@@ -760,3 +762,22 @@ def _set_parameters_using_data_flow(self) -> None:
         for key, _ in snapshot.keys():
             if key.id in parameter_keys:
                 self._categories.set_by_id(key, Category.PARAMETER)
+
+    def _set_activations(self) -> None:
+        """Flood the graph to identify activations."""
+
+        required = {Category.INPUT, Category.ACTIVATION}
+        also_allowed = {Category.PARAMETER, Category.TEMPORARY}
+        for node in self._data_flow_graph.flow_nodes:
+            inputs = {(key, value) for key, (_, value) in node.inputs.items()}
+            input_categories = {self._categories.get(*i) for i in inputs}
+
+            if (
+                (input_categories & required)
+                and not (input_categories - (required | also_allowed))
+                #
+                # Stop filling when we reach the backward pass.
+                and RecordScope.BACKWARD_FUNCTION not in get_scopes(node._event)
+            ):
+                for i in node.outputs.items():
+                    self._categories.setdefault_by_version(*i, Category.ACTIVATION)

From 406fabff806ea1c1d30bacd92047a02272b57709 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 26 Nov 2022 10:33:19 -0800
Subject: [PATCH 1297/1922] [Profiler] Memory profiler part 10: Mark optimizer
 state (#88925)

This is also a fairly simple pass, since we're simply collecting values from the python tracer.

Differential Revision: [D40868664](https://our.internmc.facebook.com/intern/diff/D40868664/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88925
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 16 ++++++++--------
 torch/profiler/_memory_profiler.py    | 12 ++++++++++++
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 6e42f33d16db8..7fccfddf843a9 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -1305,14 +1305,14 @@ def step_fn(mark_region):
             aten::detach                             7 (GRADIENT)                                  -> 7 (GRADIENT)
 
             -- Optimizer --------------------------------------------------------------------------------------------
-            aten::clone                              7 (GRADIENT)                                  -> 10 (???)
-            aten::detach                             10 (???)                                      -> 10 (???)
-            aten::detach                             10 (???)                                      -> 10 (???)
-            aten::add_.Tensor                        2 (PARAMETER), 10 (???)                       -> 2 (PARAMETER)
-            aten::clone                              9 (GRADIENT)                                  -> 11 (???)
-            aten::detach                             11 (???)                                      -> 11 (???)
-            aten::detach                             11 (???)                                      -> 11 (???)
-            aten::add_.Tensor                        3 (PARAMETER), 11 (???)                       -> 3 (PARAMETER)
+            aten::clone                              7 (GRADIENT)                                  -> 10 (OPTIMIZER_STATE)
+            aten::detach                             10 (OPTIMIZER_STATE)                          -> 10 (OPTIMIZER_STATE)
+            aten::detach                             10 (OPTIMIZER_STATE)                          -> 10 (OPTIMIZER_STATE)
+            aten::add_.Tensor                        2 (PARAMETER), 10 (OPTIMIZER_STATE)           -> 2 (PARAMETER)
+            aten::clone                              9 (GRADIENT)                                  -> 11 (OPTIMIZER_STATE)
+            aten::detach                             11 (OPTIMIZER_STATE)                          -> 11 (OPTIMIZER_STATE)
+            aten::detach                             11 (OPTIMIZER_STATE)                          -> 11 (OPTIMIZER_STATE)
+            aten::add_.Tensor                        3 (PARAMETER), 11 (OPTIMIZER_STATE)           -> 3 (PARAMETER)
             aten::zero_                              7 (GRADIENT)                                  -> 7 (GRADIENT)
             aten::zero_                              9 (GRADIENT)                                  -> 9 (GRADIENT)""",
         )
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index cf06345a9aa63..7d74a76126291 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -1,6 +1,7 @@
 import collections
 import dataclasses
 import enum
+import itertools as it
 from typing import (
     Any,
     cast,
@@ -36,6 +37,7 @@ class Category(enum.Enum):
     ACTIVATION = enum.auto()
     GRADIENT = enum.auto()
     PARAMETER = enum.auto()
+    OPTIMIZER_STATE = enum.auto()
 
 
 @dataclasses.dataclass
@@ -561,6 +563,7 @@ def __init__(self, result: _ProfilerResult) -> None:
         self._set_inputs()
         self._set_parameters_using_data_flow()
         self._set_activations()
+        self._set_optimizer_state()
 
     def _is_gradient(self, *args, **kwargs) -> bool:
         return self._categories.get(*args, **kwargs) == Category.GRADIENT
@@ -781,3 +784,12 @@ def _set_activations(self) -> None:
             ):
                 for i in node.outputs.items():
                     self._categories.setdefault_by_version(*i, Category.ACTIVATION)
+
+    def _set_optimizer_state(self) -> None:
+        for event in self._op_tree.dfs():
+            if event.typed[0] == _EventType.PyCall and event.typed[1].optimizer:
+                parameters = event.typed[1].optimizer.parameters
+                for _, t in it.chain(*[state for _, _, state in parameters]):
+                    key = TensorKey.from_tensor(t)
+                    if key is not None:
+                        self._categories.set_by_id(key, Category.OPTIMIZER_STATE)

From 68836716136564c4452b688b642589b03cc6af13 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Sat, 26 Nov 2022 10:33:21 -0800
Subject: [PATCH 1298/1922] [Profiler] Memory profiler part 11: Mark tensors
 created in the backward pass which don't correspond to parameters. (#88926)

There are various Tensors created in the backward pass which do not correspond to parameters. We don't want to mark these as gradients, but we do still want to convey as much information as possible. Thus, this PR introduces an AUTOGRAD_DETAIL category. (Which can be grouped with GRADIENT in visualization if one wishes to take a coarse grained view of the world.)

Differential Revision: [D40868661](https://our.internmc.facebook.com/intern/diff/D40868661/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88926
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 54 +++++++++++++--------------
 torch/profiler/_memory_profiler.py    | 12 ++++++
 2 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 7fccfddf843a9..01f2263807d34 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -1146,17 +1146,17 @@ def step_fn(mark_region):
             aten::ones_like                          13 (INPUT)                                    -> 16 (INPUT)
             aten::sigmoid                            7 (INPUT)                                     -> 17 (TEMPORARY)
             aten::sub.Tensor                         17 (TEMPORARY), 2 (INPUT)                     -> 18 (TEMPORARY)
-            aten::mul.Tensor                         18 (TEMPORARY), 16 (INPUT)                    -> 19 (???)
-            aten::div_.Scalar                        19 (???)                                      -> 19 (???)
-            aten::slice.Tensor                       19 (???)                                      -> 19 (???)
-            aten::slice.Tensor                       19 (???)                                      -> 19 (???)
-            aten::mul.Tensor                         19 (???), 1 (INPUT)                           -> 22 (???)
-            aten::sum.dim_IntList                    22 (???)                                      -> 23 (GRADIENT)
+            aten::mul.Tensor                         18 (TEMPORARY), 16 (INPUT)                    -> 19 (AUTOGRAD_DETAIL)
+            aten::div_.Scalar                        19 (AUTOGRAD_DETAIL)                          -> 19 (AUTOGRAD_DETAIL)
+            aten::slice.Tensor                       19 (AUTOGRAD_DETAIL)                          -> 19 (AUTOGRAD_DETAIL)
+            aten::slice.Tensor                       19 (AUTOGRAD_DETAIL)                          -> 19 (AUTOGRAD_DETAIL)
+            aten::mul.Tensor                         19 (AUTOGRAD_DETAIL), 1 (INPUT)               -> 22 (AUTOGRAD_DETAIL)
+            aten::sum.dim_IntList                    22 (AUTOGRAD_DETAIL)                          -> 23 (GRADIENT)
             aten::view                               23 (GRADIENT)                                 -> 23 (GRADIENT)
             aten::detach                             23 (GRADIENT)                                 -> 23 (GRADIENT)
             aten::detach                             23 (GRADIENT)                                 -> ???
-            aten::mul.Tensor                         19 (???), 1 (INPUT)                           -> 24 (???)
-            aten::sum.dim_IntList                    24 (???)                                      -> 25 (GRADIENT)
+            aten::mul.Tensor                         19 (AUTOGRAD_DETAIL), 1 (INPUT)               -> 24 (AUTOGRAD_DETAIL)
+            aten::sum.dim_IntList                    24 (AUTOGRAD_DETAIL)                          -> 25 (GRADIENT)
             aten::view                               25 (GRADIENT)                                 -> 25 (GRADIENT)
             aten::detach                             25 (GRADIENT)                                 -> 25 (GRADIENT)
             aten::detach                             25 (GRADIENT)                                 -> ???""",
@@ -1198,17 +1198,17 @@ def step_fn(mark_region):
             aten::ones_like                          13 (ACTIVATION)                               -> 16 (ACTIVATION)
             aten::sigmoid                            7 (ACTIVATION)                                -> 17 (TEMPORARY)
             aten::sub.Tensor                         17 (TEMPORARY), 2 (INPUT)                     -> 18 (TEMPORARY)
-            aten::mul.Tensor                         18 (TEMPORARY), 16 (ACTIVATION)               -> 19 (???)
-            aten::div_.Scalar                        19 (???)                                      -> 19 (???)
-            aten::slice.Tensor                       19 (???)                                      -> 19 (???)
-            aten::slice.Tensor                       19 (???)                                      -> 19 (???)
-            aten::mul.Tensor                         19 (???), 1 (INPUT)                           -> 22 (???)
-            aten::sum.dim_IntList                    22 (???)                                      -> 23 (GRADIENT)
+            aten::mul.Tensor                         18 (TEMPORARY), 16 (ACTIVATION)               -> 19 (AUTOGRAD_DETAIL)
+            aten::div_.Scalar                        19 (AUTOGRAD_DETAIL)                          -> 19 (AUTOGRAD_DETAIL)
+            aten::slice.Tensor                       19 (AUTOGRAD_DETAIL)                          -> 19 (AUTOGRAD_DETAIL)
+            aten::slice.Tensor                       19 (AUTOGRAD_DETAIL)                          -> 19 (AUTOGRAD_DETAIL)
+            aten::mul.Tensor                         19 (AUTOGRAD_DETAIL), 1 (INPUT)               -> 22 (AUTOGRAD_DETAIL)
+            aten::sum.dim_IntList                    22 (AUTOGRAD_DETAIL)                          -> 23 (GRADIENT)
             aten::view                               23 (GRADIENT)                                 -> 23 (GRADIENT)
             aten::detach                             23 (GRADIENT)                                 -> 23 (GRADIENT)
             aten::detach                             23 (GRADIENT)                                 -> 23 (GRADIENT)
-            aten::mul.Tensor                         19 (???), 1 (INPUT)                           -> 24 (???)
-            aten::sum.dim_IntList                    24 (???)                                      -> 25 (GRADIENT)
+            aten::mul.Tensor                         19 (AUTOGRAD_DETAIL), 1 (INPUT)               -> 24 (AUTOGRAD_DETAIL)
+            aten::sum.dim_IntList                    24 (AUTOGRAD_DETAIL)                          -> 25 (GRADIENT)
             aten::view                               25 (GRADIENT)                                 -> 25 (GRADIENT)
             aten::detach                             25 (GRADIENT)                                 -> 25 (GRADIENT)
             aten::detach                             25 (GRADIENT)                                 -> 25 (GRADIENT)
@@ -1384,27 +1384,27 @@ def step_fn(mark_region):
             -- Backward ---------------------------------------------------------------------------------------------
             aten::ones_like                          13 (ACTIVATION)                               -> 16 (ACTIVATION)
             aten::expand                             16 (ACTIVATION)                               -> 16 (ACTIVATION)
-            aten::div.Scalar                         16 (ACTIVATION)                               -> 19 (???)
-            aten::expand                             19 (???)                                      -> 19 (???)
+            aten::div.Scalar                         16 (ACTIVATION)                               -> 19 (AUTOGRAD_DETAIL)
+            aten::expand                             19 (AUTOGRAD_DETAIL)                          -> 19 (AUTOGRAD_DETAIL)
             aten::pow.Tensor_Scalar                  10 (ACTIVATION)                               -> 20 (TEMPORARY)
             aten::mul.Scalar                         20 (TEMPORARY)                                -> 23 (TEMPORARY)
-            aten::mul.Tensor                         19 (???), 23 (TEMPORARY)                      -> 24 (???)
+            aten::mul.Tensor                         19 (AUTOGRAD_DETAIL), 23 (TEMPORARY)          -> 24 (AUTOGRAD_DETAIL)
             aten::detach                             9 (ACTIVATION)                                -> 9 (ACTIVATION)
-            aten::_softmax_backward_data             24 (???), 9 (ACTIVATION)                      -> 25 (???)
-            aten::t                                  25 (???)                                      -> 25 (???)
-            aten::mm                                 25 (???), 6 (ACTIVATION)                      -> 26 (GRADIENT)
+            aten::_softmax_backward_data             24 (AUTOGRAD_DETAIL), 9 (ACTIVATION)          -> 25 (AUTOGRAD_DETAIL)
+            aten::t                                  25 (AUTOGRAD_DETAIL)                          -> 25 (AUTOGRAD_DETAIL)
+            aten::mm                                 25 (AUTOGRAD_DETAIL), 6 (ACTIVATION)          -> 26 (GRADIENT)
             aten::t                                  26 (GRADIENT)                                 -> 26 (GRADIENT)
             aten::t                                  7 (PARAMETER)                                 -> 7 (PARAMETER)
-            aten::mm                                 25 (???), 7 (PARAMETER)                       -> 27 (???)
+            aten::mm                                 25 (AUTOGRAD_DETAIL), 7 (PARAMETER)           -> 27 (AUTOGRAD_DETAIL)
             aten::t                                  26 (GRADIENT)                                 -> 26 (GRADIENT)
             aten::detach                             26 (GRADIENT)                                 -> 26 (GRADIENT)
             aten::detach                             26 (GRADIENT)                                 -> ???
             aten::detach                             6 (ACTIVATION)                                -> 6 (ACTIVATION)
-            aten::threshold_backward                 27 (???), 6 (ACTIVATION)                      -> 28 (???)
-            aten::t                                  28 (???)                                      -> 28 (???)
-            aten::mm                                 28 (???), 1 (INPUT)                           -> 29 (GRADIENT)
+            aten::threshold_backward                 27 (AUTOGRAD_DETAIL), 6 (ACTIVATION)          -> 28 (AUTOGRAD_DETAIL)
+            aten::t                                  28 (AUTOGRAD_DETAIL)                          -> 28 (AUTOGRAD_DETAIL)
+            aten::mm                                 28 (AUTOGRAD_DETAIL), 1 (INPUT)               -> 29 (GRADIENT)
             aten::t                                  29 (GRADIENT)                                 -> 29 (GRADIENT)
-            aten::sum.dim_IntList                    28 (???)                                      -> 30 (GRADIENT)
+            aten::sum.dim_IntList                    28 (AUTOGRAD_DETAIL)                          -> 30 (GRADIENT)
             aten::view                               30 (GRADIENT)                                 -> 30 (GRADIENT)
             aten::detach                             30 (GRADIENT)                                 -> 30 (GRADIENT)
             aten::detach                             30 (GRADIENT)                                 -> ???
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index 7d74a76126291..2c5684b64dbfc 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -36,6 +36,7 @@ class Category(enum.Enum):
     TEMPORARY = enum.auto()
     ACTIVATION = enum.auto()
     GRADIENT = enum.auto()
+    AUTOGRAD_DETAIL = enum.auto()
     PARAMETER = enum.auto()
     OPTIMIZER_STATE = enum.auto()
 
@@ -564,6 +565,7 @@ def __init__(self, result: _ProfilerResult) -> None:
         self._set_parameters_using_data_flow()
         self._set_activations()
         self._set_optimizer_state()
+        self._set_autograd_detail()
 
     def _is_gradient(self, *args, **kwargs) -> bool:
         return self._categories.get(*args, **kwargs) == Category.GRADIENT
@@ -793,3 +795,13 @@ def _set_optimizer_state(self) -> None:
                     key = TensorKey.from_tensor(t)
                     if key is not None:
                         self._categories.set_by_id(key, Category.OPTIMIZER_STATE)
+
+    def _set_autograd_detail(self):
+        prior = {None, Category.AUTOGRAD_DETAIL}
+        for node in self._data_flow_graph.flow_nodes:
+            if RecordScope.BACKWARD_FUNCTION in get_scopes(node._event):
+                for key, version in node.outputs.items():
+                    if version == 0 or self._categories.get(key, version - 1) in prior:
+                        self._categories.setdefault_by_version(
+                            key, version, Category.AUTOGRAD_DETAIL
+                        )

From 57d59db03efcfeb5a811b71c31603ab0ddccd5cd Mon Sep 17 00:00:00 2001
From: Horace He <chilli@fb.com>
Date: Thu, 24 Nov 2022 02:17:37 +0000
Subject: [PATCH 1299/1922] Don't allow recomputing a node that *must* be
 materialized in the backwards pass (#89171)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89171
Approved by: https://github.com/ngimel
---
 functorch/_src/config.py           |  6 +++
 functorch/_src/partitioners.py     | 39 +++++++++++++---
 test/functorch/test_aotdispatch.py |  8 ----
 test/inductor/test_perf.py         | 72 +++++++++++++++++++++++++++++-
 4 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/functorch/_src/config.py b/functorch/_src/config.py
index 2dacdd38fa37c..39cf8946e8d67 100644
--- a/functorch/_src/config.py
+++ b/functorch/_src/config.py
@@ -25,3 +25,9 @@
 use_dynamic_shapes = os.getenv('AOT_DYNAMIC_SHAPES', False)
 
 static_weight_shapes = True
+
+# Applies CSE to the graph before partitioning
+cse = True
+
+# Restricts the amount of computation AOTAutograd can do.
+max_dist_from_bw = 5
diff --git a/functorch/_src/partitioners.py b/functorch/_src/partitioners.py
index 712c9a063eaf6..af8db94edf4a5 100644
--- a/functorch/_src/partitioners.py
+++ b/functorch/_src/partitioners.py
@@ -286,8 +286,9 @@ def min_cut_rematerialization_partition(
     fx_g = joint_module.graph
 
     #  add the CSE pass
-    cse_graph = fx_graph_cse(fx_g)
-    joint_module.graph = cse_graph
+    if config.cse:
+        cse_graph = fx_graph_cse(fx_g)
+        joint_module.graph = cse_graph
     full_bw_graph = joint_module.graph
 
     name_to_node = {}
@@ -340,11 +341,14 @@ def is_tensor_node(x):
     prims = torch.ops.prims
 
     # compiler == "nvfuser" is the default set of recomputable ops
-    default_recomputable_ops = [aten.add, aten.sub, aten.div, aten.atan2, aten.mul, aten.max, aten.min, aten.pow, aten.remainder, aten.fmod, aten.__and__, aten.__or__, aten.__xor__, aten.__lshift__, aten.__rshift__, aten.eq, aten.ne, aten.ge, aten.gt, aten.le, aten.lt, aten.abs, aten.bitwise_not, aten.ceil, aten.floor, aten.frac, aten.neg, aten.relu, aten.round, aten.silu, aten.trunc, aten.log, aten.log10, aten.log1p, aten.log2, aten.lgamma, aten.exp, aten.expm1, aten.erf, aten.erfc, aten.cos, aten.acos, aten.cosh, aten.sin, aten.asin, aten.sinh, aten.tan, aten.atan, aten.tanh, aten.atanh, aten.sqrt, aten.rsqrt, aten.reciprocal, aten.sigmoid, aten.softplus, aten.threshold, aten.threshold_backward, aten.clamp, aten.where, aten.lerp, aten.addcmul, aten.gelu, aten.gelu_backward, aten.alias, aten.sum, aten.mean, aten._grad_sum_to_size, aten.sum_to_size, aten.amax, aten.to, aten.type_as, operator.getitem, aten.squeeze, aten.unsqueeze, aten.rsub, aten._to_copy]  # noqa: E501
+    default_recomputable_ops = [aten.add, aten.sub, aten.div, aten.atan2, aten.mul, aten.max, aten.min, aten.pow, aten.remainder, aten.fmod, aten.__and__, aten.__or__, aten.__xor__, aten.__lshift__, aten.__rshift__, aten.eq, aten.ne, aten.ge, aten.gt, aten.le, aten.lt, aten.abs, aten.bitwise_not, aten.ceil, aten.floor, aten.frac, aten.neg, aten.relu, aten.round, aten.silu, aten.trunc, aten.log, aten.log10, aten.log1p, aten.log2, aten.lgamma, aten.exp, aten.expm1, aten.erf, aten.erfc, aten.cos, aten.acos, aten.cosh, aten.sin, aten.asin, aten.sinh, aten.tan, aten.atan, aten.tanh, aten.atanh, aten.sqrt, aten.rsqrt, aten.reciprocal, aten.sigmoid, aten.softplus, aten.threshold, aten.threshold_backward, aten.clamp, aten.where, aten.lerp, aten.addcmul, aten.gelu, aten.gelu_backward, aten.sum, aten.mean, aten._grad_sum_to_size, aten.sum_to_size, aten.amax, aten.to, aten.type_as, operator.getitem, aten.squeeze, aten.unsqueeze, aten.rsub, aten._to_copy]  # noqa: E501
+    view_ops = [aten.squeeze, aten.unsqueeze, aten.alias]
     if compiler == "inductor":
-        default_recomputable_ops += [prims.div, prims.convert_element_type, aten.sign, aten.clone, aten._to_copy, aten.full_like, prims.var, prims.sum, aten.var, aten.std, prims.broadcast_in_dim, aten.select, aten.permute, aten._unsafe_view, aten.view, aten.expand, aten.slice, aten.reshape, aten.broadcast_tensors, aten.scalar_tensor, aten.ones, aten.new_zeros, aten.lift_fresh_copy, aten.minimum, aten.arange, aten.bitwise_and, aten.triu, aten.var_mean, aten.isinf, aten.any, aten.isnan, aten.full, aten.as_strided, aten.zeros, aten.argmax, aten.maximum, aten.bitwise_or, aten.logical_and, aten.logical_or]  # noqa: E501
+        default_recomputable_ops += [prims.div, prims.convert_element_type, aten.sign, aten.clone, aten._to_copy, aten.full_like, prims.var, prims.sum, aten.var, aten.std, prims.broadcast_in_dim, aten.select, aten.permute, aten._unsafe_view, aten.reshape, aten.broadcast_tensors, aten.scalar_tensor, aten.ones, aten.new_zeros, aten.lift_fresh_copy, aten.minimum, aten.arange, aten.bitwise_and, aten.triu, aten.var_mean, aten.isinf, aten.any, aten.isnan, aten.full, aten.as_strided, aten.zeros, aten.argmax, aten.maximum, aten.bitwise_or, aten.logical_and, aten.logical_or]  # noqa: E501
+        view_ops += [aten.view, aten.slice, aten.permute, aten.t, prims.broadcast_in_dim, aten.expand, aten.as_strided]
         # Natalia said that we should allow recomputing indexing :)
         default_recomputable_ops += [aten.index]
+    default_recomputable_ops += view_ops
 
     recomputable_ops = set(recomputable_ops) if recomputable_ops is not None else set(default_recomputable_ops)
 
@@ -366,6 +370,18 @@ def is_tensor_node(x):
 
     AGGRESSIVE_RECOMPUTATION = False
 
+    def is_materialized_backwards(node):
+        cur_nodes = set([node])
+        while len(cur_nodes) > 0:
+            cur = cur_nodes.pop()
+            for user in cur.users:
+                if user not in required_fw_nodes and not is_fusible(cur, user):
+                    return True
+                if user not in required_fw_nodes and get_aten_target(user) in view_ops:
+                    cur_nodes.add(user)
+
+        return False
+
     def ban_recomputation(node):
         if AGGRESSIVE_RECOMPUTATION:
             return (node.op == 'call_function' and get_aten_target(node) in unrecomputable_ops)
@@ -378,7 +394,20 @@ def ban_recomputation(node):
                 return False
             if node.target in [aten.lift_fresh_copy.default, aten.lift_fresh.default]:
                 return False
-            if compiler == "inductor" and node.dist_from_bw > 3:
+
+            # If a node *must* be materialized in the backwards pass, then we
+            # should never recompute it. This is a pretty subtle point.  In
+            # general, the assumption we make is that recomputing a node in the
+            # backwards pass is "free". However, if a node must be materialized
+            # in the backwards pass, then recomputing it is never free.
+            if is_materialized_backwards(node):
+                return True
+
+            # Arbitrary hack that sometimes seems to help things. The above
+            # modification appears to have made this heuristic a lot less critical
+            # for performance.
+            # TODO: Investigate why this hack helps.
+            if compiler == "inductor" and node.dist_from_bw > config.max_dist_from_bw:
                 return True
             # If the output of an op is 4x smaller (arbitrary choice),
             # then we don't allow recomputation.
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index fe6fe461ed672..021432235e446 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1369,14 +1369,6 @@ def f(a, b, c, d):
         self.assertEqual(get_num_ins_outs(fw_graph), (4, 2))
         self.assertEqual(get_num_ins_outs(bw_graph), (2, 4))
 
-        def f(x):
-            return torch.mm(x, torch.ones(x.shape)).tanh().tanh()
-        fw_graph, bw_graph = get_fw_bw_graph(f, [torch.randn(5, 5, requires_grad=True)])
-        self.assertEqual(get_num_ins_outs(fw_graph), (1, 3))
-
-        ins, outs = get_ins_outs(fw_graph)
-        self.assertEqual(outs[1].target, torch.ops.aten.mm.default)
-
     @unittest.skipIf(not USE_NETWORKX, "networkx not available")
     def test_min_cut_partitioner_recomputable_ops(self):
         def f(x):
diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py
index d473ff4b74495..2b53c163421c1 100644
--- a/test/inductor/test_perf.py
+++ b/test/inductor/test_perf.py
@@ -2,6 +2,8 @@
 import contextlib
 from unittest.mock import patch
 
+import functorch
+
 import torch._dynamo
 import torch._inductor.config as config
 from torch._dynamo.optimizations.backends import register_backend
@@ -36,11 +38,27 @@ def count_numel(f, *args):
     return str(metrics.num_bytes_accessed // 4)
 
 
+def count_numel_train(f, *args):
+    """
+    Assumes all inputs are fp32
+    """
+    metrics.reset()
+
+    f = torch._dynamo.optimize("count_bytes_inductor")(f)
+    out = f(*args)
+    res = 0
+    for o in out:
+        res += o.mean()
+    res.backward()
+    print(metrics.nodes_num_elem)
+    return str(metrics.num_bytes_accessed // 4)
+
+
 DEVICE = "cuda"
 
 
-def T(*size, dtype=torch.float32, device=DEVICE):
-    return torch.randn(size, dtype=dtype, device=device)
+def T(*size, dtype=torch.float32, device=DEVICE, grad=False):
+    return torch.randn(size, dtype=dtype, device=device, requires_grad=grad)
 
 
 def TI(*size, mx=10, dtype=torch.int32, device=DEVICE):
@@ -386,6 +404,56 @@ def f(a, b, c):
         self.assertExpectedInline(count_numel(f, *inp), """4000""")
 
 
+class MinCutPartitioningTests(TestCase):
+    def test_partitioning_full_remat(self):
+        def f(x):
+            return x.cos().cos().cos()
+
+        inp = (T(10, grad=True),)
+        self.assertExpectedInline(count_numel_train(f, *inp), """50""")
+
+    def test_partitioning_partial_remat(self):
+        def f(a, b, c, d):
+            x = a + b + c + d
+            return x.cos().cos()
+
+        inp = (T(10, grad=True), T(10, grad=True), T(10, grad=True), T(10, grad=True))
+        self.assertExpectedInline(count_numel_train(f, *inp), """90""")
+
+    def test_partitioning_dtype(self):
+        def f(x):
+            return (x < 0) * x
+
+        inp = (T(100, grad=True),)
+        self.assertExpectedInline(count_numel_train(f, *inp), """450""")
+
+    @patch.object(functorch.compile.config, "max_dist_from_bw", 1000)
+    def test_partitioning_unremat_bw(self):
+        def f(x):
+            return torch.mm(x, x.new_ones(x.shape)).tanh().tanh()
+
+        inp = (T(10, 10, grad=True),)
+        self.assertExpectedInline(count_numel_train(f, *inp), """1300""")
+
+    def test_partitioning_unremat_bw2(self):
+        def f(a):
+            a = torch.mm(a, a)
+            a = a + 1
+            b = a + 2
+            c = torch.mm(a, b)
+            return c
+
+        inp = (T(10, 10, grad=True),)
+        self.assertExpectedInline(count_numel_train(f, *inp), """2600""")
+
+    def test_partitioning_keops(self):
+        def f(a, b):
+            return (a * b).cos().sum(dim=1)
+
+        inp = (T(20, 1, grad=True), T(1, 20, grad=True))
+        self.assertExpectedInline(count_numel_train(f, *inp), """220""")
+
+
 # Test cases where we don't do the right thing yet.
 class WouldBeNiceIfItWorked:
     def test_horizontal(self):

From 5d9a69524048ccf24ce1aec33688386f7c617f12 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 13:57:17 +0000
Subject: [PATCH 1300/1922] Access named parameters/buffers/etc via getattr
 rather than index (#89625)

I'm not sure why this never caused problems before.  The error
manifests as `TypeError: 'MyModule' object is not subscriptable`

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89625
Approved by: https://github.com/albanD
---
 torch/_dynamo/variables/nn_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 848f022525d9e..48557f41d0b23 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -308,7 +308,7 @@ def named_embed(name, obj):
                         obj,
                         key,
                         name,
-                        source=NNModuleSource(GetItemSource(self.source, name)),
+                        source=NNModuleSource(AttrSource(self.source, name)),
                         **options,
                     ),
                 ]

From 1a6709318cbab0013438d0a686b06fe265436bdf Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 26 Nov 2022 14:28:56 -0500
Subject: [PATCH 1301/1922] Implement guard_source on RandomValueSource
 (#89711)

I audited the pattern matches on the enum and it didn't
look like this one should apply there.

Sorry, no test, I know this matters on symbolic-shapes branch
but I haven't had time to extract out a minimal reproducer.
Take my word for it.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89711
Approved by: https://github.com/jansel
---
 torch/_dynamo/guards.py | 1 +
 torch/_dynamo/source.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index 9cbcb93fcc5cc..7768cb14fc623 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -59,6 +59,7 @@ class GuardSource(enum.Enum):
     LOCAL_NN_MODULE = 2
     GLOBAL_NN_MODULE = 3
     CONSTANT = 4
+    RANDOM_VALUE = 5
 
     def select(self, locals_, globals_):
         if self in (GuardSource.LOCAL, GuardSource.LOCAL_NN_MODULE):
diff --git a/torch/_dynamo/source.py b/torch/_dynamo/source.py
index 6b5d63ab850e1..626bdb4b7826c 100644
--- a/torch/_dynamo/source.py
+++ b/torch/_dynamo/source.py
@@ -75,6 +75,9 @@ def name(self):
 class RandomValueSource(Source):
     random_call_index: int
 
+    def guard_source(self):
+        return GuardSource.RANDOM_VALUE
+
     def reconstruct(self, codegen):
         return [
             codegen.create_load(codegen.tx.output.random_values_var),

From 4116df1ea973f497fef3554d53c511caefa5ab8d Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 26 Nov 2022 11:25:24 -0800
Subject: [PATCH 1302/1922] Factor input deduplication into a separate function
 (#89701)

It turns out that instead of having a giant blobby aot_dispatch_autograd
function, we can factor it into a series of wrapper functions, each
of which successively guarantees more invariants on the inner
compilation function until the final inner function is quite trivial.
How exactly you have to wrap the input user functions and the output
compiled functions can be expressed concisely in Haskell, so I've
included the Haskell formulation in code comments.

This PR shows how to do this for input deduplication.  Dealing with the
rest of the view handling is left to future work.

This PR should also be a slight performance improvement as deduplicating
is skipped entirely when there are no duplicate inputs.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89701
Approved by: https://github.com/bdhirsh
---
 functorch/_src/aot_autograd.py | 80 +++++++++++++++++++++++-----------
 1 file changed, 55 insertions(+), 25 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index 9dfb62f30faa6..93282a055c759 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -1040,9 +1040,26 @@ def merge_view_inputs(
         return args_to_functionalization, post_processed_calling_convention_meta
 
 
-
+# Wraps aot_dispatch_deduplicated_autograd, ensuring that duplicate arguments
+# are dropped from the inner compilation function.
+#
+# In Haskell types, suppose you have:
+#
+#   add_dupe_args :: DedupedArgs -> Args
+#   remove_dupe_args :: Args -> DedupedArgs
+#
+#   aot_dispatch_deduplicated_autograd
+#       :: (DedupedArgs -> R) -> DedupedArgs -> AOTConfig -> (DedupedArgs -> R)
+#   aot_dispatch_autograd
+#       :: (Args -> R) -> Args -> AOTConfig -> (Args -> R)
+#
+# Then the code below can be written in point-free style as:
+#
+#   aot_dispatch_deduplicate_autograd f a c =
+#       aot_dispatch_autograd (f . add_dupe_args) (remove_dupe_args a) c . remove_dupe_args
+#
 def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
-    # Deduplicate inputs.  Suppose you have:
+    # Suppose you have:
     #
     #   [a, b, a, c]
     #
@@ -1060,10 +1077,7 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfi
     #       2: 0,
     #       3: 2,
     #   }
-    #
-    # Whether to use flat_args or deduped_flat_args?  flat_fn takes flat_args,
-    # and the autograd.Function must take deduped_flat_args; everything
-    # else is just getting the types right.
+    #   keep_arg_mask = [True, True, False, True]
 
     seen_args = {}
     keep_arg_mask = []
@@ -1083,22 +1097,41 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfi
         add_dupe_map[i] = j
         j += 1
 
+    # Fastpath
+    if not dropped_args:
+        return aot_dispatch_deduplicated_autograd(flat_fn, flat_args, aot_config)
+
     # NB: Hot path, avoid set lookups here
+    # TODO: Can avoid the zip here too, probably
     def remove_dupe_args(args):
-        if not dropped_args:
-            return args
         return [t for t, keep in zip(args, keep_arg_mask) if keep]
 
     def add_dupe_args(args):
-        if not dropped_args:
-            return args
         return [args[add_dupe_map[i]] for i in range(duped_arg_len)]
 
     deduped_flat_args = remove_dupe_args(flat_args)
 
+    @wraps(flat_fn)
+    def wrapped_flat_fn(*args):
+        return flat_fn(*add_dupe_args(args))
+
+    compiled_fn = aot_dispatch_deduplicated_autograd(wrapped_flat_fn, deduped_flat_args, aot_config)
+
+    @wraps(compiled_fn)
+    def wrapped_compiled_fn(*args):
+        return compiled_fn(*remove_dupe_args(args))
+
+    return wrapped_compiled_fn
+
+
+# Like aot_dispatch_autograd, but with the precondition that there
+# are no duplicate arguments in flat_args (e.g., the same Tensor
+# object never shows up twice.  However, two tensor inputs MAY alias
+# the same storage, so long as they have separate TensorImpls.)
+def aot_dispatch_deduplicated_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
     _fw_metadata, out, _num_aliasing_metadata_outs = run_functionalized_fw_and_collect_metadata(
-        lambda *args: flat_fn(*(add_dupe_args(args))),
-    )(*deduped_flat_args)
+        flat_fn
+    )(*flat_args)
 
     # pre-compute, so we can bail out quickly in the hotpath
     _num_outputs_aliased_to_inputs = len([
@@ -1127,16 +1160,16 @@ def add_dupe_args(args):
     # gets its data mutated.
     # When that happens, we replace the aliased inputs with a synthetic base, and in the traced forward
     # we later generate the input views
-    deduped_flat_args_with_views_handled, _synthetic_base_info = merge_view_inputs(
-        deduped_flat_args, _fw_metadata.mutated_input_info)
+    flat_args_with_views_handled, _synthetic_base_info = merge_view_inputs(
+        flat_args, _fw_metadata.mutated_input_info)
 
     joint_forward_backward = create_joint_forward_backward_functionalized(
-        lambda *args: flat_fn(*add_dupe_args(args)),
+        flat_fn,
         meta=_fw_metadata,
         synthetic_base_info=_synthetic_base_info,
     )
 
-    joint_inputs = (deduped_flat_args_with_views_handled, out)
+    joint_inputs = (flat_args_with_views_handled, out)
 
     disable_amp = torch._C._is_any_autocast_enabled()
 
@@ -1177,7 +1210,7 @@ def add_dupe_args(args):
             bw_module.print_readable()
 
         with track_graph_compiling("forward"):
-            compiled_fw_func = aot_config.fw_compiler(fw_module, deduped_flat_args_with_views_handled)
+            compiled_fw_func = aot_config.fw_compiler(fw_module, flat_args_with_views_handled)
 
     class CompiledFunction(torch.autograd.Function):
         compiled_fw = compiled_fw_func
@@ -1296,9 +1329,6 @@ def backward(ctx, *all_flat_args):
 
     @wraps(CompiledFunction.apply)
     def compiled_function(*args):
-        # Step 1: remove dupe args
-        no_dupe_args = remove_dupe_args(args)
-
         # Step 2: remove aliased inputs that are mutated, replace with synthetic bases
         # Only happens if our graph mutates an input that aliases another input.
         if CompiledFunction.synthetic_base_info is not None:
@@ -1307,17 +1337,17 @@ def compiled_function(*args):
             # Generate: the updated args, including (potentially multiple) synthetic bases
             # that replace the views. The input views are regenerated manually in the compiled function.
             # TODO: think harder about what happens if (a view of) one of these mutated input views is ALSO returned
-            new_inputs, metadata = merge_view_inputs(no_dupe_args, CompiledFunction.fw_metadata.mutated_input_info)
+            new_inputs, metadata = merge_view_inputs(args, CompiledFunction.fw_metadata.mutated_input_info)
             # We're just re-running the original-args-to-synthetic-base transformation
             # that we ran during compilation.
             # This returns metadata that we use during tracing to recover the input views,
             # which we don't actually need at runtime.
             assert metadata is not None
-            no_dupe_args_with_synthetic_bases = new_inputs
+            args_with_synthetic_bases = new_inputs
         else:
-            no_dupe_args_with_synthetic_bases = no_dupe_args
+            args_with_synthetic_bases = args
 
-        all_outs = CompiledFunction.apply(*no_dupe_args_with_synthetic_bases)
+        all_outs = CompiledFunction.apply(*args_with_synthetic_bases)
         if CompiledFunction.num_aliasing_metadata_outs > 0:
             outs = all_outs[:-CompiledFunction.num_aliasing_metadata_outs]
             aliasing_metadata_outs = all_outs[-CompiledFunction.num_aliasing_metadata_outs:]
@@ -1347,7 +1377,7 @@ def compiled_function(*args):
             )):
                 if mutation_type == MutationType.none:
                     continue
-                original_inpt = no_dupe_args[inpt_idx]
+                original_inpt = args[inpt_idx]
                 if mutation_type == MutationType.metadata_only:
                     # We need to grab the size/stride/storage_offset from the compiled forward,
                     # and use that to mutate the metadata of the input

From 2f2cc380669cfbfe15bee925eb510075d300eb5c Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 26 Nov 2022 11:25:24 -0800
Subject: [PATCH 1303/1922] Add debug asserts to AOTAutograd for input
 consistency with compilation (#89702)

Fixes https://github.com/pytorch/torchdynamo/issues/1927

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89702
Approved by: https://github.com/bdhirsh
---
 functorch/_src/aot_autograd.py     | 81 ++++++++++++++++++++++++++++--
 functorch/_src/config.py           |  5 ++
 test/functorch/test_aotdispatch.py | 48 ++++++++++++++++++
 3 files changed, 129 insertions(+), 5 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index 93282a055c759..102510c52dd27 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -1040,6 +1040,11 @@ def merge_view_inputs(
         return args_to_functionalization, post_processed_calling_convention_meta
 
 
+GUARD_BUG_BOILERPLATE = (
+    "This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."
+)
+
+
 # Wraps aot_dispatch_deduplicated_autograd, ensuring that duplicate arguments
 # are dropped from the inner compilation function.
 #
@@ -1097,9 +1102,7 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfi
         add_dupe_map[i] = j
         j += 1
 
-    # Fastpath
-    if not dropped_args:
-        return aot_dispatch_deduplicated_autograd(flat_fn, flat_args, aot_config)
+    unique_args = j
 
     # NB: Hot path, avoid set lookups here
     # TODO: Can avoid the zip here too, probably
@@ -1109,6 +1112,45 @@ def remove_dupe_args(args):
     def add_dupe_args(args):
         return [args[add_dupe_map[i]] for i in range(duped_arg_len)]
 
+    def maybe_wrap_debug(f):
+        if not config.debug_assert:
+            return f
+
+        @wraps(f)
+        def debug_wrapper(*args):
+            # Test that the computed remove/add arg functions are an inverse
+            new_args = add_dupe_args(remove_dupe_args(args))
+            seen = {}
+            for i, (x, y) in enumerate(zip(new_args, args)):
+                seen[y] = None
+                assert x is y, (
+                    "At compilation time, this graph was compiled under the "
+                    "assumption that some inputs were duplicate, but at runtime "
+                    f"input {i} was not a duplicate of {add_dupe_map[i]}.  " +
+                    GUARD_BUG_BOILERPLATE
+                )
+            # This is only an error if there is metadata mutation on both of
+            # the duped arguments; in this case, we need to know what order
+            # the metadata mutation applies in.  You'll get the correct result
+            # otherwise, because a graph that assumes distinct inputs works if
+            # you dupe the inputs (the gradient contributions from each input
+            # will get summed up appropriately.)
+            """
+            assert len(seen) == unique_args, (
+                "At compilation time, this graph was compiled under the assumption "
+                f"that there would be {unique_args} distinct arguments, but at "
+                f"runtime there were only {len(seen)} distinct arguments.  " +
+                GUARD_BUG_BOILERPLATE
+            )
+            """
+            return f(*args)
+
+        return debug_wrapper
+
+    # Fastpath
+    if not dropped_args:
+        return maybe_wrap_debug(aot_dispatch_deduplicated_autograd(flat_fn, flat_args, aot_config))
+
     deduped_flat_args = remove_dupe_args(flat_args)
 
     @wraps(flat_fn)
@@ -1121,7 +1163,7 @@ def wrapped_flat_fn(*args):
     def wrapped_compiled_fn(*args):
         return compiled_fn(*remove_dupe_args(args))
 
-    return wrapped_compiled_fn
+    return maybe_wrap_debug(wrapped_compiled_fn)
 
 
 # Like aot_dispatch_autograd, but with the precondition that there
@@ -1457,7 +1499,36 @@ def compiled_function(*args):
         else:
             return fw_outs
 
-    return compiled_function
+    if not config.debug_assert:
+        return compiled_function
+
+    flat_requires_grad = [a.requires_grad if isinstance(a, Tensor) else None for a in flat_args]
+
+    @wraps(compiled_function)
+    def debug_compiled_function(*args):
+        # TODO: Check aliasing relationships
+        # TODO: Check strides for metadata mutation
+        # (NB: ideally, this logic is factored out of this function and
+        # you move these debug checks there)
+
+        # Check requires grad.  Bad case is when we compiled with
+        # requires_grad = False, but input requires_grad = True
+        # (vice versa is OK; we compute a gradient and then throw
+        # it away when it hits the input.)
+        for i, a in enumerate(args):
+            can_require_grad = flat_requires_grad[i]
+            if can_require_grad is None:
+                assert not isinstance(a, Tensor)
+            elif not can_require_grad:
+                assert not a.requires_grad, (
+                    "At compilation time, this graph was compiled under the "
+                    f"assumption that input {i} did not require grad, but at "
+                    f"runtime input {i} requires grad.  " + GUARD_BUG_BOILERPLATE
+                )
+
+        return compiled_function(*args)
+
+    return debug_compiled_function
 
 
 @dynamo_timed
diff --git a/functorch/_src/config.py b/functorch/_src/config.py
index 39cf8946e8d67..87f60fe061e1e 100644
--- a/functorch/_src/config.py
+++ b/functorch/_src/config.py
@@ -14,6 +14,11 @@
 # TODO Benchmark
 use_fake_tensor = False
 
+# Enables optional asserts in hotpath code to check for errors.  If
+# you are seeing weird accuracy problems, try turning this on.
+# For now, to more easily identify bugs, this is turned on by default.
+debug_assert = True
+
 debug_fake_cross_ref = os.environ.get('AOT_FAKE_CROSSREF', False)
 
 debug_partitioner = os.environ.get('AOT_PARTITIONER_DEBUG', False)
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 021432235e446..52aa9877d3164 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -984,6 +984,54 @@ def f(x, y):
         x = torch.randn(3, 3, requires_grad=True)
         self.verify_aot_autograd(f, [x, x])
 
+    @patch("functorch._src.config.debug_assert", True)
+    def test_invalid_dupe(self):
+        class F(torch.nn.Module):
+            def forward(self, x, y):
+                return (x + y,)
+
+        x = torch.randn(3, 3, requires_grad=True)
+        y = torch.randn(3, 3, requires_grad=True)
+
+        fxy = aot_module_simplified(F(), nop)
+        fxy(x, y)
+        fxy(x, x)  # is ok!
+
+        fxx = aot_module_simplified(F(), nop)
+        fxx(x, x)
+        self.assertExpectedRaisesInline(
+            AssertionError, lambda: fxx(x, y),
+            """At compilation time, this graph was compiled under the assumption that some inputs were duplicate, but at runtime input 1 was not a duplicate of 0.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
+        )
+
+    @patch("functorch._src.config.debug_assert", True)
+    def test_invalid_requires_grad(self):
+        class F(torch.nn.Module):
+            def forward(self, x, y):
+                return (x + y,)
+
+        x = torch.randn(3, 3, requires_grad=True)
+        y = torch.randn(3, 3, requires_grad=True)
+        z = torch.randn(3, 3, requires_grad=False)
+
+        # Non-mutating please!
+        def compare(m1, m2, inps):
+            r1, g1 = _outs_and_grads(m1, inps, inps)
+            r2, g2 = _outs_and_grads(m2, inps, inps)
+            self.assertEqual(r1, r2)
+            self.assertEqual(g1, g2)
+
+        fxy = aot_module_simplified(F(), nop)
+        compare(F(), fxy, (x, y))
+        compare(F(), fxy, (x, z))
+
+        fxz = aot_module_simplified(F(), nop)
+        compare(F(), fxz, (x, z))
+        self.assertExpectedRaisesInline(
+            AssertionError, lambda: fxz(x, y),
+            """At compilation time, this graph was compiled under the assumption that input 1 did not require grad, but at runtime input 1 requires grad.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
+        )
+
     def test_resize_input(self):
         def f(x, y):
             y.resize_(4)

From 7b0600bff9e074361336e3c11a4234852d85a31f Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sun, 27 Nov 2022 19:27:45 -0500
Subject: [PATCH 1304/1922] Add single process version of dynamo distributed
 hf_Bert tests (#89721)

It's a lot easier to debug problems in the Dynamo optimization pass if
you aren't actually triggering a multiprocessing run.  Keep these tests
around.

I think the other tests can probably get this treatment too, leaving
this to future work.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89721
Approved by: https://github.com/voznesenskym
---
 test/distributed/test_dynamo_distributed.py | 94 ++++++++++++++++-----
 1 file changed, 75 insertions(+), 19 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index b6bc16edb941a..8d4365acfc9a6 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -21,6 +21,7 @@
 from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.testing._internal.common_utils import TestCase
 from torch.testing._internal.common_distributed import (
     MultiProcessTestCase,
     import_transformers_or_skip,
@@ -99,10 +100,11 @@ def forward(self, x):
 
 def get_hf_bert(rank):
     # Note: use @import_transformers_or_skip on your test case if you use this
+    # in a multiprocessing test
     try:
         from transformers import BertConfig, AutoModelForMaskedLM
     except ImportError:
-        unittest.skip("Unable to import transformers")
+        raise unittest.SkipTest("Unable to import transformers")
 
     batch_size, max_length, config, device = 4, 512, BertConfig(), f"cuda:{rank}"
     model = AutoModelForMaskedLM.from_config(config).to(device)
@@ -136,6 +138,68 @@ def _per_rank_init(rank, world_size):
     dist.destroy_process_group()
 
 
+# This simulates DDP, but it doesn't actually do any process communication;
+# it just has enough properties so that the dynamo distributed optimization is
+# able to optimize.  Feel free to simulate more properties as necessary.  The
+# other important thing is patching _active_ddp_module, which is what actually
+# triggers DDP optimization
+class FakeDDP(nn.Module):
+    def __init__(self, module):
+        super().__init__()
+        self.module = module
+        bucket_cap_mb = 25
+        self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024)
+
+    @contextmanager
+    def _inside_ddp_forward(self):
+        DDP._active_ddp_module = self
+        try:
+            yield
+        except Exception:
+            raise
+        finally:
+            DDP._active_ddp_module = None
+
+    def forward(self, *inputs, **kwargs):
+        with self._inside_ddp_forward():
+            return self.module.forward(*inputs, **kwargs)
+
+def run_hf_bert_ddp(self, model, inputs, backend):
+    reset_rng_state()
+    correct_outputs = model(**inputs)
+    correct_loss = correct_outputs.loss
+    correct_loss.backward()
+
+    reset_rng_state()
+    opt_model = torch._dynamo.optimize(backend)(model)
+    opt_outputs = opt_model(**inputs)
+    opt_loss = opt_outputs.loss
+    opt_loss.backward()
+
+    inputs_flat = [inputs[k] for k in inputs]
+    correct_results = collect_results(model, correct_outputs.logits, correct_loss, inputs_flat)
+    opt_results = collect_results(opt_model, opt_outputs.logits, opt_loss, inputs_flat)
+    self.assertTrue(same(correct_results, opt_results))
+
+class TestFakeDistributedSingleProc(TestCase):
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    @patch.object(config, "optimize_ddp", True)
+    @patch.object(torch._inductor.config, "fallback_random", True)
+    def test_hf_bert_ddp_inductor(self):
+        model, inputs = get_hf_bert(0)
+        model = FakeDDP(model)
+        run_hf_bert_ddp(self, model, inputs, "inductor")
+
+    @patch.object(config, "optimize_ddp", True)
+    def test_hf_bert_ddp_aot_eager(self):
+        model, inputs = get_hf_bert(0)
+        model = FakeDDP(model)
+        run_hf_bert_ddp(self, model, inputs, "aot_eager")
+
+
+# Are these tests failing?  Check and see if TestFakeDistributedSingleProc has a
+# single process version; if it's just a problem in the Dynamo distributed
+# optimizer, you should be able to repro it single process!
 @requires_nccl()
 class TestDistributedMultiProc(MultiProcessTestCase):
     def setUp(self):
@@ -182,31 +246,23 @@ def test_ddp_baseline_aot_eager_multiprocess(self):
     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", True)
     @patch.object(torch._inductor.config, "fallback_random", True)
-    def test_hf_bert_ddp(self):
+    def test_hf_bert_ddp_inductor(self):
 
         with _per_rank_init(self.rank, self.world_size):
             model, inputs = get_hf_bert(self.rank)
             model = DDP(model)
+            run_hf_bert_ddp(self, model, inputs, "inductor")
 
-            reset_rng_state()
-            correct_outputs = model(**inputs)
-            correct_loss = correct_outputs.loss
-            correct_loss.backward()
-
-            reset_rng_state()
-            opt_model = torch._dynamo.optimize("inductor")(model)
-            opt_outputs = opt_model(**inputs)
-            opt_loss = opt_outputs.loss
-            opt_loss.backward()
-
-            inputs_flat = [inputs[k] for k in inputs]
-            correct_results = collect_results(model, correct_outputs.logits, correct_loss, inputs_flat)
-            opt_results = collect_results(opt_model, opt_outputs.logits, opt_loss, inputs_flat)
-            self.assertTrue(same(correct_results, opt_results))
-
+    @skip_if_lt_x_gpu(2)
+    @import_transformers_or_skip()
+    @patch.object(config, "optimize_ddp", True)
+    def test_hf_bert_ddp_aot_eager(self):
+        with _per_rank_init(self.rank, self.world_size):
+            model, inputs = get_hf_bert(self.rank)
+            model = DDP(model)
+            run_hf_bert_ddp(self, model, inputs, "aot_eager")
 
     @skip_if_lt_x_gpu(1)
-    # TODO(whc)  delete aot_eager test, if inductor test lands stably
     def test_fsdp_aot_eager(self):
         with _per_rank_init(self.rank, self.world_size):
             # Test with basic FSDP wrapping (outer wrap around whole model)

From 368a3cd9580d3bcd72e04d1e220c3dcaa7e9f11a Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 15:26:55 -0500
Subject: [PATCH 1305/1922] Don't suppress log messages for dynamo CI config
 (#89653)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89653
Approved by: https://github.com/albanD, https://github.com/kit1980
---
 torch/testing/_internal/common_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index fc8dff3809c6f..3b3e81439c482 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -937,8 +937,6 @@ def __torch_function__(self, func, types, args=(), kwargs=None):
 
 if TEST_WITH_TORCHDYNAMO:
     import torch._dynamo
-    import logging
-    torch._dynamo.config.log_level = logging.ERROR
     # Do not spend time on helper functions that are called with different inputs
     torch._dynamo.config.cache_size_limit = 8
     # TODO: Remove this; this is grandfathered in because we suppressed errors

From 14763acf29008962762649a07b16c8e8b91e53a3 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sat, 26 Nov 2022 13:52:28 -0800
Subject: [PATCH 1306/1922] Beef up AOTAutograd logging with aot_id and input
 descriptions (#89710)

A few things in this PR, that I found useful while debugging some
recent issues:

- We now allocate an aot_id to each aot_function/aot_module invocation,
  and print it whenever we report error messages and graph output
  logging.  Check the comment for why this sort of thing is useful,
  and also why it's different from nth_graph.  This number is now
  incorporated into aot_graph_name

- I noticed that nth_graph only gets incremented when backwards is
  compiled.  Because backwards is compiled lazily, this means that
  multiple forward graphs would have gotten the same ID!  I change
  nth_graph to always increment to avoid confusion here.

- I added a simple describe_input function, which makes use of
  num_params_buffers to tell the user if the input index they're
  looking at is a param/buffer or an input.  With the help of
  https://github.com/pytorch/pytorch/pull/89709 we could give
  even more detailed information about inputs  (we could also
  easily give detailed information about parameters if we stored
  a mapping of index to parameter name, but I didn't need this
  when debugging so I'll let someone else add it if they need
  it.)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89710
Approved by: https://github.com/bdhirsh
---
 functorch/_src/aot_autograd.py     | 89 +++++++++++++++++++-----------
 test/functorch/test_aotdispatch.py | 15 +++--
 2 files changed, 67 insertions(+), 37 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index 102510c52dd27..ab1e2d347df38 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -1,6 +1,7 @@
 import collections
 import dataclasses
 import warnings
+import itertools
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from enum import Enum
@@ -46,6 +47,18 @@
 
 aten = torch.ops.aten
 
+# This global counter increments every time we compile a graph with
+# AOTAutograd.  You can use this to correlate runtime error messages
+# with compile time (e.g., if you get an error at runtime saying
+# compiled graph 3 failed, you can set a breakpoint at compile time
+# for this graph number to investigate further at compile time.)
+#
+# NB: this is different from get_aot_compilation_context, which tracks
+# each underlying graph that is compiled.  In contrast, AOT_COUNTER
+# corresponds to top-level invocations of aot_module/aot_function;
+# one counter is allocated per entire compiled block (but this block
+# may involve compiling multiple subgraphs; e.g., for forwards/backwards)
+AOT_COUNTER = itertools.count()
 
 KNOWN_TYPES = [torch.Tensor, int, str, float, bool, torch.SymInt, torch.SymFloat]
 
@@ -747,6 +760,11 @@ def normalize_as_list(x):
 
 # This is a list since looking forward, we can have this arbitrarily nested.
 graph_being_compiled: List[str] = []
+# TODO: It would be nice to reset the numbering every time aot_id goes
+# up, but this is annoying to do right now (because we don't know if
+# an aot_id will come back from the dead), so right now this also happens
+# to be a globally unique number too (at the cost of wobbling if you change
+# how the graphs compile)
 nth_graph: int = 0
 model_name: str = "model"
 
@@ -765,20 +783,20 @@ def get_aot_graph_name() -> str:
     Returns the name of the graph being compiled.
     """
     global model_name, graph_being_compiled, nth_graph
-    return f"{model_name}_{'_'.join(graph_being_compiled)}_{nth_graph}"
+    return f"{model_name}__{'_'.join(graph_being_compiled)}_{nth_graph}"
 
 
 get_graph_being_compiled = get_aot_graph_name
 
 
 @contextmanager
-def track_graph_compiling(graph_name, increment_index=False):
+def track_graph_compiling(aot_config, graph_name):
     global graph_being_compiled
-    graph_being_compiled = [graph_name]
+    # TODO: Don't shove the aot_id in here; set it in the context
+    graph_being_compiled = [f"{aot_config.aot_id}_{graph_name}"]
     yield
-    if increment_index:
-        global nth_graph
-        nth_graph += 1
+    global nth_graph
+    nth_graph += 1
     graph_being_compiled = []
 
 
@@ -836,19 +854,20 @@ class AOTConfig:
     partition_fn: Callable
     decompositions: Dict[Callable, Callable]
     num_params_buffers: int
+    aot_id: int
 
 
 def aot_dispatch_base(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
     fw_module = make_fx(flat_fn, aot_config.decompositions)(*flat_args)
     if config.debug_graphs:
-        print("====== Forward (only) graph ======")
+        print("====== Forward (only) graph {aot_config.aot_id} ======")
         fw_module.print_readable()
 
 
     disable_amp = torch._C._is_any_autocast_enabled()
     context = disable_autocast_manager if disable_amp else nullcontext
 
-    with context(), track_graph_compiling("inference"):
+    with context(), track_graph_compiling(aot_config, "inference"):
         compiled_fw = aot_config.fw_compiler(fw_module, flat_args)
 
     @wraps(compiled_fw)
@@ -1040,9 +1059,12 @@ def merge_view_inputs(
         return args_to_functionalization, post_processed_calling_convention_meta
 
 
-GUARD_BUG_BOILERPLATE = (
-    "This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."
-)
+def format_guard_bug_msg(aot_config, expected):
+    return (
+        f"At compilation time, graph {aot_config.aot_id} was compiled under the "
+        f"assumption that {expected}, but at runtime this was not the case.  "
+        "This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."
+    )
 
 
 # Wraps aot_dispatch_deduplicated_autograd, ensuring that duplicate arguments
@@ -1123,11 +1145,10 @@ def debug_wrapper(*args):
             seen = {}
             for i, (x, y) in enumerate(zip(new_args, args)):
                 seen[y] = None
-                assert x is y, (
-                    "At compilation time, this graph was compiled under the "
-                    "assumption that some inputs were duplicate, but at runtime "
-                    f"input {i} was not a duplicate of {add_dupe_map[i]}.  " +
-                    GUARD_BUG_BOILERPLATE
+                assert x is y, format_guard_bug_msg(
+                    aot_config,
+                    f"{describe_input(i, aot_config)} would be a duplicate of "
+                    f"{describe_input(add_dupe_map[i], aot_config)}"
                 )
             # This is only an error if there is metadata mutation on both of
             # the duped arguments; in this case, we need to know what order
@@ -1136,11 +1157,8 @@ def debug_wrapper(*args):
             # you dupe the inputs (the gradient contributions from each input
             # will get summed up appropriately.)
             """
-            assert len(seen) == unique_args, (
-                "At compilation time, this graph was compiled under the assumption "
-                f"that there would be {unique_args} distinct arguments, but at "
-                f"runtime there were only {len(seen)} distinct arguments.  " +
-                GUARD_BUG_BOILERPLATE
+            assert len(seen) == unique_args, format_guard_bug_msg(aot_config,
+                f"there would be {unique_args} distinct arguments"
             )
             """
             return f(*args)
@@ -1166,11 +1184,19 @@ def wrapped_compiled_fn(*args):
     return maybe_wrap_debug(wrapped_compiled_fn)
 
 
+def describe_input(i, aot_config):
+    if i < aot_config.num_params_buffers:
+        return f"parameter/buffer {i}"
+    else:
+        return f"input {i - aot_config.num_params_buffers}"
+
+
 # Like aot_dispatch_autograd, but with the precondition that there
 # are no duplicate arguments in flat_args (e.g., the same Tensor
 # object never shows up twice.  However, two tensor inputs MAY alias
 # the same storage, so long as they have separate TensorImpls.)
 def aot_dispatch_deduplicated_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
+
     _fw_metadata, out, _num_aliasing_metadata_outs = run_functionalized_fw_and_collect_metadata(
         flat_fn
     )(*flat_args)
@@ -1230,11 +1256,11 @@ def aot_dispatch_deduplicated_autograd(flat_fn, flat_args: List[Tensor], aot_con
         raise AssertionError("Graph partitioning without functionalization is not sound, we may introduce errors")
 
     if config.debug_joint:
-        print("====== Joint graph ======")
+        print(f"====== Joint graph {aot_config.aot_id} ======")
         fx_g.print_readable()
 
     with torch.no_grad():
-        with track_graph_compiling("joint"):
+        with track_graph_compiling(aot_config, "joint"):
             num_inner_fwd_outputs = _num_mutated_data_inputs + _num_non_aliased_outs + _num_aliasing_metadata_outs
             fw_module, bw_module = aot_config.partition_fn(
                 fx_g, joint_inputs, num_fwd_outputs=num_inner_fwd_outputs)
@@ -1246,12 +1272,12 @@ def aot_dispatch_deduplicated_autograd(flat_fn, flat_args: List[Tensor], aot_con
             _num_symints_saved_for_bw = len(symint_outs_saved_for_bw)
 
         if config.debug_graphs:
-            print("====== Forward graph ======")
+            print("====== Forward graph {aot_config.aot_id} ======")
             fw_module.print_readable()
-            print("====== Backward graph ======")
+            print("====== Backward graph {aot_config.aot_id} ======")
             bw_module.print_readable()
 
-        with track_graph_compiling("forward"):
+        with track_graph_compiling(aot_config, "forward"):
             compiled_fw_func = aot_config.fw_compiler(fw_module, flat_args_with_views_handled)
 
     class CompiledFunction(torch.autograd.Function):
@@ -1358,7 +1384,7 @@ def backward(ctx, *all_flat_args):
             if CompiledFunction.compiled_bw is None:
                 # TODO - pass in fake tensors ?
                 context = disable_autocast_manager if disable_amp else nullcontext
-                with context(), track_graph_compiling("backward", True):
+                with context(), track_graph_compiling(aot_config, "backward"):
                     CompiledFunction.compiled_bw = aot_config.bw_compiler(
                         bw_module, all_args
                     )
@@ -1520,10 +1546,9 @@ def debug_compiled_function(*args):
             if can_require_grad is None:
                 assert not isinstance(a, Tensor)
             elif not can_require_grad:
-                assert not a.requires_grad, (
-                    "At compilation time, this graph was compiled under the "
-                    f"assumption that input {i} did not require grad, but at "
-                    f"runtime input {i} requires grad.  " + GUARD_BUG_BOILERPLATE
+                assert not a.requires_grad, format_guard_bug_msg(
+                    aot_config,
+                    f"{describe_input(i, aot_config)} would not require grad"
                 )
 
         return compiled_function(*args)
@@ -1717,6 +1742,7 @@ def aot_function(
         partition_fn=partition_fn,
         decompositions=decompositions,
         num_params_buffers=num_params_buffers,
+        aot_id=next(AOT_COUNTER),
     )
     cached_res = None
 
@@ -1882,6 +1908,7 @@ def functional_call(*args, **kwargs):
         partition_fn=partition_fn,
         decompositions=decompositions,
         num_params_buffers=params_len,
+        aot_id=next(AOT_COUNTER),
     )
 
     compiled_fn = None
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 52aa9877d3164..afe0a3c899c3e 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -960,7 +960,8 @@ def f(a, b, c):
         inp = [torch.randn(5, requires_grad=True) for _ in range(3)]
         f(*inp).sum().backward()
 
-    def test_compilation_context(self):
+    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    def test_compilation_context(self, counter):
         def f(x):
             return x.sin().sin()
         count = []
@@ -975,7 +976,7 @@ def compiler(fx_g, _):
         f = aot_function(f, compiler)
         f(torch.randn(5))
         out.sum().backward()
-        self.assertEqual(count, [(['forward'], 4), (['inference'], 4), (['backward'], 8)])
+        self.assertEqual(count, [(['0_forward'], 4), (['1_inference'], 4), (['0_backward'], 8)])
 
     def test_dupe_arg(self):
         def f(x, y):
@@ -984,8 +985,9 @@ def f(x, y):
         x = torch.randn(3, 3, requires_grad=True)
         self.verify_aot_autograd(f, [x, x])
 
+    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
     @patch("functorch._src.config.debug_assert", True)
-    def test_invalid_dupe(self):
+    def test_invalid_dupe(self, counter):
         class F(torch.nn.Module):
             def forward(self, x, y):
                 return (x + y,)
@@ -1001,11 +1003,12 @@ def forward(self, x, y):
         fxx(x, x)
         self.assertExpectedRaisesInline(
             AssertionError, lambda: fxx(x, y),
-            """At compilation time, this graph was compiled under the assumption that some inputs were duplicate, but at runtime input 1 was not a duplicate of 0.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
+            """At compilation time, graph 1 was compiled under the assumption that input 1 would be a duplicate of input 0, but at runtime this was not the case.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
         )
 
+    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
     @patch("functorch._src.config.debug_assert", True)
-    def test_invalid_requires_grad(self):
+    def test_invalid_requires_grad(self, counter):
         class F(torch.nn.Module):
             def forward(self, x, y):
                 return (x + y,)
@@ -1029,7 +1032,7 @@ def compare(m1, m2, inps):
         compare(F(), fxz, (x, z))
         self.assertExpectedRaisesInline(
             AssertionError, lambda: fxz(x, y),
-            """At compilation time, this graph was compiled under the assumption that input 1 did not require grad, but at runtime input 1 requires grad.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
+            """At compilation time, graph 1 was compiled under the assumption that input 1 would not require grad, but at runtime this was not the case.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
         )
 
     def test_resize_input(self):

From 63b9f83bf0542d3122ca85d55b51bcd6b03dc020 Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Mon, 28 Nov 2022 05:12:37 +0000
Subject: [PATCH 1307/1922] Add simple assert to detect fake tensors on modules
 (#89723)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89723
Approved by: https://github.com/ezyang
---
 functorch/_src/aot_autograd.py     | 26 ++++++++++++
 test/functorch/test_aotdispatch.py | 64 ++++++++++++++++++++++++++++++
 torch/_dynamo/output_graph.py      |  2 +
 torch/_dynamo/utils.py             | 11 +++++
 4 files changed, 103 insertions(+)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index ab1e2d347df38..4336cbf0df509 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -1247,6 +1247,11 @@ def aot_dispatch_deduplicated_autograd(flat_fn, flat_args: List[Tensor], aot_con
             fx_g = make_fx(
                 joint_forward_backward, aot_config.decompositions
             )(*joint_inputs)
+
+        # Redudant with the check above, but worth having in case tracing introduced
+        # a fake tensor. Unlikely.
+        # See Note: [Fake Modules and AOTAutograd]
+        torch._dynamo.utils.assert_no_fake_params_or_buffers(fx_g)
         fx_g.graph.eliminate_dead_code()
         fx_g.recompile()
     else:
@@ -1822,6 +1827,8 @@ def aot_module(mod: nn.Module, *args, **kwargs) -> nn.Module:
         :attr:`mod`, but with forward and backward graph compiled.
 
     """
+    # See Note: [Fake Modules and AOTAutograd]
+    torch._dynamo.utils.assert_no_fake_params_or_buffers(mod)
 
     def functional_call(named_params, named_buffers, *args, **kwargs):
         params_and_buffers = {**named_params, **named_buffers}
@@ -1869,6 +1876,25 @@ def aot_module_simplified(
     """
     #########################################################
 
+    # Redudant with dynamo, but worth having in case this gets invoked elsewhere.
+
+    # Note [Fake Modules and AOTAutograd]
+    #
+    # A simple heuristic for when to use fake versus real tensors is that fake tensors are for compile time
+    # (when we don't want to actually run the compute, but we do want to know about metadata),
+    # and real tensors are for runtime (when we actually want to do the compute.) However, in AOTAutograd,
+    # modules are the exception: we always pass AOTAutograd modules with real tensors.
+    # This is because AOTAutograd will produce a compiled function which needs to directly access any
+    # parameters the compiled function may need, but these parameters will NOT be passed in by the caller (aka Dynamo).
+    # So at compile time, the compiled function we produce must close over any parameters, and those parameters must be
+    # real parameters, and we cannot do this unless at compile time we get a module with real tensors.
+
+    # Even if Dynamo did pass all parameters explicitly at runtime, which would eliminate the need to close over
+    # the parameters, it would still be profitable to pass real tensor parameters to the compiler at compile time,
+    # because some compilation strategies like CUDA graphs want to burn in the pointer addresses where the parameter data live,
+    # and of course we can't do that unless we give the backend a real tensor.
+    torch._dynamo.utils.assert_no_fake_params_or_buffers(mod)
+
     params = {
         **dict(_named_parameters(mod, remove_duplicate=False)),
         **dict(_named_buffers(mod, remove_duplicate=False)),
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index afe0a3c899c3e..aabd03050c4cb 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1585,6 +1585,70 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
         res = aot_mod(*inputs)
         res[0].sum().backward()
 
+    def test_aot_module_simplified_fake_tensor_gm_raises(self):
+        class MockModule(torch.nn.Module):
+            def __init__(self, y):
+                super().__init__()
+                self.linear = torch.nn.Linear(4, 4)
+                self.y = y
+
+            def forward(self, x):
+                z = self.linear(x)
+                z = z + self.y
+                z = z.relu()
+                return (z, )
+
+
+        real_x = torch.randn(4)
+        real_y = torch.randn(4)
+        fake_mode = torch._subclasses.fake_tensor.FakeTensorMode()
+        fake_y = fake_mode.from_tensor(real_y)
+
+        tracer = torch.fx.Tracer()
+        tracer.record_stack_traces = True
+
+        # This test uses tracing to lift the fake_y into a constant buffer,
+        # so we have a contrived trace example.
+        # For a traceless example closer to how dynamo would call us, see
+        # test_aot_module_deepcopy_fake_tensor_gm_raises below.
+        graph = tracer.trace(MockModule(fake_y))
+        mod_fake = torch.fx.GraphModule(tracer.root, graph)
+
+        self.assertExpectedRaisesInline(
+            AssertionError, lambda: aot_module_simplified(mod_fake, nop),
+            """Unexpected fake buffer y"""
+        )
+        # Counterfactual to ensure that the raise is only due to real vs fake
+        # Run the same exact thing except with a real buffer.
+        graph = tracer.trace(MockModule(real_y))
+        mod_real = torch.fx.GraphModule(tracer.root, graph)
+        aot_module_simplified(MockModule(real_y), nop)
+
+    def test_aot_module_deepcopy_fake_tensor_gm_raises(self):
+        class MockModule(torch.nn.Module):
+            def __init__(self, y):
+                super().__init__()
+                self.linear = torch.nn.Linear(4, 4)
+                self.linear.bias = torch.nn.Parameter(torch.ones(4))
+
+            def forward(self, x):
+                z = self.linear(x)
+                z = z.relu()
+                return (z, )
+
+
+        real_x = torch.randn(4)
+        real_y = torch.randn(4)
+
+        fake_mode = torch._subclasses.fake_tensor.FakeTensorMode()
+        mod_fake = torch._dynamo.utils.deepcopy_to_fake_tensor(MockModule(real_y), fake_mode)
+
+        self.assertExpectedRaisesInline(
+            AssertionError, lambda: aot_module_simplified(mod_fake, nop),
+            """Unexpected fake param linear.weight"""
+        )
+
+
 # entries in here don't work and need to be fixed.
 # Each one of these is a bug (or needs to be investigated)
 aot_autograd_failures = {
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 495a4381c6b7f..96c65f12fa91b 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -22,6 +22,7 @@
 from .side_effects import SideEffects
 from .source import ConstantSource, LocalSource, Source
 from .utils import (
+    assert_no_fake_params_or_buffers,
     checkpoint_params,
     CleanupHook,
     clone_inputs,
@@ -482,6 +483,7 @@ def compile_and_call_fx_graph(self, tx, rv, root):
         gm.compile_subgraph_reason = self.compile_subgraph_reason
         name = unique_id("__compiled_fn")
 
+        assert_no_fake_params_or_buffers(gm)
         compiled_fn = self.call_user_compiler(gm)
         compiled_fn = disable(compiled_fn)
 
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index ffea261979b14..62d967402af98 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -1144,3 +1144,14 @@ def get_real_value(node, output_graph):
     except RuntimeError as e:
         raise TorchRuntimeError() from e
     return real_value
+
+
+def assert_no_fake_params_or_buffers(gm):
+    for name, buffer in gm.named_buffers():
+        assert not isinstance(
+            buffer, torch._subclasses.FakeTensor
+        ), f"Unexpected fake buffer {name}"
+    for name, param in gm.named_parameters():
+        assert not isinstance(
+            param, torch._subclasses.FakeTensor
+        ), f"Unexpected fake param {name}"

From 30e1f2a243aee214e95b360c9ac0213865418768 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 28 Nov 2022 10:27:21 +0000
Subject: [PATCH 1308/1922] [xla hash update] update the pinned xla hash
 (#89405)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89405
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/xla.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 5650a48e646bb..43527ab040fc0 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-216d221f4d75ddfe9d0bd3ff2e8b92b39c67d381
+b969cba3410799d74981ade37a5f07c2c12d33ff

From dadb5c5d4fee36ba08e0ba28109c2063a933bf79 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Sun, 27 Nov 2022 20:48:40 -0800
Subject: [PATCH 1309/1922] Add mypy checking for a few files in torch/_dynamo
 (#89731)

It's kind of intractable to enable mypy everywhere at the moment,
because there are a lot of errors, and also mypy is really slow
for some reason.  I just want enough types to explain the public
types for user compiler calls, going through typing the _C.dynamo
bindings along the way.  This is a first step for this.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89731
Approved by: https://github.com/suo
---
 .lintrunner.toml                        | 22 ++++++
 torch/_C/_dynamo/__init__.pyi           |  0
 torch/_C/_dynamo/eval_frame.pyi         | 10 +++
 torch/_dynamo/convert_frame.py          | 33 +++++----
 torch/_dynamo/eval_frame.py             | 66 ++++++++++-------
 torch/_dynamo/guards.py                 |  2 +-
 torch/_dynamo/optimizations/backends.py | 94 +++++++++----------------
 torch/_dynamo/optimizations/training.py | 12 ++--
 torch/_dynamo/output_graph.py           | 83 ++++++++++++++--------
 torch/_dynamo/types.py                  | 27 +++++++
 10 files changed, 215 insertions(+), 134 deletions(-)
 create mode 100644 torch/_C/_dynamo/__init__.pyi
 create mode 100644 torch/_C/_dynamo/eval_frame.pyi
 create mode 100644 torch/_dynamo/types.py

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 34b673c7e09ac..fa7e484fb3e18 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -141,6 +141,28 @@ init_command = [
     'pyyaml==6.0',
 ]
 
+[[linter]]
+code = 'MYPYNOFOLLOW'
+include_patterns = [
+    'torch/_dynamo/eval_frame.py',
+    'torch/_dynamo/convert_frame.py',
+    'torch/_dynamo/types.py',
+    'torch/_dynamo/output_graph.py',
+    'torch/_dynamo/optimizations/__init__.py',
+    'torch/_dynamo/optimizations/backends.py',
+    'torch/_dynamo/optimizations/training.py',
+    'torch/_C/_dynamo/**/*.py',
+]
+exclude_patterns = [
+]
+command = [
+    'python3',
+    'tools/linter/adapters/mypy_linter.py',
+    '--config=mypy-nofollow.ini',
+    '--',
+    '@{{PATHSFILE}}'
+]
+
 [[linter]]
 code = 'MYPYSTRICT'
 include_patterns = [
diff --git a/torch/_C/_dynamo/__init__.pyi b/torch/_C/_dynamo/__init__.pyi
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/torch/_C/_dynamo/eval_frame.pyi b/torch/_C/_dynamo/eval_frame.pyi
new file mode 100644
index 0000000000000..3428342750cc0
--- /dev/null
+++ b/torch/_C/_dynamo/eval_frame.pyi
@@ -0,0 +1,10 @@
+import types
+from typing import Union
+from torch._dynamo.types import DynamoCallback, DynamoGuardHook
+
+def set_eval_frame(callback: DynamoCallback) -> DynamoCallback: ...
+def reset_code(code: types.CodeType) -> None: ...
+def unsupported(obj1: object, obj2: object) -> object: ...
+def skip_code(code: types.CodeType) -> None: ...
+def set_guard_fail_hook(hook: DynamoGuardHook) -> None: ...
+def set_guard_error_hook(hook: DynamoGuardHook) -> None: ...
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index 1f3138ec6cdcb..a60ba3c100096 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -6,7 +6,8 @@
 import types
 import typing
 import weakref
-from typing import Callable
+from traceback import FrameSummary
+from typing import Callable, cast, Dict, List, Optional
 
 import torch
 from torch.fx.graph_module import _forward_from_src as original_forward_from_src
@@ -24,6 +25,7 @@
     Unsupported,
 )
 from .guards import CheckFunctionManager, GuardedCode
+from .output_graph import OutputGraph
 from .replay_record import ExecutionRecord
 from .symbolic_convert import InstructionTranslator
 from .utils import (
@@ -106,7 +108,7 @@ def _fn(*args, **kwargs):
                 torch.cuda.set_rng_state(cuda_rng_state)
             torch.fx.graph_module._forward_from_src = prior_fwd_from_src
 
-    _fn._torchdynamo_orig_callable = fn
+    _fn._torchdynamo_orig_callable = fn  # type: ignore[attr-defined]
     return _fn
 
 
@@ -123,7 +125,7 @@ def has_tensor_in_frame(frame):
             if is_allowed(frame.f_globals[co_name]):
                 return True
 
-    seen_ids = dict()
+    seen_ids: Dict[int, bool] = dict()
 
     def has_tensor(obj):
         """Recursively check if the obj has a tensor"""
@@ -194,7 +196,7 @@ def format_error_msg(exc, code, record_filename=None, frame=None):
 
             msg += "".join(
                 traceback.format_list(
-                    stack_above_dynamo + list(reversed(exc.real_stack))
+                    stack_above_dynamo + list(reversed(get_real_stack(exc)))
                 )
             )
             msg += "\n"
@@ -207,13 +209,18 @@ def format_error_msg(exc, code, record_filename=None, frame=None):
     return msg
 
 
+def get_real_stack(exc) -> List[FrameSummary]:
+    assert hasattr(exc, "real_stack")
+    return cast(List[FrameSummary], exc.real_stack)
+
+
 def augment_exc_message(exc, msg="\n"):
     if (
         hasattr(exc, "real_stack")
         and len(exc.real_stack) > 0
         and not (config.verbose and config.suppress_errors)
     ):
-        msg += f"\nfrom user code:\n {''.join(traceback.format_list(reversed(exc.real_stack[0:2])))}"
+        msg += f"\nfrom user code:\n {''.join(traceback.format_list(list(reversed(get_real_stack(exc)[0:2]))))}"
 
     if config.replay_record_enabled and hasattr(exc, "record_filename"):
         msg += f"\nLast frame execution written to {exc.record_filename}. To run only this frame while debugging, run\
@@ -344,7 +351,7 @@ def format_guard_failures(code):
 
 
 def _compile(
-    code,
+    code: types.CodeType,
     globals,
     locals,
     builtins,
@@ -353,8 +360,8 @@ def _compile(
     export,
     guard_export_fn=None,
     frame=None,
-):
-    output = None
+) -> Optional[GuardedCode]:
+    output: Optional[OutputGraph] = None
 
     # from .utils import print_once;  print_once(code.co_filename)
     def transform(instructions, code_options):
@@ -372,6 +379,7 @@ def transform(instructions, code_options):
         )
         tracer.run()
         output = tracer.output
+        assert output is not None
         assert output.output_instructions
         instructions[:] = output.output_instructions
         code_options.update(output.code_options)
@@ -400,7 +408,7 @@ def transform(instructions, code_options):
         output_codes.add(out_code)
 
         log.log(
-            logging.CODE,
+            logging.CODE,  # type: ignore[attr-defined]
             format_bytecode(
                 "ORIGINAL BYTECODE",
                 code.co_name,
@@ -410,7 +418,7 @@ def transform(instructions, code_options):
             ),
         )
         log.log(
-            logging.CODE,
+            logging.CODE,  # type: ignore[attr-defined]
             format_bytecode(
                 "MODIFIED BYTECODE",
                 code.co_name,
@@ -420,6 +428,7 @@ def transform(instructions, code_options):
             ),
         )
 
+        assert output is not None
         assert output.guards is not None
         CleanupManager.instance[out_code] = output.cleanups
         check_fn = CheckFunctionManager(output, output.guards, locals, globals)
@@ -428,7 +437,7 @@ def transform(instructions, code_options):
         guard_str = "GUARDS:\n"
         guard_str += "\n".join([f" - {str(guard)}" for guard in sorted(output.guards)])
 
-        log.log(logging.CODE, guard_str)
+        log.log(logging.CODE, guard_str)  # type: ignore[attr-defined]
 
         if guard_export_fn is not None:
             guard_export_fn(output.guards)
@@ -464,7 +473,7 @@ def _convert_frame(frame: types.FrameType, cache_size: int):
                 raise
         return None
 
-    _convert_frame._torchdynamo_orig_callable = compiler_fn
+    _convert_frame._torchdynamo_orig_callable = compiler_fn  # type: ignore[attr-defined]
     return _convert_frame
 
 
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index f8e2bd28439c7..c6d8781922cec 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -9,7 +9,9 @@
 import traceback
 import types
 import warnings
+from enum import Enum
 from importlib import import_module
+from typing import Optional, Tuple, TYPE_CHECKING, Union
 from unittest.mock import patch
 
 import torch
@@ -17,31 +19,45 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn.parallel.distributed import DistributedDataParallel
 
+if TYPE_CHECKING:
+    from torch._C._dynamo.eval_frame import (  # noqa: F401
+        reset_code,
+        set_eval_frame,
+        set_guard_error_hook,
+        set_guard_fail_hook,
+        skip_code,
+        unsupported,
+    )
+else:
+    for name in dir(torch._C._dynamo.eval_frame):
+        if name.startswith("__"):
+            continue
+        globals()[name] = getattr(torch._C._dynamo.eval_frame, name)
+
 from . import config, convert_frame, skipfiles, utils
 from .exc import ResetRequired
 from .mutation_guard import install_generation_tagging_init
 from .optimizations.distributed import DDPOptimizer
+from .output_graph import CompilerFn
+from .types import DynamoCallback
 from .utils import compile_times
 
 log = logging.getLogger(__name__)
 
-try:
-    from torch.fx.experimental import proxy_tensor
-except ImportError:
-    proxy_tensor = None
-
-_eval_frame = torch._C._dynamo.eval_frame
-set_eval_frame = _eval_frame.set_eval_frame
-reset_code = _eval_frame.reset_code
-unsupported = _eval_frame.unsupported
-skip_code = _eval_frame.skip_code
-set_guard_fail_hook = _eval_frame.set_guard_fail_hook
-set_guard_error_hook = _eval_frame.set_guard_error_hook
+from torch.fx.experimental import proxy_tensor
+
 always_optimize_code_objects = utils.ExactWeakKeyDictionary()
 null_context = contextlib.nullcontext
-unset = object()
+
+# See https://github.com/python/typing/pull/240
+class Unset(Enum):
+    token = 0
+
+
+unset = Unset.token
+
 compile_lock = threading.RLock()
-most_recent_backend = None
+most_recent_backend: Optional[CompilerFn] = None
 
 
 class OptimizedModule(torch.nn.Module):
@@ -113,7 +129,7 @@ def enable_dynamic(enable: bool = True):
 class _TorchDynamoContext:
     def __init__(
         self,
-        callback,
+        callback: DynamoCallback,
         on_enter=nothing,
         backend_ctx_ctor=null_context,
         patch_fn=nothing,
@@ -123,8 +139,8 @@ def __init__(
     ):
         super().__init__()
         assert callable(callback) or callback is False or callback is None
-        self.callback = callback
-        self.prior = unset
+        self.callback: DynamoCallback = callback
+        self.prior: Union[Unset, DynamoCallback] = unset
         self.on_enter = on_enter
         self.extra_ctx_ctor = backend_ctx_ctor
         self.first_ctx = first_ctx
@@ -146,6 +162,7 @@ def __enter__(self):
         self.dynamic_ctx.__enter__()
 
     def __exit__(self, exc_type, exc_val, exc_tb):
+        assert self.prior is not unset
         set_eval_frame(self.prior)
         self.prior = unset
         # TODO: This is totally not the right way to chain contexts manually
@@ -198,13 +215,13 @@ def _fn(*args, **kwargs):
 
         # hooks to properly handle inlining
         if isinstance(self, DisableContext):
-            _fn._torchdynamo_disable = True
+            _fn._torchdynamo_disable = True  # type: ignore[attr-defined]
         else:
-            _fn._torchdynamo_inline = fn
+            _fn._torchdynamo_inline = fn  # type: ignore[attr-defined]
 
         # Save the function pointer to find the original callable while nesting
         # of decorators.
-        _fn._torchdynamo_orig_callable = fn
+        _fn._torchdynamo_orig_callable = fn  # type: ignore[attr-defined]
 
         # If the function is called using torch._dynamo.optimize decorator, we
         # should prevent any type of skipping.
@@ -306,7 +323,7 @@ def catch_errors(frame, cache_size):
         with compile_lock:
             return callback(frame, cache_size)
 
-    catch_errors._torchdynamo_orig_callable = callback
+    catch_errors._torchdynamo_orig_callable = callback  # type: ignore[attr-defined]
     return catch_errors
 
 
@@ -510,7 +527,7 @@ def export(
     graph = None
     out_guards = None
     graph_captured_input = None
-    graph_captured_result = None
+    graph_captured_result: Optional[Tuple[torch.Tensor, ...]] = None
 
     def produce_matching(source_args, candidate_args):
         matched_elements_positions = []
@@ -559,6 +576,7 @@ def result_capturing_wrapper(*graph_inputs):
             nonlocal graph_captured_input
 
             graph_captured_input = graph_inputs
+            assert graph is not None
             graph_captured_result = graph(*graph_inputs)
             return graph_captured_result
 
@@ -585,6 +603,7 @@ def result_capturing_wrapper(*graph_inputs):
 
     flat_results_traced, out_spec_traced = pytree.tree_flatten(result_traced)
 
+    assert graph_captured_result is not None
     flat_both = list(graph_captured_result) + flat_args
     matched_output_elements_positions = produce_matching(flat_both, flat_results_traced)
 
@@ -710,8 +729,7 @@ def patch():
         torch.onnx.export_to_pretty_string = disable(torch.onnx.export_to_pretty_string)
         torch.distributions.Distribution.set_default_validate_args(False)
 
-        if proxy_tensor is not None:
-            proxy_tensor.dispatch_trace = disable(proxy_tensor.dispatch_trace)
+        proxy_tensor.dispatch_trace = disable(proxy_tensor.dispatch_trace)
 
         optimizers = [
             opt
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index 7768cb14fc623..a5aa42856834e 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -805,7 +805,7 @@ def id_ref(self, obj):
 
 def guard_fail_hook(
     guard_fn: Callable, code: types.CodeType, f_locals: Dict[str, Any], last: bool
-):
+) -> None:
     """
     called whenever a guard fails.
     """
diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index c5011096c32f3..256618dd5aafa 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -6,15 +6,18 @@
 import subprocess
 import tempfile
 
+from typing import Dict
+
 import numpy as np
 
 import torch
+from ..output_graph import CompilerFn
 
 from ..utils import identity
 from .subgraph import SubGraph
 
 log = logging.getLogger(__name__)
-BACKENDS = dict()
+BACKENDS: Dict[str, CompilerFn] = dict()
 _NP_DTYPE = {
     torch.float16: np.float16,
     torch.float32: np.float32,
@@ -130,7 +133,7 @@ def static_runtime(subgraph):
 
 
 def onnxrt_common(subgraph, provider, onnx_filename=None):
-    import onnxruntime
+    import onnxruntime  # type: ignore[import]
 
     assert provider in onnxruntime.get_available_providers()
     session = onnxruntime.InferenceSession(
@@ -141,9 +144,9 @@ def onnxrt_common(subgraph, provider, onnx_filename=None):
     create_outputs = subgraph.empty_outputs_factory()
     is_cpu = subgraph.is_cpu
 
-    def _call(*args):
+    def _call(*initial_args):
         binding = session.io_binding()
-        args = [a.contiguous() for a in args]
+        args = [a.contiguous() for a in initial_args]
         for name, value in zip(input_names, args):
             dev = value.device
             binding.bind_input(
@@ -228,7 +231,7 @@ def onnxrt(subgraph):
 
 @functools.lru_cache(None)
 def _init_tensorflow():
-    import tensorflow as tf
+    import tensorflow as tf  # type: ignore[import]
 
     # prevent tensorflow from eating all the GPU memory
     gpus = tf.config.list_physical_devices("GPU")
@@ -239,8 +242,8 @@ def _init_tensorflow():
 
 @create_backend
 def onnx2tf(subgraph):
-    import onnx
-    from onnx_tf.backend import prepare
+    import onnx  # type: ignore[import]
+    from onnx_tf.backend import prepare  # type: ignore[import]
 
     tf = _init_tensorflow()
     filename = subgraph.filename("tensorflow")
@@ -253,8 +256,8 @@ def onnx2tf(subgraph):
         tf_module = tf.saved_model.load(filename)
         tf_module = tf.function(tf_module, jit_compile=True)
 
-    def run(*args):
-        args = [a.contiguous() for a in args]
+    def run(*i_args):
+        args = [a.contiguous() for a in i_args]
         with tf.device(device):
             outs = tf_module(
                 **{
@@ -292,7 +295,7 @@ def taso(subgraph):
 
 @create_backend
 def ipex(subgraph, **kwargs):
-    import intel_extension_for_pytorch as ipex
+    import intel_extension_for_pytorch as ipex  # type: ignore[import]
 
     inputs = subgraph.example_inputs
     model = subgraph.model
@@ -321,12 +324,20 @@ def fx2trt(subgraph, **kwargs):
         # TensorRT fails violently with an abort() on this
         return None
 
-    from torch_tensorrt.fx.fx2trt import InputTensorSpec, TRTInterpreter
-    from torch_tensorrt.fx.passes.lower_basic_pass import transform_setitem
-    from torch_tensorrt.fx.tools.trt_splitter import TRTSplitter, TRTSplitterSetting
-    from torch_tensorrt.fx.tracer.acc_tracer import acc_tracer
-    from torch_tensorrt.fx.trt_module import TRTModule
-    from torch_tensorrt.fx.utils import LowerPrecision
+    from torch_tensorrt.fx.fx2trt import (  # type: ignore[import]
+        InputTensorSpec,
+        TRTInterpreter,
+    )
+    from torch_tensorrt.fx.passes.lower_basic_pass import (  # type: ignore[import]
+        transform_setitem,
+    )
+    from torch_tensorrt.fx.tools.trt_splitter import (  # type: ignore[import]
+        TRTSplitter,
+        TRTSplitterSetting,
+    )
+    from torch_tensorrt.fx.tracer.acc_tracer import acc_tracer  # type: ignore[import]
+    from torch_tensorrt.fx.trt_module import TRTModule  # type: ignore[import]
+    from torch_tensorrt.fx.utils import LowerPrecision  # type: ignore[import]
 
     from .normalize import normalize_ir
 
@@ -414,7 +425,7 @@ def torch2trt(subgraph):
         # TensorRT fails violently with an abort() on this
         return None
 
-    from torch2trt import torch2trt
+    from torch2trt import torch2trt  # type: ignore[import]
 
     inputs = subgraph.example_inputs
     trt_mod = torch2trt(
@@ -438,45 +449,6 @@ def tensorrt(subgraph):
     return model
 
 
-@create_backend
-def onnx2tensorrt_alt(subgraph):
-    if subgraph.will_tensorrt_barf():
-        # TensorRT fails violently with an abort() on this
-        return None
-
-    import tensorrt as trt
-
-    from torch.fx.experimental.fx2trt.trt_module import TRTModule
-
-    inputs = subgraph.example_inputs
-
-    logger = trt.Logger(trt.Logger.ERROR)
-    builder = trt.Builder(logger)
-    config = builder.create_builder_config()
-    assert isinstance(inputs, (list, tuple))
-    inputs = tuple(inputs)
-    input_names = subgraph.input_names
-    output_names = subgraph.output_names
-    network = builder.create_network(
-        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    )
-    parser = trt.OnnxParser(network, logger)
-    success = parser.parse(open(subgraph.onnx_filename, "rb").read())
-    for idx in range(parser.num_errors):
-        print(parser.get_error(idx))
-    assert success
-
-    config.max_workspace_size = 1 << 25
-    config.set_flag(trt.BuilderFlag.STRICT_TYPES)
-    builder.max_batch_size = len(inputs[0])
-
-    engine = builder.build_engine(network, config)
-    assert engine
-
-    trt_mod = TRTModule(engine, input_names, output_names)
-    return subgraph.wrap_returns(trt_mod)
-
-
 @create_backend
 def cudagraphs(subgraph):
     model = subgraph.model
@@ -628,9 +600,9 @@ def tvm_compile_inner(
     jit_mod, example_inputs, tuning_option=None, log_file=None, trials=20000, cuda=False
 ):
     try:
-        import tvm
-        from tvm import relay
-        from tvm.contrib import graph_executor
+        import tvm  # type: ignore[import]
+        from tvm import relay  # type: ignore[import]
+        from tvm.contrib import graph_executor  # type: ignore[import]
 
         shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
         mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)
@@ -724,8 +696,8 @@ def to_torch_tensor(nd_tensor):
                 return torch.from_numpy(nd_tensor.numpy())
             return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack())
 
-        def exec_tvm(*args):
-            args = [a.contiguous() for a in args]
+        def exec_tvm(*i_args):
+            args = [a.contiguous() for a in i_args]
             for idx, arg in enumerate(args, 0):
                 if arg.dim() != 0:
                     if arg.requires_grad:
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 06b66e282a9ec..c42c10c30a525 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -148,7 +148,7 @@ def candidate(self):
         DEBUG = False
         return BACKENDS["aot_autograd"](
             self.gm, self.example_inputs, fw_compiler=debug_nop if DEBUG else nop
-        )
+        )  # type: ignore[call-arg]
 
 
 aot_eager = AotNop.compile_fn
@@ -164,7 +164,7 @@ def candidate(self):
 
         return BACKENDS["aot_autograd"](
             self.gm, self.example_inputs, fw_compiler=ts_compile
-        )
+        )  # type: ignore[call-arg]
 
 
 aot_ts = AotTorchscript.compile_fn
@@ -214,7 +214,7 @@ class AotMemEfficientFusion(AotAutogradStrategy):
 
     def candidate(self):
         kwargs = mem_efficient_fusion_kwargs(use_decomps=True)
-        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, **kwargs)
+        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, **kwargs)  # type: ignore[call-arg]
 
 
 class AotMemEfficientFusionNoDecomps(AotAutogradStrategy):
@@ -222,7 +222,7 @@ class AotMemEfficientFusionNoDecomps(AotAutogradStrategy):
 
     def candidate(self):
         kwargs = mem_efficient_fusion_kwargs(use_decomps=False)
-        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, **kwargs)
+        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, **kwargs)  # type: ignore[call-arg]
 
 
 class AotInductorDebug(AotAutogradStrategy):
@@ -247,7 +247,7 @@ def candidate(self):
                 min_cut_rematerialization_partition, compiler="inductor"
             ),
         }
-        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, **kwargs)
+        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, **kwargs)  # type: ignore[call-arg]
 
 
 aot_inductor_debug = AotInductorDebug.compile_fn
@@ -346,7 +346,7 @@ def candidate(self):
                 fw_compiler=partial(prims_executor, executor=self.executor),
                 bw_compiler=partial(prims_executor, executor=self.executor),
                 partition_fn=disable(nvprims_fw_bw_partition_fn),
-            )
+            )  # type: ignore[call-arg]
 
     return NvPrims
 
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 96c65f12fa91b..ff8b0998622e2 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -7,7 +7,10 @@
 import re
 import traceback
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, OrderedDict, Set, Tuple, Union
+
+import sympy
+from typing_extensions import Protocol
 
 import torch.nn
 from torch import fx
@@ -16,8 +19,8 @@
 from . import config, logging as torchdynamo_logging, variables
 from .bytecode_transformation import create_instruction, Instruction, unique_id
 from .codegen import PyCodegen
-from .exc import BackendCompilerFailed
-from .guards import GuardBuilder
+from .exc import BackendCompilerFailed, unimplemented
+from .guards import Guard, GuardBuilder, TensorReference
 from .mutation_guard import is_dynamic_nn_module
 from .side_effects import SideEffects
 from .source import ConstantSource, LocalSource, Source
@@ -31,7 +34,8 @@
     format_graph_tabular,
     same,
 )
-from .variables.builder import VariableBuilder, wrap_fx_proxy
+from .variables.base import VariableTracker
+from .variables.builder import GraphArg, VariableBuilder, wrap_fx_proxy
 from .variables.nn_module import NNModuleVariable
 from .variables.tensor import (
     DynamicShapeVariable,
@@ -43,6 +47,15 @@
 log = logging.getLogger(__name__)
 
 
+# TODO: I think this accepts int arguments too
+class CompiledFn(Protocol):
+    def __call__(self, *args: torch.Tensor) -> Tuple[torch.Tensor, ...]:
+        ...
+
+
+CompilerFn = Callable[[fx.GraphModule, List[torch.Tensor]], CompiledFn]
+
+
 @functools.lru_cache(None)
 def _step_logger():
     return torchdynamo_logging.get_step_logger(log)
@@ -75,27 +88,27 @@ def __repr__(self):
         return "FakeRootModule(...)"
 
 
-def wrap_compiler_fn(compiler_fn):
+def wrap_compiler_fn(compiler_fn: CompilerFn) -> CompilerFn:
     """WrapperBackend if config.verify_correctness is True"""
     if config.verify_correctness:
         # wrap backend if verify_correctness is True
         wrapper_backend_compiler_fn = WrapperBackend(compiler_fn)
 
-        wrapper_backend_compiler_fn._torchdynamo_orig_callable = compiler_fn
+        wrapper_backend_compiler_fn._torchdynamo_orig_callable = compiler_fn  # type: ignore[attr-defined]
         return wrapper_backend_compiler_fn
 
     return compiler_fn
 
 
 class WrapperBackend:
-    def __init__(self, backend=None):
-        self.backend = backend
+    def __init__(self, backend: CompilerFn):
+        self.backend: CompilerFn = backend
 
     @property
     def example_inputs(self):
         return clone_inputs(self.original_example_inputs)
 
-    def __call__(self, gm: torch.fx.GraphModule, example_inputs):
+    def __call__(self, gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
 
         self.restore = checkpoint_params(gm)
         self.original_example_inputs = clone_inputs(example_inputs)
@@ -138,38 +151,42 @@ def __init__(
         self,
         f_globals: Dict[str, Any],
         code_options: Dict[str, Any],
-        compiler_fn: Callable,
+        compiler_fn: CompilerFn,
         root_tx,
     ):
         super(OutputGraph, self).__init__()
 
         # Mutable state checkpointed by copy_graphstate()
         self.graph = torch.fx.Graph()
-        self.graphargs = []
-        self.guards = set()
-        self.nn_modules = dict()
+        self.graphargs: List[GraphArg] = []
+        self.guards: Set[Guard] = set()
+        self.nn_modules: Optional[Dict[str, torch.nn.Module]] = dict()
         self.side_effects = SideEffects()
         self.code_options = dict(code_options)
-        self.output_instructions = []
+        self.output_instructions: List[Instruction] = []
         # Node => computed real value (see utils.get_real_value)
-        self.real_value_cache = {}
+        self.real_value_cache: Dict[fx.Node, torch.Tensor] = {}
 
         # Not checkpointed
-        self.compiler_fn = compiler_fn
+        self.compiler_fn: CompilerFn = compiler_fn
         self.root_globals = f_globals
         self.root_tx = root_tx
-        self.cleanups = []
+        self.cleanups: List[CleanupHook] = []
         self.should_exit = False
         self.random_values_var = None
         self.initial_random_state = ()
-        self.unspec_variable_map = {}
+        self.unspec_variable_map: Dict[
+            str, Union[UnspecializedNumpyVariable, UnspecializedPythonVariable]
+        ] = {}
         self.shape_env = ShapeEnv() if config.dynamic_shapes else None
-        self.tensor_id_to_sym_shape_ref = {}
-        self.intermediary_symbols = {}
+        self.tensor_id_to_sym_shape_ref: Dict[int, Set[TensorReference]] = {}
+        self.intermediary_symbols: Dict[sympy.Expr, None] = {}
 
         # Enables creating unique node names by tracking
         # all current placeholder node names
-        self.name_to_input = collections.OrderedDict()
+        self.name_to_input: OrderedDict[
+            str, Optional[fx.Proxy]
+        ] = collections.OrderedDict()
 
     @property
     def output(self):
@@ -181,6 +198,7 @@ def fake_mode(self):
 
     def copy_graphstate(self):
         """Create a checkpoint of the current state by copying everything"""
+        assert self.nn_modules is not None
         graph_nodes = set(self.graph.nodes)
         return (
             graph_nodes,
@@ -313,6 +331,7 @@ def wrap_name(module_key):
                     target
                 )
 
+        assert self.nn_modules is not None
         for k, v in self.nn_modules.items():
             if v is target:
                 # it already exists
@@ -356,11 +375,14 @@ def compile_subgraph(
 
         tx.prune_dead_locals()
         stack_values = list(tx.stack)
+        assert self.nn_modules is not None
         root = FakeRootModule(self.nn_modules)
 
         # Add all the local vars to the "stack" so restore at the end
         restore_vars = []
-        val_to_names = collections.OrderedDict()
+        val_to_names: OrderedDict[
+            VariableTracker, List[str]
+        ] = collections.OrderedDict()
         if stack_values:
             val_to_names[stack_values[-1]] = list()
         for k, v in tx.symbolic_locals.items():
@@ -494,7 +516,7 @@ def compile_and_call_fx_graph(self, tx, rv, root):
             # the call to tabulate can cause a lot of memory to be allocated
             if config.log_level <= logging.INFO:
                 log.log(
-                    logging.CODE,
+                    logging.CODE,  # type: ignore[attr-defined]
                     f"TRACED GRAPH\n {name} {gm.forward.__code__.co_filename} {format_graph_tabular(gm.graph)}\n",
                 )
         except ImportError:
@@ -508,7 +530,7 @@ def compile_and_call_fx_graph(self, tx, rv, root):
         cg.make_call_generated_code(name)
         return cg.get_instructions()
 
-    def call_user_compiler(self, gm):
+    def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
         try:
             name = (
                 self.compiler_fn.__name__
@@ -527,13 +549,13 @@ def call_user_compiler(self, gm):
             raise BackendCompilerFailed(self.compiler_fn, e) from e
         return compiled_fn
 
-    def example_inputs(self):
+    def example_inputs(self) -> List[torch.Tensor]:
         result = []
         for arg in self.graphargs:
             result.extend(arg.get_examples())
         return result
 
-    def remove_unused_graphargs(self):
+    def remove_unused_graphargs(self) -> None:
         for node in reversed(list(self.graph.nodes)):
             if len(list(node.users)) == 0:
                 if node.op == "get_attr":
@@ -560,7 +582,7 @@ def remove_unused_graphargs(self):
 
         self.graphargs = [arg for arg in self.graphargs if arg.uses > 0]
 
-    def add_output_instructions(self, prefix: List[Instruction]):
+    def add_output_instructions(self, prefix: List[Instruction]) -> None:
         """
         We call this on the creation of a new compiled subgraph that is inserted
         before user code.
@@ -568,10 +590,10 @@ def add_output_instructions(self, prefix: List[Instruction]):
         self.output_instructions.extend(prefix)
         self.should_exit = True
 
-    def install_global(self, name, value):
+    def install_global(self, name, value) -> None:
         self.cleanups.append(CleanupHook.create(self.root_globals, name, value))
 
-    def cleanup(self):
+    def cleanup(self) -> None:
         # There is a reference cycle between tracer and OutputGraph, causing
         # some of the tensor objects to be held alive for longer than necessary.
 
@@ -620,7 +642,8 @@ def create_proxy(
             frame_summaries.append(tx.frame_summary())
             tx = getattr(tx, "parent", None)
 
-        msgs = traceback.StackSummary.from_list(frame_summaries).format()
+        # official from_list stub doesn't have new-style type
+        msgs = traceback.StackSummary.from_list(frame_summaries).format()  # type: ignore[arg-type]
 
         # Carry module_stack along with node.stack_trace for reusing stacktrace propagation infra
         nn_module_stack_str = f"Module stack: {nn_module_stack}\n"
diff --git a/torch/_dynamo/types.py b/torch/_dynamo/types.py
new file mode 100644
index 0000000000000..fccc8dfe9f28e
--- /dev/null
+++ b/torch/_dynamo/types.py
@@ -0,0 +1,27 @@
+import types
+from typing import Any, Callable, Dict, Optional, Union
+
+from typing_extensions import Protocol
+
+from torch._dynamo.guards import GuardedCode
+
+
+class DynamoCallbackFn(Protocol):
+    def __call__(
+        self, frame: types.FrameType, cache_size: int
+    ) -> Optional[GuardedCode]:
+        ...
+
+
+DynamoCallback = Union[DynamoCallbackFn, None, bool]
+
+
+class DynamoGuardHook(Protocol):
+    def __call__(
+        self,
+        guard_fn: Callable,
+        code: types.CodeType,
+        f_locals: Dict[str, Any],
+        last: bool,
+    ) -> None:
+        ...

From 82016bd6223cda247eee9a83cbb5c84dfb5fe2af Mon Sep 17 00:00:00 2001
From: Jean Schmidt <4520845+jeanschmidt@users.noreply.github.com>
Date: Mon, 28 Nov 2022 15:16:15 +0100
Subject: [PATCH 1310/1922] supress Werror introduced by lack of override by
 #86786 on `bool initialized()` (#89687)

---
 c10/cuda/CUDAMallocAsyncAllocator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp
index 9edc4f87ccf31..f567a2655c940 100644
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@@ -430,7 +430,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
     (void)called;
   }
 
-  bool initialized() {
+  bool initialized() override {
     return devs_initialized_flags.size() > 0;
   }
 

From 35276c165dfca6e1788f90220775637f256d18ac Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 16:44:28 -0500
Subject: [PATCH 1311/1922] Don't modify log level in dynamo distributed test
 (#89655)

Let the developer decide!

Taken from voz's https://github.com/pytorch/pytorch/pull/89392

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89655
Approved by: https://github.com/albanD
---
 test/distributed/test_dynamo_distributed.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 8d4365acfc9a6..d067d0f099d90 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -1,7 +1,6 @@
 # Owner(s): ["module: dynamo"]
 import copy
 import functools
-import logging
 import os
 import random
 import unittest
@@ -379,7 +378,6 @@ def setUpClass(cls):
                 },
             )
         )
-        cls._exit_stack.enter_context(patch.object(config, "log_level", logging.DEBUG))
         cls.rank = 0
         cls.device = f"cuda:{cls.rank}"
         cls.device_ids = None if "cuda" in cls.device else [cls.rank]

From afe7d962c6012400e0410166db99e71619fd3460 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Fri, 25 Nov 2022 11:00:21 -0500
Subject: [PATCH 1312/1922] Guard traceable_tensor_subclasses patching with
 finally (#89689)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89689
Approved by: https://github.com/albanD, https://github.com/anjali411
---
 test/dynamo/test_modules.py | 42 ++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index ed3b715f72f9d..6dde69effff99 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -720,16 +720,19 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
 
         torch._dynamo.config.traceable_tensor_subclasses.add(TensorProxy)
 
-        x = torch.randn(1).as_subclass(TensorProxy)
-        cnt = torch._dynamo.testing.CompileCounter()
-        out1 = foo(x)
-        opt_foo = torch._dynamo.optimize(cnt, nopython=True)(foo)
-        out2 = opt_foo(x)
+        try:
 
-        self.assertEqual(cnt.op_count, 4)
-        self.assertTrue(torch._dynamo.testing.same(out1, out2))
+            x = torch.randn(1).as_subclass(TensorProxy)
+            cnt = torch._dynamo.testing.CompileCounter()
+            out1 = foo(x)
+            opt_foo = torch._dynamo.optimize(cnt, nopython=True)(foo)
+            out2 = opt_foo(x)
 
-        torch._dynamo.config.traceable_tensor_subclasses.remove(TensorProxy)
+            self.assertEqual(cnt.op_count, 4)
+            self.assertTrue(torch._dynamo.testing.same(out1, out2))
+
+        finally:
+            torch._dynamo.config.traceable_tensor_subclasses.remove(TensorProxy)
 
     def test_torch_function_with_closure(self):
         def run():
@@ -756,17 +759,18 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
 
             torch._dynamo.config.traceable_tensor_subclasses.add(TensorProxy)
 
-            x = torch.randn(1).as_subclass(TensorProxy)
-            x = torch.randn(1)
-            cnt = torch._dynamo.testing.CompileCounter()
-            out1 = foo(x)
-            opt_foo = torch._dynamo.optimize(cnt, nopython=True)(foo)
-            out2 = opt_foo(x)
-
-            self.assertEqual(cnt.op_count, 4)
-            self.assertTrue(torch._dynamo.testing.same(out1, out2))
-
-            torch._dynamo.config.traceable_tensor_subclasses.remove(TensorProxy)
+            try:
+                x = torch.randn(1).as_subclass(TensorProxy)
+                x = torch.randn(1)
+                cnt = torch._dynamo.testing.CompileCounter()
+                out1 = foo(x)
+                opt_foo = torch._dynamo.optimize(cnt, nopython=True)(foo)
+                out2 = opt_foo(x)
+
+                self.assertEqual(cnt.op_count, 4)
+                self.assertTrue(torch._dynamo.testing.same(out1, out2))
+            finally:
+                torch._dynamo.config.traceable_tensor_subclasses.remove(TensorProxy)
 
         run()
 

From 158b2c8c924545fb3d269619a80e24377a91febc Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 24 Nov 2022 22:36:02 -0500
Subject: [PATCH 1313/1922] Support set_rng_state with fake tensor (#89642)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89642
Approved by: https://github.com/anjali411
---
 test/dynamo/test_repros.py       | 11 +++--------
 torch/_dynamo/variables/torch.py |  3 ---
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 7bd258cbb3c8d..61a26cd60f2b4 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1047,14 +1047,9 @@ def fn():
 
         before, after = opt_fn()
         self.assertTrue(same(before, after))
-        self.assertEqual(cnt.frame_count, 2)
-        self.assertEqual(cnt.op_count, 3)  # rand, rand
-        try:
-            graph, _ = torch._dynamo.export(fn)
-            # See https://github.com/pytorch/pytorch/pull/87490
-            self.fail("unexpected export success")
-        except torch._dynamo.exc.Unsupported:
-            pass
+        self.assertEqual(cnt.frame_count, 1)
+        self.assertEqual(cnt.op_count, 4)  # rand, rand
+        graph, _ = torch._dynamo.export(fn)
 
     def test_seq_append_list(self):
         x = torch.randn(4, 10)
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index d737e460304ff..9f6a1977e0a6c 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -337,9 +337,6 @@ def get_state_from_generator():
             assert len(args) == 1
             assert isinstance(args[0], TensorVariable)
 
-            unimplemented(
-                "TODO: make torch.random.set_rng_state work with FakeTensor/aot_autograd"
-            )
             # In fake tensor case, this state doesn't matter, but
             # it needs to be valid to not segfault. Pull a real tensor out.
             # The value won't matter since we are running with fake tensors anyway, so rng doesn't matter.

From eb8388f973deac3f4932b7bf3f8d7302673a62c3 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 28 Nov 2022 14:56:54 +0000
Subject: [PATCH 1314/1922] Revert "Add single process version of dynamo
 distributed hf_Bert tests (#89721)"

This reverts commit 1a2dd6b15e0089a9e45ba4feb90c2d0dfac19238.

Reverted https://github.com/pytorch/pytorch/pull/89721 on behalf of https://github.com/ezyang due to this broke inductor_distributed job
---
 test/distributed/test_dynamo_distributed.py | 94 +++++----------------
 1 file changed, 19 insertions(+), 75 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index d067d0f099d90..2349fe81201d6 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -20,7 +20,6 @@
 from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.testing._internal.common_utils import TestCase
 from torch.testing._internal.common_distributed import (
     MultiProcessTestCase,
     import_transformers_or_skip,
@@ -99,11 +98,10 @@ def forward(self, x):
 
 def get_hf_bert(rank):
     # Note: use @import_transformers_or_skip on your test case if you use this
-    # in a multiprocessing test
     try:
         from transformers import BertConfig, AutoModelForMaskedLM
     except ImportError:
-        raise unittest.SkipTest("Unable to import transformers")
+        unittest.skip("Unable to import transformers")
 
     batch_size, max_length, config, device = 4, 512, BertConfig(), f"cuda:{rank}"
     model = AutoModelForMaskedLM.from_config(config).to(device)
@@ -137,68 +135,6 @@ def _per_rank_init(rank, world_size):
     dist.destroy_process_group()
 
 
-# This simulates DDP, but it doesn't actually do any process communication;
-# it just has enough properties so that the dynamo distributed optimization is
-# able to optimize.  Feel free to simulate more properties as necessary.  The
-# other important thing is patching _active_ddp_module, which is what actually
-# triggers DDP optimization
-class FakeDDP(nn.Module):
-    def __init__(self, module):
-        super().__init__()
-        self.module = module
-        bucket_cap_mb = 25
-        self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024)
-
-    @contextmanager
-    def _inside_ddp_forward(self):
-        DDP._active_ddp_module = self
-        try:
-            yield
-        except Exception:
-            raise
-        finally:
-            DDP._active_ddp_module = None
-
-    def forward(self, *inputs, **kwargs):
-        with self._inside_ddp_forward():
-            return self.module.forward(*inputs, **kwargs)
-
-def run_hf_bert_ddp(self, model, inputs, backend):
-    reset_rng_state()
-    correct_outputs = model(**inputs)
-    correct_loss = correct_outputs.loss
-    correct_loss.backward()
-
-    reset_rng_state()
-    opt_model = torch._dynamo.optimize(backend)(model)
-    opt_outputs = opt_model(**inputs)
-    opt_loss = opt_outputs.loss
-    opt_loss.backward()
-
-    inputs_flat = [inputs[k] for k in inputs]
-    correct_results = collect_results(model, correct_outputs.logits, correct_loss, inputs_flat)
-    opt_results = collect_results(opt_model, opt_outputs.logits, opt_loss, inputs_flat)
-    self.assertTrue(same(correct_results, opt_results))
-
-class TestFakeDistributedSingleProc(TestCase):
-    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
-    @patch.object(config, "optimize_ddp", True)
-    @patch.object(torch._inductor.config, "fallback_random", True)
-    def test_hf_bert_ddp_inductor(self):
-        model, inputs = get_hf_bert(0)
-        model = FakeDDP(model)
-        run_hf_bert_ddp(self, model, inputs, "inductor")
-
-    @patch.object(config, "optimize_ddp", True)
-    def test_hf_bert_ddp_aot_eager(self):
-        model, inputs = get_hf_bert(0)
-        model = FakeDDP(model)
-        run_hf_bert_ddp(self, model, inputs, "aot_eager")
-
-
-# Are these tests failing?  Check and see if TestFakeDistributedSingleProc has a
-# single process version; if it's just a problem in the Dynamo distributed
-# optimizer, you should be able to repro it single process!
 @requires_nccl()
 class TestDistributedMultiProc(MultiProcessTestCase):
     def setUp(self):
@@ -245,23 +181,31 @@ def test_ddp_baseline_aot_eager_multiprocess(self):
     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", True)
     @patch.object(torch._inductor.config, "fallback_random", True)
-    def test_hf_bert_ddp_inductor(self):
+    def test_hf_bert_ddp(self):
 
         with _per_rank_init(self.rank, self.world_size):
             model, inputs = get_hf_bert(self.rank)
             model = DDP(model)
-            run_hf_bert_ddp(self, model, inputs, "inductor")
 
-    @skip_if_lt_x_gpu(2)
-    @import_transformers_or_skip()
-    @patch.object(config, "optimize_ddp", True)
-    def test_hf_bert_ddp_aot_eager(self):
-        with _per_rank_init(self.rank, self.world_size):
-            model, inputs = get_hf_bert(self.rank)
-            model = DDP(model)
-            run_hf_bert_ddp(self, model, inputs, "aot_eager")
+            reset_rng_state()
+            correct_outputs = model(**inputs)
+            correct_loss = correct_outputs.loss
+            correct_loss.backward()
+
+            reset_rng_state()
+            opt_model = torch._dynamo.optimize("inductor")(model)
+            opt_outputs = opt_model(**inputs)
+            opt_loss = opt_outputs.loss
+            opt_loss.backward()
+
+            inputs_flat = [inputs[k] for k in inputs]
+            correct_results = collect_results(model, correct_outputs.logits, correct_loss, inputs_flat)
+            opt_results = collect_results(opt_model, opt_outputs.logits, opt_loss, inputs_flat)
+            self.assertTrue(same(correct_results, opt_results))
+
 
     @skip_if_lt_x_gpu(1)
+    # TODO(whc)  delete aot_eager test, if inductor test lands stably
     def test_fsdp_aot_eager(self):
         with _per_rank_init(self.rank, self.world_size):
             # Test with basic FSDP wrapping (outer wrap around whole model)

From 620873dcaecfaeff3387f7f91c68b7353177ac2e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 28 Nov 2022 09:55:59 -0500
Subject: [PATCH 1315/1922] Include test/distributed/test_dynamo_distributed.py
 for ciflow/inductor (#89755)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89755
Approved by: https://github.com/anjali411
---
 .github/labeler.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index e86ff2192edee..13798ad707538 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -15,6 +15,7 @@
 - torch/_subclasses/fake_tensor.py
 - torch/_subclasses/fake_utils.py
 - torch/_subclasses/meta_utils.py
+- test/distributed/test_dynamo_distributed.py
 
 "module: cpu":
 - aten/src/ATen/cpu/**

From 19dd56d5399f1e91044ddcf6cffb539b21692dc3 Mon Sep 17 00:00:00 2001
From: PratsBhatt <pbhatt110@gmail.com>
Date: Mon, 28 Nov 2022 15:02:27 +0000
Subject: [PATCH 1316/1922] Ci andriod cache conda (#89554)

Fixes - T137631662

Caching conda dependencies for android build workflows.
Conda dependencies have been gathered from the following workflow
1. https://github.com/pytorch/pytorch/blob/master/.github/workflows/_run_android_tests.yml

The pull request updates the action from conda-incubator/setup-miniconda@v2 to pytorch/test-infra/.github/actions/setup-miniconda@main as it supports caching.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89554
Approved by: https://github.com/huydhn
---
 .github/workflows/_run_android_tests.yml | 29 ++++++------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/_run_android_tests.yml b/.github/workflows/_run_android_tests.yml
index ae992baab11a4..d949e193b76b3 100644
--- a/.github/workflows/_run_android_tests.yml
+++ b/.github/workflows/_run_android_tests.yml
@@ -11,31 +11,16 @@ jobs:
   build-and-test:
     runs-on: ubuntu-latest
     steps:
-      - name: Setup miniconda
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          auto-update-conda: true
-          python-version: 3.8
-          activate-environment: build
-
-      - name: Install dependencies
-        run: |
-          conda install -y \
-            cffi=1.15.1 \
-            cmake=3.22.1 \
-            mkl=2022.1.0 \
-            mkl-include=2022.1.0 \
-            ninja=1.10.2 \
-            numpy=1.23.3 \
-            pyyaml=6.0 \
-            requests=2.28.1 \
-            setuptools=65.5.0 \
-            typing_extensions=4.3.0
-
       # [see note: pytorch repo ref]
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
 
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: 3.8
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
+
       - name: Build PyTorch Android
         run: |
           # Install NDK 21 after GitHub update
@@ -49,7 +34,7 @@ jobs:
           ln -sfn ${ANDROID_SDK_ROOT}/ndk/21.4.7075529 ${ANDROID_NDK}
 
           echo "CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${GITHUB_ENV}"
-          ./scripts/build_pytorch_android.sh x86
+          ${CONDA_RUN} ./scripts/build_pytorch_android.sh x86
 
       - name: Run tests
         uses: reactivecircus/android-emulator-runner@v2

From e6fa1a7de7bfdef783fe5f19d1a6644bd850b94c Mon Sep 17 00:00:00 2001
From: Andrey Talman <atalman@fb.com>
Date: Mon, 28 Nov 2022 10:36:40 -0500
Subject: [PATCH 1317/1922] Add statement about minor releases, in the
 release.md document (#89698)

* Add statement about minor releases

* Update RELEASE.md
---
 RELEASE.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/RELEASE.md b/RELEASE.md
index d13ca5d11e100..58d938fafddf0 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -21,6 +21,7 @@
   - [Patch Release Criteria](#patch-release-criteria)
   - [Patch Release Process](#patch-release-process)
     - [Triage](#triage)
+    - [Issue Tracker for Patch releases](#issue-tracker-for-patch-releases)
     - [Building a release schedule / cherry picking](#building-a-release-schedule--cherry-picking)
     - [Building Binaries / Promotion to Stable](#building-binaries--promotion-to-stable)
 - [Hardware / Software Support in Binary Build Matrix](#hardware--software-support-in-binary-build-matrix)
@@ -234,6 +235,20 @@ Patch releases should be considered if a regression meets the following criteria
 3. Triage reviewers will then add the issue / pull request to the related milestone (i.e. `1.9.1`) if the regressions if found to be within the [Patch Release Criteria](#patch-release-criteria)
     * ![adding to milestone](https://user-images.githubusercontent.com/1700823/131175980-148ff38d-44c3-4611-8a1f-cd2fd1f4c49d.png)
 
+### Issue Tracker for Patch releases
+
+For patch releases issue tracker needs to be created. For patch release, we require all cherry-pick changes to have links to either a high-priority Github issue or a CI failure from previous RC. An example of this would look like:
+* https://github.com/pytorch/pytorch/issues/51886
+
+Only following issues are accepted:
+1. Fixes to regressions against previous major version
+2. Critical fixes for: silent correctness, backwards compatibility, crashes, deadlocks, (large) memory leaks
+3. Fixes to new features being introduced in this release
+4. Compilation fixes or ifdefs required for different versions of the compilers or third-party libraries
+5. Test/CI fixes or improvements
+6. Documentation improvements
+7. Release branch specific changes (e.g. change version identifiers, remove/hide features that aren’t ready for release)
+
 ### Building a release schedule / cherry picking
 
 > Main POC: Patch Release Managers

From 285e3cde2b4ba6da542d5029deec7e983a8b4ed4 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 28 Nov 2022 13:02:20 +0000
Subject: [PATCH 1318/1922] [FSDP()] Register root pre-forward hook (#89572)

- This PR registers the FSDP root pre-forward hook as a module forward pre-hook following the recently added support for kwargs for those hooks.
- This PR also passes `prepend=True` for the normal (not root) pre-forward hook. This is not strictly required for this PR, but I believe it is needed for composability with activation checkpointing. (We want to run FSDP logic on the outside and AC logic on the inside, just like how we recommend `FSDP(AC(module))` for the wrapper versions.)

Fun fact: I originally chose the `[FSDP()]` prefix in the PR titles when we still referred to composable FSDP as functional-like FSDP, in which case `FSDP()` approximated "functional FSDP". I am preserving this usage to make searching for PRs relating to composable FSDP easier.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89572
Approved by: https://github.com/mrshenli
---
 .../_composable/test_fully_shard.py           | 12 +------
 torch/distributed/_composable/fully_shard.py  |  2 ++
 torch/distributed/fsdp/_runtime_utils.py      | 34 +++++++++++--------
 .../fsdp/fully_sharded_data_parallel.py       |  2 +-
 4 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/test/distributed/_composable/test_fully_shard.py b/test/distributed/_composable/test_fully_shard.py
index ba08deeafcdfb..32883d4c265d7 100644
--- a/test/distributed/_composable/test_fully_shard.py
+++ b/test/distributed/_composable/test_fully_shard.py
@@ -10,7 +10,6 @@
 from torch.distributed._composable import fully_shard
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp._common_utils import _is_fsdp_flattened
-from torch.distributed.fsdp._runtime_utils import _root_pre_forward
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
@@ -244,16 +243,7 @@ def test_training(self):
                 (composable_module, composable_optim),
             ):
                 optim.zero_grad(set_to_none=True)
-                # TODO (awgu): Remove this after resolving the root pre-forward
-                # hook registration, currently blocked by kwarg support
-                if model is composable_module:
-                    args, kwargs = _root_pre_forward(
-                        fully_shard.state(composable_module), composable_module, *inp
-                    )
-                else:
-                    args = inp
-                    kwargs = {}
-                out = model(*args, **kwargs)
+                out = model(*inp)
                 loss = out.sum()
                 losses.append(loss)
                 loss.backward()
diff --git a/torch/distributed/_composable/fully_shard.py b/torch/distributed/_composable/fully_shard.py
index 174b2ca89a788..c0096e80e49c3 100644
--- a/torch/distributed/_composable/fully_shard.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -17,6 +17,7 @@
 from torch.distributed.fsdp._runtime_utils import (
     _register_post_forward_hooks,
     _register_pre_forward_hooks,
+    _register_root_pre_forward_hook,
 )
 from torch.distributed.fsdp.api import (
     BackwardPrefetch,
@@ -77,4 +78,5 @@ def fully_shard(
     modules = list(module.modules())
     _register_pre_forward_hooks(state, modules)
     _register_post_forward_hooks(state, modules)
+    _register_root_pre_forward_hook(state, module)  # prepend last
     return module
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index e0986d300a65a..71265f68e9428 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -317,8 +317,8 @@ def _post_forward_reshard(
 def _root_pre_forward(
     state: _FSDPState,
     module: nn.Module,
-    *args,
-    **kwargs,
+    args,
+    kwargs,
 ):
     """
     Runs pre-forward logic specific to the root FSDP instance, which should run
@@ -893,7 +893,9 @@ def _register_pre_forward_hooks(
             hook = functools.partial(
                 _pre_forward, state, module_param_handles, unshard_fn
             )
-            state._pre_forward_handles.append(module.register_forward_pre_hook(hook))
+            state._pre_forward_handles.append(
+                module.register_forward_pre_hook(hook, prepend=True)
+            )
 
 
 @no_type_check
@@ -928,25 +930,27 @@ def _register_post_forward_hooks(
 
 
 @no_type_check
-def _register_root_pre_forward_hooks(
+def _register_root_pre_forward_hook(
     state: _FSDPState,
-    modules: Iterable[nn.Module],
+    module: nn.Module,
 ):
     """
-    # TODO (awgu): This requires kwarg support for hooks registered by
-    ``register_forward_pre_hook()``. ``_root_pre_forward()`` does not have the
-    supported hook signature right now.
+    Registers root pre-forward hook on ``module``, which should be the local
+    FSDP root.
+
+    NOTE: For the current composable FSDP design, we have each application of
+    ``fully_shard()`` to a module to indicate that that module is the local
+    FSDP root. We may remove this assumption in the future, in which case we
+    will need to register this root pre-forward hook on any candidate module
+    that may be the local FSDP root.
     """
     for forward_handle in state._root_pre_forward_handles:
         forward_handle.remove()
     state._root_pre_forward_handles.clear()
-    for module in modules:
-        module_param_handles = state._module_to_handles[module]
-        if module_param_handles:
-            hook = functools.partial(_root_pre_forward, state, module)
-            state._root_pre_forward_handles.append(
-                module.register_forward_pre_hook(hook)
-            )
+    hook = functools.partial(_root_pre_forward, state)
+    state._root_pre_forward_handles.append(
+        module.register_forward_pre_hook(hook, prepend=True, with_kwargs=True)
+    )
 
 
 @no_type_check
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index d2d4fbf229b6a..64d65c67ecb97 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -682,7 +682,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
         with torch.autograd.profiler.record_function(
             "FullyShardedDataParallel.forward"
         ):
-            args, kwargs = _root_pre_forward(self, self, *args, **kwargs)
+            args, kwargs = _root_pre_forward(self, self, args, kwargs)
             unused = None
             unshard_fn = functools.partial(_pre_forward_unshard, self, self._handles)
             reshard_fn = functools.partial(_post_forward_reshard, self, self._handles)

From 9db835fafbe6d2896b75d2f048651ed6146c5e3b Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 28 Nov 2022 13:02:20 +0000
Subject: [PATCH 1319/1922] [FSDP()] Require args as kwargs for `fully_shard()`
 (#89573)

I am not aware of any users of `FullyShardedDataParallel` that pass arguments after `process_group` positionally. I.e., I believe users pass arguments as keyword arguments. This PR formalizes this for `fully_shard()`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89573
Approved by: https://github.com/mrshenli
---
 torch/distributed/_composable/fully_shard.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/distributed/_composable/fully_shard.py b/torch/distributed/_composable/fully_shard.py
index c0096e80e49c3..e3d36c3c87bbc 100644
--- a/torch/distributed/_composable/fully_shard.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -31,10 +31,11 @@
 @contract
 def fully_shard(
     module: nn.Module,
+    *,
     process_group: Optional[dist.ProcessGroup] = None,
+    policy: Optional[_FSDPPolicy] = None,
     mixed_precision: Optional[MixedPrecision] = None,
     cpu_offload: Optional[CPUOffload] = None,
-    policy: Optional[_FSDPPolicy] = None,
     ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
     device_id: Optional[Union[int, torch.device]] = None,
     param_init_fn: Optional[Callable[[nn.Module], None]] = None,

From a1d625e3f530e071f3b0b033800deb33b04b9570 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radek=20Barto=C5=88?= <radek.barton@microsoft.com>
Date: Mon, 28 Nov 2022 17:24:53 +0000
Subject: [PATCH 1320/1922] [GHA] Decrease Windows test timeout to 120 minutes
 (#89694)

This PR decreases the Windows tests pipelines timeout to 120 mins per discusison as requested at https://github.com/pytorch/pytorch/issues/73489#issuecomment-1322539593

Closes #73489.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89694
Approved by: https://github.com/kit1980
---
 .github/workflows/_win-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index 0cabb8ec469aa..ef77e0055e369 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -56,7 +56,7 @@ jobs:
       matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
       fail-fast: false
     runs-on: ${{ matrix.runner }}
-    timeout-minutes: 300
+    timeout-minutes: 120
     steps:
       - name: Enable git symlinks on Windows
         shell: bash

From c7d2084e3337a403a2a93022cc8bf6b9df69933c Mon Sep 17 00:00:00 2001
From: albanD <albandes@fb.com>
Date: Mon, 28 Nov 2022 17:55:43 +0000
Subject: [PATCH 1321/1922] Update masked.rst (#89758)

Fix https://github.com/pytorch/pytorch/issues/89734

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89758
Approved by: https://github.com/anjali411, https://github.com/malfet, https://github.com/cpuhrsch
---
 docs/source/masked.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/masked.rst b/docs/source/masked.rst
index 3655a6d79fd98..60b9af7ebcccb 100644
--- a/docs/source/masked.rst
+++ b/docs/source/masked.rst
@@ -56,10 +56,10 @@ There are already a number of existing tutorials that we've written to help user
 -  `Advanced semantics - discussion on why certain decisions were made (e.g. requiring masks to match for binary/reduction operations),
    differences with NumPy's MaskedArray, and reduction semantics`_
 
-.. _Overview - the place to start for new users, discusses how to use MaskedTensors and why they're useful: https://pytorch.org/tutorials/prototype/maskedtensor_overview.html
-.. _Sparsity - MaskedTensor supports sparse COO and CSR data and mask Tensors: https://pytorch.org/tutorials/prototype/maskedtensor_sparsity.html
-.. _Adagrad sparse semantics - a practical example of how MaskedTensor can simplify sparse semantics and implementations: https://pytorch.org/tutorials/prototype/maskedtensor_adagrad.html
-.. _Advanced semantics - discussion on why certain decisions were made (e.g. requiring masks to match for binary/reduction operations), differences with NumPy's MaskedArray, and reduction semantics: https://pytorch.org/tutorials/prototype/maskedtensor_advanced_semantics.html
+.. _Overview - the place to start for new users, discusses how to use MaskedTensors and why they're useful: https://pytorch.org/tutorials/prototype/maskedtensor_overview
+.. _Sparsity - MaskedTensor supports sparse COO and CSR data and mask Tensors: https://pytorch.org/tutorials/prototype/maskedtensor_sparsity
+.. _Adagrad sparse semantics - a practical example of how MaskedTensor can simplify sparse semantics and implementations: https://pytorch.org/tutorials/prototype/maskedtensor_adagrad
+.. _Advanced semantics - discussion on why certain decisions were made (e.g. requiring masks to match for binary/reduction operations), differences with NumPy's MaskedArray, and reduction semantics: https://pytorch.org/tutorials/prototype/maskedtensor_advanced_semantics
 
 Supported Operators
 +++++++++++++++++++

From e1f9bd604fb26f85f088e8c580bcc5fddd1df410 Mon Sep 17 00:00:00 2001
From: Hubert Lu <55214931+hubertlu-tw@users.noreply.github.com>
Date: Mon, 28 Nov 2022 18:35:27 +0000
Subject: [PATCH 1322/1922] [ROCm] Replace layer_norm_grad_input_kernel with
 cuComputeGradInput for ROCm (#87726)

We observed that the native PyTorch LayerNormBackwardKernelImplInternal has suboptimal performance for certain input sizes on AMD GPUs especially when fs (=config_m in our benchmark script) is large and bs (=config_n in our benchmark script) is small (commonly seen in [the CvT model](https://arxiv.org/abs/2103.15808)) in the benchmark script of https://github.com/pytorch/pytorch/pull/68238#issue-1051621716 on AMD GPUs.

This PR is to replace layer_norm_grad_input_kernel with the Apex cuComputeGradInput kernel with some ROCm-specific parameter tuning when fs (=config_m) is larger than or equal to `32768` on AMD GPUs. Some of the code changes in LayerNormBackwardKernelImplInternal are from another PR: https://github.com/pytorch/pytorch/pull/87635

We used the same benchmark script in the previous PR and tested the optimized kernel with various input shapes on AMD MI100 GPU.

**At [the previous PR](https://github.com/pytorch/pytorch/pull/87635):**
<html xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:x="urn:schemas-microsoft-com:office:excel"
xmlns="http://www.w3.org/TR/REC-html40">

<head>

<meta name=ProgId content=Excel.Sheet>
<meta name=Generator content="Microsoft Excel 15">
<link id=Main-File rel=Main-File
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip.htm">
<link rel=File-List
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip_filelist.xml">
<!--table
	{mso-displayed-decimal-separator:"\.";
	mso-displayed-thousand-separator:"\,";}
@page
	{mso-header-data:"&L&\0022Arial\0022&10&K0000FF \[AMD Official Use Only - General\]&1\#\000D";
	margin:.75in .7in .75in .7in;
	mso-header-margin:.3in;
	mso-footer-margin:.3in;}
tr
	{mso-height-source:auto;}
col
	{mso-width-source:auto;}
br
	{mso-data-placement:same-cell;}
td
	{padding-top:1px;
	padding-right:1px;
	padding-left:1px;
	mso-ignore:padding;
	color:black;
	font-size:11.0pt;
	font-weight:400;
	font-style:normal;
	text-decoration:none;
	font-family:Calibri, sans-serif;
	mso-font-charset:0;
	mso-number-format:General;
	text-align:general;
	vertical-align:bottom;
	border:none;
	mso-background-source:auto;
	mso-pattern:auto;
	mso-protection:locked visible;
	white-space:nowrap;
	mso-rotate:0;}
.xl65
	{color:windowtext;}
-->
</head>

<body link="#0563C1" vlink="#954F72">

M | N | fwd (half) | fwdbwd (half) | fwd (float) | fwdbwd (float)
-- | -- | -- | -- | -- | --
50432 | 384 | 0.38589 | 0.92603 | 0.38367 | 1.15148
50176 | 384 | 0.38719 | 0.91579 | 0.37815 | 1.13761
200704 | 192 | 0.99787 | 2.39954 | 0.98996 | 2.54284
802816 | 64 | 3.66525 | 7.96952 | 3.61293 | 7.69946
200 | 256 | 0.06578 | 0.34613 | 0.06966 | 0.35449
1000 | 256 | 0.07837 | 0.37631 | 0.07725 | 0.37758
6000 | 256 | 0.09318 | 0.3788 | 0.09202 | 0.37989
6272 | 256 | 0.08694 | 0.36267 | 0.08703 | 0.3615
200 | 512 | 0.06975 | 0.34506 | 0.06973 | 0.34208
1000 | 512 | 0.07012 | 0.36363 | 0.07307 | 0.36741
6000 | 512 | 0.09725 | 0.36251 | 0.09908 | 0.37078
6272 | 512 | 0.09899 | 0.36519 | 0.10068 | 0.37514
200 | 1024 | 0.07188 | 0.33896 | 0.0712 | 0.34683
1000 | 1024 | 0.07357 | 0.3625 | 0.0734 | 0.3598
6000 | 1024 | 0.12642 | 0.38949 | 0.12973 | 0.5035
6272 | 1024 | 0.12901 | 0.40759 | 0.13609 | 0.51871
200 | 1536 | 0.06998 | 0.34782 | 0.07419 | 0.3514
1000 | 1536 | 0.07987 | 0.37915 | 0.07888 | 0.37264
6000 | 1536 | 0.15401 | 0.47524 | 0.15416 | 0.68609
6272 | 1536 | 0.15286 | 0.48843 | 0.17681 | 0.72997
200 | 2048 | 0.07054 | 0.34791 | 0.07289 | 0.35138
1000 | 2048 | 0.07767 | 0.37954 | 0.08554 | 0.37464
6000 | 2048 | 0.18744 | 0.5811 | 0.25004 | 0.93338
6272 | 2048 | 0.20037 | 0.63398 | 0.26918 | 0.97018
200 | 3072 | 0.07687 | 0.36739 | 0.08917 | 0.37845
1000 | 3072 | 0.09323 | 0.38901 | 0.09739 | 0.39823
6000 | 3072 | 0.24314 | 0.89029 | 0.38093 | 1.30719
6272 | 3072 | 0.26079 | 0.92023 | 0.38352 | 1.51012
128 | 2097152 | 6.17775 | 23.876 | 10.27952 | 30.10848
256 | 1048576 | 4.51855 | 19.47637 | 10.07609 | 29.42678
512 | 524288 | 4.13615 | 18.80888 | 10.07853 | 32.29804
1024 | 262144 | 4.47397 | 17.88388 | 9.50367 | 31.15699
2048 | 131072 | 4.2458 | 16.70852 | 9.17979 | 30.51708
4096 | 65536 | 4.24412 | 16.43098 | 8.97651 | 30.1617
8192 | 32768 | 4.24556 | 16.09038 | 8.77001 | 30.3643
16384 | 16384 | 4.14642 | 15.80355 | 8.82402 | 30.35291
32768 | 8192 | 4.12599 | 15.68897 | 8.82605 | 30.43423

</body>

</html>

----

**At this PR:**

<html xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:x="urn:schemas-microsoft-com:office:excel"
xmlns="http://www.w3.org/TR/REC-html40">

<head>

<meta name=ProgId content=Excel.Sheet>
<meta name=Generator content="Microsoft Excel 15">
<link id=Main-File rel=Main-File
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip.htm">
<link rel=File-List
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip_filelist.xml">
<!--table
	{mso-displayed-decimal-separator:"\.";
	mso-displayed-thousand-separator:"\,";}
@page
	{mso-header-data:"&L&\0022Arial\0022&10&K0000FF \[AMD Official Use Only - General\]&1\#\000D";
	margin:.75in .7in .75in .7in;
	mso-header-margin:.3in;
	mso-footer-margin:.3in;}
tr
	{mso-height-source:auto;}
col
	{mso-width-source:auto;}
br
	{mso-data-placement:same-cell;}
td
	{padding-top:1px;
	padding-right:1px;
	padding-left:1px;
	mso-ignore:padding;
	color:black;
	font-size:11.0pt;
	font-weight:400;
	font-style:normal;
	text-decoration:none;
	font-family:Calibri, sans-serif;
	mso-font-charset:0;
	mso-number-format:General;
	text-align:general;
	vertical-align:bottom;
	border:none;
	mso-background-source:auto;
	mso-pattern:auto;
	mso-protection:locked visible;
	white-space:nowrap;
	mso-rotate:0;}
.xl65
	{color:windowtext;}
.xl66
	{background:yellow;
	mso-pattern:black none;}
-->
</head>

<body link="#0563C1" vlink="#954F72">

M | N | fwd (half) | fwdbwd (half) | fwd (float) | fwdbwd (float)
-- | -- | -- | -- | -- | --
50432 | 384 | 0.38667 | 0.84133 | 0.37916 | 1.01222
50176 | 384 | 0.3814 | 0.87266 | 0.37858 | 1.04399
200704 | 192 | 0.99902 | 2.14386 | 0.98973 | 2.33265
802816 | 64 | 3.66578 | 6.85376 | 3.6092 | 7.00331
200 | 256 | 0.06607 | 0.34176 | 0.07009 | 0.34548
1000 | 256 | 0.06947 | 0.36461 | 0.07902 | 0.37851
6000 | 256 | 0.09319 | 0.37432 | 0.09342 | 0.36927
6272 | 256 | 0.09544 | 0.37565 | 0.09476 | 0.37377
200 | 512 | 0.07935 | 0.364 | 0.07891 | 0.36894
1000 | 512 | 0.07676 | 0.37552 | 0.07957 | 0.37564
6000 | 512 | 0.10472 | 0.37504 | 0.1051 | 0.38782
6272 | 512 | 0.1069 | 0.36662 | 0.10062 | 0.38506
200 | 1024 | 0.07793 | 0.36561 | 0.08023 | 0.35019
1000 | 1024 | 0.07426 | 0.36729 | 0.07345 | 0.35851
6000 | 1024 | 0.12729 | 0.39219 | 0.12974 | 0.51526
6272 | 1024 | 0.13622 | 0.41627 | 0.14252 | 0.52926
200 | 1536 | 0.07615 | 0.36621 | 0.0797 | 0.3695
1000 | 1536 | 0.08327 | 0.38174 | 0.07938 | 0.37573
6000 | 1536 | 0.14894 | 0.46197 | 0.15268 | 0.63814
6272 | 1536 | 0.15368 | 0.48818 | 0.16309 | 0.71441
200 | 2048 | 0.06935 | 0.36691 | 0.07258 | 0.35548
1000 | 2048 | 0.07738 | 0.36388 | 0.08036 | 0.36452
6000 | 2048 | 0.18757 | 0.58573 | 0.23701 | 0.92915
6272 | 2048 | 0.1938 | 0.61628 | 0.26475 | 0.96896
200 | 3072 | 0.07884 | 0.3673 | 0.07724 | 0.37869
1000 | 3072 | 0.09342 | 0.38193 | 0.09822 | 0.38646
6000 | 3072 | 0.24452 | 0.86776 | 0.38251 | 1.3036
6272 | 3072 | 0.25971 | 0.91053 | 0.38744 | 1.39039
128 | 2097152 | 6.06752 | 23.26379 | 9.87466 | 29.81851
256 | 1048576 | 4.50336 | 19.4614 | 10.11239 | 29.25554
512 | 524288 | 4.12649 | 18.72831 | 10.054 | 32.26784
1024 | 262144 | 4.40855 | 17.77993 | 9.38856 | 31.18679
2048 | 131072 | 4.18716 | 16.74615 | 9.14487 | 30.24603
4096 | 65536 | 4.17374 | 16.34444 | 8.94894 | 30.0326
8192 | 32768 | 4.19095 | 16.05751 | 8.70358 | 30.14669
16384 | 16384 | 4.15404 | 15.83771 | 8.80042 | 30.5022
32768 | 8192 | 4.12515 | 15.5657 | 8.66138 | 28.87386

</body>

</html>

---

**Performance Improvement (%)**

<html xmlns:v="urn:schemas-microsoft-com:vml"
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns:x="urn:schemas-microsoft-com:office:excel"
xmlns="http://www.w3.org/TR/REC-html40">

<head>

<meta name=ProgId content=Excel.Sheet>
<meta name=Generator content="Microsoft Excel 15">
<link id=Main-File rel=Main-File
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip.htm">
<link rel=File-List
href="file:///C:/Users/hubertlu/AppData/Local/Temp/msohtmlclip1/01/clip_filelist.xml">
<!--table
	{mso-displayed-decimal-separator:"\.";
	mso-displayed-thousand-separator:"\,";}
@page
	{mso-header-data:"&L&\0022Arial\0022&10&K0000FF \[AMD Official Use Only - General\]&1\#\000D";
	margin:.75in .7in .75in .7in;
	mso-header-margin:.3in;
	mso-footer-margin:.3in;}
tr
	{mso-height-source:auto;}
col
	{mso-width-source:auto;}
br
	{mso-data-placement:same-cell;}
td
	{padding-top:1px;
	padding-right:1px;
	padding-left:1px;
	mso-ignore:padding;
	color:black;
	font-size:11.0pt;
	font-weight:400;
	font-style:normal;
	text-decoration:none;
	font-family:Calibri, sans-serif;
	mso-font-charset:0;
	mso-number-format:General;
	text-align:general;
	vertical-align:bottom;
	border:none;
	mso-background-source:auto;
	mso-pattern:auto;
	mso-protection:locked visible;
	white-space:nowrap;
	mso-rotate:0;}
.xl65
	{color:windowtext;}
.xl66
	{mso-number-format:"0\.000";}
-->
</head>

<body link="#0563C1" vlink="#954F72">

M | N | fwdbwd, torch.float16 | fwdbwd, torch.float32
-- | -- | -- | --
50432 | 384 | 9.147 | 12.094
50176 | 384 | 4.710 | 8.230
200704 | 192 | 10.655 | 8.266
802816 | 64 | 14.000 | 9.042
200 | 256 | 1.263 | 2.542
1000 | 256 | 3.109 | -0.246
6000 | 256 | 1.183 | 2.796
6272 | 256 | -3.579 | -3.394
200 | 512 | -5.489 | -7.852
1000 | 512 | -3.270 | -2.240
6000 | 512 | -3.456 | -4.596
6272 | 512 | -0.392 | -2.644
200 | 1024 | -7.862 | -0.969
1000 | 1024 | -1.321 | 0.359
6000 | 1024 | -0.693 | -2.336
6272 | 1024 | -2.130 | -2.034
200 | 1536 | -5.287 | -5.151
1000 | 1536 | -0.683 | -0.829
6000 | 1536 | 2.792 | 6.989
6272 | 1536 | 0.051 | 2.132
200 | 2048 | -5.461 | -1.167
1000 | 2048 | 4.126 | 2.701
6000 | 2048 | -0.797 | 0.453
6272 | 2048 | 2.792 | 0.126
200 | 3072 | 0.024 | -0.063
1000 | 3072 | 1.820 | 2.956
6000 | 3072 | 2.531 | 0.275
6272 | 3072 | 1.054 | 7.929
128 | 2097152 | 2.564 | 0.963
256 | 1048576 | 0.077 | 0.582
512 | 524288 | 0.428 | 0.094
1024 | 262144 | 0.581 | -0.096
2048 | 131072 | -0.225 | 0.888
4096 | 65536 | 0.527 | 0.428
8192 | 32768 | 0.204 | 0.717
16384 | 16384 | -0.216 | -0.492
32768 | 8192 | 0.786 | 5.127

</body>

</html>

CC: @jeffdaily

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87726
Approved by: https://github.com/ngimel
---
 .../src/ATen/native/cuda/layer_norm_kernel.cu | 132 ++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
index 693524818fb43..3fb041c61d454 100644
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@@ -1029,6 +1029,110 @@ void cuComputeGradGammaBeta(
     }
 }
 
+template<typename T, typename T_ACC> __global__
+void cuComputeGradInput(
+    const T* __restrict__ dout,
+    const T* __restrict__ input,
+    const int64_t M,
+    const int64_t N,
+    const T_ACC* __restrict__ mean,
+    const T_ACC* __restrict__ rstd,
+    const T* gamma,
+    T* grad_input)
+{
+  for (int i1=blockIdx.y; i1 < M; i1 += gridDim.y) {
+    T_ACC sum_loss1 = T_ACC(0);
+    T_ACC sum_loss2 = T_ACC(0);
+    T_ACC c_mean = mean[i1];
+    const T_ACC c_rstd = rstd[i1];
+    const T* k_input = input + i1*N;
+    const T* k_dout = dout + i1*N;
+    const int numx = blockDim.x * blockDim.y;
+    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
+    if (gamma != NULL) {
+      // Optimization for ROCm MI100
+      for( int l = 0; l < N ; l += numx) {
+        int idx = l + thrx;
+        const T_ACC gamma_idx = static_cast<T_ACC>((idx<N) ? gamma[idx] : T(0));
+        const T_ACC c_h = static_cast<T_ACC>((idx<N) ? k_input[idx] : T(0));
+        const T_ACC c_loss = static_cast<T_ACC>((idx<N) ? k_dout[idx] : T(0));
+        sum_loss1 += c_loss * gamma_idx;
+        sum_loss2 += c_loss * gamma_idx * (c_h - c_mean) * c_rstd;
+      }
+    } else {
+      for( int l = 0; l < N ; l += numx) {
+        int idx = l + thrx;
+        const T_ACC c_h = static_cast<T_ACC>((idx<N) ? k_input[idx] : T(0));
+        const T_ACC c_loss = static_cast<T_ACC>((idx<N) ? k_dout[idx] : T(0));
+        sum_loss1 += c_loss;
+        sum_loss2 += c_loss * (c_h - c_mean) * c_rstd;
+      }
+    }
+    // intra-warp reductions
+    for (int mask = blockDim.x/2;  mask > 0;  mask /= 2) {
+      sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask);
+      sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask);
+    }
+    // inter-warp reductions
+    if (blockDim.y > 1) {
+      alignas(sizeof(double)) extern __shared__ char shared[];
+      T_ACC * buf = reinterpret_cast<T_ACC*>(&shared);
+      for (int offset = blockDim.y/2;  offset > 0;  offset /= 2) {
+        // upper half of warps write to shared
+        if (threadIdx.y >= offset && threadIdx.y < 2*offset) {
+          const int wrt_i = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
+          buf[2*wrt_i] = sum_loss1;
+          buf[2*wrt_i+1] = sum_loss2;
+        }
+        __syncthreads();
+        // lower half merges
+        if (threadIdx.y < offset) {
+          const int read_i = threadIdx.y * blockDim.x + threadIdx.x;
+          sum_loss1 += buf[2*read_i];
+          sum_loss2 += buf[2*read_i+1];
+        }
+        __syncthreads();
+      }
+      if (threadIdx.y == 0) {
+        buf[2*threadIdx.x] = sum_loss1;
+        buf[2*threadIdx.x+1] = sum_loss2;
+      }
+      __syncthreads();
+      if (threadIdx.y !=0) {
+        sum_loss1 = buf[2*threadIdx.x];
+        sum_loss2 = buf[2*threadIdx.x+1];
+      }
+    }
+    // all threads now have the two sums over l
+    T_ACC fH = (T_ACC)N;
+    T_ACC term1 = (T_ACC(1) / fH) * c_rstd;
+    T* k_grad_input = grad_input + i1*N;
+    if (gamma != NULL) {
+      for (int l = thrx;  l < N;  l+=numx) {
+        const T_ACC c_h = static_cast<T_ACC>(k_input[l]);
+        const T_ACC c_loss = static_cast<T_ACC>(k_dout[l]);
+        T_ACC f_grad_input = fH * c_loss * gamma[l];
+        f_grad_input -= sum_loss1;
+        f_grad_input -= (c_h - c_mean) * c_rstd * sum_loss2;
+        f_grad_input *= term1;
+        k_grad_input[l] = static_cast<T>(f_grad_input);
+      }
+    } else {
+      for (int l = thrx;  l < N;  l+=numx) {
+        const T_ACC c_h = static_cast<T_ACC>(k_input[l]);
+        const T_ACC c_loss = static_cast<T_ACC>(k_dout[l]);
+        T_ACC f_grad_input = fH * c_loss;
+        f_grad_input -= sum_loss1;
+        f_grad_input -= (c_h - c_mean) * c_rstd * sum_loss2;
+        f_grad_input *= term1;
+        k_grad_input[l] = static_cast<T>(f_grad_input);
+      }
+    }
+    // prevent race where buf is written again before reads are done
+    __syncthreads();
+  }
+}
+
 template <typename T>
 void LayerNormBackwardKernelImplInternal(
     const Tensor& dY,
@@ -1059,11 +1163,39 @@ void LayerNormBackwardKernelImplInternal(
   cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
   const int warp_size = at::cuda::warp_size();
   if (dX_data != nullptr) {
+#if defined __HIP_PLATFORM_HCC__
+    if (M >= 32768) {
+      const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
+      const dim3 blocks1(1, std::min((uint64_t)M, maxGridY), 1);
+      dim3 threads1(warp_size, 4, 1);
+      threads1.y = 2; // Optimization for ROCm
+      int nshared =
+              threads1.y > 1 ?
+              threads1.y*threads1.x*sizeof(T_ACC) :
+              0;
+      cuComputeGradInput<<<blocks1, threads1, nshared, cuda_stream>>>(
+              dY_data,
+              X_data,
+              M, N,
+              mean_data,
+              rstd_data,
+              gamma_data,
+              dX_data);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+    } else {
+      const dim3 blocks(M);
+      int nshared = (num_threads()/warp_size) * sizeof(T_ACC);
+      layer_norm_grad_input_kernel<<<blocks, num_threads(), nshared, cuda_stream>>>(dY_data,
+      X_data, mean_data, rstd_data, gamma_data, dX_data, N);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    }
+#else
     const dim3 blocks(M);
     int nshared = (num_threads()/warp_size) * sizeof(T_ACC);
     layer_norm_grad_input_kernel<<<blocks, num_threads(), nshared, cuda_stream>>>(dY_data,
     X_data, mean_data, rstd_data, gamma_data, dX_data, N);
     C10_CUDA_KERNEL_LAUNCH_CHECK();
+#endif
   }
 
   if (dgamma->defined() || dbeta->defined()) {

From 390b7ac98e17bd23fe3a9135d22fe0936fd24c9e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 28 Nov 2022 14:57:42 +0000
Subject: [PATCH 1323/1922] Refactor how AOTAutograd backends are defined
 (#89736)

There was a lot of strangeness in how AOTAutograd backends were previously defined. This refactor replaces the strangeness with something simple and straightforward. The improvements:

- There is no longer a footgun aot_autograd "backend" which doesn't actually work. No more mistyping `torch._dynamo.optimize("aot_autograd")` when you meant "aot_eager"
- Deleted aot_print because it's annoying and anyway there's no uses of it
- Instead of having BOTH the backend Subgraph and AotAutogradStrategy, there is now only an aot_autograd function which takes the kwargs to configure AOTAutograd, and then gives you a compiler function that does AOTAutograd given those kwargs. Easy.
- The primary downside is that we are now eagerly populating all of the kwargs, and that can get us into import cycle shenanigans. Some cycles I resolved directly (e.g., we now no longer manually disable the forward function before passing it to aot_autograd; aot_autograd it does it for us), but for getting inductor decompositions I had to make it take a lambda so I could lazily populate the decomps later.

New code is 130 lines shorter!

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89736
Approved by: https://github.com/anjali411, https://github.com/albanD
---
 functorch/_src/aot_autograd.py          |   3 -
 test/test_testing.py                    |   1 -
 torch/_dynamo/eval_frame.py             |   3 +-
 torch/_dynamo/optimizations/backends.py |  16 --
 torch/_dynamo/optimizations/training.py | 310 ++++++++----------------
 torch/_inductor/compile_fx.py           |  13 +-
 torch/cuda/_dynamo_graphs.py            | 159 ------------
 7 files changed, 112 insertions(+), 393 deletions(-)
 delete mode 100644 torch/cuda/_dynamo_graphs.py

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index 4336cbf0df509..2f3d8bb99f169 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -15,7 +15,6 @@
 import torch.utils._pytree as pytree
 import torch.utils.dlpack
 from torch import Tensor
-from torch._dynamo import disable as disable_torchdynamo
 from torch._dynamo.utils import dynamo_timed
 from torch._subclasses import FakeTensorMode, CrossRefFakeMode
 from torch.fx import immutable_collections, Interpreter
@@ -1315,7 +1314,6 @@ class CompiledFunction(torch.autograd.Function):
         fw_metadata = _fw_metadata
 
         @staticmethod
-        @disable_torchdynamo
         def forward(ctx, *deduped_flat_tensor_args):
 
             # There is a pretty complicated calling convention around what the compiled fw returns.
@@ -1361,7 +1359,6 @@ def forward(ctx, *deduped_flat_tensor_args):
             return tuple(fw_outs[0:num_forward_returns])
 
         @staticmethod
-        @disable_torchdynamo
         def backward(ctx, *all_flat_args):
             # Calling convention: we expect a grad_out passed to the backward:
             # - for every output of the fw that does *not* alias an input
diff --git a/test/test_testing.py b/test/test_testing.py
index 6dc06a8a2aeb8..821a30ab432b2 100644
--- a/test/test_testing.py
+++ b/test/test_testing.py
@@ -1788,7 +1788,6 @@ def test_circular_dependencies(self) -> None:
                            "torch.contrib.",  # something weird
                            "torch.testing._internal.distributed.",  # just fails
                            "torch.ao.pruning._experimental.",  # depends on pytorch_lightning, not user-facing
-                           "torch.cuda._dynamo_graphs",  # depends on torchdynamo
                            ]
         # See https://github.com/pytorch/pytorch/issues/77801
         if not sys.version_info >= (3, 9):
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index c6d8781922cec..a04bc72aa6cbb 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -37,7 +37,6 @@
 from . import config, convert_frame, skipfiles, utils
 from .exc import ResetRequired
 from .mutation_guard import install_generation_tagging_init
-from .optimizations.distributed import DDPOptimizer
 from .output_graph import CompilerFn
 from .types import DynamoCallback
 from .utils import compile_times
@@ -311,6 +310,8 @@ def catch_errors(frame, cache_size):
             ddp_module = DistributedDataParallel._get_active_ddp_module()
             if ddp_module:
                 with compile_lock:
+                    from .optimizations.distributed import DDPOptimizer
+
                     ddp_optimizer = DDPOptimizer(
                         bucket_bytes_cap=ddp_module.bucket_bytes_cap,
                         backend_compile_fn=callback._torchdynamo_orig_callable,
diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 256618dd5aafa..0df57eb4273d1 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -517,22 +517,6 @@ def run(*new_inputs):
     return run
 
 
-@create_backend
-def aot_autograd(subgraph, **kwargs):
-    def _wrapped_bw_compiler(*args, **kwargs):
-        # stop TorchDynamo from trying to compile our generated backwards pass
-        return disable(disable(bw_compiler)(*args, **kwargs))
-
-    bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"]
-    kwargs["bw_compiler"] = _wrapped_bw_compiler
-
-    from functorch.compile import aot_module_simplified
-
-    from .. import disable
-
-    return aot_module_simplified(subgraph.model, **kwargs)
-
-
 def tvm_compile(jit_mod, example_inputs, log_file=None, **kwargs):
     if jit_mod is None:
         return None
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index c42c10c30a525..8ecdc1d195f95 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -6,6 +6,15 @@
 from importlib import import_module
 from typing import Set
 
+from functorch._src.compilers import debug_nop
+
+from functorch.compile import (
+    aot_module_simplified,
+    min_cut_rematerialization_partition,
+    nop,
+    ts_compile,
+)
+
 import torch
 from torch.fx import GraphModule
 from torch.fx.passes.backends.cudagraphs import partition_cudagraphs
@@ -13,7 +22,7 @@
 from torch.nn import Module
 from torch.utils._pytree import tree_map
 
-from .. import config
+from .. import config, eval_frame
 from ..utils import clone_inputs, count_calls, counters
 from .analysis import has_mutation
 from .backends import BACKENDS
@@ -22,6 +31,62 @@
 log = logging.getLogger(__name__)
 
 
+def aot_autograd(**kwargs):
+    def compiler_fn(gm: torch.fx.GraphModule, example_inputs):
+        import functorch.compile
+
+        # Hack to get around circular import problems with aot_inductor_debug
+        if callable(kwargs.get("decompositions")):
+            kwargs["decompositions"] = kwargs["decompositions"]()
+
+        # TODO: stop monkeypatching here (without even cleaning up, UGH!)
+        functorch.compile.config.use_functionalize = True
+        functorch.compile.config.use_fake_tensor = True
+
+        force_compile_tiny_graphs = kwargs.pop("force_compile_tiny_graphs", False)
+
+        if count_calls(gm.graph) < 2 and not force_compile_tiny_graphs:
+            return gm  # no point for tiny graphs
+
+        counters["aot_autograd"]["total"] += 1
+        use_fallback = False
+
+        if not functorch.compile.config.use_functionalize and config.normalize_ir:
+            try:
+                gm = normalize_ir(gm, clone_inputs(example_inputs))
+            except Exception:
+                log.debug("TorchDynamo unable to remove mutation")
+                use_fallback = True
+
+        # NB: no clone here on example inputs
+        if not is_aot_autograd_safe_to_run(gm, example_inputs):
+            use_fallback = True
+
+        if use_fallback:
+            log.debug("Unable to use AOT Autograd because graph has mutation")
+            counters["aot_autograd"]["not_ok"] += 1
+            return gm
+
+        # OK attempt to compile
+
+        def _wrapped_bw_compiler(*args, **kwargs):
+            # stop TorchDynamo from trying to compile our generated backwards pass
+            return eval_frame.disable(eval_frame.disable(bw_compiler)(*args, **kwargs))
+
+        bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"]
+        kwargs["bw_compiler"] = _wrapped_bw_compiler
+
+        try:
+            cg = aot_module_simplified(gm, **kwargs)
+            counters["aot_autograd"]["ok"] += 1
+            return eval_frame.disable(cg)
+        except Exception:
+            counters["aot_autograd"]["not_ok"] += 1
+            raise
+
+    return compiler_fn
+
+
 def is_aot_autograd_safe_to_run(gm, example_inputs):
     """
     There are some known issues with Aot Autograd. This is a workaround to catch
@@ -86,107 +151,28 @@ def raise_or_warn(reason):
     return True
 
 
-class AotAutogradStrategy(object):
-    """Base class for backend strategies that use AOT Autograd"""
-
-    @classmethod
-    def compile_fn(cls, gm: torch.fx.GraphModule, example_inputs):
-        if count_calls(gm.graph) < 2:
-            return gm  # no point for tiny graphs
-        return cls(gm, example_inputs).verified_candidate()
-
-    def __init__(self, gm: torch.fx.GraphModule, example_inputs):
-        import functorch.compile
-
-        functorch.compile.config.use_functionalize = True
-        functorch.compile.config.use_fake_tensor = True
-
-        super(AotAutogradStrategy, self).__init__()
-        counters["aot_autograd"]["total"] += 1
-        self.use_fallback = False
-        self.original_example_inputs = example_inputs
-        self.gm = gm
-
-        if not functorch.compile.config.use_functionalize and config.normalize_ir:
-            try:
-                self.gm = normalize_ir(gm, self.example_inputs)
-            except Exception:
-                log.debug("TorchDynamo unable to remove mutation")
-                self.use_fallback = True
-                pass
-
-        if not is_aot_autograd_safe_to_run(gm, example_inputs):
-            self.use_fallback = True
-
-    @property
-    def example_inputs(self):
-        return clone_inputs(self.original_example_inputs)
-
-    def verified_candidate(self):
-        if self.use_fallback:
-            log.debug("Unable to use AOT Autograd because graph has mutation")
-            counters["aot_autograd"]["not_ok"] += 1
-            return self.gm
-        cg = self.candidate()
-        if cg is None:
-            counters["aot_autograd"]["not_ok"] += 1
-            raise RuntimeError("AOT Autograd failed to compile")
-        counters["aot_autograd"]["ok"] += 1
-        return cg
-
-    def candidate(self):
-        raise NotImplementedError()
-
-
-class AotNop(AotAutogradStrategy):
-    """Useful for debugging purpose"""
-
-    def candidate(self):
-        from functorch._src.compilers import debug_nop
-        from functorch.compile import nop
-
-        DEBUG = False
-        return BACKENDS["aot_autograd"](
-            self.gm, self.example_inputs, fw_compiler=debug_nop if DEBUG else nop
-        )  # type: ignore[call-arg]
-
-
-aot_eager = AotNop.compile_fn
-
-
-class AotTorchscript(AotAutogradStrategy):
-    """
-    AOT Autograd with torchscript backend. Default partitioner.
-    """
-
-    def candidate(self):
-        from functorch.compile import ts_compile
-
-        return BACKENDS["aot_autograd"](
-            self.gm, self.example_inputs, fw_compiler=ts_compile
-        )  # type: ignore[call-arg]
-
-
-aot_ts = AotTorchscript.compile_fn
+DEBUG = False
 
-# Global counter to differentiate between different graphs.
-graph_idx = 0
+# Useful for debugging purpose
+aot_eager = aot_autograd(fw_compiler=debug_nop if DEBUG else nop)
 
+# AOT Autograd with torchscript backend. Default partitioner.
+aot_ts = aot_autograd(fw_compiler=ts_compile)
 
-class AotPrint(AotNop):
-    """Saves all the gm models so that we can run them separately"""
-
-    def candidate(self):
-        global graph_idx
-        module_idx = "module_" + str(graph_idx)
-        self.gm.to_folder(module_idx, "Bar")
-        for idx, x in enumerate(self.example_inputs):
-            torch.save(x, module_idx + "_tensor" + str(idx) + ".pt")
-        graph_idx += 1
-        return super(AotPrint, self).candidate()
-
-
-aot_print = AotPrint.compile_fn
+# Uses TorchInductor AOT Autograd decomps and partitioner to isolate aot vs
+# inductor problems.
+aot_inductor_debug = aot_autograd(
+    # these are taken from memory_efficient_fusion()
+    fw_compiler=nop,
+    bw_compiler=nop,
+    # NB: lambda here is to delay import of inductor
+    decompositions=lambda: import_module(
+        f"{config.inductor_import}.compile_fx"
+    ).select_decomp_table(),
+    partition_fn=functools.partial(
+        min_cut_rematerialization_partition, compiler="inductor"
+    ),
+)
 
 
 def mem_efficient_fusion_kwargs(use_decomps):
@@ -209,66 +195,15 @@ def mem_efficient_fusion_kwargs(use_decomps):
     return kwargs
 
 
-class AotMemEfficientFusion(AotAutogradStrategy):
-    """Use Min cut rematerilization and TorchScript+nvFuser with AOT Autograd"""
-
-    def candidate(self):
-        kwargs = mem_efficient_fusion_kwargs(use_decomps=True)
-        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, **kwargs)  # type: ignore[call-arg]
-
-
-class AotMemEfficientFusionNoDecomps(AotAutogradStrategy):
-    """Use Min cut rematerilization and TorchScript+nvFuser with AOT Autograd"""
-
-    def candidate(self):
-        kwargs = mem_efficient_fusion_kwargs(use_decomps=False)
-        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, **kwargs)  # type: ignore[call-arg]
-
-
-class AotInductorDebug(AotAutogradStrategy):
-    """
-    Uses TorchInductor Aot Autograd decopms and partitioner to isolate aot vs
-    inductor problems.
-    """
-
-    def candidate(self):
-        from functorch.compile import min_cut_rematerialization_partition, nop
-
-        decompositions = import_module(
-            f"{config.inductor_import}.compile_fx"
-        ).select_decomp_table()
+# Use min cut rematerialization and TorchScript+nvFuser with AOT Autograd
+aot_mem_efficient_fusion = aot_autograd(**mem_efficient_fusion_kwargs(use_decomps=True))
+aot_mem_efficient_fusion_no_decomp = aot_autograd(
+    **mem_efficient_fusion_kwargs(use_decomps=False)
+)
 
-        kwargs = {
-            # these are taken from memory_efficient_fusion()
-            "fw_compiler": nop,
-            "bw_compiler": nop,
-            "decompositions": decompositions,
-            "partition_fn": functools.partial(
-                min_cut_rematerialization_partition, compiler="inductor"
-            ),
-        }
-        return BACKENDS["aot_autograd"](self.gm, self.example_inputs, **kwargs)  # type: ignore[call-arg]
-
-
-aot_inductor_debug = AotInductorDebug.compile_fn
-
-
-class AOTMemEfficientFusionWithContext:
-    """Pass TorchScript+nvFuser context to TorchDynamo"""
-
-    def __init__(self, use_decomps=True):
-        self.backend_ctx_ctor = lambda: torch.jit.fuser("fuser2")
-        self.use_decomps = use_decomps
-
-    def __call__(self, gm: torch.fx.GraphModule, example_inputs):
-        if self.use_decomps:
-            return AotMemEfficientFusion.compile_fn(gm, example_inputs)
-        else:
-            return AotMemEfficientFusionNoDecomps.compile_fn(gm, example_inputs)
-
-
-aot_mem_efficient_fusion = AOTMemEfficientFusionWithContext(True)
-aot_mem_efficient_fusion_no_decomp = AOTMemEfficientFusionWithContext(False)
+# Pass TorchScript+nvFuser context to TorchDynamo
+aot_mem_efficient_fusion.backend_ctx_ctor = lambda: torch.jit.fuser("fuser2")
+aot_mem_efficient_fusion_no_decomp.backend_ctx_ctor = lambda: torch.jit.fuser("fuser2")
 
 
 def prims_executor(gm, inputs, *, executor):
@@ -332,27 +267,15 @@ def func(primals, tangents):
 
 
 def create_nvprims_backend(*, executor):
-    class NvPrims(AotAutogradStrategy):
-        def __init__(self, gm: torch.fx.GraphModule, example_inputs):
-            super(NvPrims, self).__init__(gm, example_inputs)
-            self.executor = executor
-
-        def candidate(self):
-            from torch._dynamo import disable
-
-            return BACKENDS["aot_autograd"](
-                self.gm,
-                self.example_inputs,
-                fw_compiler=partial(prims_executor, executor=self.executor),
-                bw_compiler=partial(prims_executor, executor=self.executor),
-                partition_fn=disable(nvprims_fw_bw_partition_fn),
-            )  # type: ignore[call-arg]
-
-    return NvPrims
+    return aot_autograd(
+        fw_compiler=partial(prims_executor, executor=executor),
+        bw_compiler=partial(prims_executor, executor=executor),
+        partition_fn=nvprims_fw_bw_partition_fn,
+    )
 
 
-aot_nvprims_nvfuser = create_nvprims_backend(executor="nvfuser").compile_fn
-aot_nvprims_aten = create_nvprims_backend(executor="aten").compile_fn
+aot_nvprims_nvfuser = create_nvprims_backend(executor="nvfuser")
+aot_nvprims_aten = create_nvprims_backend(executor="aten")
 
 
 def cloner(t):
@@ -476,33 +399,7 @@ def cudagraphs(model, inputs):
     return model
 
 
-def raw_aot_autograd_cudagraphs(model, inputs):
-    kwargs = {
-        # these are taken from memory_efficient_fusion()
-        "fw_compiler": cudagraphs,
-        "bw_compiler": cudagraphs,
-    }
-
-    def _wrapped_bw_compiler(*args, **kwargs):
-        # stop TorchDynamo from trying to compile our generated backwards pass
-        return disable(disable(bw_compiler)(*args, **kwargs))  # type: ignore[operator]
-
-    bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"]
-    kwargs["bw_compiler"] = _wrapped_bw_compiler
-
-    from functorch.compile import aot_module_simplified  # type: ignore[import]
-
-    from .. import disable
-
-    return aot_module_simplified(model, **kwargs)
-
-
-class AotAutogradCudaGraphs(AotAutogradStrategy):
-    def candidate(self):
-        return raw_aot_autograd_cudagraphs(self.gm, self.example_inputs)
-
-
-aot_cudagraphs = AotAutogradCudaGraphs.compile_fn
+aot_cudagraphs = aot_autograd(fw_compiler=cudagraphs, bw_compiler=cudagraphs)
 
 
 def create_aot_backends():
@@ -512,11 +409,6 @@ def create_aot_backends():
     # aot_eager uses AOT Autograd backend with nop compiler. It is helpful in debugging.
     BACKENDS["aot_eager"] = aot_eager
 
-    # aot_eager uses AOT Autograd backend with print compiler. It prints the
-    # graphs and also saves the graph modules that are sent to AOT Autograd.
-    # This is helpful for debugging.
-    BACKENDS["aot_print"] = aot_print
-
     # aot_ts uses torchscript backend. We can use this with both nnc and nvfuser
     # by using the relevant fuser with torch.jit.fuser(...)
     BACKENDS["aot_ts"] = aot_ts
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index c482e55a954da..3472f0e2efec1 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -27,7 +27,7 @@
 log = logging.getLogger(__name__)
 ALIGNMENT = 16
 
-aot_autograd = dynamo_optimizations.backends.aot_autograd
+aot_autograd = dynamo_optimizations.training.aot_autograd
 normalize_ir = dynamo_optimizations.normalize.normalize_ir
 is_aot_autograd_safe_to_run = dynamo_optimizations.training.is_aot_autograd_safe_to_run
 count_calls = dynamo_utils.count_calls
@@ -394,12 +394,17 @@ def bw_compiler(model: torch.fx.GraphModule, example_inputs):
         # in functorch/_src/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
         # once torchdynamo is merged into pytorch
         return aot_autograd(
-            model_,
-            example_inputs_,
             fw_compiler=fw_compiler,
             bw_compiler=bw_compiler,
             decompositions=select_decomp_table(),
             partition_fn=functools.partial(
                 min_cut_rematerialization_partition, compiler="inductor"
             ),
-        )
+            # A "tiny" graph can actually decompose into multiple
+            # operators (if it's a decomposition) and inductor can
+            # do a better job on it in this case
+            #
+            # Also, for some reason, test_comprehensive___rmatmul___cpu
+            # fails without forcing a compile lol.
+            force_compile_tiny_graphs=True,
+        )(model_, example_inputs_)
diff --git a/torch/cuda/_dynamo_graphs.py b/torch/cuda/_dynamo_graphs.py
deleted file mode 100644
index 1b2211ed32b23..0000000000000
--- a/torch/cuda/_dynamo_graphs.py
+++ /dev/null
@@ -1,159 +0,0 @@
-import torch
-from torch.fx import GraphModule
-from torch.nn import Module
-from torch.fx.passes.backends.cudagraphs import partition_cudagraphs
-from torch.multiprocessing.reductions import StorageWeakRef
-from torch.utils._pytree import tree_map
-import torch._dynamo  # type: ignore[import]
-from torch._dynamo.optimizations.training import AotAutogradStrategy  # type: ignore[import]
-
-import operator
-from collections import defaultdict
-from typing import Set, Dict, Any
-
-# TODO: maybe this should live in torch._dynamo instead
-
-__all__ = ['aot_autograd_cudagraphs']
-
-def cloner(t):
-    if isinstance(t, torch.Tensor):
-        return t.clone()
-    else:
-        return t
-
-
-class CudaGraphModule(Module):
-    gm: GraphModule
-    mutated_inputs: Set[int]
-
-    def __init__(self, gm, mutated_inputs):
-        super().__init__()
-        self.gm = gm
-        self.mutated_inputs = mutated_inputs
-
-    warmed_up = False
-
-    # these are all None or all filled
-    graph = None
-    static_inputs = None
-    static_outputs = None
-
-    # NB: we override __call__ as we don't need any nn.Module machinery
-    # and to reduce overhead
-    def __call__(self, *args):
-        # TODO: once we've recorded here, we'd like to replace the __call__
-        # implementation with compiled bytecode that copies into static, replays
-        # the cuda graph, then copies out.  First condition is the hotpath,
-        # needs optimizing
-        if self.graph is not None:
-            assert len(args) == len(self.static_inputs)
-            for dst, src in zip(self.static_inputs, args):
-                dst.copy_(src)
-            self.graph.replay()
-            for i in self.mutated_inputs:
-                args[i].copy_(self.static_inputs[i])
-            return tree_map(cloner, self.static_outputs)
-
-        elif self.warmed_up:
-            # record
-            self.static_inputs = [x.clone() for x in args]
-            self.graph = torch.cuda.CUDAGraph()
-            with torch.cuda.graph(self.graph):
-                self.static_outputs = self.gm(*self.static_inputs)
-            # NB: recording doesn't actually run the operations, so
-            # now we immediately replay the graph to serve up the result
-            self.graph.replay()
-            for i in self.mutated_inputs:
-                args[i].copy_(self.static_inputs[i])
-            return tree_map(cloner, self.static_outputs)
-
-        else:
-            # warmup
-            stream = torch.cuda.Stream()
-            stream.wait_stream(torch.cuda.current_stream())
-            with torch.cuda.stream(stream):
-                r = self.gm(*args)
-            torch.cuda.current_stream().wait_stream(stream)
-            self.warmed_up = True
-            return r
-
-
-# Interpreter versions of these passes can be found at
-# https://gist.github.com/ezyang/df2d746cac3b2c7d55c181e37c57ef23
-
-
-def find_input_mutations(g):
-    FK = 'fake_result'
-    inputs = defaultdict(set)
-    input_idx = 0
-    mutated_inputs = set()
-    for n in g.nodes:
-        if n.op == 'placeholder':
-            inputs[StorageWeakRef(n.meta[FK]._typed_storage())].add(input_idx)
-            input_idx += 1
-        elif n.op == 'call_function':
-            if n.target is operator.getitem:
-                continue
-            schema = n.target._schema
-            for i, arg in enumerate(schema.arguments):
-                if i < len(n.args):
-                    argument = n.args[i]
-                else:
-                    if arg.name not in n.kwargs:
-                        continue
-                    argument = n.kwargs[arg.name]
-                mut_arg = False
-                if arg.alias_info:
-                    if arg.alias_info.is_write:
-                        mut_arg = True
-                if mut_arg:
-                    # TODO: not correct for args that contain tensors in a struct
-                    # like list
-                    mutated_inputs |= inputs[StorageWeakRef(argument.meta[FK]._typed_storage())]
-        # TODO: error on unrecognized nodes
-    return mutated_inputs
-
-
-# Mutates input graph
-def apply_cuda_graphs(gm):
-    for n in gm.graph.nodes:
-        if n.op == 'call_module':
-            assert not n.kwargs
-            submod = gm.get_submodule(n.target)
-            gm.delete_submodule(n.target)
-            mutated_inputs = find_input_mutations(submod.graph)
-            gm.add_submodule(n.target, CudaGraphModule(submod, mutated_inputs))
-    # NB: we didn't actually change the graph, no need for recompile
-
-
-def cudagraphs(model, inputs):
-    model = partition_cudagraphs(model, inputs)
-    apply_cuda_graphs(model)
-    return model
-
-
-def raw_aot_autograd_cudagraphs(model, inputs):
-    kwargs: Dict[str, Any] = {
-        # these are taken from memory_efficient_fusion()
-        "fw_compiler": cudagraphs,
-        "bw_compiler": cudagraphs,
-    }
-
-    def _wrapped_bw_compiler(*args, **kwargs):
-        # stop dynamo from trying to compile our generated backwards pass
-        return torch._dynamo.disable(bw_compiler(*args, **kwargs))  # type: ignore[operator]
-
-    bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"]
-    kwargs["bw_compiler"] = _wrapped_bw_compiler
-
-    from functorch.compile import aot_module_simplified  # type: ignore[import]
-
-    return aot_module_simplified(model, **kwargs)
-
-
-class AOTAutogradCudaGraphs(AotAutogradStrategy):
-    def candidate(self):
-        return raw_aot_autograd_cudagraphs(self.gm, self.example_inputs)
-
-
-aot_autograd_cudagraphs = AOTAutogradCudaGraphs.compile_fn

From 37333503883d607511946c769c3b54967b13cf86 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 28 Nov 2022 14:57:42 +0000
Subject: [PATCH 1324/1922] Change aot_module_simplified to take take arguments
 directly (#89669)

This is extracted from voz's #89392

Previously, the implementation did some half-assed caching where it
returned a callable, that when invoked for the first time, actually
performed the compilation.  Delaying the compilation like this...
seems totally unnecessary?  To make matters worse, this has cost
(we have to check if we hit the cache) and unsound (because the
compiled function may not be valid for other arguments.)

So instead, we ask user to provide arguments, and compile everything
immediately.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89669
Approved by: https://github.com/voznesenskym, https://github.com/Chillee
---
 functorch/_src/aot_autograd.py          | 38 ++++++++++++++-----------
 functorch/_src/compilers.py             |  2 ++
 test/functorch/test_aotdispatch.py      | 26 ++++++++---------
 torch/_dynamo/optimizations/training.py |  3 +-
 4 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index 2f3d8bb99f169..32ffb6859147f 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -1854,6 +1854,7 @@ def forward(self, *args, **kwargs):
 
 def aot_module_simplified(
     mod: nn.Module,
+    args,
     fw_compiler: Callable,
     bw_compiler: Optional[Callable] = None,
     partition_fn: Callable = default_partition,
@@ -1934,27 +1935,30 @@ def functional_call(*args, **kwargs):
         aot_id=next(AOT_COUNTER),
     )
 
-    compiled_fn = None
+    full_args = []
+    full_args.extend(params_flat)
+    full_args.extend(args)
 
-    @wraps(functional_call)
-    def compiled_f(*args):
-        nonlocal compiled_fn
-        if compiled_fn is None:
-            compiled_fn = create_aot_dispatcher_function(
-                functional_call,
-                args,
-                aot_config,
-            )
-        return compiled_fn(args)
-
-    def forward(*args):
-        return compiled_f(
-            *params_flat,
-            *args,
-        )
+    compiled_fn = create_aot_dispatcher_function(
+        functional_call,
+        full_args,
+        aot_config,
+    )
 
+    # TODO: There is something deeply wrong here; compiled_fn running with
+    # the boxed calling convention, but aot_module_simplified somehow
+    # historically returned a function that was not the boxed calling
+    # convention.  This should get fixed...
+    def forward(*runtime_args):
+        full_args = []
+        full_args.extend(params_flat)
+        full_args.extend(runtime_args)
+        return compiled_fn(full_args)
+
+    # Just for convenience
     forward.zero_grad = mod.zero_grad
     forward.named_parameters = mod.named_parameters
+
     return forward
 
 
diff --git a/functorch/_src/compilers.py b/functorch/_src/compilers.py
index 55de63e5c344d..da723e5cbcb18 100644
--- a/functorch/_src/compilers.py
+++ b/functorch/_src/compilers.py
@@ -380,6 +380,7 @@ def graph_saver_joint(gm, joint_args):
 
     return aot_module_simplified(
         gm,
+        example_inputs,
         fw_compiler=graph_saver_forward,
         bw_compiler=graph_saver_backward,
         partition_fn=graph_saver_joint,
@@ -387,6 +388,7 @@ def graph_saver_joint(gm, joint_args):
     )
 
 
+# WARNING: This isn't tested anywhere!!
 def graph_dumper_aot(current_name, folder_name, dump_example_input=False):
     """
     Dump the forward, backward, and joint computation graph.
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index aabd03050c4cb..99a776e5678e6 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -995,11 +995,11 @@ def forward(self, x, y):
         x = torch.randn(3, 3, requires_grad=True)
         y = torch.randn(3, 3, requires_grad=True)
 
-        fxy = aot_module_simplified(F(), nop)
+        fxy = aot_module_simplified(F(), (x, y), nop)
         fxy(x, y)
         fxy(x, x)  # is ok!
 
-        fxx = aot_module_simplified(F(), nop)
+        fxx = aot_module_simplified(F(), (x, x), nop)
         fxx(x, x)
         self.assertExpectedRaisesInline(
             AssertionError, lambda: fxx(x, y),
@@ -1024,11 +1024,11 @@ def compare(m1, m2, inps):
             self.assertEqual(r1, r2)
             self.assertEqual(g1, g2)
 
-        fxy = aot_module_simplified(F(), nop)
+        fxy = aot_module_simplified(F(), (x, y), nop)
         compare(F(), fxy, (x, y))
         compare(F(), fxy, (x, z))
 
-        fxz = aot_module_simplified(F(), nop)
+        fxz = aot_module_simplified(F(), (x, z), nop)
         compare(F(), fxz, (x, z))
         self.assertExpectedRaisesInline(
             AssertionError, lambda: fxz(x, y),
@@ -1537,9 +1537,9 @@ def forward(self, x, y):
         ref = mod(*inputs)
         ref[0].sum().backward()
 
-        aot_mod = aot_module_simplified(mod, nop)
-        aot_mod.zero_grad()
-        res = aot_mod(*cloned_inputs)
+        compiled_f = aot_module_simplified(mod, cloned_inputs, nop)
+        mod.zero_grad()
+        res = compiled_f(*cloned_inputs)
         res[0].sum().backward()
 
         assert torch.allclose(ref[0], res[0])
@@ -1577,12 +1577,12 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
                 assert 'test_aotdispatch.py' in node.stack_trace
             return gm.forward  # return a python callable
 
-        aot_mod = aot_module_simplified(mod, fw_compiler=assert_compiler, bw_compiler=assert_compiler)
-
         x = torch.randn(128, 20, requires_grad=True)
         y = torch.randn(128, 30, requires_grad=True)
         inputs = [x, y]
-        res = aot_mod(*inputs)
+
+        compiled_f = aot_module_simplified(mod, inputs, fw_compiler=assert_compiler, bw_compiler=assert_compiler)
+        res = compiled_f(*inputs)
         res[0].sum().backward()
 
     def test_aot_module_simplified_fake_tensor_gm_raises(self):
@@ -1615,14 +1615,14 @@ def forward(self, x):
         mod_fake = torch.fx.GraphModule(tracer.root, graph)
 
         self.assertExpectedRaisesInline(
-            AssertionError, lambda: aot_module_simplified(mod_fake, nop),
+            AssertionError, lambda: aot_module_simplified(mod_fake, (real_x,), nop),
             """Unexpected fake buffer y"""
         )
         # Counterfactual to ensure that the raise is only due to real vs fake
         # Run the same exact thing except with a real buffer.
         graph = tracer.trace(MockModule(real_y))
         mod_real = torch.fx.GraphModule(tracer.root, graph)
-        aot_module_simplified(MockModule(real_y), nop)
+        aot_module_simplified(MockModule(real_y), (real_x,), nop)
 
     def test_aot_module_deepcopy_fake_tensor_gm_raises(self):
         class MockModule(torch.nn.Module):
@@ -1644,7 +1644,7 @@ def forward(self, x):
         mod_fake = torch._dynamo.utils.deepcopy_to_fake_tensor(MockModule(real_y), fake_mode)
 
         self.assertExpectedRaisesInline(
-            AssertionError, lambda: aot_module_simplified(mod_fake, nop),
+            AssertionError, lambda: aot_module_simplified(mod_fake, (real_x,), nop),
             """Unexpected fake param linear.weight"""
         )
 
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 8ecdc1d195f95..76eeedd519ca7 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -77,7 +77,8 @@ def _wrapped_bw_compiler(*args, **kwargs):
         kwargs["bw_compiler"] = _wrapped_bw_compiler
 
         try:
-            cg = aot_module_simplified(gm, **kwargs)
+            # NB: NOT cloned!
+            cg = aot_module_simplified(gm, example_inputs, **kwargs)
             counters["aot_autograd"]["ok"] += 1
             return eval_frame.disable(cg)
         except Exception:

From f51ce0c3f2122762ce1c861741b6570dfb7f29a5 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 28 Nov 2022 14:57:42 +0000
Subject: [PATCH 1325/1922] Make aot_module_simplified accept fake tensors
 (#89670)

Strategy taken from voz's #89392 but my implementation strategy
is a bit different.

If a fake tensor is provided, we use its FakeTensorMode
(and more importantly, its ShapeEnv--this is what is tested
in the new unit test).  Only one tensor needs to be fake;
if nothing is fake we just make a fresh mode as before.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89670
Approved by: https://github.com/voznesenskym
---
 functorch/_src/aot_autograd.py     | 20 ++++++++++++---
 test/functorch/test_aotdispatch.py | 40 +++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index 32ffb6859147f..992648a432c84 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -16,7 +16,7 @@
 import torch.utils.dlpack
 from torch import Tensor
 from torch._dynamo.utils import dynamo_timed
-from torch._subclasses import FakeTensorMode, CrossRefFakeMode
+from torch._subclasses import FakeTensorMode, CrossRefFakeMode, FakeTensor
 from torch.fx import immutable_collections, Interpreter
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 from torch.multiprocessing.reductions import StorageWeakRef
@@ -1604,18 +1604,30 @@ def create_aot_dispatcher_function(
     if config.use_dynamic_shapes:
         assert config.use_fake_tensor, "Dynamic shapes only works with fake tensor"
 
-    shape_env = ShapeEnv() if config.use_dynamic_shapes else None
-    fake_mode = FakeTensorMode(shape_env=shape_env) if config.use_fake_tensor else nullcontext()
+    # Check flat_args to see if they're already fake.  If so, use that fake
+    # mode instead.
+
+    for x in flat_args:
+        if isinstance(x, FakeTensor):
+            fake_mode = x.fake_mode
+            break
+    else:
+        shape_env = ShapeEnv() if config.use_dynamic_shapes else None
+        fake_mode = FakeTensorMode(shape_env=shape_env) if config.use_fake_tensor else nullcontext()
+
     cross_ref = CrossRefFakeMode() if config.debug_fake_cross_ref else nullcontext()
     python_dispatcher_mode = enable_python_dispatcher() if config.use_dynamic_shapes else nullcontext()
 
     with torch.autograd.set_multithreading_enabled(False), preserve_rng_state(), cross_ref, fake_mode, python_dispatcher_mode:
 
         def process_inputs(flat_args):
-            if config.use_fake_tensor:
+            if config.use_fake_tensor or isinstance(fake_mode, FakeTensorMode):
                 def convert(idx, x):
                     if not isinstance(x, torch.Tensor):
                         return x
+                    if isinstance(x, FakeTensor):
+                        assert x.fake_mode is fake_mode
+                        return x
                     if idx < aot_config.num_params_buffers and config.static_weight_shapes:
                         return fake_mode.from_tensor(x, static_shapes=True)
                     return fake_mode.from_tensor(x, static_shapes=False)
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 99a776e5678e6..a815316cae6c2 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -38,8 +38,9 @@
     skip,
     skipOps,
 )
-from torch._subclasses.fake_tensor import DynamicOutputShapeException
+from torch._subclasses.fake_tensor import DynamicOutputShapeException, FakeTensorMode
 from torch.fx.experimental.proxy_tensor import is_sym_node
+from torch.fx.experimental.symbolic_shapes import ShapeEnv
 
 USE_TORCHVISION = False
 try:
@@ -1546,6 +1547,43 @@ def forward(self, x, y):
         assert torch.allclose(inputs[0].grad, cloned_inputs[0].grad)
         assert torch.allclose(inputs[1].grad, cloned_inputs[1].grad)
 
+    def test_aot_module_simplified_dynamic(self):
+        class MockModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(20, 30)
+
+            def forward(self, x, y):
+                return (self.linear(x) + y, )
+
+        mod = MockModule()
+
+        shape_env = ShapeEnv()
+        fake_mode = FakeTensorMode(shape_env=shape_env)
+
+        x = torch.randn(128, 20, requires_grad=True)
+        y = torch.randn(128, 30, requires_grad=True)
+
+        inputs = [x, y]
+        fake_inputs = [fake_mode.from_tensor(x) for x in inputs]
+        compiled_f = aot_module_simplified(mod, fake_inputs, nop)
+
+        ref = mod(*inputs)
+        ref[0].sum().backward()
+
+        cloned_inputs = [x.detach().clone().requires_grad_(True) for x in inputs]
+        res = compiled_f(*cloned_inputs)
+        res[0].sum().backward()
+
+        self.assertExpectedInline(shape_env.format_guards(), """\
+ - Eq(s1, 20)
+ - Eq(s2, 30)""")
+
+        assert torch.allclose(ref[0], res[0])
+        assert torch.allclose(inputs[0].grad, cloned_inputs[0].grad)
+        assert torch.allclose(inputs[1].grad, cloned_inputs[1].grad)
+
+
     def test_aot_module_simplified_preserves_stack_trace(self):
         class MockModule(torch.nn.Module):
             def __init__(self):

From ec3603b6c1bb7474a83d34196d088df74b6e9dd6 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 28 Nov 2022 14:57:42 +0000
Subject: [PATCH 1326/1922] Use isinstance test rather than exact type test for
 wrap to fake (#89671)

I'm not sure why we did an exact test originally.  Let's find out!

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89671
Approved by: https://github.com/voznesenskym
---
 torch/_dynamo/optimizations/analysis.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index 5de5743bd5e22..d83e57fdca6e2 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -119,7 +119,8 @@ def has_mutation(gm, example_inputs, inputs_only=False):
     # TODO - moco gives bad accuracy with Aliasing. gm is getting mutated in a bad way.
 
     def _wrap_to_fake_tensor(t, *, f_mode):
-        if type(t) in (torch.Tensor, torch.nn.Parameter):
+        if isinstance(t, torch.Tensor):
+            # TODO: it probably doesn't matter if we're dynamic shapes or not
             static_shapes_ = config.dynamic_shapes is False
             return fake_mode.from_tensor(
                 t, static_shapes=config.dynamic_shapes is not False

From 90025b5a31b2dfd34d9a0eb3abd778207e9bece6 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 28 Nov 2022 13:30:26 -0500
Subject: [PATCH 1327/1922] Add AOTAutograd and partitioner to ciflow/inductor
 (#89772)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89772
Approved by: https://github.com/albanD
---
 .github/labeler.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 13798ad707538..6d3902e31da67 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -16,6 +16,8 @@
 - torch/_subclasses/fake_utils.py
 - torch/_subclasses/meta_utils.py
 - test/distributed/test_dynamo_distributed.py
+- functorch/_src/partitioners.py
+- functorch/_src/aot_autograd.py
 
 "module: cpu":
 - aten/src/ATen/cpu/**

From 72703170bee2e35d0422465ff752167cadcf8162 Mon Sep 17 00:00:00 2001
From: Andrey Talman <atalman@fb.com>
Date: Mon, 28 Nov 2022 18:49:32 +0000
Subject: [PATCH 1328/1922] Update minor release acceptance criteria (#89767)

Update minor release acceptance criteria

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89767
Approved by: https://github.com/albanD, https://github.com/weiwangmeta
---
 RELEASE.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 58d938fafddf0..22d279bceec3a 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -241,13 +241,11 @@ For patch releases issue tracker needs to be created. For patch release, we requ
 * https://github.com/pytorch/pytorch/issues/51886
 
 Only following issues are accepted:
-1. Fixes to regressions against previous major version
+1. Fixes to regressions against previous major version (e.g. regressions introduced in 1.13.0 from 1.12.0 are pickable for 1.13.1)
 2. Critical fixes for: silent correctness, backwards compatibility, crashes, deadlocks, (large) memory leaks
 3. Fixes to new features being introduced in this release
-4. Compilation fixes or ifdefs required for different versions of the compilers or third-party libraries
-5. Test/CI fixes or improvements
-6. Documentation improvements
-7. Release branch specific changes (e.g. change version identifiers, remove/hide features that aren’t ready for release)
+4. Documentation improvements
+5. Release branch specific changes (e.g. blocking ci fixes, change version identifiers)
 
 ### Building a release schedule / cherry picking
 

From eeb540184c8b394ca44a57d59bac240988feeb07 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 28 Nov 2022 19:13:33 +0000
Subject: [PATCH 1329/1922] Revert "Support set_rng_state with fake tensor
 (#89642)"

This reverts commit 2f8769d680f068cb97a829d7582fac1cdea21753.

Reverted https://github.com/pytorch/pytorch/pull/89642 on behalf of https://github.com/ezyang due to elias is right this is probably wrong
---
 test/dynamo/test_repros.py       | 11 ++++++++---
 torch/_dynamo/variables/torch.py |  3 +++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 61a26cd60f2b4..7bd258cbb3c8d 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1047,9 +1047,14 @@ def fn():
 
         before, after = opt_fn()
         self.assertTrue(same(before, after))
-        self.assertEqual(cnt.frame_count, 1)
-        self.assertEqual(cnt.op_count, 4)  # rand, rand
-        graph, _ = torch._dynamo.export(fn)
+        self.assertEqual(cnt.frame_count, 2)
+        self.assertEqual(cnt.op_count, 3)  # rand, rand
+        try:
+            graph, _ = torch._dynamo.export(fn)
+            # See https://github.com/pytorch/pytorch/pull/87490
+            self.fail("unexpected export success")
+        except torch._dynamo.exc.Unsupported:
+            pass
 
     def test_seq_append_list(self):
         x = torch.randn(4, 10)
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 9f6a1977e0a6c..d737e460304ff 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -337,6 +337,9 @@ def get_state_from_generator():
             assert len(args) == 1
             assert isinstance(args[0], TensorVariable)
 
+            unimplemented(
+                "TODO: make torch.random.set_rng_state work with FakeTensor/aot_autograd"
+            )
             # In fake tensor case, this state doesn't matter, but
             # it needs to be valid to not segfault. Pull a real tensor out.
             # The value won't matter since we are running with fake tensors anyway, so rng doesn't matter.

From 776c0285c00acbe3662ebd21c2f59b60dee6069c Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 28 Nov 2022 14:12:02 -0500
Subject: [PATCH 1330/1922] Reland "Add single process version of dynamo
 distributed hf_Bert tests (#89721)" (#89756)

This reverts commit 0d9a615af4007014586c946cb8ffcc911d4100f6.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89756
Approved by: https://github.com/anjali411, https://github.com/malfet
---
 test/distributed/test_dynamo_distributed.py | 93 ++++++++++++++++-----
 1 file changed, 74 insertions(+), 19 deletions(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 2349fe81201d6..d5b9f070b403e 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -98,10 +98,11 @@ def forward(self, x):
 
 def get_hf_bert(rank):
     # Note: use @import_transformers_or_skip on your test case if you use this
+    # in a multiprocessing test
     try:
         from transformers import BertConfig, AutoModelForMaskedLM
     except ImportError:
-        unittest.skip("Unable to import transformers")
+        raise unittest.SkipTest("Unable to import transformers")
 
     batch_size, max_length, config, device = 4, 512, BertConfig(), f"cuda:{rank}"
     model = AutoModelForMaskedLM.from_config(config).to(device)
@@ -135,6 +136,68 @@ def _per_rank_init(rank, world_size):
     dist.destroy_process_group()
 
 
+# This simulates DDP, but it doesn't actually do any process communication;
+# it just has enough properties so that the dynamo distributed optimization is
+# able to optimize.  Feel free to simulate more properties as necessary.  The
+# other important thing is patching _active_ddp_module, which is what actually
+# triggers DDP optimization
+class FakeDDP(nn.Module):
+    def __init__(self, module):
+        super().__init__()
+        self.module = module
+        bucket_cap_mb = 25
+        self.bucket_bytes_cap = int(bucket_cap_mb * 1024 * 1024)
+
+    @contextmanager
+    def _inside_ddp_forward(self):
+        DDP._active_ddp_module = self
+        try:
+            yield
+        except Exception:
+            raise
+        finally:
+            DDP._active_ddp_module = None
+
+    def forward(self, *inputs, **kwargs):
+        with self._inside_ddp_forward():
+            return self.module.forward(*inputs, **kwargs)
+
+def run_hf_bert_ddp(self, model, inputs, backend):
+    reset_rng_state()
+    correct_outputs = model(**inputs)
+    correct_loss = correct_outputs.loss
+    correct_loss.backward()
+
+    reset_rng_state()
+    opt_model = torch._dynamo.optimize(backend)(model)
+    opt_outputs = opt_model(**inputs)
+    opt_loss = opt_outputs.loss
+    opt_loss.backward()
+
+    inputs_flat = [inputs[k] for k in inputs]
+    correct_results = collect_results(model, correct_outputs.logits, correct_loss, inputs_flat)
+    opt_results = collect_results(opt_model, opt_outputs.logits, opt_loss, inputs_flat)
+    self.assertTrue(same(correct_results, opt_results))
+
+class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase):
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+    @patch.object(config, "optimize_ddp", True)
+    @patch.object(torch._inductor.config, "fallback_random", True)
+    def test_hf_bert_ddp_inductor(self):
+        model, inputs = get_hf_bert(0)
+        model = FakeDDP(model)
+        run_hf_bert_ddp(self, model, inputs, "inductor")
+
+    @patch.object(config, "optimize_ddp", True)
+    def test_hf_bert_ddp_aot_eager(self):
+        model, inputs = get_hf_bert(0)
+        model = FakeDDP(model)
+        run_hf_bert_ddp(self, model, inputs, "aot_eager")
+
+
+# Are these tests failing?  Check and see if TestFakeDistributedSingleProc has a
+# single process version; if it's just a problem in the Dynamo distributed
+# optimizer, you should be able to repro it single process!
 @requires_nccl()
 class TestDistributedMultiProc(MultiProcessTestCase):
     def setUp(self):
@@ -181,31 +244,23 @@ def test_ddp_baseline_aot_eager_multiprocess(self):
     @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", True)
     @patch.object(torch._inductor.config, "fallback_random", True)
-    def test_hf_bert_ddp(self):
+    def test_hf_bert_ddp_inductor(self):
 
         with _per_rank_init(self.rank, self.world_size):
             model, inputs = get_hf_bert(self.rank)
             model = DDP(model)
+            run_hf_bert_ddp(self, model, inputs, "inductor")
 
-            reset_rng_state()
-            correct_outputs = model(**inputs)
-            correct_loss = correct_outputs.loss
-            correct_loss.backward()
-
-            reset_rng_state()
-            opt_model = torch._dynamo.optimize("inductor")(model)
-            opt_outputs = opt_model(**inputs)
-            opt_loss = opt_outputs.loss
-            opt_loss.backward()
-
-            inputs_flat = [inputs[k] for k in inputs]
-            correct_results = collect_results(model, correct_outputs.logits, correct_loss, inputs_flat)
-            opt_results = collect_results(opt_model, opt_outputs.logits, opt_loss, inputs_flat)
-            self.assertTrue(same(correct_results, opt_results))
-
+    @skip_if_lt_x_gpu(2)
+    @import_transformers_or_skip()
+    @patch.object(config, "optimize_ddp", True)
+    def test_hf_bert_ddp_aot_eager(self):
+        with _per_rank_init(self.rank, self.world_size):
+            model, inputs = get_hf_bert(self.rank)
+            model = DDP(model)
+            run_hf_bert_ddp(self, model, inputs, "aot_eager")
 
     @skip_if_lt_x_gpu(1)
-    # TODO(whc)  delete aot_eager test, if inductor test lands stably
     def test_fsdp_aot_eager(self):
         with _per_rank_init(self.rank, self.world_size):
             # Test with basic FSDP wrapping (outer wrap around whole model)

From 6813d57678c1c46d7cbda6a0043cd1f7316e793d Mon Sep 17 00:00:00 2001
From: Sahan Paliskara <sahanp@meta.com>
Date: Mon, 28 Nov 2022 19:36:45 +0000
Subject: [PATCH 1331/1922] [multipy] Address GetPythonFramesFunction() and
 multipy incompatibility. (#267) (#89315)

Summary:
https://github.com/pytorch/pytorch/pull/89122 introduces internal compatibility issues with torchdeploy. However, GetPythonFramesFunction() never worked with torchdeploy, so this PR simply reverts to the original behavior of skipping the function if torchdeploy is used as a forward fix.

Test Plan:
Running failed tests in T128123281
```
buck2 test @//mode/opt //multipy/runtime:test_deploy -- --exact 'multipy/runtime:test_deploy - TorchpyTest.TaggingRace' --run-disabled

buck2 test mode/dev //multipy/runtime/testdev:test_deploy_from_python -- --exact 'multipy/runtime/testdev:test_deploy_from_python - multipy.runtime.testdev.test_deploy_from_python.TestDeployFromPython: test_deploy_from_python'
```

Differential Revision: D41414263

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89315
Approved by: https://github.com/kurman
---
 torch/csrc/lazy/python/init.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/torch/csrc/lazy/python/init.cpp b/torch/csrc/lazy/python/init.cpp
index 0b773788eff95..aa876a33622c5 100644
--- a/torch/csrc/lazy/python/init.cpp
+++ b/torch/csrc/lazy/python/init.cpp
@@ -306,9 +306,18 @@ void initLazyBindings(PyObject* module) {
         return result;
       });
 
+  // GetPythonFramesFunction() has not ever worked with torchdeploy/multipy
+  // possibly becuase GetPythonFrames resolves to external cpython rather
+  // than embedded cpython. So far this problem has only been observed
+  // internally, so we will just block it off there.
+
+#if !(defined(USE_DEPLOY))
+
   // When libtorch_python is loaded, we register the python frame getter
   // otherwise, debug util simply omits python frames
   GetPythonFramesFunction() = GetPythonFrames;
+
+#endif // USE_DEPLOY
 }
 
 } // namespace lazy

From 0fc7b239f6c28386294331290da03cae32d22296 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 28 Nov 2022 13:24:50 -0500
Subject: [PATCH 1332/1922] Revert "Don't allow recomputing a node that *must*
 be materialized in the backwards pass (#89171)" (#89770)

This reverts commit e36d68af8885f27d8c0b4727ab078bf53e55e7a0.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89770
Approved by: https://github.com/anijain2305
---
 functorch/_src/config.py           |  6 ---
 functorch/_src/partitioners.py     | 39 +++-------------
 test/functorch/test_aotdispatch.py |  8 ++++
 test/inductor/test_perf.py         | 72 +-----------------------------
 4 files changed, 15 insertions(+), 110 deletions(-)

diff --git a/functorch/_src/config.py b/functorch/_src/config.py
index 87f60fe061e1e..53fa5b28a86bb 100644
--- a/functorch/_src/config.py
+++ b/functorch/_src/config.py
@@ -30,9 +30,3 @@
 use_dynamic_shapes = os.getenv('AOT_DYNAMIC_SHAPES', False)
 
 static_weight_shapes = True
-
-# Applies CSE to the graph before partitioning
-cse = True
-
-# Restricts the amount of computation AOTAutograd can do.
-max_dist_from_bw = 5
diff --git a/functorch/_src/partitioners.py b/functorch/_src/partitioners.py
index af8db94edf4a5..712c9a063eaf6 100644
--- a/functorch/_src/partitioners.py
+++ b/functorch/_src/partitioners.py
@@ -286,9 +286,8 @@ def min_cut_rematerialization_partition(
     fx_g = joint_module.graph
 
     #  add the CSE pass
-    if config.cse:
-        cse_graph = fx_graph_cse(fx_g)
-        joint_module.graph = cse_graph
+    cse_graph = fx_graph_cse(fx_g)
+    joint_module.graph = cse_graph
     full_bw_graph = joint_module.graph
 
     name_to_node = {}
@@ -341,14 +340,11 @@ def is_tensor_node(x):
     prims = torch.ops.prims
 
     # compiler == "nvfuser" is the default set of recomputable ops
-    default_recomputable_ops = [aten.add, aten.sub, aten.div, aten.atan2, aten.mul, aten.max, aten.min, aten.pow, aten.remainder, aten.fmod, aten.__and__, aten.__or__, aten.__xor__, aten.__lshift__, aten.__rshift__, aten.eq, aten.ne, aten.ge, aten.gt, aten.le, aten.lt, aten.abs, aten.bitwise_not, aten.ceil, aten.floor, aten.frac, aten.neg, aten.relu, aten.round, aten.silu, aten.trunc, aten.log, aten.log10, aten.log1p, aten.log2, aten.lgamma, aten.exp, aten.expm1, aten.erf, aten.erfc, aten.cos, aten.acos, aten.cosh, aten.sin, aten.asin, aten.sinh, aten.tan, aten.atan, aten.tanh, aten.atanh, aten.sqrt, aten.rsqrt, aten.reciprocal, aten.sigmoid, aten.softplus, aten.threshold, aten.threshold_backward, aten.clamp, aten.where, aten.lerp, aten.addcmul, aten.gelu, aten.gelu_backward, aten.sum, aten.mean, aten._grad_sum_to_size, aten.sum_to_size, aten.amax, aten.to, aten.type_as, operator.getitem, aten.squeeze, aten.unsqueeze, aten.rsub, aten._to_copy]  # noqa: E501
-    view_ops = [aten.squeeze, aten.unsqueeze, aten.alias]
+    default_recomputable_ops = [aten.add, aten.sub, aten.div, aten.atan2, aten.mul, aten.max, aten.min, aten.pow, aten.remainder, aten.fmod, aten.__and__, aten.__or__, aten.__xor__, aten.__lshift__, aten.__rshift__, aten.eq, aten.ne, aten.ge, aten.gt, aten.le, aten.lt, aten.abs, aten.bitwise_not, aten.ceil, aten.floor, aten.frac, aten.neg, aten.relu, aten.round, aten.silu, aten.trunc, aten.log, aten.log10, aten.log1p, aten.log2, aten.lgamma, aten.exp, aten.expm1, aten.erf, aten.erfc, aten.cos, aten.acos, aten.cosh, aten.sin, aten.asin, aten.sinh, aten.tan, aten.atan, aten.tanh, aten.atanh, aten.sqrt, aten.rsqrt, aten.reciprocal, aten.sigmoid, aten.softplus, aten.threshold, aten.threshold_backward, aten.clamp, aten.where, aten.lerp, aten.addcmul, aten.gelu, aten.gelu_backward, aten.alias, aten.sum, aten.mean, aten._grad_sum_to_size, aten.sum_to_size, aten.amax, aten.to, aten.type_as, operator.getitem, aten.squeeze, aten.unsqueeze, aten.rsub, aten._to_copy]  # noqa: E501
     if compiler == "inductor":
-        default_recomputable_ops += [prims.div, prims.convert_element_type, aten.sign, aten.clone, aten._to_copy, aten.full_like, prims.var, prims.sum, aten.var, aten.std, prims.broadcast_in_dim, aten.select, aten.permute, aten._unsafe_view, aten.reshape, aten.broadcast_tensors, aten.scalar_tensor, aten.ones, aten.new_zeros, aten.lift_fresh_copy, aten.minimum, aten.arange, aten.bitwise_and, aten.triu, aten.var_mean, aten.isinf, aten.any, aten.isnan, aten.full, aten.as_strided, aten.zeros, aten.argmax, aten.maximum, aten.bitwise_or, aten.logical_and, aten.logical_or]  # noqa: E501
-        view_ops += [aten.view, aten.slice, aten.permute, aten.t, prims.broadcast_in_dim, aten.expand, aten.as_strided]
+        default_recomputable_ops += [prims.div, prims.convert_element_type, aten.sign, aten.clone, aten._to_copy, aten.full_like, prims.var, prims.sum, aten.var, aten.std, prims.broadcast_in_dim, aten.select, aten.permute, aten._unsafe_view, aten.view, aten.expand, aten.slice, aten.reshape, aten.broadcast_tensors, aten.scalar_tensor, aten.ones, aten.new_zeros, aten.lift_fresh_copy, aten.minimum, aten.arange, aten.bitwise_and, aten.triu, aten.var_mean, aten.isinf, aten.any, aten.isnan, aten.full, aten.as_strided, aten.zeros, aten.argmax, aten.maximum, aten.bitwise_or, aten.logical_and, aten.logical_or]  # noqa: E501
         # Natalia said that we should allow recomputing indexing :)
         default_recomputable_ops += [aten.index]
-    default_recomputable_ops += view_ops
 
     recomputable_ops = set(recomputable_ops) if recomputable_ops is not None else set(default_recomputable_ops)
 
@@ -370,18 +366,6 @@ def is_tensor_node(x):
 
     AGGRESSIVE_RECOMPUTATION = False
 
-    def is_materialized_backwards(node):
-        cur_nodes = set([node])
-        while len(cur_nodes) > 0:
-            cur = cur_nodes.pop()
-            for user in cur.users:
-                if user not in required_fw_nodes and not is_fusible(cur, user):
-                    return True
-                if user not in required_fw_nodes and get_aten_target(user) in view_ops:
-                    cur_nodes.add(user)
-
-        return False
-
     def ban_recomputation(node):
         if AGGRESSIVE_RECOMPUTATION:
             return (node.op == 'call_function' and get_aten_target(node) in unrecomputable_ops)
@@ -394,20 +378,7 @@ def ban_recomputation(node):
                 return False
             if node.target in [aten.lift_fresh_copy.default, aten.lift_fresh.default]:
                 return False
-
-            # If a node *must* be materialized in the backwards pass, then we
-            # should never recompute it. This is a pretty subtle point.  In
-            # general, the assumption we make is that recomputing a node in the
-            # backwards pass is "free". However, if a node must be materialized
-            # in the backwards pass, then recomputing it is never free.
-            if is_materialized_backwards(node):
-                return True
-
-            # Arbitrary hack that sometimes seems to help things. The above
-            # modification appears to have made this heuristic a lot less critical
-            # for performance.
-            # TODO: Investigate why this hack helps.
-            if compiler == "inductor" and node.dist_from_bw > config.max_dist_from_bw:
+            if compiler == "inductor" and node.dist_from_bw > 3:
                 return True
             # If the output of an op is 4x smaller (arbitrary choice),
             # then we don't allow recomputation.
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index a815316cae6c2..c3ac0f19a3f58 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1421,6 +1421,14 @@ def f(a, b, c, d):
         self.assertEqual(get_num_ins_outs(fw_graph), (4, 2))
         self.assertEqual(get_num_ins_outs(bw_graph), (2, 4))
 
+        def f(x):
+            return torch.mm(x, torch.ones(x.shape)).tanh().tanh()
+        fw_graph, bw_graph = get_fw_bw_graph(f, [torch.randn(5, 5, requires_grad=True)])
+        self.assertEqual(get_num_ins_outs(fw_graph), (1, 3))
+
+        ins, outs = get_ins_outs(fw_graph)
+        self.assertEqual(outs[1].target, torch.ops.aten.mm.default)
+
     @unittest.skipIf(not USE_NETWORKX, "networkx not available")
     def test_min_cut_partitioner_recomputable_ops(self):
         def f(x):
diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py
index 2b53c163421c1..d473ff4b74495 100644
--- a/test/inductor/test_perf.py
+++ b/test/inductor/test_perf.py
@@ -2,8 +2,6 @@
 import contextlib
 from unittest.mock import patch
 
-import functorch
-
 import torch._dynamo
 import torch._inductor.config as config
 from torch._dynamo.optimizations.backends import register_backend
@@ -38,27 +36,11 @@ def count_numel(f, *args):
     return str(metrics.num_bytes_accessed // 4)
 
 
-def count_numel_train(f, *args):
-    """
-    Assumes all inputs are fp32
-    """
-    metrics.reset()
-
-    f = torch._dynamo.optimize("count_bytes_inductor")(f)
-    out = f(*args)
-    res = 0
-    for o in out:
-        res += o.mean()
-    res.backward()
-    print(metrics.nodes_num_elem)
-    return str(metrics.num_bytes_accessed // 4)
-
-
 DEVICE = "cuda"
 
 
-def T(*size, dtype=torch.float32, device=DEVICE, grad=False):
-    return torch.randn(size, dtype=dtype, device=device, requires_grad=grad)
+def T(*size, dtype=torch.float32, device=DEVICE):
+    return torch.randn(size, dtype=dtype, device=device)
 
 
 def TI(*size, mx=10, dtype=torch.int32, device=DEVICE):
@@ -404,56 +386,6 @@ def f(a, b, c):
         self.assertExpectedInline(count_numel(f, *inp), """4000""")
 
 
-class MinCutPartitioningTests(TestCase):
-    def test_partitioning_full_remat(self):
-        def f(x):
-            return x.cos().cos().cos()
-
-        inp = (T(10, grad=True),)
-        self.assertExpectedInline(count_numel_train(f, *inp), """50""")
-
-    def test_partitioning_partial_remat(self):
-        def f(a, b, c, d):
-            x = a + b + c + d
-            return x.cos().cos()
-
-        inp = (T(10, grad=True), T(10, grad=True), T(10, grad=True), T(10, grad=True))
-        self.assertExpectedInline(count_numel_train(f, *inp), """90""")
-
-    def test_partitioning_dtype(self):
-        def f(x):
-            return (x < 0) * x
-
-        inp = (T(100, grad=True),)
-        self.assertExpectedInline(count_numel_train(f, *inp), """450""")
-
-    @patch.object(functorch.compile.config, "max_dist_from_bw", 1000)
-    def test_partitioning_unremat_bw(self):
-        def f(x):
-            return torch.mm(x, x.new_ones(x.shape)).tanh().tanh()
-
-        inp = (T(10, 10, grad=True),)
-        self.assertExpectedInline(count_numel_train(f, *inp), """1300""")
-
-    def test_partitioning_unremat_bw2(self):
-        def f(a):
-            a = torch.mm(a, a)
-            a = a + 1
-            b = a + 2
-            c = torch.mm(a, b)
-            return c
-
-        inp = (T(10, 10, grad=True),)
-        self.assertExpectedInline(count_numel_train(f, *inp), """2600""")
-
-    def test_partitioning_keops(self):
-        def f(a, b):
-            return (a * b).cos().sum(dim=1)
-
-        inp = (T(20, 1, grad=True), T(1, 20, grad=True))
-        self.assertExpectedInline(count_numel_train(f, *inp), """220""")
-
-
 # Test cases where we don't do the right thing yet.
 class WouldBeNiceIfItWorked:
     def test_horizontal(self):

From ce00e70e9eb1c1fec119d48a9f010a23e852f494 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Mon, 28 Nov 2022 20:08:37 +0000
Subject: [PATCH 1333/1922] [benchmarks] Disabling gradscaler (#89741)

Disabling Gradscaler because
 1) Benchmark setup runs 2 iterations of fwd-bwd. So, not useful.
 2) Current setup shares grad_scaler for eager and dynamo model,
 which is bad as Gradscaler has state and can adjust the scaling
 factor between eager and dynamo run, making accuracy check
 harder.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89741
Approved by: https://github.com/ngimel
---
 benchmarks/dynamo/common.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 95cd0cd4ca17e..27fbbbbcf8a55 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -877,7 +877,14 @@ def setup_amp(self):
             # Since we are not running a long iteration, default value of
             # init_scale 65536 is going to turn all gradients to inf. Therefore,
             # we just use a init_scale of 2.0 for benchmarking purpose.
-            self.grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
+
+            # Disabling Gradscaler because
+            #  1) Benchmark setup runs 2 iterations of fwd-bwd. So, not useful.
+            #  2) Current setup shares grad_scaler for eager and dynamo model,
+            #  which is bad as Gradscaler has state and can adjust the scaling
+            #  factor between eager and dynamo run, making accuracy check
+            #  harder.
+            # self.grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
             self.autocast = torch.cuda.amp.autocast
 
     def init_optimizer(self, device, params):

From ff25406566c335871f4d98181516d09076a5899a Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Mon, 28 Nov 2022 18:14:47 +0000
Subject: [PATCH 1334/1922] [inductor] skip dm_nfnet_f0 in TIMM model test
 (#89768)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89768
Approved by: https://github.com/clee2000
---
 benchmarks/dynamo/common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 27fbbbbcf8a55..cabbe3c411617 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -120,6 +120,7 @@
     "XGLMForCausalLM",  # OOM
     # TIMM
     "convit_base",  # fp64_OOM
+    "dm_nfnet_f0",  # accuracy
     "eca_halonext26ts",  # accuracy
     "fbnetv3_b",  # accuracy
     "levit_128",  # fp64_OOM

From 73502b199c99daea383d5d68e07fdc6f2efa369c Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Mon, 28 Nov 2022 20:32:05 +0000
Subject: [PATCH 1335/1922] Move Dynamo docs back to core (#89769)

With contributions from @svekars and @malfet

Waiting for doc build job to complete
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89769
Approved by: https://github.com/soumith
---
 .../source/_static/img/dynamo/TorchDynamo.png | Bin 0 -> 349490 bytes
 docs/source/_static/img/dynamo/td_stack.png   | Bin 0 -> 308321 bytes
 .../img/dynamo/torchinductor_backend.png      | Bin 0 -> 122529 bytes
 docs/source/dynamo/custom-backends.rst        | 154 ++++
 docs/source/dynamo/deep-dive.rst              | 145 ++++
 docs/source/dynamo/faq.rst                    | 376 ++++++++++
 docs/source/dynamo/get-started.rst            | 181 +++++
 docs/source/dynamo/guards-overview.rst        | 513 ++++++++++++++
 docs/source/dynamo/index.rst                  |  44 ++
 docs/source/dynamo/installation.rst           |  83 +++
 docs/source/dynamo/troubleshooting.rst        | 665 ++++++++++++++++++
 docs/source/index.rst                         |   7 +
 12 files changed, 2168 insertions(+)
 create mode 100644 docs/source/_static/img/dynamo/TorchDynamo.png
 create mode 100644 docs/source/_static/img/dynamo/td_stack.png
 create mode 100644 docs/source/_static/img/dynamo/torchinductor_backend.png
 create mode 100644 docs/source/dynamo/custom-backends.rst
 create mode 100644 docs/source/dynamo/deep-dive.rst
 create mode 100644 docs/source/dynamo/faq.rst
 create mode 100644 docs/source/dynamo/get-started.rst
 create mode 100644 docs/source/dynamo/guards-overview.rst
 create mode 100644 docs/source/dynamo/index.rst
 create mode 100644 docs/source/dynamo/installation.rst
 create mode 100644 docs/source/dynamo/troubleshooting.rst

diff --git a/docs/source/_static/img/dynamo/TorchDynamo.png b/docs/source/_static/img/dynamo/TorchDynamo.png
new file mode 100644
index 0000000000000000000000000000000000000000..351689d80dc925ba5d1fbdc53c5f0ade693855e9
GIT binary patch
literal 349490
zcmZU*2{@JQ`aX<gh)_u?nJSfJ4jCg!rBX>KGb?2*WF}N9B(Dkyl}sV?RGFuQB=bC#
zA#-Jl_|ALp{XdTV+uwUQ-req9)_T_c-1l{z)AiifJbi5ax^3&IsHoN-S64kpMMW<`
zMMe9WfewFjx}C`b|3ho9a$Ji6|F|=l-NXOi>Tp!Y;k+%y!O8fBIn@;#TWj+J_NF(?
z&28*0Y#pZ76e;3G2gr+#+%PwGxN2*|t98}doJz~loL54OSK09juehkV7_X?9tfZLi
zK?z>XQ@on0T5-SMcvDgFQXN-S)^dJ6-tD4u-f2#0+G%4>+OA~d4b-|?n(SRk0}me7
z$TYWH>vVC5%RV1DDwUF=Z7H2S;SiqF(s0%CSc-O1b4zwYLf*N<s%-7^k-OM7GPLLR
z@_6Pe9igFlb=%dTzHD&kSY4gI;=H}$FxS&+GjVS=6$ZZB|NSp3=J%=6xBmNY{`G?7
zmqr5r_k|36<pO6#L;m-bCLw#nj{nblGNd18uif>3J_fI7+qI6e@&EVn8`nxO{_h7#
z-S-Lq-`{47P%Y8;-`~D>>Xngz%g8-d{>z2#TQ}aP6$nX8mG=9eYjn6}{>p;I<=);W
zCz5s2X^tK}%D~KA?lx86w_&H+t+M{#vvC{#?>j%bIsV6W4TH<0YuB#XO?DlUl#=3?
zliO}!V9@r?asTDZm;bmn{?B(-m(!v=mDkGBJK!L@gX(Z<>(2fAnOqABgK{7I&*kVG
zDbe_;dEL&A%D=*7rT6Ps6_5X20xPZ5r{T?`4l?#b%s<A)#I7~%4d6Po@#amf>YP0%
zPMokCs@?i+c(}|$vb3>rlP^7MCC|z<Jp(OG<}I$#L;06aW2v~$ol9gtb?TJejT<zf
zHEW~m<5%+kdr?hHT@QRsXug*2@{7Wy<jHm5z=7$RnUbAB@;(**TTDxDGqN|0-+lg^
zD_Q?NM}B_3#kFfb)n+~+A)C6oyVJc^J%tQDaGp_^WuJ8!r;cf`nDhEnyX#y3uF#s9
zpEV*eZ{Do4lG=Xr=1n_C$D_{9G6H9!0$v{RD{XDv;;tCY5%k}SqRwLy?z-=vOs6S>
zWWyrv?Cfj}yL<J+JB5UpO)CRRevdY8*s_J*%gZaRU8_51r(Oz2z2cG(3y&gmpqlB7
zqf|uTqepDzRaK_l?;IJ{t$X#ief^<Bhr;At#Gge(P*VwUFdaO2aC&||{?=MP*5Hf`
z@c<qr?!dsn+F=<k7H4^G8XB6u$w`(gSFTJu%P;)-LoRY<Wd*y$Oc7sp^qum5-|Tta
z)Yy}gow=+kFTV)Yq~zysmA2_ReD>@HVf}ZvZ{4CQFE2M5D8J9C;410k>${Jw_465}
zzqiZFRV}jhk_@G{1@7Iyzbr^;<=&q^u2wxCwwB#pAM^E|ev{&1EAj9bT7pX!mJECz
zM`IdvE?-$sx90V^efarxh{bOWE^w%<x%tWB^zYxQc6OrtJ9lo_y!lRlnO|9Z`=W9Y
z`+x7^Mg6gCqqxjWd%vxMDJdy?g@ja%jrqhSB(DEx+J~(|OHaR3K!8q6OpH^^g!aRS
z58OY5wuEckO)dQMVBdkxt@)Q9N?iGTSU7a>?|g4n`Tb3ak8QVDNN)4@I7#iO!mTLJ
z%_=M`>@*Q!Day$kTEiDwbD)r7Ys)GvE$!^=T#S9`>FLRN@EW_osb{yVL-zRAeDA3b
zJ^bG*$-4G4PT{<P?K1zxix;0OdTbMX@}_a>d{VQ8m6fWw`R)e~9@zaFI(50gbu%L)
zqx$jV%8rf)xwyFcCMN!VSsQ*jO8$PNoRdn5c7#VyMa9~Nh6X!(dtVI!Q-L0=+?%AN
z;K<0pFJCl2IZXKc^Up>u8M|VkBldOA4&r5Q32R1g%UeBv{=9E&jOnE&!+lkn0)yU(
zUSr8^FV7|NJ~?sr;zhr&U(bCT8XCy$^(<>nx`;2iVdF-cJ$v@ND<m8ETj7_nSqmE*
zwe#mUG1IN#+Ogwmm-0ssGrU9dwU>>fMv~h~Owuj5@BZt>)FatZ;p^_u7<&CZsJPI#
zCiB)Mr@VLXN^5GE$Zlh0C98~`?)UNIN55AAKO$@k?;p676Sh|3Sm3rht%hDZUcP*Z
z5B&J)Q)#H7=Y4Es&OkM)+$F`z^76jMGfEXriRbqn$=nowCc5&U-6z9@_H+2JP1j+z
zwVhJz$tZlOCD{M%n<}f}x0g*F9S?kbXi(S0OEj|e0<jgFT3aQ&Pls#JC2Fm^uj*4-
zS;<m!tGBwOgnA7<i>ihObCk=_*4ikA?Q?T;^Gmx)#r($is&Q0POY3&Xo-_PPN<7%f
zWdU0y*tocuSy_Yp2DUzXM31KPvapbw91%;){m&yKO9p59`unLBqgnAI9LnV8<|e!P
zuU+A)o4Qtl`bB#BhKp}6E4#YNmYaxa32w%146h!TT;6x=*s+DhMLu3$DpK)Y9}i*U
z;aPqES@ClIjopQ-D{caokE#ZZdka181qJz`PEoyxja{3VnAnzgedpJ&U&9!kIsET5
z#2gzWOZBZfWZ#Q+@0epjCW}A6Ylt>hS2LpU;AD*t9&k%Lcm8}j%F*M{(6X|!HD}~!
zXoo&WG95csRoEk`^5?NZs^7W9+bS<_;oo{GchzFN@q<5{i1ktB_g3Gf;^J~>-MV$y
zymc%!zH%}$M$>&I&I9-NOxJs@Y?9|rWi{ALx}K+PfZ9&HaC)YS#}rpC%E{bkC&R$R
zR2FhZ@qWJjut{IB_w@8MRcKAD)hQKM?-Q}5D}R?rg*}p}MU&T^Q;FS!BB^}&G8aET
zf8X#hJ$Dd2R!T-jrntKL{-T|o-HXF}$Y-gu{=O!<DX%~_3o9)6WSwgPH|oNp6R{ea
z5+Wk3i;Ig12?--Nbd_*Zl$4a1xn$Tj31}2QKV&a@eW1Kozwdj)F`kOV^YJelN2#d1
zmIry#uhetntcQ6-8>AkiPz0oGd#vXEqU>W;E30_C0`l@?)>-IX?<=NWp0M>gf(<}b
z+}*t+Ha3>^f@tRJ*R=Ht^IVzFntFP;`}_OD#Ld@nh!_N*SgcNdKA`fh#zJzSCFPQx
zgM+H7>h1R1zF6dQiK-`0GNB@F+rIt!Sj&N3d-e?e{9dEIWeSDcq~X<3a<@Me6kJsa
zCWmfbRXEgiezMznFi^Ty&w?s9nrY3NH5+(%g2E-Pn3Uh!F#Yo<O;FXE(3-#9!Yl&c
zJS}Z(DskHV4_dZ3{PEyop`V<b1RyNFMZJcNo&7c%&F$N_c|}FpxVgFQZr;3`l(he{
zzW&t`>VGZqmfx;fHTX6-&m7UVWBXg?_%_}T>!aA%2&0zCj^`$xOVl(lc=+?@jc?z+
z@rsFUK>gLpve-bWd$v|Mw6jwr=8V>b3nfOPqN~xej&GV<DOaxeiJ4aRm9Z+_4h`Kh
zH_=I-qRkZ3a4#T$IaxQ8j>mnPno=94OkH=g=~}k#qkiLYbElbsqnC?3cc9i#y=cA`
z92)BP<Hxnvad9Ttg`84WoOG<*0r&1PsH&-zwzua8<-S{0KXb;fpg`eOOw82^Yo(jt
zK5lTIZN4ZPuCeef<c#Reah;*R4h{~tV`6s7%gd7kPQ@ufVaA#8$5-K!yUFaw$|F_N
z!hT=1t;kCWn|ft7)vKhtyPNV>!6cm%o9o`ghj;z_{OYVOdbPG3xj%<J`DjU5@X4#J
ztS0_Pj9rEF7cN{d9{wEJ_TEW^ZvD2~R~zGuzrH?x@4*9~`1n0&(_LL%78`c*br+kY
z7x&v0Q!>MIlv+5M>C|^^#81~d$C`0V6el(YT25Jqy=o7+?w^fO;}aBI*VEI3Gjz+}
zpDB>%Utlx2@56kL+mymRC9{8bEYc6xt~Dmhqv$SE8T9Y%v9i3#DR)z7`S0I`uU~zJ
zhA2t9mr#&ytMHM=G+4R*b!==&L&JuWl9Io&cJa;EW=iOJM7R1<O)vkdcWM)PTw$Wj
z%VyM;W^Cu=R5d%+imnjO%>rz~z`~-YuFkMlYx2>vXAgi41R|QNs_2unQ%pWBC^B!{
zwCOtTp?|Ff?)6K}djkU|WV`I%y>_@h`k0CejZIe`b+9@uYNxZS>(^_OEWoz%OXHa`
zuD|!v(omO{mX0is#p65%;Jvl9w0Kc8H|;q+{+oO7+c#QUTie@r?(oUVZo>h;r6F+S
z=+U(j4KEukG|!!*xufEZV;eF0G9UHN{m*E8p35%*pf`>EWm664fdns(x23N^lTov<
zAfGmI9tW-9K^-OL?OT>m5yQ!>=-dX2+p7G^YHD;3cgWo}^jus^8i2L+0rqukfsdQ6
z9n=xx7@wH9`uvRg9{drN-lpdR;~f>_9L|CyY@cs84t=Vr3I69FAJeLb{R=aLAM5M=
zP+d{h$H&L}Mn=>JkKt2qKa!JAGorYm=6wGA8E09=$cVSJtc<{-bki!6Kfgy4l9G;G
zxUf0zvM~0~3+<#jPlNbIG*aq{ii(f5wW<aN-0mJ8{lmkpB0R-Ee*Bn@UR|ydlAFD=
z#o(}}re=I<6`)LTaBx~~ZdJBrYZ%oNlz04+fJPXV_u(%h^VNUtW`A6GuH+TPNTH*n
zGpTs68P)ClUiJZ8_Kwr8*r9vb*I|b?wY0oK&BIy1X*+iA+{S<Y`A1V%ml4R0hlgif
zOSdv=)!#p}d-v^A#s{oS`zdM0T~f+6E~AUPoSd2}vhq7#345*otx}aqIuPr_v@}tm
z&0BSp!J)c{T2()_7-is4e19>sk8BY#UY>tO_YiD<=gytJ8~L0{p6*2kTuq^1HPb;}
z%WLl6zyGnO<_KUR_Oqz@C-&(@_jep9o1+IGK75GZU|V+wVAV9rc2X@-3kQ`Qcek=$
zX*Gb{6jWw352JJ)(QSc7neAAg_$3a$2|F1s(mIa#vE0#4x|5kHvAn$8H#NnIuZXuN
zc&EL+{go2s*p4U-flav2KY#uteCF~8XVR6TqM}e%<HoQ4>+Y^LUMZ=a!v-NRZFKZL
z-jxqU6U+vc`BJV;1?UP5x}Bt?WZ09hVw}uu{uMyF<+xEVH21P?-P%5-o&MzMQxmiR
zO6o**tATH7>Xg2rAIkP!mL2N=eQIkpC|@rNal`<n*bmh%c0V2{xQOp5Rda$FeK^Cs
zhWiaYEv}wa)F)4#L_V#|57A2@kPpb@EYJ_GqyN*B{dso%G^+d?P^jb;6qcF~IVs^#
zT6MqMd@Xw$N+4;{T3U}!(I%_6f{yeL1}WV=mq<%ROG|4s`ZZR69VN#&vweDY_NbN?
zJE}}ZW@fl$=80I9Q>WHH71A}?#I%o;g^7-Xcw2nmzErQjV@DiMjh{Le#9e`77G`=Z
zwKeA5yKNwSVTbGmCN7d=`KhyWJ0Bn4BlTThEF|r&U-w281cEja51)K?kRrK_R5l>N
zT>=6FZRw^T>+0@Y7QRyWx!-PY+VPJ&@JhA~8@$oWxk#&y@>+2r6#C>-p%OG7(W_r~
z1#l~D+o={%{D|)e8=$fGVJaslCxC)m^}`asF3|aw2z_jqDssdLc^w~L29S={sO9>U
z^k&-mmscpZq}-o>b|$8wxUOzJZa~QgmtUG^&w8JaKi&T^cqg`CZSfi5(47hjJ7#BR
z0ga23QVSonwY7O}QcZihyET$JDLR_Rz+?8p$L}}ByWfY7zH|X)-?eL(QT0E&yPw@#
zEx1+`-QCsYx~yp=%K0cXl>W(yJ7HmAo|#@A?(Wn5emo{s4|lZX+VV)~He3L?#NA0q
zOjOp^<^T#L;3a@lj3!9&&)&E*(LJ_vM<W2d?^T#EZx#O*<+e3PKLq<2_%h9T#w23j
zo)holUcD+E*J~(jwQ&N;0`TZ5u8WWke)jB>hG0+Lb>Dqs*G|qSzIw%vyI2f5C!pba
zIQE<_*M59<(eh6w()<8F+El3`*i=4Ft%<0s#I2c`Z8h8}ExiR>(BkS<Wz^CeH*R2|
zs6o_PTC}rra=?R}IQt~SHNc+CP8h7Cr75qdFrMrp2Vb7lxXH=W&D$B$OFw^RQB+g}
z+gEmQkVGxUU9RQG$UfkllANsS>8VIaBA``Mb2Iw0ig2jJaJ+WXQ9ZqFf`WpNA|g6t
z-{}8KNhorXTHN)ut?i>(z}7%DGYZ87A4Ybtnb|HNiFCVumFKdK>z3xH{B(pU0<4x$
zQ-CZUKY7yMe9^F?^bXU~QKO2E*Vmnvo~}jv1$`$p^~8yFAXAS|?EWVtWTwqDXx9I)
zT~YSbruIgoIPIj0_fv)b`!BvdoYZVGGf?5Y_&s!~ZkMB#l~7Bvu5VVBWM)9R6zTXs
zfBLArJnZ4&;TOU!8_5E6k?`h?%FUZncv~^gWoP-tZy|u3CyK(x+w$%A%uV$Mp~b5n
zJ2u$6x*`pj<NEu32aln&ZBNz$SGos4D@C^^a`xqq2LOS!(Mmj9w{E3uVn-Ws{uX@f
zk9=5S>sYv30%|ap5NGO7#iFI<z~G>mRhww0AS2ZmoV3)Um2J2-=gIux5l;;UmlxPN
zzg-pZ;~*PZ;Rjk#Vsxv)!fa)&jZ$>hlO1cR(Be@xon2hcHgDOF*P$|?90GFkjP%|F
zssIfl$8gAAEoK8GjgQsUC1kUK_mT}p_}!`4b5|XcuZf5AZoF@|Fl|g40wGXXelfQ>
zD?n?I^^i&P+d3eK<i4AkoAdGW(*Q4+4Kbkb65RXZ1ur%eH5GvO*T%-{bK|;1al@LR
z1=KoT+_ka4)3)f2Du2nJ*|7~A925yAmd>{>Xv@$DJPHnx?c&1F_D#clDVZ)<y_|Un
z$_vyHF}uFQK%pj;0b5BOIQ?9v<adNtqL%u}lV#*S<D`9981QFZ_Re_Uk(y?juA}g2
zXdM&N5!`NENBDiMaE+GlH(nJvO#~g_4r-0@^Ts_~UKk9*y@p(+qOZRL=(xc)!&_H~
z0|(12J9=kORT+BM22M_f0)wEqH~|8!4j$YLrueAZOgpm)JqgA4thV;u-@hHmjTR|d
z7I$}dw;J-Ud4`o6_!@f>G<@&gy~U6BouB^QcxK+hC+@FjbxqCKja7aVxi@d#)V>Wb
z*zky3$||fP`(=#%sIp}d(WJmk<Bce+pb&kiVx@RvpJ>lNjHo!Z=~{+{LD03BAq~mU
z(s-XT+``D0a|&1mEEeJ%rPVH^nl`54qdNe_$gC@*yN_n#>aph}4<3A(o*uktVPOF*
z=lgG7dLq^&qZZ{GSQ&H;)xK|V(D(kPJp+mHQKgUxtXCF)C?}<+R<&nd=^q)nx3J(0
z{xml_m)33tFkBlZwnl<dOGSmU@+u*LL04DzKUfKJBvcT-kZSbTWNl9DH)W8Un-d*(
z?l5g3{EOiApdi-3Ee!btSx8xT;M>w-^YmxdE0&><$t`=bva%8-VP<A#?|}mqAhg#>
zS$O*NRd)8~;)$czNfiXlO9S!&ELTVfZA^UkPWHKi8^`I>r^60fa?sK!8`_>LiqjJ0
zV_%1h!Pn$vV*vYY6&VU8u#mn&AX>hIz9(6MH@)48mX5BpzMhqEO#E_=RlCpfvU}Xr
zVP$3N9GmV3PoHvpSR9Jzn{j6SvD@L2EsG|w!L_+15NZpm-(DT`fo{*j6B|drbamCr
zkGHn8@B@RC`LRx2T<YksG)yz12K=geE<)!^4JD-V)lq*<NJjt`pj`Y14y@-7{%BGf
zgYSlAOh`#NijOt$`s*05L#~A~UlSSL2OcitFv14%+ZAI(SsFYp5b_dO8q`y3V*<|x
z{|d_d%N_<<-LZq_`@JXUs;mtRdi33_W-rqD_7@uX<17J<-ohEJjgV%&^7(nm;;&(H
zl8Ibr>i=lzXzV%VmzuX~zMsocRWgS<)j0a1!GioirqWKrxj&1|_wAe6<@Y!1l3j0|
zj`!6|p(vu(=;`Tw>FDT)u?$jqDZ1nZ22WslNQiIXcIkmLUVj=Diaz&-eE1-5)s?pa
zRTd1U6g&HxJ+&buy?|{8*Ttv>(%8M{KLR=Lt^WJFuV3%FoM%UkE(Xj9t%Fh(ierjy
zmSf#{H`;4uE7|^0u0QrM?~r8&$Chl_O`E%AVEEg&;(seiI7N|k5}nwms9b$1xjm%-
z4AbZ*{n*xkIaD?xaVy_L4GBJ8zg?=Wyh|G!RLZ7{oc5yCzf%jtN~>4xTR*IzUk(Dd
z5r+Hd`EzFBQ2#1!H)h+yMFAk_s~dKK(f{<a5)K1r7vj!IF%Qa4&&(|Q^705u$31A-
zCLpUa&eOc;RJS^lDhih-<#05Kg6*!jT3B>9%7Q2DV7P`t)JT-HEq9J>??dc594;Zf
ztb3oR{zm6K`EETKdxhfG@+rGR7Us@|0QaHrD}CeRwad|E{fBwk@ME+v-A|}Chma0c
zu=;JRG=%?{ygW{<14XxK{rdGXE<bk>L=Sa_PgwY^VRBbt=+`vskV@;VPG}0K(VXID
z45y}l??iu~3?H@qEr06d$xNBIkiD%2N}8&ws>T<DgoRhS*LHpS^eN4{Q|j^K$Gi~V
z$68W+b93iC+E*c=uc2c;eE2ZcUFNMmWSL;LV7r5ELov&`KOebVxOwwtzJ2=`A;O}{
zO^wy@>7(+XN<bWbsks+tCxEDoEiK|aXvLrvNA&geqo1aW2n$z2i`g62{t)&;iP2${
zY(g4m2CGcx#@js#bR%y#IF$F5_-@#??ZNfI$`atj&Kzss_wVIYjvT>T^9HN)q8>F^
zghg*!cV}VBYt<jhwGm`Joe$1pDk>@o%c3p`Kqx4dP&wD+Ddg@BseU&rhY~}i4I;wh
z+ZE$ezce+`QehJ<0Bni5{<aX(&3Ij36=Gfg-pSw>G@9o4&L%_bU1RVG^%Ey(?A%^f
z2k$(FTNVJ(C?dlE^1`E4Q~Thxmb=e(p2^Z1{PE+#@eqNM-d-LYgwo2u?ct20ckj}T
zcja%oEWE|C^=)>wpOB@l97qDpt4`l0oMJ*6%+1Y-%5P@&x{gwzU@>uW&umt0@<FG$
zoDq=@dTUS&G53XO7T<@sHSnc?c1lq$MJctMq$Nm8AKA-pvbyq@l$Fe|!!Zqg9od%7
zzv`sZ9mlkR**k`UjwWh#41SG9kH+tVC7XI4b8&Ggk`v~>x$%DaxWfBT-VY^4#jHxp
zw>*~S!p)`a@-GK$m;Nk3vpnJJT9j+54T?*^Eeb#GE;DpT93d>jQq&Sx{Oi|7OLG&N
zd)bM*VBj`!kP<8JU!j5>gl*?KXf7`*5W;vOcH@nFE>u6TNK{{ZiC0Ir{#7Ss8Px#q
zdrq`b?P|7OR#uki@H5T5s=(yp;^L1`e;_+i<jwu9UTVBMfV=+j^XHsJBT7VNrP_FV
zCVgkFEkRP)B2PsOA8wVP5UK6>@#A@pV;tMIZ3_#~hk^!2z<r_LFVA6gqk-$_ZjEP>
zj93W|ZZp2uBH&nn_>|h-mh%~V094YiH47$a2}1brT~ye;d$+o}dhzn&FKa=j4Dd!U
zTK3v;zI2@coX^bq<FV%m$O2W@fiw--;g7`_QMfxB<}(u}beJeN5B8Otq_c5w+%>Ox
zikDGum$EYQIf(PQFce`6Y4PFilTW+s<6faOLRWojPym9y4i|*W%FEB^!`U3H3_JoO
zGW4XIZq0J=iC8E~lXGRf;0nMbqJX#H;WDzaH^+v(>c8Ds^kAZN{Jb4<kN}C@+SI&K
zQY}LJf7yVnN?ccRU$DD<_pS-VwuSM`dLjbFG%$oz-yUmCHHBHymTkEOr>J%=AKT%{
z{)@cq>(EKP$x-a?E~pn!Zn`$BVBVH<pfFG3$m~2jrz>ZduFAH&9kt`tGmt)Jps3LM
zvFuRM)eUdIF6y=7j(*&8-cB9664Gtn^+7ryrm$b_P})K8i8K$#GvPg*M(4b(m6g?a
zV?cDzO}JLi<(ydX1*mL-ByLW2-5-wj;#O8xRw!)2{xE{njuYtvqH^PgI4*Y@Wj_C6
zh6_&?lv3g?!NUYt{c<fEKiF>&#>xJ8_3D+V%TF_naPjX^Yf5g*2LNNkHEd{Wy9e<3
z`t@sG5KoX3^zDh}pMyAUI4~%C96T&6EVV_DA*`g{Uuso>Kmb><ur+6PzSK-yY>+-t
z>jBIo&cI5~bT(DUoXN=n`KwHKRM^?s`-rv#mLKIYr`xH{J^TN%05CGvDy=N+Ja_IK
zuzJ{wIK7lgbn>&8F9%$Sa-s8DtT7Y?V2;p9JNnTVMr(MoM&HfiRfEbVFnpa1Ge)q`
zM^B&9_06e-efo5Qean`eQc`nKxr+wfE_Ie}J)^9x@|Vhne-YFTSL<2bQE1Jil))RV
z0&018@Zu5d(`D}Vu1$BdllNNc`bYNvTGV;xun#<?CDpKg`83o>R>cJ>d|;TUu}Tn6
zn(*bkTSS(D-3=J~sOGz-;1i%@EH71beHFaP4O_R~$F4E?^mreK*BB~bTcL;CvuDpl
z!Hd##gff$NP-#?uj$)=e#Kp4SA=!ebcDznJvxR0RM+lz#wd{^@duRa@GrQy?>eS#Q
zLr)U5_`;iPQ1EPP!1h2jH~_>tasE5oS|?DF3vdU+>IH0BU`5jT6qm*&ph%_FOa<aN
zL3aE6{E#n}2dBeis5Ttm?rQFyu0WzWx&&Q(lB+ZY(8KLKrvk(Rh3*lk6dD%}zs#XS
zCGG87X}qbty}co4tHBgoa9{CY;!4sI6xe<`shM9~Jj2jzI7dn3om(ogdEwyLjkk$G
z`nY6vn4Xrl@5x1j(V2sKNzDV;y_M*3!O_vIs2gDL_=GSG0rDZRNa$I(iXk7)4ET!x
zg}y3tV+^f1C`Z?!9X$V9{MG@%y?aIX2hw*%y-tC=X=G#+1P_CrrSC^`Qtb>shyM)N
zm@!nPf{nG0tIcpOR3OEIy`vo)LGgyeBO)TQu(Y%j>Ues3`uydL1^9~<MJux+=>3QJ
zgLT#?pnwpL2W_M*Ow4rp>q!w~_$89Y=B7v>XzJ;qBt0nXVZT8isOF1~Mp!*4B?{wV
zZSAKOZAI~IR#K>aa93++1t%cd4h+!XQ=Q%1BzUY-)<8H0rz#%)MLfyj8zL6_^BHc9
zS__8OSm)GLM@}-A)8r<#Gu%;uZH`;z18Wa@e3+E=HqWIA>G6(iCb)#K+Ds<50#AS>
zilA`$v>LdrKYR9U*7O~DE$}$#alcmG)>CM16%{GL5vZn>@a}BglBnf+rYKh$?y10Z
zOYd8k914_wokI6R(4q0=2+r?116`89$`TL<zzMj5yTM}rJgr+R5%KJqkid$Yc*Dox
zUo$|DDgq(KC_fg~)@!NQIXDI)oq9@ub>Bq>H#9Zf2B4%Pga)*=p`+tkdJ0-Se)xkK
zAj|F*6fCt&Ew~$dGW40buT5vpdK60N7NpMNPT7iiN5Q=rZH!k+yO4tRoaeqM{`1!_
zlqZJB(^a&G7u*$LcJ+hY?K>aGyKC3Cs67hkkZI;M93u|0#M8h%BgW&TrC0sjl&&qE
zMADl#kDffalaR0%%u#tS`zu5>0BP9ntD-z=uepRC=nNPNRpB-LJD)`mdJfi$*RNln
zmuH*;nn7l!xV5!4S?hs8Y8AfabSaaF=q8-gZM`1zJMbmLOI*SGz-5@dyjF#6YjE*1
zD@#)`ppj_yD46^9?SrLPZW^S}GGY3oHe5m{&yEiu8aE3#mS}$+9eUulKrxrb<q&$P
zjg)JB_4v`bM0cYS4MY?Q+~$e4zE?!V4{j@5Sc6o0zcOPQzPE4R64?jzlazJa9(k1Y
z*o&EY7X&vGF9gQ>lhHfl=w0wJ2Ggs!Ge>K)=L*a=uUq>$T8R|_fWA2~MWbS>g@uK~
z5WHa$sJvtY65A^*tcK*luZ9(w{u}wu^W9EvTuldX>z*FC^yu;9+W;f}y!xGUg6!*v
z$e)(RhFi7J=gUgqWs+B8$4q<UKRWaLuvtTe^;OWc;^$w$N#&Q3$*Y-v@sK0@Uc~@+
zbOi9w>Qe7&Sn^}3f(D+8@r}g!Aj%|`6-FXK(v71qzKpM2p+^WIFE5YeETH9aN?NcH
zsmZqIgUv-Q0#4+nKKLdlC!gN(4V(l*;|qKlPyyH|Rp`0tR_z(W7hm$u_redK!7c}+
z2sp&zDZ;er&wlVE#D4IKwV;yTIzp3Q>#xgj0sWC58LlCPrcnn-kYY@?e>%&9%rLAj
zR4K8su~Ff4w{>=&zx9*aG_##DuZvst=8Yf}wfTN+NBt$YS0zS9KH=7YEuxk!LWCUd
zITKYr+5H}JRh<yG&DE<VpjDcnqK;!N#QT$KAGiaPF!r3--viV&M`;vLE6g@<S;<K+
z>3J0mIkx`gI+LH5XY_B`p|sFr$?nhZtAu0t`<y&^l6YwU%AC23v|kp@^k_f%;z|ix
z=HXxMSEBFe2vx#O4d;GbU0uB}U$hzsi8rBUlU;Ljv-8qKF44Fc?vx6f_W^)H5;Ja$
zJH;tu#|ta0mNx0FVG+w6737}o0dXcNt=38R^6uQZ#@N`HVg34}$mY<%WyX?h*%E57
zT|GN1s}E_85;)suFI-rgsP&1Kr-OzU2;$%AtfO$P&|ZT8RDa7X2!vEmw_LLAhs=aL
z07<mK_(j%7?8di`+y%o*e}8WR2^aHPS!P+fox@6*qlD{11je2&L+7}b{cQA5%hUWM
zu=~bQA#MvOcwk7PuD?g<JLdWK?5Rq=eLp__O3<DG^t}IA7~7?7cn=-g3Vnj(yWXs8
zLHo*E^c4?}g7(GNjic|H_ROEAXXRF<P~vpG&=WUeJ^H~nC+5Z$VRc>~`ND^4g9b+A
zBFI%Rp0@?k>7}#`Y%_v?4FCq(n^}K}Ny<?*zOlqY@}^vZg(xSvS?;TU=b`o;Idv+q
zVs3QSRbcRKVj?3dIv`r!&53OUUZCXkRqu%|hvgu<Ub5$%BNO3&mxYf3LutRgd^Nb@
zP2*@9@g>2kAhwK$dEs4=)bW9H-ez@|vNSJ*qenpYiQ}D}oybsazAPNt9*^@@S62ts
zZ^vn2?Cj-V(W~X~WqM*X?qH)4NduxFhwjjhtJs<-(P=i_vV>ls|LeaSa?HMx3CgOS
zR1kms7i7mFZY_QNKnRKn%|BSCl;1l|Ro-2{UDUcGOSs!tb{!W4k$IVjm9BhyNQa{D
zD_z{&OwrQj*%RvFYOJozD_wr)zz98=W9vWQ*7(69;`+eN)=<G}=EEH#xWH#E({$7M
zGmcl?L!Lgp>x8oV*1L8N&MNE*$YH?c>q%zMmu~&5Vn#k~IFDS1ZqT~TPfq+^f_99Q
zQvc8p9q5{&p&`Uqey|C7?x6e}Ep6?Ak%kxwQzdaAz(9;31J2K8Kl*$~MkZ`l(h|0V
zqoZR{SetFMa-~T+`crSea|Hq@HK8Izh@S3t>g|U<0{6uD#A8Ez8FU5PqQ6pcrz1~L
zpZck(nbfRtX-(gpi$iL?9KPI*{H}5P3Ez7ju@h0vae!<<kbeiBwO&9r1s)&#9vlGp
z`M{?|BBln=!kKv#xgw;jOHsqFFE3mt$uN{?n1(oZ6SGAmiTYbwYD?{7^<4#eZyMpL
zr+)ak?uNa6@Z-m{k*ATJ@c+9sZwj9RB1O?yx#{uajO64s#-F)VSNVHxP7P=Gnx!Ql
zhyQ_`BFHSbEk#5Lpro}viE_uyA;NM*#C9}Z0&WHdN*@QTV_+cof{vN<miHQZ64^mw
zWJLQ`dQJ`pPL?VRdl<{fbcZ|6jE6pXQi6CP#DE{y_kGSOWVLQjfMJJpr3zelxF+uv
zWQyP<D*hSS3BtGhXDoFA3UdnGN!JdEMkCR~yEeW=!-m)b6FK+eYY0s6#70_fuF2(b
zoismJ5D$1z_wWePk}`5~PVbb^Sn++%Ub^HD33Z70gV2ny<>8b<F;4raK4>T7jI2kU
zfCK=d(C0gAnK+2(r0;ncw@dx>=^Q<d2E8*X$b=z8f-J!c_`tvm9K#ORuZunK_kaI;
zz7u-1*2RmaNXc+?w(5RC=&K)nohVa+`}Z4*auO49i=q1l)JLaoRxq)JxlUVBo&lF`
z?sq2UL`XG$NSKW1rm?a86<Z8%=jP@%n;Jt$SU^4sLS_HxC?oQP$(QrL3Q;W~mq?;L
z9NsAD)%B+$q>SsM6i9&UXO5!!M14h$hl2>DQ6fGh>tz#Xxx3IN3L-Kxhum-vQn<&^
ziLt%3ljzVS36H=V5FT;B=X;BQV5ebk$0q*@LVro#BL6Z;hzjYXQ6bOikn7`uxR^8Q
zOz@*eM(JtPBMaQWvH$s`q;gI!i7P5{!y~UiO|E+@4ZT3&N06m>c$#Iau)kt8j?0vG
z<wzvZCgHI#TKZr}Xjg5EJS+HwIAB+=?t{4pDt8xeU<z!y!zKvc3aZJY@bLDSr63R)
zB+NXQXHWvQjNqHW$a-)}<E=vN-ZhPLq}{)GQT(G72pr88ak~-?t%u0Gl04VF4Lc!?
z+RR9hy_}RJ!9@AF=nkORo!k25V#lYPAG0K}0AkkG-rk-2g9S1TDv?0Q{n6bWJ0>I&
zfU0k+7~9$oc>s&lzFCi#l{7bJH+s}T(SUN1+vCavae`PZ*Z=-;L`Nt!5+(l1RMDzp
zkIV3m+UJLU6>9hLm2*Iz&NLb-pxXfemXVQBd!jZ&M+kwIj`D-O9S5gAi@*OH2|*C&
zf-IB%`<u!!4My<%bc#F`!0;zWoh-BUC_Uu}T)|X>@n}A2kE8~oQRPs(mM+{0%vnk>
zNWG#*fk1PX)tva@l|z$zTQW{fW=kH^)MUd&`Xwg|#U($-@&e3}q}yrBKe<mn9#>aS
z3(I3h@(b$46<qTsh|!R<hz)}CjI6I2M{Pb1Dxleh?c14H{CssYDKB5Y{`S@zjA#v#
z$-xDqeSK8eGXi;|D7Yy9B%=27TQy7w?f8(e;t8u5RPQZU>Z2yIQhEF<NV)}Rp1Lc_
zUD0h$h`8<Y^Ii5%b>A|E1}glu_tdz_wynajegx8p&j1(ie($tXR<`5Sp2%$ROsC16
zkcgqMLv(>sgu+jD0`w`;<Z8YX+xYzX^SM51EfA52B#GnbJlB4OXXJr7Vn0B!nT+Gl
zo^6x=`@50A4um%le-1agaOB7uAah<0e^@`^;!mx4AlXIEt?9RuSpZzMIct-jk@3}J
z<KO(@C|^InW1;lp@J2~>k@3vGEDXzNW+Jy2ED1i<2JoJZ_itB+S=-p;bj0V%0iJ*&
z`jf&57>A6nSm)g5PmWU3(xy20M85<h3mcR`^ZWk&ds~k6CBJY7^p0MyRR!QTzX)mD
zo31Q!hYs-}%?ueA<^{Zl*rPnku=#;wQACO4g0_y*o>tNO89^dY2B?FYd&`k@PdBMp
zCvf;_RMbO}!UY;+x_``N%}OhHF3UuD{<#|QyzLzit3Qrt`I`$#@0yu%PQeH7@m#XL
zT;$o!zaV8&&LCu1$c1|DgJcCliG+Ihqm2S-U->K&dD>6GZLSZ+EQ^7XGZoJ0z<s;O
zbyl!&`R7T9O9Yj}H32CC^nz-llVhbD+SA(Ab@{`u&vG(Olb7Se<E-a-p)Nri0I-Gg
z)G*H7)V?MoH&>ceHi7950TGeZW6B?ZSe8*$DN`hFALTZADAVH-SUT}k(1!PGCy%71
zeh2B8hPFKjR+oSzjGYpE6z1(x%m_rmj{ygSp(obqg#HWKqfr!zs@flXj62=cwmaR_
zuVM_^n>DP6Tc8a`c-cgik{U;etvvd5Hv$);Ub!&2AhSY2CnO1UME+SC?>81G-Kf6A
z?HP{n^<4vV9H>YV9fF?Z`-{ETOl>=CfAc1#@|)AjdPIkpGXAnY!^ua|bJs>j;z7YZ
z0K&q)#`U5?kc2)0=s1#nzZT~o^EI~Y*XIvDjI@G4h}Z>F*y9L0upsv=9M~B7{CV-<
zOxSAflUGx$lH1ldK3z-IxF>Tz+IJcf3Q#@=5VkEr8HUrf4u~G2d_Vri+s}_4YE^5$
zTP(gLs29R}%oi`&96x;T;6&zLw{5kOoXnuiP*?uN1H;3s-RpoXA%-2*(a{OYZM2Y7
z0gD8jMXm`u7G)Ik5QtUbn(^bbqh3x{_E}x*KK2Km_H|?HK$l=D4%UbinMI8T@hE++
z6`6CG`~=$zI3(KrCnL)aSz4rPgf8VUBS;_%`9!F&w`EN@b`T(h#--Str5uEiQAHZx
zN7N6K<T4ns)ao;!qBh}(2<wa%!?%0FH%*)h6Qw4B7LuB#(5%7)BiVX)_YckYrBMi+
zdR*uc@5^B@tE{RTgk+^`WtFTOe!s0vhZI8K=wdiFVa(g`+VOEK5|6<S7Ag3>5&7;J
zBvo||FuF)v?{@FF!qnc`xf8`7?4caOblQ#YCz3YpL^&p21OWh<MofNXC7`(|qW;LU
zA>mzuln|&97D~phj|$c@2-5=?tFN#ONcmBAJG?K|QwuoETMWm;RdVkKiA&WMk}z89
zuFZXoWF4id{WSYPm7ic>;C95GaIgsSWUO|pe$LK7i=_QARaII7a+Q7Q2pU630XGD(
zYquAy<CqUZou>@z+1pE;j+CuPer}KCD+wM>OoX3Ha;j;`5J3KyIGqTR*Ck6?o!8c;
zoQ+F=UqE{%TG9GoS~A#(ZLcDgQO~g_C(LK_Mg+No{P5jiSn^6rvICfxB2FB}n2L{o
znU?m2KkVuO88U4E^JM$K7TySoHnA*#9?OB7jI)=gu|KC5hofQr<u#g02n!RdRRv!I
zdp-R4d+3?O9><gf#rqia#KEedGJvpO^-5Lvs9J*{WTb!weib%@9D5n=Wfw0vDc320
z63OGaK<&Rp#M8njocJ0V8s!r8@{K9trPfcVp(NFQ-Yh34M?zm-E5D-QIoajCfA3pi
z0upD^lyEjo!A(j#S;re%`A`8APhZWwSs&GNp9Xg~TwlUY=u=u<WF@`*vz%|qEePW7
zzZ;HWp^0}9KRkn|O8U(Sy*Ot5?P#Ekn*?;ad^`RWtRTR_G0(eY%NCe2iH$|l_Cp#P
z5mM{pPe+<?7?@qTawMtwW>Rm=>G(3TIynqJ6(;@Ir^Y@3);%8c`p6Onhrj5~x*Bqn
z!wO>5`JwDg%kxO9t}NDjnfxAU(1?4AFew@UNZs^UYSF6~FK)S{Ub=MY`r=P>600_J
zpSg21mIg->utQfU2ixOcN$BeXs%YZR*1qc$C`26WAqhRe21X4q5sw2(^hT%XI+&-y
zot)#TFL|}eiI4)Q!2HPWVtF;RPl2-nXx!Zs<^1YWY*osvk)_hhu1=(|B#yUh(|t&B
zABg;$2WxaHaWx0AdceGg+zJnns3+82sD?4mqi%M3`Y-}8l=`SZ#5XV|Rh!&od=U)-
z^^wT1zzQS?j2Kt=mHkMULPsR42iPjf?eZ-cJ16U^AqC@udcU=Eka%kuKUDqs^V>_N
zk{G253*#hV2!uKbd~ZCr8vzEa##+VY=_5(hkW78iR4E$rkjcoKBER0PH{Pzzat`$u
zA-?})%jV`LwTn9~B*py?o)XxEfEk3|eR&;B9hDo;B|c^}$j#o2c2N8LNN45+ukB~D
zQuluC7SBZ6fkX)MFLQl>miXzQq~eCdb}|U9>jZ6i2oDzr<3&cs#?jGH89*eAP9SPQ
z4<!b)0Ap;Bl^{2jb#}flYeh)dV(xiI^jjDGv**wIK7P!eIg_Ejs}wnGz9%ORaBw(Y
zzzHXrO2`+KhXaTYa7x>7aeFP#xb@ihmQVrkV;1I)N{PH_micj6Sy?1kx1#Hy2F+*j
zs;6l0gg*g7e%Qo>|HR3Y2ZjQ`HsDT7f7}^l1e1Yi)sTUvu71GO)3<(e`<Nq^CI9p=
zQ1Dg*;a8$cN_^?VspHNimLsanwq?r|N6|b?RB$P{Hd!48zfN;syzv3@>b?8-)14Ep
zJ30Bmue!_1^U!|ybNH##XbhBBVyGv;E&LGw2-+q-7fE1CzE9bYoHcAU%?lUqAm53M
zlMzf33Tq31M(wVjk-if!n&F{r3zGi{afTXv7Dp0lO_hA!RBuu5xb1iR3K9SaA86$M
zQPA7}QX0;kfaX^yOdYR2*e)I3F}=E`*gD$Q&FEU@)kXnacUB{F?pwrnD~VeV%Ex!)
zE*X3Q8#a2l9)bfBTJgTQKx}#nsc@8Vg^l|&;$cF9_maUUNX1S%|KxdyN=Qm(UHWU}
zD7ym{*gX5#OM_Htw}mlbl%m^yeiF(f&<P=)whee#@i1H~`r9Jj1xYMCVCvzg<JTm&
zaVffU!ZWIXf_C8TL1;K+vPIP5BPDYNm1PhyU<AmLD`up~VC4=uj+3Fn74TxNPxmP!
zG6(-o8PO2fkjTrHfFjg=p1@pRrhV4LEF$WpuWS)9$QsF~)uFV-$Hu<Iegy51YP~Y$
z`M4TE?W#S|?ml4KSl)LhLq4PL^LQ<b5!)iu;|HPw*nxx4v$96c<BE_xr$=fQ6E_$(
zk}zx>o2{Kq@es`{m>VvtHY-IaVH#-`Xv%42gKt5-iTZ()Obf9!sW${dzI2cC8j@F*
zbMp1^nXcS!YqY#DqtS9|$?Xfg`(pPZWF}KC<q|K$7=n(r;j!%~tjn0ssDQZ`?$Hjb
z6JCdn`7C6j@>f4gv#vI#Sv2tbAMb^izaEMY#z5LKudo`peBbO}L5=)6QsZNij}AfH
zCNLG;a~;4`t!EFqFhDrd7U6QdGXVlGkat&$2@XGOkV#oj&_<@l7~s{*xVXyk4`%5)
zIt8u=0Vj<PZuhIB5LxcXU1ntrMj(In08MEC_W+UPqvy|u)J4{?N~|;JV1e#UfFl6H
z!sPp@;_o*|h>`q_r>AH4iifp_{Ztc&=D0P&wEV&M66G7E=HN&oo$d$K1{)oj5eTX3
zB)1F~dwOmTQz^%CeUwmpz=**ww%;6YYcO8D3IiNk9&-KM(Co>Q-zGCpVG>tZkfnX^
zz=}yuP#he76F=P_)&~SwxaIEwgOf3gh0fj;%Jm&ZP!Gi4v*a8I7-#LEm*^~d`u4%Y
zhn3)_z>;JPlcL;@-T>+R7LrOj0jyv;(};C5VVH)_Zpl*u-Nt=+rYcP4x=HBDTL5CD
z`;mz$``tJ9`*%5ZH)08IghOuwh(dEKK{FuYAEa2IDC3?FF82ZhC(hQ}_s!)Q02edS
zbbJ!9m*I+Y$N&fjZkRgM*`~ek$qBf?e86bvt|V;$wFt~U>_+T=h7!SL2sQ&jD}|hf
z*)_xQO_=u;i^H5I2*a0AhyS?g7!Cr&fPao5X!NsxJ38>P|Kos0=!Zgm#4t^|Q3;Jk
zv|>SLXB<XOP@j+wBz|75O?L)oJ-k#<HgF>&BF13;WTwl$9(mN#AMWAVmOTZve7~Fi
z$`;A4`<%1(o7=`(<ejlaDlnB`m6PhaldUVq0+s+LPG~vS7IAMsn3HH*P{RlyhIC7&
zgDM~H@JD|jXcNui(l3|d2~#<&0n(WJHg4P~yQ~Pf0vn1{608e36VtF5O}g@~cfa{~
z6o8sainI2EL|c=Z;FGx%N+oEU5K;J~r9&4zK<Kldg`b5TInkLLlbBd-<ranE$qy@k
z#yI7j?fh74T2l>4^6~&3(j$eGHG81e;Zi;ZAHMPbvH-^4J_eJpB1xtoM)>U1KMyeh
zPWUZG7Z8s&6W>Z?JzZUwRs(nBb@<ze#0T-EhX2tan=Y9Tvn`j%!CY|n^jq@)u@wJX
zV`M!rQA4Wi)jJr4IePtidb7cO1;%Op0@r4fFY+jd{?=FbJ2l;{`7S=?hr$8WIeF+&
zJ~%QFzHr1ewyp;SiMZr=6ON}F%X91}r2P?8D~6tc#l_IHDufk8X8I>40!i5vzcZ07
zBGIX+c~hDsro*okm?H|{ys|h<W@+(eLj@l$ojFUz5psVbs17+7(v{#mrGH$oIU^9#
z4?a{5(H$PAeXG?wxuP6O-8Rgg=am_~#5TrpCPF_t4KFY;`jzS1t)N><tIHBF_k9sn
z3kZK?6`kIyJ$VCHEOX=Ay>?3yD1%kb$HovAm1C{ciD_6=C9=teG9*&gHX(^X*ulq-
z>d0Qlm1iJRi$M&iAW**iLQ;z-c;}>h-Pb^DK@3ubm(9UdK2qLg<DS#cMc-?K)D4i4
zX-tmDF1w(LATmsb+{U|5gmT{{6zhez%k^Hte<0fO0!BFxc*Y-u2}Lj);z~AR!TP`}
zu)uU_#jApN>brbp$IPEcL|9&*BA5p!ig*gA_oRa=uy6-eeaOkNDjgXa5!QEtqNupK
zY!CC4(hH5pc%mblLwESrqKXj4fYWuX&R)yOq!RM4D8<DQE{18{${#MJluR+eIc82#
zDs*i!H&2G+pof+ji58=#LwX@I%j7R98|P4VIF?Yo5*KO=SH8xM3t;|+Je>eJCW3g7
z?7-IAz|L-FKmXou>nWCY7R16_ehnD|63u@wQcG%n_C(C8F=8qg1q}!cgJ1WFql=Md
zWqbR~_IcDV%Fn;(yD~S&IYGKe?hm4LA4ZmTB7R5%TK`^=2S*wWa}bG6k;1=oox&Fz
zK=UAl8wn;sAKnNSKJmNbnkXlU;*z(=HX1*7yd3hD6<l`kWik>fJAR?HW$@SMqUQFz
z;*R<VjW|Wdj;yKK)<z=%tP<u=W+1@FEUu={cb`40-?<tcZhTk#$W|lT$8U_3H>}<8
zMQ6iy!KWjc<s5D`{zxrx2Bs?nZacWvEkI%8C6kNnXZM*<KY7JlP{!;?3A>3W7)*Lx
zo@8cDxZgV;QLmRQv?n*p?X%+WczsW_{#kE$@+W+zvA|?r5H!T%wGpzb0|NsIUY3&E
z*f($PhaT)25)LC*#^aA9LUL+#4bbArLLn%XW>&osf;zl!nz*>QwCrs2CYx3ZNiq|F
z>YWB``Jn5&oea@OA#6lw`B*Su01Bi`-sLAPO42cmgMh1Po<F|^2@#BEGchq;>upN8
zBuywjYT0U^xH7!-qnzOp5oMs9saLcFpRg<61a1Pi6|Iy>GzpG~@W((B3FC@NND$`|
zALU1Qc16W^-#v;S^}KhXcA;yMIapYUc;o_}-+|bRcm41YjA6ZaK@}bz4&OmdP3`V!
z@h_h~t%Ep)wtG5r{xnP>BoWAbG>F%4!O-(MI`=@V5oR*M;M_=4;-*1PYpK*jg6gvu
zPV4#n85~2i1%7UjEkwEiG~`Qxn-lWrBMOB9fq~g?{lDKhbcFAe;Uh=Q!NI}AUi2?W
z7$`N3uPkXCJr3;ILyy75VxY}Oj~=}^m{;`S0~d-IGVY}S@5bilcMJ+#jnNcFy5CFd
z<=UidNle7t84pkIuHbeyxLJ8OzON(mCLksn&t*0slEm}8K%$sx4YO!pSq1$uid3yE
z)h$eQ79E_SX*#L~9kbze8yP=1fBwkv-_(x&Z$Ln=KCxx@uOPZb5Vr!<(vu-6F0(&2
zlLt<K3-kkQNgjKdpMMA*Z%JEi$hq?^EiEG*+55Q_Tvgp#q2Uop1tJ`+@(Wl}xU2*W
z`kX$^0uyBB&#WclNihIwi1bqR{rIsCP!u`Ohx`{Ae2R&RKR>D8LDPbu!KEB4<*A;6
z?T#uZc5O^4KZ}4AnJ~lz+sJp?-8<WVh`a>@dgnCNW0lVyuA!D>Ox2V<3S*FS>T8<7
zB@cQ<B)F`tCoZ*K*?|W`Z1z~?+bJfNuDi9~#>R$B=t6KNa`4Cq1BP3%|DaZfg@>R0
zX@a>jk*}-?Ay==ei%MUSu>bk(I33?!Hij(9kQ9A^>#PcpSZ2&7>LIzNJ$C?R)6lSx
zWe-e03$9FsYzW(53_Ir853>Yn(*B@f%L?3LZ*OnamPQS}-gF{Ok~ZH3P--(aMLG_V
zpeaH!>xh}p$oO*d6vg(Rgtu=iA8g)lw=`$1_cSp)d~3^FeSe6l=Q8YTo*sztNJalw
z+tJKQ-N8_Vzc%T}x~e4?ytVdqqt9a#&%b{Tf}mmQJoTUj$!?=?-$k|$*xrF+@u0$l
znBW0(QsUw&*RBb|sCn2^;1&Y{!@d}=B{;J%V+ztkrrePhdKKA;YDNhT--YL&1YmE7
zjTha&&xkwkjd%UZf8Wy5lFY1Wgo%2$T&X^^S6uv`p{|Lpd<in?fZckqW6QP(Ut%5M
zzDWvhwZtYyjOX0fgv0vN4bbSGbsgSB>%Bwnrl{3$e6;6-;^M<8romah-=ukCUX985
zVWJApaNrDXLl)AX<3K5bgU^z^PN~3{G%WHY;l8KB`WVpD!GsEaKnvM_4cEtKNNj(5
zmWo6*WXD{8(1a)26#Mz*>eq8lPc~f6SJwaXv|#4#^CenksJLWk!fR!5kLTani`blF
z9)%eD$I{^P@8E_R;lLr1MkfoTp5P&j91bCGBVS%}JXu2qN|4B&{`vi+kp4S%045{0
z3x^{I<@WDqLbR$B;{_idrCBpW-X<j!gaCIk>4~Vd4$m2$szRtjIznnNHW7@#qdSZ&
zEl*Y_uj<1+nYvv|`=f)T!C>DV)79My<v^K@7804=Ksh5)ZG$mEiZq(7!*D#2*wl!$
zH8Z1y=Y#(;=5gj9wr1_OaX5S;hK1#j`TQ{yNB9ngY>EMHuJ)!Q9z_N@k%qZyssU{Q
zXvS@B+#9+hxYci00cfolZhK?f>UC`6x1LB<i2ReiML~G}38oc}OwHRes+;&~1!Vwf
zY`y&k+Xe@l$N>;&$VeW<FIY>Clsjl7@C^G<Gl;(PIZA;kT_<_|7c$G`m6ZcnNE!D9
z(YR9)ccI4N2@<2eopO($p-LK_h7?66IDnK2njH@a35gQ#!{CFC<*ABFO(Y*M2WyOA
z29lAKWiK=kQ&Usif3>s^T7nh8NnhSMTH?fc_m5q@U$@hDJFH_;<l5!w>HXB!oZaTb
zWk|AysI^&iaU<Id<!+9Dv2@S2>D~qhr5v33svDjvB#16J`Zswt7={f#uwnT+Y8?cI
zTZpt|rsBAqh3A3IM}Rv@_ZR<Ux*y}IaQD8~gc3;tCGlU6N<mDa;mw8pk@mOLVwAD9
zJQu%jL7E`Dy<UTj7SzcXPhRN*QYOK|FBm5-a{T<Cvb`#-C?nq3&=8z{%O)(P?Huy<
zpTgCl^o#RTcog05K}@(B_!@^)7)=fvh$&cI)t)9aG<2tg#3m$(nXimN?vALHZG0Ly
zFgBKS-si!tgVhYjGo0n9PRv!~jrGQlsi_ShUY>O)EHZKjqHA5df*GmMp<BcAIq@Z+
zrjH~m>G5=A*si7V*5-e|4^Qc!!{kMPmA19&nIqxxe{OGwtd3~49mK85fqF0>Sfr0H
zCkF<u*U->N!xR7>nAFHs`t<43siUuc>c7I?Of@Jlk;w=Q2*BddlW-Z5>39wdv`^Am
z(Mjp|@8AE%lkvDoVc@TTfIq3dTb_j&qg&rETKVHIb-LY3ifkNc&k|f(usX<uDaaG*
z`919vy^`8mW^$<U)Slx@lS4y82i`Vdg<d_Wtwkt}6xZ*cHg|AlWn_?v9+D15BnMtc
zfHu$BM6Cgcnq-t9(5^x41a&bUZ-asL#{y%rSKTIw%miB8mSN6x&;V#@8e1R7&bN*t
zu`BY|&!3oCJ;_9e0RtVV5R}i)kDIe%(r^QbTBA27{yKl|T+jW0h4*)nCT+X6Y11Yo
zW2N6!3oy{&E>=$Wmk}z55EB_VLc}$T>fwLncJQ4rJf%b1ua>Cu<Lwe^OBg~%O#R^-
zo_*}=P$m;x|D~#*ITLTrO5=FC8WilBqs;aE%Xho^mnM*nxeHh&p{^?KmXEvuX;3If
zCo|Lk{kS7<4E;Ahg|-AIkZS@ARt~AW*yH@05|V;CMfct*$1l(Hm8=2t9L*|0J;tC$
z?D2PSH&8^S_SG+7aJvs0O4gP6xw%T@h8*Y=bS_<LtTcj&d4!`D*|o8ENR6sm?)V&s
zTp?UNR-%3*C-CBVI^wDK?%gW}k}LUsW2^2A6&ok#LP@<L9j8&Tmjr+BLCl9C18g_^
zc?X`XL^VIv+nO(oGzoBDjMbif!Rj5_KD>B@0vR9SLm&c9<RLtHg!!fB$GoSPg{$CQ
zlTOv)maZd&Heb@x!hwHa`#e5%A*mS=71)SKsc0r@LBMgI?xTj<OB}lsCAaX<9l%36
zJm+E>xL&ja5B@nT$aT2eGxpUhAAtfmh7UmAz)H?pRvMLU<TzzcIT3sAgZtt}G)WR7
zM<R$kh-hjm$P0-}C;}sH!K^k6FxBz&w1~m1W{Ncn9vR^}`{NEOC7FVF+_6EICKkmH
zN*sBT1&m2kYdi@hz^41%sG~?mmolQ|gR8K%P{CN7X9m`S?u&eKv87P%f(HK{Z{Kuy
zN54Jh_s2VP&m=XO;yt+Kop0f`8-M@w<kVyYuqdR5X?Qk+So6S*@@6~aSp1G0A6Z3w
zmT>v(J(j|VUx4JrzHYP&ya1MnuR~4-AiXLAsxVfk5p5Jx=LW7vw-^UlgO-HH5|v`7
z5Fke~Z3v2+jUU1rp?6Iq_T7&i4*#!VF&PjM>2`#{wsne!2V^LLNF%J<_Q6GhoGO^u
zFVLdAYT6dqj7&^?h=7b-4UmFWOFqH6o_}RAXLk?shuB+7L{!1;v#dKgkX3SC{J9(T
z9;vEC1$Fch(z%FYLQmiLwqaw${6Ga0sbKicqj>U#`7c6ZF<-BmI(6b<Dw&YQ_q1tQ
zIH94@<mTSq(P8rCg)*7tK{h_27ext=xM}@nFGZ`%BO@sI;7{Ghj||vdRgkW(z5z-%
zhfSM?#wd8@WQ&I<ZVgoBJHS_?THByqJ+`gF>+xOTOsE;un2D<%JxYj{^Pk`QhU%kJ
z&l^Cu8YHF)*!NE5lCCZ;DJd!BhTvdm8O38O;t>rn03gON!X80Niin1o&(ldaSqmW~
zLVvY`0RG}ztE<x`!Qt>^nXB~id@I231}L)}aviuCWKi~EszHO)&@Qp_Uetwn!rn73
zeyVF51-|b>#$>k82|iOg_LacSI(0uu#+^H3)EGAdB;$vkp1vT82-aCbJn+$w9p~S>
zH=+8}IXw7;g-hl|=HyBI>E=`q1esd`nUYx>gxnR2+8c_nTglvYAR8vD!<4)VwHcq?
z4G*0yq6hf}A7?}hClB|bP$(q&>N+=mADjabv=Xo~cSRln0RgN5+3Ns9tHJNsY<;0L
zsYR-hkQP)NnEqOkvQiQ2fd*b`E*MDpW19r<u6=SMHgr%GVID|)S_-lou_Mc|z76T7
z$4G<|2GB@jJSzsp5{ju_YN`jP<FMmF5FkLUaz5$%B*&|eFh#_g>`Yvcs{JD}%mI0a
zTqS7OdMYw_g<hm4Hwi$6Qj%qH)OzqEDhvSEm*WmnYf<HY|7iB7IANk4r|wI>J$d+_
zTAWeYN<FL>htYG}f*G5FbC9iyY45AN=D89AhGOJi^=DVIFw#*vdU~Z_zivbVfuJko
zcQL~53oyoy{rBS+IC)!vn>0d7oLjT7|79>HTGsFh!`RwMi3xK<R<8hc{4pFeR(rUK
zBH{&{yF6XS%KXHKJeCTfP{Q8Or$NUaz%KbJ&W+-L2yjET<;82B2!6?6Qq#iz3o<E~
z2r3r1UkhY3-dkiiER*u)O(~h#L(NH<<vxzqU<_5PsYpUXV#B6QzUY-B&u#JT(=j4|
zV2oz{=iy)QU^~>REALXT?a1JBBtnf|>ybZru1t(=Z(%?{J(N;19f!<pdL?z3Hz(b4
z@7?}OD^n_$^zHv~c8!>df)9ZMHwa{Fs{ZL894b;JfwwVsz6NFBM;D$$WM*de0#i^K
z3W(j3$6kSpU}KAU{BfvM<inH|_V*Et{!KXHK%+94A}KY}p1B9aJs=<;VY~s;_z2zX
zfU>KK5wFdXzlZ9!6Wy3rxera?b;lk&9Rsks3R1mlXFb3JT3s33edup-AARsRgswu5
z?X7S1k04&>=njl;Q0Sf*Qkz^9|Mh#@9tIlRC(`eb5VV`>QTQZK^ZiEGd#511`Sq9{
zs-V94M+HIjq1N2p65j6)cT=ySGs0{IkzQi|Y}m2mRB$Ph;{nWjV6Ob0=)5?!q9JGs
zVdAT&w3L(@EQBvf7e&YwSV=`Z^CLqT2ywlzN#Vv#!4-=CG4^=mt!$?DN*L-LOic8J
z9eGF!Sw|y!b)xh9jr_}aXi0HL2Og5R4l;m>wl)W$ba+^kJD!0tf`XM9YaOlNb{Fjj
zT8b%Lpy^+~Vp6hkHp%#6SCLorpA@i9JbzQ*<QY0MZk%k?^Vk*_gi~-5%K%5{kO}U{
z(@xZKo*Umv2IZ43rvBLV3^%~Y)YSKLq#TTBZt_ey@v>+ouZIXKA!YW)R{P?`LC98Q
zx&XqcFVcq`t)d==*auVYE1TEd!FIOHv=j=-F~hf#<PWK%U_O-}52ZrxwhAc@4DaK~
zm%#L=*G@OHvXG|=;DL6&wUXlE*PhnB_gInwzVn4VcDF91<~soCsi(rO7YD}xSV&|l
zWUvPE1W7mI^7h_$LRJ;C50%dhEw_U{k|FQ^kFht8$8zn$ziFQ5k>)|AG@>FxrI02=
zBB@A*(140W^E@dHB%~5T<`5d$g;J)F)K<o%fd<0+J!?Pj`_J$5`Mvwo^X%HW@B6y1
z>pa)F)^Qx`SV$8BcW>XkSx834A6Z41MkdFG84m6$nkNRx89%zHtaa79+wFq(L_J}j
zhjR|C6`ULUDG52;$rUI3vfH|8VE_K>XsxB?Gz}Tmi|aZAjhm1%&T(t(2W%a1!Y-=*
zG3BY~{Sdw#4O$9P-6;wYE_Pi_>$_NiJUa?Z6s>|;5%|obNA1x5I2a~Lt@V)XFz9pZ
z$iqFmQ7ew&dvubBNd*nY^(baR!8z^gkGvIZI}!2`J-i542D85!73Ha<sG#6QTg|nh
z6>s0F4N-DDENBjqxINE*Wkwb*>XP0_R&`Y?5FIYO=<U6t!u#Sy(cU8GQgM(HZ$8ha
zv!ai#sXO)ce1JzB!Mx|VBe6A3)oGqf01;g`x+%5@GS}JJ=_(gIc(9j8UQn~@pC6(R
zz`C}*M`dyR6#M}K8<YU*_Fum~)@c)8_z2p<qG_BqB4a&5YDR^B0uU<q*J4^g#!!XV
zwrv0OdL(e-n!3+R-{GgQA3Uoz;ewor=Atp4sSUH?vO94WxSpgNpcf~E?v*C8+o@?2
z{GN{sU!9<V;8A1J0O8d2h%@ytkC&Fq#TDB#Q4H1BQh)ZDe(BbNI!d|8#;J5ZjU^S$
zteglU_lPp@2&cAY64H*BX|g_9GN0Zg>(rQtf~u~vqGBOT&3%G(XKq1}ll#wi9Wlk}
zOQQvVzC&&gfW0#S&)G$95G)3onvCf6yS91BaMr`Qb%Csqpp~V$9j>p?0%hGVN|<#P
z7V}MW{`u$dPgIdKxT!0*U~zlA<z<A~T*UatkuYCt=<e16)xTtp>-SsBi*u<|i)O>e
z&8NP${;@vNNc2J+M$&+XLC&+eTi<tjIPB2={mYt70$&Jc!kB4ID5h)deu#@DOTPMK
zb|vY-@$a`z0f1ixG>rM-EpOe20z9zc=Yt0t>!F*DC%DrxsQbNS_O|j~B!Nmn6xhb;
zL6nTOMMHWV;!!(9=}OI`S`JLcBX4PLZf-AfCdGlC?jX_*PD|UWb`wyR5K9R{0bpVu
z?Uhv>QzuXJ-w6ymQmER~e%f1I@M+5pp-KAkR63EZ%^_S~01pTSsK9y3q<BApa7@FL
z?5bUHTs!QGee3Wq!nf{4z6!T~mV3*bb)YlK`MJ60Xhm%U-ch6JXA)-Emz4DpTfXzs
zrAr?a6?s$l@$zxd%xXi&ommv~X#*f79T1wk=<=BlqgvJoq$~0doY?01ma%=PvKG{Q
z<GktxmwheE2h8bO);6gV2SJy{ayt)$yjO2>n0!G8V0<sRv-Q`eWH~a4FC0YKpBy%9
zSO5p{H7p`bU-k&igW358)VqH^0y&A*42k)`F(ktcDGNEt#Fs1*hIk^B{QINSeO#x#
zBPOr8yT}z9T&Wkkax~2y0tQU`mDqwx<1l5ZV$pfvjS<jP^OaAfm-PlgeFV~am{h74
zhwFR}QpCE!+agSpG%;O!*;sLNdEjB9iDhiU{81x0IkZiMviuR)TyX1@vri7Au(L7E
zp^EdPf6V_RRKIA=8f@GD>+|AbAJHXW+d@hBh>D@aN}1L#yr<6!KWwUF4P`TKJkpX!
z@UV*!lpeP#NmQ^?d2AVd1pM${v7c82Z!#>t>p%!p-)NGY<p&)f>DJ_o^hzzlDcU$p
zMxfP{Y@X}B0RkTeujxw;ZTvheJY44crxG2LXdiFyK0FYMAx)h8>+kPvAG2j9z1s(~
zCQg{}BBT5S<zCHi4GmeZK0qZ7j=P4=*>PH|I<gGwvKk1#<6rtqNre>rbou(evyieC
zC7hMfwK?w|m6+%)d@JpSO;=Qu5!RKxiB-8<NaG%H<^`K8ZU3~mFtQPK+!*q_9IMo6
z1Q5&x=izU4^j$H{;bS>vj{s8j1ryjG7A9vGPOum)b_qOM&a=>)H%AElLs~LC&EftN
z4dttj0V+PC8RZB@m%XV8SlaE0*P_AUUvSr#Nh;l^3Zmd=>p;1Q`@al(S6v-q;MYtK
zS=M=PgYo@Dxh3pU1-*Ft_USX1gg(c3ekq}D{NN#dx(=9hPk6AhdxWu;OJ2lu1%<~f
zk=yt>c9W7{6p`2ia3^MwDiJ6&{)Zx^cDExXbWD-oD-2c{ghwh#oJ`GA04~xj9ow{p
zZl?!dZ>Tc?>%%=Xx#qp-D@tu`QI?A;dhnnPxM;zuakm5RhQ%cw70uTq<Bi{IkC9v|
z$hw8fHR<{6|8(deK#HdGG7eA^{o!Bj*4o%St?Y=LOZWR8N>!1>$4r_aA>6Y?{=X@M
zrap?9PQFS`>;F;7KNMc~GaNB4P@ekP3DI$JX>3ISL~7b~+N0(HK>T8Q<v*SNxv^>O
z!|$6WgHZ7N(s#H6#F7v~7}jJy>mY%gT5yC2xs3fpLKE}Pvqn$H;eC{xs`m>ECjJ67
zv{=Hz^JCPdDQ{E1s^f+bRJa<&Bst5Bq9)q7V!CxEkrvuW_Kv2J(I8@s|H|)nC^Ln2
zYg19ok=_Qyy}Cy<)K}>pjQNPYb0q&g>{4h^c>>I?UKyrTGooDB;h`6XeC0|6nvtD$
zT5-|!u@-%d{-*_4A^bK`4=UA8z2syz_unQ$U5s&&;L$`={zab-J%${H5L5J)>>`0T
zLCkg?UEK}2)ABk@kNR!*xv4I7M%1Q%7B98i`ejIu^bymV=C+xqS>CD|lQ(_Lo9V~s
z>Gux_(XMDblwKx!&&A3SiA{=|YuYp_U7?B-NDIJ(IbNnQiC)NRg@zNGw}x9B#q{q5
zZcyEd5`d02T-fx>gC?WXcC5^n<ff$q#|zU3>iKqVjrDt=1U*{A15h<;PxvU|RBPqn
zaEjK6b@;W&YZ;4&A~F}c)T}G2iaIK0W{&C9M%L@7@pf1VxXR13wzLtlSjr;D6d!m2
zl#mc9@Le^gY9_wQCp2@eHeT<}A=?g{DL9U0sazuZ7O$t7!4W$M>g{L^4fA|dq}Hn)
zY-cZNX`QALut}i|!7jk5<lwf66Mo%cmjIu8H~Qkl1GNEKaJ+Fp@a>MT?~mli1!Lm`
zTPhN8fkIz-3@?CMN)D06UW6SF5kv4G^xa2#&8*&dFeJ}L7%Y+@XUS$POAVhqA!4zm
zy|uiQ@Ryh~XLaaHebAuTH=}CCw$SVqvpA|T+b~pr@`ao<<9R@10jyC0CP1t_J)Lb4
zT_`kmz<HbNE0^y=Xb=AFukVGnT~Ng?pN99DHRNReEsZEM(d-C_8)h84ms6bRKmJHb
zT6rB?a8qf@P?Cw9#0w6GxvRfv3W#wzh>RO8v@j(814G)5>9p5l_2eL%Ri4=!en?qb
z+C=YU9l8|X1-$W9j0pZB%t=KQh_=kyOA8#1I^ILf054rmOd*cdbC}q)1&hMKHaU6*
zdGJ^Vos-=M96P|lCRiNWU$36DyEws*2H@KcY9*=*3T6RC+y(;%gBD#Q>#@>5KFJdC
zAc8j`y<^{`L;5>7etdQ2t2(iXn$UNJ#^~ix!SJX#;Ttw>IxdPcdi->sKB6Nd5JFN^
z<AU>A?5osYh-kRhA|gj+0$`>{xFvRZOpg{Y$JDhFEDEc$JLVn9P802D1SNs6iFyco
z4Q$~DM@8a)sZPmncfT;(y6XM={W$z!^m~s^NFI7+Y3f|`j9Y+3MM=Q@0wdvz3K!!!
zm?vW%1!mz7b?2hfR$<I)9vh!XGi7W3k~H9x4%W+35Bf|7o-e~mO9P}f>qQM_R=nsf
zX^njPLiDDfIvzOnYX{9Z@3?|8KKq&tXXG)0<%1V5B40fIEjnAw(|a7$`&o*cAxBl9
zo@V5`QL(pfeGD5j`GCeq^W}biey!?HE%O(W>X!|6>V2=SFVCL@eIb(Ryq<#D!Y+^o
z??s`TmQjuwMyxr3RIBKR{IqHJVJCdX42%KPKi*}`h$cN;1CFJd&-1+eI`_<o>k9KU
zG#m#c{GcW4_^*R?x9o5$mo@n)wlb_L4AG4eY<17NT%juWtzB<zeMGn<0H5Snz5n+4
zvw!b3l%zDX?^!nfxBuaO6lB8LOgy;xqxMi#Bs$f6&!6tVxi*i=nu}W;`-S=mGpL&>
zP*GVo`wqplkj&$vkB@l#LdZ$uTQ6*ZDL%%G9UJghBLSqNM$-NQ^oK~{PhL!IYb7rc
z6c=~?-97uC4wS>f2;2C`V*e$h;AVOgOZ116VbVD$Icd_QH#R+tfto<_tlb33!dFNm
zbFtp$+&p{Hz+ZDvVzE8;y9iA_V)rsuH3en6Wiy8f%e>E@H+#R1pl=A8z2nT8@iP0o
zg(-462!N1euoYu=hP-o<m&!89^W@CEN4;xZN;8b;H)5(sp0;I5&sFkmTXqggscMyx
z>SI~ghBF=I#{snw{ucGst*M~Xe~kC`ckb(bxoi72lET1?zA1BveBp96^2{QM+`Ol%
zaG_|GX&)B+S#GX=(Jy#4)M`_~l<Rlz7I4|e!m}anM#JgCBB@%Jdn`I8<`D?1Q~p^f
zwrw0@LWspl*yiRoo9&g^>NQ`w4{cH_X$gc{Vqpo!)v3Cl$UL`&L>F$-u1-psvxELy
zimhV|X+Qv1RWL}-L(v37zh#DoluW_f9Ij7UwEi>8KGy+tCYd&lA6L&5qC#AykRys}
zcwQ;32iPIw4vhG5x$8bud*1Wbz@Q@M!wO^;V3|T4hZFP(-A03dq5u}X{lY4C)VF`W
z_9n@cPgmCJbCA-)rsk5j_f<eIqgb2%<SwD#RH)N~rRp1mqo(jf0`g&^NnVm^b>NK~
z2R*;aO_`F;gS9TzkS0lI-+vMKJz$K|sk3xJq0-=#UZ9AjP|c)$qkDA{W|}Mzu_$#_
zlaQk$!Fv1Q!vV53ktG9F(yFrK_s?m<rN$uCFnRj~%RhX3ld`JcDUzAd^qyc0m}`8C
zR$lZlJPr!8h14-9p!vRgKms+*aI8p!hoEII(xt%$g<fU1w|F|Ip-r%|b>N*uS~X@&
zJKg|PFRmlQfrOtW)$wM;JVl8n<NG~${P@6@h=U3<_OxneDaPz<c=9c6(;Civ%@a}!
zqIWi=7qHlGmOl9s%O_C4F09!8LLr5|Rh-dJ$fRFjo6<eF&e~clYa`EoM{{HB8mGlN
zOs^pye4Xriq4C7I6+IDH49&2NwszfjFzy&B@PYg-NZ;ux6CPFq*wQWPPFDi4S}3@L
z1-{~@S012=xDAMgb(9i!Feq1T`DtElD)p?GTQdl|Tkxg=eZ6HIBpS$x%bNeD(o_8>
z>>3oB(&3`2o02H7Mu&0;gys~=omnJoet1`WDo2hT+a7G0M$c>gV`c86jiU|k=<g8&
z_s~Z7+;47>Tq7cXs~%ctRQN@eDXa=9Sb25;Uho1y)#8Im1HF^K<dkn-LU)M8jM#P-
zUuqi#lc)OYec!Miy|1r00tMMtNEO$cn_E9ykK9U}aDYy{)g3^uJXotfkiGk3W1Rw@
z?IEE5%L<5z+YK`z3{60eth3Ia)xIwJC3d&`+K5v0<>h42eoF;WR9PPXt|7**k9S+U
z!rcH}4j=ZnC_~8I8*$D1Xa_elk?yc!t)0FLYJhbkOgWJj1}GW{qi>>wb^6&6T0ZBG
z_XWcTl{MDf!t*^gql2`ipTZ+Zs%bmFzSo?w4sU_(#AvA_FQ+OeQ*cl>_~*^lr7@0N
z`2|J@ma)QaL-=#Bx}@%76)ZZN^(^A!Qs%9PZRgB%4|HD28A>lN?V*X!NQU2p4Ex%&
zm#GBAM4%{R{rF|l2($v-(Jy2#3c{Fo>Y@~sWy??Yo^D)1Z00z42<6S97YD&j2d-E^
zz_6SV8!s#xbE1gnydR}CrYX@cHl3pbu2Y0r>Ww#tZWX@&^eMcd2`P9PoqsPvR)a?N
z9yV-AmU=^zs@CQ0HTm5EIt)Ygg7t-~9uN4U^7in2|HIU*{%qx#gc>qtlH;~wObGcU
zf*oN>C4apG8m3z&@jcKm<c6l^M-0~a7bK})@S>|wiFXhQG;}8c*xlN^J=0r3XNp0`
zK2RI*U_uD;g)-N8xG~un(vAD?Hpj@SVlB~AwO2Z2;mv(<-Dd0^0ij?r=K|sFRZ)@X
zPXRN$*8GWpA}qJ)bjmWGn1~P*V1+d$x@nPR!!`kZi*`P-aw&54OEmiM*dEh~Sz4Gl
z0vsJvD0%6PkbEMO;>E`^lMafclYPF<#-<DSDTgbs)~|_79MS}=D11!;Y($MKb><1K
zPINqzu^|8<&D(e0j=D}-px-Jk^*bDFomeeKt5z+K&GJb66@%oD4JRk_Vv&{YK)P-)
z;$bUzH!j^)It?Wd5Bi#$88?*=??}PiaoNlW*xaJTL_}*KuIf;RC;q{WKv;(hf|!1m
zU0<J!6S8nrdcoT=o<2Pdv^;vo*gl<s3ir}kup;#fqn=R7gLs|jb4kr~s&jMXLJb`0
zpnC<@F<|oQSEF8@%#1si@nOf$&m&PS6k5gcxeq9FMWcwI1{!^{EcJG`jk1qmG70kz
z$O#r$u;c<s?qXWn<%R`{$_(r_fF0s8cdk#?K^T3s5DS85OFG#N7&`O>mY^I=HKil!
zdUJB?uN~4Ex<qhih#r-mkM7il_%3o!K@|h_-`(9TMMfFprib`Wm=>|T5Y?!MHbV%Z
zO&B<6Obo8qL}@lEW0Fo#5eIMqu<H8BnP5F-q#7~Pp3rv!0+tajL!ERjul&h}twOj!
z2O+8(|K67x?5^-l#kr<91!jh7TiyrkY%|?)X+NtWWm3?^Xp$6#N90CG8n>=`^06W<
zZQ!LRbsWP4EQy~}f@d;bMYHxi<0a@)q(alIRThjo93l{gp1Q;s{!}}YAc&ZP4=+mL
z`)f|zNe|mJ46Q}hl@$_g!xDtDQVUqNHZQ~g%L^WU#|87}uLZ9JI@IJY1jgN_FNzy|
z^G7lyt6*B++I?2{yd)T462;rDKkPA-Sl-*4&aRFi0-}}OOM5~e@)xoAg%zm~e1(KK
zdUmGV6{exWutckq=ZMi|QkDkRG&Sa$v}l@Hjo#F^Pappk!QLOTb34HE;C2)U_HNwm
z?AJ*|x*)}1U?XvOPc}<A4r#{I@-BZ*D|<SS`yuM!2k@p8uk*=z<ZLG~GD&&+Cn3aS
zNnH1;m!7?Kj7aQx&ee0!dx0DZy(UY3AppM6PuY}_vbhT}1*)5v6UzxPk;vgu&ni^k
zad;JX?e$?_U(;j%bYWSfxvK2=ETMh<YuMN}rQ^WB1}~{`m-Ti6Oc2$OT;&MVtz_eO
z+*?&i{oc`-ME#^U*s5qz8hPdn#Wq>$59_?}O^GymeKYg+Wcl-7iqQfKNzTZ*g}um5
zp*|mTmxF(}p>V`^1VO!Ur?Rs<fi}Q<aWJwgF82g^!c{hau2LIlFEKYoxMPY4Gr=CV
z8Vk&I_>JhdW_v|evwSF@XAahagcY3ZU6+QViAylrL$>oPU-6UMJJa5mB!mp}U^7+I
z)#N0_2S&wn9$7|Dc+g<?Vvyu>%V^QjUr(;4r7VnF1jwxU>`l`4o*XF$|4u)!dapdY
zBZudSqgV|*y_%&CpmBV*73ZeF=O@X@MbghEdWsN@>L-1us1RhFkWqsnA}3fN?Ev{r
z6iEYn8$Tfy?m(y<fnCLtgxDW_%nh~&OPW08(b_&1yD^>LH)|Ma$uQMji)MF_d<!IV
zh=N>Ti56@7O~_~vrU9ff-NybITSDooFR`Gx>GX&{tmN^y=`3s2IrwjB`ZF|QzdmqU
zg1*mjfAU9|CU0E}PpjfMIWh8vY@#p0RH8b+5BAyuN%3l4Ib6uWBO|T9xXKG}k%z%R
z=_fIa^9R+?y|PrvH+F-YVmqHK%S<oGMUpVFCP=x&_&<qOFUf9Qx;&r;{vs@fM~`j`
zmMuz$;Do)eH=-N#Jvz*x>*&#=X?nl+`t{}WlKL1{l0c;Nzl0cCxR?lK>h6ty{GR2t
z^!-l`$(ebFOX@(Adm5`dxKzl7ooa;VuiiH&pb*{JZEkjz32hBW(?qA25PL4LXyYWS
zi`Mr6z3<v#Po%ZW`n1~v8T=MWr+>a0ww#o%>0(?JS|QC+D$Y@3#~v7^B}L<GM?ty)
zYe!G_t4^J`vw8HwlT)M4)y$*Kk02-FD_lizDKrGa-L?pzMT%je`afk!^v)eSglf=k
zz!|N*G^)W2ijF=W)4Z^;2#LYw%X%Knl%Fxf{O4iP@Zc>GUAc0ci6~hy#c8{Y-W+<v
zgn=pbs;KD|mQcM2K_Yas7*j*X<{#nsDUO#JpMdTmvi5+}h%=KQkz3<?D=h2<3{jZk
z?P)%w94cXoQ)bN?gaTKCQck1_)VgKVW9PCaVpj)7u;JT3Ukp<Fb(x(aI)&f9eXHr`
zH^i)=cD@_kpl``nbWct}Qs!9o@_<wj)s@h@#O}^_ssn8jHcoI_{zbxV8A0D6JoU<k
zaB9oG+k5WxxTe#G4t1=3b#R*^if*W5VMaEfwzn>dg^G$p6dnS>ZnLanG)~j==qW>A
zB|$H3q|svK=O6|$6CDKT0`gMVcX+o;!G@ufp~82TAJCHWx^RI;01P`?gtagU4&(U9
z$AZVtbE__<0aG+1uptp$_~#$*Y}a6kwtBg#X>)#TWxW`0vdPYlI!Ik5o$D^u(wz-1
z%vu))vFAodv+B?Yi2i?mOc<Zje+D{5Px*|fnTtlDEPrzDq414D>@M;NtP&Rt1)e&=
zk`c~FWNjN3eH9WU&ZCR?dY@1`vuKP^2GMqbGwwP|%MJiuP-@)dsnaL8q979lZ$?I_
z*1tE_>8|jftQ`SEFE?sZjPKdgr#(Pa5TexV%a9)(5O$+#?gC4ZuQ7h}CvBfrrE%zj
zIq+c~_N$Fh!$J4ii1w6kJk5B|dp578;MpYHH-JYXR;{qVX5pnc?l?q3%CB<i<VNWW
z8IQ&25F~;&zY#W}@Lh4{89N0pJJCU5r8t!#P-2=360jGk?(Xyq0c0(^bVc|}@#t7Z
zd@Z4;gh$T_CkOqEy|y%CS^~~UbJiW-j^58GW!G$2sd}k}La2+%Piuqg%*@25Bq0iB
z3B$}D?YHZ~qYd-s?EXDSxI_ki|4gcB?Y7wD;CuGJur2(d`BP@>*s9FTA)w8SjJ&q;
zwi<d=A%J1yg!qwe^&~bCdUA)%KiL3_NwN5j5{0-HFzOU#rQ<VofeXWMtQ9q<I=Jj(
zKzcz&JN0P9SDE+HmaY(|DGh3z(2E;Vu`QUh>*unXr0yibWJeAI3WMaV?!(P~DTXf-
zv!FV1(&m0k-TAEpGMa}3{He@Va;(nnjB-|VxL`wvwu-YzNX}7?370lHj^z(cl@Mbd
zu>BKj(#gy;su`LQ9I$b1#bk06{~m~%x=q)UIjcVEOr_(^xPBa{u;&iPey9;U;9tMr
zZHS(RoPY4QQR=<w<n```gd9E7tQGR{_fWVRF&kF26jMvJxoJC4T3T96HDxf$wKgtP
zwZ|ZG-qL#0Pm+8^bU{KELvN?VSB$x?Uek9MXT*;B%HA-HMOPCT4j|knK*An^L)<LY
zTERe3Va67tqW17>w?y88o|aQ!J1=c)j?Et>bbE&fnod8_untuSbBN4D(xj_$00Ke4
z%_i}h9C;o}6}#Q~Tp6<o@F=*H?KTZ}a&7O(X{RkV7#n-%w=T^uR%YXfT+%Y4BaKDC
z*QjEIGO&94iuEZiJ0_y-r|SJ&`C@dn^68gJ|IG*bjCn3zyPn1ww$kQBZ_CT`EjpN*
zkBEL*bD_hF^CitoAEc#qfCiU;5X*jN`(!2dsOnnt@_JVkS;FA{_1&FkJtf`0m3J5Q
z00r5@XU|S55e=o|9ZGMMh(Z)2gXx#LVFr@5;!M=-+d?=p)jX}6Z*%y&-&B~QeHUrm
z@sJ5xMR$?PuSLVz@7L>aQ)SUvEJ~KbyQpI*$VJyIH64}a?$9q{5Rz=p?UB@csLq66
z77i5EyOt7%g<?6qj7Nd>=qRu&^b^Ky2k?V{i(4=e_LdI^Qo#dE9Fw-mJr6yU^m~3N
zICz4W?qnQ)IRH9bD(ppKg6hsitF)T2hUV>D&X(OmeygvW*SMoNvE)CrJA268nH>UE
zt8OkEFZyrkxdckl7<EPjIDiZ>fQ$^4ih?{DFb=*=Kyt`U`N{EKPX`R#zMX38zmZP>
zAfkd<?JqREPoEx@nRtO`h}uLU(ZhA+R0V|tSQW89Zf4ylW#S!8^t;|3$y62wTW+4?
z-g3DpMl`KKVUBE@(+m1XsI=*g>B^F}dGQogGJy#$t+UV%%RM|aCV)~<WW5LF&DhBF
zfb1b7@ID`g`(oI(87qgg1ci+{;@*gXgEv}QrWtKYrEG#U-@Vad4Kmp!eyZKg%u3^*
z_S@wLqrYKSO=Faf`#6#isy-xOo$l`VviJLH`2MJ#Cq|-D0y(QYcX(8%HVaP<xlb#w
z_X5lFUS3#U>(t0>q0JPg)Bnx28X|^RT*jyGiJ%64RV^hcDTy<C8V`BA562i}(=q(*
zgbavCCai!q&l39P@%?ZpAi57mPd%u|Xg2+>FOLTb$C2@u%=&bnQ6Twe4g}+5-qhu7
zz6+ct@-Q&>I<LjIPEl~ZJv4l`|Fy@NnVDz;Cvv2>=e)|DI7`D!m|qbF1@y`}Y3=q9
z)gnp5)rLqEr&EjGfiocBJdkf3Qr8Cin!KWowY9aR&_`uv9MK}BTPx}Xy5c`v95Z8x
zvSXzO3dK=#<_saf+w^_N`+xuaGt>xVHDE*2*=^Zim|uiV4MM-_K(13Xt+AF;S{iLB
zx0lzB>(xncqYPK-1etci{vHMAqTYM6sshRLj@KEuw~WDD?i58p83TX<pQK#JB@07c
zQYBCqv>a2XPv2`Cc4O};WY7r02klV>z<fH{dRAO@Mq0tb)vI539D3MXEo1HGsm*oj
zKADJ`gPznUPyGDrx6B43X%Vl9O>z^DJbbszwgYkuqkU0Ex3=3HVyBshnXg~};Pl>I
z1G5d?e^RFIX!$)m>(@j|Qh`QNlExgLzT(-msZ&qqTM9f{TiYeV;s)E2u0e(sYBU>y
zo`c>%FI-)<Y9@do2dVzh8j8c5@*OAw#FL>B<M6R#foDoM&(I{jF!tr=grNx$UAKc*
z#s@L0Mo93pT(>6NT|WIRV!nS)kA#&wWB6h0^wG!0%<$44tNX#9W<*g2{)*+O1JX=1
zOlWEpjcU|eR;x}IiBO7sPYnGuj!y6vqpuo%tCh@R{_7sX#)4UrPBmW}Tg(5jQx8a-
zMgJ(+3GJAGNh?;I)s3C9Y^EX>=O!B*&RlxX(6qwM_3KD~(iy>Vi5eIB2Ibq|{>|7*
z7fv)`B~L2PK%UJlzjr>p!Q&f&Bz9DNr@d-aMrc^AD>^bI5w;hE^^ur>M6k39^Lz6-
z1#39`AMQhw;@%bBFw0JH3Y9KwVmF_e>mIcGbN}N;)F^al*Ul<-CorjT`byMk4a3Y>
z9l-i?rBqH1F=P8Hr4J-DlA>aA5WK0C9E8>le*$n!^APo3Vn86z7OTI`A<qRr$g;mU
zdz)ooiwY-|fTg~f0|+4ZV~pZy&C-2XY31j~y)gIIfEb(Sc_GV^K)pOJ;@uTppD2u4
z|4@i`r`+woPjA(#Bb1q=n1jOFkD1hEdWd1*i)ANq)sm#3hnhHgd(0CZyI47sE^Kmk
z#7J^@JOCi5X!X)NyWi`~ncl$5z|p#L=W$UGx<|K&CdL100U&B(X6=iZEd<ich0rcZ
zg$BTq!}vqB_c3<#+hbEw?rrpQOMA6hzI^^Cpc*k2t);oi)~n?GmmOjtH=PTD-5os#
zYGVuIwTZqD;HycVt|=aY1Ep>~;N{gH_2<>t0aq1S3!*@|h+GU9vO#9lf>FK83K5!q
ze`dQ{bHRct#{sQ5kQT->r;y-OzPz3>H>#6eLAdB!I!==La(ZZe`&Fs$Oh<E9IE(dx
z--&s~HMffj3x(EL_v!8Yg(O#^&zol6YHteHm}w9G$dZk@xKT1P2dgvQcL+&oXQ)TO
zWW>8zt=SYqT$@eS2<oFD{db`u6)g%@Dux4@W}b~vpzKYJfMsIDMCGf<GS^=xyOZxn
z?rwGw(B1hHPd{uGqY0?DMib&m><;p!!KC<w84i-f{u4|}w`lZgx1i54JOr6;L&#+@
zikKh<Eu1|&FFKlBnl}jQy-BTPhnRr*B$9Sc!L8#9w4-9xOf}O-cjC6pq*ckX;3^sx
zS@0dTyu0C`rR3J|BVzcLKtPyPsBba*NLu*_ydXpm4*kqIN*xDFOOKY9Kj*nBd~WKQ
zZFr;i76uxi6HoWlQfY}-(&MkxR*3*4=4=Elm_Oa``9hfeFSH|7H^-dGBbfw9norG1
zXXz53xzwjVy32%AgZfnf1Z<MQKXMc*$aTbJWF=k@3e+F-h@Fg5|0gARVvM6ZllQRD
z-A_DuSzZiWwft+i_I1xQ?<fi01h4OODt#Y}V@VLSDk151bhK;Gqu{t(E4H@k>?M4G
zj8i2f7+No8_}l4JMyQ^f|E{Tx)~gZ{u{ghRI!W>iG<vs_yuP9{+FedarulXS+7%gt
zl^ca<ClFOp%B-`u??L$^F5tZfo(y1T_)*7nNDR)ip@tLnuTWK571-z8=nmESV8j}W
z>6e}*oIBTB3}K>`GG>@L2+oG*Z88@>r;1=mhF+^ohPwQMwLlrzA^%I$f6QB5A_3kH
zvf0u%ZA*d0%Bb)mi>>mnXh<8!+6a{90h(RWqv+CYLUL18kl)MchR~~U6%(fGAr!aS
zTwOG<SGyNi7uT`v=O6}^amb*f6hCal5AsbebzK2GD+?lqV_|)SbSVI+k5!mP7V)M!
zQ4q=Kg+$Me?8I3h7~p07E=@T5L?GEZZMee^g~<|=m<cWDnFIiZf>z{h01LrxGKF<4
zVOE9MRp=8GWLIi8HdfE`$&yyxH4B@GaHqw@ZB3Z~qe(Y5t$CQ4Uw%9SrlwjY1tlA?
z@rY*n1$3O?wOWM}VXvv0%ZQpnYSQS80vPc56q)7nAM8jreOD4XNt_up{Yr9k`)LH6
zeyTK?G?H14`mzRRzm*mL)Cc?%Vw;S&<y#Z1cs#7T*k^?n5}pBz5Bv?H?gQW01pV>A
zLQ=x{DyE5q=DEAp$6vZMSak2FxVfVH=q>t6+!is7-Lk;l8wWZO2#fI=x+bh}8R;{e
z^7!+ban+DRVlLtJA))QmM@*bp<7**DuNjVyS2;Q<@B=PqB?_t$0Ve&@lAI_f`gErQ
zibAIY9WX#7)`Pvt-NC6eAsL9I^O|IYN(99?4|)so8v!q#AFQ)C<AWe{&dK}3P||@?
za!avymjmdrsR-Naw!sNHZo#T^<a{JC$2TbqAMWSAI4*Vq3R<hN$AMLRyi9*ghHLB%
zJuDh2#B>n;-5|5C^uVJ@5+Rc}x^>j2KJytGyHrWJy5L6p>W|v&a-TdiG-RsH^?E|A
z<k>;AnnO6tdFCKSNph}zKy5>YIykKG?1}pnMLe+u5Tdk;zZf5kwD}O2nR#K<hJRoW
zpw4L5$eFZo1cO_|kN|voj%I5wS+cuAy?vsXu7Os2>iZ`oAEc>58-q?j;8U#QbJrp`
zxE5@Jh@y>#U$RDrGF`^Db^1r?S;K~hTB4PcNda`<tj$d&$q=cn)z(`Qn=vgDJsf6k
z4SC8DPikswvwj{#%PP*}=+2f=szpH2!HS}13;z}5krQ5Sq2Zni#d>RxM&9J+?3>e^
z+O5#`eY{@C9AkFwzcu_E9u)AuE1ax$%o#=(PzRQ;=w~0Y1p^x++D>T7GP#xsRE=ZD
zc)9{T8_!cM8rOPYC?ixZXfuSuKdZ+_QNExRLW+G-(1oUnj(gNbWGAk%w)VAZ+fsM`
ztx;oXJNyM+m2X$K^xUJ|pzbN3<#lWH2FazDXYB!uJ=<79oM2_08{?<g=e_C1zL>J2
z(dkXJ&2`{75m>y)j)4@ae8Y;L^ckOsT`9#&>G&;{*4CX-E)J5G*4ws_g+vuqzttkX
zpXjaq@#E04@z)wTn}iS~aR5*;X(S~p3YC6GcA`F7owDoF`FC0mh@PAg4~ZthO50h8
zR;4K_VOL6`H$E4ZAlfHA$JFsi#n4yQvzVPJAEX4V2jw7|aMRqh2A3a%*cS93DiSHL
zxv}*2a9#(yuQW6i;3}Sdbb=kP3&+k06&6T?V{MVw78O`RxE(Ry6Zkd8_|lj{<sF~b
zGO;@Q{c5N<Vf+Grx)vKr`Pr|_@1eniK?Cd63><80XlN*u8Z^AEmr5|Vw_h^)4h+p8
z$#d6KMiw+;WI@_Pik0<r$mq@uKY#AfOkRo|G7$R1=g*hc)ie)9$a4?OIubt6O5S_>
zbW6ZpCb!c~w7}9cRy)r|fqD<aEvxIpJQAJ8Sv%~U^Jfq^)G3{lkd|$c|6_C(n6{TK
zo#<TKg(#bBGg-FZQ5m~6%rT?V35xz6KH292xmx=!1wHrXTFE1#plKbug|mhl;>MHf
zi(mo2?{qQ(0?1~l8O3BJZXpadP)pv9l{6SuNB+S_Tbuw4wYd-Sxv2eYn#?FQJD^a0
zdF&P49oO66#_|tT(H}?<mR@q%%(TV2y4KZ-(66-0!tZ7kO~L(6wD$`(S_qnZ2<4N%
z`o8QNy3)d5u6&bdL=>vxq%VdOL#cIb&U{fNTVK0woh1I3%vE02+W*LW;g`l{0JMSA
z0Z;23u&fgaYsh&G9U)*__weA0F2_UjY{bYav^{`&rqNoRg--%^Ksa_AJPhI6@bv|i
z0O=Mc@iWPTLJ4N=DBrjY{g`O!g`RBBp-%)P?+ze3alYIG8x)GHuuEf#KvdBfSa4Da
zCw$TJnRk#V$|5<Kznxhum2Z-jxk%2zKL-?|0{yTRAQFx+_CN{Zm?2P`^jnbiZgVy`
zg5Z;6_6+LZKX25iQzRy&pMtgTzomedcQNfx>&aR7E)CJi)q%V1(%jpyuJN86MBeZ}
z-U@7wDwNG|5F;)!=)upcDk_4uw{GeNekdkVEbtIOJf@-tmyO?)|7r%RYo3M(rm{xD
zhB{41Gp^oxNDt~djIe%^ItY4zT!aTSZ%m)%b(w!%0TC-8ixP?y5@$a4kIc4&Q7nRJ
zdGe$)=fffBWLz*hVZj6jW9zj(yeEl^7?MC@ib{=C-tNrk4Unaxl}xA%kj=0;8ywEY
zjoF|iy-&4pvqeAIU^V|K&<kJa9bN|kqO{K(y6p%7f<op<qBHm=(~)~3eWqNCD%_)#
zDCSP1uKjO<6mf@VBAl$K+unSPeEe>+C?$W-{r+i=Qg!2-)}OYP({*H?j2YQ^Rc9sv
zpT0f5Nk%8*MjOL+VNo~7SIVuavvVz8Q`g)OR}oWS?rJ|UqDU|H*wL`!urAs!Tqpi}
za8lIhUX0xubNaN~^UU8ZZ+E3;S?p=2A=B-7`s{80WR98P*X`G{U8$u770oSof4p&3
z+WGa`@7<{YJ$5bK(;hvtdVI9M)O3qQ_cosXvU6bHzJYMcRy4?Y&)#+eqFhO3%UDH4
zLrB`~{KS^MG;wP4E)G5(6rgKbKlf}MkyH}Sb2Fw#@ob5G0VQfI2vi9kQY-bR?P4~1
zC6h>B!sYZEI@ImOlgT8(E}!prR3Ni(1IZ~$+8%*4n6%@rM`w)z#}>fuR2RobAf>tv
zP&<CyxP?%V)upMeV_Ox8#0r$VZoFQyE{K*&*9e=u4*qV2p|M$UF0*Z;V$WZ`e2a6j
z2C3Q1QU~$IE6v;tNFG1wXMDjY+jp+}8q7mpfYrM;KX3hgD&e`^<@sykHeEv0ImUEW
znvF`r&A`b6FH9Ueb^-p8fgE@ncHf|t#_Hk0?l$)J`OL#{+puw?*_P`*j$dL2hZ%-)
zRk6E%eLdD;BV96dnSdCcXCrS^{RAMSZ)CKiuvT`;WBYhp7F%L%TyZ`7m5fiDCK*)U
z%jeD82U_#K!6i#;HStQ8jt#y+EZn>s9)C(SWB9Z#-Ir4(Z}Kd{IQHr5IvqOFNxB<?
z<~Bc>b0y^F&1~jm-M}!4Aq#F&s=MxdbWXy>L5Le-SB=w4>@`cOWUg_|_>ZR!AI^rm
zb*k{)zNS9*$noRyCh5`<>dTj(q5cVnn|fO793Q+oD<dO^ed%@n{0*4;RbhtF3wEtr
z7tr#<y>(@3bFD)I^o#f0zRa?2Iq{l#GL7`-I?<Wy(W57LTkdoBmTkY9cf%e=WZ0QJ
zeDJ{WVfUGpSfnY;JM*0qQrAz_ltFoGee#naqwF4(kW+>P1l)hUQJXJDnbNjM0-seg
z&e0s?jmiq^Vgdrx$$1+&^&0SxRD2r=C+U=<JJT9yIoa8Ipt**{c6^Sq@tSC3+&+Hw
zw_MBU*reT7^a=a?=yvE~08!bRXy$^&a)i}trl;8Dt0)et+llvkHpV_mySfGC6n<h3
zU`~*Mk<s}0V!A=^c=-LQ4vm<c=$4GlfD6+AlwYS{tf?CP)Cpp*{JRqt4b2}n&IN_C
z-QIMRn@gT3&$@1=O~%Xd-S4MB((k|ha;yo9522R_H`?|0_MhaQ${F2OgOB*Af2U;4
zL`U<I#pX#3JLa$r52LKKT_S_<A_ptvdh&G}0EP>5#%)+U08^T4!opbv8%<Wl?Vav8
zwc00q7$CPQYW|8Qzd}uemdD2X+Dc^mhL6v@%-9PAr9~flJ(?9jCvD%&M^2nD;1z?7
zurBwJYhFEh<-M_^Mrqi`Clwd>Ni4B<t8*>Q!N!JIJ8*M$6NdRU`0pgL+q2I8^i{d^
zvBAX&Xdl&06`YvTj?eyLk)e%KP*m)oAB&lRHMxOUj%WcC7*lDl@?yBObSSo+HBX;C
zGh_)-U}yQREwk9Ve*M22V%HqSevIG{!pgHDgIC)8)&p`!^VF~~Y5P0xu6C5B($~}0
zKANDoH*%iEZ(YaqXD_hlt1q&RXdj~{B#=ol=L7EB#M-b9WT#9iVAXDjG^#Hf-g?KR
zpt#0<=mCQg(<-OgmzM;g00<^~(e+g_;2T60ZaRAW___457kuHOm{q2CHm{3smk#Nl
zhnJptGQ5k@w({dC4Pg$}U;(BrKI9zKQ*pDq%1@Y}1xjqg{FRNbYMsMOlME>U3px(p
z<xdxKk8!q`1*(BrCVokiW>@Tlvkkp^Rf9njk|PhZiwo{)FIr?3Z&&en(EV=+1^Y`&
zyRJ&Tz@=<YZ88NE<l*bU=2S)Q^FxQ$9j^yaoV!<TmZg@KUVp9{loKqhQm9P4dBm)j
z;&(My*sAzBE(U|>F-x8L;~F@ZA(ev{IeQq&R_7`wTlNRFIZrnTlC_EwS$5JSeTGZ1
z_w8GJ8UI@&1b$-2HNTRB_0unHdh?_H;qaDaKvUK9joqsQ5)1`@o~W#BLXtjow;+(5
ze$`Esk<p|zCL1CMR}00Vvo_!B)*Gt4<dduE)~!(#hk<l*1;@o165nZsE1AB<no&S5
zgtu7S;2)=_PgGH9zFKUep|RIKu}X$^wU@U(Dp;z=zdm`Q@D-pJx2*DI90orDbu3dQ
ztSSf3VkVd-2l1y^)Aka-O<n6o(qwL#k)Gk@?HvNET}hNuY|HqK>B&5aE*K;v|9BIM
z&{CE~kGRKD;c(<ZLjwaZv5N8aUl@`6-CIluO7GFNE57yT0Uv_lC=2hF)HKBXniLih
z;ShiH(^goZsv5J_M6nh7_v?2to~h?HEL1wA&Iz#+NI)pCDR#E!Gd?8V`q9de@E#K_
z%KrT~hJ^duwa<?sjBQD8%8$5JJy5+zVxFXuDgS*VH`fa*J%`=jd}QV1^4R~hvv8K&
zpjYknW&M=~2G?mv0V&hw5-wcM#;*{FCTMw}W)&_d1vI_1`EL5!K3=SJ9(0HOlsg74
zbaFNjMr3EsT*-YDTMQGpBDX782IyX%6<N}R=1ZRz?#(`VHU{?g;WW~?JQ>ikBuI~G
z72+UH@I~lU$&Qy&a=a;o-yBYmk56xnPzG>qVZhx}HXtMEwkDO(40svVDS()<6u!1+
zr}oD~@)T_ItxyD%7rB)FdbYj3uC=9(#Oz{W+74=;#(4LpnZBxS#$bgj@jMFp@d1IB
zQ~J>DRs=rDyr*(#Zmg+;fR+q=&ZoaTaAYS(?77G5r%iMu3Ww5wDmKi^1I>QSh^s`3
zq7eTD4Mf$<)L$oP%|&$@$~I!y3UBmeedI-TEjzm~0+AE;Phw?=JJeQYa3Tkjx4c$Q
zmHN|^v~Pk-lavQ%=rASbADh^xRgP=&$}ZlxAw#vOgMO-~(SVU8LS&xh+oaESGoBt|
zS!~iZPI>jMiE67?U$#%GRgCWb=`l?{W*lzz->$%&s%BlBHjn0nF0WEJ?(3Ni>A17W
zr83}<BYnXcRgN~KmyMscrxt&xMO>ppH)k$smCdjq&f#j%+g#vlYUvp=9ne4~>W(k&
zZJw2R-Z&!9cK>4$x5xo3-?Z?9mGn2S#4c21P@FZ<#it$LZ7bL@cM6FG8tDe6|E?Oo
z>$Cbj+ww0v95dwND_I|kt-FsunG`lS>~4g`2|Ec1rJAG9j72`tNY_;Z?-&{!tTpV^
zLa;W=sfCSZWwvE)^X|}0<x!)#);dPF(wfm#e;%e=Q*mhzt9={mOXjj8-s~vro^^S)
z9-Ny#k(|tAw(_%@hLCI5=0gVMv01WWyHaar9`&Py?-BZb(tu;T8$!({%$~i9gsbMI
zx0~;V=tB9m_dI$OyRBx{>vfb@8!RknkzY)@#r0L`-$Oukj>L}&(0=%ZF-X-k{wy^v
zJ(P2oEJJF}j?WtoonlEFCpm|~Wsa}XNJ`7FUieB6KI)vWZx|xq_U&eGf4yblsaq>w
z4TfT_{Z#2rz93nAEWIq@*1bp-`*@Sm-{mtE=v-k|gbm6($*|Q)jYi^8(L$be>DtPu
z<{m`<rE|XQPid^4m{9WR*p=J2jl(P#?Nk_Cm$wo~U}5XeQ>`~SDRd0?JC7rG9^PEJ
zu<^qb_h7ETDM8L*rJs2GENl5<b12$1Ut^<<CM7a=pn){2=0fW44Qk4MwBu9(=H`Q8
zDWK#NDHY8cM^BtkD3};&92OE0QNSTVxqA&2?NAjLePO`6&%SKi%@6940X#}}XX?k>
zGbc@2Oy~>Z)@R1bnxsxC+N3BfJsv8$u6*#bn-t_NJ4Lx+u*KrHXC}L=1bsIOGh{@H
zIuARlxYoC2*HoFErgE(De7&VA*DtJb(c)YXSyqzc^nS?~BBvCsSUN>=PiScJNr_qm
zO((C1@3ILc3e32%dv$x{TF(#Sbr{N7amxM)F!fcV$=5KZj#hFYCuqbxp;-U0lJ637
z?wb1@hv&NeDA|}YuxXd70&PLROu4?cA*aI6@y6&|VsP_lZY)UcG3$%{t~a%X(F-Xz
zydeN(mRz=)tnk)3DN85g?ERs$-COR`W?5A3Jo;iH*`2b&o~b1K-1S_YDajUB?#{(s
zfXZ%ouMY9)RynqpFFF+VD7|@QT7Hb}rcLq>4<n57RdF>WrZAWb|0cS;t}w#OSLbBB
z!m-DrSf)r-xnx@u;>gsOAM)}*6qHzYQaJ{_r=r0c*iT70t77w5<2;J*6N6{mC~2yx
zuFlNAd=Shg>(Y#cGfN!xf_9HHH=R2=Q$GLp*~yE$44q+locc4E%BT3XGl9H5qO~!C
zWXAG?kt@Lt<Az%uhwV@!4^WI>M;6N>2#JZ&pZn{{(l*8k<rNh*sVz0Bs1R?#iW`xo
zT$m$6-1Y`E_>@>Pc~w}x^V})Nodf@@b65uF;p3E)j47WC5kvp_vC7yugg(3(nTr=K
z61Q6(haC@zMig<41@6r&!^~1~v(H+AJuP`0ZZ6_%;j_p$tjXpZF$`>D+}UK(0A$Km
z;OJtZWEnnI366|h4&_~x>YgepA#BdBTnWdfVp|*M?M_x-U6R4KUP9D0uo!9xN(yx}
z#j~Y1-z#-+mHTJul{vxmCNG8|O^~g~!*5D?*K>vecz@!=Uy@vL{l*O&RAXSzzQ1M~
za6KF#R@5uC_GYLSs_MTZb-pfco2W|eeOK7{Y=}7WD6g!OiVMhh8nWm0Y`@8j$`oe^
z&y?@65mltEiIrbGr%ijC()3Ah$&zPd%{uOjw6v@#4Rl&3bdB=(+5$yA*;TiRqLIT~
zBwi72wYK6dFsjZ}{Hoa=d7^6r(1OUuB+nkbWf;2Z%&(HV-7C4{ighzo<4yQq=NXa`
zD({Xy$m9;xh|%pp%v2Rc`#r-Vqz8`vSrf*cCZyeLY;;li{>of(&yD95Vm*?~G;ki!
z@U#RLxcphbi|U6)Gs6-Sjahslv^&-xdB<AAlvx{kGP%(yv0vHS)R|k9e2pX5AD}@$
z0{L!X?e`^6m1n)ZU0x`vBp=DL%o%-Z&Q%Wei_xDPs6Mj%=B5_Jx3?~wzQU8aFK^`J
zEXC-Pp;y{f$+80L>+9`zPO8#aq)H(h9cw+F_Gk5CdpMB*&``LXB~Y{ilt5F1+x~Sv
z(qQGv(5R>tJe6|06DLlTA5p`kBB{iFN#Zow_i$R7M<4J~GUc1Jqm?&=-+d01P-Rlu
zc>eC1h%59rc&J6Eq=+(qId?%AYfwBssdDu88<=*@o9PooZ&sAHxC_C-RgOBZne08W
z@izrh9{gf><)cO~syVCZ7L$#uPdqQl7@VHRqLgQv!Ujvrknr#&<gof1+7z75m!w@I
z5Tq=SK_cCy<SV0wZ%0#*$moYyjyI23SGKfBiHYrDIj1<t071f}9quk?hvs7s+vfYa
z<F~_NML1^(TxeE~8AlkRG(@rED{`x9N-)XiC9huHWJy%>GcF9EhOIt#=~6J``9yWV
z#YjzE>j9A3pZpj`&{?@1nO_w>A$C{Xv4ay)N2%Gp%uz{!XF%4soH37Au5}IAgCdNx
zm;eD@%wFwM`Pp1QL(0oM&E-7*MTsDZ3u-8_j$0Lc=Z+DHXGwm3fqi@i{*ByxDP%YH
z$qj~CjG?|fDfbNN27Z7#$dM8__)j>GGd8WRIMiMG2D53L>$WTTmcL({S~`VOkA_5R
z8ccj!KI$x9tiNQ*NhtA2sihHQxOh`6BbBE<6F|zRJ7qoGaKyLEKP7LR6PI4=Yzspp
z@}?MzC!UJw=42#^xSTt+bSk&a{GS%!_s-_(m8o4il^r{F>`&AH_ArjNJV9y~&XV7@
z-b6=pY_;k3k_{btq}8mCdG0k?-}H@hv7K?suAPU{hooJr6i<mx`ty$Kl2k;!5l7tm
zP+ndQ0+2t&{a1#^)2CSX*uB299C^Uq+zfjTM3FCxl?}xrntOa(3O|W$-}?J`Yc3tj
zHp4Cy)jErCMApKoa7Od^pmiN$(jrYVj`bcReZ1N*P$MxV<xMwdy5G#3tCqUIq&#i&
z=0_7v%A$<M-st>J6IZzn-@bi|-BPBRbwwrVVQrcApc?16^Or8|TvfLdbxbxI`Y97W
zt2iKyF-EYp!Om>v>&^Ryxm1oU*^)lgE-nJ8gcUM;9`G)oCzIJ7E}s%A90eJ+Cb{T!
z%C)Y2r+!LlvgiB^Dw(=6@wTsU#UMj30Gn2BP#T0*B}?mITC1yTp0|jKb9P4Ar~I-^
zCQB)I4JxfG0QeFaZsmc2Ybxu%ky|&A3R5*tAEy;#vG}xgEz7zQRMZKldFQGhxNC1+
z{o>NZV~?*=&Nvkqxp?7dC2$S$yBCzD*5T`{83g;1d@;$XeDnPI`>~(6P9vA|o%2%R
zt9R}Cy`aFZlM^^|K~MLe^TbmjMHy(C6cImOR`w-={Wfv;l|S7HEC|<iQRpS_IGi#X
zyQD>)gTD{%-$gETfN_hh{sYtvAXgw>qE?`Bg7j$eg^B)qAJ#f|T<n-IgoNMa{h5X0
zcw&&v3JPq4drt}=u}QIF9`38mICJshO`@d@plZp>-HGw~y8`7{?O>|yTnF7b>hi5Z
zX|i95y~B)E*_0J^KRW79Gwnf?<d6wv8@4u=wuX_+JIyv?H35$-hien(QJ7KWDF3Ly
zvD_d!2tAsUkQ<6hP}MKOWa5#=A9WG#jX~}!D7M9D%%O94-_1%q_VU2ldu-gIgl$1g
z3AKJRjPQVffayg7RVBODb8$&rRp@YQLy^l@?HxPr*x&u~XG!UlVH25+kR%2tyu5;f
zQ1CbMIt0}(#cAkM`dgC%Vh?GLfr*Lhg0OE>4lQqA?wr(r@L=okz*!))fPYI-`+Qmv
z@68xIFp~URk3MQw9EXIp<|rQe29KLiapa0rnKS<!00x5)CMYW><&+T-a-rlJ1WbDS
z_A-tDrQN^ph`Ep_hAJC^n(EzH`~nvC&&Fm&4jk%kWONaQtUP>(z+=#q)qQq7d-g1_
z>{by9#hOsdc+E{~X3SYW6mq!xuM6%k*iY}7wKX;CXVO&J<r_0Eu4~!k=oljqC=kO5
zbLX1zX<||FgOd|(e1I+zr+{5aq5&A9h>xWCp^9bX07(W^>IG|Db-}@FZo7o5XdZCz
zQ+1Pv)|0~tT8KW|UcZw6`t>c&ybkRyy*%(ugOjfje8nGQI6aYi@})mGCsEgP+Muer
z&K}s-d93-?)a6x%dV1!Di~MG7(Lg?vEkG-~;lub5dh%>3t|ikSK$uF+-g=G1Yt{9F
z+}vwOjjP#y+3|o_8K36l_9m_7aPT;~D{w_Zs8_T|jAv{-$?+KI@#8M&F<6c@u!J#K
z`0gJ`rA{tQ0$@T9<@%z_N0jEQK@H+ES(AHXj$dZ2Z9{cld7X+=wSL&DXwA8;zn&Mz
zY;vA8<SA<h_eDnz8RwnT87LBi71K4o%ePiA_NbDl>D=2YMrSSud=d@69P~~uUqc+%
zwcf9F_Hmyp0uB8IMJmtN4{42q?#vIXSRg6Eh{Ka+`@zYWD1Qs#n%Lw(s#kG)e|_C|
z-$@!vLlLr31R3C+jC?yp95Tc}kts&l8yvOu;l;Qqi`0ymM~OLmeC_2VF_NU`EGufm
z7qE(*er|s3V%~nI4y=Q?6@=&jI<-t0maVN8?=cal$Utz!Z>5x>D8iR;8zCp$Bizj-
zHe1JB?5?7jx0bKWUSWvSIl=+Rm(zsHRf~4*x+}nAGKgx1!RF9@X|QTlFx(c|NQcXh
zW<FRHlr!N}dLR<->i|TY5TfYI3hz@p&A1Hjskv)coJ3n^Y+PjUtDCTU6(}pRah2BJ
z-*|?n9O>LAb29t-GyKL4O~G-%xr{OG^bIw245pm|-qj;OfRP3fMr$|`ZLF+LL)jqf
ziG@sV0Q}Z5KLIAoI&>14Jz$ELW5CyMu#QEEPN7UrO$d#1lg_i~ap>WuGxoo2Cw1;i
zKEv%0l@9&?PqVTNTwEGA)cJ5&QqyRP?JXh_#hyADen?oD7k#0G8W36VF7AMJ%O}H7
z11gb$wc&k$vywg~>Uh?FjDBtd36=Qayh)<Gs=2R_30E^1Zd$>4+rf{x8)g`J-bUUk
z2EB^pgrt_iCFOD0j!xz#GStHc7p%mZ@Fb<YU<(p*QTM|dG;k{2ymRN($D(!ZGSU>w
zzU+{9tX#&CTulYPRms<T)|RVOLg5}y<wNtt0;*a#xA9C1C8ex_f&u*?%PlR-$ENq?
zN(J0RfsrFnn(*y~#+H^hhPi&6MtBv_V?OuT0NJ&8z<GJ?_kY%0#<hi$Aw5n@Lx-e8
z&X;WCb-?sRcD6qm{jm;FzQIbdMw8HqP|(?I+SKjlt&~Ar1PelhQ;|fl?2mgimDtB4
z0uSZ$+r^m6L90`;UZX?btxUSKGOl5Uo2nmrW`PTUPcr>V3+F<C&ZNyu*-}`ylenA@
zJS<aHC%62przg;gMDxSBzmfl5ScBcFj=h-xG{}vvlNRf!#z(pVo(V*JU`a~1G^vL~
zK7rh39d+m666oWej>EE{{ifJ{jh*&xGi^dHY<-LPVv4BTPb=o|cxg8(n6mpz*JNh)
z72Vkq_$$HC#3Zz_vC-rC%+NepA`^5wDuWr1?()PPVRlR%N6pjMKPMI|vU86odbI}a
z+e&PXJ>72g-kd7%z(8?i_}C|EiIb@1dwgmYl;27_JG&s~Iip9765oogCOcz>9z#V}
z2+2z9JZ_xGl9@qLW8yYSo&-54qvN3a|NdxdfavKEbI+UB^+L_gotzLjcw=AAh)RDV
zYz+~j08v9M&1KF7a@N4RZs*RNS+~1~_y`hq5T^25Hf3&mdi9lXnZnl*6eI*nfgpgk
zCI|wkn!8jP_vAE{nD<PFD+1;os=CWa$mCe8j(|xXAzo-axeMEpHO!NrJk$TcCfcU*
zSdgoXnv0_3W!B>bldnz(GCsUyxhgQMt@c&qJT(4^P(2~Ua1Og*+6VRQRwl2BU{%#a
zyIG5k#72;(J4Zxn;yg`fr&s9d#iC``3Ex(D1{lc}LP6l_h}WiIDM(HOtwW1}lgsdX
zYxkdXPBP^euz7hw+T)vJj-5PdNQZ6P$+UrL>*^ZNw8HtD=vLL11<Q#KrZx8^F+9e;
zL<zcO0+2f;CY5;(Usmupm+oKc{n}Kvk{lvOgk(-tG7Lg;LPnX+(xnA}Tj6LP7*DuJ
z7}3qx<L}<u(s;k;Zhz7QQGLHeoo}#W1?kY#$FSJY31aJVTW?IBqJ>>*xO+>5dp@D9
z0n|YhPRw$LMaUL>Q?Y%L?#lzM6@nFnIV!rnZr`erg>NhlvDHc_0?MB|KNhd7b#ON!
z0qEYnyA)2|Nu?Kn=P@3!CUq`Kc8$jxA`B>@63Vi?@$kz7KjOfAr~(&>92|ny>&lfk
zw%>Jz_BaHv%`#|n-n~**kU06z;6WI%r_C1?*F!NK(|LM#3OUmC$T)}6aAc_$<J}>+
zTo`g^uzYy{T0K)vDAVdI>yIZ8-|0Z25Y1<CIyKf?6Tf7^X?U`LIa1TsKiV**1d(<i
zBG?I2raa&E+;;FV>Eb`x`|A|ZLdKzFn*aUJ7ZlwP1dx8W-2^Ksn5}O!&_{r`d*OhQ
zHJ}jmh72P#{7zpZXXiM<iXdG|D6t20On&%e6kml$+vwg}*J@2hP?X}9gx^vq<*Ij$
zwRCC4)`ZOULS|WYP}+VOYere&2Af@Ea-?YmXwY(Bzn;XFAscU}@5UZiSoM2x;V0_+
zK7tqE)DUa|UGg`m4Fxoikg#e=P|oGvg6ik!87m{hcQ3k&plyMsW}inAzGA{Km5~W&
z{MV*t@HrQZz#3vB+)g}Qym~tr0Uh5?6gwh;hNUvty*ot^lY^r)Gm4=Kw&|As{Q1-4
zxj%!$?5#(4_22&M@s!;GW-VVYNXc8B=EWi-7sRi5)at`ue(e&wNw7g;9fAcESQzZ5
z1QJktj$eyD9S8#T1hxtj&~IgA>Z*u>P#!RS$#!BHiMvJichT!+T4N5$uANMz(vThk
zWu5!;eouKUR<s7nHUQ=b#sVM3rPw*8UTw|;*==S_T7hd_au9q^wOPvtGe;<C6|w=o
z53F>Og_`wiB&Fa5Fhv`eteu{65~vcE4I%5FFqrRhezJZVsp68Jb3S{lvX;j$uo>7P
zAHxJ!i*Cpf#LZ_+y%&Pln6Xm;Y3e!LLV#Cz!1mjlez+uyEF#Y)IiyVfyv^pMl4O}3
zca23j5`?6HnX7Ak*#f}qnbqlJ5DP&F6W_OrdF%DLdP}Pi`Sj>??0VOC4b+50q%PzV
z6IslYFJ%07w=Hapqxj&+A$3qFb{{@#vzlOA!Ogt<{7UV|D-+ZN=tZhlxS{!t$f&{3
zg0KoTqh~z_u|{)-_pZ%jxo}6UDHP=89q+isrq})O$vETKkwC{<=6s$oa%8~D%S{Y^
z7b~*ful1Los5ua`*E6zKXk_jWlahF=q<(SdB}}3bH;Lp8z-^nZf_byNdDc1BG{@v#
zoww7QKOdT#+jr@lb<aG8LYb-leA~96w)jGAmx5RXD3l2Tdm^S-9ys`A_IpRAzo^Zk
zz1`j_y_lXki6V;|(eSA6(%ZWvUF5(-fY+;8J`6QDE5H^2`l(iBUtE#}+xAyFGv_~2
z{^|4QGfy?K!r@uK^aZnIi*9{o!<gvj6*K1TqpeIJn<6yUQ+<c>7&*F#!CgDan79ue
zIB?vl^qh%H4w{97XEpyjFg4q~xxig3@pkGqafFe8;b^+z^uj|m>Zn!((<dN*9>_{-
z>)?c*Z`jEZ1-85GR&$T|HP}&WV903{G6~P^V>T;J79nrRl6=JKVmIp28CCUi%p<~l
z6$FzY?~B`cd-j847KuCfuz<Bl6oEHkg_xeF5B$u=8Bbk?_gp@UUVqJ+H^CJ+6};@Z
z`*Yi{)d^8KWp-f=uESg|z}7)Ts|lz}=-{nhT(vhl)nCHn<m#k_Gt@0(jX!+Z@iV(a
z_*D?cX84!RM*B99GUDFa?$Vu4005?Xa)FW(R$1#KZ*uks#A&ZeKdpgnqCloatps1^
z*)nW10~^G29zBW~k#*4x1$1WGFbyo-qi-Xet57>NyxrGxO+&>eU36Dr6oB3Z@WJwA
zXnPme4+4rHS+pf_Vv*;eVL|XZ;Xl~`wGl{dw>xXFBgO=TupgX|P`IgKFnKQN7wP2K
zapTY`J>FDynS_YIm0|yZiHM;HoSMK88lkF53_Lw#U&4d~NNWL%SBGEF<R}-FlD++>
z{;5;qc0Zq2w?4iY)y8<>VA)smBqdfK>NG>L-WUihpG(R#T*3JtU*as?Uq(VgqO!MK
z+a>K5vLc0Y2)Uh4e0PX1po{MR-rTIRjZ=uFPYovZF(76l-1A(}EH5q#O;Qy{BPtQA
zoXJgFF)f<yy9E6yzlh%O-n!()d1RByt8@BFNW9hVZP}(WwxvF3Zg6<`v%9Kb3*Cp!
z`5Cn_xQ#@wr^egm)+%(C2=E7+B$HaUV8H>ft08og31F$iP7Ad<FHg^_0*=E_!sXq8
zf%WdKKip*}Pc|gmP^$<$772IKKe)Ko5J-vKkRw|V58|G3(Xh2EP^uWToRE;X*Kl%b
zU$qEFTBF>cEy|TVJRQ+dAto4gIIxYx+aeAB&Toy&e>CADGR+O8eYqXJQc|>R>!e6r
zpc6o-psYyq1nkxDOu4QS!DBu_P=DphAn2~0$!mV?N>QxYfz#4N6nC7dh-UJkR=i**
zVe4o6K3m&r!!1#of3dUwN|CgXPi39G&Hl*)tFfBal?StW9J5b_FiCJ<FAsF7$EC=M
zs$PBA;LWw00brF55qf0Dk7vf<6^=3z#_Oyrar@`bACHcMqn~;<RS^BRr!J9hBjNF8
zPjVlL<dX$8c@rb<Ugc81#W8adoq4csR}~3~>;ZlkBsA1HM1gQ`vFo%vNl)p3MQbko
z{)}XS4C|YA18b1z`@_0ONT^*f+Q;jx?ASFzMMSvTS<KWTtBq~{d;$IpH5al}_1B}j
zax4h)zpEzVyKVLznb|J8=I-|*TXG-82u4B40%R%r$cZPwmFRIpz|xn9yus5V)|ig5
z05XsEJ1zKI2^Xn#G9GKL-@Ut9T5;1<6D_S3z+bQJXUctJJ-1VxBDsGrb&4%!2<1+c
zkV29SmVaNmAFuWBAHhe5(rc@QW5;=dsROOHXY-yG_m_|`efOj3!;hwYD%~V1qqskR
z^7`$c<Q+0VLMSpW-InAnRv%i_QKDXU`0k(U)C_Joj?BFFc#?#K?>rBd&31A)A@+lg
zI5VcO)|m(*U@{(>`0TZRJ{x29*x8+Z=yBP=4!n>ZWqjo6Scw7>$_OEMV8q&V2cZn4
zFhEiy{pgJN&`twhN=hU@<!Kb@uVW1J#ixt;uWyy;ZN5$?ZVk(;+zwPz<hQ*Rw&(9g
zTol`0mR)E-nZC26$`R?vg{gfdJeFLQ?Rzg4YaTNJi;G)u41m(b#pQo5e6UwM)(O<M
zLRJUCcvCQ$kpCpf*RuGs621P#?T^S71YMVTm&gh@s||3aKIHEg9VKou;&YM02omK3
zR3K^|D!{cpyGnT29*q1RQ(ET`-9tg5-jW!Mm)!*z3fng%IpW-4#hURk5(N`ac95u7
z0#%4PBUVQE4B{w@zSEtoTIY$h94s3BXTeBk$M@#1J7G9}z-fl$>a*0q<mzqgk3l|&
zlTNc8f9*e$2f)82B-9InmJTQW7y*FXm1DP{`|0}cP4QRlq}PeS$eShNqbVc^B6=)d
ze4JEE&@Py(O?ov?tngm*Z(ZvA=SN)*2qX+eW1MQ!#~J^jK2H(*f4?pMzFqJC-m+4(
z|3i250TCY~Vl!)9C);uW|L!!6{Cjt*+3wW2fB!;4;?Ai3m-rjCPDs}74-?;$Kg5sz
z?<<HuWltokir@VCr_zfj{=Yv+)NB8J^<h%y|GoNNe>NUJp>h22`p(~b{r%a*_Wxa-
z_@l>~BP<avM?L1ESVa;N0Um$;!=K+vs6`O7`0f5d`Tt!;z}~+<FIn6EVvP6SmlP;H
z5UcaQf0FzB|9SsT%iDSXdBI!bh5voTvB!Nrj{N&3j$Qxn?>O@Rz2g1<T@rWn&rSV*
z?`Wmse|MC7($44a$9ezvalHaH#207d^y~G1y{Vnn`TqTey|n*N_q-D=Se}#htyWzB
zyFS95{(BbF+UWlI3=i9(|EHTGsc-)0ae6HKdz=6Ktc3qcw$tC=^8f#P<|F^9`;h_h
z;5z8<U#<M_TTi>?_xD3J{!h>K@6Z14G5-D2|K4BYTxng;|E-GfPJb@2SDQ%#bdt=(
zawqa-{2#yURT2OAr&Ies-}2AziPB^LJhBmwZd?AhpZfoI6Rzg{{rNlj|M%6~>HmB4
zb@zweYq$ZQ%uz=z2#g|r6>cgo2%92TC$2DYuiD>7xVom(3|Gc<Hq`?MX?v1-@{5r0
z%c({uqiNEBjQ*x$oT|Qm|MakQ@Sd12MUM4OH3eCy0}~*QzO7&Vqp4Z2mh_XZqbs?x
z&-da#UrO!fh|G`cSW=AZ3I_Ts-fyJo!GTfy`qQ^xlUF?{Y;-Pj$#6fRQ&Z!ca7t|k
znCwaj4UEMzBWj(K$mKUFn%pbsK%@tj$;-{X85fs-7in6%)nKT3;D<s$OWU|NnuiAu
zUakJ$Zob!W!0}o~UQj8BV{#Ib|7%py|4}-kstOKX?&g+UU0p4zaOzSjl>djS^MLDl
z58HoMk~9cqL`HT>$w;Vd*<_@YsH}t(MM>MrI8hRn*-0u%k(5G44k1O7WJJqoIj!gY
z`Q<$S=YL*aXVLHb`+mkfuKT*L8{CKg9a*p^{_O0>VJ|Cd1$`C!d9-QbqQXmRy7n7(
zYP((eWX+7E2VZ`*E3YmcAvXKYX4h5D1Eutuo*taNbLR~@InRl@2BOqUH*PFj()2`1
zDlOr_Ok2s4^i@`}sR;v#{H9;{`_Szl2RD~I9CYR-GzrIsx-Hb;bS7VpiCMaRdjiW(
z>Zq)wWMx{bx@1rE$j}>`LGwi7OdEr0l4@<@E%a_4u20}gKa^W=aMqpft9C8m-CCu-
z_;fw4{86pN!`khipWhr<cD^D$znp17kU>x6m+Ph4i<noxu~y9bAPb0J@rz{i6}%R}
zJ@tz(cHVP$|8pvFDLvA&)JUX3^%pN%)StZ|4KIz$R9i&hJivQN?{<5;CqGS^2=7_1
z*8H-;rln?YEiP9_D_wPRba!pF>9JPg*?i4GDix8{!*Zj|E>a2mdb_^j|Cr_6fjoh-
zPgg}7)B>1@RpL9TA)JB7F)lM~7WcTGc-p44TV!wZ&5fh4oTeVLsqnVBfDZvCNoDpL
z3?M@4r2T4jdm}^EA5})xGOLx_4it~E?)UeWlwl{1PqLjLzbRTAj;aAX>gD(>WSaki
zD?({PkVa#*n)JUri~bCfJ^C+EXt`ji(4IeEvVa70MXFnhh0E80;(S1&$TPnA*>U^V
zfLDpz9SdyY9~L=0fEU2u`SsONdgU0+=*=uQmks0%!^z*vEi|19&)<GpSu1IUplIg#
z{cmvtOYG0t8i*gXi4V52p&x<dk+Mb5>wS9Ejs}E0;hSjeOnE2yHY@wD@R3hTxD08C
z5P(?z_yDM2QN(o<&&(s#SPb{EYS{Q!|8W5bIX=-`vD>Ej=9X2DE7$D|{rl6f7gtN?
zm416tb0zWilV?}=-b#rXSv!%ak4|PaSHc*qmbq83K~}j_6tra=ERQ;R4t#IL^@xMi
zZbIC<6G!t}r-HDtC#KBsoG&UvXp3)P{yXm~X;uK=5Jcw{Ne<x`OoBVl54LT)JbH6!
z$;6o4VKKQqzi)TV%}-ug78i5BdiL&^xPrvX5xQ^Edo){=)eQT6`c>iPgcVQLo1A#O
z)^rmLIC4_%<xGw?kc&)`F8dG{mtAJEEpj$WYssrl{Pou*G#4wdJ$Qe0Y!1o2B&6i_
zvOhuPwjRf6R)K~xz`xBF@s7@r#stRBX(YdzOP8~t>5JeWvXLNKEUl{R?Y-ChhDyMT
zK=*?&Be$nS1+25%$rzZ?8>cs4Z)$o~_#!9Y%;)5^(H9shn)^mK{lhDdt6NfZ%Gc_z
zA7LhnqEydde-*0``x9!hs$d+mM7vO-1UDe<>M$x>^X!w$j^X#Z1roc2>Ex}NJQ$D#
z4l?71zP-%L8!WD4z`Dzgey7*A4Owc8+?SYy7+<Z$hwgbcdGiZ3*sP1q%!YyKP_c!Q
z?$a&aZYjVtuXwJb@Jf?yuO%Mp<q#be74-t(bl^;KV61_mnvzTZ;>{Ufevw*_%gYcK
z<k_4V*)Gc{K-`!(10l`8;G|2W+Vm60kHS@i^)Fl0?=R|hoozlG#XK_{QhaTx$$nzd
zVaa0ap~`SNzM(}C7+f-+ir{6_VUl-yu_DH!z1LoD5W6z-rlO70^pNFVr;i5)xA?h|
zzEexBMr4@q9vrv1df~>fscb~dFXfOD@nrb;3@%%_6UxvnZcyDzE8drER#-r)U~Z{}
za#4F~uF2`F(;;cvQ`*%^V|>QyIB_}c69tRcB4$jUu_XNMgu4q`5KO8$1kPe&{+u9x
zdDRUzdUzJs!Qy?MU6bcy_8(HQO3W=?!Xb<G71uSar=m?wEV?{<v|fC5>}(TZJZ-HF
zt#Fd}TXbMIp6Y2+8zTmqkKpD1I&P&AZNsXvO+l|SR`>?@@+w{pw!AiB#S#k(%`;EC
zmlg558}`tse6^R+ETARex-9#Fi)}uvmMrfHw3}Muihg|MWu<!Y;<pay#giy4Hsu?=
zzsc4|;o7#jlj5vApyaej5@UPGW9V!a+fXc6Y7w(k^Td9wF1<&kSAzP9FvDSH+0uT>
z0#p$m)jn?7nuM)`_t&UeCDH)-M1*R6r+Bq<mCEgTw%;#=la3FtQ7(=+p!LEol92V8
z(q0p<p+-X-+ij3Ue&N1fFB=R85O_bO%O;yG+S3MZUbLu-WKkaHxlbPay}t2spMbd=
zmIu#l300-}a<9f`$tfrd%9`Q}b$<X%NbLvx#R=T&`N+hvUCXg55yxuU*tJ?iEBEYQ
zw{hdSt8AZRkNOibEK$TT0tHwMG>`==i!0_O!^d`0Qc{nqt{j;h=cD1E-OH<u)SfG=
z`zafFKmBqnx3p7@mcv2P1HvHKmBO0TIPjBFj&1l+aw!@QvP$`Aol7EuOsj^e(#0m?
z#R0AO8#jVmqGTnnU_;Br)+;1r&bF>&lcZx`i6s5XQm^0=*{GaZEw309YpL+w<)c$F
zeK`o0Xq<RFb*+Y`zeq!$b-mv-u$~@)Yr#{OGUEx_O-tzKH9uIjEp;-YwQE6BFUCm%
z!7EEK!V}Mo%*#j{oqj2+$|a@Obg@qAd$n|A#Ey*VxKef#$1Lr9>?;5HJmn^TY(QJ#
zzKq&i*&y8~iEl$QPV)UY?jcXQm09tdQYF*Gt9S+5C+bU5sL3Mrxg6|&c(V8mJ8_Go
z`Z_AwNP8%)hQ2jak}SRNXV-n-6|FZ};v&iPmIRrD^ERwKO@HB;-8I3q<y@rnOHF;c
zV0O?QzE|(UV<vH2f>XZ~Vzc1hFk0^%6<sygCZdns=rT*tI#KVXRRAHv(zq{t{Suij
zpA~1>>FmD}){;cK$mn>c+3EC@6I2jTq$}{J2?<R(;J}TT&EyN|S=@ybxGDd>e(I(H
zPzWV#70`~scporeQGp&ga-R5}0>dR=EpW)_WH4S{yy;&`bHj(VytS8_OH5mj{BrqO
zLCYz%QF|zb`L>_XB9bpG-i-_!rew*FA1ki_h6~v1-<%k?^FWQFz*GBnTL5il0f#VI
zLqKYH^+c}6wVl6g41Rw+21nvs%@;t*f~FvzoR-iRzSZ1|I%13W>c#WFCYM+#Op@8V
zBzQSYcB&MAQ~BZ*S1^qnt^2Sqr(NgrjClb8U$&M5!vYzd6JL%}lIUru`0=-iXMXuB
z(eelqE?7)}pU<fi7nW~|ouH;r{()yWm;-~A{_x+Q<g}p0W~jZ%k~WMTs)L^p&44pS
z*%F!&I9x$-aZ+d8u*<I}V-gq?E@0rIbnhNIHYW2uZ=+W!3iW4p{qz?F>2)xyL`@RH
z_UlbgmAZDFDz3!MyZ_~1X<bV@S=kV(363Rag{YU}uPEhBm@wf|@wdNeZu-{3(wI-{
z1R$=<wT`s4i-xT${Q0TVr$+%y7c}))_~(14XZ4ja4f<N`LQc^4tI>8lu#A9!qFFmq
zECd`VT9iT2MU7{K`AyxD5f>pG3r6I?*~d%bpH6%sG9S?$Lgi!;(|{5uw3is&D@*Ul
z>jcnBE2=7#vz8Dd5PQ=dEGdqF>lyZ3x)fNaRPY{9R_xlX+Xd0}gZ`UA4;v<6mgX#7
zB^sI_6r2;pb8xoGSJo}*e%~stmZ4VeMO~XOX2XxqO?daCzERXR1%YD00`-}KT@P<%
zg4(<jr}Lq!66hhou#nsU-ylL2K6P&ITc^iun<loG9}A>1$lRnM1_mj!t&>(@SV4b9
z3)(u~OcW?5HhecO*!Am4!OuEs62kAkk62MLEVgjZd@=$!Se`%MB5jik=W>H6juHLB
z4dJBECs`tc^Bi^IvEXfp8$rroYG~*obYtj+SDU1V)ay$iZo$tK{YAur^n#f(^(!&K
zr@SmqlQXwxgRn?dYu4Hj$5`zA)V1;Rm{dWBf(kcaCgLQ|(Pses5^P}+Qqp$x`6*f3
z2BIcGV;3c2$ZXqsKloGrro_MEremarNsvRf=RLu5qjvp5;I(Y}LdtlUdCV!?62Tne
zP$fW~-G4m=&G}2TE%XL`7KLj9r^?aoBMDz1O-SnZ=qaH;vQ$)1M<b$RuWG$}x51{C
zd&80xy!vr2N)-D<UobXhsi3^k+#zas+S&^Qw5R&w69pgN(=OkJz)M^yT9rH%CaPTX
z4+m)#G<Ej38s9p!e}ChE2|xXS-S&=BW09m%`mjjIlT|GXdJS(58ZxJN(EA$BzJOSt
zd(V2zW{1>@1}SmC#BK@b7k_?qz=_8LQEj%EL~T}Z%toxby%Nr~ePZ<@kZSQ|Qhq)f
zMz6I<e<?7LW|DJ&o%6D*)`<l3>e~Hq-=cuJukGB;o<shy5HA4VRnVS^qm!!>ZY~^=
zx&7k<CBa!nyy!W-EirX*G<OTcWAerMf&TN6ffY&JUtiwQ-auV;;7mbZoag@UB6=)!
z2V37*DfqXrx<&1F;D7-oG`TEB#^l)w8hK0$wmmfveG^qQ{!DeR%jlyE>4QRjnAy;&
zjc%#CCaxNDKmgQ}HyZ}2wl!ED{Po9<Ymm7_|E8#sVeDdXK!fSnWa!31)E=ogf{Uwv
zHF|J^6WaK3(4=hGx_?1!5Y(%}y8B~*T!#{UQUXk=er4vuiimeNz|vA26aBz#PS<mU
zB-BM=f_Xc^`Gb);8W~WW3u(y$@o;heL}w+6&~fKF7JZ}By0x01bW-_VM3#n0)3tw_
zq-LIfUWW=F)mHv<tk_$s{$dDKSMT6yqlf=7LUenv_T-ZWY70emjE5W*MU}x5C*GcZ
zrg_%yT@wWXStMT21TMmTSVvc);fDMTegOfweEqZNT&{<KAnHK6K(e1cHn`VR=qEkY
zf7++c%>3%`?l&H#SoO3v(Y9<8KHzZM&db*KI(pc9j?<X<PxiG(Zv=;iY&sVD#Qd|S
zzM1_Pvj=PA75fhOtF27i&Yk~W*DB;)+(75I(H8FR-(SRE)wVnkJ>9Q%!!+yVNBitn
z^vPaY`Q_K|ZiUVdzE!&vIVJmET8Jm?*H5;coX%9mEG@=oFz!(bN+Mr26WS0+Sc;lZ
zCqvi_S-OuuLsDhCYpzVwue{|+je*gL#aP?>qU>2x?Shc7)MO<1NaOvH9=e6yN!szF
zpXhxs3DKKVa=dEC1p3DZh`a9S<RoUA!bPZ4m{9$45i_TJOf-=4Gv))J8HicfY_ft-
z^n_!mjpCI>y&dW}AJ}v$p~?)Ip#7)n<fR@71xZ_sFYSzNP}j&X;ca>hzHJ-k(ErE`
zAx)jLKAX|QFTd0<4>NU0wCS+RAr*JO{6fk}{3>@cM&I{r_T;nKZ@10=Qte_Dx6Cpz
zzo^;;g>QZqNl@u)7j%P7@{+dnhk$V8<jH+g_jmKo8k-C8&5|S=X4@SmvHO$q_tnwq
z0%faq`${L9&P0w%-hF(6`^kJ>9@`YQQ1gT_Ix72ZQP~ch)$ZW5{u5rUPl!Myha>8P
z4nOprAJpXf!#&E?Zjt->fqc?xnsf5vZ55b?<0s~c$>@ZfJSiqb0=!MW_`(X8vW%|r
zm=t@qhj$k$P}pi~no=)CMP*RgbD&me0t}P(<er_GSx@wITfeta)MxmE)$RgmdT3?Y
zZdANVzJ9%fCt(4k4KyEa)FTr`J5FxPd2F9@1*wnc9NC^IVvY~h96abZRInmyJSAmi
zcbS&t{ZAHQ!Uvd|-lkDg(OAL#`a)tU`hxZh$dsjSk1YsEUAulg-6r1f{6ckg^;R%j
znb159Uafhqmw|ynl2dt)1Jn9r{IcJ@gN-gmSgOcL1ON16-U*OH`pk=suVL;W%p=8b
z(^S@P(j@oE9vijLOc}m1Ns)Lq10xcPx8S&S0@u48Q+!qD7=;_Z!3|#bhkjP?lJ!rP
zYxbKr`w$GEZ658Ga3pn!9~&o{4h`(b210YV4`r1ZUI+MB*;=OLNwno$QnXa$g2r!O
zAJbRoFcUlwwj(058WsHP<x4L<O+D*ce!1PcbsjnEyH1o*-0<{N@{xyY7{q0Wjv()K
z^XAQG&!6vQ0H0y+r+p;nqtjng`gM3%dvZ!`e}_Uc?e+ngZGuhohCdY|#G<~))SxLb
z`OVO4iYcV)O9>^W9r!>l!xX78nb{Tji3(H^#YA*IyEUJ{aF)dUPBFRrxgS|U+HD|w
z-?I)dpqa1EtvUKtPL37b&O%TJ#>U<&S2P*V$FHOUd!~dFYgs}ZVj%Dk65oH#H&j<u
zSt8$QXK(Lg8g&mk$mVeimTbgtP-dMd)J4f(g;Fkbmy`=tZ-@PVr#?q3{_NQ{Oc{`t
zLe-p|YChCQk$;ozeYD&Db{rqAkSBfz4oq=%O?DWbVi7?4s6Wwp<HpO+a%fTX$4koA
zl8XtQr-^42xJ)xlOm0=~UhH$sG^%U5$^LtLk+d=N4ZfpxmYOj~bRUzqwaV8dWtfD{
z^+oWxFUsjlV=SgxTlWVU*}83;m6l2v;|>A?y<ve%?O{v+vEX-nzL@*ycktjn(j!YP
z6-8Cm4$Of8f?j!RhZ3<|I&#iGj-6>}i-$=Wuiq-{{P}k7v=@KRO>OFmpu!Av*irSP
z!iKeY_Tq&nISA5Ts&wb95(?{QAZZD?w3tYJDe>RP(B1j!xSH)@08UEGVTZAH2`gyO
zVNywFh*z`}-^E28K6!F{iM30L#f}N5)h{0vXV~xVeXT`p*C&i0kYVC(C-ZjY;&<hj
z%^A?bO=FG^=b64(>?+FJKTijIQW)Y9(np#*CtC}<!O_{dP13l151PXbFUelkj;*=#
z-Xd5m+>|l7YE2c!F}eDeLj~+wcW<OyE8h$cVDm1pT3e0R*9XA!Aex&peYzOhfaI%`
z1JOnUZQ>c&nj?th!{qG-VO}T+qlrp6Tc{5r_GC@W$mF4WLg?n>=g&f?G(Bh;Vw7T>
znmE&B0t~#`BZeVC5klo|uTeM%%pI}j1RL#umL~&!y>qMIgkXU+7a-+TFg`)))e=w_
zqm=ObtIXJ8*!KsRhgUDF(`YqDm|i7s1#B-wFm=`WynWCU8>unVQ{Qds)1Bc(Ojq;@
z);!UXgJ==c9gm)8X45xQcGrQ&pZ0QnD<F>0A1EbtW2GBici|eb)3{X01rtlGw`2_$
zAHJYDG&tk;N>W*=Js@&cLroaJNG#^MeG$FOLLC&DFa{3Yfxz|yo~b$n;2j56YGw*6
zFOCw&Chyw1F0k`_DMAYs^Ym@7!ro^nOqxPyOJ~!MN5f()VSGI+txk`-wqgVgUFu%$
z<k~_!)`w;9PnSyW2FHrG;~ckdZ<REz+ZYWEn>>s7deVED6bl7(IuYlLNJ;5!5##*q
z#g4q-nPRt<-He<~ZB>%bIBZDz{nX+1iIn3F0l$9lN?ch{y{xIY?df&B>r$rlQ@y@r
zq4B`j`}O8C;w2WeqJ81?`AITzI>tjT-$bj51m8+8Bs{k?4o*|PQ&$3ttTU5V$&5^;
z<s6nk2k7yseWsmNQVVF0CB6?c;LWfHa=LJPCj5NRpZn(h;_F@T-R$Sj`@P*9{~8}*
zm9T;<#0?#I%Pm|3!F#x-^M?j9fOa|!jt;BKEbB7tEBcs7-eaPQ&Av4I#9G;Uqp5|5
zjC?b;<evF~lP_j^7H1HVMGS&sNxO-k=VtyGxvbu46*H+AUL-YUQWAq&YMXpJMx^yv
zJ1Tg7y=9%g&JCGC{rW|?GUvjmB1I#+5y@}E(yUwEq2<BAl#(La5CaRh{w{Cdmr%c-
zrl!1<!;cSBuO%dO7-WxvM~6&$YM+~DrZaay*6It6=-KPnL*C}{dY1>n$s+Y0T{1V*
za>mS=O0V_++o!KNedoJx!NSO@L+;(hD^00-FH?|XqVlV~Yq*}hX|!p_{YAIyO>5{j
zJbUrt^b*}AQ90=k1`<20?DHTF0WmZ^?8nJ}e|q?3^QqeS*iJs-z4n(@X5Bh786u#c
zn*4;{{^`r#pAHUJw&8Lxgi%V0blI!NX4*U3jSrRY?B7;6-YvyG4hwD3qlhgR(%X*X
z3wj5L;JocEjbk$}wUJt3X6C)!7QSZeY1t(JABY1>*}rm|^x{CnvE3?vLLG74zB}8X
z;1f6d>p*NTJYvwkE1HadZ9EjfKpO$X-m(~pj5nlU?{?0Pj=GeUT1kb5hT{9*m<K>D
z+%~&+TD1BGSJ(Q!O^$cEEVC}Otg!&_rm{(J={8w%gcYLa_tweJ&Q|yG!9L1-w4Y8y
zF>2I8nb$bk_m1O<-m-IY2Ab^f-@o5;VdSmiIKX}HFQxe2(RUbLC{y|*?Mufcgz@Pe
zPT9o&zWwTvM`-YbA!5<|22aq+QufFw+Vk{F#~WKBVV0}tI}IneVzGr5Khmc86K}`G
z31yjf4!=y^z!0`_Jkq>ucOU1A)~HyT)V)SAOr{pEBxW{9SthLDh<9GtJ9_u;4Q-`}
zKwnPGv;HlKRQpBvF0r(=8SEVltxcs%)Z(G-bsvrXzD3@*4Ti}+u_wImWnbhzF54Z@
zrS4>1o9C{PvrUEu&blP*_jl)t|AL!c-$PSm8v1WD+LX1C(PAiBXpx|aX*QIb^KR|b
zx&8ae=664r#lJp4y|7peYGfK3d(GgP5pT~NJ-U}B1YxT=>S^FF!o2?#7+A3NGQMmV
zlprz79FahP0WD^K6}h36O@uLEzyLAA!fwL`V<2WZY2#FP`HL^$>CJslw`41+k6*qN
z(^sCd_U#6Y8gRiOl4fE6;RX_4j(KMN8?q`*xnT(jUD2pS?E=B}Z?$o4iWGEqPSX~a
z)*x%~&_Jzr_a{x$Rb~j8)!u!X#j4N#)pY1x(!7&cI@m;`%vt|%LPUOfHw(g_cf(@k
zPJQb<Yr{YJs*id!S|y^89Mod?G>q!2va~++(doH9G?=-X!YZL=B7*}(Or;pzquv5A
z>)>kPGC_^P<9o8>F7I~znuC2+H!$dpHJwJ*?l7G3G4Q#H;GudjPlMO}{&-bWXFS!O
zob?=uvLeF424u>{2s$=WV+VST>nwD(5^@hP9hZ$gRQT@QA>3tQ%~K>29cY=lMepzq
zBw*s8(Ye*W-neev9Wfgog@v9QUR}O}F_GRqJoV5ad7<aXxu6(>^ks5P-{;LMhA7J(
z3PwXK!nLNam&%kjVvIr8uI>6>8?STr>{;*H`htQ2XY*pX@`MLg2mYRW(GwrLQq$Ko
z%G*SvjTF7{Z>yJK2-3Gu<93PGXT=?cCl{!Vipec+`|47*o!6W88HHaTX~#>fRFUV!
zA^Xx&2X7xA+HdLBVZ?5<MMXqLc5rX{b!H13vWA8)lQPlQY3_#^;B{$-vD>t1s;jF$
z7yn1}=G`56_tdrFK}_vgYO%8&zn^z=`!F#d8R(}!@dZ~C2b>-UtxL$*E?u>|D?oJe
zz$C`Vq=Vy#`F^?t=42q~>bs=6OB4GNv;6bVT&d=_&R0hgNpsfuPsB`RTnpqrFLP^p
z(=a-q=a>rln3{q(iA$6A9Avm4X#tZ|Z|`f;jtK9IRpiJ?NgaFGcK^MtnVE^#uSX;&
z_v95&P{i=1-#*b~z1`V4W6db!zGMq7HAcq7bf4>{t0I?jA+xPdNShAR?sZdXTi@p0
zoex3FMP?oGJ;S-C-}g%I&g-phJVFYBF^fCEG8XV!BE3ZsWil9q*7XOa)%`Ryq*;JW
z4y7MEj*F={Zq4o<=s8Ay^Q|A!;A(rw(25CHwSr9!&pQ9zB?Vp48FS}OL(miG?+{T_
zt$zAw<S3ylrWZEVj1j9Oq%Jdx6uUKrr+xVqe*3l>lt)kQKZ#kY_4f9Uh%nIdb+!3v
zJX3sSmt`{tbS9oe7uIKS5SX_YwSbqeUVQ+4W*@zCOH<0#0>xa%>C@Zq2^r+BF<^j<
zxgSI`Bj&)}OimV~SFR-{7H)aGFU|V5`>T<dK^8FWiN9`B^jxDQmIpl&6r?k={^ZEF
z<GXt_A9!JT2IE<_Y!~Q#Y<@Y7$us86DX-mvT_983L1(xY-8od7BbXS&iRSg07dDyM
zRp2*@-Mc&IH}>}kDLyE$hdxc;eZ{;EKsAT^KsD30ZXE>?oe?<-DHiVeIu4Ao0d5y*
zRIl-lGR!EuORJPNyks-Ao{anw(JUaK%$^Y2f$cTR8fjtrE*QJist8ATtV&g!l(2kb
zchzaFSh3<T5ini&pdp1d2{%I07R;V?e(SSLb++Gk(WOw`>5i!lA-|)u{^J6iwJ~0o
zwUH49cw>j6^mfA7fY}M{|0<Wp5drky3Dsd@V<{<7faz)~cofo}tw@{3x>OAoVI$7W
z%`*i;La(t7QcMKliv>;CIf_8TyB=1zV(;Faf=zsCYMdM$9n;F43iaM@yRlM;jAq&B
zNQudUC>A81n*W;Yl~e~SMReaE{j=l$O1oT6%N(PpCsK9I6a5Dc{B&qTR&s}4I*+wl
zQt1i8+rUYRwYAZjwC4Hxgp`_uTU{{5p>x}ld(qs@iJe=Q#!yH`<-DueSbUpktjfgz
zH+&={q$3@sj%$YoK5A9oeh7nonSDezRXEuX>DoP7Y_t&gGyBE{Gliox(M)$f!@$Vj
zD(iwg)H+Jhb?h^5Qlm0)CGCVkliZtjmsfx2qup_jwT(Y|8S_=_7LHvImE*T>AIkQg
zxuso`dr3{^WWrGK1@|q2uDOFk%n|0^0ne)w$RWzir$t9c=OsIj6mE)TV<M#$YCtLJ
zSsaaG`h@R*b7X@GP*Uj2MWwrHGC&7`!Kt}}di3ZKOOF{^<Gq}cdy|*|6bz-d>}@?H
za29zF8Chp@bMvIFpJWA7wP(-#$}?x9qB;sF)qx%$sWj1-2xw7$eES24Lms3@5z*0|
zK>X{-{sR~rW7zv~3^tbbQi4~Y2!7RWA7%SKX!(`1B5v_ZZ<Ar7RHr|R_QD5&atBN_
zy3}~w=tGDn>%IC}$4A^@lQR>0gK3DFoQMhq;Q!D&ZMeN5d6jr|KqeG#y#;V^a&$)t
zcw#MGk(eQ|`i^5fD!bO*_DDaHs-yE#D1}q}NM|L*U2i~jr@H{CSSjPP*=a4f78GN3
zbl>;U5_);%jm{XPZ#g-GPdtv>`e_!{WHm(uVf8tVr``Tt$7}#&%@g>mHi&oL6~2#C
z`SH`IqV|*OTG9Qy<#B2~kru&21nb7bjcjedY|1bp+`!w4Tb%YJE~c^FeT`!sU^Jl8
zuos-5rmR<1Tth=bt1)c~@^i+{Z~DiafinLHkg#8b)*UP}WOS*$lWb}X2L;Yrz*xlY
z7CTxIGmT~3I)hk!iR9kqszlU7-SJi!NnT6K>wXSGVXfO~Vn!wq^_eBiShn#TVo=;2
zO|E4NUC?m37vNUapE>bDJRQs>10yV-u%Z(fV8|2c6OY?*Q5eob%G3ulod`BzKh6}3
z;MP#whc$$*(?;k+GkU8d=m6-9)xogbQj}y4kt%<Pym<QCn2#eEj0(5}VkU&avGo1X
z>^k?VD|iV@Kv*v=^>&n|vsQ!E1;oGpi-PL@v1r@$<EnjP$l272BXciiP`f>Z7J(Yb
zd^!vTF<)M&|3&0DetOiEuhX1{e;2cA=(72GrFcVo5($~qKKSU`-4i4$**=(nV%k~5
zdGi!{$xokudYizhh#E-{kDNH+oIgYdNT&|XgF{qG1T-naNj9G1C&`-Q_}<)Z3D`+9
zi*uP49dScRriq1|dQs8HoA><7FEIg4Pp=zA3U2&F0^ENK7Il%rhzmz5ut#wJvF=S1
z5KZV5ZR+~i8vSr|fHV)JFyXM&+3X0_SWE7+04ynsg}?YpK=uB!R|rMGv+?m=@a_os
zl{%+pyDBBwYW(M%+P7pi|JH_`^I%-On25uJwlL67963O$X#nH+WyLBN@`G54u^Zpr
zQUx;?z#E;XJ|E>Qo7;Cwd!nWD*FRwzVzp%TKp-%d7hiOUf8Fb^FKD@?j(gu_5NAJ*
z2KnI@w^Tp}h^akELReIM7nX$iUeb=D(eM@pqq<j#x4kj_IN1!s28;_xgwe&%p`^D0
zOBZ9LHO+5&2{Vbso9gD)G$J6Uw7L$d;it*3sJeAVO8euX0M(MXJt3|Rh<tY?sH7z>
z-A}21h3kl}EP=ib)>N34!q@xLzB*p*|7#*2Z*pp^(xiX01BBW(!Z;riQyWLPF<-7f
z-7a?92{6TZ7iO*Rkr}Z_YVV@ozvorWM`=Hn%e8!<J>@^JW~Jl<`A0$QROA{D9Z6Zr
z9CZjlC_|{cd^(9vIu%-0;9PyD4i*kb4niAY|Dq_DxS2|5M0j{Bl7(flmaVAMRn#Xn
zX~&NWJ`mc1XIbsFx**dK8mzmc$3bbN+Q*W$0L!%ro$Fla8ufM<@jlVjYU<*sp_6bJ
zZQc6wFe4<zPdiQm=$;pLHkSI7F`Srz;xX4Zs!|-^Pcs5U`A!H~(Y6$Rsk9?7D@{~l
z9tqPEmR(&V2C^wCDQOSLw6WC?$;*NrMf$Ev;Bs!Ux^2O9v`NZ}669)DQ|ANdU!K?T
zvr~PVT(_30#qisk!yLMO3k?qc>_6x<2B7`3ygWaiAkrlf<N=P_jW%bmNC^zh&3jWN
z`9V>x--r>LW*jR2_1nupNA*uO?i)U5mEB~nSVormhr%(_hX(HZ8E*dmm%?aCs?e|B
z&93@7m2+^cw||9JT5|7*C6^_v?+TD6mt3GZ%b<7f-aF_la-ff>#BsbhbxdOOUKrWy
zXfYyKV?*bH8~^z%*)CL%Sm6w<V-@@UY(nB{+s`S(1`kdXa<U?T{jX}oT}}VAwOxWd
z80)-=zuWfNU8O{k02YRr3|><Guez%|<`E;w+!WC8HJPL~{Pn}(^FA3ctTuOAqL7j^
z4T~12*vNxeZ+rIYl@Bshuy+8Stf(p(mRpoj&cIY^(cMM~;R7UAAc9doRpW5i;K3N3
zT;dA;k{lQD!9{&^h8=qtmb3nbqJV<*kIopdD|NEf9129?^BuCv`Vto3LONgvRnqsr
zeos-&=+V8q?-{zn7(~NSLf_xGsrmSeuc(6<nwVrARy!dskl)=5P}{Psa~{AW-yCnM
z=TGwC|Ej4mCZ}@Tkjr%$W1MW{Pm>vP2g-l19rZf)klS_;$Q99i=$}iIE>oMO+$zJQ
zhm3nra8Qs;<X_E;MX+O;7e2G-QDHD_3o9Try^wpF{zeD*3inof5cfj(%lpKU#6c-q
z8TD)39JJi+!4&5YeRM(&A5LpoNHrG^>jmXb`d}whuJ{=p<cD8x^ub~LIm0UA2i<Ki
ztkkp(o0<5jG7Htc_mv&^%uXtU_RLTE>1@+_OyMwRC#QWYmAp^#seg_@k*Af-ValOw
z3-ApIMcm_X#ruDf$Yx~~8)!^E!Hyl>Gd2ISH66Pg?YnJ!DC0?_q|PsVcu4=|2!t8D
zf*TBor3t3Z3$^3rbRtK(2SqY+7W`1tSF*%DG8tKYq4}n7cd~gND!|^iEuiJW<^h8l
zmU1Ls+4S7PAw;246~Qwbe>LL*ue7X-XF!LUhiP^yG9DN>>mE;xuBfnws`4kLzF!-p
ze5D~ZpZ#m1B?u_Mkn?~EF_#z1s?<qIV}KzO+Pq3vuFMVvWT$MV&RAnBVE_u2SWeei
zkOTnH_`}(T$qzhS969A_L5JI_>g%iv<A&k^IhcQ7w05F(`TF<1t)-%J_L08whpA(%
zDBCJ|O*_8__cY&IQOoE2g=O0@pU|Q1yOmwS8`p`4z-+(YYu@4BSA#hKm5nFv>?$nW
zCRd|(liBgqIWzcsk%K^_zz%kW@_BY?1h2_DL&`t~xX*Eq!xYhh%q}Q6NPXnNr-md?
z@zB95zZdKl3nU`6sGNu!H&lhHDPO?emG0lhcWNq{kJemui>Xbit338_4S9(8MN(az
zR@qgy9n*zkD(fd|e&6Ad+h%ieLHX{@r9F?o)0|RXSu<b$-v)R0#@*GcYJE@OfJkHT
z&r_V8$8bqWy7B@g%k|PXv-oR*cY7s=Umxd_*@w;@<TOOA;}l&l6&vZ`718%&+^UC(
z>SFPaw_CR)7bLD!Xy~z{In=FYZtZVRU0iK)f!pjfy=Rr1zsxP#SQz%Iw3^XllcP<E
zwfK9-T)yeYtvh2Z7i%1=zB%IM)9auc1ls&cxq`hb+72GUjpP@H(`d)K_}7PngFApe
z{XDamRZ`P!sxWb~TbKklfumx9MdLnEZUTuJ=rr!nmo?JwAgd#C*T<hI{Y-_gqxhg*
za{RL|zd971Y~C7Em|Reiua=yun2;*JD7iqMUjzz-j8u#hR*YEbOynQ(ge;W5r*HSo
zP(Pr6k2GQe&HPE1O7nM)Kb)4+yoADrc?>0+bWqq8NmopxfFJilf9?(RP?j}-i`4I%
zr<F!E1ng(;-f3P=5ZmlGc$Zd|Tm)hW#VAe=0jH(rz@Nk%8sAQK7)%7p4OS1U`n?MP
z8{5)t{M^Q3(@lCk35wx1EAEiC0Cg7tlJ`}qit(pwET+G%*=+(uKpHDX+%hhk@CB9t
z$JN69OjFG6&ADG4jT|Dg?J87OBjeW_vNFrE_K)4^qp&FG>m)1P?;oGv9(39z<(=!q
zD?pGJKGw|}UcE}8y35cMi^>tD_sG?J{y6@j4U4WRb_V-my(kc#KCLFfwruSapRO*9
z3D1&T0Wc85p+Z!m4HzKsuF<703N;P6o1H^=shd3+FiEWI{&?b<#_}Jp3pX`o)}*{E
ztOWxS@i7V%^H;43fD#Se%ktf=og!z)v-?(!b}fs`)Ite~V>qg1m0OGYz{dRRoe2N7
z5%p48U6(qjo3tB4mz6bFL|@O{etzM&bEE&AmvoJzwQP8brt{4w_KAhsmmP%}Xv8O{
zTvt@hI{D0fq3wGwRSp9M-t!BGc!liF_^0W1z<~o^pm}pImTotX$}t)mNXmYPlE}Z`
zQmax7zL%D3>V<TY_booYX*sZrNO;jER&R7|@u*w0*>F-eYcjFgDxId35%m=}RCXXw
zrFvnX?|rIwKhSM4j5f8Vb63oKm6LAMRK2RXEA`&=);CkV9o-Y#xeg!cpmR7>$5x&v
zh{2-8#K$}6+z^4oE#!KF^fvVX_VX>vP-NT?CxFDd)59N<GpFWAga!^Ff*W!lDjEl-
zS*Mz<Zfjr5)p6PM`+X`~9u5GRwutxRsxsxKd-KdAP5YsW<d^%r-Z*XEynxuQ(=6wI
zbr|b1=L~nBb;O8*9vFT9oQmTK@8&*MV3@PltohhGS9M|JVj1N|!`1JX-dK5v@`P2$
zAq>7S-1dnm_O#|iP-*sB7<pH)I^c@d9^byKaJIYI=C-%zdc-75>G!qk_?ncr6;6_B
zy$^XcXPUfJpmzcz{3YGg`*)i>@(t#HPVh@@Ih(p~Rcw65Hm5I+Hr;RjC=6IYxRR(G
zF^LjvjEsTXN%^yL@~u`Nl0jp)5Y~^HZ))m;#FV1Qi7E0s{QkL?LE;K1BTZSiZpic=
z_?RPu|J5%V_Hxh;#e$9pr?ml%Mfj#6Kee$Z(mLLgwX~Ff&7-@s2ytq`RDgW+0o2sU
zHe%A-D&I5C`EKcjyT4WjmAT%?JrdK|e~L`o&CWOSuN_Z&c71c$;3G5a$BldSX=?|o
z@JSN_XPvj(xUsENc=3x*TRVNLoP`CA$?Xq=WbVZ-*zR<v{)t_>cJ&rRyPN8YA3S_$
z2ttC>1yFw0;qQf!%qgy{b3V5)(r4G!7We7D-%6uhAi>JxPd$zwnS19jBMb$a%mgLc
zdr7wS9R#5KBf$MGsiS}n1c(73sNg6?^g2aBPg`TTVy8rQVHdhndmeu>qVJ8BvsYcR
zUJ?+AT<z4@Ev=`Wja&NV%o7{qZQFGAxK~X5I?3Z@YU*x^P}nT5+5D4`P`Cgshp-cz
z2m!S7TEze9sq@^m#=LBJS~`e{KYKWdHQ&O@dhUWk(U}PL<0kzPm5yeGm1LaCvfi<&
z=P4oABy6N;X=au`A~%Ks)-^RX76q5cn8olV$P;W0l~aDfa>AW;|5@)U0YkGJi4_cU
zNLFxJplKAUoyK{ZlHwvYD<<I4!IB~BGh8k}LraE_{>eK42W})u^2;9QuzO|HUPbAA
z>C)~r=ViZs-2uZ9I4h?zXra69Irm-2M_<;8c-cH!>snR&T^{|{)*m{tWUw-~Cb7)y
z>#yGxF8Ms>JFcJuMB^YWtJVUYhx<5uS*+AhnwVp(Ep6Ai7hGtQ(rj_N@^PQ_bOxcr
zZgwSqqaH?Aa3nxtz$;-P8B_cbq#c3#*^27b;-SFueJQE1y41I6;=co_%k$?+$s#QH
zu9bBz4A|Dl_OoX%g3G6J?y%B95`0CvMP=y@-t3@2&jkq8apbK%0>^_S1XHnWD`9bo
zdWKTGvOEzVDA=o*RRIJvR+}F~ld$S@@Aio_ho5s#q#a5xx(1Jm=oAeva;Q#Z-9gMM
zq(y4f<)=M|V$X_x>;S79R@TNvzbWlSszPM*hr)ns;Bu6;VY6!WI4Ia(985&61v6|j
zw2=P-cEt=6*`b&S3KQN&TXJ`=Qd0)5-0}7N2`y0W%d1k=R=NEiC}<LbXaQaWV4Ws-
zABPWrPivpV*{-s*3Y)1g9;*pFst!}W<iRiJPu0%mk&ESB$L<_SfZ@R5rYZ(aio^#R
zTVK@S%AW5{|LXVwq1U|+*QCF_v5mRMTQkeYo?G~JQ=55c7`F|ziC-==7D1HQk?R*O
zeD2V+MPiL-3<!^Zbk=vDd&B&sXLh;*!UTm8kdr_ofoH{ygmooGN3mV)!D61lV>vOK
z?`oxa>YZ(N#vC|QqeoYJbZQ0>rJ&EFTN8}}hTO5#vP7D1^@(oEoC}zj8AzAzrTO2Q
zj?uv*NA`K?)C;T<Q^+rlMG8%+y8!3X{j5a5(oc4zoX$+;qHZ?-5+zun5+)q4p8Z=t
zL7#vmoeAdOo_X(bjC$%NMhbe~QM}gp=f!$)4nYKsxOs%UgiOLu11|cTub5B$9wHz>
zw>4u6#c*bHz_d`LCEUdwte$atdcPwMyIUwm_f>3;4o=s8P(_?8s={1jZKOap=Eb%W
z#0&x4qQW~_<5&@N?9Z!1Bym+1ov&dOT@WE`%Za3105*S+Zil$p9fcxQQFZRBOqfVs
zTE7mCd6z~LZ7lLbCZXfI;!V^?_Xv_~j*-GrxdHne%CqF)$nJp-EM{)Qg1<+L$B|>l
zJh_?x`1{RKSLLrx{gVq0flj4^4so){<q62~vhwu_^TY5DyHM7=2S6C>QoVi`vc{s?
zAc_i*e8dHZj`H@9B>MNW$3Xjku#(Yf1Vw=zr3Ez>4<61ygR&M<l3ft3nCW)9>6TiP
zqjCfq!j)Ed;`e$o@syyB+S_ZQd+*0o>LQT0I)ssYMG+dE{&bEJ?4h&{dXfOyv7=A$
zfPKD0vfA$9!h%B2E*r(mm#BaE^_zU-uBK;6_;zT7FLQDj?cR~_jeE$YQdk@G^~CAZ
z0vTP+Fmw^B2qlAF6L+Zy<dw1&9F=|t`b<)TovD!HazwJC6$d15)=`2=?e(vW_q%JW
z+XFuR9xgPj&~Qu`XN?*}mt@@qjIV{F)t?E&0RL&z`|S2YvFtru?_q*^M5v%D_Lm4r
zx5vS}Pc3###xiJT#$3|Hp@FTXh^T><TB`p2WH)dLs1!iAB9tf06Cnq-D*4*s!xZBM
z2#qY!9ol6AD2l!34R$nE=Pp>j+@B^1OC)+&;$2T9uc5Y;9=klMge^gE#(~OcKcnf?
zS_ND6@L?$e0=i^aU*@aME_-gY&nP^NorhkwkIJ<`!Nxkgmx5!yAN`IZgxc>i3&H#C
z+m)36_L*x+j+4Kal}0ChoYAS7kF|ciQ#m*dmKpo~{lFS=HzV3l0gb}X90?3$SfD@c
z;cM2ewQ9NUN8!Z@{uto>W%J~2NO5I?pZ|Dewz-I-BCs_ylttAcZgxvps)Y5o>YgVk
z<nKY<E5Fu&tbQjFEu!#CwRb$6g@Z&_9JYPtA_5}M_IS9)2o@OkUZd>9IOyM4M6TCz
zEDGCnR}547&wHZT6^A&BsOb}WF*Ed1-df3a9g$J7c^+FF^!m;Xn5d{@iDA|6IkBSR
zc6#S$RdFS0CEzBlJ~4v@RnLMU`{kFvxpcPiLn^|n;{*X%5^=ZT>XHRx?S7(}A#l(G
zJx0q4swR)uMoW#)!ca>#aK-?!f$a_jGkZ%;r~P1x&@c^rm}xFd0ZmGR#)0JL9e}R+
zD_6>i>{rB;%B!(ug|<rm4@mCUi~^Iu#=fUEEKtpP^SUQrYd?#bxVM`dE3HwKq8zqb
zQNo<6QY6U8m>aqHs?Ih=OxO=xDAGkL;_Fdd69q|0z$Kilm|qPzi0SYFol0IQEU{AA
z{Q;eG@d?kkE6}{j3*q>PpbSa^N4Wi5if#1#a9<@aIOV?9F&}eFTaAn?*D)Ag;JDSu
zYSke|<v<<3>|ILF6Po7fpygVd-%a3}p5DJpa;y~bAmTpK0sz=3pgq)RfZ&yrb;a*<
z?mB~rIZ?f1pGU#bPWHF@oZ=629oii%I6w66t#CDha?VU7Zwf^dXdi%!sR79!{N%2t
zK<K92tz9<d!>Aqy1<!2SG-<F0Msia~{XJ2@BmgCc#mV?png6k{*vsnkdm4)bQyAkQ
zOpOSP1Q9mbsS}i-d*acirqU@EBM;O_4s^u($mr;H4%_l&&SuL18PhX=B+7-MHS&K@
zvDpIK>KCC7a<+FGh(y05T3NT|3G8~}zk{ia#4&?UR23kK-$wyj1jG{<z7{lkLpNq`
zfMTgT-QYY7=-fO-uAi(Lya&2^MSw*%0}auT5vF{~J<alP_;m}+%{u|9ZOpurFi2dT
zY|n#4;0M{<bQAYPjZPLUfQ8%dwanw~22u4M1tCG{;L7)6<nz-BzvPII_HtbTh%N$8
zDOWU9MmJO@_zkYZ;4C0F=0pG%Ku;GQYUj?Kt54L-2QwM#)-aZ;QlTiD@4)5!Kw6G)
zC2)|7<n|e)1v$)r#t%}7Mnlo6&zzg?EHDi5`=l#zRvFTif)w$tcSQt3on!*7dw-Je
zdzyh_n+9K2Jva@5z-lIaL5nB>8QHaKLgD`Q{uxD+w{IWKl@<(a!S#WLNisf!J{V$f
zN-K9YC6#Tz3MHbp9>m^JIkEwnls=73VsfpS5TIh<qN}&*jTh6Hi_uc5Ypiq+fYS4x
z@`RXreoylAd-qhv`Q5eF?NQ`7UL@NzNbokOwEZYtdzc-vWC{yJ`@CyrH67pm#|2o|
zg(SxZTE|(+9E2bl!u+D@?DLqPAi+`b30BsH55|;hHi_)DY-{>LL&H`=a#+y%@$&aE
zDwNvj=eVZ{Yo3)TV<jlokd=ObS3r~|$t*AX&LTN7y}Zl=o=7ojvYGh&Efo1~(O@I7
za=UAmi~LGhFJa|GWH)6>8*wo2gB^|=r#e3LkgbM7|34)ZEPURLZG-T71P6CD_w(qz
z^j1{6hWhy|ojtpwa6zX|$-zTlDTueWq?#x};}h{V!9Qj4bf;b-hJk!0Tm;?lXG#3{
z*Z&^v4A{RH#vUOR_AO3FM@HBy+9dXyOLKVJ9bsCqrraq1y^rm5MNx=`9d=%jjUXap
zdGB)Q1A~IFeX<8sb)-f1M7$(YLWzYFB;$r(PpvsHkUAJw-(umYj&u^MPS}KZ2G;)*
z(+|EZJRgpOC;-k9MIWkH{I)pZ6bA<nAKv0=D3Ni<G6jA>%Z99=mk3*mBL8YS<Y;vH
ztlV<t74PzT7ho8{4d8qK?_)^qK&ga(0-AxrIE60VytUcQ-Fx=jFQ^iKvSU#tr@Uu{
zE-IY@Z3O{n(CHnm;feG(I8BlWaTKID(X!329}sOw{Q;R5<ndZ0FM=V#hZ`HWfgE!<
zBxIP=x|h=l5=AQ@d$bxI)jF1k&YKRxz>7{C+zZythfK;TM{;kyWmqX?NIG<e(AORs
ztp1OkE=ZKXHDtYwB*3-Zp4QX*Z#C(+zyUIC_?0X2G;@e;TDQ({`Epe`Y0td#3nq#3
z*tX9mkR>8e8Wgm9gcW0*%2zOsr3hRpQ)Dusp%izANz5Hr!$3S*V*S0ub2sXVGd+F7
zhGC+yPWXGmz11f~=<dm+n!B1FCE;R_FG6#GcWHFEtd2_~XpLM<98qBC;xb-b4W7Ch
zB4i!f-y*vb)mcN1^6lD`O7%g5rk9P9R7KTX<aCA%2D)!?7Wz8IdDIkRH{@#Wg4`wQ
zuPD^|i?o!3PRd*MQ`}2GqlIu_sXpO;_$D3CJoN<-i+dc=3QBE|?z+wZ7lL0YI8ZRW
z01GS+w-gNesn{%>Po{_0z@(5s`vj{$;!46urfe#9A2OFqdC#-CrQ@z2V%xpqx*Qu^
z2Q~{kKoA09HE{=t+=-kC154B(D~Y-CDmpfDf`h+u<!~p-qo!0fYm<HxM8Lu#GR0uQ
z`F_duA^MjTxB)a_t)(Sb-ri4SIegXaUexra5#{5>?n+E?#ojv_3mkCRch-zj$7&F%
z!!QwrQ`E`x&Sx7?_!YLB*>D5@Ao<FBa6`}$MbR5`jnB%+QaldnKoA6pOz+|Ogrcqp
zhEkiVV?-=0f)iq4adlVz(`+QxcV=M+Svg&QpH*|_bi(>*ldX{8w?wP-c=`Pa7#A$F
z>}+UQRGD>c*>I^eEuFilJ_@%X+zlQtj)4=x-HF;Hz{MUBbW>;iUw0#-hJW8L+aYo7
zQz=(GfANA&AR5O=c>6gmmzd2dR)Fl1SE8%niH+ytp=*PF?*SCML(P1XFh_#fDNw!t
zV}UJp6txrdcoA^1YLbAfUql#!8LdPc%?CQ+M8z6=kTg}eZ~7dR#4Tvrat?%=8o1Tx
zHdDx5Zr$p|xgf93q|w=r@TBbRjV@wzIW1ZTF(FpM5c&Hn{r7-%n>YIsL*p-%W!nj+
zGtm<{K-}!JeQDSgtUn9|T$44Ug7XHpJhyAh#a}6-)h6bJCp58kq5%XLR{$>N$xcNy
z4+~1`5As`)<TDannsvG{iL-b_X6%aNgX_q6zzGhMx&c{<P9J)UicIAclxPrvL~Mxx
z5`8(SGK%j*P8c_voSYD}r?`EsleMYZWMs9^!vm9g2;5QNDgrDebQV{g7Z+S5JPm1F
zvFbyyG0Ly##zKupn~T=ckVf46{o`KW8GvPo6Z%0JP{H(%-SNGzD2~5JUCEfyl!?Uz
zR5Ta1Zn^n4V#92~r$&PMqE>AL4FuuI14NufOjAX>xXDA9W^}!m-P)lIOQGdv_}S$J
znP`ZpKtn@NRz!ao;xKt@7bpflu_1#$d+uD@Rql<W5w7xrEG_s!v>mBTa+^TKq9x4m
zEAhBL#eeVK(BPs-nl*7TL&dB@E5a1`6KQzSE%fm6YWoBlkZnA>-x0itBvX-HkcZR^
zrUiT_-7T2tOcEIA%0@8BVppkIVY(=!i6B{kw<M8GP<e|JaxF3#EV~D%hJ=E@`A~5H
zvfE(nh53bcVKpmq@rUf`eWsFcOl+)LBpBmClmsMUrz9e=%Y&YD77vXYX$h?WBx^KW
zD<Spkv`%>qkPl2Z@#RsyilrY|FiXTTA{*f&Fd_^UXL<U@jl**@B$un!kGnP1a*wLY
z#)WlcJDD{3?nP`uQeIjSH(TsfzaDyI*>qGP?`TBlBehndhZ~z4jE8er5r>=%hE85-
z0A|pJZfz0F6C(=rMFf|uK#_~Y*q5l0l7`9j(&@P)H2o=EVz?4v&F%6Cv!GY@1!5Uv
z|8_FW``X6?q$Ud**Uw{T!ti4v;(*8@aw^dcE6$V{L?PM}0lEHPkfr?dkAV;G`Cs|G
zDoiKX1ZDwSB|=$<uH2q`n){8W&}+ne)P16KQ*d*8UQ0{l9^^OP)1xO~L{ce1n-&-5
zu_OnPUAHmdY?w^heOhAvL>VO>YxP1FEL{pwb})c#8kU-zhV5B>Vw(B|&Xpi=A6nJ?
zZ!W#qw>^mJ1xpnrYXi5M_%l{SsvIL+#Of2XB)94+6beJ7yn_$5j{UnHYl6LCSYO_G
zZo79o7*F7!V2@aPE7WC_*cIr#Q`r^xkcW>Sm!_!2N!Ux#@$`-l9h8-oEqOv7EmHIu
zNy<t@9)kJBggHOLw*Gp0$4l3WO^%o-9-Qs`)I{^22Qf4fh|obqMkGP>HU2pYH~SxZ
z=p`KB#fucei6sd|Z$`mXZFrp8&=Dc(ruJMjE`t%}0g)>TopnKp%r%yrw?@*}c2jAk
zp@2FGqPe)|jCSpW`$s;-Ycs_M;Yn)`N_Yx~*GhVsL<=(&rgP`caei7InRQ-L)zQ(>
zW>}3dRHTdE7~KsUq8vJFNsMY^7ZI&F*(`LYfP7_DDgvLvZ8y-Ca8HCo9L=m#<GK((
z^~xcxaSW)NxZkdhDm3{q+$&6rNT01V3XTcZ<+8ZeSkb%GEc*t|(uO?^9q~U4I!;(I
zSYFkGDBqCDc-^r~1B4WXg)+KInX@bct~h%5O-|}|fHEN2XXT#_l`%>4G_t=FRtQ)h
zd9-JA)527sd$h&D8tf5tY0(j35p$wcO|nP%AYq1dRD1tpg4aL>py$Yxs?M<ZzB4me
zw4+SHU$f$=VSoQE%gs!qTWlY@<2ISq*PYzi-z!`rTdTv{kuTOK7-<}f8+NRsV`84f
z9^cdveu$d8{sI`#qt?oDOa5TiAJCN*X{R7aB3eJ;SC)>jB3z&8BU$bOLrY(cHgh7v
zZE50#*vkS(aDKP~f3Aq1k4Oh)A#Jo_l*1w5T_rH@WvOlkv@3bz?W`FyH`cWVA+e#~
z>GwZN!uwwd0@QL1iJ((*k;7P#b)w@-glk-ORTvIfb#T69_uUQ+Zn8XfX%c!M#)O}Y
zMDSwxqm*!9q(oLUP1A=n-0ZrXAo1aSxxFpeVzzO5kpov2$Ulq*Kavq_;1_=<Wk(!W
zoqJiqPREzk)pgCbov$9*e)Y(1>xV{t6+3rs-RejzMV>wF66QxGL~oBWH5;~K@QQk+
zMA?ePla*T;wr#tnQ{N6=1BZJcDX``$w2jBL3U0xDjQ5!I?dVZIHgnsAj9Y~<^`}in
zRqiavO^M-b_UhgHll~D!BP7+5N@tE*<k%rHOSzo!Gr~-HczA4ln{<TRLSU4RV|{>k
z3Z^2=YxWHf55JmuK=P9%Le7TJ|HJ3cf}Kn*p`f7f+pYi2#W)Hw#7|%VB)h@5_0^j<
z7ZzA`Q1{{ov)2s>9&+<;PW_rkr-LI5q~&}0ojKD7EtOrw9Sx+3D{ngb^VA?np1)Ag
z7i6%v1qE@ju_YA_-r~u3%}pW~A3bwso}Jxb3;<Bp>ye8bZ*cUWf&W9a^*UigcR^<)
zIzPH=P-pR)iq0cu+U(QZvMFRa|Ewt~NFD>ZxnJMDd!Q!%giX7ZOT-D8%9e8f)_|W)
zK^Se{8h*63W9<qG{~`UAK2$_39=g42Fm<H5-@nDe-5^>yaPZ)d@81RBE8GQ;$KmN}
zJH03t^y<}1OI!PEg_h)NdzEM%%~CcudXu&9(EBf6GIgF#Fy66aEH|ew-7_$!U}nc%
zyCx<M@#iF8I&>P^4ms=5MLwI96vT0S)OfLED<#RKq$J^TK{xKA7Es4@D3MMC7}Tf`
zjfKO@SY~GC?0iY`Ax4fe;?>mygCqJ=Bq;jwWjcjZ;)3bXraD-)hQ`KZeO=x&B7er{
z3yPs=cJ2ks;4jFvXDNb=962&_PnUx9r%wYWH54hx_0&7S+?bO{w{N4iP5)4{fCORc
zi%!`sTeqfXWnE6r=qh>mhwC#~CI6^lIs=1;j5k8#B}m^{ler*>X`c!U3#*WyDGp)(
z84DNr{X?I@Y@!jm^+cVNyEDROY9eW_XuGbfuh&6%!>z8UJrduKz+VJ&80w2HL`%HH
zYFpdff+AVT5!>>(<%k6aKesV;Jcok#c4wKEucH>Wc<95&kF8N2V#F~-8NOD9jD>|o
z1C)IjgnPlc<E=C0Y4Q+Y6T&PX;B~@cKF;7n%XRBs|M!e+d$E}x-sr`TSaPyuon^AS
z0F;QtY-k{jc-kCXE?TnwaP>XI3ga`<)4x|$tpR?a)!2s>5d81IW(~w}4A9g(ghyw^
z&ly~8xmycb-e8l>`B96+@r~FvGitf|_~WzA_rnk)r$-ki9_0=)8H*OZsF0r{W1y+2
zDY6hb)lptdnu%j{D0Jexj{=8{0}D^3mrx|Ds7`0Rd2{o>?Md(Z=Sa3JoVv^3@Y{(1
zw`|{jp1{w{%nU)Po4kSj;IU(aXz2>1a~fBB4;{-J4n?g2y+32HvrZd?5ZM%$l^qBU
zUheEWOk8yJyK}@1@!yc2HH&{TDSDLnWCw>Zh9Fga8|W_Ea+?n(l$}sjQ4wN^DK6DV
z8%N#i{q)tV<Et3M>E-1mS~T1meoQ9{d`(XzoN1itF={z~fn-<6FA<Al<QH=EV8^K^
zN^+u3*w2&4mKDrAo6UgXR-zNIslKpzE3SkvQM?IjnK5%eYdQDv#9*)|!j|{N#hxJ{
zX3o9?y$l-~8g~4+*L^374RoSr<mZP#2yy%O>mYK;7X?LsY?#K7>7C6v=O&4^7vREF
zG<%n<r^|y#<=9_;-Nky%Ao+lMyp}L^$BrGA_V(`z)WsSn&Z)^#ju0ItL`{V9K%XBz
zeHvJMFk-s)=+SG1xJrFJlhK|at*JWJp|2QpQSnT&oL+Xv2Ix#Zp{1j9?2*Zq=~{Yv
z8YoqAQyC`ogw=oe;DKmJLLNh#ivp|R)^y1Nx0M3HfkX8E{rkyueA4NBb=8HEI25OL
zP@(tVw=V)YoHqkIN)}Sh@Mv(vZ0_vaA@r}H^>8;ML(KjlKb%K?&)CYYgXW$7NZzoP
z+^>q^B;c6N&Pn11-1*ovtK|}dE1;Clcmivp{|<e#i>Vbk{<(riDkkC)K_^dsO9&2p
zsFz(Z!N6cAJS-V``OnNMT1)wlYQs;8O^n2RAe5PCCGlX;NQ%qDGqWV$(lu~l<N&ng
zp*v8s*c%_O^X}cdg5Jj^cSq6at<D-B3yDv2{u`Am-c&|G!IrR_-d84}Poj?Y$XLPP
zZX1OOX<>$*v2k%<E9OhqYRVYyu1e3I$BrMLhAJgvq3=9;^mlSbiw`k+YfS=#isF>0
zoNsFJ2?_W9`RB?LYc!igTzodW1m?^8$(}z)w;0z_13ktCnk$*v9$Ryvr1R22Iz-UP
zWt}v=)cHyx*TU5bV*o>9E6IkC7K5LXP8@~QmRv<eMNmk{pS4wt*pHvXTrkrR!v&vh
zWc}KGuBL=@L@(Ei#hXv@h3SLP?6Q&Cw)Xai;^X5DA0}`<IB9gp0Lh_^(?MST{=0X#
zpI9$iw1_FdFMH-=K}L>@(aXeq*dSiz;%{JUYb(YLdj3=s?LYuwm2ng|?}SzCfmqdB
zh@0)NXC#>@X>K|c8W?zwX~b3c<vVmJ66uFrk9tbtVW@>x#xdfpsG{OZ?MZ~0fMH);
znG&fk44W6G>q@^xLv`+jt}l5OzQfhEi;~hE&?1p*F@La#!~rbO1K$Q<>0uOp*y|Rz
zT%U=tO1y4m$3uz{qQOr9Gn`LyZmh2O;3wsC>%^qp=)1ndjyd`CiZYEFx9zi#Y9J<k
z|MBC1LVxbk+iL2bk$?Z4{_tU>`NM>$ar*k!2)aItqvIv<ua(KyKU^7jqdRl@hH2dl
zE?x4LafCjY|7+uFn07W36}03`U|1SuZ$5GLrZO+Il2Se*a@;6d9_40TVQgm*tx%LI
zn4vX2D(5Gv86iwx)ho7ut`89R%r)gpR&a)B+0k1SPgMGgiKQ=4Ds#<x^5md_YmKFi
z%`2O?0wzFoz$L(dFj(6B7r{Q=ZZYpl%S%cgAcdCqIgT8h?v2?aD~{h=?*m_AePCAB
z<Hs2V1?H7;kW`K#xZ*T$Rwwm`CfNfKjKq3m00bWm{&46spFu6ZgPhzNt_GCHgW#HC
zR4UBBos6m#WQnGxrsozWnDXA!E8@ILmd!eU6my8ilxSY<)~y>7HIB~9KBysZ_yokK
zN+S$ay~hZT`9?;r3&~7Hrx^?P&7cNlGL@IRZGe<6n<XBiVYUkvf5r@%)cS%-F24P*
z_(4O4_>%yo%v?Bc-aLd4*FF3AiU+M&G7^iX>R3ib4z~I%<Xlh6I#;e-DL<{^Gb}o9
zucv4JsZ%NM;vCj)%{@Hed-aM3s^eANwr_uwvnBt{8z-0NW`4=br#D;lxT*5VrQ`Cn
z?}d*`mz+^ual=XfVM2cN%6vMugf64C^VF>)1`YD9Ol`L4?hL*T{{(@eXJJp@9RK(x
z=QL%O)@|D4AE>9v1O2}-E-}I34-@EMFmK)A*}z)L*f#S=5eqfgM!9-*9cKFe!-wZK
zZ;?f|LFUorOioNn%}K*EnEaiByy4Q}D)cuh)WLez(l9PZt{B*E6$;tcR%{B{(r1#}
zDb7Gi$~+P=xmjiFbjA6ua(e~cP<l|ivk?*e=}zVYoVbrgA3vTOV>)AopP{>gFr@gd
z>UY*|Zf?>6ji`}ao8^*x<3@tZI}lO~*T$t0FBOH$zi>VW73{dTZTIV2unkyiW}wEy
zEOGn&sk<m{VPLhji8IFnVS##Rm)J~`xZ0`-oB@a{qPb?)`PDc9={ZduvQ_D8`AzUn
zZ*{JL9(p07KhX2`F%qEzT+ftDAs^;>G*~zgYNFF1r7s@(nmQEn+zm9D-GB0={DS{I
z-dc2&q|8jlbkN(Y>)H7z#L06AOjpKX4SG110l>{<7+=Vx29);YPRcDE8YoI(E?a!P
zyuLFyF>(3!Eh*Nm2#wdw{~+WMz|X(lYebwfn0QlfP_Hgsy1Zs2>AN+>3@Z(0POV{=
zH@UN2+`;dqmz>;H^V*h9$8U31-oJORsI)X6JBAI#|HX_s^6s?PZ-yfknm+W+_fIyM
zdF-@`^Yr2Kfgql{y7%l+Ti(kxc}olyotu<WTPq@bgmOZx2MrhyOFWhm*C~2pvc;z!
zD}Vp)`7_a-8qlc;I~P`tt_PxdLL8Ofu)J{Y)^c7kdc{KNwr#aoNYAFz!1cG57;WY9
zNDs_~KA5q$T}S&NH;x}UatFrP-?h%q($fW5ydW&M{>{^Md&on(C@Ma)c?+B@F%I^|
z$2N@U>YVFV@pkULf`pD{_IF>rI0hesBX%n#rPImg#Ds)_fC{ihXMgYcb9wG+1URFc
z<a=DTjQo_RZtPQ%GL2sGQFbsSH&bc~G>thD-qnvY{bod;`sBjKD>=K5gqyc*Y!p+H
zo2;LW&HaUfdBM-P-XfrQjJKb)_Cm=5PN1NuwQAK0{*?zq4g4V+iGmT)RR)5C^T<Et
z<>d={|BXXyrP8!^i65jWVtwc?{D08&&OLg#%>MH5(WCT7kM?1UP$p4RQ<EOp#;Zi`
zpMU-#IE>G^8yjBvkQB)@r=!o@Q<#2W<=KF}B#YEH1h##4VI<&k{{aKq_Usu_s#f~-
zYh=i!%YXk3WA1s_!;)?|>ck1-q83JCQJE7t-?<!XAU#hGVK=kN(WtxX*ln$K$!s0{
zx2pX|rfjZ8a?9oE8D0&SzDX_Q)Tt~nDSFI}sQH6;FvP2qOUy1tGKzy`a2<Tc^QZzg
z@FH`&E8`9yR%8$IdnFJ2?XdkVt1Mc_Hr<{_bDmj38G~}POP1|%r>8So!>4_x9$o<f
z0e8C2T$<SkL&g*QRd3L7{^)O_+$$$^VL}&$`>NeMTYURnT)BTeFwGhQDwovRKKFO)
zXTGz4J0kVO(f|W-wT4z;>3>r*t2(+Eut;1(`)y^cDGjXqP4~+QnAP*>2C$Xr4{q3%
z{0Z_`ZcX1j`iAMMRr8=6pzB9~V!dX~%fBD}vRTpOwyJkx_`}f(3IX|Zub7AQ-7s%!
z`J1Q1IeT~R+!6iB8#Zh(ADzM<!NLpR=-R)hw5zr?Ax3Dw{c>t*s^OLZHG>|Y2q|Zx
zCI4ApSN9#qcsAQE_S&_(ct(QFp1U^31I?ONMsBFxeCtU5%q-YsKWG&zIvx;k@L)=G
zT;&pB;XZ$RWb)d~xM~20?N-(E*>Zg*o+2+&>eFW)zKab0Xi#?%wI;Rc+P!-`ZL4_R
zuZ5}2BL)ifUS9eR`7YT#%tP|FGA}!CtphPJBRh5*{w$=Gf0u3laRIKy6JjPU%9_L!
zYtt@wd<I;S5~(n(P$;s3JSQ*>5`%d752zl9m|GRk%5?g5OTMatHYO+QQ~J%V`6nkQ
z!Mw}m+_^)sB!I%I<F*dKTil$bYMdXr)5YZ{l%JA0AL@VqR#mR8KJ?__L+!egnFmx6
z26<M$yytjTHD%?ygp*Yo=Q$>8zB}h~I80|QGaeLZY;0`SdjnVZS>A|;jJl5J%z3(*
z^6wj>>^g<IGJ0fV;NVf$WD8%t>c-}zB$bzy$=$T!T;WryKRDq4>{cfIJZWC<(sSYO
zj)~o!AGh|7Dzx^wKGthkWwx;JwTa;|Ee5cC`}U<v)NsDnhs~@e-Yz}ZTSdk0S*7V?
zqvIQf>2zvLffc0zl7tIY54`Mfenl;;t(Wy>b{rVyj;FnT9n3~2>Q^u_UWcg|rX8+c
znQ`UGnMtWRU(Sc0Nyv4Z>sj<L0qk!SGaSpx%k$00W`}llCR!nTDtsO%>;2(+XV=8>
zS4vin=-3($rjO*-jA4%xr*_|og6Fc^yT$>#yyv2Pt8nm28VIEivQ99_Sb^*WGK>#%
zV{M-F`26(DJR;(_>(?*bzI_@6No~gxQqDPknu*jJsb{($2@ZZp@Vnz%<i$7U3xbay
zzfZ^bTuaNo0+w1lRM0(U+U;{aZ5oB#vo}=7tDG|c4QdDi8Ke44cx&ab@T0}i;NW!*
zt<@(@N?mf#e0IVtvlX51FP&ElMUX_!(7N(<{KX~P|A7N2_is;^6l=f)2S>+^+sa5m
z$@0&gn9N54%v^rbcFNA3<CvV1dfs}oa&4?thJ%BHmfh0?G3e`{!Nxo-CRykA(w{Wx
z47G=ngXNz;OZu=pqwHelILrwibuTPMdGnwON=tn1%WnoYC8fKWt3sD-)uGK#OLs?Q
zIyt_T_1;T3_U>opc_JsnnO9giSWPPiK9HMJ)<J#B(hZT)wY9YuMSoCD;w<3+L@%m{
z1t7M$b2|-A(j<L;p%oa<ru!ly*S<2>=1pE6hWarJDub*xS#QbK)P|oOxp9_k!!7QS
zq@z`NpHrx4e-WftpeaIm)rq^ty3^0x>55F35B5t}{tr#(0oU{Xw(+clB0Hl9#|#Of
zM9RtP*krXx8IhzyWmd|_9v!kGBT3SvNRm;KPLfK?YKs&yp7+iFd0t-s|9PIrso(GW
z{e13mUDtixi%j7}A!>1~+zRSoWMm}TA0dI(!22#cVEG3$h@5zQe)>|{+;eePq6g6R
zJ?{0pgS@A1dS<$no!yQFA-yRG88vX}GZc4El7b8+v}n;neBJ7{7%fWTq{7u60}YYd
z2{+?M2u8l(G59ylGD9HRs2R}oY|jb)A2MZdVE&(8vS^UzKW3iMcM}pC=<Dk%fQ<Ch
z(z4I0hE_wU!^%|B-HVqmiwX<VJDq8sUfe{HNBnrOtP;>!`ly|$sp%JB_zegbzEV9h
z-gSmb>Sp2}+|b~d)}FTJCSZz<kN@e3&t^o-wrz-<CqiEw(Z{10VH+LFwNG~%wJNBn
z$m5qdn$Zat(%E(I+t-JsV}dY3f{O3PsI{+^5Df6Whm0IGYLZv@@1MGCgaC_y467|0
zwrp5u=ArCo4y^14-j0&2;Qjl)_4j$w?}x43{MSA7cO7^VS_20Hzo8hqXr4Ua@MKO6
zMZLnEw6s<<p)SeDh-7x2CRbTwwAz#@k6lg~`><6}^r*p^quEzaCE?;aam4Ggvd2;{
zfMmisi9G>G%XW$hZG@yjQv)8cw#a#x${*lF+f#s(8JA}BA<buI+`D(~rg59IQ)s9W
z432_qfLK6#S5RKA<@SIV^}ZcjbPoL2u5L8iVe{bZyLXK!Z`pEkpZ_^s%uyiaD2&w8
zE2^yY^YZ%pX0h;jn70e(0^CzjQj%Tiq!ebCrUCUe5zs}_ch!5F-iDzTacV;rH>N&^
zHx8%pp}4t^bqykm-g1P2V@m&r7msh%V`$0Q$2V^HV<Go!_1s*n%-4$O8}t26Vl71Z
z)Q_8f)ME)>b6tN#Hg$`NPCkFOVs3Ee%<0o%N-rfONAlX~(sG||^2cgbIDHO5;hQ&J
zMrW4Qjq-f|aFMzB1~5)hb_z60F*S1JNFGIQ-kINEmyd=?f1cj*OAij&1Vv{ZI&|pi
zO;-mh7Vsu3)>)&u@MK@_ZrT}sRMnTvOtR`m#YtAptA9DIS##&rD~;%9A8+H>YH69T
zyBAdilAIkRRt13PoV>-lL`M4DogkvEyL6et@DXpnB3@CmQW1Mz8&t5^7wjh=WJEdS
z|AC8DjcR^F>2E<_jdBOjC3*)34*(W-=cq!`Uq>s91mq5EwD1<R_KHykZTVPNC%k;z
z{7hPU`i!F~TK)Q=xX3+S%&X-{8i&knq^hR&;_B-upjX1?%e+HE!d6^KwXN=KskPjq
z>e|}P%}rx{!Pd%_#=k?zy6b=76)GY1*lDqH%3}2V`?YUWWKmpLDB@pwTU&Y%lpF`?
z31EO9A>fwhEJkKPFHTOMDqEU?-L&f6>>tgmYu2uP7U43Gk9Ors%IRW2-iK*vP4B#X
z`7ii`+M+r0EiKdNG-*}30nRT$5VBjtV8Labq*VYrwuoWKI$)^Fy9!LE<oIqQZUD%X
zMt``|NOXrwiv6-HW>#+V8V7iCjl2d3j~ob$_oeyY&kx)y_p!lsq&GVK2@{fArdGB{
z&o(kPc64>^GkWw1R-qo)w@*|)v=u^G(i}pMG5Ds_Y2fc)*9ExS`C$s(;r0PTLthQf
z;0rS6x;=UK-o19{LO{pa$#f4b=K1qM(W}5Vtyz<aRSGdUR{zgu)?ED_E3MPw)i+^+
zhWm$vminPQ*W)O5A48D%k8sdH&rN%L1pjb9Fcp`8cP=?8&$F_^-M_9yB2xY7(-K}C
z@#7d$u0jrS<)>!p<|hII{1LNA=kfA}FXWaeWWoG@$$yK+bq!~y7;c0T|EB&To}a<=
z$J^9OyfrJA>gwupCxB!G6GWGSEFvn#>}%#>e<}{Zt1iP{fxmvG9~Bmczl~I2570-e
z7A<z=#CLFXnANvk>^%@HIW%=W2%5S8a-f?Nu@vvuzdyP=UlO@uD~;T%9YzeCl(H}+
zWi<L{f!pXO;G*||gn~QdoY+lmG8~Z6-Q8VH*Iq4uQ~f}uSPD4lAiVOoC;%nE&<X{-
zoO+qT?<Gr?kSz38Da(=YxoQhu8W`A<6U$A;c-jcqed^SyBSr_^xN!*e5CW_QO3j<=
zT)eJhU=Sc|2vLk0SaeiOQSAHss8O!?>>?h;WB5KeQwPQ0cOf@XFKB*9bTC-kkY!K1
ziz5@ihlc@0IMd85>*}YAvoGhY7<v<BwD=CO;uSa}c5VP4rDDucRoCXU2h5y2pt=3R
z>fuL_Ntc3yg{~#jZ$USPER`J=|6E9d!x_cn;Wu5(ES=hAg8cro!RC1@BKZ!S?Bip`
z5@J|#ZmvI#->f^J2-bRD)N1vGk@jcKo^7~!^JX&HxMj<h?Wkzg^IlwE{x$=QI!q98
z%%%`G*LD5<vZ5zUS{k(MAi<@l$!;sOIuhK=%T1B;N<Y}A&G61D)b2-9>>ZBJ-(!-9
z%=trYZEqMFQZH}f!{^V3n>BPa`%qW+j_IP_T3Wj)0^4-!mWCJXS3tqXaC$PoM%Q;Y
zGqXL>pryJ1OoGNeeL8(H`>k5GJo0x#?sCU{%`7$VJbXCm=x(h616B_8_h-3p&fpCG
zHNNBVP5Ed}m>@g_X1kRhj&7zCb}X>eiTCdZ6%QKk<2IcfgNs}-sS()W0D6lPLb7mx
zT`}#Ux)HTL!LR<96f3TLnI~b4jlr7;G><1_g=5_};NN#l^WngK=*}HOL3yC9^p;+^
z{r$~gf{!eN%V~I_Ck%p`4NJhjQ~yh5=6I7G5Y-R=^5w`6ovjNmr;G-e1xq3x3io;?
zB%~8Nr2knszFB$(wHO@89(ZAS?u@%&+~WgLyY}bz4H_HYYiex%?g$vCZ=Zr-1e>*o
z82(ENW}Ubl_z2#OAN`K?k_fm`?_-PYss?x+-Ffn)r5L`kkHO*F_u#t6HJ?5uK7L#l
zls0Yh<i$WSkPc_lm#OL4_zcSK;TdfZA0IDqlZPT9LYO4kD@XjHrX%;el)7}yqV~CT
z@nQi$5*v!zXlP6|G&DSVzdJWA*O$q9s$(dJeVkjS*ysrebx8I2!OH4-hp_F`h-xp+
z`D1^6>4|P0d7<}(lN&E5|21^iu3gBdT2N*%gR)c1rUH(n2tls@<`(6%VlySB?m9XL
zz$#C2XGz8sC%Rw01h+jFHPb(4^RG=p4_>@@`r<`&3rY<eHnh2I>S#uhSuEZ^q}n&f
z9pbGiqJ?Z{)h=#GPYWNWG-#xVllDVqsj{gv0O{o;3D-t)zw}IM%kg`cIyi*hJPIX!
zbGgo}^D~k6{zG^8w04s}en0L^JRwEd`|aCCp*Wa!pdy-i>96rk<q7~;^yFnO;bAjC
za-1nCvC(8SDD}qj3?*gddoYua7lUETKpLJ($3Ry_dBu-<t$&c{<u(_k4|VHX2zy&(
zS`3wyG(W!JV*~0n&P+zI+2BQs7RkKPG=tBx_s&;6m$_m&2o4@>%+cl*=C;{mBD2Iq
z5n_1VgQ-*HVInOkpq94daUh7<#}}G2=Ws%@Wx$<VhaaUd`q8{a3*jCKRAP$5ZF31Q
z_Bq*m*f2^OOHTYlm<w8#o+C#_8z+;(i{!5N@4r(TKIE?qWPXKEfmwosLBV)Vw{9m*
zfTLZhujO>l_Y;v^5BA>>t{YC_3>(SBcjGNPb~FVch^ce}ivX=0F!tB?@Im7`w{QP@
z0x&A7svuak=+)9yt5VTXmgGc<Hbzt%locPr+Qnf+E(x0kwugs19o6a~01D~Y)zy_+
zT0Rc3X(>xCuKSDw-Z!{FB~TS#?-%uI<1VJY@;q+%`ZmLVn35fN1FUQMn>I*tkl0OQ
z%fvf|X1m;`1HSpG4Q1*>Q|oImBSY7{pDxPVnV0@M?Aztp{qmQOQff`Fm`AVK%JrvD
zb9P(Wvg=4y%KC-z9W*yKr}4q=Ybp&YX(Bo%#`%jHFOoJ^ndi#LNir%Z_>RFOe2_C3
z;6zFXw?!>}=NlzzvF$l2@EbGn&fWqC2q`e4ys;CsI^gekUbCN{UvzA&4t0h`c<dsd
zllNhB^Ks@EsGDNJr80LC)eOg0@-?8Te1W_wH@nWmmZ{3O8-+{=%S>y6XpQJ7JD&U<
z7_U>{z{K2#tmGi6@R1RN$bcB-%D8WJRiQa<T=A0<J}L0yfBkf$dlqv#(VU*2S^Yk6
zugj-QpOc0M{O?*ZWuA*+fv83x(`V9}vBPfMq$GfnoDADq2;EqZl|UR4%n?RJxwJN_
znchfI`F+h0sx2vR5ih)W@j`~qBoZRvCCV-ZPaSb(>YpUm)*qCPcGk<Q31kuzk+aOq
zT=P5<K7IVS?x87Fd6!M^qFMPN4pkv@i;?q?1VmW@xPQH1=DMBe2#K2MD<Y7@l=N1t
z8V*knGtrRl8)`No*11f?^U{_8?Z0IkYp*|giFMzq7E{mThFJ`!DO)ih^b6={|9UuE
zQua-V=`iH?V!4ZCP{c3Hn0*JyyiY?2LrLMXE-flNJn`W}Ur;u*R`*th#+j^G(Vx3o
zj4Ya^H=gJKR^RH8-j<YeV+wpTBXQk(_S_DFfy7Js+kvOg`gD0|>AwNfLuvyBHNACx
z34j%jUCNuMA7uaQZ_9dCva=TKj-+KW*7DV>ZSkEu@U)$gkA~5Ll;)1-8+paW8E{Nv
zZETk=edB!4-^|SR)w0_u%%?@#XWylbQyaZr8%ACVPbP#P8XR~(Ov*EmMA}YHB){r!
z*B$?bkOvWCY%-oVPYn&94P6-po{SrEF{MPM2j<qr`Tpnkno<KmoCAvT@FewjQB_qM
zs;cvOR|tkTEf@?EwL28xF@JxWr^irmARM;~&j#ZTXKiPf3`gk0YsXV=7#NpS%`j7<
z-^;(&=-h&kpmXOuAs=U<%V&EaH6sT(!YzNr;mLmkm$L^VC^E7Gw-iyt%gp^(|C&np
z1U%(Oaf+5KUFsDY>Tpx9EaHUN+oJznY-{UHUB~GkQQpSMU|=Xa-t{<hBlY!*D=Iw9
zR&5m?a?tA4t7*4Iwg`*F;(6b3w@;Y$(GHzzANdCEgG4-&l?wzQX_AtYlTSa7H1b(E
zV*LjznR27vm>0A5FN#Mk^VoJsRG=y{D!Vk{L<Llp)(0N>b<{IR)hzZV@X5?Yi>^$u
zwWwD&2~5Yc^uogNB~?{>j~+D(c2cKrW#`&Z?A_R9OE)Ecq1F%fY1Y|CoOtifYZF&I
zY@;Wi2PCXiqZxhO?bny4WPhRm$m`cuWcC*=63ShX*3i?>n?K+A(HCGR(Wv150D2;O
zJ7_xS-$l3<a{2O3;0z(yzyyI=M4t%Yz5Q)@YDx+!g?E%$OKFuPB1v(UWe|Dg%5FYq
zI5JXx9L-#8RrJ7J9Z3C~ej-`Nfp0^GBO77Fm-E!-pQG&3PG$x?e(*r4OP4MfE2iRg
z)(4Tq>Wvz70>tu3Q>MtsjUDHXbX4+$e&%n~2D119QaiSA17SWJPE^nZ#IEP#<1JiQ
zfh|$`3bumM*SBob;N|JLxuzgB<cvWyAg2)vVUofjqL&z5^VC>$n<L_aP8tFJ<xy-W
zP@xfQj2A8RU%pJ|6LDOIhUNx$BGgJfHF9L`+H#}ZDGTOK$SXbSI(F<>8EB$dVu+UM
zmt7$Dg+)bofzyFwD+_#3Yvc}^|KYFkD+fL_jb6K%uaa3hiX!Sy0`&|D8OSFIxtxz#
zLmKIM4T2`i+ney+-GZ8hy82E0M4#2@64Ur7@*$X;d_zUtYv8~=?46Qf_fG_Zp?@@7
zIdk%?l)!>$cm46}<>*PJO@Y3A1X8D>t4EF=O=c_)hzi}E$kZ5;$-G*1mIO5VM@`0P
z<|etJ=5<5$5bxj*ejJ~k42EPe!Qw?hNFvc#5%Djr_}oy7RHV|v<~-@Scl6r4YT#m-
zMgXsV8c?=^h$*o^w2=@uA8RtE<*zxzbCgAC%pIJ<D)t#Nf+!yn-@lCe9)Or<?^EK%
z8G#D|dV>05e&We!j0{>#K7aSW|Jq?cO4lqBLv(*NHS46&rZ{{^`?WCO!lg^TxX*lK
zqU@l*|0HrM2-v)N^Q`RcQ|Jw)92iL%kJPoI`csqZ>t^3LPSFk$^Iuk$69Jb9{Brc$
zmyb-1gBx>tnI=-ihu51rbKRGYs+yH1B+&2taOrm_WceTtC8MdcWST*q97Tb64G`Bv
z8N&Kv^{x?wKEJ0Kw@)jjr-O|T3Y!jQ$RQ4Xw<vG%^T&@{P~yaG4!wBMiY0Mk?(1d5
zv_@xSPk4m9@pfC;Tr+kmefI`jJ><c*GN@4nn`AAq7dqEZ6;b&!7c6+$uQ)a)MrPaY
z@9j2?Sjh1ZVG?2nmq+WR4t5<daNrx-ZiVQZH-|E@1pI56dIEh41qp#?v9)#j<$>-D
zD^NJtF;PMRQ&drr%1Fi?8Y{*Qd+F;R=f3*0k_sVn3RK5e-sUKeZ|j1?C^+ZGyN?3d
zWc*?)eHOqGEo^f#fqh==VSWHNnt>uc$+_gaXzOxfvQXDF^<soE^~6Zo_e0%w>(di`
z@IUz#@aHfe8-mYXW#}UQ>)S5B^DZ;<A3rVz$YV;&kEDqdJDXODSBe5~E15Ek)Wq!D
zf}{iR8*Wu`6ZpNH0)~x`Fj*ToyMN5Ok~V}(d1r*~k6=^J&~IKnl@S1Pm^4so6oyl!
z;)k(`%y#=)y28|Wz))=AM>?0b<JHlD4#i)E#HinElV+t09~;X11YIoSpb}b%s8Kia
zuUxv+kD}^4O$}KLWAMEI3?wZSkLNOK&$JeJ<-L;kY}>bQ-jq}v%?`C38@G8Z$hEZn
zTz<L*A#CU!MEHKQkAvs3(1HRh`VCs=wN)N%d3Dn8u+ch<YDx+>(WUo49MWG~?-@cd
zp^TY+=Fx;f4^io%KwfrXPe0ndbxp~c^g)cHMqgj1-K*ER#AHZnv1*`!1aD136q78m
z7zo3;{Xsr9foBIm3KK}CJkhZ~YT9mZ+Uj}01eA;Pn=+C^$=s|(3#pxYj2_*UdI$l}
z2mD2#tvTesg}2DY;(ydGq%vBUx8XnfCHqyYHdMvc$y2knwWTS-SA1Jk)ChjzcO03{
zQJBI!J253GmFli{9O3VvVDhd{pYG0FgzPawC6qDH)Njw8%!*=YYs&-h*?`pd;W{qv
z3JePJ#-Kq4`v_YIx3Q)5+_FU>udXdV%@EE)lQJqW^Jx1ZnV5)J@DCL}uTVX*V1XQn
zW-2O4d3gbn2fL0ka(4dw&#BGmvW@!>&3N`K+Wo5zL4ith2m;QzAtyamvs`KWfuQ97
z#Xi6x4soA5Ffh^s>)#`Do863!{fGeJw@dNLm5-Utlavc_8lV~{_3!xPu<-DVf5zwb
z_4hw`&=}}u&0qf6Q4W@Qw@DZBBEg~MF;l`w?S?ukd%~i1zS8!Nc(s1JEJz@DJG+i_
z9ZjajIdp6_M;tyYRPrI%sotv5yc<G(l1#l{zup7dNu&*s8&kW8diQ-xz7dMxk`B%0
zFI-3^lvh|>Q<--hU4=}Jl8yX%U7Ws+A(n2ROTBL29t+f_a5eYcg>&cL2VCI5)1xCj
zbX>R220H~Q!=Eg*u^A7WAeWam4&D>_m?;PkIXiB(Jq#Vdiq3|QHs<<f(bta#kr2m?
z8`oG-5j{Cz0T}NTS8=oFW<m&9lL(>5M4KTWquFih$MoC4WY~7>-P;PZ5}Tgg9GtR~
zv*yivz-G{GWDnRcp4PT)+iE&2;H(Bt@5^YMH6KvX;fL$ZSL=&agaJjKns;W&qc$@g
zr%-6AA!-#?Psj-Hp~j6H=YIH_73n|x^*`<(Ur!QTIy`&{oyW8INj@$zNr<+E(swNq
za~igXROpg|k-2MXX(_W#$t7er7@4^53fjRcjxZF%?j9Z+0LgQ%Jn8T|Dh#5{inV$B
z`DM!G+5J&b0?Z&?C`J)WQ?hly!5)|t^rFn@H!Dac8(HQ~(~_8#6%{=A@7zGvoW<&U
zYAgq=mOgMw?@Ccx@$)6}P2G3w@&9W9<PwVk1AV5r6(bifDHdI=`I(rQ`DXtSbv)5W
zUnp14=h?p6{L2IN3hm>F?^+QEPu|E-FPp^1qa<y!8*~#F0wkNLuQXYA9BV#^>kmAo
zBwYmu3V}<mc1P}C0#%Vibo-Ns54WeXKB4El6PxKcuS0yvuIMhQ%a9nbfcKY58hYH_
zM~`--;==3&w~Ytj*3P4jL6XuiHd=IPCWC*`m19ovp~!0#CW*A+q55+tP*8%&@241P
zX#gIdG;r7M-8MWU0p>^=sFht4kBmT@c{F9%Ih}COL9~3NF<&2_?!$)NnDtY;xntB?
zM@QLx$UBFZR+M24o|3r9ASt%t1IT=>(10E>8fYJwW;k}W<M-=J9|Nh#8IPSbmS>rl
z#$-j+&nZ4`jA7tnq$|MBix(mvr_+eNIv7niK2ZFhWUd5NYwVI%-NvNQ0(ru!0$=ul
zaDuv^HXhi1tx=7F`B(L&yjcV)yP#;OQs%{N98QOA_2on1^;1%A+}ySx<ax!-WxPIL
z-$@tT@B4#?)pS}7l+H<6^VctSR#N-Lp0nUX;2hBS&i(s}n>YXJa`N&0`?58=;8BG6
z;2}y@OpRqFy<rFSXVgp0qo2*KEtUPS1DrDtv$j=PO}ii{%g(`J+VttF3fwtVBV|8B
zSX`Y|nf3Bz=Wg9j9k*CW=`uuDxAJ;v%9HL7O&uR|{RDDi@Jm2l(7qEVPP`obWZ$C{
z8W7+ZL!^uXFX!YeT>aB4x(R*EJ)$q|wA7j~*Qn{FUnZn$Q+%-08Qi{DW$d|7r~>YC
zTY&~Zrn(wzJOrQx<vDfz`sngbe+iwSKa~)~g-v|+>;MizaHU6V-clGoOigW4Q}_55
zBLb8sn6<WT)8;M+OLa{R*udP_nM#@qhLB_;j$`2VhEfw!rWv3Q%BTN{?4v45U!O4v
zJj0=5_CIfldI%Xx#A0xJ92H|17hU|<C^M0AK|}<h5Ul&YvSW?zuwjf*HeOURtmOUs
z^@(8%4ur2N$qdf=XBU6N)1u2X_KOTU2_6MAdvS`p#WWVV4C=Iyt*<#98gayDWIc-y
zSvhpbkOZDM;YA})(b6^`X`qv<YjRfBQTl>|z_~MK>>u~gG-5%Xt<EPZZ)rNX(LjOA
zpKkC{&AKqXuS~TuY#|0^->K^m4s4@`k%=k<=%&CFM5?%wrpsEOwx)qKhLD#^5upRu
zqoA_#RX>l9za{Cbj}p57A9y%_wc$|06s)5Hss=S}D+PE6!PgN<N;U$krz9z$lRYE*
zy*KpZ6}^b4uw~NKhpg=MHYGjcri8?emyZnWv;dkO6M@{qoJ1t1Jk><B7h(=VbcsE2
zH=^0yF3vq!6+#7*{^EtwqDk)~v}|vw6C9#uuXA2CngD~0?@X#wMENwFBH0<3Y4H5T
zi;MLmjIV|SUd>k(C$aN0TeoTR>`aptK&??J1anFS@T^fVWx(4cq{#Vs=VxA8{qmz0
z?FmHzg|2AhX_`lrs}J<lUWA@T^r>E6VKm$Th<CuJsmh3sL(td%(WfpCzpKMA7yfi;
ztlBWhm4y1j5Jk4lFH;h*$w13C=Z<@+hM@4aUb2KLVCz+#@FY&E^FveiVg&@K12Re%
zLa`0%#JoP%pkkSk{cEbv&3R0;aUUrI099P4yo-*_vtO~|^3{W>gkJrSwSP;JZ%!9M
zOlC-E5~_UAeuSsa>ve__ZjjH*>Mfjb0d>XxV7J<khLmtuu3XU`K0N8a{|@oJMGV|h
z{dV}GALe;vR@9klx17BY^t~EWT9G$1=)8)9nMg}yTkDzWYX3ERI$a(qD1s7Hsqo!g
zux71VO*k_>x8_)6|Dn?Y^pZ%IVG$8Bt3f<>UH8H$HyjCufQe9-a&V+aWMl;lj~Rxg
zoJ2-PeG&IW#!OoUc1I5-T_Yg4aHZ0M!*wh@FilA_*0-0AS9JUlL=7m8___-89x!uo
zc7Y7|@FW0SWF0X)vn{a7z2xL2z}5Wg=a=k;T>^IsM-wZ&4}<a~u8OIKkvERZ9tk42
zlp7Ll3JZVJbW=-3i(sA#A5WEPe&IgZg|aEfLj$M91tF{F+?zdf=3SP5jG|3s!3Svc
zCs3}T$ip^LP@!~wRriWCUUo8{hMQ^)##zh-&(FmDD8F)~{^Tj6DST+}bgb?ui13kS
zL}ko`Db+mxW>QFZ>O?ADSdqHHz<k4x5jn?eTpp+&iXJEvB4CwpfAlu1za$s0AfS!-
z$5w-Ai3JwFQmI+9A<M!oHOJQju8B@@D@K5_7D@UC3V#5O!qgG@HazLRhac&VRT5-J
z?A0z@=!1JpA-%Zf@Zk}U=YIPQd#e?d+StNS4)V`_wiuyDn|AHSlM+S&z%n?Iz~B@u
z(*9GY(kSfpZ~b#Xwdl!AL<4-fFF@bP=+mCyc18W$jmE3u>L!^<1L{O$k4z$g)sFL7
z693A=Oe*JTuX*1rmr$T@m)Q`J57mzbU|Ncave;Q6a4@M`4}--J8y8(_5&lmU1m|3<
zUU6H}A!N8s3_(%=`BOY}@pJraalov~EZdx8=)BMz8fF&$Q-tz_U*>Ul9ub($e4agh
zx+WtFeb|gSbN(h4y|b|D!w60EKt^H+1BC?z5&~3WhuogOG-&3f*F<1uji^?(@7OV6
z!VWFZFTZ$SLl)mt>Yz364v2Bk!oMR}^C6PGYDSk}-Y;f(S1cgs3Q$K+t_nziW{j@l
zz~B;QR8tqsT^8y35%qEAo&){_4YuMnGOK-ze)%;`kQL*DFLH7Y!r<c{!0A$~7&yrD
zeaxSMkK`v${2;(Erb(DGIPM<alrpMFpsvufB(sCS^a8)RKpQ3e9YU+=^XF$zG85Kt
z-56FE13B<O+H;Hn;gO9#bMdHST)9YoF)2l4bRZS8EG$|Y;!Hyjy>obAcf<p`eSIG~
zr_F-XXE}ySGWT8}F9o16M@A63jEREd0ZIJC@eTv702y=m@L8?2biGibou@4%8m)^t
zNE(Mcboi2cBy8ii?^~$#R>9#hBHs`x2MdH#h%!01hTP6R>5us69bJ=5fr$=z;q&I2
zt4~3ebCj20n<W$Q0?;!dBP($j8$6$5Txue<IU-np{Aqc&_zNCk{^`}QX8rm8O6iAb
zV4VjXUD&1r-=92}*F%eGN1$<pFv*;50Is4x!!IZZ7P@*5u5_e0LHt10|F5{unMxZD
z34Y6a5#oJ(YTT-@q+}Ae4C<u_->)~Qvc)Ii@Z_Z*Mqx|xTaAm30v;Mj;Tk#FrH=_|
z*GJzAuwlER%3|o`JBPit2|Nhj5Vw59{SL%B!QdE&-a`LQppb@y24l<5SAJ5>m+uCU
z8^tT9srb+Bd#T%Befso4cG&=_TLA`BF@<qp<sInKHJGE-gZAk?a9~L4%H@NZIimh*
zX~+<)TU}*bNt`)oJ7N~u2KN?Zd}CTfR8*ht-4&MUUjk@LBTnnR?;pEYUJ`Yd8dWO}
zi=F>lG57pT^SV};qrZhor8<#80&4UNWsL@UzIP{RS^xb0Rb&<1BLN6@?AS4GS99-{
z>W>5+ftdhK5(tJcOJjGl`&yzeQ<mXjVd;MjJq}zraG(tEg3yqyuko2EAZ2RF-rf{p
zI4=Z8GJN=yzFPE%%|x~%Ljj4fc`px7Y+GSR0;T|>WO*$-B86w;0-)F5|BFOJaw^xk
z%epZju4H$A)KXyUcprVjQp$-w5dRsyPc!8cO)V}imR(3s7@mNiS`_c@FYv0!?P4!K
zTDd%80p#oa=rx)$Nks;Kk)1t+o>;@9W`;SK*0Tf1suVEn4$9gV>17gFNOlt58FgBo
zt}~H*V^VuqMIq>**w_P|O6PkwZS1eyQg<1=g6XCp6eKq;E^biCi~X?AVkrUcD*>rn
zxBUK{a@fA$PkOg-um1Ztcz}oA`@iOeuC7~1|AluW=7{#rCviJNjF^qIZinSGv!U}L
zgtk?-G^(*j4JWMhfdfW}kDxLWl9MM?jp7l51hz*iN_$Q;afpch%2gC2E6&zOW-o@I
z_4Oad+~E9q^HfoWgI~c7y%VCF33EmDGI&198X?jUo6|(K?b>zs!Gj0O2UFRbi_70{
z+ws=~Tv5x5x8SsCX(E4KZAW9k(W6Le*nQ+kHFV_|v`v5w_}}C<#1qGk3oE}VB|yKU
zt83kAKa1S5pPYm1wC7L^8*0*J;<ci3U*$&bj!Z3s#pP{cH0CQ^>oHm}&;nG=^yyKf
z9MGls!7PCQ3yFm{F<w%W$a{`XPKht(l`3->YP3Kl1yo}ybS>C3-6ufDTTT;2>bNL%
zb4qLgRC#Fvf)gJQ;!MK?Rq$qXD!0DAJ<j<kc@mPjg8t<VG8te@*`Wm)52bbI@T$^y
z_9T;2#F<cr?dj3mwQtX7n@XjLNalHxyg-iUNcS(-u5O#bPB2EIC!ZHz9QW_l$q-hD
z(8{CN_$q>A@|N=_e`vP1n8sdU(jMA_%zu8$X^UVwXfPi*Z^3*h)Ra39o7j>029*UN
zM`+<Vv-bgbpo0K+t(Gj=&R{rqkmGqkD107}(0?@4V3rwqBjznwu-L{%9F|DsLD#Nf
z+1`%8<>=yq1XzH28DW&!KR9L5ne6rJ)~y@pYmG(SNkg5#iz3ndwbsz^Bf8@m3Ss$u
zY;C1M{KJnwyZtJA60`heFN0^F_D3Gl=zGYx-$Z{?!=-vnyiFaOcXKTLZts*|a^k|n
zLoxr1c=uv<yTko<I8JZf`pRLaQO5lab=Vj*&`a0gTx!^;A3qCR^G^-ywW|4~jKun{
z>vQ+!#vQDGa(LK32kV{VLR*lA5s9qZ=w^1<UsiDP>QKaj*R=E<UsMs))@YGzb$55a
z^>|20Ky_VRvTSn`q7wf83zK1hzrrHJL9Y*skYuCj(RNo7UVy62z3|21SxTNBaqr61
z3&z+=q4t?>X4aIAKW>#VJM@<uDRuI|x)AxL^aTQ%p+Ff4yi7~71?nbu!@GcJ_%KZQ
zT6h_y+x6v<`R>rzs3faWfh2*pRJmf*ud97V^2=>Tw%(QWC85=(EqI11iJ*7PukTKn
zA#v)2Ps6$Ut+#K)f)FwO^9K%`Hvco7q)2#kZ`EGmo8&rYnM=DSF3>>wqE`Cw>C^V<
zeLZOr*y~31`2i(eXDY%Zis?1Ls1yT*cw;3lbcpkU1}f-y`%4o9?jK$(*(*C1Utr!N
z#wc)0Ma*P0PSj>I1B<^3{k}>rrLU}BU~PhHg~o>0t#MH*$aBsA8_+{kI#(7%G$&58
zZ6$XJi=4KyIUU~SL$KwK%UK&1cw4g~+{^Cd21W5BMH|KA5S7Z+K-naaZGT@}q!^L8
z_3LN-#n8~I;fJc$80FpenSlDYH7{^`=87be6>CwRm2KRWbazilxmL=5CF{fj6I1nz
z+qUe}H0rTmIXcS~%3K0GrUYG3)6-Qa6Th48F)2PjS7bNd{fnBF?6Fp(&%TqN-|fyX
zmHA0i_lw_N=g!UX6@31DE*aAbF|Dfa_;`C=O&cYdgHPL3JM@Z0TxU`Bb1l<ZUnPt@
zd~Ff~U^}^SY=)@aaDtmf=Ilxhv^{BYg4m0TnMv7fWHz44BYNyt8Z~OvdfWxX&{zm;
zLb~kWW03)9I(Q)epyn-Wp2g9KoLT^~X=r6tpptLCVYCJkE`h8is46Go-6LC=$x}S^
zFNLi(`>^NI3bJXUsi~~IqYY?<;WEospgtVm_2A#<@qIXY`+F0I+dp*qXUuxfAm7i<
z_GNnlzE$$f5XjsXd*SFSu{Djot%YY1t896WMAsM>OO`CK^z(17NaKbL=oq*2cspWg
z`u1(Rh&~4&Uc<MWA>Woj@a*3aCoI+j+1d5+G}06YEcwe~8Mkf8t|T3;3yOh{Op96r
zO$%co8!l<z#g-RBDfe@2gP?HjPZ+B(N+tC<G{@`fmk+Ay)G*{-gwE+69sG=$Gq+*I
zDl07jBm*_WxLT^&_PSSxcG7H2!0-f4*VEUR7sf@P5$Sa2^Q%JI&k0n|d?>y_agApK
z&3NAbPJlfU-VD3;-DpMb$(jPn-6gO%#7*(h|2>X|@3KgFAlR5bHs4ew=)I_|u%)-A
zP5J^WiowecYMQPzRolP&@ap$1On&#Nb-Q++%%fZJD=`BHwlV=J;GJWh3!EcQ8cx|~
zd=DkByJIE{Q{9y$QZ})uLnv~Zn2dEgJc@wg81rpZ`8<X(J~0WDRkFdE$aag~<~6?B
z^j{<-B9YkiE#s8zej<IdnpZ5QO-;ws)Zbeimhr1lz!^#k7P#;bvc^60r^$CGE3c<8
z#d|KmZ_L9nefXYtx#KD<1R%t5<F{C=;kE+B(SV`8K{<`TTkc4up_Q!`(omy^!l{qI
z!*<^tgec5`U)bx4ERZOss?^w`g*}7~*-MdCa(IcL16g1GX@Hmiy(QwLM9H{1{uT`b
zu2%|KeXc~b8md0+iaKi>`3}O?QX?&YqGaS9Lvy|QeI$$-y9?TbXERLx!8wChe{b=Q
z51O7MJF2SQ@fgd@2Qo`TEX%f(_32C0p@G>ACUVojZe*uLwE@G*2iL!RmA|P`+`zrb
z#%tDGbM1C<J8`l5*s)?s^y5j|#9J)HQ<CK^l<cy*fJ*YWJ1j$fj%<OI)sL=u&K+S2
z&0U_IMDh4CC&%$~=^Xf;LYTX4v->CUUptvS9=B;cS?58>5;et!TNU^N1j9SK%T(7!
zk4;6@>l=NGe}skX(Z{V-RkzTwvl%F6w4Y)3Eq_*jr`kGw%R6SR=99FTKZxl8m>i>)
z?k#KX=I)wPQB{?1SMV@$Lf>{qecRz|q(CAc@!35r8LW(VpIf}vMLAj1BTkk{vleL^
z(Fp?#5mwJQHp%-W+##ZV4Nw7LWjUdV0b1{ab;8-)jKE_HUjws%)}1?-9;|JvvuOTr
z<FeIS&g|gbKd~`?wi;?L>k~P!Lkk)P&BK!c{kEO@{<;sKpsYKnhD8*ge}*TT0unVn
zZy*V{)3?>b=XHMYv%HzI!XlsLI{sXNR`3<D&YJrAU1&licF@x2)TQS~CNzIJ+T+UB
z@~Vm@4YJ18y&QW%_jpKF<j<>B4sM&1+w4lBzf!BwhVZWH`d@v|A1I3wXfPJaMtgdM
z`_zRZdywW}q)WM)Snnc@$S9d)>}|aUZ7sy6qC@=G-cXI=I)ey{&p)=)p|*SYtUe<?
zcWk+_Y+j|yhtD?kqqnMTcgUQ+evvxXU5F{Ibo{lM%Bpm_)o1##Nuy0ze9CGQdN}bk
zmSxkd|2yrvH0YsC>F@-h-9QSY93fd$Hg2SOFM=jeypXxzdI2r55vTd>^3H&l2n@=C
z7eWEai2vlDDOmV}nv{RmsPB^n(Lqj9#d(yRlSoZjjcZl`HdrqJdB%JxXw<k#lR`e@
zTRlA<dG)RIc>&9Hk6VY$vkIFR?d&pa-@<0cPOR#(yZgJjq1`U;4GzuUepB;Mv2Lxt
zxBk|-RJx>BRaV}FXQ!mjdPiJiBH-?dh#tgSdd`BE?@HFB1Vpy<S~6;N(~Y;IMxN}u
zc2U&rR<+LpvL;xD4z#Nq@#5$oyLTLi$>fbX<)J6jwtLbQwZLL{z!#CLx=fxj#gS@&
zcipp3pX8<5{n$Coob-(wH<(KrBM76oqY#>9zw<ha-__f_284~k7iVlAKYBEM)~xNI
z1>B3HhyAf41Dclc1a98Tm0Bx}d{)XtkNgP9EjTw*P)&76_AA!lrUChiffTcCh@}k~
zNhHWftt+*nmlyC}RnSXDS+>Lb7t)A}*%`90olFcYF8%M4L?w>ABT8nbRhrTT+~=YM
z_;ms!^^8truQEF+_(a%R!Z+tCH8Ba?zwpZ3;0fDKEL6!|{nze~ufuX<LW1`_sy<*|
z<#4k&-Y=(O$JpzZuF7w+4TEp5^6;9$`Z?k|ub*lx_b0ZOI58s@n;;t;I(K&OaAedN
zv&vVm9P={BBskPZDK*~KWz*a0^||K_>az}qdl|$R7sqENwM`Bm%l@tVC_z3jNNB|B
z4^Z8>tdkB!>Xb^DP&fCw8)$8f9!!+|fMacesK(Hwdebh|>MwGpZ%6v7NIDSfN{E}^
zh%`^i6ySEY%hVq7W*oilnikEVE1duzEds31?k!bRD(382Zh3wK+6I&+vPnud6un*>
zGx+$l2iIDwsXdqzfG{_al5FOsL&tCV8~Co=JTnUEHj4&I+q-*vd5JGGF@c8pb#HI{
zX2yZ0ag=huwOK{zAh&V?u==)U8c-ZHJ?ipyK<E}w`tN|Prej6i?h73!an5~A{>rr0
znx4CDd;VyA@u5wT-a}TVBKa!|^6S~IITEI=;Moo4G>rY0oww!Qm>WxWcvcNC?WCog
zSkwICi6&-#n$Ha*!^`a7XJ*CMZawDVY+;%6BDp}wby3OefTcwDNyuwNabu;XNI5hw
zY;hCl3B|51VXK^-OMHJ+3|8+J8vJf)_c{A2uiOj|9kKUa{|9yVoGZ(*<=GA-z=$GS
z&hkIp>~3sHBtb8+jV#S$V%Lc#tv3eqB$23^JWp#iDU^xp;+jhS&3!o-$8%I;hInh!
z3d=AeUevd~l?F2`+Nt_injq>r*CAkf3?AIdHoOz*Lc>5;t`zeE63t|qPUvXX0d}-~
z*=p$HO`UQP26EVkaCyP=-E8ca8JTQHuT~7Ku)`u$jO75jB=$35t{|JN2p&)K@=`;h
zqN0k4NyPKln^xoXP7EY1G*v)zXv8UjW)@p*erQi*^^_J$Dk{$NV??b<rWfu2)<k?#
z)-a5G>ql*lI1^J0i-j<9Y}rQvfy3o90S2@(Sjt!HKdObBNki|LF}vUJn__-#_V8-$
zmuP;yj2fI%B$FIh<_=(MgjwpttNJ-_OE>B&-)N<!-ck3WlWVoj`F{sDe74o<)C9Ah
zQ*-;*Mso_cv5{`(rN*byCg42CsFITJ_-*LQHtj9$zopB>dGGt1Rg5zDe=Wepva;Ip
zvdG(6Uk$h2+FIT!`s#b<sBK^7hDQCm>Ga*Lyv*T9TzPQFlgg~>A>o0_GmecK9YM5{
zL`8w3xql*KBfA0EfHu3c?w&^?A^-6ksc2<wsGBVxg6X`bH$=iHo4CaY6G%20hZ0L|
z<?Xn&i7)1uQ}sKg*S%+skU*@>*g4C-s)F>iA&0uQ-V5a28RS617s0msUw@s?dV1ad
z*nBofF)}{xLWVUlZrD97%|_5UJ8s>&b^N7S$lO4gs5pEW)vPgY@Dx?bY;Pne;2tNh
zg4zbmm~%D`sWz}bzi$f%f@rEqG1X)L#jG%ZyJ`fMte|cy3nGvw?4_7Hi<Kx>56)y9
zQ?n?tWvP&qSj45a(>SsFX2_!69{UZoIs}5_f<=t*?BG}E9bC4@q$i4kN*Akav!~gX
z$BHW#cRLq7D|5s?&EeTOdrUas0|yRVq{;%#JH^#j;n~-gXY>X_i^yN{qUoaAoNvts
z<YlkkY-$(fSgRlTYs2ZX5pL$2%NoUOI-P9Qv3}nv_2<tWZx*N3mepq7u2olFw8}1v
zVn<qUzI5}UJNm_0v<jQI8^n|;A5r^3v*hMmu&ufGw3EA~x?*;OrvjwKD;#fVsK7Pr
zj?nvAq^t)B4sHQ>qXQH#+B5vmW!4=$Be3-<;eB+psQ1V#$peyuv=6Rd1lR`laSa~=
zkzJ#jAK~fWVV75nk)ReVIrSx{b&EQ&FP$A(#_X*oaH7s@#buO4$rKbGVg-Ca48uHG
zx=6?n;SC`Ij!F}EY}pI*;DI4nE<!`(ks@NCUtZJaDAPB70)M=HE2&$$Z@+G1TW;RG
znYIPYQw(=Fr{em{4ngJ~o~Ea7MOCWgWps3o)@u`wfWEF5X5O7u9~0a1y}A1H{Meg0
zqa$vIx4rmeP+m-GYGrZhy{pB!j;}60EnOMbt@P=9{btqey{+k|nr=0e%{Ah2D9h(o
zMdf)@R5wt6ydvT*djV{8Vs5R=m0SFDT?hA;HjcT;)yFC=-j!~wuS-ZrO3{?`E8!>7
zi)7E_tVEcyxYVCN(&x;3p6~3@WR*unPX4AJA+2n>naxS?P|pb{tG{Wf;9*fURkft_
z<J_?mE@!T|_48X9>dkZ7PRACo%3K73bLPmF8s5WMwt<l8f0u8H`R<=oT*_ZGP~e4&
zJsczie|ZQxN0GTmS@+N;x9CN}CtKXY)VMKHmqOGu5nQ*!t|nr(%W8pTy#{c3J$J9D
zndGzn#P}Ww$n)ey$+EzqwkhwO<JhN39YBcN0ZCy^A4Y$8e&06LZa910L%*TTwlf9x
zWXPna3|^8}#QKsO%x9HdGc<vG(5>+DQ0ihR4bUc?BJl;O6o2S)Wy`&!O+YCV_&#(7
zEH6yZR%YbHSL|;IPY)nyq+y2B=q%sm%=Et9+xT|xu*+ed;@R5=?SBN=e;R)+!?NIA
z<-RqwWA!T&PtLP@H)HAGijI*DA7;BGdz=W-uPm$d$c}2i#=?2=?bJEWyMkB!n3EXo
zloP)xzf#e+QK#n5s^``Au?|S>@@|Dj?9Z!Cb-LxB{f<7VjkukfI;P_fm6+`Ov?o`4
z#)YMQp4jq=+S`rCAOX^`JLZ;($7eFdjAB<?R{nBe_Xh0lZEWV|rjHb64@8`FvGUDH
zNN=A6Oi>>&>9(5p)``8&THEdz0@jX1nS<X@LCol}`J6E$p~-Pr1uD(AL8WpiY1$N|
zv!i!Z!{j9j(js4V4NgwV@7;E>p*z89i(z*uOqv2Tp)PDBOBvv#TC(|87KriWYi|Dz
z*MS|#!~-}0f=bX2v4Z1jmbNKXB+!2}x*Ezn1cu9e<{Npdx>OEvQOkjhk{8Pw0rs{F
zM2(zUzB$*OQl<cdnpOzvDQ@?(uoFgt4=2l01$*JTa?!=GAASv1JYhw|VvvkQ3jL2h
z<}C|uzqXLH6LaR@bD@tE>sK7XJ(}`n(0c2R$*0t#RC?uxS-Q4g_HxIb9|P^*#Cp8h
z8=RB9!#ipP7cF<qZI3xCV;7w8GUd_q*#BVpmZn?k)o(VBUNpxmde)mEPggy;yEwVv
zMds**mATLJmV~8^?b0}WtlP(DKV4I<>aR4jTxFZ39_-rnU9)wOdpuTA<*)X68voqf
zdBwHcQ(5(wsI2FGitd64Pmv7dZER&_bt<5h_sySrX1jaUTH8{?up2Vw^j1|f;*!9F
zV0=i?6?s!&rh(lLMtj{&*}p@5tuD4_InOgQ1@8bn4wT0^*kU~C2>5^i$_gqpGrOS*
z8$|!~XD~e4?2bSC->CNn71#_$qma$YR*K)AbZ-D%2`IDZhh=%1HG~EISyN2yH{l|-
z1Wj9`$^tRKRn=)j-*`(Fu#o;lQa|<7nACvo?49v82}~e6wqg>60*N?A>Gw4{$y3#g
z5iE?_x1t<pDT++YBW{~OI;OXb`KpWmj_g7ax}?QqZZM>`IB8T~4-DL{|J8n2X8|B-
z4IK-UnYfnts<D^F>`M4FQV5g7_|HLVt(6*sj`L^H#bw%R(c<YWpN?~wvo|w-@{`OD
zHjT_*TuaH^x4F@!CpL+TSoS;SRF#XD-R8{sXrGEVUeupE_j=irO}0^^yr<fG9h|52
z*2vZ@ZELXhA(ezhUT30WE6bl{#s-%;)T%vMJ@o12MP_N|^^exphZV-?Pr-~RRC9Pm
z)MxDOM;!|K@$|U;XBJt<pV^odAAsI3PR}@|hpN)XCa23WloM_bA%7E4H(3k-RBz%m
zb&qOo!ubUuJjXNI2WzJEYGaKWMs8<T;kJES`D%ctnDrOw)hc^4V|*5Pgm*CS#<JEb
zBTD~f{$AE9z#VNVQ@_1n;X(<~i*BvoA6#&2<LCY(hK2BczzD_lyV5BN2YsAh5fMi3
z8>*+*nSYxuT|jxXhzeGSO=Yybxx&17_feI%K20T>2+lqpi4)>hUp8>Ck*+{C=Md(*
z@1N-Ysqt@F`xhmThP^FVdL?^E!PJoYwSmi0*n%cNO<)GfX$33J6Vbi%@Hve=m0ej}
zNWsTlNHllNYl4$Lh-V<%4X9($To1pxC}vw^q%MfwX|`&U8pXN*kn9xhCTzR}5@gW_
zC!SroZ5c*VQCNe8E{^rwJsj&<)%Si~ti`zV>C0YEHPQ>dzc?!}Ft9#g?8Bck`?gzq
zhS*vXajo#gkMsIDfW#fYy=ZFaR~9w%d`YQGxwHMg*Zpg9nWzO%d+#!`R7=bGO6`)t
z0V)-xm3sC6I?nN3*-h<c?7H0NS<@3d=6JlUDz^U>@3(UE55vl2$4)z6v}{tI>jl@q
zv(25PXIDm4WLJ&sCUSL+%J{1E&z}vl(2<*RtaOjZwDql}?-D~`YTVNxqRqV)KNz$3
z)qj?D-i5m<DEqc!GvXTSpy_>UoWmbz8;(<r>14m7+3{3%!nN9-aK~%Lt6D_gSi#A5
zv}o4YmJA>515OWBPP2u{pF+X62Lt*a-g(?L9H5q@5WV(IbF(-WrBA1nM#$k55YEUt
zQ+m57pXjNa-Wk!B$!Su!gNxrzh~#3tcMO@mlvPN)WP9wB(DZBts1rv`4q2<0WqYNT
zx9NL&+MRnVp&?qsD1pcHU?f06705Hd<^X%@)6y<z+BWNSpSL*TidHZ$82I$!<;(Ko
zM`*1g7y?oPnr@|gvC#Bl9r=O~#YFAgW+m)&x!Q2nrP)Tk+5mi{y}5SCzTm;t9Lwa&
ztgOFo{+ehLv7qT_&k%j%qL*eXUYZqrHuKNQTBW$sGWqF$*=w$p`}T3^YiVX2`6}g`
zVO^K0-!7Zd^c<!+rP()(a&<AM*r;=3w{)ORO4o-ktj2|AIa$|E^yt>d^UFz(&m;8z
z)AE`W*miGnal<EkVW%7xG&33+vqqjO_dU0!Bjpv8sWeG$do&VVuGn07BnXns)ah47
zH;T%0=_0F3;aR(~eNH?y=<zny=j4lXHcboc(ge5!EJjOqUD*EnaZZ=u+neCa_y+Q(
zzmd$cAB{<<U0j+*3e;_E3WgghEk4r5X!Il=&!yQ?bmvAw7D3moX<!GJzzUwHFeyil
zbbv{Lvp<dJ4Z!_95<p>9)n?<vz2@ZRcGcKB!KrMd)%Lsbmrb)(Q!-b5yZI__NljJ<
zSMLSt%}>mUj$Kd{ywRn5lxuEB!_ZR{XuiS^QW4VWx3l%rw!da&=|9Q3x8}WtT@y{w
zzk$AVi`uFD7YPvS`tr{PJ}TPTYr+}b<bkmtaR=|$BJ{TP5PwtZeUbWos7~%?%(tL9
zVDAvi2nZowSR`Orxt*gb?pjh30ED{N6_{I1$msOouJ_F+n_Pz9ow_+I-OkV`*9;_8
zU|NU(%FqwaPGm7->)zG>qH}~{WzWGGDonskI#Q5<rV4~Q64u@1kY1k3-q0KZxzSqY
zMzU=~+9g;XND4?;u#$j)07~Zb8?1cH!ukwb7Jp-Z2n*Nld@ih-kTXfk+uD1TbLdSf
zvw2!U*0#+wx8I$=w72Gu{SR$qqe9RtYiCzi=}?gu@rBDHUZtlupgBcHOvwUCb0#EY
zPmWo4)$U|ykVyjr1Hpq4B{Z!+6O)wvU$gCDBX2rPTYKWwxzK`$+fO4xon0NOE^oK8
zoaMN_xVF)#pgw!kv#O4&pGn>{{>;zk>YFttox0S?d#hphf}j?yeaDZuG6DlET6q(A
zJTHIGeGP`Psgi|EtTJD=?Bay9s+n@&xU`Hqp%u^uBF2k8M)M8@nq)15`OP{d5iW37
z*aI7)a0iO9xdPow*BO=_lr-Z5GfHS-fxL#5-Zgzbansm{Znon3#ACP@T%NzgUN=X@
zX}DVk9weacsE&=)!%Tdr@MW3Uz=0LsWt?wWf{QG^o%pd*-emP#q}8D(mmu6W)rcf^
zZ}AYQDQR9QP~-FKBeLxUeq7hOyj>fQknmZ9q7xn8Z+3a9>QeaWYuu*D{QKI<ke31$
z(ukY^i~vsrYvIr0uwx6OEW1YSN?Od-;3deWNeUKttJO_?loQuvtRtz&MjLzGK>kIB
z?#Fp}c-&4Xv1)ST$<^>*249MvnbpS@t!NyuRja2;eN@Q`e(5synM|3oEA;qerB2?$
z?4bSW!bOpBXUNUbE@Qj_B6ZCY3r6MK9|XPyd=gF4rv7rhg1Um18n}>|_nTd3re$lp
z`{k5J)eZmeRCueN<vZpiw|a)NVu_23$BmP!6)oz&=WH%?{H|lOxn%v1)RuwEHg~O$
z`T1h7hM&{-JcxL@uFoF=*%i=X7Zvk=D?VTAG#94wM%XC>lN@*dz&Byn6O5oi>u%D(
zs}}uZ|2Zmig;#==+KqY-MoHLdx%{Qq3@B{DD`lgfbmTa&wd>f?`20+{XMlNkxFsxw
ziuvV6i38n1H(>}am^I_S{lWcOeOur<BE{NarDeA5Sj)KL*y6vsbT9q9kz~yuAcaBp
z>#lg0m)Ut+!&jWnDB%wTaMN~OUq<<w)6<X?+zjS^+^l(SN&8E|`TK*N>sS3y8CqUy
zmu&blyDEKF&V-LE<<jszj|U%pTl|-&viK~>Bn$6PWKvvP^-<O|Px`W^h;?&a7%+-o
z{bb+b@N6<M>rv7(GK$y6_?Km`>V2@pyUbiYZH(2r=X32~zX_mx9=BDYSaWlEo_Mr&
ziuh$%S!TOB$82z9{Bw1;j<Kan-+u|8-uISuOT(~vDWALAY#&wM-Ot?hdv@lBjw8bJ
zW_7#MwUM1&nTPr20d-~N$DTx<KX-2Wu~Fm0ob;}%cLfKVMKL^K&Yckpu2~1sk4?Q$
zJjS&({LaX^wtAbAKllFk<kEGx8{I_<ukBbgUM3UtUCQrJxQpBzA8}%vWBwVE3KbH?
z4Nq+bi<`cE`!;T6u*c~)Bb6@(JDcUaC`QQ?b8D>b^wJh?_N>zcl8M?DxplQuhqHP&
z<2+uthELg?xz4Zd&3ZS7O(yH24y@}Q73ZJy(=pa*(~8q&S0ncflP#3DfBkT{5fw!|
zdzh6<r{Fj3c10DOuSf?Wji1t~@RKcdlLNInJXUF-sXyB5W}#-6?zNdtPcn;Edleb_
zy_wW`kL~mwZ>qh2de^LpsI=EmsDHJ2OPRHH*U%-KYwx`=go!x|oQBvr*L}!>klG`c
zOJ20f^_cTeFZkFSh{@Pfj>Y%$)^+i9oz$z1wz6?z3VCq#4dtQwoize`Qu7B)-9ej4
z;Qtak%5z`qlp!fyQy!V>#NvBd`0K~*Upqi?Ie|<aH82c)WU8AY?mK`JAMp2Yu^_}{
zdQOS2_M<Kpj?Z<ct#YzE6~C$Wo^zw@9jdPUTU3*kxw^<F+K*8YRlBri^fes_TpDZo
z4mq@;zg0~WGlPeU&wB1LS13ToLOsv=zD9}KfGk{0b`&}V+xdI6(sgw^``LM+Yt&6A
zeOHH!{I0F9UR^T4Nky^IZj<6I&-w3~jUJ&%*}5*BCaTWo1k(^u3gPK|X1WiLnTP=`
zc;P6ic+s5x3-^~s586L5+QPHEEG)Fuo61<`!H^ml=#^+!Yt`-=N$6Ld)}ndy&NEI8
zedt_n*YM!O#*9NAp5mz*dRF`3mje@fp(5B6x@xq>-m(vmz5VZePBWmuj9=DU)26YN
z{$%PBrA{l{JJ^o-`K4GU{Q2pQc)At&=~u!Ezjy9Z|7Zw=Dcv}gycL2Z_}0&pZ$bmH
zXc}%LdP@D@{SU`R3~J-MTIuJBJ>9BSm~Kd}&TqPMnbMIT9wS@$DX#q?TQCuoNTm6*
zIak^f2DOTxJ373HcUYR&C0Tu0sGePes3m=0<#+9P_3imJxwgYP=QSUbUBSY5Bx%9d
zGgm~QX&`?yfU<>qm2Dqi_&H@vWPJbGOLgXlcAtG_y5XRsbqkLEhOo;cc&SG>W2lfk
zR!CW~TY(E-yRl2r!gBW;?QA(;t8UD;ec5Hz-j1ig>NL}Qm85MNx_oBelwQUSSOk!{
z;{4OCszt5sk3F2~%=|^8))+>KyomTu-=kqLqA*;W6)DY7Uny*1A6+pTJQmUqa&Q=%
ze99tp$D%>5okr^A=<d@#pdR~dj7>>s|1`(E`Jd-mCM>iB4$kr3PKppz4+LOtaC?zU
zkj(?utljSM;@v7g>kInh`s)7*3Qk%!?3L<zTHFq`-%!zrdJ8a@qGE*S_b0u#9-Y{0
zQNcsSfbuseoYqvl96n4}*XTmVh95D#q7dID+UwTGcH+v|4D<HiV)ypE#-MUzMR;1p
z%Y4)Q9!=VNwd4>LRIf)pG24HVQcOQ?DI;nTW_}eJ*0%E*mLTZHY5J50>l_Fva8z!a
zm-jqNulg4LXMd3B9!#KYOVRXEf0A{uJaJn6`P-_8DxzV3Sq>`U!_TkfRlS@|apB=@
zDZC&WqyOwukG^w2W%Mt6n$Z-Wjf)G2FHmV@f7)O{VpAiZcTJi$-Fvz2V0z8G`SP+9
z8+rNLT(llAv|mau&-Kk0&KpO8YICtu8*5u+GDo!LZ<Fw6JW1ykvw1z?GvHw9RX;jA
zJ?L)T=-I+%ld_vnPfi+Av+fqVXxC~kn>@L>s4aM)01~Tz&ed6WDRad)gVsi)ZHGDg
zsSVtl(Wz8xf>L$w`bVZLevY%KD(=wQ=r9+qDr`YWO+4a)ncm}%bpk!3*aX)88r$#Q
z6tEw*VTK+b=dI~&-Cb3wyQ*?|#iKEem%0W2YucN_<XEch<yUG&FG$fwqsLvE?N_?b
zVc6+Mrba$dqgoDKIjplsZV#i4Y5tSWz}g=)$vD*Bb6;VCopo<k<sp{Kwg>G6f{I_!
zoC~~GdE8%ZngP^NN0BQ6K6EVtgeox}s(EjTNse}SU~35w4omXwzPSH9-D;>C%^-{)
zKznz*_e_F&&@7;?xVLA~_}>`O%&%H0?R>{`isd-$N~*Cwp;|PDvY!%*YiJvuO@{cv
ze&gTQ$9C9UTffNU7F+J}>9fKT(yey*i(df9(jKCD;k3=)+NuQqGqU!ue~rOW>4SRu
zPfIlASNe|EDJbmOOw;Bx;4fvMmw#eomHp$DsvAen@|@VK^Wnl*%XqH+hf;cP^#~nu
zJav0#+mmBFI>Wl7_nPf7VWYL0qH^NX6MQ{hM78_emJjFJe*5|rrHN9to|jPr1S4@4
z)2!cWlkg{`Hd}AxQ+KWEuh|s_Yr`~$6SgPz8naqO)8^cgcd1>=dHmmu<<;`DN(hEK
zlNL9Z-|ZbD3c#Ab9DJ^a{##m7Qf)njFTs3N69r_H-J)#P><Q{WJF@dZYc)9l(ZfCW
z?SC>R<y`-fI)@~(LMQEi@-8uD^7)za^DHjR?yuR#w{J?<yX8BYo|7uA8O^5dag~j6
z8oI|<&GvE+-1_6`b%OPuM{51n3V>>dN#HmD<MpFYeV)|ESnYSr@AS{CIUJx+YFu$V
zMr?5nH3Rnb8_O5%<{^(c)r$3s0A<0pszxTweuHK_4b|R<C&z4FP3)*C^jBT`n3|}g
zt)F78rK*YXn#bD8S^W<De&`8i)^Eky2jXblnd#H*))ck-{cFRr?Q1p}Da9;DA|R_s
zM3w^jkH|{-m-ejgs%s1`mL!hUyeF89ns=vkC)>`-iZn#y=!Hg)&do~gq|Ml>1y|QI
zx4&*m*MFQSi6*I)>^<)7PpzkG>FJMpar>rIJ$*A*{JnBz*|qO}uLsGqJGuC`p2R&*
znP0y(ZRYse@h$jB`x0Qd5{j!$)-AO~>IR{wz(QX>w#a)XpMRE>y7u_&{%@XN(mANQ
zHkdm?6}qJ?zyOWJ&oL{;rfB{C8CJjIjlF?kwn(=^Zy*ED0`>Lk-Ma^d9QRJ_W#4Vj
zTDL)(HfzEHda8M!NpIIo(|Bjv*a$gros{i|M_Yt;OE<tSFjjmEm@ZQ$ZVCm=uuUs}
z)3;?opLfZlJ$Y~L(uP~omvya9c(v^HNV_193u+RKi54ApdE_bU7$Upuv0qjmBK`)R
zH9>=bfOa2xf#n%N)<AHH-(082VV!23KaIN|5Z=dAx!;)06G-@~I%+SpoqCzsvRgtX
zFlcz`3X@=Jz3AMf@K0ubiuFXL2d%?$-~7g8n{JWM6~(#jx=Ov5Nus;I7Ma0Qx%cIi
zt|z&im2q_z!pz)sskn>2TZUheEQ1NiNEaVRXc@pnlvPz_y;~9Ax}*oVaCvs93`Gz6
z`_s*X>FF@oLODEpH$30olczp>)jxo0GHys4qgzw^w%cZXthKuB<X&wQWP#NGN?uf#
z08d4{is)iLeQm5#;1lYj&p)aD>JE6Nt(>T$!WA1aB{*{}0G+Si#N9l*HTgZs;Ekyi
zzPuT{x!Lvv1&9y~cisiXuUr@+tBnfS9;AR65AKnP$WDWAtp8%IW;@ZfWv?gI>qldK
zAfrgM%>u$XR_2ak0$oWuSP)PlV-aAZ3~461YVGpN!{uNV^^<tGS}eIOJU-7sk2v1M
z#P5pstlkG_q?t%;JnYYv>lvH80%bFz_cdU-vSs)Ws-6?&kc?nP@K0G{0PHG@o{i3Y
za6af}y+f(%syVYgRnsz-4O_GY1)F>BzX>BcUmogF{I7oW|7!sZ?1EBl6EXd%W(MiA
z+d^V24FQZeD3KMqBMXO9nJbb2@>$#Ndwh)d^uG5D>T<Uu6>k-}`x@I;F>2igKqv||
zdS;gKdV?oJ{_j6>#9IWa%xUMoZ|?!Z^8(vQx^S?AYNS0sf~Fc>53OM8YkS%|+Q=El
zzWxf-c3NLty&g?7y-Z_RWYEhkf|jv7n+JIx4hv>$D=&9#=D$=u_b8lNw{2To_O{Mf
z;@$-boiXj(_%@~xu^1JZmcO4@>Zz=#n=<TpYR_0bJ2;>9c+rSFQV1#TiZt{ff3Vw*
ztPB+H5{6myKUj6|gdQd`w)CxA?DERAv+WTgfypQB@LH2^ZkGlplHGSw??)MFcw67m
zR;IE=gwIq@_1IsZ#v;B7%i3ng!NsE+0eWfBUIEDwuQVTOa0SdrRLipT97lw!;Q=(<
zskX%y1E8T6hpA1PG-*9U8PzD;KfBGu!~8Cl+()~9loNsJuoz>n=y`Hqw3QnXs3Yo6
zYdd{e^2<&{Yp8leuFNEeIErE_r67}<mxu4#xf7+I4?1Dd)NrM1oIeFzUl7tWHiOvo
zfmY_<p#*_mU%#u%qkLdRO$XeCHUaP^E48{a6gSoc+}s3!5Q<lDc^SV)l0SXHf}Ol1
zQPRpX6c`4f`5`2wo<ai4I)yPSZ+l2qi@HmWgYxQKZ*9vJH_ID)jAnIM=hQ8Avg}Cm
zKh~R8F!dHI-I(e5CnD|D#GkptULc2oI|{>T?8Mm38hQbKZkzt@(VBJ{J*ooy6MO+c
z0#y>b*^A6>e`_z&WYi}4BEDUr&+ig>`sQDocx2~imSom{YRsEKYTiNA*63%USBnqV
zpLqn)mRv0U9iSMpzn2mb^lb*T8k!Tw%zydX{Kz{H<zh)7(z1!pHNUH{3)6wPY%YHM
z6QCsNg|=<(6?j%B<D|zmX71dXRmVG9`G4XRbc3MB5+>Sp=umt4x#{l#k<lnG6eCiR
znMr8Xu<ll%eBtuhg+n@fA921n*K&w*Vyb$@0C-WBP_Nw>khEB~YNKP@D;tDyWJFSY
z2Vs8$#&tyUXR)fwsWh3j5pj5{Wn<ydAle2;tXLN|PfNtKQvUJ;VCf8DpKvsv$7;zS
zQ~!q40E7Y#IAyMY)#AAZDS#wD%(cqmJH63;)ON!8ghl|Oqu-Lby7~<lhaOLrCAq+c
z=qm~%>Z|7*%YOM%j_v-5Vu)y6t>4M-#pL&Zu^{gaSfOAD`!9etmU=}P43BiJ>?#X!
zDFm$<{C|J}Is@&Y$EOOeyrewQRL^djQm4u88n%Kd+GqpY@lp>y9M*q;a@PU6xIXMe
zvRcH*nas?vD#BBG@=;ETEW8qbMv*#f`o12QyzWDWI8N`cyZniSj@{3#ZL{0vDktu~
zJXO<JGv#&ldfq?JIMsGL5fXt*b;hZm?Zj11tp1=NnaNjyM@&TS@irndEYlekruv8Y
z%9PBB-za#cp&DKLV#&qe;6@6B^`f}CSMqlid9-JyzZh`3Ig&7DbSq=a2FY-otm*)1
zoiTT=H=_efSleR}ViFnZuF(?oWMFEmvO)DR8=Epma9m;_VjgiJTh}t_@gIqP^aek?
zN_I2ddv(DlxKIR8iOAoCTLs{d^g#DMX5~<op;MHq+tvQ?lmlkG=l?@w(Ec$gSM*7s
zj5SKXib1*Hq?o#0WS1v`Qu(D=8VkJ##2`dAeGtzw#&DRP-pc`Vj+d{w^ScS0Gc2Y2
zy-vHkrY<KF<CPjWB;(5da}KQN9HR|0j$8N68RL-Bb6I%yQzjo6Pidfl%Wr{>%}_OE
ze%rVyQ(Axla}!jdq>ddsc0AqzjGVVf(<llRq+_%gBXZ}4W(C5%V1BlJcK^R8PkyvT
zZw1p80!fjL&_6@Mi58WJbo;g|)g3y6G&vLu77#kZkSJk2pZ&(w`gaYb*<qImd+Bkk
z9v=9Mp#ZrcKXIlih<SI?>1G8oR2~Pc#RMy>)5-dm^B#RFen2ATBQSI#tG6juPz%1R
zsatV7jl8`2vvtj5Cc>zHj8LYaiTJ>T<<n7z#`QwRiSUdr9@t3~&?b;HatF{joMncC
zC?nHTGIt;^4@KeA3zdJO%(bdM;4)G+c(*$h*K@vB2dW0x&y7n+bpNIg`|}q6N5Io)
z10xqQa3j<b_lzJa=%<t-@~PObhF5`zF1T%)^u8d`Oo{HEFttZ`A6t`I?K3By&s<TB
zAonaSID~|NA*g5E^R6tcWp7z0dG_3ns!eq_kB^y=iM)WxqqBev$jJn%A=w+`zD)l|
zDy_h!$L2p@)^*dGzMlS#WR8I0i~=q^(7jmDDj0$#lbdM2d2e5^-W!5p)0n+Wwbf)&
zt2xEaj)}eQ$|A=icBSu0&an@hCv#d_3vMVGol%vmI3V=aHcGap68X=hFJn)67<NA5
zpOdpUppWc2)F+6g=jEx8Nsp)ApA(aWeluxjdp>JS#+?Y#G7gHgso)k#EOwA*B@;&C
zYn0V&XWBa%<q-@Yg?`I|Z*B?rUyu0CgV;LxJW~sW7K1UOO77xtEB&3@DE*Gbi1>xI
zatd_6R=9?{F6-v;(SB6&eFME41X=>NAPsjV5pK7!G@vD-499x6YfLo2%!zQa2-81r
zi_m0|?VkGe{T(!cM`aFEy+<uJHlwG_?HVvzqZJi{aElgf>7(9`&l*#EdnJ{2&gz>g
zh$z>7{mvL|Q~vF-4(m6)Nb{G;8I01i-+kZ4r;s@TzJV{5n8;vr%RH&a&P*5I>14J*
zc0n5HSRv|K)=M-PG&^cRh{!C^(l%IBHTnns5GXT~Lf*xW9cK6TeUi2^u0JAp-o0B{
zWWRW~OPSD8W3&%NhgDvz^9yW~8Jd?EEW>QnhuhoOLpl+aewQG#zm}^{x{yJcvEBOE
zj170aR3||C8B^X<1Tg>K8e7p>o4df@F%Tlmm1%F;b`Ict>uFJ;Z~vj@OWsXjL`tMm
z)Tgp7{r_k>@3@}#{{NRKv`EM*B^Ad?W*R6JhwM!n6b{KqL@H5sMja9=nTL?QQYj@X
zlwBzqQK@LC#P9xeeSg>O{Bf@9qCTJZ`!$}=$9i%o_R5jOA813#b_L)QIX@a3>O@Lr
zJA8St@GW$8*G)`$8t-z<I7o$}0)aWD#+l4FRQGi0^s}7@4{nD%-07tA(y36@Tu(um
z!i2eJdw|5@bim*wUQZ_{T^>a6`xXIo=ps$y^UImD?8Cibnq~|{k%E&{F~Q_uIg!yL
z_W+?G!a5zRH{#)E9+yHA<XYz@1HB>K)R>v;%O;jR6W1ng$TQq@$jZgJdwaBIR&iDA
zvVGI-V~;9n&b%{VR{1ImiZz5lPDI!!h@nRuTMP@-<Q^tkrMiRr!b>wGSEjyXAl1p)
zOOY-*e3PLj+V7am{E|WC181G#^1~QXWu3EVKp}A-$#5S#sI_{nm^*eyyniGWzm(Hb
zScyMt;D!+_D1Zq*A*t&2n7G#=w<Er@kzH*3i^W6C&C7F^XQF9%C##G2iE|9J)=>-Y
zV5Q^<nIRj3PU+yG(^(5Itzs5s1&Pf=Sqsn_$@~9s#pKDT5Q|ujY3CiO2a-AN+X;oG
z3zARpK}uF+Q>fYR1dP?FtFJHQ2c(jsp#TyM`z?+%>xR!`59a0JemHYRL)=qDs%MSw
zEjbW!xd{%%9s9i4yn1<O^wb_6$Fu5-%4$EO&-|}H`EY;Be!o^w>+)s>@=8hQU>!R8
zPh}s8vPUuxC0melgts<pNNL1Fch5+tvo${+N0t=tdE~aui5<)FNFbePcSxWOWJr~W
zp*jRt<LrKU!9&y4)m1tLVnLhmbDUHMEZ8@l8s(*?S@Hel7#ZH(^>8_67hAEQcP+9A
zDJ%(N&wLYa=?SU_QTu*bZFJ?R&tJdV!GTr!HjL2|IUP#K%FuLGZa=^VuMhWR5ExhL
zSKBSqdIZnvWlxLAy@B59({s}+p1E7ZPX=Z!%6(hg)G<+==P;;(tUm_Tg}!b0>Z9%c
zI;KyS-=b;lC>Z{S>Gpbg8Z8IW5WFMN!^^?5?Aig<0Ysqi*D_1>balTzf7@ut9POD2
zWzF!1t}4Cs!i!j8t0n0gns+r6_n|Gv(edt9{nsES?b*M6w}Y}no~Za_P&pBH<I*?c
z>VK{vRbX`|CpAa@DDnz%6u7#++lkZX)goy`QC5vJ>6o2(gJ&ec$l`wVs(8m}j)gP}
z^iO9DLK-aV>+8EP<(`$0iYd4*&Wm=7mBh=tvf{Sf@7!iw{q48AzijMS@_nGT(hB?P
zs2IDM^FGe+m($&B@qi9B?dRzPcldX*p;m#1b$vV8WT=`TXx?f1{|LID?mi$g9&Crl
zO|eSP(3%$0X~6|IXUsMgj6G22DNE+WfAv&f*U|}e*d&KN;?3ayX);3~+1rCG0ItF;
zr?5vVUjK78hC@fDmdXIrKdHt!1M98NAgK|7E(LgbT8AahU^#5XbTlay$E_RW&Y*(%
z6ZhGxd?v-dc+HUgP&CPG8hIPYXGJn5Pi)#bWH9yp+HaTOIpI=4ytxkwL<WZUQX;8%
zj3VLW9!P~`{bTlKV3SM$Thq`R{8d^)Rveu%J?Z1J-|eSn`g0S)!)_&`m7N=N?%d;f
z`@kp4x$<M#lM-LB?%SRyqP286JhAvmP);x{CYfr5CyyVxO*CBYue?UH&NjX5zmA)z
zf|SiiAHFZgDIf{Yl$zHIDtzb!5k{o}Lcn@Mo0e~#^HU=Asq9R8_PirLbFQRlTCz}!
zZ%t+2qB(TA7uF3CwJo(0dD6`|MM)HJX3~p-f>l(ecgOO^m-{Y1_emT5ieyuxurjgj
z<odEqotFLX!-sNjliL(Za)?Ykm!g*(!)s+DP@*J3DgwqOPj0d1cHFuFWLdPSUpJht
z=Dj<Rj`#~WJo$XOJr|WP*!TIq(>FK|vFCHmWPK`q858<)u(fDpiGd_<F|tnU%NgQ8
zgarmv9q6}BY+rcLf_$~s9l0<g4z^1ec!8b}RtvX7pu*%i+);GoG_gauHMvi@YMLCK
z@aj|BOgV<|Khg7ZmJRfCeKWt`VG3!AQ7R%f)UA_+Y6Ge(eCzKIRySMoop4h5C}1Qo
z?5u5hJ5-uEJ1_EZ<eO7DZ6-N`-Upaq%Z-bg1L@f}40k!JpZ@Y5Kyw)Y2~{&k6q3Kb
z(GAiM6Zjy*L}a~zT_S1!iL<6$N=;D*a`?AZ;8<VYdY>4JPflW&UL|{#dmVIf18NXm
zLF~?%N0h0{*@CA;+SOV3!tpQic!5_SCCJn{b%X<b1ELqOU%UX4-6tsl1S2?E4~w8S
zN7$-|N@Ud0Wz*D>*MJxO{5UW%kje~^lCoc~6M=no)N<-A6KX=&ru2MpQE$e-;0C6q
zEt9H#`?IV>-Y(D>=5v|ADv;no-E{6LHW?q9%brgA<kYd|@#&YTf9ZK4>noVf@s1oW
z{hC`D%7*~{<n8lpXvIUoQzU=@gT{N$4!D?bD9duCc7@lNc4KAFb;P9Sl@FpDWV=De
z%J>4wT^2hMVC>Wi9gc0f5;W9OQpgjQy-)Ptu<qgOGe5bB%Mi;G3NvvdazY9)(Oha3
zERYo-z$wmsBvfVQ70(e!0e$=x#G2X#xLl-Uf)s-|$-v_uDSE$>I^Cya0D{f=zIX3l
zpR6b`+0y|LWV8~ni6i6c&A2D?|F~;JZ7$gb@WM2pe0oknkrx409gd4rC)7xwL*Ryw
zQ-%N+)%+CRO=Fx#=2(E6DX7p$X%bJ%k2i0B1y6NaeRS%O7jZK~=%v=X=;W-cFa1Gi
z1QJ9A$PB#Q*wjQe$_xGjII%ZKdTNttS<?|t3GLmh;vr{88x{9gi8KCfRFd)_xufSw
z8bZ)&xgphS2ilsd1tC}x*9?Pb_6=HU@by-`XV}00iZES#CxT<K20C-+QV#ATgNgqD
zie)tDU7bE4a=TSA49t2VE^@YDEpi3#Lg_1S1k4j{&@<yQhc;X9{FRcRMqb7pu=8`s
z;1o*<Z<JKtpGEf8w<SSDSev8fk{f`QaCV~0KXXK02v|FD-)DUaWCC~ZAU@<T3->GH
zkc8|zuBOz{QybhayvoPIMgb{V9USzEX*)U<SM^Sy%Y%S`@gB}GEmS*T*#>GpL4nwG
zG7}0X&&`!{N-V72KgT`edFCPf0)8w>#8e%S2^z&^md{Zj4YYLfN=nzpnSrSR%+g@W
z>>a7L+q8N9*MrI_H|QaYE6aeiB{Tw`3Re`722wCf>IgTc#NXhbs~&YjA)koKR0`3}
z9DLk7z)sTr`Q$8B;U`|*Q_0!IS@@C>K}@`qcnKDo*Z0`%ABT$QP&zkBY4m<C)PvyI
zvL9goJGV$$cu72`)GI%jWhtr|JPT|92`OOuNHPrh&AK~ddhK=NG*$@C{5kWu<H6g=
z6A-b5j5r$WM>TN3X5q!jU(Zc0PEnj0Pho1O@Z!_iLu5qxB!G#_2o^p~G<F|q%!vDa
zF3@d)Nuh3~?w5F$o3B?^uL*^|;;XaTgXxb4`F%yaH{oO|;E_yx_#K*1wqwVCGv2Oq
zH=3BWrnljdaam1I8dfSaifPhV_41n5R}a596>YG^dheF0|90-&X6Vo(-4dD_K0UHw
z#Z%R;37Ml`&-JbSl|Q8Zk)he1;ReILpZ0zpb|KTM{^q9xv-3Xn1UdV<+iaKP@f7r+
zkQGwVvT>zv`u)o<Xs|{*HD$KQW}x7{OB8hPJ;}&uCPNZ<(I~RH2mT1IA|rhUe?s6V
z3y+Hd^+xdlyk2gI%<+mN4_pP_MIZHK-Yj@PuN5m&31eh(auaY2fvhEetlieu&6{9l
zad^Yz?ls-N_CgY7IQp-7b5Bm{UbSN?M1qXqAk+J_S?}mgzsaO@CVu7zlkPh2$?!tH
zsk<j4VOJ=@y5iO2dui3)WboaQG1?LQ1Q&PrVDIARpC4Q~VY>U?#yUbphIS8shig~@
zP&QY@S6@?k`p>teyCRSr)@42FzrkJwAyH6}5&@y6j9!4al*&QUM!>;jdt>(mqDXo2
zq$SC|Y@gd4ysU9$QXJF$n%S&U<DBN$zv}<nO2ETKi*zI2>#7OKmV3hoT-9aXye9lv
ztA`UOcW*5op6Sz@Q8mAdx7zS4WaYkB`hAx-wb2>AglyzZ9AP|HF`M#3#}0_VlddxP
zhbp)@DIp1sn}^pQ1-v5%ELG_k;wc38qGnNETYjWzc3yC28E4iiHoIL(wDt%wgQ3>O
z2U1W}BsxqPazy+fLF-*yl!}Xs#{=)POy7ZQA?uThg7`qPE~6C_dJ~T&uadt^ix#!1
z<06>HAD}E-xe3Uv(o`Ddt}*>|wL0}P#|ozh)oYgm)0s17-1PvsM$OO7gtpFM5Le9K
z<QCe*K4CKoq(%vXTTj*o`9O?;RVRUt{Wv`KiJDHq=-4MMTDNXOB6F|$6XUv5(vZ%U
zOthwiIf$!W%%+rLD%(tMu50*{Q$h+6k4%bJ5x?SUAl@^%z-xu_Ivj%7mfc9zk#{d&
zmruVI;iJ^eWK15~)F--OGg1^m)I~dQZf@>O=oiT@ZqNo^D^~`Sbf_`XapMhXu|@4E
zsRx`x8+7u(O-STc+1A4M>CL+eo&Wml)Zd=hleaxO6yb0Gb9S1=y>lfaJA3iQ#q}WA
z8Pp1E8%9kciX7TOPXhHyLYI{7JPM-T#@xO#foCJY1yHG^So(aO6|~Fk#JS6;`x6%S
z4oNXB+W649kE9j_eV@^N?pNf~N|$CVzL;6)HHciz4FoY^yK{4k9oBbG##~p<G;>>{
zjgOqpa9{bpIXl~tqnKcDk3^)`T*G&3$;VkZF2eqJRzY#`Rw&$@>)xuAd8nGxsdOB2
zvv1;J@z!6lCg_n&&jn#g$ZLQ#2M-%);=`TCFo}H31cb9Q9l|%XwK@jz#D?k)0U<TD
zls#YftUWO+f>IL`lh?9k+&xF~eEDkP&jmqaQsBq<5h~GlCn}7cbSO_>t>gP;gI_UZ
zBl#^Af#mdIQzMO-Wi*}vJ>rJ@@PTmB*5uuJEngnQF)*m3#?@K9@P;@Z?5Lpv^>aDd
z>by1i*a#~etUTPJ?`rhgtIC|i^#6hmtF@_JYsJFkEa8C9ut<LJK#iGWOd)O;79$gv
z+41G%bq4bAuz#B$`sLq-Pb9XsoE)|IX`@$M&GuvF;|B1r9WgsiIRa%|q-g#&ZAt=P
zsI%R<`Hrg;rqb4uCnu>B;^0g;<IyxX$iVDU%1=!fHvDbpBsYDtd3|3+8<=tUN}N@X
zSy#{9x7u!}1VIgZC?O(<Z^^tHPBF`hR3adaYjOV21ijg_W}SmH<Fu;$GKiV!yZt?y
z`h{?xKrzH)GoI`2zTWY87iKsxmQI`M!C|<@#8lAC;Q-QdssRo23S((bLNq!_vc5o?
z<74gAb)F<S_t70W^4lOUj&<FhVZnXm9%Gdn+s<Qe>v!vRSPWeodLA3FrOzj?LAEDt
zHh#E#<w^+Yu@tFHfrMg1{N4opBHuzl4E#;(8fcf>v@(MGxW+p;><L7hn0-p&Xwrve
zE{hj$f8n%7oibf;d`jk}^nBo4RYMfJ?s6!X`uS__4H^RpyC4T=<cR@(+&An|$={Q|
zE-<T5aD*lS@K%u2q@%tu50H@llE~MfWM^jDws&%JO1L$EPez?_*rBStJmA{3)ltnh
zQc;jo27tA@MJgU1%L!Wq;Yj@bYX}D!b*%)|$@FgyP+vLBn9QooV@D2h`{`3HK~gBM
z5rE6hDDzJ&2Q(!Nd?*Aol`lYkPEJmKe*WXLehsmAkDJN$_tgs?jvPY<IJH~6#L@AQ
z+nAP*XF{qBf3JRQ^{1#n<G}3Vu`3>Z_;K}dUfvE)h%dRv3C`jh;VVTXs8zNU&<x~j
zqP@NYpJ>UFP`Ec4ZNptW=)dIM$Ns+-fXY>7NrF!_Z-RB&aha(V)@<dn1YSGq1w&6x
zA<A4GS;7=fm0vyjM(7OmJn+Q)>MYODZ+#YiQg7m%?Ztv}dw+MAaxpRLO6BJoFcjOR
zD>Q%Y7l|B+6^cXGVeq{UPg|Hwms12|3(B1h55h(roqeymb=^%x_0{)af;g(%o&NjH
z{fYmJ5+kJ63n9hp;9K{MiS6IImdI+V<xRVD-oAYsy*{$<&PW=aq7`*+#h-NddVFlL
zjmKZqwbhzBy{a4EJ$&YI#*VM6agUv#JI=4uJQ+K)jd+H~pI4lBkNM+SAn&@XtyWZ~
z-nhQ#FZ8=~Q6f9I-;5mw<`*ZHj*AQmN(s1h){GuOE@uc~czq5Tp>#2KA`3&a$k5*a
zvLJ=J>ex{Y!ID*GtV(lkKW<=J1y*8tdHIj5M|3{oK4-WA2LmbAdfc1RuCs4I?>MPS
zN=o{Wk}1)alsUm9X&wL<tIohn2D%L0wJ|(=a+LaBz7%6|#R9~L+Q!Pt)m6)WulPBA
zL3o|24bI>`8}!wtsO^gQt;d)wyYG_+4;kWyLVyqhsVS*mxlOji|AG#_+jYB%w+aOZ
z=u_oC%T_fHdvS8KQ4kTEjD@M{-nX0TK+v{`(aQ7{q*OC$sd6<I{`*hz@-*V7so`^D
zcw(<S`|N8p`J9}b<6M#&W@Z<h)K6$n&%)@EA5(RONpqpvb13nDVD#CwXO9vtA!?SJ
z6=@yllg>N+-H)c5{PC+dZpaL29#;~!4N|#+Y3FXgc+r;O^Om=w(8V1u4f~d0ZLA$?
z5cTN)lC(I1DM1Mf=BU{2_?nZzzG;Adr5^8$J9|Y_hb`xh+#U7h)>c~sE~sxa34s5{
zStG<I&xaUXCc)Ws52qUlxc@<>PjRoXG$}3mMr`X*O$t5L3XwM42<WXc?0WuU{?zc8
z&b8;1gN%2PeknjM?h=}5t+V~E&DtRcO-<R`5GHAH-V;-lRXThXB$YIX92QU@5*p8Q
z%%@N>4C}o2{vC2~iLrVeTN9X4Ak8D7w1IF8$V7wkV!_n~AP4+f`sq<8zQFMIk^A<^
zX+<f$ikVS;h?S1@8;!C+Mkvin@C{#8Fk#esYIV{9uTL#oj&MoL_(T<;tZS*dkhd+<
zqk+6u6>Myq|MBB7pKS{lEJ&tLA~Z{ZS0h8<h^FiBFn48eL_|9&{e=$`2!OT9T@xh2
zW>UzNn^_SI1;j4sjhxNs%bHm~D)3@*`KA|h{5xS2nVoiz9VbbJYzOY_RGJ)T@bwxC
zP_G}H-P`Vc!g9I+EPH(42_%#PQ@o^rLb?Grn_&=s|APS$EJ9?Ndgcszng=&3y`kap
z{g_+HcLN+786g4^Y_vh1F{j3K{yBS@`_uUsjqOH0jne6H=n+*YzD3ikQ4`b<?E-6Y
zGs-kB-|(ejQwxdDRU>kr5L1zQb*|YvVJ!<Ih;E4JA0D#Z8{@tA_3`wswWO_OFZ$L1
z7Bf?;`fl`|g~b#~GNOf6oeKhOK7`TGKUTI_hBtiDv59u*X%k&H3?w<b^N1UX*;7tW
z&|iUz5ZMX~f~wl3vi-@9k=ZrWpHGz+@17F-#A1Tg8o%@o*HvEU9j_buq2|K5a}pUh
z{;-d|dv(ZK)Hio)AjY5OeSH*H+PZd-sjYi^<tnScw3~N&I=0@&=Y5QR-zP1S9jdxm
z>bE?(?&=l8BQfVE&AIbo8T|9m=U+G$IQE)^q$ma@7o8v9GU?(6hO}*?SkAfOqe%l{
zTQfZFI4hsFhjhpNie38#h7^uK6(h5AP8Cv7F<nT;lXbxc#DA2&KI__M>n;LWB$G;X
z2;I9kDz{Ei;(%Zd)lJoIK{1qq!muOr+HmPoRlwz?=Ysm)O$5dhqa=XY?)c#5g=3Fr
zWqcpiZhUQBohul&2qO7S{qvWbvTEgYK#$gE;J}d3(9D2CXRX;jqMYD^^Z#`*F)3O)
z@TC1?4YMKdBL;H_#$2Hav>{-STZ8kVZnC|pw)^G1O`A4lUbzK1Fd||C3&HLuhYW39
zSN%pb5uv7TX)A{8Yl|ZvoQpRuqs<X!E*Te5Ya+cHQ=bB<5twOb(6_G^0LQ>3dA5-K
z{};!~-s`m9u}7O$#h1aEP(!%BA|k&TdF3Ld?$f6Q<1>8q9!ZEIv_H_-9-}eqw{FFl
z(Og7SEChCHAPoDRJ52%u1Lfdmv<#wcU5iyi=jJl{;?fLt#wq@&tCQ3-U0q$f@Z;<*
zG{u6UuqVX7$5G4<_3-eJKgKU*{-CeQe9kD&R0%0$SIBlG1shRQrS9|yf)L+@Pj$jI
zm_#%kk&8=VP0{4ZQDGPU2ndy^hr5p+*}?QQ^JmttIu!~RHVM$jRb8U}&c>ZSjd<gq
zKHA@6?}bR&2+SoZJv5SsQ6BT0_^x~m{O2}^5D5IuXpoaf9w&(<!#C~QzyBScxh~Cj
zkY2EjnFS8xA2*xiIpyBg^Je2muAX!mX)r-MX#ZP;hq)+RiaLKjTbT}>LyDzJ4!?qm
ziVDgCJTdHYx@4dt4B}Lo{IxxGpkSblXzqEGT(?wSwR11LqN&29Lax6$uP-1Z(;EdX
z_c8Sv!92<vhvzXp$>br|fAoZt4!0ho3fR8teq$At4t|bOGLgK-(TWjAGT3k4kXcgW
z+b|@Q+x75NTP_ZH!siyf>Z@gba`{Ytx!_wrft&C@8w9H|<$(`CA;J4%*ASK8SAbZQ
zNA^psr&4gpF-km?aNd}r_uj`kpUQKk{<fr!e+Q-i`T3VAaZ^yLbxO^*;Q)$#l63oa
zBRVY}ICBq1Fk(!`AdiV0fW+<eRs%EmY&N+7>F3TRF%p0;6XIlPGpkoy0SwB4SI~v+
z%6Ud|y>jb2K<Yo}>Lo+6y!L0W<sBBP)dRQ)CV8;FJGoFKQbX49z!UQBhOLz;#$hvp
z-!F%AgQXZ5vMD>5ei9&IDN*TCNB3ysqp!+R4uByS6jan_^ah-U-@bo`M;UuBV{Tvi
zn>}yC;-5GNUH16+w*P~Yw@rdJJ~;AZzVfGvuqWi3TSBa}Pt{Q1dN&Ik6PWBQ7l7m_
zaC1loE4a*_76-V^NgoMtFhp?Cx4m+>c_L*MP$sAO3iDj&B7ZK}i4(It>!-6b^w#}Q
zL_4v5oJn@Y+{1a9kD_v82I^<l9S%TFi=AKp?VC4m;Jtr&hQ_4HO$kE!bP5BLyasjD
zs=H1k%FMBLZl7Wt=KSYr^N-e+)BU7y!eL#vbm6+kvpbwvt?C_@5I*)(VK=q87i05=
z8|^8eLrJ~6<24HdS|(;vcR?Wu^VsFF*Z0i!#o0M`r@cB9)^3ye8eb=y55R4&)OH0l
z4D0;muYu#bb@ydfcOVxc<N^tTtbG1KJJn*WYw6fgPOlrEn$*2XRA$IHlahBm9a~<3
zgGOvpJ8@#hIJZcBt>77v{C%yM|Bf7y**DW5dVzk;oENv+Y%87{W~2tA9Nc|sHIxLP
z%!Y#pJ5d)(kWP!H;~=~^1BL%V&O-*?AB1+q);8_DJ%9X#oBJKZJ4E(@tgi{ItwxA+
zs0`3d&dbxq=MM%*+Zcjjl%|vJKvch!-sno2AlUN9>X@qgIis#QEC`Azjv3x?k?k~{
z(`Z!X$I6G3GUToQ?}kFp38B%Y5rb{rHRzs;dLnNs`1nzoW_ihyGUKQR9k(5dNC*Uu
z<&*&G88y7uINjbN@1g&aEE!pac5yD;%5oWzxv5MCFRUoV)h0!KvnGucC6Ne#Ta+uS
zFud*DvnL1y4<*{}_whic`dgjQ-UtGYFH?fXKyx!<hxeSu^OB4uy7v|+a2RZ0zgMM_
z=>QUj$<UW_q&~MnT{$At=;G*4Mz`??serYgc4=X2nY%*2AQPKJe}jC)TYDGF44tv5
z%(6?!Na^2RT_*8Kj3IJIULy^jZ89%jk~=<ql7}aOWyCsv9^TnvRk;Er|8PDXHM8i2
zDWaXmJeG?d*hq9~xD^HQ`=7W*CWO`KurnmUowOKnD_0CSoDXOS`d<L<mImA7-Q!l|
z{{2Go5ZEN+XV`jW!`-X*t_=t|)SJ68RR;9&qB?<8nZMV;Il(5BvXYKpt|q=Pu+mG$
zeD2<}XXGnF45XlZUn8Pcri>-3M~v(_X-KQE*O3F7jbxVk(UtAur{xqbXXZ#pjgUnv
z%nz<AYqoLtaJz-A_E6ja*eFImG;-R1u-Rxxc2*urD~eFgONqo{#}lzn58F{>kK_V_
zbuik&?Xk%Pllv3%{ipokeh}6`KS4MhsXXoCwu6h1oS}j20B*O{w+le6bg+i0*>F%m
z47tUDdCasddKu4P7s;F>;X;mAA)G0FcCMX>j8Lrg^w>K@9L}7P_i;~+(r91gz$bpB
z73a0w$mp>)8!vz$l~bFc2d>CJZQCtsir6KE;#tCq*%bZ_#dCy~(Kjf|y%EA6GvHXK
zq!vO+qmy}KHd`mRo#@T?eDb4eVlLyCti$(nb35TiL~lt}K^pCS1uLuAPZx|ys1tcb
z!qrfAF%<<Zo!s7{mxdb>>;x7RB~`rBr~GW=D1xJt^73wNUCT`=O5@^iO$B*vqG3b`
zB9H?PgD#}bu-j20rW2Pg;-{R1HEm@~II}54?!vr)d=&9A$6GLa2$(_U=I;nfHJDN&
zcUsui9B@;cAH}Zcc1D24U0wcZM{+}$)|=GS4JJ3Y2j2hiC-a<{$_W3KVu~2<zA!{g
z!a}t^xqf2jVNzj>p7n3H=E@D#*q-FLvQ9;}#LDk{vOU4VaxMY1?CP_HJCn_C1j69^
z>{j6V%Ra}C9_@-S`e~sXPlyS9W7b`m)`Rh|u>`M>R>r+d|GF&dHEOH7W_BMA_Zx7g
zGS%%??|V(Y?7MY09S}DpXH|}2m_Ci0z`3*UuEcM<tfGNF7>%=;!-L}blEo2$C%%u*
zyzkuS?u|FAV=W3UTA#e6u=UQ<&l4}kHmiI;v3~k9pG9NuR~>nDE4#|XTS@c9ZBcZ1
zdVbY<9z~z>>{)ASx?fq%I?|2NHVVu$yL!CtvEea0bw(H&8%xfP+=5h*R3G0J^_$P@
z+YW)OKxeoyn^2Z9(M(imbfrX}o+w<}ZI^?P>uile>Ul&CBdBkg8sE6r-mS<=G@ECZ
zS+`;*v7-Ua^+eJ|u?3hf-VaeVQwgf}?%liZzX)%D*!E27fmlK$7~tLHuz@4;_QMos
zwgrq~rbf-6sh^w`TASC6lndW^>n$@1c$e}NE?g7iYLiuq=6-aZeX)`XGIGVoNiX{<
z2L)1Q6MWPXcAs%_No+$IF%qN-KWay~$MDWAc9ySWODa8eH`+7ONSpdqa#~pQSTZ(E
zKzw&_bRN5pneSVvjbcumXbONQye+AcU<q7v%v%U~@L)6{6$1G1Z9z?p?=3A|p7nVJ
ziQsacP{6aX_mNsjhrtRIq6P>K{YZ=AJG3?L7y?Gak;Fr!FcWJrKahsQ75voD%HQLD
zDo4L7>s%%`P__#_%K?+}>Qy_j%>)LlGW0}p&hL<G=eo&UGtQy$;}z)usJ<zFs=C!k
z$g=dt!hKM!ffc1p9BX-WuaryF>uI~X*o%-GVYvKQBO@aiDG8ARGmw)4X`tlQOLEwj
zO}}=Bf?e?6QBlmv2TaXhb`3do$lkqesh6pcB_xR2b1ke1BKH*b9m3DX%tQi^Wiyi9
zDd9f+EjeJDnYM1xqCB;iM4j`f|JYsIHB0=jf#uayk%QjO!f42zJqHuIpFFqdHDsT;
z?GaFWeeJ5n2RDx_-Fg0ChC}JtHe2Ua-ng~dljK+`L=g!<g&`viu#43nFrW=oq)ayi
ze{-HUZ(~@Pwu0aHBRWmBTeogGZ#|<%uie79B%m5A1wc{mumcAVs(<Weg#e?%0#Quo
z4f80HD}wXA23czKe$zoDsuJ287bn%WC|Xd)b0r7P%7hw{@5=!UU=WSQQM6dpyeG4c
z&$;TQ$Yr+yobJK}Gx4GjU$)zL;?W5%Afp0mA?R8dHjZ79PW2+U30>`41{%wJ!Hyj}
z-d$71h0O+*aRv0DlDe^u-PLRGFyQ27`W+!pJ+?YS`0;~flsG2GkojNLpon2h3hCXF
zAzPXQq%F&%G#1&07+o0pO-U^7r<;{e|F$&xavxI-x0FE?%f98|D;L%Zf6GM3+tF)9
z^a0x*jRJ`AfpQ}w(pLaxq`arALe+Eo(W5^qa{8sqi(aiqLe7x4Hl%6@$B9L{DHrII
z>;o8^G^gnn*(-i%(&T5uGWSMf9UTw-N^;JKK<2C7{=cdw@I(-_X-p-6I@tzr;6fNT
z>jpcqsC$Ir;(*YN)7Mc;i*oP5T9&L(SZg)x#hh4rYs(dxNipJfuZ69GAr6((A6s{L
zt-GDe8~#U-#x;{es(*$jJ9nl@=D-xf9aS<?p@UYJ#!#Sg(JJWfUe9dfJ~NyQcIhO1
zVvBPLC;h&;_=b%~?V!PftEO8WOt3$>sZ)?5MT!&!2v;@Yk5H$x(J3L-Sr!Or4C*g#
zAKR_V9Yn`u$?S|{FqQM6ti1Kjf1eoBzI}aet-oJ=W%%cx{-1W94EwYrX3UyIE3LBw
z6IH%U^K*0ezm)RQipqj`-JDx~9l(Xji+;RX6K;Y1Wa|B8lC;^mxef6Tu1)Ll?~{s$
z>|=(g4oAYw@Sb>72Z(rOw<2L5v3Bj+#LM=vt99=Dg9wRYRFU~4@Y3fPvPLO5t?U3`
zrzCWN+z>i?M_RLnI&ak8dYnaKAO&-oSAHN&<v&{kAY+EiHp@`WjU2oeZKbV&ezD<y
zJ8qrvr&=EQiI6);;57%%UWb1bW>1(X@M6dZ5ht#~9>&D8S4S=SXEGVe9&rg`Q_=BC
z_?6GsoMbivJnwc`+?o@mVONGcTS>)6L$>ajqUDyg=0W#-{vP2Uv+c(AU48)WWLmBc
z`};>0(@~<wnLy)(SeX66yUo>ByH_i6TyDsZxvi`?6jKIYV#M&p^w$!usvP%NJbL-b
zO7Ow-yKbqQS}M6uJ`>kRQrc*`8S@s;pb*qmz_3Z&4Lyv?-X@05Sy$Tx)B|2zW|5v-
zTf1yq`~J=8qbL<n01slkqc}KdggOE>vw&03aeF@+({mcf9aHZEIeXt#y_l1ZX;G+l
z2xYodr8#RWnuui%mo8%OK)4eo7?Sj9I3=A|mcm2vsw^6h%trn&=+JGMUscQLD;Y^t
zMPiWV<P)fPupZ?!22F&gqGuCwkr2-SMyz-2)X96<CDbQ7$2npi^K-JGN0clq77y11
zXv5PC)$Hj`B#dq3m=uGPMOF4h!|ZwFw@U3Ph7tTj!Z8A(aVs*|DJ%AJ<?*#RN-AK~
z<#I!vD%pgR&n`9{Km>mF)z#mPWL~|jW)gbE9ZI<e$rXTNthLA@X)`vbzXW{P0IAeS
zAtUy+UbK;0E_Wp8DFa#`KaV(2@>B29<If{Iy{>iP{?KpVZT-$~>yE7Zr`4|4wy~yK
z=a;who9W*wopm81OyunjR~K+NB9i+*p+EX}bH5|ZNIrWzUEPpok2f+*yE!H9+$Sjo
zUd%LWBmtHv)9GS3gm*ZNIE}G@<p3A~$|<3%J^uS=;lfSnlqek_Z0imN2$Y|(xIYs!
zC6As0!~7F|dM|J9KjKo3w8ID)a4TK)Dbq!6kkP0VFvqZ?x2vlGJMFmJXrlc9z(W{x
zAyBC#g1Jt2EZ`2uy~W(D+wbmdl|XYQmKI&y%3{pQY9(SM?qrGfMI<2J727G)pEOOj
zJ1_=?zJJKLjj?%Rm+DkzwHRy}VnZ$jn%<~KFGaY3ySK;m^Nq!o0G2F84|7E-_c=WN
zHxtbPMD&4PHQRS*@?PXHc;|9s*@saFJwEfX0IC=`oyNzaSvtOFY#3}QB`3v=jQkQx
zi!MWQNr5$wls&E<5Uti(+3eSj!Dd5j-rV?jc3U#N;J_0%o#>Ng!$1<C+bU-!TmfKT
z5GP7rWYEKC-n)1Zs7<H?1jotqE$$h#q3d?LtY#h!gOHj@PoMAaWMZLvHuF$3?hib{
zX%I!5Jlr8Uc(^Z4`Tma}1d(_Af<W+0@;7)mE`7T$17T0KzJ>edUAOVSM@B+R11ECB
z0Ulm>t7-GDrJd%FtgZWeoGP_R`MHB&0EYd#g#UHJtiZlqE70|Q5fKu8_~gm1Ba0u~
zYqdAfQg0a+w)<6!h?ku*Ue0cJuZz8&{@@L_g6<88gc1N2qzn{s8~ipNo>WycE+VQG
z!N9LvsR^(x|FT3XW<)4gN$R2ZWvuZg{u-{zPK3n~c^$y<OVtk6b)%~9V)DhPQ-C6!
z7cOj$+Y{6qz^{OBE?O+!x`l;BIU4bT2#-UbS6bj^lya6w0$oV*F1r4W@D!iBnqe3K
zOAzdw6(b^H#GnvhT<vh+R4M@sV0>~d9)bntnu<>jPWfX%i=a1f;6S*=;md>AEYtyC
zIVkx#KtXV6>(dJb;mOO(LvoUBdiNQMd@;kXtNUzBAHZa8EqQI!jlOe}mbkbC(nq2J
z{Qr}?XtTxcPJ8@3e;@z5!;X)1%MyHxln-8xszg2?2MX8R_0y3cwen2}yyV`{)YR-W
zJ6gXMlo7Ra;NA4(WJP8rnRVA&%&QeMKbRCF1c=Nx&Wxv|<!1=v`pts3E7ow@I-2^_
zVW^NBac(IqK=W`@&K-B=_=$e~6oT(Qpebn@q+v7Hy|;7`ZQCi(zwR7~@Jf`+o4j9e
z3T>)!@!*!hm@V>d%54&#`tRFw%sSdK=G4iP`faA7TY6HF7fl7|*ycYDDVaHssQm8j
zjVv!%CxKz5^+uf_Iu9fZESc=_CgbPQ(gjTGVMla8o1Qalm3_e;<rKmtfPNi;kL`>Z
z?L{1mAa+E$!J^8dC)1cV3DrUFg@~NcTpYaoiReSX0=7kYRh5aw1nd8=1-OIc{`tna
z;3vr_8~~66<U{Ixz}&kz(pJNJu3(<L>33uLGs94~Rd8Lj-&){1{l@0Km8l0XYGhRX
z(|ZQurwY;a-@0{Zk+7HKJ^1EaMxO&dWO;tALT6I_V8%hcrRT&E1YhV-4Tj`e@lHm{
z2p*5*Ozt3r{L-}0nX!ZBHbRTUAVGU=!_Yrt#8`2V_5%m*HH^=E@F2Tlf5~Xug`fPg
zd%dXDv3YO!!S4~-`YtX{<^k7G0X6|~qqA$abmXq^G^?tX`?XsRo~EtJg+)80Z}<^J
zFWLQqWTG}}Wkdo=N5(Arwmqx8x$<iGd7!mF10SGaLHjvh-EdBuFLuNjE}D~HFW_xh
zE-NESj<f4|^BmATG*o6P-LjrO9qzEQLx)Jk=_WsMNu++G_PM6~y@OH`h5-pprEr9)
z;5HJfo&m*#3fu86^*0?G8|67p5!Fl}j{;}?v(6E#5ly=4gfTVP=8Sh5rGX5%K*vv8
zDP<M-AesXRgJ*+7;^IoXSX%S+%y+4Ar^Syd-3A~CarGsashR8fcbTdo!G(5q!NXi9
z>G8~metko&1N8SMZf*Mx9X7A!>b^UTL2|Zjjv$Qe>#%>#>eVig+Po2gxcCNo^*4T)
z=-i8_tx+6d__H++Q6{$KV{9_KSR&&f;1uZAlDRhMRQz)umNcSi5zaLBNkGP21HxW9
z?;05nf%J3F%#(VQ@G$-;KZdcmGCD0XnKR2L`nhixyKO_qk8dfOd7fb*?QTL}Lq;b4
zD2ff;WIt~7=xtACt>CHirj_rr1DA!)zZg<Gklx_N;t;D^x;n_WbLlUJ3t7NpHZO%-
zR(|UNI!N^-^uGWAv~U6KlqMEy@jqjAK^eL2;$J*gC=;oc!5KE|J1rTRe5l3tN!bQ|
z^&#bRhp`A@)L7C2n@c(ZoeD^ng64~1VOQQz5*8E|Dlv7`y1%u%e-ix?w4S7Pkhg{k
zh_4Ivca95}GpI8YDPY?Bf0^xgG_6+VtIMngicYB+b8>S<>Ba%diG3S+8l?<a@Y2vv
zaVQr!^Hu@Ci4{eR0xBv#gA=I?g<cbzJ=$#KCY{TM9L-wc<u#PvL4p84Q4)OHK_kGT
zN)QIg5dsXHB&9V4m4O6qkoNncOJ*r#htzYTFUPBeh(x`8smp7f?wK*vMzHYlb2GFO
z+`NzoP-=_8gjx(~sWhDlTWsAIE*y2Hf&LT$Vi%E1dPMn6IqP9uZr%4c)t4v%0Gy7*
z>J1n;Q0BHsIm$ga`%`Th@dm7YNi0w16UkT#QCuZ_15lEL`)&6;pe=X|@yUa?*u@SL
zYYCyBK&hf7pq(_IN9BeVBZ!jA<@Ln{UVM8#rIhCkC@cy~petk&LDT%0m`Yl_x#Ju|
z32|6|x>I1{)QfxnocMa78y|$-|1j1tqMLSeDUif7L-|GZi={76_IN9f4feYTgmHQa
z(#6ia(PI*7T9|PaYzSOWv9+yG@30f~U40v2Gi~*4{E!1%aX7P>aXvRrur(?l7t`?c
z@7}yjMF==9+N>ey(5@xlzpqN)ft(iDrAy3_N`1~J{?Kggd}Ik!h3s`19|(>h+)1x%
zx{66JUhHkL_O;V_ZIiz36o@uiIB9iXehE>xlCDS=qA-qxu5c_uR8q+e(~h3>?Z>U#
zd3lFA_SV|W|AJQU=!@oyvQLcX7JK%OU&e=J5_KfN8^dn}7sR?9`bHj!kPAt%8OQOW
z@c;xoq3^u7Agl6YDNwQmlnChu4$oBEh6Ssw(ACIQFQ^rWQEJlycqICN<&2T{Y$oWL
zgc>5t?DZb-tt+$HMK%b?L<zR*fJV0e>$f*lM}9kk<bo3~a_2rxU5m~~rZ5c(sU3UG
zZPI7ks&J)wRdEwe%3zfQzbF<4Mj}4J(9^vjg5hPvXUfFQIrdBEf>pz3#BgYXEl6zr
z(4j-yZcIVj;qdz)=j?LN#pgHP%Xk$<wQfsj_4JD%;_a%O4u4xa`}&GzQ0eqZ=sb~C
z@H{%92o-HN2P}PUhsn2<`NNa!;M(L7!AJrnuptMGvW2SyrP;T^KJ)(l6dqNQ`vvde
zn;V7pN`I7o|HHDNLyP&|b{__#@mV&TK8O7qaO%`QSDN2_^bJ;Fbkv6~Et)#qngWh=
zaDX>dxWs-a1iVl2-QD)A|EB{HW54+otC}=<Hm4<vgdqUwh}|ic0-*Jzvf$!^hokZA
z7&fJ0(h=3>X?2z)0WnLf=(MKZ6?;?OAf0OkrdwoRg)C^hjM2=dZ~BpALvE20=EJ9>
zV2X#}=EM~iLwXgoPrkOJ$mXO>P{it?RQ~t^hqP7Sx*T-EKd}`}J3n`Gas$PIVtM&5
z(2c4sP3c5<T+FuzsZIP<_fOkeK&Kz|^(iSSN{BaQej0oc$xaV?d5?37i(31w*?8OD
z0%+9MnH^P+d}v3DrQhP{gl<&Aj>K|e571d;<6p>k22Tk<VS)bYci3``f+s?>LImFI
zLyot|;UKWVsrh}YR<D+gh3-dCPYyONzeCe*zlLonvOJOf7oaR9@;-C#?r34QGn`(o
zbK3d=FUvrLh1+0d@Jw~A)(!%@VHwDQ&4I4C>ck$3-YI9lo0tzCTtIJTdNmQyg#3<F
znUJIvXT<LXx#CEnFoG~K9{&(ZxPtM4&YzziHTUh(z4r(BNw|1xxl3saDIL#UzHF42
z2LLWJ>8K&MUA}>{oRjh8)umnd>n;lxjGWL4FFSdhEkaYNaKyU>(XG#k?VTod$E?J4
zomiAq)+kLrSLf6Ty+>F6j_X^JIk=%bbFa_ad4xkz7%avbY0R8CbM*dx&0ybjlJ+uL
zlICzj04Y7|&Tji_O_wMJ2S}nCePNk_UcP=eU%<z;Si7Ys25I5x4txHc_y$~FA{(Q_
zAVs(W(;wEr+i+AkrDdmqT;fOr$3TCss(=fwqjqgxRgBpmbq{rTGn+#Zs|F>5MwmGm
zbq^mUj!L*}Ne;lPHht3=?vD_!Zxt28kefhL37$_x0rIaE&Dx5M5yE+B+!V<Kpwd%T
znGc7{#xy{%pZ)?ebfdt&s9Wxf%nea`udRy~qo&zC?DCLtF+-syP^fL6v3t9Y>Fww{
zXIDCm@80&9RRqJWwT^C|)cx7`FqQ=POF-tQR)5bCQCVpLxOo4C+QLIq{O{;*W=8+}
z<XRhzknav#&OfSce6nWfoP(e><^{8cAV7=S5V#PDD15DU^AT5`=W@~jN7-6YKluGS
zmN!EK%h8?x<x6u$Qv}UD%Zimwmc59W=GTI`$LJO~kqB>Qnu^1(fnfm$?{%Gr{?5j6
z;SwZ#thNpNvW<){CG=J)PF>L%W*<|_2!{tiTAb68^9*5#vt|9>y;DQ;q5?I{PE9yj
z{+*F1B}-0rGy(?$G^H|2VrF<KF&F=-en!P5yka5Jl@Xk|B9Re;-qxxj_AgT;@(T(A
zDNqi2kRnp7v%k{EYQa6TcB_hN7tEXYtkm`K!-v~*PO&5VI{g!}>=I;dXu8>`ee+*r
zNy}nx_#P7U`sAV98zHNliD=t-dH($tt4zMzk6-U&<uv5xLJw-T2lEsEu3~uc!3`59
zoSc+v_4eS03*`KHe%N#H;Fz=?i)$G{OxZ^LSy^;T-EP8>v-BNwthV#!eTd%gqv7IQ
ze&plJugy-3J1^-!U`L{ohT`G|lJ<Ia&ciIMH;&u1xX($%sjBwp*n;>@CkVMYQZKxB
zEtc#{Z${58dP<pDiVWtE<(fX^E=XX<-_yyrVT8)h`nSJ>5#6GY_k4RR&TI+Qe=_{d
zM$udX$OgmIiW-)xjydtE&g)DmohjDjXccU2c#J|wN;1^!b@hp-#+i5#E=B7}Nj=Mh
zOHYwj_gb$Ts?*=J3rjCY<%nkPM)+9h{av9nz<`jO)*`;)kp|PcO58WbFdGhwwuOCw
z<v|!5Q9l#TO88G!MAo?F8ed+kjhI~2(?oMj>07{+_S=uWT{fq>u|UTFXM$HR$-9|&
zS}}bNK3XlXY<jC?uo&@5(a_SQ?5=E0wMt|IU2Q5JJJy3DPb?_y<5(>yqR%Ph+gyXT
zG$yF09rdUv(y)y`dHmS9&&7q5j~qu$0GTldv6Yk3(j@kW6JCMu8uQ<!<u9sLfCVT*
zh2@YV8IX=UE|o<)c3U6r2SGIWFNGX%SJ%|^LffGTnZ|F6$VSHqbKswks~B%f;Mudi
z#rOn~3mHKEs&w>XGvcBWyL?dTgzn)2KC;xeQ>Q~+yB6HuC?Pl)D^>9x(?k%MU-IUP
zCLEv5lqqc>O&ambw&|Fz&+bYh5Hqp!V|*5nycbm`3H7iyT!%3bFwj+i5zyER_MWFj
zKvtX%-qwN<J2IMynJ}FF;srW0Cvk_EVJUWH<`d(+9h&|+1e|7IY^*Z#@*-0cJ-jLJ
zmy^sOiZMopenmZ<3C5!$uN}!w@|!mu*u@ltKftPmM3*u3+}m=CiB1F2xXdFRwECN&
zOmIbR1lw0YRnVl|Yx}FH)z9l&+06aeBDl|n<Zb$WTWPl4ENuil`d&YNE*k@NBLDC&
zQ>ILTF1Zc$V(R`tr_<4B$OR$}v78`24iLG5#Z+yi+s#aYUAHi8FF|*cHkINzlx$A3
zv(CAs<QEs8+WTNGkn75oUt7iq6~nO(DuHQ~b-S}3BLvqj4akU_dMAFXK~FvJTrgr|
zo}zB|d+On_-+~l0+or!B<?$!wRFm*mj7t1<=y^T6v*U+Yj|I}o=%7^OJ7C<4oT=ls
zIyFZr7CrnGb%^ts$@wvB=_c^0P(q*z<~X}P`F+*#wMZTunmWOI#BNz>elYLUPtArV
zEMdW1`2e&78(B`EyL8K}{9#Q;0o?#92LbSc_>s_N(evb3h!11)5?BAgFCC!ln3J$x
zM4}*H^z^J?TY5Iu)v3a3d9CKJao5m(x#)6U-N^(0@KnXOhAB((RS_t3Fbp6Yrw@-9
z5`inIGqfztZR*T2O1=#(4j>%i4)Xr;L}S5g^4ZGV+#pv!U!f83HJrL-t&;AH*a_*(
z$lOzAxmxpL@n^Nt^6R`kPO+lPR`-@)|L#3~+B9gS-R1efq(}+<j0SK<Uen}Cu{#_7
zB}5%kqu64IW!D*cldY&+-?zV5R`53NH{*EzjA#YT{vEf>{a5Zyw8bxePBk)%7;Pl|
z=<oW(-(B>R&|8r4Kwwn))yU&*0nr4>M6<-kmb3$$^_uAJH`4PU>BP!dK{AC@A^Z4d
z_-W2Dv=F{hBvGXOgvMpM_R$eS+G+KN)_)L>g$r-9Un)r9k_`*ur9W6VlyRm20odZX
znA|Xy`l{RDSQ3Li!tMg9Uop?D`#DJ3OX4#m>WCMEAlaU$N89g37bMjbfD+h<SYBzz
zE6~Ptm}EZDuhWN&N`i1)KqTRu15TdoR^hc4*Q?O^$OPxinS+1p#ME!iR8@-&qU0Nr
zllOR+6vgq2-wa@RBQvx>zhVB3nVBLFlbaO~g$Ww!V%sGTfc8h$eq`iUcqQCe%{VxC
z@2iUAt~hj>R5Y`Dr5op`aMP02+iP!7dSTT}6YEjd<sN$?vURLCBsYVHB-TZ`GEN_n
z`{LeV0}`p!NWpNp9TSF-LWjcpw(R{N@!rcgD%_bf|5mH$*`vp%T^4)qcRktigmx$Y
zpA{iaWlO;eIrBxC|338qBz-{0Kf4jMDQMOYmaU@v+HaV^0gM3z9pq}2_8cAJK}RPS
zjgTNX3ydku)|3e_a#9QZ#hKumG=rjAENV<n(fN6cA+m-Ows4Z*C=Qm8EY{1k#B~YD
zE6Elh23GnUq3~#q&aG?k?G&PjiiMwp4*~<Q82Az6q;PR>VRqn9kAwCZIH}I3KT@l6
zMz;;nuAr&Df)`wQY|>Nwyqj$W_7dNQme%m<S0MQKHDndw)*3Zxlq5iiUXBH1xlpBR
z6vcnLFij$4K{cpl!Hv%morFY1oW~-%+@(q65O*gqtutmdj9NE&`{uscq~F2<4UHM+
zOn}7yW%~&mOoh2V*|~+A4u>mEvf`ND&*$hE{ErEBSKAnI=eN-)wxw?kjw;S-&Vv++
zYHB%g{5MikstVqVk5ljo(d@%I1FHjbZ<rP}W$ILMCb40gs;Z{&`L+l5{;=#CkbBYO
zGmW`$gQobQu%k`S?XCjwE=LsKPsB5j9B`@V%7Wo6NJitI2`5WTGtyn-)w*Y0GB!`U
z=iTd6+yUzUuBrdwM2fKq5&L)SVzM^7)Z99HbYsV{ANn5Aeh<b$FP)$<XZSF5$RkXc
zQa}ZGV#7IB&X-sA8$F2vGVFLJ=H`mmkBy^P1|2NZcCN*Kbk_%nAU39rURzKQ*hUd9
z2&4ws4OSzeRi>zxI~{)d;6X|6$R($&so$4kai5Fs5swTa)(w;z+%e^3M;MTd2gm-!
zZ1tFpKRTNHvCOhklr6JP=Ge~awRhFCb`yKCBgOfL5DeZ*u17+g?$!_8xUWA&98lp6
zon7<SoF0J9kqezNq+w;lYro}V!wp}5xENZtcy{h~)k$w3f;r;=`fJHYR2wbct#4j9
z5se>^9Q*LWA<G+?c~2(Y>XjekvNQfxoUzNaOX)Kp#oUI+CBBcfd|P+Q<eslYQ@H;4
z?h{aL`Rhh*S>pM<S|hR_930Y?1YINj6MK8k956qn#naC##ON)_9TY6;)OVCsV$c^)
zg@d6%!7uJONl3)v*0S$+k+PB}PGJgPr+dG@$n(WV4S$GkXajFUf<I&hb0dmHYL|n<
zEpN=rt^b<h{NY}>?_29HR_jO<qH{}yq<TW|BmE{ylSb_4E-uyGlq{~z(%>J~F{&5|
zF5zVCQ#4}i#u`Ss2@fF&BN$DD@|45W#mx-^Y;!!Cm@<Xtq~M~kN^m^xLb<@m%xH9(
z62Ht={6*4vVMEQ^7k}xs*?M%URhY*!d<`PS7bb|$!{KJ+7BS(Z;b&{si?HXNVrTPF
zaHUI1spLB{;M88vZv!5ojz~_JXORL$xyIWs7`eVfYS5w6iFK-Ivl?`h-I|6fOD2`i
z*A|V;jEzHD><fF+8*f5vY!}*cQs#u01~R#Q=g!f$v8~59q!q3wev8j3Tr6FwfDgXD
z2Il7b!(zS-nxHJBwvunI{(k+z73ezgI1{Cf?g$^Q3%rhku2%h|L*sET+uNtFi~3H#
zA#kUH0tF;7r$sFyUgLgAfyb$!;?r7^4n5(zC>w&2o@2rbf*LBnbb_B2wgV9L9Y>^Z
z=46glpdlVCA4SL*<t|aH`8Svj(I3Oy(xPk!*@o!YJ!y)z%bmgs-!QXREodS1!#*q<
zEXEUt);S%=8%L1k5u3Ra07FCo2f8H(BLozsZsooQ+5ggiOuh0g!rK3qQ|&nK@qY?Z
z823{(2fddr<u|H2G`0q3*mURJu#<Yl-TpuM)-Lp+*`9mX`s-LUCQ^awbfl5}L9n*r
z%AeQ-;6)v}UL<IN+3F*gwK_(Q6YAI03EeC$JAvRUNS?9OM~kC)F+|~b_R}2c;dO0R
zP{EdqmYvmOApW>f@4yqnjw-9FssaePQ2H8X>V==S;Ao^sHotaK%_X|^KVZb8^Z$4R
z7hJh!$G<e2i3eq9n};kFxNy{D0_4QSb$!0VPdu;-wt{nx4?Xnt?kolKvaueTlWmH6
z=xw%rTDn>-!~RCE?){3K?;zBX@N!`fL4ajwJpzG3&*!Z+;<FQfJD0skxWPuKg@|LK
z-LjD7N)KN2y?$8W1Ut>_m%^wL*G^W#x>IKK@U#Jfi%V#*6c&6sk%`B&Y1xw0;KW{s
zY|sL6yo#4RV|M>#@9$c*8)KrqrB@lz1l?+L2VXg&KmSujHtqF+Q}GtNwQ*Leb<e$M
z*|^)-@VplJ9X0GdOa~)D1g*sYmrr@Ws&M<jx{k-#GBf8^3?TP0R(1TShy|yA$KF#Y
z+NIy}@nkC!QKp~m@#xZrSU=4>t`%<`Tf$STDhE|u(Kj0sJFs!xo^6TOJ&ei|&mCe*
zBj*na4{wcVosw{#_9Q2J4ttqN$YsTIHta{pit(W}Uk6L16B!k9N55H;xSfg)vP9en
zp;fidjPzvEw**-J^_MDOFI`DeiqAIcJzP%Ht}W{-d|SU+1CJK1<qJr!E^69AYpQ#Y
zV^pc-Hfh>*@x7Uvwxq1G+g=8_&TC*H%2r}9HPno2LtT|9*eIGD5{G`c`TIv<sMx5D
zYdNjH+%+CD<W2NilxrJ-oIi(ahIgo@Zv8=(C@dL47)4<P6|1CQ1upEvtLRnOEi^sF
z|F_<nPo18pg@65Bc4F#Ktug&3F4|`5+~?)7SIRYQu3R-*`uf=EtO1d8YerV@9diH)
zdX3SmD$kWm?xme{^f~Y@^~KV^JHEMEppiC7$*xjk_qa|Czsk*)H*x4ZG$7#nYF*3S
zcCly^=DR<ty;WA@_n~zT@uM=*iv&hWd9#2Oefqc>1+;8!#-(bUT9K1ws-u>hdY|fn
zmY%g5^X8@!xzijtrf6g6G{u{SI?hJ-ufgk^H7?EU`i}{Lb3=NrvCZnoTm=aWB*?WS
z_eNW}GHOdc#nbow-wa7)Dk27<pZ(mw?Np!-F#<0daN*K`(x48NzLpB-hMoADN}MAU
zbI&n_Wo2)V#|@;!LcGQeOcG2tu)ueSS%jS-Fq<M1r0iwkZ^+1lp7(M4q0rH|GkX6q
zVavE@$7##hs*vY-YODxAci4^*$S)-WhoYWu!*91ao)urb>{^af*U8F_%slS>Z)8o!
zLuc8AYo<rNiDk6i&YcSRQ-%*k&A!dfE%9CFF<GLLMe`h-V?BBfmcoF;Cd7)yFaFZ3
z@O?ShF43Fk1HZYoGjP{xx*_H%Y8o_RNNo{nU7c*E)7o@T#(Ot|vGyG(S<!0>@9O#H
zBhe2PSVu&sK$R|~AyT4O`@4fvZhu_dyAhD6Ds&__+{(Y-)D<r$nUM0fy=3fo=H1-f
zZQ?O!Zd4KaOZa1H>BB9HQEnL(ttK_Jv%*N*VUxJXU^*`S?fTKnt#>0voPakayQJ<l
zW5q|t-?1HvWP+IC7>Fc@poT4(kDLweD|E&W$sCQ(*UzX8FKIG*s*+O9E9>g0!8=QG
zp4`qViA!(vzpy3|;Yx>#jfPi^4=TRD6GcX!)!*JL&f8J5G2!89;{$rL^cn6Zwi(D9
z-k=b6v;=?5bL4A}$&wo1rg%OW3wIi+2gA?4C<MMr>Ex567%_T4blDOD2Eoh4$t&~b
z@p~}*L*|k?=9{j}ZrHp8K(WMsWBeke$8mYb5?Y@!V^tnF(@IC@?&VViEdsFHocEuJ
z+eRd^5;G5pDq0;feFx<)Dy9UYmHzkOky)TLQA+iY@Mt!VN4fQlxc=(qf0MfJ-SO;e
ziC+EN$ItwnEl`vX9Uqn+72DJ%=<td084unsyGDU!&!HPopqi@qxTh+^UtNgHZBSl}
zZB*2FtBPVm6rAr)Zv<zA$hM@$`)DS%LpkSu*1bqfV`b65EAR*qIU@&;-(jjf0$|gH
zs_4hVCF`X9;zwN$zB$@RCO!<y>NiBcq$vfIgn2Hij=HeO#%^?J*^P19O{h3we)2EB
z@wDM-Q*ltRr8#;z5=e?YfHSsNln>oDPqBi$Iz<T&kzsSlO1PScXG6fi_3K773Ro)l
zOTt3iA;n8R4q19_7R{^ZRHFQU4dz%BXFo-Ra4nQOJuUmbRGVX@w1T?=_e#~yx)7d%
z_|rdxPc4TOkTJK^?|s^D7w0G>-;j|dMTuzzUUE^&?35R0=ST;R<PyvvcgEM-R%Xi$
z7rPI{Pr@`$Fv3-f?5bC7h`%%U%N(a$Yum}6N7%&NMOv6w-VuKL;&+W9SjWq&j|@sY
z6iYb+nlXRB?!Ohi#$=HjR_0#1D3o)A-$F;t?T0;<OybR8rV?6Od^vI1ppH`xNIOqu
z2(7ux)77Z(z`tZ2P_p4>r53+3HFBSKZqIKW)I_OqKJYE|gJs*nC|Kf;#xh!m14<#M
zxz@b2De%1Lr_d|=d8SW~XcLs2q<q)xoi~R{q)9<j_Y#eq_7hp|24-fh6nv}RjH`e0
ztOiX+)!^esZMNdyxcfSoAWTxsuDyNh+^1pU^ywK+#xH9$FZ6^V{N?HQ><YOXj(bL?
z8Q4BBY|+$4#%PP@e*XS6>)*9wuQ;h}|8L*tZV?eF$^1W(6iysK3<^I{gOYRne+)Y%
z_=70xsDobBhB`x<;I@N0<hFf>XyH=Uu+AYV6*{|UthZizcj46{aR}0$D_bszB;AL_
zDj~=m9stDLbsfDb^~{nVy}U%{UcK5hE!)WZ-+4o``f=0`3HW!^(OzQ?RhC4-$G(F*
z)SLAC<;?plSFBjIwo}_Z8Ma0jX4`_tG2RxFi%12QGH4vrZ~T-VWX4Ep{j_QO-k7=Q
zym=JgZ=?u6X>sJ<0d=XbDNCnXk)&2qaS`?a*rmpp*p1f~Llkq`7(rJ^tG|jjE0RTZ
zhtBITVw5L>Q;{hu=a|^HSod8pvLQa94Mt`wZYhxMItwRS>+`KXLTHs-=UORoL#+$g
zh&W@2We5{>hFJFEI?(6u%3nle->n;D)!pd0LuiI13Li5Z9N0}Dj*^Nu&k7^4g}?p%
z%V)rM7eo(;yFV}gmWT<34%G3@Y+iIVag`&64h{Iber4vbZrH))r~O(2VT9Og1bP})
zJ#0{2j*7%{^HGRjlR*KfjN4EAoqnj#fGw@IZ#J0n#%xAJ1a~1vV9Kw-oZ3Qz1k5z}
zKu1;fw5-Q2hv2ZqPg?1Q4$LpRe^CzVDW}3MGl!U8<_Gc*bm}<mJ3CrOm~TFlv?$j4
z>Z~*O-;N+6fq%%<?%Q{_<r%L*2ZpIQn|g=;FTfb^isPw+SKob$&0Za;rYc8VmjpHq
zUl_e$f9~ZQY45&%(ZkON_|HG#qwZ9zOjM3eER{d{C#(BhL(x}-!GydkD{nutuNDzf
zz_A5%5KE&rTfOen)m@M#JQ0OdTw;~y5TI74L{%Y43a(?l`Wf+JU<wNY3U#@#%UgM3
zn+1awyEDCh+NnuvOt!cxZQ%$6TcyAF)8~>Sp`mmY@te3Q(Wb~$XaNF17Oe&hwP#uZ
zrx%<ZE^jgrr(Ds}XI@w7OfF=$&y{skW1rl8KDc$$#^n_ipKE698~m$k5pgVc7h%#k
zm07r}VP(flr=W?G;{5v2eIz!NE~#I%mwASEu`X{j=iW3+Y)g*An5;!ZRA#Qpp4&G*
zB7>Vk2E`tTa4`HbQRltaAm%1;Amevq2;$cli`9#At=Y&*Ur>%GRfi^on?_}p)%bK+
z(!zAl(9UY%NSOa5&_7#vhm}?n-`jovL-)u1)R{ozd53vyArKuMt=8(GlSn6aQmB84
z#Gw>{v0xqA+_P=hOZwG0b2RrK<J|>PF%N0o*W0`kNYC`h&?k0eWDvQMUdXs@VlrAP
z2e~|c-UXmm23QfIj(%ckev%@{9i#a?hqlGBJ{Xoo27`keQDN5K95>hC-U0Oh?jMTQ
zWV;`xE3Y<izmnUP`_!7ne){DEyjJ);tuFWx^~%f%W%vbxX$Q=7KhilOq9F^ssHf_l
zmX|OK6LQgB3RZ@9IaKwl3mTlfy?+H-MVZ_vvIF{K8SEjJ8yFe{4yf~1`rcyLS~_j2
zcz9*{fmjDw!a_RjHHuCojYXop<}X~>h)M+&X1x{kGKC6o{7?=RhyVn>Uft)i@e8Qu
zxZW0}wOqc)#lVp)6qf;!;@F{+A7tV9q~CUCTMzEkn44569L^&~b%}-8J@ZiR@qMdd
zY$#4d)JYwlTg}Ml^~Q%*H6jp#Uw>P~Q+pGCHX$K;_uqTo!b;)k93XKqjHMGynMoZ$
zG$r2$Rl$n6aSb~fKP>I}51o?!E{9p>UrifQUcUE3C@32FloBUpkA{@O3PjeAzDiuu
zitb4>;-=iJe*Mo?a`#NO+!iVry4>k^dc2{A<0RANhfO(5x@=@ci4z#-6z^UH0=)9r
z+|!EuuvxzjT2z@kU+uBo-c?W6;Z14WB$cNFe5y0Y)wvC--+gSs{7bJ}o{b=g-_$j#
z|9B?ZNGPB9n(8V)d$!vL{w=m^i9S9sSPAS{N?eSya_z{#sE!>=E}nVkb$hP6#@>A+
z$7`dgmbo7xXO~}_C3Q<}tw7KGh^6~yH_f?sb1U^fauUZw7MltC2ds5{a(IM-=fjN>
z7f<NFfT`!_jyzjq*088wdzO0HZuOI=ro~OE{*hkbHOu_itF!6ZLzI7Iu7B}tv1wq3
zhD-O|FGyS}Hj1~Gd5>bm$jl7Q9T1ux1QAQ^!dhnP^7x4pQ~phBZ!%iByVu`4DXqQh
zX1A+I=v`93<=BF~jqHP8TUU1tFY&jTS`%2Y;nkw*rk^Yu1rGH}Ju%Py^`gtB^STVu
zfz_3(gSG}qhA!1kI!gcz7J7?~R71mFRJM04xllseQcqSQ<t!J%_1XQl8PO0E5P;`L
zbl);+fK6hxxCa@{lK6CV0%<PKN{4rqsvPSU+gfHMp*`z@8HKX{!?q|GJegv{;i^1h
zyv$Mp5RYnP;5M>&w5@xO!}|s>b@AGYz^+=GsrBbxsT>QRfVl%?30b>WVVXMY*!+P}
zF`7T7WUL3=l?+j=&lJjfPktvN7Ik&k<j|JN6N4)4Cx=e~S2%GWkS;RA1J-9JxHKzZ
z+ue$LdG0HFkoI@ZGuM0;4n3w6{P-UAdmDixkNa(>eyg6YG!fdHaW5jS$6yH093TuU
z;*>>GZg$bXNFyNA>}2wBSa^8gu>}$@OeBKMz?LgG{!cs9U}$p1{PSmf1b!3LfwG$)
zPr<Tsxn{IU^H;B4iB=BPp~!|%9j-W)aTiY0iH{VMIr5gw;n>>yRsg}js09Lv0+98l
zG2$|d?<Nh8crehsi|&y`mKF6mm^bpz-1vGMDvJ3*B4vvvH96DbyBfNlgN|xcnC~Q>
zoj5G$Q56!9RdiP=KueSkba)T0FNg*+_Hs_;QUn3Sw4(qG_|@~DPF0yu-x>gyf_Z`z
zuUL3+Cb*rjDnO%{O=MI7W%jFIuTU6aeo{?v3bkBQ{bx+Lot@p;?`iT4gtz2mr~gMs
z<ukt_=k;rI_l4se=bA3JnBz9Y_2!m{eq&;(|DdY}Vda4LKjc^aYH7V+vqM~Ge*%R3
zr`*^uv(M$&E<N3~f@Lff5T?*39F~%uO{*s1cW9W!W<g{xn`M!+C)TgX&F!?bP#9az
z_i5>a;5+1*`@e@x$m4T`)AoGhjv9z)%PmNeV{vC7XdSU~<p7f{!%_C38*73gmUJ?1
zAD=a3Lc1Nbp=9Fb61xO+l;V}3YjzhC-Cg_Mi+C~n^ThgFSNyHNA9yvz-*LQOGoR|i
z?LljR8RB8s*1bs+g1E_W(^qwvkA<xl2Wxovhi}y-3rG@z235>iro^DO_Sq|SO6u=D
z3)ml19XhG|y!!OKZVLyQ9kOh_J7Q$#78dPX&4z5FJ_TVbKv0Z}A(g<<0Kz}mF|OFh
zWPI)2%|=Wicj|!~aL~i+r%#+nLS-9_l!yL{<mAv6ggiYhr-F+GTckrwfuevf?6f5W
z1%<p=0SGam<}6L`w9m@V_e#Suq-P@0YTuy)4w5aQR&`COFFFkLv?Z)FEAa2BQ`^dj
zBhh0+Cvhu@G!A|-r|eEH=Y#M|?8C~1S8z6<Df9aNag4&1^M1+`REz*Jt$&s)y|ISo
z)S2`fRJIF*C7;TM_pk#l7EqiwTUj@UtN~?7KR|XY@&~9*?+fZ~T$m~DFXr}ZzxiPg
z&r)j$FXES#rSL%DDAsGPN&XCsf~3z7m$!wt=lT3_7ezPy0c|qX;g9cF|Eka=Y)K!b
z1ePGKd(3WMu4o<~6B#U1kN$Mf_)i2}Cmbw^#kvVUgcK%J?AQ$-MmI1PLA;>2&e`xV
zANlNHTZEcdennDBK0h;is|1CRaw)Z&=%R#K=A`0O;DizF`Z+?|d5MBjLNy1UJ-g(}
z?;@lb<rwQ_1QUwFRo^m<E0`nY%<V=w$5|)0zc{4m`D2bAy|Z8vdVYhiR|_?SlYxj!
zMxC)8?n0aE+^w`pBqbGSoBW?wnTdlM6kQ9dmoleMV@VX1LXVgCgOA5azeAjV^io4{
zAfRo6oNA6_g<=8Gl#uLF-%3%1rD*>#mWoe4il{&accd)8*pOsS=r@Ol1=yL4@kWXU
zxCT=4v+AtB43(s`Y00c0l9;c}T{Hi!HR1inZ>$AP(DkbvFEL6FbxL1Z^FBC4N}gGR
zV@V>lRqgv$cx&@F8y`yQ5^AxY6A*!OTM!%xASfq*IP#LmYaGiM!C#j0TdQA3BSjXe
zBevwb?n?ArYv#7NZQI7oSSuyG=fL)Ts)H#kf$9Hm`kb_|6p8k_s~aOSZCC29m8Q=j
zh2#e+Mnv(ys;?v}UhNu2Rkm$ev4x6}S*xT?uODdTY<;yTjc@ixRD_uQ#F;a8ASm!Y
zeg1jgL)=IgF6@a5)n)CXq(NS&O8J1M?n{=bn_;_Vk<)!*C!E~L>`dC?NieVj2e#!b
zV~t1>3k;F`Apc)Kost6>`h@U@Vog-Y9DpLkbE9xk+CPULn^l(tAjHI=^`!NZIq`go
z_SH*KG`7zVy=nKrSK)d>RQ&QLp4(IEm-oAH<!EU$w~4Ab*S~H?hqc7N{>`w=Zgqr(
zu?yv}LUbr&Bw#JAg1kD3$C1rTz+mg7xMl4uo6KLc@7A(5?=z;YGIMPomFcrU?b!u?
z07>OsYc@uyO4`TEiGW_iQb#um>@$h7r$zhr_x{ycY8>0$eTPTn^k9yMLaxdYOx>t1
zd#lN?KmaIqnqsqYT}BeRKP1qPdO(V9`N|p^6cl1qA&24?>R=Jz(_27qi$;~70rm9q
z-KSVF^U9nN6mrCD79u)!jceoQ=a)>7Y#*I*I4GHF*o-MpR-D8_ej(4oXOQrwnleNp
z{hzy6b6g000Zf(l+;Z~CN$j}j1?{;g>FszWe~tli<K*A~@6@!c+?k@fEPBnP&Tcn~
zK782734!*6!kSI}CtRLh5J!ll(D0o#@17Y%YeA!gC}_m^@lSTvdBC&{T3MpQAu9<f
z5+8$`4a|UFTvSpb(Q<if$}B}n3Nl7vkh>*z>mAoAMJ|nZ6NP}vlwU^AeP#4G-}R5J
zo`ON5go>tZgc|{Ww!t+c!UQL$9h}_Jq)h`<;29I}apA)50D7~}{Qbdm-<|`;%D$CM
zy6cR)OeQG8YbYc^L(Doq=_8{NsikFjDPqnhD1nA@n^7MMt4I9I`A?%6X-3^ZTg&g~
z(^xPPM&L`a@jI>Gjf@c^Nr<FH(n$!2%KW&uB7_!?d9w3Rur-bhIaCN7l7UV8&=dD=
zxssv5GCaDRc`q8cA|)t;sz({xSR*9mxog4qT8W&|^)1q8rzJ1ERIL?;58{yTmu}Dv
zg&cz;upj;^wEm^EB%!JemcN&Z{_?wui#dtnDGR0laWoF){mU>lyu*b!@Kof+pEXMj
z)`_?^Pzwx_=P$WXCh-r$6yPd!{eLc%d64M295)XBILJV2b9uV;ua@Fak-R90ZsS4I
zE%329vU!yocnWB^|KQx4{i++jQ@CBtwx()Mm*H%GZJp7hO(FAxlw>K;0^?cAr?^am
z!X-ych4H04KS?Y;w!or1N$KU~#WSyLRyr2!j>Z>GMZ&Bk5>9c%_^ZqfsVJWF+c#;C
zUYEeX`mangV`g~H)@#*O|BtFS0qZ$!-?)E5LQ=9MTVk@4vJ*)Olfu}E7Rpk|sH{Z_
zWyv;F_UwCMw9-x}(j-X{86qX2EGh5j`Zdr0eP2h%bIdcN-*SKN`?{~|yw3ByVsF={
z282@oB-XRnhF=FK6;-T~*<#pCe_#IDw($`u)VQpbV>>QQXA?wrd|`TZ&h6nR78?KV
z-W*p)B<6`oW&B}F3nh}efdUMGi4t9!9U=;*)~1A(X`Zw+RS#}@x7g5k;ed%dLtofZ
zp5b62o>*ey0H6v}PzKe~?W`X?@B>|o50&$_9+!!70{Ic+Ect#uKPxOR&W&loa`%l2
zBeaW)y^x)0C@Tlokr$Ha&rF~EMmOf>&6}C(o}{MiVnvjZUg#dUt!fZbTUuaPAX6??
z{gTdAZ<~I>f>B^8xoGRTH()>84LV4oQ}U>7dMoN9A&CnImXSnl`prwlLWL<~TkGw(
z^`6EM5`Cie7_|6wx;ReOxy~CW9L~Au6n`+uPGK?G#r9pTyYnw~k4|<jUQAR0IGWmX
zU%tyu%ldWQo;?f(Qk@+${H^!YTUNfNh6dd(HcCb;DF)miu2&kK#B{g6>(c?jOWFsi
z5(S0bCc{nXqDzn<9J!pF1E$9!`w{S@^C^NYvFoM)kzAShmkjO65KeAATdUZncpbSs
zi)}?CS^!^JQts4<<DGV<Ia*u;o^Ol%jqIhnjPa)Jof4Kfk}FI`nnKBuu}&O|3Ovwd
z?yL+m12QLd=R8O~HfvTO-Zzi88URGBeCF8(zo38reohap^yoT597RmB%ub|b(i)rs
z>bdmB{-XiW^2#_kxH%ST{!`AicmGAIaiu03x&mo*v*`73M4uqWtW?S_NG7D402v;0
zPN`%B)&uEBXfLU0>GUxIs@I&!S#rfWvm>QYa#~s=97b-TYb-EYbZA^?C=@;AFS3}3
zIA|7L-Dt@TkAfA*L$E!#Btn{`O$hVw(*~oo&5WB(9qSl05bH=G&`CdyD}fuCT=PTr
z`TUMFRfxuzZfnzSPa0gf&JU{w@i&Libn~&Zfr9n$r(6Ln9^On75^{=+xZ<{uX9*%l
zh+s`F_Su7Z1DbHZ`86do2+=D;YT%tOhy5~!;2>ra+#+!~`dVrYLYOlmQ^|$tL!^O;
z0)(v5uoVn(9C7u~Bue@tw>WfqK+?za0<ivI#Fh+C0ruR2tM|x$n;BY2FljC=aY?f=
z8=m!h*%AIeUP{6O5oRGUS>6jROArnU2_6<7iRAnR*w<}&rec3Vp6W7ngvDfXXc;vb
zNENGb$KmnhUou01^4IgJGS?#|e*gD+8FLD2S)VFEsN-PcQ6s~;Hz5W8mJ{x?i6Ejs
zWJvU!QG!oZmtKQLaf?{+kPShm4YIuT>ybO=85!-%b01ed*%tC@{EHLao1jGle8(YB
zK7wEKkpY~9>@I$)YOkaMu4}B)4U>Q&Svt`8qQUna?Ur2Gk@G$$a4X^yyq84nBz6ou
z4o1RB`+!V{r)57t7ipOv1QNcu9Y!ajikN+=<0H?4aY=$J-7j^g`4Yec7o~d;mS@C^
zmm~yiJ(2IB%E72;Ysy#jV8RX@;_+6Kti`?d=V~GR$?$L6^K;HKi4Cf@{2ThZV<@m>
z>Fum}ri4_eOH%4m$BuDwI-TMQG{}0887QJ(KWwe7UH`@Y?xz`Vux|n*^Mr84c^Gm%
z)z!@nVD(5!E}bpnoY0YqiX0+cc3tzT7iAtv-;!nNzVDu!%0c>skb?~A;Y({xthKmp
z1+3v>jPo$bBeWCs9q>m|2%g$|etf~XNsSAxo`eAEaiF4*@_|sKh*B}~+qiMAu1(yv
z^`JYj4n?I*1z)u?bnvnxDP@L@n{#{+utsBvFZ_sQ2TfXSyJfaS8sVLHA39XvyuWsm
zPR@bQSI?YUM|+wb9<_d?5I_kLk_Ji18xWvVc6z7<1X0izfjXJ*w3GMvnfdEybKdpf
z<aT^hrd_?n^52}iW@$qR27cn52Ohr)YMP}*MuQC`<weA_O@PZly<%u8(RbF^jKn`+
z(Zd{)QW1jG;U+s;T688V0D}<xk_61;>J|`TRLz&YDV20)>0(RxrqwE*F4orqL+&m+
zaGK|F1@jmP%whv44Cq^t!CL>4M7+s7x-+fgT6MsPP9IYjauY_?-i3`yInk(K6ty<d
zWi$7&xKN238{UzGVbds29-|nT{4AM~U`8bn<<bvmUSJ`36B62Fwg?53Lh<3oXKOk<
z>3tkd=eZm)bZ`<)4&}8pD`+#)8sKmVJDn#_1m&9X1>r9_8e#oEE&%n89a&gMN9Y}E
zKWz6n3KyI1k=nJqjIp0I+N>LSv}RP_nGqW<6&LlScgb0nMlHpoTngZsPT%Wos2erO
zl?rbjIahvpdD_h5&zT-5q#IHIp=WGpiX;YJt~eI!=^e`)H-$`42HJ|h0j^n~WVrT=
zUl7b#090|fLPaKBWrkbQThIQr90rJlS-yeXARC8s`3$Ph=3^2V6vN=l?V#)6v>|PY
zwI5;{NrHUwqB$K+UICK?dI;idOZi9&&w=o{rAf9z6s<?rLpzLnOPH~QRMGXxh&{}O
zQK6#Uqk|^1mAkgMd#s_s%F5;zlYzd{BnlDC#1sbn448$+IJy_PV@WtEi`qvHhM*;h
z5F+r0j~<C1oAmN6EqAWb^EWm)5HfWCwfsig&Kyc8z_WWa0|<q}!F5N0N67?wy|bBr
z*X^?3We6ojEIp+UE4I&R^0IYT;}tFr*JlerA-1MG8JYG5;b84y5k#}eaK7|wWoUA1
zY~-krjEpM_jGGqRv)fM46Z9JqCBz#D@iKCo9h$}kdT3rI1&25l<eZ=8;`IiEQDPjG
z+~H=b<&7t1Q>3LJRFOfHdQ`HQcF={Dp>}acFg0L6?|%Y$!HtIw&9h&81TFxW?dJQR
zSWt&sR$Gsw@z@4C3TKBlWGVtTpk98ubo<%tcQV2h^>NnXqj<T?y$2yr;1IGIY<=51
zb)s1%(pHG7n;G0ke`eR=!_UfA@I99^?(G;%WqC<h{E|9KVnfr(>zlXloL^^%r0q@C
zv|M(hLREBi!Rh_se~A^z;_7<G1i``F8C>S^g=a`cE`>HJxSD)D{BvHXjPmIeK_JS(
z)s|QR3?Cjh=NEGsByzwx1PyTd>U+eoBjadU!&gl8@&hlxbteK^zv5SMN0kkps=7^;
z?YC7%+L2DDX$ZK*l??*h@lAO-WXHEZ8*wU8StvFD&VfE%%RH<_U=vGc-UGSdn4pr~
z<GL~ew;>&<yugp~l5@)}ODhI@$h<0QH9&KO=WIH*KPGzjXJ<b5{`!ohUFsts&f(+L
z6~Nh!uB9Yfhpb!U0prO3r%7cxWy+pxmoZ=?V%Q4$&oku%-F(c2jpe9FyT!K!GWX*j
zS<F;pvKk2&e?uxpLW=Fxqb%}m9!U^tBWZSZXWAwyJ*8Byzu3lLe~ttg`OwnQa=zVC
zkV!F3pk@~F5u6fv7oYF1+dzD{I#7p$V+a$z_nb&|QU<>RPaJ-9>)qSsPE8l+^)lm~
z?=2R3l+NUIT-^vvi9G$ie|U%;E;q90%;AQ#plImvP=pTBed|X2Y^G0tyy2>KwS#_O
zC>a$I*AmcP8X%lZ_zhy}bLLDJG9~F<EooGyZeYAOW@RJNKt6{XDDXNw&n~`=2Hu+I
zb@Z}6gbeOIXiFSu2A^W80t0N&k2Qv~#u0o!R#lBffKTa$B96q8Gwg2`maf8@jK*^d
z;OB{6Q#ii6$31f0FoG`}9N7(`_1?P@qyV(Nnyx8{nK<Jm1D+_pP^&FEFcxq7wKYi#
zRs@<KuV=euQp=q5t?i@teb>ch1kHFvg7zOP(mF(6TTdY52maTSnyo1>qaTTbF&?d5
zn6^N>pPNU;H}QmVkp!SD9h>V$W%gIuL;3zKPB%W$T>GS1l}WK?{}nF!nrgzS9;8`H
zp}Lu!i^v*;dlQK+h+EuWImL@5^P~$C7^)Cxiak`srnq?Q_S|KsPo1j#{+wtoBV40#
z4U>>VR29C%ML*_b=-_WRD{KahGls<CRCWsw#jUh4a1rl9>WE%L56tO_>xG;pvKN?F
zWCrpk-eXcB0J2<ky;#TplPgbp8i{?150MdgrgvpNtQm?tDITZ+)>Y=rz>WsGZEq?X
z){!G4brsj-)y(qtBzEH1g*H%zPQdvYF7qVGMa)f26=YSEK+uAOEGRd;n03x<bCKG`
zv)y1)Wf@oq2|#9zNqb33O4`0se<PKLH{KZ7et%2O4xvoKotVS}UtM!}huXW;aCBmP
z>1Z-R`kf+lvS*Nj%egM#Ejz}h>|av!XAGH;`+;Wf)JPIBI2B<ds8|wo_d``SbhSp-
zese5+dOh9#R|8T{f%MUF12G8g1tni+W8=MtGwF=o;;elXTLi|4sdU#)P2aw=?ZKCn
z(C7hEFm?tuSU0k;OZ^<eg_d#S1{>3idS0{He%A1S!)df_z_LdAN2k^K3@M`gZ^Ak7
zN%%@Lq=R*cMr$tfEuDN;nU)9c*l`O!IBwS&kF)mNUBKQgYZcatQKj@L$6Uy{I<P<|
zYY8bGB6i1wqiy&PFhP8B5umSn;oE5L4-NWCG9HxsZYP&V<n2S_x^86mGXi@<x4YPN
zGnXaFdUh0gaJpi6#a^m-3$^&%)RIqKDa>W=qC3+wJ?F%W>$S7<wd<c15wpxcgpAN=
z+<5u#zl5%K#H1Z$Uc?S_qDs3lE-v!O5o2azh)FR)>`GyFTAK5ugq_AZT8{dO3)}%^
z1SzH_6=O869nDUiMqLaI9CORziXt)O@3L4Z!H7&6fk{v$5=7!oO;-gstzKZVGi^nB
z|I}x$ul|flad>s>rTd|=t>d=&Mn0Ptv9fYV@q^m7_m*{Q*(A-pgU;6D27iYP)$$Lh
z=;>r+<zKk#Md&W?%%R)6t8Bkd3DYDvT_zOI`QX32ZDS1$7;PUT=6BQRW_9e<%=C-~
zZueb<p?zsOW0GXVH8;n_MNit0vr*@+lG*Fx9)|WykYv(+p*HBlbtVa5R(K<5CW{Zz
zaRT2XI}#=W$h#QB#ktPuxlib5Xn}5tV1m~3&x)PvW!Q^mn>LuA1Uz_PEv6yjx=yzl
z>&7eCQ36<;&yOT21XXDwvqq_cMJ^19bDLCvgd)-|X<#FQqd6`)v^cbBygy=G^qz}!
z8_k18P{=6P8qtTM?W+PV2*--D=LM%g?s=GeXn~h=B8RE>Zr-?Y(wOEZ4Gx^H$eA%;
zNGpP5!t^D15X)kYB4Ln95iC)<%9LRB6~fTsszBWhTzWpca9iU&jhi-wRi;z>KB2a$
zP}nf@A!Iu{cO!w10CWIzCqb3zlzs1OJ^g9}%nF9tkwe%Ndm<U!^P^13XA2ts;qBv5
zT&R>BQe#|UNZ8ysuV(9izUm~r6iV=o$GZ!ZEox;@Q<wtyI>;P1uXYe_;AcS{5@I5F
zb_IhOM7M;ADNap)zP}M32iK*6O48o8t@>wk`z*OmGPiSW`T<}>8M=vwrAy?andgTl
zKXD-Iu8QlsC8e!LMa9d_=hnW^h?}yO$oBK+&&>lj{sGKoXt=A7eqkqwUN_&qb|2Dr
z;#jT`0994jyT*Tgtx7Y$mvORt6B;+f37O4EiUX|owyI|R5Ig;XXQ81(TL757+SF#A
zDS$os2v!ne5oFYMocjdOm*I^|&U^PodjEah+sYqVZnw`G?+-fVXFp<_e235Q?cSD@
zICNr`6sEfbTujQPmm0YtGq8chWC|HjdN#zF^jAx5wp<DAuq`x{Ym8K1=r=;sn_fDb
ze*5p`Z34}O`^-0!9-Fv7f=!tb@P!ix-0L;xDC8t+Khk<hUL#%GQP;J18%}FW=7vL*
z1hx_yE6wsF%}PK6H{Npao!eLZS7i1cx`dm#hjd`1lSUuG8&gmr1|7<N!~tI8#bciy
zWCk5N?1=ji+Z4`){Vztm_UAlGfRmA7(xhc~N~t<4t))}^G<b7N`QS#9zafJI&(!FU
z17bz6xuz7FfJ|bbF+5~>k$HG!sH>&9x!^g#a5IhujR1qDqrf?rBpys~Yr@ah{X`WC
zd7yL4Cgex_@DJkcBksenr876AjU&vi0=ubacc&($v_3C_4g*<ylK8~Ab|4?5%>)Is
zlbj6Hjn%r>*7xHU21FVIDk+W$3K9OuDDOPSTMY0V^7K9!YlAJ@w}#~9<P7YbYDGAN
zW%-J0T?Ty%CWV9Z&b}RObgF^xuhlP2nmswl(qK<W51c9X@4r3!x7EiFUnJ^gXLotl
zOtn66XzJs_?=&u#<9tAYExrhBzP8y~&;4IguGQnbk%Hf*pWBbFh(dutp#9hh+M73<
z>LG4u_tjyZjlmSnTdOjK3wxyD#O#OfD=#jr8TT{;x<xwwM~FojOllc#!D*rGU^4^5
z5lU*LY%JXNAv^(?6kN5nq-q2+t)F<w1=(hFy?tu*CZzq0u46l;_zQR!@L?UF9cS;(
znUX~JQc_R22Y?dQ9=c@HklAE@xzym4M{I!O`Rh)ztpJ51u-8JUIP_BIa!@FU1JZ&<
zt7L%^Xt;Bi-Qy%B1dS2WhB-VasReJ&E037hwXP7QLAsV(N2Ybqcv@)tkB+78?N1&=
zD2?BZD^vL}AWr?@glrcJ6e=%D7rfa^bq;eB^mpr#_LK6`lM(mMABR?4&2yp3Phq+Z
zTxj{g85vfRw#nRWcZS8#{F3p`)W$fPLE=wFjZxhv^R?QHYD+v<`t-S!S!-VsQvT#h
zeLB42s!GlEFej(sfzz`_dm!=_h4Y*x=|^Z4jG*^TFN|R9m_CeIg*jb{7l%AOvfZ(@
zuc_WD-4n<Eq&YpPr}K!3(EyZ`<Y8QH)Y@Xrz%?c<DsoIV_JdQ~)Ku(`!Kjt{n%qF_
zCB~kp=brv4C9h$nxubqwhKl>p6Hl}4%&ly~M?YdNmP{F;9m_@lVcMAIGM1dNw(wW=
zdhkEp8O4J)t`*lepI_C5C&*ESsDAK1u}FH>L#H>dZ{_sHT$`4&l4{*0@aC=3ag7T1
z1qYMoaG>{o)(vY1Bea$A;sdG^v&UHL!n2Pqi_CY=ca4fUx9<TH>&*#`8|KXNlwJKO
z#+I9wqK~=@sFQoz6zqcffyx0i37D<X+@7Pc9H(+a9$^Bj3d~du2`#col+3|t1&p?9
zWztSp_sJBx5ls+&x!5^kC^@p`OSVr`G2n`@kMgcAlx8b68Wikr>wLmpV<orX$`fKM
z(ZNI-lXWE=yII-VkcNq^##y`6kK|XE^FQI5k(!ov&2_MjI&}^{pkC?GcMjLQ!sHwZ
zhkU})WzJV-zF<4U1RtqAotv7IAX9y+*4}&lC)e)k1$i0<z4jh*R%@{FjM0+NP`uy^
z7CM!943l(Srn7_1k`+U46X$%olX02j`l#4<r;jWktmv@f3j2E*I3~^Z=o$j>@7UI$
zBs<&HY4HK^#X%>-JYa!CIZ&_oYGcWs0{3?tlIk)@(m9?x`~WcE6m6RkzC@-y7NgV+
zLqoOO+Kdc}ZK`bJ?B1wkC%R=m!=YVmjIWk)y@=`892XuP5NahO$03cV5h|e+(coBi
zCm*se-LmCN1G|kpTE73E5g**z1*dPCa7uS-YsdM(-4w$z_TW<fb?0NQ&JEVS*gvjh
z!pDf=)y~}$4rAwIZl1a;->_iU8o3m++g{JGx|TQML#h>751w}sT9(^a*1ASL+sqp8
zI5#`g|B3}Wgbw>o-04Jp^zEB~*Mb!QA&!k(-LamQahj@?f83e;3{_QC2j;x+Z1G6l
zg8es1HBLs+!@{)+eh1JK)bFr$BjyB>%!5*a1ns#yT-|Ew!Mr2ei_2|{C<>Ct{qsl0
zf~&?R{o^(pE_d(h6tro@2?m@Kr{<jKKVrn8jQN^BQlTm@92TM*(%`FE$-)+D4Q!lK
zZ&|TT!4!X7+IeRDd@6r7B1gpUcy|J;aBjMEakP4B(&DxTn^t7U0OLG8$tB%ino$i0
z1^Q-;GfmC7Qtyw0N!hoo$mA#}t_|E9plRXWkWY-Xp4P?H!tIJwC$FDll+=Vh+fRpt
zGE0+-h^1LRaEqDFFfK~Eg>bI_`D5v`JeNVl3ofISCD%nJtCZ?d>t1brrHlR2-*x5?
za_Est#~pBqHemzzUfEfhnalo~Gq|=c{pxUNq_-}!<6#2JY>a}Ed4r1_oG9pqk2iU4
z#Gv1{9ap`mM_P>vhtZS<JsWp1<ER~3_h~%c7WM=c^wsh_*zv#wp$|LOH%!=_v!sJ|
zLhh<zTeQZi>N^CrSMfcTf6~}~U2;HZSGz!n+=HtI8iy8lnO0l_x6v<W$tH&GTshs%
zBW1GF>28b;5K6NQY@uZ&lpto^G~0SyW*I`?J^I(3&R@*VWD7A$<kS!dd}_Zt^YN&J
zg#P07`F01U$(!m$`$;zd)>g#qORI_kyWNfqIhXrBZ`_p%11F<hNe%4>9P7UIiOUeW
zuDi!S=&zEug|ab@8~|wYP|5-)qdcTGzJ`asWMnijbZ~e><uvx+6>5&n`vwm;NM&np
zhvNCOcztcRMF%LCoS$WUNciP*b*Faw<GJreKSmQtCEC-w6Foc7Z#x=iCkkRKw&X0S
zTov4#SH9AAUxwA%{@I7`_2#k|vT<fcPzU${cV}ar=jNMPyq5+;dxKr+yA#n~s(rW}
zdcU`?;Q>QmlZ@{V|4{PX<?3L5B?%N3D2TAc-UpDwGH!K)<`!LB$P5aG{BFGO<i`rb
zml96Fods93+@44KEg#6+G_cvUK)M>scf!|wGP*<9@VcdIiUrNhd85dQ)VpjC$;<w+
zn~$)${+pR83sgP5yjt|%`$Egm)7T`&=`?e)$P?Io@F1IvJZiIXX5$)|o_^?Ugn3Hh
z3+bkGaEjVpWft?`>{f*Bf6ctgyV`Gg!dJfY0&j!prpPAzk_^L)jMsJ?<vJ)oy0%vN
zQMPv-FcEnpb4*3O3UQ<P*bJxrrf-{`c01k8*~y6gSM+eKj;2*rfBSEpl_f<7X$}&@
zdG2mD3)&-84E)5k_piS>-=PC+MK8OOn|$}=Nfniqhj(e{Y4_3dS1!Zujps6PHw0=Y
z(G3V0|4}U@!%E*=6aAxUP-lW#2TF30*bX{)zbtTjm%haNShvtn<qewB>~2Q;;k#A{
z=eZ58ZQ9mP4lbe}=yZHnhA@o^IN<0K!A+4EC=^#-by?NGv4`x`Ree+WN_x@tH<=~P
z>lJrUtNDy>4hFuaI*S8BhI--OV7`tG;N}|;`l{2|jIQz}?YqkPwb_(eJVt+aojjs(
z9Ub$qRvY6oS0_&HIiY86?m*`%HKYGVQ$2-Yg-Z+ecy(uoH{a6Yot&JVix&;vVyd@&
z#i9dF#xh5o%IB_K=T!~pU9h9Tk^|(u9ui07LTRvb^OFqaZed5=cWPJWq&b&;5wSPh
z+@B9P#{T%@j6>r%ifjXAybUls5E@EzfvQXFyFY!>&%NvKMr#0pUDBcfdzSa(OAZ(?
zF$D>0*rdet^re3vgtee{Q)hYq?2*QM<=t7$2B9Z0G#YaY@!au_HU?&fc$U=X&L>PQ
zI$)qOh!+{eCnp~Os9a!_7qB%n{AQIdxrf+D0jV+mCFo9Zi`%M{rQjgsU#aId5QuaQ
zjHQo;kM{rGmphA*aOb8@aB;$+#rI*|MH=SN&&jCuWB-aq7L)0cu%`iMTLki3Y^mEl
z_<Hnr6^UB|yDT)W=B_p)gTfny43?eY!8gJw2D#P=rgkge=yrE2C!Vj~s-9gs#$L=>
z0^Ntf5e)LR{P9ZO)=*VX`TTqK+_$yafxYbA5F=}naK<KGT4>1vrPLJzr7NZw?|sqZ
z<EjzL`@XZ|_1V1C=*cY`A7H^c-VHco!HDyeMFYe`MHVyZ5^|XnSwcyua$<CL&I29u
z?k$77v^fnrmN@s4Y=gFu>6vG=Kl8-dsibnv*{SEV+*+{4XACP>GflaA-ANY5KOX;D
zGG7vLl26^THyo1wO}aRJ`o0c`_|G7MOUE=Ggb-<I_%SrS0ZE?BizB3)m5;4h#R=pk
z__6NP{BHvoQ}vi3$%6%X`t~Gihm{X0OB%}RqT?3^J9)2N{*O<)ph#wXBfeX7U^BY-
zawd&h#x+%|f9&Q>?}FUbyk&|F7&EC8T}{il9$i!{CTo9)-FG+R(fi6*!;G_Q58Tdj
z)eB$UvudE*+Q)I83-cz;FOS)7xctiOOQrW6u0-tQB|&rI0z(NtFB((sW6!QUSDPEG
zxDUx0Ulz!E=&Q%Z&EjNiC%uEY*N`4Q`kAPp@Uc@f7?us9-iZ59Kw30i!+#oO=>1ds
zX3=-k<$rgmE`HE)#gO(@w_pDm_bk8P%D*ob$5y46-g<Pz%yoWx`<o96KCc?#m0#sr
z5|fejr)6fNo;upMRjpJ{WGz}?kr;nuc-+drj~FNYv*fzhL96#Y2E|_8ep+yBR=uid
z0wbgKcZ)I&poN{z&z~`eZU(2n7A6|xc(IwmA9L~F>s8D~DdR@8A9y(a%10MC1vwJ#
zAj)AK<yLQERc^<OPre1J)$vD)!i>X~>#3Zqi3}}&{r1hhrAM~bJh@ODotb6iG<1W#
z_s*KDzBk-P^(`IR+T+WO@Z3W?ibJjxn(JK4+qTujV(g3~U*}q9|DEj_qPTk`3W`wG
z5YjHo0mecy^Uqq83a%Ge*(t1zRqIv)-X>2Xbi+c%!jbL;^l1`-6VK$zy#0;z<|BS+
z(9C|KpJw;C<yW!IxaN`@by~LpmXFv22x+XH$?5EOPXd-)%Tuq*-kz5pHQrg)=!AOJ
z-ogyWH%IP%YuRKI(UpzJn{lG)`pY0LGBfxIk1v#&oba-><=Y0S=G8Qgsk!w2L3N<Q
zc)Z=HiWe>|j)zySiBHQLRN8I$p{(50R-d0_8S!lQ57kybJn=}g1t!jB3v6f(ZdDQn
z8N)?58a_TT;6#R_<;Lmr6Ly@qD}q6$nA820&nyg#IbS<%thTFarHr;K4PBr9-Pky*
zT+JpkwOwz}D@ehL4Y*zNtWh<K;0IDv>2Ib_c6QRKpr-xu{Yh=i3D5R7oqpZu7xt%N
zRBh@|XUF=^;j4A4s?XPWuTP)5y7>x|jP{M@)*OoeaedU}zg8dCY8QI!k2kkRl*bhC
z4Ln2j9v>Ned-0vnCwaM-iicHY9=<tt@}mv@V^-d{y}eoc!0lD9)ZQdKUt#-ZNMgao
z=}W`CAJ%>@O>392+W-5q>Tln~W*W4pi-A+^JDc^oyE8j)pQv+i7C{F*9Pm+@e^4Fk
zIC877Z>MKajl6pLog_eAMm*C1_!!Y3J80E?D+OK$M1&L`2qfRwg`SlepGw~MH&$Ej
zx#xDC=@G3t-ks)6{T~+q=J=O2H&eb=wSTduNloCe8jGK?`YM&(Vk@`QBtG=9d%8mZ
zPTp1DpSQ;(?fzI$xO9y7&J)Gz-i<#8?QiB$lXV(Q3*z7abN$Ij^BSA9b*aYoU$aAp
zW;~v#vHqF`+t4=NcN)Xs$+Xf7oc+?{Q_h)AZi!IaWLTFShK3MkK4PeGE%UyU@vz~;
z1@ZLKZoT$7cM@$&n%c(4GGDn5xwz=xo<aV*eCMuhUcB_kxcPbK6l)r$SI&JrBl11-
z>+tsy=X2_c;ODP;Pl^b)GqZf(yn$1J;{e^YHy`KZNZ)(drA?NTrw+Ey-u)u)^T0R#
zwf}wp_VMR|VUzCcU;D0yM(mpR<=!Uq8*M<|y`?ba`Q)?3(IM%t&A5bmw;o<<Oxy0<
z)O*jnwirLSbBuFaT?`y(FbkQ9Hq#bB9l8Npty&E{Rv2okqXE!Lcb5B=FDNW!+TT08
zuE~Yr7h?dly%A%rJQuHFHr8BBCzv@E%KZugKq;97X$cRueCGy65953cR&bM9cbU}4
z;Y7*X)tS{v@sERdz0J7%c;DBj>%B7HmiRga#eRCyAuIoZ{*mf6#N%VY7R>Pz&c=cT
z|L)2k{kEBL-aj+Ee6OW-={yKC&Mj@9R%M0`3-zX76mP$*m;GDn9r7#~7FsX4VAb0i
zqz~`cZ&_lpO4152FX`iwCP<UaYc&w_(Q-OBQbgmPuq7_A=AUfuQ^iKr)$#M%S+CrG
z<?Z95MeD90hN8<vjaMH~tkcJklL^&H7l%)RLb9Af<&48=?xOlWTg=w$-|#*iccFFK
z7z>aPtb2*Uz~>}Vpc(v-_8#sR(Pn;`@r1QO->dIgm#towP@X+t2e_CFz5+EvaFR%!
zvH53>T7#8|3!GG<RD8J30z=5;RMoX-;>$U!|6a*e8`wCQ-^w5s2vGne;G*ZYYiDhI
zO&_KQtvGpUTBG4%h9O_$V&kvnrgh;)SI)ad;8pnZ&0uK>&<*uBd%0+^YeD}>?ZS6w
zguL(fA}qYO^;)PNV~@x9cTS$$eej?`dn9vx*stnC^D3{BjHPfBp|z6%y{e5}tPfdc
z8z-hEz!GJ7>F3#D#h0ivK{adU{h|J~nR{k*YKQXWo`<ja&kLtT!|y?>^dsK4TXv%!
zJMgk&`9H=X<>M#+m0rH^@xqRAMz_mitnKXd9xb_^oohSmdEv0_1KsAgj;`8&VY}yw
z%sKP-Kk$ng1&%zpsWaUaP9_0l=B3AOqcf}a-^f_uJnLLl-npt%i$c<>^25%~nN<tv
zl(x0b$67Nz^=>3xTT<+A@s&(<x^KTUTnQv}R%Ti)(LKOuq&V|jd-o__GPqwC^ohHi
zFuIk!0}v)qP<>bu%s&=2Kc;uU=S9lp)2dw|R7M8%07onuyy;;hD7keRA!|jS(G)Xe
zmqBxmw$7_6Ztk4keqzDU*(0YebyZ2yUcTVe+Xrjz`g!lXP`+X_I6WZ-zBzeqb%@r0
zrcIjIU7uZ1_^YOnrR$RO%wLPF@&!CgXXC_>n%h~IikH^+{!p^?a{Pz$FW&Wi`RMrp
z&tv%=em5+xAV@%-uTSTnt5ifc!2ie`o-PXv@_k-vtWy>~X72Uc4sX}gpS9)q*?QiQ
zapCJ19r!R^|8#hD*u~sz)sWR0kA7U8|9r-d2S>hI?KkMMM%*^~-T0XN%enl)i4*6`
zYkd=(?_LbOyP#X=UAxp8wD6A@w@Uwx)93~kCqMkN<kF9s*Z+BZ++qEe+2_(uHIKZ0
z=F+r?M@w3bo@+hYSJ9+i&(3=OznPkiGc&7NP*wBQa!o(uy?0fULdQQ3N8;jD{9vWg
z-dP{2eMZh7a`=Hy&2TN-9)ad6u&^*ILqv8=+<nIAA6M(@ABtOry@8f(Bpd=gI`8ws
zqYBRq{cfFNHT}`#`D&exh+c@!I9fiaE0L3$Z_|GSS8a?*gA(QdsL)qKz(?L7x(%_R
zxiN29@aLh8*FJ9aA+S7du#ep)-$}DHXL`O^yV~YbeB9H39?=InsHo0vxFd1piJ0=u
zKR;Y9fB!DscF@O9cJ^Z;7XFA`w=ybWPqtfBe)yEr7nU5j+|y;vigxxfiRas&$<8}>
zYPC(H!q@ax{9owJ+~zYH{X;@|GoA7JeYTf~_v63+{_DR+r{!jWGoequphYoM#99cg
zU!ECN(<6VQPuj^Ti=#eIuWA&Y`+jgAi<ONEd-Pma{HC<$fa|j_6(J~jNw-wgrk9^O
zc8<mOriIYNv7q3B!RT8rc>S4!KLffa$IR_}#c1yerxsbS?`34%Sd!iF)4^t&26hgJ
z$S-5_qEQR$*h0&ekvHQ7FJ$^#1YDY3LARn_hfFwC^65tT;zE<4AG7;BElk_3A7N$_
zeepucsl<hfFX#U`7nT0y@A$-+B>!WEGb{ILrhY&2p~sEV4_6b1-u>|@>ixNAb}Jj{
zts1-|zCCU;-$oS;51wW7Ik<9akFLS#R71l$&|~<y|Mv+Kwx0d9t=4FlpVjdX5Ed$G
zY@USr{%T%-FU<Ir4qbFJK$kJU2N?rWmiZ=MzI@>!2j~YZJ2;DJq<Byp?hQz8l48|l
zDYnpvSFo_(^2}|^-33i5cLlud8T<Zn9Fx6LV=k|2cwxc&lGr1c%gbE-Y~TDDgGJos
zZ#CTwTQ>D-ziO79VQx-^>0Y26^m^7i2S%Lg@}iUeSFfOm54DlLXDZj+EPN7s{&`F5
zMyhL)-gJKNddkqf$D&rV8+o--RK59BJJKV^VY2>ikG481bGKfLO82;a(fSOA@NNlF
z0~<U0g}IKHyi6@;TyphqT3M!N;Cz;!BjUm@bPgGB?yc{Uv-|D>h`*G30Qq4ZTWjqO
zus7H=(EfP_=7rOpS~P1xYbFgQLjVA`;%&>G4dZwoaq$I_ah?sIM~)&BS?-f?L}U7e
zX)e3p<R@o;eG=7hk&V4`#H;ie+{<$R@U*u{Z@2;#jF6s+e$MbMEAr1aOYG*qGCs-9
zE~4AqzKBL$&govfd0sQ9VbdGlnN8E;zqTFHZ&h?;bB&*!G;90X))cDcoJ)GsURxu+
z*n3WvYUGd4uAht7s?8lc#3T39pgQYk-{}`>tbh5Kw_IZu@~&dlPp`9j`nqTR2VH~N
zaGT+K9yw3rHZ={;ZRgR%|MF>zw`4Dt`!2_ZH~3)H=E;>)7Y<jfIWRIT;oFC~tA-9A
zQkC{aeag8<%NpH}Z||?+n*9Aj|1+QMTDy!~`qIO~V$-*{*&&y=So~(*(ptx9%1rHK
zD}}moplTpivEL7OvYIxnCh6-I?}!NFk5_Alsg3&SePKfV=1D!R>xVaQ?FFT=ErV*7
zhi}X}GCQ@(W3|nhdLB)Fi#y+-?EUL!eHwk+a`0KhqrRUnS2@ipbljQt>gzhygT+(S
z?$<BAURvheVC3z@B@YtsJxEOVUws{jg<ZYz!3q4(ztxT&Fz)+wCyh5lS1f*^97yRg
z4A<?Q@EGDRpVKb9mx1-{q5%hXZBHrlN^z30R{?-*x*qdp7iQaT>Sb%&uXIsv?%^Tz
zSKcdK5_;Jyw)DQINuRH_Z{Po=>6>8lW2yQx?=$7^)<vZIM@$={;bm)6yy=x)hZTL#
z?6rQe`sLRE!&W=oXZH<GdHrYL@Lo;Rk~F;HOcvew5ck=2<#12c{Fg)Bi;ENcPBPn8
zLey37G;;3G!?(&J{#(YT>c%manp%^;`rqu{s)@zfE&n(tKhH?gNK(0e`EvRB77=mp
zV}FFT`IZt9SMuoj)azB1e-`eF7(D9Brjkb=25Efpu#L!H>G^D5?|!A89ZFxsbv7N+
zp;fEjYR|Ur`|9>w{U*0pKCrSs(Yu#%k3Gl@5@Y~_urHARjQ#iPh}c;Fdl|69LBGHD
z$oR5qL{U|1Wtmsn#vGz0wME5^a!=p1K?nQ1UFlv}lA5SEbs=5RpxL&K4JRcpd}f=Q
zpYGRip>3IaZqC*C+=fR!Tq2pGYe4P(^?RkJOIns)!kKl!pS>^t81rI9{<9U&pIs?0
z4@<27Q0v&7-r_2ip!w{pXUU&2!^#%4{nqaMnOD#Iw$nNJW8_!w?>F1^wkz*>sJvvz
zlZ8;dL=6F>kU*vQ=9x0NdrK6p4r)uU<^6bFX=?tclkCdR&5VaS8)#)E8Z?+|X3<Q`
zcBtx*0Xl@|0M&M#GcLjcpxK7DHPwBJTKBkNku!UbUioj+$T7#t7l-HOpBp3Qwos*+
zgi4`k)9Av~rXxD6nguKW;nvv1ckO=YPN+2ayX1WOoqcn=9s1DhOm<`P8MYcau}hf=
zJ1?I+IB~uA%F^>w20rm<l@!~k;f1~}=dGnerMb>4@`I#BI&Tm<U>g;<MGcvT)Gy|^
zskPdP`t#42U0)cHzUh5z{_^Ro`l!9U^6`epw1w~HOk4dMdNydj-6LDIzWrjY^dE68
z`+N4KZ&A}P<jn9$FgLI9ozt|(MOd>~Kq%gx&2U+^tj(yIfl$W+0#v?l4mJC&fuFa&
z?j8Rn=g|g?sQvcp*@H$sxFzsYCrZAU1Z^lhx4Pl3B<t!UGtN!*`JC10LHgUZvsSiR
zYTIysP0oh_Ex76w9gaGas}ezrf@+h#ZMHLOH>I$6)XcMw3%A&P?bhD^-u#32AHVw?
zJ@^mr21~U|t3&(`2d|qyhU-%#iNF8u$Bet#ms-#N8lRTb_p?dsTASFWkK@+VR7E!Y
zb6L1{$EfUgUoM<3y}x|^{+vYviq=A2#Ms7(;BD_Kfx7UX)s_@r%3wC_%Ie=3mdqd3
zeqp-FWnL?~jnmg{$-*)nb`$M_fNAq>C-%~OQZj!?zrWqf%8~}3RDZlg`>on}arix)
zU#1c3GJp0e)#;N}-hMt5TI1P%=J?#~zgpe2ec*5<vCQig+%}oHJ7Gc-M5p;ZPJU_D
ze66<1ybpyZqjDREIm`%e6_z|%Ez7&t8ov%-t6~#-KDRsL*I`}C*AE&mTV8BrJ?+n-
zTaG*i?1Bup9daq9I~)c&EdD>jv?u%1-_`6@gca0}N|V2oU(Qhb?b!zdbaxP1`C4ld
zCk)WN2qg#{?2vW0UdD?}rNLwul`l-!D}D5K_oFR{B1MmuaruqU8se4hmVap8U7W}Y
zgBSjAoHakb&&0KB*A0zoJVImoK%LTcGt%OBPE=d6DWA$j<!#8Q&oBmTBXe7A9QqP(
zSGcFKhuVCZppo>rN5OhF{+ZS0YtP><+xb4Zhs%wp{*h5Ls;}uGG5S^ht7a4@oyOh#
zoNE2!zOUhHr4UzMRIaH&CV6H4nLQ36<LxId^3>Oy{RhH+t;v}>FCN;FnA}>@YP!=u
zrXLR${xo<r=9uA`N!>*sxb$+2Dl{nz?((BgXH>n3wQFTtW0G0)`tAFIQ2xQm?)ChL
z35iGga>}ukg7_<{saIqkDX`RKzLpRae3EQmv_I<YeR0P7*rb-dH5LF!A=GMCX&BI5
z#TybF<Ya%QSfaUnV7OPcd6;Efgyr=~q2qDhFt3fS7kJk1;RB86sW(&Xjs~du;g?&m
z?(==cn#&_f%io*5TQY59)k53&qr>CF=D%2&q~QyBxSnFnX7zdxD{eq<-M25#+<d^<
z{uw7Dx^cLs_QcGc=QXSD_04TRH8Ylf_6sx&*0UG{SgU}tuc8=vhIh9M8Lx%4s~%M^
zU4wn0@`=nJT}r%{F)7NF(aI%F2{<pNg!E{yt);aoj*kZ2FX??{)!O3jn?qYxp|b$L
zf$(~VTuuJO-nwU{ufKJH@4iskP~E2#Kna)fo%B2*%94!8-saNu6gxwO0%#&XqXkb%
zYjW@qLa3I75y_C&<daaUUQU}cc;z3bBDzg=VhB6`O>1($E7uL0O~07YRsZ<7+wvyI
zAACy>N<OkIWbyy!%eF|i>YU*mVPkH-yK98eA7PXH{~W?sQXXUzq_;H96~R4?WBPAY
ze%y^($yPhlx<rQl#)occw(zv3!&O3eK)?#8DGi7?4)WrlimJfer>3+m+ASY<QybO6
z=E2EUQTcu5-9Fs8#~$x5S(ff`&vcBP)S(XKjSOA1;q#@8zs7Y{o~cB`z52wT%@os$
z=x1)+K>Tc0F7-&A1uZ<)E@mt^IBTm1MVbnyPCN^!`Zn7eWGUKQyx%*am?`acX@-0G
zstXO=&*iva7N%xtp?p*??+ni9s;h<of|OV2uXaZkX=*;rXku^BOU^UvYN5bInmk~6
zzX8DA_}FUh+^YQU$&(`t_txW>STx}whv)Ws5UPC4&79PYd+3w0U%7PgzfEfExOAfK
zMHS`avS<-@|Ki-|S(A1uw~LGM?^-g()y_~*9|~xLy*;S#_|s&#x?4csO`&d4?G%3A
zTi0P86*03n)p~LDh%CLtLD$U9Tb>j4D|RAcLL$V)+?eNWou-(psVldve8nX92Ufa^
zuyUEj+A-ur#VEA*qm<6aFKnlH)Ur5_DpDKhLR9o{OXYlZ`tLFG$&_nk`|42WfQQg;
zhd&ylcf5SpL#OE>z4pp)YZYglRO>8T?BFc<#QG(Bs67JBll9mF`V;Kf4oldebqj+3
zT>p%}^UGuh^5^)mb-j!?eQ(%yk!)Gbp2nLp4Q2akUAf=eLXG<6=Dg+`>JHS+mz8VU
zzIb>nb*P+(J&pH;#<x0P7Qyjv`9OxDKq8<&`L)`)O^Vf4HDCMFS&RI0-@gtXD(|EJ
zDdqou%`D`ua@BftT=#J_sS;=-u&j^mVEMhe8c6}8%omG4TK^baY3|%!_o&C)g+TJ>
zOUXafy#}66m!HOxyI3^ff_pK=Qx-(a(=M&aag9Ut@B;KhH63D~xha%iScP22ABh`%
zqoUht7jo_?&wF=8{fWnaw?7@Br9&j)%%!~ZNPb_r^{AurmmJvEnxJVk1i;ZqNZ1}o
zSEU5vK+kmCFvd?SKjR<MKN$TOo{R4f{IbjrGXhkHC=X-_kJv)p{<&4%VKI@vKg}+r
z@yB1j`uQ*hRpXuoL5R0!-o~!<m>yde>1mtQ{oUY??F=nY+Sli<La3^IHF;zUjAor-
zHTCll7d68%kC_{|?d#R;C*{LBy!?UHRvojD#tP!0a*wbkMZ=v>YdR?dgT4j3!YoYf
zwL1%&tir<U#M_^)t4b@BYaP(Nzyo_AU)h~1ol`yHo+*a)G|pSRft+D@Way~?Nq=(R
z*NafbnSg+e+%1d{>e?c3e#H2ei}BMzG0y$1vq-a<igFL{J`Z*U7|7bgKP$^Ezsv!3
z;NjP;i?>|a+f{lQd(XLVq;`DVmu>TZs(=+keMVv()=@*tb|HC;9Nx(%83bJGTW3I4
zB=U;6&(~{pm2W}7{@;P|ty?B8joqb|qkc?2yT#Q0I78spR+ajax4{X;=AvCm=d5Wj
zSeV@R3nf5XY$wOVn=%={gz5)5>>&Nk%FqlCG-k3y+qVgjx-L8jP>ehEPhW{n`upgq
zBNY-ltLX5N-K{>m#e7C%W#}5_$lWxiZkU>BTGzK@Pwc+rBq>U4;)~6y#)Z=l$u3n@
zfzTn!Y0>6DQp8rbW>WLO%PrW`ikhd#cLs0=Rb=ndmpmK0KNv{4?^_k$Z>eIfJX~p}
zdsTbcr-U!NW%R2PhZAm&3@0{C(|Ata+_|HcZ6E71+VjF^xCB3V(2f(^13oq^14Flh
zXrEs{$v3o^QEhf^s@bV2kfFE!w`F$sBXNB&Q0^&RH7+_@iuI2<7OQ_a*5AFctJ5>L
z!NuuZWC2$Zx`=H79E$Kc*6ZauwonGs#3k2RNmkvYE|}^cH|!Fe&YX&4;IQe@FOPCO
zVGhE2-VpTFnD~7r`)<Ti#izr@do%m-H1;5vn|IxF1-rRByZOIJWZv>G27?h^QeJAF
zIj5(;bt;7dx)EBzDi5`so|bvJX~(F%{29Z8HAs;=R^wvE$$;ayhz&Jz9n?<wU;+Zp
zYRAld-XlZ3xLK~b`7@fZu#ls5{;U^#-uPTO?FP|H0a_INncfGGzG>yIpWikE_vxpe
z=*+dyoZWkzXbQtp(P93Zm^S(nc(fX}c$zV+y@o=hxWp2zbhjvUuusdcf~`^9ZCPZ#
z`vQ`=@c17+{~Ji>LWwVvtpS0|Oa@GTaVWG!VAs?xJl7^Ba?oh>q#Ew6i@~;zzv(t}
zO6tkspN!LusNuzRM0no?ZY$Y!onLqiyVd%dsbw!$NUnqX8CC|NsiP56&%<Q~Ow%?@
z!EQVK*BoVNJZntyGnos3HL)`IBLHr+{CRmh%4n~vQ;$8<rcRZ46+(%_^;LJeZ&UA;
z<p1-X&+}z4vAE(;E@|v5PL~#O7)vkwBsXKfrHL10<I0ET9A8+KvYB=_m$xF&)CI8;
z{%-W-y>Amp(V<D=vCT8-UKI1p?Maze#SM)1z$MizTgH%?i6t3_@&;3;bxhU_07%tn
zKSZ-6ERe(=({27&`G|iN9pW93@xfzO%xz+Pb6&r}1<FUVdmJFctBQbP$sc!TysBWF
zXkhU`-DVU!za3c__l%i}sYpKQrn}!CnCmh8=xG2SSnu>qf%c#@g_nFw*@2FMaz-4c
zrB`*cb{$9<6g2FJcBO|uH<-rm<-TMxsN5-U6S!&2D=uYt3~3LL=|1c*w#A)0;?u~V
zO-*SZoSWKndT--9U5}5`D_zxj-;cplR>VDX2<fqVoTT)W#uY!f0Dodtk}Nh|Xs0{X
ze%u0M-~M4WFDGg!5%l6`Aybf{_jkia_)XLyK03<045w>Ps;1&6N^yE2XWsIV;>tDl
z2ww5LBXvd&1<qT(uG-q>aGx9V`iWx_2Et+rBTiH>kc96x{6lnojKFph#eaBOnT0IA
zmKS|<o1_#7WlOz=11PXI)z{UyVJupLWFNNf=C%vOpD#=+uX*J|<xSGvO3nVX<|6JL
zeI!u2442S=*9ly1YOr_HnsJXZd4l4vCI~a?M!^?#)MXkZ-95qtGj&o2T_Y{Xowv3R
zdFd@r@$iGSJ)A9qo92c?hYs~k$)K*i0r^F|u@wq{NK2@@KHpM0sk}Wq(uEELLYd_s
zUyK$joCQXKhp%qd=&%(O--@=vgi}}Z)84Q?Mxipj8Mh?}t<3sRY#>M>WC+5&OtM0F
zqN(Ccfh~l!z1xiURXqQ=XGLq{`$gpoq5+32X0Feyc7Jf3ygn2JqBs+&Lf_!uFaB7D
zT`G;_D@-{0`C(-OW5vfF%`I%jZ$TR+4^fsI-Nk-MlzynR#pJx^$FqI1w7l?+v0Wnb
z0mT<BTPAQU5q!hTX3d;9UHd`@=t)9}yh@|e@NLijx;viSTsv6y2KIx;5GZpBp%);g
zND|StRQF{Q>*>MHBbD{(9E<}U#u?aq7%sS@S%ATl?Q>_?!&Jn*<RrF?Iw#$w88gI7
z;9K_5T7Pti%ZCP<>qpM$F!ax^5`(a^WAdBl>iNWBTq|yi4;ey`BFrRGzs7$xA<5%T
zEy8T|H@2EWMl65B^=t>bwh=Rti-!51pURiqH=*INF7B@Aq%nw?OtY5<#O9u$-kYYj
zXku{Gh|(GE)!V0H+kCxl-Ol7@Zi@)Y99gX{E@=Qg!@Yw@ukoL<$K8#rLF9(1$Eqf9
zi;-v0s`HuCjWT5AOR>&D0!zE`<G*`tj51CpO?=XBlA7_3v5o{$<sbXhsQbMW-H!$&
z?_?W5<r34j_Cwqg%T3PXG(oaXn<B)}UWM(8?C+nEzN7!g1yGnmG=KXrtmrCB4~9+$
z%-1X|5hFlMYyLj=^_|Bs@r0qcuJ<-%?TTdo#xhWX9sn}K2(Pl);t011btY^gc;BdD
znz1tM6Icv%ibfzp5#KwRE8c&B`oHc&r0sy>GGJN|>LxSN2N~zq|4z%>URd$mz(xWu
zfm7dVZN=iD_VZDc%Ti=$u+c&8(P*d&Uk&hw86QD7MRY>+#&>(dZ6Q@0-Rv-?$AVpn
zSe@qRs`XGl5$&)YF++?kjGkzqElNu9ivpta%YDCTUU-_AWq`H7(-m<SV-x(@XgpUL
zqd0ecO{Mqh#vbDr8RtKG#PMduf8C<M4AOLh`?A$lnIQ!PG)Tflj&MmaKy<Lbe_){K
z*|{@>4v+s2cu?X3qn9Y{#B+yEC)!PPq<YVvmw9bF!!cyHVI4yh*%ca^x3HBtL0z<l
zK()}QahNH<Xh9q+MBgbERbl|F&_mP4a;Az#J~UM78aCBBieq6harbNx(@z;IV>ANw
z5keM)`?&++Zh8w#un451QgmLuhRqoMR<eo`@-9uMpRkRFu-vFPImu8^x9FFu()Sai
z1{z5GNF3zU+F>q(Rcz}yxhfr!>~OlnHtS{Xny)kt1<XZME}OByBYR>|VHNr}xuHRI
z>M1~bSH0ZSMl>-%OmHVeg|yS8N@we~ZHS>Mt=G*8-(=@+_y;S0^z#lbBMb8Hmdm*}
zVJ0X7{%_?aK}eoOpqJ7XXJVPQR{G?Mh^!(OW$9z}A}fz%op=wbDt?Wru1MfoVSn8w
zcVSfi@`@r0h3V9(e?{Nyx-0j+1wBwYc&ZB0e|F6<w;L5>#kNo^#l$}limixiMpfr!
zy5pSq?fNwtKQD;$XE+!l5Eh&&;k>19ei^;$dgOUWu)h>hDcWOoNbDHa36V*Z0%#y4
zm@~oz>nxdI^Ks2h3DpG$VO@y$9$&|mIOyHT_CCziEo8^~_+!>HUH5AlR|RLJQ~wOu
zJiHX=V3DgUEyzF}r^mQm&j0lzSZ6bKEA4wtznFM<<@3fdKmT!hw7ZLCk*Fjn1L}+f
z)~<WLHa+V8#d2?IX(i&sgt3eBN407A)BI$>t9Er|SisvBA7X4CB@t;P|2f1ygX<yJ
za6^S)2~f?_OuiO$QQb)4iIW(oD@Ax_m>2~_wb?inIEOi%T<Cl6K6+IXdHwoJz&VlN
zfktBQumfc_GcUsM!IzEAMbFOp_{4GNOFk<u5UrM#SFD<ZA2`H<@0V6!o+HQ0>HdM7
zcOyG!G@_xY@c29D8M~eYjZ=rSzTf<xMQy}G4S(syqD&Q!T*9A_4Y>sr8@|=LUY{*S
zJ?uX`E1#G9@U{Ru;_2X$ceNpPG(Aw#1Mx@~^F-8X%U5$6NhyiZ<l6%V4ceMJaL}Nl
z3*qq_**FaM6#q`~bvtZkK`G1%Xc{|Z2v>n?bZ6%1WkU{Iv)AcX$oB2NM`Snx){&F`
z@Qk<Gz#XcGDZbpX`o|UOMuB}_9tt&Yd6~>pJ~svxz{^<5$vpAHJ=<-X{<ka_6#&Z!
zw40hggG89wVm{@ra+&+9-^i3h%0Lo!h*&BSDPYU7od?)43QwE``OxAONJf&`?@oga
zWO5P(QN;QL&zL^lfnP=O=9gsm_KS}H^h@)?+i+8G*K&S^MbB-cs*p93A@h08N0TTb
z;^XJd-SY99)FsSngbaE`y19w(JeV<5PmFj-FK$(a&%C07+9ef35}-6LhAyh-Of6e2
zMZHgKTwYZ=Yt_wpuu1{b!Ki6>^bIvx8#&+eyeY3`XDCC#px+*@ofn+`&O_zxlih`G
zQH^dcW8K_6u?3oc-ScZ^w?0`b1M2D#6US9!{Mr_0bUp6A@LC>Qsj0B$ZZRE0OI!O@
z)OPMPoP$0_CDb&-5Z#{w%rn=v+w@V7b92QGTQSV*TSub>c*8UXGDH9x1jlJx`C9H5
znKpK`_DeT$)Q3mvczb&iCqahXZM&r(d1)f6&JBaFea&sSX=3Ce5Smw92U;X(z{s}g
zQj2x3_@nj@&~EVHxBpUWNo%>~5>~}v%EvaL6d}j_8TnalC!J0azlo}C(Ez#INf1&i
z8ka4*cJ=DV3-fE5;AyEzD~(LsK=s_9gA?a2!;J!d9aWdaJ3KhZH%(i4`M+Ci*&oJ8
zsMX_TqYWNvjjOMj;;O_sI#3s+HYU3MS7LvsY+)-B`;p{Q1fK`vOn8qgo=9#7KVMb<
z;2>8Sh==_@rCXr5Rl}x$l`7wj4Ghnw|B^>y;+*VsP3BF8-}uyoVannrFGlaH%C6@|
zV7;#}MZqP6P(sz#uRZ*igJ8+@-8`4g+}CwJts<Vj@#UZ2(aJWE*IoHxuGlCgGqHn{
zKyJJv>iL?c2*c?swa%r1`15>k$|$T!cJQdq__)r&=n3@&W<;s?#=SJje^v3V#v9iN
zu|m!;&eZGM*ZRg>G?VaTxRgR>R-EHz-oM`<=+x=cW9QCOGxiNSpH1>!e}m2=l|%xB
z42iA2uf67aW8VnP?#6;ymH@fReRVxLS6>Q0sgnhg>^}GI0wW@bqQeU7)~#CEvy-I*
zMZZw4Y;5c7klzZtP5#)?=bOFUo+3LousO0FE91ZR?=M0v4-K}7^uWY~6-NmR5@9h@
z;4jF8NC=_`qFa=&zq6~e&*wS1>Z+Km%ivl#9cdz@71IdZ<-|si99%}FKy3se5JMjF
z)ZWxDG8$A03cxTKvLX}BMM+8jW)u7?9Iar<)bnU?T6VnT(a7bk*9WsJ#8&`!K?oU^
zOq%3+Dy)v8CdD|Y<LZC4Fg5&P+Xn9<%%F?b|N6Rq_@zI-&B2kX1v(_vhgz+h2q2qE
zkf^6XR~r59^g(w#Rf)F}m_*`AmJ&2)+09n7FIC-}r!dh>wz9umheQ5v&t@+guysW{
z=}2j50Wj)z7ZM_&o@)yQZs(yxLleBa9GZRUHhBsRPMOTNjzpF8S9HR}A1QyZiXRP+
zOejL~^+=mK5co*%ZuN;O*ofv?^o4S(7+4<)kRk$$*IP1ag1O>+TJFaaH*OS^$1_zC
zO_2z*D~z}l$lx3>{1pQo5@312M7b+Hr{Ft;t1NysX*TPB{4sZJ*>%MRYESXCPoWCf
zNiicHeR7+Nlu=x@I_|WbYf0{ns%Q&;wLwjsNP2#Ls}Oe(;;$H7UfFRm;*YJQx!7Ne
zo1Qk4JGK2r(xYI79ZB(g`uxdAVaZwe98@Gi+~f50+VFeMF;mNjJ^i=v6u!M|zCM65
zz#d9pQNbyRF47{vr^%7=4LYNIMqp{MJ0xV)!hQh?ZU_LpbuP+o%rHX$tuB9u?i<&Y
zJx!=14I^fSb<|ZpolW%~gpRL&i(M>s5@0GaADMYV{#FJ}D<j@E0k;CHcck+8_U7O+
zaSy~Z>3bh|@j?NS3pVPR*TEs{k0;qBLgbQxQtQ6I4wm$QAtZdF02-D>xLYU1dcJoj
zqenJ82J{da&7uLxpyzd0iO;^>-1oC)%@Pkkx$Nr(&?UdBph6OT82>5y`<sIm;<7k&
z=&T#AsO8;a-x+}WOG{EL$lx8wP(YrOfUB~EaGpirh8!;;-EbtX%p?P1WsVh*KR}Tb
z7eM?tE0HHAUC*7jO?>1gPo69nDysVk$21K@2#tlb&ql3zd@lRlF8tVWE*1w)sFz==
z+coX8Xj;#GIuj;zl_3{JJJ<j0GSBDxLV*|nUs!D70ANL%kKE48ixz;Fpa$>pHn?(s
zIHKSybF-?>CEl}o84?h%C_-RhOt40u$O4i{p-E1(zIpSo@)nq^fNU{Sd7|Wc=krl8
zBUvBuC?(8?>xpTH<Air0#ssqYxV$MbpexvY`}k6f<=t*pPRG|Ldzst#!WUWZJeYVg
z>(F{*+MzO!g0Q9>gmC=0nR-1a``fAd#G`U^JGZw34B%l%_koD^jZ)aXD77)Cj5ub`
zh8Hsk7Pifo3Y4B=elN)mD58vDQEzM108C-@lqoGl=fwC`{8P5|ndcXXrB$=R1slY{
zLB`#inQa`>{0#|-JpBTIZJbQ%DR_sM;*!F@KZfMx;;xJb4{nP)rf`IP@^|1nA}N&w
zRK5oLq{x#n!QkO!QrsXpabic-1p7&p(tpPAL#}$pb;%N$04h`I<PAu3h0gWp^{diA
zO{cLI7NhldD;pMd@LY@Y#(i2h<q`$x+pONUMTx!g@J!YLgQcm3j8<l$$jDJLXmE7#
zh$FEZ8`M4!L#XO$raz`!$dN=77>s)3XXGZF$&iLxs{iCZ7pr5Nk>90llsk9Dj``bW
z%_oZ&yH&Ef>YuBC79~aGmJ@xvs^4grmj{5e*i=<ZOeilCoA$dQZJ+)l?z1$Q_QRbV
zHm%2!0lEpAMmorHw{%>ihSAfp`O~=eZebmD1;cy};>sU!#GK_p4_T>>z~bF|_Dsg|
zXc99?NUu)N0-=}556O~IsFiwfzhKS3enf^YT)LRMlwvT<-|*Vy%hrbmgJZIDU)}Co
z*Gfx1tohe{=Cx%;<t({wyLnGMP%H(Slvyw|l-`moSwH9Kn;mD^ymVLzqCf_Y_e{~s
z7%PLg<5l+nw%?Xb>p3mB)tUIOm*_;Yu?4#lQ(RyaazL3D4@@VskQHLwAmDqjiFIOv
zk#=2ovmRZZajYXrkfIJaF!!5fx9hGnQ$WjonRO_Wx&flF&@5QI>BqkuQ@M;4cEA@h
zYAOI&*?)gH0RiVYs!?&G@NcZ0x&)YrSuU$BzG`2TrdIbUKar5&UoslhQp_T0`1s6`
zCgr%~HkJyyf|7ga>u*3b|AKyS8U~Y0D3!yyIrPy|`YmT{j*3DU3>$OnZ3P7dlbSCz
zjrnJ5mK{L2k|fY|7A}vNg$q*B+cp*|%Wm@5|Doji`t#{q_)0LPP6_L%YH^HoqGhXA
zGBsO{0vYh-=q0-@$CDG)Q3@Qc<<SnU?$`Ra9~!Xxp1KSQu{y0u`-t!J)9^-E(gZbQ
z8xTfbr?#JF*0G|>H7tZL^RBT5#_viSb8hdmTP$uW8{6hd)KvOYI1Y8Rnqip-#}Am<
z*!~SCULnnx0cXK?L@X^n6+C<qcwwfoWzEb-O`#z?oVQ;B+~h{=s@=5q@|^Z7A>jsQ
zAwBGNLFO0_X5&@%v>`1}!Y|G3gTqHys7a6D__!*@(X{8%6Qm{gyHP#and^%G-`;hT
z#E+X*ui4of{OT64$Pg<25bqxY7`!4Z=I!lKB6qV^uZ%bwOznDOUbv1!lU4_SnReS%
zmgLNEeMSbZKPBRxmD?dbp5dpoa?1fTm2|fYD8!_&tuqY<so9)lrzWpMpM>!|tE^zD
zJb|SWaB@<25$I5n0RN&sj(gVe{{o1RJ(^7?CrH6&L!p34!)+hdL9?<kwGs(y5G83>
zev{rVz|gT1!}iuS1q0NSzd7$e8)k*+g&f8X^hoV;Jf1J8sCy{&q=cBl-hE%G=zIMD
zoDcpqQ8zZ}*u4J4t>Z}S<(`(6AOVXHI%kOn7VtnJRtwXe#)iP;P*!gp{tfG>$a>vm
z&@!Hp{DCHQOop0F51-eJCn8?05OEKU+8Mi#Aq)rIVjHtL<+2UX>SAWnwWPk1LUHBp
z{<@b_eyj^QK6Oo6*dDic<eLWz@uB<w?^-()?pTsVCR>4@yp8~Eyc|6bos!OzJE`pp
zH4EtBFHg#^!zisD4)RO_0`44LcI+;X+n7_6WygFFfyl`@H?8UJ(boT7U03N7>=OOy
zFawl@zP`T7O!_@D9#A`TTtKF%ix<RY)=T~RS+8K&9GVh_W8**Y;>uhK^ijDrH^Oq}
z(A5){BW`qB{Y$P7dz8|401;iLyE6zflZ-2xjQNUP(nhG>KYSrY$!s4+AO1=ql6>5I
z^k)O?k-|FeAE#zalT@KR>kGak86-eRduP@)r|&feaZ{uN4%0%p`kv@^kWU6o3soP`
zqXh+vx|k8Y^oqmFDJQ(0|JkK{2dZcp7s<xkK~v8-qrDz9DjBUN6Q6PU+enFx^^BQG
z@7;#Zr`<E;Tj}5B-yqTvbEE?IlxCde4MB!~gy+_OyXaT)e~Z;Z?UM9!{j9dmWW&jP
z0yC5CqtjSCz;0Fn&_?|))m<D8K`a%*(|Phl=V`r$q(h)<4oW&f-1KN3oNbSO{b0DK
z0S;DsKPx!3Q=dtdW@L8Cb>O@19#yzn$&B%S=`qYcD!<;h@^KS)hx0}px}xvyO`B}`
zUp#Bwu3dil(JQh}J@!pV_+KA1SVCD}L(Wo3A3!ifmI|_8kle-<0cR6Lp%EF`+BEHF
zXvdx1QoDR=qvO=G<<?0W_<DDDiGbPCyEDA~Q-p9h-d<~Sl<S|hQm8;4G2{b8WQB)+
zsH$IQPG0Q`uehv&g5Ht)J!9sgEQ9^Sdv)$;+lU;SfZ{^g!6N94T(>GA{2&LrHwT+j
zjg%EmFS&OeiDhJXIy~xE6(4_BHj1+pNvwSAT%ZmPtU_^Vdb;6?4b+jAvu1_PtWoor
z`f~G%kRHJ+vipG12GNG<$e;u8p`vA1YLjiuw=U|rZ-TmXZYje0^~xB*x#jZQwQpZT
z#uv6W;yJWj4z4nMs^)izDYkv#ZbvF>K>Wht8tx7a?O~lT#^keW|6Zia{B+ix>>Tgv
z1Va$!gt8g@Xf0}(m?iJu&|1v`*d%4y5Y(yN*b`SW-*X7QeM;{qU5F8%?sRz)@4a??
z^SUz>FT9!K>96O6wndCL-Tn<*wSvF<3aSPdB1{bJ$<m-4sT2LEqSql6`_38gOG-Op
z;qtHFzR;Fai25WaO*R!>jXw#`NRfIkBOWL}nR@2d(80z=Mn7(7)J461%6D2g=-{og
zrbJop{LUS-egowdyCqN=22V2!YGZoIq=7n+BFLO8ZZI|3#l>Z&XXHm$W0>$}BXohc
zKJqx4w2jK&L4x*DrUOuV^5Cm)XwZ9s(sS%{@ITLEfusd7L?S6hs#I=epNC1<NN$$W
zPPbIzO>jE7{h|S8oMmOIzh%uh{g<DgL4sgM(hmshxTkwT!dhBt3Ez!UE^RV=QRuY)
z^FmN!+hiMyCT*efh<61Z!ZLA_j_)W*^1WwU+AVaNpP#Yj=DhwHR?H{y@$s2iGY%pG
zpFKV0o|Uee1OodMRj^PpiI&`&Y8QH5_2s^&35gfXt*?9H@unKG;*C0*K0zTc6!OUb
z=Mh3jJ{tRwwh#Gz$=}<{*M?@l^S|#DckxQS4Uk)K4(&TPTBN^xY;sN9iyUch(WW(t
ze-16AuIr-Mc1wkkNgx%pW%N3C4x3>c4fYGHMX#9XNjix706g`F=Yw%HKI5MCJ+hOw
z4gLCrO9`Wh$IslDf5R8*#BpE1zm|X{*$$dsNIQc6WuNi<_8@fi(0!qVB%bnFMtKot
zmIHECy{lr+v+IxW80wE^C@^@Lh^x?9;0AnieoBXu4A4^WJ-+LNQ=hh~oFW~5)o=ZG
z=?QINkG=}q(lMEGcVpjtcnn}Z>tMao%og<#4YME`MhUOi6=u?Dl%Y}m*A3L&LFxU!
z2)Gz05}_2R?gh^Up`(Ly8$;|0By+-tCUi)v2~$0><$<uJkY}X(hDS=$?g()F<i=`c
zn-f4<VG7rPY1(40DwP-!XLP4)X|H@?%U~f#w_<telm2XH6sPJPhP7lXTC7<!4Ac%5
zL*L<3TeA%k?q$&R?t<-dTK@IF|F$C?!8o*wc*Qe}hKFPapdXF=2*=<@y*eH&*V%M$
z4kjRn+pwc+w?V6ZM9Xq{_Qb)`{{y>@e!YLrj{z026k@?q`DNZ3#!Hth5fd@a1LdbS
ztz7R|6J;>n)J<;c*8=^xx?UR6>cKw$sWKcTfnkAifsmhrvfB2Y!|WX_ro>QLu>q7u
z9E3^jx#@_C((Zr&l)Ac=4f7@i3`J2W)?TatdydL4wIYo&&v{gJTN&F${>}l0hv`xf
zt%U)-<>JNX3ad{r8dzN7@h=pA-EVoBK974$s$4ZfrfkqK+9FpvsM^=Kcns?q_f+T(
zrZwVohA}75`poxEb$vu(1Id^$$#gL$D>Q<b&-lWbu^v6d9FBYhtBPawj)JCgc;Mcr
zzupK<%Pfo@SW7Tam~pF26!?Q~Z#P&2Qk(6_ZiKn{>uUxIupqWwvX54c;4)R(4l&f7
zKv2|e)ORJC#f$gN&6$(DV(InSN>22@emtPRnA76pGm?pi!{AyEeiFB`AE;IC`}_-d
zI%nKCH0T=jpP-yquU>V3=9ydhF0$^B#Pf-0Ii1av@sb!+D>@`ip>P(TL6Syptp1vN
z@BTkjy$4*+ef!3pi_6Sqg{+M1t&ECf@0FF&AQYk`EoCHok1I+GA%u{$BuToo2o)-&
zw6s)`Bt7qw`~SS2=YIXK|NXiz;`jT0Kc90P$8jF#S~;&2YNy=YkKke#vB7L_Y&4w%
zON;<TjN#P`K9teF%26zL<fVw{fXDoVO)@M~lFcweN~GMQoE3Mdx)j;xb@HTwbEE><
z5$6Y?5sRw8gP5r7iW7U^U%#p&d>}NW3qr)fB~Zwsfy1b98g5;(UqtRHO9|R%&ZpkR
z8czuGhhRuD+^awI0OB}9{{vX_fsxYI%StT45RmzEMcP|rvaFA-OxbelY0!H<*y%Fs
z3S<HQ$Fqm+lQl|Wi3~B^asIgSMx~)eL-_-0{yxv!f{FajkbOD0f|z4MUtHLQ52PiY
zxjjdWNU}(b7b*Uk!;RNH`#&u}zxky~_nH$V#b;COk}zv-AGKNmmL>bU<Pr?A1u~Wu
zlod54LsmOtBg#x<n>ICBzP<LP-&q|s95_kVa&rS+HLd#oJ8PZ4xkI1T_)7pc{7qiG
zgIjO;kx^W{qzRD<2mVyaxqwv0sr`IkAA+kAi$ww%-w(!$bH?8eKGq@s33qX*`J)<>
zyE$iY7Xa;s%OmnWeXOpG+JIbmX!8oE<FWDFQQNUgDQtjRY4*tgtRVhJGRExLQGCXe
ztPunVUrmoKC<$%d2mAbPVwuUiSBBydg)pL>T9gh73dQdNnuvqvT}VMOl$B5Co{+7i
zBS!{&sT_}dS?7W3hbtD${}f1#kR~b;*inJy6TP%$nVF^gYz~G@sKV)18?RSXpuiAk
zO0EUrjpa0Z(TRfYOW-GCh?}o;yclbC=Hn}9?TJTI@e>9*njleFZMciW{PRa@g!o+l
z)@w7RT44!1<~eX4CS(m|BY^;tgc+zbeXE2~%N}g%BnX_sV=>_tg@uT-WZ@!<Sb=j4
zEv>Iw{OL*#kA#G9j>%~gvFod8*fv_)trXTmI)Fd*`Za@{I$QnCi_%OU_*<DQg($?U
z4?=A!mKNt6awM`&{R?5Yy~{yI0AQycBSy55uMM@&3y7_*DMNPH#_tPhw{z#tFJHb?
zz?xlj+3VH;JpPZchxLgjstTS?^ZqldiOvMTZc1MH{b&?W6NBwtjyoGjpC#9!*d{FV
z;aTmOV1KMmJ<9le8@e0uzemFV(tZizoL_ZiyTrg>P7nl;z-Ho2Y5cN)PX@LMH_xQk
z+=o<sEc=Ddl&!9h!L%r>6#>ohMIMLG;`a~LkNSPEfa{VA^F6B}%)*EXjkMkumg3Sq
z?7zjpZLo6U|HeN0De2>}94hy&|Jg(~m_7QQ-2#it)o`nzI(FD0GoB@K3{eh>RV~#p
z1<2DhhjA(*NWwZ8@<9lq-{pRzNCWNe;P6q%?3zI2QNbDi`dK3@)?~}e-4&Q6v(~+d
zSekM3ICLm3My&nsq>6V?DZ<e7dCW4L%0u$B7y64@#hKI%PT*uJAgzXFFVqkFIcjrU
zhhNky8Tn^ym_pk-FJ5VdUsuhojX()ovi@h=l~zu_ThS*o5Yj3g61B(1<bihKAIzhM
zf7Wpts{{E9yeyayP^tj1Q2D~2b{o4NQ^jQ-MiscDPoWxN_qxKF=e--7B5{j5tW9!+
z*zSPgI$G|=6feuJXrlRI?N1L-bUaoiBU1MYqDSSh20Qg-cUDb2aiU1XEhHAPl2w=h
zm&~V^APG82E|7<g(VZ;GY#`($hlA2kK`ZacPh3cZ;71D}Sx=|t`Fu8Rjf+UlyQb^I
z>PAP%pw$rT^iIKcp3Dy#`d5$D;dk7G#wPewbqRn?5-=Ghm;@6cH>DIMGxh@IBcw<F
zPY*Ex{RsO)<TjL}MKLB236z0G5G!B6**o}0v(D<Tf~FhxeS7eT5|~A_CQP`Ke}vm%
zYjrb{<kKy8$y{)*Pq6E5gqoiV#DxHB|N4F0*S!*$b$hpa9|MLKn7PYYrj>?oD_9^h
zLaeo)g7%0y22Ar(OxnyV3#Uw<F3wv>f#_#cPWZ-jx1+A<KVSe=(GZCl5yMg>AvbLB
zzy6`26O--97Hd8mJzCPJUEevsNhumOP@v~!p{PQkqKIU3H;wthUCeUGtW}}9#t7%7
z^$#t!+X(1PWTHq1OzTo#sZ%dLgq-DGOg@K%gvhPsw#$-p%oJMCZ<bZ%Y_PD1p->g8
z5#AIPr3iDXBkHyPsA*aFA<Ivx85O$uhYP5Tj7e#7Ae4-V%oT7lKOi(7P)_mvmaH*p
zPBM$eRf+q@uk|zZeO>r-QUo7f?zrckIYt~31U=o!-oLjOQW%PW9+2n@*XMCgxe&xy
zSH(aO{zWIPpghLBD?CIKl37sZ&MsqZid(1J!$HU<CiyJGDnTHQ=zmm~?eTWScVlGD
zH*WG&TVi?(n5{S&3u<SKtp}NpXn{Hy-2XQcR}o3m)dD<Oo=sfbWVBX77au)()X=3T
zMHCMMG~o|fV~}okXAtWFg)%$o;c*l(TXkxbz=zETDO#)_zDC;Y<Btc%wo#qdLr2HR
z!zbw+8Wp0MR9}#-l<T-rL;fJPI9N6j8^2+1mBjsBjt)HtSkDOabd7YIw9U)zuWtjq
zjyHi(dsPKVw;b->yJn}CitiHQh<!=xr%>>Vj0&({Hl=}`aDUsKbD?g7=|X66fMI-P
zGyv`V&6u1eVktnfbpk`6*`HXpi2l?ksZsbmyZPWVkd2Q1S3AF2Ka$`{c|f(mss!2%
zS>TEw_zodmaz$(BIhnQZsWT-!b50JPOHE0+M2BV)u>9d%k*eYI!eli1Sr|qJX|hS~
z!FJ{K0k)YR=VsW}*L$D7xq9W<YOl9%cL%jpR*zR1;jk{S=7a5dlWpnYWmh-9x<A*y
zZ+O_&9bu<O&V1c_^Njy`uRihDh(U_Z4c6Rg5TJeMSoMRov$Z00zRxu3qUfsdpjlar
zb>Y2dNlE!lmyh>IwU5v_=h0wmb-T5!39&=A2Q{3ydBc@{n@#q$o^<fTOxFR8kAV>r
zSHIV0)+a)Z$bxdKJvBv^d)>C!oBCh5k{i1EYVyWh+65Nh{EMFfbd-78=%zjQww~0q
zXX7T<<CP{kw#Q?hRUXu|Eb@iFw}d+*jc;A@_oZlK*%km--SAw5EvN~F44Z=SW!JsF
z82R#6R#+10pF%(KFQ|aT=O6xk9%Lt58OYA3*%UI*w@5ePt!Lz_E&Fe=k126hY>}q*
zXC=x&uJ^MO-CxMw(6F$v_;Rtk(1q@44NDqHeGU%lNJ1c&rclbA*gelLSt}}z?w$Ka
zL&gg3E(c1j9L+{{-lk>XZ{=u{zdpKXeH~rxkrlVDA{d^kOU<!P$YHGS?Yu7ar+lhF
zixvRCM5<x+fXl3@fCBIvSFc@TYm=e^y9+%01^wSy<!AKq<HtxWhvN|D6>q9*kyTon
zh=k-jrTq&GySG?dmsC1z(LO%>^`_CY-Iw!BZ9Oqe$F<60c2U)!-|y$bg&y(^fxQx)
z8a5F;n_fNQW2L@<f#=vn3euwMw{Q0yGv*vzCSp)H$45hQI`VFKL5eC1-3ILaK9JiE
zp24z&e_4HoxG0g&RJ5t)b9VM5780BrYLLzIXVpUNyX~X;_7w{~{&b$7C_Qisq_7qv
zx5Jkt{ikPSXxe@0LGu4y`0qV%Kprth!MDebzqPdJ=G!n<5QxW#@m>9ZNv})-L=FT|
zt?QS}?`7%~%ZXeW+l7$5iU$K-)HrQzQ27v(sq5H|MP(-^jdxfnTO{-IV?Ye>R4rVz
za<Q$gEumoO(4kZ<Kzz7;tBxD@^TwUG=0UHMk|LoZ?ZH2!wO?`TEdE#{*2OBxqLi$P
zpWeqeCYQdz7fH6InqXedHg}>1v}A7t%tEn)uUJwEnz}Auu_6|HEouEo2v0RgENH8U
zngomQNhfwn8^`m0mYOO;6jslSP*w~YwGD`rY*w%>G;LwY03Q6;bNOBl|F|bi+rItB
zX73_{Cv!$zqeT&xk8vZ}%ngs5n%>j4z|d{OXm{X(kkSe?HH7h_9DwVwO21!Vw%H8_
z8CdP}QfboHLreYEA?0CF+Yts!P9mg3RU=CgDB@A=0oJnVnlyBn2g^pUX02M~gW?!}
zesi4&#8)8N9ql&uEd9$OL&HRjw%)N41B*fK=IfU)vm7(f*NmU*?6cI*vw<GfE02ZK
z-WJ%or~-^B;9arebM6gvzbrincHCn(XU-h&5AHx7M0E~kMCFunhA?2F-C%+-aw5ue
zS}Zx`qdrwuRiSm?$CsmOd5l&9!|%M6nLj!ZIua+mF$rK>S^|ZYJ=Q<6=~&D2IB-=t
z5=~0%=MaicaR#9%E($+?p2U`epfm{p(mB=0e)MI7_wL(QPD5H*VzJAej`bXi2VrCd
zq5yw_v*%8G4%stSON-wd92VyDx{w{&Pct*SPu2ApvbSu?%K}&R03V;68)rj!^iNk+
z^M_^LwVq9gbirSK+6k2emVLZK!wf*9I%-r^=!?Qz3B`e&M5M|u2voFMw7%sJjvM_6
zjw3X9?!UuhA37UH(3Gnz`99WV%tAfAm|u0Z?;>{9FNwpboNZh7^v?26vlcC^#&_u6
z{YAr8<0gPYP)y><3iRyem+~<$kIhYv-dfW&_bd$lnPG!8x@f-}w*gn`tir;X#39fW
zx{G6I<Fc-<^f^DL@7cl2Covu+&RBBIX*YioeDVR}`t|JT#6_n^VcAX}9Qf&S38AH>
zrQ&_xxh1Nr)wGSWnG@Z+fYt$oW`27)p8gvH7aAJQqf+6>@h!)V8z;-RMAyZ+Moy`b
z|8g7>p<O>-dxKJ~0wP5oR{pEDY|(-RyV(Pjndx1|a%X|d`Eg_fGZAk|NFff12PR@8
zjTqB_ci@fvh7FUYQw`6h<$So-t+a#%0Q)ZW$4h(0?|^x!OEdN83&tZmcVbQkUh5Ot
z21w@gzyEH_=r)bXuW>6&rn$IL67UJ=vE~4zAX5t~N$VjQY=rU*JJnL_j6v!XclO)c
zdPjbC_7f^{yF52j?Ac7{f{-AwW4EFHKi%to{sb!M%PDTwuJ3T0;$^_;+i%7sBpf5J
zGEyfiCY~oGT)BHU$$pXzZv!*hDbuE%hUH3H|M25mLKmF{yIqM@6H!mFdLmWd5QqQM
zrzI4zBEvU0*p<G;l^t&I<Z1bzBQS^+X_>8UzU-&GwKM=>B9V{zzDw7yACj%wl&V=<
z3py(+&!C#cAWC8upN)J+>dL(vqXKQ^h6VPkl$2*I)}jEp6i}hdcAwND-lVFU+G%Pd
zSNVq(WD3M_fx0p>Jc%!uz(;v$JKTQ%4TI~QUwyjE4I7)5-?v{sbvBJbR}17u3C8~J
z0YF1-I(CfVB1m{cMTW*PvF3bTr0&E)Og&9%nsjEboAFBqKB;Msp~=+nQ}+*Avr~Pu
zNnBjqfWd=%WBg7tmp0x;QrAJJZR^6+n^`;!FOD4@FsZ4fI%-qC9y#QUi;GBnH?V;S
zQjV{IpYh<qM`_pEwr;&8!Tus!2A1P(!fRIjlNULkoFvnd_kq-DtX;~?U!$a=qC#2r
zIwfV=<jKxf8L4Mj9Rh$ZVTC}<2jQSaLx0Rdum|W691_BuudY&bC;bNx9=zfpfr@aY
zF5SQXFYB*pI=P7BeiZnyS3p{Ac}W{KutwU2RYCOdAPC$Gsp^?-l$N}PP29(yuVp9F
z83q^ku|(`NZ)q!{CXyv1nCu5>-L-2xhf$y|T9;>OX<;#=LxO`P3sWP<z<zpQ9HjYl
zo24bvJ`qpu$H?Pr)5gggN3l=i5)}z5u_NSs7!fxvC_^#XWK$Mz$eb$e(u;AGbG&bu
zw4cfnVlRrr1-iNunVF;V;sKZ~O*Fjj=NI=bZz(1Wds}DMpBlbMQHzIxumP#DCTJ?c
zP1#MKs{hDen22L^J|di=qNGzv@75_os>G)gg4<P&Ze}^=OzfO;YoypwH0aFCpglW8
zc~kSK%oJN{@koG(am=+wCW@eO=KT3hhR%1YWL9b)-qCcfs94tlKkYCyqSNOpVK7O4
zvH0%x<;#~<>{68nEHP%ulBw)#qPds90JxT&7c5RA;e&9{wRq_4@}h>aLbuPO_FUq;
zB_zD`#Llm(JHG{Z<J-(xynNxpZV(Du!RbtEP;$!fj?LKd<HwUEHS(YCjZ1c6*@moH
z8&h-Ka5NRruj*a(=Xy@we}@|i<V4p18}YBERUSK7_o!^%71p9<%UraU<PH~Z+8$@d
zc1Jqqbji#@1O-fgin8>>&f3lFVR!*(5l^T-boA)9M0e1r$6#lIT~9xua)nO@jK?N9
z535n^w0l&WQC0!Nr#1g+gbExFnCb9EB%t6ua@(lkXp;$sQ@E%!cAR)4tDy%54N_-o
znS;UY2M^8yIDvedzuiCePujAy{r#2%3OBU-_%{bzTw&O!P^V_$9UQ<DlPkg_L0NJd
z0MxX_n@69`9F`rR9#X-di^>sBzoluS$Hj*aJy2!s=AW$nFKjI>2`k<7^z?{>uop*F
zwBOr1P@M>W7_4uwOjkD&ajS-HX;<PVGan;Z(dPakvCE+oC-OygDEqD~euS_{MkWFi
zQyy&5>`@mzY^T$u{xUf1mpqF-c1(@uvpaVbZHCLN<eai2P#$<Z1+KXJZj<&ruV&=@
z)YPCCk!$gGr1oiR(wu{>can2Gi`A#HIFrhr%OxsvWHZIq%Yk`mFt02$2vS4)ei~t;
zn&bB^f==~2FicWN&%Mg4D6=2^17OU^kFSr9Go~C)D#wH8a?EAon9c@xpQXQVhWRsS
zA17xGdQ9XAfWQ}S+&GMw!1V7cEI;*q%`&~i<YB@St|IhFd}Ii!y>wpm{nPF2?EIHp
zq`JjYwwJ1E?Yc1YCDogeZ=s2<U~sjPH1UP5d@J(}yt)oYa`zWieEHIYKY^c+Dh8#L
zfWrQ%nO{N<U&7dOgxeejO=h`D-MUS6^;PA6yWTPh4-ePmO@Ps=0`_^%vrWDSsSJdC
z<Vf78-(%8z)qIeDO3I}x;^iq$YFa+ZcHO$Gd(o{#UbuDZ7)oYcly(sjI@Gsy?=@rB
zs)>-k?;O_&yeUc*4{e9D>{Vp1E~FmCBKeesxc%rr0GdEl!OrSNCpukT>c^(Kyh2~^
zJZ^VBc<Cx?Fs4)%EMGp8{sj!h$jFGNap=eqzyU(nTF^}{VB&(etA?j9Ueb5BpWh%V
zfIn|LV+J+dmC{=#w#9EFc|$TsM*E>R#4l+*Kg1IT{++=6IMJc!mXPRTx-P&WJ{8vA
zZ@t}B|Ge`8(~PJ&&D8d_c8*2>tU*Tvr44=v`uu*4%JW~@xw+A_dY7$|N719|>+91T
zexjlI-TF{Yhh>zGn>g*tLXaW8JVg-ct@E+GK`+dpwHkQ59!&Gc-}F<bt;ykuehz-`
zh|aP$Zhx!pABukVV2t(5^C=9a;oW6_)?+fYBFoju!6K3IcK`i4M)XjcdP-q>`fKbZ
za;qn%A0sb@FtEhJAWqb$K?iypZ-ED)E|R{7p*N8QAwg=WJ=n6_@dW`of0<5Sch%>w
zrC1S^JtQTE#mXEC$&C5eH6VpSRb{RcWta^Sg%CEdfQ)Y+?HCJHL7p}WyoqQ!Dyrdb
zQXr6)$8RKo`iyxq55h9jB0sFh{&}3O>*o6;%~*FexQM2{{7H*p7y36R(P-y6wYT+#
z1%|4f1I0rjO?^WVs{S&nt@GZ+a(#o->asgV&~ASDS@}X8*w3)V>qi6Mwdmk{cAl@W
z^L$xy`BbJ{;m!-IChP@AcyN4_djfctMt48@8L9+6A`(r!k_&@2|Gw0%v(4ETPh+oA
zusU6hI^L)WX^{XgiX(UTv<a=Ir(Xie6ZAKEV;^m8Pg16n%4y{(U0vLc4}2-v)W}HZ
z<m9%5h{i3YO6}0O^DJZIUMv%k?v_6E)tfillDBe@c+T<)5efIy&~Sx5s!DqcMANkA
z5BK}M+67J0Rbp&$d6>-*61F07yrK5xmA$O^_Kn-I4kyyN$p9F37CBH!E?)d*LaW(|
zz3>-FJY%6Wbv5fc99cb;kEHnGjT;*;9H7pfq}YFh`k=;vWM6)UQxDI{7kTm_+<_@5
zn-jUWZ^myjH*Y-DJK2Uv0;WOk?W3g?ni(|lcbYgDd6OHCQV}d)aIxz*Z(e@%=)=3b
zQu9%+gVZS~2>!>9A0Iy~lgngclGjbet#$T1O@LWut%*1Em2oF~PA>gXw%oE7@VlI#
zWgL_y?cJ4yBR0JqP8&&wqysa>I$l@q7)0QRmr?EIp?GrY7FbzZuj90mDRe>U37O1W
zn^1^q+x)27zJqcEjtex<9bAOK!u9KVvgFU3HNK;(w2vez!@-QF&*WrydU`f$)JWFQ
zxq(?~oI~S$<>Eyr{6*8o3*HM#ZV?w68uZ^S)QgCe#p8${eNJNj@z9I@*S&cci_so8
z{rs#RC>6(5@bg?oE;mK^)GG^tF{V@TBgzpVX@ddft<2!_9UEY(eI`V0YAcffH%J7f
zsUL+-1!WiXT>b_gurPIGLDQs5Q`sKpOriVFmurIHdF>=8!u+@mU;SirSyz})<*%%F
zW(!+na<U8NUO<ouoQ~(Qv0I0}Ub*gSPSzY*_Bwt%CX!t(EXthTEB+1@9MG6Y6-<G=
zfODkyq(Y>nvZo6YHh{+L609_9QP~2-xHC{+<~K`z&xZ$^kxEl90+a=(U%7%bVmDK!
zmGIluA<bv$${aee=3(j9fiAK3!$Zio^W!q-^tidyk57m?`~X$$Jv9Do8LP~UjEZum
zY3Jj+jWX*1ZtgN`SkK1Ssjc&EJ@HWR^73jpuGa`xID#+#sh*IH!o1e?=H@1GXNR!Q
zMeOWYmHUh%47$^T86<yc$lMU_3&IY9)KJ@zFZ<VSnVq-Ylp%q27`KhbBtj5U1VxR^
zb`ZsBj-0$)+;}Hw0I0Qu#sr)1L7PDYQ2b=_`x2z<>UmXZynVm%aHNmJ$q<GSiH$a4
zo<OQQ>3p~1-|CaJUWN(ix_aSBhq~qU`}eafE$>=&8?y)%1fc*#1U<AzHE>JSfPYzo
zA}8e9tgfRsDQ^kM4xoXcH~_#C#}A}VsRrKe?p>%=#*F&r-qOy(#2y5a1;fB7QZG0<
z;)~$RI0cbO5Hi97{V50L)VXsp{yVD%U4Nxy+?qw8QE_ocIBz^c^TO)Ix=JKr?9qaQ
z|GOCO-w;VXPn-`&qq~JN89lS+wu073E_+kG(tA~A9~pMq+xrD2i9MT2nX2z24;kpU
z&!|z>7sgEH`2??7dt$occ5Hrl!hA*J{`X`5PYVFY&79u79kn)$%$20;78$w~u2)~V
zbm=s6^P!@VsI3KgAx<6i4^PW@Hq}u(@WF$Mw9H>;#p8gIEWt!oOY6Lh;L02Y+yvEm
z=8u#h)_<~;s=>KdUbgi|lT%V&k&1i*{e0Hlywo4J$3-mUoOK1-=olcGq17BfJDQ06
z5{{eZeEn^h+k}0#hXZkrd|9-HUzFTPAxs0|25(9BN!?-5waFQa0}!Q#gXVQ}@kupX
zmlj@i+eeMA@GRXr?EZ5`YnU`JhHnu0O}OFwEn9|55lu~J`}OGho=^48czGo<Dl7Mi
zJkfjj@Gpb>i~b-N#%gZA=dv^FM;~9TsDzwa|DQ)o2DS(D8_v-a!v=&C+6c2xQs~HE
zC)ZZmZafKO5(L8snVDjh+zBQeFC}kmVrTYf=RB`*Gr9WMhXSJJzbH-hPq%GDTU|>v
zRN5RU0)h@0P$W83W=z5zL6ibsPjS_%RV!|2KteIWj(U};opx>8_OfdBClNS!C76hu
zF^~L$Gp@q$?O)f+=~rV?%5=;1rGDH%L+_M#;eme9#<ZD_rq?b5bE_V!uQ}gDs;sn(
zh6@kCpaIu$EVR!q@)M}WVARmCjCth5!<V*@ff#+$#+b+ZLE+K?hsBYG0Gn!XHE8Jr
z{zULRqxKkQ45VfR=N37RAf2G`)X&Rt1OT}<iB~2f?2`E^<}kKSe=xUcc`{MZqs%dX
zEZL8pETAm)u|>k@MAVg7lz;h>H725_B|UTKG1c}Bqeq;$M~vW7JbpZZQOP;j7SL^&
zqED2kg%g7KBQP=^J%49~OoJ>VT|?cOat!8pXq*C3MlkNW@`uMEFBBVbhzBCUW46#1
zMMwXAY+BC4(qI~2)BoP3U2$l-J$DDMie~A0aOpH%B~JHf_ggW!v9-5@j%Jv5e|dZq
zr}Wsg*G19gnxP-2&Rf<#Tdl@K$#x$hv~9a~%O5#v(y{>zqDX1Xu&dM^yh;jqMtwJp
zmKK?L3|eH$ZlEIbw+m<ed7b|1Q*~va77H#d;vwU4LeIR|^~;L?{_2ui)bw4~AFtX(
z8{b`^r<eaQgZq_ZQ@#6sm|4Q?cPqcYOH%_9)HckwL6#rB^q`^HyHYDtUX5uk<@M9`
zBfkZ|D1SXdr>}nLJatZrm<TCgoH7~w4}liU_Pi(Y{>|8xMc;k+lB{fuT%}yl^N{}x
zI+iycA*m8TS?fa|Pt}ld*7~qhs&;%MkA7tb_U_Fh&GJQw)T?mFKnU*163+TzbepVV
zKPGP)5>$33qwb19s2<5c*m5@ilC@%3devi0iJ6Rl$@~dHmo8rXkX8-X2-DRVdi=O$
zha=JUB(%${4t1GjozVJ?C9s0B;(nFYzV7bsmD!&^zt0KH`4)H^LJ2sYHa5{a!{4YV
zKTxIU*LRm{gBpFm<n2+9@A^;tbjz^ud9>iSmO`4De+E7Dym&QdyG4V7z)89>rn6XQ
zpS+RJjYdSX%ywK#-M3r%y&c=zp5E)0eLE{Rck0HC16fzW5=wOjEBQLpK|J{VL_)wW
z@Q@34@1CN871P*voBEC$r~khu?Vy7pw+HcI86_KlVkPi-#nX24#TKM&jb&GllA!|!
zejemsw&G*Hi}~Uq_JiQLsKj>|ely8`|1f1c4M-yX&$9{CBEdHgvO?9d(lAK5D`#Fz
zhX}#hu!-ZzxYv?DJE@#iUUl3#ooCl+_o1*%m(?cKOYeg2Q+krE;Ac%`SN;3@hfp$9
zppxM9PG*0qEK#nn@veW4k|-82fjEvZ=wl4ngoY<$p+k;?C+HbmFku3+72;UF4c~+_
zld1Z!=+|q@yI8lBd@Fh+I#H~{N8k+*KJe>9wL76|4?|Nn$IR3{o@%Qf@vheV{!whr
z2+!<86T1w*WQrTM2YP<=gSO@sOx6xuk)fJZRI)Sce9r=x)z?*jHoD!sQImxh$ydUU
zh1Z(q#cuw*Q!s{K-9np0=oyAaG|9+JHOqLEypB}>0kOdj(U#S-Evhw3Qwz7j8RXAY
z$e7<0F2Xe-&#>$SB;%WFx($Dn4K5u=r(mE;p#h!(Y}bj=SC%ZZv)F0onZBnGm;LUk
zoavj1B1PkJwEFkI_QG^Vi$5K>_1b`U%eed^-vPw6*ip1nxa#lki~AWZ@3wsyEF6-I
zJ@r5Q>$7+L%HIWY0Knbq>z${Y7cO7^9Pz?jE*I^I>F-9c_lTw%|D=q)5ngOlySw|%
zjzNhdOie6SUv-}6E6atucke#!sB$E-#CLRk>@GE7)f#OOhv_+t@p7np1SBHeLbZ1e
zFDdAVC%|3M97U;{Yqj{>r9uZ&Y1_T~e73FrVRUIlff5D!e*fMh`E^Eq!?G4c1guBY
zpME|DO5&F+FDE(dxcBz(56i>zuLWC;D|MZ9!N(_R`Na20Ns#q6hPL{HJH*Y^Q`dTy
zYd3_26&dE!eUc<XfRlgt@BzvUWPJm|+D)UOlN=U*(;V0+tbW`5vJ#aP8~<g+x3=D`
ztqjvisWY{xHZbcnefZBMe*e@i)PA2j@QvA!RPDEoJn9>V@4K4n_pi0S(o|$+SwJ`R
zmcQZ?5^^y_5jP&a&AvWAf`bm{m@QGYjo?10d@OTWz1gC==Eo1Yb~9Y<TY)~MkD~6K
zId5KG&Iui1iKN5=>tMw}6eB?%+Jzr0c711&^`~(;H8z1;N!-*|D;3Y4on@RwJ|s;F
z6_!*~_4BK*PVTAcDm~TIwq7{U*SuZeiUbygzd*w?g&T`(v!QnHo=>}dy7nF{+kUv-
zllU-iAGOy>^WL`^U&VvX&d)!<QW`oHnL)!2<*sGUG@Nf9(}$pNM3sC5w&NZeB5bG<
zB`+WdSO(foa_d2ZX!M|DX>w|+RpF~XrYFZPfI8No=L58(&Vnf8C3^<!^F@hVJ>jl{
z>)X2R)e84&s|HyZh1^tC3ROEAR<Yfpx<UBTzYXof{QUf;SGm92RGKupIySCKWsG|Q
zATBTfUxNz@qcII_D`gsfR)&7+7T<oj4AHNB^{RbvVa)C7N4Bk&__sf36VXzC$>zfE
zh7pD*ZNp=>*Vbp=x}P7u^=Ic}`zd-=tir~QFA6sZtv63E7(Z1Pr8Od@eZT?v`Ijla
zL1j5cOOg`^rk?=UdPDIY|Cx9sa_W;gmZ^}Ked+uI%~BTvUvdv>;K|;>m2;$RmYO`t
z)rn{140MX};>xl8zuV7X(<OR(m0%TC=)pw5_o&jdEDOsnAdR3?f5<UZf*2%tS{?l0
z*Ux&zt);?VS{-XvZv7heE5XF-!PhfADvM-?HKo+*t4HFFwMq*O4$fjXv#)PI2w};0
zAiI)TK{5r#0~muws|@h2J4$Z=bsP(j_Fq45$0rZ%D}&j`+<0@cEvbRxavk^%FyVg2
zY7ds}D03*N`q7}<In_<EiHUg;%y!jw?Q2-{sOaxidP8joB|Gh~Ugwfo^3y(d$MG*;
zV;?-!Gc^DDJ@9OyN?XI_$#&t*W6U!0?RItbPp;Dr`%v|vv}j20@TEl=rKw>-pG$6O
zm)&xFbldS^P{>GsMb$7J-=NJQDRp`u^6iYmUW9xO)|j*<=HE7e2BLaHi~xMN<wa@b
z2eaPsO13U^PhfpirK?aVN^^{nn#`|@K~K{9MnlWur^3LUJlVt4<9CES=x~;D6`^E5
zv6_PH6d_jIwL9l-7}*&u^LP<$Xy&&L*K1XbK&g_s?Cg?~gkZ~@rY&1WCnRjXaG=VC
zDh)jiwQ6F@nMoS|eAQYpY?f<H)yo};bEdCK(gS*?Vn%Td0}d*?aM`l|4!0j9H5Fe1
zG+qtrOysm+Wzsk8!kSP1n?ee2jB`ZKv4QquZe6{)pXJ6|X}(Zf!qDb2cJXYjS~>i-
z!-SZ;ruX;=aIUX*AJ3gf7)@N|7i1F?^!nJOXCH27#^2N1e<-y@M}ON)Pdmln)qU1m
zRA<KS2+!CRp8KQiv=4<_GRnQITlr~^8%G7y1UM3#mD(B3|A}lj!(6Cg5QwxNH4>VZ
z00-;TeO*L)mkE0?G+E4T2V)sEhxb9bA)X&>Gu}OYh=6&P`BgEydPQeU)m5VDc<yfT
zXeMAgZO%i}s9O%G`iw~}d-v~0tOAPHY53twFJHZyMESzwnd3Fi4K#^ZqD(Dc8tQC-
zf<{axi5`@xz3;buoNHeNfDrKamG9dNR7@t7S($t|$fW`A;4FY}5=F1xxpR^}@hE&+
z1z<$GcI~3@BCv;a+puwCHk@8IUI=So1Lt^;_msLES%ztsc&CL%M)L`h%uD!!$#VyR
zZN)VP9NMwcJ+tebyVl<47NnmqP2M+2e|B|<Mgx&IP{5l9RbM;VzG2zEj)oVA-g0LU
zs%54`^Z{fDJM5EOjKEhDRfD`_56v?Sa5%%=YT0IY8#0<K)CUn2G@16~zvd=+az2yw
zj08<NHZ6)7oW|Q{WdHNJsH)o0_4>l=O`A5&s{iR-?+mQYwHEInB$xZ-p+Q3N&>BCS
zy4vXPCQT~gYFm9B{<O#IckeW>Ma?0B5-d9(duIQ5dAe)P?w_~zx@8nrZRbb6pv@RC
z;1E@Cz}DPZg6L5PB!JF|+E5fShcDf6tl27F0;A2-|HIR0`JGh_5Vtn8X**->+TNLU
zrMA>QX!_4nHe=L*C`J@D2nk5|YCYD3-HX1~`qD1Vfd{(XIvkxP;#tTEeDs2LWZWCm
zvs4Q_16)Uh*zilBS$`Srtm#7^nmF@uw_8hDW&o)mDbLi}Vs-D*A%`z9!XIzyy`!<d
zd7CQhP0oF9+XU_1W_$X;fhJ%O(Z~y!qD8(qf!8UpV8th?M-D(&$rD=Fy7yB-JU5L_
zyP&8(cI;SK``ox;5l>&gK7;Z=WUS3b)@IopaQkO_xECE2ZRE@uGsF~2z9WUI(5E2H
zktq74$Pv2|{MTsyMv49m;ff440gfE-_08YhX}(b>#m4wRf6pDZi#f?g^*{4<Hf-9o
z7TGe7NQPdhc=+5Usg_Pa+9O_ewb~wAmK{FzTkod^F-aT0QE&Tjrs==cAW(qYffxQ7
z8k9#2qJMk%@L^-Sr}BId9u1qStFa|ZfhO#6?#4jYo;6T$X<aAhS>95k#*NjCpEjlN
z7MLDY$VK%x*iD#F!Z-T+Z%W_B+y9&N)eY|*lK{+4OTRnXjPbjCd9Uow#KTO56Nybv
zbgNz8Co^DG7CQQLha>yEe}3wROnDF8Y=sU&Tig2vVLl09{|c1O87a5B{Qs(b3iz7_
zVT{c`_`=rFFCYg(Pg3#2SQ}%!hP7T)CmS{pxH3q@eYQw*HJ{>_Oj{5oe=Rgm4NkDJ
zlDI%hg&1<tGh~UentjEmon6}XNM1EbaX60gM~)m>JTr5RWQlIm{oaZ0X2F8Tr0D1W
z{kIB5AINFFf{a#xpKaN4=g$Fao$d7p^e?q!H^P*>_+nGgD7-F3b`5VSq8KUEwQoA?
ziCo2&h5ZPxMeWFOhLoeH;Wpc2P9n`e6~kH%9&jQ?K~Ct4C$us+*yUbYN&mzAu<|w+
zHz*@|0iulbOq6N;di2=s>$M$(;jxp-hq>G-kn)8HqiBF!b$0EH*zaaL@LEdYHr^v7
zh}4FH&#nwIUx(O1nkaM?!=Wbuq(x{CUo5`>m3-j;Q+&5<8LpZWb{akTix<DGGZ19T
zArvSaPy^6YoRnd6h|sm`#`Qogs=U6f$v=jz#(hzssm1R&<J*QU@?@`|RN+Rw1*iwV
z1d6B7i)4~)IwAtlgG5iyU*^ugI}?O9N8UNOU+Vwc(1OMR(wUwve{T1^n>TFuh$;a&
z?`fDGW(n>t8%&PnaRW;sK@??2c3z%MztOq3Q(nD#0xNHiohcc4s()(rY!S%41GGV2
zCXw*YojYTHop+wg`oajfafHB_iX5k1qI`Y|89M+E5Y&4Jb0wCc=@Q`(3aize2{5pB
z%E||)`E)8sXnjeKyf{_&M|i?bA5U+3b)TfK+fS;E7;%bws8zocDajYIi#m@RW9Ik1
zpXW@~g-&BUt$}~uVx_;S%*tv!-|*wJu7u_+3F3V0-mSpZV@b9$8FUSYotox2CkqyG
z8m44ur+yL|e1$hh;pbgfYzlWJ7M1`<k;0Y~Xmlgrx@tR&23FNG+Tj9a*IGqo9t_5=
znm8l7^5~9-4j=w}?k>aD3LCr!KUFxTsoumNsH4qPUG{#5mh6hk?V^|NA~_w|GIRXI
z-9}*g2pk0Di@q5VQNO$F`yWC#DHGkhJR1TV^vKIMZ>B)1czc-zrGXx-1=*rpVeD}d
zL?nEZum%C|pfh=UGI93kk%$jGJrj_2TvneohNcd4Vj?`S2}V#ENvVsJ_E8vMzy2N4
ze&15RKBGr}^HNmDN)3(z^y+KVa;T(`5I06pAxk8MLUT%G7CXrQ3#tHTf{q#p2srka
z>S{oo7Q=@5r|q-}X>GIwYM5CAJ-yVio$Z$2TFQaD2CD-u!iSI{pQ0+h6Z>GTl4Yw}
z7SU)exTzGr6p$z#S*Ce8AmH_K>$jfF@NXK8RtO)Zi??ni)M(K_%E$Zh@YhT2#}8j;
zHaH$TCx-Pu?H7kphNB;*hXO7Zu_i`d0ZiseHbO9mY(=n1k%z+E2?&U}&13Yz@raL}
z6N-UxZ5?ky7Kk&G=e(Ff{tur}xC2LuUkBk{geE~o-zInc_<WUF<>05`fJkG<D$H=z
z3{HREPG7ENAMO|VM$AwJvmZVBEV&(FC*8jQ8%gUwO*s6g7wN~gCx$lTar~M{&hYj&
zy}L~GTQY%&y&c1d9*K$bfPntsz>w7Z2Z@tLs|uCzj;@?tzL$(<AZSbXd_U5%8|_%)
z%O5oPG}!QlVA)->PPJ44uAD!Az9^ZA+AvIr{ZG+{z`Vq>d8dYUNWv!5K4LG)sTcDY
z#AuWz(P?QbZ&)@ud`WS}V;nUJ4itl^OdJ{+BJI=<-r~e3qzQmX3(ITv;rObEyLa0#
zR0)H@ho+-LD~HY)z7N_%GCfI~sV1%ZzA$v<2XuZqA=3HUxq!=G31KAv7*$p=%0=YY
z58ZZ%?<ge^G63|(j4WJOcpd++Pgl+v8l%gsGaOQKt+jQ5zi7A{))`g#CawSS_3KmE
zEJSmH6N9=#pczxBJ;W+QuwsDeM*whmj7c1${{Qt@<lTg6H?#<d+-$dbK$gj%P{)oP
z#S>PVaE`1XJD^nJn;quk-wHhgwSv%8%iqU;n7p=ZWqKep7_b;V7cMCHB>gAnh6e}(
z#C#x+>`L@adF`beUTysL!DM9T1~T>Xx<MA*F0y1hm>dmMo4i3dD0KHz$;zBpk-jaw
zF1G=SgMf3#Du13tEC6H?z(J-+1;FFWES#&9L2^spxUMukMAZHGT$wJp7Cf4-A^MD?
zN7wQTqVi{iSk%j?lB5rV!Ba(?&Qv+NuTEHLQ}utZsrg7dYv|vdSX>`LW)L+NT2@fu
zygXT)E5E6>HgoG2_G`RjV4Qlf|KTTO5;FYbjEuU_7e#z#DjzsRv^nVWAPUx~xluxI
zS(;sCtqAG+ICo%R>1q?Y51@Kq8h@m-yQd!M@DUKPXgjQL4bwk!`IrdQWKezAu3d~X
zYBCE*((Qw8VE_L8e&6~*+w%5A&jRP|zS9VmPY5#j1FgE7aA^gI7Q)EL7eyYNr<9I6
zbykNHNjeR_vG~SV6%%@VsCo!q0LunrH;BY}WM>G8e*OC&#8AwVnHeTKXY)Bl$}2tC
zr~gh#Q$n9{K$3+}781Ioer22tNv(ZM=oRWt>4ce$)>Rr`TddBkxIG?4NV0R0)#Y}x
zH)t}w%i3n@`;?WBm#y8P?qGq$FFlY}OG<xHb^--}nWM{7M6o8~Y_dqN{{7cqI3V;K
zP0Mb&Pr^K%YGIe5B!=fNUgXP6F)9TGk%&n>LqXi8bLYd)*M7EK@XGiuQ}&CIddHA_
z?cqo<1X>M?LQNw=whwO_Y_~Kc;&9MqFiXV8@V$y(PnK$J48#|#wODbJacA)`j2z4V
zroO)#8hRd^7_owuQHS~S`ynw!+zf9iYUP-i>3PSyAv+x+Plz#NT~Zbm%9fD1zEC=_
zs-n&|!M$Y6PeWEL3V9$(eB@{{eFCCF)gwhKbFpr4sTcb<$6zMACf~cCx_agL$k)^{
zZ$qsI@h>mkyZ5S-N#|qJ7!7zHAK!cISof@~tnr)8`0FrRDClIeVsHmxA7}`Pf+XMJ
zh-c0AyMayTipq`235B6ej`B@KsK0b+-pv_r3b)B+AgP3fg{eOz6EM6~b5*C3@=-j^
zW#*gd<wX0|E5IPR8xX`yjZ*_(>QBBvCYm>HqOVT!n>R1tzi&Q)6(d5BP{!I0UTnR%
zBaT=8M){M%JAjc=(~cae#385L7Crhqz7=zX5zE83kK%=*`&4)Mwn<zz#pawM{uxb<
z+c<kQ7H0QQRn0kmzgbFAmArPEvko}92<`~_E>o$~e9F%+i41aHQ~EaYOIh!y;s5Ds
zmbu0)PwyfrYsi=}V|ZP7_S5pp1d7V`f}Vsd0hT3aa<2b(<dTN{@G+i{Zom-%7~j!c
zefI3x!t}o*ZBbzmM7-EL1Lq~@txA&)jzG-eZAQK5lk|mwYW!Oh(D;#U`|_D>EK){c
zh$7{|jT^EShn)~XL1UbpoQwvnqFPT3nMOTADH^Uar>vlWJp!lca-l|{+*3D~*lE)O
zQC{-%NLq=NXgyeI`}uWQ5UX0brBqh@8iP@Izq+d&2-bi1a8=}A5Vte`hs+U0kfX9Q
zDS-$2mOzH9%=9~M__g@^52J{(v$IRD&;Spl$#2Csl#+*G)tcK8GF>jsr1&Y3#Ez;L
zILA%&;JhLvy2z*{6$atZzsoe<z4$ZApeI5i(O_dy$~64%5{eV8X+$5>P3Tab6_MKE
z=5$>p$<0_xz7Uz>uwk~1_mab5Bq<5QY)UR67#4gG6N4^0ZIR=DUE#J|Pj5OPK9&_U
z4`>Z{B(!_T#Dz>wgV;XKEjvybvS`JMSu^k7nx^`HT7a2{tTbb`3vr5pAOd@v2R`9R
z`7ccZWYm(qywES_GI=UMl;$TQ=8_mi=0;Ts4$*0ZaU)&>0Oi=F?`-=_)s+g5D2}oL
z*WmvIbwQb@0MZ~Jk_@x=C7?p!($RO)+SVVKR*t~gFlTEE7mc?Ae{Iw9v~oYtvdc5r
zd4cSNzTnBoOadJgVub|J=1R8aTD?VQ!Z$Mg_+%fsy7YXlQjpqhew&wlUMJ26SuF<&
zK&~g!3@^LvLuo&@|8qPuH}abt|0iwM&T_T?9dMJ8!b!hIaV_i&;nI@K2_*oKf=F$K
zpNxi0N<VWs`sVI9tAxyh2_`<I5@6?7j1mX}^ZnTvp5C*TEBigk9qmcwDWesM&M9tg
zZhGC`Vo@SIi@0bZMx;5sy>b-N%y=>C<vetvA;E=snZK3g=+UEpWBZ%lJpty&@cjw`
z@a4J)>k)DFWhwBZJbf?nLN3U^`kx^y@jb$3`3W~(1`JSe<}-D$hG?3YFToqhyO7Yp
zhm;U-bUBQwK+?i7^&2_z>;5t$&_S$5*P>GsP3Ov$v;P$CcxXcK(PGOW7d`g)CTowE
z!<U#56-14}Oz;IHTI}W(F+8%L5JWbpJd4vq&Vc+G4_B6%HxdC60g;<W<kb@=Ug$Mn
zC&e7l_YD3PN-YL7zSTZ-a$XLHBFi7T@A!&gZ+nP9j@VpSR=7{FP87s7$WKIFge1P<
zDn=tx$32f6(L@hTxlI4jV{(PcQ&w7_v`JMR9Jp+-aCW$)Vma8<bd@PhD(4B`hfLuI
zs9Y(S&YV0sVZnmdj1eN{q)ud<_xBnAB1--D6ZJh9ZVOFI*Nz{lteANCr8LJYR)`yV
zzdn6(&u4Z-V=ZMzVuu@i5y`=jE;Mmg-6;b1&{B957Z<z#lRu1sD*%M1En2JrAx~`W
zM+-!mq7ugz(Ju&j)L6x_(;{tC&~N0z4)+w)gf@xInxCs5ul1otOjmCp0aSFLxrabn
zqoen+faVxj+D=+dwGFQ>`=_he3<oTu!lH0Q`8pMgGjtOSrhAiZEt&C@Mw6G$cz}>7
zpKz{NhY1f@k2A$E^O>@Qj8Zjx^O{$M+mHrTh;tnILx4!0n|)nXstmz&;h=AJ7}8|=
zy1Z3&wr#VE65AO6zn6$_4M;McMpP<iKn)NC$x9j<%layGkXl81?q3NG?z67+g5usE
zhdfBePsN_XFPS<jV13f1@qhpR8vL(H=k7o8B0IvNS<>V?Dz}?AkKzMJ7iIDG^IZ;}
zOl$z+Qp_#Gp2#j)joT}(xB`XIY~(vR2CgXCq;n;Cqo5P?VV|!1di5GKsOUzORKQ#b
zth=p_-;qImQN+S`3b#)B^N7P{UnUtUp(M7VKQc|5fg~%zWKHIy9niH_xU3l1c1e5x
z9@m497Q1S@0KQOezvA?-=|uVh24&62+Q>bB%O;XBqdfPker+8zjk{M=6vtnH`DX`)
zT!?C!%kM5XFrbiVN%f5E2Gv0)wY3M;)0x2>iGn0C*1*lQGDY$SSVtu?97_(Fe9ko|
zv?(psQOp9qLC4Bez%-wv<;T7=A1j~;C4_^4|2$u-#?MGv^bIl?LZFz$_%-w*=M`#X
z2Gl(z&jkjb7$P6rFoJHmQfAF)5Wv*1)&0nGpv>Z`v}R5eic@^qg-#}CR&w1~T{nBe
zgvZu8-Me+`mRQy-+%Qx1k9M{Dpb*2V4Isc|gvYrycLlxzvWQx%mV}0cGKh~w0;u2*
zVd|q31uro44)c!i1l|?k3jiyLFh!;?P704{Xgwpz!RqMy<f6-xD8^@DLFf$XMjNfA
zGl4kB$0usX2aBzX8%E}~TZnS-x0Z$#XB3&t061Q`yG^Hu&fCdqRA(&Grc0DruVY>E
z$Bi)MLg4rt(R2KGFA%1R@Uk5z48m+=AX;ReGM{w1&uko0W_hO#$~?$4D5&jE)}=k;
zMv9<A4iPuAC5<0gf<;`BhQ_a3jcX&$v^LT^9-`JK{$Y@ay*46a#^-nh<`VfkFy2}v
zOuxp!o=-C->Ha(3pxDOYg!)#Cylgvjix&||+Z`NkbDWsUtxGYV$sNkLHO%TL45k&6
zO2{3_zAPq^1*Gmfhfp0rOFfDm9&s9c4ZHXk9Bx~5h>{c#zl+F0g6Z958l09cqc`=S
z_92Ur{w7VC@^r}(AILNA+@GOw0B0}yR@0TyK=7u4i=i)&HRRwkegQ}ttd)hzag0&Z
zZeOMe6;_ipbZ=eU?X&?hjV;wW4-iya+?zQ{J&`N~cO99ZpI=E*61D+}4?q10fy6}b
z>c4N_+}$=Pk*A4(6UacE2Ylyw8skUCMB!F^&d_5J{c|`A9?aRo&t;R#7%1d$_f}PP
zX2JrcV0m?Q_md$CWj34??CmxDm<aKyiEOp=@vAGdX-A5Z{tD!{Gbx*2!upbv?(VZ0
zT=Fm5VP@tR@Z%n>40b8Hj8{@h0p8|}&8(5_!|PDBiibSl7l>RZ7-1TL3j`%${vFA(
zAZpU4$*M29mgJn?r@%#|XJ(dH_Duamlp|y{XsV%A8YD6jY3%v=u?FWF#+v<oi73GL
zL<Z9AF@&dB@i4amL<XD{#{e!#;VO@L*rV@zFr+#|0Y-5H+eQ{?1jttO;CxAO@l%rk
zO}~}h*eMn(rp;h5@FI+PXV<SOdzj-hu(Gb+>-6)>9bY;0+`D+idZ$Jk{%Uij>GJb-
zL#ji+-7B&_@=L$^bEs`{c+t|q{UJJ<zB<)o>UPDMGbXj2G_Ww-w69zI&tF-IA%$)e
z`ah}M8J=j?*UGEXBi|$5ga2RY_{a!|O42C#L_#`tF_ivy0|Uc98lSS<<3v5Q&2XKX
zTW>8OiP^o%VP<h@x&*?AnrZ>1DP;hWaG~A%qL4hBZYT8yy67A_`Rdup=LVG<pT)!=
zc3LfUF}B;zd^u{6ERGVu_dW=Q{|g$-R*Q<>zqTTfwH$(H3=99-*2Ki*C+w{%Oq(?8
z%T}&bXF1?%jcmVd2?GYTZfZXM*$8!?YXgGEDld*7nRxoII{^W;Gt+1M{g=~(3Byj^
z^sm0UGbQtf+49@AwI6SUnfdw#1TdMCT4J#%)!@a0)P-f-dVAc{=6bam(z<D*Jz*W)
z=9rugUPgS!0cVT#f-!>!7x!!POpS?J4j$iD291Qf5_>NYM`}Z?2RPxX6v+%x>%}_Q
zam!V9e*b80{XBQ8QuBSEru-!vw`9DMAt!#|c`ozgF>D-}3<f=K-aN^ifO$E&x%+_3
zeP2s>M*b>m9dV&?^e$nqknBeQ_610j*&A5^Ntbgvi?z(p&>{%(AYe8oBv9y1gu=GU
z${GclYRZb_KBE^KIw<ZL@o=@Bod!osrg8bBn5~*(VsiE0m^&<oX+GT#p0s&$(SgDU
zOl_+fy>`^==W%g4kDVGa<-gk0G)A(Q?0^dhunO7=(B}vaak@pD+xriGfDq@ku1XKK
z4F7Dte*J~Oz$F7$vhjm%j_@V3XX44j@ev@1{UF>S=HzE3enI|uTt$1<<jD;YU<i=k
zZ}jK~ddt!%dPH3}_va7NdGDYVyGreRW;D(4bOVqG_sEc(CTIf+wNnh;!Ujo&r=g*&
z0AMO5R@?UN#d4^Iyc)7Kw>fTsa^0mRk?BI*NQ7tv55pyE$MN=^EA3ev#$*Xf&ol2o
z<DVck6m6gk$}`1rg-R7bjBSRyzAy$((bG#*BJ0=px3RHtb#u!&IbtT(hjQVuS0Fy4
zMH2j)zeHFaS)m2|#4;sI9$5f9sAA*EL8kYKzJKOtP##sBPpeqeRDEn~?aGbgSytpR
ze#d!;sXNHMc|g3CMY08<0cKRggkG1H7df^G6Pyj3*IebwG;7-QWTOOeBNS3d+LmR@
zm$TsF(?^ZPZA_N-2##)QnpYCXC*#>+@{v<gqJ6WvbYgTooSV38k+BGwE+&mTbxEoU
z^ncNK#h3*(?jfENdIXQ2(>{960QEN|w|#$oT%We3YwDJRhZ^1*lUH4EUdiFYmHg4p
zn{`e$HH}qU*3G%u4g-TD&ultpy1Thq`}q|4m4_SpPrtYT?@pX!FxA19=Du=S7lZoJ
zqU0H=4zrA^-JDWd+4wIU=z_<R@#f7y2|>yQZJRdw%jvmmFOvuN?=xfDK;QH6_93_9
zS2`i)i;Q$4-;g+#-&;E~=)oYR<{lm%3}MSHVd{^*iAnU*J|<6+9c~-?`@&d)?xP!e
zZkRvcKlO~edmQ3#g{J@fV<I40qee5<;Yr19mW^&$d!6y{NH?u@KYu=$vx91y{L6uw
zx@_69pz+QwF6}AuWT6z_1qhflK3ucW9kwy$9c-Jvi~_ay0&MoNY?uHu$jBd^hkzZ(
zx**n6$$6CT<8j!K$qU-B+IY*ccy7p6!k6#f&73e{cR`b1!?0!HgNQ>1_#5bSQJ=Le
zjFGMlfqL(19}%j`{z_06B1lHcsF5RQg!Ddkrq<KL!|waX1I)f)Zu$3a@@wG^rnNui
zjYCsLN+X7%Sco0JFP_gNYZ}qYTrt&<0a@{NqCTAr!seZ`^jW@}6U}Uii%ra`;U}3N
zr-SFO`ltCmaqMK<--m>YHx^zOFKajQp0uNs+KigP8BaiPCgzlah5~XD;M5tw24}o;
z|9)|b;hO##<XOlm=Ftnz_QPVpLjW4Hl);Z`OUH`qBKHE<#;btmGTRUFEzVy8Vp3yl
z*|H_LXLb^XaZ;CGuHRQ~?|<XQtRMey6PY)!-Ik4h358uR?UjaubgW0bKFAK-q@*?u
z4i0eRGAX0;>$@s=Og78!B#Lm9L}UqO^PLjao@P!kJVH?Kp+k=|*l6}4V$#LOfW{pS
z?HZ`7tN+5P+ns#AX2XW(tn{3=Y}s4u?mpPI8eg5Tr9~S}bqk{pj+Jw%WS0eQY%dDh
z5MvWc9zpaZeNh~RH?|G?rvh7P!-f&sI$bu_c%>PZJ#si(sI^70uGP3!_e)ni=zd&l
zf548^g{jBV^Aj!=O`e=uof=?%p(-k{)uNP^L3iG!nocXfrnCB{&Z$ioUNrVI>rtNY
z#@ab~hfmO`C4(QP*jTqT*shAX&Djdws-NS(Ds?<?FLBjy*O8i2xsCV^U0>`<Ayvkp
z1cO2f;ZI+9^<)Fi;Lss?d3oWb9S#0|;Wc}HYV3s6u-mhZj5ionU%Op>7aVWE)8d5K
z?y8b6y+^CL#b18t2&@4^nDsfUV!~=&i+)gx-xDl$PNw`M!q#cEpMF%ir=~WoIC$ab
zl+4}rVR~H@8xyx(M{i!?J!uQ|7Fa1H8K0}EqGECCzMQPA$6|+$--7Jw{BgTBb^S=N
zktFm$`8z(KU83Y8{p}A7tU6iY*4!Mc_4CMm#YTvF240w!Sa2Ci4}1rXyg4^`$dGju
z!=eKyEPM{5bwv=gQ8zEPDO1{LX4a?r^08xchxNRL8c8Crd4^VDH49I~7#l|JG6(?Y
zP#PR=WT?TQqR*duk7#%MumiK&Jzzr4O_~t3q$72!>edhsvIV4kA<!qaDSF7_k`jiJ
zi(SI*AyOlIAEwC0z2(@MGwsOZ7*mR3ft!6f{k-<Rr5DmF4XJFEIwwa*Pi;Gw8EP4C
zNlWVjY>VuL*dR+jg_NTb{g|5@%^nn}+_K89^tbmwO7mWL&j%Y3rvdIHYKjF*mp)^t
zO%rby0lHWRgH&~;R(Ay|l7%DPm@K7E4?E;GjSnhga}>_{j~v}ZUyf58>0vr;9Bvz?
zWgo(2N`_eA1yk0lbN06t1!k9aL#?OZHt=5VP`YyHxmL4P(#KAXA88z4vk4$OH+c>w
z1axYW(+FEKS3>wmM7%L<-<1q%2XL)?T2@oAjns!0q;Nr_iO6zc2qDo=p6p?>L^qvw
zTMxHl67}S>-i`i>Oz<sI_O^U=DC=7mzHKn45QpQ6b6z_i-;Y6zV&jHFw9(SgU5%Yp
zQmUn4-gVsQU)ulvT@V<4S9Ro@(A$B>y*3wUE&Gre`f+_x)%{x$b(u4^H{Ygx&MBoz
zsbc%tZkr5J5B#&}o8^$A+f#dtT(YS+<W^QG?Pg8O4jnpx-(pK~IrVl~6Ni!_Hb!Tb
zeR6T?8~F9-&0!6!d$b9pKn4MV#0nkqZ1_Cwv9oqGyKJzlUtL1$q5+BDs+Jr1OSTq%
z=(o28Z&Z75t2ipx9>JA`afQk4og;f>9B6Q+ao+#tI+@>Sc($^-TEJ5c4GqOxNqvpu
zCu%5DC7=H8n_9$iE6^Lc*Kf}R;BW6D8Z|UetTbVjlrwpP&%EQTOI)tFWGE}Y^>sza
zeZ<?lJO3VCh73x3czVjPvou_zH^dff9pQ4jt`&yx>>{4~!6vxSUnj+=dx^<|7Q;g}
zr`G;lGhBK2m@zTWo;7ABi7NQ+y?bmMS+IEVQ`t)v7&uDMT-4br$0tAZv9z%{yEhD5
zM%n*F4NV$BEhtPlrhIW!tS>hP%E=<eIx#>vpm-ZNf!LwqFK7yB{Pz9(zZNW6(kItz
zuiqIBJv>w+B6bChe~2U}Ik`O-gNFnw7RkC3Hajq|#^VQj6{rcZ7egt%0Yrf91<wbQ
zu-vkx2cUsegG^I0^EqVSw!k}gRw2fRkFZX&&`Y{?_pWYR^{6LLw!kT(;bwG97#toe
zg~L^>WD?xRi1oqqTiqzwSp+jI=_WKfy*OW;rX2G}1c$O*SZpr{`maKUP##jBbKRs!
zy~lQ~=RM6VEQV<Z!0q2XH(MW%nb(E#P~+p-N}7n1$#?pnI3`Tr;X%r@KpqV-BaxLz
zTLEaFV2bNN?%H$gSVGl~Lvt?pxxNG4Cf9)U;IdP)zqo8(JN082GZOu-=toke!#O#F
z^=^e8Wg22OMJxc1Dwwf=L)=0hnu>t-#1z`R&Rd&0vK|$B?1fJKIO)eA(sK}~M#je<
zt*@^Cm1&({vvi<~?VI|UJ-!bZ@iN#wD{<X|5wGoDKL`pEUjpib`RD6Qn%8OS{#dcS
zxqi@jr@aXm9mgw=slB*)`1S|(7aG_uN-;9)cJ<#;ug;m>iLk!9bM(Jz+L@W9+v*ZJ
z6_zcVqiu2ZY^PZ}u6kq^)$KI2Sia~{eUN(U);Z6rX%ZJk6wSHiA7P!kwXVeMuc>;b
zS4JG&WK<UR>(=6dcY}gtDC1SWcG;r3@P8iM_A7W_HD>)RZMYS&O*b>EJ@eyuvqpPX
z#+0PQD5X1MY`xQ2qt#k9w{62l?jJgG|E%H@R7L{9F_67G7rG5$arkQEtGwsgkWrw>
zAn-FL?X{eA8q2-2!z}J(?kA5#G)$&&Wbt)Fy~@a{-1DCgySOYF(@#(DXks|!layK%
zUNvl~5iN%pR)9ZKtP!mmXUv};8X5~5!f4{NlMlKNJ=*Jn?u60vOo}W*K1B9k^kcC>
zW|wHkMdvOW|I`i)3YtAs1pss3zQ3tpsZlvWY_I+^H~ZfA%aDQu@~E=9i6Q|};APp$
zF3VIy^ymG-Fj02&I`dqX^NNR)2$tEI`3FdzJ8fEHxEvpnALE8{(gFh1d3dja&yl)t
zELsITzGL2Qa`sb-au%p@4KZm!3O!?hJG8+x=r&ksG<l~_of5ZMT0a7ij3Rq&|2zQ3
z>lncRXdnFw%N(CUp3Ns`$Y$QmOr!S^3$Op{rFwMeg5b|fy;=M#xF=*H7_ulJ;Q@$=
z3v07(q;10$1*t-{)>T?ctaPN8W&O;;@v9UV-jvD_f3F5oKf@$*>lFogh6oN04ZTM@
zNZCVKdGcvIFMELRwNqf6TWc!Tq0<u)dt~IqwtleF_Vkd;Dd~)MRJCV#>JwlNy>U6)
z2v=KJyny~P4?YQ|oKh0x?dCR<+Iemp^|CkX#}N!jYD8CI5s4(Qh=Qd3!p^9zlF}5A
z|N1JUdYMmS5LG&EfL{4j&)v-ARhXKZdhPtNK>_!#%yM+j7Z(o{aKAQuy8E6<7h6l!
zd8A-N-`45fTzi|)B|scJ6=4j9>K9rC85DN%@;5VA%Q1UVHQ956mFn>EZ@M%Is_m>W
zrYgYu{e{X2y{vZa+4iDAjZ=q3ajpxyzuerxV%Nt(4vDu*LJfB1HhKRjqCzLRL6ld_
zrk^+SLpslLd(>f3>_1DV`KPw32}@3`-f*Yv?UrQgu-D#iQ!*V2b`(d2Z#GJO{pfap
zc51bLs-Eiw@3z2V$f)omz|(#e5Z!;d72b0Sbetw^sjRu4?$Bt=x!d+He?%2@P~X4w
zroVspJ8WI@1x9E-H_gg`9vVve$B!Rfw1OEk7nukoqj)KT0{(FxaqsPAaKvxkm4#If
zA4DB~Dk?tyrk={fN1s0r>wR-hUx8<2^jSko>mFL1+xj7sP?o_4MLvJtj0X%({y(V=
zdV2f+k~&dm@pp87H9BfvNQman`XaSa-;$1Q{#>|oq`G=kT3Xo3h+TK?|NU$SfX>XU
zLM{9Fg7bwn8y$D4{y6fiY$t!XsP0lydYy}dg^7uYX|iw9M2N3HrdBgP&WMVNx*HJC
z<*dHBZ9}LftR%&Su!4Mf`s`Uu5Vr@JZUr$xvnt*|h(u2-wC&hYkV!G-jiO;fQYBR>
ztdg*&(p?1wi4YPo={dOnuW3!XGYJRzs~@s;AK6P7Q^+3h*-I#V5bV*t_Ac*wdwjcn
z`}bF5Ik&2KIk{CE#GLv=O%dVBm<J8{EaL(}r0J<;<N}z64${_YTwZ?u?3xLzY!x9A
zk`2zZXdUrZge3`oUy_qE+0HKb)3i~Lif<7lp#7Q36f{14Kz*IGx8LtQFE)4UfRV)n
zC+QWUXuVmvFhE*HWTgU-<0>&DU;oxp+jcuB=e&3R7$7|*4TaRN;)WklZ|&E`zhh!z
zqCiX{O@o3ECox2HGNge3VPVtqF8j*0);@jmBzyniqXuPewxKhwF1$`om&~wt-#+1D
zr3~Tkk3k_l2@$MNC<vz;vI<xTNg@i$ZJ>r`WXhtdx^7MT_3I~7ND`tE{fY3Na)t?o
zH8W1N3hm|L_E+Ss$r`kQGDXa6Ai~Tj!=Ra~O+MaGw*qmH^$<}Xno{Y}LuJ^1duNKY
z{S%)U?+jU|8M&%;;vD%^#j%jfgtbMe1kj6eq8L)Tm097Odw1IZX#p6S5s_TDK5(U4
zA}*0k_5?c(c{B?pnwSY9P4xcP)|?lL4_8(}!9jYpzVZ0ciC3uGVvE#6%d|&aHCDdq
zah>FWD@G$Bi#aD$>^X!H=^hp@o`Q0X)lP!p^Ej{ue;iSl7SU2m<E8qP63P^s$<wt-
z-``uezE+gkFzH<ITcZ`tsmerC!t~+qncdKRqf38m+`M@%ecv$OtL1@yVXwcB(NCDW
z(bV(+=CM9LlUE!)9^-W;^XQrD{(Z8aP1rF!$C43e@e>dQKUzXs0Z1>#QzkE4rc6+g
zo&q3)+X^Tn!$7#X%@`U2OwdJ5VOzhRXPzUE$RlXVu)@E6M}iTcftyIk<(Unm7lO66
zZg&CJ*ADH@2zB)wX7YQ2H>0G-t_6T1i+O97pu2K`0R7Z2L}}yyhuIR!$m`dx0d=}S
ztEe~>k3d`_8WLI#%uo1PhIxAq9C#R^lT0A+pU`fvX&o=$lY|Pg0s5Vdxh-AX-3JeF
zs_Vk3qUkv|%!rqB03ns}>eUnZmNHF3Qt7I>RTh-4z310+{K^$8D3=+8;9Qn}{aRe<
z+O=y}MsQ_R8%qtbt7UeBfcEs=yG|`yv|#-@fGtjDXS}_K9qV@X-+yPcGvGB-ls_X*
zt{gvKmC_D83yQ=ZR|;{F0smM_2BjKay<x+lNf%{0nt#Vx=xwme%F3#Vr=5CNEQ<sU
z1=fX$&gDQwr>0i#FDWzMV)6|Gu#ef<Pzepu@15OhS_=9~$(&PGHlN2Y)@0b#<>SPV
z&&ii@e?l4-H)dPE{^g@_`*q=WrVaiBLe(AULLJm)a@%Rgl>5a1I=G)`&D}EGQMKgy
ziLvSJnjBCv`Q}@?wT%FVS-wa>8iI4J4}5QKd=<H3Yjt4*hTjyG2&5QO5;H)jC?r<p
z#=6vvuRq4WcXf3gp9*9k!VAUOVT}(vE?l`Xj)wUiBh+v_khc59Dp2qRW<lO;v<})*
zewN+SN8DcSdan?oKeVO(=c*qQC!hGdEp(!4m+OlMiV(M=Lhy9Jene^F(m}B(N-g9U
za7;09VHl6(GtHqge1-FhTczKsWk?)9U*;c=O%w10QCxGU8_65P!n+U7KB_yXcu>df
zsV}OwYg4^A=Up24sBhoC|0uUHy_RI%_;BzVC9NihdnH#c>3gQCaNsDf6F@Scj`TiL
z`ktXeIRQ2S5-uSPT?CMaZhs{WP0cn4Rd7SmyRH8bp$;?JNyN>E5Hk&bwQ8j2KL+Zo
zC)ty;H{it;S{uk)(Q$V<zLfwfbN`%w;mr8F{2((i@`yxviF*&4@9>@T4WteSYQY(S
z$I^Qu$XI7`E(-mLFd#CFx@?&+W+G_#_N{c$&4vAyL^;ZZz|%91b3tZ9{#|<NX@}|4
zu`4hZ7Z98nIMIuYjD~eHV-f&DQO+xq7Be%WX>de_Nam*FEKy&4V@}^Sw01%SvYrQx
zY9HIeRYE6$MKF%tvu+o}=xkyf+RdWA8l9>aA@u=zoI$!}Xp0!A7}_1^Pqw#)hu2+L
z=il+l!sl#<?cHfCqstcSM}~|YWqNghgqKgB_J)URqk$nI$%1PsI;n)IFMALUWGo2o
zs~l;WC{#J$F_0?%ab4jimA7=K-b2<m^QEI)L+nXrT{$gkhrav#v1Wgzo%rv#98USO
zmw`HcgTt9>_}0=dt+(k_<#FwQAOE~;t{0s%2jo=`*DWT^<xZ?%Eg=c@#tUnvH;;DL
z@#68kdixfwPje1EbA=yXj&3>nJB>qV?#|*_tiZ;l!<R)m3}vb~ewj!Og8HCvNlO1$
zEt?obJ=++#0#$62!I?}Gv!Lm@wS%2q*2j-5U;t`&BLbhq=SED0Q0*Wjkgq~RAQ0}E
z&wLB=5ElDp3u`_;E%-Kal&Ohc%=-DJCcFF%I(p9Qcj(BDTip*g42;Mb@gN{P92M{i
z`gJ6-aKa5Kqv?>uuZP(#4l&I*N8{X1Q|%^?0OUgJHgCJze2!!Z##S)PDClWn26Y9M
zLqVj7wtdi`K{Hov1I*GKW{e1o;)t9(qvwrdsp-FNhc#TSs5zD+!QFuWjE;@Hk*5Aw
zI4=rF7(f<=8Qz{Q{RF0s@{fY>Su_(+q`-=8J9T=OU)5F?V{r>)Jq?1o88nHcFxs)%
ztDhdzTy-oJ5z}<ywmq8$n1w$3c?EM%NaA85BO5S?VO}eieE7PeWfgRRkylW7mx^v(
zcKJLz)Pyk2Eh4MNg0Wy?^!Vh9lE76SU$&?ZKOXrcqn}A<!X-tnF)0=FfxG*Wqa{kV
zvWR*w0&&~Uonv8QC$pAT!~~4yFas*8RIGHGY<q#>Af>R(ed2=&iL^_+&`w{_d}%*M
zs|sNvX@|?29$4ScDQl@}S2x>!lbhp;HeX_Fk6=y2<vt7gW1$4Yy%)bP7R|fr_ZFns
z>}rIQeUm0l#LAGqO?uH$qg=2|InU%&bnkO}t}9!ujH=)FiBw#inhO~>g|q2hboAqv
zkDo2)o*#KV)2Rknl~GCmdWGQ6<J9)=!)~oqy`#adlCI6ShI9pl0&YY~5Cb^OshRl_
z+*f^h{6iPzsL&5H6T#Ca@ZzDWg&_l!1tylD!SDnn*6X)#|AXegeCHBHo|pmgnvVR;
z59(QGFm?`rx#H)hVI2tk5mi5G{vV#s1Fq-2?c;W~D615aoN!V?$SNyIcBm+6AQe&}
z$!J(vA)(ATom6I}NwyY3Xc{RhMYiy~FV6kkuh;!N=XTKl|NDKv*Z6!s*XR2B?*7Dn
zq1Ll#XX!sL&%8Ni^8B;CXmf{sE^vqJaL0XTcW`jespnZ6gbIl;B%%?hP#Qa!q=$WG
zoMNUIJHQ`r=?ns8w$=C7XJ1jmNI^sG!Nd>n+;Qk?()XXvQO32oMg_{VUPT`{rC;m1
z_U-h)lpjBNG9P>oVx3r40-Y@aG2Ek=3ZMXaCdQs}1$m)F9ho5z<ru%MS@Y&^&btEb
zU|XfRP*(7ianMH{og!A4pYpX9BrIE?+z?T|1~DXlQ}~de#$F0qfuR@qNb?>4yzU8T
ztTjS1P&v>G;*Zml%6Gl3NlWEp)4P>#F+SRjN~!0>eKl~nf^QQDsS+G0fZ-hu&X`O^
zOKknod<`*=*h>9GDxl*a_ilw`KI}1<ftNRL-i+X8KdpSTuX6Le*j}_66riuqSA2DO
zoS#oIvBO$Nh7g!`(zbD$j0-nJ0@(GXoJ28FV2cYKm-vIJCeDhS@w}p9BG-QD$IdD6
z1_z$@OYo$%fK(QV7$deapo(^mL(~gAhN~?SVA&TBJ4&G~f_6Mc2tlqMwT->>n9^jn
zVEwss{n-h*AAEv%qUBQY>XmFJM+<YWmCJEhX3-1q-@B~}bE?c|h3rzi9Yb3vRo|UE
z&55znCJ?j1*UJ#-ied%8j>o}^13?nwEAq!|Om_&RHivg<QO=<uk}d)enzFU4kQMx*
z4DWsG($HW_=-k?VTYm+YI3f-0Q`*aO#R_cOf5qI6%I}&Uy2`72dZy%sQA<_rl=UZ?
znlh|SW^DMOxn57MY6Bx*DoZr3lX~;u@9<5az)L|TWU}XTvF~ASAsIoT<rAt>tyW#$
z5?r`$5TDAPXZ!GLl#DE|(kv~j44pOhRPL@rhcbEf-?Ju&uxyCC;L?5j+9P%?yu;L+
zY|>zcL8w-ECvbeaDsifM+2O*OGna9jN0kVugdZQyHS19aedv8KEbQ;n50zU1kY!OA
z|5nz<=jC<m+V${=j6nkjGOW;GT||V#;pdU0%9+=2-G2$CUrM0~)LruYc{<A9FF5DO
z`GA0tR#)sF@NR3lwM8ws4zhi0vgZK&%^kT?yVFqAg78T)?al%tH1t}Nr?oAxe#>st
zjr14{P+n$^j+zD`IG^;n{4jjk?`s~W7l1%QW8D+{nY`Oyh)kd;F)NQY2NR&*%o+Z<
z3r9&JJ}(js5>>5bY(@0c%q5E!foQV>W9{^xlfgichy`DL^jQiUFoW5amJFt`X$2z{
zlU?1mU2>hoKZzWeT8v9&WtBg_B7~QzJ;oPh6xHONE1;rCH>Y{KELrka<;6^)CDVQk
z0)P|m5}O}yW~Cge@lqW}MbR}WN%u*p_W#aGE#<7SP6&$-wv*=xYy_rd{W@@7uI$vE
znTd!NU9e1}V4o(#K3whJEEP8Cw(MY_Q+R>b3(aSf1LX4w_@V$rGbqN?ASt{TR4Z!e
zxLJRVLJ@~773zYmLxDUKXjYj3=Wqa2irF`~))=IN^l`kU&x>R6Awq>Hl{Nm;Sge?n
zU#R2QNe)_Z1y)T4O(=*m505+y$W0G48JI#0c86*x@RgMC3@7`2W%^$xaU=|C1?%C(
zMpGp=w&X}ra4}#3uZev0^~{j4Fg2L7E<;yr`u1%#H;+)uh@upynVFg7>ry}t3B<B7
z3m$;z45rX|d4A*33%a#c+Sx^Zx*$IEktfmHRIaKM-(#C!-)2!83li(MU%OL(G^=Vj
zJk%qDD@>l8V^z@I$hWD}$wh?E7nEBduzYXHH~3<*v74LBX8Uj^(`4raT7y5Zj4tCR
z%qyDBn6XFs`Mk&({V_cdo9h2C6<J9KW696XE7Tf+WQniC!=f1k{dC>3p9_6Mtn%n%
z2j(&3mBZ~|Q3mCq03+sXngr1!mG!lXYdL=WO`{452i^-TdNHz%_29)q?@_#8dKGu&
zVf9)LA_OKbAG?WbFiW!AoVg384?(aw)btjL9Xhbhw-MJ&<~ZIRa|EZRcY^(W_Toiu
zwS8JrGyDwa#a0{4^)Jc3X76;8nq19yz67uJC7u|0whuMaD!oHF&|ZUFb>+w=XTKx8
z$SztT7tfvRK4?&fKLaZ=ltF6`v`qsO$n&J9<BP*(k$i&>H6SqvCW3Z>fG2es#!-3o
zHbR=ocOq5`apwDLEeBf0DyY?=uBh0~oIN}L`j+-&++E?}+mVJ0x>W(^%n^kqIzs6x
z#TYtGyfEm&_+HCO@3iNq3ev|3Jw>aH6O9lA!lL6ZC@zsYI<6*V4>B^g(hu!Ea6s(o
zFrJA3@Z*w9e4E3!pemP0SX2~_9%kMpAG)pCf5wo+rkjEk&Pd$z($gETH_swUgiRz)
zu)R)9bW55d$&Fro-gY7G%FN!JSbA<;+v!KAl+e@gA`#?r6vefip;X_i?>9Vp_z>wn
z8mFXNt$KFqPZSZ9f)BzM6H!R()~(|c=WJM8-h0ZX%`G<K+s6K$>-X;MqB=hhz%4)p
z+Y-d?2z&w~HfdLZT(~`AAi(VqVFu4R;(HZ1$*O1m9qY-aI2qpAl~I~p_dbJaxa8NN
zP|N7J{9*$l+*Qm;5O7b9xwM^AYpRq>W=3};`*9%Yl{$0?h=}OPb=_E$L5h<OMW%rO
zeppHAY7$}Bc*<$2z!SjqCYL8}rE0NwE<Czf8YjDFCOXh-cMn+j#|lD++j55DaR
z)H;2CXq=gx)M;9_ncm>;avHB>&8JDGE1WuK&Sqdeu=;MDl$ZG^e!wGdDFB3q%K{gd
zEZi>zEC64E1`+pAAv6E^r+j7+JRt2LeB31R4oAa@>8KyAmmq==yA0r(Cv5KFJ~7N@
zZt3RiJP~Mx*+yOJ19+)=vz#iu+B^Q;x9@%yBFO$bnbiRMDB(Tc0)AC41Vk6-L!2bi
zv$MBzAKl?S7<U=$^KAfOfP%iw>$=M`$D8mMi|kMY+q?>U2M27kdxixw7(}n1fk}h}
z{iPmvBg*s&*e%7HK`~6CUdVymrNX#hFA^P?P6!h4^qjSy^yxXq@tt6wN4v(`E+y^(
zTy~S50C@!J8k{xPfi`d;kzX+~p{Zql=oa?i81*GQoYbLd6Kth+!K700QFXWBY}%E4
z9`wq*jXoCtP*7almKBW@rD@;RX+0e|(`k|c(1)>w<2nyXR7#{>2A>w$<+8Fc6y^(8
zQwJ&&*k(}>H9J6$R3iHjwSuMQ7bMI~tHSZar8#zvvq;qR)1$*5D0k`7_hS|etB8vz
zKa(NRIYYHfJCa)nyfe~nKA^u4K#5dDjAmp^6od%42X>tdD<D`a5x64($vEnv9mG6y
zZ2aZGGxJ?s3jcoK3N#L?C87+`t;-1auezLH)FiJ0CiQJsu(lJXKAZDQjea%Nn=Y{O
zhJ3{ZI%jCXD5t=aXq<m1>(Mau1#h&Px@sV4hdDs9lLvfj_B0z27E`mLCPAyM4O>iL
zp|GV|LsoE^bBO5+mb6RV-LE1nqX{NEHc6g!+rGarV~7)pJX$o#{26{E*%4&5ZI>?M
zW#2z^8+Ho25bQiFDH+8Dk5Amzz=jCV!A0dZ@greghS!m2Eu&Pm)s>}yJOCgmwt=*#
ztfHlEXzkHg>^dl@_!wv6;?!%pa1G#ax)A1WWoMsc%>(Kr>SlOQX~&h7C1{aSWY3yK
zl(&C3nwJHPA@RS@!@U!z>FCAJgls}opm;|ZIdRB91*QJ2e(Oh6&TK3BrTeOke99Ec
zRvBL*=OL9LfXpYqQ`G2FCy1Ez@ZsWU*K?z6+W|R>(JYUFsXxec<Ov@w3?}N<o7+RN
zU~_gRqUjnuvX+Ak$wb}AwaF`j)V9~$Z0_HHCrq>g6maDGl0(lUO%VAKv()^p<QidE
z0I2|w7f@0O_9i?}y8$D=mG>ZRQ%73&dw1?s!YX|jV0z8>bY86&87@f663GQ;yC)?9
z!>iKjjQILeiB|vnp?^eWDC4+)&>HfuI2WOw^~$;_vvQ~y(Wt0H%#5XY;5Eq%fzRi{
zn}m*tO*5CyOzz#UUl{>K${`*Pc|*n**>Fv}FNq?ns%_l`-7TRT@>@WfLkQ!<FQw`2
zuElH;+{PTYU+5q7ssp9DF}wwkE6G6btcjfD(MVV(#*BJl_VcJMiaSoLBr7#FHCjb#
z&7SeR!LI`^Ap-T+TxX7*2|1YBhI*0u*z@50;cM%@tW`M4ZRIHZSLK^Du(he$ecN8(
zPsAH=G^}bMR4lUL>;OI5-YV+_21A<4Wo-@oBX6fgi><D!Vj4DVC`zt)DEBPdb^aNr
zo%??bR{37~`se3;Y!z_D78{5)CO4PI&(92$X(0+u4y)9tD4?)`<-BwSQ|oJ;u#;K6
z!fEw$2S;N4(`<a0tqE5`5Cl43$mgE865idcvcW1NtUR-_yj&tTaE@xkG?@s*6kI|9
zFovjJkfy?g3B1<1^8}a!;GP>#o_sy<NjoGXVl@NVGO#12ONhcm3(b@t#0d`@bZYvv
zY3Vq|i%x(%iE!iO!6PD{?h$;O62haOUaiGoed?jWv2|+9If>V(ASsORv$LC6nT|X>
z05k+!HQgz=1|itWLTn7hxqz7}<loN|jg#g<M9L-NkC+ZIfzexJ5o3*@Vp8mT0314D
znNcSx2-|u71E?K31%F-wEx?fSRB&$w)sVblt-$?%JN+|hg+e9L&9U;A<(xTlYyHHA
zo3e(P3k0T-zYO<$+g5A{?!z18v){KL4gLpSJ1HjP@l6Ajr`vv%=ld9s&H2!3L;gmZ
zha&8{qzuEV3#y}w#iXf&f3pYO5+*VK&rY&ktlnE>5uDJm1(oJy<!u^{K}RRYhJ@>X
z3v39MLc=~`qd6#IA7|6@nH2t}5h{;#?YRox-rp-?##2UEL<J5n3pUeAdb`HLtHF)+
z&d1k<VgLlze2ZFx`FCg&+2hFs!Th&rzt+N^xOoKEmkm?7xk@SX`lXjO3%EwM!jy-L
zAy&3s)YSMbjW#arwS_upA5nb-@qp_vK>N;Lny5AE&+0Xl<T0T;?u-M0sX!-%nPHBf
zkayqP$*IhBgO#s{L7$m|oyt~vv0=c2RM2!|bEJfjy8Lt74jq0Y8&IM34*PuSOffJ%
z>Y-<gj=p`aWj-52c%W1kE(}~$8-qc(VbEm}_D)zwCnf4ZqGm(1@*-$2KN?8YllaPq
z^&RGiM`TfhQ)^wuX=uFqDnI(h$C?`1w{LGIZ^)bjQ5GKICRNJGlP6t&nEnyqaH|Bz
zr>8->h>L}6$|CEL5}06`7oWO&+;L&=Xf!GJwEOi#RM6hvcs;BX+yvE)lA_{Wx2`ns
zVhygH=7b)KCnoI)Z6XB%O@rV}I7bn95LtG;Fw(gW`RPN-pRwQ>JXgkz2~65!hcLEc
zpEhK7ettebs259Iw=^d*G`F7g^*{R!iWCGVMf;i<pzXVsnJ-yl1gqUCewHp@{v7qQ
z>Z5OgFEi`3ZI1kg*uF%{ivr<_jGVAZz_nYO6wiAZkh`e`CvJ45lt8K?+wp)lVx8-m
z*5m@=Ev0dkoj5p^M7w5B8Xubnx@iJ0#!xpsv@~ASWP<8Ywh?H#5x{|VZwoHTHZGD{
z@tc!gS3!&pEKh3w)|Cp2;~={|>w(1t`*pn%U23kZ&&DjOyOEStsCzzHQj1C}hu)Pj
zE$VnM9;|?ZsnE>BhWdwwcJ96R-~6b1YGqK+Jbk3(yGw9}78hZH7J+!e)UE)X2=fFe
zmB2`(l*P*&jY$g@9M`nvy$BOb6E2=N<nb=p>A<3Ty1)7L;JR5oz{<BNpXZ@!m>ut`
z23svi8fChykOb>R_rUZ->Jb#U3`P=oZd2&QI;U}IG1%HbJH~Se1;it{k6x}!95_!N
zWZqExFp7)Q@~TDjC)KIowZIDRsdFLEE+1yYi+o2`r!b0&dfvp&k%!ICvqR5AMUV4d
z-%|a+D4!*KM46JqWQh1u+5SKtT>wfT(I-(Y>=^OwuYSoj4Xomph+iZn<V1~W4*fgR
zMrC4wD7QjrjPhe48Kv?bckaA=BdEwU`quxn0IJNEVMuFQLB%gKK}40oRQ7KM?&3>|
z1Oy!M27444?5Xgq(tFgFxXlrm!-WS;iJ!Zo>9fpa!_Lf7ji}pFf5vZtf-57+I^dCu
z0uF83E%~^?beX;xwsK1I>y{imhWh0sg*t`L#~w01aq|XMKQpbXPo9Kw#?CQ`Dej|r
z{`+XgOL^dut{^~lY*Nh}93uAgnsoeLx8U38$LzrnPFr7^w!EZiBd-P^fIzYnbJ|h_
zUw!+nL(7qK8~f)Ezji#5#)5(qO%1bL!9;o1(8lM1c`O2VAQ_M~HJ6>IucfSq#)*uM
z-di+_XSHwNqrX=qPpuq~YOp07RmFW7G$CP9n=)(;7cSN#08ZdJoE17B#OO;|!wQj4
z=#L;ifN{Ql>^e3(Q*}Lf&dBdiaws|_l}c|p>ENJu90z6)v+)=k3p2o>X3f0c^{pLk
zJ}>L2tSqa{OWu6c#_%`CI&7mD^~A?}rB$@2Y*~t8`M;hEoMW2$a>iA(->)BcoI-JJ
z2}sIIgcu<GdTZ-eQvAa7b89I>#hQsCgdya6v=>-P-TQk$jG5Nx$Qj3`{6*P*gpwHD
z<{=~mjUZEr=s|y0IfziNd2?ch0`ZGRkZLcldPoc*n%Yvx0@`YJ*Q--Z<N3?gkfM(H
zQ)&hHvKq8PrY}sWNg4NeFLqD}=4}@&pr2{PVd2keZyAdMihU6;(W~+zYcT@&6t{Wy
z{4p;%V0yPhVaNmb93TDrJYD(t0(`+*CuHbZyFnL46k|44EBG0J3AK)_)FRwr5y4Ez
zwT-{jwog6b%>)$9qHF;p)Pr8o-M*UJb~%$mdt`!@cSufLwtRVa2ngoGu(890ttV8l
z1E@@2%M0Z(gL`dAYJollvI%tH{*O_cLgPmc4-56WGx)T%PS<Jsm(UaZcG8&8z{$5d
zDBWr7SpN&03G_Ka3QN(%2C(_YL93c0f+E9TvM&0&(q$InGnU<+OifisUx{A=A@}|_
z!z-bSWLM)&b#HMJ#w75+MY0IQ&O!1aKa}}$hC?pBz0i=L&i+B3__`>&#lBzf#dbts
zFr+z-DjW`uxG*@wKNoEL)7+q@tY>n`h-AGrD5w=c(eJCsW#UD#z5ah<Idm4nnNs{q
z$$*v06PMy~XOoT}-$+ADe?x&G_ZN0p9U_gJBkjC2`;hY73z_~!xW9kx^+KvfZivkL
zLFtR*8o!@gC(HIkzQ}nLqCi{~8D3m8jX6LyTz+ItsZ68+MvCxLS|)mOs`x#7_AI3r
zO09c85je=)*tmhn4?uR}_Zm(gtY2+zw%oCKuT8(TQU-@b6SnGdJ@A)=t}PmE(9kf>
zDoWu1i@y%~HW|19NxFYzS5rOjGW5N|_JC}rF&QZ>6Tukj**of>=X6Gq#92#>cMvcj
z(|?@eZBV@S``<F%1+^lV7j2<c;f(bq0Y})$frB88G+&kkG$#Au;d-3^X~k9M&Gfzh
zCO+PtJ7o6HQO9dUfDIt_vZnG)UFDmm17F=55OB$5;liC=4)e+C?Y3fTaR$?mo=Txf
zpQ5WZ^`=wsv4}@<fGjfv^%K7-Kziy5-DS_mh^vn{!xVmJzdB(b5~7W@+UJC+&mJTt
zI#pCu03jBj`Ah!!t->du1Roh*v?8dHtgidda^S>A+95KNtN~((^$xRVn!W5N>*Xf+
zQXTV#$h9wLrG0CUTj#Dv-m6!eKJ(a$6CLxUATpE~=hl3EKVnBDxsxVAw#vj={T`wh
z?=Nl`Y$uU60__uUBJ|k0sIM(^b8`vFOxK;E*G9qL;~|^iMJNnWL#eOayfJv;eYY}C
z1nPNa8jFiY(0z-M0q(mK%*-;+*z}mhtN#zqI5>Hh-gK?tR@%OP5Y2{deLUB<4K=$#
zQN~~Sa&xcn5)6uwRdt|?1ATP!sBL6WpV<Y#QrSdA<A4rlft#Ce)IpEwsQ=2M<BMcQ
zK=?ZuGr(y8TPbCc^-*^JH0(a$$SB?44z-9%z$Z#xyl{^?*h6={p`O#Lpp#KGAoxQT
zt~y?GZ)VAW<^yls61zW2Ql2H1M`eEe4k^s0PQB+`Y>-s#qU;u=abgk;bnBr<!L@&z
zoY0TI!#9Cxy!Ivkx-Vzurf^Jb1q==`rm6MtK#+HXzQ)FRzjlo~euaEWD*1eHVnKfX
zL8L>(5FsVGhUMoE!7<9XJIwVTEP1c%s)63mD=RYz6S<Ra4LCL}QW`rNPEv(9c~PIG
z*<tR|1}kPC^II$)k&#f0{?T3&+BjvVb|-11A|l52&bRr{53vg<>{o2}MiJ8(7mF_(
zc)L7Pxph{43#_k-R#!x;TBJ?LHUq2C)BOW|PP%dL-r0luwS^Q0_3RDFKqW6~>H4!2
zJdmrpx*Tt0&^~3}WJgCWlz3i2`cb5G;-lbmc(H6Vjd;$Bm6?p+HR-EnjSK*uKC2E|
z6+zq*Zx<^okvj|?O!DcW)y%pV@asG7BE8Rq){NxH79oj<P!?h`4E<)hN+0py5KaX8
zzv@X@lfoDV<u(LdkZztO+&7Ngtb!y)Py`(t(r7x@iHa>Cx($w68vY*!Zg+3FuSXo2
zhc3Cz>~Z8NAPe+UX%3({3N06@vQ^mDwq)J+^-*oR0g=_yjE{w>_+0l5%~YRl4R$Lj
zi;|86Os4ogtc)(Cq54M9C%}nm`H2CO<IbMtWp`f3?*RJ|_>A9@qScTtr1(rtTHe-W
z(~R?OH`|c3k=b49pI!a_;C)u>evh6dTkW62<1XKk`mLgS=4p`YkInmd>njvAitn(X
z>oLkeFfXfu0|x`K4n^mP2V4o5pKskwUuhdcUp_@g*HEq?@B-s<j5BwnYK5gT=9!>s
zvTONnsG{AgDt~L8n@^vrI6I5E5T3&0kPaXKtUqh??hYM0woz&BHJCb4q!wI+){Bxj
zv+dN>hX1pF4oCwWJ2Uga)<IwhQpHn(O}MP8vu*xth2;Yq0A_CoKkyl)5DvG5!=E@~
z|A^_TXXiH|co^s+DmISj{ogw$556oK*qpt=i>9r5;q)WS1`cX%;gO{m-QLnUAzS1~
z(X^AGpdPAb(ng-l$q8EF@eIA5YhHVBOy<TiDPZvH5;5W_d@}fS;%N&XL^Z?dDtMIF
z?S)s9F>f=Ig6dz-jIr_jT^Tnr=(u426WH*UO*V?k@XC=+3XK|Vh@p{ILe_6?o*nJF
zr9^ANp-7#OjPIGHD^A%tIE+OV$J96+IbD8&RiWnklI^nBO{iCahp1%hNiLM>VA(c6
z2}IZ1al@;DcOT9juz&ibrS5vn2Nf2k9PHM<ZCfJ~lUtK=dMtEuGO*9P7#Mi-NcL;W
zADW}90PH(6O2ImTaF`#o2V<rgKeyub{fZ8kNT35Q{ll|C2Qxq=Yu&M>1TOT>K3got
zIfYM)>~ePkd*Ze;SG4X|KT-q?oXCHHdtkoUNInVz7DR>pU|tzF7x$hUcE|@M$k_`Q
z5L$0QcgW$6SbAo7=f3teqn_;``d;F`2_(Rg0^d5%6c$YFb^;ez55#dh9C;BT!o}ZD
zcu+NFmchxTjzx1@u5?=B?5w)Nk(e)-w(7>{=)Ymv&}w(Y4F;$^p_d>rwzs(Sq#eKg
zt3vSLzIE#s=f0K7?n?7(j~?wqkXG=<(b@pPAWHbgDf4nbA6&m3xisqTUeILO1Y~J>
zK-0eWUQ?Pr@h3oNEk}+LK+5K}o}QbU>eYVYvU_|_6MnL7+hx=9yM<&3p7!f-ivErr
z{|^22%iHms*(b$w)y<w5>{jend8G2MBi*A)EKV$re7a@sgV>9YEJGgcv$!&MiqWX`
z4Z7daYP8<3QF^;u;btAq?egglkglIL@=NI;4Z}uFmlnl_#CCjD#Hy#y;oh$fmAQwP
zv6XA=;h5TY?_PiUe4I`c&`)$9h#6(q0am;x=)I*irL~lQ0dfQ+pL%l8g#aU$3jf&_
z7LQz18nIXdN#7Oo>zP0z6kJ6weA+i3yTf=12qk^zGuUq+lvb3+q7P=9QsfNumOv1I
zSCToH;ob4sKO_7R1w~<a4}isQIN~qbR^<ldvXY)b-1eE`>4gmf=!V$YW@R~GyDyXq
zhU6%qMJPiKY`ECn$I30!B6jwbFr(OOW_oi(On|m=0Y49d9qQ#W%dZB11snDnWOsh!
zkv?GD@aXgT+i-{DuO8|b-o9WS{|sLSWr0jf7p$r6ye}Pkq7@O+9xV9h_u^kgcb5h<
zxUyG}RqU(CepCKZZrQniBl;}<2YuWtQbZlDjpNxD537e-+waH|g)#e|tPe8Na?U%P
zm6=JOV36DzVhwEKOz|BI#*l^nHe)|@Xq&i#;FUzPEhARs95JgBdQ&Xkx?<(wdiBsJ
zKV%yqZbTGW8jg(<7p`zisHARSu`qS3w6()QbW;+&yec9HTB00fTB{9GbKb<Wn54JQ
z0rZj9aj6vmD@d9~L|yvBnoRwi4{uV37pm&y6h0mIAR$~;g+_5U%$`uk*doyU%Ptw%
z1Csily?d9H-_@X_lHG3zvam5+bNRGEe)Ij^Gb_W!#@qNa(VPCX)t>C$((zvj^8>O$
zitzmN>~4x5w)DB#wC!?>dLga43-tz~D;vzf%$n=f4gzfw?!R70t?u7*6F+F>nJm3X
z)o~U=Rr^BNsHOU7EZ8bkFJyR%l)EIBbZWGfNSnSgeUFw*)cV&K4v=A#XO)$G8)eP2
z)$Q~&E8lPbkeY=4Yn-A2i=t)Ao$OA+4_ulG%u(?ZW6qzofICr~F-bO#795*XEj)7R
zp}!m$l!wSqscqXUkZ{DIZlu1Xw}}-eSO03?p~E?5M(DS2j|qP7<~hg-w(WKB&gk6N
zkim0D=%cPdIEX*!!me9BuSV~))YH?EiWX2t38A*9=O9+D3iHjC+{ykSC~bZRwuwyz
z_O(1!bl>8#vfV45>-o(VIcDz~{&}cbTmmD@x;7`ysQ*?^UZgAy@w{;A)T!c^Pn*W{
zVw>Gw)9cKnkj6w*I%e$H!pg-=E9S4`X5D@~0x1zYv5_V<Mw2aK8wVI=f|q~+QL(9a
z=!GXNS`?`};PZyKd7%J~>rI?4HFfilm4e|SK@c#DU=4(GgFZc?;K(0y2Wh4R5#i!8
zss%Xq{6MNRR|OG--oSwoI77qFejaLH@$nehl7!GO<myvkx|ED8U$m|=%bMqqmzOu4
z^0#yuMKRffrw2d|{!H@oalSmz4pk4r2JuFbJ&&+r*I0E9sY3FNaMD*yxUWBb$)`m3
z@~N4P{QlSeHKP)FZ7{n|oz}0YTe4o`Fj|V0Zz50WGPsdcof<sMxlw#qssni#J$v<1
zAbJDzX%8C1Q(OCtI}X#)P_bRR``H;Lku%;OEUpWXaYzynOwuYgv3<da2k^FIZLoT|
zS*Egc+WPCPUo%uPaO-*C;6d6F`@%#In02X{P*IWqtdnX15oBso&`_eHlso_~ezb{E
zsqw&@Xpv;|7an8uP9k5>##Ctf5I*LQL8+h!=p9>FyPiLI(CSoK=b&+-jxse}{5k$D
zm0CAL!=#KC@G6X%)jNIqv=lL3Ic9&FHn4A*&Y3y$Vd*6>LQ!_pAJ_Zko!zkUF8$i<
zh*{yG-Ctk$(>e#nDb3!cPp!D(=H4cIt)3)wFZYELBlw^!*$!274mHWaxjl{gVJOQt
zT*?7=&C&1FlFmpRV9%f(1q_q<M}m>TKPNX-tS1|D@*sK(->Z+>V09RvXhJ3gO_goG
zJsG9LKHMEGjXcFWELtX388^Tm;Q`V*&rEet3=eI1IQ!Mp_&00vF&N?60GwS1t=dNH
z#|hC6Z=0|;^`ch|MKQoV6hR1JFIczN%+COzNMU8)uV}RX{nw6^KxPR5o4DODxepLD
zI<04@@EpLi$Par=O9z%J7(n2pCcz$<=i}r}!f*h+ljREj#`Ue&h7S#k$vD@ddGo=z
zBrLv|cvX+548o7|Z#|u|^DW!7NqghDVk}+~118sH*!)ZfU485lmjB6Z`t;oTd(T_>
zg{mdcWS(D?t0j99@N4?!(<As+sP^v=S`V@3zhO=YK=_}twsL7@G@OL-oM_3}52LV{
zH<vYt=m!%|erLpIi*?xCBli}fglljFwH5pn8v-~A0<Qq8lk@QBWy7=zJ<97jg9Z#>
zk@_8um<&e%#Uo&@C5DP=ot|&PbwQ4SsbG+b9QskqJ;VJ3)LoB_lAJ47&%LgyaQ!Ei
zuH}O6*7#*grWpYyExob8j0~JR;_t+}Z1LT$qjhJkse|{m9Ol|fua?NM2EbxTeq5{R
z)29pQz^^b_^!CG=P&3)lAY382L%Bl-F_RDn4T{>>f74hcCi{MaF~j0q@HL;Kw9rIC
z02z&c)vY5&8_BtVJ5>C_Tp)#|S))`JyF~7g)OI5CXAMQvS}iUBPf$=+3Iida5&!Z1
zh}OWR+#a!(p|Ql8`qY3~Q7{#-LYFUJHqT%kytu#=CVA<Dwg4QWEFw%GR{Fv?JdoR!
zYuEf>48$M&VO4o^z7*IdW|1dAnQ21#6X@ZP?;9OhfP59Tr~GtY08>RMH@D+cke2s)
zaohQh3OYOnnXCc54Tt=MTSz+8$GTPF_5t1hI~WEXsk|1gw|8}2o1wFh%o}ud;Q&*^
zZevXhjI5;m0mB5xsw0XdV+JK>9*|c(_91)hFuhMX`5oIOy_Gd3>N&OV-|z0Obm7yp
zN5vzL-;-{JN2C@LeA~Y2Tk;=22njyabg(P-OOoCfur;U^*`(^NSLtxeMZ+!T%$WcX
zzN6|*^WPcpyz<+LkPD^kszXi@85=7aK%B&CaPq*eWgR5~qEpslb?(4$s{{(!K8aw-
zxGu33_@V}46~|dG_Q_>kgJ6U=D}*cMrJwE!wKIf`N29)SVt4J?a}66}8Tr7gkD0Cc
z3o08t3msCksFi;H(7=TLm6g#l<%J@lR0thj12YVuq7N2mD$tL<#r0TL2Z2%#20a#{
zk}bfO7Eey+Ey}1Yek5OcwbUApKqg|0i3Jehnh_n?GZ`qR7)HN}$Z0BCbjk{r%rJH+
zi$D7Vf=2zTU>)v@FV_L`pb+OYtF0~raB^~FARMdKU6?bVF_sO~fT*XKS>MJ61Nu5v
z!utL3sII#6iCKlGPY>V(7fj6#X}_7MKmwJwTZf50-rH>E%z+1|zfL9i_(w)+i@zbz
zg@8T6;S+Lc6~~#FY})huAK*FIU%clOsAeE5%ecgBtcfkISaGpwPUiZFAlEca<tq}M
z-|u<5Mg6*&;*dmYZV<IO7h37}X}P<>`Xh!h1!_8eW=2jR4z;q~eP)z}e5H3Xq9A_P
zyb**W((4ho8Fc41kUkvC?;fQKLo}+Z5=1T7z$75qf*2Js(B8_@EBf=e8aa8MfnJ^k
zzy_+S2R%I&T7^9FZKFP{6PO*4GS&dxAGF#-hsS^PX5mTUE5$dJs?P1ZcyFZ$k9W7x
z3T~^cthpBrGs99lw__oMY*uUY0Z5F?r9+HCdq_tF7s>l#)v#knn$C@eOsZ%QRTy{%
z1weq=jlztydT?%ZeJ)!Ia61CHx_@Zg%}Hivi5J^51H&JFa?!aX;xth7g5BvkLB<dS
zO%N$i#LSCL>p5JX04=~GZx^doC4M0J@+vfq_rngbq&V>8P|3(3B}1VyM1$P7IoQOA
zJT!1h*wZmQHU_d8#h2)u0sO^%o8r{a`oI1M|4<>%@uw7gDk;hPaCLD9evWwOh+GHn
zb|z%%FcG)`gs+lB&(eu`Z!soh;FyR9k}Q#1%x>wrUjQ{aLVr}<Y2B)o0}EtjFBk5-
zP{W%zR@7%WmvH_---Ud|?sRVa+5Qavp$W`=@ZcH)siKz0=d009zt!k0@Wg_PM~UDr
zu6DLpHq^hUU2d9Vrs3Gy40P|C&o<@o-_W08OWUWb9k*nS{e93Zr0BS@?OCkvk3M2(
zQW2Y`#Y$QOgLu#Gc8s2J{)fH0ro^r6NYp}V&fNjoZ(WMG6Ur;1z;M=<*U>{Btv)vM
z%aUm4S<ez-o_n@EV+e_Iiy;BpFX-WUofE&QjLQIloVX)5+qi;vN=2LhBaJF6q%{7*
z1@Y!}$1xmp7z9^Zcn5-G(lT`#wo(@j5U;&y+v(lv-TsFyhf=Q%>h(BGDD*$w)70tH
zM+2B38bY&0<4e<9zI^nAsI#g%Za-0_6s~s4W{(Y$aVTx9`O5RAxW}skm{hGbRM@?=
z+n25!C4b{(qpzEB_PH~-I>so!{kWJB2_f2H8Dt=lSx~%Vt~`1aH!Cm0zQ0h%%;1xt
zp$1@0|H2Hhx8C{o;+RA_k;Ud5ATribm>xz2s934xNJIDois21WjQS9z-7VL$DU6OH
zl9SP79^C2=-PG`Qr(^C8{Q=*@#Cae_dj$nu@F1Yc=()-qX9;BF+gSTU;Y$W12FSGL
zlo$>TY3aJ`&Q1L)dkA<TKz~BiCMRR?rNU&A?cF0rYB8VL$ip>3$;QU58drlvq!L&H
z?BM(%ji`Eci2eg&ovqM!L>3IA1cf+;<Wqg0PlACPm<F~MD&!TFDPQ{^J??a}slIpF
z-9CELJN<72&AKyghl))TU)&isqJfbUAqnB=CPFUvY0Zaw<0)rNZpssfa06MTO5*G`
zm@s=r)I+soMXRybv)|A6>^0c4mqnbqoAbKYl`FlP^i5b&PwXy5E=;4^mDpSHx$te_
z=1tW5VFA~wT0l|CvN#%RB#t1l$2CXLqhv2??@v+nt@5l`$8&=~rj9WB4Jsj{YHX!o
zB9y7KV>5ckj=vatpwIBq*b|TvV7~-#esc+^J0EywSQtBe_jTzkF@_Z&`9aY<7+#YC
z6&D4E=J91LiG6wH*A~--|I-5GiWW>PPKH0SIhA`4<sh++QwRLbjKb;<Ka<~grp$>~
zinU59C+LERG+gbvBEspUqP~Jwv`=$a>ufXnEt&Pl+9c*uUxKAb{|YX-$0oL9`a{GP
z{I|$CX0nx~|4o#@$nM12bMM{>msVT(n-T8j?YKe^^;sOtG#CUZvwr@ROeil|B?Y0)
zu&n0TChIeN=94|(rzH0R<-^Zc9do{T9giW7L;od@b-DSpWr&UXFTC50NSB!0TDBf-
zQy-o<%SAFByM^AoPe6Epe8}eK`w;Pn5V8!Qg`~&#=1OrF6;_Pd7}vltU%3xs<PfNs
zosWvmV=`s+g5m7Y3aZIe{tZHDEq4I>Nv_wisaTUNuWQyQj7kzWx);=*PAoiMg9D2Y
z|I8pkmdg4ZfKM^-V5-QoPv1|Py*)in_d0tn4ZZT^2M?6Fqd5Go&WLs^UA`)|)oc>&
zk7`@zqJE6h!(eO1);`+#nDU8u!rFsoR3lg_h&MWvrO7-ak`zx|O3kKCWt>LFqit-8
zUHy|$+n{WMbH0J6IF?6DmEF5H^-g`S=qfYJ{QLB25oSIfO&xK)YM3V>LC6mVR;A{}
zIh^V{&uH+*j181w-Bp5478V=OyI8rU$czz#zN<a4>c7Eo6SFNSoX?&)Q|^Ay(!J5)
zk-p(*keF-_K-(j-Lw|q7FzX<7;jId{cylA5a>Ox9>Jfwl^t*p1_Hr1fmmPfM>V_7Y
z%JUuFT3HU)e|BSYs8@GLBBPB>Jl;gC?7@Mge&>Wdh;j!&_O2>iY-e}B;#{l&9h}Ih
zc79#y+r3fddeKTtn?kGuRxdoHd*06W50v-{Rpdh*SP(#~#%FISeejzWDT8N-f_AZQ
zn=C55o>Y2{j{R9-$z;)r^HHlNT~JZ|xH{#F7ElW}+>t9x4*~fvd)ugX=m<weJ!~L;
zML1zKTZeZA#3O(-_OUk_@}3y(+REdT$ttld;Z9WD*a|F9U=&YX3c5o)K435Qu=%xC
zH-_;?5_!(6dLGSxPrdU4-H=Yn+zew;F|&5fk~v~Tz_2gu_A^Vg;Z@1v1-6Z$8Py|%
zix~njP4leao*0@3Q4!DtpXiT#4TA{#nvyHKn`&c*fh9yyQEK52{r>&YVIxvaE-Wtg
zt{yJbJJ|+wlUYf*V<d$WzmU{HHdNcr0_@~6Eys=CB6eOD5$oy0Sm91&$sSFc8Nv^;
zl3H>_(6}BHm?9%XnjTl1DYj#%GN=#VQ?v^Cr-v#4m2;Cr(Y1GUHydYm*>A1wxMkQG
zTAlHJ`_vLh8&nT5R$qs2`BObKbyat{nz_Y)H>}p?^^W_M3RDQ{WCDs}N3Mcd*wDS$
zHZhwU>lNC@-qJn!;)M$zh7_EDblktBAYm6ShM)%H34ch9+YMq&>cp@VWGy67&pe0<
zF<yp<fa?*-I`h$(sC;1xR#QVC^bo23gfG*g9ttVNV6M26a+u)~7$&f3J}~;;LU9e1
zZ8}3x=NPUxKRQLEFhsH5d$-6YCmIJYRUIZ0-c!<tA&{09Sx`MNV+8QB>jZ3LRH8J&
z63K9$+4wT+1J-T_84I8ITn9V9!Ek7f5YZHbBuDWQ3z@f<EZ7}yS)c$8sOGMC#6H}(
z*TbO2X`LnR;5zhY{j>ZPBs*pzC>z8KkCJns^P?J|Ryv$MKwOSm!^JOx$oz%rnJ>`r
z@C}V~hSj#Dn5=qZ$s`@d-<lds1DQNqWR>EDa+q4EuTAP9q#gngu}QnID)}}&0oGj5
zLm*2o9p|;PmD-n`jbFa3T+8yU6`_Wg;;k3$@A{AW39q3BJ>+sOoSc6sbc5kmSKrPD
z4;;9bJ@Yrc=Gwimbdo?P>yM<tT+e6ApFxt!57VflQ;|YmsWMu;xc47B>xi(9^CZ#J
z6ry8H&&g@Sw<bx@&q>`uEyqHsQL^}lG3TEB?l4!hmhDxzQE6n|B*h8P`RnHo*)zw7
zMxraTOGGhQn#c@?SUFQmgSLz2U+jWEGlee8kn0#3gmH)Y5+E>M{Qyq{^^<Jw;QiA(
z%OpsZpIO3^g1JZ%WwQowjlc1|MF)Pn8_oCd?y(<~Tp2uHR*52h;r2{(CDTqMNs}W*
znN6+9X9t~YTr%?4SE1~s_2<~PC)7ySP`&oEA%4Sm%`cz-Gf63u>EAS%>?~p&i@u&G
z^@{F9tb&9R49Or`*0B|~`;Z}>nZ3jEiRWA3`|Agq*aeFhizT<LLPfzVn>W~{M@Rjb
zStI6gXe|c06=ljMixw?{!*7cdFKqx~4=LR1UQhPjy}W%an+QVYt=`Pdq(=*&Mg-zb
zKH0?`f+2c+Q~f@Q1#6&J3`6FXl5)Rou!!1g9C;zJL-FZOW${Yf3z2IKP$Au2zI}Uw
z*?~qs#yQ)4B!*IBP#!Q7WNNrgK_$=qkSLr$Ex<D5YD|kfk(}IxSPbxr)H%g`P;u7_
zA0?}Acw64r>E!;57LC$@K@$pZ<>d6jaACZLl6d=yF9#1sY-a(+P!*`X%EPo<Fk^<4
z30S|q@eI9O1B3jbEXB_PqX?e*I?Ckx_uH{{l1<=uWIYh!foSdsZ!LpR6xq)y2|>Oo
z|HXxz9+#K43G9@V3@PC?v@GPdJ%<#BY!)v*=O@Y}g|JtAUGPoXnG#-|<R9dPgcAba
z4RC{A;VU<wazuQ8!g@aj{-e-elMCn)U?1KNYOrnd=4Kc`%tw^^dlMUP1GyYEgp|-c
z8>D1Z9+)+V&oXoqEja%Hz&MK}4ZdL`FO3dO%fMh(yZ)uihY%oTP`d8TthB|wn!YM6
z74^z{2_ds*|JCd$>a{C>AbcD>Co<e$4+IXgnfkuIzdUo?={RMfRPv{%&fKDnv<c8!
zQq3S5lNcy50zdb*6iPA+%j^IWsCr^nQ_PAFI5X_7FT-k>LSaEpQ9_Sk!#X7QqiVa8
zvlUtodn);&<dw#wOI}(jcS2W1;b$WAJv`-`?p9v^JbC=s*2TqV^x92a2Vm5&SM|$V
zFsIZ~Wx&GA0f+uMefp*BIY$P<;S!0{KGY!AA>JQL!@L5y*cblk{ErMEp(0^-%PKZM
zCNS3ebkdTHj10sKNpIX9(WKCzlRCde-SN`=qqdG~@gi4dfO#31cMyT)L*#l{zk2!d
z3<;m_DsUnd04g+5A7KsRwzx2nt$4ecQQOI`$Id(wi)hR1Xq{p%0iIaOU>)2bo34h{
z)m-Jm3Y`wQx|xH*9x2Q`Uj?MZKU!)=!+lrv+DZ+@6(=6LgJ5j1+C@rnM{!K_W?Jab
zt)tlvMtWp9R165R2(z+0SYvbmMqwtzSC}&msu$`u%v2T*8#XNQZ-zi7SzZldAuEJS
zQbtp4@(SW1e0V04VW|Q*s*+n60$GP35hg?vsf-8~!luZG0jl2t9jZXC2(`=j3@aAx
zU^<?)#K;uH2tY)sD7{;%58F?H^xKw>%0b*zcy08yfD?*<e5f|IA>Wnh<z~&A5t4As
zknvfT2HC?Lh@TW_A3eYKg&==FKa+|hxND+Q`kP;hc){2oa5_B}o?@e;E%?#<2Q1os
z;p>Cjw<8OEm_w5-qvA~+=v2Zw0HH1@+9V@F*~+#og|2G^A`uH8?Y?Jtw>X&$rJxPV
z4isB@h+Iw>(ndu^#cr$i0$t$8ftlD5czYq7s+Rl31^k^&Er`Fw*obKt&!_wWY?isZ
zC%y4FR+E`LP;piN#wpiXcLQ?IQSLN!#Xsc+4U9%EwJOboh;cw6erSYG;!StBX~to0
zWM#D??EywmL{uv_OdAO)IiJfV8z80+K{8=LAD-}PWnG?(VUh4DW0BDQJNM_tpC|iO
zCmSN5kkNDb<EIXM90^uUnUH(`zVX<xV-@Z#XB#WMi1o$w?DPmk+fCfL-}Pb2f<-^F
zXZD_X>Lw8aet`j-CT-NWojTPE6h%a3&z>#2di^EmO9sy5zX%pY!GiS#+$r2utLTX~
z<yC`ozlD9ckx&d#gmaL*Ny_qJV7N5sqNc$gV`N=TL%t7l{g_!zMr75hZL4cd8scLy
zu)tKcUE>WE(!Lek5vB+(X)onWIIxI@<MLHi-(<_!ALn7tE1Fqb402FiLBX#1S{kzZ
z*29BCyr)jx_40M+cI`G$U+u`d@N@?y^z0<hX!{CL$m4c;IV&r=FePZ*@jYed+rw3|
zs_`1;(}r2k$odE%uiNTUU(M$2bD!yBWD94FtoC5ef$0Eh3}P;&M9+7vbjp%jB6=sk
zia8JP?-VvW@g-%A$78lTh;t8we|LZ>M*nsEFU0v)_r1mhLZotbe4nRVP9lSmrU&<c
zi2T&y<f66gNvq`ov5n?7JJW2y3C-88P5n?LNa*IxuivS59=39+g14pfE#9=TY}AA-
z8oKi5BQ{l~x9CSZ6EH89`I^?qjZ+R&>w>a8bhCRk%ELAx7m+Dx^KKt8<l-x@&nQ06
zX}ogL-D;I14rSV2MmADzF}fkE5yYnOs^9svg{`8o|5!7rRAzM0JJs*wSY$#MB#S3e
zn-vb+D~kb`5YRnxJu_UIG4RxC-TH6;CKU0jMw>$OOe(y>ckjL*RlC=;C+|+Y*u;m8
zi67<w-1hH!&QyWCav;<utyHebMnt+CW>k4@s?!21#-eEy&r6aV?_J$%0Y<V1B(#!(
z{@co>Rca^3*KD_ZMGCM3-k}S6|9HU;y3ZAb`5MDecNw|0UEn|J?vUTbLyE4Ro(8lj
z+>3b+7izW%z7<tkKTvc|m_PZuO3``9C6IVBDzZUpWKgt_)(#*!XNc6=D;CM#Q(13}
zX~OA^n=GeK@3DC?bU&ynP#Aq*h@TlW6J0xn#7PEmiBzo@gdy;ga0=z}5_#lHUIQZ{
z!OS6{K#*P&#f&Xu#UciV(Z=Qe$+IVMa}Qcid~nvnY5_eO+(E^|Z+YMY|B4cXLkfLx
zyNMla)6DS6E>+=D@AGZco^mFxF;5_Kp;$f*HZ-gi!!o9C=K22ckjaDW?6)(^U&O%T
z#y63JVFiSA?PR0%M-IRKv~{toYih)`%s6UtK#!*p?-Fr%xM(>Eco(Q%{J>;S1jGqX
zd&|Lt`5w`SsgEcvXZ18;J8<*KVdf05f(OB$6_q&yh{tHhUm3Lk+y##B!=m^32X2nO
z>noC?4jmr7HFV30WD8rwAuVDSt^7IWW(BE&hh$IV<5st4h4PRX=Bg*PGmJDWMqQll
zCL3l^WL^MffEo7u=>2o(a%lb##iXOrrTA^Dq}1c@X;go++1`W3Q0}UTCP*z@uk8$;
zdbk93s^)aq@PXqi>*jV#n0F!D8T$^!f)Tgu&*|iWE`Pc|EcL{RF;D*~E>t=7Fsb#=
zvaCWYIxWq~CTxN7v|B9e5MRNtYQZF+TIIb{JFq7n{r()>!FX*zVsT;blBn%A-Wtd-
zJboHO@OU_}+S1BObh`}AGLwesp?Q8Kvua4p@viAiw2(#0)DWRC<I>w<oFGo(<c<aC
zjTsS+Q&Lgu(&dDc)$G|TXFi{6rSVU^ssW>2!f)aL$0@{3g~EmE^Fl_-s(}@cEn>|Q
zk^`3f_}pX1u<oTVShdT!Mxui>LFhPJsyoT^)lm<XzARqkGQN-W_?v#KfS1;mTVz)D
zF3XQzXMD6ow7qy*sclE}K%@oDE(Pusob}JlR3^fmO%Rm`t1f#C>=yrOcNe;>!?73R
z;!-ub-;54G7K1ieniii_sl6B~u9y;aP*|8%Aidu;%@44g$0EnJTM5G?bV*5`#g`b(
zB?(hSQKsYnD=TBrMqvwii=_tJksQ<UIWazwH}@9jJlHcVATZFzAm_tDBW@YCWpCaM
zLsm}wz!6D7h8iiaiBw0gepUoJ;74O^BfhCp<U&hQ4aCk;T9;9j6rW*;wAMgfV@dNy
zzx0<@d|WNoT3+>BYIO*pyMu$p$<tHm7{%?4z*N^tw?5$~F%W^%XSEn$7=^hoqp*d)
zy>e+VL9}JvGp?WS;?sTv5wZC!S1HRfXw{(}FKBi{iW{%g(#OX~=vXjzwdAV-Ejw=@
z%D}V0$Um>D`n#D6?GvyItp-1acc;(59Ht3ZsKLdrfLPU?C1?b<vuulIO7o_wTk@4n
zZF^|W&%$bjj1Q&5^#URCyw&2V5+hr?JIx@SQP&c`-agnk0mmUm#Ux#dCW+n>8Ju|b
za)TLX=$8_oonG`QAg(&`V=>cr@}+*a)|gU-<T0|eo5{FMz~|tbSaXHEnLB05U89|q
zE&jikY}#lpjKxi&733ot1%&>^v8A8?axA*jgle(Eb$Hj|I6QCNx&Y8vq@5CjsJK2=
z7w7Ornl^8q=<1<Aa3J0!XK$VfG+;?sAn(Cb@4){3)a%2Kw5r%ha=Du`gIRR>Ke*g6
zKyvN={c|=~X9&Omf;`vQmd^B;MHIP`Us`d=Y6`$Rh&CO3+Q+U`JACnR$EV&geDXx|
z956o=4H*I=hTr|+iFXX!1O9-BMZy05V-FWiuh4(9qzkcMmXlHv(P!PF;Su91lZg|h
zoM6J8=5iDPYKKxOEm5S+6^Hpx5t__8y85ZgM`3n+vVzKQ=9#j&M-K0@@fC~_h^;9t
zwuq4F+H^*nPd+>%3@{jB?`7=hLkBEsM=e{`QtL-hL`3O2HTOS$Z|xO_=r@k<5AGzF
z1fk#7HK8eEbV4S^^+nvoCzDN!@y_XwJo;GA>-xdrG<f*}{7<AKpyCj@A2vl>sg{Ay
z7$=r>vGfeWXp>zW5G@N=1)xUrHIDp*0k^b4cz)?JAcIPUUUlCX9T*ImF-DDMSk;1F
zoV|c{15{+Ko{mmy{v~+4Y}G(JBEvo4PQaKw-}M_h^Z?E^0%Nd)ShNE&*~B{6QzL(D
zM#?k$P8maKv{0iWrjbmB;7LPq=wsPtQfU9iOw))?6n7{vyy0S`0xGsYG!*2AE{u+z
z?n`P`xLzX=*_Ac6KT%_}RZ|Pvym>;>XA2t}--m-T<ML2!y@U8{A>}|>3Q7<(Tk{sj
zf<DH@#W57L9D-+U3;jNg&adAp7zyVZyQ&=e4@$E#);M+}57@XI7#<Lsq9jwW{C=7M
zaSUW21fEiq3XpNT4j<mZV)DLyx2;>ZPJQeyw7xVTG}OHEwg_;*2fw8hkz~Zs8Hnaj
z??-N3k#B^^h`fPh^Jrd$2#j=frz{S>x9|kAKN2NTldC@Maci(AS}AU;1|=#}8Tqby
z4g*vm7wiuguOb~mF;MJ65;kkm!nxyn!5H`y(sF?;Lvg}3LmV&$Op_pLdq8}6vAm?v
z6g<>6@`e60GlEuxV_}i*1jf++n}nr7HOWKPMJoYz*AvPYhIqth5knB3#sfYAWaERt
zSrQVB5!d_a%RvHZl_z=t<N*n+=PwmOEC#e_R|$G7M4N<>&z|WKeb6|gMiN?{Ps}Jg
z*e&lGG3qv66!XqfgGAjt!T(76_AnMhrDtJYMY2{{a#u^Wp~`M<)#e?iE*;`)Dvo!U
zoV0I0b>>X}dkaapO_+URR78MEf=!`)67m&FMUWV5kWRl04P`o;FM)Bw)f+bgII+^O
zR#<*t=IR=Ap+$VZACH+C28d^V6L~+COc#Ed&f$q%a5^TaRa(wbl;S&z#WI{X$+3FR
zV8e8jEEHLQdV7zxqC7NB+E-tE&k6F7&H|*PFZCD_cPTNkKRPHO1W0&L46=|t@1fy&
z52JpZ$KlH^sy73D-Hxs;VXs7rV691sPaY<eE^KAP-ZNBnRqyiM`e;U+Jo#zD9;RBK
zgA!>mmCw{Kw?8A*=lj!U_PlB~M{Fb`h=qxX-D6=9kcnb-n<m0MoC5A2SPg^*^soRQ
zY-)I^q~|3kWDL9qM1wtR1SbkKUFPNp4X|ZvFj5CKq#U}5IqG%nNyw;(#5xdvkxs#$
zh(9L~`g;&=F{I?9uqO_KmYL_avj2Fy&-YK<{GPkw`T;l9phS9yeg+2d3LWWJA|A!$
z{+AcA^Y|@N{g?(mMw6TT<5RwDQ~Gx+^iR0Er{|}cPxNAyN7<}?-4lfu@u6ya7RE!A
z>|I2^>PEoFjJm;IQz=OqfPv+jyLXlY!on3J+uC`S{IO>=%A@2Gb1~F!Q5Ml}^fG;I
zE=3R5|85LVL;4hP63aJHA;7djahxMJQ?Pu+v-P&@5j}j^CZzPj+i%~I@Bo`M6BVBC
zm#bZ4w(gI#h+4%-1$1+xY8CK}%?N4b7N<Ea&k(CC5P0#$t3@Iq+z3n-xsU>qiH&UV
zDmkZnb{x6r0$s^CfyfRpiQKe#b7$S>R=!k%LWoG+N<}J3UX(9zl{&GA0;Jg%P+T;O
z#C7RAnU<$e5Udrwrg&_C6_of6i6#ya9R$cBV<FAEw$|%Ps*22pQQ=JFkI>{6<8l6y
zk%gV<r;+pfty@=bLT07&FWI%pf7HfYfufX=LU%5Vx|sH&Y$lZ4MqWml7VKw`P&qdR
zBQ0FKwqu9E!mQth<aAa%e3LTmy{7c0mx<WU{Qq^p0)d~<S<3ZG{qf0)gtd{4oorZw
zkXzL+F(E;h(p(+KWU;-OIm9g~9R{0Wh{K$HvBcnDqgba}tncJh=Gidq-a_0y&a!X^
zM9&n#I+5u9hDdrP++AR71Oq^}h<JEPyK<ydt7P+IyqitZP{fu&F8Oiw6_u5*p$C96
zbQw8fZC%2LHHH;8W!+G^pW32;XaCazG*kO_c3!O0$2SiJk0nJ)^AFCPv_?ywJJCyn
z7|Mi}z@8m&(R4k$fFmPVglH`z(pX+x%V{zF^$Vh>2JtI~Cf=w+@r4){F%4U++tjFu
z?Pu4Q#|4kY%!l}2%05Nj%ge;(+f%Z7%Eq`^%THyNHs0`d;g6sWb}TL_JG0{TG=b$P
zA<=@*-VwQ)t{#Ct#OB<YZ?A)P%WhPeC_vpR8>k_Fxt&-9-rz^>*LP|S%3S5A17jlS
zDThW@gP>56w20IIjlT|+!W`UF!3j@WUuxH;%?_PBHKGBVK0+CPTZ-4t1bfIp%<@A4
z$VpLLlb@GGzQ-O~5<TF>MCa>|R+zcu6^cGG{gl`Yv8J7ZY*T3HlbO-e*RoL<YfQNw
z0LikJgINHRTG707eP{ux)cRu;74G}<YwL=bxX&^8GcsC7|2JauX1JzlL-+$PnfKqf
zqzTxEh!WB2)BsyOD=&Y1G(}!2XNOvU8LB17`WVa1C1W>*He@(_M7KezXyFio5Cu_H
zE^7EQ%+}yPE`f-6{I^~%L1))%*GA0T$`lM6$)RSY<CuQuOdN??Gz6zlC%-xu(*(W1
zpGIA>A_CU)=WAOfrIlolTIu-H_i4Z^I=<bT#EL-s+6|Z{^vdaZdGQ%t19{w3AsE+P
zCUgBU6Z4q+rjh|kRKRo*LT%71>bg7~09BWiti^0GX_x?qk0se>!Z9myJWJ7hg=yk4
zfWqKs-mlab`W+Zkl)|jSk{OX5zI8pQ8&wrYzWgBul;BQMWfD5DIHXRiw#e*@#u+Uj
zT?uB`H>vVZOo<1}7DiYs|ET`Kvjr+*HzXP2wZdTP4p#iJT^hOJ0%QVt1D80s>GB~x
z5AH?R&1MCe$VBGK`~)09oKj=d;$Nr;vk-tzA`;VJ34P||l;`kEmIX1G<&x)X>E)m9
z@Xm*x8^MS&YrO!hBIX{5o_9dtb)*xJK(%5C#!KO-BCI??hsuK?#k0z@>xr(pcsY2w
zJj9V(biYCvk6hb@z6-7(!hbIM7El%O3qyI1pJE1^EV&rwhHVO*Wp^GrMA^Z7+xb0(
zkodqCo?I^gAhvzPL3v$_6pQSN0tc%ZrpT!Cq&C7Mob>cNY5M!icpn+Jpsu#IR_<ri
z!{=pXy<A-3nRi}eeBKhI<ub@qua7TaUDUt7<6F9|9Em4`=Ac3QFlLjDs3bfTQ>0+i
zU@O4tMaxzKDN!SER_#y#F}w0H!RTsE&JF+;@MS5u<ui93nQHSreXn(OK_nE<r=uRh
zn@6Fc`whj_uB6qEn6!n|K6%HzeMEAdvB|4&MgE?=C?8WhK!pAD;oA3i@=~b*MW@oS
zW1d4Omy&R>n_994bt|kdh3ZwHZ6tKHq$L@>M>+P`#SkV|2HO}jNYBr=vfTRh`}bPF
zu`?bO7TCUw<-N<<CfpUqO<*4Q#f#C{G8-=Y8@-e4Nd#tOg#T!iWfU>sw+4VV^&E*^
z{;;|E_C2w<K<q<-EV^aJp}I$V#1eJF**A^fn+#sji}`#ya1g<ojLLx8#TTuP3e!)Q
zfeEGz<P2wL$4s{>nd?<}ooslE*BYyB8o-*2R!ufIouqhYPKV)%bF34lA_vWW?Qwo&
zn*l@&Sl@<VJ=Zcaw#t*@1%3`FOk+L>+L&!5OeN<jUp&Jf7atM!ZuPgnr6`${5BoMs
zr-F?Y(uYEd*KB3KQouM*;6i8vfeVl3z7~-kk?YdY4gZ#W8ZYsK5(OTFfeO_nVIOD%
z#ZrpzV8SX8gk_l7V%m_uXh^iu8QNLkuBMD##EdJ4iaW#JCV^?N^%Uw>Q}aab?O>W5
zDhA9_>xrRqc9g65m(bfYETBZ60}jHDgh7y<*t)_H4Ov;!P0Z}rZ{P&{3QCNlZE>vn
zVuOQhWdp0}siBS09|gY7*ibwXWHIu+h4vgxx(SmO+mO{jeKVMz&dPi6!Lv-D@TZ`Q
zApoSXAlc)@;cYTu!hAHhV#U^_%bFes3nl3;I=d4F!ZO;CmRQ((CaU(BMkrgJzxWGW
z(dx}e+1e~~vW6cHON2r}BMVHysAYyAscT$Zl5Ni6k>aP>3Ab6Og9W@jf^;y$GxCFR
zap5-0i0uH#@Ru*IQ3qaT))jB5W87+Xj=szB{Ta+e;4<QY+;AY)q_EvADK3M=e*yI-
zCNo%c(94fwP?@}h06`4M@#_$$Q6Nx;M->QuqFh=Vb_<_gE^0byfF_9$L=Mzx_J$-Q
zOex8g#iIgaOYteAf;b28BJg9m26*-b^@MCBWtAH5T><NW315DQ;VsY>TXRdfmCTwM
zH}`(fds<v-2+tln?)x!a`^`(9-F#}aMHFT2GiDFUIJn&q!cYVgGnUVFZ{rmJ`=2x7
zdtVW2il~)|gR6*@VdxLAwzOf0dqv##460pHVM9eVwZf=FNk~s&2vGn_#|YVHoxJj0
z;?_{F^wtw$m|Qk3GwxR2MFyx6Y6*GBog3Fnd9TjMwicQ-+EwNcR?PJ5f>Q!@3Fb=z
zRH9np_d(J46aNsTobWhFm5C|j+nGZIyu+~rYYCHQQ{c{!N7$3aDA0^dqtRT91uQ~e
zj13K<Mg)WuI3Q4d;GqC<Jx7d4-Ix$(e|z_c_;;q%PSCmnTF#n#LmDY`B(mdp^2i@6
z0{<D8G9TPr(B!-dh&MN{6b7tNV85mS1qYNW`fJ*+?<4u!9~@s=)ZQh<1`CMm$*$}A
zD-P3-ii&!|`Yz-nCn;(GQ#g)AM{_Pb{ood#N|8K<4wv`9ePh~h+30HDpLMH7e6Q3O
z7Bpe${Mbs9xr`&4&`R>Vs52lOV-ou!6Wc3-I)^!VQ8tuU3}&uJ4keqkhzUYBGqrTq
z=vV7>U|q(WRtzolShlpSomR;3DON%EmOiOCIrESzQf(=ksMbL%-qGU<nMBMOx+k}H
zhks}E2GwBpPQ1Hvy>VC~+R@g(nLZbDDN3weS{DJ2(0oX9AVYVPC(8!fD!VW7P-xn#
zf=FupS!IKp$oh=x#3M)Q!QykSWbgX3bpNV?54hKa8n~(BjYlzonnA_`#i?{$=4sX(
z*Xqygy%&Jf)Nm1}iuULkA9UKLy&OO<m3gx<#z!Yby~Du++Klrq12Lfr#R4;(vSk(S
zcp|Uaj&4<-S_|PO(!m~#t)ivd9TMV|dp$GG5T<0P__eP3`LzsmvqE=S?&6Dln8;sZ
zUqhgHUS9rJSng9_;00<!{*BBeL!mw09O_bikc&;#^p#;rCLgw~Fvl~J!&S<!;aR)1
z^06LDyf1~-Yi|;oBd*;4<3Yl9bTeKj_%S7!tlz1MV-bfV%q(#q$&Og4ku26EeTIq|
z$BZX<Fwrsf%kvhXWJ6YzGd|z;hsq#EmrWYPjyrA?F^yHoe{T*Y3`tP_gGy(`uKn0Y
zY^ky*KEf~>Z$$BdM+YT~#<|Wy12Vwdg33+1&8d`>lmX$QE-fv6s4?DY+_)xO9hB=V
zc)kt`l)q}7JVuO<s5Rk;rj5+b%1U`==jfboS$iS9)agxSWx<uJPZthgYf#TgYcTor
zRy&~i*I%;fjRghRCjfsPvx@!VOv#NPEw-SU`^xo4;xA<Pvzo4ew#aZ}>Lydf55icZ
z2=jQo-m6DBewJ}8ZY9g;Az_wyOOFRg10n<Jy#sqGsBC;Yi3vQC-%ReAxQ6vX)_yN$
z@82IPK5g-iag3+v#8yrE&bjq&=}@5Lay>Ju+4MdMdId{zGKnnPy9`Me@ez87{lp`J
zt%+gM49Y_0LtegmB`F&O+11j=Z9Jz`whD3mFCA4sIvjIhKriu#q;B<E97Ru1O6-FS
z0EGKq=-i)qYSJ7)Z59eI5jn~R4{k~NNsmATe_36B)+nGQq453jvnZ*~1EHvo`E#E>
zBDlnk9*$QHrTR7MFmkHrxN7=2akUluiD!Jz3f)Ty$hY87p7ENXXL0_`Gg4mY?HaZ2
zen;V&0cm?`+@PP}=Znuh?tWl4j#TGi=JS}nI#KAzR^Ofj2Fz&Z!v{epdE@HUhH&$8
z$eT@#{zE}-!77LM&e1X*(#>s9lZ{4qJRWyMWKEMdE@{{*?vx|`7`zzMqfGni<9#Hf
zW1MjTOjT9$-`?!bE?e0kNoa1{wQI41PAb?O#T2u;Z$g~@&3(Oa$&x+ELEfV)UZ!~L
zTx9r|pDYwSH|dnpsz0U^H6K2ViMG)9`S!uQu`3q!7iY4t^05UELUv&>niRNzzO*X*
zBTErC1qD@|sldujULVxViNPPhYwsJ(u(n3mz2!q8m44F1`q^9#AUt>WB1ru~BF2Q;
zBJJinFBUEj(xkLx)6-vShHJj)0zTK1o|jr`#es&(w&+Pk@`(08l*|N+O*?i}zpsno
zC*!1G@*^SYZcmrtstWfX-)&DuAXgs$>04R2YQ_ww@R2`1NBdi@vr|&snX+?|V*RnB
zW{kSt^Vqdc7Y$!O)*C*-yz=(z*Wh8fx$dv>bKQrY$}Ka?(Hx%Za;j>O*1bVfd#DUp
zJj1%J;?yU*?21GFwbTBm>gUgzSq6<~oE>dcmz;fj*X3w$x0%B|s;B3aeO(itI)cjN
z3Nz)4AN??4M8*14Y)p)+<@qwL^hXY-nrXC9xT<?~!$9Q+ntuKDn`tOreete@4yqz|
z|IC(G*+bKgP#-s9Z)b;!+a>zz?_}z<jKdOTPKO;`N8TX%PMI;oj|!Z?=QQKYIV?hD
zb7Bq9#|vDgu2K{`1=s%2fm>p>r=<34!FL_SrYJ}hSS+{X{<eDy{q8Ni@WkE^>w%Jw
zc{Vqh?P|n?+0=(uXYad?<=Qe1F126@Mb>Cz<MeIe4X^5o3FN#r?}RlwMs5-2fcdqM
zDFan!?`xcKPFXih0d;l_8!a=R{)}Fmn0dZ`$LAeK<Tm~ZL(}1xqi(0Wd3k|U>1&An
z6vPS{=fwStXmthy3)k;=?@mFa*sWi`HdJHeJ!4bTJ7-+#p;%bgqJvh?p7USdkFaoC
ziiw%by!ZN%3>Ae^B=BL)WHYldd=v-|G+gyCOQqhK-_;h@(;?{n%3(zLHW}S;ei=`g
z<;UF^u8`ZHvfka|;*oBTESRBS^x6pXn-l6GEI$C`YuRD|c8^^E-GTmPLnt)>mGI!+
zN0ydlt5F54bC@IRJmLSZ7S%$=@)LG=w?=MQ;5>Bj;7v!5YIF*^q3yMk4T3vRjZIqY
zC>sLc<`?MQy$F5;<PlYW@9ns*({B{4ta3tWNZ^d0y&dIfGZqM2srK#O-R^<Y{rh@&
z=#c?F$3qm5YWcYU;pxT2zM0ztwQaqsi-t41g5|hJLD{f5?Kb;;e@_6s92QHY#;pUX
z_>~v$&e&bwqL(p8aA_EHEE~qt08(1qeez_;Yui^d^3PgCg;^MeyWqDBY{8j<7+2dD
zF;pEAiI`Bca@Yzluh9|HmZ=py85J7pw{!E6ibfjyLPw}7XKMN>_P3d;>puUCgX2k7
zn=^aI_MwIp9#8~ZU&z}k&|KMnI6|u(>ccl+5ZPe)$!^S)uN&2k#VH+S^6K6__K5WZ
zPI>au+TZx-^=q0+a4mFG4CXJ(2*_#@Ib#d@JZTfYFwnZR>ghnV?X5@FJ!|9i_uYwC
zGEW6Q>|n2MJ9|^9BXt^xRseHdjt`1C%XjZ8QWf2O^eBLr0t{yhq_`4rj)*w&^gFD0
z5Sn&?XEdg+LGH62jIvdWLu^f;R5%_J)4JW}MwEf_c2=*Y|AxQ_HZL0#eQ}}AfC0+n
zzyzP~gJvxr)U&6<>;Q!6s_Mg5u8WD$q2K7Pqm#XO$n~E;*NOo&OQ_(qJE^EVPyM1A
zG!E8dsRJs47s-oMx$b@YW-cC;0d>27s>7<VLx&U@F{8X@@~Q*y9X-?4GxcitDkMW{
z)26#j{OB#Bs=x9}IPu;sD%xIDG>=*p;V&QkOy3sA?+rdZEavs+=>zv3c{_RSdnK}^
zRm&A$zkM@b8SrMz)5(~kt$UZB0}Ot;iT$c!>Wv!I@0iJ&x$`O~l$!fnJLRrW*Lv02
zZ_S9qR=*yoaFh0yu-Beda-X*;oHa9;Z~Nskhr)oLj^sn5-DU>%m>3)ODs{ED_oZxO
z8X_DF_UD~Eo9V&Hg39jHsZ+h06NiGe6%E^-oa5Skmn@CsRBYV1F}StzWM!Ywo_>nP
zC$~#5^vm>8&b02dq`d?@gsigK0UD1<jv!_SwmyvD$JW4>Oz1i~>f!PU3N5dW(rjnc
zd#`!q@B=rBzJLFIMYkoY;pn&HPvss>OX~`aZQH5Iv`FP?kt<A<USO5qq)8Lq!GoLb
zbI{mjV(aEMvUI{Hu>UVPzv?EF{#feJUi!J+D*RN}qU6nV-!Hhphz-6JeNGJ|ymW5%
z6c?el9?#yb+-Vw-d!QVLWq&jw^Ls|$q@>j2FK4xfL)ez<cAKt0*UJ$<4W!98sCcGe
zga=;5t`!dCO4G6tyJ`>R+*Y=KW%ieTZt}Hf##heur4e96V%(3Q`JK3aP5q4-^bk93
z?#k~^0FAVDjx9%I48@T8h&7uSz@rx6%x!Pc_rM1K!otE$TecXlSn*+hX>*@7Yh*`S
zcRjsj({eJdUw4>M?*XDY(B*`c@46$h5#=k3PTTZRwX>X_yKn$|tLhWmj=QGv<Uo;(
zSqaF}tgQ&vu&_4<KWFuL3(NgK!`!~C8G$5W2Z+;z;n|4Y>}>I@XiH7NI-k)@06}dB
zHr5xz(TNj5te|qY0}W2qu0uQqwuvd~DE@LjgRBt_4_DV2efPtGZ2LJ*%<3>QW&ZER
zizJ6AwHCd4om$q>IXvjog5LSM9dcDnUe(VyyRBsmjK^D*S=QFuWSa5Cz7J4B^=nRC
zjj)J9B|IQ<)Z>+Av7Kv9?70xR#DCT8IdrF#PTN&4SoGez(Al}6?y!{xonKXbfx!Gr
zTib}j^>qA;CywDC@z%QJ;oj>`QIR@TGIkzUKaFjw5jWtR*)R?JF9$+vR~>7w;kWZt
z$Hw{{n^^4)#e?s~jSdmhv?V3&ZK|fCl1_^Y)W4oZ(nljBo3*@(cC|UWz6LcX>f|&9
z232!}biT~7vB~-Sdc$FzH-2Ja6`o>Ld14f@wYVQ=U@0-4BY-F$g6jp<Xo6R@I=@cW
zwc*2u5A!DuJ==FH4~~A;{l#Y%w+~Yn&;o%77Fe?6VgF%;+)lrU6wuRCEi9Ul9^bxo
zg^R{kwkd)EEc?~s+Z`MTLkG7e6VuF!a4n9o`{vhAA0Z17Ca^_gBNADJ36Sik+Gkc)
zR|gs&6?l{pb<9Sul-06{!ygosvVqlN&JVnK-M9<meXb?07Jd7+<an?M^ClJqe2yD`
zdm%2^%r!g8J`3(92d-ehLB=^!5cdq7lFUCPCMEInh%fk&HvxWP4sOn*EzfGd%IQXY
z%HWFZ7>1l{#+f_+UhUx?TcgzPTFRLNvK(idbP6z{^1Obrx>EoXI^y@RX3ZMllOO=!
zsq6o8$IOHwv<5MAlLEkl>B(j8-P+QVWH8Hh1@kh*`vu{LO*A*e#>QU1SnrhF3`R3)
zpsgdH-~p)HvLV77Y@3deE4uHxW%uYG6sZXAWn+f02jx{&{*>~}A!jze#AV@w&_I%#
zL&ApqA60Jxj&<6G|I=nkMbw}YDs7U`N}IH6Q`4waghz{%s8m`Asi=&Khooqk_9UXs
zmXJqArlhE}kVGUYQoqk_-ueCh|2gJ(kD1;k&-4A>_qCkod7W1@LW2CKYnW^}z#qj=
zKgl2nD(q=2e{*ZvTvGy1AXn=&Wd6+fnFt4j0hWN#X{uoGeZW<KEO<0Y5mmf%TnUiv
z6b2n@Mh}Q+vs~%QF9mqRgv1svA4_zaxI?~ygm{xw?P6~)$Ec6prgB?@t-B*12LG)Q
z*!H-mCd$05YR~W|f0(sc6sB#T-=SY^-)z^d%c-+A8y<BoC@md(GRJ^(LVe)Be}C@b
zF$)q`XT|6JXt*99ANl3ujQV{4E#~@us!N8IcKK~_O4Om^DN?DE-rylaGI{9^X4-L}
zgOQgqCo4*qmZw=a9m`*y_&`X*g)coJnLgVG*vDaF-y<{p!GbhWSSJ-Qwg$Yw4#Ol<
zbwNL{V5!Zk9Iu@)KElJCnwf|VxWD1eW@b*w*Tl%D$c2PkAJe(>^2ftk3CAx?k3@0$
z%Kx`zseq3M`7ggTHj0{v#fNGmr>ry96nicZtG)u6nTi$E)xC{8S&psP$0me87)va=
z<eT}Xj-=<_ShG}^f{3;YtRp$aRAo23uOnUAsP(Vp7GAU+cFz6_cy6trD7*9?cMoZq
zs2@viZrq=1-%wYVMx~N;?+hv7K%}lirwn6`zKF<zvw?qgb5->+>Ci7Mw~lhx5#t}$
z98_Md`<n?GK}chACAPfRU0B|tj3+}nYN)!<=pXIKmnDhND_&UK$jc!s@v6FXLyWuO
zO^1K17N0KzL<hl5{_pXo<S(+>1Ug?K&?F}<GF*p4?|$kF_Fey)0H!ArAYL<bZq9gT
z6?8Fu>fKxSRU{EB%_7>59FSg7FV+l|VYYlMZu^;2r(Wd_O|e48;Z}OnA8w06_wqeg
z{vAz7-0H{Y7cKqgNmqOe(8vRH_28o8VksyxW|akZ06lAYZh=0v6>l+iMgWMIH;fX*
zvWlENwr}D^z&PPW-U&X}?bY2gXF9Y>S}5yAZ$IEM%dk5GU-F`$qilzP7K30QW|LwI
z=p*E43W;%a9gqJ?=!iO7JZS(q=8*I)EP9fkr?5MQVG2G?l5CRB!98df0o9L%hj<)B
zXC;xGd2EwzdWjz@wt**q4Lx*d7X+MP;8&>Pqyx6BoPt!6V@O?TJT&Z%M_St*1=N1=
zqSc%^U5G|VQ#X&ZJb9?fb8qGf&?fycW45L@fBo|1*q4OqZ2N?`ukw3dH~v@s_&KY_
z+OLt+zi@m}X*#spT<hoqd3DRcA;GtGhFl+Kp%?G-MAo-sF@6|zFMqYX{JYXq%%QX&
zt@!<bNp#PdQfYnI5ML=-mPIhQLsI|Hgkr|<?&?}D0yMl|!kVHO@)8YY0I?Bk-7s%8
zY(9Sf`KZzp-P(O|N4Pg#El;Y7G=WRv{UC<DF)$N3$tdRyhQ!)OtT+OC8p=fJf|8OT
zae08=h))wqWN27!_-W)`BbaCX`uXYC>!CZpckG~`kZd^q+yZ54e2kbu*YQ);BpwMz
zJz+w4p)uv!>-#UH|7iiPnP9;$&vzgg2kzhBw)WJRMZ%wL@nW%w0AGh6*kvF8=d%+Z
z6j%odpV$=)*wB$whsn_>mJ4%qz7^;1xZghT?4+}Wn5%T*2QtO;ZjjtH!`vf*Lk17d
zU_wTYxWg>iHp~1>bfOeUs{1qlx_mic@udL9o0;H6Uf+)4CO7L_7cO)ZK0RWFA@<g-
z9$>8H=jTM2*x$N!OIYxLC-$V_vD~?HJcCao2L-a#SpjYxY^scd@Ac!`FwrWo2yaev
zg7#h$(`X&X%hKdSrlu~FFj;{v7fcTPG$T38M6!!QnGhmQ5hLGzZ2uVj$&0c8#s$@v
z0uP6VIt%nM&b8QtOya&M_KAriyh9*NlS-#fNdG&3y4!3dY`#k0$~Jm>K6Q<&N>0S+
znk!02zu~lmN?a^`qsS9s5vz2Vtlk?7?ow}DUY>G*qCrf@Fl&gSBRVso|DJ$=?q9-e
zW>&kvYcG7B?|t6EZQ1?di0fW9zYyBu)uA2^i@M)%E43uw0d)}r)U>p6+MA-^PTF*z
z&J_9=B5B*gwyS_qGvNDNA05~@ru5>bDb^hm+rE0TDN1Gj_qQR<5uFaNrTX%-?&U1`
z)8G2s0?U(g3Hdh7{oa}fnTP`Hu(X$^rZ9b_q15ZXUk!a8LPCe<ynNv(K&5<7OfbV$
zbs<Fv8yAz>Q#J#`x)o7(BBs=&F!QxF6!(9}m)PEVbO0!&=!Gc~C!tispdm#-6F^d|
zfJsOA(+Pipnx|*vt}MJDvKuRY1X+e@k|@;-LfyNmAWG*H?&tq>T)$owCI3^*bx<`5
z4vt6MXZ-lyg3Ay?3cRv1_*lixoy*VniI|UB{MLp+ewKP-?h-hrkFKtqVQvidqd0Ry
zsVUZsfe(uI6E@nQMTgeU!mCy;+WWH--I18u_|ni&_Tb21LAD;`J2_m*Spd!ZieO8}
zB6{FDJUFpfkt0Y@(1=mCT9j`hN1y-%{!i<&qjtA@Ry+2d)-iaijn6_V-aJc6Qg-$R
z8xaWxNP}^v)ZZ_7e|fJa8gJZr+47|^zNz5gRbW%}5SF^CLZC`%0}u{4+Mk)w1C+*&
zSU=jrkNxF16xUiYamV{7Vedj2sih=2U_I@GOm;TQ4^N=bVomDAOwl``U!mFp>ut-H
zWERO%)}=-aU!$t2xiYkV|Ni|Mpt3ZgLC6I_67_K&<!)dc@JbRCbZkiaHA8rXr1n80
zxkRXk7OPmvzk2dun>N2oZ_ps+k?URnL$nug&*8&<Xes$Ke6!v-a2Yb5JTgNL^=?@q
zy%%`T5*0c5jNE-jW__N|0ahX@T}q7p{?#roHz_e8VdABQ&p+DH53Xn44ggkO4wZ+H
z0BsLpIfp;Ngoc}n=Wn`39_7HWtKo|En{IDy_~8CLZpP_rH;2}DK3cHy?UPH_p43Hr
z(2iDr$DD6M;oZCA7@1HSf7E%=<%^27)lJg)%zrv`e-v@bPxX1MlB%D@#58M5MZNl7
zm!!$YPxVdX0UiL$OSBJdNcLWk8kt`qmf=y=09MHz4@#pn57s{W1oN8#15%;2?aj17
zHGD+gqi=baDCz^wJHkUTe}?{B5VZ)n;2cHKaO2Xd(i!wr6!~Z*UZdO_4P`QQtZ>pr
z&!i2c1Sgci6zo3i0h;Z0Jw>}BW_I8f1#H3aRP0}WDTWPQo~S9qVa6XfjvXMGubEZI
zxbIATqo=CKe&E(IcVuSNHKCm)5!a@VXD&~8BMT~)?eLUVI<HDMzs!VH5VJ0<;c{NA
zu7+VBbD@EF47bI8;@Q=Tu048)rN`X9HxSFFwAq)J-Pwm4(>+WxbXwoqx;h_3vNz_$
z*~*6Wc1EO)8h#gYV}|;Xf0qH3_Fnk9h`~ih6-hL^@u*{hp)JiS@Zu7SL?}U$9(6vx
z%yl3XtF8=OJ?4-4abuk&<HlZ`-EUaF%r4Mb;*!h7<T=ZePjRlq)uJcm{zU2|iVx6x
z^T7$x{gGm}MHBcxzz!LLXePk{p{6LGmdJF?hI}*!XR!BL@6s0;K9Qc6v*BQik*V|T
z4(;066Ljbb7+jfvG@TNa1toiN5`e%B+Io?>k!8Xw{=$`WsC?;IsC`FX$ahY(zR~jZ
zW+x_4*MC}$dX26z%dUAY0|bCkD4a~Uu613#Ivw@RdelE(zJ5(-gc@Y7$I<CSLZ(qy
zwBz{#LWwOIK2z2Ziv-?5(=f1oI%vwslg_s{c{hJu%gtI`8g}lB;=!!l1uJ`YSGn#9
z_&4lAnlq^5<@ryFi*tr-7_UZBqWeUmp}N<KszUns#5KrZ%>b=U>kRx3g=hxIXBaPN
z6M0C@I=Yjxq^qT(^a`1#Xiuzt`48v)G8tLRO~x(TxL{%Vg0BK6;0#ae+g0f5pwqkG
zI5Mp-^uasdFzO_<L$QgDnjHv;V12Y<LaI1*ZERuXToNN8rS`iwkKU1SyVzxO0vvFL
z6cStq99D^B(#LYwh(^3wHf=e(`zCXe2vqWNaD5ARyE$SvkhHDQNO<ZApKhow!COGE
zp{3l1j8BZ(zyI(d;>+E<Ixh?Hb6^d-NAJf_AN8XGTc(lii**S(s;2DNo)B9=3bY)c
zVL9l?;_an?JVJXbETOX~vt}YtSW*Haqk|S_y{D%iUFXc1x#81Q)tzS?GXAym!<+HF
z?i-q}v2>QCyhAfbRI^LjA&lyU8>c{PNN7inv`@BkEA)v4<7RR%mee5nOo|%PEYR|y
zW5-<M?ipqn+wHZ2Flv9MZ`Yiw)%1&^{HKL6GM#_6Fj(u`cO#gZsmOo@RE1{s4bK%^
z2{?Env6B|ua9PP@eYM7I^}7y@o_}r^f`b_N?v9_smlgTXeYn}Ax`Ns!X-7-b?bYMy
zzfs?@>0=W2MN}pTVZa4~pFKJ3-vk5V@o6Vnd}P(yK}}6f;_!)Dj6wZjAQfPJ2azcO
z@NHaBYfPtrNyTKaP}p>__?Z4}iOQ)4qJwB-P}*}Hd6~mk`_$HjKgnnppzuCVdgBez
z?m~6AgC<jklMK5%(oS=DfZT|b+KJ=owFRni>QuWiQEurKs~?)JRI*sL?dUTpTs(14
zmulJO+~1|8oh~gjKdF*S5e&@@Xp7FC0lY<bzDC}a&2DJW^|RRi*QeRpH&*6ZUpq9}
z%E}(04h|a{uXNv<yE;h>X3dC<b`6oP((%Kn5T@?flb^Kk0^^b6-|k&=&YcomCawSa
zwR}b(7#50}XR9|wvB!}NYnZs4h01hj{Ae(K<WRQTjku*x<6XLl7KY;Hf_fF`J#r_>
z#%jEW9|RY#N_aN#_vnAB8nA1aLi7iJh{I%8ZqE7dU?m-S0*XpXS_|K87({9Hc0=-G
z`{B{~djKk^oayI$0LI1nUE?=9>|Du3VkZGXRKcM29hl1iW9spw1~B@@(f3|RL{K{d
zD<>zqE}p&S`~XvrpR8I=1p&h_|0)t>AnpA3Ewa_dh2F2i9>_X}{rRUaJv|;M!mp?#
zCWRT|+J=UxzsE6$Pf9;?EVJ^{r*(C^dw1*R57{IT{-h*L>IlSqzJN61u}D-+RNA5u
zhuJ+58{78Ax|hP28|1kY9Scwo_WR66l!<O8)mR0!0SBxtT?1PXD6<z089XKo$kUx^
zfPrH-)xFLndiQQ<Uw|DI)7ZkC7a#|6I+!LNe$ahU<Wasx2F(Vmc9M4f8b`}2np>%L
z2fPFYthZpnE8&L-;&;_4rz>AB_R5umgELoGwnVI(k{C^rc8|Ymn9y72paLJ8Zi*M`
zxoOk0>n#vIRBmk{i6(QB;aZgl_gDd_BL=0fp|%B_%A^vjV+%$Zl^O8Et2(dNNM>LD
z>Xj~h8sn_vJ~k2$aa#mi1S1zV#lE)@(&H=Oh5xkS(Z2<YN7feuBMfVlp|B%rx8-R<
zXsYnJX!IxxOKxvbM|LQP764AXLvxO74j)jz$qpuy&}uZP@OCdu-8CWq-X!8`VsEs~
z0!#i#`S<)ON1a(`UH87A&yVVLJ14;#!7m#e8MaEl`~Kwo4eQnoB|R3fxsaC>;Gi2a
zjXQ=;t_!J%l2_0uFc9iZE9@U^YQGB)-Bp|K-Je9El)Yfy+)&j)rZc-I(up}Tn`ysj
z)55Ufx_(c)ACWHAwC?wN#iP_zxiAg;yEX3?fHfm$miOwa(lfbd@ie4{-1yf|GCzL!
z;7idqsx#kt#YWeXA4w)s*=o72UZ2O@x_R?JvmLsdte)nq1t{@)lkIwU@1HYHQ$VF6
zdu8A!$;0Hxir~P&U1!f$T)(hUy6je5oALbFCE>p_gmBno)+Gnc4xpn*^k6AF3m{(j
z#Ub&qLZuAl%1i1@1p>QeJ!*op&RiWjA&8pm`A1AlD~8epi|rH0OOOyzvqVPPpV_jd
z?I2TgI~4n3j(|GX%@}7VEhuNlhfc$ao;};g{GuW)QN*Lsw;P@;LEZf3ub1XD(iH7U
z8{R2HA5tm8a)|E4Q4q7cH7SV%dIe<b-tgh~z|0lYz3%&mj`#kiicm&DP9Vuio2s>m
zQ9#3jC8YX6&u6Q0SJsUhl9PzQeDjQhFK=xgN>gLl;S5Eya1!K@ml6NLI9Q!z<>`6-
zaYI{2M@P}?0(-|^zpfz;7VI108X!U(L^J{?s22t9$9(pB#0H`r!u-q!6kB|05LW>3
zl{4~JUB;7GVp+Y%LGuS1#w=hF(FR#wTKI;t)pzcIw{6xY3abH1TDOITLO&uXB3KT>
zB_BzE8<fkt!B<5l5Vl2ajz59(6Y41zHRycZ#+RdThmH`do1s)xmHolz*-W7=Y|tRR
zZ9nRAP*xB%R=T?0qt6j5)%hdzxW8a8{C@7Hz+@2Oihx6=5ROXJ4^dxK(#e0Y!W*i;
ziISG5o4m=6as_`6ZQOAL77651M`9#$bfMB_JUHvB=rvZ0P7O#*KnH*yyQo3jMug##
zigt;yA{>EzsVQp~kI*{P`S^lCnOua~$3~@XOl<k_<0`{Dg5(4?gKdD)5e~%g&HX{%
z*LK|XJj+#kk>sePva6-xu>2YQ{k-@4f&?>bl-Lv%s`|OqVVv=tG}-rDrtSzW$d_nF
zI!!zMXjSz(xTPhJrs{_TZ953iPp^7tTHp6oRT)Gi$i*QYO#5|JVv^jjhNTSa`%TGQ
zF$}KnY=gzFva|=qPRf@LI6ZJWrc^$0{F-|!hJ~FTV`k<D3aT|aaXN+q43O=DD-|s)
z!%QOL9wWzTYwtrFL&?<#ngX1RK0s?^;9#)z5$2CSJ`FK+me_p!P-%QpX3f1hHg|v(
z*$+{uA5)n=`4xTo^=l9Pp#Ea&^5wmW;Ag!%zho}#($68B65bzuYMEpm5{9&wFJFFe
z6g(zIJe{cv;UQ81W9#09E%&80vo}tqKjN-?F?Gakl#jCw5+N0ggx5WEJxGDDnHAPg
zYey_O?7YZgD+M&R&X3sl`sUK*mp51dN+TA-J~jSyx#R6>4Ube07o&S<Fbd#XXr5mq
zwr1SLpE??ADxMKpOypDXCqu~6VhNUpdtLV`t*xk&v530DAa%sRv1gO7acl%+Bl{Z9
zULJ?)3RF&LHC{i<r$wwC`s?%15u<kk&EoXj+k0GZTK=~t5|Aqu?m7?!e+yX*OeTiz
zUQ7q!kz_3CzmUrYOcuK%-&lp_iO+-p@l{!w4`~2hu9D~&nX<{Ucqwq{Z&~95aF!6b
zG+$@K+Wiiipnw!ms1Cr_^dJhvQc|cMgO+FE&)gYV39t6za*5(*>(;H9BBncUIp~^2
zcl8w0<f0c>Ab_aZ+_OB^uTKR*5Vn642WYS?qFrjJ(uh#$#Qv&^jPvXxQIY<cJ?ahk
zBDag;-(5Z%%>yQ$zwzKoBGeE$_b}BYtk>3-P2vhq)DOWzZL3Vn-_%EN+o2O%HdzLk
zwr_%kZr{pv&n%Zpk3QJ^=l8VkYb1Eqx*6`uub8u~<o7E@9{uKh_x|bUTLBNhCafGq
zq4}C-HXM{B4L+C^yJ=jy)r;pLpLT!W=x9Js<ND&Z7Q%7iLdzi(a4c1cK$ajknPj3C
znha$o{KOHjY?UefEYF{LrD#d7ZP7uPOD>RntsDlzonx6JL*<QTEx<}&K03d*sn^J*
z6x<~W8>wfg<9m-Zb>W{7z)!`}=K8_$PgdlEVLU~2OLa?H5QehM8U>HOHEGb16^QLc
zHA+lDx)2B+OD|2T(govZeNP%WP_*b^KQS=%gr_NFcg_InMSfk;XTmO|9}@;0Nk^FV
z&sHzfoEI;Sn?Lacb(ei`)j{pZp0sW%8SB%apk>2m4l`ES&C$(3NP|XiA`NU)b8~Xn
zFadt!a>Fn+$4Z_O9b?z_!cxrDwO#HtFtWFjpU57;ZLqJ`H#T0s;A-ZNPh7#m<5G>D
zJ%f;H$K1KoixhhG#*G^{tFylrf(-Drrzb>n=F+&fEK^5qE__1i`mzjPYW41&e?+%e
zRlN=_qtNSpaMA+=WYSeLA((RWh1Z)-LeCFN2nn*s=G<GiYe;wo7u{w~1I}W}M@auI
zdSl0Km*sR)$q?SvgDXJj{fYMs;LQ1JzRXyK@g*F_{eYie(<HJ1D_}~jPn!L%xdZ&T
zB482VjmUPL8tQv@?;gODL7%tyhYC*#B?bM<lC@d1!D1iovpN*+QxY5tn<*mem?uf)
zmI4HVlKUgu`P!dx8a^1%*CY%hX@P;`ZSQ)2znY(18IOT@7jW_YY4`Y|BF~WVX(1Fg
ziva|;a`XbmJQVk;CqwA_#kE3I<M`0p5Hl<C2K70BH|*cAK1C<u<{F2pn#}9dtkCgf
zmpR$4vBUpWgdF}C(DFFb+*s~ob=0}PnR|(am)YZ@RI|99Z+~XrxW8>(-tJBDkGs=J
zQjP*m?xw#Hv}#gP_>#L~N)bJxP!NfMDS-(gw0|A$u25JXac+iXA5+tFvX6z1JeT>C
zmJE<eIUV0Jhwd2GKkNLJD;29Pt}Kf0z&&`=K@V=v2bq|xDH%Apbu9(^3r^|u56uyA
z6a$l9BwjMMz$`Xz-UC&7H8q%aXBh*wDE1Tx&7^b=!mC8-g+>RXm-QkYU@ca1N{2P*
zq7xP5OhUqujwMVtuVAV;qs`PdK62v1FhiEKp^K;wKOSM8rW@`q=N~%;H#X)xOLC9R
zw7LI8wxAiG>-%Cup7s0iR-dn(Ee-BBmN4PU43&A!rWJ9TAa7YDf`Y=r-9!nBBOeD%
znjT)dM~TfNkA~3-$J?!6zU5svQD}(#j@kz$8jT%o8HK%tTb2-@!n9U|KetZowT^2D
z7<c{Y0iXUp+2Bblh=c+>=fC4Ri+EmMe!qICfk6ktZuk5g&@G089)?#`6D}YxdH(9a
zg?5Wnu>(z*%Ef06$?ne9ZUDH#+*E{L)cpIK=|7Xaznf6;gAF2$3K%``%Ghf%Lp?uR
z%YG82xS$6zfdhS_V{|aimzf<&JXSp}Y0JRh>n}by*L8npN#9?44-XnX#6dFVG7WBK
zU!NSEqQl1~95)`?-y&qKB<5|cZQrg+1R&LJ-P%*T@qCbyiUdX4oD7!6TuvQBbiJ6Y
zC+gkGY$5$zu9huWWWSg$^>af*CU(T)zvM5ilyL{xZ~>f5Efq1uVG5~^VNZd6a3Z*~
zBPD0vTJ||IBg-&)^=!lLbGG@sIqSpqWfyvQ)!>MA<_kx>_HJCdYCMSTn<wX^hg#{Y
z!GW99#8ej!FRbRHrZIlDWT;1C(2bSe%=O~D=a0&ggD>(FA*y@PI*3;B+1IQQ8_9?n
zE_d!33MrqeWwvLy#HY%0jP`q7q<llA3Z?`34kr1es09-9Z>~%t{ysBuuP)tipFft)
z1<5~s+F4eu0CrtmtaNE%N1q9Dv{`PY_j!uE)6~$o2BvUWMMEutRDs09G><QyNyoyz
zMa|^sbC@(eA}NSHH*S>o`g4HsA8)cK6<Qgz*)J%}#JrOb-_t*oaYPsHmX^zVgiJG&
zyi3^|V-jEghrPY7aAp-U(`EA3BMb14NXyAtm3BBHYAvm&I31!m1N#HCb%6Ub{X9_!
z6&tDbfOA+VIe;#iNth0h<*{Qd`ESIoXw;frkk+vPtvg55!)C>O@39Lk6}37#yL2_t
zc5X!%Km8^9m`>;a+uX<tHs{2cEZPAo&x_|au%dO4^+s@(Yu7f&;sg^ypCIRDQa7G-
zGGnh2?~khAcUoUdszerkKCWPelL(1Fnp1w63x8LrybAM|d*(CuaOqM)L{!c-LlEq_
ztj~RwbFXD@-yXKli?H;cM3|p~{6YK~MN4#5?OBrW6a=<$*(Og<+u$DHSFynUsj~L=
zBBHG+F^LxnxJ5??4>wnD+qb2qCF4m<q!AbcqN+twCG=`^*U5|8f$gP=MgYJYs;>{c
z8N(G(b#HbQ8J=o9ruzAda13Esx>YPwUM_}~QASo=Gg@?XGHtyeyHITkMJ3Hp^yA?^
zdD6Qw8|fm;Y+LIV&zrMVO^#K@gWjw>v73W;+e#|!*fu)9!oaMDO2B9($(Yfj-5u3e
zCv25j&!``+)TzVnOm|ya9bretU_CgnkmHilNhMR)UI3M4<q)HQa$eU;LK2*}+~*dt
zAl{dv5iMK{pgPWAu;N*NO@B?~7b-$1st%l1AaL0;tIyyo;87E=6lIXU;p^uQiZx%#
zVZhi)Tywh5DVg?QR6-)hOW>p<LCSvjd3rC$i-`q~9{D2Pe+`>UCj3@+aHs2V+ttcC
z%I(@9uDURDgY@+v!V8EBXc^lunom6}ezs$vX?>-cn{?6qS&qR|OdS<2UA%aY5Q_G}
ztWPt2ZZGIvGl^E(Ic-hNRK(plChP-q63{FdBcr!ZX^sKt?Bg`_wA{|TtQWWiQi}M`
z$JH&5k0Y=H8c)JcL|h=@F99MabS47$1o;+Jd*i2<Qz7R?C&X18ox96qyR`b#r?a~4
z^_(oNi<EClR4rRWL-}~ZnS-WOI{LR3Wl(uYGI==;B(|>?-LQJm?Y<wOCGEA)?xCMD
z>cQvh%tsK$gQ$?ioVsYV5Ov(Al~!hcHqc7{s^NYbFbhxtngc72F-RX%a92@>3>+T@
zF2{zj6DTYYLJ5043yWDZ*OdOJ1+V~e@2sGpK)^+b_!^u~pw(U$I*ez>;k2-*$4-W=
zTcr#95E?omhmY5M8nqU|DcH7HbR-%+O-)6hB2Xopb7C7IOUruDhKkh{!1u}5?x-Va
zNDg`iU@3Zib{<sqXVH~F{`iqdA$AcBH+_21iMj`vZFg(yc7m;^s0u`!<$~jwLR-1I
zuBEcNdNOiYu;`=-aiO7w*Ow1|AoJcjYe0C;wU0qZrE6hRz@2%~WBN1$<WhJaPp~hC
zixb?T+#E>=4817rg^3S{tILn@my-d>Oq-5_+2}D}<%=X+OE}VyMPuQG|G2U&6Mlwi
zmtY>J7~y83#Sdm}*^ckdNPO~EZE>G3Ek@Tg+T1|T$$^;a7bJOLbHd(WQ6L@(oOn5V
zH0hpE{ok7J6ZE>y!I2ft60$Qh)v)WLXh0Gx;0_eE|0MFt8JV(s;*8)^2qFX^M9>$~
zI^GKia=>rPz10w!OhE|17NUJ@0p@e0!J{GBf>aVO@v>cVo4r=P;yi<M7Y<VYP*Jf*
z#9|SZXb(j53z%n5S%E{{6gUW)9QV4{lNtDA3R;1XlLZreroQWea{{?Vc7R&qw^7Fi
zmuvY$f{2Siaf*s97?NJ-Y3+?q+`OqRwgph#iwko2u!bNGZL&6O+I0WVelz9d#7I9g
zM1)5Ugc^MSrXr%@pfE=la-TWQKo~hpnUFko<S_`nEO-#A9CuXeqRjzTqbulsaMiMy
zrxetr<3g{5yUtA?lW*N(!n3p7tk>HYTi=v@v?e*EBRo4u_S7m~&eX+4=czOLcBSC<
zSguLYB`9fryisO!&`<`wXnm*P9~bm-9onbEYBLq3?)22}D=U4F@-PSu7^8P&Mgb!i
zR883U2z1F~;j$<9zBnfWA&*(<cgVe9qwk@z0j8mbPB7p12sZ@h+CAfc$RlNZa@I;w
z+P=?jshPXOj@NZwcFBdw$`bU<gn@}jQ5oC;*O<tF8*rLi-484}4^g<HY$m3bG1M(5
zC%T(oEh)eDa12F%ij@bVZ-LcO^ow&aXWaL8$T7f|`E}$wye45~__L)M6Hbdu73sz*
z5bJe#N~FPLA%#!!sOXGLf0%0zFb09wkfY#}ZcHyOO16Iyz*DM4#vsI)sJ2qYxOw}B
zlqw`_3eZCUTB6Xju<*9Y`i%Y#DMSdZ08Y|^O8(=rolq0d1&QUPVy7&Gh6or`nnd>>
zZJE@5WAf*$v_D{n6ZQiH?m>lm%}dtQw3WypFDI_NI=kWjSTpLVb6G{j$Uf5JPG#W>
zq{UKgP2tmJWpyv;<C?_LW5&GhKmNNC;ts_Cht!JtbCZU(r_~%Wy`aC}p&X=I3|0!y
zFT6oUeoF?@0oyn^chs6Hv!F~385-Qv#Oiy4W^P|Vto~f>GU-ZDp^LBKC=ji0Se>*1
zaA&n+G;zUus#GD9k_9f*-k|o>p9`oDMK*&`Cvn;?Z8Ff;Zv%JkXKgSE$0G0`s!LdR
zJOMJ+R$#AyAAn9-Ss6IyeK_{N3k%V^DzrlEuaNuF+!2|IS8uc=F^YY9)E0sb2pi}U
zjF<+rK@eylxYWMfU7^zSkzpGx!Wp+B_RmG&_^NR7r)>l|A%+yjcF!;-z)rxBOAha_
zO|p6_fat<ORN)5z9~c@!usj*~ZPUl6@(8NLn8d%Rs>rnN?!xq64BqdbI1z9_IQbA_
z!2gIDI$t+~5QzqrNd3~gz7sK-r=RR0cid^t-7k-33aJb@1w6YvQ_=lyDYa|Eu@$Az
zz=2A5mA;yxd+?u^FApM-LNW#KD&S9ALetMTrA_r%eQC(NA6TEK<_el&L4{N3;}Gs{
z{?a4ww#=|odRNQ)mZ3F@T6t!p**P#*Vt{;nOOAJXr{IYXmnRM|?PxL3^e(Wf&`p5K
zCT*!V5bOUY@9!S`Y+>K7LYgd=DKXB1`cw$fkQvT%>_uXOJ3|;Z2^s`Z9Aiyg#zl95
zE*3<;`CbD)JKc8Z)^RkDd|31tlL(&TcRqjKF~C8v2ej@NCT`~q^tA9&<v8?g9EC)L
zZ=6cEDf#llWd<ep3oNGRrHltprW4vDSj)$&A;dv}kpPr_XdFv^|MAgfe2;oBWO%It
zBQVPt#Bq?1Vx*|MsweOo6@47bp6NJGUb@r?QVHCI$cfrYaAl2SeuuFIi<=l*K;BiR
zO^vR57lB#^HB`f=XnpmEvST?UELB>(Q4hrY5kn(LAt~=)7DPxp?im<v7M5mB9goxj
z)7IC7)GThj;3OCx%w4@c_i))LHAY4ZLMIx8=Dl4tCy><mcEk5$y*J+topD-y$1j6P
zx~i%4G9UnsZdt9fBxkS7B4Mi`(tsV67)U}r#4Zcx^JiU;(5!)_0<yb!zjWI1dQo#m
z_Ha{|){a!w(%^td&pBdZDud7GgtXKNB+eiAxUf!Eir;I7b&=Rei@)butRGd>tvBr%
z2iu3aJdv9OgavDVtXK4*z4nW`i;@VJKOlaASPSL_9L{uyAwzwi23;6iO~`lu@y%V7
zeJj_ljjD@Ny0Cd_p$~lgF6mO!P#&3{y2~Kb3kxq)F0)pQdvJKW<iKSuKb7UCp`i08
zF(eGIH4MOSRNmb3ZU+5xnKV~s{VHd1C<n-kKUGN3kuP@?Ig_m)lcx3kbF_IE%lnT=
zs)BC-zPryzkuVw&b;g4@YxLhgX=XB0TArt1G+2fT$ebBe7tdd`b!Qe$fR-}MwYh1e
z=cl@&&td6wA@Ym*9wGG@{eD`uwHP@BY;mY6)b*2@)Hka;ON>-Jddb?!p&=GX3jLc<
z$_eEeD4ft~9lkTi)eYqfrGIPYhQ|mA@d2pKTl{^+1KF{iyeF2S5yrCmO<g?Le=K7~
z8)!d)5SeBdmr`{_k25kE<>QiQWj!aK@XDc`{0yDS7v<t9cC8hQQlz!_@sUSG1e!K`
z+j4{XIWjZnO^;4RB|8#lGmM3JPFNtaibd3nb9OeVgBetOu~5{S&PWohv(O_TMZ3D;
z-HhA+MxJ8{jy96=!mJMqdKU&4UYC?Zx3mJpsy$A~pen7x+l<i2?&~yazQmw*?YTt{
zmWhUaDO~}j*dAQHgdiT0mqz*Y_F0?rWMI`)j(~Y^YyhaUD=o6`73+s9SqvN*84-b(
z`EI1m3{IWI+mN#4ORrr|gBD(x%;IS3X>6aih=7haRm5XQ8{_&)Q@R$~{iDvSqKr65
z_ojf-%L&Pi!GX)&{Je9$?GHZ56W4G)C~&RFm%#pDt6nGrnTa{8F*o92gw!zi_4|TY
z=)?~AH`TE$T<zDFIlrvbH%p|Ke5G;#OF?+HtiU*me)YoDX@QrfowgAl1n5g1sn7)7
z@P;*qCg`3R9#El0G(UB!FPs8EX8yO0IjfC5oTSBcb=n%dMoU+|^#*5&IO|%u*+V;e
z=IP$s(E5f=mJ)76hSeFnwcSx?VTyl|vo^(78d^LR{27|dhyER;ntIT`%@U8)Cu`j6
zy>1V5X`^mZs2L!wH=Hh@9=u(%QhrvY_&P75D$4tICwsM5$zkcLrkJw}235}NEe=!L
zGxieV?Ij;e#khSPE-A*zyz{L8l`2hV2ga*Gl|uN#fYJPC`JN`GusNbUmYaZ^2@J+x
z%SS1!?B!<Zy8nUfp@4vZ-LnYFfN3M%G~dvcnU#)W=8^c=icjc`&<~j=XOY~$OXtpy
zTEbOL3~o+QHR<^;b}<A&1W0fD3}Zn%Q5+3FeK-93smD1NrRNGO*W=Pv^WrG$G2b~?
z1P;lSb^UDSxTrg_)0mhxR2GTjdYSCG_@dO>Sps#zJ7z%W{r|oquyqf=s1Mf5Mb49y
zssD$T%wjO%b(=$oY@%#-tvN51Qa2}Utm<xi)E^iGNB;(e3PlciT&LIq#XHNoa3)*t
zU1#Ye^+BdpA3iAKIT(MNi9ppJuz)2WpK<MO_Extvyk@q3a#2T1W5Y@Q#8*{I$up9Y
zP)^8+2_`{A-Z^-#gXvJaq|}B+5mC8{%94j_Jb12m!|LjfA6J$>Ju6E}V>PuL$r2@7
z=UZ_Ahk8zKLYYw|*NQ`T*S@XqWJaFpSd4VF_SVam&H|pgzWCCmIeS!Bq!I0eN{Ued
zx^yH`bDEaL7F8vG+M+(h6ojG{#UO+N+#Z$c{w80KY>l~$Hx4PZYQw-fldG(wvJP+%
zq{-{oqHg2sYfli9kmB(AI%uLzP}KUZp-4&VA;*UO&e3YuPKagUnL_m_#`~GUb<)Xw
z@Zfca$N9%bn^yX%)>rq}mdSE+Gq_2q*9>XHL_z!aR;foI3{+evzA&EMQt6zNN=Kv{
z-QC5#Vhhcpi~QKYPT{^%5r^XfB(K$1X#IO~){>BgQcW`;4dbw2_7zXxzRtAEzFF60
z7q{#!7U*@?y-pkY&pIRC#?yd-gCkJ1k0ZoW)Du~qC0YSK{0MpAG`)Krte0C444<y3
zwSS<!<IUNBH~1bZSR;9q+J}OuSk&)4E=pRV!8*2Jyz8|!64tsM8^2SBK7ndS*yK1n
zKOENjD)}YsrF_MAxg=fnK=!e+@}#_4>rVm04HF6F%}K*qmUk!r3u&aog)yksUQQ%Z
zpbLW^w)H2TT0M~MU_G9ogKdva=3G(&pZhg3%lfpDlAcE{&%;?#5zPd#;P5dS2O~1Q
zyOZP^XR8w6Aj+|iA7##xN5731=ci_#fV`~Y3}+T8S<1q%NT;V+U;7)@v<I*vX`FJ1
z%Ve3Y38g8BmewhIrXa=qF%Cr51F}a+W!C6_@Pg#<AfSGQm*?3DXF36UEY1xlAqW%{
z^&6q*Cz~AQ5l|V>e@VXlbQ&IB32heOz<BE@*F0-&O%VypkDrrWoDv&XC*_|9*MBs%
z%)TKTFOE*fJf5;^)qGt)>vU+{*2xpd_DT4qz?D%QXKgRZwlBBcev{=C$Hs2!&I3ov
zCHzIY#*bjjhryBUw4PdLOyUnqNB<IbFzDFDGz!Ej&A*FhQ2K|chZ?efR+>?rpKK*_
zRC~mb(Rfm;={>2F?e-oxa32$@)Hijk!%kKjqHglJxztN4rOuclCYAdaKUW_(bEZV6
zHCFr&KB@fwJ#(qAl;%SWq+4q)9jf^650YIX@lPbhfhR5B@%>#=V^m0$(SdHqa0Im5
zy9jn2qQb_xbB~ku26~Ci-D(D$4El2R(BsV|^qb7-3^rwwMqG>P@CPzX&u{*wZh7CT
zNYOH&2Z1tL9JKg>W5@PF3`N^Uxd!*9>Cqgrv-e=rdMJ0Hcd<Phb)@SKr`MEW;^V8$
z|Lztw+vSa^UOBxMJ!4r_RmV763!?d$$9uBgP?SQKsiLrAV)X3UJ(AX-G{T;<T~l*r
zqOE1%WKpIZmF_g6UE6+^U|~{qmqlT3SWJ5#B&mR5cj`ym(oIg1a@#Z+nm7PSa+&bX
zXxcFX*wyc8=V;--fUY)iT(=c$T<#)_+W7NY0VbW{5~Ir?T$D<{9{fG9wRntE+iR6R
z%P?l!Fg>T3_gg0_9uYI!CXHxykhXU~1&1tS*r%}~^ig&p@Hy%%9Hl8#=Swq3OJ3YK
zBOdc4bRReabrg7((7zytb#mJ+<qNl>)ZGQtdv1Z5F3;BQ>8=HX+RKSDKWML{Lfzz9
zwn_ZIrUTD;cmjb7Z~z$cS5h$bx77P%ock#u{9%9J;9)a;WXUdO+R2S5PyC)9s2nq$
zaGQ(ShI}e=J67G@{U%=D2zx@Tc*kq3)3jTYWoyy(RNMye6Zh|*of&jrQtIYpT-cRT
zfpzUXP1H>&=|T<Sig#gi{bJ~19?<iX%qtX<Q07a7X%;}*(o_FsE;>xfBV9v1Cku_g
z`@UULzWdI<-{KGI{o~ueE~1s<ay|Lbf=9W>_K|DStWjsOR>NbFl+sK?c3qy(&!?2N
zJuLn1dCktR<(JC?nOS>Nuu#yrwlr{qoF|m4<X*Eo`|!uh${!D(y_7u9c0BN7%f*!7
zhI((8pFi$5=Rf;e==>sQWQ)b)ooAahmzZ4A$~0C9J8c~KtKiOx^6)3JQ-6N)I}{gm
z`xil3l`m!J3YaKpBh145aCErvuD_CAeIa%se#o_iKq{Rfk+%C^hENkU#)Df5FdK`4
zanT2HZ_C?X=YKY2V0a}xX2>+F?aG#}ynyK0>dKNfa~{a>=D@fovdcbO<q5&vu0~|1
zm?q?THx(^xKAz)Yq8(ha(=a#P-guXD@$_GAw^Q;aN9t@bniA{&G#P>Uh3DZ93m$Ds
z{AqC@>q|uB!{=Y>vwz8cNmkveysD8J|MklEP3pyx+uNnT9(o!je*P7=xuRh05f|;$
z=nG4azTJM~UFiWw7xy+wPIao*>PHW$S}GZ@t(@Wz@YOR)rXg88W8jeL>;AJ>4g7nc
zhH_uk#Yg=v^&O(8H2v7Uhp+Fs_k6zh&F~j%7u~A)5t4k~X=dWP?4yR`Q#?HTmTueF
zF#YI|@8h<<oI{6c6n1cn%>a!zMv>Vgj^|oLo90)f7Hu)UA(iE?yuEhUt6!xJjm!Eh
z9jP76U}DXP_~UEW1_oMextrzJT+ueB^4^Gev(P)34h!jucn7qdCbGT}4hZlB`@u7I
zSh-RWSwd{0R<UH$TlcnMxnhK^N6df~+zVS6g_Kb}^=-ta5P3ZBb^QleX-Xc>c4P^v
zR7y3NiTtSaS8dcN_cp5&p0)VOpD{XAf8(L)(~!>J4%n4%>6}{ed9SN_c8hgIvWsH*
zC-2)ELzjNqqMdBL)o7^8or;b2uN|i7R<#U?T<sXVS^r?>%bVx4FQhc9cy2k-IN0Ce
zr19s3G2NQGJpS(4VpLsvtEehEF0RdrfgiuvjoWZ*?~D5$)Cjw5&V3mZHv4$)*rB=a
zKF_;Rcl(mfxt@)m)*JtNansI-w=H&N01dKrZ!!=T61IZp&-XKGekkCLmB#UQ6YUR8
zH*1q&9g!p4K2T-=<^J5>KjP$*E6WcGq)rS+;;egl-o#TCzw(tc-%m<Fq99~%ay+o-
zZnuIbN))F%NW8r*&<?=ubp4>GmN&*IEO-0G0}kaMO(d;P(BIMr{3v5ZTx!|+S4Am5
z6U?(!zUKU4JAw_h)0_`nsn1QWUQjdma7CZ6EgjnSxpCxtxy8Dw<n33l9PG9*C_z^A
z^Zu<nS9YrGPPV?;JK3&g_?g_L2HOLBec$$)J+W0k6ScPVe$#%|arwe*XUEd$l)B01
zbHD3nyxXCqe<)gdrHH?Mt~|fI<VI&okeaNXLA$&z)_Uvb-oG`jc<m|YKD(#y?7#E*
z_icIDF7}3N{U&o6w;O4{aI2w6ICE<z4@qCU<F6^IOJ9FKUj6--ike!1XHw;v$XYQx
zESB=LGEm`_7xF$b<ra)!2V)Vj)h@cua+<O!E{4sv$$1idq?H@~4wQ5{mn4^D=3;Pb
zp-hKEg=eh5p%B8Z$3kdS)NIiCTT2wS1`O_#-%UC5uiD9D{;tlcm#fK6&G=(~N=Vq)
zIX|CuXzEygf6T64K~MY&-D`%P>gd#B+c@l68}#0aNCRtfa<;e6Q(sg*^zY9JhKHWF
zGwGb)Wq)X?%ZY;-^$p`cZ%!*3v%L4*sdwE@x6J7~P3cL_@|h*}QKu@_>gUhX&374f
zx#Qc!vJsnh7U2Y2TGX~v`<5X+wBF8qeCuxd&yWhAlKVx@J2`V7J;yl27aMHXu4*h}
zc4z&VC|gXJg;*4E;N~L()I$2}Jk|f4SHF4p5vi4xSt&HUFYvz*d;r)%PJUTD{>d@v
zrdz!rBR2ox{7`DTC;Gc(wyDf4GVkj1>Z^+{shI9P6Jpl#=ND(H{T?6m3e2=jW$nZ&
zC~S`%8Km@Lj~klTYBy8!kmP?XuFSmD_2VXU{&pP^S>zm<*WtqB598z=Z=C40rPL!b
zL?OMc(za92z4rNCE!$vuSlSeRc1=oQ^26otO&99EUDxYX>86>=mrt*|9%MIi!}v!x
zl=^S`m1aA#AZvKR#*^N5#~mXp9UFgZn&nxXof|S+WAc?o%jNwis@4kO%RiI)#TbwL
zT;De3+XNAa;xB=wG8D@fb$TaXpkj~^!Q{(eVIO|wmq*rlnOK1m$0|l(hmuXt*T#;k
zD)_QtOjM0y@xlv_x4Ri@T8=e~S(H(>KGp3;euQIy>2$ybreBoL7Vb5#NotFlX5#jj
z+9PF$;cr*)@@Jnd{FJsPEVuow($0G_hLug2_+ed71p7x7-euh^byc2cR59#eW73(b
zVRMSsIGx}5&`M)VyNSD)ZMrdf?|=cC_FcC)hEF#wW-_c(e6b3CJMk@j)$&vj|Mc1M
zIW+neBh2ZW@7DLaGDAQAfpz|9)1^kgi4R0fK<+CxJSx;ol1bvzR>ZM&`_`42?MGXg
zjXgEA8+q%m#~<c)*<&FF;?fj)`<1#0gL32r-Ttv~Om`?}Q<U~~^M3f)@mECW<I9zu
z_$af$UH{rM!D`}FY@qm-!iD$gooyr8g7v21Mc+CVQ?QxZu#_K}r7Mn3l)Rr_?R<Kl
z@i~*wz%vK`S%FTHN%qtgaeiB;na%oUG!1CSN<-eFZI7Mv%~bQI9DmtC5{}IE<TVgh
zMc-ZMk_)TuWG{~_MlVM(sQ}Fas%wAavazneG-klfv-=&wr<>h5H6osZsNVUZiTY((
zU-5xsq&^C_0)M8(=UptUbjrqeRZr~FvH1GbQH0Xg8QF2;bas6*E?f~OoSACQ%SL;Y
z*VZr9G?rSw|MW@hauDNp>qoV^l8Nzxa|_T!)p%51BjDQ^HI8Xakkuax{NXVs>T0no
z3KcMrlRBMe^$Fl|@t2xa)0t6fOHDRBw9Ze?VRy)xTwGHGA<kIz?YjFtzNP<m7mLgC
z_`HwU&$_T5T*X)S)zKwx4u`Z1V3nz@n8gs3CFTN;9=&!m9~^aR<0BW<%947s4N}jg
zMq#%q+AKJ?peEF1?ihYy5rf4>Va;QwJNBqLHFp5RC-B4k@15C?OcE{5%>DCxO#6xU
z;aU=i8C-Q$IJQ*0CsH;K`}d*klRLgvviE)OQ7#5*YX18p-EJx|G0}>IGko5C(2>;(
zoX=#GrD9gAyH6jR759uzBaJPT)sfC;KX^d9w0m1ITj}S1nxs5pdz8f_o8?vam90;2
zKe$<2E4}HaQTyY4Ce6;T>eF_?@`DZ+8~TLJdi-m&<JgyJ<ANLOJHFjIYJIYuh%?2m
zgqmd)KaYODIW8o$VlC4n_9jyMI9p9gfRwo`vF>_);?Dm)Xe*7l|9xVlmakHNHK)|E
z!15#{8u04+Q9WxeYoq01u|_I;3_5Z(5O*>yJLq1l!E`7d5d{(Nq`Td?RZ$Z<`_ab8
zyb+a8zQznluk;Mq*ne-F<(z!yM~TP#FVFokq<6CG`QC-u&BwCe<+zP}k?6Q~sDI>f
zJ&wfs<zYt6=dbljj<Pe_d}D3HZ_{QJoi0kO81cwuO@3%$WzPgx{i?=AofrJ41@MmI
z1tUqLS3g;>QdhAJ?L03F3NyW_(a-mw11~{MZJwWMyR@nkO3(AU2D_Q&zU-*4uY81X
zyZJ&f99eHfyUP^b;AYDtavK5-2_ETPz^rcF<*ARpu{m}hK#RIjSjU(FTiGZTHT!J-
zA&;1%sYdk)JFcCz_l{d*(C%`Go6{kSh5f2+pUc;7ZRwyrrbCs-M(~pI{OE&lWVVLg
zZ7(ehj8Qx?qdgc8@*`3z3M4_zlgH`8xEn&AiaDLHNoVDCWC_lDpuXG^S=aC7T9O$G
z68_cF8GKvN&R`gI!Rx#;F63S0G%*bFTaulTFoI*_O>|v)AVdZIJLfBl9x!$E=g(GQ
zv&uhvRT*?WJfl8nW^|O~QL>Bb$QNIIb(02bOifIR%1%z%_~ufh-EZ&jwk<yE((g=4
zsI@_TQ}MQi4hOuK2H6?>zIy9jiN$hr^|90I+i!-3CpsI1DrID3a6n?88GtvYQxfI$
zkG4WNQ*)<r8FR|#em0ynPBjh{--U0}y+U?M`jL%k01Ug+>?f>RHE{6tK>buOvV&sR
zh`_|5<Zh?SP!Tc1)JDz?8@CQp({NqIn)L%qtj&5hZ2Z%$_r;BAFJ9VKQ3TGzl(ekO
z0j{BJWXgBJ=hepcZc&7-P+z^rT)HVa>{fnm=_jl6!?*u#cVOf8i)(+h*xfhZ5>Q|>
zVD*jX4kIG@dzgiV3dtcdul(m|gm6yI&a-kpDf##QFAVuA+zy=5nnt33e(B$H^oDl$
zQD-_H9NQsbaWAUG0_Vk0o`O|fx-=Mrf&2@z9LZ}|^Zabi-5YYFBz>sy--i_|PHY_X
zn9($p{@6*}XLsHK<}Ji_Yr@{Qvpbv4&h0Yvl*RLh0qXfj@7wU!;w{Ga$kQonRBp{7
zF(+P??6P|Ca)FD;BCzh`e$-A9YnQJqjZwVD!f(R8IMADJmEO4x=azapdH8RJwy8tU
ztF<q#tu+?Ij|q##{s$^LM7RN4ZQt(srvvA*4>TK)4#Ol3W8yMDeoXuKg$-ykP2m<9
zG}s{F?-e4cl-ZN*IjCaURfqkVCaWPNt1_<5*tgPv!F%C1h}n4{ydcWMG}jTaST+AM
zsZWg3t6Td2UPSv7)8F=ZF6*i`Z<JV1D&hve8({B6ReU7R1SOT0q-%RA`>B1~ceIZ$
z7Bn^22h@us8?beym6^QvAFb(0KPJ%NcgdKj*#T0sSVVqlh_*4*t~S`KF9RKCrzuUq
zs|0Vx5&_J+6#Nya9|lFjuqi@gu?>YWe?7DV5SJ(>lp1OSCAUG{g|tJ!65z+XfA(85
zd2S<i;ryjnR;4>~u-RUtkaD;u!o)lcEtP_x_0(!&B8oQ3-{zc3!}`Dhfy6~Z5#xp>
z(Z%g1rk<G<`AuiC*<_#<?jz^53#0`DSHj2%`RpVC@$Z>x7K?Ff*~vXRzElzHvzdzh
zeY=gn{tE+sx{4~s17lP=cgA}SiTd&<J5D{Ye%1Su<lTnve(M-2>2l$*m|($_Cs(`U
zJYBb&m1pL5@d-Q=h|?$3{`~o&%`-=Az4Kye+;pq7-%v*XuWUu2CiF^Z>ZnUF9gCUU
z<>1OG@wSFoChRJ8+f(e?x#_oQ-ygOX8dg>gXQ?5?;_iQS^<|!8`rjFvzcZ%4>AG@m
zaFEOKEgcxN*T&!1H!Uq~`ayrjtmII4p<QXWCqrHiV9@r?){PUEC+fwdqTg_2^RXxv
zuo=Q9mjs1VWA_gy)e1)0N;z`kFNd?x`{JcA*L^wZwXz*rW|l>C*4UT&E4blz>F9a0
z&#DVdUSLW8t31Dzk4#b8ou*nlinwF(jzLef`^0d<)wgFwckdgAu2eqvcr^w*JW#Q7
zBZ(#7W$Q<2O1kFBUHrev+@sBmqT&rlCD-UC`ArJ&uU?qR*Rf^@3BhKY=FdM`6wPF2
zQfuf|PJkb6(UjQakBK+<+;X*yRq7!uLr%b~B@+J%Z>iP%X{zlQ_!8<8#_PnI5z?19
z_4ovv)Fi#>T(YrfDd(wEQ?u6K|78Kiw$}SJQ0a>uBZBD!7kYzCMI;O)ifvvLy;~r`
zb3!;BcHYyLv}JqOdo=4A+v>EP*SCrpPiE&>Z2(w{p?ay1H!wv!F09gX3^96cm*@ef
zM4Y|NayQ=1@ZHuit5}I7JOQb*phfhD3<-8Hn5T*}*DJI^dRFtSm|UZc6n}jVR!AN2
zNX)at)C*%#@#=w!4Q;Q=nJluO6=1P*<gwW<6CBD>7YW@Mx6J#){Aj^e^8KceJp?_>
zR4+xEZdmS}hueqLTQsVS+(=@NiNRT}^vK`Z!S}1KRU}t9Gcdhc>Unv=ph+!eYTrFs
z3H1hJk})4QZA<;M&PpSRD}e97D@yNlew@(J6RI7MsS)2!>gW^%oi133_}|>QnJ6j_
z7qxSS_Cgn%Q%E?I$^v>^#h^?KJPLNe+F*Wa{qM_W3*lQYrW)~O*pKR8mr>Fa%?A5C
zM?@kj?IaF_o?@Ge3kMr^?|5<Hg(BzunQh1=8<#%2?{*`DzgH)$0MDf<?7qLfBwkAr
zUwr41NN5XgS`;{MNG|E+xs~a<Bl+(hjX@H+y2tVxAxBy4cO_TFW|ajSm83_T-ADCY
z*8f)NxDVGVu1$C`c<jzSYez&j-}GFB6(i!$i}%i4<<-txu%KOhaS(DO4t5NXH#5aZ
zSvwKkvb+4>S}&1xLF)@=jXa5)4Am(T{f?t#$37mDazBLLfN**FZj+b|JpA%$f9;dp
z_~lgE!jX`wh5f%O(Jdcc$~GkH?u*H<-)j^)=-laCE>`F`=wTZEaBBk&%>m`bl-Kza
z{;BuAxbbdcoyzZ1I~@DzJ7;Gus^yoVSQ7&9Bv{|PNOgU=`S~8t(Z{BZ|21JYJ;`$O
zN6z^kl?{~WSGIp!Jn}_T+}Dca7j%u4o;NcxLc+FRs<#`lA*$4^F?rduDK|fb^q1Qa
zHplOJb#}|PWpifFo`SjzO!~~*jg}f%XZmn^L6C&p8}e$bz>e4zY#cfu$iJw+M(Zh#
zFq^1)yYsj>%TWEv{KpXUB1bUmD-`q_lD*S!M7ugmP<3L7-X$_GfMbB0+U(UotU?)z
znoNC1HHT1dkF&Y^o(vxZWk>>r5tay4kjcaPHTF~Vy>-OZ&aNa}FD7N~JfpBls_iy^
zbBr&(J*dDU|Fm3ok@GwahF8UKJF|A~S>qP!j#*~^;ZxtS{nM6#LzS-G)lJ#Ic}qu6
zDW_-dL(`!8+aJFlJ5VxZ>+yca(;_RjoTxH#>7917%PX5kpSpBcG*$C7N{<}iy>jIR
zI-xaTUyLH3kCKVE)Ch_*i6WJvb4r5!(Hy5`-HzBKn90%FUJ2DWaL^)n>bV6m@$sP!
z2L5n{yU1D;WtL|PlezoK%l}JG6GrL0GEhEBS7GOoH++=$H)EkJMzw@AbyfO}MnYuY
zR*ye;bz=Pb$G`uf2M_itPJQVCy3$ve`;me&N8YXL^w)y(bGzgWou?sGi~Wy#wPEg0
zcD>|mim{u1)s%&&f;SfznG8&d44kldQ$R#Yh4J6cyM9#|9jX{>!)P|A6N8M#+gBfL
zl&i7lY9Q=JpFxCAZrg$}S?aqD6E8g0A2jH}_RBWsycY({=$7=Gk==yl%cE~uJ~e&c
zXc4Spbb4qTYU$0jmEYqVry7OzLB1()LN-hLP?Y|lA4sP*IrC<f!DmEVCs1+_1DH_v
z8?cWDlHT**uo$D5HX&5M`1Y21;Rz*%VbEyeCvvQI&5mnFq$GR(lY5N5VdJeacldyL
z#!SMPe<POdL&j3JW_dFpfvLenRaH_o+Se6q9bV>98+`lYmVNEOKPDC*)L1q>?UB>N
zsR1^98Y`>DS*)x0)Rg_)E5O#K<L^7G=TYxAt=>6i;jM)`!?G5SU%3D2-kP1abLkkb
zANg$i2HBm}&Ydaw+qWhz9U-+=om^1mVi5I1ZF0fVV2hCROPBjJIB^beh7dy<a>v<9
z|AU+={qi%==nq`(sCKX7{P3@OJ~l3H*gqNiF~f}impOGqmSs+eFXH?|0`2DO3(W_S
zD5|zZ8Y(p{tn_5Jd3Vyb@I+X=_?dHX^ngc7zH-q2a10^??pR_!d(*PJ#p%oa-&j2y
zT9N1&)%-l=?1aHXTM|5@tk)Ffxb?`sYjEvgu2z=S@!mW0E1jlQuRJ|e%j<ZvdHcmn
zOF)NN)u;hmIkIGZTQT&>NHo43h|8_FezZ!1vlRr@JdJr+J|ZIFWUsKd-&G&0?mzIq
z!awz;&*YeCOwWnE6k_?g0BSBLmuoW(H0OO{D8m&WK3IZt5TkeVTeuoapM2Y^AEm2E
znM@N-niYUXeD0-^*vgSF!c>eJ_baSjyH;$dP{2$AV~6MOYijQQarbjKp;f2r?({W7
zPdSj14Ir919rBhBl%8;+7DtuFhHBAj+<y;R)go|O2!gaNVk;!VRpDQW)N;U)2kz@e
zN*~Cu9*bq{4ajE#TOOScMy(vd6b0D8Nu3UAvr-o*{0(^wq6cKdXRT=t&3$)lcSJhA
z!Diw+(V)-`mGSbJI00BPn+_T)d?zUZXx>ou?7q44*Q|e9o7KR;^lSz~aQP&>`XLrY
z`M1sTM*Xqz))CQb4QUpId#AO5y6{5XeObc<mcm@@Cd&6NH&s`znKWIs9Z&wPL0E1G
zwu+GLgjKnVuTIwj$nS37vHJvK=7q=Ut|Q*CVe6z$_kti6viMEa+0u^P*2WB<^!PG+
z-v2G$ER3z<eOOmWogth%+bVQ^g^YOe>eU~TZgjkE63+n?R;x2-5&~`J-k)uzwY=$3
znB`bMrO;<GC>DLW8fEJXkCn;h1X56@hW348=9*}#aE|?R-wC&Y4C|O&3_132zyUuT
zfxcBI5Bn=?;J2EIt+j<-yNQ~^i`3@z4PjCeB$*#1j0o6Zt1OA%Qu4h07?*N(cZop~
zPS5NChW+#xO1g%Oq(h+VAq*{)*x1?(Izk7tXZG0){OkBAds##}x6A&(ANK!NFjiKk
zLUMPAi-zz?%y3s%SHF2-aL72G%q^t7nJa|fEJ#8yDQAF2nD$NqSM9Ykno97eOU+#2
zqpP^=Q<hbkF2i|D+o53J^LVwhv%x$~+FtQh_#smm@F6cE*4<B8$Y68uwjTLy&%W`O
zit!GHY!0!(7yAoJ$`e761O6l+Z+1Sjz-O{meB?iph03P#e4B45kkWX2!0gCW4>ALW
z4Ag(Ct(EDYSS+?~CGeXPtJL(9idgiFljU@J#uVcSx`K%Up?qR-ggYnGeM#PI&xh7z
zVxb^s*wXf@y*!WCcifJcX?^^|T}@|N9R~sAgzD{!Y2F@jM5^INp*?KpPj6z{sGgm}
z^CXE8{UT0XylBX>_O`9g-)tV~(DG^Qe<A$!=s5-;7Sn_QH6~rDP9_e_GQUZTh*}w0
zOo=PBvebaWLzo}^ch+o;y~ytQsr&&O)Z57;ZG?pL@_KqFhzzO@J<i+#_eow(CNW)-
z*L9W+Wi)Hrbi~q01Qo!;|32&ZC7?H=JEl=kVhn6$_!hVnAw9t=4Zw;fYGQBmd8gFc
z$zuzN9TM0rV3&N-mlr*lSPaX3*V%NDbb8se)x$f=QFJner>CQ)_C5W-#=bVP#_YM>
zmUl<g-G-Zoj5fvBtrq0w1hXU1=T_X2!@R;rioWbpkoUJtd+_fUqKc?_U|ktMX#5qM
zBrPq;fug$IftI`f#o5%<@<w&+ZYZqA47=whU6PFLa-9A=dPq1MfBXqV8SgeYh_P|T
zTdu{aPu2ztW6)Fl(RBSYa-B?c4xFjwBd(OZx~0$x7_=)pCN*0^D{RkD2JR<XHy(eq
zdM3j$NR?L}Zob(Fe67o!W#gS4`$znhf&@ZH`UEaAD(d0fE?Mvx;DI~|t7T8|qCiXL
z&u}m7J)j3_bTD+Qw{8D>+iANe@l4z^yVWejed@S>=8C%HC>Q55SB&FPi*t{x5TbAa
zo~}5WeqXGs6@GR?6YFr}`J}NuQ+V~jkqpkid^t%!Vmr=qjL{J7<Yw=mP&Sy+Z(bZJ
zK^T1fz3p<btJonYlnbjg_0;|!e5?>)8!|n9GC>P@B)4Qef?HBkQ^~{l(@}gBYIdfk
z{MaA4f06QFQVD(fgf(l{JpH-0@uCb>3+S(d`9l4OWxl!?ge46&J?(5&cK_c>g+dU8
z1$c*6;|Lz07v{xv0_`m{w$yJ$c<6_SS+C@sB7W!Q#Gy@PLE0Oj`yCmwI&{N$?7VpK
zuzS2!DC_~}j?F*-&JIf_&#)?-3fX<(@y!hDu-r`J(DXl-U75<pTe;NL3A0zekcs(p
zATO~PNaNsvK)L@??A|nkyzY$R_Y>d<$%|_>s^@`u&x3}$NQA4;`SW<UWs0BJH#7%H
zQy|!>+^#%H$GX740X;dOC{jan+Rf9@+ozvlt*jK9gZo8?lq$MM+8kUMfE4aWjr8#H
z(iAThw3MP`_gFjq&lA-?$TM*c`}yvTk6y$jG;OSY3&0In44>hLNxAmrnZ(KY4CTs|
z3z*<DAHhprBC~nANEjY<?TQ(tVzQmihq7<|_UY3A^8Z@Rb8_Y;YL-f#RC<0k9Xx9=
z90Bcspes}-tG#S`a^u9@paWJM6|x$+(t*!+QMjpRp<k>M@<Q$4faQLDns(B>3Af{8
zv-=BJ0BNB@!@6<*-_zP-m6^7%=jiQSE6<Xw0}S8Ib12UqZ94YC(PB>N|6{${p0?5Y
zP!^O^=_wN1P?R?CKqO?j=dwp%h*ZP~av>N-=I|oB-Pq4Gl`M=HSr0X}W3Xwj&I+@m
z#{~;R6H(k#!h@2ur0}K0>L|{eExU_pw|tvMM#FxzBE5=Bav!-~$FE8jW&Rs8RO7mg
z`yOc;Iz%$C1WG~J?;@CT=p_dv4k|<>*d=(AuIEfV=+#qC8yD7hoMKfd966x`g`kt6
z)AGw2I@8%)jNhVhWFBp6eex`3%Z0WR9W(3n+7obb<(NO(Tfb?iccZT8?Dol(c^`^g
zdEVK{$sTtn>YPyw`a%@ke$w3ff%f`lm94)Dy>@$=gYdj+eS!Xd8D$f;Z8N9o7j$ON
zQR`DHHrY6z;ntF<r(C-6yqeLcG(Hq33?>Jx1(Er%#~^P1L%Rp82(lP;Z_uoWSx1eQ
z?jxIvC4}77+Qk)DOn44^eJ7o74fseGb^(3zljXef7R@s2b!uT2GZDNTrWL&!9{)ir
zG;;M4SOW&DsK|+1NrU(G;~tC2BpgJqG9IMTrAwdw{UP@xH+4D}Tm;?{Zy^v&lSy77
zK&OHet{@PZ+eAz&tn{OT`JZbR4x|)UJimSmCiUM3<;n!EHYcNNK|frqdg)L9lyXsa
zs`4m}3}bD5@XUj<5ONd_?8w#TjNqUz02>l^lbqRQRn55W;lRZ7KKyBCG8A)Lg$>4m
z>Blje1d6Kb{j07wHeyOmFSq>{2dp$EFLZizQG-ekO_8>*u07!Bvg$O8Um-^ana)9u
zjbR!HH>7N0c)yz}t>n@*@!I~40o*bQ#)s1Bdcq)s86GhiL2Sa=EgiOPJ^wcE`F+OG
z_6@$A3K~uy|7n@gK7ANUVYNg--v>66Ng>Lg!tZkf(-&jaCKj~|!c*|w|I5v>TM`Qn
z8~<~vSb`@5#bbs*JtHEt$tZGnD`Xd0dLi0e4-aTcWdO(rC)M-3L^aZ9y2F{=cj;*Q
z2b&Ib;=shMemqESSB{eycEc9GX5Mr52s$uz$r<HSZy4HJM`sElhqHQ>ecu8vM(B#}
zpfK>iggaTRKH&8-k>9@e_AhF9ZOR2*mZ41+gL!-o(qsO4iqyV&(sMTF<gVr!sx5u3
zC1FqrYR*J<>JW1Z6Tv%pDSH4(SXQse2y(-RxAJn_C0?6A=dN7YJhXSup7((0u>NM<
z*`Bf6zA~BB9Rth4mbrnFA^*KZUxhs@6Zcc$LDLHCt<^(pPIN#<S?zhdO{01;ZJc^;
z{#lqv@r@paP1k5gxgHa8%6LAnUt%*2r4<X$Rsh;ViHV6OVCGFVC6x(Kf>Y^9sI8ze
zN<O)80)-_!03k@_dPZI-s{|vd(Phudh5w~8c`y6G_kRi?0)ZwiH@4NiX#}SvNXKsZ
z+?O{m93@zkpf_T~2&l}@!q%M^kx4E4YD2OmIUSH)bR?^Y=_Hra%uxy*4gC@~Ss?qh
z_}yyJ|1Wz^wF^tZ<rFoPoUyU-F6N>674R^+7~YZpYX{Z5#s6U@iWNDhirt80czOlE
z5?x+;i`GqHp2nSYnb6fCBnThBbDW8(DeUzmi`Dusg>{I<=j=XWuW$M|UC%L5;#4hj
zGLaS?lk$%zPuZL^-dtlRmS<z?PKak(QS1S$NQhc|2@k_RZ)q1w;>A&c^PmWu!gdWv
z35sd;n5mX#=Ks|os(re29`^K9akjm2YuQE7%`EZNbuUG!zQtc)7r4B|Y<tl+W<AEj
z{`6j|Omr$&K}gty4+O*&nOMKgtEJMM5%C(G$&>4mRwG-mX8_d}3lgzBogV1HonXXV
zFfyEQh@Pr(e+)VzOz{A2ss8=t{!4YnpHo%4XcV|60ziUmQc!COv>!^lr0i~C<vrSC
zSbAeBdoVXYuj^K(k-Cb&xun~L7mkouvhzG1S|>_RuLn#P>H#El!pa8>O7z}noT^_;
zQ8r~tR7^mqbn28`ksE}OH0PZqQ3oOT{~oIvv*>@fZeBabjRtvBn1^6pj1OH|S;n5y
zA?X&V!n*K_2psgZ`^|fcnt`7WDk?|i8E8bMPR6DF;#d9;Q)dFs<=Sp>8Z>FvOsOQz
zq9~#?C`BbjC@N8zB55Ecnu`XzB$_BGBqSnAlY}&ogwSLP6;h<m?`iM-?Q^d0T-ToB
z|9{`-xrepZy>4mM5u=5dFqS5J`13S?&PA?yrASm}F}CkHZ>0_Ta|%bX8H=$h#!xs%
zOwytQmBfyR;(zdVg_*IP{(U}x&m=8Ry;uvdB#~_hywcLrC5%V0bLsPIo4?_KF3;NU
zR}`~=KqAZuuN)xpcX<z)6`P+a%5cBC`i3&_p+(hU`={$g8e37Fia`++o!l}WUbI7j
z+ADiC+PW7Nq8dhA!6(GU^dP9szF>9`y<Ip}I{4ZPe8vQ_2i2*|nC{cV^OU?2)&Bsk
zLxPpXvNx{l`IMA_a&mHrtZH1z5tIcKT+*fO_dnd6w|uj(3+Gb;R-7N`Yq==URNTXK
zRT0}rM%Rv#j9gW>ja>h)7NF`SQ%uj7pZ)<g^Nw8?!u%9-S`<ItOpFN5mW)Y{HIs^=
zEJNX|xFcP)>buEyX-Db8U~rF|7FBAaKP5xT$IR8c6|-!}1){GMOClJ{h;kJwjqL2b
z7ExWFmG4vh$gh!Z=%zT+R@F`|Q1aj4YBsv}o&XuO3pZcnMVjvw<Fibn0j`KzuJ`Dr
z5&)@~=?VjC>;WY_dARtp_9wKjR6N?1LX63NYO0!X32p1vYgaY>T~F<blJMK9m}Sf%
z!YG|Jb`Qf`g-840!)207g5{6A&U7=j(UA2Oz`$M4D6s8iOnkpZ?>K`z#^{+?#3RBn
zv(mFUCRE$PyOB*u^wz<Em|~e9y2?YkR$+S=OsM(?UxH=b_@I}`Io#w%9)E1Nm6=@V
z)9DjN20t3QJAe->aqG24&u5<|s${+o3XqvkRm)6%G_L;Q`-xFM+@rsI%@~JZQkdt{
zEkmAgM^JK3hax9W;xjlyYg@4C-hWEReM&<<z%yM5zj_xu>*?#)l@|C5vQ)U+-J{)S
zbwQ*BAOrgfJ|MvaXLQ!+-4l7F0P<qNu`ncseHHVm;HZH2#mPlvdp-rXgpO*)q*cM6
zj|T-E5}FPw9Y{5u-|pE_cmH$(vm|u7ZE@Q7Xs;+ipwT48$}R5IMw02;pSoIx-J&Ts
zei|ytaJRX`_7`c5d^KyP;dwCA@IH0nHFlMARU6e7b#ldEjCqW1z9*EniLb*lpoc>b
zPr>)J)j!Gd{n-h#LnrJI07L%OhzM!X*4xI_JhdF_ADbm-cx2A;fHRZE4eYK*&Vw9E
z${I}uQWqW-{$n4YTI&d}0gNv+Pc(!ZH-F`b;lt;};y;Gy_s!OqIfJW~ur>DgS0836
z`(elWzrh8WXkMYP;0KH#JApERhU^(LQ%3>v?RD1I#)JD5=p7@ClL{8rWz3ZErB4xC
zIfMy|1eQNy6^tkXB~TLxK94C|Zu9O9VKZaVcl>jMNJ#ICandk^L<Bac<p$nVwLgFE
z*z^ss91E3T`etZdSohBKe}zelw|skXwHcg?raCAjb^V!jH8T^&51UVJg=hIx5Z{&e
zVo|+jgfd7Z#PMFG1H$u6>3~JX*kuWx;>9r#{=UzkwKD%!C3_;R*NnaNqQj*Zlh2<$
zudnDm{e1iH=g!?LZEyQjIp4X>*QMh&%M!W;yUKpOXmoj*f$G_EiQU@Hpa)>2az8kg
zKubi;+HfTFczqHd^3R2D@m?G-dD5guw>!(cgq{}8OE^FE0+dp)R~tb4ASr=q23}8h
z(Sh`MYq)1G^e44qW7(GMX+1nO!`h5(o4Uw)6U-9Y6*4C$wI~CZOIRRg?Y)s|@Z5t|
zD*F73-AtcTw|cIB1vXZKWQ_Z?p_LZx)9Lx@w!R5pgd6*66`z|12=>uIuZS;5E3!nP
z1zHboH{j_c;gZW{?xzl+t7L^63N;j9&Gkd?a1Dsb?)Z`4M*{rep#_h=xl`7DA<6?6
zUbrS$s84kUue02__`8AH($I{}AW~C)e!eH<NG{o0d&g}n?WB`mE1btk5xnq1=QKIV
z-$emk=$mA04g^1XoT*H;B0OOzBvLCjP7lvrMQY;bpC4#;-{s~q(V3w_K)WFd8G!m-
zZcAT~nb~6U=&^oBci}IKg{*VQ9Z9C|!GT*R)c>A|Med=dm}9xN*mz$m9@oW9%J*H-
zM=(YUOI>^{H&0cUDBO8rqvnXP#o@VRn`=u)ivYx^vwZt~zP`FdJ!BfaOqNN)wg7Ho
zt_Kdt0uPa(NK6d<?oe<4Z#A;7q-xL<LmAVH9ZIl`xli&=K5?3<D0CEluPrI1Q+|FJ
z$3_FCa|H}Mw+ujeK2^U`+rt#Do@E+?R?s#`7S+VxZW+VArJLt`-XT{Mdp24l-)g%t
zNx0H2nDV6!0+dVEvcFP_S`gc8%5>&wAQ^DDTe=$DJ*6o(O6un}G5?GS2)*MucT><=
zv;c~?4)ggpH!}>glZ%NQ#Pi|b@}?<Q+-!nVD<4%~IA^Ekkz~<SFyK(@^&$m|1_PwW
z9#;)wj6}`ZoAvHJ=A4|2Z7N%g&-qv}ku{~fLPutn@U+b}v3$T}3^PVj^j!QhG4X#7
zqHDU={e5Lllde%C2(SYJWJ}4;b~e^0*TtDhe%~=QG<}_;<3l<#5RY5a!;Y5rln|!`
zPXQgf-P#sQ)r^KID!%<zuwDp9xdD=Qw2JMx7vDbRs}5_dGEJS-eQ5aL*)d2{)~|0&
zvgjx~`%P$MWR~4lP;3r+VO~f`2=*KAwnr@q&$Xy~`|$pKkw#iczzZS})9KbGDr1pT
z2MN+~jRzx1A{*hAbdTwPb~M(!^j4mlITKK7U^0cjL~-q!fX_c=^#6oS83j<5)RTi(
zom)AN3k8SVdf;AUc2xX#;#+SsX~%EVANh*IEZKJTyMRxV81|y_ZFRNetH}~T*@>51
zU*IsIz-fv764cx+A;*|r(e7?@#-m3kb9NqwX_?Ye|3atJ<}!E}A%^25SWSy}r*!^O
z86K5ba79nCe}r%06UuVxGNBMzD@!%mci_N186W!B{Ro=#XrzPcaTilJUuU#=#^Do7
z<hO=?jmZ4#S8x66$1l+EBeJ2TeRl59tdM6V_}6*dIsWWurepj&rVG<p5i=A}oQqF!
zdOjqiHS0>x4}23rE9CU+=T>}{Dd7(_HTSQ$p!~vKjdif&w<YLv?rS9K(8dTHlAjO4
zVA6hA+a(ZH$lAKK5fv+Kc*vVh9&OL|9M5a^V~aGe0}i=lQ9~*IcVBqt3(`FhuxwqT
z>APn2#P^GFG2LSv?rLdrd9lS=XC5V~3<W%_)8fT{ndJVa<S+7P!eQ!158*pcB~P<>
zk0$Dth5w@R04!Z7=(tcTp-p+4(g`!9v-JP%z1no`@<VY+_>T~zAbx(Tr8~U*f!bNU
z>Y3)8F(zIf7mJbtN&i+kc<n^Bt|$9jX^-8Jg5a{`YEsRqoJrewy^cxMYDDI3)5o}K
z{RqD@hBcOit~>99K?hxF;TTVMH#Y(AI-9p@R?W}bo4Uejhq>0%n=?Jy-f~OczG$!u
z?wCT%gSp_}C>Xk0kvB>_B>nqb(f<9jj_SRy>d8}o-fO_k(M)=Al|RMKik=D7Dl@CG
z&VyMrX4A5-JRVhXXUn;jp-O1L%I}CI_?I(N@!~{Dt7?f_OavTQ2M|if<ObtEF!xK3
znMMX3RVVkl`ySzjFa{IGJiJ44kl1>Hx7l47^S=#IH8lrNjsq7sn>@J~8k*=GhGo)Q
z%9?=<{qC9U9_Z7%yieiX)JG#9r+@t!l%Jd1mhAK<YO(8>bpUl&-Uqe6T)6=bhlu+S
z41`u)@IQQa!0p7ONpKqi@DYFtt*;?O4#)-*`C{(;=YXE6AdUj6o#lD-_jSFztt(o6
z|B=1J7twbaL?+UCZ-mRc-1<_TpWPVNUmslBucJG-PwyOOAx*;9t3##s@OQtOYtvd9
ziw{Ggb;u4J`K8zAhiPdc*RCm-I79HPKwT@8si1sR<qsp@e3jOkec<q6ycIfbk%d(Q
z3c=HKZ*~Kf<NQmi=Z?6lxS0Ne7Ee7mE(M{9<mvfkE`gu~q6(CJ0s_ENnCvUcE-@}P
z9lgXz5?;pXWp3yjXzUo>l1y0KIfmNmcC%4<u92~aX3v|;#^P%hOqcE@FRY`cHs@zE
z4I)>0{@Lguy93VTt$T|rzu2rKjQl|>TzBM;q^T3b`~pHp-ib^@SU-Y!U^dIW7|^gR
z@8!$H@UkgXtP2X>b=aLQ<woqIFcBgIzUq;Hfc>McCJa%s`WkWa^xQMm%e?j6eA*sd
z6P{0iCtHhUQw$RcH%}@H40Yk)`U#gO_+I|uJ<hTCNL*_a$gKq7E^J1J9S(Sy$6E#-
zqspYF8du&c_TO75B#q<t+a#p#x|J%uMcw|Dvdz->2lMS=2>i02@;IcHb?zVm1XZ&z
z*PjxSCwaJElW)Y*2K%FGhHww`bJ{bO>=^9m;gM`IFD~N?ivb4kwFo#X%^k|~EsQ(%
z7;Y8-AhD38FP1HpCfa#P%f`|Q-j}nf+^BK8z*n|Y%TQ^lv08bg1ZkCZug7Nw1@U18
zhbk5-(nS;v4uwKyBNW6BuF=8{oo8!MzO|*WOnWav@h6GD`&^i|J|-#|vq^Ab3Giwm
zxq5t(>1oA=o2zf|23)d_kBDA6dJ?FUs3svOcy>3vvv3R$-X8*&qLmio6~Ot&T<Yh?
zqFO6GI(y(w_zJ7TT6>6h0{ov^DlS3ySz-B`&Ux*d4jI0R+^kbnwt9N*QQbWU=vO6c
zts)lY#%{<hbvlO&FD=^0<0IGR?gXWATKG*h_rhe4i+3g-x#_fI<niLkOPF@SU#lq3
zTE+BEU6dQiwdThI>WH6G_c`OtQ;YNhTD<7;E<A-(SDdW(m>1O@v9wM%Ac0o4des|l
z0(MDIjZ$G#h<c|g*AzxDtK*UH(k1o_38k?GR1Z>EZ?7b5D4B_$N^w8|HgI#Zs5iTG
zX_X+esyV!d%1l)B(9tATbg|P?Z%?=Zs!Kh|cx-t?ljoUHCC{^k1eL@GGyoLhcV@We
zbLC9wL%FsOL@udGs(pK}>r~O^!)${$;|ArFH8HOQwB<~WY!PJbsbIs|H`tnRt^Doy
z0w@%LdO}Z0IbGI@k+C1hJLFo;GBWD9{mVhi62Ar7spyi$8cEEAzScgOP=OUa0(@vc
zF?joTE6Q6rs=h<Cur}c|d3tS6o>A<7<qV3mVq`O4o@mxGlRV+A`M0fKBlav-l97xH
zINdQ=DHsDJCdKyZE8c^a`e&WNyLX4D>e3w3O!IBDId5Bwu0lZqw*Au(35O>JqQEvF
z7N}c-S%84}pzepZU3|fpOH*4qAOq#{f3LqZwKKxt8GE~k0eHZ9@qB?&sd+`8zB>Jw
zVearVR*7tF_`5UM#gU`|NG)}OJd`J{&}|6`8x2I^)OIMx*qw6@><sDZhT1BM>@6Py
z)<`UCGA(kt*V!U!2_+%wLxEKb11o+kI2JT0%)78%B{H#c;nA9IRU2*!tvsJg=lsHw
zQ+8JoYTRd9UaTJ^CkQD_bMbF&o&b>cXJi{UM{P90a~Uj5_{xjl9`87R&KyDJIm3}8
zjmm?<cA_!J$;lB2EWv@pA!KDpBiNgAZ2YJVvR(Ej-MthY-5m%50!b_oo<7~Hd5spE
zx11YYKD`<*#xY<C#hwE`H07J}nkTYiNyMVM$+LT8E4xJe*xP}mEarDUV<XL`y!@62
z2l3h!He+X-axS5P$Syn5-rwxS+}g7YU#Ws#?-#C&SnQznDHgm4*MD1$%h`K8y&vv9
zCI3xr?JsAI9;LE>U#JNl>`5Ck?bX^%C|jp+WTfgvWrtt~n@e6w2ZXhtM`KYts{3)A
zPVUej7bc+{12Y0Y_?~k~8}fga;Y=JS1!Vk3%ev`PU%F9<CYoO^vwZtGbLNQ6%{3{_
zj)lCqkdaOfY6DUlBG(J4EHPK0v=;8w6Q@vxyt=((s^E>x*QGqg$LPiPm+>hddPTJI
zKXVUzigzr2z0Xw8K~~)vMvc<xCNOmY#^4X0@Zp3x9x>?nKfkg%EVL6A&6<dZ#Hp5;
z1_}jrSK5Vv(ox#qOIiz$A1vu+l3+aN9gijaaEMVH2;mHVr=A9ZYCg7us~IQbHkG&+
zHxHyJ6uLA)QNx>#1*VghQ0^=JN09qR|L@sclhl>TCFB+AB*#BH3~|dI$?Yv`n8ZHe
zF9F%xq>6t$*$lY($A|;hqM>lkx2`N;cOyj#%8;%sXDckTyN~*VZb8`WYB!Y+U3P0r
zcaji4oRkx+#Oy&^WO=Wyuj}b}>x=GLm){z)U2gH~g+M@1h{!EBJJBAAJt7TNdGG3|
zT_UviJ$anD9~+DNFBVMo5muLIygZc-kU$>zr_>J<^cgXITg|?o;wWPe?oG;B#U@}e
zg0Y9YSaSK&ndyY8KRzIMkN%W9Rr>FN>YKged0+b;-3_!NjjL2Yo$TC3$7!!yX*Ziz
zpF7TnIAo}N+zP&uSfqtfC$e0Vj8}lAe>9Fp3$)sDnYpo)(SxDJ__A=4fRkdM$Uhdq
z!d#h}ZhNge)E|p$Ex$aS(G@nOch$?t8PGL=s~Vb`qF*O_zYAD%W+$sMIYUNKjkQ23
ze>z8$6cQ3j*uc}k5+;D@2pIy(u}bq5>-J`YiSy5$1%-g=Bzqae4g-E{G*0Uff?%oC
ziFdW0py8A7AE{Ej<F|+4rD4DXI*-Isn7yAo*_SLrvsS|J;`jG{3q<%~_GDYFdof?$
zD?4r60ao#9d=|kIJncRWt-!n&Dit30JyXDb-je(^XCF}J?IRbz16OS=!FLi%nSq}k
zk|g$18lXNEyD4<VlAgq>u_P^$n?QGkUD`jB<|vTmG)_OU+lC%oOiB#g{(WVT=H@Pt
zN@55bFCpPhO$;t2woTgdS)RExF9cWnf+t;>ZhmEY%bUiOy?^7jtFV2BKhgy@7n)|)
zJ=_L-pgZpRqjsiJ2d|RcE&(pDe2nqa^S5y8Kyz^+DfugYxCf)QeHg`AHF|31-*Z+X
znMq9RCh(bs$XocmZ+YjVE+N5|)Q2p?_@Vx1Sb}bSbG!X3{!xU<th90WTo=7mWPfB>
zfDIJA+dKL1TibD~vCW>utkDvULHkK@l(Iq~23m56gD>oCWM*x))8KFs7sa7)HhJNI
z9i(s=LJtMp2_WJ0{qxG7Y+St$nzmb*EVI2P4WpVVl`XSS0m{r0Jq_<_1emTfY|tvW
zedq(`FNplD#U|@1U+(uLCW<{OUryI<Ye)A|GhR+(WBYN*^*tp1V8!L<_lhB0PBGX3
zRc<MhY2u3uv!rw6iK)MTWIMB>M2v_87XMV_AxA@l0lTCXZD`y-{EURth!tc;0<(}P
z4BYy@JLw0H%eS#-TSlj_J0KWf7c!lvKu!|tcST7dwowTSBUbEvIUcp`LCdSQT+AlO
zAv72QDufC0!91=vM}luq^VGcK+GS(=P$3F$ze2#a?`1oFt=W9AT*O~I2qub+9#mRW
z8gh&volKUy_DcB{7gfrx^IfQL;DN;|^c}w&cg&_N0#EVgiMnOEl+oEEM-;50*kMlD
z_h)}g0@-9di4UFJtLjZbQ?~XHk4!{bVN7{=%GdT3!?#8n61RWxLE5pOW;Znj`6Q*e
zsCE@nE$}sfHUTb*&4!oGxOFs(RGT!err*Bf0ajInJHo8htsfr)p}P5RUtL?-`d5SH
z?_P**q%HO-iU=YWTnTGdv84lJK@HS3Vy3WgTV2-A<WobSo&~xDHo@&4P=CAe-?qXt
zYVtlCyRaX0dhh5?b(xbpv*m3Ep|CUhnzeAqki`3tff9qes%p)M`P*LUj}_}ja)Pb-
zh?2*T9x?Wm#(F|$)-i06D`(k&Sk9pv!ie`a>Rd0uchU<%y)?c)TW<j_Dm;kAakJQY
z3PKa<R!U3tdf_{L541}_<b<-7=)Wb#YVC0U_Gu?$RdX-i6$5}=2=}4|D1D8uF5AyU
z7Fuf{gH8gR{9~{%otYr?KOpeQH3e%YxK*bC*6;@(o}aN71l<|7<88aC-z5<WCl54&
zO4a7b-m{4<6+<zjf2Y0im_6J1bk{+@pQtrhEF9N<uzt^AtJdd}R@`aZ=YWB-+WerO
z%ZmH%ZBsC9RnG$Fpd+saKfIs3(%V3Fx%@MO?)wxM&X|7g&5`RKA6sUu8Qx>gbkAM8
zHcl;b>T)esb#qP4lP7P=w6iA$Y+(#%>-(d#+48bD-nBk3%LcpD{m`d%Fl5V~Er=N%
z?lCMw1;vm4X^rE&(Tf&p)=?J8TS|BL`Oa#4NSMLQ?h*ESk6kwBthPr^H7-N}Y7YIU
z{`}mUxbVOxmL#&%&0~F>nduGteiWsk8w1_S2U~#3yY1A)&5VbL1P2+4u=5tf8!10N
z4nWt??wpjTT)sA5*FQ_U?g%q^i4(!|kl9^DaRfW^gEFo>bEb0aN}JdK1bxMHU_B_A
zS<Mh@llAtZzr_$WHRJl@@`dNdTRn6taEKQx(%Foj&3iJG*{$cNxZl%eRD1R|BcQa8
zYdL75->nVtM5miZx7iXe#(#g-uk|U{cH2p5Y{=R=;D&CuT~8j`#Kvt^uC>{)di4$e
ziK=oAUaxEK009o%@$-vR>wtv=wxQVWD`wu<P%?xOpmk23Mj_9hURp4SO^Q2TyY^wN
z?*Iqy`IoG|zE5xPTIX6&_U;{$@msH-B-+i?cuk$+C||w-CP?f9U4yFRCxA)H$8=37
zaXdbHhjo6hg|elgRe=B57ffIfYjjRnBS^T1`r+y%^)z8}bY{n|AER0YbbeOVAfP0Z
zzjhy2*pH7MAkog(@};EGs<tX}z;iS9NE;qL8yFa_9jeq3CQEDp2&;~#qPg+xkRJZM
zB0@2#%`dOL{oA5tSwZzC;?O;mW)!XUpYn~y6uVExgpX@jur6bv_oGbv#|x&m+~+|@
zU2Sf9H1x1=rPgVq()N7Yw(Z*0J)bx33*NFD0KhQCxo~1=AIC96PL62rB`^Lreec_v
znz1V5+v+4#P$V8xK#Ve<Qqo+#Ec8l}jg7mKs=5FCOYg`N>*}2&jWhOjAG+mh`tYr2
zEBdP0FAXWsczJpw)c4x22|X3{*Va4x=+0ny>B0ePT_XI-wu~k~g?092Y19cCFjCNf
z!UecBA~9DOIdX1iCH1{S*Pc3dEcix#w6XOJ1w9t|dP8QmiqlCZ=cy+;@U!@f^~UCy
zQumkssqyWDQYwyF{j|4zUlMO5vzxUw4dYEuOa?HZ{sX^;vMUF552=1;8^>bT7Z>OC
ziIaAuWLoI{hU_Y3+kE@-#8<K(JmjKW*C==ebe^|hK^ZJzFoNM(hJCf%CNMKNe9Dw%
z^+)BeE3*JWtj|!kdMJ$v_hVWl%m*s}>bYaRb#P|HpGU}3l$*;QdhU%w5#7E<KVZ?q
zqN0Mte(K9-1O!wMJ2^tH2PT>gFE#8Bq^};LsTl~l$#To_0s1IjR-5V?%23Hc>sNmB
zwZHPv*CX!T%dCzON#l6l$k8ft!~0;-y#k}pKS%WE_b;|9MkC;0Zg-SQzo^|q-J=r|
z%{nh;0C&O<Ss&dHDM`5xD+6C&ot(Nv?-+XwS(+L5EKM#|S?^f;PR)Z31^uf9=#*5s
zY2ob!)}|jh*p-{`%%r!7Oy3lcWrN`!&SMPaai{uC95r#&sCilo(wf43*v-3<sx@8z
zjqj(gc?;T9o{Nb&M$+&<yW>|s(-Y#;B5v;1FTm^R0rm%M{q>_v&%nSM%u-bOlwCV2
z{%UlLAK5)q=I`6TtIIU$E*-q&uaAMC8qjg{#QV{{W^7%sdU(iCRlZ;VqKP8M;Y^pU
zZB~hO-Rka+Yh+-d?g?vRIX=7}neT-2p`Tnp3ZwE)D!Te>-AGSUe+4}ssj^DPcMH`u
z?{{y{@U*tje*)c#MOy-aVkLoFeYp-kqUHIq!(nSLk)iW?p^=@HHROas2R=`@)S!v|
zJF^n2vTokox&4c-I`+0Rl1+W^(XpVgvXg7`VndmUv5QVvqbhweD(cpT+_JI{A1-G*
z-@4svmZA4+-)Wz6Uw<l(eNz54ProqgE@R+7xQ(crStI+IyE%Tt`#I5j&dOhpVIcDt
zB{6!pdqh`D-9vp-XLQxdajcEn)ihJt=-*H&KYpP?doQOdbEKrqBudLpI?{cp?)8jR
zHN9g*2pCh~G|z86GjF|i^IKT(5;=Jt9x1y;I~2VX<NRRK1eX4|VlzL0b@Vx<jx>|@
zoBEXw_%RkKt|)R)28-==Kyc}Rp0qty2$BTY$L;1A_M5yxfoc44-1vl0&%~hxIi<E9
zA4-<~{_?d<{l;>aW%DeTFStGOfLFe~@o$GXv%t8}8?0u=TAUr~xt;1Yz75lRzQp^(
zor%YPSFfEJkl%#vLd>}X+(=}e(F#QRb|a%INq*x@4nZl6^J?f%(V!oa)4XA{VA9Rw
z8{e#7^e1<;(J5b{pqDtLlTJkIA>zSMas^+x!e;X$n>@7g7^03eo{y|PSgBg^WWFqg
zBQW<aV(UO|7b}qxe*bFjr@TU9dE!Q~WC716DSM`GN&{~wUNH{;(C7B|wqp7Q<KW?B
zUe@OT&pe=7Nzcqw(9$}MN*>@skCKigXuIj!oMRW<Bxi@`zK$rHd?V@SkIvP%lfwoq
ze6e;yeO2D!w1Mss`Pq2{@|q&^miVEQE-THAjd2a^*tBV*e>+u8_iuvcnY^*QTg>R5
z0|!2d+R<WX_;FD;#~4QZylCL4e$rVNBJ}7Fc6ph5wg)Z$JCL_~dPYjRtxmUHCwdJW
z_{Oodgm2~q(-XP+A>U$bKITq(>~oc!TzbdKpI_|6;KJ&{1n16<mT1fsbw*GV(AARH
zln#jX8j}GY5b>`@6^C7u_=az`)fwm>9%#Tdi~TlAuLm!3n!`e&#Yif8^WM|3=y^=m
zi0IcAIb#||>Is8nSgx|f#t)^X=YJ)^@uXK*ov5omG}GqeZ;EeRqOMBaq0yTU6LzVl
zr_S0==&^78{F)JAdHW2BZUb$~flry{)0%Jf?EQC)Xj7UPSN7?2puvd`<|Aj!>9)(<
zAKyT{)44_y+zoDjM3Zit=UwDGjcEkQ*9!vj!=nqT1p-Eeq8qaEj;ExLluEBzh^2za
zB!PvGn-uhh5JRcGRM)?mA{NX6?^33q9p(x~t+T}c9k1jNrH-H#m5E1uj#>vW?a(T&
z-zG>j^^Mx<2X#Y4{T-&Y_t2q3Rol;Mc%Aq4V8H2WdUckKDwS8BYlLr!ZNFQm^=xfN
zQqCI2s;tR6m3w(!w@%Z|=gyh4;``@PfV98xh$&L(8LgrE|3c#1uPU*v(AtIvOy>nh
zR!~#F2ko?BL(`%D6R*tr_5%Q6yhqbUK6QbsO{kxrU!r1ZB-8DV4}<t%8XAeu9I7{I
zvS;i9PNTU6)+zyTM&cgx7AGX0)(tto;H<XLj!bh{QElcGc}xL!%?yY5BB`7chYjY=
zwf#Do%Oc3GurYBOs~;V)85A`J5ZJAuayM%;Vn+L&IrC<9@$_oD?Mz+(8s$z)wzBj+
z7P~3wG@}6%+<sHZ@|G3Hjf?s)|6}i=DyqNN(1$~hwvICmz#yY4r)6+^2M2(8-~i-!
zZbm7^nu$!uzS-XFz);V0++PnKHn^Uxx$>W@-tB7oht`9A@7S!7cC?+)-}UM3>(>{{
zP3X}^XRQIpj$n9gR>oh+O_kw5)hBtk@da@~vZf{ng2N=wnh#6TIi}zmsWHnC_G|rT
zccT;5UF0<G)2%FA?;i5()mw)Rf4MJbDWO2VM5;i*X`r2E*kpW60g(M_#TJdzzWmqS
zXyYG(P8d+7dKbL5S7S6!AHf&n2rP4FBG1Vn&2H2&1y<)gB7aLJY^>;k=-4nM>ip;o
z@xuADrwJ;7G&SLeC%-gy{mVofi8@Jst<zNhvCH4`rc<o7dq$pjv<X$vJFT|GKQJ(Z
zPKz_mf!`OQud}wDu2272t5++~tq;+RZHutz3w7VlVrCDp$24xEN-X$uch)XlBPW{}
zboT67jaQ=ykfT6hMGpAh)O4f3ZW|V(=EGDenZrz@cym-{hyf#{+HrRHdcw>OgVcqF
zhUaaDJdV+aQ481a|J*hXebHfuf*W7p!|T9%oY8}V2Ii#1<=ev#GRpf1D&VMsSXCj_
zx&KCkwC7|NFhbOs%<8;i0VV^j@)Q+vIZI@ALt$d2cfa`8H}%z~53{m-7~t!%U_o~8
zslQ*YQtrE@^75tj($e>B<0w7PekLE}XquRwV11F|`0+;pkNvnp(W@SGgk1}*A^JY2
z{CDmA0&4qLQ{L?j?;mvsw9lQ^$RN>oOq0dljndLm2iMmQHQcl6i5a)zsKtfiHCX9T
z-|>Rt-sHZ#&#otT3km|!^aew1ZTdK7n*Q&9(r}yeGrq99E-NcCGH`Hvucsq&Ixj9f
zkeZ3g<;CqCJ6KmT2Q?ve(2B=@jo?XBs@y}Zn-3;RILX9uaF(F}NYw&;NJVE@`hwH{
zR{C_a3kJTQ9^(@2n`hzfgD*Wo$UD0_^aR1^N{>P_N2v&L_vux%H`vzO($Z8&9DUHW
zsGD86QV;PPy$@o#hT#)|&<JQj0A@FDA|^pIyaFd00p7C&J9Pf(#{_Eg_d)A>lMBNP
zL&TgIJA*>bo%3YQK@3zi)mL}}8*mXISCEoE!g`o>MXT=a>Oh`OdXqvh-9m-8l?GKw
zNvQ-von>M{G~I8jtA+Ird2<a`ef8zfyy;rCn?I?Ry!^54x`LriZ^CTuxgUJD*A6$5
z<u|Hnxz$fQrhqAi7(>`Xmwdl0_H>U_{eRQEME{YhqeRo9?roKYw$UqCDPBK-vWLZL
zc((o!rKRYOfj0qY*d+Fl`Pb4?9aba-OtkAweio&-WNRrjtao{--=RV05eyHuRh2}M
zrfQSP(BMAqq~D1X581+GSnJGCYmQVdMOzwAXA9o*^mlo#w>U<tR06((oQTR67zaR)
z2LVPydRw-9d9IZfbO6gx;$KZJ9*s%wEf8dU6}C533>P&O^LOj%w#a(4jxw{@uEeZB
z8fe867Q)X%MUyPnBq}aAqYr?KNKXM<5o(g1{J-S)Vz&!4k)>R^4<)JI5)CRRH9JMy
z*hKQRCmzvSb${Rd!1E*9R~yWnxyx;N6Q}*8hU=B~HaCp^>-yw&S)Km#SQAXmVlyKY
z^xCv-J02mKr;T-yrq>)O)mg_BpnOghT7>7SZ2h{L@01Vp1o<miBMk62Zyv6wxQB<R
zvgy-O+ErXtZ~bbj5-xynJV=ktrss6u6@esrh4O@ai?qap)T(?ZvQ?E<4TlzlGLw&=
z&zLwoFq^|xs0HS(2;9539eGz;PUEi*Jr$Q3u3COf0dc4}mjqYNp4#hyefsnfJ#tfH
zZMLYDa33K}(MQFNLy_E9W1|vN0crQ{y*uDRH%PL206I)BC`5KX0~Z`qV1;u|-sQ_G
z5oVDm@TnIQ)2#78E9z6Ds-|`${hj8UK~RKNU~R&;nDR#~NQ7<&t*oh;zi?qUV&r@D
zEv&x1mimro@`XCSd!IgHMwpNZf58J`ABsSV^-oRMukF#q2z>zGf%L)NP~Ggfp*>PF
zUpJ`4#>6a?+3o2mk;wL%5zRW?UWcF}XsE+neLMZ(faID^2A#TgEkhmt&@sMeuU@%t
z^Cs(c50U%3=JRsOh%KlVl-4%lVtsXiKWL>24TTsR;FT35ZdEVB%R{UN;op3OBN#q-
z@cGC<8i=PWKYaYyW59rnb2{Fz>Sz0VCgp<rGYOcp;sWWNeVv`VJj}OOqnw5xq+y7&
zJ#jo(X<6dN{X`}5z!B?yyb6{UnOkcunPmtDAg&Lh0Gd8_sybdD=V*N(m_b2Zh8t1R
zGsMr&rOdt=nkgHe`<NQ~G7#<x<r0>c-M+nJ)Qz7n1xg_(3}@$+fF3`ge(b9hixyjM
z`_fiO{dynVjS_G#vsHQKvln;Ub;HQQLc9>HV#F3@=f<MV2R~J8nzQ=fs`&5clxIMf
zlUKF=rb?_o6N)T=S9335->9*p;TWO^8<$_as0)?LE+}X|F?y(bq-?;cl#~?p>w_qa
zAwoLEU3urcZ0I=`8=IHQ$GN<Jbacp^A?>U8S;c^qbXWA-4_xibaov}_PE*06`~Sj~
zg5IKx%mZ&n{kDoPFpnraPo4gh4jkc|t+97PO-HYV{vkOL@3zOqUQV*Gp$XNn9@Kui
zdv7LSD9)HJ604smOI{lE=r8XT@XtGcDmht7Ps4qM>bP--(c|B6M4UN&?;7R4W-L7b
zeUM6fo#=aevm)GNrCWe#YbrK+$hWUv&SynB4+@9!<q^=;{bYSj@Q;6fto+7FSFc`W
ztJJ%<Z-+=7YNL~2jRUbi${TFFZ*J0>{NQN7)^PuTUTx01PEt|bIo~94yryQ-n+8o4
zl|_Yt9j9yhojy&>X=4<sPg8jfm$mMh_IvN2`u7(+m7K8nMz~hjZSgnI>;;W53N5Lv
zUBJaRSD)RD@?L=4L_<b}9#Z}I&d<}{>rCY~MAfL<u7?{xW@aWfz6WACeYI<wQe*b@
z8PKxu_mudSg&EgpRM(AN`cH1&RrR*-=!#F?3f@wkZL1V{>gkVS4#9UA$^gKbMSj~8
z^?>1eR;G?$`}Eu<t+0>nZH$*JkreT9x`(T(s%C+ft$TI^>HkUe8+FgvSXJ;+imDU9
zHbf(hqKV;JJ^#x~^7p%rq3^ok2cwNc^oViehL0WV3(dntC`dF__jv@;aeln3cfe@_
z6z~UvyLIa}jU&z%UptuN1DZ`vCrf<)@guHjbIRxpo&yarI+!`fc9DI_`;TJuF(x2}
zl0_&*om^b(a=#VYyEYf*=l27q=v`4q<03O_u(EPDnmmdHur5XRXi!;v8IUB`{lAYZ
z(ZBiVFdMH<Uh;*q7*z_1By;1+G(muvTajeE<}>eJDxQ1jq?q{gb&&xT?f*jAgVyZ%
z+h+B^EMh<h(-Y(SN+%>GSsLkWIQU>XnI3i*DSu#KV3Sz~9|5XiEhUWR0ORVQ5qk_8
zRBW3Tn{N*ebB0_+#G3^wM_6a9=roP-A)q|%mU<&{clpc60i^3y^hc2|&)i^F-V^5J
zCcD+IJ^1Yh9cpfNn1UYChoO2ssHVm|wLL#<|BBFm(Ks)s83IDyupvgnd+Ysu-Qx?*
zE`kk0Uhx`r2p2<j4`JUB8W}ij;|8(~DeZdm#Fv#)s*<7sJ3nx?a_squZ!{C(DW#>n
zPw9MvMpt(Es6WwZLkRP|Sumw&xkJ&ui11!gQerdpH%Ue$KUiQO%$tPdif18do#Vqp
z!o$K!9vvMjG-y00@<aH*aq8;+w;?{ZeJ|ZBETG)9c=>aV-M^schuQ*3NDzEAa|cft
zCpVANau8mXDB^@bBG=X-S{szO48t_*M@I&e;=}}{%&gP<<R{Rw4n`w0`-JtwL;YsZ
zN0`;08uz~&{!}NY<-EM!vF~4U4{+X4h76%IORtt~{S$DKCF(tU^mslvWB$jovRJSm
z@fwP6Pvw!tFH?>svZEUmie<OP8iOrc5&#|?YD8YLjXQ<~iE+3?)yD~syN7{n%{iv<
zIFsnMpeQr*N$<=|isbm3H&lj?N%;E@9kP#J@$vSrhH>zLeWz}pLNRmm<BE@$&Ti))
zbQH)iG(sEI0<gr%5a^UY2nuLT=fPXx@d9!(fCoVLK4nUfFe3*y`-8UT#~<P^Eq1iH
z0$n~^JAsbqyi!L1y;&zdZ1VV>`-ZvjI!JNa59WPE!XJ~k<NtNsp^n{*+DtH?jWWG>
zd-E7*g@no*Op$P?lae$!C!&H7B5d-<n_VYfrnHl-^?o_B+9rA_^KGSwF=^MW)B?>v
zOo~jmjl;3xF-NsS$Bv1^vUSS0*kqq6dTFp$ZT7k8hBAN7*_kJ@nX)RY+YcT*7-%g%
zGH^&?FnLuFjLc?>0#N_t9j0$)tX&(H=<{GzDTGkz?_agQQErIlZRNrBT0D@-Vj4Qi
z2zFO|F>mURW|*{qZ_S_5vU|Y4@mIdR1D{Hui-NYO*h=w<3zuI|P$mL~2h^V14XXMt
zKg?1U*2|7;`c|C*BwW>b8wL0EHnV#tdMi<U-~FIbbt*u##|MHxvn?*()o7RbU(ePq
z&Rbt!U#~EBY|Vk@t=_3m*S)N8bfB(dtzNgjeP2~5T3+A$|I5$s(Yb2GVxg%A;Jf*D
z{47I?X9ikDtA;0*SZPQC11hjm?n_O#HayPYg!Q-D;%u5d?_AptlwY=SRvo;aZq3a~
zY-QN1#CX+&{(!+!EUY-roMF84LC2sX$2DMBFthwBdZm=B5~qTd5NS9wkO9?@m*-D2
zaDDng&6FoW0TqwFS1Of7Ds?mt?*(QwI)kgl&?hr@r=C2E9)ww3uo5tZwA75`{p0hd
zpRh(FOvq!P%FpkGYxSkEQ2Xht7tQlwHFMEZg_)qxH=d2}z-;KUh9-rK7!5uCl2ufj
ziX0Mxf|aPXM)yAGmUZh)Ua4Dl^#<4GWtU!Du8j=5i2f<$*&HmC0w%kJDvd9AvhTmp
zcDM5ph5hg&DuD7*gW~h;c|t}D7fwT^CfezV6OW@cex9^>Jq0Cu2v>w(r!Ck;P~u#f
zS^p;2J6S8&x$&p`0s`_QtSTDZ#0IjuI=g2&MOB8&muE4Yg$(@u+6go`5_EcZU%c?6
zKqc)tChRct4p!oj3&4!~H(|nc1FwLX(MF3Gi<vGl|A_JjU&@M&35khR`ZJuIltIq3
zI-Z~{qL1}Ny2k6ls->!O2OtdHD6lTiS*q*6(Ov<0mxo>LKD~ft77PJn&?m-KM~obK
z*Ea6$r%(E9Rpg?G0;Q}hZ?!xXJx(&y$)|XZmBBFbD_CjTzC(w>Lxy<qiOG`vrFZr7
z(PdC+#HdmCii%DjIns>`322MM8R2z4jiKO?T-zOyOB1&a$c{@xBi^BF*Wn{aR-URU
zur>`%Ogu%wOGXjXoqS2;ej<Sr17B~3Fljhr4mk47P%uo*NrcqHAnvW%_&Ezs=s+;z
z^s^}Con2jrO`6mfG{n^sqyZ)35WOBe8^UgQu_ZkO|6~!0H?lnum((VBD@u6t4~1h*
zjEscjhwDU0D_VHAcpnWKp^fV2P(Rhs7uLs$!*IeDBlsx)b#!#bXz!o#Km6x>>q@2A
z+v|Jz?reA{uA_|1%%w~FB7g#>!Ik3?=~<+!z@Iy7`qqnu(5qFj)?-`SaC!NXE3Lix
zY$&+CoQl05+nNrf*PLTBc@-eQS*I>~{BV!r@dO7C;VTkyX(I{L?gp_DRW&pIU}mf*
zO{l!{=mX_UdUp0Dq?Pmx)EE@qGH3gopH<GPbFFgMp*uU8o=DHi`XFR;OX73aPJk!?
zp}`!g4QMMdKM>8MW#gA9S`ch21vCNH^AL|CaQF)lP!td6)H22^D<66EXb#a!WD9T)
zhk@IYgYxYahYf41cPuO-;@0IfVmc$$2Y@KW(rCyM&ie(-Vpt-Dn=Z$lW5Rf@sG-=1
z^x;N<qutk7#S9fP337g^u3ZmTS=?J<C_`5$W?#q<Q>Wgp(#j2-f5KWp&sM#7{prc7
zb;g&J#GtN#X&|NrbPxGi<q(o<+W`U`UEfEM!9SOziW(|93fU6J`8y-IVsAW0+NAhW
z%xH+NiZ{ucB1}R&T%;v1*%mJfQEo;Z+803{iLoiH@$R#OS@>2+2w_t{jg955*36(9
z$1rFKn!-?f4=^0A8gUPJjR?sd2(l^i2E7zVTX^rHFPMyqN;kwXd3$=yA(}OUrVUf?
zhh!(myfO1<P(=uy5D|v|Hd(eT6R2_XFF5noPV`-MsB3y1d~g?rRG@31OhVT2E~UA~
z=xqG<7PsvU;g$@r{$L$jR_*piO#y{K1U$)}`a5F&{hhCCqu6TTEhNB)2B=?6t&JR@
z6`zQ#5UG;=QSIA?9N5V1hkQq~Uz}JItke<D-2XDl^g5*jL*}-`slo_|G%Z#Ua%r(^
zL~gXryWr!8579gcY8upm;w=)a5!gL=up5pZAoT%T77->~Z7f7PDeNXV7D9tAfq^em
zE`umIh727VpzWbSeX{uMyOX)U&>;$mQ^$@Sso5mJ*BGd}$2-Y_A!mWEW4$cAA=ND(
z19&(SsDktTz+R1-6>OlsGWQ@z@(~;@)cZ?7E%1JYaC-Z%A8Vj8ay&Ty-DPC%9~nF&
z0oJsF^~-*(3&8zEkqO7Ji0K99_{5gb+HA_I02S*4-y_<q^RFG%$GDtJzPT+cvr2S&
zBzE_#w9cJ6SwZ<ssed|y=UcO<z<HwYw0sI#p<g9Q8buAF28HhcnpY$PhIOv*GC=K6
zyMbGTGW0M7ld*%9dzn&kBHI#}SO6t7&uJ(D#3BR&N|MXW92Wwvw6yf2rt3Q8vxI<Q
zUc881Qc!U+Q`NO{G5@Ux{vo;V;@WW6)6-X?dnKILm3Vi(;p(>-cRjsh8Vwf?oAa)R
ziIv0aZ+(%X5CVDtT-kf@V5egWE91;AQ+X#OocxgC??Mm|wjF5Es+J$daVej%ROVMo
znHJ|(y@)hd2kQX`422FPMqv@LRUo3AaRJOi2{pV?HH$PZio(H-T!5qXanK&36Fe%v
zb-78m2WuyM<wXk@V?e<@L}zG=d&R|ryi@N$R82xIOu#zZmnu#zF!Gs4dW_%!C@rla
zIv^hT{fH4X8Ox@DL+-?k4z?Jl1Sg9u^#ofW!ulQOI0&0#y&l91VjJ)pOK+F7_9p((
z?Tzj|&#&xbxjM}Q<-K?=LLE(+&D45uu5EO_J*zPmaUFRh$gk4U(<=^Gk2HH*IJUNq
zo1*rr40x=esqjnkYcrzogY{km*c=*h;-zEfAtxtnjgtCT3-GTbL~hr~ieFQG8!Tn*
z3K>S&^kOPkjThBAPW33>PFG(MOm{ONzYiY=7hqR6)xUbpLJ>nDfg0;0ga@GVUMh0~
zkL^4R2q@~z*)3CNcNpQB;?b-^7uPU8spttaBc=dT@L~*~|30A89Y~rjWm&-4nfwIN
zD1812l14CGgLpv9dwlxz$t^Z{n4Zw}iZ%(5_9)e?#rSLM)B?ra=-F*wJHYP;Glid2
z<V)jzAQ)T`#e&%929_Z}PF-Kw{1x5aacVH=J%o||QTR|Mw4FC>V9s_(h6;V%2LwBE
zt4>eSSvzXX^6Agh(=R`hKV&uLEri1J7cWGjLdROly%VT~e$b@o2d|wc^PNB_w?JMl
z#T)uSFvXrOA9O!-3nzly3)7BnZ&`xVPJ*xS(xIbE2Sa6J_{ZISEaaz5xt04m|ICO3
z(@FQ#b0vd*e{c7<ss&_U7u^gT7pU?(${~gt`B_vA8(?0*`oA;zD4d3n;>n5>5E!(~
zC4?(il(gAI^_<zN&M7Z#;tTR|R=_A$FaogQBUDt^FNZ%8OOS#0naoIKgytcA`&@*w
zfR$JXiLAtrUP@NN_@I=_Ficzf3^bQyTx|yVQLMjkMuOU^RjZRzwyL0<RUr1?OHb!(
zcNE`m@1X5-C{Kkz7A{3_H9!(^TE5M0VabQzByAo?P}2`k*)5jZZ~_RkQKQywCo>DK
zK)7iE32!cP?IXSzfX?)&JR>?Q$fX~j?{)ow*fnt3f@q4|akqaSBPa~bP~po74FL!f
z#SS{xp{lBWv@8esY`bWZgeLtT(Hsb{j<?hoxK&Z6Y5)3eB^>dn98ba*P-08k)@HYE
z)rx8B#oWP@bI|9%0<fk|M2O`lD0vd;J4!tePPYHPWniuhCora8Vn})9m5cLp)?IdS
z5k8=(V<ro6GX+LcP2xcWU*8UB|IXg=%OzKPeFtBa*!a5*DXRa`9o;m7C<+iA&85Z#
zE#kVrZ0`QXV#4B9V<|F`#U2u08SqHZ6Cec@NLYaIZ1-P9>al^@aRH{VGcoshtOQ9w
zbH~qjgY@+ESEJpbz8>#XGVP(ST=<DY)w$vRZkgg$!e_+dRLiNMDO(9;Lp{g-=X<a@
z!YvT-q1eL1D#-}b*qn3r%h3S>#p36sV{bp2np9M{gek~we#cNa3t)_=L}3i_I1P)=
zMk#d^Pbs<a%{$CK9_Hr<fJTnr{&S1#@(Z9>j4s@v$}?pT8?_3#$R$l~bB^6SfG^7(
zEfXz08hQ~wArQyINJvTd^+W+cS%AoRuYf6;yd`s*ea`K*>&eAo-T{Cbe*omq$io1d
zhXA^WFUdQKC2!+2Ef&(2=S5j2@2ow+wO_Yx-7T!`6%-XI&G~`W=mojjN4K{$<t8b@
zeX@3x8FInE3sw{jqL{@1<1@XsBGq6-K;9eLsyeDVM!MH8L`hDzKp?Z??fw0tBq7A-
z1fE=_7C70XX#%B`NaRT35Ywgs9T1{~ZZPtckDYQ~nwWKQE%6fwD3nKqFS8Q7+S|N2
zMem}iX+9=y3K3M;Q1O3>2GrvG65&QVXwV?zC<~t=$1<LXt#a(OYe%p{jzRkmH6>Io
zri_OG2OBnWVjmFxa#y^f?8|AsMPGBJHZU<inHB>g#pX#+I->rokD8KtgSeBdVwqUv
zR4|0}0m$d7$%FgZ*@xG?2o-_><{J4n>zpiOM#n;1q_li92VD~N-{#xR%ahZwn)BtY
z)=^!G-^(+f!~?K<8L=`a%y8H4-K9iH1|Y=ARScbfX{fxsCme~L#?;xrVIPQW<p|1{
zMGIbEoKmSq3#cEc*4Z!pKBd2St3aHbyvu2JT(kNg1k=ifAJmsgy3Ics+t#9JPY$h1
zG3I^|-rkz!cKHt|+$it5qkOXsgCr$B>TgN_px*NB{zPIWrLf4rw!*K3W=-Pz_wUOL
z75V)hsN;ONac<u}rAKXV7!|4?;DxRU1F-PiD^mS&)!e~tNad*t_l15FZDk}NHRW-%
zM2XWQl1@g*dO?O6o$p^bX6#rSw9aBBn&}Cl;T6G?c%6m}34sA?)j+9NBG-qb6t)9!
zvG-t0uTHux`d3Ojj&e2~pxB^ZR3r!BAZ6rFt&g}FT(^WA+TazysovIH9d9e|!tAdI
zi-hh{mPO}Cg_9^BGqYeyxlWmTZgcIMjf{v}q!)2bqPN!BaGRJxwgLVZJCNE)Nd>>C
zpw<=qICH=$znbi*sLa1!?7eHiCN3YuJv2&U#>XP*F@`qZzP9}M{(VAI!r8Nf=xC)W
zCgH)vlo%A#B)57cb|9?bM~`{gdoP^PuP=vpgze0PBK}qUyLpG;mQ&L~0>oN)F)e_G
zU{`#+x_H9l1G5BhdT5~5DFhzsTiO0Qhl&y59}E*!qPDhn-cM}-&@y04olleiKv%DK
z0~w>C$QivgO*`u4*rcNQq$QH|8I2CsJX)bYvVHn)17)y~jUph}(-jYUm~?KH0WoNO
za)X#NFwtMV?m)i1_~AHc(lKUagg@|poL4Z_A2})^8yN)SnWBHKsw#MYUy@slDqwxR
zjM=gxyP59Ez8y?r0B)8nS&7AmtLuZbv`f#@cLL;L;I2w>?tAUpHMbAfUp#*<tbYt;
zTrclT+?kmu5V^lb7*S#=VxCVMqD(dICb>Ax{n6S9keK(buY2LBbfiOEV;O;(Q1q^%
zBHdH6H6ZTqUSfEYh<S%JF&V<v+wLp9A>Ghjgm!!eYd9?*<IbPIeqF_1Ne|KvB841-
z1*cwvJ+AD0dVvwJcEBX(Rmsyry{W5+kHa)HPGM%nBZc^1=p8U-t0Qy-Ps7yX%&Aju
zMXZ6@Yqj6!<<eTi1n2)+HMw2y+WA|Z?h0i+h8nHNaebAIM4Wdic1t2~Ns#9(3~iq}
z#)rr5O!FC#_D4Fss=-Z*+DC+A9QBTH$N1nVaTj?9&^Lg=QT?0w;xb=STI&2@MK*EZ
z6{Q80ub`4eqXlU8(%pixhBsTr)dEU6dg_!woQZEHtq*k}lOQfU0kscP5%uH@B0W@)
zDl6}{5nbb>NAf&%%6Yf!VzxC?w5G0Bab~BD3d&$kk5SaJlN2hgrKDHzceKc{n`<pZ
z`NPN0T@jjPBesy(=0^&~eBJX4ZvE6aXTUtF{Bc1k5Y3`Wb7gkDY{cThN+lqn>T1%-
z3F~@jEN_;5xUNDaHtQmyX99{C(8Jd%%_de{QAFSe(^tP0+l*I5(>U-)j-5(=)0Z<x
zV$oZ$Kxkp2ET@WWNn)Tj+16BXNdkE8D{~L02zgCcE=tSw3lfkn@*@^@oF-VizzOLV
zK)cK*my;9(9>)~e-|z@)Y6QB7JiyPp5rUW20}M%T-aMl`op0a1DNdO17iF-@md_>O
z^-e7<G^@Ul7huvHlXJq9cldBdiis?OK;f#X_x}W{RzmB15qB+wD9mei0>*E!vDQZ8
z1&(NXeQkF-fOpi}Kv72pbt2t=Liv^2+gvvs%QJX-H=Rmbp&li52!snVRrF+_0VY9p
zfF+`)Aeeycpj$s<Ts5d2Pl#778aS#sx~{O)3xpaf6||t9P1}uS#0ah!rUX1<NPU(e
zj!|zJbLzU#z1E2-j(A(sH=+A7GtK=Nox69hHHv&T-BI9ugzTYoqLyuR%bPm;W_iYN
z#Yr!nwQ%e~5hf>BwBpX=R7xM_?D%b@&cPfk{4<!Lz541UN{|5q=FyC<|GrGF%C!|0
z7|m~9d4m>tk;nSBqC#)$)*rnNpM^!k%axXbdx5kxDm~i+H5hr<iXcukf4QnC{sa$J
z(Xut@MT6T~2GIKZR9bOOLDoqr5?Qq_{2N!F4=*wI7`lCGyw(3BQQMSCfp#~feTJq7
zKD9d@<DK`GLS-A@U9@6!LI9-#KJ8^4OiQH~?^wjoCrhX$UNu8|(*OPFQKLeLlCuo2
zZawmuw$RooZpZImV0lTY4Qm;U2tdw3K;@Mwqb`(cJ<7K(vHC_qf7v7LJ9RQ@w_?=D
zkwkIhLE#Y*@3}^NClc3Vxy^+-h!_VH9!Ay)114k-L_^_hN&6-A@<79t=W=MziEI}w
zZ*TXxTGy|8cb(tATJn*d^%-YjZ;{sFuE90*V#I0ze&@!?n@=}yqk8_E+f?io#2V9J
zB`$I6xcpwUZXW;?i07A0Jq^b9_Yp-R?PMoL0|bCYh@}}9#Rgyj4Y}T&Ibzh*_S-3Y
z-Zu~q1=3_{+>rd?)XVhD0=N_QfD=pBvA?9C+SbMztP%)qkZ>MI&fL^)wo{cjX`<+5
z5=iNQmqnCGp5?#^Yr7#pwRw6@$Hd__ZO!D#0r)e<fEppb-1)l0VCV=UDntJogFn*K
zES@(+V^XWT)T~mrw?|p9)7@@lY30&;U+FbVkc1Co#smqUGX<(tYdPN(AyX7Zw%9F4
z3VCIJ!>t*2M=JsvwZtWhO}9s9FxMucMt`p9$?;bGrEPA^{5^@9k<rV$s2Z~b2to1G
zm|z@!k@;}3idg@w_Gwftt4%+WeMFJla3d={-SBrdi2UFgd#;-*_xu(AF^mdVL)LFR
zRDeFCTvR?nY0^*CMwv#1xcm9@=R5cAg*@{#rt<|A5*#%7-XiJ};4c~#-NSY&0Xf-o
z@O%O=SpQkI2L6GbJEThKRF)0uzX6@+3$|nVa!PGmsUbNwGHOenJuuXg=i{cXuvxe6
za-w50<pW_NA|gV!ad?Ib6d7{vtMT@+`DPBS{8(d^Y*+LFKhY*PjTCoTv#$bUc^Knb
zy2E3jtZd<xRN>uyV1<pQ;{QljWEI>`+x^o)m*H;XtkPpL0j~T1EDcxcNO281IPry|
zy1GeLk3<ypQI8@Z3~i0|mJv@}S3I?e1@Ss+ee;>*={?ZY+|wC+K|Ufum>zaO;mhvI
zeG#wY#yfoYC8@)`WM%cmm-fb!*U?sQYIv?$Z5(49L)F#$(3!zVPz4*D_+3Yw#xa`t
z2Un+y+)+zQOBT}2K_c?EI)RN=A*qN0=P`Ot%OKPP30)kH2UL`>ojk|FMJ`W<4``El
zGL;4x2x=S70%C5!$6~0psUrP0t-@fv9#reLBdQ6~h&$Wlbor{y995pI3{p^mnLImp
z9fl4R5IN*oVi|bJLu7Ws_lA}Cd|Xfvph+QFMxY?9?ck7{X6drQay6PeA@@tQiR~^c
zn+bb4AFhdZ9w8~vtuTY$idrwwKAztImBxxKPbzD)Eq4jeVzWX)!GPF&(bA!lutDy{
zhJqs?7O<<o7PR;>WOi8$OyJx1XNpJ_S)E<wjM-D)!2fQneRG<RON)08Sav{Xo5wMk
zrzYAhB})mGm?OW%&23eD0ngn?^*e~2frJ>ov+F2-eev%}-{JtGfh`x1TB%EIA7F(e
zIA=zw72xlfFcx?`k+y@hw1Se-J^{h9gHz0<D(C_JsM7tNQ!A+c8uLxvFC^1`hTZN?
z)#iv{gg^>J8{gL$Hgk#DRZ4FxnWP6!ISI-k1ZoV`KpHhXUYeJa6DB!zmZ1RbOl~(V
z@k`qnk=s7bI!3Oluf-Flh0^W5P+@`k7=)x=hE{)gc+di#u%e${=WB@x3CkMlp65hU
zbmh7{Nl!1ON*<0zYnESg5=Is{Z<1Y!zf9=b#-+D&OHN6*f1EM(l52C+u<R`Ari$mo
zN**>ZE0_I$b1nN4r)p?6Pdj?>V1~hfrT*3dG|akrVKD{O9DHF0LBxaIVa)9;u<28%
z30+Iz$dRs3Rp{(#%Ah!5y(>poP;Iz2SB)6842qAso#}CKB{TtnpVUoE@<u&=Z1IGI
zt*#!gJBT9BR$1r72R8>`Y~yxw=v>Wj*kigLmHtkS-3jzmCl-$eRfQ|&CAfN0uUvX#
zHm@`MSfADk)AtJS1C1tW*G}bKscu~Tsen9cb~G6~Uys&I+a;ZL;i%A0L(q`Dg((RQ
zAtG;-)(F<b0+^}>F{g~GJ4|Q(Y(0CjirBs0d+J^g5wZRSiJ}v7IA|Ery#Ypu!5wsX
zpomm&`w>z^iJ(mGIAW3P_9e$?!R6*-)|50d{F@z!9Y}ERD0+Z&`ccrroKO{`h55p4
z$-y1J>dqS62uJTr!pZ(bTIKf(2^LG7Qps$9=f)Y~zd+i=j<bk}(L$Lol$Qkm%7$9L
zlIQEZkC5?+HI0W*Nr3s65#^!Wk@w`5-yp-Fb*-(hEj}x3g+vz!Q_1!>)F*Gj5`}ga
z%JmIS1d0ZyW`exMzJ7MB<=vyu<%~CR`R@Ujp#?w2>Y82j%@Fe7p#rEEKUP4%03<<t
zjITH}2n{X%8X0tjxhrfSIce?&E^O}SkY#r#cb`gZWSMSrY)y;Rj8q$Sb#*Zl%Y>+D
z%C^ab0OQ(dZrBst8j$DF1|J7KbSm~}d=_ppZlbbvoLR1Gdm<Z5UwTAlBB+`(3at8H
z#+P19+*<60Xn}d$SLOW=@|4e;L|H)VvaXCaa9VY8W$x7l1`hmfj>}a5i!oy&P5l&P
zd&+i}@_v2drPENqjwF(+m$J>13RA3xTzIPeA41XoZf9D>llq}jgI-iMa7MZf8pP0s
z2a}h5k<Gw#2d56I_LVGVU;Q5X7^9_2|89l#vQjp-ksL4}Xx&ws>o~fBCu4d@Ts40i
zKiHu28JffJYH;6Dgm{Z%&!2Lezhuerrp;-I+Hikz_RgC&r3`8PaRXPvq)4}$<1ly$
z5WhVFZbaW!cfaR?>t;8rZ)9eYC|AP-O><CoVd&U45CQH<<&~|3T$5US8#T>O`g(h(
zsi@qjD!PXy4;1K4nB2C_5nqP=A;3X^;>ssais{{T;FNvF;nw^dAtY{ysUId327f_^
zA}J8wAWTc%0>u((rlD0k%k4jg(XyeE0SZpz2AM?NzvS=Vr?JqbD+qw_weY!-;TRuF
zmn25(`O)Q1&*M-N_NEzu5<^MCkB5LEb~yrTer3?!+w$kzK9%o#1!jd_ofq+JE#P3W
zYa=4faXBg404a}R3QIO0&a$DZ^};iEJf&j8@8!@QbLPnNQDKzX-6^&xiI^WtM_e>@
zYD;}jG5M3{wfx!&on-xsARMf$L*Q9Tk$87iwAg$pE^R5&;w@Vy0dQ7+lkz_QY*B-{
zIdFkXhH9s*p?b%fv<ZZH%X}_4UW<<QJVpo3E<+(v4w~f5?wgrHGeaDL62R5m1@M<d
zC8y?C_(+TcAml-RfRkZZHOC0+@0?EWEqy3wLi`q9eaOTu;|{vsMieRyH7&A@VOclR
zQf~dUUGADanF*<9A+s%CG_I#7DX7+3H}8yj(YZ>WZ(HZ3KY7wSHFE%?A*sZuBh~?Q
zf$`sG8vZ{?Ri&FsgT|0Fw<VFY(2=8{12B7D@{R;S%TBlu0tgxgQw>+PqKgOvO%6u|
zSiFiBifh;l!9?7xa9*B3U5is%5(^yT3vzdQe_Pm;)je$Y>z%PBB`e~VIk_;BY9J?n
zIcZ#4e`hbJaFPAdPd0cB%yLCjOgsicH_UU#Kg7}7ZuE^^fpVD}SFaY4jaW#jg5Xy;
zA5^LVSJD{92%Ar}0pTW5ADM75$#|~uD25_<4Xbrfw>}k7LFtOpf&TK(*G>>ec#ASZ
zxUbY9t>U~}4r_f|8uH;Gi&bECt(hnnG!0X<La{(Ak?S$pD+wtsKTd6uQce3)fc&Sp
z!J*GKdEmiO=@tm0>9FysSyY~Y2N;Z+%Pa{tYXZXN{K(8Y&$#dMp{60v3Kt$RV5F(@
zYWh<&E~`~0E}eWQ`lWTzG@GIpG-4_auP4x|(V&Q!iBD#TY4zI6zjnTM@0C`CYYRMw
zvG=e(pev&PK=LMG8+%$`@kOuNif_8BZEg(oPCb{>=%Z^D8#_?4A+{pyK<TGXg{y~k
zB7jn<3k5$TNg@|9q#Y|5c0WG^8KPbLi7l1l=+TGp0w{U^{(|u*r#3D0UfYb`wBw>6
z2-RL!-+08|-@mC}gt=R;ZJe5h-m}!`foo<|o^c^~3;ibuao*-0)%x?!6nyZgX@5nX
zi4G$pBbViAwpYWGYo@>W2R)|(6+@kn>G9=jmj6%sqEmhZ{*bDW`8>CPj$?RKm{H#7
zJwo&3-{Vx!A!rMQeTA~!%h9ej;X{v9^qn&mV6AgoRpPTo>tfru8%_3Q;gExi3VQbM
z{}{1cZi&Q3E9wb)S4msT9kWMV`Pk<AnBMckuSSk+pC_!~^I8glXoZfBf`TX|<oQ%F
zpRq>4{^eC~d+UrfHZae?b_2HIB#)Gg%1xA@;4ngn0(C%LY(>PXyx|+Sod}`_QYDfm
zFp|mYwjkre%JIRfRERc!_{(J>N(anh`mg<R?r6xT@DTe1{0`8b_&ARHBR3FJ!pzkn
z4+3UegRIte#kiqETZhRhu_4MSq?v&(11Z&U%M<T|lX<?DPs@G%0c;=jfzV4J@qxWE
z?JSd1JtHA{`(Fx|Ehf95*z51pSxVJa<8<P;bu*CI36g2yfSf0f9s%=QF|BU?{i$f{
zmc_X(-@h~#2Bfc6Y);ubtiwu!Ir5`BBqewT+ncqObj~x2jLUub%gq1AYvZ#~k)6yg
z8lP`JWS_kJ<1QU`PJ1<dPOH_$jamVjn|AE@oH=HE2fJ0Rg7WG<PXJq+>|&$h;-Xzv
z_hVz@&sQ0_c5!Cb`lp>YZ!Y5=>f+nVQ4LR4j@s4U=9@GnAQUB&%=n8PqqJi9$<4G5
zW<(#5A6e65_nhr=t|M&YP5}~uZ6z>*C8}dwnmeJX9X@oZx8RStPeDOJ=sWoN>|YMG
z*G_o#Fs0@#P_Om4?;=aj9g%aep<-9v*~z#@BX8g6mPDQWH32^;FyG~+kD9O6Y;)}M
zE#2PMHaIjiG{b7##Iotd?6gSZGy`sPP~<aW{x{g`_4L{90M}LL+atmI{N+nU^~z;O
zX%dJf!ugKTs&_lv4HR~(l^avj-gwuUfu|@Py!~whlgteVRPQ7n#5^~t?R&n~FwM%j
z%a^KsyZbeE`Q@l6lgGIo@8;qv^S4?r@r`BazghqwUY98UML{HUvc8H?9SB<e`t`q-
zgd1ZIAtuxBqnSw@)Bbspma4g#nHZ)jsBnlG4Q@RhI+RUDbARNLy@mP2rtd%51}Ji}
z@VjG&g0K(ilA4LF6rJHNEMB07DczDXqFoz9ySndL=i(CEBMfV&<Z=%My;VRScYa6|
zGC)^wI>R<Dc+q76oHMQw^P~>Ntq)xvD<F=moS=!iRi1ztC1(G9o7fT7)=3)Q%zke1
zo@;V@J+GE#b@SGN?l&Tn51X@;kej({?_SILFG()jKb6_q+b@38x=+>m5qgUbTQ6;t
zIU11fLX+JOAdzwC3y4lAvCux=nrD3Dn*Y-`cu|75&<9iYyteLRKDsx~Zg^bjqRRR3
zA-AGxP1id(@16y*UTA5#fK~)W*NX}<$XHx_7C{Q=vydGkM=#%C&w?^Gw*w#%YAU~I
z?{`YMov3xh+{h@TepQV=8%RLymOGxnd`Z+u5+cph6CAX#EU;I*nCk^{p>g_Z_OCM=
z%Ur6SOzz{@<v++>T^T;635`g^r%Bv*Wxhu3hB7yj3hY*1{T8qSC@$b*3`#?I$5=#<
z?Q=UjXz~p78xjK3<~N?Uxw|Y9UFC=9jK`0`70Y9~zP^Y`W`yRaQKLp3?_s+f@nG*x
z3U?<8fg9e}9|XjN8qs9&kekE7F{4+cwgTi7vMRFuPzE<~S2+P3uWFm4B&>GnF{A$a
zbgOq!KmbPA>5m?jf8B&}Bg0w1hSGhNJt;F{*t<S|Y7%55`Qa@s%qV6egvbs&(poY%
z<=hVB5IMFJ8FzA2A2jm#+=?Fx85`o)>|VC?;mb4qTrcl-*E{xCmIkWT<;x>~+GpDv
z;=a9Q%d(9+g#&L@+b;W7W;<@ykEG(3nq%1)Z{1q<<7PyJ{>`l1++J~JKDo9_9@<-W
zyS=E7_JXyR-#(`;9=XJSn0qxVV(HODR!(<J4jR8P@8{3&T27bqs@<%+ZYtcD9jU&1
z*Dh-JY-&~XOJZmhoi%${evVa-06l|g6PEg<_(hNH76hnVEShKnAoV<BB3f2t2EiHG
z(#@g0!Mn0->>5u$Wx`ZU{{q(_;#&#6!N{P}Fx?$fZqR@U3SUSVRICNHQTwIQCn{<j
zS|Zji3q9Suc_Aq&7B^2^kM$q6VD8+?m`aAPrgEnt+>N-|kp0B`>J5G~0ov$*<hWC-
zD|0TxOm$%**Kvn$(Rt0Y1&7OiQdTa1A{Jf1HkLnoIoN-s%cy$o|CN7jO80EiYUtvC
z>FsJ2$uiYWn&$6!)^*YXeZl>9#P8yZXROxLF&QV+wsiL7r@yr|)f>;x{XeSC1FYx$
z{r}dn*Rf|fvXYGKWMmdXM$$AYGg+l&g%BbVhlC_abt-KMNfa`UQmK%H7A-2}|G0C0
z-*f)g^*h(^d!0jlKJWKy-1l?cr%Y+Z?xPYonzo)JBwmDTncW`UD6X*LL0`PM8^oMc
zu;_v*jU||iFV=;H&1%!KS_&mjHOJQ_2W$|W1Kz|JM)Ns;0ljbNZ|}T!uy<-cyL#5f
zHeNmHclWpIexF{qi>g^mxDL66;VdL;7`^#CF!dAt=l;>B&+g@M`?oEfcC$yOp;^ka
zXUEMkAzpIH{rp)AV3~^b4`t<oODh55dxd9Ob(a>1TSclanXNnar%e@MOH9~1sr;mc
zrt9TZS_ldN|J^<N=}yI;pdK|oe`DI27D;c?JDcpZ511X(-#zhajOFrFqj$wRI7Gnh
z2+1=&T41`|!QP%?Z{eQ$l(g;>jLoT2(a_wkOywzlxs)90SNyz%;r5?NL&jwpmNqna
zmT#pVtlQ>Oi<-!dBaa?EI_LDT-7x;aR43>ozcZKVAQTeW`U|T3{T3<FoHOtcaj;mp
zmRw&tg`)>E;A_G(=bWDqnHkA#CN^fd6CifkJ3{Jum+w~8MAHrjg2jd8;JpmJ<H7};
z+*Ps+rGhFqCsfUBZd;y_Ilw(a4)FYI=X|`q_knAdU}z0uKpWR<UBD12hIT)`iAW##
ziE7Ii76AT=Sm=XzA{KV_a>&W1;z*+C14HpxQfs+&E6$NqOjwgno@Kn0)?Q!@jM@k#
z`y>VH9NNOka}#Qwl_0r!1b2qE4xDcM%p<ulVapC|0pdb8w9ee;>^wVqTsA6*>3KKR
zco+s_cB)n^Mx;RZ#9PGwT3nHA_rJBixBK22m*1*;Egw_eYh}f6A>8ZvW<G`cOpRyH
zo-Hzk5hH>#hl7U5%3|gVSQQ#WB<nn{{wI75XSgV8CRP&w=$*O3Hxmv+E*+HDJto?p
zhK(1WTI>2p039VeS_T*rYUTRyxd#Uip~-zmjBur<mZ2)r3}Z!7GklZ$;K3V=bRT$T
zLg)&J!lQX(-^Lg0*mU<QR_*J}Nke*89Rrev=Y?$k92Ecil#d%4JMM<x*O)q0<Ol40
z5z~s2gdZ;%NW>7J&0B>79PPn=5PSnG<G3OfRfH>IIFBl{2cH(oRccuj!S2R8Xn|()
zeMgQNBZm{#0r~<C^RVbZMa`C7M@*}{?jIO90Q3{o#o*=A9R^2>6Ly<a?>{v=sLAUc
z_CH(1HdI-h_%dr<@ZgGMLuQk@yX(4p?^Hue%k8v!OaNUG901)sc0f;5k46=hDyu@5
zVF+2GFktlPic1L@>%(TBFTEQX_4<OQ^^@d%v6I}Z6(WodnwWiaK71InxdKy>T)~L9
zAU&&#e6s&;DX1p_RCExG)I;pR&x3q#LI@5>s%TsxhYIJnVv@@qm<n<eMEyx8*Plv#
z1;+oqd-uNi`0>EA)ruI;p?*TP<u9QlVc((ySv<~#OZy)>TGT+8;v{N~l&*WgYj`hM
z4-z;NgydG{GSdYkNumqQ8_?_v8CWD;NdTc9VngW+<T9j2INWQJ0@3T}l0~y6*n^gO
zFG4&%ez(>I&4;1*zyg6x`^wOvvs6xkMMDg5BciLLto%;?GaOFuv13p3rEcWcyzaa$
zbqg`$F!R#G(|Bxoyk&<!$M43fSkJ)4tfWdtLojNzud?!-f4i@UkwNpfPVB4c^Mb)T
z)_VPNs`f}CnudnJpaSeuykdDAZgRUGM;NP}87Xae5KJ%<p#rC)7~g25h<3_WQGN9A
zp{Aaw5jt?++(z(%wu3$i3>`z(=6wHfQ9_c{OnO_bBPSZBJ;jkDJ9s^N0D>VR7x`;u
zA;V$x*(KI#U8m{(fs_&MnCz4|PDBakTeEt#31t<i1R}7|&`^j6iJK7P8yph4`qk0F
z=cy#66k$Kmg-By2nF+ireh+pfY9kZC5~5c3Fhrd*kc`^Giaf=p4*QUtCnPkX6`__F
z>zjaBT>9^5mx1j!ll}_RCB!wYrHB!L?YHTbqz4Q&KjHgwEvqq^-7N7uBE>{3A<#a(
z0LM5JB$yu>KDC~%x6bd~x@qB?5V7I<(UeKufL6E0_xtO}jJ~4F1F1rV@b2^H^nkzp
zl#|F;Hw?D^y3gCRaQDdS<ceTd_Y;E#CXE2V^T@iQ<C|XYk+L>a?qA)^7EAn>sFFED
zJW>;4CgJZXk$zl?=@Z>;3kgV^7q=^We<DK0>RAqxc8C7CX?<~vriDq(jpwlET(TST
zL}SupqkImo`};hEL;}KvCm`5ELFly5@PJ8j{r(jfgsEcQym{iJ1o4Yjeo^@T{XG^w
zM05@b30@>jhyYpgC8_2PYyDF5!zVIOK*n~oR!??n+4Zgn30PU8;ir8Q02)f0MwL<<
zZ8rY{<+VI!uJZlJh$~*5t69o@&lb6Y-;v@8lhi|YiCwN0#UE>p&6rEXwkc7-oXlO$
zK2BL)dce!8FKhI2XsFw_?46!eG3bRJqf92jce=L@5I-p>tZ1N;%CzJi_xrj`c-}WP
zHPctkQ=eakUVTATjMb<->Gxq;6VK`Ur&ts(A7Y?}f}pj-!{`(Lwt|*Zo6|f1*dd_E
zQb;uIKGA+Gm%@9*eDsz<xHdVU8N<N5mLpIjD6ODPO(>tnjvYIH*)lEO5zFmTK!CWD
z(T>cXGv{!-9_ol7xU?mm&7NDA!bx&KITl>*n3#}rh&n>Ja?U?Vl*C{VPc;lZ2WiD$
zV;;pYvqxP(sS-)kvR&1h&nCIj+<GgZMWhsfIY|jL4lH}MX8n3L)Lz=}KhK^O#T(!L
z)BLp~Fm=$|op5)b$uw^vV8AAJ?%dh0uOCA0U7SK9mgc;1+c$1X!~&WF+A;v>48g(a
zf|j$aaZtMR?AiaKuViKRgF8cGVG)_zNp!Adf8LyLZEjZ42>A(E{R1}e>NRVYgQ5c8
z1)<hOzJ@mqwS%>-ag!!ZBtlhky;<f+m(9p-*1hq-B6L|A>Yt%Ezw8M%sg+oB9barC
zOGccd{bX@QueRBMa*yv)|Kwqv>SXFBo@T#c>3TndS|p3*?$ZVakVAo-^YB<h|IeGC
z_McthS#U`G<994HZou_`m2zB(iaT&%z11k&@n}GK^P@OC(@l5PoK{W(QdH`y?vZJv
zr{}xsDnlCJB!-SVT6C^gb!0(q`+PpnkRb?iBVObb*5puTQ2B8k8CZ4NW^5T+f79D{
z^wpT7g+*NsHAGtM7_ID?cKyQ@H38M}aUf(c;jD*-GbkGcEB%fFjk!bn;jJVj5*#ji
z%lNTl@pnF8V%J@cBeg{xcb68I^3-c9!ehUR&f%9h0o8=o&aLpAn6ITerrayK5g5SE
zVTJUiXoJ=obaq^nqoO8`YWjgT6Mz2L0&poQ?o8q&avW(Ir3gH3Jwh31d?@Pa-L0eE
zCa~=pTsnNky{5N*l+2do9U2#=Iq03jktV(zQb^HYoS9&=*~c^pQx07MR~h*<nV<T%
zWoH7Z9XxWR@W$OkK0X7`;n9XakKOsx+2Ry;u#_rn7QoKepR8wDtB3bWwm~<@BvW1W
zyt_x{{_nbb+ql10u7NR#lEkq8BaJ4L5ITTdW4%*fzkI>KxLf#3uAG&j%f`|AgFxV?
zEXDFLUr+DhFU4mcej^O$z?xyigP?Ik@vdrDu|9VbdCfKeiQEm#Go!W<;<M<XZBQxM
zH@qcboheO0TK`^I+O}&~U*+!`Uq>9L9VC{)U-{-M>!LPv&}=0c7>4jjt)PN*Eg5*p
znMj#`hXW8<=GD#7ycHe-OvwTMRSB1A`?0Z0S^0JamCZDa@dir2#^ze!HDfqNP?G8I
zP&J6Cmo?g<Qzuv4`uq#&$N2T;a1>(3VM-2dC{=1f>76-8X4E;Q#6U(R{2j>~60hSr
zD@K!>K;dp>lT9p{yGY!z;8WObXK~ta!(&5=c)t?BL+a488*fSM9#;xWv|e84?+h#8
zf*y$E7n_x6ycGyOR!L5M`t*YRDFl%Hu3ppeB`ZibaoyQD{Y3zd;eNcj3mqo+wMUj&
za>?OChXi3|dY8-@c0QVv)uKIxP=w(wuw!w6Nk9_nyHz7(0tHDW9AfFFruj?-!p@Sw
zB+Xr=Q$8R#Yu~o*(_9C7Dd~+zyy3_qDQNNivD7;KqRUrAZP~%t1g;;ECnBQc4^pXp
zdK&xk363M42k3+tA7Y7#?lN-qe5ahMccb3yQ#aZ;nw2-{VGirMPJKu{2Izz~O|sK}
zh4CVKqpC7oFgk`COPmvOJJB&y>h*Ir2u#I4OUM7y)Z+yrZYKrX1JuhpdU}*w&HVlS
zW$*v^A-2trHexIwUHd-&Sf%B}wxjN$av*YHI>VNE*)X^0Y0xPw8l6)A$ESHa--&&5
zdUD>IH@B%^rJ<(lMm#tuEw!kq2vRiMPx*OQI2{T1{^D%><5JQ}z_abV+9+wuHs<k9
z?8lr}t#~ZRVTxT7wk2$FEb`p)_I|(rX=*GT+nPp4nT4bsGmXsn@Bru#oZ!4nC)R8y
zh{vhY2At_as)<p63v4ROgmgRXq|~|mKj`RQ*igx%cI~-{Q8h2ua!@Y))ay!Wek8if
zrmb4#mVJ7`G4PJ!-P6-J`tToaTJfw4QP6W^FhJZ1_>TF?Cv|KE9s{4`Y~A{*nfmId
zz0rDfJa7AXkm-V3gLz`YT+`Mt-GMP0pDhoQ%{O91JCrv<c!Rt}?^6YX;6iKGs#QO>
zKDjh14F|!Dy=D)vvY9xf$$7`C!3+jrExV5&z_|tKnLxS^$kV5wM+>KX9Ly@>d2=BX
z3_ID-uqSm6zAdyXP3vJkrL?_p>(=?IDqHFTcr4M3bJ4*)z?mj*`~A@(-$jy9$>}6T
zqwEj-fedn-;`4$rlXq1@PW*ymMZ+Z=t+aDAH1y^+WzS~H5xV(n3}rBV3^K48oRY4J
z!@>6CVdQqPJD7oHg9uWw5|Op_@y2UkG7UHn2Ov%&n6^zD-3DHk6GMhkl+$kFln~V>
zM@aFPDv$+^6p<rGons0{o#}vy9_vS!!p>nX{98Mu4OVSx3(*CDsF7CrSg4w!O+9<`
zkO^3#!5&BNAcYR`fJ}};+PhaJDLOr5&Np%eSa)+RECO`2?78X<k7iFa=Gy;2Tli_~
zYw!>$j6hi40+U33th8Ob0|b)FqtB>OiwBubfc0;!XHI+1@{{Zgu3N5tc}9ta72p0+
zP9lMEYkp{Q=rL2hO6!ous->l+JD=AJ0r-xO_cpBFy$&**9pb+DP^C^m#|Gn*DvDd+
z)J06KuR1BZy8PqpsGP|1d*@n=>GU7W;ef-PM57f&X8ko?NAv;*WyKmT35(A~!hkwH
zYU-p5MYsxa@6pa6YMuv@V>!|JsmZVl&sL}R%s>}RJwfgQPj~Bh!|3M=5C?(roI$Mz
zZv{&mW0d{rZZx+d21ZQ4t{*;_ncjKESiNCgqLejr5t!pgP|2_KU8KZW$UK@`?>F%?
zP${)<-+n5;j@T!?Q=2R;Z!-?GNfW1}WtYp#%2qu(J)AfhnZgjiVv65OtFbfZ#a!8A
z8G5Vg^I(=EZQ)8P6T`+F^xawa$Q2gq>(6F*RlCatpFL}=U#}kx>W>~EaT1-b02MAS
z33z!ac|j1lJ6i&zXl>M^;Gj7p>M%wn_)O@sA-G7SB<0<gV8CuL(6rT1Xq+!ZW?RC}
za*IxdQNfqh6^)6O-gGKZfHzk~KIj}&ci^D!2{Cy`A&&>1R$6q(U7^pSw<LS0xg*7N
zB^j2<df&@l%oA-ZziMG{DH^4*<hlMN+7}uYaz-Y+gjQR;Ad(EiSws@xK;ouA>3jys
z_dnR>+O<HnIn+?`xDS(VovB!#!mt>Txr*W$eC0NX+)VmB3OmfDOg2HtEX%Y`ifR_J
zKwm!z70<IG`-;bt<fB%;@T{Ho3$1-4Gw$eCCQ(Ic-M{4M*pQ<_D=e7?wi2_SOc!PA
zViec`Ph)(eN1!3!pC)%DbAvfr@p0o=7Ezi-W+F}4g;fupoE=l_nwNjDGN_3v<A?6R
zZHpU`)XU=~3P3+}naA-HNSevIb%Oy^kPOWpG;>`WqqM;K@CV$E{r>o)L0wqur?C7B
zew#v?sOEqEY*Dpt3)xER`uuzPH;g{hc`k>JTRRJEV(hH4&wpgOxhgg@H_uD@?AT9J
z$H!rH0UQAO75AD~%lyOitun1UlOlcgtiHQLwu-x#SI+c2&9=5$bQ>6vb?sxpsLn7k
zNty+V*cvPTG9477)adz9!Y*iVnsb$+U{HNcyJOE4uKxJClP*mqwlDY_p#txzeYrQ;
z^{`%Dj@-9O&{OV2jv)fZDbYtdIk+wJ@6VwIR3?c7us%GHN9V@1@72rGIV)A6z?R_z
zpqHzVA-*Sk2V8;DS9%-nDi#bAcYOW)a`+`OVFJ^MQQ!b7Nr_An&MY&su<I#Wk_I~~
zlZ*jw#LPsnrVMhCE@Ry~xvrQoE=m9Vi%9NO?y==gd8>yGh(H}fUsZ{%CYhfvImq-U
za4A$XGg&M0V}$?=R<{1waecCx1F{Mm=6q0=zVooodP`4E1Ko*&ox4YLWdjDt$ERuH
ztp7kiv(4#w_Lo@K7z99s)YD0JV28?2psHz$NHqS%i}n=s92MXVf`PCR`|VQW-rj*U
zP1Q8<q_**3y;E*)qUc3gefHcmU{}NGI3;lL>|=Z!UvX{e7#q7VhyYCPcE=5?KRu~D
zu6Pif(`ftK-)ev4O+py;K*B}&<z0ubkSzm$CM6>6!U1&TfSKaHDI6Wl{qTFcb|O86
zz7?rAFm<1#5p>zB(r>JBulZqn_i+zWqNcPBn42B@{GSN(MsI`5_KjHKaL~uc+@)e0
z|2D#a<jXv0u$Xt%9mxg4ISObFiK;UqfXBmrOPqA#VWV?xI#3$|MRYqdqjDp?X>M$E
z;$Qnqv-@HygYRKbYe|Ts<J9PC>Rd)dPc}2_&mx<oH#DNiq|Gu97Q|T3H5C8fj8hyI
zD|O_smum-5buV*UMV-huLErWP-AT`*-%3g-xAw?*2L6Vs#p?*|WdE0IZD`kN(IkBt
zG)Zi*+-~?&sPCWxNgI;0pBm<ij6ROL-`?(L{3J$1)Dx<NUS2e_ftf4!IXRVbv@t5J
zT`kkpo}t5Cb4=6gnIwyGorsi%21k^_GV_2`C5A_0T3O;p0l5mLhd@liFeKUB_!iZJ
zl_=;KFrh4825{q&$$YyV90s-uGDM70WRwo{KBk7{x0CbqGtL6z5CbCMlo%m#tE?aW
zB0c?(qNeB%B`T`3(e{f;1$hBm^M1U_ZpgKsXlH^04@>^CaWh;MasGE1W6^_!y47jK
zH#BpiH4|8b8%T6y<TxCrJK{2o>GtSmnyTv1MS6NR3pb9H_ucJZzV&V^P5&i1A3p2{
ziQzHo-kglL-ZC^eBqR;an01BT3=P5@8Zh^SjF5{lH*d;7klOGI7f2SKftglZJ}gqL
zzn(MpuTszN<t2)Xl60g#m8J2cZ@qE}@|^9zBwQ^J%oS)$IwhRV-YvtFlg5r8Z}s5y
zO#}8ZY^`^J**uGB#}zdn-a4Upkf3;jO{33qq4OO_TEE`Z1Ubk6jJg=T8D+|iO5hxR
zmKU?HUH$w2v;Z$ozy^R2i1$$COxJ+ad>cGOlzS5CFJUD6_cvylaHhc^ibMu9Q^1+1
zN3X-b<%$gG3C@7IsMBa&z}>i0Nsj@hJ`d)~_mZ9#EaY>Eee2e(rR}80D8bqYT*9p|
zH(Ezx|AfMjH;55y_7ksV{$`--;sB)9q!*Nfoc+Oi^;W*ddrwMoCIyBNmo8v`{X%xp
z^n(`;WU4SicM#d{1WV4rbjsO|+QMXHHz`r*Be?%0+o%poDLQ{%QXW*3CrrgPMnXOq
zzP^p|r>*0;d*J)tA-Mn?xE@y%y3D`CZY$t(Zj3lw5ts*luH4r7_eVcG@etM3YY`g%
zT^*2;2PWZooKRg5LTN^Z@3V}l?Hax9k0VfO1u9R*HMgSYAe>YDDc_QTjR8boGgZP=
zBmE~KPpwg}Q-cs>ar@b`1<kaLzR&a4DPL-J>)|TW4~{uDY2$kO^sHa@EjoPR*F~^}
zqKE3Rr`CBR6*sNvR1cC%J;vo$UtlYq`>vm+cekZX{jNiIik6+bp0kfzR@{msyDjj0
zi4ErWwrwrry*V~UfkdE(UWm2<?Dkjh$R~8W<?ar6&L{YQ1Gp#e<yX2ZirKj2<UqrN
zFQDyEZkQKai~L4U?@YT#R>(QBvtwTb2fO1hzkV_v*4XR%^}_(@6oedws4Qqp1Fjwp
zx?}S3i&4qfuWvv3cwUK&lrn@4m@kDq)?mrFZ?if40e+;8#a$k1XWX@AW4f!M-3l0y
zOk@1@Ck|8j*%Y%qdiF#h|4G#}W%{raep%lo$=O{F+64=CD<`3!v#S1f^ndW{-B$1J
z#?M`_xx#+GD5Grcx)7nLSl~T`iGaxYn7`#l?<D&k<d;61<`Lu0j<6=fLYHmu@yrm3
z9p}DWm_WOp*N2)?3c=hG7qA&J(mQq)DT-JoWRxJ!6$s%%SqP7Hy6ffE4xOvpk(!d6
zY@VCwrWG64>s#Q>ZWWt@VLvj5CI0kyk(pWZXLaWcRZUQk=rybIRL#|rn%`W4nh|Vm
zTeZzl=Kb8Gg&kVG-8uf!5BtcD3Q_BJ1mrr<dC;q*PS{_hID*XZk4d}~Tf3&IF@#-;
zh?L8!E>eg>H#}hWzgc9jO$to?3Y=$<DW;LUZtdE49PVVRNv;6c$^pO&k~Xg;S0+rV
z{T+NhI=XtgouO3|yLzQ?7?M19>eLLD4Q&hUhb&DpyduE>xV8yl3Nf_w2?%KIza+!F
zVMnN`#f#&GCy+b{lx<9gcTjoLx2z}sMNhBUxi-q(TJASecvfBq$9+r!xbSI|6>#M_
z2cWk%!>2`a1O|BU@Zk>s3&NDN(7m@r)`Kq6QL@a@kbl9ZZoIH}dy$-Esz%5#LYOq!
zlmIz05s$C9VDVy^c`D5kxfQ%}RqS1KQlie<x?GEG7-@2aZFu=|BlZ}TGy1VSsMsa8
zS}J<Mm&2U~mJLTI_4nV?`2K7zL;@WrKYZUY;D{`>r@E2R4Cx$$$l=oS1-<@FXa)Az
zX~~$BvR0hCqGG#n;et_5r)R4#Gk`6|D6s6elcr}l$dO=DW4?n#28<(9Y&v7!D7Ymx
z&NQ(D3S13NK2BgDB*PNB4{L4bRdmoh<%<Oqszc3Hk-XP&LN`8KwE~u2rtX4mlvNDd
zH<4LPlJhRX90)G(VE5Wp>M+)4?%9+iujtx`!OB(B`TTjmkx(D&U)}q^oobDX%GU&U
z5;`p<n4<E1dl|{W8e!?tt#dPqPy@D)y9A9tXGedKW{7{LpiGI~QG?*_R>AW>D;_<%
zMe-iHj>_*jF0e#gSGdC2u5xc6HY=}ayKmyGQ>GG#!k^e(8Z*Gr(J?aTsM=}Z-=Qu=
zRs9o*rtIA`RpURvZ9iMd3;wZHQS<Y{?wb5v3Hd;YE!t{i416H`RiIYX2YBs)Z(R9g
ztT89_=3sbSd*S&`HscTjc_s>qLjNlwdF~awW?~{HKLidUBO|VT)j&;GDSeR$cOhnp
z4iyfFx?d*rggaShMyENR;RpaTE3X*F0=1n6|16WOxc^Z)SXXZg`p<cEY3PtklH<Bw
z+990)huvxGKTyUCHwC8jt0gz1v=VvH?zx<y9M94H+NQaWP^DG`V-i6<-RH<|k2^M5
zz3(h41j^mtyLWf4Ys?RQn>*&Ojep5!Lg^vtmLt2x@1OcM7@z^f@St+iFXf=uYqJq&
zuKN3X`})%5H^CPw8n*anMnCb&4czC@O04I2gz^;BV*mH)>vD$;yl#aZSWUzu17&EJ
zMbHjPNND?RAD=u^_oFfG9Q+?$H#AP_(lMY7FnZ{+%#65*i?1BdjM;Jl&3y+=S2~`E
zK90l?;+kF!r4kw0m8`~>YkwoBPsn$=0&1|;BmYp|G>*1vJ7EY2!)d=xNZ43n;}noQ
znW};<6==;IvJ48s*L|kFLyKn3*n}6uo%(E@V!u{`=lQ?8t!AxS)BNsZ35<fvA`u2k
zS_uFs@>=q4ur+*FQ)kR*445N_1EABl+XwcY|A3g5R{qE`|N8(w2@Q`pjFPb7Y;ISw
ztMyK8yteKoe|eQ&|K7dp?ZPYFUFr0ikTk@El655%I9_#1e_AY_z33kaV27Yb_aNzI
zoa-cRqdP3G_)>iXFO=-x_!Te-*E5T?0X49Z3PFa70ngsEB(IoSHYudqr!0m2Pk;O0
zMx~6^nBjFCx%BJ`_28jJY7}%t9U^wX!<CSrYiKx^rjR3B*MXFRVeqrlXDn=k^q(*k
zNN945BrTCk8!`>VhdEfZ8^UL^OWZ@>oIQE+9+s2C6#it;QEVA6Ub2AcyNGgD3K6$|
zf4dGHIPg*jd1$8w^vIkyMK<ZjL+abjL(=H=#=I<#Kl10nW_h#!?A)=V1iA@v0bzjA
z=IWwrR$ZUa2RwXU_VueUS|T?UjZsli*4cR%=WVgJHb+m*X+t$&XE%1wQ@?-RXBKTI
zwch?J$);ImAe0==Kal)W&I&#{zQ+_bo2_o#qzS2jx5awQVPw1Bo?BZq-Q3(p<>SOj
zR5!h3$moiZ1e<dsq}{Yp70oDsB+q=AzCNY(?Mjnl)TXev1RzPYD>|d;OP9Wy?)WF@
z0xbcECv|YtmEkEbU&<&7S`BW0U55Zq*P2poZ=KBfX%jh>xGq86siyK@B+r$kbIais
z>vZe}JwN-Y>QY!(Sh&;EzVutXa>=(sultN12*`kyMF}aJlR^x9h?hydLEB9UEb$^F
zJ&DamhEPnK93sM$0prIn`*dcI?f2$m{xh1U`{AZ0wYXs7Rs^9_4pzIVwx>RH7v%yS
zU%<y5^eu}0`v;Q+qhMDs9L1;YW%K#0L7tiJgj+4#_@Y5eqc$xX6V72TqXz-r;_z%C
z@Vdv3)NDth1E<$2y(_#5?lVBZe}*qfE(zV`m3f|7IqF?0bo7|j!orDTHDO^<aJX5-
z<oUaaZi5kznjkttWy7iZ_Xf$s2c40I2Wb2A3-$Ypl-ve(9@tdFUFGJ)(2Z|?4=}E1
zwM-H|r0JwjVCl*$65ep^rax$+6Th-CNQV<o4T3=ckUy}wT4&ib(R5wP$KYGXd_P>(
zA6|hVG-|^!t%&8G7_+IsEY&)2&;Jc$iCFv2<*!oGGRbD}2@5+#n?kh7--RvuK@|w;
z1-|SZhm4w|mLPH61`E-}P(MEfECr?$G(6?&kJC6i`67(rtvXcHnE?jKwc8x|{6VIS
zFj88CZ!(;F)d<nnFh=4Lg^SQMS?=__Kujmth}<TF2)?9_z=PbaTes8dc2+JbYk5n;
znt(+fSNs&UbSbKvI*guR2z&<R6_!yiC8?SEhtgBhkYm)(Ie$h*MJ+1=8^T&K>GS*I
zzh%tjCWhV?Y7i?tKHa*id8uyvtu+Xbe?!MbCkjw>>)IBH`q{EYC{g}or%v}d(V}hc
zbxuuB*TM}Bo6B<*)y~801$JgvEFA_99m*)BQ9u?7yHK`E_UU>15>2!R<9if;{P=(-
zOt?U*dr4@+;<JgmV~PK=cr*%F)HK{(@OR_C2K2kW#80R4-km#duZ47EP%Aky+#f=M
zM`;gk`N}RPEHsp3n%<OuGq`qQuqW&v{{nd2L;KqaGkp>?=kz~OqIUJ!YD-JYh|iC8
zDpIGN%2mlLwuMICxyj?1)4z_s*S*IN4%{AMzQ%0TsxOqrjNx2mgmYG+*Wuiw%ZJl>
z@kq>Yk$7ZT1TB>-fVv1_TB+!@z-J^?gp5ft65>)*!}Sxk_TgfeSuS|=*nWJ_F^ii;
zoF~6mG{vx$1N!!D2xr1fY#F4ENT&}+0mhymMSOI6rwE#L2}Q(CI*3C{%sV#$31Mxq
zA01XmdU`^!S&Y%o6J=I~D!^l$R29@FA5-3h|B4>@AogEZ#UKU*bL{r*)ODi<)e4ym
zUr>@UIp_4Gowf72J)$~=&E@w|C*Mg<F8=e08!jCDX?tnsDN=NMK4NZ&Xt_n#Lmk2K
z5#%}HG|_&Ql$zg3C*yU#m4UeK^?iFWiXgz4HwXeCygw%%wm&HYF>VyOc7F>rdz!pX
za9ja*Q5@(1h{I@)cPOUf5oz$yp+20?T>qw1FR-B~yJ!#JiYy?M%M<OH#vsq);$o5|
z2UDekIx=N<HrfKVx?EKX3P<@{RE9t{l-Qii{1Gzp_dHy67kxkIBR#?R!VN~0;HX~f
z#%s;yDiunU!hF%9%jd|)lr%gT46g5@vZ!coPo5FCC*Fo@8H->l{$XY%44WQ2beYHq
zxcG6sq45^#ugorGVjJ#>?Y6eup40dwO#GEG*?OlWF9vf##I>*G9aH1_-#UBd-|EXU
zj=rW!)AkDykIs#EN(wZQzW@^mP&l4w2YQn)0V@_W)A3A2LDcyNA5e6O@`^fza#Q-|
zkq;nYrS9Vpp__)Vf>Pw<Y}q(&v3eMd3ICoSXF4S*GczWx$cm7xN0E4zaQAaI4o>XN
zPh40vfxQnd{k60-fAgI@4K!fjgQuu@piISuvz^Em%I;5P>c~A&;8Jrcwb&|WH8QLm
zgo5(NrMn@OEvZ{H8QfKi_>o+6f!`~Q6GI;*yHK$#Ffs~IUQ|VV^rufY&#v#|x<Xhm
z?MsaNHgmHpD64<g)(CP+NW8?(N!<Ul!3FIK?p*3V<jvru5ma8Z0}a?m*j{!#Phbp!
zcw*U)=%|t;F0IO-uBqb#4G>Qa-Bo!ccNY>5`HNKgdU|QoT?!ALze_zzV1Tm-LYa48
zPh#Y#lcH|AsH+v4b!^vebLP*Zr`+6*0}&EXfSMNJu7drSJ4pc_FG3?bJIgHax3tkB
z-~}nL9vcAdiXxtn*#yDbaCYEKK^kY2CSB^;(MRC;CCX^RgbC>$=>N~l2zvRtXlP0$
z2?*hwv~{wRpo-mAdU$x8p{|W(GWTgNKpr4ZcS_m;prraI=X{jtLsSkqIXPbErk*mD
z<UY8j-9~|V%=&&qzK97V9)0&YK9u(nM|jSJ1Sc^A!L<=^lkwJWD|)vHwcm57#}t_B
zPD^-VaFq&Y1PHF=yhA-p7ske;$K&slgh89Yf(G$=BhpTW@5HSHN2vSw2=g|9XqT2G
z4GlqZBq@&2EX1hh;(U)Tay+Av{ck3RrDIfD$cSp9x`$Go$S5+D`n-2?%=Z#4&*XX&
zIx-H4qMN1KGG=%Aln)8=nF!$|E1JQzhdgTv%++1}*_;olHY8OM34n|@V~PV&5MvqU
z#!V~6eomyThWE7`9K`p4gVh6X=d)L@4lo&Wj{;PmXsCHGzcaJ()4MEwSlW#>?Lm_W
z4HyQjAftcr<4a}DTy)THZcJb|co!!Ma4C(@(NY456+S(E8UKR6i)?fg&JB_kzdJWJ
z<_4Bl{Z;JFNtg`b%LeAgvpHaECEFPyc3Zbu>o#mK<tO!E=sQRWNRC7<$<2KI_F25f
z%yhZPQAsg|2rw;ZWfSZ*l0(yZNbXB+=wI=kA}dA`02<FnAXr~4m1rFPUf!qXZfZI@
zepFh_#(Ubgbfq~dh=>l666Ug9{!U5ZvO2l1_HCTg7C^H0-Mjl0oaXHa*2S3CT`VHd
zX+XU?i6UB{x*4JHyk%N}gFv&CZH=5c5SJ}mCc6Am=WHdnYx*O)73f(?gfNv5)Z!kw
z1(*qH(Ixi?(f%dq2E3N|L4>{C=4P0gihzgu_Z4rrqx<?|O^Pwv3YkO6{U!4UoK_aB
zgr$-O)o{mo`cVPITMWH<n;{5vf_AXGwRcb?9Tc<O%IRhwY%s+7P>zH<ZJG<bA;H@9
zxS-grs)+dqk9XolN=`6p1MYcd7tERS9*4*(ei%9rxe-cA?$Sd6^KRQViUlgC9P+6z
zEGmj%SF2)maAyNYK~I8%#Z-l@LU;6bm`5a4qCtkQO$_}EsJj%^CRKsfv4EhU<7Avj
zOcJsS7sfi{LD%qK(l9T!1rRy?HNOq-o<z?Q(m55+_VzEJ7)M6bLaodBfO2Fz<Q)^;
zi&KiL`Ipjw38{;@jg}Xbz6a`N=u=3J#KuW%OJRLCqNi`&^}^cFWsybe5na;n*)za&
z<fu_hNaSZFw|aU05_=)s$<pu|pPUJ^G&wB${fKYEZ+TAt-}GCHww-G)K27Z$W9OK*
zzUb7OyP?gSHU+uwKK7^6W@=I@_Y)7N{X)IrYef{ui+Yi31B%K16KY)2zqzBZnb@v+
z1RN`2t$2OJPlf9k4qei%_-3p0%xD+6ft9{7gy0?T*&Kud6!j^)S2-!s!nRIzjAr>D
zs*h4Qe_F(+%Li<yLk7UVf9K9_bQGdT2Z$ma&CAzUbDXx<Gx423E8Fk@q}*ksTOU3d
zJ=hJq!i73IqKy~4hxV)Z#)MMtweLvq;_%i>Hvj68B1uxzgz2D8<nMv@IEliDBV^ms
zh{tp#xF~2jLAE0qmy0kW_EJj?cnr=N93Jo3qjVE80FSu9DCGgK($f>sD&HXpq!;8U
z{pZ^7NHG<|)GF-!u?d~T#N3*ud7ZDe?ltidjSq&N$v)}>3rkDE-<3*Vq@<)Q-vO>7
zaeLHlmOiBo=~W(qj!C#Pnu&rh%aDfvO3E-7{=_g!vHNH?DHvqO{k7b2^K}GRd~9@Z
z^`seE=g=QP#Y|Jv+)<u+i0|*MAB~*EOZjBQ;!L+QX(=i983rVS<0!-?kSOF~R?zh4
z%Atl;yaV+2c-l`jH8b*pzW3g|3pjR1?bmPDYoafX!9Z0Mk<+^0<g_&N>(s6U9j~B?
zm{9)r!c2undrMyqk~nd8vkXl0TjDQM4&_ic%*r!sWtJW?qO_z04F?^MA9=BIu~TO4
z;uRT{A^PdnmmUe&Ac(|O8uYqAPa=6Zt>N%ZMx24VG9eY*9txeyCIqV!Ocayt0yeAL
zET2L5qYBA1&F^Ii>qSD*rFlSN_KN#Jf=ony!eM!nbXm^uVO-|&e$Y7`(ejp;-e%(W
zT@nLA{V10PFIqaj`+)*s$SJ}E09r`|Wq(np)82ss2#72mOkw6JWO$1*13)A#KI$_A
zXVB5f2`$w<-&HK+5UyYNWJ;;W2%3;sQDOH!&@Tw1%Ylu3#oOxTLI$5*3q+tye3D!7
zS=6(XcSHX??hq5?UePf?YMQ{CC6a!HPHml1QdZZ00jRh8O|!eU%CkO>SEX%-4u2<k
zRrRWF;H;=Ad@uKd3>jm4i@%1J!Lac#Hkh>1a0SS9(6u&e+Vtb`P2>ie){jF4N)i$S
z7de#&lahNIjSzd%^jX~Mk-?a11c#;yCg!7;1i~XrqUQld6jQDQ6$7EmmxS|@uAooI
zBL5}a89+I*>sY^>A>T2+Oa>DYEjc+=l8z^WUU+{*GH6SPHuvP-BlqtJ!=$;kB|4aA
zjq_hpE;wO&X0;bLhhv+GCdk;?*kbcQTcOEm1Z)qoCZXu)G19oe;ieFpNQR+Rnrqjt
zU3ti?L}Vh8nmHmPOchKQ%qxhpQfLcwJoNJ(jaA7~!E?%v00nWGR6CXvYbHV3fF&@^
zGwHBpb)iBQV=@!=@(wjHt*9iMrv(ROXAe(+7vx0F08;#M>*z{J{NlmLbu5w;#EeC;
zKX=fdJ0UrxWD~5Bzq1KL3&;Wj0;z=X3I}cfx<XP_NiQB}|9uz?yVSwFl6r8pw#v7&
z-C9iA5;aw~>tKfk)~n2DPetg<ZpWl6-%xkUQ~iS7Kk`S7eeV7%P+xL{(TbsWXhi<8
zcX$wRFIp2+2~;X{crque(Cu+!E{4vX-JQ)m_5GKi%z2)(5{Oz9?LuPzU;K;kh1EM^
z&Zw#qJqW4^L^Am7jsRVLAS|P4+))>odEwvIZm_U0`;`UU3L!Il(gsaKhUpA7#PB_P
zK9*o$^b;cZ66Ax0v;yS_?OB6D$1}7M;;%pvDONAj7fy(x>9vOf!;v?L14|1`4ru2c
zvz{OtbZlPpp%9=wB6vq8o#4)<GuqnJ>>9p;FKT$yj2y}r=T<d(^7Dz5(~|5-5zL#F
z7;XTWh<6je83KpB!D@2_VwgHfV7c=S6=_!8T{wZZQyfZPjfh5q+b}2!T_Hg&MdQ*M
z-WxY=_vB9({!a@qj@m@v3WBL#uXK2AhfLXnv-US4c<DJn0e>cEolTgzIJ^31vZ}sW
z@Cmm9#l4efX;_6#UaFLMbl3VK|J8nr*ZGC7tyz-chx8)dPCq|xt^X|j->0l;(aCi2
z<fdwUnjLEttf5h%9+Lc{_RW~Q{;Q0S9kW0GbM<M}yqn49PpWn}WCeyB1twU8J15TB
zJ-K(=ri~ixp8Qm8M%C<BoJAW~PfDz{EdRWiyR04^!$|{8k+QMl<ObqF%k&r2!rRvP
zf1GuRLW3rNo+>}ZYHRPYRiEo|6<ZSM#1R?E`sR)$@P%l){@(@=TezI-f#vcpH#c6=
zmm9X3Fd0t+4IRP;JRqVA@DI;=2mB;L7fI}5Fyz*&Bdit$?4WQ}Hi)Q8Fa!00ih+mN
z{4xb+rUWKPA%Lz$R4^prbf84((EK-~J{WAjpo-o?W*Y40T#=NmT8FroA}WU=s+Xa_
z(FK0Nj3RT`aN3^ROIko*f_uawg(^g%8R=Qm`=xVM<{Hfr0W-va@zAp7`}TmlGWJaa
z?Kk*^VZ{vQiQ<hNhFMou2^B%E(Ur%K9|NSig=g71qbVXA_BWZ`2e=W~{x~1dp7Fo-
z<koO2J;*t6`t*V5XuIgIgaDZFn~-M(VfjQlpK)X!Dh)1_euD?MV6RH5GySSqLhAB>
zZgXSVQ2k5)!=WY7?TRD}qhgZ=8|?|DdINyqnb-->pI6as$RBF8cH`5mVK@-1VnSxL
zb6jwSL`@-h@E+YIVikql^?4nY$Xes%19(E}O8-hxUS+?5yajJ~Q?gdHyfu7Ypi{oO
zW5D~yg98F^k<v}wG~UEPCK^>Hzp?<{y?ef9pYg_2u(p|adZX<$%#%iq8IzQf!^`Jr
zEB7ldG;u){kt-Te4(Psk8E9NsTvvU4$tlDGnZQ+Q4o@<cvkxuIBg(|5_j)AU;?S>*
z$S=M%);8BLYOC+OY}K=mQE@-m<Hc!4S)F<R;{1UJ5Zq9#+@z|Lh(F{=J1Rmvw{=UM
z)n3(Up_@BNm_b1cYBMr39iDoIwrJ6UO@EtYXzV$N#G`I(Z{5843MvEXK}++JF<T{Y
zrkokG(a0wW@hs@w^VTo(A!;BChoRc0M&&Lw;cpb~og+LgX~pvH>BK<l3pN|0l0oIE
zt=8woZayZk4F^~$<GF$0HO$ODTsUwB6p(ILrbDolWdN<Dl$1^M*YTBb0@Sz?Xhgjt
znel{T^~?U2p7j$UA^up3@Xbq`pXt5c%t<M(banf>EiOBKPoG)TC3rmg4ISDhqQB+1
zds<*J1Y?Vzk{eMPVZNTVlG!rlPTT>JgqqULQCouNTgJV!7%a0s#S<hz`OWQ{)LJN!
zP<S4#RGO<5b{G(mr2P3xZjiVF1X*GVr-C4KH}bV^$c8^Mzv4TUm`XuSF*)ovsDEXV
zlHBDelEi(Ccrw5!CXr-FDS+vrPo+wf4q(x%&~@&w1Iwx!{S8Aj4oLx<N0<_P_Cjf>
zR5Bb&4iJFO0W#r%WIKuZW`GfO%>pGijw+5cIL-zE{cfeFM^<#4Iq8px{sU7}Qmn>j
zq*;XLAn|~sx<hzm#u0DWb@WZb0m(~%)W<ijjo~1${nu0%!7|VnuN$Coqo*gz8tsH8
zZvd2fs9CinTc7#{SMj#!%NKDgV9$a?I_}}|m`YcW4Q@;pcWdOfR?V!OI=(M=nEYFI
ztA_rMZu}bVN&C`ZpT-@|objQ}!qV1@th3;i0NY$_D*$*oKsO^srR3!UPC+R{hyu<G
z#K=O}=m!x7<%)BPMhd`U(UK+4A+TUhX&L}aLCVCcUe+ag!KQ6acy~aW*g!%gRc6;R
z?fo4B935XqX8TW#kAAlL<bZorWD9k5o$i<%iw{Bj*p`<s&YvTXiC?2q+f-R%O3O+q
zL8f;Qx!p9#v%yhi85+mlitCaG^otk`B_&4!Hz4IK#~T5vhY++X3}`gaO^akDN@5=0
zVHRJ~%6x_z@Qn!kuj!^gGdmj`)q_EnUl><XiuSq{Vr0AzWMGjQ(hXJ)WdW`=v?|{7
zGFH#d`0S71kzSkl<D$QY@{oGo>kP5Tc9TgWTHdaJ-c;+HRC1TGv&eQi%!qp5MIu4>
ztSuW5c`3xd)perkh<4--K;sLdd(3a`FPzuiB`u(+Guwg-P9{DQPb1MJLj9q%xNi|2
z?)PZq*~c{sA^N{N-Wj(3-W<AWww-0+cSf^0PTYj(Syom3ww*f7;35c5aw6XZKQH#l
zNZWEH8V!#_XSiMfTvV!)q9Y>SFeC*WgWHP^WNUPf-ntjIO(*Y;W5aMqQQH}B)At6!
zTPt@;+BN+y2s1<b2uhJxE;kh5SZ)bhsXxmOB8YS;338$sUg*?<vY)Hksk}7B^3Pqv
z&b3`}<^L*zVRPE5t41fR4b58A;uZy*02o|`8XGqFuYZ=l<R)%@v*(vJsbF1Yhd>Ys
zIX-2WvWkjC?4$gk6wltEac6Dlb2^p6>tFNPg;ZE!v$e+^9mFFcb(M!-wCF)20BQ;J
zyF`<(oo;P4Cko#o!)M6213CrZdXB2fbn$$Go~gBZHfpcw`rha=BpA6k$4{SrwZ<Eg
zeb>2x8__`?+dB~l1COwt+`6nVl@nW6CLHAt@00$HhJc)?neY%8y;zK8wlS}NK70Il
z!{k#w+!ZC?dLEoMv){{dp7R4Yzf><BUl|#|TID-FykxNvd+ZJ6`}6qGU>PcYzFf^b
zh@!3bRO*<0-V*OXi$QCSc5=PbHmnYkEvsJM0kkR}VI@D;C}HsTqAsgVuf(rd4$35o
zglDVinWy`tj$)ajq_BZVmI52igIjfjA~+o!?^a2K76=<j-!12(T4=*j-Hym)CiINK
z>^lJYUF#OB!uC1G?_m@Ont8CL6YNYJ#;1)EBsmu}bNszg2$^=2dG*@XRd*hbKrG)@
z4rch<e}3)tvrhu1g`Y;+C)Jb&Ou|MgK{9lX1YXU&I3MSW7+wmkKtRtx4*_*aQ{Gca
z`LUU*c!$K_i*Z2iGAiN{8RY_{!+D~v9^-L(4Wt@>K*mhr*MlOulsVo_tDpEK0lVp(
zd|s>_7-4m8-1@j|GEMo_Gp0<rbK!_T6c-7Goc5c!LM62hK*uyF0y+f=|5@;TZl~V5
zkK62pj1~1XMboL)34vU!?0$Q`kU6RDg+<Xc%?#D~;%8hVIr<`RmvdmL@()0K=^2i@
zg9;X8guBpwVTVJ*8GL$8(9*F~zS4+5au8;+b=JGVZdx3q-?P5mO5hzmrZH73aEhDW
z8!fQt#wiY%)FfQ;t_q7jef>vl8B3Dv`@BzKR@WRWix%kW4w!V7s)wk7^uiLa8An_V
zM}t+oU-!LytWtRN=RwzXeBU?I$y|Jn2qv3x80Cw)*{g$pqA1-~iNH+_0ouS0&C?<;
z2LxE$R%ydt=2mF%=JRJx`XWn*NCik(q+QgW>>iyZOH$VKnuA|LVdn|`p37A;CUIxU
z=sHRabR=zTwKWc$p$Vw#1?~*HRM=u_hTfOHUA=V<YFvB+s*eApRaF&2P2-(4+k7;9
z<Fxz%KXCfXfyvk?8cazL1a-eLJH^h<?!=#eCISSBodR<`e4W+luIIS}>X?-yR1-9;
zQTBR71Lw(@87jG=@`(&Q`SNsXo8g2qwf59>{X5{&4OMW(m-$)8#(QXYnmnif!tj^f
zkQ5av4awjK{Na3#199_;kG9+>xw$A^(XZ5ACrJQO5{8lwMKQKD!)cMXj<hgzYeXtv
zLJ@?R{vBgbmmB}RwtSaan(<GEkhIsPCCbFiWMm@@TB`k&t7-unEu+BV5s`Uss=jV2
zu+ue|Bmxouz&Ck$2^f|lHs>~{X`}W*6BFIxF7(jpF&h;+Ip^+Qw5w%*`g5uv)L74k
zCOy(Wv-{8)3MH0D!_-FSiq=>3$RPWXyOr$M`qbc4K$yVxPHk}80+UbYE4Bkp6Opn)
z`B)gX)_*1es~kJL=0u88EPn}caZ`)hxbtKYO#!|ludEA<`|Eb0N&pD|)&u$6orNO;
zvS0%RmgD2nq9%q#aL+R9t{IB3fOZKc!{GHBi(j9zmAoGb-4r2NL4hQvxeqlKkg5)>
zqFsliD5YJp`m4(}gPAM|LZ3t=z%k}}@+9*2zDi2hGP@LmVaUe?tK$jWA=F|9<SnXt
zZYLyyFf#oq(x5~nSYJDEC|w3|c1#|s3-_n&pT+bI|0N>xWYMrBx;}lyXyD(O)WT;t
z!+<efg*YgvNQk$N=XnU**|?u3LH6Hsdi{I=Mn8`WgWy>XWK{FG(qIzz2x7-6!2xEv
z{4ceWTK%Xp<n)2jq~2XHu!BTefEx7CbX7h%_WBmF1&O|x;2G*KSC?jfFGaoqqfB8|
zLa})Q8lIZLM|*l}B<Da+4?&v<B4pT+|I|msXL~&x`zEOU>mD!xKW?IULu~!Ayh>$2
zM0v-AaYcUxZ-a(@9-O{g+vkL*PpigL)w;h9yZZ98LeEA$@<O$|KbK6*3TtCN?}q=o
zf|l&ax~RINzrZM1vWA@`KM<#XlYnMhq(AK2*DIr#F8o2U7XO*;ZxwY8fs0G*Lw{}E
zgku-%gnJ?FW*tQrc$dbb4*%;8GW#}qRKcApw$n1(npQ!h7&tFgIsTok-yd0o<HG@}
zyXfsLR}Az$@(6+TaU<NyoPA#7q6r}bAWb^+=ijFA0Ki)_!0gXC{nJ*YK-IHyEzD<z
z-?)KY@XW-*g-A0Q9avC?Bh{HS-9XnRAVr)hVq&7#mPwWTA`DycD~=Co((|F~`9|A`
z9ajm@NNJvm;4>+=$Gg!6)rIL6$-}#6)p=pYG11j!=v!nKo9}vEyQZbxV~|^-)yv6d
zAvTi}vdi!KaXJR4KR`#XKRhiy{x_Ugl&)+FsulPz+YYKEKXBD;95YdUQ_c6WuB)g&
zN|y5Twq<zDb5#VGwtJm2-vnlj5Vnhpl++d#wX$yfhhS}7tt7uquk*n~b-X}h>eVe6
z+E0vT9FYwF-N${zNexYSz__-=J$Ty(jK1mqsTSb^C4xrG(bjH_<eZDp3AIY~uS5dH
zj57>+Iw7O~cwAptalrX;eaS)bixB>ZMuGSmPH36SE$EWSfd>ux)K+&R>HE^~0xihI
zU%*Sq@Si`De{!avyL!KFH_QS47dUSZcw_E_?e7i_kl8u-c+MGgJn89~4Av;=M3d24
z3mYiRCGWVzs^`n&Uv_p3byfVzUOP=IoOIA8p^I_k=6$+GHv5P=2cWS58>0~lozNcw
zPHxqom7YKQ;H{HNK}Hz7*x`)8L<(?{^Y})hJ(KyTd`#F_LYZ&NI)$KXJTlC(P@#hM
z_zS#=jBA1+vH9ffE|FPGJm^1eT%>XSwl`yIseC2TNz4M2iU86tUcHjElI;u|5(^n^
zhMZR1*ggplHRuHZ<%glzLYOwo%<PK6km~2Nht_{^VkOs!!1oxxKpZ9VBzZ@L>Oaza
z77U*c4R$<be<i$0B@k03hhqBT9pmTwE~5OQ8O0OEtDY66*z@7&kk(ud<Pt@C_gvIN
z<CMllqLd!0_id7YUbLBK=9DN-@zzPpd4D17t{2o?SV{QMm6N)}F5KAY?=ZLXOZ>LY
zS8^ZH$$7#WFp0zsU2HYHbsoO-Go1V9y6(l6(_cNRXoJ?@-{b4#-~U^{N4T}`JEon7
zwso9w#E?!c2*!?!on5B~AiIDV75L?Q5Lp5zAm8!0-?3{~P1tB~&kMhhQc1gr%wECd
zRNx{GehE~Tt9!010*&Eoo-5+0w~LvCO2Tm2iaICW2A6~kj7hKlQE<}eAXDV@MHlAx
zQmI8uYKude&z2l0I2@)vE?PfeFyhYKY}Tm=9`UHB6k22PS56594Rbibnad?ge;66;
z&Bn<{tIPyaQVz`E<^qpaamf$U&@iVELiwa@CMJe{PtuD#<AP;gPUen0uI<AcGlB?U
zio0@X_`rd)!lJsbX?fXP21>!w13IQPkBrU_%P|PAL#@t<FoQINTOOxB=j6mOY>!4M
z2bep4Z7A_lyRGuU96^8#i^iRX0|ct}v+DQ;$sH&WSmrKBF$5gEq7oz)1OTRM+x-~-
z&K9X_^x?sT-zP=b!o_g2zTjIJmjdem43RMquAj1M#8>_fF(wD8bva(@{FEyeD!EDC
zAd<m9$)6Bi2F`))EbfehXO0$*z5fNy{4KlUz0ahYHBQ;mk&Da?e!^>!l9;G}7XPe?
zwaOWd;ZwLs5(qf1ab<{TMua2uRq6i8&jytIBRpAc{iw34Dj1H7<LsY^cW&9gj92DI
zG!gQQm$8G9z5m1?N=8~>Ohq@@(9!<d#mxYBD2&8<E$-Fmb@22)_QizhU7%J+|0(1_
z?y9~`>hdJy#`E*(aY7>?Fx|RffI)QlkM{#6Ek{>flE)E1Wg0^0DO+qNJ?HC?8-5#_
zkm0YO@Ps~DQUM`{Hs#z8ZLd1>&&TUQ94!5RuEdkYoa2$*?9NRa*@1N>nl#V^{ws3@
z0KEdUS9~b%#JTmb{og%KytZB2{#i=Zl8f`p^1FVS@$BsObnOo(6n%}uay&Mq1}Shq
zpvo1#zWm!%<Y=#F_9*inQNDy7*Lv>W(}R9t=akJe8f|Fb)A1l>JFy$Nyvc&HE&DWf
z1G^CR1Q-q47d_XA9Rf3Om!&t35q=`e9WzF^lWNaSA^62@O5V5bbR!i}P#hW<3Q42D
zZx8$2whEo3-p`aX-3DY<>_AhXIA{gX?zD#%qsS-i3oR(W8f!i6VE+E4^}e}<-6gGs
zIg^1&)3nV=cuQ<%%OB!T17ZE0nz|)7u_HJ@m7-EO8D!@;{onlgY62*(N_)+tL4`-d
z_H5iZ#5Y4_%+s%HGUCEEEcSnsawXPHFiFn#H|bx8F^0{^A$fs~7)7k3Hq0E=?T2x>
zIVrv^L<nhFcr&v2R&~^k;3d9`UT;e^6fZE%IEm#0zKtqU3Sc&Gg2#PtKJ|G=?f?!I
zBO;Kgh$V>&3iwU>UZ-9)A7>iY8gT@Ow-y6aAaV_{Ruko;re7A8cOAYA-A3`EU13X=
zgC9pS7JxM8k2l&uQ*GhEFJUuuk2qyNrrVL8_vUE2BAt8-hE%+0g07OA<A^Y#8U^L2
z3O%K)cb8%t!-*`xx9DdJxN{MVlqFV#s!=0gjh^gtqBmPshLVzsA`%SwI7OKmUZ>MK
zQ%vEuEXg+P*`D)&QNZBX^kSzxHOEiAIR77T2jUMzKQ!cW&x0$jcqJyic(HJzwJ5E=
ze2M5+ypw87b~{Lo45a4L!n&aCIiO8SYAVsPyD9v5??CW2cim=q-Qgl|xEt*^%SyRn
zYJh6PPYWi#Vq>>#Jk|Y#LytU1uS3E9M~Wv~&arn~ccQPPBDUH2J+6MBwIY3uO_Huo
zn=$`cz%8eyKQUAIG<Moq6o;gXHF2onYe2^Vhz?(!Phs+ZSdX|V2>b&w20x<&W|aCY
zusV49p0;6zOcm>c3KG_)rPg}&n5W%aHkLS1I(aZJbj16luAy{-5ypu&m0L`;7~lcj
zY>##;uAmWbzlnziU`jwxu+dIqpBJICXSpEq;xG`WB)tHjCO|;N;4mo_!5-`3qkyX~
z*sSNYZ=<zle3<$eyNu*5!|o^a`4DF49%r@Ga-C{hQou5`xyN?B&IlaCjUm{S>GWf9
z!6`vU!oJnxKdxNqwLXs2?@GG*)WQl*Mp9nTZA}Z?P&nsNk68v|8aQ28gi&7OqDV-2
zJywx7`QOZr#TF}~Pha0YAoA2g)=OFS3=g=W;`gL`_wM767g=sNOtg2@>i}?N3aR}Q
z>VLHD^Wt~?ty{N-ugq_IIk5BRsy*a)of$#Jdym;JL!28xo=y1iC5~BLz8(=7#;~HK
zf9bn>f<AzFKM^kogb8SK!z?p>RwgbZ;gk9Ml$V`pZA9;GQe1e|sOa(dzJL6&Yh8j7
zDD2Z)t?t5lsQHyB4IL`+o~Jr!(Gwu~YtE76gIpQ|SuoFEVZf_Urm_jPRGYW6)(QAn
z?hFRpa?ssl%qrho#rW174p)fT%qRsUNx1udGl<syR<{0!A*dbdt8MP;{pa>?k8T&N
z36U!f9{v29talho0w&q=-<_0}PWgsq3yHh({q;@z-20c69Cm$E*&{UcbR4;6PA>){
z+U2DPi%8=v4rhc@U>pcZj?z7X<-kOC?=bk<oy#N;(zHlt2-&bSa&te_0r{I}#Ke8r
zI*=)seFj|TGXVi|jDAx)NLx9Tn}b=l7x;^ahfbV7pJncs#%7QJBz{Xh;PuUyB2;^v
z3Nu_>pUOcB7o}b2*RSabEC3u+qu?1<Gp~EMZWa-zxA>KibBG=JG$DP2^>o>sXQ$tI
zg=+rC*`Joq+vu;>PxEuwm>1>4LknM+hi4Eq2Xs9>>`g(y&tV4b*MN)dTKC6e6rxSk
zRU*vd43;fR-_V0YC1#=m5=FX^!itMN#n)c01%#5J_0VHve0rZwa~rlcB#m@q)Kbkf
zT`e)W1b@!{U4PyZx90mU&F;SSqs@9dXB3GtDpv?3DDqnC&}EN-6c0T%%MU&j9*$m=
z6o&AQ)u}BU9S!3d9wKRa7~N%39ANi@lyKlwH~)}^_<#(z`OF{qe#<M0S%<0_!H)|p
zg*gRi4<_*wJW>Wk3Ny=y2!rs&(XA<}(G!}=z%aCukby{{NzOX%F^6vwVP83dMgfC`
zwOZHf{5Bxd`l$t1qU~l>1#sVALY9}g?eY*R!ID*@@VKD(Z{j^Kf=dkv2kImsG!Ul?
z>_h#F$k*INW8&el$;wJW$0}w_LnrkcCLeqMyShwH_cl9XbyV9YI=AZUpbpJZ%t?3v
zoxMzif>>iu%Un!4GTLbsO6e2a6bxA5mI8<)fHT$mSQ=STe1q7*=8e10ykzL}0-Y_d
z#0Nc|{<@GN2}Iy%l$MVc^FY_k!T+ZPs3kOnfQ}Amrn2)J9cR!Q=%PFo$2*gW;HHDi
zTw=zUS#<Xq-m6({vtg}~B@^jRd~cv~TH`JDAX@MDniJ0jEzi6`m4sYE#bvbLFZ*J%
z7A?9zznf+9m6k?C5F)mu+}~d|vCX1KTg#SgYdRq&^UBl?#TI7*!aiMnPtqm@lEUDV
z{WJ-V)~$`ca>eXv4kMHR@C3@;T4@Msx}!QHyJGxXhnibf0|tg2N#1lcy|BXKTWI0w
zt>KwP1wqvs^Agfb)Y}Z|aLB6S>d|fA%zoH9t~69H-(zKvRA^OwVL&SGN${sUuHrmS
zx?F5R;#+|D+Cx(!0v`Bk(M{t_1x=_}8@$_VZD`2pLMyx2>1KDHPKKxfU<6*pqt*rm
zy(ydfaJNy1i1lepnXMAPkx&$RX%T*jkOV<FUNTE6g2dDt_+3QI0D%Z6Kn&nX#NAf5
zk5#+0lw$-4F0uU|w(Azx8qR0x7><Y249aNot%1|v-ElZSdhY_*Xp?A#!dqBNKDPnm
zTi}R9;D?TVyu&F3I}#)$Djr4)aayJo-aGzQi$wYfUvV{0i|Em6_<ye6rZZX|J%1p4
z&zJ81bYFN#cp`#TvD}c8i_{pRW$Lv7#~OM)8#rLlw(*AHnt8uMJ%$Y38PXno&LO;v
z&|nb^kYk7C!eOG!w5s$VLV+K_{Yc94CBuDQ6cOU%5^xboOMilu5f%P5ucIgA?++%P
zxm8~VUl%ok#cx&M4a}!~?fuFSK!>`%UQ>Qf@7@=5|41v(;rLA}{@PXDb|f%48a>f1
z@+evS#edHTz=|OPeJBSwv+X2asQG8LOGzK&zWddq2AjQymHB`BFgYM^gWp}d=(#iZ
zWLbn;>o3mAj-Zax4KR4if%JKER}KjDIWH$31UWUTjULn9UQu&xxIspCo^4e3=&+R|
zm$rAd*k`wW<Hy2AyTWqp`YLr}o|LpLgvG5u2g_L^S{|ZS`Ju~E9dSy)21~vM^9C1$
zyS+;-PAyMhjB;_s_HIYPFk44w*>Rdkpfrz8=0YJZ`B@$Nb2k5<W!87XgdT8cHm(Qa
zw97#e>H<I+A%Jq7vL4}r9Zs;Yn?)Bc^p^RK6g~q74O-6D=5>sKiv|_}V}y$aZjw3P
z=uo^MF=-JvenzkM@7UOpOGoHf=@bCZp;4z?oKI-Fh?4m^=t~|{JjqmFSnh_68hB<V
zRSlk%_2gCZOF2iGI!77{g*XMNtZxaM5rv6p0L?ueP7zwT`ss?u#fb4wdiG07N{Zi-
z+ji=I34m2&`^|2ANaNzM1U_5TOlR+l@JTCS=#jf`B0@`u83dQW*%UjV({>?K3r>br
z$mq7h;F-w;xK?Il*UsRBkv&0F&9l`vKQ+@%cxA*)QkE^E2yWYHJ~i&Z%_3OA;SZAS
zNI`Q>C<MPEYT^|%!NPPS+~LnVVlkm}#djR?;=HBhV<jtIrab$%bij8^h7IE+#h>=l
z%V8P&o`8%EKI)6P;rU-fAlw_)CN~eCl(U7gSPBUeCSZnKzvq1r%7bW$FqbFsnbXkV
za#@=o+$+2`j)h27x{6nOj#<GxYRuRM58r=`X`Xsb9gEHoRaX7*V~KMv7;a-kYWKs(
zY%lvqq-1CXg(bC`cC)bgOz#~0V$lXO{9ab*by?qi-NuUYs?U){VX2vwDHZmeQZ5y8
zk;IHo$*Aa1<o|M=G0lf~V%S5Q$?FC6dT*4>5e4HU`Mtub1M-s!zt*Q_Mb8_VLlYv^
zT{oEZY=3zFix^E;MWF`K2|B^ppr0ahr<B<QXJkYnOOF$eYojxqGG~eaS7<0uD<j>J
zA|EhKa;-8l8}^9VUJOnk$rYgCm5(>>hy%M6st;BVHMc8~&Meebe(Ch!Wc7XSW9ot_
zx(nE%$gV|;$8AWD0{@a0xn%&=e)bv*i~bbM#SX=Gj#c)P_|w3aQ_RfzBSWc^$8vi~
zS|bV>=+>)bb-^}BYCf&T8$$9pRik2>9O=+QTXoMx$F{1!{lir4Sx&)R$S^vsPu*yY
zHkWI0Lt^C$-+Q{5NVE|th=dbbWV1^E=fBanlhpy_WAs*3oThC+V#N^(T53ja2ZI<}
zHz|72TEX?p*M&d^uclp{iMWe<Xv~neo$2cK{MJHPUXE9ksP(C|aUsY5LSQ*EkD>BF
zlvRlg7;~s3UO{(T&AJd76(tkVP`KHkaOG-W&e_DJ$}Oga&XCFunVLy>hUAnz_ns|B
zKgFvhOP^C>7S@KK1O?>LF$?R*Z@LA5$05$7VYk;UHXynMo<!HeBWohTH%z0&LLisU
zfY~A3UUKHx*iWqb*q7Q14Mo9Ldm-{ccceB23@E%m0O2*uU#1_TEruVFkW`MgCJ0k`
zjpDuNKE~obi<#3?CV*Kvg!HJH*iVya-7QG{SX)S)j%^s?wBol?%T*WL&TQDA7wcSm
zKQ-ejUTj=<(f&GN3?fZQNz5)@_cJ5w&BryB**|NtGgV%kII=-sQPXumkFDliGfssi
zh1Tpa{+QgwWh4(Py5_Y-cv4bX{GP~I@J5MECI)GV-{OFe<2gE+%>03Rrw2nG1i_ZR
zOKEdPp;cuMtJfz2%r752qh9kQT#368dH-8PM4}%Pi&om_T3kwR_-A$a&7Aa6tVjg*
zda*kk+{1jJ6vg`fvD{Xae~e+3_{*bY+^@IZIIN3A8Z&G~*75{AS3R#^+&G#xO@xUe
z@Z&e+1;@v(Z0AWIOeKsJeluuYn`><cYqq1U$ip|2CkjU%uS7;uBXq`<q~m+iFOP&p
zxXs|gviT0l-&HV8v@3q@Ajz|+ajE+0KOA=?(<J|&KBIS+l(Mj1Z{`E$J7G7FLzSS@
zdQ_W8m^1(Se0Vn;r%baDuMnNhv3+VzG0W)H=rvIMh~yY-OQ<pw4z0m=Z~iDrlKJ*B
ztAx*&r{-ep_c9pAI6#qv`$tk+NJT}oubbWEP|TGJ7Vxx0)g!uioc$Q6)#hrDjqruy
zwinAs5K9`UzWO&%HFO3|1E^iEuSX*fh{aZgnX5x4c6@0_h;XK<&h6zF;l5=SFqape
zy(SD*m0YQ$km8<uf)q8Cjc%R>vEQoKGyUav8F>hDm4#FQ*x!cFCZUP+Vr&T;dlkc|
zY3Q%rb4ct>Ky5mYI{BY0d(jfXI~xY1#L#oVZ<3_;*Xn0~uy<1fjIS>7lQ2cDWWd%}
z9AcaW)Hh^EB+8GWWAAlz*Tr}&cEh?I;pw;W)jIK{%YoBry@EDeK=4sFO1Tqd{4db4
z@bAyBc0V@jJ84p{h*Kwf@ex`be2kPH%l#C6PcGRFK0T|Sp14X<d2Y4#&Tord6`_-J
zOCx#p$ZkN`)eN)f@XnrfzGE`k?`1n?W&7m6@ZXO-i6)6zDJtJy)KN$c461t~C%P5#
zo!%cX0N5`W$+%n$E<4Kl@)||Rz}GFh5&YA(&uj8gGaE%8U#*(aiSgIV7hm+P(@EyW
z*5=2GI^ZXAu(3OSjOAL#4F7O(6c#7mA%im{qnrHJPeFNB1h7cVAzme^!co*;O3lwz
z$e?M$!YdPaIV;r)%^8iuA%Q8DqAu0K?r1dj=uPjYojZ5#z~RFMCFid^TTO6qKTNaY
zG8<sW1DDi%z78H2@#J#(BwjLuGZf7UwSh-K0t+7{*YzdqoD@zSt)bvIdmXXK<ERfX
zAbbolzpj$(CiBCLjE-U}wg~r{KM>QeM9{FYgGofx&Y?+OiBSb8kG9()&Dt-EZ8P9o
zNvE>&K<=>9rc_dbadWRZ+Qzj1cLNT4)K+y?20;7QvNDj?c|{Ai_+6MkPy$WZtN<DW
zuM@%6QzvFuz)Wu)8Q?`5S1I9XR#u}aK*W7}JM^<3?ok<fz{wBy?hW%I$X4wS8(Iqa
zlkj7=2Ce9CK;WhQmnl4v?*$>rB7!9j(bYxfc)%$%YV?;y=6FeBVe3YFIP~byL9CPr
ztEBi84wud*EdO!N<css)6c;CRE=Uy&%Wlp?GwgqqdI#mu*~k4|HUb1O%hH5TC^m)L
zqx>|0#}FDLR?2~XH2?AatAq9kekJCfHf;$efHKMQ`-+i-D$Xm@mI*beccs(>p8tyf
z>I9`U1JYkvC{vz_pAG>AVzdB{BY|+nwLdDQ^Af=(>MZ)}-Rm?T7TI>oYJT(kJ7-E|
zxOyl1Q5lohn`Imt30Tarb*H8O!XAGl@c<>27+DWePaZ&&N-T<jdt3#(*QG`=f;;QW
z*y_0Q)DGS{ng#|vxb6g^TC`{yH@bTR@2Ou<pt$e*Q2W}t$IEA2xiSiZl^}7lOfbmO
zb_Y2Yx{&@~DiI0ZUOvRE0c%l8H1yqSHLur(CMKNI@m)oW<`}jiI#Xrduxo*V=I`zu
zG(0o{ksyUqd$fIoF9Vn&(AcJA7N+i$aXzE(OS7<x7wgO4_}siztDZ|%TxsZ`oxqhQ
z-eI_Ko(5W2!bfKME+TJ#u8E0>XYv}1LXyOQtplLrEsQBw6$zMt$1q9Oc00Fj+Vu8L
zlV0!nr(l!5%*CJ%QCH~<M<tQh{FCbE@buGG2A;Ts)aVdo0=`AmFsgDo$f%Bu-2uZH
zIW)coL5ypvk)3}+p<4eN61OLJp43WwG7<wQ2_=6IB~7844lWoL&Bsmpar>~=Qbonv
zZR-V<ZjP_#S7JbunDYOxdeg}0yzlL4#ZN=IMC46R_<FBB;)C+BEpCospxze(cTY;r
z?fK(7yck&MR@f*3Dv0&pWWh&Ux;Tcy>mu=#FdV?x)&m<;_(W!-x&5?hXASx$U@K;i
zy`yg7sV^UWmeH0H`opnA!S75pOy~l)8wx&(wR<MRc*B}hWA#w7_8By&86!G4>fKj7
zq>lVfAv_h$Aa{#6exuIeMsk|;qYqJhYTb@pN&4(V>vd|Vu{I(ebSCsIJk}!*S5aTT
zEiRTJNc5DF1D06w3=tp>CW%MkWE77LuaIh;s7<Bhqu)popuPYG;d??0eEfQ@_D-7}
zGMt#DgcpoAM5N(Le6JiZfUWy7E#w?zG7#MKED|(g2iud`CE{rgEKy#Ko7g&IIf+1@
zYyptr#91X<*w#|+@!pY$P}OnaNMWp`q!lnM<~#^TX)m;ZudOmRAS9PV!FiFxrzYb&
zFaY7v<JK2bTizJ)V!3s})hE>|(A{GB<oxFmOEZTT?BmuIdl2YCfe@xRU%O>^=!9Eo
zHwtHOkC_|jrLh^XS=bdMEFw6T+2dPnTdl@JjpKpcM-W(`($y#^ZW(AM{xfp95c__*
zQGyy(w9qp3@{WmsG1!tLjWY^_cN9|a>eb1=^cOeHU$3U%mG*-pxX{w$Mn<7z)&ga}
zNnZ-W5k4=!xF`ZUA_ApSUgd6_y;2fMP=LyBt(@ck_*Pu-q$I3x-1Yeb$G_Tf^hV;G
z7^9)z`Yu*-BRSLd)g+&FBfD`qM_<2N)CPYPLR!HiH+W?Kw!zPpct<wmn3s1K(_twn
zFq``OT<5n-B)*8_y!r4g2mS+dn{?@GbUPsCMjNYsUg)P_NnpyR7;ZF&EP5EWANzQp
zi4$)nD4r$|m7b`_jV`=N-Y$Gfd6gQi4>B{HBe1T{>wR*uT3k#xg)xf%Aanwu&`>Jv
z3*fCjzAVnm&v!=O;*vDTO-oc(vu1tVq{ld7_7gbMjKEZ{)O?cRsG!byc)*`J1eEcO
zg+*oV0jWi#pQq-dZn_;BME+{giB=w7cZYM7XT}nyjh?wbzM>2|SyJ8FXt{Yp(8;(i
zI-1EjT)wNmj>c;VRL{F<zOMoJ&W&Xo!dG%p^2_^Am|&1K4;@2bw)tP2mB>s_YUTbl
zZ&^5lfXZh-C3L8yJiabO&-YGi|Ai>xr50e(P;~^8Ml7VZY(lJmTHfurvye>7H7hxq
zAg405o^;zjFLvs=Dnjo>Ahm|i#Az-gG~rK3l%r<l*}6<9X$l?&Pg;rrZa8&{_$!<n
z@~2Teagi#JWvkr6LAb~N$JKkl^}PT6|3#G0AcV9y>DVe;S|3^2l!K6wWJTFhNeg9k
zjAWH9N~p+a%F0Lxk&+!LWJUe&&komh{r=~6uIqNbXVmBOe!s@^`B={-b2Peo%4c9C
zSVi9CPtfcO0U8^%XtCCQ9|gT1|N2J;VhRodwNF5rf4eoOFHvwZ$PRB}$ID-{&2K&4
zICoKC@x$!$bMM(1TwG=Zsj}BGpQ1OT$mVP{#Ig}!YeW1XnzjoCVbgwjyidYZ6-DkX
zXLp0T{y!g*pYFz#AgnoD69~JdG5cT#uSO*)dK&t?8Tm!ACKI?c;FSGlZ#^33erMD8
z2=cD3N|+4T|G5>L5C3SS7TAe52cwwn>Y3u)R>gnA|Lwfj4&3<o2gfRNKIUOWV7|>%
zX#1zLQjYl%UJ7G-;plk1fn)PR?YX*8rN+dR?2UDTW?#PY$B@a>lE!s_Lg{s}AlT2J
z^GW;TxvJ7BAl)+G5Z<v#lY7U1m3i20-oL6MXOyXqOtnGgFA{5#91;^T%f_|TdQpu;
zmoitR5;bG!|NXa&`1Q^7?1)&1I-$$K#>vgn`AT+nCgX{?O=5<`+>P{`w%MNF#1$(!
zvTU|1_ikufe%7K@#MxF(RZ~Xf10rsYrt_(s&p^Ex{UPvdsmBCSMo21sAl~u)!)rZ4
zwfkxJsvomtb_F-zGSFW!uOhcU?l^?D13?WVv}#j?j33`(IQLw`NVUQfIWjrQJ>f^6
zxD#_%gzjQkFnp8*M#8F^vF1>}&>{&K0Fqj}%ks}|SeDEC?P`o_34-;Cw+l07GeB{N
zali0g41N}>awN#SI}$}C>YDPm1G@FU7<j?rZ!?H=KM4_%v1{}u(&_$qy>Hi~Pvka1
zDj{sW5<PRoh>d-7RP1+eXJp!o?jqEneG<9(`wymCYV-Ob6e93>2d><P%hpv#1)^!=
zZ;k7)wdF^LPXCu&>9YB#k9{p@KY?;cL2q<n(k+S>y!<62?wM*eFoDMbaIKZs=)|+7
zC4F#FTS(Oj`u6j!Q_9=1P0BpdT(&&UyGr_%bnh@1(v1BU&{4k^>>XIGiR6-GCZ^nq
z{g?By9Q_3;FI4%Uf)Y)<c(Dm-@2vYzOf+fu_ohejnz(iMW9;B4mB9sP-<CxM>DP*_
z=&%awE>b$NCo>Cc5>jW{liTiSkxow`I*8E=2p1(IiJ7EcsZDNjBTMEO$C*GX0Cr+F
zM<>2{8xXI^C~<<~1B`F!{&dx;1!JBK*{tAb5;FiOUONMypIym$;Gn$T<SlX+e7vQ}
zC7p<K%RTg-<4D>p`d}G_3eXC&rg_HkA0Ihd2>7C?_EZvdnxg@oMeI-S0b365yZnyg
z{~ugg+ooQ`TmbON8OnJkSMuj?XuC{O!B`!D!?C*?GAYajOB`ag`cVo)-KkV9{Al}8
zqc$(CTdK;@B}uxJ-|*|ImRA4IePz7n#sp1V@uaizJ|y3p;}?B~&&Y`r9VoP6?k>Ej
ztgaVm0z1pk6`ua$w`cWvMoI)Fr4(&hkkOor>4xjMhb}7*!8u|vr{njoiXD7FLIuSB
zNhu%`5~P1rGt%&TDFqh*CN+rHxk&@|OuSsZB#;mePH0Jypv_~@;c)U0DP_S_z{mtn
zX>vGYx2NF(a;JO6#xB411*yy>F?q=Ylwns$dATbDGUJ5r+;&hSN<7(vXcWqsgN_hG
zB+zt5@NQ7|P@gkETKK1n9vy<@Uzk5PNz}3^MEFPVyV}FNNw5Hzn83-~IyvKMsfUIY
z!3Jd-ot}7qUA*UB=0Db0IwwEtX=Fu16V5y>EyEA8#^JY%67nV$L)N{bNn?i)Y{-Qw
z>k0K8iqZQ;-?)+?uCJH8%~mhqT-!Ejc?h%?L=RUt;46g)c~IeX?ny?cJj&l>l9)In
zZ=9T+4kykqC>hl#i27*loAZ)HCy4-$3x|gxzopdkvd@4~w;X&CjGyGn6UhNlBrS#i
z<90e9*=nn`;kRm3j#BQD#2Ay<Wc5uh9NmrGyoalG9sS!`Y^a;17V~gI5aphXhvj!5
zfTpYrKtd@}WIJY|^F@P>HH<-HgkVtS0LG<M?EtBUA0W3zq)yG7FFTcS?gh?5wuXof
zoSmQC(0JA7(#VL123<!Zx8TAQsTdF@-V#A*rBWX^kGF-7_?%M-N`o!!Z!CXQ#c(1?
zmjL8MYs5e<oHpqwahk45uB-|Tj!EJHio2F3t3M?_?pn!x?Ay0@hmv>ivT4#KHW8kh
z>VU8V$d;_jmk(V&IvN_C2!L$<@9`mB7*0tTw6t{IeTGmC{w)&{mxipHisA*Sx(IvX
z5_5AOeCbh@mg;h3rT0q8vqI28q8P#;fC$ay0^^-8`lr=?0I~B+y8;T7E?&W%{PU|%
zbq^vX=&HnzR#CAfz@1p!ZtgnFYMFs6T!7ZDJ&*1C_3PJ)(~U&h$P_`!kZZ3-=U@B0
z$=$sEl{`)A)i#5kp?T`jr%hVJOu7<8@09S&QEC$IFkiVJ-k)V=Tc*4={y&GhuF<5e
z^PNE>5#iw0Yj9?(+CL|zD9dnyus)@!moL9E8DOd_7ccGp6*vawNQjINY>mhHm+PUg
z{xnMT^UNGRx$F5rTbQk+d_<MI94i(DWVnCf+I@-M@&NQ9N0e{%kGSd<CvMD|%(7&9
zw$Y5%`4jXU_g*-Dc=65;=%g;DZCdm=76AYX)TeJ22le*;_3OuK=7r$UzWjocjvCl|
z)yot4?=b|IQJPIX^-x4$1FU|G+}30#&Kz@Qu3k(8wHq`jWV=Vt^$}J+_{Xx-z=|AW
zD!n@EJl4}_)jLzBUz|M@KgG@XW}82r`6=5AwaoHwu59+Wx8w#eykP--)pZ|L#sy)y
zJ8~e(y306(0_D&FblqRUhUn;E=<xMv<#bL+dGJRIa4W_DB$R`C@j35I=b-nC&+Syh
zK<GWJkeq`1vCaT56GTcO9wdxR$e#*=J?hj-Gjxd`Cx$8zSxRK4Oxa4_)|->9<E1@&
zOwtY|=;6a|nGZK^Xu}ajvEdqI^tv`(AiK!ORb#>b;h`JY#Ww3+$IgK9ykg8V8CvVx
zu=bl&TDRmpkbFd-5e6aXt*D*3;Md$)vzm{np#2nK$mOZ=TIgpWiaVC)b6@;(?Dg!<
z0|y)^G{4n1W7-Bwd+r;Vpd$jt`Bw)I@|)5I7sMHy#VO2ZdtP!+CQYU3FKLsX-5d5t
z6eLvH>t=fSO;IC+YiW^(SN$%Z_pJWI`_%HAqJpD)UcEHiF6RlDEv+!s)n&J^Z(}E(
zwe|M)PRS#TEK{`Ot8j0e<nc^6I6if1JZr6TZah#VcVt%?Kujg&^Rw81sn~-)$5Zke
zVnZSRN9ZpGc=^#pS$)4c{87cVcklYN9-FNqGuc@B;+Y840z<ORK3_*`{@;JBQ~#|D
z|26w&zO?SD+koyHCxrFfx$iq}`ZjO2d7q6(#~`4P_zD7V*XFp@L{`Bt`Zod9*KLDL
zTHY{@T-c6sqzwyMHD1Yc@@GF7SQ7E(`SUe3(M6<NQD0!}EohXth=-}ArRB?15AKjg
zjkuVN)}DxH^2gE<Snt_^AsobkO=)f-Z*-4tH|e-ziElgeqDQ|v&_BymSCkX#wD?YD
zAuD!Sck#jwCmKv5c@XC;gG;9ZYPEHCzTJa)NgH~0^#m?tyHGxJz48}K{n8qWuUtwS
zpk<0J7lmcglfnP@0tgJw42HINtoP_q#iWZXBY4F8?D-4FJY~f@Cq}e79J(JM5Xel%
z*r8;xSv$gz3XzwJg(yWS*12-b(cPnO%#CZ@`o*$E5YcQGZtp8n`n4HE+1C}Jy3r0b
z6+4y7iUuyd`gr|6I`~<A>aW^QW6J>5%;{!DFJ3U9{Q3=zLHv%<wD2<`|6x?u2lf8w
z*<|#|!#T_ws@gheX`eNh)82+0a*R=(Vacj1<FS)wQ{nzF@z#DC%9wkP+LwFwl*%0J
zSdenTi9x<OnbwB1sFkMGreD8F!4bbCcLd3m3hquSX_4jJjX&RQ_nQA%(KZ``x5dVa
zd6H6%VD5jZ_@BP>P^p#HCF|kC=Arhrf9|<4$48c5=L?@3$xI<MRCbgQuG@Pw7;xzx
z*eK^gS@Dm0i^HmtTL%cR$52A5A&PTufAROFX+<ABs($XtO~mnlb5@{U&&e8;Tc@p+
zmCx<Aj`jcd(ymc9PuVEA+wp69-B0~u(io@OFjR6R*Cr{Z-hi63G*^u=<DPGz`N$-p
zmE)<_Aw)KC>O!2i0m-l4aG}qJc`^NRRjb^#ogY-NWI|%`UyGUeI%vn5zuIgJnA2gd
zUsO=jVJ^vf+jHyPZaOeRX~y5RPag<~uzv7JXP}$fnh}dl+L**#eU>?-QQV+++m|LK
zloUKQ`0i|UeRIc$YiC@)^-8bRkgt=Rz2^?ux@b@3#ixX4gYd?$Q_tyn`^l3~ya#z#
z-7NZb%|`xWY^-kH*VI;_)0<{Dw)-iQ@L6{PmNzgDxndB0+$`6-AR7qDuG2OOxmL$+
zwrQb?SiKgdTXuGKy#*g1Ouy76oT4e%)ZxnXlgiC*MhsHuv{-mn&n%(=M-F&Jl>W`_
zn~xsXb#1h%&q$q3J9ey<cdlgOJi`@4y;JgQV>~dLP#GJqzj#1#nBfq;pk`>m&S7og
zV@<O4UWDg7-##Nkev4Zp7hitCc#dkcEa_aOIVj2B7Oii7<i;^5{+$sD3t!W<)I*q#
z7X3QDeU4;vNusb^2q4q(p09?{z2h_+p&9CS$9>+sEN&4)wOXAnd>-ZK8b-*3iC6-9
z?qX0;*52w4&CMG_L~G~lf9BPDS<}AS&LgNbAZK>|>ofm>>(<~v<6W=2G|$Ms7voe+
zOPwWGBCYdY+HsOZBfUN}Ruo@FkCIbRpfOS>9`eRiXYB~hS?SlVUQH8$A+3iVV=sB&
z@&&K-+1>x`9yrl`=n?N`7)EztTDIY~>uu-GFiGuxN@20xbS?L`R{sTe5+dp-`KS4c
zosc}=z<cfi=<2ac_5ItYsiN3s$&wz@tMNqy?&5g<>0lZ5=Z)L_)at&`!l#uI)l#BW
zo9OyzUNynasDr~h-SzCLcPzV2w9WprkK20AF!8w9+mrqasWPFuS#)^>)z%%>*hZ%o
zy;2&5jL}+SdCaC;c5YeFciQ$rs8|#_2lxBitKiiFK5ZvoWRUH?9K-J2n*sp<v)8PO
zN7*0uYa4%L0>scq*BlC0E_w+!b2~r3K0VOQo5N&|KzR4Zf2j+w-j{ij9zFGt4W*H2
z*x>BW1rL{IJ=KtRqa9#5tE#tgbbJ>k+RDEtGmit%iEeN+;Cv_6%a(oy$&N|p)xYPM
z4^b2hZ2RDB(Dv|N=O<9Kf{m5-ep5W(r?O>?c}UiX2@_gYp=`<@YxIlJRE<Eq-p#YP
z`RB<s{#n*vVbKpVSsWQmVGl@~qMOrC5{Q}3P)OTFxZKLFMjAhds6Knh0_l4;DPmXl
zmI$T2_uLmi#zvn$t;6QvFsRWZfByP4Xt|rh<rW#&96PBjl_Ia+5_>$+=U2rDQ=KUN
z77hJfOyqlx)SY@_kisG!l3A|9aVe`T(fzC1XxUoscNyf@K0mwBrp|XlG#VGFgw~%$
z7s04n)ro#DL7cBGKBbse$dmKh)mOad*gy?n=%e(Te)s$H!xw0F?KP8M6xw8@j<a73
z3vK*yBYpmN+hf^!?~+Smr72kh3KxCuTuW5*1`=aZRXxm0*=xy?V!E>oWb7*-tu$C|
zGOfESVV%3x`r4S<St`jvW2~<6#!gq$S4%szp^a9dN6(3#{RgYQdcRl<rw|TNW5TL#
z4;eM;&YGULUrw^T#(IAq=IOeed9!&@F){T8jFQ=Ecn2Gh8P=my%Fl&YUL3PCRu4qe
zEs3Q|F~5l{$Z6wue}kXCai{97`Tgy-N2sZu4++j{eJHoyq1-n|&Hi0I_)W5*TC9Kh
zYQw$z_SI&xy}gf*6}-GZJsAiA_d_i<<~K&rt|e$dJG4Iw*xbA^T`wlv#?PiiR_=^o
zpBT@l@^5c$`55WK>$h*ykqwn|ghca!1xH(e7f-(X-Hg^dP9&c@M`mysqKNv)QsnnF
z?J&+)@(65R*13KEejwOXgJ{!dM`L1b@dJ<8Ftp2`GdkMT>~@>zLD%1S*_wpcS?PN;
zDXAGYOi2TSr0m_N4_a1YR#dRKvpu~QEoykCbe)5VsI+4*e0iiD6u}vLhAqYNB2e1Q
zNaIZWjSkx1jd#5>{MB(<#g`?}Ws?4bsjX%nJ90JxDty6lR>BuB66SgJ7lkrMjm&hC
zh?AmIxB6xH)mDX?_H8-;ytdFj=yt7)s^WIIfl8K@WoTv;c`!DxKAYa=iB*Q^nT?~n
z6umc8op7^JsKbB(s?f+vhCqVjk9W^(tv<Q*-bSUmWBM)6zv+GUmACR<tgdW8F#P&%
zdJ->OPUu@yLc+b0<!2{q92oUReK5o9jKZ((9n+5Dx9-F|_XLgzn>lmlSWolVwt4ev
zjNaavUkzsO_H;EnY-qgX&eK^3M%811e~9<OL3oKN<AR+S_UY4-H<BfR!!()`7he5~
zgz<g=rF(PV*jkQ>UT}mNAYd`*YA>PMpvo0)L~S*5HOwC4w>*QR6e-Tl@Y<cUZ~^1z
zW(h%%PQj%ri$wltF^n%vTD0;`%^v&xU70l?Z*S+YL+!VhvAEW|-8@%NFo`4C>08%F
z=00RV(ERAA_qYD~sp;qSfh)E*E+x8dt)PZ1Okl8);{qcJb{mu)AN%V<ip~_s3%Yxo
z<BqBpVQdI7^Tp;J9`pUdlP8u_&V_|(I33vkoIXGHek}IKAn=iTNsku}u`VVCsz0Xi
zcZ<)JBpe)JI?-kZif=NdnE4$70GC$*HHTad?#YEJ6X{ih*9$W0@j6?q^}3@PljI!<
zH7f`RVsth9p_jN*PoNr;$?DrCF02QNB(+!A+v&}t%S-uK;E9tY9v40O*{U#VlOo_n
z+}Z<(GzJO@1cbEK>CZhXG`LXXqK8SmG3{_vl+o+9j^@8Ng`HW%oI6f405>n7hsfCb
zLezGX2*Xr%S<`u&^Wx@mEuj=t`TRf>fAPrgDUso}HvI4%je-r-Y5nKVpM6VTHE$H^
z2nvnDk}<Tkk|)of-+B3p6>JrMGyLmy{L{2NQhPq`DT%>Po-`!j3lmf!#KvWy0s%C6
zX-Xbr3kKy4XjBloZBJ0rtDqIL?-fCn7Q$S|hSvq-^Pq1jT=BGWl6A3U0&i@xsdRts
zqSCf%xCaSL9~txxWieN$3K-`^zv;`DC$~Smzv!&f4Kq1p#zpk%%XQCsZUx<#ijE+V
z&<>Gnr3M+S3MS_U>be<%C7?1JN5`f}_%?3Y((QTW6e<Bm=_#%3(e~5yp1nd%4&vub
ze>G?)2N^u4(#<_ePWX4WZr!T9$NPZRC(fR9*kOooE0f}b_1FEm9wM{-4q{H?<tu$?
z;s%OB1X=Ti2Cy!x0($QjLK1kYcVXps5%r<KYhBpo6Ka7BPKgqyfzLN!Un&B~u;)Lp
z{@OCud*)v3?B$3yrb62`TPYI~Erc$}n|nkZzH39z^DR4DZLqe@EaLfmuBeD2+_<n3
zSgCclZi_}t_~+}<fgz4D0Xc4ZT8CHk>1uTQm8lYRlMxh8VP(#e#}n$Ddb>9G@gUFS
z_B!fSp=G(G)ph|ZM3r{Ci96e}F|i%pfRxZ@iHwN+4YZ9M1o=(eiTZ>yurUc2%3Fr_
zikga2DsRP+&sA?bs)dGxkV|fG`muLPf%vy(^r){s(bMzO-doY7W4DxY%&U4m7Z<kZ
zr}*p*A9d&Epui$(X-!PK(K`LzELtGLo<=rUgRXXM_$@fu&Gj@?!4vgAH#VVMuBmM5
z?RK`j4b|eDwkFB(->U7sJWW0?_fFl=s#&ujDsbNquZBR0r{42X($d!UgG;wD8o5_7
za?O)@5RaL`<6rVkP>^mg)%`+TPrJhVDXHH*9F99y&YAGnja<|B`9&T`U#vTAJ2z?a
z`}gnN=FeC4)ec%xTI><`S?|cEs=k3MR;idJqEwm4wh~F)Qaxmamh9*kN6Y>_(Qe_h
z1@q0X?_C{y-erbsVPCr&7gl_YCJh+?95!g-+->3E{z&z%yjTbjf!x%ZXZqJ)bz~w6
zPT&C2I11%N^5tA@h;qZf-vInj*c&v)mtLGaAF?TMXWr<Tdm7CHlUsJ`*=m^S(czb;
zZ6}Z}boXxklwk|szj<Th<g_hsQ9=5kS!a`~|GJRSd{Nv{ue^s3L#QG1<_2Z>iL^U)
zrYgrW-k0AuI|w`thcIBE%N{xXn-y&y(#a*bIirfSB~Vx~QZZ-FyTgaZmuLQNaMpLZ
z{qttGyovs^W6zqj+3*YCU0JFi+m8=QV^gY@ul%s$Y0E|D25<pmYH_OLyq8)|b=R<F
zr@NR__i|z@Kkba%4aD-xvW+iMBg=uu^{ir(N)b&KS}2b>&SfFKrr`8GUl*KlS{`<H
zppSl!#~!souu49C@9Ox;idj!kXr-&IG@Qppn`)?;Hy@+5sm~F!a8K8;ojUI1&=sM2
z6<iFV)OyflaL?qI4OEmaO!}Z+8#$!};~|$XL<ys(<vRG&wI1EN+357=C{80Sg3=DV
zn`&X7%og>)M7s5D^**uSjMJX5Ek)^%M^N9`^0X(XWB@`aM@66Go{Ot2OX#bGp7;!s
zAl03UCDdTaFNef2V{quu(AmSX5kasiHIWx8U`DQ8yM}uryH{5`hnRcDZQH(D{eI7X
zM~wE!?a+jFh<gtQ`N6vCa6!%DFF80|$T{klNBYsPrLc&SLr<!j%ai0qXWkr)EMrvc
z2OG<b6=xQ#-%|eZ=iaikYYxUA?REM)Jupg$cu`NYbq|hcg0GtJDE;%MoH}yk>b*No
z2Syon?W#@JE34qZwn=XjI(kC@s@PbQN|x>V$#-zCBk4D%#-AC#W_e=$Ew&><;*{?4
zKWjPpfbw>@ZE$OUk1>nf92~NJ9e??iEwbHbRXs$*>8}Yy{tLwrJFGPZz@>N$7NE{>
z^8I_;^-Wv0$VK=J+fe3T+a`+ae79DQkUvFU<5B~yf1(EI^t91348Qtqym1R7X{hbk
zT4l@XcV~39<b<=jxG|$ZEoen)bssR1MFtydx(|n@S5`up+0S5zyLP)ek)`i#i9wmt
zJ<>VJa>Dw-p2o}T?z9{0z5M3Bo;$6RH2<~gI31Rg`jKI6zTLl91qQbWb$t+Pu>3+n
z$;yM9qmJ5nCum;09VpU`gxW{VQgC4O{q;E)GY`Xx(isXJj2uwpjpPR@zijgd49GrX
z;Gp3+#5DS9s%7@`k|0wLsevZ%|2pPo#ImA+Y4nuNZrt^F)9S8?gbi(7Rh@MzVETdu
z@lU^Qd9-bLzr>_vP5#>b$@E91ZZnzI<JJ4g-Vwu&x(*#YI2}Wiq*B)f4>h>GCG)A)
zNcZD?{zV4Fxe@lWWrPx1{6JXk`k2;YmBQQVHu_U}W@s)~w9dHZ&|)}l-g4ZCAlk*M
zR^OCi>n~S)%JIDE5ExO@bcA*EPb=Ipt?{=`kMFu!)NWUpwXknNyT(CBRd$&ohz%l4
zU>c0v>v!)m5e9XIW)WRbl>VbNFRC8@wfWB*H9uz3x<zwd=eFz4TOu}>>lu#x=I-W}
z3g88+z(6kbeL1?DYv1<s&@}&NQGZ2&$Ia~98`FoX_oD}foQS5%r`FU`ak0McO#ig(
zu&TAR*NU{X)BrcHaS?+odXL&(&v&eo!a}LN`QH=!+uKJUI;4sb3xH(x#7;q%rZz9z
zJy5al^6z7;!GfDs(W%So1Mv8idUi({-cv73YeR5wR(;l55toFz%n;<i4-V#6sStAi
zZt;r4LH0V1f_K#Xp;kM?;mvfVx{-=1&p2Cc6=g~ZKBW<E%+O}XviEy5Y1y(q73P;O
zi}Mym-l%)-iAJb><F`fHp%JB}^T23rb^1f+8#AN*@SX$zbJscks~sAa`{sE`a8@X(
z2-rUAY6R7WgWq`l@)FOEBXv~QL`0jW&*?7pU$c=qlxJvUa`N&x+G`zmZ28tjJ9IE8
zJ&5t6{Cvqqz2>mvG&P${2uXe8^=#{ed6K!5_1~ii4$SHjXzDR)#=(i53~on9#I;o?
zb2xZW$^C%ky2|BExq{Emo<BcmVkfNiwbA@{_5SgBgl34_n8z6og+KLn@U6Q?cFb(u
zy2S)%XJ?5ycS*{Zpsu~cmziH2UV^!}rtEKB@G^V)#zD`mzWO@~u(s>&Mdwc2z|@MV
z4ZqPPF-+~+ck@}cLk14CroQ6C%U8>N!|U}@{<D-Ck9uv~%3-TU3++ul=L1-!R*tTX
zaz=33+eTTVH=b_RT)1YLWz-svrk$llwD!6U8<vcf7#@^Mz+Qo$6*gIP)$)gny-9f2
z%r-5Q|ISvq*r&N~nh8JlMg(DWwmoT8I<r?t>hOIq0oPUnd07f!DshI~;d|)`GF$(p
zK9SGLvtGV@IYulxFs!{O0^}EWZGy7~(MTHYAM^fRSI+#63c2b|Zxp)TP>s8{L6;Ew
zE0#<>SEiOU(rXCLaHskZFP!8Ca#8XodNvfcfoQPSJ$f{wc0!e<Rz5JK)}g!-9uLuR
z{7S0v8@yyMoHjBsFF0ClUyj<@zGp&R1bg8RQ-`IIuQu@Is`p=WI6JXlVM2yS_!TR`
zx1t7aiNJ}q+ECOL_|v*0fD3|OQ$m=cR-qMQ52I9<^;YC@!Ytf`^T0PIHd$S2bg<sr
zR(5|Nq0tS@LTVMyz3EUlg8kt0RJ;GCExaD%gorKe7&T~(J74f~;@4RDGU47v`pBTc
zI$bVI(o7iNFepP=X{UMk>snL(dq@h!8ZepRbq&?^+Ot}keeE*(P%bw#HI#%n=Q%HF
zMXOGG9w?LIkTcY;@Cd1ez?_QFuc~Sl3ISGzv;55ep1>KWf8%$oH|@fAdE=$Av{gv-
zmApoq_9#Yr-RKj!bLV=*FPwTV*6M~pG13Gk$HkSl?KUvm^;AjnT~|k){`^Sj=4m`9
zfoNrzJ2_qrVOybK%g3#XRP2M=_h;@LNYJ^XX3bq2)oJ$MnQLsk<Fks5N1d(>y3Fe9
zHhXpk=$P`M+>rKh=NnP)6<3{Y&Zm~qHyDoSEESU{8{ChZ{Gs<C>*d~8Uo6a9^fkY<
zna_vfi=j#Hb<_h&qQ|tmFv-CE!o-^gU3GLpbKq|Z8495Kn1jrt_paEu))z`I8r<^M
zNSLH<;N9`l`8{LC_Ig~Wtltek)3vWlO6=%|1m}pN)Ilc|yU|k|Nr+4cEZrb&9dF2w
z+*22qqU}|s!S;jW@>(Mm5;QyieQchkb{)+>*GZNmZ(Lg??cT?*xvqRYh!h$-HiHDm
z@xaKqShIXwKQu$xd#nt^FhMq|L>JRaD3$fnJU+8$Nz6IxE~OtHUP$%KkGfx}ZgZhd
zbivjYb>q|n(q|jCyi#HPwd>t4c60lTdGK!b!*Od$M<wjlAwKzwjo;G|MY~gyF4Xbc
zZFJztyd?o!MpZ<wNNsx|<;>Hq7gfJ1-Tw4rM9I$!w><JS5*~kAwT}bk1j=4$XhgE&
zwfgoveE399b^8GW<jqQ=G;d`!7BEsYo{`o<NN4(CgFUZpb4x1ZHhDNF(80vtPz`-3
zz~RVoE`Qc*_-?Ng|MU*-yg;*CaMtEaQ*F4~DWJ~G%VTuuKWKy0&<n7w5jWyJksn3l
zX1{O2@$idpy?WCa<Sl~CDLkh>Yko7pId@gdyWLkCS_3!GADOmmC;jMvUzR@_FV$G`
z#M5f`^-iBF;tLjcSZ{Kr<&DKu{_}07m=5ww(iyhm+`y9O3wxoE^hHa~sfUc0Kk2se
z=7!oA(^Q03n$ge`t_G%oE=ZLF2N+E**vf~vFek2fpMa+{6%^vZysaiuRfq!*9BS}^
zJ7c4UD$+9k5FShQN9bv)<_4Rnv)w}1^Jw^1hHB(iY4z83Pk7X%)0djA^>Rgv@YMru
zm87Q5tF`-j;P_W(8SN)BBq&6i@A^URdj`E4<hR{xQ@b(0F0^{>os~TJp}F3_M@JrL
zJmqm-!-V52{)&mqPy6<E_+KaE;x2AIJNfa1UMtSMpSz2ji5r^!0dG=}6g;F3#_oxq
z*y%FgOt?yPGAEyO7;4J&OJ>T^lkx1Q&M?WTlluEhWH0S=W{AQr{OHBU6WR=pQq%t4
zl+F^*?Xq3woj11Yd1US(i@fcXb8pSKxOK>J`zOvAkEd1a)773_v~`8)OZ~8A_0s~!
zmp+_yKmX!@a}LQ$7bpDqF~0kb{AqI!yWcIkSoh@I6BomrAcXMuflM?%eG>s1t4%VW
z3s!y_nizSY9)TyK3gjAy4e!Y0#M%88M_-LTo4QkH*dHyx+2+xv80+lf@>CZ+{mqr6
zUs-D9+uveRyjgfwB$4ide^9#MGAaJ@;oPVrYk&qSRt3(^?{#<K*F}v-8m8rqD6wB&
zJ*8u7^+0-3@(uNvl99$lhKIcB^~jL8BLN#&HXL{mu6q~NGu)vj%B7xq8nQAZk9I_c
zoR&3L=xWu*A@(=Tv;SRlv{tYw5TdZ>^baT&A6RKVnLK;i6Wf1-vpD2KA!hVo&@zC}
zWnIZHV^IRvrM>Yo%icd|7Jxo@I~}$RqVZ3jf4^r9@dWeqs(p4F^(I_DSiGhbr3Vna
z^y<^*V3SZ1Udb0#0W4F1B+G>$z%oYnh84GiR}LOH&=->g25ls*&a_x{RR5e?nf?U|
zzfI{5v-Hn|Jc`}SEvYBn5z}dGQO?nT33w;w=o<d6#U>}=+d@K=J*rM^M|J*bi)rxh
zT9E6t?)E!<rz$M^&fh|D_AwKrKUxcdEEBH;K_da+$0*^7QH|My9)KG_*J(F24u}xb
z9LPByL%A-r|M$A+o8hoqjnY$RJW$cNWUs#&leR&yE}o$Rr72g<CiIo^W3&tmG*J)(
z*eek;Hg)@;@SAJ}ReF9Q!d|xx5a^6Q)Xf{X1lM{s`=&#LL)Keh=x8k}5J6KA-~@n|
z(@?pfCy`E2W<k<O1TGI&bRvTnSr$DO1rYlh5da5Y&i3(1^Pben@A2xtA8s5yXag6%
zLQ%YdQM|ZiParwre-gP-59gzmq<>#@$`!>EDdBaLUVr#NIi#cbvy9iw?ADwg^>I(`
zn>BEkraJzUhVY;3h}%}|8p4qRG~>j527!|aRv;yoGDT=&#9!*2_<uEDv2}w<!S-1b
zW=Q40H)fJo==DzL8nEdJr?i0qBXX#8hn0-Xy=&K|R+paH(`T5BETgumsp*4FEGK)q
zHVRmK&%Hk`b_gxnMmtMA<LHej=rUN8&hK6GA3cKCF*xz0Luh->6dsQfHLA{>KVL;F
zLpM70+Ya;5CgJ533ZtC6ch@k9q7iXYrT~QXgsEy{9Id^e_pWbv(Sm;VVaE6E++6eN
zT_Uz^d*$AvzKqh4BsT^LxBA?*aj1YW!R^gkX{hcT_9~!7i<~uo`WS~U9!4#CIlA6i
zibrHE5iC=1!oyCuzR=blIu6ZOe*j)AdsX#K!mpNeM-y+25ft}ot;icM7qvX{<Xyua
zk9~>lf`KcRDK&I;e4hnIDRqof)*rm_jy}DN=e-x{S*zy3=YJPxItP2cGT2nTG9aQs
zcNkmyR|tS!mzHKBI4(8ce+1A2@BpREE~5#a%M&BH?k>U6mGgrp9xiHp7Mz%+_PxQa
zCmGpjDt*(l$yAoCx=guxK}zb(IIk%BO<M6qmKQ*hPM1AZ_~z}~7GEwTFbLy8;-k{{
zuKh=5Mlq$Dol3ls+4hXTT9BS7*NT`|gK>T>eUfD<(@)vNtA1HM%+Fs3HusEZk^WrI
z^mHRG57W{3-{xzhcIZy8!ZA@J$`TaCtL^uR7ai2PhW}s{Xop|FWw5&R{rigMW7_Gc
zE9ovSa40;t_>`c5PzAh2e@gY@$<;raft>sISsGn^%k@-|JXxMi7!+>Y*evQT&JjP(
zW~zw%_p5IBFOv}eF=MQWhyjvC$bM1lX|0?(Uy&tCC>cj)pVh0Tn*KSt+5~hx)}i&s
z4N36DKWB#&nFQE^b&Ews?8)R3dIl&f6pcf5UQ%Lln#Wb|8Jz#Ev1_eY`8Tk>>JYFJ
zH2%26G{|vD6R&EKD7qWu@JlgH$@FsH(jnnP#*ERydK-#^n5+mLTDtP&T`Lxmkk~Tu
z+MOZ~QV(lOYCd3=OeDloTG#uhVwwvwv~L8qtwZ84{Xx4*>%aYzYsf!i^<H(iEBqR(
zD6cm3xarWM2f(HW!mps^x(da<mj&*lkEVLHU2DvFs2KS<xj+V3;2&zlZs<uQ3u;aO
zjuR5k9DVdF!KZqaP^?5C;8Ofrxys@@hJ&aRC$y+y7~^0&X;Q)7x%<1Ef__AdqY*Ui
ziudkbo^?eG2dRO!)gO~b{cbCx>xCYw5R6$npk81nMeA2yy;a8u>O9f|wVKi^e}_<e
z^Vi;v>SlhQ=Jx*OZ9rb67+mGRB)|oygghC+#;Il|S3=R*K_QXHunwPt(#7^W)(e3-
zVuUY?S`f)e6K<g2q?W3!^lowG373~#yOPZ+>I8redxUazn)k_3{LlW5w++AIBg?-V
zhoQxCn(b&)4!ZQ&BM1z1&Z}L|T$wzedPBB$NCVl2)CK}rCxUS9^zxD_%1u~DEfwVt
zDb@YNg1``u#~&^qy4oLgCEPgY`h%qT29oMFm#~hKC6NzfV>{u7Vv?yoz>)G;oyJ4E
z|2f!#y~={N%ux81H!%NeWk()bXc*P7>Dqq&gJ)T4E!|#E2}@xHWDsqsTuP)IN^?e|
zx9W+NrDZEPUD}E?4#U;EcklMq-(#{3=0+E616hT_+xyqviqb6+83_?j7|10?OAdTS
zV;gns_<nV0q+5r_Ub#chyx2OS3lPL!pD=DZ9*RQ8<h=K|DKnbWxsP=ntn0?x3w<1*
zVGbuM3IddN&p4ixv__nJ2%ZA4jzu#$YtZu{>=~%d+Z>$OpKWe+y>-oDvWqLN7r=o;
znFuvOR}@>RJ#>u75z7EjrsPET-+0}>rPc2{^?$CtW`Pzn5611i<DL-KE?ecM>Siud
zf2v`8*-&civzJci9#U^1sPQ!jH)+dj0{6rsK8>^zH#axk69typdhR!~kr>sSUtA5i
zQ8SyK(rL;X+6XuE;K75vA0q!SQ6D)1qN4`697G+597v&~7<MRkZ29Zsqb|SkN{-!N
zkv+y#2QjgR=n2u%pa`S!{c&$zcOxTZd}DrmGAE~IC_-wjTDfE5{<3(mR6xku_uo`3
zXNB6Cgn!>@p6i|veg3>2+X}sk#6ob&Wx0P-H2xnnG0PexVYQ)Bs&DsG(DM%{HCp`j
z_2~7e`$O{Af4<NI)SuNUzisZ^ju1^e>%h4^Vi=ycXU_)0k#<_HXvziAdily7eSdEj
zDjBM!eJLqV6FT9xCxsg)VZZ!rNsQnul;jTnO&O(xgem@K(_j<5|0zbJ>gQbf?ruIr
z2me+R_zg}2S~iUe53VT(b<dwiveyR(>)xyMK&4352o_s$U$HP&l1QA97|Gm(2Ig1i
z1LU1S>d#bFUj~HlizP~bImIVQN^#_>XsF`O3gt}!NWiQji16$XG~B}IHyFaA7_OzF
zwz|t+v(f1u;kHrKKpci`WR^PD)Sa=G4hjoB8!c*!%a@)u1Yt6ZFkBlQQ*n@Nuzq>t
zyyq5oW_{qJDZ17^PJvYoB8IunC*Hf7o12qhiry=L<3dAiT*!G6;`(!^qeNcANq+8S
zV4aJRJ#r^RGR@HuE2V_%0V>F=<NC1rjibpY+;QvrDYI}JlYqfo*0_RvD@yYA;R2eQ
z5#lzE7hV%?LG(FTmtntt`ux5uq#T;#j^E$7{_1qIv#YQ1H1@b0{M=*SX%`bE7(f@Y
z;i=rPYxbu?MWoVm`niYnj%=jXph(o@{TXd@ILo@IGhh}3t~F8sA;5UydA)hOMCtJ3
zZCqY5!+|%T3|U2k>aeF@E21l=S#n$tX#Q3il1~ok1-k{pO^le~!6DH2>K+n2WM<{-
z*ROpMTwK`j{%MMVs4?VKq1V!(uE{8k;YMH>WF|Inpl94<0fhWW5Iw|~->5g$MlWF0
z^2R|vpRd{g1leaw%lXIMYA=EtbNT-9#nO`FNkm-wqh%!Pw-pi}AhR!f-#1aOBbziL
z;*EQ^>%Jj@(T*{|+H2YG*y|0rz^JL$OzZ><`Q_`^4Cs^6T&OvWcTN)?kvQ)-qtW+}
z-y3`T#;en7YiI#aSqEMj4ruy3gv;Ebpx$a<B14{a_W534nAKk&+N#w;HzI>7<P-sh
zr-Cgzo6@6TX>B>J>M|no>U0WxVWfm%{?TM@tylhokC^2;jd-K)T6dq>=tsBkEG+o^
zdZdKQaSRKP4_~(l$;i8R?`*zucK`xq;*b*1lU&6O%k^{$gjfK_6YvX=FcpNF4AWOf
zv->V7eeAm|?(6&fAif!r0!UR!edqf4nfV)VRam^CQw<I_iS9Q0e&KMRA0kOJTx-n1
zATyywxVH59rU8=K1kwjhDXQPvM>8My+BNJaj^He2YKU}`ss@)Jf6|j8Lo))f_d$B$
zA=)yOF>zl9e{Fj5;-l>3pooi!qzqB)cp6%@0B@g4-yQq)8&UyDSP`eyj~_qk&fW7L
z^LRQbsTdW8vSrhu!;lJ*m*_w2P=?cFZ?U8(y~=MoV88&)!`bgZ(2%ZE;}P{21QLu&
zLvs24eOhbe2<o5f-P+DMVKuzwt4d>GVtK?}5jCPg@8D~19n=FjZvGe7x0!WrpaS9X
zw#iu%42$?hDHTY({L$<<B1Tp45C5lyN~inCVbvricF(Ka)rV{cNs7mCSmnp5foT8d
z>j730lirnwhjYvBxTCBJnJO$uoY&&T67Wpr*?i&pQkw(A4t;9wI=FZ(r{U=v?6-1`
z8fzv-wYB<Q4^6_aAKw81gwD;Nhb^NxFJz0@Or2_s@oFu6m&{m8{X#8^$W)x>5I`Jb
zfybu)Rm+!2T>xyP@93iN^Sa^Jym4c5!|cA$v0Sar%1pPh0T0H^9h_1-x^UUM_{+%e
z7hLa{RG64Qum0Otrd4en{=3>ATbXuQX{5eBsrxCV$wPE{Lri1ltI^nT@Rpq&sHn|S
z5Su6cM_oCDibRk?EyO9I48}zRDD?W;_OZT{yc|n9>X-I<KY>JzH5?t6CwjSWqzSCS
zFT8jE{>-skNTAUKj6U}yjAC;ynMqPTE=YEA1*A(!EdRNp?Y8UaSi^vaiX{FU?brFh
zlLexM<oF|GG=q^ivV$n3<`kT7j*rhju}6zgfO~5^LrfZcpEvFH^X`zbk^^$#0!{rp
zYnl<Z08VQnk#KHcf;d6LOmaU~McM*}of;qki_-5ma)dWi0lyI}&#0JYpRWm3Cx5uC
zrEmSdVXkXPu$wU(RucdNMK3U*<g=<abN15y_nKemH~t35@&;Un>XN=mTA6PJ*&)gW
zON$$#(Lgs<3=5o2fFct&8zL?{s{Y~iYcl>1%+?>uLpij2xjGn?U0Cj%?wS)K-W2?}
zpWhuRV9jkIcsAcA71u@p?e3Man|jOejAP^CYe8mUro;G@jt^Pd=QHV3LI9X~BchtY
z`TirpXUAjmJ4QG<21?7((?f|O*3VFlC+BM$<$Gyr8n}_<wQ!|K`Uq8%bh9W|?Br_r
zAAP9H3yqvBOJlNi`frN+n`&emB?s#Z!P#gO1oc5c7v+OIVQ*{9wc=lpCub<fm6DSF
z(n4iogdWVU^8%0EhaKBSf292?rdtfS;rgl9uvzOxz<S|g9ApwKOMq=r!a@s<9*nd#
z8amW7axKVRFR)(9%f!2Hyw1GbG1~(eshBBn+>Wt}Pq8WVPRylUg0O*@!z<J6`B(ck
zjG-2ENFE30Csk@X?|xrSYLSO5LlV(0QCe^M_g|H>6T{em)M!D=Un?5d{nHj2wbGmV
z%~q!;K&*C-qS2M=6=o6HCFjp^e4w)cyrHqoq5yx>Z6aw1srZ+bPCe{9<!tVoOqf`X
zmN%c`@fVd?-n+2tSf3+fj|0VAnp&P5%Em#VAw3V5rQ}yijA9u=?`Z0x*9J<y{B6PD
zII8f^W^#Cel00eOWl`@|d_%nWBG?0<YjAEpW(Z>{auLyy)GKOg+F<5H&Yanndm0u<
zMtgIti*5f>hkygnXD+DU_~aTtE&`Fs)gNF~cZb3c_$1cRd*v@J?VP1fBSsXu|Gfgu
ztAxd&&ro+W0Pwh1H<9Tk)@7zL;CJ4I3Q^iou?gUvLVg{>zIsZegy^4qa;$QN`-&Bn
zCp?`OdwG598+C=qkHD7maC`Ul#ZDZ}E){dd?qx5Vb89Sl(C!+l`)NcC34(+2vWw?S
z2l75^Bl~Nmb;Wh&%+`3isi81_eA`&rrq<1%7w|+3qU%3n#$QbI@q+;dk(V3j+~W7?
z#MrUhLTM7EheeDLrQdzzZA+SYnIS9bKyhtz*8k6oPS*-_n|>bY`^}Vt-;#kv%3XmC
zWK;tN3kA8Tp)CdXUdchRD#<<+3LEDm|A_0VoJe-YyEOsuN0uQ<Skko3kZ4+Ksff5q
z`hTVqjF((?Dt<agvUmDQtp76+-RUTBo#j=!O_&gw*Jbo^$8)hsh!<bJ_-sOZN9EC4
zpAhJu95`(W7>FmuKq@k<&$%1t&YU)NsxU8tv64<?OP>seS>wr$j!sW-W;<X_7&&^h
zKV_`u_T2C*+o+7Vu+y>Sup4DA8{a|nJXE&SBA5dj!$+C2Yfyh34sy`(!Xc9iO9JWL
z?DMS=f7;eHuY&4~)Opx0oN|gzx`Keo;w{|j^q#@o#8;-9QA9|o{2Lxyh3%`@r~0dp
zgw22c@@2XA@bIhC&(KwCvtmZ-NM^3QC-7kDuXe8}?ImEuYQw^l*E?lQn13Mv-~v8J
z9Q0?Zew0MzxXF3W(-5$oRZ+Xvb&_WCst>bUKDQXd#gf;(?XaD?X4xuI0^^x|CbzJh
zMiUoYYSNRGJ%QhUk~U|Ln&?&~3#^#(K(1W|YRDuGn#-C3rFJeY6`%PT!%4sa^{blM
zhD)$FBM<kEIwCV9NJXQp;&d@afd&kUt<E2u%icaP@4HLLl^%cYlQm<fD_Y|gQV*D#
zx-(*>CH9Y{-B1pSA>qukwe8`Zg@6XSL?10f9%ROn$Gx#mMl5|qOMi=Fop65rni632
zS@e*E1GInxWA4#~3m0CaSp%>K?BReYy;wO<E=%-bu}bzN<4BoH<Emc64EvG8F^y9|
zgzXdWttUk%wi1jY!#zRcN597tUT)>D=OtBE{Ax=NLU|{$`*JRI9acHH0jZ*A<~mY)
z^5ltsDLIld_*3{ej?{qMHww-+akQTQbh8iAzX613AcT>+?&#5Wi97yELG=49dR-?L
zSuK){ME#nRqs&P~37NO@I5n~;0%4#3w+jtcZZLAn{&tN-x1?ws07C@<Ay^wYVJku3
zIP^@{aw3DJgQ)W;(ojD7GajdoNct(kgXmmZ(!p|P)+oNY@An-$)_@b1KOwlz6o>UW
z!Sa+~q=60&dKT3Y%x#_>cOj6vS{x=YMTiVV$2sLC?HBTM=qGwXnaQA<kaj`dV2+!T
z5(xpF2>tB#?(vSzP}kCHNeJ*NWac$i9`d`wh_X--Um0@tx=-5D3)0Y|F7F_Fku(Gp
z0#nJVl>j-a3r>6hQw${QkcLxHgckNtgooaBZ`$*z*;%?T=uhc1(9nTDLdtNV={(!i
zXlUR6*{(@Sfpg75erEJCpLKo~WV!69(3U+U81d(dVTwr*;$V%>7)97m!YgIy8g`RR
zG%bOg%I9ot>8z9TrHYClSNHr=#6Zs&kDD&;a_TnZID*;9v4+gF-`<AD4ndp!EM1b4
z0g_zCPb$O&Cm<i|Qb{7wHVx1h7#P?MVTbB3%j-98T=MyO*+&%nbg9^dl&fzH89MY5
zmFEWO(GEt=v&BHO+ox)o=)}JLD2Y?OvEm;TaReocC4%KI`3>}OGQt^5PLGX9VxVB?
z_zG^{y{mJ3soTPKrl!6*Zd{KG1FFf_1I)vi{`&p<{;W4jMZ3PEqC@_-Auj9UWz?ei
zWEYc>hb>43Ko`jBl7u|SrCupbwZHGj9x$dIwujYR!Xnh-bO%r*qw&Fo6-l%ET?3-l
z7s2>l_k@zFQT<;I<cdNh$>_oxB~OoV4?g3`K<grpyIo9sCYP=3dZnFvX6Bw!ziR+e
zvz_s46mildi+y$}4Vg#^#2O*1h7c{^Ps*iO@Fk#1pm<_gq`HvsO@Znp$pNZ`=-E&X
zG-R8I>Xxclwc6nzBiI<mDuK)XNV*sv4OB-t9g49J3egA<CNr*E^wstMHPXTx>~=Y5
zy7t(beH%}X>wwOb|0eM2bak*Gx79Sp{26-!xT=;w8!Rs8;}f<Fo^gA47#(=OuLK@a
zAoP0v>mGvuJr?cf?qRi-Gw@1__4|zg-4MjuIuhAI6hZkugI{XE(fp?%(wC*JM&W>2
z)&-PRKyrAeupyn!ssO59VSF^HWWuWuGgt=<cAtLhfX<{yaH7z-74up6Y#a{)d-v|$
z8wtC_K8?@}ce9-u`Hv7?(Orzjh7X$CwRxu2_gfRT*E{hDYoJNLzeT5quOK}7S|1Jp
zr-=*dFo<Y!QL~h9b?_F&FDSPES@kvLvd}zu@hBuAc%&od7O<lR!(kw405V-P0PvFf
zoM(*J5i7@!5k3<vl;wQjknXvpR1dd}vJypvz;RdxCVILz8aV|btfq0Z*FCQlx}E~k
zP)*EcfV6nbN#a}pfRgTJF%_U#0y%kd@mvg#7hdm3JipL}G9aE{&z8Wab_iu<S)d^y
z;kFe=Axl-Pv?7rGH8-cH$g07IU9d^RwNekO()4WHx>b&W)^y`Ae92S5wz1}+7r)*9
z;hzGstK-Z@-)A1Cd5Y9gM^yE#NYh7+h8nP`pJqs~y(thW3gs*&naEHzB<uyLIGTjX
zlO!EbfeiZWawaR)<K9jMFrXO=az3}1FfM(59-T?59O&dV6L;9@?$bu0Mzh{3-&*%M
zWna<|zSg{yUyDUcL@z5DL}-E6Lnq)@3ivVM<F!-QH&kBo{O*zT-Rk=D3Jz@WtU~Vl
z-QP~m{G$bsA=~yA)$yHwADh8$cfvIWGO9q|H);XreoM%|0MoUo4eGs3Gc=CA%B|`G
znk{W8r4}>If@S&gM1@@y0~X*z3(~d+kW<G@vtXh&3^3>5in+uZk;(y?b6|v>hvv8a
zB~_h1&Aeb`GxbE$qTajE8?jbnOA6+#h+P*FV#Cpg)k6Aj)Q<v>0d(I@F^S`HK)W=%
z!1-(b#jTo4T9)ZA{Ly&L@^c+`dA+W>mse4AU-e|i@ALov8h}3qN&2S@{TS{Aa4)3>
z?&jC0gxmJ75l^?pS!t}^o)<oj@owDRQh3gJlH5dYPI)o3jmEuuw_v`TD5bj)lP0w~
z)qs+(ID|-w<c-!ZUsB%cZ(mQ%JAujXho|<Oc)RCaZ$!a3aQXjw8bP<#rR1NTeBQhB
zo%fqoRxD{%*k@(SdAm?km6npXnt>k*CMQj+?~uT@-t#^;SorIom$O?cO{#78=9T%x
z?XTYhdCO7my(ndACx&+c%GTam>1j3p`p<@nCh7rs>vYO~oa&^TG6b#YsfP{FcRB8v
z@rRl0O3PDKe&2O#oB^A^OvXFW<YNM90!k&S#Bv&uX@G5jP1YF2L^Oy}LPo42$dV2V
z^`=0rqTa*vxXNnxOr=-<Txxb@`K=k0Nt7C{r<yliH{-!N=)LaNIx1;Re9t)Lot?Z*
zKmU_)`Te67x7V#Gd!ettY!W9S(xQzePcjZ#-#=>=`|3wjyT%9B(MGaRgaQQ})ol=H
z*e~4w`h<-o%e>Fz7udSYPa5E9^@flQZsmzC$tzBjcAWF|RdQU#Ze>zTJ|{cu%K_-X
zO0Gc?dUpQ8<Boj_do{VzAC2gdF}7MNQlL}&h$8Z?`=Y~p#@qbSAg%WZYZSc+QVJ^C
zbS3^Tn#X6zn)`Fv$U0{U;-jL>mo-7hFKP-(o&d@_Dq+dMso8_$$M>F7h%<`Q0{KmQ
zhtEBheohR1Wk!!|X#IHrfE6@y``e!hZI{)Al!)t&!#lR*&ikTvS8COpcD{~X!PeRt
zPKqCoOmy=GoGi#QYtj734yWXSV=tB_&F!A=`(k&VWu9h2TvTFtV&_>adFriNwOX}n
zd*RuMe9r@;fM3paqC|F2SZ=NMcG(l}hyI%Vf@Rnh3<z~zCN`0%F~feEdOY({t-*a%
zd=0`v>@!e2q&8OJ<){C{{)O1^19uVZbSY8N3y883_>a>@DtC|aOU?FS&8A^t=}Rxa
z@!x+f=BUl;9!W)S_p76Z2VJ0X$Bqrz|CAdQG4)Y&fekA_?{(MT?(uc&?$WX0yPN!L
z=lr;RmkZ8c6AdO;YIseURo%Lt-GJnIhu_zJ=(@kS-u+9b20N*LTRGRD9Y0m~ZE|Hj
z&C+JK8XK{pIyAk$SL^Pn<UTw~BE&PL$XnMbS&c#xcUAWGpABjp=qQdD0&7$4{J+G-
ze=?JBVmx$Uo;Pma9?Y>VGsJ30VpK9Xuv%~!F#8cvpDHE0L<QZs^DLqF$Q{!<_wN1Z
zRdJ}r?M<b9S`gKP%uV-RKijLGVfK^rem%MQ)H`hP<kIWs13WVVmiWypj_Ds)*?YcG
z?5n(}mnZbxo%ePe6*PNK@6DC#zbtCHo|{dDdv!p?imh4GA!M8V{aCaH59j#D`Vg1R
zzzdGk%`ujr$Nlr4TILT{)96_3R+PnYUBcrKu0&sD^kx0(FY~Jd(PGFfJ<0K<X}o>!
z-ixCrPzU2Ikx51Hm9<ti@OhQ-DKYf)oRc5BEs3uD88~Y8<Xa_GlLGS#Jcsq4yrF0|
z;=o@Kw-%ha+-lygq!D9(yuElKY0<^f9(SuA#|6E8rhjz#&Dn2f7u=0X+)``u{Odu<
zx7<2k9N%R{zWt6<xl_6zp@@o&)!@SAPek23$=zjpX8$>hn!mAITd6wOxW$gUTd!WY
z(P6R`D=~bX#c)^ZQfj|4qVe`JwKcUn?KxGg9J=%xwics|tC!Y|F0jqD&ij6<Aph~k
zX^R$UE(~fB6F2wg**#WP>#SGB`pqexdu%tuVET~uD+x?YC;iWL8Svvx&vt*BgiM81
z?q<_}Rcw@eZE9pk7H8?WrE{hoPYZ0=66{k%bl#kWsQEqE<QZ-jf%En%R&8vjI%`h7
z)k-xWZ|RSA&UzY`_b%cL>>hceWZ~pIel;?iz3+*kE~hw6CmV))6TO+{x>rhn-d_y_
zv%gF3g_Hjz-^gncU^{%cCLh>Ht=50;$!`<e7`L#aQUXL_-;y@ou|}f~HI+)-uT(kQ
zYtOe($c{fAIR3c)r_24QYy>q#b*GdHWRf1;zuTW*msuOF8XDW#9#``k1q>#;gYdYn
zl79@oA_}fO8x)FZ4oAjNE$fnICA1F_&a5D|wz$;;moHT){CSYfb!x=T7o|UVMAzT4
zu*SPp&-M~M51O-!t4#00$VeNMI;VTFQ_7Df@}rG5Z`!2HfMGJVC3VHuexILX3?$!T
zED;Kefow%lzdU->;%$^llRqCi{b;M`P1-7cpJo8t<rvbw(1LRJz=fe`nsYb^T5CUd
z1r6Mq<+?WA%4hzn(Wyl%>t<=yK}^hfekJnd%j(MoyMhLc-SeS1Wu{3)n{Z2`N`18<
zi`5KWT{;@840)n_=|I9NcjuVs$y?eDJkasWoudhl#!vLHnZ0hDcD%9a1hsl?HYyou
zoeU3M6S1a4_`_8`WqooZvfK1(Y|`B9?Bu*l{as&nxYf&i=GIR;UoFccn0LXusm3k3
zv9XP|^mh93-+`l03=3{MK6uWw<2R1^UT)lLH4B241n&hgzPIv>v^V)nLcWp`Nim0p
zXlQNnghlbZQhARFo$64`gj?YRPL1u_<-Zd7d7EQnw`gzbiBlwIC>{cVk<gP67EzWk
zQ1N^UQ=(ROHi;gY_VnpfqpR5%GNQ1-i0c)!aQlC+E;_AwgSw~Qni;>{Gwjf*!`aWv
z?ZO~igFsDt^=@En{G*x`C?u49odM1(8SYmPIt1wHT=mvx=Q<0Px+%P<NwFev+6>87
zXt;*0WHDZC+E4d{@rTY@hhMGnsvEa;*%4e6($qMg&o}VEsF-OrIP8LKtj`_Stm`~}
z!h~Hfmoy}JL4IPdt~0+E*;+jQ5tFU5bb%6Ye%#%!D=ZN4yPekRbP@G?%N);IxR!#*
zy`wyNU$yG$^qORcO^u7{HI0sPsp)Xc4&=CHZ&o6t>*pkE#lF=Z<pF~atj2B{UvZHG
z2Je-}k|jY7CZ|eH@+zezTG*)XSz8M)@9+p_H*UGXnKW&?{l2c}-j|l%{E)A(Xs0#}
z4kuaLc|Ize6CERmUSxd*2k%_JaifcI;K05Dn`>`60{?dEL+-+}JD=7Mb{hj=KkVzO
zOMgD#f<%K3iq@CL*F3IJXZm>8q!$a%`WqV=pS&^9X?mE-Zz1BssSa1PLiXjpIp*kE
z>-<1py?sE)lrQK7$<`$6%fo9RA5gvKvD|s%HEc;I-#<?~Exf(L?sE52UI~I*(>M|p
zx=t_{_Rhw^52vgPcH95?c(xam=7=^5MIuqWVH#<rzt_Y<W7}r|yrG9#<0Jpkw|hPi
zl3Kzi0e>ZhVJhDBFbzl|2N%9XsN+pEtlK`zi_`AaJU6_9Llz%9uf%A};Eq|@&j$_R
za*7|cFK7D`S6N2&P5w)0d42Sfe8`!{&B8D1@QNQCaE$5Lj(=ufJM6#Zt#ImQGC4T#
z;C!=FDP1ji(r$BJ_jxH^4!6@Dceh6{X#zzlfonE{2d{O0YX18r8b@~>8kbkw_ue&!
z-KX0OxQY}NGN}%Y1pK01yBVGxjiWbB@4YIK?$p7=Q>g!EJ&)da2V^GT0Oym8s_<A4
zKUsbApXZsW(qPll^s9q&r*AryFe5r=rhw&rmc;I(Z7@F3?vwke*?FC$Hbkf-u*|&V
zHzo{3y7}#!D3v5=fTIgpa5{AL=%C3uck$igj@6O@Le#>-i!c}aW>nXl83x{;wJ{-9
zQ^^)lzht4V?DBTQk^u=39TD);BIM-ePWxJ6{^tl-)6OjSnwiP;;JR(~+SWA6iu)4>
znL$kM!#+%a+5iYH{DFY2%1THjg<h<2-^N{NN_x|m2KlqbIyz28?+A}!&vXg#;GViX
zoa!G31I6^S9kXLGX)jr7hJRQt2kI-C_5(y0czP$STQmv5?Y6iN{B-Wj8OeXZj$*gt
zY4a_l4)CvXdvRJ@EOu*tLvL(Eubscwn7pNdKO=Xa_6Q6|yQi3LKzx4pTMV`adJhti
z6IR0(eyY4yyWILm)>IQaUPJFPf;E+<`dzd0;byuGD4rcV7wVKY)=wG#p!GqFzl@{u
zfopuR@g>P}m*Fukk1-ql;MJ%jci(<pQMByrqe-2e(nn5n)oA;BA?{QQ@2p5H-Yz7l
zh1P`j=Hx{Pl_|5$X?!eNuE=GeIeFVkzKBd<_iHOjD96jl`BUMa#+`C)C~upLzDGXO
zcBT&P@nnsNVZVHtN|f2OqtW+j*w=<<Ki(Yqen6`i05kHRh!h*GZ6R+sar;XK@MM6Q
zDx+WFXbo~If8@WfojJ3`_BzIp^yL2a=}3qFh^t=pat-P`q-urC?Cok2E+6^swuzmZ
zS<d|}z!9+={eY4$Sbzv0g;0S<DSEc>`{|Vw7?c*moXU$q)|3v0bi(yU3tAR<g~xw4
zYX7CFuCAMd+J+Y&+y&AS@(N}y=M!f(X+;{-#yCNRwu4k=VSI_qY^bR@fMhnMSIaCL
zp1zPV3wyC}`Fr*lOgD>g-ge{np%?y)r~7Q&Z1eVGX3Gg5ay%P=P^P|=Lyn@dlQ>H8
z^a$s~klM6ownW!=&fd|jKC-jR@G&)n385I2+kyuQ)rGvI=<(^1*Z<i(%qcxTza?DC
zz`=u0&bk<X`h%3~w{LfO?R7J8xU9NGZ|}NH=w9=@xP&twu3DoVx~A$;w)$5-P0j<8
zM6*xiM_D+I8{DRmK$6nYkUE(AVTpQ(LW0C4nncO>>95iykVRQ&9fa5cXv<*0+1v9p
za$-e`jSb|2A#0YaM3_Ozl$<+z=FCG~mFXt~A&dzY7&qo0be9B177|NJ2QRP_-O9%M
zXu{V@WXk!a+5p_33YVGuWdCl(I7oU-lb{fo;8h9M6tPN|F1HWrg6j#W0(eQHpAldk
znR(sEetG16+N;sXc3UtaiClgEsJ~eOd{39$i}H74W4m_}Efo|*?XX!BV9PK<lkuKE
z&0I-|NLgD7&6|6y82MhGIz)b%sj1tQEqBwa2)JA0VAAqVRo54Xr@!==8rqwm%Kfjk
zDl;t5;FQ8C-0zXx;5iT4gkN3%v`*$2S7H;GF{>%F=#=Da;L;v2XU?(1A9O|?(Gh%1
z=HAOt<TQx~&!2a$p}xL;)7(is1i}hMlT3`fAi+S#QxQ&zuY-vRWhIlVTOb+uwK!6A
z!u82e<<S-j<~aC3x-^1>3)F=L74l3>#X>N!IK?gFdwtMmLT4o{gY}X=eAUOsydCLP
zMOwfu0F6B>{vwmAn!=J$220ljUMQCXOH`99RLW(+IT2?OKCcbwmkRKIEGq@ni0X?N
zh+;OGV(8Etdn@xKX;x?}ab6A@lqLv2jEBgG7!1R1xx`@?T3TAFyp53DU=9UCkV=#q
z<H*z!Pz{}#Fjo?FviG{jzi2%qJ&8J4@J)szrsQ_-(WCfcWhs#M0Lo~YigfBSfaW^T
z8$QZ3`QFS3mP95B$rhMFvb;T^;AP~Vu#cj~K}@?UZ&nvw5gH?dd$k=YJlsYYpA<EN
zE}vcGe^t0J_Ql8N^c)TcEtxPN_!4Ife${;)R@~x<nY(Q`iIfWY3%{TsRq#5I49CRv
z;%I}5ee6^HYrDm)MznYy@(Fkc3_MFC$p$BiKe2?x!1|6!FJv>8?hz*zsblG2LS^XK
zjex3nE<Cbs`Bns1U-GRyWOZ}9*e`0<VPhAR&WnE~M6~MYl-!tw`D@wIrMfAq9L*X(
zc1eDWq&bLKEd1&T2yxUKxd#`a!AD4y_0BlvVo67)-Ae2f*`D=bOQ^Ism#E(`m<ij2
z!lvp=Qi_1ZHOD@3fOT)XMcp|GAA|l>i0?8I2^b){qv2R$!pji%NX1QFjh>|OJgu(!
z+SBvp`b!&|3g%0oOMR+(zUB2}gPGY>r%{Lszzvphkp#pu+!As>&v+T6OF1cuLf(PM
zUXl{x)C?51bgNjOIJ+Z*?sE?c0wpRfc<q)cUzaUk4tpnQUzCi<74g~vygk?M`}ww=
zn!5VQagJz0#el!Ts!EsN%6DZ+vH64?;@_<z3_?@<COoe>YdzZ_dXD^p;tQfTzPHJ~
zH=c}|3>EV}h!Cpup#BSo0&1Z#lK%$PW4!ylb}{!-Bnyf=Sm2|WL0PC1Tu2bUa_iP*
zzWh39yHh<YvpQwEBp**d-Fw%~w~|sz3X$bc!oqC(_YWwqh#q0e>yorZQb>rUDME*G
zhFa+aHw9a5+&cr8W#9|)BU7q>&6lxfze@K=smu6;q-3AEX8#sHbH~6g^}TW1mvgM9
z1P9vr%yp$&9DtN8gHKlu52(-0veK%paSIo=Vphtf(_zOL_knvxNrFR>U9=i?eQbn7
zU8ZxM3e%PV6d4L27>mG_d=n`bq=Eu1fG?0D2BwY#;=O2}<IaCnXM^@xSE2UQldmY#
zd8ZL;D8b8&c&|@e3{%C7O>K_-AXr!tIDNd{xtNJy1FWr;Wb}mG7;KK3rGYPs3z^B0
z`b??IVtZuYZ@Xo1Xr<_{<=8}5C@KoQzDu>xFN+~nWVa5H-5YXK5leEJQ%<rpBEJ?}
zON>d)<EBi+(QpDTMFgktyx5)6K#1d6Q^-SsP@hRQ9Xi<QBaIH_xYg7VynR_PG@k;K
zVoo8=zL5E{4-)6L#G6q@j-PCo#ZSUwLf<TT1#w*rXT^XdZCQ-MqDop0m55dK_u(Y1
zDl0LWkHLZ(q{gLBT_*81Qi};yf9sZJ%8z?-ct(^VCq+^&d8ouQp{J7lQA}!H9|~P9
zef=;gtym77=bb64t0eKYLZqVh+e@h`<2Qc({Hd5rx?x4<^IQ-;SA6L}6;3|CzNlDv
zWM>%6jO43qTguNmocCPQQgw*_g73Gz<K@q~J$<VBqMb~^!>=n#o`}utFT-734=$*d
zHOPUxf9S3LK209(Yfi&b(<LgNhPUclKD13(!ZindtLJV6iKlU>6V$j42m`C!Bd`Ab
zw6&d^Q@BQUazDyzBQjK3W`nC~Y9_64plHj$TaOrI<EBl1wDT^^`^ZuWq2iFK=}1hQ
z(CkX52K^~kDQMS*(92mf>V>qWZ3oX4caKmU>;=P30}C0~BD!Xd2-JK><895kqDA@J
zg)6goIb;2TQ5hk5l3fSEVaKs=GIYoE#C|rALhPI%k>Cn52rE-wPT|yy3u!6fFcZT!
zKxP$*2n9{f`-PkS0dW$Igh)Cev}yzvnT>}VwrP`j<|mSsz7=;z^O#Zr-XwXD<Gq>C
zpdc0Nuft-eQgd;nYU%0*6T4L_h3TxJV^3fZ6)2u>BZ%P?wv>}z^}TEh<vsj<21Ut?
z0XZ!SU%sqMA;<g`Nf+EW_<%ss2rA@SCxH&pUGYh3EPm(YLQb=?j~~+!%9Tr%LH75?
zfo+JD&63ncczzM2pejrgMK9VPQ6`cfQ}+3@f}QGn{jyB?2Fg+j{xE$E)tNXK<@%K{
zbt<9a9x2~LA8wCjVw0pwYU${#k^DH>lyZIvIZBq?x&{9nEXlrgYrxW_J!R=08=r9P
zZIyP`t&W89aW2Nc9n&DU`@@^7Yk*5i+BI9ID-s(w+V)>Wp5O7S*@zU1$8Dm+W2*A$
z*?oDJsi;a3N7d#C>o6g)0gA(f!!4RLvZVg7L;=ZYaq&}dHLu#X3kDSL0Hf^g%!z!l
zuopxA*7J^(w&hQMNv+hj?d3TaxUJY4fSRgGSn;A5Ci1ibENLPslad0!ysKG>!;Xwu
z%NlNZIq%);*TPt@9~<ZNwlK|K3PK_;#2Yr_`c@+gi@VRxZoN8PBI<}f)QoTlq*{;+
z`0>Z@_q@S@fM`=giCTe3wypj9Ob$`329lhLi#KC);l=UXhT`mDEgZQN-)75T5(LBe
zZMCFR^C(KMbL#u2j~_ia@wmpw>u-t5%Xh<u4G&MWqYRZK3d9-I$xJ%+`1pJfJzmR{
z%aO5pkG%c6hPI~KVw$}zqf0KC@h@%d@Xu-rKgb|ZN7?>^51f4Z%TwYhd-O<p`m6Qn
zBMFH<bz7f4y~yNdQh!=B62!m{8m_tKkQ=_RrjZMsn`mNwE;)Jq+<}&T`({77`zQm#
z89f6Dc`^cAO0(LD@k6N>8CEV(uXtQwT5Au>U$WB5X37-9mEPXUN?aF=EJLFE-S_|F
z>d)hHZoBSrJS0i;KoTXH%FwJbqzSnSxhf5sIW-SRMU&7(WmXxBD>M&Eh0+{FQi_r$
z6loGFslIEU>3)8{k3XK*b6;|HzK{2D>|^h>*IujE!_yOd?V(4TP_BS-A$-BaH}}ya
z`Xrhn2FYvJWU8O}{B^7<=mfyDSy)IEQdjqzrzui>f_6b=z;B_Wj${WT46tJY=4YQ&
zb{z>J2qVdBwo056M<RwMSlO2z(tZG;1oz(DjggNYtw&s4bw1_STxf<PQ2Ghv5K=WX
zG|S5G8#|FwaY3+bMt2D=?5-XvD07f~C=+WRj%*O77K9+kk){YF%%D9?3ow~pFM=`<
zxw`~>u`z&)<cSl<G&wtyi<499=+WP2hpMfnfXu|pSr=A2UD8odeq!Nz?r~lBPu+be
zM#8PLmo0%@7yNDlDh+*Y%HBPoxlPRFvao=94qM2DZodJh!3|*=3-~(t3$#pTp3<<E
z?b&?k4w53|fGh#6tyZ>8FOSF9emPv`YpsVon~Y1|Z`IHRa<bsT>wlAQ;9%N74!fus
z5XrhckFlc0!Y?4B2V83q7CnRbfp7}tO(7_zK)$xHuy961p)iUwpH1KP15bc1NUGi-
zP=Z<Hdx?pVZzBI##;2+|Qog0*UTZrm=g(1R(3U~9fKm+vRnLTObi|uNxjN%%{=A=5
zIuXYQEoB2KZRpuRf_&DstzI`AQA1i<y2`r5w{HIxDI6hk%hoDriVF~P3$mQ()xob)
z)6Ff-{3g%;A3D;V`Xb3q?A#KX(7_A~@?qRaosQ>1>`sD8bv5DYUUT!d_V(!@py0x|
zDt(^~9653rW?sHNK0a*MJ~THk1R#xHZ~|oDW5<paJkzZ$(Xotgi7eT%{P2^motrlY
zq=#jy;fKne*Mj;E4HO>bCuFt-M59bt4QeSsrg@Zi!ig;Xdpd;faBl^8n^_~QdwF|v
z4V(0IcEVP;Px=GfzicN8lX{YcRj<2ft?@EkAvnW=*Ge^?VJUU#`S1qsW5>)MNc1}9
z<S&U{bO;e1HWv#ObnJd_@H~82*m8+43I|SOkiEwFiUAe*{TV1H!Um3FKd2N<iua#S
z0vp&}_VtZb8FCjyol#yPOPt99n56MZQE*5I+udC&wEV?@PacQjB9wVyT4ASK_V%=W
zw-Ew`--WRFlMXsrrp|v3guku-{mY?HZP#uC2wa8NGL<%Bq%B-`?u>L7Ly}LaOIn)S
zQ*T{?%*l?bVwvQh)yoE)O>-UA!rX%hBq)LwAH4x)dx!A7;xF4Rq~k}um497b?aui#
zO5pRn?Axhri(VG#J2O1N9tH)O7=4i&e+mDEMHFO0wTk9uYDplNvQSIm^v#<s0X<)V
zyAHoQUnOXoCUt2$Sydvv!<1^K*(%ME9j_m1e0LbJ=xA@3v&$$|IC5;MpYD|ofE|?4
zl$W2r0he12Ttb?r4|m*^_v?Ic1!1`zgczEyaEftNFc()ezSL)YT65L^P~X*U<s)wn
zo(}}rzhlP_J;rxqge8$CCmjZ9_M!QN6Gw8qRM3JR(|FB5W?^NM!k3TOdQSh?n+hqP
z1p;rYeLKpb;#^#f9b57PP@2l-Cj3fxDc>B7{GJ$R{Sc*o9lxKwI@De_KK<9$Z-*Fj
z+7#t>KYMm=dslS6QIFxJ^>xZ88%}>oSKzJfe0O;K+~<2rnZ@=$?zA%VxeCVZFJJW~
zw#w=;I8G{MzC6MY@Ghhzb7^!m8fLK2Vqw9g^hpp;NuEk6br9rzBFd1y5~x_hi6aQ%
z;kE<jIIu{9XRNu4V2bS-Weg<i_?0VHUJP)~`fs<r=;<^y{5_*Xh@<0x#q8SjEl`5O
z_})w;G|{@te&6grisa)Ys_lT|<y!=!A|85YiL5eF5w$Hewf$|1(TjlH_aAsW?K&p2
za3W)}hk#>&y=104`un%)x2Qt~`x$O-{X&K^Jk-9SKRZr_&XZ`Gz1$fW9<wWR?p(4n
z1M}OV7z17e0y46sSv183u6@nyaL}o#b44X6rL!1-qj?%43LyZ2^-5CNsBn<CbLS+1
z1f;NT-3*rL)2GAIk9;-t-8mm(A`4drr?MV6og}l$La7Todrte$qu$<&5&v;aF_=dK
zFsQHiL_{10GbjTM2w>SJ$&XqXpNSgfSCd43d_3b-XBFQ`$<;DmKf92BxFb)nj(pb8
zFb@bBWoihXxJN+7s_T1uE0{-Y3o2d=&mdEk#Ym{_>)+eqNr<cjY3+1Wmu1Qb_M_L!
zf8?A7T1QJ&E-Uy;H1YLZ;H}GvYYVUUN99kas!Z#nai?YrOAI6#GBThI&Bup;)AYkv
z93zWxe>tutp^h)`!Ee+V=4ILjie}&#P?h8J{n>KjG=n}2;KwTI7YP!yx3{yfV8@f<
z30o4|-wCMIpPz_OUXGnrc4wz0x(lEtfq^%}E%%hGf9NW+r2xF(q{|Y3q#w1E&C@S)
zI5?<)Axt4jq#kesGG8D8(^;-<@)f~+q0tre$^mjY`D{vA1|Xo!2wI}aGmIkPM=6NW
zgC+b8z}<PFNXJ4LDDsu$9gq+kz|>hgO)<ekGZe}NW*a{QHA<|Lm96pmMhP7D%ok77
zOWRL@nTQ&8v(zP<?;oFm50LGB{a5R~=T#7xC-kZq7iZ6h@u;bmi>;{%Nvxd^Xnbh0
zK>osPJ!&IWR##WY3w+0l(n&%Gf@N<?nHlhO*w{3=#?4^?oumJ1%O5tfKtodzTGsvO
zrI2HoqTf8YM?J;O3SRasw8LnA2?axD-eYV%VsoLNgiN1W%MK0+!(3w1XbuUT_M~wO
z6%dn&|8A5BC#?Xqh#qAn6JG>*CO1-1ko8IBrR<KqB|7I`7%U=x6&nn5p%}aHE+-TW
z@3MV^C}C()Kk-sHR0K*<$+-`-Yd>;2u@TBC@cZLsp~)ngvraZL)Ahtp(ubnZ{1Z}w
zI%^fCXjxc5fF$QIQ$0U8-!8zNxC8(Vd9(oVw+%E|c42;`WvozIazP}jW7CfG3jiWe
zh>_O!*9VrqKZS6aZAB9nA4Z%6Zb$j<Gq!(sDV8<dUZcIh^{NT;_AgUHR(Lgk4$c=T
zP~#xb?zFaEi~~48@C06YAub+SjIJPNK$4a?AM|X{B*5Wfg)SSpTYaxs!W2yJ?sm-n
zgLI#&N#@WYa(t@LXxbOtC?t;*Jn$cBOaU&s>!F!JR6D@4A_5!soB)7M93%`!VlJVK
z6vxpKrx_R+Uyy=q8Z;ePXnMmP1B;E$&8S6ba@D=PWHv5P(j!v`g^l7b4Q^qJqkV{c
zYb{YG{@`LqnNQ6-6eV29p*u3rn<wdg_*=13`6M<myp`Y$QuG382iuD*IhlqB(2;}X
zZU9;)jm~D@h{B2dm~ts#_aBaSuSVHHY-#q{vk4x}hx#YMMQA^71;Gpts`PE%%d(OZ
z5_EzUf;Q-#^qoLmZpX>cM8Zrr2_e$t4oP(k%TpW|S@<+`$aeSiI47m_VgP_DQdDVZ
zpiDzqgHSh?J##KNtsMu{M(0zhizrNmGs(vhDGZiekY>iuO(O#epp%}2vlJK4fc%j(
z6fWeQMi0nDBvgs+n5tuJg$2+n%dBS@r8W$D(Y4&+lw8HS#$U`5WjKN)(SvDz4g3&9
zqhjcn_kX#2Ge*x<M)-`m1h509j?Kg7BUj(ZPLSI$9af}wOfob9#E>1L+YA{K<Pa%?
zL}aI;ElPF<e84c=_VSfykqk945&8z8iqZ-#>@sTbqEbNDE$;U1Vqt&KQ~&B~JWBBg
zNJ3M|1S~+3fx5KR!Nh%uyA70L;Blx#5&fwcB})Vli{;HC@C^tMqOlx8p>SQK`;8qX
zlK%-U30c4LAA^I%@ZAa~t}{A0XmgfzzOO+ZOd8k(jM8u#X5=vW7d|mGF@E!%QVHHL
zIQ2wRMvaGP8|<+_l7BX}52@=z^AR8b4Ir4(w9>lM|Ae&toglMvTFl9UQUo|Wrf=2Z
zj74xFk(^m*Ac`h|LF*7!h+ql;PD<ILzUwDOCmyHl^g{HC5WT|ZEg=Mm29er&dNwlO
zuZa91W!b52a+KF%k(kNGm<lz5P!Xd9Jcp_Di_8BD2}wDUZN%Wfx&h!|;(u^|AMUyx
znTGZ^g1fO~03gJ(@TS2mY%8E}t;2_Z1kA33z!pVIg8Jdna00WvRAQGOy14y6A_wEA
z-Jn$jGoiMG(M@D%SIo7p6(bNuq<Di~O&od-!#U_VQ|*o3fG8d|?bsjGPqb430h0Mz
zJ4xmM|9e$aV|{Il)FlZpu<3fHJ_Wj1CgDq9$Utk1;xiI;=Ub=scw2Pe(*z8`NaHwM
zMJQ85!poA(lW*_Gh!hR7lA;j4L_1DgSvhb*6lLU5LXHGPTt|-s#E4+u1^9nRKnIbv
zHTmI(BE<Ka`g(@XUKGp-2w3v9v?rl>aJsV#-Qz6)mnk+Nq-;)nqaktbDQFf+&yS=j
z!w24ypgSCRCDJg#Pef%s`T6}jx$4kZjV(C;+>UFd-O8Tph8{oB#gsEpIfaQ%65zoA
z0TQRD$QNh~4~mH`HL-l=Z)iV|5epzfay$V#n+5}Hnj9gLGg(}14c=z^iU<p6#4IS+
z9-N|>yrg5CXgS6KO5|Hyh__a+Oy)}{$Js$utB)}ql)ivNP(hRY96IHUj?qwT?^rms
zjQ{aQEgy>(FY<2UQz9XR-65M(w5#bt<yU!JY%>n4UQb_IU~lGUc6o#jG}*V{s+7fu
zRg(E5YQmwrL6ST`A5liZ-;Xpk;FjT6J|>Vb{Z!O%i+<b6Y?LivzsG{~fIAo$6;)k8
zEZ~pL?ddtBesi9+%DlA}TKEuLUv0g;Gy@F(y!kihq`Pz7>C5K%;qz9*>&601(*hF2
zBcUg1ixdh$`#2d4H(C8QJ-1ErL`0Ka_aH)Z5E|hCzSQx0(M<)B7ja9}%@U({I0?dQ
z`RBlh>-C^S2~f*juxm$>fjGv>#s)e+_wnP$XSfWy^@uql^q@e8aBUc1DB*JLg)gqH
z-AJ-!S_(nY=*;xG>xNuAWiJ%(#u;to$giS-vrjTcsBqBv*&3{=rLc@EORo0f*#~G;
zMZxA57m?M#@Gdk6(4Di@D6ilDe4m`Y;e>|JRbiVe_IXYK*YIZX=+Vt;yo*CU|8r;W
z9{Tlbi;Yd3_Ik7Q>hQE{?j2<&lHs~nth>k48EGKjws}V&^M#`7yRmq-Mm94G=@Y9H
z`wpJ^IDsbhszm~sH;!jl)+-~WZ|&=gMDcgET{vaTHrznUBk!wjXQK%23Q#_Dho_Z(
zoO8}ODLne!kP_t^8v)@S8QUZy+OqoEr_r$64~s3jJ%1c|Tm0rR*B<_+H-4*{uI}>w
zx@w+sliI1Bv@TlQ@*;i)eCG5a&zKqacU13}rIaP&+F-f?SQ`z2g$u*X|0=Ro%$A*|
zDS5+wfBctp`B%#SY8AggA%J&`A(sv3c4&PR|CIvzm&i(u{qb9F&reU0$8AU<HZCrX
zx*?DJg7=f&1ek<6L|Ah_X{$Y)-aMpqZV4I|g}5%!R5_UYzNQU+JUS$ZTk{<Mkp+C!
zEl$^IIF95)zzKQpH`>EQ|InIT8XI<IPo%)Rc;s4utFm@qu#^1n?p+M2G}l<;bL@eH
z;b&{^Ng6PI{rVMQydVAc$!`ILX@%PDx}jP&Np^Ex(C4gq`w3yMi}xC}%W2vLfB{}T
z<Q%pJYkjN(aHPUuc1Zcmo1+32^H(3P(sg`De5!8;F)m`7{JEcNPX+wH@G2s>PuNu<
zahbt_yHr(~f7EK~(TyELpIum^b)vh8VWjLbDO-q$hu^&A@Ou@BIO$K57(3LiY&vIB
z;~F&)y5lvy1-GYoWxIjMg+ewQd*N#&x)wNylmde(6=7?KhXwdC%}Q%+3DxIl`Ej8%
zL|l2a=IT^YP_$i6qUmRH$v3n6G$dpT?n|(iL?Jq05R0k?C=<?6bgW^U5hDm>m8Oi5
zm&}v&QBwE98Y0asH?Z0M-)`HhCbdoY*9u2+p<Kj*CLjXULI@|HQ;xWYWasZYQjx{v
zriduB`qlS8`86IB^?#3?WEggn{M6Ef%8Y32dzA0Ie^M%$mAM7brRHKm{(7xl;=o^I
z<D+C0#_L3Cvom~M2XiF0s$eLWe5b%nL(qNwRtmC0TP}ZzA~U48nki^sK}xy!%@wUB
z>ZuJ2nx1o9iU!9k;upn9^t{R`13@TY#sS9cOVJeiy1H6Dog_qrPR?mVjv9Xbz!v!_
zFmGGS$2mK*dAW8d?>M=S(K5LTv)|~k5bTY|jnh~6>T_^%+Bdb;>>kkj&Z+*vJgj;e
z`<Jm?Q}qbL`@rYR%3og^{Y>_6dy7@Ng2;MH<NIxIGEszmrqM*p44(^TSK0Hbb6?*Y
zKwOOJ0u%7<o71E5%YS!tMQBuZg(MEvbKSP`>@UvC<e%vy_a4B+s^;C=`Q;lBI?gRO
zVC3cGP!}Jm7Iy@8uN&SLkxQU)#R}NmHGiZmDPwH=R4#Mp<VnsNW3)2@F|UkRz64_e
z$oWsatbNE|`{Nf)&)}wXb8p+}n$e65Zt@0H*HZY51;HhT<Zbh@W9tFZcoz!b{?wJ0
z-zpVNBbdO>AfVdhdtH>}B*g_VK)0YDW^t@XktCo%8WU8E#Cz26$lNF;Uw|8&{xc1{
zB`V{Gx2%lKSOHiJlmVcMwuvSZd0&ClYf7902<u&PJ{rhJakkUcJ7YDo;Wtmo*nl1q
z&Hv)o(N}vv3NF<p?y=>WaFQb#f7s^T&dG@~e2sm5`=Shk0Sp1~FwHiCQv_TZv<*JE
z)FAagF{zb+^coRr_}xw|?xlzCeL+7A$Z1HD{IX??w=Z?l8n_Uh@kb>b{4XN8kjYDg
zm|6s;<q8`bJfC{AImoZC6YIeE9tKNLIOF<pQp9;v*p7XuAvgF}+7ICey)NrNmas>O
z-4!*q`bs6#4$e%VA102Tb+dTVm~*9b+oP~Ao|liioQ{qNxpw02-Ma%(E>PPfsKq>K
zWf_J{;OoJKNIB#&&;YvV<X1z4AvIZ=ZYMu8H2Pd#?fbm>QncpA`<`I^<GB%0eT|z)
zX|AT9UrHWQ^7yTkVu#*nK#?Nx4sH@{*dV{EfrtR<FDu6vs~VrXNML-fOXWT}+rZi%
zedE0XE<>LF{?A_L5B>Dj{q|sN>CVw{=c6~H?0=7MJpcQ&W6q-6ZA<>Jb@kyteak%~
z9ezJ0<)zm-2I|dtDD!=LRs{(eSojfp7m^w&mBfX`p9-BHW7AxJ&j_|#6%FqvKxNc}
z0|r2u2l@zOJJyOWFOscWm#ZR(xle4Zk~r<tsP|<)enQrF&K4o82-g{sIwk*gRGx$6
zhtVn-?F0Vkvd*12dB>q;bf26_L8OV<%gt}%>&JB@`4Ln*9>n_(t$VRe&O9}1d((LD
zg6BR<a-D7XU(Q&1z4WEy&OI^v6qxi%Ds22`H23zorVoZjxBQqm9zWkM8Xz=FKv`KC
z-~4rC%7PYkamLh+4XL<KgNbN7(bOtUY`#!z5Qk@S%a_rJbMMyW+9x8rLRQ#@j2ij>
z!Uu8}u@os5z&&#?LC-gLF3qr@)E5z8*O-2%uP1-#^!5727Mr`7^T0|=HgbIUsisTb
zIPRV4!FQ%{-nT}2qF>?LYHpYN^`xZ+CzcN^#-yU&t9MwdE|k|x#f^@0u^{I{5-D<y
zm}`F*KQs;6AC#~l91-aOq7p@Dn{Tj^`&XH7#h<QuE<9OG0aLP+d-`-U5N#0ty0<F<
z-VYJALnp*I-$w;09LQj-fRqr4CrLSp6DP}26s%q(&?^uxYaWq26g}Q-*`#yqL-tQs
z+1hhsob`D}(k4<5e~=zIu=YfjWzVsKs4pcJQ8I58>ZR268ZIEi#jREZ<|!b^C(t@T
zKL>piXs-V_`o#U4d2NDSTYkPjD7RKnXaW~`4Q$!Bj^$3D3L7PM6Yd)Oc9N(u*cqt)
z-4Wu^N&_Xu_<A<Nw406fy>bII!Ep<dohVvUIXV4z*Wsktflbne>wMk1bzVo0s;Y|4
zi4PUHzT-f>Qf8m6ufW>u4V&uxHGW>R->b`L+}ScWtA1R}31wq5cQ$9VF5~CTUFj-s
zZGSn%HVtKMc-!4#T3MezG~{Wm;Xh9*>Vwjg{->sfb}bw=;L`#U{xzBtW!w1U2#BQU
zV^PWlI^#QMwe{2e^(PPdO5C8f?<feMFOA7_5<S7tqOh9Z<2UxnAbfp|0%Ly~2!TW;
z!LH&H74^|>QnzhDw}Fa&0A9@OD@=f|CSc3az1ne!=j6p^p$CVh#YGS|F>f60=LqhJ
zuz{;G^h?hPSbHq1*^dhYfMALL#Hb~DPq#BN__5WE{41^+*?=mVxi3Dsybe7GH)xw7
z;U-}cY_lWp7Kt1qoQ=pO$TD*<jWtP57b!+iM)}i<kl3|Lv`FROIs|MaGQ0xOTAzu;
zfl`_3B~V?_3ix{1S6X&-**a%JZcras0XYSIZg)Ylb12(w4B~@dfHwYo%L|jA!Z*cm
zJvm!1ZUWtTQuJ(vAu&{JZ{_EInEbr#>sf>_(V%lo%yV>nn17Bu#a+#Oa8RKY>dg<m
zP(+V!E6_e%B3x)w$%eaK!!vG^JimAkBjJq$dKj%(YT^X~$oZDZH~BexRDku;ol{r$
zc5E-}ti52!(`1D&Qm}pe)4)EYVJ5V1NN&(1lfmQ|#N<0Of>jv|aMDQm6y*n53IXvy
z(mYqh4C(`9_==+iDj#+uePG-)V9h)PXaczzhpbzpN0sN)-e*VB77e_+=$gu*vj9MW
zfiuww`B5XA7$b*9veyG)7yOa~r%m!~0O4$bhb{}qlIxWxE0yCy^X3t9g6K!~V^gGc
z34TX7xPtRgPLu?wOaTf;p54(AZ@VuhBHI2RDk0GGp0;LhP5=sUxoDj|pmFd_X4cGr
z-=;j7jGq&ro1e&KQJZ{K5kt5|l><QK%`BhgATMo%7By~LZosF&6P`9SG^8|bW-hRT
z(NuyX>9nvV(k=<N1U!Twl+B9KK8R*GI<1~#oE_bB%<K%DqT8uk2hpIzhaZ@mNzn^N
z(Y1dn#a#*6z_Fb`%b=q}x;en;_vRgbHf-#VE4=9){0l^&s1S$VFsOUl+qPAtInQE#
zM9mwdE_13k407J^{be$lq+bamZa1cPmMMg;nS-`BX88b`le;Ow$GD<c*6d-#M4YpC
zy!s9=5=6yfWXaeOpa_V))LTsN6}b>(;dtObz2NeGh)dF%q2f6SU6Q$=*${!3ruyDE
zO#8F;%i2p8<DpLYG}ukt%F7c0%#L>xjA~uRdD`Und|Ui?TVH4e2Ytp>;<fI$9)^Y$
zk<ZXu4)8`f@LX5cnQ;t&Ba#N}6k_0_GruH#<>a2WW&3-uP;BQsdbDKu@>$5ylzoO}
zz%OC>lPXuM5WVvKB3)n$;DNQaw+EsEW@TmNY+++UGkoFdQ!CE5c%&#@E;Q*XD8$Ql
zTnhmfMl_O#4;zyB1p@SJ8FR7TK+!QoL84_>uQs{4P7miJ@&kGhljTBZ7o<q>M>J@v
zE~;5~Zl&bDkS>2fu!Lw45CVB2WSDbp_kI0&G&4JM3@Kg-Sjup|s+gO(&l9bWVsKs4
zU7A%<>OJPy*geGs_<O=6g5BK@vrI04))KP8WuG&rn6yRSzO~a2efO}($se^7I!1{3
z)bB+Z=c3hqH=P`{z-?29O!6f4%}&>LilWO`JNJTJ(m((FQ;cY?j{}7J>DJx5<w^XR
zu1)X8X|o|w#d{={FjB{@c5|4I8dAp19InFlnq8~4tf`X!=UPcgR$^=yo|r{K8DNmG
zuwV=vW~?%9`{R-1LY@_5j<bMcffezczParc2pDKhW{!f80P5TNuF}<D`GW4pxH%6s
z3OHVKJ*;(|os&xsfZ%cQo;MA_fTSMt;^wt$oN8(X>YaP;8vvkJTy&R(xO4D1R9_3y
z7bN9P4e}WK_lL$jge@^dJQ&?K*10YMJA*B}sqrx)f^nMFw0&~#Zgr1?G75m=Meiew
zR=2gw4c~u)dSXH1t5gLS)J9@rVi?8PTbBpWc|VR{<|sUG(2i(O?}QG_PEdsQXIH~&
zIU*+~N4;Z{SO2Y2uS|d$a8x)|TV@V04w{3;k9U-mIXho1w|S3i60qz19~)yau6@(z
zd+58zAf?bx-?Kdm<-)m5j}clyKI9g8D03s@i(}Qh4yxCn-QcR^ig6%R^5S2|ni@x`
z?~k^t)K}kr@U<C(jTya=f2eCebO3o!+%HdDDZq0}JO3*vn7B1aTG<G(j6n=gqw+es
z?PgZMaP>cqEjs~sX`2t<-F+WwQdPZ!=yii<aBGmG#rfmG+^#-(g`-pBjanxY<RnDi
zVqiTynE|0Iry7P4gfsgjL)-!_*Ia=>+KJVSXT88_Aa4RIlBT}d)p&YREMWAF(sTh0
zPuWc;Opts_Uz}$ls)TLiMm9JY+4*3<!iihyQ6I!P=%n2m^!rh>6(J_%oz82tLxv!;
zBK)Jo7A<FsqhFS0x@%p)O@t1IzPsz&w8D1QZk$K$>sDigfO~WsWnOp<xF_Y_g*NJe
zA!0Euvr=$JD`m;olfOv*qq*7nNDF3!B&B2@dJi(7Q25lzpGVPUaxj)eOycUF;7B6*
zjcFdj=Z0<H{bXW%6t?SE-i_bDU5gZnt@*%T?r4=o3uJ$2PFJh+3BX>Q`{?GOx3-a?
z5u=a#^5vUaK=6t?zi|)0Cjc#C5}xkZ?exAr+j~{n7&c<q_P6Av1jJS5^m;@!!0CzD
z7wE9fEkBHFfqq%|D11@LXzj&^oit&3H{wx3!t?fA{tud^kB9ux20&yA(6+&O-1jQS
zpVA&+{fs{C?4(ifDIa+Gw$Nk!8X|jD+(<E{WcjFG*ZB$5)03Fk${dPY-<vao+^KJM
zQwfHJTFu{p?kjKB3sP4_Q@5&}Ud_3vC}q3%m}x6Vt84$<A(9n^NY~+44LZmiEPQ_&
z;#8p0;Cy;%dH2nuUsvkiTyWFx_e$ItFS5PSjs<UJ!Qrc^3MK^*O*-v;pekyy)U>@!
zR6_C29ZygX5Yb+&VI*R725%jy1#}P=OZlb$55dyV2ntz+18$8}3`M;dXpZoAbgW@y
z7l`#E2c;4Mi~}MdqHu;sJ>Q4v2an(-frNL`BDHU*qz#RM0ZghyU$<Jg&P5brBeTr?
zK#nEeJ9ZG@Zc^;iE*&D-7$jJXp(kLP$DNO+DNLv=nY>3^K5kPcpl|@ewwKQuB)*yZ
zX35wHzRjxhjyC0ztI?hZ->)jZueW!u&_j#VU~q5N>o`;z-}CL9S>kC8JRKA+#%RT5
zZ@q?|PoMVxn+2u*i`sEI-yS3OqKuS1-PF=zS@#E#1;U9+O$Trv#Wh@yj<!GN!+S_V
zH0RIA>miTD@nIapE!v#%sa)yqqoD9fOIOBi*Z;f0Rxg{)u{VXiZ-FQJ>0JUt2t|7Y
zJ&zp8HhFJzJ~Set3B?U!5u^OS)YQ~_-qR|F)YJP$nS%gp`c7o0Jj%#;-7X=L-o0jW
z@bN``Lw&tc>yUxEg~bi+Kqq8kO1||m7Rsz4Exm;P*ffJWJBR9RyZ5LCn7#LLZZLPT
zinDyLD=8_FDv{!8vz#=Is$<WgUTHP3cjNeQ2NstOZmCk`r-MkhIP+W_pZ%UM&#;1R
zQe4CfSZ&ofUDedo9DocL>A9ci?W5^U{0>IyY3AN-@qKu2&JFlgEX*u90Hsse`^O+t
z6t>;(^wi+*1uG0C51>dR?@Ew@XgU)BM47E!no=g%_LyMU3fUX7c*blmU-<R1AT%f_
zFe~f9w-G{LFLqojEh`IC9S{-~J@cvZ%GK+CVifGpcd-_M5a8lov{P3|$St?j9z<2F
zu<4EIS9Z3mfmO{|t^(mit?PEIYaXV2J9f~>={?4<$`MvyY_jd>=+N%zL;Z_1Hq)7#
zi%V$X5L%Yb=c)vc1-e0=5)(oZChrk9d9w-@&L2V(J#^_mmpNDVl6Z`i)bmv?YHDgV
z?rM--p8JfEo_YZ`ql?$AYgpBb9_Q(Z^+X}INOvzs?-PaU+-Gd*?q*G&(}f?t9opsS
zJ~}cI>c_M2w0q~rj|2~uUcYf;jZ`o3!9gIJ+ox^QG55nPz0&e>9u*g}`sN||IRRZm
zacD1EA*#;3x!$HdTd5CezE)o!hMOAWUhn#*1n+b)<Y$b*u(4}9F)#+9r56?lkY~oV
z&prRsckRCPr6w&y>jU3<W2h7|Vb1@?nZd3axAN*WUi_n1^Re;1Nt{4s6`Ps)ufgjk
z)1p7nP7=!M1vAwLQ;Ur1OZM9LpEUj1&bgsdol>Naf*2Q|=^H2>508%CM)F1SaA=%;
z?s^rHvK?E+3mV04n1ey<(-s0n)Pp76><uFyFVH+0u6gHkyBaY!!88B|Pm64z;jHb{
zkQfnKBDH3Xl93%d8d}=dyS{z-k}j-P)!b}u(5>J4{lJcvf4AF6$A~Ogu)0wY&^NL`
z+gIL%@bCLEmLwKh?T!0|5}D@C&d%it(~Y6>aPMA)gvr`?4ZfWBNAq0eb~|}|B)ut+
zvoc4)gVN4obphA}Zo;bbKOwuum(@GZ!mCiidK`+TDCuloc@*FY0Fufay8(KVbV6wl
z;7R5+n1k&`GTInG3iW#uQaVM8D()5vEs0ikmq0=1{<Ur;1apP`)8>?)&3ax1z9-Q*
zAazW-dB}__w3eyp62-R@#AdMv-w%oJk#n`P>pI&>YA5Gp>#BcCk6arX928KjxAh-<
ziAkf1?)>xK=<#j%=uL+e#2=rE!BhyURtzGzKGD1Zb@5zW0rHQkFVE?4`rpZ<>C_p6
zpoO8mnm;jq4E<K)Gtu_R8((9O>l+)h1KI)OJ3ESL5ELidxt+gSb8$)=Q~rR@{HJbW
zMO29-{{bouBO*Y%Ra~@AL$0<_sFYMn(dH%I13b}&fB%j+T=BQ0;Fzk_gF<yB2B;^E
zO^>$C-W<1SL^*S}{GA<*v0hD3(~!oJJqn67W>FK)GF9mE3dCr&&tzl63n%&#CU#^U
z<sB1H-eRC2SR<CHtS&10?N$hrvWP7Df4zv`M(3vKI`D@=dV&Xi7F6><2o_-o>n)@H
zfyW?b1u#JyfFLN2wmD!B?ucv5OuT}r27&*#LPz$Tj${OQ6hKVLp$f$#c0Y^EBV{5J
zlh`nZHc5C6GtFC1F*`9@3i1C-?!bJw5T&KWUsmxA6GTYXQA@3ol8=N`96BvnC2|)*
zW$3wYYRRUmupLDY!p?vYHgHl_4#No;c7e?i7w{=vQrIT0ENTdp1{!<!iee7=b}<Pr
zf~z7hb&K2J!MS#9V{F;71yb*r{1Rf)rH_s*KLo%QB^;WFXQAc}e0kPZri?h6MgRNa
zx{30k+eBkud_x2U&~yffJe<yK_kTuhv<t0c41*C6gbp5eL=<z)?7mI)ik`6>GLoK4
z@^4kP0EGfW!l4+6q5){kr$<#eftW*CwtzrKm)qYrCOGx3Pi7_PqOxXNE-7DJR3u~n
zS`w%h*iT?*5x$Rijxqz6^%g6uTepjuTUA2j->FBIK-*~F`q&pqARnSGN6vg^sIM_=
z&fT3i%aBH)LTas$`THv+=WYG#j{B=T44V^u8IasC>q45DsM~0;80!Q^J!0?)EYjBC
z?aW=R@Of&HB*)c$^=^24U(Qu**f(3UIrdb>Y=ddDcz3b!8MtkJ>dSkUudvXgb)i1r
z*o#}rQbhr+U5Op*!`h#B$s`3<h_eTV1ZGr7O?MG|emQ${lY-8LYoCU;wjBEKK(8_W
zj-E(wwZ|uiqX#c{UfZB2?wMa*oS++*CtK{VFIbbY|DOjAZ#dxDgM(iRmLieCOy9&z
z9=p1<zY;`CdDVH1FX;%zwm#QY=XGm4H7^cyC>S4|NJtP0x-q$YaY3ajlie-;i}PRc
z>i-PvB%jrJd)=m?uhcp;3gyw0z_CEqSY4LPln{|nRr@R1zhW9}VG}mW56>(8-Z%QI
zt7Zn=K1!(qPcW1|3=UV>>c9WO|F4YBc3y_zd_L{??=qQBj{ek0wRl3!01PJqPA|_3
zoqU4M)qmCOW(OHYYn`)yeP=d?$bw*E&rz9~nQ2?c-0Kn&a#J5FUU(@Q7N+-({2uCy
zglJhThDeiKA|%Wkk{e=M&F+25GB;}e-q>6I=lV-%(Jgj`W8`_sZoizL2~8g!Gh6Oc
z*6r#B+;aEb(b_DfaXpef+a*|<_s&TP9)D>C4IT@%^~EdYBy}=e3Jb0dNw1Zk5mZ(r
z!)X`8rdNBb`{=2gm#!$feh`HcmD{6c%AOON<KsWYW+#6+U_RuYQfQ@D`BL$)f@N@^
zWu!)PYEQ-?-wp4T3_L$OYV^x=wq1Vn*)L~5*HYze$)ZI;ri;F)#`cznd0&5#Z7{y^
z_05_O+3~(xGW49YE5YFezG3^z2k?vj&#6}~?6<8YC#Ts3eAww|ro2+%N=PtWH|RNd
zS0&Xpha)~_lNHZUp{vKkR*%Fv8;VcGF+7Z%qY90kPvkeA^!@Bte9B4jlv~!jW#>QD
zGzPB-JG(!(=mlqj^n?ZMhstqj);#t;)NOjzI&I|Gopt%#`VT_l?4CpzjL&z77H6;T
z*8FsJ<Y<Du$n@hZ8y4{pArHRQk>$2On_`OU-PD$t%wpMRvIUGlY%V~v99Lf$eex)X
z$Ev+~uwnRsb9G|g&Gg2t3JqUg#m->kS;Tp>IvDs7r>HV7Y1$#wtnm9Qa<*OkbMPTf
z{&q7=TB5?3#zKR*xXj(qiXn~ZIrBMAafEOd2Vlt2YeWChC-}cKr3R@Om!9>F33F5(
z^WGTUDYW%yl9W>Cy|sb^cXskFc^E~ZK)@7!P;uKm-Y$cbI0el!t-DZm3HuckZ13Xl
zT~>K8yxMbv!A<=?_uq>ZbIsUme#zCXh1<d<Wr0+VJSQ}=xCq?Zxih%9jU|Ai&{DJD
zNR<CSpY;q42@&i$avKl|9DaP7vzmHjIA<K+{4_7R)AdI1{I@}mqAkxZZiS$5875@u
z{9$U`;>pp{JYoYF%>FAI#mc8+rIl`oQFhXP2greLYap}*5e#U;PwRo0JUcu06-dab
zdVFPth*xm2zIBL6CV%Sp)j4@`A`hPnU2St=RO}CO=rrXL_DS4ZI6omFGw*6bLVaH0
zCB0aeH$C!oX~}JVTQU~y8ei8pG|I=$+IlcoignTFEQKbkl$NtX25pyXMx<XaKJGYZ
zoa3S4o2!0a(s8(_wJp;zVSD7Az$Z%G)7f*)`&*Jn{yXtC;oFO~N-M(nf2r>A?CEZN
zxve@qWi+&2_C>dG^RGpbv*`7KZl4AUB7p>kDT|Q_lyXl!Fu$nqH69^{@JlUy!K*5}
z-sj3gado=BpzPtUUC;q5<5<jeBtCYD_ciD06Ek@p3Ofjw#e6<=`V-f^w|j~W&7t;o
z=*ji&d|ky;vgd*n&n20e-!pCgZkKAymUvP5<<gIdspW}HYjT?3G)L}hw)?h3q&)FO
zzHRg6Ln?m04;ucA(K`WV#FN@^8Vwpcc0N<KwslX}Jnq+^!E>4lCRG9CLsqbU>uM?R
zuaL<&$R2Gc70fB<+qt`K?ibap<{SK5oi+cn3p&=xAHruCzk&m&WuHk<vrDAMA#Cz{
z{8JH7!r;<^(z~d$0mQXpyn=+>G5<_C2jZdex^RBM6APhri39i8Vl6FJobn|v%P-96
zt{Q2`y;9hrY@J>$b`t!?aO6a~KjW)kzcT%JfMN6(H)We<YBnTH`3UNtxtG9E(88mw
zKiswwk9dYPn8+Fu5@Bk8Rpt0OgQJ}{R6W<#IS=JD=0A@c<PRom$uQTcpJ5G-a`wT`
zFdrk`HYnOh4H-(<&Z)oO_G$BlcId3&SndK1X@!RVMdNk2o4X!=92!lwW-v8qOU0&E
zp(5$|J`ctpxx~#ZY5X80tO%-7kst9x-XeJOZ|LJI4T!hzAMs&f{_oP={W6)NE05zx
z$EV^2FQPs#tI>WX{+Rf}zM&aRn?^zDyp-a;Ax%TWd@&G>N>@(($}`6qe_kta9pB8t
zyeuj*M7ybZbltOR$hKH!@|XPYheH-K{>V$i!8;&s1uyI3dNBDVRQn7Ttvp^#Y9ask
zAVs2#%ObOIFD>rD)r_ydfVm>=VpH3^IH9VnP=>QBk6T}Gq>%XrqKd1gw;q$j%eeN}
z%<bZ4EG=8iz@+9{I&<_tZg|QLPu*aVO}z|fP>g#OT})+Cyx0`N9Nh9FH~xPYjM4*P
zD<%HC6nW<RO>ebkm=H<!e@nVloPNjT6Igt*W=#Hj|Eaqs<au$`ZXblZycA{^V?~g1
z6(&LS|JY7i&RyIbI79Dc^!yflSwA<VPlD?c5`b=gWIoHEsk0+EBgoqL5aZys_4t}^
zUwJgc6O@-VL_%#1?WlD=p3Kyi5{G5datp9_lY7J14Pu_wqu4X;e+^;dSc`aCMR+E+
z+`J73=yjDY|KIx*iWxZl_gSzv5KRB?IfV~JnXp9<gflrk=?9oQLI2OxVvDn?x5{LO
zhop4xGQ4DsPu1s~A2iLp!M2%`PR?J$2`n*w3LpFbzK-Kjm!d!OW{Nr}`MF0ymOAb&
zvsLha%OtK`;?!Yd#MpkGnS`v^!jy*=oS0f?VIx~5ZpKn}W~afX{LewEkV@B1J(y&U
z>V;T_kG!-xt9rnS?*$C{4o|H~7qDGyqMVuAUliAv8O^ECvBzyGdoVklM6RiIREgM+
ze}F%NGp^k}?NBMhN#pC};{iK0_{hm(w!nB@V$I;_7<p&-Y!btzLNF`$h_3!^dl4nq
z4`)bA6hkn_i`zDX5-#iXHqCh5YRO`mepu_aVt5Abhh25|1t8eacJG^+p`j0e&bK-4
zOJHlTC2~<XMtnOUmT@9(g|Iw>^XO3T!b`SqW==*F|37<}5<L3T`B<j~S*J;4^4W2U
z-4F>n4k|q~XY$HCwDq-s9$`}ERqmk!5A15<J~Z9!R;4QU+xy(XYrmH0@TcT{7J1;@
zq_lnY={3bNt%p?yfIMmnYVY-UWB79}1D}TeDJbTj@zY@vd56CQ;>y*&zrSW4J1hAh
zcx_ljoafxm8;QJ(goH4_L*l&bb@mg1;<w<+O6E0jZ$1v5Pw9ROF*+y+3mrkCcoqmL
z`GmU_&a{Jnp|13YO3dyMuNp%TNDq;nWvsHrE#bb9aYNPP`O)iNa9tP_>FU+?liCB6
zB~CT<lp*TZWI67E|C%kzZ(;(`nq#I2O}ftaJMVD+KgaHtuwB{1vR%>kMywb}&Nb>j
z9mj5RneupK6gC(G65izF_2d*sUWzTOt{_EUuhLWQ@(?}X4<Su50xNjxA-4q`&IZ7H
z9a_Jy1uv<T1M7P{hg*KyEhH{357&RZviXmn8i_+Z?HH)DZ#gg7cG=9uI?6jjL62H)
z7~A-k*3BNsqH`X$W!NyL5A2SE@Z15m{#Go-0)@8<O`||8$h!+`2%9LZ&_*!g79=e^
zCs$zAaWGe0XLr_Ee)izT1wqH5(W4uqKr-T+0mR6{6^dHME)Q){oe3*B2@_oyJO~@i
zK|o}?W>c1kV$hH+&f4I@sldwjwb&HlEK$~gG5yo=h}th_ydD9nx4{-CrJHdGzJool
z^3@pu&E?^v9X)dx51ZNsN8;)d%2HFRy=AoZk^aQh#l1zs2Ej<YKaWO>%lnbgZOt|8
zyqWjsx)=@)^&AQG$HUsfAr^Vcp^e$!C+;PLVV&Wf#fqf@#9EG%!HKl&>Wg}D=X+~o
z;wguV0&YZm514|{qx^>MQVgR}yOyX%c=JX-|65g=1E5^i`};mBzey38G^IeT%$WA_
zjs&xPSgJM^7)o&0v|HLHvFe}~Nh1Z{W%X)zf!v2)o(*76$Sp~u0UrSsghIT2n*Di_
zt%pMaHx)08I(9#B#%b=3H4L{z!VdnA;J}eU@e-$sir;XKpvfHa!pZuug_jlOXu={h
zkMVPov-g;L2q6J-S&el}NFXu29d`xDz6ZHoe@CQR9M%B;1%4+a)4X6`$R6_sYHqT8
ziX8{YyGXwv^gv8}8^$8a(DuZxHR#F1B}{N`f?*<d)0Q*B#}{h`84R}@;@FS(2dmL>
zq9G>>|1l5Fg{p!WKuPTT_x>$eW+1=rIu%IiI!jGAV6a_~oQ<77XC%sd(+M`NBKS2h
zg~74LZn=A@jY&}n8sG^Z8kez8k6Gs$fv?hFKiZnh|7BuD*zQh8DV^OH%>>pnjG?=H
zW$WXG+L7YyV9_LfSKXIC&N_V>4bf<VctkE@0GL|QPNc{%{^PD1EEqD?gT@qt?S!=X
z(1uDJ>QEHat4%}cY?^Ea1uif~3cG(w(CD(-xYs2X1721-`yiW<pNSh#MKS;!9x9kH
z7kj~zIQ5jn1tfVwSH0btOX7jz1`B?#44V!I7K9)syU}N0COoYJF+xFH&7UGT2Zibz
zcslHa*@#4mGv;WKhqn|5B7|)8@)RXj^^k5y4wf}n=Ft2o$}9yvO?1pC)m1<TKA~J+
z@IQ-`Qz_lU7=S`ht%=uLu=j9{_Tq*)Z~SqVYrr5;9TGz{!+R&;B-B-QP1&mv4EVmk
zCD|+dK<j`P;ZU#{QjU9STB|^}54>xT$<df1juR9N`1G$9KjDr}RRh3H@DF6c+~4iL
z(ndmwq$rQ=LlbzI`S1-tv<J}~`8g))X!DWJz=y{w7-(@&+_|rRUYNt6_UvN)A99zb
zxBxP{Oidfb$A<UTDOGW@P9Fx{mgJNxGR{~R2c1gZacTd29GESDD*CZMX*3Um!COl#
zMV05_fKh@u1w$agJb7*1h4z0X{Jo^ur{hMFL2-fs*qI2m7|3FIp6CIO71f-*Ysj+^
zJ<l$LiL6mvtzKX6-|dGg*(_WLphbFh#RkWHa4~O4Kh(K{T&7saejKta`DHAL2*H{`
zoa)$J2&^=jix^OM^nK?bS?kesAK)6+f)>|}L+2$)Pz=nd5}Jjx9Q(RQUoF*mj^9go
zv(i7r*sH;#@aoRuC6OBf5rftB1(}nK=oBTDE9w=~v>mK==E3kFxS=~MUlhj=qeyO8
zd+YZ(?m1|JcK`h2P>Ej+p*M1Hp<4#sk;0xM9c0Ok3+I--NXp~y+d+@H_54WN@5oh^
zLce$3d%Dfg@G+$3hM`Nv4NnO=BQyv`?M{f&qq{OMAwfe>8=g~~!ZIxA!YiODH3JKB
zY{-SgONTLD#LLclt6No2TODgF3zWzq*hP_Z1~v~FkfuSM3&!54W{PFRb@)-+*Cg&)
z3jzl>I#ZEff9`-i2E9{sJ*?sXLKAB^<OBHSWG2p&8?LhaBdP3xxlr6Ws9vjp+e};k
zRi1|<g&5R<{-@6B*D5$QKA6q3X;}QQ`BtFV0pC@Y;6edwVqZvH2<C{|v1L-yny*;;
zg`$;^`f_X-!=yRCza~y&(Th<iMDwJ`LI4;CDc~V&!RVZ%g=Fns-Ev61t@7X-;zn$@
zDj=<ycy#{>zr#o^HjlWalUfUq_oR-h>+7*n<WhwTBvGkO4}wA%Fbt+iO7w7PP?wo6
z>(87jz0R7<FKT0~G}6+4u<5N5^Dbj~-nr--02dG;@Qi6lp<siHG8de{7P1n-JO7$o
z^e?UkoNj_qKvBYi4!qf8X}BQBJ|+m$R3!HG%3~oKiSDO1Kg0Wv4PHbFLquJ_Sv~94
z<1pP?1(Y>FBBAv{LSUFqb~wR#;@w%KfsGGcBau{*Ej?!C-ULMlS+`=PNIi_$a=@E$
zS(G2$z{}o>Us{UW^9&foW|~L0pJxSjlOV;?iiumg(H0&1mVR`0!IJ$(>lVQr5g}*A
zs#PU4xlQ8G5SYUS42Hc>HY|ZlF=MdTcH$Y(J{0$hGA5!7{bAM`0LL&yCYrdjfmwvI
z?{5WE%ULPn?8NpZ>kO{}8y}4G<=~)ze0RY7KyA(Kt?qa(?AP}g?K6D2r@`YJuxB(C
z(>|xxDRAL?;N_xv53&{}bP>l-*5$pj?u`v937UgaVS$xv9&sWOF2_tNvW|XC;I<<F
zOa8ueCSC*`4+y1|1Kw%-d33Q3Kae<3i`ej2;DsTBoI%6l@NG0}b8UWHR>)cW6i7YG
z#i45G-<d^fCyFH|I4pdKo8^rK5AIv=MZwEpL2$%{mz5WRgy(rKXznJJP)SP8zi%%B
z&9>8cI_|g?acC0ee9NdzXf8ndLB`@pKadd9C_nx+noWs@qc1O|bl3kxVG%c6@@xUK
zBf!r&LpN;R=inxmBx@NV?ZPKRALiWpqdAPkb&qkXT;{<qAhEzF31>qE@UD@o^A5N}
z!I5p+#XTq2Smr-&{D}|*vl~Q^eI{ivu425HgM7SSZDLT3lp{gACQ@&~SdS}31F{0e
zmYIc4ULh8w?O<*{mdy6`9UZKC%EQ6aGZAN~lf8N5P`=N|cZbAVNLE4mpdG4n{?*-`
zV$uq7(w#(61!V)5xMSBVS^{Fa;0wB;D~d=!p^5sQkd$l`?=%&FHFoR7OLycGcFlQ;
z<VJ^gI-gi3SuWzdPoYW|8S=aOcH|}z+FtIAr^66s6WVz6$5Tk%rQ<vMKqWFt);37U
z%^BWqWI+$?2uCIuojK$L3&{b9W{qxaJch|Q(g!SnyHlcQwG1czhvB{K1qyb&WbuV+
zN1(-Lkc+AVG9b-Z)q&L{$wp|MJ-%l-2qq^fzruI{DJ;n|B0NhabqtLTAeUGO*@i;&
zv&obd+sz8HceJX^I}fU)6tmDL3?cFnCedd=WtZimKVO_@-v?5(!WXo9vgKXR5t?Gg
z^-NL+l%}(^jSgTcImz!ailUsZa%?9UT+<{f*@-%uAcm#ET9L%M&?VZ6KtAo_L^$HI
z2y#9=0Pci)>3PjbpZgk8Zz^8<4mphO(YjSJ@g~;Gw&tQq8@7{)PDw~?=l#QyGO)db
z#S|wG+hFs^tQNF$7YBS#xGkNMGaYff908fA(n)3yxk01&r(!J1IyR--Xob<uL(0!H
z&d}x{9Ybsjq?%;iX}Z{8zsth~(tA=^cP*-eL1*L9s*V9G$g&{)$2xjpe%h^kp{F?F
z?|Dh!HfVbAk*!}4k5~lBqO=yCpy6Sf{cUJcQBu%3Pw+I=I}}&_#zzj&gmc08&s5En
z)C4Iq^TTyjDM!{vE5fGV+R%Cobs4F}As->JP;$~jTP?{$^~kpnjLbjn8|uL2PNT3C
zuZ-^Xo)J0}@ockr04O>gVbUp2$?gu(M)Uj~4#QBFhfQDAjy&4vH1Vb8_AI2*)<188
zX<)3m^lH@@3R~iQz;CSVs%w5I7!&YL=*o)#s7|RAsWrWKSw~D{jq{SEnMzNJAzQde
z(6_-GwnixG9~~nFN5BT59EQPu1!BZtgjxbzAFvaGaNOtPx=7{iI`Xw{HtxJ)q*HKI
zp?ZRFU*vZYjf(SFr(vgQL6`%1F*m9wKn#svdGb;^M*9lNspvt9r(pk@m9*#F-@s21
zV}*G{Lh5>#xtx?CnPM&Y9^?>hjocYFZsZ6Eo;o5Zv0Tg=p18K;*UO}R9t66b?3c8H
z0|tw|BBSZR(Q!-~TE~oeSrr`XIZ0s}1CGwCK~*kCm28)6o4@9&<>qP6Eq9Lo?B0{u
zu_7$cZ|rv)SS9ew8HOK5e^bl2((?lXG1y+GGQ`>SlesB4b}=zWEv<_I{Ia6|804jx
z9lEm=Z=7b+R_)E<#ikubZkN7@I2{NI<nTQ_=7#}iHK!;Oc%>t^nzmx1-{)h+IqS%1
zw9Y6qipvg*3!x;CV<#}1h7|Lf%mX_q8lcVsR-Ew4?}CZ>C#GjlN=`e6<I8`5fgH>w
zyN}yYi^NGI2Y6g5$vr?Y0pj2)mp%A#8_kR0FNvo4{$%v=&9`<mJb=)K(B}zVRFoJ1
z55vUIEnKN!u(?;r)XH!o;YyT}h7(@G#hK4jO&|yF2FhD3GFGIKo|j@3g|yWvd2cZH
zY{RYvVoQ2D5~{(!ZGt58QkEdLf_<(IiRJ_!zW)wR=o>0JP2lQG{~g)VSB|Y?vb0KJ
z$W@pPSU0h<6VG$6m{{-v;|ug{h`k~5)e{@9tM+cBP};Ol{slPpTVX1bhCF)MXA?<a
z7)99&vr*lPp$3fx4|>61=$6c!w|7^p9x`UsT_hJsWhQ>29lMiGunatESM2QxEC2fX
zzd7)x)wka9&#lLi#0^A#@j$A47MF;@r#YOsQoE2zkq0hduxS>l^C+StDUn6W@Ay(5
z&~N8E&XKC;hcB!MvwN5E-<!z3yzZ2<V>x)WMruZeij1+h8y>c<Js(vmg~Gt-Q{t-1
ztQ(`d&rAO3ZC<c>#4%B&`k>~$z0uEPz#RWSbQ=|)vHG&kDko9S;%)b2{%!D}!h^O4
zk?hrZNnu+5_7;=ws5g;#-je{94&3xL`Y&V7iV4IDSjS=x0{1MbyCYl#Z|tSY3U}Vt
zItw^jH&`4?F1qFq0~jP6CE)iZ$1*feMKfl`nkcF|)UA;I0elXX-BSWK^lGhy*$|~P
zve6ORd+^y(7Avk3^Vhz2@hoYm<2;GQiRP%J8XNm(^CTQda^wjo*#Lw<Jr`-k+5PkJ
z7JN^2zGq~%k<>EUJ}KSEOtmXuDs9{r0~!a%;(zZ{rJ-Ev$ofzwsw%H6D6grF2ur0Q
zH2_WYU5y0l<0qn~J%0D{YvYPA39|V+#d#ixA*#=$mi;DWAT&v_MlVBVAFDYFXQozU
zZr#2h7Z4Hg4Q8-$ZpIB<oV^n9ot_)QD{Z(g;q$Jwu}WcL(edw&2FAw5u}^RU(mgDo
z#c<&5b|<vxlg-xs`1Vc!_gOJAN!opTy+2*z%PL2Li<C0!Yk6UHPwuad-|@)qZ)})-
zrE;b^?<@3H_c0jt4rZCIF{2|WL#&0joH(P*w>NraSe&6c@k<7#47>wdc{9WRwN0qP
zuoR@M4_1^PHV>}+RD|w3!LCog97bCj)7x#xm$jiNnUylXj+8;EK*Z%u!U&|3A@s|h
zJJdet6f%wE?3>4)d<ZF8c56e`DrxD&w5n^P-!Vq0jWh;{Ji=yju8ba5hVU|6t2M10
z8m$vji4J72w%OZCKX+sLo$jM04F>vxf<i)u#>)VudiJ?+t^I>9WCh^PFd-X#Dkxk`
zzxBr(Sg$T#ujtFnW!Sn6f-Cy|IlfukX`3O{LzFfxmaZvEZ|WiaZs4~uQjyVzRt0fz
z=w<b&U8>H?8wXOWih<K#IS3HD1ZN^XX^4x4nJje>-g3&@)x-Q@!dFQhq8KD_wN!cQ
z&s0zQ{Nv0KtL5w(r#e6M4VgJZX^9NW0=`>iA;){uTj}!fHu4#2;o)l{;xBZMnxE*m
zw*~1m@_TDxHhdnSEQhfT9DxBqN+d8ufg%gr6|#-Gr-mYa@6MIGz-$rxUwcC{d}Y;-
z5g)P`)7-=mW<#Zj(I5lEtfCm?<88PsQusAc037ShorSV45ElqC#8XgLGcZoYwF`Fd
zeZ`%Y9*0sx<3f-GwxcWv@46<blfZv(c3{AqH90KUr^H^yElSN_gL@xPa3FsxMh1ym
z0gywUL*ka9o~B?D4Ms`=FG8wBsyo31LHi9ES+d*e4swEnMpt-9_Ux22j?1XzNvs}s
z!3iJ)5HOM2d#qT7W{q$&qmlN&wPg{0F}pYCp=Oj;Z6IS4k&C87uu~iz*o+FB>MmIe
zLy_EAs|kgX*M=KsBk>H^u3a<j*@`R;BW^eS$469##N=fXBIa+>IWL$k^5FZfYk4Vq
za{N(Ev3zQ40-6k^UCSO>-%Vnm4v6!Tn>}j61#fuKtALl4sq?ymDBu9dQW!H$kZ=4O
zuY3Q9B~rxVdw_zVQ*-m*M;Ou}r4Za`1s_J;ednaSR_ts%b5iBKJEocso4cmTTwpeT
z1|sS0p~W+O_KkVrBEe-!RZci96`vTJspX20NS4y2I8))Ibdoe3P$C)trAx2u4Me_Y
z3$YkNhfb-~R;BH_y#I`dVnSnr@}V`Qd`q@y`Rphhd^&UO0!0Twh<Tkxjs}YB9(H*{
z!&2ZL2$_P~g}C$+48-R%A#c(Pe*4&49yE&Nv<6XC1ux?+uRANYUr)KeYR_1XIzC@i
zg&#sOpg3UMVJ3DY!+f2gA(@3<ymSd@4yV{AQJfCLU%=CeUPnnAHY|ysiu2<3edY1Q
z0v}dM9f9P8FS=oIw{XEP7878NE&6jMHcI^+QksQt#s$9S&Tc6Cn((#AQ0MBP^Yl;t
z<usRu8pyOKowGP?x;BtyXgJ}WT)SQ8<k{~NTWDAU;1Dpxk#J|Oly98e=a<lkAGYK=
zJGB(oAnT_+M`o-jc-i)0ZPDx$Ev^rHd}}?fA0iKsQk5wPJFvJ)X-D{dIeAFMAd}%3
z%`vGtkBy1RJE`IfqsypOWNph@$p;$@dQ!ZXt}-wbLW*(1S{R)GiZS6m@{tR<Q1zph
z*o=8MlzS&JOJ3YDK(xOgYZA<c|8-j6x#Kft8NK~*t)&;`&_-mlRC#0Qhv|9@m7Fj@
z;NoD1WUNBxh)Rr|n@08-G%nIBG<53RE~rfh0%c$^Y7`Pk!tma0SSPcLuN7x|Ui@)a
zbuNw(89RU!MCo)`Tq`bq%>uu%SHMlt&)BRlnE1R8095$)Ce+JN9eIVZJyX{u?9;)P
zDcWRVZe{c9Cax13q<DCRjs|1Py<A?Qh%<`jd8mr4{_7?8gajL$SQe?5n)9nTU;7PS
zZK*q2a9%PN@^E|)?;=;buuQKAMRk%N)twy_eO^b7`Kci_pF1FHvvhbYDZ9`+t#xfd
z)HGY)T7}7;sC&fUV-Ywz<+Gix(8A8d*04KX%XRowVZNI+_H43vA{PJ<8EAF~B_ypC
zaqp03u<4wav3kma724!bp3H@8Bsx0us!QH%D*BLjkZLwvPUwhXfM+&r9u0Gia|1!S
zBzz0E=!tmyuD9t+C@}#TX2>2J5N4<z`Vt+Wx_kQ6N~G71DIcIjvrr_^I0w!uOe)tD
z{5g0SoY<mlq!!>L)AffU2FWgy>Haien$a*W60}am*%8O}osVIGBo--Urr-*oXvi_v
z3FN|2aBO)_-j2)r35*q1Ik-=&DR@Cf?i!Se_)ZAC(=N{XgaGhx5fAq@aMYHF7@6>%
z4Ee0D*pvy%G@U!tk(gA$ZMNgBALcRmYsMY<H%Pkqvfrqd_geF&5jD9*>bj5lUr(#3
z2F@=!_>#CB!&AJp<7ujy1NOR?qH7r3T*q*i*ix@c^*UKslLS1KJNW*nNr$1lFLr_F
z3bqg$8?xwLI^KJSdJg;^G6bntjg?}5AqBz%_#o7byoF7=So)HIp*F<ip4{BI0f`R{
z87x9lkML>(JZ=~&;T&jDV1bcDBax*^W5b~3MutK(yn^*a;s~TO0+ia|BvF8{=my*0
zs|{1ljJ%(~f%FmWr^huv#E|1)uXlSEw20~NXFTwJVnN;-c<VTxPz~oY&32l7>oG^Q
zQ=+x-3h02tJMX#_hof|km~<Hb8^66*Z^<>b*`>9{?myAFGF+?weLQ1@#m&IZO4jG2
zAgT~yKy8XJusNy1X@f}9HLO_XJK}0jUy-^6x;MCbO98!1W5G3+B(rC~=K0B{6<P#I
zDjU$DrDBYTf54`|QRGmmx`Qhj$rxU-7dipl3(nfb1g*^<j<cv<YQY{z$oyXav7Z1x
zi4aVXHwyW+mfW`<?_N{4;X1Y>u3Q$YQ@h)Oo;rVPP{^6scE0Z3??_1<yq{SC-?PzD
zkY+f~W}Qws0*OuHoBlY4$|5;!JG1v=))tR+9pUL~mL7UzNL5)Jddor88h+0fJP!6)
zOm$|wZNsd=IZ~TPTDs8*m1x|=%TAGP^GKFGlIFT5<LG#MA*rkX55ps5mti!gVn@r!
z-sssE{jHiGx>_Fo@_HuD3MLttt!C#&fMYT?CG4;QDo^r-J{B$uWzOQwIJSf|p~4VE
zm4m(oPNJa{`|<AuPg)&rx)m?Q<*OOYZH-Q2ea7!aY;yI$c}b~^<&)ttRC$GzsRE(X
zyu6=tz8oux^TiJpPsGFo4Xs+a^1j?&J6lNB^{Sz#(xZli;EP)NrkzOcL3IWCfHwfM
zdsEHFK{hXWenY5{`YS@?E-z1^PJ~Bu7*Z<&d-1hOK(P8;@}Q}Q+7;6-2HSsroZJ>=
zn2Cs33fR57F1xjDPvU~@S!hfG>P${c`aUSFbKa2aezz*p294f-Y85C}dbk|gA8T>9
z%G>?Lk<Xt#z4`r9x4{GQrTm+2R&zw(R<v~0E9dGx6e!LVWm>^K&<wM2RA|T919=B|
zGFHlU9*dfNaaNFR*_q?m-RL)<-*W&!D!>Dp#FKTts*-wFoSmI@I@uSHK^hAS;8;x~
z-k7FnLv8es5ZMjuWjSY%Ha|$p(mr;Un5^YFz^_*2@IcAA(ReTt+ixy%AUyYvuA0Tg
zCtKV80g8&T1&!VUJRowTLLAa$Ybmbcr?`u|`|iZ8%9*f>G3p){;OSh6fSK1N!6~9p
z@tdD%ubKZ6hm<7ysYWJ50#M>1)qD^gppHwFen>dPVH6~8MLV)qMS!O!OIh~_&P0cj
z2VewB2S6#o(3T5o9>RC)ha>!af#o|&0OztrmJ8^Q1_ozy4M7VRb%hrY<-{Ky&r^92
z4uA`=?OOC8#UwNSd45$xJw#cq*Irp#_0{UoaMSZqe%5Jdpr|7oN8SX!0=L+vy@|fv
zc@fCQOgaxgTY5#I;?>9{X?bmZrg@IOX#2HE-QU?E#~B^&`8@J-o^rq$EM{h8`3bVG
z!xLa1ZH1MGV-`<l5ecXy|GNMwnI(7*QkAF~0ZZUU-bsiZg*|v)OX?KiKA3kzSPN%l
zUj?rZ4lVF65)7dJCr(pH9Q)ncQ23;@Lr9ZaMAW2)WC0t1mE7E1w5?MW4GygGe)2)p
z@?Vq7vPhN_cS{>16?cL64vD1W5d$kf1&Xuhv<A*elFHpRx<GDUh~m62B#MK2zy=$f
zS_>;zlA|lrv<<nx*T-{P`%$M7j*Aq30&=4SVW?jJM^nD~%{Mi4Pf}m`<1Rf#-wmMF
zz1RLARPqmwUqDiYD|;?M$yiwG385fEOo>C0gqVoKPGv12g!rd%z;=LUmg4Eejuar)
zA<~z$z(YtXa>Qp71SM=i_n<LyjyZGM2E5RTBaSxE1TI+bC1+em5Ujn+1K`c&F&bpl
zUm;0Ka%(Aa^&d4upSey%&I9AOR0a~`Q(M0u<2eRqi$%jm8q*Gc*$tpRQH%$PE?Bq7
zIFZUd1?hskg{bRI=>&E^<7WpH7YyIEU=M-{Ie^_2haE*O4wQ<3VoZ%Z0)I)ffie?d
zS7G;FZ1&#u4plkcqIrWp*tWnV7Gj1-9_$x1#a?AQo8k1Lx)nv}2lPjW0vXY+j`0(k
z#}KSah4yH@FUw;${D0A!U7QLFW*IRq`+L1dN0F@8sPBx%4=9x~M}Ksk)yxTs>sBRz
zm`r#)95it~Rmcdih;q;J8+z1YjcS@!RY5Paah{8^hPFOUOOOW_Sl`5jAan(N1kl{k
zZKlZ=1Vd_DfKz>j`jYt0S{P?0oElGTMcrJC*nT~{qXwD}MxBmd-IQg}g*<B=4_4O3
zqY5+zfzdEgn8zTtAnT|<8+}RV{7D3H>T#A~jfqx;xm;_|Uf=>o_9=e_St&(Ft9~=e
zUQ`TVmqs4A;mrnvip?mOWPN}CP`&=IroP}z*w%!+@duYjAY%e~2Dw;J=l$j;-+mNa
zg`W<Zj2!^llLKAZ1&_xjTL~l!v&g4k9rJN)sGf_cT!exOS{=l+?igsr5dz7SbUCpr
z+zV&UJK3B%pMivP&~OoxqEg)yoX^6-e?dn}-Jq!ZCweU19-kup|16=62xjpzr~fQX
zO+`pj0n`cVRJ8a{Y$TsBMDqEVpAJABUEV?cc=QEkm|S*m@f~VIk{{u<Z8uIdw!>pc
z{t&7_$BoQrP=zGEj_iXe1PR$fc$)qa%(@$XzwQ5^BcA;`U8S!dV=~M}9Yh>R!yYvN
zP7gqyrk<ekjzW?}17!oYDGASay;TJIhU}8qLFpK}?mo>;3_b*}iBbOv9<+utKeH=!
zs?lV=j6>Z$ks;R<M~mBNcyB#y-YsRYf7##UX{t$lQk-3sk$rCa4;6mLVjytj7mcKv
zodu-!Nad0_tkXRf#bxF3{Ctu73oC<n>T1Z#EuygEn-bm&#!{lx{5LT{P-xM5dwY{D
z8FFki5J8obu`XhzD<4=XMM{zt#(Qo@vkD1q4m1P04{xgBcQCGOJg6)xq!DO^k3{SN
zSz<k=(SuPpqrd*%yYssB&;g^MC-?{SF`HnX0O2+cAbS}_L~Up#1_G|ES4k%F)b}SR
zQnW*7As)B4x1Viucq}_wjn>OraH#+@dn0zsdi7f+P7p5UUh9ve7R0?U4257D;^R}a
zgs5}f{bYNC2eLkt|7{?!l~S1o!j{MhN$hJ}k_IMgbog;?*WCYiId$uaeF0Oc)A;#V
zjNiQJ1-bQsbJJXkH7U@qX6_BN_|VNnxq@*8YhfhuT@9&Xq_j$rL9ut;`}B;=Ovi8K
zJ2wol;$%Dj?7)p`)K!jtNBuYg4mEpdYM0P-9K&Sq?Mrnupq>p@ge1Au2z@^P^XL=4
ztc@tis7RxW8@U+?XdqsYr)2&`u>8mu75*2LINkL9H6NuMy8Okx=w*`#9jSBRN=M7i
z?aO~xw$x?k*^uUieuud5pF1a*xz1u7+sE)JBGa(5IEK^$!zENA2AUSQ>Kat3kvKI5
z*sQ=9iBk1&NV+O`9@`oC+O&)QD?Xr)byOt+eE}#zxFa1lE54AA)AQzKG~;EH(XL%?
zFU#j?*Ti_P5i~R$0BUV!ITx2-4t}TO`&8t9$mpmK|L4zOH7G=O^-1%IT7cIMD~L*w
z_}86-ZOELIA!-X)haeqlZsEPu52A}jAsrXQp-r7`fj0*a;eB9O)y#U+D=S=VNY+tw
zXVw?c?@B6rqre3gAYE8cwPCqZc?vWuL<=^&5+W{{n#N@H;$iSM=-znnKw+;}+bO#@
z2||a>%aJp)nm6F%F`Qu#TVUw-7^n*$&q@&xwke|oL}L_k<;D$*p8v0`YXOHc-Q%NF
z5)(xx<ksw2N*lS9)FzBet!kdklqP9Z(`A~`O&49Zh@I%73q?6)w$rhTYN*vxQA2D;
zI+BEttjjSh>bR8K{@(91_3X3r^z?j>nfbnVzVE&K|G&%sb$&G14AXCTUB{Vw!`SlX
z8a!~z{#}+>X7{GENYwbFkX1hPz$8ioeHwl@<L4Y6kJP${7nf+V;H?UY6kwqxb!uRw
zhQX~kcCkn@TmqiQIvSuU@O(t{sGf6bODzYi(B7P#;2Oy=${lz=0k%W1c0^eVaNLo$
zH3^k26bd-*OySD*I=4HY6gGNVK%3tIh9`a}sHvg>n%ogrFoA%0qsmSFj(KTf8?8}D
z?oQw)iOugGNb2ezgb_`Nl!gcr;r}9^HwxTy*6@$81!iwBxe<n+XiNyHDM;GNzd|;&
z#-k=f9jR$o)xxdKDd@kb0iWn}rH6#3C?h}Cg;a4YH&d%VAK7K@+-fTnd31d|L`rVn
z#=dfZA_)Tws3s;fQ|`s%q0_RWGy>w2t(UxRg<Opi95n6tp&Th4CTIMAjtwxaXjh9y
z3g+LRf7avJx6SXe<q}rP8$g<n#qV5QoqXLS5f+60LBknLiL>D4t*v+Z6_h98ZdKi{
z=#9v8<<|UPzd;HIxm#YWr3d=lSRK~UyW)x{-PGmSJ&1%7n84f-wIPD((OxV${1p5C
zk-hB*v(BIl9{bDthQ`o^Vl5LqdK3B{aJJc^TR}}kN{jiNI}W;)AlV8j-~YFawJJBz
zFwEVnC*bO?dp0$OqIC<iQij+|h830DPc<EgW!r4$?R4bim@hr&uNq96kW;fxVnU#k
zlPvpDBvHaB>BMwO7JlBVZbZ0Pb`k#~qr>d2_sgz(-?n6zB-cW7ZhWloQ?wbuQ1v;6
zqy3`yLCHg6LMR}#v#NEW!N3q*)xX#UIA0bXJW3giK_q_+)CDj#s5WM#ZD5MGmoywe
z4F5KFNR?}RhiGn7KS66JduK$~+mTI@S!mhmL|8T*@B)%d7q*LjI)wQN@n$LH8YykJ
zGGNywwa}4GREOjiV4pQos0l<mIOIjiQ5Vx^pK5pcTK*vh@^qx&gis_ZV`N!MG=-G>
z7f;KoK1DWTgQyHEeI5eaxnO9Vx47ym*He?<^gKetA<bK?InjP;vk_!kKuJh>5N_qw
zXCkFZ$bIhYNAVO4&LUwhWMqI90qnW!Y3i6b>kx*{l<EEQ%;hbPele}P!O#<A3Z4vk
z3fK=)El>e<3}J^6FfeaIG)49SLpR$GjDi$w2T4!Ldt=E6iWn9|QS?KDx>*{EIUfej
zqa@H>izVC(VM#Mf>~t{uLN<Vii%@ec*S)r|&Bb7^;V@UJ{6y1kX=bJI#v^W4^S`IT
z4bt(jgul+%G=b<>lL7XV$%W`6+{+w^qNESvVb=Nj0!nG-#8d|nTB#1tm*eJxfOrgk
z);Mkud<Eb#?@d9;J%>v<MndR|5u5?2h272zQ4wm2teXwXLWb?!#O5oqlAv%43)E~e
zKh&u~Vk_TsgvG>4DuiwTgubNe&SDsxzpJ9GFDX0K?7Bs<MOoU@-`^xOPXyw0-FGsS
z0_<0QKN$jv0(I$85h|0jR85Xc=;wdYIvVjI$t4*Va*v=!u^{N0tg)XSY)_7<83Aks
z@AMCyIVWT#=5awE8tUt82i_Tx^)GMO1*o;Y{#O=9?2Lx{G*E+#yZwLJOuw>V+9IN%
zo{sb<8Z!M4MhV?`<*Vmn$!xh_Kp8cR$Egu-Pd&|&gi0TbCW_vTqO{AOZ>cQ!s7E!f
zR4|2_2)?u}m?IjXZ6(`boPeJyzLAwh`lb7b&E^7JEGc%ywgq#kD=7?3;WyE|{A^0Y
z6r7K1t$t=JI`1cp7-I_fJ1*J$XAX6xc?EPgmsyXcT2#Pqv@4Tl72YG8zZqz&f1i-)
zAP(k<4p9yu13~sp7;AuHdL326$3j`<Jj#z1J&RJSJ27{qnb1|=N;mLDL#?ZF-NCq)
zBCSCCA5)Dt5IhpzVEa-nd}pA(l>UPJHnKkkq~Q6to&2XM?J#X)1ck1|uPzb85lpLd
zzokq!XoTjAn}MzXMaY7ukZSWH;4Owz#%O`#B5F?gGPM+4TO9FUFt}h(3$kteVU>aR
zk$#k)f@m{bkotc4pdfWbvDZ<WJ>tY~qDA;NOb5Cez{@;GKl;I#)>HPNt{D1W=yOLm
z(0_=eT}f&~kEbq#lGN7?ykQAc*2_X2@RO`HN4kio?n(M$k<>_6+mt+C=Fv~aC?MX2
zc%MGI#qbW=Ks;Fn-Ub}>;=jHJoO^-U^0dpO$j9)AL+VRgY_Yhv?=8%pig*8?OJCv3
zyTu*7w0Ueg{dA1zr(-1_`-JMg4~sH%a%#*}BI1_R8o6gOV-}rl%>LV#NEr=zpWIrT
z{=mvF-=H569LWd)Rv3Ksij>lm>rw4N|8=^J+!V_=BBcCm8#J|fAr=~_MuEMlZv6G)
zmkJw>AzedYvZ2NRC;9Nt8Y%WDK_E)*fUrm*W#Eh`5&F@yA{16?K#d@NVBNqB0(pJ3
zYvK)*PFHR2oV7o|uh*l!@P5d{ar9kK+aG1dv2bqy8GE~99?3qSSDvkqVvcYc1zK`$
zV8zS<GikteRa98o`;N1k(N2*kng(}RUKg?uYeWhC??tYGU%k}~N}5Q>aZtuG?P@Tt
ztvy2iqwnm>sR4xOhMJCvPU5i)q9!9dEztQnAXSLRe9v98UJ|*fX>dhug(zu7wg0*j
z7p2S^hClB;4x{HN?b7-~AB^cx>ZHvuZ&=$C^l^1)&G!=b{KWA59hON#7Gp(wLuf$O
zjc%*)1|05jwr?yO+}Em)TWfFKU&}R64gU0gIX7+o!FUTT!`kZbUh{1USj548k$rdR
zVPOBc&lZ7hg9895AJ`K%R`sC?h;m6G6e3;3qR2I{)N)V%!#hKIy+8HYPAkdMz$$4i
z+gH0kGrRC{y5RC*|EzC3>RM7HJRaL{_1(a)Uo<4Nr?)Gn2P|4s9aSB~w%ict{_t({
z9OGV<IA`}le&8!ZO^s+dY(h%>6J<UUWd`&{Ty!x#eOsj&Vs6^TGwS20LYKglZD?2z
z=7_jQp<=Aq6Qrk^+WJDrS=_&FMcMr1<6(8<d)J|K+eo2J>PPK7`FR|JKp-+B#MmI=
ze?X{Dl)veMtS2elA0PA6FKMi4tW%~7YMcR2k&V<5h&=wKY#{dD`{zF9snS88o~Z+m
zLK`Qhc+85~Sd%SXUb-o2(b?LNmUgR}eIC;?Z}gWkb(Klt!>!1Gy-u43q}tiOD4a3-
zR=d%HZJFQizWF=wE5JOeDFZnaIDq~={@_rPYFSFN)yt03LsDoC5T@33WIgVA>&)9%
zw|!OT-Z=9ZLr8%U?h^@=^W|BhcIYYPts`y*t8H#XF9%Qv04wrYLTMtCAU>sXe(cL8
zezgQd3Szlb$!6PdZG4ebn?wcIw5|!0&K^pzS`&0XMQXwToLyd9EZnFZBQGD5?v%cA
zWp|h)wS02-Qd5#*U|BzZyV)C(I4#YHVFxfcctuYICJVg{dQ2`q&bj;J!F&C>T?Zv+
z%vF2Kn>#$+y5ovUhaOm-=9bwvDEt%;nP=N(8aEsJnoKSo4(T|}WgI_dtsQOj+u%gj
zi>iS9qZVF^Hr^0$GveQCi_-;S$;^LU8O|8X$dDy47z|6VtD6V!R(=rl7Qb$Y*}bwh
zDYUz^MSA3Yaod47hs|vaMqb>d@%SKKo>jx^=p)8@&tgsb(CP#^4PI_+`^of?j4c{P
zADzwrjy|wB_xU7~95krE9rX#u7JB5R=ll1QT_4SV#SbWS^XHRK_;Qwc-gI9d{wEFH
BZj}H4

literal 0
HcmV?d00001

diff --git a/docs/source/_static/img/dynamo/td_stack.png b/docs/source/_static/img/dynamo/td_stack.png
new file mode 100644
index 0000000000000000000000000000000000000000..d20b3250453c503aa77b7967862c1390f05c4c4d
GIT binary patch
literal 308321
zcmeEu_g|CQx3!>A3}Qiw(yc(0BE47XNKF7k69Iuhr1y>jDos?R_a3AZdQ&M%?<I8U
zz4!K>%zI~?dB68_-}wXX598$H4CI{i?6ddUYpwmfRZ)^5fl@)woH;`xC;M3K%o(CR
z@DC-?dGI&6t1KSiUj(*lG7@KU+Gys_oMAX4_gEb6q_;G3aR4!Sa=KpeR^I3hy|2no
zl@Ez6a!MR2;;(KobHJHYB~tEE5izH}HDY@A`u+XaO;#C7P1j%~KSI@}>P`>7oDS+L
zE4yYN>Y3+O#aG48y6bC(+aAYr4=-Z-^;}OqcTGHN>Q7D%S8GVl5}dm#cIIFGQJ|J^
zgCPEkn{EUdArXVunSb*~+!+G;m*@WdS7>>8DO1$&DU$rF72#J-C^CHZ->!zpDE|yW
z{6i1x2&#Xx_&<K_Ik(aa|MrBg{&)`7^YrK!JNLidGSQyZm4A7dL=5li3D3C|?A05f
z{`Hoxu1(PW%LjZ_j3EF*v}d#)@V)0>Z<%5IyYRn@1+Zi>3gWA4KYvmBXWsg^KO<J_
z@A~iJ6D-;LAqB(u$9lHQ>XQHVXS}?w?Eb4z{^P`-J`}5E4A7@Z^ZVC7LqNasuY>tq
z!T(->|M6h|_X7OSvGl(e;Qx~t|F;7C&x!GWr~nKl=lZ@pHPpyP8e<SyFLp*8$A+VX
z3Z?#=U-$!wuY-qO?2rDV4*nqbW~<)&X1CDuvn86>vhh=h;0oX7_^T^X=Sip{MjVD%
zG+BL}Z~r&t!6*@5yxmkINdKdFPX*WJ)|{YF{W)ljFIp3V^WDm#BRW=JAu~w<tQ^o2
z_W!n=5#0gFJdk4OzxlHLb6_<_E?aXGOEzm?FGM%9)eLjJ93F_VX<|+@`uU;lu;A8z
ztOC@xPBMmZbuN9!9IF4gr_V{9j<@@(nuJsv2rke(!<ATCvQ{i{sAp+pGx<NG#~+k8
z;nltU+4xp)ye(1O<}Ky+i!Hljo|Yba6Blf|<QSdXx71SQ>p8MY#yp0EPLDQUeD#5(
z+0A#Q=~cPB7_aqyEos;}&G5G?F!dRn*|1Ah1j~P{)+n;JFI)S`V1coET<@bs8pE9G
z15|gpy?HWB^wJ-zSt8Y{bjpw9Fn;H}jO}mv=1c6(m~OHsygO`|Q@J6}_UXF))?$DA
zu>0;v#gfNjc1hINR~KZ2A0Ga_xQhitLCNo=3^u#-|2|W(9U>juR_+%|1Nr$+GK`x)
zr<wQX%oKGi1uRF<5aADyp|Vz4^w{YBlPIHxz?+65$6MVh7x=)&TZ_!P72lHHlUNQP
zaLTRP;fUrz%a^ap(7PKg^kq}emoCBi&fQmvnS-=?>`zmw7ruDXU5wT)Y!^8s3l5ls
z{f>tZV0jY_{|!P?BOAywc*TFRH`%(-o24$))m9Pjw(&DfCWJ1{tUIm9@Gau=EBsDR
z2tALt!<~0W99uI~lUr=tg~Lh1FvmJh6TYFrs$aAojUD5#<|Yk6)RbUdpAHG{NC&dh
z3OWqWODOS8oa1!eu}FS)<2MjbumVeBj>&zX>H8nG@Fs;;nN6zMaB2SjuK>2^qJ_As
zVTU2KG;#TKb9l({9?ez!vDHR!oA*VS#Jl$~3FSYpf2So&=eqRBw)+XMlz@gxtx3D^
zE|=)(5$8m^Y(ZH~H%rOzb2{g#P+is(<#-W2SHAOq1&i1OKA`#ZHFKc<5ztVvXPYyv
zY5FxDFKWFA%miI`S4OJzm+3*Ynh9RGAPw@uyLYv#HK#Arl;dB<c^sH=+As7Bu(_^{
zf1T;mwhXcxPVfr3T<Wy`+;%CyQCfgUaJtw_zgFgJ?rQa6u!e5g@dk%0J`(@flO{lF
zychmj*&jzEX6jvYdNOo0D|-59u|Jnr^kgr2e?C3FQa|)`qnT}{U*D4(L<Vl%a<MOa
z2HYA>&y)T1evW4)BMzKM#cqd#Xu@mv6h3cJ+=gl9SSX1e+jS>N5!==hTzDNvDM%(l
zFd`W5z7x5>>UnA&%=e4Pb}rF>rcR1gx4dU-b!<#Ag8deH6Wxmm>m(dwPLKIeY-{xl
ztT=%kI_KJQw7*$}5{B-~xhj<VVZMNSdar8&O%Ts-I}_ouy&(11hBdiuretkL?E9y8
zAisBf#ybU|{nAIjWA%?um20hntCfavo+r+QFGmE_i!GiOHqyB|8N`6x;OtEUal)sI
zLWgB!A^4J_tkjKw-6G}S>qd$RlKz)pv_w1-Y}*}mU5(^y<GW)xtGxYq%xyF1rhr|>
zO<@;19L4)~{#hDM(+&pPZx<O;1ax;~!1+V5Ua;uOwQs~TKN;f#vmxOmOW7r(93<2{
zMd;$@F>(o*UZvBxhiV9|3&+xXdqQ!7_?IK)k^XcpOp7xuuYYerjS>L#@>??hyl^)1
z;~;-;NC-85X4ymvx*g1?zu1`iX+}(+;$sUgR&=%dVVVumPOG)|!S*6Gf=TugHcsV7
zI;chb_Qn{EZJc?ov%x)<A8CdlF*LZ=bd*1!EvLCoaLxuHHs;#nv)Cy^&&~Ow<;^z7
zR;1m|ph>iR7%cMI9qA<-3&aJ~P%%2i7Go1D(I;+*D>B1Zjdyl#4ozqRth?*pg#Smq
zI$uh?IrrITzgsmB9E2jitK8G?e(w(dsSrKHlT0B#=5e?j?5Xd0jHO(7TJ7enbkY4}
ze|DyHGPpUDpL1~U92J-8vlp#V*lvOlp$*wWqo1U?SZ&r|{LR(Wckfk<;j>O(Q`711
zs&aMgE;Lat1ec=0UN-m@i2SJ-FU%DTvi<B8p{s|E@gP6mKtvoiik=uajJZ3uzv#sv
zUV!YSXUyPkX*O7pnhENI5wCl=0?p7WLwPmG`!XUI1ZQ%*B-V}(wiR3mF3`K>If`=p
z-U&1w6pnAZQ8rS4t0oL8=OFlWTKMcM;U+lI$THj671B39j7hOKRI&)9JIQ1z)<PfW
zU>7!Aw|h2noHu7iU3uFQF{I!9ukdQ>So<=A@*LwoqXmAcSRZla-mid}25QSKWIzKy
zYuOa3ZNIK#BuJKx&I2ipxB}yVepYVNjNSo2TSV`aS@|qoS1FabMPIW%WLe|P;&}7-
z+I$rNb1>l|Tt@Y0!oX(@CPG5g;99Sy{!K*j{@V8|RZs?fw{@w+sts@;k%uc4xgC1+
zloSugDc5dt7`>SXU#+w`U-d;^5Sl+t51q5X6){gQs@uP1jC1-1&s4iNZD@T4d;!A{
zWb1e?E&AoVD)L|4{W^x_cJt3seP1FU`Lss!QoB=}BSA2B%@QTgU7RbdQcqW^rmQyY
zyse%r9h7w8JNaLg&{qen`j)46nb1F0O+$#v=5{}@<3!_%BoMKU)4gSr!GemXC&#)x
z#?cZ?V7X;S8_lhtoJDtQ3cVcWv)%61WUM;Ra{mey*N@8e25RXrrZlD4s4nb+7BYqv
zfq~068kqdC@^d}!=9Wvwc21r|aWNOqGwaw6p-Uncb1K?soW{NLY5TXusCi#<xvo|@
zj?c74adWDtDaw_%f|zLqcdr#-l0c*+=DE&oE;tBlHlY-#_(8fnn+%QhYi7-yn8mGu
z952YI^DJ$yRLq#ubeLu!^5@@j7&IIoSlcc(+`k_l5rfK`bJd3{f}1Uz?KgbSfFMzQ
zlk;#mbr`H7&2e>f!r}R%#2v70{=M;6K@|43&KX!gq{Uroq|-3~OizA(xK|}=p13<5
zuD>H8dV0`bb6a=kB4f-ri-&qEBPqS$EkHe7%pk<QzIa{D<^F+ga9mi)uuXtjXOb_o
zZn>Se?zZR2CO!U&A=`6dtR`{JX`0*1Aj-HE!dbbADOtI)G;@nrqCL;T2z~EA$fMml
zRpwjD`|j2*YBes_UFEt<7){TJFM0p>3dx3mU2c`y!2kYS#TY*PQ4aMykL@{8JsAT-
z3Xh{y>SDJZSh2jD*9UN>GI_4%?P=_3-cX6<?n(TX{&DxW*b_vK&Y*U!?bGphd<chT
z8}Gt4f#yn(!sSfL48HrtRPz??IUFvR999#ezB)EBr!Q^o;;SOa^xrSh2@M=t4kV6u
z07Q(2Z+m;=vqzE_?nAM~VAn(aYN5XQ5>#&!@y?XzN`uRXs}~N}8|hmCK)w1r#EA!y
zSE*a!uuc=0obp6$?BgrUMkeX_TxW8(gU98-zrcwKJE&98hS<V?0%^gEZkw&V_&kI!
z8e1OTu|bnR?u2mlW@+*wvNX1Rcu|^(wMqbAwg4(}^*Bo<(WigI+gnVB&?Ls%FOt_%
zuj_kjBxhq!mL^+{y73VoK8wT(I_?PPqC>Yq@NfEv04Sp~S7v@qJB$jXzTJ+)g0lc3
zsHhN%9&df@mxtx7${)0K<r~VZ>Z@>nQq<4I>@vl4Cd-5<#^ZGCdl564>2Xba&in>^
z-Zdu&&AfcMcVcbZ_+QMlL{408>iZp|PuW1Tu&96g{+qw{4X~fDuGr~?gOAE0gZP^D
z)HSn_#x}5%ZRQxZX0%)vsnte&4zcD8&`rw|-jPp*{d^urPL{WNElZ30_x2`3o|L>C
zaeF|!PpjV^`|~4XwB%*}yCx)&e1t?~EZ(7aWTL2Fmfo5UcnR>H*-&wI*|-m}A|9}d
z==Gen7%ZqiIvhLQL9($vEvRz?)#b&>(f*7-y~i7uAD?zB{>%^?)_h_v5DNl&$YwZ;
z_UBPKE4#m<(mN1O$no(rKK?6o@s-(u!U+UatMhDZxW?yKS40M49B^hYYL0g}U6--p
z_-nqKtCjy;`qo+PQdx2d(8m-MwS~UA^IkClK;st+J(+UWHdV*3zj3o5i`LvHdaD<|
zUZA%tf7DaH7^bQB2*9uCr}@u)yG3xa%lb2IvF5=73rw@_19<q##6%!WN$wHIu~k_C
z(q2nyxZVQdR!ffxcU+>RKh^OrKy}+E6)xMTO>o~EG~P0`na}rRe$+&K2pu6T_6otM
z&wCjDtx&V#3-#jm6zX?*11bChWTpD>Wp}QAjh2ZAfI1P<K~$0I%EG-dPQTvHB)q!3
zA$(G?T5XQDEwY}JX1h@*#?Va&T@R!c<e4`EU5VW$h*}dD`TSUA?-)?X_vzRaYZ<NI
z0}T3IJECg4Pdj!tQ>u2b(8RSJibvG%TM3tkO0JBE$lsbBt#ZZV(ul591(U4}jT{}b
zHqleJTnS~?z|Sn&F4d5yy-bub1qQ)jJ2R#3hYsqMPU{vCthT>#(;GQZI~3p3oc}ZD
z{PCaOz&b#%b3U(olTqP%IIlU6v9nnzn~eguWoQXxg|<fR8V~nzw!wW!9_KykG;uL<
zGdh>K#1xsfZP(3uN`0<uBz6O#zP6fEvGOhj0~0|RavC;Vnms1p=-hmTPaj5EVgOCI
zrpSGic%qEX4HjG}RXME7&qqd|Owyk+bvS5?_gmLplZgN=EEU!OA;{?ao%kE`<;5hY
zOW|JappbCVp&q=;h4lZGLX6bFUhZT0DgOcQ-#-?sfvtj6Xc`TgCs%~aLVxZmwqLa(
zn@KX;-N$1jODZ08jZ*wvhjk1I5E|^}K#|$ULB}!o0G)gHe0AW{O_oPn-O754WjZw1
z?<=xv>N%&b_RxDCS$6~E@*_NLcL$#qUsmtGnB`%&tlIu;2C~(ZZNVM<<5D6JzV3g8
zo^F8*GtOx_Dul=WemJ(?pmx5=ORJTG>`Eg0M#=jb1%^RX+=1VOL9n>;C<gIA1})R|
zYr!LbMw>Sc9`jDJF?8dR^k08FOc}w0jw%BXD3;BrCqq?_i6O@~^7Lf4CJi8s>RC;x
zD~O$EDf-U<CVl~k7wPP1k;5fUJgC?8*w4jq`_tIG;XhjY{$#o-tjd#8$I83pndS|D
z+>;Tf$)M`+H0VW)*!uuUv`Wv*2FpNtQ4#htQ!cHb#pVVVsH(HD!T2GvD5GOknYF*2
z2kf_suy{6YtYv#}f2pvY$~T$wkwx#dZvG1hdpL13zUK5~syEVj9<3J*%QHF=&{i-I
zir|+UgggXTw?ul^DGY|F+=;`Q_wPu|{Xrr8v0d*ikYnU(9+~`(UHm6P2;6w`8!;PK
zce?VGVI>wac+h0ZN2PZN`1B#z@}g@Co^;VBGfYt)$KQB~hwVa}ic=pmvKs)PL-$lq
z)t<c)=tdJx>v?+YSZFmNk!;oKM?&2)(;8jXByqayd1?ZKXN+SHGmuyevue`o<_W@*
zd_<Y5{p)TeL6+%FvO0n|w+%Tqr%B#4#mMI%Tq&A+$fndIWaEkxQIsG~@^@X*1d48C
z?C?R80H%Jxt{AaUHb4*hp&BorJm__N0qv^Cup}Yr(<;cr;@rXi6u{M(Cpgck*>4X(
zP^4jqa351k9n%eOaQCA@tLZcrO6E0KYMtDFTOsiQf*OuctkOI1RIoRKjpJK~T$a)I
zw<KHA<2|D6aOJbr2MbwR)x0SS-%kFlMPddZp412Q4RWsh4kys*NQe<@fOIQMwV|r*
z3tGhP!(L|^@3%(`s8h17tRbaXftv``pWBHPnn;VMB`;%DwHaA*XzV(~N0XXCDzlXT
z{O~D5IFlz!li4GtoJV$ZDz@KAc<WrAK;L8(yq91G?UPm10;pqC*!~ZeJ?o(7f@tXc
znEtHh65AuvY^fM4zzw%&i_X{kewkLlNnCu0Z~BvZB}&b>=~F&BYHJnrId7cDy)QUq
zilxp*tR&jK?kFP4Je-I>Jxbp{Sj??D)K5$T;hpOKNR*URkldM|iV=jH%&M(iyfRdI
z!8y|^_T*ZRtVh5XowBaEiq0#dM_2dh%BMakR3?OclbF8|3Ei6$3>5l>S%|nCkYv>o
zB69F!j#D>mW6Qmphiu6cP<iT@&*3XxAvu!dg*~v|>8idQT|rKOpoda79e-a=GN6N{
z5dB#B^UVLK`2d(8=cw7AdCm7`g_l^R7VCokcMJC*#dd%qvD{&K;TXl+@8==+MeH4g
zHt4X07q_soY5GIFsGc9Tn$6yJ=P(;ekBRS}?lp>5%JW%Y!Oq^`d=ZFm&gA|32tN?s
zLXLx~C^ctj=8ZJCf}niIv4zWQeKY2?UQbzNrNereWo5*=k+xdBpKV6F*y845YG!$@
zeZQ_rLoltma1O>X^OrK<d%g_KjEIrffJSz*A_QiVQ7^N<F)c~UYdVRC6e}YYhpb;l
z!Ii}$>(H|bKijhVe3gaPU-x6C@DvHAF7>@CV0Nm?Xh0gz*QLJzqag&(!Jl%<eKLr*
z@)SdojL!HaGg59}r4dL%#U;zkZYHS7daR33YXjUt`nY(9J-1m`-LT7I_6$Juf!iB^
zb}`9)u29|xK>0I~{i%<3@qUwsL@lv`{lpv<ZtaQ=Z?s>Q)e|2q6%V~k4C=nN)_JP`
zIKKVhWoo-kc%ItmTTeg(%ydXzA#rC}`b{K`vx?O=j=W>J_V?W49Y~a6tT_PLhTl)q
zr;T@6$gK7Upw*vJs3|NlujP1mY!!Q5)|e?G$=3x!-Uj$g_~kG=eJobZ_*#v_1?yO=
zy|oEF?s%r%9y}8Z`nyS;d`8nYu;I;><h-yCJurE1-kSAc+xEz2){2L$Ie~Pu;CX|V
zZEIaP9a#*47$+%d+=>sT6^v3%mVSCA@l3!<JiFus-;jIu_=psxF4;_8^-wiHXrr#Z
zJY*=f=mNNZT&K&3kDj1tx!AKmy#T_ZAcw>jUEtc(51=sV&(U4PwyMu%IIaJB0cfdp
zOi^Z#-lthHjNX!EWs?w8XJ>hccUtUSoPYTkB1nPdTEK>iSeeK1fs#0L^hX~K=uBpT
za;93UMYJGzSW9%E5T>KCp>fnG{!Eo{<UJJK!O|rv=zZq*q2`jk#N<9*w(_eXHq%Yc
z?aactHi=ZBFiY5bdYLmPphEC`WK_QVB`tDkJBV{U8KwfYUG%We?3;9~A)(>xaHaoS
z&)3BWu(NwH&CS2(tE)dQ5+JUr7@w(0+<69VF*@v!2!+|beyV6&b)V&E1kk8@SxhVu
z=>lYVD#jF$>l}~ZiDCx9?AZmQZktcK!ISBz8NFOir&|d7_If|{08kju;6Xnu>&S6G
z>K5Oh(m>lmZ2s8oacM1j6Ca%D`?W1rFd9%kT32#AbU<U9J!Ez^93q3>J=$!KR$!~@
znU5$AB2$5@bn+Bgopvd(IkSZJ$lp`tUj#^f7v`AB5{YIKvjrr+AVRF``Eim|fFdZp
zX#b!iN8ccdhg#17;cN<*`$q9aEtLZ+qkfo<Cz(Df3H&4$4r9^g*`OJCq$L)g&pYa5
zm;yEB(Pl|;obRf4WeHf=14y<KD#VKTsS~(Gr^+s~>_=80XFTVe!JdjEJs^pd%%<us
zn4e1y8NK&9z5R^u_ai@~+rp;6kUya?XwV;_g!w&Bj%u&Ijf*{~{$QHyGGO2z`5>qC
zTh^Bd63O06?EqdDy6tZaq$R6_W4<B2Xm##Ku*R2Ym%Q9YcHO5wV8wjSI-pDFzZaZn
z5-XzrH)>`6I)E=C+QEOJwLhTs)N3yazgHAE@+j5RPzze56+Q3En37t`;xMrR^De?b
zBr}(J-@~OGLG)OmNqeBCR98|ej-?cSN6owtp4)jvw$XjUmoi3Dq!_A>QAvUH=+G1|
zT!v;bzy6q$xU1Bi_M~+&w}#H?onbJ4!$Y0Y4(4+OFIuV6^S)|ZR!w8SDcGBKY9Z)#
z{9y4wD{0n#tlUR6?|p%e-No`e$&mSHPyJZ+TiCp?ES($^(9ER)5{8+LV{tHHHwBhR
zhE04Q9w+DGPG*O=M}mD4cxNqTfdf;WghI^Ixtzx&!6UAkl+S*epUjf_fTU}S=4C5Z
zUmR6|aAKtca0TkOgxr;w_uk)dVP#@S+8w@W5_4_OP34B#GU>4&VdX(fTVG<IFUnwh
zz%V3Ls2;d*`m!SMByEb@my8Qq#TJqMJX~QgUXNJ~Z5f224at%e$)4Nk@vbyEp+5;V
zg~E1L-FC)t8ex!uguJT$g`L0L4DSj6uaI=KV7vT}%H;KEl<oT~%<BxBCn^u3d>;2m
z#8H%9598zD4$sOdZ_@sv)*PKD*~^31MjT28idV;KYABztnfE69p4gRCLa$SGn`>uj
zu#7Ew0+9QKMe)P(mv_ogy@#N&tnlm!UIH88#^X8|cczV4xj!x*f{48#<djFLqcGd?
zD+^gUFe(YXY8r~;u^<mGs-bv<*k@SqaUSJoikZSKD3E9N;Fy>?_5g5;1Q+*yRZDuD
z3rsD5&6m)!=GcKcc5|%PBMU%NZ-j8nm7N+A#3RXFiJVI;DxMoZ8AlH_0*;{pzPj+8
z3P(H71^yd7RO~<GSp2u5b{G5El4GDKvrduVaUE$!y3L=UvZ34cyyHv#RxY|<=_<T~
z(ptq6Ie2{ls<?1GBLKqT`i*J@_<dVpz56BV(GrN!x15(gwlz8MP$P`gWd+(6e|jn*
z9^#(}T7CV5?*4ePSgsFtBOnVIvSR3ad;1H#`KfR3uMC@cyLF4#e-Z*@2B6WD+N+=K
z`bRk;xHtgYn+xxEKip}BJ!Hw53qP>NPQ9_QMrEx75k~;KEk~mt@CaFHZN~V?LJ3)m
zaLqyw6+3$dm;XpXox!+#szH3ly~#LMx$g(7Dt&DVha@AhsA}~ti4|SP5pq-$@=Wzc
z(*yV8BGXhzo}DI_*+@AylFRJF>M%Cpi<fSBV^i3JC!k6BLY}wP?1n9XQejAY)1E}v
zqKoFT-rcYc#MfS-jURWE{HgeDQdH~X6JWwnhdKM7q87V}XhQL?w}z^nPji|iQPYOR
zM!x(no)48`U{d@g%_5kNG%<{X`%f5q6@aRGeC@^qBD-6cI&N3M)L!4jq$HV`_L$g4
z*zTjVc3<OApPAL~2Y;VldbGtaXf<A|rEP$D7jHpM)&hS&0eVzU)b#~kyY|J=_TogS
z9ma`bSH+jbUBu+zL__;gtl&u4?dd<b{&wdftBZR=Ko69T7d`NxY_yeYmv(O`K@as2
zXC3<1qittbf8UT!faV^T7%KmT@9=dfL51iNs1I9no!@tn0CgI#fClbMS>MI$_ni=~
z9Qi+f*>nQIWF&K2x<kYuYqAIsW?5;;3332TIX;!$-)t9c1?_v4JXcg>&B-=fIJ4`}
zRPK)?-wwX4z=8W5&+CZN*+a9Z)gul!Lqt#3kiK(qjyExW2E<vVG>w+g0c_QLdF~;&
zL#CGZZzo7wRHxs9BrTNZ<4dT?u|thXO16@skqn52zAa-#o`5!XrffEjlXuLmaCQD8
zq`_`1qbU^kNs1Q5k__pCH-Ls=0lRHoHOe$wzS$;NH5;k0$J(@x4M9@00+dy`KNGE&
zr<k2@f$qao`1ocP8GneEe)s{8Ntph^{4jjrX}~`8Z22YO^sO>7P2)@9w&&5wyBHt4
zc{f`1Si`iPev_2AWT<~VH`$TPR;NtW!o7LA@^pmw56jnO%H6`6wEAJs$vD(&8FeMS
z86oZ4OFtghKF+3{gr=59lzJ$YpL5e`n=KE*)n}qdz;UmX6A%5?ic`b^2|%U7fA^mx
zfC3?)IxU|Ov$E_hP%c7*dv+dY)BQq<As}?v>ng64ED0=nm3$lD$|wp199JyvVbs%u
zZDqx>lZ#{*1&T~XY-bd6Q$H%$hg8-al=xQq%p~|K9VcmrseU^bs6&}olve{p;XS3V
z+~<<}OQixRq;I`tm)!M5I;I+$epMm%_mE_%ln$Yb4OV9qPGhl3iG}ZH0#dZv7ByB~
zrTXAOO(I4ccRzT7s;8rj6kPznc`Ktyq2MqR5XX87)o{PjJ1AmGY*k2{>yv1nC&tjW
zLsSl1&5;d`q9`47ojn>%t14@-4DN*NcXGe9FiL58QDb}Ha1`+nMLe<Y1cOVQb?8vZ
zVrv!7>B#(M<T7J1koWbWc5%zRbDV_-h9{W^fjMem*Xw1h3ei0q$NB;)^T6~&TkWfr
zS5R`;o|4Q&JW$tg#^q*X(&&?(2F2b{jz?P_n$4L7#?5KVb}m<9?fagF2IsC?p{6mI
z08G;4L0|``)|k5mB8h{1P4x}<LIdw!vO5I%*Z+DrVs}Bd3mq7^_zzIWBt(d^?5Xke
z93xhH@Fk(fL|$S!j|?hjY})yl*bU>e?g<)mL0Wl&NC+{-4Hl6H30I5d{U{;@?B>P=
ztF{;Wqk*Q?VRlelzmo83u75MC#s?}MAcqlR2QqnX(<=(gZ<knT7Wbe!VGlH4Hs)aL
zZi^y%@m<sSEoH#Rv=8yXQ<*#;<X0XJTYV>)0m==LP)oSJhZzv@DG#`N#qG19lqrS;
z>pkkqBo_T?P$hAeN(%=AAg=6=QA5Uj{7!N{>v<rqlSKi0O5EzLEhIcwPk5>4!^>O^
z?X%}E<<TkFYN$x$CboHkD{#1y410+Dp%{2e($8tvg+`Jkz9)%oiAa0@!QeXiM#rX1
zdleo9CI<_^@(_{1K$@|1YHup_A9K(3`o%vR^Kr`#`dQa~yH`nFRS*k9V3rD&0R>no
zuV^6Q!MZ9w(%u}-<`@+{CM^&mVvgv{0OXJ$I_+5ml6_u8g&NB)NucOUv<oU6dj<Id
zdO8zo13fM~&*~#)^rd~t4kEr?h6L(VZU19}IXgFL-?D2wFNf!_R)IJlE^AiEXSlO`
z`@>_)suEBSW(3KF)d7pHYkuQ?3-)d?!Z-&gV*Up!39FGH<hg-5H=-1ueh{4L3}Y7s
ztl**@+2^i&AL;V5A{+Do4rpTi@eJYr<zC+l<ISqG*k|{DXJYV4%$vwmRh3xRLo#6Q
ziqM*vcMHp^9C@5vYCUM3vmG8Fhl_*8JWofm;7JS820kL<EU`VvrKShDMX|W7;3V60
z0l2CRv784>T+#a+6wCng)Km$115uHfp#RxiRgu6ap<bQk9cJWXGZE^mbuU5_q7kUd
zCo>TTr?BZp$j#PWSH5(Kn(Xl+x$jdl)I}<BadO0iwZgvVsTAImrkP3gYFG<3#O0wM
zht)kvnYfYtpvvq;>0lZ&GKK*242WJwA<v+5(rZ_^xa3zFUwj`jgRPN&LAQl8V|96f
zq0$06&ac~R!O8Dc3uZACq2yOvMD5z=kUH`8E;)1iA3O&z3j4Y#E3=j24V6RgfH|81
z%B1p7nZ)ksm*qd_Zv;@VY#?92))oYM+GKCVCmfD$Yx$Z9LQLSD3ahr^wZ6-^?c+M`
z*DnM$PHKkYA`Kp#_zKscT@lfpoQ#U4GOK3pjA~xexA-BCK0+eN;xx4Go6a34K;5k`
zBSt!RQxa-XC!1z*2S=Bn+RU%By|BxrC}uO_nbZg4$?Yh2&z|j+LIFffE@h*{pM2%J
z4|v$Qg|f~5=c6Pg(1ye_1$51OY5b7z{=_u-NnPm?Dmlq_h20Qv#H0MZKHu%FpCVLX
z8edAA&EU(@H5*r`4(iJ>$^#x)+8L^zQOO#r{@igy)noK+vNlzPN7#ak2gYuro2E*b
zxkM-nt7I60?D8fxa#$Tf9)6ANF%c#&vAzNDd}(AWaxJSsdU=Fo3bDmQEH?qm`~Juq
zv<?FX-sxLeQAWh=uP@pU<~&7ck-aY<@r9G^$rd#rD;&^WyC%Q*-g<>JClxFY@1N2Y
zhqZxnZrS70U_2*K;kdejUB?0GF}tEvgG{|P!+x=^Z_;<LmTf$FHhw0|a~KF)FXlQD
z+tPvFeN)7(RGqrsh|w8nor)jtKlu=+AJ+5%L2MJdw6{JP>l42pl5W1#ogUIJ-)fv;
zpA55R|3Exvy9<0OJRoJIY?`<i+?f~Y=DfRNKs>qci~AKPQ!vRqoZLR>OH3&ctF&qY
zaa9tSzni|Si@dvV_!{}T;imK@?W$HRB8&9^$TN8ww(2U#us|Pa0YVX=Mee|^mbnG<
zReAl!rVLjrz#|-x<7fEudj3J8yw@hgs3rSK)P%AS#DjioRNIYMaHVonDH#%wz@uDj
zh`~00zanaZmr91x8gg*~3e=Pc(!67}5dnfm68K;PcV3cJNb!}z_t{Vhac|f$(i3o?
zN`>3R=S*jb*(y7-zP<M}&Cdtyqf(>epsYACB<U9o$n(6mGcD`FMgwB9I<|$a&8t{-
z`%t4Wb<#}~5WD(OOu>n}MLv$jp*5|aU|oQRX?Emg5y?9EgtRZagpG<8aWB8gXO*zq
z&O^bbx75=l<6RfPTK>a<Z*>*(JoTA)1cfm4II|!OZ91XTD^5zlJha2=dBsIE%e@P*
z=!KK)s-unj-!HQU%!_YcMxD)Yum|<Tp=>F(I5dk;C#U^c&2^@^w7l2fNQx7!Ou&}V
z(d6?rS*U=xn+Kpjwf~gpj+ka`Ga%lZ8_e2y8A+GZYs$su9@4R?-)sKu($K8xFD)0*
zOqZuLl4JYu$Y$;VVfzKREg<sz1+dw%so#n*YLr~RF6|B5G_&A6Wo_9;lDZG6|FQ-U
z)xiL1qM>y8pHB0sI4_(#AfAM)ulJvR)$i4Ux(vX+-HndNP-i4b>3qjx{3&#M9d19;
zp)i0~Nd(kKqWn7mb3Mz$hIy?H<i?IbB)iL0sLY2I%KMczKW_Q&6+iEKKCv`d=qTVX
zL4y=hNM+G1kjrY4jClW%<O`s$W`NTNI;@DW{`F<fDEi|`jyyf){B<rK%TbFR&=<z?
zsCLNTqN_=iT4lObb=25>yK?2?(6209R!a=OZTiR<umU9anOI-)<+Q7O%pldSDy;m<
z&&S6_{~QVS1wZ%|+5PE;uQRDacVi4-xQwa<*!y~JA3_o6F|$}&;+j{Spi7+JP3N;B
zn2fakpANa_X!UR90Q7Ko<gvKd(dyYD9eaHYOhspP5is_hF|I3RdV<&l>!0_+Bf|Au
ze)0&PoQQf`R_`}tEhO(+478JC2jLa1rGD8`;pSHCIs12mlXtSAszZLGlI6P=-eE_L
zosNu%57z7bj!*<7@J`^3nf;7~=YX|I4tf?|vv_C*$Wiy*&8+j0ju*mLKNr8v%#QIn
zIZ!nItvOElf!j9h6dQ5xpVy7Y`{ekL8auy1jWJyZY);zpQO*U8Z32X_6)jXL1O0e-
z5(v^9c%4wl;snKp!m0qiuGSsoyBQtsyS?TGSmq{IH$c%F@HKTL83K&>aYewa>u&N6
zlJ!o;K`1WQ9r%?5FDFApSniiUBr1;)T;;K@p@b(&g&}=_iITIj{!sC9_&df^;F@#H
ze4XqdiMc<?3y+mYVAN=#F&U`0=&36>GgpKGsyi(Rt+_b1>F-+f9hBr{h?)<ww{%!C
zT-PB|)%@X*Z0d7gh34A2{Yf3Z{ztC}MO@exg|ib)o`3S)d|WFt5zW~y#AzqS6j?#h
zbu2*vnp$q4;RG&EZ=B?u8LoSK{Rt5NV~QO|WWz$pbP9bF;q$ZLP#lCG4&9uDSQ=;X
zgc~$~JUlzx)#ae0U7OAT+Z(_n-pi3|Zvp~d6ksL?S#85>*nk}`<2Yfl<B&agwnCv^
zs`clz^9t{7LINhvl@)z)l4!5ac;<jApZ2e(A+`X(i5l+|(LW2C5^+X$AkJ!^1S`}b
zVIzJ8!K@NS9iG@ng$e$Ulc=E!+)~6sl@#IE14=PowhNi*=U?V7Frs+F-ac~PFv5EW
z7c+rF<^_;cZ!J2AQ!`!VHu`ZD@6%fi-vjejHS=Am0k)+wjUwxH*WD>|dP(Jp52&Et
znKmKj*vl+5BH%hmhdKB7HS)l%C9@mlKQS2$^~|pVyC@g1jS>xO>_#a(fTdp-csbQo
zy$m2LFKGM@lw^t^97ni*ex>FiMdB!E&>)gD)5Q;CIz^A<>P@nMN$$pAr0<!Fn`CXI
zsziT7dwk!n%6v9M>o|OEp2Gv^XRA-C?w_MHk@}F}rJVF65jOKUlj{Ce;VV5kvhljW
zu>JgQ@{!<l_QOx7mM%&QH%+K9!^u0!^O&wQhD=;Dq$&qf+67E%L-2i95rRm}Mqk7u
z!x;20r_p&MAmEm0r2ZWH+~)k@8ZVqrN*t-(%Y>MW67{Uk1TKOUPG5Ju2Z4ABGzwT4
zr3B&~x*Wq^T^9IJUX!CA|0=V{6LAB+8&+MxbOsA-b&<rZrL|iNz0C(v=04CckpG7T
z`V&aVY7eG9Dys>c7yO+=jWj?@`J#%S`mg7XN2kD_h)N{u;kf^lIK`)wsNX9DH?=k2
z&6uqYm|)Q=--ra?9$;OI1S(-`w0}1Ft72zhuR1$KUGFRo9bg(D<tmy>mS>$u241!m
zhRQJC4YSC=e8lXSs^$VIM`?l%VqetZ4tU~Kt;g#1WC*4FAV2^N5YyB@B7>^%F_=!}
zs*p80qZ?H!E)TgfJ-zoHM8=qQ#1^$jBK45l#)*5=CuSw`RJi?}r_G_jDv(YVp;l@F
zG~m%g(ZLS)8}M6FiA*d|Mrz>5h%U*qARbM@Ehu_FLEI$fEK<ppnghIq{~*hJu{Z%e
zqO|i+g5@&$vd4AR0r9VN+C-fBH53c4Fsx>RSQm9M#z8BRy^QUsG53!g3#XUY06adY
z;iG;p3P?eMIo;%Clb~@Yl8)yD{F!nY@*|;h@jK<8n6T8%dexx<z{PT}k(GNOrqn9F
zGD#Y=p?Wcq<PS7n<J<wUB0Yi0_`c0~f8}$m>a2FU67Aw<v%`{+x9*V05$#`XW5W13
zqCK)0VUyqXDZH9fAn^>yFln~4ZPm7N=a`@_1vv!tXw5_k-y-Fb|8kjB&#G1=J*K{e
zs3`^H)7jaSrpvSOo*uZ_?CYjqTQvxz#39(sAAYL9^1x}*7L%gh$+ojCz@#`$O7uP-
z1HG4Ze!7d}K^yc22A3lC3hf5Kcp6A5#jVfxD=_+*K3?dICT|}q=f+sa(RoPvk>BVQ
z7yZYy)I1hzIP3t73*vjEgSo%os@yIC+4YHILw?doh1IyKir+-MlH<|w;my<t+5_I<
zMM#fD81IKx<=#pQZ?5ou9;Wf+WAaN@&TNOVi<O&goT$z@4!CtD;G5Dw<mp}QwmgSb
zvAej3XQwXRV6QcnXGR-lRj1CzNiYV^5keg@(5cO?vhhzS_1&zuKHfC+INsJ&uXfw7
zHJn1D!@GXcIB4`nfZhc8iq}PS-c_w<9du9;z^39ms=HgNcSl$z=~DQvE#_7d)8@vI
zOg7s#je;>35bb_M|E+_g21(Lp(*F79e`ZPCxgwRscUM6NcNh5#Ha9`KrW)|T_dds?
z9;kf4v4><>D4bd5`*i;bkc`=9<6K|fPg{XFerD6Bb7F;%zaxL}Idp$OiPR`otIqb~
zIVQ#ACOQwqTlk>Fgjd2fxOcq+?fxHRW2PbXtMeorc()L`@Y89|;nO}=$=(-DpIIup
z(@^))p!^=mfK-`@G>Nn3ck4pK4dlqVeEiVXn%VO`=N~j_6`6+iAN=G&mb|1L@KYe3
z`swF`fNDWV5S^XBTzU-5lPT<Us`{%>4$N~~^<dvA*5sxK40aF+^5{w~Am)@#dNK<7
zVTR}V<19+g5%#3XhVgU><Gqc^Fk$iUt0xM(!_e;aUW{rl3Hnw%ALjEh$RK@9${=XD
zO=9ge?mkC+R;$(e`u)Q_1((E_1gl(y=1ywM%Fc``jh=kx%@?_(IKWHXp}if}_nCLZ
z?xXY3dgIyl==pbY;vZG9!iW(C`?<l+1o2n!E8qVz0GuJbc@;=91V&1~9~B<W8x?v{
z3<E1OHDA4j)>RNacB`W!muUFdmxqy83t4QkP$3k%Tsk<y);QrIZT6PBJcwgY+#+b9
ze%8Tb+9uSje>gB82vzuuz=>~;z<ZwnI>IRX>SiOo=Q+AYp76S{63embWGSr<5eyRV
zSnmHWjbjBZ=Vz25=xdR|sa-FZ22J9*p+tkyqr18VX!KX1n#mAa!M-BPu~7$?)zxBo
zA?SsSCgqcSFp0dck)vArV(Lez5_;<Bei@K=R!@|SQ_8J!ZA~O=O}bL#(ty85kg3N-
zxi|Dky2HNWoEur!02pjgOLW9(ii;tT8`SVM>*sbjS#MG75;0YII?&<9?SyKAw#wib
zcxJ8PWAu9FDgqc_2dB#l!KlGlAF`Oo3i=iHFk18s)%iq|_BdwE%6G{9w$u+}fR}ro
zjUk(HQ6KE1_+p>^CJk!uGJ@Vk!fcK8|6-55oj}5>oBW~rr<doCROTH+t}-Vc_Q<CJ
z&^L+z*g&_abO}W=k5Gaw^-m($4P*{kYL0<STt(ZfqT_K5KYvZl5}4{*`da?i7q#n+
z+bL8;DuQ9)zO*raO1|okbOM2j&Ujk*f+l=s<c@e)0x6?veJaSX^)_vl7dn6C#}T}B
zuwiBFaO)*yIkhsAqP#aZt`_##R+arp!2CV=Pis|yct@cf;!R$5y#@LMS)ZEegVj9Y
zF*>(3aa)k=Dk_DlhX!E8WpqNO>toY5GArY+XK!1eaXvGND&{A75!v$c9m3<JYjPUv
zP4%!A9!z<fNgcSrZjOSpF}5u&k(}RYcpel3awvPgKR13Kgg@RX95mX9a?NCyUjJ23
zY!O_n5o?VQD?*GELa2|uj;f<G3c`He{gm!F&RImf&(kTh3Dv*uUfl=Ul#P?Ow0Lqf
z`I+fJB8;c-MJsfe<@%~gzjIc31WfvX1acEW+jcJ@s$?T&aewggUqysr6Qn8*%eI^U
zkY2pMkgD9f3}Nu`qlkY1&rnI&EN1aCw?Urimv9pB_bAJ@l2KVCVzDv%s<q6@1;6#P
zFG?2822`@>woKPX4et9xJ{c69y*YO>a8GcIQM2cY-${zws`~l@=SfFdl$V0j155LG
zIPJ@-ZH=)$QRk3jFxL`^CnsFWX%8O^Y~oeq-c*Za+(ulH+{gPf!2BpDaK#sAfqwq&
zJVIn|{Os&_G;TBViGBx>D_bVU8&h!?`L(R_49ZLkxT{h577l-U0mhyHpf|6JNn_@^
zPDZ6}&X1)G5P%=g`cc1pQ+}v)7z;cWdJ6Fwcf!0vkk^<$Ju?rQ;w^b(*x>Z_z0sWv
zpAR^_Oqg5&NY?9FII>>$K@Ze}yU_{jpoM!{^43<#nnu9^WgPLaCw2DFi{Pvx@T&jf
zL(CcC?U2&TQAg4`<R2{%#^<llS!_KeeeuH{5QW<#QtkOh?Ntd0331^l;hXk&W|}y}
zZ(k8A5H50v;Ad?q1qL_#v=*V`({j5^b=Ow**`V_56?b>hHR`ubpP^66gm{x9q+$jC
zHZ}GbD9+bggR1^{za;J?NUZ}S;03)97$FT9a)G$CvJ%Aqmy@3@d(-aX;*|cSmw!y-
zxOe=3L+D$@S@?Z0LZ;Dt$o0B)iP9p*ivsO<G8lebYoC$rzpPgSw31f5BgExL?Xr%x
z;XPvC$B|+^&qCT#>-C(05&hA8rY^GItnnu2iwMwm_Y1gJF}49C+M)wajJP>UTg6Le
zqB0Nt0sg42&yR8N+$)KPfF_OYfB9flB|rqBxlWC$NMVrsG=Up^*pi>F98bp&zs5b@
zoh);Ot!1KICHB;1E3VhI-|`%y8;Ys>MY=Xv>0CGnd}H6P`s4a2t}4(x{20%f$jb1{
zaO4mrOu^Mdmj{$m>)5V&zUz!iTo}9ST=>zX&Fy}DL0UG`JKE&6z>@hL%qNNVIB(Hu
z7*p0vP)5Mts2eENT*LZKgCh*87G%*WW$&>+hyG#}B4UV6oT+S_8!jsZWR}x74V*>k
zd}h(DfXUnFTP(46)<><J>CIe)7kiSpY01w{mc9u7U3i{d0t4=Zf8|E}g-m!ujF+3F
z$#44j!L;VW7a;r=EO4%p1Q>B<+c6NixbQ00%}38)Z>#J}60p_~g;Yo=blk;eq}K}%
z%?%c&<6UAF(W%?3ti)O#*CT+Gsp|SJ?|WYZa~lsYCL?8mtoR+|Ck|Ss7v7c#5kxF@
zS^Hp=A++iUZHsnj_*z6;;%=x9tA}?Ca1M@`*GvlN5V8Ya6-wUMNdvSHmwx7@qF4+I
z^N&v!ceS@le68?a&fH_I0en*~aDo0{Ykm@2ls^&>^&J?Omk<qIp5KF+2<Wj~-NHJ6
zPM0O1uQEY+GiJCC;m1FK|7y{dAGFBsT~^WG7|Tz6q(?so8B|#eEmo_@_V*P(L^a<Y
z+V4Kz#alAW^G&HS@FW;dc}<tF4zRW{f!T)&W@VYLJnnRdL3q1>5U@xkwk5akB-BQa
zIg$>|CO6MMK6$&qDc9St18ZSw$)s@I9kvbW-iGa^ocXk<!hLr{p*Yc`DJ!=+s&70F
z4IFrO>==0pF48gD9u38>cf)5ZM3T4Ml#3o0wZ5wP7cJ6i<z_Ht;6*A`3X3l5M4;DW
zR3Bs|BsSfEUasTE4*~;eV7B5}{H_l0j@7YdTlr8NUO&X>e#k~3b^5n}(?0?)E6-a+
znz~YOg#vml3dqoct=y~9nQ#WRjPJ!-7!<oWI^vt6R!v?B!sOymlG)+TvY>=86#bZF
z`)<y6>_QtLAD4z<GkA=QDNk);ew$d#UOubiW9J-%aQ^8KZ~91sr9nsfQ_~gakfBrs
z%47#w4@1zU3$n!FFlKXn0QO({vhi_O2ji3`g+h4P+(hbV)?enwAQD&}CK;mEH3>Sw
zMk->SH}a?wilAP7;!_@gibSRRoGl*r$2CGA1V^waCcYF`ep9AT2@Fa)?U!N|z6w{B
z<bn~{tQv|XQDDkV<}oVc(Gd~d6g}KN$pYg1m+%7+c{hC>w;wu?RJZ-t5W}>y&~}e{
z8ZZJ>H>%buw3o1dr?|YH<zZs3kf!nxIbT*R86#ki*^O<Tj!a%uDB?u${oJ<i*p2l*
zdtt>inrbSAVM2~EcIvS+m|$~lZ?)5ymFKyJ_@y4fGRJv8Uw?G=bKENz4<O+3+K&xN
zRIxTbr~d0^dOHCXyl(Ml=RdP*%=M0>{A}H}QtQb|F9q={L$@`bd*m3L8`{PX7%*MJ
zk7Nz<Y@dvNdCJM@yfLLqQS5`PxqDd%ObD7&lcvk@S)}+@B!`IL44O}C-oms^Z=r#2
z!f})_heTJqeM&eU<DRhq2YBfAlkfK|5~bu2FWwo>iI{gJh(B|Bf?)8$A%Nw)iRoNo
zy&Q|z0V3orhyGC6CS5o7iRP#~93j?!n(KMooWUVIU=3z0sIhbfLF^NEd>_HN85aPo
zl6bcE99SI%5ljbQ>;;W}n*`CUY5-Q`jo8mHJ?5U|1qxubw~4AgBHvOhV(aGr2Dc!l
zR=PyJ)>+7Cx&0vz-0C_0<)ZE{6vYl<fIa8IPe-ygsz$8jeZO4jH&>)`0%l30!MOW~
z0OinmlwnGTPfn|`scV0SPxCrQ*%J_MK|UM?{wwCb=46-pT=#)Z(H_;CLZR+*f3YeB
z#duH&&yx{1>A(7J2D4y|xlP%a=YBu)!)KARI{rDz*$-a%#AEh8e1U}dNFT6*1P?<@
zv?i>B#N&jn|0cA#=|PDYm!IGXUr>tjmv90Jv=aLyIoP#B`~rVXAM{l)6?YRbkbe&o
zonGELZrI*r_LP$IH4$`JniIN!Q0OvAmr#V-;b&Iy?n*sw2KSeozk`4SKiIN46Lak^
zEQMEne!#o|@O*k2vW2loz)(Sii=Hhby~T+60i|j-$82t`TqDvi@db0Il=XwT_mi4n
zrH3D8g{@-+>;oyZUindA8bavlU8+?bO?xszRf#Y6=L1|z#ey?4kPQ%wqym;OhaBXG
zh>sHDz!qgO#f+gH0*+M5J)9$WF~B8io`)O!8>^)T-_NtXe*eYY5966p)rTKJ19vqD
z2{C)5c?!2S2BR=-0ig!>-G;3jotH(woj;3r0QdpV{tTQMeF$uO)Un+>76^ON!>p*+
zFz-|j(CRJLLT~@Xjo~G4X~5h1jrt=k5T+{R^6x0wUV^Wf71k%~3YCgT?AMAXC?DGl
z22%s|#1G4Z*$L{|3Fukknp40F00h%Wfh%Z13m{+n@=H}00J<EnK&ue{0B*}jU?drG
zq4T}86%*2RAiZi2Ax4Nda1SJ3ocFNK9(YmZMK~a4!0o<M_h?ElM~8paL;k4Sniuft
z@a?AU%~2l8DrNF7e8Ad|juto`xz~|cwul2pH^jVa?3gTkOMb_(zkLeKzB-RlZh7?X
z-gft3rb+5khTg(eU!N8&!7OBwKOIdp#2`Lr7Z^ULcmE9Qd1=mKYX0_fixlFleeR3f
zL@dPpJE7$zn;JLNrr8hyJ&wb0e+Bavc>CACwV4(8+4!8lTWi5ccc~Y~b^s}hSJ$Pp
z(+dokOj9(7StW@@@$;Si&&6tfDDm_xh}AS{9+D!GR3LGA0d7bxcr-;#--Y7O6j9+6
zab!I|dWgC)w^n0{*wggw%-8k5UV&Wd;)e^Hf_DJ`Rjuzfm^~7rMv3e6MgybcC{x|F
zS-dBrGC@A#-FfRcQR{~z?Z|QX+?U61NfVV5choP?IX#;@PFhKTk%=SFq>c8OLW#cn
zfQdSHURY=LCYh0{l@#%av%e2DlB~U_gHvYWW=)>lfYi%=9zFv?k)OD8W#`@C;>N$~
z-nM1?OyxXFfXVHXxa-|@ReoJIYacSti6F1Fqn{#Sn-<Q+_0C5oFr*P?VZM+#Fxjo)
ziTq=JS_eIY;{L@@`DdIhWxsGdIej{B_uT+<IO_O1A<-*dL5e)wD;RcV{`jK>6N62U
zs?0r-<c8_+Q$A=n3?6O4G@gOt$w4!1(@Kl;BNEoBF;*#78WHcFyfvH`3?Ssf2q6IS
zD0*K6Oin^zQnJtR&|O*89_IV_Rh)g_t5eIfz8|uH8ePu+H&aa=1GwGiA7B0r*zioj
zRDl<T6N3M?X~uLXpe71_arA$AI$0DE?^%F>C%zAXAu=GZF!83H51R;;MhzEBQOkcw
zMF*v<va^<{?_aLv5ra2XBtH`aQb-W;u2~|IT<RAc>qJH!KNuceVi^V}|8UwYqc<$u
zCWwN9Ttk+}F2u}RkJ#)W{WKeO7IPJJ<jkuga4ZCJwt9btI+@}LlyfdF(6EyQCKfN^
zt_Y;6m<Qr}u_oYc0cqgX0;=f~mGQXG%qepmWIKgeTh-}A_GJa1HgoOQh<j<EC0*fr
zy+VTUB%)@wE8965VtSZx)kl9#b;$hI=|Wa+aq;9&R|_!w!S`&nYIi60vw5w|a$XO`
zgHtQ#xP`%#UG;)@C~-YJ)uf7m)?9<rwopJ79Ta`tgGEoA*i_OM>`>O}v*I-EXCPz<
zEQUQ0jHKVMqAXa)={Ze6z<V<mwbIj?cn|0(xVDBbmpfu}73!s{9N{dmjQUhUUTBE%
z^U~o)Rl?@tJ|E1K6DZ*6c5k5;3x`<DX787$-MlT7J9l%)Zy;1<<L7_+VF+MT$ztX(
zFsVqV#}M)R@xA8<wwuVp8O486avIP%%`xOBY)}w2uRuv-0quj{nyjiPJP(;Nf_w?j
z78e^7W3FBdDaEwEUdOVS`tyIL$ihB@11&sx{9}%`xDOoZg2>eyDN_?GK+zk_#CvM%
zDyTe&0Yqyg5LQhaUm}Hia~CKdYF%w6bcfkqDa_R?!H;<WMtPC6fAZx}acg1ZBSXJM
z*3e>GbbIP6^F;N;Yc+`wJKv=v80K{>es0v!jQQj4jh{HAuUQqTgUH+E?cpeV1t((6
zM}O*9UEQ~yY;X?1{q$3)l?h`cOOvDEP%*yb`_ab_xzm%a&C3-ED}x|*R2F%#Rth|1
zA_(J;isy<fD0jdUrh?EBH>*tFMeueX-ih|FzM|tWXq>NX4J_JtPPcCovmNH+>+C=J
z9p#@o+X*K9jD@8;gyRODlE2&<Ttad;x~RpI^hjgCmX#{9-X$T&eQ#ojGW%SdwY8>4
z^t}!GH!ET^z2WYnYO?uYSa1ehd!B<M5<wjt|3U=RZ$5I07k*2T#Yp{PIWJ&RGg9yT
zeyIn8EuNn+X^s3{c)=&{FJvl$36-}XHF6CjHn+(yv9QkfcVwPy@P+Z)E#rDky+`j|
z8mC&D2Yy}r2)J6L#+*CCOnTN1?MiWQuEB|T2D#MlQvusxIx~(t_?e3IpGk30n`{gT
z;Se5k-y-&`;kk!nUR_M>dkIE^-fSPui$q9hVb&5+*FA`1GF!wG#aJF=$|cJEVAf@R
zAIL~m$mtOgay;<23KQX_ei+V7`?vbzl4xVH^UZ?AmSZoKgty_A;v%$?#5nL~nwrf+
zW!pCg0t@arwcZ%lX<E>a&v&O&BQj-q;GL4aHb)I_uH9(?ohf~!GV!R#2|y2-{jANF
zOk#-|RuFEu`*bVs*p|dOUvx&&r?|2zm+hu<1$Blew=TG|$@QzeAGjx!wyXW!kEH6I
zwaX5sK6<gE5T4DbD^cbWq6gmZljDCG8#{wHRlVK`JUFSCPm68=Ekn%Kp-#Cr<HD#M
zTpZpj=FU2VbOrY1HrL|wB#<MrtwIv4U$04Mb5+=qmH}pNaOT_CZ{BDauzRKa`Q-o5
zj8se^MK_$ljIDNV`?Tgr->bS(Z;E?lbh6XojjCzMC>da7@}h)W(<1XUr7jN1?ku_v
ziC7=&qNTc2oog*U4pOUhiW2TM19uzyd|EL~;B9PAmvoI^i0u&=VApf1Tc!JRV0RJA
z(RIY>&L$09<G{R)FSHfs*plejOhpmgsu>L$*7usqu<ws2{OSF0h}=xVC;x}N_YTLh
z|NqBxMYmOvoidUU5eeB>R3tMqqNvD@gzT$Al+lpN-g{+b%S?-G3RziM+57uEyDRlB
z=Y4-azu)os9lzsv|8pPLo$ET!*Ld#ncqWqGwY!A70^PqES$3}+KZMT(V0ayT`x$dM
z4@U4DuB}Q?qfrs%yXKkF8Zk_1q7-P?+Wsm&<qc$DoexYQJwMu%O0zDi6V@;MnN_?B
zS12hP9rw1Sa~qDF4kx}cL@Lh~6d2s(8wW-7vEUZ$({;@mPc{*B@S&Q1=@Fr!4i+#!
z0MICn*hFTzk%%&@hnqs~c*?>djdK&lfU?g1-Kypzk)+O&1rTnL9^cs*s#4pB6u?eT
z9hly`ntMaSZ@NQ9U)otF@l?!9#V`ev%GlxCM|zRYFwK20F2)}lWDPvCW8a-5vaGY0
zn)^SPAt)Gsr`yT&AjfCVKl`hhLLA_Bot;>_)q}czO4A^H^!`|hD|Dml#dz4{;?-Y`
zC-!N-9s;ET5eH6=URKku5;Pm7G+T;Wv<o>)CDG`>*&uynr@Kp|^}g}678#$N{qJ|M
z^HoE(s|h;l4|A(B09;BQ#5nFV=*(5HhdCKQpB`5lr%I0R(PN%aOnhy;TmG+2ln%<z
zqR&_R-c7>ao7uFd1hZS;Q@c67{Z!|Yn8``@z$>Quhua*7gRi{h13y*Xfv~07i)X+R
za3pB3$|O(KJj%rLJ<HS-8<w=yAAZ@!9i7y1udD7FMCB<Dt7o55mF2d)e2S`RdOrsQ
zsyG9UDKQO^4o_dIrbavkV1R1@?;<rS39sOZC;>9h8z_6BFw70^vIUi`*Axta!Dv?O
zrOl_(WyS!eCEmP2YjqWHA)WyHh&?>dDk}Sc(>^ga9;>`sdf~wd&gfad=yEXMdNP2M
zs-H}BR;5(2>tY$H&a)xNxXp$}C2S!qf4wgw#f<V&%N%*f=A4J-IdbP~K83fOs#O!d
zNk<%-ut@%I<l$9<_XldJ3YBBR%yyel@j>r<tLuqW0xR|GcbKAfwxheXc{M7%!iQ;0
z-c3LVT}J#^@)A<vtn*A>GblW~hD~Qbv58E?#C4a^@%ArV`yaH6Bot{KBq@qDF^c3A
z$iYgrP95*iBf;fs&WyOa^BuB#za%=GS2h->p`=at8M4)_y7w(goEEI1z45^$q-GsS
z9y?faUb+Gn+cDug4H7OSQ=pawTh3hZ8Rw(qRk@0<YxE#u<kU%)?5f+QgL=?XUOW|y
zlmV!j?-^GfIauPY>nA?@NTpy`ON4(lgog`F7_D+1tGW$>i_&#*r@jP9ONx%s^v;pY
zeexVH``gvUlR*pVahysC@?KOX*pbb0Dkj}#i<<hV^7x7DFRG$h^4kNO<yAgKrw>Cy
zfo9)wLe6!&j;nz*ZoU9;6XPvPEDr$}o|nw`#->0`c@Vn|uO!amaw2M-vb|KV=E!d^
zQZyNf9~Xr%(L|Z~g6G()%G)$S5D1m5pANO}AxhivXoObO)(n@!Y7@e~Zcw@-UFZ4^
z0(sOw0t<-W$P`%$U2PWt*<vQz(RA|I?U146e)|gJ@uxu;d`71UoC@|L#EkA@I01EJ
zGI2WIM;>abdY?SeG{)*vmPB0klagpV2+r97h%sD9>yZC&yCKI^O*9-a>{{hZ_P2~g
z5WFaqZaSof$J#+<46UjLfS{4_Dl6MRHXXwITlm`|PY!5oQkbfDrBa5L%6#F$@EfO6
z&lxy6pIRx9+5@6X;H=9+42^#~G>8|?nmj4#yaJ9|0gX4IHgB`Q$km#F2-A0tC#0J~
z)vn3w3+iOk5xkz3P~(s@A3Wnx_E(}AWlZ~{SCdr4>X)ji8|%d=Bjq$s8$zo7xW0QK
zl?7$6a!ec;)A{l&qhNhTZqn#kh^s9_(&#p<qU!1OMTr9~P-azB8<$>7+%Zl(b?8!E
zDM@qybVQ7{ge9@pIUXOU64ya+P}17HLK~npOMrNGsDwBsUhdBI9gAIyg$=G$4^J^!
zg9Y0;SwhvRcIi~DmUs}c_26SVJ1#kA-(kA@bNoUt1vp>Cb808n^&h14S8rlcf6-us
z74hi2k~n;kgsWt_{-oaRL|*CY&wztbl_WEiZPHnoXLkT{doFR|cjaevC7MbTweKYm
zJSvuXfBcAMXx1CQ^@yRn3PJ%+>UnIxQb4quwyv>0Q=9VDUG7}gR+pG9-$fqlGzg=X
z&T@<M-n|${Aikn~hQwhp?wFXmne0H%Q;5$6jxBgn=dde0c@R)xVl)kU{1HSNUy@wB
zV6eAffRd9I+EH?4i;=1iV4EjU`N4F)$+%JEXv*a;zJ*O<YRBRMN+e@)FGdU;ipesh
zicb(}j8}QZvcfg!{;8FxaR3VlFMNrgdW0J9DUjsm9i$XdRz4MAIIVr0j@W(hbZq*q
z)uEyUqd-o9_&vU36zU^2YV*?1iHZ(neGC>#k1TadJjqddJ}EgSk&f7qQ|!J^_aSFR
z$l%wKs^9h=nL1+r3V`WU57<XKvP`x`uW@Hq4BRVIAr72Z7xT7e%uuo*;UJ6BP)OKT
zG@#~n{7AW)jzNFO?a-4ZItfMroHc{O5oD=Sst$f!q)F8FkBNOo#FrtDHa@@V|0pR{
zlAQO{vqQtm@n_7nW)7%wFHGw~o?S#<y>oi!ut-nx^=NEVqA;7OX1s~iqMjCH%;If6
zd#-Co<hG6;w*ApT_SOC*4svVPiFZ)wB=6UJ<Nk(qY~=0~U@R4!2Zvis&HZ?Uj~Szh
zI|6zz1zz_P$bIb9o%ZV8sYb+PLb%6w3bNwVorqf0or4yFLe8CwGsRLuUgT;-Tx)z(
zA63p_TM}G}Taij!(d%L{Cj%8*V)`nqvX$u?pUB~Pow2X&!?*Rmujan9h&}6>GxVZ}
zWapijeDK*9dzWe+5NLd^hpCkBUHq&f8xQ)^I4Ssk$^qZ&WyA~#a`#E?0I;YvHc{?|
zNmz`@oy5leQ!8vQYpAwpa)_x)cS~ops1_U{xINOC(7U2qD*X1GR&1*0)Q{Rcccjfk
z-<~RO%dhmyhR027JSn~Qm$UlTTT_9hd<q%LyN>IVtAA8hXI9_(vdLus+Cr`CYrq)n
z<G$n!eQA1*S@E}hy=Z3d$}sRHcHX-!O%>G`3ElB}{E(oTq28`;E*tUJifScBUFh9`
z&f5-d?1MAvI=kINmOaYUi7WXK4<M3**Q)2}m7$nGk9?c#aHQr=Wkl>cUBuuX^W<^6
z#QK=T8&d<dbL}@%^om64Gj@n3yh*w~2hv(;=<w(Do=pGuNUg@w%=y`0977<43jw77
zZD5=AN%`a+^(B?tP|rpzY%*HT(T*{xw1pP``8|LR(P^4ehL}*^iP%*Wx0+p)9)-h+
zN%Fl_K0fV5e8d{qQNlnT-QjGWhB+4ZNGgmq9mM($N_&ucWe@wmy4JQ1L&CHK2K0q4
z)3Q6i_&T5_IMgtUP5Ii63C`}?Tti{2T{-6O4ZmVE6d*ZQ&2A4I7EH-EP&ewtoF$tN
zB?5Q|1|p>~Ubg!aQM*(cY8B<9qulDwlXv*v@<MxWYl8cQyB;rlZm%H?v~srqd@zE>
zquDMBNjCr#R}e_b_et|{<1%p}w%XKZ34Q{Gei02dn(`)Bcz&USZzb<%@zmX*e&F7C
zSCau!UvlItu;RBre)|E3H5|G3hm&s!*3&6J$gFK&IQ8B{jgLYI_~A|glLsO9jWquA
zYM>ipCxSFU9w_~XM-V<Ef!31_1Y}}p8`!FG3*WG=I<~*~wIb&$yJMNj0T)76PnKuh
z%=)q57Z-8cCFV<mA%$EV82(aXyZE#+&)Gd!eugCZ769G@a2;Oq)xN(kb`!>m$Y!wa
zIaHBW&0R+DF@&Lvu~%ew^^qSh`eBA!%K@J$7M+o&{`TCTsDZ-?^9^Gxcieq9Ku;P%
zDA#?wEN{>6cToxw%#VdKXuVPF#~Bkw%U}Z>)t*;d#tQ^$+aUb(AMe6NLKSsfeDRkv
zLf`#`b?^XxRtx}A@&UTnbjTY7XBD~>9<gR#-1+;?v7g|YE_4R-{|a&S_ZOFy&OxC=
zEEHE2GtlVSM<8ga6F+(<))(cS1q8Qe<;#`HKfK^M@$~1m2Bg-@ZU@v`Mglb1vFD2S
zGJnv8preJ>_gaAU-xB~Jd7RtXO^SX$ME;KrQmW&i<n}T2fIWgLDOT22$X_EEZb;Rw
zHz#?E&+k962a0}1zP{x3PsB$*Lier!V!nyb`3QAh^4PXpo9Rk#^!i%9f5Hk90jN~0
z=8e4doBHwTnCy3msAA1!1Va$)WNKz|9;*MiIrv{$0C(^d{R1&$nR+n@jtvAEr^UxX
z`Wi#Y&BMuSR#Lx{4--9dbG=Dr!RveZ{s||=!wcoP4G>xCO4<eO1Vd14$*lSD%l9w*
zv?d`4yIClj{SL`W$V3)}of&R=^QHflv)?n5CeAAWGg57TeA^$s+n0*eg*%Tv68-}@
z-p7ET7m5qE_8$LGD>^PV^(n~mQ(rcp@gH`z8!oCo;w9gD*kXOnP6Xf5AY)5&(9QY?
za?Y3|zdvIWZVm3KTjMvTfnC3NR0>D>8v5xk0Zdh&qvA>ml<Cj~QLz7kuAk}xr{|PN
zz5WG{=I`}z9K{?coVcF%`ROaCw82N`efds4+`sRr=IfXr6!D#GHenJFpr8(amOuQ7
z@GvoAE+eh}b|YMAh4qNan!IH8Yj?!gA5w0Gt2)+u^%Ljshl9h=K*g~;)YQv;6a&*n
zq5*LIFkxI2e>jAvh^r%5wDUE=Ph{}z7&$610QSL$bf$i!@}(Dgn+5bp#&y|veKX%b
zVeMf#-EUg|NXOT|Q_kkOU}$MEhBV&Is7#XIi9;F5-cok9JmmV-lKXnfPvt)POK;<Y
zt!xX18uZ)p(Chuwq*H!fC;yO(9G74<1S`asU4AEs@LPmPqm2_Zso*pVj2qZgQ_tk>
z*;97$4?@oZ=<h>t>+h(*;0qqZAyp1yP@hmNBmB{A^J|bfs$wJsKSbI0qXGOpB{<`5
zM}<ELekLx0#o19)hq<v)(Ol>xXefj#iPOS%Y=7V@N<^1mZyflr{^;+Yu;e=c4*8*R
zT{v!>Di?x)cj$#u3B4{SeE-6_6m$!MaV0|RIe(yWGFcS_qV^!-U%)vlIz%IQeE>BT
zXHRYW2POJv{M5fiGq#x+k@Fz$uLLgW0tlfYl=&j4;-^LJ*#1P&50K5>8z}sJ#aMF6
z)*SPoYY^Ic(^Gl-4FJHrL9o1`Nbn+36okOW?cpe6{ln4Q0wLJ4<oto~*Col%pTbL#
zGHqJ%pHMFU;kTs-IFCgI_3z7GKk{EbQ9go=Vxk5r{`D_ffh$^lS~0(K=%3ey!$4`7
z>bdD)+JE~CJm{NaEFjnUlT7;KzbT&s)*!wUO7Vv?|MR}i0iM@+Ap?@%Ir0bF{<<4t
zdEgq{yDJ9&`WK&JO`ngOt*4QG7zuye12pM^YS_o?pt$Z}{KGHYKneJD+~WS3WHwm-
z-(TV2MI_PjTCd-w^agj64(cI;K7-(Q>A1n#2-v_lDjgK&``5qtFH<0tw#w*e$6&r>
z-?e=??-p4_o#4ZLG1omjB9HXG6qixpQ*F$`KSuhF%!_T0@PJ(XjuU%d?%5$7nMdAU
zx5tA^d|)T1Y_x^+4g3^l>GQYf4G0HvLqobGI+h%Z9XpqtJXaP|FLnoyPh8Ur8(5yx
z3-g2_PWXgmOjx%+en=s}chaWMyYa_A{>SgIh8ReWzFPv)`j5Z-;n%ok_;QNRha-P}
z@2?MF;>W`adp+n366&8HjWvZYHy<C{XiL95+W#>yv*EOa{J&X7cs6|5l*Mkt)B5?*
zFXSQj&+ov<w!z8%{9UFxZQzycyP_Lk#7|$pzy|#XbVb`K{>=_AaWj2vXn(Q$r{8b9
z{DTzWI^5~>B-;2Z>nqHnCYvxe@7wmzHv+fzm<arU<KyN3Y8l!jg!5T~)_?tr%ie^0
zjSs2as5$+AwT%Cg^G`JMze3K+oVc2rn)FNWBJ%|xBy`};ojVUqLa3!t`3?7!kZ}G%
z%>8DN4!A9dW0tnew#BhW(kFFta*|h2u<GtVNCInwM>0J)5b<^bM_XrnNT1z&r;<QL
z!~WL|vckfZaf@q{OHHq$86CmCL!+HcE~bzC1)asmVJSKIiK=!zY9Ye`!RZv^2`kPz
z2garpvh~B}yPBB%$&|w)ihR+YhKm56p&Opn5yQh=qqs!f?iKOa<TBgHIVDkxGRaYn
zynio^H&e+5H8;HaE2%r|c(sFimo7b<cWom>xq6Wi#4ZBXlEuf8f}%?9jU;%uCaSv1
zFAv4^hO(l`qVUfkkuQ#?&yX|upTxPiJD9{R-tqBiO3B_?k>Fu-k8E<T$<c%z!@60_
zr%Z^COpSh1q6E$OxFEiou3uEKb)GrZF|=F79*5`CJzv_7hbd>TGmVei>5{Ou{%Ga!
znjL*&jU&5>v?SX~Uy;r*3)(8rk-<}LD9<Ruoh`fK^rXcn1+t%qKJ~B{Y2$73SGsC&
zw@!)OVGMn%x8NbiK0kRvMD`*Zn;Q{oFRwAGa=M7;so;G53Bp(hF^!p}H@@?7Nqo1-
z?(LH-8l$d7mA3vypmW6RWX$miDUXK-DJu!QRt-5yaf2@^DvG=1%+^db%j7NAB1Dr}
z%d%k>@mrS~x44_QO5DQwmnwzr927tmi+xg{NodTK@$oRnC<pg8s?xtq*y3={RFP!9
zQG(`%*frA0=_I?Ra3ujkwUsp&d^eNLk{O)`MQ4*3J5l)z>jJX#O9M#&8I%7N9PJ^m
z?)&rOwo>dhMB<}72a0tzU5wSR-l?317oKRfY!@2GlwDx?K?e1XuYtYr^1Tih3pRW^
zCPaQ{NU*NK^PuM{)e7mF+6~F36DP|wq)Uo6p(&UbXJEfm)EvNYg+zoieJLzOO{!bh
zDvERXuk9N(&6aT@TAps+u7r9cb$Otx2Mj}*u;Di`QsS&;CO23s{gtx2itt@}MtUDG
z;=;PyL^b44cEiL4mzvz?90*E=;HJYt%Ojr%rqAaqow-RQ`8f9!9@?eJf!b9ap4Hxk
zhcU+Rwem|ybW$r89#Ay4+UkkDny-|5-#Jl^Z{mD@?d8vONqbQ!o|hhIH8bKfJwD+D
zo^#qtpSU^B;FU?Zn?AmdrlucgfJKp&k`2=VBC!Px4v8}RA2*-xiYA(7`AjjVRVY83
zQN6Cq|8O0HYVZJ6zBvjeWkQ_o4e_R^e0)dZcg<%g+Z0>b$CB*NMnx6x>JGTA!Fe4;
z=3(`L#-PN~eFsyWFP8l|Lqw?U(T)b$aP_)45h}U5YKA>1o5zN0!X4-lh%32adGOnb
zW*;@*zpWAQ^p>H1*l<?pvx#R)4Y7-l23230>#LyM$0F#11|hivd$3GgxOeZ~dB`wV
zKEZc<I{bRVIX5I+iFE1^aWv5)X;)~50h%@#1;vyV)yIIrx87H0VxmMk3v<*CRMg$P
zU1~;>MbTqyTU$|4VHWt=5p6YueQ>z!aXu>4c$hZ?Puv|1(v8f_NSSeG2&Z>UdoeTO
zWuM7XAxC?}K`JoS9&}X`kqxSaw~@<U7Y{MXcNHYa-@4e^lG(=Eg?6m|N^q)9v&Yo&
z38M)gwV&5{a3uVlMCz7n2Di*@Y_cp8ti1hL3SQPZoBm`j{KhGDA;4mG7NL8v=N=Mh
zyYDf~+ZD!Amx(2vtl(7|rMe+DX`eSg-Z$><<iIwEK|57<*vvk&5lY2#_W1SJ$$YtY
zhOP!)i}CRdP8u7(kgo6|?sdWQb~JK51d25S%;y)`ZkLr(-$y-@Ac|IqPxQEUuY|&4
zgd)G?4;21mZyf%x5!{m)HB31h(+7UD-MO7v;*xxP<kZ~Slm_W>4nzyBTxgf!aS~f<
z^lZ^Q8=siPpH$@VlX%_~)k}UD;|8b1ZU*Ks1}kyJ-;Qsc?8C!>#R<%VpAtPdjeQpP
zx+6Qs7N3osU3ytGzUZb?``s(sQA`w^H8B`VnQ0ow!qYKj;_l`lWq&dGFFW314x?T5
zTB~@aqPrP8Cx>?a!jwQv`l=6(p3@?@r|&Vy3w!UGJi~)2=kynQ%H`p(t4Sca;CViZ
z!0}LrBhV7H?7+8X^C#u2_;S<Hx;gq@;1s|@c&KZfW=&2nBhJ$?Zts#YzpJXMnwNC?
z2-=YnPAh@@aedf%)D0VmnS2&;xF~LR$Jo%0hV!~ayRN+E>wa~1)mqCqCZG%{PMZY?
zw$x7k7oJbtonW!D*T&KOF8?C9@L6$XJTS;)N#+Q-NM{~CnyIWF5<1D9t=3_O^m?YZ
zn<Tw);7*Zf-QRcKTZv1&gnj+Se!EQNGr&<jvx@!ra?E7xa~1wdGdt<%5_Y}m-@_Y(
z0koxrr4LZswX1VBC-;pZk-2M<$_p9ZRC6hz9Y4tM_YWMwu+DDn*mIbEa#<42!%vdL
zR=Dfs?%{<&-I0(Ply2wgN<Hl~iHM!=%ehF5W;oT|hes(nF|c97b?{`aT;<YS9)P)Y
z8R?h1%hI86(*tmZL}*ClwirS?ZF4$|g6xmX$=V}7Sz~TW3AXQs1bB`Kh<CI8%qa}M
zHqqYARKA}~ST%gRV8Y#$KBVHdLB=6r|0QU5-VVJ?x>^8#$*+6PTC(a1OwP-dH#M1=
z%<}guiPA7}Q9^l3_#=2r<>#P8B1-MjcW2x!&MH;9_H&&jf`R~>4$;)SPfyYzVLdb5
zsmI`3t<9a?c<O4}Gr(mnhbpzMSEfe&ZCO$IyBY+zrX{3M-g8a^3>d?J_n;eZVPfY<
z$0ZI!lC>Ogz%~Dxs8Mn`h~#h|M-&3!g?`(`z?7xc>jWfaNB-LV;IzW1%VF;QLJ|@T
zRaI5(&hATS@`{YZ4ekT)N%T?7#T~pwaq=CXowxzGV5{8+E18ZPu2Vv;6vgwE2W(fL
zN{COwpoW`NnrAY`%wag$)mFItrD^x)#Z{DH&lep;H@q4!pu|95^+k}FF!Aoi+`oTc
zACd0aEG#Vg1_lPQ_#TdrCkNx9Em|0`hNHLj0IzO?s?P!ba-_$t+$1SQV{Qd-@t|L`
zLGf`@V~wZ#Cy{s72hiSV0Mdvu(=w4wSr$h-=EWS=&gJ1~@S<AX1&6>6CucWn^HcBi
z8x?^J1^;H*BrZidhn9lQfYS;gE{BCHtNC<8i1T6o*@p58TMSq9rGv4FoWrM-S7AI4
zre~GGfH+#gwhQ+!11h9uZ1Xdzd6+;kmp+>F{%!c-iPFvGLdBCluWuoP7So~7t;Kl_
z+RU~FSr%V26u7$BeZl_ig=9s74m1*E>H-p681ow@d~lq}+(C&h?hZl*-E%JEbFqFA
z4zF}NcN8zYvCIiRMH%;E7w}d<RHR_1eeY#tbO|)7W*%DcBSA)vl&i?>7C5iKH;teG
z809oW`%L(bs>C8>6ioW_E%X;ORO1*GMPs=bygs3+acuZ5SW^FBlsegqBJ3B!2(8O>
zn+c6oO*FKPV<yTOS9uF(QY_6|<2G+Q%!JHaYAEr09*#^kGRvr>&Kox`AYcG`3#d6>
zV+xfhq)?-9t{`n_$=OP~_Mwi&>A6mrytZfNa7M9fq2B@dRUhHiRKI6>XkH%Ed60v5
z=s!;3dmK6U$ag>lT4l;*Bp2V&nRfsal|%0OQTrH7;<zbPvharAg+^M!>Yn^*-+6CU
z>z=IOJ}P$V4wxwNcx7J&OH)ppVNm|KBs@YNkR;zE$681P2j5e#B0Ami%pNelHYO1|
zFgjn>9WzDznylWumm<1-kfS3aq24b$`ogYwDkObn_I>Sq2O^StD1)*_@<&oh3Py8u
z-im(6FFc%LoDlO+u841hC4UTN2J+gCeaLUua~O)5s1hp9NYg$h`n?ulrp($#=F20f
zyx%1tvky(7Ie?BGBCqSkC(NRzjdiOv3^Jz%^rT>KYZy@F0kw7Ks>Wd!&L=YV<!D`i
zcThY$6n$TtWnkGC5H|#qbJ|T3)W-m%xm%}eQNxf+Hi%cfqPO?n2rV5Q5ufY;I-SfT
z(sMk53hbj&-rcn`Fz>??COhTpOCqE~=I5eq`1s=RSl(dd?XsoLx#GqFUqGb347tdG
z{OVE47BjhbP#4AsW0IC__4xfAE%N#wrbA1FVu|SN5Y)vmzR2COQV*jw8=WI>9f>(Y
ze@$}Z+u4NMgUE;sQxi62CS2B#_{EU@UjPu+9YjNtv7H6rDv|j`Cl)~p?bc-wvN005
zYpYA1kavEH-K}(B<}b<lBdrFvxlg$+Bq%v^I(<IR_{z+u#;q1+qU?sT3X`vghx*HF
z+>}oHm5o%s$V9spZw~O&DpC_3V{!N%L-wiAyUzl1F>;OVpx2`Da`R~;g{~=RK=qWI
zzadEoy6l9r1ymCgt}UjmX@~p91v-FMb2ravW19&KJY#+07IR#3DnL0(e1=NWhOjPZ
zA1DWm$VckJ;?6<~b8H_QDB@7Vt9{48GZ0DvpUawfFS~bz`oo#laDZ~;-36SgK>s*+
z+Z7lC;xSh^Aig!M%{;fG3`US>&cDfk2pbu@P>e1V)R2mPPM0<`MM}uZV3-{5$_njK
z1eTgVDK2n(ZJ4Cw9X~&%5~<QS8iiUk^W%jd=qMM5h#i<1N`zz~w@FdF+<zeVb1zIC
zrV;G0X?W@U*KQl-XFc9rJ0(;f5=j8Iz?D{Dbk&vxm_2e+T~jwZ=6$%uGbnp4hf)#Y
zl|?NhQa!549TWQuS{l)!?(5OXx|q_wz6-ZxwlwKkd7ljCJXK{1@Lt=I!JV&SoWI-;
zBLx&T2bh*@)oPac=1FIlY3TsKdv23lYO5_f;4*nT{q@0DH**X7x%HtRaC=hnQ@K|n
zOV`RvmS2_Jgc3E!Dz1X94TWGVFc(h&4Q+*LBv-)QU@!CLN~P^woz3-xJ{hrHAf5H2
zd<0EQi3URK^borsZrrEA>8s42G3&ID+v3!7C@do*qc9)=97GiLFM`Mdve%;^3l8wg
z<Ap&N?*8Q5P6zYuiXm|;@7hquvC~BHhYIqABS$n#7S$0$hO6Nwj1miNY^e^f1=^yA
z{;I%qP<<Xkh22}tn>t4o_;{}Xa>HBu<?AFogh*rW`!oaxtsLfxogtE~tUHxC0$l{D
z9|{2VQlMyDHEdCB9){saIM+fY&~Bdm!N?rbmkmBOCBhKPx+=qswu5{c*g@yp`>+Et
zS(5n&0+$g^b!YKCY~UeFS|3fIQas^);3kaUNa$y12WBmU_VAU5bCvXZFvM=6Da|GO
zODCv-tQq-2W5i5kTxzW@gQl9!gQTiU_3teTxc6(NGa=KhyWwBA!89tTR+y|+UU+Tl
ztAt9g+G#4^wVIvg%w0HFG#;X&8eEv!tZkaDKBDBh*zi)6n&(Vw^U7o_pzQHEjp%eG
z%Fh|wgupnOZWx~ux;g^`gLg;(h2*UGh;?v$ydP%~?~bNY)<C5!8VxDn6Yj;oS4~iP
zHc;SWCIV7N7+P^zNzB*mzJlP8gJp77y<9g)M9T#yQapIK)mHY|ET<!c`a$M73A-Gr
zMl5VAC+c7j4;fF%@)wWX%KYBOP=uQU9E+!84!n8PNKs-3^p&Hk&AK-m%y4(4KO|+z
z)Y4nPY3Y5J#ry|gL)FK&YDI4k_uk3(lGyFaTmBd1Kq}^p(THv{&>S68swM@kEiTJV
ziL?-j#Cx+TMw|w$EW_lX@3vpLhK_lY>cl6Ynq>Lv69v#<=}xwJU)aKHqeX+AaSI6i
zZuh9AtK;yJ%ft=Fo1jP!Akz!DOF)<3ra7<;Xwu>q#qCI$(U~D7R}o}*A$QjlK<Ily
zgG=$3RExBNvHYZZ&#aNO)!{TwnJ4OLF!M1kWk{~0`QVEIaBk*vsn<T8K_fpxN*(YH
zTUS)Yac4+KPnHfOXTBw7TzDa4IK78KUjXzqw5QSB?Ddl@zL;h}+Y7pmxNBw1Ze(m_
z($95QGfW0<9B5$sE2iTaFpAoBbwKWLm<Rj*)Z^4Cm~0kZ%)jLd;#A1R$IdzcEEreA
zEfL{Wt}Tud6S(a+{amJ_x<S4rJDw$#WR;ezxegqgH$me%Nz)B>r9z58@!ZO|tE(0a
zSeGVctW1#N3%_4zuxS`<vdhDwvj8D`8HiqSjYn}xcsM>vdzQ;jXz$vmmm+*zezPWM
zx|~-Q46Gx=4njC)8d?vQd>5bz4@7ZV-&7l=l0JxxY10B_UbeP6zLq04%PB!0_xg-g
zB^5u0bJ9LxSVgmjTL94b9u|D2@0vf3*v3X5Xg14+%~!BX1ux#SbsiAydc19+aJJo)
z*~4pfv}=`nrqe#T#nNReh^{-VWObxuTiDWv0Zt0vd(XpdF%nSlFRPV)?)e?ds&NF@
z#w>p*mfyt=90*Tn^UNL_KwPwEU|F0p^}~y7zgjuI8^)e->rUC<a8qI^h-5>f+9F5=
zPFoCa#V0&OCd)uWY035e=v}xhk<ZXQ!jG>A;3t<sEGn)+zy9{^XIF+b>Bj<wRpbC9
z^)#3dEW=~BaRJ+%2c;8g1xWxlEXUy7CGb1V^JRL;75O$jAl+vU1ADtcpeEgX(4!W-
z9cY@rlj4TFm53r~-TtEKdS=B1uiE878OcO>mUfu`#U*pJ@`Py3h}-6E@$x%NY${J2
zcU>KDmFR8BNCj{?z8S5ou+`}}eb3#tkN863+lo&zL{P`kz;)}nFBd3oae8ULSC5%K
z>Jr+v#lb+DCLFZF;6EYRyP4x6$^0E(-&n;`FBv&=un`*$^UZ&0&MnSn)*vMw1r^o}
z$X8pTwJfP%ylC9n^49Y(D2dax+bRV8i+9eNKkN2jR{RLQm~E=P88}-rsdUU?z^6P=
z7|0XAl70^_e0~h@Vpz?x-W}g6If*bUjL=N$&d`JUK`Hx&pw2_R3keAcNL;`jY^3v5
z<8Z%W6GpDHMga-OLAIUNGD(OjgRVj?B7PqqA9wH^46676RZsOIMx~v~Z+DV-YI0lA
zsFvY1KR2a^i^U3PUif9OS8rs_7-I0H2rq7%->+BrYzOZNz4vCB3eUihF<M4?G`0h_
z<C4*`7=zFZ2rmt`enbrEgT_lMU}un~itCBiwQ5&7xJ(f@Tu~}l&>cGLc>a;$*oVBU
zVP5y`5GJI%Z_^_*27@^p_laa5I1HmW%Ak_HbHb9@<Gz6#$2Fupj9GEsc)&Y*uisjt
zGta8TmShdRYyFKamA&OA(np}b$njNz%xGgkrFc#0V<L&uHqV7nqGW{AXF+bQ#dgZN
zVSTU%FDu6qNyzNJs)pcn8BH%Su1zwQGkfkj*gKF`G^-8ZVB&`I9~SBZiz$7DKRtT6
zf}pc$f$?o9wggSr8W?O17xzaB>7#2C+t=zQCGMzR*3i;-&mS--8n>?&E$dq8=xSOK
zOI-pyAhSiE?{=xe_z8xK)Rur+*;2lR-b|TDbf(wjYF~3u*Yc-)w>mH?1z83g{X`6V
zcYjYkY#cr{gdG{pzHz==s!X)kj})R_L{^Ei@6xi2g{%~;v|rc4Z0B^CwHC)S32JdW
zXn)Oqt+c~=0~BHNM-O-3q!=Dk5S<-C<VqQ`|5|$^?a~2&Zrcj8bnUAvzFU+dp`tL*
zkh8(<UbWlGY;MU`7#e0%ZYc(sz4{|5xuI6cnqP2(Ga!{4-2->FRn^EM0$)_Sr3+-3
z2UIX-4)I$VoHBnj2qvC6o&0e8t<#0V!hBo|SO3Au{>gd5l0F4e8daaJdk!`^Sx?4k
zV+uowFL(WJmaHw6Shn8>xDwXo+O=h+evZ@&Oe_-i;6b?0hqjM=v&lZ+nTsWGN`;2i
zbTB;avJUjz3`foz0TsVm>x@LStoEpUK0niLQ7~EtF}e1M6~vC^gh|R3ytM_eCo=Y>
zM!j!Ve)+T?U}yD#spS+g7D`Jg3P?Iq!L~M4bVgZJyS!xrM?p+-@4iRjl)qIrd%4cd
zQg$^ZfK^`ToM0MiIFk^xj8xW%yZGMR{-ha{3y)ImUpE6L&aPCjlN)2j76+sBJFd2j
zCg}5B$ZXQkX)0n-Y|evLaHB$@(MRcs;NP*sWDd-&IL>Nwa1<2y8Lj~~HrGhX*42tN
zWJq}E;^N_=OVAlY3|2$v<dUW9(tG{PH-aRlDcKqw<2|{iej-7lb{4l(olMdW_ZvY7
zAZ2+F(d6L9$2Tq^f=)y>3*M=D{n$Gb2&IG-mjp;3{7A7`yT(mD$*!+^?qbNpfWLGE
zHl|+47(nR_`^dI=2^=$-?3V4`yCxRuJS8Y4=JIM|Y)7Gea~qAWjY{oqpX#$I;DqbL
zwDYUu4oG;6Gb#DhTqbmhbMCraipO6}=Zdt-38b*P41}!?g_VJw#^0+m`5b5+f)5Gu
z9eEIritVEEqh5OF=W>Vz3)Vr)&GK!c`)72|pFht$E3OB(wRC5nM9E@PniOe!g(>(e
zX+bc?#C2Mxn_Y6@M0s{0NQ&;QmiE&9VHT-<bKe!9?Np+C*a?PFYQM7Es!so4J0UV?
zUOWqfaVLup=~3;Ax^a)$TYLHN=qGbaN1d~aR)B5f4pqdFW4cx*br3*3JC}L*4ciur
zd-~z_4W5$A-DF);ajuJT0a2v};p~d7dElQ*i&b$ccAs$Mk<g*n@WzXzrA;jAA-^~o
zpX?*p9&i0Yi*z$g&h1Yy^`p27(~a{Hr%cUd4S1!Nzf9(<M+L%wvqVWWXTKty_IV$H
zMS~!x8i+&Dje&+6VyxbW1#!ECEf#nY$ku`Hv?XD*1I<P>)C9p~TjcChT1UM12H&xe
zo(7JOSAYgRL2_hSt`vdtD<hZLr7~A6y<Pl}z|pC;T9=$1pNwr&z=q#iE><(pGpUB?
z^72zBxz!h%jN?M6dot8J&hWjaVqZ!IMriOW0nZFkIolB!w&~iQ2cclDb0BsDKihmo
zki^vE+N?9@g=Q*^Y=K=M8X(jtz6x@dZGY|u>yBdU=CGjPU{Wi~W<AJ>Wa@ctNAt&-
zcxizxpFeZp!ov{Zp6+V6`pw9?dfc`5uDaC|g_2an?E7khcZ?Xw4g{7gU(^Cmam0mz
zeX7F$WtlsP<D!$3tJo7xj{uW2=wtJIBlteK*ABXbpLgFo$wOr6)Jtz9Mjy9Q(OYPe
zrnR<B^s_QiA|mvt;wYWR9lJ<Ik~0;nuVy)Xgvb~Pblm87m^by-Xp090cCo+l7Gvc)
z5Ig*PDr5a${Mlg>VppZP@#T2@#iZhazxAN%eZtqSkOk4;4ILt@^S$MuEV|~#%(I&`
zy1}?kaI_??syHE0GXm{k3HVo1i)Pz>N9@5?rTc1;Bfh-Z;cP>&`-l*u<NY{vg*Hf&
zUR96V#LDOd)>j#1J5+>(jh<g82w<t~d3Ry5tHu;UDXF_cvLR5CqbnmSR5c{d$E5C9
z$?^ia6N+saYL`PHJmov^vXGm2eFMLzf$ESfb+NSr9i(&d@2NWV6de;g&ap)jT%Wj?
zdqWVTGT^d&6w}EAqCL0IE*l`By<t6a_O^=4?})jQ{TGLU3)V=j7GHDce%OJKyGNMc
zcpJp|kx(&}L1|pm3>Z9~fM0RBN=2h)0F3jrmy>!yW-u#4(*wdEB85vG<J*vlKK-4r
z^nz0-KTv=@NCJ>L@9)@|L1;5zh85u37F$6yQ@PBSGkPF_e&*DErPV$q&aPXU!ERY#
z^3_f_05wY=e9$~BdU{!6!S14i61DDx0-CXU1G2Yt$!WP1Py)qKP8rr_G7PB3bj{%D
z(}kJZeR&56aw8KEdhh`7&%7N5Ue6yxGJ_8ZY3f6+X~cvM(E6O`+gpmMU6y+pJprqX
zCpn{PXZLn6Yq(^LD!s(`+!|_T07LsG1OX3Ll&X*_sHf4Yd8^v&)HMpWI*yDwE(_|M
zj&0NdeTtZc-O#<W4U#1@cP13YeD`m0PSg}L;AUu|DnE{B>a#@r&hV=73;P~-1T6<0
zEMm{pm`AHy#o?fUk@QaUZ6Y#R3fZ6@7XLc?wCHP<b+zDQ>Md@iX2+|HDt@ACrTqqx
zSc0wRZoD*y0mzqFqbdASw4Yu$(O`7@usCO4I1-LdPp!AKAVI78sEdHmo98(Tc5JvE
zaC%cy!O;<+iu5X&S*7||L$zodpw~{!q`4N&l(RnTg#hJt@He!&G<UFXsqD@7cJcA`
z)hLs1O6nL=Ja#sB5zQne4RUj<O~3lVfv6A51lA;b2Jl&`4bHx?8M;}@z}eb~*!t<8
zo|tai+5|v+y27g?G$r%L&NlCw(<YUr>QGirJ@EedNtA#WAtlr>#gUnx+Kc-_6zxtS
z&$#bw*iv@n+0`$B-H2%fF>c#eQMvWSd6m>UH$!aO_f>caK~C2h@nsj)wb{l`h(3AM
zrNovpp@K$>nR}pGkLxw$_^?k(p_A2cSxff0o-x}5%zD&_9j}NgSnZWujnZ#5YETf`
zI_@+eTJ9@4M%z;n3jK9OYUwpmc|AQnRu(*HFTsDg52D6zi?yk6QiRb8lV0v53zU)0
zc^qFLKu3by`aE3C?uWXzZRu}CMs5l;Yq3PTS6i!GX7rX_D)6d)U&(a;HcGH*(@!~l
zUW}B4yK#Wj6{e4uQg<CXLWHLC4w6A4e^UoL5m`Kqtf&P?Cpr7QikiK!8y9x}VEI1F
zinnNTj2Fo#mA^Ae+UxI#J1)aX#M@VVl{lCcm&Ftum1juQWrTLe7xqKVrPXGq&`r25
zqC+ByT&p}DPLK=bT(c#oJ&wz|TsuE+-#=c4mOpaCE+7?TUFW&>D%0W&nZuqD;ylW-
zPDMCVLZbeWS(D7UG_>%D@+OePm_sML@I6k1tG|#**MDStBb#`xbu-$r5gx_EV_brC
z=?9`LiZ50dG{@Rw!zD-v=cefzN6@xRsRd+R`bJd*caS97o&Mg))eD-B?Mg4FrMz1W
zd-RCV$~yr4p1XH|>`Zmh;l@c3?S(F~R$gJ@)43k={0&aTO&XPg<hPbY$Ck2AKC5nK
z{$20<_!r(n<l^2ta<N0=;KB=b#ZfQjTNmo84}lS{dRGDe$!-Pq59deF>|k08IJxez
zk|qj{n^^4^?9>d>;x#A9BFVVbFGyjZRX;={9Beq$(fh3&al<m>*Gowtnum|JyERoR
zOZo0r{F{zuTT%H*8-#1tjxUZ-RR#4kQV^NEc{qc8ixa-HbpxAw*1o#^trT;!t*u(O
zfRs?z0X<I)Q1YjYF^;%3oN>knLovybM*HwG`HS1w%*%c&PXc)o(3%;3Dv7L%#cCgp
z-06Mk&>Qw*?aPr0=W@5ohqtX}$^2C{R*6PV#0Nk;%u{P`!gW*LI^qT8X8DyZ45oVr
z`Gc4C)FxR7X0tbQyFDBjOYlJv0@xh5)3(JW5j>1IF%*=LPBrRfAMw&S5(Yym+-XUz
z$!Q;8lM=+O)+*2cL$s9a(q3qCvLs_tw#Q}FaAY?)^<aYB_ureNW~jhevz#+gve1qo
zL%Sk(sIS^*vqTAXc<SLQZZc4}S0fd!p3dIfC!4xoq&EodJiHNc`MNma2~y1MbDgS&
z_n0V!a}WEju8T=m_b|0Kcs+DP(@<oWz?6^E-8V<hgb0Q15XI{fR?nM=uuW?`q}Qd4
zbi>u3Rab<bw)8?ZpE-O$ry6S;GeZcU;0b{<8ynkj>{JB|G1XQR7EE}@`WXgB6J-(4
zRv6uM{0u$5D4g#=4*8E`wUAq+lnPg=W8Y%iBMDR=L_X^k;WK|G>SSviSqkH&yVu1x
z_s;-Q0m`FHAhBTf7Y5(sczD~<Fq3oas|B+rv+^^=!~EbCUOf^Kf!iJFUZt#5_(Z`W
zaaCgMFB+8GD*S1Y8EPH39~=M52Tem_#uN@a37mT{?}?TwmY_B`d+jX8-pXbWb)OSF
zT<{&u(34e*NC%0f$~ctt@?ugjqAP?;M@eyEWoBOaQoXA%<wJ$FwdwOS!RMg{vzkq4
zJM{)CRE@Ss2-t+f5oS?Ldofzr;A<8f@o(=*iU^w@2f9{w2@qp2W4b+4?}2HOsG`^2
z{an4MI*Wnc8--Ig;o3m&J>Gh;3%1OQDe`dckt!fsR>R@D8X6@vdR)_17)%&d5b$Tc
zBOvPhg$ngLT-bqC*g;Y<9YbnV7iBZg7becB`Psq5B$s}fH@=Q0&3D?BPt24gHMsUX
zp{gqUUJGzVihV6kQ@~fotZ6nvb?K7!Xoe&u>bnRpu!71AC>-B`5Vm5ugq40unP4{F
zgJD;@of5|QkErnzJ9X#@6IYYfQ?6Z7tB)0YMloV)N32v}>Lp_%?2@F*4NUPv4gCIr
z_Yz3;BP5QHj*r<94+j(Zguprxx(;NcJILNJ-H;496xrd>(AhTFX1iH_w6mqgp*>H?
z!=Z<@wZZqHBMR9Pl08y@XSX%Za%1r2SnDiW*?IEcmnv~<nNP%sJdf>aaFuHc7o=us
zY}j&72lYWe6?`8gRJ$Pv{T`MgZE?0KeA5j>H^Xd`>#t**2I^ybF9`6(g=DLZpHL+5
zUh8_3ab_}aUP&x5IdI>xkj61R^s}Dk1UgO%9{@??zo*XpBYSb;PHe)e9d&8ZO_#(5
z_j^~Xl@&xawI|q`RYkMo?=Q*^8OwBtqY*S=-)RUQ)pn@R!|{?2vQ4P!ItfhN^ZZyF
zzVp<~(9B@Q@Xb)IxP9U`h9AzMd-m@^*k^K&A!Wma`irz__R*S!B9CQNNwnx!y(H5@
z?%nrfiPzpu(>$8rtL=Vmc^~?<??>!`dgGD3Sf)8lwyIg;TF1#vvH2g59|_9}t9Db4
zV##@e*WBFZ_vC)pj!!i!v+7K!Lgh#H5IBqxlU?no-X(O4OhM9Qw10OtbLc#SjM+@&
z&}q~gc!{uotc5lS9!8p7S#*Y-r#S4s>mu&7C@Y6BpR8LPV_7F^J76g<RI`yv2d3QJ
z1cjTI4vn&pa-`Z@MzIt<VXjJ7oEiSK5Mkk8iP{%{4R;4ig&VB~@JC{TsTxjE*uu9S
zm3qIkAM`?_8=2p&FKfh3?)}7pru=_MJ6Npv?`-{tfBElX!Fv4f$^MrN{!0dk@$&yy
z(dGZO*b9mpYo6kFq1Et%2*i<Hck?s<Dhv7XKh=gYR#jJX;e~m55lNwHZ7=|k@S4ZG
z1SFwxg~TvO%P2n!(%Xd(6VL-cgruN7YL&Cv_J9>p_ZSPwCiX?>0II#K8^V_l048V(
z&>8U=NTLGadcx0Ovhl*~<y)Yz?IJFA;XzWZ270R=A9xAg@7UGBihvsRodizcy~s~K
zZc?15MSUs06g0+=APVf92j~(7R0X>Uw~W+_$I);?umNiUue5F;cK@!o{;UN{@fjVz
zYRq8*Xjv&A9K6);2?gc>uGL|cugG<6LEEEUhT+*#{IE8mxnuCHJ|q>_GO7Yib0|NN
zhLTPGn+mxX;7f;~VLe0g&kOsm#{Ca~nd&qnjWk)VTbih^xC(k-5ZW#A2^%3Y(MpE0
z8swpYKupDp5j)}g7kEIoUn8OXgM$9P9)xg3Al^j(wQX(~@%{+W_6z3%?M3WoSCj!-
z3I8Az-qjQmpr#|>?w~cxZ|m2)Uf;{c_xJ^IL(*L0k(%nqP;qmi6GO1^w~^xxSA<B5
z2V(l*L~sjJ1D?>A{j&D>@#CCpz7b*e3i9&uYouTmqn_MrUtp%shnI-qd&Dft^b=l3
z!f40HC-%uNQKB_wx?zbRNGR8;b`nB?*aL7`MT6HKqBGwLQm|N(fVA6$*TFB`Ey8+G
zV-6gMh#(3wIRRT3JO=Rx8gbNOS$OUmJa<KC<~Zz{5&?9m8&!oyZ<Hfog_^v5`l{Ss
zu=XP0yy?gRbQAig`jJgIDYz-hd}nWVG-8A9SBwO!St2X=3*j`t33}6FCx0Tv-`9s*
zMQX0P8tLJ9lo9t)DpzR|WfS;>50qiEHx;t?!d|B#0N-#s)Cl!#nd)5OxvN1AEXd76
zg~2>hk(_2^Vv<;A`VVJ@|D$9Z4(%tm1Z7X*B=9z#Ix#T%Ax;a;a(M;KXe~`{SaP5*
z?^Xc;a3#*U!2xcrm@z@SNPI#YB<Mo$L<X2c*$`fL(5?lwVUdg@66lM(AQozz2uYRC
z+1MPCS5(v}{@QxCrkMuX2+yT03H=zcObWn;#ecKm50SF5I9z~ebB(5?8mJm*X@;6n
zOB`n7ZzFfEj6_4E5n1q)NZF7Wa84jD`;r(F5%uv2&h2q77PU>9kSrdN;LOM()<w$i
zH2M8su*D!Qzcg)cg6}h3fn$EH6TS)kV%8=2`d&5`6-;Mg)eFO(Ca@xZq=ozwH>4C$
z->@#h*Y~n9G5%K@ebcN6P5t>~|DS22|FXd!sr7%A4F>H>jjXK?LLNu?pG<fj5JUMl
zrw}9d3n#DZ>N1(M0EE4D2}~F1CcLsiFKDiM%{)VwQrK`r7QB5UhTz4VApv_n?$p`f
zC;WVT@2pZ*i03Q(Rh%>rWZRr6{0sd-%2M#09fD2w2+3#_al?I~w8;rbWzO#NYMZ4d
zpU*peS$ls?-~1VSXP0?ve2C>g0k&KOt7D&|XNn&LtjOcH_NdWxym_a!LO9>3i07SM
zaYBl(uNN(yiP%?f;e<qlrVw+~dY?b_#E_x2Vj5mp-*o7$rZ;;d&g|hrjg8}B=)khL
z;5G)m8f3ILv0{;2il0yDg?N}Q?w%N8yKsn*aU87@0VKH!q7B=HuYxv&pg3c-&CJdT
zE3v?o6B{t_-)D?lMT(eP8r~zKGy&UA8Fsh{P!5tu^BuO%@0C6tqDW>^pp4r84BA%~
z1inY~yN4&-N+*z9`_SGC4JW@+R|Szo)2qod<(cEjs1QE5=Sb<s+vXt9rT$*nG*u^{
ztjCa?{TF9HhEQkj)ZV|({$eS+;<y)-_drn$Pp)SqDlKt@gZ97+ibdR%tQ~{>k5cv6
z0I!AXV?*Le)K+J>@>$q)`xR>>&qK^w_lh#iVzY#mwNmg$)uLUs>hauBT7Ts6?;`c*
zFjx`c-njWJQI)v&ZhpOa4nq(B$ufhCb~S>XhCM-0A3HCvppbpWJQpvleL#NPxaRW~
z)E5(GA-N*KfUl;b7_N^p>~FZHW*cTG*n_1P7Db@KYv6mcK#Vw8F9EsH-6S^rm2pAH
zx<xE}{(*Dra^Se%t62akV<++J0b4G1-i89mRC99H!yDC%;s7^9$;u%HI@k$qSnGzM
z19^+nz3M9p3JOp@=81aCz!AD|U`SO3wuV4NJT|ETS+86%Zol83wc9u5_{sd@)xs(I
z4Gkaq*lVBqe75#I>SCF|9@`EVeU1=cL^zg<uGNdep-150!5v68Gt0zpKyGP;tWJ9m
zkPgbiw}6|<f7yT$S@tX@WLHDeiPE@P`)}x44_6G-H&V{umVXa;>OpeEL&TCigj4h#
zG7<WyDq6(u!ml_79D-he3Zav;AZJ${91suDn#*4-HtF46?a?hxCUJ9zL%DLzHPM3_
ztdQ>8cZ7l)2S+@LNQDzY`3)AO?AOAE#7AwOqvveM1_56v?h3yWoMN~?W|X0!NfFsR
zPlF?I-fl{?z63z#1kNOG+#iFuk(#(ZKHW)YP!Ctr%-KJxI*PSGfeS4?{Yds9bhs!u
z_03?X=pdB{(tGRP^692vB(fVE_vp6sos~Xnl7M!1AjO}7-(2eDjELU_@=F7cQZ;}<
zzZNPgS*oL^Hn8geO>Wpk?3YKW7i8m3m$EZdQf&rtI?WJt2aRm;FgHM>(G_MKfWu?b
z#6XknehU*5ll&?}yfE4=I~f_%KA1WvrM4&1qSBun1nO0LK56fS`>)5>-rdl&|0BP!
zu<*)D+G!Yz^-tu(s|Doq4f?>Ez0SOpJ<q>AK{8J#`Z@~H!=IG*Ao^XP;S_wXOT}iM
z&)=yqk>H%$o7Wy-%ZSc14oC!{qmCYV%wb&CJ_!m63g#_#?L_lx<J=$0sL+~OflDw*
z&vUx592i8{G|U^|7^(@rqBu?Ge;}1`1pD%B;RhsCSx>mvZwL-~J6K5E?DLO0RiAL5
zgK~!Ng8vU6D31cMGx?ligUu~+^NNV<geYCyS)GvA4QAK<hQwFKuA^s=vFnYv*zgA!
z<7yj_i%kj>UaHPLl1)h>{c#VNs8LJ0HsOL{?|oT(FTvC)4JYWC4n15MHw*2u_XH}s
z3jVO@pZ9al7Id&-%lj7io<A3^z4w`cKAb$ilw>vm$$Vd;vo~5{)XxS4BB_95YgqE`
z)k0KH2KC&?9y!Cge-BK4Z^}lckZg61$nUiP_&Z_Ct-IhAwAQ%x^ZD`<bYI?_TR?qf
z-zf<51)8f+!dncLzDEQkI6m?_jujt8x3!Q%Y7ekRNLUUqdt#H?4#vgC$|asVjrvNm
z7ipmTTpXhg0=vHU#%8+JzIgFs+6Dj(;fym+-S_+XD*mv9(lsPN3hjBwK0yZ9U(hT}
zl(B;ju&sFy`_VJ8=Vp<c?fO827-n3^uc7Emam)%evkly13od$F+tNYUt2IPj?%x*V
zKtmt{dA1c^WVQ{qND3}}t-s}_SEw;FNSR|K@+|LSU}3!W2wWIc<q7vngQUN)_6!ky
ze>jOBViT*1xS%(Y|Bodwc<~jEF&&Oo7BoW?YLFj(LWe|OKWqv^UElY&*8#+A^(y*m
zzAeii!R6Q`Bx8(D4&R;yaMiH-e_%{x6~a&!R%3PL0cteu+Y+#_Sg6hWzIc=m|6S~V
zPZp8(|DVz9#9C_s4ev;t!}gD+9PYl#Jp1nIR1QUw4A&LNNcElKxEZA;(yu0W!oV<p
zP>!|#RNe!Nqe84+j{|ok?5f<?AF4*2_u$m)Yc6Y7W`m37K8rg-FGp>1(VUUE-@we<
z<h8}M)vzvCz458fY1!*sMmTT(4^GZEp831b`rmV%C<?Ew|LJe&o}j2h|BCZ55}p4}
zMEot$cE$XIRR3E@kWEZ(`v>v*_ZMG)o67!wQA1<*FT@V0mMReZbpKvT7XG_}pM!{$
zd?qZ1hR^on9zdZjh(Z=XMeP|3rN|F>C>QfO414Y0NaGL)^w>nU^qPX|uZKkJd%Fah
z#e|h7rONM<dH#ju-R}+BF)}^ZwxPAi$jPZNyGA^?N$y;{ljHLU2EnrWe_TVUGFUwq
zKFofZJfmXp`I9O`z*?x)0(U_3;H?aYvzC+$-t@wW$*dOl*NJ|E+r~*iYOE&9*y@m`
z@C)z^POD$+&(IC%(#f|m=1bGtbje@p$Y@S-V0W0lt$FVDPq}P$ujaC|i5(=lT8x6Z
zj<%WxoKbLC`Ou82NW%p{D>esHn`z)5=iDqtVtZZ)iN@h@)LJjdRD{NxMn9?xnU2iS
z7#1@(S%=#f*Pb0OeisN3FAF?FA{+8aX(^(`n|lsFG<9+k%S?RmP3t#OXuU42bJ{GD
zXM1qJsU7$0Y>lfMvjsJz5RD=#b)KCye-SrZUjB0XG-pZfBgTV*j>#6|rFEZOQe0e9
zbqSw1UvB@rl+t`o&k@t%P+O3#Xg(8CJQ6Z?=iSiC`;n?;L%YR0Qw63|N$m@dEsQLt
zM1!DC|1^`-TiY*_5*wU9@U|hciA9YHKHdK9{=I94X{0O=(0#2YE-oH2L_unII>l+e
zYkyze%bgJ)h6}sLZp9&IjdKCte8`h0VMBy&EmAo2N@dILP>cjO)8qQ5*aGHo!|te;
z^tlDLXGFtHwOzC*!1AgCq(V`<tEM6CyK~zYGIM9iMOUt5H;fd_)Dz9yoU9kwh#X79
z!6v=X{_Z#+iEQF2reTpdcc;Ud6jQ#pE@gX!=mJyGUAtV9k?1ht#<~+LE7s<@7=|MI
zPWRLl&FV7l=_}KbH04hX<6Oro-bMEdyu=Ip($e#cXS^)E@I>hS@5ao3%9BS(j=o_|
z8vawo@vY}$FcEw6+MZSl78jp7E#cQ@GGVEwbT;&_2N88AAEddYuer?lneX$*`^M!D
zzByK}PlgLN?z-KYa#`;I&xQfxEr?KT_(Tkwr@i=%uym>7x6W6Pdk2UvK9GK^Ta-|_
zG4DAepy|ftkQX$+^Yrhp;ZlL4Z2}q$Hau)Mgyq|{VCnQx`~PN%@FWnUy8T%j261@9
z;R}xiHGVxT@I*9)xi>T=N#o%Qu|8BF7QT9t$W7!UHNEPJw>RXX&G3c$#^(us-4a5_
z$wENKJ6CqcCkbV5xKM0(Ed+cHnm<C`jG*-*$55w^Yj;x`^_7S@m5?--iSlDc@H-BU
zYDc;GttE*!c<lQ|7muJ|q_(j2%!WQ4DRk2)#crkfe_^Zt>h@Y8+k8kwL|aXhGoT(y
zk|xJ}2VZ9CBmO#a%AcS-g#Umb&4#Be{WszKddftNOG}FQ!M!7Vdp_<uVzhDB4KD&b
z|F{?i(mWPC{>@*-w!pdPA2(*+NXVWmMb6gz@_(M~C7tljE;|#y&nAv4G#d-IEl{$+
z-VpmMt3WDxXlIQ3Mht?(g9uPf*7N_#I@5n4AVlq?s%YQZNSM%GMnnPqeuQ+tivSx)
z^~ae$px&7K5!m$JEq;iM2HMtLUIG)qdwaP7)D^^@N_a3y@uT9nPV1b3!5$w!KN^aj
z7SLvD<KvQ~8;70Ck)ozadic+}t7o$x>UB;-Un{wxFFErkLVmy}-|)Ei5zSv%rfwve
zpD$_Ww|Mleo*Kp}K+vM+;NuOC!2pioPSAm0-wPbi7eTPm%W%w4Wmog;-OqL$7Ac0@
zO`-jtGW&-~M7sld5k2>>FM?p%li1i;@2e0oyM4CTlus_E{&kB;@!btyj)!qZj>>8?
zaKewWahxwust8-wM!Q6yS&`Lh|7_3>YK04a%(NM*KO`ZcmrP&c>*I6PX*<b=Cyc-4
z2y**8&458iNK&nF;REk?i`yKuVOHfuBwX^xL*uRZ`ueU6s&&ive|Td5dslPt;K9tQ
z>1mt9l>(&%wc&XN>K~W++XvF?AWf@ur(Ls+KK$@Qc7t_eBM<ZV{(LBZ#|;l!08(1j
z8x<r>4QE%E0S~Iax$&z!AlS;@dz=Jp;Qy5uDap%6LxwG~Z2l^gPR+Adf$ZiCkfYN~
z`|iY*en1LM+q<Sve2;k~8FQx#AK|s{-d{LkjQA39|8PtG^8=|Da;QGkKRY%eJ8tO`
z^&YOL&EV4ita$0KFke~S4Wr_u>Bs}#fI46Ij6|S5Y3Nq%hRv0w{)lDw;qoslWkFUt
ze`ezsA<DE0QKmn?2&^I)Zt_OtX;KvZvyJCU@nXL+N2cIM7~vqHItnCL6~OhqbAvY*
z$}EMxhWh@<T6OvG>JbeR3Dr`g=QSPve~SPuCRj1SbzAxkX^#mpIxM1op}o8CC>bZ7
z4eKd9ka<(R9xW~tR3!(t%h+7}a5#qi1@@z#>+<=00Lf|aptpF0U{{fHp27e<CxTQe
z+<3$Qt{MKWovS_ZIDpYfy|Yk_GY01P5TsRag^2^(aRBfjyc<EisVtt&PE$aDQK4JE
zC|$`lOF01FLn_=huFQTAXtSpcfYCBCA}&w(Z@_8bGibnmwN<pEg)tJh@yB!U7tfsY
z5oneL@Cu!dxYmn-y7d47@)+isdjc{G*UZ9w&M<H^z^fjD9@>zN1gjQOyy#AC+3SLl
zFhBQL&aL!Lsms!6AOh_pK!sqZ)&p1{U=0|@O#|dl){qkg!sJL-n}vz?hJD+#Ht>Zz
zqM)1+ZOC*UhhS15I3+`{NaKW;03;w@v@&X`v@~Ags(mp)T|2)Dsf@HiY}JJDk&W~$
zP9Lexd-ncs&6SDIVE++!2y9e_FaDl&HQ0{;iKSq*OEW4ee`kO_47rWFu}y>n5TYFD
z#N2m4{QUxAl`a>tuX^zb3qJ;3#Uxp5v@zRffl=2$-BPA#K{-Lk$fyJ<B@&%G1zWR9
zC1KLK&AVeXF$nTEH=tBjS}DwR*qvQ$2wQ!yI^ii9jUa0sQ{L<13ZP4IfKrmHf4R@f
z&+&{y=TvAQf>f3T=$xckI_%r+07f+y5Zexn7EZmbd@Ec;YuDv$CjzL-^-EoAs}Jp_
z_O)aH1Y`94#0tPKS|5<hL^4(0lIMHI5ApSA;xQf|*ic4?Bl5V)(i7p6p@#ZBFDUdw
z0Mc8y1Asz@fZSyf<gEJon3L~2hhpfC>{_xx(1ZamH@B@ZVtF;^y+sg$Xx!6Q@+BZG
z8jw;_jJA<Civaj?J)nc$dn-QoA!lqVC%1^ZsCW0cte;aK%lrJdNI}OT3fkSfQ`$4w
zF_N|u#<8E4Hlpp)bi_UCT)2Dp?rEy_=j>Cl7u7le7F6Y?S<kcB!()AbOQBactO!78
zHUQqrGbp~)Z;0StWZY1?6DKi~Fa)S4XZ-8W)5J#kif}sju`n7hxWFO-fU5za-iN0R
zA3sdvN4)JTARO&cis4g0<pKIT2mc6x)1G)0&H(Bd2B9}PAK!|O%S+DxANJlmp6mAg
z<BljPMOMm+iin1tkq9MwW*3F*y?2xmTC%dUK31}_qJfgVBO|hR_P&o(_5EJg_qslH
z-~Zl^-#;G8XS~mKoX2^b$MJl<Ip9x2tZN_E5oK`p-WhehjcO0snpF^4c!ZF0(FLp`
zI>AmY2^hqnUVDy^F4DoXi0s+~V5*O~AR(|=BEtiT2{T|gc9m5{|Fm}h*p7)uqG8yU
zPC37P1d0ULb{jSF+KmI@UJ4;bo2U$)J#5X10?PjA;AwY96|O;r8pI&7)l`{ynY%I+
zxl~eOoK?z=n1;$rd}|Xq%^)P8>g^(Pe)M@bqt#*ZIYb>rwn;Fnl?c{_$@aQ<E#Mu#
zyLc%Yt3H<@qGGw`;FpS&8(b`0o6=X+v=sA?x(*z+{LJ>v34;s#lQ2_n1I9@j&4gxv
z1BcOWj7TK&W{&D?4(c?C!b-in1R$l|it8Nh&oU{$80?(_u(()kk^tg#%}2DCBn``2
z0$?Y3p<SnJObC%`n&a-#o}bGjz^RZCw5AzY7~(18bM4zn*Dw%mUqeK&FSA?|CuVeX
z^o^^9EY`Czao9aLQVKLsd_M>#uVt#tc@n6Gqd@I9PMaYZF#*hum<5y9`A_V{CQblx
z;vaWNg5OU8xLV&d!_;aV7jgMk0W`>I{RXKS2yM;=7rZVPFz5kvo7gm~gts%pH%0z_
z--7niDcg!{^My*_ri2NBsi4F;gU)u7Fwt{a^@Xsl)-yG*lZM;tfV9&tHY-P<P2PGp
zBUYG8FpI<mM)6NOT=YMn*bX#WHD#cnO^pK56|=eu$fE~nj*GdyeN(!@sy*v(15CL(
zf$Q_b0K+q@r&|Wz&J~yucCD_!8fk(IjBe*D87Y!}$Bl8F3#@n)ln82BA+HCJK<#>C
z`<=LWyO3PVi`CKF|GH_TAXX1FX1=XppcR%1wMOIF@$yqI;GvS5WkCyU4zZSGPP=^F
z+YA<QUN<zlu=3ZwQY+;r*8q&1?j@PoeIyZlpn}ni$f@&Py_@;G3M{znv$Ea_4C}do
z`9HOsQ%Ry4B5_@e4M}6vkG}!{Q?nkBkTGl2uJes2$HI-j084um=#5fUU^LQi2RnZm
zXo4Mm02$c%5;wHoEbv%%3Xz;k-*<SmxUO&h^JA=sgl<%T6r@+LLNYT-H7R@wVX-KR
zzrW8sHAoT{&1*IaICY~>^~-1)L^%$HbIeh)m>lFO0a9TznyURVu%%r<6ujF&u9E`t
zol*1kCd~s5@UQ0vU~Fyeqf$$!5OA>xmP%yw?0tjWn-s;JnpPi=k0S8ZYPIU?M`%U$
zzseyXs>s&Qa-B4823C{Mu=+H}KqyQ+Q>5KXj#=%Vb_)fXa|Wg}=)&f5{E$i|vNF0n
zF`b}Xpz$05(b3LC6k6Elwbx(ogBuKXav0rst`Tj(<^bK+Wk5|i&dO4AApjORz=If_
zmI}%?*aYX}4zO*WRi}If+~R<kW`|xzW=cU{xppwKUsf6P52g?slFgh9k-TkX^@tqX
zT!xzhr~{K7$^wGz5K(n($%TzkSuwe9dt@C%vi0gu0GI2`N$X#^08Mtvz_T{Z{v_>q
zX`~-v{JO?-o!+2rb{;^={8297KaP&;Y&pIAgfvldfvcaSW#7$}@v@pFgFOa`0x*0b
zz36RM3ADP0Ju7wgNrO){GQZyH1ysKRb(S&}2E)VG=0_}B8uK-53$X=hzzQjcvcC@^
zM+DgPfm{X~Yk>OYN!h}F+(v}AZ5^H{%A^*{Oi<51XA(?hu^^920<zx$>$_gn0b<M}
z;q@*<>Ei(JjA<9N%gpyhU^$uWfmTG<?$@xKb_fClTiW1oCePWRmjN+vMyyK`?ob>C
zdsn<|#9{A}C4Yl)V)PO^?fCI(6Dah=AP>%yV`K#1a`qWm18wZc5#e$8dxi#Ebxrkw
zvTFQOmk6%Q)3!-dO4ECh4)%$XWy4ZQ?l4WH9EcBKmjrC+)<Xu1#yW&>Z33=wWm1)J
z#eT+Y#(FTcy>;I?tQil@Z$mP(wMe`j-yP{`_+FIdSZmzGBsA-%TkyDZHapn$ROzU+
zIcC)gFD8N}x$S%H5vlcx({2xuh6iTaZA6q#60wdouJRDByq7nJt*1Uc54pdIK*OrY
zc}<HGL*UDK1n_Il=K+$ZFTGY{AoN&QHJ%{uy_rxpY2zpD#g`aI>&YQ9+erZL)|%-o
zY;^Fk+I>7fT{5SSU&_EcA66xBBBN|`^<>XySZA1wJZG+zXV!IB4!AGBYqpM_dzK5R
zeeP0eyigfnefHCo0xT_}?zF7$K<EAo>oa6GvR=hpyv`082-Dc*Q)6wxgKf*8xTSAe
zUX@KA&~(?c?6x-v%g(cw^NmOyBU8xU`wYXSgI~i}f(X1W$sHBp4HTcCw*t~~-4X>y
zd&S{w#w+&;dfr<qFh$uYbd7Tp*0=DFjh%L1xvGA{hkUGO0K8Q;JN_aNND1hBKL@P2
zd5mf)W>Q!VuaKcZnXd;tNJ@UnJpTOxGv!4mvb*XKh&B(g%^&xCe_EjAY6_@*NfKTp
zOfghqyUf!`HOvTv@ARk#x}ovNhr7msd&yE$E1~-t!-Kgroec~xMdoMUfp7g(osI|8
ziOz5p+fOdNyE)pFI0~J875SYcMxC&o8L*LFj3O9Cf(jhH0o^&e1zF;ZcGLdE+6gqy
z>Zx2o*8`UFLS|H0$HO_oU>e@}tSXB&!D${+gOYhNwKSBsqjM3Wv7tCI9hPIDXTfII
zd)t8NLE&LL;HR~B6Ahsa%iltwl;mhI!9^s5BX;fB+r!yAKLO*v^=?7Y%-g-q*1*>8
zH%tazt+n=agVT6MD8r1x5naE8bnTUDRh0}!*a&LQ`=9cnM%-Y_eHxR#Q3kKUe{JbL
znj)W#G$^v)$q(|*H;4w%-0q|SKTRj#h4Q4C6nK;q-xT>6=wYE(lT<~C$)$b{i-?e%
zHMmhq;+YN5!QwFu65N1<Zd$M&a0ybHa=VW;h$)~ueCTi)9B7)sQPj+9R?aCJSR=9V
z>yF?C9<xePeU0CBxrx{uU=mVR=?G5TI1ss_T~aLT+}W%@sjvXZ^A~W1WsW?|x}qfz
zfZ-iC%pnM?$hKM-Do|eEk`C&2={NWe-v4(OcKcdl0wvlTyh1|J{V|7TjY}`g-Sv3S
zh@G8*@~m!dS|KMp7U!gSVu%Wcu(zZS|1`5u*mR|D-(_m$788L1s(R)i$xh&JDpKc=
z9}Ohv;5@uwSLwL&Nk`lMUTh)mDG~*~m2Z!m`Fm!A)Nq~?e0AAc>y@?Kr<7n>j<cJL
zKY4wn-d!eUW5OVy`?`??-gP6<{W{<;H38&MnTt#07>O`TFoEU-oGHYJQyUHKm!b$-
zFS)ZyP#fC<KAy)|zI(cAR#n!i>9M|R7Ps)h;-jrXRd51Y%H39DF!z2JB)Hh2Hiy7|
z{mdx@#Y#cn&)y&*_AQspAt_!)egKiZsK0>=9I@IhC}XP^^fH_-<|BEF+x3D=CMwl6
zOM1JbNM(5AmR+e$DI^|*1Un(l>=W)@mpL`#R@w~8H$|!?{p7aORO}MemX}(W@?meZ
z*<2as$*=?ruVi-Qm&3;C4-uSo9(?b@uMeHSl2pZtneOsbM+RlPAogFKi~krD5DAHY
zaiCk8^}@}-d(-;m2H@DR1%V%PNsiHu=;ZE0)z*vxGq0CQvu>0po%`|~Tp32eG5O#K
z@=RYEeWWMADbSzll63*x&x%sr<Y;gc6PK;<uIes@I5FO5xP0*Fz3;xlGI$|IARP2f
z-lf#a`$~sW5IPZ5m`R>xI73!ke`&I|68T7KRc2g8Nj>Zmr(H*{20W;Ak+^jzv7{38
zbJ{Rb*P{V=uTuCvND`3Ex0*F7Nfb$(F!t|N=1;9^Q~CSjGaA~0$Cw08!=!ZkX<35d
zj*L{h8uBSW8Y6AM%u}`tE@lTy&#n>Vwv3y)2nMBU0AybSlgdp@u5yEdU)K~+#<uUw
zEx)O?9AWG!G!PtDBbp|cFc6DP|89`ESnugX$wSDZ1?1Ch|1P^rq0|yI3kX(nTUNeC
z9P8uI+*r)TcB~XO0J{{mvamkZ=@G2Q>LerdgePU=dYK*50~tBHUe_fX;7)UC_9KlC
zr`9xWu3@%*-?a9UAuii@JHaIoRk!c94bXeR&?m>wUOe$KKUl(F%eDVi`mn$-AIFWD
zYFkF)IhgUUry8%3JW1_NP3&pBvf#z7s%5L(dGoP!6Zj{}-swof;_1i|aLRS>g{naG
z@mqSVSn@`YMGZLRYPM?x#p`gK313m4<fq`De!u@pZKW8c)|02F)Xo{JduKD0lNK}j
zs15qgNR|rA)vUZ{5|Y<cE&e1k(KinwL<L4s9j#1uOqhbU-2i#D`Ec}J&3Pv_3SV2~
z1Jz+xkeULL-D;Y67%<%qk#FYwj@pww#k{x~J<PjZ7?#OAUs}m!G!hy2ek#NjYOL?D
z#Vk4$bmk3g_%D^UnE?Hj9ak&w=<BCvqz<E$Vo%eI-B=?gPLO)vytBoO{bXOWw6rnT
z!4$!G<Ft%?Ns{!DO5cQBB<<I##bURXLfMnT!zaR%Js>?vB9d+A%{E3O*!ofC5=^`T
ze;OMjiHUe{h(kHQ_zePFSr#VmqEz|Ow<kzM{JMQuBue`vC*G4vJb?g}-6-s%rU6@^
z*CZ?J@Rl?}Tk-ULQp!9hO_kCk?=#OCgGNQVoWz7-?ps3E=O_}>@j1z=#@0mL5pE5p
zbtl=yZybJ;dCeJi!3s<HjQhN1Y`8276R`DF^<R5coee-`@?y@`AYLn$pvT6X$f6g0
zO+VwSWo^VZmZ5lLW(5>twDAl;U!b%|!lU9a$;=91l2yFl=}JWDGX^E43A{B1^yp>q
zL#EO{>2`N`GKmOf-0yH?wd2ANt$3UB9>J@|4S}n8KKAJy<QKD5+%s=uH|KYu(wb>b
z1_ib|@{$GM65R3KYHkTcwkGl|frf2AcpC{bVTdm}Hm<mI(D7s7n0Q_p=N_-uILM62
zW9(yi&yPKhS^riN&4|lvX>x7Q_j)D5&061AN$_FpBgTmW#J><yQK*=e&SSJ2_M^W%
zxC}sft%*&A>1&w;YL`9?0O?!`9q<e?>>)asNbsEV2%-vS56Dj%+FVl3S?FK0oL8DW
zt2Vp9E5vGNvL~2`yCc>pwUe=AteeR$!>`-VNP_xJ9-J8|bcDcD42Uv~2qY+ZVo&AY
z*8t~}Wekm(ZdD|28%AJk(uS-zVDr7%g+Ot}-tX)B?V-c1yIdIN$uFiB2AKrdAB#Tg
z#h-ECiZ@e$=dIAJwGv-{RIrD7+Slmsc?(`Q5%~2y5)nCB`wU%vQ9TlOmckZVj%5YI
zp><=Tn;{Q}ZcEj9I{lvGAn-{Oj0yrfjt+vGDXMyT4Bxz3Vo&YxE=v=Y$tiA<j%AR)
zh(J3N=J?j!g)I&m5VxBvlm-ZKsXwD1OVt&os3*uM)*F|01~4#_#XV9Hr<SGo8ja2=
zVYCtvu0tF#sYRSa&p=j5IyAte0X#B(;~pe9U;~PMJf*MgM5t}^fREnRZOOcPZ~*aT
z{TpELv}r)=lV}>ew<U7^rWRTba($8!mJ6cPvlR70?Y>$uNF)w(5@D;;dICG`g@Sr9
z3l6gZI?otvase^US)x^9jSkPx7QTQtHF`GKvF4ez(M};#o}_-Sjy&cdi-3Vjmf#az
zfcI`fDuLnQ3%Ei^)oOMsk6*Vc0|_sVz3g1eD^6Ou1Le>h4uT7puo~uR2xU7NbcKZA
zyA-`Oe0TyEL>Jur8KYSw_?|VM)Gqa?hb12)!#DUOKlC*^E1~$Ftrm5-?O=#11;18B
z{nf!RbHn3pDfXI;u<5IMs}-F+OcSD9yg07CRK##yo{D8<%Dzm4dO(@4bB}Zd`{_LX
zH}7TG&`E80Usg-oA_P{2CLw}iYafz2F$+yUJMQ~3WsAReMMkUIVX0^4E>8<lb<17q
z-nr<d=OEb>f8%O@6fdB+4P485>MmH8bc)igHt0^lIaXmpQ<2s(w}leqoXCo`xDF#>
z6%_T0cukw(on1se<HuO1Ymg!#&`(k^B^2TstPUp?y1EvP(|x2Dk{dtSNtD{pvza?y
zU4_8$mnnsz190zCzk{be0c$pa7Amwjf7_mtY-Thwxpi!>z^tFF6k_6IUQ<=!v`&?u
zr7Ls7iP0}E;4G5GvqSm|O0N7tM*fVEW3E|`g)ic%0(LhY<17~T4P;?r4!lb@M!{T$
zOPuY#G<xxOs}H=<+eFUSJUo<3TOzt>P=^qHL|eOIXgIHtF$6SBoLncS$Xu7pcMCOG
zm{BW}S6vU0nB;ih&Pc52ETslJe$Xo18gLp<@<yC`%MzlONn<UUsNmo2+#mL$(-PDQ
z5|af6Pb%wdLOe18X{GfW=PgvqgGZo|U<n}j=fi^2jyZPJMx0wn{GzEjdNKFzIM?1~
zqy$!B-MaZjh5b-fJ7TU!?m3PEn#xu7VkzT*CXX(Bu3bJ1XC9+YI0;3*CJiC&H_Y$g
zVh)N+qejGf#g^;DVdM3nJkd>9%7|^U0yhz?L3kNC{NOPZt`3NOQ}4EzgGnxgIBPSk
z?(SLAXo$p~6yXhF^xBQi7TtYfrAI0)r4gA@l+JR#0jTN^b(15J($d&>I79He!aR&=
z=V;;I#Y(aIWmqT_!J{2B)Ewohx5%qJxDKNjmVcC=3W%!|DQd$xBhlP-Ttm5c4FKRz
zk*Zlz9`cIB4kg;vw9buyPoybFEpH2UxYjY4x03}|HDmyS6Z-x;qz8RHuHZQ`%#+**
zK?ju42GIki4(aON@eIofe0DvK!}GP~Zq_P-*^6*8{gev<rPhqR9ufs4cI3arEN)tE
z06rdjbXkw@8d5+V3oF)__D!hN-JKxrnGOXBiC04@B$`?yo=zXBcZW%#M1#M$xm0$6
z1|k{Ci4{|lNK9@v&uQAlJ7b8@s#VB<`obc_xXx9Wspbu|mu=d1UcW;W(YU-xyr*UF
zX2pQL@$SN-@$)IzhNPSr73=nV1Gnx#={fUjF;7)byUfCd!CTSJPlFn%=&fud;Z55A
zD!Mu$d90h&&e5-2c+m37pktQXLRvGJVx{%`o+-xcfDcG=%~4RKl<{u$URUs~rI3|u
zw)(`zla)6RpRxu*S0)aw)QR&jOa)(;f<f4athik%2riaVc<a(W`(BXfA4!7h6_`;M
z$R%zg?W|LFcl|srd#maeL=v>;o@3Z|Fz!Jo$;CABH!0|7CkR2U)3{gjBq~DM{mQFB
z@k_|sTV-W@At*FpI?pOsX)-Uw^M%u!JLsT-G01)_@4^;zFO3JO@h}im@b|Uc&DM3B
zcuUHEuwuV@o?fPhtoH+k`%2Tp`6YEA@){;CA(8epaL2U?LS;S%W~KekC{Q_?2rhoQ
zdjEKKLaXKg1RD%UHvI>xkY1jeBIC%?_t3JE5Y-RQj=~KaRlL5l64veU$F;X}F+}&w
zT{AYza2LFAA)Lh11mTtz^7zn_Z^ustt)|jsyW!npFgbW5d^erT3j)ha9~iw3kPsS@
zEjx@U#;8x=DhY|7vfJao5~2rAi!908uzFv0ZV1KQT?)1k_kO|<_Wc{vg~CaV_cAQm
z36iBGI>AvULx-~!tGC0yC*_-AUAHT+kqr6mP#PqWon7nrn5r$D@Hw8O|2!z(7~o&7
zZWr5e%<EL(>SvP;URu_mj~JQ-T-<EY7Enm(<P@O5NwX3@)fTuk2(!R7+qoVh)T-Xr
zMiS|za3_%jRQDpYk#vCzUYv-*9w4kheTRJ7VUk_#ERWY!l4!!U>PF^1Qm#rFUe=6U
zg?AqHNEUP++$2|(NjN*Gsyhybq`js+TFhI`E<O=UHr_tIg-%wwRjJ7k4fT<o=Wxq^
zd(<tTvAlJpG<9D;*LcqKf#C9%mKG@jwxH&K-q3qz=4MKEM`JDCHiXLBWZN{c@RNNx
z%oC#a%}x?F6`98@PpR;TPrS{6IB-p&5+02@(NhyJ;3tD#VfL}UpQ6`UeoLv}J&+V>
zCW%gg!wZM9_e$b9fcsVOUS7W5vCg^)HnvC65cW(2SJ18QlWzMHUY0bw_q)4MM70Tg
zldg&U)Sk0ipwV%aWZf?+;H{vYoc|eFZz&`eHUB|E=v;-R1_b?1W8Ml%o3~vTnP=;H
z49k6C^eebf&px&AXJ)jnRnA_slv0^Yp`5v$zb5Y(=(KR@jNx1@&uLtd#`=}}yyw{8
zUOeF3@lKz83+&Y6=!J9xE>KZ<D^KEm@(4HK@}-$ch*{pqo&v4dr|7JOIXGazIK<gW
zm9}^}r&Di&VTSZZbfGx^g)|aQbgk3HX~IQk&^})ve5*Y9`<4Fsa=5eKbmMKsUWoOd
zQ;izc%xs)0&#esQlf-dPIvHOuTq3^cX%=W-I<BR)XJ6?`tvS2hYriOYmMRp-L=r<a
zEIcBda6%s}p=0!-MKSJAUlYCu>fVdv)n$FlkS(e(@qO;WJ%O7^K`E(=zUj;QX<wg&
zX-pr=9#O-)s=nt$?UCb&Wc8HArz;-H1UtNERM#Fu(l%GCe3jV`c|HPrlu(_uz5$zy
zP{l*#K@$GqXReE%*;+~Y8`43XPuY6gg8>oWl4bP-Etes60WV7w+UOR(I#ELES_b~)
z7(2<uYzG~^d$Gu7w`CimICeBTLxdZ@yWn+kkfdD)4(_86DY1EXkkNsxUYWMup^~Es
z!cmXId#maMET`w{FO(gBC;m9zK)wsfi8r2Sr$u`2ZzeOCupbwplD+vzdu9epUo)5F
z-rA#br==QX<l@P{@P22pn_C9op{05r!$+d&d;PLEKBz9{6}8jBwa+Ms1%)q^@;+q_
z;uyP$WSc9dSV$H}Z@%Bp$rR&ar4D+H{uWZVcp2MF+XbXzzCTG;^2veQxL3MM0dN>f
zd#o_7eWXeX1t3n^`T6xVxrn!{BW;@zgEW5bQk1Vj^6fS!y5gf87fU&2m+O~Mu1<p#
z+x!<kF}9r_iM`|BTUjSDn~v@^BH1ac5?n!Km(@-=!gzgs_`>O+0i*eetD)@D*~N-@
zfnbJdXH9r|9Q4%TVGUkWRQZWrV|vS*tv_Kzeu33G5Z!cC1gQ+JRTaS-27S*cucy1t
zMfRGke#s;4C&+Oqu!nn(B>wYxrLXqxl7!2-*v`=#STE=BA;Vm4$2S=>`4yZn8kh4A
zZ6-Bwv-LiLREjd!aS<X&Te#i+6&7`e&oR3gKgJhBhatpq)d|G4)Fzk}3fH7djTt`7
zWo@lyoeWB03Fc)<?a36Hb4-C-R$YUzg3?LTQ8;kN)E}(if}@+PrAxC82v(6@XdYyI
zs>>9z>=;}w#-FgboYT?G>_yM?%tD0!lTvNqVcmPt%S?9m{@wTbc~k7{#%|lq5ky=n
zKN-YWjBcCc>co$jKB*I!t5JI$Ql-G9&J$D)yHqEUwOAiBE?2g-!aRKtZkoXAB+`(y
zG-*XbpFB@;AbBU52d*Z>=#%VJQ^oTt>)XJ)=QnOof3=0~1yBT$eA;qiU8!f1=`DBd
zThZE@PrYSGadF+`eJ9EJ0E0{(hWlhyVPe5UgnStwK6V@8a?)848B)M=*@v3}K1Xi+
z&HY%fyZW}^PXC<y5%^)=;PHf#KjeMTsX+)+yf}#Ct&OZK-Drey<`t57Uw3y%V`u_F
z5*<7)q=J$Z7u~t4n-OK-!v_b%G8V6M;50&RMn`Lv?|H^&WEY=+@>#)G%d3HA>QpJd
zati6>_0%_FR(2UOv{=J+5qZ5!9pCWtDuJQ1V%L%yeH~F4#9vN&{jh{cMJ9W8P6PO<
zTFa-~eCg%+=E=A&Wz4#Xk0h!xzEzpsFD{qZ=$nv+BAWfKfpT)!e0o7{=S<d1-BRQ%
zRZQ{E%79V|MO>r<Wi_M7t?Wn{>O$^b`viGQqydaHlbazV+NiMcm9!rGzAZtqNqLPH
zGJe7e{+mCk^k?U=C-k6v;dLU5OR7Uehhx9@@7(H0WE9zN`9-cQh(HI4a*7EX9`#~r
z*LF6wv{CJ-aAj2DY;RNq-5OheEuA94{z68ZY>=s<xVi-{jFirLKV`r`sEXRH5QbNz
z{NFG2rFLtX6=@kKEa<Anq~uw>Z`81y54q@3eti!oA!4c}L+aHVr)#p<%T7m7awH|O
zbD_6~>}lH!>DV<~l9zr_+OM}ZrjvR5nS<1eAc)~#-gxvL^&@m6JJo9=ta5)K*2q4{
ze@1D7{T9^7UPHGet+Lp^W?mb%@})|e?u+Hgj6#&8^bHcYD9dI0;)^9H>bdT{=a3>J
z<U|N$g$5`?l4X?~`qxnNb`I|*<Z3X@72E)5TE)HVke90v@$6YmiS8hrDPvtdQ4UZB
z3JX!bhBMX`q$9W%Q0TS8a{_@g`=|M4muRr3qYK9(4CNe;2+CAA3FMwkf=gk;zKlUB
z>a2FtYVI2g#V=or0rnA9eEfta{;NCxVf!g*A@{k59-gcr8~km72W=2KwfV$WP?HWR
zV;YG8>_sU9d5~u*91Wf5xG1Llub%~3G{Udp1uKh5DN?^4EC`?dRi9TbD0nGlSfe-<
zSedN<j~`}o^XAQ@w~!tVNzNQu*HE3Aq3m^twn!sU?N+gEU+;EeNT<dJ6CenP$I6!*
zI5`!WM_-J?_MkgFeV}+t?ee7KmoxyRhOYG))B3U>6{gql<ex6#$S4}aC@=iFf&Q4U
z?*n~<P;0|?RsNYQ^z(o2qRyUkgUs3=efh8)#hlRPA%?5}Lnfrb6(!h0vZ-(n>cu4q
zTWk*}?1p-A;bv026P?@VH;^%TGH{1yK0b<k+N=zQ)@v5Pon<nFJGt0k*D%c*#D!){
z!1Q(hL5TU+SxJ8Y+yQ}AxdOspsj25FUV*AulVbGuC&R-g`M+`jFt$OUamEQ?s@YD+
zBc?a^baJoyk#2Bq`mM4A))e(?$KV^pyvzsl;a;3MT+~Cjj1Vc3kKUnMzplJOspDeV
z2`9U?g@-ND9>M|8NH&ph?D;~=7vSUW>)`+AaSy%%4Zr3Dx{VPVxK4uk1UqsfOwf#k
zJk+h5sMnS*`(W)>ZPf1z3781pBz-6y95ZfiZl+RAM|sG(Cy+#M(zgHg4bDqv0`K3?
z&G&^$kNFH3_TGG%uh>4eoB?v$P91b%Eg+w{E%uQbhC;p%a?-3iVb@5L>@`dXY~pIy
zYz_`i0FGiO8MxYha&sUQ6UiDbgS9UY?MIDY6M+2w{+%zXZ|sKL|0H-)bqctnbbF8w
z`zdAx{3$(@3aGCA?h<@0QXp}6PR1(baorLEe-u@viN%D=x$kctgi-)z3y3>Dmw@{%
zWyqnOc{3WO7c}}VSspU+|ClU=N@A$Wpd{u;PQJ~lnU|rZr=lz?sW`?pK}K;@t=>tc
zY8*-$JlFBZfCkLOq?oEyUFJZJJ+@8(B?1N{4H~719<pf)L$K<CaZz4p5oj1TBh$qF
z`R3zX_{Y+KUP|~5Pe*VIYIfEh$2vH2s6z0|CdzGWm6nw?(9Hf>*#R<m;4nlS3CB(w
zPC@mZ%dRkBmM&tIM2f#s>>t65xGHr5Yj6-xALT~7QK1CsUtoSu$GuQn(vfJ<^NbxU
z%Iq%i2bh{qw6FECO6J{x>MF%w<y<e}D?cfnz}|6$FkPf{AV2;uC2oJrkN=kvuJb5k
zIjjL&@WJCpdVA-LpPwI;eF&wUhb->SGq-;B1>4@1T`sWIS?){4-mUa<Xwzxt`FkBA
zQmN7t!CpP`Z}nS$tXjDVQ#730WqS|S$$Lkl70xa7t9t7b{Eba9{4&-h$c#qOl>~nL
zx708QAHwnj0|U1K5;tIrz4U<ikP;dYe%8O0M*&gj_jN%~-Ssi|E1;=4LP8`H!v~6}
z8$<qEVVR5SnP!(FpN>jlpF&EgZ!2LF`b&K!Ny>V05L7N+X2S|z+AeiSLZuFWsa!;y
zTypV%f89<{w~4|F9^Xh{4bJc#DAax5uDg8U3x`SAqaVj2*s-8?>I7g9vCO6)RXyC0
zZuu^W8S#Z-6+=Dht?k`A%@}(bD*YKlWP9wC{{Bu_Bfl-q1SFO0U83l)u*)!lVz*|E
z@!X()ZzMeVS>#xw0mi4t<~QpTRph-(g3j$AJN|RZYk{)lC`)n$)}!EdMc#`y2?+_x
z{EL_Rs`d*}aZ4g%Gq?@qC3q~sjDh$`$;rccK>^r40|)orc26fm&6bJ(ukOu+ea9xS
zNd&9ydw}A!9fmkD`y%5}c0YV$tTA+)RzfZ@)-l;G^x}p>ufL2*1aw?GGaL3=P*iAh
zH;|T!Dm>N5$?1ISE;ld&ca>Rx4k~>^MEU#omk@eubxp=tH3Dm@5+E&M-RxUqhk2#=
zYh3<VR=8c$lq0T6t46}V0HuVG#GXTY^4`xGB^7IB|MuERyHlLl`xoyp+9ww6;71$r
z;Li#F<B}&8AM4?1duY>9u_f0fz8}LN+>R_nw%&~iO+&2f2AJ*W*fmiLKhlp@|Aa#o
zL0w+6t-@ZcMt&zM+*7;5bm9a`A(mHOKhMqml;r~6ock+-wllw;+H^0J4l8L5O!JtM
z7GX7Rht5Gi*Bpj^RulbI`)`ASnd8qXuo{<LsC+YLS9@0m%;IpaE5F(-0h5Ymwh(KH
z(jkN_t%>;imnmO|dN5wMTr1xAw6ymWSc6pCWw1G@4E8T0Lk^qr&x#PO(qOEM4_I&@
z=<`^Ohc!4oEIOJUleUysQ?2qViV%Aop6h9t6xMoG0wq+;wJ7rUY4N8Ozg^y$(ilSL
z07je>zz|9X;J!sXC;Pbu89ib5B$KdG-Y;Qedua?SZ4&s2u=yCh^pF*=l|uk}n7<9A
z@W|0G)UigSx9f86ABNH&d5$MP&HX$AtTCli9RJ@6xnDDMdteiGsk-$I`exogobc;O
zjd8(jofN4Jfvr<(AP>9MK*jP@T>0nv6jv^`1D;DN1o_s==|4ZnL)aM?ZmdJz3@xf~
zV3(lS*7l@YDEz)N3xdLAbaoZ6KvJ$q{#+n`F1<sEb}h@*2TF8z#~i=<l&rO){nI-z
zxLi^ve=bj?VP*Qb5bL_2e*j(Xum@^tF(LvPPnCMUCfm7{W`({J$*3`1*aVOCM;~^Q
zr+f*2;gZ2v)TzuF^TX2o^JI948%As2d7)kp*(^y)>amb?TPRIS6kb;|s?mvUjM<pe
zX$-*|1KZ=kf!#PwqM{h{w#TfHP!>X$yT9pPTdk0A^!Qq!-BFCm<(viQCV_zcxxzf|
zn@`!bI!V*a+IUqx_u9rp)2rp4#;5pRKX2BX7P6(E*Zjn+y}hoqSWvY*xhuX6>zccZ
zDs>asVLos{m9~-!)q1Mn>-ffdlM5nvfkXS7cQ0nR@OIuVXLf3<c0K1bsm^{&Xj(Ga
z^XbZ$oOR(S&8~C)e$e3~ik{jV1(*=qs;a7SjI@l*{t0){wRtXiZfx7n15`qIqSlb?
zTR2c?)0+5+_U}>2Z4Xxw+Lxi3ZRb|pGg)UoeY@tw3kTfE%~NdJiH<#=V~89VZ*^e&
zXX-mTI$j9o4`ZF@1e~yTZ%gK$Asr(8{=|MUwfl|14VLjKtq4~Sf%c%&ft$fwQ!0jB
zxZR{FM9=0~JenpGJPz-f$81jdY20qWi`d{v(5UYkHPc%d8?l@0*3Qzf>(-X|eP`}M
z`9hCn>d0rPXlNkQaBQ%VJj2DJqi%O>MmptuM%}09($(}2N`=e3Ev&Mv<3tqaT;r^k
zBB!kj&A#F}E%f&4yQ})HJ)NeRw!B62R&Z~-yFFHmwJM9gEE>cuo`2uZ6+&lG(S9UQ
zMd+FFEtwN|JJrOC>E0Ibmr<z~Z<yP=<*cniEzBLJhRF-bMm|n?Zl^^xN->TzJ<(f4
zi{ci1;&`E-wrI;@qN8cwODszLSVI3Y*CG4_u82?eoERMQs`-~}UawwHZy4L1l2ON+
zmOB?`UTl2euFINiV{NQlPB*=JMrI_}pi8}qF$aXMD#*j?-DNvVlcKg>`-m5h+l#(%
z*>^krcwaM-yzwss^^e7I2PzdGUol{PCoq`sibeE7DSJ-Hfd%z&#mt=61LR$5IqkdJ
zn9dO329Vs02-Ya@(wHl>XKnvB73S=kaVWBR3wLhC++mU{M8Yn_{@lLsW#{)UZPV_`
zve+ynr~rUs>f?`*@{lx=Sg~6tP>8_%_@#k&*;0qo_kn5N=PR7`POOyBec|PlcBUYG
zR&5Dw!uJnNj%<AlKkWX$*qrhmXn$#{0St^(wc@UyCSJ^Q#oMcs=t7=zu-&lz!-OdX
zP!FNV$6B`3APYt<+y(+!98imSMY;NyA!?xvlci6IdeGF-amWelL+tR}Lg$VVW#0qU
z)TtrzqQABctep=N;R8CV-Z{aH@U~dz#jigJL;W+|^v~vD?_u=Vp7&80|3Am(U!Q>G
zTmBD62KP+=ha>A7p!qfBe>Z38A8Z%n{)d(D`<aNQES72a_b>UsDf^2F`X8Xg?=8ar
zO<73O{=X?(UiJTT=wkpIFcKd^CBY*g9q5MkZ~lH+9D)j#m^jnY-N2{=CJM<E9B@yc
zsK|mm#t;ASfByba*GW)hAE%oQLKO`Mz#G_TIL?JNj)2~KW*@xjUBj(<h=2YuivGh4
zj(CD^en5^5BsDxQ5~H##I2>{WAFQ&eXouMT?ys`~JAc1lucAUUbM-_ya2ZaOl*2hO
zJyN^@1UF2+23Sh`-jCRq3OTtCxO?;<PbiigdP?O<&2OpN9>V6}N;r)h+rkIW=Lu6c
zl2^fzn}2!U_A0sTzc^$6G4{fAsI>Y<^Aq5;c={ZKP{WVgFRM673%Wl%Whp#mLmBQ4
z=;9v)tBN4O5Yh6gBiOIEx7{8eIGCtiGq;B$nXnExh4aeAWWUuwD@RbfeTI61=!&4|
z=ov1qkp8etbWvpXLH##3$Ee{yC=0g`#p13^tadpQF$5s*)xF(`JqHU-!0Xv)mR;>}
zMYD_v7sAvzEaWqzkB3fTMGkJa%@dpyDqn6VBq2BtqI}8r`6^f$G21g;7?l8QUm@=B
zJW2r^3u#Lw)xj5c>WRyUX|cjow!aJ?B5P_ERy+{I5_aDM8h?mBgo(#@Sxvi#!G17=
z=Eau5d|54a$H4j%uK(|i)!0gD_wS|)nl}muXJ#poTTuRoqT%gHb}AEQN&jjwKfLoL
z<<FI*Yztw_wfvJ<nH1a2!$IkxvHDIZure<KeMyOF!y2#q(U;?>FEzr^vW+7L)zVZa
zJO1T;lWBc#umv{Zu?UE-;lUyk6SlvKz9~`|@{j=(3efb0F1?h{|E!gN{l)bpL^4B7
z6*=L9C{z$7zCvDW@MvGoi_IzQ_r0k1EFjde&wfi#5Z<YNFbqQVBx5HJK#;EEH$gho
z@4~7uH=&w=1fF&u=&aOVUutju6|C9rhyQ)j2LMKr?b(p0L4yD@^kY1cKkDM`#{b(d
z*Du5TxSKmmivn1a1Ry(;*x1+@ewDhhH7L69?*{pQKV}5TN68!)RAGT#H^c)CO8w}(
zAGo48N4o>R6_SJ#GD=mDa$Q2Q40Omg$mcxy?OV!);Z^&T_yzYtW1OHKq?uoX^$%<N
zzxCK3Rwkl<<pSg$`$d>53>q==tND42@W0Lb)m=77gp&0>1sLY?zP&#eU;jmXfUWdm
zhpnJ@d%D2*eINCagt2;%2P4z=CymveaGS<aUb|ojJwK8Jw?<5DQPI&;L*Lu521kC(
z1J7NsAhe0l95=iIL1-38ud1to%X`1tKQ-^a7kFm`kp1I0z}m+l;JWnux1;~p<-Q83
zgUMN)iN9Az+64hy1L4D?XW@xo5TnvGh=(cx4W~bU>Ngjws2}6t0+(uR(b)momy3oz
zI`s2zpN@FsE_62!Sq6MCkO@<^g)UK2)=@{RU@#h3*8{w~0s;qW0?zhmX7=v69~6`X
zl+JiFoS4_fF<!s5E+e2o5F=#z>?OSF#UU7wfVipX-#+I)NO-S*QHB$ZqOGO`O4}FB
zkW*zcV5IoluYx5f1Ya;GZ;ixh?<&PY!VeyVr$t%5X8-<;FD&4FdkrRK;K4?m;elV9
z$L;<#*OArvN9V*IKzqm!36*l)J#ZV|@0-bs`SfqE{mZ*Od(g73YpaGYI>d{v2hipE
zoF=Gxw(B0idhfPv`0m0pQgMr+8^sUGKx8Dn5d(v#`AHc`bN#F3`|;enVD~Vf?;JtG
zmM_krh7L@y|LzSC0s}+z1IH}<$J=w%V$Z)^Q66$ZjH6T~$;Xc$-`VQ>y*0qW4FCn@
z9N%R#Bq`$t!Pw<X8!%)Wi-*R3yNCbHd4kX)OixXBdjx&P+8Nh*ky+1~j*fuW%pq^j
zkFlsfh5Zjc6rR&|^0aCixY^g~@!`S%;mlie_AKwanI;JOC|mD(Ts=N1hZs}q9CBhJ
z_NR~aI3(<3enrYql7xUn&Xbi*X;k^q^*A2}WqJbD694&w&kgDKc+C?(uOPULi}PIU
zq`2^-wUV_STg{xhPFwx@iSU)I;g6&rJ}(q3J>eLSnik!GgGVfk?MDhY5;25Y`}X6*
zt5S-DL8HZ}g_*<Z+XoEY`Ko(6a0%!=e)l6c1KyusYD3KnuWWFEJfB(~gWx~B2|f~s
zZfg6nJZo?vlBbwW|EDjaZ#V-VPLyy{6h2&dc&|6i`rZ3E2daXn*B=k>Bf;9jpMUVA
z*aJa*5`khd=!#Bxda{{DM<lOEBYRIz99!@F=R-S1!1`%up4fj7byHBergd%W_;rG1
zrQubJV}ELGgCn%@{B(qsIC}I{_=n`+t${zi3-q9&!C%O>Uss4Hf)g`4P!;L9bN5i^
z{=JO99hB`MyGaU>=vu$T?Xlm~fd*4?89kxmvpR+z<9B0<{(77MgvoG1q#5A%s8e$K
z<-_yMvOGI??TbAu$J8tQ?8u+S>l+1pedK*|_<GMN3TD^=Z@k6hFl>k=2z{#4?HX*k
z_@}3kV-+sH$B&Dq34JJ<m==BI%uf{S<z7h~r6a->TDbg@{ZCI}1Isl=ra1oD0eGd{
zOJOw}I{RYi|JS}IwA=IVH$OTh(U@zo@27wlF|%8&jwVMwWR#%jE#Jx-Rq3?k;Wnsy
z_O{Ee%bF|xu-w<3H^hFk&VJ0KST7iS76*CWWAM`0MqD%I<Y@Y!pfq?a{6}Q#sRawF
z32dKf|I{k_1{1W<C=A@84~6&QeOXjZ5D&zpJ(ATdPma!4pW@D=IWk3}EZAj=!&Q9g
zkF)zQJGdKPpPLXvLsOzXa5)GU4VBLJ4eWYAHX;wXD7Zqc#WB1Le`=m#CUo|YQw|Sm
z+$~%)-=t`6Ewj8Rn6u)hxt#vH?&ZEFnm2T2ln6>+aaXs=n|OMo>W{O41e!`-NKCy8
zjbOe<uX>u{t!FQH3R%5~i;J(imVdlU;*SlCI0+35@0{2R4K$4N=t}xTEB;VYC~nfw
z8S`54d1ug(Lki^dU6N8=$NHMz9=WUar}ZU#3lA3mkb(t08i5sDvq<*IjLRBqJbeTC
zBStpm$s<radGo-Z8s{NwjZd)5*&bPX6xJCV`AnBhD<p5RPLZV&e7hU;rJT6cS=}k^
zh4r7h{%>ZSStFLjwCIaz%aNJzGdH~CN0>SNy?v?|_16CUJgm^JT#+bOwC>r+;CTi!
zo-}{(kY%<wcDU0zZ)Vqx_CI}y93PBWNbxahVl-lz!lkA~>mI?FnUP@eo)-_?Pc7EW
znI&tU%@6vM4I;=@ffZ=H@tGe!sKLh5smo>shc3!P-S%^tL=biO%Cml4q1@qWsSjR%
zdZq?^Sa?N!4A%beN{`ch1)~bWOvY!D9|Ajo*XCL7aK9Vc7qIV>X#E<&SXoFQ>!`)t
zr#kzQbVi<^UAd=G?vRALztW6l!GJbyr?u@@qK_y4IDg_{2Q(u_3*{>Q`WT;Dmo79-
zdDC*|1;UY=RB<Xd)K8CF-u%4#)1OUs6g^CC(oBY7crXGoVrKP<WO*DqAFo_C`-bSW
za-W;ze`pu+hy#omjk_P;MR+CsDPm0Q<3y(*x*7Qj+JeJ!PfbUBMR4dktt|`CtoVIb
zm!dvW-R`4P(94Hm%aQ+-@w_p`Be=|b@-b9s^xl7V{tq8Q@Sc!(=IapZ^P+39m&s$~
z%BwOeA0K}{VD|2Nb97pO`6U^eKaJKw^dxt-pM?EEfj@)?*voG4y8fvz42!^sB|Xj&
zMPs9mD~4Hq6&K5AJ#@(ah+3yhA-Q|H_|xhOt&au_e?M8!zBdG$=<Og5=Znw~7B+YY
zSt>B}K8Jqah1-AqZ$A>WDD`DiU-67|+9~7~YuH!rDNgS4|M4U6b_!j|$Y1xU_ufN#
zztimYb06C;x7b{t=4)%7_)~{~cnS2R&3MRg339Sz%d;>EbXCf;R|uDnWhBQSz^S+=
zt$m79u-C<XfBY~YQo>4n#k$>j44!f`Co*5cOlw??bKl#DX$pq!W=y3y6uSFQ!%IA3
z2a~qjZMVfHTGEWfn0``BCNfp(S5LDe)ARee+AxVM!p8o-b<`>edSi0B0>WUT9X^}2
zRmy|3Y{RhYefgtZ@eqEATv@b`cX%H5lq07rQ&wf%DHKO|QS;#ci_X}B8S>Q(O?2WE
z5(4GfMqKNmno%`&vKh<vxt$m7#Owvv%%Q(<{_Nl0gzorqbi3JHxRy9{Jh{*r!AGe3
z75>o^!uEUCfn>+YVNl2U>2#TkeH5=jz~5H(H!|?vNbgs`*Ls=}6XcCHId+|qdH7=j
zwOLn{d+rQmSw8cRB@nxYmn0RgySg+@7qtuwNCUHyS@E0?3;{NDcn0_X_7yjY;VW?b
z#8A(fkzP*gCna2dg%d4j0TkqdZn0;l|91#K;u^%b=JSm{f9yoj<Q@an@-NV?`Imvl
zgU!oAW4pO{`2CpW6y?yj=T{SMXxJy>yL^#*&>e?aV&3<DySxAT!*h!Ckf2j8-&b!^
zCmENrjYW=jO>60;ix!oC`Ec0hAvZyY>H&m2@rXynJzud15q=v9_7XhI-IuUsCwehI
zDm0I=uQ|}3Wccq*g}0WJuMEeSBQjbvz8zGZTSMWL@ZG;XJv}k>$U9R{^q<6JzMwI&
z*I>`Gcc=Ajem4T&tK0woxAt~3T731Pi~yW6X4tz7s$s*|ri!zickn9WbxQT!IUsZf
zP(P2=pva=u17~NA%UrE$Du=^pVrUH%8k=al%!<uHeP%dNI_uJ3_4|jM7vI0j6GP|%
zoQolVT=L`?)N`-CoO*PXU9DAg@RcNm1604%0N40NWe|<={_`XB)<Ce0IR5ln{I}AV
zj(Ef$SRmaajqDQ|7T6+HzR9=Jc0K)+Xt7hsEQGdTLxs67Ve&Q=^5p*35CWUza-eZ4
zS_466>DfAzteQsyN;K%b18`mA{h9%_)p9DoLs!3W@F@pyE~3MorwaMMyYQvfk#ztL
zpcZf)N20!jzMemB4fUYm04^^bWLg_b%Zh+r==kEg)Xi1(5<s(uq?I^ehtui<66IM`
zCS31@NlC7Dx<5sF4e*RLP%PDI-GB9TX)YCy*|QZW^Xzmz^z1AyIV=aoHvw;tqLwL>
zSfnj8f6(i)Dy#}U-bZRwFIJw+q8dK%*C~a}S1i|mXRKXjxd)4dO@9Xvy=$QiT-(|x
z;M7asl}eu1H^=~TH5ig`0mB{d$GLP>HEQGyUtUc4laq>4Tbmok`!#U{^E!<O!#Hn0
zY8<|gVgD8!$ldut0O$!B0s@PH!d(Q+9%Dj51RwHh6OETWS8?bjg{ReMokos|2obgG
z-y!%$2}`Z_J{MYQnIz!U`G^Ral~*0W3#8^d_ma?+c~E6}^st-^**}b~6Q0|A0$n%>
zqxk&#7F3*GSMp0Btzt<_*VTq;Rg;|5P#;>hKA3X88gQvcl-Jat!0udDy%zzedSB%c
zbIx~niPAosq?MH~!F-GW5GacSCi$6_WJ`QN{<*-{q#fE!ge9A+<MQKz?(0B~#b~m=
zmYL7$<THKoW|#QaEAGtmiIM?Ow(j1#KYZX~Y#OX^o7Taho5(P&<C^hSsmxfit~|$g
z5t%HsA%ozs&fd0vjKgC*fL?IU0es}u@RRd}>cTx9vLolmyXN>OUflVt7c6*R{Wx36
z%-bj%ehYST%9%2Ply}MmllAVK)>nfxC-qv`D@_>_cFN1me7#2(w5bp4ncEWx!oB-w
z1zxqkHGjR5B+o@DZolV!?dWANA~I(%<saYy+fc|MO3)d_7dl1gl|rfxY;;Hp#@!Ny
zhwTSTe>-4qnEoH8J(e0r`d}cfV3EN9ssQIa+v}${(hxTUbYZiR)Jb50G+v6tteqid
z)}XgJP{8sext`qq^DRmayNQm7W)%Xn@83R(Pi}&b@IVW}_PJ2qXcjqH?09>Y&LUCX
z%A+l)eGg>0Gdhj4bC-=R{u$Sc8q*x>#EI-@b$ZO^jzqmPcLq~u>b{3>dmd}zT*%1K
ziQO=t=y(xSoX*yG1WLn-T+9n~P5$YIy(FNvd`~h2R%2AnCek$jY#BhNU#553eM+hQ
z7<N{#254~lK(vV(S%zBlvWNph%du0|hy-=SRb>t^wb@gqd2_oqH>TZ>C`XQT6=bPd
zyut5UQDSkNFrwtK9cziuGgiir(Sa)LA%ucDjcOcQU7!p<{W;xkAK?)E^>#hybn}wg
zyJ|q;Npl$?;^+2p;_ftLoLir(m+1Ne)Tf&Fex%YdLavANgYCK+fWmy8?A!dk&B<_s
z=(7MW9qycb!mEvU9)*1*G{t356oOf`JgXyNzQ~K;u7g|4tz06+^~>PIcRZKsQtkig
z2gBW-vO+7z2zg7uA`gMJ<%H^3Lo&Ho@=N@hrIpkJ0ADcAg!1+1z)4s*LjW1PT~pb0
zleMU~+q{Z3g+s6M{z&kNw(Uh`U<D|laMAsI$L79A<@P}}>tSV6M`RhBodAMd1H9RW
zE6E*A*76QU(T+o-w)-$)bMOT`3mrD62#>4;2POmybv|?7T#*)}uxoS0E$DSC4hLhR
z1|_k9+Buxs?i;pq5Z;lPzSVMGKA7A`l;YZT&wLz(t||oLYo8Tb`eOaYL!#^1wV<8K
z!Zu)q)6`(sm^uUGUd5mco1XZHwMJ)Bcf|G_0%)=LaMT$CZ($G^pSgV};Qdi_Y{;OP
z(J+TI4oyr(|G+)Z`CSck+{7(obGlpqvl-NqhVfY^IgZ5Li*MTv#p`8`fE~TV-|jv8
z;HYQ~Yy~$F#(#*0-r1<A9H6DQ07~9FKcO08m<b%`Bsz&SyODb*L|WxwZxfsYdTltc
zCeAs1@jh|Ad*H%Qm|lFoLHA;Yld|?T!#E!&X3oOQ;ltv3c$=^$BC}iuS>FVHd>wxA
zE+Eb7CSYTModbZd62T8)-4&l@`r@6z{3L0>_QF-uw`QNkygZ^G*YghSOIXwT8U>V&
zx*DhqZafV5i-&xGPB!Y49NX|0PdOMlbw$wNbm6y0rZ=YUNl#u3KEhLntOr|NT+baP
zIb$f;&H8*_Td^MA?!?pGHqQDgznUd&-l(<pqp5P&s+Z-A)=@a;v9fb8`GJSfa=LHx
zWh!l&+b!B7otB}BiL}ZKv%3Y=$IEC<aQ<e-4R_;6ulK+fC_bswa{hSzHK)<U1mOlu
zT4sa+*HcqQoV+G^E<AoES8-*-qFdf4NGB;%`uCA7uK+pqHexe(ENX+v_Fn<6Z4=x5
zMn~9RHo5@2BYFA7MSD*#2f}YYcQMk4mSx@${fH-_>YkUlTiOf#f}tio)vrW>fZ#g4
zs&TXprX9Z#X>z_erY4cl7Gp7wn5m1iMPUAqDAN(<Q5zE)MNV8|d%g*99x8lxvQvih
za+eK)1s5+9$q#*=esSYn4M4DC3__lsN*k_5D8yION6cOwwHm!kVr3X=d~I<-KA;gX
zpW4mBhEHllPcjli6!Klvw(IoZ3jI|JiK8)c6||G(L~ApZr%Czt{@sBLp={vkq;p%o
z!2z^;iHp>(pHwHQVM$SOIDWrr4ZNFn+x@#~9byZiv~;|47ptmjAEKR;?Z_EGMjzZ$
zJKF=V-bf^IpLRQPQZR`^)8b8wU@5>C<UifN%3gTv)dbp8=1QksPuucGz4TDu{2c2e
znCwT&6oCrA6=kv$hvVcb02D?nu*-*??1M?A!?%YLHGs`g$w>pcA){06$poX2w|vdX
zvOLX6(i~YO-zAkkiv?@=$Q}_aIvW<+D))xn*;ScKqCK|pYcJH%*8^*z?1R(g@X$yu
zW0iT@)N0X-8u(_SqNW_CtS?z&L>YsmP4!yG8fl<PJfINf@Zs#0{GPkoddd}5`EPQW
za^CId4+HQ-T1I`~9f?>PI1T0wC=Z08EtrB}*|g{-p5w%g)ex7a*MV(b&3d8#v!MC#
z*-K_btXt$I1~plG`;Fekk6pMAjvE{gY7vY-`f;<t0Y>rrBQ=kSp0%30uW96facNsW
zW`0dXCw>)RY80G`_z&GSR&4`^0g|q+czh6M?>VGO)BemooC&k0v}9~}lPHm^e_3g8
z9FQGZpIfAPx^E(WY!S~EVB5lnW88ZLKx#zZxd1uI?KkbZ`LA?%c(G#UML%Vpc{FYX
zRwmNAjxRYbbYQQ}vdGDHjW@n<#z>B<>KqA7#c#E&qM`hfuAR`zop+=%;Ph)l1iFb(
zY%RH~Z9sJbL;DKUMCRke&fW4?b6sdXDzd}#1Vk}zGjJD+pzU8x7!Kh1nrih`858d!
z`_qk1{rz%9Nkh2zn=6=sy)@5HoNLv;m74hUTmW1CwfR~X*H5lrURiRH&f~cd&gR!j
zVluzqyRaTN>;MJsn${E9Rr$5E?ar|Pu1SzzJ790<B_Et2FaVHiPGq37l*yhd7-BRk
z0c=8r&p9!Y?=N>t+dw%&q4i0FVH@R&h-t^`^}*8;?Ip4?LWQ#KixZ!Rpw9xM?~o16
z>bP5(zrbP570nCX^-Mob`V~$LTYd)v;SDaUWj4kD6OC!pryr^8Qh((FXqfQ<uwpz6
zaZ)h;IY0uZ{H&zd&Nus>RBxvurZQ!2^lluXmXrC|el10xTgpCHa5R;_E9@1@8&!NI
z|3*P;z=uMenXqtvy0=U(RcK?rIpys`or+<Qj^HHR0Pj5(-KIns)J7w3HlNaWZPyhA
zWA><Ag(+_8OXbOv<~wE3ar(bbpf4Vh-7suo9G5Gm-!OXx0x#k?cN?a|kl#in;9yaI
zY{R~6P0Jd#RQ+!JPkG@U>XRvhK{QBor)K8syRL~-{&Z8oF*6|3wU%i7*aEOm9(`cl
zVm`M4XOk#lMz-KS0OESSz;-E^oYv3`sLDpRWSud`O2J;^mrDRLW3lbE8;GOtLZ>Gg
zp_Ol%YwhpXx7O>o!dNq3#amqFYTwVne(Eazz9GQR-Z!>s*WGm9<wJD%_5wi2KAOOm
z%XL%_{{z`XL;ZSO-#v_k>q44NV9$Y&Muto^)oaRXN>Rr-^=razAR!Z;aXIxOIv;#;
zG9n_aX>hRU7nWNxB~8=L+iJqxd{ga!PW~{59g?HDhkb+xQ80Vzk%cg<Twz=MFWg#F
z@?(+bS0AV-yRF9w2XA&f)n0Y}_Tgcx;FX|#f9qUf`p3dg356oAt6rA4cn1vEnPxjc
z@tRtL&mu;O59c%kPRq&>DVz&^341E1U55Fi9AA3oC^(qH+RRw_2)+@uPv7_b(spR)
z<4U`2rS9ozM&*Vdl-I7^B>=<UZMB*hZSW;@vxQcs)jZ-XI(KqFOCz*KSI5p9&>(8;
z{Hj8wTt$kqsumZaXbn%Vi$0<fqjTJ7SAqHbmL(3cYbIv#`l=)Uy-U?+Yzszv@4(u5
zrluFjb-1YOz=ib&sYvcx=9+8}C0I1!CySkUPIw2@Jt7N1p5e~Z$p+o@5$PGQR$Wpt
zLfvpw*elPK1&w3kEqiFNhPjI<)q`q@8BaHs2Z9r-?$XVaE%yi1!f~NTW%%tIhK4$r
zw4EzsmJ_IpALBp56|%gbxInv6bJu--i*WsaBPpF4^r^Q3PPn$3-3G-Q<LSrWMy`_2
zOveEb=CfgZwnin|N2~$p-tx>$FeY+YS%ve;2KapTdktFl55wVony=T4f9u?78@Lft
zQ*I;klINEw^SXN?a^t$jvpedNL#FP@rY9{50LdhAktXk}<M)qDfw**2Bl$*Zw|a~%
zyHr?@Ev=?id~JvmjkIFMEI)VdrWua1j|NRFy@;1vFYUiCWVv^vqjjlWf=5KBf^D_j
z0|`@M@_r`(1#&2V`{+CsWux;xs<>|U)j&|%O7>ej!tS%Me)-V)9jMVp>o<smU_->*
z*>vqFU02Yjj48$X-O4m~KKT7TK0rf%M)y6z956jnhNL*-GV?_vCT`L{IMJu-so<Ol
z=Lv4pyr7kp5LNBli0vCO*LW;VU3!oPP?<XSSr%U<bg^_S<It&f)Te)Udb#e7C(;@>
zAuEf{gb3<y!>&vsPfqRrt?=~$ff)~~GuMU3vNJ)uLLaafoP-#&mmg2a2b{1^(P{=5
ztKytFz%+vRHkSt!3u<vI@Yi<?z^UOjl5OfgQj?w3DFH|CI3mZ)y6mnbw!^C7I$yb!
zL&m*l!50+^-U6wj@6WPC<eY>?ddtc_LsX^~(4&pBA+G>q<{pM%&84U4+}AC0zTQ1D
z^i@v{P9DtQ`O#CSA#bIdh1luc^j*6zJ01`%zuI~_OgU}FHMAX?{g2V}v?GQv15dol
zv8fI)TeWNjUz+yct3RtETpsg+tuq>fo2)QG4CfD{9AKrir>7Be!s4^eoGb;$Mjxg<
zhhnl!hMH{GYaqW^nWjZiW-|3{$BHEATw6c@id0_gkaUX~aDOIi8eizCb}{(yh$a})
zvrWHA4MAM6X5&187!Hh9w<;()^kv#+1OqH4Wo%jDDqDzyL);{Dx;@afh6FRJ*|q?C
z6%JqDEC5OmL*U@?Y@eqGsq?-ydcVfl05ytTel1%+7~xR_z%dgnrV(;^B|noyCCV0x
zY-xm6DJ(W077?xihq)<HatvAB%)Ppm`y6#2iVG##-QSZ)gn{?<d`hp`c#=ZHgv4h{
z$>#-muunQu9HARJp4K2gBt^IJJ{3m|@H=S)pDqkv)meQc&#IM2QNu4DME#ld%gF&a
z#4%4NU4xrrLUcXuQOSZ<GtXXcb;}BTmv7HwPLe&pvGYRrL{;J6FMy(s^%c}K;3c0p
zuW^>Mk>9xOR1T>9aN8`Q5L?8$Y~;M@hzx5<HgFfS^sAqo6xSJQe4cfp<^l-(k)PKs
zlbit=4Lt;{66tEXIgYdv3?C`9nvh9egMdr?o>4&395w1C!|YHnG$2wRg_ukt?BK3O
z`A-Wb@2lg}*fd;xgTd3Q3JPR-nETEFVrcOevFcx$HDj}Zw|+^P1mM$05Tz_pC_ezt
zrRAmAq<9N^Egj*yVL$&kI*=8@A*dD(&4D8Hpz$IzUlMoS>x6gNT+}qD&ADs!ju?|U
z_{PM}n}YWmtj-66w}#I%3s$Q6)QfW51rwpUqBu;yQ{58l#4*!`m?tV0SGJNYWSND@
zI-f=-E75$S!Dto$dR<>AG=A^c!sWOldD2e6bg2bBH50NYEiq0yYxB!pU~V{QB07~*
zjT&KEMBZKz7kCYjVdFXE4;i>axQU}($K5q)qrLcnx}_Eb4!9!aZ?XCuucju<AEgc6
z+reej>*i2xrdnS?$gg5bRv0s{p4R?_K{PG|R>?Uh_eTqk4C8L|ylr~w!_U{QkbBNM
zK%}%H$Bg0CGwnMNL`aY#513zlZmqQNGJODc_A&B(P1&sLI`e`z+FWy}aS0BImftJh
zc0uSv%i+Ss376ka?8wZX4XHTVjpAK6F`6x>{vIDz(k?%+{B0$F2}ok5w2t2%8Zx+a
z=H71fPCSveAn{l|BDWRqo;sZifH66)p31(Ie(%I(C1v5q@<wU#4WeJXZ4P5jOmbdy
zhZ9O95}Xj9yv7M)2<NQ-AHKdjkm~RMKT2`SD3?lEDVZf1*&;h!+-qDbAzXXs+H_M2
zm5_D0My_iWu9-wb_BF0OQrRQfGrx27o}bU>`}>cs?rWUqIp=vE&&T8O1dRqh6c&E#
zxlzGeJ>Hq4n^tO0B)5}CCXOg<TyH&;r=!R$4Y_m{I5O*<;+dsIe1~0N!+0zmE68l@
zr5H!Ki)%kYCDw%>nd}e9ZJv=I20rPiwWk;O(i<WN63y?9VeWKhY1CPF-x~%%tMSH0
zvbTaDSH@yn@ZNgxMUK3B`EQ@1u7a~7dd|b-pgfplqpkFkLrfTVrx$#s9r;vYG4flV
zr;fU3s>SH6_K1S@)@;T@<LJ}|%Z&c1mS_N6NJ9iLN+RC#Eidph;XAue$vGptgR9Du
zFGCE?5ZH>{8FxENL3k98tBC3>s(6&0fHt_}s43Sz9mIb1&SSl}RMIQ5H*}`?OcN}Q
z(d7GSv<@O&lE1I}b3?g6GJmOk#GCPv$3H>$s=*zAMlJNm4RFuYXq{HBlOxug5l6w>
zCsyC8Qso1W*8Iojs&Nk}kW7;E?3TdL*^{@E<O9*AZv>>`*}aO77`E?v0X#+(S9*>|
zH~W>)pwT12n#0pS&k|CyoTqBYpj#L#agK5ote|n0W2Re-lCs}@<fWNFETPZ#G@4_@
zQu(4Tb|?>H74Q7|7&Yy83WKNpQT9GS_;`BNDwKZX!|E*2e(U3n(MYR*?~$}#_d`G(
z?R(?T22M^{DVoF37XYyE&(M%AC#xpi=S^Taj0CqH{s?vA1O448)6ZKkgRu`y180q6
zf1n&zK%an>fp!3yI0HLDOY%;2m?EXxLP-n!wX^I!ZVEGrHD-)PfF2{`rCmb1lN9W(
z1st-V+7F7r;V_YRTnzWp^{wFIa|x(4>5v}Zu`fk=sEdKyYXzHjG{l`?t$`dR?!8sZ
zy*M9ZafQpB)PlVX9zyw!l%|>8`8Ww*ND2RkOn-Z@sXV#~4KwJw85@e*?i!s323aQ&
zmz%C81uEp(K2VZ!=f1GlH;!vh?`FLq)=4dos?FAvqjX8*WhFZ6zY=!9H^pGj>@@z&
z@}}&`_<3!<S02CwED?N>Bt)9v9jO$%@V!#Nz@c6edXr*8oatS13{ly+oTy~5<w`H-
zB5nm*Uv2lmJ!laq(h{U8VAgcBskr*S5HpG=YnkjP>A3?Sih0Cot#ysl!o%IS)z%{|
z?3Nyb*CCdKZ$BXop^0Cb(^`K-2%g#vk!cDF$SY0=(`Y>v^aN)wqc0Z`bLvVx{sa~&
zJ3X*Prcx${c|}8<31t2FNE5U$!aEaKcJZJC1YtG6z$+2B;t4^NXE%n&%A32}q4tKa
zq{O<}=7B}{{N~ezt8!^jIxhaX<O^KzAG=xWqUfk6IChj?_N41>^~lNTYw$!8{ii8!
zwV3y+#}K9l*>#lmn_aUN!#UVwUAa|w2}swo4TTKExrPP(EMkW|yZj9?{l%s8RV$rZ
zlnmBFx$6OwBTs(+Vd_*0QK+j(WRh}rea`jeh~LUs<bCIlEah6|aYl3zA9?=+%uW*e
zawMR;Oqt9tCY8|$4V*O@Jx~h{;En`ssMJlGh_7!03%H^z$%@R}l#<&_A)H>hKyoE&
zGYh97iTF%{$?y?*N?07bd@|B8)F*9^VdTUbHz`<UD3x~EJ;KRKM6}mgf@1o^nJf)}
z*O+l1EEm>{_k^7}hFH~LgXC}EE@Ie<%W7%0g>0oHgbm`#cpz1|0xFpez`c-#c%fMu
z>8>E?lu3Y^)N3yI*%u?2U*HxkK%wfPGp0*S&pFRMyYjOR0>5gk38544!qY9E<AL6X
zufDpFU{?I;L!gj@hcWXsVFW5E<|k+K`AJ_|-p4LVYq@S%OAIKU@Ns3}CJSM+vU0wx
ziKNS4v%QsAuB{Ahfu3ve`jvbO*H}^(^Lk`=7T>Gll2jSy)e7D(8IHNdSZyqh>22^X
z_#xVVqIX-iudTFq1<Ub&`&kgF=RziA3D}~4;0|Pm89^%e&gP2@fPPJ;Qztg5PGMFk
z3E0peo3xj`wz1FN?s|;UM<4rSO4()3hb3LxzbIq<;DBzYIC-{8$azq6uO@|oYoYRd
zp>10IjRBV)9O<!=JKDl<N}cTVRp7G3OpyKyb!RS??;W1y0w*_>$sQ|Ma<nQ<0Q8WN
zy+Gk@T~<lC2EMHP8F}~j3crK~>4kX?Vl-k^r--=3IASFB%q%yF4&?mOV*(xwzmGp~
zpT2lHBEqkeus+2^w;{Hv^jhYx?3Hr9OurFYL13_*7tLkA3C7|ZjL1M-htOj*peOXQ
z`%4>JDV-^&&deqS=nwK}r)L#>`(c1!2m<!4b_ADig?WvawWsjok)swfV0n4OyF6^&
zAz4Ey8RKgb*$G3L<2wY-t+gs%1xkiR6SY%z-mkELx1%&@h*tZUt#dezPLC!(GMaEx
zk;UOrtWTQ>GN5wMer-2DFar4|CpBT)6LE6UG<kXwN6{b<W)gVky<Kf`cd{FifvoZC
ze(yMR`S4k<FxwU6VhAzCyT_yhMfjRweUnY2AROwW|51=;yZ9JFB{sLs8+1^}H&GeO
z2h-wm^BuOb$JB=))EmlFmk{obJunfaBDDp38++KiH=O;#thc6&D{(~Mm=F3%AHjrp
zN=HvK41MsL<qjNibNz;ysmyQ};bvxUTIuCbUUtu+Z>DkIp?6EH0kjbvR`3gkHx}km
zc0Th69-wdRhr_TE`n5a#A7yzOtX?U<orv{a+35U?)X<1?QbH_2nYy<V^Xqtst1=+#
z&J2toJPZUNS)HLauE}#ib;$eCGHE$U;CVj|5<+P)TlBFRUGqTZ5)FCv*GG0VqwRZ`
zLyY9!Bx(!~GwfKq^=r**eszSq`-J$k9ERgEq%t=<i8`2YIOmnUzJC{`A?MH6`Rh)U
z&aT?DdOrkLx2X<N9cp3$rPspCrWBH&wEnw5>jGCX{)*V_2HzgW8R-*Iu@rRePqZiW
zC2im3^L?X;q~fki!;JugN}L@WaJZRNO1g(Bn8^@eC%D~K(|5Z*DjJ=KI-YbFVXqo{
z&4EP@Uia<Qdxl3yNqc-?^yt93g*Pkoj<ri+yo!ib9^KjVf$xXPkg#lSqR%U377YAK
zBPZm9!`B`L?U}q(g#DE74k)CpY&~@XFhFW)bNt3Kz@>3UM+K_G-aaidWb(1ujeo;C
z;x(_N9Gjgq)cwseHa4l^nckGNqo|mc>65Ah`i6zRUpq(Nq~0=Oc_Ea^vR;B*E&$Hp
zuOo6A?$^6P`lmSbWSE2R4U9V^W>BSpwm=v4RFd<VE2`4Ei!VYKI=)$i(h)7Xr0m1s
z<Zx)`vKoHok)_6a!;R0(hYeKDj2g!Y!iAOF6-*-P#j4ccjn{FizIgozYRT5K5I&56
zk-d^1ICv!H1(W?-g8a`p7g0E5RNb2VT5dPj?+?Vcf+N~QZVomQ`SK%TyW^W?T#X^p
zAsgFeBz|#Tur%6Y^xJ@cZYqFkF2CLQYVo|v=ouk`yu4w!%3NM!!Ka>%vVol3WZ4}$
z*t$*A4LgOHKL!%_*`e}hmKgOP{?I->yjD?!2|>K^3n&#Y5M42QOJyEiv*`;`3dZ=T
zhQR2eSCL^ieyLa4DZ!RJ^A6r0Irw}NecbnF^tG?Fo&fLZ-ZAb~yS{WznOmw{*my`{
zC55nliNKm2@>t2wy}-#fd-4vA@bK}IO+MinzkadYss{F`nN51XmZJ%`2yAyc`vAi0
zk2$`I2H1MqIkWum1JtJh)@^g|_7zHif!1-7q)f}lbCOqARN=u+7|v=|&O*;4|12RW
z_EQJh>Y2Ki&h7flu8BJoENVT)HPRu>VE*SY>SFF08k)gVrd?ZQ+mvI?%KWzqT;oaN
z6FTiklx327@4a&55a0PsA4?78SZ-8QhO6`-?v=V(94%2BHg%&Rolb^$P|sM=xu#UX
z_j~Fo-{8HTN_<nqFKyxSx4^>xaJxy}d6`YNq{lunY7K#4&aZtXRq_Uwm<rmTGn&L}
zGNZPc*&xaRAj6wq<<Xw2WX|%*xH?_+fe7i8Ca8*_SEejtHIv?SO1A9!e*<tzZsOb}
z56<4|HqhruEmPuZ)L9c`kwb7nMH)g}YrVcZ*C0G6AHswR{64Y_dC8wX!Ei?Qr{puq
zFPV|Ylkh&AlMcPNd-6^6{HtLhid1Yge(}b)A6g{S5w+<Ml}G?ZItIW)Z_Y35JRHjU
z2Dd#X7g5RMqHobW<!2j8!#c%s=~YZ4>5%ux<ZH3sd<COd0`Tj+UTr4jiN2MfWc!NO
zK6e#Cmv=nIwq_)^Nvjj~*#Y3>HCQ9@+`=a@3l3)>-sPFAaPx-ySxRw<rEdnKMW?To
z3KKo5L1v7`TC%7|<&Eh5_%;|UNA|-Lp0Gp8a+}PN>Z-Gi4Mk1NzJ<6DQ!|#-QvXKA
zm?>i~c4#rZc?oF^{#0XPfn>L=(S97Fj43M$?vi6h9c4YaxCxxKIf;Ru5m;p};T73u
zwtVMV(IejB+_tqAAJ}t*b_CR`dX=3aB`J>amsJ{Q`c~x~K0G-7dj1wdrPX2`_DGFA
zEzBC-cGEGKa3H(Ql%;8I)--;`o?^_tGSUnVG%cT3tCBuGwFNsQ;Knlg9->fb6uCny
z;?1@&O3I^22)tTt`jja>&L$YgUUK4JOdBR9H&ePn&SjNw57)RSew`fRPqQDQw2wKS
z6LT)+{?U;q`VkbGSk1Jz4ILS^E$}GKAs7oICU}J&<@>?eG!}_6!qF{~NH&=QcQ?Lu
zsBuOKT(&v=>-!VlPf%mMYgR2UuGDGwferD%L(5iG?)#+@Duf&98vKJJTHM?Si=Pqo
zOi5>#51Iw3oIv=%LM}SoQ#ygPa8`vb(nVEpB@tM&Qmx}>4cbLx6n@>E*yd^v^0E{-
zz~mc4oC#vUc+dHLl;edqb=8dB4}*ey^+d1@{l?S?c{!n=C9c6xI7o@=y0ZcK{zO{v
zFN@@^Z!pBCohOMhQjSJTf*VuFPYBNrDLm@_0*|k~lF|J394Xip0DE;^-QQHx`f~$g
zk!NzppK?S%P80%PXT1$ks9A?6f!96RoJK3{$_!CXj8_+H=1YDpv>t|(nUU&YPkLaq
zfggElD5X7FI607oI)|3;wbm{={tGsHnZ5Qt!E-}c$h)?NLT|xKv{OKoO>$9?E%X*$
z6o4&ueQvvaJc;q|{v@h3&cmK|%;u@>y%hK^pO)ME(YMH>0<dq<6^l_=gO!S7HEqp<
zhQl4fvz0<nC;e0EWcw9k!#+fev)m_^36f)RVJ$%#Go%#v&T9zwWEhkzG9_M`sF_u?
z&~H571#4tJ=!%K#)*-XIvzoL?PRr{(&|g<ZUV|+QLu$8rjf<kHuxJj$YDCTGk{WhB
z*~ffXu<BxWyhI@;&)jg#Wp6hDs|;s<4C<)kfYQR>9xqJ6x7dZ5)~{c3u8`g;_VfU8
z_$QkUuFqB2sy!lcEKar(!U^y3VM<!;`f!ghl%);r_OBv<6^_a;LP;%Y@u+qZDnl8e
zw&Pms=%Ghu6?120OY)ipnEj%iI~9gh4}xng_hACeRJi&rZyLEut0C34-eBw@#n0Jp
z9krg}W8}``a^jj;N2c>Vr0>Q&&MO$-sXwHS(#vYRfH4tEbjZuG&oCp$Dit*D9Jn#E
zovqglDkQ-S_iw@x8n`RkZzx2Q7+z}x`}I*mnuIg5$mbO9+ry)qrAtAEwkL*bf6*d}
z-4iG!Uq@soA3kT2GuQ^u@(7kTHP|Gx)%U?Z=GtzvKVO!qmeD_}znhd2v)!4jBnDaO
z!`)KFKQo!ySTQ!ZKrh#Rk9TVLfRfoDtK4A(J-^G;)wkTr_YWyYB6IX`@%g`eq~ywT
zSFs9WpkK0!N#29+!$sy6!q2)dHL*iH24*Hy%JWvcks{C&UG0>*Cf|)%Vw~B~ERIk+
zP1IP=7LcjrS$G{co&rvrf?l{qjEq-|Kxu}XYkaSCwM%@YyN?NRaT~ZolS&3ldkN?B
zDlWHd&Eh|%6%q%|QaWtZaD+qV`&kpv(R7%~m9x1%p&C@oh+F!Y7bKEjguwO&;!~;1
z$eJ0hOvh#Xx^v0gHvfV&W@PXa;#|v8#e<`~f{!zw<Scv?QjU8Ky}hg~-(!KZ7#3V0
z?akI0B=SJ-!dF}K+YJC;VNf+amO?3BB99MIHr5j~8$M1CE1cN=PUNT#dC=VpQozX>
zU8+|)`lF;>UJsE1HcMsgJ|SYaKBmmlD;xA{xsXKLDhdrFmA8pgaY%ke4)PhtLyPsi
zlPUeXe5y?@HtaS^B@*=2J8fPHT;Ny#3HU#DR#W0qIkSY>Gm_90DbQ{(Oo2iBPUH31
zxP`^QXS^9J1wllQX0*HpLf7k9tJ7qx7u8p49*5B&Bc>Y&Q|N8@VDhz-+FYom!GhvW
z!nV|<#g}gJ0wNdaF$l4Txo*z{8a@y+H2ignw_zW{v*Lq6f6DS+4e5zee-Wsd{sS#?
zW#@g%*E1mp8AK@UL>vtb#5pj-VjXOdH#Uh+7!jA;eLVo`qDB>VhcPu);DkfxFek)u
z1a$m;&7G)vb`})5Y#D%Ee3JIGH>E4B^M{E{rbLB@MG#KNaa;P3{)2kA8W<h$<Ie>&
zb5~P2L*U|1DSgQ*+cQtcxDKxqiYFQ5YWdxm(Q@!7hSx<>V>xX)B`9pr%=p_5$_45O
zx;BQIwao&_t}?4&4oJhs?0{QLfd~FTmQvb+8)q%9*NiyTHEY&Mn!S%q_ym^j{Fa?}
zB_pnsc5jKN7C^P<)+^Gm>eYpuig2>yvr!MR_RY;fdbzfhEv~)%F^+0&;=qz1rhjq_
z_K}C?_^`iZ7fn;;lhMwWg8Vp710f&`0iK~7-{0NV`y=t%Q&xQoAmXq;R~X7#Q^2%%
zQ2w5Ok|)cK1bvc-(QMjuSB_di%roSJ=YjWduF&mmI81RAYvEWxJi(||cOv-QCSVUl
zI;&(v@rS$y#g)Ss)92g7{CJB>392dsAs<qo@bX45un#zfeP*2d0MG{?ald6iy8BdC
zt}l2LA?mJ%M}|?;L3sOcr2B<x_V;+NJsMFa+&sUoTK+)LLl$xg6Rj-V+PR=rt$?wF
zKU9emR=UoK5d-D8TI_pA^6uHBn=W3-Iaw&^!m1vVa_D1OW#$2s`U56qxeZUS28IAz
zOZsRDP*ps;Nhi!rzsP-o7Q>bU2-Q$tTEts@kosYt-5+pW{V}R=MIC=2kpLs!p4#yW
zz~KZmDQ#;U;s6hlb~Rw4yjo!91Z0D+H0z7n$nmOQYnwr!cqai000sS7NS!<MpQ{8V
zzWogR`SD~jPmFBw#NBUUUJI+F<$R8#L3}D>>cke!GeIG$!bQ+6e$gG)fL{_T(iFNu
z_nS-i!h+X!;A5?Z1to*}&6jH!E+UGbH3BvSOARY;R3>(w>rXBKtPlQU9<JTSzP8ER
zdV^cr{Ah4sLC$iwW7vLhSg#&5N46EbfFgAfuf;BB%$P#jD{%ur>``R+eBw^<rCLA*
zz~6Dlu(2dT5HL6&;)hZ5*<U}XtM=f|4=9@vAXxDCCKUYMjJ^bY#&eL+Jk5WX{wY5p
zxzkYs%olYY`>^*iWV1Ao8@E?35@t3H{WDWnqQ<l6u4yas-TbQiYP$i~qwZ7R`0M7p
z5L7^mjlh=v<mSL<(Yii-NIU3Y!;H8}tId?V`=I`~kA2t-z)I;kg5rC4adapQc1Ytg
z0^xSih=s|(HjV_VCQzC(iYw?VY9gGLj+r+D+7MULt^6ih?G09kbxNs3C6vGRT*R4r
z`|;iUhc?6xcQv^J7p30I(8Y&G56FTR-VdFpcIG1laD6jJag8%s+q0l`tvKr91mkeB
z2BJ=Ev#23^`&y@j64ikk4i``*zOypbF#&i;_<qDdWu8*Bl4uoUmnv2SIZ(FrOE&jH
zroegrjjUG=nGt(K+4E_jdR;?kh7P+<Z7x&V-8;#A&f4YM$mua$fSppr9SFW{(L4mm
z)v9%AZ@|j&gJdn@m4b^8&cUKlMJNLkW!4}vJEj5pyj7+Xxj0>^845@WriHyPcdbdD
zsTSqR3uS?poj)<MO3iuoxB<p0<$00a-jVz|k=lVr)#iwK6Hr#k3`Tw0g-$X{s}gJ3
z02RZHrB*wG^T1J>fSoVSJ^uXAJ_CfH+#cV?bgd-u$9A!c!VeD$K?}O&69F=1VQY~6
zRsw9PY1MX?k_oFFtr&fljn-AOMCma6b338PD+2MH1fdQ{S(b*SzbphTyYVIb4nUi=
zWIuqirWk5(i~%vSDA--0&|Ch4%yzz+<h}!H+}bJkbdGM=TcRqTiHSYUb#)(f-emq9
zbhH2#chYwoA|?ddnZ2%tzfu(bJY;2hxIeh-!x%1tW1!@L5EQ-`L7>@W#wNj@CC3|m
znE{K5&bT3&mDoe!CCn7JnsId(Ul6;VI{xCT0OAcO#_P@I7>=}~ho;_oMASrss-;bQ
zmzKF|=oQT6AsmxA9)T4Y#}7YDh3O1dAp191o~@X?2y;LgWG|WxkmS7s+}k-J49Ct$
z2vsCzH3k=aIr7md?lo*mgS)n3hNCLN>llQ4Q#pFcOD+DLb&{5x0wb}xc+kL95P|<G
zn9^OMP;iFu;(NsT*?u(uyTzODZml*G`yIMos;noCzHKrzr5o;&*ok#i`*edP7i1b~
zUY!~f0Ls*nEv<0V;uFbe$V3KQh=M(;K1X$PUYYFn%tA6&={zd=rGSplekFHYf@X6i
z+tfFUJx$;m|Hc?&jp|m6zuf&5i$?Zeucz#gI=JYp$a|=++%=p#rN6M+itNjd#~|6b
zwzJNQyp$W(1A5)LbVHgM6!OnPHa=ivV;59tK75~x$Xhu}>HVSB+wID3_g=w|v)v?t
z5fkaOAv7yt45A^DlDkfPLhh%kdzWv;js+;ko9+vY|H6=_I6^@OlGjGpCO}2es+(6`
zIrcUnbLy?wQO>dCQ`!;)l}`W)v0jOq1mx6FdnGkt-qOcCdj1Es7xy&^*>Q%E0tHhs
z1mp%nP-$R`)vrf_*F;Gkr+AORJvI0Phto&~?dpYlqteSRQE;=l%?mUEV$%u^omn5!
zpYiUAx%UA80{d-A@GIUqj)3kvo^;DcSCf1NK5Ye>J<oa<Rz+9&Bt7woD12R5#QrPm
z#V_+Q>h)`@u7pxCfIufXr-J6h6QG^+%%-Gvrv*4>>xHP`=;{0UQnJ0vL7}$<G)`PQ
zl;UBY{_N!1V6D9`0gh{&JEtrx1;74|T@=2W0Adg@3nMoHb+B*tIsBmp!eNHqYKo)R
zdo2P<#-~Op{LCG<$#649e!d7_Z3=Fg3hKR)$NPS@;l6-Z&_E2wRlrp7TfAZ&Bjxhd
zT)~J&aL@e<7BH4n+QXBcuL*ijx#K2BNE_c{bz4B`X($i!Jf6IZ-|l40%8h)A9qdBV
zvsNaQ6f!5-l+6;t;;?yuM^I!l461{WI=)+Arb_9Fq#x($rk(~gx5oqlTXslk!1mzi
z0y$oD>>%+>r)N2`ZPPjL)8o?yUD8)(e}Q0~y#~!7&8iN;5a-NiNsf2SR#Qs9Jh#wY
z>GJ>>O3e71Zx1hGMbPrJQKjiA!7;P$`Rh99kuJ|iGTk$m7lL~I>1)r$^dQJA&RTIt
z$dFccRCyL5!e^?+!k$BMmjC8l4A2SQXy<njo-^ycGo<U@tGoWh(n=WK__;8cbp6O_
zIhVP`G;4&3ZqfAx7=l>(DyjNSz17lDgkdSU2Kr(xEqrIuXT)E^VY9;Vx!0b!$DY&C
zP!}NH=$iLEFd6gD?B~zpmZDB545pa;z%e2g@Jd;G55*I@KVZzoLbF^KBHqSfOI<Mm
zaCbYfJsiEnn)IYgil7~4+ONhP2B?k=JD?ZWF&DMiXDI)S9#))>d{PC%_Sp8kw=a(d
z6{q$W!F=YEFw&+rR<_cJCWRzf<9xslkOOO!Byh2vhsp_0D{$=E83M-XbfUpiY?a>t
zlS3S!*siU${hC)76a*?VU302=rUzG0i0oS?pNmJfUp-z`j`c|TY?xg6MwyBq(XlAl
z+84=Q@HUN{p;dP<m`^B^?n_pau>&j4&<?2<m&s~zcZ~1sSh-(U`|7C<&xQuh;IoK(
zot~i_1`Dvkv%&Rdz4Jig5G5ysu`eB!txF))q_a=Ce=WCI_RWvKw;-1}$xp+F(5_2+
z9K10W;g8_$@_a$fO-LDI#2m8Cc`^=US)+L@an48LeWIGSDzT<szci@Mm|N4FQNn$A
z)4)PFk>mC-Od;wN?g_Rq$HI4e{YP5GQJQABwm@N2R^TO)s?zl~w7}1}&5fR)Aoxv=
z$wjZeNr2Pe95by|(neMn#Olgd^4=uuPRr}RfE5`LL~?3bp!#1_sf-X>gCMQX>{VJv
zn}cp#cR+qM=e-NX>Zy|hg;!5P5bK?rGg6ME^y}h}@`Tey&#i#6N0h)sB_d=_v#Q+7
zG60aSV+il30wZO1B86}v<>nUcjD*EZ=SHRWb;xA^w<<~gq!ff}Vu~{<2e#0x14EQ%
z*vr-maHn8=FFhj6Bitiswy~ng$zdJ4hU@jB>18>$B<Map=|Zf9Wv9bvUaHEf@f#Oz
z4>Hn9Ecv}U-q0w)G(}MxrUrORQMCSlEuMe<q{&1Z_`dSqaJSPE;6+Mk2@~N(!}ho5
zC?rml)^RqJ4DR7|%nc>x^GLN=F2aedwc`u-Qh1HJ_1_n%W_pX&<<M1vJVW5r!0U#u
zI%N(g<Su<kZ-PLsINWYw2inHe1ms8i6AG8W7Ik{-&RYc~vW}Qv{zEy0QPaRDO=eT&
z2!^Nu==9)Fk}Q$MYbbE$l$X!~t@g7B<a7Ia-B`zkYm{>zi%HYlBeTPA7Zfn^LyiR3
zpW9?K{HF@Scj*{SGW@z0!f*%IB(&UeRaeZUuW5S|C^e3jH5WJJHQ*N<<ygm`d*2h7
zvo|k)7PjE1UA4{p3>VlJskUaPGPrFc7gTM=7m}XV`r=Av?{Zk}{F4IS`PDa5W%VVO
z&}P%_xs!}?3tt61x4G+J$v7~qa!L1G#$h6QWaVc{@@TBSDKWp@)qRo~y|6U6)NPIH
zg<?i_aN8#HU!bh!py6I7=j@2lyBl@KFp^3I<T;<6o?sk|lCRkng0g{9gk{gSL5{~3
zX*+8-*6bipHsXACVrdZ@%i(CqBYskjfEi@(j>F!mi1EXArH=*j_}`4j2$2(C>ZWlZ
zK6yqE>hb5EEXV~<i5px$Lr)_^R=Ttzud1+airz$2t(;BYtsbj@(YDIn&#(gJC^L$D
zO*<}7;iGvuX!%E+{Aru;&|ll|t_BsK?*@W{#p{{X)NYQFHaZw3cx*--i4(3VB3qj{
zw|*Td=eYYNbOQ8_pX*-*EZkwfMF4yC<fw3bkuJ+M$WgtnN;9g&)BNq#=eH(66pTI!
z1ha*I1=_!YFVu`W=?2M0spS5^J3M{pHodQQABA!fQ;?KtZGbr~)0pj=nim<t%lGyE
zK&8D$J3C|wpQ`lq_!e)VnIN~bW$YC~_iI3zU%#ga1n<6IVR1^`oh_USK>E<apObPz
z)`><Q8t-v2T1+pCj)w@N=&4%K9vU86C3*-Zg+3@m``$nrMqqbp2O(tz`hDLV^llC=
z{1U1pDb47Jcij-=dfD(q$uh{aoD}%N>)gQoLUX)yG@*#E`<9yE54&MsKwvs*cz3B?
zSAHuq@`ht!<sInF$TRMBQZ)CwC7K!J%EY-jB&|>h%e@AVk^mAw>|IcfbDOCggASdN
z@o{S9?T&0?tiY@tM<rx9)&Rm-B4CE$NLJIE>g&;A0SkSA)Pa`!A#jZaRcC15Kx5Qt
ztz8$MHDgrui<$Q(ezsD%SG~81lW?{NKbZxal@AW~Q-VSW8|_zdAu$$?EN*<FP3VAG
zW9T!W5TXf-$X-LZQ;GZ1U~m9s58TN^M{|rqR&$Xlh0X)Z${c49;jf)5;VLsznK6YX
z=43jS^G%{VJ<ez|!2|lbw}jy;s#KS2Q{y=q3`Af*a+E?xgeEh%9$@Wu*I6LB1ezDB
ziR%WPS3V6pqO4Kjz7S~(uMU+u2~+gWl>7H+z~|`YYl#(GlEtz>di-U}l~M#=<(|k=
z=vL}DOkHA@OcI5h$rXU>D;gH{5?mSq)bvk9{=Z^pzvygzsEKyNK0`Z}iEt}(@gdT=
zuNL7@U*MG4I0raT4LTC`88?J&cO;HNXr>1{*GK$a=ZLU(ADx$RJ&2_9?fRSVhw*R6
z^L=RCV=w%&$*WFzD_p3dfH0>6`5je`=24%G(Td(6S3t1*u%?(~)^+bCdl`)iNVGMn
zL=o3`et;TIkQJ16F#;LqP~XX3bwq5emy!ww8*X!A;5%bFhbH7+-nmp1@~~dgU737R
z$@yvbmNn4Q%@dP&bHyw1+j_BiNiTGgY*xchQufqO+Upi(GS$~E4hI7|7g2AJRr`f2
zP*%#t7--)bATTa$5!P@7xdnAdU;lVF`h&A=rsHu`T4qBo!39%j@YKJ>H4q;XC4jV{
zkJ@0-nftkE^;W3i<@OS2L<%{arzDJqH=ZjLhR5%UqfTQ2DDr0fJaPyo&RjdAirsBM
zymGYpq7~y#eDo-<>e=dD7S`@rB7I`T>_;N8%Njo47x`F}V^li2f`q}ITtPk~UdXhN
zgr4_~Ksxf%rmVb8e}D;gA}HZ|MawZ6NcN9J+SPSGpgC4$Ndc#!$0J?e3&gUl4K5I1
z7zJ}>;(4#p<$FlEX+J$O^RR%ZgAUrFV;BGK2WYo$503)Votc1+Vz(XEb!a%f-uEQD
z(Aw;a3DqSc|8fLjl&|z8iaF6oXrbME7%Jp3y4RmJ%fUdrm6t<^#_F=nRRYH53C6A>
z!??*5V_*sNuhYbz>DZqsnpZ&xZ<}~xim$!L4Q81nS$#eg7EX!`p^w$TeQ<9=vNq9}
z=0jOBq@d^_E22!#`w{8xkDB$C6Iw!g%RIyil8s-y>zX6=50Kl;BUbB2jgab^*BKB(
zr^M`GaQAwKz;Bi~YRQe|28CZ&AW!YTE~bYwMDn&Ng&tZzRT^Cw!7kCA-Y7_ObO<+d
zl^~JnYUj9Ta{p~zVLV^Z=O%L_ZhCFED&@K?bD#d~$uzOhTz_6MoWz}$Qxs0QjZ{j!
zaHi`(9=(yAa;#m|$t9h&2*TB=Ye+u@$Gs%6?i)nj!P(Q0d-R<0vV_O!nhGv3fn4Y3
zSWbB8g|@Ey!$a0G-=8SAfOFc&tlMe@j4WLKOi6C?yZohLhz$U{mJ`C+%c3mt^k$uF
z@I>Xf>q>ETb7+Fl_cDi$<u)MfdO9wWT>N<9GR--qOss(T0G9JZ@%9T@KaZ&3nu~-;
zZire{=#%`x?%iNOSyc&rRv{5X>2e5Na*H55;>W|=`2bI*#{>dSU*I(7-kl^oOG~2#
z#?`BFX$xD%xj8MaUo?oGU$%@u_~{7KuA*N~KF>lK{CW%L*8Gv`#lVI(hh5WXKaES*
z8G1eAXW(2n<dgNQ7=q!0IjD77tF}LqPRPQo#sD__6t*-}-Q1wYRWGD38n#*`x{y;k
zi?`N3O57MENzGVz9Q?W<igwMt6>XU<Ja4?>NwWfftsgU6vF-yPISc5gpAiIMTl#mK
zvv&g9C(DT~9y&sx^t=iVtGVql#Qz{v{a&=DG^qj|&6r6V%FO0lnQo7HCx@DZR?{yK
zFE>hexb3d?sxhx6-5`I!@-}x*D3dvrlo8?GkQ*!xO3KW+NiYsk8}$i;*ficxyAM?Y
z^M-XURWb`nt7?CZAjH{><d&rKo)jN@iHl?<I30&<m;sq6B{h8<Z(vI&#b?IMR@D>1
zfwqTsYUEKLC)~sN7(zZ)C_9Ih4wFdk_*8Xl2u8^QP|Kbt$_G~Y0TOTKyGDSPncqDj
z)M&v<QPo76VXJ!d#3L!xeV;Q0RcXTqJ8QDGFF{ipJ(mn#-h4=qw_h?)j-4Zf#6e+t
z6>4K#M2?>k^re-1g-wP@rx-B@#3}<pJLMN(YF}%&I>TO$WSqH@qnL(V&SiPAwo1X*
z#v0xz$9dTho?r<N_-)c3ddW#nzm5}w%PAMw6ql}aTHgU>lqiC`JsyP_4dVOpS`zJN
zZfgDw_C$+UB|?cMh_<_E3+^@e3oWf97ta#kJt{GDeBf>UaWvKtlP@Ifr*z#MW3ZY+
ze$WBPq||jmkL1h;&$Sm+84dSt0=)0tF?MM&qY(WfHFSQH2_Q5d@7~L)nP@~@XRlen
zhnbsg2(*nBD85uo9Z|3cL_cA@yXX0{vep?P6Vtya;w1gcug?tWAr$4(KF|**Kuo7M
zoa~L-FG7ya$US|dB!L$0sVV4nAC8f}SW|v1N_A9M;{e+ARi~z4Hd{Z<N<iK^%H^G?
z%eWoZj~6}SRiM<2N`{mOemF~bH(r#<6*FR2-F|#k{xXIjRJn~%HzRK#hp|_dF}JTD
zl2!-0=OgAfbttZY|CMZo-l4O<ts18&){)nEB=D$dfkzMETvI>=Rn85&#5n<vH|{t(
z$p`v2)h0b1H{S5Fk>4tfX7yx^0OJ7t4Ks0*eAH<m2vCQL=!tw{yz?13|8Zqmh}qMm
z?fs=?z&K<h*1clx0T~DH7pJ@Rg*j_y2`|&f$<%nu+8{ldG}-HCoVA<-gLY>vlL6|@
zLy*8Vok_}CQqxbrxjqVbdC!aynhm*&2?Gqdnokst6B=6pJNX);`Ab`Is$sz36Ow_G
zSrJYq&O~yt5C@GGV3uwlc{xch=Yv3;%N;fzqH_qdU=i}VRMkddBTY0sBv_`n)zftZ
ztqx_ALU`fI%s)^n%)T-9qK2_JRO|^!hoW!(i*-jb7oQRYH?8Q2?+g}SVR4g@wY&4K
zdRoeR*RX+W{rH+k9Oa_DR~%26215^Z+PIA{z2qel73Acb(Qk_yl-r9SH5>B@GjtnM
zce|-u4kvkyPF30hh`wkNcC{ey)w6-JY3bP*tSgI9muHcspK$ZxRWrQ6b+ml9A0Dln
ziEQiNm7MumFwg%2-%vebi8r@=>aXL1&gVbyl7tly&7DuN@E;a=K`nrGHo6OI7bIl7
zWXh)&H5r_+tvu|Pphr?Es{G~Mc}3t<{<`hunic((UpI6hC;0DWapfz?``yQ0g}17;
zI|5E`YYEDsSz;+jQ(7ItXTT-kJmsf2wwR^p0BASAw2Erc3fbHG?OJHr<|=i2)j@r4
zX~ct0qG5>KtbZu!3ZXthAaGN;s2n{E@00)G|5i*l2o!yQ1MP;5=WYJ~5pp05@ee}k
zMUsX*8-NakIWRm`FAaDCW@^eLiJ``Ut$->#@j`AGhS!n!nEFl<mqFYiZLeHp%qXNq
z;iUFsf`@))M7=?hqHdlKqCH6b={jJ1%b+sLfH=9gBsab&kEsRtK(K08<ssvE^OWcN
z=R#4%yFg}&YI1j^HjGUTGX5}Jsl*Ct#_y&moh-T28cA5;JMRYQl@~eeka3S+rq~==
zxY4T}7v)5gnGt6hz((T?bwRq;tco6KB7DyCUzyHHlK`|ZE8M%ASWiOdH`8?twX+1R
zoiZpkBKR^ouII?=2qkN$w`@=Eu8+nDY;QWxdD)=TaJ{B9X<v~Zwld+LE@zW!Mwa;k
zEY5p<=Xfg6IPGUZPF}f7noSyMpK=ApD*v=h`>P=TM`=EUCmqEBEF2#Q_=zy7pMVJ_
zc4VrljrtgpPimKn%$%g934oiZ6i6WD;O5s0;BdY4j?B4`7u1@u7j>frRPe~B10tZX
z>aXf74i?eiya9F4HK+{{+XlRB%*1}?yRPMFC;#d>j~#gbz2Hr!eXVGDj=&MrXa-6K
zg7~aX=7(dH<Fc2^Dsv}|S^A_wp)f=9S`-I_Z${qcIZt}Si-lCQ0P?w~&2SA_K<;hz
zwx-voe#xaEePh))?8+9U6sKw>Uvsi^_7WflGTCPXr!K>p64(C`FQMwU>hICr0mz9Q
z8?Z)<)EuEGM=PmuA?@I-?OX;p&l4Q#Eh9iZ6pI&cvrRDXdG$@tUy67+w~3VXm7>Gp
ztH|Kex_VFuaRtBS&iA0}v;PEWo&JFOQIt!ak+)hj<6lc77w{TR3$PH1`ZN3cn+m~2
zx<#chWG$%xtRfHpc-YDvqL>sS(7Xapj(1)I+I;gIpdC8oNvuv%gc5|W&$}=2FNDD9
z)h4grUA$ESaN@)`Lu77lLtC4)@)xk#iJ;~|n)X8JBBfdS3qT;avCb{53prl=7uAOZ
zIvE61Ej;tA|Iefp)JMDzeXW;6b%+P;Y|NbHQwvi}pxxOX-IJhz9IgF^*(yd;Cw8WF
zgGCd}JRT-qbI(YW_C_rV2-mE1>N(OA_Y7X#2HAxqIC=RdnCa=1I8hPJs&|%}qc|?c
z@@hjCba2HK-KNv)kE|VX_}Y_5dt5PN)coISC@;|dF^!=*J%!<4)TI4TCFxM3nzNt8
zC_oqVK1YE1UhB!fTzN{q?fWqRNkgl`lN5jWwP|pjtAG$CLO9S<?rK>A_!-F*`RQc5
z@!8PZ)KxzyE&_?5a*?R3uqy`nImKai`{9L1=O@H}Y-ny^#c`AQ$8h~MWlC>w6{6em
zdYLVspYoV`+?)g6IFI|CN?Kb$xwKRH3eY2Z0z*(`kg5IiHItM8E++iJ?thph0-YG2
zNV-Uj4DGttGJ9R6k;<*US5_0iC1*}5CDmu3aR~4a5mK_zl-T}z&jqO-wY<4szm&s2
z|MmCh>1@<4pCjCK#S7`wLf?TBrS+wXFJWvCz#NT{P#$9Y=bZtcP>OJ$DeeFCwnQ*j
z09|XVo*FyC(>TH0Cs7I{@dO|IS&Db+UoVD>Bu<I134PCufBx&=pNA+`y#w#)Hl+`J
z5>2nMHT^P>)MXd7ocNEw&!%YTFmX};m#N@9{~~}clL0_m+gqmq#DRB!zR9&c_ACBh
zIdP~21t6a|`Q-io<patlyx8g*)I|Zu#FK}E_jX$WazdUme=Ya#R~iaBv;~NAy{7%w
z^Zxy0ByH+_u^=!mF!F2?%zqW;2c)jm4?{iw?Hwt=15^^C31J5*zJLE1%D)Dcz%Amx
z=2S9(%1Z>gmCDJGw||;<u^suW_m6iCu5Hc>zO4DuwVZz&wEg`@HplM*+pN}?$u0vW
zQ|&=(SFZ+aEDGfn0|C+Be*~s}I2eAhQ_sN?^zV@U^VigKi$GVA{Mx!9>3aC#L?H6d
zFwl9l{MSwIKgfU%NDK$fMHx>0?<f2B(_XJ3J@}Ig0A56TGMH{&UE~QfFo2<FsR80N
zMO*=z!f?}5Ky%w(%Xs#hNP~s(f7?*~`)k2hacP75NMcf;4EccBJNlUoD)x0doTBbi
z1EY6#K&jqAa1Tw--)^chd-|=VN=N?||AL*%m&Q>>Dt!LZe^@X4^ELhPZYbo+b!zYi
zNvhQDR;i+4(#{_`%OYkX0*Ch2m6Z;K53GrLe);nSD;m+J02{<R-FUr@%9)zkji2AD
zzrK;u`0v3_Hvu7*bfGl}JW*)KflzjrM;X*EUmAW%7Ia3UivM>0Rg|N4sc3Lfv*I-E
zFTpyO{4b~DcYENBIHp4xAx)-Nb?oITal7RLrKtjA&--muXm+@mxYHvo_!uazxBJ6r
zZx9O1lXEg4@^G9}*P|w@zt`nI-htu;5GC-XVmBDDZbeXq$slS(&l=TG`z)yh4>B<^
zi>?xi<^K#UQa|<9E^u_dUZeAj)jTzDSnA&*4odXfM}hUer=r9NBycYjFVgrvQ#$Q;
z@Qe%b<odH|(<+?i4T1MX-b)KPx6761#_qNC8#;Id8~ViYZr*>~EPc3MVcsLcBjv9b
z?MVY&psQI=y=tf9$#`$yHTTcY{unuPiW}<DOQJas6w8l(!1y*XSbeD6u6tY|a?`Bh
zW!zX|#nQHfn<y|icc~plr`GzX#hpH^iTY^EsJZQGL;HC+r)I6-jY8$--!YdYC<4Su
zs{`Bxl}Y~eegAEJ?G*zaa-p&(g`b+8pA@+m&6%}mHGE4a=N|i4%^$Nv2x7!7-Cf4Z
z3bfsyf!*_^T!`+_+ei1m;5Q8RcHgo7c_kp*UaUQ!SdqhC$ky=Szh>z?aM#N*FFM;0
z1$J#Q!FS&dpAd4sc-T8~L;}~jeaJr&#cJ9v&-!&BlVuR)v8XoaWV6vA--cf^SV6u1
zE3#m6R5h0HY;~nQa<#<S<A09jpQ`|^S3G!@wkm82J+(9cEd{2sBSm|cky7Uw-jzKm
zRwy4!)yaKju!ixiP1Jgo<23($zS>G~{Y+=jskUFi9S!>N0<sJ8j}~G`x{g0CuKoWl
zen}i)G7Nl^c7IM=S`%{SD?i&IbtSHM2ET+eW>vWe0h6V2eXkJMbOcY>wFUQVjK|LI
zIQ{8tXNA@WY5x_u|9)L02k_O>^N}9%w5720{<ynvOvF2lqnEE$yCf>|yf=)3?Aa7-
zWUkSxC<RG;X!!Qlwwzt&|8E0tj{`4;^=pYE@N#)q^f|U%No}CAn#J!0Nw@3kB(0c5
zdj6Lf&Zd%WJ==3p`Ry0rn+`kTc8~reJpbcYGDY1o?D+mEH;j%dOg66K*x5tWamVd#
z7S*`fX*R@$zQhEMbc~uDRPZ?iY_sQxHBDdKDW22+e%ycm3Z=LeGDVY_ro=*v!)2E;
z1axzT#eoI--f{aIix&$8bmb61v^#DI|JTm@`}@7Zzys@4M5Y|1mZG6u{(i{X9as?v
z!NgLmeP3Kyi6u70l&O5lS<JTi{~Mo&vQm6-xaF%=(@#^k(k@TQsz$M~q(%?SR}}ap
zxkt}|gl9l2eFUYjCiSUoqjS{d$%A*g{=<xCh5su$|M}pG6sKM<o_NJ{=R-*>6Yi$Y
z{isvU;U7ZJg(%w6%z3_`pR%E!nYy<_ijiyV(p8Y#YA<!Jj0>aXH(X;Kd-vz8`*U&p
z=SCM=2*#z0n1E&eafr(ItF-e{6?XXo$smSf7JRuJ^=j#chq4TXR3`BF_(tv|>ec^P
zpBFhpv)Z#(O&%-kj5lxHl$mZeU1(o1d8)AS^`tMQT>@p);3i-riz{MixF2#6CPuOz
z?5%KidgeyA;x-G;3UM@S$G2W{_Sl=6$lvT-D!YA}a|OP+@TKpMn@@ibvTGh2->kA?
z{M#@$bdAMr5MOQAk<X<=9^YvXncLg0<B}I7Y{h-w`2C-z&QeOZp2AYG49sqW&QWaQ
z0?YmVZqYrjgF6k&W*4nolAvH;1xBD^=T@xSg@qTDWr_$RG-pr#am&!-u!@G=RD(Tj
z!HILvC^8xFDBxtFRF`^AxFgR~b>98F3H9}Rp#$HL9?$=HOaypLMA5MZvNy0)UapAq
z8G1Nf+UOwcsPJw)e6X;Ze9h`mm1?^ER6~JJeD>05a$wa&{MQ@5Grg&6ltWao(f9Qa
z(be54`{uri5p)%n4CGt{_Zx#dZWo#-hv$7BnqS_PoUb34hj_gCece)3a9zitenIM&
z>gf&lJ)E-HtD}Sfw725cMM;GA)<YA}@9dmg<9Z)Q+WUQ^csuOBCgcSs$oiD=rx;Gx
zrj&F?@@BE+?+*sFxu9Pid`KVPUUVBh1Ah$(NO~lHbV+SU+F=ki9{9a1h0**Bdon%b
zXHa3BQDorng$jlRvAX2cFG?*=t->2OOSdA@*g{WP<Fk%J3zlNv+IC&N^3wI)GES5A
zO8g&rBf#0F<iPeCj-HQRGi$A)p8I2v_h>7wV~gKDfiAfuEb6c<#KEQ-=;<**BQxnw
z`~!z)6a!Qd7@0`x-yxk|0j~YkC;y>hM#%OflX!@XOSP~mgPbnk14Ty3^I-Hyr$S}h
zYy8&iP5pje*W?h7|0ACb6+Hx9_<1pPg+qnx|EJ|NELyQ7Ty|lyRhmbj`njihiHo?d
zI)2XW$*xmVoAM$so5}n9hZ>;N0=)g%A}Te~b!tE49SpTx#^)0H0T*vl>-$^~GqyBM
zv^z{|xpU(ae@w?I2vx1z$uVqU#as&I&O#KKX_kUEOR&nG3U-CJxAW0$1*W~JpfH*g
zG;=Q~NvDnBcZl$v2b&|)8A0yFU~0dMfy$vNvV-ZGjJ}k8kAx0YEnhu1*R|!?bo}=U
zLW-h>Y)%D61zzrqI-GKqa-{>*QB1Ea`-G}%Sg&JM%?j4~?$_iDWkxxcj2mw24T@~<
zMG>f>s8yAZX=@cR*EA{N5ZVTo+OEXjxDo-q^y4f(Y_FqE=swlnAM@`_ZU+@sgYJIS
zdG*suBl^+1jNc*o_Yx*{J$+6@7hk}d#wa#c@3VNR>DYnezox&Oi7?o2+Jesq1J`81
z#*nW`ZE{e`oy7;=X^@@Q_?q$+;}9`{MP=JGk0R!MZR8&-!v2YaOCx<w!W8~}>{nh`
z$nM$Zf?t{62$Nb)hk^-iwo-@p?_7tn!sQypFB44P#S7~C^PE9dXQ$fH(>U~3IClr0
zaN7T-^l@rGC9L}qt-hS`gr^7pxkiGCGUXPky7LRW;$K5AhbNyv_(|{FnF;CY`ep^9
zmR^S0zdh!x)U)kcD}oa(WnQJ90okS3H^gL$HO<mJ^p53+r5);1M$-{m`-=!^lNw#)
zriGrAvX(#AnEA2zKVR5~qu)1WC1ZHv{FdxrJ0WP;*cX4c8^pwd#wbRsD^iz!&rku-
zHzcPx;r>C#c*S^48TZf>q1l1ubWPooLcVRv)EZDr$NE^nR~^!3Tb};M9$0GeZnsz)
zNnXoW(|SSiGA3q*oC)jtDLLbAExh5B$NmEneR}^{$RNW&2GaBeYaJ>+<T>M9qY{UW
zH$(bFf=OLU3@KQMT|dYy&*U)M+*bD=*~)V+*q!tU3DXN^A`<<q)N5DsfCJ##p`fLp
z=y5E5!m7LM5kNoopZ*U_G?#(tm4oM*A-Yt3zV6r@xF7;8{Y7;QN{PWpTB!55N?*=7
za0LF30kS73V%yzII9}|&Nl?@^yb?ca7>W*4aM40<_V4{L)M`7kzubkM2V24rC;R1e
zSJZh78Z45M7qHp&F@c%6tD7`M-&}cgD6K*4a}}Q&S{w>Qq8t09e&61oBc#ZDs5mG(
zc^o$6V+w~hp8F(9S=i`D`rC<fvNyM|#=lvXp$ov`x{JNHa<)yh`La*Ybc4_=?K1&6
zb6*+vdhlJqiEOEK>D-h-Wpe}7e;ibyP(C(R+WEG_=#8a2b&u*$v^GqiJ59KquexA3
z3ENo?Kli(2gz8Y>fN7(O<~Ch!_8P0lvzs#!p0ul8{q*(Il)+}_dvb9uv-i4u#oW8(
z5QBfphk?S_m}}Oq!r{;%bmzSdDe5Q|lAU4rVfq=Oo-dZSeG=)T3}1Jc^S<r4|KRf%
zK%~_ao>V)-^Q8N}Aaq6pLgjFBr?{bD-`SfKO>Lq8#SwI6EX;og&4JKYAwitH@0y4A
z#}{O&4wm0rJl+O;N>+Jj%fzP(d|<M?)jn@k#s_~0XhV#M+cfWAi43ej3~C(fkK01k
zw<qqKW^*-PzPEN#eoN5q%>kkPtHDgDOdH$Y0_im$(0r_&@VNDRaBNPHBs1n`-&*uG
z#BioL$g%!@;&c>s6sjhUl?y-M=P66;RTUEGG@0nzxK(NRIl8?{dN=y(&i<ML5HiNn
zRK@sPF(X1Rg>#rEPrfay-Z?8m2>`%86yl6r98=SQv;Ib3(y1xuR*q<Dd9=CRA8RcG
zwa?elAXU@vFE#_pAF$1Gk>sh%2r<!pL7(DQr#9aR{$UwTTM9aarukGM&4cnaLfQ8c
z-ep-rHnWSGUC--qg7}-d9a^g25U|+sQSwO6{?USDNR4xg(Ye)@BIWGOaaL&mp4RDX
zHudi$LDGJ_qysS_ZNN3}S&&sSakZ%J&Bbei|5%4yOFVUsuJxhhG;kaPuEc(oNw@vo
zM3Y5bFtBa*PVSD=+20eSih`SNrC!sW4X<Jdi~GIjeCCF^;v0TNOW%5TK!7~%X4mlv
zvIEp)l7Jsvk|^s}+z;xTEf0?Kv;b%F>`x2#3T#=q=3)~gY4-oW^i8lkMwpXyq*F_8
z#mMzm++vK4P4$v=yR<)Wwscv6T=h0qCQD8yf8WFoHG0_aT?6MEi^*~u`u$bhgJ#CL
z7A!m-MU1BHm)$;{wtHcqD@U~Z$YN+*N3p7F0o9UC$xm$b%<9`_g3fC2H4SEPTA_>#
zf)Z0i!TO&sgzB)RtY!hp;b-W6Unzjvf%Acg`-}G2-{vz+e9DTomm`pHgUXy^&Sew>
zS}xcPt`%gQx6D0svF2+t!9!(39$L%5l!QEoq)MZ>m=#<*yLOsMecMTpE-Ia|9J)U^
z)f7+Q1VQqJhQyVl=|Odmj;<{>TP6%~rSR^5Zg8m6A|~;MQ<Lqji@)Rd5X}&)`60JT
zjs0jOaHWrOi@z>Z@KeiF=-s98hk?BiN`7mKXBp^hk`u4mu@McEfViQR;!;ilA*X6j
z(|$kx;;LP=3BaXWj&KIRN(KN#Gg#<sCMl~rL$@_W&AisQJar3Lt%3FQ4xpGhlwI3C
z3W4t4Il_6zL;3eg1QMTt2=0JBy=WQJ-w6`eqY#UnRb<N_c@~0N1JTzNovVk#_LtZt
zYsB&<Qp&&-+tE9~lKaj~ckaV#OPi-EEr8-+1^l#%O5cB<q+OwX1Aea4l%tdY!jCW(
z=0jtSGkZVgV^jQIQq(juaP|~<H6+S@Q?<2J{_FdrmD?`emqQ8^{G<0b=>wrNN3&=G
zX$KAW*gWq2ertbZ8H$;QW;QckM&m{zUtO-<@qkYFfih()@TatbzTp!E#&cInI#kcF
zn8bNk!M?oRUbw1!|Jbzm1mf7n;wie(bcZz8{>1>Z3!?S%<{R%OpM$^+jReQJ+$qfT
z{r64V&0Ne6hTV+ZqJKvei~2TK7JIY45jpAGzZX`*d6+8^n}NduZEqj&C%LL>m7=gS
z#yCMSss=?eLuLE6Kg>)eY)qZ9mTAEL3@lJ;bl}`203FY@Js_*P)RhzXR)uu2mfvyz
z`9O6&MAx&XKlzHD)CT}s+Lm$k{WZOq_{_Pc!<pW;!le~t{=F|Ior7g9Rmq72Vr6c<
z3mKX}KiQIK4Jx!I``rU52q@q;;7dK(DNP|jnM!j|{>*|9)vzFiuu&9S1qvRm6xMoa
z?D&;V<C(1X3a5(?E7mLm4)^pq*7fWMYB7kSMwgGz=$SqnP~e|~@A~(}+Mu->-T9iA
zv&syJmH)JGDos%AK*{-tzOg5A5Qn!$*5BG;fo*sFEeh7$e=D56h0QVFD&EXPqvXR0
ztrtKY^*I0(A?{4HBu)UE&|JOkb5VmX)vXkv1!nzY<nV=8K$3I#pdI?n<5O2M_)p2I
zHd*UzizdrYf;|?iV7iZZe7*>}Hl&@@>g9eSKBW9Ahr!11cX+2iq;{~4^3!(x`tg?0
zzr%_h>}=Jmz8w!nw|}kVC?o=dlZ)#ef~2cnQ?Px%;3&zFJ|SQP@o?~kxMO@Js7SP)
zdn9!GFe68be{eo2l^XJZ;wVuUt1w^M)xsFO@xIe)VE2kBz{M%8F<yY?4E0mB<S{E?
z+$0{jwZcDqshxp#>wCxT-;>l3K%YSEmN0MR+P5;n=OFeqq3)9PxYzHM8Gla^Db5)f
z-nAxD^nuEg>0dX0fcmN8(_6YVWOTq5g$17i)Dnq>4}!`m_T{Yr+&_t!u>!5e{%pVy
zUQKcG79Fi#J!>bUplX4hzHrs4Pvk1lYp{g@gZ{=jFmG00Juka-J+0He|M$jt5J25x
z2cBwwIBK&=^Yj*@@BX3n0QLN3Me($?+WRR~l;!S^B#+iBA|xm{8I;>zUr)`q8=krq
zGyG%UDOPT^yYIcR|2(lZ^azLj8eklDmAj7SM>KoTiroMXiX0-Ko=fZE`2V>2%784l
zrfWq|Kt&`KDG6y1X^@mIX`}^}4!H@XLr_9WQd&y7yG2mCI~8f9BqhJOIOlo4_j&&v
zIdI3``<gwoX3bhKdcOb@rKYX!AGrJTL&BuUooKQLw-Hq@{<=Fhwd~oA>hobcTG5F&
z91Kixn0MwKeQ*A4(upi!{f=Z0r3G;F?$tR)3Zt3?BOAm?DgXI+*NJ=Ikz`L2lN`Q$
zc6_jPeTMGN|FM&kChlD<Ba!eHjLl@V=#1wZpl{xW$hJG~JAU{Z7}GD-V%Mlb8Ixr>
zTz(fxy{v*+NWpA+^*z(QXLHbs<--0^i-=C<l?^|bIUKI&gxC<i%9-T6l0|<3AA8^Q
z(Hib%5nVjNPEnFw3x?Z)w@d!STi^3-egJn#n`blCUyS}H!P=}goA!S@UZvq8x-N?K
zUj`L5!=C+H%l>n*DdJ{+HhTQ=3;D<%tPLk9IICaZhc1)NV62~)2onY}`k(2oW|}G1
z1vL5oh_4pI&z{5NkMWel2p5QAuJIMXtX#)=)IywdGVIiKO5m*Xa!;0kTvi;6Crlz<
zDj&-=BYkJRh4xBV$xP^%zM2@SS9z0OLE`vr1eyHBud%|6xv#c@dU^k!b5JTzxh#0m
za3)HR$njhqQ0XqXKrCJI_}483Ne47mH!5<ub<Gw(Gg2Ui0$oOQ3HBfOAes(n%r9a_
zFunwHmspV0eOMU+olh>7wEo=ht%CcL+YP@Hu2d38jF-!?Q=={T7~T|R_x29>QIRZ-
zM@VgS!A_E(<Z6wi_|2OoRK^l&@~tT;!q`Slq>aA=F5(CHUF<z`c5V<dc>H)*en0+W
z;`(w;eutc(s^0Ff-jJQ5rtofUhRgoC)2}H|GYt}#REXndY1NUZSqFB7P3|uLt946o
zjJb2CLY6sL#aHOJueuSpDLxvmrwEupZnjOwYuY#a_M<YNjS&_*Y4tHA+hU^Qg|cMF
zjXdG&aKLVFutgKX6oH!eo=AH>3O`*C`3G($2HJFVURq(U@9h`Ya6K=ee~_ZgE|j9q
zT<BP=Y`)qS$3P{^?*+RwIVM#WIpc9Q#auCoD!O(z-{tvivwwzWf&MUi+Y1ZiYHlIa
zEOoDc`BUt>|0=l0l)1Y@n_IwI8YG16nL8P$WP0~=kM`@<>?U0q1(`~3T*MSf-ov8Y
z%sSx<w({|FF?ysRg-h=*63wbd%A{U_IABN;l@K7GGxDAiy(S+U7n4HL-rWoEhm>aM
zmOb4=uAgE#g_<wH4}k1P9m;JOBA^}j8mXKY0+{kMDMHcp>P-8nzF&L+BR9b{ZQdAH
z&%-~$wW8Zqo+!6cK-^Wxw?Mu4X@+8gIT4%w@8+F~hLgM%cKGxL=US@#jl8ecbvbON
zA1?k#8C^~q&eMs5VEviO76xm47m)cn{RwysBAq}mma+eHUSH*USV8?^doHjOync-%
z-BRFuSVU=?XVvmUu(^-OZBF-qG~)Bb*Rxniuy6jx)E(^IKe*ahBdavdy1&i)ES>Fq
zi=>4wq6;l2eHDLy!S*p#h8HCzuCz#1xF!e(#rN#KJOvFA(d)=LutfLfe#l!#FhS6J
z6P*%PN$sOZZwZHWg?i$x>US5=DU~jvWAn+FzCI+IW0S#)8E?LdI;@0w$(#oQD5@}y
zjt3pgs!^wrAI|;Jele?o1C>ogZ0?JWs29sT=G34`l!9dAE3*-l;H`06IgY{N1aie#
zu#D(HY+#?($TB&!GklK8*Q<#tJ42}Dx-g&WWVMob2iTHYM~s6{;`dvDM`jB4NJ)(^
z!E8d?4qEo}uI_yj#2-Q9pKT)+9<_l<80kU6SwN6F7e97*x!jS_bd&oNF5>_pWc$;S
zE}$jt^cYe}HeRjyqwwZb=?nzxyC6B6jh{P!P}Ft&cRNUJo!>t>{2~FSIPE4pLx?*7
z|3)#d)$`PEq-senX}UCKSI~y;DlKs`u<_(@(CuKKpQ9~Ydi(p5pZgh*$AR*KmkK6&
zom$jusk2ep4?cp{uH0MR@nYYC?r361wT{$68iV*ez0#GOhHb81MHKiD(cxIc$3j{J
zl872{MW+IGQkc*PGa>=e-LN-dnons)Vv#T>z1g3qQ&D_X3uZ>F9f5=%&zr-tiO@aC
zMxu?H=xuuOWZM2_K^zK;xuPxv*L<#`O6)sstMQ2Rko!T+ZXt;1U2q{|CjIphP_MIq
z_B@Z&l068A&#On?UxBGo9FE%9EY&3eCvd+Bjte=f;q#wq<Knn`4IMc%*YQMtsJ;m&
z*$)kvza1oy$O%&E?k_sBJz$KU@OyvBs0?~lzI@F6i=Q6H-s8Vm)XrUIGgI#rw#2ET
z)IEppv-Xe@6+&Zp&+j7IJgw3a1|3z>pW+Oze-18=m)P(Le;v4PogYVW+NTrILmLEf
zIw9kVf8?IWy%tq`fHz6Nh{_xO!i^;wakv5|MXyf(&IYn|g3*PAcA5FSA1-Uv99{`%
zC@*l)evb`T<ZqMyfnScaii7Wj^qUhhK?i}e&Uu$nqwCP=doWoS2$q>~k3PLbN81mN
zMgp}xA1JToWu?k4{|3(}q?l1i4E+{$Pa&&iPMXHJdL4u@?Bc)9_aYW00%qTl8VphK
z9kQ5Fw%Pz8hVHwgnG#L8h*y!{r#x5KoCTOoegFw~l(H;i%S?*I)d85YskKn#{fSQf
zoy102yAUx48s=12es*1_fn?ur`q9qvz;}|AY1P&#Q&21FK3QaYHEH{f55}jz;uI{b
zV7Lj(&7~UIZQp&N*~N0{GCC3*!7TPb4c)?#WFAAWv`S-oR!y!&@jqOESJO`4i@4kn
zH!)CnC0X9_nuwcptp}l;+6cM6luwRPn5LJcZV;ZPD1<^GdzYQQQi@T4uw7LMl#@;D
z<nFM{7k|vR94yt(K|i>YqG1wcD+(8arwFcBdXzTf2DMLKF7f$3())_^dda~=ThKZ#
z_f`{-B+Ta+2hwE1s|DXDxgWNT82~TSqzk%3R*2)65~z_A`bz?_m=uwiR~ngMl$oQd
zh)JgU61MkrMYZ(CL-*s?h+?MJepts(w9FG;EJGxadVAz#PQ%5qR$1m_J2(b1#a#_&
zCmSE$vW?jCjk!!k$eNiVY@be!p4=Mywhy$y1M+t@KPC#g#3JTJxnKp<?EumB<>`!0
zH`vM8!9^a0r_we8k4dGM`AM4slWZ8}8l-rBC>e!embnFLp8IamE*KxverLD6XiP?8
zW<&ynmSXBIjW?onxZ$oOG~KuRFMZBs+<i$S+5_LYeYj3By3}GM-*T-_$qK*s(hz*k
zXr9)MhO=pBX9iZ1N$Rc8s6r_^^V%8F1c6uFh#Go{jHH)H87AeQEFz~v-}9(q4sGi9
z5|gSyTFH-*RbHmUjhL38E=ix0?GVaSsQr<^f~BX{gehlP!W^BLlV9B~jeA~M!5OZf
zE3aKkf&<VkcL?Ii`~?%!buI!|9$LLPTB~>!?+Y3jFSfy~gI_Fzo>j~`Zhmy423SdX
ziX!^9?<@CLN7!CI6nzyH)8S+aUKt_w+hAeV9}n5$u*F<r32?-Q(|;Upe!X$^n*(U^
z#sa}n;EFMAYb?wT^gWp)Pj@T5e4T_>jPR^J8{JrRT}0vzTalRVz8_1i2Z%>XuF}X?
z!L5Rjpa?U}jaLXz+rWThWT<&if_IA^38{Z@A?lC^f3&1sN{?6jrd5WgTWT8SbozHO
zLAMpWYGQ_Bi**AfQa<giO+@GLjc3`SA)`0XG=H<%G@O*1O;tNsrY@5EUoRWz1C{rm
zPxvSxSr`tc{qym`3)x!HzB}ly=-B?Q?Y4L~fBYN9X>Rsz2wKh73?It`1U*5vT7+^8
z?n6=fKb+Njul<Sf*#=D<X@)By-%3!^gyjD@{Rny0l{UjiO(~#<J!N|poAKPZ2FgKH
zzcNFe+V(=%HmkA=KW26vzsFzX210m|5zb7xp)!hHEIQ@Pw|gM@c|P?FnEQ1J1e$oP
zT?qms_G{xq)fKlgenA-LHM*VNg{@jjoLm(Pd<^|vr1T_3wcd5gDr%Nn?zL}J_+2NF
zm3AJEZ?+B_E6J~4oFEr;o+F!7H!nb3L^@x#YtZGpG!qr>-HK#Yc=z!tVHNdtutl<b
zmY@FCyZ-CcE3?xGJ+qr8!~LY}M95(?yX)CV$HEcW$opo}rPvu;nXPZt<<B#qhVfWU
zlsk~0&Sm$NnyD^+*#v5pE$R@_Yj(Kt%J~*n_#&T|+`CYtdxena4HOzavM3mj-xPkR
z29$*`*|4&ToQJdg2^yxq#NyP812ZC`J$)?suV);u;VCNz;A7(bE&Qk)PsVk={7AGp
ze-_5+aOcLUzhthJaIW`6$<_AOxx6|rfPtRbX$ksb!3PpOi`GTyojV1|s|5|yq;{R`
ze8N#j9l&3-0J<L9s`qN|r#IAS+SXO`@jXM|fCU(>si}Bhwn9BqS}AqWuuM63Q0m7v
z4ASyb5vDV36JZ${cgHXPhQ1~qKSp}{2kLGpS{E@<bGZY#S{(1f4LW~UFA>}c5gPFl
zPmkIsUaK>(>*B(QJAjvn=QPp}i`Z)*NNrYCZU-&yAG{c2KH6qP2Lv@z-nW*YmJB;7
zWl4?~({F!_s1}`BERWFp9Z36P*zXb&V4)?mDKqSvc8^5Ad0tVy_mWN#*3hyql10%|
z6e;577B*o^P^>39uP}h5)I{@2wTdOFYoa9@%B3Ics*Z9ZN-$HSI_-o!tW~k9T=?iW
z$9I({ci5{<!k>U~!zIMxU5VkMgZCOt;!DroS0M|xEQ4TSVsfpRZFreo))wv9N;&@K
z|Jj7!H|cjCR7R0f$Pym35+buIoa4WGg<Dj@i17|u^+4=zon6pn@0zYDUW+c+Um3$S
z=Hw7@SyqgQ5X)eW`xlG3X8wj@psp(Lf$>d*Wskm}Qi%o$ZGXOU1gvdR%|*m|sW7J{
zkoXSYfT>Ndf$mlS&UMDfQP7Ew(_93p1ysMYQ;VuKt&S{{+aCN)EG%u|(q`Ee9zdAl
zonzx5r)J#5O>Iz47aaJ+Llzq;*Jt)Q!FWv*EdZh0{oP(fW*NTPW`<n~1v5r}*9cU*
zL3Y?%#(WnVQ}5<9rEISBX9I?hU<tT+D0n#Do(WKdJ+@68M9yI-7aFE(G54<*Bn4e{
zbRAh$)efsfjIRnd<!<%nXtGJE6~S;ul$oUIWwbBo&qYZi_MeFPAgJ2xLBDIrIRhAR
zQ%dA2<|Z5lPql!Os}o*|^m>p1d(pL7t|3w>l2IKIfAB(w96u{2tdI;bR1mRfM^uE9
zSap@#=6x#usGN%5UHIP*hc~+Sq*KH?`x-;l@ueF~`k}p#uG}lD$QER&+7V~q4^2I}
z?-BMyl5NV$nx%YiUd65ni*C*}ZPfkX=arJq2OYduJdRH)8In?(SBLdpM5_PU%cy-9
zKr<S4Uy|X2Zbib+ukYgE>Yf^(0&BQj7wPG`6hOa+-h-><hBM#KowbRP@qFDNO&u#P
zi-02F_YWhpOcL)3%JRxKSWx9R@RkY8@6%63FUW4^U8)qFhin8}*We17eP|V2B>}<d
z@r1M0QkLu%ZVZG|mu@`7<)!goqY1`FePE*t)HdrlOH|kxopYZ*3-0+-Q=$FQ;@d?e
z`H6?*CnDTfavYoAFiCQC1f`$_g(3EW;V@Ppbue$!&iYhMa(kWbRxb{EE&5Jj0X|px
zABeLbHS)C8NuOwM+)U{2TX>U4&vu9yj@UXH@`J$zTPNU57IVV{<pksJu_i9>{zgAW
zO9Hf#pKhQ2CnmbTYbk@&zxgTz%JTpfN{Du9l1lmN%2uZDqno0$80mo@Ohw)m%;8W)
zufR<hQF{%CZw@z=^UpL@(Yr|<?&;URq}RsL?-w+jR*yNa=5>MRL}c`#EKVQy_qh8B
z<LE-Ov)^17KAMnVIUmnMzRHdWeRPLJ4o^A$kTc7h6cqOli&*J2C}pL^!DfV6E`}u{
z*2QI`6y~@i<t)~@Sk0=RM~gZPt=MGW`pJFvcC2uu%;hW=;Qa9*!!|L?OmYd8>o+|L
z@%Bj_M-4xz#LFC{9Wlu>))tFoc}aTfj!$klzVh>PHGf(RSPERp4{pMhd@oeX{GRlW
z@OfezsHKX{M)LKhAM{>Y0xGf?e;)G`B^GH8w{V2ZxCX4Ez@nN$y4>Wx-=7d+K<5{r
zp5eqJc37X}`-5v!HBTfJL#+0JYBMZGoT$gS^@ghb0~v;a=SA{a*Sru$c`%v4Fc&#(
zkLwF6S%VzWxI6qU;{Rvil~&P@^KtS$>7;j5uRgSdg_nvJNsVv0w;9H344Gh9P1vg4
z{`!<%ehrwNAsQ%_guJb8RZ0P=ViylEgz#eM^CANF=I0-rbn(p0as8x-!&x5RYn$Wu
zfH~1&n6ANQO{dx>N*4?N8X`;^QxP1$N~aiqi<e;64#wIDpIce{LIzGwBkT?DbvkZt
zLF)SeNesSpT;)rtAzl$|1BVE_d+&<}{a4AiuDF1*^E;G2s(@*ZRGkQyYt(NoA#SyQ
z!NKCu928kSq2~#v$OW&*SiQyWSj$zvnMAT|0kWTrzd?Jy_YxKc+7L}|fk8tfK9O7$
zGsA(Bes|@sYAKp^gfx#58_A-vQbF*SPj5LPg<gFAM5#}*6_Qf13gYYmJVrgZp&Y8F
z-muQt4=5xYFF-DlM=d^R{605@A(A?Ia(xYqKvzYK7*F2g1$nNOYO=)321$C2|8otW
z7r$^l4PzciJQSTx)OHan_9tfCdT51z02SaDnIpoo)~#<IxDse2YYn6Q1Z`O9YWsH^
z8l|oTBDg=@ob9h~e9PgMEjm3720w?RLsx)y-s4{dDa8V$^?4R5E{1sfoYs@PxZy%G
zFhn0<G-$|`e*Y?C`Hpc*K9+foR0w%Dd7=)@`+l!RF{<=eGj2SCUQ?j`G4`H>D|_j6
zhdSU+&36h|2or>&FZ>1<Pi%ryKc@=osdla%`BLhzkovP-n+nQ5ZOwxm(nzKg2KbWy
zd*%t^&_t%!r&e^_<Q~UcwW_3J`l9daNzfyz+oCA6s#=7hL>v_hSS9BJTcS>6`i+Ox
z>IXfs)2A!1^>&)^AM+B`ib~wrFmF(kn@3$ZgCx8Y?7PY65|dFt9FKbz%$M<$bqm!j
zcIPj;<z7#_Xh<RBxH%SooloFn&fvkf|5U*LMwI?XlsU8HU7-rK$pp$!Bhjy-LrlPH
zf1U99t33iW(Y1unC~F1x;BC=MfDH{SBkgC_KUZDBX5q$nK6k*fGSqW4yzbU`6Zva6
zj+=e1eZ5=NAJFDMt#ka&dajy!KOs3q?Wt@M(Kkcw^?uY5&UmWDRCSqh?eqh;LwmZ@
z%odL)TETMTB{vwMg3Gcq?bvvm=B9?FUHdCgKtU67`3}i^8I<IHG;*(kyIPZ}he{J$
zS5Sjr&_X2uLdu_+w|A`+tx)S*{C@{y8#%VaaV{VWeiK=6Oy<=gE4GP3jN^@LKe^h_
zU9Q%HRQ$~(HCr9Oqv@bka;$c~eA^`3Z?~QHn)Fk~3Hp}rPZ;}<&%G#NwjN@Z&B?e<
zxM7B4^Kd+rgu__=%fs2B_rh1*CUnz;P7j`yKyhJWc<_RZI;*-pApF~Z-<Rn4n;SL8
zEmhwW@Bj%!Qd+#=h!Rq8lj=SVu4uihW#-pJv_^rr`S9WJRBI^mB0g>gZo834bLda(
zKw`IfP99NtT)eZpR|sh=6daCHYQKoT9tiXb^@Cb-9z!kC@}t|WX@D35&S^1rFS|A1
z!snt>SHEQ&;&OuctJ+KVXvFf^SeJ&(vH-YPxI9(!>bc|WHfUB4>V0+^cAaR4scJ)h
zhn0wtij{g0%m~JnT^^~WRs0+akvr(Iiw-K~PfymK0__swz)N<`=pLQJ#AJuME)_+d
zo-8MtpKHoi3O5qFb0Kzzm)$N{d~L-xK1+l(XopEyGkH({HTb{A+w+#G%!#mDjdLzO
z_fuqNep1+&Tl9@YvgsYTvh}o94;c&J**|~?PQCb3)cQ8csL$WaJledtBz*`!<UA9&
z#ZJY|X*)&DU#GTI4~XV!C-?!x@mS}XB=i<p1uQ)e`}2+|$%!kFyX?7*fJry0#jV&L
zRYkt^OgZJM@5%V`#mQ0B5%qUUr*{(y;m`T+Xv^M+qTMIydJH3gsQb~eO!bp?k>B=?
z;{xJUWPdT^ah&5t|G9sa-h1wkeV?+qj@n@u)*unC@Fy<)%;Rx-T#O?{6$zs8b7fgO
z>UeAfjIu=zfWXG$EAoEPR!G6*i*g0IqTx51sN+b?%!m|6z^_lL<!BRo>*abc7T$M*
zYW3{CEQc{udFhxtd*GgDM3c<wpayIGff1d<C`bib5qh@L>bPi~3Q|{5{3Wwb2`HR=
zO*5+O%vK@InKAebX<sO|{`wxmBSmWK!_^IC-U#9Zv;cJzJ5xa|s@nn28QLCthA&5I
z6ydS*F&(Gg4b0{Agk~OF4?AP)veYj8r;Q!VKz^^+)Up}HGQAM?810@HE^@4CA^?r>
zQIuEtT8xG9x^Hi03_|icSVs-OEs-t1*yMY)3x=LG%cEC7meN55jZnb9uffBE?ut#>
zZ=T5FCB!xEnL7ZBAFr}jCNtXoaMZSP<hQ!}CA>j8^ycR~3cCBP1|BCb=4KJIL5%Ji
zMOB&7Ur_+gbkRqC6n}V{!j*RY#?<2>=62udPmkU;;?h`t5VVVbfxs`xOW+RAI=Tj)
zm}bE+ij0ig4V0P<@IGR$VjDAt#oo_=2EN1R4^7+|T~Z#3&gpHdAPhfxFLUwkQ+(p@
zCnqgF|Jh}vAV(S!G#AejbN0B4Y1VJ*0757n4f*c14)wth9NMuxUc{=^#6vnM$CSL3
zhf=f^tf5HnO7%`>zM^>Y?t6rByFmnrEJKPc1OFzM)1NsN)-rwSpT7Op7-X&e<Zr+I
zW*c|ucQt*lZ$j!JxYl+34tM9@EyRa3!n#Xia2jsna68k%4UM$b+BFHU<iCT7?<-7&
z)r`oosw2$T-rljk4|h^UnU;Sj(680^#{giovaVjvh_B{fn$Q18UnG%tXGAcCJ%n74
zpRE@E@x#7DJM}@K-1q@239$kI8U<EfX4%M3Z(PjN>vt&P{3mpI-LRm<yBOE_F52=;
z!W5qkozGX?o4mLfowWFc8z3+!MZ2YzLwI*nfOrKKFRD0ZDO<6B54y7$U+$;!_6CJ@
zM99cdDE&|a`SaagY8}n7S5YNPhH^%iY_iH5KkNQp`6~1I`84#E;_uCN-_#3o5f>`~
zU}7*p(W#}3kNW0Y%kVPtB*5-!tH*hl$IC3-HlydY6OKPnww3#*x#-em+M~(OT*TX#
zd7V0-7~nRfuE`>I?NqX~E&a-=(Jc9fL%=TBZx?9`YtpphkqnvfTctSj6xlN>(iNBf
zC0s~LXcoOr!r8j4XNTBnMd03y_7fTAF8}CozUR{17xyEo6Z2*XI_cuD`~HC;v(S%w
z1ja4Y`-F0JieEN9kgla1iABn#fDz|}*O#lhm(bP>mXb9?-xN0}W4(yD+`%HdSPIz8
z!meJD)8H3|^yh3ErHe1@^tbdYAQQSxMxbNpONbW7G}o@ed^zscBGSA(`dt#TtFl(s
zswIQd%gF5&f>YO~C1lP}D*PmuW$l+KMMq6To?EI_`PjY1ewY*`(0$x;h;`&dy5EX;
zh{qV6LL-BVOu(`9ZH}8*op5j6QzqMt2Uj~dSax6jk?D}XNryz}11S0ZLmU)#o|i0$
z|Ja_%D#LO`aP9A18A^X)SU>)_uDD|GQqG^@6^eaP`SMSI96F-#jtp+)=p}gaE~ka(
z;;@LGB%Y(*k7L3yP@Ky{Ihr!7njV|0I{_*?OJ5XIC~U%#jX=zpCClneLiC1zIj5N{
zby_%4ao5nJKy-3EQEA*y`1FSr+^ef``gyv)Ej=Udt-%TAqHx{xvil&tCD%HTE*JeU
zPEBXpoR6hVi@sYW$Yri{K#>C^7JVIJ@GO4AKY|I140uz6NLyB|nVZ|%W}+vb+NZQy
zeEmJ#xBq5Vjh}wv3GLj6p(0dPM|oAe#Ei^gFjG4HOqJqZZfU<P23xdXdHo~%B*xvu
zC#Kf+AuBVoAMXXup*jbE)v{FS(dJmC)K}ftmI`$oVeaCY8^hHNa9heI(AC9%Y1B5g
za~b!a+ksLM%}?=Hnt}n6zM3a__A`z7Je+<MWnG=G{rS@+IppEKld-qfp77uV@H>aS
zQ_J8HWnMXXYE6m;fgqf5v!rf=mGExJYhf7w`nAU{BHBQ-Zm1u79<G3v@RlZ`&x`wx
zg~u;o4n#Y_MUqNsyLYE-!<0#Mxo@&M|IvGeAFAfn+*Pkve^?BrxUp3=!MK0FQ?xl-
zJJWMz%kf_Y_RfdA>tV?!mU#|q3ggp{E8&8XcgZ__lX0l~eg{W<O(_MxQ1H!?`+mk!
zcxwRKc?cY%1aR)owr;sH)h3pt89rO<iYi;tJl?F(HCJd!E+8|z&_#D3eKbdfmvP|B
zSp0eDjbu#t9nm#n=O6uCcwq!C{(8)gCtIwS!4}L?U%ie#`u^v732(Bo>2Yxq=j6~^
zA`&;W`5B&-+$w9K+>fTT=rdxE4opB1xR5=EBSuGUD;{c<JNqGK1;n1SU49hL=edJJ
zve7GL|0D97R|Q9<JBb{-r>-b!U6Xql?t^FFFC8p|_+aV50I_N~kGs_-{YEACWtW|$
zo3cNMIo<uOS5_sd>+!P2$bZi`Es~j#O{ipyA=YozC0<JAefwFY-A(QkITTn-wB%RE
zYq!qo2T2AcBlDQ`Ye$_ZN3a?&t`hOadB&!S#obE}=IR>x%fm5RDZOopCxAR(_IQQq
zFfSx?Y_CFTejZck=`O~FeqUG$C;VO+#iFx7_VN$}0hm5;pTM2XiAb_KiIO{7M{3CS
z9Pn3~;(IQ6e%LcSc8P=up-wxB8xx<!WyXJ%R0&v|sk?NoU!-3<8tbpLU|R$^15|(5
zI|d_~3X1e!j2fknX?czu*TxBRox8Y3(APsuL!__NmJ+YL8XtZ=$FC+^^n#XtP_gKd
zHh(;miSvf8sW9Dp)9i?Yx)HTQaxA>hK@`H%G6tHtOTERU!IrCPIAh~qbwF=4kn8Ir
z5gK<`NTS4KlO0yaWdu8PM0)+*x+5-Wr}(tiZ)%?&v0=r%{&H2%8s8cBYWsk$r8mt^
zXi(Pc)#rZV<mu(}YjO;^JKsO88J51<nC`y%S|=>kkDLS?=w77Sa~}SBxROOYR?t3N
zDkR~yDymvubovJdjXwe=G4{6zoyXZwzcXcc;w^cw1IZ)4{M0B^@c<TC!ZpyU(~S3c
zcl(WEjH$J{<!62S*=NxHf6t%mJQRYI!VFtJ+)bsDUkPljp~)VTO;~_WLoje@3(P@L
z{WfC@RZl#}UBkYHb1Q)xBz39++KGHhb8^ZCSps@E{WlGoh%qo<C5to3<4C`LO}nJ}
z`>|1WvO=cVv%dX4XhCbJj0@-8=&D$=S7vuFCa;I`KbMN)%$0tlR$T7%55#1?I{~M=
z48e!fYzrtg=5N}E4?E%}%Q!>cV<5|@jWyaVNOCo}+$=$P7Aek@+|W+zhI2C({&f4`
z<W?PN9aiB8My`;PI=-DJ&#luSOJSsW-Z$L_OgozhMmZOTmH^VoD?r3jRXCy^=U^xs
z-IFg}eE6cBpafhF=QwlhH}LM7*+{0u3Ar6c*z?K;e;*&{6_wWMOMJg4U>`bA-e6fH
ziAGYiwP7Xans!OONBFEZY#_7(R$ovNJ5!~-)@hs6Ho)p^bRrRFm2oO+aw(7yE1QPD
z{}#<c|Ec2W7O_Q5@1pG_b*u#slh^PFd}lJ5n2g<9bK?cODNelt7_#<gqCGU$a!@F6
z%wr=@g&7TmmY@=i^q89m1eXouYn77B!t=6JWZM|1EbCujU3kv5I&MJZ?AiA<><^X;
z28HvFQo-D8K`awX?h^i?yC%1F+NmcFc0KRETAw^jW(QS>Vxgyqa$&GtU;sKg|6k>7
zO4;N22!g%4d6U2A+|zCyJ2td5vS^pKW)%aYZb6M@sK5M)M~e`glA@NaHo7FAVo#O`
zx$4ma#;q!DB72*|K$XM4Q)QM}qq0FVHzs81XhlEzFium6c0kiLcBIrtwD@&-LQ@z=
z7X-r9__;sC8<)F)sj-pP##pxGqB0Yv9KgMWSQyhs&kYWhp&o4u{5?o0vAi)lUaQX7
zn7nbX%Zs~?{!FTI^?h>&1`~qGUqGLw!rftxkBnfjwauVF)=G=|solnw(Dw7K{r5OO
zL566M4>xm13HpnELsJ??L|D(jy!b}R^ENya8Ckz%0z=@pDW=h8KZ->eMDt3{L!rkB
zpt*G#w`JsfT1p*3(TCJ3pIg2-=0i1K&#Xj_-XquUcF3MIty=%-Hj>4w5JTRbN1dyJ
z&9ZiL13KV*E=l}5_HgwGxZLBP%yMffbSWwK<fM@0x#t-BjBlMj;RMMFW4SCA{N6Ql
z$oA=1jw5c;kJbbv%>**7N%#GpEMIs&kfD(9$UU84?c$1Oah^0ikM(4wO6)v|9Er<X
zh*N?qZ)nd01bS0C0?c*rlr-APM>H-r{>d@hEXt3VrZ)7C0bBd7XPU=O|Mb!9Xw<zf
z;WlR(_fGcjbH`7zzU+AGb+eYIsNTf8SasutDBJkFY3h}HF^Lqi6Xh&!?8rlG*SXg}
z`FCC)^*!>|wP|j5oc)@YbNWJo>64G5@lvTT`I7VA?`6N3TH4&}S@{e+0$8rtRmte=
z8nM*rZ?*%8`<XYf>_X7DHWP)4hLFCB*#5izqA})h2ejYd2i;Y}nR~$2&W15X4b$+g
zd_U)VIM&NQUr#i;aWbAfkj0ASH91=yPFQRjH4&|-ZMH4FpSiA%<!^~7zn<|rG!a-S
zE}yb31f*Z5b(8X|Q+S^`z`_4>p2fZ6dC<hvplV%m{K9Fao{-_Ua9{rEa9OR%-`!25
z9FoW}k)e4ix7xVziKd%?9Z6iJZ(NLX2>}y0J3{xKG!xc92}aO@Yc);s1juc2)1g&?
z^^f6Rb84C5nk;_kodE#Gtkw_%SlX%uO)$79-4MPYm7QPONop$wFGmjwDMIr<WBfI6
z4+s($R4{~`ZS&Nai}QXr8NX8c4;KJXi9XdEzU0O<?!64?SZdw@ic-=`$;3ma55SJ9
znzR-5Ub+2_jdBg1s1FPO8r5P1LOx-XR-!=FZxUnqL_|sc!eRtvF^Sfsd!~Syydrh6
z&qHT*chRm8<hw3|IS2!E@vLbtN3w{|<Xrvpo)4@R*aQ~D|85q`r$Ks4`kExakVaU%
z>4)pN&G>jbOFMqE2swt#@Ef4gNw3yG({w&wD4?X-Ap|{$K^k{q*aL(gR#B1P+h{Wj
zfGgZu#Ir}#^*5f+?vJj6+9FaxgQR?ExJH3m1)~o0ifSAKSS}}S7tMT>6uyc_h7qLj
zIW0m}k8e4Wd=*%nh1F<nr`j`S<;QEH`xT^~G{)Cdz1<6(3Wg+`vFPG|982$ioNTn*
zj~M@85v#LvGN*8qu5evTA!l@fn_k~--_OHMG0WF5EQ%k~wf})?GZxZ?D91d=+jLsi
z*4gebMT#}GHzMr&<Tz>?&oq{Rd<R)Z)%9iE;;sbHi}<+!+7E0duRfXws_A`<wZ426
zUV#!6rzwaeXwbNAu^1kl2HwwoEy|V)M8CE8=CA{!Ag*CQ&MT<gFr_aBbEcn&I}K4H
ztJ-xJUKwcDSS?GO0(A6)_wv}G6DU~av5f~_e+uBA)3T3!5D(c0#p0zKE+=0$0vx^j
zX_tSB4Ck(A-LpBytMv1V=Zol<O}TE<cdNw)sKBLf^H%4RpEDWS2}XzODVZ|_DTv|d
z@YC=(`g_kQJ!g_VO5G8*jRN3lB8JAazm@=HuAEJS#r8pkruUEVQu`lG(yNgGk`5qL
z>EK2Mhe(y2dT)v;8&sIe^xoU+cNV5pXIIaho&@7{S(f%mIt`V5qs56JE6kG)eGNt|
zI>J}u2J~miuxz{ox*j~S1mXT9I@X<@pfeYd<zegkzsB1jYs_m_l|o<}iW^}Rzu{)(
zQ-d*KsDW{w>^Dm^y4~#9w$W`~zJlD}R`{FiMY5e74l9Gz1WcGR7B1_z$<T|%MUJ>-
z6<&dl-Q2Y!ki+G?bOomqFQ7PNi0#i^b<MGx1C&U%*bJnn@X;(8<ENOe-ix%-0K@VW
z<45NB^w$oK1gm>rZ-@&NXF}O1AI8#4n7M;<`A54ps0`M5e~s$$&*4&;BxJro2(8eb
z?WmIXrb+L&9f{C=cx*)K|7qI&$f{&EfN4S3B2b4g1AIq-y7_xBGPU)`mp9vDyTe66
zk329}L!Rj<KP2e@ewYrfDBsvvKatGACQM@R^EJL{2asWL&K(EOk)AN#=koc!oW;y(
zr+2aQ-6v!k!WF`3u}8*Kpx48^`0EW0e{0)g@^pHWJz%0Gz4Gt>!w{J;SvZHg4a%2%
zROE32L97~rd7q(ro-0jv%ut9UqD_mh2GSI&U)?FNo`<$vm31c>kH%5WO;cW#In&6j
z%f4ZvzT{232hh(0t7GPOQjZ2%WFtSju9Fnx(Gw>52;&j{-O$1Mh~OZWydMupy&m7o
z_$Z3Rtn;Qm^6+F}y-u>wWe+jfOifZno}NMMi&eddH%l-_6=_FV!}82_L1fW4KAeKq
z?`)etpNhqEh@O{ov$T+?=uc;DT=#msNdQDe@ZO3Izn6vdeh{i|SE1oos#nhfwX7+e
zZ(7BzB6t<zArW>IR+!?jI?OlXl)x?o)VPOzo;q3#Uuqe?b9N)}3)^KI5;8Ya{A+}A
z)bO+l-%kI!_4(wF<Gn>{|1#`IRPnYpca2*;K%-t+-zWK@PT-Jzq>&lHI2urU;dI{{
zo)8;Xl2nzMzb(&vzfl$ih%*hba)-tGu%SBqWAm30OhJ}xkuX^L`98q*i`Tc229jR2
zecMr0zbO|`uX<W?0ICbVqsbuAtp~gw^XYVG0lXxw^bTm2M)fhM(KxGNH;c6-Np|Q|
z#KdGjP7*Q(87{uA)w9S4k6YMu`zOchb$0CBda9^$5#H0aTlXnH>jgdf_36ituO$Cj
z{D1bL|8Zl@NGfq5<9dVbhRoUV12p8$vc1{ZLdV*_gYaf4VHDti#zF{>B1oY!Iawym
z@Nu65A_@uI=*--0bNqwX28>e6zrpm02Nc1(s>I71*T&0AGA5MGbD{YuzTA|>L`?1>
z%Y3rM_q-K%aT|Ig(@2YW*s*R`P0Ha!A}eFAc9tm4RK%Jd?+Qkqp&pY?FMRRr9|v<$
zX%S+gW^XJR-VZ*_`hC1Lu5$VBrC}^g|Iz?LEN&VVznoEYp~y<9$t`+y^_hCNu5+?g
zGeVM)CfEbU&7xZdpN2D&zgD(Z1gkuq_OS#SsZBsiAv@xrv;bOPty%SuqUAg>ja1iY
zvkR*NpuG$F!N1bJZxdA+x82T(pjsqvNbQ2+(B_<!x?s;H)3^_OQKH}d3QR<5GHV2=
zU;SBBT9{V|<8Y<SzA8T45?{oN<L#M=Y8EUuv>D1)pOV5O;Wq21DVP8@QXh1<DhWEd
z9IoRCad9PFL+ZBwY5#nr`QfHNC=|a}Kgqds=rQT?pbF$@k)ZSe^T(_z9yhBu{BEc@
zP!UuF-L_!#py|WA$2E<MwcFhs#JVp<wdmTR{ps>nmop`z%5ITApYK%dzJdGSbn02<
zSO$-<rzxQ#_cA@|{GwIgR_KDpIilaD7(nTN-#Y*u2StH}m)E*G3ld-*wnQKNM2#-n
zY-My8?Q%g^j?PUn1_kjw4k!USIqjcP0U|;}Pq!>iC%1d^k<;wMGBCmPEm4oQ0tIMN
zLSv}%jqnfDX31^8r@HhIhB<xD+)Z;xC$QDAEb`?F(u(7@ig!?#m90i?MI42)T-S-&
zQ9r%4L$;PtU?&`|GxU3`ByYb;p_T5+M=-Fbe@vhuFRkkLZn&iRm9?@q-|kJd9Wr}h
z6W_uzYl=Y3W$x2PsZ{vCP*SdD_RsZh2d-$+;!fJ`LdF5p9L`+%?YVC+<a9FZ4WLC~
z5o0)FPlhhUC0KV2=fBpU^2g)v^W?K%>gezXdMFVxaYUrZU1ExU`$$5rpz{pw8l+tQ
z9|P>sXV0E}35pda_>D$^^@Qd*g9#r9Xa)6~I(5K6;4_DEtQAeR-AMOuN#)XJPZ2s%
z+J$kXhs1ieUbE4Obdj8Ro&DDyM^b-|2a#af#|O_WjzPt{+jhxzwbe{wUF7tM727nB
zR#(&LiwWcKrHVu^S7p-AD_4z<p_8Lh$DsL?`}41XYfU8gjH}%?@E2l{mdqsdclkz6
zfuiSSE#7iV{ABXpW>&LcN~Jftj=SwJ;Le6%Z})0dY0i7DVI(EEs(#mMzh-<eej2Jv
z2~{Pt>sg7q`N#91o5!JjbV~IuJBWLneq%9^hy{gi=q{@7)-R<893RC0q<wi62;2AL
zEQ@9I&+uv!5VM-Y)q)#A>SnDwZcpu=LG=-Z^rx}QX}v;xgvzP%56O3<093DJ^-Vrh
z)89&l+NNSIeHOI}EVjtt3;j9=f=31<rwxx@;$CwTP`dfQHwahy!o>|}5f1*U-rt;z
z_Qaw>M?w|@jc4FvY`UOOWXA?!(mlz!v12w}6pf;&^-zvG?)4`;G%rf_c7$#>XRMmJ
zgi)oUl>l<1!{*LEr;1<5A(E*4>q4hgS8g;Y(RI2}e(8n5nmE-mlb%FHXBxig2yL*Y
zw5)bqi+xqq^+k&9u*|R@YVhhqaIw!17h8iZEq_Znf{#UW%-}QGj^}ZpY8V+l1-wsd
zjVFe_@uR^4>%4a07rSu$8<5QVApR$#SMPZCRQF3W^hEx|C(<17{b{9^OYoK+X>kY{
zT`=%!CzH&oIF`4{TAB2cG5s`XHN@BwpFU?yeKDYarOis3Dju2itAz$p@X>pfFbu*f
zLAcylHl9u4qcr{yjjXlpolt+80<ycD*xaj4Wol?L@<2DLKn_21hDG+GBXCR>{PENc
zSRW!>l-pe95n)eT4R)%Ar=gP&x7`P<<Sy`j(dAr63IhK&ojKVIixV8v>0Ki^yM^+*
zhM~G`kg@H`&uknf_Bd||U>Tp-gya@n`G!ny{O7LsD!sY6o_B~L#GCfnirb$U_EY2m
zgtPGXPL$`kgDo^D@rjp7mcO6jt|Z>?+bllCH3wkx#UpQT6j7a$K5(gX&3t!IPC&bG
z5Yp@m>_PanP7FeZP3Vih#b>^ycP0>C_u=m9;&Fiw8<6I^uTJcx2y~7{F>4tiqTu({
zAC25*H}d%<{fr6k8`9!*dKQT*`s+i<+XabV*BJk8!d7e-^<PYYR>yEQ-aF}QKgXG(
z;}TeC$8SoJ{!EMca7!*F7usCfzBjO+1T*28WYX7#`RL$zR!Uf?u}Y_>>0cGpI@4hf
zb`iHwpyHgEl;uo0`f6YwB>n(s=%qMjFrJzL;H>gHov6d=ckrGsa|VJNF6C_`A59p%
zf`-^D-C($*?)v{cZmI#+&zR|y4CQO2lGfXYi!&DH3Z%$O#NQw+^=SvZXJ)B()3r{L
zV)2xBlbZz&b%TXtrK+x)C0ezxnW`MJ?vt|-Rg#9z@eZfS;4kVUZp?Y?RSj2KQh{PK
zY|K?YqVLireL*dA>C_$G+%DViolz@l%U#pZAZb3barAta=HG1$RYaS9bk+H-{eYbr
zC+Sn^#dtg}<YJ=1#gsj3-H4{?#9>FaoSqFSN#{TM57&CoxAJS8Wj>#h9KNUD+p!22
zQk0vXUs<z;sKf~?5LGY+zP>Kl{p|MP$1{ilofh>%cDl@ZLHw$d4X1||%3)2wIl~S%
zup&goF>>F%A#{*ToxDZl?++=rS)H0SDHMTK;0EqMS|ramR%SK9tyHUONeVrV1U}$G
z*kv;$OaRF-X1jzg$+aJulxvC2*+)Pzq^GKn&A_t>xba|B)4>&L(YH@rS{_@IVVTg}
z`mmW!BGJU0PQ;{HEu8L-Ol7FszXI~~o%HoT#BL$Wr)tj3RnmV0NgW}1*To~}`zf7<
zUuG(Va5ANq7NUHw634RYRo|-O9Sexhrx^~P2wS()JYE=%{E-*BS)lY@HsSMz3Ss)D
zDrq@P<`wW-<UQ$gO#-S0-;gY!7SSpbOVZH*F4d+?N9TEm=>I=WKkzoOU2BiBAI=m?
zp0@{+h0w9nm&HW1DT;N^D@rFMW3y4jqfoy4Iph#i1W$OA>f(yTwsp-~l&Zym0uNV-
zNe}B$FO7FtoR5m`(C1bev3wzc--VA9Im&nLXDc!1$$xr6gB{9CnMr^0USGcf(p?N+
zG(0+=n@~xJ=aVa|c*-<V5=X-f3oDzvKXb6qp((=UZ)B$*iYE0g2hZ_Z{`BH-rtoj1
zW%PpKx@O1o09SD!%DgzlJSQrHLMg)`zsQwI==?R0CDu7mpBU7<C!_6yh8H0VkBgpW
zVtjKrsCk%G(#@YUch5eFDJZhvX-u|Pumrr11Q_zwBFMX=W($O}rr};St6RjrUJBXg
z0y3-#R+Jk0Tvd~%&(vm~m8Tr(S}13&0*S9j=%`Z$<KCwy5h=TOr6_q4k?~i*xCZgR
z<ba@2Aop?9Kr^I-$dL3T{ih){y@r<@tKD8)Tvh*_q}{&zgYaN!Y{)@M>PL7OM&o_t
zF;VmLH29G^!?wN=!`x7jgKHg&5c!qVAydl^5Kc%qI#cSUf4RlJ0R4qrxuE%OOSsp`
zltpL*4pn7YY+_3bPoRZD9vm2`s}lwDueBWuJxLptkp}}OtzQ-*R28u%6ch!kG?r*9
z^WeVt=Jaztqx3IW6BzDLO&B1QgvI$97Gs?nK=U{Rop<P0Fw-pN>+f#8(S7aRKHCyV
zwvaFPt|4&}Mbku2Cx8_}qlO*lECZxD;_bll+M>gjE7<}tw)J6wt6c(~5K*)UNV{64
zdo^;UZ_U55K@})#Gx=ANfHy+!#{f6hivi*Whjf;Fn~j&cJTG2C+jv!xK*|1*M=v4X
zajx<go1vbWx6yDN@m|e*yj{0#e}7v#bLLWg-+qB`OQg;jLKjA;i2Ci^{^k+<a;(DI
za)trxE^a*eKyP*!nnxdk+qJf1#$`irAX`1>eiy&PWgJpdWO%gt@Z@Np2dbp&Zf<$%
zJJvoHmhnQ_Ww(tV{;y0LHxnTHMUq#|q)#(Xs!V*4|HUZNxa!K%mwEcLPLI}q;G7(@
zf)E=8_p;W;lBgcBjUJoT?EK*hlwi<`h;3o9O;dG(DG^d)a;)dr=}}>oqG-N|Htpf;
zn6Lr0<oo*|CD8r1NMG9qd<AJA=S@n{cy1BZ`FPBQNy&p)FUp<uW;-D7Ynaw^yeOdC
z#3G3lY6hjGB3nSI9Nes^U0|rPkhv{<dhpeDu&C}h`oIus*$4B+baWqJa)n5t$`MB2
z_?mpcDZYxH&gT6)3isEdCMhXF5}8WNF5*<V2rt~$NM#JpFX@&<#EvpdcGQ>Lvy^D6
zM5L#0*}eehwkT4|Ik8j^PV4jp^U2C0JGN#0u$PL~DtNg*4CS2mcsXv6*z=`i=+`+9
zZK>Cw7+qitKbN7QX=in~?^R5*0Jq5ltdX6Q^bH?itMI6m7`SfvZrA-hQ5^O?WF6ek
zaF1RCo;IJAjP&BPaDbF|6X1DVu+`K>Gca$rfw^fp4~YclJ5rSNloT?;VMigwfTI3D
z+3G+Q*kQ#a``&hlR!~zn?Hs`QpLz)4dq9PkCO3iXU$H{tQKDj#@=3Sy7h2YdRqwt>
z7tqQ;NA2-iml-qlpT$6p&-o-z&Ju=HcY(`OM_s67OU7aJG4yP04xyidtK}zZF|Fd~
z{@bc`xj@M(yUIT>YBWf?^5O^(gZPk+FcL}jHeV^W&H1K*JRKgpL1n3yS-;|^U(h(9
zHmAN40a#Wn^b7g3vYV<e{=F^JdmosVQo2QB#&LXXM|xYTw4)4V!vJ+YMWCHI$<&mC
zIDp?V%S)$+pF9ha?tlzX-)<oVrsOh|QzgUlS;|mA7C?wP2!iyoDE%cPQB8l5zf35-
zeAYrRm|qxmrLEgxnU-eMNd#NrwK@fg-@iio(i5q;wtpcPo9z`w&A$NckQL1$$WU5l
z|A~70!|E;ROzJPu%@0sH{gbkov|bSqpLX8q(~*~b;<ayBy{NTi2{tvKHll%O*7rBq
zbf&(}tS?1E#*fA^LP?s6!tDyZ`ctAn4}4OKpUnAa->xB;%_;h^b*YwMjEcvHQiytR
z;in@bf{c9mZ;(y%GD5Fycvg!>p%*>$%CRM8ix2rBCjzK_(~=YEEDuuFb;Y~SPKYVW
z7P)Jr>wOEDYK4K{m}JUm@jlc<bk#}NB4fe3cnLISxz^0GCW`@%lSyrY=I1CS)6W1O
zLp<D!A}ITz7|E`{S5AK-O{vwj^D>e|Gse+&1M87L|9KPuR@j;^W87p*=J{uUIxY^X
zg8vz8ehEB8Llz`oX}t_5Y%FFm;C(;SQs1{s&7D*nK0Sk*zd2#vvsqjOEf1LFIub$%
z3N!iCJgZe1nPzr)q_V=2UYRolHSXpIwvQ!iGZ#j%Xtb`L16lC8hFGKe8AhrUQRKV-
zT+{y$*6&%J(^ovB0rnlfge;CF3_FDfIpw#~Td@kAzEjhW(m)ps0-%ijrV@WPjLv<R
zc{I<0kPxlLOZ^!FURNUwoNitVF2av;_+F16u4dQQ9khI{eV@msoR7@$h6<vc&d{=6
z#7TQ@!*bsU*(4cC+#MJHH`ioh$czxQA=Db~rG7yVaU@TtWQG+mlu>32@jUYybi>`{
zKiZ@U_67JSEuj%9V327!4#(e5Oz6SqB7<*#(r!Q~xAp)?()u8kV6eDI#&7?!SqONb
zMqNPzWWB(;0$m(8)VFu^Tv`KbkGs;BMt2m4NpD_9zHl0T;R&m9r=QCQA-m=)$f0tC
z&B&Undhs0Dxoib^p36);Pmx=-`4Rl%VZ22-aa>@C2YzJNKx~5h<;zgx-SUF$xfj8k
zGxTa4jRC&ND(aiZF&Qs4%YdtaPgoZob1f)REu1IKNaAM*ar<7z8izbVUX|vJ=Nc_z
z&*!$7J1OU@p^oPFxP*WE{T-w<na6s)-@h}N>Y+BOn)A@b8Tk!dI#?px{`|iB+F-$T
z<17#ANQFU?1lPA1$oi$D(-@n>qocS3BVQZ}v79o?e&$S;4V6-<U+X@XFU0gA2hA_g
zbIpk>zy3cF?kV+;lc_oP?r8=Kr0IMCeMPSE?Dw0-&-LO}<mWXKEQ`!SZ5ktEL(F#t
zTklpn^#SQZ<ToEkJpJO0fdmk~XYN0JiRF*>);L=3`oC$hy6ayeCXf71zk-N07Jr%S
zzodB@FV8iRH?ve$_ISqcZXri9E!=bFu40ey;YVy7q2+kh6r|COD}Dj!W~t2R-ii{3
z6i@hqREQ=y{~?~dp!^{WX5C#paWG$za*bs`JPX~@arbyliBaL7r~f=2An*M_y3sLH
z)E4sBS<9RF$fn!lZmwrly}P69bt8XtIp$G{5AtSF@JJS>Wom|s+^rxN7h)zqFB+w(
zypR!kgi6O++MOT}$Do=)!#*hA{Cr|=!AJRCM&f_r$dsz+-snQ4)~$mNo}RBxyKLhT
zxuQ4uNrMh<Xw<*3O(s9=pn8(`I2Q0H*E(sbk#z~hKm*+qcBCzMdnH$^M7|@6rR4sx
zcu*LAu750!_kRx@Fv>S>(C4hf(Csb$F1UGcGY!$EXPET8ZoFGGf8BqZ1YS@Jo~s7}
zyOFnF!#9d|W&9eiab83vbD|+6DLxp{MzIS03sK8jgyzUn;nJ*s5vrt#*F}bU;Z<S%
z&bP<&!aGux`!YV5$PuH3B!5xtVeAJP8hPZmZX%4a3xU}`F%NtnP7~Yvxi+z<5uN{`
zT4Li>w%VtTFgPC{OadAL3*8A88i4h+UlF-+*}@rowaz2bzi$<)j#hc_++pE153)~x
z@HXpTg+B{H7q<R+LC;BR0KODb<3Wt_(aQXCE5*;X5Ht4+M&tRgLYik|ibP;1;5}fa
zvk(_=3izh&K%L7z!F$T7yHx8k`^Ps0|Lfh9gwe`PsP<%5zenVJ#Xq080nEu{Sl`8J
zAYV_Q1e|qz8)9PNF*L$_J>^0Wq50$MMntnbz8A7I^h~gcFIr@tv0gl*AUv!Z!R}?d
z&`&`Exxd5dLGr(=H%s*r@mz=3<@sE5lUfpy>*C7CeK7JkwD9{`h1f#l{ipwXrA6Qb
zNo&SuEh~0DMe0sF7&MK8*`$2d3;^;59-izBWB{(>?;kD#Yw;5O<p$Pwe{E!+%t56q
zNjtX7C#dD#pDk=B%Ns>=mlQ^6@Nke-BcV$U%0pCpOd`I%yTJp5K<A*4TK=z(CRNGS
z|5X$C&u-I%Uoflwz{gfbG6bLY1>F0$wjps^yspg90gW}gflAw7+GIk%yhXZhHr9P!
z$cVUzTqKwh5FhE~)v|9~Y=E(Ol<&iqc=we7wvm4uEls#Fpm+mZz-Uzms9$-&FukD*
zC`u#?Dl0xTO46gL_gb%z<IxQL?-OysD;C%exjVp$8ye)jE$@I;3!A$DV-EHzC_+Yl
z^fE9#ROcBbS=C$qN%uJMcbdrRWR+b|Q1DN+xC`roLx~p;&-En8kmBd7pj{nNI|EA?
z%t|&)iM8i<vfR9Uo=aq-m--_uj}j1eh<g#Gm62UEsJ-atVs0gQ`^JBt8bvsx5#jf&
zoUl6MczSeSe!(yX0s2pk%(|7mh!)JJ6%__jujPTy0xk-S#QLud^<xiS<|$o8`wP2e
zsE!<rR#rGWV+lv2Qr7r?=rKpS-bLsYJ;oyYccmC5lVb{{$bL<#vLADvf4l|E%6DRK
zamdW=DSiHlfVi22K$%foPc+MWA8y+$_anb%{lRy#SMh(Lm-EQsp=mFr9nhS7vgOh@
z(dS~J?f$v?D(F7A4Zu^fd0hz~6dEl*a!h9bt|+oZR(y~fK2w=sgFmU>r*|3{x$iV>
zzI4%r4M;2fdoGx~?l+NhXKVda+2BTbvgC1?$u;$-lAo_nny<*+{q55CUcBL=KadI=
z(!t`Y<%~G7D``7=K)-o_h^s8wr2ESJhYMhlc`FzyRpdG!Xz)0DNvwJmJ{7jV%Z`u&
z8U^<G%MKz-RVGm3Pp%aj7vk;s_?fe+<MLwdHTj(gJks-Dc^)rv%kV}+4@bT9Eg|@c
zmmj+2EIHQ~|3N>>ES&Q|mPIL&ov$XZn(`(dbm3rcLY!cn{87tTrZTl1!o$CW!`nDI
zY>+9LsTSG@HLW~?bgxLub2;yrW{M_1M>{>?se8O#FZW9;>v^f}c^u>da3jq)+nl3U
zOpIc}&vrYj%b_vz{Pa@Rp*LZ>aEfu??ic70M?M3kHJa!wd5hPZ$CD+Z9_LGnd&Mi+
zgvouQMO662t(74I;9U$=R36y5X294A@|YWgb);Rh2<Yo5oc+Lm=pC)^+*S(dI%^Y*
z_D(OE5MV}#HJ-iqsr*b3pnKjTJ3&5^2abgiI;NYs5y@@ze^$s(guw6ytLts;Th|DY
z>--}o`dBwL5exp)*z<eEOB}CI&}8djWNVhJni9{9YY87X=Au1YnXaqh`{{$<36Qci
z0$qx?=&yc3jO&f#tjgPNFTi1nd8eIpfF0DcS~^d>9~zq@)@w$DyayVRGW<w;&X<-c
zMnV3vq*nVDLV$!sN<k#9i}CQ}PWfgHKQqJ1KUt_8WozBL8HU`O=i#sw?P6SO<cybA
z%-^MFy0x#1_<p_TK)hM<hyflx1b~M<1C$N=?kolr<06t{=VNE%hX=nxy1-&ZH)_v}
zMgaY|;m$D{#pHBx3DGSE<h{{_a^b6r0E5-+F8x1mMDHbLEjVskj^!?$^Rcg`sSvp%
zBm6x*F2xbYy+D)WINz&KY&8Ao%REori8G8=jIN>p>CAwQKy?0wg>LXKQQCdI&JzQL
zF<*JiVA!M?G_H+}dlIeSD!NkwC0ofi5{p~@K`^HwI8bh-1LNplpFGPpHnCgiWbcaQ
z>cIMmG+z&OqS3qx(F6QlUh-X+`{0bcwK>v37?1fLONJqq6q}%pWSkH~u;`RJ^XMRG
z8WwHEZ?@4sM1IIX`9tWc?EUNq_Dj74;tGEMJh`A@fQc&kMQiPJkWoj7Wy0_H+Z2fe
zt4eRL-JX3IJ>g>%NqRI|^sEc!#JYgpr&}0zdU6y8fLBMU<yi5DpAltNSlLkq?oP;^
zC}IBU^!RTEGC?=;5Fi)OgAocF|AcbHLQiLP^e?;;JyrIl_3j0)-$<!7RrPqPHPz4U
zAoZTZfL!B0i;0WoJLyc2vO4_CZtds3jo7`g`{$3%ziGe)uR&z~KJ+c^uBuMKMI^A=
zFZZ|9f-uCeBS44Tb_jd{8oIII^)qNY2+C~!%P>@rl#F$UtoB=QL3INDb}Q=y85u_A
z!XTvI>vynKKZP-nPOyJ6hT$I8??KWpfC$S8I9>sDZ`h_0;47@Rx|9ANYwsOT^&38n
zM@OMylT`MI3aMnDjFilfG9oFIk(F#m6v`-+O)4`QR>-KVipWT|keQXepX)y7D1E*?
z&+mC&uirntPR@DX@9Vzyy06<~#T;@=!J7RL$cmygf&TmWIC?s4TS)f@PJ4uC2Ix*q
z^15c8e-JZuEiHj5c+_L_^b@9lf}gMM*r>Uk;o;E5&&~kPHVV5VcTa$NF{IjKA|KvA
zRczWGsz1ncY>Ge}JTcM7`*J1MH^|kTxBd0VBm`URzFY+p8e&A}plcxl4sw<{9ce0t
zXs70lIHW^jHHI>v={&5lUC}rJ5+z}W*ljuT!FE+|VzE#kUjQPup~~B3X#EJhcCtGP
z-GK89Y^5rt(6XKjV~&mDHsyq9D4bVTQlzNk6E-C3aE5lv*GY-J?a<KLm;5-;e<$O#
z$Z;6uRL&Fq9oZ&dESPxASuK@4Bu$Nq_ZVmwUkmP`S>&?b?njPy7E|To!;=*?@&YA&
zhPqocW9A?0AawoSDjN&Nf`c{o{VU<w6%EzMLm#FFzHnNP)6NbEeTFj6u<H>xmm^|a
z>We**+L@xs+s~Sv3-+GLA%WtG1)7Uxlq{_<(FVtL&OEAupsVYr%3V0WqQwN7F$Jjz
z*~V;bT~HUCMR|}ZmdpK;XJ=aR!xS<dIp=$r=FIj%rZpiuW1-0<5NuQ{I$~x~IQ{kW
zu6YAb+0BY&TC{vTqsIm!KALlzA;>Rr0--JV%#&L%2}c)i7;L&Wn=(`NMTVfr6xg06
zD^8$>OPf!c?7MaJMdW3E@0GCaUleS=E@&@7Gj1StKa@>A-<*IXdw7mz>(}KanC~J4
zGkOfU%wVoyO31r3r^&v{V2+MJ3DXbCA_3>1IblmqeSxF(Bot$Hhb@YiX95Szpr!eU
zZ5jT$O)z5{TMM9u5Mb<hr&NwM_gnd{2H#TIoi1?Hlp~-sUGiEN)(%d=C={Jpf=s6v
zug}BcXRki;_d#whV{rl$QB{O#Y=q7jWV=Od&1iY82SMp`dcw@EFpi2cvO!MI*Sv0a
z$4k-1x_O}4#j1OXX$vmdm=*H15x#E55qrc5Mq~%YfMde`^ar9C5C8qGeJ}OmLK1{k
z_ZZ+6JRy=m%rIvN2LfLe(nH_Fb|lGt$JC)MLiWuM=z|aeN6+n?XFBNkz{e`9E6bC0
zW+!iO9t$o$VZ!<HPMay*tsq!zfBJ3=49XNsU}}_r;z~8-0sHlTK^yDQK<V=rltW4O
zb_icAm$&7C!`?x=E@-AGknS6K%XcX}m(T4RSU1gC&ekITy3xIbXKQa9-mEmDkYYfp
zD?vL;FW|8}t@D!KmEblURt0_!P!oE;Xgzd5Y6Fwf_a8sJ3#ifveG+x5Y<D$-$_F~N
zWWR;XMDV)*#p>}7fbuCnn)gfQKsivXruiw@_dZCge=1N<h?Cnc_T=XauASn2;TMNm
z8dpy#_X^t9q9Quwj11IqOx4~A$cw{anq9NeK&TE-=x?11ciA2~@|3I<3io$0C%|lo
z7_)2K%e@O-`9C4!`A7`<YvRqW94N~Meu#p}i|xo+cCr*OXzKkCQjKP&&7Om>vKrbP
z4QE$v$L`$;(&JH$HgldH(mte`HwQDgq!orW*xfLcBNcg%+4t)#^o98%gQx{#&f}Es
zJ#Ck#_2r-W!bZKsWy?=NiI;3wOoFd7ocnyZt4Z8jY9H@kXn=7+Y(_5bJ!RJ7eHjgW
zz`uG|hmzn#^;i3!-@ddw+hq%f(s&@H6n(lN`@ATq^86_s+YfnA&#@gM^9%Iy0+TLN
z1<s@~*PKRV8Y;G;>*l@=*{(E$9#DaC-L}#XAD@J2x|40z4ITIq9i3n2z)jFSMD`WT
zfLZSou8Gemt%;7^^C*W0H?_glaYm2?ev=>f=|iQ!CR-on)NvSQEavq{NTJ6GvPm3y
zv_9q!#T&UjUnk`+4emxB%}(Ue^gN@nE2IeJn9@kHq7-~O`D?|aaAN{djbkY=Ljprz
z#@@5tmN;z_Cd95!^wvRk2#(<<;C*Ay==cNYT_+UK<d|NCtQ!;E3Eg1;a0`tOpF?ix
zUZphcxP&RaIJcPy^OmfXmwA+XzrwkVu!b`5x1w!edQ_TNQ(6u9u~GYT+v-|IuRbrl
z4B4iO%Rf;ugSL-ez^A1)+m<p?%H?_>`2cdqq28W5ng(sIf({PL(Sb+1$?Z`71eunT
zpBSD>LW{U+Txbhhe5ihhn^>VoolHxJ-IhhWY~iN6q-ghIu=~BdDlJOfxYK3QJD=%{
z_pCVbvs0s%Ah2^@p6;gRni;`D6$K<lc0*tKI|4{oOPY_KC_BxDKi~daF|zewy!2UP
zo%$knReZgBG1c$8z_unhr+iO@(eFG|npn+UQSJ&R!NzF!ax(4qJI!SrA!e881PQik
zluoc*Ma4CgEr9~t17LA$+o2mi`|o*)l*8QHcU$*J9vyoHiJBcxizxY9Dy~ss=ZBZz
z0#Fe+!+Q`_dfs01e|nfj<nmoH=)0S23GS%{pE0_BPp?W+WA;^@Noa2TY9P?h3hY;B
zyBh|X297IQDWKR_H4cR}(4C;AmDQlOgJfj*WLwXE=i*)&?}|MJ?|1^QA!KlQgS2=n
zV=idLc?wKva2>PSRF)t=u6dU_4&`b*v09`T44m6ErQJ1x76dNO4pX}8-_B1KKFx|3
zsCB+jH?NV)$!C1j^QAIOVLvBegw{crj8HS&z1HaRB!j!F8T8-eX-8ByxeRV-mHDZ`
z!j>G%DBf-x1)fv;v$f85+Gd}F!AP8Cy=TrCy3g+Mb615?vn4a=1Ql=gLqoIP!Y-KN
zdKdyo{kP%xiDV$p=Z2p_FwzIzOQMkSj-H-`(c3M2BM)-7!U0Ltu=P2Js~2vKG;XiB
zQp5P%{eW;2cuZprU{Li?A<nj?9V$UpFppM)*6@JRCY-U0y;UeATZIkf(cly|KU5yJ
zIpdo$g!#yh*D6|W{1kW$@l$v+Ok*hQ-W^?j?ZqZ9m7>pq*@m9c#!R_TRMcy!Fu%oz
z%UFMMOkT*O?NNo0vs_w!$9f<12)0P<fFX{(xS>cmO3c=u&n$BJY`y37a8nIBav<jL
z!$bt)I^=}EDSH28TPE2)$4B5H9fr<IE2yemwvgq>?MBWp{82j?Cz^g%z09gTKjK3O
zv<E+M3D)=a`Uu4@GhG-3HaP*4TrBsvjvwncT{)zjbDeAS-bK5#GwDOp4{oHunPR6#
znB$;`k|vdtpUE0|L7^<zVKGP?uHaD4CS0LhU4t|uDrjYnzp=Su8}(Z?g(UxSq;{V?
zz@9x8v}p&|5Jl^)jNH7XCyaY=1Qs%;m=k@m3hA!WawQ{zY~ix#Uigt`F{Q}uCbz7Y
zE-0U?m(1gYVQERqFpox-N?rAk#7LHA?9CczRxtkx-3o86L(}~Q9+k+@$w|0mr5u@&
zGhe=d2YgOFn2tWV4mScuS&Y>Kl2~Xa?zUa6u#adhoeLi!Ua^AnynD`OL!jCRzG}An
z0bDKkWJVAtG7tB|RFDsg&mbFy6qD*QOtfr4)`;xAE40Q}G3r^|)2SF+c}%G^%gw~2
z<V^0(Ff&k~yWIjmL={AY9TGNYS;6Iuk2Yp?xYNZ;>Y9oOo-9myflRFIv^B*YfdvX#
zdCH8PdXii5t=5#}-;TVrp*``ol!k+#lp?XUr%c>8CpqiJbU{RawKuv)kT#AlOcP21
zl-o~#7H?^Nz!@YX720+YGLun`g{93b7B7y)?ihejPB{N~-25j<{hFSEiK&$(fE6cB
z4LD_81vTH?oJ*$+wV%~PCauM${w3u6&y_7f=P6EwPbIw0r}?7?oY;6Xg(CPUCpZ`#
zJ%qx%ivyPZaQgVesj?sL?%>7ke7g9HakSr9B<5Modw-YkCX;Fwe1iVOL5NjP#_l#g
zC5f|r`pb6n5W~`dX>`WPC0aHtGO&+8KBki2%6dh=7-1WtkUFe|ve+;jhmO3Z^sBj6
ztj))*QGG;BB2C)zJj-v#waJsSr{M-%wW?0IEw$v>)X(PnT@DQzfhR7*`36zi{z?*4
z<Yf2UDLv?ZA*l5ml#4B)oZtN>Ck{MEG&wRl?v(lydtLvExCcrMoi}>6Wj3DCQ@CYm
zH+~L0r`;N-L30ncgnar8gC#;A9;r%?FBbKf|G~uB&!U${oO}~Kd9NB*aXtNXRG!Ph
z(SD`J+`={B)J5BU0OMu8$9aIg-_)m6Zfz_Rsndsf#qHRRSI`d|1%CeCgBK6@KE$5A
z5R@R>MmN@0%YycuMF+v=3^UhCDK%eiLC~gKUA)5er`V8SaM0C`k^N9>Dt`au#xv`o
z{f=^;Q+oH3{I}q-6Mq*&S*TTNRdhGroZ!!}se5|5@O_!t_wi2e)Pa~68HZ`V)7w6t
zpba(~KowJYzLh#CHQ9le(6@B9dxxJ1R~9rRcsRjK9Tk}I!zs4{ExOe(yZs(hZh3-x
zuNMSnW0=G*r*DIJpz#{H^|J$C9#_MWGK+v7)UJ#M%^UCwKkcL(7Ju?`1T|m0J~@GI
zFh%2Ns)rT3w^MkzBQ*lE?ym0g`Q{>M*8J+J6Ka%d39d{9HFHnQ?wBOzoxjVRbfBs9
z04n2P(@{ggRgsT3Xyw?3?zF$aF)20D=4>NmT(FnOp@6P2R)&$f;C*aKHMvT%jvp^J
zp{m$Gl9h*k3vNEIlk5}q(eV<HJiaiHrps-vO5D23!|0~087OSO!qh>CB?3LU6%!SP
z(8n&9CKse{-m8kgo&mg*gJby%<c#n9IV^5^pt}e_TgFV@tGj{*Iet|7EVkA+6+LAF
zUd(k>DL-qNoYHgu=qINWpHB*;XWfSxrO?21wpAlF6T&9Oksg?rECihix~hezeY6xL
z9?KSz%b$Bcou@Ozz{Nz+jzh>*Tc$tgnMKX#C@$#O_Il~bB{FI_;jVrMlR)n5ReDr|
z?9r2s)UvSBaKr}<+eumzkEG;XF?DCAASjO@Y;ZyBaj#llx5*S=m{7hK=0U{O9*|nm
z$N;m5ZvR9x*puyEPBdvOqPoq-ESNNi4pi*a8{*SG@W3{e+9q$iB}@gxrIcDUUD=1`
zzk*w1&`|-=OOhBwC)IG3IcY&5WFP{(Ro!tlFm%jF8vG19=q8re<?`=`(eA=v@f|Z<
zK#8C<Tx`>EdetMhAAuuO8x8kf+%tY``-A+oi_2)N!J}B}87-?lX3U8AXVCfEkIEvS
zOKE%;WJN!2xMuGwdnX@B#5+3Qg?3B2p4SQSLffVM?~YB4Ps*v_iJUA=@*Ji++-i)Y
z;(%Ww)V#)69B>&meREB<Q#?Y(qbLv5cr|z;J9Vk+UtUy1jR_H`Hf-~df*&0DEXvEh
zz7WHc_aB@VOxo$eFC^9{&ro)wC;W(IB<joNo2UcE?k5lHeMhGSQ>-nZRnio8CVe#w
z&1{2k`q1+L=%Q|z8fyG>Qt>*3h_`gvP+fP|JxHJx<}cWiT!92!$23S_tT{BbCtue#
z?<&FdK{X*YtrLcdKuY;rZIk9pTg<!snb(Uhy4GuE#SO5MLM4^qgJq_8ePm|gX6*yk
z0%?Qqzcg8$QO}x51UAJh7FwXoX!>3^-C_5N-S?cU9G}5;0nekxZTp9g0cK%5Vz6ux
z7|OhPK4Wj`57D`@8p~jfkDF)qCZj?8N8hpdIiiwR<RSC!qiEJhIdmQ$A@dRHkZy>T
zX%CZLh2Zq=X`MU;<G8msH}`w<9{Yq+mx94c)`v|hgRvT)!72XkYy0j=?rApgA#O86
zQYsH7|4w$J)94~R#V5?*G+v@!u9d1=nH?Mn(AynQdTv@(e2|NiBrF}>Z_pPkw>@QX
z02GP{Ohmj}Y6f@<WE+5XPMeI76T^^}2x{0A_URPcvlq9=zW%aJBFsY;+JvJ@gCrw&
z7sC>tyu1uW4BtEG_P^qf)jAf`X;2|BKymTY@H@_tGb;NzH=)cka@y8noG%C~T_q0(
zQtoiSM)!E{0_eruPQLjC0VebuL!@&MhF1kLAF&9ZoU}~}(0yTqi-08VJbbSVQ_fXT
zkcy6rqVBW%N8vz$#AUdSAdvbBV9+kwjqCUVpn4J`4fEz~PCGi?sS7DYV~GlCA-y-M
z|2S0t!bj-p9+*mNFn|NS<aQ^|?|!;?A390mpzrVPNtxq*aU0&n&PCL7;tAOcnacP6
zp5RK#yG`tJFdL=NTW)oT^KIEQjrYR*VB3Mvk+EZ1O3YzM%k=eyb|<Z-)=fEY%ysvd
z5@vD#-hTXba(cVTV*{EJcDo(8@E2=CV#@YF$YM3j`ZTpv@}d)~I^h{o4(%Ezs_1(w
zHAjRGoO@Mz2(H#`xnPpvLE)nHZr6b3?vCdsR(^!?;7in%%yLEC3p-Dt)$&7;W>G<=
zBQDHCJ_}VM;$LABD4p9jJ6cEH<1^kfc2kCGCM-rz7Ztuhet**jH-LHfS|d1LK)mUY
zp|I=RH9ORFz4bewN(jhg6?vOsM}bP$;|2rdCaMHJJf^@)-RDwQu(=XfG;lW$g@H(U
z04%W#huI+~ZsY5Yc^CDde0f!C0|!KTP`zxcKdd1y*7DkZ)z3C&wm@UG`n_x7gPk{t
z`Or0Bf^|uqXBsCn3!d*dM6K_3TB7V8sm{lANI(>fMuPe5yvrhb;53VQss-H~B>Uh0
zgaZKPJ2$;N<B3Wci=GVO8zS)@4+*Nn?%PUk;)#t;VEt~d>6*71LsGgGTJ=wm<4$9w
ztOgcF`Ew!$Sre~*x+9?`%edjf8W3y9!G6F{FNEIBNIc*QDg|{gpftDzl3p|{s~+0_
zB6mJ^6jBn-fpX)IyHq%}YE{QCI0cn5pBG7dr}^;<Vk1z5ETSHH6RS&J%Dq5CRp<Kj
zk~SEfI)(U9d%bC9cH`KE+8*6M6d@4j#1;~7Y4eDbk7Z&oxF3IX0yQ06u8!2sc#O$<
zb7R6d6P-wxKM13->Y?WwBd<sfjq2a%n9LUlska5x`DEy&PlembL7g<9_>Fp{p|<-1
zA2nQa;nwDqaH~I&y}=>6)K4jB;WwueWmbI)=3Qcv&FuWt($CS^#&>hfa@WZ#-zus~
zTRA}=sB%rLROUvu_nZpfLcV`69YBwtR)WNs5^vxOc4{WvYXzCnTb_&7PYWkj_GQ@A
zL%mepfYip{=~`dPsD3@6=(7&xTu~cj<4kI(zBI;R3$Bk;f$yi^REv`Ga9xc4>Me8j
zmza0%&+`^<?b@LHQ;T*7a51h5Bp@VjuW1Q&vJCEuuX|gv&2q}>(ic+9{o`yIT6*Vo
z_V{fU(+<xH!~WHJiIh^X3F!q*N2l3|Yttux-KC8>WdyzZhhf?^c6Ttu(chpuVfTjA
zW2741Qohbtu8|0FAqpMF!XCFNbI5sOX~|zF#DVv_bGEqMqk<-GR-tf(r9bXya{XpZ
z0v)!BE{0iW-3Eyt@5{$)Jqju>xe&!w&Gqs%29BVaWWenwyVxfCYeeB9R?^eksL7*l
zZOM(@8E$b4F(cbXh=@&$?lW^>Qcsq+MHp`T5+q?&D$}S>G8J=)bo71Zjsb~y+5{PW
zr&n*@5$kDM_dQ)+X%}A=b22<xmxY34AnXZ29F*Mx8kSq%EssqF##UbSD?tai!8kmL
zW)EN%Dc=QSSF;aQRRxgc##X6_ZA2|M!4#-xM7Vk@E$Igi^=f*<BeXeLh8738u2xg4
zyrQ^a;6(k4%O%5?cu<M@mZt}&KJU4gNmO!y;sm~evQKR%mBuWFn%2~@kjt`nZ9J#y
zS!fy)1_wb|H@Q^P8~bgHqs{rO68@>+Gw|Lzal6uOxSgEpBlL#Hj@qoIy!0u*Hl(nD
zU9E83X3gR<U@VgKr$r}3Y9vGa=O}22W(PM!d(FU&UV)z1uvEG+CimHF5nQH`*=GCc
z$AudZXeq;JkIX>ci5^}uo$e&x4M?qH!S#p1tu#C;kf8p-tG+jv)pDnNvkCc5X43dy
zHDdeE#CaN4(m7~md?SC8aW1t0Qb$J_+lih8bcaF7^4*@zaYLWC6ym*3XmnEA@KR0J
z^gxu&EDWP<gBfdLH)UYj0)9@T3L6D$c7eyB_yor1j;7i>-uvTt^~{E1y?gg;pch8n
zlJtk|q750@6*Z56MKYN%`VD^UUbn71e_}CfXigJUP+mB^i(M_JY1d=Ca>tp_)pl<3
zSY7O#^pbB2`RT+dPEanOfo@PuC^XyE7kRB*P-EXO64P-s35t?X{0kTHM1$yI;Lg6n
zo2f4=ZH7`_Qd~NUCxkjJz`}i@hXZ)g5A%_G>C3mJ&yu4z+NBdL<mrts4J)!RuI#v9
z85MqCjcq#sbf!JK>XzQGCNdg)Hf96H0%I{M<?C`*fWQurI`+b*1h$WMysv#E*DR4%
zT3VYJ5{PbHNzdKmN4wSw!1)CxX-`6~S1wTr(^jamT=30ZQdLSjk}IDuh5j2TSHIn2
zs7JzvjxT9k@#SLu7N-hLE0W+`nK6$74z|^Po6M)hL)7CL=AvFo&!ol%+UV>Sm^reB
zsw4`D(DADnr96F*I!tP#_SC!{bqnCdWkqURQ_SOUcU)K9eO6e%Pz|T-!l3z$rkz*e
zu0hn?29`3`MqVa#4uYmZ4S8SZ?kFZ_bO3S#t`~JAMdz{60?saz{$O0;bA3LiNOcn`
zeh$ZftT=Kj>~I%llgA1RCLeUr*)|c#BF+(eP=%wKs3%S<l?<=+7jc~$R3}GWfA~d1
zfw0_9$=%;9+KqbHsqg)9hrMqc6(>YsDrP1Mh2i{7MVre^{+;eRPI21lF1FZ0hj;j7
zJ+^BVyxa?*z(}cIP$zrORph+w8f$Tp-TTq}DPPC{1F6$RS!#M8<E;E6UjlWLpLBg<
zeAMaYfBU=H<O$3FS|@-9%6f45!irhpqVf4-nI6T@#2-J=A#Z?C;hJF8O0EmrJ-4bB
zLzcVL1P1{*D}i*`2)7M1$)R%<BPC;{7v)%QQzxI5Jbx@W-75V6v!F}UqYt~Uwt4Mt
z<q30Kz9yS~Mu)uzk5PgK!{Y%Q@+eQ8ak}GN)DIqX#z>rUIe&p!gNjsxTprRNmZz;1
z*p&_N5LfO5&OH}pQw~nNR;Z`S(&O!)^1i5BTs-DxE|Ng&cHGBQV%dZ)-rF))Iw{3M
za*1@e=ys2Gu9x=AY#kH@SCUct=Q=MIeZ;2IC95et=d-XFtt^`vO2|?(Ab%3!Df;8N
zKpln19)0I`>Ita53ZCpL23_)k7JV)~Q8Kndn?$E}_x+Q!VDN<xUyxv-<!B`G+L~P8
zU*;Kv)%Rj$xgn3;SFw+J+}~-cQ-kV6cU_2&_PZSfB}EGhYkBco;*(#SNuF;Bjy#6Z
zq6!x_9O^8wDVR+ZNekMdQT6_j#Q%UFX>@1O4*T;fg*jKY9XNq0Ez-!o?CpMshL5}P
zL9Pl$s*HRRwMD}Q*x>3*>F-B1wxE4?lh3Is+^ARK7KV!G2F6ydy)$__$&V-K)8f2^
z=}@om|3p^FDh;3Z_+vLV1A#|gD<dn#x`9LIPg_V^WYj#lDtJ7g=F_qGqR*06TQlM}
zKm27|MQKSy_X?UjaR%Rje3ynV=X)0eC4;QyvzX+wrx-rXDTof`HsmCfE4%$`dmC_5
zbk`KNJ$<sXPC9hu%bu{A#<oI!$DrzVxwMA!kp)z@Ht;PE3w@@!;UhVe8=5^<KHRZ!
zVHJF+a<Q!oL~b*I%SGeXOyECuh6EB}$QPS(K*q|YgI?eLdw|Y>=1CfA^OCcj9O|m<
zL8s^)_k1NZJ!0dX5y^t2q=;7GU)VyVA%&h2Rv$foPBr%LK}<M9Oz-mQ>4og?3L8Sb
z6qwjcmB^hM3Y%hX+w0Nrc&8E-10~Aou)jEj&R6}1U5RkwId_EiTG<iX?i6eO15tB|
zy<Vrkv@V=+!(g-XI{3M}ChBCXG^4KG*1(r$v6g5}y~z*wxgqOUg;LGR^m7kp)uzj?
zG(xUxZ%kE+kABCxtyJLG7mot-;-xsJ&AT6Ns0?6NR@ROQ^>!SM5fs&0ex~oWe|5b5
zklw;PsD3)t+XQn5;Qi@%Cki?7omV_PkF27;!2Y|v5G1DJ?g`_M)Z(|U&Vtc@c?;8Q
zwfDB?d%+E-ThG4@Sm_+YXql4)7poNuePGnItg;>D^YWHR)eg)PEZ?dVYwSI@i=a`1
z4<HZYu%Z0_#M=wn;MWza_q#yDQz#E_z|YDdBHM&|d}Uj|%IwpDkNZg{wz##e&Pt4z
zvTv{*yzO_=;bvV}nbe(?`Cl`Jom!Sp@0JO4f0=&m^g#CBRU*j;(SW?&lk%Z|2|c0{
z4hv&ef{rY3k3U+6aOf5Z-jLkZ&C2prC?WD__jm;(y-sb-a^Uff8Lm{A9~GV|Togay
zOm^-<A5XhL+yGuh07l0lD0?6IH#QLrPr^M_+8%#jg6TnrAQ!JPQAv(UnQI)ql_cu1
zavg^MLixQP;zyYFOkhhxerUg4J8U_RbQ=HvWZ%8q@l2w$b8}m|V?$N>F8pS593>sO
zlAsS;@={k}_pV~~+lO?2^sC$<B6A<IlRoDz-~Ii5BLAvqV$gkK(J$FYQ<_sQ6RR0W
z;;xe(8%$X;Pxl()W-koevOkqY_1i+nl;MTver_Ct1akwpj07|%@g&|mwhoo}kJvL5
zOT&h%wskfsY@;8B_>J0rzng6tO0}D0^+Q$;myoY*kxv;~&IQnQQ|n~u|KlSC>WO$8
zB4_sh@(`fnzXLGVVEEs>8ta8pn&Z9Hs0;f4DEoiG3k2Ns|MGjbU?UI|T1hc)LZ8|H
z?WUl!?*ASr1Q1s^C|!)&_DBEe4*B0~e{EB6B7n#>|CdPk|C2mi-X*Jx9;-D=HLFW%
zvI*>4*A(J^G0e6XB!}w~5B?UFHPG=~b?{V|;p)P}`F@$z!Mlc}ad3)-Cv)G9qyIai
z2CD%_rV%nP>))cTKziOLh~-wPG>CP&v6ekC?)#tU<3f$$T-BRiB8jLZCpoQVVv@kj
zv-Uy-ih;3&Vz|7@X}Y&dE;S=zF8P0Ac5d}&JH?yqi-j^P!*3sHDAL^96t=hV(Mj%_
z$sGw7mA<gWoJiWLv2&|WAuGRTifU+x>}{ow>dDC#_d~xYRqmDDNW;y0pJQfq+F*6Y
zYQ*Jvc2nU@<mh*|!t>_`n`hm&jSaj1ENohMHm2aYfrK=h25ua#KBLD+LW=Qcr?xgy
zA#>d&rEynEcnc|poSF@5bcl51os@9YhYugVD1S`G{|!C3?gRa4xT`(5)%Z00EjgnE
zZc|KT>JLa88qEx&VZ)*y$B-p@)ieuej}2P%Q?7p$UVy%LpaUpc1N8)9-Q)Y%x<tCS
z2}u&(p}qsx7k4L-hr3NPd_<V$9}oM`QjhO#xE{V09!M2r>(X47%8@1fjxDa_KKH4*
zxw*NivUgJRZTk2<uLB+4A0J3cD(%qbaFqw^Swc>RaXs#9ZAN%J{u(KhfxSjPoL8v-
z_U-KVzRR+IKTALay&~-YsLKT&wEs8QH@Ix*GAS2lf~myannOGP!WD7D>z1k7&(_sL
z!47!9J)+3kGN-&-jYd&xZfc=6=RJ1a@7A7F;-!AheSA{fcrfR)HwFH$xIF;2Gsa#5
zy{ebVjA`62ReB=ZEF>gkUc#RHUjSs&gfGM!?fZ6{>}uu+qY_~wOLEwfipO}0$kD+n
z=Uy1So!W(Qy_K_j69@j!f5D^9kL#nNR8>}fqHu>X?8*wyDZ(<;0}6l;CaGDdpL$|-
z0KEn_dEtNa41yb*PS!nL+D9Gasg};V`3^7s4cSb<3d@`u&{FS#Etq4o?;$=UF1_|4
z^j!RWc4<m#C#m$M4h=iPvlwRv>hVGC+GPZ0upYLAS2S!30Y!h|pg}iYJ6y3O>1&X8
z868>?*Rq8iMjX;?K%c);kk4(cX*u!NUJ392xUZJK3VMva2Dw#~m8*^xrBz)$>_}i1
zPwXBNk~m4F3_K}|$oHLl#T5_~B)Z_g3iq>KT`ExPnDWu#{l}{OoV9zqL>w%1E-_lL
z6Tio82ej?Mv-^%nPtTg#<Zbf*K>;8oz5#yqv_m?Cxc^OTAUhkh4Dg7*N<zY_wu!KD
zbp_lUGj9ttX6>d?FWZ*GLxgz)?jAY$++FYXc*1+JIN&78H)4s$0THQy2rWy+^xFi|
znXSz89EuELe=7F}kdh_%eV=u|Dq+3&NPwrYO(zYw1<Q_>%G=LFl({NRHkMI8E0{JG
z%EQCc*ZjD39a+O^D>u1z*YwmZwT&<%1OnJY{2<FsF}YT-_FXKt#1u1di;M8pCXQPE
zHoA=puX$yq*UMKSKs?pgkDfr-aYoRzY6>7|E+A+M4)egq5K8RXufYLp6%X6j^G6z8
z^0D@Ne{;w>)B|snC?hmt(6?Jv;v4gtg>Mwdu(p(LT_eYUZ2Y|G!<1HUm;4RuSJ_2c
zE#nvz6qE#0A;c2|ZbzLW>=!pg321e+PQMF)_yC9RZF>+-O?bNy74GMR{yV8jto@et
z@HAiPvT?$=CCoFo^$@Y&paC)#SYTBRTitOFIK;PW8&UqI42*`VZ{4_g!NTI%*z#~N
zfqbwRY&%&W<=mXun^Cpf(*(wM24EWOI@2TJN1R{_m?;X5{X=4)OkI*>uXp(!flW$0
z0h^n)K662co=fud3`)ZNe#@T^I7uLYQ>gwL$R}YD)dI_O1oBG+vS`UTVZlGoz_yh*
zg161-+ka8myW)35Soj{WjMn|Ia2_H?;$>jfWt#>(hI*;tqH0_(Edm5d@^7rchx?vE
zx7Yq6qzS5x?il*79Nk%@*Uo}=tY7UIt+{<fcq7qH{k4cXAIP(;<?Ro|g~{MmdrNr2
zNnz<B4&cp)F*!uOUwaE4fI^B=8+{EL$(9Bi{lI~t>V|g0ve-O8+Q&kV%W5E7r-4&L
zuM{Xh_zPnE9!e@PV1$la8xi&t9}n9lYu{0ixR?Q!NPV65b&dJfZ8?6GwAr&pYdUs^
zxvlN1)r~{<h)b(qgMD~e2M@v~<CS1B?G}}*L>XI;G@+yLySL3l5rpUXfUIg_v(?Xl
z9n_WJtJ79dC-F?P9>+k*e59T}Y&$_f0=+&Exr@cb88)4KAC?HaUayOP4fZewrt;ip
zJfI(rC>lU^j_mhJI7*5`#j)Ss>{GPYHVfJBLja)?Ok+sYM<0a0+B?`8I#D`0y88C)
zEh|$8V_nwRGzJ5O?dWdPP_v$%9&_ON`u#<!e<)Hy4LG~yYsp7M=Qv=%r;~#YR0%)?
z=zJM;2G19mRF2wRqY=6xY9u<B9Mf;EQ6`?BG@Vph!!2|E{Y~w2X%1_ACJrRDr~T!C
z)*ZM5J2l+lAWL}f)UkN&*A-D1H_yg(dz1zvGsUdsNMwf;fJ>>+jR&#AFVMoFrq8QF
z@h{an2aof99rn%OAIs1F&6}axfZAsgrX7R@O4tDUO^;I55CGxa;6*$lvcvKHwM&yH
zCiT5tju(oQ6Kw;+p)LqP2w}5-$?Yv*XA%0#H;G%bL&ocgMoLm1#q14}KRiOOQ!<fX
z*3KfrNszQ#;BASx+Hr!piGKiS&2GDX2fQfQgAbjV`%UcYq-(7gWeL!tja??sLh7eO
zj)<lPw4n?MYa}2$UI7~gzul%tz+e`GZ`>(h6M5e?G60sm0D<(bH<g13f7_7?kErKu
zL$+#x@(`=vz2#7q$#X3r*wYHHYzg%K!wBf1M8LV^Rw6_=4^VzLcB1_m@d5a97mkqs
z=E&b4a39GJ6(=6ePWJPoU=S0?e9uUP-ky`Kt3KetRm6uz9q{~Qq8;b&$9{jn=^_<t
z+qxgASa`D;Jl69CQ4046sJ_*wne7jTe&d=Sex(_ao<&XAMLZR(?}lx_bCW$IcsUlm
z|8k@;D{wBU$9zpKEFrB|kEe(nQtp(Q$9})z&j+pJK!$^Q=0x8E%L;t8;F*jM00#EG
z0B06-G1K7#(c$4IVEn?BC6}dhuKRW29~B}a{8<Ze5H`)Trni@C2ay2S{ebi<c_WM}
zu;uSS!mXWczW>-VI9E&GOHv#_5}XvPV+h##%aybOE!3uUfanDL^-?y?Wfw|$@Lq}B
z{eNIBB9r4Gi1#G5`S}&xB6L(!{Ie=qT%h~`BKEFu0SO*g=V-w`?m~!T8oE$U;O<6R
zUr|g#*`4Ky#1e=LvKh?||Je^eKefy!u=}EC9wyO(YpcMD`9yS{`#?xV1doU!iN@vv
zMce)JVu@VA-9_eJvrbvqQkr-?2#QyWeX)WMsCJR8`uX|kZxB9@@pt8eDNJ_=`^V8D
z)nOE(jMTk0n5dL@2Xr^$Ic-NeHtm{P72o-;8qE%mtV~hyKTPSo#l$1FhUII7bxG3v
zbfsjY*HEFv8Bj&&C!xoHA+{En1j4S8JwdaP9r;3we;ou!Ej}*4iuMY<OJN~0G(s27
zgZ$8Khwh+X80$W|@eeC2!KS;S){GP%SO~Q*j+INWggOnb&#*Mg<B`?MrB)VuhGvUT
z1axntHru{M)bY7<&4w2aCx1(=5th`(<jk(>!;Qao)gO-YRGS>5I&ZhD_VN(Xa;`H!
zxb@BqhJ5N2+v3U20iD-LhWQJB?48gq+y#v5t45cl4x#A6!;}P^2;D%&kA(&A#++6H
z-+Lob@|bWxSxC-58B?e){6pm9ytO032GvddDRYmVYX)Y%RuaA|F#=xSG@OQy3FdBt
zW28uQj0UvXr2Sl=w>RnK%a`X1Y-N7a0vs>e%00yrVq#+b%8}2<=*Kk*P35DmoFxG8
zmudoiwTVFYb&tcV2WG-Etbm_x-Tq1R__6WIz+jhJ_S<m-v-u%g1Lr8$*+l~1f#y~t
zZRVGVN&R)`_VqsTRMFNk5QWqF&)3XODspwgVzRaaIQ07Q2EZ?L5;5)(f^)PgL`6&#
z7y@88aD1OLl>e~XT^}ae6_VZ|qO*gqg5H3rHL3Zcfg8T(lzJm>kizw;)tJbLadv4I
zF6VF%cKn<8=z|@FNzh=Vz}=8p-;BOtt+9zEB4FC|!0X)^x^#dEN*;jm#WvyH|5$*N
z(JZ4|MCa0&BuSiA(DKfSx@|-@rw)Gz2IS4RHegiT(3VocvWb%q-4Bt(x$_PEw*G&B
z!8%|f)8?wQZFtp6Dkk(Boen(nCLqxa>|oCzUJlNYYR1J5>g?4cVWndJn??jIRg*Sf
z%;(|d)g%3pGF9)YQhdHXOOMD{xCC%1=q(kiP{c(>T{(HmPPBfPLP=5_gKz%1G`ywx
z0}tWdmAt|I4wb4+CPDp?;o$>%mwvgXU1K1=+2|<nz<Zk`h{=_>gZBrq#5pswutd%7
zytR8gB63yV)Un1z*y>*jHC_<ND3{~1|GB0^ECeMZ{3O$t-ifct(Oq>=h~9IzKp|r9
zkwtpg1Uk*oK489HlH@e%{z&kb=4C75rq&_BfwA&Tr}?z*c~KJ93cJR6^H<mL1XFhd
z;Tsxz$pDUo(RtElkryr$b5+wPqb1|&2CtJedj|F7=QG$#;g{MGjdeAIM(I2Cpti%R
zS$Ac|u6cr>xsDrUaK3&;5^v!NBzoF>nGU!gA!-34<ND3=2B7@oQ-R^7C)00<RC%h+
zJX}aA9ORDIYFy4ffCom%0%zfQ!`N81()3%Dk>nqaBBL`~k$R-*$-6k6H@g4ZK^0<L
z%vPHT99`or4g|(JkS$E}wCR(?^@+S%q<Kd(o-#UVb_t<E-3s@!hT&zTyX%->u7C47
zPda$n;L*I?tr-5fL&Z4*8!@rmgqOw-0rj*yFARf~#Zr+jizf<zwq`B%&UKy?B47C8
zNEK>D*j_f{_;TLYR_lpBI2WhPl1NNzygi>tiVf@^s)V)hU&b4gdc=#~U+V=Rm1byF
z{{&RU7LzS#^j&WtiL)5&aAz4Linf_xZAO0VSWWECKZw92Vp_B(A_ThX^$RP;n0|wc
zQWg&1E?q^QH5I8{SY>tVh|osq>f&!|)oRnRZzO(*;}@y*z@6!gDxB-^FqJ}?3IZeZ
zWnv4Iy3<o}$I0R>#w#X@FLk)B<#m363)LHDyBi33WnXQEyw&;p8k=1sYeqgxm^Kmh
z!7@!{5u0FJJ*!UP9}AxxiA`%_YbA4i#mY5KQk*Ka?JF@D&4R4qgg)L~QN*1O8oaXN
zBeD2tM&EGw=u1YXL&en8hBG|fuJrAM-!Je|$2|*(<L>rwOU?1fa3)%GMAySO3o~Bn
ztS0_0y61xB*_zeE9@;Ln&(^Xgyr3JJ@wy?9v~W-T;b)#|X(D-U&PhEp`TrGW_*qFM
zneOg-h7f-$%0s#2Q>DY6nkprxy?UlZ{FelScXkV7iM*>&gZPIR^KUVX+h9&K#~qyf
z<LR#<H8~AC^WKH#r=QH#?UPxrCP<3&rG>wOBL<ip0?0IrrE-m7rK(f$+J0?sc5f5t
z>iIeHMxvWQskLPYL@EOT2)<BtKrfwF;$owVf9~zeL=(Ri(gUW~a-5Mic#3(|Xp=v0
zQpHCq?Vk6%p~n;oi+?CWSn4o!WznW)^AGKBjyI<W8m<9OZvc+RfP{%VJi+ISO{2}b
zc=2N1Wl_MHV1@4wU0<TpgI5HEmEe~<!m+s|agd<KRV!2w-}SdDWvqNqS-rSr3dyDP
z<V=%49}D)^U<|3ui9}*PXgz@O0oa+tmDknzZ@739Ajjo{Vs5akNPh^3lOi{fIVK`4
z9s5fQgrKzkxu=?)Z>8X@l*KE(Kbq+Gf_K%~tV=33FCBk<5@}0>`Qs?uqP>0-#ZA$q
zlTwLSzmc#Ow>_MkKX{c$W`ax*w&)Z%_=`lQhJvtbbwyB?;-mS8L4(9^#?6CUG~lUr
z<VM3M4MVBg*J10QTtmhlvR@e2H!Dq?uUZKZ;!cA^h1#|%BOmTDU3iVK!OiV+M`N$r
z*WA4Kk4;QzvMpXHvgk~mozm1ZBrJ=K08advrPl}9V;m`YReyNKF%Zf{R#sLuARAfm
zK{EViz{t;uvFW~v&scLQ5K@iQK?`?R_b4kX&&8O)IeMa^ri7SZb6`YR44$RejBmY|
zzlnML!=}I`FT*O+@@MGL_$AV6?Ip~WPsMR({%~k~C22K7LFVMtg5f>K&uv7)9fN&t
zJ+0FTASWkdb1Uq(9qbBOzSKG2%HTGz?+%_C{Ix@64xM(2`P`e<7!kzU9QSiL=;hhB
zu}ao?WkkruV8^y>d3dvbnbIV!ugYQ6VKTdUxNkvs&!kn~y(F0?BH=U1bbVa<7CLJF
zq~G9Wg_JkipX(f(C2IfmKFlq);90=nCf8SSy%T$5*)86%<2QSTbmU|AVys?TMTzkr
z%G{NQ{I^W;&ATRBwi2`V^vWPSMl8nSuvfl2*bV~biUtRmmS0Iq_2GT$OVpL_#@DZW
zCT2>uo)S`A$8bsqsGlD@J}GhHgJ|6pkx_~uyqNOQKzZxrIBdtkyqlDBy}OIuB`%iX
z*#NR}%|g~w{GfRC)+yfK!U^Qk1BSIK)5m*V^`y;|;t-%`+HYbQ{}4E`O_7R_*BdCt
zPTq?yYMSZP3BoV@U)@+K0LdVoj}GWdp;K&#V=2L4e&0c7C;4zQA#t+xI9t&}K{zTX
z6f^uUK>!J|1xfBTwxvPBltov$$?}!9>n56U8_9?LYYDSIJhcu697^{znCGUWbvIHE
zXqA~#lf;2zq@m5mKWFw&kbff~IRQKb2FJ!o1iL&=71Ypklp2OmV6eHGVZ}Vx+hRKQ
z_2&tFxwdO^>&0Xp1O)LHvIlHF3SmKP)kqgU9G;O!DsAVM9&<VkyqU8vhA8y#d;6__
z{`<iJvIi-Xqk5=Vw7?~#!w~1b*>zV<W0Fe!Ho`0ZxqlMUr;tI|TYV-1Un&$3dInxu
zp8w+3@PE3wzwvPs)o`NIk&mV>fV*B{Kb`5NmgB6pK_WKoOi)DG9|zz!wtjy|ehO&Y
zpWB`Q4};TiUE6g5biplOVhv%Tc*i>bVeKjUGi<4nvn}W>-T8v>$rbmimz{L`x}z5g
z%iU+0p36MQ4!AP9h3UX9%I4&+qM>EX1SdWVb~j;$kcI8U=xR?4>L4fIt(9l@f5~x}
zzLI4$Ty31&x>NIK`+39nBd<(Ar}GIVkE_cpj+h3g#!c@0p2Lh(Yc>kV%Z})A0cLM;
z2206ab#;|2+j2xxuth#Qrq7Tu@_CK>eu;|}cH2y;0@gyLwXIAk8f8$Q>Y!GMWmAs5
zx{%315HQ>fX*<TJyu3WIQ^1#sLeVHCYH<XG{Oc<wTe0zt!UoHfNU3jNNj!zbJlO~9
zDi|ml-0cx%h)pZ&-hUrJgunBpmuQ)By9W-MhIw5MR%MUM^AeFg{M0Qgkym;^^#11d
zRoClmHJwj6KD$U?9chxfv{KM9SZuR=@Z*fgzRU{&B7D5diHF3*ns20YNVl=1*WdPx
zKv=^<936}II|tIRX>6*`84#m2?XSbLOv8!z#@X5%B)^kL^cqI|;ky`chXE$~b69GA
z?fe#{-0zRx)X3#7f1Y-Xk8=y?9)mNy@kvqqtB=Ezgb`C60(nRn@tp_K#rGo5zWP(d
z7>z+-KmWYkg$2ZiLbSLu-($ml)k*_QB-XU0J$3l-H0B0@PqEXOL@$jIX*T^r2E6z<
zLlttcU7ZSjUCUKF20oi8Z&KZI7f~yIMMd$V+s?-DJ%|e~fii31np$yRWbP~3;_`Yb
z+`%oOh5RQXMyg|==;F$jGOFeY6_;SB6!Qk%zlxrU7r^}#HqaEUN%B%lW13rv?`*vO
z8eb*7!dE-G?O><e)j6TF2MGkahh5rQzpLC_I`!&MC({A_Z?+1ulH4t3n%wFnB*&6N
z5w}zRgp5&_)4<Oi$D79*CsuoEw8sa3;_L0opu27`;@>((I+8k9#oJZSzpuCYLy>jF
zcC;mk3W_|=Sjg=sc&!auP}AD^oUCDr{RLe10UT0T(r`OiSDV3V<-ClT*tvylUBo(@
zz*vizU06o0$Su;5K(Y@jo*jd?9U-_I|NauB0%gXrM=XZ7y)1e$M2wdI&h&0#6KY!A
z5YxfAqr~1GY!vP^WC%l@7nM<Btc=0T;n0a(6fH71dQ_u|8OC3b`dBkxg!GXkXO=!h
zvg7dsulIpmONAGnpF<6eGRFEzrGIo93V@^W8pXiVM^K&c3TN#QK2b7~x*-Lt9;H|*
zOCadlD&?fjO=NHOsQ27{TZhL>18FsDujPt+$5LTY<AlM}^I0EYvyhb{mEP#pxF0?D
zq#?S9DrjP2f>|5Wr_?VI`-yHX^F<WoA%S2w1GdFTS+`5iq6r|LiWlpuUs~3Odzcr^
za)95V^q#b1(TEXHmiz>pbTJ<)bw^{<?gx1*{H=g+`ta_&aDBo%C<SZvM+{4n##=9W
zq5KT2LM*;OJe77aKhnd~nhC6%PK0B7+h2S2#E!v2g)Bpa1Qk8X`H>eGZO4#nq4GVy
zg!yhn_+%`kmcsjbdrczKmM*6aNqz3xKd>o^k%%#(ecAz8!kP{0#Khfkw%P+-^`kY4
zLJD%o$2|ludfLp7VsWe(nJ$@Yf`1JJ@0u8=giG%w(nll0!_D-Q{@x~7H`EERvPTb3
z;s&|+_S<e`u0t<Z&z$tcdO$vFVK_w;ahMscnPbBleOg@18N?_~-UONa10V@&4Ib&#
z$+xft;H-16&Z&<>-u5t~1p_?4BObwS<ABoB`CF<O#{I5HPE4CsBW0yMn^eB=AG9e|
z#|=?YSuZ*9<6nezB8yC$QBzet!dSO--jj0WN4uT~@84|4Eu>SD#B9vp_1IZ0eGk<g
zqOZFKs3q)1au<LCxDp3kH8!6TV8<F|!N<qb_^d7e{#whDB<<<g4fi{qPhB*&!T<QT
zb71rUxc@eR`?VC~wX^pyxTa<npON*10xN`zrj}fQsRmVuL`+kwuG{&0P^Qh#Wwem!
z3L!K#oM-F!=AU!z`@8ac57z4G@dx1b4$bDnC_97`htkbtR_c)E<}LX6UJ$-49LPiQ
z5T=iQfTxSjTECoXzL}@1haD9KaQT!sG}DN+9b^i~=gw8y5j=}j&e|on!p6<+yu7?=
znP*|Eo+TRq1)F@+Q25he5Fh2Krq8l!64jG8v_gLlrAmkP2D`}#&?lbXke1}DO?02v
zw+)eNodd4@Sa8)63iVL-Fb!`hM-|g*Oyovs4ZkF9Pu>$PZYtBUky1hr@P+bi$STa)
zJ<_2fR(;fYQDilelZ|{5Y$Mq#aMS!(IyNUSjOwwHkS;^Q5ylP|Au<`=!p_{DxvuTp
z@^b!d$3}dMg9DwJ&hqEk(tIh3uf$6H?{qEX4x!SR`4CaW47PfF*xDYps-?@eDCe^8
zNFdXPRxSQY0YZW#*}nt^_nT_Jp&L#4II6puz{#}%;0%fCW)OI%3{q)}tSl_2CU%}U
z^xTEsWvnU?#&Cn4r@tyw1qYl+!)OQWGpv%zR8>{4=a)9&nHUA<)+SE((oVNtB}U~~
zCer3EJ69vVj;##@e#HHvFcDy^xa~sb`Y|`?va)B%0D`!9^JPTWkCaoi;}f?oUt-;|
z1y5)(OgGu{xfSNsVU>MB#B92{DZ=DfQ8pqJ6s;NeofRs4cBA@YYD;mGOzP`aC}5Yw
z14=e=rx^kssLkFSi&$ot(u;`S9nzC0iT#_s<BNdXrrTWvP<V#B0wJB2ong8^RFhD2
z*VKx<@8mEjJE`T+n@UiF^%>H5Nlc%w$%v2~8_T%HVADo|ll2jWnWM`h(81u(4j=Ej
z388wI*6Dcd5?;oZrJJs4J}zG}?g!oculj8RSn^Wz3FU4j_Tc3X>BHuV_F=}0%J-o}
z3E>jG38ym3A8XT_t>6mhE7QTw(+v1|BnD2}N`k+=>FS^C5ndwZ0l&ZW@(A8o=rPE#
zS1giT`X*(^g^fprNrh&LNrXr;vR|8A6Wzh_E<gtV5SYMY+fMywG0eJT8R+lZ*qNKf
znwgB_dn;BVxWS~NE{1aHa8zpI-wS%0?pW-c?x6}BZQJ4f{scV<@4|nWf(lqYU0f7_
zcfJF?D08`a<UO$z0I<?)^F(DJ$18GZS}R>*+IP10>@9oubJy!7|5ljmRS-C1TY)t$
z@Uh&4{vxpST5h+~QL2%XK3t%SuHK(@V0XpgciRpgA}~D>5HLf){KrJgv+yihA_?#B
zqv$^Z6gz}mn_YOBG$M@wtITqCEeEu=1a@7e9sYKi0^eIwRBQ@f<Q_&?qZcgA2kRe7
zh!Y1;Gvg_theQ?gn-RA`8xR$0{Qy>b0A>U#oF>_fS5V>aZvz5R;xfexclcq09^3iX
z5R7UR(mW%U`PhWiVEaz*@&fP(ray}YdS_nd?b7VpCIWshCCbUY-Y?7!k87c}LU9*!
zZUjSZ40bZ_7cp!v*sf1V;Q=gpfS7954WIC$p@c-kRuAZiV@IMv)2fc(jlxbwMV+2=
zXmuM#CfJSamGyBS@3EWHZhMIS2z7WI7{R5sHbS$OKlET~_O=Uyef<cDbIn@@2T`6N
z^lZrl7k7uGCjMm^D8tbmo&jaaj73{Q$q@|LXJA-|4!5iL-=tE`scUIFe$59gfsLwp
z@g#tle+4B9xk57C!V?Tz0<WogyB<QQMVyCQf)(b|^$A75j~rc#F6XX)1g7GHdcT^6
zWBD7x@9?zX3&3hUyTg?7$`S&rfPjFo%e#4q`4DP8f9wo7l{U?tS0a6cvTz@2Sn!Z=
z1O<<pCnPr$Ux{C8K*Q+6eeS*QB&$_kCn;*>5;-6n#mmz(diZ`+EiJai1-}i}3ZgBq
zP~s7`RAS};Vt`)(Z7Bv5+38I?no|E+Pk``8A#x!3vE-o;{EzOQ0o#<da-0_nX)NTU
z%7Ksi`}ue_QX4HD*9B#yS_x3802heLIO3tfaR626ZFH*qVEPO&eyDkF)xYx`hf(19
zmPd~REz*Fxx3241y@jGEvXV9N`M=j+_i{je2Vli{(oPirYQ6-lmh^cz5<S(h%jSHk
z!9iY7mVY>79TnCVX@IIw(yb_~4+yX7PO$Hf)-TCu*@TSXBgw|$H7Quz9Pzn)$Po;F
z{?v%4M*LPl%cXDJNcmO5M}>~vcK?JHyA8C!mDht0kQS2O@}^&E>||1`^A+9K*Di>6
zd2XzO`JA==H1?vbxW2K8z=6o1coz@`RIG_{>}>g4Nf8OCjs-wTt^D_bGa4^Xif(Ib
zfeD?aRdcz;%a*zhjoPQO{I95Lmn20lUeN|m17y_n=;z~WWidPh6UdG@IU3*>d!&!g
zG4Cfsw~tS~rE<Sk-t<{|@n=^1^;Q~|m)ObKIS`>Nvo8rWHI=9zjxPv6bej37{BIDq
z0Ek1@lX6s2p@DFLj6Do1NkmQM0ix|N9XCD;j<28bm>@g<9R}|OZ<kpbw}&|>PanSN
z<sS6q3_Owc?3EJE)Zz+8cZ&r~Od5o1dLkyu9tVo^7mm4&$sI9<<!P)vzk|GG0~{&2
zHNF3aLkht$Vt!o#MjU$hi?&9!nkU6PI6s@;qz$h(_zt&8M&-Z-Da@IzFl(ePNqRcP
z>+`oP7@Bp~ZMv}@rc{adcu{%Gy&o*uNL6~o7*}?ODGA*#D!hfk_4pg+TOcC#fWW7%
z2kChJ!!dZ?!)5~mq!t##q@QaGO&-fl_ApW4^VtD(<CXMcbIlnSz=DBynySq56tjry
zuaLNCtGpi%WUCio0*dOG`)onIW6@;HR4K2ta14yd8%$AiyYi^d?BHVt-_;y+acm|J
zdzVOyTf|PNdJ}@PP;f1iq8BfJU8gy=Vv*tqINuXu0(tMxI9AV>2i8#L1OoMoFKjjv
zhCUpCEjwp2zJRL(6E6meCHDJB&2`-zgz*|`!k^VV7V7Qc3bUvE(zAKyFau6m&84N@
zd9>Jnch(kzbhy28TLnjUGG7+l-KL^uo|c}Yq~UggQ;7Q-jMaStyzi8WO>hQ1n)L;A
z#NDr*1E8;=1v;s1(R5}Vuu~`5-bsNS5pj-+j{d1Rmj9b|2ad<jR~NswhRMDG7bUZ<
zkh}i8$a+=F=tB;b`}obljjVH6ojY&fIBfmir0Dw`@wPBYDG>$=NTNF-Ef?XMo5OW5
zlP<0U23{qZN)4Yg2+;m<=Q*6hpL)ksbnPpxXtPPkRQs5FuKKCfUS639bg?S4-NJj;
zmEhQMZ(jN<u{<K;I2eU2P`J?eJoYdSM#FHUn<XDdh2LW#t?umUaR{ID48x1lJVLBk
z-a3y3N4X`7a<<eqhRpE0N-oiy#Mc4``E<m_@;+?uOxZ58JgAlg1JtA(S8e1q-}uEz
z4GCCpe(pAMwH~Jbh>fR8Q%Oy!!vU$Mby}BFfw5(lCxSeb9ouXa{f(5CTy{w?4EsJW
zld*Bjnw#Cc=LVb9;7K(lboUr%bXL{DPrU}lqr=<bQZ@TZ_S;qY(szrNehBuDn(koA
zq4b6ATr!8DfxF7b%lU96Ft{d~Lnz!5E=E;Y_?la!?Zu$w;t$bZhhZ8g*60NQesj7J
z-_uW2D`ZT*J+HgVo?1-qo$#`bMh%_FN`xT`H9B)V{?gO1GgtCT-~wYQG)pdzu``2G
zJtInlsqCH<>}AzH`LeXzp-#GEF))-y%m6Mz8iXk@s;1F4$DbRhcCP+<Sp0_OwO@&v
zjNUF&`ih;GT47391l(Zx+I@SYogwRdrDEM;$7~VvvV7R1<RedL$Z;y7?S~BD>Sp&E
z!GRv|6lw#6_d~A%a;g0-51w4J>7Wufao3L4zfYars-#~`<VPIx6=DKDIxA>3`VQ?(
zZviDck_9TmL0>nGGg7>x;k#F|mL1#_o}?)K>t0@g|ByIbFy*iSN;KFV<oOBAI}|p6
z!M&3ie4&nA<ea;0N@<(~$8NOG7R?sIh^TiDniRC-l5glfVbp%O{b-jbv!*zV!>NPW
zMg~vJBR`Ro!Km%|2Od<p@YX3fe(B~dB4Q6d{Eh9PVuU<E#m%?o5iB>sIk~?jE3JdG
zP4id4VK>6=;-`?3>U{TL%UpfjO6VCwx9J>`a3mOo<357Xb`vlM*|^0rw_Ti6YQj%V
z8AhSn^_)$Ety&MF!8_hI2MVX^a%RgXr21oBtXer=Bpj>>a-Vukj|)%Ueo)77H}ALi
zWXsZ*lMS*%!n+k^7QRf4Iu;k)Zk}2FO64{nHyD3&d*m0Jw4QbjPX>m^-6t~1UFC0F
z9U917sZk12pz(F{Xn4G*(N^5r>62a^(k}_h**TMDD|?<hr-~QdTI&U{wPfA61u_?q
zvX!3nwqyBi#~>F2^;##cHq8LHY+~DCJc@&UbQE9D1Pk-Nez9|PCFjoKe2qs3d2iNL
zn%ZWVoGacf?-R9yS9W*vb+~-}$t;-YK^Q1z)~vQ`*&aq&sd$VaO^oJ?Er5y;dWBWH
zo&fXzv^jknf*9GK!mVk^mO3=IeAVS)>iO+Gk&yGgtUUNZ&E2*IY)u_Z=8x?b^Y`Eg
z4zmn$`*~4w*VqGIt}{;#N1bxV=~?A*Oa0iZX72PJMwo_opOCi9BHhBcFFY2UHnZ7x
zNI$B9esp!YT)tjEo-<1Ab)8Y@mZo|vvBthcL#|}s{M>yyZE7k>#_cn6=k-=gkbdXW
zpRTSf^~)r|F|+vqB}u#NqxpNi<n6L=?{QA1*>Gk1uS15bPX*ofqCv}h;U-txk9*y8
z0);+i#x4KMYPpj3^v46e1L>e9_wsK~hf6I64Oe~`*1;X^?K?8(RmP#NJq1_#CgDvy
zzv2k#)^_C?&>hNxH{NckLpPPo<UDu%Az*%c_w|9qR-6G#4Hn~QJl1_Q_q^_nD~<HN
z{qs&WF6Td$l})&h-KdaA%^RmpS1L)8kGhl!DMRENUehy{U5A4|UQBW>xB$~LtDURM
zo%N!{^VMQZaE0(}wv;Z66`nac(dSyg7<LW~B%Zn#<jS=lhO*~v=sBANS1D%x$fj}4
zgKIABrt4ibsyJnpf?V2q>YjTn58H#~_&mVLPpwc~ZGFvA>O`E?Q5eux52MONO|5C3
zpbG{T4jC6&aSXscDyD~jcE=@uM3?9$BE4m}SuBGVn*O!wPN8c7k6%mIUL4XXjyyjZ
z*%WGjsWBqQQ_W$fP6_<;A|Um6bpWw7{m-@tdeaW%Tx)p3mouYd{FbP_zE(kPnV|c`
zovgmdt2u8xKvOFNPR}BL^6`R7?*7#2(l4r$mRFy>Jnlb!S~64PoGHw5j2j(VT`pd2
zN)~TuOUPH(V&3Uc0GBq@XU0xH3#|8jT-D;kE#4s+v;}Sr$<G2>C&FAchdqvtMcgw3
zT{o#z(WS2acMLR?aCw0-fWS|umqg`Q(y>Rn_^yezEDJ%`<_~nlt;szCpKt!}HZc4U
z)o8}-F9H{1r=nduOhVbs)Htfr(Nr{(y*JSRV@`4rowge~_8aI{<M5WHbFO>}HY)7M
z3Yz_*bD6;TFXYpbJ<K&`E}hoylTlL*NzxAGO=^-8rCG3M-=3?~Ot&V{7}4SIwB#e9
z+xBiVR@HQaVbY0%FbMpolW+19dY7sD58rSpG(I60Ne8M($p}cPSyb+ppME?BGgAh^
zh@3zi%)<WqD93>nKhxa$mrioAR))70KRXtMz&+NJ58Uv!R^ZtSS;O)#AJ&~L`kh82
zdg*_h!FYAhLOv~lu{?Aa9)ze#I~`|5`a>56=GG&5WcKLb@3KP^H%WIkGcue-es>Y(
zNVm!=Vyv$=y3N9m{ne|)tjU65Ec{-n$%vZhFDjU@>)$Tij#YLkocCDi_o$2NT<T+P
zmqf}NY`#$onp3(2)8Gv|l9dlVV`K>K(=6mAm3o=TR6JJ};PFd~cj&sxgsU{%3vLeA
zAq>uZePrkX!{?Kl3P&Aw%jr+-8-pov<{$;5pE{pD63i;|gwN+&oQKm*-MD3C(2edi
ziDwtZAeVqYlm_ioq|#DddH;^}Fblu{^&a&aQ#-5^6`zJ%+_@MIce4(hdmDF#Bn~c$
z6mT2TjEnrD;H{AmL{2fChQ=(Lqk(>3+F?L`w6Q8&FmleHUzTR)S#ag-sOCfBmbjK%
zi<evP4#RM+rQ??(G_qT*3ihYNVwwE>Zhazg=30&)&M^63EdOfgvHVEIF+KaMk|-3f
zn|c(E4uVT}t!cK}K~5`197gi&Y>er4ilW@35YETnDNhzn9#AUN%%)?=TX)G6&2D9>
zh%VF6c4~#j(>ha`#bz;q|3%qfhIO@mQKPUTD%c=cq_i~BDIy&LQql@acQ=9<G=g*r
z(xr5RNT+mncXz#W;a308d7lsGeAw6KLWSR2cg#7*9CM6kQ%m}tJ&~q0cWItjAWF#|
zDeC?IU`W8+fqGJGt@VH5vqIU0z#yNUm7Vt*)e}xLV^@+D`;mioX(7D8VMw(SS{;|o
zJ1%?ws)BopQF9EDry{Nj;2ze$A!kwsL7OQhgegD21pN1=dzUg0xX4?xBuv?jR&BpM
z*lIvdJ>z6x<T!|^ZVh=-L1*bN<kMSIbk6Lha}vg3Yq_BBOU6CJ-m1%fjD+hNwHkaR
z_xE0{g0_c!*g}5UR{CaxLYbL7fho0;tf5_hM^q3koc)}W<Ac_MX%Ju&15H*bNlnQi
z2M``jM55bi2EKf(cDx?UTI2golBLA2IDkiGx2OD=*$^r@JgS`>9Lf~V*U>JExkycf
zWIFe{Oq`Au)Dw4$@o8MGLE`h-L^K-jQdT3a<K?mH>;sVM<~*7edL#<A%wj*;e9Arb
zBrqyGP5Nayw>o8%Gw%B|d0vwK|M=?dl~+LASnwlh<^ATR*L3!Zz)aOCxqGsmU(PMi
z6_VTyRJkt!5Z!eI2~1USle<<l{fU^c<m|!*(8(iyp%FFbS7T^^sQ!`Hzz0=~MBCLg
zTL&7hlO5+1R_?ue$*1bD4DC-jmzXrmV7s*5!*<R}m$OOk!nexubc$VnDl2vmMlNM5
zk?@0q9YsWf`6JYub+~|`HF+BrE{|bk`j#U<cC?x@?n(|g1b>CNVymVqP(kkc47q<4
zc235wJL|Y9WI)W`gxrP)xzoJoCD#tEZ{E?%tyPsEi5H}wfqc)xu8J6+VWAuCg+Kx2
zfpu}$o$uP#!?xOuPDP<IsP04-SdO`R$bOSS-6VQ-i^lbBUwJ-;hUdg|jt1JR=(*jp
z8)HvS(Lj#NSC~d`J_=EFSgU?;%z6y!_kVDe|MSXsc?!2dq?^almJK6MiVjo0P4*+c
zv(hf4O0xFuH-5-^&J#evQtM?8ZzyN;J!{F!n&_pngJs6$h2Sdqe>l(ans4hYEeZz!
z14AVrbmY7dIZsiLWByTc5V7KIdf1NKdo4xWJ~4J(scK!j8)yk7piUBP+KdFqpjye&
zgP|28*~0p~q?{~7FE30>1;-^SIWBYj?#FrR$mNqp^<#g0*jcT2h__clG=1usjZ!_0
zz>{DT?t4aoU#UHL5M~pQc%$ph+&$YXhnu+NF6j_!xb<ar0gR7D=<M1&IkvG|yDzJ)
zy5{wiPBUlfq9!9@Oz`>iW{0G)y`kG$8GMce2&x&T61Z|ZU#>e+;5n@Rz?*4v_`x~6
z=3@bZrdD`19y}l#{sXDWz9J@u6CoJ8`%u?KAZ6!QHV>*ALnCQj%9Ibf|AUTCVcpTG
z(1yZZ*TJFW{>>Ad_`9U^je9X6mOCgI2Kq)(N6S}-e#43vpbLOM)XG3KIK0x`=rGaY
z>vUmm$7l90J^sZVSh1f{_)T}nBQ2hJ#HDA^r2J(1AhZn%y%wPR_f-9s@h_o*xI%$9
zB(5aQG<=US*5|Z6S1V<r)<4h1)XR<lGN7sih0YAOEUxwDG;6tS&5?BEH00aIuw<>?
zNMWmtpB;ErEH_1CGgbX4#T4sVo`77W7;WLJ?&uG%hi{cX(RS88s$nf_TmP705Z(8N
zmrHrgt2t5Nd>k0pnsiH>r0;uN;+B3uma^!uq_3`TFNxY8yDq*tkW$T{z%XJ{HXxUv
zTSg~2l?cccO{C4Zd-i)zN1xRWVyF!7s(ZeXTYQXa600PiCooFOr+`SM;f#GlqB2j3
zH%}#z&*DPD5?pO_UW_euj*aAx_UZ+{%|LD|iKGbHgSTlRL(*R^Q(<Fsk?$&(1>R6T
z@P?A2Dt@8x$XH-(y0hM!Z!dCuqIJXg(mTuijgZ#kE1F(=O^w`Jm`U^DA#$n;sfpP=
z=0yuzRmz4K=PzuPI{4|+RPL-2HNEt!iCZiVzpWs|E~)ts+}DBH`N7P_jmPdt;K?Q<
zz;+~BY#)TyUy>|l?M>!<`9L<Hhv{N7xxcB2KHf~s@DXaD<?`9J8yFa7nefeogq=M#
zZPR`xuD=<OoM@whvG;dXKUlj7hEa{dxLcL!b5j-BJZ?y@LM*-HaD}W8Ay73w+~mOY
z7R<BX*@#kPl)*1R+fAPZ@Dai4(7l|bn2ikUQvDX?xcO(Oc4593JV(W3bv<@UDzoG`
zPYyRasMOXx?y7rmyE9wK&?E&k0v6br<d3Lix7?xf$@GXTSHjC@wtQJ)bfYUgwz654
zIeu3O_uXyh>qK9-A@%z{wV&6k;fZxp>T{zPbX<p5-Ff>E!>oz@RZkP;;fWe0yzDMr
zW6db(WIc%NVOs{5b}ia*hI`Kc*Gx`%K)?A$lD=g};yst~;iRJsjih?B^6=7DmxD^$
zUr+{SP%JO^1nV0AL#baFxlGTmn!a|}eZVu8iLF4$cbU#%?6QoTLzptE2vH}di`Vq6
zE~YRlSN`~jq>zq5Bp!bSTNm4!Xn`&yn@Kgkf#LOW!Z+)zx0<hm2YMdItp%{!FlM`&
zTDp~PEP)YJmlu6FN&cXcpqXZySb6GIiig;SC1U<#*QANhnA(a~fgqoS!QCl*Vu^%}
zvNNWgvgT9az`ayE=<i8rb(m(HGxfDsY0Ry*SA!t>bfVDUT3uB&hK;*sO9BcD<bT#4
zQsRe^vk3yvska~Y??7r><9t0X?3btd;kcmAG>8j}x7u}QYgzU+Z`@k=7}wwg$q387
z!<?9UKUz|CZ{IEbk2A&Wt4x*>sV~$VmOwejt{;rvIB1K9l*ViDtOptJY$rc?>+xfd
z6oCf`&PlWT$uWNgHJ<_oMN=MHfaC5@f@QY6%NLNAhK(jM;O&MkpHLl?<YI5cByYc!
zukCsrZQg2w7mb1_TR`Ug6C;G<8mpjZ#epny4u{+u9G@KNQydk^s^24{Y1DJa-W{}P
zt5BIzpQd^j_95i%Y&IJ^neVqI`mDMCs+SG)MPuMMfvajN0L^7HHTP03|HvZ<D8)R_
zC^!@49o3;4H1su`?+HuT*Twep$czM6ggI|xhl#W?0C9YoQVN*zL3@_d5U^NX_%Hl;
z0`hBC8sDWN$bM$husQ~fW%c#8RQM}if!6<=a9iso9#aQdRC%Rb@bV$4cJ^J*NwUIf
z)z4FUhMG3sTf`Q$AeU(;M|`C-MIbjKi3CeX5kXeHJ>;eDQz&t5j0L8%U^%Og+oUJ8
zX9LjFJZpvErwG~-m7HHfhQf{n!nDjKfULc3sk56N(yVySb3;x(oR(RwLhP5_`An>t
zJ<;k*&eO3>1ole@lrlSIr|Lzz&zT5TFC2SJ9qiS$v$;{iL*fYMKoD~^IlwgEIK{%=
zQLXVMt}SCy;f!kB0-77I+*?`O)Bw4$2TAkz@-L0&%ifo?CJ>qrMSt#IXx%V>7NsA%
zv)F3cyu)0M{%|BE)aoCBm=^;;Pmgbx_hwcRKmkl;pJp4ef#PvHT=?^Ik9;X{Fe2RN
zNiPyOtfo!4jaQ<v^T&|*jC&APOc0#x-aQh$F$>mTVu(BBF?wF%`iDn{FYW5Z)E+LR
zp6o@R^d<GS>w6MP<b?^R5{KL2C3YPxp)egRW=j%BFm%O^i@1Fo-~H<Ad^j-|rk|n9
z9tm-S{s^{+b=-VwGwVpiILv5u8@?ghW}^;M=6Zr5+Ry<vX>j@B)EwmBYEjDCsvRx!
z00CR?L(bVR7oDEc%8#8Q^OT(w5?lhYHtkgSe8tR%%5s8XMaNJviz2wV_ED*OgMxH2
zxw2z&@GpMct*s_)9&^FKuOULRSND;K#xi&l+at?aJG|$hE7OVjG*{5S*Qz-f&8@eH
z<O;GYp{aOwvtPU?+QCLUvy=tN$5odSJr7Q6CmA4;O3?w@=elS;%xcjuk>7I=H{$f+
z%|1h-tAVdBBw#V?D7HVThIls_L%MzUw&Hw^x>x$g&?k>ViB~L#4pllOqJQ3SH(yDN
z+S>i9-!{#9<(%f<yiFFNcX1uuaq&xHYe&K<*vm<J5J$$Qh-}|UE0nhdG>kc=p5KoI
zg|wNb60<L^xCq<R);On4FxwRsgC0ulaPR|6?=%d~&OmV52(^>_o>s`qp1eLun{j!F
zcQ_~MtcS}PlumLpxO4(<MF&@N6Dr_B(+FAC%;1vsz)S--&hffrHoNpfLok4!gN&~s
z_cNMP^_of_QtrmTy~aVIKp-O*l|bur=9#QeSZcMM^H!>Q{8o?pBkuh+->9-2)e7=T
z%9H)flQ4ar=G)aHAEskup9_4P2N<k9twMb8cC8Qqk_2HHDqThuNZY{sg-TvEhMe@(
z1?W(;deI4t#&>Pm-oiQ(2uz>8bw4Ai0uk}gTN(#}_NQP~t3HsK`Mu_zH*szl6(6&@
z1A-eu=5AASWb3MKeq^nak{vPtduf=f(YpB`X-Mxrz5g4@ukR-wjSd}-J_rAWI`;GL
zk1lk5t8Jt$8*xb`;!=xap>h53ZAPl=q-C3tt$iWtN{3B|=GOf9@jAN2!N{yp=JiiT
zW%<pCC+o>O&Na<+CMi%=tN)`IUjevK+orqJ3&F1vPT{^7;YZ4-L@it!O`D6_kBTuK
z>hLsB)zsf)3eo3TqP$kkO&(#w=Rxec-!^WO3UH;9pn`O$4uql=p&gn78^})YY36ZW
z&?`G9v^odEaSm}Sy*uECVk@s;nkHMU8J<9Eq;Qw4j0bzZfw9)$e6)=gQBp?+ga=b^
z6PcXb5FEWdSY^`XDzw2eMnd{-;=rx36axwhpA(OrY9%>OJ4nJt&~_~JFdCE$bGwu-
zxE=5Jl(X!Ad?Rbli2%1}FXH}6UI<e6J@DaLubayhpeIjR;b%)xJ%D;PbiC*(4dvZx
z=~wAWvf>TzO*9BJ9`)y<VF;d2wCYMsSY9A=>(;$m#`8Zw`RPi~8M8FaG&dWKphdOb
zabX`h;PUv5FY>OPw?mKEG&5%X67ry0t%mCOX~>ugWXDn0|AMV42#3a*94Qx>eHPq3
zaE|m$59Qskh9OEwD(rsL{GxPTgXq^9(n`YEt{*sF{qTq>1trp9y`ILn?@zC>WXQ5)
z8RYE60B+mzTHrYxyUt3PcO~WZ+q*}1$ft3H#3ba5+;?oocXJFDWFsk1cu#{sMYrsu
zT3mB+j}%E;CDcaPg#u?DxZ<bx|GfTa5N(Kw-~emq={q?fF^>0@jk;c57k^#D491mp
zX)zL9=4<;;w-rC)AkdkjkpAn7-X@Z=GkckEViY&rSAU(>{ZRjyclQ*di@)Y<xv)Tl
zy9Ic5R(dTUtS9kFt5~(emU`b6qytmsNMq#XJf<)C3^|`1UF`uA3X|?{7?(Ws9yc$+
z=Cf6TwiAa$ThB!b2;ICYsy7LxF1xg!cStFsDt-U9=qf|5*`npOKAwXib-A5kU|<kR
zp}IUq<=gL@ap0p>>~U&IM6=hrhRR1cy!o-5K&CbSjql8F4k2>gQNp5B57(pNE*33h
z%yr60E-moKjZ2{Nh7G{>`y{XUIxuL0Uuv{h(^taSt|NS!YnY-^7^!b2W@D!Pgv=8W
zba8+Dc6sl26baVLK5v9k=^@8^9V=Q-s3d!_AB?&W`Yln7h-?B3`!tir7chPyMcE*m
z2jxI)mIP`pQoiPG*f%N<`$Ny7bt>!x33C<z+%*@lL^XmfhF$j&2AO-@aG3H&HUcXA
z_`bU##62%vIDJ#Iwk$6t#C*AMCLy?NGky+!@?l!p_+6i!XR$@%soPMx=U@!$V!v=-
zAfC*tCky|IG@s@eqxYJ;Xo;R<-H+bOK`d1e^TUq(_d>YF-3>cPK+3eKrYhoar8_RG
zc77BDJ-xdfp2XEelZ;){DNR^`p5q0@NIx+<=#F!+`l{>gCJxbzV9mzqiP@+=L`Y<<
zNFJB@bh?@sVac@!W>9>r_;&8e5t_snAM4`Q_QRf%9eGm;+a8{jg_L_b6zxU(7+m!S
z4p_QFwlj+`n0J93(PWsr<S&E$@Vvu?ne;!ifqw{b#o=wS8Z|h8Y6lso<j<4y5tuTy
znMWTC)-Z4`q(fT(I)6An-d=YYJ;!LItMG#~W%yO5VHtxOqkE<?gqn`W(Foe73UmkW
zxH7OP^UC`XHB0Uj>A_n;jKB8Lh=M~6XUwyiWx$}u67erUx;MYphK=YxPK1mXBy!R4
zk(!FjxmjpD(oolMj!lsO>!U%03vk+Elh2Rdts+;#&&G1`LOK`TPH_#As43&eXe`xj
zYUa5%@<jaWbwWH@6AzIVu_E?sz#P=SYV&M_W-nh|`CSXB+fD^~PfW$;avoDsia2g(
zM=#yk_b?@1wE}0^mU{q#??h#!-E_X*FIV8cSM5|a;*BJ3L*UeUwU#QQAU0iKu?y)E
z*7R6I`=SOzuX!mq)6sArg6m*gR@eD-pBm43B|zbRZBDI?xX!{M+%Yc9KW$I_!an;|
ze&Hjfi0R2Csx<l`?d|fTwRg~rG?nLFfO^VkQ8$lIQiY&ZkSn?kV6M2k2be-1hfIE6
zrFkp5;c6z+1BXjBpb(*6Ga|{NJ5~bg18(<!J(y{_U%0EG`+G_i+Q(7Le+RTFf_S$j
zhJkC2aR1Uox=mQ5-%Ua8+qL=OY8ZNKWi4#$O|07cr{JS3j<>UtoVoh};-MoL^Q90;
zi$vfEDQMRVo{#zsMhjr5C?a$WWUp(-B-*|~yDq@{rR$SSsgWt@cr>bQXKfy>IG6Wi
zB(7XvNxEmg*(Q{~$ViEXfb|BrVpwvohzv09M@)|pgc~;<+lJJvi77@SHoq8xh%c<N
z56({Yuuw>B_^kySKw$^gzy*xUOg=Y4UdZhT^k<FTyu*G9Fyymkq!-$W7vE6a6)>S(
zL=3QNO2%!y2%pL7c)Sr!eIcoGTh3M^GZ95>ZKdywT<yCS(;r}D5X3TQ^0NjTqo`3{
z$m2#2fbu+<_hTQ*y9H#?M2+ko903sZ@M!enBb|dC>Zok<7WPExvvbzv2dczU-Sdj>
z+G_v^TG=(oM77_Uho-rhDeiM~PpPM}LLNLC1jO{WMs6zQu72zhPN=k#E~KJ-!knL!
zgf&KN$OKCmXUa+m%~1?MiW0k=zrKA6guUy2fq&CJFHly|W?B~f%l8Ho_BX925yf{&
zT`^4!BxP*@Ks#QKTyY>z&uJGeEH(i4`6XIY5{1-lXbmNt;w;cFEUXP#sJuf(S^SD>
zaWj6y*`j^^W5({YwAYl>?Tt3lwq!n99BWlKwJmSn4??>mKoJ|B2kw)tmke`Ky>7Y$
zCT+Ga_huK?c*pgs?KLQ{_$=ueeW#OlOzx7FYVJYergsW^GtIu&o9DfP@dE?^+j(n~
zDNm86{NiU0o4!>mj`StXp9y@V;WM)B#`zJ0cEYu6>Kopsov0^kyF*#|MEZ+fiols=
zNVTIa?NPt4<1=<UYyi<>XkGX_*M?^Um#IU49Q@$9#!LWt1W7aHt=7^6*{Gu8p)kbJ
zg<GkGu_;ZzauRwiS`$eH$=eX(*b&r~vBrC^dPL&?c;CQxXi-!8Zy**(civDJ>E-kV
z{bBwkeSx;+cNXqp65k0gB{@xZkGpFL4jT-dwps+W*Q+u3`fhb;^>1x~4JmU<a@Cqz
zX-<+j=2>5Z-Pm9r<)H!EQKZ|*lG3o*e?0N%IPUf61C*N>W$vSLtTe97P&@XgDaT@p
zLB9}k<LEqCrVPq0{^~D)!{lHT*#Mf)LbZue%U(Kqo-V|Eegt|B_s)JX@YJJ&ZphP#
z*%YhC5lO=vZsc4qyGu<Ylx087=Pm(c?ya38_mPT{wz}sU*nLO6cfZ65S9OmC!J74h
z<EO6&0PWil+)CKXv8w&n+|^S&yp<uyoGh>J>*(0#sn)r@&9+DE)1kj5ZV~3pbr~bc
zL;KNyQhMC&;HTzmTZsG^7ozbq`@fXJ(<c8Nci5J~Pe?$c`V8S25&p-ry;uLdOG0`Q
z2vheT&)^~kN1<|SBu3t%cQ8C`080tTPIOl5&S*`#k#oGi(AqZwRs%z5!VwTN3v0ar
z!oCxiH2YAYj?nr8JFS<kpxh^e1)%~$O0sv(c>;l<kkZftPpGwl;b&-Y&ca6d@j|&0
zeh-tSwQR2KM)=6!8d@&ftvRGgioni^7mopz?@a=HF3dMdFaIOdm2|oQ5!A~`Bo_Lf
z9Iu=_HMSoZ=<j!cewQBZQDCB|q&c}XGg&2#DQ+jUx(3^L_^z!{4?Ao$<7FCJKHk2|
z0XZ<vSpM{_RtMDSX=NHpN`9oQrg{6MkG*;y!8KTqeVV&y3n%?^psSl-N<#Ur8sEa4
zWCC$f9lvXs-vVJJ##H46m%a=Q%8Mf|T0^z)=N;j&A&b_6B#>!B7*!EWCjOJ&BC?hs
zcBnDAo2|2!oX>Vk;UBwEN*-Rd#?l~!kSHUt_PiSaiPiAur)*<xIpD9+te=IP%?_mg
zQQQGn+iLR*XTpp>ic(A0lMd7S><vK&{rBCry9P~M=IWBTd%}l4SlAyC*xa_+vW!&#
zEq*T~c9wh)0RCg`GNAq37BWW~dZBHSvWx61B^%G&9Hu*Hf&F%OEIgSfwqF`ozbjeb
z+bkGH7~|qGW2?9~P$EgtJ?uA8drjJTi$p%g9AGeguS<Dc_&b8^C{U16Rxd)e6Pdt#
zv{kTON9?<3a|FG%0K^Ja@O4Td<yiqCr0?AiH~77i1G<xSI=q7xwN=Q$(oIPexysih
zWgG3<1EM;iwR~`=_Ps!S@^poCYTV+ua5@+RazZK;+3{o%0Gq2lYD%)r9y10G4i17N
zGLC}PUl9Z-eTw@MwOZ!Kd6~YGghi+Wygg*4r)C{j?9~w?=;;W%!&0B4rklN<--(1-
zn5Pk5!sh2huK3&cc3P16kth=*SeS}OuzXyX{sm@H$hKuFB^Y1-Sx*;)2veAR!RaN-
zS4_Fbv6Vnw91-($ir$ErX-av4iOqu~0BS|FouvqVEQ3lDD_}#)N57G5^I28a4Lgj=
zf3f`J*!xBel3GP|kIeUGBCMY5l%JbSz>M(Go`urhu5nak<A$`r10UV6R&(sL!3gt7
z;0@U^cS^sTshd&MnwNGHRcA-@y5?XlY+FVn_l8G6Fwxew+$cXeDEE$$c1tBf-Gt8W
zR}8BKA<tg`>Ord?f%=Ys_hg9b*9}A}+eeLUwO}_IMf58l+Bz&g%APl#o1@WDz7XJj
zkb*5>L+2`q(eLSuK-CDB!74Y$Mops$jjy`RwO<D{Yh)x*i-Db;Jq-OW(yJy#<SqN&
zhtmF~61e~`c@6h(fRt4vJU~A|qqQ)9B^GG-zBJFKgL&A&cj6o5uVC3YnB>#pYXt;H
zCcfJ;T_UkC;jf_5X7jULbSX&LNCctf*$r<*_2lyk%XR@a8{5!`wBXC|ST_N;bk^;V
zV3jbH1y?eQ>hk>5GRAqwQ|egWh|_`M{pb*D*=~2@ga+Jc9m}Qxq)OO?+PI9=5!IBa
zHyd%E%1C0I=Vo#$g*`BzJ0dUkhW7+6(S75T|CyRU2g3t**}k9s3#AGo^%&nF%M+m~
zf2dJ7Xr6zI$qevpz=e86*KU%Spbsq>A(uKXl{Z8Rgb*J?s`GlksELh?lJY_I349DX
zLs|U?W}ceKWzcg=1hgv8l%yXZ<-w%YXYXQq70UO!a=_rTW%LBvx8d@TrxO{w7h>qZ
zg#5%9^bC28VM~_o6^0|o?&;^M5Qaw(tTdAPyKC<Qef$b~KRqyjlt$rzieJPvso2Er
zQ890}INIvDZP!93^nw<K9$Jz4BRhnl-P}6#?;QM(hQ)8D`V~<3)E55r{yr~oE|F`v
z9eTO++_ebX(s92dnszfn!l&YIslN%5O&#IU#eayhps~geKY(QM!oOt6AEJ_9%vErO
z`f`4};5!?wVah3WA+vOw^8fkM0RL&-1vNh$oCU+_<$ru(DNVRi?-`fH|LOVYAp;F>
zCGz-DP&qCk>oAAdWR@)hlSO>Z?u}qf8oa#ML+l%5$ned%tGiW1Fxx_}52c$xpVP+Q
zmMldo<QR(9EhT@S01<JVls1DYdbwMWNpiMRrlvjAbccKJAnj3VmP@W2!Z)+C*AoEZ
zxB@(E!LZ=JyZ_g17lVqKaZkFZ7+-zKq(*LQME*S{|J0U14Gif%XhvYmKmRgi^Jw?S
z^>!pjpUdRl{_ZOQuwtv!wAKxjx0s;c40Dx;VBfvYZ-}}E?`zltl-YrQ^a}wG_}_sy
zWUu^_gt5dZfZllr!vD;G+^@zTZ0=V!p+Bq9?S2O#?AiUq8K!T}0)NUxFn*4q74KG*
zhBxhDl=d2fkRBPz{vwU@PrQ-$1b+Fxb~20*{5~iy<LIx>@$mGimvBFX*jRLX7a;18
zsO_A@c&Od1YnlO55%w5;V$Qi*ajpYH2$$hV2&xLOwhz%PBIv|Jy@bf~#Wd4n1Kkuv
z4`mt*8Fl~u@S@Aq&%eD0djU$QYkV=pMf1n9&;KZ{{m+QyXf%qni?%0Co{orFB|7<6
z1fS_pR|a6E{S?|h@#Ws`MHts(BvN|tNY9L;Vf<I>419@V;QHY2`rMd>sg6WthMqs0
z4DB<VhD7n=*BLNfgv|y~6``CCZLdRy$W*#hx*~xw-3R;zEtMT8I%yF4hV2J4JQqG>
zzQase90qtWk@*gtZ1I1bpNh{g<*EIjo*W=F4X{z=hE?^6rPN*E)zBd=n>qM3OZUdW
zU|a&J^m}#+U3fpitM|ykndW=Ldl%u&6zEU?omi2whiB=3gmUK3^>s=n-rQEgby2$P
zFvB?_f>0wAE&!uNs|a$u;-cDWR1YKuR;e<;Vm2VFkTCR6;m@l-i*kZo@f3gGjqJa<
z%t6SzbOHR7L_CqrU*eiKG3WxDu-P)*6B+ogNUST?1lh3vl^^`?xIIL)ZDA-<rUV_+
zP(y<VoF+g1)17c>M+iSw3*r%HTxd}dyaM5^CBQz)Us6S=GaUdSheQGBVAYl_hK_}B
z_(Kjf<l$Edy%>clPu6=0&&NSBx#RJ_^T_+c3EuC9x%ursmnbhH|I+YXQVRx<S5V5~
zZtg(1bQS<O<Sj13$&*09ft7P}F1%;qZ(jO_o&cxr{~SJp*DR#~ue@lv#PZK(nMc^t
zzK(gKD5%HIE(l9*?rXqPQG@_g1a_Q>iau=P9}#AdCXv)#__qxP@$Z0~OaxzI{J{CA
zAk6!}L%GO$#>~PK2#nm{06O^%6od}dzE(>R4TBFO@px9f_e*C5=<Zd~g9^w)7kTpX
zs#mzIkFsbOU?%wgP2k2^4ULFMTQ^JpJ<Ha~2XFP(fWarY+z@fNHw8uiZee;vOZjpA
z{OP>#=|@9DK3ea2^WO`6gb251CJw`V4h*769&HbL{JmBwkqg^JA_w))7IVo%0mpL@
z7-Hy9SNKu;^S_BsgavubodbFkf)EH9m7L@LykL;W{ipQzdo{s)?QFEa!Dro5{SnwT
zs18LHW{3#565Qf8)W~Q29oIff!Qg29@d+kv{pY`*2~8Yk7j5pG{w)xXfH9Q(M0uEL
zHG%W>egSILzk$*bbVcSa$36Y~11}-6=D%t3@V#Ge;c>6qAUbZmqPnmb7wI&<fS+D0
z69dD`bASI4JV7D^L$0yGzj{?tm*5GW83c&@eu50-3C7JB;0d&lkw}Vv|MoL7mE{($
z)$sR2A*G<1OyTd&;%5LU5}A}=$3iL~3tz_IXe#~vo2b)>PMX8tw*k_`?4xl+4uRJF
z{m5^KLAXek{f7Fa$QC0;->u>Ual?z>@f}Y;*6GVmdI!%)lh;S|=k?`c>o}d0sz}W9
zb$N%Z5pOkoc*?vlK~LUP{6gN~QzKp&D<@us&LQcL<Ia$7ZkK#`>(cV?S(QLYMv@1E
zl>QS<hkqY{h(G+k7-j!&G4=&M3XnltJAjPXQD6@Dy|6mqAnlq22fenNlgJ1t97{ZS
zpYx0H|InKz;D)Tpo<6$EJiJp`W6AW7e=t^L=(y?0zsxCeZRG=j(=L~t*>C0cBp+V$
z3T?QmsxOnY;i^kTa_9XKDd&v@!NlP$Mx)4R)Zb&0l*mulWleu~#zh*opBLXy2uReE
zngG_82m?39`-n^O=_riq))=DK1|OFhK5oiV($?RlVM4Yf{^_#+-?!xSeUm1J+EW^S
zhc!aQnrmWgjS!MF`u$qFV5j~cuu6piB7Of`lQs!h^FRde^Z-_MMn%Or9x%pe+cZ7f
zECyplxXYs6f^T?r&+`C9_N#U2f@;Uc$g2Q}c7WN2+B0ga&~MB>9R&ivs5<Z~+L2Dm
zZs<6=qz#`nUO^bI04zCW(f6R=v#0O?ep#6hIAa^1{AO3BAlIX}W6mWw*f1W;$YD<m
zN-8p=kT2LERNTbOVVE^&ms3y;+*r0#w?g=mGLqcPG|<OCrEwr^V3}R+X$Ie9R<EvL
z(b1$hA5ml;W-}trRY7*W9V7=}-~X0q`F-kBs6c|=>$gJFJZ*2o@3$%e+f*;BElA`|
zMFo6$K`>PlG?Yk9CO-)!oV&*MqNtd;`#CWJF(1vBwI7whO)iOSiZo`Uvb|~DIByow
zmwQl`=dQIv_q87lMHkhB>>D6;iAc(0rvyREsA#!p!GlF&<Np5&J)IuM7s7DVp186g
zJ}0mWSuot%vKvK!ghPR9p3_Ts8riXKusLj8eqQ6G#xtwC-?8y`@U617813dD(jP;j
zfITG=)0MO%E!&}!6bN}ksbFsOuJw=TK{wjP><VB+1j$1{Q!GVn$s}MRO(j~W1J4B!
z(E6-j-t;%_5SGK%aH3N9eGmN`Oa0$r)w?LbNif+57?$lwJH>j^nYZZbZq9;jfCP&O
zOiMlKhnc;5flf>)KQRkdD=&8>gd1__7mOS&TkC?OD1JAS*OM4Oc>H&Rw*vmoyU)Hw
z++CjGHZphNwsl>URsDTbQ9!Rds*5WU1}9LOTCk9_;s9r5eC5+!%%su^fJlN7cC_vX
zB*jB!vU08K&zS&WGTypm0?(GV*-_$iJ(BwLp&jHLyyUXhrX~6HIA(~^0craN$}71O
zKyC!oY`X)RKLf3x7KcnPCKGdNd_my}95(Vq5($@k@UA*_vS|X(xVXT%zTzgA2qU;M
zFQ7T`6g)u}ErOBkstDSM`|SMj2MAZX9qB9AeAhBfYeN7N1O<xf!F`;Hre+9a0PY>R
z?Q5}N;<&eSD=$%fc6{(I+<ANDq!Sf#$?cp5%2X_)GBW-bAt=psAFFWiWui^Cr)%l0
ze>^|kK}bM*djr5HFyMIRFe?0CvhzP$2!eF<S#NyR8wAVuwEWru5D0`Jm;4+cASD87
z{}5VhzLn2-U3%kgN^+UnX#qvokz=EY@iJS-y0|WI6eY-`9o9G9;x@Mkperl5Is*$S
z3N@v~a&^@)e_$Y5LFad7Wqj}`pM%oX9`Ejjyy-t(wi5z-Z4{%ff|E4`;m*JdObt?)
z4RY;nv{!4|11b2Yv5c7Rh!$#jh>KuiJp9TZ+90L#j41Z8fSU*Z+oVnBvIXZJ+io@c
z^wREO?G?xE{(}wss3^7r)s<D-kz7gb%#}T-iGB3~lP}UL1m|7&z)398&pJBypP||z
z*{IWQt#<uh=}6jd)7?uS&Y(nB_UX%m?f9$ryfx=}!t#2;nIC<At@UJ%IqHoH{`>0$
zM#GLfs@WEE&tx)G*uf<IZlmsm@1tXScoFWF(mTsT=~lAK7_;s1p0>w(GoEDKx%vtl
zfRoP*^Aj`0<rxlyJ&;MjYPRW0kxNj@)e|@=x82a%*qezC-`ki8_vP8I19kx;x<cYe
zY=F!!@W{r#e`zjMFI%g*DNQlkgCoG?n?|+A++2iO6L5R=v1+F>z&<%lK3(ZC8p?SI
zFjjtS{K)g4Ke;XPEj430&$ozKmJ15<BHpk@$HR-zy$|+kG<W7mZlfCv<TVdM%PYZn
zr0h$=4m3}HrYYyQ047BHrN4v15(VnYhYLNKtZ=^-kQQ7@xrWCP;(n9-7IQK1IhtUz
zU|QVGBqx;0H&V@hRqr2rYh${Z@hT3>X6ZPL6flA1V9{JhqOW3^L!maOy8{Brx|*+>
zUUsPQ!PDokUT)eGJ9oCa0pUzUahHPJZvuF{>fU)rOL#*ST+?EUt(*R{8$>L$Dxxdk
zLN~^0o1%77{CxA=Z-2D9>+m<%H3<~YgWazg*o1w?wHIt5bfGKTtyD4jab94V>*a?^
z3?k~}&i)4|DX$xYreSO^hr=&v9=xjQg>X&zE3ZkiPho8U_=1%?*OM8Ioh%stwBpED
zr*JRFXo%z6PaudlniV1ZoTm3zamM>^#}@nZSag2ywHktP4y}qeXI-duZ;10&@}M7<
zh%z{OEW{B?)xl@MT3^n#Li^*Q=ZAQjPOc1RJv{vug#k*-%G&<Bmoqc-oK}kz25f2g
z;AzGy3=hQT8mnl5R_D%teXz9Z;^S1d?Ox61&AE=ZVQY3CsI(=ke2jLRvmvSaok=gN
zdF3)x`tlX5BRYX%tIlXLT7jkg<F%ROGyceIe{3;ixInOld4E6^ND@J9QP`q|#FH(Y
zD_o3PO(A@WPJ%tQa91g1NULGXe-3IxZ)ZmpMa<+dhtItKx1|uPGhn!HG_%%jdcTzv
zl<If@TMO2JFcr0gza;EU70<q!<f-`_FFjbF{_xVlxQ}8!)546<WB5#cRWvc(ow1%+
zz@(>;PFGuK3TH!Ilx;YRVbf%NKr?jR=L<s+z{_m^yMCSD{RHck;WkAZpXN>2ee_+a
z3cjIT5b<BPz^uwwts;ZIhhQUb4ksimWC8RyUh)9Q|8?KPYyzOP?kjj4IXdWYZhrYu
z^0+|AZGp;-I||Qne=gBwS-m>?EH^J<J{SPjFCD$Ky7ABCEg8dvb1LPwQ|cq;AeHm~
z?J!bU5HnrYkjtoqH{_6gS!CYC>GP)Qf)Kv#*F+nk*eD2mzdQ*2HJc*X&BeikecTOw
zKIie<3qf(>$<N|rBD7F$t2=zRg(-e6`=##m@U1X~;}qpPUq2=-4;3fcTD|KSY6zr~
zNLR|$xCPH~Jsb?B$*w*&wZYsg5HKIeH>L}*7e>KCmyX!l8F99cn)-V7_U?Ji$i2e*
z@NXJqQ{~eu(7zw@k;6IvC3$Ay-?HtAz)j$^OqKedH!I=l*cD_yQt-eURci}hXXkwZ
zr#8WKq6Rq^Ug6$@FE`)VK*jEDD`5z$GyedMXrGiZoO0$)bqtMq*`WCuc6w&LPKssb
z{0gCi<U1sk%qW;YyE0Tcr^lhaHL$tRLl=KdIad#~op**p^mBXUQspI*%ac?$K?5K}
zt<<hs{2*A}i2<8R-mCwanNb<cCch7*f9X0?0^kFkdQWDAGTKd7-uGa-Tz}*FZ@=o_
zOF<!xqE>$R?yaP|5&GNzZuNFPctR!mc62!ZqsW=tJzEz%H3A;=3C<_Srj61nJ+LLT
z-~e53jrYOl?pJ|PX46cJvw?*H-(NT0B3_2;r+Dx9U`6x7J!ZYng|o4q;}?QwXMoiH
zV1FN8ioxb+w5T_t*VKIf11_fd_w}WV*Kk9geNdyN!u$nWlP>-^*6LGmg*J@UCm0PC
zNh^!CH1EKC1A}qNn&4mwFdPW7?Y8D4!s+0-@{oPUVp?;0H~+WAn9i18bOx(<iKyfI
zzwH7;VJ*owyZ`{XQRj@Jh_P$amr@1*-89%8DjeU`dUuzcFwt+mE49|U<J-$8an95f
zQH`0CS<;1OQw71;q+8I+(RNsV$5jF0rwJ0j*IKPnhEDr)<k`Uc2rY9w2!pDF?povH
zi+ETQ`*92Xd4|e$yB@h#?#9S)dk7D(+%FJ+QOU}+T#z>(b-7sMo_neOM~&F%1~d=C
zV8((-F(StrV|9P$ZCIV7;dE2zsw?jgFiw?O6*_a~@1Y`GL7Tnpha|@ZmiJfdi_^6p
zpZ=ND8`MKop8erRI=^C2NGko2=OzJnM#=T20-{<p72wNH!vvjYXy72;baS}2US|>p
z+d!NzPrt+AS}ns8)(*3FOGJgft{)zXhk|mpmPFeNyKk8n4-21}5rr}7`tJeV&<j-?
zlNzD?20c%Pt6~~y1}8>46(!3iKYJ?P*Z5-wP&8(Et;Wwb26A)*3L#DN!VQdw2dr(S
zOe`gnf^{2n7cAWYJ-?#TP2ha^a=JCz7=G02>hml*zbYcE`gQpwk9;F~4zkrm<@|0H
z+-G{jTLoL!z4txCZ!7K7>&1l^#i@MOrGoqh-qq84^9%|*^0Y0+7&qO{pc3)I*QhM+
z5B6_9FSbhm6jG~N&sH)Bnb|a?R~~bGu3q3Vmf&^_B~2bGzK8E8PZ3yYFI39?iQCvg
z?YyKWpH!?#t|75DmzI6OhJE>~z25TT76jr!T&LZ!2Uj;1GRwo@;rqqIS(=p^;SUT7
z+3MR8z}RmZ;n0bGWOO>T>G-Z$D`h^{pPLGA&0ai^QjRaTRhD=udb>ptIE(SNz-Q#U
z{@|=wFs*76q~_xppKb<p-+k9#Vw-IX(V>~Kl}j_>X{4UJ!DiI6RCyY6n0)~zaUq!$
zMTkd*(2DBM57${e&UJ>g`2P}hP5lPlb>~pXDwIs!>_+@Y#di{k+DRU^hd#Ad2^qdj
z@>F!v@0QiEd;eV7=ZVk7BnkS%CAJ*zW3Hp|he41MHYN!8ciY`qB^*7zA}le9LdR-z
z=|9_97aa`0m|8_U+KH*o4{s%NwgdaX`<qk@UU>bw_NE5r9WI-15ck!^>Fg*TIqq-%
z;%=Oqf%?%yH`9jBkvnvMI^39gtgh}8@x-{tqLTsnZFUR(xv?DlX<t0f?m^IwG1U*4
zsAHdr&_sI3U)>m9_USD`V$GaT_+0GGC-HhiL6%ky#>eq2@YB)tw1%Ay49y`JT*xzL
z@gwB(k%{EAemY-d!4g#Lqa4yzYHuxO;-5sq|H-}`k}M5$(Gt;xfH!qi`?O8>@{C7}
z+%q;^)ZSgfemMOqZsZV$!y;*JsMvZmC#QRRY2YC5GS#YdL?1K{Ukdu-8<L5Kj1ENK
z$@nj`L!TjI7)z3lYkG5*QM^(44;SE^25v?PZ98{L-s^_X=qj1&=NO8i_ov}>hI7pA
zrWe1_RAaDY;RyfjqKUmu7V2m(@sCgM`)MKxP49bpyuUdP=+j}W9;7=#aH(vl{(T9!
zxS<Xb;TC_dVpE&y8qW0nHcL9$wPXK#eGnU7bS-{QT(Z%v)g2@yLztPfiVI(I2VX?c
zVgQsJJvE4t;B<0iZ4OhSA#g4^#W!MEJVM&v?jaaJ@`6VPl38~fb200-S9K=KkjpAi
z$)|lIdRwBx6oEJU#s1{@s2RZy<P(Y5mu1tQGiYU)37R?6v}quhqPrgn7rHWR!_wSL
zwQ}|Zo=HOgCU$e*RGrk*z`ovKbp*F5lDRYd6zxpd(kUmUW`~xA#MXQl&3A9y04!os
z?T{G%n=No=a5|}{w;=WHAGGXI6>L#nX0XUekx$oSzxE(V1f^^|C_)=+i4{_>U#i@D
zlK~0J`9@JoZ&7)u6f$!7xqrUDdKqiH_uTB@3beq>HahU<uSkSGB;CB%pQEekZaTR<
zk#YX=tq_-U7nKe&cP>Uc<_n8u@L>uRinS<&)xxcX&vtvLn66~i)fPsgo7>~Xyd%7F
z%A3QOvo#ivf#p@ZIp#Cd;i;fv`i}Hr99%^=H@HO!Tzd%;R==Pnp8N06Ac>s^tS-N4
zPYB~_qW9`8Zx#`5^Mjjw5#s5AtyvERf}HN3ad9A#(m)|t>|S4Edu=V@QDG*8C-V85
zL0{Haa-!fnN18%>@AtPB!l6{zyTln0x$qc1l{()3m(rf9$q1q^9|q#d7KtWn)*&Gc
zftyH?3oCbW>hQixjR<+9ERzDaaPWj*&rc_NgX*KYFPiWeFB3JZaq9b~LFWj~Q6ly|
zbl%eYpo-B9qwuGo&Q#3SQXF@Y6=jg|2si6N+An(DX?Jg@e_p%;i}@{-%Qh=nwb&{v
z_*QD^;7dxm6xlSM6560MLY02sa!n@$8y^e#2)-2CDOI3W?_D6J_>{B^rQS7r5oyjy
zQg`*iDbm$F$k1rk8sOVTOQs?IQi7G`kkycSo$olJ_!6MNrSyvpO5`kCXt7R&A)-Um
zz*wW*-><%{ZqPiG0oouPW~=K|)wm5q<kEAFk0%O<>mzz{^m|%LHrvIt@`tRl%c##r
z6qi=<rz_`QQ=-g#5<`f_CXj^Ya7m+4k>q#Yjkpi%qtp^OJ9z1+o<xhi%jTtBx?t{d
z2Gz`)mt}-lInN+8Zz@r9cn)(!N4nawQUoW2$#Sx~N8cNq+fgHopfGeLgoKeWkNwf@
z<fL$9LPN_GQ&af!r#XLOAzyrMzDEVu!+bTONN%lr6ow)p6<U{R9|;8<*caM_2yzFh
zBS>$1K~x}});F8-8?8Bt;gY8A2%=G@4vwbJpjqCkw-mU>iYc}i=*Ef3!=%?)C&Tno
zo&3`$+@J{3o_PBB5N5|apM|CTk_gf#AT)WHllaSgiuMyg@qn!5R$;WTUo0Eh+A5{@
zm!o>O(Hdhz%x;}Qiy*hxm_?r{<tSzI%>vp(k0%2p`Ev7dM9##SBB;;T3>nSH*X}#d
zFluJ^8)=K1yzVd%8L7~@NcPt25|BjA^?uikc+r-6oqbf;g1k0TR?J7<$f@Xhr2UxJ
zgDVU2q~Bk9-(wz6h}X%29A0tZij0|LfM-}k<6~q}o+(}|_*w_HAxQZnx8KpNY~H`O
zpOh5pAPNDEn(4yc_P6X9mcVL=G*Hr!pxL0C9#%=Jp`?D}d*G-ZeAX?S-tA)*#!G+r
z(`y)Y9u*&07jgU7xWXFz?mrnHl*<}h{HEJMM)i!$ONXEKj-6k|hc>fc2)f`nWI{6h
zDO(T{e-A4WKBAeEg~#@g!h5PISOf)g)<Ko%8S6zd?YHnzy((x>r$i$b%cUb*yY;iA
zm)m7b%udKxGp@S_O?JKvo}TV{n%Nr3*~4TsSdf&yFGsB;C35CV-GfX=|8rib9FZKB
zSyh~%WtD-w$7niURbT`umu_K7FH(jmAFNF2eYM;Z$awHf1my!rp->oI5x{=kQ7v>$
zIPZz*joNRS`q3nq=rW$S=~$2fe+q4irTiXzABdV3%Y*DqCh|B#`9-&nmd|;|*{}-)
zi#U+2McAv9X9uTL2z^rbAd|Tw7ot?|R7xtfep8_Su?Qbk`r{Q=55f(P&ULc{1WKcx
z!7V1~{PaR-<#EwTWr2-*Xl(0Xd_nOm;wz+=+M{f%KflA?efLrn2aP`x(uJfI!j1n)
z7fdisjt{!`vZG~o&oTU2#b@L%E}Qyetgs1PrkY#}|FU8G-HjXbGIoAzr>+Sp96lEv
zOy1F(&5oMgH6c)(_SzAyQ0Lzvz8SM=!Yy1!c(xuw&;+gl)_Jl|zJt_3bn5u5+^UHo
z^qj6bHC}fGzjIrO_^rc9QaYi+@Qucf$M2-NBt-CD81<$4MD>I_c@+I^HJg`BV&SaD
zEWp^{9er|yl(3kX0If9DN!P1-#Q1yqa9(J_+lh0O6%SQ#T>Wt|=uQ=_vW=gP@yYJQ
z^|<e)gCCgG(QPSZlWz39!~HV2U;`tmO%R@xJLMPH=ab^bGoX{K-(0#gjAJvIQx20-
zsYv?fiArLIeu-)w4VRGrT%_l>r=0hTsPu?j!f?GX<%mpux3@vGCYRp;s-FD87bGcd
zPL{L3?k6Zas>UW~y>5ITHclF>q9N#gZ>mpz=J_M>&j5VQTB5rkB}Kuu+PMF#^E_sw
zpX$hF9jmwtTE?I(pV=$BZ>stB!qss#oc}Vhrk?6vJkJDssg-1_Ic{NX)fEP{6*)D+
zDgFfWC2GY~y;7(6=K`6>Tg;^jcGa%;SJ5peb!7>W>aR{PTl~*TLKh!(vRzZiZ0>Qi
zQ&dxKr3gijmD?WFzmHK9B*KHfiG=Lr(i^%?|5(_QApFEzdlc3Z;Fw^tT)$%eV;&Z|
z34?bufIV+B;w4I)kgw0p1Yud?51!&d5ihZ<rxf-tHi8V7A;g!t0l{UguyP{j!T4D^
zbS+sU7hirVOks*x!Rs4GvF{CBe1jW5*_X)TH^8Mc+`IMy8jVY}H20)9=@@<hHmczs
zvFLwh6j<+TlUvXW!f9@!WygDvM<u<Nu1rfR$-O1(<@a-0C{2=M?kii*t>(rDQZ@jj
zGz;8T#bPX}A_IM<ClP)RW^ti{&J^_F?em(+nwi9Q#9|tu&s?~o(^@LJz&+{rgzKHA
znGJ%;C9clG?_oW6;mXH_X9t7D)+-gdb5U&tYNXPh(K3O{)nh+D)T+qiy7)$zOyYJG
zSuRwt7vnUAF*h>lcT+EO3|+_UvMC)AJ`?|aFhfA?5u4>)do^Y!Tce~y+<~yEE3uG|
zSGezXuho!9&PY*WwOWfDCf0n^F=&xB!Ehc;E#r`;8Qq4N@v~Q7f_Eu(lT^;Qa6vR8
zzs~ZB^bcSL72$K#2owT$`&~n)YZp)n{T5IwFR8oov$R;gpJNmcSJ4jk-rI8|K;J(e
z-x2Ey%iidh-M-(Xh&Eu{Yt{EcHG<*B9@Yo^9=L_yi9){rW!V8g?Es>2o{C;X{KZZX
zII<E{AUE~ynET28yw(S=8K{jo9LK_IN*;Y?J#^)V8kF*BcgR<zInnt@Fjq0TdXI!h
ziM4OCx1$C|aL8w>;dO{JJOjP5+?9Bb9y`cwG}VIZgsvZxi-+8>j>PX*$WSSqUBEP_
z$g9oZ&)@>3JdZt?Bv$Q|H&ry#Tc;90-zyxmSQ#ENlY8bBgy%?x;pTSf>5os@#@243
z{UR}w)a;n-JvXbkZmZoUwqEof^q$<c*d6<zye7=?fddDCpR7T@&2z#bNo4LCRsE_n
z(avSqI8(GsYfz*wdkVBELCtD9xPU3IYqZZPUH?VMhkbb+Gn##%&ai&Yqa#5AhxkWm
zaV5vDa9tJ@z2v1wnAo}ivd+*B;h25)H}`k@V5Zg1mM0Su?Ae9D2!IpdbjUp~HZS}*
z$87}d5|;5VHk{hcp^S+X&m?yOgFi~|cntr<bZ?q8V>TJ(m6XSHuw_3Wxf*nt(C;D=
zGJlX|q5eIhXIzL-?4BpL;vWLjKK5DFhl4S#Urz=IJKgc;FR1F5s|~Wd;eD{WQTG~U
z4in!E<B5%_@asd_jQT0=y7<ELrF0ic@#Red#PnO<6{1G!mLE{f7RB@gsM==~ZIc#X
z>qk(cIx+ye`o1<l*CnO5ULK6SSZ`IURvD0UZNI#wqzc2Mz5S<Ap(KgSasj1kY6wMP
zeF2`sW*EL*@p-c4war!K>%V?>!wJwJN(D(~jksCba!mV7oBQrBCrQSwhD?f!C@4if
ziL~@*z4lC`*IVrUJ`IAK5A^2Av%cXJ#J$I4@T0G4ykAvT(>Nq-s%55bCePyb`sL0;
z15Wmsf=E?^8S-)-5~e+11gBm9MY8HIRh0=jl-A@K&0uv|8NcTf(1T$H?7Mu0?Qzcn
zU!<JC0=t+IhezSY&4oV0sff7T*|fT=+CXIb815OI8qnhwnW6A<M`S674zu-Latvd7
zPx-+zXQNB4qUA#OmL+QTK;L*26sTSM{^JWpR!jFfTEB_}P@j!#pjv;bkqYd6s<CH>
z<86zcIo6n%(l_3`amQ$wJBM-b<d)M!Xvf*#s1<^1__d-|9{Y=azKhDe+OpX%YIi7}
zNf&$|x$|>_UZ2LD;b)U4OEvio3%ZM?8Fx|L34U%$9|rz<LVHtq6MvkPBx_({an`}~
zy_v0bC=UEIPh@|LXV51~p-#S%$<^y(6T(2}oBXQz$n9E}qQ7?uDh~krv2Ev8Up3-A
zj$in24Nnh%wW$ZIB6-f*_UO_iA-|qjX&0X7G((Zb3RSRmSBqG}cie1?ouxfkFiQLk
zJg?>);}M<&dx1>~qdHY12B}!KOek){VKg?0?W0FV;0?YjgX5UZ${W_*1*Q{ejJM<0
z_vxEjZEKBHIasrPeZCo344XlV#NoTxnlb_%i8B_qevDMnKBV+|A-1MtN4HAp?oXm+
zdbyB<C|My>O({?O(O0*<`}{_VP?oM@q3=oXub5lTcQSAc%uzhb|B5ISed(3hiHxm?
zSV=XrPI(qD+LU8aPN}AUw7<m=`<U5(RBTPjUBzn>l`NkKJrz?fmRs^7`TiSTS+vp7
zn}LxUQGfBaGbnQ;ggf|-_>U1Vo<aoWCC~NF!^%9F0YV0etfN2@f|~b8R|u)_7~yrl
zdQVlV<z4{|djpa)Ccy)`L_-~Gn7h=fMV*}h>dt)u4KOlu=nz(`kXvIFV%W0pl5kg?
z(B66ERDx~EY&kDWrBtifQWv!}kRJ}M)@tjJnCHZ%Y%rX!nPFYL+!Vz<K%v*D2>r5x
zPv(BbB%<?Bl(q$9hi<~)mu*XH<5iJ0b;``7Sbl13=I$>~J`|845P*todzJgm2~@G<
zWtqH;n=;=mgLnuxJOxCV=;(yVaupopVSA*B4f#Aav$cg0Krc5^rzy*)CH<5a^r}8$
zlpWa!JT@FGX!H0kCS;55a2R0$`Y?}Mcc{y4DT2hbTU3jjUHR#k^$!Ql^NfbVe%Xra
zE0)-NPyE=hbQq^YQ+-j-2&zrb(8{Bo5n~XrIw9d7E_XhWTGp)hXlwu8x_Q<`g#-fD
zZR>aJr)`hlFZI>iH^IuhDW*Kz4jp3}{ysceKYSSo{X|ge(UdToiMyOCKfW5ANZ^ns
zV{d2uTgj~wy{Y(Mj-iX<Vd+0`3afRSLVU+@7c_8r%yE03bQS7a?>?Xq1l$ue;@Dc~
z@ilLGo}=@r?d=9V3PHNeCpCA`3_-w=^1^Hi2ib^o06x7fQ)u>jWSL~R?XZOZW|6S{
zxYevq@3$|Ztj226(Q4*Wovo0+;IU@lPwvdN#mz35J^p0=QNgzQ>4OCRSVDoILcCWc
z`+lB&Zgl;@-!yw&)4sBlyWxoG-2d*L?OLVmo{(0@jP4jQWx-+=9UN0Y$v%V&bnP$i
z6r-K?leg4H_q>e~Bgf0qY-ri5zs6adAeb(L>eYKpb!K*ELT{6$V+)IB%z^rS2VFCQ
z^nQ@WE{WbVmFQu6%s^BQk3ddPJ>UwjznTaqyi8RXt2afCs<j8CoD7I}9u|Y*uLW^1
zAzfPl(UmFODFrjyVn}Ew)Pm@xT{*QMV6MTA7O&lIyNF7JbVI-9`_}*0F*x1(6~VkB
zm*06%1!?HZko$SCk#4mepNLk`RFl?i{XI*B_NZAzt8LdwmOt2KNq3!-x`a|qO8BV>
zir=L@KjrIrQ}bpm>@!u>)mYktVa)n9=-0_4!d^{^V&+Ynn8K#45m*uSdZ^uki-7Lu
zFF9a^$6Xu;YHircw`F??47TOnET^OWP^51%|Kuqi!iTMVO;Y3xO)hgWsR`<xpElK`
zg^|jKZk%eL)Rle$N`lK+1912RuHs-N5&C_SeSdmh=f&QH8gCq&@3M!_p%SFoc8W)%
zw^nHf#L^p;RyyU!+lx3$#o2>T#p=_9H1MtYBR3)n&hxm(c9Rfxe7O6#5_{$_r^mx@
z9kb7NDAUgP*Ikq7u2<b@d1mhcb(bDgu{0t@{mfRa%CUfXN2KbuteTJXLU%fq-J^h*
zr%#2=#-6<jb7Ucv?Ql$Fr0K?{lwHLW?U}=0rDW$2k(yNK*PK}xPqCqOcV)`UF591{
z8riw!9#~|(g3s#3|9oBy!x$0dZCYpyqRG}Mr$EDc2;ur&$w3<4KQIa@qyM|Let&t%
z8_GGpfi*7G74XhZyMv?rZ7jD5sFbZ13*F88-l&OelSs7=ovcV~a_GfB3WZzsF87;E
znM9w=17D3FSl6eixqIcmN|oAeVLt1|og$go`88;@B2E7I8joEC&Epo(KptmLK*a$x
z&rW4BXkboIeA42kmbg73dLp747GgSu8hdfw=&Tq1rkKMRHQhzRM3WyzR0`2RmnNL<
zht#VL8V!Jss8y1%pIm|R&H51nWFj;I{nnpB!S(@F9@33Y&SURxd#FS%=892&VGI=A
zTI}nn3xrFxmpSt&2(4z$fmoZ&vDr6XB?$^hO@Ty-C@yxRu(0x$Vr8}o?6--YmaU?=
z0m6j0tN2=ETDdmFRTv9*lGNE3cij~&ulHdna46XMn$=DewkrR^7nvnH8?9Uwtb?bf
z{dkEPaYhG(@wEe;Gmu=H6tyF<H};`Fr(0-H_^uQ)P$X%Bn*IG8k7d!BzD#}RO(%-m
zd2pBaVw!O@`kcGmPptNVmj}8&0j<T2{ijFk|8K4S`|fv$HYP;t@849>n>C`D9r@WL
ziUg1eYWM@9LT0dHk!R`}_Q<*Hljg_bO<pBwDF`U6AQut6m4#m*<`;Xh)9&MvPnhZU
z{V#7+92Z4NR;5e)-2^>m)-iW=BF#G5Mwb1lKH&Rs>cid)D&Vbj|K@rT{o_-PIT9J@
zL+Dex(+!T}d*5dWsIds5F)(p5gzQ%fNt$$Lm<clL2-%64p|+V;Dhv^h@oVwl@KY8h
z8|#HOU}($B_Wt0O7jy0LnpMP3+G4J(xe^JJYS8p7E)bg>)*84J4dZa#dnUmVn%g7J
z&H#H{S)&>nz>F0xAbJ_(_6l>`cIa&p`J|d=59^47Ca4sj87Z4GF^6?mVvt1|q*`;`
zp&fnZ!QF8iI{>bP)2bHPTzWp(W5cd#iylYvRm>{N{TAu>*kQ3j?3zG>sfNJrqZMj_
zTwV3j!B5aV=ZJew7Z~yO!j(It2l$IVn`Y(Ov!xOf<Yt4SdW^bPT+k5A;)jH0!~f$P
z{U0mlf(2;22nKHO6qyT1K6b&N2=jDuVw7kj>JWxx%Tv;nPkw~bBC7#)a&@*#NNW@!
zVuvgr^56OYG4>|lRBzGycu7U6IEcz{aE^J5WQZj5EL6raWQYt0T|<P(6d5v4Wr}2$
zoRmt&%oHl6%<~)~^<P_cuY14W-}m`HPtSes<#vYs*?aA^-u1rkQcn_nG!QT(Hw4Af
z*=6Xyj*35XUmQt5rt>4mf_a9dMVtoYnpkS}J+x-=*<?G2020JDMHp6eLE}*vx(@OD
zSEqWZda~jiN5LRQcw>F#whhl{0#u9j#wVRe)YweMxdnW0cXQI^81WGE8uaR0uf%<p
zGmC|eQ&)%zp%QEJqcKR)`^Pt{uE!1+5J93MYjUdad~vnz1KmcC7F*_53ul(%YEEge
z1iDR9tdp<dd@2EP4ehg{IW>AUdExK{D`OI=kL#O!7Yah<gjnIC<bja=WhwV?MN!ys
z*b2%O7KP43!}P9gQY+{Cy7b^tJ^xxY&MVNv_k6yO%6b#{g?_JkZ3wMSYI<uyYi=nn
zy%KuLW>q5x<+D#<>146#a1i8N2IEElzW@In+cuvGb|+8`&iK^fiJCG42WzWv1XA+S
zJyLQ#uUrbFC<jnO(uHEr;nB}G4wSQc54nBem-luL*0B=`e{ai#_!2~bmFnNg$Q~A_
z@B#{m{Dre+kxb2=<kv(}2Vzls^X_oXQ~tnxUq3)QsA95D$jZCpbYcc6Y3rE}c-7b%
z`FwYIa^-ebD&3L-5%0ZUF)lgz(>`}-?ox<$r*eg~6uNvG;vVKFccKXxx*FcGMq|oh
z8uq@yxR!^#X~v<i!YSLlK3wSII&l?OjXifn-p;)?r_oon<C-@sGr1+qN!ZcEqIB%q
zs^g>=%e$x}5p`6;DVHiwejPmcc+B>RW}n|7`E4p9cN$F9(m9FL;;KUD%4M2<JelDk
z_E*e?TJ&6cd;vP8w8mrCp5LKYo^>!)F_RRJ6NfJ6kCNn2oRL~~jm)TuALn1=6e{r}
zF63RGl%|wU-PM9E&h;k6>f`@$5VV5e3DFD`&^08|hOtrk4Orm}l(Uz%Jp)ooHe~eV
z0W5^-NNW~amdKg|Ql86YOb<P4=d7zj9&$s$-~8q)b^~0$gTB27BBo_9_6I<6Z1<zU
zk$}~ah~ekruP-otbyI&J5TNtg5DDklJ}2A*BDCZ4l~M%preXf7-rBOr++dj9ya$fo
zWg7A!J&az(M@<owu}NhUyEmY@UFX^XJhP@tb*eawC10y(h=B86L+GUIXz}YE)wcCs
zjO{gp_SxfBz4IL0U-3?%arA28CkZ#n2_@Rh5)Q}_^`cR7`WM{7TvFPHMwQVKuvim_
z71T5uD$iQiSV?!z_n3`vj{H!pVtYn6Z6FsXvA``*Uj!J#C(P>#ReLo|xo(aY3tzp)
z0AFpcve^A~rB+Q-)ma_Pr(VBP^ZJ0?M(#+6)2c7LFirflV+c2L&TQ0$y-9E7VbPTJ
zZr=gS-1^FmI5rcf`T>uB5P^TH%U@Io^q^E?;ku6+ZX45QErKr1q-iPXNe|9~RfhYX
zVN9<lPn9LB;tob!JUxHTROLX&Cyz-!wtCFQ^i+@6vpx@?n%SYsz3CK(+6JI6#|T2&
zHuji0<a%PtuimR*4*>f~;I!nW^fvI-sD5VaIVh!cu2EdB8vE$4IHsw`(3@#U;ov-|
zO2Qj_X)0d0b{1tB&aEZ|vd9S3Xef#lfNr;7sj{=diG6;#P{{&!jt~;r6r3LWKrip$
zsG50bhTq!jkxA5=bL}oM#`u1O&T-YN2@{5+7D4!s`S%>;=Wq|Z^vOJ1@$^waO?+k1
z7Zs{`9(zVFYkOV=r4Kvah#;U+a2Au)%P%YO^RPgqbJR>Fc%avETDBU8q?ODnt{;2$
zT*++^pzn2GITsDZdxGnHJ%xx1_t50odjiE7nuNLttx>q%i<5o7%kA*q&`gndTV~T$
z=iv@T(+E)iN`H8MU!CtZVaEgoe*lc&(lY@G^WN`uQ9*47g(fyepnbQm)b70uRsiu>
znyy>|c;H`|*}ra-?V5<YVr{XqgN*#(u6>7V%4P1>kd3`08nFW_mo8&>^EkKsT*FKA
z59L?7nTd5uy6N<M<iTgS#B;{JtOFDBQp@p_N5I0Hyyx+*<Wj!<Gg4^+6OgJ@nu~xA
zo1_n9PxyIgt`V^Wf32gxu^VNBV;sRdFu{9pE*_AB^U~1}nHn3Fs!9K9)RW)OXM+Dh
z2#5>~=|7LTu0*;F&HY4BZE|T8A-ib(aBhNql=VwZxhUR1m&bHHi0AV0<Jxl1-BUYM
z%evrMYMD=biV%JSbfB;r6R1ItB22}lioEOIS`x6BAe9QCjbkr1xUZ4IyPNfS=b_kb
z@;wM_H@z8bV;-6yQZ6U_<RtBvgW14|Iigyz0=_%?=^`z9{6@=OV^gR3<p(CuVJI4%
znwyR(_1N`SeQdZg1suup?un~f3n9DqmhSrNh)}zykD(UcsjvOsX+EdB%SPHB&?@l3
z<DwRC`>Qur?N+^u>xiBi4x#1AK5AUxD`P@A7c$>f?KsyMI(>=ie>}4Hd$Z*KZ~?ZQ
zKVNzhJo{Y74iE82E$CF)Q_TSz=Fojue&R9hw0BB&ELK<|7qs`J)e}X8LViTesgQ6J
zmz0WzhSi3q>aS9C90L#e1k`Zo<bR?<XDvRwH2vYY@kjUNj2!V`UK)Mc?yuY#)GEc!
zDM!d{hF&0Uk%3>loVFR^aH6nP_<bx#Ef?-7uiA;!s=^7io?M;!302wC#KX@(E9I*p
zAGG~wvF#GMZ!^k3-Rod_=>HK%8fX!d?-ED+Zc>83Puz=9==neDQ%*laa7386JG4={
zq%YO(<I>>e#>a7!N1|e-X+vqhPKfClk!S|HrN<lnCPFHNvs@Rdo@zf$HafSY7^wv!
zu83jo!O_Fiw9>&NTphQQBcKb(29#J#>E*ZFO;?m6jifx58rI&|L-sDNm{o~$(=nn0
zdZFBHh-zcghlKwiGXHDpoQa_82SXM6N!(RlKyccwR6->|^ih;$sQsCl2mbin!ftaH
zc{+J7s7Bgiimk)6uMRc#Y(&~d7-iz#s3<mm9R;N25$iH1Kt3KQ_i$*77y3VQ4@7f3
zgRgg`Y|tU0=FkvflyC{sPf0de1|FqS!THu>rvvvN1B_qI=Es%yWkS%V7KMIf$-r$+
zvO6N%<wy20OZQ=t+==`<TMFT-%f9lVT%p>BT0@M*ifzQN`{g7+ip4YLwR6PKUMY>;
z|L14<iFkm|r7p=+avt1XOU32apbrVYFqD~`XP@J}iU)E5;L{ei>BKF9oZL-AkzYO}
z<#I=Q0Di2aANam}<GI4pq$kMjK0SR$=+}1XfkyKL&vHis^gN3r&#Q+^0q!jz2V8~|
zh#Rdb8g*$)1LA-Ft1KmC(j*vltd<fff@v@J6fl+!#qgX<1O#AY-!AbZDArzs%Eb~D
zB0*M=eBFK;#N{9w6p$%*5fD8~NDeIr>%y{K#yh*EfGVk1+NB|x^@f=afFvSfh0V-T
z2?Hza-=TEt&dv;#=_Zw)*-c5Ha)V5v^U?k_==^ueU4)t@+lWXIdImLJZAfnVpO<K0
zM0?*#ARc&<|L;SB?Sk!Lktn!v8tX@j9(H_18X$6sz;BRu#H$_pRfYeD$o&5Z|2|3E
zNSIqmfi$c5861IyVF=5mjvm8bT;0=3e=xI%)9{u<?UsVHTfg6bmtE#2x@`n6J_(ee
zA$J*Tf2o!%CS*AQvpxh77&buo%~JXQe=U(+6QhTa#^4L+ND_cBwbwQPfER_VXj|Ez
z6>6YDi)O(V551><FPg=j7l>@V2((c{a|%GTzJt5#>oGT%{eK4_LJS;kqmPG1cKTv)
zg64G;8Rz+}HbBuN2ot?BxPYHQ-g@iaZ@MRC5q!Tdy#M2CS79P@6kQSPnpgX$K;(hn
zoMc$?J(2M8vVE@I6u#R?;KhW%!}In#pn|G30fjHIHAV8(f#0BsF9Y^7ey+no2oXrd
z1tqip{aw(TDy91POWAAR{ut^#;u_-VxS*UWaPH5ip}az(PrX|4Q@id}pfxjtW!g{D
zwUt>1@B;~t510t#c+Om~kVeMyd&3SL1o(#dh`D*!e=(agY@bKF%~!HVmCNJnzkMt7
zez`MJ^KvusAEw@)#yVw#n6E?Cw>t1E*e*1P`ePK#(kBMZh5tSiFJae^3a10>|KDF`
zh}{=zvVPz=@_iAw^WKc<(Oc%ffOp#R?9$mkx1L2fEM0SPmKB*?>Wd=NSIywYs9zQ?
z?mQ9?Fq)#bG4Jm<9B}~c%~$*-yZ*B`1LU#Vq;X10Zw9{)Dk@${b}|l-k9@mH_JB{Q
zl>>i`W}>y!p3EO72!W~e;!Bb)Ksm$|(;|%9DgHbb3Wy}p+jCsV|K2mdzl;qBCC$sM
z?bG!9R-|#Btjo4!o}E?lRg9Ns?N)G392h>&sYB$yUgfj=Hs1rIBwLr73c=ETNp*Mr
z?5*Dil!nr`$X+TqqPLn^PAWl4tURKgYap^nF-=NLb=|j}qtx@%?5`#Am4h$fD7|%Q
zDNn&sV6CwE69fLNx=bo{Q2=ZCWT+=m&T;tDcE`Q$cG`Lnb;G(|){8q^9WnAEIA|tN
zZ9b<V0bdODD%m(NKo;V7dA2a1ePni_1CBL%^53h_x1F?F&M{Zmuy3$Oh|1_*<o##&
zdaNi8*Bo3>A|2v*U;UP2`opQf-_P~}d`dBf-`|3S6&<@pV3;NWWSRs>0$uxSeQweP
zbnSlspmUb=-wTgbA@yUez<r;i4(OefdlP*`vcPM>dD5x1{p#fW`*Z%%#NxQ-UrW7^
zgyOG>eu~)Y(8?Dx*Ufi~FeJsz*Z>K<BNLIQ5nXYp8<(&TLLlO*w&z%r{re3;j>Aeh
z)J?7M#=No`q6nvYX?l_ih77a*c_Lcl{nGE#u7vjMkXy&$Ur(7*p9HA61*qv^HuC)H
zv;pA!!woQ_6ZCVih+IP-t)_o3$DeImqA25c+omg9pUSY7%lLNAeKFEfkWtMB4-ow<
z{qr;_SxkP@VsMIZ{Scm7@B8&hgLr+R*_DX!faVL#tq9+bG^VBwlwpB{OQ)f-?fri?
znMJA(<2xFf(im%Y%-1rzcb&Z*8VPB~*ev{H=+7~SP5yP(u7|l*mi=m=^yc5Z8v)ll
zSLR2{{LPQ7RYWT4@X79VEZ>pGKcGx`mkjekb~XhBoW{O}Jt^nD5-<7hBk$XXgTq2=
z*<Idsz+TGKTE0*{qL;&3zG97(YQXi^0{EPSkd(at*MG=trv%>!U-5w;wt=$}rq!ff
zaKn*c=YFd86`KbpNO_hcF+dB<inG}dvljjICHJ<Rgl)CppsD|B8~=%m+kB%~v8I|%
ztQC}B@4RV^>8l{~QVEx0+w3#Qe1Tx5Q{mb6`q%gLAw%rGk%y(NA%^qNSX2YL!QPuS
zzv(iU@>jsUIZz*3I=R4=eEBdfV1#aa;eiWZ4ISZrH}U4X^Y-tz{k@4~6v$|f`!$6d
zSLbrDeW1?Opjwx^Tp7>1xju48!b9f3q_g>4eV3?LfqHSvEDupp`JarjZmIzKumAkm
zoOZGwL}qos)TvuDfb1k+R67?nB#f?ae0Az#6f*NCf+Q@x<d#`mDTo^^VO)az#?s8k
z4-|)vX7+A$ZYNdogZGNM+Zz9$U6-zoHKsD+Gb|UutZGp@Vj|iVFNM}eWu3luuB68L
zz~<KoP{7)fF#CRlSzl(W-0U}yP9>s{6zG4JDly!bHtrDuUo5trbQm(ous$N*19-#B
z#6F-ox2(<&s`nHahqCho(!E8ptb=!ZvegpI5cOX&R8^2Ve=@-;HQ4rEe-U^11_5_)
zHa?S;2y85qr5Gqg>4TqelF6Yvlyl~=^M8Vl)=|=+&)kPVJ#pE|$cu7~W}(sa*O7_{
zLP^bL!g_!N<S|E)rfRqkQ71N=pKG&^Y<m&`eoI0<u(9Vo`G!_#<dr5>&=Y{XZnVH>
zvcY9?8b1+v>P6O|tk4YV9tSvLu~Zs>cpgV~NAK4Pv@>oF@5&I4njAh@-n;k=Y+72t
zRpicOm*$7>eGIraozK)y{fD3X_wMkWqT(ztyuG1nD(85~p>4qxS0nxHN$JDAo6(~L
zHN(n-g_&TPbx6zY-kDYX{NcF>^`*h}SbpQnME9rQfo79t1pr1fa0#83=2@UNH*BI(
z#0N4GCo@YCsT>q&)h<_UzJ3MAnBk|4^GHkpgDxD}9<5BgZ36j~$g~mR(JM%Z4OX6(
zI?^=1{b*)#3J&l;XY%ixJU~%KHUwf*)>QAUocnXFYNF@t_IinAGmg?;y=LT99hJ`b
z>+%qWL;US&<->g>ltFA*sy5iQ&9@E-KV-(j?C}UamA!QQBEUklK3jk5^+hnLjT-)(
z@mkKV6a2?46~Y-3t}+N)wTb{xt?v1Je9EykS-eWyY{%~XWk-wmW-Y_$fK4^oFzoTS
zgz<DN<uN$)>E-6lbXzZ1NEt`SXgK^-tH}*dL=q2m3X__V)6~CJlr4i|VC1`iNy+H1
zW8mw8YEt|3rQO@fhG@2vdmzYDwV7=`=mU}ML$nuAXPUu2DPr1`O(<38a|SsN`xWR$
zD1GRoJ?#oiBD%bjwc;@X7d`=Fvn?4698v5zisVLT)819MEeth|fQ#KC!^D?BW>Ekk
zo9jdfW)uRxZGA*%^$dluhdBixv5E(6`pdP9RjHJhKr`rCtGxX7L2P^;w57z053+AZ
zaq+)*Qh+*^k;*6tw`ZMV`1OPiExL?&BpXg_rUO_*c#0OGB7?Bs^w|&=Ck!D}*sO+Z
z5xSz6uJj4+0OOX1H~EIg)w=v_onLd^uaV1Ww8@25kImp5Fj*2n<;P$5C>QX5O8^Io
z)8C2O^->;>Ww;KbZrz|&ldmNC^aN1jN76(27c3CxI8h@-A}h?f;4fTpc#nzd)8VhK
zD<075xQL|FMi<x@c_`so?Xk$vXaD!*uzmM-)(Wxx-_*9<)0Cd&xBaDubwPR8{*Wc$
zu?$i83Ypi%LxefWwirn!OA-ltHW8UxP67Yk5<+kJkFlQ!<Ld{j|9*SaqOgfnQn)K2
zG2MReMuM1=S;skC`<aJlB~-Q5V8U5ghd0na?_0NlP*<`P$mz8d_?LkU#tZj`I-TF@
zopa~Ew6-)p0*=ZpftTv?Pp$NUhN}e_WS6SGFQ^K8F56|S1P$nd4xr_!`(jAf&K_Cp
z6<G4jeR#${CDZTRtHGUQ?U2^M?hWN~8mr91BGy_Zah*&gc!h1nn!)$1E^c0T4W=ae
zw5ts}Uw*r-DdmRv{$!caKIgt`X#GTd_?RBUEEQUvayxz1H_){CtHy}EIMAO2v!Ip0
z(w?PCxA=Xrjn35@1p0?-FJH>Qt%<|bWwV<SFgU6OC_^Pe*Xru&0lyso@VIbP3@lrU
zYc#%T5p3W~#SM36id8{NEz?W?n+t#E{=ogE8h~G99JuoBh3KT=OP~K-7e{69iow3~
zI9h^tF_0xaL$yxXuP2*STeSKAMQFn#J{l6&3Wd5Irnme`(mN1X><HB5jqWoucx-<~
zd;x*cGl&1W)do0#oe5lDFRE-JBz(bwvi%m5)RUYWqF3I>gW6B>>;CA91$gGifr}7C
z96(q-wFkcAlEyK;%r`H5cSwz9DoK)mNH;N|vKv1iUE?Kj1X}Y!;tLIIe1U>qVSY0F
zXbbJFe8I3aM@DO#=F`*WX{r!;_vOK3{x+JxI}Ve1Aj>=xnlml37_b@iP%<HQ1OkZ(
zmE&ILsoH5E9gYAS^Rf4fe2^x5PW-4L0~1NY-Jy@Pqp<1gE&@lCwdLzOTnfcV=~-g-
zibgW~B(kKmfeVmRVccXA_Nwj8{fq{0@<&QQd*O0m@LfNIMl}fHW+t8!x%ZDAT#1d5
z7vDAB|NQy=^SL1Rw1oKgE>%OJ&hvNMS8uASR&JJBk=d)P<9ZJOoxNU`4|d46dI8<g
z5=9&6tcU{9^u{|7%DD>g5HzkozWnwz@IjlWAP^-a?%IgOLOj8frTtyi{;oVS6J(s&
zDjc2F7jL#3w^A$VO=Wn>r~Ukm9Q$&jvai<IycJL(f$dUb(qPYNW-{B65!i8{0f3eL
z>aBb<6Q7<{9+JN1HZ~dBH3ylY5KMfIrWL@=ht()!7{Ec*q;|ZU*W_t0hS#pQh^2@*
z*ccIhKpgq%Mfv49C<>1RTOi>%lz3Jx#{3)@dL>R4Z@?{;dwYO`i=pPpn8Yi+R!~nC
z>maxix?v$K5|2Bic6>nxpZsW9x7r@Tz5y;zZJ5y%M+6U#qfhE|CHns>d`r>)xEu3<
z0p{<)@T&qL!T_NgoD@u<0WqL^v$XX->wwFrrWl^MGtAy}AQ8DR0ve988JqkJkob3R
zJpAhK#IQJdZgixET2n;g{we&~5F1IT?QbM5c>FKA!p-2#Pf{E9`SQCGyzEX5TGCrW
zjUiq^vje@GDS?bG)C9p3twIhUp^tm<{o7&-ED5jE&Gbi*3ntR=1@q$`I>pLSVoqB;
zpb=O?;Wm;@F%k9no$1L*pu<by+^Niwfb=0;PA@>H%;$95|9ImYm<w!yg0M;)|FmZU
zDSb%K8YIrtSM$Ab1M}1+$XX!Q@s&Pk+m$Sza6R?{*=7rvB_4Ku)~6LEC27imJ=Fns
z^Vu?H3|QV300v`Psq`R4&fbLNn0z!Nh!#`5HX!~ejGmzY`R?t^I@L$T%ZB_K&pcQ5
zPsvh1=5*mrt7;!zZ{yFya6fXK{SHyMRVf5|K@@Lxlf$6+2YCLygS?h)vxo|UgrDxv
zM~O--DR2i>BCiuw3^Rn*p?c|(J%1{t&GUSBjx6)=O9Z<Z7CBCGJr#Zx@`_67>{x~g
z)w;jAD$lRTx)07V!JIiBFbDv#yf?l=FYc8-$aj*oo3)>)f@RAo5Qy>IDGisZB4tU`
zd~E+w4Y~su4b@f_!VoAm-Br5c1jm4szes~U0fQt*0D#*6u~3c=bAh;1cLFrGOhsa1
zwvc{VLb9;`<Mjvoo=NLQ@@SV7NJvN`!5VFn?tKkLB>5Mk%iJuNKyKuu6BBydpWW1-
zi4cH9U}0vtu=n=z*k3`{8L2EFw#CVI$)mE+%u-K|;Uu}z{ZJo4wDML9NFa+kRE6}M
zRDVt1>+!BE{02g&X5dWQ1NVg5;|O?OC{HpKH8IMwXCuB@dL^i)EkI~+vABmx%=Sxl
zfP$@>;<)rAAnKs%mOb;4YnPT4htd!})gan)pXu)C3dLKGp6f25ZTW+#gyi2V??{$8
z(S0N5zGEvHkGg=bMXfZMS7%e&r(em(AHpIaEmZ79kjhl7rlDE|iO%!$&}k)tHN`bX
z&uWV9G8-K>j%c}X2Ew}gVA%0$m$;&^wFu7$71kRduQ#yAHg<o0d4q4ee<hRq5LKS0
z71zy%HYo*GntbB{ieP<E`&~kE&O4;1fZ!viPPnLCjx%`zsdy-VG^OgmkxC>XXcyI&
zeF)8iBxs3z+mDX3p|&b!$G5jIz1MD<D0hPUQQYY}r9-VaMSP(Il$_1bpE5Iv$_qD3
zjp8B?qe;Qf-;%?7M>^Q{6g*0{``(v_3Cnf<eCxVSSr1%Qw&G`xS+`>~j53d*#!0qn
za3+#cSi~Oexe6eQf|X%<!N482@e^w?J>PW>X{OgA(^wlSLvC{?^$~=eGZ=dOr@D#Q
zPlBNzTf`b>QkQFz@3SjD)oW`FNva!lyH`%<DChEZ4zqyv!02D-2vH52N5Z^vI>=7G
zaA;wd;Bhj!7#{|?CBYvMw?sg_cnmbcB@A*nk$0f8zct{slxU>&I!?d=YIxg6kr@gF
zap~DegD0$B>&%tS!-($)@ux+zaU`CU)+m8OuN;wNjjpe)f%IieJiO)r#vAUrmXP)L
z6;2tn-=Z7bhae$)l->qR<;%qS%rlr2V5f>`)UY0p4ds8rncV~(IQ;^i)~Jdxm_a*N
z#(I{t11fA;@j9{f6W}G;453_i37<a;%_R*Fcf!R;`euE?*Wy|7U?o4ZPF&6So80Kp
zUP4Etr)qpuyyf!LyMue>^-<4GC=Q8h6eh36%^0ST|JCSa18q{%o8|AD<Aq)k!b_SQ
z`dyzZCqDxFlXfr3BkYoe+8GM^vVvCQM!^1>=L#zfb-C(tRmb>-=jS8#6)%;11`m3S
zPffw5O<jEBS=e`Lh#_mmu-47yC*;EV7yc4_c0hSdUOo;A)3F&w;Voh%xJv<b0<a;d
z|MhI?%L!<S7f9oq(mH<O3#$;xXwq1uWbS?@139;3HYoVGq?%)2SV22C8&*iXX&O?V
z$H^y?T_Ktn%!`k6$`3+7!WZRToqn(XJU3(a8lvE_Mxji!ZL;Poz@kFPy`y;&;Z3xK
z!~DT9ID>@;ElI}>3wQ`Cy$Efl4tAtDiI4QZ(D-N=NGT#<81q=}s7d<n{8xNR9w1m#
z2c@{#lbEC^9XnPhT|hnfSfVccZJcr~W#49>;Zb&H;MyXA#h}<M0NT4D_!9Ec*^uZ?
zlS<p)m5%CVV?uCL`zSCU!kz0QWoD5?dlEw0r%*}7%edV<kKMRBEO1Ot*Rfxr-&NaZ
z_+GPxuY*`M@gVP^F!o$3>;jP)wTUU{9M0Y&Ds3A%s1?R2DO%?ES}MY@Vdyv&xg)Yg
zSp*06t}&))o#>utHWvz=@2<CuUpz(_Qerz5U5U%=TweeqtAi_vq)tyR&}5B<k5__e
zz(D#3*&CUr8YnHK{_q0#pS18iX$bYpw;+)6FKENUaqQH@9x6UiAU!^+dHYv(dl)h&
zNXxj%Xn41a>$wrVaXaJ=Jb{Ce(u~)?k~ang-Qn6>T=n$C%b6KqI&lAm{ddyc1Y>Q|
zh_sj-riBlz%pdd{`8u^-BA2xbq=Hq7l`*1?xr+Qe4}QYr@)5X8=AJ%;;`b~?beE|d
z$$oORK2~N!w}B}oCqvZU8#i>)MV-vA$H;Hu9GxVlS6i;n^bwsr$9k3_5S21qO^$i+
z`IYJk*7=au@NT+t#A@NW+RXn^fMDq~o3-(z&#7-<-?QuuF(?sE%<7-?y-n(d(I%?e
z3t|TS^8w?io38wDOi4q++e|39-QvRZ5H{9C+n`a5g!_qFv1{vL>1!3^pPqU4X4RH;
zZM5f|EA;OH1FhojAC(_-T(Y5%HEn8XvUPA30Mf!EPKBnXU}X}HV2thC2QiNO3C7Vo
zqUFw%-&ILxe!eb_r#HIe3Kw)qc=GoDK$JC<O2+^;o2t$a+_BR)iX%eJpKjhA|NKHT
zg8gOgA!Kum!{_U1tTKs**tJ^Pb2xXsYl{|44RoGGILU~nAS2OaSLp8<hKr8p$dNGD
zUTJKsJ9K5}ShmqwnzuvYo)U1Ghm91TXRp18MwL3naqzs++D%~4HaZI!Lr9HuYxuBe
zIwv-4lE3hLZi28?BHSf)B40zO*CGAlJryV?hM`oxgUKFhPtVB{9IjPn6(CaL4FMjA
zCxVxpIe!<E8<Bo|l3)&Zk7l1B!6>pT+SsIe|1);e+eXGX(ny`q?(~hg4=n(d<u|Zz
z%oFx^KATF-TYIxxZP_9D-JmT)aDH`AmrGa;4l?X;>U^<)b4aHNb<Q$H!Q3A%KoEyn
zu!+a=)cCi=zE`*gr?W?AIZQ^IZ_CGTFpJv7qv5+=NfXD!nZ;Aw`od@j*}tTAPxD(@
zB-J?2czXZ#@iEOdc%xt?FBF1da3}u2y*(7BddElFhfv=%&NQ-P0Q2$Su7Ah5Jw86t
zk{xXp-p~C<1GZ^vZfBr~rFSQ(Sj)V=w^p~B^H;;D^YmT0ozfmMElD08qERN;2tJa<
z(YD77`XM^e?CM7${k!b+q+#q4Cgx4yg_D*P8|?+V=f*jSferFC#+YjjzKX1Af?2%<
z&>N_intS~1^|#aM^j70P@s3Ja-&xaXtg_?N;mCSi<tbJ~Qh*Vz-5laUz$N6$e#RPV
zp*Y10(i9_P<d6vLeEhL1;_n-tvF8kNJA=_&t;h$sy=FZyU+-HN3XaNo<}=$N>=j~%
zs55w$*SY|ie<cyF%<i@KsQkzE5#p8?AkPvdIL_w*9`?;UErwHRw8c22hOk#XDr~g#
z`^Rd0zB}|6<XKt7ml}w5#z0)jjnsNw*hl5jm}*n#{LuQ5Za}C+9Hz=$uVmmz3S$b3
z)%eEQ@N<}V#95)~4Hy}H5fXShmu7b<P1JDd3cIAgwG3B^+;%`)hm>xiL>Hmj{{E`M
zWJ6frA~WWJ%J0Y9egtSIMM@WzKR-9W_p-R#SONxm8{}}zSc}!wFeF5InRGwYF9wSO
z8gZ9~FQ>It2!`TdX_Oz&UQ5AeRMD;^SV|aHQLRYdy#`2WYyM86B^#A2P_WIa9*{BR
zi!-_cCpNJoL$Tf^Yxh^oGe{MB={)7Lx%R6>njJoEF0RVDEBMeEE`d`wNBBD87&C@@
z3Tu=wvQvP!WCO;O4PpT&_2JgRa32py$HMxepPCVI^vJfP8Q|_K8q(i}Dt@GTOmI2F
z-L#uukII68Em1W)obo|eh05}P^y<ZG)sz9IV@7Zh4c83v-oI-(_xYv5&skpa6Uxca
z4PNqol;m%prdLi?4KWOcb95bw%m^zjV<*2}eP^H#H%5r7JW%_&k*iTZ<aTs>)1ju}
zW7(}yt#?{402_e%$zVlc8nA@KLEB(f_0aiFLjXs|lVLj?Rbr*fb7UD|8bSLgk~*@+
z&b}Tm@6L6!O)9)KOiU{P`IUT`k>c5UQhbtp7{7$(S$VW6a8nt{t!DmpU~C@=ULBi2
zCh%<cWZZe7<EuZ7oh7S7LtCO}&*8mF4lx6G$_;mo_DAs%=<!(G@?navlve`W;g}{U
zDh%qDFsn*N*vL~Q!^0CDAOKo~q9hT-XZ?r_EM%;bNrjV{PdW{k`omR0t4167aGvoT
zi@tj_SMVm#;w-;eH58@V43tu{?0ceY3b)9T6OGQUEl(T1DF8mGCGbLhd)$wSK?2x<
z#J`^TeWAF^PJ5Fly7P}v4-TJjz(^4@8gViUgs)EfufZcU?9d8UnSA4sGF-a=rk2OS
zO3R6p9i*G(Rwv#=7oY{)lpXR8kn2Oz$xQbJADLumM0dFzcbK>nG)&|1LvP<AM}jYr
zW)r{#&!pGBllH>uw-cXo^VH9z*DcBg8m&>$F~Cv^5Bc__#w<i4V-L!}sy}+3{imT1
zcRI&*`IYwv)UG%MoAkWWF<R1*<jGZV{Gt?j*rC7=iLoZ4`!wDS=seF`l2Yxz4}=@I
zQpgaa!|iMt^vCxA?Py0ZYb}jeOgPZi5z2`SK$QO~tl$&(7Rv*EsX{ufSCF~k?hl1L
z%gSaqbzk!sOu-<_qfnozXH{0%KJ5#7Dlr4t2!4hO+$%boOc<&|nEN7ib3^h)4vp@<
zY~ma@8J=F;H+=Pn(~paTKllQHGy3r<L~8N`9ukdpT3rIa2OKi9L_|Owft2UgH-q!0
zy<k&0KAxS~+ShSy#x+;{L5KCJNtddbV~Sp@mWGy|U$cbmD<_mPam7)_CM3+{L{Pt`
zG?#_E=bmX|R9pcv_<0R|0uLc}U|t&FiaT{*(!V1?@5$vzy-gtEH{gF)jMPs$^z#=R
zw<l=nx~Woy_*fz|NvneZvp5>NO|L)id&sQLH<xoo2Z6721i{TWj+(<uM6%x9y$D}9
zIKdfME-lRo!d)f%fe~(YL&9Y&LiZ=CMM5qUDl2>Kc`%C==CyY7nrC)w6wafb3r*Vy
zuXxzfZ2!ranSlawHkHT<S!c)4(?7m1YPOxf4=`msWRj9!zN5MiIhUo2t_fOwe&NfN
z-#A?}IE9NG&S5R43A+@RexD{SYgOn&R)aU6wYRq{SMVv+t&)E!-o)62@5um5xB9Y<
zYp9tbKl?2veHF@5rqS2tkI(fb3ahjLfeTaux1~h!%ccTr0B?ky7vqV{QJp;VCZUuj
zP7gF&ld2{pX}ad0*fo;rSERAaFgm1i-eVv5R~WbTUxiNOUk>_*wgTPNlw_EdoCB;{
ze#*PV)haxTt}By0PQ6%O<|D&GNp|FmUV|C4oOFOAN09<7=@tY(k_G#iV*SfAsc|Uc
z<H6{GpU-wKL}miqkS-=iInSZyfsspxyuyBonJCKBROW;y2!m^G`h!pE7zFk_ecKT)
zWZn$ZTH?;6wZg>dEPBD`olFh+nSf}es>6ss5n=a*q4=I57q<E%&(FZ5DzK?pEa|hQ
zafq=~zEuj<il+&CM~IwU<y1-N=p<p;sj}2MA0eEPFJ9>b&#a8K@#$p%EYF@iB5D`p
zuJ5Ik!zvW-Z_s*XDVg=<)WW)H9#}_Og7h}kHd)C0Chc?lshA5yA^;m*4SY@diKD>j
zr{{Q~d{qGR=qDI&4$56H(NMU`{NzI4s3+X03TFIofh+mYixAg65^km{QV3Vjk>H<w
zS?g|4=EQcL8Es7&bW{F*nwB4;XF%o3!!6d7+el6gF!@Ubw*fN`0PKoU@D30pb^JPi
zpQdIPN17nVEm3sI-fDdCqJD$Y@B4*tEMP{Vq(jyNc-D8t=86W{>)@+!h<s09R$*l=
z-Ha&V;-TaA$*A#qBIQh~`^X_Q<<KC`U3_Qbydp^mM1@^VGf<Vy2!$jNDqC>Et%zz4
z6MYE%szYjQf%MNk0JaTF&rGJJc|1IP7^eW%Q$iR&B_+Hf$42CwEz-WgjE$p6>kxUj
z;m~7XvU0OgMV*5|ap%w!sQHx2I;YT6HG_V8`2}Hifnl?&iIbBA9k?++J|uY2XOuo9
zE^1EEkG)n=e5cF|b24(K0`bdFvH38Gv$v9KG(%2yZD2bhYB&;YwT{{Kwl}9pK5p;=
z7ft)41Rp}2oHS@Cl53AsXSqubeiXSm0*(I@ZC{?wuvy)SU+enBuU~>2@kn7KZ-u~~
z=6DFRf8tYQXk6tGDNef^>H<?-O{cqK4-&)4T~7F{kVZsahxRjjOMiPE{(?WV$#Iw!
z?9}}IG&@qd^i;Jpi?u)*Z)!3&>kg~ZfP;&1W1jS^Y2T|j&O@6qf(de&&R3Bp@BmM+
ze&(2i{nsYNo-phg%sVx?zC3M9?EMZ}&@z8YjJ=XkiQ^Yp)os}0RF5S?y`~Yi$yOG@
z6H0m#por5OS!W12&<YAio@}GT1R<EvXM2ma5Vb~@D3A9!-XMal$BDwO9yBh^-^(M|
zLgYU16>E*|U&ZWg)aIaS6Wr@qr_llpdQ3=#>f*9tTxtG?4uy3N?Rt3fhaQg-nzTui
zS~F@+V^%6M-cMD16uOMO2vD+Pmd*7TYh@QBXF-@!{tKC_%5QiBpgH2RWB(kZ>;PA=
ztxGBbfV*=UT-r}Ljh-LiBezK&#$WDwI#e;lE8vu15=S`!BY1>3Qr*(=6gtSHJl0Kx
z)rxp_oYy19Ujs^+qnK|w$n0ah&qDHQ7A<9p{7twv8|-MQf3-S(VS6&~$yTN4<$DKj
zizSfpOBt1$yL6V@xB$LHBjPJaWfE|(pLR$#@RZ!C{-eC$fl3DVMj#srK-(|mFMfRD
znhoHZlhKk^|9I~i2FKGkDTFYKIBu!WE7Xe>h0%G)K$dAxjg^C0{9eW6rdA$?3<I{x
zso)oSH!4~ko|`&+*k;o}MoaPJij1N@FGOS+0k^TBNrRJpYQTX%{^PZ~?>V)&8XACY
z0^I?7y+wM&{(j8eM>vlS=yE^u7v&az0W&VOg<2j360FGyP95?yWFp2qLb2XAUTHq1
z7FmJXH3`^+bi|RI4WNe)_WAb>2tJBF66Md6`v%E~tH7S4ViITrm!bQj|0c^{*>s>C
zC*A2D<)bpAjM??AHYFfoR;ZqVn&V-If#>p+jWK^9Q*iQA*3zY~K$-#$;WA_{s&3EO
zolj2w(EgsvH`8%3#4_4BuF~Z;M@aeJS2sjE@Nnxs@vLv&TJ}cpZa*2hr<%zjD!1Y*
zVj8`>a!(4b5Z_EPQsP=psVvnxX|Mp*H=JvkH3d8uN(gd0%ii(Q!qZTc=}?1W<u4=i
zTCJVIgzB)1yjH2u4XEXm$Q8qPynto2mggW&0>MO-pym`OCxQ{nG$7F00B~(RA1M_s
zcQ~Z?a%X0+&6}La;e|mO8@FJ7mn&K#an6h;;XH(Q-{2P9!RT}5%a`P&WnC&24egj@
z0uA-S?588rUTZGEp7U$>607eZIx5D#E4%<R(MA9&3`*z*XR$$lrBlT=Gp^$&q@gFN
zu>Uj$&j@;dCsia40#yK^qbCpyoIwmmwM$#=i2TLzW-vSImCy{K-D~K*a<esuwc|Uf
zo`<o9`b>~O7|-^*X6+${As3?!Q3v-(*V`Z}HZ^c~9()p4Gz%8NyfZ~c%%~N~`ou}?
z$W~^6GG@+$ETo8oDcxHkQS?e;amFJ*!93%8@`I#8Ux0`NVq(7<X^rDT8-Gckjc5p*
zb^TT``4DJ)oajdRr8}1SL=~ZB4kp`ShH+hA*cd}&bt3%5Ax;}~epG%lm-h2}Jpluj
zI=4o@65=*Xf7=kUGgCTF6QKfCf&{O~-uq&BMI5<0K;xJ46`_Vz<~-{CeYO2Gr@}*M
zupp0D-X{v{*$4hI@fgo)-&j4SI}|k#J7nep4Usze^6{w#aph|6;8S4dX-=+C9czeu
zX41~vs}cxM<f`}D^IDpXo|{ljPWD<8msGBCsXJsv8eZroN}T81)mreUnQnO)+H*R|
zZqPV35rW2q`)B&#ZAwO&?kab5pSX<sKZM-I8qzk(#QPt$W=<1WzzStR`5{fWd(%73
zLX&5_flSZckmK55AO1cw`AkkVkQ(m`uB7N~5pmP(-)`htJ?+}Q!^}z`$fH^)ixxt-
z6v-)B<`!I<v6tVcS5)6DW_gdIDiw8W(u*~lG2jPsbi4R%G2;_732Yrfa-`$@#3m*B
zyY$@T65my-$%8R8!#H0}E#-y`F*bSKcW=Q^Q1e{c-KwW1{Xh`I<B&%*>fUf0pYKTq
zYJh@l1VThl=JfW_)@L%?M_rPJqf1X!htxB~&u+Cb-_#Avc?{IFwV>qX4>|M!?rM@A
zuE;iI`U}W{RDfBqM8v8*D=WDi?uQ2~c>c%*udgxM-_KK8)dnbA3=Yc-GxdzuSk(J|
ztIgr^Q*n+|nt-hS(a8P9UZ~NU9x2%CdZ!DCD@Z|$8Z~RX6Z=^CJx$*K1onLb*tQ(Q
z``-H4hAiubwC?1ftD3whS+?kI*xN&I_4jetzI+Cj{Fa@W%4cy#ti`vDg5HdRpCz@5
z3&-{@e{$-O?7LFup&%xA=D#u0GT8wSgClXh?%~!DLwR%CKx)mMsNl~fj9x*X_c$+D
zgGcuroUbe`#FlGIeKL{8G@Dr~{f!raI*7#Z;)air>J&XsFf-UUpH`)e7>|lDn;)nh
zfp%S)*;8CP*SUh^VH&n#<&vzb_dt}2<<&PMzXVsl)dWEHcE`y-H~?D*b5!X&kd_>9
z)77kW%(X{yaVlpnU=qierwu&0@!D-L1mI%6zU<y$*CE0Hjh1p5o-PSdw4huyc;zzv
z{w)Q);0xV+gJUtKU4*dY6dL1^gU#Qemt#595UCK(_x@Z=H#U&DR)>p;HzA(exUk|O
zG|=_0z9Epm4(L1-;ix7XtJ{}ft#Eeyp-McBE=CCY6<)FJJnuR;c(h~1CcQ!0w0GQS
z!dqkcn#23DM&SKt1%)Q3<KF+TbtdrZF;YXkE_6365C}6gD^>j(t3^f{MYR^BC@tp}
zK=n;2C*goCpTBVPVW2}aQ#F8LT2n7@=t9o2J;FQGBz!PJU7rx3R)vDAVgllD0-cVw
zuoUyupjZ#VWTbn*^`vc84elgc9_Usc4&%|*W~GY;n^{XaqjXo*i^Y(a)8*@HNMmq~
zcn*E5ySHwb`-qWXY8t+%beso3QD0?<fRBtye~9RsAtOMX;H$^}r7VW0!v@>9{2Vn<
zj>sN69$d*uCie3R-{F8Cz9%{U#Z=a2_o=A&T=8Q92y9&c2BSIku=9&Pw5fVYtI53H
zh+bfQ`qVq2#p|$JdGiL)b!a3|^6gD>{v^7)D5xVEEJ6JhyEm%la0rFYtw>Tul;*+}
z*)&HnCY?s6w8VBV)~I|)Z8N(Sieq>T7eS<p(OV!#@b2acwsc{q6^A0Bl>QV!dfuGx
z8)CoynC9Zyl0tHxAUSnFWUqDuv#8%*ZOtzq1s>9aozz{e8tGI0)Q^4!ta=q|wMk9F
zdJh1k_C<GVwfnzoeSauo8&a|(n9SIZyv)iuh~P|_d#J(8CatZeBM!)VSog%afTfM%
z_Exb;NTo1<Nxvcog~#801sOTiKOL(Ry0W!@N^efN^8Kd}<!-4O^(E}N=_SF(kWofn
z(`xZWxKV^XNn;Re#^EotxP^`<H1FgQqlgm+k#UdBV||`2ua2lG*BV&b@}@5dZNUfK
z)v#mtE>%fV`-gqT?54Gsa=#{ABtj;07oOLXHS4=M!CrnfMF!Mqfwb#n9or%7vY@N7
zOkWKWiQi-pSRM`dq1b^fKcEacxLwLWrh-FzxU_<!`G$d<REI=Y^FfH!j3l{aULxy8
zx99e(KieT>J(DbN38~>f9R4lKZE**(&szgo&Rf?cGe+i~^j!+-mjO8(`HEq!XbI03
z2QIHWP=<3>tkF!YwBLO;2p2ST{eg=7RLbhNZ|x?v!X4R*r`q9~{`vHj4W1+c%NLKT
zk;XA`emD@b_+wkqrUxWm4twa{?d<p_voyhe)nnJ?uWmA27LX9++DHBFf5aRAn|UNM
zK%*S-&16y-@sLw#KRd><0K&{$({+PA(rIiBe)RJaXwyfrMm%U&Tx`jj)ih1#ch=9;
zJ+t0c0UPKC>OGzGY*hJsj{x3u3*MUKgJbO8%`daL;*{dZHS{TM^uW1qpG7DYUf{Gf
zq3`EB!QwQupPCvqizC=caznfb@?+ErLu|U=u&L@@v0Xr8k5D8n;h%H*A@k~pEmI0E
z7NzKaq!RyIvxPE5CO+`C_Q4gUHyOv+F#;?T<gU8bsILqnP#n)vhn#aQW4-N6V$%9L
zroah*^(j3)JaUwL9wJeFymf;#h`<jM*wub+{h?B-%z4EmFzLkiyOW#TbIrxbJH9KF
zzj6g16tl@ex+CUWL>1Z^{9cyXD<7BQ)u-XUY?LgU#<};h>UT#$_#oI()FdbJnaXZ`
zVKNj4#5BsbgaIF!a<ZYM3XWSjiU%}O4tSK-COU1dG<JhDeQbjI>=4_$jdDn2=X1j)
zp!w6VqDTb^-fztmy6&(7WECf0U`dV(l*h>bSvdHG+#Q*!AkXR`k1}~VNBO0@M>o$j
z3t1-XlJ;Xf?U&&nOm?A~9}AqXT|H>FXalQ0gjPfRnUvbEAtt|{_z%F!93{3qer&`M
z(sceA@*{1sr049Dk6R+=E^HE_*4>fwkLy`6Yk7632XDV-qHxDTRFl#i;=wq&+fziw
zf6;}1znDJyVQA5zKS~{VnQbJ}fZ2HxI)tzfa{>eJF?en61xjCCf9WUuNd?j-{n4Pn
z>r0UZ=qcF--uD4~JUQ3V_~TpR%>VR@2WxJ%7aI<Q!ttR*SZ%EGB-Ss_{{Y=9l=wAh
zljQF4DoeHt@)9}eb^U5AnC&@ttXOYG&^z!XgFEeo&i%b9VQt0WhhN0DEVj(uAu`fn
zPr~@idva3tD<lF0ImehjcQXt0x3qB)4|+MeWSLJ!-lqT1Ke^POkg7<taz`BAoQRmy
z_Ok~{x5`$4uSP1(@LO5%D?Hz!@Wnc5^2=QZb&z+&xFQeTh~ZJm#+=Rl1>6hx0wP!0
z<wa*^d&(>7xfVJY2rX9x7(Hc~;ShTR6)rk${q=(_PjQbAI>fyev0Kz5z}<ezu)U9(
zSGicNB%P_ntNrdN@!I&t%`YlbM(*z#wrFsPy+(Sbt!+x^soB%jRQl=C`Y9GjBY-DR
z&0h*g;P0cm&G#vKyaJD7Ht&Pqfo_P?*NAonJJHy&<JG1hjz-Mai14*zMWxW1-|&`&
zgPwAT%iix5Mc3fb;$Rh6PIdfIHf?SXq5`Z}!8F<8mTRlw9Ldb?8zz&@Bd1?l$VnyI
zijKw>TKTwcGFxRXlZLP@bp(wQzLN7Sbhw2~ZK@ZCSbZ_Ht8=I2`I*Q^wkUQeX;DUB
zqSH+M&(#7V5!jPUSgix3!ulRE5<qpL-|Z(i_)cSIO_&AY+}N**Es8~pq4@TO*V+D_
zGaIRsU!{!uUw!Pq(1qt#xw-EO%0K*(6qEx09<9+F%N!)miA3koMBol4foJrK-{PwX
zJ(8IN_>y@omb|?dg8wWiq`iV@6Nq{TAG6-CA?Ja$IeCZI)P>@g=7Dk!uf=fhwgg}J
zwt5~y*TGr|^QR*F(;hhmK{98=2JigarZkeJ$g<_I&U0_CjOTjDJ5qdOsf$hcCm6;g
zfFvg#=20af?>aDsr6-R?&L^cH2Py^6<9G?ziFoM!CIM=YxG>zLXn94AW;4xzTSA9Z
zvPEJ`>aOA^6H&w-4dyucyCc@9vyY0$2@ZDsmhVOE@hzHh!7QzMX{yOj-Du#5`8aSG
z9yuL~n*qib_rtEmk%Jbb1_(-zt!I!7aZ^A8rZkDslkcDl>JBaxEl@=?1B28Ar?yA<
z9SAVa_nd`Js+#lW68p%+qL;&1-`uiUIK@ctEHldTZ!*GI{PI4BbL`!`o8m}2zf7<d
zqdV)gq8lT?HA2-a<d&|_s%Cio%iP_E&eM>xA}A2|I7}IxTba-C=HWujAQEoY1>D63
z73X`qO8eZ9`MU_mJ0)*mCe|h*5~vxN-DR77efcW`h%kX2*4Y&Oi+uL`v|vesYot6J
zbK3Mf^$x2Jhm+1vrjZr6MBZoKeEto3sQg^_G-bVCX<_6A9E)`WLpNJz*`eq>j+^eM
z?iRrp<$Abb`3TnSFgaq*Iz5B(u}#FTS1>HYfATV*RhBRPc3J}Y!V*%`*a4$r!|{h;
z9#VY*io5ZtsH)A_a5o+uWNK{OmVy}YGHhf8hw*{sDB7>qA>|o9OcG~)-D|$GED=gm
z)Cyy48m15xW`71<+cWH6tnn`mI>F}!)K=<{QV&{|pmA4~0y-j>{%`@VPCsmR1pdxQ
z%klm)*2k&Wf4G-Id3h`R157LszP37dEBw?i?L!0u#Dm89^ke@-ddOj^9DnY5UV6o;
zRw_5dLu5A)^MBp&T4SW0;}dMLCVu(?k%dO<jtg8%UG-}$Ag2RzX1BNC5!i05q^M8<
zkUu5<U3ZT5tA|F!PB-!OOiz6n?;MKeRUneznTa(dqXOh9-=_VpZ|;TXRgYo;*GEUd
zusREL>IQ!I0MD%EOjN}SX@gms&f(jko9p?v8R8!mCqw}nrtK%4G~@f%Rx+{zBLOns
z%lJIY8>B%`OY1{64-oc*ub?cmA^qgR5i!O2^&c}F3r^QhYW`(m<wmmE@Jo^D-YMOB
zyl&t}_vw2MOlUMuh$0e!cRR?0F`xW|d_<rt)erJ3cHR3hL^#|A5KpxaFfpU9530y}
z1p@YLVXmMd=Xe{I{t6(Ii^$Q6N5KwJLR~kY^|aa71`;mg1F5iIU&Rv^&f@klw#~5m
z>sA`#*e=C+r=mXZ%(w3nwA#1p*!_4;w$c5X3j`G;8QXFsrQ-BbePN#&bK&(PW9wxb
z8}Da!M{15N$ELz}y^eBrM%c73F%`c%ydmBF>GFGp0ix4+pqneh9#Vs>9n~rUpsES>
zCXrMrJ_UUBX5hLVadU+y*9--!WlJ7(*nFpLV0j_9c7X9AhrxQS-Ij1f2=(c482=!w
zcD64%N8q{MhXZ?qKflg*erfmX+ZmwRFv|7oJU}2r8CB0%pQp=5qM07rC}?v_)}DYc
zT83GLXw!_ST0K_hBU?fIVh$1MN&F<xlK;vW%d@*<m^{R=2s4R92&>{?CEg5e%jmFi
z!CzYMJ17aolct((`CzP?SjGV+$u+h}9v`|Ki`vtXfh`~$ygrpe-g~y|)6X~xc@~S1
z0ZOV%4asgTi@N?AEY24EyAQPrX<iD-l=Z(AbIZMhIYAbutvjNI7Y_bj*yi28@pRo}
z<$agbf&PA1qQoc>97^-s-nTgeG2}0=>#SH3_=9Jq#4pcxT||m&lRYh*R?Pix>opdK
zd%lEveYaPS7d*w9h6yXSXCwKlAw!bpah>GNb*da4lFSqXLL<J^qs?L(MxWoxl6{be
zW>(!Zf*DoKP+XW5^62`a5p-y>8(=OcGG4<l>{lm%u#pW6lz=BO6I9!&D7RDPTqMD)
z>dt-6vHA6Ze*KeH6|2@4jeEPR2eVvfVq_KL#}ghQO}F_sD1ED`I5YEZ^9_BiJ$W@m
zGx*w`<K7RHN#(G~KBUqf5^^%66R|JMLiu9f0=in6<T*p+h-yri4^@Lh*#cCAH(L82
zCnM9_OkQ9zUIFfBqEj0@z^0|Cccq!r?Je*M@aMJSi&pX<!0wML6nauh7Uc8KAKmS~
zf=-%u@-(E=nIbrMo})cMe>r{g*D~sui_;X3Qw2>61iHWUi)lyf3z8Tf(uuwuCV?lR
zRfjbUVPZO_K`M9mJgMUgeHboU`k<fC#D_F5tPYZ(DlUDw>xji2(|dKP4ai+<1Q4HN
zjFMMHR0W>7ysR{Hm>}#RzqTh_pEd+Ok&HZb#Ebjsfr??&zZU1$q6(7O)JPFSJjP1S
zIO_Gj$P@p)xEIwP`qa~{Z)?(Mq1~m$$}D97zsO?8`fBC|PkO_@)1LZ_h(ch`|N3$h
zc=p($8>~T;H61BbgkRq}a=dj4ID4Pk99I!oc}UFh?#j<|`j==IVcbyyNF^jktJ=WJ
zo0_Sk3fTaQz}%FiTkj7>KvyIXnY9qAG_x*{eE?VZCe$2JOOyUct%V#VY?r}cd^U`0
z>PseU3rb%PEPN`dTP16pXrKQ66CvPRME$LtlH-DQ8xV4>dl_{9elEKFzGC-N%L=5c
z)-{XjReJP@gN3odGLO&#y;szlhTYQ;ylo(<v{j<XeHsFw)!Fl=0qq_@eLIg_XfIt0
zH*5o+s@V;@)wwSta3Gpi8RieL-Zdg|(uB^+VSf1<zpNpE!{hODhd`Mm4)bdWMD<#e
zMewaUOsWqgl&_uly`U8qc_m;2>~nZZPJ(+M5h((i;ptf*<Qo~kl=cyt61wvZ(d?20
zV8|xG_}3Q?%Q{tP-+3QXfvvGXru13I-8}tZ@YYRt&{wP{wc-t<_Otnq$uWL;C3Qym
zzYY<lZzkzLjr6sib&^jNCCgIy1!ajn+#DpV5jWvT*jVR2gq_ktiN;7C61KcmdNLZ!
zNRqCU>sw!dj({;}!=`SwNrIP^2~04%`SbRTRxpfDaNPh#dGi@nzL>->REi9yNa^Ur
zSa6Ii(r$Mex<wFg{HqlAydOBHHur%T74eyh(^D_5_vl+O-r(f!$A%rTR}acnY7+fx
z#;6NIdTEEk5qFJKU|OSuuQ?>;Q3m&`ZMr4$r8I*6uqBLYR@F5LKxa4(KKfV5<q6_H
z=PpR*#P+y~`u?0juYQfkOcw@!-qG4qkeR6q=V|lIKDZ&1?8cFwce~#<_}T;W?{8J8
za@bLT>9Wqvhds2D#fPmqgr~q7jVrVAx_x3H2u*lNN%NYorLzRT*yhzK__ZLA5Hw!3
z?FaKv6vi!#e}V!rOQTDk8%?UTcq@PT0kF$ORX-z}MGvBB*Df{6q8FI}h@s%t$}J5Y
zirLXS`ewoS&QI*+C#LB|tRw1Y!Qd*4Z<f30+elIpm^BLDw<V522V<U|H8vzP)9(Qq
zp#^(|$?5H-YZ{3{$w7#auiKMb1W(V43bt`oo#8XuE$3WG0o(kD<K5etGla>;z|?}@
zzw@2IewJLH7x?*(ie_z2KT14MPWr9T^5rZt-M!n%uRXYw`9=dym>-oJwEmm<E2l69
z{V?O+S_6hvj*bGeN)T9cuU9EpUc+49k8{G(_mdej-@Jzhtn+p7^Nj<dK=yO2J0|u9
zrV@5J#}+eQ1L5HyxkdE=8eJ+zgXrzM=G);KIB}d+a+;Pk8%;a1D{T?or}q(J-3Sb*
zzr6Yr<@0ezn>5v9v(3rejYwFMZD*h}CZcyKhVj^C-PUya3h?KUdUcPhKq#mZF%U|7
z%|gXPqr!QnD|YipKt$TthRc0QgM!0FIOd9N{#*8n519o<3Fa-|NsyCLR?bcS{i`9x
z7x?UATMJE`a>N-PZpv|7aAtwDve9<peS!Td1cw-Oq<jTo(~P}C_md<^kS=AOf9)Zw
zWYb;y8OFZ%;w!)#%=G5@oiE$KPBO{W%is(7KoTsVSY9e>ia-yhTj%x%66Q0C#SsCg
zJ5+e`Ee`S-3}xl9X3<q=14MRW-i;ajM-Z#T#J$h!tcm;ZL)&^#k3l2+d5tTew6|~G
zu|z4`J@2)F9@WIsis0kNtKW^FoC0jwvfeqq+9%~pH9+UxiUu&L_N88YS>&X8=FoT<
zbS~rcHb+~6Z_*yuP?a;sFLhtI?&UBAbDpA?FfnjZ<bcc2>YGNHyrkc8bu+*glkVZ0
zLPHA8W^?}pTr5cf>9u<rTDQzY|B4E7S7~{BaAWmnnpCLR_S+Xkm7fod=uAYu-H^Mx
zj6#;D`2(h48D+B1q>B*DlVHk0C)V+lnxB?=jNVxxs6|#tS+3oDLArexR~qLHvW#S^
zxYW?{odJ9=%YOiM-V*eJt=8V>-=P`bC`CPVscXdQB+|Bk;jr)o)|hVmzT<Nzxc(Xs
zvb=#}VM^~C=!z0f?-$4AtwATbr@bfB2|9y`KmdGvZxu*=4j|+`JNpy0cG)k_-?G}x
z(t?J$`<+x2l;@?5E$1t^UN)UqSar0Cw^U7`<HyFj=ZthgFR*8uflFi$13<rcp@vNo
z^dIRNt{c6SQd+E=jg&<J=Nj51`slX$`;V{1xX^%DV?M_}>B{!OY1N7vXlU{cSgF1D
zVinrFsgqEy!VWKThT09(n9p?8nL$%HVN1mYg3|ZMKQOnDuV$yyII&{4Jr@jAZFs`C
zpBu~o9O~Lo*VY*<<u1i+z`&?y9)87W@FGMH3GL(AnrSn{u3jp&MLk~M1Jl4Sc(yP1
z{#j#Jy+U%+fh3q4-5l|N$7TwAlWt0{zFhRlJ0NuN#WwIY?Qeg8<Vv@%gCHR46Th~2
zpTE+*VPLqNKYjN`1^N8`wl+{T?YmMu2%XnPC7YWSSAfvdrgFW`RlAR}&KOdeR$bc@
z=Lqx4A`iK7B(Cz<c-A?XDp-(Ll~=lTqkUD!^1CXWk+3WOR#2_nbrqLyIQ%Pq%iWD4
z|Lilmr=NJUZMBN8+=rua#Ubjbw`WE?U8R82AYY^=1B(|w2`i<%3<<LoGrVdp*=K$~
z;MvHC_!=sUrvuN?63F4jk2eQ`fJ`z$+a9v0<1h-@x~`2f5G?vX>G#7l`O8dhpM7t&
zO}Us9?EMMTK^?RkuM)=JmEK3(nZx{C{x4v-dACp|Bilzkpc}Nn6G<ADeYH;$NN3Ja
z5L$}RIqe^`t~?y@Qof=S*xV0qJLws~HhcEOkrNH}OV)FI9wz#}p${gyBg4{S^Z3jc
zaBYl`7fb*AzW8t;6LY$u6b5lyc-`DK0j==n)o+mNO%wNi^W|wjXlNJdI{p6<_7+fC
zuG#yrB8_xODBTDM($WpmAR-|k1}WW$bci(4AzdONNK1pHQc{uv(jX<G@a;#>Idf*#
zZ+-u@&aB0mnHSzC?)%>R+E)O-a5Xe(!3<`S%P}fk*O0sx|L>Ryw0E6IT^o^txPS&(
z%7eFy^Z$K^&lEDF*lMjyTX5S=!XgSBl)9KLwXb~*nJePXz*X|p9#M-;cN)HKA))){
z<jnkrMPe37H*5Uj<b)+-d(7)csf@?*hv6!Ejak$l_MQ1ifm(D@_zR;l=j<=iCwXO4
z$C7JV({W*oRM9TbeNMc}PG>{1n*EcOpU5oH1W6o2e3D2y5C#Y0L9r{G^&HtDiD)d!
z;joC+U4}ZUo1puP$Z@AVh9I4m7F~f7QAnM2%SRw!JhH7_dcm0BdA}8vyQw*t7jajB
zk`$Zt_`BfZ478`|7BGohIH|{8Xa)xUyOPLDZ;|jbs5TkbD-pHg252AkBP4_jHbJq3
zyR==ii08QfQQ@Isq@o3|{qlfuP&g^(bLz$YIc>->w>7VoUPBO-ai=il<t+ud-?hY_
zHxdQaZ_mV+jrzP#_UMe=bLC>=n)+hq-z(l%i>TW<@@1S&$qP@kcY?7YLE|*sZ)*zg
zj2ETBtxAGl_mP+mim<R|B;G^-H2gZB#si1*oO{+E_&x)?!wRv1310(!^;)PBl0O`t
zF}3=JAT~;X?ULzg?N46lSojdrgBo<-5pZ-Mn9QB1eCWv~;$jF&;brh1AN+1V^j<Kw
z9gf`I6i(afi3bnfGbP-7PU1=<%49HU&)7(2G2rJL1(<~tg+$vbMnVeWNmreQ>8-fF
zWJczCZSl%-kW8ZxA6?KLt5;PGMUOs#i1+5W-x$K^2a;B8(3Doh$b*(S3i^Yj-`gIx
z5_B=rLpnzPe&a@^pZcpP34AEZm584i$lSp=qOnoF^&_Ov8~J}lg`b<vmBJ8~xU5&|
zvtqScFMM;K(y0Y!(LA4!>1x7YkQHO9>Y-Or98@|2P@?Q3Wql6w4s35!s4O}8-#**f
zy_GVT!e*kjBv6Yw#SBh?<KdGRq%Irx6vOrnXWKCIff$ax6p|haq*2}?L^9kDuE}Ns
z6Xvlc2HzuubJi^Z0U28{T>0T%mbvK#1<6z*;*>jaIn|ngY=YQCOAfku#|NQIFS=_6
zn08v|f}9MHq-hAjJfb#7&gq8TYt<o<e%IW;nXB`}F(Q(0@Z5e{lBMY!g@QxQr1yif
zJ`1d=dl1_;UMlM{shDNA`$%QQ{aG*Osnv($_f3d3_>PtZS~iHV9)RkPLPF+^?3zFz
zQlXH<z`+ohQ&{)chWMwf4Sm|^zWi3qvhr2(554y`_p`N9V&iW}Tc#i%(GmAmCb+P~
zZmz5>$hxNF@w^9V8zjLP#B1LR{`V2I^9d<<r2^_Vb_3{)E=gZDP+Sn*N63aiEc1Mf
zo}a^pZg1W4i{%mo(A(#AihzQk501PHO)^s~6t;w@Zl;~TH%4snyqwmGW29}H+aM+f
ztNLQS0j7~-IPM$YSeWht3+U2NE*+~1{l3&lfDGQ2s3zfZ9LXp0Pb=YsW^a)Oad8m{
zYqK|ge9fki@B}{LO|TkptbJbkyYA9Jj@0@8oTsqCJw$=?+UKGf@hQVln3`eAGbxq}
z<Wo6RF-pCpo<A1^xEE11OP{b-DV8g!9z6S3!}PCLX#~v|5%bbM!9w4<ia2Q>Wy49I
z0^jz=vkp*-FMB`ef|kc3LTDl6wu{h+)5iF<>}jCO-3!8}N#Cs$-KuGGoGNEfCRiZ)
zUb7ZqWA2AyU{e*3Br)<SGm{qA45b{CB7Dm0KsBvKtD!;=IF*%-clJ{30z;*Nxv`E|
zuk8})V=zPgj=D(KppfgcjM<!#^#$0Zo%W|DJf{#zIxHxn6qAco{%U9bdIy^2n#qs9
zFXX?GX4m|8`46|LI|E*qV?G^RSY^7BbBK=%_|Eh_T19jfQA4Z1@5d(fuc=e0%Y$g)
zjzceCB&R>^P}^=d>xS-P&gcv}-V`uFxj*U*+M*<o*aXdF$Dz;+DT_f4DTU3;enhkl
z{5s7?RNJ*HSB4?3?8BhX_jwGYi|uf@NJe{q5-Krka0@LOUss%RhW#fIyr+1xISaoa
z<wtmi{kIZu^-TX@1$p-6&WPo4lmkcK)oy2et$1KqJ}GxD{{7f;=%D4Nyu;4%pX&49
zTg*Q{k0Kk;8ns~69RPip&QJNG(_s`=_?3fk0;QH<huro|V?UHCi(&j;h`$I@3<?(C
zu*fICLNYV;Q8MA>o}D`#hCSoMNe!wvYzlmU^Y|=OGhKCqMo{!>sr?ATfxE?@BSr-_
zRm1VDx4#1kmlt3HqoD`paSV;*7Z(G>K|&b3k~&;te{b7ED`>*d{)ans|5R&daQxp$
z$c5}-kF#TxR{;MHZNNDcBx3RQZhrBF(iIC}e*boADSTI_h&=38D4(+*%KZ0p|JO0}
z=S{Fgo_3W)=h{om5wY__lCNqO!YHT-KF@6tUDTUoCcmMjYg67q7FrA1gTXv#NA>B`
z3$Q_RVB~+{rVd=`mz!Ooc^|HU=Y1tOSqm3;P+$eq{P+$pOA63?N09*YWe5`uw~>GM
z<X0c`t7*D;30F>K606#YRAk2xGpA&bfqygvir>p62->i<Bc_MM*^hJ6U+&t0mxFt$
zr0Fiqv^>m8Sl`m0OhPv9C@l##`r+u=>CwA;wFQ4YXabV&JC~bXeSaS)|K95U_vg%N
zq_HqqES!NAB7})xMk<XY;h?x3L8Q(U5)4L&bqRf-5Z6iu;|IdXnp9iEPZ89@yu7dC
zeV(h#RRYtUvYoXf1mTq;wb)Bt$H$_i`Y|qEP!K!-dk<UIuoHYP;r&Pv*db*H#aMg#
zL!J>1N8RaVy0qZv(?>{a11xn4Rff-?{2z|FE@)4o6*+MuLS&5<G2&w^l|e%)23joG
zaAP#IDE`$B{L>fw>xTkc*yD7Lz84<A3yp<o=($;io}XxP1KQ#AkoE{D6-J5U0y-|>
z-!Q=w{t|`i+Rs@5E{}uHmtltrZ8!|)8{iRIV8lvj4qit>`E=Cyl(yt3P&JT#n(oyt
zF5m1yWamOm!NPmFYvxd7uz4ixzUbeN*X8Yrs@Q<OMj)p<3}-qB2_JG>@^7g^RE-Gf
z8m5}>i<S{)<&<@$s^LMXge#WUn4zQUV3HzKGPEdxjIc!MEQ+}PUl(~3xya?30$unJ
z8si5TCU9!LN0OsOTs9J&6QI@^wgy9spzDS%8_-2BgMfaZ_C5#~jm;;o25@0&`%zVC
z3Z@9zXRU(klzg<A$ag!20A<8^gF?jVkr`gi6}oDR*`~(Y`5>x3_;kActIhP^0rFL2
zcY5{Dp!eGj4N#0;R!*pT^pPH!B_O&yq>nQn^%W)XegvGvv4@yV>0moiIijV)v}K?p
z1vm8@xP67Iir$JsFCYRWpbpuTK3}X7VgLW#l{C7L>T$6}?~`2S70kH@m9SvUok___
zr*#Z>K^Ee<K%*soT#i6La1A{rpqg?wbqsYV*a3Rs^_*s9VkA#S-y%1jmkW|ooNojt
zn{`2A>rL3q7#?4kMWQGWt1jFu3REQWG_`iKdWy2NBkduWXW*Xsoc=t10D9u^O%yJP
z=?m!Le65$MW3q+<G||uF+vMZB0Jc30X|-9jc#1hsJpt6^v-OoBP?N$&9waLxyt>#*
z+Rp#R8<7*~fBuoYPXs6wZ<pLFBKQ#P`KcC`(B6KCMX-dem&<>B02F;&p*nWvY_cH#
z??6(S(w`OpB$m!F0B>Eo_9z0oyTiJpD!nZ1kh2O>7&YhIMohYqOez09xW;?T>ggLs
z5mfjV4KAOZ^|;ZJRSW}FhGg)?%)yCoS(quBY!iyY!5T5M{1n?dHk(-bUW^WYq(%#Q
zX=p~<E`i7tEMU4>bh1dj4+dtGTH(7t5xYpBWAbN5fEH~c)oDnna81_~xpeS*^Zl!*
z{0)NqbIAP952NbHnL{IUfUNYOW;ob3bB(_(BKE`3FBf6i)_hI{@qBtFs_7so;%^iG
z$|45IXC`ut1jTBJr^y7onRt-GuOpRQ4sO*g`b=ArImDtM=a$jVYoS(PPEC8n;g@fM
zOznudF59;grbl6tvrc<r$D?r4kk#D+Soq&t;r~9G|9Vo-5O9GDwo=RVX0U)-7w#bt
z>O*jaxwY&9FBFi{G$Q(H<d9_0ek^x@s_+t#3tN}9pi^|%5z#*%1NEZ}x5C%TgE0e7
zwyCk(_i*<BnsVn7gHa_PsA~)vn?bcPh|B}((Ld!~tsZY(eWY`ook<2$2_0VFv!k=q
zj0u>^Q6>w4i`(AavxB?w)8<vG|Bn++mKFo$KI!+LC$nb)<VH@Kc>QBc$dibQXhxg>
z!MO)RJ?CYCI7)~)L2m;H)|a7}JVNEm5pqP!q7E?3!DKj~KuIgR4etGjb672~cX9BR
zKz_-K7$CN+>jomfj)2hw$LspLvAy4$+W$Q2fB$P0Qff?6>m2|rkKE34tM6f$$2Ult
z4-qsoJ_u-}_Denl=`B~BUae;#hYTWT%>x=QTCq0i?*;YG<M`jh?cWD{rZ{rY(JYo2
zBXgannC%436kh>!-~+?9-q2APH!kP;O+l^N5NG^n(W1ekRhH5}`cJj&`-7?3tMc~C
zk%|ubVvm2fvB;N?M~VlLinWB_>snF`^|vpac{jlxSRf_6k7W^EzYWm|s=;`dZ^BIV
zh7|r8*qU9>KUt+mR!Vm;QfeZYz&{H`1B3^DsWANzFZOR4e8x8OKCtBKcdXO1%dXSc
zij}bf9A1`K{m(@gCBd&5z2RY5|L1FHV*UK}^F#ZuzqtTMv%l@Yzzir9iC|z^(1!!E
zADg2@W8|dB0r?_|%J5A71^UAuC`x$B-HwIr@-ss>SuvkF&iiHL*=ug}m~#cx*)Ki+
z+|Ykk3`s0}gN~fyy8pUCnIW)&v(X#sUj5HL&qPbRtF}vr5h(t1JCiC(R<dK|wdg;;
z=g$u$5%Ajr?-eHh*L7y*Ais?be3}09!aFcGH}t#Au~nT!Y&zX@?Tkn7<&M3DNjxT)
zfJ1o1%Kyw{*5B~UY3v?DDb9a=HAz%uNw{Pp$+4^d{YNe#e*|53l<hxXxt16c+x`1&
zE<tHcfaoZ@4%WAC9g-~>@qYOdKcqB=A=dZHFQHc-*p66~^#13*{HtI3&oxTMBU`>Q
zhFU+_-!)7D*6)t2;R%UBw|@a6D`(-YUA33!Vu@oV(+f0MKG^(4MGF79Y~(Bd-?dSJ
zBuaI2@be5TVrCr^l~G3I;r{stY9y(L>EMGtE4O}Aqt8D|A>!Jq#uT39=^1z6UZlh=
zqF*^weDVK$r9YoiL`KwKN_4dT>yNXC!2=@Yr?w3L`#h6;PQWUcd=<-G`K+ua)hF!g
zPpmxkSd-%Yd!?d(-4n@LtoDBXpK&?4*7@7$N&D$UsVq#n4)hw7P}jr-Rc_J$=a$2I
zKx2VZq$RpY`tM8d6=9mopJIO_8X0gS{!B9d&Uc27>Pr&DmM{Oj^_%dR>Q-M}ZJ$4x
zBySz3r>?h74aq;WNWT3^DM67{`4OY0rq~VswR@%Th~LMjmi~LZOW@0k3^n^MWKR9l
z@9JFjHN)}vGNx4(DKFFZt6$_ypKW<LZOBviY%lA|pXaG_<$OX&e&Xy(yh3L69TMgr
zSf|tExJtA-2KQdin8R~cQ2*}%iVhoWF-1;uBi8?1R|NrUP(hdL&*;p2PMJI3QYLLB
zI;I)T@2w%0nm-Cui(>GrZ2ydGHJDI0MvAW2C(aJ)`Z29Ms9UBU?HkTex8RE3V|a<f
z5heK^O<Z*IjdmN&E#s%Z65_ySlli|YkpE-4&B}M)r!CkAr~De>&9}My{k-_LXrG&O
z*qXk4)0Xlt&HHnuadDEhX>NY2{Qj4@a}2_*N+}a4!>dWmBGxHyislyFWxmNuJc?IO
z^TbH=6G`^nmTDDQqoKQ|pG%sN6~&{W3$xRIb~%`k)5GH}R9qMT=f514Klk>^tP#tn
z>fSUKJ%_r1S-X2Oy|bC@`|`RPJK5T+9?O50s-^GK%}(vF+iI6;+eNspxm6|Upz0s{
zCC?3FzvmU&p7SBs`f4uGmAjUp@*(F~_v%mG!hUf$2vQb{jQ@QvM$Vr%n3G9B9EL=`
z*4F)NDE<-Ek843m{}@k9`E7h$Voxre#>T#J^&CcYO8U}uhMbB&UpkjwgPkt&?DM5V
z?@P>>uLbPYC+#cc_yWmLa-GkVrcTSgJ(0NdNg6BpOXlr@kD6ltccsrtha5VX25+GN
z{A;OFqMT+ui3#+c%~fD~TlB!qs_13Qb5@ZTUy}OKp0-=nY7F!L`3A>CtbtL%0_a9m
zv0(;A3+Z>r4;e2Hnqfb+#>kbfJChZ5Q%mG|`SRpggj0Dxc3(-W)(jbX-Zs~LjX847
z7a-dt1gt}bj$Xj~%X>WAgt$*R-();0Jp(Ls8Bl#*tCI2{*R>)zerxseIOH;KqLOhL
zJ_3Z*(zan&emF;om=DGsm4NZTDCDnug*ie77i4L5W0dNjS9}_)Ec5Z>9mc}v9Zx2<
zS?F?k;of0?#2;+nwjAEOTNFL&jXc{-7pyqt?nRtibV)khi&<Ru>Q4sE63?;Ted$nI
z_^#%0qLsSV$6HN%+@Xbgo_S8rB1mDgQ-(q6xSCu#1h*t!eh!K=KJdZva|dM>U-B3T
zuK1t@Q?|0zee)2itm}(ikw=qt10w5*2(|1DI5n_Omdb}cRW||H5XlRpko@hU2$SnX
z=-Ul5;fsAkh!a<KoC^Qh?na3f7f0^7zUVmh5(p}~k}jg5i~94q_G!QCUf-xa){~+&
zdyRX@iIZ7^s?4;!Yvdkw?XH1!c?absX40o|q#p5~A>mVbZ!hf~YKMf>is}>Nn=Q9W
zTWM;4>P*NO-0zQX&hJ6L*-z@L%)|r0=yp3*1fPpFVEBmcD}Hi6%;pmqKKFwgYT`$6
zSwcXlO2!3>P3h<8dBB3s5nMzccsuo7>q>-dc${Pa!mc2@-i36of6%T1Wt8I(kFR9{
z_C*fkqmQ0RmZqpGNumqpeA$W(NbEsYz2qhlYkXo?M)vMwwxPTqC^QtBai<f`p+1k~
znowxpu{1rZO>|5&4E#i-lW^)@JSwU@r(MiY{3KKo3kxzoK00kpPq=r1GHFodV-<{L
z;>zTz?2D1fHPqx+v-y(k#yZEx47k&u{IIBmIhs(t7zBGcj++LLQV|{Kp<>H7J4-LY
zyo>AhucF_e&YvBiN45E`<dM4l%FvawNiFM3OGDyD3@CxLbNTt1rP4MrntE&kJ=h<8
zM*q~JQ4;4RmMqGt7yYhsuKeuFxNB*fc*dvQO&Laax(kKKGpaa=w`}tD{_iC04^%WR
zD=20tz%o%742D}^jEW>b4L5mvw!Le)4RvTg$Lj{O?|RWx=Rf6b0$A`Ez?x`-z&d5r
z9$6E>0+;7toPVz~{Jmuhuzl8Zcfj&WH|b9PRG6eQa>mqkslWP*c6FW*lVvV>C1<Il
z)qN+&dU9RY7_}%#vNXRp`v+&aI==+rpOVg;2(xZ4hg3^D{p`E3B0Kxl{7i*=cYmQ`
zryUlLw`RQA<2h^0y~MU!<g-6|;h!<61M^dv=JOcTCP=1sxX5^!57=P!i>#N#I)1ME
z;N}+sb^<~d<bYmw<C<(^JJNI7g)H1<7I5$@nu$IG($Yn?WoQv!jD`YCGV$}s=j<(l
zS{}G)TqXDAMK+R~bIu0;Eb=UZ9reb%&*tB#7P-Su&hF+{JmbAt^aceBF@||C)8LNS
zRz3hQ(6zcO%A(O}tpSAwg&$aElBi~wIi%FzN00Y-$Ds5*b>Hr?cdVLAy}5TyEV+YO
zLvdHVJV2tw{Wg(i>X19>NnzpE4;8MR-91V_)xrlwC-a)?a&w5k0+QT>;;YgH?INqB
zJCTVdA{*~6$W5pAA~--+&ocOi>|JtN8+q^Pi)m2ML;RMDo6r{$bCL)d<jp_Z?0@~Z
zoav9puF2s<MObhdh0AvkxX=ym)&D}(ihL!T*lNBEY2#*>2$048J&AJMCMzzf2m>0E
z;=<9obk4yLy3REPJ!hnzklaFbu>We0<#^juIi^AdPpmdFzLfKduXL!7-@=1CDch4)
zlr3oiqaB{&MO6==KydytVoS)%=Zk<kqNLY>O%(oO?!z^^xn|zCr9U3y&mwSeF+&Z*
zFCaFI&bFY6eFvbR5B`&`=>WDp4<K(%#jYw$hus&b@E%*nh{9K%SU=ng=Bzh0Fbxa6
zHR@qBj5J6}*)OaAsZMj~qMRzbb@OG#EEJ2aUl9vIJKYb!S=2|>nc5V7s~2_a76K`)
zPXz5PyIqGloZFmz|7?m1w6CblKBRS>O{K|u^&cp+<pjT`#3GUYey{%57rrwkhPqol
zBbHtZ*i&$Q*uC0Sp^~2oF7aE&Aue+AKBB;to$=aJML?br++Z_jCjem{_~Q!L-HNR2
z2fq!Y5(B-PjgHHG*Wf&om-ifi<Xkv+o`zXuQ^rvFB7TjjKta+ckAyJ+VlY*!TU0EY
zsfOVH`4?W>{#DjN@vT_DO0tos7O1zHSv(=fQU;cT$&g>eyeJNs-8aMNz2|RM8U2oi
zm$jSrog0y>YbRD@=zLjScC0fboGd%GdoPP<RF!r%E-#SaS5A_TMU(F5L4(p)3C&2|
zqK3Mb*bI2UW~lnczNT`&ZcXq4d>uZ-yrZ%eiT4BOq{2?H_(L+%_YT7>H+&2Z-G0P_
zSNN^lLVIX>8%f~+SL41`=tl?Uq}f0uISCH2Ysk`&mjfeL0hl02KZ#$rBrM>A>Um`1
zX>t8ug_K4vs-fcxMx*Eg7A&Ut!iOk-4?LxJU}H*#^-weS$YEk%>rSX2zCSw&jDs9e
zlsbe=!hmJ{N@1^D#Aj~AJ5rL%k(S1&DJvsqOPuIrWU!6hSN&Ibl?OWt8p96aGG+7j
zvp{weLbU_ijAZbYAl(o@FeFst2l`tYaSIsUZZ7G(KKKp}kPSL&Er`!x-+1{w$E_cL
zg<1f`*KqrXwz}Bp9Ah>szArKjqK0isUvOo+7bsFj>GG?k^csP1Xce|fX~+K6yk)9k
zGJd{ZVL!6YOoTyaCu<Dkfb}m}nu0h|bH9fPkKcLU((NGnJ}#(SKfzS4PA6~V3=eRG
z!8fN+b#Yl!Y2~5)!N}sJf<gMI?HjCa-gav3gRyx>G2;q@bg0TOmVY^gKrG#RWl%Xu
zF~9+$aUa#=Xv-$(B1O!{HHa+e2Gpyn$3v^O9}y1s12Vs5JqNrTvlsdgl6q{seD4VS
zH4L~v@?!g_6Y%=`kUO3lBqF?84TiU51sz4qxV|9W^-wdZd^k8zQv#e>M7?ZLq@_<-
zV1+!;gPr2NzaMBc>>cZcE2@l!9`5+x0*N3qI(!093w8(V(SjZZJJ{7z?nMYQ7n|lH
z7p|6~<+sjV_cg0M<xadHie)tN29rcezHAp#8ge@>eCq<J%`a81nW&;=5!$F*KdEIG
zM0a*78T~SH_PB&kMrQn3eAPFf7kV(F$T95zF&i;Fd@$|rqySiQx>^!xlot%z`@g)8
zP2-QDe|5zSR%-e=#!H}NDM`?40$(?DmTbh4+Us~J0W=)K=<~xsQF{nx{mWw6a0c+h
zVm0kNsBTuk$zSr0kq^IQ3Py18qahwjGJ3{8>O5Z&z}a$3UhCIi-K*~!OVq9L*vNv1
zaK0is3xvUYFddMunzDR;8oZ49Ve_{bgK2xwc@c=$bA;MY@C|vF@el<!xB}}AWAMN6
zLbYS-d^L_|=ol%oR`88~!)JRVgZD@<vz3zZVMYahJSB7|8g!`GOr1(v_(l)JgFa{>
zPyYr=ZP=5czDUMHwwdp`X8nM{FBBB0(rdJ!g0IcBN=Oa3|7Hh9evcJBby`$k7v;Tz
zv0Zvp2cFig@(_J|-$&Nj&jlP@NM8;DQJUo0JdiPX0DsYwsYM8%sosYZ6ld?C@Gd_n
zkhI5~TWI6!UvTCbzf=Pv|7VWcqwFaB^37Gzc^yfVROZfqqX6uwl9Z<ETv_SJ2{G-l
z{%WQLZ-#E>$%&1O<!phg^b8+NItYII9m0gTJ_`#Ea?e!gE{DSe4M}Lwb*dk&>7hkp
z1AqyYSkj5ipYnXpPUAmzpV`P!f~|lzyXDuBOMurC5rn9;EQxNkhuSZ!z2xZY`ZA*J
zCpCIime%hnckAM;5Z#-*s#-1TMDU^dqLbaYzlt2+7-AtH-XRv|yj!C8dq|M*o|w0}
zFCNz*ti$>)i+02rj=6pyYZItemNPzQ-rFS0uq(v7nGvIZxqJ3QuTx3DbAJDCOjs#>
zbcl^?P2UM09i<eAKVx7IQgf&7rAwIP1ag!K?AMk5RilrH`R25y8_J4_Pa|AfYQQ;{
zVT<TVIs)RU4S{G0(1wLSY4+iOp^u2;a?-;G(|&JMxZVz%Rm(W*vG|r<Fhk_&CB8al
zjHEw4xa7tav=i#2K(T_YL~AP^|AjzLOA~)Hf*6yu;xRH_rWN)zG<*^h;uI6+%~+}?
zek)!>3nfGO5?YlMYEi(s!j~@~RIfza0olw|u2$*9?`Mf5Q!xL-11QQSzV=GC8)2a(
zsf!=Cmpw{pk7lvy!nk-Ts$2uKBpQybSS;6dWITkmeA#*wp1$!ZepP*E?0Y1j(1<P2
zl2zgsCyK5#AU5n2to+)4K{l2llGE}eM*RvSi`hut&II@t;vAN`fv@HLoYtZ^x?13t
zs={!RH#zpPfdqtMf<?LH9%vBU)`fEHvFoO8NzYnP{bFpKgxBn$-SpC3IR})Q`D(}$
za6gZ+FQlRUKKj;InY(X}b>1@+RYQxNLr1`$q0Ae(@MA8u*Lj=qAjhJzCrna|&<z~j
z`@n5`!@1V;XlusH8%i<`HorGq5=}hG@;_`PTF_aNJe3TMhUf;49}s>260SxbCi#mH
ziX4g52ue&Jl0b&1OKyoKYlzVmVlFsudn+$Fk=pdaokyC3?8PMrHHEkm23W~3%o(UO
ze+&hy3^$>|W%n=96pcYgL@Jz`%tWL1Q;0$k<Qx5tdS<)vXTb5ZJ>8A;NyHTxZ$|Qx
zgiUWw*29k0mkMF=6dg)vLhz>L=oX}6j$HLkbD3o8-Q;|~T3Uye7x52Z^yMb4+IZ}B
zXW1C^o9Q<uJiA!;MH#X%WSLaBYRzLF#O3t;eHMg!?pB(3tX{{RC+K*PV)Nk7VDYy|
zs|`;yr?BYJefv$D*(PuKUk!urpn*bHpp@f9f!88u0_Oi|f%bt5AGG-c-nlP5mhGz2
zr&6~M*aud(BolE6@u95n2IH|Oo2EKbWYgp=Ra7LrcMtL*`L7=%;<~nAd~Pf9A<QQW
zL+%}+-q&Z@*EPCk8gfu*PM$przSkOnRsLKDun6BCT1jlCPxOfZ#$L{V+G8RrtDBT_
z1|fn4HkwjciY~=zH^_~80Xz4Q2gqc+Flen>9WT#DW^VMw)S@odFE}n*w6v-<EKN0O
z84VUTB4o+RR~Zl%dkjUy4Ssj<*=%e=%tev(z;Z9Y&0nyUHi{t3&Ar*PU$f;#$1t1%
zD`Xq9-dbz~q!_41nB~YQUU;<dnQW#pvhcYXUOo96XLQG86uWHP?Svm9AygJEJKa`~
zxBjf*Xoss^DgZ`V{NL`(Jip@c^<|jw*Ljg0J0Lt!X}M19LdB8<_8W58af<8D_o6SC
z@Yy}O_nd!>b+C=3h9f2Nw35{N;=N-)MfBp%6>pT$Q}UR#hkVen&B<M#edog@I6EQm
z=|uH53pN{jiHL?JLNOmD>{b}pR*rTa_0{#4%fv)Q;-xD4czex98W0Eq02tDF+giQV
zXHEqz>K&x~kH5`M_-%<R&&P|NIT*9~std2K&meV>VPW?73M=USNe`KgVhMTY&D#Bv
ze=Vh#=6xNi$wg+L1JbGO2C#|axYwty`=QYjv@>!xlmeY$MoCodb4OaxjlWpGj&(-}
zHNATFiDxCua=K=^n$BkpL5nB6*u{C}@tcqG!c|NO&z1)>wsZ9t_Ze4PE^PzZ+Ra9?
zWCW-xI?~iy`pz)<)6`bnbpgJrCtJCpZ^$-epY!@-;0uJjA8H3jQN+OO#d1t|1?wDa
z)Q902pnL5@f;NOd72+TRU3hBv*(KtCKebh?b{j_0btfSVW1$Aq`oVEjg5qi5WCywS
z0t|CwaS?XUjuTGh=@0qpG<=UfWDfZ|PZK5>8_;%m@^j@kSUD#8ohd%<pU0D69WK%5
zizLqv{S|j!XxSF}_kpHe$C+m;X~xaA7crg)eP0ZNT%!S%tE4aL8tBnqNmI@{_<T#b
z)xSM#eSJ4Qmf5N22JY2(fJ#`Id<_G|mI(9{VxDa=p~T-jr>Dd#H(rjoTy0P-ZK)Hz
zjF{~#(=m29U^`-Ee)kTfl=_7^a0Xo)KqFIo8l9J)=v#nJhl+E9qohpQ(o{j*<^%vo
zT!9&(r0)=y_t5QDSI2F;)+%#EDq-mWxf6Xbf6GwU!*<vN0n^3&c7(b4;YT7ZSayjc
zbakh?I)n-LEr7XJRy;xzT9>>D&}Q-4s`yj`fygj9W{r|8$^BYASlF4p&b|rT$h}_s
z3PSSi>n+mXK}5sXF2s2bsFVGO(YT_M@|d&4^pi__hKs<By@t34b}k|2;!@3sYzPx@
z_{em;f4Y2W!h(S3j0rolqwk>b_XrlxLQ$a6I*ZxrB@MZW81_of_ZFsupZ-S`cP}Uy
zmz^4YG<oE#QWe8=#u`rI`7XT!!AF3AuMCTwrYte-bAxfpw(Ll8O9=IvEs{%o)*U>c
z{8hV3rZni8WIrJ$@nL&tPA^kMY3+H9U|Cvx6xaeX5SQXg%YvK>-|auEBoA<qmq#ue
z*zZ3(tA#Pt!xGUDNmase%i>pvh&laZJ~D|-g~V{Q?G-R0?ME7r&PbXJd*$a;bXvDT
zB!f_qt&pB%7><S1E%hGe$w(?(9G?+{q;q|wqxBM7xuUl@hWGL$V~a&S=-;|%&E|ZF
zL)`5C$hg!#neMG9JGbX70C`^3p*|}P$tGkG0d3IDq966Z>VX!38c?K@p!Gydx>KyY
zC#?%n6+fw4E-gxi6(Y}YUZgV(?Sr9zpplv&`XapZ`<*WncR>T@?(J_n^mGF7m@Dk`
zi%EU?pKrL<(Csz^rfuDvyhZlluKQAaL+UF#{8qG{>iO5^x?Wo1pj7HWcqP|Xi){Z!
zmGM#RsgyJ@u@%GQm_kL_wgVcv5R1If<Q7q}ZL^TehMHj)Xc~sqQUv-%-?aPG%+cGK
z+Aw|XjxCU(3ncKETVB98sCYog{AK=}^0l<k*!@5qOCVlYj!_bSs<3!E97Uhp487Cx
zFYy|`m0s~V9h)1&IqMZCdC&7LscfqBP%VQcxmvH3n(~QrcBrMY7Px#Reaiv_$=&wP
zHI;~#Wf^iD3lpueIyP%9)m3OY)2#U)@`XCC?5@nYP`)l$zdoycVoqAx@Yi%OlMAyZ
zRj=D!Jvm#tcB1TDmoy0%gVG{A^Fu9}+)$*$Dva{;Lm85frVnlA$5bFfl{fX3?1-P5
z;eH1M7DvFPy09}LID}~MNs7t*pfeF6%S*V=;;Q60T8F@2J5mcU0s@!&sKSq20EzfV
zd@~n;b#WMQ1LOg(SjH;u!5coHNE-jTx$FL`OZ-8Q3z1X{=XIl!L>6P}liQM3fP0yd
zao!nvXNZ!J4X$qYjedX@BN23JOB>aTdNsf|@{G8R8MWZo6gzX#R|k~xV$qkYw|@3I
zQjm0ZjFKJ-qX>AOr+fm3t8=ODnh0TmJ4&hCW1EnXpJm1Ve>I6wg1(HlV$(S!tyi}E
z%QF1AO#R4*&c{Z8#02=X`D+r}={er5p<{S>Gdp<3C2_hnxekw5-Rzij7l}#o@U1m!
z@odX(`JW4LQf--=ySVMJgh%b+&hqdxV<PjNy$N#$|EV`()u-__GJ9Lb{e)hhcG2#h
zGP@-|iC^2koh<NM4Yx_C$g!csOJ1Dw?k<21dxuZIN7fQM3gTrfaNWH+D|k5RXs4cc
zn67xhN`q4BjlH&b^i-y2^h#}r_d;o5&O5zPadD-|yIQ`HRu||qbU2hm3V7l20-1Ph
zr-d8=$u!sf0kuF1mr>mpZ$2_w#J^MnXS0kdv!)3OzKeJ=N*AI7u8v4H2Urmh#`jLY
z!$1jz1t|3R7)W*=elHN2kY=o1XG>#eynL8kA2c5VFqPp9Q0YT@0;CA@(|IN{1Z{#P
zSnS3XG(q-^2Ysdqt&nnk=K9@8PojHd;XG58YLO1?BQ9|*7MNT}32seuNu}vzL&S{x
z=PU|mjYHkAxc<aDAD_X`U8K<IB}{-ru_~l{HAE7=+qJGWEQopIKYN73RYTi*%=*5{
zq8IX#rw!cj(fw#RzmUQuBXqW}@BeCkVLC5E8)|@m5bxppBCZ9R5_^|GSJRo$JuEzH
zYoMnOxK1GqH<ezauXE`Qg3CX|zgpsBlpt+Bq76){HgPn?`I`%1Gv$GC->%6BfD?w4
z!Si__;^?MzfykF6zN>C8Pco{-)nvh?p=)NI5bxe->$1`vbc^2lv70|dghp<K_zPJ~
z>2!ZPo#i;$)|^I<EyfiwHSUhqM0o20=Rf^q?{j9c<+_DsD+Yi_h3nU(Uk1~&cVPu7
zev(>dDVgQXC^9b}MB;YFyS_14K2}h7_a+`Uk5z4unRTJHyLZJ{CEDEh{$O_2X@tdq
zZgRd1O2&}gn2~Je15DaG_QFfF9*N@Kb^0KbcHH8SPP*(ePWSP>KmdNJTm@vgP^cX;
z;i|2otqVht%td;CXeuYQGgz=bC5SwGkb~Ri&1gZJLwo;H{+L}U7W!%%fnGdsVz=ws
z2;1R{8~8OKTwcvf@5LoPg&JFMzYfABlK_}vh`#)tEkE_Q-9&gDB19Z)WKNj7Uf}&o
zMbgiJ5c{-j9MG@^`>;E*_ffjp&_6k1^P!3~e5aj3Vt@O964a=j;1cx%`+N#bjdXPS
zfHTTIXcG4f^9^@lw@F6qA`qL;Ya;2Us2aHr5v^BdUDmN2uPIMsk&^gQB9rjv_+@o}
zPf($@dz<uSHb|8Tm65aW1-skBJ5Z>mZmTtLT$^Q+4X7<%xYbjy`0~P0y4NsWC0F@p
zmnrvkoDtn3*GlQi?ziGIDhU7EV;v8K3$Kr6dLNj*OwDn$p|O$(iMgFmyE#!*Txxln
zW)FY*%bmwV>l@FN&qRiXqM4h<7_4P_&P10TPCn*ozmgzb%fO6#AsJ^CnXw#1)=_(~
z^Y;DOiO%E|^BG(z9<@{Ip?-b>>pj;?<+izK!Nr>gM|TL?Z&9vTKF0Sngqq&jB|B=H
zIli0i)i?{6cY@Xy+2c5-9kwe5qns%GD(61>_2QzgP*K}CgtFS>yA0QOlofPj(VWrv
z;>TWNzTYypf=SWZ^mMb@e&Om@HGczKv-s=z7mR*EfYvirJEF&L?Gg1G(W4LCTGI1P
zesaz{J#;%QgsV`(M^dYiQBYtE-9YnqGQ)HvcvyqVwc`yafEoEBk}IPgM-&d_jfi$4
zll#{4K$;aIR0z+?nQL83le{70A+*u5h?3%ns#rSrg^uN%;o&#azJe|_rOrV+9y7XO
z{Dd~*41^i|0QlMV{Q^5O4f`Y7>P#X^J=-MjK8*}I8uTSf_&gb7Q#HmO4v7hy5FfpY
z60ZoYyD~Lt)$f@IAPwxnl3J$tSGlUJBGp4lWVkRc<H>!gZ23ZoHZ$yg_pN{6&w`^g
z!5OzU(Op$#Y(q_?gLPZ2OM1o+8gb#dV@xlC%l6|814;P~>2DUQ19U!uN_n6T%Pbi%
zp(z87)@x6F>{S;uE@tdLz9uUf;$ios)xD=``Qu9&4DLoj>6EgSY=U+I(OSdo?fPwT
z8Ou-)tWUGGbMmz+jI1x!8*f}*%f}=M1|%wSyL-`6NjPBCR93OJ!!>yQoaLRc5*EkA
zc<&d=*q@FNlgLyz?)O_b7+&1<8@z2&s|crb3da<`m<Z*Fsr?R|w(m?|2YFv#9;<yI
zUl1T3UxaHKj8Y?U;?i(oM5Z0g>KqFTDwaIDN)E2YH0O^g-2O%EOHbei3{)BQy++{9
zp`X-UZ^=7Fr?Q@NVx&{O1%+=KTUz3cG-%=V-_`-lAr>tkQ+C;_HPyG^S*R!;8_c{s
z<VO!JBoc5!eOf?|I*;We<O~(&Y!dilxZwok;DGt|^A)Z|hyIq`!2SEVZ}cK66!Rr8
zxi8U(dg~D2-FrqAvDqq<#8{Z1h%2Z3+T_bU|L%aaYj~Z&G({p~=kz0nSd7JEQ11aK
znIS~J?uLGX=t*(6<S@ewal9HZe+#Rg`Bz^S+fu;%=qHr{qV`4oSNT|-7@EYJ2in7d
zMuQ8i+-waONgSgmjJ@KUS^7hYti=!((G>+F@4qNJzHTU^V4dwbFCyFSDX^^=_yl8_
zq|iGv*i}@Ep+11N!RMQr%H(31$*=iT$AR|_HuJxTO_gu=Sf!d0V($t+TWAsbn(2Sh
zBAfI~=BbW3f=pv!3@GGi%@N=QH{AB5Gcw?6Ve^?ub7zi0rJ}Kpt2m5fzHK#k?ounC
zR{Z>}FLylc8*%Jt#!@!?Z(D5LeSW`9;Kk2=mN8zVlntUEI*X5r`{^}|QA}5ByfE5@
z-li)nw%jNFa)N8GKH|kfzE-k1Jd4;<JGi&J22K1g7_FvJa4gtfsm@1|i<7_@lxOQ(
zmXA0L!Ait<@jl{T7J2jasPvF3YW~8l(&q1(TC#?@Us828WKs<A!hWV6m1)(rzlc#6
z-eQ72x?nw>UgH8P!RMN3>Dx|o<_47c_|w{m0ADW9NjK*e_On;jlk6(iGWzX!A*R-J
z!Bz`(2!)vvm)c0kyQ3YKzuH?GCpzHWrI4vG4mn*zAA-cpQ*b{eg^^BH!nZ;l!dx9S
zhODf{(l|ZNig|frQqCB9dTjKb?_lk;5^(;+TJ$-R8M5;H>RGOHSovq?M<j)nWlJs{
zNR(RDWl{Td29z!~wUQuVak^7%F7!IJ)CC9BVlNc_Gw7`;Q|Ke*G>!Wdxo>^#aG89}
zUBpLsv(7D9{ml6lrVa<fD*8P%{8Kfs;`zR5P1nj_tngH|WH|fG`h5cUr(&*C(!m^;
zT?f;}`oT&f-kD=adJ_T*LQ|RxAI)(1OJBq@Lo8NRX8S48wi;5eJoJCB{`LU+RZo-%
z6CKAerWt#{pv+(U?8jc$I<EVQc%x>fL?~Q#j)NxEK&y7W58RQEG~NN{f`VYwa!R(Q
zMXmU<(f|&Eb#*SyOM`eB_TM`rk>L>s|BZEAYPI4|CKq~^#uqcLsIS#&zWzD#7?3AB
z6fovlST>1YGiyml-NT!nUGUq!erWZbtCh_M#EuVONEUSPqpbNn4a-kuOuc1PY83z0
zyG@bD0}7nIsSy{d%%7DVMH@OQ|LC3+#%S(MyTxz4ebHAv(Jhy#vw&!_+1Ih%qSRz2
zLSL#;p6De${gs((@$6MK>Pkvz;-v|~O*aq;{HVelY)gU&tmXKvGwZ?@r;>GuQT|G%
zuPFX%Qmfza&2gS!x#<JLU=gbI<r>M&@Vy8cfVfH!COI$Hp1MLy2a~OteW2Wn0z2VK
z)ZTll1AeB3X0X5?Kp1jw-R0Hfg=yoBXWtFp!N<&Vue4p^wXDE#6*YPxwyM_KcFHA9
z-BvlL+}nY|>Ahsmw7}jj+G3&^Wsif-eU+<=UQaZe8pdaQ|D}B6i2EggBJgC}jz0{?
z_>0{K=?^b-SjqKR%<>Q~UXnJYH82YZtJleRCixlJLu`YMFaPZWa}mWpMOmO!dBU9D
znQz;mWlk(qHCwvzA?o__Q;gMMf2kfWPr?k>5@=y=9B2nIaGOQU`7+=o?{HDP;Ober
zZV++(vuRrHubL0xl>9vnu;nhpBxq;GU3lNL3#?lkwEJ6JAEN3s2(B!mK1;zps2eiL
zzmjv-_*9%$R+*@Q!;!W(o|*IgLCwzOx?;=vT*nWTK{Z3v;TMaUHjj&1lk3*R7KU6b
zh6`sh2oCO|#t>speZZ^p3(TD_l6`xne#M^aH146XjPRJRjo_8BU-p*~5s1v^!;N3@
zU-oefhN^;s9$A1-$L?f=8$;hU0`!etM7V97{rwcT*Sux=MMXzqFEf*0bMk&gX+i#V
zZ6$ZKUb)Tfod_b9$$_QxQ8!hsa{MLhw_3ui%MuF!Q4ebHhro@Le0lkSU`XN;6u05d
z-|U6Yo67wHEI{s#ATyz+omK?en=-q!n9AOT7iQ`olFA%VxG$A8uL<{CG|wFf;k69K
z*MH^?{^1F;G$zlPR#fpf@zaMEZSl4^Z&^5d+x$8NMDLSfyD`t0`nFz~zl6f4Wjw*f
zr=*VOfX}=zUPWAUP{u<M&i~19xU&DypZr`&8xCQ^^_?seoCzVR9;G3Zof9*O*yA_8
zat6i?1Zy2=7YTmMwa0hX_UdwP=|Yr;N~Z!RJ&nt_seS*46?%=gsIAt=Oj)m|6r}%p
zIbhPPng8A>MKzv&V>dXo4Tstc6MfHT=0X^ub<bP`)x}gpJA(s;zFI*GPl97R4F(JB
z`^z?m5A=z;rWiNpb;Pz^sPhmvDrs%(xn*UKM?;QC!i!A!A|=DtuO5!qZd7Jcwm*Fm
zPHx6A5c`o(-**r0YGLjpB!5Ud6FBR!es%jwj(R2jJfrDC!u$tpznmJAv>@WM8lGA|
zS6A_sb{`!};xTr`xA!rus4?qe4s$^vXI$sn8ehjr^J!6tZ1m85B=5$$w|$15fzPNh
zY%{IIn8Y}fDWivJ95zj)@<H1=%hdL}kmc*f<)VoQFWpdRWm`%m+k5VK7y7%ColxWq
znAu3I9=$3~!*m@#qa(Pn*b{QQleDIpC!bxfMs~lYn&+L8h~8B_njYubvWu7BtU!rT
zlo;o@v~X`$F!V#Pm7$Ly)BbyJ5~iK^X=$Y}e&^}D0YkDGS@<oHv_bYuSVz6F7-{e0
zX)yg2ACtysQ143F5wcm+B7y6V?O8d&PzG&~or}{2a?ac6<E|3e3!g>xrUbSk<3#OZ
z4FBson#A`FeZIQhwk$O<YKgtm@#LHl3QarLzC?nZzC!R-yW@jJX8$%dp2XBidV(`Y
zPr5Pj=q^-Fr8{ixVmHV9ZFy^OYaPZh(A3|Ix}-~=lgSaY=x#+4ZW9A3S?C-Z(f!Hu
zuaE62bwvs|_Cz^(X)f@sElx<bcf4;em_07=tT#@MPvKvERA|^pgh_lASYfZ#1AhEr
zfopK14q<z;JMYH13gGRa`d{xq^mL}{JOC|(jZiaT$u{)r146!=05Kh&nyzD+9+aUI
zd9C!Ud=_<!G-(_;Mw06rU)x8qCR#q-A8a|Yn>N`3C16P&hIYzsscH}2Scu_ANVHdP
zzt;ugvOW+4P4C>p^D2*sW1j20bIyvi<Q{FiQVe$cQli!D5r`%CQk`uy6Yt_hV@7D4
zQj%>Al*rUl7yfe2o)dc{xG`)i;E?%ze-UV34rE1IrBt~T`3^6fb2qV1Rl`pjQf)pp
z0z08cftK7=_D-*>3_mKKx6lXFqX!!<vC{j{YpACVg^C;-Q$yxE847ASb&WA}Yhgjb
z!so2s_Z=sa+wmm08&B7{456c;i5u<#eC079A5yUCiO}cD&U6Oc0H<m^@Hnj5FyZcr
zAF4>*d==*zTuU<%+{lsJ7(WuhT8{6?-O99INTzAdRL+&u%))v0N`Qua)n3CumR1l2
zCO-ISU+f*86W`)8MRlzdWVW|j1p)RWncLygim^JJrraZF9G#{ia?E->^XAbUedoK4
zz3%OX$SE;9a-yUAhZ-kVV#OLy&WS9^Z5gzywPe~Fb14&YK_litHaCQhLSj&YvIwB>
zwF-7Mt;y9SyLfwjUv0v|T)#o}9doLTdM(KbT$8NMP{B`p^&pXcsQ&;XP^jAEd7XP<
z=H(nc8CGI)o=!Z#M-K<GN<9$~jOeaj38FbUqHo5rOl1<zr?n@+UglC8;xcX!@_zL0
zqN$#G`aMM9{hRT|`-d7Rn|X>U8|s;9Dte^z&9AVRjh`z`vy>s?QHYFii{!rF?N)V~
zz`53?3bWZ*{&O>8`r8{|;pDi88I7^p7AuQHwvP#8mp`x^B!z>1bNEE+Ar<x0@k|G~
z4;F0inOUi9nq*4v(qLp&!w5TkO`iobPe`Se`*x5MiFIGQT+BWRqL!jlt>wr{#WdXB
zEUmgY)VzDTjV*B>(~8IX1KMo7{my)23O}ppH&v@EU9XZQTx&DX&%`^ujr+azrr=eI
zAy)jBMpp7FW<_7Ku~RBiIDtb*$Qx2bDaHyNv6@7W@v7R$y1!_-j`P<@RAgv{o1C`J
z-$(sW8ctSnw=$B?W#e|+bkK-lpPVAD0anZOQ<;Xob;Km=wfBZTtcX0KWIcNuyUp_-
z6;avkwA+b891pOHrGgt5Cw?$7?kEi?Gr_l@$K;8|B~FjJgYB!ZBYV4z=4A(2Z$Z4a
z?YMNk-}T!<TRFlL0giZM<E7fFH_J5Q>gjD?cDZOu*|nPCRzJfO{ux44ZnEeGBkM~W
zrJFnIebI*;j~+k&{v}pjoGlcW){LF(a6{5CUfzP{!OC+xj6hGFngFS4Z;u@hV{u>|
zlJ#U%SK3jMR`<J=aiUxv>Olhy{lE_AAfdyAG|AMhw1?zU)Ug()d*okl*DOw~n9sRP
z**v5aw56VuO+XX$7Kaq_XQzT;Xb(8`kBuJsUSrSV95?z@J5;`_<pM-tlE%ArzL+|E
zNa5_hKjQKiO0K9tt32LT&zmvDLeJqn{C){LyzT003=RdPYrQnJX~h%BEQMiP5KZNk
za$r2m&rG_6E!kb%eo{I-Z_k$Zi)sQTgAv0lW~Y25tn|o2&jWAf*X_!=Ir;JTPwvjO
zy)0#nsGzQ>2o%KB75MBl9A5fmP14^JkLv;iXNAiVSC!7)-Eerk%qMV6a8#3;tTfya
z#5x#qc|X2Q;)yI)v`|C4XcPC1Y-26I!8FlN2KZ*LI;#fcK3j`d<mK4GlUxryF*Jn7
zXPF|YWgp3%S4#Hwis#7fuG21w)A{L#Qf|sU172oeSvqr+V};i;Xb8?DTM8a!(zz+p
zBkkw+K;1rhG9!;l9B)IP;iA`51v?aeh0j~ih3P3*_Ke{r%q4V|B_w`1=Q=@Y>?`=d
zIPh66hqKddN2VNQfheNyjEgy1P&!{jz5UbFKV2@S3_-XX8xEy*{jBcA1YQi_eo|J@
zI@sETdRxcrJVx-Pgv-})XhMhxdQmU<h88y>o`>2_#0_I)3}xS!SKhp3Ipk`=tX+7&
z6sjG5*~^!Xhd!R9H5x4EII=@}$7(O(v)=2;;bP@k5U=5F5w@)eMq|b?;BuvB?!lB@
z&=M}c)MGAk=@6yljDj0)q&F*pCda}59ZiFzT?`keP`?F^o8NsSoD5E)#Y6L{WJjaa
zkkHuB2Cp*7gzI4OGg^|sonCuAHmA(9JQ25zXQgP)IfrMl>2^jC65fqSZ!b;o8`BoB
z)D&#pjr#H<XBwQ-{zn&v*k;+;2pun^6%4YcWv`2c9a*(gDh+Y4tkMkB=8(DOvdo$E
z!ldW32JSi;^#Udi-oq$?>2!a4-pPY4P&?8*OvbK%^2yVfj@rxm$n?|6n{}}xh!o+6
z!^`uFC$rTxNEfq}_oyt`ZMInkq52@JbP?Xv<#YDr^sle`7H`^jT_*RW4R^XH!~qZ3
zqkJ^@6@@(=)f4N}j{}gm=wU@gy<n<JcaxyQ@hgr?L7THk>cb2fOX{DY*khr$xgF33
zS*_T@9--LVirHs8dkvFq8$!i3ftG0;&8iH`ygO{jq&A&(46n%`Ygx+T$9nQzyGASV
zxgzq9c%|RKr7E)VDn^$7UhOyz3R+@tl&LA;=HCwKVuXbA3g1!o@GkfHVx`+0PIPmX
zO<ERHEEn)#8TfF@pCw+1y%PWHH=B(3Bj2pqY3Gq+>Akc!uWsO!w)HzCC@Ft1;#!^6
zKU-6lD2SnX{`2*VW)`2IA}LhaIUg*$tdNdk)@b}^v|fZih;tbtgj(I}!qBjq3d-pt
zdM>Cuarssg>q%XXI`)z4<Hcu;sb$is3m5{ET)XdqTQaYDDXaDVaY!3Mt;%Mc)-;uh
z@?;R^R~V9F^>_3t@eG9>qu`XZ+PqCIHLCZx)t8H@mKN>NhbMa#K3u*cw5VRON`)sJ
z&aHd~(Xa0(WZh`7mCSZS6$^mvNpTQRdn3B3uR6bJe@I14k4eG}{Wdu48+&dpr4c<R
zo*FMz9_ip~Lt(v@*3!+Nt3NV6iSs7zel(H6s29q7;+aFwHrB0g4Q29Zu{LBh!k54P
zXk5%ek2NLNCfm?QeP}<*ZyMpS*OQ+CI#<i*px3Gr$Xbj1iyUQ66ymTc#d^1Y;fhC(
z2+DR`^$Bh;z8g!oV8IzsMC`9D%%=q(MVsuejdFZzR@b@1JR<?Jcfar-F=6qB+wOq`
z-^UBH4A-$rafR@DWiKyh#+FPWbiV<TwGx~0tzNF+TRmm=Bp0XX*w!c#1s|usC8SuH
zY@0+|=as@pEASAGBYhztn(%PQ_4MF1t_n@M`C^ED`KK!z0!;d!dU9#1T%NCQv#itq
zv{{<9JUh8#wU~PvB3X{vUVFH6)O<FjQ6dmmXPhA8HB_~G-KJx=T}_)bKI{M~=BBp^
zQ&91DW5j%gFZ$U`No5=c)NV0ZSaG#3Fdp2}rbzfOZ7Y$)Wan-6I?QK+ruB13+rh?t
zQt5N;4Zpy2_S-b(rp!vqA;L!qa;iOS(tMftxZ*vtzUSq>)g;#n{KD=?Ny?Y%cE&Ge
zjjXoUVz9hgn;3C+EADu<Wr0=O0nwbwC8=%z{bfX3=m~t<h3DhUP3cSMnO!&+P%rxu
z1Vp_P<9tqj_h;%FlC$-Y?s>cuj~yXa@9Qg^!48R`H+h5#tALk5#oShlietjLJl;y4
zfEk8W>V0OoMd6Bje2-AHwxsiyRnoMq_mu_=VktFMX2K)8&!CgAeE~^PB@beH$KJNd
zqjuC7JCEw>FS%W*$)^d4F&r!0KX2gNNy}wes*IWJwq@vUm96!p<8$vpt<`*j_|PY>
zR%eu`;Y3c2`c7ImZZ-OE^&ua+PCvJaaeNnTSs4*_`ud@JIctc8G#%*hw8`*Nc}V|V
zcDZ2XIByn?p=qRQB%Wbh@O=;MQ|hM&=kuF{)3)wpnDF^skSoDOZ6j+(l~_bFm;=S_
zO5gO8sKqQ#^5@_4X0ohA?YcDWtZ(o#qv6wJIgBPe@mikL{2*{?75_BL+0NOJ@fr+0
zDXh?3u&vfsjD}`dTaJa-<`QR1LGzvhEOLdq!?NRcX46S$O6mj{%+44Rnj0!mnNU0N
zFdI?Wo^MffunT^!u9Ylj<Dtod@w7mlP@UE~_C2{lt@x+VZ*HaRbSR_Jw93=kn-(>#
zoll*e$I3V7cG%fua41Th_mLqRHO$8J-%^OWlq8JOS>FxDxn2Lzn<hF+%tx}uMifme
zecQ?K?rn*80DM$7&Ehbzb2GkWbe{KnfBFG&{mgMQPhv#Lr~zPIK>)Mj9vEqG*>_Zm
zcj?C3R}JF^>$bZ&jEjeIFgQ0qYm^*1q#tQ{dX5g?CiioT9oAe-->{Zz-2k^D81GMW
zoR~&8=MVV!j(8KYqMs*wNr3ac`bnuL&&0RSw^|LovL-ueoMchB9uhR=5TXaDq91>I
zxJz(f@LA9kp80$9gb4yY+>Z?o?CYY2y}%Dkk<)E=eSw5R8guGZ@E~FOI5Fw8rQ3Nn
zIiB~Q0kqs}A{`qAa|wg7)rOidBR{gzmix=lvf&zySEzT}tB$J}BvT&<4VGz8T$D8(
zvEjzk?)2fpzBZB9#B+tyc;%kyAWoc$QyU6T7Hj9=*4x3S!@)Tl738ePUSOk@UP<oT
zJBZjnTcsHHtcY$M4+8|Tit3jlZoPKQ-KwN^cFSXSMEq|i!$Xu(SMsxy#xX;t4U+NG
zntXTl$_!F1{wQhozQE3^q_#WqJb^Jr4~c?r31E#|W1`!XX{%2;*9#Cef!goG$aqMs
zkjWQ?anU%d#tt%SQmr{+FZ_o_%wIRCTzEQWbi#<W=GEF-coP^Lqc~E+ZWvh1cGw5I
z<T5@S#xhkKx}f$Zu&FG`oW?==T*vt&lbz|Z^Auz^GOW2eYDw+Iwm&yybHosboYY(9
zZw$$6J5;l?!E6V(fr}!vcS3)4=c*zhYe9=*&UR5hpsViAt?Rb3Q#KbTzexRA5NtyJ
zkfqpuU=^ose?rWb?>!E#F!yMV#+XMR*bjN;?<iV9|KXEKSp?aN*<CvxI#S*)y81Tg
zzve?@jXZHT&LhSNxMq(6KGrbS_AuKBo;dX};gI&}B8f^hBJF~hr`hdBr9A|1D`yGf
zE1r4d@Y>2^y`7knqFt8Y;A)9YO((*CxaOx6C_d_u$iyENKpuB<Zn4Keg=%Ro)-Ffq
zCTdaCQ>%)vt_(G0d|*V&JxUx~Ut{u3KH~PqrEJ7p4;bOr^MOvXHz|b(zvgeY1}*^_
z_S{OkZRO^bM9v>lbv$ZOwPcoA>~caa0V7pUzuXP3B(P<FlIftZ`Tx=N=HXPgUHdrO
zmZ{jLB=bCmka?cREutu7Oft01V`UpMl?Wws#z-QOF*6}U$dt5^j2S93`K?{|^Ssad
zyua^p?86^!x<5YGb**cybDirvt&6MS_%}qKPmK_6bW7*XYrr$c(Y-zv0Uc&-KieI~
zPUeXzjRhPFdo^}bd*jVoC->=5cIDtSFEkY;qd52T0L7rKU#?TLgLm3DWBUWPWb+x1
zhu@GPo}(2OskMv`^$~aO@J!Sd5j|0VUTh;yFSnDT;wUuazJGeJ{K>R>*Ez0>-H~CX
z^w=ACEsM(gDHHz^P})GQ81kKzpx#E6ZdTs-a=qXt^fSyce4A#2+eFN^?v01Oe}o^m
zN5%XSL9JUlkCuulzk2Aq$UMWKcV~u!FIkvO2wjzJ^B%!6@^*N%9Zvg3vCWtZn=VqR
zeyFpi?1vICv!tb~DOrFlH^q@<gHy4;sp13jSuTq2<^q2KPcgt*e(YTgxn<fe%a03L
zoktyxk1#dM7{Lrb_o+MM8m-N;BIYFQpHD@4uV88@yhzV<n~hmVt}lO8s+OrP6#W+$
zK;e_~U~++wd-yA5KQ}Srqu*MGZs<GGR(RSwC1O)}Wh0<&<fS(BWXN(USW4IYHO88U
z^-4GQ^1kaVr>~`Yy`pqmW8iLRyc)MV4>BZcFOv5yNLJ+DnF4iPGzH(_gZWCSS^KR{
zl%tpb<O7?N$B&#=3KP*=&|`gHt%T0jJgh#N7JN7DEsvs>@T(ds%AdnMc4~~M(S7KB
zvD)4pLQYY;;_rsB3cKHnG5A*Jr!|U#Ac|#LPQEXM-9AmH3;U{tNosx!`W&gV7Y!6n
ze7Qw&Zpx8iXTjU!>0Oh}C=gh|Y0$@tifp=_$7u`l3)9Hj&e0@bCdi(W7A8M5IG&BQ
ziHe}S65PZc+m&@ECYFqnLeqC@n_GigWPPrm^L@3)Z2g6RU(bOXZ?WWEr?jV$m1wRh
z(bY1AdU}G858aV0ZXWg1Y&{(rOS8|Os+Rv592Vy7E~ffcgmkw_JByusEz19v+`30s
z`Q$h$Qiv4&u;BR5W!q_nWv!!{Eu+p2;qtO2yVf3BisyLY*$MiWlQ7z;xU4f7lcMub
zy!ZB^4NZ@OR2ELD+GLJ%CZ<-@cl|AqsilZ@N5t*6<evY(6i=mrmeecy*_`BA*O;;o
z)KxoivuAQxq@r8!rC+8OB?*JfJ=&A?iJQx#{k!EoilZX$u2HNimwKDmzx3ezjLu4O
z_!(VE8C1cWk~-nu3H`T>ax#q|OlEm&^6Azo(rDV9^|uGm(sp$17~h^AAWOU0ym@+b
znop{6MUu2sUAUSQX*8C|n{n>gl{Okz5;v9bVE}jPf8VQaldj^wx9v6YuHysn8|#zC
zf5ke{tCEJk;KwHPd|-c{{P+u%RJtXJ=Yqr?CKONkZ~A-LY3|l4OY1X|lsDuIts|+T
zYAutI+$+0{wx3{TO3|;zrj0Dci_ts0fK)NGIhfcl_-ZwaG>L)d7d*rJ$`G1ng<<@s
z?nti3LC?9dH`761{DygZpXuKV2Ax=G((rLwPNKVPD~=d~Day9VsLI1AIrBctwr{;Y
z&9&oiLR808ojSr~8ICyG>|Ta#%v}Ap_}(yam1Rs#gyCwYJa{qq)`foggja8AN~xiA
z_WQmc*^3Pj2_2Vzcu&;pY|GT0<8|5(<;-}c5yp2aij04X+QTsJ_whk9sFVdrZG`S4
zf)92!7%$Z@*QaP?PqaUJAnJbgL%O8fqtC)i>JH)WLPW2h)VJUkOqzJKC+iro&~lOo
zM;_Kgf#BkmBsE%~I^sa)uo!QB5ot4cSN#PvdNOyA_AyWOER)LZO*(T<9^avArb}e;
zY%HKJZ~Hu^lyT}J>j&DC#m4RmQ8ni24&qU<wDynnG31FuPojcDLxoP#>H5u)XA&|$
zJ0BMrZEjAN3A$0ETD+xO$j6fA_mqoo@7Yj{crM1?InjSIPS^NRvh$KtLMm<ATd#!W
z;bZ;BbA+t(JXvBT-S9I_=kPO46H&#=@b|)JQ!m7jP+sxD9l5h2J_cvRW)EY+XpON=
zVcT~Nr9Tf}5mku~vkCVr#1E+Ywi1rgqxm8=2k#R|abnC)y!Csi_f-F}h=u#}NeaL5
zG9v!lS7f-uDLlpe*%W}nRgt<6Dr*6JWnoOrucNnh<Yf~2o6~H{ut*8z8SsCKu~p}P
zZ*|vC&g3rMXmD03Pu%7L?<tW;f=jj6rOL)gxnwr_>Ku~1gtt{OiAI6sjTs{4W)CG@
zoNrTk<a2%x7DJVL2r8c{x&MLnF|B*NeB?LR4~%wPnTjmgx)S0?m!y`~+>;3S-_$J%
zLXW|YI%<NOQqG%x`FFCps29aI!)BXXL7<^0a@~vCjHF$_G|p=u2IH7ky<l=KATl*K
zbeCjP>{3;Pu4U&o+n2|>>H9JoJ<NiqRJV662{8O})qa{msJ1ZyH8<+U{`6ZXDLW?w
zteq)ltWHtea<Vt}=B22qxRR{V{`&9$u;eNIjb#0Si(7RvQEc=uI=B`7#kkp;W$~Ju
zt->h#vw7L_ZSoYNx{d%KDl&?szmH7`;`3M~t4i1v{%I%Q0ObD~g)<Q~yy%4s{k-Bd
zJ9OU|FP!-U@Ss9)zO(a|n(g;eEjSRmq!^k6O<nnO^rJa#*Ivb5??87Y=dA|!th<tY
zpV>W1S9wkB*1W%N7y2$%v-{kLdFwg=yT=lBMje#=&rm$B*7WEFUzPWu(rb(<XNX-+
z$1DFbc(t_VJ^tcz@<y0z3cG~BtOns>foN1R`5Drlq7NYSR~)Z`;fjwCXi+|E+Ak_&
zCX;mI)t@3)C4*L{F2VA7)mP@bQJ=)#%^6Q0hKDJ8IKM(QaC?m9pQs<&P-?|eOx?`%
zpCK#sLh%)id&dCrHQHhSmrR;Pv!vzZV<I7$S|t!gXXIZ=#Qlz}zdCWaH59w=n1fWR
zW+_+goX!-(8VyvQ8+!)qbe?6a2kNrS@nm<Gcd2UotI?l6^%J_;OOPf|A2AtIaFziR
zGl)qEc;KPVQ}s35jUE#o`Mp1uy=9r$->HV1J!7A%UG`S)SMH~eGnG-EKE-lU%I}h%
z<kShE@^z}6t6N>t<*F~a^i<`uBHa?^BP!|>UFpXf(}#qd-@oB!tYUsx#ZuTl4OP#>
zvl1`*R1G|CePg`bALW5r`#Gc+Av+eol)R>S6z^=VtaB&k`gmq!+0A4Z_Ufv6`jVe-
zK<+hH?+PxA;26hoMkZGVr6aYgIQ2T}5{;VqrWn~mmMRza7Ta{Pp<_X;M8}S=Pm1S;
z21t9QfFd*hD-fFRA$}j`kH#A|`82p#>vPE8dZe#_mYwKeUIEtnFUqeKZ(0=mVIFWw
zHTc_59)(wQ99}MIULxaBZuh*Fiiqp@q#p13hsBvm*2`8rh)OLS1b}sBBbWzmxV)UY
zcfkQ6+*i)O56lpz;fZ>0JR#m9agI}(%XpK2K$bBkr(qyrnOotpFXr^gZ=?@ob{Tx_
z9(J8w3gB%LW|a^T&K87fHAWv;Rx>}$zW<FyphJOhZda@nnPE9=r|~FOS82BCE>+qo
zupf$hahHgPd4%*BlV}ZVa5gKN&FiwFh>Y&dx=nUd+pzHa{NakOD(sTw?TW*_TB-UW
zt@0Te4=0o`^t|iSZ^8a(vrRYs-j?)e=q(N((<D-2lFR8)2bhe7Yl#WH9X3d50$sia
zV6Jv>7;MTKU=%s?N#e9rw>eBw#z_WIHxf!YUr>0N=e9^ad+6ITw;CS%IwSM_NiUuk
zSLc2|2oX}u5&c$9tg<P5YCwVuk*o#<hPwRnq2I^k7#fl-pw!pev@Xz?m)*Xn)573h
zHo~^u(5--c{Me@2l4_Tk1LZPgj3wwm(_g+$!nSH8cj;Spa~ZLc43)`US5n&+7Nimw
zDpdG2$#!=QG*b1Qs}k$*T_^Lpb#JC)ia;c6rALGKCuMs0%r`f1sNH^LS-p23#r#3)
z))p0IA_uat)#kE9BTLkGU6yZ=`fcq_!;wTuE=tQ+nRu!=U0MN<yj1pu+AJwE#0C9r
z)Z~Gh3t8}>vU)853!s{z0tQqz95wNx+Ig>9E@obcnj=lxd-h2-x~m7l$?Cc|Yu(ZP
z;o|2D$R%vZne}PMy9|CgQw$|bwg(@z{}hQZF=}&DZNm!PZTs8j*4^qt*3fppK@RN<
zlepO?-a)u!zGg(BQ*%IvS2e0xK0ZTEVD)z}y8FzaAzw6J?V2{dI@DC(^<wx2Za6?}
zQ>}~=E%}K_aZ76iRIz6bIIhKZ$waeET5{%}?$cBqy;ygMpG%mZSc(*C$M*2>_}?7N
zqTgpZWlY7CS&i@tWSZxcpE0TDVRULdzBW=l;YlE3*hGadp&NdRXZlGMhC;Br%Fp6^
z`W=U3>!7D1=$W5bvk9aUBDikGfsm!q;avH!ws6$s)9VK+vts?{@=;h60{eiaDvIAA
zYWgaJff0R4tfVoieK3j3k;F*vEaj)#wlWX9Ic|fMngV?>M~dK-AhDOP_NnXH7*C`e
z)tDsKOw8#~xX3$jl4_sI<W6}hCK{90FV6Zt5%r0i%rXRHA4C34Au4N!VdJ9t*w4#I
zV;~=Q@yAt@L>4|1M&y~PPqIpV*{TraUd-yId#sW)P&h)>itK`Z@8rBe<=?ruVOc`7
zfi-ArTZ|OymX}@EW3^8A&{@p?!&AbGuLhp{7laf@HkNZ2SEL?9l+7AZm%cKG6id@^
z4p&Ow`3^lgJg4_P&;-(#2tC~Vr>DYF5PYm?9zHTDG=@_zbv0IQTm|nX_Cztr_u&=E
zBn}p$!lGOgbV6V1g_+yz1ga`?B$Tg7uY{(<=%JfV2I?L@IV@w;oFGdN(8;X?7*@OL
zv7D|kHQlSR!k!vsO4(-d>N(7-PY=qHn2T)M>8?%6j6dKf-?%TdGR(O<`K6MO-i$2`
zt&m&wQ)T`1WSs6awL@1?(5H5REZ@<el{CA<KNp^_CF`*lmGSZCln%d5Dz+k3-kqup
zd#$m1%?cMpV;sZsz8}MLoR#JlxRlg;8`xrZc;1tB%qhL|WBC$bJYCzDO!Kjn$aVl&
zGz+pvHQ~L`baL-t?lUV?Z0vWcr*@b|G13`dVEfsCcWlVLamBSaQf3BoU0M)h#F<F2
z_^BndhAc;L@*we8kz1mS3vZF=g4E(;KM!A7)$vUU>9^2~<+|DPbiVp^o$ka7@dyr_
z9JfmE=8siN8tj8X61s1;X87wKJ|Fc7C<Zyy@1C-@ZsG0VAC%4L>M}$s9xQiDUumEl
z1|d1D8lF!SFKPBy4=;o?DNZG&y0cQpLX;xO_80Zr`n{#+nb7ryA)Itsuliq<NX~nB
zD(a^^JEEvoYW%g;N3=+@gIUA(p`It6Ae;0=tyPC^OAlU6xbYlD%ZolGj1WG$t^D)7
zM5Y+Opw5-1Yb16|Q?lbt6|x@ub0B#6t5{VMRO|D`?q}t(gma|vMm-)2P7`<38~HZ2
zKHbVR$XZ7B?(n-xN;pm^E6Lfz?KVha)Q6o@W!*XlO}JlsiNlOB%N|U)b*h<X_!(ob
z_PLBKZx<O74dNEA$p}os<r_u$GqpCDunCYsX6i_rmvW~Stump%&W_shOGUg*D^~v2
z!uw}-{H>SO`gY%uqW9tPNR9R*l#2QCo|0-FHe0s)iRe8JpB9#FwM$|Q<)5u)+UkZG
zKR$v9(Q`dy57s_C!{je`MyH1Pu;-|9tK+3DrkI;4+i<Fn8JX=-lEk^^5p3!AbuMiJ
zS(Wh=GK?<wy=K_79);ZP1OEy@PvniE9=VxWhfa~tyYnH0kVv60o!cuBcOSDS3zR5{
z)m}?JhD+)uPtV>wepgXhJ?S=hI%q2~fLQwjyol&Z7OBaSXS|BB1=3QjKM>^ur`n?b
z>1D1zGG6CNkpAOM<h0NItS_QA%ES!2h5l%*bMpXRVq$_fO8T>BlGVX=Z1!?WiJym2
zk#;Z7x`&(2A@B*U^@yc7@lGJJ#2i&!(mrSHcqcD~I+wMaO81)zz|Wya8&Qjzmo>%f
zk=jy4>RGBbq6p93oSGDdd4A&d{-Cw`fS<>iWbU6n=Q+|l?|QeKa10D12KHym)GwbM
zJfAmy9I}MQdQ+$L)%jYyTiPtED~dxa=T;uE3rO7Z-Ss546Jfq51!&uVFXv1ExDLIC
z&h(<2@VJ|1{1(tQf=X2ZYH67EmPN2<3QWd=6P}~gl)t^v7;Umi%SVFMO&sWH^kv9L
z#`&p|RQCx|(C#8NiB&IE$+hOL#;ANC3x{S!kW-=jpk=orKtnv$Ew~oi`2mKK<N%7k
z=-|j&1fc|4%$Hr*H`?*QvsBi+j1noBXGN}+-byYK8kG@-oi!X-6x(6^(nFdR)~wL>
zEi5X0?XwMThqnGwdQ;@CQPLGkA1}5x{<rI7B%RqrxjnK`yAg`pb+}Lpg^e<n_gBEt
zt9oFUY7Xz!;RS5?R49!U3%~e$t9?Hkij|4k<!fR4HtV5!2bG)9B6R9~{&!hd%NVn7
z&M*PtH*$vBdA#2n#N>NHEtYOn;g<C=gspSkja_+2p~Mxg*XIAzPx>n#^A(?21(PiW
z{)5x<D-s5I+<|%G%=H!}x4!=IXC_H;A^NzmPX;OM1*gt%`Lw}S$l_6&wvN)1sqFn0
zM;S?XSBVJ=+OwO?J~|B4bc1E%ZP5qMni8VCSzNpE;@sBV6dI~|KX7fqrec}2d}v_<
zZBzX&vTCV^t{$pqZVDzdSA$qyhkqRBRxp{^fh!36{+e70efp(&y|d%o{DwZ)bv!%M
zu91lPn>84$9OHXj7DaQZEnj0Mh=!v?h{weI#)aFgeAQK&XN^M*khOWO<nUL^C(q+&
z+wzOakIm!)ceZj`+XFaNid8+vWlBn|`7c2p?S%3*O10vP$~t`VfV1B7#k?Eyb$>n2
z{{AX{v`}<k#k`43RUDCFN#v1~@xlg7H>6}XcA70u@o|2$RsA5}KL@hSE+Cjsy2jGQ
zbIM7{DK^K-x#u7#BH!Qcw)Tz78rSj4r3)f`x~Y-#dq}#Y+Ca~r0(DK=S6{W?$@U@n
z=e^^vR_p{Gx%{i*fj^4pKOrAR1uZqB>rJ%BE5{=~QN$kPAD5y8>J+;57)hVJ>orWy
zlv2KBx75ew1{qs)H_fN-g(wAJQBeH^oQFO~@T0L7Ob}L0?DNm!7Dmpgv-dq|u6!!=
z)casPu`j{Y&=$Mg2Ne1b?A}@mKS3)xs7T!7dGju}ASS9=fLGX(Qo+~atq^qyQd3E0
zHt}xE>kQwv{DqtYEhe`l4(4Vn6!kqJysLv3UYQ{Hs~(FlRLOyeCpPSNRd;5!hqoiv
zp3y$~P0PprE#l8DsEFC6D2VgfY@fTK&Z8s*U6cXeM)E0Arwka}7cTte)*NUlFsO5f
z=@I2|Ra9xXj6AO{aYy|)I&oRA`R)a-8Og^s@mn*K!v%&+Mp5^f?8Hz9_zC5WMjJt6
zDTHmyi_72%p@E}42;ID68+s&o<T#$`_1XNN6cwMgWb5Jeh4P6Cul<wl@L=<1Sf%pl
z<~VRPJ9tkAyuzHqi9su&Py$NjytSx_9lcNIp%oC9O^b8xGP|}blVJ4l#C-S4lgm7a
zIX=|fMC;oea2k9-J!6*s09<cbEXZ=G2l*oMU%VF8x~F!!2;#cvE9*<eyraZ20y8;7
z<@Va(=E5uiQ*wW#JaS0S>YSx#$g_Xdu8UV72fIvkM}GAj&YSkQMkCDXn~$79jk8yg
zb>K|~z@RkmE?dN{OE`ltd+rwSLZ-UqyP<JCqQYJbKS)zzTKK%Q_J&sR`Hm<$J0zuw
z45215<->K3GYp`6t(gA;`h4}<uIp@Gr!$kENxl^EYg41hv^uWK(K#6_tLPQ<+c!L4
zor{>Es5%P9>z{!9vFNNHa#|UE-`1Xf`(W|sAc~ZBcX-hDG{_+S6kZhhdvpBr_568+
zG3OqC>~yG<TJpcu8{25gq_b99Z<=PWt(xD{6osw+f%?+-6EZLEz<E!B2Nv|@k}OGd
z#g(6}hRtwFilOqXY2|~kifLoctjqFet*>wuPML}mKKAF^*>|{Ot<Ur(|1?9o3udsn
zycDH+x(II2ep<~oV!PPv>!co}?q_PW245Ffzhr-Pq622Orf^MRbv;mHEs^=c)m(Vb
z^B@@1#(QsY7PX;@kwlar6m-4yLCae{m>Z&p(biA?bCzC2gBlU}s_J=R?|4t)twY9P
zFj@TeU8b*iyOni`Ho2%&(sXQ&NmxD$Rrp14M##VqcGpYZZ2n~<Lc+GB?QoI5P%qrZ
zb|$0xZT#uue$Q;!hnIjL($T;2%f(}J>E{ppyBtWpoRaou-?lo=pL|IiH5VZ1bLl^W
zLm)4RHHXU1SL&Mg8{ORl_a}Ua&_!~u7Ug}2a*tiklj8H%oy$fMl49&K68G~ygC4VO
z?!#WCLF+Q9w=h^vZg(FL=F?9<d2SSKG+){#*K69WbBOh`oMEkD|0Z=DFNlp-_QD74
ziJPDksV<kt^T?dD^x1aJnRVICVIYEo4oI}Eo2{W`1P~OZt6y`InW$O8XrG1Vn@Lao
zu!>$tujWhc_FoP9ey)(z30U(MIR&5(H-%?xpYY*WuUzIO1}Z-hQ}>v24+b@p`s|k}
z39ElLXnkm(USiK5gNd8;4<v&x5`KStfJ=R<&z%%*KPVn$k9qI>At!=k?jmS!SBT8*
z*1>X9aNfQacWm_aS>Vr!@|3c`TtuA(`1gpnImsO0e&JQVZ3S4J11gs<fGE|$ywMpT
z@BW>(_!Dr2@k5(W|M>>Co7X1mbdr~m<AYH!R!B<D#J(m@SGgD!Vi@xJL>~Ec4PHn>
zk&r|{PzKwF7rnlh{gDG(?XBh$rD1gvYl@t)40>iTlWD3$%fJezjx}(lzlTft@y_YV
z#ai#h8M2_uZ`?C><j5xwSuj@o1iyv6lBNl+&?HdVU7m0&9E`a^idF_icjw|N6Ms;S
zIF?{!J|@r!Bl?NQd5^LhBnTUCEPXl}Jui8k^}4@~fEPKVV8czp6X6=+IdtNMsu5F<
z^-MUT-<6zy=+GU1hcQ><?sd*&UV2^U^GD0p{dIRh2EF7~pVB<wFs!R<K0X1NU&ez@
zWL5T!FQ@yw+^48gzk6JumZjxtUhFFhYrKqN4K%!ya%l#`)T+ai%!?3<9k}Dy+acE%
zgv{HZnpoiN%1km0-FCyUN^B`se*iCqaF+Isw2BPzHf!F_V7t~uM!=p>G6Hj_G1%e@
zg>lb!!)nDM+2dbXCK+rlQyNyE?9;!uj>32jkXO8mL}abMSFTDz>V*8yCx-MquC+@A
z!Tsmc%xiPRLCxp5(ND9RJ!kt{_3YT!3YUyU(+%iO09#$8t5ju56TywrN1!>`YIoW^
z<;sQ_QDSaQOD;T#5}cE&r>3Y?mUyGI*lo<~d|#AYuGEu#sgmviDwYD;a-{sY^Ygs)
zk1zCiWBXyNIxY-6UBWY>p1)4naU##dE`5Cdn1&(W^NmL_lXF#k>PNSF=$?538c<lz
zOeqH2{cY_rHFna_6+%_9aE^<o#XnDXx?M3C#eM>&bg6Y9jFp)Mo()==YPfQXY8NjD
zn6)Ws?XT3lJCApOkZUlj-D)MIUNvwTDM`5Tv6xY`bAKNajh^cU30m|yWywO9x<TN}
z0m4@@$n046o_PP+l+Ya$CumjcUHk31q%&0g#-qSlIwR0~Wdh-^+Ds-B9o(`Mx)$5i
znJJB8quaZNWNRYM8g(Tgi~sjW<sH0d+E;}3Ub=r?zMW>1ckN{6VB}1u4VonVVO7?t
z?IlSMv7nxd2?S(cV?&r29tMBx_0(l|MS6Ftzy9(XJt*x|^N9SD4qw$JhMklS+7>#Q
z#uDp>%Yqk6_~EfHBDkVtJjwEG6v1nG@Fd|VQ{I3aC_@Zc$5H^Lo=Ua*QuS_%hVU3$
zBx$}zG)<=bA;-s)-J(~jO6;$I6S;n~4@y#ocyAq&VzSBb$!h+m%)|Qt$$*qMsoQ$v
z84g-&66GYP8YU%BLDVz1C$CnU_Q@0Sih-BW1&6UPTn}zsog4GOs?CG%b>}a@hGmEv
zLFrVQRa`SHvZ|}1QMFZaq(RX%f2N(eQ%!TWMVmeO9m(Zn)3y8An~xA#+4o65ssFPV
z|L#Z;cyF@^8c_~H8g}?#5vT#_;&pY7c17Y-QNqQ;Lqd3^ydO~aIU3X4grAC%CzN*Q
z_Hb(Zok-BGxRF6r+eEVX{Md=pUzt#OjZ09>ncPC<!eEz~oZo6mdd#VDY8AmDh|pHi
znSy4;N$gSBWlE$B<O-g`%taOPfiG0Uj_E}aT`FvkWe|$w$P?42cV~Y5YMFj520_SI
zsQK$V$Ow|&f5*25nn6SpuViz=jOeBiD@6af{2XK{Bg3^`*El|P*8bfAI_#y$m45pS
zRr51Vv*M4we|iHtTi>yC^g2aNfBsSfvIR~yB_eU2cgwj3P28(z-AV^*_);rW7<OPp
z-g0kyZOz4LS_bqL*7zT@%haXAgzexnV=mpwN86QH>QdMFz6?;Bd-D!@whC)=U}hr(
z)Ry&n%af;A5JehFwjPl}QB9rPd(nNbZH)diCg`vqAy<?=EtErzafx?UOR)ch3lnP0
ztHTb($*~i^CmE$&%?B4ZzyO;99Uy`KZFVNao|cEXRNSjYp*0S|H%R_f4}HPf(M3cS
z$cgH}{Aund+pYIHILL+{<Fy3TNSrHxp~V9v?E8Fk<?+{Pbv7FmkE4|WV1J`eUck|Q
zmnII<{*Ubte+NOg^EK<N3;qS+cY^_Giq!L!?X}N!(up(vZiJ-oGZjBw?|BR8y};GC
z!GxFF1mSJBpqrAasF}2n(?kgo#stmC{fItP0ssw3DR*%acJAQItI936e%*NgFD^hd
z2cZl<-k<6R2sU1u+z0)GcOY8G1(Ay30DZaKuKivQ?!*R9eD@0>SS0e`8Tp&zUVk3x
zp9e$HW;gM*g>@Dr{s{pZaVl~iMgw=)?@fj{j6VfiGkvhV&+y)c(J^7X8SX37DJ=-A
zeuzDZP!T1MEb5K#jRI%LhdkT8l-OB7<gX$|j*6k3kmy(&?XjR2q$OA0pm7Uf{o#Pg
zTpW3MHA~|g1iQx|K6Me41q+ah+RPrv#RQcR<lwDHsvm?I^%TJdOjO?MU4f+=TL1B+
zgf!=o{Oi*i|9Yc16lq<%A4ccoTOL3J-3G~KVDP_-)=0s4r=y%0&g4^`Yzij+A7tOp
zznZq75v#E{!KxtBC2Mr`!bZbGp7oF~zF-sc4y;x#40nSES`VO1L{m4U`i&5%sh4ma
zY^{uQfm~HGAaoigvZXuYQX}*%2;S+Uk9F*YV+Booa$fO@|D5(eFEfgokUZE<a%@HL
z&w*0OCd-rC?KCV>p~@)txO(TXG9A-GM)vZY>)3fvX<~cZSJn=XiRHkJ)NqTPxTdbt
zWRKYTB9<;?D6qP5DljCL3Cxn4Dm!DqeBvG0wJ_pNRyf0N81uyIeWe{<`w81`?4$Ri
zWNMb+oV}AajsJNx4mee?53c4}Q2leb^LTMzr)Y=Sh-5Pt$zT9f6SNm=awq^)Jp;Ba
zdUv>9{@JH=Nbh+pd)D!Jm4_jZk-vH`6wLj2=q}Tvwl@U8%-d%hX2qrnApAA>lqbEl
z>tdqH0$I4>S9Pq>LLoE23x7i){wqow!Ha#cc+<n7Rq5~6iVwMpOrH#@c3vJO{aTm_
ziP{x6IRF$dfbX(+Gu2y&TKz_j!pU$zrH}?`+H_$Z*LLhb8Q*&0kowcWmZ5R+ESPXC
zf)bKiCkO&@ffB)ew&PnV@bf5S3-^z%=S_Uc7anwt^PzG(wf^TugMF<+ioeuCRqbg1
zhBQv2rms`#9!6gyGg!Fs$FqcNK}3l^iZiRk<3~pbZFV&uj>xp>+ZppRm3YKQR%k8M
z;3pD^UFo0m{)yZ}a=mtTj>Y5%F7n_|F>=@cjG+W%4mu%-tmD`X&wuvsdP+!Q&;x<*
z;hkm`GJgX?JPk&Ug*DGGH$KO)f=H+7cU^=of#G~60cEO@;Y35w8#<wj!bMdezonC}
zmmGY9_&I*7VKor89ve@aA|)RGPi6$muBOxUHL8owii254>dBw~!u}d*u&U^7OZpx2
zY8t;FI10kT`&>%O!Oi`8!Q9#XK2`*V9uFRbQ4bVPRJjS=1b1My<<-Wc+cjX#L^NS1
z6$>wx5@`}09Lo8w?|D-vrOMUp32E-^Nlo*kbB1x?qL-g8QM7u$)xn-kx9J37y2x=g
zh-&hry*5m0X!g%u9HgITD+EY$o224ZyvfYdVzw?|=yo*x>c=Jiz5^R-v6^j+mz24E
z+emzfrVZH7z4DmVD!TMCa|JlTNOiE2z$N|0_9ui=onXRIW4()N_*XYbz2C*+t>nLg
z79Dz_9jXOkyu|LippI{_)54#GBwi5Rqf4s1Y)!0GBz4N9DC@`Bg+(a~FKULT4U66V
z+F3K^*QyZoFNt}^tGGU<%;mn9>7(9)_lEm|B;T<7k&9Lix4Qi+>}L|Po;{?JXk<90
z0yNStYMS6?i7g(f{<F;&$re@>3trPM>QB?d8{XwD19vhow0l}ypqDI&kI3ENca^nv
z!$OHHj~n+|y2Xi%4{b`Cr`Ma`f2VK%*}cGThdV#}mGi+PN>_SyqEa6(j`N=4E6bk+
zqsC^}?a&$Eh~A?0d>)LEyh<^UI`P?Zxn0XiYUZ(PJOB6;!@FjO+<%TeI|af$bIpP-
z4WqBLyl(eBq4dtK>z&%#*584GB)w(Z_rLS~6Z5lCVi^lVLuh3F3s(}dwhR?8xpDaM
zwTtXj*;8|<QmS)pk0^9hj;8Jo;VvTQ9Q+A4%9i432S#<b&OQ8nx?Pcj&6O$@)bs8z
zLi1HkSKfCeex^M(=)7F)pCGbAh8*`O&qlDZS5hdMkQok=sSh?}9<FboX`(2x*{&(a
z@>xDGj{9$MAJ5viy>74IL=|L+xQ2+htrT$hq?zrm?9jhb4`rnHHmj1?&(<gT4?`G`
z#TQ=)E|0!6;wLXsjz@&6A|g0#^sYKE?5=#Sdvo5<zJWVlf{<^UH73YYi>V7yb@NMU
z`X-3k7VT}re5ek{v~iAK@{NI=ZHY9g*`eDDF@jCv>v(}T)<imu$=%AS<ke&nW2tld
z$g%q%tslCze~8g8``J_ZU6qNVDiVItSY?@kYI!`p-^pF6XDgO=nS-#Alx{Ef?$tvF
zMf|$q&zdyW)S2(VU4Lu|9Fx<8t!hod(m)7fYyrie>r}l3KF<{}&IuoWape*?wsO%6
znvz}_9h)Mc8LRflH7zz3C~^det%Nuh`D116MQLr@Mhka^9mBo{@$IbMb&92@i_>x`
znH^!e<$fiIs#Ybn(W1@mn#?xCeka~92(O+=src)B=%1L;gu>A5(gnWwR@X$r<TXa{
zT4<0+LXY}Jlx|tG;Rn(anDAOPj|9IkMu$fS2VKGGUBX`1>cikqO;F)Q1ij+=yK4w(
zFxu>96J&loZOg?x`j6g0p7xx%e!f3TwCAy?AF&OHu;a71>e@#P2_dvtXnmREg-d^V
znj@8hk2HeZmz`?|uGH>m1ihafkl(MbD*TdD%}1Hruk;WfqJNR*d2R@Ny}1@ZyEArY
zy-fMt1uW^0vbu2@Kd<`@51v%NcDU>-B3$v=)OWeq;pUYppQraMS9>F<)_DH?fV{Z*
z)b{>baf>&O^2)C)^kqMu^IylgrZw+-a%Uf%?=OEx&3_Yr3a}y6*+E82gcg-IlHX^@
zUt%v&DUJXpzS1)F*RJ0sA2vFUxYl9jO>cYkbD}@e4A;7CR9{f4OWM3fvyn<<yz-)}
zLHi4>@att&x|#~3zstVMWpk%$+(qE)_Q!g%-~rX=t-e?)4MzX$LK>&)@e&WO6BD>=
z5*=<9yJ@F|$8REbQ6vF10*@bT<=d_J-MalidKq5z`eT)W%;&1tAEa&SywwsQJ?y{q
z#ax68nGkYYJVDv|m`BnWFFty{izE0PL{sw}>PIqvu+jG7SqK2~JQ6|t(hfW|#lhR-
zWsJS-S?-WsiQt{}yi8>PZ#*I6p-UqDqD<V*meT#5NlS*fl!AjiKwLpAek^_fCC0K>
zSj{Kd+IbhhBm`tKC{nxB(t7gX5^qa|%I<9>S$qgBVN7g&8k2R*`+Ap;jrxLl)y>Oq
zcg$Ed8E+?C`*Q$rX9$Oi;?y!1)~o!4@8fTy8ABBYj}DJn9}u4S0sXBWK+p<IZq7;Y
zuXpi%_D}i@$~8}<+%B6qMILA53_Et>2Mgkh@>?#d;E1jH7$32w-?u^|i4n@-uG#lP
zZYj;miy`h*g#LUo6!G`2D?u7GZcm(WKUlp4$N9G3-A$-X|91uebq(5w?gzzH4vR8(
z{0oetA^hCH=brT)B)%R3R>NPIX_*5lq$kWi>VPb2o@I@v$=l!n{4qU%`tX(eXSsm&
zW)jE4z0KknFbV64V6aZn_W|iHd|0|HnYF@S3c@r2;N%;e+q(VW<}>w62?9Ki1$-_o
z$aNg+d>FE-)Bd`kJpI^;B>v$v9)={w8a<^vgw&+VxdZ)Qk4zU6vS><TBy{qIq`4^m
z=l#`H{MMoq<v;zH>&XWNcMGg5+%@>d+hDPokSr9_c@A9<1$xHo11bQ*wh}ab#ty|=
ze=&MW1Hm5R?Qza!M6zW8jNR{n);_~|<A5VvgDe<@k3-0znFDV~wbPQ<aTKlmmu9%X
zmEXyV8v4oGZO^j#c5$(0;A)+h^1R2e6T!YVqK4MI%mxeAz{d2$HC|45dpY2M@otA&
zAjtex*snX+@~>~$oVS4O^1qkC9Jw&lTJtAHe&9s;ZL`W`O#08>FD0QbmB(vJ(-44#
zY47H(`gmkL_c{=WbRm4+*W6nDd<CyD<LDFTs5yQ36AorKfZ)O)5agt$?6)$;0lDLL
zguSWIhb++%0Zd|rktt)@)er31b(?KwlSIb&CQ!lTY_Vw(C)k*jf&8>N=(q}R`W@Vd
z%iM++0}@i*_|nB1@R&M-6ftbI4!amKG^rpKk=0~CAr=0=vMn8UL8NyAm65QPqr*5A
zL=zdBfzLrRk204bsZPKWkrXiR0SJ02e?9csd<YR`)U2d!7LgVXfSpG$k3Kk74E^|b
zVDuR5?2+<C8Ehhq3+F*i-pwImI54<zUg`d?*IN7^baxVi+=Q{nAL>o77o$^WzB%l}
zkYEqs*qJmxpf(lVT>6*@b(v+paQmGNs!}hIl)rlWSXBN)l_oORFK-pQ7HQXFp@bo>
zpL4ZNCq)VvG7u|yqz0@A{qKbdxrtnFRd{<~%LUQ0`R&J^n7=7SJy-V4S2<s+qoqbm
zEOMoCytgUKLatCT>%o3sV)1^YfnY^d0}}z?0F!w13+lQaXoSzHo>`3LZ`*f*wG)R9
zhXufE?1rpTi51ZRYwChj#;m^(vC{Z?!?&FxoR!K~SMEGW(`jLPCEPBGD5#`aic4cZ
zean|?UH6>GnJZOE;3TMdh!<p>7%sSd|LigcHkcMaVGe%x%0;PSwZt>>Lm<~6c;|5Q
zvFFb{II8d`pYa?}CFuh9*Tl_Q;V<@0KZfvit|FK9zmm6qZtB1H^`EcAJlS>I*=Idy
zce48nytM%sN`@ODv#^&!PTiu077K7~7f1p+1qZ;NAb3Ch>h%2nJlt~Ub6E4D%X6Un
zDcG(VO~Yg9$2jw=Znd^&BQWYNo+|hD$conI?SKFa!;p6X!DG!GwWik>W0nIKfjX1~
zBGYXWt2+m87jk0rprkQ~USKpkNvDB8udzfulCWsp`79lkf!Fumyb15%2RJtO6q`La
zks|8T02n-XpmVMHN4655std<$w(PJ6Bt+eu|0IO}0iOQ-PL6g{lBXT`sD4;}7R$OQ
z3}e?Nx8GljS3!~Gd1md-#EO0ejhP>n<F^~`z<J{V1W|YC6ETA&`<McrvOToLBv9iH
zg+k|Y*@pv=rq*qbp&xqGPBBP}q4_Z;oiMWy{xFmIB`G;!ViR9!=>;&Uc?gB81yCsG
z7uQu#V!=C@W~W{7)mU~O7ruuKRUyb?^k-ed4qHJ3zy_82Y@#e^+kF?rI={l8Y%-Pd
z!Fl%6(lzM?E@G3xxYy>d9J;(8|JV$n7CTfrALYbKtVEc0%9ZkeZ|8r<9F=J@nT&zl
zrui@SP@zuG^IyRkWIOd|hMHOym^TDg;)Bhl;M-)u=_VxE6@YF$#8a0Y8?WBlG?N7;
z*Zol&i6xlyiHAdLVvxr47IFW)V^O$*Z=c7lEsg8XxIWPz{tO9~On`OkilG;B03a)r
zPTjsUE*W&lzABJ_u&x86^6h#_Fn5{<N*sX_5qL@pLbX!Ur0MxCo&!sBJN&-Wz{IDb
z3w%!$0z6GBq}(U@Asgg3_nN*(+Svjk_y%Nwxb6ire3}CA&Q~)J;Z8n;Ds?3kBSRq-
zr~i~v|H^>y>G$6cb{!=~7Cck$oPQH%n@E->Y<%36^Myr)7GBG`xdVY7RYQ5gb*2v(
z5#r+<gi`G)*Ktk~W{e6BO=xdV%Pj3WmOWt(H(-0Y=UjJyAA2=Slg~j#BOOQ2d=f}i
z{;~tGQSdHMJWU?tu2Fcef}J05BKYys;2V|jw9kHp`z#G_&`|X3DI?Hotoyts9D~Z3
z5FK_F!lBF2Ugw_R8$ojrHU$C+DcSLgR5S>V464DsCaYB-b{N}*dJ~@g2GZqetFLq#
z;(Rp5UpdC^g_}Oa^x9~IslZ+AINcJVo3RU7#T?AQZoEh|=bE{-1HR+k09cd`QqK#q
zpOyp3^B-mkJ-xJqi7A<sN&%t0cK=iu|L2hCu+t(6h63nz4kFpnM(QY3ev{TG;OTw^
zeNhu8_X3qh4%QlAo#n%wPyGFNTQqIH-`dox+lPT4<M25z8a#VKtH_e=!|>)Hp(k_-
zUlU_OmL@8n<1b9IJN1-F^6*ab?BxAcg>m2f7Jz)wa3{Vy2cp&u5t;UX;7=WYMm4GT
z8E~=jef9;KOE=d8g_b1CfOJVTg01kERe&pbEHCyRDg3(3%S03GGH~tm+^)9+O33Gw
z8BnlsQbGDlz&wl3Axx#P#A;4xR%f>HG~r84vd?G2ZKOr&4_D^FjXKY?$fQHisW+8p
z2mMW3<rL+cR#y{ui|_J#3%PSpG$l<`y6D$<&JCY_17Q3QAlGz>HXh`RbY9%uDK%~g
z^ii(ukW=8c0gn01xHbtJ6-lz@f2A`2&H^;TLIj8n(~a_%74Ua4sL22HDZ8_`$mf&e
zJJ+5316Hi~H3Q}&xP7aCg~79Ai9Q*wWw!1OdwHo(n9wP=NeIAr5LCdr-hc4SbuQOK
zd}I7gyQ4SDpb^K3WX1T~WK6UX_n>86H1y70T04Gg5meY7f@*8i&^3_x9sdEsGTg})
z&rTRCe^I{j#y#1qPgc8Z!<}3#Qif`csSW<xxsr-L@yztjF-sttWX7GK&<E^nSRG5{
zq(4BmV0XRXp~QofzC}p?;$ubM%6wn%dm^FjUOjzmc3#50V;O>h;T1HUNK?g$e$gP|
zD^&)R$_MNG-l8M7LK1r+6S?MKL&GyqGy;8SK|FNXp$SLrJYK)DGENRELqM5Z{rE@^
z&@$6Vql{>F_Rv@^6QY)nv@@=+Bc~Oy*?AiUO`ZGYj?w@368rld4qn`^Ru*EV=&;l-
zttrqCn}debMLWbnm<q{ae%aX}$UTyjJ!iT1Vcb?oraet$-ChbwcFD@eY)~tHr~#vh
zWp6Xw4P<;q6{mdh2G?sJ5X96JkXjEr0{@sqbVlK_cJZi3%PA4Hz+mX3dxCKt?;#7{
z-RbwCi7jL1D&6Je^n(1_j{pbh0!8_P-)Sx?ArX6Lw4WMuhgDD_^<=)M3zm5hZqT8F
za;anl?*Cq}^~AND!#H<G+Mlm|n~DtE4L-8r!qYH*;JvHUM(P8+cpiZ+?`R%imb#(B
zt~oR3(sizV%?e3u>ijJ%*tWQSu$_J`JR{`R>Pb#bUGtshUi;l2bDF&oDL6oid_M-4
zBx696=W3R>qSAA)4E!G0Fk9RbVjb@E4Q`~gf!Ze$PI@3Eg8=ZW=L{CiSarap<KE-_
zIrMm~k4tl~CoQTU7eq9PN(ihP*ud8@;3Av!`jL97C3CoqzY+VWPXoi5oTg5S7-6WN
z|8Ho)Hy^~21|y#qm|=0$?+xYfdbC4_;VX2n%w|GCweD&@g&yCYSpM^-t`*?VpL2{M
zFVu}Rd<jWq@qR45&^H<B4<n$&?qC+#FaTDFYWXXn7&WTuC6)Ra%_a_#oRb$0;K9h8
zlWv1UG#JD`Q$Zz(9HU|t9!u|)l7f)lrnq(Dy+Kd0aqeA+1w6WW;b%|76{P@8gbq3r
z%=2cKtqTV-WO209Th~WRGwMMH<cU;I;<6zV;qL7t@W9fH4mQbO7WbiC%K&~_6knqv
zV}M^op68N>-j}&j2Ww5lE?0K!)7g40J#EIK{FOdmF0Fg<Fh*8TcV^m0f#hY)x756#
z_4Lj_K1yTgUWL;Y<8Msh28mEwJ@973mw;eV9a;XseETxzUa(q7tdrnVW#m3e<IT-4
z*0NubJn2dd14_&VWs|4F2B)6A4nt%;yT`r%|4Qvpj0p59o5xhW_+flBZOND`c3&T^
z5n?{gMrdwt64iX7GaSjoF5?|<hL1kicop7bmNkJ!T++{UIcU2(f^4B@8@2YS9<}ZQ
z$k({%1J5@>E-wO5i>`?U>QH4N$%UQA0&w)f0+SX2Ki5ZicOyo#8iy_#kfTLwZJRT2
zy&z;hRv!giSo-^9;kFnDaAkQjc=CRYQ@+OYtBtp53e^hIe*8SNCy9{s&|yo6UMFwi
z&$F=K(GEed9hq0~u8}Ud<jcHF7&QKvMd+_Idn?P?#x0K`jJ-4>>?(5({KI59W_hAa
zAYIQ~%ets!$ILMz9wB1g&p?{D(0=*~RaEEnb${@d^uK&gV+R_b@%UC^KB*K7O6Srv
z+_gm-+Gtg*kzD9_BN}V8wc^kB|46H#?HdttYlz(07^BBRJ>M%dMD@hFCDLAJw&T7d
z<fwy{E|+$JBG@2o?MA8^Vo#OV5K{TJ%Zjw<4TwPEaeFg?jDb7V!%c7Rz{oLU{?AZ}
z6N7p{4H&P1UGOlbIr=wb5E_Svkbhrvrkt$aS|Dz&nS*_O>2PGoChvxPxGJRQ=d3+J
zsJU2@8#S379QELlh*fdAH+#L>lGp|I<WnZK%^7?d`-wiv9V>9JTs*pD6dBrh#<Kc6
zo?U4f$?<&uFz_hl^fqKPbthL;(j)to?>0h<Y2&f9OHW3KK(aHhGl|zt)oQ(gEJC2R
z#DZ_U+;jATh?IX-OZgc}!b@^>I0hGFl<28xUJS4_$=f~`G+kCqJm)OJjTul;wj`jY
z9P|iaL~Y{*`^hxHC}uZQodm9QKQnq}(S%1m0_>++g4*2zN(JXMHSygohyO|7{+-GF
zlh1`vmmOW##`ZUobh=;>NDkB$$SPF;Jj}J!LdkF}YfXlo@NoUr^n?U~lKG0`7DD6X
zh{Gu{zsg${U%Mv@nwiFwcP9(UKO!z<O_%u%el0_N)}m5h;#*1}^U(EqsthiD3E&xc
z9#B(}G4>Er0kaTAq?nGt-TUQc&oA;=QYQ36_d28M>Im^zo`#^LVtgMC%5d+=6xF3O
zt_wJ8t2UyuXn<2JV}nX+JuWi}j2qZH`GMJk>;sWs%ZIbWmU(YOo?j?#vTV90(28%7
z?)0ah`=-_O)msPq;E6!>0)n`D>OPaS2moLoi<9et-$S<jFI>E&n_|cib-5lyHZ%=8
zCOm#0?Ak6NKHXp+04g0%=4h)Bw*JZjv>B`R&~{WM_M440XZjcA@`md`X6K<bRAN1I
zaB9&)*8_pWiF9!-tcHKeY_~wNyQV5*`zTEXm5JS7nY-}#XTtv@c!wbIER;kAMo)Tv
zm8j4pYm}Qx+pKraYlx)ILqWFo6muyE<vy;>2;Bvu={}$~=RvD&>Ee*C)q$ef?v|WK
z@ebHn7`rP0Z6iY%^c&*!RH*7I45V&Wz$lt#e&*m%rm76`LpM113PSyy2%-K|yW+w&
zjK1Wd_teJh*?Nw4vyRRdkae|<uoV$EHL+f2nhc->75vv$T8){vwk)2=`<3EMe?p$m
zceFR!hzIsNg3dmUe~5g4w$E1tvmOA7FZU5VQRVFXLK-`pzde?iOjMepy9GSQT}6=n
z&ptZ)sbinfxqDn3o2Pu^nIrXM`nwTolA=Y8YKG}nZ`g364$~9E1xh7;dw@rm#u+yU
zh?R7*!hKk{CXwDwyw9cgeu{l*_@$2<`r9H1FPDczJrPpV(emfYzjt!wN^JBZj;js>
zNWzEfcqa;d(;q#L^Z)4+{!jh(j1>3F9j=I-2nEyJjt&4NH4+GjmAZ0Mi&O~HL}VN(
zV|O4$RLA2HH;Qp`g-HfL+j@;hzaX$Az7lo~h5y~G;JGAL{}DBU@C8Vc_c$h5WvFA>
zb}FJly!E{~AG3RWTx$%>SdyDORK*|?1^~n0p%=O+d9lX6lQNn2J-2zN3-t~sj#tum
zWh_s8T~7EYV=^jT$_xw!@y%1uh-S%aW9d!2no`&clQO+-G4K$k8?U6qrxCA%cRhJZ
zLe&x9GA>;8NZrLVcnWl7zs!#AcNRSbQ!#$eE^(_cLk`!6;>qfL>KSH)?ntHEH@&ZY
zq@JRqKotD4T#zBY{%yL`%i3?W2AG|5kaxcGe>?X1ZAltO*yxKi0s?jdT@6)J|3l!W
zNr(vDlZKkWfW;a*4$s?~QEa1->)4cP`%l?fMFwDf_%uF5P?CNqlZpsIF0aG9T=6E|
zDaeMb&<YzG0?^*IPIacy&(1Lz$+4Dv&%GW1UO~i**BXw?@D>QtI{PKuzC?Ae3$Lv2
zEf<kSKlx&kbh%+g-?AqMN`9{X+yA|YAzCWRhRR^?C(ijNlJOG|-pZvnZ2E%2m_;oY
z=}03YfQ_#YwE#U=x>!H2K@L+=7+dLd$%UtC#G~yk-+sgNkU=%8yA=G$g*#xAKJ*nP
z2=)CH8H9d=EnYWxDn1%k8jcKnQ8NrP{5*Q)2n6f<vx*TU){U#R%lDiW*uOQqkXFw(
zX=+nN1(jJ~lgtWq=9fZ;t;U9R=>aynl)&}yuokc;hXz|m8W!816t`sIWnQQvS$xa>
zOq>|4z-4%aqvhrHgQA-{(j@K$LsMJJ9;x+U{}<6jyDulO9(_nhN>5PO>rWn>9%+2%
z7<&dUGr{K$AZQKWJpyd>p?jC)i-J#bKRrG}HclK$R#I4gOyE*!`emnd??GL?^>~q9
zb16g?>{fMNgOuzMldVdIaAVZ#o$tZOr{b1BOGsOGnhnX(4;DwrFqxs}PlL30E_f$s
zanybJQfGk3f&||h?>#D7_q;70vr)75cAvRNb4ga{)>)A~P<cq+`J9>4rxY0<lg#O6
z6e-|n3(>bT_1u4#YAQAWl&f+&9@A99vWQFFgN(HWZl<$TrMlS!ktfr6BV6>Ljb>6n
z;3@Bjt0m%NRs_;V;>V(4(97<*$0$J)(|XM4#pTMR=kcp2e*sKcNS`3HNt{LiP4s}4
z9@Kb~GYOXMS3)3p3!@_DAsgS`cWf`eew+PK5M<jlr#R-uArH>Nh@vf9Io@^k9%INw
zN4-jKF`VEn)0hxdeD!&uU5i=j;1{N%i>Umh#+4#Fz(Gx3Tt4?^=tITwLEzECi4NUh
znd`DOc8jB*tez??D^Td^H|gRM^>z}g&ReS>0)QJ~^v!0|sR_@O!EER-FiE+_L)ENB
zMre>Vi)#E0Cn-)OWU+gD*+(IkT}6y6sYD9jtAhpQ#khErzd^8-sFr00!L3iH%r}jP
z8brWfRH&HJ<uqwSIeHn&ef*`L$r%V3%N(E)X#R=;1e@64Xx5?}xGba#43WJWOtx>Z
zZoGZVq8ek{GMDm+N!s0;uBh&eeB!bJf{jl)!=T%z_N1Lap||CZdxTFgK56}e6=U-l
zP<;TkieRVdzY~^!H|hQb|1{Bvxg9vv$(|fTzp}+wg(w8vw<#oDztX$g^V^*TVMLt4
z5`(RWYtf>YHP0(>CYq0CwNgW&K&g3}7%e&ugq&vVJJvCRa%RV}0IcEh3o6%%NZ5ta
zCqwYSAyOT%qL5lT<-Ihd&|cTohCXOE^ngJ|8YcDga4C2rX<5zFp^LI!S+&~yT+4|S
z(R)V8*WppCV3w_nw|W-_LC`rLev-Le7J!jf4%H@JLC$@O*b#l$d3OTzIP*1vjvwJ!
zb!;vpN%Dkz+OJ%dx4mu}uj}(eZ#48BvX6OeGE#<wQXF;E(sc~2h+1h0_zp{frcqo_
zWPzgQ*UGpA7lHY_N<0AP%Fl=InfEk1qH>%L7zA$1`7GsSvRNZFX#GnIy&_eu{EsyN
zMGfJ{av_g)Cxor72i(AQ>sVEGeLYSvo3UG#lOXBwyfj3Zr)x{#MXC$r&w$NXY+3=;
zeb{Nfl1-dV^x04h(~;(lZOaSV%_oi-RFG90*|FNLrXq4gNLL;UTjYexT~{7y!lWy}
zcsi{+_Wa~(X*k2@lEYWde#Jqs=+cW;XV=PnVa6&U)en_8oAr?|YJ`1<O$KB0(B4#V
zURMSdWdj7y^a7!K6SEb#skE8uINX!l#J9xt{`Y^M<`8|v(SGBR5axS1iAg%HP3~`5
z+N2B&I3cCj7ya`xb<dwj{ff^lg5FUkWt=Wf!gWlHQOriAV{-)v2{?z`tiVn!^&*q}
zuP_d7AV^#2SaDcBnM8BE=+r>r=Q_V03#W|yLFXrkP;%GT9GK!Subso#esa*Xbi~o~
zs=fzAmESN`Qn!o3i_ztQH&)7?s9B&4-CPjCI_~~!hFd~P@jKzQ9gfcez_-ek3S}?T
zV}M9(nihFG^6DDf^Q!=pnf~n66TQAB+U6O_>=~sZe2v^yqZf2EQv!Q$x_cD$>*DN>
zZlmVsjV!;Zp4uzwH`{kIr~+oef?uiIFtJk2qsaZB%_SZ{lgpNKpwyR7KKU+brqfhM
zB<}9T0Ejk?zrZ`vhv^0MLh<$5;4NNkKOW>VaZ`l)FVrU`ttpUPz-&6@UbzR2jZ>;#
zZ;?BxFRtoYZ2B)BmT=CUk~`4#oAg$%X>2l-NDCp*W|(leOo_BGzc9w)fNP7Y!S~rZ
zXT+!g{=LlYdsg<w(yHE*sxxWJh6uL2j}c)f)4fYfYh6Z5=}9L6p?+cMv;Mi+Fs03M
zXHp|j@{N65U=vGea?xbz<m<>$2P1Y;63OlHo<jF6u6OePjbt{dBML;L56*s3ZxRNo
zLkQ9mmEF{Nyh{`ts*4Ld!Ks*!jeTm0IBBI2%hv93`4+f7ElHNME~ei!`AFe+Do?N?
za_$s(^|52m3z*^gdl?g*i`OpTIuD=~!!w-^P`ygNZds&{$wZMzB1*ulF1|Des=5yS
zGQu!Zu0V(VL<zLOvVdMQaheSpd!IUeWRz%e5EFPhEoqBiBUPkvO8KI4g7t|RamAG)
zUV|dFlI3Y}v49_6b9#lP*QBiz6$$yU4-Uht(<FUB$i?qrh&;qD=x<(iq!DwtG}yRS
z3uNo$<*R^QThh<|Dna{5Odd~|5=@gcGbd_#;jptlT{=;C75P*&{lYIuPW@SyiKLzR
z6A;G*(9^tmb$v2q&jG9TpUX?qR|!AvQLc+~V{lN%3;`bcx-Vv{%EmC(2`bDN67D$y
z5iiV0J6P?u2lMqQCLZ_<<25GDI|2E7W|;+-dExKpbsH(}_*x}7jHZP<1kd<L;y31;
zy*=j{23u2<%Iid2g8Sq%gYO}yy1&kZs}sna&v|(%V-L{>2#%eI)m(!t&A{kpgiN|F
z)+jFQ-KYPvqC$v*`#R*o^*KGBcvz`Pk#PTk@0e-jq&oS=Y{ve@`E9*K6+%Aq@9yH(
zuW<lz8pyTBND`bP$t%auVotVv8vw16M3F}DPfYr#I6=uktLQ0v)>ZGrIolQ_5K!wk
zNNzQ_SD%1;yL+r1hT_OHm;&L3Knuv1O&{a9muODThfO%GLFIa#{Y;90+30Z5g?vD5
zbhlMh%t|d~rCXCRC!~;@hKe$0@zxZ&j<)K^G<BKgQj)<S8?Muo+NixTqQX4r)9u~+
z^r+_qMEn7*ym0ky62)JpekoLAQAAu9jx&mDup8L_f%LTRoSbI-|LFP?XsX}teH_nm
z<RBb#r8s0JLuD*Nlvx=<A(T{PipYEnQA%dw7&1o4P#HqUkg<#zGe?MIp5wn?yx-67
z^E~VSUCU~T;+*?+-}~PC+ShgMS{!JT+mx*yx`;n*BV4MB^yQ0j@J}YoiW@-JCT6Vo
z#k)ID>nL*!-sLf;pXvWoZ`DV4>Da68a|oMFr`^y+e&9`@>d21_6Z=xUYj2}-hwA$5
zHyTOjYdum=?iP6Y$sZvhku<h1WC|gE#!lZk8Tiuax8DKs2RgKv$2gf&=@AHC7$x|E
z-{c#Z^rsRS6Lj84zbBtL>o2TVRJ%UCIl3}44PGnX$mLu`UUnDpu}p6cq+LpYNvR|5
zbgB7}Iq?g5cWeE_82b-iiwA_vWBtP2k;g&QkQN7#>^)C(&#w0&ESGaT*JXJ%KV;UW
z?lElmJD7Qw^#!cd6h_7))W%7x$R(OaQMY7dpG7LU9zUP=*04Ykyan>p+z7Q|wlYt?
zX{!X^dI4U{Wgq5TvPti95jwQ1!pO3)vv3q`bP!GQuhtP?MwCk6h?C6W6s+iDLn{Dg
zR5iuY&56qPSoy7Gd4W(H9dda+6kW7z=))gK2ENX!Rd%xo9dF^5ocsC{u>T63XRbyy
z5!s}|c7ch{UH*3PnVoPC)RR>g`HvmY&|nrZ6@qTNeYLuMQMIaP`R%8AbDE{|BNb23
zaM5iI+dF^AGrXmVGNp$Tx;}R=eR^!|s#S+d>n-}W+h-+QLxA<~UzLU8FDIaL`AA#0
zLCTvo$=w8<gYYYa;4_9Lb(*o=irt>BL-IiuH_nbpdx6_FH`D#iFU@)*evH?#)KL0}
zSaxboYdKuVgN&_IgI{-)6Wzm_Lk)5ohWf~tuh}^*(>i=-uyvRpZFweVxoZo^H_hmc
z<7MW!Hho#K1Fv+*{0*f1Z@W<V6Ul$f)s@zpN_5{-aZZ1GqDE+5;rpn~la=$vsq?fJ
z0=(ug5(4=(bN{Xw({PkL`m=b{1--m-WKN*AN<(J7gjEEP(f@3K{ZA_Ch%6V!muz8<
zTj4Lu{Wi(Ex)1NSh)3~gN?QrkspF+cXYTJ|+kUP);65W0;041+RiN$Kr8)_Gjt0<3
z3$3@aavsj?Z0+Sza3c{sSp$KpYuZ8fAWWc^h>F%t3N>xEu3}J9?LFfl;~c`!)%C_e
zQi1}-Y6BQXl)$XkNo=jzUu2}&QM&@25F-(p1h>F8uJ}kgtf6_T3T+9EdCe^t9lr?R
z38l_iDY9CPhhc{a<oU1e(B`~v)BaTbVyzWQIDG-8PWMS@W9aeEXcWMvy(GpgkuAys
zmG}>qY{Tvdtq}&RwGu<EpaU+~Zm^G@eG^4lgk>d=T#RZvlh*0rmUsI*d5(3yoM>ro
zvPQ%k+zw4lD2u790`w6h^~BLEJjHT36(d}+!l5k`Gq{M@Y3M-fuf%l-zw9Xb$vIzT
z#p5q)d)$&xJAtLYIhabWBy1Qnjk4HswtrgNJ2hRCFX8vPk^V%yYwEF*33*H!)%U7&
zz_V3nOmY&c2A_ZD-V9i_UiG<yvA$c37biEiBBEN(zyf=8<W8OFb_UQWk-q!H;1RT}
z;m4E&kUr^II7KuZo^dycoc(aW&on02kG9rP#WMIkZ)GZ@i0QdEJW>?ZIo0z=e81g&
zR!yHw%*@|5I{W`bRS#424t~ow4`5%+Vff`RNIXTssL>)|s==SBnPt-~k>GTr@YWN0
z`e_n{uZTEJUmwOR>r|&Y;Td`@n<7=+<JdF33#KE5ySkuNnJwX4NMYl{k?>MBf5EYd
zKT|4$%8&Jcj->eyGS(Zwv{lcRiAv_uKZ|t!p!B&dVRB_HRx3XqwtLqu9Qa!%+8L6Y
z3p)-+sopV#C~L(NU61E!_WGPAlQ;p>>mMLcLB`<E6=4dvCpI5orY#S1UA_bN^esu$
zL))tq*eRj|8S5(cts=(PThJg+ubM(K<UIEB$P(!cnY&aeqvAZuN+OL$oJ_}%Q$L#@
z#}@EiJKLVV$*ck#1I}GiION~3YQ>m%7Rx$4T33|`S&^DvxGIqeD=h8S+sVPPG8II3
z0&RUm{oRj9VTzU~kTnWz&Y498FZdG4LWEVc0~Xo>Zj>BUq~)!Wn(L|mF2Ih;MjCea
zdqxG)&dR`+?cv*0Gl<o_$G?0ZI2}z)P5d~@j?UNDdgMSr-2qdRox0pN%dP{9&k>Qm
z1TNq%JH`u(=-=eHQ(cngsNkzI2cd7}U8|7L1E9NWa~zwU8j0BGU7-254gbF=E{35^
z@@jimNz!FW3dRhr1cBo+PV*i-BTkMn!Y0o$@uzch5F*KQAFrnVyQg`3oDOAZGPuVV
z@cS{W^?s?j9Uk!mD^Zib5q|GxtpH0rKKk?(9*=xgpRXugGd~jk)5{T)tJVbt0lGc*
z4eg{U?1SR<n&0#J4_->qTZwC`TDR!zh@>R#xk;*e-lWoP<zCj++_?DL6CTgm|FC1;
zhYl&wekle2%go=Taz6;hUh{IO^Sb9dn7>eU&;y;oiG$2Uw$oyqOb#+i-m?w%7We~Q
zZ1%PI(;>PreUe>(n!*1q)?DWP_JuR$aJ0}p*a+;w8P=c+I<jjzx#ndqe}ox$G*l=K
z@JozR-l090G@2YMtjVlYj9nRV3)0h=JINE`=4XH2zbRck=Ao=1Qd&6@i^xkj#b2%r
zoGukrm?EJom{a@q(Zw$6>Q!1y^2o&}hao;dlbd%!JcB09?P{v*6hdmyS*-xsoa#&t
zq#VudCr{~{XPmLet`RlGx(*{PvL?-@yT=dp)FE~<D~H`2UPTbe|0KX;ezPXnT|E8M
zG2?z;zPTSgRVm#Hlj3^hUGEgHioY(XH*bN%?)k0n{Qv6_6B$&v=bu<-GnQnVMKrQ)
zXgomdG)n=BLh~mVQV<ArJQ4h4WKCOu${_Bln*Afh40<hjD^~DDh(w0hyME-JD=_u~
zu;DhN3Np#|)a@^^&vY2%b4U%2O5(qAM|wjaC$0QgR{Y5Lk9Qls42!+BV$wAu;)E0c
z(BhS|nyL=ABS!b<JnaUp)S~dul}ga-bYA7&BLY6uhO)D)PqQ|kF1yBFkaDnl3e87Z
z%X??eOI>_V^Y^9SvIADf&|FQQL7#cnAfE#8?!k3%6jx^heXf6e*gg8BH@2uT1B}0Y
z1)$s<r#p-}M$Q0O7m#KVHgrnANkHoT=*uL5Pz#E@AT5VR?wAHRbg}cNbQBcc{Z>A7
zOzLv_kQxj2<LXS0$e^Y2+Y5%T-af(czzp=5Z{k{aO%{%JT;r&SVghwBOPYJv&Dq|9
zdnVQas-oz2+FAuIl6DsKNS0FDwEs=M?_AxV%hrKmXaDs}S=+eQ3bZ2*>sItpx%`lO
znLEYzLu-4O8frO^s%GmvnGkHdl(y2u6~IooI4ae|cZj5ukV(uGy;*hbe-rNiv~H7w
zd~7S8v;Q8e?90yF#rqr#;fKfOZE8i+p<(s_lPBpWLsBsY0v*0fM+wG(R2pSIHP7bc
zQtiH_310B_Y)zE))#NB)?Mnsi*e`TtA~R|HMU}t;NS(MZDt<3={zYu`AS4F%erCk3
znmrJTU^G;TCs||N?qQSa3;9X}?NRMih+pTD>&fqjWt{HxKi4L_u7oDpIFgdt3)^Qe
zLFX>4JNLR^6}{Ga&Bkjub$t?IsaxBwyJm9=$+t_D<k$TpcC@BNG>)$WH?HfT2;rp$
ze)*(Id)<7rk8qsNNWB`>5rZpP#r{Td2I%F|r;a}=<s56e$IE<Mbv!a|(_=T~52OqS
zPICC?U~SE)ect=antE+mBHHF-Zxl<hgsE1yzXx;v=CeI`3ZxkRNoIGmTze3>T6JjF
zTQ7WZiO3V}v?}M1;*|>wvt=}=zhm_L-3@X&mb@;~tc;OzivU%BG6nitg)G9NZP_#r
zcbgW|)Y<M9t-HOrWXbe_+GD-Y)j7q+TT$`QJW|&@t9E|XeQ|>E*EeA}vop7IRF~)s
zixn6pp@K{sTNory5iPkfjTd{YQUPh|#vSHnjSIwOwv+zYc>Xde{r~5)mOo*(yP(Aa
zJQir|9K`S%n#a7LCP3m{CF$Vb<&ySZ@!rWG^L#pzdb%`afS{g15mX=5aAEQeOSo$k
z&4Ek@hbA@}DJTTrCL5RZ-70bj|5{=P4fC$&-{^rcua{Y24PCAGhp7mo*fS4)YA6f?
zS;(oXaR&HRSwBa=N$N;dg<-_pmhT{A`3mp4Rm&cpL8@(gdXL6%ZUK`c#xeP?96<Ua
zV6M%>FLhZ;tKz}|(kguc7GDCZ9Ut+<8m*4X2&2?t%ZLxuUoU3I0H7u%BWb?$0SGZ0
z@{He4=>jKgv;4{LTNqbRru~?mNmsOSJb3Koy~M_sG`_daT|Z47gc@0Y6`U9rN0H?R
zM7&D$+R-4{h0-}jzCVCtK0GPm+Z->9D{PYcQ@#`#+s6h|SqP~`<{6%n(r5*#n~sM7
zH+my)Jj3&z%U`pM_0{`m{0X{W&XiKa92Kz<cgV(g(ykSK{DGLb3Q%XucBE4W#AB7p
zZBqbFKV?yi`H^l&&Y>K`7E5kx*!QRAnz1(d;|(h{+1rC_|1g3heg%GHEa@dpITO^m
zO$k)Pq`3h5g+irUt8x}{=BDPyU##Rcsxssy#~K(;p58u@%#Kq08bEKMJMv?=q-c*m
zA9Rvrrpn-TpovXY(U1Sh%8eLQqB5v>3kC_emNLhoDM37_u=q$QB(9$t9GHLqisbE~
zLj`>gzFoOQR-SHSDuaTdn-87Becc)AHW7@>;Lr^hRibblafxg<C)uo;Tgmh%6^ZA4
z|M#JC-lcMCY?QicT0t6NivsaJezXI-vm`9ov~?x7Y?w}da#Y;o_61BXd14>d=EUci
z@5IvwLR@*)LSfplVyW=2QDrrb2)=_W@`p(!FJdk&A0Wq+a(vKY!qU&A+>n~UnWXSj
zXPGjiFFp)pn{k%PB_3B{8SqU?w4kuynm<j}(WUr}po+`Q8#HMyW0?!!A(@Ax#dDR?
zsCY$wIhQIo_t71J8m<tcvq}u8j*Ta@R)E#ZJ5UBX)k~6`#lM~06I4e?t7K|TO#>s$
z)LE>m#K5Se<<!*|kwk2p*!BstxSD`X6NIx+m$27dkhh`eW%uZQ!MM&^Id5q#%XU3>
zR_ecVU>_0mY`o$6kgxZE(+ZiaAgER_2S^Aw7qw|$-3blq(Lr2aK!9tvsBDajjZt_p
zQa^SMm*<Lar-RT{gkO`Xmfe1!nMXrQ0(-l<?~=>B!iI`D<(iD3ID@|*kO(SaWZjL@
z%4vM$4d8Z0A%3%JqPN_RTVP5m`ba~R?AgSW%g&fjOA(z9&YgBcfKO=xk)$rtls+j7
z>=3Tpl@3d!Oi2K87%wCABacGbbgg<A{+iiw;^i-mo~y9d?A0qev=N3B)%+f--@mUy
ztQEwAe?!DG{pwL_PFCJa+!5~dIyGz#wREZz3O3PMMH~ci=Yl`rq$UW=33WbiL<bG0
ziOb_f;SDp2HXPcoCPfyv-rhM~Hjy;#3<;*3(itra@sj%-E72`y^zX??nQ%I@*07@<
zD!kFZ`HH~IKfhr0#fS=7-r6{y9c9{J{h4~oM5U)I-^?ceEAZjk(97=_R(Ef_?sUT+
ztg8d$Sm~)x=iM#neBMex3F-Kpn|j7@lhNe0<_X%e&zziUM7bL$Gas5@pFd)q%=eo4
z@b-u%ieA*$57(g$cSI90>k_TtX9oYHfbA=G%wTEFzDU>&<#f19QyAdps%GrB6`pDZ
zJ;BkOA~SvfCKx8m^vCeo25;vK!Fd<l6da=)(}uA>4ad0L&(2QQhjuee=W9$qn_ZJP
z46rf;B3JOvGcV95?z}v7q2C(p&C`$F)gUp~0N!9k|I-i0AAAv0lc#z`4xklY?+%i#
zi^LyG$l1ctD8%tw%cW?1dsPoy<Sc))mya7a3V}qXHWd)*pmIF$!VnL~kza*WpF(*K
zjipXB$FPW#&dJ#;EY_HJa?LCJJxui$^?+JYptE<(v%BDCeOk6Xt<GizkO%5+8;;4n
z?Qd`-AVp9>|8N>8GeN+L@Vx$@o$$2U6+e$l3;qwOW8TLn@gYk2+OQRnO%8}6nmNol
z5*ub0Fqybh^Bd?XSewJA$~i{3=FQ{W?bZt!|4>YgS$#9xj9(Kc|EQGqNKqgOaB6|k
zgAxm)uYtfg*#P5hve4XR>*#6{3}yhW!^h!C+aW%E1=g9{m+xv$tXmc^gw)a^?h=)M
z0_jh;`j8+lPho3R#)49+2xnpl_e)pv+ltjV`V(nXK@KD5{C0gwIxb2-ks>JZyKQp4
zs{oeBbawPVeZ~&|8C*7~r}UfLIlTHSJ>9R-wI{d?@%PfIGOhj%e@ERd>{F9C7l{^Y
zhl5}zlnqtQDaUaYgxUG+93oZ;&ud<F|B4LJ<r`A%d5aK!@_-rU-^}K<Ze9wkGlrd2
zxPH`tPtw5@voynlr>*mVx>Y*nKGQr<)dkuK{2y!w?=cpR_9ILtV!yCgVUv&Op+9ux
z-bXgLlf5r8Ip$u(YkSql{lhE>dQF!5_mVxOhtx^r5&fT!YI*rxVbbf~L0&n0-=0*6
z=HFz6(~@%GK$0_*4{sc$<l`70yP!2PtFE|xh|e8*|BO4#=cSKl%Qa!$Rn1l@8SSb^
z=rIajgJT#&jgaG7{DhHS5*PNgs7+`*>C2o9G9V1xg@NRP;Cv@=G#@IgW>wyV$aY5C
zfsz7x-j{MeMmPPQ&jwJj9BuZ;pnezmpRf&(SgpqC$j9Y&YMs@5;&l{!3?e0FVEv;3
z2;=Ue&H%QwUiC1?QM~%)Y@Z;dH_x)7{^bIEI+&Ht=DB>O{(Bow$$)%A7qBPmXPnx@
zIs*=?mvC8zqYe>9$1ILs|563?!zNJ8kM>HWZTh-vBSeinJmwB?-i~}dV*HHp%G84l
zypt^tM=z}MNyH!t#gax9(5YQnbH<J%(eG@e0sfsUQ2bN`X!0&0)t%Avh&F0Tb1$I9
z^72>282r91opLI?*PHwR11$ZX1%nToQxhc8DzqYJ|F3O=JSvPkc{emr(>7!J>8yRx
zP2^@Cfz}_MmMpE3mQ2t|Rem>tW!go<`+<=$m+*HWgGZjcE-_OnKcJJML;<wmlG%aM
z(<j==jj!8+bxm3syT`o3k73Q{$E56&)l&U86HtngP*^pAUiEp*zn(2D6$i~bfSB|0
z)K@(jGweJ00kPX(4g-@QI4!SajPeoCTc0V*zrLmBe9L4T917Jz*g!v*00WZfRbx>W
zCxXuByfd{gWIc6Dr6V28-WV`at3Z`<GSP*Z%k7sskhVgId<pa2KG6(pxHOqgTGh)t
z2rox{B>DlU_5?CnOoV=(dC}pFUP39RJ7u-!IErSdh#VA)(S;$IXC1qZte7gESp&7a
z(@C>&!x{53%%X)Yb_v_!$^>^u1VWz7DuEM9q0^F1g-LZ<<C#ee7A{l4jrGF?-@#PV
zQ-m|S_LfPfO^rNB=wMj!KG=BcB0PmJt_%16yO_m~VB5%K?;PoLBD3=7rANMr5hTf6
zHg@_$F%wjmItZ9|GFB2Fr+Rp#-o_QwQ)yuB9-^-hiCSoZ{z#Y;wLdc!edw@gasxlF
zNj%U?a(~5(TVJhC$MVRx=|Xw#|LKA~s)|G>*m6uSwU~5_BZ#@p6m5JO*8#v%<jn|g
zps#ja)g4BfrVxMz9)7>CcHky0=3MB*98l&_cn4z0W^pBWcluv4U@X<aq2YFXPE=)j
zf~!g4sbtXG(8NYfdECKE$1$qQ^8Ft`O=9w|q{a0}R+5IWs_ifTA_lbXv<`v#xQt%<
z13@s(>9_cOe-^4KHLH|&0hab89qoOSlE(4CvWrll$P)SNJ_e6NS4#tvw&+(P5G5FC
z%Gr@-p%sgo{>tVy#nIZ|?7MS;7^xVFVsF9jttnoGA%E8lPbRhViOFHBnq?Y)dvZVW
zP^uj`|Blo7P~!VT`>X#4(<@ncS-`8|W}NV!^+GP>tKb6yo8Mb7c;*@A(kIQi$9EnP
zj?;wcP|XS3nrJv4zZ;~5qqG;?lrIx08Af0YHGjW%S4-2b``cqsWNCB0ADg&ijKe#s
zFn43@WjlmMKfQERJo*|DV!@`&*t7?FLZ%aI;h(b_KwGDPxc{xtQvC=RPizj>G9JEE
zGB3}FW=Vu4Q3rO0hl3I@q2I_yzO(pD$!!d`ccPm9L^rj{pH`P)PhRNC(*?a09kO37
zK%aB1lV2AHy^k~NKoF5ThIse|`+3v!2tQV8)bF=OMH%1nOs<zoBh8~!0cIh?<T}(6
zm9T+VEog<JFcf%+;ydltOZUWHe1Z{}pSk1LI64aOPa^mZW&O;n+`QNN{rDuxFFH|O
z_Q9It`@@scjg<)T2r@~yQ^`&vDI~Ib{3L&ad9=6doH+BpLo9j3oosi_h(_*pWYWh+
z%B|H%MuaOwYfQ7PYdo2?zm$EGM@)O5)2V8OG=8pBsw*vy3oC^)(Fc8e<UCgn{^Z@D
z`=O2$D4Ol9`5M~eAN}4Lx&8(C0b?CQ`rY=6e$YD{kA;8m0VPH3rYErM>K?x}`6wZ{
zaJhtUz2{n|8+D+d!+hptVA;NEC^Fr9h5zjt;saTO6~!4JR?<pZQbDk$2$CZ4>}=Z1
zd0+N*=>c8Rfmt>rw;RX%@DGwE@jg26l~b12jDAT<ddt=Hkq_0td+Qj4R#-X#heT(D
zKza!q#5dB0nv6pvN{|T)e=oMvwlBo{WP+GOCrgKW;+^k11C<SbZxvoLmS~I5PsvKw
z`Xs4CxK#857<I6_IHtoX&<}L%(l;n|oN26lIHZai<vM_8?g)ITB!_LQG6^1~#wVvw
zD!zr4zinegxtH6^Qvwof@3tRhLgjZbkRQE%9#6V)vn3o>eN2_OQ^O{A^9SK6XX0s}
z3NY2ck^I5LcF0~%`*Zw8bfh<GvkW8NZ$cxL)cJy>J7PWN{?UwO@p(#6Tk<jZ+`+b(
z0>jBxQ4-OgLwWYhvBz`f`sL0nP(^)bG*d<{4(tth4Ye+_{~F$SW7>v>*;{fzG>jVp
zB`U+81*<>E79gslGFHx#ZJN^GHrn7Zb6j}h9+&5F+G+#&Daj1bIt^T&>CsMf`^CSa
zv%B-x5{6%)C+P6Fj7h~Fc-MD)j1KL+YoF7=WRyDlPiBGpg!Xky;rdi@dwTC46CoKC
zqh>7_XjpVPWXOvA2PM4E-QPRlsMp~Ax5r!gmHX@k4NPy3eC)^AntPoNNuEGAea9Tp
zNVMIP$fGHw{&pYr7iT4tqa)bQ(jz|t{Tv993aoU9HvzJnpQ1Ig%6#Y3Pe0A+<Audq
zr*G5M_Cf1hHmrJ{v))3zN<jYI0DYUb-}ldSw#_5XTv#-UM~wD!tHR+UANC!Ta0aN1
zVWTUzv6V}@BM~!Gi(bfH#o^z<=~4|2V)^H>`tYI&T36h3q^S0fn~MExf!Dx~i_E+S
z_m|^3%IS}T`qQ2OynNXi6NNquHL3cCeR3_%5IxIz^^K(hohA4u<&;);BR^E0i((4J
z?nLi>q0I;vexc<bQ8Z)dfy4r+vZJf~$XW84a^jHpkfQD*EZ2Vg`YLeLe9()UppM2<
z9qakMk<qfaq)d#gWOTl=O%NFq+P6v}(t-*=R-vyojadJ}h$yqST0h|pJ=s8;ts&ds
zKOZ<n^fLQa4x`EHu49peB56w=^4feZffq$$G6A*r+Fghz1KB{Gc&p;Q7*dV)ogE^b
z1!UNDvA}w7^D)BaLtAq7{%<Ql1Tz(H#Meo`ZnzuI$u6Q&v={w@Xb?3zUnVj2LaZ!5
znxt%<(N|^$@umT(KMNRvtR)YFqm=csk?`Kq$|G?@>d621n)hCO%0;4AwzkWH!VMti
zvc4BCI*+^*c@$!Q^yvyFj$31hSqkN}Krs-Suki1;+Q|C&c<$r{L+3`iq<l}Wj}Qzf
zJqC)#qW1sMfBqcBnaLF71$&h|Vht&{CAjzAlhCnof=NzB53!iG&%AqfFMAB$aW(lH
zMMe{&^4g)5-;r&kvb4XANDRQV%jv@Jr{vte4{P)07dZEJ*WP>G+hzx#BP<I;l^6)?
zRKIBPBuWiD2(d|ioI+IGU_3DPI3ry|FfH-&_Lu2>%?Fkl#k0otPWH}RIpey&<;8*o
z<P~Eb;Su<I9O(1-pP$c#+I-@C%q&*c{evO@M|)CQar?2o-RD6Oo5&!wUI#ITK7V(|
zI@g=vfBwqs`7Ncr&Mnq-Zwslnl2yYGc1bLl*!wjC$gjCq(rftNzlJ0RJ<Dp){la@S
z!dqcmq0O?#^~nCSqd_}?$5zML@iy@xHPl5^hIm+i(12A^%FKq`$oT$Q!f=vo6*Uc2
z;@Em$5qX172i-^Lvv>@25qW+6nuL)B`9B}8muuXv31|EotKC~cSBADEYDV`uce&cE
zH6F1S!VjsgWk*Oor`=N9dnch9^4?lT259|AHITK5?Vr~FzB`ehv-bgn-3WEkICJeT
zrf=-zR6fD_d%<X&qBmintq+&D6S9rV_aoyNiHvU9NgRoLjk{2zwtt=up{pa~n(xVx
z$q}0N_fRE5MPm7q^FM<6WO=BqIuj&HDZeRQRo{DwBc$Mf=Hc^20HsNc%x)fcl>%|P
z{F>kMeX3)S9Es=P?G>qpF&j#cl=C8e7m$~WLBqa9i5;@}AL{{mHj3B|ovGmfFy!-t
z=x6xJs{&z(gJ_or^ajz$jM?A0<soR5<O67p5L-quMl+nDWfet<Y8TV^*G^y3TcrDE
zZBdKJFK*DL`$lIZVlAtRa{sIH0C=^K{`d38zh)3k(6WQx{)k>OlfrI+=&zfM(K~;k
zPj<>_>N3L6rH?yz?;9U7)7`MF>S4Hi4(9Rz!<*6DR-IV;i0oi%POTGrKNDCm`G2nW
zf2VUI5g?NbAc*J0BjL(I39|RY)Dj=}A1YP=wLZ$k+9751jNS0a{(6_viQb=2sYNsc
z`~tD8*Gk1gHV2^=k3#q)U=&Ps(*RVFMy0?4O%1{SJ%PVW2j*9LHJ|G?m3*?wJWyZn
z?Z3lQJR^~DRKwr><|<Y2+I2NwS@*wQ$N^khiEpm|`izao`0Q9p9J*n#r0$=-my$`M
zm!E^B>>8z{cZE6Ygbt$98F&O8t+2v)Xt73Lw=Lyv;-efYMyg`S{wBb;I_i{V6Jt4y
zY00OUUy|+pQPa&7;?vlNjnB}UlMArsx86$_=z|zY{%+j*bGGs$xh&s2#U`e}bMHjb
z!HHD5h3li<Up*B%QKR;QgO4S2NQo_QYLoyPJB~?uJAe*Z4z(IA1(9K6L)ZzEB*t_<
z?e5nm!*?pYfr~2y{>+g&>1w5wF)Ve#2hR8W5FeVo{}W9ZI>F<T^hJI0`2HGmg-V0x
zP-6Zzj~dX5Z?^n%{}<w5WtiJs_s}@_t>Z6!c5Vu0-|UY)cfo1_{%Zk8vUlAzco~O6
zzB1Z(wS%<QB(Gxi8e&TGW`<T~3TO1e=A~YqabMR;Ip0~t8*V-k;iwh~2DvE5`E*Wt
zq@DfR5+=`#CV&BU(tD?fuP7nJS`tn|L#%wP?gOKQbRzE$^GWLcSK5cPZCUS?@8=3|
zX@U>hSPi?uQfq=Zyuw;PpLpFFTvh9!>lbo&5trZn+<&*$VxZwQ>^%7f1iQ!;3_9eF
z71?r8cs(n0#vXDY6P~3UGYR9CchJ5S7=65%fyDDd)U^5%I_~p7E;96@$DL|Fo1dLk
z!PD|e|Njj(_-7PSo~6_mTBe_!ZQk2ja1i+x`424n7?gsxdK%;$+n;vVc8!adf%Xvv
zFq|4-DaOM!R7JeWRLL%L1N^{>X|9wXI>L`CyUbw|uNZdp&_f<QbZq4rWIyi(tu)NY
zN0x4TH-CrYm+;zY89ZnAOgyW&v8#mWD(>d!f5NK#99jdhOLUxUqDS^Qk#mG=vHcTo
zdn51WgP+jP3<U|ob=;yV3t|(205rno^NM%DbHA*4qbv}hzt-vis)>>xAxrrOnVBHO
zKYt}^OBoO*`~4U-YSX>p+M1~6-7|GD6-bQ5;}}GxV^b>ttH3+XQ|N*rOh&uK^}XA*
z7BUoyg^_~26cY9f&TRKe6zi9Q!;TNb;dNP|7&q1ro%89c0BH!AFvm!Jm`(Ph3AfHG
zy@+(%ff`R`MmyG-1_!DEd<TrajRFpse(d9V=o33_O<&i%mp{(3_b$y+l$g*4IMj%{
z3zQeNp?5YW+21#QdbD&K@iT)`?PHs!#b4M&Z1nd1Nrz^xg`n7JdC7zKB^~xW@^Fig
z|8pUslMZO<J^uYoZ9c0o0NHo*>?m8w_fr_UTq8ft6WgxmZZ%-*LskHi2N;kB;gP(P
zVec5y0}6kY<y~+fxp(LYoDT=>bM*-oE$k%4GaO}2KkW_~E_!1$5@hrb(j4OpW)TtI
zya`sFmcaEaL_#SOBnS;)Ug|KV6Z`1_<dlo5jffvysJtO!{udo)@xMgbMO7mI$(Ws*
zo3Y;ZUwA*|M^!*u795I#p%nhw>y$R?d-e<9lZK&rzXwWnLFI4)kqH3JL<P|3bO7OW
zKo)OkcdtsJv9jHzU74j(U8F)URLmkh067fi8%3*A^rBBZz~_?3kk_c{{bi^6^BCFh
zV7K$^u$bcC(`5dA;B`c)sM~iCHh~aaQEyUS=byg;u1JoG^Ah{P>kFbdJ0fDLxpyn>
zrE|!lVogvm@^9prqRV^xmH?o1sRQp6@j|GhIb%0uj5KJuvlq^`WrUr$)C4ZmaiLF}
zhC(2`s>3ElsGDSWQ-?KXSH$mjD9M3FD&>5DCqr_mP>pf%ba768h9hQ4D*tyox7jt?
zSH%zO&s0vPoZSid)d!Abx4y)Rfe8Ooq$VPun}G5~NbmHC>wkb>ja%;lV^?kgt5Eq$
zj1pMym?BAQcfRrddqG27wpaYWc?<G6(Hm33v@sGwE4S{$nG~wYRx*0#e*d>C0}l{`
zPVkhrglx7DI20?xu!~gysa_iDgoY*}LK@?nZD+Mdyk2y*Ww}AnpfL13+L{b3b;6FE
zzTE(!O9YUqC{W@xg<WFfzu*5ZEJ91B@x9=ct<~T9(pQnm(1@qXWdFG#c6yXn*0FV?
zW8oyXPG3+aK5q!*Y`#o=H<@({CTTXpJmr{?j>&H$<~5HSVKNJ9FiBJ*50sIM2uuyZ
z2$3a4fH8Dn0jtde3?_D!D|PW4ByVVjP|ZD;Nj@a}Ql>V@{m%-x0;-vK>+i@Nnfnb4
zv!>y#c1GA4dE6qjJC6vKK|fi)DG*S+t0Un{0RJ+ZxIk~c8RBqwaij@Af)NC_Us+&4
zvH6U{X>Py>!OszC(Hr;nPteT45S{@&A+1&dB%tA-%74vhhFEgwwJifpxB=WZjw|c;
zs#ISzfkmDGEJLL)P%A;E5p@2+c?tsWVLEZU{-;M*N+@D$W!G&o;;)s|N9EE=kUaJ9
zm6f}4XArU8K}L>&1oE@K3(nRw-wY1{<2&404YGK|UKMeQwDr+N99@3z+%{E32~#K;
z4UnRN(oMjW?^SOqI7!$V<8?1w{zpEh6ya0(k^pw6Q4E3Y#~OE+DYM>guKu>3{RxXL
zi1>24{eH}98w&Tq(YJnW<oxCHH_}93!=Un(-hH(@nA5eE2?gECmwxO>KxO{;12M;0
z;H<vc(k$4DM1TZQa%Od;4ic0maOM`4y8DNxEfCgR8#mSmo>b3@p&%V7f)4$=&oG7h
zPZKxBR^ypHolib}QvLc{+4WM^JcnD>HR`Z+=R-`|u7^a%1E<UWq;1{iI5<4em#m4E
z^O~`&gUyCAz-W`6+(KgAXu>MzLIQ^XAVGQ;!~gDG0$@8z1l!TtpZ}BEGd}ZC9Vacu
z_*af<@ILXSj(GFbL2-b;>p-ngjLAczE7G9hiv~GRnZO!YPDUda6L|l%rFbLi&r|f_
z#$cL&m{FExiwr79@MwZLdMVfFDJeRI@-yJ@<5bL2L1RS#xq}Y4HB4->{jJth%+d=O
z0%NX4XlEZ*mQ#7P3KN<b2SdKfrJ0=m3?bm#)-ps!zW#Hy891#pNze++cr9Gq#Ym-(
zr#y)BzDbNzL9Xx7bMtXTaCO#G6E1c@0wnZ;M7px|AGlM!Z4UJ@8S%S5x~wvEL~(o~
z1U*(rtBR)`M(?L8mDr$Wzb=;8|E&~oj%W?mFUU<P-*(gOIcsYb0R>~DBkS3{&t*C6
z^=eCJN0()#BN7!{9l@L8jd!nTmA7Wz8TDu(x*%v7k|#hoy!NZ-tq31&=5?Um3BVRE
z(;!G)0miaTIW2FV!{X5bm@x|1dBRP%96e9pvo!vfO3rr@-O*Z?`YwWjk|j7Ssur&S
zD=J;BC~*3lh*Z^Ya!zhtWCpY&@H1|IKQ}b_Zvmr%BZXV~)`8#{8T#Hz&yOQUiNb#w
z2=ox|%qC)kN_)~&8Fap$Cc$)vUG1VwdlxX3pnxi+59(y&0`W=<^N{v_LG;w;@90zZ
z*C(5A?(qGlWEa`Le38Ag_W>kw5AA9fUU$Rcml|)yoHg=!K7&3XT`_=C^|=gm>BoT`
zrI!^y`|4w@YddR@Ybaau9HYM@7>}JS9&&p#^TCK^uCM6QNVlJR4;VGu??QEi7FOJg
zFb;J`)W&}>Vrk&%so(3#%iRfHog$G;g+92YKMAXYN=2A@tXGV@_lksXfXTV{h!ZMQ
zKE%=n{&IW1rgXdV9`I$&CwG}I7HBy}&nH9@QN+=G&eiT(mH1`zdcKDDfC*nrA9qw^
zdMd7(B<Gn1-Ylpvv6>sG!|}kKV$zoZYDoRJ4?7>8y$^UPPpFkW%uDG}eeQbSL*)Hq
z5IjWjjG@iG_ws)#WBDqK6vd<1!Z&6Y#>&P(@T(y`I)%*aYe4#M%WKea6X+p$)IvPJ
z1-6OG$}`j0d6Y!SNQ+;@%ruHK8bZKTq;ZEZ5fM|`Q|_xaQCF)}2Rif3FGMl_AUtRZ
z16e(<o;m%`k<CX-KaU&_()jHTQ@PUNisFwCk+WdP0svJYpc}3kyK++&76mS{9&UFc
z^$46UJ@4uaj#`zhTG{Rj#^AJglbugmJ#4!#i!RbPMEXp7-aCKh=_21^W8}T%w}I=-
zwxVTxeZMw9i*N#&_!TK=AiODx>aRf(k0<rHQ~U099Lc$sMb+|P#r|Y-36Dw={i3mt
zx^Y6CnuIyT`SCw##QrBEtR$VZQPG!u_&<8EK7J-p_AC2MWKb<zW};g*;_nhN;Q?<5
zac&f>YS`XbvV;WaNY~3>79|JY-|f6;q5~s&)4t|`z}sMRWhTbi6A^9kpTy~PkY>HT
zyn~F3TJDFi@znp+W-tssi2<$X$9oJzw5}p)lqSXji>+t>l&4+Cp|62gT%?eZPK5+g
zA;ZWVhCIIu3=(LutC<N8<K9S&aYj;eFzz(N{wZ5!j6QG8i`Sq?b8Ee3Y0kjHd1pI@
z&JG0vt|<6jM>$50Yy(R!jp^W|-v~S+OPFDLFQLG^<{;80)C0xy-#0Lc;uz2xH(pER
zTbe?g6cB&Ig!of<9Oeq8#_J8gEaDdtTKhlgBW@h+DkCsX+OU^e{%4yVpjg);PA`{D
z{7lVU8smxib0P9F2f;=T#v1s4vUC$TayJh=uB4AZ6SCkmgsO5i(u3xqozqS;bztTH
z`fD1LpsJ91Ml?Z7Q(Y^CA@k2hY|`o(0k_Y<vI@c7f63o=o>1A-`SN960QL@$l|%fz
zVA4#M?xq{KIhxx0-gp6)V~qC?4S`P^DlA+!bnyq=j#uLSYzgkOYjR-z=^JVDteQ@R
zn-SPAw0tyt18w=Y=~YyT<2HB7@V0js5>*fz7KHAG8HMhKL6*);*PBT}#FVZP@qml1
z$wFBtnO3+_bU))UfA{wJL!%=~Lxz_uG=JUxdRv59I06`(BAW~yFCm?LA#DBA63P4>
znAUs%q4wb2&w17r0V%m5TC~?w{7>k@;e+Ui83!u0r{hU`8}32i_+cH5;j@rJ4IVEk
zBMyFMGx_*1Dl^GT=EuWfFquW}C9c6W=&e)fu;T0~5p}0QR-rw8KbMSxOfGaWR6&g!
z(bNw#;kw_2wuWH4EB2nH!-0dFS;tkHtRoh9@w(+uGIaot;@$nzrSyGvz6aE_sc%=<
zsu?}jEP=H%GC?IuncLE0e9D|>ZjXB)CR|`Y#mlW0c{y`orGV&J_EEQ=W)Y>0X?tJK
z@Jz@bO{qnE7xqT-DQA)|t**k+d9WQqETHXjSoviNgsbAnTNmK2;!|k)w6h{I;TS8T
zK;2DZXiZdYyH8wM03++j)DW4b-zO_g$sc2{UO0FE#-D0D`HaLf1vp?GA1G*EV9l?S
zD9Ch29iq1fN%Q)VOe<JiTPv!!`>0sM6Zot2=#Tp=sloqp0lw~EsL;tmtt&pKR^E<@
zp2D-TNd!T+z!x0ZJx0!jHBATd3vS0B>K%v$7dpiN6^a5d;^r=F<U%UJ2^=z^%Ky7)
zH<*oD>0EK;iLX0qD_?)?!Lu`<7M)HsM$2KV<-b|b5<)M<iKP`dQcBUjlFdqij%%Uo
zf(3W9!_#4}t*=+g{oF^|mKsjHkQ&+r7)%|>M*2V5^qpFf#DwCM92xAXOMoZ+R9L)p
z*XOzZhxk*EH0RJc@3SM~^#~Gp$IM2072IL;K*nr!e-Irl%Kq=^T!*$TsK*T{Zs#Yy
z1|J<AwJTDShgc;vaF^jgJhLJqZYGGg=$ANg!8fNkPQewIiHsl)S-Y8l8Kv(iuQss<
z<Nydkws^yXs3}N-R1x{Q=tk%Tn#SzotMQ0pv~;mau#++xW?6`MZmmYa%_!3?jSw|q
zFjlZ>{k0>u^6*G55T~WFl9CqzJO9FLx<4NT45<M}xY&T&$IDc3m(j}%kpo;5-{$kh
z;L*@=%S$Gi(F%1?<9_?F^<`N3F40Qp&f=-VpmO8sS^zz>8e;UhBj>s46!rKGTrsL|
zCphQv$(92p4$78_L|^u92hJgW`+IDhT(Zw!PP^sT@`UM*j7uGhAfRYedgUFT0&VI$
zAFlfG5v|o1N5$1Z?G^E`+MGd)1*OnH%EB%ydePLTF#SHGi0D(c--cr!C~El?V1n0(
zgeA-H<dvh&4{qe=`OG|T<R+ia!*zc?eLSmBheGS}X8R@fT!PqWStk@&1oU~tK&&Ot
z6MN<6am1=It>qIGh=^ZXRGzqr8FkP~AiutKnR{-}DDz8IH=S=31*hwXjP|c<Wxldy
zb*CYjhU1}Yg`jc~g$=KkKX<G4g?u)th(jtQ$^#b(e-W!^Jc<sKH1Z=xC;tH)#-I31
zpD_|<4R~~qCwT8)sL-oS^pf(j3>KhLD9dsuyal!)MErX9j$iKYvhI6|35m+TtJmK*
z`k&EVbp;;k1i0a73;cwcXpumL5u5%4dX>nxm78jzGU_jk6Tc&Dvb#rfi2Uo2D)tw+
zV_W1g5vp<7pujrZ>GO)-#&*=HTo*A2u<t*o|BD&~uex_3q-#=m&wC-BMmjVsXca8y
zOO&wkW*w_e5CMiS*k&Gh+w3bZ<3dZ>QRLHfT?OZZCZJp0bT<-dM@Am?a+NSXobKSx
zss*76Rk!-pvdcMHh;`AkdyiGftXVOt^MCSdXd;kF8c`33B;-=jWCh>|@%-7_RE;#U
zvk>LGHqlvE3nWC-4V6jkd3^c$={L*<AHu}rH{xnMBrNPX285323<1HeehCv+r@#v{
zn+rec1v8c1<#W<^>-={VRR+O3BrQAtpyqoTjUI_gHaX~3m=eza;Gd@$K6X2J521=g
z5UPp%g7$yn%!rE5jXY^-WV<KlqX`nlV83y`Jj8o7Xz4|=*Yjw8GW4&y3>q*MX!PwI
z*CM&n4ECBS1Ro7nIc&RGjVi6pE$83;;-jr9&Sd!3{|HUNHeK3MCRgtZ(CYF6k`MBZ
ztoYo?faLWQOpQ(JA(HtXHEHmt3On@!2$iQlw2#}Sv3P-e^9`dF&y~f0Jd;&_7f8ec
z$2=o#`Oa#vHrR7+KxLHXx-fQ8L*?z`&Y4w}NrCa6W6v#Qr#l>8wQg?*k2Mr&JMe~T
zeXC#da<ans*I)->Sd_r42*W`6TQ^-QtHfa!EFU_k)T#2S3vLQ74l!fvP-cx6;BkF5
z7yOjwiT*L6SE=q^;1G~VU#^y+nUOXQ6z5XX>_c~72l7v1>Uu+##xgQ{u{8sX{u`&;
z-rZKdcKR%<)Jes1aR9>W%UDic;{t}<t?v6jo{!>2#`~GqoWb93H1?AXH*yO66Cv)u
zVpn|lZx1sZ=FNN=+u@Gy-0QS#gCU>^6(_sD@Zzg@ZLdq9R%Z(bfk=A+J>~-9gitGJ
zzl9I7f)vD8^C?sV>R_5b?J817!+%Kw-#O;?loDgymiVvTic@?9uAcf85RCZlcEWhw
z<yU})Wd}ha+60+snm^O#L4rQH(kYnj;_t|Q%7m1cEix<9LO!k#v4OiUpd?<>Eu5_=
ztMbP(yVX&=Z0xDvI4~H+qF-p9!sHSL;_N#?l#;78{hicv1J<SU!%LQ17jc6J&-Y}!
zTo~`-!b+fQ2RuqvDZanXuoI~jwU_xuI-Qo^T}WOl2Emdc$Q}G@PAe+i&vnPi2*7}y
zaCz35)Y1o&JWt%0KM=o?&-ow-q8ml^ABgw-Yjr?cYVY;PktCw->TukuKA$5PcJB)t
z_B{B7<u~;49j8jGt?uVGAzrxybSqRnn{GLP0Tfvl9Q{=?9c&NVH(<WY9{kiKE;dKj
zgGi{HtTlQbR<hZk&KXXflY?fk`&^+jaz?<94%!p9`TFSosoeNDi6Y+#hxWdA43#BO
zdh!yxyLD+_R8m3nivN4vH#mU}xU!znNP|>gz5k1%^wmuiZ!s&z%6{`)yuBltqEWA5
z%|-39=Vu69#S)YGbj~37J8TjRQrQ$8#{_VgQPw;TxhEj!vUzbK)F*AcstP874SOZt
zfu|m4BcTxLFG`V-(}ci=?-wEkz+N1&K+^ct3V2qq1j<A_kp+gC^UjW19pj~?sD-g#
zuV~4<+g~#9WR^pbNma|0u?{|sXknSMd07Ka=&4<`<eXqef5)DjF{eMCp3Rc=G5@je
z)M3sP81vW^JGKn0aUENQXkvqD20t~kX`l*5@qr|X!7Ha}BF?x_CmipI7_&dEgYzP?
z&9VTjJKpq=_Z@|b1AgMHFa-U{KNjp$KXptQPNB9d#8!N+z~nNDuRr&oenU_$F?5IB
z{J7tWRKbap{5_J?Wa1;V|Aj?$;@6+iLnzyakIwU`EPcz=PD*3c?CS;v=7ovq)Q`P}
z6V6j@3cwYN);p(jljCm}AdKl9GE)v91f@*;(TBw4E((W;%XONM5H3mwPiO?$Cw(@p
z$Eoa7o%f3edB7UmPEo&iN^Qa?GSB4d(L+HG2s(DJc0l#dX>#S4vo7oDKEnt4s)t0>
zcxOrIz4AWWTjV!$&3l9O%EimuEYGumIe2|y-ikJd3aP6K{b16@Wg2Pwu?t7r^!Sqp
z!hJ^$w|Ybn(;+=pYYV?m?3#poPP^>5@N*S1O7k#&#8mX9oTpody8SrbFSXNim6gyP
z(=DE{eEl%Pv*&D(u^Y@OVKC*+F0YcjHdQHnh=L4s%-s$43+76JKGUZON#7fPsN=Pr
zex7qzL|_O!VEx+3fr0<DVR3i@W_!(hw|JB8@r|+v)eZ5;>Jm?*`Y@iMn~iES6Gj&-
zOtBJi`J)l{iSH)?ERGa5DU-_3wWt5=^k>KfaXZR>j3dkZw9=rIWAV$cGeb9fM={O+
zc^;AryB;C9wa9JR)N|VJ@+Nt3A`L?r(#2&Q7-=TezT12Ag{A*7!e~<`Sl~Oe8^=Oo
zc3X&)^>X0xhp}%K9dd=6sy*yy&W&e>$C^ai{z$A&jMc<Ej6om6lo*jOEO8EWW&&_3
zFc>L2zx@YBu3bD1jX#1h488|z1HNydA2%wn2%xVu`@H~bc{t$fRZ6Xhn-;<r_tG44
zp%w~f#np;|*G)#MiWRty>wK^=x<rF0V#J)^c#--KXz^dQ{`Q&!zIB>1MYHAn>X~#v
zR109zIoB(hAg53_rmUW2;x$UtE9nP6A(zNC`FIuF%;MThmc~Zth(~FY2Fr-SNnJk+
zz+TJsLF~yr7Jv%E0tl~Z`gdjkutC&EE^zH%z*QV0xBZ%ju2G6tuz!TGqz~0%#JEI6
zb_VQa0R<W@Ex^2FEK@_rn%*6;&?d)Ii`w{4<Cm$SH2IR%$rE!l69pcfM`3VUY^2By
zIm-ko%@<BaVC3-ubizC!|1fWXlvog27mg=Wsv7OmCSanYmZEb8qe-u8hqChY2nY4S
zj!TC4GxV#wNpoJ6J9emPf7z19tn!4D@ET?CPBXYX_v*7&xPy=RdXd_WDahv3bI5G~
zzdI6c&+4yOW_FmrXbw)*&3=I(4IV85A6!b-iqfuL3gZ!)u><zF93udlVNyzCfl5&F
z%EMlf$GS914Ir~}61oXtQU~0_>U(sbWR2jAz|)mWjM11KvO1v~9t+A>u8F)sEjM%V
zZ0^nuQbnFLFyfuRhrwd$Fr>cpjG@Wx`KW^_9G#_TfAlG_pb*k_l5y@6oYqJ54z2Q4
ziv|UiAE&!QWKjzhLq8h4i}q6pRXTKpZPeUV<DxvheBt3Kqv5Kus|G#ND^mK-zZTxS
zpW}4ux%p>!XzihM*RID|0jAFt26}eSeudFtTSZ7%qhetgK@LxC$dMo)m%tTrMPg><
z%M%zZPCgm$!`?yNc1+Rd00|0>e8mTtcjzWl`w5ZJ00XM2QeBVLq9hZcRHkM{k9j|K
zjmqr_5nfP-FOM;FOMcq;8ezBd(@qPl-D;JwnJt;2!Ez>^a5r$-d2fARdp}a|HCrEU
zP`MbjRmnV>`h95igSCsqrZW_Myifse;2$VE6AXDc6(wOluknPnvU0*}QN|?GR$D8A
zL)}<_DLQ!3Jl=I7+V+pf<XM(xr|qsAfA!7d=iG<4c(h%s8JP9oeE9HzW_GFlh1Z$V
zo{Bl=@d>?hDYviIrn+}kgJ&D7*YjK|X2QIbwtncYeYl76mF*(TD*F)c;S0BLBvlJv
zlVA6;sOwD>mkbL~(NDhE7-RY5io1Ng*{gDt+D^)T2Xs*>`IP2*MNj4(^xihwYI|>X
zMr59w;D7FOQ}xYkg_d67ldaT}8^#}$Kfe-)I@sLL#1uT{<=DsE<-{3UY2#jdg3SSE
zvy!-0Zq>dj_0Zk@<2SCl^}bfG)yl1RDWcK8g3=@X2CL52{Jd~tq;sGp40#ThFA!;G
zjv7AXh6nQRlaVJ!S&Eb_#w=IA@l&|75J`9E$y4!--^Jcr#k4|=O!$;Qx#d^Ep;v4c
zrssHNr&UjW;87jxoS|FX_+lkj<XXy{R2Y)Z9cNomTg$#(#V({ML-*K?NXixp5TkT(
zsOC<(f$<5}dDXWYt-IT;jax%&mCDJ#!1G+y*5%`OeVWUkc^gw@bdE8I6xceYvWovw
z)8l%|-N~85D00I4r<GSm`=g&um!r(IUDqY=X%#8npvV!)|0SnV^2l>ouJ!V}FS^E`
zY&Nn}Brf)H`!@BR6sSM9yU}_+-yI$F>CaZB-Ehx|FWVcf-d+V7QxB5120Vv@Ev4gy
zd)$Ej&2!~5@1R9vK1=0%fcnc+YFdvG(un-5LeDXVB*TFpjQ1^@^DmxHTCe35uFz;d
z_1?~{Gd>(+Sg(_7$>hSeGatzNY3t)B_mS-@uTG(Q`*QhuP1AQ*`v}CsiIRcu`s@rO
zNx!bHav|Ga9F6wqCO+W%ciEBkzs87NzDxJOFS88L1LmrjZDxgdOMRadx64A7Or!B)
zb{jt0<;KDCZI_EN>?c{@zWVTp;WzzkpMRM(eQc6Xa_f5iIj_8nD>a6z{GUDpF5_B7
z6>j+SUe{`tw&%So9K7sJPi}G=({n^@GdJ5OK=HO!dCWY@kmnb#_abjZO!+wXRwc`;
zq#L^{H|)@CRX;y%H-9?kXd`r!N!?nxP5P5^iA*-SZqY!$_>PMZ>oq5ljgFYgGfym=
zqVGElGci3{R3~I?onI|@A)8-oq!;r{R4dGURmP>X*(BWM)7D@>^S3(LS!!E*Khm#H
zvCe&kAM{OTdvbl&{C}@guD|;+CA7CGIY-Eg`Tr|cm{VXj4_Tl4^?@h5V_>~ba>&Kd
z!}0HPJCB=#y3cKQ3&=b!k`FxaBXf5<=pv0mGovim6tXuDxu+;8$FqD|{~YlA*?}gJ
zza1jn0KT~`$__8d24B_?`n>Xr){}4JBRsV67dulgRs*aSd5xH(Lza}CKg}|mr|d4?
z*lFg(GB!I|7GqeIgEAVU-;OX$HW=h>#dvSTjKA1jf58$(9bBzd@Xb(2?5O#x6if9~
zMRSkctHotql{^<GUKOT)=CptJ@nd{#hk4)0X#eNw=1bT8AI^ELeH5t^s<Z8T2!e4Q
z5SvwZ{5e0)L94nm+qz>jymkV!LVA9lR5SK?X9Tg(U3Rxcg7pAN(rw!iK)(;*<4`C$
z@v2#^|L@!$z$flS)Od3vLgK-U@l!W8J7U7^)*svT78?jvt@xc^_p7;M7MP2jpT-u~
z>r$;Q2mRJTV}zLaKGH6t++rgpk>g;|nE9GZ_k;2XZ+5v}AE(b^NxkeGF|XjzG!!u2
zBQplZ4$6)L4)xr-)93$YD%<mYR8HphJ*oM@Fhnq#ve^!@n?=rd%SfPn!d(-+BKY$+
zGGJ(6oJUf1XF6%;Zo#`Qkf=Nw@t|vaV!xg+=ahM3c&jhzZ=YIJV6}L)Oo5G&Ue6N&
zS!Jd$i%)m{oV0t?{5jmRz~7*qnJF}7vo-r<wJhCYqpd>=hP&@EZ~iaS)<2P7>(z;w
zr3oh}klW_5{C0{Hh|owct#g=?w&(qG<LsTZDZ}JhZiCfDucR4QQ8lgIuD4#!1;=XQ
zhn6!wCoXXE?tHv6{rKyLcYohdt-rJDpFBe5zWMFpNfbJpQm>Spel;r_LTb|P_9RQy
z&%+-jJ{XjbF_A})sI_hdT{_44XruW?<)2f-D<;EH+coF6YsRd17tZh0SKpJ>jCt|&
zfG*l1&StH&)ieD3?$Y_;VjF#-PPZ5C1F51|MiRpC?%!eUof_|T+Qt>QPR5^Tdsr-%
zPs-X2r>9!9f}Lwunsj5Z>2k_v`r8GA5ID7l7D8N(Nwq?QH=%s>{o9kW-spjwk3QJs
z-%Y^jPm}#_Q<2Zlq`N-hhxQQA?F&O<Og9~N-sn<m!GA4^L2{FN-aBPqxxXQrtM__L
zTuc_T4Jrhl#2i1r^W%Kt!0c_w@5VRX${jrx&&uT{*e+Q7inGp}E+7-`QcBj!=WY2^
zRMa_sDuhGchj<O8KZi$qc5~XUj2LG8I=?%)ODNy$dTZ-3WT+z`K=f!X_D~;{wlI2&
z2^%jiVSlt3kyH-zHntmHFn-_=*!7rn@v=(&APYfT+iNq^yR5t5%F3j4>mw~kKh}W}
zqC@+O-@kvS>wBljT-IAc=9{dMdCWP+eB=tGW!JLc@14?4_zl}{nR6uKU|VXxwjNuP
z9bM#8qF&T*Bn&RT`Ly|F>)~_vG1j5^$NDtI+JPv`s>E3Dg+H11Wl2bqR<@66BFBn}
z3|39C@A-djM)-u3{`F$G;QhShR-c2mW15d91aG{oZ7!LG#rHhf*WVxK+FIXNkzJKI
zmEe7;PP*=zrrA?ebE4JcmcZ=kXYDWA8cru|d{Lf=6S(g0$G-CV5;v{)dQAj_mX!Gf
z$CbXH-aD<kkIa&^J%8y2-#b;HEI-As{iq4=38~H7GkJN3n@weB&1nU54n=-y%eC{I
z`CNDnT%qThD*bi+ZKCh9@2=NJ+}L`5L!Hw&(suL#5?Ye}&RRu~%gy5hND@+(m`=Jd
ztGj*fDf+IxU8~I{;yJDU=FoSGq}N<dTi=JhhIg*Knrd;Wd@%8#x4;BnXr^tfY5!67
z0ZG!TP{I>rC9YyH>#=PAclR7FmhZ~fE>B*Y{^IarrS1CePsvY@7Wfxu;U4f5k_HQ!
zz@cTM_uBbZS8H;M&^&^dCx=g&Cw<lS`vj(r+VBu0x!5<`0>eurgSazo;Yt<GK2{fQ
z011^z1cGBqLnB7ez~hPMQk3=Lt4Ko~!4^+&^JQ0X9r(fKg8QOlz!!b%@9*KA-^x;)
z?sLoMT*`kOkHCDbr8kr;ZPXM<*!c~4V#=>}h&!cryO4(PK7!1x==IS^>UG4o7Wihy
zg}suBmqgtAn!{hG=zdFly}A@ze~#zT_zCux17pm^Qf-MM9-~wUS0~)~VV`C2p9Em5
z1ZOKilIoQHA@i(q?6j5ls@02Q3?ylV$yal!dP~FQx3>;o-`+8JM`3GVky8=jqkaDr
zIg<U#Gr&IMxrdyW|LiW}JbG;HWA7N=c5_%ccRAeJ^`>ML^XF4E!+!^dE8d=EF>$)G
z=tF+aOu*#W{qtkP{)2;q=^}26$Ct*e-d)HYMc?MNhoBd=Ci@7xrqKD4la8irwcmQp
zc?vd)HT?t5kII-SKr61Htx{52AhR7B|4>|UUl8l=?jB71|M7L*@l@~sKPNIuDTzp=
zva*WEI!;*?I>aGcNeC?~+es-hDmSAtvNu_Ua4KbHZ;47Adt}ew>vN9orF*}>f9|7u
zobw*9_v`h1&G+lwu-Lu2(pzP`a_d%MXV<k?u{bTRt63kPXunF;UHVeIGMy3mWp!m1
z3r_6tN00u^=$++0>go<Eos1`wY1ENh%N6XSH}0Q(nM5P6n8;)BL!JWXb*6ntxsEqj
zu@G8p?AZ>nsR`@gWSe7OZ8(Bf_P-f+JUQ!V^KeWq(i8i-xQ3+4R{1%5O!-L;0tv1;
zF(%F^T=GJ!@SEt_iEeS!5S?qEXyln|_9^-!(w~DOreb>m8fpp>99-q`S=bxq*yrw*
zo#*N4U$HBB5<XETf0^xN3;DWOTH-Y7B`$7fWq6{5(&FybOZmy*mk$!0Oe^XVxottA
z9j8lAZLSO6UPJ9V3X1R$=pB7MDVM?o+Ofoj%R|ahsnKC3Sh*jZJ&l3cDX>J78PLJm
zk*G~1hlYU0zmXWf(HINIOUR*2emk{mCafj&da}es(!^5Li`(rp16U8|9N|WBv;0we
zDwh@}I~fX;>F&f^wVF-B7bA}bI6Udyj7HpzC;hTxPCtfK?`Z>}HM)3$WwSr)U^2}w
zm<soZrcTtcf)k~xjUkw4$)}i-L@_E4lb4f^oo(u~MHp&}u(~;I#_f8hxcPf%YPnP|
zE9B9(h|)=x_7SFx+lt781KZQnG_BR!A4XV5c8#gs+(KleKpm(PBL)rXezKbTmB_2L
zR6JWGj0oTT?m2J4YHHV-6F>ZVr!zd<K4eQ;SA&t|GCCnHT7W`7zv+mBHC3O3H95$q
ze;}_(J&G971J6+#4L=ixPeL;>?OxxA1=t+yZIoS8Zi4KHWd&lCjYuy-&hUkS>%>QP
z!+Oghtsz=LVYlkND9S#r%xiT#Ma=Ma_ioqyE!950vWQK)$6pH4$xqB>hjc$&HK|JT
zF~fr8(!XCag$^yaIk3kr#oZ&jX|Ru4onZdUG>QtD{GO+0t$?>K4`LyBVv;cR@^^0E
z-g|(94dF9Y{iwkzI0MUCuRP~jL@YFx`AQ6-mrn&P&x^Vivll#Pp70+i68Os`&{E!$
zU_ezlYM=EkJm>qx$4^~ee)}BdSo|xTR9c4=u~7%pvKv{oTGfk2O!)OOdrU9~zUT#m
z&v^2aMDfj{ASvf@l-bGAHzk@*_ADQGdqn+i*3lu$Far8b%BOmf>6e9pn9hm1hf$TT
z4&XBoh*S{R{3#tOWW3XO)@P=EhldsWTt{nhuxpv4tMaX%PjyD6`^Q#S#(3mrOyycS
z?E9RfvqEH?m?QflhE(gXqbP%EQ|XjHEvpu&S`NSF9}fDus$kDCd$LcT110P+UlpV*
z^L?kXko|07x2zNVr>iW^&~!5_^;+$axCrmzklo{ZzlK`u1d?&{arTH-Tx{lpWixQr
zoi=Iz;B}$%s}h1|zji+8W^{CPWERe?G_p3sH4p+r41R?jsd8R;&s;1rpA@-}<kr2C
zAGw&{8&>Pe{lXhOPN&I)>4t;@IUnGx{0Ye>w&JDxIB_4)uQCX=*753N?EY6}UkT_q
zELQ3iKS6p@7rp*SRk$;8SFb~0tG^qYIn%hMe7z(hi&0t_Lc(9=+euV3(MBl>2VM#j
z61FlwPHz_s>r<v2)Sf=GBj@hNw><TbWqIy8HtT+U5)a-UP3e%)Y(X!;<TDA?&tDUS
z|4P|1qBWF6YwZXdJygL}{g$5)L@$!CxW%Bo<TF3}Gb5X=h(4(38g+Q>mQVyKfG5!L
zGJQlv(zq6(-0W@H9j8B3(awq8wPitgt%;FgJgMIdP6O5|LXDTdzkS5z0$-2leT`OS
z`0%f-e-&-gx8EY1U+I>RS5_vsGV$@)E+i#6*_41jE(rxQ2`wMbEoe$K1qtymG99!g
zoq$2qFq+3WDf8S%`FH(&KfafHFuqwBObP1z(kH%n+I?Eb<M{kX<0VDagS*w!%<p<>
zH(aKO9Pm%tzY~xw6Q!@OPu&q;-EDGT1x^4F5n<GTm-m}cW~HJ>g*32mrNinoyHZm{
zQF3Odisg9l1ur8qC%#-cC#ql!P4e_%`UDHzhJt3dhVEQk)rgB-nM+ZI+lW;K0LHQ#
z`<v;s>*228M`x<Gp7zeYnrQCgqJMbgv7cpL7z%xGpO_jqu=oYAxjsqV>L3|lr1Jw`
z=QCZCQ&_G)<a~bD0wl*-GVR)tEg>C|{xuTQsE%DY^`<1aU>7X2?}++wBIBv{sB;Cb
zt;umES;_oM)s6wpBlBMSr=oh_Zx#OH*k-)+(fAPJ+z&-muE^4A(M~I9-hJ)D`e?f_
z1V;{}Yce7z?oL6$A$sR;1&#cpg()#xOR+td(!KJotA{m8ZPmTBDxytGYhULBD`5(h
zCZh%G4DI~_xy_XB@!~EWL$w*p9T|tQ?vp{Y;p4zoXMw$1a~*e@>*+_GODuEy+VRCp
z67N}S;(baU;RiP`3<d&U?cW6p_&swJ7GDr)AZ(>CKF+wPR}-yTJu4xoK5KTZ<Evfz
z)+uXeO2bI_?nhv^$SL&UWyiO~h`7<(2i84hZgp?X`;eD5kpf|A1~m8{hmm+%zkl|(
zuR3WpM5F5;HaCC!N#!b4YJU}B$u?ooz)A=Q{GQ$u6N0i4{Q1--hF{JwAk?;%>9ldn
zr<<0|;mQ5#aS6$PEh7sTz>B{}*T-<ro*DJ>KF+*g{Tb0XZ{gIt94nAgP04}Spatts
z!x=b<MCMtS;DZq}IXlP0X66Rjfv{T+1D+&)t79RF#oFYmdQl^TXg=+Qftfku9ko2N
z_qNpM+oNQSDz{&vIgXr68eI<4jE~ZB`lZrWDg4yA9T#~zLkWn(DMLKDqr-6-YAQ}h
zLi1~hmiT;wH-ioON%t9M4X%7@T?C2Cm}t?5`K$pi%wv{>HaOF|Tg9Awa7@)t_Tb`%
zbrhpURl=uUSVr-zl*GMGKHYgACjYS~k##1C^=!I6-=W1{Jd4?7yApmx=(M*e-#Anx
z>@*d_5&m}Wqwu+OkL4k3g4gDOGgtR{P&e-2`|G$`VuGqgK{xYw*l63}US$6~u>Us)
z>N(f%zxq6<eizTCWc29_ucsMPjAhQ>aLP9p7O$lIYI1rRR7o?X>JdOZ06z!sly)(8
zmJ63MmTFrP>ub>bm~F)!VxLa8y6jsIL#F2YIGgm7J>s-eu^t{j#)3I#Pj4xRxk5K*
zM>&*(2-*#G452J_{jqPtzoZjeZgTEgp;>8Sdj4r;$lK}Ef#uk|xNq@Cd8QEZ8`3t>
zE-J8b38){x`$cRjrsJYc%*b0)1`AeV#n)BZX}^oHi<U8-Sh<>XQatCCP;W~M#r@kS
zets&Q`eQEI%f=@58sB)jjo(GS{Os8`VtKGSMha`|p^V4^p@g<cH#!HH&i+)xs7G~6
z5^c^+4vYD;q|d$%iy*Y=3E6;To&-7&+D5Oxc^CP|C;qxA#-X7ZiN)_nYM7e!FQqJ;
zS+vg>o4bU!Ll2LR3OQJ0m1;ZpS%)gy?-va~ep0Y2k<F-Pi~Hagmz<lmhvpia1#Wy5
zr|O{j73=Z(Qcd9wYg@Z`7a@?<fV2n%L2rO9=*6~+9tqCC?@17f1o^TK1WuJKCe8%e
z+^XN2DJA@^g}4&C<1_W`=<K1px3h$=e=jK_giTG=L)ePO?^M*8pW$lW?pd=EmMrgO
zYkA{44zcFf9Mf0alsu(+NzXc#5rt;E=%@i-uGR#ujrft?x;7bU*3jn|^jxDOk!QL_
zd;ZL-blN7ZoLYlVNxW|v-Has&LDQN~&4seh$bl6k!j6}!a{B4f_i?qmy<OKe|LW6s
z!^;}tGW7~t9}(xIVr!1>%cCBn@hUxg52KubuC^pFbOr@B;NNkNv`0cL)pxu~Gqrb+
zcO9|?29|KlOwgp|4+*9fKIef)%X%7|77ye+bne*~hzseVZ5}AaEIy2Whb%htCf@!-
z<v{I+f|rp?wWIc4ayN@zdnDN$W^7%4Xr|3>xg65$`|$PN@frv2w7Qt9repFl^IPfW
zRNWugKLRR$fKbNXnhO@Rq^+~WU}WqhA@1`yo&)WAq0ZcIbJAyInj08Sw#oLcF2lDL
zr;She`J_~;WDTQn@6~WK1C_iSJw8LX&on;G`$b~2I;s}>WrfuiLp?DYdA4_vsjnI8
z>dcL&XP9u`27VOlMI|h-wA-!^6Ci=yt0wJF$;W<zli^#$l1gz=ku1d407gwFQS;1K
zE$>V0bq_-oY^vR@q`^V@yb(1|cW>xRq!|4srmiDtjR9|y%&-!&RmDb@Jb5%9`p52d
z?i<RAxDs;F->s1!XI3_k#^=FY9Z&4`BZTH<;&tX@PP!t2iL-N9lcgtze6Ym0$OmUn
zeo+Z!CC%T{ZFD<!&%ndL;i>?%v8(khRbL~5S!WEYmRq?j7qNy-gitoXa#yO}=PNG8
z8tZHG9WCx|Ue+j1zBsdL9zyd_Tr0`*M6OeiW39JC){lnfIj!kxwAZa2y138ohFfXo
z>;#9GNQ71Z2)Vqrl$M?4!u3xhRLafSab+aZ?xUI%F7VwJT%<lQs92R8qCY=oX6*NQ
zBCc&;e=3@!ap$PK_vQF_p7~<jE8k$)HJi%%8TTz}_~v|+`pOH#u7gZry_==#gO5^u
z&c-=WI(=Hb{<+BofOC3hiyY8o+i4I9jMqy(GUa1Yia6KkBf*2L1D;pK|CUP9Kjw5O
zs-8Tfv)S4k(&YPV>qJi4u&Wy#&D>Pdh#vepG#}iBY97x~WLcZ=!IZwI&=6jVFw-3h
zJ{|7;kBB3KZ8PFz+jN%gc?NH`PqMVv{pTi|bp%^maMRJW#|9f{z@$++Vew2A#(4!Z
z_}Ta7JSPpX4Qpa$HFP@W`#UvCSCSRC0X(!gIc6$^4j*P|8k5o8Q^2?2>^^@KAMRdO
z+3YO{n;xF1mx9?<&%&n9e@GG~Z#o7w!71b>@;W;^yFTyDiH*DF6oHsdGN!#p-SA$P
zZH830)VmPZ2iz{<C&rQ;(rmj5&@((3lM+wQz`Z9C94n#)iYSeoA`QI`*fxx+ictNO
zSJDyq7E1xptxoy$BEsVT>jw-h(eNz>WyR<4EfEc{czm{}UPI7(BVE@GQJ3WDiA{KW
zj-;Y>0g8U%|L1o}!%PF!2cJul0E-o<{@eOqH|ECaviC5IhfR+Uflp4kzyt`_+S&a7
zZX(nhHX-&&Lm1fvXq`xnZ9z)w&9;B-MnnX@g{ObehX`FIC=AX0hHeKpE@LC5qmIBY
z(l3XHAe-HeMBa99{KB7eB!cy(*<HO8`Boih^n?R1$mab2qE=`w>>~BPaX%nIT?RpT
zbyApTb-HAw)s5MI6^ZIP$*;7oE*L+wdg0Jv8E#xf_5Tuuvo0PE>iM%8Md0?GGzxWx
z`3Ds0enJqSDDSo=MPyEGeY@IJq0VLI|99Uew}3i9Z0FfQok9-Px;#E(P}1_DIMaBA
z5IuL#kL!p-0#cfR`SP$duc#<3x!W}XA3+k<4ZO*~5(oR^y{{I7?9YIcBdfi$v(qsv
z(V?;VnD%qJIcpmm_5o8Z!O9t78_n*E34I5Nr;(w-i}b;WkF%BakKv!aXgvC#WGAWs
zg@r9!k+3(ssfQA7H%py75SsU_*vAt)^K5=l*6d<J-ol6s`PYOiR9Ul<mMV8$B&501
z*IS#v*Nj0Fi?h)T;tCz2^bH;uG#y9>a0teYRg@63?OG4T=ba<5Fg^~{2l`daqd;2u
zKbZgoiEsfl#iwKEJI=s4?4g?c7JMO^YcV(dy^tHVYw+}$HwhmrdbLQ)GC71&oIOxK
zmCnx`a92?j(FlJI$l?P;JiX;C^2fS7%#7yn#K1HJ;SUw>>C{4@+tF8yzy6eaH}=t}
zGrMY@#2^F_s)g-rP(7aVU;M%n0|T-st35{syn~sc(xG&AaB$#e8s|$szuS7A+2y^g
zpelC)c_{~Xl?!$6{XELwCd0K$ZGEo#H&>fb0>2z&Iiv<*5{wS2o!MZg-I*hErPG(%
z)AL@V<&H;YsPCqTUz(1&rQzB|3Q~XprOe5J+A|Yg+cw1a`v86q2vj=?n$6E?DkmI1
zv!c7aS$L;|kKBNGpC?vhI%28yu1m`cne99zS{-D5;Nq3|)$~7kn!th_g~-{C(=ccN
zr7!<rY{=mE#$h+yc!l=ZnUF5t1wG}|b1HuleZw)}wE@rce%c)a-t@zA7OLvxO&e;x
z893OZnKS%pbab?MP{HtS$R~+OJ<v1%-p%h%Ht<CNWiW1HuN48p2Il7GJU$t5`pnCN
z78Wm%Ajy9+Jb6u90e=}u`1`~R=Q=Ypqt`qTa|fFztdYc%z8B|qNhM_HYOdzFH9G$X
zLm}hf17J&1TO6E`EuA#cWs1xT$r_aE2{Md0@?K_whkqtkzY`yK)GrNg5dVuM|M^4&
z-1Nwaz?b`B9uph9wFtLMR$fRpGLmt`k+d(2ZdavKbHoc6|HnGkwi*RG39HCkh#2#O
zsP-GG9j$pGIsA0Ha`$rS<uzUCTn=3@s6(ub|2>X3;>51SXRTFD+K(C92w63o5zKX&
zqBr4+%uO3B@BNp2{k<8BlK?1G3JBj5w18E5a&#{yQ~}3PMYAwByTl)pp5Ex%Y+MY%
z6-e4AJ>v}6g5QiX)R`FI@5xSf#y)3ypbFvCjo;pQ$kwMoi!%<UBY;7%bvJIp(#j_#
z9fht+p3Tv9vmnUFTmHYlS2cj&f8U(y0^dqJqOSEZ0bak*p@@uDKR=~E9Z?W--Q)jV
zfHyDDu2iF*8@^?&9#v&lP(U*$pdXQvrrxmS{`)vG-S8B&rLqk(GY+@ljTHJs+_Uv%
z0>7D-U;q-=cdTIn*$XR5k?Cd!O|{+5aos$bkU`nwTfVGigb8Qy!^=vPu9xP|lcK=?
zI@pEXU<YV+<^tVBcHz^L=U|Wlem}7%FB??Jn~5^T3ym5}UCV0xQ}=A%bGrLoDfI&_
zy8YuAOIADu7$MtksEZ*<G<gOh*NMiNt#I4GHtDPA-Mz8&B-)xtp$xw;t0^mE|2Qf9
zF+9z5>7JCsmnFtTD|=ldH{nykNSh~##XZE#Z83~b8S%u{PZ|V&7J0cwtURiM<ZEpX
z(&2#{&zgZH7$c+k!13S&Pf6NhjoAg+W=JV4Fu-o*+m9o=J%txCx}3XrTvxkD`ApT$
zbK*#J4hA{Mz)QEsg+byDzEJ_8KwBtg>1%}hd0Ez1+1aMFzW)B+Th}PzwpGBTY}d%?
zl)2Jn4c1M!GMZ*-^7rF!A||lymz-xXS`kf7(LTji3XWzM=de9O<g>hT<WY&lt+At>
zNl`x#s$?o~x{n1$2EJmF-TYw8$;oVC?VL3C@NS)neHkHAF4X!{UO_X0X+bFpQJ`z7
z${<FILSO82U_oqs%2&~RzC?L4mDN%6E(d4#d$I2KM-p&&db-OV-L>AggJpFXrrLW<
z^2NxUR9Aq}%N1d3t1vo5KbG22E9%{H|06A}>UrSwCm)hEVKfVD)MtLQ4V7!KU~TT}
z*R4in<SZ$yTb+|(_Q0#SW?IfQ%Qhut`YyI$O5y9-y0YQ5%-<&gb?r2sg<$PrU8a>Y
zvuxS-@)$Ym0s)H1ArpK(U5@TYfPlcm$CRW$ioMuykldWeuM2pIqIi4@6`3PVZeleL
zXKLmd-}OLTio}Cp5Gq#{!#_h`60+FpxQE{5F^#{HY2C&XEwq7Y3A~h)Zc0a0SwU<N
z0*EmrvbH4^7<+M_xVs4-2I{cd6-7kDpOR^I3pO&9H5=|q@duuh)EY}V<~y@KnF#|d
zL(R^V`N&xk%{?)WsGC^E&SYO%%ftKMMT~xW7e>o42i`(p7>TjsP&V2V75DEAYNPn+
z=JugoPxw3v+>m;Qaae+(#wqiYycJnI@K;d-4bc<lxJu$Ka=eSU8a%^=*}wpMc$2?t
zO1xmi3~h1s4en3YKV#9bInI{6qTiG>@kHH0Z{D|CZ7p=Oo#rpxH=oU1ZQ03LSXWNT
zl~@YSYWRr-V(bZ?=|_ycQmT&R@7d=FWs?Ssk(UdUH>lhoW?mOdFiMMVU(}wmoS#w*
z<1Y@1<c3ad$dt~^yoJgY+Cx5{m;~FOW(+sF@EAPWgs%Yd{*G&m&}=*2W9pRd@Tpao
zyi+nif^8FtGOIF*jr0?CcH$;B_Mgt*L;6!_Iw6@~rhaIce~a0y(5n((*~aiA&Ppb{
zU>3>mr5!=^2@PUT`d(?(<+2K=LkIV>zB^P;lHtF#50nFP@=pt-v;`Gw9nt@)q8Q2A
z^Sd-qTPTA`X5Iv=(Q~4NLmRkb>Hs7{?`K(oFq&|oY_ujUSrM*M;C_5<BaY%xc>6nO
zqs%aJJj4^jgwHa2^L<}?+#5=l<iw8qoA78*TjVMm8dZ!*7YNiASAp8&c-pwF8_$aK
zU?x8?Na!<DVF+#+e|C-Rr>QFM$hODj4i2k?0*T##lv9GE&IrTg6AEyy&q$+9`ozp{
z94*S*GYU&DP`GAZ+(X&Fm$IMv2)PHm8aTezXTRD5kcyng<0SGSNgh{pbr)<=hcAVF
z<Eao1LC|O+7U%KZ!-hNqVT$TIyOK37q;=PD*vgAKd6ecZa4#Dy2Dc(Cu<Jses?HY?
za~0h^#}m#F5)z#Qf95;0T8FHOP|L2<9J{siv3f(Nwv%y}k(psCY;M!ge$sJY&@{ni
z^3X5FjtLJA7hcKUhv@rt7Gkx$NjM3WlC`!zE3+UTCi9Po7T%TV)PcyC0e?9lm_!H#
zf&V0?*Fh#^0W)t3fl-iQc))$10e%zS9kl4eO$R18Y}Ip7mQ0Za6dYByYS#vd3}mMB
zha{@ffynl4B>@NtF=Ze~?3<-!Nq=Q|gVD|8DeS<s7lX&;^HYWz-SbS@lK^H$nF7Lc
zcqNo#)s>Gvl}38a^m7*?QtHP~JRl1tizgwA`av)&JE6w{g^UIj$#v$(<dY7P$Q$(p
zqkAC%%8m-J$35&s6e=4y@9pE8-XPBVC;mr8J^`0qWYkq&<Al>XXkcus)NG|fl2Rlr
zQTpze`BbF&MYw$pMP`5s9ocq>;Q^;DhM((_%A51NSs%RGb2A*Ge+$X7@Rti<+nKr}
ztc@|nTVLS07S;HC2#0eWqy$!NzHPCOEZz`<in6fO#Q&tAe-w{=n-B}bb02>pMRL<b
znJ+L6pXpXkY!m%;CVG+b3O86i*%YmYFnJDNX0mT>R*1Ge3dHlOZO)5_WOB6qg{kME
z7HrH-s=tAJks<UU;Glt$9umgIK!K4N)9vWmJF=SloZJ8bRRO0|%V$Y{OTw&h7oKI}
z0E69^JXmN9W~SH@m-|7ogD6TmDWaZ9yUP0u;-V|o*RS)u!aB@P<fu(7-^R17TCI6B
z1x%o7FU!kL*t$46zMjl7?8^Jn4il``R2ASONl#=RSrkc4^u;!N0!wpCo7wr1r$tU2
znax@OO3gmtE^?lX^xyXUA;LF=ZIsDNe&_ZM3Uwaej5a)kF;N285@oeE1XmBkjGD*B
zBn*B1l<SKQHzMJpkW!T-vb%Mti^o+t#?>Ahd~0!N4HcSz(oKBMC7f-V-E9yLQN@DR
zb~a5`2Ko_koW_<s;6#ZXGX9uwmB3SC{9*`V6lX8ZN)-tG;kLvdS;HI^k5YHVDo0%h
z+X7qpT7-F`r11qO!?Wsa1ad9;K97E?0-Xjr%Ja955;=>vX5rx&uaFx~MxdjWHvzA-
zLFaD5Ye42^L80tBm8nJ@5B+AR{1cVmS}l-MCZ35BNM!u{;HOAqn)T{%v8??&Gnh5=
zCYU(>JMdeOvtL5Zn-xSW51coI(qQ+2v}kN0lVB~=j*93z|I(hEb@u@8n3HQr`rVc}
zk*z7IaP^DZG+2{PfOpkaZ>JH@FP!%Bkzb`*2J@?tMsEaqG_{I&0&-<^%;oz_VO8tN
zjsdYOvm<{pw6%$Tu&K*qd5_d)L<*67R`JosOR$q|=Q-KFfUoQG=}O9!82y}rMfo2z
zhRv#A>efOjoA60ArL{Fs-gJ;@<<~NQGEulq^l5$Tj*TYjX8f?)ZGi+PkA^6$og6*!
zFDgW5cvVC{+_*{;1%Rc4tOBIK0p6dcbP?Z{HAb05UqS8ij1n*2aFiMNj6eXi+ak{S
z48#;%|5UyBBSm;)lA#o;Y}~n#-3U2Uly32|#m*$fKTXBe#wf?}1M9Bx0WB>ptz4YT
zPotFslpuv48<D``T9(t9=|Zlmj@FbUG?dQbCy`dl#3|kAe*MRdl--19f>p;;BO+66
z7-g-S2H_UPr>l01!(oel_25c%fI_iUX>dY*$64{UDS^o)3+DzkkK7BI>DHDt!^H!$
zjV93Qb0al833ZggMLnt^!hy^$=FJbZoA1|X=3GSCSu8h_(h8i9NV1+XU3$}gHi49f
zPXYB@&43JKf#WPu)j>}M3ActJO0{`V<OGtvT)+6b7)<+C)svC?sXxRapGVo_vm$Np
z$fgBJBbiD%7)*8BKZD5-d?D`O$}d>B|2iw;qUmTYPn^lqzF%IkepEzZ1W;-Vc0WaL
z=1XDP+;!)7Ex<yM$C(Zd&2VziB6atL+*Iv-p=mXd5sB9r${7%bT^Cl{gx`gaHY*6e
z=h$G3nmiCSwJW&)SQmLFyyVu{DyLG}$INcF5wr`W0G;XX3rtgIJo3gMO=uLb$3{_D
zt&A8&$}5SW@wj&pQ~kldvOm~#1i3LLtbkjMEXqYNCBcMpQpE-i?P+@UbK|@tBYsGZ
zABW`lk*!2fO;T72&pYqm6hekrK`264D2;0{xxSJ$N}<O}lYrO)`1AOzLFt>QcM<26
zq!C(bg6yAj3P;?4U%T5nI;0vVrcP?jSm{2UTgk$v=tfvY{8G*!b1VK3UxDM~o!V-$
zfc{{*AVCb$UVh~_f4ko^7+}aG5h;9iP2_QpPdX*hRNo|oqKp>;xm`=ASYx6r!qRa=
zRb|4IiuzisuhtwH%0AxA;jb0T_4~}d$a$N;K*z<4JzD?|4u2DXU`9;0!0O8>JuLkH
zH-MIH*Hv>6-5$ft3xXw;o9c`0_-1_^%n5&YRw79;0I<bl+{4mOfaPKpEk$;8wkd$#
z=iK~qFK}4sOHY&9ZPbP#v<#tsfU%do&np8pq42;ul*jO@RkQH+`URTl*k%L<QYV^_
zA~}R)9(Y%1NQUP}F9+T`LNfL6>9E{_C-x-SD5pzJE@oV0atn>U#sF=~;a7GLL~ks5
zbp(EG5{$AB9-xb|eglbrC?HGlKevY8lK7>~INWx%oup|*)F@6yS_Y&T0f|f+QEI`>
za4lmhF(2`kg25gQWk(iM=YyFE@f?1bXjWyX8<7%p@y2=bB27>z#dia=b}Gzk{Bamn
zWt95q*VDP|qN4Cpv?WvTu;uqf0GEf$q(qU3F+S_jx$@H5hr`+qs_v`vgM*SVl{lbo
zwxo*=5<l*x%DNa~$L9m}MeMgXJw@A{<xpjMx;b1X7+N{6*PZaYV^DylN9Cr<j4Y>o
z>tD;mymSL%lHQ0tMPsc5np1Q%9TY2Gm*(;Z8B2=<-X6xPn;%;nKTjHDhwhhJ^B76I
zjr>H5fmgryh~bQZVkBLt1L83$P(8bA_iUf22D%H0>01EWDv4^bB4bX_?iS>Eykjml
zE~e2F>mZpiF@53TM!*FQ>7x0I$}O7z!|3R>T^w1J*Mmos#kmKv1&o^CA{VXts0O|5
zzMEHMMKaa1HBU>(#p@yRg9C}|BcZ4r_oL;XT=b(*VmYx@e(C&U_exblLiV>Ch=%%m
z;mDDrP(ze%Z{HfqQcTvSX@x})j9Cmr{V+0C0UoPd)hxA-`rqDDTo`kDEU$6ey!oi5
zNRGvPx}@}wH0*qR6if9qDZO_c0;GFXZvIqJN{EX!Ro-!qTWqf2&#;vpH&iVJEt}x-
zj@qU-JlaC_T~f>8Nb=opx;S&qHXmB8T@O4yGz^XnP}nZJv>%FX><vA;*^~!adfNjK
z2BkK`zYt8=S-!>j`#TMdFk-6GUC)2z6e&>6l*yEC-&HLW4|}x0{p_b1C&DwfoZrnI
zYRi(`A&|q2zeef1g8uZhZ_hZNzV{K;$!5pO0FAVS=aNIIFd0z~j7zJF{SfyiR8wg$
zb7@Rr_*N^D!hQ`PtiFMeQBT0!I2^BNHjs6dQ!)4yy4iI^F-j0`_ANqk+0c4dgXJw%
zebYWta-^;ZLLRf9(vdUM-P%MuS3MHMoE^a;1qiZXTstmKqGpZW)Z1iUu}8*N-Os(d
zb4$^Mfq2ej^jie?fpG3JJBd3rq!_p%0FT|_p|{AYZYaQ6q^KV}izySf8sP3RZCHQg
zq&YfvGEC`?T*y5e_y{Zgt6rBziz(FhO~RSe2^B|iW$)dQVMJsAf4}+r>_No*7HXdQ
zfF<?8x}ac%fK0^Pm6(!J_+!1>Fj2!);}EdiwRl<X!BcVRi1y*(;X@N8nqPpsd?Pe|
zE^AGOeOIkZcuBKcRwZJxJAZEWa30aLJ}-65Y&E^Cq*ENBYiI<V^3*4-Kx7ps@Ma~}
zvG@oukv@8H{X2FE!4}I03B2V&#RdUmp9P7UQ^WkmmU`t0g-!$2l;`o1_gu88TpBzA
zf7Mq1=<d56Y$UPmvNy>kv;)D67Y%ksko@G<QepFfCk6xZ(Gblz`C9kKhVw;ss<|5x
z^wn)VRwB*5=u>FT7Xu4o-1z*vE|vUarJ+B)WYykdOj%uQkgN@U5Tvm=ZJ2}>g0!Xb
zu}Q)9*L=z?|21s*R+@!jp6!8llJis2^FMD+9=OXV%-%FlX-9!t<rles1FM^vkodmH
zB(Q`FIhRMw4C6<6HV#9$OPRGJpy<Z4I>I0wiHC^lQ36Cp;fVe$0oBKfy5&42FLdt9
z|7@b<<`J2`fo(JiG<(A>(QJ=ssx(Np=Da5!LPq}wEKOLDLdiu|u3jkFtw&0B;WV}Y
z^n0+yWu29sAKN<bchTGGM;&GSzR{h-dpoR2MwO~OASmmPMCLz6JMd`7Txl!d1M&AH
z{96$-pyE4~!|?NNE|a%VMzV0Y<Cieev|^rSGFE@3xJp*%C*Kui)We_SldJB2%O7Y{
zf}4B~7`K#gU*yyzInaGbZEo$&R;vvH8JdWx|M2aV9B(91eSjP`VtGp_I2lP(S~_Vo
zQDWK!H4c`M?|Dwq4fKJ>9FyARwpCT|OE!O7kWenu>a|{pcM-+r5km;?FTqLv;E<PH
zM;nA47Abv`=Wc(y{A6uEYE)e1S%rm#iRj1OTyGgYqA%5E`JzpCEbK12iw)_m^r)59
zY1Hj|lTV|{_eFD0ozGFG;jpUe7A&lM3L=+lmG<jo3jdW-YvP~kqhYx6W`R9UJlHE*
zn3Vp3oksQW7tNV56OBZ3ocgiEdz+<0Z=3^u!_z|8S@njxJ?Re1KW7S>XJ@3?YlQU?
zoarFxt>l?fG(^VzbEC2DXVQr}3+K@uV^6i`EI)Ke(x{#^p#!9O7ZG3OsCKb>nv@6Y
z0RFS=ZrZz1YY{jp8}1V)zVo5gID{L2gE3y5s&CJfF4JTMH47YlN34SoDu#)dZq|>F
zlBZ3yBPrn<uu13(koR71t^5D1^*4`@KEafrZ0QzuPajX~&CboWr$oPR=I5P}-eG8S
z8#RBU_5%Oy<A&!`&ahbAj)_+!v8fh7*UJDuQsf8O>mLK-AJbOz5rcLJDU|c%O?fn~
zer<2x+s=Dc2=74Y`(9RlM-dFDl9>=L`9lEtZ6;`fvTbVtt3Sqq(e=QKSWqCEK^eF)
z3%3nMYwmQ4YfeYv_H*35jTFh6=T4bee~X|YmX_!PspDQ2MQ28s)E<gYLDJhMV!&Q8
zd->@R)cE&Oq4ILq1s)<J+^H*Oy_NAxc&^d&eBP7VuV24DDvYwB)MI>5@@wa4#WndA
zmoH*bjZ<%0qQXg%aRrf#s*A0b|Jpuzmbyqxb5w0I9V>*a9#*2|zko=zpXIC{eJQLa
zrl4cP8bSztFRQj$N-dDB_4HLi<X78pAl!)uV+{YK=-R?f?tpPsey}`9C?sgiy}+)O
z!3&+Tz_iSAiz$fqoT%ZTu8F|6h%66}V8IVhSJ;IvO*<C$SM=S2;X+>l6ZlaZw{GOC
zzez%fr_?U8z=q`OY1f-4p5!<XsrH$>wJR1=|CU3K7&l^qEub`p`nw?0Zym_$L~MWu
zECVyX{^s{T&J7U%FTrl6ixEeu+$v9@A?E3|e~14;b>%3F;<8(I@`+hYHV*~qfAlWq
z3f#>ZJn=3<D*FSHT7aqm(BGZWH5()0>nPH+C-i(>LaLMoC8sszCn1q0z+2CALHu&2
z9P%Sb79D>R)U}`0+noQEd{)&n;Xlc75q}2T*zS1k0g$Z3_NaxT%GIar=6loIv5uQy
zIu^t;SgWP}Pu{_6nb4h#Bz|})f1i%pzI_sN1<G8vmc9tc5APDs7tn!4HU98MqdQ1D
zX@o+c-xa!l(28`rEJcJiI873%VmLV1Ljws}`EQd*du#mBC)(glibl!9Fnb5fEtuC>
z;$KXCLjloDRYWL8<<{8CQxXofHfNUFo6bvD-Pu_mPovyMv(3;%m-h<ew}^O(@c1%%
zH|rcyQm+qCQZ({ai{?g-&OFb#Na%dBlN*UQBPV|upT#^~KNB5=U6@v*bPK-c`o)pn
zDc(|_W*dT-v&F<Q`+eEN-Ck<ctJUBK!f(`dz}8C-UEaui1WGDjqZ2mY8{aU-HXkO>
z8YPd%2`@E<%-)P-^qADAw90$^n)#{QLdRX#g(nW-d6eSpqq<CWK4_t><!Mqv^s6>p
z#MYKKCw3r<`_DR7z<@|hy_buMQs|lrhRTJ?;4$CDXHevBH;`U@2VK=s@cOmHo@*m7
zd<fhg$`-73K0mbYEnFf}x?DMiw+{j_VT1&3SWna-axq}t`B`vf0%rDzs3?g&w%bPj
zTP7bKxPCjkR8B_LqZ`xlSNw<e(DwG54>y+Wvtg~X%yJEROs@7Yh5FaMoewOb+<*O`
z#M35?KWC)}gB#b}E79F7Ds?;@(G2Mk-!8LX2ni89*lPbK!4i`&aC5V1>ps%2)z1Kt
zcLjy7TOH^kf(%5dAGy?v1-@!kA$?Co5r34@?XGM5iC>@E6U;7#34K_@q5g7<<D`|W
z%ciYc*Dzyw$z>_mx$)?ICBO4&BHH+u)G*j5sqFa1(c-m{p4XfnAeGWq>jI+H#AJ-E
z8wNpTAI!p&Gt>Ju?~(y16vFUzJkY~Bq?>E}EtHDsj_a12pB1$+a2pW)(ntc{oBnrO
z$Ub*7K>8JO+14LM12T(<G#-b?=Qv;NJ{g&omge?@(^|IBvfbr=$g-@3?dZqu7_fB@
z)#&C37Ha1R=0*Q#$WSEKkfVrwTp$7=k4Z+u1|9huCwn|GPm|G#CnHa2JU{T@gt10j
z^oUg`3JhV_=ESKXo6q?qncAyMrnEhPWKv+}%KrdK3RE0Q9^UAf|7-hGc0C5-F=VTn
zdE3B_b1+xw&BY9oYxJUQVVoKt!+#0;Z(L$|h|mvs+O5E#80usNvdvsa{G8anMMT?l
zRU)TrR;KH1<@!sN<PAr$!FbmU!q*Co|4l@eSA=*kv?7ys3%!Zo?Gs`}RQU8eVrsRW
zQ*ZY0z93aX()b~{>%Tt4*Ka)9f7XcMKs~2h5J<`4>TN7b#E!8V-nD!dkv(v;k&Q`~
zoFSbC7h%w;om|8E9|VUyGdD=*9fnhA(HEXpcKIV1wZ>n^Wv}_COCwI|pCzmRO#b`8
zJTa}Plsk0LE4SnO<W8E(>k~!c;o*n663$BBb4})LHxI)+fBB5N-`DigLv_7+?$j%z
zWF}khVEpF}PjF@Ok7xQOU8_?rFN?L}syp;Z(=MF5Ul+16AJ97;NOuTe8O;!-OxABl
zFyueSfueGg=1IepH-m^!s=k#%L6n6*m5ZrA@%(WRah`yOB|8+EQYl>q{i57R2ue5!
zWmDPYWB3oMK^+_+8ElWZ!tV{ZQ_NMT*l+<hFI~<Z-G#S*FFQo(BJkn(6WZu7lC<an
zE7Wff_uiOGShrUB-4N_UVn?y~DA(1yuHGlUf3VQ3E)3<U%EO(^%&L^G>bo5Dv(Wox
z9204SCa#D@&n60C%nj$^wfg}xU6qjnebzfxsOcn40X-BzTg{%U3KrGNgG*Kn_s^o)
zcMv_X#(PGQ$o97)XObG-qMAsq88R7%!M!-R=%GAsJw-q4DKAmIFf(b=R#b<SG9zRG
z-_-`erUJH<qY+NHHV9|id3+^-$<czV<ct-|GtlyNN(wH2pk_IwxP96>JK1!F#YiQy
z@b|hR45h7uqhnBFj%H3=GlOZ#Qg8<Gphf-iZU1j{_JDnTYVB`vl2^BhDzYI5WkLsw
z)nrk0`$NLmcJn%)={c~vg}Rhnj0BqSyL(IxWcuAqW?u>(W_8oclW9#ZzT$;ed(UdW
zZ$#P}F9oiG?crq?o!zHZO7gO^MXe8|kpsSrNJ6;IWX(1Gd#<*>2rVrwc$Ka5)>p+0
z$eU1Ai6Ww7{P$nkEH~g`?sd|Q>J9|LbCV-2>&$VAoU^7j)qh=#FvXeh^eG8cF?BNB
zf#w-W&k|>*9kb4K?jzTSuB+}@cL-3)5L|R3Q>(R<+Gv(c|0<N?NzV)0?Sk~tG)*`f
zi|{DyoWNEMntAJ6Gj_Hk{9t&2`Ct|-$kD2X^>%4ldUE@NO;Rv|mV_4o4SixA3u`&J
z-)t@gr1N2j+&6Qkb9;R_QuzZIpb%x9^zx-+_GbolfWdU!YcAhMwRPs|(=$((4K0t!
zB^$dLxh%N+zNF;`rE%ulVq*O_$|F>-wjUPDQrYYmJO6Ag3~7CuGkL0sKFvBY$zQ1R
zvScO>?!~#<SKnn~rSFQ`&cAhbTk@;0PpNcEtzjSBW^3;=E$nfWMfi^wR68Hzq#Lq6
z1I}eC{+2<byg0M_Zj;X6gHDnO1gjUMi4~4+crqjSqJfXhg802Ov9k=gLaAPLOf!1-
zh|X?G-(_vV=UP0&2Ya(?h}J7@=sP#eYCS{TN`}WSx6dv%sC7n;D9fx04LKT**s_zj
zKNCv%%a#yAmO_3YVs2H<BP{vXCX@E#+EhLrtiHoiM6(@dI||}|{if_c0$_bEim0nK
zBdImBs21sOlkPZTlwTuS?2s{RJw#Xf#s5*DO$i~(<i(EkXQb;9BF7Om=byL&%2yvh
z_fyEt;xRgw`2C_{eZswhw?^HY^=5-u=dsS0JN0J`+-94n|CwuTc3b9hv|N>({?_M0
z5gGLF49d-EV(L=q-JlkvCF_bZldQbpg5HdLctoLiw!?1Z(EW_z5SL67sln`1<>crM
z8<h7>qPJ>KINh+a@YC+<!3?>L$BUFfR=&s7*B+Hkk#?QSH1kmsmwX5=Sa3DJ%NE_4
zWiD}Cv$A*?S=^Nn_pb6{BcxCJIO}kac-$q~8oj(F{p>a1!T{&b%l%S}Q-A6$mTb(5
z<d-b74_R)*zuFCDynLeqZT6v%)<-X5nOZ_{7b{1D#&G)>Yxh2<y#47w>{HSbO*#N3
z8uuF)9mGKOaXTkg?6n1L>Q1{DZN?B>c11+&Fe691Tj$)p9P99zH%9enH}nnNmj0a8
z%BGfQ-VT4jERaSiszx-C=5|xN$Mx&i#kR99Oq>6uSSw$3M;5~Vb8Bq&7;*`IuA7cS
z%JwF$9qy?)Bi(@T;}-c9iDjwC^5kES%(^B%2Wn_=@}}1-sbB5NlZsdmxa5_);cw4q
z?V0j1KRKa1Zt6Cs7I8AuJZK@EeChU|%MYQCG2+$P1IaSDke_;cI(#ZcUKQIG%xhr_
z8S^ccXWpEe8FJflC#y(>W0GbyN4js#9$UvJL=_X<rqb-lVx^bMLo_ov*9ZN`OCSRA
z<T(zE0P%LCCaScocfN1C%y5wHeR)RQ3u8Kgj^{!HT8H1h%FWFs#@Ode$Q^!&Iyul<
z@XX_s;d2g0!N2kH_dik(W+o>Gn!68`&v;lbq>6j|oQq2GRb39ezn1eU2516j1O$!y
zU}l8~R!>|)WazGK2LUU$^Lt3CxLBdAGAnXTOm^1^eV(}=K?Yp`A=&>@unR~&aN<A)
zS~+??Hw!K_IKMT)+G0s}6ybgkL=FASVsnBvL83VpcYPtfI%vnFR#Ijowfc!|ae{7B
zj#pbx%cI+WwhYc3WnZ1b<LA|N>Q??>`c3$YG`p$XC|$jeisSSiyEEW26MTc+NQ5Pb
zArac60nR5qF<SxqQX8<cIX5~QG%sg7HP%j<GLdFT<YV~d8V-edKN~R5=JjnwHnfQ%
zsc;rg%)(0%j$T%R3Aw@sQGEBH8gJmFC&3z?kiBY^h;zCkO>yPR1kGG(_GO*DhIy5N
z)beI}#eat6*JDu>Q~@V^Dc=wHiHRB;l(EF^XHw%9apLZ{C)N{0Utr&it8%tq3(8}l
z-t{K-`&TEaA4xoal7Ip=0=!P<7zm=*4$1`cuo672PFbPcC_O0M_1J`X$o6Ou!vmMu
z;jCN;R|FA5$k!uCoBYii3>I)zGC@tPp9^DRigKfJ&Z@E=86LJyEfG?Ux{ep}k_(<1
zbP+QoSIKTufmDH&u@~3y4<nZgrm@HWy|U*J?rUFSv7n$p(NiwaOhHa8c&mlT>Fr@}
zF$GhGOV?R9U5#m+a1?e+3i0;nIFVtyMxzhA%XP+mwCU!z*{0^K73BQAN;m;q5R2#8
zj%Fyzt2B1(#GQRfM_C*WLj^4TJaSJJC1-QjU2FCzIbRJ$X1*wN{SY)i<#}J+$l)RT
zhj|4HPlWm+&CeA^5Ka}S!<jwGVGHgUbix>3%R|`+kHi(!S|>aSK$sjfj@HCK6=v%e
z<w-!0&WceAc56c$i*t_N=h3O}d``tCS8&Lca*+1&X1O8lIgPG_h@v90rCUdZbW0Sw
z2|z~t+=3?Qwsl4aAW{(s%GCQ|E@(f-_&^=2uQGo*<M9^c4oD9pr-%tY4<+{=y?Os-
zHL|h9L^X+X{od~)&}c^fVBMJvo!h+kcP#*rnez-3E1;mrj_GvB*Q?oQ1Apc@v!7x<
zSeJxh!*g7WJO_mZq*Y=PP&b7cPDT!Ay$Cm|BY(CLEB)X;qxqrDabc971nL|TdFZDu
zT(hdqKGjL9#dz9y^5a*vU}*A^)MqXHA3I!IA^sw8@irINoB%>1)#Txs=MG9y2}v)0
z(120rpNSRd4Doi4kBgC5x1sg&gqX{kNCC+tagYs9Pot2?lV=~?vy0eHAmkr{m~Cp|
zUpfWD_cLqDw;0)y_O)>jYlDd*fvf?-kT^u2Tj-~0rg9`%YZOf)P?q+Q1y#3c7piTG
ziR3WC`lJ6nmWeg6fGOM2k;5Vk_}f0kg@v7O&SUky@dQ^PMFCN26c;1KkB6_-tNrH)
zGzj9eP}y*baN!eM7=>$xv?~~hd+Rx^`0=!~Vzi5igQ72KqmWC4|Mhza5c&Yms9xr@
z_X6*5=kOVMDO~!~(Ly9YUUdt}fr;}y^4`_>=+e_XnguP5EK+dkzsCY67ln4?E!={C
zOjr7Fye)Z)i`Ky20gRP6J`6bAExHqz{8$mnu@<ZS&N}{Ys#0!2LJyDdb=y$#yAt)%
zc-#ik&elFs!s;W1*XoY688pX6hliW;?~+}>-{joj1X0qY3PRRi{3Rnps5hqhsw~{}
zHhnvNXA^!IwQ~x&x2!dlbiP@+u523_@BWiQL<HinHmzJDB6z8n`*F@&#BW0_6UH26
zpS(b`U?;xq-2Zbi=n>qcKFSU@8Pb#Dn3(AEeFZ9>1T@7-4#xlOuwwoxFRpuu30KF8
z$!8{fcy|NZNGS2CA0D=aD;k0;%YLi5Ymt;?+R`y;{(zbO-$mi;>G2$v(C$~5vWsZI
zX`}?@Hr%gVRok!lgdIE3jbxi!sgk$2ZRbGytrZj!Pa@g%avay?A@dtMh(e_MsvAg?
zxf3j0|I9*O{3zx3#*_(^)XHBrcVwY^2~w7F2@lca)lbi4Nxz8#6WGZW+Gn{jPyBkV
zZ);8t^{pST4avy#4I74GtWda+p0mQlmyT9b=M<1cc;y{*@W``y%6i0eV1OOyzbhM)
z4!12_zeDn1ge?$S5PvY%0KG&Wm|&kCej`IP(&?Chzg`ej-!ru{Rm_Vd0vHKYl{-&I
zXXjAgoti%!D&z|$^1#GwcN}$5Lo_rqsWJT*U~W4e7y|LZcj4UD5>6Nw)GJjTcb`k&
zI%Zt|zMb$7979eMh5o=ooyMuoukN@v{lv@r9dW71jdBbRL}5aAj{8LYt|hck<|H>?
zbpM)XIz-9?{WB24)qh{f$DhI%G&_GkbVrt(5&vx~(g4tC$xs$v%6x)!w5f0#bLsvj
zp|{nFHZ{4AQJCG)!9nX+(&4M-Bq+@^;9PYHN_M)k7QS9L384wGf5f8~ME6p>i>TNZ
z7r=0WbDY#l_Ep*b=g2E@%EhzLf=*I$d8*M;P2Hrqq#qT=xbKD`B0hM3+E^&Q*=}np
z-}8R`@sdG@9nyq$4~Cj3p8-w9G;FtlnQdTKuUG+X{qr;G;n_m>$(#gP8w2+Gj!Kjn
z)Ec$p!WsfL45WUNGva9cw^m%?YbL+0uV2&ak{1ednu`~7uFXE+H+?ne3mMT7@x=G-
zEjG6IRNBlgqcrCiPwbgFu*_cV9{S$4`c*aO2$aDl!@+2>>n^i~Bc;^IGNqBjdnB3$
zjtR#MWo}^%eMQKI-hRDrf7NvI+^CW4Ka>Go?XILA3rx?TTexuc@_t*4@Otd3%f&2>
zIsuQT3yb>v(xDaDHgI_HP@O#>jmFke(P|p@PbL2E(J}ka`oksgSeX1E`-#sH0T@b`
zOI7zBvyAm;kTBc;hDeLwODnVVZ3osJX`EynSPOlhd+jq^y8L$cbv2v4Jg4ID*~N%R
zE(Bjg^Nw9mu6A`utA-qo2%u3AIYy6m4z|1myd32LqPp_g@G)U0WD;(h^1Rz0bdMlL
zHF`79<<OW@t;>x4IfD5!)1qXZDzrwJ_LY=LzW4J<14lR5d6%gg;@(V_$bTAp!CxCn
z(j;|pke3Q6s6lG_FG2iG)Qhl%vXWn7w_CS=!<H?{aVqYEnvrMGuo<3NXtf!H^n?@)
zZ?^ozpZWTAh{2e1t90EGbj&~Q)Y<t}yycqZEGtSwyomTR<#1sO#(qQl-Yc9U({XRK
z-Nxf;?q7lw2=#HcOP0?MSex$f^e<YL58d2>F)_r?y>2^ZIK)qK%zlSbp=suXit<9z
zvtgfx@pHQ4=4lV5-)Q1#Y?h#_q0KFLKlG6C(s$+{c!<$?qJQRfiQ8nkGWys&fg1gY
z)RxZ(9!X^OmX@-45wzoMPf+C2<w?QrNkQrkyE5soH+NEIf7Uc<FIhc>{`F#qGuh80
z>sdTMnA5H;^zEs1txdE8S`(R|Hy5eyMpZu$qU4GVO-CQ-X}p(d3-MJGxGEmF_f!-Q
zv=ERn>mExA6KcftKtm<fh{?Ya4k3>?4%8zJxq~#%NygcgB#n+kn`LzVRD%lgx{LUH
z-u1=2BnMlw5}^WObT2%`DDZ{n&ZWG0IvS@M$KmM>cbn7@6Ea}mv*H{6!o905x^+tB
z@kwuy0u&J}eppnMBGRdbq6lg(vw=^aPdiSB(WKIi-F_+k^$f$)3;U7InmgMP3zpvt
zFP)n7Gna`O91ZFmgKlG2c5QiOc&p-Y1w5Q5mX8j$C+fKW)OiAo{l1IRwQ?$cbvf(o
z^+sm6v$0;P6AwP3pSv7+<Z`5BeDjB)7p<hUCnoB2Pu)xTGkWZmwam6iUSt`}y{YS?
zzM{K$>BbbICEfBP1<gXU-W*^pqXa(w8Uzi;_f|CP7#pU%>z?!P4pk5##!D1gru~S|
z`qdb@GSv$&9~j%2r>mTooqM<AoY0$|JDcTMZGS+&)v!wSp|WSFcH_U6T`w>;B4q-9
zn@XXXpq=hg>$D;i!hkeUZ1?5f$ti*~5YVN6T{FX{{`xAGbxC8->spPg4r%a4kPOln
zDF~i-$*ZU3>fNQ{{P||p7T4%ij>WdJJyrIfEeoLyVc6Rt!!fD4pZQxN`ow1J7YhUq
zIm1)M1qoSl8)Q!v>7f+8IYl29#gd3|w=6en9;EJY{S2TBjq9M=PwuW1T&{@ziOc#H
zj)N9I(~2(J>0A-%<BGc3jMHJ7lmT&9DTW^1!uik?J@f^<(sgf9F)ldJwfA7f(XxG_
z(%)SP{m^OE<`z75R9pE?{B^$C#h(d(i|3K8;eyNzlYA0cq>{hi)aes=lordtL32h<
z#=<9EvX;&bv^%KRiIpSnqp{DK_zG@1AXl1FAtZZLjwXlqQAxg!rup?7lPc+n9gSS>
zI}(t_b&p8bBV=^kAKv%`jWQ=cW@!%WxfsRr18F2iJrih_3>~c!rt1bT8h%Imu5-<a
z^&4->io8)CP=j=r=JuGo?{OlPqVd^D8k+fSmsrF$*j8SUtK8e0X@LaFxcMG=PzTco
zpUe6ng>+K*eR<UK1<F38_lfoX?h~Ly1F=Q`3LF-5lKn4&mMS8b&fIF!P-dSq5;!?+
z(>-HDT?{=3$({UU(RYvQFWq5&%jxu<G2%wAfJPBd@?5p0{+@B4p2qBm=~PJ#q7^;G
zfylqR!81{P@3*qv`Te!lEk!ZpTC^vj2UPDCi9@VV<c6=H>y6ZaUOWzo_I}8XVeoQp
z-w?Brfo^+7jcjZxxBSib>EupjRSBs;XS_o8wx6U1hy|@hLXz;-ddU8jNB3n8&1cIk
zW^e9Z9n>j#hJfw!E$SbQmI9$^@{=<3%q!)Y0AFPe=&OEYzwAm^df{k6ob69&pL}hy
zlm~rtDZI(61}ct9KUS5`IsLwUKS;|fXk1(M6_E}R1H75aP2Fg#H>UAs2xq2X+@I1>
zM7|1$bpSu}3JAy1z=*sZr#Lx!DufDi`R7C*e6mt7!^0cDvQ9(#-38ZI3%~pns_0(&
z+<gdXoTd{2?TPyxpD4=?YEaM26P4wjefhTCG4YEx-!O}cy6n`;9bK(YGhBZjxz`*R
zU)Sx*k)Gf8toM2Gz;n978N@8{ak_F40=?y=KrbN12+bU)l5{`~sU&n=WC5CAKFT-;
z4L?I3w?aLX_aSgKV(aUI>uGo2bNTKaPozpcoUbJDR+)7P>CKPenO)Roj-?*zAnES;
znmG1{oVCawINF5YpG0(<N+3tac8}Frk4S?X@2D!2(UXrA8?V_u?>vOOig0Unv0E;`
zYvkJNtbC-^>@8@AKib(3?TS@zRq21tdVS3u+M?>;&4o_FI}QO*B?Pf`&wZ*u8hH1*
z6!*J4ap@W|s>nK=(_zuGx?Hj<kq^D!3~whRjU_Amp_RY5(2sqL%;Tb0)^3S&);+{%
z5O{>%!oAt5X+v^WIR~y;H3r9$0^FD|P~KsW#vncs<pM`F>Th^48j3F8TY8o~Y#pGV
zBeyVyp1m{n!~O1~{ZGu%zrvnMvr&Z7{B`*?vgZmPc=&vN-}5g=jqKh|#R;n!jo)ED
z^va+;YXo@$U{X_VT2rNDwbf&}6>068HMWSL-xE+*dA~-Nf5@$vPNtg6TGih-)7LU{
zUTB0Ks##YJt?Rv`i<NLOIz@vxLxToM9{wKl&kEGquMt3w(xJe)I{$Pyis%m_^u|dz
zcp-sQ{^*8MtUZVuKY9kfRtHUXu&aIPCn%sEcK?nCJR)!Mg^))u)B+^?LN~KNJSzYl
z<kR>aPsm9WPq397^)aauyms?pW&deMzs%1=#;Xg5g6({5OC#G>x=I!ul~=w%FL2cw
z+1W1*KR+ZJEU6eToS$qfLyIFls&&m~5j#=;ZY1(d1$)~nJf5tpRZ($mr|Uw%*j&Kb
z{%cv9w~OG>4QvEFZou7Y<YE)2DFc?5O2qhG9<2@u2j5I>sr?#0{m)D0p9hUCe;qRv
z$hXga-ZfnA1n)uYV49i6u1;ZtI2K-VB)iH=@1mhN!10Uh=%xy*YZcYY4^A6~ld9zw
z7|U0#-3IUQ@Pw^Ep4q#|6}>*vHDQs%aLRoC@x>syzh(2f0;dOKkoG7a-Ry0Vw~r58
zosL#iIjeZ@S)!Jv`NO(0j^%oe!$_~*C#$ooE5?(tJhDk}3w|e;&)~_OynLOP_!6<u
zcCrG15tldAFy;a6##P<NsXauTS_{A8hW7oo=|>1h6|2G(6{9ZfPqBE2se#9JU%Xvi
z9>WHp7l^&^N`>RCrHQiEt5t}v!Gm!4tX^3uJPpWJ@_4DbcB#7ZNBY~z%HE4RTrNyP
z3+?P1B`jWvD^p;j-I!Ru@OAhubotI4gEw!36o^$yQR$wKu!ct$NfBmn_1yzS4<hVi
zG7JWdDaT2qD2i}`a%og7;AX>FxR?Q#5=57ewtRqYF2d&<u^)S{TW3WETTV=#%cZ~@
z&p$Oj1JSb?L&@ZR#Kb6qrxNNEPXA<f|HgdPP$GFiW;zLLvwQ#i_<gw-r3S_DGVWDF
zQHOxc$^^OP0cg&jUr}t@JzG4>PV-9v7YUEc4-k%{!2;pl;aumnUBE^=Z`*|#1;rA(
zrU7{8<5yU3@awl`-wtHBPp1T3C{8<zi=q+l`mulZ0%hDN7&@cU-M_oir!@*J?rP(S
za(+QecG6oI%U1>bU9OW0msuWy<#IvktTgy4BG-_1sDk{y<`NvYIA%kyf(rq!HIMJ@
ze`7iAO_}wx^PY>i7>%s4IEtIu==Qhb6!(dT#xe{B16jHn6aC;V7)(l+{F3$;`jytb
zkXC^pR4?+u5sp=)wYsj>vwUcZx;4;e6ZY<l??1~BbN~ILDtH$*GjS!T%QrZ&qpM48
z1v%WsnTKZ;cesyxg?O7M-%Y&~mT87hgc|Mn3hX&jyLpda5*|VDWQH(bs_*Ut5%*<!
z4^VGIG%`=1$I_1@E-U4MOQ$ExJeJDtsqoA89SuC=KJ?l>Wio&Gbku1Ak45!Xl7*QW
zfssiJSZr{8<583?MNW=R9em_=C2R{$u)aC4h}3IZ2)M#_ujMkhg}-wrY<`%PzQa!{
zCN`Z2CZ2t5m!JMlOi#XLPdT;grHVM3lBrz9H~o(TT`y1i>b#8}e?oe9V9xfw^p^wq
z%Olq#6+-(PRD{*siTD4HtFMlWvTNQK1OpUM!2$sTX)p+v4h8A%5|A!skysi<6a*}~
z8<vvpu#oPOR&r@!>H5u$@ALki_xleYS=l?znKN_EHP<=gVLrN_7Hu!R0+~gXazPE8
zW{)(=*!Je~z&ues>;I%l4mzED4>Q`z)AX4|&YiDbm854Lh(&s(m>WCpG(oj&;<(4y
z@gO&xFQ=Q~*{1mHfoqQ_4HGT~C$jsX&S04bpl`tic0!e(eg=8a<zPaA!$%1EVO$EB
ztjiySP6@&{kaN!U8tx1;Zvz0>3~oSv&|Q=vh|~V)sDCv+G8$?V|DR9sc7asA89lUy
zjM%?UZqtMfOva8B>GbqNDg+jZ6#{TxzRdm@`Tp3$rE8k@E!=(h-oeCnDc#*eJc9Hu
zL5`>=FZ_!uz<uCq`$qApybK#vOYSm*`_rEL)5lqsl8>|VODfGUV2AF%y;<S@kEeDw
z2$;+;ioo`Jz#n?#QEG*-htdWxbgvACRpV18O2>m(b+zX?_kVMqH%#7LN6y!3AU{|p
zr%Ug}M^6iw#Z5lz(<{+5nggK3|0HN>J7-B-^>-P7?nvFI->DjqbK1Yh&7f5kFq>cL
zs?#ri9F-Sb>z=T&bJYpq81#datc6N%I%`Xt9{+c~@<t!vI9@#cC<1yaMfS^#x%$uQ
zJ*VRV3C1HIu%&Y}s(O|h^c6wRqEXrXN%jgAt>S_@O0_q!v$~K=E>$+1k$^291!TI6
z19%9eoT%CKB$B~pIDnr1HTK@t!o&v!d5-GM-<^&>?6o}Mc;RP{Y7+ege#Rt{C=hvd
zvKvG&R@459I_y8<w1`ZwtOs7wlgp)1CHDPp$-(-l^kCQ`g<&RgHm)Sg$rOWNFGV8X
z6&+P>%#1i)<o!oW=Sxbzzq9*sRu4MW=^Mt4LV&x0^vLjhJc>l#hB9b<Fg6689+Ez!
z3phY?h({JX0oaXXeo2$0K{$^&WXD#Lu!(3zxS%smCzSfO8RB36F`-*}5NrN9z4HV_
zW||~DgrUA-13uUmS#8u$_p;gFW{+GSQi-y|AD;-Td`(sb2jMFq$Dqsatb?InE$fO|
z{HP1v2zGMLg}DCSZu$b!3;TRVCaWFfU}>ar27h}=`nPjbF6uUPTlJDbY{Jh<Dux#x
zs>ROVHrSihATQi=^4xIx0L?@1yq^5zUyPh;9xz#W?Zt=D6)In$J)8n#N!5al%#SGu
zAP;RJ(s5wtDAmSapS$nH6lz@Y2io9FxaG}jupNX98=>+}=u)}+Tpd1aoYFWeWk^Bg
zVT}mex1E|e2W{&4{wFnh$^6#cBVvC)Y;RccS)XB0lsW+YM{Gz4y2OldP-MtRTr_dF
zW_#-?b-CtL=1^BS`s49xFi>r_hL6M7XwJ&8xOefotC+b%Rv+;GZkqslbgO1fr-u7)
zV;!UuPUoBiadi5N0_y;VL?cwT2|>a}&B2}&8+XX?qVo^_n}-M0CQ%QkhwM*1ztxco
zy%lF`%M?(PA14r35u{q|0$stP5!?!HnW5Y$BgbI}n#tfM5EKWSE3JOy>*3T@-I)gV
z!F3<V+uDDKVyiNJQDpQwClU71#z51D1}pv5NcDk*8$@FC2di_1UGh3^iFN<G6S)fn
zAQ2<sv&a$W{~+%3pe8_eMQ&V*Kae5z51nd!=~~Ga^J4;zl9O2k+Kb@~zQI;$c=7=!
zM(LNmoK5|=vnqMrs^t&rrw%a@^ssn}!wDlC6LL(v4Clf7u*w@Q^Tu>KrK3o%CB`#Z
z4YttS*>MF<b1%kMsyS{Vr<@N#XAHJsXnC%nr^-@}10Vc&^uz|}^DBmtRt!kv47O_g
zB&R0|f!Z<msS@Xl*+dGlXC6=maFk8F|E^nai%dj_00bW`3^rUi7}K5fKCEiqB*lGy
zcUss$Q!oQB!cU=jO_5T$NHq+1n1djR6T+0@K#(nKC%c9y=f%%5VK34ysBpVIOiE@4
z^~MA6FC5pi<BEv$Yc5WcpI^*qLCqdT`qA6~5K%?CL(wE5IDV;3Q~OMs#g8P6g6bcs
zl62Ws1U||rAfbrk^(PsjSE>#oFFpuCF3X5ySLD!Dj%avq(P?yJEq!JcKo@V>=hL?B
zh5V91JJ96rcxkul{=uK?=kISZ`}qG>$P1{7_sQ~MIzR8Pspz;sMe1XuRmQ}^v-wj`
zk2T=5Lw5g~e|$a*lluKnKK{fZMMn~M;}jgx2N8E0Wbb^4Xt=SoDj>wdi$b}?=q}1l
zeOO{Hei6>o{RdXL=j01l{51AOQr@6k!d0~!n2C3iE1|?68i0ed1fVCKG%Mk_`D5vY
zj~G980D2~uMC(y*|1^!VA=CphmImp)GUcwP7C)cz8TyBtknzud$`8aHA;|U8yDgJ&
z_cGxLdaon@{UK6LbYe5vN(e<wL?J?NK`0s*c4k>>K=w8*5k)=l1aWN5ccw3D(Wei^
z^IaP{7jc)0p9!CU<?zThYl}4nMyB<S?9-xQr_o^nFc`Vv44*bXIuG+mZmUqIS9Udf
zy(c$hV+Bz7UpwmybE+YEVY8jL3*Uqd`xKMA*B;nzLIZ~IqF(LHF<Wi1X8oJZ!=INV
zey{Zz@Lu6}<a&OboCk1O3r^=(7Si?7lgDXM7v02Oa?hL2J-9+HAt6Vvf=?!K0X%Xl
z^cAljciqONT`{xU`5nO7jLpoMt#EI*x7gcWyL(;Y%`Ge&9=Y;D=G;TI0P_>ym_?}K
z?s4Nt3E<U_{PW{sD!laLR?;Lf_^9)mNG&7#-3AwWFOkd0KpCD#`SlZ%s4ZQ6N5k(l
z?2yhdfH!neoEs`DD9}~O)vsC!)z*f_ib@ynAM8R`?AE?MP2<xAjVzg)j%LT*3g591
zXIy(T2#u9B(4+1f5u(qJO%gpA%C)k{hIZwb_1Zjs#Z_&L45nMYsr${Nw$cCigIu%g
zQdJml&T}%pdR@=GHB(?^*<4yxi%eHrc*%zh5|XY53*1?r(=L{63Ui6tpNG@l?~q)@
zxmDWi?`~Dg`0POgEz_yyh}J%zsao1^t8Kz|jw3G9(ZaNXm_poeg;Q%Bb79i0@KDNv
zDD0kW_^r>VoT^h%#W6647JWH-4b}6nyyCp}?#g+F<*8oiBl)lKAZGLyj)oD!MaZ-g
z!3aJ~^wMJ(x^lhe%ZGb9Z%)Dha=-vC+-=@^aGpuE3ii)EZdF}~H&Cbgq!gd5sWM~y
z=lcaGAKtE1IrNts9&$3=8H9mVyFC!YtP0t}h8_D2j~74>^r&Fr{r*ZAP;VLejLtWN
zH`%odpNL)8c5KpNuUpXg(Wm@qM6scTchovac;9*S4|X8XhZ*ZofJ=?qR;OfnoMfD#
z9Aq%pl@^S26DR3x4Zr<03s@c0;EXQx4U=MYOQHxWv1rK8E*UBBcktaVH@GK7i|tPO
z^pQ{_@<W$Ae~XiTVfMBrx`jJtylQWKP-1^XV)<IP>L;c91Uu7i?3|_lXaO$wY26p$
zk^HYgA*OYfV=Q}Qc4T#HPwR`q^Iuny1$aXhGI24+mxfopL~atF*nDh1fWnCpiuSl}
zihP`_qf&e|0ThPJBXdS<H+1oCIYw>$wqS5$)YDG8#P&)NtC=hNn!~}~MwCgYR;tAT
zvr6hk-{<>#OP<qfWh;%W{GTIXe(xEm_+C|`xGe19_Icc@rfPklNvl|1&6CezW{k+6
zgxj@@-bsgDBPFG*vgd`*c89(8dOu|jLX@?@nvN+L1RlfOw)EG__cRMJU_-_Og%&Ex
z1|%CZ<#}Hv9GRpBTVe$((v78_KT4A&{iLdqnilur-|z1Ik#y77rIOQi=4YV3Zt~%q
z(<_1mB~H1!|2UXTI@DF;8cRFja3~_RrC&Eo<rxcozN(*?g4M%W`dB0UD{FhgSp8WU
zws$34(2LAudt-?%H7eBx(9o&0FK-Pzu^g+AOEP~FCVozfnYn7V#S{}4I&qob(K>nD
zuCPPGbZ4Q2H-_(-{TH@v&n3-P;R82x$Ab%#Yg*P897|+aSqJF;K7s!;6=fHBoYuMa
z)D=FiTsyTErJ{s;1JHKJnARQN@lY=t25%}OEw(XJAXy$ds+t!M+EzsGQ!4Y}+r_V6
zc^-J?#Pvs`uT;c%-&`L2w~IYO*l>Y<?87d~=mof5kt&jYZ>7DOLFa1C+=9l^Ga~V^
zF#FxB^S9UgiG8pEpSU#UyEAah(7~I#WYnVqXgsTH8u*So`XtHyJ+8XZzN8l-#mP|L
zPUcWrVN=o2k&1utf+aM~)m2HMty!3Ajj^KSQ`P}5g!Jraf%9bou2hfpQN68zCLz(a
zZQx&`F6b+rm<i+^dE)cHFjDvNdWR$(x6OW==YH4LU3LRg-k;`|3ZYmaD7!&iqC`cj
zHhglBGF_S8A|*tfWaxoRo`n=oiJr@3h-s}Cfg$;4#}hBk$NMXov+&$MwEOsYFIw|p
zYTnkF(QUnwgam@ye_gC43(!)jyEHV2uPB0vS5@F-^gBZMj0rv`V+_2iAzy3f!Vx-!
zjdmKq?2kPsZ-zz+=CSVUTC31MIJ`~xK)_5Y3pz*h_h=UnJL(j`a(=1pGY89pmv`9V
zg9`J^Ty9P4bscb&fuRX}&_qa_a}JO1Qiy`q=~HV?&~OR$BYzN&>uPbCjxm!6rqx^d
zXj9aqZrYhF)w=(K2K}aNtyd@JKz(xx27r5Euq3#<<cjJxRF4hGgm<5yp#;EB^=3Ym
zq9gpaakRgf(g38amLsl9g5Zxw@1ZkHS_?mF#tFHTI3{5Q!}Kv3xSq@~i{31)&o<L-
zadzv_)p`|*Q1=F_phA@@6UL$RvY{9T@RPVurO7#HFMj4Q8kwK{v_xHIHu3xLw{&E*
zwA+sSKo(agD|po^bW;Sl4tEl0>(b_8+zm<_o!EesGL|DTc^uO%QPXTrMML&YTO3hT
zq2u5`Z^gN<N4S8)2qe6^Fi?d4!osdod}{DPtK;P^_l@CcqW9W`=7#G-XoZ3ww65Xt
zMGi%);CEt31mMf1E(=9{*RE)Y#{MiY<KIaq3=YlOhF~gJVmnNtUc&6~_Ve@O{8Vqd
zOIYUMR52C`aHN=OL#DM9hUP+9P3R!(R<XZOG8$d`t}d-n3OKbWLZLf5vzfVXLKUL~
zT=FAs8^7#|6O+cCEu0M1iTp~&-^>P3G9i$whFo~}(|FCO$Byqv@aZ$JSRNgBOIN(0
z_IBa#INhqQxx0Edg!L#A!Q(>-;Z*#LwmkShZwJzTVPm|{^Fe2qTf^wKpD<c?V5!|e
z>t9qmO|*;MSZ<O!ebYz0*VoI3>l~xJLuhs69TWdmyQTM9>r&~k*^*Y!NSJrSwVPTN
z+)36?s9BNFT1D9u-v)Pb2kVvW&h1DX4lTIfsyu60yuZVI<i0qp=E_k{Rhsnah{yAX
zP^;+dvd%z6n7Ac3HO&R&Aw}RBOUVu~2MYrjzH&Z300P3toA7Tg-gW5&FTJuC<bo>q
z8lsgS5PLvl_|*2cEUkh!`<}4=?xe+Huv*Yzn-ybSf%bh;I3F^aZ#sGEzPr#${q;$Q
z&($Zob-hf=Erp0r9WF+CVnUA)&N4sQ&)@7x5DyeX(%+Ppo*z9L*Mq~|wnmnw_hk}i
zgP=Y^aw<2=dzWY~!(-m6qgeiRN2E49np*$Di&;V4%O|=^YN45$A!coM@<q^Zb7&sw
zbLqCxi-P1!r@b*3zyVaX(T^W`|MVg+11h?}*E{vvZKn=xHlG=6Hfpt1>pBF#9*X5-
zZj|&r2dgM;?g`W}z!77dBDH=niE&=!cKY^O^l!<$iAm3gLr?K2!>=zRpcg#nid2SC
z8%M>wQTekuQ=UQVf_kXo5K2)kvyR4g=iTA>>to)tJo0nXOl8fu`@Ix-GdwB9veW?$
z`X%z}*#P3YrDMmh!9(mL({KmgQ;KFzqTi<SS%Zl;z+t@L-VnMi0^nGr)2MqNf?k)1
z5MsLE*GuZipey{5F-{{t9cH^5xa>v*^eCcF6CO}~5`Nn~$h)oh{^ft5bq7(Uu*#Op
z<(8oALo@w-j(+T19E0Ft0Gxm|`^z6$*Y9Ee8MRhFn_J#^lzVlcy<IWs-4jiSiZ8tg
zg$>--LL&H`a<nWnQkpF@v!`~WUW$Yg!Mn*6w3H#QaA;r1dqsj>C%ZVJc#$<wOgLip
z3X@}d;AtcIqXSt9^Cu~m8%Cnhb)jAx$21KRG2~$<kS~jzN3KR%(2V+@tEm&slk4t@
z32Wtz3H;~jm-pS2dX%ef<<_13{^wHHIruik0{}3T?8R~^UK>Z{J+4YkQlga)pGb&s
z{f-`yZkg=Aa5X%|HWi3MYtTRHkw?0LOv}prEgr}-62Z5zwen3zUmw$p4e%nraQyJ$
z)M2t7o1~%?{!Z-Qi~Tn#c?GYps!^MBtgpRYgg9<io<LF@Msx+f^hD{gey!^&|Koa!
zBm+hBzoV3_SF4}+KF-@B?rAGIvJ4Lw2mI({gE?p^XevYI1~g-1D%4Iv{&025v-D$M
z*WOtW2n{vk@#qy~z@~^#l$q|IJ@U_&4=<yPqzMe?8Z3~oe-gg+A->@e6a0CtC(O$e
z=EvH#6#`-Om#L`(Lve|JG0<CLf~BL6LNl7byg>*3eZs$>;jKCVYKs`HRwOS;hf~1g
z27aO+!}HQV^ztI_lSqY2KE`aLT#%1e2}k0f$?6?X)RBLg0|Zofr?eo(-~?o~ouq<d
zK56sYtK!iexSxjN;?Fx)?{6HebqbtD#v#5O76NvzrQ__)v8q2k?dfA5KAkkWclb<H
z$doDR^Zgsr5fEV!Pi5Ina{|6z|Hg}VEGb?A0s%ai={iaYLOT|oI=S`KIJVTQq%hv~
z7NHQ8+4PaaEB<?U{$;>$O0-*Jt+B@8yciJhk0k{T`3r5Auxz&<4lx!&URwDG4ZA4(
z;aRRrr-%<99S%=)v;5L3dT6EpOsRK24AD1vX+O9jGGUvK|4!IhKpBkQAHKZBUWbeE
zwbC43gT4%|@t&Xe%_-!H|DD%Y*C5W!lbdAZQ#|-!b9nt5WA2XWRJZB^sU?Z&I8Gx&
zSBKnRfPkjj5&4H%+dF`B#f0do=G)+~axH%5r=T{+p3ALHwOH}f2kFB$G9m7y>DqtK
zx0gs3eAj3)`R?KE359fzBF}vdp8JNq_C;0Dfp4mcM;OCrXNkAh7px$wpFEX~o^F+}
zGhhP@Aw|PtjQs>7PI9zM@UKe<^;#5P>&@KHmL}m}EWIxA_kQ0HGd(ZCCyKn}@(IPy
zeoX}MuZOTzBW|MQ<aO+OFN0dM`nOkG#nlm-;H{~*P04U?C3N$YOWOp_J_3^P<gJgW
z5@2S8ARZhVEUdwu0u9M2+QRaE8pNO;7Qpqm+Bl{WU)U`HPQqpg#RAt;JTzcQQ)EUQ
zXHidy?r?q@b{vd&eU6a_B><QxRWg`1!l~h!*T&5l@{0w|#7|le@6mI9*<*8-*0m$?
zK@aV*AILJNc$ffpp_FGydx6|V6Ydho==BvbchG&im9pJIFj9d^+w1FZUAzH#jHFuz
z57Ii15ugF>KOSKSI5Kj)RvQ~=B9`w=_0N4Mk%bbC6&#VMFOIxhn+NvbjVD>YYK@r=
zy^Gw|lHrk`<U$;`fJ|0T^AlY#|9p51wl<+@+xZekmrsokypIrSQOpb!WnO}ut1Dx~
zV`pLN)%BqIFB#|{S_1wV=~}<0L#T4ZyODIW$~-a2o>SY!H3(V0$$Ib_17_i?$gQvt
z5Yx7&XwJaPG9e2toWek%$m`0o(tsx2lTPH(v`N@>OWS2*_<A7c=SzJMa6XUxyME%_
zfK@IxkRAbh_Z-5l33iT3gdUe#ce~Ap7??Ny1a!I=L^nhrvnPc(tK^@L*kGJR_`uSV
zjcHM&FuJe<FwUWc(d?X3t?T&QV&n1XGiQDRY&W6`7jiG}7?rzg*2)FkalXs`)^tl9
zwH?jeZ9d0&^P38>L*3IUVfvQGPG6>+yFJ6U1un{)NcV%V!5a#A6%l$pLdSZ)xbL_~
zgVtm4LAo#8N;@4bi~Qd(%w4BoP4pF*(=_;eJDOMG$_KB}s1IyPXU}mA!&6ETuCHkA
zELP;Y%p~|p&$61Qo#yGUm5t)b#x+5caNkS!o;=F>5nnKY1ekrA+hT`r4Oo!w8_R_F
zq3uAYMlj%%#yq;iI0HSM=yH(dOz#4>f6vSKCf%R#qeSjC9zU@o-im}kI7M#(aXegT
z0}M~|-o_}M<>)7B3tVon(001TDz#l@2lP=tY3O8D@$rPSUHi$B)W^yw<cdeU%g?pH
zkAUsCwgVS$A#3U(-1&9%`D4C_*&?96JXYvf@DW00@U$N~Ur!c(IXPp9we_g7frXfw
zkZ@J^xHbGOH<cpA)O<l3Pn*zA7O*lg2$2!(*=MCw`XoUMbY<w5p1)r+MpE0BUhP$}
z94yY>+Mdge+NMF-C*uZ7?2r+8OzP~ppLgS7{<MF`-ynn-9byKCF!L{zj1Ug+i1#N#
zk2dlQvCHbg3&^jHfE_ZT*#-Mvd+~Z_U0oG5i;<=IdcXD}dT*^S3Oa>yYvC7%2j9>N
zmfPA?AMCq!0X&QT!}=tB%rLqo=PbVVD0%o%vU~Kc;_Vt=h&5wbf1Ufb3|B^s;qMRc
zS1|6^eDY9-(;<pD7kb1X|BKeCh<8JAke(<}N!BL(rVIyA<N|qbj3+EFn^nluB2eh5
za3DhJvLNp;{UFD4TgRp-<R_$ffmas&(KL9qE8alvH{zE@g{L2pG!$C)VEK-|+G2ug
zV}=&6I%rJQV)<M%(2+EA@8+!zfcLmnu~hB)cicV+Xyi}({hvWo0zY=2%GmXA(8LHm
z2<TiJIEW4-#XoEJwjq2RS63&I_MSXjh$vMd&UUe4NoaNXD|r+^O7`yf9X!t17F4-i
zl`gA_g9S==h+mS^*&ZXK3BOy~XW%*PFyDVW;wZ-*EWB5fb@kR1N@p1Oz$r+LgBIL&
zLkXybJt}{KlQHH$c7;KwRzLQ=qE-HM!2fyH|1-fvZeOJHV2#l$>cL?5tWBApr23Sp
zWsag@4-l70t>vuQH4@q!JGu%aOcei8<z0vLA3{+CM~{Ug<)R42hIFx$x*l7TrtR^f
zsKFpe^c@xkxV!hI2F<^`IBDv4#W8(mYi7snr%6}~sxDdTLv*8hq4fzEMxLpNg1;QW
zf2Q6Vfu_A~=wf8TQ(#C{wZr#e5WPFWCO=Nq=`)}El(`8xC3dsjlJ%Bp^F=hIqI`z2
z>?K2;Y&E+pQ;>CY0f87`@`!(-@PiAq7&gKM$Xi~&W!--Z4Zls%fPR9GSI~7mtHrBP
zV75QcxJBP}A*3rH$BCWuf=1UJ$8G(U;tZ9vNbn@|IpXaKKE(60f~r|E%a3n!{nttl
zofkrhpU|{M;32&t!r&=g<3X(05@KuG&3PKG*MyE#bfIh+3v7aazMV4-7F<uCp@+Bn
zSi%xReuc056s^F$X-$~TOr)}Dzc1UQ+!As32a@bU7xrn-W?;IMLjF7utWQ@;k{zvf
zM?0#}OBx4dHYCVJ@t}s?fqyUs+Yg?jz<+o|fZe#^Y=7~8Nqt4=6Bc)8xu5Fri4_1f
zULc0YeJcxd-4K02LhUHQlH4b<wmW*<;Y4j$_k75{OB6x{;gYXoxdYCq*TB=g0GI<*
zh&1J_CeTO`_ns!AfaGd{+L>J%E1}T$`yURTRcwb!AkcvSYDC0cv&(9!TVT~KV6!n!
zcaXqP2J1<7X{72LIuUk?@g6-+)_h4}$WUCX&}NWl`XaF3F&4w+&x~j;J~SapA5poa
zfqC#Rp8&ZLJw))61+^C`nB>NwsPIHza~Fu5nAs5^T*|c!eW|49-c(V%lB(UAAKt5M
z-br*DYRfjh%mY#42w?`oxotB~5q%};M?8w*Bul}t9PjXB`Fp=!B*q}b;M7b~Fwb#1
zw@+F?0QuVrTXfz)w~HQ+fao<4?J^aC#BHW>a_Sojk47q8T9s|gVxEP%%x0uX)5f!m
zr;Eue55$W;Z-zFfBV07ZJ%#u6q8C8l$_4C76K(}+i?@NMezkfFA0k4)U*9O_Ovw29
z>i;@k2FL`N&jbxqJoKu4a6#`ekdwQq?{=YM_nmV$WgphoUla>P;F8Na$_b(}0yb{6
zkr`)}A8B$>ixF7%0){Ub*OjI)*5~xo<N_Kx%7+8>!iRcpdLxgtIaY8`07wB~HmjfR
zZy`kySj@)2RaNx|g(SayPG|;NSF^6N1>iEL!1ujb(Cz5}rlA>{IUh{nBW^s`R0wU2
zB!bldI5Nq?+WJ0%`Tym+>ES3IH|0C;q9tw;|0n)OqZhGaqtIhS7W1UDf#TCqA(-a7
z+geF=1>YYxpGktXZ4Iy8Fja}}B_M?QTx$ySL$!;5bu3^ff;G1XQEo)6pzBSuPj8a1
z5n^?_k|fE^GP0$)*l|1#Gl~Te0#y@t-fs^9ZG~nzo58gNudJ<XAj;oR1mz+*19@$#
zq`wcIKJxYZKSAy=(w#`wfkj(B9abQdTl>YAv;`Dt5l%jK)my1ctAFs)PfVpCQ~ywS
z$xs0PzSa|*&=UswM<)R@p~7D5c^68oBPFZ#g+WHMw{Mb-2%(CQ3ASR^xM)CdzDlV*
zMdm_5kGf6-$SeA?^6D;|zGetzs#q%J?{^*#L}SlcoCL*11zYb|^9Vk>NmjMhEf8KC
z3m%JMkg3fX`Fi$aLnNKV^TW6-LP@Zv+qYEw9~Wee#Km)fi)s<}5;K<_#=8kYBye_Z
z4V98{iJKs<Tb6;*2cQJ<2Kgr^9U2i%X6xnCwRma4y`7CV97jNkl;_R@uNiX;!fj-j
zCQW|q)wW`vu2Ueli{!P_zq17tt2M3b{O!B8L-(!^s%L9QI4%rC*cAV)n+GLd9a&|F
zTl`r^U04dul63BrE(*jLGC{mzR~g!@D?^1bO<S(OwB_1{O2Cy+9b1$c?NF-84Ado~
zHT@FH);*Y#r6p8SWtq1wMkWC@+RzMz7`KgKr&RE5@~@=VJ;)iQg(&z#*~Z-KWwwUi
zU;2r_U$B@+z%IOupLAAnkB~f4<Tp&r4O{7p6od&xbwO9Gz!BwE+7wVOA9^9!HDfX2
zdEh?HfcUJX*KVl0-7iwjuM-L}N6N}W+NX^0E^1v34c{1QpLdnsXquY_E|Lcv3r3!i
zq~wLo>U5hE;We!M3V@4Cdj~$mq;DEyIf}ZKB@F#f^A5FJmraHeKW7{8SRVT<(e(0)
z^O!f{E>7?hjV`GA&H#LP!ct^jmRxKz7zs?i;8H5M36I+_U)cy{hJZ8o9p&%`YWl5E
zVtB|aXkWJcg|r2zrAy8nB=tiuJ@Q3S+J*nYw(Khj+#2Ou=_RT3VL38(75m$BEg-z5
zoXcQpFM?E)cdZO4({NztZm--%iW3}Fcx41;9AB&ea|>t9DfO}c^6uIA;AgUI7Pfkw
zgFjoH2#EuWqvz}=))ic-(qbT%TR!N}^c3MtS)rnW>H3~?n*euPhRV<}VU$)|Mn=(Q
z^BeCox7#}JMgyj+6+p7vtPtm$m#3pmQaA~Pv?458&k+#K%E2nDYPmR`?eDec%|tGr
z`6bqgf`6<4DpOHFAs;83LMUmw%0Dk%pky0E9sj9`pLsr>UMAo?yW5U>?QcjathXaW
zWtG!!N_PdOi3tGX8P+j)!4YAL&vDrgf~U2HbeOu_r{$_2&F>e_=Dl8xUNk9Ks@|DW
z^*_mXA;M}WkY|8t*?~$H1^d66!Eem5#T9Q&MF9=T-<Cn7Do=`WePBEa3W#Xq1}Se^
zJ8B;5@N>@o6}TnuA9S8Ws5S-Rz3weZ8=ze+x6J@|@IV|YU447ivMY)O3Jo5uwiuns
zaO0rY18ScamZ~=N)4seLbl4LHr<)L*Fc#4a%{yr=>3rKM&39mz1*N4ckH13gKK<KS
z!9PF$I|1iEEh#;7^C;GEJ%5iDiPXqaZ-}LnxFA2avsB}x|Ee3UTwz%j`yCe%um7d{
z{i<2?)0gL*0rr)|n7t6*U4FhQYYT;c9;{>O!TydPzj<H@-{eP?X7EB`y#5G@I#T7@
z=j_mfs+?MbVvh}9IJT(<B#Aw!dy*$9f#>=TiR+0QDllz#rhS!QdF)T7K=wh9)YwZg
zUvhzxvOO@;M?kyop`CU5wP2RlOwI{Wv`Z@zN`**4EiEz7VAYlSzPB%w4Xb9ZPPIgt
z*_KV7NATBqnkej6dgR+kTT=&>b4ARXEA8UsR6`4}WhV*+80q>m_6-1E$BsF|{+w&q
zh3!=8C+QA0C6mWzT284wSGv>-RdSR!Ys&&GA=_SEv|LxV4(~Lbo%0k-DcVP9B-Q^m
zz>7By^G4LH-AT8ZU#Vp=Q*vANW<8_|=}wz!^;nK??~Oayoi_4&gI|MS_!DVMF1K?(
zg4Nd*sdyxfJ?u_3spKjtoIUAGw>X=wg;~wSG}1r?8*x6fCpU+~pt!lVi#m%;51a-o
zbP#b>kW=;H{hm)Xd+R!o4bx1!{04J*T&P+V+rthDHThc;fQFv2xaXG7Vq4EHVS2+t
zvA3QY<l67`)$C6S<gQjHUz~nQIn;qpLlBf;a5rh-c1h%vWp}#LyVyM;5LB4<={W_W
zUs2&Qjei7Hx6$O5O<hh}RyZFS%GA{qJ5ndpR}|+0OEvB&=!pnVYyz16RW1U+$wU18
z@wA>A)%ix$WpCSA>C=BG`JRiZ$?n?Da|c1tXMD27B1lsTO19`n!M4(*MIbL3I7yHp
z2zzr<LE6D!oN1{9^$)iV4X=#uc<lu218jd1^%J{*ZJxY!4Po1^JLXtt)hyg+*``o-
zG>-~}q>SYE4u|U*XyNH-^tESc+(}dzdpfZfKef=te$BS!xSME@vhc|?<9>bIpcB-5
znt9yf5_eS@WopJs5bu$mTmxY<=29a$^4j%5JK-e1>nU7n%MrH>#h5Gd6;2Yaw|z<<
z{lGx;UhhBfBmN7ahnkziL!sUDTLQxf@$>B3+Yqm|-L2IYSiJM9HXTjM;6_XCiZ9*`
zw~wWsokR%+YXfYYecYSLE2~!jH1lp|=hjKaXo|Vd+;LFW(aLhi)<7Zs+kn|n40CB=
z^PQ$mN1fMU7sqKmb@!Nc&4kgaJ(~=nn%9sd0eL2>X9>b8<66ci<L4ppp7?e{q;}*=
z(9wUi0E{;umP@4_k$KkRsK4iPeKP2w3=^(y3|v?9NfPGTrvTSB9Y>38bf&ZEDl$~l
zSt9y*4Oy;ddii79C@o3WXH1>vvP&F!RGtOGnPd%47u0-}>a&3QYwc8oQl_8ELY=Ls
z;b~%h5?|HsTx+PlW%tX;I8*^W;^LXp?l;<LiW!`O{i&padLF;dS7{Z^=Qo|77pF7j
zvg&0sqh`0^v7e}K|3NmOc$K}S?JtZX88R=sS8_)sPyX1ci;pDUTvW0ysLi#58fj~k
z-0>?8iu!8Tm*f#$j#e+5EWLC{KAFIBZM1UPh!AA6<3#U3BQS2AWGU-0de^}1kMF8F
zO?E!vdZ+H<C};Vf)c*ir5qA&<xH(@+sh<AWfz001lgaF3N#q-AV&Sp5%*nTn>rUOZ
z87NHT><pQ60htIt=X{4faa}N-T{$2UydR?5_&lwkw1<zz1AGlA*ZI6awH?XtWSiw)
zH~bSw8AH5mTdvx*hg2Jz<Fg7Oq4?j}lD*CGU{i?Q0gyha_=pU6<a`vV<v<X{e+gG*
zqV(S?gAq(=H*##pKt6SAy5&7cl$y$ZseTe+a2{UY-&x{Mh-3)U?L0X#GwLDN^mGA+
zxY|HvG7ixX{M3-~!_~=LjS|cDJlK~w$g19+m5Txmfuw27v)(_$3+YKg_AFWbqKSaR
z`)DvT38<Y-2G|a;?@|DPX=go~x*kC-DBkxZDf94Zs6p08T(0Z$l|rg9^{%vw$FzC3
zK?Y0aPO#-$R=68#FzT3A-mzL8laiNtRv9Jti?-}I``uLf(EW~QDYsdgll<hWFl&t{
z>yKD&ZGYC;d{fcsbbm$6t>1l=_!aegoug<_T5;4<6`n7xZYvZRfdv>>P!qZ`0L;V+
z3x6OrFUPc@A`1Pj`ZF?*ZZki65Nn4)D<_)Yxe@wHwr|&-2l+jNaBW{SizT8G0Lm2X
zWZ=ShJo4-}z{+&*VKx?Rpxl1tSxL8@-ElWnRD%bH7TFgqvltWAyqh`208{SCbPu-8
z{+<=i@gPw{_dQFWyG`2S5t@RI<$H$x`RMs+dwB!SD3;Bn=?Tp3ZjV2AN!5D<eqavD
zl;v215JF8AiC^4BD0Mv&ij|C(!VHYe$H_phMyOPeX52|^55&$ne-)UF^{vW!S0e2D
zT$Kt5-pr+L@5BVlwe(Cv)K*@imp>oh5fW%WzDi=S=^KN@B(qCQb_HRVReYn^i~Fx}
znkTpyW{Zh=m^@neLsa!A(T@WXoF;xcHM@=#CBsM`B@Z+e&zP}>rvLRZG(yVNfb8An
zYu{G_U$ep+p~E0DWnnBeDG_T*btzG!Z((P>QkUJ&Lqd85YP~K|$RVr}R?Za*jjXv`
zSX#<%AL$#JzduM%Bm2-?Dl8W%ZS%v@V83}^B#Sx~wXgxFoYxZt3)^{Q1YiyXPo`_f
zEvgL9x}|V@)lryk);#2-{7IeTuHanWZbX<W8|*4RqpG@>X0m8SHIrwec4@16XOWdG
zHjjPYD>&9WMCJ9sluF9YDz@_}7CEYy^f*UxTd7}YF;Y*fpsWyO(u?1f=iJ-kt}4WF
z_)&gOp)KiYdhV#^BIwt{ecOH8fpLEImml3!<peG^jioL4(%rh5(L!fPWWUWx*&)xU
z3t8ju_sMapr2W7AlSe4iEY5<uXq;_BB`d?;--jDRZ44wHbFqt-0q2_B*<ovz`)5=G
z>ndIrIeC%v3_baJT2AfNP_D%#cC|-;NaJ;#fBB*k;^IweC|@e~UrTS!5WGp8st@{T
z0bxO}juqLF)F@6h0+PYP;xg={a}*1yTqGB>>V>Bo%N?CYB*H)C66lDHeR@6Cby&6C
z&9@3P^j%VB%7}a`cO6xMp;aZ*TD{l}W5j<*ZgS4;e6N_-%(Jf%olbcYA$@o0(^s-o
zm({7(+p1d+WgILl?$$7rPl4DYRNFEm@w~NSq}P>)ChYT_G(Z1WZI~|UjUm~bIl2QO
zVJL&;ijzvMxSBA2=rMpfM?yLb${N&u4lGYtiHwBj)aa!-<8fs#W6_&c{Zt$G=CW%%
z_vQ>*?{btp$k4vBu1jU4zIL-E80&5|qFrolfzOhs%Badd`-7an^<GH@puHAcTHNB`
zBoLcX+wF}YtayazRtf+z?g+`8vXZ;SnWb6?8w2mJ8d(AIiPwf{!ZNfxoz<@0Z&zhZ
zq88SkBtnS6E;z@y_9#<)Vtfe8IuUy|)y53hG$KssSWz*IB;*5xk%oSLwi~}J!u!Kl
z?DBv{k8<I+@W?%$=OuPk3!Q-XQiYY*=+rLcM;W|Uaiy*jO)P48MhkoP45FhfD*|ay
zom-J5%tHz755rTB_=hU4a^$7#O`ONC`w!>tDN4wg!HzSuAz55WAd93$1{Z&HV*C5h
zL=ut>O44MDQJ@db(|l;WoSFFEpo``d&As-F`s<uas5w(lkip#6o@%ir`Nf2vv~f(K
zZ!rXx%{^Gh+=Xw6jO1~VWlv+7{ybK2(b09WENxZfrTQfTL85}u+27NuCRw;cR_(t;
z)x$&)L1H(@;b(pCS=MK^>VYs_s%*wX);K9x(?M!UlDMh%MgDhmPgnSe^Z4ncY2lG|
zfb<$BuW$Hz3^n>L{%I3fm1Vr4z9ct&%e<X_w5xr!j!IoKVML+h(t(a$l?lj89s%!5
zJ-xFI(oNp-RG--E5`q?0yRM++ksJ<O2si<t?<`W(dyi-=B38eopJl4eXY7gj2$@YU
zlj<*<B#A)#FQ?1T%ycCA6|~wGMG>Mf*F`3ulX4TkrM#o$n;<5)$COMd2_(;WGmvB-
zyFgQ-;7fQ#>BrMSo013shkRxwfVKxuQS-2=YQUDy_jVWS$ylgnn?20WO`VR_DlomI
zCN^N-#4ZOJTx9tMX`wmUv#%MvRX&leFjH-(cgu5*6F>DK6dSm0{+fgMl$xNlT?{d|
zyF&WMY(k!qVjv(R0DDdBB5bx9Ex5u|VN&fznO_uY-#c}<o^pkBtm(9}22~e~_!*Q+
z_EgOK8E85E-aT|~|4>riT%`R3bUV*~-Wb0A^7L5xc2KPGj4tI!2kDc=Y7ZH{({w^Z
z$Tl^L*r~{vs918F%0G3e9~$?J0o`P7hX2DPM2~;lax3`Fs5o{6bDPt=tf#QDG{tfh
zgUUNAmmpjv;0hE~O7(ye!><1t^ihP$98w^+M9RuMi!?=K#JfTwCK)-6a>>$omQTNJ
zkovDSNxHox#|S!snsXsLie%V%K<4_^S|w<$F8WGW%6Y#}x8t0-`U5tVhis_(jZ!OV
zZ#R03r*3NP*!=;vKN2aSOb7f^`0{tmV5q6=d0EL10+pX9M_*X1pVjk7IBzFB(pkb%
zu?t)ukGs~dWQeL+xP!H^hjyNnmz&;Px+_MkQJ6dcggBkEG*&bDQ(y^Bw0Hhz!Pi^&
z=DyO|*I#uRWcfJd{>1e7Ylow5uvv0)!7NPQk+(g3H*}PR>!I(egRb^2;jXp+=ttWB
z9eWIAdYV`89@~$(Y?H1c-BK1VtBmaUTW?2K!wUsG+XHl=tdPGd`|u!pvUROe(eQGW
zKo1DGu?C5VG|d85Ve8W$!E{TNPt<X#{(V@t1sf%OvMl^&BhEl3SPz3sanXC>;++>g
zu5LAI;IYL}9Okel7$5)JanQyLzpJ6vuc2iU@=a~Nv!EZTow%G)X;bvv-!e1H5D7FG
zt-vJYYqOsH$lnH11FFZKh=YT^=58$G(z(o3%R1PAw+bdVb$S%%S#C)=TZx7!;HMN5
zdeao>%wA`R^=KE?*tV}VRh8hfj}G6;+zvi9@6YD(qi01LJ2aksf85qg!>S_=Ql<eY
zhw+EL?&>rGfvAnXfi{P9PpD5Mp)5$+Po45lx1LLlvb}8iM*D-#U`4hflY~_UtMjDZ
zSy?-J8&KM|E;^1$%j7dtP<A8Lv;w#Ym7bB@P6awMC2My8b_@Tm*d2r{C+0Znd!)}p
zk5z0@mRj|I4d!9$t?)sI7(KT?-(7~uFW#zsbMf}I={714KA<CFp0(LQ@HP^+x7A&v
zkRWF;4>|KGvk52ahu!!qvG`{V5fJ4wC@K)p)tF^r8l>C0z8sVI$K^*$c|Z>LhEQy6
z0DnrdsbKE=XEC{reXReLwXG3E8ivo0Wl`hQ{8XEv&c1<&^Sxh-EkHKr5NftjZ0TYE
zvdmP28V}TuUUZLGPx0L3WyS9q2Rk(fP1K7a8u}J^YYq#!Hg&4`PHyb7KT8mjI?J1z
zfVh>biLF4k^=%Pl-({;TrA~|VYvuYkdJ4>ZF)T{#(s8^mvYF0L-N4hRaw?z4pp@HQ
z1*4`e-hDDw8~I}GOSE4_;FaZ$(P7L_#lYELN;iI<hs^$!2B5{tSbQh}!OmCJm^)YF
zHW^29OcM><rdwm8(D5KSY?)y{LQG${xDJ$SO#7Q&#@^yZVd;js)FN<r^7UeNCz+k&
zqlosB=wDZ>XAu)K?XGvVY=YcC|JKBK$*Bl2IqbEhRO*|<4w={hY=*PlDOGRDrjHE`
z9Kmx}f(_IzUYhCm_#AC?Y`qVm5ehn`Stl2I3(ORfqTFiUewdQtb@Tlqqa^l_<quXl
zcf?d9`)SS8#_}{$(7K!(dBvkcoQ$1)(#d^hh-wi8%b(;7+1Rb7rtM*C{k>n8D)fLg
zn}S?{!})|eC)53;+K^#9_tKwviVvZh6(F=+S?>xg7<swE<6cuHdWZd~BT`%AMe83W
z&YDOZukWGeAesB-_>1Vo#tXC&D?*0+m;R?GC0wIi?r>u&Ot|n0&n6)n0!(4eOq2bH
zi>mJ_wZipcY0t5&>1&v+rC8U++nBlup~21q3bhiY2k-P8e}3OGL#kK~KQrvqetQ7i
z5nD=|=D^@``|~Ce!-^|8-F_bLSiLIvoJ9MYe%ru#$RwkO7XgA*Hu5DLr!y26bk+D&
zYvTa|4+jkg6<;pAY_GehV%sF1t>Em!w{r?y*ln8<&;<)~)~dfA_gheD>s#}Q*WV;1
z$(`8|p{lEDBEXV_VCO$}#=1;hn%;7XUI6y>X>8G)t5_S)FuKExXq$EfX!e-tNDRm2
zDPNhTe$@_r&tPAh@}M;%+!;xlGgkFX^MAftOV%>JIeV!2?t!Y4iaR<eoRFYKmG^3#
zGh;8AcDhrQ)C!#JfVxU+M;vM&oW<*kCv{GE8ytFjcC^MbS)6$yDav}Mg}CJ0Fa>k=
zk_~Y1l?6Ga23@E9LYcb;N!Cw4Yp>mhEoYJXRhUV4EnbT9lO=t*K@L6#!~L9(k~nd@
zYs<Cl)Vn~C@rk|%DeeWqH|s@VnIr>Jygzw|o`|=G2?McHng4m9mh=92_W22?VdOBH
z)}xFidq73Ew#-5$%gxSw&GdCIL7%|3uct#*#^XXC(~)&BHGVs}EpFm)pDAhbk9gBp
zqvDqZ6)`~DqRYGATVfvm;t`B@p3nQSLS_}#R2+79bus=(%t(DllnN2XUw<Uedh7yE
zHvW0D<}rCj0RM+s%71=Pok=h8QYbs&!P&};Px9}0%3G!D40~n60bjoKxS8?R3wtw@
zPd+bopX!t|q*DGiksHqh$Ge&yL#7?SRIL<i!4jTemMq%lIB)iwFK4vL@|{9rJsHMh
zHv5L&?1~zOS+z}k?M=Q)n!H3C*Fx%Keq|fMaBkCsw%spWYW$rA#YiCpr4A%k%;Xf;
zB2<;{q>=jatClqmQu1`H3e}{t$O7r{wrdGY!3^r7zXg)4p&S^V5K<WEb1Q#EU6Hf<
zo&NhZp<tn<%GEcqxtOS&K}5goFxT7Saz=wHfHHQK$9LGl<7pw_n{Y6Ms*G*fJRxqO
zsu!6@dP!Kb6s^(ICfb62>?t_c11Q$XKYqXh`jlp-9b~jZ5Swx_4tgs_5ONhyYDls2
z+Ad!$L_&s9Et6R{54HzPU6JoGEtFMhKV5A5g|hJkNp>kOtMncc%~1`W!~HYIehebA
z0@X<FoYYzNaw!r_ffzY8vR7uh#s3yBtH*ff>CW>y)@SZLuBu9))?y!|N~r>(T%yU2
z%E}|igUSnE_sZ6->esD68xXR~*}clR7C1F%OHwiu+5Ie$yZt$#>58hegqm%4E(`nC
zPu8qdt4961dw^RU0lHh2Cv;y#SRq}>@^^hlkJJFT-p{Sxwj3F>UXK#>kcr#_PQ}Hp
z1*aXFFsl6I`_!x4AFLv!jQR?gzk;2A+)gL`+aj-EgA^OJX(m-%`9~Lzl?KL`#zv($
ztFj-;O!qnSaAnepUG_S>BhJRmU#$it-I<yTCf>{Nd8UTt;hPmWD;`+H@H_G3R<8Ij
zaX8jWat`589}Kz*7}k=eE3op(W_}}$97=V`jS|?Y-^KBk-QTwb5L7%0TOU(f=Kolc
zO<e4DIGqKIkhejfy#k33^TV^<s>3=V?}siNQ1)KBYbx71g;l4-XZ>7?c&nfOQK~=H
zO1#N4!_oyd1w@nl5tVBbT@!oU9cHsOq_R^c*O!2;n&VSU|B>@H39ETIpd=%|)JJo&
zaH-~Cgo}&oo?5+(4T%*Mb}YG#9N%0PojP5>7o-=G6)iO0ALgY%<BvzE`)?o@WAsqL
zG{5p1)$ZVX#=`UsO#r3#;xX-_KQxoVRWrwQdL%3D(q`mHhi9Be$^Ou))Hux5ebXCY
zmOUw-oi<<fxvQlQIm-^sf)O{K?as&xom3sWucqVAUG*T*Bt+F;m0UbA%PI@a${n7}
z%cY8wuxoc7gaa<9W|oHK?MQ#}=CLef?H6)|*ya>{izv<$H-^I!5>N-bt5M}$at3p~
z*#Syh9(sK~v>Yz1Jj}bvPu`BoeacAyUV}BQJ1a-LD&h`kC*RkN4NJ*<WA-xT6{K&c
zHXBJDXE`xt<48>BF2~?cC6?GDA@WOkNxN^6X|qfvYHKY0f4qvwd!qdF%N>5du<z$>
zU<+dLPuN6f6mh<4BH2uDeHE+B*B77<bb5>HdSW$_luZ-o#iydX&q`vavTwU&-Wl9U
z_jwhXsNp64`OKAqkExG>e1see;;4wL($hgal;2OKMLE{Ks>=I<($QMmr+Ft!R?Shh
z=%!_X0aM63tV1S_xEzF&#gD=wNqtp!2VfVE6`oB+tu%mJu%!Fxnqa2?@VWpn5DgoV
z?YGsnEmA5MIbjoXW!OW~z|UlNhy}$}A4<0Q%UEDeD*r0|Bu7<W6kI|@+Dkd7(`G@{
zN{abY)W1KH%Szu3r1_}cyd`b~DqZH+%{bOpGWEY*L6nD`9v(3nVikloQWjb}W$!7S
z>JTfTgwn8z?5o`CiXw0f$Rr2(PF?oUb!ORwBhSTH%g=NQlh~U}T@7hOa6=T?A2><_
z#lSuMtZ>0NGZ*k(2XpqUP5ITnsFR|+r!mHoqVy%iA&aNVGgy9z@T%HgWVRM25wZa!
z(@IT+aTQ?*+Oxejyzg*yPGf*)#(CDju7&B=Z?iLwf;PiMJM0E2jnbf3OSr4yK|sBh
zTV~sXRcob4`pDtv)P0nWT{2bV-7r2i;ERf=+-l<77rj2MrT}J2Rrz|!ob?sE#Xm*X
zLqOiYy-VYpMQOgu(^4_=@ej}N$JwjFRh63kR{jgp7fxVlBumpARM!8kIReS|jkiVQ
zMr+Ekd-Go_PPwAcnPE$m>nBu-9z*YsI~e`M<-ph);)SH?*W2Y)_ditae&n}Ri|QbK
zHgcf>&#m?q5j_U#Yp-U`?n1pQ;w!mu<(RzzRwD7jm7?T`(bcy7y`88t*Ci&-3~)Z4
zg{{i4u@^O%v0W6~MXYApmoNGC<!Mqu!J7STt61+UBGvxZd1;YFd;F-|YT!9@5|tFG
z!C9C565InN-+N!?mlmhhqdL(RzCx99GDg`TDsV}o!durbmFZtSC%DwJqG5}^Qn@qZ
zf@N7C;udrYiOAWASNu@T=rHVOfjO38_HHC~>8Wq3v<1m}x|m8{Y8$C)iYaKbxJlHh
zupd1dnbYo#UKp9KDIYO0<P~)CfTEon>q@%)do1QB=OZjRaUbO)8R@2hwvgn~yJSe-
z6b?cZBQdut-q#3UxUryL_w(u+1|`n&vaDv5+@`EnGB<+^`I#8A(#R>LhYT}y_jwsW
z&vJsTB7^!um9ODwxWz8`Ufy&}X8qEXf{B?c9uA!@6t_#FW~gD|N#MU#S7z8zUW@2C
z?Lra&#f_G|=5!R#T&cnw8PiLY=1nq-bun#yqUxTexwADL7vn;<+Wvh|L-DQUuP?r>
z_Lk%V%OZK={Ja?&2JS@#?g^>X<zlNSr^TUY<(9jq%}*N7OP=0hdYGI-NP0zv{N()P
zi~}w9_xQ^<4jdjK@9@(kHMrB4mx>=G^vAOic(i`}{^PF{`5#Xsa+4^3T{KImvd?dG
zBvrZ*8>(VUs{LhnFqiNO#!!Znxg6Pj+j{q$ngSU~Dbu|K5#PwrJf!-x5^I5@Q4NG_
zHjGSwY9aHj7a_q&*lA?{Uee_JUL<ywNq1a*p%=|7D5<PjKUg?i|L*!~xh5zs>|?iZ
zny?I?b~$2bvCE2iW)2wn)JLk}>z|L)g+mXi3F~cS7bBviX<kGP_RBah$0uGmzLq7^
z^ZhGV1gprT4L05#j<tCCv&te$DM<pQ*1dr4Iwnd$r`9V360g;ubCcx<AccsYP9I5~
zuELzh@z^9y&x>-(&JHMYUsnrJ?z;ORCSb2K)5m_DG+*=~l-q6f*VwCMU3#_59@szQ
zEe`vzIyV8J(kV22)i%>?0Mnb9(z_cOyL7M`cfi#W$&Id_-?)~tFCa_hP&MHK;*fiT
z<t_WMEVv2dA!aLK4Wr~FB-MydM3VDHnQ5&vV_*9=%?3D&?C2iY{a}*KhcXF&)8&kA
z6B;=+XH~MUT;DQ7<x9FD5^9&Ut5C_`s%_WRe0uf4z_#B6=}BKwX1|>hr91p;4-^sd
zZ@=o4N&cc5el=?oVc+-xJ7lN$fL^e<y}W@hXhfppDJ8v9gxc^et5*E-m)s0|rtKU2
z&G)n~fkeF3{c*PC*f52!4atSADu!Q+xqde;`&F0hMLT@Rq!!6G8A*H)py;QTp(RM4
zx6}2N_)3#4i5D)i<9VnNI;E`rbo@x|$*x^mf#yhVR>mg`N2^fd8&m#b)-2P5pn$pM
zhc4cNt$J^4d)G_@q)F!ZP|LMPN!5KxhF$!5C#Gfm8N*cm7Y`Xx)(;%dTfZZocxrWs
z<5Zc@m@u3^dz7~aeMU_|or<O=)r^INz+8%MCn6}88RrtAFA)v;?IoEP0Kc$yljJqV
zIF|r@3YA-;A&-%|>*(z*wZV>;7@;J~+apggVL2;~sfvNI&&peLaFQ|_%)1n+%$i9g
z9fkv?PsgLf#wz7Le^8D+n(4m@++tL@^4fX90zBykQ+o1TgU42mMy|dD)r%n>kZqcw
zxZr7?l?UP9sJB_E?dgeL)}w=}SG5w=Q^B1A9mOJ3LAzd$5g;+@E){1;2Y$IXVhVMl
z4Beij`J0@+rSlh#7sVK~Gc>+;!CEFQowmzVt(9z>T`Hf;sbtu%{Go0x)i$B}px#!y
zxb3v|mY{J@QPT9bVN{OUo?iT{Vx$6lL)b^t1Kqp>VdnA2G)Nhu7gR)#ZR${?{wImt
zPh^r$Q(wN!;yA?RlvV3J(%J9OCD#JW?N=o#(EzX3sQxQf*>>WKGjrin*y=fLgKd8;
z?N~QVqIRnDw72bgrOd0JKQzq6s5~i|lrz*TG8}TRh$%VW=e=BhRV_WWKZ@HiE<{rz
zi$+83Pe+S+9{n3ab<EMPc2}O(jL%(t9r{Pv<eK2J;`apm9Kjaf-u~V}?Z|U2xxM)u
zc=gvonSP9z&7`H{+`}`t$ad6R>HtTdZnLuJEj6^Vv*eB?4K{wF+Wq%pu5ybbQ_EhY
z?Mt-=sl82hxzq*Kh==4x4uKih-U|wEyQ+wCMAjVnMVoW#Kk57{6_E{+_2ZA9t}7TE
zy0;>#dhJ?C0_rNRNgEQ*^nCd;IZg}Q1MZqsFJHYsH)othn?6+#5Y+g$DSagvF}Bk6
zXiIp`vQoM;z}bpp;HzN~S+`%*Ce#;Y6*N|T5XK}Yb06p1VkFnrQ5jM|bw_D4@#9np
zO{jq|$vo4M#3yaqR@os1uL(u5V~adh-4=dcB<yVyi_;nr^?Q*%hS+N+yftRMk!ehv
zf=CYRsNaa^S;|+$NK_)*TUXALv~Q_t2It=oW6{rGsVTvgclS$MXdAUHr999RCQ+oq
z^y<&MHd4RJ599BuRwBftMC)*GTpd%9vpK98zJT=V+=xH>873BrLb4p%#Mujy+U_Ti
z(*`7DuDC=Il3?dmUZ?usB#EXZNn`3yjxuM*rHH(w_UP0(_g##AeMMm%l}?ju%d$50
zW{2nX=vm`M5A2sD5+Tuq#iWaE|7Zb%q%Zp7RLGB!-5c`~ARks`eUZq^>VTFExEX&$
zKEYCQrDtt0&sZ`|Ipm8mX70Wn&be=Oo!_$kXp;GL+0kCn&$J$)$#yxY>wl*Wd-c(1
z+OaAs{lL5@hcgL6{l`#14{HnJn@y)_FV%1D>J$TbWjOLUBfkN5fnS$P)W6)-WHjOk
zxjG-`1uUlNog{6<YLUJE%lxDFW;Llq@y)sIO@^4zB$3zDo$Y8YwTt;t*z~4;8?1if
ztvCAmt5zhBaHzzo29GkuwkU_3?~yDG^}{xzJ-+^HNx4>K=@wn7R70vg5tOy|ajvv*
zd_(JjE6CsWrrh*6Cw|KK;!qJj48EbMous~)x^wtlwX4_E%t{PkCxns4t`gNVyCc~4
z2AuC#+u`y_o5X2nZ4bEx0HM#Onr#h(l2(=O$;&i>`OB0WMq#Oc{t{4Q<AbF8i{rb3
zn9)iLTl~BZK@3w1CuPdAJ{&kAH&|@58v9_*q$MR(o2BvU*$Q@oxNq{OkBT4hegoS>
z8a9b*aX&cY{nE-&d8`>TVUAUzpQ%6miPA`v4Ef!oJ%T-(%)(&U=gaYot=yu^{=s*`
zGfF#sfdM0-Co0~29IUbBTDv7a-XRyu!i2l<|JpnAXsWyK@gts)DXvNpD&ZC(nF<#r
z^L))irb0sI5+M|22ubFdD-FslLkh){p~y@`h7?bf%)fmu^;A#KcYW4xt?ydjKR*4@
zy14K2K4+hO_Sxs0y<g`%Uf`vXwBn|@lDDr*f!&v`R&vxa-bU3Wv}-E$s7~X4$EmrN
z{dDw_8N525B*HA)jMxLbCeGAtJAjUD5hb8y>dBX7g6s-0;M30)_2)smHl9(I8^yK;
zd1!lWGMJY7s|_JD#k!Q5t&J|Gq!AmsO|XV$n*hhD4rf`~^IJ}Cv@O`9(}t64DV+q2
z#>vUfd{r}DhT5rGx#iNN=F$V*#67`iZMEm)B9qE0Uz{~Z$WxXsJi4UyIRk>B1xCfw
zOk@;#-izP$XUk2p+^B)+)$+ZqF=J%f)Nd+-IecVzt8lAwEQHhVKmTH1bV2PFqiy(x
z{P?*!*{TM8YUlB;6I}7R{g5EbJIY|LE8jBzXS`^$xTvGGbCs_;;lTD>vPyrm{w$!J
z()-}M`bhM-DstOJNt5jqU06Z4IRLDdY-4I4inAe&(WO$Y4O(T^C|d;@(PxvLd6+$T
z&X+yRxw*1(DaNGuqL*?HESS@4b{?lcsMB!k3%g&6Q%>cUcYc3_efQKNd<QCctIV7#
zkdVijz_y)&uv4|q+Z@Xw?)ejTC|c+hByWgDGZj{$G!l_B-NTbb;Mc@m-g*9+K~eAB
zxC`Jc-+~mYeG}%_OkW$p2K&sHs|VZOZ{1!*pW_sct@Yg3$;0pv;D+x)`BXBa01(oA
zWNG?|>f#E7xg9jj#o8ergDvbEy>ZxHT(j$o@pxuEgbDMVlJ+2A4^L-o<zptB`9pPb
zLnZh&Cts<G@=08%8fXXDg(HLu<R*B3UWKKJVe86LjnzlB{?kuQ@2d6$NXoa$U^=Ut
zGxcltPZgSPtXd+I3P18mVK{SG*ou$IeSYNT=rx~D!_S)zRYK&C?yT2c)>mR*xZW^b
z-sO$N@fVDOV04gU!$?UzKP_<MYDR>w(!1Ne_vS)l?^{rRo`l&|EtBWQNT#!@zIf55
z-P5q(5hpExoa~W`&ARp%C5KxW?XaVH8)Xe=;*k84nHRQ8A1G1!7N%wFmy($=FPUy^
z@C6-`MRK7X99i}S-+>;)t$Y+BmicES<Q*3GxgS)QnztJM2obUN=`CT!iEwuF4mjVg
z{wy63^=Ci&Mo)DZ&OpOIuw277*cjx>et^onFp5;^@qqLbM@20D3<4ohe>tTmeg2K#
z#)MI2&3H47`ku?Vu^!_uYm1G_KZVCkB49`2s8K&QIG1F$(=wcbPPIMSm1c279Iuc3
zXXm8C@n#)--{EW@PBy;PEIaAIB+`w1<(YmH=lUQNHoh(|vwM~-w>Z)DrQ$pAbi<W)
z+QOmkYJY)4pTsG4{MdHLx^k5M5@||pV;j~N5pPzV-NL$&)*bfFnw-7et@rbt%9b`l
z;U?~(2X(em<&*7TnkJB7O;8%PAe`q0lV0a%;Mt>L^PHbT+=h~zYAi_KJVY^SuefOQ
zm(TzQV^^kZ(JKW3u@2FC`sNh4%{e8$uTSa)TU4CMDjZ&ZeW0VT6u3EuWex_O7lHi`
ze@6lQg%&DFlEI_^HJT@gZ3H9Dn#`xlCBS6+Zfi^e9SUqT^vk2TFl&*3&G9rMC{K58
ztp$s40$?!_Gxg^+6)#poL|i~w&=!7Uo4X}L)WqzXGphizh()ZK8#c3(%{@ANNAWr^
z0gE)yg^$f&OH#jA@KPz$_144i+<@i;@q_o1AqU}=4GV8oR%SiM{!@{Ka`Ag(SGE2u
zl2CtP%^rX(ye_bct#HiGRfMUdLR0MN8ANcRt{J=4@idqze#N*kqRWqO?kcq*Z?EMd
z?}_wNRdATE@b=6^Pr3b#n80p0_Wq={3`d|}Dw+xgVZ-$dMr~gk#scNdJq(T+f(bJH
zlgNQWXRYXS4>s5to>x(hm@kCG_I6g6KLy3j7GTdiv_{{~r}38Uyr5UI>|5<)+!Upg
z8u6z069vru=OGbK1?HETG7WtK{<f~Bnq+<t<Fw*rn}7hMgqDB-yJkj`;PaV0=bg6X
z`+}W{q8rQd+v6$EA5O7ujIou(H{EU`pKVd1=g~1V=*Vo~--<nMd@5V-`XJ*ZfpTBs
zdNw}r)T;)hXI}>UORY@_3tN$A*6xZSnJ(HLCRRhm6WueLAlE3=WzIsuj$$kyK4nzK
z90~BpL2$mS8qd={^J-sG`5QrPOR|)gS)H#ScA=64MO6rHLV+_G0)r19u(<JRQc0x0
zk{WN+>TbAT@|e<bpDyz_L>mb~w6kZE6m^G<tq9-Z$V&QzsNr_yMyK&xI`v^}i5%BV
z!(8J{b)xR{%?@Rdi3GlzMdvQmB_*m!=LKKUt=Pk<y4suiyIJX$>|T@kmF@TO!Ah3+
z_DN<vW5Fq7<~`2wrX|{Mp8HVgnCZE(n<PUN%p*+-7JRB!j#7J2NTHEKX>F-#J2hTH
z>zE^7JcQpXGrxb3aqk}3;&%v$)~%N+L6N@H4UUX0?9zs19q77orNB7ZORawy|3TqK
z4)r1L)d9%7ICtY(L3r2<EO8D$PJ+OfmOR*eIcGwF&j$o#-C>B99IKSU-){5NOsQjf
zGPoI~)}y?7DiRG<KiT#R+qZA`yl?p8`5S$-ZlEQnrtlJGjcLevoZ+Of=8y?NdbmRK
zrFk0FBiWE1c8dYJY80grLd%ag^?ro354GAzvAEo{C7`^$c(0V22ByB5kT(=0J6i*n
zj_|l<2wX1izFNG2@Ax5~B~M5RbQulDvkgOx&I@sQIFoz<c~x33d2@yoQW?VivDEcr
z+msJfxMo=L;Fxs8jT;qt#Zsr`G!|hEnV=D1^7tgoH>x|k^$v((_sl>HhIo7EQq6wz
zKkh<^^q{VB*?L4!Qt|C~ax#gsXCUrao^-F{B1kEbTE$T4&mLD=tWA393PiOeX5N_;
zcP}o!eaP0K?ix$ftAPY@7w2B)Bl-K5Jv~~^twL(nxxoV2(QJ-36gzJ<;$c#@xUa<-
zwgtqQbCz;D0&c~poY5g&g6*2-`3J>i5sMUO($7@fQBRhV_Y)|#?Gg-3T<V78iW6T3
zF0N*c6k@KtbejY78V^ROWt@AgO#9Fbr2KS@i;OLP01}fB>>HmJ#{_r5F1SUOzaN^=
z#G%(pI2dyLeEp>yy1TqQB~NgTX<womj6nSsNP(Gb9a!!!?D`<b_>nNZr)521o>Gi#
z!P~00<ZM#~HxY`!!3GeGPzPV_=AC_BEyB9?Ge&93q)Zf=Lpj(ylH>W-chUlQ3-Y3|
zGoEq+P`~@y2K5e*yyY^MSv~2!k&e{{4qQxUYjhQ4ILYw`HIK!mOo#PcH_NG6?jC*~
z^#YP(3}$0td)&N#irsKRp;3+?6dA*>ZFcrh;N60}*f<rdw`Zu^cp~t5FTTOb;!#u-
zB;gz;vayQFr#kQ^@}qf59~`1;Ax<>{?%loC1=ElNgj#+e*903(tzy2dT;fJkJ*^;t
z3=b@~t3d&nFRgAFnGLSA1Tyvf3EjZDNxfEZT<!|xs$^zSt4cUlDXA=!-!&?B5bry!
zD25lLwxbpyFqRhEX{7#T8!SBaz&CCcVuXYpx3Xz-RtpOQts?eV6OaU6BECZC!v;_@
z-#Pi7!#4g>ZU+NtFfiBfU^VjW=pCE4a#?Ly8!q3DlKtS$v<oZJVKIBVA;8n3)bwN3
zst>n{rh|4Z6Q2CR6M7$JBiH?W%a$9J(yl!|prCDkk?(RFWLa_NrA{0)vd<4RaPP8?
zch|Pw!$+_#worCCS3GC5G7f%=OKxTY@HcCNwvE4+V5bo69eJQbC7;Ud7`r><p}z)%
zL+HNRChP$_iCLD+h2!GaYq<NT1A6KqMb|-xwC;>gqYhzvr0gEenr`YhIgHo+^}hiL
zS&HtuICiS2<;r73mWJH3tUF!Vo*z@ryaT5~m80XQ+#yLb9;JCrDNktQ?@dlfeGHPh
zQ4SfiTTj&npv>P1%!vIxj;LILE=~~>dY?w`W#DN(H;~H9yVT)%3MdgtBy^A!P6<%V
z>A{x>G$GUIZmUo3ZFfYRbtjUjT#Ekv<0jgL*)%8Ie@>iaa3X)O4GuuQp!k!1<`lbq
zxp_=_uNal7laHEUmD*(xIt>NmZ(lApD~H3ePgw7qn(&xSpLE@MM~uOa(z!sEqx{a|
zIemzVzr`YO!dKIp88&9S-Qg7PB!rP{l016O#@^x80|OMc+^?mVdU)18M<bX_=sZEU
zF{03>byxNd{H&(_(Z&E?6(Rf1`z@DSKdJT$gkz^+)y}78qtJLE@C@Rci4R@bO(qIQ
zV`W{yxN&}`s;n?y2oq1|B>a_v{?6%p5Jlbe$?D?+`%PQ<h0(ch>xzRm*G@iZ*q*Rr
zgE)Bk_6JI5%908bZsq|S&*Xl5%G9_;{cA1zc>4#zQzxn2ZJ+1!xD2~zSU*w&5Az}L
z{N{tA3ut1=^uG+yQs1$y`mGjc;_~VJsTSKt3WRK6T2H~{S2Z6?I}xt5@%@zh&ndTH
zc}6I$1R;LC1Jb<aS&M+=|GM|W%!?6?g^#x_2@B7`CYuUR-iKVR9f8sXd-U>0LNLSQ
zaEVKkmI88bKq=1bA59r)C=kY{T8GrIjzOM^AmnYO%>#ODFu=QaZ#YCX292lLq6DhL
zejK70&V9?p16tNHY^PcF;qeeaRq?hMd41k+1r{~!7>c_g)9LQG=dYIPjSn6V*>~4T
zmQBKguQ{U<6GN5D|Mp9E$~Exai@NQORXl2JCfL1<QkP;W36F^^2iVb{s1Z5@c@b&}
zTPl&|x1&M;JQ|IwM0_-?q@k3W%$BOGDl^xQBRVxK!Zs1;#+|E4<#o3+yK6)OqhSmr
zr^Tx$h_#|bO*OYYfiL~zCUp|WA>{h(-Rl-8zS$Kkey#`c(_ZUahFX-ruI-@u+Eyzp
z2y^PQ6mX+x)FiSKjL`(#tI0I0_;E>42>w!pi#-t@kkcLY;eEivpV2f!Tc$UC^^}gK
z%_U)+6U*&O_7}+2O3!N8Zzykf%Q}WICg8KW?2(S$N9{gVg{M7sLlYxn2wUs2fhvzS
zT!0L&YqB!b20UVEN`i298v2<P)1vA74l<@*7_9J4$~;mw;N?l3(F;t_y17}KN6zo&
zy{eUNDwCX=t7N=U-0BY7BmK7++NqicBVr|?E?r4D_gY%6K%q_C>CjS>a(D5|9dkEx
z2W(ilyHGN7lQDKU^`ef9yi5j<>gXMY>)7|5^;ci28~4~5+^1sN8w>G%%~Y(%Ov;q;
zS9L0K^u6AOiY8FY*T%_;QFoxsf?AOmi)ED_`0I9${VAH}?yuZ!J5&g~(W5K<uX=CS
z+$=AM&6Ooc=ooH?OJo1sGYY4GwkSWeh-Ku07AM0yISYR&Ladz{86-t=Fua?xPu%-B
z38NyC4J!R@2@Rk2XmH#nWmgT1G12n1hTYEthJxIF+M_gzgTWBgAjZkyWk4vY&(pXh
zi$Ocf=in=exK?nfnLW2j&S;F>QW8$|fXvXiG9Ng8xJJfAO^5GJWgLyllq$T?0e;_P
zzynt{6qTLLc%wtMF<?Ay(+-_7&}Nk97+p_jx#=%Ut#Suwh`~d;)5`6sY5VnV<}gui
zqajT`AInt-I4A?iV~eUa7)^LD%b{|JeeNjbVq0yr5a^P^b>Ee+ouUp2Ox?V%DZgNt
zhTi|sHsm+J=oNpX9kyd}<@gCWxKSf{3*l&jkYf_e)TV*o#aG{6;pGv_K*=TG@=j$2
zSU4;cg|h?HQM?82aC{aEd9r~MPx=(Rk1S3;Y;rcFp2=y@j%|>-PFGV$K{1)UkCv8J
z^&<P&Ob);x))*0XkSmleKh3%YfE$|rQWH`4)%6XE(Dr`Xrm+`?3I!hT^f=r{^6J|}
z`7Q7&CmP0u(({*S*C;|2GSRvcJemO5HX#SmY~Ft|r5~-1W}MaR%!V0UZR|1;NTyMS
z>$EQE-K|lW?Rxr`vrvXB0GHv8+iHZ8a40cb?t?3QaPM5;aqL&8&qy?dhyi7EF)R#c
z<(DJF&Q^VA;_ZsZ@TCVEPz_}Kcp9BbkC1x_d8Rrw{3KOeC`A0Xd6J18L0=T|lO<(O
zx|n7EK%5PzFDS#*P1KrZo8Ky>=f%e%7HRkCZ+NF&$>$@Zv{L1m5HkMM8p_A~>VbQv
zQ{~Ua3ZdLz%bnYZP=hT{j_obQCSdA(_dl$aQldko_#?_)_2lF>G<sT3<PKVG##O7M
zVka(%>XyX(B2_y+=#19~tN%je30KIXQ|!2aGFl?|q4@DQ^`^Z~$VSM$pUQV~KiNv3
z9fk{{P$InE&F?SCQfVXR_LED)VGbS*qX;Jr5pU{(+9~Bn!w%yMUc%k}s`0tCW{O5j
zy-AXC&segoYL8YI_f<WH(Q7yna#Sdbzhqa5l$i2*p<h?Y7Ak@>6-swOo&Bs3)#l5R
z0Al&1z+FRM+j_1Enu1M?1g+TrLS!_K*H1u(T{5HnN%wtocG9vNK0<-d0jY$;lJ{0k
zg)AqIdORTa(mG4!#JDb12F(<&xsT#=S0Qjaq;o?&Q4e&GHXVv4qWL1&pbcHOPB(!k
zHvu>a_2m~VyX83|arv3$y-Jc3Z6>){kZZPoggH1l6aZLtWSRVrGtD6tGt2JecYX%P
zX|n#y3m;3F&Np@AtLLFc72w+tZr-v;fQsPYmg#9i_&JN{AGBKH5}ElWDeCDG(qx`?
z@0`h~$5%tN<vAV`%hw*dbemj#|2ol!#4ne+llj6|6GHA1D<XMOB4G<N?;tLv0~lbB
zpDb7vNpL&#E@AVL%g2BfsaYen{Z#&kJ{Qer$5(B}50FV0CYXnX;zTGKO#GGYblyLI
zSNMcTEQBihR^c>k%JTECdr9$c2k~?Kh$2&f8)lL*D!Y)`Bp{SeH(}%{0}sE({*t#Q
zm5sEMC5t(8vdZVi-}@|XWoI@*18b0?!N9+fmWy$MYU)MIdT&HXl2%2~Gt)oxU~{Si
zS$s<c21+VrTfFbGOYP!02J_YP@v3zIjr?xP8F(}~>^tj?ZgEd&nQo7hQphe-4~}~3
z0ANMRXr_tQl?7aglBKcA`dfT)o_?@#yZ@AB->dP*Qfo^d#9)gQq`wAJ0fPeEsr!At
zKz6mxU?hR)bhaEfDZg=UGKH?)yTSJq0NZ$Y7>?&0-<V=ZzxBIQ#9&gr=xgRR^V-2x
z?h6VU#X@RAc4XOXC^W+g>Cl&V2?ynY35a-m(k`_Mn{&!=rRMCk`s@u`jT)4D@)Pvh
zvu|$L72!!LD;b=;>kq{HLsDa5beejP7;H8Yax+Z?(4!wz=O?9#I0kxakl@zD0)Qd&
zm?7ue^W=PwnG4IpgT{2bPbl8I!c_N;vMrw^HAy1rNKn}N3)V%X!30kqG9OKWpK*3z
z$uivS#DQb`cTeLEQOsUV?C)%=+Jt{cWmu<dNz1hDQ&N=Zkql|>Y3&y*qxzIOroI~L
zBcbACYnA^l{`06n4+n7seM<q-{{_3$CP8F;oZxefcs?^E^HK)fY0$Qx=OdFgHbp|y
zSQpy2nSdCK`;DKx57}5eYH;GYrX4U)WkAT1e==GgO`#)C3-HE#9e;ht=mghJGHEw~
zL7L4-B)2b8xZSQ24O#Bnd{)8!m-ZuzF-dPxIO#q2u`ki(RfVgir6sm0Dr)VuVW1xq
zVlEg|f4DkMDl;da9;Ocr<)Tcb$M_xDMu)%;&4+nvNQ^Tm+f042lHYQaQ6H|0A;Je9
z98b1vLSV2Q1YWnZbJ>rE*0(bN60*${X*zESu5XUZb?i{F3nV+XS^fG8u;sfo)N&B3
zPE9=!d%+K>)ei;3D-hMM1er`y@*&V)rtH1mDD(6$AdduhMD;_hulSZn-`ugpTW~JA
zI(*^mB?`@jlh4`t4aggqrhNti=lf@NhAJ72bUhUPwQR(NbR#x{0YS>v8S>x4$zjIb
zOrFMHF+#gECLn|Ne1yN$dzsZdh`DH0z&oFo%$E7eg7<qjrwx(|2aDr$4HLx<K$D@K
zfB%<@u&{<NS<wvg4AFe-|2%jqn1t*z8bwm0p_;oRl>D^(H}q3;$ns(qSirNHk=KsK
z?hf$LI!8FGwV7OLzS6ZkTUMRiM&y^pAui=8NS&)De+O!o?>&H@@uLI8zd4A0e{PE2
zzMqn*fw!<Bd@q;E>aFA>BokWI%So@^qz4TaG$WGa%txe7op<@ow+a6$AF3^2Fy1^j
zw}BVn*r+?D^&TBiJz!XnL)UzT6HPDUFzTa{N{3Y#6e@guf*&F3-pn<vz~enVsjO4U
zrA}u9d|X)rcPuTaFO#43cjMA<h`l^gG+|f9vj69o;X=?r@^ltI1Wic!0}UkKCQdYv
zBH**EY(5M4uIkma!zId&wgH)cTr9X}CRT5BEGJ<F&iYxaQ@EZ+W;b4W$?=BvuaXYO
zwyDi?Wp4LOa%VCI8{V+~-6O<LFTrKwR(<%q6scMv3d>y-KS7Qf`3)hbkccZyWj%0Z
zhuw4#3IlkS+GZy^F>ueUMw(EL+2<qM9}!RYZsj5`=gz4UeVIkQCk$)V12+7vx*2NK
zUR4<b`$`8C`!FuDFs22Nq0X~C8X6j!1@m;%>ia3PWoe?tMy$34lYVdcyo=?lca{y(
z_T}yBL0WzTSyduX$uEGav(eGkb0i4vC2G4ADYod$UU|PwuZBMjOJ7enRdTr_x!$d!
zv?FeS(H$>kLWky=wdzFdBAwb%^>HfULP<gz)jJ(fk?SB+MsYqGYBXMzL5<wR&(MJH
zDTK(9yP00t?t)p;SCmJyNz1H#`m^qG>tB>4csa>R%g&Cto`-YIZ)rY$4K?myP(D*D
znmvoqjce~>xR5-e8|GBpVpOGbWjF*SvthYG*PV-IufdnBRKb|NKAA9cjKuFej6QAk
zhqaK^nleGwkHxmrSVTLg^+(S2GaOH?<cTuV?>^v^&Jo);`2ow;M&;#QlBv!7hvnTT
zxkTg{aoqaiSjqb7<N8)8=bv&avqLR>s}Kcn?IgVf{Hmv1iqi0F+!8c-NHp^}48#au
zwqd6y?27aoZU`MaOMY}{?1|%KCb^wbS2m{DZc@%V$&SZ`SORe@`|-dlC>|{W1d2yV
z)c#p3-hhOJ(M<8Ej1Iy*b1-N@qZD{Vh!H(JSZFQJ<{d~Jhft8_!Si&`asv#dVV2?+
zAHOK!K#d1t180*D$|RK>8~+<%n-WQ8nAKRBof@LfRXYxb{{h0;Ha=`FMFZ_`b4TJQ
z1|#JuK@r7xz(!~dKTD>0Mt+D!;JMTN6-I;4jK;5w6xvDgtwE!S<|u`R)QfNzRxB~$
zd{3fG(5T{oPB-2GMWBRzFr~_oJ}@-=zM`@nmBLZk78L$5m{-13<amc}fB(Z07MU@8
zw|l!<4u0!wcy^FtGP6uSZQmm{BjliIg20cqS{NY{=qC#jFOfo@5GPsjW3K?L`EeYZ
z$;{<s`X6VVwH2)52pdQy425lfoOsl@k_P!JYxa|aldSCc8;{`tl|P=q^E&AHkpgE#
zCB~^DXE`A85_(@Md=9X%Acj)-iNR+6MQ|v#)^qNf8~nrh(cw>z<*el0e>TaeKHH%@
z0;kkvd6144S~XcSHH<^ixlmk&orE~cw|e|!%+Zs1RIY$Rvz`CUDHpVDIX&xUEWm&t
z);$I{7*rve6FQ&f2yq(^i^#n-)vjpsZ*6R#Z~XWP#|6UH(YjfQ`r`+Do`x9+ZsK(1
zHdsOZ`28OrU*8@_O&fx7#B_7dGWzQGW8i<Z`I~5S(2Sg`>B51!;xoKROS(3J16<O4
z`2BCZrynZ~<7#a{{C-^h+dFd+^mLsvtWW{3X;T;E1~2d@w(;*<`es?c9WtPV4?LRo
zzm&G>U=MfFuT`N1stzHroFF0%H_&A~_o(mh%Rnu73O0yaO8GoGFMWWnGI_5@G73GT
zh$gt<E_3^Te-r~I_z4#glfZyrswI2I(QJOj3<mvD*6cTL_MZjd+CiIiLgO|dL;js&
z2gWW8y1nhMVL{h_(=t+c(217V^-mGsieGU78PXvX;8evcNmR986eF7FjZez`w_%6l
z>gwtkz05^X_ZpEQxs3xcs9$l$!?x(|h9+XV99j~tXB+4stgp54FUinrJ;{;gyG;5S
zd;}FaIk~5%o#?WMXNWO8`9k~@vgSHU@NlNpID+BxAbHFO&Hkt_-_)1klJDFT^5Vm?
zQi9blVX>cK3g4Z`j%7l@+&^l^6=d>+SPuWvIO2~wvK(U@?D*O<txqC%goNxd*;T$G
z{r;^6X;=3l{w$6I(W-MS?9qtRK}!?3lo(-teytL2Gieo8>H_YCHDY98(DdJXg^y@?
zLN>wh?&I;imz7enGH?bK(%61d;u?=qM+Hi(wHe=rUIq33@jq~!tLO=YA?=Voi(p1#
zp#Q9Fi^O3~g-dY;Ma7;{Ziaf`kT1KQvHD91B!2o7OSxac5Z{Pxf~Ewbkts$<K=s$y
z0Q!9hH%dJDLfBc()Xtw7*K74jIZxIffqXjDU!Uokt|NmbftmK<cu$wiMfcum7mE-o
z&Jq>3ek_531@)5=lu_^xF%f@<ON4z2!VHzwS5$vlXxJXBUE7G9e@K$P-1xW2{zro(
zkYGodo=qe3767{fiqDP;pjoy0IFKxFqD`M)duF{FZv1adx?3cfW5#l46s?JGriC}R
zktE6?-RFey6A)m5R$RZnbiG?5VG`yh9DG8&H(-Gg$b}2@pCAB5jJvh7(+0CM6Aj6q
zkk<cFlYBHahDL@VNvfbgvo(BtgcFF6Dw!kbZzt>50&s%xmD+%>5~!KDZSaG6bSDv$
z912?6UE-qi@71?pYWc*wpz*q=;95wGb}F&591OakfBL+{jm2+R+BK%$&$_rK5>O|Y
z$Qu+-p|}4H>SoA6y5uIHNb^5DYC#-WO?!2wdH2?JK<|-~uOkbgEN3WpkB7rHaL50x
z`%+Q=WLe0e2EuL%_!)>W{+~Vhh`qs%8w-zHu06gAGxeYrszM~<fS0;FdG@cp{_|GR
z`hnYj`Xt;8xHT3gb>TDW6v-{F_C&irZH8)3yldlsx!wQ&5`$Z8Dbndr_SG?(q>gtI
z$=q+Mz<QKmkobS}yw)`1AKD@ddX1v<i2X@Q?E6=GTlfrz$&Y`YId&&Mzm?~R64AHK
z`43cg>$H!+?WW5#ER2hSLu2=0@A|B(wy5{CpJ$&6^M+#(jp1HhU%r&?lLaYpR{fMD
zpd~4~P+vQ)=Cedy`)&VTF{q~d`m|TFkCWjxF7A@LqIj`V(0HVCSiWhr>q(d{HzRF)
zjyCLiOU<qRRX5>!o@reJ$_ggpe&Qhv&!IgSx2pRx**VLvBko#U-up&d&`<*Tg$>{u
zXp3;N{hOA9A@u@kZ<`Mt3~tH{vlM;GoR*GF6rmxXjra@z#vJ)h45>Y0%(cGgTtmhG
zL-?qVo>FN%=02-k<I9kf>&wP)ZZL4Gl7vrZiXvh8rf?3;l#Y$m<UI<CD>Y4oSgNm9
zUoGCYL0HRQATvhkV>HAb|83X87F2&kxKzs-dwZH<%PV*6md#fn*kAUVP^D__xytq<
zx))?;18|kNXpZ17MfaBPFM7>wdKf-17dzl%uwXiS_~LRG%qDSrb*1HIU25H0J0N(b
zWR}=T-w!>0*COKY{7NXWEYrbWd9!*Zeo7m+VmDXkK8@YiddlroWIE?}FeQk{7r8tC
zgPdXg>T!-4Z%V1%wd+`R6M<uTwY&V%jsSC#KYSM*L$QAi-4y1_ci9mA^pubOh@0Py
zOR43%33v~K&*JxP4p!Gw=vJ|O{A<%;U$8~iWPeCm2ZqR@&+GS8{_x?{CS&O)Gda?W
z>CRu~aOcTZ86Zt6j!o=ElK5h;w6TZ8E29hUc27(w8N^hpt49m!M!0alFNs+83H=l9
z7-1pPIC0B&w+5#NHa=o=&~0*ozm*&~@y2DKh5Lv5WPel82`|4^!Imkp=RJc(_c|%4
z7<je_O!M;V3IC?*zc$A20_le2B|0vS-<11nmH%5TNbkQw_^%MwjKco~P4J_SO^c7{
UHm7GLfq#m})Q;YjJ>mcV0BBC8K>z>%

literal 0
HcmV?d00001

diff --git a/docs/source/_static/img/dynamo/torchinductor_backend.png b/docs/source/_static/img/dynamo/torchinductor_backend.png
new file mode 100644
index 0000000000000000000000000000000000000000..84e37aa7c4b63e7120ff77efa0b5a996e0d87ed7
GIT binary patch
literal 122529
zcmeFZbySpJ_cksF5(79x4Bah5D<L5bBPk$_bhoq&ARR+@2nZseAkqvqbV(^lrywDn
zlEUxi`8?0_{XVgN>;3Ef>s{+!i<x`YiGB99_u2cLeG#RhrbviSg@5PH9YVO0oaUW7
zSW0*9U=!fp1Ku!q7gGR#Fx)j2W$u&@)BL(~hv5!fPFmaN#dZ$%cb0C-a?rpsU9ea?
z9x+XoG_i--T^O;rh$C+Myp@|Vl#oj4YuobfE*>#U`@Avc+|hB%pT(@Kk9EH?e`N-2
zy^&zK@zB+k7}q?RV7ItBTZw-{MFs(5{FkSMKZu_t;k(cAxaWWQ6&w`#8jE-ymx%%M
zKc8B5hM>aV^{?+lzQ+B}M~UH^#D8nWq~V82tYjYu!u@Zp{x;m*{~QZ=^Z$nXFU#`(
z7W2Ph?EeHR{whnI_n$m`a((tL(>$8<0+qg*_3tpQQIo=iZxK8X!PQP`3nPDFE41LV
zQsU&d+fXdJ9LfA(TJ6zOEIFxvc|J-3ZO9a)oxnjtU_IZ})S+161rd!S&q-q&9et}*
zAtU(n=zoL0)E-HK)aa!}30>st^tO+s<@V*R*B3#H!=mdn)zjA5QEC4^v&h#Gw}uI2
z`dW;oW8&}WzEe9%biDgBSzV|z;{M*)!bih@N6S|#3_O-k2V>Jr8v2;B5k!x4jXlSu
zE{P<LHL6D>FSYAh>@#q`=d=y*wmLea&#kN5FhLpJ<R;CUA^~SVaE<^klHE(^H`si(
zjI_Qw=oN0-sWUt_0sKmk2WFoC@63kns)0d|F;$y;_vsw(5mKcDB*kqgV>-T|bsywk
z%e2Ym$Dv15Oq(OQ2d6CJ=hOY47t6gR$5UeX)%{E>Tb|)=(4z^x&KNq0S4({ozo$B5
zDEOH=Gh4j-?%o#%b1$&vc#Li=mQ@eiuf1^}Uf7B>Ffzo<3%K}@8F2B75Xtf7aDdmP
zykU(>5Kxk5PQU-@yp7Nz!u{y#XhgWisyN3>pqD-nF*#d&FH(vZT(J^-5X;1qf!1o_
zd%0S7@Z{z^KGUggn(BeE*R<6;!cYBd*=2Lc4?=)^GFU7Lq#Z@ca&IrN<`_QBy0o@X
zioUcwq0X%iD-`(sp5MAs)UppOxYvA?m1&*TKPa*i^X&;1DH)`MDyVw+@qlCPxDMga
zMembqvk4>jp>m;<Irov+E{Msr-(Jg*Sp@CG`0roKOkY7*xqiE%f~Rxt!VgbI0<Wt@
zcTp>^^3UxpqOdC3vMu67v+Up4<oXVV+?TXvd*j+c{JH2O0KjN)f|O=x)-WW9j-g`i
z<v^y`F3RHVdqbC&TG2npV}5T}&$G@hwmxT5T4yjG!AVpfKKt3xHcWd1ec@wP{Z$6!
zE1x!)x+w**O>CpTIa751!dd+A`^!GYhzwvE>$|g%c>&8&>{MlPbHQgmZ5TLm(s@?U
z7jHffm)#Fi=56_2*0kG5NPB(qInc?u_0R7MDGhaI%zRbafLX6ur=gN(@8keOc{>2C
zqlfPu4_}Bac@S&5(dg1E;`D&|PhYqaLVYX_Ro3!qCtu9Mss{xO&T6-Mv|C<a&bgm1
z__=ht00O;}F5FOP*eY)-5W#b9f3TIL%)`GDC*~}i_1+;`N}X6_<88=b5Dwkc*WY<J
zSL<twH`m_kzZGMhW7~d2(9^}>ML4Y5mwuX<!S<-bjg(RbCQ}ot&W@Dw#JgH1Z{<^$
zIQcGhcRI>iv2pRsdR`!1CJ%E;%JT|=_Ia~OWA8^}DuSx|;~(iD`^0Alz4SvY$tnWi
zKo-do!-6$MdVkBcEC-&W-KIUE8t*99G`{+IFGJSlf(;$wpchOP!;iNz0LyC5??-8i
z11f3x9-B6MF)X$x=ghJaBN%tS;}Yo5L*-DGwzb`j@3X#g7_<7;Fe7j&TyR`l>%@HW
zyESA31nis_<`m<25L>p_myxvE2a(Kie1JQ$F)4U;G$cF(XV0<RZCI!4T#l3RI1|7Z
z3q(<>h#z!WX-`&rww;uIMhlFoi8+n@&i_cbe$84PW|$%FZsOgz6z2^XsE|<_Tr7BN
zP;g}K*V@J_rmsg>b9?->AG5Lh0qan&J|0!RkK6vI(Jp!2ny}kP^Uk(7sb?8(iaOx&
z&Q-XcF=v96DEkZ60Q-nt<^n%z+C?@$B~Xi?I;hV3Q)Y&9_7C^;b-XE%Qdm<QBo2K8
zQ&{v_iS7HtF-(y(VKChCBb<6rQ<*!}%hu%DS$#R+=5jH+6#$n1^oX-(G>jicGgoze
zv6YSFMWfyg9a`zV#dwZOG5ZYMsMm{UYAjV@zC7dDe9)6fF>r-#wwtQ^BC3o~i7Pfk
zX@BPhC(zps6B4g}-<Z+wxiLQUq2B1!YRl<@pq84bl`;6?35hbX5#=8Ga<^!4A>h)f
zQ(R3R0PfK(aC3bP%lm!?&AVUE170p1Uj7+u<T6!Z_G~TaL1DZjh~PODM2esq`*rKp
zDsQ_Z>7U3!V-F@GM&jw!ab1(x(YsGBHr_ppmY8Mj?+u*-i9hk=y4-6G4Dp-zRKGy=
zKL0sqMBB7Yo>LuQz(rrcsy@ZuRgTp|LTfRC!?q~PdYFFn@5>1K0KUb*n-=e91~qCq
zlaGVg^AG}AeoN@Eua5^{)dbpK{4!!x>kP>;Wt6(umPFFjS~MaBW%B%20rZs(84Z$`
zr0wkjIQs_boT?KNxH((p4V>0)g*ncI8<8QU9srxfhsck9cSxeR%26F}ycc{4+eNBt
zGTv+nFZftENe=2~9MAGw#PBP~reFWAZ_PZJF#NpONVy#^Mrm<4#N(qWttWncwvt^H
zHMMT!J@09q`dsG&q2l9Be3s}Mliht6z%bTo#?%}>){L>8>FF>QG=;T#kJjRAT`AJ}
zc3u5sf5IPfk5FK{EpWdbCw$%-)^_&0<>vbG+szHI2Qg3bF3MzK|7JqM%;QAYGB#Qt
z1YkzG7G|#oh#h~Goh5#o6}JC1UsjnG6N_9R>3Pfk&b(;|Ve`=TarL;WjwDg@DcEZ3
zh0vU<tot{4@<(P2D?^sasyg1{o2|GWZ(d_1#W9ZrNDpplq~Q^Z0;Z|pjB#@@*-DVy
z3#`2ee%j>IhW!o@A+J^HHOmq{izhN>w}^wa?dPA<#XP$T-(jx>UmJJ882#b{0hdW2
z?YgCjyNgsJCVyWH^Zw~ZzTZ9g%LCl|E?#nG8^9T1j)N{6t;zsIq++=PlQ><2qC&6U
zf<0Bf(Rr^`d`=;T!tD2q!{?0@ub5L{pLRf{5AbwFR4po7O`7QVBb31l{@C>ejaVy|
z!%<r)#*vjjLD~k;%NHNmEv>r5GEEO20D(XbD-fzzHwzla&Q}cyFV5}r+u1PU=8cp!
z<xFbN=P<sGYM`d2lXhxYP1{nr`@oe9bCk~aXX0IM&Z;Svik$_&z4`CdT~H$*BgjY0
zd1qx3*M<u3^UXD}MQF**XU))H{$83fzbLiFsjqPWgj@P0jsQ=7WRUvPDmWF#nKUxx
zZ1za<3)NkYy;?-?XJE5f+Y7w$qkMj1Oxw)9b7w(sOGWbh=lhf740&p&nrC(=-DGBz
zH3K|NDKA{6$;#0en`^)#oakHv0g0U3^Gyu|1{|{J#`=uk8p&P9hmx7aK8P85wtQFu
zWFj7MfEI@<hY{}+GTkTet#dpl`Zh!Hh(MIkfjq=vR0H5Ys6Qr1C8gT#(JGpXsf_K&
z*tc{;4a-g+YgKe_Kh1ghh4ZeB;5XBH9v3s4R9c;SQ{$@TXs)$S<?QjNnY{zY^>6Z7
z61;5rFPawr@VtDtEq8z&-F>qDT}z$Y<NR~$_2Vj!W?-ip<gTAH2e$AynUfSxP4@1h
zs+qYDuZ)614-Cx0$euQdcT9DFc;__z2dyHrd#=LI6meg)n0H6`O<KfDtnEDMur$U;
z2Fl%OBA^uLGV(tee>V7b(j-u#ilZ}@2xNb)aB+mkmeHmn@+0)wo460fxs5JvEdV+b
zv$G9xotq{M98R2C$N3>8On_OF(WDl>et}-A+Mfm7%Amji%mQkHIC$H@rZ3hfM!z=$
zq1eA4cqWUDR|5Y;%&<?LjNi1B^=J`sRBPZl@@5Ytq;DzkMHI`&lRUbm%1^)N^}(**
z%YIhsk4y@o%a3c+gdcJit?kq=F2vA?R-GU%OE4y>1{-#Qn~=OsYMP!{kFAR{99eQ|
z!0@eQ9Yd#aQiRaXo5ri%Mx9iri4ieUhv$CYZ+t-q&U&P?LHZVJ&F++fWm&K0J359q
z$WI}a>+jQFd>n9sN(7>PnBna{P9ij?$7AZZK~26(rc+fv+gdP)$999B|1Ko!IS>{-
zRs?pYbv+xjrVfah2}OcOXV&FejB6ZYOn~|P&}_{pc<IA2Jw~3C%pFZJ2dSHr#!5xh
zhf0lVWk^uykv@k2yb{UFrPeC2+VUfsar>h3vUrp#sBqpB(|Q-M#ZSfzE7P#YdSyLf
z-MBh3`l?NCWNl&hOB1U=NV{C&1(QN7TJJfxA@#tEu?>0*0y~So`B<CW99wjvr(4v(
zK;HI?F1jdXu~B6~B9sF}+Z*s_wz%LYfO;M^ctPq}QqEajZW{d&8|Y5r@EFrS1{F+x
z2i@d^7H}ipxgJ?cXtlt`DZ?+<tibE@Y}b9l7q<CfRpz5^fW>nY0~rNVo7bB-vBAP*
zHL9{K0%9z-&R7L`ECg0YXk+3CzrGcFDNmTGt20fW3;Mh$fn#HKRnfGo9!=EEo?yzj
zl%+fwBtPxnbrPE)gdY9yeGbi#<xB3e{i^CRvzVcSVc%l9=cY&VwFqN*pC-G*C-JK<
zk0<7*_dI^H=)QSp(q%r-c%TK1@ERu}RytG20p_^V=-9hubrkEUgsMUV`-W2`Lup89
z1+Vpxn%E90!SU=t&;+*p+k3!ntL>4%Z_`^g&vUk)G+M9bOiV_b`$ZT;zwoOlyF5Q9
zFYwwoc~*5*Y*gkhyj00Rx(RXdtN5D?%tdtljeFMIH(a8=j!(COfETSIXuV2gPiJR(
z#rB%V63F^=ACs-G<wwv(&k61%318z)@LcZHn>;8@)iLPbkEbBF+;byuN0FLwo0DTU
zh07Iw2W`IA#)ORu1N(mX3gEMw1}9`ckSmFr0uJ6;PldN}qafU`iXNXvHmy;d7szs4
zpLhsXO?25;ZwJ>3Ldo1mB)lyvQQtL{Q~jg9j4Oh-O#W=@67HBU1anbMxZir{o!i5d
zKQSQVc5>2Zpd=X%2j8TT<sj1*PXYeZ^FuVS|5-<K7WH|mn_}*OdcMT@iU|n*v^R<%
z`i;kuvDyUEH*t4$X>)mlJrW26Iem@+z_uG@#Ht<B0IV2OdmA7$?xZl|2v-7l>_G|{
zp)xa;a_CKs4>4LQuJFbx==(rem_Yd}OTqS?I}qx6mNIthH*P%}t&s|Ro5y}xIe|U1
zJXizUa%DIR_sglt`NzTUFY_z-A;hbAZ(0w(aGlBu4*_}E^vhSyA_q=VSke{7ra{=6
zi{$g6ip{J>s^`2Tf*AE~I6oUQ?TRuB-c~iwA|t8vCgy<*h;Y8?Hg^O({QMBl)(n#d
zH4x}Nc>G-jciOlbw2MjYj55<Th-u&gZC6>OB{+roP{MiSjeoh4^Wa-x9xuOB?(({-
z$`m2zbT$gn8LJYgYz30kp=5PQyBEHzWSyu)A##Tzv(9k&1JTD4AJkWtl{0%;*h(F!
z{KapR1|U7yZPCtc9G#ijUCaYAF!z4ebU7aBkta6@4^gkRK1^vL-0LvSp_h)3AS`dT
zT}n;X-pQEDh8g?POo#FYHr;xCZFA{}OKeOA(0#W2xd@ey5k#~zDGi*H*l$977k|h;
z85UWQyP0{8Fc+CLH}~%(6t3B-B0RZ(DIMOs_ftZxD2&>r+3l@!Px!jc(4)!0x%g-A
z6y)3Ob~JT9{-{ydA6%h&Eyjdl-Om$h$b`(**~JJ2_lz0upUucFX~M=PfK;z$jp%A&
z^T;0nvtp*%whRo*n<%M4T9inxhAl0|*vDqdF2Y?q{anMmfNuf!p<RLT+EWRfiwko4
z9zj{_+y`U}ewH=M^FzTjrPyotVHf?u5y*N6CxuJdAltEvo@!43?F#0Y?zsZ%kmE2L
z%5-{7UBS-<IadyzgFu+LuEN7xF$)}X4AJ>*CvIK-bk{FSHguWsf43<AcK$Zm%64tP
z@W+Jb&fjBtSDj$`>;TBv6@$KI5bVdt-=oKoB;MAk1m%)O#&<(NV9Ati3i`mSgD_Q>
z6_3usa>VgBMI218_f}n|?5W-OGEYnC>w;xR>&Cze88O>H2u~<Y+`u5lfgqjE9I`l<
z>N;u<!MR_P2^U7jG>5%XN*^GCYv$*?Q;g?<ZTxi7O<RFKdo^QM{D>y2j_XTxP--Wz
zyHL%qqqp(3l?WKRk_*DRB}Ool*{@SYIcsL+O!nQ?4KpfzECdFzMs*KyS<WJvEg3f0
ztoZx0o=2UMCx7Gm+$;bQrV%X~KQ3(cGlLed7A}mbKar?<ZL7$*c2AmrK3Jp8ypSYp
zeO5K|X={Vx(Oh=(fZkld$+&JJ!|T2zmge(|p7UnVPRFpLn7juTlM$9F-2;>~<2GYg
zqCTsGG>05RRoFY&`b(lpX+5d%r}`tpi{{W0uOPf~o*kY@v2suF-lFG(eld5tHILkD
zzj8=AkhqPk0mm&vo}X=;j5szV3m%T3KQqi!JnInb6XzZg%&0X+uPE)sWXI5!O?%G)
z-i^B9-t^AXr&e#iv+cUWGQPkNg09qJM}vYP{?g5>GhLT`TXnLF_eGXo8>z0nLXcG7
zGdSsH8l)Bx)4l6i0c6&9Z-b2%Mg%52iC&1_-z$i3(JG(HVCuXPX~OP1;V)*nji?-R
zoa`->uw(kwbPr0Xg4lsa`O}a{YhHAR;qmjxBQ)mbZQ{(@M|Sz`qR-!Y+hZnrH1R>q
z38su0p9ZFNV5Q%GxRRrN#+NjNS;>Ty>e_A2F~h>RyA@xC?e01nkn7rkr>`Exc8L2R
zV@4;xE!wY-=V}K)2~uSi@<wlifa+l%AKWG}gY_;B3_>Gg9{hQ)5;xosfAVEmh{!7I
z+60Vfo^(jwhj=#FkHO*NbEvE(?tTNh5RawHt6e7S`hAq0n~5`U1jexi*W_HDjFZ1j
z-hX`hUC~K|oQbl!v&Zk9P6y%<Im_UW8nMR<8u%hlZcN%hX13G8!%t!<r#xyjqoaa8
zyZ<m(KAKvwq|ZNL^^Jz<zB87bW${VLyEuDWucl?Q3S291VO4}XC6j^0kEH=2Gx`Dd
z__OZQTBRQ?ZEA-uftRyIW3zt{?)NR$7I!=R+Q=Rut+$sbyD!3H85aEgc4~Dyt%*7w
z?&;;3zF{vS9eFt6gNuK)=c9(nYu*6n<>7JEImddmY;j%|3RZNzzxoG~KJPXA=~rf-
z0FPcj!-Wl=*Os*aU~P#}KGFwkRM~FLBTjB%-kdrkxJbRu!yWgqDFH#Y1KJMroj&Gx
z6^DS%y)I#y<NO@EFi}Xb!M9+rV%s>ZgP$q?a+g6Ue<Zlr*3TmsIMB=kqV(tmN;Itm
z)UTOBe~BOCEnm9@{~*h|)Zmg1nR)JQ6tf%(`^NMtbPS(fHO1@FVo!|y#rJ*b-nk^&
zi!zWHTmANC?K@LXX##x7Z7p>PPfO{Q>N@E==F4Ot_?dh4KHNZ#BpsuAP23zxVr>pZ
zB@@)|sVr=tQEUe=Em(4@Y-WD1NEkE(zeL&0G|Xn#s=^Q%F)8{R^hN8wMtKyxW%^Y`
z!j^~fY_D~qc}pDAE-FZ6S=^B73_;vtE=HgrL1jj<dpg*9a__O{kFmD~UlPWdVf@_1
zzgYNyn<6+O?yk*kJr!w^zn)mBShgwwe$ZTvYcx<Gq+Jq`W1**2q2|UdZyLg|(7!y3
zp`aoDt1$WTbWC4M57`p3Xr1U0jYAiZCO*CxuF)K=BB<1}ahtVgyu;IB*9_74eYE2z
zl!S?OWZd;M=E#-AjPa`Y$)68j{TA85Oj8ilBXwyx+2gu(La#R*p*YYsc$&DKbRvXW
zkAb;}B~_M4l*PbR>umTEbF2Epqkfj;htFkIKzU#!61T&7q9Yu4&%lS_@yq+?YdnZO
zUmGj`h}p??%4IQXa+rT*xYb?c>F2G--R|4?cj0Ld^)J7;HEY42=h#TDPK@|voErD#
z5V>KNZs~xIWOs*ZV!?~`(v&R*?>`^OGf)HGVOqrVs^n|Ac@x#k%@4&2NRj$bEaJG8
zyMw_Ux@SuKa98hr|6bz$oTvx%_L3SZ&4;9NcO~t85p^pUkaWHMs+B>a&&IAJu9;_p
z()yN+dZ=Kvr1yj*9%U+{D*FC3dZv|kRAyQ{o|J7D@gF?pQeldkPxL0(epPyi(Ywq>
z`h+D`Hc|{d?OUZpX8hB_soRnlS(`Ac6AJPh&8q*L!QNC*qR!B-#Vghj>8x)-l^`=!
z$1&i}9e2_#(k|~d{#@6oBzis1LjQ~b?5?6I&0tkLt_y}w9}F;s9uQQc<lY?YD4G9S
zZp{2?m2m0<^_Mi`8RF*N^Bj${sq8uJ<FUeLX%?##Kh|d}svSBNI~78zd;jPa=1bAZ
zae-13Anxk1{0I-JCF=09wYXCqfKJ`<LHkR$$Y!&ldPlwLbM&%0gy2%g2Nrj`hks&H
zcYSH@w!seL<1_DyEO}6dKf0X7@?O~50^i|znMZYs&q~iaIweJB*5<tLB=S$HVn**K
zZaX`Vqh^D1@SHb_Bpl{9cF~{VzGqn(9=Y{W5V{)c`LizSl%&9Z!VSJfQXUk^EqU-I
z-B_SJ-X@rW`#AoT?8BIwAz_P6m7~G=+{gyk5&rE(!0+3v;{`g*p|=IY6J6F3K|hI;
z;(yhtk2UYsOM7?xi0OBOf@xKsC>l;pujnOeOmC-xnW@C0|GGlTG$L$e%Bt-+Rla$?
zs1VI;jdJFMBgsvS%g<JJA78%wrgh($1@WD;hPpI20jXlk9)m5##i{wDH}rAJbkG@P
zms6y2+R_UcF(gz*431@CH3-JVwd}LZ4GR(*X5<R3+FdO>Aw3{b`<RVK$TqG&nED~2
z-$zHwB$w6OY^WSIB1@NumHv>eIltp){AJesM8az3)~<rFJa!Tg89XK?t-3IF|5PYx
z+8F24rN`G`1L+9<wmz}aXm#e*zd~#6od{j-b&t5r^SfX1F77OOd^7d}qA=xY6GY3q
zyu8JvU1Q#pb3$1ee`F=_!y8Krt5l85D`g(rP@fV77HMD~!(V&|+lLytw)e>ddj}+u
zu{ia7><rL}3Vcl+Y>1N7i9-Eu2vfy`(SQ}_Pch1=EywJ8k770UtjpSi6TH#JQ+=i@
z%-R7**^6>a78BkqCl!@uhFsSABH`tGJm?iaVNR_S!*pH|?v)N<dQfl~>w?MwUNxK=
zTIk)~($DQHU{Hy%axc&dW?ZTPJMWVud$7MUA6pB`_Cj|DED&fp*HNSWQH*4qs4vUq
z8LKVICSX3Lhp#hYky?E-q35h=DZdpkf+L!u6tc(4o=z*tmAG@KSK$Vs3DlownvhZQ
z<}hQn$N?F-yyAc`d~ik^i;#_WJ1t2wnU*%MV?-`*o)9_>NAGeRNuE;AbOvKj60)%&
z<T9sNGt-ZrneqqE(5bUQC~Zq!G~0S3@ECWwwsXG<58BEm{)l4yaW5O>=FUFw8>@rx
zrTruXC&429&doH7E@>W~5Xr8-btdaLX-fQqWC1S_LTonjMNHmEYy|DbR9AYP;IrP3
zEFV~QW-NrZKrFumMY*ZKLR)`wPEathaQ&_5_<`le-Tv)UVOVOLOD`<U`wBvht)Zl_
z3gdk45t+<0O7T5(N5)c*^<|Gc%}BljH!1Jj&Z|RwmSe)}LY6@idJfAtHA66g=_U`_
zC)cO9A=z@0_}()YAo_G_>nzN3l(Q2X_zf>l_`;G_N}@OMk6M8FnS^ZSJ#jg^dViHE
zh&KvnZt*b2Z^2K*LwqqL0k3t$>O<L9<H-_Fe~Udi#rpJnqFTaJ)vKFi{5_-cC)1Y6
zk7#;M36V<6@6Mo>de{tadrrl4qV&wB!uBZJqx_vqf4iZ+9-@1q`2*PZX$kxd4gJyn
zi&_hyu8y}18fIoGtZZZxZ}g-sxyv1N6De#CYUjFz(`0asox^hr>lk$ouj`bPriL?^
z6i~>WGqDYWml`ffQp?Ke*}_}tBdWc%25{m6NF!>}5ya7ji>5DE>Y(ZRQ_gZAUsz$`
zUz=pcJK!C4$~*Gj!40_)rqqG>ZgbM>D_(kV&y?f0lI9>+_FZb?X0c^o=cL*sf4YhO
z-}j^@j;bUBAFE->wcXx2$nO!D5FHm59R`{62Pd>gFpLzifPxF}yg;Zx6e=Wf8)Eho
z%}%4_gK|i6yIGR4^L$yDAV@FCx6VJ9JJpbehbk;aK`ht%6@sx_8hsB1ph{I|>6e<G
z7n%<m|5O;~CLu!IWcT$Es9jb&H*T3UnF^b#?($s2seBvEFLDDs+X+cf)NFTc0cWX%
zxI>|d=bzG*3t6`PZAt#jL+tINowH~3SNd%>c(_|6VkIG$SqJ;Ilj}0QEW@$c306#B
zYJG+)9~5xQj}|bcTy(Xuv@=-7c84x3d^^QP7q#RY%Z0*2Nxb=Ewm_JcMlz?A7C6ee
z&*W&!RHjtB4B`*gTKb#rfQai;R@3uNb8WU=S6o5ClO*cmKm5tpTNGvv3h}){iVlee
zEmFaa3RwfxI+3`3a2-QjIAQ>?`>t+0jCaQlaw>Zf_qtv17usRq6?(eRwY54@5Q}t;
znvi^QhqEig2HzWeDVFzeji<O76KP!UAT5(8Gl6Z%Xz^|H-dfMg_@ysy2k+bB0#};8
zU{ZS%z{Gvf5Z_9gRf+hsU)6fS5b62(42GI<azEA@X~^;>38vGta2Mf`8W(g=6V8@~
zNi(rY&!@tl80g;(&JRA}vPhOnU1kA6S-bHT$0daxm|)Ik$EvBC0osUZm3wZDP|an}
zwUMiZEqdzOrb;FK#{Ra)q(KBk6|1hW7O1Y(sqyQ07~9OYg3VCY=u=sJ3e$z$`U>jK
z!ddc!2c(X1^AEg`vNGzLhE)L-{%aH4DUkY(I<)I0QcA{T&gWh}|I0rg#tOPGz0xyS
z1l<m-4BLcfhQx#Y2toCzuX!wEwmlKuiH1|y7VAm#>|Q$jpa=-t0L|XOy(J!oqXk|o
z+V~Tq;5UYPwhU4NAFK*GTI&}B#0=W1d{mXAl>H-VQLU%~**;86FW8l=^fcIL`E8D~
zpMP+eA3^ew2Gzzx65FfMX*d$OBLltRE|nU6I+4sG=mTYHCV|kb<oL7o&aJ38&oA@5
zJj*Ggm&_mWZe`@CtT@n5noc%{QCB=Y*rBU-$V~Yb^%kj@6b5xIZ_}D^Pt?AT#J<=W
zSWD@~{}dzC$V2`;3R`Ug+gp=5DHyg!#*5fxi!9zQu^9g;x#in6toCFPVPYhmY=b<<
zzL9bBmMMe0j1*1LK}@QGQm8&A`;nhU!C32(WVbuAdj0P?rSn|;aR2;tz`k3CBE2NR
zDw9_kY>pbq5{~yHTqZ!;p5|C0y%|eQg~Bt)!_ZoKLX1LPo`N{5m&6yp&w4s)DeUDL
z%mlfPUzm1+^Ly97tU2gL!f9wG>+PoT1|IA3IX|*iPFt7l1&;SAn1c!nyn|d$Ih`NS
zlX~gvO<DS%r;iV5Fg!8&gDA}8sMI@1O#t$K1}`s!7{&L_u8{dm4L&(9YkkM}WO-5@
zg_A@S>!o)w7}WRGy<c_~<aGO$h#&Sbhr>e+GtSNi5B&L=r_QUnm2E$}6lXb9%#zz2
zDp~aG!JA%QEkwa()4Z72_!}ye)8@(924FGz8r4WDgSHE3jO2|QTjDm14V%}acfi>_
zZBQz|pB976?HBxkcm!;X6Sm<Wl<qH0<L`NKHCG)&24Yo-8XGm9-D_iMb<fKbhh;o0
zCiDBz(R6Q8(y#c@3oNa8m-Zna#uU~agO}wlilffd!Bpj8IXsqhAiUkk0=?qe>N99z
z<e+m|DU%cK$q><6X6=hzLiBSl29@D%GMsNSR#Q*Q!n~d=*38bwm<nR;O4d{Dy5;ht
zYhg6=-6Bv+$AChzJO&MqGErXT*LsswL*x<6*VYlR%jKnE_#fqghh3*rIf@LUmzb4n
zc$(YU@AzLhVs&9A@umv(g0n5^rmdA}gqFkh7dkP6&m4TU5-*M5N5>;hZ(Pb_`jC97
z&-svBKXbi9EPD6M7Cj#)M$gZ~xG!Q={h%jJz)=D7?NLE9Gond-`z@W%QqVUZVX?IG
zK#$_a_K27zi*lD*KYwj!Bf+H67EPSB5U+Ogpl(ae$6_#%iXx>4`fPmqcYj-14e*GY
zhitWO<I|O^e=u$xVeP%1PuK5Vw+<PPaEi-GA!Fv&Fr2t{H_GBt&uui2n5e(Z^RACJ
zMwnylZ=XYddbwJ7Wf>{cU9?#yMOl_4U`KUpX~GTL7{eK>rdX9t7s_I$MF$`BEJr7%
zh0nA&8@NuF?fewMBFJbGTh4!*={1j@ckueUqh0UiDq&K@5=^)JDor~Hh#J;CnY*14
zJB0^rkMhTlQE*4xxg7)Su;9q`23`>%mV~9<rJmw`&H9nUxKoMl*sxUjH-yI3^CQBO
zLz`&>;W&$vdKHKBT>2{-W^QXmmpqMnMA+YIGM9-uo91{Ve>_AlB6OS(hHkS;59ZXn
zyp)OE{7>(?C09JCR?c<N>08A@D$32GP&y%3^Y!17Rw?`Z2ry%s8d8PU#Os^KrYYR_
z$n#u(fcW0L&p{vbBt@g-(d!{79eue5*yL~86+=)B@w;MMF(6<6o(QDt_;HQAwYq6J
zOKKk2V_Lx*tavQA92td0R*SClQfHAX4%AO+Rc>U~-Z?L3BEYedoJ2C2m~e5)?XK!l
zYx070l9JeE<WpyQ>S#`4`jy%KQ~g2k{d@eZJgC&Ak3XP`I2jgT6hhiPd6$1U|FiS)
zmD*f7+TqMx&6TDpSfBjTt!0~or+)6G>a^zeAiRtG$8-fKabLDv@W3tK5$>Fq!4mFu
z>Z-_mlxjRW_wg`BIf=8{FrJspYo-<vM4%seZX}VkY5pyoZ_V8auX{3b3&efp58}XL
zUf@P&aM~RR=wUHtgqi1c?U<Q$`IA=_DqGBK?xD9IyXD}r?At54EgBoWBb(%Iij@pv
zQ5K5Qc;avO1oCt`s<)+s2Qdl0L8bt<X1DkrH6)%4W7>(m53@#43+c<Tdsn|BgK%<I
zJl^S5P1#Gn6hDWlSI~7)eeIRt=HOf;pWBxyfAsr|1navRHGQ;nk`?2y9}c-p9<s#g
zr02wckJ~R-^C#WVtGE&WH@-@?3roXeuBSEC1?O1AYjCuliJj9iUasTu0qk<ET5t8I
z8}HL5^rhJ8SA4Frulws>pzpr3B`fl8O@MN%4@-R)+K$kLU1Sxts(RML-kl%JLquAA
z@sA{j|0oq_5*<Q}ZE`%D0Y3%9?Ndznq|wimP=PwZj;-0Mxm!{xxXfd|larZ{Ds4nh
zB8OG=)j4<M=VZ>6MtU|jr{&?cyR6yO86OFGh`bO<QcZ^<_1Jf3?lr^?DATBET6OdS
zdJ|_?DfX+1{n+{I3n&Wr6<n;^R};c6L62Ta#<httqnv(%FekAdROYTt5eC%frxhu9
zV4^f`OFm_$-$_wL!S3l}(4HFkc9?NX5GH^Kdh{XPk8TUH_y_rQ(^@K${a2*5m;K?x
z8`lebVQ-w&>naBKkmuyFy$L-J2KFL<Gz-gQYT*vXHpx)7Ywi3-LB+0gCYFcT6HrZC
z`ctkk>fl)KOaIndri7Zy^EHW7%N=-NC4Re1JA3#UbPi!?)IMdkWo@YJ7m(D$)?bz4
z#yF6EnH#m`9@5kkWF9rUYM{)@y_ReqX7+8pN8kW2*4faO8Fb1rX#(TR*qX{_65<kO
zBRS$Y&6&YF6<wSQZDozJb5>z8iaeX>0j{hVBqM@LB6b<e0)0oCbe;_Mf~65B-3Rb0
zwBR94Zw3*aILhwb1L)r2_xBd^QdoyT@nBM({_IFwGlif<6Cd8XRljobYfrhH-mAPE
zV}XPUpA~F%ag1yJA5k}zv;7@Z?ZI&Hjb2yp!M&<NTZ<|cg$RvJP4cJE#4h0v%PT*T
z+o~p!?1@&oY|fx$rnvj>|EQ8EO}TY^VNF@i*fK3gPCFYRq}QjW*#$9KImsN1A`FX%
z(rDQyJ9^R|_2g)WE?2p7d$Ui~y;=&eaC<pzl`h|O?$#*EHn5#rb)-ITXU8>r{ndvD
zJVWNhsu0bKZ9yY7nm*n~zMoPwDl{j!e96Q`k9EgnX*xj}YYhb}H-Nx>0+;Z6{Y}Z`
zL=bi4My}GIIem3X>u(F(?CR$|IXlE@zF{i|`$VLyQ@cBOouY8;6q6`cm7AUhmPEi^
z`?@=9tjLg`cb#pM3`xxIk05iZ_}BUcBMg(aM&e6}c*N!(#7OKt5gt~Uqm$$8VRdrL
zDgJ||!1t~82`wjoX){2r+;$qfmgP19GO3FfFDG`soIOXlb6?~^1=%^z8hG7xq29`q
zFJTT>e2ZsnR@}-YRP%S<R8s6Ter1hi9Km;vL%;ntnB$|{xOgA^Zg6({w3;`L><?>p
z9B(f5k(5yRDCV!3v}DG5gD#Cxruu4BTBp%KZDHl-5r4bh*9HD{wp-W34f;l9I}S@|
z*==TBlyy0_H%y*_tA1(X(WH@kEq1#4KFI2cz4Be{54hiN3t8|N?&Kd=+O_i|y~6Gb
z)9-7T1-;&i;7#%yeFT9=4g_`VaiOsXin5)l&O3VAYf(bUsx#|?a2uSwcxpE$8EGY1
z-_0>BVmWrKSdBU=Hg}lgoUx5CE+vi9Ivn0)J=ibIhE-z`G4v<$aybnZWGY)UfR;*x
zH}T@Wgzb*a6-O^1t@757yw`H&q`nv;khp#w+u3gg+dHC}7aHVjyQm1B#a>S6f(uZ4
z5IO%a_i5i6_HnqrV}?f6Se~~cH(Chf!S>xbYQ7HPY;{nn$IwO<6k<cQ=q;><<ek+>
zx-WW4kHj>2JXpwoG7q9r=aZkwT%Xg!bt4y4F@+e;8G8x5*e^RbC!TVHAa->kJAE&+
zP8ZZ3=WZR9gBrvtT`3;fYZoNEDV9LrNphokEq-zqC%<ncy;9_=XxZ~BVo(PjMq#Rl
zI;)B@i^61Hi0vv_GF(ez*$OG#syjMgYh?89La92&Y&UL|x3ev4{&cP>Y#6B6@K}V4
zKPfDccWnEFh2*-f)EcP{F{EyHX`r9xs|;~vQgfqk40GF$g8p_B+pdt6rCOP~xDO)s
zyBq3`kb&q}!z%oji=A?M{3tNK@|4R<Z7k%gT3Y>RYP)<L%O~fs(24eF;lY_JSQ*MK
zj4?wkzl>iebj>=MK8~_ul<u@SSZ$2EfZa6<%TS}qDD04s_mkNTc7svnop7Ag4OCG}
zXSl=|etC7@nwE=}9n+HyxOfhJx2H8VJ3YukVlk&2Sq=U|5%P;Y`AcCx2|pfRx1wbw
zIZ^(@GORKdBq;gYS0-XQ8pSs;KI_7qQyfJnD#6=HEIM6mb;pbB34Dnblbz}M?djKv
zq@lQ+`oV%dPL{+<P*9m53fjbmcH2KMWa4O;z|*5`a901mXK>GQ{@i6<BD3B(6bB|+
zb^JM!lOHMPMK+(~VR(hFkHfFuNfp%%Id#`ulJu(@^r<fSO@@h0?D0MBeNq};1g&w`
z!!o)<l+5|A%}gi<oQ?hqPzEz|j5wc(ZBwLoQohgj)7jJ_czN3H<I%!>^osuk>vo?2
zn;le$$1v9q>KyrG`*<4r8G*Zx7t#c%u;I{q*{9k)G!-^2wzlVW(9-l}DY2MnS*2n&
zkCfzIA|9w%$JiNx8RejpSG==Zxo`Pw6$(9$p|&74?Ao8JK9gHg&?9_*XPt6ga@p`H
zUR1Qa1+#`tQ?AP<k^lM`p|BRT!fqsTq;c3Yr)`Ezt@-W}WtOfb<>J%>X2}WF2&x+i
zWf<vVa7s5nw4^+n$6`nf=}md62=lp|8T1Q&J6<WZJ1`?_%?*3F(qV*As+%`OOs{2G
z9pjfwA<}$#{(|XY*qLDcvR{y)cUE75RoS)$g?H%7rZaTE-iiQDl`(~AZ|#@kD0dnh
z<|?d82)WI@W$Kvy9zO!e0=q}gPL<a42=tDA*GU<|mRg2-ar9_d?DMaO7oANd+2+0z
z=nWCb2F~s6yhyZ_HCg|t#H3}S`_wb@4(cqhV5X%eWRHqfwPIOlsd{)<!$_}!*_nl#
zLUw2ULPf=~dMSZR$_NK3>J@7KjF~0Dv@UQ&;#7|0k49_Kl@#9cTH{Gua+E@!u!NmJ
z%ER#Og~J0c?#GDsRm3hGItHQ=SsC*ytb>O|457kTi7+D$YIumG>B5GJJ7;}qo|eWP
z4M(k4n=S{KD{}Ej)>p!4?1WG6@pEq{k-iHxc0mlUFa+X8;|kd!iORbv+_@3unIh~~
z-7n{Bmw&s_ime1Ufv7iyor$~*>=xrVo#b9~jA{0>Cs=ycA2x_oe6<*?`29eWf(xeX
zSc<)1rq?(n)&bM0@UeaF!!fFG`3A-mMO%Z~I04U2kJlncVk>O_=%dAap>LZ*Aek0!
zp<ZDEY-VA<c!;WhzcU5`{QE67OKm&>X`};^Te#3p*>HlWTnKs+PEDsT+SLoeCHuj~
zBZ<^DX>?iiL`++pg&#Vrf~sikg$KbC+n2!>{yN#aO)2MhmZc0BY!+)XsSnUHxZcPe
z`=6jYo2BuW)OD!u=Cnv`9j>4$|H=15gAL|LCZW&@TrUfO_M}ClN}d64;Ss@Djo2k<
zx<)w5T8blSjV8Ejia?6xX%>k2cW7CmVFP-Y3<I(cL>QXwV(%P-Lhs*|!Mss|hewG?
z7cbu&x={iJPB=;|nI<p6^qi7cyM<Br$vwjcCH$o~Wpr5JogJY_TKzoHhefBbI9W0O
z@6^3h`|l}!p6U%Q1zvg~Mh9d0SFC$G*1#rX><O+CUN16D*Esi!Uz9n`EziC`u<AR&
z&VN^470k5uOFhwo-(h&`qWe!sYRpJ9X0=t%n|WC|J`vQ~nVVQc9t7;9_4A>YrwP6O
z9YF}exxghf#!JF$kV+1F_og|RI@xqiJ_Cz+_glT*8&l%G;1M1^Igu(5i3F58--qm=
zvSx_r6~0XuW_=+x+n`^oX@JY#J(4FOmki1W*jH9EponJ9*ao__vF1y}bP6I(){dXF
z>Nr2f0u1LHQ-9T3cd?7BuhFC9V|Cm~ep;l|GPkFPL^&72vLbI*(h5Dc^vf6H*0Z<F
z!#0XqSNH68>%Fr5ou%~dLP}CkpH5?Q5zQ$tF9(dAOEUFXlj+a<$msp4l<pAc++!o!
zQ%4Uut1!-Nx>FxuJFJR@dcx_RMWHnG82DO4#EuA?P;O==)B_bve>xvzI2sK;nY!w?
zPmOGPpVPqZOn;=kxlBz(91c#YRPq9q^*L(EX;f`Rce4slD3=V)F*hqmL)WNr`O8yd
z3{5n_2L+Q8zCcM*|7{u8KGDHWSK*`Nco2P=-Vd9~?t+LY!wEbYL~Nl{L?m<bpzHy`
zKACmIV7=0zm0|n$<S;R0Sd9$5{zLvCwHej!?bQ3a9G6d<x8gIF?}eUxIPQ}B(TH+g
zs*|DmQnTDh*d<MS-d4?54Z8jWOPY_ohEKi?+$IhyIE+9{WGYRqhawyr>ax{%KaI?v
z<={O}gNfzWxv5uD_e`y2vIv~7iSB38uS5QzP8*7oU?QUeq*Kn4$H{Q=34=NMPo%lJ
zyYMoqIk?@%0)ql!uo{OU!H!wtj)5ve##tywD5c3&a={;_1V%>+?=EhW$4eUXy9Mcu
z6&d>tA6RSeBblLvjOp0sWem+z)71u*;v8Qdc+eFs;fn7rb4jAdjh~wk@br{!r}T=`
z<68UXUZUbrufL5XTw(;#>WeqNe|}IxJ0g7gnK$*(5su3B*mBjBzlR{WvaMH@nKRpH
z;xgHr2kE=Hl4E0--|ac2lkaPx$~ZS;ZUcLJ7@!vYZLN;}TZ|qfe6GT=@)L&42tm+u
z-`+xOduwTRsHm6&_=rKy#SrHG$;VF2V$ho-9=t*=xW<CG9L20Vl3;%8^#YRT&9wLC
z>lNvMahW?<?-07H1!HLVj5rb0wOsg#=9=^2@cq1{%(Xr7qcLf{B|MHC1Hz#IoU8LY
zJ4qn1v64%_4nHEd>FA5dm|y;CnQzN1OV7ZjvOOAFCDAq6>7A0p-gO76SwkirhJBBc
zeXawCBu`lqD!E0%reNuATF;TvG4_ch|K2GCGkLn(m||MYCd18wQkDUw18T~_<vi%R
zSFLG8g}}rs8gmOh2Fkx#qI;JHCS+$bS(ccBsWl7$@Ld?;%dCyN_I@W#PZbY?V+FKF
z^1~+VyqlO-A~0Rn>|7QsGAm4ZoSV{7o~As5p{W$2&QZOIml6(GSA}K5U2U!_I<Bg0
zF_s!%lF$2{xK8gdQKgx=nN(Slnz?lpF6L-x(oax0|7frX-r<U55x=qH2XDLk;<()>
zz8mU|Ap=9cx~lhG&};sAvRXt0r4g5nRNST^E)7QEZ+&hJuz&0-$A-l&w_PKX-Jl~r
zec!4=skGR*)&0pRx$tY&$#;3p>(S6WZ04qA{m@^{lX4WSxFY3LM#6B|kIObc7&?Bm
ze6^>m&*58wP9RXw!%yi`w{4p4tdd$&*VTH2UeAH<{aj%Sets9UU&b0I-E)~5D9-CR
zXiAp&e#xu<4<$ZPp0*;5OH^ftOq?&p5lPi_VZ-P*g{~GDk7%<pE*&*;$}!q(9`axi
z1kS+&Q;&e6f`U=qtr~1HJ$Juh3M*hKz{uudK1RusOi2eVO77oOme<{Z)1P={7=KF@
zbWIFPOr2zFyUZ^#5#H`l3H{NcDs?HM0N3<GR3amPEw5zOnn-fdUmBHtt<d{z<u&+<
z5Qr<8G-7}h#C&ch07~+RO$t%+tQ?OjdK^n^ZTsbZJ&SU@4%LRC2Q-Io)eeVV81rK#
z$J-kf*9;ogo$})Gk%GAX$3^`aroesuP#=#<(ln!KW%9Vn!=1*1J(Z_?@8UxuF0hE%
zN}RK&{<N1R)3T@_An)h4>Fu0Mi^&rsjUVBQx#(4)f>cD~a*n@RVp?zeoDYVr2$>2E
zVupl;w;a*l4D&os3o|pwn*)xjnxqAeCKA_uy12YZs*4_3T{Yx2+=xVU-=#rHHn}Pw
z+HzvoNgcRnjk_nw*(M{YRQQnF<p++YVc|>NM~SR-7&}XpY}X)UoF1w9vz^)VRi6u)
zMMM~_CR>SQ>&4j_z+vJg=H_!+7j-9Z>T+QGYQ+_16Gj1*ya$uii{w*?oNInJt=e4+
zF-ykw$_-v63c^~+ITBy@+SEfFO^??v(tg5cLEub%i@nQH^;(*R>tfK-s@+qc<|RF2
z#lihk(OqNcrY||CxbWOqrBYr>CZ_{6ke@O?*pFLkkba{)i;GuFc|b9#C`Syv)=T9J
z_hQSO%E2^ai*xT`q6o@Q#kS$15l!b;Y<s`A`=>`Z#+PDGL-Xj+=n!#qpD_$Oa6v>G
zA=p573ZHh5XM#;l<*OpJ&g0fS_4_wp21|AMuw(ld{c)2r!m4X_6wUj+VommBWX?sr
z7ZFk&(CZlI)piK8D;w)+BUj0D$L&!2GpK)CpWP@6_!@7N?!`yv*Xoj*U3gyy8c)oQ
z@L0BetmbJQTClIFk~C|3Y$}wyG0E~nG9JM)^cgCfa_Y9anC8j3-j83KGQQ^Y;Bpqa
z)q9lG`$$!|4zt~f_787p;I@naouzIY6U>fSMLfn!{{|%vR?K|JzH?F8G{~P}YoQwc
z9$W0A9<p%$?$gAnT(N{xa%YXo0aYmI9gN#dMgMDssdK$Z(t8Q{ct&?ZCf_vU&4S6u
zk<!tlp@Q!(j*rUMvfIUKB0UvL9X$d74oVAZVc&Vx*K&-f|4XzgtuTkv#>cMn4*VHm
zZz;>wrjUwl<XB28`)W@6X%b<)K&Uv{#Idn0jgYh~1|A1CBu$^RA7{a2N=OInrn$@B
zRlKP%W3~#}7AK4$H*iMr{f@Ccxj!bJb%j1Ru_o`=4h)q5*Y|x*4k@ciD&dD+etnuY
z2m-Oz?;}L?P4e4)l^UkxdHpmz;42reYgvB_VLUE6P5LTa8{LGPQS0m35$qR{Y};!;
z0x-F{a27Cjz_m$w0+J}uxlytKZHR!DK;2AQfOuXv@B5Nf=_djoWb!^9BXolzW;EX1
zSu`->abxU`hVWB^<^aNnM6^We>Gb+#43fJ10j@pPk^IA7zr&xo$voBq*i$CB0LL>u
zQTxfO;MeL5W*G6QYGn9g%rK4QD(#<k>>YQ-2K3E~etxJ=waAQwsHDsfecVy29sL|2
zN<K+Eh>Q|@ro#vYyQygZ3)Pk(>XrdZ&-7Iq6aBEYi@KkhuPR5?I_T*HLGUOmHa<};
z#*v0*&~m&=0{VeoZ%uOgrt^J_Idmjks-mSCyFJk~s%xGx!PCGxtodZi=YO%^gRWVC
zocKxa^Olbs7%;)PGCr(BM^>;?Q{z*HN6=zR`i?(rq|{8qkHN(zj_F*gO&1?tm8!n>
zNWijXCDK%?=+um&fCI&+adMF5Rc*!%v)cZTo<KnL&xZ%N^=M?5dmDnazu0Xu$%FNo
zc}#vBgzk~sV@KmaR->k+m!!Gn{E5{O|EL8hm3;YUOmglWcoTOc`iXd@W?!^y8ZA(B
z^S?;_wy6MebnrG2kUr_x-{YMd`?6io(`BgTXi@ub`TrdKpC7Dl!+N1-<LsE6Sdqf!
z#b2~8p3MvV^XI>R`05X&dyP--*-H=M4gwYQw@hBjjgwJjUfTCVT9E%W{$F$1paZ0v
zxA@F8nbHREu-<*I_hJOE0wi1$Ev8L?c=@l~XOG(|z}@&D0}Vk;fJ9j{4e(8=U$p?d
z&Me>_yV9EFyYaus2#is3d)GTo392n*!vg4*k|Bn%#%x)(Oh-TwbS6NDdCNxnUG+a2
zL2nyT-LEaWhlReraJXevKEkqpKTB?csMHVqbK)Q+WlZ{S!USzXfE8-te}=S|G6g8k
z`~U;YM^+sueZAwp6ofPJ_H?u0-v!c+2L#%TjWW=_i^<yxRHa*`=~KOG+^L(*0xG)4
z|NoniU@|MlJp>rs^cy>}_a!W$#vb3Yj$h3G$HeS!C-&(;!@*u*Da)apzwqpq<dy|{
zY9B5$4`gir8y4W8^4qek=U0uWf1m8FGNi}=^Zd}kQ9}6dn*SSOjc=)ArDvXQ|JvtY
zLj*mzB_4-8)1dwr)BFb`zVRvG-*peRmj7$d&l!N8@!!1_`Zq}Y_0{7R%;n3I-M|U=
ze|G*qEc<_x^>1G{Zdr`R-Jg-je+AoLJ!=O6JwJ#Ep7~!E`9Bl<`urB|(I(_TEc`F4
z__wbmm_X0ZDX?p$|I4K0-j?3-M9Ertc>c%M|EFgSY@p|leE9Yu|5XjTw@mm40merE
zwP*a>WXz28`@R1%ssEb_ev6F>Wc-Sq(?G?)jh53cKRo>sV13hG`R*^ZFDxbv+vPhv
ze*QlqJibjQdNsCW{#pl5_5}I9*UyqN5*0T#*j(~V&2NUt3|J~W+-p3X{wW#&$Z!7t
z&<;0oq&#T$CG)l6XIv_7CXN#iLs-;JJ$H$c3MX0p1CoEJ=`0Xfq79wD>n!LF1|I!*
z>!YMDQasLa`X{v-f1eXTm&b&sD#$wh<+(OQFyz3KhwrF$=wWOApbL$`cme5i^R%gx
z({}^$12et)n97;nnVZCa4HL9^>ofoGYl8lQr2)&5LGb>K7-w)7$VvE~;uGFBe}pk3
z+rxV~G}+?yI^r*IXk4W{44%ZB(D6Lh`THC$!T<yYP{L1S|Mr4`SYlb;r`Y`U_h(ld
z$Dh*J>dpf83j53#@9GsOiTpaIE!c|s3&>KUfQ%`f&uaf}BN?x`!)btPbElR@u3%fp
zoX*^PU1vjZ?d6LoYTj>!$7b4Lf5+cfF93WqaeCx`tq)K*Q``j>CW;JA&`~TmSf}^+
zD&0e-gphu*qLc0v_A{EzNO6K~yixHx)!(!d^y*gby#w97Te;u9#M0~1oS|gzg37Q2
zYwtpB9YRo_qoaw@dO`;+GxqOTb8DVztpDBmq_%)b*9>bs`fJjpEMJ}Z<z$rFgTC2?
z;Z9*j)82hQ?!!T>j!zh&_K#(r1DgC)C?fgG@}r4uiy4^s3pYwOn$^G+^2E6zI`Skl
zjyB_*_GDvG=q;*$NXZ4XJN}~dA?NnR*AOgo>B8-{3}V?=`_>xCIJjr#`6)pW>%pX5
z#L8dZet7&>_nS%pBxn_l{h@%8f&bcJtIO~_iqVMz_I&y|rZ|_Ce&X3<ru@Rv&eFd-
zlw&@?3rL9j*f-u<vPZI16ISBXj9j}`lW1e3S26C{a$3rd0NG1`zS67aDLeW1O)?05
z>+yKQpXJ;tA!r!PF>BRRN7VjE3gDP8-4JugiX^f|?Xvyd4*oy({yZAW_k94zGmQ+Q
zvF|k)OSZC$Fm|#e!q_6A>?uo@Vr<#7WlbSuU$aCcA&QbL5h|r@iAsx#@Ab^o=l%Zt
z&gZ}1U*B`S=kz*f#%rGYx$kS=?(6DDZZE2xP_{vc1j=KYnfHs&drQRjD;Ha~q$=AA
zkcdk4JNKS0pJiNlv>tDs7M+_+c~couW9`r5yJ3YqYKF;h6A}pIn(-DGK7rEN*(tWu
zB;=t^69qkAh2ggyhrHdJkh5gy78r@A60xc6Ms5#o#%tsB9TTPd*0%c2XHP7N2Rtm)
zw9Z;6rSM#tIPw<0pbcA~h;j6z2If?p+44*M{7(W@H|8-<Z;G3+4`d52r%5qZNlB6=
z;{Ot^q$f<V&*PiZ__d^D^LY-~8=%uLJtza-36X>`OQw2!vuNCMmVex;A%y}jA&H6`
zdR7<V?Pc;L8N1W0$D77mQj{HhQHS5^P}9c=vF)>yKT2k)0)!C!sN1E)koSJ%$0i8W
zm*}Nd_Is6L<dHg5104I2GZ#-R%k)Yv4oO>8QrNj0o?2KXV`s03sqSGWZKjghkLx^z
zm-E8zQ*&hV1-!UF2zQ;Ig(<%Dl$Cv-ed^oUE5VJ6I@L!Os(aFAdCxbIv87!A)e_7z
zOn<4+ripccrKNyoy=*{L`!(DNl865~ey%h=-DQ*H*zio3XT$WsW^K7ktdP!3*yfee
zVDJMF0{&m*?3lRjQqxbTTO(_2u-IzC+zK|uAd=rdJK)taGoHvTUy(jE+=N2gFdZqA
zX_iPO7TFq6puvGy;L*p-c;J}s#sybbWL@F~pvytc&xM;;io!nK+wS}cck7417&=*c
z?c9+@;E!zNrseqC;wKz}2hY9Jmks@~7L%T3A7FhRnqjb2)+L-iN&V%0o{#QR>ZkNi
zG3+wXRU|bB)WQ&oUSha+yy?=SWe+p0Q1ut9zm~<{JC6R;2txZ|8?6E^u>$)-*ECq`
z)}ir<^wGoXQ9*Fei^mQZ%4g$Na&-`{dwUA!c{WV{8Kd10s)L&;IncnOnyL#iKW9QL
zq#VNj_>bytsY3bI`vH*~a&Y%L*eM;#7c9@uU$q*y5v5A$NfUcD>7Iv>_h=pn-ry_u
zf)7A^f*)1g!{%ne`V+392{OHdn@_S(sn>pIy^n`Jl}pAs_Ri)~C4GKn`jUYoSaH%w
zBa(UQwCz8i!rm|5x}os>RoJ~da2a%H;n_`d9j#ZfPR5_Vz4Xb0-XQlr(A>Qmm>HFJ
zecvU2vi-7|25;`<we&~&pk!fK{R`cMhQ+*htON_jP6vI+e-DjIL>VTpsBaN%>?_K)
zX_2-a#+o~8M2XIRjiH|hMBdjw+?#Nn0i6m_r_S5&5C3=_=~~~*E?(U!c0A+VC6;@Y
zN_|;Jl(y`1ZdV*kqZk?gecoXqRCU|%J9~XB%^R?kld4o|;2yrHnr3@EIxQge<Go<@
z-)ma3y{zVP{=3ELS0_y}fkLF1cL%8<U3K98*VEe{KgUWz<Bbb1mp-}nR%3*2^LVuG
zWK=51vmb7vaFDwZ2(2U<2YWD|77mk&tk1-^4Tr)FxA(6j_b3SJCY~v8M4z-jd!PEI
zUsbw^a(Rw(ngP-eh>!YakG#keC7Gv(zi4}7!~{~J!w*CnhwhbDLwX67<Yrz$`YsFs
zgdO@_ggceKZy_xpT=Tq|7r6M@pF$H64pqV0&dmyF3UO-Jl8B|jJsNqFbNAvTbsaex
z#8OR1K6_QuQ6xScVz%zcH9^EB=`JvguFa&ey-&x+AatckYq^8;tdNFoDZv?+{J%|}
zRy>s?Z|TGbU6QJTe!RIn1Ra{VU7C(DktEQ`%1fe+&=%>!9v$L>_BBRS53{I(kM)H&
z(L><;Fx-#xrR4^LiKX(whx|aK9RM?JlJo8TmAGcOEPH5IRzZ1b?jz6J&vad(8eFn+
z(`?Bgc$+`Uk%^<7yNx5m)a((qjF}<;6G1pRqe~tjN5CB*6?x0)RO?KC`?L07PkFa)
zC?kDk)3}W|v(k&1-d+tI^pvq9+%lEpu=h*-E;D@(>DbYp=V%ERVh1(iE;I13(_V%S
zA5IfG7`w@2wao=#(&7j8?K?le-`k!E@Ya#S6&904C0P&|fc2nkU^@Z&xr~}*8I@{T
zrDN9$tpSXeYQ6yuT$^7w(QDa^nMmk*`0gxA&!LroGOgKy{QN|Xh^wqMYK0Nen5X?n
z-x2Ao8*<^!qm$<zOAZ0yPxzg=N}bkPZ{T~Rzep!l$6(67;1z|IrZj+-RLUw$07*8?
z3z<B|%qwuGcX9<9Di~j`YMyd`kMwqeKLtrXJd$6tiv*XypC8`-{lz$G(V3mc?m;M*
z3)1~Tw9GeW^`M_z@(>9ZN6)+aGpmFL*7J$#ce_t+9k`TYS|MV6uamF7sD}O<rGGxP
zXgYALj9Y~i`R$?ti5c^{>AQGkLj$_ty8QbXt%1FoSOYeu1z{?Oc!w#i-MkG0eigjX
zE6WvJMYq;z5DS8VxiXKbIqV!2pc+3$?8@QK`i&qlPInI8*!n7Re#Ah8`idSg_SJzL
z@sFQL{0V7th4?LdO@k~N(=5dQAgKv}(z~+D+#V=(L4s7&t)Js9XB~VOWKo`AvQp6-
zQ<QRpk_8jVn;CP*->YL|{5^C;_(Xi}-TEd*Kck)=mGrB{k|(N!j$PdQ&~jCyKxNA9
zh&876wV%n+Tr0s%QE*cYM@)>0Vx>O!ag52T9RYnoFp%1DAEQwL*g`zZ0^3P^pD3w#
zPVsnB9tX3&zH!H^OTYXRjc%ctM|P-xdvR*^(wir;&iyHlH@3IF-^<&WaCr4g1=a1a
z^_fE!y3+V1TB{sjjW=eY=kE-xt1_56Bv1iH0|C8+1OPAg*{FicdM1@nODkxA8h{Jo
z6IP2C>SIq{{s~<~WXXC89DSs>7>5gYj&C2=kS4vPb06F@X8#mEel%>cBujuEH5IXJ
z&elMEe+g7coXm}`)h|!g71jSN9vuRZ?2~prrhXsIa(g!D)-F_1vg{M3+2Go?r3z6|
z_gnEtaX<g{<}%BxgSI2t5P*;(et_F|?)v*yGQ<8XJGgHB{P6v{Uco#(D%jsJ%IH_u
zU^(qQN2ulVtdSC;C~Oe`+}8m@1rKn|M?ox&#o*fs2cdMI8`J~Pf5+1`*RAxM2K}g!
zQ3k{YMiQa>T#570&#hmdkPZmWCWq!TdOB13va9xiX!e+Js=XF4<>m^vOnJ_H0R7~_
zc<5b9@)gp>`ye|WVxP}J6AD95nuImg@6)n@glEM^n4C0GOdoUL_Nz)?pGb-w$e|dy
z9vAk_=(Ptby;hjr3DI!Z&q!wN3rII^mz>}~8xuSxB))F_{)*zkfWXEYd?A`&dmKN*
z%V$qU(Iyhl*y1tY&<DEEMg>*tR^StRI73n7o=~zN0<P*ma$p;$McM;pNMD{=hu&N2
zJ87H>!wq+ouHXCf(PwBb;lOnd4&M6c-KuVLu`@DJ%yvxubEi0_bOveJ3-i3&%5$KJ
zm7&Xd<c7IsvVEIm3jU5~lycbUF})n*26GAVZ74JYIw3AxrBFA!;Ji+Yzp0xf^U}NB
zYjFs!`X4BIUCW>9JmWthpU5zus<&7A-mjTYxjjrWW5e$+`IZl$xK6arwhBFQeS7|K
z`P?rhLkIsvvKLPHj+jP!L+q{Lc-?5tZhcRkIh*vgZJIrM?FMeAtC8M8_ZxXAMLa%-
zxeI*`cJa?6t?UphU<j`4pZxI>+Ff~F`ka0v+X32$rDh-Wd{I{#j6HxfN75<U;mu=B
zH`fao_NlY%pBs6zY?Zlub`Z3f4<dB?(_F5oemf}f`=jRg4K^z~X%GLnhxWd`Y_8DI
z&s(|gLpLd57~bcG^o5JO`V<P@*h-wXCZAqnztgeiu-a&8uU^yfZK|8gCKF7vNPp@Y
zLvMcSkI;<B#~Zr&EL^{Da|W(hDkMm?oq1>0!})L?`u`-_T>#-6^g-Swc)tsdSq#95
zR_JpVE>B@tiW9kh5EPwfMosqVsIYph66oXX`7DRov$e_4pKqtdN!7-sipt#fPn|Dq
zRUd{SsvS&6pG`gwX1kQqY^V5uQ{~Qo)cAWC=a%9{*L|+8Byv-@!}Jhn5iv!)dkRFI
zT4;atq^2Cyw5{wkTx4OcD<Zszo#8xdq4E2|LgD4X{J>c^FgtQf?U#pJTi1=Q2QQWD
zhZCaqn-R574QZ?-KrtIsEc0-bUwX~UUK@+Ak*B4p^SDy?%w1zG8?4YgJK}=iHSA$g
ziQ9Lsm_TzOqpZO^?(v!oqTu-<d8^XpD8}xNw+)Y_5+uUrG&_ju!v`FE?KNW`%KTbs
zROH(Pj0J|Se`3+^7kluB)c!d8R5Di+i|-m(cUP`6Y2P;OvS=|^linT3^Fwv5YiG_w
zRN*3+Rn^)nxr0gG4@mqic}m|fpU%G2)T6R{c@R@!h6udP4B#d;KBB)N=x}uszLH7!
zWsO1TSafPOA-9A&VWY}&uUf@QHp^}{57Glhl|e+G{CpGg1VcUGd~%>$dDPXcaW42E
z^c2$dY6A<M+b(3q5IRg=cxKXJ%T9Ep`T|YXnn(9k?ME&GW&?**m+zIJ$%G4V3!gCb
zM3xt;dTt!Lo~`fF3<n>HgMc~n{8LXCr)u!+?}yo6qi^dd-TV`4*LnN)`T&s@8j}Ts
zO18W&77%v<8m#0^->qvV>{##viV5Jkpxb(M#HlH2Dk=M2qd<53u6i-U4ovX0B9XQH
zw*$&xT@PCynt3VTq_L)+;@B@p0tEzn+#R(6#iGE}e@edF+xf2GWa;}18?Rh~*;gkC
z?(#g`4Nxy$87?~&BX36}?4b8;$j0q`-OVBfc2?CS*bF?EI$uZ7-<9HY-iEu=8RjOF
z$N8)YCaN39wd~>~qB}dFXvvz9Vei`nmG1i@*UoD8Bt~rPutZz4_jl62N%jMiZVyqZ
zSe}!~6)rG%DDS=>-K*$r?tPLe3|fM5nSE;eh}Wf|4^`|uVy05};=bnMeVTo8h7$GK
z4+;Z*C*AJM_!*K*7<tshDL7rKX`P`dMf@gDJA0(G5L%m*UM>APf5&YcU-Q_w7W4Ar
zA>vX7B5J-uL<+BZP~ROw#)My(DB%f%dfi|znugt^sPGDOrZZL2#;ydXIX!T=IiJ0L
z?Vj?E`GK@Vw{>h={n6O-n!B-8*wAQBez2iAf6yjSSBXr2hUQTh>=jnt*3lKJMP(F^
zR1hNObv*f<9B8O^RMNN31ird>EP&bJ%_h;gD~Gsm?Z(L%E`dOtqIWM`X>}jBKAms0
zCoa!GM2+@(Cs8J#1fzSHW0Wb2LE%N0UFdpmP7U9_o{MYX?BoI^C^IwKsriPF*j;VD
zaFz>2`-fY`SwEvR3s>=5EAdg<32wuJVO4{U=@x&wej8^cEdfR8kWeYbBh+?L4Nu6H
z_7>9QX8R#g72??#EXKHui4aQKacUy`#7>*zxVXm0br)>GGhliCnK_FTeX{IzL$^RE
zG(p|HH&Te!?E?6Ao293)j%WWAU|Sye*REUV;*(f3Wa3Eov(v8jO0I0Cqb4L{(dYn%
zl3Vi^=8og=cf5LZPzg*=HtuBgOmy4q<nLu%8t_4P3hj;Y=)JGW8?d_Gx%K=&M8T9$
z^Fn{H!zoGa9hM`IK~tOdkne%cpzr$oD`LYN(K7R|pnE7Uty-~NpIEom!s%Ov$4F7X
zxU-~Cn_Ej(_u?ZsIQF-ed<2a=YktR}_0w4za|7YmEO+1Hs#kxVxNqyPNQvS7AO|=3
z!T&Tw1=+a=&)`)m2@x;pEd`wS^Uq6Z62`&TyO9l@97jWx==GfrI+yQh;gNZZV}2pT
zn0v&CN$~*#y?#T7^FZiZg@dehU%`uuS26`%a<jiXx~sW<;pDdWgu$;kmASsk!lYPz
zB5TSf=LEB2m&ilMpop^9;sye&mb^&m!qVB3V{^Nn_KvxLA8DvMp^9S-M7Qun3<+DC
z;8I?5zY~vDb-872vTSEbLLa-#DJ4-;A0^OeS}El0<vbAr)@zRX*7}v|6BP;{E~C8_
zrQ)>Hrly&f$`EpQhHwpDR*X*+!Wak8p>vo_^dIigry?mm8Y1=jFCfixv)8{ep#$_y
z>p&Y7|Bhy~(U46j3N=SHPSZ0C;`R`CMYnPEkKH<S;|!}#yn*ahf==K0?JnC%M@Gx|
z8c*9_oGbZZ*2sJzV@EM-m3>`?Z^FgI+5;H>>Gn>|kcG$4gf%_({^^TfAc)}1Tss{<
zVdEi*L<JOn=ja@+w{i;T?88aD`GysH$QO8RqT*Kn4_8$Yd`i#L0ccu!dTZ^nI^Krh
z(vf+Q*ZL+_u#xp;yx8M)e|6zXOwDdZ|9j@6VTfaP29kx^fl<KJ)tIQkDy<^g^baQv
zfZecES(`j3lwl3!EsMeRxB=TxFI`@K>MD<q;UzQ)s{+&>)FW$Cy^O4zV8AHQ%B<I;
zxtq+<EFvzo9pBvD`}W_u<`nSa<Ym&mUvMv-gI~h$Y>B}m1#PSi`)7k*5xY>Z-CTzY
zH14eZlnBDQ#fqdm@9zmB_Tybv7W-;CkK&;4-X4ior6(9(k7T;tn$@^#L8mnY0xS$g
zNk<e=#B)C*Zl7z;dsAv<FWQ(od<=2*a42g?L<}4`@(}3t>tQM@be#QZ&9F~D8Mp5-
z{^;3-(o9n=8JuI$Ck}Je8}VO{<Vnnst0cz1?sm$uG<)DgJ=b>otoq8m?X71*%^=4e
z-}C%ZYIu9L$`<@J@$-FLr+5G63glBX7H2;kD{f%P;oNb5NTA%&<qK$|Y-qYW7?OQ-
z^fh5RqM_$Nz4(Sp0sn<%g_oWzo&xncXYbVqMKKDWHxZF-{()w`&NE`!a`WuI-=DMY
zc{0aiHR=f6HoYR(aJ!t`GF<nMCM$e*&o6An`&R95cVIRM_RkpeMo=b-*tDOC6%c79
z(Xo_ODs`v$nd9!;3mdZU2~1^fzu6P^FgW4@f2-f?sgoS&bi_g(p7{>l{p3rUlk5-o
zD`;hAn#R2?`VFZMd8A?C^OHS(8t>|>Wicadpjj1ofA#VBWV_wUSGd5}(B>R>K$9A2
zm&|&g*9u#Cw%+&g9`)ZTqECJ-eZDg=+=35G{qybRQ{Lc(!V@O&f6hL5WnY=68JYJM
ze}m8f{!cE_uR~9Al2Iqc#(&IW2<hQ?7KP&QmUE74gZ7X&FAzF49qpX%mc0{v-MIH_
z9DWAFe63c55<Au|$OpINc6!Yc1h~u4R1<@3QkjC%p*IK-!w3BqhRq&6<aibFwb*&H
z-=^PDi`yaadnmL^jt>~WSVX;XK3{-33)^y$=#CYPG?H_RKHj93fMIVZ-b$(OmFgVr
zbbid=o>+SgEO=LFdwJAD_rWJ<@oAfGOGR@R_GbL?&8JpLW9eSjJ2fO+)5sBV`D<UG
z7qMRXjf>RJV!l4|Mp{A(T<W=6KjV$fihcU59H^_`JBnx-aDRp_P_zYFLbqwK-jCmp
zA5PhuLue8EiE(B!XnQkvwU*=mKH*D${*2G{6S8A?60}EyXdo1P*|@HkqzE-hx-Pr<
zvc%SF{ARSXnAYS&d~#Oom<T6M7il3E-KP1CXCgSVRe&gm@BO9UPVA2!IIR(3OD(XM
z%X?1KlXY5U+EZ=)(?OwD_x?v<hDe=haHs7x9}stXwyWlo&Cg-WiMd@dtz)hC?aFta
zq}!WVd0+@p3m(uPxJGWY<o7CzejQ!Jshs&J7L)#KmfVIc+M?(K4hD@c_BYP_Ii7VC
z|EAsXA%5}0Ck^$al1%|}1iOpF=J3B-03?gRciiF@h#VJ8>*C&vlkS@rWmZ`9r|MPq
z9<kznXn}iA>w830Vhe2ie9_h=Q1(mqgqKbl*W7k_>~y$BFEgEzD&m9qo#)$kl-LXI
zf4!vmDwL(y*;mpT8-!t3d9YhH`__$1Q@E6<M;2E0Y$drRuifjxDGuJF3V}xkZ-IKk
z0=@p@I33jDrd@g&z4*SVZ_xTN7dqO$!m9Lg7(NIPd2=_*;nF11Xp-R%PSC_=`s(fa
z{>}EIIF0O65|70A>Gv+}xnm7YnL=FOL~|$*KeSRmOZ8p5V|;b&{iSL>0fT$VRhwkj
zp&GJJBg$H1AfHT+p;qynzEMu6<re~dqqa8DX<+`oxCZS7ORh{KO)arUG=-Y+15Wv*
zx`@X^p+J+Sd?dof9Geldcf$Ux$GTf~G(C-b-cjnpdtXu@BVMuQX_oZ<Wr7{JT`@02
z0%%Lv-)FyqK*lj)cGUGv)W|ub)YC|__`#2vEG=BVXbwIz!b=y3D@nuof*!nTH4$C#
zv~;yjhRUVHLjotPqkm=onvitUh>0`Qd@ty5Y<PU`=Gzd)di?!daVeeaNpq4TITTs@
z-#PFM6VSgs&7$QAwTDQUdf__5_J9u#!HYsjgKifSdv5Lvs9J&7M$o33;WD=UzDW+x
z_KPD%F~i;RHF*~}iPqTK9)IzAo~$u3n$&2*HEZpn)=TWm=vDleI9p7^5s3_-LX{u9
zZ?+<$;}6u(3DA*TzpzW_#O9^m!UfMjJkAwtEl=ekjy<mL`cr3D2thSD^Terlu6pr1
zcb-0^{#d&6>yv9A&R;&a+>W%sEh?>b?)cPjyWaa@)X0v<Go{{v5u4%&xo$&M2lEUP
z_P=P*B@*>R;t1AYv%?M=tGhzOWQLjB=A)T2VKOb)D?N;DIUC%uY&}&N>Hvm2>)4z4
z+mEjYcs<cUud_|_5D!1J++AoS(KC$0EP_pt6L4oCaD**=06!!UeTu6cYDA=^KT1j&
zLW{t{J0N-QPF!9D|KANOI_)vXu7#bx+sknoU(P6@8>=%Hf8yIAZ^w}Umo(p(1~w-Z
zyClZBTEnkejN1p!^yd#swXS1#3LBQ*!W$Nv&>FbZmzk-mN#}o?6o`{~dEBjMl%1^a
z4}cy2alUd2B4urauKcx4JC7f>d_7Ple<RXUY`t44<7CNTq0{|K(8rI{P7_1Sb(Xx}
z%_Y7o!pYV57VhBwaaP}xw{8%btl4u4)u3JX_0(`j4)%U4hl@=)ej^4vTgf39Di>c{
zYtQ=LLCAhMxn2)?Km9}BHW7zqmXF!Z<y5ydAod&keWwS?A%-=oq$Jbk5q6&o<Ym0h
zh-+r@Yzumv9x>xL-%~XB4DP^SF(YXFDo=7kWSf%?Ys0nvslKX*)pgx;3{=MwdRz_K
z*jJS?Ve5_I@*^NGIMxTuG#5#4&)>OX$1i8tq1>nKEnl--Ql;;A8tl4Qf+}){^h0aq
zguw6w+HRSNJ;y&mWsdP+`zTQ<#l-)|+M5#`mjh;5(S`UYG#baO7x-OqqqymJ3^nn?
zT4IiK>89lI0?<#lc{Fq?nhi7QLF}o8*otdqb+Zf-YB$_)J@gZa-of>dYcN8=Y7ot0
zgiRVUk{!wr6OLNamFPADiE-i}x}~=P&rWzjD<V7;%OqIze9zx$2F`A-YuS*^d6|+Q
zP2_4N@{2M}MAwtk4$FJE<wPgsv0bkth)$;qA6XbYiBt!fMhb5!w)Eb({@MO|l>y`t
z+*z$I*FUi(8`eW&&iNZXEJvM>9L@jU=C7nCsgN?4I{HS+bRQBg?c#)bE_%ehj}Lzu
z92Muod!bg!S8vnZbZ5TC4uZ46xyiDfmjy2N#FKiP53uW}Zu*&gz&Hj)8D>9KO}M8H
zeXH|5jBeNuuo+-=-nl<N9a*(X1~na!TAL1eVM6YPe!iW@rn}=97nm(ns8}4_Ow(rW
zeSGGlZoOn?##pI*rZPEX7mQrhKEW&B1Tn2HL&CUVn=}$@)!bpotj(;b(}YeNU)|y0
z%G0iUXX<vJ)oCOVOc+D3x``>j2uzzp+aLVTcBeu2cWT85j7X-_<rdWTJU!-?L`bVl
znwGOYRoO@m7B3;e;)_wEYOt+Av?mE|#ofDiAkl<|GUkGa_Dt<b(h(46o<SO&3Jl*v
zVx*s5RGbztRY5g>QbNV$-_fnta}QK0dw~4FG@Fpd7KcvjCi;`BtBzlX<2>4v$_=gr
zB!xaICezAo&#ml~iF=}HiZq0Pe=pFU&t{qgQWt7MAeRn^K!W#MyjB}W(C&I&vmdpv
zg+1jL<x{I?J{B;N&-)ndtTQ5`X+bu5RgqNtQ-&OF36#KV!PkBJ>SsJM#x%bFjS(`T
z3@q`R&8++t=V5K_f+^(0A)%wnaN<O?=DOay6X{F^jFhza8G+ZkJm$k+Tr{uO_j^D_
z<2F=ljf9=5*c(TXq>T#r2?KXN2bkU>Lyr{|5L!v(O(mGo*;tTu{Z*v;FzrE+E*RkN
zgdFMdu>$exX;}qGbv_ucYfSo&QiK(GW@O3GbU>gvaiKy{z|<D?4Lu%Hu%o)ELuiy-
zuO%8C&iD4wkSlb|VC|{UMj#`Ggxp<XG#S7nKNC%WN}G30Y!%AnN&)SzB&~>OC(n{H
zC48GY`Kb4h@M!J&aUbo1X0#KjC!m%p2AwuF%e+J`Cd<(5O4jODj0ua^`p^CaVgKi}
zigv;NS^44f|D=%sdom?1g~g}HOMgcSp8iN5-vi$grJ42pcqq8Moj?#Z&$aWM(y`k>
zNQe-K?13@R9?FUeoKf)_W-_K0P+$(2I|##8;=IW<QDrp1D;u1rH_XtR2*GsjbMhzG
z{InqzNE5@$-DGebLlRl>wj6+~GaOVZkHarAP`i0?_>PhF)&{a&wNusC;l2NrJH4od
z!>3GN`t|WxAJzY)hcpS+p7Ey750RJ6L$L6^wR{Y|F;az~JhqE>1h*kI&hXEMANs_P
z_P!Ejeq+mDd7g~+|IV;PphJE2VZumM`E$wx0;tE5Eq0N(?(HiH(@Os7WFiVfh{)k>
z`7Ah^v@g<$JW;}RO`VdXL1d*AM-bh@tmY9@$|ykmx(C4VM>caQ4l{Z^9Mv?w#g5%U
z_I;pbPDqjjtD#QyBjjxb%m^_%agJhum}M}cj0Rk+7MBs8iX|9zH5(=Fyd=IAySfK?
zcEJIlw;AX;Cd*VNM0JBLZm_^3NWAPh83Zy^wz&IFQDMHRzg!<*upl8z97j5_KVpdE
zhI)o!I#Nnrl2M^;Va54I<{w~t-poetyZAMgoGgWo<e3}C$@d1NQ8g9<P9dJE0&hB;
zB1_(rw_*Uh%s`usA&XuPZ<suOT4HH54RR{eNblG8KR?{qx9|=6ak?T6^ZR(u4=NX$
zI_MM}z_hN)*DGm$$e3+85JRLC-U!1f{LEpcuY6S09v|EJtNkvk{dm3eJfs_RwnOKK
zwf4wE5cp`xuYx|HVL9fMjo4KH4>(UE2$AsyZrC71R;9mrDYkv~KBNQtu7Toq{XP?T
z)bnw&%IX;t$llkqw7lY8gX}>LlF=GeUaoQ_QL`_!td^_W3*(kkrS^nNK^+1PWsfBJ
z3}5?O0|uaj9-BeP&+(3Lw%&JoSD>3H!>Sy=FchA)PI}aKEG1Wc-P-5d!<1^sG7sTq
zd4NFZ;)H4R1|C)nPHqn-7urC2yXY5!Tp!y0_+p>S%^rLq$b1ka$t2bBb;QUk`%F>X
zAj}1;ajeccGq17$FOr4Kq>taoadTrAWc0<`koRB4DsJj@aavzlkb=WwNt;kdYWAw$
z>Zwk_C)%hgeO9~LPnSC-`FiqCRX2^xfTQYNy2+%6Odz=v=3I1j8)u7qv%+!W!u>(D
zUn@VwN4NwWnd*9(wyFl@+f>QacLMM_>T!3>mZ^-7>V|&N*TklF2AkD2xWvnSXVh5|
zn&{UYf^iZ<Ffk%;gOx8UeMgO9z}ibgsXINpc@!j0ODfx;x74Y=hn!3h<lTT1(PdGH
z_XgrKeArpF3Mk~Po%{m@7H1fta`z1rNgD+>eXmXeLl1%Vm3LvfC_&1-^m?%SgRn``
z3D+DK?dBnLvg$WtBfOr`E?Vh8tgx7XX%xY1nrSJ0{v=a{vL3?N!AKlomPfXU2+@2I
zDr(OmTBFN)=-J@q>i!a>qGOHuesm1==Dw{p8d(_nl3;V07JB24?B1sq`nsaBsv-W)
zYTL5%G@X5o1LnoLhuvc)+ND1-kCw>%R}2S8d!nz05GoY{8v~zxyFK&T2##Yg)#MPT
zkMm2f7J27mh|TzN6lIa8`x>Sr$=G@|jdh)jJq|%6n<=+Vdvq>FbT!tS4xwKxDH}1&
z>4E5)_;IPl3R*K}dXR}c2SbXMFD!Stzl#^IIf#@67*2?u`wX*1m~(y#qOhXhAl)PT
z+*r|`^@B5naOCJEZ>MmDtkfqQpGp7l8)nzY@_5#1Xub4}L#iD@8r0qW0x`wCe5oGB
zfqgnwNJTQq8xdfVpWA9cvRGkSRifoIGx7FZ@kdU4g%G}U*zdRdI=u3reX$@%G9>aF
zZ+bz~^G`QNyKhUDrz{14NKBl*gcJiMLy)t#^UpWiFGyq>qNUS`Q1fx80-5W8)MiYb
zWh}oG_O0>8e1#s(L>8g{kBo4UkJiOc3>#2fThNJ+T;R&sd8RT+s+SXoV+!oMr)C*<
z@@*qI;7i}zUj=DM0F%6jfK*}w(Jpwg<UR0TEc;iYH}+SNeI}2c4@3-&*z{I(1v6mQ
zOPDkmt~Nq6SJjgCsw$s5*6_An0vejTLN-334`=x81~AssgHi!Okf+|QnHi5MW=c^?
z)78I6HbnkoPMdn5yqhC9W6YGszc=IF_d|4lg}M$7r5Eu4%3DrfT2FAuzI8pg;!P_x
zI}!KjdN&FgF(Sn_2SSFAmR5&uG7yrAXJmn2XpTNqrhfAwhR6FLQWHZqqQRwAkL-^S
z>H~1Q0hMQpVn50~H!?5gV)%f}BtpVBib(j$$sSO9c@nD123gIGm<}f^v)g*s$Go}`
z^qvbDPSrAPVln&*Ju>Z{LDzZjp+g5^`H=94SPcVe9Zd`O`EHs%XERHeD&Nm+DDlSr
zwhx4ukwG@GRXS?F*B<mxt9*rv1bbyKtT{Gz(}`!l`u$!}l2#aGMd2iz)s~xeDzYi&
zX~i5gg_j57Yu7lgFWz$^2$5nCH^~k-v-bV+-eA%Ana^<l#HAwAV=q)&#z?PSg<Axp
z<bdFhMWkZNss6_!WI7WtT20wKnCD`W5uf90Rn9>bC%D1wvq7%O{S1OFEh`(buoiew
zbW>A`T9amaQmwC1jS!KP8(W95L9<AR!L_QsFaFZ6iF;cv?_mak-Go!>m1tTN^)|R@
zu1GRmX9HSGixN2mx}myn?_g1+?Ve0NH2x3Nm<<^#&Ny#<xv_m=`6fea11{6FE<^cU
ziq;?%4R1>20b(i)JnR1b^_KdO|6ZDVzdt`d7Lvf~REy{U<6BNiD8u$?P$aJ9C7JHZ
z>?fVUkv2XIHE3ZELI@>$(P<511cE@2>grCgfIv2-A$f*BNS?t1#cbfN`g<1#i*^DG
z59y#nJE5d_n?vXx4jz24yM&4dO5G+wWZm@Z_THVIawd{z7$Pd1cRE|CsgQ{#@{nqG
z-~5OKsaxsZs6`HI{L!Iv0(P7pc{i12$?@kf)P_B0q~E#(B}~W&wbD0%b4fUX$^TJM
z4_<*VeIa_*tIV;y5e7W5d*_*~FrYej%y>U$OQ=xnGpj~!IFJGW4I?IPNNq@g2&R9g
zR3xTcGU+bVISK@Fw!CI+SSU(R_|4@@+O5EEm{i2jGZZ=69pGpdarFeuAu+b-gs6mZ
zQFzEO()mC34Jm~ghUK&mRzV&)^)ch3??viLRwDKSJAZ5#lHz_R=FHI}I9Jz`Dev~D
zbO<Rlc+Qw0VFR{6q$9z`>UY=tWx0m|@U#ba#B(S9h$bVOlST<Pt;J*2ui6c9tmhwj
zUF}I-O-`>N7yMfw)}xqUVF2&|Twl<J>kFTzdLbA6LG($qJ#@I0UKK|?+DWieb|=)x
z&`%P}WSjB)ge4@IewJ2(|BTQ0XE0@WswDJpw)YDpXiuJf0-Ce*xfN7Hiejn{oF4>{
z=iLl-kfrCb)>z+$444F`G?xDX5|q4kM@XvmN2WDjzX$^|JGR4`PFhEQk0(b|0hH0G
zY_hCdV9fGLK>18xgMvb9#xZGCZJeYzgMF&4@g9b<Tym`}9}->Mnc`0i_?X4Le(C1?
zdGLwCw#A}4s<NhyEMs|;1Ky?T=_V`}e}+JGHjGM}b*%P<LIAz;Fk4CI%1M}aaWh+h
z#ps}hmVayWU6}M?9QHJL$dv9>6cxJ2G<U90>vAr#+M*0G3cC|;82lC~U3-+lBBu78
zt1qhFmtNYmF1hfB(i>$cjpNbb<$8D+8PoNpO=`Ye>!Iakg8tX$UgJ2yF|GqmWVfb5
z6Rdm-UCob(m4+sz9BLk{6_-9-qS&3)oK+<cO3|XmgXz};4t>_bXutO2r#hysV6PN~
zSJe1pIJEqJK-rtuiO_W&*@wL6<?Qy+r$anj#42wtlJ-ZuczmAfu8ABl2BbvKIR?o<
z2@SfID}l)^soUaV(mL}EM7ir(N~!-GIGS6rZhgHcQ7L;xt0wO$WLVkm5K;pst?If9
zkAg|(^89J*A^!JeU)87_4qEo4z-+^_1?PW%x=V@=xse!5_C^I#5SN9U{|`_VxXApe
z+AK?3?QF=LN3Ay7c2CIrzlZsOLo<{AvIYzv_;2G{UkL3_7(}X3PCy@%Qd}9(zGp^J
zv;MmEDVz~9lPiv$1xa-wd7+4`p0W9R+kbqtaJJtaZ}22uG)m))j>h<E*h$B2?>jxh
z1qFlLQU-aUb1WzpOEqNpo!@UL_nnpN`?>*<xcw2t567bj;}-;jv#gxEMt76#V;tDW
zY`m=U2Ef6;&~VAJ=wn~*a8KK>Q|7Mpq0;(g+sM$(bYlhU8pDe)%Eq;uR>V{3@{hfW
zFptYzwMPai1j^e=mZxUNdOMC5w~D$Y^$5*FJA7`XM85|;OGFrP6LHIFT{Gau?Q@VE
z%K@rTxxmRT8_QA{u@LzKD2V?pBKN0EhNq!YOr$#Q*q>O(*8RQI;(Po0q|oz1O0`FP
znERQ1qOLFU&E~zWv?QzHTZog%>ShmyU4aPg4>K(>DTAYoB`wb&JY5Ub6E3-z49}~t
zTf`9SfNm6iv(jz-?Ao~}8wy4n-VoW>kik;?Ct{bEG(w1@9Q(bmvDsbHP3GmQidR4?
z6?Z9EKZ0_hHw<f!?i9Wd@$G=JgovNdZ)~yN-R-L#E8>&|cl8d@&E>l0cOPnfcNo&S
z#@%r7_mT%xzmEFZPp8<Wiqr4pd#;M&m(Kfs8@;-klu!fV3LL5fxn)(Gd~FI$|6yrV
zVZOmf?Ep{AQ~>|9Jz~_>+V<6*d{C6p=sTWaDu}rrAH6H#Vhd7!>I&24JT`whz<e{&
z>@s_ZEHYDv3TX}Q;_)0_dkJOSSn4QAGo)lRCgWb~#B1)i^mRva?MMC~Dm>?aPP>7@
zVW>n=$k#GTDZU^Tt4WMP4E@<<FId=E>PU~%eKDbgtMt}BKsahXGagm-F<h#_eh+-S
z#oi2J{T!_@{qDu$FZ_>@y-QG3`kTQQ#twn#X~0B<zH~7*V}G+c7OB$<6D034I8s4Y
z@mC(3CGq~C`u6YCvT$(*(($-Aq-@`}Ns7C8jv6tUeQS%}M8-12s0;RC7q0#gY0Z2Q
z9e|Z<(qDI3^6l*aW9ZRusNK3E)APgW!t3O7y=+#TJ(-AwQO?y`zxE5MCRyXj2}-j&
ztHgQ#JxpknYu$P=54M;bGNI*pQym6zSZ`+B!{`;B0~;ndvcB)(DV@mZEB0r^GYMj2
z!q^7x^(Gt5fTo9L)nv=40=cc%#B?kK^s<;C&dsbwLh&kJ^JvVQ3ew|UnY-Pn3%iGQ
zu_)6#a{-gyYx5^kVFooUKF?pp8Op#wBVBeDqDkvKcsC}4Vl3{Lk78cG5XvM9Fkd`z
z>8DHN<D+E`vEe;yoL~YCK?cSLnd>EL5^sxDI=eF$H1e)XQ!to?A?MTx9ZYaLt0K^h
zu@q2C;7z{SfmF|CT%XLM%l4>-qQlhJ>BnawXCm5u#h&#t-mAE6JH)b*$wth#_}AMK
zcF=6Ggj&Rq_Yf@o22m2fe8QQCCz{NK+9lsH*uYZ@!-dAu^zq;D`>AUVtSx|Pz*0h0
z`mc=9qs{Nw_bU?h%9?-hN-L098pCG2MFP1A5)H9drW##JP`t~qDh|niS5}92Fu2OS
zY;QuU>C`K^*<$nL$*AjEc17304l{B}JCJZJbw`KW$MH||*tOoM&tnoNi+N`&$l<^L
zzR`0ghhU~`VEn|k@8wmxv$FiX2NoR@*zGjWiu+2k%x#{1d|$Ok5-m7`R3xYiqF%ed
z&6LXm*9aJTz!k*h;F#p}QT>hO$E;fFFxi&(w(@;tY7gd!@hKFohB*@xt#X2p@pVlJ
zmKQzs=Amv^%Ma}LuM1IiP{_MIs2ns^b?Pxct}^wUQUzoWNZSu)JGx$FKH1pm@DP`F
z@|;&jA?qEvGj>O=olkTY2;8MLpQf*wX7qFW$$sfVCw#i?J4@DG148GYq@H`4xJyt*
zp4<Hh-N~Xu+Y^DO`IcKQd3<Vu6zj*1@!;9zjogXBYp<p}RG<C|x^()}#^lrJ9i(6?
zDua$xSqv6;(8_UEF?U>;%jZ_yqA=ulr##<jrwi&DA6_}RCr`4k?Tm3^tL0dC!TN{#
zVGjv*o`-ky{nge;dj@sAZy$1+f|0P-WNeL^R%4ksr6o)6#(Mtrw@_JUeR>Cypbe)F
zzvZ%ylKq)_K!N9`oYSpqt-9$q-P_EJ3h%u7v)b};o~f(Hso8?Q&vNDi3~tRamKBSU
z|H3}*WoGihPaMa&IR1PLHc2QrK_Hm<yxWaJV<cFKa=F^3GP6%u62d#@C4F)v`?*z0
zdoogApuWMqmFv|c-$<TmT>IpZw}S1_cMVw`o;TzB?#!pM?76t0)%fCL$%TwtmwF<@
zjP93D1(iNaYLoyw{?983-9k?<FAJ$2Zjaa=Wt?=h&^V9VxgHyS>7~-U!UL+SzE>uT
zs-@jaI8#_Be#8#H?>%tImt9b!f;}clS^f8$Jx}+3PPyz1hJulc@7#xDx0D3R;?KTo
zo9OUPFWPtL7Vp=03G-6^Od^3|*!%jt(%iMH2c2q7`k&Im0Uf!hs0L(go(K`byHF&Z
zFK?-Z+DtIot!5{7%Lg>Zb($ZZyN2*&GuDk4neK9$ejvcNXS|Q;N9f9_eUCdg$7)@n
z4$hO^qd>cB_<bu>zn*@cCLnrvsqALMi}m;3f$c9>?meb5@O;Wscx1i*F^k4=w5ipd
zM-~l`zO6a7z{+p9BJdTw)<X%e>MQL(x_ecm^s?Rx{hmu)7K-3~=!(uw=8pUJWc$R7
zk-^N>9aJcEsVEMY<5XF}LWSaC0i@vRghVHa&v(!DwFi8hvP{MZIkgSkP35o0xx?P?
zacK)>ifSLi#pyT*hI7s_X&C=z5sc<t<~YNhs6CL#Z5{TjRS=dnW5T8skBlrCgm>9c
zF9q0QGpd>n?K<Chj_`RwPd)gd)PczTo;1|^Y4Fl*_)^J3n0KfMri*a16NQe!<8WMN
z9-{PAsC4GoXZZCu>RYOxuo*@b100s6?ta}6*HN*3;9Q}12zLm^sx&Cx`#EG@T@x(H
zpRzsEy71}Y?{m6|5>(5_fzYr-0|KFe`346ILVpDYrp#9BSW}-v-(;P7^d(T-4_U6S
zMXfBt7{3>lx9fgi37LM|a9VZQTJMQ;g!9PIOc^XC=X8!4_J#4(ho$&0v<MM!s-)6A
zyL7J_wfpqc?60p|^zsRLIZzw8K6*<{{@0kppD{s(&710*Hz!}tertA9tedi+ll$}Q
zD6wiM6Ap+&m>P}G8x8b9q0?B;Y!=`|4H`m>Dn<?j4dmvIJo{G*VAw#T{^P#hk{?ui
z9{%;3MqQ~~xnj_X<g(S-D4sN{9S+dSW5=d=woy;t*g?Wm=bE$+_(lmC1f)N+Qr1oA
zxt%+=-c)<hYqdu8W^ElEFB2fCXS7?Xxw&}(*~bo25<Ny=VHQnC2q!#3MWkiehHd(9
zLvhqKK2SF=#e%`VaRG;T<CeouZI4!?g95Aux6{iM02;d_083}zigHt-+-M~lE4BJL
zEz&HvIZt^ZBlA|cCtCZOi`3`$ADwKL8sJ<m8GK1S_o+td^E2F+&_woU=MjP6xPU^X
z6}F^&X2PXhBjg<uBC)>e(>*hXJIG3cARt}^=)Rfl1C-ze2S83R*Htz{*WEh#N5~S~
zziBGe-+PTya)f#6F?p?OSgYw=I~VLNmtKM=qvvWu0xz9muB}R=I;nknUTX5j_RkwW
z?f3I$zm~2%3G6%{rszLEi_Xl^Os8q-VC#!USU8;U5Y|c7uulfy+S6;BGV7^P-&)*=
z7BVimHgT&<_WF9_N><p<td^NMkxO$T4T@m*U7Q_nQeiz9U8-f2C4hic1Hig#<}jTe
zeEx{)W>$6bMD6>tpx4MVv!JUI|L5n^Ftzs2S8ES9FX7E+OU=(ziKiK5LHPq3gXhQL
z42p6r(sq#c(^WBN_AI2$v<KQYI>+;!3inrD?_p_#fS!l#!0h+V?=CH^y^_0#Q+^4f
za;$Op)Z+z5N7&5IBXpmg8y@cLE?J=lOYW~bLS%MizhK<(vZ{KdewRU&j@jYaKdXVK
zTRJzYCoFd<R<NH@zNjl&M1zn8fet{xYGchui`6$NNa(^3OMK0>n^>&wmSz(S<J``<
z@iS*7=xNAoGM{&F`yKjyO#OOs(9h_e$oqASjvp`bdVOO8A<ZVC@&FaefR1%%`i`=Y
zgJJ^n{03^0dS}y|IhKk?>h}&n_1?9a6<5XOcfnF4Mg@zasja#4I=MN@31rc<A4HRD
zdj%Vyl%4J!)d0Ab$x%X&qlAC<MvEwOlTOmU6V#4Bsb^0s$XYLaEI;iAaQT04A2&=W
z_;~wTP*x4y(K);40o-yi<`_BFGc2*$%FnSAZm?0Hyzp>5jQKkW+m>UArKt@)b1Cnc
zkbfXk!6@y|L!^9^7b>+izFamKL?k>=#(_;4<vA1yI~<^|YKSqgy*sC$KDR02t9sLP
zsv>wM8y$aAKV8Qn!T<42CiByMqjQgr{BKsJLJ6`i1S^}^qWF*49;)RjB7hcXxY_*K
zO%|2P-&`d_77|1>8h^@d|2!T9MmTjF#^`h1sY#j#%Iy3{@>6d3XgXd!h<}tvBe4F;
z*48$!EFy%N%+dq27=7W1Vlfa8q#^p-`Dx~MS@>F=P{9+^ULsp}Yq7n@uMd6CFTd(|
z)lEi+UQoMqTz2#_0`C}51zb**qR2UgMN`y8rP?z!O$T3^w6@y&9#UGA(XKC8L8~}F
zv{(|u3?#TPq$cz9&m+ti4s+qC40hsgkw1S&dFkb_8k#XmRj~Wm>%|HX7CXasZ^YZv
z@CWV4{?7m0C1&dLBOL@L`!N`OvokLK$QktkZn|5qymTWZSct5(uO^OPcX=e6R+{T6
z{ejxuYx5+VneRs|eT7LgPdO)oip1jtg3L&axhH%s#+x#lSOyXETh1&ORYz7v<gGcB
z@b-QaukB#o^W^7A!?&v;!M?2ev&r^NE_dvcb}x4Ru}eixjrl5s!z~()`y=dM5J-|L
zUCf}Lbe+iarxranem67-_CGlrG0%kS#Z_Lx;(gMc+RvoVhHcCs2Y|-~;PL1&9kKyn
zj`~tM==bxi-K8B;m)U!9H-CiBPOx78{HG!!<F0Do=2~am>ziVSSxmi_6klGX2%K%;
zwymTYvhLf3uS?!hvUI;qtf0T;;FB;*M&GvG{_I!FdFxnj{a|&N&}lXMcHqRlAB}3u
zCnor+S<FP<eL$#~;Ft@`xHVVWkU7|p<i8U9yx~mbMeFnp+ZeMSpMwt^lX`wNW<>ds
z26^8K;yB!7)xCCvW4bWZk8bifEgA7@aZyQYjy#D!d|=0-(^tdO(SH1JL}d9CSYAUR
zL6c0H8R+-M;yH>*&gst$W8R%Oaj=D`a8oU2)a;QadGS&ZVmZ1h71RiTjO+@;<OcL=
zB+NGy-Wbj-Bm{Y69=}?-YV&xs_YQe+pj0W)i1zMn?8pNM5s^U)hV2oss6_LO3(27R
z`$t5={(rwG1M2^s@Zr4E1p_=ARYSfd-er$J`WM+ma1R>p=2(%@PcooenQ2!{2zP$f
zd2w=ld>p@AX7-O*f>0>ti--V*KV;LcOf@aX_Plv;x%?mDq%mVt@MF_rhUAssgEIcg
zNTDm73&^QQ)wQe0QYlqkTYr9?e(vTZX50y>X#C`A!5rlsgv3%+&)wC@6hcg6j&;P3
z^TY`{_!R#*&6LfnbQZUt<tmu@<UPRh6tYdpCRlI^;0&r*%A_cNsna45ip(h|>H&Ek
zC*Qh67SmLyB>0+?eT{6zp-TmUVN7f*Xeo~k=ZA+Es~B1##sn4WCbo=D-|5*n@d}>N
zBPFqji!F_E_5~t=Fd|*{7~#nsq(b^pYW&acYOUJKU{F!^*2aoW@he}nA)m?-kD8af
zm>{T+#tXV>_kzqtmQiMdpFCQ0qWpvx*xdJay4l}NF0)`$s-8~elX<es5&ej2V74Qq
zF*}&$A!#R#>vD?HKW^O*YRAmo%bx<2S6FPy0v!dGhzW#ZR*fL<84o(fpzfJP4&?+y
z`1(wPyapixUXA9lzxtQesp8Zq8^m~m#$90gx7f&sn@AI-7>cC96bqR$>Ob1P0mD1~
z?it-M$jZ|}K@DEibhb%&OI<xQFwOC@sSAY)04IAWIAH~`v%k4b8IgjpqXcv!?wR`k
zwH;D4oPjUu?-@*Y>wo+BGOJhJcV)6sMCI&jLWDqt5#^P|QgFf!nlg0+m3S~=3>qda
za{nF|2gg<YxBjp1pI#}rzk7W@QkGg!pLu2Xza<8NG?Y<fQqVUAJIe89r3{5I>fmcp
zBxUbt*gLI4o7msI14~dgfPEna`-@kP>$PcVZa({u6w+7@M*OpLfc9Nb%zt5o#nsuv
zs(SzDs`T&@Wz7Gz6*vhE1^IyA`sz<W7lC!a8IH|kwSRdNmy#RKgfDMu-K3Z%5|7dN
zHY&<V2#)RH;o4UyaHpli=reozSqM`w&JJJwuw-1N6o7ShU)SpY7jsl76>9KLo8IoJ
z)+i#I9RT5SI!=u=_s5lr$Sd#%OxaR+)k2kmICfBIhk&R^xtI3U6um2qdqzt+02S&L
z@W}!L`DeIeAR>wwYN!4_R07+4NZDo_vQ4iGV}G}a#&1&4id!=#og*}yetKPk-OZm2
zN7#WGD*(LL(&{wjATE;#=e$@|R0klC6(eAw0ZunRZWW09MBfHb?f0efFa<!SAjB4!
z|Nq(2pI_fk|N8b(<ueW0M2!7+SOWx1&W)9F$cv1Q;r1NP-NlDW+y4#aPZ2oiDLClV
zqaZ%=PB^m(zAtBi=O!|}$HMUeU2*w7ycFPoj{*i<>2UlK8F2&j84r?o(lCz3aCq%Z
z!{|EwNmI&AwlwG@?f46nNMPmNopG!wWWbHm-=<ZWiV{1PHn)t%Z+N{$LPPLN$NwdT
zfc1Kh@%cI#U5q=1Pe^X2GVm2Unw~0_i@0dW?+@+*BZ1S#fQy3Hog}BTOt&VgwageT
za-;a>O-okyGQ@vmrK}y^g#+%2JhLmBg40TR36-HzV}jZ4levjz)*Y5Lf29tVcm+Ht
z)hDYIK@-fepn`K#5xVs~zWpBw7a&$IO{^5fqKXly8<flYj7e)-l+nGiyX5NCzihAo
zN?_!2g$IQR7-%I_9^K>&mY~z0GI}_PCB6J7w`@R!+SCUzQJ`Mh)$rUicNae7`|6MY
ze&C8QeH){>e)&Hnjv+FY@&Y5H0?y!-mn{cgPEF<7J`kod$dil`(XCLXAc=rN1S_{S
z8v-7jB&pPPUBN!G=WFim%_|g{NjeT(Ufz5+7!f61&JagvbmLCRxHT~<`Ci$Y$R<bu
ziLfLnDZeijyvPQN#fF`8jnh;4ODBq_B1iMcxF{9@aaTJumW_O4#-5QuAnW8ij#j=R
zMmfDUyeUrMt~>cnU#a77>Q_W0!H#`{hpwxJG_)rF6CDS20I96D78KP($aLvFhPP;+
z%+Sj@9xdw6Pk-hgI*$W7&D{((DCiU-wYe$yQyDzhvAh3QFrrIA>gH&QSL`GMHe0an
z3|fE2Td{Ov!SXMJjsS!*Y;5hx5IRJ1>OXef)tn<G`$W23;IWhBH>KPLR$Y8=6@+}t
zZtE&?Q|Mj8^TtyhS@&l@oe$=<I%P))d2LcCB)g6f#Fy%XX%NLpcer-vYm&pDpsK;D
zjZ3Cbjj)vIHL>S-oLj@6Pdx+2uTW_Ee~JBKp@TlT#zANUwX3DSKY;Etp*Qj((>fCL
zLE{Y++K(V-B^*J0VLw$(rw{R+h96&k*KO$^QPm+IDx5F{N4)*g;UoEoOr$xvyu)8b
zx2zBL+<3@j?)mv`7ip*7)?p^GvA<>kGnfVVskI7YM5(yZ9ye};Se=CcV0-N7XeI^Q
z5CSr|voV&Bd`n|0%XMX;Z<bOvsx>GLDHA0F8|Rqk@K-Gr^PA4*lp0hU*SZzGBgY7U
z(Zx`@DMlqiV-&29V&rnn(jB@Nm}cVs!r?z*1d!<Brq^8?F#@HpdXg3X8g$_VF*w+9
z8yi*f!GcJj*Ag0qU)fV64(T0WV?h269T`54P{yT`oT_Q(_8cQiTwF>GAa$nMiX1V~
zH!YCD)qm5E$k+U9gOGAS_Ulmyi<WlOxUv4cv1Y!8#b4NaAtu3Jvj$?824(dY<d7-^
z9gl!1%bJx={$W=uHH`Ji{s>_lZy|{%%B$Zjf+?g@&=DS9+>viZVU1&S$E&rN=^IyP
zJ5&<n_5Vl2{<n=n7(va4_teN6!000p3RiZ5ul0<LEEWh(sCj>ViNRbVjj8?LSD1ln
zKSh?lBsBHsl%D2JT7^!rzpU{eo8d<(LE?q`5Yt~X4U1a~1Jo?gQW%7S41~l!1GL|@
z@7O_E{jb4+Bfv{278M2Sf3*QdKkXB0$3KbnZ>j$nc7K%uqN@RXo%A~>p#Jwm6v9dW
z^Xk7JQtkrEeoF25pI86=0LpUbe;%ZO<G(hWsP!7`ES>`k5JOTp5mV0L_<eQeX5}0R
z9bGF|>c1L20QBBhIyl?z<BpS1i+B0E%O|Ve@m~2|Q@rsyj%QKJbgr9iNB5m1I=JxJ
zy%onI$Nf(LfU}J-6Zu=_q}EK+IT-TIFk|u)Hj)xg$RZhXOt^E4`TN!$pCTfpYKj;f
zd(h16u{7U|Lt|}NS><Sj**i{Pq-PjA;yP`{c{RC8Uv_U>HA-(WUa226|D;zw;Y0C-
zEStl&w<NYywzo&^M{fXJ`~fM-{OTeHrAa^PGQzSy$>P?#4%R0>uTw@v#h9Kqd8_!U
zc2V&|WmDIhgZ+x&@@7c0!Pyzznk%2b`wl7izx?X)nczH8W3Lx~4z73@b3Fb7TReCH
zB=GAzeJa{2sg%nCYDzWUi)54jATOy?tGB+>C&n-lH`yLo8}6($V_|^7a*{w}qLYea
z(UxqYmm+U3HQl{VK^x}{Fa+Ov$O;J)9RI*>H}cvANlo-Uiuwti)Op_->}C|t=W%~N
zVO7(#^ZSJd^_M~>1qX@`%uF1Q*<bL}{>MGbcL$?q?`|pg{CW?L$9UMOKqO?;4-_Kk
zaj;it*s`E?R{wX)$rdl64zgVkBajm8$Xe*m*`r>+s{b<~;djRgf=tHVO5m&AVDHL&
zWj9kpsbbz$PJPHySZStl&Z0;@ReIKYYoecyoBz%UFa8j776iU9;++|pS71o*VejuP
zpHP0O#YSX}V12brjo0$hm=_>8o`L^c*cAI2ON<0SF!Xy<M0_y&8PKMBSzqlWvVdv3
z&VbCm>7`69G7KK?$m|nwikL`2$E%F7-5O*kujnrA!cTC7QKD)5Ey)c`3y{VbVX1md
z5f$?EgY^3GDld8&750DJ(ehsStP1QBWKfRjsR}w$DLhqb_JpFgA3%m+cl*VR<Q*hm
zFfjVYP8=~TU?I)59P!m?f8p1?BtT{x$B(Ely78PE4c|)7YQbWN&Ljv{phzyT+sE;T
z5IO_MnFHho1CEKI(0$<AaGh@7FHgD(SP)+ik|y&$XJXggUg8Ji4lbknY>2<K6DWOm
z-7(fU!YkNTXS6+Kr7ArTk$X=E@)K|jU`26%tja_;!~Xm*@ERaK_~U@w)G<k*su%z!
z=f?t8^tc!fXD!~6NAY)O(TG^>{T0kxf7Qox(RSl>2U`~s#6QRn&qi^ZTVUkaL}~Er
zC&ko`AEHR1|9~L8s}2tKz<Xbb;BW%L_f2yvDZH4&m4o8HdN7F|=yyjC*u0pD+CicP
zu^{}StU|lMgh2RD#O*MgXZ`loG-ljEPk3Y9OTESa5(Xg4x^&U;pO&S?zCyZUvh(6E
zSC_ajA1^PjVmoh$2$Sj&bc{PaGW_3m<SqeIJ6j`yOp5L?u!nUKFuOGFmm`e<m@9t<
z$y30bMv%tEF$qF2|2NBJ|2|6Wt!o(0q~*guFE|VoSWI*LFHQ3>zql8+-Mn8OmBt_+
z1qSCq$Q=(X<e5@qBtZ09&kl-{;nQV`JnzMY5VI$ouD&VzGxGvS_YtZhLy8h1fx*9P
z1RJjr-n9TfzUiaMhN}aa>>MOpOA+5%Y_a?0>lqDrL%yvXYS_~My$!LM%Ql{{Xn5Ca
zI7FleGc{Zlf(-&2Q=VA{XQ#?=>LOxV!CFg_RhQ6W@YBDJQg$ZcXNPKSJ`?`A`{^cr
zLV*L2%)QPU6A;P0J~^MeCsAk54iyt9l)V4{Veh@;v26eN;kr^Ob(<kFLiUJ^va`um
z_Q<Aep^WS(*)v<pO1bQr>|_hs*%6YNz4?94%RRoo=fCH_=a1+9>wf7v&+{0c<2XL!
z{eB<NHx=Exu&0Y32XEnVaAp*o(ZrsKauN_`N!`d|IEfO0%#iCDCl{0?q&Y6+NJnAQ
z;JDeG5INTS<ZvtWVi#~bG^G9YYiUCs7?Kv=cY((vL60g3LRU7C;hzln3t|NCY4};2
z%dN!!#zh=Cn}p!cf=Ill0kKc^Bgl%#K*IU5`V$s*>`a<7<imDu4O{zg2cJ(mVbUv@
z3Ogw{QEw2)DMDMdo@ZgRNeus0hTg!)i=3(u_D%s4df89cDv3P>|NR0kl72abLj>Jo
z=>LJi=E3A~)lM^TPn%j<u?6fRy~insUpb`AQsBtI&T(1UgoX9ELCi`Uhn%z){=ax(
zDY_#WEVlo*Eh#7bUSc&+ig)>TcSu$ioPnV*14*SCs;C=?>x&&2oD4ASzw0~y=9w<Q
z?C|)TDjn|v-#&>$jTh1@r4U<pEdTU@368^nfnYeY^l@gu6df)eZx?9qDz~i;nZOyp
zXAcq<v%#@nhgjryXMgm$3o5mAkDgnT`W^?jY%u8>j1JIMBA5_{v6E_IB4ACfHo1+6
z`Gmu6!w}-6aR-I73&Xc=Hs6YK23dm-ikL~T<D;c*cyxH6y`cMwCgXdy;v6)90k%u|
z?9b}4;4jh~TXYX>dx}MReSs&uH(-+b%@*(CoPim<EOmn!2m78`-Q+7sf}(8At&A4_
zk^$~?{Ns5HrT2LlqvNw9>pX(x8aze)=h?#J*a}^zqrGu4x85}BXuZSLSc8p1{hCx5
z4ry^NCUII9=*Ftv32{p>)86~K9<UXDtEvSNjyT{mYs+%(mo<rS?j?=yBrVxa1>S*H
z#Y{d%5=&0VK%Pehz+Z^0?D}8}ges`PZg*@UoBm4RaIWhY%CYNv&9b81D@nSKw7pl|
zDp~m3&I>}(xfd)kbL+&$J=hr|WnR4nL@Z=$HJm_($84Z#$+2jBJ7m21k$sR(y7G>9
z|0+PF-8ZA&3tw|<(;17T@>uUGVmXKJ{^Qiy`}Z5@t{eP#PP1?51jxaKn-hQ~X@MM%
z@1y6?p^K9hu=BkFlT}Q;tnMTJ1Uma9tKiK}*wJ>_ApU+W{sTm#D@<*oqGqq6=pLwc
zQS0;fen0?lMKp&KG#z7rk+)?7saM@_j{#?oftL!?r8q<9Dgou+NxhzYcJb-T(c$uu
zX>57#yCJ>aEMNeJJJik6!So`5PT7jO2pV>l_2C1+x;InrG|Mi1m<2@dTfJX`1M#A@
zC#;?VJH}MtQw!QDik+JG9w@%uj^1|6a~rizZ<8j|ixQb63~Tn>(DZUtnpTZ#$CG;d
zbWZhcc^a)uxOKHnDo%s4aK`rwY3L}Q6W%MhxVo24?VmV7dqQ3c0P>AJ8VU-@#L#P|
z;OEuNz>IiZ1$dD$<;JH!s)qV<OCJxKJZO~_+wB^3J+;y~P`z2QQ2MdT#>r1;<f;(J
zW@=O)l5qVw>-FbsQGSZM6uCW+Av~1SLw}1kbjdD5Ik!+}bWY<rYg(PP#t$eiJ8znK
zZJG(4U-9K~H)<MUNaV0^br$epsN&Z&fxQFC8qfc>#5n`!$7Ofo9JN}gt{=7Jy7px~
za;t6UBEQ}Ej?3fsyWAshe0SQtwwnDm_G;gKKd)`Sh0$ftE%qz%&h>&!TUE;*MSj9l
zR|h|O?S15O%wu`$K2R~n=k}9wyeutn+5MN0*+h4E=qd$K7_dl+%tY7XC0SDot+js0
z2$U;NkDuZWS$3=b<;JgDUTn)nD_?QY_vVCmADCqpWjV@Q8EjgnPxlVlCUORYx9?QR
z>Gay));W=yTMf}#bJ`55*$zS*6#lGKy(hxA7j3W;JyxR}x*@XqgQ@kv;Aq33==ar?
z&R?QipV&!dL&Wx$!Acg|3=SWY{_<-3=zeFjFVB;2uT`@;Qg5>`Hc6y+6y)JLAK5*p
zEY4l7By04jSsbp;S#9##a`uWH-EI_E+JBR!n|~!7;gt<00y{#T^LGg`KwYG{TR+Tg
z&(|Ez58f9FZHK1wn=N-hjt=%<o)9NBP<(`bg=EUOOmZD`b3Lp%7_O;GTPTpT3O)fK
zC%IQzWIy@xT+pw2$lg5J$TARdo4_-OJE*_6U*9?qh*wV+{`O$O%xql0sW&XPdi~wt
zd(n-|ySY?kE?q(Typ#vs(}eW0#yIkQ_JFp)B9UT0>g?Q~em2Gb@e_J8*SX4SdcM4t
z7blhLE9C~s$;CxLBw46Aq?a()G|}9&OXlIFvDhF%azRzd(aPj>3iSm}_6u1e<?QWV
zhrhf`5xZo2KfvqJ9MW}(FjOtas8B(hzF6CdS7D)X4QqCt2#(9_?RGtr>Cw{OHnP6&
z4yZt!VM|)$tS%eOxzlfZ&|jTZ+`9o#s0DuqP&Qz9yvTG9KyY;5lz+xWZUO20c~8zh
zt|cSw<=gZubSF{I5ihvhg(Osb3pSG_wBO@~@9M^5q4dtxYe_`{Rlm`AK@{`B`@Q!L
z)L<5?vP%^ZSW!`#=|bR|?a`(!qD>at;7SoB^Q69md99UfEA$V2brNuLkX6PnZAvVg
zUG#WfXT1EC4%cp}$fN7URE%`A6<;567Ue8)i?C&|P&on#@lZH}M>?EDHD>acW?9`O
zQ`P7zJ!IcM+8Q81U29+?a|wCQ%y+;fJb1FXr1%U58;7CCT(*Sv8&d3vvs7bhq@Wz%
z&>L63<;uXintWD-c@IGv`8p<i>`_>eUb()XZ6;zxnodF!_w)7BGS6u?gLsjNbNAvX
zAJsd&LVz$ucVuo77)dF<aslbfnga3k;MMA_Dz2ag(~6r*Pn(2H5o{V8*QK7Y!wrL@
zlAvJ+rx2|vfqpX<8vLiVrwW;uflReBD7NO4ALmMFW8pp4k?_EdERdt#gham2h%B*P
zKyyb#6qN)2wUa3#N%reRuye{$C~%*1K&tHukF^GNd=0jJ7jAXBbc{$G>~|jR*S+=H
z{zB6c3c{H~3Ju0d1?Im}>18PE`I8aJ=V=R?z3)wh&Xqr@27xCr))o~{KKi(7UXQDy
zJ35zFM{yN_94)-Xb*AeLU}0#mU?A*;(672>n$!8y&~6`{$Ilnv&+uKDp&H-v7#F~!
zd}&wjACAXaX2|cuR9CDXI({%Z-cAkQ$bEb+k1Ah$L50uOBNbG(D6X5>N)v~#%gooo
zQ2^C``ScX^SK?6SiPtyqZ&2`>_4710YB(G}+Vgti*U;1)rX@ft5$Pg(Au;T<;m_fp
zjIzN}+k<Zp1{baY!Yj1LB)#oDXqPg^4%*+1qIY@`p#0ae*U_Hye&tPxcYPNHU6o%*
z70`F2T1cP=e1CtCljU4{DNuPeD4eyGv?tA&S8hSHyRXZ%Dn55<QSboo>u{l~oVUTz
z`7c$^pFJqENqYd$2xP|yby874C7lDePvg^au=65WIRFZa$hiqC%Gm`ye>r<H0osjP
z(?vnCDuCsNdegn5^?RK%r+6>xsd#$0C+<HH+j{cE@29}$-2$4uf`z-z<vr{t0(t6j
z!}iNSg7Zz1%S`+*C-G-Rmy02cTfcW-OZO10x;1o&5^7}6`zzDkbM1ari1f5-WL2nU
zO{mB`4_D53J=7!tP!b{1!!-!*)AXIJ8r#Vhe~LTG=LpEt!S2J`Q<ZRbFEKBxT&4Xz
zP|=xnG-G>cx@lFjX;tzuPGIpW>0srOW1h~f?yLR8pFphBz->{Qsh*Rg08gPO=jl(K
zS+}vZH-l75c7EE-h30;-<SDBlTYA*I=vSE01lT+SzM(mXQOFTL=Ck;NJMFYQmNdE=
z^YQDWr(Y;d$Ja#0Z|HpBEBQ%Jvm>{eg70CzxksG#(pW(gM|z;VzoZbsa1E;GK8|jQ
z*l!Nomz(xagEPaR<;`aCdvtEKNmlr?2i5?hg=iVFmpsL!%G4+;@LRG^x?KF|zrkiq
zA-m(RXWlw5_cd|zk6X=_+h7`i!>9(HHr{x{ov6~&up;2KCZMgzlqp1gwB2MzxUI%P
zbbn(AK?4qF{JCJ{f23Kn+czi|yXsUwaimkTp)>6Bfl&yrFP>v<O|xoFU7NjhXr>J0
zS>GPah8>zaW$BDsk_N65zN}Zva}SB*f2jM*ijD+9?`l@O?7)Izn{%`(omgcRoA!p`
zSfT0*u(&U_|3yqV+A_41y5NfvWEE*3plg}DLvR*xUwOef=6&**)LR~osmAfwE7vdM
z=Ww$oKdnDy=PrIYwtO&V<3){E)+muP{c{&EgrR<w<g7G71^RO;v2zz#WqgrI9CoTQ
z2@~P3nBmKdABEGT4ME?2t#&b(VK%r?IuoL2&+k&t-&ipEc*Wn<W0Lr7Gcd5rY);Ax
z3)gIx<mu_y|J2_CaPIxl<sfkXtQTWB!*DOteGNM8)(JEMdZFvSn!%QW6EBEOd$Nq{
z>rx}6^kOnwO6cN(^+M~?e-o-Ayy@aWiO(itn8OZ#*M`zg<Ya8puj-055a-!tRD;g)
zga0>ZcdAUp8x5W^f6CaRh>3%2umUsnO8BZXoa@@ze1t*qq+!U(HC|9_DV*G>7O=X4
zET};MU@qRp+Xqmku-j)m{rts*g3T6Ip^T9wr;o#=1U9Z0Uue<tljgJ26m-Zbz7J@y
zKpd%^=CI`_Z}%SLd2VE}CH<_#eI~j$*xBJU6<|AK!rBKA-<|$NKoGiL3sRcNvGtfQ
zwW@DURK0uaVX2ZWjz&T0or2f)tH^2FLG1~4gKrw<^3{Zb>jLG0pp235oN`!#4PRx;
zPnx3@8nZNWQjTh&Jogo$ohGmSrV<@FubCTfTlBZ<$U2;E297t(p5j(#n)58l(z7Y0
zR}DVz|K~#%eMYVH(ZLjx*eyIk<HVUb3bL!#if>t|r#-KTxz*ECM~Hw^Igh!STjT?a
z`m`_?5>~`OdVcg7w$yJgu!}?&3*a992vwNU`P1l>!+k&HsnpGtd_NKE;kOO0CxH+<
zS1P8n{;fLjzT(e--@*;_PAZuRbiM1M1&xyS_Cw?Wt6*EzzuSaFxaNqg?4!b%L{|ce
zB2L*y(Xg8o-z$FDr~H@K<ZIA%HYvTK)I%>3f=icb4dmz5g_0GsuBD?t_($Ol{)tb&
zamEVC2u0C2ncz*S%aV93J$akxE36z=DvP7gB^xV}T1|-i2BI5n_9;rM7xfgZZR<5p
zN70WSzZ$aOb9@*y{v6k>RW{pbm9l1qQahWhcPdVRHM_-X?Md|?U#_n*F%5EWTx1mo
z{Pprzw?eec-U+)hN_^C+>??q<tz~9d9F9ryk1$dri|^NQj!S_TC#4H6+4hC<tXy<k
zP$O;vHbt39c`Fjlm=@5ZS3V3`T<g#DFS%4M+G-@DD^?l@Yd>#jSEGPb#y)EteVSn<
zpS<UG>wIYa?CZGICxdPJP8v<yjMZgYHpQ~VcXn`NF%d*7KZx-$=rG&{<Eguz!dss@
zlNMRkoB>7u5FF8(>cwL9VOE*DQra=`F+6#%xqVqkaX5}dG#ou&_$t4}EN1-N2d#gL
z%_2s6)i>Z?b~-s=1SdBL33PW_HQyZEl<}%WHljOv7dj@>$v+WVbw}-eODzp_lQAT@
zeJ_z7mF70SHx@#w<Owzx-#~HOUhPxUkPv#{FH2R8JPSBxEgnrP!j7W`8_hZ%>Kr8F
z8V6^Q@YJ-2wiREQ8};K9&30|XZ!&HvvFfW_)UY<%%zBsuWfji`7;QArXUvWuUN;G|
zg!k(8gE;;b#HkOVe*EhaV+IAud%<is&A3Wkv7J|9+B+BBi&gD8;#P+)imlxcE3zSb
zv+t^|W8c(i>^8<EJaH~$*c6Zs+PZ9Fbol|lIUHqiY<9lbc|Ka`SK*V$Sh~Nm#;8Nx
z<idUWnoNh(xWBZxNmr*!lO)aVH};H>=?T@H%I`@m)N}M#GKGu-m2qsid5pW1`AQf%
z!ot~h#a&*YgK-1K^R!|!gdei7f8h-UkZ)DdX}jT<vfQXC&^kAmb7q;NJZaKHKg2Lc
z+_Tj>BRMF7vCzx#)CW-hZez?mwlHQ62?>jtDRw~66NCpB-TImG2Z#<Up>?s6A(&_B
zmX;EIb(w^w^{99~YHpik;Z>uq^~j7ZPW+8=DRClI;>n3JcPXI;PN9ahE8+FeZ4_TI
z({K-^TDnbx&RE6cb){&{KIW(l{KA4m=@$YG^rWXk8&rm`pk-X8z7x)Ck#rsa*I6Vn
z=F(@SKEf0UMK(6B3l#G&>dk{E*R#qWH_??B^tV<11g2vWI*)~hn~q6lhFy>@JK!73
zT!bHOL3I<T1iY&HC9)iEW2WaQ<tF{%^2xivK@k%>{3#~trnmC6Cvl87>&YvV!JLn4
zRxg!<{aLrRG7d2EgRhH_6r!V~CRDp^AYL9?%-ak=jAq~uH!9uocF)tekRRz!>+KF!
zIr$5o%d>ye7RHd?)tk9m8DOlgMVuEzWn;|!O+6GK9ZNP(Ohkf^h;Llk*ZjTKvr_jc
z*~)r{5X=T62K8>=yF62~Pc0h;f_pms%F;UI4Vz!Y^_R>}h?25|Sny5rS=XhG9GMKs
zDc*t@N2Z$kZgvTgk@5O)Tw$~yqdKGV=FPm{OQHp9YV73{pYO51q)g*Bp409l{H&ws
z$mS=uk`ULZnrjgJSD^A$bbmm+Kq}K5@on{9A&CrJJ7b1>+Xgc{u`O>lvd!8_<2ddT
zx3y8vx7F9>hPFch#y@^W0dlTo5H$XLB=(N!8<Zux18q#W61HeXZ3AU442GS)86>O7
zw~hGXHu&5H9a1jmCHn4P%Tg1jE^eC_tAFk)%&fokI96x6yJ%E6P}REsQxF;s0KbM+
zW?x*MNuABr(qiuVs{KC{DdNKG29OFFoFNFx6a~Wd_@-ISPGZd`liQ=xPt}(i94$$!
zN+GE(G``b;%}r!&1`-lY6XM&aeDBzT0yLiNBNAJ3Rnu10;8d;yBxPVdB`=&lzMdg4
z%AItrMn=&p;5xTN*!Rq966spHymj^DCBz5Z5*0T<$h_I+=l&mmUxlBai*(<ts-<U$
z6UNkcNHy>|OqLL;RL<)yCoIycI<hBF^e{LC|46fZp|w6OcC;q;b}{nCvj4Z`_nvd_
zS-q}!-|egGLW{f}_!omm3uzzugvH$c02tH8o|7{o{!ORQ!o{SLFG0x6tF%!{c;Nv2
zd+mqUOp^Sbeh;;MZ=&U?c!!00O-8JkFevjZcSyRsT8>}$%<eSb)w(H<WjBMwI~)fI
z7y^3gM<zIu_EphdM_XQRJ@9C5i&rP2cT88pX@o8`JVytUe@IZ+faF5q;~}*AXT%j|
ztZ27+RP<Apa7HcUDa%{(G~CvT?AMUt)6TZ&p79sGWTP)!$120jE|ipIxh{$0&+U<E
ziB8&oQ^0l)Cr}om&=FQC%7)U?>Bo(CxeAh(?!aDZBj(m5G*<9)KSNY~zOvqGKKJr`
ze7$)Bh8~rS>p|LAFz3wUX`AXn7V;v1mdEFv3;mNN*tbo+R++rIi+4NbmBF;=bMFv?
zsw5;(157pKQNxJc%@T6Y(N-7a4Efy1Xmf9Jc5hnnzE*uPEdN2g#cBIRt(!a&FINdw
z9a@7hoNWBIHlgIcc0e?IM;i6od3iPWI^kixQ?3V1LA9LAebtd0OEhT`sZ^71aqnO<
z=)bZV1Pcd5h^Yrv^Xr+=$uQPSRIeoKiW&!vuSSJn3hUJZ$!%1G=SuBEtYkgtx0(H}
zt3$>>f_V5MmmBalY~#=;S*Y_|+O16KWm|h%SDzS5$-Q85lyy-az9}|Ge6mFkOn(qJ
zXp#``N9pKo#vx{(qRAD+K_28foaE}@FlCRV7$zc<MMvMur99~biX9zw;gektjApcm
znS2JM{gSNbT)(?|>i31w?+0;IQ6VX2B}kcN3OqjJ-GaEzUFn_R_!k-+P8%#$<|BDa
zv*EHvNt-@S+LaT#@4a^4hfrs<`36}vZg>GJFspC`P2HHFQC8__cX)kxeZ!_BC`o;Y
zQ=@#=(}<d$B-h~`TYFsqE=!Ed(~iy%X#*2h^}a3#rl8f$TKgRG2$G;(4XT0)l^1k7
zcR32|EUn)r(x{(onTI?n)os~VoBq`ske+=woJq*uK*6G3!&cC@goIK<cRCXrN$C}g
z)nxm4Q)cnF(WZm+KB23wRNRito@S&R=6gWwDt}BBGkIXbtAg`-r9#OuDvyS~#rR^J
zbW`O<SZf~3$Gt+%j%67zm5?-~D)2Y_V(kT3$`{-H?NY(&lBHFDqdx^Ra`qMdSgsuN
z!=18DkEI(r<___W62|YtkZhq)zj|{X6~6I`cz>Vr28VlavKDWHVYFLQIZMH-79M)X
zxh1k5%1Q5|P)Ha=sG3alugdv+Z(*o==Q2Fh7-aH5JeVNPnfIa`%p%-1l%x_Un;oDs
zHpqCiL$m*x=Brb3OBnemNf7FnthzYSytNwo)cvVTc&UxJfrVbd$$@q$tWiOP(%gk+
zU4%l1BlG%O3BK&p^cm73D*^XjqbFEw^OC~0Gp&BEc4j%{29!7W=eq#sG6d5RHol^D
z<DGl`vJ89aePLt0Sn2r>U-#UYcWqn_qO_i;TNV!&igp-4<XD*SRuga<SZD=cQ}$1`
z{uwrM^ub7LXmS|6pre5Vw3Du>G68WOlD;c8`8=>HZEXKlYarcHHfDc>d{C)`x`lP4
z9HQ(`Kd*B1@0Ycz=N7g;WHZ>yJKD`F+F*<s6fBj$IyteAfuONeHLL$+0I{)+_o`FN
z4&Ko!*AY*UjV&1!y3XgRhhlVqo-%k;egPKT>as5FQI8F>5_lXyFX%x*=$#&Vr}zN7
z9AQUh>Rx&~f2H@VO2G|xe=dK-P-z%9C6H4QO?S%HM70r5w1H;zjl@6=(O)69?j~6T
zMV29Pxb#_0nSrZni^;0LH2BJ|m2Pq5-B2kxaQ0lp@M%w)?MT!U21cKXz|Bionp4$0
zD`pF&9gANx>adua!E$y{0Br#SvRP-uFK_5>IyKq;$f;<s`DNUU9o*8_nbD}bH#9J|
zf_CepAmPT)I(Y?T;;w~0eHLs%vlElPf_7MVNjP5Cw2QxDS3c`+ySW#~N}{Sac{~3p
zub#g{JT(EmEZr50@;7q9n>UeA(uT<Bwy;uTD96eP8Y>r)ux(P&4i4&q7)izG4-}cJ
z;QDx!CR+&?%MMH15lpDDPu_Q~54|<rpqZslTu~P#d!wH2T|u?<j=!iQq`99C&jxC3
z3-q6F345v+%+5+vkZgC$pXDvF2ZzjuuXqEi8;^~X!sFhkIZkbM{Z+tht!N+MN%Z+k
zr6EJE>V@=HA;yMIlZk=J=6FH4o<*Hr&N%AM{t_8kP6$t9Oy^qvq5pUd?$Qz*I+|mG
zo&rH)L7t8~Z3dphpOg<j$gXT7w75Zu{AoS6$=8!G)SS`pxdz)aEVyQug-^<RmF6lw
z6Q@7{a&}0Y<D4~w{TGTP>jm-GsGW=*#DjbNZ3|7Nq!i!z`?alx?^K#XmMF<dIJhbI
zs`{{>=Wfs2Hyl4S$xhTip8>tn(WfNR!io(YqzxO)<xzR(<>s7DYHuDAioZbj)x}NF
zsOqP_Ss5oxtxq6XtG*p(N1tIu66TU3?J}hBX=Fv5D(zlO{1WkWusH3IT^AueTWip#
z4xJMM)afQ>(bA5AJRKEgy<!<tS3u*a=#JDZaT-6JaDa`K!AhX!*Bld<bxB1(m+n_@
zimiz?-x!nzuEys4)L(Oo(pK|Fu0AVGdbHze--&ybekAy}WsnTpC9=hwInAAeHdS*>
z|IGkCD>~+YoJ<o6O~o!2F4{zcyfP=n^;hN}CzoaRsilT*@iQTn<EX6Fn59Iv6d_o>
zhTmc2j6Q~S6V6J*wud*bKo*k8&drlN=P!NZ*MapI+N3IC`^=}O>_ZcDG8gORA~B=)
zhSb<)O6wEovzA^K&5o(#nf2#n<}2zMJILZFt6!B0{^0LqQOKs~*@;<v@+;)4zhCF~
zTj?V&(QP#JKGgd^&MuQ*EimVZm@LVXQLJzFL+Uz3J;o$5y}t%L7YCe3GA)Oc6fXvL
z5v+xha}yi&q!Lm!49Glvj8ue%r|a0A2wB6rEi5&JxqPxGOTQv#W;02JY{{)j?-I)w
zGOw%}W?!#vj;IgQ{2B=C65^I-4bJLcw0PWk^F3x+%|>&?)FVFQp`AVgZyRgA49ipt
zD$D?^N6`P5HPSj0eF)wKXA-X@S1u@*e0Z=BML5piDKwcPuv9X4Y6kKiBZIp{9vm?V
zu{D+#Beir(ExQIE9vBy+t*LYz>m9<(#P$XU-*Gx-zT?y>!IKZ34RBMZiq(DGV!ivd
z(oq(pPu$~{+FAeHj{T6LqM9UZbv@SQ(ucy3CngL%NExC-&)<0rm0|!9s;SOo4B4jc
z`Ri?k9}I$7S?=9w%_CNY0ANskuNJxfJTW&04oG>NzqlLTwG&PU+XRr=4C+)r8sW+)
zQb*qjv}3L-{cdkMMu_*T(V<|Kv;*BLW?L1F1ffM1w;NpR@3R(U1e)9cJ7_PfU^|KG
zfx4pNdyc*r;wIo%nqbE61zX%$d8W*#Fah`QeO+=LsNmFliD@AJNmmeUeIX)G2>h9+
z-}FpxcR#fYNfI{>39{0gzwh{QcIsEbqIEC>Gqv7gx-y^OwI$*nR=Q%XVygMd*7ge1
z(IuW0yZZz9lM_4Kko{Sh5HK*K>tBWxK6Sngi9#@uE7V|B=4noTGlAUt&{`l<J@4zd
z1a4c7@b8(^AePYr0GqO2$F*w8&uWS$7`%+S8!{iIbU1FDX5muPYV^k#Q#Nr*PCL3L
z9E!GQS!Z7*qSsG|;+A<Kov6E(F!0)nZ2oh7uG;i?!2anj0ppZWaKc?fmM-Vv1;up`
z)7Q;VO-uW(<wV3jdcZXch==ZTX+hu><bwk_(WtrMfNbg(dvOwk_+kz8{dE3*p~Is~
zPC4fVZ6i9h?$A4Bn7h`a;#NW&6?6R0HpU$ijFUH^-VA)|a5TjWletcW$JZ!1P#zXn
zs7$R#XeBTe)KbdPiPV3oK*&x=-xD<9&A88}FA@e7H|E}+PP1GYS-nM=-e8=tvim)R
znHobM?tgZH9D*^~KGNEPG3j9D02W=OGF6Bq3AMBDsXEth(9c?~{-GWStR^@qVf{tN
zxt<fveV##vHTq)*M>YShMs^EvPg`J6Y`<R3_8RjMByn!eSM01OX$jExuLkTkyp7(w
z(^vc+lVnD^c%r8`m<GxDrn%3c0cgTl*Q(|JsfDm+uOLHC@rd;(iKu3eyO24fOS(pz
zH;Lu&FTssoRkMSG^O|K6i#b77T~gtfDl~5K17dA3xS|4(K2(riau`xU<MKiYj5lNM
zpneq<wDN-pUtn5KDuzR@uQ(~wqli+~=aI*}4vkB5=&0x<<+^&<>RpMIWNKA1Qjegq
zsW;qo?^!DY2+~+b%pbRlb_R&g{PJA-Wob(|?^u7I!KTzXgg<!E&*WEB93zpeF4OK&
zY8O$7ujNyTKK_9<C*uGDdMAfKxTZ1_sj8KvX;YBW?{&vk^A1Na*L}ieFb>px;oz9|
za7Ir|BAomjX=y>5QgC*l60;bcT!8O06Xl;bnL^79_K831WE-fCd6wsLLdxfCVO#=9
z&w9cOR>y(0H78hMPLK3~SL&?`P7@aa0;1-5EfeOq630q}ueVZ5G|yBVU#*<0ta!se
ziN)BcVV~J{@75LH&1(S`43MJnGsW*|I9t1^`PDS-A>D-l+wVVz|9sE5>9jpN>7RV9
z-@G|&`=zK%#)TFC2#$hnvtUscngVf)UV53Q^)X*eM2yoVgUgty301z%-{JO|t6b|y
z44Ivs+;Yf%)GDG2^I$hD(*kKeKgds9Sxk@mn|n3b(lBY{Fcwo~V@ojJ+_WBtjVh~b
z`$%hS@{;Q2jnyP+l=x3){(}Xe7cyp$&+a1}{?u4n+qUqc=-d`i!r6jY7`fH2?J8Lh
zUs27@u!|z=2<ejGUTnB+m<=OSQ2P2YF%ava6-xIdZsBGtgGY&Z6w%B_+MWYJNj|@~
zR@l2TE7f4efCeQhoJPRfUAAgt0WOUsY*-iGU5k2NO93m=s@{nW=>Kd4suqDk(bbS}
zY+Z7$*}GU;d0r;@ZZa#awv+c#x!d-w{|nM`3$W69&k?+caZ0!QovD#Y0Jd;n;@Ak~
zUfso$lNoykJiyvUct52r>NIV>C$w|7eKg_(ki6@2xe=EA$4jiFMjU>OUsy}J3A7+3
z-zAB(8{xV0iRCf=4;6A&=?5Zv-~`Bv`Q}R;--fQ{-mw8FRUx;l002<wU^oJJ(=7?K
zAQCAIj(Gxwr!;Zv0e84PrE@fO2jocG-mIrHQDxrG79p|%j$5!cSkg?tsuPf;0T_8%
zf{po<Hy>mJ`Y$Xw;oxSYX(d9NL^lft0p-D4@vBXLv+y{ty(hLB5%#qMuB)N<Is|bD
zM;*h$Qd9+@_g~OcD&ulM6v;zsD$?31`YwP{BVF;8yih$PH{mwNLeRgrmz_3d8y6Z|
z2^~J%kNH^na1*XzJZgShb-0n&3H5^tG_g!hBo=EGT0Ydk7(b&48Ulg{uIm|ahZ-RZ
zcv&Zi1xWa4oHzSZOlJ37T2)!^9e|cZ+$g9r0vw>|?SZw^LP66bp=D4t#8zJ!KidE0
z{rNo94VAwG?FYZ~>v<ViR97Kdec4OVDq)C*0pz<~aq%R|>e5pv%EXDlO^k2dI<?|9
zK-Q<#7({70g`W(&AaI&Ddef|$+Ey;3!Qun(Sgy2Q7+3)mwXw9Y=0ZjBR7$81OM3&(
ziK*f@+P2)GFZXE!o2da*AK-#9G6HgBhRr!rg`up86x2#dRrOI+P>2x}cQ^-@$nvN7
zy$FX8?1Qna!$>7<0h=e~yUTc83ap4gM>-zX+$(sAV$Qp496=)HGYq7*^SFHv;m^U4
zb~;<=C$eCK1w+U@3~ac651f3NUxER(;V?;(kEkBfiyK~mwsRLp@(|5xA?PnGnp5*q
zxI^&#W2o~<{ID|EyHBdKHRCnV&em)5=t<%N9CZV<t=R;Cg@)Q6>+A+_X8xuIdPM@Z
zo?&o-6-_JeE$R2yLo5m=2(IyyJuq2}0Es<;3aD-+rH3foy9Yh4;#a3f;q(JzTRZP#
z1tB?PSRN3xCli3>$^QUE)xNcJB3zwpGfXuO&=WJ~vM*HqYUe**{SRYlh|oVQKIRP(
zen5cd<j@voI|Sqb<@YCra22MY5FGf3iVUap8OJwp0zaq=$+CnGfY(|`YASH`Iy;PI
z{*a^vtT`y;kiq*nNstvC1%t%$vAUcInkQfSTm3blKX4dfdB)rQ#DF@873?0_zJqAo
z3wNL^*_%ofkjn>TnlCtqc+vxqr2U7J+k&hF8`nz+vBCQstS-ipGK>re$bhw2895OH
zmIS5_7;SApaCxaN{_g5AjxYueHg@#PX?X2-u=uX4SU>Y$mMJ5oTjJe`vjAiFAK;UO
z+4C8X&jWvsbVZ~!;)X~SX4O<50H5A~sFU)d{lA!paR|b76`kz^xI*aK@5|q(&s~6&
z7f-#0GUIW7^bvxAwqi<%N#`38?R_s#1lo7tTZM3S&%x3mvb0ve77o2yKaG9hItP5Q
zcu3U-S{)2;zp=EWU}%DBz*kbnwF<Y+W*Nb<sBj&iOTUA=eSshd2#`p-pINT|%VYd+
zW@+=O;f=1ZM}3cpx(Gq{Ni;U&1WDkhFik&ggsw4(8{UMun<(~;B+v?i?{a*7bj)z;
zXI1j_>a?Xc@Trfzimf92BXv=mj<dNuE~ewmAQns;ueOG{fK-4P;DnGbo={Ib;e8ig
zp7gk!+<OB0_i|iDK@N*YjRDJNXPa?{A1@$gcL@-w-w_-SP3r0w+{k*tXIL_HEdxCM
z%332DP{-}iHooT-RNiQK6aOe`>6jf}4p2=8+qpon+rKGhH|@cBo}?1rK)~~ck?@#2
zf0hCvZN4cCLc-_K(Ao?vK{N*!2vcJY5ulKRG@3vQX04DeITm6w+BP1*S&Gwg!<&pR
z=9VLd@L%>4#}&+K{lda32ofo6Otj!eUM&ePJcj-{p&In&HWvn;Jh;8Ny{r{Jip2?U
zKqFk~%?RR<>EuCY(j8k?BAAmOteM(5`1ihxdfv6>IDs^8RwXEJ^CZluDj}@`1ut+}
zo*-Rug!a$9%oR7}MNLy#U#Or$_plt3sk8u?drhv=@R$WcDuO&{4#$zjvndjkDiT%v
z%cT03GOY!tN?~~l8v%KUxXW2edO4XMhx1rD5|C8+n`N5~+miO^w@bLe63Ld{8X&lk
zaQ7-Qtk1-AYAb>9nIEv_hAsN{!lcZFyuNKngp4<Dn!@pL{6lB^AJ`qPnui^%FB0_o
zaO@}~Ya~~8XdR;H<t%@{M%FhyI0Q-F;&do=n%CLZZ{>bwF(UJ$2$Tq(9Y&4Yu-`$~
zv`-c45Rm(byLk7*oy`w$Inpj159&Xo1a^k-iD*xHV>V=g56xSNB^rwJpw>{4aCkA9
zB&Wg?TIw(inB)QC|0pWRWEYE9YQ~1a4y>M_mIMTX72}ig-J=TA=LocbH&w_G*K;md
zIe9xr1<(&({yYt)!8w^<kp|?mc)MJcBD*bP;7Am{er+CXM-kZ$8Z_Ycf@|I`C*@yU
zTj^8^1I`zIiFqsHlZKqTu(P}vlH&feu`RgHKlUr0@XWzZ#{9s{Au5vJQeX0!I0i;l
zO-H}#6(RGzyZ~kTm9S%896Ql0oxk=5MVsQa%WhU0+U;vuyC@?#hJ;XWpL$rY&BIik
z@HT~~P49NlUBB;bEFS5l6et!nQR@@g#h>K^#{R7K8BXABnU$I>pI~*#G?zZkP}27C
zabSp_Nl#~nK9N_0)o$CQzrfKTV)im*_0(#89vS`U+s~l)y!D@C8Dqh7dVo5FJ8z#m
zYK=%w$n|xSqWL)9)>sYX(qH4t$@n7kgavqjP;f-F7ZOv&FMMjRkq<N~_3(HSotgt2
z`|IxB8AkP$()?|F){m<ge_b%h9VY5<Th82F<M*`D@3Fb)=g1e7kDnzH^%9GK{TuZf
zE$(g}9L^y6g8#Cm(6VHP;k!#UWQ1`q#1s8GGrT1ePtfW2WGKEe4-RE5P&8DooaUjY
z5M;G{A8RpWOS&;$Zq<8G*>j$wEH3<kUiXsZv)U*)QB*C_j#u-+w73T@ajNfhbE0B6
zb0f$e^$1NlXvgLbTb>^no;sy4Y-kwarn&KZPEd80J~8)8p(;n~bk3g8*KzK|M-y{<
zT)R}+yA>wUdQQO<RHigaz>IkbJlV@md(2ecCFj<u!n1Se9wA~pKH@#e$-iCfEhb)Y
z@~e3ZD`{tl;g@h?jpG<svh)V$469O!w4uJ*xG;Wk7fDbp1<sJLbqPQ^|6I*`G(4Z@
z-3E*p>~u%K@|78~3q%7`Ll1{t@2=i#s&H2E=ZxhVg~6sjLugUqE&JtiLOk?vdRQS}
z)yK8t2f?35NZ3`rQGatyk+)&D##z`ax`hhB9sB1m$N%&H|Nj3=<Fn3=<%?KZW&Nrs
z`&%pEf6|Ot+tA0MBMC5pb3!)P3%RptX&TuCvH!#kj&tKTh1TN}T9Du-sI)sl!ObQ`
zZlw?v`B%8!F{71$JO}oJV)XkT*e8?Y#e+!$>=+I+1%b0Y`8w`V5l}Kr$(;XD?D;RM
z!hR-IypJTIP>DKN&6i2$)$G0F&;KtqLENw#MHoyX`$El^Ze?x%Eo<<0ytEF}C|d1r
zUjGLOV`fRQ4;oIP9Mhy%fAg#?m;Yg4enCI~o1p7ah4V6hl7+~~-wb_A@FGg2?jGCh
z7vc{(c6<s~jG9wRWLKZOpqsZHH;)C52=yoX;^_Pb2a7*Kk2{7akkDWUr`8`QfxV0Z
zf0+CJgR#_Xm+Qvqaz_^pE=Z)JS5MphLi@75>GAqKf)#4|n_+`IXDEeoR5f44u;15Y
zz?TkI(X#!bdHO1$56P))4DB7I!ZD}E0?1KKOMozH&mMOu8RdwV%g^Ejx-(sL=;Ga5
zOc~u@Dce;q+#yXenV<F?YCL8ci^HjsDt0_|CxL>TVxiKzpQ6LuEoCM0OKLmfqs_1o
zg?hI}$2_(R;y|yFzrkeo4?`~DZc0I}oGm)PA8)|-noKl^FpdOct@#T03$uS8rjjT<
zH2kjlC52tX@Qe^)-y(1EEI<7EJFfm+?B;MDPU~ZZDkO36+dM*Ax-hqyg3=oO(UJ$l
z5NMnf$;anEjI>i6@x>St_1<9}vn14xoVW0I^Cz|`a+D?Z)ft@ZnOV`w*Bm80kKc@e
zS#v#ey#L4>_MO(`C>>J$5<FGt6B)r9WlRKbtbG96^(za7ONhVkeHGV%&$sa}UP<%z
zi72DKd<Gd>?JU!qi<u0_$X-_j_v!kubS8FY9-y)~%Ph~gH&eJvHnb1WT2wsD|1Cgi
zc@s1o;Z@487G$~<&&wlylqAt|JRhR;z&}Uu4Nnda2GmL4IBBRLGpF?ymV!tB5>~5+
znBe$#s3gN@>I5-ZY9lhDGT@kA)_cWr9qEfTsI#%~lIn~~1-*0WbZ<)E-m2vA{?!5+
ziKcIrt&o>)XMkU5du**waLUO!jExjWtyDofEgE;7zt4CH%i8)+-w|GW=32Pdskirh
zwM*@s@U%6)?#vTtU1JZRil*bWKQ@s}aL_Ww>gGUr5K$P?h?g+Yi;$_mOYuycI!a18
zCf>(nxBUdgJ-DJ75jz88S+0MC-R*(Q`Vi^<ha8M9`t(pJE4KnBGaq|q{6_^uCPM+0
zrI=E{|3zV4U)a!_-zY%&+NnoM(g@kqTLY$E!&)CZPA0@)*)~p_{;rJzGX;M=e_jh*
zp=+wPMjP0lN`plmVBH%=w){B~3ZZX`rB)xbPMtLRM@J5#4w!b!yV2NbCor%0fpL^x
zr$P0h+FoAE7@w8L&bJKAH@OZv9&2rW;=?Sw5`;+s;!nK!Z_881@YV#~z<RQ^rtq>v
zduL5#=x0F{kc1*A$LZNJQ4460+%e{OyM2Io--vy|x%%=uYb&-Bc&CRHQwRfR)qo79
z#mLTmkWu4ght;Cd`maL!tTq8>D3mk+nQ%Q-`cLRpR2$AU@9C`@JDjJGiFl51ATYN`
zULtl9h>{>Y@P>FS=G>iN*2}g+Y}ia06iDJ#tS}&Yio;0v&GdXog9cOgHijFni&U6w
zCR5Q@yM4Q5D!~BaK-%l9(CH5ecZ{HG*Y&*l)1+U}^(^9B@}p0tDU@3c2-~r8x>Ht@
zeqQYO9u09L<PakD6SIJzd*}QV{?f|T-mV;R_c$@QAJ&@6C&RBnBT?a27!YC;Ucw%+
zVBM5I40~$?4XFA2Xg>1O+^@k-R1EkH?6xzPAoOEUdOb&ozd{&U|5khbNy*)BPUl0y
zE%}@w4JYNDp=$RcZCJ+^2<1-_Rpof<-{eA&9ef-7<ChvLGJvP0Q}Xl=!+*VVc86KX
zUDAyC$y$bmkgp7R-^llPoJLMVHF>zaGK`L;Mn$fBq&ps_?6}SggTsoG5NrnujLu2?
zh-{|H^4tYb5#&90N9C=jsu=@={Dgos)j5J$@ZY=NWmpl?UO#toqIs@8j*D9Biha>6
zMk1oG9bzywP&xS=P$R2CWluUE@pW&He}ljRQ$sc<{Y+d8%-B;`FZzUc#Q9!d`W-<A
z`W*ed27TzHQu@G*ojvLm$p=bq!sxqv1X?7KFM^cAUhPLKf%hv2F{8tUImihGYZ@xw
zq$7_X&vi^uS&1gqzOuaR7fDQ>Ag-qsdF1h``$J@TO_(o(<a`6LH*3Fvc}iLlP6OwA
z5fvo*08a*!Xc7;`98uzIAprMdiN9jG$r8evi6>u~cWq-nr4U4vSs*|n`uZDUk2{8A
zXXR;VolBo2vhcEaJQ2znzff}Z`d#mG<!j1WBy~hrTW;><sSwlV|A#jH{;akVzso0L
z&XB=?D?K+;MW3OdqY;FRwep~iJR2-GB_cOA1)L%Kg_M6Siqq~x;Pc{{GW~{DTQF(C
zcgw2xHz0ZOT3k<w=vv|&bSS)@zcz6`7U8X2hkgBvHQ75l@#M27twPvkE(NZC`>hMI
zrk`=^-D*LGp|G6LUKQ_7>R7>-_DU1W4Jt)HRS<ppm+*WN)pq`SifOghs6g-zTL_7E
z=>V|My;IKfQmGS7d?x8v!QjcKCN0?A=pDo8LaIs?Lbkg$TaZPdN7Y3nEudwCzI&U}
z>-D3dPp`yZB{Kx__$rg4=Papm`5c8m!dT}+$3F9i_(0wOBR(o%n(0AcEm$c$qFSU~
zlIJ3F67`eJoQPKODto|h{+<42^_-dIqm`|Pe6V@`tKD(k5Hx4#Sv*c;7d0n#{l$A~
z>02OVt%h)7&M7nso_^2CQ1Enqs63GpZ1FlQ+*%4LdpJ|{k~s<Fu3oVa^ldDs%i0nE
zxFTpcqMt{8I3R!jqqj;^rYidmPkVI^a9O{?%A18uj-o>Hl1>4OA3gg0H(Y7PzG-!c
zllX8I`PeOpKv~@5$wj>;h!4m?x8%Wtiy37An#D`szsfa7O*1w>6XiG3-yg$?-Pkxz
z58+(1&muSC>7}3E>P#sOcno833z|2xFdBctI}}G{-)`W+`4B2|!8t=t;1_#GiuVw@
z?X<lFJ=~VP503Kt6c9lJwwu9MLqirtOlrQG@`&}z^F&YDw+bFiBR1+Gki|R4egngR
z5$VlOfAs6))mRUSQF(Q+86?5@<>HS`9Nc0U;7~;)?@x>y#i!7Fo>S4t1}%|nqZD}-
zUk?s=E8d*vw4Uk~d62pK0MnIH8jLhmDftn1e~PE%K(TgWE?Xg}B;D?`vAu8b3tS&V
z1sKPdl^v39ovbxYGCQsFy65CGdQJL3H2t_V1Y4+ghU7DospjvYjPrqDCe}F^U~Htb
z=UXcgX-J;CgQ@msK6|bIn*1l{EFo_TI21Alb1kJI(}b>zDDX(@V<0K`KF3f|1UaYh
z@LE<lZ4#_~vV<FMjEB6xnec$wyMxhv<kbo9-;n#u7|5`cLLMEMApgQ&ik65luP^Ww
zdA0DA6-f<*p4K>bhaavmeHG5&pOHVK*@Rw1wh*L#RVSnR0F`-8@#+<A)3XWY<J>qR
znH!QD2$((=W#bcBcEl)StyYCE>;LxN-d8=GJ-tnW0ayaWFjVLd6-?j;oNl}l%6m~-
zF~wqp>NWWNfIFL7zO=HuD)AwRHSz2<7xQib#XjBv42yt7WVKEmq||0%Ep@-owSo;J
zvR3O$1mpBy<T`7)y&{!JQt}ls`R<2E43MYWS|>sqbeIPC)cqmlf{J{Qa9{dd@SSA&
z=IiM@9q7{OcFBnCk1V`D=fV`gpYe7)Uwly^HQ3|c=S)@|(Y_4fX;Zo7o_T9AWYt!}
z3sy>#B8lU&#ryhw8e0FFPRz^*%1My8F_?=nM(jc_*+#3|SxLn+?+saG+~XcMGW%1o
z>q2tT!T+}k>;atD*X`0Rn2CrozGaWm=wf~K!+T&&M0tyB?3qIoj52bKZZ0bXk+Tgj
z!jRqOiHT9Uiy|g7U;S*WN|PURQ93nKF(v+~pE{CYvzAM4*_%#Hb)<?>^JTCPTP<fu
zbj2T6y77qvRD2=78C*O;k_ZU}45<g(fbA$vcFZ~H)a-fA@u(pKJ*=0NRLg24AA4KA
zuiU8MP_9~3Q@<ibF~a8kBH)+OA;%@365SU83coK&LU~7H1b*@Qn$}X?>4ImE?drNC
zA@XhY`A7%*ycY=?5Ey6`!TM?c@ksVrtpKR2W3&hSK9oNu8D>AqH#;Bl2CGCUNn&1}
zHB?!cw>l<dOZlTLmg>>V?>bQdC%i#)PU<bJ5_*(liqyEOM$Ey9L1!2KZd^o_0%rHp
z_udbYF2g4P=srA|^J>wCRmM;d=H_?ty9CH1)@Ia2@TbTnQ*iJk<i1OWX`xjjGN*`|
z!pqV)S$bJrrR1XB3x;V&dH0hkpjUF80XCKJ`;ywY47B-6cIoZLSrQRAf~iQJj`p^<
zw2}M^TKoE2YoYtEw<l?jmZgK$g+}TpbJQmSxaf%%!ZC`5&*0-H`2*<)<1n?UcIj=l
z;V?^e6>WR>N)CFDrJ}DY31ArN!Z10mK_#|=m~>W#fa?IJu~@%WLVKMLTF`r6%>!h8
zl9%n$ZJ3F4t4k=%-z&3Yy+L)_@hXl(J-)~3AP;{!8}H7H%_(;zBo6vGbf#=z#>pOq
zLjj=z1??@^K>BcY&&$TZL>&>tdWGxcXy{`lvOvP~Ti=Jsr(q!_GOen12}^eL4Dom0
zst}`0z&e8q`l8|?zHD6!UP~SD80O6jw2_D?i(%oVpW_Y04m%~WIZ@OUJy%=LgFxS0
z*S7|Y3>M`bEse=g2LLrxsD=z?vq~GTLBiB$Ri)${+Z?HL4#s|j=`U4t{&AWpWa@Al
znUp?$u(CHAC~wdU@|NwR%h`#&Ed5UFTS<#?{?AwH#*m#t3cHqV%w*n#cn+6B*(*3~
z!>8BY`O?uZi3eh1vgbszCTjAx7+Pds*T2P+Khw8=UXm6>HRM&=a|&^G$+UR8(^>>>
z+mPzC*EnJl)<(SkRxid)zxwTg^1wb`pw=_TvB|t0GK!Vu$>vx4cS7B9%yWlKrz|Kq
zQe$v%9Uw#Y`+n~RFzc^#UCG=tEK*=58j%TN82PP+RTJZ<Lk1I5*YxY<<1&>kZDV@G
zIBO9#uUA$om+0~@dq4aVQJUh<!kcJe;EY6bCsFFi>G||1SLowYkE;QPa0W9XPQQ}#
zJrptVhx4)+jXYb*KpahXB_3Z>V=7rkWO*8@pd3%i9S+j6R+*oDeMMT1Cvz!<2S)~x
zYLLFc!r*Om{;N)K$;}I;i=~w=%tS~0zNT@Dd1{E+izh22TPSruP?Pt(yCRw~JnIbQ
zA%x$Gs7Wj88g@k`kpCtODvBo4wo;gUeovJJIh|k|G{WN@WW8C9+d^h#!&As`MASgW
z{N<6`JNoPYhBMY$hGS6)+c9R*O7$cO7X`A1ktFU3?>9K*%J_$z|6l>Gs7I1@k+F*K
z#k3LqQy&rp%^p&liHe1iRa>vg@#>S7X}|ce?KO0VRE2GjRUF^Wh;qry;I>*67*`6|
z_RDUf*4u_ZP-xL~-W(rW_XG$C5QD;HCvAS8QLoKiRpL(KRsFSdW|R*VP@d*~H<*Bf
zdZ#lnnAJj1kgo&LK-KHoM~Z*{NQUB_E<Z1`Ad!}K+~*mTOTO|2kySAr5?9`uFV{P=
z@4qOqY55DTEZcmik0|B;t+r;ETZ@^HtEPr{N)S==2g!A4!|VEcKWpBTcxG3$hbZZN
zEZZZDdjp%Gs|`j5M}t(0kDP^!mPUc{sKalN|8i=@V`&QA<RJW1$;>;?=LCI`uJki}
z+i<f;G$tGO%F;J@v)esIt={vrHE;OAH+lw(?!XT42jP`c<6snt;_e@SalSmHj*)oz
z(8n+LmsTE1iPM>zJaC@$mp+$jlX~k^2{NYn`rc2u7U^CX*k-=B7f`jy=nA#G{qO1P
zeG>1&1ZCdIH3$7-_l*N%yY3ydE>`@qpEEy`>q@_VSE#KP^#)OzZs2@W{jVtL&#wp>
zs094m2>;c7@n%Hr;1Z3y^L5Q_UZGhi-*DiALAfAL3u?ss-)a;uCo0;v<b7<}T9Kw_
z2vT($3Qhm&T-#D(-aJ3F>DpH84fuvQKw3X2x5+ZaCimB(xyl4>s~`8|oEF$Ivbh${
zSN;9l4?g1xL}?$fYnTjMFbW5#RL-2A8h6NXBTLA*+R8JYy3zI3F=&`$mo%%)!^D1A
zhF)&hoMnIkyZ(?z|G~EZ`=aCjxenF3mU>F9jpMif`>+3Vht0pQK)#MJEf~jYG1xrD
z-%Uau`lkSe-R%B*mm+v|JS%53uq831`G0@le{P(?_Ui9H8&1N35+IO2hqcUq8*>u%
zPX+IPUV*%e7~F5gN^kZ5XTtoS`shEO`}@0bWiZ&27qeaer{T!w#s9-Y{_h8B>mgJk
zwl(b22}5MD!$#ofkt`A5cw^x9xQ!U8GEPN<DULZsPj=qdAH&{u@lOx0BU(r4391h@
zE;T<7!i1-hTV~V$&?@0<tE|2fSARq%dC9w#Cjq&U=AT9y21Dn$OA~lL5`8!?wkno*
z|3}yP51)eqZwC;(I~k|?->AY_4;|q}ze`5r(9Jf>dR@4>|AEp)d20<L*opV#CDH1X
zBkLJvo4G*{F@y-=-+}bbg?(2~zPw`yC-AD=2=Ad+(uBD1f@aP#cRE3`B{~XykC5nf
z_M&Ee$p3uJ9JX8w@0n}!gj!qX;ghG_^}o1Ng<kcg(VL5X9)%7rCw%QNn3@X?^FJRF
z*MUO`E>M`)MKBpfR+}Pp={HdfsPN2}G<v*B_M<V5eoWIpq=X#1Qgg8-+T$6i6^5c=
zbpfbg&W3yJZEmQXxh=<)Ez4OR@TcDVmFIA8A2w%%{Z2ZR`$6P#Kx<1>)-P%RXK-lf
zzDfHChCM&?M}2DkE@(@@OUd7e_X*ms$@s=}wv&zJVfs=xe)WIi*Ha&iq1w3b>?u0!
zra#|b<@vNqA^`US)OvYFN-sOMtP;|BD8|A$8%KR3mow5!i{<4*kA7PGj`dff(R#<4
zs{7;Xv&M1CTRU7{$>R>;+#98dWUti@b}9*Ews|)6Q#}Db5g^(XFMjxAcseS6a+ZKz
zQqT(XS=T%%4ZSL`T}k8eg+@_839F|}?p<3>)=6_x{;_nisP1F*n+Lw9K|j>ZIeJNu
zWF^PT+2+O!+a+0AX_4LSm-2=@Gt{XI$e4Zet5uQ+$0VKB%(U(M<u59#(;HnN%ppuu
zQee)i{HpVI;)gr>1CwT3+cylv`!-NwH05AAvmCs&tJ2KfeawVC56^tBJ;l*2m@!Kt
zNo@P))FN&mb|%?C4wuR_I<djcqVJ2_%9_E-ZMSH*oZTVJ<C)LWD`t9rgbkgP4I91X
zxRf)|d~+qCg4#5X`t*NqBtm__TEb&$Yox?VbDm<#)<2`Im~p+%M|?abfDm0yiOfu}
zda^J@`uy2F<NOG*sbBPTtA-5KYHZ^IqIh>Fo){MBdz@ieLq%0rH)m#P7VoioWD29-
z2ho~m*QB+I9OsfBetUPHk#^<Tt=Wv%4<>eXX>Trfu1s(q4!bhGT9Cj7aLVVbS#h=3
zpXu`r-Siq?nE&kgAw=V^E2~Ea*B&sxRBbroJfr6E^ed$__1hqg%vg&~kK!1sQivXk
zqwg#@+VoT0a^JQyR=@~Jd9O<3>$%gi4D?X$5$Naxa*6i3i1+@QRxkTl=dSBLwoqJc
z{%;>#{ok~Hh!Az{6+}g;zb|J$FN1NyWB1x@-J9p}6xHEKn;zkWT}99G?!tX=G++M!
zm=mc*f|ggDieo=FA5&%U^>~#$Wil9{?0be(X%c~WFEvD_N;7Su+`#RZ)u-6<?|c!u
ziBdO`Osq8;ixqO5kS{jt<ISA>^*W<OhwmmgPgQ2|ng#;yNfr1!F!Mxu*pq&O(&wC0
zpcJNetM~Q(^ZLT^`fGK031C2S8irIxazAo^P88nkt~IMWtZ?AAV}8kKBi8im8fPxC
z%9YICf%`r-HDx@l=|`e5lW?ytOQ~yShu#Z18<hv^I4{}1ZMK)oC#fX)jn+5bf>*|s
zi;GHp%wyqc@EdV=s4cJC`Jj6EsoQ+AeRE1QV~J!>**;}dbXPNxD>u+vp620@bx`?U
zIc!Yxk<{NjR%`4uMlk~dr%qX_N&nVM&a0Ku>UQh-gb4TYGb&x7DqQ%L>&_IH;cL)@
z8<FwQek}j=?uC1E84JbH7HhY@`s~`2^mrycV@TOFa7>q52^m6VpL4vTp2YTTW;jIp
zbsbSs2<#s}NzH`>8Sw8-dR!aBOL)P8im=~`PKl`|e-lM^`jP6)q(f4!^Pu+TlepS+
zG+xFQ1)P$@?$N#)&l$be+WoqwZAXaBR7Fs-IHGX>sA=q{uMe?TeQCUF|46yU$;zHN
z?Su%$^K*Wid7?q>Ifa+TUnR5SNSRlOVqnUjqsx-c7wyMydH=M*W3Jzbn~Lev(r{TF
zDzZ`Sr=b0qlP=m)`7h~&N*K`$=084Um9%c4!6KS9!~v3B3({P!%1*BmN7C)pD{$ce
z6qs>BwmVlcEvFZ9_V42+W6^$pQCSwFZ4H8qH~*SG({j#XC;oj<$&<k8OvvmNxN}GA
z#wO_)fgIzmM2{^k2E$GguOzD!eHW)1AdF^MJq^IU%{Fg5avKI}HOM__mLKwrKZ>%Q
zYInQl$^{~RwcRM2mwz-xM)>rO+Kc1gB{t)o;rym<S2#Ds*oMO>YJ0I#bwd}ZDxJ0S
zW$wdWbk)&Xvh!t4TBkt@;NAm7?&V|JHsR|lAitBOZZ(KMz7taA;@ppeWWu~{P;KrL
z#mxVQr?(7?>U+b5f8r>E<N(q!LrRHs_W(n83kXPeskDSJGz<+QH6Yz0jUduUcO#wB
zjqm3Fo^w9Jb-~PDYp-WLamTQYH+5FG#}DEsy11$lZ9XgC_saQH+A6D^6ueus0Gk<p
z=j#Ha@-G4_@%;7X6~@cB&L3m|0Oo>Yx1YpG+R)pjBkH9QBd_GEP#OJEBc6I-3>1#V
zd-qyLpd)YF)0al6G`Pr#KJcYXf+a>mK~_r$V|2RfDKi#AZZgDCE)spubo-#@&;+QF
zNmM86&Z}UH?%S&MtBReCf5T*0r{wn=BT05>?5+oV1uTfRSO);ZCZ@9?3yWRaJ`tS2
z<Bm#|Ng@`1rig#Pr~cUzrt{v-GqRJ*{oBeT#z2W{`_yk#w<85s*l!->rS_Ns-e!%O
z-;UPIdq4=r>JT6Y)c`^qbzzgPy)4h*OJ0CfK;6+tH0vvtmxT2_i@cv%&XqlZ9?kPW
z%Z@1Dl(ga-z6R9=n*^Pmm2V(T5}T%s-_;_U?3Bs8SZ_3)zE_PKOLG5BHe@2(26RRq
zWM^XiPxEHv#gF6n9_($+BgfbP_Qg!*fydXqP9n8|7{KK{Fw(E)!~dX=Q_UNE^7qS+
z@IOeNN9Z-1^RDX+4?dfJ5NUZ(aXWkgFI9Qa{&BAPbkntPa#iEGR4|QYO)!`bU?LsV
zvky+R$A0bn5k7i<klXhD&&hPIi)Ap5lggWF49BmV>_l(*ujbeLi;muNJ?fR5t2~Gn
z<veH2>3Puj+G`l{b0W#<Aq8X`0On^gW(S>ygXcA%vt+~1{YO!+aa8i`zCs^BJTKHd
zpj?f}EgA<OG=`*7j8^zLU3wW)27=lFu6?A)Goq&HPA7gD;dI}Gw3I2zi#S771E7gz
z^zi+BWBS6DhS(z2(TBQjei(P)S_5#~_IO$&K#X0^&zfzr`(JX?@Wn*$LXPIUF(Ywr
z<Q<^E#nuGK>V0#>9<wYteQ=xe0h|Lxrq9G4Xd_n6zJDmc>+C-8)oTC&mP6yZ;@@h@
zZx-?R+y4SCYQjwqL`wbmMu9!^_D_t#j&e_c=Khnw#^L(j%|fQ#1m1?<&nEtA0xcjU
z%RsF6c%D^O@1G@!yD^DO;|JN6mR3i(G+r#_9dVzl<_Vv%cixX5DSZ9B&#6lq#Y=rX
zg9Z%&_S68RcqYry&2C@lK~pKSW%&tbj}ss%HT9+GfXM&0J3#d2L37RVyBJ`9T1|Ty
zNNZ!9_#iejU~*QhX9@Vkuay3V5E=udEeF3AaTCE%?sJX9mlgjLCz$~ZUjOwewF25F
zpKoej_E+wF+gU8_mr}S5@Qt*)-zz>g0f6KMAd*iMDjhgEJq3op{j>;?8oO2eZ(I|d
zKuix}g907H9!*OxUN0oBGd&zdO52TmCjhw~Q8Jg1(F&Quy?b4t{ji5MVdAGb{mtl+
z22S4<Y@Y_Fyo@0smPr+_2cfV1uYitSQ;pHm^<>%08&pa&gU{b!+UEUl%$XjVBy;U1
zfJB9p=U}-hZz%UW<4y7M3pBGB3m`W$#>z=OR%h-psvpDTJh@&8jGB`tbyP7x(%F%3
z81U7L;Z3(F#YhOkc)7IZcY2w<z9`KRQvNMqxpu;@I_3e`Qe{WVrHuRC6DB$|H*Ass
zgO80mTZ)L&s(QUBUw{OkLB%D!8bYL+TE;`z^y4$2TT)|eMnXO0yTMcQAl{|&vMt$t
zm2>G*ss2MV05u*2d@+6Y>i<1>{mo=?JQMa`llMoDF88YWyPZEo4ErLj1(h!Ku0~;;
zl})bk6F}R~)>0$1{;2{{&4;{20MwbFWxe^~kCXB`9Jr1c)Jdb*uY-yXPEyt1%0%zf
zC6FS|@E@pG{zhUJj^8hR`D!iFe!N}WLNQI*RFjMjI~2U80f3u^-<<+0m}DgMSnT!?
z?hkj*IU(2G8Xyy<`mSki!Ku69AD|T`N*P3(r@lcgW0(co`KuRjrZ*Qsktc5+2nhO}
z*UUUT;oj}Z(9r#M*1heVA^U?|k~Ox+7r^$l#yKC*l;msbdXT5t1H>7PEhhwocq{d@
zxe}}Di13y}ibv$VssJ#5tffGGJML5UzIU=cHz(W`b@V^~=k1y894I@6A{Db4FH<30
zKff?uD%-p+!WR9%NPkD1?+t^F)%vuqTk>Puf{`7l+qBB!^48B3a`*tEbQ({_iyjc~
zZ2O<{UB7wYsH}V`^YIFPfM8R{^YY5S&14FMT1v(5E|H|IKw$1S|DK^jlXW#<Z6y-j
zxs{^xKxw9ypVTW#Z`Eu*U%KC4`u$nJD=H@{<J8vs5bzQ9KL@DSz%je+F05;MIb}8z
z&+u!?=a18D@MrmtlC&6+tqz97QR8JNlNTx4e)>4u>0HkQS|ua5CtW*#I#1Xvz>*9M
z04gKjtQY$u1&y-ao;bla75&wd$&G94c>~GO;D802`1*C>Fio)yuGn5xiM!KJe9^y9
z<{Z%l_?KA#(Vzwp45KZ<fvy@Jxp)n5b)RP63@XWy)BP8|AMxN1Inz;Oo#prL{L4kN
zwbO?hhd{uSQNZ#+i!Zi$FZO;bw)#PAC8_COCvOG-HQKhov7E2fSh%YQxZ54VCAPp`
z=FfYzbI|ZRi?lxgNN#5kMYqwd3PR1U0vc|3fOF%s0c@cSzc0lZ(lrM|=8^0#>zDu4
zyr)r5AfA}3zz<=(>jkr;KnzO6Ajofsy1t|?H@gt>t*)D4tITf&Hv*uvU`K@hGdM=Y
zv(pJRyzK@1Zch20X==HbG<g?TH~L&Jx-SMFc<AYg8T06z&HH5CRC^uwG7kdF|D{w6
z5X31m@LEqwsO>4nYFGO6H!pFUXQZKJ5tBryt+wnQ_6k7T*X@smzxf&`oyno(&HztF
zFMPczbF&H|WO%arWj&`dc86Ig=v@!+8q-p(fAKj;OJp}T4`+0vy<YUc*@wSXIIbOI
z+U9Sm#^!7ucBt-UYOxcyO#vsCRJ6`213`vk=n5co;3O%liXL?)SLuC%=G1s@S*V$s
zq(l!1@0NxpaxWpPz3;C!n;cqh=QyM~I<TdJezEWQ4|%S<{AX;3PaG&EpG{=WpyKjv
z&jxb3Je}cpwVv|SJaB1I9scr=WJtKEJx?yWFX}Ic{mV*xhU3w1{hV#{r7h>PmYxb(
zMj9YwN2XKb_N+H%k(_6SR8^{260JclT)}g(6?n?#NLE7nclt1u2!OL8uka}aNCEQL
zK!D^EOATN<9O1Z(Yw#xUNu!-<V;VT#GTESD_}$FMzhQNt=>fi}*H#5-R(-x*K8GzS
z@N#ADUIr@w>1GA2Lx1zvCDQSn*t;%<T`$!8V4um??t>w`B~3G@5(+<Fo%&djy0MJH
z!%>Yx;Qcl_bup0iNIfcnh0PG{pb7GjxLkgGGPiVJXR!&v?2*A28&}`A$PH&I9>=Et
zMJAtNf9kp9cTuJ-@r`Gl9;#DW)zZ-*;4oczEMnk-Z{AC2B=u_d+Fy7UJ2aPnkVR^A
zrKHa=YQd^My$4432r?>9<HaB}Olk)B{FcrDzSMK+=4NjL*=Gsj7XXDx5vSxMEk-05
zf+Xp*$2`Im!p;VC%;uxWxahDp8I2=GzfpO>F#GsD+z~m_N*lmlbHhCdTDX1@Yn|rX
z=Mbj@M6iB`ma|E}ZJ8N-SnV9MjDrs5HItW3(hHWGvh9=Sf$fr#bawg0+y;2X)#s-7
zMjzIb^iXHtjEs3_e}%h`$dWQ9JuNZE7ybLuz^*9Yu(9UBmpvF?ygu)x|H+`h2H?)j
zpJL)&<n{0FVz4Kke$Xt8S~snQ0>ZQBeAuu_!*jD&laUuA+{fOuS4|!L%~l|A!o+8v
z5$^^#8|CP{s-E|Wvi^VBzkB?huCnlK(^<ntIw)bG5ZQZ{T-S8xv(-wSkQfD4q_b=a
zi|psf*9QMyB{9}_;=_u#k&T3(xK0Lmm|^LF-@F`QJEYnPOz58&E4YgM?Ckf7vu!1m
zk}#-K0L<X=7RqRqW|<s<__OfWIy}Newoh_L<38F8M(+JeWUo?&j&NQH8^VBI_^6=8
zUK=;}O9XAKnIxm2wU`j21tB8_nUC65pPT6*NnDrYHFA&cMOrt~?FqZ(o-4cc>^JU~
zZ(RO~JuLJAY{Wm$IDTI+JXo;Ga2x>a-Dfm>WEbJuFGsp;mKx;Xo`#A(@qUHwMmaTD
z78Ic9@1loHk?gLSss=Y)l<OIrSKf|fj~Eej*7C_I9nJ-Nk1=vSCl;TT>46?yQ~KA7
zQ^3HX*K%tZiktI)IA5I`7B7En@E}NbGv4<L$-ZADZ9|aVu~jrztH@)x{i<l85qk3U
zfcrRGVAdutrt3+d^crqP2%yx+d}ZW&^lR?a(43N1!r%AU#N5-Kpik_$hu(dALvk&L
z7K6N-4re8pZ1+KPD%r^`lel%@bI5>wZa%dkvP&};9bO~dT?!@+lhhSfanO-uajdL2
z*>Dd3c?!5Ej}5Q})kc|M#xRchKpbViu3hg|;2$K^hw%{&ey@&3Ji9(fPedttY#9OE
zXm}`Lnc@T-WXqiTCMek_Tn@(iO8`uWL~-wLWH_tlI{DE9l&lVIEfpQB3)X)QEW$~u
z_~}m-7D*vW$-^(nt_vW|UJ$i56D$;t=m~msSbL@uN&TBK<+VBgyvxtJk+Wj>HV`CJ
zLL?gB0qU@+9N*`w*O$bvx9Edk_5nW-@+Dk#xD%Yq2pB}Mz%bjTcb>!(uy;{S?yV$y
zfRFu*OFC95_*_=#5RCjX=%-TRo+Y$0>Ce$}8eOY$6Itsirs8@D_~Msur0w#<#}#mo
zoz%q6w0Qt+hyro^5zY;!Da9c&F>epIAK#NOkPawKF9hQakho^qI|O^UCSL?qDZ;~F
z&Ia>hd$31H9)Z!2O3~=0FX~BU#j=t0TX9SnUar5M>!E`}bt!M2v&cS{4lGpE)RDGd
zTX?@@?Cp&i-@Di5E5o@yrY6A$*OR;q>H1i+@o-hR=>r*~u+;ptM0xF~;xDz%9+I)o
z9&oK1{PYj2;q*rKi<UX8fx+s(3XtE)l@e`sR3J8CbLv41A5~1u(R&^g0b~|kQMUbe
z44$7fTU%arh*kjMJh|Ws)r8H2siXg1n(HKejd2Cq-yw2$Iu<#oz%F5yL4_=NllVjA
zPZZBU+4seC2P!BQG>O5}xet@ptp_Hvu<>f>@~^U|g+pU1n=JXZP!#saESW`K)x2B(
zI+MJ^9Q;MT4o?usJTEEfV(_Wwe7!3cm-LK{t*Y3?yt@^ksvSc<DQQY%*vujH`RqzE
zI4@WWL>y=Xsz%0_D)Q<?k{n_FzQ&V^vnpLL{b!Kw0+)D5u!yI)XFmWOZzY9)AwRqT
zMe;#_`#tO`9Q|vvyy<`<LfXs}G(*6RO!l`$UHJh;D(ApXNd=9^X_ekj+d?LK0>XA?
z^Up|@Z-)d(n9^v^pW=UVx>v?*QgqFk&5`ePa^>c89IAeiYyd=GdvfZ?Z?qa3oohH=
z^RxlCt~JBb*tlZSia&c#e4AgvvpUlgiIfa1=rkL-G(+_NGZ-MJ6{_=3f4WaZUUhdq
zM~QfhME)W%DqXL7&?+?AVh4>z3oK3prx5GslQ{Z|MV|`6#A$Ii9!f~Fgv03cR`cHu
zY-%EsTKUsa;H+g~anUe#5gfmW?~)-~0mLk94w_F?tHkKu(0Fgbj%t96IM%A9tgde_
ztmWetK`y+9^ukjvo#vV)kc?2HmUYh{OTLY)MIE9JdWsiV>!*t(D{`wv{tMFoz5y_G
z$1g&)LYDfc3zd3&=q$s@j-q=si-Qq9@pMf1zY77v;#Bid7iDkjCth*{IeamaI|Z7f
zqUqp*r{Ovx(8ZchPCSuP`filY=L6|p(fC`CG==kLHNmfi-{|9H6VYb6zj*AZ7pn*v
zej8DM%scVqSr7Zo8t0IYZ8eFKI&$|ZFcsaL9j|}RIk!1lrJ9A7w-K=?e$X&iW|g8E
zl?}HYGWp~4NxG3qv_B#`2=i`bI94O)$pT*Qw9E9+G#FwA;m!)_o9k9@GWEZ*%%vI%
zje2eK#eG6YFPxI6Ejrc3wAn0#NQr{?jVyu-_Wj-cN75xVg=QUA>?s8no8vGT65T`3
zh*(QQJ?B83d3`y|G${eUzz7bML<B>58-<<z3M_bxF%(O4f9Ca#nM9Zb7ecG0=@jrJ
zZe41Co!fNeKJh+xgF}^u2AYLdF*Ml-4&X-0!lGKzsauBNM1dt=9tG2@2lY@zxCe?~
zl`90e*@ayvhXrzVMV^0+{#bIJ5&|atTu5MUw+;4?PPQp_zc<JS_5SNtrjy^x{d{iV
z-*e-RnU63~f8DPrzlF92JLcebU~Eu?8TAbtJXXr6BlgClO9k8sTt20iE(IL)6DhKK
zxe!P--5&RgknaqOF@JWER!UbzG><VC5y&pQ3C6|o>V-#yc<2LVlY&eiW;_l&GbF@e
zn}GGOo622z?t}Ra88!%xrJ>+YkrWW1yH4kG$68WK=ly9oAuv0xnCz3LDS@%)&xCM<
zfyan!uq~g>6hX^Rt1i;OKMRrw1<Lk<qLS|4xaIv3B)!+7Lp7&{vXYQOW(=>+gI!{d
z)wbxkR?-{N9-qkzC-${BFvFy?%B>fuTuoPUiCwlE(7B2rV_{TEKK&%Fu1-BECM<@D
zE+(T~g^;j)Q%t9bRqOeNiY~1%OmPR0QC}G@8KP8+Lp#ntGJdua+)(Z<#0Y+B7v0n)
zHOrYXx!>6#6U5vpa135}#+xJawIw!~i-l!OnP*U43;mM{uqkkK?6xaHfUtsWtCBRO
zrwJC8bvv+RNiRX+X9po5kNp!U;>Tl1HG6DywD;c|in;F83w#!gm1Mc`VZ01+c%F3n
zt8)W1fx<*M(rMLT^bUqkGEKC-%DQ|AM32xynkD3-6ey<B${tMzsd`+*4Ri^ZbYi8w
zhPdLnxIW<R@EMHI8D!zJz7&25tWunUHdcz%^p9AlgiWshhXoMg7TUYgeM@|3`!zEe
z{r9?yf-0f<XS%S%^edj3w+WW2YMQi&v;9u~CEt?}g7)~QQp<WJ+vq{zg<y+M5dm-l
zi`}yCCe{C>hN8f9pZRp5t)_p>IpulzZ72BYFuxKXhWS~KfHFj0(m6^0;n316FrB_w
z70M)F^c3tq$0!kX<WGk7&X!Jb*>(CIq3(mU-8hEc9#yP|dB)3<*op|ctNQ~?^HBe{
z2GD9zka$i<O?@oy^A1wKlf0*BS7<R-;4;O?<HUdaT?{G&{FMF%mRV4#(&)eKrG;_K
zUp~-<KRe=kEA5x8sce3e8v6Ko?7IcesW+*WEu^cD@QGm6nqIFj$Uoqx^1Q2AmY_w2
z3_fOf8uBD3wYdT)$P^fCZiR4HU|5}m9oSqlwo7p%-}6&U8aXKXOw_ix8Bz>|F<_%<
zPsmGdoJ(2*fJH{iLz(Q*zeniA3FG$QvX}2VA@U6IRkGwa^yh@&%%)@#mMDL)Ds!$+
z+0WoYBH*p4SSCmWM;X7HVKfLnF-(`!U##2l_A?azCc<=H3@B&%<OYNWNAcR<UoMsZ
zsidSLDEXS3MV#q4XDJ@HQ@vj|_wJ-{xg`g&oIAuUw(W~cF({L~yalMHQQ(vv8apsZ
z5zc07xqiap0aJbeKO4pIWh)-w4Eaz0Dw1bNMty8zu*VQ8&B9tM8^B_{i|zH$@30D^
zeuud0kmUU_-$X*`hnjq4NY$#({=psmUS%|Jg#wG`LVWaNbst9Klnnm>pqb1u6tiz(
z^LeY!p=OZ3NQ3^vBd8qI|1bQPKEj26!(^MHEByO!LxcoqFo7MH3c=!*EkhD9xm|l0
z>}zThUuS~7eIoUj3jbJSqKXB>3(`X;|MW=Y@ay*=@?CM3Oj)0eBJ}b1w!5<RZ_~ua
z7ql=fJh9W^fe6o2B8*>CUOqrILOA%Uhjt2C7xFsCfo;3Uzthv9&vDUfVt3b7m`gX8
zRoa`p%I{-hoMx!S;imAp;ofHW-)W$fA{u3gJ#qrN5guHtFUEh1-|Qs}q@^D5v>{`p
z|I)?>fE>bO%$(JPh0u|F3ZSeZ<UvyI$Vrq|#T{OtD7+VXk4MTyN+BPIlRk;CAq)}S
zzBy=eKF*tpj;57ZRG-viol+8HRNO3w*zM4m-&q?u#;9lgogH(Ys+)5Xao;KWT7W0h
z_c*z@vp$+Y_D4<ztq%edfw9D}qHT*qC^gD*!2arXdEwohqt(~UcXqaN5v}3-c}uj5
z922h@xU??}uRmV5YNG^s<tl%PppuG-BI9|dwn*2~?76TxS4)A~%XA~}+qg3moP+dj
z{NVlO&PY#Kg-ID$8}g`is#H&fU7}$%f{kh7?Q4sQjxYH(^p3x#Mx+555Dc>gOIjkz
zctzUqWrn`a=kb=#l5ZbjS{2tWt9#(L&SPq<>V7L4JroN2a2+axl_|2A@K$@XUEvqa
zGxMN3>tvpnyDb~f8CC6L#j7NYUD}`2(1@H0`82A+Ei7`aW<=uoiQR5TmVkCR&&bkG
z7u|T1HA-O)uk(bFVCTzERq*hL`Z~2lIS9k3)q*!~4gay{ZOm0<619v#gZ0|?V~ZEm
zZU*e(=)XxbM~G}@LT-s?KiXoD#1dcm3rvJ<1?%jh9C_H*#$sJ05lNqwr_kfn;gu{A
z&_{ZTT}R?D#O;0d;~F6Sct0-hLqzkGAHK#L)^{))O+>>j!OgsJ)86cUCb1`G6S{`G
z^l3|4$*Pa#tAkf9!@VN;POYeYZVvLpv;4by{4JfNk7Tzy-qE#^O_5b722a15S!dbo
z=YhQDA&1R(i}>75aBKp;WeSW;%~=%z!Aht2&>e~!>RhqcyS!qdNQH9~aV*rN^||f5
zMOk5?65dc8Wi9Eb;GcBYKV6I(*#23tc$dO@d(q#-0ZT{in6wP>-1jEX=;fe~x$|5}
z|E5h9yPD{pT!m6PZEYmsFZBC!YQuU1{`>vQNi*2gdz+DG&Y>p!t}9-S!Z&Kh9TTDo
z@2Kx;jcX}01Ts`cMc(waruV~SK@5&WFRPjs;=nB#t5=4h``!JV<Y7u6B@0}b@+b=+
zpe}esqHj(jcgpCS0J2!HM~q0iu)k#&Z{6`x-YP&Rb2vihH?L{Pi=0QhLgM68U8kzS
zgs=Wc2FzHdJn260!3LY2{$xv=dngoa>FZ+WJFW1{K_lgHN6PSkqf!q$O%KdYaz^~{
zb2EtA&kk|Y4%}E|BHO<6aH;@1P!7C`dS%8!5pi_gX`(^ds>)S*;%D1dW4q>=*<I?V
z2TK1aX~;SV-Hz;RR+3<pnx*WlP4L9EXZ+1N<x3qF_g%ci@kVHu8MCDHRNJS8K{fIu
zpVEEIsvQ*(4*z&r*GZ^^vmLggPS^Kq_Mrx}alYJmRzc+T*x$d8@S^g7-e#xGC|45+
zuZq{)vzXxtjT$F9ce#!4cw8hyAv7WcH@_T_%z0ORUUlAD%k!U)&G`mgQ49KLGn@7b
zE&_Bd>t^gq%4Qqg0I^Rd3eT##S-X<7-SVcFu&Uv7_h>KQ?fjH3{`@J=KuH{p+!2N)
zjCpWS=TfRssKfBda!swor3M`ZRHYsj%x~#h<exxd5LfV7)oZ%+uPSR(Y1{qv>Xb=w
zVnPzD1*J4>KK30J$25DKe@tcPxO<z2;Ke748Zcp}wE@AHG_8TUzJ93!I@VM(&EDb$
z@r{&DaCC3lB34CO*}_v!+VN`b$<!4KTO2w1t=RQeo&p&Q4K=?wazyHF<#w8R1WVAu
z8P>~k>ly}fN}>+&e7F8O-)ilb8MqX88vU`7i^Sbh(BhLk^!pbWXOd7Q3v~FB=|2K$
zpVYydu6!E>I=Xq{`P3FtOv}z)$or&GhQ9}YM8ikaq+~i`9iR(M8KZT3Q)$+d(=~+8
z|4@uj9`L;G1~Y&<BWeTkQy-C;2@=7Gr8n$%T^GJ)`Ogw&M#w~NdI~RcwqDXHn5wz!
z9=+~m*|-oXGh8LN{v3}szH--z;IHz5efQf4U&B$!_*plbrwviSLNzblA5_G`6w@zn
z_US~1L0CjTNTaLQdY}d42-CDuT0j3<!CR+>^<*oc7obc6N!{zF4G<M7RCH*(p*r~5
zhs-)(G2t=qL>3N9vlKs(GwORDT_#7rzC}mK7%z?bVf|u0%EiF6a)5IsMJ`UCUyMV5
zIG(+Cx4mP{>8&(PajCdf66FT9Vn&c^KssafGkoPJu>KxT{);DcalU>K)>((T87uQ*
zGD<m;r>w__ZR0%oSUfq+U*|QwQ~7r<m9qH!;+>9Pk5uUq1+tr?6=#X=<kSt{N1ANv
zw;_1W2UWuU;vW*?34)=#Pzj{d4;Z|!#u<)(*!J%}O<fW1lM@!<zu}j*u~#50>Q4@D
za8s_|Vwz(S!nea|6lwsevyo+Wx9Gd-tc|%MRdPH^WET6KfF_ba<}7RXJo0_7zlq|f
z6yDu>QZvJiVAC(N%oeG7N-rDGsVL=$5tgr?QRp{BuEBYc5_H1~V=KlXAgiWm-oDSD
zy#tYwBpRb@i5Y>6h+}V7Oi#i}L>xW%O1<{T0^xxr(P_&RPn@%Rq+8jr+6{CO4@s47
z-8duWxh#9$6x`L~+6HEylCiFcZ%YY6?=El$pLOsUpU`8Ef(eg9j@Ul|F$Y>Lbv#>X
zaHM6Y559XfT`I1)Qh6^3!*<wi01U81Sz2&%+HYY0(vIB|1?uv*(LEG%1@Mr|oXUmY
z8o6<|4S3<*1rZIhAUw3`i`?*e%SHM>?PQ#PE;gTFw-Rs6IsVJ=Xj8MAkx7E1wl<0s
zcx1W_b%3S<<XWuIr)c^;oh)Ef1D7cUTc~wE8h<X6Z)}>PLRj=ZYv}&5b!y4bFId_;
z2G<344$j$RI)qJ4#+H^u5L~e$8B9p$t-y7Bw)+|fihxp?^G+bJ+p#v%;nF$zFx#^H
zmYdY#!1*D6NG@y1UWNAb#~arH$NH7f@UfAJXU&^md~kwF%l)JFIY-!?bn@a3z(ukI
z2-|o$+qX&D(N)O9$T~=vB!$MPc83oXc>mXWvVC3%5fi?{DpF5uFUX^gXY#krTHgpg
zcBaw~U;ayaIo&EXAo;p8#1|#LT62%yRF_M6EP+iWfs?3u@~L6851+nXHX_QtmB0uI
zg-s=V!l3sZneu!@n@h6?Tj-X3=FBkK6`S0*lXvR1ClXhlw-m?iz;I~+`Qp-n74|Zz
zbC<zcUBC}H^CN6FjOvlRDfJf@ZHbChp<nMM%S1q<weqi}%eKSH57Z}{o(CQfAn%MP
z4vQ2#sD~@uqWMC`gPfvy-%KjA^!?yG%wWv+0?sKz{Lj`E_Ue(&18c#b)x13nm`{XF
z_r3}9neGO8rYrK4D}b<y1mVjCI)fr>aWW{ss>A=V&U7L`$J7$A*ZKVI>Bw>RcAaxt
zOe&X;<l?>`n4+AUkL1QZ;cn`Dp}ZTlErPb^g6*XWH-^7So4?lDy!mXVE&h!B=1rwK
z8)+bK?%Uy^KJnwn@^R=-HUbTX7&Q7Mk0p+whkEhzs(6<#{JPWC;<B-&4DpHts&ci@
z4Mb_{#Olg#A@Wpa2%V$opQ0Hb)or=UQ&^{`-!UAqEosroRxBH28&ldo!n-e^?|QtF
zfQyRL)uo*gg_=yV$xKJd4%rokXqFu`?!FAxD+EV!OzleS0<%n4)Kex$Nid{<J+moX
zvucuVaq_!k2-s1)1|#-7APoLBayPmT4KyTPVkpL}W#A)KW>E#_s(5*VV!{&z+0=NO
zamT1^F*J@3^%Mb_{P1gT#M4}TnLDAYwFIMDWgB{?Z`u)xgAsLWA5|%yk|`wlqqX0(
zsf0s&FD4BuLSMs-d#A!qJU8n5mm(DorFXE4*yF5hL3qDKd6$n9Pm?<Tiu_syP#r2@
zA$#<}<y+Fj=)_xIYnBU0{vW;fZZ{L-FFvU*sZ<<d-yH~qgJ@p_bu!D(bG}9tew#`t
zwcB_97!n>+M6!CGDf4oGOQ_h3)GG^G#n{rb6hajT=VqKIp#QWVHjm`<kF)3{{;r~M
zIevEw*c#ek^(Tf*GbA~OxG&sSZo>`}YY95#4dO<nr5$N+adFI2(2Hx^^pRpiibl2O
zuBuJHBZ^^yO<ffBwZ?rn40K?=Gdgx9s;PjdejRhk$zzdVt~Tb$(q(<f-2}ff;V;H*
z*}!NNgRl&7;5|KoQk2cF;K}#&I@`+;PKupc>mV)-1tPraU~NLv=GTO--^2!4OIWdP
zySG{uSoj4=q(>@a#m@Cnvkoe(VeH$(SGKgWgbe(ClpG}IgymA8OGdjpZ7Y`N-HwlO
zMT^_e`|U95X#s_s{902Cy4(@(lP9*eoQLB0(fkt64btZfwhsiY!#@fE(+{42PU-89
zX#zV>#H66aDx--@w}CK(wAZd}&_}T}4L;k%N_i5R9<0=3ps_SYkdBG1{Pqc^KRgLx
zy|O&2f<&vu)i+cVfZ`qlGPz-78$GLP3{R!2TZgUK$47HB-*IMy*;lmi^Z51XAX7j(
z8azPmnIA?{klW#~En=(n!2_yTsXnt&DqigssNZnCQ=%nCUZt-zRn<du7MwGTSJ%q5
zgz!O~nv8x?5$A81bSMSueapW_l%YB|=If9Y?LRj*rV7NckGL{)^A=;Atl*a>;XNC6
zDGauTsn*VMle*6zkq7b)!ERXevT_v`JEyNYW^rIp*{2@TD9MPpXY}7NHPk>%_G!*=
z>Py?7=M)Dr#Jq_G>^i3jwGy}1it8TXqqw%dmp(kBb>Y6QxSAcbw5sCYSCw(3r29N{
zJF7E<+48xSu01u;f5T|~r#E^mDk3+qgz0Rb!_w#pUL+`UE9@mwHmVY4Fg|;-No+xn
z!LvV82Rj#@bNp^>)EAY=<POXL8E@v}j>?fH7b@Pgd)70%B|6xF1RC61ZT}w2#9dO_
z^N)P+TZsO)U}K;9E}+d<Tq?*=5YuhxO{~GN1`V?SVGz?`39U#3si`lQ79zP0Y9aWF
z-j349;K?tyV*!{B6M!PAQgVO~uWo%RWWLIAa2w6FGrvZ3w}-3iNe9ljbW*I8H-87<
zGBDW8?UQK^ov5#o9!YVI-zIc>&3B$5uclA|Q4t~;&wEJzdA%d*<3k6-;xHe9v1GuU
zClS1yxhbP?%Y5-_gFo*-sX$I-%RYb@P@Q6_EJ8Y>8?(7|Ki;#~T4PW%%CByMgHou@
z8R;of9=}*!vh8a8#BXTG*v=d8ey(a8W`?(ec1DE$Y(HG;W2<l|JWa6zjC<_n9;J8@
zjehw%rX`p-7HVI5aqs_w*&Zk*Chvj%w4lL-%!Xc3JnJ|D^8?*SmWsXftS>5S{Fa`K
z5N#h?4U*3)qvzrVJnF@U+rJ^2#z;-37EJS>zwyswOG^Yf6+Yn&suYb{7={oSilV8m
zy2N9VC}7I<m9d&p<$}|G@7n7S64!Gg6~jah4q`~A(ipNhYy&K`eS_WtO{0AUoy^*i
ze5~XVT1~s@ddc5~o?-vd@FFRZu)nAMjS%5<#xB`Z)=0*cQaE3gf_cKxunaz<j@4+C
zN|0LAI{~_urX?`qxzo7&!r}MqA^T?-wi@(QPWh(-p1hx|F?VUh26%&l_@3cD1MPB+
z(!12TfMws)+h<}Y+2?bxV8C}c(mk4eriMX*I7fLL(lHztCRC>A)q1Awydxn@dsBCK
zM5UPoY`=fL&&mLV74ifxT9(8B?rXcp2PM2M#2wX{@1dsiA~5q*8EvSs{q;KRPwdua
zZNrZnx#c>epADPXmjybIT<15tdXhM2!90nkIbr5cw7Xn!2cW*_7bdxZP-~YLiozcR
zw}hif4oM_hBq$%Tumv|g<w{4NgA|kz=TM&4Gh+?6Y`#IDM?BNF{*)ScFRJorF)V&}
z#h%8NqHOODtckubW6eD0b7E2cURd`n^BxwLANUH1?e=eYf|&)Q`aHI2W`cPAx3$6+
zbmhAZHMoib0uCX16!#9<DI`Vg6b0e?Ar5vHPja~jG}D2kzOCDU?jK6-6Pz{9NxHF&
zmip(NW_p>L_9ev|TM*XBvp8POKAg*8lB_{MM&8jS{$NF}7O}dw<q_AnM*)!r#shxG
zYTya2vR8xk$+he~+`RYD03;66PU;<t&f7~)1~o$up6!m-f+b*%mRV9LZ73l_MrYN3
z!?2#F*#n<Igb?V<+|Il(XKAF2H;|xXwxZePyOV*Q&hgt=?;8>^+&`W5jy6853bnX6
zR3h?5sl`E0({(0GZ%yp+y`%q|g|Qb`F`k=W%a|Ou-W<$nnh0))w-ls<-^MePlUAkE
z51X)HOu(VqVtsO<=%nTH4P@~Y8{RG4qH{~U^Ylv#QP^SWxCIMGK2eV99J8i2a#^`V
zyW<;r<{FYmwIif2<2G$iki>lagYWbJNs#0K47e3?@k)<yMm@`bg!zj~G2xo@mJ9w&
zr#PH@)Wa*F9Afrol`}uH9s;uo$PU*Hnb9WWl|XvKCVD0?6X_7_Qrev-Cj`#-CtD)=
z5bC{I02z|7HPq1MKJIGJNx2K8E9)OIe|hN`1;Kmc7gcU^jdX+1HgL?ZVj&JkBm}70
zZ6=fo#+QT%JyS8rb6rYcZiPTe86Yutqql8}V?Ltj?|fvobT#hz`K94I<e(?shS~+;
zT;b3a0!3R>kmLy&!{gN~Tj|erEwK{!^=0XJS+0Tw<}}+xFsf0_&=-6@073=dbMC!w
z=%@?g3dRS{bB5d0-$a$7nh7}JtMwP^Ufs{5lUVPw*DXpQiy!_(hu6t-;=I0kNwh7!
z7LJzca1Wo=2j+Cowxtm^5@oI%oNvc~p8MRpG=`2{xn<%}TSs7Aa%n(yE)`rUP{Js+
z8g^l`h8WuLhvD9=ZxGZo{E}9IRD#uY!f^>!N@44MJ{)G{11zE<$nrDkTiqq)Zv+T>
z1>Oj>;i00BmhnRY;eBA5r%}Jqrse^40YM{Jb@H&p9v#zi9RWZAeje|ij*m6mN*vAq
zQA@dh2Ez|E#hAK2i2Hm~PctdPRs3V66L)gh&kEbqvzBW5RQ$30N`pz0RUqFJVzEvQ
zeX4fW&Yttm)Fq|$a{c`XKTP@;MG_1W+zquVWRNLVs)#y#0Rja9(_o2!ISXh5(h4<-
zJU&+#g(=tvBOx0y>6h$9M%r4nav||I|4z;|Dh@d@U?}u~y*vjN;f3X2jLWgw^m~8U
zv8W?34{(Cix%Ibh*veNpKE+t8%FT`ails(IN!H<xRbn{?VO5#N@kkfx<Ao_ak8(I@
z9**yZWv?NqR9l*#?c<IjkjavVkK>hD+p0DO0Z*s!O18y!oxR}^QO}1?Z`dIW0>Hpt
z9rSFQJMJ-Iyp%Qu9=H}dTHG4qB0aA&3c)hQ$C6nZWj%txI>eVyIGVLspg5JJKmsUt
z4pPxw+aDJ6#O=wS7LLjv7#u%cf7#|MvM1iH-*kH}zzOpQRWg!0d}IYD1*nv4{(e$?
z88+*`|7{AzRYPh))oe7H;cKS*&wqnU2j+^jidSzoEw4F&P*aV58s7<)v<c9)Hb%`t
z7&PsoQS=k{gjZ$(w=RS`MFl{n9U(~rl8&sDi(iw{sBomfK!pue>@e?_C4MN~RE`Iu
z=A4?!$}q6P0^qv}urxquJfSdw4)FwUXv+mnJ_J+zFFFc~`SYu1Hkz`6X!k8tRPg!D
zVOy3>NSvD{Ivy`-5pzP1K_+7jz<nvOuH(>(i(c9wxK5`K0(764yP%W9cEP9Q0E$Dp
zxWQJUyqxx54A}04W;ih$9ltaM?VHQxV0<QT*jCNuL?Sc{s+?+pmOBv&!IG}lX6y+6
zwv`*1u8}8~X7)4m7mo851>7f`DU@A-C~t70&}3c!G)m{Ut1nxp3kpv(%v(b&ZTNTN
z(-EUkrsY*V`Cd4}{;7+Fz#=@3mW*Qd&3-i8&VnIC5XX9V)ftAO7y7TPJ)clYntV=`
zPFk4!CFZ5)l(C*N8_d!sFPdL*)GvaGOL*Jf_%QG_0C;+jIvy6JUI2pc+5^bR(+1v{
z%_j2}3qk|lqqFH3RTv9CdE&Mkk#*iA0LDRCy)FY2rf7sng}$oNy?Fg~QF+3yc4sqo
zlub{xgra5K<Y8-!35K6hdU|Qi4pkE%IYvs>5C+bL?}w7WD7yZZuG7ztZ<F*)(%?!2
zMIC0JWtw!E!=);2><@r;3Y?8Gc0*-AYlJu_u_B$0K>Ce30yRs(0E3X3#67$OxhVIh
zHI!1F2ceGhF=B%F5ilm&0tJ!{ePG_-5sQ#qfT+8n51cET?v+4TVTNs;cJI|jno45+
z+ZKiI)ia`+wt0#;wYB<Kc@_qFP8ZuH-w+^Bh6D_@k~~Q}sdKz!rh)Yu*~>529Z}yA
z^<UEIvVnEZgf1sAG?I7(<u39dz=$>EY0Gd=yp;EDnFY-5KQs5KL@4?EMJ7Kt=wlH)
z#Km6slVlNf@sr~)`t({VK3OP9U}FM95;Fq*Ytc`r5f+nKyl%oQ`=2dKh1*iNxG{tb
z1iCG@D#fykkR*Ee{^8Coegj~~#&qJ>u2aVE$yvJ!`G_8c3^+Heleh!Xn7O0b$-q1k
z+|hs>c-eZn8lHXV*3Xer((J(#KcVpz7MFadadzUfj)f8qE-}qGB*@;leS}FhjDba0
z_9D1MJ>@s?!?F=SItzIp@p(}I0e&IGjc~5HsUnbC-BEk}e!<fU$YFO|%^c031linp
z=8f8SX87X$>@_dQxe1)>yn_)X2<`P}AxTV#(OmgLlRU<);tN^u*%mO+<8~k5lAr4a
zAkWqvnk6h45g+M>SupM~N_S<xB6j>dWuZ(9%DtV9tqZqzCIG^1;4!}FD~~^bIf>&t
zAgRB_-KGZqBpd7zC^>7;+DDNKe2DZ2edE=>c|r1rT|@ga%ksvZ;$)grBZHRCmEFAO
zk7Tpu!8SS>G>=LKCovuc+?Q-1I82PxL>hhz@20Yq*PCyVQ#v<SH2|na?`vhp*8s?Z
z_z_m1@J_sTG8BtTRWWA#qF??u{Z({%3ovssJTGgl(Do?@Yd?J?5rOXe65jW*fVxNw
zsP4z_pP#?|59lBA|8HDBN9%u?1N&y6^6(!6{pr;OQ0Qt<II&nR<)XjZ=4uw^(YK#?
zA`IxBs%{$lWE1c0RoYw*R^W{MA&4U}WgMpzz@^}OvlP!T_+F*Nc07@cVh?DnIlP;-
z{hHjK1xRvR^>ci8V1f$U0Xqf=S_;QABr!ZJhmKw|hu?-R=KoHB%?pg2PkIWdP*vnp
z9FdZs1Qb1PDku<^D-nbjZm-SkwUrZ=jvW#6@ZgH_P=;B<Z!Lz5ScELzXuBUt;y-I2
z|8K=#U~qs#NETVy@^{&^LBdi{wr_I1@v7?7pM6?ykCm#Vfv6^KldNwZ-cDW^0|)+@
z-6u)oTm-9p9AbnXSRA_pduw1ffN%QbQ8XRPlXreLS&$yZvZFT4^e^+9q}uV_4_&qm
zl+sJv?ZHK#Lq`Ci@i*dBy(tGcW-35l7V?C>cy+j#N@_K@0NaPCO;~tC82Tf28c@i#
zvjPCiJiu6fGN*sG`;k1f?DOXW8@$VY=i$Gjn)bJc9}jXUc)6N$7J+!T0r(Aj^|zG+
z!+{L|Dro=mg|OTIzk;<bhB`rXtg>;4kJ|-xr(UJh>L55sE}Fvf9xy3p&T_15f4VWQ
zr8W@(7uHjh|DjT|;?mO?q+>b4K@HuPEQ9>LP&lAb=#4jQoBiVP#UmXkU_TV@>*yIz
z5SRtp{9i1<>wvp6F|TC)V&K6dB-t@+i}-brbVbBctMaj1q5SXCw>={y10Vn<jCr5~
zn{)mc7|nM@k&i9KY>g2=!hiIB#1eO9D7~CT*mx&K((n|ih9~`{kdHDD0AQP1eR{6@
zs-pGUPIEOyD2IY!@C%W!w`?{mSlgiTn<gmy<$+FG^(?+DlsXWA27NDkogI3xQ(A<9
zAuc7I^%{kK_9dKk8SPg{i(xxL-x{ot=e&F2?=$g7P2z5R9vI|qFiTv8r`Z)3d7qIv
z>izHCeht<Zia>LN6lOcJu=hhvZh_rP3%LD5y8&eKd_XI_)$3;ed&0jvKn<N)!6&<!
zlCn^NDGi+i-mZ*<zxlrKjt2m0zE(<*xE_fe07mgfYZlj|B$yiliywa80vhVdt(<?l
zV+8;-hU&4{HIO_FM)CZ%Nd^EzoxA+)AfnTJwd=S80+-%*ixoHj_#)6E4rd3)1Yp_N
z7ebngA|uRf50M548!?hPc?eb+plula4)5m8;?>!dX$Ek)FU<qfs10EH_9|BN7<mQg
z_-6{~cd$V=S1Z33fIWhL0FE(A?!a*yM?u>&p2GTRP5AE39Ri<_Ai6|-FaM_YpQRH*
zT&9R9#d0+m7+VSaJf^@QpAm+?zuVh<dwA;~VgQPu(C(~Vk$^#5kOE^tSP2IIJFxVA
z$?vI-8zP2Gnt^vJoIlNbr%36Vb;|<t(W6|}TOs`kFk!A_VV(P7%LP-&>cgva#FkC>
zpJSm8<s*r~jUSKkmFc#$E5FSis`!w~NxIRo2<7s~J$xPz`VPP$xrn~C$E;Fuh3AOf
z@p}LkoA7v{-t`aaCs{mQ$UUiL96R9GGZbN`I%i9U@es07yKezB<Wlq5q)~pJRKx&q
zu2X#jx?-qBJYKvkwlMWOODdQpeqt3n>xv&!H&8@)B!*#>v5$>$Al@5MA)T&2HeS`*
z{L~Nmzxy#lvK_3=6@m5$>cq?s?=YvT2QY6r=+oJ6QzmnosIq0DRNk*31x|HG)eN*p
zjxGHR&~h<jO1H^v4U+1^f%iP9=C6r_*adK^V4+NMjb6{S;$d-=kSMrb+9}=1{7|W=
z3dt&9yuc6eL;asmJS0#Lg8!H#C1vCJT=2d^e4ab3@d0I8l1Og2`J-{#JTi?IaAH!p
z#t8u2Z7*;}ULe0W2BmX0{|!pBh!N@|D;IbF56GFOr_BI-rM=(S)+NAevR4&c0o4G|
zaF0){v!dY2m$k#vp)XPaIM1ZA(0Zam2HeS2uwk#dAN5a-Vg#7CkGTyW?5udZ4-vRb
zktj7Q6$yy#e&h#0x*;FcLWfILNdE)~j8th8)XqlH4U>U@xX%1=gJ)Q#7*7K10C9yY
zYrg*-T#mtlaWJ8&$AGt7@o}~H0zRJc0~Sa=*1&T@XApozI{k=}p?r%-7yEa6vQd!X
z#I3~H;^9W7tpyU6ebcmG#{&%U8b90szkLh9pzBmKetRzAiR`_3ATYkZaN!E+Nw+Vz
zeW5fUXEteIJ;)#mpgO<6ajpPA7?l;!Fc2rK%bSn;(B{8A##E*ftk<5}{^taahiSg;
z{SQYk?9~tOrEQN2qx|`tvxNJ!Sr}v63eUi+U;u9Ig%h;GR#uBjb8fvF0JwyrKWZey
zfD;w-U;H1N!UnnZ)`pGrk`uWzRoQGwkhBJarO<zhjEHoTbf>kRcmUH~9-eOUp>E31
zp5tK%7(#3ooLvA;4E&*~#9>MEVfoR~NMg!H+!8)^)!=ANk>t+{>m(gr1w;qCbQSF#
zGqo*nZ1{nRqHq77bUEUSXS)PazyWQMem%GU1f!L{*AI|fmxtB>FZBd;0O;nuO7}Tv
zNC>74khopJPiQMDNG~R>ke|~A*#&;|xNHV+sDn=_9n>T5?U^$IKh{=o+GnywL*q&A
zs`YcL3o^A?C-$)!db%i`Kcklk&f+&E^(RwL8-im$YPWaATmuY@J?yKGcnFf-b}KKS
zU#Ze6qoc%<hawJ%-o>gTv&;W%-YSD~ddyLxZ_jmU$95<6<2}R7SN?d!ARS_K5!{Id
z2`0EjlSrlm$&-rIn@oT(uJ_tiw%^47yE}?pO9Wh)-9r;0@!1WCy8d*`uK*ZuMi$V(
zu}U_uDq2Xb{kcS!4GQPtU?klK3`FmH07S5zu8EiBk&o*`9x#$8pP|5<sof?JA!!K|
z&P>qLLYFblU>&Q#i)jDbZM6VyfP_|oUBQ0>fWgz74qNAZ0qkzC^Yq`9pYHe<z*Vf+
zdXc!Ftc-=4k9+D2&L6W$S8*k?S89=k!sB=cb3lFY5Mss-Tqz8%R$!RI1Yx#|_yMJd
z`n|^!{VVxXytwcDx5yzw41;D!;emowWNGmifFD+YGr|P`{#25}3j6?*JG#t1x2K7O
zQgpJhiBBYFc!+b*^f7|D`IJDDsRcNTT73uU;-(NAng6<IAX>ftz*Us-!0B)<gpKwc
zJ`MR~4;X-%qEwtm<hGX|C{Y}PG+t}*RzqSJQU1i8&pHPmaOlmL=Ht(GVw_%cI>`Ls
zjPCPopO>*VSjo6<Bo1T;NpJJ1Fv$OS#!;xkKZAaS>RkN?5L5V%&?TEf_HsL$p73SB
zB9rUECevuDr2c$F4?)nnd;2>lb2_}Ns*7R+pi9{^f8P;fq$3!GvCw~%#zGM?W5l)s
z!&o8}(3E~5<Ism*<B4uIj16FyxOoP76OeFn?9qp8@gW&bzLyk-&g#{|B9tdsmI~1|
z{V$u|isMORYHXC0H?n@%2Z$Y(NSX{_3dl1zhFuK-Ltw=+sOU@+4RMSeWxD`O(}25g
zQq$K!fJca<-6*yr$|16;>z2ujFKN#1E|w&C{AzB~G<@+9T0hJ#fbKEbq}!<0#M8%t
zfJHlZ(dZ^nNbg%ckb^ZbuK<`c<!3blI*aZYT)D}2qrKdC6Pc%4hgW!kaa!3??+u<w
z(S3^21+ekmXrS18j3@Ym%41mS?nAS<45a7Ne>!&3-ZYkQ@v{V|0RRV>&DCN5z-4aT
z_V4a+4rH)@hXkWYP66^N`_#>F<QN1Av2(OL&HbIZL!WKvICBW5>rA%Xw1YEWiQNMZ
zGq{FNhQx15y)IY6h^&c6LT`=IHDG@qQcGK&CepLERRG3=i1<&R{)sOD<uCpB!vnN2
zYhR&4r|0+6%pCxUoIi(+RyqTrlcDhs@MuG~;_t>9fJa(5)jPZ+d881Ajw<Zc1fz;^
zTAw9o&>;#PT!GKh%klYi@d%a=<Sq198t4ej2`{TJUK@%V&otj9sgMZ2`WcGLMiuEq
z!HzA@pOL7Q$j*<-&2YWBPy}H`zRZ}<bp(K90-IR@{*^Ppt1PCl5Eq4e0esgN{&J=)
zR~(;ykcfpAd`lr=$rWZ$$-q6MtT<G*E2d}JAB3m`{a3<=usVGveGXjm%oC6Q{SA`<
z;aj6Mt6|Qg&c_dc^=hYN2*j|>*+$HpO3`TR>ms?2Emd19TG9jrF*62mz_@g`T54jR
znwihq{m(nm8=rwoNg=@o2O^ZGD*-B|8=rk21b4SHVlIKl?EzLy8-S%y@lvVkHWv=R
z#On_%(xFbBg&KJS00>9d4#_5s4oRm0@VN?lFjwf)ORkr5eO*VuRoZ0mHeMD^_66v@
zK)puU<{5(4j{7+UfWg+gl~^Ft{+9;-5F#2E;wb@4)Eqm?Q18O`sVJXr80yM?>A)O|
zdn%LUKnBp%aMM71GzDsLh4Qs`mW!_4)CxKj_V7Os*I=46wHiN#CIGNZ{lEcg2bu~q
zsY^i2YMp^P*+%=}vwX8VlYax;6_9VDP|Aari^WzPm%FnmfvunGd{hXu0MTGbs+;aL
z*Uc~{%?D_BXl}8JMwh`paNF_=sw(K2izcY>8)yvLi{N1p=IDjGXCHzTl0f>!ujm<r
zKx${e1v)o%5KSrYyK)5D82s4;&jFkaS>OW9uu}nwO6#9EE$PeSaYIvp_mgl$f4xmf
z3HSf+YpVIP2O!z=7>F<gVB>fiq5oUW(+qfaD5)4MyE<*#mVc#aRFyHRnnl^DMu$ih
z@eT7fBe3)(m1e-&rm)G@?+8}pOFTJ0S())Q0cv%}+=`Yh_pv#ghqFI=nx?UTbn_d{
zEge-7J^a1pB=6){S1XSd?}i9Zr?sjzj$#I6ahoXlE;_<qB_fx6{6hIzI=c5UOm0oR
z0(T*s-TdcyU<)K&nt*EW>!o;#ExZ7_`mkRO`INz1Juiu@FJ__{5e#?nOq(B-Fn<F~
zKprzz3*7j;6jOg8vm2P;B_|6kB(sAcOfg>iDZpJ2Qur11P${Iv-u&Y9H+ukU^0=+_
z*i<~rO|ERqB(s$R#-Q@YKNU@Zg&Tl3PtvGDYS$=Os7ewxUGdkzsh3af0SYD2_q!V)
zxiA}E|ETSKvG6{@MMB@6wPoPdPaLd2#-RS!#4)hf`k=p!8ePc^kFR(EZhM{afzR&V
z6S!1}sB30xWa<q=5QADV{z=3|_j_d3iGcs@2OfYpz_>))Ll6+0DvNq050S`oM&4h^
zZ82w{46HNxtdQO+Dc~1Uql_QDeOSPk8SrSDc+<!6MO&r~$)s$G;m1b*fNJqu?a!5{
z7M<t0@;+i%<eNZhw|Jb&Dj4c5b|B^a56B6KFUq5Qfc_gdsj-K~5bVuA(O!lGv^eJ1
zNnInyB=|N-!`DMAmp#mI8~L9--vQ7Nm%^A98%<zw;`P1{D4h2w32Iwro<MRgm!_WT
z->h_d)HXIgYp~oqlIhw-Ze+!EaZvda&yg}HwMe!0hlV_LQd-wXR?qLb2?MzPKY&2#
za8zmVR$&VJW&IC_#_a+}g1A_m_;m^lE95pIEY50KttrkV(=cA}`YpLFJ=Ln;yU(p1
zYqY+9+k0~@X96T}red&sZwc!&V+4l$2&GU=ZpvxqA=iJqj;Z5F#4oOLsEqkXebr8M
z6`{fr4ZuY<wnkoHGWP^SG0x-BmqkMgjB=i$ULSNR=?|DC&~|ic!0C{vu)si34ft7W
zR21i0MwcvQ<v~^~CQCdf{A0^{hkn{pxS<HCf6)}vGWn@_y##?U&IF%Fa-ZTBkz4{S
zNG9v7LH;@Ri!YHabZt%OB&7n7f`)TROJ=dQ^Ddhd;pqQI(|N~J`Tl=AI_cQwAY`6%
zke%%8b#M-mSw<x@GnAFR9V^Gm3gHkUBT7Pch(cC2NoHj4e%I;y`TfyfdfeyS*L|+*
zevjAl_2zWgn&;Fqoz>IgCJnk)5gyXI$OXw5?LS3agty5(_0CfBd)JvjW5)kFw84db
zBvZaIy(^_G&`s~<qnMipZZpUzHY*VBek{8prgU)z1hLu;H2Tu!sQB}Lu4FkU;Tk$v
z5}|DEa^%LsESgZ7SIE#oC5~{R$A(RM9Q)lI)3mXh5c(VVVWYWRP>qCeI0&ORVdSAj
zRQjCuAvSQ7!NqRlv{re}CkycNb8x5L^X=!C)6D+D+{o#qbyN(UkM?PHu!peCeJ!17
zE?XpTImy~%*#XWv03IdpPD!F}zx!^NRrZjo{a4fZ<fVBK*wMC+Wy(T51{T5V#r<Gu
zZ5=eh0zx8-uNwFw*zR&|)!RqGf4wBKNNNd|f5|NA%u}O>s=>i2Gfd+9O-GvN_(ttK
zTrae}rKUSJ2#4Tz?0JvOXTz1a<c_Xpk1){$5bhqp1;F#m`EC^}+-`1J&M%@tagl=(
zb8hIRjQ``@A*De@=(p|o$!3626$sW{SN<VWzx_pSV%+`nyC$GjAfI5jO8F)mGQYR{
zGB;;3_ft$~;ttKx&C(&T-7ZaCf+Bm`^<5&of3eKhX9EURN&cs*!{a;X4{xa%v6w<U
zd(g=reVLVpp#1p$UHw%N($~BILp-ypxVcJ2qIBs!_>BocN@7%EUOJTxXP=DF-Nndw
zkK*ztgZ`ersTYOi9z)0-dZ%u&Uxwayl*dU}2B4zUhIf5M7eL|TUAgM7i=6)bRZV$A
znjhER`BBqU11L<$)Bd?Q%ziJL&5zzs=F5e%4YbInqhpp9{<>=uSSS&o2t^;l5kEsj
zWPRzDl6ob+-N=}c&L`<lLP$ogTKLRG5F(F&l#h;~XrW)hYi*1%fx4b+$?ia7;>@nI
zk!*1<-TEoEM&nMFeSS0@s^cAalh>`<GY=)6{`$k#>U9&Ge+!AT%1^~uEAT2BDo3#B
z)}TL42hY+xud-+cPX2+y)l)t>Uq=qeHjO~T$w{kiO$H$38~uSnm^_jo4_j5T_Zk;2
zNR1$W$aVZ$8Ytf71jOr6!h6N4$B@!$nKggNX(Mn-?M)XIrseUJV1Q?i$nl2~u&U*X
zDqA0BP9GLKI|j{^Dwgv>_?nyVX@~_nO;YHoD~J#2DnaQCtW}p~*EVn`I>#=`zO&#w
zycE@}$ULPMgD&2s2`McL3DBu>SIb1IpCd60B^ZZsZ`5!JSc1$3%d9g<Nid3%VEnF_
z;D??26H$M_b}QAMObTUif%diuV|EJ2Rcc(Amt<yJ;`QGI{^?*!Za^Q!M}k<j!Rlac
zyL^JJ2Y<j1ftw(DGc}&#ty*XW&?&jHEytU*U+&y2)r^B0i?}@pxv4>c0<H_ml%YPX
zlaTLxHBr`JY|AxpJBz?(b$JBf_o3Zm%;zc2M7i^ozTmIMHa-=gObIs>ycnV<5MMNw
zvaPsNvJ5T82Q*zKwcL35T@k3a$Uf<CP?m2$$!2+yV_NHU8i37#-ZkKZ`99dt_^IZ=
z`90xJwJWML;}li$U0Y;o>a4z4@l8~r#Vu$~BagoB$(+2U>j)(!`RaC=uHk^}`b#u^
zC!dy(7=>z-`30-GN-pwJ-I<Gw+Ol}W55jU2KU@3jCVA5F<G;c5msPK_vzCB5f>caj
zdqHddC9(W-nD_V{^rYW)dBbU~(jC$~<=lQheBfHiI}WSrH%*U#R{&iE8eVeuQ8!Q2
z!w34@#$=6*gMoKK^#5L7dMX5LA*{%6nb?Jw!~95Hmp?H+TI}LVU=Tbalz2^eq?U;?
z#d8&oFy8aA)K5>NP=_HNrko#V#T&*&;Ft}<X=IlWRTJ5`dE0oU#cf@nA-#WDq+KXH
zRk)jVTYj0Pff{dvTun3Ze|gHea|}8rJJRwdKa!zzds+tSWi2dE9uy$wQyjftBfr?!
zzO^z_&poFj$={epA+1h<>D+nw&1nn-h1vont5Pg?%BIldvoCt8!N=fu1U_YhgkJby
z9RzqT%M7MpH})Q*3I`(G_MO(qKbNV7carHcufk~J6T6H{MXMgHd)=H@<XUJZC-3(@
zrWyR52Bm}6=66eY+)m$45>GqBI?RD`WtOQ!Vd{8&Lfkoi$tOFDzR{M3<9k7|Jz#L7
zaA-RZ#%J!y=W6O6?r3Zk63s~(#u*qmBzsyFbqybQ4TcC~kB?qE=S?#{`6CALX9Jf7
z-$kAxv{UbIHK@s7Zn7hzq(I{&X-b89eqV{gG4Tg1f+P}KnSz#^iV-q+&!6+jZkW!0
z@cLjr#ga7S#<OCFf$Nc~!dS!kyGfl4Vhx-G$A~qMv`B+A&@tz>Sy!zngggh$h-Oq;
zMEXjpUmie;$}~*j!_;?cOV&a7G9+#B)m?iaLJJ>0!{bo-;di~$=L=|ut5xlECG^1x
zyivDGos{vITTGMid8M6xHA6#%=q+N9ZD^=rO-`3DVX{3?SqJcz(Cca@l|dbt#G+8v
z<uNwd@7^$V{S+j_S7eU}6^zbQWaYiXdL>gwMl_emh#RV!lal;Z=UeNF<nY}?Dg&pq
zyI$QBcW#erL!!+gd9OMfv)q5fJJ?a!NgPbTR*Vyn=jk;c8ZK4EuxT1jID$sNuh8wk
z<@_pJQ=`F^K`DiYEm0RD0(9jXIs-%OzzHHnO&u5)UDx&5gdR&eH=hJwGIzs2>-jlf
z`w<{1)*r^brbf(Nrx<JJZ&hMnsj7|fT6&K3$IpJkx0So@Or7aPdYEHgTB@aL@F62(
zj{>@^2Csrw($>5Ci!6x@IQ_yzNq_0fZwUO>69TUmGSK+u9r2m;EW+CztsAj2l_O?O
z>U|DU(NY5ZD`btu*C478EGCOkdC^HNMp+l4ntHbKNbjr6r0Uf))ItB>Q0N2d=DhI5
zbHnGN3594mIN1qD6wOVKIEC}0HtNImVfS5I>YMNV0_lU4NaIe@A;_e%@Q}4Mj6P4k
zH7HI?5IwI(e>`xx5+kWI+7|$I;%bi+DRr&rMq!#0+Kxf@h>d5@^;@!=&D_G{Zb~+R
z+ZF+yNN(~0F|^?^jmUE@f_dtnr(;|NmLu4o*dy)st4~#;uNAs1GV6*gG9OE`nXLUP
zF<F)k;kFDI-!OXlG7BpRTpIGWe-FlqsuDJ*c&wH|U01tPr=E_Rb-)T8JO_Oo&*xlo
zUTx;_H9L#gFZr%(4)D3Jeo`xBoQ?bD5S9$90tohaCk=(EkADjd=;jU$kGLwVfLNy#
ztc^-LXU7;~INFbW4@yO{XJ^N@AM9}*!_S_pqn~j~bmQbh?zfET!8*HW3tXG5j={oA
z?}cSO`)EJ$L({+I0@Dq4K^wZP3)oj3(|p768)TnAQ~O=t2UR}Q-yAzS{T~(<w@NIc
z5teV3D7(Gn|By;ggHmTS-krmGJCux;)wA{t`2vFdTRFn(8Kul;rB9x8?RD7(8+WY4
zb0eSAHzOMR8EiFslaFtTMhtJTt^+<~ReyOP&mG>;0wxqZmDkuIBZPQtf`bg@aa}p|
zadCCjUw<6AK~=OwXJWWU4ub=op)t3e^V-x?*_GnA+NTugQ-94QLlHZn-;!(x108N$
z%*^NYU*$QbG+3~?y)~oMpTb{!t&)=0HsA`b3YG43h`5Qoj8wYL3U_ILk1W3)Do2L>
zx&Xo_T^p(`qm2b2_dzVLy2FJ%*0wA1AY<$?IWL%O_2Zd^b1S6MIt5BcLrfEI&t1cW
zlYQ;>>o2ets-zUI(bML1>7kUp6xihPPBkZm$}Z1s9jJOSIH(kh5E=5>n6VY_g^LNX
zu0G;Wbq{IkrcB8N=(dFLaQF^at9)Tn%b7TT?x$9cYLt68^Hu@c>XF{}W}66siL*~W
zq%k=|bSiXOuLM@Uqp_kJCl{EKx~?t->NW&O<)~plzASEXkXwfWrh5Nc{}5CoTVz<Z
z*%S3%b<ftB=2gUvI%!cz;B?i(ccwoREh_RA8dOdse&;ME6^bg#6wr*H=okaD@={Fr
z%}4-X=GN_RdxKcBoEY>`&CghSkJDZ6G<p2wt?TvVe!s96%lmSnchq+7if!6_j7G>G
z>(K08KUQT{vm4`c?a|=`#fuELDkh|3)PonozhYzS2SzjEFZqO^@pXOQHfoArm%WCW
zhI+klHB~+Oy^Fc<;w8PH41SpRR9*>WHzzZHv%&{V=JJakp5q2p0rAWW>RD)f`@+i6
zDZ<7B;FJ&Pv&R9=O()9Lz;~8VXa2J&+1-Vhg8lUG7DZ1+vwf9NC7q89Q*6goc-^$X
zE#4RX6ESA8G}SL>vE9yju~)i8@(E2dK|ey^B|~%pH;eXrixP$^cBTOyp{>DI>Sv1Y
z!#+Vr8gs9JeM%S6Zu4QC)v6}kS!-N!gtgNyuFr|DBD?E_UX9_q({INUa6FXGo8+q6
ziS|q~3W1#E&@=8*eE3w)DyLCX|1H2HFMRgb@<U=sFqyh>ZDRFod2KaASB@*-TpENV
zt}-P8W1<%KvW0Wg;bJ$k|6Tfu@0mgzi2~(wle}ZKBk#4;<_bf#bC<R{60W)Jd&~}R
zcyp`Ei)$=0`dw9uNl~1?Z#9D|T)@BXIHPFg9f65V14&jKx7D3n>d%BixxJjvJYj-0
z*+nI-4@{2@zwi^dFCiWJmrZsQlb1A(SQ2a@`Y9|bt>{VwAMu?K3X^&CD@KaH16A>k
zy~BY~6`RQFN~W4l7<GesluzIl9a`qj-8^$2N#Hm|EF=#xbwedn7)k>|!~N8Lo`(nU
zJU627wmq*R5Z@k0-x0#3PB$$~r-`9aO3UO#PJgsP|5BF62G}KDdB^yuXDa=gF2mpB
z_jQ0nuY)t*sZj~8`jaBXA6UxLb7*v1l~uD^EO0Hhv*7AFzluSc^zd5X(LM1%t}@-M
zB%itWA^X|TVU%!W`=Va#JJUh20ueNMWh9+@%;OlP)>HSZ?>na<D6SD}VTb7Ndk95h
zZ=CTZr7VTf<YVE5${ua}Y&%lGJJX>E8^KA<>>#`riLg|#V@s0C^t_ew=jjrY&%sk3
zHfQ@to5C@%TzAFyzQWhp)bvY`vq);>@3H;kvC#}f$cfcoa#6d_T!sQNPQOtjp8@X7
zj#`#!BNR3=9kLd<M8NI2Rp$NHO?I;Gt=x`5%)RJNdvAZ<sU)9LpVO2hJwfrPYqq4y
zj3aPnGU=f~N5;cX+%6Qjn)e$`$MvDhp!A`;<-?<9L?!iIl2L}?hYNFV#|3A#CrRO}
z238qHmzG(eqT?PBHc#vlujf<$5bS&-YTfeoeXe|R2m2$EuVFT?bty{^!~N?QS6V%G
ztx&Fj<|zp`w|QKWmgl01sc{u5^PY6qG^e1~$EGAci`zJF!<uDRJB>oXxgQFs8Fn8|
z=f5uhrLG*~I4Du+%pik!d|zN3w`DC%MV=ldaRmm|ysbj9n>NV4<Q-=(H=3y0jJzo}
zV%@ORR8{omlN)U}UYhXz;F)Jv&FGs4t?!ebJzRSmuO|zqYl2_sMUG15-^I$2$A7!0
zoe4cwox#P0@U`~%fblQYGuwUTPnpGUw*-4fUK(>g{C_O~$3?5{q$<q_c}v<T9Djc6
z^><~;Li^4&0~#*xyb5fYzU}u*W^3TxZV_U5No&?htbUcu&_X}s?H|6=|2;LEzyB76
zNUaxXD;XPw(jnY<->xF(-dtG>iRw9te6(DDhywG>LhEhVI+57x?KeSQg5^H9%;_Uf
z?B_}4H@k#!iaa`TUI->IweC)(n5!A3ZnmTPA~_?+(MDA6qTR5;3k7=N!HrRyrhzF{
z5><Svknc-y&->-}Y|)56-yciq3cbLY<Uzl5UQKz8?qE&%w(h5)Gs)|Dw7v}9E=TB}
z`KWl@>%+yYo6dt8x>!yF6pe_rJx!ugIY%sN#>mgy6oS1>>FTOkRz3;`iKnce6aJgg
z@iujSXuMRaHPKIPp}#LD@;on`RkWT3MuKMv^w8vU<axE#0UWM6pzRWKM&5UjyiBO)
zO&VE#xgu7aS{3m!K6E^+hU3!#icScpJf}5Y(5rZx7c@WHbFg-H-i6NX8MG8%6fXC=
zdKzwxN*b?y2#tnyt99EMd{x`)V6sS$V7gbQlb*Z!nQl0N2Qy}&=1XVUH(i4h`(VJG
zns8S?za{%LY56WANi>2#jFT@FVcD#+j|VTU)>UrC`iTlXKI!UjUZw|p4^|_owS18^
zxnB5QUqv$np~2>|QW;^|#w8FDj<`E01rfUDb0BvqKB<#c;D`Mu^-{9KYPQVSPSJ&q
z?$(sA-+Psr4@;DNq=>dTH4+0~)iqEUlN0`dJA{a7^&4jJxjvyUNq+*kW3c7kP|Q#N
ztft>PG%hahSs`l;HRv~AzBx-E<)H7pg$1or8+qaWWuMsI7K-J*Rd`NzU9ha`9fB}3
zV~v+pGtwv);-kq2Jx_t_h(V+jkn0YSh`i|)bJ2fut6h8@hK2AS*X+*(De?dL?7OI$
zvUR^ZdfvM@x$O+p{Kp3qrv60c2NR$c&pB~hFXt>4<wA=e0kQi}9ou*5$*6Ft9Hm>R
zEVVCkeiaOJt4-Q94tJvN>`9j&3xc5Ulm#y0FOB)$sy`9!=5DrO#a;$WecnCT^m3};
zg|H2>-vn^FW%E;Sd>M6i7T|Qe%NDbJw8N!eGw(R$25q)nj7Fqq&5SwtEz~DHnoFii
zdKdga4_K4vd|f$z`piRi^7y**mKszv1ZUcXOMf9ED)TXWL7Kir*`fOISFXjH>MhK*
zeM1c|P&{_suJ;JzI&8{gD3Lh%UQ?xiS<2aD!WP|uFa2Sr)M189@yT&&?B7IIYth-+
zs-%^o-k(|~he?YwjCVg*pS)cz4Lxw8xvB3GHGRLl+$$>hCLXhJ&-*5G!v{47-s{(`
zb}Mm_HcD#z<}=ru;Q>xur<kbSeI`O!Wg3?mj}7i|q`~d?QNQ{VU;`v2Z7qFDLdk<6
zotp5@X+1j6-H9UHIsPwI8tu^)!>U@i@^I#uXS=nlKBMxZH;wJSqCb}R2byj`<q7-G
zi}s&I<}t8$obBevUe$`okzmc&9Hde);aBJ1%l3Mdjm9(Rck`8t^nQ?s-$Y+UR&qX8
zT!Nf$<YguKd*k~?m&Zg~lP3P){fq~pTeTeLmozwUp3|Y=S6_RtB~&X#^#;FLiAy`i
zvRzFkANtY&HTn+U?BM*DM~B8w<VTB|RnU;?Pyd157aLWqyyLXBJiA$s96>|Iq{#j<
z$GI&{HM>T0^?7J_dnEVUBKd&xf;~);Tb1uS_l?FJHL5mhvsTmFj8fgpnUsnQAmsP7
znzWsr-^6V`l=4Haso65U86W=o2EN(2>ATe1d82~wB1tZSJhk}hOYdQSf$0a)MoZi}
z8ESHw28!f!8i}(EAi^$)Y@T%s1U`wp`qW6S@&}-I*#)^q;A)L0wnBaE1;uur*Gkw~
zU}~rmrK_%}X9j}icJ#Lh9LyrB-6c&yY=ObSpn<jf{jUw-&%T;FS*J-B>LTlnMV5TV
zFG5aCfcrw&J5ldw#w*nVeZF&;o=jRgl}E(d=@F{&S}72w{$^#7harh=>I0K6<SD;z
zttyckqy&iU?TB)%%BjqY9C%cBL&$&Z(^_O#_lb&3Hj;%HhTk?_QoVGqE+-7}{tcmS
z6R<gIe{w&ElKHK#Ja)Uj@JZmxCrJq!38hW9gL+PozP~~aw<@(4{B2PsB5Az)Ehz8I
zw~&I~gre}|jwrfTzQ|83SpqorAo0!wY5gnCCm^ZqgKmhf2Yd@EC9N!u%D+D;TJ6py
z)^%&s((+G-xwUAwAN#=NrIot&uvf=P?;87e1C3c4OwbiDQ#vPy{PLQ_Q9-8T$s&ui
zBERAG;*O%PQ344wyxS>=Xg~A8ru=V4H%Dkc4pObM>c3<j;6n46{J=+aYIeEZz&}Gp
zr)cdH;Btf4ztcw4{G9JU{w=`Y5h)O26uB*Y^8yQYlfx>i5#^xf5cOo5i<`lr$$){^
zPkis?>(s-i{VI8DRZeK$Is4B}NigAMgFK0=7~aK9hH|YC63z#E+$Vk?%W@&zs}&z}
z@XrEc<o%>QtPsZEp63#tavfBRSU|W#31tChGvstH_M}O4zP>pnYNU>){PVg6(IIku
z)-$ooJt!LSn+p_Cx{M+ybEv6}dylp=OdzH!NdafrePQpTx!UgY^Vj(?C(X#1MM^8r
z<G7mRCs(aQ%kolKA*6?0paj?rqx5m<cULK7ebjn2xUo5LKXy>=$tW(N8k=V$@?-X+
zQ+e&J*PEttUXVK(ZO&J=(MujN*^RV=QJAddqO7Y}Wc4!r)Z5jWud3SZ&<*)lWtL0d
z9r(>*F8&uEfUqzB!}Qpm|D1M%U*^4UW6OGnA;*45Y+dbE7++qP&rrbN$E^N<ppQR%
zCrk_e_?mMiJ@Yh^t*O4*JO@vHYc<jKrf=?sz(wux+MwA2*Is%ym8tZU*FDp;I%I?P
zJi>2H;?y8mzS>T9(A3*8AjCvGG*;Np5u1T`KrmUJ)a+u<eV+7HOs8K_rj=1Wq7F}V
z#rr{Xj?gTrhd7_#X;?@!+Rbx<^NSH$?J%6hPYC7ayvwuK3UkdXF45!+sL`1zo@iP=
zU4|MJhEg{QWg|i^Kk~%XEX>dP2Ovzxm=PZ1>+=fNix)Vf=hHdg2+V{?V+B58a_`=>
zP5q>o)^vFB#?h2b->dqDcSU=Kj~FGVLKwuEa4|a(x1v{|5&bV~$p_|NUaLb+2YQgY
z93J1wSBn1f^PC8r(%wLy9Us=<as9`&&%c_Au$WEOB#1!=)2sNFr%tMc4lacmQ5?Q}
zd`sj=<xA{4T^RSpy9cPWrErzzx=1@VeZFj#4M?L*bh^Mc``j#{HO>i*I<VRk{c*+k
zqQQ*hVC~6P&(h3`OT6l2+kqIrPbXsU2p8$A7-BHd%howg+@WZ?`Mz&OLBjC9>xZvF
zBk8T(BwhcWnt-{!tjST1Y9O7hbuCB<)23OYl#L-~%P#zo=%VlKIt-0kK;=h9O9B;9
zm79|HlIyvb!p9*G`JuV+TJr3!`r_|T`xr0_s}*{{;vzMV#WrqSI|l9R@tW&|^FGBX
zF5@SZg>7{X5`o=!COGnr3GJvQts1rBB#w)I2KLdx>bJwhiUP;4xl}Xa&(<g^4e~^U
z*Dno{P*)NQ%eNhjN+g5>+&*O&X4$z1pCB`bB*S~<u%hIRf`T@j0ycV!a!C+}fU56f
zGs+HZaoIJGw9M&AR8-F8qN~;mK8k`y2QtL6{qCt2mf%bDF|4|~vLYLeSYGhU93d|i
z*18mkfFu79miz~2`dt#^px01^N9rig3vPX`uanL&JWx8>yq|$iimz&qfw&XTN`hVI
zNaz5S|D+GXjVc<-TN-Y*ruh~}ISe5Z5bi$sF!InV0r$EM1b5HTh4Y7tn^Bt2Ye;FH
z5GLxM-4<d?5Lewy;?0mSF(*H-P^zoYgSOdx&R>*nRs4!TU0}-t7a&0J94bbU3S%8H
zodxP4&nTwdQC5QjYX~<HV*@T8k3Pv!wv|^51Bku{7!-iM?t<Qq{v|4zb~|H}g{cHq
z>8r*&Z+pGD$p=HvZ;S?Ph<*$=y+r$MT-iL_=$f=`nup_{^Q4UKiX8vu=`|2mia9cF
zrFdL(pvn6cs(Zg{P>bZ1y_1sNr*{L5fieCWezgWD44vKL-;-QqyXlDseSa=_@lAao
zG+jY7cfv)or`Uc_TUo!lMMa7XB?t{vSZNfNQwfkJE#f{%3p{>fa%GX&tW&8#*$e_A
zpC-J5N39^Z-&elJXAC<GC6CA_554%kmZ*fwcXMFTfL+JWCn+-V)1e{Z`MS)M1KhAH
z%=K`GD1{PE7$xC%KjQ)&>9bpI7N~28)q#E0zilkbl%TKRDENI<=_}J;fAHcphtih>
z5tQ^Q&kcUoxN$ft8VIds7Ciw$xV}7k;y~j_J8r4bZw5$U9umC-XB(h>()m7^<wOdB
z=b)Dz=@|QZSG&x(bW?4INwrzsghiCmkcO4|gWmXUks5;-=50i+KF=xKO*kHb)qdl!
za^*h$O8rxHuu<k=AdCh=DlIATz~5eXD1B4IgBRjwW$#*eG#Onc8Adt;$>FdZ?(J~)
zk5V)vCvUgRV-+zt+$JZCPMN2cBp;!1o79RlL|3V{C5X{M#>j*GGWU7kha<qfIu!j}
z-b={fgU<x=A*O2cn{od@aOGxv)tjJCT%2Mwr9@55=TAwxagEKxk-;f*(&ObfK6&D0
zVYWLmojtyqc%7t96SLinnm>9e_d%acR899al7gZ!4X>wI2Qytr(rXl|UG;j75&0-~
zBhgq6nbQB@PeF6~515^4WWVtIYoxwyQvsp9WS0zNP-jZ7lWwCsx_rf#YmEO%k252c
zafpGX#G<}%-}q4x?LJHW!7C-B#r3Pg=-^A4Yp)9rK1A0jA4sNTs$TyKdabETN*D(j
zdvzE)4s*qqOH`xueCW%jE99x`Fqd8Gz7*gVtcVl>Zkbj@18ah?kP&-l0NKD(wDy5@
z6fAUeKmGxJZ;oD;pp!`RljR306-OZpY)fnm9Kypqr&pfZHgI+2ft;}70uxF~7Z_qc
zy$9Eq1D%=&c9+=l!}=wyr|m8e7}`G^d!ms!3hWEAJ<UgR0R(hR9%A_`*uuDI^&)dJ
zjzjIy{hFH>cmm}nF~v2%!h6n7?4>6+Skzj<q3fePLotHFp%DA2_;UiY>BlqpO}(lL
zb#vfON)%)>gG0eJTqA*M+?=D~PcaSlSWf5L#ib6a7lUJ0KdgyyKg`&plG-$4MBcqf
zy94Bn$$S^HXp-Lr`LtjMW}B6bnFo3LN#=s0sH$$~n_@0J(uRFf%o31<MuWck2eNF6
zK_{y?CmH{I9kFFXa%So+vu-w^d{zm7^}MFjX(P!Pf}_HY+^=oy%b~I(hlHR|C|6I1
zmw9)h1am$Dwh}S>qS4eYg9m%ps$A`cL;@V!As^O7NuILkAhKuHNkSqG6fxWb+M>$_
zRW1{&IRSg$7e3(jbb04XiRQO-Ros_nsGC-w?AmNS&2t#N?f&tWfJ)LcyS4?sWEsct
zg`qzHH3>#t!$9jG4eV)PAqH7FdaJ@EG&9*OwS7qkC<v)LJ=bB0j;a|WdY&ow6h3r$
zwNU8L%TKZO9?rn%k%ILxm_+N6$L8ojWisV+?HaJ*Ck<gE`!%IHFu$d9C6}U<hX<XQ
zO-mryL6Vzg!wCWdd`a_HUi`De05}t@IFrSf{@SQ0V^lCYvKZfpHIU{2?yPEB=l=Sg
z9Q=u_3@{K=JlRwWI}rGr8?<jIv$^nNw`2S1=yOL7pP54u<o1;BL9E|#$Gh-73U2cJ
zT0(sT_)*!7Ra#<ga~cvacI@k#8$n8=o4-HEbpbv<S#Q7m$$k&2^sl}zv+R||EK8;$
zFn|q>d}I2d>>rE~0WZ#%&Tn;d7!`<AzLgnY6w}doO@eRGz;NDd;1y9)Ujo&8<_EON
z!fm`i<LSm6I}myuJDELjZVy}y3jkZ}1|0<d!3Gj=lAxC1^S1O4R02M&(Q$iOq&SZ~
z$C>Lez1lb)&H+e*asW(O{<01#zY@k$v>>8yPrTT$$I_sobMWYG`(HC<Ca*ro@GTnK
z|L=;x)ifrU;dMz=w@Sc7nB3KFeS(8j*m0Am&Hp}-Iz-?$YZ*iZ^+X%ol!FC5d&NIR
z{1n%V0j??QV&Rko;yuYG^aN>+pZ^a^_yR9>^rhx1g{UNXw3D;%YM-rWyx8u3>HY8L
z(a%8KCAZX^dL}r_+>aiA|6=XQK|SI(G62vO0B@lorECfY{GsRqOy<FEzvw^ELXVBK
zNt(3!u3+A!sEW5k7Le*e=b<M<Gy8cr|J!_gWF&wu(sDAh1#hso8U>h`4A1eHe_xhb
zt$AOg_?lDOmF9Ca(i&yP*k%&{KeF8azAhPah~5q+b<ZUg1)iHt@JDJ{iGP3a#RYG~
z>sgL)gqG{<Qv<+gnRVz0@!u^fB49BbC4)`nAdTE;XS(uZW->-<MR>WbcUbe^ZRl}9
z{2$#Njb>?-Uw0HBWl_(kClXBHao^X26|3z)O>RF$U}XzSq{6lo`rZF`>xK}C+M<`6
zjz2CbH}AeR-4a1RH+u15{KEe}6dW<=W{VGN>c-%sg;erOq>jMH|2uhbD$0gVR(gX@
zD{;y=szfc4(I?Nxz5n~JEgH?#p_N?aNqe|;Lmn;+-QpFl7_?gP#bZiB!N1i*P?G$e
zHIbx6-}<}z7yyJM(EwmbGTGUY{qGXh@M7&9Oskqd5R=ZNP^a(sJN%F%>Pi&orl@ZJ
z>2t{a_cfLjNK54g{o;1XhgA@0GDikr{hzq^-+d$Ef*BNAhm8p+0d4%bbXHlHHdjK>
zw{9X@N__5Y5F>Ab%!Zlf5j*MC%U1hnErt~;5PA?#)?!&w&*D2jtV|q0$p(^q>rE?#
zAa{GTq)u(g(QVb^m;W|e74(48<x(SU$5Rt<A~&fke?D8e@o&k&1?WjVOYRu5hK4`Y
znued<)&4uO{3<WNn0WaY%n+lLT4uJ|R9_TKt2R+!YUpwJxYK{PuA=(l;Q&{(cz!7h
z_wROmc$d+0SA3JKEaShwSHt@~LY?DwVp=qnQECL-1fUDuHe34P|9<K(phk=#_-js=
z1V`%-BcTXvI@`<A_zS~Ke)<C=J%LIBqoS%?t{+dmKuY~J9dr^$i6;8Qp$4z+DAlTg
zRoN5<6yOrLizXm;kF+;t%|`y)n0%zhDu@&_wZEl-b`{hrbOoBEvzf8?QY+LwkKF;u
zkqM&T&+;dmZED$fz)*!=An>s#wy#&b?)Vi!B^XF}G+B=3J;ziasx~-^@2iCWUuaU&
zyrE3lrKpie1o;zKz`#gp)S8iHt5hxR-@~s#y+o&3JXcb{-{To+*T&sjhI0e%f~2YF
zW+mu~t3QuLvR5etA~9S9>EBKK?~7(YhxF5j*^L0n8_kVGKUf>963#EL5Bc)^U3~p4
zaijrJt=UHuej~7hH@NV=a*|EpK;^D=XeH;B@V}`hIp8-z>C$)=aHF8hQFtnJK4i=&
zt29b1EhJmf@PE7Ng*WkIYDXaC+55D{Y>twU58lZQUxY!3{Kfg>>7MYiMBX2J2zFye
z!a+w{s0zLHf=DL(cq;dLpLUpchz_iX<KcMCw*U=9&Az@ulHx~b!1Mp;pHSs&L#H*t
zvb(@>ZNKq~loc9FcWP$_nSSz%{SD|tWD>0~SyF)NeQXUx6a%BH5r=S}*^rC9fGLsm
zDDaiqwjK+YOJQ?pH6IaWj8L}U;EJAL+P&eT`M+<k4BvzoI}Ni0vDI>8n9=s<%ekWi
zO`{D=9`{N1fhA(0?&R=WvwSMQh0!_0X_UR83<$LWQC+t;U_mRvy5v7rfp;YYZ&W1@
zCafaj(t;LzqyGEymte&oe&b>-UC6=uSH*rz;x<0WO!*e||I*9>+P}Gf4>=!$aVEd&
zdF$O9E2%-*>u|UtFx80Gz!0L`th7*)lk$a99l=4uZ-w4h`*X0I`rpIN=7J4gDzmQf
zDJM^o=Z=0@BbU4BS+@-os5j6iM7g+)7n8vPk?%VMKc9zQppzmF0FnpO1xyTK&0qwX
z9U~Bl=XLrp0qtjq#&3r6t}f;+tIaA;|C)W9SsQQCUH8F4+hg83a$~U~#VrW|0O;?{
z_MUdI0~HGHJ}<CWoUXCRdZ^#ugfT;=;QQr6ISBnEo6RPhCz|2YU4WWSGtwj{4hj-~
z^G#PSBaurp6M*nP1H$iFz(_&B6PFQmc4%(q2PFEvzytSIKYdL`JrWUNvqJLvp{xF}
z`6Y;zy7}tTz$=Q*@dO{pwEwow<9?Nk8Q^LzOIIL&zE=o?c>NFczeUkAg^;Ox+ss0f
z8o0YdE^?%&FdM&8A_emTQcSJmC3`{NSj7F>KPnOExt*Q?^WY;|uX7sBegI5*Z>I5x
zxOp!D0Oy>>OSP`p;F@9qnItCl#(QvkZ~`P@WO?eD2ETt3gqTbK)sVS+kjBmuQJPki
z=~#MC;obx|(Lxjv*D&>5=dt(&P_DVv$Qy7lvU30j&I@6eMXk`A{6>WD;Lz#iPQEet
zBd4n`=;Y6*MQ}h|1EbbH_7PdR&CFgPjV7)zf1#4G58pKRHMe!ZK-`qlckEMD?oPC8
zbY*u5g7^1GopzY#?y-{wfde6V$JW4cFz&P&kd(#!m*URVCD)(CTAZjFW>9qA%5nm4
zzHYdN42)mu_v)4c_Xul=<z(1yt%j2g$Qfy9Rsc&O3TkSB{9#Q1kxO<nfIi2UgVLkj
zP_aG{5Emzsby~|0F4LYaB;V`>|E9O>TU}|d&jyRt7km{0!(m;+9|9{6auGF7nSLmS
zyPHC=439&a8<yvW<d6oiPlD+&4`@Ir`xxA^rb60jGk^_PJj?GfL^Xpuun-?W)AKM@
z0Nk61iCsSW>n^-x9HT5xikwPRfvPfJ4-Fq>5uh*O#-0!=wo<_ujND#8zg_W4tkfyZ
zKLMsTl0qXynla;kD0wdtY-2+914M4f8Bo3ho5BE|$5Qksfb(w2ucscCC|Mb_GoVR5
zAi5NZuSYX{1^5H&l(5CmOuEmS?4CwDr@oc}aOlqxnpP{bnRodLoHj(qCb}f8B&w~f
zjg2udNfGr1w9qsD0>!*OEr)_{jm+=5ANj9`W9Q*O*8AHk=sk#DKS{s1EkK$Gr4o}N
zg@B($IEJo9bM1kdFmj7kuj;ev+#L`4#>9`O>#otA9RT9%M^V_#$VL6R_e98N50rPN
z;Z}e;uFG?!*S&7JcnVVjO+Uhu-QnBRgbU&4As2>y4mO0|^jZO8dau{ZN`zS{ut5e4
z8tDUE5o?1j*s;2}Zsvhnph&H@rB~;Bcw;wj>}<tz7_E%b+`RYdijm|@+Mc5uOfSjC
zZQ6ENV+#b=JOd36vORBkjGAl{<*4mX3U3Jkpg2_g4={5P+(V9={eUxjJhFK~Jl5uB
z%3clzy|34=INQ&Du7!a~DEJpRizITvAQd0ad{yZ%8sX#)!bY{GK#Q21^hVu21!RB<
z*sLk$$1sszkcxHK6_2zPg~CRJj{95#t-mP!^>6d}PzAghuXbBU#6?$?za1CArXIA=
z2ErO9c`<cF(Y>zA8Nfe8VUm;P?Z+{fWA7PKk%m1lFr!ihk!Qaj#0pEtBjg9B!Tuy)
zqXg0|;cdXyqN$@#y8y?SK~OqQB<NEwsb@f8lIuk-ZypeP1L7OsC>cUbAeqWfgFBiv
z@MBEEPlScB`KXX8izvrC@4$Qk-bAV<pocT7zogWseG5<b1}L<%fZ2+8bOXZE2Fyu{
z86i6a^h=*ArLYYUUk7<mGpN1>=5TL_kc`>*3b*WqP#e%gSg*SY))t+lO3y0h#Myqu
z9!%l@GLiLPxa#-ZqZ+I(DKNDp<b0$z#n$INYfy{2(O?WHOtnweU|emOGGC#_uEtT=
zM@4nG8ypADntO)R9@93+U>M(*qMB2Z;|vheNJk5GATMML+9Vzc-H1J!<jr-vI)4Ye
zSL)SY@GoGvpmbyzMGB~ZqI}z_z=)WSe?jj|8ax|s^ERPozM>CWOMw3pCV){R$HZZS
z++qu0Xq{)iHR|$QthsYl#KCu<S@U}{?Yqje@YyC=#|~i#aA4#%_+OIbekoE+ltPut
zNvhVu+IT7IbpYXs!EdbExZeRi8*$?$wXchflYsK+UhI-KcrjVs3*#UP*d{z->Z@gM
zp5ovei*r$&59$4d0~Z})vr)k6=4ID%K#cMR3@<T0wykVdU&M{wm{yrGzcT+AfUx8=
z=iT)Np9k%}vQU@HTzQI3J=&n>1`o|O|MNfYTlt&B0-0iHyDnMDqfnh!b1e57Dzx+S
zil+QiV;a*QH4?j7<^@CnmAT_*rN4(aL2R+jFhOG73m^cir(p8YTLxwiP2A>Y{Mm1S
z^tJ^0gEM52DCl0yaU#5I*y5}yp!k0+K#yRzQbZvBWEGhZ2qwL<jGuhAI2#<~fY46o
z$Ro?q3uT~bf7cR<CB3}xastMFzQi_OhXIayT}b|Cd=<u_7JQ@J3`*!TRq!xgz3bqb
z8LiCcdkdRn_a$mfzrdBaKgeDURgMt7BNo#>p4d7k``)F6ZGKA^shcGwx^(Z)V#jR^
zVfHLCt$r?4DptHz3>-FXjgmi$GA%qAZ6(zG*Apd2Ot_-kH>n#bP`tj#8|p`0#A*<C
zXpC|O_mS2Dc~lGYz%bD_uY~5jL-69o@XnH#Ihj5?{#)%&Q)}oA42dQYjI@0~+GBUi
z%%M=~2A{%nO=U0O0W1z+c<+_);1`Jc3wb78T>P|`*&c#;@R{OO(1u<Fg%wuEtUszA
zrq*jdxmIz1@)PrKDU!l^(XNM4LcT@(l0QFku7{vzQ7}3kFLw1i6@;QMtT1J^z&2`W
z2`~mE845j41nfcE4KS1|;6mIfN9ryBKjGUz1<G?8jtDNBOW4T)+x>k-fWWb3V9M2*
zT~X`KlqunWKpi-={8AZpm~%WA#wWvJnBjnpu0vAiJ^#KS*wG@~1MsFQy(sxhxj|>|
zLl~437J#SjSNXMHmys8Jr4pf*$UsRj*`ao7+6D6i2o)cW-1VM%>t(u$j9@f(7Onb>
zCg@RrlJ0@N3Bl>d{{`<$F~i$S6JTtT*bv8lS#^(ql@?9m3E%Pq7eFlDz|cF{cflM&
ze<C&O?!_o}Z(rRVw}N3}S@a(fnB!_MslTv?43@UO|2nwgHqbTtO5wTTbvad8C$^>x
zt|$3MvC6>#Z9qLCuD*ah{0=-ZoGyJQ9S{eT(zqq%Duio?41{R}*&LxK05g69w|_LN
z%tcOj1())u?c+ArN086e6i#kt_!?{l*2cW&m@MQY!6gF@lxqbTnZe*yfJCh!c-Kik
z9t->tFCCiuVbqMo$V05|tE9k+a*v0jj|v99wTs1=s(5o5<wgBE9Ju_D6DU%&)I76<
ztiOf4Fv}d!Jo>>xDf{I0*_EE#`kG%aZ1rTGvTb}LPW3GNT^pb~6i>LB`~lgBlqSua
z(PtU#x&MA{h?OD400t%Bc%QAGL`b=9c%<eYI6LH4pjg-lbLd33RgY!@e}!bs*gb;m
z!~><>Api$+CE(=sE3P}23NU;IPHP1Ty69Dxc8O!5nlV`2Pd}C)3$3O<HI88VIxESU
zDE!5cnz&f`^`YJIPCD+rh{6b(LiNnb!`rL0q;%vh$#OEx1vgdOLq8VR*K-nf5J?ny
zp*~CCBpZ~{+mh*CUHDqT@Xf{OeCuoIt0Rq|VgdSUFtCk3^_uF*E_effX3){5&;I#+
zsTAXT%|G&*Af&)mK8=qtjxOl6F2#U~5#Q0eYmgw#dhmG~%vKJO!Fs!N^&vtbnt`+S
z4D~{vix}e-L3ijvm>UJ-i`IX#7TZmLBD@$lj&mnuHS0wO%6iGqV#n%Tf$v3762*-a
zm&5@cT*D8lt#7ZbVc~8F)ihkXgxXi?cyWWMCmhFu`7(S^<y5uOCH+%3Fm{6eTbcEd
zRRxjw?aPbe$jl&X<Dekr(f-jf+M)f5A^l~puZuYF-e8o9=XRA>Y4xCeqT&*mDl*%n
zpp(UCdnTm!@L5Ec^o|I~FFsA)rF_ECPmd%FNN$-A?6?6aa*|HKAmXyt4A>u@?~iR=
zUu?y4Mz{mh8(WRr%0tRygv<IYNL1@I6Uu(=7qsiB_0?~2(}n<pXb(0VJWk&3EGX`<
zL@RKI+y&-Is_1yCf>=cqevChTLVLtO+!j`b4<$0++AqBNK20q>d0!)rd0<4Q3%*n2
zT)Of5!^Xp>3VUO-h`O9opOXQOFPZ5WXo<?;OC*KrvSJkmRrhi9o?q?Vr(tVgi|Tg8
zq%WmBWVez}h*5qbv5hOf(gVrY5DUshFI<DSBe@*Y8Ju?(`b9$SBPlTDPoNU|agsaF
zQW5TWY%FtP`$fs{1|=m9qz;vV_<`kDK?<_vmezScPGYJ`bhCL&P0^xPBYMX=<ch%j
zi!d57Be_SyPNG_B>%h0fRXeZcO(jTZi$dAy9_+eW6>Nbp;^oyq=lf&K5mb|#IpJ|<
z^D>X9D=ud72^sBvYRm~Vz((CM8ztb_l_Fj$y{XQ)iRIGV*{{ratg5MC0womoz2`DU
zc~nk+P-4MBaa8%d{#=>#AwEulM)p)<WfO#1Hw5wJkt|`F?-xbOnd2)rP<!2-yUjgJ
z?Q>B-OmwYi`)Ggmep|OKwQ4vnqsEW$E5~CH8~vl(yRu_ed0cw`m*i%FY+mll4Y{*p
zXyWy?a_<Qj>BnFY*gN^0;(G#=dt&MW8$P>7(_H;Vq9d?sExb=xxYR}Gv%?>cEq>mW
zrfB>-Uw!c0bHvnzHZ-md*lpyOcDRY>#jy>YGB7e;_K@UOgRFssz~Qh<Ll}S5GB+cs
zs)OxYZs$IwZM|JGMla@pP0tdYrWr?M|J-b&rG5LUGfGO$3&2BJ)KX)J1DTEP1cMNC
z^{|eWzyd5pjr*x9UOc?+NQy{=f^^uM^!+q_NU%mcI+&0KjMl=HN4#_LR;C{W->Z{m
z=)T;-16*G%RZ(?By=*OewVXOxQW%b6N0n&3@yyQc;U8C`J&*bJQfs2)>dUQ45mioG
z@RD=l@<Wj3fb^CiycdN2XrNO`A2!)i#SLf7+w<{~K5~GWJlFJ(8V0Y60lLKb+Y|&P
zG--jcD!SIDM#op57gd!#`*J@eR{7hm2KP=Xg1oo8f3cZk>%`#Mio*JOnq10{9z%UT
zC18(0CxY%G@^;}aN?<JBco)s@Jb~C(v6d5aisAQ9FMJ<K)nuRCRd&lYPvsF2pi?2e
zOu~=6oT!86rIS%j(YCL8P-c+ImP104_-sPk;}fux=>?`dB6|)XAAU#=kpt-jJv;n{
z7V;5mYk`Vn4qpH5SBoKHs)p{nS8f8Sros9u3oyF6=)jBVNJR&@>-S3rSp{E<Byn-2
zC{j5LQnFt|u@cuWPbFfo{M}68u?=my9YHTyR&A(aO62>}HxM9;59m@Bt}Blq?$FAd
z4-p3;;1&Ijz@19Y#H3v*f0SB1$D{L_2s5X_z^5Lp5Yv)UdlxX<MXS@XZzh?}v@OI0
zl{u-{ugg0iZ2Dv5>#HzeDK?rrGra5><eLw{QhYJJ#8PKmfISk|H|@Va{*C6hkgm`p
zA)Cr+LXuu&+N2gVzQ#{YW;pz^YV(1{>nnLL3~VSNtwhga_%&dcd4NwZnK&1#ly#0u
zqz4#dWD4EMYcS3?c@0RJa9DMTa9~tdOGJ&E!qpn~WY)MeNbjat-Ic=8O_^B95-k%J
z61592Ktlcb21uw>PB7A2T=%yg-jMZ`p{|dfk>oG$j{vznnGTfQu>g)6xH2#k){=oj
z+h`@iaFtQ*I!F#2N1v?6FZJkRDir%Qta8pN<FLh*T>q`&qG(i6!WwOKL-5-D8Kk-a
z%q14nz@QG-63kEf3Nodf?`gp8NkUxq{4Zv|+g-GSmV7k9;$81R$*hj2{Ck44*Ejrk
z`hpAef^L+$_X$!yP>J{9uDugRq1>c}H8rdc_px+mI?|Nz=Ea6^vB%-cokdCQQn_Ar
z&(JF6XVCR1`ExLCp~)xIp`;z&-N_-ptQIpKNP)=th5(t*6NIL-qO;k7U~V=rz_J6w
zUA5{d-Wux4*m}GJNAEh}IcKaw7Vo>7=M&Hz+HkKKz-|uF`T0n7VRgmF&4@YERd+Sp
z0=WvN7^wXaPL~SQe(|ubBbR>rq?UAPG|oVER2rfVJla}^ROsUWfKmxBL@vq?+r~|+
zwN_A!2zw1Zc3gbngwCS#950DLFPBV)y=bsW5z7Jda@zw;%IxKz%Aw>`I|FIO+ISw|
zLR&0(vt<;4w^E+dpA$6`(>LlD#w@BVVtF*{IFaVh9UQ~Ll$UP2`ZY4*T?WBEmEoj+
zxFYp3D}rrS<DM4oXN=(M^wo=s<;yi4k~P8Xo<sU}z?>YDNp1WzFNV^}GJ6bU*x3uS
zURL2&`41IAH<4!Qi`AEp$x(urk(u_@$|o2IS*%ay73?h~%>)h?U}IIcAU61X^B(5e
zOArGmhu*da_AlKlPXjrW5v!YRF7?4HJNl5!c;#M2Oc3vif@c4@ZO^Z=(QyI5E++|h
zFPi(wTm%zUlrQF-L2L;Fj{b%in<7!5l&>8+f^(W$t5rqAoRG%Os?T_0{(|J()-l$m
zkSF{9J?U0Sfh>r3pFJYwYNurc+;WxTc>DL*SI}0io}bQm(?gv8gcl$N`=7;)As?q_
zY(MThbYRjiI?fP*n~QIdS0;yCZbc{2DC@qDh`7`kt=my!pBVwk59>PL4Q&g@C8qm^
z*RBODOjY+EKKoU<F%z3+2Wo;}bhKtA@9U=-*g(J9wEajE^vb0+Z4X_$y)}0+wMYC(
z_<ffTOV6K!Mzh*n1>q|OSufNI>LWJV94_N!iBe3X7D0{aG{$RdtPLY6%$FOyOSudn
zzVXT<ikN+c#=!H9PQ+0eH}?=t9c(A3eh6hZ#={b60|AQB*^^B!wtio(LF3ZkPdQ`?
z@w#ygEIR{txD+yy>KsZsO;Zn4GrfL4<_-xumRZ)p6-OGxV@)+U+a*Wg5o$dNrFlC1
zkHSwukJ9BC!HnJ#W&HY|X&UZw?GfsQ8Q0-VHXZk&C~G~};{7?PihHS7oA(|@G$T-<
zliDe7t*IE<N%S%CL>_tfnI0ib#2`{HZFt0ub)j-&^>O>oz&-kLBaDmk$XK7+xcKIE
zd_Ke0Abz9A$3L|F+2t?ac#lGu!ZvT*{t+(LNq`+rCzNR^oYnEE;O=LBnEmE$5IHmt
ztIX?Znym?XclNMjuH-iGA3`IXI0+v)ce5_#Il7xBUdl_Hud}RJ90#nWV9_5vWYiAh
zsM{n=A4&eH`u2|*IZ-9LRC}|oM54lYj4gKp6JPVdZkO_Is-BkEDME|K2AIsITnlKU
zky*f_<#;|%7SsE!i+vXQ_EyvAmmy=)@Tok3_a($@PtGuuo3)@<kg$bukY0{N(huHY
zdJ*6b?0dM^q&c7_-jEMPU{v)&f}>#U)q49ai&wRYyXQ(f|H<mpra+k)YZEPr67L^z
zQ^R1fPk?-QqNmSR;oINm)sOM?fvIeM>=3HMH-y9Xr?L3kgWgcnUFrej8U_DQus*kz
zQt>8(?zpDJn)^72rTClSTjm!&|5kOL3#nQx68hN!{qq(8F&<O8IS*)BYqjF2TFr+<
z`ACOV5!ypJP)+K&*CTS;tJGf@25<Xs?<<>fq3A+~Xp}-B56CkeiJ-yT4|eJ+g%n<d
z@cuyF#WaZStKXW(*P9S9D=?S&;(P<8dOpUw%DOpY9|WvqT#hBLmu{8MDZWc_HS>rG
zAWP&T2pxKXX?}0XkJa1`$3?Pd%lCJfNa9(RX<T_Q^=%bDe+%7=@vEx#zCbz?Rzy$_
zg{d=oS`gw=IB8C9UC5T8V|<k#XHSkcCwH|~dAW8?pZWC{%=7~lzBw1=Jpa51Mll~3
zm}Z3XA96Z@IS_+g6;8Mx5{VCGR+W!eI<k$hd~>9xC=|4mgD;pY`5u3cl>(QBZw-98
zPCFPNn(Dw@L01>HGvW#6>juH~@(CY(_E#L(Uhq&;jhK~ozaUStBT$GljhP>mu&))m
zojZ)gC?w`7W`wpy+tcpxHRW^zV-aiEw~co2JdBd{Y?b!}jj(Y@O=k<|o$?{+_RVkO
zt%n_oMq+y|frZbB-Y=7&8QT5Rz*GGldwwzYd>WMkbI{GjB(O4N#FgREh-on=PqK1z
zcHq%y@11^u#?szXCecCN`Z;>e`>}js1iLHA&wlO-49VRUZF)MD`De_(wRs;YhQ~86
z`WZcv67Ns@gnuVB+?l(7o;r?37j%kzLxbsf6hdlU<L3{^9cl*12jinPes0Qx5R$bk
z9=ytg;dgnqAfY@)!WUHfffHz%i;#h%csSKD9h>35IkQ<)Yd66i(!95gIQ$zIvNNRz
zx$py_kg;SQy{W8pA-`=Z7XwEO#-^gA6~SQi^OxseJ}bz61Zus!eP!>aeHX!lq5nm(
z0ZLAs)VEA+lp>OQIVnlc97;3;veJ)UAV_=l1HEgj9YvGT#G7X@*Uu+J&F<AXEGvK4
z;d_wq!Q5mU2$(xBKj7vduRTJ-)Z(rQ#Iuyld>G~sB=uA#?tk6<l_vFk{<pwYua`#E
zs$Jujn=&s;7?fg$@0DLSPg>2%Hqj0(g|d@S9Q&}RpY8&2wrlO~d6nbKcIn7y3L1yE
z?#ah%dg^Si3ls+5Voy6UxzU#t3d=<HxpK5(XcIXNj8$%P0_(d67rPrxS^V*`fsXH1
z@HT<eS|1R?E@t~)*&i7JIP&T0(U&sn&<XrQ#(c~5<%mG`Cp(IKqC!vDe8&?NrLV%d
zQ`j{O(dCLzK2d{Q??MG*;y5$n(XKfB`~n&UOVkTTDuQtuX)#AOc@zR@Ab)}bE*1Ot
z1?kgQ{Amy3=tPsR%xcMshSM)VmeqlgiQ(rTh^K@{U7`U;pw^5>|Gpvh-DVLCe%3h?
z%t*N^d{@(>2XqyOxAdjaM!n2CnO|U-iB*1?!OKN^SBX1OWOS;E<(x!mR>Ylt#=xqN
zZggT=CG{%OJeZZ!$W-Q%{o`-bf}~+jm3v)Neg~CGQo`(b#!*sQ?HdZ2Tt?&Oz)3Vr
zw@6RoqUymdSDhFH()VW@t<oBECFq>7c-+`B!OW&eRp<Zf<q?O)zk_o_e{tD=05j1R
zmt#oGOV5p0_d=I%f_j*XiyQkmPnE2Dpk!8AkL-KdawhNiU^=e5YU$sD>9{jsyNjVh
zt}y|n4;g&Jf^NL}*F_R@4i~JxF401m90PYf<qRMj9p*scPt4}2ku4>fdCw|L<jSE`
zY@}K#L#wH(sxTejg+xuVTJ;k+GV`e}xvjgJKJ_FQdDzxQlU|9DY19rqG*p**E<i$|
ztSit=&ZtNR<iX?lf+v3c>LJj?I1GHSHlVzoI0FTK%$tqt2^ZDa0sd5=KKQx+Cy?=b
z8dSk8gGTMsJPJ*=3oKPz>#Nq6Jv$ID+A*TnJ8M8>Po<^Mpb?$K5&BB!|JZx$sHnC+
zU>L_iK@>p{MCnFAkPzt+q`PAzR0IL(Zbm^=kXE_`K^loc8UtwtNy$+J>1K$5fo~tY
z_qq3ZzV-h5t@W+-{&zT>z4xzk?>$~2L~<1+FAhLa*@ohsK_Lyy=Q$<yZdDgJGnTcj
zNcm6*_I{s9P}fFB)cOn$eNMv<#G7))FYQP6tLDv)CS<k27)rSAh4^Qz8tw6CNhv7V
zEms|E@6;pg?6l7qax0oVj+OJvXrs7I$u4@arQSL@_ICVF>p0ijw=Qhbs60$&zhK8X
zJT86oVDmb5$;95eSk%6x$tQiYSsdF{zq;=?JLcCN)Q}LuMAz=pDmm#QEW~>2Q^RV&
zY7g1aY+#h{K(^z<9|D9vfixudjk?uAFOsfNMiX{iA{$wBV2hD|3+I8Y#EMtbzQfQJ
z6^3euF=W%qm8SJ8Vq-HyN)uA6J{VpZLcc8GRLa6sS<<z*&_^jlO3ydtnee?+&$axg
zhRpVb7^IL)2@jYgq&ZO<o~QYu_`WP&QbGs{)z<}Ed2beTc<z<`kpC1S@hHo{r}qhC
z<=)e5sd9upjh%<OyydX!4DRx1lJ(_)TpBpZBigx*kv!ij%&kt}WTC6SNVU8_)hIsL
zYUedye{tn9J^;HDkg^=MxsegWBoHvKCK*QKV7KZ!Tz4Z&>rjQlwYqMicz5mm(c09F
z>Kcu)w=ZRbHdC;7{AaxaXRSlj6)o8>`81zo)UKb`Z~|RnpOH>;&(F#;>H@0YW6;)l
znGq800k!ld81{@`<sa;asOB#(>93}lJ!d>z3v>EZW`<<;%8zm*r=p0U|2@=d0E?BO
z)^d)z#&pSF`R0jwq~mYWpw*qg)uh{v(tFP{c1eZU$lU2-%?-clC&x&tt;0!{_<LvE
zv<SH}r<c2>YRDQlc$><1rvwhC+;AO3wV-pKfi)hExVN_l9=jjsx!t+h#KWv|DPH^G
z3#D9ho0JEw&M=3M%675c?DToyqQlf4bb_SUUb8lN3PvEDWvijA6whg~rC_0NWvYSO
z#aSHTuFzaWiuMJ+q>VWHte6`HR|GV!$;*%Z6#K1iCG(8bq6ubH{cb)ZU4!#*-6Sq^
zUxV-_H<&$JT8RJad&d(W+uZRM2%)sbhs7!h=g)nLHB{uPB(dWCfKO)IBiaPJ0g@&V
zu>B%|*sFBcg|u6_pAw-y!fXA1j~LS3ZMU3ocfVzF#AfdpuY|tXGnK6-oeYabI@UZ8
zCVYJ_gF<8ca#f=?)k&1Uy2#cJTw|gS;ZPDT-8dzS&zAN5tc^K;(0rb6a`&s)Wow@)
zu6r5ZSRcnwc|9GYViqJtUCZ$DG^jkQ<{GJkj+^Uqd)4Ni^hhtI>P9)qyuHQ#=#;$d
zmVBPkw{*KLNT^%PR$LR@yr`nLLL{@t+({MHv*}9ZiP?XR?C5UDFwy&G_mS;QImL#4
zoZJDD_SDn5^Vg$_**PQTrq9DAW@8bsdDrQk4|r5lr)UogBDi^+xYmENJ>#Xg)7a1O
zY+D&(r+`IdxKYD`L5uW?OBAc*JpZ2eP94^)e#ftAr-t))cs`z7TJ46h+t_F$i7>18
zWv#GatILF5HU3(EZ*H=Um#PBO;SKF?)e;<cqxbSN@c9ac(Oz3@Wf(^ho^OJkf#VaQ
zL?|D!C47<IskV`!`U7E4Bl`dskyV?!B~=)b*a&u>DB7momKHX~^=CCVlo}*SNE9WQ
zHNbuC?I9A`r3Fh=@j2?)cUv4ysr&wz9e=HLlMI}Rm<V@aD|W@R8mTFN+OfScTcN}9
zi(n^8Qq=`cqHdnKT%m~z&n&sCQlj$V-qoJzx}c$pT|;%UdvUTpLr<5cH*>h8ZHF9m
z_pT7vs%N(w$o$e+ou`aeFV*rb)DiEh>^%L%E^E-q(qDP5Skl@f+pGQ;LrdDK|7>^a
zkA?N^>ZXPd%MMPGg=qDx@~{Y+b(VlL(s|@yX|qgwZC%YK9t%@XjTV)Zmz8P07)*6w
z#<p#Sq5siN(2>bW#?8c5YV6@MRt9@2e@`5BEnC6=Ys$7f8*`H9(UxHy`aDJ5nGKe#
zYRAOEmyI}NQ<eAlFGerJ*{-N=DHA0zl)PNb_=p(kgnybsFcGR2F=mst5TERN+v2cV
z%~<zvZz-G&Gw~ia!Xqeo$}kGMmxdkT(_Ku=10TvPwssd<5VLPCl$zU(ZJ&6<&zIl$
zWiPW=EkS*o6BDrF7H$J>Pi{HMwA(x-j`7iS0V?KE?vW<fGJ5#<Q9(9wO>VE-0uZHG
z_^8aRNU^l8IaSy-)bsh9YQFcLI&^}&tq+b3ZH=78+GE<(eei=vmwA>cP-q^Jnmmkm
zSX*S!%V4VmnM+lt9GF-~uNs*(vCp1SAN_LRZwm{1$~u_oV#_%y4-m^IsSM~Dp?RHi
zXqLyFk}UOVGF?f!{7`v_C7+QAr^94W|68;)N$I5Foz83a?I!v`hv%OKTO9^`S0me|
zv#Tvcpi~4VesWULQoK;Ff1)2nbOI)dx^_#2?#^)2(#Vc*Z1b02)oMS?R(*|JQpU!A
zKh-lQEEv)50B`hDLp`%oIYwXn<FhIS&LN-EhDr!m=J8koNq3iF2hJfula2oJNVEr`
z^L@e3lN%E0=Ze`8mVvO=5DwdAnSK}7#-{VXcOp`2zy8I9JgFI;7UA^u2tC^6Lk4(l
z;7SEr`E(auIxBP1-Mn*VGXb(Bm4g`WxEn08OvubdiwK<WWkYdK3{M6CdxCiMajejI
z+mZE&<y_k@g95KQ)^dtZ$(*ht<v*G=**XNF+FH$1nJLOBZu5gFT7jl@Du<sk*-wi3
zg}!Mn<(tA{)0u}|QyP^L;vEN_j0%{Pho%bBex1>;4tRu2)#4U%wBSht3*}MFvGL`f
zVmGU%a>UbHNmuJ_t!=F&&5Ow5^kh8@--k&r-EC1b--w+zaL*E5C{I_ens0<7=brTT
z@||e+#pBuemp4#Ty^qhSNbC1_IKcLMxJoB6ZX>yivv3lM#A-wW=Ek57a~<rZykg>Q
zfUl3NyqVq6Q?dHH71k}KhYO{S=-mciT!E-^Z8L+n=H7@vLc(38vEmR`2f5CJnqaFW
zf`n=zIf0_Ls(dCa)YZOW;q<eT9p_02;z-il9ZmH7^;JDKN`<5p#?{S@cU2NFoMDG)
z>>?(U6}#x_hT`4#O#APrK+Da-B}{VsuFpC8_X=Tf*1<LQ{ssC)YKrKh3hZfHT>i<p
za>~bXw`$#JLiLY?x{k13wd&#$>@RV&zxZnJaF9|k3nn5G%5pA8KSEaLnE$*lBS1HJ
z(|CvX;S?<F6zk0DBM-gj&&3RIntaU6;@8W@GH7pZA>-uf1NvQ`jrx~}-#D9p7|KnN
zq}5c&a{>m>W$qQ_%Re-$z$kx_m#4D0tnqVRJB$SmkkXayw|P4E*5@?~bCOLHf9{a{
zDNc^$^XE7H{5x7!3WBnh$6hSVMF+^_tRN#d0b~eEGuZw$OAA7-Zw9eR6rpQ6JljJ$
zLCwyA$>Ig6!t1Tusc3#!Y(KtsKOHs`7GXbJlBN;$-7EP^x<a~S1CJC>6G8lUW61=+
zHX&TVhEYg5*YvUA7b>M-AR_lk<FnN|nlPgaE)Shu=D6d01IYQCip>Hwe^8+AD?Jqn
zVBBi?&LaJO(jLwpZvEWx@4Wzv%ljr(^^B=!>5}8M2c3Kh$Xd>5k21Z@CCXVm-D>7V
z8&WwY`2?&0|9bJem|%Wj3HR{l1&O{Kj6ib42Zo&*6W{rOmuK=NlO!s(xMsJOsiIRS
zjY+wv-;s+g{|epzUN9uv9DkHPUCZ-O99@6Cr1YsC-MpT3f$XC2mkS{g-}*b)5x}f8
z5^twP>hT{X#xUlQXm&8$sARa{S5~XArK^;mJhZ|WO9`Zs8vDfTc`hEql*9`HYcowd
zd!HtaHqpdaHFin^iAD?qeVWkdp(ihM_tUN}eK*)C2>TfwApKwkn5`2qJ(`=pJJavy
zo!X&GjsJ?HtlnFlu}LgpyPG?1vEXU`V}M0jn!V_BDE<io3xmZrDoi+Sz%-an>CU6I
z?{1t+@g^?hX*#b)waPch1_7)CBwGVbd~rWw>!yOUZqT>lA-9a$m!sEtuh!$w>ZeFg
z(?%&iE1E;4XzLNPQ!Sh|ptH!d#VbqSq|6c7i6gQ)m^C^_0*OTpEP%gblv;nO0&PlY
z+0zH-q!mYolm;&+^$}{E&d6vEhMCzvxawiVGkcDOnVC+`gf5)=-D!@Q%2!!=xm(Z2
zMEKCYLD;pe{AF{X&di7(H&HB#Eh|^_AEhj$z9wJi=$FB1r2W2BA`*~g6cC$>H_@kX
z9bc_045OroKKrYP8a4~}3_#44%P1T5pgq)n?`CD%kEI<87JP_LuqyM-Wkq!yapPfq
zWq_vL{$3--qJQ=?i`?6Q#hS%~h1#3qm}J+0h1y{XKslE9SGOR7AA@2k2fKfoo9d1B
zD{>g_NA_9Sf{#XFV>d&ar-|Y0ajUEsA?E-2Rxq*|&C`<q(qr`NeB6E2<?<(Qcm`WZ
z>O5D#q%}L4cq-<<zfDX;^bbTk^gkQRjj1H~-_yX>mZ<c{r`nB?r<GSYsN%AAQj+uV
z6qj@n2%#KwIYgAdmbs*nsrH@K_)@VSwYsJIi4*N}W|@CL{TmM8MZbd;-}@o<j#2p6
z`#JFbTOES<U-Ll|quv8#?(YO|KwAFK=R~M);C;3i>db$?2HHvn0@alhGyh=pXIwUT
zueb`k_+PU?sz*Z+9WimTgX`b3gZGNG;Qb`=)&F@-62T9|r0#UQ*1v#(h#YYZyl)`W
z|DV?smym!?p69tCul8@8FPVY&zO?WD=QXn+Iyl&BiBVhTzdvUdtOf74u4(_zYlP(^
zK<Mw%ng0JZI}rW<%YZ+m{r@uq=G1=7fS)b-ZAdF%zO?unSleb*IGmk#{$ck&%3WrK
z0-GXhlyB;uVmB(e!*k7|v3{D<{>llMG?01UQy!B45{;)+%7U?iIe{d~PZv*eP#V7g
zQpF>k&rnF!7KS3Dy>>~awl;m(^3Mq7?As+~bwigTZY$lKqGLh)pa8PQVpqZ<SdJj1
zTwsDJu28i6rD#t`riHcXf<-PG2XYn!8P*08@XR<pk`u5^pkQ5fM+1Qzu$k1+uI%o|
z_n*d=<opti`@^5_B#-u&^v<bmh}(|&7?qlfE(}%N4n`CP2j{$MBL%MU-H}2@aLyg)
zqO+O{qTqL<$7S>+{%ApO6ap!zt10FviMdxy*b1woFov{bUb-ltg0_u*OfBKsL@)WR
z_N6+*lYi_A>J%zqh?X+KJA{N{P7i_T%ggnnycvU#;eK?5zmJ2P2h)Be@wW-mqn@j6
zY?C?BK@EP`9b`I2Kw~Ewb%bGJBqXX*)PJ<Oxeq?7h{|c0!L#lijiI;rG05boZ4fM_
z0!VJ!b5PD{Ahi3gyuCvC$MUd|RuZ!f;uPR)Y#y=qNQ=}Fi1Kps;qeEgB&x3mGfa;i
zDtw5!mnK4=HrwekzwT2G8gsMH{cm2vpI!t$WwBTU#zjdHpBgf#@y95l(&4qB?><#~
zid)*C+W~;@%*o8WE=XY<9h_-(D=nYr7*xv>uo>3Py$D^|j&CfA5qz21qNrv$AWmfS
z%H7P`7b)Vs8ULcvbz%r7$Hy_({gg-GBqF3F%2Kp&8a`%j6X#jG%g5AD%!G(<CT5l4
z#`Rs_fE8hh0w$c;vw)NG;f3$pEYnjh;j@h&Z7;)+>2XIMf7|H-*xr)!@@Uo9WiL1L
z+&+D!@c5(Rl0*#MAci9z3NkAJ-5$EkP+l@YXbZWX=1xBr6TaJ)*ifTtVm4frGR>!K
zIygy}HeknRO}z)O01|h)p;4OLOFDf@S|3jTfi3~$e$Og%7xAovZnJMQ(Qpzz9YE?Y
z=7XFtKagj&wVzHkyY)P)%+FHt)$tNt#+l2)pF_!LZ|GUMR;D4%!mYlt7ib1Ie}ZVV
zsNK!t{I0o#*sQbWv&<5G!J@>mk_cL$0uB$cv*UX&-ie=psR0)8RoPU9H=|&!EB<<B
z&z767eu(%Z>X%F`GjIyJ1|Qqn7;?Q~a9<E3Y<&Vp8eVhtDrxI7*wsm~@UtF!oK8@{
z;w#mvdPanTeDsQ84||y^>i&}pNO<YS`)?1SwG)#NV<gccd9{wpV|zoFl15Hu)`9{Z
zfR+&sgG&H`lDIDZ{10xbmKmA%hvzOJaXkfN2Cs=w{4m&ss8c6ke&Dn1DOaICRLcpr
zw2Ga0qYL~Qh~nvLPEu=GmcSmz)@-NxLW<sOO1$B<Q)NH=!nIOqBCE027y0a7@%t(n
zl~D~4Y{6n@wu@g=Ub+ofM^YhoEKSWHVKnB_i9MlklD~!h4lM5i5Mb4zfMWq}z6*0;
zL%14|l`>i{UWLJ5^Dr};SU1^SM16z9IkLa;CLZJP2tN8a20xI9-dtAx^%2M$Kq0nj
ztJ3qrrCXL`r%PNHu^aMt<X++L`u(7kP>+FxUeF#D@FX$-lOY<}0pQPk0<6l(SUnPo
zo0^%Or_0cS9H{5V5jY72_+7wGY{g*VwsZ#IBYcPz0HLO3&xIl-nLcTv7%uYi5|2^O
zPL>0V;q95@xqVRbH0B!F$)06AM=ALtajTvn<ofXiKt*-3PWl8)?2Nlfo>hhe3Ei=c
zL{$N?8=(jTyWJkgR7o+6ssuN`2Or-jHq#0QR5E)UJ6?YvXe{uK-ow?I_>S^%C}GXo
zd*f3%&n4SqOMnYCwfU5p43@u{9~?ZLQ}3u5jG%?&f~?k4$CDi5VwX-P|D}Ab@R6MZ
z@#J^H;FTr^$jDw5?O$Lns|FUqNr)EG#%4L<?Q;^_(m`2|lA*FnHV_=8^_oN?lpe5e
znpq3!$5?ux0%*-O{NQYQs|GkWgrz0`SseGe_JVN|nCk&UNe%eSIKR2X(rS!c_;JiD
z8NOtV;ARyNpK;B2xWV9f3X1rxtU1Qx*+fz3Xf|AN$@SQ#^UE7Is?7D70K?e<t#zI3
z$WWnVB&sx%ACp0RjCw@^ETsln>C;?Wqhtp*bwj!QBrwOE0Rqv-o0me#EEta%3+sGj
z@xj5*vTpM1pjrp0_z|HB5Mx-g$CMZ{fcaOTjuDHc?l5>H^~3!#9&Ygq-)A5|oGZas
z2K$bKDNSP+wk07+&9ndqTAI>V6oZ?UVP?75M=8|~x7(G|a*Y8|`HY(*MaWj}XaEX8
zOl$EheoASa*L?k!$`_%8x<idg3682{NKVNq%h<XhCkZGl0!Z3Fce)BpZi6bhrP9+W
zi<8&xa|`>x0KTNT)_db6XE)aC>dWT9pXXox1}KRG4?A6ZI}~zHnQGdx8+>!4d9{AP
zd00wAWo5lKmp0Kw%0XQKQv?@KQH*&zoa8JwGdCRtQU~G$s=1Lx>5y?0O=I{W;?A9q
z^4ZU!0wFM#d)IVt0d`OWcI3}P>5OqkbK}Q07W>#DErW2#_MQW)i{NkZlTchT1}ZH0
zVG7cr`zU#@9F;g>p`5_@qwdRW!hziSX+$W6GbGA1BgK~$f)Pe1ZmAgM|HAMN`?QnY
zLK}Jla#}9H!XgeIUb*Y~^AmB<Db0pogHc4NK`_X040jobtyyMgm}z(o<Kg%?IJsj(
zYti64NcQPxd<lDR9pclyk28M8V}~+rNuD|LgzN6|5W@e7u~GS?l>vf1)j^MAmmAQ_
z3Wb-Wr|rltXfC%KWr&^`Q2JI(*ePGe1WEQ4tykYp0kx4h;8rvmii<&kpQHB%ffI}V
zoqC`ryHO#k$-!fBEqRnF4}=S8*{1uQ-_6hG#F3^Vn498ovY$&VngSok<_4&JY^&T?
zT80P`59Z>wodHis6grdrRvWtb$}mm*fCV}^w{HML8QZmOB;R6lt(Q8oXn99RP-rvE
z7zgOvTLH|EjdnGVp&D2#oI}UtbGXv54y#=3%whV&5h?f(edLm<w`i6p-6JDEp&+ya
z*{9)+*?p<>M^zx+Yjy#G_v_#e(5^GYwWm17a<P2U+_oF-WJ-#0`cZhXYK_^Vwe>zL
z7#FRv{Y}@oNRbCbx~rFvxby=Z14uScfu!cFW(fkQ9BVHJw+eE&Hq)NKQDj@IBh7B&
z>s=XdV<GvQx9qEPj>Bb*eso<}2+wA~)qzxKu<D5u&uviPN$CuUD*EJ&hQxtk4$4^i
zJ)+uFgc7PMl+6JMlVqPrC=bJ4nj&PqgegGmml>xY11jRcrX=s@&vU$$(Y>uK6lHUV
zf0nl_*Lj*62z8<mHt$w9<P0oRf`h+QfFu`11*|0kQyp!e?A!lf-H@f7l<{^oyEb*o
za@8Vo@0yDQax9}REaDPn#O_7lGn;FHt}QM&6>}ggEm7$6OZH@{qn@Rdv1bp0uR5;7
z)Xpn;7GI3*nV^AJ5k~kfnH-cpJ#_&nCWs2AK0@Y@2<Fi5R31E!Hgig*5o5^g;NpE_
zyFRo)E+!Wn_pNTd1OA#mlD_s?fu+@hCU~+MfKeJ4Ilsr^P$97u;CF~9bqL4e!_5Nj
z4&TQ&SAj6DY`{5(|A#VmNGWI<GXd&O4vKLi3~H&4l}~N%Tmcw6p8-0@qdx&f!~DS0
zUXhQDHh2<^C!54A4oe%sN!%%n@7?U9iFkyxV85gBQF#xm{&s#1Asz|_@LmAryLT0$
zuZU3dAg<)tTWg$lk{LsvXax67^+@pRy)4jk$sJCyyKU)@dK=32IA%SAM-)Of8Boi9
z!NCDgs~l!euIjn$tap@@wH6l;V(<tCQ#+$XehxHWlA52qgPdLBdm!>()AAO8CDelK
zlA(Y@_zyFd=L*P#om6|BBbXYA(voujk=xHm$h?+ifbV4mY=Gml1eNei_I{1m9~9Qm
z4#A=9PozB0B;_}ce+Jiub!~mrbAZgNHi*rdH3!*&WCHaZ@VC9yj*kBFrnu9$q$-yQ
zmFsMEtxg6ojwZ_?yfj(0AG7)E8nlozFl<xLA&k`F9k!x$X1cOMZ`hBfZN>uAX<w!a
zhuudi-vgV&x6&oZIRu9RGcJvwE4s3V&kZvDaTj|aN&1ZqGX+z0z-!f;Oysn6gXmO0
z#L?!}Q@;*VKpJhY*{>uar0J7BU(*VM)9+P0YM1o!$vGBRQ=pQzxUf;ke^fMI28JS<
zUEuNHPn=wAPgDdIEvsO)^K`W3bgzdBO<zm$arK4`gwu+Tjrf30;V&5bYWH!l*8~!L
zEmBwHE&E_&xfvSo`yR*eY`>6za)#z>K<{RuAcc(5*j2laI#z$!H0kr!<4`H<<m+#m
z=|9DovB`E+#q!llOv&dQM~2@)1hpOvsS^mTZmF1S{=S(PG+g~{UQW+}W}Df*CkYXl
zS7^VgzV0LIuG+<ttuA#Lq9)>pKEOw>y;cCgrbuIaPn(sUZZAvs;0H_ta}fWj86C_9
zcH5sl{LUQ>0D_Z)@-aQhtPtQ}mrCX`$L~*}WC}Geog2(%R<7%fuzr{y+p<e5RuanN
z(lhfSRTrnX-4|{#Y)FTAPLY@mge0;dw+B4m)XrRf%UPZFndW1i+=ay$`d#M_k91*r
z=afbd&r(LHFI?B9gT{Q<1A@Yp*bj(b21lXi#YpLL8RM7EsN{+e-zCL_-#E)F*aA`9
zYPgo266Yte<n~qs8u$>HzRD}&5FNuWCi(%yOqQ~1MxPFP6_9=zEHnzgp8H{DXEB;M
z?Yvo;M^#$MPcVpCkPc|4Rjr%?^qk(D`XtuOqnfeZ8xtS9L-n0z`z2*0{bgr*--$ro
z9?s*y{ZGqxC=CE8D2Wn3&7py<M@CtBM*X<TTknAH=>6H2b)uD}-^p;~+hq;NNX7xk
z)EmR~A&Pz`>k{R*J)=_OCU~%!{bdEFN1`+ei(GOY6luv@u5}<odk(vB0?GpqZ3tX?
zQFR{#881rw6o*`_iS)g~lBMs*D`X89*H(xgCr{R(vYDhHmd_5xeWiWq$D?{q*fe|o
zk}tKAp7K5E*!2t+QSYec#GlGztL4SM()kePWrhHAQ$O=M$VO#0mn0OrUJ6u9k6?%&
zkcas5{pYnsvrL|5HI_Vcx7&h!enzz{5L|pvLR&^YNDrRRatMaai{WU$Whs@P9i;6@
z3TsBE*=aINGF*sQGA$Fub<2XZrPVF(HMC+csEGAVbU-m4b1$<XE;HxQm79J2QWT4z
z`%Tw}%`S7AH|{51{ZlsB0%f>)<NyJJR2Ri_8esoQCZ5W68%;@{cS#(49<#V1IW8hy
zatGP=EathCcEPdkaX}sVh@qE|{Ai-`PjTdq6>a*^*A2=8_tpP!s+Zci(u|9mZl6^5
zxB_<;XcC<+xZoB<OEJyVPhwNR2I4#C6CEqwoOTV#CI6fMBoQzERAin2<vx{dNC9Xu
z#8bx0{k*Lt&8Z`lxzjE3;v>`3a>}kn1&b+gO@=>$lV<)$Z0v!jEXqo#iUC4EMS0Ub
z{D;hw*jxz`70!)dzMRWr==J4a;*ugiZf@0vx<9uDstVi>6pCxAYeE?#N`|;L;;OAF
zpODARK{acpo#p<vhTmLe4tdEk>0|P+$bs3>D*F^{QtoGA*(MuxhyaKmm0$wU6p@4&
zw?y#}na7@xewEu6{~Ak{-HOu&*H*HHouUt0v2uGjX7;CM{Tj-PRJH7&G~1f5mnPoo
zMK_vXIVveo&!SQ^L@t(2R+-I!5dDpzuD};!ng`o^p4R{?5QJ(1iLJS&!vda*j4bCe
zt;VVk<zjI%PxjIE80YEU;Ot^@+G}@iBG39@x5fWdOp8>2JmpRf8G=Dww=LVipw73^
zR-ETMpA9}D;{?U7@qAB*oe>(qx8bS-%~$RhptV3842%nLS`i3-DH012NNrtop>0}h
zK<*@o%5Qu~gwd+Zr|0sk0KQ+gJW0p!b;$$xq5^f*AoxJn0tqKmF+6(aaq2tzrITat
zgXwQTbF+p$922P^>H&WN+&^y+kq88-8&#Xy6q*?6RrR*>*#(yN@${!Z7yZ=#2(HKH
zMQ(_*00pQVyn3kT2d!afIlPr}|8cB?K(m?RsUoznVecAx)?s+lMY;LC{l}q@fCoRS
zVZIvwFUcZFmCNju;Lxy-&$%{cZkK{Sn(#Q}8J$V*r~Hl{17W2DMhE**xugB=`ry3O
z6+1b|&UAucfv-@u>M3|Gyf`P!J%W>a#NJ-Z_#W*%y)alP_9&|kct_EVY<_jhh@!=s
zep85$V51O1D6&rXSYU1+yA$2&;=8(LdAF5a-;gJNbMOkKu~ihM9}(rS9&U?5TYh;M
zLnkd*pb9No0gPK$Bbg}l97e_t|KPbiYF*GSqx3iguXE%Caa;23{6Rp6<n%aNl&wQz
zd6d;V;bjba*j9&s$^7L=62;&Bd3MKh!AC*R;vg5O3B;Y4tWtw%u5NxXGw0w)9?s(+
zP&YPO!+X13R4%qLBieSnblIo9EhfZfMp(PlBRyPd7-w0g#ymn}=!n$H)2h)sm7E{6
zf}QTSxiTIOV1tv;1Hl%*-7N~qOA?|+)KRZ0%=_nPn7z?6*QIde*t^((1(S4McJbQ|
zwZoO6Y%|ow^72*lX#d@0(PFHSKY#q6gng0?_+AkpAC|<hG0z7{_Wg@29gW%AcyiSc
zcDI1op5Z!vdJ{$7y#6yM+THPT8o-1hew+uWrGu&sNfF{l-&?hmP?ceKcTyevSukib
zu=Z)PBif!?zaO*ERBhJ7<9F{&)8>PKlm&+qqdrGH??ABu{+b&o=DkuoD)4+6z8zq4
zb=bJAIJsnECs1!W9?e6Q@0H-@njURc&@gUZngxs9PVV{@k(=+F?<1bP_<i1aQ(GI7
zd<x@apgFeE1=0}nZ)IM79sf~P8eyWE+4q%~-IgkO1v7j(_7Rz6NBRX{^_?UGVQA1C
zD311(=G8$F3nCl=(vma!zZr9ZuI<?iw&XuFQ@r;`GOzZAR+UoixLdmPyCE<LYNr7&
z;}Tc`89@{knLDYzZfJh!RUguB*7(Rhk^x%bYt`UDlI3uT_E<@~G@NSFhh`ui;@1#V
zB(iC;7X{BW0%iPy`6b1=tV>iSOKv4G?3=z_9Y5G^DfYwO)Opr?8{S;7ui`lN&l;fa
zg&_umKrA6|nUUrc!8&r%^GzNnb$bj^PGC+@btn&&L!{kb<yuL=7sm6@*UZkrz>3_v
z{qe**s#+@}K~R@P=cj3h<)1m8Z0;5yz{xkrW5eJ}4Cm-v+;<4CI-~e{c-ZtdB>wOM
zXz7!GzE-guzI@T7Y>+ig`c}r;4GkpW@;P@o=juP+fC{Lp`a5BLD8`4sX6Y3v_~OgH
zIX@P`+<rG8W^XGg8DuVy(Ff24aM%7Rc|N3gcZx%i#rrlySZKk1P<dCc3mS-oX8HRi
zc({ZB8ckHr`ma~Vv)O`3ICnzU?vJ7W=f7u}fe$1yq5St8U_f&vFn|3kOjS_N_TMX<
z0>12Qa1qIWEeiBu6nGokov6{jNc*4ZPLcqZi7<r!dk(O=paYPNzf$ghuYfuMus+Gm
z`}n^W1zHPr&PEB@p8uKn&jU#8V6i8nFaFax{R6H#WQvy$b^hHRfsnQ&0_zgBz4$}K
ze<sTXAqUv6^bRWB{6ic7XE0co$O`;m*1w?y<ll=)I`{Wpfd9*iW5xLYj1^^4C~!N%
zOxhKX^{JMgSqT?9*uE4~=fn4gsiw^tY^|hP^IjVuA`%mUa+XjTnUW|o5RLy!qY5cM
zfU8`>d5uaU`X==Pyk!Oh)N=gSN|`hSNcP`suYc-qmEJC{hPI!SY5c(B78MTfNZ>L$
zeUGIp8R$m(Ss*A&EY9<NZ@a^c{wV1jlKF)Hp$ftkw=me~v>FIWQZMj4>(3y`3r8r1
z4}?i`(W@v={gI|TYGuSd_I94oi4-92VxW2^brFntNI?<uD2VU(8ft=@sR`K9E~uTn
z!>AtwgZ~DOAnAT-KB)Y1(Oz;ZjzNE|BBpkbbuy*O;j364Mc3~6R5a_tQe=iAP5On)
z;{z@9GG0y$EeOZu-$)R!V60Pxy5Y^>Vlno@1;>{hh|EI7aIh6JvzJDMgr{~pWD@{g
z#)yJK;iC77(M5UKs(4U=0BW4uPYZ&L<`8(n2^9jee&O`@rJFzgU=l#GyZ`pQ!VdVW
z`6tlG&&zi}0sJwb)1R_xa5J2E!QO)O2iJF%g4@B(Q{X@0tM|@<k~#%y;q)NhGraMq
z<m~~-K{J=BfwX@QnratJwZ4xy2lPJ^=;7nEH>%(R<1$3RGB2Al&G<+%sOMUNx#E^D
zUj&6Wb)ejd_zqxaoAK~#X0`hIUhgTRi&p7+??KrSC>LW-+kt=HCPr(#z20K|h9TD!
zO!)5th$4tc6H5jtK>7~UdW(PIn2Ti^nRqYFp6idlRDf^`^%E@lIfWNgzLEf4tow8d
zRKVLLvH{wM7o)rC<qa~3P$Phy)?*%5L60dPe4a<+3D`$M@eGu}+<+}X*+LZkcHZt5
zM4N)mkEP}frdD-W`XS~dOPhQs&b8*##(w0~0C0X~C;-KoE+{D+E75ZSMSozf#H-5o
zq22=mzCc!a2I}GcJDy8~LgZ6vIWHiBK}PIE+B;H^!N~#qo;eNx`>n~M&9LD=)A4F*
zW2Q6Zp89+7P_!R#J(dkH9K$2lyDz$ZGfwk!AuOc;RlT=WIT4mqurXMy1DhJ>B>kVw
zJkJ4=oTJwZ6oQ)<Ac<OZJ;w<oDmXZ&Fdh;l#1Cy$F2kd8Zf}~?UdKkF46yWW!z;kI
z2rxi+fzJTA?q+pBA>rklgJ6vq@PUG7p&})7Zx{k6Z;u<30>7r(>v8O-pCS5bniQkG
zZ-BW4BLP<QUT-fj3!VfJo|RC!2-H>34O$S)wyHk}U`K=5Mm*DAP%%5(fsgV-Cyp}@
zpudozSBJviv_@?z7@QMyRg;Xv-a%a!8YtSb%(DYbr~o~NF7%>)DiMkWUVs!ZZwd_1
z%Mn$68f|n6d^yC@TiH^|b7OJN585*63s#%QJ--ZU{VZU$5D9q$NS+bk8765E2PCZ+
zz>GG_n*#asUt5wO{^UVu#RxvOO2a%c1Y4^c!B+<|+9v^6pi`IRJo^r6%UXj9d6LiR
z0+73ggG`sNL`@WQ5M==A0_<vOA$;_~bNXorF`z<1#fREg{s%F3XK?2vE!apfQpw$K
zcOfzU=m;bSWOn(vkcDCd;QMUFnF5N&0tnc>TOe^D25&FAmve%*ya17+TD1qH6qigh
zZPhHs4#+3SIzWifEJR!olwrlL;vw>!lD%`utr)cM{s{>L%fWQh<l0gYaREkuHJU3x
zg{iB2Oh1_jl?8-BUWGRd*uh1BTTv;d3iPV}30j)l02DEjh;@iuWeAV7LiPN1thji&
zWg*WEvK4QrbyE&LX$L=z4X95H0HyZ5#Q;$sG73Wb28Dq7r2ve^fV|&;`pnuBcbUUy
z%^LlEHqTkqT)f}-vr{a99MBF7DMg4f=U?kHJq)N=C8Qqj25JSVf8?ech)BtR9pS5g
zA`0!mWg}&OSgM(?^4)yp9vd@Vz8<w2&;-DR!XP-&zbx5t+fDhew%bjRD!Zm;K$|*0
z0Dyin6#0NvFM(WG=tBKH$mM`sb!r_lfaIV&*m~T<1loGE&+#Dfp4%DQ9gs63yg)<C
zi`NFU{p0|GXC%$yAnE@nT(Q&wuhT6xffASj!g%1ShahfpgSXm!P<8bGGy@`_bN3+N
zKH1)hA5d6@Vt|^-kc0CC7E9{5)A#SX_Wy}6|0#q1XV`ZZZS2!^Aa4o;`Q?^yTDm`R
zZUbPwpNlw@rau8IoVsfWHkP<SdRH|&06E5gLS2*xus1^^tEL*k2ql2O5_cXHZ#J95
zz~w>xBYVT87Ge@E0@7T-F8&Y!SoeRZ<%MdifMYpqq7WgefOM%B$?HRsrl125It6I|
z9q8h^_pyEA0np@WduRbcpa(5D@gB?|u$p&()m$SVk^*ld*tO3qFC59}RsmiUp$tG=
zZ8$-=tQCv^-HjZLHVj|`!qNhj+k}L(?_ZUG><s~YgctKr0@4q4L$z(}puB<;ga%2<
zO79>;Vh0f!3X=arE#}^8FrSmo4p1+IEWjm@9y)vw3MfD``u@17O9=#xZHgD_O#Ua8
zSpxC7w-gX!M5Ys<5cbUq)NTy~SP^d4g03kUAS_?yeNiYcW&?96elZgH_p<+x0&M35
zmpHh&01wgau-OE%d2ZlREBAK7X$5G&UY_MTWe7GR6gdEnTv?L^@*J>O+;+oji2VO5
zHb(-Q)6dS@0rY_hGx0uc&j^-O1dv7qmnZ|8x`Rh|gexG*A^@+bdbJ8_N`ZX@(SPkD
za0VN0rf<p`Q8AY}0|EXdX$*N6!N)*-;O+`Qj{a%>8$(Lo5Kszrf^WV8!}QBnAV&QH
zBxv2M0k#J8p_TvcLsP=b?`s?ew(G1I-=|HXl#q)7!`=a1m%8v@a`E@xR!HdyKuYm0
z?z8}%s{oyct#&jJI$&!lJ2<AoTXLtSJpUVzZ;axr2xA2WKn%-QfG`H0b&6EXy-<hS
zpaar)@N8*>h|Qb^{@2WD1`;lcIUVqCwp=77a~b4zH}Lx5LmX?h!wn_g;9ggYad1Y2
zmys~1M4PcGo}o96AF4K|eLST3uj1R^TSP$3?^qJLkePVa<k!<Fd#+bM*CTVc-9TK^
zRkJeyy1X2K60gVWMu9i}1|ZBPod|Az2x}sYHJOdCM|w#a9iC)d3!Zw7bfhcRJi`fg
zh&`W$BH(^txYa8x23`=Mo&#cV_E8qx`ay((PTnZMV(=~mJnrPm?+Y*j>2dd0taH;F
z_??WGkIGM}1of<dj2RUGWOx4sr~{%5NJWU*ap%#6uy4jN;S1hB$t;Yew%<pANa`JC
zNY?;Vs(no*NhuP6epxw6=QaZC4d_l6kGu=_y|^1IuopXJg7F=)I~j{TB47p5H&bV+
zu{blVLH+*3>b1sg`zEw^n>=G*;rkQ@PDpw69LonwkFDxcV9U=smD~o-6)0wsj!pi8
z4y<Xvt_fK`?AKtlF0;BRqcmZoN*lDn738`Ooo5o$pJ!P0R)Wr0SgRUHtC$7i^MYzW
zK<6wxS7q@^M-SJrhhsrVeWnb!V{pouzq4O*zz+IgKos~8C?c6?SqL<|^)U%-0!Ndk
z9KGts&S?+|O7R6#Dq6mMzng+qW&<5vO8q7Zy`d9y2j8Oz#_JY`ok1fZq$8YS0%LTc
zYbTi2PE`D;1xPmL2N8CwEcUCWO1v5t`9_`9IV?7lhpCwe-<SecXpXi4L{nMk1Lm$*
zdpMj2oMxn$fHG_@08=W_AV2U#Aefq#4%qAQYOL_?L{91+e04fdTu*7*OKCa6TO1X(
zU1Op1Sp&zY{1ia*ZT{>GrfGIJ`utw>pVJ56Hc>C}^9mULz11T^lT+hh)2#HWN&V4&
zQxL{W;09rRAWx>s`?qjLP0h{|rh^|E8S$-{gDwoU?7?T*@tW~FM{KeiY%Rs$1Xs^+
zgv*o@VaUk=cK{v1nv&V6!CV@=T-lYe`#ght1)OWm;1$+$Y+NI2a_;rzUQz^NnFpcs
zUt(Fr!@P$*+7EJSbO>6uw?O*ucm?hJ=GZ-d^i$S<h-z;yt97O`Av{ld*|w$F(W6k_
z>HYUbnV(cLcd4}fcf*3b*E@_F%Rls|tqj;BE0k?%c$YWZDqm?Ge9>CkeBFR8#MdF4
z_M(n<qp%e8TjnUzalvJXP2b%_eZW<wjig=lzkq%HiZg;XG@>Dsa^={hlzj<iztvi3
zL0Etkizo0WZD5u+VbQ|kU<q^h$VHu~Z-xaQb+j%^bKbZ2No{Ivf!pCHi(Rr5!K<ya
z0h6;A!J$G6EOH;a_Ee0T37^F@I!5TY60j*2v}t{{Cq8Nx+;iAMm9hKf>s9N4E~UU}
zCCwUq!vMct&O(!$mQRP?3md6lbPeNJ4RRp;^p3Nt!0@E%f|x|rYzGA3+o0X6EOe2R
zjS3w4G@=&p8|i$ZE^z7BZf)s)?bN31(Pk^RZ{GvMld;E#1BGUCOMd~r*_SScTE$=@
zlvUQ&xDLN$%oQec)nF0==Mx;y!G_9DwL=@r@Q&O;%cX>>sf?q|punC3D6>v~wd5P8
zbHBP_*+Hw1%Ms5Au@l;g@L%lg5@l{z+xGHr^`ZucP_bgB%(|<eoOYi&aWfw6wrb_r
z<mtV%L&Tasa2b%@P&!n0y4a<k7_{khs4=@w$d|?Cy-<=~&}mWf|DyEh)2kxUjj+RB
zrmHe@Ph{ThmYv@(>(&%=yD9W(2H7}+yj+L}XW{}bvLF;WXZ8x1=oDf-9tW<|2Du17
z=3{%gp!7JFOX??SN_0M6NH(l7L!N<{-EEG;17rX%fpUp%*WLhz-o(=Jgv!GHl8vli
zDeY9rB_Ed_vJL3a-4rx=%s2SnY$P~{_nSjPpa*xJP_qLR=cu|(Y#?KUC*z*qw^K`D
z;HO;_JZX5MA~yCuMD2elndtMKTs`PnrN(YMU=6JXZs(3oXTSwb?ucdITBxhvG7GdP
z)^h*E#<bTqr6Ay|^D8_^{k8wJ*EO=E#vjB2+>KMbjZ^P(OCCt{=_sGRbLXg{d>+$t
z(s2=3CEzU~8DP)s1l9oEoC=GrU$b89lOMX!UfuFwZ5{Zam8Y6jjvwdKH=7O;j$GGI
z%C4QXA@=Vc_SabVI$FU>o1tDwTGYDdw+Wj|3vnz`*Ug{p%T#3<>Da~+>VmxF-v_N=
z2sNxoMuDH+;2@sCL3q%*z>8yXj*#jg*OT3GlQlY+);hFu>JB&Pq9>4c&9A;XG+hQ8
zbSV<Ufca-6<bZZ!1iN6~`SMcV!NM1(rGA`9%h~v}Sq2VsJ?b2efb&GzNH3bSB>1-{
zq}FMn$%-A%jpb8td0f~n!r+Uh{2FKddV0Dg_)SV5JYyRZFdEKjcUzM^nmwX_GZ&Yz
z9vvo?)Ap$u$HtZvxOO^dZF(l7HW;3ju}+ua;<%-g@xc0vaV`(xNzmGP_4o5sXzm+X
zR-fAGP1t^e!1iVKs_(TSXdBUg1tiU>GHuq=tqm(4rC)nGNMf`I16us$Zy(7-11&#n
zt@gB|afXzC$xSxs*A=X>KeTWWQwSd-X~IwFEjzXnTq?d9bVDzsT+mK&(#j|f9CF@a
ztbb{eFasT|#dLIUC9Av9O|`kFEa0g~yTOsRgtIzZ?VY=cQG1DBW2i@5@g1*txde^|
zh^-dXKZVOgo|fJHm@?sb;jwkyd<dI~m$hB%*yJ{vRdUnn<=X>to2%gFiA~8=QP&l4
zx=gFM<0USsAIn=oWS{~>WeX(8^OXI?AQuCDbD6f^4}J<9wlsNvj(>h^5Q`L|X`fwF
z>LFYS`sI`=iQ{+jQ7f&>a?e-Fn6(N~%q^aNYd01{!LuCL7nfe?Gr!;qx$4OX{WiIG
zHAl<99cmE0YM!Bt^1i)T%bV1=)ny~IS@eM=FL;0!d*Bzex_Z%qFlaGRH6JIl%DD5P
z)w))N(574%!lfs9*m|_fl#*{)8nA31?$x-Nmf~^mg1!~IdOvkSgh<Dy)-d{@nbLz-
zQ>gR)mM-9??WWy#QTTVziS=%l>JcA}A%+Eem*H{VCN4Vgdv3}fHmeVZu%mp}BwK>>
ziuO|v&F2XpouM6T>L4o+S|`_)hPEOmj-4Ly|D2rg<Y<lQP(Eftt^BQxdC0c-aWoQi
z1iYeK0yuTMdb3?k-=r;Naw~VL4g?xv0?$nDY#0UMjKpL+$nK%zx;s}a@M=M}!g!AG
zPb+R#5(=IV=6%_@eOBBukp4Zh{yjak^%01@4S}N#jonIq?{<<`MU*;NyDHNJymD}3
zHGX1fMT!^1Evs%Tt&nb6_)pFTm>U%|^mGoS1#N(X`s<xd2c0P%u7^t*go>1)s2!e2
z{9Iyu%wd}^gWl+Ku72JN(W1o(y0umX$0uMlb`2Rx57Kw)XO}g0W{(as1Om5|g4|aZ
zJ<Xz>Zy?VE?N<lwTXz@+hV{43kJU{qJEnZIA@vn{ybO(*{HP+=2_l=)g*&rw4lKSH
zTf37HkVbZM<4X;p!PUym4C?ur00(J3Xm%Ck&%H4F1LhPSlA}2$Lblt$-l-jGCG+Hr
z?aI81*ewn0z}~hG>gbIC{>_xu(Y)-0ug2py5t-uC%I{+H1vHQ=zd4TLz{%+FTw;fm
zjt<Yxq^SY-=@tQj;F;?K63j_AUL_1KtbYb^M~aT}K4YnO3-61B#;vz4X)Eq|-L;oU
zeF~I3d)vue10+Y(m)ksWnbyd?CU8-JZvAeHz_r80AVO&i5{n<|zE@6X7E}Caku3+D
zYp)xpxFfy(Tx<ODRsZGTz-8+YxdW=U8Syc(JLALrWE<84nh9(y$WM)u<3Gt$`h(^Z
z0&Vr0d~YUrJQI?e#vCNc>RC|^7uD!WZ|tuDX=z&3waRnT%{!2};c#)>{YePzUK+-&
z?|~}vkgp>{C5%2v75Bhm`$38sI@i5uef1U3ULmw8))NXenEnJBU(c8OJnyO=oQZKl
z?N8E(_Uj&w$nM*uh*Fb}cyM{tffBfwfa%!N^;r&yu?tLY(+|W2rU!}bEcQB#Xhv8b
zV0~?Pv3HDQ7uY;U>#1&7eH*V7(DS)KzGSLrjohzzEkBg@j?t&RIE>RF5SPheZNl~@
z6&+*dBC~y8cKhjaNaz0HaucCCJcOz)a;Z0-k_x-Qk1fYlLh(n6?WLJ@V8sH(w!TFj
zv_+-zN18O~z|z4nmDD;fq^yVmd-g5peg;=z1zrL0tEyEG&S(s!`+)4=`&Yk}SyEX?
za3OKU(Q8|AQX(N%&bxX;rF(HeywmGAJ-C&uW=;nb3vY%u?S?BH7Ae2;Y>UZcZ$j1u
zDhcBkgk?8{&({z}MCq<?5<t$N0ISHS$2A#k6b;Ziq^X(>Lb-uGNLq`ETdNX?Pz~^*
zOVR$o|F8e%D3v1j(HRd-6{RU3Qu5e|Z;*KFx^W*Ig72P-TNrq<(TZDkD|}$(t2AW(
z0o(!KwD(%t{+ZSKkb_dLtd%c9E)sI;rwzpPLWE4;1X?jXFXGaXM!)>tF2R)~VIbWn
zySM6dYjSosa&<Sdo4RO$GCZKk0aN+T*eEJvFDYZB=wqy>)wH_Dm0!6M*2vMQ!?`HO
zqCmxYe~^XXdk%R7)XR2=E!K!rf{IVuxTqTU4>U#uKQ#4wz!X&tE~QV&XP<c%8S$oF
zKhD<&$~~&-e#Xd1PAen&JgN)EWq%Gp20Y{9tJy=*QlD-X1*ejB_qyXl^m-ZifhVgv
z<{uVEx~b9X@5gQg@Uz*;Ov}qyZPsPDZA#WIk8df!KKgpQyPqVlM+HwKZYF^8%s(wR
z%SM>}S>=ts)P&guHFXO%<#J;?bd`hT##Fr1=gsAvMT;YR+M@9Em8r9vrJAAb;kM=_
z3rZ^~4EK`Je464W@;~Rbg7D{tAB_Eiv*C134N~cuo8J*GpAHM03%i_>$fGdG|H@5Q
zkrGNDrM&|C!+p$EGY+P?#3sd-J09oY^7-phg<*NMDQb=1Fz<o`q9Wd@FzH7{<g}4J
z>i4~&v7?2E*&d9lZ*wBkOkSZ1gRV803^9j7Q%qX9y+eWn61FN_Rb(q+6!1lLXkj-;
z3!jsQ+_b)nRnO+}x>Hc?h7EP!9OwMb8+AX|xdk`5)9CeUsv76Z`K<lk2hzxUdJHYP
zvSLkMq>_b7dcg@9vyF5<x=f-M$KVchFSpD<+A1c+IFiPbZIkN^`UAaW3s8l{8id4L
zkWfm(3j|_(lDf3#7R37Z>k-|=5?IHh&Y(R3Ou^VJNvx90Mode%`PQ2>Y7=@~#$kVu
zucO(EBC!}|mO~9a<ONMYTiNZ_!|%T*lN$RgzdwW}7X_3pw#CP&6WVKLbTn5VUX_#?
zxhyWwSY10<d?Uf?g$g*FhJASSN#NkqIb_TY4K`(Qx`|jefke8yeK#1`028&XhtuR2
zZW#~V*)uv?uvjJ}T1&t7kF${8a0tu|8hDc7Ag1LVgHtH*z$l0BkI%}aDMsqArW~pX
zu77BiUb*8^2aC-mHWS|9uy1vjPNuog{ps@{4{O~}zjBaOvNjDzPmC;b!$vbeIul4v
z@nXy@0m?%$X@PSYNIJr+;CsX0M6R;WJlM(P>!{<G>N1rYY{^PlNOPsKLKn-%D?D^7
zg>%m6M8XyqJUcc)eQ&_S3Ha~-uJ3J}X7HaQGa#%#BvTgh{iW8lrB=Nn6kEkz-mjW_
zVle8_YLX?plH6;jeYBILmuh1YMhs&X6+&$?`wRTW!KU@OCw$*cWjFfL=EZTr%(J4Q
zA~OSV3jrIS5{WGel{&hfC8{&D#V6Nmdx&qFSc%#*0uu00X=K}sb`VA~2qWF@$qO-|
zx2lQn`h3JZL*duKwLkr{=!)XfP`&gF+BNmJD$K70*@qwGs3^`_eZw5^kyYP#Y1}p!
zTcn{Yeecdwox!8S(j#2cWc)XrGSiU7Vc`iCShmo?JFM`EhMu8N8b`le!7U`LE5yz$
zs_Ag}2!C!e>`=zWbUa|y4_mCz3Vs6-_I@m~C?jr-6=h9mTSa~<pw|dr8j_E{@#Mn;
zv$TVn7#&T$5xfziZ^-R<r_MSks{I<_O(H_kkrZ8L_Swp23iwQclec)cV3tbEvB~e+
zAG5~FqL{!|=WL}Bg^!{AU34<0MQKsM(&xH;vT47e`Quo;num`D;ccUDX}!12WrW9o
zN<Vc<Zay6yY6B{9q|k0fJe~7>gq+Qp$K&5USGQHhuKHO?XynR!i1%@g_`J0ZALVN{
zjl5}l`Bm(G-qA+tKpu}lXU}#C?i<?-MmUyrF$pO7opxb8+46I){i8QDmJg0^_(8Ye
zvd`9TV5?s;+Fqg5#h#MLPC*mMMF;uyMhmjDTzj&bFm&y&X3OW>DiWC2q3e&3OrhU%
znHCny;#q7I&kwG~S@I#TTw|spB4VgexqVAt#H%LAo>)=l7UJ*oZT~9Gf(oI@lY8@Z
z=R^X2rP?y$L>b6^oE!I2nre1af;OEdooyAb3@GVTPBhIceDH5~!f%UXza~f)R!-F6
zHyx0~!+BaF6T;bQ7k*DGNZQ}B>QE-Mxd-E6g{0;}53+5;q9aV(^&`fsZP^t)e0$KI
zzlp^qGxVc!zww);3Y~k)C?R5&;Q8AAl~utvC=pJT<QE|L>|xeDEOtC>!i!&XanQHp
zulwjsda?(1lyHxEUcy6Z$(7uz59-G8@h{a!I)YYRzX&1OhMyEg-0KiFWyH1UZeoR}
zSi?(JMt|SDD>pRAeX<%u7{@GEN;{f3_TOvS=aWH-Jsf4=wqMO5>__w}lel91CO5~o
zn`cDc<ubh6kc^#p`Wjff*S+DRpqQ@<DuZgvCd;gdOj?xaDfW@OzljCBKZ-Zy#+(l$
z5+i2!S?`{U42beiNnO@2RHC^aFjSE~_<_7NOCWTFgA4n}C<@=X+UQZ5o`J@KU*VDU
z3a{tBp+VXH`h%^7`R&ceN&G&Gi>cEy_QUtedAQlGEMyCA(_PmoUP9x{3uZ+I^JN~-
ztRAE_=}CdY)5M+E#o>!6`2?c8#z4&mi)8R4NuMflJS_WCabxH=-J05@H67VJ#>yo9
zm5FQDF?xhi^=}1H0}mG$`8p1uv~3cugqzLDXBb15Iod1rixe!0#0RsJ)f+}d=&B;n
zA@Ydw5c7dkU@s6<O}4|>fmU7vq)oQIw(T?@_N))TXY(+Rpr5E9O*D{`GSH=98|VD`
z1rOx&E)L|~%rUpv6t$>G;_PdU2{owrr8Y*HSMP2_E7TpynXiUZ4yah{Damg9{`)%F
z`iiNyh{wCr#19%4FeZvw@!`*hXp_6;^61`IEVg}mOUKgZIn7Zg!NGk(Pmz8w3S3M!
zVH3XMSZ6aN(!L_AG1mFpME#1}!Qtvr;I9Vq0M4;dV}eiry*yARY%&?2*YvWh*FeL2
z9wyNHTq|eO$BpL(PlM#Ib0}Y|ZXp?`zAzSGBbFR;pXq3wDQKlHAUA6Hm+;pbTrhjn
z(M+W7%)KA?2h@%?32RfsE`Sae)DQurXotNRNw4y?I{gAKRvRXDQ{#k60xdo_Vfbtf
zrt=Q2Ek6G2n1EMB{^3$@!Z&rNwz;PZ>mgiHcfpTTF)8)47UT9gx1O639-1CyciyYS
z*<KHGTu;IKf28&QXsxQ=A5&}fz8~+No|@9)`OBDbX6pr<%H&a%ZzX?El7~t%jefUs
z^nxa5jZLn%0qwX$vSa*a@(^$S&^%Pmu{JgKRI-)07dsKwk!|ahRU}%08R`m?ax&??
z$zp)}e$jg<Z~9At1b+z+v-q?ZZ?>k;a*?l9fo3lK1F^SDtoqWmQo~`p5iZ~&p|c(j
zpk@ZtNlJ;U0O@sp^W~GV5i)Hekqa(!6nOmoxX_!M<rdP13l@aJ7I3YamuKLoa&0U0
zTCtN^vQypWR08dAu7a4`1SnFs_RPd{IU=_<uQdL=@}gDtaNQ4E8hC3+qg^g-EHZ?a
zZv9)OW(&H7Z}VlKwAH)GCYcR(WVJO{qI_g74}FbRuC-NI%F@?<cah@d4g02T`xb^3
z=ddfDuT(Pig6>{hP5O2!DOOLmFSqDK+`Kl<p-}iiQLXt`sLnCv0czJZ$ehDHCyi`1
zAy@YPk_C(<?o<-_E9YXV<e`@KPbEGkAUj|6Cke>27y86aFq6K(H<ah`O1AVgGfv@d
z%4m#&o!9amz0!wr6|GXguZtC3+=Pm^d>b~7iexjG&;^$+;Zah%=D3BR(wv8E%BPly
z6rgw$6pFt5`qQ}t;bwWaBh86o#t|X0@M=ask>Z_bKSM9U5d%dUPL0G~k+)gnuW`82
zAcFO%JfW@Aw_lUZ!C|yY+CEii>UVUDMW<^D4SVrWzQgj%Fqy3^v50$Z)oXh<8t2<c
zUe;j=E3-7M>L21MSo^PllwQoildN4N;vf#kUH<=+cb#EPWm{V#Dxi*KMi3N4DFcKi
zy$IqcO+Y|E3<^lE5k-oCL86W|NKrtF0R$47AT3D8aS#GhBy<LBfDqb99r|4xr)2Ik
z_j|tI_xi)<JSXSuv-jF-?Y7?cMH5!eb}8HaYmYVd(kh~K=U<JwM7(A1c|ao9a?#oZ
z1vm50KEK`jywll-5HY#LW^1!DU{le>M+r(%=TOR9_F|Z2v56q1Eni=p>7B6sgZ^}0
zK{)GBS2T^w`^}x4Kkd(uJM&CO`<6pU_+*z$cQI8$@Th*Bay4k_;iH(j_?oAbYLgK`
z+gPqkjduAw4F&kyae1R?2i~1kf6C^Q=#eIugi)_Y=X1m!9d_dF6gwAMObzbequv2d
z$bui2<33}r8R1Ei8%qr^^V_3B)6d^xT%nM&A7?L^te;UDw=_Pnav_}0X!2C7iHPTw
zS9^KyH*ImJ*!KTcoUmB@I=aE&$KP%m{dV(4#=8P@H~-|Dhhkob{hH|7f8s3h>xs0+
zR7?ofk=Q8_TRH-}m1GUPnEUykh1KZ&KUUH!pOIl2Z=<Smrnxe8+e399xA|Ue-b`pN
zu6C&p@+fiE-Ues8q2jx0`(tWdZY(V9cJyzPOwL#KY;R5*_juUrZD+#YbwET=AeQTO
zgNnaOQ&a;_QbE%f_(NZ-9!g|<lcRs7L5*Hm`1#C{*`KVKEd~0g2;TbR58q3STy5p9
zF)YuleAuN&aozb|LNV^$%V|RKCSyzyshzH>#^Dz9esfS_ZrzuLMnjypvJPcNKBTcO
zVY7bcAS{#y=spNi@)}amq)*vqkZ3yi{B>~3@Q>=!lzHFEP(kTN|D8R){f=U$-{djs
zXk(U{9ZoI*)=~A*%+ZQ)A6+KIZO!V7-}0mv^KQh3g-i3Oyj64)u%%P4)=@pUir);~
zD0>6P5k-Bt!5jS3i497hX9P`8$Yvyu)q006v?j?W)+$zUY?0N<xbJutH}8M_Ue^9o
z{D!vLJ!#RC-8CAsxu0ph4Qcn0|MihXI=O-h+HzgxPhlzo-c>HN<w)h}y7+qpe(}Ug
zA2G6VOlt5lX`hTO_-qk0V9`i!Ady9Hlv&L+$UTygHoZ2vuN>ukqTS@l40$)Kt+ua`
zIJZrMZZunRTe@yBuKQ8=fth62`j1SNffDf^43$}&UXLWW8rV}OYo`mD3zBUb8J>3u
zGH>p1bN-Q?wuEIS|4vx4kM00ZY$@2J*KS{{^pe0ABWq%n@F8qbM3;VlA$kAu(B)`n
zH!W+45glJ;bX0If*CfteHbD60E1HQ!&xQE6oZGLy-=GAmo&!xq{cXm!J@ro7dS*7f
ze&%JaN^xk1T>jv=XnoW49oU&%a&Ds!Mc;HA2v1NjOmfM%#=sPEyx8^BQJ!9*V_hJp
zMGb=eiFJ480h9I}bgt?)!9kAiXL-aXQL^jB)p@V*G1q*_3+i|3rp{~|?e<-Eb-T<Y
zJBv1*ASVYx#_WmCDF;h^)2fJf32Xu}0kpB0MqVk7Uzs|Tj}C@m_Q#4O%3|Nq7gQB`
zC{G-jhV74H7C|&CaC^oP&?0~=u0*k4VF+-$H=^T-8H;N^;lomG$-mh4yELy}yI~Mm
ztMs##O-km#;KoDi^(zW<qzZ6k#<XmFf;m-fU-yG7oe1ucEcZ)8;oJ;^8oF6z77*C4
z+FPqLljXbX+${Zs-z_<-GIDi(%57Czb#T+F_>!|9HsB1U<SQ+-vWw14gN{$(p;sNg
zJmoGQ8lCpQ#sxLdTC8lUOlximw(UGHGdZF*=h3rUnD|3`m+lo<Vq~ir1$HwOiCX?b
z#%VckBf@21iA8Aa6;IM6f5;wl&(LXyeTQpwA(1ATQkqHNIJisKLroShV%u3kv>lHQ
zSM+~f^o*lbUQ)DUnYuci!27BxyQ$l(F>|#%Ld<iZplGSjw`(>~9B+c~ViJac+bG~~
z;|Dp0-D{pxT1(U73$1SSzE>vU6xc^`6&YBb!o+j8cqo<N=R(}9$jhV7+(to@51Y)T
z9ox7!su!&I$IwdzXCxiwify!+4$s2Mgg&2L@Zydv+gCoFD%~qVDrqtyoW5*@xv`*_
zQ92&ILxP6ulutgVF)4V&YYNljb33w3)LBT=_h{GW+D3(eM<m!wOCRpBQ|0G4<ZH;#
z6?nLFAI>z{H98?cU6v1fQl*;~-8gaddgiFWBU)+}sLN_Ssye`+#y)6OJFB_+e*4P#
z@VCceXkM9TdzrUPzx~;v#E=riciBO_oA%^--D1ULWX><GsUzWvA_Z=o0-1BQCBnG<
zqWA{hBc+tY)Fskv-MD;LdfHfReq6fkK)CZEnqd@vF1EXAqmb#y&KUp)RRu!GkzrB#
zHRQY?qyJG(<C9!tyI}2f2uWOJCS2?QPWl7hGqA_$Wvzrn>_#5Bx`WbVhVE7#nWo)g
z3cKs7jq-a8e0X*D<Mo#7w2Z!W8C^6CnAyAFaWvjl*#FWS(GVIpgR)Qwr{bLwcCpcQ
zB)3|nC0wZBhok=e!bcc#yX=I1x$Co{PHd^U`le?3rB}0hp5!G=QqQ}U+0WTV%d*=<
zw%m4lLfVI8S6UTHtmPFMQ02AS64aJyZSKHpWZk}aZfl|Y{LIv@iw@jzq%(T8&nJ-K
zx*5I>I>0qeiRuu!^Cmw`^2*#i0h?p=II;?RKY!3qd1RtLrhdY6#V<kdrR~Z-@naFu
z<)e<(HH2Sk#xCEdld%FZ9vx|WU|i>1fA=;*M5v)7!bkgXE^^Y!ZF~}gv(NkYjzOb_
zd8rz*KuB+OKA}ayAc@DWg^&R2<8rbeAXV=^(RSkEl=Z}nwey_eVm?iqQXc=btZ(o0
zk9(a)J|5=AX(}`sb45t%a4F>WH&G4nOt1d=H{KMAi4i&sEao8PIePaz{mH71ITfGf
z7jbbw%CA^Ip@?tB8WUkMzZ)H_KLKRXc0g32{v>|O(Z6@bfUoY(k4vj%>zvi#IMMi>
z|KM@+i#;)092|^)cnI3CsaGHg^@5{U77k=WM@wHDkSj?C>C2O=a;f4;y3u0Na?bH)
zGaH2-GQWvLt^%?KcQ>Q_J1`*Ne}HEImV$@f@({`?0v*9?U93JVWkaiCshwHWXn+#~
z7ZQ%CZ{?pxFxOdt{ET7;uVUCvgOml%+|M16XHNoBNJS+V1G%(|&jFioUX>f5Zkvd2
zzBEN;t7Kc3m(0KO>bpj>mt5`Pl_*#vU_C(jY|o)@(0Bht{!|@=mfs)J4naUM2e^JX
z%n6y@;HquNFo-O|0=c3F2d5Yr&H?F8vl247LF&CQzWxYx<!VogMJk@*8=33rkywjH
zH*W}uaCa&wy|8Bi*&uIxr6T~tX2}M>qfGbYt1JcG!x~T(fD=OI(&o@-1O!5qn2YTv
z03CS?1IJ!$!hp#p+h&@rk<?xT;E5!ha}J6u0Ax;gNIZap@4<E$JATy=p|&}o`rm8%
zASTZ=*m#(u1rLU$K=Col8Rqb<JqY(1Rptd4v}Q1%tCAOh%dlK&*TS}G6>qvLricxa
zkw=X7%>YgK=H_J{X#QV}jzKJ7U_9TYBZ(A>sPX=o$JeB{mv$QD*0kPTM^peaw72$j
zp9rq%1KX6-&#h4+{yg6JOG1eau(Sd(n%}2kpdc2++kZf<Lc(GL)ctWbw4;QO>;#!L
zL}DSHG~NgemC#M(4Rgqy@cJ<EJMhfPSYH8=XQtqU{!LC(D*#2;gON7aVei3REMQu)
z5+5o6IGY86{g=QVaRv}e?TijfXvY%Rb819R0Z<PJqZ{0L4~Y$M!d>i;12FSg;KMRE
zp?2(!jtJbR4$YsLE{p<1>=AfvX#p`V_ZUN+1!zxEhhyL_rz#GZg#Q8X8w!P~6P~34
zsM`o$tJE;HhmHVL)mM#ZAc}!UT|e25fJvxR@Ez&O>oLcm=C_PbuK|z0Vu;*9Z1aE6
z3}s6L($bT*iZy_;Q(BE%jrs)1cjs07of6SCSl6I5*zDSVKzw8B5#F)Lvyilo$PfLh
z4u824oRU2rrOCGbUtR|tkl5Jz9XK$+Kyl}y-GKE=fT&q$^_GSL5V&p=^tFr?^tJb*
zb^|yt0^eeuByz&9pw1QiyE>Nyfb;TRNVN&9$+Lw0F3$oH@?|S>rV?<b;n|Q_KrOF_
zqS#cA?nW*o0N2-5O--RFBsO-*D&r^wJ@Dd&O&R6z0654;9yFwRgX=TK_9_4X`rsN2
z7H24BU3NxDo_+~vC<j-^;UNeMD|pN*H6DRMS=b9;1*dk~41|y5*kZPNQm>g@C8UFy
zTPj_%<7<d8l5>s&+O>d1^>orjz<5pq#>;Ae3dL%z(8E{Dug*_ywM<;&8K4L*CS*FY
z4o`5L0j6UAH&X!(!r>ra5)vk0WhVf`A+IyLO$Lc4q)DY&lvA}V7kBxBj1YSkVD<CJ
z744^h1Oq)1W0R16Xgy{FjFX!-S?ogt4GXXwCW5cTRH7+7a8hz~1%2qBqYjSP2IHE1
zdD6J>_Zq7Bl9bV)gW@mvWGj$1xgh><9vyBl+Y`WC3lZL^HviQ9|Bp5SqeOUkZBqD$
z?%xSreq8kj1=RiPhyZ3P%*xww<pCP?=Ka<fMt_L{y*a)n9@w?H(=Af6MfKq~W`<BJ
z16>$Qk{XS7!*wfoJ3j|p6u_u)_#jkSpgB4?I7M1-4rV{iRdBl%!(JFAS*JG^L@@|(
z?FvQf232;mBh69NI6KDrMln>(MR4ek#DDX>AD}!ApIaHorC-A}j~_V$(7IQkO&?3o
zY~a{21fceo+fM`G0f>{I<xpO07~+Z2qvQto)0<FTB=(ACW)k?S-1uR0fUbXmqN@FJ
zLCmH3b`|Kd-;-#!RL?rvayttZ#DZ*>k%ljoE$HK6BxXU<Lrl?sQZ>5_y^n-_OnTE{
zRT6*V{a~D@6aaO-#%HwwW88zZ%ATKtXT26r*Fz*%!m}%cw2g=;@Fzd=%nMD5o6~W-
zwH|?aTVrm5qmK_+M1~B{9}W}->V_Z`LQ>T?M__ytxIV6?T?kM7x(aRLNAsEYj<ldO
zNh76d%7%D|Za8Q$NaPRep9p6@iW}+3VXP7U|Lh0iJoMfm!vc|{iRs8(-EvGM=ye2n
z;4XN-*(7m6(njjg^%TK)1}g77;wTHC3l;(f3mrw~7*uPknqeg1ykM2&VBY&I7m{Iu
zVNW(sexMyD38-i@3ZYeJkjx8#kX;Rv6k1l~teiTFV9qck+{r;oB&<OKd?cLroEyX;
zfwQ$}LE(Cekri4-RZT*>>nH3bVKPOl2LHK)Zm?1!6s_s8c3StRu%N(m#7%fwVrmb|
zw^U7a0To9awPX&4cq5+?dY@ki{=x_e-~k0sWHS>+KH+>iGO$<yoX?y)&`Mauh+Nqv
zV3&L=#S(#&#E!+<PY_%Ld_9WmM+QEf6%tVCYeuw?i=cQ!7xMw;9kCi5yj6-~!i%U~
z{2;z0j{oe#;oUv{);>R=JTf@N;-nG+;ZH#Q`}BIl05{FKNByhXATYC^V-LG7bkQ^i
zzdJw*cDQR4uL$66At*f}!1OFMC0&#79a=J;JY`+SGqtC5=}=F$2gZsm);fM}xxDN1
zKxxnfA=@x8qhsRoSOuPGS9J3-jpy|0Vz!A)m#l_PW77o}L~r1!2hpHt&6I7>H!RBv
z(t)t;v;qAa_@KpMVv@~INYvA)y6BgI)-jctWutqa2J^XGwph^A<7P$<znbvhYG*nF
zqvL|b@d0oEG-Z8dhHd@c_=y{ESQ@SLsJoS`x40)9HA|QonX;yi?e?=t*3=fU2Chtq
zVTTiRy#dh3Wba-uhVf<aU|4kKO`-YrrrVvx)3E`&hAa|8!v-F&8-6?)T1>Oi@^jxM
zO6_fiz+OkZ1X2F!1@@~ZBSZ+6c<faq;Z@?XI+gy|^pAU=D7TSKV0vwbsKXtTb{9l*
zHMnwcRZP_ZVx7j(CXKlqvirS|+h`NPg0EeB2yU%KBg;IxHurrk0=^Eu_-O%)*1x~s
zTDxGKx4%x7zkl-uboiy^NZ3XF?FH7=DXQBazn(_}$cvQ|uu}iy*4hQ@YWQ#UMAPND
zYpJj=_}dHLLTDQ#ertv>u&njmgEUIBD_ao3gyjX$Fa!vGGe-vNP4p%SW-gu%n_<>6
z;LR#FG}I2Ung6X6c;PQSJ=R)kp2vV0IJ~cm|NXQ71?I0>-H>Zirre;sJhu-1>1mxj
Kp8do5>;D4rHz)l7

literal 0
HcmV?d00001

diff --git a/docs/source/dynamo/custom-backends.rst b/docs/source/dynamo/custom-backends.rst
new file mode 100644
index 0000000000000..2c8b338045e62
--- /dev/null
+++ b/docs/source/dynamo/custom-backends.rst
@@ -0,0 +1,154 @@
+Custom Backends
+===============
+
+Debugging Backend
+-----------------
+
+Suppose you wanted to better understand what is going on during a
+compilation you can create a custom compiler which we’ll refer to as a
+backend that will print pretty print the fx ``GraphModule`` extracted
+from dynamo’s bytecode analysis and return a ``forward()`` callable.
+
+.. code-block:: python
+
+   from typing import List
+   import torch
+   import torch._dynamo as dynamo
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       print("my_compiler() called with FX graph:")
+       gm.graph.print_tabular()
+       return gm.forward  # return a python callable
+   @dynamo.optimize(my_compiler)
+   def fn(x, y):
+       a = torch.cos(x)
+       b = torch.sin(y)
+       return a + b
+   fn(torch.randn(10), torch.randn(10))
+
+Running the above example produces the following output:
+
+::
+
+   my_compiler() called with FX graph:
+   opcode         name    target                                                  args        kwargs
+   -------------  ------  ------------------------------------------------------  ----------  --------
+   placeholder    x       x                                                       ()          {}
+   placeholder    y       y                                                       ()          {}
+   call_function  cos     <built-in method cos of type object at 0x7f1a894649a8>  (x,)        {}
+   call_function  sin     <built-in method sin of type object at 0x7f1a894649a8>  (y,)        {}
+   call_function  add     <built-in function add>                                 (cos, sin)  {}
+   output         output  output                                                  ((add,),)   {}
+
+This works for ``torch.nn.Module`` as well as shown below
+
+.. code-block:: python
+
+   import torch
+   import torch._dynamo as dynamo
+   class MockModule(torch.nn.Module):
+       def __init__(self):
+           super().__init__()
+           self.relu = torch.nn.ReLU()
+       def forward(self, x):
+           return self.relu(torch.cos(x))
+   mod = MockModule()
+   optimized_mod = dynamo.optimize(my_compiler)(mod)
+   optimized_mod(torch.randn(10))
+
+Let’s take a look at one more example with control flow.
+
+.. code-block:: python
+
+   from typing import List
+   import torch
+   import torch._dynamo as dynamo
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       print("my_compiler() called with FX graph:")
+       gm.graph.print_tabular()
+       return gm.forward  # return a python callable
+   @dynamo.optimize(my_compiler)
+   def toy_example(a, b):
+       x = a / (torch.abs(a) + 1)
+       if b.sum() < 0:
+           b = b * -1
+       return x * b
+   for _ in range(100):
+       toy_example(torch.randn(10), torch.randn(10))
+
+Running this example produces the following output:
+
+::
+
+   my_compiler() called with FX graph:
+   opcode         name     target                                                  args              kwargs
+   -------------  -------  ------------------------------------------------------  ----------------  --------
+   placeholder    a        a                                                       ()                {}
+   placeholder    b        b                                                       ()                {}
+   call_function  abs_1    <built-in method abs of type object at 0x7f8d259298a0>  (a,)              {}
+   call_function  add      <built-in function add>                                 (abs_1, 1)        {}
+   call_function  truediv  <built-in function truediv>                             (a, add)          {}
+   call_method    sum_1    sum                                                     (b,)              {}
+   call_function  lt       <built-in function lt>                                  (sum_1, 0)        {}
+   output         output   output                                                  ((truediv, lt),)  {}
+
+   my_compiler() called with FX graph:
+   opcode         name    target                   args         kwargs
+   -------------  ------  -----------------------  -----------  --------
+   placeholder    b       b                        ()           {}
+   placeholder    x       x                        ()           {}
+   call_function  mul     <built-in function mul>  (b, -1)      {}
+   call_function  mul_1   <built-in function mul>  (x, mul)     {}
+   output         output  output                   ((mul_1,),)  {}
+
+   my_compiler() called with FX graph:
+   opcode         name    target                   args       kwargs
+   -------------  ------  -----------------------  ---------  --------
+   placeholder    b       b                        ()         {}
+   placeholder    x       x                        ()         {}
+   call_function  mul     <built-in function mul>  (x, b)     {}
+   output         output  output                   ((mul,),)  {}
+
+The order of the last two graphs is nondeterministic depending
+on which one is encountered first by the just-in-time compiler.
+
+Speedy Backend
+--------------
+
+Integrating a custom backend that offers superior performance is also
+easy and we’ll integrate a real one
+with `optimize_for_inference <https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html>`__:
+
+.. code-block :: python
+
+   def optimize_for_inference_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       scripted = torch.jit.trace(gm, example_inputs)
+       return torch.jit.optimize_for_inference(scripted)
+
+And then you should be able to optimize any existing code with
+
+.. code-block:: python
+
+   @dynamo.optimize(optimize_for_inference_compiler)
+   def code_to_accelerate():
+       ...
+
+Composable Backends
+-------------------
+
+TorchDynamo includes many backends, which can be found in
+`backends.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py>`__
+or ``torchdynamo.list_backends()``. You can combine these backends
+together with the following code:
+
+.. code-block:: python
+
+   from torch._dynamo.optimizations import BACKENDS
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       trt_compiled = BACKENDS["tensorrt"](gm, example_inputs)
+       if trt_compiled is not None:
+           return trt_compiled
+       # first backend failed, try something else...
+       cudagraphs_compiled = BACKENDS["cudagraphs"](gm, example_inputs)
+       if cudagraphs_compiled is not None:
+           return cudagraphs_compiled
+       return gm.forward
diff --git a/docs/source/dynamo/deep-dive.rst b/docs/source/dynamo/deep-dive.rst
new file mode 100644
index 0000000000000..c60047c2a3d8d
--- /dev/null
+++ b/docs/source/dynamo/deep-dive.rst
@@ -0,0 +1,145 @@
+TorchDynamo Deeper Dive
+=======================
+**Author**: `Jason Ansel <https://github.com/jansel>`_
+
+What is a guard?
+----------------
+
+TorchDynamo operates just-in-time and specializes graphs based on
+dynamic properties. For example, the first graph above has the following
+guards:
+
+::
+
+   GUARDS:
+    - local 'a' TENSOR_MATCH
+    - local 'b' TENSOR_MATCH
+    - global 'torch' FUNCTION_MATCH
+
+If any of those guards fail, the graph will be recaptured and
+recompiled. The interesting guard type there is ``TENSOR_MATCH``, which
+checks the following torch.Tensor properties:
+
+- Python class of the tensor (tensor subclassing, etc)
+- dtype
+- device
+- requires_grad
+- dispatch_key (with thread-local includes/excludes applied)
+- ndim
+- sizes\* (optional)
+- strides\* (optional)
+
+For sizes/strides you can disable this specialization by setting the
+following parameter:
+
+.. code-block:: python
+
+torch._dynamo.config.dynamic_shapes = True
+
+The full specialization mode allows the backend compiler to assume an
+entirely static graph. Unfortunately, most backends require this.
+Operators which return dynamic shapes will trigger a graph break when
+not in dynamic shape mode.
+
+What is dynamo doing?
+---------------------
+
+If you want to understand better what TorchDynamo is doing, you can set:
+
+.. code-block:: python
+
+   torchdynamo.config.debug = True
+
+which triggers useful (but spammy) printouts.
+
+For example, the printouts for the first graph in the ``toy_example``
+above are:
+
+::
+
+   __compiled_fn_0 <eval_with_key>.1
+   opcode         name     target                                                  args              kwargs
+   -------------  -------  ------------------------------------------------------  ----------------  --------
+   placeholder    a        a                                                       ()                {}
+   placeholder    b        b                                                       ()                {}
+   call_function  abs_1    <built-in method abs of type object at 0x7f9ca082f8a0>  (a,)              {}
+   call_function  add      <built-in function add>                                 (abs_1, 1)        {}
+   call_function  truediv  <built-in function truediv>                             (a, add)          {}
+   call_method    sum_1    sum                                                     (b,)              {}
+   call_function  lt       <built-in function lt>                                  (sum_1, 0)        {}
+   output         output   output                                                  ((truediv, lt),)  {}
+
+   ORIGINAL BYTECODE toy_example example.py 9
+    10           0 LOAD_FAST                0 (a)
+                 2 LOAD_GLOBAL              0 (torch)
+                 4 LOAD_METHOD              1 (abs)
+                 6 LOAD_FAST                0 (a)
+                 8 CALL_METHOD              1
+                10 LOAD_CONST               1 (1)
+                12 BINARY_ADD
+                14 BINARY_TRUE_DIVIDE
+                16 STORE_FAST               2 (x)
+
+    11          18 LOAD_FAST                1 (b)
+                20 LOAD_METHOD              2 (sum)
+                22 CALL_METHOD              0
+                24 LOAD_CONST               2 (0)
+                26 COMPARE_OP               0 (<)
+                28 POP_JUMP_IF_FALSE       38
+
+    12          30 LOAD_FAST                1 (b)
+                32 LOAD_CONST               3 (-1)
+                34 BINARY_MULTIPLY
+                36 STORE_FAST               1 (b)
+
+    13     >>   38 LOAD_FAST                2 (x)
+                40 LOAD_FAST                1 (b)
+                42 BINARY_MULTIPLY
+                44 RETURN_VALUE
+
+   MODIFIED BYTECODE
+     9           0 LOAD_GLOBAL              3 (__compiled_fn_0)
+                 2 LOAD_FAST                0 (a)
+                 4 LOAD_FAST                1 (b)
+                 6 CALL_FUNCTION            2
+                 8 UNPACK_SEQUENCE          2
+                10 STORE_FAST               2 (x)
+                12 POP_JUMP_IF_FALSE       24
+                14 LOAD_GLOBAL              4 (__resume_at_30_1)
+                16 LOAD_FAST                1 (b)
+                18 LOAD_FAST                2 (x)
+                20 CALL_FUNCTION            2
+                22 RETURN_VALUE
+           >>   24 LOAD_GLOBAL              5 (__resume_at_38_2)
+                26 LOAD_FAST                1 (b)
+                28 LOAD_FAST                2 (x)
+                30 CALL_FUNCTION            2
+                32 RETURN_VALUE
+
+   GUARDS:
+    - local 'a' TENSOR_MATCH
+    - local 'b' TENSOR_MATCH
+    - global 'torch' FUNCTION_MATCH
+
+At the top you can see the FX graph (which we already shared above).
+Next you see the original bytecode of the function, followed by the
+modified bytecode generated by TorchDynamo. Finally, you see the guards
+which we covered above.
+
+In the modified bytecode ``__compiled_fn_0`` is the return value of
+``my_compiler()`` (the compiled graph). ``__resume_at_30_1`` and
+``__resume_at_38_2`` are both generated continuation functions that pick
+up execution after a graph break (at bytecode offsets 30 and 38). Each
+of these functions take the form:
+
+::
+
+   __resume_at_<offset>:
+       ... restore stack state if needed ...
+       JUMP_ABSOLUTE <offset> into toy_example
+       ... original bytecode of toy_example ...
+
+By generating this `resume_at` function we force the remainder of the
+function to be executed in a new Python frame which recursively
+triggers TorchDynamo to restart its capture once execution reaches that
+point for the first time.
diff --git a/docs/source/dynamo/faq.rst b/docs/source/dynamo/faq.rst
new file mode 100644
index 0000000000000..decb3e2024de2
--- /dev/null
+++ b/docs/source/dynamo/faq.rst
@@ -0,0 +1,376 @@
+Frequently Asked Questions
+==========================
+
+At a high level, the TorchDynamo stack consists of a graph capture from
+Python code using dynamo and a backend compiler. In this example the
+backend compiler consists of backward graph tracing using AOTAutograd
+and graph lowering using TorchInductor. There are of course many more
+compilers available `here <https://github.com/pytorch/torchdynamo/blob/0b8aaf340dad4777a080ef24bf09623f1aa6f3dd/README.md#existing-backend>`__
+but for this document we will focus on inductor as a motivating example.
+
+Torchdynamo supports training, using AotAutograd to capture backwards:
+
+   1. the ``.forward()`` graph and ``optimizer.step()`` is captured by torchdynamo’s python evalframe frontend
+   2. for each segment of ``.forward()`` that torchdynamo captures, it uses AotAutograd to generate a backward graph segment
+   3. each pair of forward, backward graph are (optionally) min-cut partitioned to save the minimal state between forward/backward
+   4. the forward, backward pairs are wrapped in autograd.function modules 5. usercode calling\ ``.backward()`` still triggers eager’s autograd engine, which runs each ‘compiled backward’ graph as if it were one op, also running any non-compiled eager ops’ .backward() functions
+
+Do you support Distributed code?
+--------------------------------
+
+DDP has been tested and works, support for other distributed training
+libraries is under discussion.
+
+The main reason why Distributed code is challenging with dynamo is
+because AOTAutograd unrolls both the forward and backward pass and
+provides 2 graphs for backends to optimize. This is a problem for
+distributed code because we’d like to ideally overlap communication
+operations with computations. Eager pytorch accomplishes this in
+different ways for DDP/FSDP- using autograd hooks, module hooks, and
+modifications/mutations of module states. In a naive application of
+dynamo, hooks that should run directly after an operation during
+backwards may be delayed until after the entire compiled region of
+backwards ops, due to how AOTAutograd compiled functions interact with
+dispatcher hooks.
+
+The basic strategy for optimizing DDP with Dynamo is outlined in
+`distributed.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/distributed.py>`__
+where the main idea will be to graph break on `DDP bucket
+boundaries <https://pytorch.org/docs/stable/notes/ddp.html#internal-design>`__.
+
+When each node in DDP needs to synchronize its weights with the other
+nodes it organizes its gradients and parameters into buckets which
+reduces communication times and allows a node to broadcast a fraction of
+its gradients to other waiting nodes.
+
+Graph breaks in distributed code means you can expect dynamo and its
+backends to optimize the compute overhead of a distributed program but
+not its communication overhead. Graph-breaks may interfere with
+compilation speedups, if the reduced graph-size robs the compiler of
+fusion opportunities. However, there are diminishing returns with
+increasing graph size since most of the current compute optimizations
+are local fusions. So in practice this approach may be sufficient.
+
+Do I still need to export whole graphs?
+---------------------------------------
+
+For the vast majority of models you probably don’t and you can use
+``torch._dynamo()`` optimize as is but there are a few situations where
+full graphs are necessary and you can can ensure a full graph by simply
+running ``torch.dynamo(..., nopython=True)`` \* Large scale training
+runs, think $250K+ that require pipeline parallelism and other advanced
+sharding strategies \* Inference optimizers like
+`TensorRT <https://github.com/pytorch/TensorRT>`__ or
+`AITemplate <https://github.com/facebookincubator/AITemplate>`__ that rely
+on fusing much more aggressively than training optimizers \* Mobile training or
+inference.
+
+Future work will include tracing communication operations into graphs,
+coordinating these operations with compute optimizations, and optimizing
+the communciation operations.
+
+Why is my code crashing?
+------------------------
+
+If your code ran just fine without dynamo and started to crash with it
+enabled then the most important first step is figuring out which part of
+the stack your failure occurred in so try running things in the below
+order and only try the next step if the previous step succeeded.
+
+1. ``dynamo.optimize("eager")`` which only runs torchdynamo forward graph
+   capture and then runs the captured graph with PyTorch. If this fails
+   then there’s an issue with TorchDynamo.
+
+2. ``dynamo.optimize("aot_eager")``
+   which runs torchdynamo to capture a forward graph, and then AOTAutograd
+   to trace the backward graph without any additional backend compiler
+   steps. PyTorch eager will then be used to run the forward and backward
+   graphs. If this fails then there’s an issue with AOTAutograd.
+
+3. ``dynamo.optimize("inductor")`` which runs torchdynamo to capture a
+   forward graph, and then AOTAutograd to trace the backward graph with the
+   TorchInductor compiler. If this fails then there’s an issue with TorchInductor
+
+TorchDynamo Errors
+~~~~~~~~~~~~~~~~~~
+
+If the error that is generated occurs with the ``"eager"`` backend, then
+torchdynamo is the most likely source of the error.
+
+To debug these issues we recommend setting
+``torch._dynamo.config.verbose=True`` to get a full stack trace to both
+the error in torchdynamo and the user code. In addition to this flag,
+you can also set the ``log_level`` of torchdynamo through
+``torch._dynamo.config.log_level``. The available levels are the
+following: - ``logging.DEBUG``: Print every instruction that is
+encountered in addition to all below log levels - ``logging.INFO``:
+Print each function that is compiled (original and modified bytecode)
+and the graph that is captured in addition to all below log levels -
+``logging.WARNING`` (default): Print graph breaks in addition to all
+below log levels - ``logging.ERROR``: Print errors only
+
+If a model is sufficiently large, the logs can become overwhelming. If
+an error occurs deep within a model’s python code, it can be useful to
+execute only the frame in which the error occurs to enable easier
+debugging. There are 2 tools available to enable this:
+
+* ``env TORCHDYNAMO_DEBUG_FUNCTION=<desired_function_name>`` will only run TorchDynamo on functions with that name.
+
+* ``env torch._dynamo.config.replay_record_enabled = True``) which dumps an execution record when an error is encountered. This record can then be replayed to run only the frame where an error occurred.
+
+TorchInductor Errors
+--------------------
+
+With TorchInductor as the chosen backend, AOTAutograd is used to
+generate the backward graph from the forward graph captured by
+torchdynamo. It’s important to note that errors can occur during this
+tracing and also while TorchInductor lowers the forward and backward
+graphs to GPU code or C++.
+
+A model can often consist of hundreds or thousands of FX nodes, so
+narrowing the exact nodes where this problem occurred can be very
+difficult which is why we highly recommend you use our minifier to
+create tiny reproducible examples of failures you’re seeing. We can
+minify errors that occur either at the AOTAutograd layer or Inductor
+layer which you should try in the following order.
+
+1. ``env TORCHDYNAMO_REPRO_AFTER="aot" python your_model.py``
+2.  ``env TORCHDYNAMO_REPRO_AFTER="dynamo" python your_model.py``
+
+Minifying your error is the quickest path to getting it fixed.
+
+The minifier will actually create a ``repro.py`` for you at the location
+set by ``env TORCHDYNAMO_REPRO_DIR`` so make you have right access to
+that directory. You can then run ``python repro.py`` and confirm that
+you are getting the same error.
+
+.. note::
+   For other compilers such as nvfuser, the process is similar but
+   instead you would leverage ``env TORCHDYNAMO_REPRO_AFTER="dynamo" python your_model.py``.
+
+Why is compilation slow?
+------------------------
+
+Dynamo Compilation
+~~~~~~~~~~~~~~~~~~
+
+TorchDynamo has a builtin stats function for collecting and displaying
+the time spent in each compilation phase. These stats can be accessed by
+calling ``torch._dynamo.utils.compile_times()`` after executing
+``torch._dynamo``. By default, this returns a string representation of
+the compile times spent in each TorchDynamo function by name.
+
+Inductor Compilation
+~~~~~~~~~~~~~~~~~~~~
+
+TorchInductor has a builtin stats and trace function for displaying time
+spent in each compilation phase, output code, output graph visualization
+and IR dump. ``env TORCHINDUCTOR_TRACE=1 python repro.py``. This is a
+debugging tool designed to make it easier to debug/understand the
+internals of TorchInductor with an output that will look something like
+`this <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
+
+Each file in that debug trace can be enabled/disabled via
+``torch._inductor.config.trace.*``. The profile and the diagram are both
+disabled by default since they are expensive to generate. See the
+`example debug directory
+output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
+for more examples.
+
+Excessive Recompilation
+~~~~~~~~~~~~~~~~~~~~~~~
+
+When TorchDynamo compiles a function (or part of one), it makes certain
+assumptions about locals and globals in order to allow compiler
+optimizations, and expresses these assumptions as guards that check
+particular values at runtime. If any of these guards fail, Dynamo will
+recompile that function (or part) up to
+``torch._dynamo.config.cache_size_limit`` times. If your program is
+hitting the cache limit, you will first need to determine which guard is
+failing and what part of your program is triggering it.
+
+The `recompilation profiler <#recompilation-profiler>`__ automates the
+process of setting TorchDynamo’s cache limit to 1 and running your
+program under an observation-only ‘compiler’ that records the causes of
+any guard failures. You should be sure to run your program for at least
+as long (as many iterations) as you were running when you ran into
+trouble, and the profiler will accumulate statistics over this duration.
+
+.. code-block:: python
+
+   prof = dynamo.utils.CompilationProfiler()
+   @dynamo.optimize(prof)
+   def my_model():
+       ...
+   my_model()
+   print(prof.report())
+
+Many of the reasons for graph breaks and excessive recompilation will be
+fixed with upcoming support for `tracing dynamic tensor
+shapes <https://docs.google.com/document/d/1QJB-GOnbv-9PygGlOMXwiO9K6vVNm8sNg_olixJ9koc/edit?usp=sharing>`__,
+more careful choices for guards and better tuned heuristics.
+
+Why are you recompiling in production?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In some cases, you may not want unexpected compiles after a program has
+warmed up. For example, if you are serving production traffic in a
+latency critical application. For this, TorchDynamo provides an
+alternate mode where prior compiled graphs are used, but no new ones are
+generated:
+
+.. code-block:: python
+
+   frozen_toy_example = dynamo.run(toy_example)
+   frozen_toy_example(torch.randn(10), torch.randn(10))
+
+How are you speeding up my code?
+--------------------------------
+
+There are 3 major ways to accelerat PyTorch code:
+
+1. Kernel fusion via vertical fusions which fuse sequential operations to avoid
+   excessive read/writes. For example, fuse 2 subsequent cosines means you
+   can can do 1 read 1 write instead 2 reads 2 writes 2. Horizontal fusion:
+   the simplest example being batching where a single matrix is multiplied
+   with a batch of examples but the more general scenario is a grouped GEMM
+   where a group of matrix multiplications are scheduled together
+
+2. Out of order execution: A general optimization for compilers, by looking ahead
+   at the exact data dependencies within a graph we can decide on the most
+   opportune time to execute a node and which buffers can be reused
+
+3. Automatic work placement: Similar of the out of order execution point,
+   but by matching nodes of a graph to resources like physical hardware or
+   memory we can design an appropriate schedule
+
+The above are general principles for accelerating PyTorch code but
+different backends will each make different tradeoffs on what to
+optimize. For example Inductor first takes care of fusing whatever it
+can and only then generates `Triton <https://openai.com/blog/triton/>`__
+kernels. It can also
+
+Triton in addition offers speedups because of automatic memory
+coalescing, memory management and scheduling within each Streaming
+Multiprocessor and has been designed to handle tiled computations.
+
+However, regardless of the backend you use it’s best to use a benchmark
+and see approach so try out the PyTorch profiler, visually inspect the
+generated kernels and try to see what’s going on for yourself.
+
+Why am I not seeing speedups?
+-----------------------------
+
+Graph Breaks
+~~~~~~~~~~~~
+
+The main reason you won’t see the speedups you’d like to by using dynamo
+is excessive graph breaks. So what’s a graph break?
+
+Given a program like:
+
+.. code-block:: python
+
+   @dynamo.optimize(...)
+   def some_fun(x):
+       ...
+   some_fun(x)
+   ...
+
+Torchdynamo will attempt to compile all of the torch/tensor operations
+within ``some_fun()`` into a single FX graph, but it may fail to capture
+everything into one graph.
+
+Some graph break reasons are insurmountable to TorchDynamo like calling
+into a C extension other than torch is invisible to torchdynamo, and
+could do arbitrary things without TorchDynamo being able to introduce
+necessary guards to ensure that the compiled program would be safe to reuse.
+
+   To maximize performance, it’s important to have as few graph breaks
+   as possible.
+
+Identifying the cause of a graph break
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To identify all graph breaks in a program and the associated reasons for
+the breaks, ``torch._dynamo.explain`` can be used. This tool runs
+TorchDynamo on the supplied function and aggregates the graph breaks
+that are encountered. Here is an example usage:
+
+.. code-block:: python
+
+   import torch
+   import torch._dynamo as dynamo
+   def toy_example(a, b):
+       x = a / (torch.abs(a) + 1)
+       print("woo")
+       if b.sum() < 0:
+           b = b * -1
+       return x * b
+   explanation, out_guards, graphs, ops_per_graph = dynamo.explain(toy_example, torch.randn(10), torch.randn(10))
+   print(explanation)
+   """
+   Dynamo produced 3 graphs, with 2 graph break and 6 ops.
+    Break reasons:
+   1. call_function BuiltinVariable(print) [ConstantVariable(str)] {}
+      File "t2.py", line 16, in toy_example
+       print("woo")
+
+   2. generic_jump
+      File "t2.py", line 17, in toy_example
+       if b.sum() < 0:
+    """
+
+To throw an error on the first graph break encountered you can use
+disable python fallback by using ``nopython=True``, this should be
+familiar if you’ve worked with export based compilers.
+
+.. code-block:: python
+
+   @dynamo.optimize(<compiler>, nopython=True)
+   def toy_example(a, b):
+      ...
+
+Why didn’t my code recompile when I changed it?
+-----------------------------------------------
+
+If you went ahead and enabled dynamic shapes via
+``env TORCHDYNAMO_DYNAMIC_SHAPES=1 python model.py`` then your code
+won’t recompile on shape changes. We’ve added support for dynamic shapes
+which avoids recompilations in the case when shapes vary by less than a
+factor of 2. This is especially useful in scenarios like varying image
+sizes in CV or variable sequence length in NLP. In inference scenarios
+it’s often not possible to know what a batch size will be beforehand
+because you take what you can get from different client apps.
+
+In general, TorchDynamo tries very hard not to recompile things
+unnecessarily so if for example torchdynamo finds 3 graphs and your
+change only modified one graph then only that graph will recompile. So
+another tip to avoid potentially slow compilation times is to warmup a
+model by compiling it once after which subsequent compilations will be
+much faster. Cold start compile times is still a metric we track
+visibly.
+
+Why am I getting incorrect results?
+-----------------------------------
+
+Accuracy issues can also be minified if you set the environment variable
+``TORCHDYNAMO_REPRO_LEVEL=4``, it operates with a similar git bisect
+model and a full repro might be something like
+``TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4`` the reason
+we need this is downstream compilers will codegen code whether it’s
+Triton code or the C++ backend, the numerics from those downstream
+compilers can be different in subtle ways yet have dramatic impact on
+your training stability. So the accuracy debugger is very useful for us
+to detect bugs in our codegen or with a backend compiler.
+
+Why am I getting OOMs?
+----------------------
+
+Dynamo is still an alpha product so there’s a few sources of OOMs and if
+you’re seeing an OOM try disabling the following configurations in this
+order and then open an issue on Github so we can solve the root problem
+1. If you’re using dynamic shapes try disabling them, we’ve disabled
+them by default: ``env TORCHDYNAMO_DYNAMIC_SHAPES=0 python model.py`` 2.
+CUDA graphs with Triton are enabled by default in inductor but removing
+them may alleviate some OOM issues: ``torch._inductor.config = False``.
diff --git a/docs/source/dynamo/get-started.rst b/docs/source/dynamo/get-started.rst
new file mode 100644
index 0000000000000..44434d49e525d
--- /dev/null
+++ b/docs/source/dynamo/get-started.rst
@@ -0,0 +1,181 @@
+Getting Started
+===============
+
+Let’s start with a simple example and make things more complicated step
+by step. Please note that you’re likely to see more significant speedups
+the newer your GPU is.
+
+.. code:: python
+
+   from torch._dynamo import optimize
+   import torch
+   def fn(x, y):
+       a = torch.cos(x).cuda()
+       b = torch.sin(y).cuda()
+       return a + b
+   new_fn = optimize("inductor")(fn)
+   input_tensor = torch.randn(10000).to(device="cuda:0")
+   a = new_fn()
+
+This example will not actually run faster. Its purpose is to demonstrate
+the ``torch.cos()`` and ``torch.sin()`` features which are
+examples of pointwise ops as in they operate element by element on a
+vector. A more famous pointwise op you might actually want to use would
+be something like ``torch.relu()``. Pointwise ops in eager mode are
+suboptimal because each one would need to need to read a tensor from
+memory, make some changes and then write back those changes. The single
+most important optimization that inductor does is fusion. So back to our
+example we can turn 2 reads and 2 writes into 1 read and 1 write which
+is crucial especially for newer GPUs where the bottleneck is memory
+bandwidth (how quickly you can send data to a GPU) instead of compute
+(how quickly your GPU can crunch floating point operations)
+
+Another major optimization that inductor makes available is automatic
+support for CUDA graphs.
+CUDA graphs help eliminate the overhead from launching individual
+kernels from a python program which is especially relevant for newer GPUs.
+
+dynamo supports many different backends but inductor specifically works
+by generating `Triton <https://github.com/openai/triton>`__ kernels and
+we can inspect them by running ``TORCHINDUCTOR_TRACE=1 python trig.py``
+with the actual generated kernel being
+
+.. code:: python
+
+   @pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
+   @triton.jit
+   def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+       xnumel = 10000
+       xoffset = tl.program_id(0) * XBLOCK
+       xindex = xoffset + tl.reshape(tl.arange(0, XBLOCK), [XBLOCK])
+       xmask = xindex < xnumel
+       x0 = xindex
+       tmp0 = tl.load(in_ptr0 + (x0), xmask)
+       tmp1 = tl.sin(tmp0)
+       tmp2 = tl.sin(tmp1)
+       tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
+
+And you can verify that fusing the two ``sins`` did actually occur
+because the two ``sin`` operations occur within a single Triton kernel
+and the temporary variables are held in registers with very fast access.
+
+You can read up a lot more on Triton’s performance
+`here <https://openai.com/blog/triton/>`__ but the key is it’s in python
+so you can easily understand it even if you haven’t written all that
+many CUDA kernels.
+
+As a next step let’s try a real model like resnet50 from the PyTorch
+hub.
+
+.. code:: python
+
+   import torch
+   import torch._dynamo as dynamo
+   model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
+   opt_model = dynamo.optimize("inductor")(model)
+   model(torch.randn(1,3,64,64))
+
+And that’s not the only available backend, you can run in a REPL
+``dynamo.list_backends()`` to see all the available ones. Try out the
+``aot_cudagraphs`` or ``nvfuser`` next as inspiration.
+
+Let’s do something a bit more interesting now, our community frequently
+uses pretrained models from
+`transformers <https://github.com/huggingface/transformers>`__ or
+`TIMM <https://github.com/rwightman/pytorch-image-models>`__ and one of
+our design goals is for dynamo and inductor to work out of the box with
+any model that people would like to author.
+
+So we’re going to directly download a pretrained model from the
+HuggingFace hub and optimize it:
+
+.. code:: python
+
+   import torch
+   from transformers import BertTokenizer, BertModel
+   import torch._dynamo as dynamo
+   # Copy pasted from here https://huggingface.co/bert-base-uncased
+   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+   model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0")
+   model = dynamo.optimize("inductor")(model) # This is the only line of code that we changed
+   text = "Replace me by any text you'd like."
+   encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0")
+   output = model(**encoded_input)
+
+If you remove the ``to(device="cuda:0")`` from the model and
+encoded_input then triton will generate C++ kernels that will be
+optimized for running on your CPU. You can inspect both Triton or C++
+kernels for BERT, they’re obviously more complex than the trigonometry
+example we had above but you can similarly skim it and understand if you
+understand PyTorch.
+
+Similarly let’s try out a TIMM example
+
+.. code:: python
+
+   import timm
+   import torch._dynamo as dynamo
+   import torch
+   model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
+   opt_model = dynamo.optimize("inductor")(model)
+   opt_model(torch.randn(64,3,7,7))
+
+Our goal with dynamo and inductor was to build the highest coverage ML compiler which should work with any model you throw at it.
+
+Existing Backends
+~~~~~~~~~~~~~~~~~
+
+TorchDynamo has a growing list of backends, which can be found in
+`backends.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py>`__
+or ``torchdynamo.list_backends()`` each of which with its optional dependencies.
+
+Some of the most commonly used backend include:
+
+* **Debugging backends**: \* ``dynamo.optimize("eager")`` - Uses PyTorch
+  to run the extracted GraphModule. This is quite useful in debugging
+  TorchDynamo issues. \* ``dynamo.optimize("aot_eager")`` - Uses
+  AotAutograd with no compiler, i.e, just using PyTorch eager for the
+  AotAutograd’s extracted forward and backward graphs. This is useful for
+  debugging, and unlikely to give speedups.
+
+* **Training & inference backends**: \* ``dynamo.optimize("inductor")`` -
+  Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging
+  codegened Triton kernels `Read
+  more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
+
+  * ``dynamo.optimize("nvfuser")`` - nvFuser with TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
+
+  * ``dynamo.optimize("aot_nvfuser")`` - nvFuser with AotAutograd. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
+
+  * ``dynamo.optimize("aot_cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
+
+* **Inference-only backend**\ s: \* ``dynamo.optimize("ofi")`` - Uses
+  Torchscript optimize_for_inference. `Read
+  more <https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html>`__
+
+  * ``dynamo.optimize("fx2trt")`` - Uses Nvidia TensorRT for inferenc optimizations. `Read more <https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst>`__
+
+  * ``dynamo.optimize("onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__ \* ``dynamo.optimize("ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
+
+Why do you need another way of optimizing PyTorch code?
+-------------------------------------------------------
+
+While a number of other code optimization tools exist in the PyTorch
+ecosystem, each of them has its own flow. Here is a few examples of
+existing methods and their limitations:
+
+-  ``torch.jit.trace()`` is silently wrong if it cannot trace e.g:
+   during control flow
+-  ``torch.jit.script()`` requires modifications to user or library code
+   by adding type annotations and removing non PyTorch code
+-  ``torch.fx.symbolic_trace()`` either traces correctly or gives a hard
+   error but it’s limited to traceable code so still can’t handle
+   control flow
+-  ``torch._dynamo`` works out of the box and produces partial graphs.
+   It still has the option of producing a single graph with
+   ``nopython=True`` which are needed for `some
+   situations <./documentation/FAQ.md#do-i-still-need-to-export-whole-graphs>`__
+   but allows a smoother transition where partial graphs can be
+   optimized without code modification
+
+.. |image0| image:: ../_static/img/dynamo/TorchDynamo.png
diff --git a/docs/source/dynamo/guards-overview.rst b/docs/source/dynamo/guards-overview.rst
new file mode 100644
index 0000000000000..4991a831940a4
--- /dev/null
+++ b/docs/source/dynamo/guards-overview.rst
@@ -0,0 +1,513 @@
+Guards Overview
+===============
+
+From a UX perspective, TorchDynamo is very easy to use. The user invokes
+``torchdynamo.optimize`` as an annotation:
+
+.. code-block:: python
+
+   @torchdynamo.optimize(my_compiler)
+   def fn_foo(bar):
+
+Where a complete example looks like this:
+
+.. code-block:: python
+
+   from typing import List
+   import torch
+   import torchdynamo
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       print("my_compiler() called with FX graph:")
+       gm.graph.print_tabular()
+       return gm.forward  # return a python callable
+   @torchdynamo.optimize(my_compiler)
+   def toy_example(a, b):
+       x = a / (torch.abs(a) + 1)
+       if b.sum() < 0:
+           b = b * -1
+       return x * b
+   for _ in range(100):
+       toy_example(torch.randn(10), torch.randn(10))
+
+This allows TorchDynamo to capture the interpreted Python frames, grab
+any and all relevant information, and speed things up wherever it can.
+The speedup comes from a few places, and can be rather dependent on the
+backend (my_compiler above) provided, but the one speedup we care about
+most for today’s overview is **caching**. Caching itself is not a direct
+speedup, so much as a critical enablement to allow us to prevent
+recompilation. We dig a hole with dynamo, and caching allows us to get
+out. Its a speedup from that perspective, but relatively neutral when
+all things are considered - however, it enables us to hold perf
+neutrality while then enabling backends - the true source of our
+speedups.
+
+With even a pass-through no-op backend provided:
+
+.. code-block:: python
+
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       return gm.forward
+
+We can see TorchDynamo speeding up Python execution quite a bit, even on
+regular Python, not just PyTorch.
+
+Caching and Guards Overview
+---------------------------
+
+TorchDynamo operates through caching transformed (by TorchDynamo) user
+bytecode. When we receive a frame for evaluation, we check if the
+**objects referenced in the frame have changed** in certain ways, and if
+not, we read the previously transformed user bytecode to evaluate it.
+The details of how we do this will be saved for a later writeup.
+Instead, we will focus on how we can identify whether or not the
+**objects referenced in the frame have changed**. This is a critical
+piece of functionality in TorchDynamo, because it drives the entire
+invalidation lifecycle. We refer to this functionality as **guards**.
+
+At a very high level, the vastly oversimplified TLDR flow is this:
+
+1) We receive a python frame
+2) We convert the given frame from (1), passing it through instruction
+   translation
+3) For the objects captured in (2), we create tracking objects that are
+   (a) tracked on an output graph, which is an internal specialization
+   of a torch.fx.Tracer (and the topic of a later writeup), and (b)
+   guards, the topic of this document.
+4) We process the guard objects created in (3), turning them into a
+   generated python function, check_fn, associated with a piece of code.
+5) The check_fn is evaluated whenever we encounter this code a
+   subsequent time - if a check_fn passes and evaluates to True, we know
+   the code in the cache and the code encountered here is the same, and
+   can be safely used. If it fails and evaluates to False, we know the
+   code in the cache is not valid, and can be thrown out in favor of a
+   new entry, through recompilation or a graph break.
+
+Python Frame Evaluation and PEP 523
+-----------------------------------
+
+The functionality of TorchDynamo is based on 
+`PEP 523 <https://peps.python.org/pep-0523/>`__.
+
+TorchDynamo installs a frame evaluation function on Python, via
+`_PyInterpreterState_SetEvalFrameFunc`. The overview of function
+selection, thread management, and cleanup is out of scope for this
+writeup, but the important part is that TorchDynamo has a hook where
+Python can hand control back to us during evaluation.
+
+The function we have installed is ``convert_frame`` or
+``convert_frame_assert`` in the ``nopython=True`` case, but glossing
+over that nuance for now, let’s take a look at ``convert_frame_assert``,
+as ``convert_frame`` proxies to it anyway.
+
+We can find it on `line 20 of convert_frame.py
+<https://github.com/pytorch/torchdynamo/blob/main/torchdynamo/convert_frame.py#L200>`__,
+with a signature as follows:
+
+.. code-block:: python
+
+   def  convert_frame_assert(compiler_fn: Callable, one_graph=True):
+
+This function wraps the entry point of where Python invokes TorchDynamo
+with a frame, glossing over the nuances of ``wrap_convert_context`` for
+now:
+
+.. code-block:: python
+
+   def  _convert_frame_assert(frame: types.FrameType, cache_size: int):
+
+Here is what this function does:
+
+1) Checks if it has seen this ``code``\ (see: f_code `here
+   <https://docs.python.org/3/library/inspect.html>`__) before and exits
+   early if it did.
+2) Checks if the code is an unsupported case.
+3) Checks if the ``cache_size`` (second arg above) crosses the limit
+   defined in the config, ``cache_size_limit``. If it has, the function
+   drops the frame and logs warnings. This helps to avoid constant
+   recompilation of a frame as it generally means that the frame is hot
+   in an unexpected way and caching it produces needless overhead,
+   as it is likely to get evicted the next time it is encountered.
+4) Passes the frame, alongside a function that creates an
+   ``InstructionTranslator`` through bytecode
+   transformation, via ``transform_code_object``. A few crucial things
+   happen under the hood here:
+
+   1) New code is produced through ``transform_code_object``.
+
+   2) An FX tracer named ``output`` is produced through
+      ``InstructionTranslator``. 
+
+      This can be a bit confusing,
+      as ``InstructionTranslator`` is not an `fx` tracer, but its stored
+      in a variable named tracer, and its output*\ **is**\ *an `fx`tracer.*
+
+   3) The function produces guards and stores them on ``output`` above.
+
+   4) The function produces ``output_instructions`` and stores them on
+      ``output`` above.
+
+   5) The function maps the newly produced transformed code to the initial code it
+      read off the frame. This mapping is worth remembering, we will
+      refer to it much later on below where we cover guard failures.
+
+5) Using the transformed code from 4.1 and the guards from 4.3
+   the function produces a `GuardedCode`.
+
+Now that we have learned about frame evoluation, let’s review
+``InstructionTranslator``, and see how it turns the frame we handed
+it over into TorchDynamo internal types.
+
+InstructionTranslator
+---------------------
+
+`InstructionTranslator` does a lot! We won’t cover the details of
+everything it does, but most importantly for this document, it produces
+a mapping of ``symbolic_locals`` which maintains a mapping from the
+frame’s f_locals to TorchDynamo internal Variable objects (more on these
+in a moment. ``symbolic_locals`` is filled via traversing the frame’s
+locals:
+
+.. code-block:: python
+
+   self.symbolic_locals = collections.OrderedDict(
+       (k, VariableBuilder(self, LocalSource(k))(f_locals[k]))
+       for k in vars
+       if k in f_locals
+   )
+
+We will get to how this works later, from a few other examples that lead
+us to understanding ``VariableTracker`` and ``VariableBuilder``. The
+important component here, for us, for now, is the invocation of a call
+into ``VariableBuilder``. ``VariableBuilder``\ ’s call implementation
+proxies into a function called ``_wrap``, which in turn both constructs
+instances of ``VariableTracker`` and calls ``make_guards`` on them. More
+on that later.
+
+This mapping, in turn, is critical as each Variable has associated
+guards, which are then passed to ``self.output``, the instance of
+``OutputGraph``, an fx tracer, mentioned in 4.2 of the section above. If
+you recall, this ``OutputGraph``, stored in a variable called ``output``
+is where our guards are stored before being passed on to become
+``GuardedCode``
+
+How does ``InstructionTranslator`` do this? At the heart of it, there is
+a loop that is pumped, which drives a function ``step``.
+
+``step`` is just that - a single processing step, taking exactly one
+instruction and doing *something* with it. Note: These are real
+instructions processed by TorchDynamo’s ``transform_code_object``, and
+it’s pretty cool. 
+
+.. note:: This section purposly skips the details of
+   `dis.get_instructions <https://docs.python.org/3/library/dis.html>`__,
+   and how we set up the ``Instruction`` class.
+
+For the toy example above, here is a snippet of a what a few
+``Instruction``\'s may look like:
+
+.. code-block:: python
+
+   Instruction(opcode=124, opname='LOAD_FAST', arg=0, argval='b', offset=32, starts_line=8, is_jump_target=True, target=None)
+   Instruction(opcode=100, opname='LOAD_CONST', arg=3, argval=-1, offset=34, starts_line=None, is_jump_target=False, target=None)
+   Instruction(opcode=20, opname='BINARY_MULTIPLY', arg=None, argval=None, offset=36, starts_line=None, is_jump_target=False, target=None)
+
+This is the core functionality of this function. Take a look at the ``opname``,
+and then take a look at this little snippet from inside ``step``;
+
+.. code-block:: python
+
+   if not hasattr(self, inst.opname):
+       unimplemented(f"missing: {inst.opname}")
+   getattr(self, inst.opname)(inst)
+
+As we can see, we check if the current class, the
+``InstructionTranslator`` has a attribute set matching the operator name
+(ex: LOAD_CONST). If it does, we invoke it, passing the whole
+instruction object in. If it does not, we drop the frame as
+unimplemented.
+
+For the LOAD_CONST example, we can see that we do indeed support it,
+with a relatively straightforward definition:
+
+::
+
+   def  LOAD_CONST(self, inst):
+   self.push(ConstantVariable(value=inst.argval))
+
+Passing over, for now, on the other details of ``InstructionTranslator``
+we can see that this function creates a new instance of the class
+``ConstantVariable`` , with a value, in our example case, -1, and then
+pushes it onto the stack.
+
+There are dozens of such methods - see symbolic_convert.py for all of
+them. Generally, we implement as many matching methods to python
+bytecode instructions as possible.
+
+Across both the logic downstream of ``step`` and the logic from invoking
+``VariableBuilder`` - we now have a lot of ``VariableTracker``\ s and of
+course, we’ve spoken about creating guards quiet a bit. Let’s dig into
+what Variables are, and get a little closer to understanding guards.
+
+Variables
+---------
+
+A ``ConstantVariable`` is an instance of\ ``VariableTracker``.
+``VariableTracker`` represents a tracked python local or stack value.
+
+When it comes to representing an object inside TorchDynamo, a
+VariableTracker does exactly what it says - it tracks a given variable.
+Its an extremely flexible class, but there are a few points to keep in
+mind:
+
+-  It manages the ``guard`` relationship around the underlying object
+   through:
+
+   -  `make_guard`
+   -  `replace_guards`
+   -  `add_guard(s)`
+   -  `propagate` - ``propagate(*vars: List[List["VariableTracker"]])`` -
+      Perhaps the most important of all, in that it combines guards from
+      all the provided VariableTracker instances passed in. It visits
+      the guards and combines the guards from these onto itself.
+
+-  It acts as a proxy on behalf of the underlying object, implementing
+   methods for the rest of TorchDynamo to get information about the
+   tracked object:
+
+   -  `call_method`
+   -  `call_function`
+   -  `python_type`
+   -  `as_proxy`
+   -  `is/as_python_proxy`
+
+-  It stores the variable ``source`` of type ``Source``, from
+   torchdynamo/source.py. This source type is a relatively self
+   contained class to help us organize and bookeep where the original
+   source came from, and helps provide convenience methods for things
+   like getting the name, and importantly for us, producing guards.
+
+And this class (``VariableTracker``) is built around subclassing,
+somewhere between a full Abstract Base Class and fully fleshed out class
+- it leaves many methods raising NotImplementedError - with reliance on
+subclasses (see: torchdynamo/variables/ for all subclasses) to fulfill
+contracts and custom behaviors.
+
+Knowing what we know now, we can see an example of how an instruction
+from ``dis``, ``BUILD_TUPLE``
+
+   BUILD_TUPLE(count) Creates a tuple consuming count items from the
+   stack, and pushes the resulting tuple onto the stack.
+
+In our case, our signature will be a *little* different due to the way
+we create ``Instruction`` objects, but the gist of it will be the same.
+Instead of passing in ``count``, we pass in an object with a little
+extra bookkeeping, and of course, we deal with turning regular old
+python objects into TorchDynamo notions:
+
+::
+
+   def BUILD_TUPLE(self, inst):
+       items = self.popn(inst.argval)
+       options = VariableTracker.propagate(items)
+       self.push(TupleVariable(items, **options))
+
+What is happening here? 1) We read argval, which in this case, is
+analogous to ``counts`` in the pydoc for the equivalent instruction.
+
+2) We ``popn`` the items, in this case, the signature is
+   ``def  popn(self, n: int) -> List[TensorVariable]:`` this hints at an
+   underlying contract - we are returning ``TensorVariables``. If we
+   take a closer look at sybmolic_convert.py and
+   ``InstructionTranslatorBase``/``InstructionTranslator``\ we see that
+   the only thing pushed onto and popped from our stack are
+   ``VariableTracker``\ s.
+
+3) We call ``VariableTracker.propogate`` (remember it, from above?) This
+   takes the guards from every single item popped off the stack in 2,
+   and recursively traverses it and combines all the guards into
+   ``options``: ``py  return {      "guards": guards,  }``
+
+4) We then make a new instance of a ``VariableTracker``,
+   ``TupleVariable``\ out of the ``items`` and ``options``. This then
+   allows us to install all the appropriate guards from the ``items``
+   that make up the new ``TupleVariable``
+
+Note: You may wonder - where did the first guards come from? Propagation
+is good and all, but don’t we need something created before it can be
+propagated. Yes! Remember that ``VariableBuilder`` above? It calls
+``make_guards`` as it creates ``VariableTracker`` instances, from
+``f_locals``. This in turn calls into the ``source``, to have it create
+guards.
+
+After all this, bytecode translation is done and we are one step closer
+to producing ``GuardedCode``. We now understand how locals become
+``VariableTracker``\ s, how instructions are handled, and where guards
+are called on for creation. Before we can go into seeing how code and
+guards are combined into a GuardedCode object, we need to dig a little
+bit into those ``make_guard`` and ``source.make_guard`` calls above. We
+can then understand, really, what was going on when we made guards
+alongside, and on, ``VariableTracker`` instances.
+
+Making Guards
+-------------
+
+Guards are just python objects, of the class ``Guard``, however, theres
+a good amount of detail around this little class.
+
+Looking at the definition of the dataclass (and therefore, ctor
+signature), we see that it has a name, a source, and a create function.
+
+::
+
+   @dataclasses.dataclass
+   class Guard:
+       name: str
+       source: GuardSource
+       create_fn: Callable
+
+The name should be the name of the variable.
+
+The source here is an enum indicating what *kind* of source the guard
+belongs to [Note: not to be confused with ``Source`` and the other types
+in source.py, as stored on ``VariableTracker``, as discussed above]
+
+And create_fn is the heart of how we go from having this simple
+dataclass to actually producing valid python code to be invoked for
+knowing whether or not things have changed in between invocations, and
+whether we can safely read from the code cache or not (In case you
+forgot what all this was for!)
+
+The most common code paths for getting an instance of a guard are
+through ``make_guards`` on ``VariableTracker``.
+``make_guards``->``source.make_guard``->``return Guard(self.name(), self.guard_source(), fn)``
+
+Or, in a concrete example:
+
+.. code-block:: python
+   
+   ...
+   elif istype(value, range):
+       guards = self.make_guards(GuardBuilder.EQUALS_MATCH)
+       return RangeVariable(value=value, guards=guards)
+
+Since ``source`` was set at the construction time of this
+``VariableTracker``, all that was needed here was to provide the fn,
+``GuardBuilder.EQUALS_MATCH`` to the ``create_fn`` field.
+
+This ``create_fn`` must be a method on ``GuardBuilder``. The reason for
+this becomes apparent in our next step. Once we have all the guards
+created for a frame, we move on to ``CheckFunctionManager`` and
+``compile_check_fn``.
+
+Remember that ``convert_frame`` function way above, in the first
+section? Before it can produce a ``GuardedCode``, it needs to run the
+``CheckFunctionManager``, with all the guards, to produce a ``check_fn``
+which will then, in turn get passed in alongside the code into
+``GuardedCode``. This is the same ``check_fn`` that we store in our
+cache entry, and the same one we run to know whether or not to retrieve
+the code stored alongside. For reference, here is that code:
+
+.. code-block:: cpp
+   
+   static CacheEntry *create_cache_entry(CacheEntry *next,
+                                         PyObject *guarded_code) {
+     CacheEntry *e = (CacheEntry *)malloc(sizeof(CacheEntry));
+     DEBUG_NULL_CHECK(e);
+     e->check_fn = PyObject_GetAttrString(guarded_code, "check_fn");
+     NULL_CHECK(e->check_fn);
+     e->code = (PyCodeObject *)PyObject_GetAttrString(guarded_code, "code");
+     NULL_CHECK(e->code);
+     e->next = next;
+     return e;
+   }
+   
+We now know how a ``check_fn`` function is used, and who makes it, and
+what it is composed of, but what we do not yet know is how. How does a
+list of ``Guard`` objects become a function we can run later on?
+
+First, we iterate these guards:
+
+.. code-block:: python
+
+   for guard in sorted(guards or [], key=Guard.sort_key):
+       if not config.guard_nn_modules and guard.is_nn_module():
+           continue
+       guard.create(local_builder, global_builder)
+
+Calling ``guard.create`` runs that ``create_fn`` we set on the ``Guard``
+class above (don’t confuse it with the ``check_fn`` we are working on
+producing, the names are similar, so it can get a little confusing). In
+our example above, our ``create_fn`` is ``GuardBuilder.EQUALS_MATCH``.
+So we are now invoking it, passing in the ``self``, the guard itself,
+in.
+
+The signature is: ``def EQUALS_MATCH(self, guard: Guard):``
+
+And internally to that function, we can use the ``name`` on the guard to
+get back our original object, querying it for data and type information,
+which in turn gets us to the most important bit: appending code.
+
+At its simplest, ``EQUALS_MATCH`` appends just one line of code:
+``self.code.append(f"{ref} == {val!r}")``. Where ``ref`` is the name of
+the variable, and val is the value. It might produce code like this:
+
+.. code-block::
+
+   y == 2
+
+Pretty simple, but if we append a few other kinds of ``GuardBuilder``
+functions on (For a more complex case), and then combine them all with
+``and`` in between each statement (as we do), we might get something
+like this:
+
+.. code-block::
+
+   ___guarded_code.valid and ___check_type_id(y, 94367738391392) and y == 2 and ___check_tensors(x)
+
+Now we’re talking! Let’s see what we have here: 1) A check for
+``.valid`` (we will come back to invalidation later on) 2) A type id
+check 3) A value check 4) A tensor check
+
+This becomes the heart of the code our ``check_fn``, which in turn, as
+you recall, is evaluated the **next** time we encounter this code. It
+will then check:
+
+1) Is this code still valid?
+2) If (1), Does ``y`` still have a type of ``94367738391392``?
+3) If (2), is ``y`` still 2?
+4) If (3), let’s check on if tensor ``x`` changed in some specific ways
+
+If all of these are still true, then we can use the code cached
+alongside this ``check_fn``! Joyous day! [Note: a deeper dive for how
+and where this happens if saved for a later writeup, but reading
+``static PyCodeObject *lookup(CacheEntry *e, PyObject *f_locals) {`` of
+``_eval_frame.c`` is a good place to start for the inquisitive reader
+who has made it thus far].
+
+If not, then, we can move on to recompiling the code anew, and storing
+that in the cache alongside this code, and a whole new ``check_fn``,
+again to be checked on yet another subsequent frame.
+
+There are lots of other such functions on ``GuardBuilder`` which get
+coalesced into, at times massive, strings which then get evaluated as
+python code and stored into ``check_fn``. Our example above is
+illustrative of a simple case, but I urge you to read the other
+functions on ``GuardBuilder``, or better yet, dump the ``code`` variable
+in ``compile_check_fn`` to really see what’s getting produced,
+especially on larger, real models!
+
+Summary
+-------
+
+In this, we have glossed over: - The role of ``.valid`` and invalidation
+around weak references (and potentially soon to be NN Module
+invalidations) - How the C++ side of guard functions
+(``___check_type_id``, ``___check_tensors``, etc) operate - What happens
+when guards fail? - What happens if we produce invalid guard code?
+
+Despite all that, I hope this has been a useful read. We covered how
+user provided code, wrapped in a TorchDynamo context goes on to get
+traced and tracked internally, organized into ``VariableTracker``\ s
+``Source``\ s and subsequently ``Guard``\ s, and how those ``Guards`` in
+turn guide cache entry selection and invalidation when handing Python
+code.
diff --git a/docs/source/dynamo/index.rst b/docs/source/dynamo/index.rst
new file mode 100644
index 0000000000000..d34f6a7d27552
--- /dev/null
+++ b/docs/source/dynamo/index.rst
@@ -0,0 +1,44 @@
+TorchDynamo Documentation
+=========================
+
+**TorchDynamo** is a Python-level JIT compiler designed to make unmodified
+PyTorch programs faster. TorchDynamo hooks into the frame evaluation API
+in CPython (`PEP 523 <https://peps.python.org/pep-0523/>`__) to
+dynamically modify Python bytecode right before it is executed. It
+rewrites Python bytecode in order to extract sequences of PyTorch
+operations into an `FX Graph <https://pytorch.org/docs/stable/fx.html>`__
+which is then just-in-time compiled with a customizable backend.
+It creates this FX Graph through bytecode analysis and is designed to
+mix Python execution with compiled backends to get the best of both
+worlds: usability and performance.
+
+TorchDynamo makes it easy to experiment with different compiler
+backends to make PyTorch code faster with a single line decorator
+``torch._dynamo.optimize()``
+
+.. image:: ../_static/img/dynamo/TorchDynamo.png
+
+For more information about `TorchInductor`, one of the backends
+supported by `TorchDynamo Graph <https://pytorch.org/docs/stable/fx.html>`__
+into `Triton <https://github.com/openai/triton>`__ for GPUs or
+`C++/OpenMP <https://www.openmp.org/>`__ for CPUs. We have a
+`training performance dashboard <https://github.com/pytorch/torchdynamo/issues/681#issuecomment-1233828468>`__
+that provides performance comparison for different training backends. You can read
+more in the `TorchInductor post on PyTorch
+dev-discuss <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__.
+
+.. seealso::
+
+   * `TorchDynamo deep-dive video <https://www.youtube.com/watch?v=egZB5Uxki0I>`__
+   * `dev-discuss topics <https://dev-discuss.pytorch.org/search?q=TorchDynamo%20order%3Alatest>`__
+
+.. toctree::
+   :hidden:
+
+   installation
+   get-started
+   guards-overview
+   custom-backends
+   deep-dive
+   troubleshooting
+   faq
diff --git a/docs/source/dynamo/installation.rst b/docs/source/dynamo/installation.rst
new file mode 100644
index 0000000000000..21f010951820a
--- /dev/null
+++ b/docs/source/dynamo/installation.rst
@@ -0,0 +1,83 @@
+Installing TorchDynamo
+======================
+
+This section describes how to install TorchDynamo.
+
+Requirements and Setup
+----------------------
+
+Python 3.8 is recommended. Python 3.7 through 3.10 are supported and
+tested. Make sure to have a development version of Python installed
+locally as well.
+
+TorchDynamo is included in the nightly binaries of PyTorch. You can
+find more information `here <https://pytorch.org/get-started/locally/>`__
+
+Install GPU/CUDA version requirements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To use GPU back ends (and in particular Triton), please make sure that
+the CUDA that you have installed locally matches the PyTorch version you
+are running.
+
+The following command installs GPU PyTorch+TorchDynamo along with GPU
+TorchDynamo dependencies (for CUDA 11.7):
+
+.. code-block:: python
+ 
+   pip3 install numpy --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
+
+CPU requirements
+~~~~~~~~~~~~~~~~
+
+There are no additional requirements for CPU TorchDynamo. CPU
+TorchDynamo is included in the nightly versions of PyTorch, which, for
+reference, can be installed with the following command:
+
+.. code-block:: shell
+
+   pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+
+
+Install from local source
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Build PyTorch from source:
+https://github.com/pytorch/pytorch#from-source, which has TorchDynamo
+included.
+
+To install GPU TorchDynamo dependencies, run ``make triton`` in the
+PyTorch repo root directory.
+
+Verify Installation
+~~~~~~~~~~~~~~~~~~~
+
+If you built PyTorch from source, then you can run the following
+commands (from the PyTorch repo root directory) that run minimal
+examples to check that TorchDynamo is installed correctly:
+
+.. code:: shell
+
+   cd tools/dynamo
+   python verify_dynamo.py
+
+If you do not have the PyTorch source locally, you can alternatively
+copy the script (``tools/dynamo/verify_dynamo.py``) from the PyTorch
+repo and run it locally.
+
+Docker installation
+-------------------
+
+We also provide all the required dependencies in the PyTorch nightly
+binaries which you can download with
+
+.. code-block::
+   
+   docker pull ghcr.io/pytorch/pytorch-nightly
+
+And for ad hoc experiments just make sure that your container has access
+to all your GPUs
+
+.. code-block:: bash
+   
+   docker run --gpus all -it ghcr.io/pytorch/pytorch-nightly:latest /bin/bash
diff --git a/docs/source/dynamo/troubleshooting.rst b/docs/source/dynamo/troubleshooting.rst
new file mode 100644
index 0000000000000..da73f90269279
--- /dev/null
+++ b/docs/source/dynamo/troubleshooting.rst
@@ -0,0 +1,665 @@
+TorchDynamo Troubleshooting
+===========================
+
+**Author**: `Michael Lazos <https://github.com/mlazos>`_
+
+TorchDynamo is still in active development, and many of the reasons for
+graph breaks and excessive recompilation will be fixed with upcoming
+support for `tracing dynamic tensor
+shapes <https://docs.google.com/document/d/1QJB-GOnbv-9PygGlOMXwiO9K6vVNm8sNg_olixJ9koc/edit?usp=sharing>`__,
+more careful choices for guards and better tuned heuristics.
+
+In the mean time, you may need to diagnose a particular issue and
+determine if it is easy to work around with a change to your model, or
+file an issue for support.
+
+Also, we are actively developing debug tools, profilers, and improving our
+errors/warnings. Please give us feedback if you have an issue with this
+infra, or an idea for an improvement. Below is a table of the available
+tools and their typical usage. For additional help see
+`Diagnosing Runtime Errors <#diagnosing-runtime-errors>`__.
+
+.. list-table:: Title
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - Tool
+     - Purpose
+     - Usage
+   * - Info logging
+     - View summarized steps of compilation
+     - ``torch._dynamo.config.log_level = logging.INFO``
+   * - Debug logging
+     - View detailed steps of compilation (print every instruction traced)
+     - ``torch._dynamo.config.log_level = logging.DEBUG`` and
+       ``torch._dynamo.config.verbose = True``
+   * - Minifier for any backend
+     - Find smallest subgraph which reproduces errors for any backend
+     - set environment variable ``TORCHDYNAMO_REPRO_AFTER="dynamo"``
+   * - Minifier for ``TorchInductor``
+     - If the error is known to occur after `AOTAutograd`` find
+       smallest subgraph wich reproduces errors during TorchInductor lowering
+     - set environment variable ``TORCHDYNAMO_REPRO_AFTER="aot"``
+   * - Accuracy minifier
+     - Finds the smallest subgraph which reproduces an accuracy issue
+       between an eager model model and optimized model
+     - ``TORCHDYNAMO_REPRO_AFTER=<"aot"/"dynamo"> TORCHDYNAMO_REPRO_LEVEL=4``
+   * - ``torch._dynamo.explain``
+     - Find graph breaks and display reasoning for them
+     - ``torch._dynamo.explain(fn, *inputs)``
+   * - Record/Replay
+     - Record and replay frames which to reproduce errors during graph capture
+     - ``torch._dynamo.config.replay_record_enabled = True``
+   * - TorchDynamo function name filtering
+     - Only compile functions with the given name to reduce noise when
+       debugging an issue
+     - set environment variable ``TORCHDYNAMO_DEBUG_FUNCTION=<name>``
+   * - TorchInductor Debug logging
+     - Print general TorchInductor debug info and generated Triton/C++ code
+     - ``torch._inductor.config.debug = True``
+   * - TorchInductor Tracing
+     - Show time taken in each TorchInductor stage + output code and graph
+       visualization
+     - set the environment variable TORCHINDUCTOR_TRACE=1 or
+       ``torch._inductor.config.trace.enabled = True``
+
+Diagnosing Runtime Errors
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Below is the TorchDynamo compiler stack.
+
+At a high level, the TorchDynamo stack consists of a graph capture from
+Python code (TorchDynamo) and a backend compiler. In this example the
+backend compiler consists of backward graph tracing (AOTAutograd) and
+graph lowering (TorchInductor)*. Errors can occur in any component of
+the stack and will provide full stack traces.
+
+You may use info logging
+(``torch._dynamo.config.log_level = logging.INFO``) and look for
+``Step #: ...`` outputs in order to determine in which component the
+error occurred in. Logs are made at the beginning and end of each step,
+so the step that an error should correspond to is the most recent logged
+step whose end has not yet been logged. The steps correspond to the
+following parts of the stack (according to the image above):
+
+==== ================
+Step Component
+==== ================
+1    TorchDynamo
+2    Compiler Backend
+3    TorchInductor
+==== ================
+
+The beginning and end of AOTAutograd is currently not logged, but we
+plan to add it soon.
+
+If info logging is insufficient, then there are also some backend
+options which can enable you to determine which component is causing the
+error if you’re unable to understand the error message that is
+generated. These are the following:
+
+-  ``"eager"``: only runs torchdynamo forward graph capture and then
+   runs the captured graph with PyTorch. This provides an indication as
+   to whether TorchDynamo is raising the error.
+
+-  ``"aot_eager"``: runs torchdynamo to capture a forward graph, and
+   then AOTAutograd to trace the backward graph without any additional
+   backend compiler steps. PyTorch eager will then be used to run the
+   forward and backward graphs. This is useful to narrow down the issue
+   to AOTAutograd.
+
+The general procedure to narrow down an issue is the following: 1. Run
+your program with the ``"eager"`` backend. If the error no longer
+occurs, the issue is in the backend compiler that is being used (if
+using TorchInductor, proceed to step 2, if not, see `this
+section <#minifying-backend-compiler-errors>`__). If the error still
+occurs with the ``"eager"`` backend, it is an `error while running
+torchdynamo <#torchdynamo-errors>`__.
+
+2. This step is only necessary if TorchInductor is used as the backend
+   compiler. Run the model with the ``"aot_eager"`` backend. If this
+   backend raises an error then the error is occurring during
+   AOTAutograd tracing. If the error no longer occurs with this backend,
+   then `the error is in
+   TorchInductor\* <#minifying-torchinductor-errors>`__.
+
+Each of these cases are analyzed in the following sections.
+
+\*Note on TorchInductor naming: The TorchInductor backend consists of
+both AOTAutograd tracing and the TorchInductor compiler itself. We will
+disambiguate by referring to TorchInductor as the backend, and
+TorchInductor lowering as the phase which lowers the graph traced by
+AOTAutograd.
+
+Torchdynamo Errors
+------------------
+
+If the error that is generated occurs with the ``"eager"`` backend, then
+torchdynamo is the most likely source of the error. Here is example code
+which will generate an error.
+
+.. code:: py
+
+   import torch
+
+   import torch._dynamo as dynamo
+
+
+   @dynamo.optimize("eager")
+   def test_assertion_error():
+       y = torch.ones(200, 200)
+       z = {y: 5}
+       return z
+
+
+   test_assertion_error()
+
+Which will generate the following error:
+
+::
+
+   torch._dynamo.convert_frame: [ERROR] WON'T CONVERT test_assertion_error /scratch/mlazos/torchdynamo/../test/errors.py line 26 
+   due to: 
+   Traceback (most recent call last):
+     File "/scratch/mlazos/torchdynamo/torchdynamo/symbolic_convert.py", line 837, in BUILD_MAP
+       assert isinstance(k, ConstantVariable) or (
+   AssertionError
+
+   from user code:
+      File "/scratch/mlazos/torchdynamo/../test/errors.py", line 34, in test_assertion_error
+       z = {y: 5}
+
+   Set torch._dynamo.config.verbose=True for more information
+   ==========
+
+As the message suggests you can set
+``torch._dynamo.config.verbose=True`` to get a full stack trace to both
+the error in torchdynamo and the user code. In addition to this flag,
+you can also set the ``log_level`` of torchdynamo through
+``torch._dynamo.config.log_level``. The available levels are the
+following: - ``logging.DEBUG``: Print every instruction that is
+encountered in addition to all below log levels - ``logging.INFO``:
+Print each function that is compiled (original and modified bytecode)
+and the graph that is captured in addition to all below log levels -
+``logging.WARNING`` (default): Print graph breaks in addition to all
+below log levels - ``logging.ERROR``: Print errors only
+
+If a model is sufficiently large, the logs can become overwhelming. If
+an error occurs deep within a model’s python code, it can be useful to
+execute only the frame in which the error occurs to enable easier
+debugging. There are two tools available to enable this: - Setting the
+environment variable TORCHDYNAMO_DEBUG_FUNCTION to the desired function
+name will only run torchdynamo on functions with that name. - There is a
+record/replay tool (set
+``torch._dynamo.config.replay_record_enabled = True``) which dumps an
+execution record when an error is encountered. This record can then be
+replayed to run only the frame where an error occurred.
+
+TorchInductor Errors
+--------------------
+
+If the error doesn’t occur with the ``"eager"`` backend, then the
+backend compiler is the source of the error (`example
+error <https://gist.github.com/mlazos/2f13681e3cc6c43b3911f336327032de%5D>`__).
+There are `different
+choices <https://github.com/pytorch/torchdynamo/blob/0b8aaf340dad4777a080ef24bf09623f1aa6f3dd/README.md#existing-backends>`__
+for backend compilers for torchdynamo, with TorchInductor or nvfuser
+fitting the needs of most users. This section focuses on TorchInductor
+as the motivating example, but some tools will be usable with other
+backend compilers.
+
+Below is the portion of the stack which we are focusing on:
+
+With TorchInductor as the chosen backend, AOTAutograd is used to
+generate the backward graph from the forward graph captured by
+torchdynamo. It’s important to note that errors can occur during this
+tracing and also while TorchInductor lowers the forward and backward
+graphs to GPU code or C++. A model can often consist of hundreds or
+thousands of FX nodes, so narrowing the exact nodes where this problem
+occurred can be very difficult. Fortunately, there are tools availabe to
+automatically minify these input graphs to the nodes which are causing
+the issue. The first step is to determine whether the error occurs
+during tracing of the backward graph with AOTAutograd or during
+TorchInductor lowering. As mentioned above in step 2, the
+``"aot_eager"`` backend can be used to run only AOTAutograd in isolation
+without lowering. If the error still occurs with this backend, this
+indicates that the error is occurring during AOTAutograd tracing.
+
+Here’s an example:
+
+.. code:: py
+
+   import torch
+
+   import torch._dynamo as dynamo
+
+   model = torch.nn.Sequential(*[torch.nn.Linear(200, 200) for _ in range(5)])
+   @dynamo.optimize("inductor")
+   def test_backend_error():
+
+       y = torch.ones(200, 200)
+       x = torch.ones(200, 200)
+       z = x + y
+       a = torch.ops.aten._foobar(z)  # dummy function which errors
+       return model(a)
+
+
+   test_backend_error()
+
+Running this should give you this error (with a longer stack trace below
+it)
+
+::
+
+   Traceback (most recent call last):
+     File "/scratch/mlazos/torchdynamo/torchinductor/graph.py", line 246, in call_function
+       return lowerings[target](*args, **kwargs)
+     File "/scratch/mlazos/torchdynamo/torchinductor/lowering.py", line 185, in wrapped
+       return decomp_fn(*args, **kwargs)
+     File "/scratch/mlazos/torchdynamo/torchinductor/lowering.py", line 810, in _foobar
+       assert False
+   AssertionError
+   ... 
+
+`error with full stack
+trace <https://gist.github.com/mlazos/d6947854aa56d686800259a164c62100>`__
+
+If you then change ``@dynamo.optimize("inductor")`` to
+``@dynamo.optimize("aot_eager")``, it will run without error, because
+`the
+issue <https://github.com/pytorch/torchdynamo/blob/d09e50fbee388d466b5252a63045643166006f77/torchinductor/lowering.py#:~:text=%23%20This%20shouldn%27t%20be,assert%20False>`__
+is in the TorchInductor lowering process, not in AOTAutograd.
+
+Minifying TorchInductor Errors
+------------------------------
+
+From here, let’s run the minifier to get a minimal repro. Setting the
+environment variable TORCHDYNAMO_REPRO_AFTER=“aot” (or setting
+``torch._dynamo.config.repro_after="aot"`` directly) will generate a
+python program which reduces the graph produced by AOTAutograd to the
+smallest subgraph which reproduces the error. (See below for an example
+where we minify the graph produced by torchdynamo) Running the program
+with this environment variable should show nearly `identical
+output <https://gist.github.com/mlazos/0458ab828aa403c779fe73c012aa5982>`__,
+with an additional line indicating where ``minifier_launcher.py`` has
+been written to. The output directory is configurable by setting
+``torch._dynamo.config.base_dir`` to a valid directory name. The final
+step is to run the minifier and check that it runs successfully. A
+successful run looks like
+`this <https://gist.github.com/mlazos/e6ea41ccce68a7b1b8a7a09acb1b206a>`__.
+If the minifier runs successfully, it generates runnable python code
+which reproduces the exact error. For our example this is the following
+code:
+
+.. code:: py
+
+   import torch
+   from torch import tensor, device
+   import torch.fx as fx
+   from torch._dynamo.testing import rand_strided
+   from math import inf
+   from torch.fx.experimental.proxy_tensor import make_fx
+
+   # torch version: 1.13.0a0+gitfddfc44
+   # torch cuda version: 11.6
+   # torch git version: fddfc4488afb207971c54ad4bf58130fdc8a4dc5
+
+
+   # CUDA Info: 
+   # nvcc: NVIDIA (R) Cuda compiler driver 
+   # Copyright (c) 2005-2022 NVIDIA Corporation 
+   # Built on Thu_Feb_10_18:23:41_PST_2022 
+   # Cuda compilation tools, release 11.6, V11.6.112 
+   # Build cuda_11.6.r11.6/compiler.30978841_0 
+
+   # GPU Hardware Info: 
+   # NVIDIA A100-SXM4-40GB : 8 
+
+
+   from torch.nn import *
+   class Repro(torch.nn.Module):
+       def __init__(self):
+           super().__init__()
+
+
+
+       def forward(self, add):
+           _foobar = torch.ops.aten._foobar.default(add);  add = None
+           return (_foobar,)
+
+   args = [((200, 200), (200, 1), torch.float32, 'cpu')]
+   args = [rand_strided(shape, stride, dtype, device) for shape, stride, dtype, device in args]
+   mod = make_fx(Repro())(*args)
+   from torch._inductor.compile_fx import compile_fx_inner
+
+   compiled = compile_fx_inner(mod, args)
+   compiled(*args)
+
+The ``forward`` method of the ``Repro`` module contains the exact op
+which causes the issue. When filing an issue, please include any
+minified repros to aid in debugging.
+
+Minifying Backend Compiler Errors
+---------------------------------
+
+With backend compilers other than TorchInductor the process for finding
+the subgraph causing the error is nearly identical to the procedure in
+`errors in TorchInductor <#torchinductor-errors>`__ with one important
+caveat. Namely, that the minifier will now be run on the graph that is
+traced by TorchDynamo, not the output graph of AOTAutograd. Let’s walk
+through an example.
+
+.. code:: py
+
+   import torch
+
+   import torch._dynamo as dynamo
+
+   model = torch.nn.Sequential(*[torch.nn.Linear(200, 200) for _ in range(5)])
+   # toy compiler which fails if graph contains relu
+   def toy_compiler(gm: torch.fx.GraphModule, _):
+       for node in gm.graph.nodes:
+           if node.target == torch.relu:
+               assert False
+
+       return gm
+
+
+   @dynamo.optimize(toy_compiler)
+   def test_backend_error():
+       y = torch.ones(200, 200)
+       x = torch.ones(200, 200)
+       z = x + y
+       a = torch.relu(z)
+       return model(a)
+
+
+   test_backend_error()
+
+In order to run the code after TorchDynamo has traced the forward graph,
+the TORCHDYNAMO_REPRO_AFTER enviornment variable can be used. Running
+this program with TORCHDYNAMO_REPRO_AFTER=“dynamo” (or
+``torch._dynamo.config.repro_after="dynamo"``) should produce `this
+output <https://gist.github.com/mlazos/244e3d5b53667e44078e194762c0c92b>`__\ and
+the following code in ``{torch._dynamo.config.base_dir}/repro.py``.
+Note: the other option for TORCHDYNAMO_REPRO_AFTER are ``"aot"``, which
+will run the minifier after the backward graph has been generated.
+
+.. code:: py
+
+   import torch
+   import torch._dynamo as dynamo
+   from torch import tensor, device
+   import torch.fx as fx
+   from torch._dynamo.testing import rand_strided
+   from math import inf
+   from torch._dynamo.debug_utils import run_fwd_maybe_bwd
+
+
+   from torch.nn import *
+   class Repro(torch.nn.Module):
+       def __init__(self):
+           super().__init__()
+
+
+
+       def forward(self, add):
+           relu = torch.relu(add);  add = None
+           return (relu,)
+
+
+   mod = Repro().cuda()
+   opt_mod = dynamo.optimize("None")(mod)
+
+
+   args = [((200, 200), (200, 1), torch.float32, 'cpu', False)]
+   args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
+
+
+   with torch.cuda.amp.autocast(enabled=False):
+       ref = run_fwd_maybe_bwd(mod, args)
+       res = run_fwd_maybe_bwd(opt_mod, args)
+
+The minifier successfully reduced the graph to the op that raises the
+error in ``toy_compiler``. The other difference from the procedure in
+`TorhInductor Errors <#torchinductor-errors>`__ is that the minifier is
+automatically run after encountering a backend compiler error. After a
+successful run, the minifier writes ``repro.py`` to
+``torch._dynamo.config.base_dir``.
+
+Performance Profiling
+~~~~~~~~~~~~~~~~~~~~~
+
+Accessing TorchDynamo Profiler
+------------------------------
+
+TorchDynamo has a builtin stats function for collecting and displaying
+the time spent in each compilation phase. These stats can be accessed by
+calling ``torch._dynamo.utils.compile_times()`` after executing
+Torch._Dynamo. By default, this returns a string representation of the
+compile times spent in each TorchDynamo function by name.
+
+TorchInductor Debug Tracing
+---------------------------
+
+TorchInductor has a builtin stats and trace function for displaying time
+spent in each compilation phase, output code, output graph visualization
+and IR dump. This is a debugging tool designed to make it easier to
+debug/understand the internals of TorchInductor.
+
+Setting the environment variable ``TORCHINDUCTOR_TRACE=1`` will cause a
+debug trace directory to be created and printed:
+
+::
+
+   $ env TORCHINDUCTOR_TRACE=1 python repro.py
+   torch._inductor.debug: [WARNING] model_forward_0 debug trace: /tmp/torchinductor_jansel/rh/crhwqgmbqtchqt3v3wdeeszjb352m4vbjbvdovaaeqpzi7tdjxqr.debug
+
+Here is an `example debug directory
+output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
+for the test program:
+
+::
+
+   torch.nn.Sequential(
+           torch.nn.Linear(10, 10),
+           torch.nn.LayerNorm(10),
+           torch.nn.ReLU(),
+       )
+
+Note each file in that debug trace can be enabled/disabled via
+``torch._inductor.config.trace.*``. The profile and the diagram are both
+disabled by default since they are expensive to generate.
+
+A single node in this new debug format looks like:
+
+::
+
+   buf1: SchedulerNode(ComputedBuffer)
+   buf1.writes = 
+       {   MemoryDep(name='buf1', index=0, size=()),
+           MemoryDep(name='buf1', index=0, size=(s0,))}
+   buf1.unmet_dependencies = {MemoryDep(name='buf0', index=c0, size=(s0,))}
+   buf1.met_dependencies = {MemoryDep(name='primals_2', index=c0, size=(s0,))}
+   buf1.group.device = cuda:0
+   buf1.group.iteration = (1, s0)
+   buf1.sizes = ([], [s0])
+   class buf1_loop_body:
+       var_ranges = {z0: s0}
+       index0 = z0
+       index1 = 0
+       def body(self, ops):
+           get_index = self.get_index('index0')
+           load = ops.load('buf0', get_index, False)
+           get_index_1 = self.get_index('index0')
+           load_1 = ops.load('primals_2', get_index_1, False)
+           add = ops.add(load, load_1)
+           get_index_2 = self.get_index('index1')
+           reduction = ops.reduction('buf1', torch.float32, torch.float32, 'sum', get_index_2, add)
+           return reduction
+
+See the `example debug directory
+output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
+for more examples.
+
+Memory Profiling
+----------------
+
+TBD
+
+Graph Breaks
+------------
+
+Given a program like this:
+
+.. code-block:: python
+   
+   @dynamo.optimize(...)
+   def some_fun(x):
+       ...
+   some_fun(x)
+   ...
+
+TorchDynamo will attempt to compile all of the torch/tensor operations
+within some_fun into a single FX graph, but it may fail to capture
+everything into one graph.
+
+Some graph break reasons are insurmountable to TorchDynamo, and can’t be
+easily fixed. - calling into a C extension other than torch is invisible
+to torchdynamo, and could do arbitrary things without TorchDynamo being
+able to introduce necessary `guards <./GuardsOverviewPt1.md>`__ to
+ensure that the compiled program would be safe to reuse. Graph breaks
+can hinder performance if the resulting fragments are small. To maximize
+performance, it’s important to have as few graph breaks as possible.
+
+Identifying the cause of a graph break
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To identify all graph breaks in a program and the associated reasons for
+the breaks, ``torch._dynamo.explain`` can be used. This tool runs
+TorchDynamo on the supplied function and aggregates the graph breaks
+that are encountered. Here is an example usage:
+
+.. code-block:: python
+
+   import torch
+   import torch._dynamo as dynamo
+   def toy_example(a, b):
+       x = a / (torch.abs(a) + 1)
+       print("woo")
+       if b.sum() < 0:
+           b = b * -1
+       return x * b
+   explanation, out_guards, graphs, ops_per_graph = dynamo.explain(toy_example, torch.randn(10), torch.randn(10))
+   print(explanation)
+   """
+   Dynamo produced 3 graphs, with 2 graph break and 6 ops. 
+    Break reasons: 
+   1. call_function BuiltinVariable(print) [ConstantVariable(str)] {} 
+      File "t2.py", line 16, in toy_example
+       print("woo")
+    
+   2. generic_jump 
+      File "t2.py", line 17, in toy_example
+       if b.sum() < 0:
+    """
+
+Note on other outputs: - ``out_guards`` - a list of lists where each
+sublist contains the guards that must pass to ensure the traced graphs
+are valid - ``graphs`` - a list of graph modules which were successfully
+traced - ``ops_per_graph`` - a list of lists where each sublist contains
+the ops thatare run in the graph
+
+To throw an error on the first graph break encountered, ``nopython``
+mode can be used. This disables TorchDynamo’s python fallback, and only
+succeeds if the entire program is convertible to a single graph. Example
+usage:
+
+.. code-block:: python
+
+   @dynamo.optimize(<compiler>, nopython=True)
+   def toy_example(a, b):
+      ...
+
+Excessive Recompilation
+-----------------------
+
+When TorchDynamo compiles a function (or part of one), it makes certain
+assumptions about locals and globals in order to allow compiler
+optimizations, and expresses these assumptions as guards that check
+particular values at runtime. If any of these guards fail, Dynamo will
+recompile that function (or part) up to
+``torch._dynamo.config.cache_size_limit`` times. If your program is
+hitting the cache limit, you will first need to determine which guard is
+failing and what part of your program is triggering it.
+
+The `recompilation profiler <#recompilation-profiler>`__ automates the
+process of setting TorchDynamo’s cache limit to 1 and running your
+program under an observation-only ‘compiler’ that records the causes of
+any guard failures. You should be sure to run your program for at least
+as long (as many iterations) as you were running when you ran into
+trouble, and the profiler will accumulate statistics over this duration.
+
+If your program exhibits a bounded amount of dynamism, you may be able
+to tune the TorchDynamo cache limit to allow for each variation to be
+compiled and cached, but if the cache limit is too high you may find the
+cost of recompilation outweighs any optimization benefits.
+
+::
+
+   torch._dynamo.config.cache_size_limit = <your desired cache limit>
+
+Torchdynamo plans to support many common cases of dynamic tensor shapes,
+such as varying batch size or sequence length. It does not plan to
+support rank-dynamism. In the mean time, setting a specific cache limit
+can be used in coordination with bucketing techniques to achieve an
+acceptable number of recompilations for some dynamic models.
+
+.. code-block:: python
+
+   prof = dynamo.utils.CompilationProfiler()
+   @dynamo.optimize(prof)
+   def my_model():
+       ...
+   my_model()
+   print(prof.report())
+
+Accuracy Debugging
+~~~~~~~~~~~~~~~~~~
+
+Accuracy issues can also be minified if you set the environment variable
+``TORCHDYNAMO_REPRO_LEVEL=4``, it operates with a similar git bisect
+model and a full repro might be something like
+``TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4`` the reason
+we need this is downstream compilers will codegen code whether it’s
+Triton code or the C++ backend, the numerics from those downstream
+compilers can be different in subtle ways yet have dramatic impact on
+your training stability. So the accuracy debugger is very useful for us
+to detect bugs in our codegen or with a backend compiler.
+
+File an Issue
+~~~~~~~~~~~~~
+
+You should feel encouraged to `file a github
+issue <https://github.com/pytorch/torchdynamo/issues>`__ and expect a
+timely response.
+
+Before filing an issue, read over the `README <../README.md>`__,
+`TROUBLESHOOTING <./TROUBLESHOOTING.md>`__, and search for similar
+issues.
+
+When filing an issue, please include - your
+OS/python/pytorch/CUDA/triton info by running:
+
+.. code-block:: sh
+
+   python tools/verify_install.py
+
+-  A minimal repro script if possible, which can be generated by running
+   Minifier
+-  A description of the error
+-  the expected behavior
+-  A log (set ``torch._dynamo.config.log_file`` to a valid file name to
+   dump the logs to a file and
+   ``torch._dynamo.config.log_level = logging.DEBUG`` and
+   ``torch._dynamo.config.verbose = True``)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e4b6a124d6bdc..e43160f668fc7 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -42,6 +42,13 @@ Features described in this documentation are classified by release status:
 
    notes/*
 
+.. toctree::
+   :glob:
+   :maxdepth: 1
+   :caption: torch.compile
+
+   dynamo/*
+
 .. toctree::
    :maxdepth: 1
    :caption: Language Bindings

From c851f36a720dc600625c966f86a01bc324df4164 Mon Sep 17 00:00:00 2001
From: eqy <eddiey@nvidia.com>
Date: Mon, 28 Nov 2022 20:42:30 +0000
Subject: [PATCH 1336/1922] Tweak formatting of note on macros (#89598)

For readability when viewing the rendered file e.g., from the browser.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89598
Approved by: https://github.com/kit1980
---
 docs/source/notes/hip.rst | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/docs/source/notes/hip.rst b/docs/source/notes/hip.rst
index c54e201489705..103c5db7d460a 100644
--- a/docs/source/notes/hip.rst
+++ b/docs/source/notes/hip.rst
@@ -130,14 +130,21 @@ NOTE: The CUDA_VERSION macro, cudaRuntimeGetVersion and cudaDriverGetVersion API
 semantically map to the same values as HIP_VERSION macro, hipRuntimeGetVersion and
 hipDriverGetVersion APIs. Please do not use them interchangeably when doing version checks.
 
-Eg: Instead of
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-If it is desired to not take the code path for ROCm/HIP:
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(USE_ROCM)
-If it is desired to take the code path for ROCm/HIP:
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11000) || defined(USE_ROCM)
-If it is desired to take the code path for ROCm/HIP only for specific HIP versions:
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11000) || (defined(USE_ROCM) && ROCM_VERSION >= 40300)
+For example: Instead of using
+
+``#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000`` to implicitly exclude ROCm/HIP,
+
+use the following to not take the code path for ROCm/HIP:
+
+``#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(USE_ROCM)``
+
+Alternatively, if it is desired to take the code path for ROCm/HIP:
+
+``#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11000) || defined(USE_ROCM)``
+
+Or if it is desired to take the code path for ROCm/HIP only for specific HIP versions:
+
+``#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11000) || (defined(USE_ROCM) && ROCM_VERSION >= 40300)``
 
 
 Refer to CUDA Semantics doc

From 1dc600c8091cc53c64d810a9760e32858ef0733e Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 28 Nov 2022 21:04:33 +0000
Subject: [PATCH 1337/1922] Revert "Move Dynamo docs back to core (#89769)"

This reverts commit be2816db181cc4d9a1822feb1202dbd2e8c87918.

Reverted https://github.com/pytorch/pytorch/pull/89769 on behalf of https://github.com/clee2000 due to broke lint
---
 .../source/_static/img/dynamo/TorchDynamo.png | Bin 349490 -> 0 bytes
 docs/source/_static/img/dynamo/td_stack.png   | Bin 308321 -> 0 bytes
 .../img/dynamo/torchinductor_backend.png      | Bin 122529 -> 0 bytes
 docs/source/dynamo/custom-backends.rst        | 154 ----
 docs/source/dynamo/deep-dive.rst              | 145 ----
 docs/source/dynamo/faq.rst                    | 376 ----------
 docs/source/dynamo/get-started.rst            | 181 -----
 docs/source/dynamo/guards-overview.rst        | 513 --------------
 docs/source/dynamo/index.rst                  |  44 --
 docs/source/dynamo/installation.rst           |  83 ---
 docs/source/dynamo/troubleshooting.rst        | 665 ------------------
 docs/source/index.rst                         |   7 -
 12 files changed, 2168 deletions(-)
 delete mode 100644 docs/source/_static/img/dynamo/TorchDynamo.png
 delete mode 100644 docs/source/_static/img/dynamo/td_stack.png
 delete mode 100644 docs/source/_static/img/dynamo/torchinductor_backend.png
 delete mode 100644 docs/source/dynamo/custom-backends.rst
 delete mode 100644 docs/source/dynamo/deep-dive.rst
 delete mode 100644 docs/source/dynamo/faq.rst
 delete mode 100644 docs/source/dynamo/get-started.rst
 delete mode 100644 docs/source/dynamo/guards-overview.rst
 delete mode 100644 docs/source/dynamo/index.rst
 delete mode 100644 docs/source/dynamo/installation.rst
 delete mode 100644 docs/source/dynamo/troubleshooting.rst

diff --git a/docs/source/_static/img/dynamo/TorchDynamo.png b/docs/source/_static/img/dynamo/TorchDynamo.png
deleted file mode 100644
index 351689d80dc925ba5d1fbdc53c5f0ade693855e9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 349490
zcmZU*2{@JQ`aX<gh)_u?nJSfJ4jCg!rBX>KGb?2*WF}N9B(Dkyl}sV?RGFuQB=bC#
zA#-Jl_|ALp{XdTV+uwUQ-req9)_T_c-1l{z)AiifJbi5ax^3&IsHoN-S64kpMMW<`
zMMe9WfewFjx}C`b|3ho9a$Ji6|F|=l-NXOi>Tp!Y;k+%y!O8fBIn@;#TWj+J_NF(?
z&28*0Y#pZ76e;3G2gr+#+%PwGxN2*|t98}doJz~loL54OSK09juehkV7_X?9tfZLi
zK?z>XQ@on0T5-SMcvDgFQXN-S)^dJ6-tD4u-f2#0+G%4>+OA~d4b-|?n(SRk0}me7
z$TYWH>vVC5%RV1DDwUF=Z7H2S;SiqF(s0%CSc-O1b4zwYLf*N<s%-7^k-OM7GPLLR
z@_6Pe9igFlb=%dTzHD&kSY4gI;=H}$FxS&+GjVS=6$ZZB|NSp3=J%=6xBmNY{`G?7
zmqr5r_k|36<pO6#L;m-bCLw#nj{nblGNd18uif>3J_fI7+qI6e@&EVn8`nxO{_h7#
z-S-Lq-`{47P%Y8;-`~D>>Xngz%g8-d{>z2#TQ}aP6$nX8mG=9eYjn6}{>p;I<=);W
zCz5s2X^tK}%D~KA?lx86w_&H+t+M{#vvC{#?>j%bIsV6W4TH<0YuB#XO?DlUl#=3?
zliO}!V9@r?asTDZm;bmn{?B(-m(!v=mDkGBJK!L@gX(Z<>(2fAnOqABgK{7I&*kVG
zDbe_;dEL&A%D=*7rT6Ps6_5X20xPZ5r{T?`4l?#b%s<A)#I7~%4d6Po@#amf>YP0%
zPMokCs@?i+c(}|$vb3>rlP^7MCC|z<Jp(OG<}I$#L;06aW2v~$ol9gtb?TJejT<zf
zHEW~m<5%+kdr?hHT@QRsXug*2@{7Wy<jHm5z=7$RnUbAB@;(**TTDxDGqN|0-+lg^
zD_Q?NM}B_3#kFfb)n+~+A)C6oyVJc^J%tQDaGp_^WuJ8!r;cf`nDhEnyX#y3uF#s9
zpEV*eZ{Do4lG=Xr=1n_C$D_{9G6H9!0$v{RD{XDv;;tCY5%k}SqRwLy?z-=vOs6S>
zWWyrv?Cfj}yL<J+JB5UpO)CRRevdY8*s_J*%gZaRU8_51r(Oz2z2cG(3y&gmpqlB7
zqf|uTqepDzRaK_l?;IJ{t$X#ief^<Bhr;At#Gge(P*VwUFdaO2aC&||{?=MP*5Hf`
z@c<qr?!dsn+F=<k7H4^G8XB6u$w`(gSFTJu%P;)-LoRY<Wd*y$Oc7sp^qum5-|Tta
z)Yy}gow=+kFTV)Yq~zysmA2_ReD>@HVf}ZvZ{4CQFE2M5D8J9C;410k>${Jw_465}
zzqiZFRV}jhk_@G{1@7Iyzbr^;<=&q^u2wxCwwB#pAM^E|ev{&1EAj9bT7pX!mJECz
zM`IdvE?-$sx90V^efarxh{bOWE^w%<x%tWB^zYxQc6OrtJ9lo_y!lRlnO|9Z`=W9Y
z`+x7^Mg6gCqqxjWd%vxMDJdy?g@ja%jrqhSB(DEx+J~(|OHaR3K!8q6OpH^^g!aRS
z58OY5wuEckO)dQMVBdkxt@)Q9N?iGTSU7a>?|g4n`Tb3ak8QVDNN)4@I7#iO!mTLJ
z%_=M`>@*Q!Day$kTEiDwbD)r7Ys)GvE$!^=T#S9`>FLRN@EW_osb{yVL-zRAeDA3b
zJ^bG*$-4G4PT{<P?K1zxix;0OdTbMX@}_a>d{VQ8m6fWw`R)e~9@zaFI(50gbu%L)
zqx$jV%8rf)xwyFcCMN!VSsQ*jO8$PNoRdn5c7#VyMa9~Nh6X!(dtVI!Q-L0=+?%AN
z;K<0pFJCl2IZXKc^Up>u8M|VkBldOA4&r5Q32R1g%UeBv{=9E&jOnE&!+lkn0)yU(
zUSr8^FV7|NJ~?sr;zhr&U(bCT8XCy$^(<>nx`;2iVdF-cJ$v@ND<m8ETj7_nSqmE*
zwe#mUG1IN#+Ogwmm-0ssGrU9dwU>>fMv~h~Owuj5@BZt>)FatZ;p^_u7<&CZsJPI#
zCiB)Mr@VLXN^5GE$Zlh0C98~`?)UNIN55AAKO$@k?;p676Sh|3Sm3rht%hDZUcP*Z
z5B&J)Q)#H7=Y4Es&OkM)+$F`z^76jMGfEXriRbqn$=nowCc5&U-6z9@_H+2JP1j+z
zwVhJz$tZlOCD{M%n<}f}x0g*F9S?kbXi(S0OEj|e0<jgFT3aQ&Pls#JC2Fm^uj*4-
zS;<m!tGBwOgnA7<i>ihObCk=_*4ikA?Q?T;^Gmx)#r($is&Q0POY3&Xo-_PPN<7%f
zWdU0y*tocuSy_Yp2DUzXM31KPvapbw91%;){m&yKO9p59`unLBqgnAI9LnV8<|e!P
zuU+A)o4Qtl`bB#BhKp}6E4#YNmYaxa32w%146h!TT;6x=*s+DhMLu3$DpK)Y9}i*U
z;aPqES@ClIjopQ-D{caokE#ZZdka181qJz`PEoyxja{3VnAnzgedpJ&U&9!kIsET5
z#2gzWOZBZfWZ#Q+@0epjCW}A6Ylt>hS2LpU;AD*t9&k%Lcm8}j%F*M{(6X|!HD}~!
zXoo&WG95csRoEk`^5?NZs^7W9+bS<_;oo{GchzFN@q<5{i1ktB_g3Gf;^J~>-MV$y
zymc%!zH%}$M$>&I&I9-NOxJs@Y?9|rWi{ALx}K+PfZ9&HaC)YS#}rpC%E{bkC&R$R
zR2FhZ@qWJjut{IB_w@8MRcKAD)hQKM?-Q}5D}R?rg*}p}MU&T^Q;FS!BB^}&G8aET
zf8X#hJ$Dd2R!T-jrntKL{-T|o-HXF}$Y-gu{=O!<DX%~_3o9)6WSwgPH|oNp6R{ea
z5+Wk3i;Ig12?--Nbd_*Zl$4a1xn$Tj31}2QKV&a@eW1Kozwdj)F`kOV^YJelN2#d1
zmIry#uhetntcQ6-8>AkiPz0oGd#vXEqU>W;E30_C0`l@?)>-IX?<=NWp0M>gf(<}b
z+}*t+Ha3>^f@tRJ*R=Ht^IVzFntFP;`}_OD#Ld@nh!_N*SgcNdKA`fh#zJzSCFPQx
zgM+H7>h1R1zF6dQiK-`0GNB@F+rIt!Sj&N3d-e?e{9dEIWeSDcq~X<3a<@Me6kJsa
zCWmfbRXEgiezMznFi^Ty&w?s9nrY3NH5+(%g2E-Pn3Uh!F#Yo<O;FXE(3-#9!Yl&c
zJS}Z(DskHV4_dZ3{PEyop`V<b1RyNFMZJcNo&7c%&F$N_c|}FpxVgFQZr;3`l(he{
zzW&t`>VGZqmfx;fHTX6-&m7UVWBXg?_%_}T>!aA%2&0zCj^`$xOVl(lc=+?@jc?z+
z@rsFUK>gLpve-bWd$v|Mw6jwr=8V>b3nfOPqN~xej&GV<DOaxeiJ4aRm9Z+_4h`Kh
zH_=I-qRkZ3a4#T$IaxQ8j>mnPno=94OkH=g=~}k#qkiLYbElbsqnC?3cc9i#y=cA`
z92)BP<Hxnvad9Ttg`84WoOG<*0r&1PsH&-zwzua8<-S{0KXb;fpg`eOOw82^Yo(jt
zK5lTIZN4ZPuCeef<c#Reah;*R4h{~tV`6s7%gd7kPQ@ufVaA#8$5-K!yUFaw$|F_N
z!hT=1t;kCWn|ft7)vKhtyPNV>!6cm%o9o`ghj;z_{OYVOdbPG3xj%<J`DjU5@X4#J
ztS0_Pj9rEF7cN{d9{wEJ_TEW^ZvD2~R~zGuzrH?x@4*9~`1n0&(_LL%78`c*br+kY
z7x&v0Q!>MIlv+5M>C|^^#81~d$C`0V6el(YT25Jqy=o7+?w^fO;}aBI*VEI3Gjz+}
zpDB>%Utlx2@56kL+mymRC9{8bEYc6xt~Dmhqv$SE8T9Y%v9i3#DR)z7`S0I`uU~zJ
zhA2t9mr#&ytMHM=G+4R*b!==&L&JuWl9Io&cJa;EW=iOJM7R1<O)vkdcWM)PTw$Wj
z%VyM;W^Cu=R5d%+imnjO%>rz~z`~-YuFkMlYx2>vXAgi41R|QNs_2unQ%pWBC^B!{
zwCOtTp?|Ff?)6K}djkU|WV`I%y>_@h`k0CejZIe`b+9@uYNxZS>(^_OEWoz%OXHa`
zuD|!v(omO{mX0is#p65%;Jvl9w0Kc8H|;q+{+oO7+c#QUTie@r?(oUVZo>h;r6F+S
z=+U(j4KEukG|!!*xufEZV;eF0G9UHN{m*E8p35%*pf`>EWm664fdns(x23N^lTov<
zAfGmI9tW-9K^-OL?OT>m5yQ!>=-dX2+p7G^YHD;3cgWo}^jus^8i2L+0rqukfsdQ6
z9n=xx7@wH9`uvRg9{drN-lpdR;~f>_9L|CyY@cs84t=Vr3I69FAJeLb{R=aLAM5M=
zP+d{h$H&L}Mn=>JkKt2qKa!JAGorYm=6wGA8E09=$cVSJtc<{-bki!6Kfgy4l9G;G
zxUf0zvM~0~3+<#jPlNbIG*aq{ii(f5wW<aN-0mJ8{lmkpB0R-Ee*Bn@UR|ydlAFD=
z#o(}}re=I<6`)LTaBx~~ZdJBrYZ%oNlz04+fJPXV_u(%h^VNUtW`A6GuH+TPNTH*n
zGpTs68P)ClUiJZ8_Kwr8*r9vb*I|b?wY0oK&BIy1X*+iA+{S<Y`A1V%ml4R0hlgif
zOSdv=)!#p}d-v^A#s{oS`zdM0T~f+6E~AUPoSd2}vhq7#345*otx}aqIuPr_v@}tm
z&0BSp!J)c{T2()_7-is4e19>sk8BY#UY>tO_YiD<=gytJ8~L0{p6*2kTuq^1HPb;}
z%WLl6zyGnO<_KUR_Oqz@C-&(@_jep9o1+IGK75GZU|V+wVAV9rc2X@-3kQ`Qcek=$
zX*Gb{6jWw352JJ)(QSc7neAAg_$3a$2|F1s(mIa#vE0#4x|5kHvAn$8H#NnIuZXuN
zc&EL+{go2s*p4U-flav2KY#uteCF~8XVR6TqM}e%<HoQ4>+Y^LUMZ=a!v-NRZFKZL
z-jxqU6U+vc`BJV;1?UP5x}Bt?WZ09hVw}uu{uMyF<+xEVH21P?-P%5-o&MzMQxmiR
zO6o**tATH7>Xg2rAIkP!mL2N=eQIkpC|@rNal`<n*bmh%c0V2{xQOp5Rda$FeK^Cs
zhWiaYEv}wa)F)4#L_V#|57A2@kPpb@EYJ_GqyN*B{dso%G^+d?P^jb;6qcF~IVs^#
zT6MqMd@Xw$N+4;{T3U}!(I%_6f{yeL1}WV=mq<%ROG|4s`ZZR69VN#&vweDY_NbN?
zJE}}ZW@fl$=80I9Q>WHH71A}?#I%o;g^7-Xcw2nmzErQjV@DiMjh{Le#9e`77G`=Z
zwKeA5yKNwSVTbGmCN7d=`KhyWJ0Bn4BlTThEF|r&U-w281cEja51)K?kRrK_R5l>N
zT>=6FZRw^T>+0@Y7QRyWx!-PY+VPJ&@JhA~8@$oWxk#&y@>+2r6#C>-p%OG7(W_r~
z1#l~D+o={%{D|)e8=$fGVJaslCxC)m^}`asF3|aw2z_jqDssdLc^w~L29S={sO9>U
z^k&-mmscpZq}-o>b|$8wxUOzJZa~QgmtUG^&w8JaKi&T^cqg`CZSfi5(47hjJ7#BR
z0ga23QVSonwY7O}QcZihyET$JDLR_Rz+?8p$L}}ByWfY7zH|X)-?eL(QT0E&yPw@#
zEx1+`-QCsYx~yp=%K0cXl>W(yJ7HmAo|#@A?(Wn5emo{s4|lZX+VV)~He3L?#NA0q
zOjOp^<^T#L;3a@lj3!9&&)&E*(LJ_vM<W2d?^T#EZx#O*<+e3PKLq<2_%h9T#w23j
zo)holUcD+E*J~(jwQ&N;0`TZ5u8WWke)jB>hG0+Lb>Dqs*G|qSzIw%vyI2f5C!pba
zIQE<_*M59<(eh6w()<8F+El3`*i=4Ft%<0s#I2c`Z8h8}ExiR>(BkS<Wz^CeH*R2|
zs6o_PTC}rra=?R}IQt~SHNc+CP8h7Cr75qdFrMrp2Vb7lxXH=W&D$B$OFw^RQB+g}
z+gEmQkVGxUU9RQG$UfkllANsS>8VIaBA``Mb2Iw0ig2jJaJ+WXQ9ZqFf`WpNA|g6t
z-{}8KNhorXTHN)ut?i>(z}7%DGYZ87A4Ybtnb|HNiFCVumFKdK>z3xH{B(pU0<4x$
zQ-CZUKY7yMe9^F?^bXU~QKO2E*Vmnvo~}jv1$`$p^~8yFAXAS|?EWVtWTwqDXx9I)
zT~YSbruIgoIPIj0_fv)b`!BvdoYZVGGf?5Y_&s!~ZkMB#l~7Bvu5VVBWM)9R6zTXs
zfBLArJnZ4&;TOU!8_5E6k?`h?%FUZncv~^gWoP-tZy|u3CyK(x+w$%A%uV$Mp~b5n
zJ2u$6x*`pj<NEu32aln&ZBNz$SGos4D@C^^a`xqq2LOS!(Mmj9w{E3uVn-Ws{uX@f
zk9=5S>sYv30%|ap5NGO7#iFI<z~G>mRhww0AS2ZmoV3)Um2J2-=gIux5l;;UmlxPN
zzg-pZ;~*PZ;Rjk#Vsxv)!fa)&jZ$>hlO1cR(Be@xon2hcHgDOF*P$|?90GFkjP%|F
zssIfl$8gAAEoK8GjgQsUC1kUK_mT}p_}!`4b5|XcuZf5AZoF@|Fl|g40wGXXelfQ>
zD?n?I^^i&P+d3eK<i4AkoAdGW(*Q4+4Kbkb65RXZ1ur%eH5GvO*T%-{bK|;1al@LR
z1=KoT+_ka4)3)f2Du2nJ*|7~A925yAmd>{>Xv@$DJPHnx?c&1F_D#clDVZ)<y_|Un
z$_vyHF}uFQK%pj;0b5BOIQ?9v<adNtqL%u}lV#*S<D`9981QFZ_Re_Uk(y?juA}g2
zXdM&N5!`NENBDiMaE+GlH(nJvO#~g_4r-0@^Ts_~UKk9*y@p(+qOZRL=(xc)!&_H~
z0|(12J9=kORT+BM22M_f0)wEqH~|8!4j$YLrueAZOgpm)JqgA4thV;u-@hHmjTR|d
z7I$}dw;J-Ud4`o6_!@f>G<@&gy~U6BouB^QcxK+hC+@FjbxqCKja7aVxi@d#)V>Wb
z*zky3$||fP`(=#%sIp}d(WJmk<Bce+pb&kiVx@RvpJ>lNjHo!Z=~{+{LD03BAq~mU
z(s-XT+``D0a|&1mEEeJ%rPVH^nl`54qdNe_$gC@*yN_n#>aph}4<3A(o*uktVPOF*
z=lgG7dLq^&qZZ{GSQ&H;)xK|V(D(kPJp+mHQKgUxtXCF)C?}<+R<&nd=^q)nx3J(0
z{xml_m)33tFkBlZwnl<dOGSmU@+u*LL04DzKUfKJBvcT-kZSbTWNl9DH)W8Un-d*(
z?l5g3{EOiApdi-3Ee!btSx8xT;M>w-^YmxdE0&><$t`=bva%8-VP<A#?|}mqAhg#>
zS$O*NRd)8~;)$czNfiXlO9S!&ELTVfZA^UkPWHKi8^`I>r^60fa?sK!8`_>LiqjJ0
zV_%1h!Pn$vV*vYY6&VU8u#mn&AX>hIz9(6MH@)48mX5BpzMhqEO#E_=RlCpfvU}Xr
zVP$3N9GmV3PoHvpSR9Jzn{j6SvD@L2EsG|w!L_+15NZpm-(DT`fo{*j6B|drbamCr
zkGHn8@B@RC`LRx2T<YksG)yz12K=geE<)!^4JD-V)lq*<NJjt`pj`Y14y@-7{%BGf
zgYSlAOh`#NijOt$`s*05L#~A~UlSSL2OcitFv14%+ZAI(SsFYp5b_dO8q`y3V*<|x
z{|d_d%N_<<-LZq_`@JXUs;mtRdi33_W-rqD_7@uX<17J<-ohEJjgV%&^7(nm;;&(H
zl8Ibr>i=lzXzV%VmzuX~zMsocRWgS<)j0a1!GioirqWKrxj&1|_wAe6<@Y!1l3j0|
zj`!6|p(vu(=;`Tw>FDT)u?$jqDZ1nZ22WslNQiIXcIkmLUVj=Diaz&-eE1-5)s?pa
zRTd1U6g&HxJ+&buy?|{8*Ttv>(%8M{KLR=Lt^WJFuV3%FoM%UkE(Xj9t%Fh(ierjy
zmSf#{H`;4uE7|^0u0QrM?~r8&$Chl_O`E%AVEEg&;(seiI7N|k5}nwms9b$1xjm%-
z4AbZ*{n*xkIaD?xaVy_L4GBJ8zg?=Wyh|G!RLZ7{oc5yCzf%jtN~>4xTR*IzUk(Dd
z5r+Hd`EzFBQ2#1!H)h+yMFAk_s~dKK(f{<a5)K1r7vj!IF%Qa4&&(|Q^705u$31A-
zCLpUa&eOc;RJS^lDhih-<#05Kg6*!jT3B>9%7Q2DV7P`t)JT-HEq9J>??dc594;Zf
ztb3oR{zm6K`EETKdxhfG@+rGR7Us@|0QaHrD}CeRwad|E{fBwk@ME+v-A|}Chma0c
zu=;JRG=%?{ygW{<14XxK{rdGXE<bk>L=Sa_PgwY^VRBbt=+`vskV@;VPG}0K(VXID
z45y}l??iu~3?H@qEr06d$xNBIkiD%2N}8&ws>T<DgoRhS*LHpS^eN4{Q|j^K$Gi~V
z$68W+b93iC+E*c=uc2c;eE2ZcUFNMmWSL;LV7r5ELov&`KOebVxOwwtzJ2=`A;O}{
zO^wy@>7(+XN<bWbsks+tCxEDoEiK|aXvLrvNA&geqo1aW2n$z2i`g62{t)&;iP2${
zY(g4m2CGcx#@js#bR%y#IF$F5_-@#??ZNfI$`atj&Kzss_wVIYjvT>T^9HN)q8>F^
zghg*!cV}VBYt<jhwGm`Joe$1pDk>@o%c3p`Kqx4dP&wD+Ddg@BseU&rhY~}i4I;wh
z+ZE$ezce+`QehJ<0Bni5{<aX(&3Ij36=Gfg-pSw>G@9o4&L%_bU1RVG^%Ey(?A%^f
z2k$(FTNVJ(C?dlE^1`E4Q~Thxmb=e(p2^Z1{PE+#@eqNM-d-LYgwo2u?ct20ckj}T
zcja%oEWE|C^=)>wpOB@l97qDpt4`l0oMJ*6%+1Y-%5P@&x{gwzU@>uW&umt0@<FG$
zoDq=@dTUS&G53XO7T<@sHSnc?c1lq$MJctMq$Nm8AKA-pvbyq@l$Fe|!!Zqg9od%7
zzv`sZ9mlkR**k`UjwWh#41SG9kH+tVC7XI4b8&Ggk`v~>x$%DaxWfBT-VY^4#jHxp
zw>*~S!p)`a@-GK$m;Nk3vpnJJT9j+54T?*^Eeb#GE;DpT93d>jQq&Sx{Oi|7OLG&N
zd)bM*VBj`!kP<8JU!j5>gl*?KXf7`*5W;vOcH@nFE>u6TNK{{ZiC0Ir{#7Ss8Px#q
zdrq`b?P|7OR#uki@H5T5s=(yp;^L1`e;_+i<jwu9UTVBMfV=+j^XHsJBT7VNrP_FV
zCVgkFEkRP)B2PsOA8wVP5UK6>@#A@pV;tMIZ3_#~hk^!2z<r_LFVA6gqk-$_ZjEP>
zj93W|ZZp2uBH&nn_>|h-mh%~V094YiH47$a2}1brT~ye;d$+o}dhzn&FKa=j4Dd!U
zTK3v;zI2@coX^bq<FV%m$O2W@fiw--;g7`_QMfxB<}(u}beJeN5B8Otq_c5w+%>Ox
zikDGum$EYQIf(PQFce`6Y4PFilTW+s<6faOLRWojPym9y4i|*W%FEB^!`U3H3_JoO
zGW4XIZq0J=iC8E~lXGRf;0nMbqJX#H;WDzaH^+v(>c8Ds^kAZN{Jb4<kN}C@+SI&K
zQY}LJf7yVnN?ccRU$DD<_pS-VwuSM`dLjbFG%$oz-yUmCHHBHymTkEOr>J%=AKT%{
z{)@cq>(EKP$x-a?E~pn!Zn`$BVBVH<pfFG3$m~2jrz>ZduFAH&9kt`tGmt)Jps3LM
zvFuRM)eUdIF6y=7j(*&8-cB9664Gtn^+7ryrm$b_P})K8i8K$#GvPg*M(4b(m6g?a
zV?cDzO}JLi<(ydX1*mL-ByLW2-5-wj;#O8xRw!)2{xE{njuYtvqH^PgI4*Y@Wj_C6
zh6_&?lv3g?!NUYt{c<fEKiF>&#>xJ8_3D+V%TF_naPjX^Yf5g*2LNNkHEd{Wy9e<3
z`t@sG5KoX3^zDh}pMyAUI4~%C96T&6EVV_DA*`g{Uuso>Kmb><ur+6PzSK-yY>+-t
z>jBIo&cI5~bT(DUoXN=n`KwHKRM^?s`-rv#mLKIYr`xH{J^TN%05CGvDy=N+Ja_IK
zuzJ{wIK7lgbn>&8F9%$Sa-s8DtT7Y?V2;p9JNnTVMr(MoM&HfiRfEbVFnpa1Ge)q`
zM^B&9_06e-efo5Qean`eQc`nKxr+wfE_Ie}J)^9x@|Vhne-YFTSL<2bQE1Jil))RV
z0&018@Zu5d(`D}Vu1$BdllNNc`bYNvTGV;xun#<?CDpKg`83o>R>cJ>d|;TUu}Tn6
zn(*bkTSS(D-3=J~sOGz-;1i%@EH71beHFaP4O_R~$F4E?^mreK*BB~bTcL;CvuDpl
z!Hd##gff$NP-#?uj$)=e#Kp4SA=!ebcDznJvxR0RM+lz#wd{^@duRa@GrQy?>eS#Q
zLr)U5_`;iPQ1EPP!1h2jH~_>tasE5oS|?DF3vdU+>IH0BU`5jT6qm*&ph%_FOa<aN
zL3aE6{E#n}2dBeis5Ttm?rQFyu0WzWx&&Q(lB+ZY(8KLKrvk(Rh3*lk6dD%}zs#XS
zCGG87X}qbty}co4tHBgoa9{CY;!4sI6xe<`shM9~Jj2jzI7dn3om(ogdEwyLjkk$G
z`nY6vn4Xrl@5x1j(V2sKNzDV;y_M*3!O_vIs2gDL_=GSG0rDZRNa$I(iXk7)4ET!x
zg}y3tV+^f1C`Z?!9X$V9{MG@%y?aIX2hw*%y-tC=X=G#+1P_CrrSC^`Qtb>shyM)N
zm@!nPf{nG0tIcpOR3OEIy`vo)LGgyeBO)TQu(Y%j>Ues3`uydL1^9~<MJux+=>3QJ
zgLT#?pnwpL2W_M*Ow4rp>q!w~_$89Y=B7v>XzJ;qBt0nXVZT8isOF1~Mp!*4B?{wV
zZSAKOZAI~IR#K>aa93++1t%cd4h+!XQ=Q%1BzUY-)<8H0rz#%)MLfyj8zL6_^BHc9
zS__8OSm)GLM@}-A)8r<#Gu%;uZH`;z18Wa@e3+E=HqWIA>G6(iCb)#K+Ds<50#AS>
zilA`$v>LdrKYR9U*7O~DE$}$#alcmG)>CM16%{GL5vZn>@a}BglBnf+rYKh$?y10Z
zOYd8k914_wokI6R(4q0=2+r?116`89$`TL<zzMj5yTM}rJgr+R5%KJqkid$Yc*Dox
zUo$|DDgq(KC_fg~)@!NQIXDI)oq9@ub>Bq>H#9Zf2B4%Pga)*=p`+tkdJ0-Se)xkK
zAj|F*6fCt&Ew~$dGW40buT5vpdK60N7NpMNPT7iiN5Q=rZH!k+yO4tRoaeqM{`1!_
zlqZJB(^a&G7u*$LcJ+hY?K>aGyKC3Cs67hkkZI;M93u|0#M8h%BgW&TrC0sjl&&qE
zMADl#kDffalaR0%%u#tS`zu5>0BP9ntD-z=uepRC=nNPNRpB-LJD)`mdJfi$*RNln
zmuH*;nn7l!xV5!4S?hs8Y8AfabSaaF=q8-gZM`1zJMbmLOI*SGz-5@dyjF#6YjE*1
zD@#)`ppj_yD46^9?SrLPZW^S}GGY3oHe5m{&yEiu8aE3#mS}$+9eUulKrxrb<q&$P
zjg)JB_4v`bM0cYS4MY?Q+~$e4zE?!V4{j@5Sc6o0zcOPQzPE4R64?jzlazJa9(k1Y
z*o&EY7X&vGF9gQ>lhHfl=w0wJ2Ggs!Ge>K)=L*a=uUq>$T8R|_fWA2~MWbS>g@uK~
z5WHa$sJvtY65A^*tcK*luZ9(w{u}wu^W9EvTuldX>z*FC^yu;9+W;f}y!xGUg6!*v
z$e)(RhFi7J=gUgqWs+B8$4q<UKRWaLuvtTe^;OWc;^$w$N#&Q3$*Y-v@sK0@Uc~@+
zbOi9w>Qe7&Sn^}3f(D+8@r}g!Aj%|`6-FXK(v71qzKpM2p+^WIFE5YeETH9aN?NcH
zsmZqIgUv-Q0#4+nKKLdlC!gN(4V(l*;|qKlPyyH|Rp`0tR_z(W7hm$u_redK!7c}+
z2sp&zDZ;er&wlVE#D4IKwV;yTIzp3Q>#xgj0sWC58LlCPrcnn-kYY@?e>%&9%rLAj
zR4K8su~Ff4w{>=&zx9*aG_##DuZvst=8Yf}wfTN+NBt$YS0zS9KH=7YEuxk!LWCUd
zITKYr+5H}JRh<yG&DE<VpjDcnqK;!N#QT$KAGiaPF!r3--viV&M`;vLE6g@<S;<K+
z>3J0mIkx`gI+LH5XY_B`p|sFr$?nhZtAu0t`<y&^l6YwU%AC23v|kp@^k_f%;z|ix
z=HXxMSEBFe2vx#O4d;GbU0uB}U$hzsi8rBUlU;Ljv-8qKF44Fc?vx6f_W^)H5;Ja$
zJH;tu#|ta0mNx0FVG+w6737}o0dXcNt=38R^6uQZ#@N`HVg34}$mY<%WyX?h*%E57
zT|GN1s}E_85;)suFI-rgsP&1Kr-OzU2;$%AtfO$P&|ZT8RDa7X2!vEmw_LLAhs=aL
z07<mK_(j%7?8di`+y%o*e}8WR2^aHPS!P+fox@6*qlD{11je2&L+7}b{cQA5%hUWM
zu=~bQA#MvOcwk7PuD?g<JLdWK?5Rq=eLp__O3<DG^t}IA7~7?7cn=-g3Vnj(yWXs8
zLHo*E^c4?}g7(GNjic|H_ROEAXXRF<P~vpG&=WUeJ^H~nC+5Z$VRc>~`ND^4g9b+A
zBFI%Rp0@?k>7}#`Y%_v?4FCq(n^}K}Ny<?*zOlqY@}^vZg(xSvS?;TU=b`o;Idv+q
zVs3QSRbcRKVj?3dIv`r!&53OUUZCXkRqu%|hvgu<Ub5$%BNO3&mxYf3LutRgd^Nb@
zP2*@9@g>2kAhwK$dEs4=)bW9H-ez@|vNSJ*qenpYiQ}D}oybsazAPNt9*^@@S62ts
zZ^vn2?Cj-V(W~X~WqM*X?qH)4NduxFhwjjhtJs<-(P=i_vV>ls|LeaSa?HMx3CgOS
zR1kms7i7mFZY_QNKnRKn%|BSCl;1l|Ro-2{UDUcGOSs!tb{!W4k$IVjm9BhyNQa{D
zD_z{&OwrQj*%RvFYOJozD_wr)zz98=W9vWQ*7(69;`+eN)=<G}=EEH#xWH#E({$7M
zGmcl?L!Lgp>x8oV*1L8N&MNE*$YH?c>q%zMmu~&5Vn#k~IFDS1ZqT~TPfq+^f_99Q
zQvc8p9q5{&p&`Uqey|C7?x6e}Ep6?Ak%kxwQzdaAz(9;31J2K8Kl*$~MkZ`l(h|0V
zqoZR{SetFMa-~T+`crSea|Hq@HK8Izh@S3t>g|U<0{6uD#A8Ez8FU5PqQ6pcrz1~L
zpZck(nbfRtX-(gpi$iL?9KPI*{H}5P3Ez7ju@h0vae!<<kbeiBwO&9r1s)&#9vlGp
z`M{?|BBln=!kKv#xgw;jOHsqFFE3mt$uN{?n1(oZ6SGAmiTYbwYD?{7^<4#eZyMpL
zr+)ak?uNa6@Z-m{k*ATJ@c+9sZwj9RB1O?yx#{uajO64s#-F)VSNVHxP7P=Gnx!Ql
zhyQ_`BFHSbEk#5Lpro}viE_uyA;NM*#C9}Z0&WHdN*@QTV_+cof{vN<miHQZ64^mw
zWJLQ`dQJ`pPL?VRdl<{fbcZ|6jE6pXQi6CP#DE{y_kGSOWVLQjfMJJpr3zelxF+uv
zWQyP<D*hSS3BtGhXDoFA3UdnGN!JdEMkCR~yEeW=!-m)b6FK+eYY0s6#70_fuF2(b
zoismJ5D$1z_wWePk}`5~PVbb^Sn++%Ub^HD33Z70gV2ny<>8b<F;4raK4>T7jI2kU
zfCK=d(C0gAnK+2(r0;ncw@dx>=^Q<d2E8*X$b=z8f-J!c_`tvm9K#ORuZunK_kaI;
zz7u-1*2RmaNXc+?w(5RC=&K)nohVa+`}Z4*auO49i=q1l)JLaoRxq)JxlUVBo&lF`
z?sq2UL`XG$NSKW1rm?a86<Z8%=jP@%n;Jt$SU^4sLS_HxC?oQP$(QrL3Q;W~mq?;L
z9NsAD)%B+$q>SsM6i9&UXO5!!M14h$hl2>DQ6fGh>tz#Xxx3IN3L-Kxhum-vQn<&^
ziLt%3ljzVS36H=V5FT;B=X;BQV5ebk$0q*@LVro#BL6Z;hzjYXQ6bOikn7`uxR^8Q
zOz@*eM(JtPBMaQWvH$s`q;gI!i7P5{!y~UiO|E+@4ZT3&N06m>c$#Iau)kt8j?0vG
z<wzvZCgHI#TKZr}Xjg5EJS+HwIAB+=?t{4pDt8xeU<z!y!zKvc3aZJY@bLDSr63R)
zB+NXQXHWvQjNqHW$a-)}<E=vN-ZhPLq}{)GQT(G72pr88ak~-?t%u0Gl04VF4Lc!?
z+RR9hy_}RJ!9@AF=nkORo!k25V#lYPAG0K}0AkkG-rk-2g9S1TDv?0Q{n6bWJ0>I&
zfU0k+7~9$oc>s&lzFCi#l{7bJH+s}T(SUN1+vCavae`PZ*Z=-;L`Nt!5+(l1RMDzp
zkIV3m+UJLU6>9hLm2*Iz&NLb-pxXfemXVQBd!jZ&M+kwIj`D-O9S5gAi@*OH2|*C&
zf-IB%`<u!!4My<%bc#F`!0;zWoh-BUC_Uu}T)|X>@n}A2kE8~oQRPs(mM+{0%vnk>
zNWG#*fk1PX)tva@l|z$zTQW{fW=kH^)MUd&`Xwg|#U($-@&e3}q}yrBKe<mn9#>aS
z3(I3h@(b$46<qTsh|!R<hz)}CjI6I2M{Pb1Dxleh?c14H{CssYDKB5Y{`S@zjA#v#
z$-xDqeSK8eGXi;|D7Yy9B%=27TQy7w?f8(e;t8u5RPQZU>Z2yIQhEF<NV)}Rp1Lc_
zUD0h$h`8<Y^Ii5%b>A|E1}glu_tdz_wynajegx8p&j1(ie($tXR<`5Sp2%$ROsC16
zkcgqMLv(>sgu+jD0`w`;<Z8YX+xYzX^SM51EfA52B#GnbJlB4OXXJr7Vn0B!nT+Gl
zo^6x=`@50A4um%le-1agaOB7uAah<0e^@`^;!mx4AlXIEt?9RuSpZzMIct-jk@3}J
z<KO(@C|^InW1;lp@J2~>k@3vGEDXzNW+Jy2ED1i<2JoJZ_itB+S=-p;bj0V%0iJ*&
z`jf&57>A6nSm)g5PmWU3(xy20M85<h3mcR`^ZWk&ds~k6CBJY7^p0MyRR!QTzX)mD
zo31Q!hYs-}%?ueA<^{Zl*rPnku=#;wQACO4g0_y*o>tNO89^dY2B?FYd&`k@PdBMp
zCvf;_RMbO}!UY;+x_``N%}OhHF3UuD{<#|QyzLzit3Qrt`I`$#@0yu%PQeH7@m#XL
zT;$o!zaV8&&LCu1$c1|DgJcCliG+Ihqm2S-U->K&dD>6GZLSZ+EQ^7XGZoJ0z<s;O
zbyl!&`R7T9O9Yj}H32CC^nz-llVhbD+SA(Ab@{`u&vG(Olb7Se<E-a-p)Nri0I-Gg
z)G*H7)V?MoH&>ceHi7950TGeZW6B?ZSe8*$DN`hFALTZADAVH-SUT}k(1!PGCy%71
zeh2B8hPFKjR+oSzjGYpE6z1(x%m_rmj{ygSp(obqg#HWKqfr!zs@flXj62=cwmaR_
zuVM_^n>DP6Tc8a`c-cgik{U;etvvd5Hv$);Ub!&2AhSY2CnO1UME+SC?>81G-Kf6A
z?HP{n^<4vV9H>YV9fF?Z`-{ETOl>=CfAc1#@|)AjdPIkpGXAnY!^ua|bJs>j;z7YZ
z0K&q)#`U5?kc2)0=s1#nzZT~o^EI~Y*XIvDjI@G4h}Z>F*y9L0upsv=9M~B7{CV-<
zOxSAflUGx$lH1ldK3z-IxF>Tz+IJcf3Q#@=5VkEr8HUrf4u~G2d_Vri+s}_4YE^5$
zTP(gLs29R}%oi`&96x;T;6&zLw{5kOoXnuiP*?uN1H;3s-RpoXA%-2*(a{OYZM2Y7
z0gD8jMXm`u7G)Ik5QtUbn(^bbqh3x{_E}x*KK2Km_H|?HK$l=D4%UbinMI8T@hE++
z6`6CG`~=$zI3(KrCnL)aSz4rPgf8VUBS;_%`9!F&w`EN@b`T(h#--Str5uEiQAHZx
zN7N6K<T4ns)ao;!qBh}(2<wa%!?%0FH%*)h6Qw4B7LuB#(5%7)BiVX)_YckYrBMi+
zdR*uc@5^B@tE{RTgk+^`WtFTOe!s0vhZI8K=wdiFVa(g`+VOEK5|6<S7Ag3>5&7;J
zBvo||FuF)v?{@FF!qnc`xf8`7?4caOblQ#YCz3YpL^&p21OWh<MofNXC7`(|qW;LU
zA>mzuln|&97D~phj|$c@2-5=?tFN#ONcmBAJG?K|QwuoETMWm;RdVkKiA&WMk}z89
zuFZXoWF4id{WSYPm7ic>;C95GaIgsSWUO|pe$LK7i=_QARaII7a+Q7Q2pU630XGD(
zYquAy<CqUZou>@z+1pE;j+CuPer}KCD+wM>OoX3Ha;j;`5J3KyIGqTR*Ck6?o!8c;
zoQ+F=UqE{%TG9GoS~A#(ZLcDgQO~g_C(LK_Mg+No{P5jiSn^6rvICfxB2FB}n2L{o
znU?m2KkVuO88U4E^JM$K7TySoHnA*#9?OB7jI)=gu|KC5hofQr<u#g02n!RdRRv!I
zdp-R4d+3?O9><gf#rqia#KEedGJvpO^-5Lvs9J*{WTb!weib%@9D5n=Wfw0vDc320
z63OGaK<&Rp#M8njocJ0V8s!r8@{K9trPfcVp(NFQ-Yh34M?zm-E5D-QIoajCfA3pi
z0upD^lyEjo!A(j#S;re%`A`8APhZWwSs&GNp9Xg~TwlUY=u=u<WF@`*vz%|qEePW7
zzZ;HWp^0}9KRkn|O8U(Sy*Ot5?P#Ekn*?;ad^`RWtRTR_G0(eY%NCe2iH$|l_Cp#P
z5mM{pPe+<?7?@qTawMtwW>Rm=>G(3TIynqJ6(;@Ir^Y@3);%8c`p6Onhrj5~x*Bqn
z!wO>5`JwDg%kxO9t}NDjnfxAU(1?4AFew@UNZs^UYSF6~FK)S{Ub=MY`r=P>600_J
zpSg21mIg->utQfU2ixOcN$BeXs%YZR*1qc$C`26WAqhRe21X4q5sw2(^hT%XI+&-y
zot)#TFL|}eiI4)Q!2HPWVtF;RPl2-nXx!Zs<^1YWY*osvk)_hhu1=(|B#yUh(|t&B
zABg;$2WxaHaWx0AdceGg+zJnns3+82sD?4mqi%M3`Y-}8l=`SZ#5XV|Rh!&od=U)-
z^^wT1zzQS?j2Kt=mHkMULPsR42iPjf?eZ-cJ16U^AqC@udcU=Eka%kuKUDqs^V>_N
zk{G253*#hV2!uKbd~ZCr8vzEa##+VY=_5(hkW78iR4E$rkjcoKBER0PH{Pzzat`$u
zA-?})%jV`LwTn9~B*py?o)XxEfEk3|eR&;B9hDo;B|c^}$j#o2c2N8LNN45+ukB~D
zQuluC7SBZ6fkX)MFLQl>miXzQq~eCdb}|U9>jZ6i2oDzr<3&cs#?jGH89*eAP9SPQ
z4<!b)0Ap;Bl^{2jb#}flYeh)dV(xiI^jjDGv**wIK7P!eIg_Ejs}wnGz9%ORaBw(Y
zzzHXrO2`+KhXaTYa7x>7aeFP#xb@ihmQVrkV;1I)N{PH_micj6Sy?1kx1#Hy2F+*j
zs;6l0gg*g7e%Qo>|HR3Y2ZjQ`HsDT7f7}^l1e1Yi)sTUvu71GO)3<(e`<Nq^CI9p=
zQ1Dg*;a8$cN_^?VspHNimLsanwq?r|N6|b?RB$P{Hd!48zfN;syzv3@>b?8-)14Ep
zJ30Bmue!_1^U!|ybNH##XbhBBVyGv;E&LGw2-+q-7fE1CzE9bYoHcAU%?lUqAm53M
zlMzf33Tq31M(wVjk-if!n&F{r3zGi{afTXv7Dp0lO_hA!RBuu5xb1iR3K9SaA86$M
zQPA7}QX0;kfaX^yOdYR2*e)I3F}=E`*gD$Q&FEU@)kXnacUB{F?pwrnD~VeV%Ex!)
zE*X3Q8#a2l9)bfBTJgTQKx}#nsc@8Vg^l|&;$cF9_maUUNX1S%|KxdyN=Qm(UHWU}
zD7ym{*gX5#OM_Htw}mlbl%m^yeiF(f&<P=)whee#@i1H~`r9Jj1xYMCVCvzg<JTm&
zaVffU!ZWIXf_C8TL1;K+vPIP5BPDYNm1PhyU<AmLD`up~VC4=uj+3Fn74TxNPxmP!
zG6(-o8PO2fkjTrHfFjg=p1@pRrhV4LEF$WpuWS)9$QsF~)uFV-$Hu<Iegy51YP~Y$
z`M4TE?W#S|?ml4KSl)LhLq4PL^LQ<b5!)iu;|HPw*nxx4v$96c<BE_xr$=fQ6E_$(
zk}zx>o2{Kq@es`{m>VvtHY-IaVH#-`Xv%42gKt5-iTZ()Obf9!sW${dzI2cC8j@F*
zbMp1^nXcS!YqY#DqtS9|$?Xfg`(pPZWF}KC<q|K$7=n(r;j!%~tjn0ssDQZ`?$Hjb
z6JCdn`7C6j@>f4gv#vI#Sv2tbAMb^izaEMY#z5LKudo`peBbO}L5=)6QsZNij}AfH
zCNLG;a~;4`t!EFqFhDrd7U6QdGXVlGkat&$2@XGOkV#oj&_<@l7~s{*xVXyk4`%5)
zIt8u=0Vj<PZuhIB5LxcXU1ntrMj(In08MEC_W+UPqvy|u)J4{?N~|;JV1e#UfFl6H
z!sPp@;_o*|h>`q_r>AH4iifp_{Ztc&=D0P&wEV&M66G7E=HN&oo$d$K1{)oj5eTX3
zB)1F~dwOmTQz^%CeUwmpz=**ww%;6YYcO8D3IiNk9&-KM(Co>Q-zGCpVG>tZkfnX^
zz=}yuP#he76F=P_)&~SwxaIEwgOf3gh0fj;%Jm&ZP!Gi4v*a8I7-#LEm*^~d`u4%Y
zhn3)_z>;JPlcL;@-T>+R7LrOj0jyv;(};C5VVH)_Zpl*u-Nt=+rYcP4x=HBDTL5CD
z`;mz$``tJ9`*%5ZH)08IghOuwh(dEKK{FuYAEa2IDC3?FF82ZhC(hQ}_s!)Q02edS
zbbJ!9m*I+Y$N&fjZkRgM*`~ek$qBf?e86bvt|V;$wFt~U>_+T=h7!SL2sQ&jD}|hf
z*)_xQO_=u;i^H5I2*a0AhyS?g7!Cr&fPao5X!NsxJ38>P|Kos0=!Zgm#4t^|Q3;Jk
zv|>SLXB<XOP@j+wBz|75O?L)oJ-k#<HgF>&BF13;WTwl$9(mN#AMWAVmOTZve7~Fi
z$`;A4`<%1(o7=`(<ejlaDlnB`m6PhaldUVq0+s+LPG~vS7IAMsn3HH*P{RlyhIC7&
zgDM~H@JD|jXcNui(l3|d2~#<&0n(WJHg4P~yQ~Pf0vn1{608e36VtF5O}g@~cfa{~
z6o8sainI2EL|c=Z;FGx%N+oEU5K;J~r9&4zK<Kldg`b5TInkLLlbBd-<ranE$qy@k
z#yI7j?fh74T2l>4^6~&3(j$eGHG81e;Zi;ZAHMPbvH-^4J_eJpB1xtoM)>U1KMyeh
zPWUZG7Z8s&6W>Z?JzZUwRs(nBb@<ze#0T-EhX2tan=Y9Tvn`j%!CY|n^jq@)u@wJX
zV`M!rQA4Wi)jJr4IePtidb7cO1;%Op0@r4fFY+jd{?=FbJ2l;{`7S=?hr$8WIeF+&
zJ~%QFzHr1ewyp;SiMZr=6ON}F%X91}r2P?8D~6tc#l_IHDufk8X8I>40!i5vzcZ07
zBGIX+c~hDsro*okm?H|{ys|h<W@+(eLj@l$ojFUz5psVbs17+7(v{#mrGH$oIU^9#
z4?a{5(H$PAeXG?wxuP6O-8Rgg=am_~#5TrpCPF_t4KFY;`jzS1t)N><tIHBF_k9sn
z3kZK?6`kIyJ$VCHEOX=Ay>?3yD1%kb$HovAm1C{ciD_6=C9=teG9*&gHX(^X*ulq-
z>d0Qlm1iJRi$M&iAW**iLQ;z-c;}>h-Pb^DK@3ubm(9UdK2qLg<DS#cMc-?K)D4i4
zX-tmDF1w(LATmsb+{U|5gmT{{6zhez%k^Hte<0fO0!BFxc*Y-u2}Lj);z~AR!TP`}
zu)uU_#jApN>brbp$IPEcL|9&*BA5p!ig*gA_oRa=uy6-eeaOkNDjgXa5!QEtqNupK
zY!CC4(hH5pc%mblLwESrqKXj4fYWuX&R)yOq!RM4D8<DQE{18{${#MJluR+eIc82#
zDs*i!H&2G+pof+ji58=#LwX@I%j7R98|P4VIF?Yo5*KO=SH8xM3t;|+Je>eJCW3g7
z?7-IAz|L-FKmXou>nWCY7R16_ehnD|63u@wQcG%n_C(C8F=8qg1q}!cgJ1WFql=Md
zWqbR~_IcDV%Fn;(yD~S&IYGKe?hm4LA4ZmTB7R5%TK`^=2S*wWa}bG6k;1=oox&Fz
zK=UAl8wn;sAKnNSKJmNbnkXlU;*z(=HX1*7yd3hD6<l`kWik>fJAR?HW$@SMqUQFz
z;*R<VjW|Wdj;yKK)<z=%tP<u=W+1@FEUu={cb`40-?<tcZhTk#$W|lT$8U_3H>}<8
zMQ6iy!KWjc<s5D`{zxrx2Bs?nZacWvEkI%8C6kNnXZM*<KY7JlP{!;?3A>3W7)*Lx
zo@8cDxZgV;QLmRQv?n*p?X%+WczsW_{#kE$@+W+zvA|?r5H!T%wGpzb0|NsIUY3&E
z*f($PhaT)25)LC*#^aA9LUL+#4bbArLLn%XW>&osf;zl!nz*>QwCrs2CYx3ZNiq|F
z>YWB``Jn5&oea@OA#6lw`B*Su01Bi`-sLAPO42cmgMh1Po<F|^2@#BEGchq;>upN8
zBuywjYT0U^xH7!-qnzOp5oMs9saLcFpRg<61a1Pi6|Iy>GzpG~@W((B3FC@NND$`|
zALU1Qc16W^-#v;S^}KhXcA;yMIapYUc;o_}-+|bRcm41YjA6ZaK@}bz4&OmdP3`V!
z@h_h~t%Ep)wtG5r{xnP>BoWAbG>F%4!O-(MI`=@V5oR*M;M_=4;-*1PYpK*jg6gvu
zPV4#n85~2i1%7UjEkwEiG~`Qxn-lWrBMOB9fq~g?{lDKhbcFAe;Uh=Q!NI}AUi2?W
z7$`N3uPkXCJr3;ILyy75VxY}Oj~=}^m{;`S0~d-IGVY}S@5bilcMJ+#jnNcFy5CFd
z<=UidNle7t84pkIuHbeyxLJ8OzON(mCLksn&t*0slEm}8K%$sx4YO!pSq1$uid3yE
z)h$eQ79E_SX*#L~9kbze8yP=1fBwkv-_(x&Z$Ln=KCxx@uOPZb5Vr!<(vu-6F0(&2
zlLt<K3-kkQNgjKdpMMA*Z%JEi$hq?^EiEG*+55Q_Tvgp#q2Uop1tJ`+@(Wl}xU2*W
z`kX$^0uyBB&#WclNihIwi1bqR{rIsCP!u`Ohx`{Ae2R&RKR>D8LDPbu!KEB4<*A;6
z?T#uZc5O^4KZ}4AnJ~lz+sJp?-8<WVh`a>@dgnCNW0lVyuA!D>Ox2V<3S*FS>T8<7
zB@cQ<B)F`tCoZ*K*?|W`Z1z~?+bJfNuDi9~#>R$B=t6KNa`4Cq1BP3%|DaZfg@>R0
zX@a>jk*}-?Ay==ei%MUSu>bk(I33?!Hij(9kQ9A^>#PcpSZ2&7>LIzNJ$C?R)6lSx
zWe-e03$9FsYzW(53_Ir853>Yn(*B@f%L?3LZ*OnamPQS}-gF{Ok~ZH3P--(aMLG_V
zpeaH!>xh}p$oO*d6vg(Rgtu=iA8g)lw=`$1_cSp)d~3^FeSe6l=Q8YTo*sztNJalw
z+tJKQ-N8_Vzc%T}x~e4?ytVdqqt9a#&%b{Tf}mmQJoTUj$!?=?-$k|$*xrF+@u0$l
znBW0(QsUw&*RBb|sCn2^;1&Y{!@d}=B{;J%V+ztkrrePhdKKA;YDNhT--YL&1YmE7
zjTha&&xkwkjd%UZf8Wy5lFY1Wgo%2$T&X^^S6uv`p{|Lpd<in?fZckqW6QP(Ut%5M
zzDWvhwZtYyjOX0fgv0vN4bbSGbsgSB>%Bwnrl{3$e6;6-;^M<8romah-=ukCUX985
zVWJApaNrDXLl)AX<3K5bgU^z^PN~3{G%WHY;l8KB`WVpD!GsEaKnvM_4cEtKNNj(5
zmWo6*WXD{8(1a)26#Mz*>eq8lPc~f6SJwaXv|#4#^CenksJLWk!fR!5kLTani`blF
z9)%eD$I{^P@8E_R;lLr1MkfoTp5P&j91bCGBVS%}JXu2qN|4B&{`vi+kp4S%045{0
z3x^{I<@WDqLbR$B;{_idrCBpW-X<j!gaCIk>4~Vd4$m2$szRtjIznnNHW7@#qdSZ&
zEl*Y_uj<1+nYvv|`=f)T!C>DV)79My<v^K@7804=Ksh5)ZG$mEiZq(7!*D#2*wl!$
zH8Z1y=Y#(;=5gj9wr1_OaX5S;hK1#j`TQ{yNB9ngY>EMHuJ)!Q9z_N@k%qZyssU{Q
zXvS@B+#9+hxYci00cfolZhK?f>UC`6x1LB<i2ReiML~G}38oc}OwHRes+;&~1!Vwf
zY`y&k+Xe@l$N>;&$VeW<FIY>Clsjl7@C^G<Gl;(PIZA;kT_<_|7c$G`m6ZcnNE!D9
z(YR9)ccI4N2@<2eopO($p-LK_h7?66IDnK2njH@a35gQ#!{CFC<*ABFO(Y*M2WyOA
z29lAKWiK=kQ&Usif3>s^T7nh8NnhSMTH?fc_m5q@U$@hDJFH_;<l5!w>HXB!oZaTb
zWk|AysI^&iaU<Id<!+9Dv2@S2>D~qhr5v33svDjvB#16J`Zswt7={f#uwnT+Y8?cI
zTZpt|rsBAqh3A3IM}Rv@_ZR<Ux*y}IaQD8~gc3;tCGlU6N<mDa;mw8pk@mOLVwAD9
zJQu%jL7E`Dy<UTj7SzcXPhRN*QYOK|FBm5-a{T<Cvb`#-C?nq3&=8z{%O)(P?Huy<
zpTgCl^o#RTcog05K}@(B_!@^)7)=fvh$&cI)t)9aG<2tg#3m$(nXimN?vALHZG0Ly
zFgBKS-si!tgVhYjGo0n9PRv!~jrGQlsi_ShUY>O)EHZKjqHA5df*GmMp<BcAIq@Z+
zrjH~m>G5=A*si7V*5-e|4^Qc!!{kMPmA19&nIqxxe{OGwtd3~49mK85fqF0>Sfr0H
zCkF<u*U->N!xR7>nAFHs`t<43siUuc>c7I?Of@Jlk;w=Q2*BddlW-Z5>39wdv`^Am
z(Mjp|@8AE%lkvDoVc@TTfIq3dTb_j&qg&rETKVHIb-LY3ifkNc&k|f(usX<uDaaG*
z`919vy^`8mW^$<U)Slx@lS4y82i`Vdg<d_Wtwkt}6xZ*cHg|AlWn_?v9+D15BnMtc
zfHu$BM6Cgcnq-t9(5^x41a&bUZ-asL#{y%rSKTIw%miB8mSN6x&;V#@8e1R7&bN*t
zu`BY|&!3oCJ;_9e0RtVV5R}i)kDIe%(r^QbTBA27{yKl|T+jW0h4*)nCT+X6Y11Yo
zW2N6!3oy{&E>=$Wmk}z55EB_VLc}$T>fwLncJQ4rJf%b1ua>Cu<Lwe^OBg~%O#R^-
zo_*}=P$m;x|D~#*ITLTrO5=FC8WilBqs;aE%Xho^mnM*nxeHh&p{^?KmXEvuX;3If
zCo|Lk{kS7<4E;Ahg|-AIkZS@ARt~AW*yH@05|V;CMfct*$1l(Hm8=2t9L*|0J;tC$
z?D2PSH&8^S_SG+7aJvs0O4gP6xw%T@h8*Y=bS_<LtTcj&d4!`D*|o8ENR6sm?)V&s
zTp?UNR-%3*C-CBVI^wDK?%gW}k}LUsW2^2A6&ok#LP@<L9j8&Tmjr+BLCl9C18g_^
zc?X`XL^VIv+nO(oGzoBDjMbif!Rj5_KD>B@0vR9SLm&c9<RLtHg!!fB$GoSPg{$CQ
zlTOv)maZd&Heb@x!hwHa`#e5%A*mS=71)SKsc0r@LBMgI?xTj<OB}lsCAaX<9l%36
zJm+E>xL&ja5B@nT$aT2eGxpUhAAtfmh7UmAz)H?pRvMLU<TzzcIT3sAgZtt}G)WR7
zM<R$kh-hjm$P0-}C;}sH!K^k6FxBz&w1~m1W{Ncn9vR^}`{NEOC7FVF+_6EICKkmH
zN*sBT1&m2kYdi@hz^41%sG~?mmolQ|gR8K%P{CN7X9m`S?u&eKv87P%f(HK{Z{Kuy
zN54Jh_s2VP&m=XO;yt+Kop0f`8-M@w<kVyYuqdR5X?Qk+So6S*@@6~aSp1G0A6Z3w
zmT>v(J(j|VUx4JrzHYP&ya1MnuR~4-AiXLAsxVfk5p5Jx=LW7vw-^UlgO-HH5|v`7
z5Fke~Z3v2+jUU1rp?6Iq_T7&i4*#!VF&PjM>2`#{wsne!2V^LLNF%J<_Q6GhoGO^u
zFVLdAYT6dqj7&^?h=7b-4UmFWOFqH6o_}RAXLk?shuB+7L{!1;v#dKgkX3SC{J9(T
z9;vEC1$Fch(z%FYLQmiLwqaw${6Ga0sbKicqj>U#`7c6ZF<-BmI(6b<Dw&YQ_q1tQ
zIH94@<mTSq(P8rCg)*7tK{h_27ext=xM}@nFGZ`%BO@sI;7{Ghj||vdRgkW(z5z-%
zhfSM?#wd8@WQ&I<ZVgoBJHS_?THByqJ+`gF>+xOTOsE;un2D<%JxYj{^Pk`QhU%kJ
z&l^Cu8YHF)*!NE5lCCZ;DJd!BhTvdm8O38O;t>rn03gON!X80Niin1o&(ldaSqmW~
zLVvY`0RG}ztE<x`!Qt>^nXB~id@I231}L)}aviuCWKi~EszHO)&@Qp_Uetwn!rn73
zeyVF51-|b>#$>k82|iOg_LacSI(0uu#+^H3)EGAdB;$vkp1vT82-aCbJn+$w9p~S>
zH=+8}IXw7;g-hl|=HyBI>E=`q1esd`nUYx>gxnR2+8c_nTglvYAR8vD!<4)VwHcq?
z4G*0yq6hf}A7?}hClB|bP$(q&>N+=mADjabv=Xo~cSRln0RgN5+3Ns9tHJNsY<;0L
zsYR-hkQP)NnEqOkvQiQ2fd*b`E*MDpW19r<u6=SMHgr%GVID|)S_-lou_Mc|z76T7
z$4G<|2GB@jJSzsp5{ju_YN`jP<FMmF5FkLUaz5$%B*&|eFh#_g>`Yvcs{JD}%mI0a
zTqS7OdMYw_g<hm4Hwi$6Qj%qH)OzqEDhvSEm*WmnYf<HY|7iB7IANk4r|wI>J$d+_
zTAWeYN<FL>htYG}f*G5FbC9iyY45AN=D89AhGOJi^=DVIFw#*vdU~Z_zivbVfuJko
zcQL~53oyoy{rBS+IC)!vn>0d7oLjT7|79>HTGsFh!`RwMi3xK<R<8hc{4pFeR(rUK
zBH{&{yF6XS%KXHKJeCTfP{Q8Or$NUaz%KbJ&W+-L2yjET<;82B2!6?6Qq#iz3o<E~
z2r3r1UkhY3-dkiiER*u)O(~h#L(NH<<vxzqU<_5PsYpUXV#B6QzUY-B&u#JT(=j4|
zV2oz{=iy)QU^~>REALXT?a1JBBtnf|>ybZru1t(=Z(%?{J(N;19f!<pdL?z3Hz(b4
z@7?}OD^n_$^zHv~c8!>df)9ZMHwa{Fs{ZL894b;JfwwVsz6NFBM;D$$WM*de0#i^K
z3W(j3$6kSpU}KAU{BfvM<inH|_V*Et{!KXHK%+94A}KY}p1B9aJs=<;VY~s;_z2zX
zfU>KK5wFdXzlZ9!6Wy3rxera?b;lk&9Rsks3R1mlXFb3JT3s33edup-AARsRgswu5
z?X7S1k04&>=njl;Q0Sf*Qkz^9|Mh#@9tIlRC(`eb5VV`>QTQZK^ZiEGd#511`Sq9{
zs-V94M+HIjq1N2p65j6)cT=ySGs0{IkzQi|Y}m2mRB$Ph;{nWjV6Ob0=)5?!q9JGs
zVdAT&w3L(@EQBvf7e&YwSV=`Z^CLqT2ywlzN#Vv#!4-=CG4^=mt!$?DN*L-LOic8J
z9eGF!Sw|y!b)xh9jr_}aXi0HL2Og5R4l;m>wl)W$ba+^kJD!0tf`XM9YaOlNb{Fjj
zT8b%Lpy^+~Vp6hkHp%#6SCLorpA@i9JbzQ*<QY0MZk%k?^Vk*_gi~-5%K%5{kO}U{
z(@xZKo*Umv2IZ43rvBLV3^%~Y)YSKLq#TTBZt_ey@v>+ouZIXKA!YW)R{P?`LC98Q
zx&XqcFVcq`t)d==*auVYE1TEd!FIOHv=j=-F~hf#<PWK%U_O-}52ZrxwhAc@4DaK~
zm%#L=*G@OHvXG|=;DL6&wUXlE*PhnB_gInwzVn4VcDF91<~soCsi(rO7YD}xSV&|l
zWUvPE1W7mI^7h_$LRJ;C50%dhEw_U{k|FQ^kFht8$8zn$ziFQ5k>)|AG@>FxrI02=
zBB@A*(140W^E@dHB%~5T<`5d$g;J)F)K<o%fd<0+J!?Pj`_J$5`Mvwo^X%HW@B6y1
z>pa)F)^Qx`SV$8BcW>XkSx834A6Z41MkdFG84m6$nkNRx89%zHtaa79+wFq(L_J}j
zhjR|C6`ULUDG52;$rUI3vfH|8VE_K>XsxB?Gz}Tmi|aZAjhm1%&T(t(2W%a1!Y-=*
zG3BY~{Sdw#4O$9P-6;wYE_Pi_>$_NiJUa?Z6s>|;5%|obNA1x5I2a~Lt@V)XFz9pZ
z$iqFmQ7ew&dvubBNd*nY^(baR!8z^gkGvIZI}!2`J-i542D85!73Ha<sG#6QTg|nh
z6>s0F4N-DDENBjqxINE*Wkwb*>XP0_R&`Y?5FIYO=<U6t!u#Sy(cU8GQgM(HZ$8ha
zv!ai#sXO)ce1JzB!Mx|VBe6A3)oGqf01;g`x+%5@GS}JJ=_(gIc(9j8UQn~@pC6(R
zz`C}*M`dyR6#M}K8<YU*_Fum~)@c)8_z2p<qG_BqB4a&5YDR^B0uU<q*J4^g#!!XV
zwrv0OdL(e-n!3+R-{GgQA3Uoz;ewor=Atp4sSUH?vO94WxSpgNpcf~E?v*C8+o@?2
z{GN{sU!9<V;8A1J0O8d2h%@ytkC&Fq#TDB#Q4H1BQh)ZDe(BbNI!d|8#;J5ZjU^S$
zteglU_lPp@2&cAY64H*BX|g_9GN0Zg>(rQtf~u~vqGBOT&3%G(XKq1}ll#wi9Wlk}
zOQQvVzC&&gfW0#S&)G$95G)3onvCf6yS91BaMr`Qb%Csqpp~V$9j>p?0%hGVN|<#P
z7V}MW{`u$dPgIdKxT!0*U~zlA<z<A~T*UatkuYCt=<e16)xTtp>-SsBi*u<|i)O>e
z&8NP${;@vNNc2J+M$&+XLC&+eTi<tjIPB2={mYt70$&Jc!kB4ID5h)deu#@DOTPMK
zb|vY-@$a`z0f1ixG>rM-EpOe20z9zc=Yt0t>!F*DC%DrxsQbNS_O|j~B!Nmn6xhb;
zL6nTOMMHWV;!!(9=}OI`S`JLcBX4PLZf-AfCdGlC?jX_*PD|UWb`wyR5K9R{0bpVu
z?Uhv>QzuXJ-w6ymQmER~e%f1I@M+5pp-KAkR63EZ%^_S~01pTSsK9y3q<BApa7@FL
z?5bUHTs!QGee3Wq!nf{4z6!T~mV3*bb)YlK`MJ60Xhm%U-ch6JXA)-Emz4DpTfXzs
zrAr?a6?s$l@$zxd%xXi&ommv~X#*f79T1wk=<=BlqgvJoq$~0doY?01ma%=PvKG{Q
z<GktxmwheE2h8bO);6gV2SJy{ayt)$yjO2>n0!G8V0<sRv-Q`eWH~a4FC0YKpBy%9
zSO5p{H7p`bU-k&igW358)VqH^0y&A*42k)`F(ktcDGNEt#Fs1*hIk^B{QINSeO#x#
zBPOr8yT}z9T&Wkkax~2y0tQU`mDqwx<1l5ZV$pfvjS<jP^OaAfm-PlgeFV~am{h74
zhwFR}QpCE!+agSpG%;O!*;sLNdEjB9iDhiU{81x0IkZiMviuR)TyX1@vri7Au(L7E
zp^EdPf6V_RRKIA=8f@GD>+|AbAJHXW+d@hBh>D@aN}1L#yr<6!KWwUF4P`TKJkpX!
z@UV*!lpeP#NmQ^?d2AVd1pM${v7c82Z!#>t>p%!p-)NGY<p&)f>DJ_o^hzzlDcU$p
zMxfP{Y@X}B0RkTeujxw;ZTvheJY44crxG2LXdiFyK0FYMAx)h8>+kPvAG2j9z1s(~
zCQg{}BBT5S<zCHi4GmeZK0qZ7j=P4=*>PH|I<gGwvKk1#<6rtqNre>rbou(evyieC
zC7hMfwK?w|m6+%)d@JpSO;=Qu5!RKxiB-8<NaG%H<^`K8ZU3~mFtQPK+!*q_9IMo6
z1Q5&x=izU4^j$H{;bS>vj{s8j1ryjG7A9vGPOum)b_qOM&a=>)H%AElLs~LC&EftN
z4dttj0V+PC8RZB@m%XV8SlaE0*P_AUUvSr#Nh;l^3Zmd=>p;1Q`@al(S6v-q;MYtK
zS=M=PgYo@Dxh3pU1-*Ft_USX1gg(c3ekq}D{NN#dx(=9hPk6AhdxWu;OJ2lu1%<~f
zk=yt>c9W7{6p`2ia3^MwDiJ6&{)Zx^cDExXbWD-oD-2c{ghwh#oJ`GA04~xj9ow{p
zZl?!dZ>Tc?>%%=Xx#qp-D@tu`QI?A;dhnnPxM;zuakm5RhQ%cw70uTq<Bi{IkC9v|
z$hw8fHR<{6|8(deK#HdGG7eA^{o!Bj*4o%St?Y=LOZWR8N>!1>$4r_aA>6Y?{=X@M
zrap?9PQFS`>;F;7KNMc~GaNB4P@ekP3DI$JX>3ISL~7b~+N0(HK>T8Q<v*SNxv^>O
z!|$6WgHZ7N(s#H6#F7v~7}jJy>mY%gT5yC2xs3fpLKE}Pvqn$H;eC{xs`m>ECjJ67
zv{=Hz^JCPdDQ{E1s^f+bRJa<&Bst5Bq9)q7V!CxEkrvuW_Kv2J(I8@s|H|)nC^Ln2
zYg19ok=_Qyy}Cy<)K}>pjQNPYb0q&g>{4h^c>>I?UKyrTGooDB;h`6XeC0|6nvtD$
zT5-|!u@-%d{-*_4A^bK`4=UA8z2syz_unQ$U5s&&;L$`={zab-J%${H5L5J)>>`0T
zLCkg?UEK}2)ABk@kNR!*xv4I7M%1Q%7B98i`ejIu^bymV=C+xqS>CD|lQ(_Lo9V~s
z>Gux_(XMDblwKx!&&A3SiA{=|YuYp_U7?B-NDIJ(IbNnQiC)NRg@zNGw}x9B#q{q5
zZcyEd5`d02T-fx>gC?WXcC5^n<ff$q#|zU3>iKqVjrDt=1U*{A15h<;PxvU|RBPqn
zaEjK6b@;W&YZ;4&A~F}c)T}G2iaIK0W{&C9M%L@7@pf1VxXR13wzLtlSjr;D6d!m2
zl#mc9@Le^gY9_wQCp2@eHeT<}A=?g{DL9U0sazuZ7O$t7!4W$M>g{L^4fA|dq}Hn)
zY-cZNX`QALut}i|!7jk5<lwf66Mo%cmjIu8H~Qkl1GNEKaJ+Fp@a>MT?~mli1!Lm`
zTPhN8fkIz-3@?CMN)D06UW6SF5kv4G^xa2#&8*&dFeJ}L7%Y+@XUS$POAVhqA!4zm
zy|uiQ@Ryh~XLaaHebAuTH=}CCw$SVqvpA|T+b~pr@`ao<<9R@10jyC0CP1t_J)Lb4
zT_`kmz<HbNE0^y=Xb=AFukVGnT~Ng?pN99DHRNReEsZEM(d-C_8)h84ms6bRKmJHb
zT6rB?a8qf@P?Cw9#0w6GxvRfv3W#wzh>RO8v@j(814G)5>9p5l_2eL%Ri4=!en?qb
z+C=YU9l8|X1-$W9j0pZB%t=KQh_=kyOA8#1I^ILf054rmOd*cdbC}q)1&hMKHaU6*
zdGJ^Vos-=M96P|lCRiNWU$36DyEws*2H@KcY9*=*3T6RC+y(;%gBD#Q>#@>5KFJdC
zAc8j`y<^{`L;5>7etdQ2t2(iXn$UNJ#^~ix!SJX#;Ttw>IxdPcdi->sKB6Nd5JFN^
z<AU>A?5osYh-kRhA|gj+0$`>{xFvRZOpg{Y$JDhFEDEc$JLVn9P802D1SNs6iFyco
z4Q$~DM@8a)sZPmncfT;(y6XM={W$z!^m~s^NFI7+Y3f|`j9Y+3MM=Q@0wdvz3K!!!
zm?vW%1!mz7b?2hfR$<I)9vh!XGi7W3k~H9x4%W+35Bf|7o-e~mO9P}f>qQM_R=nsf
zX^njPLiDDfIvzOnYX{9Z@3?|8KKq&tXXG)0<%1V5B40fIEjnAw(|a7$`&o*cAxBl9
zo@V5`QL(pfeGD5j`GCeq^W}biey!?HE%O(W>X!|6>V2=SFVCL@eIb(Ryq<#D!Y+^o
z??s`TmQjuwMyxr3RIBKR{IqHJVJCdX42%KPKi*}`h$cN;1CFJd&-1+eI`_<o>k9KU
zG#m#c{GcW4_^*R?x9o5$mo@n)wlb_L4AG4eY<17NT%juWtzB<zeMGn<0H5Snz5n+4
zvw!b3l%zDX?^!nfxBuaO6lB8LOgy;xqxMi#Bs$f6&!6tVxi*i=nu}W;`-S=mGpL&>
zP*GVo`wqplkj&$vkB@l#LdZ$uTQ6*ZDL%%G9UJghBLSqNM$-NQ^oK~{PhL!IYb7rc
z6c=~?-97uC4wS>f2;2C`V*e$h;AVOgOZ116VbVD$Icd_QH#R+tfto<_tlb33!dFNm
zbFtp$+&p{Hz+ZDvVzE8;y9iA_V)rsuH3en6Wiy8f%e>E@H+#R1pl=A8z2nT8@iP0o
zg(-462!N1euoYu=hP-o<m&!89^W@CEN4;xZN;8b;H)5(sp0;I5&sFkmTXqggscMyx
z>SI~ghBF=I#{snw{ucGst*M~Xe~kC`ckb(bxoi72lET1?zA1BveBp96^2{QM+`Ol%
zaG_|GX&)B+S#GX=(Jy#4)M`_~l<Rlz7I4|e!m}anM#JgCBB@%Jdn`I8<`D?1Q~p^f
zwrw0@LWspl*yiRoo9&g^>NQ`w4{cH_X$gc{Vqpo!)v3Cl$UL`&L>F$-u1-psvxELy
zimhV|X+Qv1RWL}-L(v37zh#DoluW_f9Ij7UwEi>8KGy+tCYd&lA6L&5qC#AykRys}
zcwQ;32iPIw4vhG5x$8bud*1Wbz@Q@M!wO^;V3|T4hZFP(-A03dq5u}X{lY4C)VF`W
z_9n@cPgmCJbCA-)rsk5j_f<eIqgb2%<SwD#RH)N~rRp1mqo(jf0`g&^NnVm^b>NK~
z2R*;aO_`F;gS9TzkS0lI-+vMKJz$K|sk3xJq0-=#UZ9AjP|c)$qkDA{W|}Mzu_$#_
zlaQk$!Fv1Q!vV53ktG9F(yFrK_s?m<rN$uCFnRj~%RhX3ld`JcDUzAd^qyc0m}`8C
zR$lZlJPr!8h14-9p!vRgKms+*aI8p!hoEII(xt%$g<fU1w|F|Ip-r%|b>N*uS~X@&
zJKg|PFRmlQfrOtW)$wM;JVl8n<NG~${P@6@h=U3<_OxneDaPz<c=9c6(;Civ%@a}!
zqIWi=7qHlGmOl9s%O_C4F09!8LLr5|Rh-dJ$fRFjo6<eF&e~clYa`EoM{{HB8mGlN
zOs^pye4Xriq4C7I6+IDH49&2NwszfjFzy&B@PYg-NZ;ux6CPFq*wQWPPFDi4S}3@L
z1-{~@S012=xDAMgb(9i!Feq1T`DtElD)p?GTQdl|Tkxg=eZ6HIBpS$x%bNeD(o_8>
z>>3oB(&3`2o02H7Mu&0;gys~=omnJoet1`WDo2hT+a7G0M$c>gV`c86jiU|k=<g8&
z_s~Z7+;47>Tq7cXs~%ctRQN@eDXa=9Sb25;Uho1y)#8Im1HF^K<dkn-LU)M8jM#P-
zUuqi#lc)OYec!Miy|1r00tMMtNEO$cn_E9ykK9U}aDYy{)g3^uJXotfkiGk3W1Rw@
z?IEE5%L<5z+YK`z3{60eth3Ia)xIwJC3d&`+K5v0<>h42eoF;WR9PPXt|7**k9S+U
z!rcH}4j=ZnC_~8I8*$D1Xa_elk?yc!t)0FLYJhbkOgWJj1}GW{qi>>wb^6&6T0ZBG
z_XWcTl{MDf!t*^gql2`ipTZ+Zs%bmFzSo?w4sU_(#AvA_FQ+OeQ*cl>_~*^lr7@0N
z`2|J@ma)QaL-=#Bx}@%76)ZZN^(^A!Qs%9PZRgB%4|HD28A>lN?V*X!NQU2p4Ex%&
zm#GBAM4%{R{rF|l2($v-(Jy2#3c{Fo>Y@~sWy??Yo^D)1Z00z42<6S97YD&j2d-E^
zz_6SV8!s#xbE1gnydR}CrYX@cHl3pbu2Y0r>Ww#tZWX@&^eMcd2`P9PoqsPvR)a?N
z9yV-AmU=^zs@CQ0HTm5EIt)Ygg7t-~9uN4U^7in2|HIU*{%qx#gc>qtlH;~wObGcU
zf*oN>C4apG8m3z&@jcKm<c6l^M-0~a7bK})@S>|wiFXhQG;}8c*xlN^J=0r3XNp0`
zK2RI*U_uD;g)-N8xG~un(vAD?Hpj@SVlB~AwO2Z2;mv(<-Dd0^0ij?r=K|sFRZ)@X
zPXRN$*8GWpA}qJ)bjmWGn1~P*V1+d$x@nPR!!`kZi*`P-aw&54OEmiM*dEh~Sz4Gl
z0vsJvD0%6PkbEMO;>E`^lMafclYPF<#-<DSDTgbs)~|_79MS}=D11!;Y($MKb><1K
zPINqzu^|8<&D(e0j=D}-px-Jk^*bDFomeeKt5z+K&GJb66@%oD4JRk_Vv&{YK)P-)
z;$bUzH!j^)It?Wd5Bi#$88?*=??}PiaoNlW*xaJTL_}*KuIf;RC;q{WKv;(hf|!1m
zU0<J!6S8nrdcoT=o<2Pdv^;vo*gl<s3ir}kup;#fqn=R7gLs|jb4kr~s&jMXLJb`0
zpnC<@F<|oQSEF8@%#1si@nOf$&m&PS6k5gcxeq9FMWcwI1{!^{EcJG`jk1qmG70kz
z$O#r$u;c<s?qXWn<%R`{$_(r_fF0s8cdk#?K^T3s5DS85OFG#N7&`O>mY^I=HKil!
zdUJB?uN~4Ex<qhih#r-mkM7il_%3o!K@|h_-`(9TMMfFprib`Wm=>|T5Y?!MHbV%Z
zO&B<6Obo8qL}@lEW0Fo#5eIMqu<H8BnP5F-q#7~Pp3rv!0+tajL!ERjul&h}twOj!
z2O+8(|K67x?5^-l#kr<91!jh7TiyrkY%|?)X+NtWWm3?^Xp$6#N90CG8n>=`^06W<
zZQ!LRbsWP4EQy~}f@d;bMYHxi<0a@)q(alIRThjo93l{gp1Q;s{!}}YAc&ZP4=+mL
z`)f|zNe|mJ46Q}hl@$_g!xDtDQVUqNHZQ~g%L^WU#|87}uLZ9JI@IJY1jgN_FNzy|
z^G7lyt6*B++I?2{yd)T462;rDKkPA-Sl-*4&aRFi0-}}OOM5~e@)xoAg%zm~e1(KK
zdUmGV6{exWutckq=ZMi|QkDkRG&Sa$v}l@Hjo#F^Pappk!QLOTb34HE;C2)U_HNwm
z?AJ*|x*)}1U?XvOPc}<A4r#{I@-BZ*D|<SS`yuM!2k@p8uk*=z<ZLG~GD&&+Cn3aS
zNnH1;m!7?Kj7aQx&ee0!dx0DZy(UY3AppM6PuY}_vbhT}1*)5v6UzxPk;vgu&ni^k
zad;JX?e$?_U(;j%bYWSfxvK2=ETMh<YuMN}rQ^WB1}~{`m-Ti6Oc2$OT;&MVtz_eO
z+*?&i{oc`-ME#^U*s5qz8hPdn#Wq>$59_?}O^GymeKYg+Wcl-7iqQfKNzTZ*g}um5
zp*|mTmxF(}p>V`^1VO!Ur?Rs<fi}Q<aWJwgF82g^!c{hau2LIlFEKYoxMPY4Gr=CV
z8Vk&I_>JhdW_v|evwSF@XAahagcY3ZU6+QViAylrL$>oPU-6UMJJa5mB!mp}U^7+I
z)#N0_2S&wn9$7|Dc+g<?Vvyu>%V^QjUr(;4r7VnF1jwxU>`l`4o*XF$|4u)!dapdY
zBZudSqgV|*y_%&CpmBV*73ZeF=O@X@MbghEdWsN@>L-1us1RhFkWqsnA}3fN?Ev{r
z6iEYn8$Tfy?m(y<fnCLtgxDW_%nh~&OPW08(b_&1yD^>LH)|Ma$uQMji)MF_d<!IV
zh=N>Ti56@7O~_~vrU9ff-NybITSDooFR`Gx>GX&{tmN^y=`3s2IrwjB`ZF|QzdmqU
zg1*mjfAU9|CU0E}PpjfMIWh8vY@#p0RH8b+5BAyuN%3l4Ib6uWBO|T9xXKG}k%z%R
z=_fIa^9R+?y|PrvH+F-YVmqHK%S<oGMUpVFCP=x&_&<qOFUf9Qx;&r;{vs@fM~`j`
zmMuz$;Do)eH=-N#Jvz*x>*&#=X?nl+`t{}WlKL1{l0c;Nzl0cCxR?lK>h6ty{GR2t
z^!-l`$(ebFOX@(Adm5`dxKzl7ooa;VuiiH&pb*{JZEkjz32hBW(?qA25PL4LXyYWS
zi`Mr6z3<v#Po%ZW`n1~v8T=MWr+>a0ww#o%>0(?JS|QC+D$Y@3#~v7^B}L<GM?ty)
zYe!G_t4^J`vw8HwlT)M4)y$*Kk02-FD_lizDKrGa-L?pzMT%je`afk!^v)eSglf=k
zz!|N*G^)W2ijF=W)4Z^;2#LYw%X%Knl%Fxf{O4iP@Zc>GUAc0ci6~hy#c8{Y-W+<v
zgn=pbs;KD|mQcM2K_Yas7*j*X<{#nsDUO#JpMdTmvi5+}h%=KQkz3<?D=h2<3{jZk
z?P)%w94cXoQ)bN?gaTKCQck1_)VgKVW9PCaVpj)7u;JT3Ukp<Fb(x(aI)&f9eXHr`
zH^i)=cD@_kpl``nbWct}Qs!9o@_<wj)s@h@#O}^_ssn8jHcoI_{zbxV8A0D6JoU<k
zaB9oG+k5WxxTe#G4t1=3b#R*^if*W5VMaEfwzn>dg^G$p6dnS>ZnLanG)~j==qW>A
zB|$H3q|svK=O6|$6CDKT0`gMVcX+o;!G@ufp~82TAJCHWx^RI;01P`?gtagU4&(U9
z$AZVtbE__<0aG+1uptp$_~#$*Y}a6kwtBg#X>)#TWxW`0vdPYlI!Ik5o$D^u(wz-1
z%vu))vFAodv+B?Yi2i?mOc<Zje+D{5Px*|fnTtlDEPrzDq414D>@M;NtP&Rt1)e&=
zk`c~FWNjN3eH9WU&ZCR?dY@1`vuKP^2GMqbGwwP|%MJiuP-@)dsnaL8q979lZ$?I_
z*1tE_>8|jftQ`SEFE?sZjPKdgr#(Pa5TexV%a9)(5O$+#?gC4ZuQ7h}CvBfrrE%zj
zIq+c~_N$Fh!$J4ii1w6kJk5B|dp578;MpYHH-JYXR;{qVX5pnc?l?q3%CB<i<VNWW
z8IQ&25F~;&zY#W}@Lh4{89N0pJJCU5r8t!#P-2=360jGk?(Xyq0c0(^bVc|}@#t7Z
zd@Z4;gh$T_CkOqEy|y%CS^~~UbJiW-j^58GW!G$2sd}k}La2+%Piuqg%*@25Bq0iB
z3B$}D?YHZ~qYd-s?EXDSxI_ki|4gcB?Y7wD;CuGJur2(d`BP@>*s9FTA)w8SjJ&q;
zwi<d=A%J1yg!qwe^&~bCdUA)%KiL3_NwN5j5{0-HFzOU#rQ<VofeXWMtQ9q<I=Jj(
zKzcz&JN0P9SDE+HmaY(|DGh3z(2E;Vu`QUh>*unXr0yibWJeAI3WMaV?!(P~DTXf-
zv!FV1(&m0k-TAEpGMa}3{He@Va;(nnjB-|VxL`wvwu-YzNX}7?370lHj^z(cl@Mbd
zu>BKj(#gy;su`LQ9I$b1#bk06{~m~%x=q)UIjcVEOr_(^xPBa{u;&iPey9;U;9tMr
zZHS(RoPY4QQR=<w<n```gd9E7tQGR{_fWVRF&kF26jMvJxoJC4T3T96HDxf$wKgtP
zwZ|ZG-qL#0Pm+8^bU{KELvN?VSB$x?Uek9MXT*;B%HA-HMOPCT4j|knK*An^L)<LY
zTERe3Va67tqW17>w?y88o|aQ!J1=c)j?Et>bbE&fnod8_untuSbBN4D(xj_$00Ke4
z%_i}h9C;o}6}#Q~Tp6<o@F=*H?KTZ}a&7O(X{RkV7#n-%w=T^uR%YXfT+%Y4BaKDC
z*QjEIGO&94iuEZiJ0_y-r|SJ&`C@dn^68gJ|IG*bjCn3zyPn1ww$kQBZ_CT`EjpN*
zkBEL*bD_hF^CitoAEc#qfCiU;5X*jN`(!2dsOnnt@_JVkS;FA{_1&FkJtf`0m3J5Q
z00r5@XU|S55e=o|9ZGMMh(Z)2gXx#LVFr@5;!M=-+d?=p)jX}6Z*%y&-&B~QeHUrm
z@sJ5xMR$?PuSLVz@7L>aQ)SUvEJ~KbyQpI*$VJyIH64}a?$9q{5Rz=p?UB@csLq66
z77i5EyOt7%g<?6qj7Nd>=qRu&^b^Ky2k?V{i(4=e_LdI^Qo#dE9Fw-mJr6yU^m~3N
zICz4W?qnQ)IRH9bD(ppKg6hsitF)T2hUV>D&X(OmeygvW*SMoNvE)CrJA268nH>UE
zt8OkEFZyrkxdckl7<EPjIDiZ>fQ$^4ih?{DFb=*=Kyt`U`N{EKPX`R#zMX38zmZP>
zAfkd<?JqREPoEx@nRtO`h}uLU(ZhA+R0V|tSQW89Zf4ylW#S!8^t;|3$y62wTW+4?
z-g3DpMl`KKVUBE@(+m1XsI=*g>B^F}dGQogGJy#$t+UV%%RM|aCV)~<WW5LF&DhBF
zfb1b7@ID`g`(oI(87qgg1ci+{;@*gXgEv}QrWtKYrEG#U-@Vad4Kmp!eyZKg%u3^*
z_S@wLqrYKSO=Faf`#6#isy-xOo$l`VviJLH`2MJ#Cq|-D0y(QYcX(8%HVaP<xlb#w
z_X5lFUS3#U>(t0>q0JPg)Bnx28X|^RT*jyGiJ%64RV^hcDTy<C8V`BA562i}(=q(*
zgbavCCai!q&l39P@%?ZpAi57mPd%u|Xg2+>FOLTb$C2@u%=&bnQ6Twe4g}+5-qhu7
zz6+ct@-Q&>I<LjIPEl~ZJv4l`|Fy@NnVDz;Cvv2>=e)|DI7`D!m|qbF1@y`}Y3=q9
z)gnp5)rLqEr&EjGfiocBJdkf3Qr8Cin!KWowY9aR&_`uv9MK}BTPx}Xy5c`v95Z8x
zvSXzO3dK=#<_saf+w^_N`+xuaGt>xVHDE*2*=^Zim|uiV4MM-_K(13Xt+AF;S{iLB
zx0lzB>(xncqYPK-1etci{vHMAqTYM6sshRLj@KEuw~WDD?i58p83TX<pQK#JB@07c
zQYBCqv>a2XPv2`Cc4O};WY7r02klV>z<fH{dRAO@Mq0tb)vI539D3MXEo1HGsm*oj
zKADJ`gPznUPyGDrx6B43X%Vl9O>z^DJbbszwgYkuqkU0Ex3=3HVyBshnXg~};Pl>I
z1G5d?e^RFIX!$)m>(@j|Qh`QNlExgLzT(-msZ&qqTM9f{TiYeV;s)E2u0e(sYBU>y
zo`c>%FI-)<Y9@do2dVzh8j8c5@*OAw#FL>B<M6R#foDoM&(I{jF!tr=grNx$UAKc*
z#s@L0Mo93pT(>6NT|WIRV!nS)kA#&wWB6h0^wG!0%<$44tNX#9W<*g2{)*+O1JX=1
zOlWEpjcU|eR;x}IiBO7sPYnGuj!y6vqpuo%tCh@R{_7sX#)4UrPBmW}Tg(5jQx8a-
zMgJ(+3GJAGNh?;I)s3C9Y^EX>=O!B*&RlxX(6qwM_3KD~(iy>Vi5eIB2Ibq|{>|7*
z7fv)`B~L2PK%UJlzjr>p!Q&f&Bz9DNr@d-aMrc^AD>^bI5w;hE^^ur>M6k39^Lz6-
z1#39`AMQhw;@%bBFw0JH3Y9KwVmF_e>mIcGbN}N;)F^al*Ul<-CorjT`byMk4a3Y>
z9l-i?rBqH1F=P8Hr4J-DlA>aA5WK0C9E8>le*$n!^APo3Vn86z7OTI`A<qRr$g;mU
zdz)ooiwY-|fTg~f0|+4ZV~pZy&C-2XY31j~y)gIIfEb(Sc_GV^K)pOJ;@uTppD2u4
z|4@i`r`+woPjA(#Bb1q=n1jOFkD1hEdWd1*i)ANq)sm#3hnhHgd(0CZyI47sE^Kmk
z#7J^@JOCi5X!X)NyWi`~ncl$5z|p#L=W$UGx<|K&CdL100U&B(X6=iZEd<ich0rcZ
zg$BTq!}vqB_c3<#+hbEw?rrpQOMA6hzI^^Cpc*k2t);oi)~n?GmmOjtH=PTD-5os#
zYGVuIwTZqD;HycVt|=aY1Ep>~;N{gH_2<>t0aq1S3!*@|h+GU9vO#9lf>FK83K5!q
ze`dQ{bHRct#{sQ5kQT->r;y-OzPz3>H>#6eLAdB!I!==La(ZZe`&Fs$Oh<E9IE(dx
z--&s~HMffj3x(EL_v!8Yg(O#^&zol6YHteHm}w9G$dZk@xKT1P2dgvQcL+&oXQ)TO
zWW>8zt=SYqT$@eS2<oFD{db`u6)g%@Dux4@W}b~vpzKYJfMsIDMCGf<GS^=xyOZxn
z?rwGw(B1hHPd{uGqY0?DMib&m><;p!!KC<w84i-f{u4|}w`lZgx1i54JOr6;L&#+@
zikKh<Eu1|&FFKlBnl}jQy-BTPhnRr*B$9Sc!L8#9w4-9xOf}O-cjC6pq*ckX;3^sx
zS@0dTyu0C`rR3J|BVzcLKtPyPsBba*NLu*_ydXpm4*kqIN*xDFOOKY9Kj*nBd~WKQ
zZFr;i76uxi6HoWlQfY}-(&MkxR*3*4=4=Elm_Oa``9hfeFSH|7H^-dGBbfw9norG1
zXXz53xzwjVy32%AgZfnf1Z<MQKXMc*$aTbJWF=k@3e+F-h@Fg5|0gARVvM6ZllQRD
z-A_DuSzZiWwft+i_I1xQ?<fi01h4OODt#Y}V@VLSDk151bhK;Gqu{t(E4H@k>?M4G
zj8i2f7+No8_}l4JMyQ^f|E{Tx)~gZ{u{ghRI!W>iG<vs_yuP9{+FedarulXS+7%gt
zl^ca<ClFOp%B-`u??L$^F5tZfo(y1T_)*7nNDR)ip@tLnuTWK571-z8=nmESV8j}W
z>6e}*oIBTB3}K>`GG>@L2+oG*Z88@>r;1=mhF+^ohPwQMwLlrzA^%I$f6QB5A_3kH
zvf0u%ZA*d0%Bb)mi>>mnXh<8!+6a{90h(RWqv+CYLUL18kl)MchR~~U6%(fGAr!aS
zTwOG<SGyNi7uT`v=O6}^amb*f6hCal5AsbebzK2GD+?lqV_|)SbSVI+k5!mP7V)M!
zQ4q=Kg+$Me?8I3h7~p07E=@T5L?GEZZMee^g~<|=m<cWDnFIiZf>z{h01LrxGKF<4
zVOE9MRp=8GWLIi8HdfE`$&yyxH4B@GaHqw@ZB3Z~qe(Y5t$CQ4Uw%9SrlwjY1tlA?
z@rY*n1$3O?wOWM}VXvv0%ZQpnYSQS80vPc56q)7nAM8jreOD4XNt_up{Yr9k`)LH6
zeyTK?G?H14`mzRRzm*mL)Cc?%Vw;S&<y#Z1cs#7T*k^?n5}pBz5Bv?H?gQW01pV>A
zLQ=x{DyE5q=DEAp$6vZMSak2FxVfVH=q>t6+!is7-Lk;l8wWZO2#fI=x+bh}8R;{e
z^7!+ban+DRVlLtJA))QmM@*bp<7**DuNjVyS2;Q<@B=PqB?_t$0Ve&@lAI_f`gErQ
zibAIY9WX#7)`Pvt-NC6eAsL9I^O|IYN(99?4|)so8v!q#AFQ)C<AWe{&dK}3P||@?
za!avymjmdrsR-Naw!sNHZo#T^<a{JC$2TbqAMWSAI4*Vq3R<hN$AMLRyi9*ghHLB%
zJuDh2#B>n;-5|5C^uVJ@5+Rc}x^>j2KJytGyHrWJy5L6p>W|v&a-TdiG-RsH^?E|A
z<k>;AnnO6tdFCKSNph}zKy5>YIykKG?1}pnMLe+u5Tdk;zZf5kwD}O2nR#K<hJRoW
zpw4L5$eFZo1cO_|kN|voj%I5wS+cuAy?vsXu7Os2>iZ`oAEc>58-q?j;8U#QbJrp`
zxE5@Jh@y>#U$RDrGF`^Db^1r?S;K~hTB4PcNda`<tj$d&$q=cn)z(`Qn=vgDJsf6k
z4SC8DPikswvwj{#%PP*}=+2f=szpH2!HS}13;z}5krQ5Sq2Zni#d>RxM&9J+?3>e^
z+O5#`eY{@C9AkFwzcu_E9u)AuE1ax$%o#=(PzRQ;=w~0Y1p^x++D>T7GP#xsRE=ZD
zc)9{T8_!cM8rOPYC?ixZXfuSuKdZ+_QNExRLW+G-(1oUnj(gNbWGAk%w)VAZ+fsM`
ztx;oXJNyM+m2X$K^xUJ|pzbN3<#lWH2FazDXYB!uJ=<79oM2_08{?<g=e_C1zL>J2
z(dkXJ&2`{75m>y)j)4@ae8Y;L^ckOsT`9#&>G&;{*4CX-E)J5G*4ws_g+vuqzttkX
zpXjaq@#E04@z)wTn}iS~aR5*;X(S~p3YC6GcA`F7owDoF`FC0mh@PAg4~ZthO50h8
zR;4K_VOL6`H$E4ZAlfHA$JFsi#n4yQvzVPJAEX4V2jw7|aMRqh2A3a%*cS93DiSHL
zxv}*2a9#(yuQW6i;3}Sdbb=kP3&+k06&6T?V{MVw78O`RxE(Ry6Zkd8_|lj{<sF~b
zGO;@Q{c5N<Vf+Grx)vKr`Pr|_@1eniK?Cd63><80XlN*u8Z^AEmr5|Vw_h^)4h+p8
z$#d6KMiw+;WI@_Pik0<r$mq@uKY#AfOkRo|G7$R1=g*hc)ie)9$a4?OIubt6O5S_>
zbW6ZpCb!c~w7}9cRy)r|fqD<aEvxIpJQAJ8Sv%~U^Jfq^)G3{lkd|$c|6_C(n6{TK
zo#<TKg(#bBGg-FZQ5m~6%rT?V35xz6KH292xmx=!1wHrXTFE1#plKbug|mhl;>MHf
zi(mo2?{qQ(0?1~l8O3BJZXpadP)pv9l{6SuNB+S_Tbuw4wYd-Sxv2eYn#?FQJD^a0
zdF&P49oO66#_|tT(H}?<mR@q%%(TV2y4KZ-(66-0!tZ7kO~L(6wD$`(S_qnZ2<4N%
z`o8QNy3)d5u6&bdL=>vxq%VdOL#cIb&U{fNTVK0woh1I3%vE02+W*LW;g`l{0JMSA
z0Z;23u&fgaYsh&G9U)*__weA0F2_UjY{bYav^{`&rqNoRg--%^Ksa_AJPhI6@bv|i
z0O=Mc@iWPTLJ4N=DBrjY{g`O!g`RBBp-%)P?+ze3alYIG8x)GHuuEf#KvdBfSa4Da
zCw$TJnRk#V$|5<Kznxhum2Z-jxk%2zKL-?|0{yTRAQFx+_CN{Zm?2P`^jnbiZgVy`
zg5Z;6_6+LZKX25iQzRy&pMtgTzomedcQNfx>&aR7E)CJi)q%V1(%jpyuJN86MBeZ}
z-U@7wDwNG|5F;)!=)upcDk_4uw{GeNekdkVEbtIOJf@-tmyO?)|7r%RYo3M(rm{xD
zhB{41Gp^oxNDt~djIe%^ItY4zT!aTSZ%m)%b(w!%0TC-8ixP?y5@$a4kIc4&Q7nRJ
zdGe$)=fffBWLz*hVZj6jW9zj(yeEl^7?MC@ib{=C-tNrk4Unaxl}xA%kj=0;8ywEY
zjoF|iy-&4pvqeAIU^V|K&<kJa9bN|kqO{K(y6p%7f<op<qBHm=(~)~3eWqNCD%_)#
zDCSP1uKjO<6mf@VBAl$K+unSPeEe>+C?$W-{r+i=Qg!2-)}OYP({*H?j2YQ^Rc9sv
zpT0f5Nk%8*MjOL+VNo~7SIVuavvVz8Q`g)OR}oWS?rJ|UqDU|H*wL`!urAs!Tqpi}
za8lIhUX0xubNaN~^UU8ZZ+E3;S?p=2A=B-7`s{80WR98P*X`G{U8$u770oSof4p&3
z+WGa`@7<{YJ$5bK(;hvtdVI9M)O3qQ_cosXvU6bHzJYMcRy4?Y&)#+eqFhO3%UDH4
zLrB`~{KS^MG;wP4E)G5(6rgKbKlf}MkyH}Sb2Fw#@ob5G0VQfI2vi9kQY-bR?P4~1
zC6h>B!sYZEI@ImOlgT8(E}!prR3Ni(1IZ~$+8%*4n6%@rM`w)z#}>fuR2RobAf>tv
zP&<CyxP?%V)upMeV_Ox8#0r$VZoFQyE{K*&*9e=u4*qV2p|M$UF0*Z;V$WZ`e2a6j
z2C3Q1QU~$IE6v;tNFG1wXMDjY+jp+}8q7mpfYrM;KX3hgD&e`^<@sykHeEv0ImUEW
znvF`r&A`b6FH9Ueb^-p8fgE@ncHf|t#_Hk0?l$)J`OL#{+puw?*_P`*j$dL2hZ%-)
zRk6E%eLdD;BV96dnSdCcXCrS^{RAMSZ)CKiuvT`;WBYhp7F%L%TyZ`7m5fiDCK*)U
z%jeD82U_#K!6i#;HStQ8jt#y+EZn>s9)C(SWB9Z#-Ir4(Z}Kd{IQHr5IvqOFNxB<?
z<~Bc>b0y^F&1~jm-M}!4Aq#F&s=MxdbWXy>L5Le-SB=w4>@`cOWUg_|_>ZR!AI^rm
zb*k{)zNS9*$noRyCh5`<>dTj(q5cVnn|fO793Q+oD<dO^ed%@n{0*4;RbhtF3wEtr
z7tr#<y>(@3bFD)I^o#f0zRa?2Iq{l#GL7`-I?<Wy(W57LTkdoBmTkY9cf%e=WZ0QJ
zeDJ{WVfUGpSfnY;JM*0qQrAz_ltFoGee#naqwF4(kW+>P1l)hUQJXJDnbNjM0-seg
z&e0s?jmiq^Vgdrx$$1+&^&0SxRD2r=C+U=<JJT9yIoa8Ipt**{c6^Sq@tSC3+&+Hw
zw_MBU*reT7^a=a?=yvE~08!bRXy$^&a)i}trl;8Dt0)et+llvkHpV_mySfGC6n<h3
zU`~*Mk<s}0V!A=^c=-LQ4vm<c=$4GlfD6+AlwYS{tf?CP)Cpp*{JRqt4b2}n&IN_C
z-QIMRn@gT3&$@1=O~%Xd-S4MB((k|ha;yo9522R_H`?|0_MhaQ${F2OgOB*Af2U;4
zL`U<I#pX#3JLa$r52LKKT_S_<A_ptvdh&G}0EP>5#%)+U08^T4!opbv8%<Wl?Vav8
zwc00q7$CPQYW|8Qzd}uemdD2X+Dc^mhL6v@%-9PAr9~flJ(?9jCvD%&M^2nD;1z?7
zurBwJYhFEh<-M_^Mrqi`Clwd>Ni4B<t8*>Q!N!JIJ8*M$6NdRU`0pgL+q2I8^i{d^
zvBAX&Xdl&06`YvTj?eyLk)e%KP*m)oAB&lRHMxOUj%WcC7*lDl@?yBObSSo+HBX;C
zGh_)-U}yQREwk9Ve*M22V%HqSevIG{!pgHDgIC)8)&p`!^VF~~Y5P0xu6C5B($~}0
zKANDoH*%iEZ(YaqXD_hlt1q&RXdj~{B#=ol=L7EB#M-b9WT#9iVAXDjG^#Hf-g?KR
zpt#0<=mCQg(<-OgmzM;g00<^~(e+g_;2T60ZaRAW___457kuHOm{q2CHm{3smk#Nl
zhnJptGQ5k@w({dC4Pg$}U;(BrKI9zKQ*pDq%1@Y}1xjqg{FRNbYMsMOlME>U3px(p
z<xdxKk8!q`1*(BrCVokiW>@Tlvkkp^Rf9njk|PhZiwo{)FIr?3Z&&en(EV=+1^Y`&
zyRJ&Tz@=<YZ88NE<l*bU=2S)Q^FxQ$9j^yaoV!<TmZg@KUVp9{loKqhQm9P4dBm)j
z;&(My*sAzBE(U|>F-x8L;~F@ZA(ev{IeQq&R_7`wTlNRFIZrnTlC_EwS$5JSeTGZ1
z_w8GJ8UI@&1b$-2HNTRB_0unHdh?_H;qaDaKvUK9joqsQ5)1`@o~W#BLXtjow;+(5
ze$`Esk<p|zCL1CMR}00Vvo_!B)*Gt4<dduE)~!(#hk<l*1;@o165nZsE1AB<no&S5
zgtu7S;2)=_PgGH9zFKUep|RIKu}X$^wU@U(Dp;z=zdm`Q@D-pJx2*DI90orDbu3dQ
ztSSf3VkVd-2l1y^)Aka-O<n6o(qwL#k)Gk@?HvNET}hNuY|HqK>B&5aE*K;v|9BIM
z&{CE~kGRKD;c(<ZLjwaZv5N8aUl@`6-CIluO7GFNE57yT0Uv_lC=2hF)HKBXniLih
z;ShiH(^goZsv5J_M6nh7_v?2to~h?HEL1wA&Iz#+NI)pCDR#E!Gd?8V`q9de@E#K_
z%KrT~hJ^duwa<?sjBQD8%8$5JJy5+zVxFXuDgS*VH`fa*J%`=jd}QV1^4R~hvv8K&
zpjYknW&M=~2G?mv0V&hw5-wcM#;*{FCTMw}W)&_d1vI_1`EL5!K3=SJ9(0HOlsg74
zbaFNjMr3EsT*-YDTMQGpBDX782IyX%6<N}R=1ZRz?#(`VHU{?g;WW~?JQ>ikBuI~G
z72+UH@I~lU$&Qy&a=a;o-yBYmk56xnPzG>qVZhx}HXtMEwkDO(40svVDS()<6u!1+
zr}oD~@)T_ItxyD%7rB)FdbYj3uC=9(#Oz{W+74=;#(4LpnZBxS#$bgj@jMFp@d1IB
zQ~J>DRs=rDyr*(#Zmg+;fR+q=&ZoaTaAYS(?77G5r%iMu3Ww5wDmKi^1I>QSh^s`3
zq7eTD4Mf$<)L$oP%|&$@$~I!y3UBmeedI-TEjzm~0+AE;Phw?=JJeQYa3Tkjx4c$Q
zmHN|^v~Pk-lavQ%=rASbADh^xRgP=&$}ZlxAw#vOgMO-~(SVU8LS&xh+oaESGoBt|
zS!~iZPI>jMiE67?U$#%GRgCWb=`l?{W*lzz->$%&s%BlBHjn0nF0WEJ?(3Ni>A17W
zr83}<BYnXcRgN~KmyMscrxt&xMO>ppH)k$smCdjq&f#j%+g#vlYUvp=9ne4~>W(k&
zZJw2R-Z&!9cK>4$x5xo3-?Z?9mGn2S#4c21P@FZ<#it$LZ7bL@cM6FG8tDe6|E?Oo
z>$Cbj+ww0v95dwND_I|kt-FsunG`lS>~4g`2|Ec1rJAG9j72`tNY_;Z?-&{!tTpV^
zLa;W=sfCSZWwvE)^X|}0<x!)#);dPF(wfm#e;%e=Q*mhzt9={mOXjj8-s~vro^^S)
z9-Ny#k(|tAw(_%@hLCI5=0gVMv01WWyHaar9`&Py?-BZb(tu;T8$!({%$~i9gsbMI
zx0~;V=tB9m_dI$OyRBx{>vfb@8!RknkzY)@#r0L`-$Oukj>L}&(0=%ZF-X-k{wy^v
zJ(P2oEJJF}j?WtoonlEFCpm|~Wsa}XNJ`7FUieB6KI)vWZx|xq_U&eGf4yblsaq>w
z4TfT_{Z#2rz93nAEWIq@*1bp-`*@Sm-{mtE=v-k|gbm6($*|Q)jYi^8(L$be>DtPu
z<{m`<rE|XQPid^4m{9WR*p=J2jl(P#?Nk_Cm$wo~U}5XeQ>`~SDRd0?JC7rG9^PEJ
zu<^qb_h7ETDM8L*rJs2GENl5<b12$1Ut^<<CM7a=pn){2=0fW44Qk4MwBu9(=H`Q8
zDWK#NDHY8cM^BtkD3};&92OE0QNSTVxqA&2?NAjLePO`6&%SKi%@6940X#}}XX?k>
zGbc@2Oy~>Z)@R1bnxsxC+N3BfJsv8$u6*#bn-t_NJ4Lx+u*KrHXC}L=1bsIOGh{@H
zIuARlxYoC2*HoFErgE(De7&VA*DtJb(c)YXSyqzc^nS?~BBvCsSUN>=PiScJNr_qm
zO((C1@3ILc3e32%dv$x{TF(#Sbr{N7amxM)F!fcV$=5KZj#hFYCuqbxp;-U0lJ637
z?wb1@hv&NeDA|}YuxXd70&PLROu4?cA*aI6@y6&|VsP_lZY)UcG3$%{t~a%X(F-Xz
zydeN(mRz=)tnk)3DN85g?ERs$-COR`W?5A3Jo;iH*`2b&o~b1K-1S_YDajUB?#{(s
zfXZ%ouMY9)RynqpFFF+VD7|@QT7Hb}rcLq>4<n57RdF>WrZAWb|0cS;t}w#OSLbBB
z!m-DrSf)r-xnx@u;>gsOAM)}*6qHzYQaJ{_r=r0c*iT70t77w5<2;J*6N6{mC~2yx
zuFlNAd=Shg>(Y#cGfN!xf_9HHH=R2=Q$GLp*~yE$44q+locc4E%BT3XGl9H5qO~!C
zWXAG?kt@Lt<Az%uhwV@!4^WI>M;6N>2#JZ&pZn{{(l*8k<rNh*sVz0Bs1R?#iW`xo
zT$m$6-1Y`E_>@>Pc~w}x^V})Nodf@@b65uF;p3E)j47WC5kvp_vC7yugg(3(nTr=K
z61Q6(haC@zMig<41@6r&!^~1~v(H+AJuP`0ZZ6_%;j_p$tjXpZF$`>D+}UK(0A$Km
z;OJtZWEnnI366|h4&_~x>YgepA#BdBTnWdfVp|*M?M_x-U6R4KUP9D0uo!9xN(yx}
z#j~Y1-z#-+mHTJul{vxmCNG8|O^~g~!*5D?*K>vecz@!=Uy@vL{l*O&RAXSzzQ1M~
za6KF#R@5uC_GYLSs_MTZb-pfco2W|eeOK7{Y=}7WD6g!OiVMhh8nWm0Y`@8j$`oe^
z&y?@65mltEiIrbGr%ijC()3Ah$&zPd%{uOjw6v@#4Rl&3bdB=(+5$yA*;TiRqLIT~
zBwi72wYK6dFsjZ}{Hoa=d7^6r(1OUuB+nkbWf;2Z%&(HV-7C4{ighzo<4yQq=NXa`
zD({Xy$m9;xh|%pp%v2Rc`#r-Vqz8`vSrf*cCZyeLY;;li{>of(&yD95Vm*?~G;ki!
z@U#RLxcphbi|U6)Gs6-Sjahslv^&-xdB<AAlvx{kGP%(yv0vHS)R|k9e2pX5AD}@$
z0{L!X?e`^6m1n)ZU0x`vBp=DL%o%-Z&Q%Wei_xDPs6Mj%=B5_Jx3?~wzQU8aFK^`J
zEXC-Pp;y{f$+80L>+9`zPO8#aq)H(h9cw+F_Gk5CdpMB*&``LXB~Y{ilt5F1+x~Sv
z(qQGv(5R>tJe6|06DLlTA5p`kBB{iFN#Zow_i$R7M<4J~GUc1Jqm?&=-+d01P-Rlu
zc>eC1h%59rc&J6Eq=+(qId?%AYfwBssdDu88<=*@o9PooZ&sAHxC_C-RgOBZne08W
z@izrh9{gf><)cO~syVCZ7L$#uPdqQl7@VHRqLgQv!Ujvrknr#&<gof1+7z75m!w@I
z5Tq=SK_cCy<SV0wZ%0#*$moYyjyI23SGKfBiHYrDIj1<t071f}9quk?hvs7s+vfYa
z<F~_NML1^(TxeE~8AlkRG(@rED{`x9N-)XiC9huHWJy%>GcF9EhOIt#=~6J``9yWV
z#YjzE>j9A3pZpj`&{?@1nO_w>A$C{Xv4ay)N2%Gp%uz{!XF%4soH37Au5}IAgCdNx
zm;eD@%wFwM`Pp1QL(0oM&E-7*MTsDZ3u-8_j$0Lc=Z+DHXGwm3fqi@i{*ByxDP%YH
z$qj~CjG?|fDfbNN27Z7#$dM8__)j>GGd8WRIMiMG2D53L>$WTTmcL({S~`VOkA_5R
z8ccj!KI$x9tiNQ*NhtA2sihHQxOh`6BbBE<6F|zRJ7qoGaKyLEKP7LR6PI4=Yzspp
z@}?MzC!UJw=42#^xSTt+bSk&a{GS%!_s-_(m8o4il^r{F>`&AH_ArjNJV9y~&XV7@
z-b6=pY_;k3k_{btq}8mCdG0k?-}H@hv7K?suAPU{hooJr6i<mx`ty$Kl2k;!5l7tm
zP+ndQ0+2t&{a1#^)2CSX*uB299C^Uq+zfjTM3FCxl?}xrntOa(3O|W$-}?J`Yc3tj
zHp4Cy)jErCMApKoa7Od^pmiN$(jrYVj`bcReZ1N*P$MxV<xMwdy5G#3tCqUIq&#i&
z=0_7v%A$<M-st>J6IZzn-@bi|-BPBRbwwrVVQrcApc?16^Or8|TvfLdbxbxI`Y97W
zt2iKyF-EYp!Om>v>&^Ryxm1oU*^)lgE-nJ8gcUM;9`G)oCzIJ7E}s%A90eJ+Cb{T!
z%C)Y2r+!LlvgiB^Dw(=6@wTsU#UMj30Gn2BP#T0*B}?mITC1yTp0|jKb9P4Ar~I-^
zCQB)I4JxfG0QeFaZsmc2Ybxu%ky|&A3R5*tAEy;#vG}xgEz7zQRMZKldFQGhxNC1+
z{o>NZV~?*=&Nvkqxp?7dC2$S$yBCzD*5T`{83g;1d@;$XeDnPI`>~(6P9vA|o%2%R
zt9R}Cy`aFZlM^^|K~MLe^TbmjMHy(C6cImOR`w-={Wfv;l|S7HEC|<iQRpS_IGi#X
zyQD>)gTD{%-$gETfN_hh{sYtvAXgw>qE?`Bg7j$eg^B)qAJ#f|T<n-IgoNMa{h5X0
zcw&&v3JPq4drt}=u}QIF9`38mICJshO`@d@plZp>-HGw~y8`7{?O>|yTnF7b>hi5Z
zX|i95y~B)E*_0J^KRW79Gwnf?<d6wv8@4u=wuX_+JIyv?H35$-hien(QJ7KWDF3Ly
zvD_d!2tAsUkQ<6hP}MKOWa5#=A9WG#jX~}!D7M9D%%O94-_1%q_VU2ldu-gIgl$1g
z3AKJRjPQVffayg7RVBODb8$&rRp@YQLy^l@?HxPr*x&u~XG!UlVH25+kR%2tyu5;f
zQ1CbMIt0}(#cAkM`dgC%Vh?GLfr*Lhg0OE>4lQqA?wr(r@L=okz*!))fPYI-`+Qmv
z@68xIFp~URk3MQw9EXIp<|rQe29KLiapa0rnKS<!00x5)CMYW><&+T-a-rlJ1WbDS
z_A-tDrQN^ph`Ep_hAJC^n(EzH`~nvC&&Fm&4jk%kWONaQtUP>(z+=#q)qQq7d-g1_
z>{by9#hOsdc+E{~X3SYW6mq!xuM6%k*iY}7wKX;CXVO&J<r_0Eu4~!k=oljqC=kO5
zbLX1zX<||FgOd|(e1I+zr+{5aq5&A9h>xWCp^9bX07(W^>IG|Db-}@FZo7o5XdZCz
zQ+1Pv)|0~tT8KW|UcZw6`t>c&ybkRyy*%(ugOjfje8nGQI6aYi@})mGCsEgP+Muer
z&K}s-d93-?)a6x%dV1!Di~MG7(Lg?vEkG-~;lub5dh%>3t|ikSK$uF+-g=G1Yt{9F
z+}vwOjjP#y+3|o_8K36l_9m_7aPT;~D{w_Zs8_T|jAv{-$?+KI@#8M&F<6c@u!J#K
z`0gJ`rA{tQ0$@T9<@%z_N0jEQK@H+ES(AHXj$dZ2Z9{cld7X+=wSL&DXwA8;zn&Mz
zY;vA8<SA<h_eDnz8RwnT87LBi71K4o%ePiA_NbDl>D=2YMrSSud=d@69P~~uUqc+%
zwcf9F_Hmyp0uB8IMJmtN4{42q?#vIXSRg6Eh{Ka+`@zYWD1Qs#n%Lw(s#kG)e|_C|
z-$@!vLlLr31R3C+jC?yp95Tc}kts&l8yvOu;l;Qqi`0ymM~OLmeC_2VF_NU`EGufm
z7qE(*er|s3V%~nI4y=Q?6@=&jI<-t0maVN8?=cal$Utz!Z>5x>D8iR;8zCp$Bizj-
zHe1JB?5?7jx0bKWUSWvSIl=+Rm(zsHRf~4*x+}nAGKgx1!RF9@X|QTlFx(c|NQcXh
zW<FRHlr!N}dLR<->i|TY5TfYI3hz@p&A1Hjskv)coJ3n^Y+PjUtDCTU6(}pRah2BJ
z-*|?n9O>LAb29t-GyKL4O~G-%xr{OG^bIw245pm|-qj;OfRP3fMr$|`ZLF+LL)jqf
ziG@sV0Q}Z5KLIAoI&>14Jz$ELW5CyMu#QEEPN7UrO$d#1lg_i~ap>WuGxoo2Cw1;i
zKEv%0l@9&?PqVTNTwEGA)cJ5&QqyRP?JXh_#hyADen?oD7k#0G8W36VF7AMJ%O}H7
z11gb$wc&k$vywg~>Uh?FjDBtd36=Qayh)<Gs=2R_30E^1Zd$>4+rf{x8)g`J-bUUk
z2EB^pgrt_iCFOD0j!xz#GStHc7p%mZ@Fb<YU<(p*QTM|dG;k{2ymRN($D(!ZGSU>w
zzU+{9tX#&CTulYPRms<T)|RVOLg5}y<wNtt0;*a#xA9C1C8ex_f&u*?%PlR-$ENq?
zN(J0RfsrFnn(*y~#+H^hhPi&6MtBv_V?OuT0NJ&8z<GJ?_kY%0#<hi$Aw5n@Lx-e8
z&X;WCb-?sRcD6qm{jm;FzQIbdMw8HqP|(?I+SKjlt&~Ar1PelhQ;|fl?2mgimDtB4
z0uSZ$+r^m6L90`;UZX?btxUSKGOl5Uo2nmrW`PTUPcr>V3+F<C&ZNyu*-}`ylenA@
zJS<aHC%62przg;gMDxSBzmfl5ScBcFj=h-xG{}vvlNRf!#z(pVo(V*JU`a~1G^vL~
zK7rh39d+m666oWej>EE{{ifJ{jh*&xGi^dHY<-LPVv4BTPb=o|cxg8(n6mpz*JNh)
z72Vkq_$$HC#3Zz_vC-rC%+NepA`^5wDuWr1?()PPVRlR%N6pjMKPMI|vU86odbI}a
z+e&PXJ>72g-kd7%z(8?i_}C|EiIb@1dwgmYl;27_JG&s~Iip9765oogCOcz>9z#V}
z2+2z9JZ_xGl9@qLW8yYSo&-54qvN3a|NdxdfavKEbI+UB^+L_gotzLjcw=AAh)RDV
zYz+~j08v9M&1KF7a@N4RZs*RNS+~1~_y`hq5T^25Hf3&mdi9lXnZnl*6eI*nfgpgk
zCI|wkn!8jP_vAE{nD<PFD+1;os=CWa$mCe8j(|xXAzo-axeMEpHO!NrJk$TcCfcU*
zSdgoXnv0_3W!B>bldnz(GCsUyxhgQMt@c&qJT(4^P(2~Ua1Og*+6VRQRwl2BU{%#a
zyIG5k#72;(J4Zxn;yg`fr&s9d#iC``3Ex(D1{lc}LP6l_h}WiIDM(HOtwW1}lgsdX
zYxkdXPBP^euz7hw+T)vJj-5PdNQZ6P$+UrL>*^ZNw8HtD=vLL11<Q#KrZx8^F+9e;
zL<zcO0+2f;CY5;(Usmupm+oKc{n}Kvk{lvOgk(-tG7Lg;LPnX+(xnA}Tj6LP7*DuJ
z7}3qx<L}<u(s;k;Zhz7QQGLHeoo}#W1?kY#$FSJY31aJVTW?IBqJ>>*xO+>5dp@D9
z0n|YhPRw$LMaUL>Q?Y%L?#lzM6@nFnIV!rnZr`erg>NhlvDHc_0?MB|KNhd7b#ON!
z0qEYnyA)2|Nu?Kn=P@3!CUq`Kc8$jxA`B>@63Vi?@$kz7KjOfAr~(&>92|ny>&lfk
zw%>Jz_BaHv%`#|n-n~**kU06z;6WI%r_C1?*F!NK(|LM#3OUmC$T)}6aAc_$<J}>+
zTo`g^uzYy{T0K)vDAVdI>yIZ8-|0Z25Y1<CIyKf?6Tf7^X?U`LIa1TsKiV**1d(<i
zBG?I2raa&E+;;FV>Eb`x`|A|ZLdKzFn*aUJ7ZlwP1dx8W-2^Ksn5}O!&_{r`d*OhQ
zHJ}jmh72P#{7zpZXXiM<iXdG|D6t20On&%e6kml$+vwg}*J@2hP?X}9gx^vq<*Ij$
zwRCC4)`ZOULS|WYP}+VOYere&2Af@Ea-?YmXwY(Bzn;XFAscU}@5UZiSoM2x;V0_+
zK7tqE)DUa|UGg`m4Fxoikg#e=P|oGvg6ik!87m{hcQ3k&plyMsW}inAzGA{Km5~W&
z{MV*t@HrQZz#3vB+)g}Qym~tr0Uh5?6gwh;hNUvty*ot^lY^r)Gm4=Kw&|As{Q1-4
zxj%!$?5#(4_22&M@s!;GW-VVYNXc8B=EWi-7sRi5)at`ue(e&wNw7g;9fAcESQzZ5
z1QJktj$eyD9S8#T1hxtj&~IgA>Z*u>P#!RS$#!BHiMvJichT!+T4N5$uANMz(vThk
zWu5!;eouKUR<s7nHUQ=b#sVM3rPw*8UTw|;*==S_T7hd_au9q^wOPvtGe;<C6|w=o
z53F>Og_`wiB&Fa5Fhv`eteu{65~vcE4I%5FFqrRhezJZVsp68Jb3S{lvX;j$uo>7P
zAHxJ!i*Cpf#LZ_+y%&Pln6Xm;Y3e!LLV#Cz!1mjlez+uyEF#Y)IiyVfyv^pMl4O}3
zca23j5`?6HnX7Ak*#f}qnbqlJ5DP&F6W_OrdF%DLdP}Pi`Sj>??0VOC4b+50q%PzV
z6IslYFJ%07w=Hapqxj&+A$3qFb{{@#vzlOA!Ogt<{7UV|D-+ZN=tZhlxS{!t$f&{3
zg0KoTqh~z_u|{)-_pZ%jxo}6UDHP=89q+isrq})O$vETKkwC{<=6s$oa%8~D%S{Y^
z7b~*ful1Los5ua`*E6zKXk_jWlahF=q<(SdB}}3bH;Lp8z-^nZf_byNdDc1BG{@v#
zoww7QKOdT#+jr@lb<aG8LYb-leA~96w)jGAmx5RXD3l2Tdm^S-9ys`A_IpRAzo^Zk
zz1`j_y_lXki6V;|(eSA6(%ZWvUF5(-fY+;8J`6QDE5H^2`l(iBUtE#}+xAyFGv_~2
z{^|4QGfy?K!r@uK^aZnIi*9{o!<gvj6*K1TqpeIJn<6yUQ+<c>7&*F#!CgDan79ue
zIB?vl^qh%H4w{97XEpyjFg4q~xxig3@pkGqafFe8;b^+z^uj|m>Zn!((<dN*9>_{-
z>)?c*Z`jEZ1-85GR&$T|HP}&WV903{G6~P^V>T;J79nrRl6=JKVmIp28CCUi%p<~l
z6$FzY?~B`cd-j847KuCfuz<Bl6oEHkg_xeF5B$u=8Bbk?_gp@UUVqJ+H^CJ+6};@Z
z`*Yi{)d^8KWp-f=uESg|z}7)Ts|lz}=-{nhT(vhl)nCHn<m#k_Gt@0(jX!+Z@iV(a
z_*D?cX84!RM*B99GUDFa?$Vu4005?Xa)FW(R$1#KZ*uks#A&ZeKdpgnqCloatps1^
z*)nW10~^G29zBW~k#*4x1$1WGFbyo-qi-Xet57>NyxrGxO+&>eU36Dr6oB3Z@WJwA
zXnPme4+4rHS+pf_Vv*;eVL|XZ;Xl~`wGl{dw>xXFBgO=TupgX|P`IgKFnKQN7wP2K
zapTY`J>FDynS_YIm0|yZiHM;HoSMK88lkF53_Lw#U&4d~NNWL%SBGEF<R}-FlD++>
z{;5;qc0Zq2w?4iY)y8<>VA)smBqdfK>NG>L-WUihpG(R#T*3JtU*as?Uq(VgqO!MK
z+a>K5vLc0Y2)Uh4e0PX1po{MR-rTIRjZ=uFPYovZF(76l-1A(}EH5q#O;Qy{BPtQA
zoXJgFF)f<yy9E6yzlh%O-n!()d1RByt8@BFNW9hVZP}(WwxvF3Zg6<`v%9Kb3*Cp!
z`5Cn_xQ#@wr^egm)+%(C2=E7+B$HaUV8H>ft08og31F$iP7Ad<FHg^_0*=E_!sXq8
zf%WdKKip*}Pc|gmP^$<$772IKKe)Ko5J-vKkRw|V58|G3(Xh2EP^uWToRE;X*Kl%b
zU$qEFTBF>cEy|TVJRQ+dAto4gIIxYx+aeAB&Toy&e>CADGR+O8eYqXJQc|>R>!e6r
zpc6o-psYyq1nkxDOu4QS!DBu_P=DphAn2~0$!mV?N>QxYfz#4N6nC7dh-UJkR=i**
zVe4o6K3m&r!!1#of3dUwN|CgXPi39G&Hl*)tFfBal?StW9J5b_FiCJ<FAsF7$EC=M
zs$PBA;LWw00brF55qf0Dk7vf<6^=3z#_Oyrar@`bACHcMqn~;<RS^BRr!J9hBjNF8
zPjVlL<dX$8c@rb<Ugc81#W8adoq4csR}~3~>;ZlkBsA1HM1gQ`vFo%vNl)p3MQbko
z{)}XS4C|YA18b1z`@_0ONT^*f+Q;jx?ASFzMMSvTS<KWTtBq~{d;$IpH5al}_1B}j
zax4h)zpEzVyKVLznb|J8=I-|*TXG-82u4B40%R%r$cZPwmFRIpz|xn9yus5V)|ig5
z05XsEJ1zKI2^Xn#G9GKL-@Ut9T5;1<6D_S3z+bQJXUctJJ-1VxBDsGrb&4%!2<1+c
zkV29SmVaNmAFuWBAHhe5(rc@QW5;=dsROOHXY-yG_m_|`efOj3!;hwYD%~V1qqskR
z^7`$c<Q+0VLMSpW-InAnRv%i_QKDXU`0k(U)C_Joj?BFFc#?#K?>rBd&31A)A@+lg
zI5VcO)|m(*U@{(>`0TZRJ{x29*x8+Z=yBP=4!n>ZWqjo6Scw7>$_OEMV8q&V2cZn4
zFhEiy{pgJN&`twhN=hU@<!Kb@uVW1J#ixt;uWyy;ZN5$?ZVk(;+zwPz<hQ*Rw&(9g
zTol`0mR)E-nZC26$`R?vg{gfdJeFLQ?Rzg4YaTNJi;G)u41m(b#pQo5e6UwM)(O<M
zLRJUCcvCQ$kpCpf*RuGs621P#?T^S71YMVTm&gh@s||3aKIHEg9VKou;&YM02omK3
zR3K^|D!{cpyGnT29*q1RQ(ET`-9tg5-jW!Mm)!*z3fng%IpW-4#hURk5(N`ac95u7
z0#%4PBUVQE4B{w@zSEtoTIY$h94s3BXTeBk$M@#1J7G9}z-fl$>a*0q<mzqgk3l|&
zlTNc8f9*e$2f)82B-9InmJTQW7y*FXm1DP{`|0}cP4QRlq}PeS$eShNqbVc^B6=)d
ze4JEE&@Py(O?ov?tngm*Z(ZvA=SN)*2qX+eW1MQ!#~J^jK2H(*f4?pMzFqJC-m+4(
z|3i250TCY~Vl!)9C);uW|L!!6{Cjt*+3wW2fB!;4;?Ai3m-rjCPDs}74-?;$Kg5sz
z?<<HuWltokir@VCr_zfj{=Yv+)NB8J^<h%y|GoNNe>NUJp>h22`p(~b{r%a*_Wxa-
z_@l>~BP<avM?L1ESVa;N0Um$;!=K+vs6`O7`0f5d`Tt!;z}~+<FIn6EVvP6SmlP;H
z5UcaQf0FzB|9SsT%iDSXdBI!bh5voTvB!Nrj{N&3j$Qxn?>O@Rz2g1<T@rWn&rSV*
z?`Wmse|MC7($44a$9ezvalHaH#207d^y~G1y{Vnn`TqTey|n*N_q-D=Se}#htyWzB
zyFS95{(BbF+UWlI3=i9(|EHTGsc-)0ae6HKdz=6Ktc3qcw$tC=^8f#P<|F^9`;h_h
z;5z8<U#<M_TTi>?_xD3J{!h>K@6Z14G5-D2|K4BYTxng;|E-GfPJb@2SDQ%#bdt=(
zawqa-{2#yURT2OAr&Ies-}2AziPB^LJhBmwZd?AhpZfoI6Rzg{{rNlj|M%6~>HmB4
zb@zweYq$ZQ%uz=z2#g|r6>cgo2%92TC$2DYuiD>7xVom(3|Gc<Hq`?MX?v1-@{5r0
z%c({uqiNEBjQ*x$oT|Qm|MakQ@Sd12MUM4OH3eCy0}~*QzO7&Vqp4Z2mh_XZqbs?x
z&-da#UrO!fh|G`cSW=AZ3I_Ts-fyJo!GTfy`qQ^xlUF?{Y;-Pj$#6fRQ&Z!ca7t|k
znCwaj4UEMzBWj(K$mKUFn%pbsK%@tj$;-{X85fs-7in6%)nKT3;D<s$OWU|NnuiAu
zUakJ$Zob!W!0}o~UQj8BV{#Ib|7%py|4}-kstOKX?&g+UU0p4zaOzSjl>djS^MLDl
z58HoMk~9cqL`HT>$w;Vd*<_@YsH}t(MM>MrI8hRn*-0u%k(5G44k1O7WJJqoIj!gY
z`Q<$S=YL*aXVLHb`+mkfuKT*L8{CKg9a*p^{_O0>VJ|Cd1$`C!d9-QbqQXmRy7n7(
zYP((eWX+7E2VZ`*E3YmcAvXKYX4h5D1Eutuo*taNbLR~@InRl@2BOqUH*PFj()2`1
zDlOr_Ok2s4^i@`}sR;v#{H9;{`_Szl2RD~I9CYR-GzrIsx-Hb;bS7VpiCMaRdjiW(
z>Zq)wWMx{bx@1rE$j}>`LGwi7OdEr0l4@<@E%a_4u20}gKa^W=aMqpft9C8m-CCu-
z_;fw4{86pN!`khipWhr<cD^D$znp17kU>x6m+Ph4i<noxu~y9bAPb0J@rz{i6}%R}
zJ@tz(cHVP$|8pvFDLvA&)JUX3^%pN%)StZ|4KIz$R9i&hJivQN?{<5;CqGS^2=7_1
z*8H-;rln?YEiP9_D_wPRba!pF>9JPg*?i4GDix8{!*Zj|E>a2mdb_^j|Cr_6fjoh-
zPgg}7)B>1@RpL9TA)JB7F)lM~7WcTGc-p44TV!wZ&5fh4oTeVLsqnVBfDZvCNoDpL
z3?M@4r2T4jdm}^EA5})xGOLx_4it~E?)UeWlwl{1PqLjLzbRTAj;aAX>gD(>WSaki
zD?({PkVa#*n)JUri~bCfJ^C+EXt`ji(4IeEvVa70MXFnhh0E80;(S1&$TPnA*>U^V
zfLDpz9SdyY9~L=0fEU2u`SsONdgU0+=*=uQmks0%!^z*vEi|19&)<GpSu1IUplIg#
z{cmvtOYG0t8i*gXi4V52p&x<dk+Mb5>wS9Ejs}E0;hSjeOnE2yHY@wD@R3hTxD08C
z5P(?z_yDM2QN(o<&&(s#SPb{EYS{Q!|8W5bIX=-`vD>Ej=9X2DE7$D|{rl6f7gtN?
zm416tb0zWilV?}=-b#rXSv!%ak4|PaSHc*qmbq83K~}j_6tra=ERQ;R4t#IL^@xMi
zZbIC<6G!t}r-HDtC#KBsoG&UvXp3)P{yXm~X;uK=5Jcw{Ne<x`OoBVl54LT)JbH6!
z$;6o4VKKQqzi)TV%}-ug78i5BdiL&^xPrvX5xQ^Edo){=)eQT6`c>iPgcVQLo1A#O
z)^rmLIC4_%<xGw?kc&)`F8dG{mtAJEEpj$WYssrl{Pou*G#4wdJ$Qe0Y!1o2B&6i_
zvOhuPwjRf6R)K~xz`xBF@s7@r#stRBX(YdzOP8~t>5JeWvXLNKEUl{R?Y-ChhDyMT
zK=*?&Be$nS1+25%$rzZ?8>cs4Z)$o~_#!9Y%;)5^(H9shn)^mK{lhDdt6NfZ%Gc_z
zA7LhnqEydde-*0``x9!hs$d+mM7vO-1UDe<>M$x>^X!w$j^X#Z1roc2>Ex}NJQ$D#
z4l?71zP-%L8!WD4z`Dzgey7*A4Owc8+?SYy7+<Z$hwgbcdGiZ3*sP1q%!YyKP_c!Q
z?$a&aZYjVtuXwJb@Jf?yuO%Mp<q#be74-t(bl^;KV61_mnvzTZ;>{Ufevw*_%gYcK
z<k_4V*)Gc{K-`!(10l`8;G|2W+Vm60kHS@i^)Fl0?=R|hoozlG#XK_{QhaTx$$nzd
zVaa0ap~`SNzM(}C7+f-+ir{6_VUl-yu_DH!z1LoD5W6z-rlO70^pNFVr;i5)xA?h|
zzEexBMr4@q9vrv1df~>fscb~dFXfOD@nrb;3@%%_6UxvnZcyDzE8drER#-r)U~Z{}
za#4F~uF2`F(;;cvQ`*%^V|>QyIB_}c69tRcB4$jUu_XNMgu4q`5KO8$1kPe&{+u9x
zdDRUzdUzJs!Qy?MU6bcy_8(HQO3W=?!Xb<G71uSar=m?wEV?{<v|fC5>}(TZJZ-HF
zt#Fd}TXbMIp6Y2+8zTmqkKpD1I&P&AZNsXvO+l|SR`>?@@+w{pw!AiB#S#k(%`;EC
zmlg558}`tse6^R+ETARex-9#Fi)}uvmMrfHw3}Muihg|MWu<!Y;<pay#giy4Hsu?=
zzsc4|;o7#jlj5vApyaej5@UPGW9V!a+fXc6Y7w(k^Td9wF1<&kSAzP9FvDSH+0uT>
z0#p$m)jn?7nuM)`_t&UeCDH)-M1*R6r+Bq<mCEgTw%;#=la3FtQ7(=+p!LEol92V8
z(q0p<p+-X-+ij3Ue&N1fFB=R85O_bO%O;yG+S3MZUbLu-WKkaHxlbPay}t2spMbd=
zmIu#l300-}a<9f`$tfrd%9`Q}b$<X%NbLvx#R=T&`N+hvUCXg55yxuU*tJ?iEBEYQ
zw{hdSt8AZRkNOibEK$TT0tHwMG>`==i!0_O!^d`0Qc{nqt{j;h=cD1E-OH<u)SfG=
z`zafFKmBqnx3p7@mcv2P1HvHKmBO0TIPjBFj&1l+aw!@QvP$`Aol7EuOsj^e(#0m?
z#R0AO8#jVmqGTnnU_;Br)+;1r&bF>&lcZx`i6s5XQm^0=*{GaZEw309YpL+w<)c$F
zeK`o0Xq<RFb*+Y`zeq!$b-mv-u$~@)Yr#{OGUEx_O-tzKH9uIjEp;-YwQE6BFUCm%
z!7EEK!V}Mo%*#j{oqj2+$|a@Obg@qAd$n|A#Ey*VxKef#$1Lr9>?;5HJmn^TY(QJ#
zzKq&i*&y8~iEl$QPV)UY?jcXQm09tdQYF*Gt9S+5C+bU5sL3Mrxg6|&c(V8mJ8_Go
z`Z_AwNP8%)hQ2jak}SRNXV-n-6|FZ};v&iPmIRrD^ERwKO@HB;-8I3q<y@rnOHF;c
zV0O?QzE|(UV<vH2f>XZ~Vzc1hFk0^%6<sygCZdns=rT*tI#KVXRRAHv(zq{t{Suij
zpA~1>>FmD}){;cK$mn>c+3EC@6I2jTq$}{J2?<R(;J}TT&EyN|S=@ybxGDd>e(I(H
zPzWV#70`~scporeQGp&ga-R5}0>dR=EpW)_WH4S{yy;&`bHj(VytS8_OH5mj{BrqO
zLCYz%QF|zb`L>_XB9bpG-i-_!rew*FA1ki_h6~v1-<%k?^FWQFz*GBnTL5il0f#VI
zLqKYH^+c}6wVl6g41Rw+21nvs%@;t*f~FvzoR-iRzSZ1|I%13W>c#WFCYM+#Op@8V
zBzQSYcB&MAQ~BZ*S1^qnt^2Sqr(NgrjClb8U$&M5!vYzd6JL%}lIUru`0=-iXMXuB
z(eelqE?7)}pU<fi7nW~|ouH;r{()yWm;-~A{_x+Q<g}p0W~jZ%k~WMTs)L^p&44pS
z*%F!&I9x$-aZ+d8u*<I}V-gq?E@0rIbnhNIHYW2uZ=+W!3iW4p{qz?F>2)xyL`@RH
z_UlbgmAZDFDz3!MyZ_~1X<bV@S=kV(363Rag{YU}uPEhBm@wf|@wdNeZu-{3(wI-{
z1R$=<wT`s4i-xT${Q0TVr$+%y7c}))_~(14XZ4ja4f<N`LQc^4tI>8lu#A9!qFFmq
zECd`VT9iT2MU7{K`AyxD5f>pG3r6I?*~d%bpH6%sG9S?$Lgi!;(|{5uw3is&D@*Ul
z>jcnBE2=7#vz8Dd5PQ=dEGdqF>lyZ3x)fNaRPY{9R_xlX+Xd0}gZ`UA4;v<6mgX#7
zB^sI_6r2;pb8xoGSJo}*e%~stmZ4VeMO~XOX2XxqO?daCzERXR1%YD00`-}KT@P<%
zg4(<jr}Lq!66hhou#nsU-ylL2K6P&ITc^iun<loG9}A>1$lRnM1_mj!t&>(@SV4b9
z3)(u~OcW?5HhecO*!Am4!OuEs62kAkk62MLEVgjZd@=$!Se`%MB5jik=W>H6juHLB
z4dJBECs`tc^Bi^IvEXfp8$rroYG~*obYtj+SDU1V)ay$iZo$tK{YAur^n#f(^(!&K
zr@SmqlQXwxgRn?dYu4Hj$5`zA)V1;Rm{dWBf(kcaCgLQ|(Pses5^P}+Qqp$x`6*f3
z2BIcGV;3c2$ZXqsKloGrro_MEremarNsvRf=RLu5qjvp5;I(Y}LdtlUdCV!?62Tne
zP$fW~-G4m=&G}2TE%XL`7KLj9r^?aoBMDz1O-SnZ=qaH;vQ$)1M<b$RuWG$}x51{C
zd&80xy!vr2N)-D<UobXhsi3^k+#zas+S&^Qw5R&w69pgN(=OkJz)M^yT9rH%CaPTX
z4+m)#G<Ej38s9p!e}ChE2|xXS-S&=BW09m%`mjjIlT|GXdJS(58ZxJN(EA$BzJOSt
zd(V2zW{1>@1}SmC#BK@b7k_?qz=_8LQEj%EL~T}Z%toxby%Nr~ePZ<@kZSQ|Qhq)f
zMz6I<e<?7LW|DJ&o%6D*)`<l3>e~Hq-=cuJukGB;o<shy5HA4VRnVS^qm!!>ZY~^=
zx&7k<CBa!nyy!W-EirX*G<OTcWAerMf&TN6ffY&JUtiwQ-auV;;7mbZoag@UB6=)!
z2V37*DfqXrx<&1F;D7-oG`TEB#^l)w8hK0$wmmfveG^qQ{!DeR%jlyE>4QRjnAy;&
zjc%#CCaxNDKmgQ}HyZ}2wl!ED{Po9<Ymm7_|E8#sVeDdXK!fSnWa!31)E=ogf{Uwv
zHF|J^6WaK3(4=hGx_?1!5Y(%}y8B~*T!#{UQUXk=er4vuiimeNz|vA26aBz#PS<mU
zB-BM=f_Xc^`Gb);8W~WW3u(y$@o;heL}w+6&~fKF7JZ}By0x01bW-_VM3#n0)3tw_
zq-LIfUWW=F)mHv<tk_$s{$dDKSMT6yqlf=7LUenv_T-ZWY70emjE5W*MU}x5C*GcZ
zrg_%yT@wWXStMT21TMmTSVvc);fDMTegOfweEqZNT&{<KAnHK6K(e1cHn`VR=qEkY
zf7++c%>3%`?l&H#SoO3v(Y9<8KHzZM&db*KI(pc9j?<X<PxiG(Zv=;iY&sVD#Qd|S
zzM1_Pvj=PA75fhOtF27i&Yk~W*DB;)+(75I(H8FR-(SRE)wVnkJ>9Q%!!+yVNBitn
z^vPaY`Q_K|ZiUVdzE!&vIVJmET8Jm?*H5;coX%9mEG@=oFz!(bN+Mr26WS0+Sc;lZ
zCqvi_S-OuuLsDhCYpzVwue{|+je*gL#aP?>qU>2x?Shc7)MO<1NaOvH9=e6yN!szF
zpXhxs3DKKVa=dEC1p3DZh`a9S<RoUA!bPZ4m{9$45i_TJOf-=4Gv))J8HicfY_ft-
z^n_!mjpCI>y&dW}AJ}v$p~?)Ip#7)n<fR@71xZ_sFYSzNP}j&X;ca>hzHJ-k(ErE`
zAx)jLKAX|QFTd0<4>NU0wCS+RAr*JO{6fk}{3>@cM&I{r_T;nKZ@10=Qte_Dx6Cpz
zzo^;;g>QZqNl@u)7j%P7@{+dnhk$V8<jH+g_jmKo8k-C8&5|S=X4@SmvHO$q_tnwq
z0%faq`${L9&P0w%-hF(6`^kJ>9@`YQQ1gT_Ix72ZQP~ch)$ZW5{u5rUPl!Myha>8P
z4nOprAJpXf!#&E?Zjt->fqc?xnsf5vZ55b?<0s~c$>@ZfJSiqb0=!MW_`(X8vW%|r
zm=t@qhj$k$P}pi~no=)CMP*RgbD&me0t}P(<er_GSx@wITfeta)MxmE)$RgmdT3?Y
zZdANVzJ9%fCt(4k4KyEa)FTr`J5FxPd2F9@1*wnc9NC^IVvY~h96abZRInmyJSAmi
zcbS&t{ZAHQ!Uvd|-lkDg(OAL#`a)tU`hxZh$dsjSk1YsEUAulg-6r1f{6ckg^;R%j
znb159Uafhqmw|ynl2dt)1Jn9r{IcJ@gN-gmSgOcL1ON16-U*OH`pk=suVL;W%p=8b
z(^S@P(j@oE9vijLOc}m1Ns)Lq10xcPx8S&S0@u48Q+!qD7=;_Z!3|#bhkjP?lJ!rP
zYxbKr`w$GEZ658Ga3pn!9~&o{4h`(b210YV4`r1ZUI+MB*;=OLNwno$QnXa$g2r!O
zAJbRoFcUlwwj(058WsHP<x4L<O+D*ce!1PcbsjnEyH1o*-0<{N@{xyY7{q0Wjv()K
z^XAQG&!6vQ0H0y+r+p;nqtjng`gM3%dvZ!`e}_Uc?e+ngZGuhohCdY|#G<~))SxLb
z`OVO4iYcV)O9>^W9r!>l!xX78nb{Tji3(H^#YA*IyEUJ{aF)dUPBFRrxgS|U+HD|w
z-?I)dpqa1EtvUKtPL37b&O%TJ#>U<&S2P*V$FHOUd!~dFYgs}ZVj%Dk65oH#H&j<u
zSt8$QXK(Lg8g&mk$mVeimTbgtP-dMd)J4f(g;Fkbmy`=tZ-@PVr#?q3{_NQ{Oc{`t
zLe-p|YChCQk$;ozeYD&Db{rqAkSBfz4oq=%O?DWbVi7?4s6Wwp<HpO+a%fTX$4koA
zl8XtQr-^42xJ)xlOm0=~UhH$sG^%U5$^LtLk+d=N4ZfpxmYOj~bRUzqwaV8dWtfD{
z^+oWxFUsjlV=SgxTlWVU*}83;m6l2v;|>A?y<ve%?O{v+vEX-nzL@*ycktjn(j!YP
z6-8Cm4$Of8f?j!RhZ3<|I&#iGj-6>}i-$=Wuiq-{{P}k7v=@KRO>OFmpu!Av*irSP
z!iKeY_Tq&nISA5Ts&wb95(?{QAZZD?w3tYJDe>RP(B1j!xSH)@08UEGVTZAH2`gyO
zVNywFh*z`}-^E28K6!F{iM30L#f}N5)h{0vXV~xVeXT`p*C&i0kYVC(C-ZjY;&<hj
z%^A?bO=FG^=b64(>?+FJKTijIQW)Y9(np#*CtC}<!O_{dP13l151PXbFUelkj;*=#
z-Xd5m+>|l7YE2c!F}eDeLj~+wcW<OyE8h$cVDm1pT3e0R*9XA!Aex&peYzOhfaI%`
z1JOnUZQ>c&nj?th!{qG-VO}T+qlrp6Tc{5r_GC@W$mF4WLg?n>=g&f?G(Bh;Vw7T>
znmE&B0t~#`BZeVC5klo|uTeM%%pI}j1RL#umL~&!y>qMIgkXU+7a-+TFg`)))e=w_
zqm=ObtIXJ8*!KsRhgUDF(`YqDm|i7s1#B-wFm=`WynWCU8>unVQ{Qds)1Bc(Ojq;@
z);!UXgJ==c9gm)8X45xQcGrQ&pZ0QnD<F>0A1EbtW2GBici|eb)3{X01rtlGw`2_$
zAHJYDG&tk;N>W*=Js@&cLroaJNG#^MeG$FOLLC&DFa{3Yfxz|yo~b$n;2j56YGw*6
zFOCw&Chyw1F0k`_DMAYs^Ym@7!ro^nOqxPyOJ~!MN5f()VSGI+txk`-wqgVgUFu%$
z<k~_!)`w;9PnSyW2FHrG;~ckdZ<REz+ZYWEn>>s7deVED6bl7(IuYlLNJ;5!5##*q
z#g4q-nPRt<-He<~ZB>%bIBZDz{nX+1iIn3F0l$9lN?ch{y{xIY?df&B>r$rlQ@y@r
zq4B`j`}O8C;w2WeqJ81?`AITzI>tjT-$bj51m8+8Bs{k?4o*|PQ&$3ttTU5V$&5^;
z<s6nk2k7yseWsmNQVVF0CB6?c;LWfHa=LJPCj5NRpZn(h;_F@T-R$Sj`@P*9{~8}*
zm9T;<#0?#I%Pm|3!F#x-^M?j9fOa|!jt;BKEbB7tEBcs7-eaPQ&Av4I#9G;Uqp5|5
zjC?b;<evF~lP_j^7H1HVMGS&sNxO-k=VtyGxvbu46*H+AUL-YUQWAq&YMXpJMx^yv
zJ1Tg7y=9%g&JCGC{rW|?GUvjmB1I#+5y@}E(yUwEq2<BAl#(La5CaRh{w{Cdmr%c-
zrl!1<!;cSBuO%dO7-WxvM~6&$YM+~DrZaay*6It6=-KPnL*C}{dY1>n$s+Y0T{1V*
za>mS=O0V_++o!KNedoJx!NSO@L+;(hD^00-FH?|XqVlV~Yq*}hX|!p_{YAIyO>5{j
zJbUrt^b*}AQ90=k1`<20?DHTF0WmZ^?8nJ}e|q?3^QqeS*iJs-z4n(@X5Bh786u#c
zn*4;{{^`r#pAHUJw&8Lxgi%V0blI!NX4*U3jSrRY?B7;6-YvyG4hwD3qlhgR(%X*X
z3wj5L;JocEjbk$}wUJt3X6C)!7QSZeY1t(JABY1>*}rm|^x{CnvE3?vLLG74zB}8X
z;1f6d>p*NTJYvwkE1HadZ9EjfKpO$X-m(~pj5nlU?{?0Pj=GeUT1kb5hT{9*m<K>D
z+%~&+TD1BGSJ(Q!O^$cEEVC}Otg!&_rm{(J={8w%gcYLa_tweJ&Q|yG!9L1-w4Y8y
zF>2I8nb$bk_m1O<-m-IY2Ab^f-@o5;VdSmiIKX}HFQxe2(RUbLC{y|*?Mufcgz@Pe
zPT9o&zWwTvM`-YbA!5<|22aq+QufFw+Vk{F#~WKBVV0}tI}IneVzGr5Khmc86K}`G
z31yjf4!=y^z!0`_Jkq>ucOU1A)~HyT)V)SAOr{pEBxW{9SthLDh<9GtJ9_u;4Q-`}
zKwnPGv;HlKRQpBvF0r(=8SEVltxcs%)Z(G-bsvrXzD3@*4Ti}+u_wImWnbhzF54Z@
zrS4>1o9C{PvrUEu&blP*_jl)t|AL!c-$PSm8v1WD+LX1C(PAiBXpx|aX*QIb^KR|b
zx&8ae=664r#lJp4y|7peYGfK3d(GgP5pT~NJ-U}B1YxT=>S^FF!o2?#7+A3NGQMmV
zlprz79FahP0WD^K6}h36O@uLEzyLAA!fwL`V<2WZY2#FP`HL^$>CJslw`41+k6*qN
z(^sCd_U#6Y8gRiOl4fE6;RX_4j(KMN8?q`*xnT(jUD2pS?E=B}Z?$o4iWGEqPSX~a
z)*x%~&_Jzr_a{x$Rb~j8)!u!X#j4N#)pY1x(!7&cI@m;`%vt|%LPUOfHw(g_cf(@k
zPJQb<Yr{YJs*id!S|y^89Mod?G>q!2va~++(doH9G?=-X!YZL=B7*}(Or;pzquv5A
z>)>kPGC_^P<9o8>F7I~znuC2+H!$dpHJwJ*?l7G3G4Q#H;GudjPlMO}{&-bWXFS!O
zob?=uvLeF424u>{2s$=WV+VST>nwD(5^@hP9hZ$gRQT@QA>3tQ%~K>29cY=lMepzq
zBw*s8(Ye*W-neev9Wfgog@v9QUR}O}F_GRqJoV5ad7<aXxu6(>^ks5P-{;LMhA7J(
z3PwXK!nLNam&%kjVvIr8uI>6>8?STr>{;*H`htQ2XY*pX@`MLg2mYRW(GwrLQq$Ko
z%G*SvjTF7{Z>yJK2-3Gu<93PGXT=?cCl{!Vipec+`|47*o!6W88HHaTX~#>fRFUV!
zA^Xx&2X7xA+HdLBVZ?5<MMXqLc5rX{b!H13vWA8)lQPlQY3_#^;B{$-vD>t1s;jF$
z7yn1}=G`56_tdrFK}_vgYO%8&zn^z=`!F#d8R(}!@dZ~C2b>-UtxL$*E?u>|D?oJe
zz$C`Vq=Vy#`F^?t=42q~>bs=6OB4GNv;6bVT&d=_&R0hgNpsfuPsB`RTnpqrFLP^p
z(=a-q=a>rln3{q(iA$6A9Avm4X#tZ|Z|`f;jtK9IRpiJ?NgaFGcK^MtnVE^#uSX;&
z_v95&P{i=1-#*b~z1`V4W6db!zGMq7HAcq7bf4>{t0I?jA+xPdNShAR?sZdXTi@p0
zoex3FMP?oGJ;S-C-}g%I&g-phJVFYBF^fCEG8XV!BE3ZsWil9q*7XOa)%`Ryq*;JW
z4y7MEj*F={Zq4o<=s8Ay^Q|A!;A(rw(25CHwSr9!&pQ9zB?Vp48FS}OL(miG?+{T_
zt$zAw<S3ylrWZEVj1j9Oq%Jdx6uUKrr+xVqe*3l>lt)kQKZ#kY_4f9Uh%nIdb+!3v
zJX3sSmt`{tbS9oe7uIKS5SX_YwSbqeUVQ+4W*@zCOH<0#0>xa%>C@Zq2^r+BF<^j<
zxgSI`Bj&)}OimV~SFR-{7H)aGFU|V5`>T<dK^8FWiN9`B^jxDQmIpl&6r?k={^ZEF
z<GXt_A9!JT2IE<_Y!~Q#Y<@Y7$us86DX-mvT_983L1(xY-8od7BbXS&iRSg07dDyM
zRp2*@-Mc&IH}>}kDLyE$hdxc;eZ{;EKsAT^KsD30ZXE>?oe?<-DHiVeIu4Ao0d5y*
zRIl-lGR!EuORJPNyks-Ao{anw(JUaK%$^Y2f$cTR8fjtrE*QJist8ATtV&g!l(2kb
zchzaFSh3<T5ini&pdp1d2{%I07R;V?e(SSLb++Gk(WOw`>5i!lA-|)u{^J6iwJ~0o
zwUH49cw>j6^mfA7fY}M{|0<Wp5drky3Dsd@V<{<7faz)~cofo}tw@{3x>OAoVI$7W
z%`*i;La(t7QcMKliv>;CIf_8TyB=1zV(;Faf=zsCYMdM$9n;F43iaM@yRlM;jAq&B
zNQudUC>A81n*W;Yl~e~SMReaE{j=l$O1oT6%N(PpCsK9I6a5Dc{B&qTR&s}4I*+wl
zQt1i8+rUYRwYAZjwC4Hxgp`_uTU{{5p>x}ld(qs@iJe=Q#!yH`<-DueSbUpktjfgz
zH+&={q$3@sj%$YoK5A9oeh7nonSDezRXEuX>DoP7Y_t&gGyBE{Gliox(M)$f!@$Vj
zD(iwg)H+Jhb?h^5Qlm0)CGCVkliZtjmsfx2qup_jwT(Y|8S_=_7LHvImE*T>AIkQg
zxuso`dr3{^WWrGK1@|q2uDOFk%n|0^0ne)w$RWzir$t9c=OsIj6mE)TV<M#$YCtLJ
zSsaaG`h@R*b7X@GP*Uj2MWwrHGC&7`!Kt}}di3ZKOOF{^<Gq}cdy|*|6bz-d>}@?H
za29zF8Chp@bMvIFpJWA7wP(-#$}?x9qB;sF)qx%$sWj1-2xw7$eES24Lms3@5z*0|
zK>X{-{sR~rW7zv~3^tbbQi4~Y2!7RWA7%SKX!(`1B5v_ZZ<Ar7RHr|R_QD5&atBN_
zy3}~w=tGDn>%IC}$4A^@lQR>0gK3DFoQMhq;Q!D&ZMeN5d6jr|KqeG#y#;V^a&$)t
zcw#MGk(eQ|`i^5fD!bO*_DDaHs-yE#D1}q}NM|L*U2i~jr@H{CSSjPP*=a4f78GN3
zbl>;U5_);%jm{XPZ#g-GPdtv>`e_!{WHm(uVf8tVr``Tt$7}#&%@g>mHi&oL6~2#C
z`SH`IqV|*OTG9Qy<#B2~kru&21nb7bjcjedY|1bp+`!w4Tb%YJE~c^FeT`!sU^Jl8
zuos-5rmR<1Tth=bt1)c~@^i+{Z~DiafinLHkg#8b)*UP}WOS*$lWb}X2L;Yrz*xlY
z7CTxIGmT~3I)hk!iR9kqszlU7-SJi!NnT6K>wXSGVXfO~Vn!wq^_eBiShn#TVo=;2
zO|E4NUC?m37vNUapE>bDJRQs>10yV-u%Z(fV8|2c6OY?*Q5eob%G3ulod`BzKh6}3
z;MP#whc$$*(?;k+GkU8d=m6-9)xogbQj}y4kt%<Pym<QCn2#eEj0(5}VkU&avGo1X
z>^k?VD|iV@Kv*v=^>&n|vsQ!E1;oGpi-PL@v1r@$<EnjP$l272BXciiP`f>Z7J(Yb
zd^!vTF<)M&|3&0DetOiEuhX1{e;2cA=(72GrFcVo5($~qKKSU`-4i4$**=(nV%k~5
zdGi!{$xokudYizhh#E-{kDNH+oIgYdNT&|XgF{qG1T-naNj9G1C&`-Q_}<)Z3D`+9
zi*uP49dScRriq1|dQs8HoA><7FEIg4Pp=zA3U2&F0^ENK7Il%rhzmz5ut#wJvF=S1
z5KZV5ZR+~i8vSr|fHV)JFyXM&+3X0_SWE7+04ynsg}?YpK=uB!R|rMGv+?m=@a_os
zl{%+pyDBBwYW(M%+P7pi|JH_`^I%-On25uJwlL67963O$X#nH+WyLBN@`G54u^Zpr
zQUx;?z#E;XJ|E>Qo7;Cwd!nWD*FRwzVzp%TKp-%d7hiOUf8Fb^FKD@?j(gu_5NAJ*
z2KnI@w^Tp}h^akELReIM7nX$iUeb=D(eM@pqq<j#x4kj_IN1!s28;_xgwe&%p`^D0
zOBZ9LHO+5&2{Vbso9gD)G$J6Uw7L$d;it*3sJeAVO8euX0M(MXJt3|Rh<tY?sH7z>
z-A}21h3kl}EP=ib)>N34!q@xLzB*p*|7#*2Z*pp^(xiX01BBW(!Z;riQyWLPF<-7f
z-7a?92{6TZ7iO*Rkr}Z_YVV@ozvorWM`=Hn%e8!<J>@^JW~Jl<`A0$QROA{D9Z6Zr
z9CZjlC_|{cd^(9vIu%-0;9PyD4i*kb4niAY|Dq_DxS2|5M0j{Bl7(flmaVAMRn#Xn
zX~&NWJ`mc1XIbsFx**dK8mzmc$3bbN+Q*W$0L!%ro$Fla8ufM<@jlVjYU<*sp_6bJ
zZQc6wFe4<zPdiQm=$;pLHkSI7F`Srz;xX4Zs!|-^Pcs5U`A!H~(Y6$Rsk9?7D@{~l
z9tqPEmR(&V2C^wCDQOSLw6WC?$;*NrMf$Ev;Bs!Ux^2O9v`NZ}669)DQ|ANdU!K?T
zvr~PVT(_30#qisk!yLMO3k?qc>_6x<2B7`3ygWaiAkrlf<N=P_jW%bmNC^zh&3jWN
z`9V>x--r>LW*jR2_1nupNA*uO?i)U5mEB~nSVormhr%(_hX(HZ8E*dmm%?aCs?e|B
z&93@7m2+^cw||9JT5|7*C6^_v?+TD6mt3GZ%b<7f-aF_la-ff>#BsbhbxdOOUKrWy
zXfYyKV?*bH8~^z%*)CL%Sm6w<V-@@UY(nB{+s`S(1`kdXa<U?T{jX}oT}}VAwOxWd
z80)-=zuWfNU8O{k02YRr3|><Guez%|<`E;w+!WC8HJPL~{Pn}(^FA3ctTuOAqL7j^
z4T~12*vNxeZ+rIYl@Bshuy+8Stf(p(mRpoj&cIY^(cMM~;R7UAAc9doRpW5i;K3N3
zT;dA;k{lQD!9{&^h8=qtmb3nbqJV<*kIopdD|NEf9129?^BuCv`Vto3LONgvRnqsr
zeos-&=+V8q?-{zn7(~NSLf_xGsrmSeuc(6<nwVrARy!dskl)=5P}{Psa~{AW-yCnM
z=TGwC|Ej4mCZ}@Tkjr%$W1MW{Pm>vP2g-l19rZf)klS_;$Q99i=$}iIE>oMO+$zJQ
zhm3nra8Qs;<X_E;MX+O;7e2G-QDHD_3o9Try^wpF{zeD*3inof5cfj(%lpKU#6c-q
z8TD)39JJi+!4&5YeRM(&A5LpoNHrG^>jmXb`d}whuJ{=p<cD8x^ub~LIm0UA2i<Ki
ztkkp(o0<5jG7Htc_mv&^%uXtU_RLTE>1@+_OyMwRC#QWYmAp^#seg_@k*Af-ValOw
z3-ApIMcm_X#ruDf$Yx~~8)!^E!Hyl>Gd2ISH66Pg?YnJ!DC0?_q|PsVcu4=|2!t8D
zf*TBor3t3Z3$^3rbRtK(2SqY+7W`1tSF*%DG8tKYq4}n7cd~gND!|^iEuiJW<^h8l
zmU1Ls+4S7PAw;246~Qwbe>LL*ue7X-XF!LUhiP^yG9DN>>mE;xuBfnws`4kLzF!-p
ze5D~ZpZ#m1B?u_Mkn?~EF_#z1s?<qIV}KzO+Pq3vuFMVvWT$MV&RAnBVE_u2SWeei
zkOTnH_`}(T$qzhS969A_L5JI_>g%iv<A&k^IhcQ7w05F(`TF<1t)-%J_L08whpA(%
zDBCJ|O*_8__cY&IQOoE2g=O0@pU|Q1yOmwS8`p`4z-+(YYu@4BSA#hKm5nFv>?$nW
zCRd|(liBgqIWzcsk%K^_zz%kW@_BY?1h2_DL&`t~xX*Eq!xYhh%q}Q6NPXnNr-md?
z@zB95zZdKl3nU`6sGNu!H&lhHDPO?emG0lhcWNq{kJemui>Xbit338_4S9(8MN(az
zR@qgy9n*zkD(fd|e&6Ad+h%ieLHX{@r9F?o)0|RXSu<b$-v)R0#@*GcYJE@OfJkHT
z&r_V8$8bqWy7B@g%k|PXv-oR*cY7s=Umxd_*@w;@<TOOA;}l&l6&vZ`718%&+^UC(
z>SFPaw_CR)7bLD!Xy~z{In=FYZtZVRU0iK)f!pjfy=Rr1zsxP#SQz%Iw3^XllcP<E
zwfK9-T)yeYtvh2Z7i%1=zB%IM)9auc1ls&cxq`hb+72GUjpP@H(`d)K_}7PngFApe
z{XDamRZ`P!sxWb~TbKklfumx9MdLnEZUTuJ=rr!nmo?JwAgd#C*T<hI{Y-_gqxhg*
za{RL|zd971Y~C7Em|Reiua=yun2;*JD7iqMUjzz-j8u#hR*YEbOynQ(ge;W5r*HSo
zP(Pr6k2GQe&HPE1O7nM)Kb)4+yoADrc?>0+bWqq8NmopxfFJilf9?(RP?j}-i`4I%
zr<F!E1ng(;-f3P=5ZmlGc$Zd|Tm)hW#VAe=0jH(rz@Nk%8sAQK7)%7p4OS1U`n?MP
z8{5)t{M^Q3(@lCk35wx1EAEiC0Cg7tlJ`}qit(pwET+G%*=+(uKpHDX+%hhk@CB9t
z$JN69OjFG6&ADG4jT|Dg?J87OBjeW_vNFrE_K)4^qp&FG>m)1P?;oGv9(39z<(=!q
zD?pGJKGw|}UcE}8y35cMi^>tD_sG?J{y6@j4U4WRb_V-my(kc#KCLFfwruSapRO*9
z3D1&T0Wc85p+Z!m4HzKsuF<703N;P6o1H^=shd3+FiEWI{&?b<#_}Jp3pX`o)}*{E
ztOWxS@i7V%^H;43fD#Se%ktf=og!z)v-?(!b}fs`)Ite~V>qg1m0OGYz{dRRoe2N7
z5%p48U6(qjo3tB4mz6bFL|@O{etzM&bEE&AmvoJzwQP8brt{4w_KAhsmmP%}Xv8O{
zTvt@hI{D0fq3wGwRSp9M-t!BGc!liF_^0W1z<~o^pm}pImTotX$}t)mNXmYPlE}Z`
zQmax7zL%D3>V<TY_booYX*sZrNO;jER&R7|@u*w0*>F-eYcjFgDxId35%m=}RCXXw
zrFvnX?|rIwKhSM4j5f8Vb63oKm6LAMRK2RXEA`&=);CkV9o-Y#xeg!cpmR7>$5x&v
zh{2-8#K$}6+z^4oE#!KF^fvVX_VX>vP-NT?CxFDd)59N<GpFWAga!^Ff*W!lDjEl-
zS*Mz<Zfjr5)p6PM`+X`~9u5GRwutxRsxsxKd-KdAP5YsW<d^%r-Z*XEynxuQ(=6wI
zbr|b1=L~nBb;O8*9vFT9oQmTK@8&*MV3@PltohhGS9M|JVj1N|!`1JX-dK5v@`P2$
zAq>7S-1dnm_O#|iP-*sB7<pH)I^c@d9^byKaJIYI=C-%zdc-75>G!qk_?ncr6;6_B
zy$^XcXPUfJpmzcz{3YGg`*)i>@(t#HPVh@@Ih(p~Rcw65Hm5I+Hr;RjC=6IYxRR(G
zF^LjvjEsTXN%^yL@~u`Nl0jp)5Y~^HZ))m;#FV1Qi7E0s{QkL?LE;K1BTZSiZpic=
z_?RPu|J5%V_Hxh;#e$9pr?ml%Mfj#6Kee$Z(mLLgwX~Ff&7-@s2ytq`RDgW+0o2sU
zHe%A-D&I5C`EKcjyT4WjmAT%?JrdK|e~L`o&CWOSuN_Z&c71c$;3G5a$BldSX=?|o
z@JSN_XPvj(xUsENc=3x*TRVNLoP`CA$?Xq=WbVZ-*zR<v{)t_>cJ&rRyPN8YA3S_$
z2ttC>1yFw0;qQf!%qgy{b3V5)(r4G!7We7D-%6uhAi>JxPd$zwnS19jBMb$a%mgLc
zdr7wS9R#5KBf$MGsiS}n1c(73sNg6?^g2aBPg`TTVy8rQVHdhndmeu>qVJ8BvsYcR
zUJ?+AT<z4@Ev=`Wja&NV%o7{qZQFGAxK~X5I?3Z@YU*x^P}nT5+5D4`P`Cgshp-cz
z2m!S7TEze9sq@^m#=LBJS~`e{KYKWdHQ&O@dhUWk(U}PL<0kzPm5yeGm1LaCvfi<&
z=P4oABy6N;X=au`A~%Ks)-^RX76q5cn8olV$P;W0l~aDfa>AW;|5@)U0YkGJi4_cU
zNLFxJplKAUoyK{ZlHwvYD<<I4!IB~BGh8k}LraE_{>eK42W})u^2;9QuzO|HUPbAA
z>C)~r=ViZs-2uZ9I4h?zXra69Irm-2M_<;8c-cH!>snR&T^{|{)*m{tWUw-~Cb7)y
z>#yGxF8Ms>JFcJuMB^YWtJVUYhx<5uS*+AhnwVp(Ep6Ai7hGtQ(rj_N@^PQ_bOxcr
zZgwSqqaH?Aa3nxtz$;-P8B_cbq#c3#*^27b;-SFueJQE1y41I6;=co_%k$?+$s#QH
zu9bBz4A|Dl_OoX%g3G6J?y%B95`0CvMP=y@-t3@2&jkq8apbK%0>^_S1XHnWD`9bo
zdWKTGvOEzVDA=o*RRIJvR+}F~ld$S@@Aio_ho5s#q#a5xx(1Jm=oAeva;Q#Z-9gMM
zq(y4f<)=M|V$X_x>;S79R@TNvzbWlSszPM*hr)ns;Bu6;VY6!WI4Ia(985&61v6|j
zw2=P-cEt=6*`b&S3KQN&TXJ`=Qd0)5-0}7N2`y0W%d1k=R=NEiC}<LbXaQaWV4Ws-
zABPWrPivpV*{-s*3Y)1g9;*pFst!}W<iRiJPu0%mk&ESB$L<_SfZ@R5rYZ(aio^#R
zTVK@S%AW5{|LXVwq1U|+*QCF_v5mRMTQkeYo?G~JQ=55c7`F|ziC-==7D1HQk?R*O
zeD2V+MPiL-3<!^Zbk=vDd&B&sXLh;*!UTm8kdr_ofoH{ygmooGN3mV)!D61lV>vOK
z?`oxa>YZ(N#vC|QqeoYJbZQ0>rJ&EFTN8}}hTO5#vP7D1^@(oEoC}zj8AzAzrTO2Q
zj?uv*NA`K?)C;T<Q^+rlMG8%+y8!3X{j5a5(oc4zoX$+;qHZ?-5+zun5+)q4p8Z=t
zL7#vmoeAdOo_X(bjC$%NMhbe~QM}gp=f!$)4nYKsxOs%UgiOLu11|cTub5B$9wHz>
zw>4u6#c*bHz_d`LCEUdwte$atdcPwMyIUwm_f>3;4o=s8P(_?8s={1jZKOap=Eb%W
z#0&x4qQW~_<5&@N?9Z!1Bym+1ov&dOT@WE`%Za3105*S+Zil$p9fcxQQFZRBOqfVs
zTE7mCd6z~LZ7lLbCZXfI;!V^?_Xv_~j*-GrxdHne%CqF)$nJp-EM{)Qg1<+L$B|>l
zJh_?x`1{RKSLLrx{gVq0flj4^4so){<q62~vhwu_^TY5DyHM7=2S6C>QoVi`vc{s?
zAc_i*e8dHZj`H@9B>MNW$3Xjku#(Yf1Vw=zr3Ez>4<61ygR&M<l3ft3nCW)9>6TiP
zqjCfq!j)Ed;`e$o@syyB+S_ZQd+*0o>LQT0I)ssYMG+dE{&bEJ?4h&{dXfOyv7=A$
zfPKD0vfA$9!h%B2E*r(mm#BaE^_zU-uBK;6_;zT7FLQDj?cR~_jeE$YQdk@G^~CAZ
z0vTP+Fmw^B2qlAF6L+Zy<dw1&9F=|t`b<)TovD!HazwJC6$d15)=`2=?e(vW_q%JW
z+XFuR9xgPj&~Qu`XN?*}mt@@qjIV{F)t?E&0RL&z`|S2YvFtru?_q*^M5v%D_Lm4r
zx5vS}Pc3###xiJT#$3|Hp@FTXh^T><TB`p2WH)dLs1!iAB9tf06Cnq-D*4*s!xZBM
z2#qY!9ol6AD2l!34R$nE=Pp>j+@B^1OC)+&;$2T9uc5Y;9=klMge^gE#(~OcKcnf?
zS_ND6@L?$e0=i^aU*@aME_-gY&nP^NorhkwkIJ<`!Nxkgmx5!yAN`IZgxc>i3&H#C
z+m)36_L*x+j+4Kal}0ChoYAS7kF|ciQ#m*dmKpo~{lFS=HzV3l0gb}X90?3$SfD@c
z;cM2ewQ9NUN8!Z@{uto>W%J~2NO5I?pZ|Dewz-I-BCs_ylttAcZgxvps)Y5o>YgVk
z<nKY<E5Fu&tbQjFEu!#CwRb$6g@Z&_9JYPtA_5}M_IS9)2o@OkUZd>9IOyM4M6TCz
zEDGCnR}547&wHZT6^A&BsOb}WF*Ed1-df3a9g$J7c^+FF^!m;Xn5d{@iDA|6IkBSR
zc6#S$RdFS0CEzBlJ~4v@RnLMU`{kFvxpcPiLn^|n;{*X%5^=ZT>XHRx?S7(}A#l(G
zJx0q4swR)uMoW#)!ca>#aK-?!f$a_jGkZ%;r~P1x&@c^rm}xFd0ZmGR#)0JL9e}R+
zD_6>i>{rB;%B!(ug|<rm4@mCUi~^Iu#=fUEEKtpP^SUQrYd?#bxVM`dE3HwKq8zqb
zQNo<6QY6U8m>aqHs?Ih=OxO=xDAGkL;_Fdd69q|0z$Kilm|qPzi0SYFol0IQEU{AA
z{Q;eG@d?kkE6}{j3*q>PpbSa^N4Wi5if#1#a9<@aIOV?9F&}eFTaAn?*D)Ag;JDSu
zYSke|<v<<3>|ILF6Po7fpygVd-%a3}p5DJpa;y~bAmTpK0sz=3pgq)RfZ&yrb;a*<
z?mB~rIZ?f1pGU#bPWHF@oZ=629oii%I6w66t#CDha?VU7Zwf^dXdi%!sR79!{N%2t
zK<K92tz9<d!>Aqy1<!2SG-<F0Msia~{XJ2@BmgCc#mV?png6k{*vsnkdm4)bQyAkQ
zOpOSP1Q9mbsS}i-d*acirqU@EBM;O_4s^u($mr;H4%_l&&SuL18PhX=B+7-MHS&K@
zvDpIK>KCC7a<+FGh(y05T3NT|3G8~}zk{ia#4&?UR23kK-$wyj1jG{<z7{lkLpNq`
zfMTgT-QYY7=-fO-uAi(Lya&2^MSw*%0}auT5vF{~J<alP_;m}+%{u|9ZOpurFi2dT
zY|n#4;0M{<bQAYPjZPLUfQ8%dwanw~22u4M1tCG{;L7)6<nz-BzvPII_HtbTh%N$8
zDOWU9MmJO@_zkYZ;4C0F=0pG%Ku;GQYUj?Kt54L-2QwM#)-aZ;QlTiD@4)5!Kw6G)
zC2)|7<n|e)1v$)r#t%}7Mnlo6&zzg?EHDi5`=l#zRvFTif)w$tcSQt3on!*7dw-Je
zdzyh_n+9K2Jva@5z-lIaL5nB>8QHaKLgD`Q{uxD+w{IWKl@<(a!S#WLNisf!J{V$f
zN-K9YC6#Tz3MHbp9>m^JIkEwnls=73VsfpS5TIh<qN}&*jTh6Hi_uc5Ypiq+fYS4x
z@`RXreoylAd-qhv`Q5eF?NQ`7UL@NzNbokOwEZYtdzc-vWC{yJ`@CyrH67pm#|2o|
zg(SxZTE|(+9E2bl!u+D@?DLqPAi+`b30BsH55|;hHi_)DY-{>LL&H`=a#+y%@$&aE
zDwNvj=eVZ{Yo3)TV<jlokd=ObS3r~|$t*AX&LTN7y}Zl=o=7ojvYGh&Efo1~(O@I7
za=UAmi~LGhFJa|GWH)6>8*wo2gB^|=r#e3LkgbM7|34)ZEPURLZG-T71P6CD_w(qz
z^j1{6hWhy|ojtpwa6zX|$-zTlDTueWq?#x};}h{V!9Qj4bf;b-hJk!0Tm;?lXG#3{
z*Z&^v4A{RH#vUOR_AO3FM@HBy+9dXyOLKVJ9bsCqrraq1y^rm5MNx=`9d=%jjUXap
zdGB)Q1A~IFeX<8sb)-f1M7$(YLWzYFB;$r(PpvsHkUAJw-(umYj&u^MPS}KZ2G;)*
z(+|EZJRgpOC;-k9MIWkH{I)pZ6bA<nAKv0=D3Ni<G6jA>%Z99=mk3*mBL8YS<Y;vH
ztlV<t74PzT7ho8{4d8qK?_)^qK&ga(0-AxrIE60VytUcQ-Fx=jFQ^iKvSU#tr@Uu{
zE-IY@Z3O{n(CHnm;feG(I8BlWaTKID(X!329}sOw{Q;R5<ndZ0FM=V#hZ`HWfgE!<
zBxIP=x|h=l5=AQ@d$bxI)jF1k&YKRxz>7{C+zZythfK;TM{;kyWmqX?NIG<e(AORs
ztp1OkE=ZKXHDtYwB*3-Zp4QX*Z#C(+zyUIC_?0X2G;@e;TDQ({`Epe`Y0td#3nq#3
z*tX9mkR>8e8Wgm9gcW0*%2zOsr3hRpQ)Dusp%izANz5Hr!$3S*V*S0ub2sXVGd+F7
zhGC+yPWXGmz11f~=<dm+n!B1FCE;R_FG6#GcWHFEtd2_~XpLM<98qBC;xb-b4W7Ch
zB4i!f-y*vb)mcN1^6lD`O7%g5rk9P9R7KTX<aCA%2D)!?7Wz8IdDIkRH{@#Wg4`wQ
zuPD^|i?o!3PRd*MQ`}2GqlIu_sXpO;_$D3CJoN<-i+dc=3QBE|?z+wZ7lL0YI8ZRW
z01GS+w-gNesn{%>Po{_0z@(5s`vj{$;!46urfe#9A2OFqdC#-CrQ@z2V%xpqx*Qu^
z2Q~{kKoA09HE{=t+=-kC154B(D~Y-CDmpfDf`h+u<!~p-qo!0fYm<HxM8Lu#GR0uQ
z`F_duA^MjTxB)a_t)(Sb-ri4SIegXaUexra5#{5>?n+E?#ojv_3mkCRch-zj$7&F%
z!!QwrQ`E`x&Sx7?_!YLB*>D5@Ao<FBa6`}$MbR5`jnB%+QaldnKoA6pOz+|Ogrcqp
zhEkiVV?-=0f)iq4adlVz(`+QxcV=M+Svg&QpH*|_bi(>*ldX{8w?wP-c=`Pa7#A$F
z>}+UQRGD>c*>I^eEuFilJ_@%X+zlQtj)4=x-HF;Hz{MUBbW>;iUw0#-hJW8L+aYo7
zQz=(GfANA&AR5O=c>6gmmzd2dR)Fl1SE8%niH+ytp=*PF?*SCML(P1XFh_#fDNw!t
zV}UJp6txrdcoA^1YLbAfUql#!8LdPc%?CQ+M8z6=kTg}eZ~7dR#4Tvrat?%=8o1Tx
zHdDx5Zr$p|xgf93q|w=r@TBbRjV@wzIW1ZTF(FpM5c&Hn{r7-%n>YIsL*p-%W!nj+
zGtm<{K-}!JeQDSgtUn9|T$44Ug7XHpJhyAh#a}6-)h6bJCp58kq5%XLR{$>N$xcNy
z4+~1`5As`)<TDannsvG{iL-b_X6%aNgX_q6zzGhMx&c{<P9J)UicIAclxPrvL~Mxx
z5`8(SGK%j*P8c_voSYD}r?`EsleMYZWMs9^!vm9g2;5QNDgrDebQV{g7Z+S5JPm1F
zvFbyyG0Ly##zKupn~T=ckVf46{o`KW8GvPo6Z%0JP{H(%-SNGzD2~5JUCEfyl!?Uz
zR5Ta1Zn^n4V#92~r$&PMqE>AL4FuuI14NufOjAX>xXDA9W^}!m-P)lIOQGdv_}S$J
znP`ZpKtn@NRz!ao;xKt@7bpflu_1#$d+uD@Rql<W5w7xrEG_s!v>mBTa+^TKq9x4m
zEAhBL#eeVK(BPs-nl*7TL&dB@E5a1`6KQzSE%fm6YWoBlkZnA>-x0itBvX-HkcZR^
zrUiT_-7T2tOcEIA%0@8BVppkIVY(=!i6B{kw<M8GP<e|JaxF3#EV~D%hJ=E@`A~5H
zvfE(nh53bcVKpmq@rUf`eWsFcOl+)LBpBmClmsMUrz9e=%Y&YD77vXYX$h?WBx^KW
zD<Spkv`%>qkPl2Z@#RsyilrY|FiXTTA{*f&Fd_^UXL<U@jl**@B$un!kGnP1a*wLY
z#)WlcJDD{3?nP`uQeIjSH(TsfzaDyI*>qGP?`TBlBehndhZ~z4jE8er5r>=%hE85-
z0A|pJZfz0F6C(=rMFf|uK#_~Y*q5l0l7`9j(&@P)H2o=EVz?4v&F%6Cv!GY@1!5Uv
z|8_FW``X6?q$Ud**Uw{T!ti4v;(*8@aw^dcE6$V{L?PM}0lEHPkfr?dkAV;G`Cs|G
zDoiKX1ZDwSB|=$<uH2q`n){8W&}+ne)P16KQ*d*8UQ0{l9^^OP)1xO~L{ce1n-&-5
zu_OnPUAHmdY?w^heOhAvL>VO>YxP1FEL{pwb})c#8kU-zhV5B>Vw(B|&Xpi=A6nJ?
zZ!W#qw>^mJ1xpnrYXi5M_%l{SsvIL+#Of2XB)94+6beJ7yn_$5j{UnHYl6LCSYO_G
zZo79o7*F7!V2@aPE7WC_*cIr#Q`r^xkcW>Sm!_!2N!Ux#@$`-l9h8-oEqOv7EmHIu
zNy<t@9)kJBggHOLw*Gp0$4l3WO^%o-9-Qs`)I{^22Qf4fh|obqMkGP>HU2pYH~SxZ
z=p`KB#fucei6sd|Z$`mXZFrp8&=Dc(ruJMjE`t%}0g)>TopnKp%r%yrw?@*}c2jAk
zp@2FGqPe)|jCSpW`$s;-Ycs_M;Yn)`N_Yx~*GhVsL<=(&rgP`caei7InRQ-L)zQ(>
zW>}3dRHTdE7~KsUq8vJFNsMY^7ZI&F*(`LYfP7_DDgvLvZ8y-Ca8HCo9L=m#<GK((
z^~xcxaSW)NxZkdhDm3{q+$&6rNT01V3XTcZ<+8ZeSkb%GEc*t|(uO?^9q~U4I!;(I
zSYFkGDBqCDc-^r~1B4WXg)+KInX@bct~h%5O-|}|fHEN2XXT#_l`%>4G_t=FRtQ)h
zd9-JA)527sd$h&D8tf5tY0(j35p$wcO|nP%AYq1dRD1tpg4aL>py$Yxs?M<ZzB4me
zw4+SHU$f$=VSoQE%gs!qTWlY@<2ISq*PYzi-z!`rTdTv{kuTOK7-<}f8+NRsV`84f
z9^cdveu$d8{sI`#qt?oDOa5TiAJCN*X{R7aB3eJ;SC)>jB3z&8BU$bOLrY(cHgh7v
zZE50#*vkS(aDKP~f3Aq1k4Oh)A#Jo_l*1w5T_rH@WvOlkv@3bz?W`FyH`cWVA+e#~
z>GwZN!uwwd0@QL1iJ((*k;7P#b)w@-glk-ORTvIfb#T69_uUQ+Zn8XfX%c!M#)O}Y
zMDSwxqm*!9q(oLUP1A=n-0ZrXAo1aSxxFpeVzzO5kpov2$Ulq*Kavq_;1_=<Wk(!W
zoqJiqPREzk)pgCbov$9*e)Y(1>xV{t6+3rs-RejzMV>wF66QxGL~oBWH5;~K@QQk+
zMA?ePla*T;wr#tnQ{N6=1BZJcDX``$w2jBL3U0xDjQ5!I?dVZIHgnsAj9Y~<^`}in
zRqiavO^M-b_UhgHll~D!BP7+5N@tE*<k%rHOSzo!Gr~-HczA4ln{<TRLSU4RV|{>k
z3Z^2=YxWHf55JmuK=P9%Le7TJ|HJ3cf}Kn*p`f7f+pYi2#W)Hw#7|%VB)h@5_0^j<
z7ZzA`Q1{{ov)2s>9&+<;PW_rkr-LI5q~&}0ojKD7EtOrw9Sx+3D{ngb^VA?np1)Ag
z7i6%v1qE@ju_YA_-r~u3%}pW~A3bwso}Jxb3;<Bp>ye8bZ*cUWf&W9a^*UigcR^<)
zIzPH=P-pR)iq0cu+U(QZvMFRa|Ewt~NFD>ZxnJMDd!Q!%giX7ZOT-D8%9e8f)_|W)
zK^Se{8h*63W9<qG{~`UAK2$_39=g42Fm<H5-@nDe-5^>yaPZ)d@81RBE8GQ;$KmN}
zJH03t^y<}1OI!PEg_h)NdzEM%%~CcudXu&9(EBf6GIgF#Fy66aEH|ew-7_$!U}nc%
zyCx<M@#iF8I&>P^4ms=5MLwI96vT0S)OfLED<#RKq$J^TK{xKA7Es4@D3MMC7}Tf`
zjfKO@SY~GC?0iY`Ax4fe;?>mygCqJ=Bq;jwWjcjZ;)3bXraD-)hQ`KZeO=x&B7er{
z3yPs=cJ2ks;4jFvXDNb=962&_PnUx9r%wYWH54hx_0&7S+?bO{w{N4iP5)4{fCORc
zi%!`sTeqfXWnE6r=qh>mhwC#~CI6^lIs=1;j5k8#B}m^{ler*>X`c!U3#*WyDGp)(
z84DNr{X?I@Y@!jm^+cVNyEDROY9eW_XuGbfuh&6%!>z8UJrduKz+VJ&80w2HL`%HH
zYFpdff+AVT5!>>(<%k6aKesV;Jcok#c4wKEucH>Wc<95&kF8N2V#F~-8NOD9jD>|o
z1C)IjgnPlc<E=C0Y4Q+Y6T&PX;B~@cKF;7n%XRBs|M!e+d$E}x-sr`TSaPyuon^AS
z0F;QtY-k{jc-kCXE?TnwaP>XI3ga`<)4x|$tpR?a)!2s>5d81IW(~w}4A9g(ghyw^
z&ly~8xmycb-e8l>`B96+@r~FvGitf|_~WzA_rnk)r$-ki9_0=)8H*OZsF0r{W1y+2
zDY6hb)lptdnu%j{D0Jexj{=8{0}D^3mrx|Ds7`0Rd2{o>?Md(Z=Sa3JoVv^3@Y{(1
zw`|{jp1{w{%nU)Po4kSj;IU(aXz2>1a~fBB4;{-J4n?g2y+32HvrZd?5ZM%$l^qBU
zUheEWOk8yJyK}@1@!yc2HH&{TDSDLnWCw>Zh9Fga8|W_Ea+?n(l$}sjQ4wN^DK6DV
z8%N#i{q)tV<Et3M>E-1mS~T1meoQ9{d`(XzoN1itF={z~fn-<6FA<Al<QH=EV8^K^
zN^+u3*w2&4mKDrAo6UgXR-zNIslKpzE3SkvQM?IjnK5%eYdQDv#9*)|!j|{N#hxJ{
zX3o9?y$l-~8g~4+*L^374RoSr<mZP#2yy%O>mYK;7X?LsY?#K7>7C6v=O&4^7vREF
zG<%n<r^|y#<=9_;-Nky%Ao+lMyp}L^$BrGA_V(`z)WsSn&Z)^#ju0ItL`{V9K%XBz
zeHvJMFk-s)=+SG1xJrFJlhK|at*JWJp|2QpQSnT&oL+Xv2Ix#Zp{1j9?2*Zq=~{Yv
z8YoqAQyC`ogw=oe;DKmJLLNh#ivp|R)^y1Nx0M3HfkX8E{rkyueA4NBb=8HEI25OL
zP@(tVw=V)YoHqkIN)}Sh@Mv(vZ0_vaA@r}H^>8;ML(KjlKb%K?&)CYYgXW$7NZzoP
z+^>q^B;c6N&Pn11-1*ovtK|}dE1;Clcmivp{|<e#i>Vbk{<(riDkkC)K_^dsO9&2p
zsFz(Z!N6cAJS-V``OnNMT1)wlYQs;8O^n2RAe5PCCGlX;NQ%qDGqWV$(lu~l<N&ng
zp*v8s*c%_O^X}cdg5Jj^cSq6at<D-B3yDv2{u`Am-c&|G!IrR_-d84}Poj?Y$XLPP
zZX1OOX<>$*v2k%<E9OhqYRVYyu1e3I$BrMLhAJgvq3=9;^mlSbiw`k+YfS=#isF>0
zoNsFJ2?_W9`RB?LYc!igTzodW1m?^8$(}z)w;0z_13ktCnk$*v9$Ryvr1R22Iz-UP
zWt}v=)cHyx*TU5bV*o>9E6IkC7K5LXP8@~QmRv<eMNmk{pS4wt*pHvXTrkrR!v&vh
zWc}KGuBL=@L@(Ei#hXv@h3SLP?6Q&Cw)Xai;^X5DA0}`<IB9gp0Lh_^(?MST{=0X#
zpI9$iw1_FdFMH-=K}L>@(aXeq*dSiz;%{JUYb(YLdj3=s?LYuwm2ng|?}SzCfmqdB
zh@0)NXC#>@X>K|c8W?zwX~b3c<vVmJ66uFrk9tbtVW@>x#xdfpsG{OZ?MZ~0fMH);
znG&fk44W6G>q@^xLv`+jt}l5OzQfhEi;~hE&?1p*F@La#!~rbO1K$Q<>0uOp*y|Rz
zT%U=tO1y4m$3uz{qQOr9Gn`LyZmh2O;3wsC>%^qp=)1ndjyd`CiZYEFx9zi#Y9J<k
z|MBC1LVxbk+iL2bk$?Z4{_tU>`NM>$ar*k!2)aItqvIv<ua(KyKU^7jqdRl@hH2dl
zE?x4LafCjY|7+uFn07W36}03`U|1SuZ$5GLrZO+Il2Se*a@;6d9_40TVQgm*tx%LI
zn4vX2D(5Gv86iwx)ho7ut`89R%r)gpR&a)B+0k1SPgMGgiKQ=4Ds#<x^5md_YmKFi
z%`2O?0wzFoz$L(dFj(6B7r{Q=ZZYpl%S%cgAcdCqIgT8h?v2?aD~{h=?*m_AePCAB
z<Hs2V1?H7;kW`K#xZ*T$Rwwm`CfNfKjKq3m00bWm{&46spFu6ZgPhzNt_GCHgW#HC
zR4UBBos6m#WQnGxrsozWnDXA!E8@ILmd!eU6my8ilxSY<)~y>7HIB~9KBysZ_yokK
zN+S$ay~hZT`9?;r3&~7Hrx^?P&7cNlGL@IRZGe<6n<XBiVYUkvf5r@%)cS%-F24P*
z_(4O4_>%yo%v?Bc-aLd4*FF3AiU+M&G7^iX>R3ib4z~I%<Xlh6I#;e-DL<{^Gb}o9
zucv4JsZ%NM;vCj)%{@Hed-aM3s^eANwr_uwvnBt{8z-0NW`4=br#D;lxT*5VrQ`Cn
z?}d*`mz+^ual=XfVM2cN%6vMugf64C^VF>)1`YD9Ol`L4?hL*T{{(@eXJJp@9RK(x
z=QL%O)@|D4AE>9v1O2}-E-}I34-@EMFmK)A*}z)L*f#S=5eqfgM!9-*9cKFe!-wZK
zZ;?f|LFUorOioNn%}K*EnEaiByy4Q}D)cuh)WLez(l9PZt{B*E6$;tcR%{B{(r1#}
zDb7Gi$~+P=xmjiFbjA6ua(e~cP<l|ivk?*e=}zVYoVbrgA3vTOV>)AopP{>gFr@gd
z>UY*|Zf?>6ji`}ao8^*x<3@tZI}lO~*T$t0FBOH$zi>VW73{dTZTIV2unkyiW}wEy
zEOGn&sk<m{VPLhji8IFnVS##Rm)J~`xZ0`-oB@a{qPb?)`PDc9={ZduvQ_D8`AzUn
zZ*{JL9(p07KhX2`F%qEzT+ftDAs^;>G*~zgYNFF1r7s@(nmQEn+zm9D-GB0={DS{I
z-dc2&q|8jlbkN(Y>)H7z#L06AOjpKX4SG110l>{<7+=Vx29);YPRcDE8YoI(E?a!P
zyuLFyF>(3!Eh*Nm2#wdw{~+WMz|X(lYebwfn0QlfP_Hgsy1Zs2>AN+>3@Z(0POV{=
zH@UN2+`;dqmz>;H^V*h9$8U31-oJORsI)X6JBAI#|HX_s^6s?PZ-yfknm+W+_fIyM
zdF-@`^Yr2Kfgql{y7%l+Ti(kxc}olyotu<WTPq@bgmOZx2MrhyOFWhm*C~2pvc;z!
zD}Vp)`7_a-8qlc;I~P`tt_PxdLL8Ofu)J{Y)^c7kdc{KNwr#aoNYAFz!1cG57;WY9
zNDs_~KA5q$T}S&NH;x}UatFrP-?h%q($fW5ydW&M{>{^Md&on(C@Ma)c?+B@F%I^|
z$2N@U>YVFV@pkULf`pD{_IF>rI0hesBX%n#rPImg#Ds)_fC{ihXMgYcb9wG+1URFc
z<a=DTjQo_RZtPQ%GL2sGQFbsSH&bc~G>thD-qnvY{bod;`sBjKD>=K5gqyc*Y!p+H
zo2;LW&HaUfdBM-P-XfrQjJKb)_Cm=5PN1NuwQAK0{*?zq4g4V+iGmT)RR)5C^T<Et
z<>d={|BXXyrP8!^i65jWVtwc?{D08&&OLg#%>MH5(WCT7kM?1UP$p4RQ<EOp#;Zi`
zpMU-#IE>G^8yjBvkQB)@r=!o@Q<#2W<=KF}B#YEH1h##4VI<&k{{aKq_Usu_s#f~-
zYh=i!%YXk3WA1s_!;)?|>ck1-q83JCQJE7t-?<!XAU#hGVK=kN(WtxX*ln$K$!s0{
zx2pX|rfjZ8a?9oE8D0&SzDX_Q)Tt~nDSFI}sQH6;FvP2qOUy1tGKzy`a2<Tc^QZzg
z@FH`&E8`9yR%8$IdnFJ2?XdkVt1Mc_Hr<{_bDmj38G~}POP1|%r>8So!>4_x9$o<f
z0e8C2T$<SkL&g*QRd3L7{^)O_+$$$^VL}&$`>NeMTYURnT)BTeFwGhQDwovRKKFO)
zXTGz4J0kVO(f|W-wT4z;>3>r*t2(+Eut;1(`)y^cDGjXqP4~+QnAP*>2C$Xr4{q3%
z{0Z_`ZcX1j`iAMMRr8=6pzB9~V!dX~%fBD}vRTpOwyJkx_`}f(3IX|Zub7AQ-7s%!
z`J1Q1IeT~R+!6iB8#Zh(ADzM<!NLpR=-R)hw5zr?Ax3Dw{c>t*s^OLZHG>|Y2q|Zx
zCI4ApSN9#qcsAQE_S&_(ct(QFp1U^31I?ONMsBFxeCtU5%q-YsKWG&zIvx;k@L)=G
zT;&pB;XZ$RWb)d~xM~20?N-(E*>Zg*o+2+&>eFW)zKab0Xi#?%wI;Rc+P!-`ZL4_R
zuZ5}2BL)ifUS9eR`7YT#%tP|FGA}!CtphPJBRh5*{w$=Gf0u3laRIKy6JjPU%9_L!
zYtt@wd<I;S5~(n(P$;s3JSQ*>5`%d752zl9m|GRk%5?g5OTMatHYO+QQ~J%V`6nkQ
z!Mw}m+_^)sB!I%I<F*dKTil$bYMdXr)5YZ{l%JA0AL@VqR#mR8KJ?__L+!egnFmx6
z26<M$yytjTHD%?ygp*Yo=Q$>8zB}h~I80|QGaeLZY;0`SdjnVZS>A|;jJl5J%z3(*
z^6wj>>^g<IGJ0fV;NVf$WD8%t>c-}zB$bzy$=$T!T;WryKRDq4>{cfIJZWC<(sSYO
zj)~o!AGh|7Dzx^wKGthkWwx;JwTa;|Ee5cC`}U<v)NsDnhs~@e-Yz}ZTSdk0S*7V?
zqvIQf>2zvLffc0zl7tIY54`Mfenl;;t(Wy>b{rVyj;FnT9n3~2>Q^u_UWcg|rX8+c
znQ`UGnMtWRU(Sc0Nyv4Z>sj<L0qk!SGaSpx%k$00W`}llCR!nTDtsO%>;2(+XV=8>
zS4vin=-3($rjO*-jA4%xr*_|og6Fc^yT$>#yyv2Pt8nm28VIEivQ99_Sb^*WGK>#%
zV{M-F`26(DJR;(_>(?*bzI_@6No~gxQqDPknu*jJsb{($2@ZZp@Vnz%<i$7U3xbay
zzfZ^bTuaNo0+w1lRM0(U+U;{aZ5oB#vo}=7tDG|c4QdDi8Ke44cx&ab@T0}i;NW!*
zt<@(@N?mf#e0IVtvlX51FP&ElMUX_!(7N(<{KX~P|A7N2_is;^6l=f)2S>+^+sa5m
z$@0&gn9N54%v^rbcFNA3<CvV1dfs}oa&4?thJ%BHmfh0?G3e`{!Nxo-CRykA(w{Wx
z47G=ngXNz;OZu=pqwHelILrwibuTPMdGnwON=tn1%WnoYC8fKWt3sD-)uGK#OLs?Q
zIyt_T_1;T3_U>opc_JsnnO9giSWPPiK9HMJ)<J#B(hZT)wY9YuMSoCD;w<3+L@%m{
z1t7M$b2|-A(j<L;p%oa<ru!ly*S<2>=1pE6hWarJDub*xS#QbK)P|oOxp9_k!!7QS
zq@z`NpHrx4e-WftpeaIm)rq^ty3^0x>55F35B5t}{tr#(0oU{Xw(+clB0Hl9#|#Of
zM9RtP*krXx8IhzyWmd|_9v!kGBT3SvNRm;KPLfK?YKs&yp7+iFd0t-s|9PIrso(GW
z{e13mUDtixi%j7}A!>1~+zRSoWMm}TA0dI(!22#cVEG3$h@5zQe)>|{+;eePq6g6R
zJ?{0pgS@A1dS<$no!yQFA-yRG88vX}GZc4El7b8+v}n;neBJ7{7%fWTq{7u60}YYd
z2{+?M2u8l(G59ylGD9HRs2R}oY|jb)A2MZdVE&(8vS^UzKW3iMcM}pC=<Dk%fQ<Ch
z(z4I0hE_wU!^%|B-HVqmiwX<VJDq8sUfe{HNBnrOtP;>!`ly|$sp%JB_zegbzEV9h
z-gSmb>Sp2}+|b~d)}FTJCSZz<kN@e3&t^o-wrz-<CqiEw(Z{10VH+LFwNG~%wJNBn
z$m5qdn$Zat(%E(I+t-JsV}dY3f{O3PsI{+^5Df6Whm0IGYLZv@@1MGCgaC_y467|0
zwrp5u=ArCo4y^14-j0&2;Qjl)_4j$w?}x43{MSA7cO7^VS_20Hzo8hqXr4Ua@MKO6
zMZLnEw6s<<p)SeDh-7x2CRbTwwAz#@k6lg~`><6}^r*p^quEzaCE?;aam4Ggvd2;{
zfMmisi9G>G%XW$hZG@yjQv)8cw#a#x${*lF+f#s(8JA}BA<buI+`D(~rg59IQ)s9W
z432_qfLK6#S5RKA<@SIV^}ZcjbPoL2u5L8iVe{bZyLXK!Z`pEkpZ_^s%uyiaD2&w8
zE2^yY^YZ%pX0h;jn70e(0^CzjQj%Tiq!ebCrUCUe5zs}_ch!5F-iDzTacV;rH>N&^
zHx8%pp}4t^bqykm-g1P2V@m&r7msh%V`$0Q$2V^HV<Go!_1s*n%-4$O8}t26Vl71Z
z)Q_8f)ME)>b6tN#Hg$`NPCkFOVs3Ee%<0o%N-rfONAlX~(sG||^2cgbIDHO5;hQ&J
zMrW4Qjq-f|aFMzB1~5)hb_z60F*S1JNFGIQ-kINEmyd=?f1cj*OAij&1Vv{ZI&|pi
zO;-mh7Vsu3)>)&u@MK@_ZrT}sRMnTvOtR`m#YtAptA9DIS##&rD~;%9A8+H>YH69T
zyBAdilAIkRRt13PoV>-lL`M4DogkvEyL6et@DXpnB3@CmQW1Mz8&t5^7wjh=WJEdS
z|AC8DjcR^F>2E<_jdBOjC3*)34*(W-=cq!`Uq>s91mq5EwD1<R_KHykZTVPNC%k;z
z{7hPU`i!F~TK)Q=xX3+S%&X-{8i&knq^hR&;_B-upjX1?%e+HE!d6^KwXN=KskPjq
z>e|}P%}rx{!Pd%_#=k?zy6b=76)GY1*lDqH%3}2V`?YUWWKmpLDB@pwTU&Y%lpF`?
z31EO9A>fwhEJkKPFHTOMDqEU?-L&f6>>tgmYu2uP7U43Gk9Ors%IRW2-iK*vP4B#X
z`7ii`+M+r0EiKdNG-*}30nRT$5VBjtV8Labq*VYrwuoWKI$)^Fy9!LE<oIqQZUD%X
zMt``|NOXrwiv6-HW>#+V8V7iCjl2d3j~ob$_oeyY&kx)y_p!lsq&GVK2@{fArdGB{
z&o(kPc64>^GkWw1R-qo)w@*|)v=u^G(i}pMG5Ds_Y2fc)*9ExS`C$s(;r0PTLthQf
z;0rS6x;=UK-o19{LO{pa$#f4b=K1qM(W}5Vtyz<aRSGdUR{zgu)?ED_E3MPw)i+^+
zhWm$vminPQ*W)O5A48D%k8sdH&rN%L1pjb9Fcp`8cP=?8&$F_^-M_9yB2xY7(-K}C
z@#7d$u0jrS<)>!p<|hII{1LNA=kfA}FXWaeWWoG@$$yK+bq!~y7;c0T|EB&To}a<=
z$J^9OyfrJA>gwupCxB!G6GWGSEFvn#>}%#>e<}{Zt1iP{fxmvG9~Bmczl~I2570-e
z7A<z=#CLFXnANvk>^%@HIW%=W2%5S8a-f?Nu@vvuzdyP=UlO@uD~;T%9YzeCl(H}+
zWi<L{f!pXO;G*||gn~QdoY+lmG8~Z6-Q8VH*Iq4uQ~f}uSPD4lAiVOoC;%nE&<X{-
zoO+qT?<Gr?kSz38Da(=YxoQhu8W`A<6U$A;c-jcqed^SyBSr_^xN!*e5CW_QO3j<=
zT)eJhU=Sc|2vLk0SaeiOQSAHss8O!?>>?h;WB5KeQwPQ0cOf@XFKB*9bTC-kkY!K1
ziz5@ihlc@0IMd85>*}YAvoGhY7<v<BwD=CO;uSa}c5VP4rDDucRoCXU2h5y2pt=3R
z>fuL_Ntc3yg{~#jZ$USPER`J=|6E9d!x_cn;Wu5(ES=hAg8cro!RC1@BKZ!S?Bip`
z5@J|#ZmvI#->f^J2-bRD)N1vGk@jcKo^7~!^JX&HxMj<h?Wkzg^IlwE{x$=QI!q98
z%%%`G*LD5<vZ5zUS{k(MAi<@l$!;sOIuhK=%T1B;N<Y}A&G61D)b2-9>>ZBJ-(!-9
z%=trYZEqMFQZH}f!{^V3n>BPa`%qW+j_IP_T3Wj)0^4-!mWCJXS3tqXaC$PoM%Q;Y
zGqXL>pryJ1OoGNeeL8(H`>k5GJo0x#?sCU{%`7$VJbXCm=x(h616B_8_h-3p&fpCG
zHNNBVP5Ed}m>@g_X1kRhj&7zCb}X>eiTCdZ6%QKk<2IcfgNs}-sS()W0D6lPLb7mx
zT`}#Ux)HTL!LR<96f3TLnI~b4jlr7;G><1_g=5_};NN#l^WngK=*}HOL3yC9^p;+^
z{r$~gf{!eN%V~I_Ck%p`4NJhjQ~yh5=6I7G5Y-R=^5w`6ovjNmr;G-e1xq3x3io;?
zB%~8Nr2knszFB$(wHO@89(ZAS?u@%&+~WgLyY}bz4H_HYYiex%?g$vCZ=Zr-1e>*o
z82(ENW}Ubl_z2#OAN`K?k_fm`?_-PYss?x+-Ffn)r5L`kkHO*F_u#t6HJ?5uK7L#l
zls0Yh<i$WSkPc_lm#OL4_zcSK;TdfZA0IDqlZPT9LYO4kD@XjHrX%;el)7}yqV~CT
z@nQi$5*v!zXlP6|G&DSVzdJWA*O$q9s$(dJeVkjS*ysrebx8I2!OH4-hp_F`h-xp+
z`D1^6>4|P0d7<}(lN&E5|21^iu3gBdT2N*%gR)c1rUH(n2tls@<`(6%VlySB?m9XL
zz$#C2XGz8sC%Rw01h+jFHPb(4^RG=p4_>@@`r<`&3rY<eHnh2I>S#uhSuEZ^q}n&f
z9pbGiqJ?Z{)h=#GPYWNWG-#xVllDVqsj{gv0O{o;3D-t)zw}IM%kg`cIyi*hJPIX!
zbGgo}^D~k6{zG^8w04s}en0L^JRwEd`|aCCp*Wa!pdy-i>96rk<q7~;^yFnO;bAjC
za-1nCvC(8SDD}qj3?*gddoYua7lUETKpLJ($3Ry_dBu-<t$&c{<u(_k4|VHX2zy&(
zS`3wyG(W!JV*~0n&P+zI+2BQs7RkKPG=tBx_s&;6m$_m&2o4@>%+cl*=C;{mBD2Iq
z5n_1VgQ-*HVInOkpq94daUh7<#}}G2=Ws%@Wx$<VhaaUd`q8{a3*jCKRAP$5ZF31Q
z_Bq*m*f2^OOHTYlm<w8#o+C#_8z+;(i{!5N@4r(TKIE?qWPXKEfmwosLBV)Vw{9m*
zfTLZhujO>l_Y;v^5BA>>t{YC_3>(SBcjGNPb~FVch^ce}ivX=0F!tB?@Im7`w{QP@
z0x&A7svuak=+)9yt5VTXmgGc<Hbzt%locPr+Qnf+E(x0kwugs19o6a~01D~Y)zy_+
zT0Rc3X(>xCuKSDw-Z!{FB~TS#?-%uI<1VJY@;q+%`ZmLVn35fN1FUQMn>I*tkl0OQ
z%fvf|X1m;`1HSpG4Q1*>Q|oImBSY7{pDxPVnV0@M?Aztp{qmQOQff`Fm`AVK%JrvD
zb9P(Wvg=4y%KC-z9W*yKr}4q=Ybp&YX(Bo%#`%jHFOoJ^ndi#LNir%Z_>RFOe2_C3
z;6zFXw?!>}=NlzzvF$l2@EbGn&fWqC2q`e4ys;CsI^gekUbCN{UvzA&4t0h`c<dsd
zllNhB^Ks@EsGDNJr80LC)eOg0@-?8Te1W_wH@nWmmZ{3O8-+{=%S>y6XpQJ7JD&U<
z7_U>{z{K2#tmGi6@R1RN$bcB-%D8WJRiQa<T=A0<J}L0yfBkf$dlqv#(VU*2S^Yk6
zugj-QpOc0M{O?*ZWuA*+fv83x(`V9}vBPfMq$GfnoDADq2;EqZl|UR4%n?RJxwJN_
znchfI`F+h0sx2vR5ih)W@j`~qBoZRvCCV-ZPaSb(>YpUm)*qCPcGk<Q31kuzk+aOq
zT=P5<K7IVS?x87Fd6!M^qFMPN4pkv@i;?q?1VmW@xPQH1=DMBe2#K2MD<Y7@l=N1t
z8V*knGtrRl8)`No*11f?^U{_8?Z0IkYp*|giFMzq7E{mThFJ`!DO)ih^b6={|9UuE
zQua-V=`iH?V!4ZCP{c3Hn0*JyyiY?2LrLMXE-flNJn`W}Ur;u*R`*th#+j^G(Vx3o
zj4Ya^H=gJKR^RH8-j<YeV+wpTBXQk(_S_DFfy7Js+kvOg`gD0|>AwNfLuvyBHNACx
z34j%jUCNuMA7uaQZ_9dCva=TKj-+KW*7DV>ZSkEu@U)$gkA~5Ll;)1-8+paW8E{Nv
zZETk=edB!4-^|SR)w0_u%%?@#XWylbQyaZr8%ACVPbP#P8XR~(Ov*EmMA}YHB){r!
z*B$?bkOvWCY%-oVPYn&94P6-po{SrEF{MPM2j<qr`Tpnkno<KmoCAvT@FewjQB_qM
zs;cvOR|tkTEf@?EwL28xF@JxWr^irmARM;~&j#ZTXKiPf3`gk0YsXV=7#NpS%`j7<
z-^;(&=-h&kpmXOuAs=U<%V&EaH6sT(!YzNr;mLmkm$L^VC^E7Gw-iyt%gp^(|C&np
z1U%(Oaf+5KUFsDY>Tpx9EaHUN+oJznY-{UHUB~GkQQpSMU|=Xa-t{<hBlY!*D=Iw9
zR&5m?a?tA4t7*4Iwg`*F;(6b3w@;Y$(GHzzANdCEgG4-&l?wzQX_AtYlTSa7H1b(E
zV*LjznR27vm>0A5FN#Mk^VoJsRG=y{D!Vk{L<Llp)(0N>b<{IR)hzZV@X5?Yi>^$u
zwWwD&2~5Yc^uogNB~?{>j~+D(c2cKrW#`&Z?A_R9OE)Ecq1F%fY1Y|CoOtifYZF&I
zY@;Wi2PCXiqZxhO?bny4WPhRm$m`cuWcC*=63ShX*3i?>n?K+A(HCGR(Wv150D2;O
zJ7_xS-$l3<a{2O3;0z(yzyyI=M4t%Yz5Q)@YDx+!g?E%$OKFuPB1v(UWe|Dg%5FYq
zI5JXx9L-#8RrJ7J9Z3C~ej-`Nfp0^GBO77Fm-E!-pQG&3PG$x?e(*r4OP4MfE2iRg
z)(4Tq>Wvz70>tu3Q>MtsjUDHXbX4+$e&%n~2D119QaiSA17SWJPE^nZ#IEP#<1JiQ
zfh|$`3bumM*SBob;N|JLxuzgB<cvWyAg2)vVUofjqL&z5^VC>$n<L_aP8tFJ<xy-W
zP@xfQj2A8RU%pJ|6LDOIhUNx$BGgJfHF9L`+H#}ZDGTOK$SXbSI(F<>8EB$dVu+UM
zmt7$Dg+)bofzyFwD+_#3Yvc}^|KYFkD+fL_jb6K%uaa3hiX!Sy0`&|D8OSFIxtxz#
zLmKIM4T2`i+ney+-GZ8hy82E0M4#2@64Ur7@*$X;d_zUtYv8~=?46Qf_fG_Zp?@@7
zIdk%?l)!>$cm46}<>*PJO@Y3A1X8D>t4EF=O=c_)hzi}E$kZ5;$-G*1mIO5VM@`0P
z<|etJ=5<5$5bxj*ejJ~k42EPe!Qw?hNFvc#5%Djr_}oy7RHV|v<~-@Scl6r4YT#m-
zMgXsV8c?=^h$*o^w2=@uA8RtE<*zxzbCgAC%pIJ<D)t#Nf+!yn-@lCe9)Or<?^EK%
z8G#D|dV>05e&We!j0{>#K7aSW|Jq?cO4lqBLv(*NHS46&rZ{{^`?WCO!lg^TxX*lK
zqU@l*|0HrM2-v)N^Q`RcQ|Jw)92iL%kJPoI`csqZ>t^3LPSFk$^Iuk$69Jb9{Brc$
zmyb-1gBx>tnI=-ihu51rbKRGYs+yH1B+&2taOrm_WceTtC8MdcWST*q97Tb64G`Bv
z8N&Kv^{x?wKEJ0Kw@)jjr-O|T3Y!jQ$RQ4Xw<vG%^T&@{P~yaG4!wBMiY0Mk?(1d5
zv_@xSPk4m9@pfC;Tr+kmefI`jJ><c*GN@4nn`AAq7dqEZ6;b&!7c6+$uQ)a)MrPaY
z@9j2?Sjh1ZVG?2nmq+WR4t5<daNrx-ZiVQZH-|E@1pI56dIEh41qp#?v9)#j<$>-D
zD^NJtF;PMRQ&drr%1Fi?8Y{*Qd+F;R=f3*0k_sVn3RK5e-sUKeZ|j1?C^+ZGyN?3d
zWc*?)eHOqGEo^f#fqh==VSWHNnt>uc$+_gaXzOxfvQXDF^<soE^~6Zo_e0%w>(di`
z@IUz#@aHfe8-mYXW#}UQ>)S5B^DZ;<A3rVz$YV;&kEDqdJDXODSBe5~E15Ek)Wq!D
zf}{iR8*Wu`6ZpNH0)~x`Fj*ToyMN5Ok~V}(d1r*~k6=^J&~IKnl@S1Pm^4so6oyl!
z;)k(`%y#=)y28|Wz))=AM>?0b<JHlD4#i)E#HinElV+t09~;X11YIoSpb}b%s8Kia
zuUxv+kD}^4O$}KLWAMEI3?wZSkLNOK&$JeJ<-L;kY}>bQ-jq}v%?`C38@G8Z$hEZn
zTz<L*A#CU!MEHKQkAvs3(1HRh`VCs=wN)N%d3Dn8u+ch<YDx+>(WUo49MWG~?-@cd
zp^TY+=Fx;f4^io%KwfrXPe0ndbxp~c^g)cHMqgj1-K*ER#AHZnv1*`!1aD136q78m
z7zo3;{Xsr9foBIm3KK}CJkhZ~YT9mZ+Uj}01eA;Pn=+C^$=s|(3#pxYj2_*UdI$l}
z2mD2#tvTesg}2DY;(ydGq%vBUx8XnfCHqyYHdMvc$y2knwWTS-SA1Jk)ChjzcO03{
zQJBI!J253GmFli{9O3VvVDhd{pYG0FgzPawC6qDH)Njw8%!*=YYs&-h*?`pd;W{qv
z3JePJ#-Kq4`v_YIx3Q)5+_FU>udXdV%@EE)lQJqW^Jx1ZnV5)J@DCL}uTVX*V1XQn
zW-2O4d3gbn2fL0ka(4dw&#BGmvW@!>&3N`K+Wo5zL4ith2m;QzAtyamvs`KWfuQ97
z#Xi6x4soA5Ffh^s>)#`Do863!{fGeJw@dNLm5-Utlavc_8lV~{_3!xPu<-DVf5zwb
z_4hw`&=}}u&0qf6Q4W@Qw@DZBBEg~MF;l`w?S?ukd%~i1zS8!Nc(s1JEJz@DJG+i_
z9ZjajIdp6_M;tyYRPrI%sotv5yc<G(l1#l{zup7dNu&*s8&kW8diQ-xz7dMxk`B%0
zFI-3^lvh|>Q<--hU4=}Jl8yX%U7Ws+A(n2ROTBL29t+f_a5eYcg>&cL2VCI5)1xCj
zbX>R220H~Q!=Eg*u^A7WAeWam4&D>_m?;PkIXiB(Jq#Vdiq3|QHs<<f(bta#kr2m?
z8`oG-5j{Cz0T}NTS8=oFW<m&9lL(>5M4KTWquFih$MoC4WY~7>-P;PZ5}Tgg9GtR~
zv*yivz-G{GWDnRcp4PT)+iE&2;H(Bt@5^YMH6KvX;fL$ZSL=&agaJjKns;W&qc$@g
zr%-6AA!-#?Psj-Hp~j6H=YIH_73n|x^*`<(Ur!QTIy`&{oyW8INj@$zNr<+E(swNq
za~igXROpg|k-2MXX(_W#$t7er7@4^53fjRcjxZF%?j9Z+0LgQ%Jn8T|Dh#5{inV$B
z`DM!G+5J&b0?Z&?C`J)WQ?hly!5)|t^rFn@H!Dac8(HQ~(~_8#6%{=A@7zGvoW<&U
zYAgq=mOgMw?@Ccx@$)6}P2G3w@&9W9<PwVk1AV5r6(bifDHdI=`I(rQ`DXtSbv)5W
zUnp14=h?p6{L2IN3hm>F?^+QEPu|E-FPp^1qa<y!8*~#F0wkNLuQXYA9BV#^>kmAo
zBwYmu3V}<mc1P}C0#%Vibo-Ns54WeXKB4El6PxKcuS0yvuIMhQ%a9nbfcKY58hYH_
zM~`--;==3&w~Ytj*3P4jL6XuiHd=IPCWC*`m19ovp~!0#CW*A+q55+tP*8%&@241P
zX#gIdG;r7M-8MWU0p>^=sFht4kBmT@c{F9%Ih}COL9~3NF<&2_?!$)NnDtY;xntB?
zM@QLx$UBFZR+M24o|3r9ASt%t1IT=>(10E>8fYJwW;k}W<M-=J9|Nh#8IPSbmS>rl
z#$-j+&nZ4`jA7tnq$|MBix(mvr_+eNIv7niK2ZFhWUd5NYwVI%-NvNQ0(ru!0$=ul
zaDuv^HXhi1tx=7F`B(L&yjcV)yP#;OQs%{N98QOA_2on1^;1%A+}ySx<ax!-WxPIL
z-$@tT@B4#?)pS}7l+H<6^VctSR#N-Lp0nUX;2hBS&i(s}n>YXJa`N&0`?58=;8BG6
z;2}y@OpRqFy<rFSXVgp0qo2*KEtUPS1DrDtv$j=PO}ii{%g(`J+VttF3fwtVBV|8B
zSX`Y|nf3Bz=Wg9j9k*CW=`uuDxAJ;v%9HL7O&uR|{RDDi@Jm2l(7qEVPP`obWZ$C{
z8W7+ZL!^uXFX!YeT>aB4x(R*EJ)$q|wA7j~*Qn{FUnZn$Q+%-08Qi{DW$d|7r~>YC
zTY&~Zrn(wzJOrQx<vDfz`sngbe+iwSKa~)~g-v|+>;MizaHU6V-clGoOigW4Q}_55
zBLb8sn6<WT)8;M+OLa{R*udP_nM#@qhLB_;j$`2VhEfw!rWv3Q%BTN{?4v45U!O4v
zJj0=5_CIfldI%Xx#A0xJ92H|17hU|<C^M0AK|}<h5Ul&YvSW?zuwjf*HeOURtmOUs
z^@(8%4ur2N$qdf=XBU6N)1u2X_KOTU2_6MAdvS`p#WWVV4C=Iyt*<#98gayDWIc-y
zSvhpbkOZDM;YA})(b6^`X`qv<YjRfBQTl>|z_~MK>>u~gG-5%Xt<EPZZ)rNX(LjOA
zpKkC{&AKqXuS~TuY#|0^->K^m4s4@`k%=k<=%&CFM5?%wrpsEOwx)qKhLD#^5upRu
zqoA_#RX>l9za{Cbj}p57A9y%_wc$|06s)5Hss=S}D+PE6!PgN<N;U$krz9z$lRYE*
zy*KpZ6}^b4uw~NKhpg=MHYGjcri8?emyZnWv;dkO6M@{qoJ1t1Jk><B7h(=VbcsE2
zH=^0yF3vq!6+#7*{^EtwqDk)~v}|vw6C9#uuXA2CngD~0?@X#wMENwFBH0<3Y4H5T
zi;MLmjIV|SUd>k(C$aN0TeoTR>`aptK&??J1anFS@T^fVWx(4cq{#Vs=VxA8{qmz0
z?FmHzg|2AhX_`lrs}J<lUWA@T^r>E6VKm$Th<CuJsmh3sL(td%(WfpCzpKMA7yfi;
ztlBWhm4y1j5Jk4lFH;h*$w13C=Z<@+hM@4aUb2KLVCz+#@FY&E^FveiVg&@K12Re%
zLa`0%#JoP%pkkSk{cEbv&3R0;aUUrI099P4yo-*_vtO~|^3{W>gkJrSwSP;JZ%!9M
zOlC-E5~_UAeuSsa>ve__ZjjH*>Mfjb0d>XxV7J<khLmtuu3XU`K0N8a{|@oJMGV|h
z{dV}GALe;vR@9klx17BY^t~EWT9G$1=)8)9nMg}yTkDzWYX3ERI$a(qD1s7Hsqo!g
zux71VO*k_>x8_)6|Dn?Y^pZ%IVG$8Bt3f<>UH8H$HyjCufQe9-a&V+aWMl;lj~Rxg
zoJ2-PeG&IW#!OoUc1I5-T_Yg4aHZ0M!*wh@FilA_*0-0AS9JUlL=7m8___-89x!uo
zc7Y7|@FW0SWF0X)vn{a7z2xL2z}5Wg=a=k;T>^IsM-wZ&4}<a~u8OIKkvERZ9tk42
zlp7Ll3JZVJbW=-3i(sA#A5WEPe&IgZg|aEfLj$M91tF{F+?zdf=3SP5jG|3s!3Svc
zCs3}T$ip^LP@!~wRriWCUUo8{hMQ^)##zh-&(FmDD8F)~{^Tj6DST+}bgb?ui13kS
zL}ko`Db+mxW>QFZ>O?ADSdqHHz<k4x5jn?eTpp+&iXJEvB4CwpfAlu1za$s0AfS!-
z$5w-Ai3JwFQmI+9A<M!oHOJQju8B@@D@K5_7D@UC3V#5O!qgG@HazLRhac&VRT5-J
z?A0z@=!1JpA-%Zf@Zk}U=YIPQd#e?d+StNS4)V`_wiuyDn|AHSlM+S&z%n?Iz~B@u
z(*9GY(kSfpZ~b#Xwdl!AL<4-fFF@bP=+mCyc18W$jmE3u>L!^<1L{O$k4z$g)sFL7
z693A=Oe*JTuX*1rmr$T@m)Q`J57mzbU|Ncave;Q6a4@M`4}--J8y8(_5&lmU1m|3<
zUU6H}A!N8s3_(%=`BOY}@pJraalov~EZdx8=)BMz8fF&$Q-tz_U*>Ul9ub($e4agh
zx+WtFeb|gSbN(h4y|b|D!w60EKt^H+1BC?z5&~3WhuogOG-&3f*F<1uji^?(@7OV6
z!VWFZFTZ$SLl)mt>Yz364v2Bk!oMR}^C6PGYDSk}-Y;f(S1cgs3Q$K+t_nziW{j@l
zz~B;QR8tqsT^8y35%qEAo&){_4YuMnGOK-ze)%;`kQL*DFLH7Y!r<c{!0A$~7&yrD
zeaxSMkK`v${2;(Erb(DGIPM<alrpMFpsvufB(sCS^a8)RKpQ3e9YU+=^XF$zG85Kt
z-56FE13B<O+H;Hn;gO9#bMdHST)9YoF)2l4bRZS8EG$|Y;!Hyjy>obAcf<p`eSIG~
zr_F-XXE}ySGWT8}F9o16M@A63jEREd0ZIJC@eTv702y=m@L8?2biGibou@4%8m)^t
zNE(Mcboi2cBy8ii?^~$#R>9#hBHs`x2MdH#h%!01hTP6R>5us69bJ=5fr$=z;q&I2
zt4~3ebCj20n<W$Q0?;!dBP($j8$6$5Txue<IU-np{Aqc&_zNCk{^`}QX8rm8O6iAb
zV4VjXUD&1r-=92}*F%eGN1$<pFv*;50Is4x!!IZZ7P@*5u5_e0LHt10|F5{unMxZD
z34Y6a5#oJ(YTT-@q+}Ae4C<u_->)~Qvc)Ii@Z_Z*Mqx|xTaAm30v;Mj;Tk#FrH=_|
z*GJzAuwlER%3|o`JBPit2|Nhj5Vw59{SL%B!QdE&-a`LQppb@y24l<5SAJ5>m+uCU
z8^tT9srb+Bd#T%Befso4cG&=_TLA`BF@<qp<sInKHJGE-gZAk?a9~L4%H@NZIimh*
zX~+<)TU}*bNt`)oJ7N~u2KN?Zd}CTfR8*ht-4&MUUjk@LBTnnR?;pEYUJ`Yd8dWO}
zi=F>lG57pT^SV};qrZhor8<#80&4UNWsL@UzIP{RS^xb0Rb&<1BLN6@?AS4GS99-{
z>W>5+ftdhK5(tJcOJjGl`&yzeQ<mXjVd;MjJq}zraG(tEg3yqyuko2EAZ2RF-rf{p
zI4=Z8GJN=yzFPE%%|x~%Ljj4fc`px7Y+GSR0;T|>WO*$-B86w;0-)F5|BFOJaw^xk
z%epZju4H$A)KXyUcprVjQp$-w5dRsyPc!8cO)V}imR(3s7@mNiS`_c@FYv0!?P4!K
zTDd%80p#oa=rx)$Nks;Kk)1t+o>;@9W`;SK*0Tf1suVEn4$9gV>17gFNOlt58FgBo
zt}~H*V^VuqMIq>**w_P|O6PkwZS1eyQg<1=g6XCp6eKq;E^biCi~X?AVkrUcD*>rn
zxBUK{a@fA$PkOg-um1Ztcz}oA`@iOeuC7~1|AluW=7{#rCviJNjF^qIZinSGv!U}L
zgtk?-G^(*j4JWMhfdfW}kDxLWl9MM?jp7l51hz*iN_$Q;afpch%2gC2E6&zOW-o@I
z_4Oad+~E9q^HfoWgI~c7y%VCF33EmDGI&198X?jUo6|(K?b>zs!Gj0O2UFRbi_70{
z+ws=~Tv5x5x8SsCX(E4KZAW9k(W6Le*nQ+kHFV_|v`v5w_}}C<#1qGk3oE}VB|yKU
zt83kAKa1S5pPYm1wC7L^8*0*J;<ci3U*$&bj!Z3s#pP{cH0CQ^>oHm}&;nG=^yyKf
z9MGls!7PCQ3yFm{F<w%W$a{`XPKht(l`3->YP3Kl1yo}ybS>C3-6ufDTTT;2>bNL%
zb4qLgRC#Fvf)gJQ;!MK?Rq$qXD!0DAJ<j<kc@mPjg8t<VG8te@*`Wm)52bbI@T$^y
z_9T;2#F<cr?dj3mwQtX7n@XjLNalHxyg-iUNcS(-u5O#bPB2EIC!ZHz9QW_l$q-hD
z(8{CN_$q>A@|N=_e`vP1n8sdU(jMA_%zu8$X^UVwXfPi*Z^3*h)Ra39o7j>029*UN
zM`+<Vv-bgbpo0K+t(Gj=&R{rqkmGqkD107}(0?@4V3rwqBjznwu-L{%9F|DsLD#Nf
z+1`%8<>=yq1XzH28DW&!KR9L5ne6rJ)~y@pYmG(SNkg5#iz3ndwbsz^Bf8@m3Ss$u
zY;C1M{KJnwyZtJA60`heFN0^F_D3Gl=zGYx-$Z{?!=-vnyiFaOcXKTLZts*|a^k|n
zLoxr1c=uv<yTko<I8JZf`pRLaQO5lab=Vj*&`a0gTx!^;A3qCR^G^-ywW|4~jKun{
z>vQ+!#vQDGa(LK32kV{VLR*lA5s9qZ=w^1<UsiDP>QKaj*R=E<UsMs))@YGzb$55a
z^>|20Ky_VRvTSn`q7wf83zK1hzrrHJL9Y*skYuCj(RNo7UVy62z3|21SxTNBaqr61
z3&z+=q4t?>X4aIAKW>#VJM@<uDRuI|x)AxL^aTQ%p+Ff4yi7~71?nbu!@GcJ_%KZQ
zT6h_y+x6v<`R>rzs3faWfh2*pRJmf*ud97V^2=>Tw%(QWC85=(EqI11iJ*7PukTKn
zA#v)2Ps6$Ut+#K)f)FwO^9K%`Hvco7q)2#kZ`EGmo8&rYnM=DSF3>>wqE`Cw>C^V<
zeLZOr*y~31`2i(eXDY%Zis?1Ls1yT*cw;3lbcpkU1}f-y`%4o9?jK$(*(*C1Utr!N
z#wc)0Ma*P0PSj>I1B<^3{k}>rrLU}BU~PhHg~o>0t#MH*$aBsA8_+{kI#(7%G$&58
zZ6$XJi=4KyIUU~SL$KwK%UK&1cw4g~+{^Cd21W5BMH|KA5S7Z+K-naaZGT@}q!^L8
z_3LN-#n8~I;fJc$80FpenSlDYH7{^`=87be6>CwRm2KRWbazilxmL=5CF{fj6I1nz
z+qUe}H0rTmIXcS~%3K0GrUYG3)6-Qa6Th48F)2PjS7bNd{fnBF?6Fp(&%TqN-|fyX
zmHA0i_lw_N=g!UX6@31DE*aAbF|Dfa_;`C=O&cYdgHPL3JM@Z0TxU`Bb1l<ZUnPt@
zd~Ff~U^}^SY=)@aaDtmf=Ilxhv^{BYg4m0TnMv7fWHz44BYNyt8Z~OvdfWxX&{zm;
zLb~kWW03)9I(Q)epyn-Wp2g9KoLT^~X=r6tpptLCVYCJkE`h8is46Go-6LC=$x}S^
zFNLi(`>^NI3bJXUsi~~IqYY?<;WEospgtVm_2A#<@qIXY`+F0I+dp*qXUuxfAm7i<
z_GNnlzE$$f5XjsXd*SFSu{Djot%YY1t896WMAsM>OO`CK^z(17NaKbL=oq*2cspWg
z`u1(Rh&~4&Uc<MWA>Woj@a*3aCoI+j+1d5+G}06YEcwe~8Mkf8t|T3;3yOh{Op96r
zO$%co8!l<z#g-RBDfe@2gP?HjPZ+B(N+tC<G{@`fmk+Ay)G*{-gwE+69sG=$Gq+*I
zDl07jBm*_WxLT^&_PSSxcG7H2!0-f4*VEUR7sf@P5$Sa2^Q%JI&k0n|d?>y_agApK
z&3NAbPJlfU-VD3;-DpMb$(jPn-6gO%#7*(h|2>X|@3KgFAlR5bHs4ew=)I_|u%)-A
zP5J^WiowecYMQPzRolP&@ap$1On&#Nb-Q++%%fZJD=`BHwlV=J;GJWh3!EcQ8cx|~
zd=DkByJIE{Q{9y$QZ})uLnv~Zn2dEgJc@wg81rpZ`8<X(J~0WDRkFdE$aag~<~6?B
z^j{<-B9YkiE#s8zej<IdnpZ5QO-;ws)Zbeimhr1lz!^#k7P#;bvc^60r^$CGE3c<8
z#d|KmZ_L9nefXYtx#KD<1R%t5<F{C=;kE+B(SV`8K{<`TTkc4up_Q!`(omy^!l{qI
z!*<^tgec5`U)bx4ERZOss?^w`g*}7~*-MdCa(IcL16g1GX@Hmiy(QwLM9H{1{uT`b
zu2%|KeXc~b8md0+iaKi>`3}O?QX?&YqGaS9Lvy|QeI$$-y9?TbXERLx!8wChe{b=Q
z51O7MJF2SQ@fgd@2Qo`TEX%f(_32C0p@G>ACUVojZe*uLwE@G*2iL!RmA|P`+`zrb
z#%tDGbM1C<J8`l5*s)?s^y5j|#9J)HQ<CK^l<cy*fJ*YWJ1j$fj%<OI)sL=u&K+S2
z&0U_IMDh4CC&%$~=^Xf;LYTX4v->CUUptvS9=B;cS?58>5;et!TNU^N1j9SK%T(7!
zk4;6@>l=NGe}skX(Z{V-RkzTwvl%F6w4Y)3Eq_*jr`kGw%R6SR=99FTKZxl8m>i>)
z?k#KX=I)wPQB{?1SMV@$Lf>{qecRz|q(CAc@!35r8LW(VpIf}vMLAj1BTkk{vleL^
z(Fp?#5mwJQHp%-W+##ZV4Nw7LWjUdV0b1{ab;8-)jKE_HUjws%)}1?-9;|JvvuOTr
z<FeIS&g|gbKd~`?wi;?L>k~P!Lkk)P&BK!c{kEO@{<;sKpsYKnhD8*ge}*TT0unVn
zZy*V{)3?>b=XHMYv%HzI!XlsLI{sXNR`3<D&YJrAU1&licF@x2)TQS~CNzIJ+T+UB
z@~Vm@4YJ18y&QW%_jpKF<j<>B4sM&1+w4lBzf!BwhVZWH`d@v|A1I3wXfPJaMtgdM
z`_zRZdywW}q)WM)Snnc@$S9d)>}|aUZ7sy6qC@=G-cXI=I)ey{&p)=)p|*SYtUe<?
zcWk+_Y+j|yhtD?kqqnMTcgUQ+evvxXU5F{Ibo{lM%Bpm_)o1##Nuy0ze9CGQdN}bk
zmSxkd|2yrvH0YsC>F@-h-9QSY93fd$Hg2SOFM=jeypXxzdI2r55vTd>^3H&l2n@=C
z7eWEai2vlDDOmV}nv{RmsPB^n(Lqj9#d(yRlSoZjjcZl`HdrqJdB%JxXw<k#lR`e@
zTRlA<dG)RIc>&9Hk6VY$vkIFR?d&pa-@<0cPOR#(yZgJjq1`U;4GzuUepB;Mv2Lxt
zxBk|-RJx>BRaV}FXQ!mjdPiJiBH-?dh#tgSdd`BE?@HFB1Vpy<S~6;N(~Y;IMxN}u
zc2U&rR<+LpvL;xD4z#Nq@#5$oyLTLi$>fbX<)J6jwtLbQwZLL{z!#CLx=fxj#gS@&
zcipp3pX8<5{n$Coob-(wH<(KrBM76oqY#>9zw<ha-__f_284~k7iVlAKYBEM)~xNI
z1>B3HhyAf41Dclc1a98Tm0Bx}d{)XtkNgP9EjTw*P)&76_AA!lrUChiffTcCh@}k~
zNhHWftt+*nmlyC}RnSXDS+>Lb7t)A}*%`90olFcYF8%M4L?w>ABT8nbRhrTT+~=YM
z_;ms!^^8truQEF+_(a%R!Z+tCH8Ba?zwpZ3;0fDKEL6!|{nze~ufuX<LW1`_sy<*|
z<#4k&-Y=(O$JpzZuF7w+4TEp5^6;9$`Z?k|ub*lx_b0ZOI58s@n;;t;I(K&OaAedN
zv&vVm9P={BBskPZDK*~KWz*a0^||K_>az}qdl|$R7sqENwM`Bm%l@tVC_z3jNNB|B
z4^Z8>tdkB!>Xb^DP&fCw8)$8f9!!+|fMacesK(Hwdebh|>MwGpZ%6v7NIDSfN{E}^
zh%`^i6ySEY%hVq7W*oilnikEVE1duzEds31?k!bRD(382Zh3wK+6I&+vPnud6un*>
zGx+$l2iIDwsXdqzfG{_al5FOsL&tCV8~Co=JTnUEHj4&I+q-*vd5JGGF@c8pb#HI{
zX2yZ0ag=huwOK{zAh&V?u==)U8c-ZHJ?ipyK<E}w`tN|Prej6i?h73!an5~A{>rr0
znx4CDd;VyA@u5wT-a}TVBKa!|^6S~IITEI=;Moo4G>rY0oww!Qm>WxWcvcNC?WCog
zSkwICi6&-#n$Ha*!^`a7XJ*CMZawDVY+;%6BDp}wby3OefTcwDNyuwNabu;XNI5hw
zY;hCl3B|51VXK^-OMHJ+3|8+J8vJf)_c{A2uiOj|9kKUa{|9yVoGZ(*<=GA-z=$GS
z&hkIp>~3sHBtb8+jV#S$V%Lc#tv3eqB$23^JWp#iDU^xp;+jhS&3!o-$8%I;hInh!
z3d=AeUevd~l?F2`+Nt_injq>r*CAkf3?AIdHoOz*Lc>5;t`zeE63t|qPUvXX0d}-~
z*=p$HO`UQP26EVkaCyP=-E8ca8JTQHuT~7Ku)`u$jO75jB=$35t{|JN2p&)K@=`;h
zqN0k4NyPKln^xoXP7EY1G*v)zXv8UjW)@p*erQi*^^_J$Dk{$NV??b<rWfu2)<k?#
z)-a5G>ql*lI1^J0i-j<9Y}rQvfy3o90S2@(Sjt!HKdObBNki|LF}vUJn__-#_V8-$
zmuP;yj2fI%B$FIh<_=(MgjwpttNJ-_OE>B&-)N<!-ck3WlWVoj`F{sDe74o<)C9Ah
zQ*-;*Mso_cv5{`(rN*byCg42CsFITJ_-*LQHtj9$zopB>dGGt1Rg5zDe=Wepva;Ip
zvdG(6Uk$h2+FIT!`s#b<sBK^7hDQCm>Ga*Lyv*T9TzPQFlgg~>A>o0_GmecK9YM5{
zL`8w3xql*KBfA0EfHu3c?w&^?A^-6ksc2<wsGBVxg6X`bH$=iHo4CaY6G%20hZ0L|
z<?Xn&i7)1uQ}sKg*S%+skU*@>*g4C-s)F>iA&0uQ-V5a28RS617s0msUw@s?dV1ad
z*nBofF)}{xLWVUlZrD97%|_5UJ8s>&b^N7S$lO4gs5pEW)vPgY@Dx?bY;Pne;2tNh
zg4zbmm~%D`sWz}bzi$f%f@rEqG1X)L#jG%ZyJ`fMte|cy3nGvw?4_7Hi<Kx>56)y9
zQ?n?tWvP&qSj45a(>SsFX2_!69{UZoIs}5_f<=t*?BG}E9bC4@q$i4kN*Akav!~gX
z$BHW#cRLq7D|5s?&EeTOdrUas0|yRVq{;%#JH^#j;n~-gXY>X_i^yN{qUoaAoNvts
z<YlkkY-$(fSgRlTYs2ZX5pL$2%NoUOI-P9Qv3}nv_2<tWZx*N3mepq7u2olFw8}1v
zVn<qUzI5}UJNm_0v<jQI8^n|;A5r^3v*hMmu&ufGw3EA~x?*;OrvjwKD;#fVsK7Pr
zj?nvAq^t)B4sHQ>qXQH#+B5vmW!4=$Be3-<;eB+psQ1V#$peyuv=6Rd1lR`laSa~=
zkzJ#jAK~fWVV75nk)ReVIrSx{b&EQ&FP$A(#_X*oaH7s@#buO4$rKbGVg-Ca48uHG
zx=6?n;SC`Ij!F}EY}pI*;DI4nE<!`(ks@NCUtZJaDAPB70)M=HE2&$$Z@+G1TW;RG
znYIPYQw(=Fr{em{4ngJ~o~Ea7MOCWgWps3o)@u`wfWEF5X5O7u9~0a1y}A1H{Meg0
zqa$vIx4rmeP+m-GYGrZhy{pB!j;}60EnOMbt@P=9{btqey{+k|nr=0e%{Ah2D9h(o
zMdf)@R5wt6ydvT*djV{8Vs5R=m0SFDT?hA;HjcT;)yFC=-j!~wuS-ZrO3{?`E8!>7
zi)7E_tVEcyxYVCN(&x;3p6~3@WR*unPX4AJA+2n>naxS?P|pb{tG{Wf;9*fURkft_
z<J_?mE@!T|_48X9>dkZ7PRACo%3K73bLPmF8s5WMwt<l8f0u8H`R<=oT*_ZGP~e4&
zJsczie|ZQxN0GTmS@+N;x9CN}CtKXY)VMKHmqOGu5nQ*!t|nr(%W8pTy#{c3J$J9D
zndGzn#P}Ww$n)ey$+EzqwkhwO<JhN39YBcN0ZCy^A4Y$8e&06LZa910L%*TTwlf9x
zWXPna3|^8}#QKsO%x9HdGc<vG(5>+DQ0ihR4bUc?BJl;O6o2S)Wy`&!O+YCV_&#(7
zEH6yZR%YbHSL|;IPY)nyq+y2B=q%sm%=Et9+xT|xu*+ed;@R5=?SBN=e;R)+!?NIA
z<-RqwWA!T&PtLP@H)HAGijI*DA7;BGdz=W-uPm$d$c}2i#=?2=?bJEWyMkB!n3EXo
zloP)xzf#e+QK#n5s^``Au?|S>@@|Dj?9Z!Cb-LxB{f<7VjkukfI;P_fm6+`Ov?o`4
z#)YMQp4jq=+S`rCAOX^`JLZ;($7eFdjAB<?R{nBe_Xh0lZEWV|rjHb64@8`FvGUDH
zNN=A6Oi>>&>9(5p)``8&THEdz0@jX1nS<X@LCol}`J6E$p~-Pr1uD(AL8WpiY1$N|
zv!i!Z!{j9j(js4V4NgwV@7;E>p*z89i(z*uOqv2Tp)PDBOBvv#TC(|87KriWYi|Dz
z*MS|#!~-}0f=bX2v4Z1jmbNKXB+!2}x*Ezn1cu9e<{Npdx>OEvQOkjhk{8Pw0rs{F
zM2(zUzB$*OQl<cdnpOzvDQ@?(uoFgt4=2l01$*JTa?!=GAASv1JYhw|VvvkQ3jL2h
z<}C|uzqXLH6LaR@bD@tE>sK7XJ(}`n(0c2R$*0t#RC?uxS-Q4g_HxIb9|P^*#Cp8h
z8=RB9!#ipP7cF<qZI3xCV;7w8GUd_q*#BVpmZn?k)o(VBUNpxmde)mEPggy;yEwVv
zMds**mATLJmV~8^?b0}WtlP(DKV4I<>aR4jTxFZ39_-rnU9)wOdpuTA<*)X68voqf
zdBwHcQ(5(wsI2FGitd64Pmv7dZER&_bt<5h_sySrX1jaUTH8{?up2Vw^j1|f;*!9F
zV0=i?6?s!&rh(lLMtj{&*}p@5tuD4_InOgQ1@8bn4wT0^*kU~C2>5^i$_gqpGrOS*
z8$|!~XD~e4?2bSC->CNn71#_$qma$YR*K)AbZ-D%2`IDZhh=%1HG~EISyN2yH{l|-
z1Wj9`$^tRKRn=)j-*`(Fu#o;lQa|<7nACvo?49v82}~e6wqg>60*N?A>Gw4{$y3#g
z5iE?_x1t<pDT++YBW{~OI;OXb`KpWmj_g7ax}?QqZZM>`IB8T~4-DL{|J8n2X8|B-
z4IK-UnYfnts<D^F>`M4FQV5g7_|HLVt(6*sj`L^H#bw%R(c<YWpN?~wvo|w-@{`OD
zHjT_*TuaH^x4F@!CpL+TSoS;SRF#XD-R8{sXrGEVUeupE_j=irO}0^^yr<fG9h|52
z*2vZ@ZELXhA(ezhUT30WE6bl{#s-%;)T%vMJ@o12MP_N|^^exphZV-?Pr-~RRC9Pm
z)MxDOM;!|K@$|U;XBJt<pV^odAAsI3PR}@|hpN)XCa23WloM_bA%7E4H(3k-RBz%m
zb&qOo!ubUuJjXNI2WzJEYGaKWMs8<T;kJES`D%ctnDrOw)hc^4V|*5Pgm*CS#<JEb
zBTD~f{$AE9z#VNVQ@_1n;X(<~i*BvoA6#&2<LCY(hK2BczzD_lyV5BN2YsAh5fMi3
z8>*+*nSYxuT|jxXhzeGSO=Yybxx&17_feI%K20T>2+lqpi4)>hUp8>Ck*+{C=Md(*
z@1N-Ysqt@F`xhmThP^FVdL?^E!PJoYwSmi0*n%cNO<)GfX$33J6Vbi%@Hve=m0ej}
zNWsTlNHllNYl4$Lh-V<%4X9($To1pxC}vw^q%MfwX|`&U8pXN*kn9xhCTzR}5@gW_
zC!SroZ5c*VQCNe8E{^rwJsj&<)%Si~ti`zV>C0YEHPQ>dzc?!}Ft9#g?8Bck`?gzq
zhS*vXajo#gkMsIDfW#fYy=ZFaR~9w%d`YQGxwHMg*Zpg9nWzO%d+#!`R7=bGO6`)t
z0V)-xm3sC6I?nN3*-h<c?7H0NS<@3d=6JlUDz^U>@3(UE55vl2$4)z6v}{tI>jl@q
zv(25PXIDm4WLJ&sCUSL+%J{1E&z}vl(2<*RtaOjZwDql}?-D~`YTVNxqRqV)KNz$3
z)qj?D-i5m<DEqc!GvXTSpy_>UoWmbz8;(<r>14m7+3{3%!nN9-aK~%Lt6D_gSi#A5
zv}o4YmJA>515OWBPP2u{pF+X62Lt*a-g(?L9H5q@5WV(IbF(-WrBA1nM#$k55YEUt
zQ+m57pXjNa-Wk!B$!Su!gNxrzh~#3tcMO@mlvPN)WP9wB(DZBts1rv`4q2<0WqYNT
zx9NL&+MRnVp&?qsD1pcHU?f06705Hd<^X%@)6y<z+BWNSpSL*TidHZ$82I$!<;(Ko
zM`*1g7y?oPnr@|gvC#Bl9r=O~#YFAgW+m)&x!Q2nrP)Tk+5mi{y}5SCzTm;t9Lwa&
ztgOFo{+ehLv7qT_&k%j%qL*eXUYZqrHuKNQTBW$sGWqF$*=w$p`}T3^YiVX2`6}g`
zVO^K0-!7Zd^c<!+rP()(a&<AM*r;=3w{)ORO4o-ktj2|AIa$|E^yt>d^UFz(&m;8z
z)AE`W*miGnal<EkVW%7xG&33+vqqjO_dU0!Bjpv8sWeG$do&VVuGn07BnXns)ah47
zH;T%0=_0F3;aR(~eNH?y=<zny=j4lXHcboc(ge5!EJjOqUD*EnaZZ=u+neCa_y+Q(
zzmd$cAB{<<U0j+*3e;_E3WgghEk4r5X!Il=&!yQ?bmvAw7D3moX<!GJzzUwHFeyil
zbbv{Lvp<dJ4Z!_95<p>9)n?<vz2@ZRcGcKB!KrMd)%Lsbmrb)(Q!-b5yZI__NljJ<
zSMLSt%}>mUj$Kd{ywRn5lxuEB!_ZR{XuiS^QW4VWx3l%rw!da&=|9Q3x8}WtT@y{w
zzk$AVi`uFD7YPvS`tr{PJ}TPTYr+}b<bkmtaR=|$BJ{TP5PwtZeUbWos7~%?%(tL9
zVDAvi2nZowSR`Orxt*gb?pjh30ED{N6_{I1$msOouJ_F+n_Pz9ow_+I-OkV`*9;_8
zU|NU(%FqwaPGm7->)zG>qH}~{WzWGGDonskI#Q5<rV4~Q64u@1kY1k3-q0KZxzSqY
zMzU=~+9g;XND4?;u#$j)07~Zb8?1cH!ukwb7Jp-Z2n*Nld@ih-kTXfk+uD1TbLdSf
zvw2!U*0#+wx8I$=w72Gu{SR$qqe9RtYiCzi=}?gu@rBDHUZtlupgBcHOvwUCb0#EY
zPmWo4)$U|ykVyjr1Hpq4B{Z!+6O)wvU$gCDBX2rPTYKWwxzK`$+fO4xon0NOE^oK8
zoaMN_xVF)#pgw!kv#O4&pGn>{{>;zk>YFttox0S?d#hphf}j?yeaDZuG6DlET6q(A
zJTHIGeGP`Psgi|EtTJD=?Bay9s+n@&xU`Hqp%u^uBF2k8M)M8@nq)15`OP{d5iW37
z*aI7)a0iO9xdPow*BO=_lr-Z5GfHS-fxL#5-Zgzbansm{Znon3#ACP@T%NzgUN=X@
zX}DVk9weacsE&=)!%Tdr@MW3Uz=0LsWt?wWf{QG^o%pd*-emP#q}8D(mmu6W)rcf^
zZ}AYQDQR9QP~-FKBeLxUeq7hOyj>fQknmZ9q7xn8Z+3a9>QeaWYuu*D{QKI<ke31$
z(ukY^i~vsrYvIr0uwx6OEW1YSN?Od-;3deWNeUKttJO_?loQuvtRtz&MjLzGK>kIB
z?#Fp}c-&4Xv1)ST$<^>*249MvnbpS@t!NyuRja2;eN@Q`e(5synM|3oEA;qerB2?$
z?4bSW!bOpBXUNUbE@Qj_B6ZCY3r6MK9|XPyd=gF4rv7rhg1Um18n}>|_nTd3re$lp
z`{k5J)eZmeRCueN<vZpiw|a)NVu_23$BmP!6)oz&=WH%?{H|lOxn%v1)RuwEHg~O$
z`T1h7hM&{-JcxL@uFoF=*%i=X7Zvk=D?VTAG#94wM%XC>lN@*dz&Byn6O5oi>u%D(
zs}}uZ|2Zmig;#==+KqY-MoHLdx%{Qq3@B{DD`lgfbmTa&wd>f?`20+{XMlNkxFsxw
ziuvV6i38n1H(>}am^I_S{lWcOeOur<BE{NarDeA5Sj)KL*y6vsbT9q9kz~yuAcaBp
z>#lg0m)Ut+!&jWnDB%wTaMN~OUq<<w)6<X?+zjS^+^l(SN&8E|`TK*N>sS3y8CqUy
zmu&blyDEKF&V-LE<<jszj|U%pTl|-&viK~>Bn$6PWKvvP^-<O|Px`W^h;?&a7%+-o
z{bb+b@N6<M>rv7(GK$y6_?Km`>V2@pyUbiYZH(2r=X32~zX_mx9=BDYSaWlEo_Mr&
ziuh$%S!TOB$82z9{Bw1;j<Kan-+u|8-uISuOT(~vDWALAY#&wM-Ot?hdv@lBjw8bJ
zW_7#MwUM1&nTPr20d-~N$DTx<KX-2Wu~Fm0ob;}%cLfKVMKL^K&Yckpu2~1sk4?Q$
zJjS&({LaX^wtAbAKllFk<kEGx8{I_<ukBbgUM3UtUCQrJxQpBzA8}%vWBwVE3KbH?
z4Nq+bi<`cE`!;T6u*c~)Bb6@(JDcUaC`QQ?b8D>b^wJh?_N>zcl8M?DxplQuhqHP&
z<2+uthELg?xz4Zd&3ZS7O(yH24y@}Q73ZJy(=pa*(~8q&S0ncflP#3DfBkT{5fw!|
zdzh6<r{Fj3c10DOuSf?Wji1t~@RKcdlLNInJXUF-sXyB5W}#-6?zNdtPcn;Edleb_
zy_wW`kL~mwZ>qh2de^LpsI=EmsDHJ2OPRHH*U%-KYwx`=go!x|oQBvr*L}!>klG`c
zOJ20f^_cTeFZkFSh{@Pfj>Y%$)^+i9oz$z1wz6?z3VCq#4dtQwoize`Qu7B)-9ej4
z;Qtak%5z`qlp!fyQy!V>#NvBd`0K~*Upqi?Ie|<aH82c)WU8AY?mK`JAMp2Yu^_}{
zdQOS2_M<Kpj?Z<ct#YzE6~C$Wo^zw@9jdPUTU3*kxw^<F+K*8YRlBri^fes_TpDZo
z4mq@;zg0~WGlPeU&wB1LS13ToLOsv=zD9}KfGk{0b`&}V+xdI6(sgw^``LM+Yt&6A
zeOHH!{I0F9UR^T4Nky^IZj<6I&-w3~jUJ&%*}5*BCaTWo1k(^u3gPK|X1WiLnTP=`
zc;P6ic+s5x3-^~s586L5+QPHEEG)Fuo61<`!H^ml=#^+!Yt`-=N$6Ld)}ndy&NEI8
zedt_n*YM!O#*9NAp5mz*dRF`3mje@fp(5B6x@xq>-m(vmz5VZePBWmuj9=DU)26YN
z{$%PBrA{l{JJ^o-`K4GU{Q2pQc)At&=~u!Ezjy9Z|7Zw=Dcv}gycL2Z_}0&pZ$bmH
zXc}%LdP@D@{SU`R3~J-MTIuJBJ>9BSm~Kd}&TqPMnbMIT9wS@$DX#q?TQCuoNTm6*
zIak^f2DOTxJ373HcUYR&C0Tu0sGePes3m=0<#+9P_3imJxwgYP=QSUbUBSY5Bx%9d
zGgm~QX&`?yfU<>qm2Dqi_&H@vWPJbGOLgXlcAtG_y5XRsbqkLEhOo;cc&SG>W2lfk
zR!CW~TY(E-yRl2r!gBW;?QA(;t8UD;ec5Hz-j1ig>NL}Qm85MNx_oBelwQUSSOk!{
z;{4OCszt5sk3F2~%=|^8))+>KyomTu-=kqLqA*;W6)DY7Uny*1A6+pTJQmUqa&Q=%
ze99tp$D%>5okr^A=<d@#pdR~dj7>>s|1`(E`Jd-mCM>iB4$kr3PKppz4+LOtaC?zU
zkj(?utljSM;@v7g>kInh`s)7*3Qk%!?3L<zTHFq`-%!zrdJ8a@qGE*S_b0u#9-Y{0
zQNcsSfbuseoYqvl96n4}*XTmVh95D#q7dID+UwTGcH+v|4D<HiV)ypE#-MUzMR;1p
z%Y4)Q9!=VNwd4>LRIf)pG24HVQcOQ?DI;nTW_}eJ*0%E*mLTZHY5J50>l_Fva8z!a
zm-jqNulg4LXMd3B9!#KYOVRXEf0A{uJaJn6`P-_8DxzV3Sq>`U!_TkfRlS@|apB=@
zDZC&WqyOwukG^w2W%Mt6n$Z-Wjf)G2FHmV@f7)O{VpAiZcTJi$-Fvz2V0z8G`SP+9
z8+rNLT(llAv|mau&-Kk0&KpO8YICtu8*5u+GDo!LZ<Fw6JW1ykvw1z?GvHw9RX;jA
zJ?L)T=-I+%ld_vnPfi+Av+fqVXxC~kn>@L>s4aM)01~Tz&ed6WDRad)gVsi)ZHGDg
zsSVtl(Wz8xf>L$w`bVZLevY%KD(=wQ=r9+qDr`YWO+4a)ncm}%bpk!3*aX)88r$#Q
z6tEw*VTK+b=dI~&-Cb3wyQ*?|#iKEem%0W2YucN_<XEch<yUG&FG$fwqsLvE?N_?b
zVc6+Mrba$dqgoDKIjplsZV#i4Y5tSWz}g=)$vD*Bb6;VCopo<k<sp{Kwg>G6f{I_!
zoC~~GdE8%ZngP^NN0BQ6K6EVtgeox}s(EjTNse}SU~35w4omXwzPSH9-D;>C%^-{)
zKznz*_e_F&&@7;?xVLA~_}>`O%&%H0?R>{`isd-$N~*Cwp;|PDvY!%*YiJvuO@{cv
ze&gTQ$9C9UTffNU7F+J}>9fKT(yey*i(df9(jKCD;k3=)+NuQqGqU!ue~rOW>4SRu
zPfIlASNe|EDJbmOOw;Bx;4fvMmw#eomHp$DsvAen@|@VK^Wnl*%XqH+hf;cP^#~nu
zJav0#+mmBFI>Wl7_nPf7VWYL0qH^NX6MQ{hM78_emJjFJe*5|rrHN9to|jPr1S4@4
z)2!cWlkg{`Hd}AxQ+KWEuh|s_Yr`~$6SgPz8naqO)8^cgcd1>=dHmmu<<;`DN(hEK
zlNL9Z-|ZbD3c#Ab9DJ^a{##m7Qf)njFTs3N69r_H-J)#P><Q{WJF@dZYc)9l(ZfCW
z?SC>R<y`-fI)@~(LMQEi@-8uD^7)za^DHjR?yuR#w{J?<yX8BYo|7uA8O^5dag~j6
z8oI|<&GvE+-1_6`b%OPuM{51n3V>>dN#HmD<MpFYeV)|ESnYSr@AS{CIUJx+YFu$V
zMr?5nH3Rnb8_O5%<{^(c)r$3s0A<0pszxTweuHK_4b|R<C&z4FP3)*C^jBT`n3|}g
zt)F78rK*YXn#bD8S^W<De&`8i)^Eky2jXblnd#H*))ck-{cFRr?Q1p}Da9;DA|R_s
zM3w^jkH|{-m-ejgs%s1`mL!hUyeF89ns=vkC)>`-iZn#y=!Hg)&do~gq|Ml>1y|QI
zx4&*m*MFQSi6*I)>^<)7PpzkG>FJMpar>rIJ$*A*{JnBz*|qO}uLsGqJGuC`p2R&*
znP0y(ZRYse@h$jB`x0Qd5{j!$)-AO~>IR{wz(QX>w#a)XpMRE>y7u_&{%@XN(mANQ
zHkdm?6}qJ?zyOWJ&oL{;rfB{C8CJjIjlF?kwn(=^Zy*ED0`>Lk-Ma^d9QRJ_W#4Vj
zTDL)(HfzEHda8M!NpIIo(|Bjv*a$gros{i|M_Yt;OE<tSFjjmEm@ZQ$ZVCm=uuUs}
z)3;?opLfZlJ$Y~L(uP~omvya9c(v^HNV_193u+RKi54ApdE_bU7$Upuv0qjmBK`)R
zH9>=bfOa2xf#n%N)<AHH-(082VV!23KaIN|5Z=dAx!;)06G-@~I%+SpoqCzsvRgtX
zFlcz`3X@=Jz3AMf@K0ubiuFXL2d%?$-~7g8n{JWM6~(#jx=Ov5Nus;I7Ma0Qx%cIi
zt|z&im2q_z!pz)sskn>2TZUheEQ1NiNEaVRXc@pnlvPz_y;~9Ax}*oVaCvs93`Gz6
z`_s*X>FF@oLODEpH$30olczp>)jxo0GHys4qgzw^w%cZXthKuB<X&wQWP#NGN?uf#
z08d4{is)iLeQm5#;1lYj&p)aD>JE6Nt(>T$!WA1aB{*{}0G+Si#N9l*HTgZs;Ekyi
zzPuT{x!Lvv1&9y~cisiXuUr@+tBnfS9;AR65AKnP$WDWAtp8%IW;@ZfWv?gI>qldK
zAfrgM%>u$XR_2ak0$oWuSP)PlV-aAZ3~461YVGpN!{uNV^^<tGS}eIOJU-7sk2v1M
z#P5pstlkG_q?t%;JnYYv>lvH80%bFz_cdU-vSs)Ws-6?&kc?nP@K0G{0PHG@o{i3Y
za6af}y+f(%syVYgRnsz-4O_GY1)F>BzX>BcUmogF{I7oW|7!sZ?1EBl6EXd%W(MiA
z+d^V24FQZeD3KMqBMXO9nJbb2@>$#Ndwh)d^uG5D>T<Uu6>k-}`x@I;F>2igKqv||
zdS;gKdV?oJ{_j6>#9IWa%xUMoZ|?!Z^8(vQx^S?AYNS0sf~Fc>53OM8YkS%|+Q=El
zzWxf-c3NLty&g?7y-Z_RWYEhkf|jv7n+JIx4hv>$D=&9#=D$=u_b8lNw{2To_O{Mf
z;@$-boiXj(_%@~xu^1JZmcO4@>Zz=#n=<TpYR_0bJ2;>9c+rSFQV1#TiZt{ff3Vw*
ztPB+H5{6myKUj6|gdQd`w)CxA?DERAv+WTgfypQB@LH2^ZkGlplHGSw??)MFcw67m
zR;IE=gwIq@_1IsZ#v;B7%i3ng!NsE+0eWfBUIEDwuQVTOa0SdrRLipT97lw!;Q=(<
zskX%y1E8T6hpA1PG-*9U8PzD;KfBGu!~8Cl+()~9loNsJuoz>n=y`Hqw3QnXs3Yo6
zYdd{e^2<&{Yp8leuFNEeIErE_r67}<mxu4#xf7+I4?1Dd)NrM1oIeFzUl7tWHiOvo
zfmY_<p#*_mU%#u%qkLdRO$XeCHUaP^E48{a6gSoc+}s3!5Q<lDc^SV)l0SXHf}Ol1
zQPRpX6c`4f`5`2wo<ai4I)yPSZ+l2qi@HmWgYxQKZ*9vJH_ID)jAnIM=hQ8Avg}Cm
zKh~R8F!dHI-I(e5CnD|D#GkptULc2oI|{>T?8Mm38hQbKZkzt@(VBJ{J*ooy6MO+c
z0#y>b*^A6>e`_z&WYi}4BEDUr&+ig>`sQDocx2~imSom{YRsEKYTiNA*63%USBnqV
zpLqn)mRv0U9iSMpzn2mb^lb*T8k!Tw%zydX{Kz{H<zh)7(z1!pHNUH{3)6wPY%YHM
z6QCsNg|=<(6?j%B<D|zmX71dXRmVG9`G4XRbc3MB5+>Sp=umt4x#{l#k<lnG6eCiR
znMr8Xu<ll%eBtuhg+n@fA921n*K&w*Vyb$@0C-WBP_Nw>khEB~YNKP@D;tDyWJFSY
z2Vs8$#&tyUXR)fwsWh3j5pj5{Wn<ydAle2;tXLN|PfNtKQvUJ;VCf8DpKvsv$7;zS
zQ~!q40E7Y#IAyMY)#AAZDS#wD%(cqmJH63;)ON!8ghl|Oqu-Lby7~<lhaOLrCAq+c
z=qm~%>Z|7*%YOM%j_v-5Vu)y6t>4M-#pL&Zu^{gaSfOAD`!9etmU=}P43BiJ>?#X!
zDFm$<{C|J}Is@&Y$EOOeyrewQRL^djQm4u88n%Kd+GqpY@lp>y9M*q;a@PU6xIXMe
zvRcH*nas?vD#BBG@=;ETEW8qbMv*#f`o12QyzWDWI8N`cyZniSj@{3#ZL{0vDktu~
zJXO<JGv#&ldfq?JIMsGL5fXt*b;hZm?Zj11tp1=NnaNjyM@&TS@irndEYlekruv8Y
z%9PBB-za#cp&DKLV#&qe;6@6B^`f}CSMqlid9-JyzZh`3Ig&7DbSq=a2FY-otm*)1
zoiTT=H=_efSleR}ViFnZuF(?oWMFEmvO)DR8=Epma9m;_VjgiJTh}t_@gIqP^aek?
zN_I2ddv(DlxKIR8iOAoCTLs{d^g#DMX5~<op;MHq+tvQ?lmlkG=l?@w(Ec$gSM*7s
zj5SKXib1*Hq?o#0WS1v`Qu(D=8VkJ##2`dAeGtzw#&DRP-pc`Vj+d{w^ScS0Gc2Y2
zy-vHkrY<KF<CPjWB;(5da}KQN9HR|0j$8N68RL-Bb6I%yQzjo6Pidfl%Wr{>%}_OE
ze%rVyQ(Axla}!jdq>ddsc0AqzjGVVf(<llRq+_%gBXZ}4W(C5%V1BlJcK^R8PkyvT
zZw1p80!fjL&_6@Mi58WJbo;g|)g3y6G&vLu77#kZkSJk2pZ&(w`gaYb*<qImd+Bkk
z9v=9Mp#ZrcKXIlih<SI?>1G8oR2~Pc#RMy>)5-dm^B#RFen2ATBQSI#tG6juPz%1R
zsatV7jl8`2vvtj5Cc>zHj8LYaiTJ>T<<n7z#`QwRiSUdr9@t3~&?b;HatF{joMncC
zC?nHTGIt;^4@KeA3zdJO%(bdM;4)G+c(*$h*K@vB2dW0x&y7n+bpNIg`|}q6N5Io)
z10xqQa3j<b_lzJa=%<t-@~PObhF5`zF1T%)^u8d`Oo{HEFttZ`A6t`I?K3By&s<TB
zAonaSID~|NA*g5E^R6tcWp7z0dG_3ns!eq_kB^y=iM)WxqqBev$jJn%A=w+`zD)l|
zDy_h!$L2p@)^*dGzMlS#WR8I0i~=q^(7jmDDj0$#lbdM2d2e5^-W!5p)0n+Wwbf)&
zt2xEaj)}eQ$|A=icBSu0&an@hCv#d_3vMVGol%vmI3V=aHcGap68X=hFJn)67<NA5
zpOdpUppWc2)F+6g=jEx8Nsp)ApA(aWeluxjdp>JS#+?Y#G7gHgso)k#EOwA*B@;&C
zYn0V&XWBa%<q-@Yg?`I|Z*B?rUyu0CgV;LxJW~sW7K1UOO77xtEB&3@DE*Gbi1>xI
zatd_6R=9?{F6-v;(SB6&eFME41X=>NAPsjV5pK7!G@vD-499x6YfLo2%!zQa2-81r
zi_m0|?VkGe{T(!cM`aFEy+<uJHlwG_?HVvzqZJi{aElgf>7(9`&l*#EdnJ{2&gz>g
zh$z>7{mvL|Q~vF-4(m6)Nb{G;8I01i-+kZ4r;s@TzJV{5n8;vr%RH&a&P*5I>14J*
zc0n5HSRv|K)=M-PG&^cRh{!C^(l%IBHTnns5GXT~Lf*xW9cK6TeUi2^u0JAp-o0B{
zWWRW~OPSD8W3&%NhgDvz^9yW~8Jd?EEW>QnhuhoOLpl+aewQG#zm}^{x{yJcvEBOE
zj170aR3||C8B^X<1Tg>K8e7p>o4df@F%Tlmm1%F;b`Ict>uFJ;Z~vj@OWsXjL`tMm
z)Tgp7{r_k>@3@}#{{NRKv`EM*B^Ad?W*R6JhwM!n6b{KqL@H5sMja9=nTL?QQYj@X
zlwBzqQK@LC#P9xeeSg>O{Bf@9qCTJZ`!$}=$9i%o_R5jOA813#b_L)QIX@a3>O@Lr
zJA8St@GW$8*G)`$8t-z<I7o$}0)aWD#+l4FRQGi0^s}7@4{nD%-07tA(y36@Tu(um
z!i2eJdw|5@bim*wUQZ_{T^>a6`xXIo=ps$y^UImD?8Cibnq~|{k%E&{F~Q_uIg!yL
z_W+?G!a5zRH{#)E9+yHA<XYz@1HB>K)R>v;%O;jR6W1ng$TQq@$jZgJdwaBIR&iDA
zvVGI-V~;9n&b%{VR{1ImiZz5lPDI!!h@nRuTMP@-<Q^tkrMiRr!b>wGSEjyXAl1p)
zOOY-*e3PLj+V7am{E|WC181G#^1~QXWu3EVKp}A-$#5S#sI_{nm^*eyyniGWzm(Hb
zScyMt;D!+_D1Zq*A*t&2n7G#=w<Er@kzH*3i^W6C&C7F^XQF9%C##G2iE|9J)=>-Y
zV5Q^<nIRj3PU+yG(^(5Itzs5s1&Pf=Sqsn_$@~9s#pKDT5Q|ujY3CiO2a-AN+X;oG
z3zARpK}uF+Q>fYR1dP?FtFJHQ2c(jsp#TyM`z?+%>xR!`59a0JemHYRL)=qDs%MSw
zEjbW!xd{%%9s9i4yn1<O^wb_6$Fu5-%4$EO&-|}H`EY;Be!o^w>+)s>@=8hQU>!R8
zPh}s8vPUuxC0melgts<pNNL1Fch5+tvo${+N0t=tdE~aui5<)FNFbePcSxWOWJr~W
zp*jRt<LrKU!9&y4)m1tLVnLhmbDUHMEZ8@l8s(*?S@Hel7#ZH(^>8_67hAEQcP+9A
zDJ%(N&wLYa=?SU_QTu*bZFJ?R&tJdV!GTr!HjL2|IUP#K%FuLGZa=^VuMhWR5ExhL
zSKBSqdIZnvWlxLAy@B59({s}+p1E7ZPX=Z!%6(hg)G<+==P;;(tUm_Tg}!b0>Z9%c
zI;KyS-=b;lC>Z{S>Gpbg8Z8IW5WFMN!^^?5?Aig<0Ysqi*D_1>balTzf7@ut9POD2
zWzF!1t}4Cs!i!j8t0n0gns+r6_n|Gv(edt9{nsES?b*M6w}Y}no~Za_P&pBH<I*?c
z>VK{vRbX`|CpAa@DDnz%6u7#++lkZX)goy`QC5vJ>6o2(gJ&ec$l`wVs(8m}j)gP}
z^iO9DLK-aV>+8EP<(`$0iYd4*&Wm=7mBh=tvf{Sf@7!iw{q48AzijMS@_nGT(hB?P
zs2IDM^FGe+m($&B@qi9B?dRzPcldX*p;m#1b$vV8WT=`TXx?f1{|LID?mi$g9&Crl
zO|eSP(3%$0X~6|IXUsMgj6G22DNE+WfAv&f*U|}e*d&KN;?3ayX);3~+1rCG0ItF;
zr?5vVUjK78hC@fDmdXIrKdHt!1M98NAgK|7E(LgbT8AahU^#5XbTlay$E_RW&Y*(%
z6ZhGxd?v-dc+HUgP&CPG8hIPYXGJn5Pi)#bWH9yp+HaTOIpI=4ytxkwL<WZUQX;8%
zj3VLW9!P~`{bTlKV3SM$Thq`R{8d^)Rveu%J?Z1J-|eSn`g0S)!)_&`m7N=N?%d;f
z`@kp4x$<M#lM-LB?%SRyqP286JhAvmP);x{CYfr5CyyVxO*CBYue?UH&NjX5zmA)z
zf|SiiAHFZgDIf{Yl$zHIDtzb!5k{o}Lcn@Mo0e~#^HU=Asq9R8_PirLbFQRlTCz}!
zZ%t+2qB(TA7uF3CwJo(0dD6`|MM)HJX3~p-f>l(ecgOO^m-{Y1_emT5ieyuxurjgj
z<odEqotFLX!-sNjliL(Za)?Ykm!g*(!)s+DP@*J3DgwqOPj0d1cHFuFWLdPSUpJht
z=Dj<Rj`#~WJo$XOJr|WP*!TIq(>FK|vFCHmWPK`q858<)u(fDpiGd_<F|tnU%NgQ8
zgarmv9q6}BY+rcLf_$~s9l0<g4z^1ec!8b}RtvX7pu*%i+);GoG_gauHMvi@YMLCK
z@aj|BOgV<|Khg7ZmJRfCeKWt`VG3!AQ7R%f)UA_+Y6Ge(eCzKIRySMoop4h5C}1Qo
z?5u5hJ5-uEJ1_EZ<eO7DZ6-N`-Upaq%Z-bg1L@f}40k!JpZ@Y5Kyw)Y2~{&k6q3Kb
z(GAiM6Zjy*L}a~zT_S1!iL<6$N=;D*a`?AZ;8<VYdY>4JPflW&UL|{#dmVIf18NXm
zLF~?%N0h0{*@CA;+SOV3!tpQic!5_SCCJn{b%X<b1ELqOU%UX4-6tsl1S2?E4~w8S
zN7$-|N@Ud0Wz*D>*MJxO{5UW%kje~^lCoc~6M=no)N<-A6KX=&ru2MpQE$e-;0C6q
zEt9H#`?IV>-Y(D>=5v|ADv;no-E{6LHW?q9%brgA<kYd|@#&YTf9ZK4>noVf@s1oW
z{hC`D%7*~{<n8lpXvIUoQzU=@gT{N$4!D?bD9duCc7@lNc4KAFb;P9Sl@FpDWV=De
z%J>4wT^2hMVC>Wi9gc0f5;W9OQpgjQy-)Ptu<qgOGe5bB%Mi;G3NvvdazY9)(Oha3
zERYo-z$wmsBvfVQ70(e!0e$=x#G2X#xLl-Uf)s-|$-v_uDSE$>I^Cya0D{f=zIX3l
zpR6b`+0y|LWV8~ni6i6c&A2D?|F~;JZ7$gb@WM2pe0oknkrx409gd4rC)7xwL*Ryw
zQ-%N+)%+CRO=Fx#=2(E6DX7p$X%bJ%k2i0B1y6NaeRS%O7jZK~=%v=X=;W-cFa1Gi
z1QJ9A$PB#Q*wjQe$_xGjII%ZKdTNttS<?|t3GLmh;vr{88x{9gi8KCfRFd)_xufSw
z8bZ)&xgphS2ilsd1tC}x*9?Pb_6=HU@by-`XV}00iZES#CxT<K20C-+QV#ATgNgqD
zie)tDU7bE4a=TSA49t2VE^@YDEpi3#Lg_1S1k4j{&@<yQhc;X9{FRcRMqb7pu=8`s
z;1o*<Z<JKtpGEf8w<SSDSev8fk{f`QaCV~0KXXK02v|FD-)DUaWCC~ZAU@<T3->GH
zkc8|zuBOz{QybhayvoPIMgb{V9USzEX*)U<SM^Sy%Y%S`@gB}GEmS*T*#>GpL4nwG
zG7}0X&&`!{N-V72KgT`edFCPf0)8w>#8e%S2^z&^md{Zj4YYLfN=nzpnSrSR%+g@W
z>>a7L+q8N9*MrI_H|QaYE6aeiB{Tw`3Re`722wCf>IgTc#NXhbs~&YjA)koKR0`3}
z9DLk7z)sTr`Q$8B;U`|*Q_0!IS@@C>K}@`qcnKDo*Z0`%ABT$QP&zkBY4m<C)PvyI
zvL9goJGV$$cu72`)GI%jWhtr|JPT|92`OOuNHPrh&AK~ddhK=NG*$@C{5kWu<H6g=
z6A-b5j5r$WM>TN3X5q!jU(Zc0PEnj0Pho1O@Z!_iLu5qxB!G#_2o^p~G<F|q%!vDa
zF3@d)Nuh3~?w5F$o3B?^uL*^|;;XaTgXxb4`F%yaH{oO|;E_yx_#K*1wqwVCGv2Oq
zH=3BWrnljdaam1I8dfSaifPhV_41n5R}a596>YG^dheF0|90-&X6Vo(-4dD_K0UHw
z#Z%R;37Ml`&-JbSl|Q8Zk)he1;ReILpZ0zpb|KTM{^q9xv-3Xn1UdV<+iaKP@f7r+
zkQGwVvT>zv`u)o<Xs|{*HD$KQW}x7{OB8hPJ;}&uCPNZ<(I~RH2mT1IA|rhUe?s6V
z3y+Hd^+xdlyk2gI%<+mN4_pP_MIZHK-Yj@PuN5m&31eh(auaY2fvhEetlieu&6{9l
zad^Yz?ls-N_CgY7IQp-7b5Bm{UbSN?M1qXqAk+J_S?}mgzsaO@CVu7zlkPh2$?!tH
zsk<j4VOJ=@y5iO2dui3)WboaQG1?LQ1Q&PrVDIARpC4Q~VY>U?#yUbphIS8shig~@
zP&QY@S6@?k`p>teyCRSr)@42FzrkJwAyH6}5&@y6j9!4al*&QUM!>;jdt>(mqDXo2
zq$SC|Y@gd4ysU9$QXJF$n%S&U<DBN$zv}<nO2ETKi*zI2>#7OKmV3hoT-9aXye9lv
ztA`UOcW*5op6Sz@Q8mAdx7zS4WaYkB`hAx-wb2>AglyzZ9AP|HF`M#3#}0_VlddxP
zhbp)@DIp1sn}^pQ1-v5%ELG_k;wc38qGnNETYjWzc3yC28E4iiHoIL(wDt%wgQ3>O
z2U1W}BsxqPazy+fLF-*yl!}Xs#{=)POy7ZQA?uThg7`qPE~6C_dJ~T&uadt^ix#!1
z<06>HAD}E-xe3Uv(o`Ddt}*>|wL0}P#|ozh)oYgm)0s17-1PvsM$OO7gtpFM5Le9K
z<QCe*K4CKoq(%vXTTj*o`9O?;RVRUt{Wv`KiJDHq=-4MMTDNXOB6F|$6XUv5(vZ%U
zOthwiIf$!W%%+rLD%(tMu50*{Q$h+6k4%bJ5x?SUAl@^%z-xu_Ivj%7mfc9zk#{d&
zmruVI;iJ^eWK15~)F--OGg1^m)I~dQZf@>O=oiT@ZqNo^D^~`Sbf_`XapMhXu|@4E
zsRx`x8+7u(O-STc+1A4M>CL+eo&Wml)Zd=hleaxO6yb0Gb9S1=y>lfaJA3iQ#q}WA
z8Pp1E8%9kciX7TOPXhHyLYI{7JPM-T#@xO#foCJY1yHG^So(aO6|~Fk#JS6;`x6%S
z4oNXB+W649kE9j_eV@^N?pNf~N|$CVzL;6)HHciz4FoY^yK{4k9oBbG##~p<G;>>{
zjgOqpa9{bpIXl~tqnKcDk3^)`T*G&3$;VkZF2eqJRzY#`Rw&$@>)xuAd8nGxsdOB2
zvv1;J@z!6lCg_n&&jn#g$ZLQ#2M-%);=`TCFo}H31cb9Q9l|%XwK@jz#D?k)0U<TD
zls#YftUWO+f>IL`lh?9k+&xF~eEDkP&jmqaQsBq<5h~GlCn}7cbSO_>t>gP;gI_UZ
zBl#^Af#mdIQzMO-Wi*}vJ>rJ@@PTmB*5uuJEngnQF)*m3#?@K9@P;@Z?5Lpv^>aDd
z>by1i*a#~etUTPJ?`rhgtIC|i^#6hmtF@_JYsJFkEa8C9ut<LJK#iGWOd)O;79$gv
z+41G%bq4bAuz#B$`sLq-Pb9XsoE)|IX`@$M&GuvF;|B1r9WgsiIRa%|q-g#&ZAt=P
zsI%R<`Hrg;rqb4uCnu>B;^0g;<IyxX$iVDU%1=!fHvDbpBsYDtd3|3+8<=tUN}N@X
zSy#{9x7u!}1VIgZC?O(<Z^^tHPBF`hR3adaYjOV21ijg_W}SmH<Fu;$GKiV!yZt?y
z`h{?xKrzH)GoI`2zTWY87iKsxmQI`M!C|<@#8lAC;Q-QdssRo23S((bLNq!_vc5o?
z<74gAb)F<S_t70W^4lOUj&<FhVZnXm9%Gdn+s<Qe>v!vRSPWeodLA3FrOzj?LAEDt
zHh#E#<w^+Yu@tFHfrMg1{N4opBHuzl4E#;(8fcf>v@(MGxW+p;><L7hn0-p&Xwrve
zE{hj$f8n%7oibf;d`jk}^nBo4RYMfJ?s6!X`uS__4H^RpyC4T=<cR@(+&An|$={Q|
zE-<T5aD*lS@K%u2q@%tu50H@llE~MfWM^jDws&%JO1L$EPez?_*rBStJmA{3)ltnh
zQc;jo27tA@MJgU1%L!Wq;Yj@bYX}D!b*%)|$@FgyP+vLBn9QooV@D2h`{`3HK~gBM
z5rE6hDDzJ&2Q(!Nd?*Aol`lYkPEJmKe*WXLehsmAkDJN$_tgs?jvPY<IJH~6#L@AQ
z+nAP*XF{qBf3JRQ^{1#n<G}3Vu`3>Z_;K}dUfvE)h%dRv3C`jh;VVTXs8zNU&<x~j
zqP@NYpJ>UFP`Ec4ZNptW=)dIM$Ns+-fXY>7NrF!_Z-RB&aha(V)@<dn1YSGq1w&6x
zA<A4GS;7=fm0vyjM(7OmJn+Q)>MYODZ+#YiQg7m%?Ztv}dw+MAaxpRLO6BJoFcjOR
zD>Q%Y7l|B+6^cXGVeq{UPg|Hwms12|3(B1h55h(roqeymb=^%x_0{)af;g(%o&NjH
z{fYmJ5+kJ63n9hp;9K{MiS6IImdI+V<xRVD-oAYsy*{$<&PW=aq7`*+#h-NddVFlL
zjmKZqwbhzBy{a4EJ$&YI#*VM6agUv#JI=4uJQ+K)jd+H~pI4lBkNM+SAn&@XtyWZ~
z-nhQ#FZ8=~Q6f9I-;5mw<`*ZHj*AQmN(s1h){GuOE@uc~czq5Tp>#2KA`3&a$k5*a
zvLJ=J>ex{Y!ID*GtV(lkKW<=J1y*8tdHIj5M|3{oK4-WA2LmbAdfc1RuCs4I?>MPS
zN=o{Wk}1)alsUm9X&wL<tIohn2D%L0wJ|(=a+LaBz7%6|#R9~L+Q!Pt)m6)WulPBA
zL3o|24bI>`8}!wtsO^gQt;d)wyYG_+4;kWyLVyqhsVS*mxlOji|AG#_+jYB%w+aOZ
z=u_oC%T_fHdvS8KQ4kTEjD@M{-nX0TK+v{`(aQ7{q*OC$sd6<I{`*hz@-*V7so`^D
zcw(<S`|N8p`J9}b<6M#&W@Z<h)K6$n&%)@EA5(RONpqpvb13nDVD#CwXO9vtA!?SJ
z6=@yllg>N+-H)c5{PC+dZpaL29#;~!4N|#+Y3FXgc+r;O^Om=w(8V1u4f~d0ZLA$?
z5cTN)lC(I1DM1Mf=BU{2_?nZzzG;Adr5^8$J9|Y_hb`xh+#U7h)>c~sE~sxa34s5{
zStG<I&xaUXCc)Ws52qUlxc@<>PjRoXG$}3mMr`X*O$t5L3XwM42<WXc?0WuU{?zc8
z&b8;1gN%2PeknjM?h=}5t+V~E&DtRcO-<R`5GHAH-V;-lRXThXB$YIX92QU@5*p8Q
z%%@N>4C}o2{vC2~iLrVeTN9X4Ak8D7w1IF8$V7wkV!_n~AP4+f`sq<8zQFMIk^A<^
zX+<f$ikVS;h?S1@8;!C+Mkvin@C{#8Fk#esYIV{9uTL#oj&MoL_(T<;tZS*dkhd+<
zqk+6u6>Myq|MBB7pKS{lEJ&tLA~Z{ZS0h8<h^FiBFn48eL_|9&{e=$`2!OT9T@xh2
zW>UzNn^_SI1;j4sjhxNs%bHm~D)3@*`KA|h{5xS2nVoiz9VbbJYzOY_RGJ)T@bwxC
zP_G}H-P`Vc!g9I+EPH(42_%#PQ@o^rLb?Grn_&=s|APS$EJ9?Ndgcszng=&3y`kap
z{g_+HcLN+786g4^Y_vh1F{j3K{yBS@`_uUsjqOH0jne6H=n+*YzD3ikQ4`b<?E-6Y
zGs-kB-|(ejQwxdDRU>kr5L1zQb*|YvVJ!<Ih;E4JA0D#Z8{@tA_3`wswWO_OFZ$L1
z7Bf?;`fl`|g~b#~GNOf6oeKhOK7`TGKUTI_hBtiDv59u*X%k&H3?w<b^N1UX*;7tW
z&|iUz5ZMX~f~wl3vi-@9k=ZrWpHGz+@17F-#A1Tg8o%@o*HvEU9j_buq2|K5a}pUh
z{;-d|dv(ZK)Hio)AjY5OeSH*H+PZd-sjYi^<tnScw3~N&I=0@&=Y5QR-zP1S9jdxm
z>bE?(?&=l8BQfVE&AIbo8T|9m=U+G$IQE)^q$ma@7o8v9GU?(6hO}*?SkAfOqe%l{
zTQfZFI4hsFhjhpNie38#h7^uK6(h5AP8Cv7F<nT;lXbxc#DA2&KI__M>n;LWB$G;X
z2;I9kDz{Ei;(%Zd)lJoIK{1qq!muOr+HmPoRlwz?=Ysm)O$5dhqa=XY?)c#5g=3Fr
zWqcpiZhUQBohul&2qO7S{qvWbvTEgYK#$gE;J}d3(9D2CXRX;jqMYD^^Z#`*F)3O)
z@TC1?4YMKdBL;H_#$2Hav>{-STZ8kVZnC|pw)^G1O`A4lUbzK1Fd||C3&HLuhYW39
zSN%pb5uv7TX)A{8Yl|ZvoQpRuqs<X!E*Te5Ya+cHQ=bB<5twOb(6_G^0LQ>3dA5-K
z{};!~-s`m9u}7O$#h1aEP(!%BA|k&TdF3Ld?$f6Q<1>8q9!ZEIv_H_-9-}eqw{FFl
z(Og7SEChCHAPoDRJ52%u1Lfdmv<#wcU5iyi=jJl{;?fLt#wq@&tCQ3-U0q$f@Z;<*
zG{u6UuqVX7$5G4<_3-eJKgKU*{-CeQe9kD&R0%0$SIBlG1shRQrS9|yf)L+@Pj$jI
zm_#%kk&8=VP0{4ZQDGPU2ndy^hr5p+*}?QQ^JmttIu!~RHVM$jRb8U}&c>ZSjd<gq
zKHA@6?}bR&2+SoZJv5SsQ6BT0_^x~m{O2}^5D5IuXpoaf9w&(<!#C~QzyBScxh~Cj
zkY2EjnFS8xA2*xiIpyBg^Je2muAX!mX)r-MX#ZP;hq)+RiaLKjTbT}>LyDzJ4!?qm
ziVDgCJTdHYx@4dt4B}Lo{IxxGpkSblXzqEGT(?wSwR11LqN&29Lax6$uP-1Z(;EdX
z_c8Sv!92<vhvzXp$>br|fAoZt4!0ho3fR8teq$At4t|bOGLgK-(TWjAGT3k4kXcgW
z+b|@Q+x75NTP_ZH!siyf>Z@gba`{Ytx!_wrft&C@8w9H|<$(`CA;J4%*ASK8SAbZQ
zNA^psr&4gpF-km?aNd}r_uj`kpUQKk{<fr!e+Q-i`T3VAaZ^yLbxO^*;Q)$#l63oa
zBRVY}ICBq1Fk(!`AdiV0fW+<eRs%EmY&N+7>F3TRF%p0;6XIlPGpkoy0SwB4SI~v+
z%6Ud|y>jb2K<Yo}>Lo+6y!L0W<sBBP)dRQ)CV8;FJGoFKQbX49z!UQBhOLz;#$hvp
z-!F%AgQXZ5vMD>5ei9&IDN*TCNB3ysqp!+R4uByS6jan_^ah-U-@bo`M;UuBV{Tvi
zn>}yC;-5GNUH16+w*P~Yw@rdJJ~;AZzVfGvuqWi3TSBa}Pt{Q1dN&Ik6PWBQ7l7m_
zaC1loE4a*_76-V^NgoMtFhp?Cx4m+>c_L*MP$sAO3iDj&B7ZK}i4(It>!-6b^w#}Q
zL_4v5oJn@Y+{1a9kD_v82I^<l9S%TFi=AKp?VC4m;Jtr&hQ_4HO$kE!bP5BLyasjD
zs=H1k%FMBLZl7Wt=KSYr^N-e+)BU7y!eL#vbm6+kvpbwvt?C_@5I*)(VK=q87i05=
z8|^8eLrJ~6<24HdS|(;vcR?Wu^VsFF*Z0i!#o0M`r@cB9)^3ye8eb=y55R4&)OH0l
z4D0;muYu#bb@ydfcOVxc<N^tTtbG1KJJn*WYw6fgPOlrEn$*2XRA$IHlahBm9a~<3
zgGOvpJ8@#hIJZcBt>77v{C%yM|Bf7y**DW5dVzk;oENv+Y%87{W~2tA9Nc|sHIxLP
z%!Y#pJ5d)(kWP!H;~=~^1BL%V&O-*?AB1+q);8_DJ%9X#oBJKZJ4E(@tgi{ItwxA+
zs0`3d&dbxq=MM%*+Zcjjl%|vJKvch!-sno2AlUN9>X@qgIis#QEC`Azjv3x?k?k~{
z(`Z!X$I6G3GUToQ?}kFp38B%Y5rb{rHRzs;dLnNs`1nzoW_ihyGUKQR9k(5dNC*Uu
z<&*&G88y7uINjbN@1g&aEE!pac5yD;%5oWzxv5MCFRUoV)h0!KvnGucC6Ne#Ta+uS
zFud*DvnL1y4<*{}_whic`dgjQ-UtGYFH?fXKyx!<hxeSu^OB4uy7v|+a2RZ0zgMM_
z=>QUj$<UW_q&~MnT{$At=;G*4Mz`??serYgc4=X2nY%*2AQPKJe}jC)TYDGF44tv5
z%(6?!Na^2RT_*8Kj3IJIULy^jZ89%jk~=<ql7}aOWyCsv9^TnvRk;Er|8PDXHM8i2
zDWaXmJeG?d*hq9~xD^HQ`=7W*CWO`KurnmUowOKnD_0CSoDXOS`d<L<mImA7-Q!l|
z{{2Go5ZEN+XV`jW!`-X*t_=t|)SJ68RR;9&qB?<8nZMV;Il(5BvXYKpt|q=Pu+mG$
zeD2<}XXGnF45XlZUn8Pcri>-3M~v(_X-KQE*O3F7jbxVk(UtAur{xqbXXZ#pjgUnv
z%nz<AYqoLtaJz-A_E6ja*eFImG;-R1u-Rxxc2*urD~eFgONqo{#}lzn58F{>kK_V_
zbuik&?Xk%Pllv3%{ipokeh}6`KS4MhsXXoCwu6h1oS}j20B*O{w+le6bg+i0*>F%m
z47tUDdCasddKu4P7s;F>;X;mAA)G0FcCMX>j8Lrg^w>K@9L}7P_i;~+(r91gz$bpB
z73a0w$mp>)8!vz$l~bFc2d>CJZQCtsir6KE;#tCq*%bZ_#dCy~(Kjf|y%EA6GvHXK
zq!vO+qmy}KHd`mRo#@T?eDb4eVlLyCti$(nb35TiL~lt}K^pCS1uLuAPZx|ys1tcb
z!qrfAF%<<Zo!s7{mxdb>>;x7RB~`rBr~GW=D1xJt^73wNUCT`=O5@^iO$B*vqG3b`
zB9H?PgD#}bu-j20rW2Pg;-{R1HEm@~II}54?!vr)d=&9A$6GLa2$(_U=I;nfHJDN&
zcUsui9B@;cAH}Zcc1D24U0wcZM{+}$)|=GS4JJ3Y2j2hiC-a<{$_W3KVu~2<zA!{g
z!a}t^xqf2jVNzj>p7n3H=E@D#*q-FLvQ9;}#LDk{vOU4VaxMY1?CP_HJCn_C1j69^
z>{j6V%Ra}C9_@-S`e~sXPlyS9W7b`m)`Rh|u>`M>R>r+d|GF&dHEOH7W_BMA_Zx7g
zGS%%??|V(Y?7MY09S}DpXH|}2m_Ci0z`3*UuEcM<tfGNF7>%=;!-L}blEo2$C%%u*
zyzkuS?u|FAV=W3UTA#e6u=UQ<&l4}kHmiI;v3~k9pG9NuR~>nDE4#|XTS@c9ZBcZ1
zdVbY<9z~z>>{)ASx?fq%I?|2NHVVu$yL!CtvEea0bw(H&8%xfP+=5h*R3G0J^_$P@
z+YW)OKxeoyn^2Z9(M(imbfrX}o+w<}ZI^?P>uile>Ul&CBdBkg8sE6r-mS<=G@ECZ
zS+`;*v7-Ua^+eJ|u?3hf-VaeVQwgf}?%liZzX)%D*!E27fmlK$7~tLHuz@4;_QMos
zwgrq~rbf-6sh^w`TASC6lndW^>n$@1c$e}NE?g7iYLiuq=6-aZeX)`XGIGVoNiX{<
z2L)1Q6MWPXcAs%_No+$IF%qN-KWay~$MDWAc9ySWODa8eH`+7ONSpdqa#~pQSTZ(E
zKzw&_bRN5pneSVvjbcumXbONQye+AcU<q7v%v%U~@L)6{6$1G1Z9z?p?=3A|p7nVJ
ziQsacP{6aX_mNsjhrtRIq6P>K{YZ=AJG3?L7y?Gak;Fr!FcWJrKahsQ75voD%HQLD
zDo4L7>s%%`P__#_%K?+}>Qy_j%>)LlGW0}p&hL<G=eo&UGtQy$;}z)usJ<zFs=C!k
z$g=dt!hKM!ffc1p9BX-WuaryF>uI~X*o%-GVYvKQBO@aiDG8ARGmw)4X`tlQOLEwj
zO}}=Bf?e?6QBlmv2TaXhb`3do$lkqesh6pcB_xR2b1ke1BKH*b9m3DX%tQi^Wiyi9
zDd9f+EjeJDnYM1xqCB;iM4j`f|JYsIHB0=jf#uayk%QjO!f42zJqHuIpFFqdHDsT;
z?GaFWeeJ5n2RDx_-Fg0ChC}JtHe2Ua-ng~dljK+`L=g!<g&`viu#43nFrW=oq)ayi
ze{-HUZ(~@Pwu0aHBRWmBTeogGZ#|<%uie79B%m5A1wc{mumcAVs(<Weg#e?%0#Quo
z4f80HD}wXA23czKe$zoDsuJ287bn%WC|Xd)b0r7P%7hw{@5=!UU=WSQQM6dpyeG4c
z&$;TQ$Yr+yobJK}Gx4GjU$)zL;?W5%Afp0mA?R8dHjZ79PW2+U30>`41{%wJ!Hyj}
z-d$71h0O+*aRv0DlDe^u-PLRGFyQ27`W+!pJ+?YS`0;~flsG2GkojNLpon2h3hCXF
zAzPXQq%F&%G#1&07+o0pO-U^7r<;{e|F$&xavxI-x0FE?%f98|D;L%Zf6GM3+tF)9
z^a0x*jRJ`AfpQ}w(pLaxq`arALe+Eo(W5^qa{8sqi(aiqLe7x4Hl%6@$B9L{DHrII
z>;o8^G^gnn*(-i%(&T5uGWSMf9UTw-N^;JKK<2C7{=cdw@I(-_X-p-6I@tzr;6fNT
z>jpcqsC$Ir;(*YN)7Mc;i*oP5T9&L(SZg)x#hh4rYs(dxNipJfuZ69GAr6((A6s{L
zt-GDe8~#U-#x;{es(*$jJ9nl@=D-xf9aS<?p@UYJ#!#Sg(JJWfUe9dfJ~NyQcIhO1
zVvBPLC;h&;_=b%~?V!PftEO8WOt3$>sZ)?5MT!&!2v;@Yk5H$x(J3L-Sr!Or4C*g#
zAKR_V9Yn`u$?S|{FqQM6ti1Kjf1eoBzI}aet-oJ=W%%cx{-1W94EwYrX3UyIE3LBw
z6IH%U^K*0ezm)RQipqj`-JDx~9l(Xji+;RX6K;Y1Wa|B8lC;^mxef6Tu1)Ll?~{s$
z>|=(g4oAYw@Sb>72Z(rOw<2L5v3Bj+#LM=vt99=Dg9wRYRFU~4@Y3fPvPLO5t?U3`
zrzCWN+z>i?M_RLnI&ak8dYnaKAO&-oSAHN&<v&{kAY+EiHp@`WjU2oeZKbV&ezD<y
zJ8qrvr&=EQiI6);;57%%UWb1bW>1(X@M6dZ5ht#~9>&D8S4S=SXEGVe9&rg`Q_=BC
z_?6GsoMbivJnwc`+?o@mVONGcTS>)6L$>ajqUDyg=0W#-{vP2Uv+c(AU48)WWLmBc
z`};>0(@~<wnLy)(SeX66yUo>ByH_i6TyDsZxvi`?6jKIYV#M&p^w$!usvP%NJbL-b
zO7Ow-yKbqQS}M6uJ`>kRQrc*`8S@s;pb*qmz_3Z&4Lyv?-X@05Sy$Tx)B|2zW|5v-
zTf1yq`~J=8qbL<n01slkqc}KdggOE>vw&03aeF@+({mcf9aHZEIeXt#y_l1ZX;G+l
z2xYodr8#RWnuui%mo8%OK)4eo7?Sj9I3=A|mcm2vsw^6h%trn&=+JGMUscQLD;Y^t
zMPiWV<P)fPupZ?!22F&gqGuCwkr2-SMyz-2)X96<CDbQ7$2npi^K-JGN0clq77y11
zXv5PC)$Hj`B#dq3m=uGPMOF4h!|ZwFw@U3Ph7tTj!Z8A(aVs*|DJ%AJ<?*#RN-AK~
z<#I!vD%pgR&n`9{Km>mF)z#mPWL~|jW)gbE9ZI<e$rXTNthLA@X)`vbzXW{P0IAeS
zAtUy+UbK;0E_Wp8DFa#`KaV(2@>B29<If{Iy{>iP{?KpVZT-$~>yE7Zr`4|4wy~yK
z=a;who9W*wopm81OyunjR~K+NB9i+*p+EX}bH5|ZNIrWzUEPpok2f+*yE!H9+$Sjo
zUd%LWBmtHv)9GS3gm*ZNIE}G@<p3A~$|<3%J^uS=;lfSnlqek_Z0imN2$Y|(xIYs!
zC6As0!~7F|dM|J9KjKo3w8ID)a4TK)Dbq!6kkP0VFvqZ?x2vlGJMFmJXrlc9z(W{x
zAyBC#g1Jt2EZ`2uy~W(D+wbmdl|XYQmKI&y%3{pQY9(SM?qrGfMI<2J727G)pEOOj
zJ1_=?zJJKLjj?%Rm+DkzwHRy}VnZ$jn%<~KFGaY3ySK;m^Nq!o0G2F84|7E-_c=WN
zHxtbPMD&4PHQRS*@?PXHc;|9s*@saFJwEfX0IC=`oyNzaSvtOFY#3}QB`3v=jQkQx
zi!MWQNr5$wls&E<5Uti(+3eSj!Dd5j-rV?jc3U#N;J_0%o#>Ng!$1<C+bU-!TmfKT
z5GP7rWYEKC-n)1Zs7<H?1jotqE$$h#q3d?LtY#h!gOHj@PoMAaWMZLvHuF$3?hib{
zX%I!5Jlr8Uc(^Z4`Tma}1d(_Af<W+0@;7)mE`7T$17T0KzJ>edUAOVSM@B+R11ECB
z0Ulm>t7-GDrJd%FtgZWeoGP_R`MHB&0EYd#g#UHJtiZlqE70|Q5fKu8_~gm1Ba0u~
zYqdAfQg0a+w)<6!h?ku*Ue0cJuZz8&{@@L_g6<88gc1N2qzn{s8~ipNo>WycE+VQG
z!N9LvsR^(x|FT3XW<)4gN$R2ZWvuZg{u-{zPK3n~c^$y<OVtk6b)%~9V)DhPQ-C6!
z7cOj$+Y{6qz^{OBE?O+!x`l;BIU4bT2#-UbS6bj^lya6w0$oV*F1r4W@D!iBnqe3K
zOAzdw6(b^H#GnvhT<vh+R4M@sV0>~d9)bntnu<>jPWfX%i=a1f;6S*=;md>AEYtyC
zIVkx#KtXV6>(dJb;mOO(LvoUBdiNQMd@;kXtNUzBAHZa8EqQI!jlOe}mbkbC(nq2J
z{Qr}?XtTxcPJ8@3e;@z5!;X)1%MyHxln-8xszg2?2MX8R_0y3cwen2}yyV`{)YR-W
zJ6gXMlo7Ra;NA4(WJP8rnRVA&%&QeMKbRCF1c=Nx&Wxv|<!1=v`pts3E7ow@I-2^_
zVW^NBac(IqK=W`@&K-B=_=$e~6oT(Qpebn@q+v7Hy|;7`ZQCi(zwR7~@Jf`+o4j9e
z3T>)!@!*!hm@V>d%54&#`tRFw%sSdK=G4iP`faA7TY6HF7fl7|*ycYDDVaHssQm8j
zjVv!%CxKz5^+uf_Iu9fZESc=_CgbPQ(gjTGVMla8o1Qalm3_e;<rKmtfPNi;kL`>Z
z?L{1mAa+E$!J^8dC)1cV3DrUFg@~NcTpYaoiReSX0=7kYRh5aw1nd8=1-OIc{`tna
z;3vr_8~~66<U{Ixz}&kz(pJNJu3(<L>33uLGs94~Rd8Lj-&){1{l@0Km8l0XYGhRX
z(|ZQurwY;a-@0{Zk+7HKJ^1EaMxO&dWO;tALT6I_V8%hcrRT&E1YhV-4Tj`e@lHm{
z2p*5*Ozt3r{L-}0nX!ZBHbRTUAVGU=!_Yrt#8`2V_5%m*HH^=E@F2Tlf5~Xug`fPg
zd%dXDv3YO!!S4~-`YtX{<^k7G0X6|~qqA$abmXq^G^?tX`?XsRo~EtJg+)80Z}<^J
zFWLQqWTG}}Wkdo=N5(Arwmqx8x$<iGd7!mF10SGaLHjvh-EdBuFLuNjE}D~HFW_xh
zE-NESj<f4|^BmATG*o6P-LjrO9qzEQLx)Jk=_WsMNu++G_PM6~y@OH`h5-pprEr9)
z;5HJfo&m*#3fu86^*0?G8|67p5!Fl}j{;}?v(6E#5ly=4gfTVP=8Sh5rGX5%K*vv8
zDP<M-AesXRgJ*+7;^IoXSX%S+%y+4Ar^Syd-3A~CarGsashR8fcbTdo!G(5q!NXi9
z>G8~metko&1N8SMZf*Mx9X7A!>b^UTL2|Zjjv$Qe>#%>#>eVig+Po2gxcCNo^*4T)
z=-i8_tx+6d__H++Q6{$KV{9_KSR&&f;1uZAlDRhMRQz)umNcSi5zaLBNkGP21HxW9
z?;05nf%J3F%#(VQ@G$-;KZdcmGCD0XnKR2L`nhixyKO_qk8dfOd7fb*?QTL}Lq;b4
zD2ff;WIt~7=xtACt>CHirj_rr1DA!)zZg<Gklx_N;t;D^x;n_WbLlUJ3t7NpHZO%-
zR(|UNI!N^-^uGWAv~U6KlqMEy@jqjAK^eL2;$J*gC=;oc!5KE|J1rTRe5l3tN!bQ|
z^&#bRhp`A@)L7C2n@c(ZoeD^ng64~1VOQQz5*8E|Dlv7`y1%u%e-ix?w4S7Pkhg{k
zh_4Ivca95}GpI8YDPY?Bf0^xgG_6+VtIMngicYB+b8>S<>Ba%diG3S+8l?<a@Y2vv
zaVQr!^Hu@Ci4{eR0xBv#gA=I?g<cbzJ=$#KCY{TM9L-wc<u#PvL4p84Q4)OHK_kGT
zN)QIg5dsXHB&9V4m4O6qkoNncOJ*r#htzYTFUPBeh(x`8smp7f?wK*vMzHYlb2GFO
z+`NzoP-=_8gjx(~sWhDlTWsAIE*y2Hf&LT$Vi%E1dPMn6IqP9uZr%4c)t4v%0Gy7*
z>J1n;Q0BHsIm$ga`%`Th@dm7YNi0w16UkT#QCuZ_15lEL`)&6;pe=X|@yUa?*u@SL
zYYCyBK&hf7pq(_IN9BeVBZ!jA<@Ln{UVM8#rIhCkC@cy~petk&LDT%0m`Yl_x#Ju|
z32|6|x>I1{)QfxnocMa78y|$-|1j1tqMLSeDUif7L-|GZi={76_IN9f4feYTgmHQa
z(#6ia(PI*7T9|PaYzSOWv9+yG@30f~U40v2Gi~*4{E!1%aX7P>aXvRrur(?l7t`?c
z@7}yjMF==9+N>ey(5@xlzpqN)ft(iDrAy3_N`1~J{?Kggd}Ik!h3s`19|(>h+)1x%
zx{66JUhHkL_O;V_ZIiz36o@uiIB9iXehE>xlCDS=qA-qxu5c_uR8q+e(~h3>?Z>U#
zd3lFA_SV|W|AJQU=!@oyvQLcX7JK%OU&e=J5_KfN8^dn}7sR?9`bHj!kPAt%8OQOW
z@c;xoq3^u7Agl6YDNwQmlnChu4$oBEh6Ssw(ACIQFQ^rWQEJlycqICN<&2T{Y$oWL
zgc>5t?DZb-tt+$HMK%b?L<zR*fJV0e>$f*lM}9kk<bo3~a_2rxU5m~~rZ5c(sU3UG
zZPI7ks&J)wRdEwe%3zfQzbF<4Mj}4J(9^vjg5hPvXUfFQIrdBEf>pz3#BgYXEl6zr
z(4j-yZcIVj;qdz)=j?LN#pgHP%Xk$<wQfsj_4JD%;_a%O4u4xa`}&GzQ0eqZ=sb~C
z@H{%92o-HN2P}PUhsn2<`NNa!;M(L7!AJrnuptMGvW2SyrP;T^KJ)(l6dqNQ`vvde
zn;V7pN`I7o|HHDNLyP&|b{__#@mV&TK8O7qaO%`QSDN2_^bJ;Fbkv6~Et)#qngWh=
zaDX>dxWs-a1iVl2-QD)A|EB{HW54+otC}=<Hm4<vgdqUwh}|ic0-*Jzvf$!^hokZA
z7&fJ0(h=3>X?2z)0WnLf=(MKZ6?;?OAf0OkrdwoRg)C^hjM2=dZ~BpALvE20=EJ9>
zV2X#}=EM~iLwXgoPrkOJ$mXO>P{it?RQ~t^hqP7Sx*T-EKd}`}J3n`Gas$PIVtM&5
z(2c4sP3c5<T+FuzsZIP<_fOkeK&Kz|^(iSSN{BaQej0oc$xaV?d5?37i(31w*?8OD
z0%+9MnH^P+d}v3DrQhP{gl<&Aj>K|e571d;<6p>k22Tk<VS)bYci3``f+s?>LImFI
zLyot|;UKWVsrh}YR<D+gh3-dCPYyONzeCe*zlLonvOJOf7oaR9@;-C#?r34QGn`(o
zbK3d=FUvrLh1+0d@Jw~A)(!%@VHwDQ&4I4C>ck$3-YI9lo0tzCTtIJTdNmQyg#3<F
znUJIvXT<LXx#CEnFoG~K9{&(ZxPtM4&YzziHTUh(z4r(BNw|1xxl3saDIL#UzHF42
z2LLWJ>8K&MUA}>{oRjh8)umnd>n;lxjGWL4FFSdhEkaYNaKyU>(XG#k?VTod$E?J4
zomiAq)+kLrSLf6Ty+>F6j_X^JIk=%bbFa_ad4xkz7%avbY0R8CbM*dx&0ybjlJ+uL
zlICzj04Y7|&Tji_O_wMJ2S}nCePNk_UcP=eU%<z;Si7Ys25I5x4txHc_y$~FA{(Q_
zAVs(W(;wEr+i+AkrDdmqT;fOr$3TCss(=fwqjqgxRgBpmbq{rTGn+#Zs|F>5MwmGm
zbq^mUj!L*}Ne;lPHht3=?vD_!Zxt28kefhL37$_x0rIaE&Dx5M5yE+B+!V<Kpwd%T
znGc7{#xy{%pZ)?ebfdt&s9Wxf%nea`udRy~qo&zC?DCLtF+-syP^fL6v3t9Y>Fww{
zXIDCm@80&9RRqJWwT^C|)cx7`FqQ=POF-tQR)5bCQCVpLxOo4C+QLIq{O{;*W=8+}
z<XRhzknav#&OfSce6nWfoP(e><^{8cAV7=S5V#PDD15DU^AT5`=W@~jN7-6YKluGS
zmN!EK%h8?x<x6u$Qv}UD%Zimwmc59W=GTI`$LJO~kqB>Qnu^1(fnfm$?{%Gr{?5j6
z;SwZ#thNpNvW<){CG=J)PF>L%W*<|_2!{tiTAb68^9*5#vt|9>y;DQ;q5?I{PE9yj
z{+*F1B}-0rGy(?$G^H|2VrF<KF&F=-en!P5yka5Jl@Xk|B9Re;-qxxj_AgT;@(T(A
zDNqi2kRnp7v%k{EYQa6TcB_hN7tEXYtkm`K!-v~*PO&5VI{g!}>=I;dXu8>`ee+*r
zNy}nx_#P7U`sAV98zHNliD=t-dH($tt4zMzk6-U&<uv5xLJw-T2lEsEu3~uc!3`59
zoSc+v_4eS03*`KHe%N#H;Fz=?i)$G{OxZ^LSy^;T-EP8>v-BNwthV#!eTd%gqv7IQ
ze&plJugy-3J1^-!U`L{ohT`G|lJ<Ia&ciIMH;&u1xX($%sjBwp*n;>@CkVMYQZKxB
zEtc#{Z${58dP<pDiVWtE<(fX^E=XX<-_yyrVT8)h`nSJ>5#6GY_k4RR&TI+Qe=_{d
zM$udX$OgmIiW-)xjydtE&g)DmohjDjXccU2c#J|wN;1^!b@hp-#+i5#E=B7}Nj=Mh
zOHYwj_gb$Ts?*=J3rjCY<%nkPM)+9h{av9nz<`jO)*`;)kp|PcO58WbFdGhwwuOCw
z<v|!5Q9l#TO88G!MAo?F8ed+kjhI~2(?oMj>07{+_S=uWT{fq>u|UTFXM$HR$-9|&
zS}}bNK3XlXY<jC?uo&@5(a_SQ?5=E0wMt|IU2Q5JJJy3DPb?_y<5(>yqR%Ph+gyXT
zG$yF09rdUv(y)y`dHmS9&&7q5j~qu$0GTldv6Yk3(j@kW6JCMu8uQ<!<u9sLfCVT*
zh2@YV8IX=UE|o<)c3U6r2SGIWFNGX%SJ%|^LffGTnZ|F6$VSHqbKswks~B%f;Mudi
z#rOn~3mHKEs&w>XGvcBWyL?dTgzn)2KC;xeQ>Q~+yB6HuC?Pl)D^>9x(?k%MU-IUP
zCLEv5lqqc>O&ambw&|Fz&+bYh5Hqp!V|*5nycbm`3H7iyT!%3bFwj+i5zyER_MWFj
zKvtX%-qwN<J2IMynJ}FF;srW0Cvk_EVJUWH<`d(+9h&|+1e|7IY^*Z#@*-0cJ-jLJ
zmy^sOiZMopenmZ<3C5!$uN}!w@|!mu*u@ltKftPmM3*u3+}m=CiB1F2xXdFRwECN&
zOmIbR1lw0YRnVl|Yx}FH)z9l&+06aeBDl|n<Zb$WTWPl4ENuil`d&YNE*k@NBLDC&
zQ>ILTF1Zc$V(R`tr_<4B$OR$}v78`24iLG5#Z+yi+s#aYUAHi8FF|*cHkINzlx$A3
zv(CAs<QEs8+WTNGkn75oUt7iq6~nO(DuHQ~b-S}3BLvqj4akU_dMAFXK~FvJTrgr|
zo}zB|d+On_-+~l0+or!B<?$!wRFm*mj7t1<=y^T6v*U+Yj|I}o=%7^OJ7C<4oT=ls
zIyFZr7CrnGb%^ts$@wvB=_c^0P(q*z<~X}P`F+*#wMZTunmWOI#BNz>elYLUPtArV
zEMdW1`2e&78(B`EyL8K}{9#Q;0o?#92LbSc_>s_N(evb3h!11)5?BAgFCC!ln3J$x
zM4}*H^z^J?TY5Iu)v3a3d9CKJao5m(x#)6U-N^(0@KnXOhAB((RS_t3Fbp6Yrw@-9
z5`inIGqfztZR*T2O1=#(4j>%i4)Xr;L}S5g^4ZGV+#pv!U!f83HJrL-t&;AH*a_*(
z$lOzAxmxpL@n^Nt^6R`kPO+lPR`-@)|L#3~+B9gS-R1efq(}+<j0SK<Uen}Cu{#_7
zB}5%kqu64IW!D*cldY&+-?zV5R`53NH{*EzjA#YT{vEf>{a5Zyw8bxePBk)%7;Pl|
z=<oW(-(B>R&|8r4Kwwn))yU&*0nr4>M6<-kmb3$$^_uAJH`4PU>BP!dK{AC@A^Z4d
z_-W2Dv=F{hBvGXOgvMpM_R$eS+G+KN)_)L>g$r-9Un)r9k_`*ur9W6VlyRm20odZX
znA|Xy`l{RDSQ3Li!tMg9Uop?D`#DJ3OX4#m>WCMEAlaU$N89g37bMjbfD+h<SYBzz
zE6~Ptm}EZDuhWN&N`i1)KqTRu15TdoR^hc4*Q?O^$OPxinS+1p#ME!iR8@-&qU0Nr
zllOR+6vgq2-wa@RBQvx>zhVB3nVBLFlbaO~g$Ww!V%sGTfc8h$eq`iUcqQCe%{VxC
z@2iUAt~hj>R5Y`Dr5op`aMP02+iP!7dSTT}6YEjd<sN$?vURLCBsYVHB-TZ`GEN_n
z`{LeV0}`p!NWpNp9TSF-LWjcpw(R{N@!rcgD%_bf|5mH$*`vp%T^4)qcRktigmx$Y
zpA{iaWlO;eIrBxC|338qBz-{0Kf4jMDQMOYmaU@v+HaV^0gM3z9pq}2_8cAJK}RPS
zjgTNX3ydku)|3e_a#9QZ#hKumG=rjAENV<n(fN6cA+m-Ows4Z*C=Qm8EY{1k#B~YD
zE6Elh23GnUq3~#q&aG?k?G&PjiiMwp4*~<Q82Az6q;PR>VRqn9kAwCZIH}I3KT@l6
zMz;;nuAr&Df)`wQY|>Nwyqj$W_7dNQme%m<S0MQKHDndw)*3Zxlq5iiUXBH1xlpBR
z6vcnLFij$4K{cpl!Hv%morFY1oW~-%+@(q65O*gqtutmdj9NE&`{uscq~F2<4UHM+
zOn}7yW%~&mOoh2V*|~+A4u>mEvf`ND&*$hE{ErEBSKAnI=eN-)wxw?kjw;S-&Vv++
zYHB%g{5MikstVqVk5ljo(d@%I1FHjbZ<rP}W$ILMCb40gs;Z{&`L+l5{;=#CkbBYO
zGmW`$gQobQu%k`S?XCjwE=LsKPsB5j9B`@V%7Wo6NJitI2`5WTGtyn-)w*Y0GB!`U
z=iTd6+yUzUuBrdwM2fKq5&L)SVzM^7)Z99HbYsV{ANn5Aeh<b$FP)$<XZSF5$RkXc
zQa}ZGV#7IB&X-sA8$F2vGVFLJ=H`mmkBy^P1|2NZcCN*Kbk_%nAU39rURzKQ*hUd9
z2&4ws4OSzeRi>zxI~{)d;6X|6$R($&so$4kai5Fs5swTa)(w;z+%e^3M;MTd2gm-!
zZ1tFpKRTNHvCOhklr6JP=Ge~awRhFCb`yKCBgOfL5DeZ*u17+g?$!_8xUWA&98lp6
zon7<SoF0J9kqezNq+w;lYro}V!wp}5xENZtcy{h~)k$w3f;r;=`fJHYR2wbct#4j9
z5se>^9Q*LWA<G+?c~2(Y>XjekvNQfxoUzNaOX)Kp#oUI+CBBcfd|P+Q<eslYQ@H;4
z?h{aL`Rhh*S>pM<S|hR_930Y?1YINj6MK8k956qn#naC##ON)_9TY6;)OVCsV$c^)
zg@d6%!7uJONl3)v*0S$+k+PB}PGJgPr+dG@$n(WV4S$GkXajFUf<I&hb0dmHYL|n<
zEpN=rt^b<h{NY}>?_29HR_jO<qH{}yq<TW|BmE{ylSb_4E-uyGlq{~z(%>J~F{&5|
zF5zVCQ#4}i#u`Ss2@fF&BN$DD@|45W#mx-^Y;!!Cm@<Xtq~M~kN^m^xLb<@m%xH9(
z62Ht={6*4vVMEQ^7k}xs*?M%URhY*!d<`PS7bb|$!{KJ+7BS(Z;b&{si?HXNVrTPF
zaHUI1spLB{;M88vZv!5ojz~_JXORL$xyIWs7`eVfYS5w6iFK-Ivl?`h-I|6fOD2`i
z*A|V;jEzHD><fF+8*f5vY!}*cQs#u01~R#Q=g!f$v8~59q!q3wev8j3Tr6FwfDgXD
z2Il7b!(zS-nxHJBwvunI{(k+z73ezgI1{Cf?g$^Q3%rhku2%h|L*sET+uNtFi~3H#
zA#kUH0tF;7r$sFyUgLgAfyb$!;?r7^4n5(zC>w&2o@2rbf*LBnbb_B2wgV9L9Y>^Z
z=46glpdlVCA4SL*<t|aH`8Svj(I3Oy(xPk!*@o!YJ!y)z%bmgs-!QXREodS1!#*q<
zEXEUt);S%=8%L1k5u3Ra07FCo2f8H(BLozsZsooQ+5ggiOuh0g!rK3qQ|&nK@qY?Z
z823{(2fddr<u|H2G`0q3*mURJu#<Yl-TpuM)-Lp+*`9mX`s-LUCQ^awbfl5}L9n*r
z%AeQ-;6)v}UL<IN+3F*gwK_(Q6YAI03EeC$JAvRUNS?9OM~kC)F+|~b_R}2c;dO0R
zP{EdqmYvmOApW>f@4yqnjw-9FssaePQ2H8X>V==S;Ao^sHotaK%_X|^KVZb8^Z$4R
z7hJh!$G<e2i3eq9n};kFxNy{D0_4QSb$!0VPdu;-wt{nx4?Xnt?kolKvaueTlWmH6
z=xw%rTDn>-!~RCE?){3K?;zBX@N!`fL4ajwJpzG3&*!Z+;<FQfJD0skxWPuKg@|LK
z-LjD7N)KN2y?$8W1Ut>_m%^wL*G^W#x>IKK@U#Jfi%V#*6c&6sk%`B&Y1xw0;KW{s
zY|sL6yo#4RV|M>#@9$c*8)KrqrB@lz1l?+L2VXg&KmSujHtqF+Q}GtNwQ*Leb<e$M
z*|^)-@VplJ9X0GdOa~)D1g*sYmrr@Ws&M<jx{k-#GBf8^3?TP0R(1TShy|yA$KF#Y
z+NIy}@nkC!QKp~m@#xZrSU=4>t`%<`Tf$STDhE|u(Kj0sJFs!xo^6TOJ&ei|&mCe*
zBj*na4{wcVosw{#_9Q2J4ttqN$YsTIHta{pit(W}Uk6L16B!k9N55H;xSfg)vP9en
zp;fidjPzvEw**-J^_MDOFI`DeiqAIcJzP%Ht}W{-d|SU+1CJK1<qJr!E^69AYpQ#Y
zV^pc-Hfh>*@x7Uvwxq1G+g=8_&TC*H%2r}9HPno2LtT|9*eIGD5{G`c`TIv<sMx5D
zYdNjH+%+CD<W2NilxrJ-oIi(ahIgo@Zv8=(C@dL47)4<P6|1CQ1upEvtLRnOEi^sF
z|F_<nPo18pg@65Bc4F#Ktug&3F4|`5+~?)7SIRYQu3R-*`uf=EtO1d8YerV@9diH)
zdX3SmD$kWm?xme{^f~Y@^~KV^JHEMEppiC7$*xjk_qa|Czsk*)H*x4ZG$7#nYF*3S
zcCly^=DR<ty;WA@_n~zT@uM=*iv&hWd9#2Oefqc>1+;8!#-(bUT9K1ws-u>hdY|fn
zmY%g5^X8@!xzijtrf6g6G{u{SI?hJ-ufgk^H7?EU`i}{Lb3=NrvCZnoTm=aWB*?WS
z_eNW}GHOdc#nbow-wa7)Dk27<pZ(mw?Np!-F#<0daN*K`(x48NzLpB-hMoADN}MAU
zbI&n_Wo2)V#|@;!LcGQeOcG2tu)ueSS%jS-Fq<M1r0iwkZ^+1lp7(M4q0rH|GkX6q
zVavE@$7##hs*vY-YODxAci4^*$S)-WhoYWu!*91ao)urb>{^af*U8F_%slS>Z)8o!
zLuc8AYo<rNiDk6i&YcSRQ-%*k&A!dfE%9CFF<GLLMe`h-V?BBfmcoF;Cd7)yFaFZ3
z@O?ShF43Fk1HZYoGjP{xx*_H%Y8o_RNNo{nU7c*E)7o@T#(Ot|vGyG(S<!0>@9O#H
zBhe2PSVu&sK$R|~AyT4O`@4fvZhu_dyAhD6Ds&__+{(Y-)D<r$nUM0fy=3fo=H1-f
zZQ?O!Zd4KaOZa1H>BB9HQEnL(ttK_Jv%*N*VUxJXU^*`S?fTKnt#>0voPakayQJ<l
zW5q|t-?1HvWP+IC7>Fc@poT4(kDLweD|E&W$sCQ(*UzX8FKIG*s*+O9E9>g0!8=QG
zp4`qViA!(vzpy3|;Yx>#jfPi^4=TRD6GcX!)!*JL&f8J5G2!89;{$rL^cn6Zwi(D9
z-k=b6v;=?5bL4A}$&wo1rg%OW3wIi+2gA?4C<MMr>Ex567%_T4blDOD2Eoh4$t&~b
z@p~}*L*|k?=9{j}ZrHp8K(WMsWBeke$8mYb5?Y@!V^tnF(@IC@?&VViEdsFHocEuJ
z+eRd^5;G5pDq0;feFx<)Dy9UYmHzkOky)TLQA+iY@Mt!VN4fQlxc=(qf0MfJ-SO;e
ziC+EN$ItwnEl`vX9Uqn+72DJ%=<td084unsyGDU!&!HPopqi@qxTh+^UtNgHZBSl}
zZB*2FtBPVm6rAr)Zv<zA$hM@$`)DS%LpkSu*1bqfV`b65EAR*qIU@&;-(jjf0$|gH
zs_4hVCF`X9;zwN$zB$@RCO!<y>NiBcq$vfIgn2Hij=HeO#%^?J*^P19O{h3we)2EB
z@wDM-Q*ltRr8#;z5=e?YfHSsNln>oDPqBi$Iz<T&kzsSlO1PScXG6fi_3K773Ro)l
zOTt3iA;n8R4q19_7R{^ZRHFQU4dz%BXFo-Ra4nQOJuUmbRGVX@w1T?=_e#~yx)7d%
z_|rdxPc4TOkTJK^?|s^D7w0G>-;j|dMTuzzUUE^&?35R0=ST;R<PyvvcgEM-R%Xi$
z7rPI{Pr@`$Fv3-f?5bC7h`%%U%N(a$Yum}6N7%&NMOv6w-VuKL;&+W9SjWq&j|@sY
z6iYb+nlXRB?!Ohi#$=HjR_0#1D3o)A-$F;t?T0;<OybR8rV?6Od^vI1ppH`xNIOqu
z2(7ux)77Z(z`tZ2P_p4>r53+3HFBSKZqIKW)I_OqKJYE|gJs*nC|Kf;#xh!m14<#M
zxz@b2De%1Lr_d|=d8SW~XcLs2q<q)xoi~R{q)9<j_Y#eq_7hp|24-fh6nv}RjH`e0
ztOiX+)!^esZMNdyxcfSoAWTxsuDyNh+^1pU^ywK+#xH9$FZ6^V{N?HQ><YOXj(bL?
z8Q4BBY|+$4#%PP@e*XS6>)*9wuQ;h}|8L*tZV?eF$^1W(6iysK3<^I{gOYRne+)Y%
z_=70xsDobBhB`x<;I@N0<hFf>XyH=Uu+AYV6*{|UthZizcj46{aR}0$D_bszB;AL_
zDj~=m9stDLbsfDb^~{nVy}U%{UcK5hE!)WZ-+4o``f=0`3HW!^(OzQ?RhC4-$G(F*
z)SLAC<;?plSFBjIwo}_Z8Ma0jX4`_tG2RxFi%12QGH4vrZ~T-VWX4Ep{j_QO-k7=Q
zym=JgZ=?u6X>sJ<0d=XbDNCnXk)&2qaS`?a*rmpp*p1f~Llkq`7(rJ^tG|jjE0RTZ
zhtBITVw5L>Q;{hu=a|^HSod8pvLQa94Mt`wZYhxMItwRS>+`KXLTHs-=UORoL#+$g
zh&W@2We5{>hFJFEI?(6u%3nle->n;D)!pd0LuiI13Li5Z9N0}Dj*^Nu&k7^4g}?p%
z%V)rM7eo(;yFV}gmWT<34%G3@Y+iIVag`&64h{Iber4vbZrH))r~O(2VT9Og1bP})
zJ#0{2j*7%{^HGRjlR*KfjN4EAoqnj#fGw@IZ#J0n#%xAJ1a~1vV9Kw-oZ3Qz1k5z}
zKu1;fw5-Q2hv2ZqPg?1Q4$LpRe^CzVDW}3MGl!U8<_Gc*bm}<mJ3CrOm~TFlv?$j4
z>Z~*O-;N+6fq%%<?%Q{_<r%L*2ZpIQn|g=;FTfb^isPw+SKob$&0Za;rYc8VmjpHq
zUl_e$f9~ZQY45&%(ZkON_|HG#qwZ9zOjM3eER{d{C#(BhL(x}-!GydkD{nutuNDzf
zz_A5%5KE&rTfOen)m@M#JQ0OdTw;~y5TI74L{%Y43a(?l`Wf+JU<wNY3U#@#%UgM3
zn+1awyEDCh+NnuvOt!cxZQ%$6TcyAF)8~>Sp`mmY@te3Q(Wb~$XaNF17Oe&hwP#uZ
zrx%<ZE^jgrr(Ds}XI@w7OfF=$&y{skW1rl8KDc$$#^n_ipKE698~m$k5pgVc7h%#k
zm07r}VP(flr=W?G;{5v2eIz!NE~#I%mwASEu`X{j=iW3+Y)g*An5;!ZRA#Qpp4&G*
zB7>Vk2E`tTa4`HbQRltaAm%1;Amevq2;$cli`9#At=Y&*Ur>%GRfi^on?_}p)%bK+
z(!zAl(9UY%NSOa5&_7#vhm}?n-`jovL-)u1)R{ozd53vyArKuMt=8(GlSn6aQmB84
z#Gw>{v0xqA+_P=hOZwG0b2RrK<J|>PF%N0o*W0`kNYC`h&?k0eWDvQMUdXs@VlrAP
z2e~|c-UXmm23QfIj(%ckev%@{9i#a?hqlGBJ{Xoo27`keQDN5K95>hC-U0Oh?jMTQ
zWV;`xE3Y<izmnUP`_!7ne){DEyjJ);tuFWx^~%f%W%vbxX$Q=7KhilOq9F^ssHf_l
zmX|OK6LQgB3RZ@9IaKwl3mTlfy?+H-MVZ_vvIF{K8SEjJ8yFe{4yf~1`rcyLS~_j2
zcz9*{fmjDw!a_RjHHuCojYXop<}X~>h)M+&X1x{kGKC6o{7?=RhyVn>Uft)i@e8Qu
zxZW0}wOqc)#lVp)6qf;!;@F{+A7tV9q~CUCTMzEkn44569L^&~b%}-8J@ZiR@qMdd
zY$#4d)JYwlTg}Ml^~Q%*H6jp#Uw>P~Q+pGCHX$K;_uqTo!b;)k93XKqjHMGynMoZ$
zG$r2$Rl$n6aSb~fKP>I}51o?!E{9p>UrifQUcUE3C@32FloBUpkA{@O3PjeAzDiuu
zitb4>;-=iJe*Mo?a`#NO+!iVry4>k^dc2{A<0RANhfO(5x@=@ci4z#-6z^UH0=)9r
z+|!EuuvxzjT2z@kU+uBo-c?W6;Z14WB$cNFe5y0Y)wvC--+gSs{7bJ}o{b=g-_$j#
z|9B?ZNGPB9n(8V)d$!vL{w=m^i9S9sSPAS{N?eSya_z{#sE!>=E}nVkb$hP6#@>A+
z$7`dgmbo7xXO~}_C3Q<}tw7KGh^6~yH_f?sb1U^fauUZw7MltC2ds5{a(IM-=fjN>
z7f<NFfT`!_jyzjq*088wdzO0HZuOI=ro~OE{*hkbHOu_itF!6ZLzI7Iu7B}tv1wq3
zhD-O|FGyS}Hj1~Gd5>bm$jl7Q9T1ux1QAQ^!dhnP^7x4pQ~phBZ!%iByVu`4DXqQh
zX1A+I=v`93<=BF~jqHP8TUU1tFY&jTS`%2Y;nkw*rk^Yu1rGH}Ju%Py^`gtB^STVu
zfz_3(gSG}qhA!1kI!gcz7J7?~R71mFRJM04xllseQcqSQ<t!J%_1XQl8PO0E5P;`L
zbl);+fK6hxxCa@{lK6CV0%<PKN{4rqsvPSU+gfHMp*`z@8HKX{!?q|GJegv{;i^1h
zyv$Mp5RYnP;5M>&w5@xO!}|s>b@AGYz^+=GsrBbxsT>QRfVl%?30b>WVVXMY*!+P}
zF`7T7WUL3=l?+j=&lJjfPktvN7Ik&k<j|JN6N4)4Cx=e~S2%GWkS;RA1J-9JxHKzZ
z+ue$LdG0HFkoI@ZGuM0;4n3w6{P-UAdmDixkNa(>eyg6YG!fdHaW5jS$6yH093TuU
z;*>>GZg$bXNFyNA>}2wBSa^8gu>}$@OeBKMz?LgG{!cs9U}$p1{PSmf1b!3LfwG$)
zPr<Tsxn{IU^H;B4iB=BPp~!|%9j-W)aTiY0iH{VMIr5gw;n>>yRsg}js09Lv0+98l
zG2$|d?<Nh8crehsi|&y`mKF6mm^bpz-1vGMDvJ3*B4vvvH96DbyBfNlgN|xcnC~Q>
zoj5G$Q56!9RdiP=KueSkba)T0FNg*+_Hs_;QUn3Sw4(qG_|@~DPF0yu-x>gyf_Z`z
zuUL3+Cb*rjDnO%{O=MI7W%jFIuTU6aeo{?v3bkBQ{bx+Lot@p;?`iT4gtz2mr~gMs
z<ukt_=k;rI_l4se=bA3JnBz9Y_2!m{eq&;(|DdY}Vda4LKjc^aYH7V+vqM~Ge*%R3
zr`*^uv(M$&E<N3~f@Lff5T?*39F~%uO{*s1cW9W!W<g{xn`M!+C)TgX&F!?bP#9az
z_i5>a;5+1*`@e@x$m4T`)AoGhjv9z)%PmNeV{vC7XdSU~<p7f{!%_C38*73gmUJ?1
zAD=a3Lc1Nbp=9Fb61xO+l;V}3YjzhC-Cg_Mi+C~n^ThgFSNyHNA9yvz-*LQOGoR|i
z?LljR8RB8s*1bs+g1E_W(^qwvkA<xl2Wxovhi}y-3rG@z235>iro^DO_Sq|SO6u=D
z3)ml19XhG|y!!OKZVLyQ9kOh_J7Q$#78dPX&4z5FJ_TVbKv0Z}A(g<<0Kz}mF|OFh
zWPI)2%|=Wicj|!~aL~i+r%#+nLS-9_l!yL{<mAv6ggiYhr-F+GTckrwfuevf?6f5W
z1%<p=0SGam<}6L`w9m@V_e#Suq-P@0YTuy)4w5aQR&`COFFFkLv?Z)FEAa2BQ`^dj
zBhh0+Cvhu@G!A|-r|eEH=Y#M|?8C~1S8z6<Df9aNag4&1^M1+`REz*Jt$&s)y|ISo
z)S2`fRJIF*C7;TM_pk#l7EqiwTUj@UtN~?7KR|XY@&~9*?+fZ~T$m~DFXr}ZzxiPg
z&r)j$FXES#rSL%DDAsGPN&XCsf~3z7m$!wt=lT3_7ezPy0c|qX;g9cF|Eka=Y)K!b
z1ePGKd(3WMu4o<~6B#U1kN$Mf_)i2}Cmbw^#kvVUgcK%J?AQ$-MmI1PLA;>2&e`xV
zANlNHTZEcdennDBK0h;is|1CRaw)Z&=%R#K=A`0O;DizF`Z+?|d5MBjLNy1UJ-g(}
z?;@lb<rwQ_1QUwFRo^m<E0`nY%<V=w$5|)0zc{4m`D2bAy|Z8vdVYhiR|_?SlYxj!
zMxC)8?n0aE+^w`pBqbGSoBW?wnTdlM6kQ9dmoleMV@VX1LXVgCgOA5azeAjV^io4{
zAfRo6oNA6_g<=8Gl#uLF-%3%1rD*>#mWoe4il{&accd)8*pOsS=r@Ol1=yL4@kWXU
zxCT=4v+AtB43(s`Y00c0l9;c}T{Hi!HR1inZ>$AP(DkbvFEL6FbxL1Z^FBC4N}gGR
zV@V>lRqgv$cx&@F8y`yQ5^AxY6A*!OTM!%xASfq*IP#LmYaGiM!C#j0TdQA3BSjXe
zBevwb?n?ArYv#7NZQI7oSSuyG=fL)Ts)H#kf$9Hm`kb_|6p8k_s~aOSZCC29m8Q=j
zh2#e+Mnv(ys;?v}UhNu2Rkm$ev4x6}S*xT?uODdTY<;yTjc@ixRD_uQ#F;a8ASm!Y
zeg1jgL)=IgF6@a5)n)CXq(NS&O8J1M?n{=bn_;_Vk<)!*C!E~L>`dC?NieVj2e#!b
zV~t1>3k;F`Apc)Kost6>`h@U@Vog-Y9DpLkbE9xk+CPULn^l(tAjHI=^`!NZIq`go
z_SH*KG`7zVy=nKrSK)d>RQ&QLp4(IEm-oAH<!EU$w~4Ab*S~H?hqc7N{>`w=Zgqr(
zu?yv}LUbr&Bw#JAg1kD3$C1rTz+mg7xMl4uo6KLc@7A(5?=z;YGIMPomFcrU?b!u?
z07>OsYc@uyO4`TEiGW_iQb#um>@$h7r$zhr_x{ycY8>0$eTPTn^k9yMLaxdYOx>t1
zd#lN?KmaIqnqsqYT}BeRKP1qPdO(V9`N|p^6cl1qA&24?>R=Jz(_27qi$;~70rm9q
z-KSVF^U9nN6mrCD79u)!jceoQ=a)>7Y#*I*I4GHF*o-MpR-D8_ej(4oXOQrwnleNp
z{hzy6b6g000Zf(l+;Z~CN$j}j1?{;g>FszWe~tli<K*A~@6@!c+?k@fEPBnP&Tcn~
zK782734!*6!kSI}CtRLh5J!ll(D0o#@17Y%YeA!gC}_m^@lSTvdBC&{T3MpQAu9<f
z5+8$`4a|UFTvSpb(Q<if$}B}n3Nl7vkh>*z>mAoAMJ|nZ6NP}vlwU^AeP#4G-}R5J
zo`ON5go>tZgc|{Ww!t+c!UQL$9h}_Jq)h`<;29I}apA)50D7~}{Qbdm-<|`;%D$CM
zy6cR)OeQG8YbYc^L(Doq=_8{NsikFjDPqnhD1nA@n^7MMt4I9I`A?%6X-3^ZTg&g~
z(^xPPM&L`a@jI>Gjf@c^Nr<FH(n$!2%KW&uB7_!?d9w3Rur-bhIaCN7l7UV8&=dD=
zxssv5GCaDRc`q8cA|)t;sz({xSR*9mxog4qT8W&|^)1q8rzJ1ERIL?;58{yTmu}Dv
zg&cz;upj;^wEm^EB%!JemcN&Z{_?wui#dtnDGR0laWoF){mU>lyu*b!@Kof+pEXMj
z)`_?^Pzwx_=P$WXCh-r$6yPd!{eLc%d64M295)XBILJV2b9uV;ua@Fak-R90ZsS4I
zE%329vU!yocnWB^|KQx4{i++jQ@CBtwx()Mm*H%GZJp7hO(FAxlw>K;0^?cAr?^am
z!X-ych4H04KS?Y;w!or1N$KU~#WSyLRyr2!j>Z>GMZ&Bk5>9c%_^ZqfsVJWF+c#;C
zUYEeX`mangV`g~H)@#*O|BtFS0qZ$!-?)E5LQ=9MTVk@4vJ*)Olfu}E7Rpk|sH{Z_
zWyv;F_UwCMw9-x}(j-X{86qX2EGh5j`Zdr0eP2h%bIdcN-*SKN`?{~|yw3ByVsF={
z282@oB-XRnhF=FK6;-T~*<#pCe_#IDw($`u)VQpbV>>QQXA?wrd|`TZ&h6nR78?KV
z-W*p)B<6`oW&B}F3nh}efdUMGi4t9!9U=;*)~1A(X`Zw+RS#}@x7g5k;ed%dLtofZ
zp5b62o>*ey0H6v}PzKe~?W`X?@B>|o50&$_9+!!70{Ic+Ect#uKPxOR&W&loa`%l2
zBeaW)y^x)0C@Tlokr$Ha&rF~EMmOf>&6}C(o}{MiVnvjZUg#dUt!fZbTUuaPAX6??
z{gTdAZ<~I>f>B^8xoGRTH()>84LV4oQ}U>7dMoN9A&CnImXSnl`prwlLWL<~TkGw(
z^`6EM5`Cie7_|6wx;ReOxy~CW9L~Au6n`+uPGK?G#r9pTyYnw~k4|<jUQAR0IGWmX
zU%tyu%ldWQo;?f(Qk@+${H^!YTUNfNh6dd(HcCb;DF)miu2&kK#B{g6>(c?jOWFsi
z5(S0bCc{nXqDzn<9J!pF1E$9!`w{S@^C^NYvFoM)kzAShmkjO65KeAATdUZncpbSs
zi)}?CS^!^JQts4<<DGV<Ia*u;o^Ol%jqIhnjPa)Jof4Kfk}FI`nnKBuu}&O|3Ovwd
z?yL+m12QLd=R8O~HfvTO-Zzi88URGBeCF8(zo38reohap^yoT597RmB%ub|b(i)rs
z>bdmB{-XiW^2#_kxH%ST{!`AicmGAIaiu03x&mo*v*`73M4uqWtW?S_NG7D402v;0
zPN`%B)&uEBXfLU0>GUxIs@I&!S#rfWvm>QYa#~s=97b-TYb-EYbZA^?C=@;AFS3}3
zIA|7L-Dt@TkAfA*L$E!#Btn{`O$hVw(*~oo&5WB(9qSl05bH=G&`CdyD}fuCT=PTr
z`TUMFRfxuzZfnzSPa0gf&JU{w@i&Libn~&Zfr9n$r(6Ln9^On75^{=+xZ<{uX9*%l
zh+s`F_Su7Z1DbHZ`86do2+=D;YT%tOhy5~!;2>ra+#+!~`dVrYLYOlmQ^|$tL!^O;
z0)(v5uoVn(9C7u~Bue@tw>WfqK+?za0<ivI#Fh+C0ruR2tM|x$n;BY2FljC=aY?f=
z8=m!h*%AIeUP{6O5oRGUS>6jROArnU2_6<7iRAnR*w<}&rec3Vp6W7ngvDfXXc;vb
zNENGb$KmnhUou01^4IgJGS?#|e*gD+8FLD2S)VFEsN-PcQ6s~;Hz5W8mJ{x?i6Ejs
zWJvU!QG!oZmtKQLaf?{+kPShm4YIuT>ybO=85!-%b01ed*%tC@{EHLao1jGle8(YB
zK7wEKkpY~9>@I$)YOkaMu4}B)4U>Q&Svt`8qQUna?Ur2Gk@G$$a4X^yyq84nBz6ou
z4o1RB`+!V{r)57t7ipOv1QNcu9Y!ajikN+=<0H?4aY=$J-7j^g`4Yec7o~d;mS@C^
zmm~yiJ(2IB%E72;Ysy#jV8RX@;_+6Kti`?d=V~GR$?$L6^K;HKi4Cf@{2ThZV<@m>
z>Fum}ri4_eOH%4m$BuDwI-TMQG{}0887QJ(KWwe7UH`@Y?xz`Vux|n*^Mr84c^Gm%
z)z!@nVD(5!E}bpnoY0YqiX0+cc3tzT7iAtv-;!nNzVDu!%0c>skb?~A;Y({xthKmp
z1+3v>jPo$bBeWCs9q>m|2%g$|etf~XNsSAxo`eAEaiF4*@_|sKh*B}~+qiMAu1(yv
z^`JYj4n?I*1z)u?bnvnxDP@L@n{#{+utsBvFZ_sQ2TfXSyJfaS8sVLHA39XvyuWsm
zPR@bQSI?YUM|+wb9<_d?5I_kLk_Ji18xWvVc6z7<1X0izfjXJ*w3GMvnfdEybKdpf
z<aT^hrd_?n^52}iW@$qR27cn52Ohr)YMP}*MuQC`<weA_O@PZly<%u8(RbF^jKn`+
z(Zd{)QW1jG;U+s;T688V0D}<xk_61;>J|`TRLz&YDV20)>0(RxrqwE*F4orqL+&m+
zaGK|F1@jmP%whv44Cq^t!CL>4M7+s7x-+fgT6MsPP9IYjauY_?-i3`yInk(K6ty<d
zWi$7&xKN238{UzGVbds29-|nT{4AM~U`8bn<<bvmUSJ`36B62Fwg?53Lh<3oXKOk<
z>3tkd=eZm)bZ`<)4&}8pD`+#)8sKmVJDn#_1m&9X1>r9_8e#oEE&%n89a&gMN9Y}E
zKWz6n3KyI1k=nJqjIp0I+N>LSv}RP_nGqW<6&LlScgb0nMlHpoTngZsPT%Wos2erO
zl?rbjIahvpdD_h5&zT-5q#IHIp=WGpiX;YJt~eI!=^e`)H-$`42HJ|h0j^n~WVrT=
zUl7b#090|fLPaKBWrkbQThIQr90rJlS-yeXARC8s`3$Ph=3^2V6vN=l?V#)6v>|PY
zwI5;{NrHUwqB$K+UICK?dI;idOZi9&&w=o{rAf9z6s<?rLpzLnOPH~QRMGXxh&{}O
zQK6#Uqk|^1mAkgMd#s_s%F5;zlYzd{BnlDC#1sbn448$+IJy_PV@WtEi`qvHhM*;h
z5F+r0j~<C1oAmN6EqAWb^EWm)5HfWCwfsig&Kyc8z_WWa0|<q}!F5N0N67?wy|bBr
z*X^?3We6ojEIp+UE4I&R^0IYT;}tFr*JlerA-1MG8JYG5;b84y5k#}eaK7|wWoUA1
zY~-krjEpM_jGGqRv)fM46Z9JqCBz#D@iKCo9h$}kdT3rI1&25l<eZ=8;`IiEQDPjG
z+~H=b<&7t1Q>3LJRFOfHdQ`HQcF={Dp>}acFg0L6?|%Y$!HtIw&9h&81TFxW?dJQR
zSWt&sR$Gsw@z@4C3TKBlWGVtTpk98ubo<%tcQV2h^>NnXqj<T?y$2yr;1IGIY<=51
zb)s1%(pHG7n;G0ke`eR=!_UfA@I99^?(G;%WqC<h{E|9KVnfr(>zlXloL^^%r0q@C
zv|M(hLREBi!Rh_se~A^z;_7<G1i``F8C>S^g=a`cE`>HJxSD)D{BvHXjPmIeK_JS(
z)s|QR3?Cjh=NEGsByzwx1PyTd>U+eoBjadU!&gl8@&hlxbteK^zv5SMN0kkps=7^;
z?YC7%+L2DDX$ZK*l??*h@lAO-WXHEZ8*wU8StvFD&VfE%%RH<_U=vGc-UGSdn4pr~
z<GL~ew;>&<yugp~l5@)}ODhI@$h<0QH9&KO=WIH*KPGzjXJ<b5{`!ohUFsts&f(+L
z6~Nh!uB9Yfhpb!U0prO3r%7cxWy+pxmoZ=?V%Q4$&oku%-F(c2jpe9FyT!K!GWX*j
zS<F;pvKk2&e?uxpLW=Fxqb%}m9!U^tBWZSZXWAwyJ*8Byzu3lLe~ttg`OwnQa=zVC
zkV!F3pk@~F5u6fv7oYF1+dzD{I#7p$V+a$z_nb&|QU<>RPaJ-9>)qSsPE8l+^)lm~
z?=2R3l+NUIT-^vvi9G$ie|U%;E;q90%;AQ#plImvP=pTBed|X2Y^G0tyy2>KwS#_O
zC>a$I*AmcP8X%lZ_zhy}bLLDJG9~F<EooGyZeYAOW@RJNKt6{XDDXNw&n~`=2Hu+I
zb@Z}6gbeOIXiFSu2A^W80t0N&k2Qv~#u0o!R#lBffKTa$B96q8Gwg2`maf8@jK*^d
z;OB{6Q#ii6$31f0FoG`}9N7(`_1?P@qyV(Nnyx8{nK<Jm1D+_pP^&FEFcxq7wKYi#
zRs@<KuV=euQp=q5t?i@teb>ch1kHFvg7zOP(mF(6TTdY52maTSnyo1>qaTTbF&?d5
zn6^N>pPNU;H}QmVkp!SD9h>V$W%gIuL;3zKPB%W$T>GS1l}WK?{}nF!nrgzS9;8`H
zp}Lu!i^v*;dlQK+h+EuWImL@5^P~$C7^)Cxiak`srnq?Q_S|KsPo1j#{+wtoBV40#
z4U>>VR29C%ML*_b=-_WRD{KahGls<CRCWsw#jUh4a1rl9>WE%L56tO_>xG;pvKN?F
zWCrpk-eXcB0J2<ky;#TplPgbp8i{?150MdgrgvpNtQm?tDITZ+)>Y=rz>WsGZEq?X
z){!G4brsj-)y(qtBzEH1g*H%zPQdvYF7qVGMa)f26=YSEK+uAOEGRd;n03x<bCKG`
zv)y1)Wf@oq2|#9zNqb33O4`0se<PKLH{KZ7et%2O4xvoKotVS}UtM!}huXW;aCBmP
z>1Z-R`kf+lvS*Nj%egM#Ejz}h>|av!XAGH;`+;Wf)JPIBI2B<ds8|wo_d``SbhSp-
zese5+dOh9#R|8T{f%MUF12G8g1tni+W8=MtGwF=o;;elXTLi|4sdU#)P2aw=?ZKCn
z(C7hEFm?tuSU0k;OZ^<eg_d#S1{>3idS0{He%A1S!)df_z_LdAN2k^K3@M`gZ^Ak7
zN%%@Lq=R*cMr$tfEuDN;nU)9c*l`O!IBwS&kF)mNUBKQgYZcatQKj@L$6Uy{I<P<|
zYY8bGB6i1wqiy&PFhP8B5umSn;oE5L4-NWCG9HxsZYP&V<n2S_x^86mGXi@<x4YPN
zGnXaFdUh0gaJpi6#a^m-3$^&%)RIqKDa>W=qC3+wJ?F%W>$S7<wd<c15wpxcgpAN=
z+<5u#zl5%K#H1Z$Uc?S_qDs3lE-v!O5o2azh)FR)>`GyFTAK5ugq_AZT8{dO3)}%^
z1SzH_6=O869nDUiMqLaI9CORziXt)O@3L4Z!H7&6fk{v$5=7!oO;-gstzKZVGi^nB
z|I}x$ul|flad>s>rTd|=t>d=&Mn0Ptv9fYV@q^m7_m*{Q*(A-pgU;6D27iYP)$$Lh
z=;>r+<zKk#Md&W?%%R)6t8Bkd3DYDvT_zOI`QX32ZDS1$7;PUT=6BQRW_9e<%=C-~
zZueb<p?zsOW0GXVH8;n_MNit0vr*@+lG*Fx9)|WykYv(+p*HBlbtVa5R(K<5CW{Zz
zaRT2XI}#=W$h#QB#ktPuxlib5Xn}5tV1m~3&x)PvW!Q^mn>LuA1Uz_PEv6yjx=yzl
z>&7eCQ36<;&yOT21XXDwvqq_cMJ^19bDLCvgd)-|X<#FQqd6`)v^cbBygy=G^qz}!
z8_k18P{=6P8qtTM?W+PV2*--D=LM%g?s=GeXn~h=B8RE>Zr-?Y(wOEZ4Gx^H$eA%;
zNGpP5!t^D15X)kYB4Ln95iC)<%9LRB6~fTsszBWhTzWpca9iU&jhi-wRi;z>KB2a$
zP}nf@A!Iu{cO!w10CWIzCqb3zlzs1OJ^g9}%nF9tkwe%Ndm<U!^P^13XA2ts;qBv5
zT&R>BQe#|UNZ8ysuV(9izUm~r6iV=o$GZ!ZEox;@Q<wtyI>;P1uXYe_;AcS{5@I5F
zb_IhOM7M;ADNap)zP}M32iK*6O48o8t@>wk`z*OmGPiSW`T<}>8M=vwrAy?andgTl
zKXD-Iu8QlsC8e!LMa9d_=hnW^h?}yO$oBK+&&>lj{sGKoXt=A7eqkqwUN_&qb|2Dr
z;#jT`0994jyT*Tgtx7Y$mvORt6B;+f37O4EiUX|owyI|R5Ig;XXQ81(TL757+SF#A
zDS$os2v!ne5oFYMocjdOm*I^|&U^PodjEah+sYqVZnw`G?+-fVXFp<_e235Q?cSD@
zICNr`6sEfbTujQPmm0YtGq8chWC|HjdN#zF^jAx5wp<DAuq`x{Ym8K1=r=;sn_fDb
ze*5p`Z34}O`^-0!9-Fv7f=!tb@P!ix-0L;xDC8t+Khk<hUL#%GQP;J18%}FW=7vL*
z1hx_yE6wsF%}PK6H{Npao!eLZS7i1cx`dm#hjd`1lSUuG8&gmr1|7<N!~tI8#bciy
zWCk5N?1=ji+Z4`){Vztm_UAlGfRmA7(xhc~N~t<4t))}^G<b7N`QS#9zafJI&(!FU
z17bz6xuz7FfJ|bbF+5~>k$HG!sH>&9x!^g#a5IhujR1qDqrf?rBpys~Yr@ah{X`WC
zd7yL4Cgex_@DJkcBksenr876AjU&vi0=ubacc&($v_3C_4g*<ylK8~Ab|4?5%>)Is
zlbj6Hjn%r>*7xHU21FVIDk+W$3K9OuDDOPSTMY0V^7K9!YlAJ@w}#~9<P7YbYDGAN
zW%-J0T?Ty%CWV9Z&b}RObgF^xuhlP2nmswl(qK<W51c9X@4r3!x7EiFUnJ^gXLotl
zOtn66XzJs_?=&u#<9tAYExrhBzP8y~&;4IguGQnbk%Hf*pWBbFh(dutp#9hh+M73<
z>LG4u_tjyZjlmSnTdOjK3wxyD#O#OfD=#jr8TT{;x<xwwM~FojOllc#!D*rGU^4^5
z5lU*LY%JXNAv^(?6kN5nq-q2+t)F<w1=(hFy?tu*CZzq0u46l;_zQR!@L?UF9cS;(
znUX~JQc_R22Y?dQ9=c@HklAE@xzym4M{I!O`Rh)ztpJ51u-8JUIP_BIa!@FU1JZ&<
zt7L%^Xt;Bi-Qy%B1dS2WhB-VasReJ&E037hwXP7QLAsV(N2Ybqcv@)tkB+78?N1&=
zD2?BZD^vL}AWr?@glrcJ6e=%D7rfa^bq;eB^mpr#_LK6`lM(mMABR?4&2yp3Phq+Z
zTxj{g85vfRw#nRWcZS8#{F3p`)W$fPLE=wFjZxhv^R?QHYD+v<`t-S!S!-VsQvT#h
zeLB42s!GlEFej(sfzz`_dm!=_h4Y*x=|^Z4jG*^TFN|R9m_CeIg*jb{7l%AOvfZ(@
zuc_WD-4n<Eq&YpPr}K!3(EyZ`<Y8QH)Y@Xrz%?c<DsoIV_JdQ~)Ku(`!Kjt{n%qF_
zCB~kp=brv4C9h$nxubqwhKl>p6Hl}4%&ly~M?YdNmP{F;9m_@lVcMAIGM1dNw(wW=
zdhkEp8O4J)t`*lepI_C5C&*ESsDAK1u}FH>L#H>dZ{_sHT$`4&l4{*0@aC=3ag7T1
z1qYMoaG>{o)(vY1Bea$A;sdG^v&UHL!n2Pqi_CY=ca4fUx9<TH>&*#`8|KXNlwJKO
z#+I9wqK~=@sFQoz6zqcffyx0i37D<X+@7Pc9H(+a9$^Bj3d~du2`#col+3|t1&p?9
zWztSp_sJBx5ls+&x!5^kC^@p`OSVr`G2n`@kMgcAlx8b68Wikr>wLmpV<orX$`fKM
z(ZNI-lXWE=yII-VkcNq^##y`6kK|XE^FQI5k(!ov&2_MjI&}^{pkC?GcMjLQ!sHwZ
zhkU})WzJV-zF<4U1RtqAotv7IAX9y+*4}&lC)e)k1$i0<z4jh*R%@{FjM0+NP`uy^
z7CM!943l(Srn7_1k`+U46X$%olX02j`l#4<r;jWktmv@f3j2E*I3~^Z=o$j>@7UI$
zBs<&HY4HK^#X%>-JYa!CIZ&_oYGcWs0{3?tlIk)@(m9?x`~WcE6m6RkzC@-y7NgV+
zLqoOO+Kdc}ZK`bJ?B1wkC%R=m!=YVmjIWk)y@=`892XuP5NahO$03cV5h|e+(coBi
zCm*se-LmCN1G|kpTE73E5g**z1*dPCa7uS-YsdM(-4w$z_TW<fb?0NQ&JEVS*gvjh
z!pDf=)y~}$4rAwIZl1a;->_iU8o3m++g{JGx|TQML#h>751w}sT9(^a*1ASL+sqp8
zI5#`g|B3}Wgbw>o-04Jp^zEB~*Mb!QA&!k(-LamQahj@?f83e;3{_QC2j;x+Z1G6l
zg8es1HBLs+!@{)+eh1JK)bFr$BjyB>%!5*a1ns#yT-|Ew!Mr2ei_2|{C<>Ct{qsl0
zf~&?R{o^(pE_d(h6tro@2?m@Kr{<jKKVrn8jQN^BQlTm@92TM*(%`FE$-)+D4Q!lK
zZ&|TT!4!X7+IeRDd@6r7B1gpUcy|J;aBjMEakP4B(&DxTn^t7U0OLG8$tB%ino$i0
z1^Q-;GfmC7Qtyw0N!hoo$mA#}t_|E9plRXWkWY-Xp4P?H!tIJwC$FDll+=Vh+fRpt
zGE0+-h^1LRaEqDFFfK~Eg>bI_`D5v`JeNVl3ofISCD%nJtCZ?d>t1brrHlR2-*x5?
za_Est#~pBqHemzzUfEfhnalo~Gq|=c{pxUNq_-}!<6#2JY>a}Ed4r1_oG9pqk2iU4
z#Gv1{9ap`mM_P>vhtZS<JsWp1<ER~3_h~%c7WM=c^wsh_*zv#wp$|LOH%!=_v!sJ|
zLhh<zTeQZi>N^CrSMfcTf6~}~U2;HZSGz!n+=HtI8iy8lnO0l_x6v<W$tH&GTshs%
zBW1GF>28b;5K6NQY@uZ&lpto^G~0SyW*I`?J^I(3&R@*VWD7A$<kS!dd}_Zt^YN&J
zg#P07`F01U$(!m$`$;zd)>g#qORI_kyWNfqIhXrBZ`_p%11F<hNe%4>9P7UIiOUeW
zuDi!S=&zEug|ab@8~|wYP|5-)qdcTGzJ`asWMnijbZ~e><uvx+6>5&n`vwm;NM&np
zhvNCOcztcRMF%LCoS$WUNciP*b*Faw<GJreKSmQtCEC-w6Foc7Z#x=iCkkRKw&X0S
zTov4#SH9AAUxwA%{@I7`_2#k|vT<fcPzU${cV}ar=jNMPyq5+;dxKr+yA#n~s(rW}
zdcU`?;Q>QmlZ@{V|4{PX<?3L5B?%N3D2TAc-UpDwGH!K)<`!LB$P5aG{BFGO<i`rb
zml96Fods93+@44KEg#6+G_cvUK)M>scf!|wGP*<9@VcdIiUrNhd85dQ)VpjC$;<w+
zn~$)${+pR83sgP5yjt|%`$Egm)7T`&=`?e)$P?Io@F1IvJZiIXX5$)|o_^?Ugn3Hh
z3+bkGaEjVpWft?`>{f*Bf6ctgyV`Gg!dJfY0&j!prpPAzk_^L)jMsJ?<vJ)oy0%vN
zQMPv-FcEnpb4*3O3UQ<P*bJxrrf-{`c01k8*~y6gSM+eKj;2*rfBSEpl_f<7X$}&@
zdG2mD3)&-84E)5k_piS>-=PC+MK8OOn|$}=Nfniqhj(e{Y4_3dS1!Zujps6PHw0=Y
z(G3V0|4}U@!%E*=6aAxUP-lW#2TF30*bX{)zbtTjm%haNShvtn<qewB>~2Q;;k#A{
z=eZ58ZQ9mP4lbe}=yZHnhA@o^IN<0K!A+4EC=^#-by?NGv4`x`Ree+WN_x@tH<=~P
z>lJrUtNDy>4hFuaI*S8BhI--OV7`tG;N}|;`l{2|jIQz}?YqkPwb_(eJVt+aojjs(
z9Ub$qRvY6oS0_&HIiY86?m*`%HKYGVQ$2-Yg-Z+ecy(uoH{a6Yot&JVix&;vVyd@&
z#i9dF#xh5o%IB_K=T!~pU9h9Tk^|(u9ui07LTRvb^OFqaZed5=cWPJWq&b&;5wSPh
z+@B9P#{T%@j6>r%ifjXAybUls5E@EzfvQXFyFY!>&%NvKMr#0pUDBcfdzSa(OAZ(?
zF$D>0*rdet^re3vgtee{Q)hYq?2*QM<=t7$2B9Z0G#YaY@!au_HU?&fc$U=X&L>PQ
zI$)qOh!+{eCnp~Os9a!_7qB%n{AQIdxrf+D0jV+mCFo9Zi`%M{rQjgsU#aId5QuaQ
zjHQo;kM{rGmphA*aOb8@aB;$+#rI*|MH=SN&&jCuWB-aq7L)0cu%`iMTLki3Y^mEl
z_<Hnr6^UB|yDT)W=B_p)gTfny43?eY!8gJw2D#P=rgkge=yrE2C!Vj~s-9gs#$L=>
z0^Ntf5e)LR{P9ZO)=*VX`TTqK+_$yafxYbA5F=}naK<KGT4>1vrPLJzr7NZw?|sqZ
z<EjzL`@XZ|_1V1C=*cY`A7H^c-VHco!HDyeMFYe`MHVyZ5^|XnSwcyua$<CL&I29u
z?k$77v^fnrmN@s4Y=gFu>6vG=Kl8-dsibnv*{SEV+*+{4XACP>GflaA-ANY5KOX;D
zGG7vLl26^THyo1wO}aRJ`o0c`_|G7MOUE=Ggb-<I_%SrS0ZE?BizB3)m5;4h#R=pk
z__6NP{BHvoQ}vi3$%6%X`t~Gihm{X0OB%}RqT?3^J9)2N{*O<)ph#wXBfeX7U^BY-
zawd&h#x+%|f9&Q>?}FUbyk&|F7&EC8T}{il9$i!{CTo9)-FG+R(fi6*!;G_Q58Tdj
z)eB$UvudE*+Q)I83-cz;FOS)7xctiOOQrW6u0-tQB|&rI0z(NtFB((sW6!QUSDPEG
zxDUx0Ulz!E=&Q%Z&EjNiC%uEY*N`4Q`kAPp@Uc@f7?us9-iZ59Kw30i!+#oO=>1ds
zX3=-k<$rgmE`HE)#gO(@w_pDm_bk8P%D*ob$5y46-g<Pz%yoWx`<o96KCc?#m0#sr
z5|fejr)6fNo;upMRjpJ{WGz}?kr;nuc-+drj~FNYv*fzhL96#Y2E|_8ep+yBR=uid
z0wbgKcZ)I&poN{z&z~`eZU(2n7A6|xc(IwmA9L~F>s8D~DdR@8A9y(a%10MC1vwJ#
zAj)AK<yLQERc^<OPre1J)$vD)!i>X~>#3Zqi3}}&{r1hhrAM~bJh@ODotb6iG<1W#
z_s*KDzBk-P^(`IR+T+WO@Z3W?ibJjxn(JK4+qTujV(g3~U*}q9|DEj_qPTk`3W`wG
z5YjHo0mecy^Uqq83a%Ge*(t1zRqIv)-X>2Xbi+c%!jbL;^l1`-6VK$zy#0;z<|BS+
z(9C|KpJw;C<yW!IxaN`@by~LpmXFv22x+XH$?5EOPXd-)%Tuq*-kz5pHQrg)=!AOJ
z-ogyWH%IP%YuRKI(UpzJn{lG)`pY0LGBfxIk1v#&oba-><=Y0S=G8Qgsk!w2L3N<Q
zc)Z=HiWe>|j)zySiBHQLRN8I$p{(50R-d0_8S!lQ57kybJn=}g1t!jB3v6f(ZdDQn
z8N)?58a_TT;6#R_<;Lmr6Ly@qD}q6$nA820&nyg#IbS<%thTFarHr;K4PBr9-Pky*
zT+JpkwOwz}D@ehL4Y*zNtWh<K;0IDv>2Ib_c6QRKpr-xu{Yh=i3D5R7oqpZu7xt%N
zRBh@|XUF=^;j4A4s?XPWuTP)5y7>x|jP{M@)*OoeaedU}zg8dCY8QI!k2kkRl*bhC
z4Ln2j9v>Ned-0vnCwaM-iicHY9=<tt@}mv@V^-d{y}eoc!0lD9)ZQdKUt#-ZNMgao
z=}W`CAJ%>@O>392+W-5q>Tln~W*W4pi-A+^JDc^oyE8j)pQv+i7C{F*9Pm+@e^4Fk
zIC877Z>MKajl6pLog_eAMm*C1_!!Y3J80E?D+OK$M1&L`2qfRwg`SlepGw~MH&$Ej
zx#xDC=@G3t-ks)6{T~+q=J=O2H&eb=wSTduNloCe8jGK?`YM&(Vk@`QBtG=9d%8mZ
zPTp1DpSQ;(?fzI$xO9y7&J)Gz-i<#8?QiB$lXV(Q3*z7abN$Ij^BSA9b*aYoU$aAp
zW;~v#vHqF`+t4=NcN)Xs$+Xf7oc+?{Q_h)AZi!IaWLTFShK3MkK4PeGE%UyU@vz~;
z1@ZLKZoT$7cM@$&n%c(4GGDn5xwz=xo<aV*eCMuhUcB_kxcPbK6l)r$SI&JrBl11-
z>+tsy=X2_c;ODP;Pl^b)GqZf(yn$1J;{e^YHy`KZNZ)(drA?NTrw+Ey-u)u)^T0R#
zwf}wp_VMR|VUzCcU;D0yM(mpR<=!Uq8*M<|y`?ba`Q)?3(IM%t&A5bmw;o<<Oxy0<
z)O*jnwirLSbBuFaT?`y(FbkQ9Hq#bB9l8Npty&E{Rv2okqXE!Lcb5B=FDNW!+TT08
zuE~Yr7h?dly%A%rJQuHFHr8BBCzv@E%KZugKq;97X$cRueCGy65953cR&bM9cbU}4
z;Y7*X)tS{v@sERdz0J7%c;DBj>%B7HmiRga#eRCyAuIoZ{*mf6#N%VY7R>Pz&c=cT
z|L)2k{kEBL-aj+Ee6OW-={yKC&Mj@9R%M0`3-zX76mP$*m;GDn9r7#~7FsX4VAb0i
zqz~`cZ&_lpO4152FX`iwCP<UaYc&w_(Q-OBQbgmPuq7_A=AUfuQ^iKr)$#M%S+CrG
z<?Z95MeD90hN8<vjaMH~tkcJklL^&H7l%)RLb9Af<&48=?xOlWTg=w$-|#*iccFFK
z7z>aPtb2*Uz~>}Vpc(v-_8#sR(Pn;`@r1QO->dIgm#towP@X+t2e_CFz5+EvaFR%!
zvH53>T7#8|3!GG<RD8J30z=5;RMoX-;>$U!|6a*e8`wCQ-^w5s2vGne;G*ZYYiDhI
zO&_KQtvGpUTBG4%h9O_$V&kvnrgh;)SI)ad;8pnZ&0uK>&<*uBd%0+^YeD}>?ZS6w
zguL(fA}qYO^;)PNV~@x9cTS$$eej?`dn9vx*stnC^D3{BjHPfBp|z6%y{e5}tPfdc
z8z-hEz!GJ7>F3#D#h0ivK{adU{h|J~nR{k*YKQXWo`<ja&kLtT!|y?>^dsK4TXv%!
zJMgk&`9H=X<>M#+m0rH^@xqRAMz_mitnKXd9xb_^oohSmdEv0_1KsAgj;`8&VY}yw
z%sKP-Kk$ng1&%zpsWaUaP9_0l=B3AOqcf}a-^f_uJnLLl-npt%i$c<>^25%~nN<tv
zl(x0b$67Nz^=>3xTT<+A@s&(<x^KTUTnQv}R%Ti)(LKOuq&V|jd-o__GPqwC^ohHi
zFuIk!0}v)qP<>bu%s&=2Kc;uU=S9lp)2dw|R7M8%07onuyy;;hD7keRA!|jS(G)Xe
zmqBxmw$7_6Ztk4keqzDU*(0YebyZ2yUcTVe+Xrjz`g!lXP`+X_I6WZ-zBzeqb%@r0
zrcIjIU7uZ1_^YOnrR$RO%wLPF@&!CgXXC_>n%h~IikH^+{!p^?a{Pz$FW&Wi`RMrp
z&tv%=em5+xAV@%-uTSTnt5ifc!2ie`o-PXv@_k-vtWy>~X72Uc4sX}gpS9)q*?QiQ
zapCJ19r!R^|8#hD*u~sz)sWR0kA7U8|9r-d2S>hI?KkMMM%*^~-T0XN%enl)i4*6`
zYkd=(?_LbOyP#X=UAxp8wD6A@w@Uwx)93~kCqMkN<kF9s*Z+BZ++qEe+2_(uHIKZ0
z=F+r?M@w3bo@+hYSJ9+i&(3=OznPkiGc&7NP*wBQa!o(uy?0fULdQQ3N8;jD{9vWg
z-dP{2eMZh7a`=Hy&2TN-9)ad6u&^*ILqv8=+<nIAA6M(@ABtOry@8f(Bpd=gI`8ws
zqYBRq{cfFNHT}`#`D&exh+c@!I9fiaE0L3$Z_|GSS8a?*gA(QdsL)qKz(?L7x(%_R
zxiN29@aLh8*FJ9aA+S7du#ep)-$}DHXL`O^yV~YbeB9H39?=InsHo0vxFd1piJ0=u
zKR;Y9fB!DscF@O9cJ^Z;7XFA`w=ybWPqtfBe)yEr7nU5j+|y;vigxxfiRas&$<8}>
zYPC(H!q@ax{9owJ+~zYH{X;@|GoA7JeYTf~_v63+{_DR+r{!jWGoequphYoM#99cg
zU!ECN(<6VQPuj^Ti=#eIuWA&Y`+jgAi<ONEd-Pma{HC<$fa|j_6(J~jNw-wgrk9^O
zc8<mOriIYNv7q3B!RT8rc>S4!KLffa$IR_}#c1yerxsbS?`34%Sd!iF)4^t&26hgJ
z$S-5_qEQR$*h0&ekvHQ7FJ$^#1YDY3LARn_hfFwC^65tT;zE<4AG7;BElk_3A7N$_
zeepucsl<hfFX#U`7nT0y@A$-+B>!WEGb{ILrhY&2p~sEV4_6b1-u>|@>ixNAb}Jj{
zts1-|zCCU;-$oS;51wW7Ik<9akFLS#R71l$&|~<y|Mv+Kwx0d9t=4FlpVjdX5Ed$G
zY@USr{%T%-FU<Ir4qbFJK$kJU2N?rWmiZ=MzI@>!2j~YZJ2;DJq<Byp?hQz8l48|l
zDYnpvSFo_(^2}|^-33i5cLlud8T<Zn9Fx6LV=k|2cwxc&lGr1c%gbE-Y~TDDgGJos
zZ#CTwTQ>D-ziO79VQx-^>0Y26^m^7i2S%Lg@}iUeSFfOm54DlLXDZj+EPN7s{&`F5
zMyhL)-gJKNddkqf$D&rV8+o--RK59BJJKV^VY2>ikG481bGKfLO82;a(fSOA@NNlF
z0~<U0g}IKHyi6@;TyphqT3M!N;Cz;!BjUm@bPgGB?yc{Uv-|D>h`*G30Qq4ZTWjqO
zus7H=(EfP_=7rOpS~P1xYbFgQLjVA`;%&>G4dZwoaq$I_ah?sIM~)&BS?-f?L}U7e
zX)e3p<R@o;eG=7hk&V4`#H;ie+{<$R@U*u{Z@2;#jF6s+e$MbMEAr1aOYG*qGCs-9
zE~4AqzKBL$&govfd0sQ9VbdGlnN8E;zqTFHZ&h?;bB&*!G;90X))cDcoJ)GsURxu+
z*n3WvYUGd4uAht7s?8lc#3T39pgQYk-{}`>tbh5Kw_IZu@~&dlPp`9j`nqTR2VH~N
zaGT+K9yw3rHZ={;ZRgR%|MF>zw`4Dt`!2_ZH~3)H=E;>)7Y<jfIWRIT;oFC~tA-9A
zQkC{aeag8<%NpH}Z||?+n*9Aj|1+QMTDy!~`qIO~V$-*{*&&y=So~(*(ptx9%1rHK
zD}}moplTpivEL7OvYIxnCh6-I?}!NFk5_Alsg3&SePKfV=1D!R>xVaQ?FFT=ErV*7
zhi}X}GCQ@(W3|nhdLB)Fi#y+-?EUL!eHwk+a`0KhqrRUnS2@ipbljQt>gzhygT+(S
z?$<BAURvheVC3z@B@YtsJxEOVUws{jg<ZYz!3q4(ztxT&Fz)+wCyh5lS1f*^97yRg
z4A<?Q@EGDRpVKb9mx1-{q5%hXZBHrlN^z30R{?-*x*qdp7iQaT>Sb%&uXIsv?%^Tz
zSKcdK5_;Jyw)DQINuRH_Z{Po=>6>8lW2yQx?=$7^)<vZIM@$={;bm)6yy=x)hZTL#
z?6rQe`sLRE!&W=oXZH<GdHrYL@Lo;Rk~F;HOcvew5ck=2<#12c{Fg)Bi;ENcPBPn8
zLey37G;;3G!?(&J{#(YT>c%manp%^;`rqu{s)@zfE&n(tKhH?gNK(0e`EvRB77=mp
zV}FFT`IZt9SMuoj)azB1e-`eF7(D9Brjkb=25Efpu#L!H>G^D5?|!A89ZFxsbv7N+
zp;fEjYR|Ur`|9>w{U*0pKCrSs(Yu#%k3Gl@5@Y~_urHARjQ#iPh}c;Fdl|69LBGHD
z$oR5qL{U|1Wtmsn#vGz0wME5^a!=p1K?nQ1UFlv}lA5SEbs=5RpxL&K4JRcpd}f=Q
zpYGRip>3IaZqC*C+=fR!Tq2pGYe4P(^?RkJOIns)!kKl!pS>^t81rI9{<9U&pIs?0
z4@<27Q0v&7-r_2ip!w{pXUU&2!^#%4{nqaMnOD#Iw$nNJW8_!w?>F1^wkz*>sJvvz
zlZ8;dL=6F>kU*vQ=9x0NdrK6p4r)uU<^6bFX=?tclkCdR&5VaS8)#)E8Z?+|X3<Q`
zcBtx*0Xl@|0M&M#GcLjcpxK7DHPwBJTKBkNku!UbUioj+$T7#t7l-HOpBp3Qwos*+
zgi4`k)9Av~rXxD6nguKW;nvv1ckO=YPN+2ayX1WOoqcn=9s1DhOm<`P8MYcau}hf=
zJ1?I+IB~uA%F^>w20rm<l@!~k;f1~}=dGnerMb>4@`I#BI&Tm<U>g;<MGcvT)Gy|^
zskPdP`t#42U0)cHzUh5z{_^Ro`l!9U^6`epw1w~HOk4dMdNydj-6LDIzWrjY^dE68
z`+N4KZ&A}P<jn9$FgLI9ozt|(MOd>~Kq%gx&2U+^tj(yIfl$W+0#v?l4mJC&fuFa&
z?j8Rn=g|g?sQvcp*@H$sxFzsYCrZAU1Z^lhx4Pl3B<t!UGtN!*`JC10LHgUZvsSiR
zYTIysP0oh_Ex76w9gaGas}ezrf@+h#ZMHLOH>I$6)XcMw3%A&P?bhD^-u#32AHVw?
zJ@^mr21~U|t3&(`2d|qyhU-%#iNF8u$Bet#ms-#N8lRTb_p?dsTASFWkK@+VR7E!Y
zb6L1{$EfUgUoM<3y}x|^{+vYviq=A2#Ms7(;BD_Kfx7UX)s_@r%3wC_%Ie=3mdqd3
zeqp-FWnL?~jnmg{$-*)nb`$M_fNAq>C-%~OQZj!?zrWqf%8~}3RDZlg`>on}arix)
zU#1c3GJp0e)#;N}-hMt5TI1P%=J?#~zgpe2ec*5<vCQig+%}oHJ7Gc-M5p;ZPJU_D
ze66<1ybpyZqjDREIm`%e6_z|%Ez7&t8ov%-t6~#-KDRsL*I`}C*AE&mTV8BrJ?+n-
zTaG*i?1Bup9daq9I~)c&EdD>jv?u%1-_`6@gca0}N|V2oU(Qhb?b!zdbaxP1`C4ld
zCk)WN2qg#{?2vW0UdD?}rNLwul`l-!D}D5K_oFR{B1MmuaruqU8se4hmVap8U7W}Y
zgBSjAoHakb&&0KB*A0zoJVImoK%LTcGt%OBPE=d6DWA$j<!#8Q&oBmTBXe7A9QqP(
zSGcFKhuVCZppo>rN5OhF{+ZS0YtP><+xb4Zhs%wp{*h5Ls;}uGG5S^ht7a4@oyOh#
zoNE2!zOUhHr4UzMRIaH&CV6H4nLQ36<LxId^3>Oy{RhH+t;v}>FCN;FnA}>@YP!=u
zrXLR${xo<r=9uA`N!>*sxb$+2Dl{nz?((BgXH>n3wQFTtW0G0)`tAFIQ2xQm?)ChL
z35iGga>}ukg7_<{saIqkDX`RKzLpRae3EQmv_I<YeR0P7*rb-dH5LF!A=GMCX&BI5
z#TybF<Ya%QSfaUnV7OPcd6;Efgyr=~q2qDhFt3fS7kJk1;RB86sW(&Xjs~du;g?&m
z?(==cn#&_f%io*5TQY59)k53&qr>CF=D%2&q~QyBxSnFnX7zdxD{eq<-M25#+<d^<
z{uw7Dx^cLs_QcGc=QXSD_04TRH8Ylf_6sx&*0UG{SgU}tuc8=vhIh9M8Lx%4s~%M^
zU4wn0@`=nJT}r%{F)7NF(aI%F2{<pNg!E{yt);aoj*kZ2FX??{)!O3jn?qYxp|b$L
zf$(~VTuuJO-nwU{ufKJH@4iskP~E2#Kna)fo%B2*%94!8-saNu6gxwO0%#&XqXkb%
zYjW@qLa3I75y_C&<daaUUQU}cc;z3bBDzg=VhB6`O>1($E7uL0O~07YRsZ<7+wvyI
zAACy>N<OkIWbyy!%eF|i>YU*mVPkH-yK98eA7PXH{~W?sQXXUzq_;H96~R4?WBPAY
ze%y^($yPhlx<rQl#)occw(zv3!&O3eK)?#8DGi7?4)WrlimJfer>3+m+ASY<QybO6
z=E2EUQTcu5-9Fs8#~$x5S(ff`&vcBP)S(XKjSOA1;q#@8zs7Y{o~cB`z52wT%@os$
z=x1)+K>Tc0F7-&A1uZ<)E@mt^IBTm1MVbnyPCN^!`Zn7eWGUKQyx%*am?`acX@-0G
zstXO=&*iva7N%xtp?p*??+ni9s;h<of|OV2uXaZkX=*;rXku^BOU^UvYN5bInmk~6
zzX8DA_}FUh+^YQU$&(`t_txW>STx}whv)Ws5UPC4&79PYd+3w0U%7PgzfEfExOAfK
zMHS`avS<-@|Ki-|S(A1uw~LGM?^-g()y_~*9|~xLy*;S#_|s&#x?4csO`&d4?G%3A
zTi0P86*03n)p~LDh%CLtLD$U9Tb>j4D|RAcLL$V)+?eNWou-(psVldve8nX92Ufa^
zuyUEj+A-ur#VEA*qm<6aFKnlH)Ur5_DpDKhLR9o{OXYlZ`tLFG$&_nk`|42WfQQg;
zhd&ylcf5SpL#OE>z4pp)YZYglRO>8T?BFc<#QG(Bs67JBll9mF`V;Kf4oldebqj+3
zT>p%}^UGuh^5^)mb-j!?eQ(%yk!)Gbp2nLp4Q2akUAf=eLXG<6=Dg+`>JHS+mz8VU
zzIb>nb*P+(J&pH;#<x0P7Qyjv`9OxDKq8<&`L)`)O^Vf4HDCMFS&RI0-@gtXD(|EJ
zDdqou%`D`ua@BftT=#J_sS;=-u&j^mVEMhe8c6}8%omG4TK^baY3|%!_o&C)g+TJ>
zOUXafy#}66m!HOxyI3^ff_pK=Qx-(a(=M&aag9Ut@B;KhH63D~xha%iScP22ABh`%
zqoUht7jo_?&wF=8{fWnaw?7@Br9&j)%%!~ZNPb_r^{AurmmJvEnxJVk1i;ZqNZ1}o
zSEU5vK+kmCFvd?SKjR<MKN$TOo{R4f{IbjrGXhkHC=X-_kJv)p{<&4%VKI@vKg}+r
z@yB1j`uQ*hRpXuoL5R0!-o~!<m>yde>1mtQ{oUY??F=nY+Sli<La3^IHF;zUjAor-
zHTCll7d68%kC_{|?d#R;C*{LBy!?UHRvojD#tP!0a*wbkMZ=v>YdR?dgT4j3!YoYf
zwL1%&tir<U#M_^)t4b@BYaP(Nzyo_AU)h~1ol`yHo+*a)G|pSRft+D@Way~?Nq=(R
z*NafbnSg+e+%1d{>e?c3e#H2ei}BMzG0y$1vq-a<igFL{J`Z*U7|7bgKP$^Ezsv!3
z;NjP;i?>|a+f{lQd(XLVq;`DVmu>TZs(=+keMVv()=@*tb|HC;9Nx(%83bJGTW3I4
zB=U;6&(~{pm2W}7{@;P|ty?B8joqb|qkc?2yT#Q0I78spR+ajax4{X;=AvCm=d5Wj
zSeV@R3nf5XY$wOVn=%={gz5)5>>&Nk%FqlCG-k3y+qVgjx-L8jP>ehEPhW{n`upgq
zBNY-ltLX5N-K{>m#e7C%W#}5_$lWxiZkU>BTGzK@Pwc+rBq>U4;)~6y#)Z=l$u3n@
zfzTn!Y0>6DQp8rbW>WLO%PrW`ikhd#cLs0=Rb=ndmpmK0KNv{4?^_k$Z>eIfJX~p}
zdsTbcr-U!NW%R2PhZAm&3@0{C(|Ata+_|HcZ6E71+VjF^xCB3V(2f(^13oq^14Flh
zXrEs{$v3o^QEhf^s@bV2kfFE!w`F$sBXNB&Q0^&RH7+_@iuI2<7OQ_a*5AFctJ5>L
z!NuuZWC2$Zx`=H79E$Kc*6ZauwonGs#3k2RNmkvYE|}^cH|!Fe&YX&4;IQe@FOPCO
zVGhE2-VpTFnD~7r`)<Ti#izr@do%m-H1;5vn|IxF1-rRByZOIJWZv>G27?h^QeJAF
zIj5(;bt;7dx)EBzDi5`so|bvJX~(F%{29Z8HAs;=R^wvE$$;ayhz&Jz9n?<wU;+Zp
zYRAld-XlZ3xLK~b`7@fZu#ls5{;U^#-uPTO?FP|H0a_INncfGGzG>yIpWikE_vxpe
z=*+dyoZWkzXbQtp(P93Zm^S(nc(fX}c$zV+y@o=hxWp2zbhjvUuusdcf~`^9ZCPZ#
z`vQ`=@c17+{~Ji>LWwVvtpS0|Oa@GTaVWG!VAs?xJl7^Ba?oh>q#Ew6i@~;zzv(t}
zO6tkspN!LusNuzRM0no?ZY$Y!onLqiyVd%dsbw!$NUnqX8CC|NsiP56&%<Q~Ow%?@
z!EQVK*BoVNJZntyGnos3HL)`IBLHr+{CRmh%4n~vQ;$8<rcRZ46+(%_^;LJeZ&UA;
z<p1-X&+}z4vAE(;E@|v5PL~#O7)vkwBsXKfrHL10<I0ET9A8+KvYB=_m$xF&)CI8;
z{%-W-y>Amp(V<D=vCT8-UKI1p?Maze#SM)1z$MizTgH%?i6t3_@&;3;bxhU_07%tn
zKSZ-6ERe(=({27&`G|iN9pW93@xfzO%xz+Pb6&r}1<FUVdmJFctBQbP$sc!TysBWF
zXkhU`-DVU!za3c__l%i}sYpKQrn}!CnCmh8=xG2SSnu>qf%c#@g_nFw*@2FMaz-4c
zrB`*cb{$9<6g2FJcBO|uH<-rm<-TMxsN5-U6S!&2D=uYt3~3LL=|1c*w#A)0;?u~V
zO-*SZoSWKndT--9U5}5`D_zxj-;cplR>VDX2<fqVoTT)W#uY!f0Dodtk}Nh|Xs0{X
ze%u0M-~M4WFDGg!5%l6`Aybf{_jkia_)XLyK03<045w>Ps;1&6N^yE2XWsIV;>tDl
z2ww5LBXvd&1<qT(uG-q>aGx9V`iWx_2Et+rBTiH>kc96x{6lnojKFph#eaBOnT0IA
zmKS|<o1_#7WlOz=11PXI)z{UyVJupLWFNNf=C%vOpD#=+uX*J|<xSGvO3nVX<|6JL
zeI!u2442S=*9ly1YOr_HnsJXZd4l4vCI~a?M!^?#)MXkZ-95qtGj&o2T_Y{Xowv3R
zdFd@r@$iGSJ)A9qo92c?hYs~k$)K*i0r^F|u@wq{NK2@@KHpM0sk}Wq(uEELLYd_s
zUyK$joCQXKhp%qd=&%(O--@=vgi}}Z)84Q?Mxipj8Mh?}t<3sRY#>M>WC+5&OtM0F
zqN(Ccfh~l!z1xiURXqQ=XGLq{`$gpoq5+32X0Feyc7Jf3ygn2JqBs+&Lf_!uFaB7D
zT`G;_D@-{0`C(-OW5vfF%`I%jZ$TR+4^fsI-Nk-MlzynR#pJx^$FqI1w7l?+v0Wnb
z0mT<BTPAQU5q!hTX3d;9UHd`@=t)9}yh@|e@NLijx;viSTsv6y2KIx;5GZpBp%);g
zND|StRQF{Q>*>MHBbD{(9E<}U#u?aq7%sS@S%ATl?Q>_?!&Jn*<RrF?Iw#$w88gI7
z;9K_5T7Pti%ZCP<>qpM$F!ax^5`(a^WAdBl>iNWBTq|yi4;ey`BFrRGzs7$xA<5%T
zEy8T|H@2EWMl65B^=t>bwh=Rti-!51pURiqH=*INF7B@Aq%nw?OtY5<#O9u$-kYYj
zXku{Gh|(GE)!V0H+kCxl-Ol7@Zi@)Y99gX{E@=Qg!@Yw@ukoL<$K8#rLF9(1$Eqf9
zi;-v0s`HuCjWT5AOR>&D0!zE`<G*`tj51CpO?=XBlA7_3v5o{$<sbXhsQbMW-H!$&
z?_?W5<r34j_Cwqg%T3PXG(oaXn<B)}UWM(8?C+nEzN7!g1yGnmG=KXrtmrCB4~9+$
z%-1X|5hFlMYyLj=^_|Bs@r0qcuJ<-%?TTdo#xhWX9sn}K2(Pl);t011btY^gc;BdD
znz1tM6Icv%ibfzp5#KwRE8c&B`oHc&r0sy>GGJN|>LxSN2N~zq|4z%>URd$mz(xWu
zfm7dVZN=iD_VZDc%Ti=$u+c&8(P*d&Uk&hw86QD7MRY>+#&>(dZ6Q@0-Rv-?$AVpn
zSe@qRs`XGl5$&)YF++?kjGkzqElNu9ivpta%YDCTUU-_AWq`H7(-m<SV-x(@XgpUL
zqd0ecO{Mqh#vbDr8RtKG#PMduf8C<M4AOLh`?A$lnIQ!PG)Tflj&MmaKy<Lbe_){K
z*|{@>4v+s2cu?X3qn9Y{#B+yEC)!PPq<YVvmw9bF!!cyHVI4yh*%ca^x3HBtL0z<l
zK()}QahNH<Xh9q+MBgbERbl|F&_mP4a;Az#J~UM78aCBBieq6harbNx(@z;IV>ANw
z5keM)`?&++Zh8w#un451QgmLuhRqoMR<eo`@-9uMpRkRFu-vFPImu8^x9FFu()Sai
z1{z5GNF3zU+F>q(Rcz}yxhfr!>~OlnHtS{Xny)kt1<XZME}OByBYR>|VHNr}xuHRI
z>M1~bSH0ZSMl>-%OmHVeg|yS8N@we~ZHS>Mt=G*8-(=@+_y;S0^z#lbBMb8Hmdm*}
zVJ0X7{%_?aK}eoOpqJ7XXJVPQR{G?Mh^!(OW$9z}A}fz%op=wbDt?Wru1MfoVSn8w
zcVSfi@`@r0h3V9(e?{Nyx-0j+1wBwYc&ZB0e|F6<w;L5>#kNo^#l$}limixiMpfr!
zy5pSq?fNwtKQD;$XE+!l5Eh&&;k>19ei^;$dgOUWu)h>hDcWOoNbDHa36V*Z0%#y4
zm@~oz>nxdI^Ks2h3DpG$VO@y$9$&|mIOyHT_CCziEo8^~_+!>HUH5AlR|RLJQ~wOu
zJiHX=V3DgUEyzF}r^mQm&j0lzSZ6bKEA4wtznFM<<@3fdKmT!hw7ZLCk*Fjn1L}+f
z)~<WLHa+V8#d2?IX(i&sgt3eBN407A)BI$>t9Er|SisvBA7X4CB@t;P|2f1ygX<yJ
za6^S)2~f?_OuiO$QQb)4iIW(oD@Ax_m>2~_wb?inIEOi%T<Cl6K6+IXdHwoJz&VlN
zfktBQumfc_GcUsM!IzEAMbFOp_{4GNOFk<u5UrM#SFD<ZA2`H<@0V6!o+HQ0>HdM7
zcOyG!G@_xY@c29D8M~eYjZ=rSzTf<xMQy}G4S(syqD&Q!T*9A_4Y>sr8@|=LUY{*S
zJ?uX`E1#G9@U{Ru;_2X$ceNpPG(Aw#1Mx@~^F-8X%U5$6NhyiZ<l6%V4ceMJaL}Nl
z3*qq_**FaM6#q`~bvtZkK`G1%Xc{|Z2v>n?bZ6%1WkU{Iv)AcX$oB2NM`Snx){&F`
z@Qk<Gz#XcGDZbpX`o|UOMuB}_9tt&Yd6~>pJ~svxz{^<5$vpAHJ=<-X{<ka_6#&Z!
zw40hggG89wVm{@ra+&+9-^i3h%0Lo!h*&BSDPYU7od?)43QwE``OxAONJf&`?@oga
zWO5P(QN;QL&zL^lfnP=O=9gsm_KS}H^h@)?+i+8G*K&S^MbB-cs*p93A@h08N0TTb
z;^XJd-SY99)FsSngbaE`y19w(JeV<5PmFj-FK$(a&%C07+9ef35}-6LhAyh-Of6e2
zMZHgKTwYZ=Yt_wpuu1{b!Ki6>^bIvx8#&+eyeY3`XDCC#px+*@ofn+`&O_zxlih`G
zQH^dcW8K_6u?3oc-ScZ^w?0`b1M2D#6US9!{Mr_0bUp6A@LC>Qsj0B$ZZRE0OI!O@
z)OPMPoP$0_CDb&-5Z#{w%rn=v+w@V7b92QGTQSV*TSub>c*8UXGDH9x1jlJx`C9H5
znKpK`_DeT$)Q3mvczb&iCqahXZM&r(d1)f6&JBaFea&sSX=3Ce5Smw92U;X(z{s}g
zQj2x3_@nj@&~EVHxBpUWNo%>~5>~}v%EvaL6d}j_8TnalC!J0azlo}C(Ez#INf1&i
z8ka4*cJ=DV3-fE5;AyEzD~(LsK=s_9gA?a2!;J!d9aWdaJ3KhZH%(i4`M+Ci*&oJ8
zsMX_TqYWNvjjOMj;;O_sI#3s+HYU3MS7LvsY+)-B`;p{Q1fK`vOn8qgo=9#7KVMb<
z;2>8Sh==_@rCXr5Rl}x$l`7wj4Ghnw|B^>y;+*VsP3BF8-}uyoVannrFGlaH%C6@|
zV7;#}MZqP6P(sz#uRZ*igJ8+@-8`4g+}CwJts<Vj@#UZ2(aJWE*IoHxuGlCgGqHn{
zKyJJv>iL?c2*c?swa%r1`15>k$|$T!cJQdq__)r&=n3@&W<;s?#=SJje^v3V#v9iN
zu|m!;&eZGM*ZRg>G?VaTxRgR>R-EHz-oM`<=+x=cW9QCOGxiNSpH1>!e}m2=l|%xB
z42iA2uf67aW8VnP?#6;ymH@fReRVxLS6>Q0sgnhg>^}GI0wW@bqQeU7)~#CEvy-I*
zMZZw4Y;5c7klzZtP5#)?=bOFUo+3LousO0FE91ZR?=M0v4-K}7^uWY~6-NmR5@9h@
z;4jF8NC=_`qFa=&zq6~e&*wS1>Z+Km%ivl#9cdz@71IdZ<-|si99%}FKy3se5JMjF
z)ZWxDG8$A03cxTKvLX}BMM+8jW)u7?9Iar<)bnU?T6VnT(a7bk*9WsJ#8&`!K?oU^
zOq%3+Dy)v8CdD|Y<LZC4Fg5&P+Xn9<%%F?b|N6Rq_@zI-&B2kX1v(_vhgz+h2q2qE
zkf^6XR~r59^g(w#Rf)F}m_*`AmJ&2)+09n7FIC-}r!dh>wz9umheQ5v&t@+guysW{
z=}2j50Wj)z7ZM_&o@)yQZs(yxLleBa9GZRUHhBsRPMOTNjzpF8S9HR}A1QyZiXRP+
zOejL~^+=mK5co*%ZuN;O*ofv?^o4S(7+4<)kRk$$*IP1ag1O>+TJFaaH*OS^$1_zC
zO_2z*D~z}l$lx3>{1pQo5@312M7b+Hr{Ft;t1NysX*TPB{4sZJ*>%MRYESXCPoWCf
zNiicHeR7+Nlu=x@I_|WbYf0{ns%Q&;wLwjsNP2#Ls}Oe(;;$H7UfFRm;*YJQx!7Ne
zo1Qk4JGK2r(xYI79ZB(g`uxdAVaZwe98@Gi+~f50+VFeMF;mNjJ^i=v6u!M|zCM65
zz#d9pQNbyRF47{vr^%7=4LYNIMqp{MJ0xV)!hQh?ZU_LpbuP+o%rHX$tuB9u?i<&Y
zJx!=14I^fSb<|ZpolW%~gpRL&i(M>s5@0GaADMYV{#FJ}D<j@E0k;CHcck+8_U7O+
zaSy~Z>3bh|@j?NS3pVPR*TEs{k0;qBLgbQxQtQ6I4wm$QAtZdF02-D>xLYU1dcJoj
zqenJ82J{da&7uLxpyzd0iO;^>-1oC)%@Pkkx$Nr(&?UdBph6OT82>5y`<sIm;<7k&
z=&T#AsO8;a-x+}WOG{EL$lx8wP(YrOfUB~EaGpirh8!;;-EbtX%p?P1WsVh*KR}Tb
z7eM?tE0HHAUC*7jO?>1gPo69nDysVk$21K@2#tlb&ql3zd@lRlF8tVWE*1w)sFz==
z+coX8Xj;#GIuj;zl_3{JJJ<j0GSBDxLV*|nUs!D70ANL%kKE48ixz;Fpa$>pHn?(s
zIHKSybF-?>CEl}o84?h%C_-RhOt40u$O4i{p-E1(zIpSo@)nq^fNU{Sd7|Wc=krl8
zBUvBuC?(8?>xpTH<Air0#ssqYxV$MbpexvY`}k6f<=t*pPRG|Ldzst#!WUWZJeYVg
z>(F{*+MzO!g0Q9>gmC=0nR-1a``fAd#G`U^JGZw34B%l%_koD^jZ)aXD77)Cj5ub`
zh8Hsk7Pifo3Y4B=elN)mD58vDQEzM108C-@lqoGl=fwC`{8P5|ndcXXrB$=R1slY{
zLB`#inQa`>{0#|-JpBTIZJbQ%DR_sM;*!F@KZfMx;;xJb4{nP)rf`IP@^|1nA}N&w
zRK5oLq{x#n!QkO!QrsXpabic-1p7&p(tpPAL#}$pb;%N$04h`I<PAu3h0gWp^{diA
zO{cLI7NhldD;pMd@LY@Y#(i2h<q`$x+pONUMTx!g@J!YLgQcm3j8<l$$jDJLXmE7#
zh$FEZ8`M4!L#XO$raz`!$dN=77>s)3XXGZF$&iLxs{iCZ7pr5Nk>90llsk9Dj``bW
z%_oZ&yH&Ef>YuBC79~aGmJ@xvs^4grmj{5e*i=<ZOeilCoA$dQZJ+)l?z1$Q_QRbV
zHm%2!0lEpAMmorHw{%>ihSAfp`O~=eZebmD1;cy};>sU!#GK_p4_T>>z~bF|_Dsg|
zXc99?NUu)N0-=}556O~IsFiwfzhKS3enf^YT)LRMlwvT<-|*Vy%hrbmgJZIDU)}Co
z*Gfx1tohe{=Cx%;<t({wyLnGMP%H(Slvyw|l-`moSwH9Kn;mD^ymVLzqCf_Y_e{~s
z7%PLg<5l+nw%?Xb>p3mB)tUIOm*_;Yu?4#lQ(RyaazL3D4@@VskQHLwAmDqjiFIOv
zk#=2ovmRZZajYXrkfIJaF!!5fx9hGnQ$WjonRO_Wx&flF&@5QI>BqkuQ@M;4cEA@h
zYAOI&*?)gH0RiVYs!?&G@NcZ0x&)YrSuU$BzG`2TrdIbUKar5&UoslhQp_T0`1s6`
zCgr%~HkJyyf|7ga>u*3b|AKyS8U~Y0D3!yyIrPy|`YmT{j*3DU3>$OnZ3P7dlbSCz
zjrnJ5mK{L2k|fY|7A}vNg$q*B+cp*|%Wm@5|Doji`t#{q_)0LPP6_L%YH^HoqGhXA
zGBsO{0vYh-=q0-@$CDG)Q3@Qc<<SnU?$`Ra9~!Xxp1KSQu{y0u`-t!J)9^-E(gZbQ
z8xTfbr?#JF*0G|>H7tZL^RBT5#_viSb8hdmTP$uW8{6hd)KvOYI1Y8Rnqip-#}Am<
z*!~SCULnnx0cXK?L@X^n6+C<qcwwfoWzEb-O`#z?oVQ;B+~h{=s@=5q@|^Z7A>jsQ
zAwBGNLFO0_X5&@%v>`1}!Y|G3gTqHys7a6D__!*@(X{8%6Qm{gyHP#and^%G-`;hT
z#E+X*ui4of{OT64$Pg<25bqxY7`!4Z=I!lKB6qV^uZ%bwOznDOUbv1!lU4_SnReS%
zmgLNEeMSbZKPBRxmD?dbp5dpoa?1fTm2|fYD8!_&tuqY<so9)lrzWpMpM>!|tE^zD
zJb|SWaB@<25$I5n0RN&sj(gVe{{o1RJ(^7?CrH6&L!p34!)+hdL9?<kwGs(y5G83>
zev{rVz|gT1!}iuS1q0NSzd7$e8)k*+g&f8X^hoV;Jf1J8sCy{&q=cBl-hE%G=zIMD
zoDcpqQ8zZ}*u4J4t>Z}S<(`(6AOVXHI%kOn7VtnJRtwXe#)iP;P*!gp{tfG>$a>vm
z&@!Hp{DCHQOop0F51-eJCn8?05OEKU+8Mi#Aq)rIVjHtL<+2UX>SAWnwWPk1LUHBp
z{<@b_eyj^QK6Oo6*dDic<eLWz@uB<w?^-()?pTsVCR>4@yp8~Eyc|6bos!OzJE`pp
zH4EtBFHg#^!zisD4)RO_0`44LcI+;X+n7_6WygFFfyl`@H?8UJ(boT7U03N7>=OOy
zFawl@zP`T7O!_@D9#A`TTtKF%ix<RY)=T~RS+8K&9GVh_W8**Y;>uhK^ijDrH^Oq}
z(A5){BW`qB{Y$P7dz8|401;iLyE6zflZ-2xjQNUP(nhG>KYSrY$!s4+AO1=ql6>5I
z^k)O?k-|FeAE#zalT@KR>kGak86-eRduP@)r|&feaZ{uN4%0%p`kv@^kWU6o3soP`
zqXh+vx|k8Y^oqmFDJQ(0|JkK{2dZcp7s<xkK~v8-qrDz9DjBUN6Q6PU+enFx^^BQG
z@7;#Zr`<E;Tj}5B-yqTvbEE?IlxCde4MB!~gy+_OyXaT)e~Z;Z?UM9!{j9dmWW&jP
z0yC5CqtjSCz;0Fn&_?|))m<D8K`a%*(|Phl=V`r$q(h)<4oW&f-1KN3oNbSO{b0DK
z0S;DsKPx!3Q=dtdW@L8Cb>O@19#yzn$&B%S=`qYcD!<;h@^KS)hx0}px}xvyO`B}`
zUp#Bwu3dil(JQh}J@!pV_+KA1SVCD}L(Wo3A3!ifmI|_8kle-<0cR6Lp%EF`+BEHF
zXvdx1QoDR=qvO=G<<?0W_<DDDiGbPCyEDA~Q-p9h-d<~Sl<S|hQm8;4G2{b8WQB)+
zsH$IQPG0Q`uehv&g5Ht)J!9sgEQ9^Sdv)$;+lU;SfZ{^g!6N94T(>GA{2&LrHwT+j
zjg%EmFS&OeiDhJXIy~xE6(4_BHj1+pNvwSAT%ZmPtU_^Vdb;6?4b+jAvu1_PtWoor
z`f~G%kRHJ+vipG12GNG<$e;u8p`vA1YLjiuw=U|rZ-TmXZYje0^~xB*x#jZQwQpZT
z#uv6W;yJWj4z4nMs^)izDYkv#ZbvF>K>Wht8tx7a?O~lT#^keW|6Zia{B+ix>>Tgv
z1Va$!gt8g@Xf0}(m?iJu&|1v`*d%4y5Y(yN*b`SW-*X7QeM;{qU5F8%?sRz)@4a??
z^SUz>FT9!K>96O6wndCL-Tn<*wSvF<3aSPdB1{bJ$<m-4sT2LEqSql6`_38gOG-Op
z;qtHFzR;Fai25WaO*R!>jXw#`NRfIkBOWL}nR@2d(80z=Mn7(7)J461%6D2g=-{og
zrbJop{LUS-egowdyCqN=22V2!YGZoIq=7n+BFLO8ZZI|3#l>Z&XXHm$W0>$}BXohc
zKJqx4w2jK&L4x*DrUOuV^5Cm)XwZ9s(sS%{@ITLEfusd7L?S6hs#I=epNC1<NN$$W
zPPbIzO>jE7{h|S8oMmOIzh%uh{g<DgL4sgM(hmshxTkwT!dhBt3Ez!UE^RV=QRuY)
z^FmN!+hiMyCT*efh<61Z!ZLA_j_)W*^1WwU+AVaNpP#Yj=DhwHR?H{y@$s2iGY%pG
zpFKV0o|Uee1OodMRj^PpiI&`&Y8QH5_2s^&35gfXt*?9H@unKG;*C0*K0zTc6!OUb
z=Mh3jJ{tRwwh#Gz$=}<{*M?@l^S|#DckxQS4Uk)K4(&TPTBN^xY;sN9iyUch(WW(t
ze-16AuIr-Mc1wkkNgx%pW%N3C4x3>c4fYGHMX#9XNjix706g`F=Yw%HKI5MCJ+hOw
z4gLCrO9`Wh$IslDf5R8*#BpE1zm|X{*$$dsNIQc6WuNi<_8@fi(0!qVB%bnFMtKot
zmIHECy{lr+v+IxW80wE^C@^@Lh^x?9;0AnieoBXu4A4^WJ-+LNQ=hh~oFW~5)o=ZG
z=?QINkG=}q(lMEGcVpjtcnn}Z>tMao%og<#4YME`MhUOi6=u?Dl%Y}m*A3L&LFxU!
z2)Gz05}_2R?gh^Up`(Ly8$;|0By+-tCUi)v2~$0><$<uJkY}X(hDS=$?g()F<i=`c
zn-f4<VG7rPY1(40DwP-!XLP4)X|H@?%U~f#w_<telm2XH6sPJPhP7lXTC7<!4Ac%5
zL*L<3TeA%k?q$&R?t<-dTK@IF|F$C?!8o*wc*Qe}hKFPapdXF=2*=<@y*eH&*V%M$
z4kjRn+pwc+w?V6ZM9Xq{_Qb)`{{y>@e!YLrj{z026k@?q`DNZ3#!Hth5fd@a1LdbS
ztz7R|6J;>n)J<;c*8=^xx?UR6>cKw$sWKcTfnkAifsmhrvfB2Y!|WX_ro>QLu>q7u
z9E3^jx#@_C((Zr&l)Ac=4f7@i3`J2W)?TatdydL4wIYo&&v{gJTN&F${>}l0hv`xf
zt%U)-<>JNX3ad{r8dzN7@h=pA-EVoBK974$s$4ZfrfkqK+9FpvsM^=Kcns?q_f+T(
zrZwVohA}75`poxEb$vu(1Id^$$#gL$D>Q<b&-lWbu^v6d9FBYhtBPawj)JCgc;Mcr
zzupK<%Pfo@SW7Tam~pF26!?Q~Z#P&2Qk(6_ZiKn{>uUxIupqWwvX54c;4)R(4l&f7
zKv2|e)ORJC#f$gN&6$(DV(InSN>22@emtPRnA76pGm?pi!{AyEeiFB`AE;IC`}_-d
zI%nKCH0T=jpP-yquU>V3=9ydhF0$^B#Pf-0Ii1av@sb!+D>@`ip>P(TL6Syptp1vN
z@BTkjy$4*+ef!3pi_6Sqg{+M1t&ECf@0FF&AQYk`EoCHok1I+GA%u{$BuToo2o)-&
zw6s)`Bt7qw`~SS2=YIXK|NXiz;`jT0Kc90P$8jF#S~;&2YNy=YkKke#vB7L_Y&4w%
zON;<TjN#P`K9teF%26zL<fVw{fXDoVO)@M~lFcweN~GMQoE3Mdx)j;xb@HTwbEE><
z5$6Y?5sRw8gP5r7iW7U^U%#p&d>}NW3qr)fB~Zwsfy1b98g5;(UqtRHO9|R%&ZpkR
z8czuGhhRuD+^awI0OB}9{{vX_fsxYI%StT45RmzEMcP|rvaFA-OxbelY0!H<*y%Fs
z3S<HQ$Fqm+lQl|Wi3~B^asIgSMx~)eL-_-0{yxv!f{FajkbOD0f|z4MUtHLQ52PiY
zxjjdWNU}(b7b*Uk!;RNH`#&u}zxky~_nH$V#b;COk}zv-AGKNmmL>bU<Pr?A1u~Wu
zlod54LsmOtBg#x<n>ICBzP<LP-&q|s95_kVa&rS+HLd#oJ8PZ4xkI1T_)7pc{7qiG
zgIjO;kx^W{qzRD<2mVyaxqwv0sr`IkAA+kAi$ww%-w(!$bH?8eKGq@s33qX*`J)<>
zyE$iY7Xa;s%OmnWeXOpG+JIbmX!8oE<FWDFQQNUgDQtjRY4*tgtRVhJGRExLQGCXe
ztPunVUrmoKC<$%d2mAbPVwuUiSBBydg)pL>T9gh73dQdNnuvqvT}VMOl$B5Co{+7i
zBS!{&sT_}dS?7W3hbtD${}f1#kR~b;*inJy6TP%$nVF^gYz~G@sKV)18?RSXpuiAk
zO0EUrjpa0Z(TRfYOW-GCh?}o;yclbC=Hn}9?TJTI@e>9*njleFZMciW{PRa@g!o+l
z)@w7RT44!1<~eX4CS(m|BY^;tgc+zbeXE2~%N}g%BnX_sV=>_tg@uT-WZ@!<Sb=j4
zEv>Iw{OL*#kA#G9j>%~gvFod8*fv_)trXTmI)Fd*`Za@{I$QnCi_%OU_*<DQg($?U
z4?=A!mKNt6awM`&{R?5Yy~{yI0AQycBSy55uMM@&3y7_*DMNPH#_tPhw{z#tFJHb?
zz?xlj+3VH;JpPZchxLgjstTS?^ZqldiOvMTZc1MH{b&?W6NBwtjyoGjpC#9!*d{FV
z;aTmOV1KMmJ<9le8@e0uzemFV(tZizoL_ZiyTrg>P7nl;z-Ho2Y5cN)PX@LMH_xQk
z+=o<sEc=Ddl&!9h!L%r>6#>ohMIMLG;`a~LkNSPEfa{VA^F6B}%)*EXjkMkumg3Sq
z?7zjpZLo6U|HeN0De2>}94hy&|Jg(~m_7QQ-2#it)o`nzI(FD0GoB@K3{eh>RV~#p
z1<2DhhjA(*NWwZ8@<9lq-{pRzNCWNe;P6q%?3zI2QNbDi`dK3@)?~}e-4&Q6v(~+d
zSekM3ICLm3My&nsq>6V?DZ<e7dCW4L%0u$B7y64@#hKI%PT*uJAgzXFFVqkFIcjrU
zhhNky8Tn^ym_pk-FJ5VdUsuhojX()ovi@h=l~zu_ThS*o5Yj3g61B(1<bihKAIzhM
zf7Wpts{{E9yeyayP^tj1Q2D~2b{o4NQ^jQ-MiscDPoWxN_qxKF=e--7B5{j5tW9!+
z*zSPgI$G|=6feuJXrlRI?N1L-bUaoiBU1MYqDSSh20Qg-cUDb2aiU1XEhHAPl2w=h
zm&~V^APG82E|7<g(VZ;GY#`($hlA2kK`ZacPh3cZ;71D}Sx=|t`Fu8Rjf+UlyQb^I
z>PAP%pw$rT^iIKcp3Dy#`d5$D;dk7G#wPewbqRn?5-=Ghm;@6cH>DIMGxh@IBcw<F
zPY*Ex{RsO)<TjL}MKLB236z0G5G!B6**o}0v(D<Tf~FhxeS7eT5|~A_CQP`Ke}vm%
zYjrb{<kKy8$y{)*Pq6E5gqoiV#DxHB|N4F0*S!*$b$hpa9|MLKn7PYYrj>?oD_9^h
zLaeo)g7%0y22Ar(OxnyV3#Uw<F3wv>f#_#cPWZ-jx1+A<KVSe=(GZCl5yMg>AvbLB
zzy6`26O--97Hd8mJzCPJUEevsNhumOP@v~!p{PQkqKIU3H;wthUCeUGtW}}9#t7%7
z^$#t!+X(1PWTHq1OzTo#sZ%dLgq-DGOg@K%gvhPsw#$-p%oJMCZ<bZ%Y_PD1p->g8
z5#AIPr3iDXBkHyPsA*aFA<Ivx85O$uhYP5Tj7e#7Ae4-V%oT7lKOi(7P)_mvmaH*p
zPBM$eRf+q@uk|zZeO>r-QUo7f?zrckIYt~31U=o!-oLjOQW%PW9+2n@*XMCgxe&xy
zSH(aO{zWIPpghLBD?CIKl37sZ&MsqZid(1J!$HU<CiyJGDnTHQ=zmm~?eTWScVlGD
zH*WG&TVi?(n5{S&3u<SKtp}NpXn{Hy-2XQcR}o3m)dD<Oo=sfbWVBX77au)()X=3T
zMHCMMG~o|fV~}okXAtWFg)%$o;c*l(TXkxbz=zETDO#)_zDC;Y<Btc%wo#qdLr2HR
z!zbw+8Wp0MR9}#-l<T-rL;fJPI9N6j8^2+1mBjsBjt)HtSkDOabd7YIw9U)zuWtjq
zjyHi(dsPKVw;b->yJn}CitiHQh<!=xr%>>Vj0&({Hl=}`aDUsKbD?g7=|X66fMI-P
zGyv`V&6u1eVktnfbpk`6*`HXpi2l?ksZsbmyZPWVkd2Q1S3AF2Ka$`{c|f(mss!2%
zS>TEw_zodmaz$(BIhnQZsWT-!b50JPOHE0+M2BV)u>9d%k*eYI!eli1Sr|qJX|hS~
z!FJ{K0k)YR=VsW}*L$D7xq9W<YOl9%cL%jpR*zR1;jk{S=7a5dlWpnYWmh-9x<A*y
zZ+O_&9bu<O&V1c_^Njy`uRihDh(U_Z4c6Rg5TJeMSoMRov$Z00zRxu3qUfsdpjlar
zb>Y2dNlE!lmyh>IwU5v_=h0wmb-T5!39&=A2Q{3ydBc@{n@#q$o^<fTOxFR8kAV>r
zSHIV0)+a)Z$bxdKJvBv^d)>C!oBCh5k{i1EYVyWh+65Nh{EMFfbd-78=%zjQww~0q
zXX7T<<CP{kw#Q?hRUXu|Eb@iFw}d+*jc;A@_oZlK*%km--SAw5EvN~F44Z=SW!JsF
z82R#6R#+10pF%(KFQ|aT=O6xk9%Lt58OYA3*%UI*w@5ePt!Lz_E&Fe=k126hY>}q*
zXC=x&uJ^MO-CxMw(6F$v_;Rtk(1q@44NDqHeGU%lNJ1c&rclbA*gelLSt}}z?w$Ka
zL&gg3E(c1j9L+{{-lk>XZ{=u{zdpKXeH~rxkrlVDA{d^kOU<!P$YHGS?Yu7ar+lhF
zixvRCM5<x+fXl3@fCBIvSFc@TYm=e^y9+%01^wSy<!AKq<HtxWhvN|D6>q9*kyTon
zh=k-jrTq&GySG?dmsC1z(LO%>^`_CY-Iw!BZ9Oqe$F<60c2U)!-|y$bg&y(^fxQx)
z8a5F;n_fNQW2L@<f#=vn3euwMw{Q0yGv*vzCSp)H$45hQI`VFKL5eC1-3ILaK9JiE
zp24z&e_4HoxG0g&RJ5t)b9VM5780BrYLLzIXVpUNyX~X;_7w{~{&b$7C_Qisq_7qv
zx5Jkt{ikPSXxe@0LGu4y`0qV%Kprth!MDebzqPdJ=G!n<5QxW#@m>9ZNv})-L=FT|
zt?QS}?`7%~%ZXeW+l7$5iU$K-)HrQzQ27v(sq5H|MP(-^jdxfnTO{-IV?Ye>R4rVz
za<Q$gEumoO(4kZ<Kzz7;tBxD@^TwUG=0UHMk|LoZ?ZH2!wO?`TEdE#{*2OBxqLi$P
zpWeqeCYQdz7fH6InqXedHg}>1v}A7t%tEn)uUJwEnz}Auu_6|HEouEo2v0RgENH8U
zngomQNhfwn8^`m0mYOO;6jslSP*w~YwGD`rY*w%>G;LwY03Q6;bNOBl|F|bi+rItB
zX73_{Cv!$zqeT&xk8vZ}%ngs5n%>j4z|d{OXm{X(kkSe?HH7h_9DwVwO21!Vw%H8_
z8CdP}QfboHLreYEA?0CF+Yts!P9mg3RU=CgDB@A=0oJnVnlyBn2g^pUX02M~gW?!}
zesi4&#8)8N9ql&uEd9$OL&HRjw%)N41B*fK=IfU)vm7(f*NmU*?6cI*vw<GfE02ZK
z-WJ%or~-^B;9arebM6gvzbrincHCn(XU-h&5AHx7M0E~kMCFunhA?2F-C%+-aw5ue
zS}Zx`qdrwuRiSm?$CsmOd5l&9!|%M6nLj!ZIua+mF$rK>S^|ZYJ=Q<6=~&D2IB-=t
z5=~0%=MaicaR#9%E($+?p2U`epfm{p(mB=0e)MI7_wL(QPD5H*VzJAej`bXi2VrCd
zq5yw_v*%8G4%stSON-wd92VyDx{w{&Pct*SPu2ApvbSu?%K}&R03V;68)rj!^iNk+
z^M_^LwVq9gbirSK+6k2emVLZK!wf*9I%-r^=!?Qz3B`e&M5M|u2voFMw7%sJjvM_6
zjw3X9?!UuhA37UH(3Gnz`99WV%tAfAm|u0Z?;>{9FNwpboNZh7^v?26vlcC^#&_u6
z{YAr8<0gPYP)y><3iRyem+~<$kIhYv-dfW&_bd$lnPG!8x@f-}w*gn`tir;X#39fW
zx{G6I<Fc-<^f^DL@7cl2Covu+&RBBIX*YioeDVR}`t|JT#6_n^VcAX}9Qf&S38AH>
zrQ&_xxh1Nr)wGSWnG@Z+fYt$oW`27)p8gvH7aAJQqf+6>@h!)V8z;-RMAyZ+Moy`b
z|8g7>p<O>-dxKJ~0wP5oR{pEDY|(-RyV(Pjndx1|a%X|d`Eg_fGZAk|NFff12PR@8
zjTqB_ci@fvh7FUYQw`6h<$So-t+a#%0Q)ZW$4h(0?|^x!OEdN83&tZmcVbQkUh5Ot
z21w@gzyEH_=r)bXuW>6&rn$IL67UJ=vE~4zAX5t~N$VjQY=rU*JJnL_j6v!XclO)c
zdPjbC_7f^{yF52j?Ac7{f{-AwW4EFHKi%to{sb!M%PDTwuJ3T0;$^_;+i%7sBpf5J
zGEyfiCY~oGT)BHU$$pXzZv!*hDbuE%hUH3H|M25mLKmF{yIqM@6H!mFdLmWd5QqQM
zrzI4zBEvU0*p<G;l^t&I<Z1bzBQS^+X_>8UzU-&GwKM=>B9V{zzDw7yACj%wl&V=<
z3py(+&!C#cAWC8upN)J+>dL(vqXKQ^h6VPkl$2*I)}jEp6i}hdcAwND-lVFU+G%Pd
zSNVq(WD3M_fx0p>Jc%!uz(;v$JKTQ%4TI~QUwyjE4I7)5-?v{sbvBJbR}17u3C8~J
z0YF1-I(CfVB1m{cMTW*PvF3bTr0&E)Og&9%nsjEboAFBqKB;Msp~=+nQ}+*Avr~Pu
zNnBjqfWd=%WBg7tmp0x;QrAJJZR^6+n^`;!FOD4@FsZ4fI%-qC9y#QUi;GBnH?V;S
zQjV{IpYh<qM`_pEwr;&8!Tus!2A1P(!fRIjlNULkoFvnd_kq-DtX;~?U!$a=qC#2r
zIwfV=<jKxf8L4Mj9Rh$ZVTC}<2jQSaLx0Rdum|W691_BuudY&bC;bNx9=zfpfr@aY
zF5SQXFYB*pI=P7BeiZnyS3p{Ac}W{KutwU2RYCOdAPC$Gsp^?-l$N}PP29(yuVp9F
z83q^ku|(`NZ)q!{CXyv1nCu5>-L-2xhf$y|T9;>OX<;#=LxO`P3sWP<z<zpQ9HjYl
zo24bvJ`qpu$H?Pr)5gggN3l=i5)}z5u_NSs7!fxvC_^#XWK$Mz$eb$e(u;AGbG&bu
zw4cfnVlRrr1-iNunVF;V;sKZ~O*Fjj=NI=bZz(1Wds}DMpBlbMQHzIxumP#DCTJ?c
zP1#MKs{hDen22L^J|di=qNGzv@75_os>G)gg4<P&Ze}^=OzfO;YoypwH0aFCpglW8
zc~kSK%oJN{@koG(am=+wCW@eO=KT3hhR%1YWL9b)-qCcfs94tlKkYCyqSNOpVK7O4
zvH0%x<;#~<>{68nEHP%ulBw)#qPds90JxT&7c5RA;e&9{wRq_4@}h>aLbuPO_FUq;
zB_zD`#Llm(JHG{Z<J-(xynNxpZV(Du!RbtEP;$!fj?LKd<HwUEHS(YCjZ1c6*@moH
z8&h-Ka5NRruj*a(=Xy@we}@|i<V4p18}YBERUSK7_o!^%71p9<%UraU<PH~Z+8$@d
zc1Jqqbji#@1O-fgin8>>&f3lFVR!*(5l^T-boA)9M0e1r$6#lIT~9xua)nO@jK?N9
z535n^w0l&WQC0!Nr#1g+gbExFnCb9EB%t6ua@(lkXp;$sQ@E%!cAR)4tDy%54N_-o
znS;UY2M^8yIDvedzuiCePujAy{r#2%3OBU-_%{bzTw&O!P^V_$9UQ<DlPkg_L0NJd
z0MxX_n@69`9F`rR9#X-di^>sBzoluS$Hj*aJy2!s=AW$nFKjI>2`k<7^z?{>uop*F
zwBOr1P@M>W7_4uwOjkD&ajS-HX;<PVGan;Z(dPakvCE+oC-OygDEqD~euS_{MkWFi
zQyy&5>`@mzY^T$u{xUf1mpqF-c1(@uvpaVbZHCLN<eai2P#$<Z1+KXJZj<&ruV&=@
z)YPCCk!$gGr1oiR(wu{>can2Gi`A#HIFrhr%OxsvWHZIq%Yk`mFt02$2vS4)ei~t;
zn&bB^f==~2FicWN&%Mg4D6=2^17OU^kFSr9Go~C)D#wH8a?EAon9c@xpQXQVhWRsS
zA17xGdQ9XAfWQ}S+&GMw!1V7cEI;*q%`&~i<YB@St|IhFd}Ii!y>wpm{nPF2?EIHp
zq`JjYwwJ1E?Yc1YCDogeZ=s2<U~sjPH1UP5d@J(}yt)oYa`zWieEHIYKY^c+Dh8#L
zfWrQ%nO{N<U&7dOgxeejO=h`D-MUS6^;PA6yWTPh4-ePmO@Ps=0`_^%vrWDSsSJdC
z<Vf78-(%8z)qIeDO3I}x;^iq$YFa+ZcHO$Gd(o{#UbuDZ7)oYcly(sjI@Gsy?=@rB
zs)>-k?;O_&yeUc*4{e9D>{Vp1E~FmCBKeesxc%rr0GdEl!OrSNCpukT>c^(Kyh2~^
zJZ^VBc<Cx?Fs4)%EMGp8{sj!h$jFGNap=eqzyU(nTF^}{VB&(etA?j9Ueb5BpWh%V
zfIn|LV+J+dmC{=#w#9EFc|$TsM*E>R#4l+*Kg1IT{++=6IMJc!mXPRTx-P&WJ{8vA
zZ@t}B|Ge`8(~PJ&&D8d_c8*2>tU*Tvr44=v`uu*4%JW~@xw+A_dY7$|N719|>+91T
zexjlI-TF{Yhh>zGn>g*tLXaW8JVg-ct@E+GK`+dpwHkQ59!&Gc-}F<bt;ykuehz-`
zh|aP$Zhx!pABukVV2t(5^C=9a;oW6_)?+fYBFoju!6K3IcK`i4M)XjcdP-q>`fKbZ
za;qn%A0sb@FtEhJAWqb$K?iypZ-ED)E|R{7p*N8QAwg=WJ=n6_@dW`of0<5Sch%>w
zrC1S^JtQTE#mXEC$&C5eH6VpSRb{RcWta^Sg%CEdfQ)Y+?HCJHL7p}WyoqQ!Dyrdb
zQXr6)$8RKo`iyxq55h9jB0sFh{&}3O>*o6;%~*FexQM2{{7H*p7y36R(P-y6wYT+#
z1%|4f1I0rjO?^WVs{S&nt@GZ+a(#o->asgV&~ASDS@}X8*w3)V>qi6Mwdmk{cAl@W
z^L$xy`BbJ{;m!-IChP@AcyN4_djfctMt48@8L9+6A`(r!k_&@2|Gw0%v(4ETPh+oA
zusU6hI^L)WX^{XgiX(UTv<a=Ir(Xie6ZAKEV;^m8Pg16n%4y{(U0vLc4}2-v)W}HZ
z<m9%5h{i3YO6}0O^DJZIUMv%k?v_6E)tfillDBe@c+T<)5efIy&~Sx5s!DqcMANkA
z5BK}M+67J0Rbp&$d6>-*61F07yrK5xmA$O^_Kn-I4kyyN$p9F37CBH!E?)d*LaW(|
zz3>-FJY%6Wbv5fc99cb;kEHnGjT;*;9H7pfq}YFh`k=;vWM6)UQxDI{7kTm_+<_@5
zn-jUWZ^myjH*Y-DJK2Uv0;WOk?W3g?ni(|lcbYgDd6OHCQV}d)aIxz*Z(e@%=)=3b
zQu9%+gVZS~2>!>9A0Iy~lgngclGjbet#$T1O@LWut%*1Em2oF~PA>gXw%oE7@VlI#
zWgL_y?cJ4yBR0JqP8&&wqysa>I$l@q7)0QRmr?EIp?GrY7FbzZuj90mDRe>U37O1W
zn^1^q+x)27zJqcEjtex<9bAOK!u9KVvgFU3HNK;(w2vez!@-QF&*WrydU`f$)JWFQ
zxq(?~oI~S$<>Eyr{6*8o3*HM#ZV?w68uZ^S)QgCe#p8${eNJNj@z9I@*S&cci_so8
z{rs#RC>6(5@bg?oE;mK^)GG^tF{V@TBgzpVX@ddft<2!_9UEY(eI`V0YAcffH%J7f
zsUL+-1!WiXT>b_gurPIGLDQs5Q`sKpOriVFmurIHdF>=8!u+@mU;SirSyz})<*%%F
zW(!+na<U8NUO<ouoQ~(Qv0I0}Ub*gSPSzY*_Bwt%CX!t(EXthTEB+1@9MG6Y6-<G=
zfODkyq(Y>nvZo6YHh{+L609_9QP~2-xHC{+<~K`z&xZ$^kxEl90+a=(U%7%bVmDK!
zmGIluA<bv$${aee=3(j9fiAK3!$Zio^W!q-^tidyk57m?`~X$$Jv9Do8LP~UjEZum
zY3Jj+jWX*1ZtgN`SkK1Ssjc&EJ@HWR^73jpuGa`xID#+#sh*IH!o1e?=H@1GXNR!Q
zMeOWYmHUh%47$^T86<yc$lMU_3&IY9)KJ@zFZ<VSnVq-Ylp%q27`KhbBtj5U1VxR^
zb`ZsBj-0$)+;}Hw0I0Qu#sr)1L7PDYQ2b=_`x2z<>UmXZynVm%aHNmJ$q<GSiH$a4
zo<OQQ>3p~1-|CaJUWN(ix_aSBhq~qU`}eafE$>=&8?y)%1fc*#1U<AzHE>JSfPYzo
zA}8e9tgfRsDQ^kM4xoXcH~_#C#}A}VsRrKe?p>%=#*F&r-qOy(#2y5a1;fB7QZG0<
z;)~$RI0cbO5Hi97{V50L)VXsp{yVD%U4Nxy+?qw8QE_ocIBz^c^TO)Ix=JKr?9qaQ
z|GOCO-w;VXPn-`&qq~JN89lS+wu073E_+kG(tA~A9~pMq+xrD2i9MT2nX2z24;kpU
z&!|z>7sgEH`2??7dt$occ5Hrl!hA*J{`X`5PYVFY&79u79kn)$%$20;78$w~u2)~V
zbm=s6^P!@VsI3KgAx<6i4^PW@Hq}u(@WF$Mw9H>;#p8gIEWt!oOY6Lh;L02Y+yvEm
z=8u#h)_<~;s=>KdUbgi|lT%V&k&1i*{e0Hlywo4J$3-mUoOK1-=olcGq17BfJDQ06
z5{{eZeEn^h+k}0#hXZkrd|9-HUzFTPAxs0|25(9BN!?-5waFQa0}!Q#gXVQ}@kupX
zmlj@i+eeMA@GRXr?EZ5`YnU`JhHnu0O}OFwEn9|55lu~J`}OGho=^48czGo<Dl7Mi
zJkfjj@Gpb>i~b-N#%gZA=dv^FM;~9TsDzwa|DQ)o2DS(D8_v-a!v=&C+6c2xQs~HE
zC)ZZmZafKO5(L8snVDjh+zBQeFC}kmVrTYf=RB`*Gr9WMhXSJJzbH-hPq%GDTU|>v
zRN5RU0)h@0P$W83W=z5zL6ibsPjS_%RV!|2KteIWj(U};opx>8_OfdBClNS!C76hu
zF^~L$Gp@q$?O)f+=~rV?%5=;1rGDH%L+_M#;eme9#<ZD_rq?b5bE_V!uQ}gDs;sn(
zh6@kCpaIu$EVR!q@)M}WVARmCjCth5!<V*@ff#+$#+b+ZLE+K?hsBYG0Gn!XHE8Jr
z{zULRqxKkQ45VfR=N37RAf2G`)X&Rt1OT}<iB~2f?2`E^<}kKSe=xUcc`{MZqs%dX
zEZL8pETAm)u|>k@MAVg7lz;h>H725_B|UTKG1c}Bqeq;$M~vW7JbpZZQOP;j7SL^&
zqED2kg%g7KBQP=^J%49~OoJ>VT|?cOat!8pXq*C3MlkNW@`uMEFBBVbhzBCUW46#1
zMMwXAY+BC4(qI~2)BoP3U2$l-J$DDMie~A0aOpH%B~JHf_ggW!v9-5@j%Jv5e|dZq
zr}Wsg*G19gnxP-2&Rf<#Tdl@K$#x$hv~9a~%O5#v(y{>zqDX1Xu&dM^yh;jqMtwJp
zmKK?L3|eH$ZlEIbw+m<ed7b|1Q*~va77H#d;vwU4LeIR|^~;L?{_2ui)bw4~AFtX(
z8{b`^r<eaQgZq_ZQ@#6sm|4Q?cPqcYOH%_9)HckwL6#rB^q`^HyHYDtUX5uk<@M9`
zBfkZ|D1SXdr>}nLJatZrm<TCgoH7~w4}liU_Pi(Y{>|8xMc;k+lB{fuT%}yl^N{}x
zI+iycA*m8TS?fa|Pt}ld*7~qhs&;%MkA7tb_U_Fh&GJQw)T?mFKnU*163+TzbepVV
zKPGP)5>$33qwb19s2<5c*m5@ilC@%3devi0iJ6Rl$@~dHmo8rXkX8-X2-DRVdi=O$
zha=JUB(%${4t1GjozVJ?C9s0B;(nFYzV7bsmD!&^zt0KH`4)H^LJ2sYHa5{a!{4YV
zKTxIU*LRm{gBpFm<n2+9@A^;tbjz^ud9>iSmO`4De+E7Dym&QdyG4V7z)89>rn6XQ
zpS+RJjYdSX%ywK#-M3r%y&c=zp5E)0eLE{Rck0HC16fzW5=wOjEBQLpK|J{VL_)wW
z@Q@34@1CN871P*voBEC$r~khu?Vy7pw+HcI86_KlVkPi-#nX24#TKM&jb&GllA!|!
zejemsw&G*Hi}~Uq_JiQLsKj>|ely8`|1f1c4M-yX&$9{CBEdHgvO?9d(lAK5D`#Fz
zhX}#hu!-ZzxYv?DJE@#iUUl3#ooCl+_o1*%m(?cKOYeg2Q+krE;Ac%`SN;3@hfp$9
zppxM9PG*0qEK#nn@veW4k|-82fjEvZ=wl4ngoY<$p+k;?C+HbmFku3+72;UF4c~+_
zld1Z!=+|q@yI8lBd@Fh+I#H~{N8k+*KJe>9wL76|4?|Nn$IR3{o@%Qf@vheV{!whr
z2+!<86T1w*WQrTM2YP<=gSO@sOx6xuk)fJZRI)Sce9r=x)z?*jHoD!sQImxh$ydUU
zh1Z(q#cuw*Q!s{K-9np0=oyAaG|9+JHOqLEypB}>0kOdj(U#S-Evhw3Qwz7j8RXAY
z$e7<0F2Xe-&#>$SB;%WFx($Dn4K5u=r(mE;p#h!(Y}bj=SC%ZZv)F0onZBnGm;LUk
zoavj1B1PkJwEFkI_QG^Vi$5K>_1b`U%eed^-vPw6*ip1nxa#lki~AWZ@3wsyEF6-I
zJ@r5Q>$7+L%HIWY0Knbq>z${Y7cO7^9Pz?jE*I^I>F-9c_lTw%|D=q)5ngOlySw|%
zjzNhdOie6SUv-}6E6atucke#!sB$E-#CLRk>@GE7)f#OOhv_+t@p7np1SBHeLbZ1e
zFDdAVC%|3M97U;{Yqj{>r9uZ&Y1_T~e73FrVRUIlff5D!e*fMh`E^Eq!?G4c1guBY
zpME|DO5&F+FDE(dxcBz(56i>zuLWC;D|MZ9!N(_R`Na20Ns#q6hPL{HJH*Y^Q`dTy
zYd3_26&dE!eUc<XfRlgt@BzvUWPJm|+D)UOlN=U*(;V0+tbW`5vJ#aP8~<g+x3=D`
ztqjvisWY{xHZbcnefZBMe*e@i)PA2j@QvA!RPDEoJn9>V@4K4n_pi0S(o|$+SwJ`R
zmcQZ?5^^y_5jP&a&AvWAf`bm{m@QGYjo?10d@OTWz1gC==Eo1Yb~9Y<TY)~MkD~6K
zId5KG&Iui1iKN5=>tMw}6eB?%+Jzr0c711&^`~(;H8z1;N!-*|D;3Y4on@RwJ|s;F
z6_!*~_4BK*PVTAcDm~TIwq7{U*SuZeiUbygzd*w?g&T`(v!QnHo=>}dy7nF{+kUv-
zllU-iAGOy>^WL`^U&VvX&d)!<QW`oHnL)!2<*sGUG@Nf9(}$pNM3sC5w&NZeB5bG<
zB`+WdSO(foa_d2ZX!M|DX>w|+RpF~XrYFZPfI8No=L58(&Vnf8C3^<!^F@hVJ>jl{
z>)X2R)e84&s|HyZh1^tC3ROEAR<Yfpx<UBTzYXof{QUf;SGm92RGKupIySCKWsG|Q
zATBTfUxNz@qcII_D`gsfR)&7+7T<oj4AHNB^{RbvVa)C7N4Bk&__sf36VXzC$>zfE
zh7pD*ZNp=>*Vbp=x}P7u^=Ic}`zd-=tir~QFA6sZtv63E7(Z1Pr8Od@eZT?v`Ijla
zL1j5cOOg`^rk?=UdPDIY|Cx9sa_W;gmZ^}Ked+uI%~BTvUvdv>;K|;>m2;$RmYO`t
z)rn{140MX};>xl8zuV7X(<OR(m0%TC=)pw5_o&jdEDOsnAdR3?f5<UZf*2%tS{?l0
z*Ux&zt);?VS{-XvZv7heE5XF-!PhfADvM-?HKo+*t4HFFwMq*O4$fjXv#)PI2w};0
zAiI)TK{5r#0~muws|@h2J4$Z=bsP(j_Fq45$0rZ%D}&j`+<0@cEvbRxavk^%FyVg2
zY7ds}D03*N`q7}<In_<EiHUg;%y!jw?Q2-{sOaxidP8joB|Gh~Ugwfo^3y(d$MG*;
zV;?-!Gc^DDJ@9OyN?XI_$#&t*W6U!0?RItbPp;Dr`%v|vv}j20@TEl=rKw>-pG$6O
zm)&xFbldS^P{>GsMb$7J-=NJQDRp`u^6iYmUW9xO)|j*<=HE7e2BLaHi~xMN<wa@b
z2eaPsO13U^PhfpirK?aVN^^{nn#`|@K~K{9MnlWur^3LUJlVt4<9CES=x~;D6`^E5
zv6_PH6d_jIwL9l-7}*&u^LP<$Xy&&L*K1XbK&g_s?Cg?~gkZ~@rY&1WCnRjXaG=VC
zDh)jiwQ6F@nMoS|eAQYpY?f<H)yo};bEdCK(gS*?Vn%Td0}d*?aM`l|4!0j9H5Fe1
zG+qtrOysm+Wzsk8!kSP1n?ee2jB`ZKv4QquZe6{)pXJ6|X}(Zf!qDb2cJXYjS~>i-
z!-SZ;ruX;=aIUX*AJ3gf7)@N|7i1F?^!nJOXCH27#^2N1e<-y@M}ON)Pdmln)qU1m
zRA<KS2+!CRp8KQiv=4<_GRnQITlr~^8%G7y1UM3#mD(B3|A}lj!(6Cg5QwxNH4>VZ
z00-;TeO*L)mkE0?G+E4T2V)sEhxb9bA)X&>Gu}OYh=6&P`BgEydPQeU)m5VDc<yfT
zXeMAgZO%i}s9O%G`iw~}d-v~0tOAPHY53twFJHZyMESzwnd3Fi4K#^ZqD(Dc8tQC-
zf<{axi5`@xz3;buoNHeNfDrKamG9dNR7@t7S($t|$fW`A;4FY}5=F1xxpR^}@hE&+
z1z<$GcI~3@BCv;a+puwCHk@8IUI=So1Lt^;_msLES%ztsc&CL%M)L`h%uD!!$#VyR
zZN)VP9NMwcJ+tebyVl<47NnmqP2M+2e|B|<Mgx&IP{5l9RbM;VzG2zEj)oVA-g0LU
zs%54`^Z{fDJM5EOjKEhDRfD`_56v?Sa5%%=YT0IY8#0<K)CUn2G@16~zvd=+az2yw
zj08<NHZ6)7oW|Q{WdHNJsH)o0_4>l=O`A5&s{iR-?+mQYwHEInB$xZ-p+Q3N&>BCS
zy4vXPCQT~gYFm9B{<O#IckeW>Ma?0B5-d9(duIQ5dAe)P?w_~zx@8nrZRbb6pv@RC
z;1E@Cz}DPZg6L5PB!JF|+E5fShcDf6tl27F0;A2-|HIR0`JGh_5Vtn8X**->+TNLU
zrMA>QX!_4nHe=L*C`J@D2nk5|YCYD3-HX1~`qD1Vfd{(XIvkxP;#tTEeDs2LWZWCm
zvs4Q_16)Uh*zilBS$`Srtm#7^nmF@uw_8hDW&o)mDbLi}Vs-D*A%`z9!XIzyy`!<d
zd7CQhP0oF9+XU_1W_$X;fhJ%O(Z~y!qD8(qf!8UpV8th?M-D(&$rD=Fy7yB-JU5L_
zyP&8(cI;SK``ox;5l>&gK7;Z=WUS3b)@IopaQkO_xECE2ZRE@uGsF~2z9WUI(5E2H
zktq74$Pv2|{MTsyMv49m;ff440gfE-_08YhX}(b>#m4wRf6pDZi#f?g^*{4<Hf-9o
z7TGe7NQPdhc=+5Usg_Pa+9O_ewb~wAmK{FzTkod^F-aT0QE&Tjrs==cAW(qYffxQ7
z8k9#2qJMk%@L^-Sr}BId9u1qStFa|ZfhO#6?#4jYo;6T$X<aAhS>95k#*NjCpEjlN
z7MLDY$VK%x*iD#F!Z-T+Z%W_B+y9&N)eY|*lK{+4OTRnXjPbjCd9Uow#KTO56Nybv
zbgNz8Co^DG7CQQLha>yEe}3wROnDF8Y=sU&Tig2vVLl09{|c1O87a5B{Qs(b3iz7_
zVT{c`_`=rFFCYg(Pg3#2SQ}%!hP7T)CmS{pxH3q@eYQw*HJ{>_Oj{5oe=Rgm4NkDJ
zlDI%hg&1<tGh~UentjEmon6}XNM1EbaX60gM~)m>JTr5RWQlIm{oaZ0X2F8Tr0D1W
z{kIB5AINFFf{a#xpKaN4=g$Fao$d7p^e?q!H^P*>_+nGgD7-F3b`5VSq8KUEwQoA?
ziCo2&h5ZPxMeWFOhLoeH;Wpc2P9n`e6~kH%9&jQ?K~Ct4C$us+*yUbYN&mzAu<|w+
zHz*@|0iulbOq6N;di2=s>$M$(;jxp-hq>G-kn)8HqiBF!b$0EH*zaaL@LEdYHr^v7
zh}4FH&#nwIUx(O1nkaM?!=Wbuq(x{CUo5`>m3-j;Q+&5<8LpZWb{akTix<DGGZ19T
zArvSaPy^6YoRnd6h|sm`#`Qogs=U6f$v=jz#(hzssm1R&<J*QU@?@`|RN+Rw1*iwV
z1d6B7i)4~)IwAtlgG5iyU*^ugI}?O9N8UNOU+Vwc(1OMR(wUwve{T1^n>TFuh$;a&
z?`fDGW(n>t8%&PnaRW;sK@??2c3z%MztOq3Q(nD#0xNHiohcc4s()(rY!S%41GGV2
zCXw*YojYTHop+wg`oajfafHB_iX5k1qI`Y|89M+E5Y&4Jb0wCc=@Q`(3aize2{5pB
z%E||)`E)8sXnjeKyf{_&M|i?bA5U+3b)TfK+fS;E7;%bws8zocDajYIi#m@RW9Ik1
zpXW@~g-&BUt$}~uVx_;S%*tv!-|*wJu7u_+3F3V0-mSpZV@b9$8FUSYotox2CkqyG
z8m44ur+yL|e1$hh;pbgfYzlWJ7M1`<k;0Y~Xmlgrx@tR&23FNG+Tj9a*IGqo9t_5=
znm8l7^5~9-4j=w}?k>aD3LCr!KUFxTsoumNsH4qPUG{#5mh6hk?V^|NA~_w|GIRXI
z-9}*g2pk0Di@q5VQNO$F`yWC#DHGkhJR1TV^vKIMZ>B)1czc-zrGXx-1=*rpVeD}d
zL?nEZum%C|pfh=UGI93kk%$jGJrj_2TvneohNcd4Vj?`S2}V#ENvVsJ_E8vMzy2N4
ze&15RKBGr}^HNmDN)3(z^y+KVa;T(`5I06pAxk8MLUT%G7CXrQ3#tHTf{q#p2srka
z>S{oo7Q=@5r|q-}X>GIwYM5CAJ-yVio$Z$2TFQaD2CD-u!iSI{pQ0+h6Z>GTl4Yw}
z7SU)exTzGr6p$z#S*Ce8AmH_K>$jfF@NXK8RtO)Zi??ni)M(K_%E$Zh@YhT2#}8j;
zHaH$TCx-Pu?H7kphNB;*hXO7Zu_i`d0ZiseHbO9mY(=n1k%z+E2?&U}&13Yz@raL}
z6N-UxZ5?ky7Kk&G=e(Ff{tur}xC2LuUkBk{geE~o-zInc_<WUF<>05`fJkG<D$H=z
z3{HREPG7ENAMO|VM$AwJvmZVBEV&(FC*8jQ8%gUwO*s6g7wN~gCx$lTar~M{&hYj&
zy}L~GTQY%&y&c1d9*K$bfPntsz>w7Z2Z@tLs|uCzj;@?tzL$(<AZSbXd_U5%8|_%)
z%O5oPG}!QlVA)->PPJ44uAD!Az9^ZA+AvIr{ZG+{z`Vq>d8dYUNWv!5K4LG)sTcDY
z#AuWz(P?QbZ&)@ud`WS}V;nUJ4itl^OdJ{+BJI=<-r~e3qzQmX3(ITv;rObEyLa0#
zR0)H@ho+-LD~HY)z7N_%GCfI~sV1%ZzA$v<2XuZqA=3HUxq!=G31KAv7*$p=%0=YY
z58ZZ%?<ge^G63|(j4WJOcpd++Pgl+v8l%gsGaOQKt+jQ5zi7A{))`g#CawSS_3KmE
zEJSmH6N9=#pczxBJ;W+QuwsDeM*whmj7c1${{Qt@<lTg6H?#<d+-$dbK$gj%P{)oP
z#S>PVaE`1XJD^nJn;quk-wHhgwSv%8%iqU;n7p=ZWqKep7_b;V7cMCHB>gAnh6e}(
z#C#x+>`L@adF`beUTysL!DM9T1~T>Xx<MA*F0y1hm>dmMo4i3dD0KHz$;zBpk-jaw
zF1G=SgMf3#Du13tEC6H?z(J-+1;FFWES#&9L2^spxUMukMAZHGT$wJp7Cf4-A^MD?
zN7wQTqVi{iSk%j?lB5rV!Ba(?&Qv+NuTEHLQ}utZsrg7dYv|vdSX>`LW)L+NT2@fu
zygXT)E5E6>HgoG2_G`RjV4Qlf|KTTO5;FYbjEuU_7e#z#DjzsRv^nVWAPUx~xluxI
zS(;sCtqAG+ICo%R>1q?Y51@Kq8h@m-yQd!M@DUKPXgjQL4bwk!`IrdQWKezAu3d~X
zYBCE*((Qw8VE_L8e&6~*+w%5A&jRP|zS9VmPY5#j1FgE7aA^gI7Q)EL7eyYNr<9I6
zbykNHNjeR_vG~SV6%%@VsCo!q0LunrH;BY}WM>G8e*OC&#8AwVnHeTKXY)Bl$}2tC
zr~gh#Q$n9{K$3+}781Ioer22tNv(ZM=oRWt>4ce$)>Rr`TddBkxIG?4NV0R0)#Y}x
zH)t}w%i3n@`;?WBm#y8P?qGq$FFlY}OG<xHb^--}nWM{7M6o8~Y_dqN{{7cqI3V;K
zP0Mb&Pr^K%YGIe5B!=fNUgXP6F)9TGk%&n>LqXi8bLYd)*M7EK@XGiuQ}&CIddHA_
z?cqo<1X>M?LQNw=whwO_Y_~Kc;&9MqFiXV8@V$y(PnK$J48#|#wODbJacA)`j2z4V
zroO)#8hRd^7_owuQHS~S`ynw!+zf9iYUP-i>3PSyAv+x+Plz#NT~Zbm%9fD1zEC=_
zs-n&|!M$Y6PeWEL3V9$(eB@{{eFCCF)gwhKbFpr4sTcb<$6zMACf~cCx_agL$k)^{
zZ$qsI@h>mkyZ5S-N#|qJ7!7zHAK!cISof@~tnr)8`0FrRDClIeVsHmxA7}`Pf+XMJ
zh-c0AyMayTipq`235B6ej`B@KsK0b+-pv_r3b)B+AgP3fg{eOz6EM6~b5*C3@=-j^
zW#*gd<wX0|E5IPR8xX`yjZ*_(>QBBvCYm>HqOVT!n>R1tzi&Q)6(d5BP{!I0UTnR%
zBaT=8M){M%JAjc=(~cae#385L7Crhqz7=zX5zE83kK%=*`&4)Mwn<zz#pawM{uxb<
z+c<kQ7H0QQRn0kmzgbFAmArPEvko}92<`~_E>o$~e9F%+i41aHQ~EaYOIh!y;s5Ds
zmbu0)PwyfrYsi=}V|ZP7_S5pp1d7V`f}Vsd0hT3aa<2b(<dTN{@G+i{Zom-%7~j!c
zefI3x!t}o*ZBbzmM7-EL1Lq~@txA&)jzG-eZAQK5lk|mwYW!Oh(D;#U`|_D>EK){c
zh$7{|jT^EShn)~XL1UbpoQwvnqFPT3nMOTADH^Uar>vlWJp!lca-l|{+*3D~*lE)O
zQC{-%NLq=NXgyeI`}uWQ5UX0brBqh@8iP@Izq+d&2-bi1a8=}A5Vte`hs+U0kfX9Q
zDS-$2mOzH9%=9~M__g@^52J{(v$IRD&;Spl$#2Csl#+*G)tcK8GF>jsr1&Y3#Ez;L
zILA%&;JhLvy2z*{6$atZzsoe<z4$ZApeI5i(O_dy$~64%5{eV8X+$5>P3Tab6_MKE
z=5$>p$<0_xz7Uz>uwk~1_mab5Bq<5QY)UR67#4gG6N4^0ZIR=DUE#J|Pj5OPK9&_U
z4`>Z{B(!_T#Dz>wgV;XKEjvybvS`JMSu^k7nx^`HT7a2{tTbb`3vr5pAOd@v2R`9R
z`7ccZWYm(qywES_GI=UMl;$TQ=8_mi=0;Ts4$*0ZaU)&>0Oi=F?`-=_)s+g5D2}oL
z*WmvIbwQb@0MZ~Jk_@x=C7?p!($RO)+SVVKR*t~gFlTEE7mc?Ae{Iw9v~oYtvdc5r
zd4cSNzTnBoOadJgVub|J=1R8aTD?VQ!Z$Mg_+%fsy7YXlQjpqhew&wlUMJ26SuF<&
zK&~g!3@^LvLuo&@|8qPuH}abt|0iwM&T_T?9dMJ8!b!hIaV_i&;nI@K2_*oKf=F$K
zpNxi0N<VWs`sVI9tAxyh2_`<I5@6?7j1mX}^ZnTvp5C*TEBigk9qmcwDWesM&M9tg
zZhGC`Vo@SIi@0bZMx;5sy>b-N%y=>C<vetvA;E=snZK3g=+UEpWBZ%lJpty&@cjw`
z@a4J)>k)DFWhwBZJbf?nLN3U^`kx^y@jb$3`3W~(1`JSe<}-D$hG?3YFToqhyO7Yp
zhm;U-bUBQwK+?i7^&2_z>;5t$&_S$5*P>GsP3Ov$v;P$CcxXcK(PGOW7d`g)CTowE
z!<U#56-14}Oz;IHTI}W(F+8%L5JWbpJd4vq&Vc+G4_B6%HxdC60g;<W<kb@=Ug$Mn
zC&e7l_YD3PN-YL7zSTZ-a$XLHBFi7T@A!&gZ+nP9j@VpSR=7{FP87s7$WKIFge1P<
zDn=tx$32f6(L@hTxlI4jV{(PcQ&w7_v`JMR9Jp+-aCW$)Vma8<bd@PhD(4B`hfLuI
zs9Y(S&YV0sVZnmdj1eN{q)ud<_xBnAB1--D6ZJh9ZVOFI*Nz{lteANCr8LJYR)`yV
zzdn6(&u4Z-V=ZMzVuu@i5y`=jE;Mmg-6;b1&{B957Z<z#lRu1sD*%M1En2JrAx~`W
zM+-!mq7ugz(Ju&j)L6x_(;{tC&~N0z4)+w)gf@xInxCs5ul1otOjmCp0aSFLxrabn
zqoen+faVxj+D=+dwGFQ>`=_he3<oTu!lH0Q`8pMgGjtOSrhAiZEt&C@Mw6G$cz}>7
zpKz{NhY1f@k2A$E^O>@Qj8Zjx^O{$M+mHrTh;tnILx4!0n|)nXstmz&;h=AJ7}8|=
zy1Z3&wr#VE65AO6zn6$_4M;McMpP<iKn)NC$x9j<%layGkXl81?q3NG?z67+g5usE
zhdfBePsN_XFPS<jV13f1@qhpR8vL(H=k7o8B0IvNS<>V?Dz}?AkKzMJ7iIDG^IZ;}
zOl$z+Qp_#Gp2#j)joT}(xB`XIY~(vR2CgXCq;n;Cqo5P?VV|!1di5GKsOUzORKQ#b
zth=p_-;qImQN+S`3b#)B^N7P{UnUtUp(M7VKQc|5fg~%zWKHIy9niH_xU3l1c1e5x
z9@m497Q1S@0KQOezvA?-=|uVh24&62+Q>bB%O;XBqdfPker+8zjk{M=6vtnH`DX`)
zT!?C!%kM5XFrbiVN%f5E2Gv0)wY3M;)0x2>iGn0C*1*lQGDY$SSVtu?97_(Fe9ko|
zv?(psQOp9qLC4Bez%-wv<;T7=A1j~;C4_^4|2$u-#?MGv^bIl?LZFz$_%-w*=M`#X
z2Gl(z&jkjb7$P6rFoJHmQfAF)5Wv*1)&0nGpv>Z`v}R5eic@^qg-#}CR&w1~T{nBe
zgvZu8-Me+`mRQy-+%Qx1k9M{Dpb*2V4Isc|gvYrycLlxzvWQx%mV}0cGKh~w0;u2*
zVd|q31uro44)c!i1l|?k3jiyLFh!;?P704{Xgwpz!RqMy<f6-xD8^@DLFf$XMjNfA
zGl4kB$0usX2aBzX8%E}~TZnS-x0Z$#XB3&t061Q`yG^Hu&fCdqRA(&Grc0DruVY>E
z$Bi)MLg4rt(R2KGFA%1R@Uk5z48m+=AX;ReGM{w1&uko0W_hO#$~?$4D5&jE)}=k;
zMv9<A4iPuAC5<0gf<;`BhQ_a3jcX&$v^LT^9-`JK{$Y@ay*46a#^-nh<`VfkFy2}v
zOuxp!o=-C->Ha(3pxDOYg!)#Cylgvjix&||+Z`NkbDWsUtxGYV$sNkLHO%TL45k&6
zO2{3_zAPq^1*Gmfhfp0rOFfDm9&s9c4ZHXk9Bx~5h>{c#zl+F0g6Z958l09cqc`=S
z_92Ur{w7VC@^r}(AILNA+@GOw0B0}yR@0TyK=7u4i=i)&HRRwkegQ}ttd)hzag0&Z
zZeOMe6;_ipbZ=eU?X&?hjV;wW4-iya+?zQ{J&`N~cO99ZpI=E*61D+}4?q10fy6}b
z>c4N_+}$=Pk*A4(6UacE2Ylyw8skUCMB!F^&d_5J{c|`A9?aRo&t;R#7%1d$_f}PP
zX2JrcV0m?Q_md$CWj34??CmxDm<aKyiEOp=@vAGdX-A5Z{tD!{Gbx*2!upbv?(VZ0
zT=Fm5VP@tR@Z%n>40b8Hj8{@h0p8|}&8(5_!|PDBiibSl7l>RZ7-1TL3j`%${vFA(
zAZpU4$*M29mgJn?r@%#|XJ(dH_Duamlp|y{XsV%A8YD6jY3%v=u?FWF#+v<oi73GL
zL<Z9AF@&dB@i4amL<XD{#{e!#;VO@L*rV@zFr+#|0Y-5H+eQ{?1jttO;CxAO@l%rk
zO}~}h*eMn(rp;h5@FI+PXV<SOdzj-hu(Gb+>-6)>9bY;0+`D+idZ$Jk{%Uij>GJb-
zL#ji+-7B&_@=L$^bEs`{c+t|q{UJJ<zB<)o>UPDMGbXj2G_Ww-w69zI&tF-IA%$)e
z`ah}M8J=j?*UGEXBi|$5ga2RY_{a!|O42C#L_#`tF_ivy0|Uc98lSS<<3v5Q&2XKX
zTW>8OiP^o%VP<h@x&*?AnrZ>1DP;hWaG~A%qL4hBZYT8yy67A_`Rdup=LVG<pT)!=
zc3LfUF}B;zd^u{6ERGVu_dW=Q{|g$-R*Q<>zqTTfwH$(H3=99-*2Ki*C+w{%Oq(?8
z%T}&bXF1?%jcmVd2?GYTZfZXM*$8!?YXgGEDld*7nRxoII{^W;Gt+1M{g=~(3Byj^
z^sm0UGbQtf+49@AwI6SUnfdw#1TdMCT4J#%)!@a0)P-f-dVAc{=6bam(z<D*Jz*W)
z=9rugUPgS!0cVT#f-!>!7x!!POpS?J4j$iD291Qf5_>NYM`}Z?2RPxX6v+%x>%}_Q
zam!V9e*b80{XBQ8QuBSEru-!vw`9DMAt!#|c`ozgF>D-}3<f=K-aN^ifO$E&x%+_3
zeP2s>M*b>m9dV&?^e$nqknBeQ_610j*&A5^Ntbgvi?z(p&>{%(AYe8oBv9y1gu=GU
z${GclYRZb_KBE^KIw<ZL@o=@Bod!osrg8bBn5~*(VsiE0m^&<oX+GT#p0s&$(SgDU
zOl_+fy>`^==W%g4kDVGa<-gk0G)A(Q?0^dhunO7=(B}vaak@pD+xriGfDq@ku1XKK
z4F7Dte*J~Oz$F7$vhjm%j_@V3XX44j@ev@1{UF>S=HzE3enI|uTt$1<<jD;YU<i=k
zZ}jK~ddt!%dPH3}_va7NdGDYVyGreRW;D(4bOVqG_sEc(CTIf+wNnh;!Ujo&r=g*&
z0AMO5R@?UN#d4^Iyc)7Kw>fTsa^0mRk?BI*NQ7tv55pyE$MN=^EA3ev#$*Xf&ol2o
z<DVck6m6gk$}`1rg-R7bjBSRyzAy$((bG#*BJ0=px3RHtb#u!&IbtT(hjQVuS0Fy4
zMH2j)zeHFaS)m2|#4;sI9$5f9sAA*EL8kYKzJKOtP##sBPpeqeRDEn~?aGbgSytpR
ze#d!;sXNHMc|g3CMY08<0cKRggkG1H7df^G6Pyj3*IebwG;7-QWTOOeBNS3d+LmR@
zm$TsF(?^ZPZA_N-2##)QnpYCXC*#>+@{v<gqJ6WvbYgTooSV38k+BGwE+&mTbxEoU
z^ncNK#h3*(?jfENdIXQ2(>{960QEN|w|#$oT%We3YwDJRhZ^1*lUH4EUdiFYmHg4p
zn{`e$HH}qU*3G%u4g-TD&ultpy1Thq`}q|4m4_SpPrtYT?@pX!FxA19=Du=S7lZoJ
zqU0H=4zrA^-JDWd+4wIU=z_<R@#f7y2|>yQZJRdw%jvmmFOvuN?=xfDK;QH6_93_9
zS2`i)i;Q$4-;g+#-&;E~=)oYR<{lm%3}MSHVd{^*iAnU*J|<6+9c~-?`@&d)?xP!e
zZkRvcKlO~edmQ3#g{J@fV<I40qee5<;Yr19mW^&$d!6y{NH?u@KYu=$vx91y{L6uw
zx@_69pz+QwF6}AuWT6z_1qhflK3ucW9kwy$9c-Jvi~_ay0&MoNY?uHu$jBd^hkzZ(
zx**n6$$6CT<8j!K$qU-B+IY*ccy7p6!k6#f&73e{cR`b1!?0!HgNQ>1_#5bSQJ=Le
zjFGMlfqL(19}%j`{z_06B1lHcsF5RQg!Ddkrq<KL!|waX1I)f)Zu$3a@@wG^rnNui
zjYCsLN+X7%Sco0JFP_gNYZ}qYTrt&<0a@{NqCTAr!seZ`^jW@}6U}Uii%ra`;U}3N
zr-SFO`ltCmaqMK<--m>YHx^zOFKajQp0uNs+KigP8BaiPCgzlah5~XD;M5tw24}o;
z|9)|b;hO##<XOlm=Ftnz_QPVpLjW4Hl);Z`OUH`qBKHE<#;btmGTRUFEzVy8Vp3yl
z*|H_LXLb^XaZ;CGuHRQ~?|<XQtRMey6PY)!-Ik4h358uR?UjaubgW0bKFAK-q@*?u
z4i0eRGAX0;>$@s=Og78!B#Lm9L}UqO^PLjao@P!kJVH?Kp+k=|*l6}4V$#LOfW{pS
z?HZ`7tN+5P+ns#AX2XW(tn{3=Y}s4u?mpPI8eg5Tr9~S}bqk{pj+Jw%WS0eQY%dDh
z5MvWc9zpaZeNh~RH?|G?rvh7P!-f&sI$bu_c%>PZJ#si(sI^70uGP3!_e)ni=zd&l
zf548^g{jBV^Aj!=O`e=uof=?%p(-k{)uNP^L3iG!nocXfrnCB{&Z$ioUNrVI>rtNY
z#@ab~hfmO`C4(QP*jTqT*shAX&Djdws-NS(Ds?<?FLBjy*O8i2xsCV^U0>`<Ayvkp
z1cO2f;ZI+9^<)Fi;Lss?d3oWb9S#0|;Wc}HYV3s6u-mhZj5ionU%Op>7aVWE)8d5K
z?y8b6y+^CL#b18t2&@4^nDsfUV!~=&i+)gx-xDl$PNw`M!q#cEpMF%ir=~WoIC$ab
zl+4}rVR~H@8xyx(M{i!?J!uQ|7Fa1H8K0}EqGECCzMQPA$6|+$--7Jw{BgTBb^S=N
zktFm$`8z(KU83Y8{p}A7tU6iY*4!Mc_4CMm#YTvF240w!Sa2Ci4}1rXyg4^`$dGju
z!=eKyEPM{5bwv=gQ8zEPDO1{LX4a?r^08xchxNRL8c8Crd4^VDH49I~7#l|JG6(?Y
zP#PR=WT?TQqR*duk7#%MumiK&Jzzr4O_~t3q$72!>edhsvIV4kA<!qaDSF7_k`jiJ
zi(SI*AyOlIAEwC0z2(@MGwsOZ7*mR3ft!6f{k-<Rr5DmF4XJFEIwwa*Pi;Gw8EP4C
zNlWVjY>VuL*dR+jg_NTb{g|5@%^nn}+_K89^tbmwO7mWL&j%Y3rvdIHYKjF*mp)^t
zO%rby0lHWRgH&~;R(Ay|l7%DPm@K7E4?E;GjSnhga}>_{j~v}ZUyf58>0vr;9Bvz?
zWgo(2N`_eA1yk0lbN06t1!k9aL#?OZHt=5VP`YyHxmL4P(#KAXA88z4vk4$OH+c>w
z1axYW(+FEKS3>wmM7%L<-<1q%2XL)?T2@oAjns!0q;Nr_iO6zc2qDo=p6p?>L^qvw
zTMxHl67}S>-i`i>Oz<sI_O^U=DC=7mzHKn45QpQ6b6z_i-;Y6zV&jHFw9(SgU5%Yp
zQmUn4-gVsQU)ulvT@V<4S9Ro@(A$B>y*3wUE&Gre`f+_x)%{x$b(u4^H{Ygx&MBoz
zsbc%tZkr5J5B#&}o8^$A+f#dtT(YS+<W^QG?Pg8O4jnpx-(pK~IrVl~6Ni!_Hb!Tb
zeR6T?8~F9-&0!6!d$b9pKn4MV#0nkqZ1_Cwv9oqGyKJzlUtL1$q5+BDs+Jr1OSTq%
z=(o28Z&Z75t2ipx9>JA`afQk4og;f>9B6Q+ao+#tI+@>Sc($^-TEJ5c4GqOxNqvpu
zCu%5DC7=H8n_9$iE6^Lc*Kf}R;BW6D8Z|UetTbVjlrwpP&%EQTOI)tFWGE}Y^>sza
zeZ<?lJO3VCh73x3czVjPvou_zH^dff9pQ4jt`&yx>>{4~!6vxSUnj+=dx^<|7Q;g}
zr`G;lGhBK2m@zTWo;7ABi7NQ+y?bmMS+IEVQ`t)v7&uDMT-4br$0tAZv9z%{yEhD5
zM%n*F4NV$BEhtPlrhIW!tS>hP%E=<eIx#>vpm-ZNf!LwqFK7yB{Pz9(zZNW6(kItz
zuiqIBJv>w+B6bChe~2U}Ik`O-gNFnw7RkC3Hajq|#^VQj6{rcZ7egt%0Yrf91<wbQ
zu-vkx2cUsegG^I0^EqVSw!k}gRw2fRkFZX&&`Y{?_pWYR^{6LLw!kT(;bwG97#toe
zg~L^>WD?xRi1oqqTiqzwSp+jI=_WKfy*OW;rX2G}1c$O*SZpr{`maKUP##jBbKRs!
zy~lQ~=RM6VEQV<Z!0q2XH(MW%nb(E#P~+p-N}7n1$#?pnI3`Tr;X%r@KpqV-BaxLz
zTLEaFV2bNN?%H$gSVGl~Lvt?pxxNG4Cf9)U;IdP)zqo8(JN082GZOu-=toke!#O#F
z^=^e8Wg22OMJxc1Dwwf=L)=0hnu>t-#1z`R&Rd&0vK|$B?1fJKIO)eA(sK}~M#je<
zt*@^Cm1&({vvi<~?VI|UJ-!bZ@iN#wD{<X|5wGoDKL`pEUjpib`RD6Qn%8OS{#dcS
zxqi@jr@aXm9mgw=slB*)`1S|(7aG_uN-;9)cJ<#;ug;m>iLk!9bM(Jz+L@W9+v*ZJ
z6_zcVqiu2ZY^PZ}u6kq^)$KI2Sia~{eUN(U);Z6rX%ZJk6wSHiA7P!kwXVeMuc>;b
zS4JG&WK<UR>(=6dcY}gtDC1SWcG;r3@P8iM_A7W_HD>)RZMYS&O*b>EJ@eyuvqpPX
z#+0PQD5X1MY`xQ2qt#k9w{62l?jJgG|E%H@R7L{9F_67G7rG5$arkQEtGwsgkWrw>
zAn-FL?X{eA8q2-2!z}J(?kA5#G)$&&Wbt)Fy~@a{-1DCgySOYF(@#(DXks|!layK%
zUNvl~5iN%pR)9ZKtP!mmXUv};8X5~5!f4{NlMlKNJ=*Jn?u60vOo}W*K1B9k^kcC>
zW|wHkMdvOW|I`i)3YtAs1pss3zQ3tpsZlvWY_I+^H~ZfA%aDQu@~E=9i6Q|};APp$
zF3VIy^ymG-Fj02&I`dqX^NNR)2$tEI`3FdzJ8fEHxEvpnALE8{(gFh1d3dja&yl)t
zELsITzGL2Qa`sb-au%p@4KZm!3O!?hJG8+x=r&ksG<l~_of5ZMT0a7ij3Rq&|2zQ3
z>lncRXdnFw%N(CUp3Ns`$Y$QmOr!S^3$Op{rFwMeg5b|fy;=M#xF=*H7_ulJ;Q@$=
z3v07(q;10$1*t-{)>T?ctaPN8W&O;;@v9UV-jvD_f3F5oKf@$*>lFogh6oN04ZTM@
zNZCVKdGcvIFMELRwNqf6TWc!Tq0<u)dt~IqwtleF_Vkd;Dd~)MRJCV#>JwlNy>U6)
z2v=KJyny~P4?YQ|oKh0x?dCR<+Iemp^|CkX#}N!jYD8CI5s4(Qh=Qd3!p^9zlF}5A
z|N1JUdYMmS5LG&EfL{4j&)v-ARhXKZdhPtNK>_!#%yM+j7Z(o{aKAQuy8E6<7h6l!
zd8A-N-`45fTzi|)B|scJ6=4j9>K9rC85DN%@;5VA%Q1UVHQ956mFn>EZ@M%Is_m>W
zrYgYu{e{X2y{vZa+4iDAjZ=q3ajpxyzuerxV%Nt(4vDu*LJfB1HhKRjqCzLRL6ld_
zrk^+SLpslLd(>f3>_1DV`KPw32}@3`-f*Yv?UrQgu-D#iQ!*V2b`(d2Z#GJO{pfap
zc51bLs-Eiw@3z2V$f)omz|(#e5Z!;d72b0Sbetw^sjRu4?$Bt=x!d+He?%2@P~X4w
zroVspJ8WI@1x9E-H_gg`9vVve$B!Rfw1OEk7nukoqj)KT0{(FxaqsPAaKvxkm4#If
zA4DB~Dk?tyrk={fN1s0r>wR-hUx8<2^jSko>mFL1+xj7sP?o_4MLvJtj0X%({y(V=
zdV2f+k~&dm@pp87H9BfvNQman`XaSa-;$1Q{#>|oq`G=kT3Xo3h+TK?|NU$SfX>XU
zLM{9Fg7bwn8y$D4{y6fiY$t!XsP0lydYy}dg^7uYX|iw9M2N3HrdBgP&WMVNx*HJC
z<*dHBZ9}LftR%&Su!4Mf`s`Uu5Vr@JZUr$xvnt*|h(u2-wC&hYkV!G-jiO;fQYBR>
ztdg*&(p?1wi4YPo={dOnuW3!XGYJRzs~@s;AK6P7Q^+3h*-I#V5bV*t_Ac*wdwjcn
z`}bF5Ik&2KIk{CE#GLv=O%dVBm<J8{EaL(}r0J<;<N}z64${_YTwZ?u?3xLzY!x9A
zk`2zZXdUrZge3`oUy_qE+0HKb)3i~Lif<7lp#7Q36f{14Kz*IGx8LtQFE)4UfRV)n
zC+QWUXuVmvFhE*HWTgU-<0>&DU;oxp+jcuB=e&3R7$7|*4TaRN;)WklZ|&E`zhh!z
zqCiX{O@o3ECox2HGNge3VPVtqF8j*0);@jmBzyniqXuPewxKhwF1$`om&~wt-#+1D
zr3~Tkk3k_l2@$MNC<vz;vI<xTNg@i$ZJ>r`WXhtdx^7MT_3I~7ND`tE{fY3Na)t?o
zH8W1N3hm|L_E+Ss$r`kQGDXa6Ai~Tj!=Ra~O+MaGw*qmH^$<}Xno{Y}LuJ^1duNKY
z{S%)U?+jU|8M&%;;vD%^#j%jfgtbMe1kj6eq8L)Tm097Odw1IZX#p6S5s_TDK5(U4
zA}*0k_5?c(c{B?pnwSY9P4xcP)|?lL4_8(}!9jYpzVZ0ciC3uGVvE#6%d|&aHCDdq
zah>FWD@G$Bi#aD$>^X!H=^hp@o`Q0X)lP!p^Ej{ue;iSl7SU2m<E8qP63P^s$<wt-
z-``uezE+gkFzH<ITcZ`tsmerC!t~+qncdKRqf38m+`M@%ecv$OtL1@yVXwcB(NCDW
z(bV(+=CM9LlUE!)9^-W;^XQrD{(Z8aP1rF!$C43e@e>dQKUzXs0Z1>#QzkE4rc6+g
zo&q3)+X^Tn!$7#X%@`U2OwdJ5VOzhRXPzUE$RlXVu)@E6M}iTcftyIk<(Unm7lO66
zZg&CJ*ADH@2zB)wX7YQ2H>0G-t_6T1i+O97pu2K`0R7Z2L}}yyhuIR!$m`dx0d=}S
ztEe~>k3d`_8WLI#%uo1PhIxAq9C#R^lT0A+pU`fvX&o=$lY|Pg0s5Vdxh-AX-3JeF
zs_Vk3qUkv|%!rqB03ns}>eUnZmNHF3Qt7I>RTh-4z310+{K^$8D3=+8;9Qn}{aRe<
z+O=y}MsQ_R8%qtbt7UeBfcEs=yG|`yv|#-@fGtjDXS}_K9qV@X-+yPcGvGB-ls_X*
zt{gvKmC_D83yQ=ZR|;{F0smM_2BjKay<x+lNf%{0nt#Vx=xwme%F3#Vr=5CNEQ<sU
z1=fX$&gDQwr>0i#FDWzMV)6|Gu#ef<Pzepu@15OhS_=9~$(&PGHlN2Y)@0b#<>SPV
z&&ii@e?l4-H)dPE{^g@_`*q=WrVaiBLe(AULLJm)a@%Rgl>5a1I=G)`&D}EGQMKgy
ziLvSJnjBCv`Q}@?wT%FVS-wa>8iI4J4}5QKd=<H3Yjt4*hTjyG2&5QO5;H)jC?r<p
z#=6vvuRq4WcXf3gp9*9k!VAUOVT}(vE?l`Xj)wUiBh+v_khc59Dp2qRW<lO;v<})*
zewN+SN8DcSdan?oKeVO(=c*qQC!hGdEp(!4m+OlMiV(M=Lhy9Jene^F(m}B(N-g9U
za7;09VHl6(GtHqge1-FhTczKsWk?)9U*;c=O%w10QCxGU8_65P!n+U7KB_yXcu>df
zsV}OwYg4^A=Up24sBhoC|0uUHy_RI%_;BzVC9NihdnH#c>3gQCaNsDf6F@Scj`TiL
z`ktXeIRQ2S5-uSPT?CMaZhs{WP0cn4Rd7SmyRH8bp$;?JNyN>E5Hk&bwQ8j2KL+Zo
zC)ty;H{it;S{uk)(Q$V<zLfwfbN`%w;mr8F{2((i@`yxviF*&4@9>@T4WteSYQY(S
z$I^Qu$XI7`E(-mLFd#CFx@?&+W+G_#_N{c$&4vAyL^;ZZz|%91b3tZ9{#|<NX@}|4
zu`4hZ7Z98nIMIuYjD~eHV-f&DQO+xq7Be%WX>de_Nam*FEKy&4V@}^Sw01%SvYrQx
zY9HIeRYE6$MKF%tvu+o}=xkyf+RdWA8l9>aA@u=zoI$!}Xp0!A7}_1^Pqw#)hu2+L
z=il+l!sl#<?cHfCqstcSM}~|YWqNghgqKgB_J)URqk$nI$%1PsI;n)IFMALUWGo2o
zs~l;WC{#J$F_0?%ab4jimA7=K-b2<m^QEI)L+nXrT{$gkhrav#v1Wgzo%rv#98USO
zmw`HcgTt9>_}0=dt+(k_<#FwQAOE~;t{0s%2jo=`*DWT^<xZ?%Eg=c@#tUnvH;;DL
z@#68kdixfwPje1EbA=yXj&3>nJB>qV?#|*_tiZ;l!<R)m3}vb~ewj!Og8HCvNlO1$
zEt?obJ=++#0#$62!I?}Gv!Lm@wS%2q*2j-5U;t`&BLbhq=SED0Q0*Wjkgq~RAQ0}E
z&wLB=5ElDp3u`_;E%-Kal&Ohc%=-DJCcFF%I(p9Qcj(BDTip*g42;Mb@gN{P92M{i
z`gJ6-aKa5Kqv?>uuZP(#4l&I*N8{X1Q|%^?0OUgJHgCJze2!!Z##S)PDClWn26Y9M
zLqVj7wtdi`K{Hov1I*GKW{e1o;)t9(qvwrdsp-FNhc#TSs5zD+!QFuWjE;@Hk*5Aw
zI4=rF7(f<=8Qz{Q{RF0s@{fY>Su_(+q`-=8J9T=OU)5F?V{r>)Jq?1o88nHcFxs)%
ztDhdzTy-oJ5z}<ywmq8$n1w$3c?EM%NaA85BO5S?VO}eieE7PeWfgRRkylW7mx^v(
zcKJLz)Pyk2Eh4MNg0Wy?^!Vh9lE76SU$&?ZKOXrcqn}A<!X-tnF)0=FfxG*Wqa{kV
zvWR*w0&&~Uonv8QC$pAT!~~4yFas*8RIGHGY<q#>Af>R(ed2=&iL^_+&`w{_d}%*M
zs|sNvX@|?29$4ScDQl@}S2x>!lbhp;HeX_Fk6=y2<vt7gW1$4Yy%)bP7R|fr_ZFns
z>}rIQeUm0l#LAGqO?uH$qg=2|InU%&bnkO}t}9!ujH=)FiBw#inhO~>g|q2hboAqv
zkDo2)o*#KV)2Rknl~GCmdWGQ6<J9)=!)~oqy`#adlCI6ShI9pl0&YY~5Cb^OshRl_
z+*f^h{6iPzsL&5H6T#Ca@ZzDWg&_l!1tylD!SDnn*6X)#|AXegeCHBHo|pmgnvVR;
z59(QGFm?`rx#H)hVI2tk5mi5G{vV#s1Fq-2?c;W~D615aoN!V?$SNyIcBm+6AQe&}
z$!J(vA)(ATom6I}NwyY3Xc{RhMYiy~FV6kkuh;!N=XTKl|NDKv*Z6!s*XR2B?*7Dn
zq1Ll#XX!sL&%8Ni^8B;CXmf{sE^vqJaL0XTcW`jespnZ6gbIl;B%%?hP#Qa!q=$WG
zoMNUIJHQ`r=?ns8w$=C7XJ1jmNI^sG!Nd>n+;Qk?()XXvQO32oMg_{VUPT`{rC;m1
z_U-h)lpjBNG9P>oVx3r40-Y@aG2Ek=3ZMXaCdQs}1$m)F9ho5z<ru%MS@Y&^&btEb
zU|XfRP*(7ianMH{og!A4pYpX9BrIE?+z?T|1~DXlQ}~de#$F0qfuR@qNb?>4yzU8T
ztTjS1P&v>G;*Zml%6Gl3NlWEp)4P>#F+SRjN~!0>eKl~nf^QQDsS+G0fZ-hu&X`O^
zOKknod<`*=*h>9GDxl*a_ilw`KI}1<ftNRL-i+X8KdpSTuX6Le*j}_66riuqSA2DO
zoS#oIvBO$Nh7g!`(zbD$j0-nJ0@(GXoJ28FV2cYKm-vIJCeDhS@w}p9BG-QD$IdD6
z1_z$@OYo$%fK(QV7$deapo(^mL(~gAhN~?SVA&TBJ4&G~f_6Mc2tlqMwT->>n9^jn
zVEwss{n-h*AAEv%qUBQY>XmFJM+<YWmCJEhX3-1q-@B~}bE?c|h3rzi9Yb3vRo|UE
z&55znCJ?j1*UJ#-ied%8j>o}^13?nwEAq!|Om_&RHivg<QO=<uk}d)enzFU4kQMx*
z4DWsG($HW_=-k?VTYm+YI3f-0Q`*aO#R_cOf5qI6%I}&Uy2`72dZy%sQA<_rl=UZ?
znlh|SW^DMOxn57MY6Bx*DoZr3lX~;u@9<5az)L|TWU}XTvF~ASAsIoT<rAt>tyW#$
z5?r`$5TDAPXZ!GLl#DE|(kv~j44pOhRPL@rhcbEf-?Ju&uxyCC;L?5j+9P%?yu;L+
zY|>zcL8w-ECvbeaDsifM+2O*OGna9jN0kVugdZQyHS19aedv8KEbQ;n50zU1kY!OA
z|5nz<=jC<m+V${=j6nkjGOW;GT||V#;pdU0%9+=2-G2$CUrM0~)LruYc{<A9FF5DO
z`GA0tR#)sF@NR3lwM8ws4zhi0vgZK&%^kT?yVFqAg78T)?al%tH1t}Nr?oAxe#>st
zjr14{P+n$^j+zD`IG^;n{4jjk?`s~W7l1%QW8D+{nY`Oyh)kd;F)NQY2NR&*%o+Z<
z3r9&JJ}(js5>>5bY(@0c%q5E!foQV>W9{^xlfgichy`DL^jQiUFoW5amJFt`X$2z{
zlU?1mU2>hoKZzWeT8v9&WtBg_B7~QzJ;oPh6xHONE1;rCH>Y{KELrka<;6^)CDVQk
z0)P|m5}O}yW~Cge@lqW}MbR}WN%u*p_W#aGE#<7SP6&$-wv*=xYy_rd{W@@7uI$vE
znTd!NU9e1}V4o(#K3whJEEP8Cw(MY_Q+R>b3(aSf1LX4w_@V$rGbqN?ASt{TR4Z!e
zxLJRVLJ@~773zYmLxDUKXjYj3=Wqa2irF`~))=IN^l`kU&x>R6Awq>Hl{Nm;Sge?n
zU#R2QNe)_Z1y)T4O(=*m505+y$W0G48JI#0c86*x@RgMC3@7`2W%^$xaU=|C1?%C(
zMpGp=w&X}ra4}#3uZev0^~{j4Fg2L7E<;yr`u1%#H;+)uh@upynVFg7>ry}t3B<B7
z3m$;z45rX|d4A*33%a#c+Sx^Zx*$IEktfmHRIaKM-(#C!-)2!83li(MU%OL(G^=Vj
zJk%qDD@>l8V^z@I$hWD}$wh?E7nEBduzYXHH~3<*v74LBX8Uj^(`4raT7y5Zj4tCR
z%qyDBn6XFs`Mk&({V_cdo9h2C6<J9KW696XE7Tf+WQniC!=f1k{dC>3p9_6Mtn%n%
z2j(&3mBZ~|Q3mCq03+sXngr1!mG!lXYdL=WO`{452i^-TdNHz%_29)q?@_#8dKGu&
zVf9)LA_OKbAG?WbFiW!AoVg384?(aw)btjL9Xhbhw-MJ&<~ZIRa|EZRcY^(W_Toiu
zwS8JrGyDwa#a0{4^)Jc3X76;8nq19yz67uJC7u|0whuMaD!oHF&|ZUFb>+w=XTKx8
z$SztT7tfvRK4?&fKLaZ=ltF6`v`qsO$n&J9<BP*(k$i&>H6SqvCW3Z>fG2es#!-3o
zHbR=ocOq5`apwDLEeBf0DyY?=uBh0~oIN}L`j+-&++E?}+mVJ0x>W(^%n^kqIzs6x
z#TYtGyfEm&_+HCO@3iNq3ev|3Jw>aH6O9lA!lL6ZC@zsYI<6*V4>B^g(hu!Ea6s(o
zFrJA3@Z*w9e4E3!pemP0SX2~_9%kMpAG)pCf5wo+rkjEk&Pd$z($gETH_swUgiRz)
zu)R)9bW55d$&Fro-gY7G%FN!JSbA<;+v!KAl+e@gA`#?r6vefip;X_i?>9Vp_z>wn
z8mFXNt$KFqPZSZ9f)BzM6H!R()~(|c=WJM8-h0ZX%`G<K+s6K$>-X;MqB=hhz%4)p
z+Y-d?2z&w~HfdLZT(~`AAi(VqVFu4R;(HZ1$*O1m9qY-aI2qpAl~I~p_dbJaxa8NN
zP|N7J{9*$l+*Qm;5O7b9xwM^AYpRq>W=3};`*9%Yl{$0?h=}OPb=_E$L5h<OMW%rO
zeppHAY7$}Bc*<$2z!SjqCYL8}rE0NwE<Czf8YjDFCOXh-cMn+j#|lD++j55DaR
z)H;2CXq=gx)M;9_ncm>;avHB>&8JDGE1WuK&Sqdeu=;MDl$ZG^e!wGdDFB3q%K{gd
zEZi>zEC64E1`+pAAv6E^r+j7+JRt2LeB31R4oAa@>8KyAmmq==yA0r(Cv5KFJ~7N@
zZt3RiJP~Mx*+yOJ19+)=vz#iu+B^Q;x9@%yBFO$bnbiRMDB(Tc0)AC41Vk6-L!2bi
zv$MBzAKl?S7<U=$^KAfOfP%iw>$=M`$D8mMi|kMY+q?>U2M27kdxixw7(}n1fk}h}
z{iPmvBg*s&*e%7HK`~6CUdVymrNX#hFA^P?P6!h4^qjSy^yxXq@tt6wN4v(`E+y^(
zTy~S50C@!J8k{xPfi`d;kzX+~p{Zql=oa?i81*GQoYbLd6Kth+!K700QFXWBY}%E4
z9`wq*jXoCtP*7almKBW@rD@;RX+0e|(`k|c(1)>w<2nyXR7#{>2A>w$<+8Fc6y^(8
zQwJ&&*k(}>H9J6$R3iHjwSuMQ7bMI~tHSZar8#zvvq;qR)1$*5D0k`7_hS|etB8vz
zKa(NRIYYHfJCa)nyfe~nKA^u4K#5dDjAmp^6od%42X>tdD<D`a5x64($vEnv9mG6y
zZ2aZGGxJ?s3jcoK3N#L?C87+`t;-1auezLH)FiJ0CiQJsu(lJXKAZDQjea%Nn=Y{O
zhJ3{ZI%jCXD5t=aXq<m1>(Mau1#h&Px@sV4hdDs9lLvfj_B0z27E`mLCPAyM4O>iL
zp|GV|LsoE^bBO5+mb6RV-LE1nqX{NEHc6g!+rGarV~7)pJX$o#{26{E*%4&5ZI>?M
zW#2z^8+Ho25bQiFDH+8Dk5Amzz=jCV!A0dZ@greghS!m2Eu&Pm)s>}yJOCgmwt=*#
ztfHlEXzkHg>^dl@_!wv6;?!%pa1G#ax)A1WWoMsc%>(Kr>SlOQX~&h7C1{aSWY3yK
zl(&C3nwJHPA@RS@!@U!z>FCAJgls}opm;|ZIdRB91*QJ2e(Oh6&TK3BrTeOke99Ec
zRvBL*=OL9LfXpYqQ`G2FCy1Ez@ZsWU*K?z6+W|R>(JYUFsXxec<Ov@w3?}N<o7+RN
zU~_gRqUjnuvX+Ak$wb}AwaF`j)V9~$Z0_HHCrq>g6maDGl0(lUO%VAKv()^p<QidE
z0I2|w7f@0O_9i?}y8$D=mG>ZRQ%73&dw1?s!YX|jV0z8>bY86&87@f663GQ;yC)?9
z!>iKjjQILeiB|vnp?^eWDC4+)&>HfuI2WOw^~$;_vvQ~y(Wt0H%#5XY;5Eq%fzRi{
zn}m*tO*5CyOzz#UUl{>K${`*Pc|*n**>Fv}FNq?ns%_l`-7TRT@>@WfLkQ!<FQw`2
zuElH;+{PTYU+5q7ssp9DF}wwkE6G6btcjfD(MVV(#*BJl_VcJMiaSoLBr7#FHCjb#
z&7SeR!LI`^Ap-T+TxX7*2|1YBhI*0u*z@50;cM%@tW`M4ZRIHZSLK^Du(he$ecN8(
zPsAH=G^}bMR4lUL>;OI5-YV+_21A<4Wo-@oBX6fgi><D!Vj4DVC`zt)DEBPdb^aNr
zo%??bR{37~`se3;Y!z_D78{5)CO4PI&(92$X(0+u4y)9tD4?)`<-BwSQ|oJ;u#;K6
z!fEw$2S;N4(`<a0tqE5`5Cl43$mgE865idcvcW1NtUR-_yj&tTaE@xkG?@s*6kI|9
zFovjJkfy?g3B1<1^8}a!;GP>#o_sy<NjoGXVl@NVGO#12ONhcm3(b@t#0d`@bZYvv
zY3Vq|i%x(%iE!iO!6PD{?h$;O62haOUaiGoed?jWv2|+9If>V(ASsORv$LC6nT|X>
z05k+!HQgz=1|itWLTn7hxqz7}<loN|jg#g<M9L-NkC+ZIfzexJ5o3*@Vp8mT0314D
znNcSx2-|u71E?K31%F-wEx?fSRB&$w)sVblt-$?%JN+|hg+e9L&9U;A<(xTlYyHHA
zo3e(P3k0T-zYO<$+g5A{?!z18v){KL4gLpSJ1HjP@l6Ajr`vv%=ld9s&H2!3L;gmZ
zha&8{qzuEV3#y}w#iXf&f3pYO5+*VK&rY&ktlnE>5uDJm1(oJy<!u^{K}RRYhJ@>X
z3v39MLc=~`qd6#IA7|6@nH2t}5h{;#?YRox-rp-?##2UEL<J5n3pUeAdb`HLtHF)+
z&d1k<VgLlze2ZFx`FCg&+2hFs!Th&rzt+N^xOoKEmkm?7xk@SX`lXjO3%EwM!jy-L
zAy&3s)YSMbjW#arwS_upA5nb-@qp_vK>N;Lny5AE&+0Xl<T0T;?u-M0sX!-%nPHBf
zkayqP$*IhBgO#s{L7$m|oyt~vv0=c2RM2!|bEJfjy8Lt74jq0Y8&IM34*PuSOffJ%
z>Y-<gj=p`aWj-52c%W1kE(}~$8-qc(VbEm}_D)zwCnf4ZqGm(1@*-$2KN?8YllaPq
z^&RGiM`TfhQ)^wuX=uFqDnI(h$C?`1w{LGIZ^)bjQ5GKICRNJGlP6t&nEnyqaH|Bz
zr>8->h>L}6$|CEL5}06`7oWO&+;L&=Xf!GJwEOi#RM6hvcs;BX+yvE)lA_{Wx2`ns
zVhygH=7b)KCnoI)Z6XB%O@rV}I7bn95LtG;Fw(gW`RPN-pRwQ>JXgkz2~65!hcLEc
zpEhK7ettebs259Iw=^d*G`F7g^*{R!iWCGVMf;i<pzXVsnJ-yl1gqUCewHp@{v7qQ
z>Z5OgFEi`3ZI1kg*uF%{ivr<_jGVAZz_nYO6wiAZkh`e`CvJ45lt8K?+wp)lVx8-m
z*5m@=Ev0dkoj5p^M7w5B8Xubnx@iJ0#!xpsv@~ASWP<8Ywh?H#5x{|VZwoHTHZGD{
z@tc!gS3!&pEKh3w)|Cp2;~={|>w(1t`*pn%U23kZ&&DjOyOEStsCzzHQj1C}hu)Pj
zE$VnM9;|?ZsnE>BhWdwwcJ96R-~6b1YGqK+Jbk3(yGw9}78hZH7J+!e)UE)X2=fFe
zmB2`(l*P*&jY$g@9M`nvy$BOb6E2=N<nb=p>A<3Ty1)7L;JR5oz{<BNpXZ@!m>ut`
z23svi8fChykOb>R_rUZ->Jb#U3`P=oZd2&QI;U}IG1%HbJH~Se1;it{k6x}!95_!N
zWZqExFp7)Q@~TDjC)KIowZIDRsdFLEE+1yYi+o2`r!b0&dfvp&k%!ICvqR5AMUV4d
z-%|a+D4!*KM46JqWQh1u+5SKtT>wfT(I-(Y>=^OwuYSoj4Xomph+iZn<V1~W4*fgR
zMrC4wD7QjrjPhe48Kv?bckaA=BdEwU`quxn0IJNEVMuFQLB%gKK}40oRQ7KM?&3>|
z1Oy!M27444?5Xgq(tFgFxXlrm!-WS;iJ!Zo>9fpa!_Lf7ji}pFf5vZtf-57+I^dCu
z0uF83E%~^?beX;xwsK1I>y{imhWh0sg*t`L#~w01aq|XMKQpbXPo9Kw#?CQ`Dej|r
z{`+XgOL^dut{^~lY*Nh}93uAgnsoeLx8U38$LzrnPFr7^w!EZiBd-P^fIzYnbJ|h_
zUw!+nL(7qK8~f)Ezji#5#)5(qO%1bL!9;o1(8lM1c`O2VAQ_M~HJ6>IucfSq#)*uM
z-di+_XSHwNqrX=qPpuq~YOp07RmFW7G$CP9n=)(;7cSN#08ZdJoE17B#OO;|!wQj4
z=#L;ifN{Ql>^e3(Q*}Lf&dBdiaws|_l}c|p>ENJu90z6)v+)=k3p2o>X3f0c^{pLk
zJ}>L2tSqa{OWu6c#_%`CI&7mD^~A?}rB$@2Y*~t8`M;hEoMW2$a>iA(->)BcoI-JJ
z2}sIIgcu<GdTZ-eQvAa7b89I>#hQsCgdya6v=>-P-TQk$jG5Nx$Qj3`{6*P*gpwHD
z<{=~mjUZEr=s|y0IfziNd2?ch0`ZGRkZLcldPoc*n%Yvx0@`YJ*Q--Z<N3?gkfM(H
zQ)&hHvKq8PrY}sWNg4NeFLqD}=4}@&pr2{PVd2keZyAdMihU6;(W~+zYcT@&6t{Wy
z{4p;%V0yPhVaNmb93TDrJYD(t0(`+*CuHbZyFnL46k|44EBG0J3AK)_)FRwr5y4Ez
zwT-{jwog6b%>)$9qHF;p)Pr8o-M*UJb~%$mdt`!@cSufLwtRVa2ngoGu(890ttV8l
z1E@@2%M0Z(gL`dAYJollvI%tH{*O_cLgPmc4-56WGx)T%PS<Jsm(UaZcG8&8z{$5d
zDBWr7SpN&03G_Ka3QN(%2C(_YL93c0f+E9TvM&0&(q$InGnU<+OifisUx{A=A@}|_
z!z-bSWLM)&b#HMJ#w75+MY0IQ&O!1aKa}}$hC?pBz0i=L&i+B3__`>&#lBzf#dbts
zFr+z-DjW`uxG*@wKNoEL)7+q@tY>n`h-AGrD5w=c(eJCsW#UD#z5ah<Idm4nnNs{q
z$$*v06PMy~XOoT}-$+ADe?x&G_ZN0p9U_gJBkjC2`;hY73z_~!xW9kx^+KvfZivkL
zLFtR*8o!@gC(HIkzQ}nLqCi{~8D3m8jX6LyTz+ItsZ68+MvCxLS|)mOs`x#7_AI3r
zO09c85je=)*tmhn4?uR}_Zm(gtY2+zw%oCKuT8(TQU-@b6SnGdJ@A)=t}PmE(9kf>
zDoWu1i@y%~HW|19NxFYzS5rOjGW5N|_JC}rF&QZ>6Tukj**of>=X6Gq#92#>cMvcj
z(|?@eZBV@S``<F%1+^lV7j2<c;f(bq0Y})$frB88G+&kkG$#Au;d-3^X~k9M&Gfzh
zCO+PtJ7o6HQO9dUfDIt_vZnG)UFDmm17F=55OB$5;liC=4)e+C?Y3fTaR$?mo=Txf
zpQ5WZ^`=wsv4}@<fGjfv^%K7-Kziy5-DS_mh^vn{!xVmJzdB(b5~7W@+UJC+&mJTt
zI#pCu03jBj`Ah!!t->du1Roh*v?8dHtgidda^S>A+95KNtN~((^$xRVn!W5N>*Xf+
zQXTV#$h9wLrG0CUTj#Dv-m6!eKJ(a$6CLxUATpE~=hl3EKVnBDxsxVAw#vj={T`wh
z?=Nl`Y$uU60__uUBJ|k0sIM(^b8`vFOxK;E*G9qL;~|^iMJNnWL#eOayfJv;eYY}C
z1nPNa8jFiY(0z-M0q(mK%*-;+*z}mhtN#zqI5>Hh-gK?tR@%OP5Y2{deLUB<4K=$#
zQN~~Sa&xcn5)6uwRdt|?1ATP!sBL6WpV<Y#QrSdA<A4rlft#Ce)IpEwsQ=2M<BMcQ
zK=?ZuGr(y8TPbCc^-*^JH0(a$$SB?44z-9%z$Z#xyl{^?*h6={p`O#Lpp#KGAoxQT
zt~y?GZ)VAW<^yls61zW2Ql2H1M`eEe4k^s0PQB+`Y>-s#qU;u=abgk;bnBr<!L@&z
zoY0TI!#9Cxy!Ivkx-Vzurf^Jb1q==`rm6MtK#+HXzQ)FRzjlo~euaEWD*1eHVnKfX
zL8L>(5FsVGhUMoE!7<9XJIwVTEP1c%s)63mD=RYz6S<Ra4LCL}QW`rNPEv(9c~PIG
z*<tR|1}kPC^II$)k&#f0{?T3&+BjvVb|-11A|l52&bRr{53vg<>{o2}MiJ8(7mF_(
zc)L7Pxph{43#_k-R#!x;TBJ?LHUq2C)BOW|PP%dL-r0luwS^Q0_3RDFKqW6~>H4!2
zJdmrpx*Tt0&^~3}WJgCWlz3i2`cb5G;-lbmc(H6Vjd;$Bm6?p+HR-EnjSK*uKC2E|
z6+zq*Zx<^okvj|?O!DcW)y%pV@asG7BE8Rq){NxH79oj<P!?h`4E<)hN+0py5KaX8
zzv@X@lfoDV<u(LdkZztO+&7Ngtb!y)Py`(t(r7x@iHa>Cx($w68vY*!Zg+3FuSXo2
zhc3Cz>~Z8NAPe+UX%3({3N06@vQ^mDwq)J+^-*oR0g=_yjE{w>_+0l5%~YRl4R$Lj
zi;|86Os4ogtc)(Cq54M9C%}nm`H2CO<IbMtWp`f3?*RJ|_>A9@qScTtr1(rtTHe-W
z(~R?OH`|c3k=b49pI!a_;C)u>evh6dTkW62<1XKk`mLgS=4p`YkInmd>njvAitn(X
z>oLkeFfXfu0|x`K4n^mP2V4o5pKskwUuhdcUp_@g*HEq?@B-s<j5BwnYK5gT=9!>s
zvTONnsG{AgDt~L8n@^vrI6I5E5T3&0kPaXKtUqh??hYM0woz&BHJCb4q!wI+){Bxj
zv+dN>hX1pF4oCwWJ2Uga)<IwhQpHn(O}MP8vu*xth2;Yq0A_CoKkyl)5DvG5!=E@~
z|A^_TXXiH|co^s+DmISj{ogw$556oK*qpt=i>9r5;q)WS1`cX%;gO{m-QLnUAzS1~
z(X^AGpdPAb(ng-l$q8EF@eIA5YhHVBOy<TiDPZvH5;5W_d@}fS;%N&XL^Z?dDtMIF
z?S)s9F>f=Ig6dz-jIr_jT^Tnr=(u426WH*UO*V?k@XC=+3XK|Vh@p{ILe_6?o*nJF
zr9^ANp-7#OjPIGHD^A%tIE+OV$J96+IbD8&RiWnklI^nBO{iCahp1%hNiLM>VA(c6
z2}IZ1al@;DcOT9juz&ibrS5vn2Nf2k9PHM<ZCfJ~lUtK=dMtEuGO*9P7#Mi-NcL;W
zADW}90PH(6O2ImTaF`#o2V<rgKeyub{fZ8kNT35Q{ll|C2Qxq=Yu&M>1TOT>K3got
zIfYM)>~ePkd*Ze;SG4X|KT-q?oXCHHdtkoUNInVz7DR>pU|tzF7x$hUcE|@M$k_`Q
z5L$0QcgW$6SbAo7=f3teqn_;``d;F`2_(Rg0^d5%6c$YFb^;ez55#dh9C;BT!o}ZD
zcu+NFmchxTjzx1@u5?=B?5w)Nk(e)-w(7>{=)Ymv&}w(Y4F;$^p_d>rwzs(Sq#eKg
zt3vSLzIE#s=f0K7?n?7(j~?wqkXG=<(b@pPAWHbgDf4nbA6&m3xisqTUeILO1Y~J>
zK-0eWUQ?Pr@h3oNEk}+LK+5K}o}QbU>eYVYvU_|_6MnL7+hx=9yM<&3p7!f-ivErr
z{|^22%iHms*(b$w)y<w5>{jend8G2MBi*A)EKV$re7a@sgV>9YEJGgcv$!&MiqWX`
z4Z7daYP8<3QF^;u;btAq?egglkglIL@=NI;4Z}uFmlnl_#CCjD#Hy#y;oh$fmAQwP
zv6XA=;h5TY?_PiUe4I`c&`)$9h#6(q0am;x=)I*irL~lQ0dfQ+pL%l8g#aU$3jf&_
z7LQz18nIXdN#7Oo>zP0z6kJ6weA+i3yTf=12qk^zGuUq+lvb3+q7P=9QsfNumOv1I
zSCToH;ob4sKO_7R1w~<a4}isQIN~qbR^<ldvXY)b-1eE`>4gmf=!V$YW@R~GyDyXq
zhU6%qMJPiKY`ECn$I30!B6jwbFr(OOW_oi(On|m=0Y49d9qQ#W%dZB11snDnWOsh!
zkv?GD@aXgT+i-{DuO8|b-o9WS{|sLSWr0jf7p$r6ye}Pkq7@O+9xV9h_u^kgcb5h<
zxUyG}RqU(CepCKZZrQniBl;}<2YuWtQbZlDjpNxD537e-+waH|g)#e|tPe8Na?U%P
zm6=JOV36DzVhwEKOz|BI#*l^nHe)|@Xq&i#;FUzPEhARs95JgBdQ&Xkx?<(wdiBsJ
zKV%yqZbTGW8jg(<7p`zisHARSu`qS3w6()QbW;+&yec9HTB00fTB{9GbKb<Wn54JQ
z0rZj9aj6vmD@d9~L|yvBnoRwi4{uV37pm&y6h0mIAR$~;g+_5U%$`uk*doyU%Ptw%
z1Csily?d9H-_@X_lHG3zvam5+bNRGEe)Ij^Gb_W!#@qNa(VPCX)t>C$((zvj^8>O$
zitzmN>~4x5w)DB#wC!?>dLga43-tz~D;vzf%$n=f4gzfw?!R70t?u7*6F+F>nJm3X
z)o~U=Rr^BNsHOU7EZ8bkFJyR%l)EIBbZWGfNSnSgeUFw*)cV&K4v=A#XO)$G8)eP2
z)$Q~&E8lPbkeY=4Yn-A2i=t)Ao$OA+4_ulG%u(?ZW6qzofICr~F-bO#795*XEj)7R
zp}!m$l!wSqscqXUkZ{DIZlu1Xw}}-eSO03?p~E?5M(DS2j|qP7<~hg-w(WKB&gk6N
zkim0D=%cPdIEX*!!me9BuSV~))YH?EiWX2t38A*9=O9+D3iHjC+{ykSC~bZRwuwyz
z_O(1!bl>8#vfV45>-o(VIcDz~{&}cbTmmD@x;7`ysQ*?^UZgAy@w{;A)T!c^Pn*W{
zVw>Gw)9cKnkj6w*I%e$H!pg-=E9S4`X5D@~0x1zYv5_V<Mw2aK8wVI=f|q~+QL(9a
z=!GXNS`?`};PZyKd7%J~>rI?4HFfilm4e|SK@c#DU=4(GgFZc?;K(0y2Wh4R5#i!8
zss%Xq{6MNRR|OG--oSwoI77qFejaLH@$nehl7!GO<myvkx|ED8U$m|=%bMqqmzOu4
z^0#yuMKRffrw2d|{!H@oalSmz4pk4r2JuFbJ&&+r*I0E9sY3FNaMD*yxUWBb$)`m3
z@~N4P{QlSeHKP)FZ7{n|oz}0YTe4o`Fj|V0Zz50WGPsdcof<sMxlw#qssni#J$v<1
zAbJDzX%8C1Q(OCtI}X#)P_bRR``H;Lku%;OEUpWXaYzynOwuYgv3<da2k^FIZLoT|
zS*Egc+WPCPUo%uPaO-*C;6d6F`@%#In02X{P*IWqtdnX15oBso&`_eHlso_~ezb{E
zsqw&@Xpv;|7an8uP9k5>##Ctf5I*LQL8+h!=p9>FyPiLI(CSoK=b&+-jxse}{5k$D
zm0CAL!=#KC@G6X%)jNIqv=lL3Ic9&FHn4A*&Y3y$Vd*6>LQ!_pAJ_Zko!zkUF8$i<
zh*{yG-Ctk$(>e#nDb3!cPp!D(=H4cIt)3)wFZYELBlw^!*$!274mHWaxjl{gVJOQt
zT*?7=&C&1FlFmpRV9%f(1q_q<M}m>TKPNX-tS1|D@*sK(->Z+>V09RvXhJ3gO_goG
zJsG9LKHMEGjXcFWELtX388^Tm;Q`V*&rEet3=eI1IQ!Mp_&00vF&N?60GwS1t=dNH
z#|hC6Z=0|;^`ch|MKQoV6hR1JFIczN%+COzNMU8)uV}RX{nw6^KxPR5o4DODxepLD
zI<04@@EpLi$Par=O9z%J7(n2pCcz$<=i}r}!f*h+ljREj#`Ue&h7S#k$vD@ddGo=z
zBrLv|cvX+548o7|Z#|u|^DW!7NqghDVk}+~118sH*!)ZfU485lmjB6Z`t;oTd(T_>
zg{mdcWS(D?t0j99@N4?!(<As+sP^v=S`V@3zhO=YK=_}twsL7@G@OL-oM_3}52LV{
zH<vYt=m!%|erLpIi*?xCBli}fglljFwH5pn8v-~A0<Qq8lk@QBWy7=zJ<97jg9Z#>
zk@_8um<&e%#Uo&@C5DP=ot|&PbwQ4SsbG+b9QskqJ;VJ3)LoB_lAJ47&%LgyaQ!Ei
zuH}O6*7#*grWpYyExob8j0~JR;_t+}Z1LT$qjhJkse|{m9Ol|fua?NM2EbxTeq5{R
z)29pQz^^b_^!CG=P&3)lAY382L%Bl-F_RDn4T{>>f74hcCi{MaF~j0q@HL;Kw9rIC
z02z&c)vY5&8_BtVJ5>C_Tp)#|S))`JyF~7g)OI5CXAMQvS}iUBPf$=+3Iida5&!Z1
zh}OWR+#a!(p|Ql8`qY3~Q7{#-LYFUJHqT%kytu#=CVA<Dwg4QWEFw%GR{Fv?JdoR!
zYuEf>48$M&VO4o^z7*IdW|1dAnQ21#6X@ZP?;9OhfP59Tr~GtY08>RMH@D+cke2s)
zaohQh3OYOnnXCc54Tt=MTSz+8$GTPF_5t1hI~WEXsk|1gw|8}2o1wFh%o}ud;Q&*^
zZevXhjI5;m0mB5xsw0XdV+JK>9*|c(_91)hFuhMX`5oIOy_Gd3>N&OV-|z0Obm7yp
zN5vzL-;-{JN2C@LeA~Y2Tk;=22njyabg(P-OOoCfur;U^*`(^NSLtxeMZ+!T%$WcX
zzN6|*^WPcpyz<+LkPD^kszXi@85=7aK%B&CaPq*eWgR5~qEpslb?(4$s{{(!K8aw-
zxGu33_@V}46~|dG_Q_>kgJ6U=D}*cMrJwE!wKIf`N29)SVt4J?a}66}8Tr7gkD0Cc
z3o08t3msCksFi;H(7=TLm6g#l<%J@lR0thj12YVuq7N2mD$tL<#r0TL2Z2%#20a#{
zk}bfO7Eey+Ey}1Yek5OcwbUApKqg|0i3Jehnh_n?GZ`qR7)HN}$Z0BCbjk{r%rJH+
zi$D7Vf=2zTU>)v@FV_L`pb+OYtF0~raB^~FARMdKU6?bVF_sO~fT*XKS>MJ61Nu5v
z!utL3sII#6iCKlGPY>V(7fj6#X}_7MKmwJwTZf50-rH>E%z+1|zfL9i_(w)+i@zbz
zg@8T6;S+Lc6~~#FY})huAK*FIU%clOsAeE5%ecgBtcfkISaGpwPUiZFAlEca<tq}M
z-|u<5Mg6*&;*dmYZV<IO7h37}X}P<>`Xh!h1!_8eW=2jR4z;q~eP)z}e5H3Xq9A_P
zyb**W((4ho8Fc41kUkvC?;fQKLo}+Z5=1T7z$75qf*2Js(B8_@EBf=e8aa8MfnJ^k
zzy_+S2R%I&T7^9FZKFP{6PO*4GS&dxAGF#-hsS^PX5mTUE5$dJs?P1ZcyFZ$k9W7x
z3T~^cthpBrGs99lw__oMY*uUY0Z5F?r9+HCdq_tF7s>l#)v#knn$C@eOsZ%QRTy{%
z1weq=jlztydT?%ZeJ)!Ia61CHx_@Zg%}Hivi5J^51H&JFa?!aX;xth7g5BvkLB<dS
zO%N$i#LSCL>p5JX04=~GZx^doC4M0J@+vfq_rngbq&V>8P|3(3B}1VyM1$P7IoQOA
zJT!1h*wZmQHU_d8#h2)u0sO^%o8r{a`oI1M|4<>%@uw7gDk;hPaCLD9evWwOh+GHn
zb|z%%FcG)`gs+lB&(eu`Z!soh;FyR9k}Q#1%x>wrUjQ{aLVr}<Y2B)o0}EtjFBk5-
zP{W%zR@7%WmvH_---Ud|?sRVa+5Qavp$W`=@ZcH)siKz0=d009zt!k0@Wg_PM~UDr
zu6DLpHq^hUU2d9Vrs3Gy40P|C&o<@o-_W08OWUWb9k*nS{e93Zr0BS@?OCkvk3M2(
zQW2Y`#Y$QOgLu#Gc8s2J{)fH0ro^r6NYp}V&fNjoZ(WMG6Ur;1z;M=<*U>{Btv)vM
z%aUm4S<ez-o_n@EV+e_Iiy;BpFX-WUofE&QjLQIloVX)5+qi;vN=2LhBaJF6q%{7*
z1@Y!}$1xmp7z9^Zcn5-G(lT`#wo(@j5U;&y+v(lv-TsFyhf=Q%>h(BGDD*$w)70tH
zM+2B38bY&0<4e<9zI^nAsI#g%Za-0_6s~s4W{(Y$aVTx9`O5RAxW}skm{hGbRM@?=
z+n25!C4b{(qpzEB_PH~-I>so!{kWJB2_f2H8Dt=lSx~%Vt~`1aH!Cm0zQ0h%%;1xt
zp$1@0|H2Hhx8C{o;+RA_k;Ud5ATribm>xz2s934xNJIDois21WjQS9z-7VL$DU6OH
zl9SP79^C2=-PG`Qr(^C8{Q=*@#Cae_dj$nu@F1Yc=()-qX9;BF+gSTU;Y$W12FSGL
zlo$>TY3aJ`&Q1L)dkA<TKz~BiCMRR?rNU&A?cF0rYB8VL$ip>3$;QU58drlvq!L&H
z?BM(%ji`Eci2eg&ovqM!L>3IA1cf+;<Wqg0PlACPm<F~MD&!TFDPQ{^J??a}slIpF
z-9CELJN<72&AKyghl))TU)&isqJfbUAqnB=CPFUvY0Zaw<0)rNZpssfa06MTO5*G`
zm@s=r)I+soMXRybv)|A6>^0c4mqnbqoAbKYl`FlP^i5b&PwXy5E=;4^mDpSHx$te_
z=1tW5VFA~wT0l|CvN#%RB#t1l$2CXLqhv2??@v+nt@5l`$8&=~rj9WB4Jsj{YHX!o
zB9y7KV>5ckj=vatpwIBq*b|TvV7~-#esc+^J0EywSQtBe_jTzkF@_Z&`9aY<7+#YC
z6&D4E=J91LiG6wH*A~--|I-5GiWW>PPKH0SIhA`4<sh++QwRLbjKb;<Ka<~grp$>~
zinU59C+LERG+gbvBEspUqP~Jwv`=$a>ufXnEt&Pl+9c*uUxKAb{|YX-$0oL9`a{GP
z{I|$CX0nx~|4o#@$nM12bMM{>msVT(n-T8j?YKe^^;sOtG#CUZvwr@ROeil|B?Y0)
zu&n0TChIeN=94|(rzH0R<-^Zc9do{T9giW7L;od@b-DSpWr&UXFTC50NSB!0TDBf-
zQy-o<%SAFByM^AoPe6Epe8}eK`w;Pn5V8!Qg`~&#=1OrF6;_Pd7}vltU%3xs<PfNs
zosWvmV=`s+g5m7Y3aZIe{tZHDEq4I>Nv_wisaTUNuWQyQj7kzWx);=*PAoiMg9D2Y
z|I8pkmdg4ZfKM^-V5-QoPv1|Py*)in_d0tn4ZZT^2M?6Fqd5Go&WLs^UA`)|)oc>&
zk7`@zqJE6h!(eO1);`+#nDU8u!rFsoR3lg_h&MWvrO7-ak`zx|O3kKCWt>LFqit-8
zUHy|$+n{WMbH0J6IF?6DmEF5H^-g`S=qfYJ{QLB25oSIfO&xK)YM3V>LC6mVR;A{}
zIh^V{&uH+*j181w-Bp5478V=OyI8rU$czz#zN<a4>c7Eo6SFNSoX?&)Q|^Ay(!J5)
zk-p(*keF-_K-(j-Lw|q7FzX<7;jId{cylA5a>Ox9>Jfwl^t*p1_Hr1fmmPfM>V_7Y
z%JUuFT3HU)e|BSYs8@GLBBPB>Jl;gC?7@Mge&>Wdh;j!&_O2>iY-e}B;#{l&9h}Ih
zc79#y+r3fddeKTtn?kGuRxdoHd*06W50v-{Rpdh*SP(#~#%FISeejzWDT8N-f_AZQ
zn=C55o>Y2{j{R9-$z;)r^HHlNT~JZ|xH{#F7ElW}+>t9x4*~fvd)ugX=m<weJ!~L;
zML1zKTZeZA#3O(-_OUk_@}3y(+REdT$ttld;Z9WD*a|F9U=&YX3c5o)K435Qu=%xC
zH-_;?5_!(6dLGSxPrdU4-H=Yn+zew;F|&5fk~v~Tz_2gu_A^Vg;Z@1v1-6Z$8Py|%
zix~njP4leao*0@3Q4!DtpXiT#4TA{#nvyHKn`&c*fh9yyQEK52{r>&YVIxvaE-Wtg
zt{yJbJJ|+wlUYf*V<d$WzmU{HHdNcr0_@~6Eys=CB6eOD5$oy0Sm91&$sSFc8Nv^;
zl3H>_(6}BHm?9%XnjTl1DYj#%GN=#VQ?v^Cr-v#4m2;Cr(Y1GUHydYm*>A1wxMkQG
zTAlHJ`_vLh8&nT5R$qs2`BObKbyat{nz_Y)H>}p?^^W_M3RDQ{WCDs}N3Mcd*wDS$
zHZhwU>lNC@-qJn!;)M$zh7_EDblktBAYm6ShM)%H34ch9+YMq&>cp@VWGy67&pe0<
zF<yp<fa?*-I`h$(sC;1xR#QVC^bo23gfG*g9ttVNV6M26a+u)~7$&f3J}~;;LU9e1
zZ8}3x=NPUxKRQLEFhsH5d$-6YCmIJYRUIZ0-c!<tA&{09Sx`MNV+8QB>jZ3LRH8J&
z63K9$+4wT+1J-T_84I8ITn9V9!Ek7f5YZHbBuDWQ3z@f<EZ7}yS)c$8sOGMC#6H}(
z*TbO2X`LnR;5zhY{j>ZPBs*pzC>z8KkCJns^P?J|Ryv$MKwOSm!^JOx$oz%rnJ>`r
z@C}V~hSj#Dn5=qZ$s`@d-<lds1DQNqWR>EDa+q4EuTAP9q#gngu}QnID)}}&0oGj5
zLm*2o9p|;PmD-n`jbFa3T+8yU6`_Wg;;k3$@A{AW39q3BJ>+sOoSc6sbc5kmSKrPD
z4;;9bJ@Yrc=Gwimbdo?P>yM<tT+e6ApFxt!57VflQ;|YmsWMu;xc47B>xi(9^CZ#J
z6ry8H&&g@Sw<bx@&q>`uEyqHsQL^}lG3TEB?l4!hmhDxzQE6n|B*h8P`RnHo*)zw7
zMxraTOGGhQn#c@?SUFQmgSLz2U+jWEGlee8kn0#3gmH)Y5+E>M{Qyq{^^<Jw;QiA(
z%OpsZpIO3^g1JZ%WwQowjlc1|MF)Pn8_oCd?y(<~Tp2uHR*52h;r2{(CDTqMNs}W*
znN6+9X9t~YTr%?4SE1~s_2<~PC)7ySP`&oEA%4Sm%`cz-Gf63u>EAS%>?~p&i@u&G
z^@{F9tb&9R49Or`*0B|~`;Z}>nZ3jEiRWA3`|Agq*aeFhizT<LLPfzVn>W~{M@Rjb
zStI6gXe|c06=ljMixw?{!*7cdFKqx~4=LR1UQhPjy}W%an+QVYt=`Pdq(=*&Mg-zb
zKH0?`f+2c+Q~f@Q1#6&J3`6FXl5)Rou!!1g9C;zJL-FZOW${Yf3z2IKP$Au2zI}Uw
z*?~qs#yQ)4B!*IBP#!Q7WNNrgK_$=qkSLr$Ex<D5YD|kfk(}IxSPbxr)H%g`P;u7_
zA0?}Acw64r>E!;57LC$@K@$pZ<>d6jaACZLl6d=yF9#1sY-a(+P!*`X%EPo<Fk^<4
z30S|q@eI9O1B3jbEXB_PqX?e*I?Ckx_uH{{l1<=uWIYh!foSdsZ!LpR6xq)y2|>Oo
z|HXxz9+#K43G9@V3@PC?v@GPdJ%<#BY!)v*=O@Y}g|JtAUGPoXnG#-|<R9dPgcAba
z4RC{A;VU<wazuQ8!g@aj{-e-elMCn)U?1KNYOrnd=4Kc`%tw^^dlMUP1GyYEgp|-c
z8>D1Z9+)+V&oXoqEja%Hz&MK}4ZdL`FO3dO%fMh(yZ)uihY%oTP`d8TthB|wn!YM6
z74^z{2_ds*|JCd$>a{C>AbcD>Co<e$4+IXgnfkuIzdUo?={RMfRPv{%&fKDnv<c8!
zQq3S5lNcy50zdb*6iPA+%j^IWsCr^nQ_PAFI5X_7FT-k>LSaEpQ9_Sk!#X7QqiVa8
zvlUtodn);&<dw#wOI}(jcS2W1;b$WAJv`-`?p9v^JbC=s*2TqV^x92a2Vm5&SM|$V
zFsIZ~Wx&GA0f+uMefp*BIY$P<;S!0{KGY!AA>JQL!@L5y*cblk{ErMEp(0^-%PKZM
zCNS3ebkdTHj10sKNpIX9(WKCzlRCde-SN`=qqdG~@gi4dfO#31cMyT)L*#l{zk2!d
z3<;m_DsUnd04g+5A7KsRwzx2nt$4ecQQOI`$Id(wi)hR1Xq{p%0iIaOU>)2bo34h{
z)m-Jm3Y`wQx|xH*9x2Q`Uj?MZKU!)=!+lrv+DZ+@6(=6LgJ5j1+C@rnM{!K_W?Jab
zt)tlvMtWp9R165R2(z+0SYvbmMqwtzSC}&msu$`u%v2T*8#XNQZ-zi7SzZldAuEJS
zQbtp4@(SW1e0V04VW|Q*s*+n60$GP35hg?vsf-8~!luZG0jl2t9jZXC2(`=j3@aAx
zU^<?)#K;uH2tY)sD7{;%58F?H^xKw>%0b*zcy08yfD?*<e5f|IA>Wnh<z~&A5t4As
zknvfT2HC?Lh@TW_A3eYKg&==FKa+|hxND+Q`kP;hc){2oa5_B}o?@e;E%?#<2Q1os
z;p>Cjw<8OEm_w5-qvA~+=v2Zw0HH1@+9V@F*~+#og|2G^A`uH8?Y?Jtw>X&$rJxPV
z4isB@h+Iw>(ndu^#cr$i0$t$8ftlD5czYq7s+Rl31^k^&Er`Fw*obKt&!_wWY?isZ
zC%y4FR+E`LP;piN#wpiXcLQ?IQSLN!#Xsc+4U9%EwJOboh;cw6erSYG;!StBX~to0
zWM#D??EywmL{uv_OdAO)IiJfV8z80+K{8=LAD-}PWnG?(VUh4DW0BDQJNM_tpC|iO
zCmSN5kkNDb<EIXM90^uUnUH(`zVX<xV-@Z#XB#WMi1o$w?DPmk+fCfL-}Pb2f<-^F
zXZD_X>Lw8aet`j-CT-NWojTPE6h%a3&z>#2di^EmO9sy5zX%pY!GiS#+$r2utLTX~
z<yC`ozlD9ckx&d#gmaL*Ny_qJV7N5sqNc$gV`N=TL%t7l{g_!zMr75hZL4cd8scLy
zu)tKcUE>WE(!Lek5vB+(X)onWIIxI@<MLHi-(<_!ALn7tE1Fqb402FiLBX#1S{kzZ
z*29BCyr)jx_40M+cI`G$U+u`d@N@?y^z0<hX!{CL$m4c;IV&r=FePZ*@jYed+rw3|
zs_`1;(}r2k$odE%uiNTUU(M$2bD!yBWD94FtoC5ef$0Eh3}P;&M9+7vbjp%jB6=sk
zia8JP?-VvW@g-%A$78lTh;t8we|LZ>M*nsEFU0v)_r1mhLZotbe4nRVP9lSmrU&<c
zi2T&y<f66gNvq`ov5n?7JJW2y3C-88P5n?LNa*IxuivS59=39+g14pfE#9=TY}AA-
z8oKi5BQ{l~x9CSZ6EH89`I^?qjZ+R&>w>a8bhCRk%ELAx7m+Dx^KKt8<l-x@&nQ06
zX}ogL-D;I14rSV2MmADzF}fkE5yYnOs^9svg{`8o|5!7rRAzM0JJs*wSY$#MB#S3e
zn-vb+D~kb`5YRnxJu_UIG4RxC-TH6;CKU0jMw>$OOe(y>ckjL*RlC=;C+|+Y*u;m8
zi67<w-1hH!&QyWCav;<utyHebMnt+CW>k4@s?!21#-eEy&r6aV?_J$%0Y<V1B(#!(
z{@co>Rca^3*KD_ZMGCM3-k}S6|9HU;y3ZAb`5MDecNw|0UEn|J?vUTbLyE4Ro(8lj
z+>3b+7izW%z7<tkKTvc|m_PZuO3``9C6IVBDzZUpWKgt_)(#*!XNc6=D;CM#Q(13}
zX~OA^n=GeK@3DC?bU&ynP#Aq*h@TlW6J0xn#7PEmiBzo@gdy;ga0=z}5_#lHUIQZ{
z!OS6{K#*P&#f&Xu#UciV(Z=Qe$+IVMa}Qcid~nvnY5_eO+(E^|Z+YMY|B4cXLkfLx
zyNMla)6DS6E>+=D@AGZco^mFxF;5_Kp;$f*HZ-gi!!o9C=K22ckjaDW?6)(^U&O%T
z#y63JVFiSA?PR0%M-IRKv~{toYih)`%s6UtK#!*p?-Fr%xM(>Eco(Q%{J>;S1jGqX
zd&|Lt`5w`SsgEcvXZ18;J8<*KVdf05f(OB$6_q&yh{tHhUm3Lk+y##B!=m^32X2nO
z>noC?4jmr7HFV30WD8rwAuVDSt^7IWW(BE&hh$IV<5st4h4PRX=Bg*PGmJDWMqQll
zCL3l^WL^MffEo7u=>2o(a%lb##iXOrrTA^Dq}1c@X;go++1`W3Q0}UTCP*z@uk8$;
zdbk93s^)aq@PXqi>*jV#n0F!D8T$^!f)Tgu&*|iWE`Pc|EcL{RF;D*~E>t=7Fsb#=
zvaCWYIxWq~CTxN7v|B9e5MRNtYQZF+TIIb{JFq7n{r()>!FX*zVsT;blBn%A-Wtd-
zJboHO@OU_}+S1BObh`}AGLwesp?Q8Kvua4p@viAiw2(#0)DWRC<I>w<oFGo(<c<aC
zjTsS+Q&Lgu(&dDc)$G|TXFi{6rSVU^ssW>2!f)aL$0@{3g~EmE^Fl_-s(}@cEn>|Q
zk^`3f_}pX1u<oTVShdT!Mxui>LFhPJsyoT^)lm<XzARqkGQN-W_?v#KfS1;mTVz)D
zF3XQzXMD6ow7qy*sclE}K%@oDE(Pusob}JlR3^fmO%Rm`t1f#C>=yrOcNe;>!?73R
z;!-ub-;54G7K1ieniii_sl6B~u9y;aP*|8%Aidu;%@44g$0EnJTM5G?bV*5`#g`b(
zB?(hSQKsYnD=TBrMqvwii=_tJksQ<UIWazwH}@9jJlHcVATZFzAm_tDBW@YCWpCaM
zLsm}wz!6D7h8iiaiBw0gepUoJ;74O^BfhCp<U&hQ4aCk;T9;9j6rW*;wAMgfV@dNy
zzx0<@d|WNoT3+>BYIO*pyMu$p$<tHm7{%?4z*N^tw?5$~F%W^%XSEn$7=^hoqp*d)
zy>e+VL9}JvGp?WS;?sTv5wZC!S1HRfXw{(}FKBi{iW{%g(#OX~=vXjzwdAV-Ejw=@
z%D}V0$Um>D`n#D6?GvyItp-1acc;(59Ht3ZsKLdrfLPU?C1?b<vuulIO7o_wTk@4n
zZF^|W&%$bjj1Q&5^#URCyw&2V5+hr?JIx@SQP&c`-agnk0mmUm#Ux#dCW+n>8Ju|b
za)TLX=$8_oonG`QAg(&`V=>cr@}+*a)|gU-<T0|eo5{FMz~|tbSaXHEnLB05U89|q
zE&jikY}#lpjKxi&733ot1%&>^v8A8?axA*jgle(Eb$Hj|I6QCNx&Y8vq@5CjsJK2=
z7w7Ornl^8q=<1<Aa3J0!XK$VfG+;?sAn(Cb@4){3)a%2Kw5r%ha=Du`gIRR>Ke*g6
zKyvN={c|=~X9&Omf;`vQmd^B;MHIP`Us`d=Y6`$Rh&CO3+Q+U`JACnR$EV&geDXx|
z956o=4H*I=hTr|+iFXX!1O9-BMZy05V-FWiuh4(9qzkcMmXlHv(P!PF;Su91lZg|h
zoM6J8=5iDPYKKxOEm5S+6^Hpx5t__8y85ZgM`3n+vVzKQ=9#j&M-K0@@fC~_h^;9t
zwuq4F+H^*nPd+>%3@{jB?`7=hLkBEsM=e{`QtL-hL`3O2HTOS$Z|xO_=r@k<5AGzF
z1fk#7HK8eEbV4S^^+nvoCzDN!@y_XwJo;GA>-xdrG<f*}{7<AKpyCj@A2vl>sg{Ay
z7$=r>vGfeWXp>zW5G@N=1)xUrHIDp*0k^b4cz)?JAcIPUUUlCX9T*ImF-DDMSk;1F
zoV|c{15{+Ko{mmy{v~+4Y}G(JBEvo4PQaKw-}M_h^Z?E^0%Nd)ShNE&*~B{6QzL(D
zM#?k$P8maKv{0iWrjbmB;7LPq=wsPtQfU9iOw))?6n7{vyy0S`0xGsYG!*2AE{u+z
z?n`P`xLzX=*_Ac6KT%_}RZ|Pvym>;>XA2t}--m-T<ML2!y@U8{A>}|>3Q7<(Tk{sj
zf<DH@#W57L9D-+U3;jNg&adAp7zyVZyQ&=e4@$E#);M+}57@XI7#<Lsq9jwW{C=7M
zaSUW21fEiq3XpNT4j<mZV)DLyx2;>ZPJQeyw7xVTG}OHEwg_;*2fw8hkz~Zs8Hnaj
z??-N3k#B^^h`fPh^Jrd$2#j=frz{S>x9|kAKN2NTldC@Maci(AS}AU;1|=#}8Tqby
z4g*vm7wiuguOb~mF;MJ65;kkm!nxyn!5H`y(sF?;Lvg}3LmV&$Op_pLdq8}6vAm?v
z6g<>6@`e60GlEuxV_}i*1jf++n}nr7HOWKPMJoYz*AvPYhIqth5knB3#sfYAWaERt
zSrQVB5!d_a%RvHZl_z=t<N*n+=PwmOEC#e_R|$G7M4N<>&z|WKeb6|gMiN?{Ps}Jg
z*e&lGG3qv66!XqfgGAjt!T(76_AnMhrDtJYMY2{{a#u^Wp~`M<)#e?iE*;`)Dvo!U
zoV0I0b>>X}dkaapO_+URR78MEf=!`)67m&FMUWV5kWRl04P`o;FM)Bw)f+bgII+^O
zR#<*t=IR=Ap+$VZACH+C28d^V6L~+COc#Ed&f$q%a5^TaRa(wbl;S&z#WI{X$+3FR
zV8e8jEEHLQdV7zxqC7NB+E-tE&k6F7&H|*PFZCD_cPTNkKRPHO1W0&L46=|t@1fy&
z52JpZ$KlH^sy73D-Hxs;VXs7rV691sPaY<eE^KAP-ZNBnRqyiM`e;U+Jo#zD9;RBK
zgA!>mmCw{Kw?8A*=lj!U_PlB~M{Fb`h=qxX-D6=9kcnb-n<m0MoC5A2SPg^*^soRQ
zY-)I^q~|3kWDL9qM1wtR1SbkKUFPNp4X|ZvFj5CKq#U}5IqG%nNyw;(#5xdvkxs#$
zh(9L~`g;&=F{I?9uqO_KmYL_avj2Fy&-YK<{GPkw`T;l9phS9yeg+2d3LWWJA|A!$
z{+AcA^Y|@N{g?(mMw6TT<5RwDQ~Gx+^iR0Er{|}cPxNAyN7<}?-4lfu@u6ya7RE!A
z>|I2^>PEoFjJm;IQz=OqfPv+jyLXlY!on3J+uC`S{IO>=%A@2Gb1~F!Q5Ml}^fG;I
zE=3R5|85LVL;4hP63aJHA;7djahxMJQ?Pu+v-P&@5j}j^CZzPj+i%~I@Bo`M6BVBC
zm#bZ4w(gI#h+4%-1$1+xY8CK}%?N4b7N<Ea&k(CC5P0#$t3@Iq+z3n-xsU>qiH&UV
zDmkZnb{x6r0$s^CfyfRpiQKe#b7$S>R=!k%LWoG+N<}J3UX(9zl{&GA0;Jg%P+T;O
z#C7RAnU<$e5Udrwrg&_C6_of6i6#ya9R$cBV<FAEw$|%Ps*22pQQ=JFkI>{6<8l6y
zk%gV<r;+pfty@=bLT07&FWI%pf7HfYfufX=LU%5Vx|sH&Y$lZ4MqWml7VKw`P&qdR
zBQ0FKwqu9E!mQth<aAa%e3LTmy{7c0mx<WU{Qq^p0)d~<S<3ZG{qf0)gtd{4oorZw
zkXzL+F(E;h(p(+KWU;-OIm9g~9R{0Wh{K$HvBcnDqgba}tncJh=Gidq-a_0y&a!X^
zM9&n#I+5u9hDdrP++AR71Oq^}h<JEPyK<ydt7P+IyqitZP{fu&F8Oiw6_u5*p$C96
zbQw8fZC%2LHHH;8W!+G^pW32;XaCazG*kO_c3!O0$2SiJk0nJ)^AFCPv_?ywJJCyn
z7|Mi}z@8m&(R4k$fFmPVglH`z(pX+x%V{zF^$Vh>2JtI~Cf=w+@r4){F%4U++tjFu
z?Pu4Q#|4kY%!l}2%05Nj%ge;(+f%Z7%Eq`^%THyNHs0`d;g6sWb}TL_JG0{TG=b$P
zA<=@*-VwQ)t{#Ct#OB<YZ?A)P%WhPeC_vpR8>k_Fxt&-9-rz^>*LP|S%3S5A17jlS
zDThW@gP>56w20IIjlT|+!W`UF!3j@WUuxH;%?_PBHKGBVK0+CPTZ-4t1bfIp%<@A4
z$VpLLlb@GGzQ-O~5<TF>MCa>|R+zcu6^cGG{gl`Yv8J7ZY*T3HlbO-e*RoL<YfQNw
z0LikJgINHRTG707eP{ux)cRu;74G}<YwL=bxX&^8GcsC7|2JauX1JzlL-+$PnfKqf
zqzTxEh!WB2)BsyOD=&Y1G(}!2XNOvU8LB17`WVa1C1W>*He@(_M7KezXyFio5Cu_H
zE^7EQ%+}yPE`f-6{I^~%L1))%*GA0T$`lM6$)RSY<CuQuOdN??Gz6zlC%-xu(*(W1
zpGIA>A_CU)=WAOfrIlolTIu-H_i4Z^I=<bT#EL-s+6|Z{^vdaZdGQ%t19{w3AsE+P
zCUgBU6Z4q+rjh|kRKRo*LT%71>bg7~09BWiti^0GX_x?qk0se>!Z9myJWJ7hg=yk4
zfWqKs-mlab`W+Zkl)|jSk{OX5zI8pQ8&wrYzWgBul;BQMWfD5DIHXRiw#e*@#u+Uj
zT?uB`H>vVZOo<1}7DiYs|ET`Kvjr+*HzXP2wZdTP4p#iJT^hOJ0%QVt1D80s>GB~x
z5AH?R&1MCe$VBGK`~)09oKj=d;$Nr;vk-tzA`;VJ34P||l;`kEmIX1G<&x)X>E)m9
z@Xm*x8^MS&YrO!hBIX{5o_9dtb)*xJK(%5C#!KO-BCI??hsuK?#k0z@>xr(pcsY2w
zJj9V(biYCvk6hb@z6-7(!hbIM7El%O3qyI1pJE1^EV&rwhHVO*Wp^GrMA^Z7+xb0(
zkodqCo?I^gAhvzPL3v$_6pQSN0tc%ZrpT!Cq&C7Mob>cNY5M!icpn+Jpsu#IR_<ri
z!{=pXy<A-3nRi}eeBKhI<ub@qua7TaUDUt7<6F9|9Em4`=Ac3QFlLjDs3bfTQ>0+i
zU@O4tMaxzKDN!SER_#y#F}w0H!RTsE&JF+;@MS5u<ui93nQHSreXn(OK_nE<r=uRh
zn@6Fc`whj_uB6qEn6!n|K6%HzeMEAdvB|4&MgE?=C?8WhK!pAD;oA3i@=~b*MW@oS
zW1d4Omy&R>n_994bt|kdh3ZwHZ6tKHq$L@>M>+P`#SkV|2HO}jNYBr=vfTRh`}bPF
zu`?bO7TCUw<-N<<CfpUqO<*4Q#f#C{G8-=Y8@-e4Nd#tOg#T!iWfU>sw+4VV^&E*^
z{;;|E_C2w<K<q<-EV^aJp}I$V#1eJF**A^fn+#sji}`#ya1g<ojLLx8#TTuP3e!)Q
zfeEGz<P2wL$4s{>nd?<}ooslE*BYyB8o-*2R!ufIouqhYPKV)%bF34lA_vWW?Qwo&
zn*l@&Sl@<VJ=Zcaw#t*@1%3`FOk+L>+L&!5OeN<jUp&Jf7atM!ZuPgnr6`${5BoMs
zr-F?Y(uYEd*KB3KQouM*;6i8vfeVl3z7~-kk?YdY4gZ#W8ZYsK5(OTFfeO_nVIOD%
z#ZrpzV8SX8gk_l7V%m_uXh^iu8QNLkuBMD##EdJ4iaW#JCV^?N^%Uw>Q}aab?O>W5
zDhA9_>xrRqc9g65m(bfYETBZ60}jHDgh7y<*t)_H4Ov;!P0Z}rZ{P&{3QCNlZE>vn
zVuOQhWdp0}siBS09|gY7*ibwXWHIu+h4vgxx(SmO+mO{jeKVMz&dPi6!Lv-D@TZ`Q
zApoSXAlc)@;cYTu!hAHhV#U^_%bFes3nl3;I=d4F!ZO;CmRQ((CaU(BMkrgJzxWGW
z(dx}e+1e~~vW6cHON2r}BMVHysAYyAscT$Zl5Ni6k>aP>3Ab6Og9W@jf^;y$GxCFR
zap5-0i0uH#@Ru*IQ3qaT))jB5W87+Xj=szB{Ta+e;4<QY+;AY)q_EvADK3M=e*yI-
zCNo%c(94fwP?@}h06`4M@#_$$Q6Nx;M->QuqFh=Vb_<_gE^0byfF_9$L=Mzx_J$-Q
zOex8g#iIgaOYteAf;b28BJg9m26*-b^@MCBWtAH5T><NW315DQ;VsY>TXRdfmCTwM
zH}`(fds<v-2+tln?)x!a`^`(9-F#}aMHFT2GiDFUIJn&q!cYVgGnUVFZ{rmJ`=2x7
zdtVW2il~)|gR6*@VdxLAwzOf0dqv##460pHVM9eVwZf=FNk~s&2vGn_#|YVHoxJj0
z;?_{F^wtw$m|Qk3GwxR2MFyx6Y6*GBog3Fnd9TjMwicQ-+EwNcR?PJ5f>Q!@3Fb=z
zRH9np_d(J46aNsTobWhFm5C|j+nGZIyu+~rYYCHQQ{c{!N7$3aDA0^dqtRT91uQ~e
zj13K<Mg)WuI3Q4d;GqC<Jx7d4-Ix$(e|z_c_;;q%PSCmnTF#n#LmDY`B(mdp^2i@6
z0{<D8G9TPr(B!-dh&MN{6b7tNV85mS1qYNW`fJ*+?<4u!9~@s=)ZQh<1`CMm$*$}A
zD-P3-ii&!|`Yz-nCn;(GQ#g)AM{_Pb{ood#N|8K<4wv`9ePh~h+30HDpLMH7e6Q3O
z7Bpe${Mbs9xr`&4&`R>Vs52lOV-ou!6Wc3-I)^!VQ8tuU3}&uJ4keqkhzUYBGqrTq
z=vV7>U|q(WRtzolShlpSomR;3DON%EmOiOCIrESzQf(=ksMbL%-qGU<nMBMOx+k}H
zhks}E2GwBpPQ1Hvy>VC~+R@g(nLZbDDN3weS{DJ2(0oX9AVYVPC(8!fD!VW7P-xn#
zf=FupS!IKp$oh=x#3M)Q!QykSWbgX3bpNV?54hKa8n~(BjYlzonnA_`#i?{$=4sX(
z*Xqygy%&Jf)Nm1}iuULkA9UKLy&OO<m3gx<#z!Yby~Du++Klrq12Lfr#R4;(vSk(S
zcp|Uaj&4<-S_|PO(!m~#t)ivd9TMV|dp$GG5T<0P__eP3`LzsmvqE=S?&6Dln8;sZ
zUqhgHUS9rJSng9_;00<!{*BBeL!mw09O_bikc&;#^p#;rCLgw~Fvl~J!&S<!;aR)1
z^06LDyf1~-Yi|;oBd*;4<3Yl9bTeKj_%S7!tlz1MV-bfV%q(#q$&Og4ku26EeTIq|
z$BZX<Fwrsf%kvhXWJ6YzGd|z;hsq#EmrWYPjyrA?F^yHoe{T*Y3`tP_gGy(`uKn0Y
zY^ky*KEf~>Z$$BdM+YT~#<|Wy12Vwdg33+1&8d`>lmX$QE-fv6s4?DY+_)xO9hB=V
zc)kt`l)q}7JVuO<s5Rk;rj5+b%1U`==jfboS$iS9)agxSWx<uJPZthgYf#TgYcTor
zRy&~i*I%;fjRghRCjfsPvx@!VOv#NPEw-SU`^xo4;xA<Pvzo4ew#aZ}>Lydf55icZ
z2=jQo-m6DBewJ}8ZY9g;Az_wyOOFRg10n<Jy#sqGsBC;Yi3vQC-%ReAxQ6vX)_yN$
z@82IPK5g-iag3+v#8yrE&bjq&=}@5Lay>Ju+4MdMdId{zGKnnPy9`Me@ez87{lp`J
zt%+gM49Y_0LtegmB`F&O+11j=Z9Jz`whD3mFCA4sIvjIhKriu#q;B<E97Ru1O6-FS
z0EGKq=-i)qYSJ7)Z59eI5jn~R4{k~NNsmATe_36B)+nGQq453jvnZ*~1EHvo`E#E>
zBDlnk9*$QHrTR7MFmkHrxN7=2akUluiD!Jz3f)Ty$hY87p7ENXXL0_`Gg4mY?HaZ2
zen;V&0cm?`+@PP}=Znuh?tWl4j#TGi=JS}nI#KAzR^Ofj2Fz&Z!v{epdE@HUhH&$8
z$eT@#{zE}-!77LM&e1X*(#>s9lZ{4qJRWyMWKEMdE@{{*?vx|`7`zzMqfGni<9#Hf
zW1MjTOjT9$-`?!bE?e0kNoa1{wQI41PAb?O#T2u;Z$g~@&3(Oa$&x+ELEfV)UZ!~L
zTx9r|pDYwSH|dnpsz0U^H6K2ViMG)9`S!uQu`3q!7iY4t^05UELUv&>niRNzzO*X*
zBTErC1qD@|sldujULVxViNPPhYwsJ(u(n3mz2!q8m44F1`q^9#AUt>WB1ru~BF2Q;
zBJJinFBUEj(xkLx)6-vShHJj)0zTK1o|jr`#es&(w&+Pk@`(08l*|N+O*?i}zpsno
zC*!1G@*^SYZcmrtstWfX-)&DuAXgs$>04R2YQ_ww@R2`1NBdi@vr|&snX+?|V*RnB
zW{kSt^Vqdc7Y$!O)*C*-yz=(z*Wh8fx$dv>bKQrY$}Ka?(Hx%Za;j>O*1bVfd#DUp
zJj1%J;?yU*?21GFwbTBm>gUgzSq6<~oE>dcmz;fj*X3w$x0%B|s;B3aeO(itI)cjN
z3Nz)4AN??4M8*14Y)p)+<@qwL^hXY-nrXC9xT<?~!$9Q+ntuKDn`tOreete@4yqz|
z|IC(G*+bKgP#-s9Z)b;!+a>zz?_}z<jKdOTPKO;`N8TX%PMI;oj|!Z?=QQKYIV?hD
zb7Bq9#|vDgu2K{`1=s%2fm>p>r=<34!FL_SrYJ}hSS+{X{<eDy{q8Ni@WkE^>w%Jw
zc{Vqh?P|n?+0=(uXYad?<=Qe1F126@Mb>Cz<MeIe4X^5o3FN#r?}RlwMs5-2fcdqM
zDFan!?`xcKPFXih0d;l_8!a=R{)}Fmn0dZ`$LAeK<Tm~ZL(}1xqi(0Wd3k|U>1&An
z6vPS{=fwStXmthy3)k;=?@mFa*sWi`HdJHeJ!4bTJ7-+#p;%bgqJvh?p7USdkFaoC
ziiw%by!ZN%3>Ae^B=BL)WHYldd=v-|G+gyCOQqhK-_;h@(;?{n%3(zLHW}S;ei=`g
z<;UF^u8`ZHvfka|;*oBTESRBS^x6pXn-l6GEI$C`YuRD|c8^^E-GTmPLnt)>mGI!+
zN0ydlt5F54bC@IRJmLSZ7S%$=@)LG=w?=MQ;5>Bj;7v!5YIF*^q3yMk4T3vRjZIqY
zC>sLc<`?MQy$F5;<PlYW@9ns*({B{4ta3tWNZ^d0y&dIfGZqM2srK#O-R^<Y{rh@&
z=#c?F$3qm5YWcYU;pxT2zM0ztwQaqsi-t41g5|hJLD{f5?Kb;;e@_6s92QHY#;pUX
z_>~v$&e&bwqL(p8aA_EHEE~qt08(1qeez_;Yui^d^3PgCg;^MeyWqDBY{8j<7+2dD
zF;pEAiI`Bca@Yzluh9|HmZ=py85J7pw{!E6ibfjyLPw}7XKMN>_P3d;>puUCgX2k7
zn=^aI_MwIp9#8~ZU&z}k&|KMnI6|u(>ccl+5ZPe)$!^S)uN&2k#VH+S^6K6__K5WZ
zPI>au+TZx-^=q0+a4mFG4CXJ(2*_#@Ib#d@JZTfYFwnZR>ghnV?X5@FJ!|9i_uYwC
zGEW6Q>|n2MJ9|^9BXt^xRseHdjt`1C%XjZ8QWf2O^eBLr0t{yhq_`4rj)*w&^gFD0
z5Sn&?XEdg+LGH62jIvdWLu^f;R5%_J)4JW}MwEf_c2=*Y|AxQ_HZL0#eQ}}AfC0+n
zzyzP~gJvxr)U&6<>;Q!6s_Mg5u8WD$q2K7Pqm#XO$n~E;*NOo&OQ_(qJE^EVPyM1A
zG!E8dsRJs47s-oMx$b@YW-cC;0d>27s>7<VLx&U@F{8X@@~Q*y9X-?4GxcitDkMW{
z)26#j{OB#Bs=x9}IPu;sD%xIDG>=*p;V&QkOy3sA?+rdZEavs+=>zv3c{_RSdnK}^
zRm&A$zkM@b8SrMz)5(~kt$UZB0}Ot;iT$c!>Wv!I@0iJ&x$`O~l$!fnJLRrW*Lv02
zZ_S9qR=*yoaFh0yu-Beda-X*;oHa9;Z~Nskhr)oLj^sn5-DU>%m>3)ODs{ED_oZxO
z8X_DF_UD~Eo9V&Hg39jHsZ+h06NiGe6%E^-oa5Skmn@CsRBYV1F}StzWM!Ywo_>nP
zC$~#5^vm>8&b02dq`d?@gsigK0UD1<jv!_SwmyvD$JW4>Oz1i~>f!PU3N5dW(rjnc
zd#`!q@B=rBzJLFIMYkoY;pn&HPvss>OX~`aZQH5Iv`FP?kt<A<USO5qq)8Lq!GoLb
zbI{mjV(aEMvUI{Hu>UVPzv?EF{#feJUi!J+D*RN}qU6nV-!Hhphz-6JeNGJ|ymW5%
z6c?el9?#yb+-Vw-d!QVLWq&jw^Ls|$q@>j2FK4xfL)ez<cAKt0*UJ$<4W!98sCcGe
zga=;5t`!dCO4G6tyJ`>R+*Y=KW%ieTZt}Hf##heur4e96V%(3Q`JK3aP5q4-^bk93
z?#k~^0FAVDjx9%I48@T8h&7uSz@rx6%x!Pc_rM1K!otE$TecXlSn*+hX>*@7Yh*`S
zcRjsj({eJdUw4>M?*XDY(B*`c@46$h5#=k3PTTZRwX>X_yKn$|tLhWmj=QGv<Uo;(
zSqaF}tgQ&vu&_4<KWFuL3(NgK!`!~C8G$5W2Z+;z;n|4Y>}>I@XiH7NI-k)@06}dB
zHr5xz(TNj5te|qY0}W2qu0uQqwuvd~DE@LjgRBt_4_DV2efPtGZ2LJ*%<3>QW&ZER
zizJ6AwHCd4om$q>IXvjog5LSM9dcDnUe(VyyRBsmjK^D*S=QFuWSa5Cz7J4B^=nRC
zjj)J9B|IQ<)Z>+Av7Kv9?70xR#DCT8IdrF#PTN&4SoGez(Al}6?y!{xonKXbfx!Gr
zTib}j^>qA;CywDC@z%QJ;oj>`QIR@TGIkzUKaFjw5jWtR*)R?JF9$+vR~>7w;kWZt
z$Hw{{n^^4)#e?s~jSdmhv?V3&ZK|fCl1_^Y)W4oZ(nljBo3*@(cC|UWz6LcX>f|&9
z232!}biT~7vB~-Sdc$FzH-2Ja6`o>Ld14f@wYVQ=U@0-4BY-F$g6jp<Xo6R@I=@cW
zwc*2u5A!DuJ==FH4~~A;{l#Y%w+~Yn&;o%77Fe?6VgF%;+)lrU6wuRCEi9Ul9^bxo
zg^R{kwkd)EEc?~s+Z`MTLkG7e6VuF!a4n9o`{vhAA0Z17Ca^_gBNADJ36Sik+Gkc)
zR|gs&6?l{pb<9Sul-06{!ygosvVqlN&JVnK-M9<meXb?07Jd7+<an?M^ClJqe2yD`
zdm%2^%r!g8J`3(92d-ehLB=^!5cdq7lFUCPCMEInh%fk&HvxWP4sOn*EzfGd%IQXY
z%HWFZ7>1l{#+f_+UhUx?TcgzPTFRLNvK(idbP6z{^1Obrx>EoXI^y@RX3ZMllOO=!
zsq6o8$IOHwv<5MAlLEkl>B(j8-P+QVWH8Hh1@kh*`vu{LO*A*e#>QU1SnrhF3`R3)
zpsgdH-~p)HvLV77Y@3deE4uHxW%uYG6sZXAWn+f02jx{&{*>~}A!jze#AV@w&_I%#
zL&ApqA60Jxj&<6G|I=nkMbw}YDs7U`N}IH6Q`4waghz{%s8m`Asi=&Khooqk_9UXs
zmXJqArlhE}kVGUYQoqk_-ueCh|2gJ(kD1;k&-4A>_qCkod7W1@LW2CKYnW^}z#qj=
zKgl2nD(q=2e{*ZvTvGy1AXn=&Wd6+fnFt4j0hWN#X{uoGeZW<KEO<0Y5mmf%TnUiv
z6b2n@Mh}Q+vs~%QF9mqRgv1svA4_zaxI?~ygm{xw?P6~)$Ec6prgB?@t-B*12LG)Q
z*!H-mCd$05YR~W|f0(sc6sB#T-=SY^-)z^d%c-+A8y<BoC@md(GRJ^(LVe)Be}C@b
zF$)q`XT|6JXt*99ANl3ujQV{4E#~@us!N8IcKK~_O4Om^DN?DE-rylaGI{9^X4-L}
zgOQgqCo4*qmZw=a9m`*y_&`X*g)coJnLgVG*vDaF-y<{p!GbhWSSJ-Qwg$Yw4#Ol<
zbwNL{V5!Zk9Iu@)KElJCnwf|VxWD1eW@b*w*Tl%D$c2PkAJe(>^2ftk3CAx?k3@0$
z%Kx`zseq3M`7ggTHj0{v#fNGmr>ry96nicZtG)u6nTi$E)xC{8S&psP$0me87)va=
z<eT}Xj-=<_ShG}^f{3;YtRp$aRAo23uOnUAsP(Vp7GAU+cFz6_cy6trD7*9?cMoZq
zs2@viZrq=1-%wYVMx~N;?+hv7K%}lirwn6`zKF<zvw?qgb5->+>Ci7Mw~lhx5#t}$
z98_Md`<n?GK}chACAPfRU0B|tj3+}nYN)!<=pXIKmnDhND_&UK$jc!s@v6FXLyWuO
zO^1K17N0KzL<hl5{_pXo<S(+>1Ug?K&?F}<GF*p4?|$kF_Fey)0H!ArAYL<bZq9gT
z6?8Fu>fKxSRU{EB%_7>59FSg7FV+l|VYYlMZu^;2r(Wd_O|e48;Z}OnA8w06_wqeg
z{vAz7-0H{Y7cKqgNmqOe(8vRH_28o8VksyxW|akZ06lAYZh=0v6>l+iMgWMIH;fX*
zvWlENwr}D^z&PPW-U&X}?bY2gXF9Y>S}5yAZ$IEM%dk5GU-F`$qilzP7K30QW|LwI
z=p*E43W;%a9gqJ?=!iO7JZS(q=8*I)EP9fkr?5MQVG2G?l5CRB!98df0o9L%hj<)B
zXC;xGd2EwzdWjz@wt**q4Lx*d7X+MP;8&>Pqyx6BoPt!6V@O?TJT&Z%M_St*1=N1=
zqSc%^U5G|VQ#X&ZJb9?fb8qGf&?fycW45L@fBo|1*q4OqZ2N?`ukw3dH~v@s_&KY_
z+OLt+zi@m}X*#spT<hoqd3DRcA;GtGhFl+Kp%?G-MAo-sF@6|zFMqYX{JYXq%%QX&
zt@!<bNp#PdQfYnI5ML=-mPIhQLsI|Hgkr|<?&?}D0yMl|!kVHO@)8YY0I?Bk-7s%8
zY(9Sf`KZzp-P(O|N4Pg#El;Y7G=WRv{UC<DF)$N3$tdRyhQ!)OtT+OC8p=fJf|8OT
zae08=h))wqWN27!_-W)`BbaCX`uXYC>!CZpckG~`kZd^q+yZ54e2kbu*YQ);BpwMz
zJz+w4p)uv!>-#UH|7iiPnP9;$&vzgg2kzhBw)WJRMZ%wL@nW%w0AGh6*kvF8=d%+Z
z6j%odpV$=)*wB$whsn_>mJ4%qz7^;1xZghT?4+}Wn5%T*2QtO;ZjjtH!`vf*Lk17d
zU_wTYxWg>iHp~1>bfOeUs{1qlx_mic@udL9o0;H6Uf+)4CO7L_7cO)ZK0RWFA@<g-
z9$>8H=jTM2*x$N!OIYxLC-$V_vD~?HJcCao2L-a#SpjYxY^scd@Ac!`FwrWo2yaev
zg7#h$(`X&X%hKdSrlu~FFj;{v7fcTPG$T38M6!!QnGhmQ5hLGzZ2uVj$&0c8#s$@v
z0uP6VIt%nM&b8QtOya&M_KAriyh9*NlS-#fNdG&3y4!3dY`#k0$~Jm>K6Q<&N>0S+
znk!02zu~lmN?a^`qsS9s5vz2Vtlk?7?ow}DUY>G*qCrf@Fl&gSBRVso|DJ$=?q9-e
zW>&kvYcG7B?|t6EZQ1?di0fW9zYyBu)uA2^i@M)%E43uw0d)}r)U>p6+MA-^PTF*z
z&J_9=B5B*gwyS_qGvNDNA05~@ru5>bDb^hm+rE0TDN1Gj_qQR<5uFaNrTX%-?&U1`
z)8G2s0?U(g3Hdh7{oa}fnTP`Hu(X$^rZ9b_q15ZXUk!a8LPCe<ynNv(K&5<7OfbV$
zbs<Fv8yAz>Q#J#`x)o7(BBs=&F!QxF6!(9}m)PEVbO0!&=!Gc~C!tispdm#-6F^d|
zfJsOA(+Pipnx|*vt}MJDvKuRY1X+e@k|@;-LfyNmAWG*H?&tq>T)$owCI3^*bx<`5
z4vt6MXZ-lyg3Ay?3cRv1_*lixoy*VniI|UB{MLp+ewKP-?h-hrkFKtqVQvidqd0Ry
zsVUZsfe(uI6E@nQMTgeU!mCy;+WWH--I18u_|ni&_Tb21LAD;`J2_m*Spd!ZieO8}
zB6{FDJUFpfkt0Y@(1=mCT9j`hN1y-%{!i<&qjtA@Ry+2d)-iaijn6_V-aJc6Qg-$R
z8xaWxNP}^v)ZZ_7e|fJa8gJZr+47|^zNz5gRbW%}5SF^CLZC`%0}u{4+Mk)w1C+*&
zSU=jrkNxF16xUiYamV{7Vedj2sih=2U_I@GOm;TQ4^N=bVomDAOwl``U!mFp>ut-H
zWERO%)}=-aU!$t2xiYkV|Ni|Mpt3ZgLC6I_67_K&<!)dc@JbRCbZkiaHA8rXr1n80
zxkRXk7OPmvzk2dun>N2oZ_ps+k?URnL$nug&*8&<Xes$Ke6!v-a2Yb5JTgNL^=?@q
zy%%`T5*0c5jNE-jW__N|0ahX@T}q7p{?#roHz_e8VdABQ&p+DH53Xn44ggkO4wZ+H
z0BsLpIfp;Ngoc}n=Wn`39_7HWtKo|En{IDy_~8CLZpP_rH;2}DK3cHy?UPH_p43Hr
z(2iDr$DD6M;oZCA7@1HSf7E%=<%^27)lJg)%zrv`e-v@bPxX1MlB%D@#58M5MZNl7
zm!!$YPxVdX0UiL$OSBJdNcLWk8kt`qmf=y=09MHz4@#pn57s{W1oN8#15%;2?aj17
zHGD+gqi=baDCz^wJHkUTe}?{B5VZ)n;2cHKaO2Xd(i!wr6!~Z*UZdO_4P`QQtZ>pr
z&!i2c1Sgci6zo3i0h;Z0Jw>}BW_I8f1#H3aRP0}WDTWPQo~S9qVa6XfjvXMGubEZI
zxbIATqo=CKe&E(IcVuSNHKCm)5!a@VXD&~8BMT~)?eLUVI<HDMzs!VH5VJ0<;c{NA
zu7+VBbD@EF47bI8;@Q=Tu048)rN`X9HxSFFwAq)J-Pwm4(>+WxbXwoqx;h_3vNz_$
z*~*6Wc1EO)8h#gYV}|;Xf0qH3_Fnk9h`~ih6-hL^@u*{hp)JiS@Zu7SL?}U$9(6vx
z%yl3XtF8=OJ?4-4abuk&<HlZ`-EUaF%r4Mb;*!h7<T=ZePjRlq)uJcm{zU2|iVx6x
z^T7$x{gGm}MHBcxzz!LLXePk{p{6LGmdJF?hI}*!XR!BL@6s0;K9Qc6v*BQik*V|T
z4(;066Ljbb7+jfvG@TNa1toiN5`e%B+Io?>k!8Xw{=$`WsC?;IsC`FX$ahY(zR~jZ
zW+x_4*MC}$dX26z%dUAY0|bCkD4a~Uu613#Ivw@RdelE(zJ5(-gc@Y7$I<CSLZ(qy
zwBz{#LWwOIK2z2Ziv-?5(=f1oI%vwslg_s{c{hJu%gtI`8g}lB;=!!l1uJ`YSGn#9
z_&4lAnlq^5<@ryFi*tr-7_UZBqWeUmp}N<KszUns#5KrZ%>b=U>kRx3g=hxIXBaPN
z6M0C@I=Yjxq^qT(^a`1#Xiuzt`48v)G8tLRO~x(TxL{%Vg0BK6;0#ae+g0f5pwqkG
zI5Mp-^uasdFzO_<L$QgDnjHv;V12Y<LaI1*ZERuXToNN8rS`iwkKU1SyVzxO0vvFL
z6cStq99D^B(#LYwh(^3wHf=e(`zCXe2vqWNaD5ARyE$SvkhHDQNO<ZApKhow!COGE
zp{3l1j8BZ(zyI(d;>+E<Ixh?Hb6^d-NAJf_AN8XGTc(lii**S(s;2DNo)B9=3bY)c
zVL9l?;_an?JVJXbETOX~vt}YtSW*Haqk|S_y{D%iUFXc1x#81Q)tzS?GXAym!<+HF
z?i-q}v2>QCyhAfbRI^LjA&lyU8>c{PNN7inv`@BkEA)v4<7RR%mee5nOo|%PEYR|y
zW5-<M?ipqn+wHZ2Flv9MZ`Yiw)%1&^{HKL6GM#_6Fj(u`cO#gZsmOo@RE1{s4bK%^
z2{?Env6B|ua9PP@eYM7I^}7y@o_}r^f`b_N?v9_smlgTXeYn}Ax`Ns!X-7-b?bYMy
zzfs?@>0=W2MN}pTVZa4~pFKJ3-vk5V@o6Vnd}P(yK}}6f;_!)Dj6wZjAQfPJ2azcO
z@NHaBYfPtrNyTKaP}p>__?Z4}iOQ)4qJwB-P}*}Hd6~mk`_$HjKgnnppzuCVdgBez
z?m~6AgC<jklMK5%(oS=DfZT|b+KJ=owFRni>QuWiQEurKs~?)JRI*sL?dUTpTs(14
zmulJO+~1|8oh~gjKdF*S5e&@@Xp7FC0lY<bzDC}a&2DJW^|RRi*QeRpH&*6ZUpq9}
z%E}(04h|a{uXNv<yE;h>X3dC<b`6oP((%Kn5T@?flb^Kk0^^b6-|k&=&YcomCawSa
zwR}b(7#50}XR9|wvB!}NYnZs4h01hj{Ae(K<WRQTjku*x<6XLl7KY;Hf_fF`J#r_>
z#%jEW9|RY#N_aN#_vnAB8nA1aLi7iJh{I%8ZqE7dU?m-S0*XpXS_|K87({9Hc0=-G
z`{B{~djKk^oayI$0LI1nUE?=9>|Du3VkZGXRKcM29hl1iW9spw1~B@@(f3|RL{K{d
zD<>zqE}p&S`~XvrpR8I=1p&h_|0)t>AnpA3Ewa_dh2F2i9>_X}{rRUaJv|;M!mp?#
zCWRT|+J=UxzsE6$Pf9;?EVJ^{r*(C^dw1*R57{IT{-h*L>IlSqzJN61u}D-+RNA5u
zhuJ+58{78Ax|hP28|1kY9Scwo_WR66l!<O8)mR0!0SBxtT?1PXD6<z089XKo$kUx^
zfPrH-)xFLndiQQ<Uw|DI)7ZkC7a#|6I+!LNe$ahU<Wasx2F(Vmc9M4f8b`}2np>%L
z2fPFYthZpnE8&L-;&;_4rz>AB_R5umgELoGwnVI(k{C^rc8|Ymn9y72paLJ8Zi*M`
zxoOk0>n#vIRBmk{i6(QB;aZgl_gDd_BL=0fp|%B_%A^vjV+%$Zl^O8Et2(dNNM>LD
z>Xj~h8sn_vJ~k2$aa#mi1S1zV#lE)@(&H=Oh5xkS(Z2<YN7feuBMfVlp|B%rx8-R<
zXsYnJX!IxxOKxvbM|LQP764AXLvxO74j)jz$qpuy&}uZP@OCdu-8CWq-X!8`VsEs~
z0!#i#`S<)ON1a(`UH87A&yVVLJ14;#!7m#e8MaEl`~Kwo4eQnoB|R3fxsaC>;Gi2a
zjXQ=;t_!J%l2_0uFc9iZE9@U^YQGB)-Bp|K-Je9El)Yfy+)&j)rZc-I(up}Tn`ysj
z)55Ufx_(c)ACWHAwC?wN#iP_zxiAg;yEX3?fHfm$miOwa(lfbd@ie4{-1yf|GCzL!
z;7idqsx#kt#YWeXA4w)s*=o72UZ2O@x_R?JvmLsdte)nq1t{@)lkIwU@1HYHQ$VF6
zdu8A!$;0Hxir~P&U1!f$T)(hUy6je5oALbFCE>p_gmBno)+Gnc4xpn*^k6AF3m{(j
z#Ub&qLZuAl%1i1@1p>QeJ!*op&RiWjA&8pm`A1AlD~8epi|rH0OOOyzvqVPPpV_jd
z?I2TgI~4n3j(|GX%@}7VEhuNlhfc$ao;};g{GuW)QN*Lsw;P@;LEZf3ub1XD(iH7U
z8{R2HA5tm8a)|E4Q4q7cH7SV%dIe<b-tgh~z|0lYz3%&mj`#kiicm&DP9Vuio2s>m
zQ9#3jC8YX6&u6Q0SJsUhl9PzQeDjQhFK=xgN>gLl;S5Eya1!K@ml6NLI9Q!z<>`6-
zaYI{2M@P}?0(-|^zpfz;7VI108X!U(L^J{?s22t9$9(pB#0H`r!u-q!6kB|05LW>3
zl{4~JUB;7GVp+Y%LGuS1#w=hF(FR#wTKI;t)pzcIw{6xY3abH1TDOITLO&uXB3KT>
zB_BzE8<fkt!B<5l5Vl2ajz59(6Y41zHRycZ#+RdThmH`do1s)xmHolz*-W7=Y|tRR
zZ9nRAP*xB%R=T?0qt6j5)%hdzxW8a8{C@7Hz+@2Oihx6=5ROXJ4^dxK(#e0Y!W*i;
ziISG5o4m=6as_`6ZQOAL77651M`9#$bfMB_JUHvB=rvZ0P7O#*KnH*yyQo3jMug##
zigt;yA{>EzsVQp~kI*{P`S^lCnOua~$3~@XOl<k_<0`{Dg5(4?gKdD)5e~%g&HX{%
z*LK|XJj+#kk>sePva6-xu>2YQ{k-@4f&?>bl-Lv%s`|OqVVv=tG}-rDrtSzW$d_nF
zI!!zMXjSz(xTPhJrs{_TZ953iPp^7tTHp6oRT)Gi$i*QYO#5|JVv^jjhNTSa`%TGQ
zF$}KnY=gzFva|=qPRf@LI6ZJWrc^$0{F-|!hJ~FTV`k<D3aT|aaXN+q43O=DD-|s)
z!%QOL9wWzTYwtrFL&?<#ngX1RK0s?^;9#)z5$2CSJ`FK+me_p!P-%QpX3f1hHg|v(
z*$+{uA5)n=`4xTo^=l9Pp#Ea&^5wmW;Ag!%zho}#($68B65bzuYMEpm5{9&wFJFFe
z6g(zIJe{cv;UQ81W9#09E%&80vo}tqKjN-?F?Gakl#jCw5+N0ggx5WEJxGDDnHAPg
zYey_O?7YZgD+M&R&X3sl`sUK*mp51dN+TA-J~jSyx#R6>4Ube07o&S<Fbd#XXr5mq
zwr1SLpE??ADxMKpOypDXCqu~6VhNUpdtLV`t*xk&v530DAa%sRv1gO7acl%+Bl{Z9
zULJ?)3RF&LHC{i<r$wwC`s?%15u<kk&EoXj+k0GZTK=~t5|Aqu?m7?!e+yX*OeTiz
zUQ7q!kz_3CzmUrYOcuK%-&lp_iO+-p@l{!w4`~2hu9D~&nX<{Ucqwq{Z&~95aF!6b
zG+$@K+Wiiipnw!ms1Cr_^dJhvQc|cMgO+FE&)gYV39t6za*5(*>(;H9BBncUIp~^2
zcl8w0<f0c>Ab_aZ+_OB^uTKR*5Vn642WYS?qFrjJ(uh#$#Qv&^jPvXxQIY<cJ?ahk
zBDag;-(5Z%%>yQ$zwzKoBGeE$_b}BYtk>3-P2vhq)DOWzZL3Vn-_%EN+o2O%HdzLk
zwr_%kZr{pv&n%Zpk3QJ^=l8VkYb1Eqx*6`uub8u~<o7E@9{uKh_x|bUTLBNhCafGq
zq4}C-HXM{B4L+C^yJ=jy)r;pLpLT!W=x9Js<ND&Z7Q%7iLdzi(a4c1cK$ajknPj3C
znha$o{KOHjY?UefEYF{LrD#d7ZP7uPOD>RntsDlzonx6JL*<QTEx<}&K03d*sn^J*
z6x<~W8>wfg<9m-Zb>W{7z)!`}=K8_$PgdlEVLU~2OLa?H5QehM8U>HOHEGb16^QLc
zHA+lDx)2B+OD|2T(govZeNP%WP_*b^KQS=%gr_NFcg_InMSfk;XTmO|9}@;0Nk^FV
z&sHzfoEI;Sn?Lacb(ei`)j{pZp0sW%8SB%apk>2m4l`ES&C$(3NP|XiA`NU)b8~Xn
zFadt!a>Fn+$4Z_O9b?z_!cxrDwO#HtFtWFjpU57;ZLqJ`H#T0s;A-ZNPh7#m<5G>D
zJ%f;H$K1KoixhhG#*G^{tFylrf(-Drrzb>n=F+&fEK^5qE__1i`mzjPYW41&e?+%e
zRlN=_qtNSpaMA+=WYSeLA((RWh1Z)-LeCFN2nn*s=G<GiYe;wo7u{w~1I}W}M@auI
zdSl0Km*sR)$q?SvgDXJj{fYMs;LQ1JzRXyK@g*F_{eYie(<HJ1D_}~jPn!L%xdZ&T
zB482VjmUPL8tQv@?;gODL7%tyhYC*#B?bM<lC@d1!D1iovpN*+QxY5tn<*mem?uf)
zmI4HVlKUgu`P!dx8a^1%*CY%hX@P;`ZSQ)2znY(18IOT@7jW_YY4`Y|BF~WVX(1Fg
ziva|;a`XbmJQVk;CqwA_#kE3I<M`0p5Hl<C2K70BH|*cAK1C<u<{F2pn#}9dtkCgf
zmpR$4vBUpWgdF}C(DFFb+*s~ob=0}PnR|(am)YZ@RI|99Z+~XrxW8>(-tJBDkGs=J
zQjP*m?xw#Hv}#gP_>#L~N)bJxP!NfMDS-(gw0|A$u25JXac+iXA5+tFvX6z1JeT>C
zmJE<eIUV0Jhwd2GKkNLJD;29Pt}Kf0z&&`=K@V=v2bq|xDH%Apbu9(^3r^|u56uyA
z6a$l9BwjMMz$`Xz-UC&7H8q%aXBh*wDE1Tx&7^b=!mC8-g+>RXm-QkYU@ca1N{2P*
zq7xP5OhUqujwMVtuVAV;qs`PdK62v1FhiEKp^K;wKOSM8rW@`q=N~%;H#X)xOLC9R
zw7LI8wxAiG>-%Cup7s0iR-dn(Ee-BBmN4PU43&A!rWJ9TAa7YDf`Y=r-9!nBBOeD%
znjT)dM~TfNkA~3-$J?!6zU5svQD}(#j@kz$8jT%o8HK%tTb2-@!n9U|KetZowT^2D
z7<c{Y0iXUp+2Bblh=c+>=fC4Ri+EmMe!qICfk6ktZuk5g&@G089)?#`6D}YxdH(9a
zg?5Wnu>(z*%Ef06$?ne9ZUDH#+*E{L)cpIK=|7Xaznf6;gAF2$3K%``%Ghf%Lp?uR
z%YG82xS$6zfdhS_V{|aimzf<&JXSp}Y0JRh>n}by*L8npN#9?44-XnX#6dFVG7WBK
zU!NSEqQl1~95)`?-y&qKB<5|cZQrg+1R&LJ-P%*T@qCbyiUdX4oD7!6TuvQBbiJ6Y
zC+gkGY$5$zu9huWWWSg$^>af*CU(T)zvM5ilyL{xZ~>f5Efq1uVG5~^VNZd6a3Z*~
zBPD0vTJ||IBg-&)^=!lLbGG@sIqSpqWfyvQ)!>MA<_kx>_HJCdYCMSTn<wX^hg#{Y
z!GW99#8ej!FRbRHrZIlDWT;1C(2bSe%=O~D=a0&ggD>(FA*y@PI*3;B+1IQQ8_9?n
zE_d!33MrqeWwvLy#HY%0jP`q7q<llA3Z?`34kr1es09-9Z>~%t{ysBuuP)tipFft)
z1<5~s+F4eu0CrtmtaNE%N1q9Dv{`PY_j!uE)6~$o2BvUWMMEutRDs09G><QyNyoyz
zMa|^sbC@(eA}NSHH*S>o`g4HsA8)cK6<Qgz*)J%}#JrOb-_t*oaYPsHmX^zVgiJG&
zyi3^|V-jEghrPY7aAp-U(`EA3BMb14NXyAtm3BBHYAvm&I31!m1N#HCb%6Ub{X9_!
z6&tDbfOA+VIe;#iNth0h<*{Qd`ESIoXw;frkk+vPtvg55!)C>O@39Lk6}37#yL2_t
zc5X!%Km8^9m`>;a+uX<tHs{2cEZPAo&x_|au%dO4^+s@(Yu7f&;sg^ypCIRDQa7G-
zGGnh2?~khAcUoUdszerkKCWPelL(1Fnp1w63x8LrybAM|d*(CuaOqM)L{!c-LlEq_
ztj~RwbFXD@-yXKli?H;cM3|p~{6YK~MN4#5?OBrW6a=<$*(Og<+u$DHSFynUsj~L=
zBBHG+F^LxnxJ5??4>wnD+qb2qCF4m<q!AbcqN+twCG=`^*U5|8f$gP=MgYJYs;>{c
z8N(G(b#HbQ8J=o9ruzAda13Esx>YPwUM_}~QASo=Gg@?XGHtyeyHITkMJ3Hp^yA?^
zdD6Qw8|fm;Y+LIV&zrMVO^#K@gWjw>v73W;+e#|!*fu)9!oaMDO2B9($(Yfj-5u3e
zCv25j&!``+)TzVnOm|ya9bretU_CgnkmHilNhMR)UI3M4<q)HQa$eU;LK2*}+~*dt
zAl{dv5iMK{pgPWAu;N*NO@B?~7b-$1st%l1AaL0;tIyyo;87E=6lIXU;p^uQiZx%#
zVZhi)Tywh5DVg?QR6-)hOW>p<LCSvjd3rC$i-`q~9{D2Pe+`>UCj3@+aHs2V+ttcC
z%I(@9uDURDgY@+v!V8EBXc^lunom6}ezs$vX?>-cn{?6qS&qR|OdS<2UA%aY5Q_G}
ztWPt2ZZGIvGl^E(Ic-hNRK(plChP-q63{FdBcr!ZX^sKt?Bg`_wA{|TtQWWiQi}M`
z$JH&5k0Y=H8c)JcL|h=@F99MabS47$1o;+Jd*i2<Qz7R?C&X18ox96qyR`b#r?a~4
z^_(oNi<EClR4rRWL-}~ZnS-WOI{LR3Wl(uYGI==;B(|>?-LQJm?Y<wOCGEA)?xCMD
z>cQvh%tsK$gQ$?ioVsYV5Ov(Al~!hcHqc7{s^NYbFbhxtngc72F-RX%a92@>3>+T@
zF2{zj6DTYYLJ5043yWDZ*OdOJ1+V~e@2sGpK)^+b_!^u~pw(U$I*ez>;k2-*$4-W=
zTcr#95E?omhmY5M8nqU|DcH7HbR-%+O-)6hB2Xopb7C7IOUruDhKkh{!1u}5?x-Va
zNDg`iU@3Zib{<sqXVH~F{`iqdA$AcBH+_21iMj`vZFg(yc7m;^s0u`!<$~jwLR-1I
zuBEcNdNOiYu;`=-aiO7w*Ow1|AoJcjYe0C;wU0qZrE6hRz@2%~WBN1$<WhJaPp~hC
zixb?T+#E>=4817rg^3S{tILn@my-d>Oq-5_+2}D}<%=X+OE}VyMPuQG|G2U&6Mlwi
zmtY>J7~y83#Sdm}*^ckdNPO~EZE>G3Ek@Tg+T1|T$$^;a7bJOLbHd(WQ6L@(oOn5V
zH0hpE{ok7J6ZE>y!I2ft60$Qh)v)WLXh0Gx;0_eE|0MFt8JV(s;*8)^2qFX^M9>$~
zI^GKia=>rPz10w!OhE|17NUJ@0p@e0!J{GBf>aVO@v>cVo4r=P;yi<M7Y<VYP*Jf*
z#9|SZXb(j53z%n5S%E{{6gUW)9QV4{lNtDA3R;1XlLZreroQWea{{?Vc7R&qw^7Fi
zmuvY$f{2Siaf*s97?NJ-Y3+?q+`OqRwgph#iwko2u!bNGZL&6O+I0WVelz9d#7I9g
zM1)5Ugc^MSrXr%@pfE=la-TWQKo~hpnUFko<S_`nEO-#A9CuXeqRjzTqbulsaMiMy
zrxetr<3g{5yUtA?lW*N(!n3p7tk>HYTi=v@v?e*EBRo4u_S7m~&eX+4=czOLcBSC<
zSguLYB`9fryisO!&`<`wXnm*P9~bm-9onbEYBLq3?)22}D=U4F@-PSu7^8P&Mgb!i
zR883U2z1F~;j$<9zBnfWA&*(<cgVe9qwk@z0j8mbPB7p12sZ@h+CAfc$RlNZa@I;w
z+P=?jshPXOj@NZwcFBdw$`bU<gn@}jQ5oC;*O<tF8*rLi-484}4^g<HY$m3bG1M(5
zC%T(oEh)eDa12F%ij@bVZ-LcO^ow&aXWaL8$T7f|`E}$wye45~__L)M6Hbdu73sz*
z5bJe#N~FPLA%#!!sOXGLf0%0zFb09wkfY#}ZcHyOO16Iyz*DM4#vsI)sJ2qYxOw}B
zlqw`_3eZCUTB6Xju<*9Y`i%Y#DMSdZ08Y|^O8(=rolq0d1&QUPVy7&Gh6or`nnd>>
zZJE@5WAf*$v_D{n6ZQiH?m>lm%}dtQw3WypFDI_NI=kWjSTpLVb6G{j$Uf5JPG#W>
zq{UKgP2tmJWpyv;<C?_LW5&GhKmNNC;ts_Cht!JtbCZU(r_~%Wy`aC}p&X=I3|0!y
zFT6oUeoF?@0oyn^chs6Hv!F~385-Qv#Oiy4W^P|Vto~f>GU-ZDp^LBKC=ji0Se>*1
zaA&n+G;zUus#GD9k_9f*-k|o>p9`oDMK*&`Cvn;?Z8Ff;Zv%JkXKgSE$0G0`s!LdR
zJOMJ+R$#AyAAn9-Ss6IyeK_{N3k%V^DzrlEuaNuF+!2|IS8uc=F^YY9)E0sb2pi}U
zjF<+rK@eylxYWMfU7^zSkzpGx!Wp+B_RmG&_^NR7r)>l|A%+yjcF!;-z)rxBOAha_
zO|p6_fat<ORN)5z9~c@!usj*~ZPUl6@(8NLn8d%Rs>rnN?!xq64BqdbI1z9_IQbA_
z!2gIDI$t+~5QzqrNd3~gz7sK-r=RR0cid^t-7k-33aJb@1w6YvQ_=lyDYa|Eu@$Az
zz=2A5mA;yxd+?u^FApM-LNW#KD&S9ALetMTrA_r%eQC(NA6TEK<_el&L4{N3;}Gs{
z{?a4ww#=|odRNQ)mZ3F@T6t!p**P#*Vt{;nOOAJXr{IYXmnRM|?PxL3^e(Wf&`p5K
zCT*!V5bOUY@9!S`Y+>K7LYgd=DKXB1`cw$fkQvT%>_uXOJ3|;Z2^s`Z9Aiyg#zl95
zE*3<;`CbD)JKc8Z)^RkDd|31tlL(&TcRqjKF~C8v2ej@NCT`~q^tA9&<v8?g9EC)L
zZ=6cEDf#llWd<ep3oNGRrHltprW4vDSj)$&A;dv}kpPr_XdFv^|MAgfe2;oBWO%It
zBQVPt#Bq?1Vx*|MsweOo6@47bp6NJGUb@r?QVHCI$cfrYaAl2SeuuFIi<=l*K;BiR
zO^vR57lB#^HB`f=XnpmEvST?UELB>(Q4hrY5kn(LAt~=)7DPxp?im<v7M5mB9goxj
z)7IC7)GThj;3OCx%w4@c_i))LHAY4ZLMIx8=Dl4tCy><mcEk5$y*J+topD-y$1j6P
zx~i%4G9UnsZdt9fBxkS7B4Mi`(tsV67)U}r#4Zcx^JiU;(5!)_0<yb!zjWI1dQo#m
z_Ha{|){a!w(%^td&pBdZDud7GgtXKNB+eiAxUf!Eir;I7b&=Rei@)butRGd>tvBr%
z2iu3aJdv9OgavDVtXK4*z4nW`i;@VJKOlaASPSL_9L{uyAwzwi23;6iO~`lu@y%V7
zeJj_ljjD@Ny0Cd_p$~lgF6mO!P#&3{y2~Kb3kxq)F0)pQdvJKW<iKSuKb7UCp`i08
zF(eGIH4MOSRNmb3ZU+5xnKV~s{VHd1C<n-kKUGN3kuP@?Ig_m)lcx3kbF_IE%lnT=
zs)BC-zPryzkuVw&b;g4@YxLhgX=XB0TArt1G+2fT$ebBe7tdd`b!Qe$fR-}MwYh1e
z=cl@&&td6wA@Ym*9wGG@{eD`uwHP@BY;mY6)b*2@)Hka;ON>-Jddb?!p&=GX3jLc<
z$_eEeD4ft~9lkTi)eYqfrGIPYhQ|mA@d2pKTl{^+1KF{iyeF2S5yrCmO<g?Le=K7~
z8)!d)5SeBdmr`{_k25kE<>QiQWj!aK@XDc`{0yDS7v<t9cC8hQQlz!_@sUSG1e!K`
z+j4{XIWjZnO^;4RB|8#lGmM3JPFNtaibd3nb9OeVgBetOu~5{S&PWohv(O_TMZ3D;
z-HhA+MxJ8{jy96=!mJMqdKU&4UYC?Zx3mJpsy$A~pen7x+l<i2?&~yazQmw*?YTt{
zmWhUaDO~}j*dAQHgdiT0mqz*Y_F0?rWMI`)j(~Y^YyhaUD=o6`73+s9SqvN*84-b(
z`EI1m3{IWI+mN#4ORrr|gBD(x%;IS3X>6aih=7haRm5XQ8{_&)Q@R$~{iDvSqKr65
z_ojf-%L&Pi!GX)&{Je9$?GHZ56W4G)C~&RFm%#pDt6nGrnTa{8F*o92gw!zi_4|TY
z=)?~AH`TE$T<zDFIlrvbH%p|Ke5G;#OF?+HtiU*me)YoDX@QrfowgAl1n5g1sn7)7
z@P;*qCg`3R9#El0G(UB!FPs8EX8yO0IjfC5oTSBcb=n%dMoU+|^#*5&IO|%u*+V;e
z=IP$s(E5f=mJ)76hSeFnwcSx?VTyl|vo^(78d^LR{27|dhyER;ntIT`%@U8)Cu`j6
zy>1V5X`^mZs2L!wH=Hh@9=u(%QhrvY_&P75D$4tICwsM5$zkcLrkJw}235}NEe=!L
zGxieV?Ij;e#khSPE-A*zyz{L8l`2hV2ga*Gl|uN#fYJPC`JN`GusNbUmYaZ^2@J+x
z%SS1!?B!<Zy8nUfp@4vZ-LnYFfN3M%G~dvcnU#)W=8^c=icjc`&<~j=XOY~$OXtpy
zTEbOL3~o+QHR<^;b}<A&1W0fD3}Zn%Q5+3FeK-93smD1NrRNGO*W=Pv^WrG$G2b~?
z1P;lSb^UDSxTrg_)0mhxR2GTjdYSCG_@dO>Sps#zJ7z%W{r|oquyqf=s1Mf5Mb49y
zssD$T%wjO%b(=$oY@%#-tvN51Qa2}Utm<xi)E^iGNB;(e3PlciT&LIq#XHNoa3)*t
zU1#Ye^+BdpA3iAKIT(MNi9ppJuz)2WpK<MO_Extvyk@q3a#2T1W5Y@Q#8*{I$up9Y
zP)^8+2_`{A-Z^-#gXvJaq|}B+5mC8{%94j_Jb12m!|LjfA6J$>Ju6E}V>PuL$r2@7
z=UZ_Ahk8zKLYYw|*NQ`T*S@XqWJaFpSd4VF_SVam&H|pgzWCCmIeS!Bq!I0eN{Ued
zx^yH`bDEaL7F8vG+M+(h6ojG{#UO+N+#Z$c{w80KY>l~$Hx4PZYQw-fldG(wvJP+%
zq{-{oqHg2sYfli9kmB(AI%uLzP}KUZp-4&VA;*UO&e3YuPKagUnL_m_#`~GUb<)Xw
z@Zfca$N9%bn^yX%)>rq}mdSE+Gq_2q*9>XHL_z!aR;foI3{+evzA&EMQt6zNN=Kv{
z-QC5#Vhhcpi~QKYPT{^%5r^XfB(K$1X#IO~){>BgQcW`;4dbw2_7zXxzRtAEzFF60
z7q{#!7U*@?y-pkY&pIRC#?yd-gCkJ1k0ZoW)Du~qC0YSK{0MpAG`)Krte0C444<y3
zwSS<!<IUNBH~1bZSR;9q+J}OuSk&)4E=pRV!8*2Jyz8|!64tsM8^2SBK7ndS*yK1n
zKOENjD)}YsrF_MAxg=fnK=!e+@}#_4>rVm04HF6F%}K*qmUk!r3u&aog)yksUQQ%Z
zpbLW^w)H2TT0M~MU_G9ogKdva=3G(&pZhg3%lfpDlAcE{&%;?#5zPd#;P5dS2O~1Q
zyOZP^XR8w6Aj+|iA7##xN5731=ci_#fV`~Y3}+T8S<1q%NT;V+U;7)@v<I*vX`FJ1
z%Ve3Y38g8BmewhIrXa=qF%Cr51F}a+W!C6_@Pg#<AfSGQm*?3DXF36UEY1xlAqW%{
z^&6q*Cz~AQ5l|V>e@VXlbQ&IB32heOz<BE@*F0-&O%VypkDrrWoDv&XC*_|9*MBs%
z%)TKTFOE*fJf5;^)qGt)>vU+{*2xpd_DT4qz?D%QXKgRZwlBBcev{=C$Hs2!&I3ov
zCHzIY#*bjjhryBUw4PdLOyUnqNB<IbFzDFDGz!Ej&A*FhQ2K|chZ?efR+>?rpKK*_
zRC~mb(Rfm;={>2F?e-oxa32$@)Hijk!%kKjqHglJxztN4rOuclCYAdaKUW_(bEZV6
zHCFr&KB@fwJ#(qAl;%SWq+4q)9jf^650YIX@lPbhfhR5B@%>#=V^m0$(SdHqa0Im5
zy9jn2qQb_xbB~ku26~Ci-D(D$4El2R(BsV|^qb7-3^rwwMqG>P@CPzX&u{*wZh7CT
zNYOH&2Z1tL9JKg>W5@PF3`N^Uxd!*9>Cqgrv-e=rdMJ0Hcd<Phb)@SKr`MEW;^V8$
z|Lztw+vSa^UOBxMJ!4r_RmV763!?d$$9uBgP?SQKsiLrAV)X3UJ(AX-G{T;<T~l*r
zqOE1%WKpIZmF_g6UE6+^U|~{qmqlT3SWJ5#B&mR5cj`ym(oIg1a@#Z+nm7PSa+&bX
zXxcFX*wyc8=V;--fUY)iT(=c$T<#)_+W7NY0VbW{5~Ir?T$D<{9{fG9wRntE+iR6R
z%P?l!Fg>T3_gg0_9uYI!CXHxykhXU~1&1tS*r%}~^ig&p@Hy%%9Hl8#=Swq3OJ3YK
zBOdc4bRReabrg7((7zytb#mJ+<qNl>)ZGQtdv1Z5F3;BQ>8=HX+RKSDKWML{Lfzz9
zwn_ZIrUTD;cmjb7Z~z$cS5h$bx77P%ock#u{9%9J;9)a;WXUdO+R2S5PyC)9s2nq$
zaGQ(ShI}e=J67G@{U%=D2zx@Tc*kq3)3jTYWoyy(RNMye6Zh|*of&jrQtIYpT-cRT
zfpzUXP1H>&=|T<Sig#gi{bJ~19?<iX%qtX<Q07a7X%;}*(o_FsE;>xfBV9v1Cku_g
z`@UULzWdI<-{KGI{o~ueE~1s<ay|Lbf=9W>_K|DStWjsOR>NbFl+sK?c3qy(&!?2N
zJuLn1dCktR<(JC?nOS>Nuu#yrwlr{qoF|m4<X*Eo`|!uh${!D(y_7u9c0BN7%f*!7
zhI((8pFi$5=Rf;e==>sQWQ)b)ooAahmzZ4A$~0C9J8c~KtKiOx^6)3JQ-6N)I}{gm
z`xil3l`m!J3YaKpBh145aCErvuD_CAeIa%se#o_iKq{Rfk+%C^hENkU#)Df5FdK`4
zanT2HZ_C?X=YKY2V0a}xX2>+F?aG#}ynyK0>dKNfa~{a>=D@fovdcbO<q5&vu0~|1
zm?q?THx(^xKAz)Yq8(ha(=a#P-guXD@$_GAw^Q;aN9t@bniA{&G#P>Uh3DZ93m$Ds
z{AqC@>q|uB!{=Y>vwz8cNmkveysD8J|MklEP3pyx+uNnT9(o!je*P7=xuRh05f|;$
z=nG4azTJM~UFiWw7xy+wPIao*>PHW$S}GZ@t(@Wz@YOR)rXg88W8jeL>;AJ>4g7nc
zhH_uk#Yg=v^&O(8H2v7Uhp+Fs_k6zh&F~j%7u~A)5t4k~X=dWP?4yR`Q#?HTmTueF
zF#YI|@8h<<oI{6c6n1cn%>a!zMv>Vgj^|oLo90)f7Hu)UA(iE?yuEhUt6!xJjm!Eh
z9jP76U}DXP_~UEW1_oMextrzJT+ueB^4^Gev(P)34h!jucn7qdCbGT}4hZlB`@u7I
zSh-RWSwd{0R<UH$TlcnMxnhK^N6df~+zVS6g_Kb}^=-ta5P3ZBb^QleX-Xc>c4P^v
zR7y3NiTtSaS8dcN_cp5&p0)VOpD{XAf8(L)(~!>J4%n4%>6}{ed9SN_c8hgIvWsH*
zC-2)ELzjNqqMdBL)o7^8or;b2uN|i7R<#U?T<sXVS^r?>%bVx4FQhc9cy2k-IN0Ce
zr19s3G2NQGJpS(4VpLsvtEehEF0RdrfgiuvjoWZ*?~D5$)Cjw5&V3mZHv4$)*rB=a
zKF_;Rcl(mfxt@)m)*JtNansI-w=H&N01dKrZ!!=T61IZp&-XKGekkCLmB#UQ6YUR8
zH*1q&9g!p4K2T-=<^J5>KjP$*E6WcGq)rS+;;egl-o#TCzw(tc-%m<Fq99~%ay+o-
zZnuIbN))F%NW8r*&<?=ubp4>GmN&*IEO-0G0}kaMO(d;P(BIMr{3v5ZTx!|+S4Am5
z6U?(!zUKU4JAw_h)0_`nsn1QWUQjdma7CZ6EgjnSxpCxtxy8Dw<n33l9PG9*C_z^A
z^Zu<nS9YrGPPV?;JK3&g_?g_L2HOLBec$$)J+W0k6ScPVe$#%|arwe*XUEd$l)B01
zbHD3nyxXCqe<)gdrHH?Mt~|fI<VI&okeaNXLA$&z)_Uvb-oG`jc<m|YKD(#y?7#E*
z_icIDF7}3N{U&o6w;O4{aI2w6ICE<z4@qCU<F6^IOJ9FKUj6--ike!1XHw;v$XYQx
zESB=LGEm`_7xF$b<ra)!2V)Vj)h@cua+<O!E{4sv$$1idq?H@~4wQ5{mn4^D=3;Pb
zp-hKEg=eh5p%B8Z$3kdS)NIiCTT2wS1`O_#-%UC5uiD9D{;tlcm#fK6&G=(~N=Vq)
zIX|CuXzEygf6T64K~MY&-D`%P>gd#B+c@l68}#0aNCRtfa<;e6Q(sg*^zY9JhKHWF
zGwGb)Wq)X?%ZY;-^$p`cZ%!*3v%L4*sdwE@x6J7~P3cL_@|h*}QKu@_>gUhX&374f
zx#Qc!vJsnh7U2Y2TGX~v`<5X+wBF8qeCuxd&yWhAlKVx@J2`V7J;yl27aMHXu4*h}
zc4z&VC|gXJg;*4E;N~L()I$2}Jk|f4SHF4p5vi4xSt&HUFYvz*d;r)%PJUTD{>d@v
zrdz!rBR2ox{7`DTC;Gc(wyDf4GVkj1>Z^+{shI9P6Jpl#=ND(H{T?6m3e2=jW$nZ&
zC~S`%8Km@Lj~klTYBy8!kmP?XuFSmD_2VXU{&pP^S>zm<*WtqB598z=Z=C40rPL!b
zL?OMc(za92z4rNCE!$vuSlSeRc1=oQ^26otO&99EUDxYX>86>=mrt*|9%MIi!}v!x
zl=^S`m1aA#AZvKR#*^N5#~mXp9UFgZn&nxXof|S+WAc?o%jNwis@4kO%RiI)#TbwL
zT;De3+XNAa;xB=wG8D@fb$TaXpkj~^!Q{(eVIO|wmq*rlnOK1m$0|l(hmuXt*T#;k
zD)_QtOjM0y@xlv_x4Ri@T8=e~S(H(>KGp3;euQIy>2$ybreBoL7Vb5#NotFlX5#jj
z+9PF$;cr*)@@Jnd{FJsPEVuow($0G_hLug2_+ed71p7x7-euh^byc2cR59#eW73(b
zVRMSsIGx}5&`M)VyNSD)ZMrdf?|=cC_FcC)hEF#wW-_c(e6b3CJMk@j)$&vj|Mc1M
zIW+neBh2ZW@7DLaGDAQAfpz|9)1^kgi4R0fK<+CxJSx;ol1bvzR>ZM&`_`42?MGXg
zjXgEA8+q%m#~<c)*<&FF;?fj)`<1#0gL32r-Ttv~Om`?}Q<U~~^M3f)@mECW<I9zu
z_$af$UH{rM!D`}FY@qm-!iD$gooyr8g7v21Mc+CVQ?QxZu#_K}r7Mn3l)Rr_?R<Kl
z@i~*wz%vK`S%FTHN%qtgaeiB;na%oUG!1CSN<-eFZI7Mv%~bQI9DmtC5{}IE<TVgh
zMc-ZMk_)TuWG{~_MlVM(sQ}Fas%wAavazneG-klfv-=&wr<>h5H6osZsNVUZiTY((
zU-5xsq&^C_0)M8(=UptUbjrqeRZr~FvH1GbQH0Xg8QF2;bas6*E?f~OoSACQ%SL;Y
z*VZr9G?rSw|MW@hauDNp>qoV^l8Nzxa|_T!)p%51BjDQ^HI8Xakkuax{NXVs>T0no
z3KcMrlRBMe^$Fl|@t2xa)0t6fOHDRBw9Ze?VRy)xTwGHGA<kIz?YjFtzNP<m7mLgC
z_`HwU&$_T5T*X)S)zKwx4u`Z1V3nz@n8gs3CFTN;9=&!m9~^aR<0BW<%947s4N}jg
zMq#%q+AKJ?peEF1?ihYy5rf4>Va;QwJNBqLHFp5RC-B4k@15C?OcE{5%>DCxO#6xU
z;aU=i8C-Q$IJQ*0CsH;K`}d*klRLgvviE)OQ7#5*YX18p-EJx|G0}>IGko5C(2>;(
zoX=#GrD9gAyH6jR759uzBaJPT)sfC;KX^d9w0m1ITj}S1nxs5pdz8f_o8?vam90;2
zKe$<2E4}HaQTyY4Ce6;T>eF_?@`DZ+8~TLJdi-m&<JgyJ<ANLOJHFjIYJIYuh%?2m
zgqmd)KaYODIW8o$VlC4n_9jyMI9p9gfRwo`vF>_);?Dm)Xe*7l|9xVlmakHNHK)|E
z!15#{8u04+Q9WxeYoq01u|_I;3_5Z(5O*>yJLq1l!E`7d5d{(Nq`Td?RZ$Z<`_ab8
zyb+a8zQznluk;Mq*ne-F<(z!yM~TP#FVFokq<6CG`QC-u&BwCe<+zP}k?6Q~sDI>f
zJ&wfs<zYt6=dbljj<Pe_d}D3HZ_{QJoi0kO81cwuO@3%$WzPgx{i?=AofrJ41@MmI
z1tUqLS3g;>QdhAJ?L03F3NyW_(a-mw11~{MZJwWMyR@nkO3(AU2D_Q&zU-*4uY81X
zyZJ&f99eHfyUP^b;AYDtavK5-2_ETPz^rcF<*ARpu{m}hK#RIjSjU(FTiGZTHT!J-
zA&;1%sYdk)JFcCz_l{d*(C%`Go6{kSh5f2+pUc;7ZRwyrrbCs-M(~pI{OE&lWVVLg
zZ7(ehj8Qx?qdgc8@*`3z3M4_zlgH`8xEn&AiaDLHNoVDCWC_lDpuXG^S=aC7T9O$G
z68_cF8GKvN&R`gI!Rx#;F63S0G%*bFTaulTFoI*_O>|v)AVdZIJLfBl9x!$E=g(GQ
zv&uhvRT*?WJfl8nW^|O~QL>Bb$QNIIb(02bOifIR%1%z%_~ufh-EZ&jwk<yE((g=4
zsI@_TQ}MQi4hOuK2H6?>zIy9jiN$hr^|90I+i!-3CpsI1DrID3a6n?88GtvYQxfI$
zkG4WNQ*)<r8FR|#em0ynPBjh{--U0}y+U?M`jL%k01Ug+>?f>RHE{6tK>buOvV&sR
zh`_|5<Zh?SP!Tc1)JDz?8@CQp({NqIn)L%qtj&5hZ2Z%$_r;BAFJ9VKQ3TGzl(ekO
z0j{BJWXgBJ=hepcZc&7-P+z^rT)HVa>{fnm=_jl6!?*u#cVOf8i)(+h*xfhZ5>Q|>
zVD*jX4kIG@dzgiV3dtcdul(m|gm6yI&a-kpDf##QFAVuA+zy=5nnt33e(B$H^oDl$
zQD-_H9NQsbaWAUG0_Vk0o`O|fx-=Mrf&2@z9LZ}|^Zabi-5YYFBz>sy--i_|PHY_X
zn9($p{@6*}XLsHK<}Ji_Yr@{Qvpbv4&h0Yvl*RLh0qXfj@7wU!;w{Ga$kQonRBp{7
zF(+P??6P|Ca)FD;BCzh`e$-A9YnQJqjZwVD!f(R8IMADJmEO4x=azapdH8RJwy8tU
ztF<q#tu+?Ij|q##{s$^LM7RN4ZQt(srvvA*4>TK)4#Ol3W8yMDeoXuKg$-ykP2m<9
zG}s{F?-e4cl-ZN*IjCaURfqkVCaWPNt1_<5*tgPv!F%C1h}n4{ydcWMG}jTaST+AM
zsZWg3t6Td2UPSv7)8F=ZF6*i`Z<JV1D&hve8({B6ReU7R1SOT0q-%RA`>B1~ceIZ$
z7Bn^22h@us8?beym6^QvAFb(0KPJ%NcgdKj*#T0sSVVqlh_*4*t~S`KF9RKCrzuUq
zs|0Vx5&_J+6#Nya9|lFjuqi@gu?>YWe?7DV5SJ(>lp1OSCAUG{g|tJ!65z+XfA(85
zd2S<i;ryjnR;4>~u-RUtkaD;u!o)lcEtP_x_0(!&B8oQ3-{zc3!}`Dhfy6~Z5#xp>
z(Z%g1rk<G<`AuiC*<_#<?jz^53#0`DSHj2%`RpVC@$Z>x7K?Ff*~vXRzElzHvzdzh
zeY=gn{tE+sx{4~s17lP=cgA}SiTd&<J5D{Ye%1Su<lTnve(M-2>2l$*m|($_Cs(`U
zJYBb&m1pL5@d-Q=h|?$3{`~o&%`-=Az4Kye+;pq7-%v*XuWUu2CiF^Z>ZnUF9gCUU
z<>1OG@wSFoChRJ8+f(e?x#_oQ-ygOX8dg>gXQ?5?;_iQS^<|!8`rjFvzcZ%4>AG@m
zaFEOKEgcxN*T&!1H!Uq~`ayrjtmII4p<QXWCqrHiV9@r?){PUEC+fwdqTg_2^RXxv
zuo=Q9mjs1VWA_gy)e1)0N;z`kFNd?x`{JcA*L^wZwXz*rW|l>C*4UT&E4blz>F9a0
z&#DVdUSLW8t31Dzk4#b8ou*nlinwF(jzLef`^0d<)wgFwckdgAu2eqvcr^w*JW#Q7
zBZ(#7W$Q<2O1kFBUHrev+@sBmqT&rlCD-UC`ArJ&uU?qR*Rf^@3BhKY=FdM`6wPF2
zQfuf|PJkb6(UjQakBK+<+;X*yRq7!uLr%b~B@+J%Z>iP%X{zlQ_!8<8#_PnI5z?19
z_4ovv)Fi#>T(YrfDd(wEQ?u6K|78Kiw$}SJQ0a>uBZBD!7kYzCMI;O)ifvvLy;~r`
zb3!;BcHYyLv}JqOdo=4A+v>EP*SCrpPiE&>Z2(w{p?ay1H!wv!F09gX3^96cm*@ef
zM4Y|NayQ=1@ZHuit5}I7JOQb*phfhD3<-8Hn5T*}*DJI^dRFtSm|UZc6n}jVR!AN2
zNX)at)C*%#@#=w!4Q;Q=nJluO6=1P*<gwW<6CBD>7YW@Mx6J#){Aj^e^8KceJp?_>
zR4+xEZdmS}hueqLTQsVS+(=@NiNRT}^vK`Z!S}1KRU}t9Gcdhc>Unv=ph+!eYTrFs
z3H1hJk})4QZA<;M&PpSRD}e97D@yNlew@(J6RI7MsS)2!>gW^%oi133_}|>QnJ6j_
z7qxSS_Cgn%Q%E?I$^v>^#h^?KJPLNe+F*Wa{qM_W3*lQYrW)~O*pKR8mr>Fa%?A5C
zM?@kj?IaF_o?@Ge3kMr^?|5<Hg(BzunQh1=8<#%2?{*`DzgH)$0MDf<?7qLfBwkAr
zUwr41NN5XgS`;{MNG|E+xs~a<Bl+(hjX@H+y2tVxAxBy4cO_TFW|ajSm83_T-ADCY
z*8f)NxDVGVu1$C`c<jzSYez&j-}GFB6(i!$i}%i4<<-txu%KOhaS(DO4t5NXH#5aZ
zSvwKkvb+4>S}&1xLF)@=jXa5)4Am(T{f?t#$37mDazBLLfN**FZj+b|JpA%$f9;dp
z_~lgE!jX`wh5f%O(Jdcc$~GkH?u*H<-)j^)=-laCE>`F`=wTZEaBBk&%>m`bl-Kza
z{;BuAxbbdcoyzZ1I~@DzJ7;Gus^yoVSQ7&9Bv{|PNOgU=`S~8t(Z{BZ|21JYJ;`$O
zN6z^kl?{~WSGIp!Jn}_T+}Dca7j%u4o;NcxLc+FRs<#`lA*$4^F?rduDK|fb^q1Qa
zHplOJb#}|PWpifFo`SjzO!~~*jg}f%XZmn^L6C&p8}e$bz>e4zY#cfu$iJw+M(Zh#
zFq^1)yYsj>%TWEv{KpXUB1bUmD-`q_lD*S!M7ugmP<3L7-X$_GfMbB0+U(UotU?)z
znoNC1HHT1dkF&Y^o(vxZWk>>r5tay4kjcaPHTF~Vy>-OZ&aNa}FD7N~JfpBls_iy^
zbBr&(J*dDU|Fm3ok@GwahF8UKJF|A~S>qP!j#*~^;ZxtS{nM6#LzS-G)lJ#Ic}qu6
zDW_-dL(`!8+aJFlJ5VxZ>+yca(;_RjoTxH#>7917%PX5kpSpBcG*$C7N{<}iy>jIR
zI-xaTUyLH3kCKVE)Ch_*i6WJvb4r5!(Hy5`-HzBKn90%FUJ2DWaL^)n>bV6m@$sP!
z2L5n{yU1D;WtL|PlezoK%l}JG6GrL0GEhEBS7GOoH++=$H)EkJMzw@AbyfO}MnYuY
zR*ye;bz=Pb$G`uf2M_itPJQVCy3$ve`;me&N8YXL^w)y(bGzgWou?sGi~Wy#wPEg0
zcD>|mim{u1)s%&&f;SfznG8&d44kldQ$R#Yh4J6cyM9#|9jX{>!)P|A6N8M#+gBfL
zl&i7lY9Q=JpFxCAZrg$}S?aqD6E8g0A2jH}_RBWsycY({=$7=Gk==yl%cE~uJ~e&c
zXc4Spbb4qTYU$0jmEYqVry7OzLB1()LN-hLP?Y|lA4sP*IrC<f!DmEVCs1+_1DH_v
z8?cWDlHT**uo$D5HX&5M`1Y21;Rz*%VbEyeCvvQI&5mnFq$GR(lY5N5VdJeacldyL
z#!SMPe<POdL&j3JW_dFpfvLenRaH_o+Se6q9bV>98+`lYmVNEOKPDC*)L1q>?UB>N
zsR1^98Y`>DS*)x0)Rg_)E5O#K<L^7G=TYxAt=>6i;jM)`!?G5SU%3D2-kP1abLkkb
zANg$i2HBm}&Ydaw+qWhz9U-+=om^1mVi5I1ZF0fVV2hCROPBjJIB^beh7dy<a>v<9
z|AU+={qi%==nq`(sCKX7{P3@OJ~l3H*gqNiF~f}impOGqmSs+eFXH?|0`2DO3(W_S
zD5|zZ8Y(p{tn_5Jd3Vyb@I+X=_?dHX^ngc7zH-q2a10^??pR_!d(*PJ#p%oa-&j2y
zT9N1&)%-l=?1aHXTM|5@tk)Ffxb?`sYjEvgu2z=S@!mW0E1jlQuRJ|e%j<ZvdHcmn
zOF)NN)u;hmIkIGZTQT&>NHo43h|8_FezZ!1vlRr@JdJr+J|ZIFWUsKd-&G&0?mzIq
z!awz;&*YeCOwWnE6k_?g0BSBLmuoW(H0OO{D8m&WK3IZt5TkeVTeuoapM2Y^AEm2E
znM@N-niYUXeD0-^*vgSF!c>eJ_baSjyH;$dP{2$AV~6MOYijQQarbjKp;f2r?({W7
zPdSj14Ir919rBhBl%8;+7DtuFhHBAj+<y;R)go|O2!gaNVk;!VRpDQW)N;U)2kz@e
zN*~Cu9*bq{4ajE#TOOScMy(vd6b0D8Nu3UAvr-o*{0(^wq6cKdXRT=t&3$)lcSJhA
z!Diw+(V)-`mGSbJI00BPn+_T)d?zUZXx>ou?7q44*Q|e9o7KR;^lSz~aQP&>`XLrY
z`M1sTM*Xqz))CQb4QUpId#AO5y6{5XeObc<mcm@@Cd&6NH&s`znKWIs9Z&wPL0E1G
zwu+GLgjKnVuTIwj$nS37vHJvK=7q=Ut|Q*CVe6z$_kti6viMEa+0u^P*2WB<^!PG+
z-v2G$ER3z<eOOmWogth%+bVQ^g^YOe>eU~TZgjkE63+n?R;x2-5&~`J-k)uzwY=$3
znB`bMrO;<GC>DLW8fEJXkCn;h1X56@hW348=9*}#aE|?R-wC&Y4C|O&3_132zyUuT
zfxcBI5Bn=?;J2EIt+j<-yNQ~^i`3@z4PjCeB$*#1j0o6Zt1OA%Qu4h07?*N(cZop~
zPS5NChW+#xO1g%Oq(h+VAq*{)*x1?(Izk7tXZG0){OkBAds##}x6A&(ANK!NFjiKk
zLUMPAi-zz?%y3s%SHF2-aL72G%q^t7nJa|fEJ#8yDQAF2nD$NqSM9Ykno97eOU+#2
zqpP^=Q<hbkF2i|D+o53J^LVwhv%x$~+FtQh_#smm@F6cE*4<B8$Y68uwjTLy&%W`O
zit!GHY!0!(7yAoJ$`e761O6l+Z+1Sjz-O{meB?iph03P#e4B45kkWX2!0gCW4>ALW
z4Ag(Ct(EDYSS+?~CGeXPtJL(9idgiFljU@J#uVcSx`K%Up?qR-ggYnGeM#PI&xh7z
zVxb^s*wXf@y*!WCcifJcX?^^|T}@|N9R~sAgzD{!Y2F@jM5^INp*?KpPj6z{sGgm}
z^CXE8{UT0XylBX>_O`9g-)tV~(DG^Qe<A$!=s5-;7Sn_QH6~rDP9_e_GQUZTh*}w0
zOo=PBvebaWLzo}^ch+o;y~ytQsr&&O)Z57;ZG?pL@_KqFhzzO@J<i+#_eow(CNW)-
z*L9W+Wi)Hrbi~q01Qo!;|32&ZC7?H=JEl=kVhn6$_!hVnAw9t=4Zw;fYGQBmd8gFc
z$zuzN9TM0rV3&N-mlr*lSPaX3*V%NDbb8se)x$f=QFJner>CQ)_C5W-#=bVP#_YM>
zmUl<g-G-Zoj5fvBtrq0w1hXU1=T_X2!@R;rioWbpkoUJtd+_fUqKc?_U|ktMX#5qM
zBrPq;fug$IftI`f#o5%<@<w&+ZYZqA47=whU6PFLa-9A=dPq1MfBXqV8SgeYh_P|T
zTdu{aPu2ztW6)Fl(RBSYa-B?c4xFjwBd(OZx~0$x7_=)pCN*0^D{RkD2JR<XHy(eq
zdM3j$NR?L}Zob(Fe67o!W#gS4`$znhf&@ZH`UEaAD(d0fE?Mvx;DI~|t7T8|qCiXL
z&u}m7J)j3_bTD+Qw{8D>+iANe@l4z^yVWejed@S>=8C%HC>Q55SB&FPi*t{x5TbAa
zo~}5WeqXGs6@GR?6YFr}`J}NuQ+V~jkqpkid^t%!Vmr=qjL{J7<Yw=mP&Sy+Z(bZJ
zK^T1fz3p<btJonYlnbjg_0;|!e5?>)8!|n9GC>P@B)4Qef?HBkQ^~{l(@}gBYIdfk
z{MaA4f06QFQVD(fgf(l{JpH-0@uCb>3+S(d`9l4OWxl!?ge46&J?(5&cK_c>g+dU8
z1$c*6;|Lz07v{xv0_`m{w$yJ$c<6_SS+C@sB7W!Q#Gy@PLE0Oj`yCmwI&{N$?7VpK
zuzS2!DC_~}j?F*-&JIf_&#)?-3fX<(@y!hDu-r`J(DXl-U75<pTe;NL3A0zekcs(p
zATO~PNaNsvK)L@??A|nkyzY$R_Y>d<$%|_>s^@`u&x3}$NQA4;`SW<UWs0BJH#7%H
zQy|!>+^#%H$GX740X;dOC{jan+Rf9@+ozvlt*jK9gZo8?lq$MM+8kUMfE4aWjr8#H
z(iAThw3MP`_gFjq&lA-?$TM*c`}yvTk6y$jG;OSY3&0In44>hLNxAmrnZ(KY4CTs|
z3z*<DAHhprBC~nANEjY<?TQ(tVzQmihq7<|_UY3A^8Z@Rb8_Y;YL-f#RC<0k9Xx9=
z90Bcspes}-tG#S`a^u9@paWJM6|x$+(t*!+QMjpRp<k>M@<Q$4faQLDns(B>3Af{8
zv-=BJ0BNB@!@6<*-_zP-m6^7%=jiQSE6<Xw0}S8Ib12UqZ94YC(PB>N|6{${p0?5Y
zP!^O^=_wN1P?R?CKqO?j=dwp%h*ZP~av>N-=I|oB-Pq4Gl`M=HSr0X}W3Xwj&I+@m
z#{~;R6H(k#!h@2ur0}K0>L|{eExU_pw|tvMM#FxzBE5=Bav!-~$FE8jW&Rs8RO7mg
z`yOc;Iz%$C1WG~J?;@CT=p_dv4k|<>*d=(AuIEfV=+#qC8yD7hoMKfd966x`g`kt6
z)AGw2I@8%)jNhVhWFBp6eex`3%Z0WR9W(3n+7obb<(NO(Tfb?iccZT8?Dol(c^`^g
zdEVK{$sTtn>YPyw`a%@ke$w3ff%f`lm94)Dy>@$=gYdj+eS!Xd8D$f;Z8N9o7j$ON
zQR`DHHrY6z;ntF<r(C-6yqeLcG(Hq33?>Jx1(Er%#~^P1L%Rp82(lP;Z_uoWSx1eQ
z?jxIvC4}77+Qk)DOn44^eJ7o74fseGb^(3zljXef7R@s2b!uT2GZDNTrWL&!9{)ir
zG;;M4SOW&DsK|+1NrU(G;~tC2BpgJqG9IMTrAwdw{UP@xH+4D}Tm;?{Zy^v&lSy77
zK&OHet{@PZ+eAz&tn{OT`JZbR4x|)UJimSmCiUM3<;n!EHYcNNK|frqdg)L9lyXsa
zs`4m}3}bD5@XUj<5ONd_?8w#TjNqUz02>l^lbqRQRn55W;lRZ7KKyBCG8A)Lg$>4m
z>Blje1d6Kb{j07wHeyOmFSq>{2dp$EFLZizQG-ekO_8>*u07!Bvg$O8Um-^ana)9u
zjbR!HH>7N0c)yz}t>n@*@!I~40o*bQ#)s1Bdcq)s86GhiL2Sa=EgiOPJ^wcE`F+OG
z_6@$A3K~uy|7n@gK7ANUVYNg--v>66Ng>Lg!tZkf(-&jaCKj~|!c*|w|I5v>TM`Qn
z8~<~vSb`@5#bbs*JtHEt$tZGnD`Xd0dLi0e4-aTcWdO(rC)M-3L^aZ9y2F{=cj;*Q
z2b&Ib;=shMemqESSB{eycEc9GX5Mr52s$uz$r<HSZy4HJM`sElhqHQ>ecu8vM(B#}
zpfK>iggaTRKH&8-k>9@e_AhF9ZOR2*mZ41+gL!-o(qsO4iqyV&(sMTF<gVr!sx5u3
zC1FqrYR*J<>JW1Z6Tv%pDSH4(SXQse2y(-RxAJn_C0?6A=dN7YJhXSup7((0u>NM<
z*`Bf6zA~BB9Rth4mbrnFA^*KZUxhs@6Zcc$LDLHCt<^(pPIN#<S?zhdO{01;ZJc^;
z{#lqv@r@paP1k5gxgHa8%6LAnUt%*2r4<X$Rsh;ViHV6OVCGFVC6x(Kf>Y^9sI8ze
zN<O)80)-_!03k@_dPZI-s{|vd(Phudh5w~8c`y6G_kRi?0)ZwiH@4NiX#}SvNXKsZ
z+?O{m93@zkpf_T~2&l}@!q%M^kx4E4YD2OmIUSH)bR?^Y=_Hra%uxy*4gC@~Ss?qh
z_}yyJ|1Wz^wF^tZ<rFoPoUyU-F6N>674R^+7~YZpYX{Z5#s6U@iWNDhirt80czOlE
z5?x+;i`GqHp2nSYnb6fCBnThBbDW8(DeUzmi`Dusg>{I<=j=XWuW$M|UC%L5;#4hj
zGLaS?lk$%zPuZL^-dtlRmS<z?PKak(QS1S$NQhc|2@k_RZ)q1w;>A&c^PmWu!gdWv
z35sd;n5mX#=Ks|os(re29`^K9akjm2YuQE7%`EZNbuUG!zQtc)7r4B|Y<tl+W<AEj
z{`6j|Omr$&K}gty4+O*&nOMKgtEJMM5%C(G$&>4mRwG-mX8_d}3lgzBogV1HonXXV
zFfyEQh@Pr(e+)VzOz{A2ss8=t{!4YnpHo%4XcV|60ziUmQc!COv>!^lr0i~C<vrSC
zSbAeBdoVXYuj^K(k-Cb&xun~L7mkouvhzG1S|>_RuLn#P>H#El!pa8>O7z}noT^_;
zQ8r~tR7^mqbn28`ksE}OH0PZqQ3oOT{~oIvv*>@fZeBabjRtvBn1^6pj1OH|S;n5y
zA?X&V!n*K_2psgZ`^|fcnt`7WDk?|i8E8bMPR6DF;#d9;Q)dFs<=Sp>8Z>FvOsOQz
zq9~#?C`BbjC@N8zB55Ecnu`XzB$_BGBqSnAlY}&ogwSLP6;h<m?`iM-?Q^d0T-ToB
z|9{`-xrepZy>4mM5u=5dFqS5J`13S?&PA?yrASm}F}CkHZ>0_Ta|%bX8H=$h#!xs%
zOwytQmBfyR;(zdVg_*IP{(U}x&m=8Ry;uvdB#~_hywcLrC5%V0bLsPIo4?_KF3;NU
zR}`~=KqAZuuN)xpcX<z)6`P+a%5cBC`i3&_p+(hU`={$g8e37Fia`++o!l}WUbI7j
z+ADiC+PW7Nq8dhA!6(GU^dP9szF>9`y<Ip}I{4ZPe8vQ_2i2*|nC{cV^OU?2)&Bsk
zLxPpXvNx{l`IMA_a&mHrtZH1z5tIcKT+*fO_dnd6w|uj(3+Gb;R-7N`Yq==URNTXK
zRT0}rM%Rv#j9gW>ja>h)7NF`SQ%uj7pZ)<g^Nw8?!u%9-S`<ItOpFN5mW)Y{HIs^=
zEJNX|xFcP)>buEyX-Db8U~rF|7FBAaKP5xT$IR8c6|-!}1){GMOClJ{h;kJwjqL2b
z7ExWFmG4vh$gh!Z=%zT+R@F`|Q1aj4YBsv}o&XuO3pZcnMVjvw<Fibn0j`KzuJ`Dr
z5&)@~=?VjC>;WY_dARtp_9wKjR6N?1LX63NYO0!X32p1vYgaY>T~F<blJMK9m}Sf%
z!YG|Jb`Qf`g-840!)207g5{6A&U7=j(UA2Oz`$M4D6s8iOnkpZ?>K`z#^{+?#3RBn
zv(mFUCRE$PyOB*u^wz<Em|~e9y2?YkR$+S=OsM(?UxH=b_@I}`Io#w%9)E1Nm6=@V
z)9DjN20t3QJAe->aqG24&u5<|s${+o3XqvkRm)6%G_L;Q`-xFM+@rsI%@~JZQkdt{
zEkmAgM^JK3hax9W;xjlyYg@4C-hWEReM&<<z%yM5zj_xu>*?#)l@|C5vQ)U+-J{)S
zbwQ*BAOrgfJ|MvaXLQ!+-4l7F0P<qNu`ncseHHVm;HZH2#mPlvdp-rXgpO*)q*cM6
zj|T-E5}FPw9Y{5u-|pE_cmH$(vm|u7ZE@Q7Xs;+ipwT48$}R5IMw02;pSoIx-J&Ts
zei|ytaJRX`_7`c5d^KyP;dwCA@IH0nHFlMARU6e7b#ldEjCqW1z9*EniLb*lpoc>b
zPr>)J)j!Gd{n-h#LnrJI07L%OhzM!X*4xI_JhdF_ADbm-cx2A;fHRZE4eYK*&Vw9E
z${I}uQWqW-{$n4YTI&d}0gNv+Pc(!ZH-F`b;lt;};y;Gy_s!OqIfJW~ur>DgS0836
z`(elWzrh8WXkMYP;0KH#JApERhU^(LQ%3>v?RD1I#)JD5=p7@ClL{8rWz3ZErB4xC
zIfMy|1eQNy6^tkXB~TLxK94C|Zu9O9VKZaVcl>jMNJ#ICandk^L<Bac<p$nVwLgFE
z*z^ss91E3T`etZdSohBKe}zelw|skXwHcg?raCAjb^V!jH8T^&51UVJg=hIx5Z{&e
zVo|+jgfd7Z#PMFG1H$u6>3~JX*kuWx;>9r#{=UzkwKD%!C3_;R*NnaNqQj*Zlh2<$
zudnDm{e1iH=g!?LZEyQjIp4X>*QMh&%M!W;yUKpOXmoj*f$G_EiQU@Hpa)>2az8kg
zKubi;+HfTFczqHd^3R2D@m?G-dD5guw>!(cgq{}8OE^FE0+dp)R~tb4ASr=q23}8h
z(Sh`MYq)1G^e44qW7(GMX+1nO!`h5(o4Uw)6U-9Y6*4C$wI~CZOIRRg?Y)s|@Z5t|
zD*F73-AtcTw|cIB1vXZKWQ_Z?p_LZx)9Lx@w!R5pgd6*66`z|12=>uIuZS;5E3!nP
z1zHboH{j_c;gZW{?xzl+t7L^63N;j9&Gkd?a1Dsb?)Z`4M*{rep#_h=xl`7DA<6?6
zUbrS$s84kUue02__`8AH($I{}AW~C)e!eH<NG{o0d&g}n?WB`mE1btk5xnq1=QKIV
z-$emk=$mA04g^1XoT*H;B0OOzBvLCjP7lvrMQY;bpC4#;-{s~q(V3w_K)WFd8G!m-
zZcAT~nb~6U=&^oBci}IKg{*VQ9Z9C|!GT*R)c>A|Med=dm}9xN*mz$m9@oW9%J*H-
zM=(YUOI>^{H&0cUDBO8rqvnXP#o@VRn`=u)ivYx^vwZt~zP`FdJ!BfaOqNN)wg7Ho
zt_Kdt0uPa(NK6d<?oe<4Z#A;7q-xL<LmAVH9ZIl`xli&=K5?3<D0CEluPrI1Q+|FJ
z$3_FCa|H}Mw+ujeK2^U`+rt#Do@E+?R?s#`7S+VxZW+VArJLt`-XT{Mdp24l-)g%t
zNx0H2nDV6!0+dVEvcFP_S`gc8%5>&wAQ^DDTe=$DJ*6o(O6un}G5?GS2)*MucT><=
zv;c~?4)ggpH!}>glZ%NQ#Pi|b@}?<Q+-!nVD<4%~IA^Ekkz~<SFyK(@^&$m|1_PwW
z9#;)wj6}`ZoAvHJ=A4|2Z7N%g&-qv}ku{~fLPutn@U+b}v3$T}3^PVj^j!QhG4X#7
zqHDU={e5Lllde%C2(SYJWJ}4;b~e^0*TtDhe%~=QG<}_;<3l<#5RY5a!;Y5rln|!`
zPXQgf-P#sQ)r^KID!%<zuwDp9xdD=Qw2JMx7vDbRs}5_dGEJS-eQ5aL*)d2{)~|0&
zvgjx~`%P$MWR~4lP;3r+VO~f`2=*KAwnr@q&$Xy~`|$pKkw#iczzZS})9KbGDr1pT
z2MN+~jRzx1A{*hAbdTwPb~M(!^j4mlITKK7U^0cjL~-q!fX_c=^#6oS83j<5)RTi(
zom)AN3k8SVdf;AUc2xX#;#+SsX~%EVANh*IEZKJTyMRxV81|y_ZFRNetH}~T*@>51
zU*IsIz-fv764cx+A;*|r(e7?@#-m3kb9NqwX_?Ye|3atJ<}!E}A%^25SWSy}r*!^O
z86K5ba79nCe}r%06UuVxGNBMzD@!%mci_N186W!B{Ro=#XrzPcaTilJUuU#=#^Do7
z<hO=?jmZ4#S8x66$1l+EBeJ2TeRl59tdM6V_}6*dIsWWurepj&rVG<p5i=A}oQqF!
zdOjqiHS0>x4}23rE9CU+=T>}{Dd7(_HTSQ$p!~vKjdif&w<YLv?rS9K(8dTHlAjO4
zVA6hA+a(ZH$lAKK5fv+Kc*vVh9&OL|9M5a^V~aGe0}i=lQ9~*IcVBqt3(`FhuxwqT
z>APn2#P^GFG2LSv?rLdrd9lS=XC5V~3<W%_)8fT{ndJVa<S+7P!eQ!158*pcB~P<>
zk0$Dth5w@R04!Z7=(tcTp-p+4(g`!9v-JP%z1no`@<VY+_>T~zAbx(Tr8~U*f!bNU
z>Y3)8F(zIf7mJbtN&i+kc<n^Bt|$9jX^-8Jg5a{`YEsRqoJrewy^cxMYDDI3)5o}K
z{RqD@hBcOit~>99K?hxF;TTVMH#Y(AI-9p@R?W}bo4Uejhq>0%n=?Jy-f~OczG$!u
z?wCT%gSp_}C>Xk0kvB>_B>nqb(f<9jj_SRy>d8}o-fO_k(M)=Al|RMKik=D7Dl@CG
z&VyMrX4A5-JRVhXXUn;jp-O1L%I}CI_?I(N@!~{Dt7?f_OavTQ2M|if<ObtEF!xK3
znMMX3RVVkl`ySzjFa{IGJiJ44kl1>Hx7l47^S=#IH8lrNjsq7sn>@J~8k*=GhGo)Q
z%9?=<{qC9U9_Z7%yieiX)JG#9r+@t!l%Jd1mhAK<YO(8>bpUl&-Uqe6T)6=bhlu+S
z41`u)@IQQa!0p7ONpKqi@DYFtt*;?O4#)-*`C{(;=YXE6AdUj6o#lD-_jSFztt(o6
z|B=1J7twbaL?+UCZ-mRc-1<_TpWPVNUmslBucJG-PwyOOAx*;9t3##s@OQtOYtvd9
ziw{Ggb;u4J`K8zAhiPdc*RCm-I79HPKwT@8si1sR<qsp@e3jOkec<q6ycIfbk%d(Q
z3c=HKZ*~Kf<NQmi=Z?6lxS0Ne7Ee7mE(M{9<mvfkE`gu~q6(CJ0s_ENnCvUcE-@}P
z9lgXz5?;pXWp3yjXzUo>l1y0KIfmNmcC%4<u92~aX3v|;#^P%hOqcE@FRY`cHs@zE
z4I)>0{@Lguy93VTt$T|rzu2rKjQl|>TzBM;q^T3b`~pHp-ib^@SU-Y!U^dIW7|^gR
z@8!$H@UkgXtP2X>b=aLQ<woqIFcBgIzUq;Hfc>McCJa%s`WkWa^xQMm%e?j6eA*sd
z6P{0iCtHhUQw$RcH%}@H40Yk)`U#gO_+I|uJ<hTCNL*_a$gKq7E^J1J9S(Sy$6E#-
zqspYF8du&c_TO75B#q<t+a#p#x|J%uMcw|Dvdz->2lMS=2>i02@;IcHb?zVm1XZ&z
z*PjxSCwaJElW)Y*2K%FGhHww`bJ{bO>=^9m;gM`IFD~N?ivb4kwFo#X%^k|~EsQ(%
z7;Y8-AhD38FP1HpCfa#P%f`|Q-j}nf+^BK8z*n|Y%TQ^lv08bg1ZkCZug7Nw1@U18
zhbk5-(nS;v4uwKyBNW6BuF=8{oo8!MzO|*WOnWav@h6GD`&^i|J|-#|vq^Ab3Giwm
zxq5t(>1oA=o2zf|23)d_kBDA6dJ?FUs3svOcy>3vvv3R$-X8*&qLmio6~Ot&T<Yh?
zqFO6GI(y(w_zJ7TT6>6h0{ov^DlS3ySz-B`&Ux*d4jI0R+^kbnwt9N*QQbWU=vO6c
zts)lY#%{<hbvlO&FD=^0<0IGR?gXWATKG*h_rhe4i+3g-x#_fI<niLkOPF@SU#lq3
zTE+BEU6dQiwdThI>WH6G_c`OtQ;YNhTD<7;E<A-(SDdW(m>1O@v9wM%Ac0o4des|l
z0(MDIjZ$G#h<c|g*AzxDtK*UH(k1o_38k?GR1Z>EZ?7b5D4B_$N^w8|HgI#Zs5iTG
zX_X+esyV!d%1l)B(9tATbg|P?Z%?=Zs!Kh|cx-t?ljoUHCC{^k1eL@GGyoLhcV@We
zbLC9wL%FsOL@udGs(pK}>r~O^!)${$;|ArFH8HOQwB<~WY!PJbsbIs|H`tnRt^Doy
z0w@%LdO}Z0IbGI@k+C1hJLFo;GBWD9{mVhi62Ar7spyi$8cEEAzScgOP=OUa0(@vc
zF?joTE6Q6rs=h<Cur}c|d3tS6o>A<7<qV3mVq`O4o@mxGlRV+A`M0fKBlav-l97xH
zINdQ=DHsDJCdKyZE8c^a`e&WNyLX4D>e3w3O!IBDId5Bwu0lZqw*Au(35O>JqQEvF
z7N}c-S%84}pzepZU3|fpOH*4qAOq#{f3LqZwKKxt8GE~k0eHZ9@qB?&sd+`8zB>Jw
zVearVR*7tF_`5UM#gU`|NG)}OJd`J{&}|6`8x2I^)OIMx*qw6@><sDZhT1BM>@6Py
z)<`UCGA(kt*V!U!2_+%wLxEKb11o+kI2JT0%)78%B{H#c;nA9IRU2*!tvsJg=lsHw
zQ+8JoYTRd9UaTJ^CkQD_bMbF&o&b>cXJi{UM{P90a~Uj5_{xjl9`87R&KyDJIm3}8
zjmm?<cA_!J$;lB2EWv@pA!KDpBiNgAZ2YJVvR(Ej-MthY-5m%50!b_oo<7~Hd5spE
zx11YYKD`<*#xY<C#hwE`H07J}nkTYiNyMVM$+LT8E4xJe*xP}mEarDUV<XL`y!@62
z2l3h!He+X-axS5P$Syn5-rwxS+}g7YU#Ws#?-#C&SnQznDHgm4*MD1$%h`K8y&vv9
zCI3xr?JsAI9;LE>U#JNl>`5Ck?bX^%C|jp+WTfgvWrtt~n@e6w2ZXhtM`KYts{3)A
zPVUej7bc+{12Y0Y_?~k~8}fga;Y=JS1!Vk3%ev`PU%F9<CYoO^vwZtGbLNQ6%{3{_
zj)lCqkdaOfY6DUlBG(J4EHPK0v=;8w6Q@vxyt=((s^E>x*QGqg$LPiPm+>hddPTJI
zKXVUzigzr2z0Xw8K~~)vMvc<xCNOmY#^4X0@Zp3x9x>?nKfkg%EVL6A&6<dZ#Hp5;
z1_}jrSK5Vv(ox#qOIiz$A1vu+l3+aN9gijaaEMVH2;mHVr=A9ZYCg7us~IQbHkG&+
zHxHyJ6uLA)QNx>#1*VghQ0^=JN09qR|L@sclhl>TCFB+AB*#BH3~|dI$?Yv`n8ZHe
zF9F%xq>6t$*$lY($A|;hqM>lkx2`N;cOyj#%8;%sXDckTyN~*VZb8`WYB!Y+U3P0r
zcaji4oRkx+#Oy&^WO=Wyuj}b}>x=GLm){z)U2gH~g+M@1h{!EBJJBAAJt7TNdGG3|
zT_UviJ$anD9~+DNFBVMo5muLIygZc-kU$>zr_>J<^cgXITg|?o;wWPe?oG;B#U@}e
zg0Y9YSaSK&ndyY8KRzIMkN%W9Rr>FN>YKged0+b;-3_!NjjL2Yo$TC3$7!!yX*Ziz
zpF7TnIAo}N+zP&uSfqtfC$e0Vj8}lAe>9Fp3$)sDnYpo)(SxDJ__A=4fRkdM$Uhdq
z!d#h}ZhNge)E|p$Ex$aS(G@nOch$?t8PGL=s~Vb`qF*O_zYAD%W+$sMIYUNKjkQ23
ze>z8$6cQ3j*uc}k5+;D@2pIy(u}bq5>-J`YiSy5$1%-g=Bzqae4g-E{G*0Uff?%oC
ziFdW0py8A7AE{Ej<F|+4rD4DXI*-Isn7yAo*_SLrvsS|J;`jG{3q<%~_GDYFdof?$
zD?4r60ao#9d=|kIJncRWt-!n&Dit30JyXDb-je(^XCF}J?IRbz16OS=!FLi%nSq}k
zk|g$18lXNEyD4<VlAgq>u_P^$n?QGkUD`jB<|vTmG)_OU+lC%oOiB#g{(WVT=H@Pt
zN@55bFCpPhO$;t2woTgdS)RExF9cWnf+t;>ZhmEY%bUiOy?^7jtFV2BKhgy@7n)|)
zJ=_L-pgZpRqjsiJ2d|RcE&(pDe2nqa^S5y8Kyz^+DfugYxCf)QeHg`AHF|31-*Z+X
znMq9RCh(bs$XocmZ+YjVE+N5|)Q2p?_@Vx1Sb}bSbG!X3{!xU<th90WTo=7mWPfB>
zfDIJA+dKL1TibD~vCW>utkDvULHkK@l(Iq~23m56gD>oCWM*x))8KFs7sa7)HhJNI
z9i(s=LJtMp2_WJ0{qxG7Y+St$nzmb*EVI2P4WpVVl`XSS0m{r0Jq_<_1emTfY|tvW
zedq(`FNplD#U|@1U+(uLCW<{OUryI<Ye)A|GhR+(WBYN*^*tp1V8!L<_lhB0PBGX3
zRc<MhY2u3uv!rw6iK)MTWIMB>M2v_87XMV_AxA@l0lTCXZD`y-{EURth!tc;0<(}P
z4BYy@JLw0H%eS#-TSlj_J0KWf7c!lvKu!|tcST7dwowTSBUbEvIUcp`LCdSQT+AlO
zAv72QDufC0!91=vM}luq^VGcK+GS(=P$3F$ze2#a?`1oFt=W9AT*O~I2qub+9#mRW
z8gh&volKUy_DcB{7gfrx^IfQL;DN;|^c}w&cg&_N0#EVgiMnOEl+oEEM-;50*kMlD
z_h)}g0@-9di4UFJtLjZbQ?~XHk4!{bVN7{=%GdT3!?#8n61RWxLE5pOW;Znj`6Q*e
zsCE@nE$}sfHUTb*&4!oGxOFs(RGT!err*Bf0ajInJHo8htsfr)p}P5RUtL?-`d5SH
z?_P**q%HO-iU=YWTnTGdv84lJK@HS3Vy3WgTV2-A<WobSo&~xDHo@&4P=CAe-?qXt
zYVtlCyRaX0dhh5?b(xbpv*m3Ep|CUhnzeAqki`3tff9qes%p)M`P*LUj}_}ja)Pb-
zh?2*T9x?Wm#(F|$)-i06D`(k&Sk9pv!ie`a>Rd0uchU<%y)?c)TW<j_Dm;kAakJQY
z3PKa<R!U3tdf_{L541}_<b<-7=)Wb#YVC0U_Gu?$RdX-i6$5}=2=}4|D1D8uF5AyU
z7Fuf{gH8gR{9~{%otYr?KOpeQH3e%YxK*bC*6;@(o}aN71l<|7<88aC-z5<WCl54&
zO4a7b-m{4<6+<zjf2Y0im_6J1bk{+@pQtrhEF9N<uzt^AtJdd}R@`aZ=YWB-+WerO
z%ZmH%ZBsC9RnG$Fpd+saKfIs3(%V3Fx%@MO?)wxM&X|7g&5`RKA6sUu8Qx>gbkAM8
zHcl;b>T)esb#qP4lP7P=w6iA$Y+(#%>-(d#+48bD-nBk3%LcpD{m`d%Fl5V~Er=N%
z?lCMw1;vm4X^rE&(Tf&p)=?J8TS|BL`Oa#4NSMLQ?h*ESk6kwBthPr^H7-N}Y7YIU
z{`}mUxbVOxmL#&%&0~F>nduGteiWsk8w1_S2U~#3yY1A)&5VbL1P2+4u=5tf8!10N
z4nWt??wpjTT)sA5*FQ_U?g%q^i4(!|kl9^DaRfW^gEFo>bEb0aN}JdK1bxMHU_B_A
zS<Mh@llAtZzr_$WHRJl@@`dNdTRn6taEKQx(%Foj&3iJG*{$cNxZl%eRD1R|BcQa8
zYdL75->nVtM5miZx7iXe#(#g-uk|U{cH2p5Y{=R=;D&CuT~8j`#Kvt^uC>{)di4$e
ziK=oAUaxEK009o%@$-vR>wtv=wxQVWD`wu<P%?xOpmk23Mj_9hURp4SO^Q2TyY^wN
z?*Iqy`IoG|zE5xPTIX6&_U;{$@msH-B-+i?cuk$+C||w-CP?f9U4yFRCxA)H$8=37
zaXdbHhjo6hg|elgRe=B57ffIfYjjRnBS^T1`r+y%^)z8}bY{n|AER0YbbeOVAfP0Z
zzjhy2*pH7MAkog(@};EGs<tX}z;iS9NE;qL8yFa_9jeq3CQEDp2&;~#qPg+xkRJZM
zB0@2#%`dOL{oA5tSwZzC;?O;mW)!XUpYn~y6uVExgpX@jur6bv_oGbv#|x&m+~+|@
zU2Sf9H1x1=rPgVq()N7Yw(Z*0J)bx33*NFD0KhQCxo~1=AIC96PL62rB`^Lreec_v
znz1V5+v+4#P$V8xK#Ve<Qqo+#Ec8l}jg7mKs=5FCOYg`N>*}2&jWhOjAG+mh`tYr2
zEBdP0FAXWsczJpw)c4x22|X3{*Va4x=+0ny>B0ePT_XI-wu~k~g?092Y19cCFjCNf
z!UecBA~9DOIdX1iCH1{S*Pc3dEcix#w6XOJ1w9t|dP8QmiqlCZ=cy+;@U!@f^~UCy
zQumkssqyWDQYwyF{j|4zUlMO5vzxUw4dYEuOa?HZ{sX^;vMUF552=1;8^>bT7Z>OC
ziIaAuWLoI{hU_Y3+kE@-#8<K(JmjKW*C==ebe^|hK^ZJzFoNM(hJCf%CNMKNe9Dw%
z^+)BeE3*JWtj|!kdMJ$v_hVWl%m*s}>bYaRb#P|HpGU}3l$*;QdhU%w5#7E<KVZ?q
zqN0Mte(K9-1O!wMJ2^tH2PT>gFE#8Bq^};LsTl~l$#To_0s1IjR-5V?%23Hc>sNmB
zwZHPv*CX!T%dCzON#l6l$k8ft!~0;-y#k}pKS%WE_b;|9MkC;0Zg-SQzo^|q-J=r|
z%{nh;0C&O<Ss&dHDM`5xD+6C&ot(Nv?-+XwS(+L5EKM#|S?^f;PR)Z31^uf9=#*5s
zY2ob!)}|jh*p-{`%%r!7Oy3lcWrN`!&SMPaai{uC95r#&sCilo(wf43*v-3<sx@8z
zjqj(gc?;T9o{Nb&M$+&<yW>|s(-Y#;B5v;1FTm^R0rm%M{q>_v&%nSM%u-bOlwCV2
z{%UlLAK5)q=I`6TtIIU$E*-q&uaAMC8qjg{#QV{{W^7%sdU(iCRlZ;VqKP8M;Y^pU
zZB~hO-Rka+Yh+-d?g?vRIX=7}neT-2p`Tnp3ZwE)D!Te>-AGSUe+4}ssj^DPcMH`u
z?{{y{@U*tje*)c#MOy-aVkLoFeYp-kqUHIq!(nSLk)iW?p^=@HHROas2R=`@)S!v|
zJF^n2vTokox&4c-I`+0Rl1+W^(XpVgvXg7`VndmUv5QVvqbhweD(cpT+_JI{A1-G*
z-@4svmZA4+-)Wz6Uw<l(eNz54ProqgE@R+7xQ(crStI+IyE%Tt`#I5j&dOhpVIcDt
zB{6!pdqh`D-9vp-XLQxdajcEn)ihJt=-*H&KYpP?doQOdbEKrqBudLpI?{cp?)8jR
zHN9g*2pCh~G|z86GjF|i^IKT(5;=Jt9x1y;I~2VX<NRRK1eX4|VlzL0b@Vx<jx>|@
zoBEXw_%RkKt|)R)28-==Kyc}Rp0qty2$BTY$L;1A_M5yxfoc44-1vl0&%~hxIi<E9
zA4-<~{_?d<{l;>aW%DeTFStGOfLFe~@o$GXv%t8}8?0u=TAUr~xt;1Yz75lRzQp^(
zor%YPSFfEJkl%#vLd>}X+(=}e(F#QRb|a%INq*x@4nZl6^J?f%(V!oa)4XA{VA9Rw
z8{e#7^e1<;(J5b{pqDtLlTJkIA>zSMas^+x!e;X$n>@7g7^03eo{y|PSgBg^WWFqg
zBQW<aV(UO|7b}qxe*bFjr@TU9dE!Q~WC716DSM`GN&{~wUNH{;(C7B|wqp7Q<KW?B
zUe@OT&pe=7Nzcqw(9$}MN*>@skCKigXuIj!oMRW<Bxi@`zK$rHd?V@SkIvP%lfwoq
ze6e;yeO2D!w1Mss`Pq2{@|q&^miVEQE-THAjd2a^*tBV*e>+u8_iuvcnY^*QTg>R5
z0|!2d+R<WX_;FD;#~4QZylCL4e$rVNBJ}7Fc6ph5wg)Z$JCL_~dPYjRtxmUHCwdJW
z_{Oodgm2~q(-XP+A>U$bKITq(>~oc!TzbdKpI_|6;KJ&{1n16<mT1fsbw*GV(AARH
zln#jX8j}GY5b>`@6^C7u_=az`)fwm>9%#Tdi~TlAuLm!3n!`e&#Yif8^WM|3=y^=m
zi0IcAIb#||>Is8nSgx|f#t)^X=YJ)^@uXK*ov5omG}GqeZ;EeRqOMBaq0yTU6LzVl
zr_S0==&^78{F)JAdHW2BZUb$~flry{)0%Jf?EQC)Xj7UPSN7?2puvd`<|Aj!>9)(<
zAKyT{)44_y+zoDjM3Zit=UwDGjcEkQ*9!vj!=nqT1p-Eeq8qaEj;ExLluEBzh^2za
zB!PvGn-uhh5JRcGRM)?mA{NX6?^33q9p(x~t+T}c9k1jNrH-H#m5E1uj#>vW?a(T&
z-zG>j^^Mx<2X#Y4{T-&Y_t2q3Rol;Mc%Aq4V8H2WdUckKDwS8BYlLr!ZNFQm^=xfN
zQqCI2s;tR6m3w(!w@%Z|=gyh4;``@PfV98xh$&L(8LgrE|3c#1uPU*v(AtIvOy>nh
zR!~#F2ko?BL(`%D6R*tr_5%Q6yhqbUK6QbsO{kxrU!r1ZB-8DV4}<t%8XAeu9I7{I
zvS;i9PNTU6)+zyTM&cgx7AGX0)(tto;H<XLj!bh{QElcGc}xL!%?yY5BB`7chYjY=
zwf#Do%Oc3GurYBOs~;V)85A`J5ZJAuayM%;Vn+L&IrC<9@$_oD?Mz+(8s$z)wzBj+
z7P~3wG@}6%+<sHZ@|G3Hjf?s)|6}i=DyqNN(1$~hwvICmz#yY4r)6+^2M2(8-~i-!
zZbm7^nu$!uzS-XFz);V0++PnKHn^Uxx$>W@-tB7oht`9A@7S!7cC?+)-}UM3>(>{{
zP3X}^XRQIpj$n9gR>oh+O_kw5)hBtk@da@~vZf{ng2N=wnh#6TIi}zmsWHnC_G|rT
zccT;5UF0<G)2%FA?;i5()mw)Rf4MJbDWO2VM5;i*X`r2E*kpW60g(M_#TJdzzWmqS
zXyYG(P8d+7dKbL5S7S6!AHf&n2rP4FBG1Vn&2H2&1y<)gB7aLJY^>;k=-4nM>ip;o
z@xuADrwJ;7G&SLeC%-gy{mVofi8@Jst<zNhvCH4`rc<o7dq$pjv<X$vJFT|GKQJ(Z
zPKz_mf!`OQud}wDu2272t5++~tq;+RZHutz3w7VlVrCDp$24xEN-X$uch)XlBPW{}
zboT67jaQ=ykfT6hMGpAh)O4f3ZW|V(=EGDenZrz@cym-{hyf#{+HrRHdcw>OgVcqF
zhUaaDJdV+aQ481a|J*hXebHfuf*W7p!|T9%oY8}V2Ii#1<=ev#GRpf1D&VMsSXCj_
zx&KCkwC7|NFhbOs%<8;i0VV^j@)Q+vIZI@ALt$d2cfa`8H}%z~53{m-7~t!%U_o~8
zslQ*YQtrE@^75tj($e>B<0w7PekLE}XquRwV11F|`0+;pkNvnp(W@SGgk1}*A^JY2
z{CDmA0&4qLQ{L?j?;mvsw9lQ^$RN>oOq0dljndLm2iMmQHQcl6i5a)zsKtfiHCX9T
z-|>Rt-sHZ#&#otT3km|!^aew1ZTdK7n*Q&9(r}yeGrq99E-NcCGH`Hvucsq&Ixj9f
zkeZ3g<;CqCJ6KmT2Q?ve(2B=@jo?XBs@y}Zn-3;RILX9uaF(F}NYw&;NJVE@`hwH{
zR{C_a3kJTQ9^(@2n`hzfgD*Wo$UD0_^aR1^N{>P_N2v&L_vux%H`vzO($Z8&9DUHW
zsGD86QV;PPy$@o#hT#)|&<JQj0A@FDA|^pIyaFd00p7C&J9Pf(#{_Eg_d)A>lMBNP
zL&TgIJA*>bo%3YQK@3zi)mL}}8*mXISCEoE!g`o>MXT=a>Oh`OdXqvh-9m-8l?GKw
zNvQ-von>M{G~I8jtA+Ird2<a`ef8zfyy;rCn?I?Ry!^54x`LriZ^CTuxgUJD*A6$5
z<u|Hnxz$fQrhqAi7(>`Xmwdl0_H>U_{eRQEME{YhqeRo9?roKYw$UqCDPBK-vWLZL
zc((o!rKRYOfj0qY*d+Fl`Pb4?9aba-OtkAweio&-WNRrjtao{--=RV05eyHuRh2}M
zrfQSP(BMAqq~D1X581+GSnJGCYmQVdMOzwAXA9o*^mlo#w>U<tR06((oQTR67zaR)
z2LVPydRw-9d9IZfbO6gx;$KZJ9*s%wEf8dU6}C533>P&O^LOj%w#a(4jxw{@uEeZB
z8fe867Q)X%MUyPnBq}aAqYr?KNKXM<5o(g1{J-S)Vz&!4k)>R^4<)JI5)CRRH9JMy
z*hKQRCmzvSb${Rd!1E*9R~yWnxyx;N6Q}*8hU=B~HaCp^>-yw&S)Km#SQAXmVlyKY
z^xCv-J02mKr;T-yrq>)O)mg_BpnOghT7>7SZ2h{L@01Vp1o<miBMk62Zyv6wxQB<R
zvgy-O+ErXtZ~bbj5-xynJV=ktrss6u6@esrh4O@ai?qap)T(?ZvQ?E<4TlzlGLw&=
z&zLwoFq^|xs0HS(2;9539eGz;PUEi*Jr$Q3u3COf0dc4}mjqYNp4#hyefsnfJ#tfH
zZMLYDa33K}(MQFNLy_E9W1|vN0crQ{y*uDRH%PL206I)BC`5KX0~Z`qV1;u|-sQ_G
z5oVDm@TnIQ)2#78E9z6Ds-|`${hj8UK~RKNU~R&;nDR#~NQ7<&t*oh;zi?qUV&r@D
zEv&x1mimro@`XCSd!IgHMwpNZf58J`ABsSV^-oRMukF#q2z>zGf%L)NP~Ggfp*>PF
zUpJ`4#>6a?+3o2mk;wL%5zRW?UWcF}XsE+neLMZ(faID^2A#TgEkhmt&@sMeuU@%t
z^Cs(c50U%3=JRsOh%KlVl-4%lVtsXiKWL>24TTsR;FT35ZdEVB%R{UN;op3OBN#q-
z@cGC<8i=PWKYaYyW59rnb2{Fz>Sz0VCgp<rGYOcp;sWWNeVv`VJj}OOqnw5xq+y7&
zJ#jo(X<6dN{X`}5z!B?yyb6{UnOkcunPmtDAg&Lh0Gd8_sybdD=V*N(m_b2Zh8t1R
zGsMr&rOdt=nkgHe`<NQ~G7#<x<r0>c-M+nJ)Qz7n1xg_(3}@$+fF3`ge(b9hixyjM
z`_fiO{dynVjS_G#vsHQKvln;Ub;HQQLc9>HV#F3@=f<MV2R~J8nzQ=fs`&5clxIMf
zlUKF=rb?_o6N)T=S9335->9*p;TWO^8<$_as0)?LE+}X|F?y(bq-?;cl#~?p>w_qa
zAwoLEU3urcZ0I=`8=IHQ$GN<Jbacp^A?>U8S;c^qbXWA-4_xibaov}_PE*06`~Sj~
zg5IKx%mZ&n{kDoPFpnraPo4gh4jkc|t+97PO-HYV{vkOL@3zOqUQV*Gp$XNn9@Kui
zdv7LSD9)HJ604smOI{lE=r8XT@XtGcDmht7Ps4qM>bP--(c|B6M4UN&?;7R4W-L7b
zeUM6fo#=aevm)GNrCWe#YbrK+$hWUv&SynB4+@9!<q^=;{bYSj@Q;6fto+7FSFc`W
ztJJ%<Z-+=7YNL~2jRUbi${TFFZ*J0>{NQN7)^PuTUTx01PEt|bIo~94yryQ-n+8o4
zl|_Yt9j9yhojy&>X=4<sPg8jfm$mMh_IvN2`u7(+m7K8nMz~hjZSgnI>;;W53N5Lv
zUBJaRSD)RD@?L=4L_<b}9#Z}I&d<}{>rCY~MAfL<u7?{xW@aWfz6WACeYI<wQe*b@
z8PKxu_mudSg&EgpRM(AN`cH1&RrR*-=!#F?3f@wkZL1V{>gkVS4#9UA$^gKbMSj~8
z^?>1eR;G?$`}Eu<t+0>nZH$*JkreT9x`(T(s%C+ft$TI^>HkUe8+FgvSXJ;+imDU9
zHbf(hqKV;JJ^#x~^7p%rq3^ok2cwNc^oViehL0WV3(dntC`dF__jv@;aeln3cfe@_
z6z~UvyLIa}jU&z%UptuN1DZ`vCrf<)@guHjbIRxpo&yarI+!`fc9DI_`;TJuF(x2}
zl0_&*om^b(a=#VYyEYf*=l27q=v`4q<03O_u(EPDnmmdHur5XRXi!;v8IUB`{lAYZ
z(ZBiVFdMH<Uh;*q7*z_1By;1+G(muvTajeE<}>eJDxQ1jq?q{gb&&xT?f*jAgVyZ%
z+h+B^EMh<h(-Y(SN+%>GSsLkWIQU>XnI3i*DSu#KV3Sz~9|5XiEhUWR0ORVQ5qk_8
zRBW3Tn{N*ebB0_+#G3^wM_6a9=roP-A)q|%mU<&{clpc60i^3y^hc2|&)i^F-V^5J
zCcD+IJ^1Yh9cpfNn1UYChoO2ssHVm|wLL#<|BBFm(Ks)s83IDyupvgnd+Ysu-Qx?*
zE`kk0Uhx`r2p2<j4`JUB8W}ij;|8(~DeZdm#Fv#)s*<7sJ3nx?a_squZ!{C(DW#>n
zPw9MvMpt(Es6WwZLkRP|Sumw&xkJ&ui11!gQerdpH%Ue$KUiQO%$tPdif18do#Vqp
z!o$K!9vvMjG-y00@<aH*aq8;+w;?{ZeJ|ZBETG)9c=>aV-M^schuQ*3NDzEAa|cft
zCpVANau8mXDB^@bBG=X-S{szO48t_*M@I&e;=}}{%&gP<<R{Rw4n`w0`-JtwL;YsZ
zN0`;08uz~&{!}NY<-EM!vF~4U4{+X4h76%IORtt~{S$DKCF(tU^mslvWB$jovRJSm
z@fwP6Pvw!tFH?>svZEUmie<OP8iOrc5&#|?YD8YLjXQ<~iE+3?)yD~syN7{n%{iv<
zIFsnMpeQr*N$<=|isbm3H&lj?N%;E@9kP#J@$vSrhH>zLeWz}pLNRmm<BE@$&Ti))
zbQH)iG(sEI0<gr%5a^UY2nuLT=fPXx@d9!(fCoVLK4nUfFe3*y`-8UT#~<P^Eq1iH
z0$n~^JAsbqyi!L1y;&zdZ1VV>`-ZvjI!JNa59WPE!XJ~k<NtNsp^n{*+DtH?jWWG>
zd-E7*g@no*Op$P?lae$!C!&H7B5d-<n_VYfrnHl-^?o_B+9rA_^KGSwF=^MW)B?>v
zOo~jmjl;3xF-NsS$Bv1^vUSS0*kqq6dTFp$ZT7k8hBAN7*_kJ@nX)RY+YcT*7-%g%
zGH^&?FnLuFjLc?>0#N_t9j0$)tX&(H=<{GzDTGkz?_agQQErIlZRNrBT0D@-Vj4Qi
z2zFO|F>mURW|*{qZ_S_5vU|Y4@mIdR1D{Hui-NYO*h=w<3zuI|P$mL~2h^V14XXMt
zKg?1U*2|7;`c|C*BwW>b8wL0EHnV#tdMi<U-~FIbbt*u##|MHxvn?*()o7RbU(ePq
z&Rbt!U#~EBY|Vk@t=_3m*S)N8bfB(dtzNgjeP2~5T3+A$|I5$s(Yb2GVxg%A;Jf*D
z{47I?X9ikDtA;0*SZPQC11hjm?n_O#HayPYg!Q-D;%u5d?_AptlwY=SRvo;aZq3a~
zY-QN1#CX+&{(!+!EUY-roMF84LC2sX$2DMBFthwBdZm=B5~qTd5NS9wkO9?@m*-D2
zaDDng&6FoW0TqwFS1Of7Ds?mt?*(QwI)kgl&?hr@r=C2E9)ww3uo5tZwA75`{p0hd
zpRh(FOvq!P%FpkGYxSkEQ2Xht7tQlwHFMEZg_)qxH=d2}z-;KUh9-rK7!5uCl2ufj
ziX0Mxf|aPXM)yAGmUZh)Ua4Dl^#<4GWtU!Du8j=5i2f<$*&HmC0w%kJDvd9AvhTmp
zcDM5ph5hg&DuD7*gW~h;c|t}D7fwT^CfezV6OW@cex9^>Jq0Cu2v>w(r!Ck;P~u#f
zS^p;2J6S8&x$&p`0s`_QtSTDZ#0IjuI=g2&MOB8&muE4Yg$(@u+6go`5_EcZU%c?6
zKqc)tChRct4p!oj3&4!~H(|nc1FwLX(MF3Gi<vGl|A_JjU&@M&35khR`ZJuIltIq3
zI-Z~{qL1}Ny2k6ls->!O2OtdHD6lTiS*q*6(Ov<0mxo>LKD~ft77PJn&?m-KM~obK
z*Ea6$r%(E9Rpg?G0;Q}hZ?!xXJx(&y$)|XZmBBFbD_CjTzC(w>Lxy<qiOG`vrFZr7
z(PdC+#HdmCii%DjIns>`322MM8R2z4jiKO?T-zOyOB1&a$c{@xBi^BF*Wn{aR-URU
zur>`%Ogu%wOGXjXoqS2;ej<Sr17B~3Fljhr4mk47P%uo*NrcqHAnvW%_&Ezs=s+;z
z^s^}Con2jrO`6mfG{n^sqyZ)35WOBe8^UgQu_ZkO|6~!0H?lnum((VBD@u6t4~1h*
zjEscjhwDU0D_VHAcpnWKp^fV2P(Rhs7uLs$!*IeDBlsx)b#!#bXz!o#Km6x>>q@2A
z+v|Jz?reA{uA_|1%%w~FB7g#>!Ik3?=~<+!z@Iy7`qqnu(5qFj)?-`SaC!NXE3Lix
zY$&+CoQl05+nNrf*PLTBc@-eQS*I>~{BV!r@dO7C;VTkyX(I{L?gp_DRW&pIU}mf*
zO{l!{=mX_UdUp0Dq?Pmx)EE@qGH3gopH<GPbFFgMp*uU8o=DHi`XFR;OX73aPJk!?
zp}`!g4QMMdKM>8MW#gA9S`ch21vCNH^AL|CaQF)lP!td6)H22^D<66EXb#a!WD9T)
zhk@IYgYxYahYf41cPuO-;@0IfVmc$$2Y@KW(rCyM&ie(-Vpt-Dn=Z$lW5Rf@sG-=1
z^x;N<qutk7#S9fP337g^u3ZmTS=?J<C_`5$W?#q<Q>Wgp(#j2-f5KWp&sM#7{prc7
zb;g&J#GtN#X&|NrbPxGi<q(o<+W`U`UEfEM!9SOziW(|93fU6J`8y-IVsAW0+NAhW
z%xH+NiZ{ucB1}R&T%;v1*%mJfQEo;Z+803{iLoiH@$R#OS@>2+2w_t{jg955*36(9
z$1rFKn!-?f4=^0A8gUPJjR?sd2(l^i2E7zVTX^rHFPMyqN;kwXd3$=yA(}OUrVUf?
zhh!(myfO1<P(=uy5D|v|Hd(eT6R2_XFF5noPV`-MsB3y1d~g?rRG@31OhVT2E~UA~
z=xqG<7PsvU;g$@r{$L$jR_*piO#y{K1U$)}`a5F&{hhCCqu6TTEhNB)2B=?6t&JR@
z6`zQ#5UG;=QSIA?9N5V1hkQq~Uz}JItke<D-2XDl^g5*jL*}-`slo_|G%Z#Ua%r(^
zL~gXryWr!8579gcY8upm;w=)a5!gL=up5pZAoT%T77->~Z7f7PDeNXV7D9tAfq^em
zE`umIh727VpzWbSeX{uMyOX)U&>;$mQ^$@Sso5mJ*BGd}$2-Y_A!mWEW4$cAA=ND(
z19&(SsDktTz+R1-6>OlsGWQ@z@(~;@)cZ?7E%1JYaC-Z%A8Vj8ay&Ty-DPC%9~nF&
z0oJsF^~-*(3&8zEkqO7Ji0K99_{5gb+HA_I02S*4-y_<q^RFG%$GDtJzPT+cvr2S&
zBzE_#w9cJ6SwZ<ssed|y=UcO<z<HwYw0sI#p<g9Q8buAF28HhcnpY$PhIOv*GC=K6
zyMbGTGW0M7ld*%9dzn&kBHI#}SO6t7&uJ(D#3BR&N|MXW92Wwvw6yf2rt3Q8vxI<Q
zUc881Qc!U+Q`NO{G5@Ux{vo;V;@WW6)6-X?dnKILm3Vi(;p(>-cRjsh8Vwf?oAa)R
ziIv0aZ+(%X5CVDtT-kf@V5egWE91;AQ+X#OocxgC??Mm|wjF5Es+J$daVej%ROVMo
znHJ|(y@)hd2kQX`422FPMqv@LRUo3AaRJOi2{pV?HH$PZio(H-T!5qXanK&36Fe%v
zb-78m2WuyM<wXk@V?e<@L}zG=d&R|ryi@N$R82xIOu#zZmnu#zF!Gs4dW_%!C@rla
zIv^hT{fH4X8Ox@DL+-?k4z?Jl1Sg9u^#ofW!ulQOI0&0#y&l91VjJ)pOK+F7_9p((
z?Tzj|&#&xbxjM}Q<-K?=LLE(+&D45uu5EO_J*zPmaUFRh$gk4U(<=^Gk2HH*IJUNq
zo1*rr40x=esqjnkYcrzogY{km*c=*h;-zEfAtxtnjgtCT3-GTbL~hr~ieFQG8!Tn*
z3K>S&^kOPkjThBAPW33>PFG(MOm{ONzYiY=7hqR6)xUbpLJ>nDfg0;0ga@GVUMh0~
zkL^4R2q@~z*)3CNcNpQB;?b-^7uPU8spttaBc=dT@L~*~|30A89Y~rjWm&-4nfwIN
zD1812l14CGgLpv9dwlxz$t^Z{n4Zw}iZ%(5_9)e?#rSLM)B?ra=-F*wJHYP;Glid2
z<V)jzAQ)T`#e&%929_Z}PF-Kw{1x5aacVH=J%o||QTR|Mw4FC>V9s_(h6;V%2LwBE
zt4>eSSvzXX^6Agh(=R`hKV&uLEri1J7cWGjLdROly%VT~e$b@o2d|wc^PNB_w?JMl
z#T)uSFvXrOA9O!-3nzly3)7BnZ&`xVPJ*xS(xIbE2Sa6J_{ZISEaaz5xt04m|ICO3
z(@FQ#b0vd*e{c7<ss&_U7u^gT7pU?(${~gt`B_vA8(?0*`oA;zD4d3n;>n5>5E!(~
zC4?(il(gAI^_<zN&M7Z#;tTR|R=_A$FaogQBUDt^FNZ%8OOS#0naoIKgytcA`&@*w
zfR$JXiLAtrUP@NN_@I=_Ficzf3^bQyTx|yVQLMjkMuOU^RjZRzwyL0<RUr1?OHb!(
zcNE`m@1X5-C{Kkz7A{3_H9!(^TE5M0VabQzByAo?P}2`k*)5jZZ~_RkQKQywCo>DK
zK)7iE32!cP?IXSzfX?)&JR>?Q$fX~j?{)ow*fnt3f@q4|akqaSBPa~bP~po74FL!f
z#SS{xp{lBWv@8esY`bWZgeLtT(Hsb{j<?hoxK&Z6Y5)3eB^>dn98ba*P-08k)@HYE
z)rx8B#oWP@bI|9%0<fk|M2O`lD0vd;J4!tePPYHPWniuhCora8Vn})9m5cLp)?IdS
z5k8=(V<ro6GX+LcP2xcWU*8UB|IXg=%OzKPeFtBa*!a5*DXRa`9o;m7C<+iA&85Z#
zE#kVrZ0`QXV#4B9V<|F`#U2u08SqHZ6Cec@NLYaIZ1-P9>al^@aRH{VGcoshtOQ9w
zbH~qjgY@+ESEJpbz8>#XGVP(ST=<DY)w$vRZkgg$!e_+dRLiNMDO(9;Lp{g-=X<a@
z!YvT-q1eL1D#-}b*qn3r%h3S>#p36sV{bp2np9M{gek~we#cNa3t)_=L}3i_I1P)=
zMk#d^Pbs<a%{$CK9_Hr<fJTnr{&S1#@(Z9>j4s@v$}?pT8?_3#$R$l~bB^6SfG^7(
zEfXz08hQ~wArQyINJvTd^+W+cS%AoRuYf6;yd`s*ea`K*>&eAo-T{Cbe*omq$io1d
zhXA^WFUdQKC2!+2Ef&(2=S5j2@2ow+wO_Yx-7T!`6%-XI&G~`W=mojjN4K{$<t8b@
zeX@3x8FInE3sw{jqL{@1<1@XsBGq6-K;9eLsyeDVM!MH8L`hDzKp?Z??fw0tBq7A-
z1fE=_7C70XX#%B`NaRT35Ywgs9T1{~ZZPtckDYQ~nwWKQE%6fwD3nKqFS8Q7+S|N2
zMem}iX+9=y3K3M;Q1O3>2GrvG65&QVXwV?zC<~t=$1<LXt#a(OYe%p{jzRkmH6>Io
zri_OG2OBnWVjmFxa#y^f?8|AsMPGBJHZU<inHB>g#pX#+I->rokD8KtgSeBdVwqUv
zR4|0}0m$d7$%FgZ*@xG?2o-_><{J4n>zpiOM#n;1q_li92VD~N-{#xR%ahZwn)BtY
z)=^!G-^(+f!~?K<8L=`a%y8H4-K9iH1|Y=ARScbfX{fxsCme~L#?;xrVIPQW<p|1{
zMGIbEoKmSq3#cEc*4Z!pKBd2St3aHbyvu2JT(kNg1k=ifAJmsgy3Ics+t#9JPY$h1
zG3I^|-rkz!cKHt|+$it5qkOXsgCr$B>TgN_px*NB{zPIWrLf4rw!*K3W=-Pz_wUOL
z75V)hsN;ONac<u}rAKXV7!|4?;DxRU1F-PiD^mS&)!e~tNad*t_l15FZDk}NHRW-%
zM2XWQl1@g*dO?O6o$p^bX6#rSw9aBBn&}Cl;T6G?c%6m}34sA?)j+9NBG-qb6t)9!
zvG-t0uTHux`d3Ojj&e2~pxB^ZR3r!BAZ6rFt&g}FT(^WA+TazysovIH9d9e|!tAdI
zi-hh{mPO}Cg_9^BGqYeyxlWmTZgcIMjf{v}q!)2bqPN!BaGRJxwgLVZJCNE)Nd>>C
zpw<=qICH=$znbi*sLa1!?7eHiCN3YuJv2&U#>XP*F@`qZzP9}M{(VAI!r8Nf=xC)W
zCgH)vlo%A#B)57cb|9?bM~`{gdoP^PuP=vpgze0PBK}qUyLpG;mQ&L~0>oN)F)e_G
zU{`#+x_H9l1G5BhdT5~5DFhzsTiO0Qhl&y59}E*!qPDhn-cM}-&@y04olleiKv%DK
z0~w>C$QivgO*`u4*rcNQq$QH|8I2CsJX)bYvVHn)17)y~jUph}(-jYUm~?KH0WoNO
za)X#NFwtMV?m)i1_~AHc(lKUagg@|poL4Z_A2})^8yN)SnWBHKsw#MYUy@slDqwxR
zjM=gxyP59Ez8y?r0B)8nS&7AmtLuZbv`f#@cLL;L;I2w>?tAUpHMbAfUp#*<tbYt;
zTrclT+?kmu5V^lb7*S#=VxCVMqD(dICb>Ax{n6S9keK(buY2LBbfiOEV;O;(Q1q^%
zBHdH6H6ZTqUSfEYh<S%JF&V<v+wLp9A>Ghjgm!!eYd9?*<IbPIeqF_1Ne|KvB841-
z1*cwvJ+AD0dVvwJcEBX(Rmsyry{W5+kHa)HPGM%nBZc^1=p8U-t0Qy-Ps7yX%&Aju
zMXZ6@Yqj6!<<eTi1n2)+HMw2y+WA|Z?h0i+h8nHNaebAIM4Wdic1t2~Ns#9(3~iq}
z#)rr5O!FC#_D4Fss=-Z*+DC+A9QBTH$N1nVaTj?9&^Lg=QT?0w;xb=STI&2@MK*EZ
z6{Q80ub`4eqXlU8(%pixhBsTr)dEU6dg_!woQZEHtq*k}lOQfU0kscP5%uH@B0W@)
zDl6}{5nbb>NAf&%%6Yf!VzxC?w5G0Bab~BD3d&$kk5SaJlN2hgrKDHzceKc{n`<pZ
z`NPN0T@jjPBesy(=0^&~eBJX4ZvE6aXTUtF{Bc1k5Y3`Wb7gkDY{cThN+lqn>T1%-
z3F~@jEN_;5xUNDaHtQmyX99{C(8Jd%%_de{QAFSe(^tP0+l*I5(>U-)j-5(=)0Z<x
zV$oZ$Kxkp2ET@WWNn)Tj+16BXNdkE8D{~L02zgCcE=tSw3lfkn@*@^@oF-VizzOLV
zK)cK*my;9(9>)~e-|z@)Y6QB7JiyPp5rUW20}M%T-aMl`op0a1DNdO17iF-@md_>O
z^-e7<G^@Ul7huvHlXJq9cldBdiis?OK;f#X_x}W{RzmB15qB+wD9mei0>*E!vDQZ8
z1&(NXeQkF-fOpi}Kv72pbt2t=Liv^2+gvvs%QJX-H=Rmbp&li52!snVRrF+_0VY9p
zfF+`)Aeeycpj$s<Ts5d2Pl#778aS#sx~{O)3xpaf6||t9P1}uS#0ah!rUX1<NPU(e
zj!|zJbLzU#z1E2-j(A(sH=+A7GtK=Nox69hHHv&T-BI9ugzTYoqLyuR%bPm;W_iYN
z#Yr!nwQ%e~5hf>BwBpX=R7xM_?D%b@&cPfk{4<!Lz541UN{|5q=FyC<|GrGF%C!|0
z7|m~9d4m>tk;nSBqC#)$)*rnNpM^!k%axXbdx5kxDm~i+H5hr<iXcukf4QnC{sa$J
z(Xut@MT6T~2GIKZR9bOOLDoqr5?Qq_{2N!F4=*wI7`lCGyw(3BQQMSCfp#~feTJq7
zKD9d@<DK`GLS-A@U9@6!LI9-#KJ8^4OiQH~?^wjoCrhX$UNu8|(*OPFQKLeLlCuo2
zZawmuw$RooZpZImV0lTY4Qm;U2tdw3K;@Mwqb`(cJ<7K(vHC_qf7v7LJ9RQ@w_?=D
zkwkIhLE#Y*@3}^NClc3Vxy^+-h!_VH9!Ay)114k-L_^_hN&6-A@<79t=W=MziEI}w
zZ*TXxTGy|8cb(tATJn*d^%-YjZ;{sFuE90*V#I0ze&@!?n@=}yqk8_E+f?io#2V9J
zB`$I6xcpwUZXW;?i07A0Jq^b9_Yp-R?PMoL0|bCYh@}}9#Rgyj4Y}T&Ibzh*_S-3Y
z-Zu~q1=3_{+>rd?)XVhD0=N_QfD=pBvA?9C+SbMztP%)qkZ>MI&fL^)wo{cjX`<+5
z5=iNQmqnCGp5?#^Yr7#pwRw6@$Hd__ZO!D#0r)e<fEppb-1)l0VCV=UDntJogFn*K
zES@(+V^XWT)T~mrw?|p9)7@@lY30&;U+FbVkc1Co#smqUGX<(tYdPN(AyX7Zw%9F4
z3VCIJ!>t*2M=JsvwZtWhO}9s9FxMucMt`p9$?;bGrEPA^{5^@9k<rV$s2Z~b2to1G
zm|z@!k@;}3idg@w_Gwftt4%+WeMFJla3d={-SBrdi2UFgd#;-*_xu(AF^mdVL)LFR
zRDeFCTvR?nY0^*CMwv#1xcm9@=R5cAg*@{#rt<|A5*#%7-XiJ};4c~#-NSY&0Xf-o
z@O%O=SpQkI2L6GbJEThKRF)0uzX6@+3$|nVa!PGmsUbNwGHOenJuuXg=i{cXuvxe6
za-w50<pW_NA|gV!ad?Ib6d7{vtMT@+`DPBS{8(d^Y*+LFKhY*PjTCoTv#$bUc^Knb
zy2E3jtZd<xRN>uyV1<pQ;{QljWEI>`+x^o)m*H;XtkPpL0j~T1EDcxcNO281IPry|
zy1GeLk3<ypQI8@Z3~i0|mJv@}S3I?e1@Ss+ee;>*={?ZY+|wC+K|Ufum>zaO;mhvI
zeG#wY#yfoYC8@)`WM%cmm-fb!*U?sQYIv?$Z5(49L)F#$(3!zVPz4*D_+3Yw#xa`t
z2Un+y+)+zQOBT}2K_c?EI)RN=A*qN0=P`Ot%OKPP30)kH2UL`>ojk|FMJ`W<4``El
zGL;4x2x=S70%C5!$6~0psUrP0t-@fv9#reLBdQ6~h&$Wlbor{y995pI3{p^mnLImp
z9fl4R5IN*oVi|bJLu7Ws_lA}Cd|Xfvph+QFMxY?9?ck7{X6drQay6PeA@@tQiR~^c
zn+bb4AFhdZ9w8~vtuTY$idrwwKAztImBxxKPbzD)Eq4jeVzWX)!GPF&(bA!lutDy{
zhJqs?7O<<o7PR;>WOi8$OyJx1XNpJ_S)E<wjM-D)!2fQneRG<RON)08Sav{Xo5wMk
zrzYAhB})mGm?OW%&23eD0ngn?^*e~2frJ>ov+F2-eev%}-{JtGfh`x1TB%EIA7F(e
zIA=zw72xlfFcx?`k+y@hw1Se-J^{h9gHz0<D(C_JsM7tNQ!A+c8uLxvFC^1`hTZN?
z)#iv{gg^>J8{gL$Hgk#DRZ4FxnWP6!ISI-k1ZoV`KpHhXUYeJa6DB!zmZ1RbOl~(V
z@k`qnk=s7bI!3Oluf-Flh0^W5P+@`k7=)x=hE{)gc+di#u%e${=WB@x3CkMlp65hU
zbmh7{Nl!1ON*<0zYnESg5=Is{Z<1Y!zf9=b#-+D&OHN6*f1EM(l52C+u<R`Ari$mo
zN**>ZE0_I$b1nN4r)p?6Pdj?>V1~hfrT*3dG|akrVKD{O9DHF0LBxaIVa)9;u<28%
z30+Iz$dRs3Rp{(#%Ah!5y(>poP;Iz2SB)6842qAso#}CKB{TtnpVUoE@<u&=Z1IGI
zt*#!gJBT9BR$1r72R8>`Y~yxw=v>Wj*kigLmHtkS-3jzmCl-$eRfQ|&CAfN0uUvX#
zHm@`MSfADk)AtJS1C1tW*G}bKscu~Tsen9cb~G6~Uys&I+a;ZL;i%A0L(q`Dg((RQ
zAtG;-)(F<b0+^}>F{g~GJ4|Q(Y(0CjirBs0d+J^g5wZRSiJ}v7IA|Ery#Ypu!5wsX
zpomm&`w>z^iJ(mGIAW3P_9e$?!R6*-)|50d{F@z!9Y}ERD0+Z&`ccrroKO{`h55p4
z$-y1J>dqS62uJTr!pZ(bTIKf(2^LG7Qps$9=f)Y~zd+i=j<bk}(L$Lol$Qkm%7$9L
zlIQEZkC5?+HI0W*Nr3s65#^!Wk@w`5-yp-Fb*-(hEj}x3g+vz!Q_1!>)F*Gj5`}ga
z%JmIS1d0ZyW`exMzJ7MB<=vyu<%~CR`R@Ujp#?w2>Y82j%@Fe7p#rEEKUP4%03<<t
zjITH}2n{X%8X0tjxhrfSIce?&E^O}SkY#r#cb`gZWSMSrY)y;Rj8q$Sb#*Zl%Y>+D
z%C^ab0OQ(dZrBst8j$DF1|J7KbSm~}d=_ppZlbbvoLR1Gdm<Z5UwTAlBB+`(3at8H
z#+P19+*<60Xn}d$SLOW=@|4e;L|H)VvaXCaa9VY8W$x7l1`hmfj>}a5i!oy&P5l&P
zd&+i}@_v2drPENqjwF(+m$J>13RA3xTzIPeA41XoZf9D>llq}jgI-iMa7MZf8pP0s
z2a}h5k<Gw#2d56I_LVGVU;Q5X7^9_2|89l#vQjp-ksL4}Xx&ws>o~fBCu4d@Ts40i
zKiHu28JffJYH;6Dgm{Z%&!2Lezhuerrp;-I+Hikz_RgC&r3`8PaRXPvq)4}$<1ly$
z5WhVFZbaW!cfaR?>t;8rZ)9eYC|AP-O><CoVd&U45CQH<<&~|3T$5US8#T>O`g(h(
zsi@qjD!PXy4;1K4nB2C_5nqP=A;3X^;>ssais{{T;FNvF;nw^dAtY{ysUId327f_^
zA}J8wAWTc%0>u((rlD0k%k4jg(XyeE0SZpz2AM?NzvS=Vr?JqbD+qw_weY!-;TRuF
zmn25(`O)Q1&*M-N_NEzu5<^MCkB5LEb~yrTer3?!+w$kzK9%o#1!jd_ofq+JE#P3W
zYa=4faXBg404a}R3QIO0&a$DZ^};iEJf&j8@8!@QbLPnNQDKzX-6^&xiI^WtM_e>@
zYD;}jG5M3{wfx!&on-xsARMf$L*Q9Tk$87iwAg$pE^R5&;w@Vy0dQ7+lkz_QY*B-{
zIdFkXhH9s*p?b%fv<ZZH%X}_4UW<<QJVpo3E<+(v4w~f5?wgrHGeaDL62R5m1@M<d
zC8y?C_(+TcAml-RfRkZZHOC0+@0?EWEqy3wLi`q9eaOTu;|{vsMieRyH7&A@VOclR
zQf~dUUGADanF*<9A+s%CG_I#7DX7+3H}8yj(YZ>WZ(HZ3KY7wSHFE%?A*sZuBh~?Q
zf$`sG8vZ{?Ri&FsgT|0Fw<VFY(2=8{12B7D@{R;S%TBlu0tgxgQw>+PqKgOvO%6u|
zSiFiBifh;l!9?7xa9*B3U5is%5(^yT3vzdQe_Pm;)je$Y>z%PBB`e~VIk_;BY9J?n
zIcZ#4e`hbJaFPAdPd0cB%yLCjOgsicH_UU#Kg7}7ZuE^^fpVD}SFaY4jaW#jg5Xy;
zA5^LVSJD{92%Ar}0pTW5ADM75$#|~uD25_<4Xbrfw>}k7LFtOpf&TK(*G>>ec#ASZ
zxUbY9t>U~}4r_f|8uH;Gi&bECt(hnnG!0X<La{(Ak?S$pD+wtsKTd6uQce3)fc&Sp
z!J*GKdEmiO=@tm0>9FysSyY~Y2N;Z+%Pa{tYXZXN{K(8Y&$#dMp{60v3Kt$RV5F(@
zYWh<&E~`~0E}eWQ`lWTzG@GIpG-4_auP4x|(V&Q!iBD#TY4zI6zjnTM@0C`CYYRMw
zvG=e(pev&PK=LMG8+%$`@kOuNif_8BZEg(oPCb{>=%Z^D8#_?4A+{pyK<TGXg{y~k
zB7jn<3k5$TNg@|9q#Y|5c0WG^8KPbLi7l1l=+TGp0w{U^{(|u*r#3D0UfYb`wBw>6
z2-RL!-+08|-@mC}gt=R;ZJe5h-m}!`foo<|o^c^~3;ibuao*-0)%x?!6nyZgX@5nX
zi4G$pBbViAwpYWGYo@>W2R)|(6+@kn>G9=jmj6%sqEmhZ{*bDW`8>CPj$?RKm{H#7
zJwo&3-{Vx!A!rMQeTA~!%h9ej;X{v9^qn&mV6AgoRpPTo>tfru8%_3Q;gExi3VQbM
z{}{1cZi&Q3E9wb)S4msT9kWMV`Pk<AnBMckuSSk+pC_!~^I8glXoZfBf`TX|<oQ%F
zpRq>4{^eC~d+UrfHZae?b_2HIB#)Gg%1xA@;4ngn0(C%LY(>PXyx|+Sod}`_QYDfm
zFp|mYwjkre%JIRfRERc!_{(J>N(anh`mg<R?r6xT@DTe1{0`8b_&ARHBR3FJ!pzkn
z4+3UegRIte#kiqETZhRhu_4MSq?v&(11Z&U%M<T|lX<?DPs@G%0c;=jfzV4J@qxWE
z?JSd1JtHA{`(Fx|Ehf95*z51pSxVJa<8<P;bu*CI36g2yfSf0f9s%=QF|BU?{i$f{
zmc_X(-@h~#2Bfc6Y);ubtiwu!Ir5`BBqewT+ncqObj~x2jLUub%gq1AYvZ#~k)6yg
z8lP`JWS_kJ<1QU`PJ1<dPOH_$jamVjn|AE@oH=HE2fJ0Rg7WG<PXJq+>|&$h;-Xzv
z_hVz@&sQ0_c5!Cb`lp>YZ!Y5=>f+nVQ4LR4j@s4U=9@GnAQUB&%=n8PqqJi9$<4G5
zW<(#5A6e65_nhr=t|M&YP5}~uZ6z>*C8}dwnmeJX9X@oZx8RStPeDOJ=sWoN>|YMG
z*G_o#Fs0@#P_Om4?;=aj9g%aep<-9v*~z#@BX8g6mPDQWH32^;FyG~+kD9O6Y;)}M
zE#2PMHaIjiG{b7##Iotd?6gSZGy`sPP~<aW{x{g`_4L{90M}LL+atmI{N+nU^~z;O
zX%dJf!ugKTs&_lv4HR~(l^avj-gwuUfu|@Py!~whlgteVRPQ7n#5^~t?R&n~FwM%j
z%a^KsyZbeE`Q@l6lgGIo@8;qv^S4?r@r`BazghqwUY98UML{HUvc8H?9SB<e`t`q-
zgd1ZIAtuxBqnSw@)Bbspma4g#nHZ)jsBnlG4Q@RhI+RUDbARNLy@mP2rtd%51}Ji}
z@VjG&g0K(ilA4LF6rJHNEMB07DczDXqFoz9ySndL=i(CEBMfV&<Z=%My;VRScYa6|
zGC)^wI>R<Dc+q76oHMQw^P~>Ntq)xvD<F=moS=!iRi1ztC1(G9o7fT7)=3)Q%zke1
zo@;V@J+GE#b@SGN?l&Tn51X@;kej({?_SILFG()jKb6_q+b@38x=+>m5qgUbTQ6;t
zIU11fLX+JOAdzwC3y4lAvCux=nrD3Dn*Y-`cu|75&<9iYyteLRKDsx~Zg^bjqRRR3
zA-AGxP1id(@16y*UTA5#fK~)W*NX}<$XHx_7C{Q=vydGkM=#%C&w?^Gw*w#%YAU~I
z?{`YMov3xh+{h@TepQV=8%RLymOGxnd`Z+u5+cph6CAX#EU;I*nCk^{p>g_Z_OCM=
z%Ur6SOzz{@<v++>T^T;635`g^r%Bv*Wxhu3hB7yj3hY*1{T8qSC@$b*3`#?I$5=#<
z?Q=UjXz~p78xjK3<~N?Uxw|Y9UFC=9jK`0`70Y9~zP^Y`W`yRaQKLp3?_s+f@nG*x
z3U?<8fg9e}9|XjN8qs9&kekE7F{4+cwgTi7vMRFuPzE<~S2+P3uWFm4B&>GnF{A$a
zbgOq!KmbPA>5m?jf8B&}Bg0w1hSGhNJt;F{*t<S|Y7%55`Qa@s%qV6egvbs&(poY%
z<=hVB5IMFJ8FzA2A2jm#+=?Fx85`o)>|VC?;mb4qTrcl-*E{xCmIkWT<;x>~+GpDv
z;=a9Q%d(9+g#&L@+b;W7W;<@ykEG(3nq%1)Z{1q<<7PyJ{>`l1++J~JKDo9_9@<-W
zyS=E7_JXyR-#(`;9=XJSn0qxVV(HODR!(<J4jR8P@8{3&T27bqs@<%+ZYtcD9jU&1
z*Dh-JY-&~XOJZmhoi%${evVa-06l|g6PEg<_(hNH76hnVEShKnAoV<BB3f2t2EiHG
z(#@g0!Mn0->>5u$Wx`ZU{{q(_;#&#6!N{P}Fx?$fZqR@U3SUSVRICNHQTwIQCn{<j
zS|Zji3q9Suc_Aq&7B^2^kM$q6VD8+?m`aAPrgEnt+>N-|kp0B`>J5G~0ov$*<hWC-
zD|0TxOm$%**Kvn$(Rt0Y1&7OiQdTa1A{Jf1HkLnoIoN-s%cy$o|CN7jO80EiYUtvC
z>FsJ2$uiYWn&$6!)^*YXeZl>9#P8yZXROxLF&QV+wsiL7r@yr|)f>;x{XeSC1FYx$
z{r}dn*Rf|fvXYGKWMmdXM$$AYGg+l&g%BbVhlC_abt-KMNfa`UQmK%H7A-2}|G0C0
z-*f)g^*h(^d!0jlKJWKy-1l?cr%Y+Z?xPYonzo)JBwmDTncW`UD6X*LL0`PM8^oMc
zu;_v*jU||iFV=;H&1%!KS_&mjHOJQ_2W$|W1Kz|JM)Ns;0ljbNZ|}T!uy<-cyL#5f
zHeNmHclWpIexF{qi>g^mxDL66;VdL;7`^#CF!dAt=l;>B&+g@M`?oEfcC$yOp;^ka
zXUEMkAzpIH{rp)AV3~^b4`t<oODh55dxd9Ob(a>1TSclanXNnar%e@MOH9~1sr;mc
zrt9TZS_ldN|J^<N=}yI;pdK|oe`DI27D;c?JDcpZ511X(-#zhajOFrFqj$wRI7Gnh
z2+1=&T41`|!QP%?Z{eQ$l(g;>jLoT2(a_wkOywzlxs)90SNyz%;r5?NL&jwpmNqna
zmT#pVtlQ>Oi<-!dBaa?EI_LDT-7x;aR43>ozcZKVAQTeW`U|T3{T3<FoHOtcaj;mp
zmRw&tg`)>E;A_G(=bWDqnHkA#CN^fd6CifkJ3{Jum+w~8MAHrjg2jd8;JpmJ<H7};
z+*Ps+rGhFqCsfUBZd;y_Ilw(a4)FYI=X|`q_knAdU}z0uKpWR<UBD12hIT)`iAW##
ziE7Ii76AT=Sm=XzA{KV_a>&W1;z*+C14HpxQfs+&E6$NqOjwgno@Kn0)?Q!@jM@k#
z`y>VH9NNOka}#Qwl_0r!1b2qE4xDcM%p<ulVapC|0pdb8w9ee;>^wVqTsA6*>3KKR
zco+s_cB)n^Mx;RZ#9PGwT3nHA_rJBixBK22m*1*;Egw_eYh}f6A>8ZvW<G`cOpRyH
zo-Hzk5hH>#hl7U5%3|gVSQQ#WB<nn{{wI75XSgV8CRP&w=$*O3Hxmv+E*+HDJto?p
zhK(1WTI>2p039VeS_T*rYUTRyxd#Uip~-zmjBur<mZ2)r3}Z!7GklZ$;K3V=bRT$T
zLg)&J!lQX(-^Lg0*mU<QR_*J}Nke*89Rrev=Y?$k92Ecil#d%4JMM<x*O)q0<Ol40
z5z~s2gdZ;%NW>7J&0B>79PPn=5PSnG<G3OfRfH>IIFBl{2cH(oRccuj!S2R8Xn|()
zeMgQNBZm{#0r~<C^RVbZMa`C7M@*}{?jIO90Q3{o#o*=A9R^2>6Ly<a?>{v=sLAUc
z_CH(1HdI-h_%dr<@ZgGMLuQk@yX(4p?^Hue%k8v!OaNUG901)sc0f;5k46=hDyu@5
zVF+2GFktlPic1L@>%(TBFTEQX_4<OQ^^@d%v6I}Z6(WodnwWiaK71InxdKy>T)~L9
zAU&&#e6s&;DX1p_RCExG)I;pR&x3q#LI@5>s%TsxhYIJnVv@@qm<n<eMEyx8*Plv#
z1;+oqd-uNi`0>EA)ruI;p?*TP<u9QlVc((ySv<~#OZy)>TGT+8;v{N~l&*WgYj`hM
z4-z;NgydG{GSdYkNumqQ8_?_v8CWD;NdTc9VngW+<T9j2INWQJ0@3T}l0~y6*n^gO
zFG4&%ez(>I&4;1*zyg6x`^wOvvs6xkMMDg5BciLLto%;?GaOFuv13p3rEcWcyzaa$
zbqg`$F!R#G(|Bxoyk&<!$M43fSkJ)4tfWdtLojNzud?!-f4i@UkwNpfPVB4c^Mb)T
z)_VPNs`f}CnudnJpaSeuykdDAZgRUGM;NP}87Xae5KJ%<p#rC)7~g25h<3_WQGN9A
zp{Aaw5jt?++(z(%wu3$i3>`z(=6wHfQ9_c{OnO_bBPSZBJ;jkDJ9s^N0D>VR7x`;u
zA;V$x*(KI#U8m{(fs_&MnCz4|PDBakTeEt#31t<i1R}7|&`^j6iJK7P8yph4`qk0F
z=cy#66k$Kmg-By2nF+ireh+pfY9kZC5~5c3Fhrd*kc`^Giaf=p4*QUtCnPkX6`__F
z>zjaBT>9^5mx1j!ll}_RCB!wYrHB!L?YHTbqz4Q&KjHgwEvqq^-7N7uBE>{3A<#a(
z0LM5JB$yu>KDC~%x6bd~x@qB?5V7I<(UeKufL6E0_xtO}jJ~4F1F1rV@b2^H^nkzp
zl#|F;Hw?D^y3gCRaQDdS<ceTd_Y;E#CXE2V^T@iQ<C|XYk+L>a?qA)^7EAn>sFFED
zJW>;4CgJZXk$zl?=@Z>;3kgV^7q=^We<DK0>RAqxc8C7CX?<~vriDq(jpwlET(TST
zL}SupqkImo`};hEL;}KvCm`5ELFly5@PJ8j{r(jfgsEcQym{iJ1o4Yjeo^@T{XG^w
zM05@b30@>jhyYpgC8_2PYyDF5!zVIOK*n~oR!??n+4Zgn30PU8;ir8Q02)f0MwL<<
zZ8rY{<+VI!uJZlJh$~*5t69o@&lb6Y-;v@8lhi|YiCwN0#UE>p&6rEXwkc7-oXlO$
zK2BL)dce!8FKhI2XsFw_?46!eG3bRJqf92jce=L@5I-p>tZ1N;%CzJi_xrj`c-}WP
zHPctkQ=eakUVTATjMb<->Gxq;6VK`Ur&ts(A7Y?}f}pj-!{`(Lwt|*Zo6|f1*dd_E
zQb;uIKGA+Gm%@9*eDsz<xHdVU8N<N5mLpIjD6ODPO(>tnjvYIH*)lEO5zFmTK!CWD
z(T>cXGv{!-9_ol7xU?mm&7NDA!bx&KITl>*n3#}rh&n>Ja?U?Vl*C{VPc;lZ2WiD$
zV;;pYvqxP(sS-)kvR&1h&nCIj+<GgZMWhsfIY|jL4lH}MX8n3L)Lz=}KhK^O#T(!L
z)BLp~Fm=$|op5)b$uw^vV8AAJ?%dh0uOCA0U7SK9mgc;1+c$1X!~&WF+A;v>48g(a
zf|j$aaZtMR?AiaKuViKRgF8cGVG)_zNp!Adf8LyLZEjZ42>A(E{R1}e>NRVYgQ5c8
z1)<hOzJ@mqwS%>-ag!!ZBtlhky;<f+m(9p-*1hq-B6L|A>Yt%Ezw8M%sg+oB9barC
zOGccd{bX@QueRBMa*yv)|Kwqv>SXFBo@T#c>3TndS|p3*?$ZVakVAo-^YB<h|IeGC
z_McthS#U`G<994HZou_`m2zB(iaT&%z11k&@n}GK^P@OC(@l5PoK{W(QdH`y?vZJv
zr{}xsDnlCJB!-SVT6C^gb!0(q`+PpnkRb?iBVObb*5puTQ2B8k8CZ4NW^5T+f79D{
z^wpT7g+*NsHAGtM7_ID?cKyQ@H38M}aUf(c;jD*-GbkGcEB%fFjk!bn;jJVj5*#ji
z%lNTl@pnF8V%J@cBeg{xcb68I^3-c9!ehUR&f%9h0o8=o&aLpAn6ITerrayK5g5SE
zVTJUiXoJ=obaq^nqoO8`YWjgT6Mz2L0&poQ?o8q&avW(Ir3gH3Jwh31d?@Pa-L0eE
zCa~=pTsnNky{5N*l+2do9U2#=Iq03jktV(zQb^HYoS9&=*~c^pQx07MR~h*<nV<T%
zWoH7Z9XxWR@W$OkK0X7`;n9XakKOsx+2Ry;u#_rn7QoKepR8wDtB3bWwm~<@BvW1W
zyt_x{{_nbb+ql10u7NR#lEkq8BaJ4L5ITTdW4%*fzkI>KxLf#3uAG&j%f`|AgFxV?
zEXDFLUr+DhFU4mcej^O$z?xyigP?Ik@vdrDu|9VbdCfKeiQEm#Go!W<;<M<XZBQxM
zH@qcboheO0TK`^I+O}&~U*+!`Uq>9L9VC{)U-{-M>!LPv&}=0c7>4jjt)PN*Eg5*p
znMj#`hXW8<=GD#7ycHe-OvwTMRSB1A`?0Z0S^0JamCZDa@dir2#^ze!HDfqNP?G8I
zP&J6Cmo?g<Qzuv4`uq#&$N2T;a1>(3VM-2dC{=1f>76-8X4E;Q#6U(R{2j>~60hSr
zD@K!>K;dp>lT9p{yGY!z;8WObXK~ta!(&5=c)t?BL+a488*fSM9#;xWv|e84?+h#8
zf*y$E7n_x6ycGyOR!L5M`t*YRDFl%Hu3ppeB`ZibaoyQD{Y3zd;eNcj3mqo+wMUj&
za>?OChXi3|dY8-@c0QVv)uKIxP=w(wuw!w6Nk9_nyHz7(0tHDW9AfFFruj?-!p@Sw
zB+Xr=Q$8R#Yu~o*(_9C7Dd~+zyy3_qDQNNivD7;KqRUrAZP~%t1g;;ECnBQc4^pXp
zdK&xk363M42k3+tA7Y7#?lN-qe5ahMccb3yQ#aZ;nw2-{VGirMPJKu{2Izz~O|sK}
zh4CVKqpC7oFgk`COPmvOJJB&y>h*Ir2u#I4OUM7y)Z+yrZYKrX1JuhpdU}*w&HVlS
zW$*v^A-2trHexIwUHd-&Sf%B}wxjN$av*YHI>VNE*)X^0Y0xPw8l6)A$ESHa--&&5
zdUD>IH@B%^rJ<(lMm#tuEw!kq2vRiMPx*OQI2{T1{^D%><5JQ}z_abV+9+wuHs<k9
z?8lr}t#~ZRVTxT7wk2$FEb`p)_I|(rX=*GT+nPp4nT4bsGmXsn@Bru#oZ!4nC)R8y
zh{vhY2At_as)<p63v4ROgmgRXq|~|mKj`RQ*igx%cI~-{Q8h2ua!@Y))ay!Wek8if
zrmb4#mVJ7`G4PJ!-P6-J`tToaTJfw4QP6W^FhJZ1_>TF?Cv|KE9s{4`Y~A{*nfmId
zz0rDfJa7AXkm-V3gLz`YT+`Mt-GMP0pDhoQ%{O91JCrv<c!Rt}?^6YX;6iKGs#QO>
zKDjh14F|!Dy=D)vvY9xf$$7`C!3+jrExV5&z_|tKnLxS^$kV5wM+>KX9Ly@>d2=BX
z3_ID-uqSm6zAdyXP3vJkrL?_p>(=?IDqHFTcr4M3bJ4*)z?mj*`~A@(-$jy9$>}6T
zqwEj-fedn-;`4$rlXq1@PW*ymMZ+Z=t+aDAH1y^+WzS~H5xV(n3}rBV3^K48oRY4J
z!@>6CVdQqPJD7oHg9uWw5|Op_@y2UkG7UHn2Ov%&n6^zD-3DHk6GMhkl+$kFln~V>
zM@aFPDv$+^6p<rGons0{o#}vy9_vS!!p>nX{98Mu4OVSx3(*CDsF7CrSg4w!O+9<`
zkO^3#!5&BNAcYR`fJ}};+PhaJDLOr5&Np%eSa)+RECO`2?78X<k7iFa=Gy;2Tli_~
zYw!>$j6hi40+U33th8Ob0|b)FqtB>OiwBubfc0;!XHI+1@{{Zgu3N5tc}9ta72p0+
zP9lMEYkp{Q=rL2hO6!ous->l+JD=AJ0r-xO_cpBFy$&**9pb+DP^C^m#|Gn*DvDd+
z)J06KuR1BZy8PqpsGP|1d*@n=>GU7W;ef-PM57f&X8ko?NAv;*WyKmT35(A~!hkwH
zYU-p5MYsxa@6pa6YMuv@V>!|JsmZVl&sL}R%s>}RJwfgQPj~Bh!|3M=5C?(roI$Mz
zZv{&mW0d{rZZx+d21ZQ4t{*;_ncjKESiNCgqLejr5t!pgP|2_KU8KZW$UK@`?>F%?
zP${)<-+n5;j@T!?Q=2R;Z!-?GNfW1}WtYp#%2qu(J)AfhnZgjiVv65OtFbfZ#a!8A
z8G5Vg^I(=EZQ)8P6T`+F^xawa$Q2gq>(6F*RlCatpFL}=U#}kx>W>~EaT1-b02MAS
z33z!ac|j1lJ6i&zXl>M^;Gj7p>M%wn_)O@sA-G7SB<0<gV8CuL(6rT1Xq+!ZW?RC}
za*IxdQNfqh6^)6O-gGKZfHzk~KIj}&ci^D!2{Cy`A&&>1R$6q(U7^pSw<LS0xg*7N
zB^j2<df&@l%oA-ZziMG{DH^4*<hlMN+7}uYaz-Y+gjQR;Ad(EiSws@xK;ouA>3jys
z_dnR>+O<HnIn+?`xDS(VovB!#!mt>Txr*W$eC0NX+)VmB3OmfDOg2HtEX%Y`ifR_J
zKwm!z70<IG`-;bt<fB%;@T{Ho3$1-4Gw$eCCQ(Ic-M{4M*pQ<_D=e7?wi2_SOc!PA
zViec`Ph)(eN1!3!pC)%DbAvfr@p0o=7Ezi-W+F}4g;fupoE=l_nwNjDGN_3v<A?6R
zZHpU`)XU=~3P3+}naA-HNSevIb%Oy^kPOWpG;>`WqqM;K@CV$E{r>o)L0wqur?C7B
zew#v?sOEqEY*Dpt3)xER`uuzPH;g{hc`k>JTRRJEV(hH4&wpgOxhgg@H_uD@?AT9J
z$H!rH0UQAO75AD~%lyOitun1UlOlcgtiHQLwu-x#SI+c2&9=5$bQ>6vb?sxpsLn7k
zNty+V*cvPTG9477)adz9!Y*iVnsb$+U{HNcyJOE4uKxJClP*mqwlDY_p#txzeYrQ;
z^{`%Dj@-9O&{OV2jv)fZDbYtdIk+wJ@6VwIR3?c7us%GHN9V@1@72rGIV)A6z?R_z
zpqHzVA-*Sk2V8;DS9%-nDi#bAcYOW)a`+`OVFJ^MQQ!b7Nr_An&MY&su<I#Wk_I~~
zlZ*jw#LPsnrVMhCE@Ry~xvrQoE=m9Vi%9NO?y==gd8>yGh(H}fUsZ{%CYhfvImq-U
za4A$XGg&M0V}$?=R<{1waecCx1F{Mm=6q0=zVooodP`4E1Ko*&ox4YLWdjDt$ERuH
ztp7kiv(4#w_Lo@K7z99s)YD0JV28?2psHz$NHqS%i}n=s92MXVf`PCR`|VQW-rj*U
zP1Q8<q_**3y;E*)qUc3gefHcmU{}NGI3;lL>|=Z!UvX{e7#q7VhyYCPcE=5?KRu~D
zu6Pif(`ftK-)ev4O+py;K*B}&<z0ubkSzm$CM6>6!U1&TfSKaHDI6Wl{qTFcb|O86
zz7?rAFm<1#5p>zB(r>JBulZqn_i+zWqNcPBn42B@{GSN(MsI`5_KjHKaL~uc+@)e0
z|2D#a<jXv0u$Xt%9mxg4ISObFiK;UqfXBmrOPqA#VWV?xI#3$|MRYqdqjDp?X>M$E
z;$Qnqv-@HygYRKbYe|Ts<J9PC>Rd)dPc}2_&mx<oH#DNiq|Gu97Q|T3H5C8fj8hyI
zD|O_smum-5buV*UMV-huLErWP-AT`*-%3g-xAw?*2L6Vs#p?*|WdE0IZD`kN(IkBt
zG)Zi*+-~?&sPCWxNgI;0pBm<ij6ROL-`?(L{3J$1)Dx<NUS2e_ftf4!IXRVbv@t5J
zT`kkpo}t5Cb4=6gnIwyGorsi%21k^_GV_2`C5A_0T3O;p0l5mLhd@liFeKUB_!iZJ
zl_=;KFrh4825{q&$$YyV90s-uGDM70WRwo{KBk7{x0CbqGtL6z5CbCMlo%m#tE?aW
zB0c?(qNeB%B`T`3(e{f;1$hBm^M1U_ZpgKsXlH^04@>^CaWh;MasGE1W6^_!y47jK
zH#BpiH4|8b8%T6y<TxCrJK{2o>GtSmnyTv1MS6NR3pb9H_ucJZzV&V^P5&i1A3p2{
ziQzHo-kglL-ZC^eBqR;an01BT3=P5@8Zh^SjF5{lH*d;7klOGI7f2SKftglZJ}gqL
zzn(MpuTszN<t2)Xl60g#m8J2cZ@qE}@|^9zBwQ^J%oS)$IwhRV-YvtFlg5r8Z}s5y
zO#}8ZY^`^J**uGB#}zdn-a4Upkf3;jO{33qq4OO_TEE`Z1Ubk6jJg=T8D+|iO5hxR
zmKU?HUH$w2v;Z$ozy^R2i1$$COxJ+ad>cGOlzS5CFJUD6_cvylaHhc^ibMu9Q^1+1
zN3X-b<%$gG3C@7IsMBa&z}>i0Nsj@hJ`d)~_mZ9#EaY>Eee2e(rR}80D8bqYT*9p|
zH(Ezx|AfMjH;55y_7ksV{$`--;sB)9q!*Nfoc+Oi^;W*ddrwMoCIyBNmo8v`{X%xp
z^n(`;WU4SicM#d{1WV4rbjsO|+QMXHHz`r*Be?%0+o%poDLQ{%QXW*3CrrgPMnXOq
zzP^p|r>*0;d*J)tA-Mn?xE@y%y3D`CZY$t(Zj3lw5ts*luH4r7_eVcG@etM3YY`g%
zT^*2;2PWZooKRg5LTN^Z@3V}l?Hax9k0VfO1u9R*HMgSYAe>YDDc_QTjR8boGgZP=
zBmE~KPpwg}Q-cs>ar@b`1<kaLzR&a4DPL-J>)|TW4~{uDY2$kO^sHa@EjoPR*F~^}
zqKE3Rr`CBR6*sNvR1cC%J;vo$UtlYq`>vm+cekZX{jNiIik6+bp0kfzR@{msyDjj0
zi4ErWwrwrry*V~UfkdE(UWm2<?Dkjh$R~8W<?ar6&L{YQ1Gp#e<yX2ZirKj2<UqrN
zFQDyEZkQKai~L4U?@YT#R>(QBvtwTb2fO1hzkV_v*4XR%^}_(@6oedws4Qqp1Fjwp
zx?}S3i&4qfuWvv3cwUK&lrn@4m@kDq)?mrFZ?if40e+;8#a$k1XWX@AW4f!M-3l0y
zOk@1@Ck|8j*%Y%qdiF#h|4G#}W%{raep%lo$=O{F+64=CD<`3!v#S1f^ndW{-B$1J
z#?M`_xx#+GD5Grcx)7nLSl~T`iGaxYn7`#l?<D&k<d;61<`Lu0j<6=fLYHmu@yrm3
z9p}DWm_WOp*N2)?3c=hG7qA&J(mQq)DT-JoWRxJ!6$s%%SqP7Hy6ffE4xOvpk(!d6
zY@VCwrWG64>s#Q>ZWWt@VLvj5CI0kyk(pWZXLaWcRZUQk=rybIRL#|rn%`W4nh|Vm
zTeZzl=Kb8Gg&kVG-8uf!5BtcD3Q_BJ1mrr<dC;q*PS{_hID*XZk4d}~Tf3&IF@#-;
zh?L8!E>eg>H#}hWzgc9jO$to?3Y=$<DW;LUZtdE49PVVRNv;6c$^pO&k~Xg;S0+rV
z{T+NhI=XtgouO3|yLzQ?7?M19>eLLD4Q&hUhb&DpyduE>xV8yl3Nf_w2?%KIza+!F
zVMnN`#f#&GCy+b{lx<9gcTjoLx2z}sMNhBUxi-q(TJASecvfBq$9+r!xbSI|6>#M_
z2cWk%!>2`a1O|BU@Zk>s3&NDN(7m@r)`Kq6QL@a@kbl9ZZoIH}dy$-Esz%5#LYOq!
zlmIz05s$C9VDVy^c`D5kxfQ%}RqS1KQlie<x?GEG7-@2aZFu=|BlZ}TGy1VSsMsa8
zS}J<Mm&2U~mJLTI_4nV?`2K7zL;@WrKYZUY;D{`>r@E2R4Cx$$$l=oS1-<@FXa)Az
zX~~$BvR0hCqGG#n;et_5r)R4#Gk`6|D6s6elcr}l$dO=DW4?n#28<(9Y&v7!D7Ymx
z&NQ(D3S13NK2BgDB*PNB4{L4bRdmoh<%<Oqszc3Hk-XP&LN`8KwE~u2rtX4mlvNDd
zH<4LPlJhRX90)G(VE5Wp>M+)4?%9+iujtx`!OB(B`TTjmkx(D&U)}q^oobDX%GU&U
z5;`p<n4<E1dl|{W8e!?tt#dPqPy@D)y9A9tXGedKW{7{LpiGI~QG?*_R>AW>D;_<%
zMe-iHj>_*jF0e#gSGdC2u5xc6HY=}ayKmyGQ>GG#!k^e(8Z*Gr(J?aTsM=}Z-=Qu=
zRs9o*rtIA`RpURvZ9iMd3;wZHQS<Y{?wb5v3Hd;YE!t{i416H`RiIYX2YBs)Z(R9g
ztT89_=3sbSd*S&`HscTjc_s>qLjNlwdF~awW?~{HKLidUBO|VT)j&;GDSeR$cOhnp
z4iyfFx?d*rggaShMyENR;RpaTE3X*F0=1n6|16WOxc^Z)SXXZg`p<cEY3PtklH<Bw
z+990)huvxGKTyUCHwC8jt0gz1v=VvH?zx<y9M94H+NQaWP^DG`V-i6<-RH<|k2^M5
zz3(h41j^mtyLWf4Ys?RQn>*&Ojep5!Lg^vtmLt2x@1OcM7@z^f@St+iFXf=uYqJq&
zuKN3X`})%5H^CPw8n*anMnCb&4czC@O04I2gz^;BV*mH)>vD$;yl#aZSWUzu17&EJ
zMbHjPNND?RAD=u^_oFfG9Q+?$H#AP_(lMY7FnZ{+%#65*i?1BdjM;Jl&3y+=S2~`E
zK90l?;+kF!r4kw0m8`~>YkwoBPsn$=0&1|;BmYp|G>*1vJ7EY2!)d=xNZ43n;}noQ
znW};<6==;IvJ48s*L|kFLyKn3*n}6uo%(E@V!u{`=lQ?8t!AxS)BNsZ35<fvA`u2k
zS_uFs@>=q4ur+*FQ)kR*445N_1EABl+XwcY|A3g5R{qE`|N8(w2@Q`pjFPb7Y;ISw
ztMyK8yteKoe|eQ&|K7dp?ZPYFUFr0ikTk@El655%I9_#1e_AY_z33kaV27Yb_aNzI
zoa-cRqdP3G_)>iXFO=-x_!Te-*E5T?0X49Z3PFa70ngsEB(IoSHYudqr!0m2Pk;O0
zMx~6^nBjFCx%BJ`_28jJY7}%t9U^wX!<CSrYiKx^rjR3B*MXFRVeqrlXDn=k^q(*k
zNN945BrTCk8!`>VhdEfZ8^UL^OWZ@>oIQE+9+s2C6#it;QEVA6Ub2AcyNGgD3K6$|
zf4dGHIPg*jd1$8w^vIkyMK<ZjL+abjL(=H=#=I<#Kl10nW_h#!?A)=V1iA@v0bzjA
z=IWwrR$ZUa2RwXU_VueUS|T?UjZsli*4cR%=WVgJHb+m*X+t$&XE%1wQ@?-RXBKTI
zwch?J$);ImAe0==Kal)W&I&#{zQ+_bo2_o#qzS2jx5awQVPw1Bo?BZq-Q3(p<>SOj
zR5!h3$moiZ1e<dsq}{Yp70oDsB+q=AzCNY(?Mjnl)TXev1RzPYD>|d;OP9Wy?)WF@
z0xbcECv|YtmEkEbU&<&7S`BW0U55Zq*P2poZ=KBfX%jh>xGq86siyK@B+r$kbIais
z>vZe}JwN-Y>QY!(Sh&;EzVutXa>=(sultN12*`kyMF}aJlR^x9h?hydLEB9UEb$^F
zJ&DamhEPnK93sM$0prIn`*dcI?f2$m{xh1U`{AZ0wYXs7Rs^9_4pzIVwx>RH7v%yS
zU%<y5^eu}0`v;Q+qhMDs9L1;YW%K#0L7tiJgj+4#_@Y5eqc$xX6V72TqXz-r;_z%C
z@Vdv3)NDth1E<$2y(_#5?lVBZe}*qfE(zV`m3f|7IqF?0bo7|j!orDTHDO^<aJX5-
z<oUaaZi5kznjkttWy7iZ_Xf$s2c40I2Wb2A3-$Ypl-ve(9@tdFUFGJ)(2Z|?4=}E1
zwM-H|r0JwjVCl*$65ep^rax$+6Th-CNQV<o4T3=ckUy}wT4&ib(R5wP$KYGXd_P>(
zA6|hVG-|^!t%&8G7_+IsEY&)2&;Jc$iCFv2<*!oGGRbD}2@5+#n?kh7--RvuK@|w;
z1-|SZhm4w|mLPH61`E-}P(MEfECr?$G(6?&kJC6i`67(rtvXcHnE?jKwc8x|{6VIS
zFj88CZ!(;F)d<nnFh=4Lg^SQMS?=__Kujmth}<TF2)?9_z=PbaTes8dc2+JbYk5n;
znt(+fSNs&UbSbKvI*guR2z&<R6_!yiC8?SEhtgBhkYm)(Ie$h*MJ+1=8^T&K>GS*I
zzh%tjCWhV?Y7i?tKHa*id8uyvtu+Xbe?!MbCkjw>>)IBH`q{EYC{g}or%v}d(V}hc
zbxuuB*TM}Bo6B<*)y~801$JgvEFA_99m*)BQ9u?7yHK`E_UU>15>2!R<9if;{P=(-
zOt?U*dr4@+;<JgmV~PK=cr*%F)HK{(@OR_C2K2kW#80R4-km#duZ47EP%Aky+#f=M
zM`;gk`N}RPEHsp3n%<OuGq`qQuqW&v{{nd2L;KqaGkp>?=kz~OqIUJ!YD-JYh|iC8
zDpIGN%2mlLwuMICxyj?1)4z_s*S*IN4%{AMzQ%0TsxOqrjNx2mgmYG+*Wuiw%ZJl>
z@kq>Yk$7ZT1TB>-fVv1_TB+!@z-J^?gp5ft65>)*!}Sxk_TgfeSuS|=*nWJ_F^ii;
zoF~6mG{vx$1N!!D2xr1fY#F4ENT&}+0mhymMSOI6rwE#L2}Q(CI*3C{%sV#$31Mxq
zA01XmdU`^!S&Y%o6J=I~D!^l$R29@FA5-3h|B4>@AogEZ#UKU*bL{r*)ODi<)e4ym
zUr>@UIp_4Gowf72J)$~=&E@w|C*Mg<F8=e08!jCDX?tnsDN=NMK4NZ&Xt_n#Lmk2K
z5#%}HG|_&Ql$zg3C*yU#m4UeK^?iFWiXgz4HwXeCygw%%wm&HYF>VyOc7F>rdz!pX
za9ja*Q5@(1h{I@)cPOUf5oz$yp+20?T>qw1FR-B~yJ!#JiYy?M%M<OH#vsq);$o5|
z2UDekIx=N<HrfKVx?EKX3P<@{RE9t{l-Qii{1Gzp_dHy67kxkIBR#?R!VN~0;HX~f
z#%s;yDiunU!hF%9%jd|)lr%gT46g5@vZ!coPo5FCC*Fo@8H->l{$XY%44WQ2beYHq
zxcG6sq45^#ugorGVjJ#>?Y6eup40dwO#GEG*?OlWF9vf##I>*G9aH1_-#UBd-|EXU
zj=rW!)AkDykIs#EN(wZQzW@^mP&l4w2YQn)0V@_W)A3A2LDcyNA5e6O@`^fza#Q-|
zkq;nYrS9Vpp__)Vf>Pw<Y}q(&v3eMd3ICoSXF4S*GczWx$cm7xN0E4zaQAaI4o>XN
zPh40vfxQnd{k60-fAgI@4K!fjgQuu@piISuvz^Em%I;5P>c~A&;8Jrcwb&|WH8QLm
zgo5(NrMn@OEvZ{H8QfKi_>o+6f!`~Q6GI;*yHK$#Ffs~IUQ|VV^rufY&#v#|x<Xhm
z?MsaNHgmHpD64<g)(CP+NW8?(N!<Ul!3FIK?p*3V<jvru5ma8Z0}a?m*j{!#Phbp!
zcw*U)=%|t;F0IO-uBqb#4G>Qa-Bo!ccNY>5`HNKgdU|QoT?!ALze_zzV1Tm-LYa48
zPh#Y#lcH|AsH+v4b!^vebLP*Zr`+6*0}&EXfSMNJu7drSJ4pc_FG3?bJIgHax3tkB
z-~}nL9vcAdiXxtn*#yDbaCYEKK^kY2CSB^;(MRC;CCX^RgbC>$=>N~l2zvRtXlP0$
z2?*hwv~{wRpo-mAdU$x8p{|W(GWTgNKpr4ZcS_m;prraI=X{jtLsSkqIXPbErk*mD
z<UY8j-9~|V%=&&qzK97V9)0&YK9u(nM|jSJ1Sc^A!L<=^lkwJWD|)vHwcm57#}t_B
zPD^-VaFq&Y1PHF=yhA-p7ske;$K&slgh89Yf(G$=BhpTW@5HSHN2vSw2=g|9XqT2G
z4GlqZBq@&2EX1hh;(U)Tay+Av{ck3RrDIfD$cSp9x`$Go$S5+D`n-2?%=Z#4&*XX&
zIx-H4qMN1KGG=%Aln)8=nF!$|E1JQzhdgTv%++1}*_;olHY8OM34n|@V~PV&5MvqU
z#!V~6eomyThWE7`9K`p4gVh6X=d)L@4lo&Wj{;PmXsCHGzcaJ()4MEwSlW#>?Lm_W
z4HyQjAftcr<4a}DTy)THZcJb|co!!Ma4C(@(NY456+S(E8UKR6i)?fg&JB_kzdJWJ
z<_4Bl{Z;JFNtg`b%LeAgvpHaECEFPyc3Zbu>o#mK<tO!E=sQRWNRC7<$<2KI_F25f
z%yhZPQAsg|2rw;ZWfSZ*l0(yZNbXB+=wI=kA}dA`02<FnAXr~4m1rFPUf!qXZfZI@
zepFh_#(Ubgbfq~dh=>l666Ug9{!U5ZvO2l1_HCTg7C^H0-Mjl0oaXHa*2S3CT`VHd
zX+XU?i6UB{x*4JHyk%N}gFv&CZH=5c5SJ}mCc6Am=WHdnYx*O)73f(?gfNv5)Z!kw
z1(*qH(Ixi?(f%dq2E3N|L4>{C=4P0gihzgu_Z4rrqx<?|O^Pwv3YkO6{U!4UoK_aB
zgr$-O)o{mo`cVPITMWH<n;{5vf_AXGwRcb?9Tc<O%IRhwY%s+7P>zH<ZJG<bA;H@9
zxS-grs)+dqk9XolN=`6p1MYcd7tERS9*4*(ei%9rxe-cA?$Sd6^KRQViUlgC9P+6z
zEGmj%SF2)maAyNYK~I8%#Z-l@LU;6bm`5a4qCtkQO$_}EsJj%^CRKsfv4EhU<7Avj
zOcJsS7sfi{LD%qK(l9T!1rRy?HNOq-o<z?Q(m55+_VzEJ7)M6bLaodBfO2Fz<Q)^;
zi&KiL`Ipjw38{;@jg}Xbz6a`N=u=3J#KuW%OJRLCqNi`&^}^cFWsybe5na;n*)za&
z<fu_hNaSZFw|aU05_=)s$<pu|pPUJ^G&wB${fKYEZ+TAt-}GCHww-G)K27Z$W9OK*
zzUb7OyP?gSHU+uwKK7^6W@=I@_Y)7N{X)IrYef{ui+Yi31B%K16KY)2zqzBZnb@v+
z1RN`2t$2OJPlf9k4qei%_-3p0%xD+6ft9{7gy0?T*&Kud6!j^)S2-!s!nRIzjAr>D
zs*h4Qe_F(+%Li<yLk7UVf9K9_bQGdT2Z$ma&CAzUbDXx<Gx423E8Fk@q}*ksTOU3d
zJ=hJq!i73IqKy~4hxV)Z#)MMtweLvq;_%i>Hvj68B1uxzgz2D8<nMv@IEliDBV^ms
zh{tp#xF~2jLAE0qmy0kW_EJj?cnr=N93Jo3qjVE80FSu9DCGgK($f>sD&HXpq!;8U
z{pZ^7NHG<|)GF-!u?d~T#N3*ud7ZDe?ltidjSq&N$v)}>3rkDE-<3*Vq@<)Q-vO>7
zaeLHlmOiBo=~W(qj!C#Pnu&rh%aDfvO3E-7{=_g!vHNH?DHvqO{k7b2^K}GRd~9@Z
z^`seE=g=QP#Y|Jv+)<u+i0|*MAB~*EOZjBQ;!L+QX(=i983rVS<0!-?kSOF~R?zh4
z%Atl;yaV+2c-l`jH8b*pzW3g|3pjR1?bmPDYoafX!9Z0Mk<+^0<g_&N>(s6U9j~B?
zm{9)r!c2undrMyqk~nd8vkXl0TjDQM4&_ic%*r!sWtJW?qO_z04F?^MA9=BIu~TO4
z;uRT{A^PdnmmUe&Ac(|O8uYqAPa=6Zt>N%ZMx24VG9eY*9txeyCIqV!Ocayt0yeAL
zET2L5qYBA1&F^Ii>qSD*rFlSN_KN#Jf=ony!eM!nbXm^uVO-|&e$Y7`(ejp;-e%(W
zT@nLA{V10PFIqaj`+)*s$SJ}E09r`|Wq(np)82ss2#72mOkw6JWO$1*13)A#KI$_A
zXVB5f2`$w<-&HK+5UyYNWJ;;W2%3;sQDOH!&@Tw1%Ylu3#oOxTLI$5*3q+tye3D!7
zS=6(XcSHX??hq5?UePf?YMQ{CC6a!HPHml1QdZZ00jRh8O|!eU%CkO>SEX%-4u2<k
zRrRWF;H;=Ad@uKd3>jm4i@%1J!Lac#Hkh>1a0SS9(6u&e+Vtb`P2>ie){jF4N)i$S
z7de#&lahNIjSzd%^jX~Mk-?a11c#;yCg!7;1i~XrqUQld6jQDQ6$7EmmxS|@uAooI
zBL5}a89+I*>sY^>A>T2+Oa>DYEjc+=l8z^WUU+{*GH6SPHuvP-BlqtJ!=$;kB|4aA
zjq_hpE;wO&X0;bLhhv+GCdk;?*kbcQTcOEm1Z)qoCZXu)G19oe;ieFpNQR+Rnrqjt
zU3ti?L}Vh8nmHmPOchKQ%qxhpQfLcwJoNJ(jaA7~!E?%v00nWGR6CXvYbHV3fF&@^
zGwHBpb)iBQV=@!=@(wjHt*9iMrv(ROXAe(+7vx0F08;#M>*z{J{NlmLbu5w;#EeC;
zKX=fdJ0UrxWD~5Bzq1KL3&;Wj0;z=X3I}cfx<XP_NiQB}|9uz?yVSwFl6r8pw#v7&
z-C9iA5;aw~>tKfk)~n2DPetg<ZpWl6-%xkUQ~iS7Kk`S7eeV7%P+xL{(TbsWXhi<8
zcX$wRFIp2+2~;X{crque(Cu+!E{4vX-JQ)m_5GKi%z2)(5{Oz9?LuPzU;K;kh1EM^
z&Zw#qJqW4^L^Am7jsRVLAS|P4+))>odEwvIZm_U0`;`UU3L!Il(gsaKhUpA7#PB_P
zK9*o$^b;cZ66Ax0v;yS_?OB6D$1}7M;;%pvDONAj7fy(x>9vOf!;v?L14|1`4ru2c
zvz{OtbZlPpp%9=wB6vq8o#4)<GuqnJ>>9p;FKT$yj2y}r=T<d(^7Dz5(~|5-5zL#F
z7;XTWh<6je83KpB!D@2_VwgHfV7c=S6=_!8T{wZZQyfZPjfh5q+b}2!T_Hg&MdQ*M
z-WxY=_vB9({!a@qj@m@v3WBL#uXK2AhfLXnv-US4c<DJn0e>cEolTgzIJ^31vZ}sW
z@Cmm9#l4efX;_6#UaFLMbl3VK|J8nr*ZGC7tyz-chx8)dPCq|xt^X|j->0l;(aCi2
z<fdwUnjLEttf5h%9+Lc{_RW~Q{;Q0S9kW0GbM<M}yqn49PpWn}WCeyB1twU8J15TB
zJ-K(=ri~ixp8Qm8M%C<BoJAW~PfDz{EdRWiyR04^!$|{8k+QMl<ObqF%k&r2!rRvP
zf1GuRLW3rNo+>}ZYHRPYRiEo|6<ZSM#1R?E`sR)$@P%l){@(@=TezI-f#vcpH#c6=
zmm9X3Fd0t+4IRP;JRqVA@DI;=2mB;L7fI}5Fyz*&Bdit$?4WQ}Hi)Q8Fa!00ih+mN
z{4xb+rUWKPA%Lz$R4^prbf84((EK-~J{WAjpo-o?W*Y40T#=NmT8FroA}WU=s+Xa_
z(FK0Nj3RT`aN3^ROIko*f_uawg(^g%8R=Qm`=xVM<{Hfr0W-va@zAp7`}TmlGWJaa
z?Kk*^VZ{vQiQ<hNhFMou2^B%E(Ur%K9|NSig=g71qbVXA_BWZ`2e=W~{x~1dp7Fo-
z<koO2J;*t6`t*V5XuIgIgaDZFn~-M(VfjQlpK)X!Dh)1_euD?MV6RH5GySSqLhAB>
zZgXSVQ2k5)!=WY7?TRD}qhgZ=8|?|DdINyqnb-->pI6as$RBF8cH`5mVK@-1VnSxL
zb6jwSL`@-h@E+YIVikql^?4nY$Xes%19(E}O8-hxUS+?5yajJ~Q?gdHyfu7Ypi{oO
zW5D~yg98F^k<v}wG~UEPCK^>Hzp?<{y?ef9pYg_2u(p|adZX<$%#%iq8IzQf!^`Jr
zEB7ldG;u){kt-Te4(Psk8E9NsTvvU4$tlDGnZQ+Q4o@<cvkxuIBg(|5_j)AU;?S>*
z$S=M%);8BLYOC+OY}K=mQE@-m<Hc!4S)F<R;{1UJ5Zq9#+@z|Lh(F{=J1Rmvw{=UM
z)n3(Up_@BNm_b1cYBMr39iDoIwrJ6UO@EtYXzV$N#G`I(Z{5843MvEXK}++JF<T{Y
zrkokG(a0wW@hs@w^VTo(A!;BChoRc0M&&Lw;cpb~og+LgX~pvH>BK<l3pN|0l0oIE
zt=8woZayZk4F^~$<GF$0HO$ODTsUwB6p(ILrbDolWdN<Dl$1^M*YTBb0@Sz?Xhgjt
znel{T^~?U2p7j$UA^up3@Xbq`pXt5c%t<M(banf>EiOBKPoG)TC3rmg4ISDhqQB+1
zds<*J1Y?Vzk{eMPVZNTVlG!rlPTT>JgqqULQCouNTgJV!7%a0s#S<hz`OWQ{)LJN!
zP<S4#RGO<5b{G(mr2P3xZjiVF1X*GVr-C4KH}bV^$c8^Mzv4TUm`XuSF*)ovsDEXV
zlHBDelEi(Ccrw5!CXr-FDS+vrPo+wf4q(x%&~@&w1Iwx!{S8Aj4oLx<N0<_P_Cjf>
zR5Bb&4iJFO0W#r%WIKuZW`GfO%>pGijw+5cIL-zE{cfeFM^<#4Iq8px{sU7}Qmn>j
zq*;XLAn|~sx<hzm#u0DWb@WZb0m(~%)W<ijjo~1${nu0%!7|VnuN$Coqo*gz8tsH8
zZvd2fs9CinTc7#{SMj#!%NKDgV9$a?I_}}|m`YcW4Q@;pcWdOfR?V!OI=(M=nEYFI
ztA_rMZu}bVN&C`ZpT-@|objQ}!qV1@th3;i0NY$_D*$*oKsO^srR3!UPC+R{hyu<G
z#K=O}=m!x7<%)BPMhd`U(UK+4A+TUhX&L}aLCVCcUe+ag!KQ6acy~aW*g!%gRc6;R
z?fo4B935XqX8TW#kAAlL<bZorWD9k5o$i<%iw{Bj*p`<s&YvTXiC?2q+f-R%O3O+q
zL8f;Qx!p9#v%yhi85+mlitCaG^otk`B_&4!Hz4IK#~T5vhY++X3}`gaO^akDN@5=0
zVHRJ~%6x_z@Qn!kuj!^gGdmj`)q_EnUl><XiuSq{Vr0AzWMGjQ(hXJ)WdW`=v?|{7
zGFH#d`0S71kzSkl<D$QY@{oGo>kP5Tc9TgWTHdaJ-c;+HRC1TGv&eQi%!qp5MIu4>
ztSuW5c`3xd)perkh<4--K;sLdd(3a`FPzuiB`u(+Guwg-P9{DQPb1MJLj9q%xNi|2
z?)PZq*~c{sA^N{N-Wj(3-W<AWww-0+cSf^0PTYj(Syom3ww*f7;35c5aw6XZKQH#l
zNZWEH8V!#_XSiMfTvV!)q9Y>SFeC*WgWHP^WNUPf-ntjIO(*Y;W5aMqQQH}B)At6!
zTPt@;+BN+y2s1<b2uhJxE;kh5SZ)bhsXxmOB8YS;338$sUg*?<vY)Hksk}7B^3Pqv
z&b3`}<^L*zVRPE5t41fR4b58A;uZy*02o|`8XGqFuYZ=l<R)%@v*(vJsbF1Yhd>Ys
zIX-2WvWkjC?4$gk6wltEac6Dlb2^p6>tFNPg;ZE!v$e+^9mFFcb(M!-wCF)20BQ;J
zyF`<(oo;P4Cko#o!)M6213CrZdXB2fbn$$Go~gBZHfpcw`rha=BpA6k$4{SrwZ<Eg
zeb>2x8__`?+dB~l1COwt+`6nVl@nW6CLHAt@00$HhJc)?neY%8y;zK8wlS}NK70Il
z!{k#w+!ZC?dLEoMv){{dp7R4Yzf><BUl|#|TID-FykxNvd+ZJ6`}6qGU>PcYzFf^b
zh@!3bRO*<0-V*OXi$QCSc5=PbHmnYkEvsJM0kkR}VI@D;C}HsTqAsgVuf(rd4$35o
zglDVinWy`tj$)ajq_BZVmI52igIjfjA~+o!?^a2K76=<j-!12(T4=*j-Hym)CiINK
z>^lJYUF#OB!uC1G?_m@Ont8CL6YNYJ#;1)EBsmu}bNszg2$^=2dG*@XRd*hbKrG)@
z4rch<e}3)tvrhu1g`Y;+C)Jb&Ou|MgK{9lX1YXU&I3MSW7+wmkKtRtx4*_*aQ{Gca
z`LUU*c!$K_i*Z2iGAiN{8RY_{!+D~v9^-L(4Wt@>K*mhr*MlOulsVo_tDpEK0lVp(
zd|s>_7-4m8-1@j|GEMo_Gp0<rbK!_T6c-7Goc5c!LM62hK*uyF0y+f=|5@;TZl~V5
zkK62pj1~1XMboL)34vU!?0$Q`kU6RDg+<Xc%?#D~;%8hVIr<`RmvdmL@()0K=^2i@
zg9;X8guBpwVTVJ*8GL$8(9*F~zS4+5au8;+b=JGVZdx3q-?P5mO5hzmrZH73aEhDW
z8!fQt#wiY%)FfQ;t_q7jef>vl8B3Dv`@BzKR@WRWix%kW4w!V7s)wk7^uiLa8An_V
zM}t+oU-!LytWtRN=RwzXeBU?I$y|Jn2qv3x80Cw)*{g$pqA1-~iNH+_0ouS0&C?<;
z2LxE$R%ydt=2mF%=JRJx`XWn*NCik(q+QgW>>iyZOH$VKnuA|LVdn|`p37A;CUIxU
z=sHRabR=zTwKWc$p$Vw#1?~*HRM=u_hTfOHUA=V<YFvB+s*eApRaF&2P2-(4+k7;9
z<Fxz%KXCfXfyvk?8cazL1a-eLJH^h<?!=#eCISSBodR<`e4W+luIIS}>X?-yR1-9;
zQTBR71Lw(@87jG=@`(&Q`SNsXo8g2qwf59>{X5{&4OMW(m-$)8#(QXYnmnif!tj^f
zkQ5av4awjK{Na3#199_;kG9+>xw$A^(XZ5ACrJQO5{8lwMKQKD!)cMXj<hgzYeXtv
zLJ@?R{vBgbmmB}RwtSaan(<GEkhIsPCCbFiWMm@@TB`k&t7-unEu+BV5s`Uss=jV2
zu+ue|Bmxouz&Ck$2^f|lHs>~{X`}W*6BFIxF7(jpF&h;+Ip^+Qw5w%*`g5uv)L74k
zCOy(Wv-{8)3MH0D!_-FSiq=>3$RPWXyOr$M`qbc4K$yVxPHk}80+UbYE4Bkp6Opn)
z`B)gX)_*1es~kJL=0u88EPn}caZ`)hxbtKYO#!|ludEA<`|Eb0N&pD|)&u$6orNO;
zvS0%RmgD2nq9%q#aL+R9t{IB3fOZKc!{GHBi(j9zmAoGb-4r2NL4hQvxeqlKkg5)>
zqFsliD5YJp`m4(}gPAM|LZ3t=z%k}}@+9*2zDi2hGP@LmVaUe?tK$jWA=F|9<SnXt
zZYLyyFf#oq(x5~nSYJDEC|w3|c1#|s3-_n&pT+bI|0N>xWYMrBx;}lyXyD(O)WT;t
z!+<efg*YgvNQk$N=XnU**|?u3LH6Hsdi{I=Mn8`WgWy>XWK{FG(qIzz2x7-6!2xEv
z{4ceWTK%Xp<n)2jq~2XHu!BTefEx7CbX7h%_WBmF1&O|x;2G*KSC?jfFGaoqqfB8|
zLa})Q8lIZLM|*l}B<Da+4?&v<B4pT+|I|msXL~&x`zEOU>mD!xKW?IULu~!Ayh>$2
zM0v-AaYcUxZ-a(@9-O{g+vkL*PpigL)w;h9yZZ98LeEA$@<O$|KbK6*3TtCN?}q=o
zf|l&ax~RINzrZM1vWA@`KM<#XlYnMhq(AK2*DIr#F8o2U7XO*;ZxwY8fs0G*Lw{}E
zgku-%gnJ?FW*tQrc$dbb4*%;8GW#}qRKcApw$n1(npQ!h7&tFgIsTok-yd0o<HG@}
zyXfsLR}Az$@(6+TaU<NyoPA#7q6r}bAWb^+=ijFA0Ki)_!0gXC{nJ*YK-IHyEzD<z
z-?)KY@XW-*g-A0Q9avC?Bh{HS-9XnRAVr)hVq&7#mPwWTA`DycD~=Co((|F~`9|A`
z9ajm@NNJvm;4>+=$Gg!6)rIL6$-}#6)p=pYG11j!=v!nKo9}vEyQZbxV~|^-)yv6d
zAvTi}vdi!KaXJR4KR`#XKRhiy{x_Ugl&)+FsulPz+YYKEKXBD;95YdUQ_c6WuB)g&
zN|y5Twq<zDb5#VGwtJm2-vnlj5Vnhpl++d#wX$yfhhS}7tt7uquk*n~b-X}h>eVe6
z+E0vT9FYwF-N${zNexYSz__-=J$Ty(jK1mqsTSb^C4xrG(bjH_<eZDp3AIY~uS5dH
zj57>+Iw7O~cwAptalrX;eaS)bixB>ZMuGSmPH36SE$EWSfd>ux)K+&R>HE^~0xihI
zU%*Sq@Si`De{!avyL!KFH_QS47dUSZcw_E_?e7i_kl8u-c+MGgJn89~4Av;=M3d24
z3mYiRCGWVzs^`n&Uv_p3byfVzUOP=IoOIA8p^I_k=6$+GHv5P=2cWS58>0~lozNcw
zPHxqom7YKQ;H{HNK}Hz7*x`)8L<(?{^Y})hJ(KyTd`#F_LYZ&NI)$KXJTlC(P@#hM
z_zS#=jBA1+vH9ffE|FPGJm^1eT%>XSwl`yIseC2TNz4M2iU86tUcHjElI;u|5(^n^
zhMZR1*ggplHRuHZ<%glzLYOwo%<PK6km~2Nht_{^VkOs!!1oxxKpZ9VBzZ@L>Oaza
z77U*c4R$<be<i$0B@k03hhqBT9pmTwE~5OQ8O0OEtDY66*z@7&kk(ud<Pt@C_gvIN
z<CMllqLd!0_id7YUbLBK=9DN-@zzPpd4D17t{2o?SV{QMm6N)}F5KAY?=ZLXOZ>LY
zS8^ZH$$7#WFp0zsU2HYHbsoO-Go1V9y6(l6(_cNRXoJ?@-{b4#-~U^{N4T}`JEon7
zwso9w#E?!c2*!?!on5B~AiIDV75L?Q5Lp5zAm8!0-?3{~P1tB~&kMhhQc1gr%wECd
zRNx{GehE~Tt9!010*&Eoo-5+0w~LvCO2Tm2iaICW2A6~kj7hKlQE<}eAXDV@MHlAx
zQmI8uYKude&z2l0I2@)vE?PfeFyhYKY}Tm=9`UHB6k22PS56594Rbibnad?ge;66;
z&Bn<{tIPyaQVz`E<^qpaamf$U&@iVELiwa@CMJe{PtuD#<AP;gPUen0uI<AcGlB?U
zio0@X_`rd)!lJsbX?fXP21>!w13IQPkBrU_%P|PAL#@t<FoQINTOOxB=j6mOY>!4M
z2bep4Z7A_lyRGuU96^8#i^iRX0|ct}v+DQ;$sH&WSmrKBF$5gEq7oz)1OTRM+x-~-
z&K9X_^x?sT-zP=b!o_g2zTjIJmjdem43RMquAj1M#8>_fF(wD8bva(@{FEyeD!EDC
zAd<m9$)6Bi2F`))EbfehXO0$*z5fNy{4KlUz0ahYHBQ;mk&Da?e!^>!l9;G}7XPe?
zwaOWd;ZwLs5(qf1ab<{TMua2uRq6i8&jytIBRpAc{iw34Dj1H7<LsY^cW&9gj92DI
zG!gQQm$8G9z5m1?N=8~>Ohq@@(9!<d#mxYBD2&8<E$-Fmb@22)_QizhU7%J+|0(1_
z?y9~`>hdJy#`E*(aY7>?Fx|RffI)QlkM{#6Ek{>flE)E1Wg0^0DO+qNJ?HC?8-5#_
zkm0YO@Ps~DQUM`{Hs#z8ZLd1>&&TUQ94!5RuEdkYoa2$*?9NRa*@1N>nl#V^{ws3@
z0KEdUS9~b%#JTmb{og%KytZB2{#i=Zl8f`p^1FVS@$BsObnOo(6n%}uay&Mq1}Shq
zpvo1#zWm!%<Y=#F_9*inQNDy7*Lv>W(}R9t=akJe8f|Fb)A1l>JFy$Nyvc&HE&DWf
z1G^CR1Q-q47d_XA9Rf3Om!&t35q=`e9WzF^lWNaSA^62@O5V5bbR!i}P#hW<3Q42D
zZx8$2whEo3-p`aX-3DY<>_AhXIA{gX?zD#%qsS-i3oR(W8f!i6VE+E4^}e}<-6gGs
zIg^1&)3nV=cuQ<%%OB!T17ZE0nz|)7u_HJ@m7-EO8D!@;{onlgY62*(N_)+tL4`-d
z_H5iZ#5Y4_%+s%HGUCEEEcSnsawXPHFiFn#H|bx8F^0{^A$fs~7)7k3Hq0E=?T2x>
zIVrv^L<nhFcr&v2R&~^k;3d9`UT;e^6fZE%IEm#0zKtqU3Sc&Gg2#PtKJ|G=?f?!I
zBO;Kgh$V>&3iwU>UZ-9)A7>iY8gT@Ow-y6aAaV_{Ruko;re7A8cOAYA-A3`EU13X=
zgC9pS7JxM8k2l&uQ*GhEFJUuuk2qyNrrVL8_vUE2BAt8-hE%+0g07OA<A^Y#8U^L2
z3O%K)cb8%t!-*`xx9DdJxN{MVlqFV#s!=0gjh^gtqBmPshLVzsA`%SwI7OKmUZ>MK
zQ%vEuEXg+P*`D)&QNZBX^kSzxHOEiAIR77T2jUMzKQ!cW&x0$jcqJyic(HJzwJ5E=
ze2M5+ypw87b~{Lo45a4L!n&aCIiO8SYAVsPyD9v5??CW2cim=q-Qgl|xEt*^%SyRn
zYJh6PPYWi#Vq>>#Jk|Y#LytU1uS3E9M~Wv~&arn~ccQPPBDUH2J+6MBwIY3uO_Huo
zn=$`cz%8eyKQUAIG<Moq6o;gXHF2onYe2^Vhz?(!Phs+ZSdX|V2>b&w20x<&W|aCY
zusV49p0;6zOcm>c3KG_)rPg}&n5W%aHkLS1I(aZJbj16luAy{-5ypu&m0L`;7~lcj
zY>##;uAmWbzlnziU`jwxu+dIqpBJICXSpEq;xG`WB)tHjCO|;N;4mo_!5-`3qkyX~
z*sSNYZ=<zle3<$eyNu*5!|o^a`4DF49%r@Ga-C{hQou5`xyN?B&IlaCjUm{S>GWf9
z!6`vU!oJnxKdxNqwLXs2?@GG*)WQl*Mp9nTZA}Z?P&nsNk68v|8aQ28gi&7OqDV-2
zJywx7`QOZr#TF}~Pha0YAoA2g)=OFS3=g=W;`gL`_wM767g=sNOtg2@>i}?N3aR}Q
z>VLHD^Wt~?ty{N-ugq_IIk5BRsy*a)of$#Jdym;JL!28xo=y1iC5~BLz8(=7#;~HK
zf9bn>f<AzFKM^kogb8SK!z?p>RwgbZ;gk9Ml$V`pZA9;GQe1e|sOa(dzJL6&Yh8j7
zDD2Z)t?t5lsQHyB4IL`+o~Jr!(Gwu~YtE76gIpQ|SuoFEVZf_Urm_jPRGYW6)(QAn
z?hFRpa?ssl%qrho#rW174p)fT%qRsUNx1udGl<syR<{0!A*dbdt8MP;{pa>?k8T&N
z36U!f9{v29talho0w&q=-<_0}PWgsq3yHh({q;@z-20c69Cm$E*&{UcbR4;6PA>){
z+U2DPi%8=v4rhc@U>pcZj?z7X<-kOC?=bk<oy#N;(zHlt2-&bSa&te_0r{I}#Ke8r
zI*=)seFj|TGXVi|jDAx)NLx9Tn}b=l7x;^ahfbV7pJncs#%7QJBz{Xh;PuUyB2;^v
z3Nu_>pUOcB7o}b2*RSabEC3u+qu?1<Gp~EMZWa-zxA>KibBG=JG$DP2^>o>sXQ$tI
zg=+rC*`Joq+vu;>PxEuwm>1>4LknM+hi4Eq2Xs9>>`g(y&tV4b*MN)dTKC6e6rxSk
zRU*vd43;fR-_V0YC1#=m5=FX^!itMN#n)c01%#5J_0VHve0rZwa~rlcB#m@q)Kbkf
zT`e)W1b@!{U4PyZx90mU&F;SSqs@9dXB3GtDpv?3DDqnC&}EN-6c0T%%MU&j9*$m=
z6o&AQ)u}BU9S!3d9wKRa7~N%39ANi@lyKlwH~)}^_<#(z`OF{qe#<M0S%<0_!H)|p
zg*gRi4<_*wJW>Wk3Ny=y2!rs&(XA<}(G!}=z%aCukby{{NzOX%F^6vwVP83dMgfC`
zwOZHf{5Bxd`l$t1qU~l>1#sVALY9}g?eY*R!ID*@@VKD(Z{j^Kf=dkv2kImsG!Ul?
z>_h#F$k*INW8&el$;wJW$0}w_LnrkcCLeqMyShwH_cl9XbyV9YI=AZUpbpJZ%t?3v
zoxMzif>>iu%Un!4GTLbsO6e2a6bxA5mI8<)fHT$mSQ=STe1q7*=8e10ykzL}0-Y_d
z#0Nc|{<@GN2}Iy%l$MVc^FY_k!T+ZPs3kOnfQ}Amrn2)J9cR!Q=%PFo$2*gW;HHDi
zTw=zUS#<Xq-m6({vtg}~B@^jRd~cv~TH`JDAX@MDniJ0jEzi6`m4sYE#bvbLFZ*J%
z7A?9zznf+9m6k?C5F)mu+}~d|vCX1KTg#SgYdRq&^UBl?#TI7*!aiMnPtqm@lEUDV
z{WJ-V)~$`ca>eXv4kMHR@C3@;T4@Msx}!QHyJGxXhnibf0|tg2N#1lcy|BXKTWI0w
zt>KwP1wqvs^Agfb)Y}Z|aLB6S>d|fA%zoH9t~69H-(zKvRA^OwVL&SGN${sUuHrmS
zx?F5R;#+|D+Cx(!0v`Bk(M{t_1x=_}8@$_VZD`2pLMyx2>1KDHPKKxfU<6*pqt*rm
zy(ydfaJNy1i1lepnXMAPkx&$RX%T*jkOV<FUNTE6g2dDt_+3QI0D%Z6Kn&nX#NAf5
zk5#+0lw$-4F0uU|w(Azx8qR0x7><Y249aNot%1|v-ElZSdhY_*Xp?A#!dqBNKDPnm
zTi}R9;D?TVyu&F3I}#)$Djr4)aayJo-aGzQi$wYfUvV{0i|Em6_<ye6rZZX|J%1p4
z&zJ81bYFN#cp`#TvD}c8i_{pRW$Lv7#~OM)8#rLlw(*AHnt8uMJ%$Y38PXno&LO;v
z&|nb^kYk7C!eOG!w5s$VLV+K_{Yc94CBuDQ6cOU%5^xboOMilu5f%P5ucIgA?++%P
zxm8~VUl%ok#cx&M4a}!~?fuFSK!>`%UQ>Qf@7@=5|41v(;rLA}{@PXDb|f%48a>f1
z@+evS#edHTz=|OPeJBSwv+X2asQG8LOGzK&zWddq2AjQymHB`BFgYM^gWp}d=(#iZ
zWLbn;>o3mAj-Zax4KR4if%JKER}KjDIWH$31UWUTjULn9UQu&xxIspCo^4e3=&+R|
zm$rAd*k`wW<Hy2AyTWqp`YLr}o|LpLgvG5u2g_L^S{|ZS`Ju~E9dSy)21~vM^9C1$
zyS+;-PAyMhjB;_s_HIYPFk44w*>Rdkpfrz8=0YJZ`B@$Nb2k5<W!87XgdT8cHm(Qa
zw97#e>H<I+A%Jq7vL4}r9Zs;Yn?)Bc^p^RK6g~q74O-6D=5>sKiv|_}V}y$aZjw3P
z=uo^MF=-JvenzkM@7UOpOGoHf=@bCZp;4z?oKI-Fh?4m^=t~|{JjqmFSnh_68hB<V
zRSlk%_2gCZOF2iGI!77{g*XMNtZxaM5rv6p0L?ueP7zwT`ss?u#fb4wdiG07N{Zi-
z+ji=I34m2&`^|2ANaNzM1U_5TOlR+l@JTCS=#jf`B0@`u83dQW*%UjV({>?K3r>br
z$mq7h;F-w;xK?Il*UsRBkv&0F&9l`vKQ+@%cxA*)QkE^E2yWYHJ~i&Z%_3OA;SZAS
zNI`Q>C<MPEYT^|%!NPPS+~LnVVlkm}#djR?;=HBhV<jtIrab$%bij8^h7IE+#h>=l
z%V8P&o`8%EKI)6P;rU-fAlw_)CN~eCl(U7gSPBUeCSZnKzvq1r%7bW$FqbFsnbXkV
za#@=o+$+2`j)h27x{6nOj#<GxYRuRM58r=`X`Xsb9gEHoRaX7*V~KMv7;a-kYWKs(
zY%lvqq-1CXg(bC`cC)bgOz#~0V$lXO{9ab*by?qi-NuUYs?U){VX2vwDHZmeQZ5y8
zk;IHo$*Aa1<o|M=G0lf~V%S5Q$?FC6dT*4>5e4HU`Mtub1M-s!zt*Q_Mb8_VLlYv^
zT{oEZY=3zFix^E;MWF`K2|B^ppr0ahr<B<QXJkYnOOF$eYojxqGG~eaS7<0uD<j>J
zA|EhKa;-8l8}^9VUJOnk$rYgCm5(>>hy%M6st;BVHMc8~&Meebe(Ch!Wc7XSW9ot_
zx(nE%$gV|;$8AWD0{@a0xn%&=e)bv*i~bbM#SX=Gj#c)P_|w3aQ_RfzBSWc^$8vi~
zS|bV>=+>)bb-^}BYCf&T8$$9pRik2>9O=+QTXoMx$F{1!{lir4Sx&)R$S^vsPu*yY
zHkWI0Lt^C$-+Q{5NVE|th=dbbWV1^E=fBanlhpy_WAs*3oThC+V#N^(T53ja2ZI<}
zHz|72TEX?p*M&d^uclp{iMWe<Xv~neo$2cK{MJHPUXE9ksP(C|aUsY5LSQ*EkD>BF
zlvRlg7;~s3UO{(T&AJd76(tkVP`KHkaOG-W&e_DJ$}Oga&XCFunVLy>hUAnz_ns|B
zKgFvhOP^C>7S@KK1O?>LF$?R*Z@LA5$05$7VYk;UHXynMo<!HeBWohTH%z0&LLisU
zfY~A3UUKHx*iWqb*q7Q14Mo9Ldm-{ccceB23@E%m0O2*uU#1_TEruVFkW`MgCJ0k`
zjpDuNKE~obi<#3?CV*Kvg!HJH*iVya-7QG{SX)S)j%^s?wBol?%T*WL&TQDA7wcSm
zKQ-ejUTj=<(f&GN3?fZQNz5)@_cJ5w&BryB**|NtGgV%kII=-sQPXumkFDliGfssi
zh1Tpa{+QgwWh4(Py5_Y-cv4bX{GP~I@J5MECI)GV-{OFe<2gE+%>03Rrw2nG1i_ZR
zOKEdPp;cuMtJfz2%r752qh9kQT#368dH-8PM4}%Pi&om_T3kwR_-A$a&7Aa6tVjg*
zda*kk+{1jJ6vg`fvD{Xae~e+3_{*bY+^@IZIIN3A8Z&G~*75{AS3R#^+&G#xO@xUe
z@Z&e+1;@v(Z0AWIOeKsJeluuYn`><cYqq1U$ip|2CkjU%uS7;uBXq`<q~m+iFOP&p
zxXs|gviT0l-&HV8v@3q@Ajz|+ajE+0KOA=?(<J|&KBIS+l(Mj1Z{`E$J7G7FLzSS@
zdQ_W8m^1(Se0Vn;r%baDuMnNhv3+VzG0W)H=rvIMh~yY-OQ<pw4z0m=Z~iDrlKJ*B
ztAx*&r{-ep_c9pAI6#qv`$tk+NJT}oubbWEP|TGJ7Vxx0)g!uioc$Q6)#hrDjqruy
zwinAs5K9`UzWO&%HFO3|1E^iEuSX*fh{aZgnX5x4c6@0_h;XK<&h6zF;l5=SFqape
zy(SD*m0YQ$km8<uf)q8Cjc%R>vEQoKGyUav8F>hDm4#FQ*x!cFCZUP+Vr&T;dlkc|
zY3Q%rb4ct>Ky5mYI{BY0d(jfXI~xY1#L#oVZ<3_;*Xn0~uy<1fjIS>7lQ2cDWWd%}
z9AcaW)Hh^EB+8GWWAAlz*Tr}&cEh?I;pw;W)jIK{%YoBry@EDeK=4sFO1Tqd{4db4
z@bAyBc0V@jJ84p{h*Kwf@ex`be2kPH%l#C6PcGRFK0T|Sp14X<d2Y4#&Tord6`_-J
zOCx#p$ZkN`)eN)f@XnrfzGE`k?`1n?W&7m6@ZXO-i6)6zDJtJy)KN$c461t~C%P5#
zo!%cX0N5`W$+%n$E<4Kl@)||Rz}GFh5&YA(&uj8gGaE%8U#*(aiSgIV7hm+P(@EyW
z*5=2GI^ZXAu(3OSjOAL#4F7O(6c#7mA%im{qnrHJPeFNB1h7cVAzme^!co*;O3lwz
z$e?M$!YdPaIV;r)%^8iuA%Q8DqAu0K?r1dj=uPjYojZ5#z~RFMCFid^TTO6qKTNaY
zG8<sW1DDi%z78H2@#J#(BwjLuGZf7UwSh-K0t+7{*YzdqoD@zSt)bvIdmXXK<ERfX
zAbbolzpj$(CiBCLjE-U}wg~r{KM>QeM9{FYgGofx&Y?+OiBSb8kG9()&Dt-EZ8P9o
zNvE>&K<=>9rc_dbadWRZ+Qzj1cLNT4)K+y?20;7QvNDj?c|{Ai_+6MkPy$WZtN<DW
zuM@%6QzvFuz)Wu)8Q?`5S1I9XR#u}aK*W7}JM^<3?ok<fz{wBy?hW%I$X4wS8(Iqa
zlkj7=2Ce9CK;WhQmnl4v?*$>rB7!9j(bYxfc)%$%YV?;y=6FeBVe3YFIP~byL9CPr
ztEBi84wud*EdO!N<css)6c;CRE=Uy&%Wlp?GwgqqdI#mu*~k4|HUb1O%hH5TC^m)L
zqx>|0#}FDLR?2~XH2?AatAq9kekJCfHf;$efHKMQ`-+i-D$Xm@mI*beccs(>p8tyf
z>I9`U1JYkvC{vz_pAG>AVzdB{BY|+nwLdDQ^Af=(>MZ)}-Rm?T7TI>oYJT(kJ7-E|
zxOyl1Q5lohn`Imt30Tarb*H8O!XAGl@c<>27+DWePaZ&&N-T<jdt3#(*QG`=f;;QW
z*y_0Q)DGS{ng#|vxb6g^TC`{yH@bTR@2Ou<pt$e*Q2W}t$IEA2xiSiZl^}7lOfbmO
zb_Y2Yx{&@~DiI0ZUOvRE0c%l8H1yqSHLur(CMKNI@m)oW<`}jiI#Xrduxo*V=I`zu
zG(0o{ksyUqd$fIoF9Vn&(AcJA7N+i$aXzE(OS7<x7wgO4_}siztDZ|%TxsZ`oxqhQ
z-eI_Ko(5W2!bfKME+TJ#u8E0>XYv}1LXyOQtplLrEsQBw6$zMt$1q9Oc00Fj+Vu8L
zlV0!nr(l!5%*CJ%QCH~<M<tQh{FCbE@buGG2A;Ts)aVdo0=`AmFsgDo$f%Bu-2uZH
zIW)coL5ypvk)3}+p<4eN61OLJp43WwG7<wQ2_=6IB~7844lWoL&Bsmpar>~=Qbonv
zZR-V<ZjP_#S7JbunDYOxdeg}0yzlL4#ZN=IMC46R_<FBB;)C+BEpCospxze(cTY;r
z?fK(7yck&MR@f*3Dv0&pWWh&Ux;Tcy>mu=#FdV?x)&m<;_(W!-x&5?hXASx$U@K;i
zy`yg7sV^UWmeH0H`opnA!S75pOy~l)8wx&(wR<MRc*B}hWA#w7_8By&86!G4>fKj7
zq>lVfAv_h$Aa{#6exuIeMsk|;qYqJhYTb@pN&4(V>vd|Vu{I(ebSCsIJk}!*S5aTT
zEiRTJNc5DF1D06w3=tp>CW%MkWE77LuaIh;s7<Bhqu)popuPYG;d??0eEfQ@_D-7}
zGMt#DgcpoAM5N(Le6JiZfUWy7E#w?zG7#MKED|(g2iud`CE{rgEKy#Ko7g&IIf+1@
zYyptr#91X<*w#|+@!pY$P}OnaNMWp`q!lnM<~#^TX)m;ZudOmRAS9PV!FiFxrzYb&
zFaY7v<JK2bTizJ)V!3s})hE>|(A{GB<oxFmOEZTT?BmuIdl2YCfe@xRU%O>^=!9Eo
zHwtHOkC_|jrLh^XS=bdMEFw6T+2dPnTdl@JjpKpcM-W(`($y#^ZW(AM{xfp95c__*
zQGyy(w9qp3@{WmsG1!tLjWY^_cN9|a>eb1=^cOeHU$3U%mG*-pxX{w$Mn<7z)&ga}
zNnZ-W5k4=!xF`ZUA_ApSUgd6_y;2fMP=LyBt(@ck_*Pu-q$I3x-1Yeb$G_Tf^hV;G
z7^9)z`Yu*-BRSLd)g+&FBfD`qM_<2N)CPYPLR!HiH+W?Kw!zPpct<wmn3s1K(_twn
zFq``OT<5n-B)*8_y!r4g2mS+dn{?@GbUPsCMjNYsUg)P_NnpyR7;ZF&EP5EWANzQp
zi4$)nD4r$|m7b`_jV`=N-Y$Gfd6gQi4>B{HBe1T{>wR*uT3k#xg)xf%Aanwu&`>Jv
z3*fCjzAVnm&v!=O;*vDTO-oc(vu1tVq{ld7_7gbMjKEZ{)O?cRsG!byc)*`J1eEcO
zg+*oV0jWi#pQq-dZn_;BME+{giB=w7cZYM7XT}nyjh?wbzM>2|SyJ8FXt{Yp(8;(i
zI-1EjT)wNmj>c;VRL{F<zOMoJ&W&Xo!dG%p^2_^Am|&1K4;@2bw)tP2mB>s_YUTbl
zZ&^5lfXZh-C3L8yJiabO&-YGi|Ai>xr50e(P;~^8Ml7VZY(lJmTHfurvye>7H7hxq
zAg405o^;zjFLvs=Dnjo>Ahm|i#Az-gG~rK3l%r<l*}6<9X$l?&Pg;rrZa8&{_$!<n
z@~2Teagi#JWvkr6LAb~N$JKkl^}PT6|3#G0AcV9y>DVe;S|3^2l!K6wWJTFhNeg9k
zjAWH9N~p+a%F0Lxk&+!LWJUe&&komh{r=~6uIqNbXVmBOe!s@^`B={-b2Peo%4c9C
zSVi9CPtfcO0U8^%XtCCQ9|gT1|N2J;VhRodwNF5rf4eoOFHvwZ$PRB}$ID-{&2K&4
zICoKC@x$!$bMM(1TwG=Zsj}BGpQ1OT$mVP{#Ig}!YeW1XnzjoCVbgwjyidYZ6-DkX
zXLp0T{y!g*pYFz#AgnoD69~JdG5cT#uSO*)dK&t?8Tm!ACKI?c;FSGlZ#^33erMD8
z2=cD3N|+4T|G5>L5C3SS7TAe52cwwn>Y3u)R>gnA|Lwfj4&3<o2gfRNKIUOWV7|>%
zX#1zLQjYl%UJ7G-;plk1fn)PR?YX*8rN+dR?2UDTW?#PY$B@a>lE!s_Lg{s}AlT2J
z^GW;TxvJ7BAl)+G5Z<v#lY7U1m3i20-oL6MXOyXqOtnGgFA{5#91;^T%f_|TdQpu;
zmoitR5;bG!|NXa&`1Q^7?1)&1I-$$K#>vgn`AT+nCgX{?O=5<`+>P{`w%MNF#1$(!
zvTU|1_ikufe%7K@#MxF(RZ~Xf10rsYrt_(s&p^Ex{UPvdsmBCSMo21sAl~u)!)rZ4
zwfkxJsvomtb_F-zGSFW!uOhcU?l^?D13?WVv}#j?j33`(IQLw`NVUQfIWjrQJ>f^6
zxD#_%gzjQkFnp8*M#8F^vF1>}&>{&K0Fqj}%ks}|SeDEC?P`o_34-;Cw+l07GeB{N
zali0g41N}>awN#SI}$}C>YDPm1G@FU7<j?rZ!?H=KM4_%v1{}u(&_$qy>Hi~Pvka1
zDj{sW5<PRoh>d-7RP1+eXJp!o?jqEneG<9(`wymCYV-Ob6e93>2d><P%hpv#1)^!=
zZ;k7)wdF^LPXCu&>9YB#k9{p@KY?;cL2q<n(k+S>y!<62?wM*eFoDMbaIKZs=)|+7
zC4F#FTS(Oj`u6j!Q_9=1P0BpdT(&&UyGr_%bnh@1(v1BU&{4k^>>XIGiR6-GCZ^nq
z{g?By9Q_3;FI4%Uf)Y)<c(Dm-@2vYzOf+fu_ohejnz(iMW9;B4mB9sP-<CxM>DP*_
z=&%awE>b$NCo>Cc5>jW{liTiSkxow`I*8E=2p1(IiJ7EcsZDNjBTMEO$C*GX0Cr+F
zM<>2{8xXI^C~<<~1B`F!{&dx;1!JBK*{tAb5;FiOUONMypIym$;Gn$T<SlX+e7vQ}
zC7p<K%RTg-<4D>p`d}G_3eXC&rg_HkA0Ihd2>7C?_EZvdnxg@oMeI-S0b365yZnyg
z{~ugg+ooQ`TmbON8OnJkSMuj?XuC{O!B`!D!?C*?GAYajOB`ag`cVo)-KkV9{Al}8
zqc$(CTdK;@B}uxJ-|*|ImRA4IePz7n#sp1V@uaizJ|y3p;}?B~&&Y`r9VoP6?k>Ej
ztgaVm0z1pk6`ua$w`cWvMoI)Fr4(&hkkOor>4xjMhb}7*!8u|vr{njoiXD7FLIuSB
zNhu%`5~P1rGt%&TDFqh*CN+rHxk&@|OuSsZB#;mePH0Jypv_~@;c)U0DP_S_z{mtn
zX>vGYx2NF(a;JO6#xB411*yy>F?q=Ylwns$dATbDGUJ5r+;&hSN<7(vXcWqsgN_hG
zB+zt5@NQ7|P@gkETKK1n9vy<@Uzk5PNz}3^MEFPVyV}FNNw5Hzn83-~IyvKMsfUIY
z!3Jd-ot}7qUA*UB=0Db0IwwEtX=Fu16V5y>EyEA8#^JY%67nV$L)N{bNn?i)Y{-Qw
z>k0K8iqZQ;-?)+?uCJH8%~mhqT-!Ejc?h%?L=RUt;46g)c~IeX?ny?cJj&l>l9)In
zZ=9T+4kykqC>hl#i27*loAZ)HCy4-$3x|gxzopdkvd@4~w;X&CjGyGn6UhNlBrS#i
z<90e9*=nn`;kRm3j#BQD#2Ay<Wc5uh9NmrGyoalG9sS!`Y^a;17V~gI5aphXhvj!5
zfTpYrKtd@}WIJY|^F@P>HH<-HgkVtS0LG<M?EtBUA0W3zq)yG7FFTcS?gh?5wuXof
zoSmQC(0JA7(#VL123<!Zx8TAQsTdF@-V#A*rBWX^kGF-7_?%M-N`o!!Z!CXQ#c(1?
zmjL8MYs5e<oHpqwahk45uB-|Tj!EJHio2F3t3M?_?pn!x?Ay0@hmv>ivT4#KHW8kh
z>VU8V$d;_jmk(V&IvN_C2!L$<@9`mB7*0tTw6t{IeTGmC{w)&{mxipHisA*Sx(IvX
z5_5AOeCbh@mg;h3rT0q8vqI28q8P#;fC$ay0^^-8`lr=?0I~B+y8;T7E?&W%{PU|%
zbq^vX=&HnzR#CAfz@1p!ZtgnFYMFs6T!7ZDJ&*1C_3PJ)(~U&h$P_`!kZZ3-=U@B0
z$=$sEl{`)A)i#5kp?T`jr%hVJOu7<8@09S&QEC$IFkiVJ-k)V=Tc*4={y&GhuF<5e
z^PNE>5#iw0Yj9?(+CL|zD9dnyus)@!moL9E8DOd_7ccGp6*vawNQjINY>mhHm+PUg
z{xnMT^UNGRx$F5rTbQk+d_<MI94i(DWVnCf+I@-M@&NQ9N0e{%kGSd<CvMD|%(7&9
zw$Y5%`4jXU_g*-Dc=65;=%g;DZCdm=76AYX)TeJ22le*;_3OuK=7r$UzWjocjvCl|
z)yot4?=b|IQJPIX^-x4$1FU|G+}30#&Kz@Qu3k(8wHq`jWV=Vt^$}J+_{Xx-z=|AW
zD!n@EJl4}_)jLzBUz|M@KgG@XW}82r`6=5AwaoHwu59+Wx8w#eykP--)pZ|L#sy)y
zJ8~e(y306(0_D&FblqRUhUn;E=<xMv<#bL+dGJRIa4W_DB$R`C@j35I=b-nC&+Syh
zK<GWJkeq`1vCaT56GTcO9wdxR$e#*=J?hj-Gjxd`Cx$8zSxRK4Oxa4_)|->9<E1@&
zOwtY|=;6a|nGZK^Xu}ajvEdqI^tv`(AiK!ORb#>b;h`JY#Ww3+$IgK9ykg8V8CvVx
zu=bl&TDRmpkbFd-5e6aXt*D*3;Md$)vzm{np#2nK$mOZ=TIgpWiaVC)b6@;(?Dg!<
z0|y)^G{4n1W7-Bwd+r;Vpd$jt`Bw)I@|)5I7sMHy#VO2ZdtP!+CQYU3FKLsX-5d5t
z6eLvH>t=fSO;IC+YiW^(SN$%Z_pJWI`_%HAqJpD)UcEHiF6RlDEv+!s)n&J^Z(}E(
zwe|M)PRS#TEK{`Ot8j0e<nc^6I6if1JZr6TZah#VcVt%?Kujg&^Rw81sn~-)$5Zke
zVnZSRN9ZpGc=^#pS$)4c{87cVcklYN9-FNqGuc@B;+Y840z<ORK3_*`{@;JBQ~#|D
z|26w&zO?SD+koyHCxrFfx$iq}`ZjO2d7q6(#~`4P_zD7V*XFp@L{`Bt`Zod9*KLDL
zTHY{@T-c6sqzwyMHD1Yc@@GF7SQ7E(`SUe3(M6<NQD0!}EohXth=-}ArRB?15AKjg
zjkuVN)}DxH^2gE<Snt_^AsobkO=)f-Z*-4tH|e-ziElgeqDQ|v&_BymSCkX#wD?YD
zAuD!Sck#jwCmKv5c@XC;gG;9ZYPEHCzTJa)NgH~0^#m?tyHGxJz48}K{n8qWuUtwS
zpk<0J7lmcglfnP@0tgJw42HINtoP_q#iWZXBY4F8?D-4FJY~f@Cq}e79J(JM5Xel%
z*r8;xSv$gz3XzwJg(yWS*12-b(cPnO%#CZ@`o*$E5YcQGZtp8n`n4HE+1C}Jy3r0b
z6+4y7iUuyd`gr|6I`~<A>aW^QW6J>5%;{!DFJ3U9{Q3=zLHv%<wD2<`|6x?u2lf8w
z*<|#|!#T_ws@gheX`eNh)82+0a*R=(Vacj1<FS)wQ{nzF@z#DC%9wkP+LwFwl*%0J
zSdenTi9x<OnbwB1sFkMGreD8F!4bbCcLd3m3hquSX_4jJjX&RQ_nQA%(KZ``x5dVa
zd6H6%VD5jZ_@BP>P^p#HCF|kC=Arhrf9|<4$48c5=L?@3$xI<MRCbgQuG@Pw7;xzx
z*eK^gS@Dm0i^HmtTL%cR$52A5A&PTufAROFX+<ABs($XtO~mnlb5@{U&&e8;Tc@p+
zmCx<Aj`jcd(ymc9PuVEA+wp69-B0~u(io@OFjR6R*Cr{Z-hi63G*^u=<DPGz`N$-p
zmE)<_Aw)KC>O!2i0m-l4aG}qJc`^NRRjb^#ogY-NWI|%`UyGUeI%vn5zuIgJnA2gd
zUsO=jVJ^vf+jHyPZaOeRX~y5RPag<~uzv7JXP}$fnh}dl+L**#eU>?-QQV+++m|LK
zloUKQ`0i|UeRIc$YiC@)^-8bRkgt=Rz2^?ux@b@3#ixX4gYd?$Q_tyn`^l3~ya#z#
z-7NZb%|`xWY^-kH*VI;_)0<{Dw)-iQ@L6{PmNzgDxndB0+$`6-AR7qDuG2OOxmL$+
zwrQb?SiKgdTXuGKy#*g1Ouy76oT4e%)ZxnXlgiC*MhsHuv{-mn&n%(=M-F&Jl>W`_
zn~xsXb#1h%&q$q3J9ey<cdlgOJi`@4y;JgQV>~dLP#GJqzj#1#nBfq;pk`>m&S7og
zV@<O4UWDg7-##Nkev4Zp7hitCc#dkcEa_aOIVj2B7Oii7<i;^5{+$sD3t!W<)I*q#
z7X3QDeU4;vNusb^2q4q(p09?{z2h_+p&9CS$9>+sEN&4)wOXAnd>-ZK8b-*3iC6-9
z?qX0;*52w4&CMG_L~G~lf9BPDS<}AS&LgNbAZK>|>ofm>>(<~v<6W=2G|$Ms7voe+
zOPwWGBCYdY+HsOZBfUN}Ruo@FkCIbRpfOS>9`eRiXYB~hS?SlVUQH8$A+3iVV=sB&
z@&&K-+1>x`9yrl`=n?N`7)EztTDIY~>uu-GFiGuxN@20xbS?L`R{sTe5+dp-`KS4c
zosc}=z<cfi=<2ac_5ItYsiN3s$&wz@tMNqy?&5g<>0lZ5=Z)L_)at&`!l#uI)l#BW
zo9OyzUNynasDr~h-SzCLcPzV2w9WprkK20AF!8w9+mrqasWPFuS#)^>)z%%>*hZ%o
zy;2&5jL}+SdCaC;c5YeFciQ$rs8|#_2lxBitKiiFK5ZvoWRUH?9K-J2n*sp<v)8PO
zN7*0uYa4%L0>scq*BlC0E_w+!b2~r3K0VOQo5N&|KzR4Zf2j+w-j{ij9zFGt4W*H2
z*x>BW1rL{IJ=KtRqa9#5tE#tgbbJ>k+RDEtGmit%iEeN+;Cv_6%a(oy$&N|p)xYPM
z4^b2hZ2RDB(Dv|N=O<9Kf{m5-ep5W(r?O>?c}UiX2@_gYp=`<@YxIlJRE<Eq-p#YP
z`RB<s{#n*vVbKpVSsWQmVGl@~qMOrC5{Q}3P)OTFxZKLFMjAhds6Knh0_l4;DPmXl
zmI$T2_uLmi#zvn$t;6QvFsRWZfByP4Xt|rh<rW#&96PBjl_Ia+5_>$+=U2rDQ=KUN
z77hJfOyqlx)SY@_kisG!l3A|9aVe`T(fzC1XxUoscNyf@K0mwBrp|XlG#VGFgw~%$
z7s04n)ro#DL7cBGKBbse$dmKh)mOad*gy?n=%e(Te)s$H!xw0F?KP8M6xw8@j<a73
z3vK*yBYpmN+hf^!?~+Smr72kh3KxCuTuW5*1`=aZRXxm0*=xy?V!E>oWb7*-tu$C|
zGOfESVV%3x`r4S<St`jvW2~<6#!gq$S4%szp^a9dN6(3#{RgYQdcRl<rw|TNW5TL#
z4;eM;&YGULUrw^T#(IAq=IOeed9!&@F){T8jFQ=Ecn2Gh8P=my%Fl&YUL3PCRu4qe
zEs3Q|F~5l{$Z6wue}kXCai{97`Tgy-N2sZu4++j{eJHoyq1-n|&Hi0I_)W5*TC9Kh
zYQw$z_SI&xy}gf*6}-GZJsAiA_d_i<<~K&rt|e$dJG4Iw*xbA^T`wlv#?PiiR_=^o
zpBT@l@^5c$`55WK>$h*ykqwn|ghca!1xH(e7f-(X-Hg^dP9&c@M`mysqKNv)QsnnF
z?J&+)@(65R*13KEejwOXgJ{!dM`L1b@dJ<8Ftp2`GdkMT>~@>zLD%1S*_wpcS?PN;
zDXAGYOi2TSr0m_N4_a1YR#dRKvpu~QEoykCbe)5VsI+4*e0iiD6u}vLhAqYNB2e1Q
zNaIZWjSkx1jd#5>{MB(<#g`?}Ws?4bsjX%nJ90JxDty6lR>BuB66SgJ7lkrMjm&hC
zh?AmIxB6xH)mDX?_H8-;ytdFj=yt7)s^WIIfl8K@WoTv;c`!DxKAYa=iB*Q^nT?~n
z6umc8op7^JsKbB(s?f+vhCqVjk9W^(tv<Q*-bSUmWBM)6zv+GUmACR<tgdW8F#P&%
zdJ->OPUu@yLc+b0<!2{q92oUReK5o9jKZ((9n+5Dx9-F|_XLgzn>lmlSWolVwt4ev
zjNaavUkzsO_H;EnY-qgX&eK^3M%811e~9<OL3oKN<AR+S_UY4-H<BfR!!()`7he5~
zgz<g=rF(PV*jkQ>UT}mNAYd`*YA>PMpvo0)L~S*5HOwC4w>*QR6e-Tl@Y<cUZ~^1z
zW(h%%PQj%ri$wltF^n%vTD0;`%^v&xU70l?Z*S+YL+!VhvAEW|-8@%NFo`4C>08%F
z=00RV(ERAA_qYD~sp;qSfh)E*E+x8dt)PZ1Okl8);{qcJb{mu)AN%V<ip~_s3%Yxo
z<BqBpVQdI7^Tp;J9`pUdlP8u_&V_|(I33vkoIXGHek}IKAn=iTNsku}u`VVCsz0Xi
zcZ<)JBpe)JI?-kZif=NdnE4$70GC$*HHTad?#YEJ6X{ih*9$W0@j6?q^}3@PljI!<
zH7f`RVsth9p_jN*PoNr;$?DrCF02QNB(+!A+v&}t%S-uK;E9tY9v40O*{U#VlOo_n
z+}Z<(GzJO@1cbEK>CZhXG`LXXqK8SmG3{_vl+o+9j^@8Ng`HW%oI6f405>n7hsfCb
zLezGX2*Xr%S<`u&^Wx@mEuj=t`TRf>fAPrgDUso}HvI4%je-r-Y5nKVpM6VTHE$H^
z2nvnDk}<Tkk|)of-+B3p6>JrMGyLmy{L{2NQhPq`DT%>Po-`!j3lmf!#KvWy0s%C6
zX-Xbr3kKy4XjBloZBJ0rtDqIL?-fCn7Q$S|hSvq-^Pq1jT=BGWl6A3U0&i@xsdRts
zqSCf%xCaSL9~txxWieN$3K-`^zv;`DC$~Smzv!&f4Kq1p#zpk%%XQCsZUx<#ijE+V
z&<>Gnr3M+S3MS_U>be<%C7?1JN5`f}_%?3Y((QTW6e<Bm=_#%3(e~5yp1nd%4&vub
ze>G?)2N^u4(#<_ePWX4WZr!T9$NPZRC(fR9*kOooE0f}b_1FEm9wM{-4q{H?<tu$?
z;s%OB1X=Ti2Cy!x0($QjLK1kYcVXps5%r<KYhBpo6Ka7BPKgqyfzLN!Un&B~u;)Lp
z{@OCud*)v3?B$3yrb62`TPYI~Erc$}n|nkZzH39z^DR4DZLqe@EaLfmuBeD2+_<n3
zSgCclZi_}t_~+}<fgz4D0Xc4ZT8CHk>1uTQm8lYRlMxh8VP(#e#}n$Ddb>9G@gUFS
z_B!fSp=G(G)ph|ZM3r{Ci96e}F|i%pfRxZ@iHwN+4YZ9M1o=(eiTZ>yurUc2%3Fr_
zikga2DsRP+&sA?bs)dGxkV|fG`muLPf%vy(^r){s(bMzO-doY7W4DxY%&U4m7Z<kZ
zr}*p*A9d&Epui$(X-!PK(K`LzELtGLo<=rUgRXXM_$@fu&Gj@?!4vgAH#VVMuBmM5
z?RK`j4b|eDwkFB(->U7sJWW0?_fFl=s#&ujDsbNquZBR0r{42X($d!UgG;wD8o5_7
za?O)@5RaL`<6rVkP>^mg)%`+TPrJhVDXHH*9F99y&YAGnja<|B`9&T`U#vTAJ2z?a
z`}gnN=FeC4)ec%xTI><`S?|cEs=k3MR;idJqEwm4wh~F)Qaxmamh9*kN6Y>_(Qe_h
z1@q0X?_C{y-erbsVPCr&7gl_YCJh+?95!g-+->3E{z&z%yjTbjf!x%ZXZqJ)bz~w6
zPT&C2I11%N^5tA@h;qZf-vInj*c&v)mtLGaAF?TMXWr<Tdm7CHlUsJ`*=m^S(czb;
zZ6}Z}boXxklwk|szj<Th<g_hsQ9=5kS!a`~|GJRSd{Nv{ue^s3L#QG1<_2Z>iL^U)
zrYgrW-k0AuI|w`thcIBE%N{xXn-y&y(#a*bIirfSB~Vx~QZZ-FyTgaZmuLQNaMpLZ
z{qttGyovs^W6zqj+3*YCU0JFi+m8=QV^gY@ul%s$Y0E|D25<pmYH_OLyq8)|b=R<F
zr@NR__i|z@Kkba%4aD-xvW+iMBg=uu^{ir(N)b&KS}2b>&SfFKrr`8GUl*KlS{`<H
zppSl!#~!souu49C@9Ox;idj!kXr-&IG@Qppn`)?;Hy@+5sm~F!a8K8;ojUI1&=sM2
z6<iFV)OyflaL?qI4OEmaO!}Z+8#$!};~|$XL<ys(<vRG&wI1EN+357=C{80Sg3=DV
zn`&X7%og>)M7s5D^**uSjMJX5Ek)^%M^N9`^0X(XWB@`aM@66Go{Ot2OX#bGp7;!s
zAl03UCDdTaFNef2V{quu(AmSX5kasiHIWx8U`DQ8yM}uryH{5`hnRcDZQH(D{eI7X
zM~wE!?a+jFh<gtQ`N6vCa6!%DFF80|$T{klNBYsPrLc&SLr<!j%ai0qXWkr)EMrvc
z2OG<b6=xQ#-%|eZ=iaikYYxUA?REM)Jupg$cu`NYbq|hcg0GtJDE;%MoH}yk>b*No
z2Syon?W#@JE34qZwn=XjI(kC@s@PbQN|x>V$#-zCBk4D%#-AC#W_e=$Ew&><;*{?4
zKWjPpfbw>@ZE$OUk1>nf92~NJ9e??iEwbHbRXs$*>8}Yy{tLwrJFGPZz@>N$7NE{>
z^8I_;^-Wv0$VK=J+fe3T+a`+ae79DQkUvFU<5B~yf1(EI^t91348Qtqym1R7X{hbk
zT4l@XcV~39<b<=jxG|$ZEoen)bssR1MFtydx(|n@S5`up+0S5zyLP)ek)`i#i9wmt
zJ<>VJa>Dw-p2o}T?z9{0z5M3Bo;$6RH2<~gI31Rg`jKI6zTLl91qQbWb$t+Pu>3+n
z$;yM9qmJ5nCum;09VpU`gxW{VQgC4O{q;E)GY`Xx(isXJj2uwpjpPR@zijgd49GrX
z;Gp3+#5DS9s%7@`k|0wLsevZ%|2pPo#ImA+Y4nuNZrt^F)9S8?gbi(7Rh@MzVETdu
z@lU^Qd9-bLzr>_vP5#>b$@E91ZZnzI<JJ4g-Vwu&x(*#YI2}Wiq*B)f4>h>GCG)A)
zNcZD?{zV4Fxe@lWWrPx1{6JXk`k2;YmBQQVHu_U}W@s)~w9dHZ&|)}l-g4ZCAlk*M
zR^OCi>n~S)%JIDE5ExO@bcA*EPb=Ipt?{=`kMFu!)NWUpwXknNyT(CBRd$&ohz%l4
zU>c0v>v!)m5e9XIW)WRbl>VbNFRC8@wfWB*H9uz3x<zwd=eFz4TOu}>>lu#x=I-W}
z3g88+z(6kbeL1?DYv1<s&@}&NQGZ2&$Ia~98`FoX_oD}foQS5%r`FU`ak0McO#ig(
zu&TAR*NU{X)BrcHaS?+odXL&(&v&eo!a}LN`QH=!+uKJUI;4sb3xH(x#7;q%rZz9z
zJy5al^6z7;!GfDs(W%So1Mv8idUi({-cv73YeR5wR(;l55toFz%n;<i4-V#6sStAi
zZt;r4LH0V1f_K#Xp;kM?;mvfVx{-=1&p2Cc6=g~ZKBW<E%+O}XviEy5Y1y(q73P;O
zi}Mym-l%)-iAJb><F`fHp%JB}^T23rb^1f+8#AN*@SX$zbJscks~sAa`{sE`a8@X(
z2-rUAY6R7WgWq`l@)FOEBXv~QL`0jW&*?7pU$c=qlxJvUa`N&x+G`zmZ28tjJ9IE8
zJ&5t6{Cvqqz2>mvG&P${2uXe8^=#{ed6K!5_1~ii4$SHjXzDR)#=(i53~on9#I;o?
zb2xZW$^C%ky2|BExq{Emo<BcmVkfNiwbA@{_5SgBgl34_n8z6og+KLn@U6Q?cFb(u
zy2S)%XJ?5ycS*{Zpsu~cmziH2UV^!}rtEKB@G^V)#zD`mzWO@~u(s>&Mdwc2z|@MV
z4ZqPPF-+~+ck@}cLk14CroQ6C%U8>N!|U}@{<D-Ck9uv~%3-TU3++ul=L1-!R*tTX
zaz=33+eTTVH=b_RT)1YLWz-svrk$llwD!6U8<vcf7#@^Mz+Qo$6*gIP)$)gny-9f2
z%r-5Q|ISvq*r&N~nh8JlMg(DWwmoT8I<r?t>hOIq0oPUnd07f!DshI~;d|)`GF$(p
zK9SGLvtGV@IYulxFs!{O0^}EWZGy7~(MTHYAM^fRSI+#63c2b|Zxp)TP>s8{L6;Ew
zE0#<>SEiOU(rXCLaHskZFP!8Ca#8XodNvfcfoQPSJ$f{wc0!e<Rz5JK)}g!-9uLuR
z{7S0v8@yyMoHjBsFF0ClUyj<@zGp&R1bg8RQ-`IIuQu@Is`p=WI6JXlVM2yS_!TR`
zx1t7aiNJ}q+ECOL_|v*0fD3|OQ$m=cR-qMQ52I9<^;YC@!Ytf`^T0PIHd$S2bg<sr
zR(5|Nq0tS@LTVMyz3EUlg8kt0RJ;GCExaD%gorKe7&T~(J74f~;@4RDGU47v`pBTc
zI$bVI(o7iNFepP=X{UMk>snL(dq@h!8ZepRbq&?^+Ot}keeE*(P%bw#HI#%n=Q%HF
zMXOGG9w?LIkTcY;@Cd1ez?_QFuc~Sl3ISGzv;55ep1>KWf8%$oH|@fAdE=$Av{gv-
zmApoq_9#Yr-RKj!bLV=*FPwTV*6M~pG13Gk$HkSl?KUvm^;AjnT~|k){`^Sj=4m`9
zfoNrzJ2_qrVOybK%g3#XRP2M=_h;@LNYJ^XX3bq2)oJ$MnQLsk<Fks5N1d(>y3Fe9
zHhXpk=$P`M+>rKh=NnP)6<3{Y&Zm~qHyDoSEESU{8{ChZ{Gs<C>*d~8Uo6a9^fkY<
zna_vfi=j#Hb<_h&qQ|tmFv-CE!o-^gU3GLpbKq|Z8495Kn1jrt_paEu))z`I8r<^M
zNSLH<;N9`l`8{LC_Ig~Wtltek)3vWlO6=%|1m}pN)Ilc|yU|k|Nr+4cEZrb&9dF2w
z+*22qqU}|s!S;jW@>(Mm5;QyieQchkb{)+>*GZNmZ(Lg??cT?*xvqRYh!h$-HiHDm
z@xaKqShIXwKQu$xd#nt^FhMq|L>JRaD3$fnJU+8$Nz6IxE~OtHUP$%KkGfx}ZgZhd
zbivjYb>q|n(q|jCyi#HPwd>t4c60lTdGK!b!*Od$M<wjlAwKzwjo;G|MY~gyF4Xbc
zZFJztyd?o!MpZ<wNNsx|<;>Hq7gfJ1-Tw4rM9I$!w><JS5*~kAwT}bk1j=4$XhgE&
zwfgoveE399b^8GW<jqQ=G;d`!7BEsYo{`o<NN4(CgFUZpb4x1ZHhDNF(80vtPz`-3
zz~RVoE`Qc*_-?Ng|MU*-yg;*CaMtEaQ*F4~DWJ~G%VTuuKWKy0&<n7w5jWyJksn3l
zX1{O2@$idpy?WCa<Sl~CDLkh>Yko7pId@gdyWLkCS_3!GADOmmC;jMvUzR@_FV$G`
z#M5f`^-iBF;tLjcSZ{Kr<&DKu{_}07m=5ww(iyhm+`y9O3wxoE^hHa~sfUc0Kk2se
z=7!oA(^Q03n$ge`t_G%oE=ZLF2N+E**vf~vFek2fpMa+{6%^vZysaiuRfq!*9BS}^
zJ7c4UD$+9k5FShQN9bv)<_4Rnv)w}1^Jw^1hHB(iY4z83Pk7X%)0djA^>Rgv@YMru
zm87Q5tF`-j;P_W(8SN)BBq&6i@A^URdj`E4<hR{xQ@b(0F0^{>os~TJp}F3_M@JrL
zJmqm-!-V52{)&mqPy6<E_+KaE;x2AIJNfa1UMtSMpSz2ji5r^!0dG=}6g;F3#_oxq
z*y%FgOt?yPGAEyO7;4J&OJ>T^lkx1Q&M?WTlluEhWH0S=W{AQr{OHBU6WR=pQq%t4
zl+F^*?Xq3woj11Yd1US(i@fcXb8pSKxOK>J`zOvAkEd1a)773_v~`8)OZ~8A_0s~!
zmp+_yKmX!@a}LQ$7bpDqF~0kb{AqI!yWcIkSoh@I6BomrAcXMuflM?%eG>s1t4%VW
z3s!y_nizSY9)TyK3gjAy4e!Y0#M%88M_-LTo4QkH*dHyx+2+xv80+lf@>CZ+{mqr6
zUs-D9+uveRyjgfwB$4ide^9#MGAaJ@;oPVrYk&qSRt3(^?{#<K*F}v-8m8rqD6wB&
zJ*8u7^+0-3@(uNvl99$lhKIcB^~jL8BLN#&HXL{mu6q~NGu)vj%B7xq8nQAZk9I_c
zoR&3L=xWu*A@(=Tv;SRlv{tYw5TdZ>^baT&A6RKVnLK;i6Wf1-vpD2KA!hVo&@zC}
zWnIZHV^IRvrM>Yo%icd|7Jxo@I~}$RqVZ3jf4^r9@dWeqs(p4F^(I_DSiGhbr3Vna
z^y<^*V3SZ1Udb0#0W4F1B+G>$z%oYnh84GiR}LOH&=->g25ls*&a_x{RR5e?nf?U|
zzfI{5v-Hn|Jc`}SEvYBn5z}dGQO?nT33w;w=o<d6#U>}=+d@K=J*rM^M|J*bi)rxh
zT9E6t?)E!<rz$M^&fh|D_AwKrKUxcdEEBH;K_da+$0*^7QH|My9)KG_*J(F24u}xb
z9LPByL%A-r|M$A+o8hoqjnY$RJW$cNWUs#&leR&yE}o$Rr72g<CiIo^W3&tmG*J)(
z*eek;Hg)@;@SAJ}ReF9Q!d|xx5a^6Q)Xf{X1lM{s`=&#LL)Keh=x8k}5J6KA-~@n|
z(@?pfCy`E2W<k<O1TGI&bRvTnSr$DO1rYlh5da5Y&i3(1^Pben@A2xtA8s5yXag6%
zLQ%YdQM|ZiParwre-gP-59gzmq<>#@$`!>EDdBaLUVr#NIi#cbvy9iw?ADwg^>I(`
zn>BEkraJzUhVY;3h}%}|8p4qRG~>j527!|aRv;yoGDT=&#9!*2_<uEDv2}w<!S-1b
zW=Q40H)fJo==DzL8nEdJr?i0qBXX#8hn0-Xy=&K|R+paH(`T5BETgumsp*4FEGK)q
zHVRmK&%Hk`b_gxnMmtMA<LHej=rUN8&hK6GA3cKCF*xz0Luh->6dsQfHLA{>KVL;F
zLpM70+Ya;5CgJ533ZtC6ch@k9q7iXYrT~QXgsEy{9Id^e_pWbv(Sm;VVaE6E++6eN
zT_Uz^d*$AvzKqh4BsT^LxBA?*aj1YW!R^gkX{hcT_9~!7i<~uo`WS~U9!4#CIlA6i
zibrHE5iC=1!oyCuzR=blIu6ZOe*j)AdsX#K!mpNeM-y+25ft}ot;icM7qvX{<Xyua
zk9~>lf`KcRDK&I;e4hnIDRqof)*rm_jy}DN=e-x{S*zy3=YJPxItP2cGT2nTG9aQs
zcNkmyR|tS!mzHKBI4(8ce+1A2@BpREE~5#a%M&BH?k>U6mGgrp9xiHp7Mz%+_PxQa
zCmGpjDt*(l$yAoCx=guxK}zb(IIk%BO<M6qmKQ*hPM1AZ_~z}~7GEwTFbLy8;-k{{
zuKh=5Mlq$Dol3ls+4hXTT9BS7*NT`|gK>T>eUfD<(@)vNtA1HM%+Fs3HusEZk^WrI
z^mHRG57W{3-{xzhcIZy8!ZA@J$`TaCtL^uR7ai2PhW}s{Xop|FWw5&R{rigMW7_Gc
zE9ovSa40;t_>`c5PzAh2e@gY@$<;raft>sISsGn^%k@-|JXxMi7!+>Y*evQT&JjP(
zW~zw%_p5IBFOv}eF=MQWhyjvC$bM1lX|0?(Uy&tCC>cj)pVh0Tn*KSt+5~hx)}i&s
z4N36DKWB#&nFQE^b&Ews?8)R3dIl&f6pcf5UQ%Lln#Wb|8Jz#Ev1_eY`8Tk>>JYFJ
zH2%26G{|vD6R&EKD7qWu@JlgH$@FsH(jnnP#*ERydK-#^n5+mLTDtP&T`Lxmkk~Tu
z+MOZ~QV(lOYCd3=OeDloTG#uhVwwvwv~L8qtwZ84{Xx4*>%aYzYsf!i^<H(iEBqR(
zD6cm3xarWM2f(HW!mps^x(da<mj&*lkEVLHU2DvFs2KS<xj+V3;2&zlZs<uQ3u;aO
zjuR5k9DVdF!KZqaP^?5C;8Ofrxys@@hJ&aRC$y+y7~^0&X;Q)7x%<1Ef__AdqY*Ui
ziudkbo^?eG2dRO!)gO~b{cbCx>xCYw5R6$npk81nMeA2yy;a8u>O9f|wVKi^e}_<e
z^Vi;v>SlhQ=Jx*OZ9rb67+mGRB)|oygghC+#;Il|S3=R*K_QXHunwPt(#7^W)(e3-
zVuUY?S`f)e6K<g2q?W3!^lowG373~#yOPZ+>I8redxUazn)k_3{LlW5w++AIBg?-V
zhoQxCn(b&)4!ZQ&BM1z1&Z}L|T$wzedPBB$NCVl2)CK}rCxUS9^zxD_%1u~DEfwVt
zDb@YNg1``u#~&^qy4oLgCEPgY`h%qT29oMFm#~hKC6NzfV>{u7Vv?yoz>)G;oyJ4E
z|2f!#y~={N%ux81H!%NeWk()bXc*P7>Dqq&gJ)T4E!|#E2}@xHWDsqsTuP)IN^?e|
zx9W+NrDZEPUD}E?4#U;EcklMq-(#{3=0+E616hT_+xyqviqb6+83_?j7|10?OAdTS
zV;gns_<nV0q+5r_Ub#chyx2OS3lPL!pD=DZ9*RQ8<h=K|DKnbWxsP=ntn0?x3w<1*
zVGbuM3IddN&p4ixv__nJ2%ZA4jzu#$YtZu{>=~%d+Z>$OpKWe+y>-oDvWqLN7r=o;
znFuvOR}@>RJ#>u75z7EjrsPET-+0}>rPc2{^?$CtW`Pzn5611i<DL-KE?ecM>Siud
zf2v`8*-&civzJci9#U^1sPQ!jH)+dj0{6rsK8>^zH#axk69typdhR!~kr>sSUtA5i
zQ8SyK(rL;X+6XuE;K75vA0q!SQ6D)1qN4`697G+597v&~7<MRkZ29Zsqb|SkN{-!N
zkv+y#2QjgR=n2u%pa`S!{c&$zcOxTZd}DrmGAE~IC_-wjTDfE5{<3(mR6xku_uo`3
zXNB6Cgn!>@p6i|veg3>2+X}sk#6ob&Wx0P-H2xnnG0PexVYQ)Bs&DsG(DM%{HCp`j
z_2~7e`$O{Af4<NI)SuNUzisZ^ju1^e>%h4^Vi=ycXU_)0k#<_HXvziAdily7eSdEj
zDjBM!eJLqV6FT9xCxsg)VZZ!rNsQnul;jTnO&O(xgem@K(_j<5|0zbJ>gQbf?ruIr
z2me+R_zg}2S~iUe53VT(b<dwiveyR(>)xyMK&4352o_s$U$HP&l1QA97|Gm(2Ig1i
z1LU1S>d#bFUj~HlizP~bImIVQN^#_>XsF`O3gt}!NWiQji16$XG~B}IHyFaA7_OzF
zwz|t+v(f1u;kHrKKpci`WR^PD)Sa=G4hjoB8!c*!%a@)u1Yt6ZFkBlQQ*n@Nuzq>t
zyyq5oW_{qJDZ17^PJvYoB8IunC*Hf7o12qhiry=L<3dAiT*!G6;`(!^qeNcANq+8S
zV4aJRJ#r^RGR@HuE2V_%0V>F=<NC1rjibpY+;QvrDYI}JlYqfo*0_RvD@yYA;R2eQ
z5#lzE7hV%?LG(FTmtntt`ux5uq#T;#j^E$7{_1qIv#YQ1H1@b0{M=*SX%`bE7(f@Y
z;i=rPYxbu?MWoVm`niYnj%=jXph(o@{TXd@ILo@IGhh}3t~F8sA;5UydA)hOMCtJ3
zZCqY5!+|%T3|U2k>aeF@E21l=S#n$tX#Q3il1~ok1-k{pO^le~!6DH2>K+n2WM<{-
z*ROpMTwK`j{%MMVs4?VKq1V!(uE{8k;YMH>WF|Inpl94<0fhWW5Iw|~->5g$MlWF0
z^2R|vpRd{g1leaw%lXIMYA=EtbNT-9#nO`FNkm-wqh%!Pw-pi}AhR!f-#1aOBbziL
z;*EQ^>%Jj@(T*{|+H2YG*y|0rz^JL$OzZ><`Q_`^4Cs^6T&OvWcTN)?kvQ)-qtW+}
z-y3`T#;en7YiI#aSqEMj4ruy3gv;Ebpx$a<B14{a_W534nAKk&+N#w;HzI>7<P-sh
zr-Cgzo6@6TX>B>J>M|no>U0WxVWfm%{?TM@tylhokC^2;jd-K)T6dq>=tsBkEG+o^
zdZdKQaSRKP4_~(l$;i8R?`*zucK`xq;*b*1lU&6O%k^{$gjfK_6YvX=FcpNF4AWOf
zv->V7eeAm|?(6&fAif!r0!UR!edqf4nfV)VRam^CQw<I_iS9Q0e&KMRA0kOJTx-n1
zATyywxVH59rU8=K1kwjhDXQPvM>8My+BNJaj^He2YKU}`ss@)Jf6|j8Lo))f_d$B$
zA=)yOF>zl9e{Fj5;-l>3pooi!qzqB)cp6%@0B@g4-yQq)8&UyDSP`eyj~_qk&fW7L
z^LRQbsTdW8vSrhu!;lJ*m*_w2P=?cFZ?U8(y~=MoV88&)!`bgZ(2%ZE;}P{21QLu&
zLvs24eOhbe2<o5f-P+DMVKuzwt4d>GVtK?}5jCPg@8D~19n=FjZvGe7x0!WrpaS9X
zw#iu%42$?hDHTY({L$<<B1Tp45C5lyN~inCVbvricF(Ka)rV{cNs7mCSmnp5foT8d
z>j730lirnwhjYvBxTCBJnJO$uoY&&T67Wpr*?i&pQkw(A4t;9wI=FZ(r{U=v?6-1`
z8fzv-wYB<Q4^6_aAKw81gwD;Nhb^NxFJz0@Or2_s@oFu6m&{m8{X#8^$W)x>5I`Jb
zfybu)Rm+!2T>xyP@93iN^Sa^Jym4c5!|cA$v0Sar%1pPh0T0H^9h_1-x^UUM_{+%e
z7hLa{RG64Qum0Otrd4en{=3>ATbXuQX{5eBsrxCV$wPE{Lri1ltI^nT@Rpq&sHn|S
z5Su6cM_oCDibRk?EyO9I48}zRDD?W;_OZT{yc|n9>X-I<KY>JzH5?t6CwjSWqzSCS
zFT8jE{>-skNTAUKj6U}yjAC;ynMqPTE=YEA1*A(!EdRNp?Y8UaSi^vaiX{FU?brFh
zlLexM<oF|GG=q^ivV$n3<`kT7j*rhju}6zgfO~5^LrfZcpEvFH^X`zbk^^$#0!{rp
zYnl<Z08VQnk#KHcf;d6LOmaU~McM*}of;qki_-5ma)dWi0lyI}&#0JYpRWm3Cx5uC
zrEmSdVXkXPu$wU(RucdNMK3U*<g=<abN15y_nKemH~t35@&;Un>XN=mTA6PJ*&)gW
zON$$#(Lgs<3=5o2fFct&8zL?{s{Y~iYcl>1%+?>uLpij2xjGn?U0Cj%?wS)K-W2?}
zpWhuRV9jkIcsAcA71u@p?e3Man|jOejAP^CYe8mUro;G@jt^Pd=QHV3LI9X~BchtY
z`TirpXUAjmJ4QG<21?7((?f|O*3VFlC+BM$<$Gyr8n}_<wQ!|K`Uq8%bh9W|?Br_r
zAAP9H3yqvBOJlNi`frN+n`&emB?s#Z!P#gO1oc5c7v+OIVQ*{9wc=lpCub<fm6DSF
z(n4iogdWVU^8%0EhaKBSf292?rdtfS;rgl9uvzOxz<S|g9ApwKOMq=r!a@s<9*nd#
z8amW7axKVRFR)(9%f!2Hyw1GbG1~(eshBBn+>Wt}Pq8WVPRylUg0O*@!z<J6`B(ck
zjG-2ENFE30Csk@X?|xrSYLSO5LlV(0QCe^M_g|H>6T{em)M!D=Un?5d{nHj2wbGmV
z%~q!;K&*C-qS2M=6=o6HCFjp^e4w)cyrHqoq5yx>Z6aw1srZ+bPCe{9<!tVoOqf`X
zmN%c`@fVd?-n+2tSf3+fj|0VAnp&P5%Em#VAw3V5rQ}yijA9u=?`Z0x*9J<y{B6PD
zII8f^W^#Cel00eOWl`@|d_%nWBG?0<YjAEpW(Z>{auLyy)GKOg+F<5H&Yanndm0u<
zMtgIti*5f>hkygnXD+DU_~aTtE&`Fs)gNF~cZb3c_$1cRd*v@J?VP1fBSsXu|Gfgu
ztAxd&&ro+W0Pwh1H<9Tk)@7zL;CJ4I3Q^iou?gUvLVg{>zIsZegy^4qa;$QN`-&Bn
zCp?`OdwG598+C=qkHD7maC`Ul#ZDZ}E){dd?qx5Vb89Sl(C!+l`)NcC34(+2vWw?S
z2l75^Bl~Nmb;Wh&%+`3isi81_eA`&rrq<1%7w|+3qU%3n#$QbI@q+;dk(V3j+~W7?
z#MrUhLTM7EheeDLrQdzzZA+SYnIS9bKyhtz*8k6oPS*-_n|>bY`^}Vt-;#kv%3XmC
zWK;tN3kA8Tp)CdXUdchRD#<<+3LEDm|A_0VoJe-YyEOsuN0uQ<Skko3kZ4+Ksff5q
z`hTVqjF((?Dt<agvUmDQtp76+-RUTBo#j=!O_&gw*Jbo^$8)hsh!<bJ_-sOZN9EC4
zpAhJu95`(W7>FmuKq@k<&$%1t&YU)NsxU8tv64<?OP>seS>wr$j!sW-W;<X_7&&^h
zKV_`u_T2C*+o+7Vu+y>Sup4DA8{a|nJXE&SBA5dj!$+C2Yfyh34sy`(!Xc9iO9JWL
z?DMS=f7;eHuY&4~)Opx0oN|gzx`Keo;w{|j^q#@o#8;-9QA9|o{2Lxyh3%`@r~0dp
zgw22c@@2XA@bIhC&(KwCvtmZ-NM^3QC-7kDuXe8}?ImEuYQw^l*E?lQn13Mv-~v8J
z9Q0?Zew0MzxXF3W(-5$oRZ+Xvb&_WCst>bUKDQXd#gf;(?XaD?X4xuI0^^x|CbzJh
zMiUoYYSNRGJ%QhUk~U|Ln&?&~3#^#(K(1W|YRDuGn#-C3rFJeY6`%PT!%4sa^{blM
zhD)$FBM<kEIwCV9NJXQp;&d@afd&kUt<E2u%icaP@4HLLl^%cYlQm<fD_Y|gQV*D#
zx-(*>CH9Y{-B1pSA>qukwe8`Zg@6XSL?10f9%ROn$Gx#mMl5|qOMi=Fop65rni632
zS@e*E1GInxWA4#~3m0CaSp%>K?BReYy;wO<E=%-bu}bzN<4BoH<Emc64EvG8F^y9|
zgzXdWttUk%wi1jY!#zRcN597tUT)>D=OtBE{Ax=NLU|{$`*JRI9acHH0jZ*A<~mY)
z^5ltsDLIld_*3{ej?{qMHww-+akQTQbh8iAzX613AcT>+?&#5Wi97yELG=49dR-?L
zSuK){ME#nRqs&P~37NO@I5n~;0%4#3w+jtcZZLAn{&tN-x1?ws07C@<Ay^wYVJku3
zIP^@{aw3DJgQ)W;(ojD7GajdoNct(kgXmmZ(!p|P)+oNY@An-$)_@b1KOwlz6o>UW
z!Sa+~q=60&dKT3Y%x#_>cOj6vS{x=YMTiVV$2sLC?HBTM=qGwXnaQA<kaj`dV2+!T
z5(xpF2>tB#?(vSzP}kCHNeJ*NWac$i9`d`wh_X--Um0@tx=-5D3)0Y|F7F_Fku(Gp
z0#nJVl>j-a3r>6hQw${QkcLxHgckNtgooaBZ`$*z*;%?T=uhc1(9nTDLdtNV={(!i
zXlUR6*{(@Sfpg75erEJCpLKo~WV!69(3U+U81d(dVTwr*;$V%>7)97m!YgIy8g`RR
zG%bOg%I9ot>8z9TrHYClSNHr=#6Zs&kDD&;a_TnZID*;9v4+gF-`<AD4ndp!EM1b4
z0g_zCPb$O&Cm<i|Qb{7wHVx1h7#P?MVTbB3%j-98T=MyO*+&%nbg9^dl&fzH89MY5
zmFEWO(GEt=v&BHO+ox)o=)}JLD2Y?OvEm;TaReocC4%KI`3>}OGQt^5PLGX9VxVB?
z_zG^{y{mJ3soTPKrl!6*Zd{KG1FFf_1I)vi{`&p<{;W4jMZ3PEqC@_-Auj9UWz?ei
zWEYc>hb>43Ko`jBl7u|SrCupbwZHGj9x$dIwujYR!Xnh-bO%r*qw&Fo6-l%ET?3-l
z7s2>l_k@zFQT<;I<cdNh$>_oxB~OoV4?g3`K<grpyIo9sCYP=3dZnFvX6Bw!ziR+e
zvz_s46mildi+y$}4Vg#^#2O*1h7c{^Ps*iO@Fk#1pm<_gq`HvsO@Znp$pNZ`=-E&X
zG-R8I>Xxclwc6nzBiI<mDuK)XNV*sv4OB-t9g49J3egA<CNr*E^wstMHPXTx>~=Y5
zy7t(beH%}X>wwOb|0eM2bak*Gx79Sp{26-!xT=;w8!Rs8;}f<Fo^gA47#(=OuLK@a
zAoP0v>mGvuJr?cf?qRi-Gw@1__4|zg-4MjuIuhAI6hZkugI{XE(fp?%(wC*JM&W>2
z)&-PRKyrAeupyn!ssO59VSF^HWWuWuGgt=<cAtLhfX<{yaH7z-74up6Y#a{)d-v|$
z8wtC_K8?@}ce9-u`Hv7?(Orzjh7X$CwRxu2_gfRT*E{hDYoJNLzeT5quOK}7S|1Jp
zr-=*dFo<Y!QL~h9b?_F&FDSPES@kvLvd}zu@hBuAc%&od7O<lR!(kw405V-P0PvFf
zoM(*J5i7@!5k3<vl;wQjknXvpR1dd}vJypvz;RdxCVILz8aV|btfq0Z*FCQlx}E~k
zP)*EcfV6nbN#a}pfRgTJF%_U#0y%kd@mvg#7hdm3JipL}G9aE{&z8Wab_iu<S)d^y
z;kFe=Axl-Pv?7rGH8-cH$g07IU9d^RwNekO()4WHx>b&W)^y`Ae92S5wz1}+7r)*9
z;hzGstK-Z@-)A1Cd5Y9gM^yE#NYh7+h8nP`pJqs~y(thW3gs*&naEHzB<uyLIGTjX
zlO!EbfeiZWawaR)<K9jMFrXO=az3}1FfM(59-T?59O&dV6L;9@?$bu0Mzh{3-&*%M
zWna<|zSg{yUyDUcL@z5DL}-E6Lnq)@3ivVM<F!-QH&kBo{O*zT-Rk=D3Jz@WtU~Vl
z-QP~m{G$bsA=~yA)$yHwADh8$cfvIWGO9q|H);XreoM%|0MoUo4eGs3Gc=CA%B|`G
znk{W8r4}>If@S&gM1@@y0~X*z3(~d+kW<G@vtXh&3^3>5in+uZk;(y?b6|v>hvv8a
zB~_h1&Aeb`GxbE$qTajE8?jbnOA6+#h+P*FV#Cpg)k6Aj)Q<v>0d(I@F^S`HK)W=%
z!1-(b#jTo4T9)ZA{Ly&L@^c+`dA+W>mse4AU-e|i@ALov8h}3qN&2S@{TS{Aa4)3>
z?&jC0gxmJ75l^?pS!t}^o)<oj@owDRQh3gJlH5dYPI)o3jmEuuw_v`TD5bj)lP0w~
z)qs+(ID|-w<c-!ZUsB%cZ(mQ%JAujXho|<Oc)RCaZ$!a3aQXjw8bP<#rR1NTeBQhB
zo%fqoRxD{%*k@(SdAm?km6npXnt>k*CMQj+?~uT@-t#^;SorIom$O?cO{#78=9T%x
z?XTYhdCO7my(ndACx&+c%GTam>1j3p`p<@nCh7rs>vYO~oa&^TG6b#YsfP{FcRB8v
z@rRl0O3PDKe&2O#oB^A^OvXFW<YNM90!k&S#Bv&uX@G5jP1YF2L^Oy}LPo42$dV2V
z^`=0rqTa*vxXNnxOr=-<Txxb@`K=k0Nt7C{r<yliH{-!N=)LaNIx1;Re9t)Lot?Z*
zKmU_)`Te67x7V#Gd!ettY!W9S(xQzePcjZ#-#=>=`|3wjyT%9B(MGaRgaQQ})ol=H
z*e~4w`h<-o%e>Fz7udSYPa5E9^@flQZsmzC$tzBjcAWF|RdQU#Ze>zTJ|{cu%K_-X
zO0Gc?dUpQ8<Boj_do{VzAC2gdF}7MNQlL}&h$8Z?`=Y~p#@qbSAg%WZYZSc+QVJ^C
zbS3^Tn#X6zn)`Fv$U0{U;-jL>mo-7hFKP-(o&d@_Dq+dMso8_$$M>F7h%<`Q0{KmQ
zhtEBheohR1Wk!!|X#IHrfE6@y``e!hZI{)Al!)t&!#lR*&ikTvS8COpcD{~X!PeRt
zPKqCoOmy=GoGi#QYtj734yWXSV=tB_&F!A=`(k&VWu9h2TvTFtV&_>adFriNwOX}n
zd*RuMe9r@;fM3paqC|F2SZ=NMcG(l}hyI%Vf@Rnh3<z~zCN`0%F~feEdOY({t-*a%
zd=0`v>@!e2q&8OJ<){C{{)O1^19uVZbSY8N3y883_>a>@DtC|aOU?FS&8A^t=}Rxa
z@!x+f=BUl;9!W)S_p76Z2VJ0X$Bqrz|CAdQG4)Y&fekA_?{(MT?(uc&?$WX0yPN!L
z=lr;RmkZ8c6AdO;YIseURo%Lt-GJnIhu_zJ=(@kS-u+9b20N*LTRGRD9Y0m~ZE|Hj
z&C+JK8XK{pIyAk$SL^Pn<UTw~BE&PL$XnMbS&c#xcUAWGpABjp=qQdD0&7$4{J+G-
ze=?JBVmx$Uo;Pma9?Y>VGsJ30VpK9Xuv%~!F#8cvpDHE0L<QZs^DLqF$Q{!<_wN1Z
zRdJ}r?M<b9S`gKP%uV-RKijLGVfK^rem%MQ)H`hP<kIWs13WVVmiWypj_Ds)*?YcG
z?5n(}mnZbxo%ePe6*PNK@6DC#zbtCHo|{dDdv!p?imh4GA!M8V{aCaH59j#D`Vg1R
zzzdGk%`ujr$Nlr4TILT{)96_3R+PnYUBcrKu0&sD^kx0(FY~Jd(PGFfJ<0K<X}o>!
z-ixCrPzU2Ikx51Hm9<ti@OhQ-DKYf)oRc5BEs3uD88~Y8<Xa_GlLGS#Jcsq4yrF0|
z;=o@Kw-%ha+-lygq!D9(yuElKY0<^f9(SuA#|6E8rhjz#&Dn2f7u=0X+)``u{Odu<
zx7<2k9N%R{zWt6<xl_6zp@@o&)!@SAPek23$=zjpX8$>hn!mAITd6wOxW$gUTd!WY
z(P6R`D=~bX#c)^ZQfj|4qVe`JwKcUn?KxGg9J=%xwics|tC!Y|F0jqD&ij6<Aph~k
zX^R$UE(~fB6F2wg**#WP>#SGB`pqexdu%tuVET~uD+x?YC;iWL8Svvx&vt*BgiM81
z?q<_}Rcw@eZE9pk7H8?WrE{hoPYZ0=66{k%bl#kWsQEqE<QZ-jf%En%R&8vjI%`h7
z)k-xWZ|RSA&UzY`_b%cL>>hceWZ~pIel;?iz3+*kE~hw6CmV))6TO+{x>rhn-d_y_
zv%gF3g_Hjz-^gncU^{%cCLh>Ht=50;$!`<e7`L#aQUXL_-;y@ou|}f~HI+)-uT(kQ
zYtOe($c{fAIR3c)r_24QYy>q#b*GdHWRf1;zuTW*msuOF8XDW#9#``k1q>#;gYdYn
zl79@oA_}fO8x)FZ4oAjNE$fnICA1F_&a5D|wz$;;moHT){CSYfb!x=T7o|UVMAzT4
zu*SPp&-M~M51O-!t4#00$VeNMI;VTFQ_7Df@}rG5Z`!2HfMGJVC3VHuexILX3?$!T
zED;Kefow%lzdU->;%$^llRqCi{b;M`P1-7cpJo8t<rvbw(1LRJz=fe`nsYb^T5CUd
z1r6Mq<+?WA%4hzn(Wyl%>t<=yK}^hfekJnd%j(MoyMhLc-SeS1Wu{3)n{Z2`N`18<
zi`5KWT{;@840)n_=|I9NcjuVs$y?eDJkasWoudhl#!vLHnZ0hDcD%9a1hsl?HYyou
zoeU3M6S1a4_`_8`WqooZvfK1(Y|`B9?Bu*l{as&nxYf&i=GIR;UoFccn0LXusm3k3
zv9XP|^mh93-+`l03=3{MK6uWw<2R1^UT)lLH4B241n&hgzPIv>v^V)nLcWp`Nim0p
zXlQNnghlbZQhARFo$64`gj?YRPL1u_<-Zd7d7EQnw`gzbiBlwIC>{cVk<gP67EzWk
zQ1N^UQ=(ROHi;gY_VnpfqpR5%GNQ1-i0c)!aQlC+E;_AwgSw~Qni;>{Gwjf*!`aWv
z?ZO~igFsDt^=@En{G*x`C?u49odM1(8SYmPIt1wHT=mvx=Q<0Px+%P<NwFev+6>87
zXt;*0WHDZC+E4d{@rTY@hhMGnsvEa;*%4e6($qMg&o}VEsF-OrIP8LKtj`_Stm`~}
z!h~Hfmoy}JL4IPdt~0+E*;+jQ5tFU5bb%6Ye%#%!D=ZN4yPekRbP@G?%N);IxR!#*
zy`wyNU$yG$^qORcO^u7{HI0sPsp)Xc4&=CHZ&o6t>*pkE#lF=Z<pF~atj2B{UvZHG
z2Je-}k|jY7CZ|eH@+zezTG*)XSz8M)@9+p_H*UGXnKW&?{l2c}-j|l%{E)A(Xs0#}
z4kuaLc|Ize6CERmUSxd*2k%_JaifcI;K05Dn`>`60{?dEL+-+}JD=7Mb{hj=KkVzO
zOMgD#f<%K3iq@CL*F3IJXZm>8q!$a%`WqV=pS&^9X?mE-Zz1BssSa1PLiXjpIp*kE
z>-<1py?sE)lrQK7$<`$6%fo9RA5gvKvD|s%HEc;I-#<?~Exf(L?sE52UI~I*(>M|p
zx=t_{_Rhw^52vgPcH95?c(xam=7=^5MIuqWVH#<rzt_Y<W7}r|yrG9#<0Jpkw|hPi
zl3Kzi0e>ZhVJhDBFbzl|2N%9XsN+pEtlK`zi_`AaJU6_9Llz%9uf%A};Eq|@&j$_R
za*7|cFK7D`S6N2&P5w)0d42Sfe8`!{&B8D1@QNQCaE$5Lj(=ufJM6#Zt#ImQGC4T#
z;C!=FDP1ji(r$BJ_jxH^4!6@Dceh6{X#zzlfonE{2d{O0YX18r8b@~>8kbkw_ue&!
z-KX0OxQY}NGN}%Y1pK01yBVGxjiWbB@4YIK?$p7=Q>g!EJ&)da2V^GT0Oym8s_<A4
zKUsbApXZsW(qPll^s9q&r*AryFe5r=rhw&rmc;I(Z7@F3?vwke*?FC$Hbkf-u*|&V
zHzo{3y7}#!D3v5=fTIgpa5{AL=%C3uck$igj@6O@Le#>-i!c}aW>nXl83x{;wJ{-9
zQ^^)lzht4V?DBTQk^u=39TD);BIM-ePWxJ6{^tl-)6OjSnwiP;;JR(~+SWA6iu)4>
znL$kM!#+%a+5iYH{DFY2%1THjg<h<2-^N{NN_x|m2KlqbIyz28?+A}!&vXg#;GViX
zoa!G31I6^S9kXLGX)jr7hJRQt2kI-C_5(y0czP$STQmv5?Y6iN{B-Wj8OeXZj$*gt
zY4a_l4)CvXdvRJ@EOu*tLvL(Eubscwn7pNdKO=Xa_6Q6|yQi3LKzx4pTMV`adJhti
z6IR0(eyY4yyWILm)>IQaUPJFPf;E+<`dzd0;byuGD4rcV7wVKY)=wG#p!GqFzl@{u
zfopuR@g>P}m*Fukk1-ql;MJ%jci(<pQMByrqe-2e(nn5n)oA;BA?{QQ@2p5H-Yz7l
zh1P`j=Hx{Pl_|5$X?!eNuE=GeIeFVkzKBd<_iHOjD96jl`BUMa#+`C)C~upLzDGXO
zcBT&P@nnsNVZVHtN|f2OqtW+j*w=<<Ki(Yqen6`i05kHRh!h*GZ6R+sar;XK@MM6Q
zDx+WFXbo~If8@WfojJ3`_BzIp^yL2a=}3qFh^t=pat-P`q-urC?Cok2E+6^swuzmZ
zS<d|}z!9+={eY4$Sbzv0g;0S<DSEc>`{|Vw7?c*moXU$q)|3v0bi(yU3tAR<g~xw4
zYX7CFuCAMd+J+Y&+y&AS@(N}y=M!f(X+;{-#yCNRwu4k=VSI_qY^bR@fMhnMSIaCL
zp1zPV3wyC}`Fr*lOgD>g-ge{np%?y)r~7Q&Z1eVGX3Gg5ay%P=P^P|=Lyn@dlQ>H8
z^a$s~klM6ownW!=&fd|jKC-jR@G&)n385I2+kyuQ)rGvI=<(^1*Z<i(%qcxTza?DC
zz`=u0&bk<X`h%3~w{LfO?R7J8xU9NGZ|}NH=w9=@xP&twu3DoVx~A$;w)$5-P0j<8
zM6*xiM_D+I8{DRmK$6nYkUE(AVTpQ(LW0C4nncO>>95iykVRQ&9fa5cXv<*0+1v9p
za$-e`jSb|2A#0YaM3_Ozl$<+z=FCG~mFXt~A&dzY7&qo0be9B177|NJ2QRP_-O9%M
zXu{V@WXk!a+5p_33YVGuWdCl(I7oU-lb{fo;8h9M6tPN|F1HWrg6j#W0(eQHpAldk
znR(sEetG16+N;sXc3UtaiClgEsJ~eOd{39$i}H74W4m_}Efo|*?XX!BV9PK<lkuKE
z&0I-|NLgD7&6|6y82MhGIz)b%sj1tQEqBwa2)JA0VAAqVRo54Xr@!==8rqwm%Kfjk
zDl;t5;FQ8C-0zXx;5iT4gkN3%v`*$2S7H;GF{>%F=#=Da;L;v2XU?(1A9O|?(Gh%1
z=HAOt<TQx~&!2a$p}xL;)7(is1i}hMlT3`fAi+S#QxQ&zuY-vRWhIlVTOb+uwK!6A
z!u82e<<S-j<~aC3x-^1>3)F=L74l3>#X>N!IK?gFdwtMmLT4o{gY}X=eAUOsydCLP
zMOwfu0F6B>{vwmAn!=J$220ljUMQCXOH`99RLW(+IT2?OKCcbwmkRKIEGq@ni0X?N
zh+;OGV(8Etdn@xKX;x?}ab6A@lqLv2jEBgG7!1R1xx`@?T3TAFyp53DU=9UCkV=#q
z<H*z!Pz{}#Fjo?FviG{jzi2%qJ&8J4@J)szrsQ_-(WCfcWhs#M0Lo~YigfBSfaW^T
z8$QZ3`QFS3mP95B$rhMFvb;T^;AP~Vu#cj~K}@?UZ&nvw5gH?dd$k=YJlsYYpA<EN
zE}vcGe^t0J_Ql8N^c)TcEtxPN_!4Ife${;)R@~x<nY(Q`iIfWY3%{TsRq#5I49CRv
z;%I}5ee6^HYrDm)MznYy@(Fkc3_MFC$p$BiKe2?x!1|6!FJv>8?hz*zsblG2LS^XK
zjex3nE<Cbs`Bns1U-GRyWOZ}9*e`0<VPhAR&WnE~M6~MYl-!tw`D@wIrMfAq9L*X(
zc1eDWq&bLKEd1&T2yxUKxd#`a!AD4y_0BlvVo67)-Ae2f*`D=bOQ^Ism#E(`m<ij2
z!lvp=Qi_1ZHOD@3fOT)XMcp|GAA|l>i0?8I2^b){qv2R$!pji%NX1QFjh>|OJgu(!
z+SBvp`b!&|3g%0oOMR+(zUB2}gPGY>r%{Lszzvphkp#pu+!As>&v+T6OF1cuLf(PM
zUXl{x)C?51bgNjOIJ+Z*?sE?c0wpRfc<q)cUzaUk4tpnQUzCi<74g~vygk?M`}ww=
zn!5VQagJz0#el!Ts!EsN%6DZ+vH64?;@_<z3_?@<COoe>YdzZ_dXD^p;tQfTzPHJ~
zH=c}|3>EV}h!Cpup#BSo0&1Z#lK%$PW4!ylb}{!-Bnyf=Sm2|WL0PC1Tu2bUa_iP*
zzWh39yHh<YvpQwEBp**d-Fw%~w~|sz3X$bc!oqC(_YWwqh#q0e>yorZQb>rUDME*G
zhFa+aHw9a5+&cr8W#9|)BU7q>&6lxfze@K=smu6;q-3AEX8#sHbH~6g^}TW1mvgM9
z1P9vr%yp$&9DtN8gHKlu52(-0veK%paSIo=Vphtf(_zOL_knvxNrFR>U9=i?eQbn7
zU8ZxM3e%PV6d4L27>mG_d=n`bq=Eu1fG?0D2BwY#;=O2}<IaCnXM^@xSE2UQldmY#
zd8ZL;D8b8&c&|@e3{%C7O>K_-AXr!tIDNd{xtNJy1FWr;Wb}mG7;KK3rGYPs3z^B0
z`b??IVtZuYZ@Xo1Xr<_{<=8}5C@KoQzDu>xFN+~nWVa5H-5YXK5leEJQ%<rpBEJ?}
zON>d)<EBi+(QpDTMFgktyx5)6K#1d6Q^-SsP@hRQ9Xi<QBaIH_xYg7VynR_PG@k;K
zVoo8=zL5E{4-)6L#G6q@j-PCo#ZSUwLf<TT1#w*rXT^XdZCQ-MqDop0m55dK_u(Y1
zDl0LWkHLZ(q{gLBT_*81Qi};yf9sZJ%8z?-ct(^VCq+^&d8ouQp{J7lQA}!H9|~P9
zef=;gtym77=bb64t0eKYLZqVh+e@h`<2Qc({Hd5rx?x4<^IQ-;SA6L}6;3|CzNlDv
zWM>%6jO43qTguNmocCPQQgw*_g73Gz<K@q~J$<VBqMb~^!>=n#o`}utFT-734=$*d
zHOPUxf9S3LK209(Yfi&b(<LgNhPUclKD13(!ZindtLJV6iKlU>6V$j42m`C!Bd`Ab
zw6&d^Q@BQUazDyzBQjK3W`nC~Y9_64plHj$TaOrI<EBl1wDT^^`^ZuWq2iFK=}1hQ
z(CkX52K^~kDQMS*(92mf>V>qWZ3oX4caKmU>;=P30}C0~BD!Xd2-JK><895kqDA@J
zg)6goIb;2TQ5hk5l3fSEVaKs=GIYoE#C|rALhPI%k>Cn52rE-wPT|yy3u!6fFcZT!
zKxP$*2n9{f`-PkS0dW$Igh)Cev}yzvnT>}VwrP`j<|mSsz7=;z^O#Zr-XwXD<Gq>C
zpdc0Nuft-eQgd;nYU%0*6T4L_h3TxJV^3fZ6)2u>BZ%P?wv>}z^}TEh<vsj<21Ut?
z0XZ!SU%sqMA;<g`Nf+EW_<%ss2rA@SCxH&pUGYh3EPm(YLQb=?j~~+!%9Tr%LH75?
zfo+JD&63ncczzM2pejrgMK9VPQ6`cfQ}+3@f}QGn{jyB?2Fg+j{xE$E)tNXK<@%K{
zbt<9a9x2~LA8wCjVw0pwYU${#k^DH>lyZIvIZBq?x&{9nEXlrgYrxW_J!R=08=r9P
zZIyP`t&W89aW2Nc9n&DU`@@^7Yk*5i+BI9ID-s(w+V)>Wp5O7S*@zU1$8Dm+W2*A$
z*?oDJsi;a3N7d#C>o6g)0gA(f!!4RLvZVg7L;=ZYaq&}dHLu#X3kDSL0Hf^g%!z!l
zuopxA*7J^(w&hQMNv+hj?d3TaxUJY4fSRgGSn;A5Ci1ibENLPslad0!ysKG>!;Xwu
z%NlNZIq%);*TPt@9~<ZNwlK|K3PK_;#2Yr_`c@+gi@VRxZoN8PBI<}f)QoTlq*{;+
z`0>Z@_q@S@fM`=giCTe3wypj9Ob$`329lhLi#KC);l=UXhT`mDEgZQN-)75T5(LBe
zZMCFR^C(KMbL#u2j~_ia@wmpw>u-t5%Xh<u4G&MWqYRZK3d9-I$xJ%+`1pJfJzmR{
z%aO5pkG%c6hPI~KVw$}zqf0KC@h@%d@Xu-rKgb|ZN7?>^51f4Z%TwYhd-O<p`m6Qn
zBMFH<bz7f4y~yNdQh!=B62!m{8m_tKkQ=_RrjZMsn`mNwE;)Jq+<}&T`({77`zQm#
z89f6Dc`^cAO0(LD@k6N>8CEV(uXtQwT5Au>U$WB5X37-9mEPXUN?aF=EJLFE-S_|F
z>d)hHZoBSrJS0i;KoTXH%FwJbqzSnSxhf5sIW-SRMU&7(WmXxBD>M&Eh0+{FQi_r$
z6loGFslIEU>3)8{k3XK*b6;|HzK{2D>|^h>*IujE!_yOd?V(4TP_BS-A$-BaH}}ya
z`Xrhn2FYvJWU8O}{B^7<=mfyDSy)IEQdjqzrzui>f_6b=z;B_Wj${WT46tJY=4YQ&
zb{z>J2qVdBwo056M<RwMSlO2z(tZG;1oz(DjggNYtw&s4bw1_STxf<PQ2Ghv5K=WX
zG|S5G8#|FwaY3+bMt2D=?5-XvD07f~C=+WRj%*O77K9+kk){YF%%D9?3ow~pFM=`<
zxw`~>u`z&)<cSl<G&wtyi<499=+WP2hpMfnfXu|pSr=A2UD8odeq!Nz?r~lBPu+be
zM#8PLmo0%@7yNDlDh+*Y%HBPoxlPRFvao=94qM2DZodJh!3|*=3-~(t3$#pTp3<<E
z?b&?k4w53|fGh#6tyZ>8FOSF9emPv`YpsVon~Y1|Z`IHRa<bsT>wlAQ;9%N74!fus
z5XrhckFlc0!Y?4B2V83q7CnRbfp7}tO(7_zK)$xHuy961p)iUwpH1KP15bc1NUGi-
zP=Z<Hdx?pVZzBI##;2+|Qog0*UTZrm=g(1R(3U~9fKm+vRnLTObi|uNxjN%%{=A=5
zIuXYQEoB2KZRpuRf_&DstzI`AQA1i<y2`r5w{HIxDI6hk%hoDriVF~P3$mQ()xob)
z)6Ff-{3g%;A3D;V`Xb3q?A#KX(7_A~@?qRaosQ>1>`sD8bv5DYUUT!d_V(!@py0x|
zDt(^~9653rW?sHNK0a*MJ~THk1R#xHZ~|oDW5<paJkzZ$(Xotgi7eT%{P2^motrlY
zq=#jy;fKne*Mj;E4HO>bCuFt-M59bt4QeSsrg@Zi!ig;Xdpd;faBl^8n^_~QdwF|v
z4V(0IcEVP;Px=GfzicN8lX{YcRj<2ft?@EkAvnW=*Ge^?VJUU#`S1qsW5>)MNc1}9
z<S&U{bO;e1HWv#ObnJd_@H~82*m8+43I|SOkiEwFiUAe*{TV1H!Um3FKd2N<iua#S
z0vp&}_VtZb8FCjyol#yPOPt99n56MZQE*5I+udC&wEV?@PacQjB9wVyT4ASK_V%=W
zw-Ew`--WRFlMXsrrp|v3guku-{mY?HZP#uC2wa8NGL<%Bq%B-`?u>L7Ly}LaOIn)S
zQ*T{?%*l?bVwvQh)yoE)O>-UA!rX%hBq)LwAH4x)dx!A7;xF4Rq~k}um497b?aui#
zO5pRn?Axhri(VG#J2O1N9tH)O7=4i&e+mDEMHFO0wTk9uYDplNvQSIm^v#<s0X<)V
zyAHoQUnOXoCUt2$Sydvv!<1^K*(%ME9j_m1e0LbJ=xA@3v&$$|IC5;MpYD|ofE|?4
zl$W2r0he12Ttb?r4|m*^_v?Ic1!1`zgczEyaEftNFc()ezSL)YT65L^P~X*U<s)wn
zo(}}rzhlP_J;rxqge8$CCmjZ9_M!QN6Gw8qRM3JR(|FB5W?^NM!k3TOdQSh?n+hqP
z1p;rYeLKpb;#^#f9b57PP@2l-Cj3fxDc>B7{GJ$R{Sc*o9lxKwI@De_KK<9$Z-*Fj
z+7#t>KYMm=dslS6QIFxJ^>xZ88%}>oSKzJfe0O;K+~<2rnZ@=$?zA%VxeCVZFJJW~
zw#w=;I8G{MzC6MY@Ghhzb7^!m8fLK2Vqw9g^hpp;NuEk6br9rzBFd1y5~x_hi6aQ%
z;kE<jIIu{9XRNu4V2bS-Weg<i_?0VHUJP)~`fs<r=;<^y{5_*Xh@<0x#q8SjEl`5O
z_})w;G|{@te&6grisa)Ys_lT|<y!=!A|85YiL5eF5w$Hewf$|1(TjlH_aAsW?K&p2
za3W)}hk#>&y=104`un%)x2Qt~`x$O-{X&K^Jk-9SKRZr_&XZ`Gz1$fW9<wWR?p(4n
z1M}OV7z17e0y46sSv183u6@nyaL}o#b44X6rL!1-qj?%43LyZ2^-5CNsBn<CbLS+1
z1f;NT-3*rL)2GAIk9;-t-8mm(A`4drr?MV6og}l$La7Todrte$qu$<&5&v;aF_=dK
zFsQHiL_{10GbjTM2w>SJ$&XqXpNSgfSCd43d_3b-XBFQ`$<;DmKf92BxFb)nj(pb8
zFb@bBWoihXxJN+7s_T1uE0{-Y3o2d=&mdEk#Ym{_>)+eqNr<cjY3+1Wmu1Qb_M_L!
zf8?A7T1QJ&E-Uy;H1YLZ;H}GvYYVUUN99kas!Z#nai?YrOAI6#GBThI&Bup;)AYkv
z93zWxe>tutp^h)`!Ee+V=4ILjie}&#P?h8J{n>KjG=n}2;KwTI7YP!yx3{yfV8@f<
z30o4|-wCMIpPz_OUXGnrc4wz0x(lEtfq^%}E%%hGf9NW+r2xF(q{|Y3q#w1E&C@S)
zI5?<)Axt4jq#kesGG8D8(^;-<@)f~+q0tre$^mjY`D{vA1|Xo!2wI}aGmIkPM=6NW
zgC+b8z}<PFNXJ4LDDsu$9gq+kz|>hgO)<ekGZe}NW*a{QHA<|Lm96pmMhP7D%ok77
zOWRL@nTQ&8v(zP<?;oFm50LGB{a5R~=T#7xC-kZq7iZ6h@u;bmi>;{%Nvxd^Xnbh0
zK>osPJ!&IWR##WY3w+0l(n&%Gf@N<?nHlhO*w{3=#?4^?oumJ1%O5tfKtodzTGsvO
zrI2HoqTf8YM?J;O3SRasw8LnA2?axD-eYV%VsoLNgiN1W%MK0+!(3w1XbuUT_M~wO
z6%dn&|8A5BC#?Xqh#qAn6JG>*CO1-1ko8IBrR<KqB|7I`7%U=x6&nn5p%}aHE+-TW
z@3MV^C}C()Kk-sHR0K*<$+-`-Yd>;2u@TBC@cZLsp~)ngvraZL)Ahtp(ubnZ{1Z}w
zI%^fCXjxc5fF$QIQ$0U8-!8zNxC8(Vd9(oVw+%E|c42;`WvozIazP}jW7CfG3jiWe
zh>_O!*9VrqKZS6aZAB9nA4Z%6Zb$j<Gq!(sDV8<dUZcIh^{NT;_AgUHR(Lgk4$c=T
zP~#xb?zFaEi~~48@C06YAub+SjIJPNK$4a?AM|X{B*5Wfg)SSpTYaxs!W2yJ?sm-n
zgLI#&N#@WYa(t@LXxbOtC?t;*Jn$cBOaU&s>!F!JR6D@4A_5!soB)7M93%`!VlJVK
z6vxpKrx_R+Uyy=q8Z;ePXnMmP1B;E$&8S6ba@D=PWHv5P(j!v`g^l7b4Q^qJqkV{c
zYb{YG{@`LqnNQ6-6eV29p*u3rn<wdg_*=13`6M<myp`Y$QuG382iuD*IhlqB(2;}X
zZU9;)jm~D@h{B2dm~ts#_aBaSuSVHHY-#q{vk4x}hx#YMMQA^71;Gpts`PE%%d(OZ
z5_EzUf;Q-#^qoLmZpX>cM8Zrr2_e$t4oP(k%TpW|S@<+`$aeSiI47m_VgP_DQdDVZ
zpiDzqgHSh?J##KNtsMu{M(0zhizrNmGs(vhDGZiekY>iuO(O#epp%}2vlJK4fc%j(
z6fWeQMi0nDBvgs+n5tuJg$2+n%dBS@r8W$D(Y4&+lw8HS#$U`5WjKN)(SvDz4g3&9
zqhjcn_kX#2Ge*x<M)-`m1h509j?Kg7BUj(ZPLSI$9af}wOfob9#E>1L+YA{K<Pa%?
zL}aI;ElPF<e84c=_VSfykqk945&8z8iqZ-#>@sTbqEbNDE$;U1Vqt&KQ~&B~JWBBg
zNJ3M|1S~+3fx5KR!Nh%uyA70L;Blx#5&fwcB})Vli{;HC@C^tMqOlx8p>SQK`;8qX
zlK%-U30c4LAA^I%@ZAa~t}{A0XmgfzzOO+ZOd8k(jM8u#X5=vW7d|mGF@E!%QVHHL
zIQ2wRMvaGP8|<+_l7BX}52@=z^AR8b4Ir4(w9>lM|Ae&toglMvTFl9UQUo|Wrf=2Z
zj74xFk(^m*Ac`h|LF*7!h+ql;PD<ILzUwDOCmyHl^g{HC5WT|ZEg=Mm29er&dNwlO
zuZa91W!b52a+KF%k(kNGm<lz5P!Xd9Jcp_Di_8BD2}wDUZN%Wfx&h!|;(u^|AMUyx
znTGZ^g1fO~03gJ(@TS2mY%8E}t;2_Z1kA33z!pVIg8Jdna00WvRAQGOy14y6A_wEA
z-Jn$jGoiMG(M@D%SIo7p6(bNuq<Di~O&od-!#U_VQ|*o3fG8d|?bsjGPqb430h0Mz
zJ4xmM|9e$aV|{Il)FlZpu<3fHJ_Wj1CgDq9$Utk1;xiI;=Ub=scw2Pe(*z8`NaHwM
zMJQ85!poA(lW*_Gh!hR7lA;j4L_1DgSvhb*6lLU5LXHGPTt|-s#E4+u1^9nRKnIbv
zHTmI(BE<Ka`g(@XUKGp-2w3v9v?rl>aJsV#-Qz6)mnk+Nq-;)nqaktbDQFf+&yS=j
z!w24ypgSCRCDJg#Pef%s`T6}jx$4kZjV(C;+>UFd-O8Tph8{oB#gsEpIfaQ%65zoA
z0TQRD$QNh~4~mH`HL-l=Z)iV|5epzfay$V#n+5}Hnj9gLGg(}14c=z^iU<p6#4IS+
z9-N|>yrg5CXgS6KO5|Hyh__a+Oy)}{$Js$utB)}ql)ivNP(hRY96IHUj?qwT?^rms
zjQ{aQEgy>(FY<2UQz9XR-65M(w5#bt<yU!JY%>n4UQb_IU~lGUc6o#jG}*V{s+7fu
zRg(E5YQmwrL6ST`A5liZ-;Xpk;FjT6J|>Vb{Z!O%i+<b6Y?LivzsG{~fIAo$6;)k8
zEZ~pL?ddtBesi9+%DlA}TKEuLUv0g;Gy@F(y!kihq`Pz7>C5K%;qz9*>&601(*hF2
zBcUg1ixdh$`#2d4H(C8QJ-1ErL`0Ka_aH)Z5E|hCzSQx0(M<)B7ja9}%@U({I0?dQ
z`RBlh>-C^S2~f*juxm$>fjGv>#s)e+_wnP$XSfWy^@uql^q@e8aBUc1DB*JLg)gqH
z-AJ-!S_(nY=*;xG>xNuAWiJ%(#u;to$giS-vrjTcsBqBv*&3{=rLc@EORo0f*#~G;
zMZxA57m?M#@Gdk6(4Di@D6ilDe4m`Y;e>|JRbiVe_IXYK*YIZX=+Vt;yo*CU|8r;W
z9{Tlbi;Yd3_Ik7Q>hQE{?j2<&lHs~nth>k48EGKjws}V&^M#`7yRmq-Mm94G=@Y9H
z`wpJ^IDsbhszm~sH;!jl)+-~WZ|&=gMDcgET{vaTHrznUBk!wjXQK%23Q#_Dho_Z(
zoO8}ODLne!kP_t^8v)@S8QUZy+OqoEr_r$64~s3jJ%1c|Tm0rR*B<_+H-4*{uI}>w
zx@w+sliI1Bv@TlQ@*;i)eCG5a&zKqacU13}rIaP&+F-f?SQ`z2g$u*X|0=Ro%$A*|
zDS5+wfBctp`B%#SY8AggA%J&`A(sv3c4&PR|CIvzm&i(u{qb9F&reU0$8AU<HZCrX
zx*?DJg7=f&1ek<6L|Ah_X{$Y)-aMpqZV4I|g}5%!R5_UYzNQU+JUS$ZTk{<Mkp+C!
zEl$^IIF95)zzKQpH`>EQ|InIT8XI<IPo%)Rc;s4utFm@qu#^1n?p+M2G}l<;bL@eH
z;b&{^Ng6PI{rVMQydVAc$!`ILX@%PDx}jP&Np^Ex(C4gq`w3yMi}xC}%W2vLfB{}T
z<Q%pJYkjN(aHPUuc1Zcmo1+32^H(3P(sg`De5!8;F)m`7{JEcNPX+wH@G2s>PuNu<
zahbt_yHr(~f7EK~(TyELpIum^b)vh8VWjLbDO-q$hu^&A@Ou@BIO$K57(3LiY&vIB
z;~F&)y5lvy1-GYoWxIjMg+ewQd*N#&x)wNylmde(6=7?KhXwdC%}Q%+3DxIl`Ej8%
zL|l2a=IT^YP_$i6qUmRH$v3n6G$dpT?n|(iL?Jq05R0k?C=<?6bgW^U5hDm>m8Oi5
zm&}v&QBwE98Y0asH?Z0M-)`HhCbdoY*9u2+p<Kj*CLjXULI@|HQ;xWYWasZYQjx{v
zriduB`qlS8`86IB^?#3?WEggn{M6Ef%8Y32dzA0Ie^M%$mAM7brRHKm{(7xl;=o^I
z<D+C0#_L3Cvom~M2XiF0s$eLWe5b%nL(qNwRtmC0TP}ZzA~U48nki^sK}xy!%@wUB
z>ZuJ2nx1o9iU!9k;upn9^t{R`13@TY#sS9cOVJeiy1H6Dog_qrPR?mVjv9Xbz!v!_
zFmGGS$2mK*dAW8d?>M=S(K5LTv)|~k5bTY|jnh~6>T_^%+Bdb;>>kkj&Z+*vJgj;e
z`<Jm?Q}qbL`@rYR%3og^{Y>_6dy7@Ng2;MH<NIxIGEszmrqM*p44(^TSK0Hbb6?*Y
zKwOOJ0u%7<o71E5%YS!tMQBuZg(MEvbKSP`>@UvC<e%vy_a4B+s^;C=`Q;lBI?gRO
zVC3cGP!}Jm7Iy@8uN&SLkxQU)#R}NmHGiZmDPwH=R4#Mp<VnsNW3)2@F|UkRz64_e
z$oWsatbNE|`{Nf)&)}wXb8p+}n$e65Zt@0H*HZY51;HhT<Zbh@W9tFZcoz!b{?wJ0
z-zpVNBbdO>AfVdhdtH>}B*g_VK)0YDW^t@XktCo%8WU8E#Cz26$lNF;Uw|8&{xc1{
zB`V{Gx2%lKSOHiJlmVcMwuvSZd0&ClYf7902<u&PJ{rhJakkUcJ7YDo;Wtmo*nl1q
z&Hv)o(N}vv3NF<p?y=>WaFQb#f7s^T&dG@~e2sm5`=Shk0Sp1~FwHiCQv_TZv<*JE
z)FAagF{zb+^coRr_}xw|?xlzCeL+7A$Z1HD{IX??w=Z?l8n_Uh@kb>b{4XN8kjYDg
zm|6s;<q8`bJfC{AImoZC6YIeE9tKNLIOF<pQp9;v*p7XuAvgF}+7ICey)NrNmas>O
z-4!*q`bs6#4$e%VA102Tb+dTVm~*9b+oP~Ao|liioQ{qNxpw02-Ma%(E>PPfsKq>K
zWf_J{;OoJKNIB#&&;YvV<X1z4AvIZ=ZYMu8H2Pd#?fbm>QncpA`<`I^<GB%0eT|z)
zX|AT9UrHWQ^7yTkVu#*nK#?Nx4sH@{*dV{EfrtR<FDu6vs~VrXNML-fOXWT}+rZi%
zedE0XE<>LF{?A_L5B>Dj{q|sN>CVw{=c6~H?0=7MJpcQ&W6q-6ZA<>Jb@kyteak%~
z9ezJ0<)zm-2I|dtDD!=LRs{(eSojfp7m^w&mBfX`p9-BHW7AxJ&j_|#6%FqvKxNc}
z0|r2u2l@zOJJyOWFOscWm#ZR(xle4Zk~r<tsP|<)enQrF&K4o82-g{sIwk*gRGx$6
zhtVn-?F0Vkvd*12dB>q;bf26_L8OV<%gt}%>&JB@`4Ln*9>n_(t$VRe&O9}1d((LD
zg6BR<a-D7XU(Q&1z4WEy&OI^v6qxi%Ds22`H23zorVoZjxBQqm9zWkM8Xz=FKv`KC
z-~4rC%7PYkamLh+4XL<KgNbN7(bOtUY`#!z5Qk@S%a_rJbMMyW+9x8rLRQ#@j2ij>
z!Uu8}u@os5z&&#?LC-gLF3qr@)E5z8*O-2%uP1-#^!5727Mr`7^T0|=HgbIUsisTb
zIPRV4!FQ%{-nT}2qF>?LYHpYN^`xZ+CzcN^#-yU&t9MwdE|k|x#f^@0u^{I{5-D<y
zm}`F*KQs;6AC#~l91-aOq7p@Dn{Tj^`&XH7#h<QuE<9OG0aLP+d-`-U5N#0ty0<F<
z-VYJALnp*I-$w;09LQj-fRqr4CrLSp6DP}26s%q(&?^uxYaWq26g}Q-*`#yqL-tQs
z+1hhsob`D}(k4<5e~=zIu=YfjWzVsKs4pcJQ8I58>ZR268ZIEi#jREZ<|!b^C(t@T
zKL>piXs-V_`o#U4d2NDSTYkPjD7RKnXaW~`4Q$!Bj^$3D3L7PM6Yd)Oc9N(u*cqt)
z-4Wu^N&_Xu_<A<Nw406fy>bII!Ep<dohVvUIXV4z*Wsktflbne>wMk1bzVo0s;Y|4
zi4PUHzT-f>Qf8m6ufW>u4V&uxHGW>R->b`L+}ScWtA1R}31wq5cQ$9VF5~CTUFj-s
zZGSn%HVtKMc-!4#T3MezG~{Wm;Xh9*>Vwjg{->sfb}bw=;L`#U{xzBtW!w1U2#BQU
zV^PWlI^#QMwe{2e^(PPdO5C8f?<feMFOA7_5<S7tqOh9Z<2UxnAbfp|0%Ly~2!TW;
z!LH&H74^|>QnzhDw}Fa&0A9@OD@=f|CSc3az1ne!=j6p^p$CVh#YGS|F>f60=LqhJ
zuz{;G^h?hPSbHq1*^dhYfMALL#Hb~DPq#BN__5WE{41^+*?=mVxi3Dsybe7GH)xw7
z;U-}cY_lWp7Kt1qoQ=pO$TD*<jWtP57b!+iM)}i<kl3|Lv`FROIs|MaGQ0xOTAzu;
zfl`_3B~V?_3ix{1S6X&-**a%JZcras0XYSIZg)Ylb12(w4B~@dfHwYo%L|jA!Z*cm
zJvm!1ZUWtTQuJ(vAu&{JZ{_EInEbr#>sf>_(V%lo%yV>nn17Bu#a+#Oa8RKY>dg<m
zP(+V!E6_e%B3x)w$%eaK!!vG^JimAkBjJq$dKj%(YT^X~$oZDZH~BexRDku;ol{r$
zc5E-}ti52!(`1D&Qm}pe)4)EYVJ5V1NN&(1lfmQ|#N<0Of>jv|aMDQm6y*n53IXvy
z(mYqh4C(`9_==+iDj#+uePG-)V9h)PXaczzhpbzpN0sN)-e*VB77e_+=$gu*vj9MW
zfiuww`B5XA7$b*9veyG)7yOa~r%m!~0O4$bhb{}qlIxWxE0yCy^X3t9g6K!~V^gGc
z34TX7xPtRgPLu?wOaTf;p54(AZ@VuhBHI2RDk0GGp0;LhP5=sUxoDj|pmFd_X4cGr
z-=;j7jGq&ro1e&KQJZ{K5kt5|l><QK%`BhgATMo%7By~LZosF&6P`9SG^8|bW-hRT
z(NuyX>9nvV(k=<N1U!Twl+B9KK8R*GI<1~#oE_bB%<K%DqT8uk2hpIzhaZ@mNzn^N
z(Y1dn#a#*6z_Fb`%b=q}x;en;_vRgbHf-#VE4=9){0l^&s1S$VFsOUl+qPAtInQE#
zM9mwdE_13k407J^{be$lq+bamZa1cPmMMg;nS-`BX88b`le;Ow$GD<c*6d-#M4YpC
zy!s9=5=6yfWXaeOpa_V))LTsN6}b>(;dtObz2NeGh)dF%q2f6SU6Q$=*${!3ruyDE
zO#8F;%i2p8<DpLYG}ukt%F7c0%#L>xjA~uRdD`Und|Ui?TVH4e2Ytp>;<fI$9)^Y$
zk<ZXu4)8`f@LX5cnQ;t&Ba#N}6k_0_GruH#<>a2WW&3-uP;BQsdbDKu@>$5ylzoO}
zz%OC>lPXuM5WVvKB3)n$;DNQaw+EsEW@TmNY+++UGkoFdQ!CE5c%&#@E;Q*XD8$Ql
zTnhmfMl_O#4;zyB1p@SJ8FR7TK+!QoL84_>uQs{4P7miJ@&kGhljTBZ7o<q>M>J@v
zE~;5~Zl&bDkS>2fu!Lw45CVB2WSDbp_kI0&G&4JM3@Kg-Sjup|s+gO(&l9bWVsKs4
zU7A%<>OJPy*geGs_<O=6g5BK@vrI04))KP8WuG&rn6yRSzO~a2efO}($se^7I!1{3
z)bB+Z=c3hqH=P`{z-?29O!6f4%}&>LilWO`JNJTJ(m((FQ;cY?j{}7J>DJx5<w^XR
zu1)X8X|o|w#d{={FjB{@c5|4I8dAp19InFlnq8~4tf`X!=UPcgR$^=yo|r{K8DNmG
zuwV=vW~?%9`{R-1LY@_5j<bMcffezczParc2pDKhW{!f80P5TNuF}<D`GW4pxH%6s
z3OHVKJ*;(|os&xsfZ%cQo;MA_fTSMt;^wt$oN8(X>YaP;8vvkJTy&R(xO4D1R9_3y
z7bN9P4e}WK_lL$jge@^dJQ&?K*10YMJA*B}sqrx)f^nMFw0&~#Zgr1?G75m=Meiew
zR=2gw4c~u)dSXH1t5gLS)J9@rVi?8PTbBpWc|VR{<|sUG(2i(O?}QG_PEdsQXIH~&
zIU*+~N4;Z{SO2Y2uS|d$a8x)|TV@V04w{3;k9U-mIXho1w|S3i60qz19~)yau6@(z
zd+58zAf?bx-?Kdm<-)m5j}clyKI9g8D03s@i(}Qh4yxCn-QcR^ig6%R^5S2|ni@x`
z?~k^t)K}kr@U<C(jTya=f2eCebO3o!+%HdDDZq0}JO3*vn7B1aTG<G(j6n=gqw+es
z?PgZMaP>cqEjs~sX`2t<-F+WwQdPZ!=yii<aBGmG#rfmG+^#-(g`-pBjanxY<RnDi
zVqiTynE|0Iry7P4gfsgjL)-!_*Ia=>+KJVSXT88_Aa4RIlBT}d)p&YREMWAF(sTh0
zPuWc;Opts_Uz}$ls)TLiMm9JY+4*3<!iihyQ6I!P=%n2m^!rh>6(J_%oz82tLxv!;
zBK)Jo7A<FsqhFS0x@%p)O@t1IzPsz&w8D1QZk$K$>sDigfO~WsWnOp<xF_Y_g*NJe
zA!0Euvr=$JD`m;olfOv*qq*7nNDF3!B&B2@dJi(7Q25lzpGVPUaxj)eOycUF;7B6*
zjcFdj=Z0<H{bXW%6t?SE-i_bDU5gZnt@*%T?r4=o3uJ$2PFJh+3BX>Q`{?GOx3-a?
z5u=a#^5vUaK=6t?zi|)0Cjc#C5}xkZ?exAr+j~{n7&c<q_P6Av1jJS5^m;@!!0CzD
z7wE9fEkBHFfqq%|D11@LXzj&^oit&3H{wx3!t?fA{tud^kB9ux20&yA(6+&O-1jQS
zpVA&+{fs{C?4(ifDIa+Gw$Nk!8X|jD+(<E{WcjFG*ZB$5)03Fk${dPY-<vao+^KJM
zQwfHJTFu{p?kjKB3sP4_Q@5&}Ud_3vC}q3%m}x6Vt84$<A(9n^NY~+44LZmiEPQ_&
z;#8p0;Cy;%dH2nuUsvkiTyWFx_e$ItFS5PSjs<UJ!Qrc^3MK^*O*-v;pekyy)U>@!
zR6_C29ZygX5Yb+&VI*R725%jy1#}P=OZlb$55dyV2ntz+18$8}3`M;dXpZoAbgW@y
z7l`#E2c;4Mi~}MdqHu;sJ>Q4v2an(-frNL`BDHU*qz#RM0ZghyU$<Jg&P5brBeTr?
zK#nEeJ9ZG@Zc^;iE*&D-7$jJXp(kLP$DNO+DNLv=nY>3^K5kPcpl|@ewwKQuB)*yZ
zX35wHzRjxhjyC0ztI?hZ->)jZueW!u&_j#VU~q5N>o`;z-}CL9S>kC8JRKA+#%RT5
zZ@q?|PoMVxn+2u*i`sEI-yS3OqKuS1-PF=zS@#E#1;U9+O$Trv#Wh@yj<!GN!+S_V
zH0RIA>miTD@nIapE!v#%sa)yqqoD9fOIOBi*Z;f0Rxg{)u{VXiZ-FQJ>0JUt2t|7Y
zJ&zp8HhFJzJ~Set3B?U!5u^OS)YQ~_-qR|F)YJP$nS%gp`c7o0Jj%#;-7X=L-o0jW
z@bN``Lw&tc>yUxEg~bi+Kqq8kO1||m7Rsz4Exm;P*ffJWJBR9RyZ5LCn7#LLZZLPT
zinDyLD=8_FDv{!8vz#=Is$<WgUTHP3cjNeQ2NstOZmCk`r-MkhIP+W_pZ%UM&#;1R
zQe4CfSZ&ofUDedo9DocL>A9ci?W5^U{0>IyY3AN-@qKu2&JFlgEX*u90Hsse`^O+t
z6t>;(^wi+*1uG0C51>dR?@Ew@XgU)BM47E!no=g%_LyMU3fUX7c*blmU-<R1AT%f_
zFe~f9w-G{LFLqojEh`IC9S{-~J@cvZ%GK+CVifGpcd-_M5a8lov{P3|$St?j9z<2F
zu<4EIS9Z3mfmO{|t^(mit?PEIYaXV2J9f~>={?4<$`MvyY_jd>=+N%zL;Z_1Hq)7#
zi%V$X5L%Yb=c)vc1-e0=5)(oZChrk9d9w-@&L2V(J#^_mmpNDVl6Z`i)bmv?YHDgV
z?rM--p8JfEo_YZ`ql?$AYgpBb9_Q(Z^+X}INOvzs?-PaU+-Gd*?q*G&(}f?t9opsS
zJ~}cI>c_M2w0q~rj|2~uUcYf;jZ`o3!9gIJ+ox^QG55nPz0&e>9u*g}`sN||IRRZm
zacD1EA*#;3x!$HdTd5CezE)o!hMOAWUhn#*1n+b)<Y$b*u(4}9F)#+9r56?lkY~oV
z&prRsckRCPr6w&y>jU3<W2h7|Vb1@?nZd3axAN*WUi_n1^Re;1Nt{4s6`Ps)ufgjk
z)1p7nP7=!M1vAwLQ;Ur1OZM9LpEUj1&bgsdol>Naf*2Q|=^H2>508%CM)F1SaA=%;
z?s^rHvK?E+3mV04n1ey<(-s0n)Pp76><uFyFVH+0u6gHkyBaY!!88B|Pm64z;jHb{
zkQfnKBDH3Xl93%d8d}=dyS{z-k}j-P)!b}u(5>J4{lJcvf4AF6$A~Ogu)0wY&^NL`
z+gIL%@bCLEmLwKh?T!0|5}D@C&d%it(~Y6>aPMA)gvr`?4ZfWBNAq0eb~|}|B)ut+
zvoc4)gVN4obphA}Zo;bbKOwuum(@GZ!mCiidK`+TDCuloc@*FY0Fufay8(KVbV6wl
z;7R5+n1k&`GTInG3iW#uQaVM8D()5vEs0ikmq0=1{<Ur;1apP`)8>?)&3ax1z9-Q*
zAazW-dB}__w3eyp62-R@#AdMv-w%oJk#n`P>pI&>YA5Gp>#BcCk6arX928KjxAh-<
ziAkf1?)>xK=<#j%=uL+e#2=rE!BhyURtzGzKGD1Zb@5zW0rHQkFVE?4`rpZ<>C_p6
zpoO8mnm;jq4E<K)Gtu_R8((9O>l+)h1KI)OJ3ESL5ELidxt+gSb8$)=Q~rR@{HJbW
zMO29-{{bouBO*Y%Ra~@AL$0<_sFYMn(dH%I13b}&fB%j+T=BQ0;Fzk_gF<yB2B;^E
zO^>$C-W<1SL^*S}{GA<*v0hD3(~!oJJqn67W>FK)GF9mE3dCr&&tzl63n%&#CU#^U
z<sB1H-eRC2SR<CHtS&10?N$hrvWP7Df4zv`M(3vKI`D@=dV&Xi7F6><2o_-o>n)@H
zfyW?b1u#JyfFLN2wmD!B?ucv5OuT}r27&*#LPz$Tj${OQ6hKVLp$f$#c0Y^EBV{5J
zlh`nZHc5C6GtFC1F*`9@3i1C-?!bJw5T&KWUsmxA6GTYXQA@3ol8=N`96BvnC2|)*
zW$3wYYRRUmupLDY!p?vYHgHl_4#No;c7e?i7w{=vQrIT0ENTdp1{!<!iee7=b}<Pr
zf~z7hb&K2J!MS#9V{F;71yb*r{1Rf)rH_s*KLo%QB^;WFXQAc}e0kPZri?h6MgRNa
zx{30k+eBkud_x2U&~yffJe<yK_kTuhv<t0c41*C6gbp5eL=<z)?7mI)ik`6>GLoK4
z@^4kP0EGfW!l4+6q5){kr$<#eftW*CwtzrKm)qYrCOGx3Pi7_PqOxXNE-7DJR3u~n
zS`w%h*iT?*5x$Rijxqz6^%g6uTepjuTUA2j->FBIK-*~F`q&pqARnSGN6vg^sIM_=
z&fT3i%aBH)LTas$`THv+=WYG#j{B=T44V^u8IasC>q45DsM~0;80!Q^J!0?)EYjBC
z?aW=R@Of&HB*)c$^=^24U(Qu**f(3UIrdb>Y=ddDcz3b!8MtkJ>dSkUudvXgb)i1r
z*o#}rQbhr+U5Op*!`h#B$s`3<h_eTV1ZGr7O?MG|emQ${lY-8LYoCU;wjBEKK(8_W
zj-E(wwZ|uiqX#c{UfZB2?wMa*oS++*CtK{VFIbbY|DOjAZ#dxDgM(iRmLieCOy9&z
z9=p1<zY;`CdDVH1FX;%zwm#QY=XGm4H7^cyC>S4|NJtP0x-q$YaY3ajlie-;i}PRc
z>i-PvB%jrJd)=m?uhcp;3gyw0z_CEqSY4LPln{|nRr@R1zhW9}VG}mW56>(8-Z%QI
zt7Zn=K1!(qPcW1|3=UV>>c9WO|F4YBc3y_zd_L{??=qQBj{ek0wRl3!01PJqPA|_3
zoqU4M)qmCOW(OHYYn`)yeP=d?$bw*E&rz9~nQ2?c-0Kn&a#J5FUU(@Q7N+-({2uCy
zglJhThDeiKA|%Wkk{e=M&F+25GB;}e-q>6I=lV-%(Jgj`W8`_sZoizL2~8g!Gh6Oc
z*6r#B+;aEb(b_DfaXpef+a*|<_s&TP9)D>C4IT@%^~EdYBy}=e3Jb0dNw1Zk5mZ(r
z!)X`8rdNBb`{=2gm#!$feh`HcmD{6c%AOON<KsWYW+#6+U_RuYQfQ@D`BL$)f@N@^
zWu!)PYEQ-?-wp4T3_L$OYV^x=wq1Vn*)L~5*HYze$)ZI;ri;F)#`cznd0&5#Z7{y^
z_05_O+3~(xGW49YE5YFezG3^z2k?vj&#6}~?6<8YC#Ts3eAww|ro2+%N=PtWH|RNd
zS0&Xpha)~_lNHZUp{vKkR*%Fv8;VcGF+7Z%qY90kPvkeA^!@Bte9B4jlv~!jW#>QD
zGzPB-JG(!(=mlqj^n?ZMhstqj);#t;)NOjzI&I|Gopt%#`VT_l?4CpzjL&z77H6;T
z*8FsJ<Y<Du$n@hZ8y4{pArHRQk>$2On_`OU-PD$t%wpMRvIUGlY%V~v99Lf$eex)X
z$Ev+~uwnRsb9G|g&Gg2t3JqUg#m->kS;Tp>IvDs7r>HV7Y1$#wtnm9Qa<*OkbMPTf
z{&q7=TB5?3#zKR*xXj(qiXn~ZIrBMAafEOd2Vlt2YeWChC-}cKr3R@Om!9>F33F5(
z^WGTUDYW%yl9W>Cy|sb^cXskFc^E~ZK)@7!P;uKm-Y$cbI0el!t-DZm3HuckZ13Xl
zT~>K8yxMbv!A<=?_uq>ZbIsUme#zCXh1<d<Wr0+VJSQ}=xCq?Zxih%9jU|Ai&{DJD
zNR<CSpY;q42@&i$avKl|9DaP7vzmHjIA<K+{4_7R)AdI1{I@}mqAkxZZiS$5875@u
z{9$U`;>pp{JYoYF%>FAI#mc8+rIl`oQFhXP2greLYap}*5e#U;PwRo0JUcu06-dab
zdVFPth*xm2zIBL6CV%Sp)j4@`A`hPnU2St=RO}CO=rrXL_DS4ZI6omFGw*6bLVaH0
zCB0aeH$C!oX~}JVTQU~y8ei8pG|I=$+IlcoignTFEQKbkl$NtX25pyXMx<XaKJGYZ
zoa3S4o2!0a(s8(_wJp;zVSD7Az$Z%G)7f*)`&*Jn{yXtC;oFO~N-M(nf2r>A?CEZN
zxve@qWi+&2_C>dG^RGpbv*`7KZl4AUB7p>kDT|Q_lyXl!Fu$nqH69^{@JlUy!K*5}
z-sj3gado=BpzPtUUC;q5<5<jeBtCYD_ciD06Ek@p3Ofjw#e6<=`V-f^w|j~W&7t;o
z=*ji&d|ky;vgd*n&n20e-!pCgZkKAymUvP5<<gIdspW}HYjT?3G)L}hw)?h3q&)FO
zzHRg6Ln?m04;ucA(K`WV#FN@^8Vwpcc0N<KwslX}Jnq+^!E>4lCRG9CLsqbU>uM?R
zuaL<&$R2Gc70fB<+qt`K?ibap<{SK5oi+cn3p&=xAHruCzk&m&WuHk<vrDAMA#Cz{
z{8JH7!r;<^(z~d$0mQXpyn=+>G5<_C2jZdex^RBM6APhri39i8Vl6FJobn|v%P-96
zt{Q2`y;9hrY@J>$b`t!?aO6a~KjW)kzcT%JfMN6(H)We<YBnTH`3UNtxtG9E(88mw
zKiswwk9dYPn8+Fu5@Bk8Rpt0OgQJ}{R6W<#IS=JD=0A@c<PRom$uQTcpJ5G-a`wT`
zFdrk`HYnOh4H-(<&Z)oO_G$BlcId3&SndK1X@!RVMdNk2o4X!=92!lwW-v8qOU0&E
zp(5$|J`ctpxx~#ZY5X80tO%-7kst9x-XeJOZ|LJI4T!hzAMs&f{_oP={W6)NE05zx
z$EV^2FQPs#tI>WX{+Rf}zM&aRn?^zDyp-a;Ax%TWd@&G>N>@(($}`6qe_kta9pB8t
zyeuj*M7ybZbltOR$hKH!@|XPYheH-K{>V$i!8;&s1uyI3dNBDVRQn7Ttvp^#Y9ask
zAVs2#%ObOIFD>rD)r_ydfVm>=VpH3^IH9VnP=>QBk6T}Gq>%XrqKd1gw;q$j%eeN}
z%<bZ4EG=8iz@+9{I&<_tZg|QLPu*aVO}z|fP>g#OT})+Cyx0`N9Nh9FH~xPYjM4*P
zD<%HC6nW<RO>ebkm=H<!e@nVloPNjT6Igt*W=#Hj|Eaqs<au$`ZXblZycA{^V?~g1
z6(&LS|JY7i&RyIbI79Dc^!yflSwA<VPlD?c5`b=gWIoHEsk0+EBgoqL5aZys_4t}^
zUwJgc6O@-VL_%#1?WlD=p3Kyi5{G5datp9_lY7J14Pu_wqu4X;e+^;dSc`aCMR+E+
z+`J73=yjDY|KIx*iWxZl_gSzv5KRB?IfV~JnXp9<gflrk=?9oQLI2OxVvDn?x5{LO
zhop4xGQ4DsPu1s~A2iLp!M2%`PR?J$2`n*w3LpFbzK-Kjm!d!OW{Nr}`MF0ymOAb&
zvsLha%OtK`;?!Yd#MpkGnS`v^!jy*=oS0f?VIx~5ZpKn}W~afX{LewEkV@B1J(y&U
z>V;T_kG!-xt9rnS?*$C{4o|H~7qDGyqMVuAUliAv8O^ECvBzyGdoVklM6RiIREgM+
ze}F%NGp^k}?NBMhN#pC};{iK0_{hm(w!nB@V$I;_7<p&-Y!btzLNF`$h_3!^dl4nq
z4`)bA6hkn_i`zDX5-#iXHqCh5YRO`mepu_aVt5Abhh25|1t8eacJG^+p`j0e&bK-4
zOJHlTC2~<XMtnOUmT@9(g|Iw>^XO3T!b`SqW==*F|37<}5<L3T`B<j~S*J;4^4W2U
z-4F>n4k|q~XY$HCwDq-s9$`}ERqmk!5A15<J~Z9!R;4QU+xy(XYrmH0@TcT{7J1;@
zq_lnY={3bNt%p?yfIMmnYVY-UWB79}1D}TeDJbTj@zY@vd56CQ;>y*&zrSW4J1hAh
zcx_ljoafxm8;QJ(goH4_L*l&bb@mg1;<w<+O6E0jZ$1v5Pw9ROF*+y+3mrkCcoqmL
z`GmU_&a{Jnp|13YO3dyMuNp%TNDq;nWvsHrE#bb9aYNPP`O)iNa9tP_>FU+?liCB6
zB~CT<lp*TZWI67E|C%kzZ(;(`nq#I2O}ftaJMVD+KgaHtuwB{1vR%>kMywb}&Nb>j
z9mj5RneupK6gC(G65izF_2d*sUWzTOt{_EUuhLWQ@(?}X4<Su50xNjxA-4q`&IZ7H
z9a_Jy1uv<T1M7P{hg*KyEhH{357&RZviXmn8i_+Z?HH)DZ#gg7cG=9uI?6jjL62H)
z7~A-k*3BNsqH`X$W!NyL5A2SE@Z15m{#Go-0)@8<O`||8$h!+`2%9LZ&_*!g79=e^
zCs$zAaWGe0XLr_Ee)izT1wqH5(W4uqKr-T+0mR6{6^dHME)Q){oe3*B2@_oyJO~@i
zK|o}?W>c1kV$hH+&f4I@sldwjwb&HlEK$~gG5yo=h}th_ydD9nx4{-CrJHdGzJool
z^3@pu&E?^v9X)dx51ZNsN8;)d%2HFRy=AoZk^aQh#l1zs2Ej<YKaWO>%lnbgZOt|8
zyqWjsx)=@)^&AQG$HUsfAr^Vcp^e$!C+;PLVV&Wf#fqf@#9EG%!HKl&>Wg}D=X+~o
z;wguV0&YZm514|{qx^>MQVgR}yOyX%c=JX-|65g=1E5^i`};mBzey38G^IeT%$WA_
zjs&xPSgJM^7)o&0v|HLHvFe}~Nh1Z{W%X)zf!v2)o(*76$Sp~u0UrSsghIT2n*Di_
zt%pMaHx)08I(9#B#%b=3H4L{z!VdnA;J}eU@e-$sir;XKpvfHa!pZuug_jlOXu={h
zkMVPov-g;L2q6J-S&el}NFXu29d`xDz6ZHoe@CQR9M%B;1%4+a)4X6`$R6_sYHqT8
ziX8{YyGXwv^gv8}8^$8a(DuZxHR#F1B}{N`f?*<d)0Q*B#}{h`84R}@;@FS(2dmL>
zq9G>>|1l5Fg{p!WKuPTT_x>$eW+1=rIu%IiI!jGAV6a_~oQ<77XC%sd(+M`NBKS2h
zg~74LZn=A@jY&}n8sG^Z8kez8k6Gs$fv?hFKiZnh|7BuD*zQh8DV^OH%>>pnjG?=H
zW$WXG+L7YyV9_LfSKXIC&N_V>4bf<VctkE@0GL|QPNc{%{^PD1EEqD?gT@qt?S!=X
z(1uDJ>QEHat4%}cY?^Ea1uif~3cG(w(CD(-xYs2X1721-`yiW<pNSh#MKS;!9x9kH
z7kj~zIQ5jn1tfVwSH0btOX7jz1`B?#44V!I7K9)syU}N0COoYJF+xFH&7UGT2Zibz
zcslHa*@#4mGv;WKhqn|5B7|)8@)RXj^^k5y4wf}n=Ft2o$}9yvO?1pC)m1<TKA~J+
z@IQ-`Qz_lU7=S`ht%=uLu=j9{_Tq*)Z~SqVYrr5;9TGz{!+R&;B-B-QP1&mv4EVmk
zCD|+dK<j`P;ZU#{QjU9STB|^}54>xT$<df1juR9N`1G$9KjDr}RRh3H@DF6c+~4iL
z(ndmwq$rQ=LlbzI`S1-tv<J}~`8g))X!DWJz=y{w7-(@&+_|rRUYNt6_UvN)A99zb
zxBxP{Oidfb$A<UTDOGW@P9Fx{mgJNxGR{~R2c1gZacTd29GESDD*CZMX*3Um!COl#
zMV05_fKh@u1w$agJb7*1h4z0X{Jo^ur{hMFL2-fs*qI2m7|3FIp6CIO71f-*Ysj+^
zJ<l$LiL6mvtzKX6-|dGg*(_WLphbFh#RkWHa4~O4Kh(K{T&7saejKta`DHAL2*H{`
zoa)$J2&^=jix^OM^nK?bS?kesAK)6+f)>|}L+2$)Pz=nd5}Jjx9Q(RQUoF*mj^9go
zv(i7r*sH;#@aoRuC6OBf5rftB1(}nK=oBTDE9w=~v>mK==E3kFxS=~MUlhj=qeyO8
zd+YZ(?m1|JcK`h2P>Ej+p*M1Hp<4#sk;0xM9c0Ok3+I--NXp~y+d+@H_54WN@5oh^
zLce$3d%Dfg@G+$3hM`Nv4NnO=BQyv`?M{f&qq{OMAwfe>8=g~~!ZIxA!YiODH3JKB
zY{-SgONTLD#LLclt6No2TODgF3zWzq*hP_Z1~v~FkfuSM3&!54W{PFRb@)-+*Cg&)
z3jzl>I#ZEff9`-i2E9{sJ*?sXLKAB^<OBHSWG2p&8?LhaBdP3xxlr6Ws9vjp+e};k
zRi1|<g&5R<{-@6B*D5$QKA6q3X;}QQ`BtFV0pC@Y;6edwVqZvH2<C{|v1L-yny*;;
zg`$;^`f_X-!=yRCza~y&(Th<iMDwJ`LI4;CDc~V&!RVZ%g=Fns-Ev61t@7X-;zn$@
zDj=<ycy#{>zr#o^HjlWalUfUq_oR-h>+7*n<WhwTBvGkO4}wA%Fbt+iO7w7PP?wo6
z>(87jz0R7<FKT0~G}6+4u<5N5^Dbj~-nr--02dG;@Qi6lp<siHG8de{7P1n-JO7$o
z^e?UkoNj_qKvBYi4!qf8X}BQBJ|+m$R3!HG%3~oKiSDO1Kg0Wv4PHbFLquJ_Sv~94
z<1pP?1(Y>FBBAv{LSUFqb~wR#;@w%KfsGGcBau{*Ej?!C-ULMlS+`=PNIi_$a=@E$
zS(G2$z{}o>Us{UW^9&foW|~L0pJxSjlOV;?iiumg(H0&1mVR`0!IJ$(>lVQr5g}*A
zs#PU4xlQ8G5SYUS42Hc>HY|ZlF=MdTcH$Y(J{0$hGA5!7{bAM`0LL&yCYrdjfmwvI
z?{5WE%ULPn?8NpZ>kO{}8y}4G<=~)ze0RY7KyA(Kt?qa(?AP}g?K6D2r@`YJuxB(C
z(>|xxDRAL?;N_xv53&{}bP>l-*5$pj?u`v937UgaVS$xv9&sWOF2_tNvW|XC;I<<F
zOa8ueCSC*`4+y1|1Kw%-d33Q3Kae<3i`ej2;DsTBoI%6l@NG0}b8UWHR>)cW6i7YG
z#i45G-<d^fCyFH|I4pdKo8^rK5AIv=MZwEpL2$%{mz5WRgy(rKXznJJP)SP8zi%%B
z&9>8cI_|g?acC0ee9NdzXf8ndLB`@pKadd9C_nx+noWs@qc1O|bl3kxVG%c6@@xUK
zBf!r&LpN;R=inxmBx@NV?ZPKRALiWpqdAPkb&qkXT;{<qAhEzF31>qE@UD@o^A5N}
z!I5p+#XTq2Smr-&{D}|*vl~Q^eI{ivu425HgM7SSZDLT3lp{gACQ@&~SdS}31F{0e
zmYIc4ULh8w?O<*{mdy6`9UZKC%EQ6aGZAN~lf8N5P`=N|cZbAVNLE4mpdG4n{?*-`
zV$uq7(w#(61!V)5xMSBVS^{Fa;0wB;D~d=!p^5sQkd$l`?=%&FHFoR7OLycGcFlQ;
z<VJ^gI-gi3SuWzdPoYW|8S=aOcH|}z+FtIAr^66s6WVz6$5Tk%rQ<vMKqWFt);37U
z%^BWqWI+$?2uCIuojK$L3&{b9W{qxaJch|Q(g!SnyHlcQwG1czhvB{K1qyb&WbuV+
zN1(-Lkc+AVG9b-Z)q&L{$wp|MJ-%l-2qq^fzruI{DJ;n|B0NhabqtLTAeUGO*@i;&
zv&obd+sz8HceJX^I}fU)6tmDL3?cFnCedd=WtZimKVO_@-v?5(!WXo9vgKXR5t?Gg
z^-NL+l%}(^jSgTcImz!ailUsZa%?9UT+<{f*@-%uAcm#ET9L%M&?VZ6KtAo_L^$HI
z2y#9=0Pci)>3PjbpZgk8Zz^8<4mphO(YjSJ@g~;Gw&tQq8@7{)PDw~?=l#QyGO)db
z#S|wG+hFs^tQNF$7YBS#xGkNMGaYff908fA(n)3yxk01&r(!J1IyR--Xob<uL(0!H
z&d}x{9Ybsjq?%;iX}Z{8zsth~(tA=^cP*-eL1*L9s*V9G$g&{)$2xjpe%h^kp{F?F
z?|Dh!HfVbAk*!}4k5~lBqO=yCpy6Sf{cUJcQBu%3Pw+I=I}}&_#zzj&gmc08&s5En
z)C4Iq^TTyjDM!{vE5fGV+R%Cobs4F}As->JP;$~jTP?{$^~kpnjLbjn8|uL2PNT3C
zuZ-^Xo)J0}@ockr04O>gVbUp2$?gu(M)Uj~4#QBFhfQDAjy&4vH1Vb8_AI2*)<188
zX<)3m^lH@@3R~iQz;CSVs%w5I7!&YL=*o)#s7|RAsWrWKSw~D{jq{SEnMzNJAzQde
z(6_-GwnixG9~~nFN5BT59EQPu1!BZtgjxbzAFvaGaNOtPx=7{iI`Xw{HtxJ)q*HKI
zp?ZRFU*vZYjf(SFr(vgQL6`%1F*m9wKn#svdGb;^M*9lNspvt9r(pk@m9*#F-@s21
zV}*G{Lh5>#xtx?CnPM&Y9^?>hjocYFZsZ6Eo;o5Zv0Tg=p18K;*UO}R9t66b?3c8H
z0|tw|BBSZR(Q!-~TE~oeSrr`XIZ0s}1CGwCK~*kCm28)6o4@9&<>qP6Eq9Lo?B0{u
zu_7$cZ|rv)SS9ew8HOK5e^bl2((?lXG1y+GGQ`>SlesB4b}=zWEv<_I{Ia6|804jx
z9lEm=Z=7b+R_)E<#ikubZkN7@I2{NI<nTQ_=7#}iHK!;Oc%>t^nzmx1-{)h+IqS%1
zw9Y6qipvg*3!x;CV<#}1h7|Lf%mX_q8lcVsR-Ew4?}CZ>C#GjlN=`e6<I8`5fgH>w
zyN}yYi^NGI2Y6g5$vr?Y0pj2)mp%A#8_kR0FNvo4{$%v=&9`<mJb=)K(B}zVRFoJ1
z55vUIEnKN!u(?;r)XH!o;YyT}h7(@G#hK4jO&|yF2FhD3GFGIKo|j@3g|yWvd2cZH
zY{RYvVoQ2D5~{(!ZGt58QkEdLf_<(IiRJ_!zW)wR=o>0JP2lQG{~g)VSB|Y?vb0KJ
z$W@pPSU0h<6VG$6m{{-v;|ug{h`k~5)e{@9tM+cBP};Ol{slPpTVX1bhCF)MXA?<a
z7)99&vr*lPp$3fx4|>61=$6c!w|7^p9x`UsT_hJsWhQ>29lMiGunatESM2QxEC2fX
zzd7)x)wka9&#lLi#0^A#@j$A47MF;@r#YOsQoE2zkq0hduxS>l^C+StDUn6W@Ay(5
z&~N8E&XKC;hcB!MvwN5E-<!z3yzZ2<V>x)WMruZeij1+h8y>c<Js(vmg~Gt-Q{t-1
ztQ(`d&rAO3ZC<c>#4%B&`k>~$z0uEPz#RWSbQ=|)vHG&kDko9S;%)b2{%!D}!h^O4
zk?hrZNnu+5_7;=ws5g;#-je{94&3xL`Y&V7iV4IDSjS=x0{1MbyCYl#Z|tSY3U}Vt
zItw^jH&`4?F1qFq0~jP6CE)iZ$1*feMKfl`nkcF|)UA;I0elXX-BSWK^lGhy*$|~P
zve6ORd+^y(7Avk3^Vhz2@hoYm<2;GQiRP%J8XNm(^CTQda^wjo*#Lw<Jr`-k+5PkJ
z7JN^2zGq~%k<>EUJ}KSEOtmXuDs9{r0~!a%;(zZ{rJ-Ev$ofzwsw%H6D6grF2ur0Q
zH2_WYU5y0l<0qn~J%0D{YvYPA39|V+#d#ixA*#=$mi;DWAT&v_MlVBVAFDYFXQozU
zZr#2h7Z4Hg4Q8-$ZpIB<oV^n9ot_)QD{Z(g;q$Jwu}WcL(edw&2FAw5u}^RU(mgDo
z#c<&5b|<vxlg-xs`1Vc!_gOJAN!opTy+2*z%PL2Li<C0!Yk6UHPwuad-|@)qZ)})-
zrE;b^?<@3H_c0jt4rZCIF{2|WL#&0joH(P*w>NraSe&6c@k<7#47>wdc{9WRwN0qP
zuoR@M4_1^PHV>}+RD|w3!LCog97bCj)7x#xm$jiNnUylXj+8;EK*Z%u!U&|3A@s|h
zJJdet6f%wE?3>4)d<ZF8c56e`DrxD&w5n^P-!Vq0jWh;{Ji=yju8ba5hVU|6t2M10
z8m$vji4J72w%OZCKX+sLo$jM04F>vxf<i)u#>)VudiJ?+t^I>9WCh^PFd-X#Dkxk`
zzxBr(Sg$T#ujtFnW!Sn6f-Cy|IlfukX`3O{LzFfxmaZvEZ|WiaZs4~uQjyVzRt0fz
z=w<b&U8>H?8wXOWih<K#IS3HD1ZN^XX^4x4nJje>-g3&@)x-Q@!dFQhq8KD_wN!cQ
z&s0zQ{Nv0KtL5w(r#e6M4VgJZX^9NW0=`>iA;){uTj}!fHu4#2;o)l{;xBZMnxE*m
zw*~1m@_TDxHhdnSEQhfT9DxBqN+d8ufg%gr6|#-Gr-mYa@6MIGz-$rxUwcC{d}Y;-
z5g)P`)7-=mW<#Zj(I5lEtfCm?<88PsQusAc037ShorSV45ElqC#8XgLGcZoYwF`Fd
zeZ`%Y9*0sx<3f-GwxcWv@46<blfZv(c3{AqH90KUr^H^yElSN_gL@xPa3FsxMh1ym
z0gywUL*ka9o~B?D4Ms`=FG8wBsyo31LHi9ES+d*e4swEnMpt-9_Ux22j?1XzNvs}s
z!3iJ)5HOM2d#qT7W{q$&qmlN&wPg{0F}pYCp=Oj;Z6IS4k&C87uu~iz*o+FB>MmIe
zLy_EAs|kgX*M=KsBk>H^u3a<j*@`R;BW^eS$469##N=fXBIa+>IWL$k^5FZfYk4Vq
za{N(Ev3zQ40-6k^UCSO>-%Vnm4v6!Tn>}j61#fuKtALl4sq?ymDBu9dQW!H$kZ=4O
zuY3Q9B~rxVdw_zVQ*-m*M;Ou}r4Za`1s_J;ednaSR_ts%b5iBKJEocso4cmTTwpeT
z1|sS0p~W+O_KkVrBEe-!RZci96`vTJspX20NS4y2I8))Ibdoe3P$C)trAx2u4Me_Y
z3$YkNhfb-~R;BH_y#I`dVnSnr@}V`Qd`q@y`Rphhd^&UO0!0Twh<Tkxjs}YB9(H*{
z!&2ZL2$_P~g}C$+48-R%A#c(Pe*4&49yE&Nv<6XC1ux?+uRANYUr)KeYR_1XIzC@i
zg&#sOpg3UMVJ3DY!+f2gA(@3<ymSd@4yV{AQJfCLU%=CeUPnnAHY|ysiu2<3edY1Q
z0v}dM9f9P8FS=oIw{XEP7878NE&6jMHcI^+QksQt#s$9S&Tc6Cn((#AQ0MBP^Yl;t
z<usRu8pyOKowGP?x;BtyXgJ}WT)SQ8<k{~NTWDAU;1Dpxk#J|Oly98e=a<lkAGYK=
zJGB(oAnT_+M`o-jc-i)0ZPDx$Ev^rHd}}?fA0iKsQk5wPJFvJ)X-D{dIeAFMAd}%3
z%`vGtkBy1RJE`IfqsypOWNph@$p;$@dQ!ZXt}-wbLW*(1S{R)GiZS6m@{tR<Q1zph
z*o=8MlzS&JOJ3YDK(xOgYZA<c|8-j6x#Kft8NK~*t)&;`&_-mlRC#0Qhv|9@m7Fj@
z;NoD1WUNBxh)Rr|n@08-G%nIBG<53RE~rfh0%c$^Y7`Pk!tma0SSPcLuN7x|Ui@)a
zbuNw(89RU!MCo)`Tq`bq%>uu%SHMlt&)BRlnE1R8095$)Ce+JN9eIVZJyX{u?9;)P
zDcWRVZe{c9Cax13q<DCRjs|1Py<A?Qh%<`jd8mr4{_7?8gajL$SQe?5n)9nTU;7PS
zZK*q2a9%PN@^E|)?;=;buuQKAMRk%N)twy_eO^b7`Kci_pF1FHvvhbYDZ9`+t#xfd
z)HGY)T7}7;sC&fUV-Ywz<+Gix(8A8d*04KX%XRowVZNI+_H43vA{PJ<8EAF~B_ypC
zaqp03u<4wav3kma724!bp3H@8Bsx0us!QH%D*BLjkZLwvPUwhXfM+&r9u0Gia|1!S
zBzz0E=!tmyuD9t+C@}#TX2>2J5N4<z`Vt+Wx_kQ6N~G71DIcIjvrr_^I0w!uOe)tD
z{5g0SoY<mlq!!>L)AffU2FWgy>Haien$a*W60}am*%8O}osVIGBo--Urr-*oXvi_v
z3FN|2aBO)_-j2)r35*q1Ik-=&DR@Cf?i!Se_)ZAC(=N{XgaGhx5fAq@aMYHF7@6>%
z4Ee0D*pvy%G@U!tk(gA$ZMNgBALcRmYsMY<H%Pkqvfrqd_geF&5jD9*>bj5lUr(#3
z2F@=!_>#CB!&AJp<7ujy1NOR?qH7r3T*q*i*ix@c^*UKslLS1KJNW*nNr$1lFLr_F
z3bqg$8?xwLI^KJSdJg;^G6bntjg?}5AqBz%_#o7byoF7=So)HIp*F<ip4{BI0f`R{
z87x9lkML>(JZ=~&;T&jDV1bcDBax*^W5b~3MutK(yn^*a;s~TO0+ia|BvF8{=my*0
zs|{1ljJ%(~f%FmWr^huv#E|1)uXlSEw20~NXFTwJVnN;-c<VTxPz~oY&32l7>oG^Q
zQ=+x-3h02tJMX#_hof|km~<Hb8^66*Z^<>b*`>9{?myAFGF+?weLQ1@#m&IZO4jG2
zAgT~yKy8XJusNy1X@f}9HLO_XJK}0jUy-^6x;MCbO98!1W5G3+B(rC~=K0B{6<P#I
zDjU$DrDBYTf54`|QRGmmx`Qhj$rxU-7dipl3(nfb1g*^<j<cv<YQY{z$oyXav7Z1x
zi4aVXHwyW+mfW`<?_N{4;X1Y>u3Q$YQ@h)Oo;rVPP{^6scE0Z3??_1<yq{SC-?PzD
zkY+f~W}Qws0*OuHoBlY4$|5;!JG1v=))tR+9pUL~mL7UzNL5)Jddor88h+0fJP!6)
zOm$|wZNsd=IZ~TPTDs8*m1x|=%TAGP^GKFGlIFT5<LG#MA*rkX55ps5mti!gVn@r!
z-sssE{jHiGx>_Fo@_HuD3MLttt!C#&fMYT?CG4;QDo^r-J{B$uWzOQwIJSf|p~4VE
zm4m(oPNJa{`|<AuPg)&rx)m?Q<*OOYZH-Q2ea7!aY;yI$c}b~^<&)ttRC$GzsRE(X
zyu6=tz8oux^TiJpPsGFo4Xs+a^1j?&J6lNB^{Sz#(xZli;EP)NrkzOcL3IWCfHwfM
zdsEHFK{hXWenY5{`YS@?E-z1^PJ~Bu7*Z<&d-1hOK(P8;@}Q}Q+7;6-2HSsroZJ>=
zn2Cs33fR57F1xjDPvU~@S!hfG>P${c`aUSFbKa2aezz*p294f-Y85C}dbk|gA8T>9
z%G>?Lk<Xt#z4`r9x4{GQrTm+2R&zw(R<v~0E9dGx6e!LVWm>^K&<wM2RA|T919=B|
zGFHlU9*dfNaaNFR*_q?m-RL)<-*W&!D!>Dp#FKTts*-wFoSmI@I@uSHK^hAS;8;x~
z-k7FnLv8es5ZMjuWjSY%Ha|$p(mr;Un5^YFz^_*2@IcAA(ReTt+ixy%AUyYvuA0Tg
zCtKV80g8&T1&!VUJRowTLLAa$Ybmbcr?`u|`|iZ8%9*f>G3p){;OSh6fSK1N!6~9p
z@tdD%ubKZ6hm<7ysYWJ50#M>1)qD^gppHwFen>dPVH6~8MLV)qMS!O!OIh~_&P0cj
z2VewB2S6#o(3T5o9>RC)ha>!af#o|&0OztrmJ8^Q1_ozy4M7VRb%hrY<-{Ky&r^92
z4uA`=?OOC8#UwNSd45$xJw#cq*Irp#_0{UoaMSZqe%5Jdpr|7oN8SX!0=L+vy@|fv
zc@fCQOgaxgTY5#I;?>9{X?bmZrg@IOX#2HE-QU?E#~B^&`8@J-o^rq$EM{h8`3bVG
z!xLa1ZH1MGV-`<l5ecXy|GNMwnI(7*QkAF~0ZZUU-bsiZg*|v)OX?KiKA3kzSPN%l
zUj?rZ4lVF65)7dJCr(pH9Q)ncQ23;@Lr9ZaMAW2)WC0t1mE7E1w5?MW4GygGe)2)p
z@?Vq7vPhN_cS{>16?cL64vD1W5d$kf1&Xuhv<A*elFHpRx<GDUh~m62B#MK2zy=$f
zS_>;zlA|lrv<<nx*T-{P`%$M7j*Aq30&=4SVW?jJM^nD~%{Mi4Pf}m`<1Rf#-wmMF
zz1RLARPqmwUqDiYD|;?M$yiwG385fEOo>C0gqVoKPGv12g!rd%z;=LUmg4Eejuar)
zA<~z$z(YtXa>Qp71SM=i_n<LyjyZGM2E5RTBaSxE1TI+bC1+em5Ujn+1K`c&F&bpl
zUm;0Ka%(Aa^&d4upSey%&I9AOR0a~`Q(M0u<2eRqi$%jm8q*Gc*$tpRQH%$PE?Bq7
zIFZUd1?hskg{bRI=>&E^<7WpH7YyIEU=M-{Ie^_2haE*O4wQ<3VoZ%Z0)I)ffie?d
zS7G;FZ1&#u4plkcqIrWp*tWnV7Gj1-9_$x1#a?AQo8k1Lx)nv}2lPjW0vXY+j`0(k
z#}KSah4yH@FUw;${D0A!U7QLFW*IRq`+L1dN0F@8sPBx%4=9x~M}Ksk)yxTs>sBRz
zm`r#)95it~Rmcdih;q;J8+z1YjcS@!RY5Paah{8^hPFOUOOOW_Sl`5jAan(N1kl{k
zZKlZ=1Vd_DfKz>j`jYt0S{P?0oElGTMcrJC*nT~{qXwD}MxBmd-IQg}g*<B=4_4O3
zqY5+zfzdEgn8zTtAnT|<8+}RV{7D3H>T#A~jfqx;xm;_|Uf=>o_9=e_St&(Ft9~=e
zUQ`TVmqs4A;mrnvip?mOWPN}CP`&=IroP}z*w%!+@duYjAY%e~2Dw;J=l$j;-+mNa
zg`W<Zj2!^llLKAZ1&_xjTL~l!v&g4k9rJN)sGf_cT!exOS{=l+?igsr5dz7SbUCpr
z+zV&UJK3B%pMivP&~OoxqEg)yoX^6-e?dn}-Jq!ZCweU19-kup|16=62xjpzr~fQX
zO+`pj0n`cVRJ8a{Y$TsBMDqEVpAJABUEV?cc=QEkm|S*m@f~VIk{{u<Z8uIdw!>pc
z{t&7_$BoQrP=zGEj_iXe1PR$fc$)qa%(@$XzwQ5^BcA;`U8S!dV=~M}9Yh>R!yYvN
zP7gqyrk<ekjzW?}17!oYDGASay;TJIhU}8qLFpK}?mo>;3_b*}iBbOv9<+utKeH=!
zs?lV=j6>Z$ks;R<M~mBNcyB#y-YsRYf7##UX{t$lQk-3sk$rCa4;6mLVjytj7mcKv
zodu-!Nad0_tkXRf#bxF3{Ctu73oC<n>T1Z#EuygEn-bm&#!{lx{5LT{P-xM5dwY{D
z8FFki5J8obu`XhzD<4=XMM{zt#(Qo@vkD1q4m1P04{xgBcQCGOJg6)xq!DO^k3{SN
zSz<k=(SuPpqrd*%yYssB&;g^MC-?{SF`HnX0O2+cAbS}_L~Up#1_G|ES4k%F)b}SR
zQnW*7As)B4x1Viucq}_wjn>OraH#+@dn0zsdi7f+P7p5UUh9ve7R0?U4257D;^R}a
zgs5}f{bYNC2eLkt|7{?!l~S1o!j{MhN$hJ}k_IMgbog;?*WCYiId$uaeF0Oc)A;#V
zjNiQJ1-bQsbJJXkH7U@qX6_BN_|VNnxq@*8YhfhuT@9&Xq_j$rL9ut;`}B;=Ovi8K
zJ2wol;$%Dj?7)p`)K!jtNBuYg4mEpdYM0P-9K&Sq?Mrnupq>p@ge1Au2z@^P^XL=4
ztc@tis7RxW8@U+?XdqsYr)2&`u>8mu75*2LINkL9H6NuMy8Okx=w*`#9jSBRN=M7i
z?aO~xw$x?k*^uUieuud5pF1a*xz1u7+sE)JBGa(5IEK^$!zENA2AUSQ>Kat3kvKI5
z*sQ=9iBk1&NV+O`9@`oC+O&)QD?Xr)byOt+eE}#zxFa1lE54AA)AQzKG~;EH(XL%?
zFU#j?*Ti_P5i~R$0BUV!ITx2-4t}TO`&8t9$mpmK|L4zOH7G=O^-1%IT7cIMD~L*w
z_}86-ZOELIA!-X)haeqlZsEPu52A}jAsrXQp-r7`fj0*a;eB9O)y#U+D=S=VNY+tw
zXVw?c?@B6rqre3gAYE8cwPCqZc?vWuL<=^&5+W{{n#N@H;$iSM=-znnKw+;}+bO#@
z2||a>%aJp)nm6F%F`Qu#TVUw-7^n*$&q@&xwke|oL}L_k<;D$*p8v0`YXOHc-Q%NF
z5)(xx<ksw2N*lS9)FzBet!kdklqP9Z(`A~`O&49Zh@I%73q?6)w$rhTYN*vxQA2D;
zI+BEttjjSh>bR8K{@(91_3X3r^z?j>nfbnVzVE&K|G&%sb$&G14AXCTUB{Vw!`SlX
z8a!~z{#}+>X7{GENYwbFkX1hPz$8ioeHwl@<L4Y6kJP${7nf+V;H?UY6kwqxb!uRw
zhQX~kcCkn@TmqiQIvSuU@O(t{sGf6bODzYi(B7P#;2Oy=${lz=0k%W1c0^eVaNLo$
zH3^k26bd-*OySD*I=4HY6gGNVK%3tIh9`a}sHvg>n%ogrFoA%0qsmSFj(KTf8?8}D
z?oQw)iOugGNb2ezgb_`Nl!gcr;r}9^HwxTy*6@$81!iwBxe<n+XiNyHDM;GNzd|;&
z#-k=f9jR$o)xxdKDd@kb0iWn}rH6#3C?h}Cg;a4YH&d%VAK7K@+-fTnd31d|L`rVn
z#=dfZA_)Tws3s;fQ|`s%q0_RWGy>w2t(UxRg<Opi95n6tp&Th4CTIMAjtwxaXjh9y
z3g+LRf7avJx6SXe<q}rP8$g<n#qV5QoqXLS5f+60LBknLiL>D4t*v+Z6_h98ZdKi{
z=#9v8<<|UPzd;HIxm#YWr3d=lSRK~UyW)x{-PGmSJ&1%7n84f-wIPD((OxV${1p5C
zk-hB*v(BIl9{bDthQ`o^Vl5LqdK3B{aJJc^TR}}kN{jiNI}W;)AlV8j-~YFawJJBz
zFwEVnC*bO?dp0$OqIC<iQij+|h830DPc<EgW!r4$?R4bim@hr&uNq96kW;fxVnU#k
zlPvpDBvHaB>BMwO7JlBVZbZ0Pb`k#~qr>d2_sgz(-?n6zB-cW7ZhWloQ?wbuQ1v;6
zqy3`yLCHg6LMR}#v#NEW!N3q*)xX#UIA0bXJW3giK_q_+)CDj#s5WM#ZD5MGmoywe
z4F5KFNR?}RhiGn7KS66JduK$~+mTI@S!mhmL|8T*@B)%d7q*LjI)wQN@n$LH8YykJ
zGGNywwa}4GREOjiV4pQos0l<mIOIjiQ5Vx^pK5pcTK*vh@^qx&gis_ZV`N!MG=-G>
z7f;KoK1DWTgQyHEeI5eaxnO9Vx47ym*He?<^gKetA<bK?InjP;vk_!kKuJh>5N_qw
zXCkFZ$bIhYNAVO4&LUwhWMqI90qnW!Y3i6b>kx*{l<EEQ%;hbPele}P!O#<A3Z4vk
z3fK=)El>e<3}J^6FfeaIG)49SLpR$GjDi$w2T4!Ldt=E6iWn9|QS?KDx>*{EIUfej
zqa@H>izVC(VM#Mf>~t{uLN<Vii%@ec*S)r|&Bb7^;V@UJ{6y1kX=bJI#v^W4^S`IT
z4bt(jgul+%G=b<>lL7XV$%W`6+{+w^qNESvVb=Nj0!nG-#8d|nTB#1tm*eJxfOrgk
z);Mkud<Eb#?@d9;J%>v<MndR|5u5?2h272zQ4wm2teXwXLWb?!#O5oqlAv%43)E~e
zKh&u~Vk_TsgvG>4DuiwTgubNe&SDsxzpJ9GFDX0K?7Bs<MOoU@-`^xOPXyw0-FGsS
z0_<0QKN$jv0(I$85h|0jR85Xc=;wdYIvVjI$t4*Va*v=!u^{N0tg)XSY)_7<83Aks
z@AMCyIVWT#=5awE8tUt82i_Tx^)GMO1*o;Y{#O=9?2Lx{G*E+#yZwLJOuw>V+9IN%
zo{sb<8Z!M4MhV?`<*Vmn$!xh_Kp8cR$Egu-Pd&|&gi0TbCW_vTqO{AOZ>cQ!s7E!f
zR4|2_2)?u}m?IjXZ6(`boPeJyzLAwh`lb7b&E^7JEGc%ywgq#kD=7?3;WyE|{A^0Y
z6r7K1t$t=JI`1cp7-I_fJ1*J$XAX6xc?EPgmsyXcT2#Pqv@4Tl72YG8zZqz&f1i-)
zAP(k<4p9yu13~sp7;AuHdL326$3j`<Jj#z1J&RJSJ27{qnb1|=N;mLDL#?ZF-NCq)
zBCSCCA5)Dt5IhpzVEa-nd}pA(l>UPJHnKkkq~Q6to&2XM?J#X)1ck1|uPzb85lpLd
zzokq!XoTjAn}MzXMaY7ukZSWH;4Owz#%O`#B5F?gGPM+4TO9FUFt}h(3$kteVU>aR
zk$#k)f@m{bkotc4pdfWbvDZ<WJ>tY~qDA;NOb5Cez{@;GKl;I#)>HPNt{D1W=yOLm
z(0_=eT}f&~kEbq#lGN7?ykQAc*2_X2@RO`HN4kio?n(M$k<>_6+mt+C=Fv~aC?MX2
zc%MGI#qbW=Ks;Fn-Ub}>;=jHJoO^-U^0dpO$j9)AL+VRgY_Yhv?=8%pig*8?OJCv3
zyTu*7w0Ueg{dA1zr(-1_`-JMg4~sH%a%#*}BI1_R8o6gOV-}rl%>LV#NEr=zpWIrT
z{=mvF-=H569LWd)Rv3Ksij>lm>rw4N|8=^J+!V_=BBcCm8#J|fAr=~_MuEMlZv6G)
zmkJw>AzedYvZ2NRC;9Nt8Y%WDK_E)*fUrm*W#Eh`5&F@yA{16?K#d@NVBNqB0(pJ3
zYvK)*PFHR2oV7o|uh*l!@P5d{ar9kK+aG1dv2bqy8GE~99?3qSSDvkqVvcYc1zK`$
zV8zS<GikteRa98o`;N1k(N2*kng(}RUKg?uYeWhC??tYGU%k}~N}5Q>aZtuG?P@Tt
ztvy2iqwnm>sR4xOhMJCvPU5i)q9!9dEztQnAXSLRe9v98UJ|*fX>dhug(zu7wg0*j
z7p2S^hClB;4x{HN?b7-~AB^cx>ZHvuZ&=$C^l^1)&G!=b{KWA59hON#7Gp(wLuf$O
zjc%*)1|05jwr?yO+}Em)TWfFKU&}R64gU0gIX7+o!FUTT!`kZbUh{1USj548k$rdR
zVPOBc&lZ7hg9895AJ`K%R`sC?h;m6G6e3;3qR2I{)N)V%!#hKIy+8HYPAkdMz$$4i
z+gH0kGrRC{y5RC*|EzC3>RM7HJRaL{_1(a)Uo<4Nr?)Gn2P|4s9aSB~w%ict{_t({
z9OGV<IA`}le&8!ZO^s+dY(h%>6J<UUWd`&{Ty!x#eOsj&Vs6^TGwS20LYKglZD?2z
z=7_jQp<=Aq6Qrk^+WJDrS=_&FMcMr1<6(8<d)J|K+eo2J>PPK7`FR|JKp-+B#MmI=
ze?X{Dl)veMtS2elA0PA6FKMi4tW%~7YMcR2k&V<5h&=wKY#{dD`{zF9snS88o~Z+m
zLK`Qhc+85~Sd%SXUb-o2(b?LNmUgR}eIC;?Z}gWkb(Klt!>!1Gy-u43q}tiOD4a3-
zR=d%HZJFQizWF=wE5JOeDFZnaIDq~={@_rPYFSFN)yt03LsDoC5T@33WIgVA>&)9%
zw|!OT-Z=9ZLr8%U?h^@=^W|BhcIYYPts`y*t8H#XF9%Qv04wrYLTMtCAU>sXe(cL8
zezgQd3Szlb$!6PdZG4ebn?wcIw5|!0&K^pzS`&0XMQXwToLyd9EZnFZBQGD5?v%cA
zWp|h)wS02-Qd5#*U|BzZyV)C(I4#YHVFxfcctuYICJVg{dQ2`q&bj;J!F&C>T?Zv+
z%vF2Kn>#$+y5ovUhaOm-=9bwvDEt%;nP=N(8aEsJnoKSo4(T|}WgI_dtsQOj+u%gj
zi>iS9qZVF^Hr^0$GveQCi_-;S$;^LU8O|8X$dDy47z|6VtD6V!R(=rl7Qb$Y*}bwh
zDYUz^MSA3Yaod47hs|vaMqb>d@%SKKo>jx^=p)8@&tgsb(CP#^4PI_+`^of?j4c{P
zADzwrjy|wB_xU7~95krE9rX#u7JB5R=ll1QT_4SV#SbWS^XHRK_;Qwc-gI9d{wEFH
BZj}H4

diff --git a/docs/source/_static/img/dynamo/td_stack.png b/docs/source/_static/img/dynamo/td_stack.png
deleted file mode 100644
index d20b3250453c503aa77b7967862c1390f05c4c4d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 308321
zcmeEu_g|CQx3!>A3}Qiw(yc(0BE47XNKF7k69Iuhr1y>jDos?R_a3AZdQ&M%?<I8U
zz4!K>%zI~?dB68_-}wXX598$H4CI{i?6ddUYpwmfRZ)^5fl@)woH;`xC;M3K%o(CR
z@DC-?dGI&6t1KSiUj(*lG7@KU+Gys_oMAX4_gEb6q_;G3aR4!Sa=KpeR^I3hy|2no
zl@Ez6a!MR2;;(KobHJHYB~tEE5izH}HDY@A`u+XaO;#C7P1j%~KSI@}>P`>7oDS+L
zE4yYN>Y3+O#aG48y6bC(+aAYr4=-Z-^;}OqcTGHN>Q7D%S8GVl5}dm#cIIFGQJ|J^
zgCPEkn{EUdArXVunSb*~+!+G;m*@WdS7>>8DO1$&DU$rF72#J-C^CHZ->!zpDE|yW
z{6i1x2&#Xx_&<K_Ik(aa|MrBg{&)`7^YrK!JNLidGSQyZm4A7dL=5li3D3C|?A05f
z{`Hoxu1(PW%LjZ_j3EF*v}d#)@V)0>Z<%5IyYRn@1+Zi>3gWA4KYvmBXWsg^KO<J_
z@A~iJ6D-;LAqB(u$9lHQ>XQHVXS}?w?Eb4z{^P`-J`}5E4A7@Z^ZVC7LqNasuY>tq
z!T(->|M6h|_X7OSvGl(e;Qx~t|F;7C&x!GWr~nKl=lZ@pHPpyP8e<SyFLp*8$A+VX
z3Z?#=U-$!wuY-qO?2rDV4*nqbW~<)&X1CDuvn86>vhh=h;0oX7_^T^X=Sip{MjVD%
zG+BL}Z~r&t!6*@5yxmkINdKdFPX*WJ)|{YF{W)ljFIp3V^WDm#BRW=JAu~w<tQ^o2
z_W!n=5#0gFJdk4OzxlHLb6_<_E?aXGOEzm?FGM%9)eLjJ93F_VX<|+@`uU;lu;A8z
ztOC@xPBMmZbuN9!9IF4gr_V{9j<@@(nuJsv2rke(!<ATCvQ{i{sAp+pGx<NG#~+k8
z;nltU+4xp)ye(1O<}Ky+i!Hljo|Yba6Blf|<QSdXx71SQ>p8MY#yp0EPLDQUeD#5(
z+0A#Q=~cPB7_aqyEos;}&G5G?F!dRn*|1Ah1j~P{)+n;JFI)S`V1coET<@bs8pE9G
z15|gpy?HWB^wJ-zSt8Y{bjpw9Fn;H}jO}mv=1c6(m~OHsygO`|Q@J6}_UXF))?$DA
zu>0;v#gfNjc1hINR~KZ2A0Ga_xQhitLCNo=3^u#-|2|W(9U>juR_+%|1Nr$+GK`x)
zr<wQX%oKGi1uRF<5aADyp|Vz4^w{YBlPIHxz?+65$6MVh7x=)&TZ_!P72lHHlUNQP
zaLTRP;fUrz%a^ap(7PKg^kq}emoCBi&fQmvnS-=?>`zmw7ruDXU5wT)Y!^8s3l5ls
z{f>tZV0jY_{|!P?BOAywc*TFRH`%(-o24$))m9Pjw(&DfCWJ1{tUIm9@Gau=EBsDR
z2tALt!<~0W99uI~lUr=tg~Lh1FvmJh6TYFrs$aAojUD5#<|Yk6)RbUdpAHG{NC&dh
z3OWqWODOS8oa1!eu}FS)<2MjbumVeBj>&zX>H8nG@Fs;;nN6zMaB2SjuK>2^qJ_As
zVTU2KG;#TKb9l({9?ez!vDHR!oA*VS#Jl$~3FSYpf2So&=eqRBw)+XMlz@gxtx3D^
zE|=)(5$8m^Y(ZH~H%rOzb2{g#P+is(<#-W2SHAOq1&i1OKA`#ZHFKc<5ztVvXPYyv
zY5FxDFKWFA%miI`S4OJzm+3*Ynh9RGAPw@uyLYv#HK#Arl;dB<c^sH=+As7Bu(_^{
zf1T;mwhXcxPVfr3T<Wy`+;%CyQCfgUaJtw_zgFgJ?rQa6u!e5g@dk%0J`(@flO{lF
zychmj*&jzEX6jvYdNOo0D|-59u|Jnr^kgr2e?C3FQa|)`qnT}{U*D4(L<Vl%a<MOa
z2HYA>&y)T1evW4)BMzKM#cqd#Xu@mv6h3cJ+=gl9SSX1e+jS>N5!==hTzDNvDM%(l
zFd`W5z7x5>>UnA&%=e4Pb}rF>rcR1gx4dU-b!<#Ag8deH6Wxmm>m(dwPLKIeY-{xl
ztT=%kI_KJQw7*$}5{B-~xhj<VVZMNSdar8&O%Ts-I}_ouy&(11hBdiuretkL?E9y8
zAisBf#ybU|{nAIjWA%?um20hntCfavo+r+QFGmE_i!GiOHqyB|8N`6x;OtEUal)sI
zLWgB!A^4J_tkjKw-6G}S>qd$RlKz)pv_w1-Y}*}mU5(^y<GW)xtGxYq%xyF1rhr|>
zO<@;19L4)~{#hDM(+&pPZx<O;1ax;~!1+V5Ua;uOwQs~TKN;f#vmxOmOW7r(93<2{
zMd;$@F>(o*UZvBxhiV9|3&+xXdqQ!7_?IK)k^XcpOp7xuuYYerjS>L#@>??hyl^)1
z;~;-;NC-85X4ymvx*g1?zu1`iX+}(+;$sUgR&=%dVVVumPOG)|!S*6Gf=TugHcsV7
zI;chb_Qn{EZJc?ov%x)<A8CdlF*LZ=bd*1!EvLCoaLxuHHs;#nv)Cy^&&~Ow<;^z7
zR;1m|ph>iR7%cMI9qA<-3&aJ~P%%2i7Go1D(I;+*D>B1Zjdyl#4ozqRth?*pg#Smq
zI$uh?IrrITzgsmB9E2jitK8G?e(w(dsSrKHlT0B#=5e?j?5Xd0jHO(7TJ7enbkY4}
ze|DyHGPpUDpL1~U92J-8vlp#V*lvOlp$*wWqo1U?SZ&r|{LR(Wckfk<;j>O(Q`711
zs&aMgE;Lat1ec=0UN-m@i2SJ-FU%DTvi<B8p{s|E@gP6mKtvoiik=uajJZ3uzv#sv
zUV!YSXUyPkX*O7pnhENI5wCl=0?p7WLwPmG`!XUI1ZQ%*B-V}(wiR3mF3`K>If`=p
z-U&1w6pnAZQ8rS4t0oL8=OFlWTKMcM;U+lI$THj671B39j7hOKRI&)9JIQ1z)<PfW
zU>7!Aw|h2noHu7iU3uFQF{I!9ukdQ>So<=A@*LwoqXmAcSRZla-mid}25QSKWIzKy
zYuOa3ZNIK#BuJKx&I2ipxB}yVepYVNjNSo2TSV`aS@|qoS1FabMPIW%WLe|P;&}7-
z+I$rNb1>l|Tt@Y0!oX(@CPG5g;99Sy{!K*j{@V8|RZs?fw{@w+sts@;k%uc4xgC1+
zloSugDc5dt7`>SXU#+w`U-d;^5Sl+t51q5X6){gQs@uP1jC1-1&s4iNZD@T4d;!A{
zWb1e?E&AoVD)L|4{W^x_cJt3seP1FU`Lss!QoB=}BSA2B%@QTgU7RbdQcqW^rmQyY
zyse%r9h7w8JNaLg&{qen`j)46nb1F0O+$#v=5{}@<3!_%BoMKU)4gSr!GemXC&#)x
z#?cZ?V7X;S8_lhtoJDtQ3cVcWv)%61WUM;Ra{mey*N@8e25RXrrZlD4s4nb+7BYqv
zfq~068kqdC@^d}!=9Wvwc21r|aWNOqGwaw6p-Uncb1K?soW{NLY5TXusCi#<xvo|@
zj?c74adWDtDaw_%f|zLqcdr#-l0c*+=DE&oE;tBlHlY-#_(8fnn+%QhYi7-yn8mGu
z952YI^DJ$yRLq#ubeLu!^5@@j7&IIoSlcc(+`k_l5rfK`bJd3{f}1Uz?KgbSfFMzQ
zlk;#mbr`H7&2e>f!r}R%#2v70{=M;6K@|43&KX!gq{Uroq|-3~OizA(xK|}=p13<5
zuD>H8dV0`bb6a=kB4f-ri-&qEBPqS$EkHe7%pk<QzIa{D<^F+ga9mi)uuXtjXOb_o
zZn>Se?zZR2CO!U&A=`6dtR`{JX`0*1Aj-HE!dbbADOtI)G;@nrqCL;T2z~EA$fMml
zRpwjD`|j2*YBes_UFEt<7){TJFM0p>3dx3mU2c`y!2kYS#TY*PQ4aMykL@{8JsAT-
z3Xh{y>SDJZSh2jD*9UN>GI_4%?P=_3-cX6<?n(TX{&DxW*b_vK&Y*U!?bGphd<chT
z8}Gt4f#yn(!sSfL48HrtRPz??IUFvR999#ezB)EBr!Q^o;;SOa^xrSh2@M=t4kV6u
z07Q(2Z+m;=vqzE_?nAM~VAn(aYN5XQ5>#&!@y?XzN`uRXs}~N}8|hmCK)w1r#EA!y
zSE*a!uuc=0obp6$?BgrUMkeX_TxW8(gU98-zrcwKJE&98hS<V?0%^gEZkw&V_&kI!
z8e1OTu|bnR?u2mlW@+*wvNX1Rcu|^(wMqbAwg4(}^*Bo<(WigI+gnVB&?Ls%FOt_%
zuj_kjBxhq!mL^+{y73VoK8wT(I_?PPqC>Yq@NfEv04Sp~S7v@qJB$jXzTJ+)g0lc3
zsHhN%9&df@mxtx7${)0K<r~VZ>Z@>nQq<4I>@vl4Cd-5<#^ZGCdl564>2Xba&in>^
z-Zdu&&AfcMcVcbZ_+QMlL{408>iZp|PuW1Tu&96g{+qw{4X~fDuGr~?gOAE0gZP^D
z)HSn_#x}5%ZRQxZX0%)vsnte&4zcD8&`rw|-jPp*{d^urPL{WNElZ30_x2`3o|L>C
zaeF|!PpjV^`|~4XwB%*}yCx)&e1t?~EZ(7aWTL2Fmfo5UcnR>H*-&wI*|-m}A|9}d
z==Gen7%ZqiIvhLQL9($vEvRz?)#b&>(f*7-y~i7uAD?zB{>%^?)_h_v5DNl&$YwZ;
z_UBPKE4#m<(mN1O$no(rKK?6o@s-(u!U+UatMhDZxW?yKS40M49B^hYYL0g}U6--p
z_-nqKtCjy;`qo+PQdx2d(8m-MwS~UA^IkClK;st+J(+UWHdV*3zj3o5i`LvHdaD<|
zUZA%tf7DaH7^bQB2*9uCr}@u)yG3xa%lb2IvF5=73rw@_19<q##6%!WN$wHIu~k_C
z(q2nyxZVQdR!ffxcU+>RKh^OrKy}+E6)xMTO>o~EG~P0`na}rRe$+&K2pu6T_6otM
z&wCjDtx&V#3-#jm6zX?*11bChWTpD>Wp}QAjh2ZAfI1P<K~$0I%EG-dPQTvHB)q!3
zA$(G?T5XQDEwY}JX1h@*#?Va&T@R!c<e4`EU5VW$h*}dD`TSUA?-)?X_vzRaYZ<NI
z0}T3IJECg4Pdj!tQ>u2b(8RSJibvG%TM3tkO0JBE$lsbBt#ZZV(ul591(U4}jT{}b
zHqleJTnS~?z|Sn&F4d5yy-bub1qQ)jJ2R#3hYsqMPU{vCthT>#(;GQZI~3p3oc}ZD
z{PCaOz&b#%b3U(olTqP%IIlU6v9nnzn~eguWoQXxg|<fR8V~nzw!wW!9_KykG;uL<
zGdh>K#1xsfZP(3uN`0<uBz6O#zP6fEvGOhj0~0|RavC;Vnms1p=-hmTPaj5EVgOCI
zrpSGic%qEX4HjG}RXME7&qqd|Owyk+bvS5?_gmLplZgN=EEU!OA;{?ao%kE`<;5hY
zOW|JappbCVp&q=;h4lZGLX6bFUhZT0DgOcQ-#-?sfvtj6Xc`TgCs%~aLVxZmwqLa(
zn@KX;-N$1jODZ08jZ*wvhjk1I5E|^}K#|$ULB}!o0G)gHe0AW{O_oPn-O754WjZw1
z?<=xv>N%&b_RxDCS$6~E@*_NLcL$#qUsmtGnB`%&tlIu;2C~(ZZNVM<<5D6JzV3g8
zo^F8*GtOx_Dul=WemJ(?pmx5=ORJTG>`Eg0M#=jb1%^RX+=1VOL9n>;C<gIA1})R|
zYr!LbMw>Sc9`jDJF?8dR^k08FOc}w0jw%BXD3;BrCqq?_i6O@~^7Lf4CJi8s>RC;x
zD~O$EDf-U<CVl~k7wPP1k;5fUJgC?8*w4jq`_tIG;XhjY{$#o-tjd#8$I83pndS|D
z+>;Tf$)M`+H0VW)*!uuUv`Wv*2FpNtQ4#htQ!cHb#pVVVsH(HD!T2GvD5GOknYF*2
z2kf_suy{6YtYv#}f2pvY$~T$wkwx#dZvG1hdpL13zUK5~syEVj9<3J*%QHF=&{i-I
zir|+UgggXTw?ul^DGY|F+=;`Q_wPu|{Xrr8v0d*ikYnU(9+~`(UHm6P2;6w`8!;PK
zce?VGVI>wac+h0ZN2PZN`1B#z@}g@Co^;VBGfYt)$KQB~hwVa}ic=pmvKs)PL-$lq
z)t<c)=tdJx>v?+YSZFmNk!;oKM?&2)(;8jXByqayd1?ZKXN+SHGmuyevue`o<_W@*
zd_<Y5{p)TeL6+%FvO0n|w+%Tqr%B#4#mMI%Tq&A+$fndIWaEkxQIsG~@^@X*1d48C
z?C?R80H%Jxt{AaUHb4*hp&BorJm__N0qv^Cup}Yr(<;cr;@rXi6u{M(Cpgck*>4X(
zP^4jqa351k9n%eOaQCA@tLZcrO6E0KYMtDFTOsiQf*OuctkOI1RIoRKjpJK~T$a)I
zw<KHA<2|D6aOJbr2MbwR)x0SS-%kFlMPddZp412Q4RWsh4kys*NQe<@fOIQMwV|r*
z3tGhP!(L|^@3%(`s8h17tRbaXftv``pWBHPnn;VMB`;%DwHaA*XzV(~N0XXCDzlXT
z{O~D5IFlz!li4GtoJV$ZDz@KAc<WrAK;L8(yq91G?UPm10;pqC*!~ZeJ?o(7f@tXc
znEtHh65AuvY^fM4zzw%&i_X{kewkLlNnCu0Z~BvZB}&b>=~F&BYHJnrId7cDy)QUq
zilxp*tR&jK?kFP4Je-I>Jxbp{Sj??D)K5$T;hpOKNR*URkldM|iV=jH%&M(iyfRdI
z!8y|^_T*ZRtVh5XowBaEiq0#dM_2dh%BMakR3?OclbF8|3Ei6$3>5l>S%|nCkYv>o
zB69F!j#D>mW6Qmphiu6cP<iT@&*3XxAvu!dg*~v|>8idQT|rKOpoda79e-a=GN6N{
z5dB#B^UVLK`2d(8=cw7AdCm7`g_l^R7VCokcMJC*#dd%qvD{&K;TXl+@8==+MeH4g
zHt4X07q_soY5GIFsGc9Tn$6yJ=P(;ekBRS}?lp>5%JW%Y!Oq^`d=ZFm&gA|32tN?s
zLXLx~C^ctj=8ZJCf}niIv4zWQeKY2?UQbzNrNereWo5*=k+xdBpKV6F*y845YG!$@
zeZQ_rLoltma1O>X^OrK<d%g_KjEIrffJSz*A_QiVQ7^N<F)c~UYdVRC6e}YYhpb;l
z!Ii}$>(H|bKijhVe3gaPU-x6C@DvHAF7>@CV0Nm?Xh0gz*QLJzqag&(!Jl%<eKLr*
z@)SdojL!HaGg59}r4dL%#U;zkZYHS7daR33YXjUt`nY(9J-1m`-LT7I_6$Juf!iB^
zb}`9)u29|xK>0I~{i%<3@qUwsL@lv`{lpv<ZtaQ=Z?s>Q)e|2q6%V~k4C=nN)_JP`
zIKKVhWoo-kc%ItmTTeg(%ydXzA#rC}`b{K`vx?O=j=W>J_V?W49Y~a6tT_PLhTl)q
zr;T@6$gK7Upw*vJs3|NlujP1mY!!Q5)|e?G$=3x!-Uj$g_~kG=eJobZ_*#v_1?yO=
zy|oEF?s%r%9y}8Z`nyS;d`8nYu;I;><h-yCJurE1-kSAc+xEz2){2L$Ie~Pu;CX|V
zZEIaP9a#*47$+%d+=>sT6^v3%mVSCA@l3!<JiFus-;jIu_=psxF4;_8^-wiHXrr#Z
zJY*=f=mNNZT&K&3kDj1tx!AKmy#T_ZAcw>jUEtc(51=sV&(U4PwyMu%IIaJB0cfdp
zOi^Z#-lthHjNX!EWs?w8XJ>hccUtUSoPYTkB1nPdTEK>iSeeK1fs#0L^hX~K=uBpT
za;93UMYJGzSW9%E5T>KCp>fnG{!Eo{<UJJK!O|rv=zZq*q2`jk#N<9*w(_eXHq%Yc
z?aactHi=ZBFiY5bdYLmPphEC`WK_QVB`tDkJBV{U8KwfYUG%We?3;9~A)(>xaHaoS
z&)3BWu(NwH&CS2(tE)dQ5+JUr7@w(0+<69VF*@v!2!+|beyV6&b)V&E1kk8@SxhVu
z=>lYVD#jF$>l}~ZiDCx9?AZmQZktcK!ISBz8NFOir&|d7_If|{08kju;6Xnu>&S6G
z>K5Oh(m>lmZ2s8oacM1j6Ca%D`?W1rFd9%kT32#AbU<U9J!Ez^93q3>J=$!KR$!~@
znU5$AB2$5@bn+Bgopvd(IkSZJ$lp`tUj#^f7v`AB5{YIKvjrr+AVRF``Eim|fFdZp
zX#b!iN8ccdhg#17;cN<*`$q9aEtLZ+qkfo<Cz(Df3H&4$4r9^g*`OJCq$L)g&pYa5
zm;yEB(Pl|;obRf4WeHf=14y<KD#VKTsS~(Gr^+s~>_=80XFTVe!JdjEJs^pd%%<us
zn4e1y8NK&9z5R^u_ai@~+rp;6kUya?XwV;_g!w&Bj%u&Ijf*{~{$QHyGGO2z`5>qC
zTh^Bd63O06?EqdDy6tZaq$R6_W4<B2Xm##Ku*R2Ym%Q9YcHO5wV8wjSI-pDFzZaZn
z5-XzrH)>`6I)E=C+QEOJwLhTs)N3yazgHAE@+j5RPzze56+Q3En37t`;xMrR^De?b
zBr}(J-@~OGLG)OmNqeBCR98|ej-?cSN6owtp4)jvw$XjUmoi3Dq!_A>QAvUH=+G1|
zT!v;bzy6q$xU1Bi_M~+&w}#H?onbJ4!$Y0Y4(4+OFIuV6^S)|ZR!w8SDcGBKY9Z)#
z{9y4wD{0n#tlUR6?|p%e-No`e$&mSHPyJZ+TiCp?ES($^(9ER)5{8+LV{tHHHwBhR
zhE04Q9w+DGPG*O=M}mD4cxNqTfdf;WghI^Ixtzx&!6UAkl+S*epUjf_fTU}S=4C5Z
zUmR6|aAKtca0TkOgxr;w_uk)dVP#@S+8w@W5_4_OP34B#GU>4&VdX(fTVG<IFUnwh
zz%V3Ls2;d*`m!SMByEb@my8Qq#TJqMJX~QgUXNJ~Z5f224at%e$)4Nk@vbyEp+5;V
zg~E1L-FC)t8ex!uguJT$g`L0L4DSj6uaI=KV7vT}%H;KEl<oT~%<BxBCn^u3d>;2m
z#8H%9598zD4$sOdZ_@sv)*PKD*~^31MjT28idV;KYABztnfE69p4gRCLa$SGn`>uj
zu#7Ew0+9QKMe)P(mv_ogy@#N&tnlm!UIH88#^X8|cczV4xj!x*f{48#<djFLqcGd?
zD+^gUFe(YXY8r~;u^<mGs-bv<*k@SqaUSJoikZSKD3E9N;Fy>?_5g5;1Q+*yRZDuD
z3rsD5&6m)!=GcKcc5|%PBMU%NZ-j8nm7N+A#3RXFiJVI;DxMoZ8AlH_0*;{pzPj+8
z3P(H71^yd7RO~<GSp2u5b{G5El4GDKvrduVaUE$!y3L=UvZ34cyyHv#RxY|<=_<T~
z(ptq6Ie2{ls<?1GBLKqT`i*J@_<dVpz56BV(GrN!x15(gwlz8MP$P`gWd+(6e|jn*
z9^#(}T7CV5?*4ePSgsFtBOnVIvSR3ad;1H#`KfR3uMC@cyLF4#e-Z*@2B6WD+N+=K
z`bRk;xHtgYn+xxEKip}BJ!Hw53qP>NPQ9_QMrEx75k~;KEk~mt@CaFHZN~V?LJ3)m
zaLqyw6+3$dm;XpXox!+#szH3ly~#LMx$g(7Dt&DVha@AhsA}~ti4|SP5pq-$@=Wzc
z(*yV8BGXhzo}DI_*+@AylFRJF>M%Cpi<fSBV^i3JC!k6BLY}wP?1n9XQejAY)1E}v
zqKoFT-rcYc#MfS-jURWE{HgeDQdH~X6JWwnhdKM7q87V}XhQL?w}z^nPji|iQPYOR
zM!x(no)48`U{d@g%_5kNG%<{X`%f5q6@aRGeC@^qBD-6cI&N3M)L!4jq$HV`_L$g4
z*zTjVc3<OApPAL~2Y;VldbGtaXf<A|rEP$D7jHpM)&hS&0eVzU)b#~kyY|J=_TogS
z9ma`bSH+jbUBu+zL__;gtl&u4?dd<b{&wdftBZR=Ko69T7d`NxY_yeYmv(O`K@as2
zXC3<1qittbf8UT!faV^T7%KmT@9=dfL51iNs1I9no!@tn0CgI#fClbMS>MI$_ni=~
z9Qi+f*>nQIWF&K2x<kYuYqAIsW?5;;3332TIX;!$-)t9c1?_v4JXcg>&B-=fIJ4`}
zRPK)?-wwX4z=8W5&+CZN*+a9Z)gul!Lqt#3kiK(qjyExW2E<vVG>w+g0c_QLdF~;&
zL#CGZZzo7wRHxs9BrTNZ<4dT?u|thXO16@skqn52zAa-#o`5!XrffEjlXuLmaCQD8
zq`_`1qbU^kNs1Q5k__pCH-Ls=0lRHoHOe$wzS$;NH5;k0$J(@x4M9@00+dy`KNGE&
zr<k2@f$qao`1ocP8GneEe)s{8Ntph^{4jjrX}~`8Z22YO^sO>7P2)@9w&&5wyBHt4
zc{f`1Si`iPev_2AWT<~VH`$TPR;NtW!o7LA@^pmw56jnO%H6`6wEAJs$vD(&8FeMS
z86oZ4OFtghKF+3{gr=59lzJ$YpL5e`n=KE*)n}qdz;UmX6A%5?ic`b^2|%U7fA^mx
zfC3?)IxU|Ov$E_hP%c7*dv+dY)BQq<As}?v>ng64ED0=nm3$lD$|wp199JyvVbs%u
zZDqx>lZ#{*1&T~XY-bd6Q$H%$hg8-al=xQq%p~|K9VcmrseU^bs6&}olve{p;XS3V
z+~<<}OQixRq;I`tm)!M5I;I+$epMm%_mE_%ln$Yb4OV9qPGhl3iG}ZH0#dZv7ByB~
zrTXAOO(I4ccRzT7s;8rj6kPznc`Ktyq2MqR5XX87)o{PjJ1AmGY*k2{>yv1nC&tjW
zLsSl1&5;d`q9`47ojn>%t14@-4DN*NcXGe9FiL58QDb}Ha1`+nMLe<Y1cOVQb?8vZ
zVrv!7>B#(M<T7J1koWbWc5%zRbDV_-h9{W^fjMem*Xw1h3ei0q$NB;)^T6~&TkWfr
zS5R`;o|4Q&JW$tg#^q*X(&&?(2F2b{jz?P_n$4L7#?5KVb}m<9?fagF2IsC?p{6mI
z08G;4L0|``)|k5mB8h{1P4x}<LIdw!vO5I%*Z+DrVs}Bd3mq7^_zzIWBt(d^?5Xke
z93xhH@Fk(fL|$S!j|?hjY})yl*bU>e?g<)mL0Wl&NC+{-4Hl6H30I5d{U{;@?B>P=
ztF{;Wqk*Q?VRlelzmo83u75MC#s?}MAcqlR2QqnX(<=(gZ<knT7Wbe!VGlH4Hs)aL
zZi^y%@m<sSEoH#Rv=8yXQ<*#;<X0XJTYV>)0m==LP)oSJhZzv@DG#`N#qG19lqrS;
z>pkkqBo_T?P$hAeN(%=AAg=6=QA5Uj{7!N{>v<rqlSKi0O5EzLEhIcwPk5>4!^>O^
z?X%}E<<TkFYN$x$CboHkD{#1y410+Dp%{2e($8tvg+`Jkz9)%oiAa0@!QeXiM#rX1
zdleo9CI<_^@(_{1K$@|1YHup_A9K(3`o%vR^Kr`#`dQa~yH`nFRS*k9V3rD&0R>no
zuV^6Q!MZ9w(%u}-<`@+{CM^&mVvgv{0OXJ$I_+5ml6_u8g&NB)NucOUv<oU6dj<Id
zdO8zo13fM~&*~#)^rd~t4kEr?h6L(VZU19}IXgFL-?D2wFNf!_R)IJlE^AiEXSlO`
z`@>_)suEBSW(3KF)d7pHYkuQ?3-)d?!Z-&gV*Up!39FGH<hg-5H=-1ueh{4L3}Y7s
ztl**@+2^i&AL;V5A{+Do4rpTi@eJYr<zC+l<ISqG*k|{DXJYV4%$vwmRh3xRLo#6Q
ziqM*vcMHp^9C@5vYCUM3vmG8Fhl_*8JWofm;7JS820kL<EU`VvrKShDMX|W7;3V60
z0l2CRv784>T+#a+6wCng)Km$115uHfp#RxiRgu6ap<bQk9cJWXGZE^mbuU5_q7kUd
zCo>TTr?BZp$j#PWSH5(Kn(Xl+x$jdl)I}<BadO0iwZgvVsTAImrkP3gYFG<3#O0wM
zht)kvnYfYtpvvq;>0lZ&GKK*242WJwA<v+5(rZ_^xa3zFUwj`jgRPN&LAQl8V|96f
zq0$06&ac~R!O8Dc3uZACq2yOvMD5z=kUH`8E;)1iA3O&z3j4Y#E3=j24V6RgfH|81
z%B1p7nZ)ksm*qd_Zv;@VY#?92))oYM+GKCVCmfD$Yx$Z9LQLSD3ahr^wZ6-^?c+M`
z*DnM$PHKkYA`Kp#_zKscT@lfpoQ#U4GOK3pjA~xexA-BCK0+eN;xx4Go6a34K;5k`
zBSt!RQxa-XC!1z*2S=Bn+RU%By|BxrC}uO_nbZg4$?Yh2&z|j+LIFffE@h*{pM2%J
z4|v$Qg|f~5=c6Pg(1ye_1$51OY5b7z{=_u-NnPm?Dmlq_h20Qv#H0MZKHu%FpCVLX
z8edAA&EU(@H5*r`4(iJ>$^#x)+8L^zQOO#r{@igy)noK+vNlzPN7#ak2gYuro2E*b
zxkM-nt7I60?D8fxa#$Tf9)6ANF%c#&vAzNDd}(AWaxJSsdU=Fo3bDmQEH?qm`~Juq
zv<?FX-sxLeQAWh=uP@pU<~&7ck-aY<@r9G^$rd#rD;&^WyC%Q*-g<>JClxFY@1N2Y
zhqZxnZrS70U_2*K;kdejUB?0GF}tEvgG{|P!+x=^Z_;<LmTf$FHhw0|a~KF)FXlQD
z+tPvFeN)7(RGqrsh|w8nor)jtKlu=+AJ+5%L2MJdw6{JP>l42pl5W1#ogUIJ-)fv;
zpA55R|3Exvy9<0OJRoJIY?`<i+?f~Y=DfRNKs>qci~AKPQ!vRqoZLR>OH3&ctF&qY
zaa9tSzni|Si@dvV_!{}T;imK@?W$HRB8&9^$TN8ww(2U#us|Pa0YVX=Mee|^mbnG<
zReAl!rVLjrz#|-x<7fEudj3J8yw@hgs3rSK)P%AS#DjioRNIYMaHVonDH#%wz@uDj
zh`~00zanaZmr91x8gg*~3e=Pc(!67}5dnfm68K;PcV3cJNb!}z_t{Vhac|f$(i3o?
zN`>3R=S*jb*(y7-zP<M}&Cdtyqf(>epsYACB<U9o$n(6mGcD`FMgwB9I<|$a&8t{-
z`%t4Wb<#}~5WD(OOu>n}MLv$jp*5|aU|oQRX?Emg5y?9EgtRZagpG<8aWB8gXO*zq
z&O^bbx75=l<6RfPTK>a<Z*>*(JoTA)1cfm4II|!OZ91XTD^5zlJha2=dBsIE%e@P*
z=!KK)s-unj-!HQU%!_YcMxD)Yum|<Tp=>F(I5dk;C#U^c&2^@^w7l2fNQx7!Ou&}V
z(d6?rS*U=xn+Kpjwf~gpj+ka`Ga%lZ8_e2y8A+GZYs$su9@4R?-)sKu($K8xFD)0*
zOqZuLl4JYu$Y$;VVfzKREg<sz1+dw%so#n*YLr~RF6|B5G_&A6Wo_9;lDZG6|FQ-U
z)xiL1qM>y8pHB0sI4_(#AfAM)ulJvR)$i4Ux(vX+-HndNP-i4b>3qjx{3&#M9d19;
zp)i0~Nd(kKqWn7mb3Mz$hIy?H<i?IbB)iL0sLY2I%KMczKW_Q&6+iEKKCv`d=qTVX
zL4y=hNM+G1kjrY4jClW%<O`s$W`NTNI;@DW{`F<fDEi|`jyyf){B<rK%TbFR&=<z?
zsCLNTqN_=iT4lObb=25>yK?2?(6209R!a=OZTiR<umU9anOI-)<+Q7O%pldSDy;m<
z&&S6_{~QVS1wZ%|+5PE;uQRDacVi4-xQwa<*!y~JA3_o6F|$}&;+j{Spi7+JP3N;B
zn2fakpANa_X!UR90Q7Ko<gvKd(dyYD9eaHYOhspP5is_hF|I3RdV<&l>!0_+Bf|Au
ze)0&PoQQf`R_`}tEhO(+478JC2jLa1rGD8`;pSHCIs12mlXtSAszZLGlI6P=-eE_L
zosNu%57z7bj!*<7@J`^3nf;7~=YX|I4tf?|vv_C*$Wiy*&8+j0ju*mLKNr8v%#QIn
zIZ!nItvOElf!j9h6dQ5xpVy7Y`{ekL8auy1jWJyZY);zpQO*U8Z32X_6)jXL1O0e-
z5(v^9c%4wl;snKp!m0qiuGSsoyBQtsyS?TGSmq{IH$c%F@HKTL83K&>aYewa>u&N6
zlJ!o;K`1WQ9r%?5FDFApSniiUBr1;)T;;K@p@b(&g&}=_iITIj{!sC9_&df^;F@#H
ze4XqdiMc<?3y+mYVAN=#F&U`0=&36>GgpKGsyi(Rt+_b1>F-+f9hBr{h?)<ww{%!C
zT-PB|)%@X*Z0d7gh34A2{Yf3Z{ztC}MO@exg|ib)o`3S)d|WFt5zW~y#AzqS6j?#h
zbu2*vnp$q4;RG&EZ=B?u8LoSK{Rt5NV~QO|WWz$pbP9bF;q$ZLP#lCG4&9uDSQ=;X
zgc~$~JUlzx)#ae0U7OAT+Z(_n-pi3|Zvp~d6ksL?S#85>*nk}`<2Yfl<B&agwnCv^
zs`clz^9t{7LINhvl@)z)l4!5ac;<jApZ2e(A+`X(i5l+|(LW2C5^+X$AkJ!^1S`}b
zVIzJ8!K@NS9iG@ng$e$Ulc=E!+)~6sl@#IE14=PowhNi*=U?V7Frs+F-ac~PFv5EW
z7c+rF<^_;cZ!J2AQ!`!VHu`ZD@6%fi-vjejHS=Am0k)+wjUwxH*WD>|dP(Jp52&Et
znKmKj*vl+5BH%hmhdKB7HS)l%C9@mlKQS2$^~|pVyC@g1jS>xO>_#a(fTdp-csbQo
zy$m2LFKGM@lw^t^97ni*ex>FiMdB!E&>)gD)5Q;CIz^A<>P@nMN$$pAr0<!Fn`CXI
zsziT7dwk!n%6v9M>o|OEp2Gv^XRA-C?w_MHk@}F}rJVF65jOKUlj{Ce;VV5kvhljW
zu>JgQ@{!<l_QOx7mM%&QH%+K9!^u0!^O&wQhD=;Dq$&qf+67E%L-2i95rRm}Mqk7u
z!x;20r_p&MAmEm0r2ZWH+~)k@8ZVqrN*t-(%Y>MW67{Uk1TKOUPG5Ju2Z4ABGzwT4
zr3B&~x*Wq^T^9IJUX!CA|0=V{6LAB+8&+MxbOsA-b&<rZrL|iNz0C(v=04CckpG7T
z`V&aVY7eG9Dys>c7yO+=jWj?@`J#%S`mg7XN2kD_h)N{u;kf^lIK`)wsNX9DH?=k2
z&6uqYm|)Q=--ra?9$;OI1S(-`w0}1Ft72zhuR1$KUGFRo9bg(D<tmy>mS>$u241!m
zhRQJC4YSC=e8lXSs^$VIM`?l%VqetZ4tU~Kt;g#1WC*4FAV2^N5YyB@B7>^%F_=!}
zs*p80qZ?H!E)TgfJ-zoHM8=qQ#1^$jBK45l#)*5=CuSw`RJi?}r_G_jDv(YVp;l@F
zG~m%g(ZLS)8}M6FiA*d|Mrz>5h%U*qARbM@Ehu_FLEI$fEK<ppnghIq{~*hJu{Z%e
zqO|i+g5@&$vd4AR0r9VN+C-fBH53c4Fsx>RSQm9M#z8BRy^QUsG53!g3#XUY06adY
z;iG;p3P?eMIo;%Clb~@Yl8)yD{F!nY@*|;h@jK<8n6T8%dexx<z{PT}k(GNOrqn9F
zGD#Y=p?Wcq<PS7n<J<wUB0Yi0_`c0~f8}$m>a2FU67Aw<v%`{+x9*V05$#`XW5W13
zqCK)0VUyqXDZH9fAn^>yFln~4ZPm7N=a`@_1vv!tXw5_k-y-Fb|8kjB&#G1=J*K{e
zs3`^H)7jaSrpvSOo*uZ_?CYjqTQvxz#39(sAAYL9^1x}*7L%gh$+ojCz@#`$O7uP-
z1HG4Ze!7d}K^yc22A3lC3hf5Kcp6A5#jVfxD=_+*K3?dICT|}q=f+sa(RoPvk>BVQ
z7yZYy)I1hzIP3t73*vjEgSo%os@yIC+4YHILw?doh1IyKir+-MlH<|w;my<t+5_I<
zMM#fD81IKx<=#pQZ?5ou9;Wf+WAaN@&TNOVi<O&goT$z@4!CtD;G5Dw<mp}QwmgSb
zvAej3XQwXRV6QcnXGR-lRj1CzNiYV^5keg@(5cO?vhhzS_1&zuKHfC+INsJ&uXfw7
zHJn1D!@GXcIB4`nfZhc8iq}PS-c_w<9du9;z^39ms=HgNcSl$z=~DQvE#_7d)8@vI
zOg7s#je;>35bb_M|E+_g21(Lp(*F79e`ZPCxgwRscUM6NcNh5#Ha9`KrW)|T_dds?
z9;kf4v4><>D4bd5`*i;bkc`=9<6K|fPg{XFerD6Bb7F;%zaxL}Idp$OiPR`otIqb~
zIVQ#ACOQwqTlk>Fgjd2fxOcq+?fxHRW2PbXtMeorc()L`@Y89|;nO}=$=(-DpIIup
z(@^))p!^=mfK-`@G>Nn3ck4pK4dlqVeEiVXn%VO`=N~j_6`6+iAN=G&mb|1L@KYe3
z`swF`fNDWV5S^XBTzU-5lPT<Us`{%>4$N~~^<dvA*5sxK40aF+^5{w~Am)@#dNK<7
zVTR}V<19+g5%#3XhVgU><Gqc^Fk$iUt0xM(!_e;aUW{rl3Hnw%ALjEh$RK@9${=XD
zO=9ge?mkC+R;$(e`u)Q_1((E_1gl(y=1ywM%Fc``jh=kx%@?_(IKWHXp}if}_nCLZ
z?xXY3dgIyl==pbY;vZG9!iW(C`?<l+1o2n!E8qVz0GuJbc@;=91V&1~9~B<W8x?v{
z3<E1OHDA4j)>RNacB`W!muUFdmxqy83t4QkP$3k%Tsk<y);QrIZT6PBJcwgY+#+b9
ze%8Tb+9uSje>gB82vzuuz=>~;z<ZwnI>IRX>SiOo=Q+AYp76S{63embWGSr<5eyRV
zSnmHWjbjBZ=Vz25=xdR|sa-FZ22J9*p+tkyqr18VX!KX1n#mAa!M-BPu~7$?)zxBo
zA?SsSCgqcSFp0dck)vArV(Lez5_;<Bei@K=R!@|SQ_8J!ZA~O=O}bL#(ty85kg3N-
zxi|Dky2HNWoEur!02pjgOLW9(ii;tT8`SVM>*sbjS#MG75;0YII?&<9?SyKAw#wib
zcxJ8PWAu9FDgqc_2dB#l!KlGlAF`Oo3i=iHFk18s)%iq|_BdwE%6G{9w$u+}fR}ro
zjUk(HQ6KE1_+p>^CJk!uGJ@Vk!fcK8|6-55oj}5>oBW~rr<doCROTH+t}-Vc_Q<CJ
z&^L+z*g&_abO}W=k5Gaw^-m($4P*{kYL0<STt(ZfqT_K5KYvZl5}4{*`da?i7q#n+
z+bL8;DuQ9)zO*raO1|okbOM2j&Ujk*f+l=s<c@e)0x6?veJaSX^)_vl7dn6C#}T}B
zuwiBFaO)*yIkhsAqP#aZt`_##R+arp!2CV=Pis|yct@cf;!R$5y#@LMS)ZEegVj9Y
zF*>(3aa)k=Dk_DlhX!E8WpqNO>toY5GArY+XK!1eaXvGND&{A75!v$c9m3<JYjPUv
zP4%!A9!z<fNgcSrZjOSpF}5u&k(}RYcpel3awvPgKR13Kgg@RX95mX9a?NCyUjJ23
zY!O_n5o?VQD?*GELa2|uj;f<G3c`He{gm!F&RImf&(kTh3Dv*uUfl=Ul#P?Ow0Lqf
z`I+fJB8;c-MJsfe<@%~gzjIc31WfvX1acEW+jcJ@s$?T&aewggUqysr6Qn8*%eI^U
zkY2pMkgD9f3}Nu`qlkY1&rnI&EN1aCw?Urimv9pB_bAJ@l2KVCVzDv%s<q6@1;6#P
zFG?2822`@>woKPX4et9xJ{c69y*YO>a8GcIQM2cY-${zws`~l@=SfFdl$V0j155LG
zIPJ@-ZH=)$QRk3jFxL`^CnsFWX%8O^Y~oeq-c*Za+(ulH+{gPf!2BpDaK#sAfqwq&
zJVIn|{Os&_G;TBViGBx>D_bVU8&h!?`L(R_49ZLkxT{h577l-U0mhyHpf|6JNn_@^
zPDZ6}&X1)G5P%=g`cc1pQ+}v)7z;cWdJ6Fwcf!0vkk^<$Ju?rQ;w^b(*x>Z_z0sWv
zpAR^_Oqg5&NY?9FII>>$K@Ze}yU_{jpoM!{^43<#nnu9^WgPLaCw2DFi{Pvx@T&jf
zL(CcC?U2&TQAg4`<R2{%#^<llS!_KeeeuH{5QW<#QtkOh?Ntd0331^l;hXk&W|}y}
zZ(k8A5H50v;Ad?q1qL_#v=*V`({j5^b=Ow**`V_56?b>hHR`ubpP^66gm{x9q+$jC
zHZ}GbD9+bggR1^{za;J?NUZ}S;03)97$FT9a)G$CvJ%Aqmy@3@d(-aX;*|cSmw!y-
zxOe=3L+D$@S@?Z0LZ;Dt$o0B)iP9p*ivsO<G8lebYoC$rzpPgSw31f5BgExL?Xr%x
z;XPvC$B|+^&qCT#>-C(05&hA8rY^GItnnu2iwMwm_Y1gJF}49C+M)wajJP>UTg6Le
zqB0Nt0sg42&yR8N+$)KPfF_OYfB9flB|rqBxlWC$NMVrsG=Up^*pi>F98bp&zs5b@
zoh);Ot!1KICHB;1E3VhI-|`%y8;Ys>MY=Xv>0CGnd}H6P`s4a2t}4(x{20%f$jb1{
zaO4mrOu^Mdmj{$m>)5V&zUz!iTo}9ST=>zX&Fy}DL0UG`JKE&6z>@hL%qNNVIB(Hu
z7*p0vP)5Mts2eENT*LZKgCh*87G%*WW$&>+hyG#}B4UV6oT+S_8!jsZWR}x74V*>k
zd}h(DfXUnFTP(46)<><J>CIe)7kiSpY01w{mc9u7U3i{d0t4=Zf8|E}g-m!ujF+3F
z$#44j!L;VW7a;r=EO4%p1Q>B<+c6NixbQ00%}38)Z>#J}60p_~g;Yo=blk;eq}K}%
z%?%c&<6UAF(W%?3ti)O#*CT+Gsp|SJ?|WYZa~lsYCL?8mtoR+|Ck|Ss7v7c#5kxF@
zS^Hp=A++iUZHsnj_*z6;;%=x9tA}?Ca1M@`*GvlN5V8Ya6-wUMNdvSHmwx7@qF4+I
z^N&v!ceS@le68?a&fH_I0en*~aDo0{Ykm@2ls^&>^&J?Omk<qIp5KF+2<Wj~-NHJ6
zPM0O1uQEY+GiJCC;m1FK|7y{dAGFBsT~^WG7|Tz6q(?so8B|#eEmo_@_V*P(L^a<Y
z+V4Kz#alAW^G&HS@FW;dc}<tF4zRW{f!T)&W@VYLJnnRdL3q1>5U@xkwk5akB-BQa
zIg$>|CO6MMK6$&qDc9St18ZSw$)s@I9kvbW-iGa^ocXk<!hLr{p*Yc`DJ!=+s&70F
z4IFrO>==0pF48gD9u38>cf)5ZM3T4Ml#3o0wZ5wP7cJ6i<z_Ht;6*A`3X3l5M4;DW
zR3Bs|BsSfEUasTE4*~;eV7B5}{H_l0j@7YdTlr8NUO&X>e#k~3b^5n}(?0?)E6-a+
znz~YOg#vml3dqoct=y~9nQ#WRjPJ!-7!<oWI^vt6R!v?B!sOymlG)+TvY>=86#bZF
z`)<y6>_QtLAD4z<GkA=QDNk);ew$d#UOubiW9J-%aQ^8KZ~91sr9nsfQ_~gakfBrs
z%47#w4@1zU3$n!FFlKXn0QO({vhi_O2ji3`g+h4P+(hbV)?enwAQD&}CK;mEH3>Sw
zMk->SH}a?wilAP7;!_@gibSRRoGl*r$2CGA1V^waCcYF`ep9AT2@Fa)?U!N|z6w{B
z<bn~{tQv|XQDDkV<}oVc(Gd~d6g}KN$pYg1m+%7+c{hC>w;wu?RJZ-t5W}>y&~}e{
z8ZZJ>H>%buw3o1dr?|YH<zZs3kf!nxIbT*R86#ki*^O<Tj!a%uDB?u${oJ<i*p2l*
zdtt>inrbSAVM2~EcIvS+m|$~lZ?)5ymFKyJ_@y4fGRJv8Uw?G=bKENz4<O+3+K&xN
zRIxTbr~d0^dOHCXyl(Ml=RdP*%=M0>{A}H}QtQb|F9q={L$@`bd*m3L8`{PX7%*MJ
zk7Nz<Y@dvNdCJM@yfLLqQS5`PxqDd%ObD7&lcvk@S)}+@B!`IL44O}C-oms^Z=r#2
z!f})_heTJqeM&eU<DRhq2YBfAlkfK|5~bu2FWwo>iI{gJh(B|Bf?)8$A%Nw)iRoNo
zy&Q|z0V3orhyGC6CS5o7iRP#~93j?!n(KMooWUVIU=3z0sIhbfLF^NEd>_HN85aPo
zl6bcE99SI%5ljbQ>;;W}n*`CUY5-Q`jo8mHJ?5U|1qxubw~4AgBHvOhV(aGr2Dc!l
zR=PyJ)>+7Cx&0vz-0C_0<)ZE{6vYl<fIa8IPe-ygsz$8jeZO4jH&>)`0%l30!MOW~
z0OinmlwnGTPfn|`scV0SPxCrQ*%J_MK|UM?{wwCb=46-pT=#)Z(H_;CLZR+*f3YeB
z#duH&&yx{1>A(7J2D4y|xlP%a=YBu)!)KARI{rDz*$-a%#AEh8e1U}dNFT6*1P?<@
zv?i>B#N&jn|0cA#=|PDYm!IGXUr>tjmv90Jv=aLyIoP#B`~rVXAM{l)6?YRbkbe&o
zonGELZrI*r_LP$IH4$`JniIN!Q0OvAmr#V-;b&Iy?n*sw2KSeozk`4SKiIN46Lak^
zEQMEne!#o|@O*k2vW2loz)(Sii=Hhby~T+60i|j-$82t`TqDvi@db0Il=XwT_mi4n
zrH3D8g{@-+>;oyZUindA8bavlU8+?bO?xszRf#Y6=L1|z#ey?4kPQ%wqym;OhaBXG
zh>sHDz!qgO#f+gH0*+M5J)9$WF~B8io`)O!8>^)T-_NtXe*eYY5966p)rTKJ19vqD
z2{C)5c?!2S2BR=-0ig!>-G;3jotH(woj;3r0QdpV{tTQMeF$uO)Un+>76^ON!>p*+
zFz-|j(CRJLLT~@Xjo~G4X~5h1jrt=k5T+{R^6x0wUV^Wf71k%~3YCgT?AMAXC?DGl
z22%s|#1G4Z*$L{|3Fukknp40F00h%Wfh%Z13m{+n@=H}00J<EnK&ue{0B*}jU?drG
zq4T}86%*2RAiZi2Ax4Nda1SJ3ocFNK9(YmZMK~a4!0o<M_h?ElM~8paL;k4Sniuft
z@a?AU%~2l8DrNF7e8Ad|juto`xz~|cwul2pH^jVa?3gTkOMb_(zkLeKzB-RlZh7?X
z-gft3rb+5khTg(eU!N8&!7OBwKOIdp#2`Lr7Z^ULcmE9Qd1=mKYX0_fixlFleeR3f
zL@dPpJE7$zn;JLNrr8hyJ&wb0e+Bavc>CACwV4(8+4!8lTWi5ccc~Y~b^s}hSJ$Pp
z(+dokOj9(7StW@@@$;Si&&6tfDDm_xh}AS{9+D!GR3LGA0d7bxcr-;#--Y7O6j9+6
zab!I|dWgC)w^n0{*wggw%-8k5UV&Wd;)e^Hf_DJ`Rjuzfm^~7rMv3e6MgybcC{x|F
zS-dBrGC@A#-FfRcQR{~z?Z|QX+?U61NfVV5choP?IX#;@PFhKTk%=SFq>c8OLW#cn
zfQdSHURY=LCYh0{l@#%av%e2DlB~U_gHvYWW=)>lfYi%=9zFv?k)OD8W#`@C;>N$~
z-nM1?OyxXFfXVHXxa-|@ReoJIYacSti6F1Fqn{#Sn-<Q+_0C5oFr*P?VZM+#Fxjo)
ziTq=JS_eIY;{L@@`DdIhWxsGdIej{B_uT+<IO_O1A<-*dL5e)wD;RcV{`jK>6N62U
zs?0r-<c8_+Q$A=n3?6O4G@gOt$w4!1(@Kl;BNEoBF;*#78WHcFyfvH`3?Ssf2q6IS
zD0*K6Oin^zQnJtR&|O*89_IV_Rh)g_t5eIfz8|uH8ePu+H&aa=1GwGiA7B0r*zioj
zRDl<T6N3M?X~uLXpe71_arA$AI$0DE?^%F>C%zAXAu=GZF!83H51R;;MhzEBQOkcw
zMF*v<va^<{?_aLv5ra2XBtH`aQb-W;u2~|IT<RAc>qJH!KNuceVi^V}|8UwYqc<$u
zCWwN9Ttk+}F2u}RkJ#)W{WKeO7IPJJ<jkuga4ZCJwt9btI+@}LlyfdF(6EyQCKfN^
zt_Y;6m<Qr}u_oYc0cqgX0;=f~mGQXG%qepmWIKgeTh-}A_GJa1HgoOQh<j<EC0*fr
zy+VTUB%)@wE8965VtSZx)kl9#b;$hI=|Wa+aq;9&R|_!w!S`&nYIi60vw5w|a$XO`
zgHtQ#xP`%#UG;)@C~-YJ)uf7m)?9<rwopJ79Ta`tgGEoA*i_OM>`>O}v*I-EXCPz<
zEQUQ0jHKVMqAXa)={Ze6z<V<mwbIj?cn|0(xVDBbmpfu}73!s{9N{dmjQUhUUTBE%
z^U~o)Rl?@tJ|E1K6DZ*6c5k5;3x`<DX787$-MlT7J9l%)Zy;1<<L7_+VF+MT$ztX(
zFsVqV#}M)R@xA8<wwuVp8O486avIP%%`xOBY)}w2uRuv-0quj{nyjiPJP(;Nf_w?j
z78e^7W3FBdDaEwEUdOVS`tyIL$ihB@11&sx{9}%`xDOoZg2>eyDN_?GK+zk_#CvM%
zDyTe&0Yqyg5LQhaUm}Hia~CKdYF%w6bcfkqDa_R?!H;<WMtPC6fAZx}acg1ZBSXJM
z*3e>GbbIP6^F;N;Yc+`wJKv=v80K{>es0v!jQQj4jh{HAuUQqTgUH+E?cpeV1t((6
zM}O*9UEQ~yY;X?1{q$3)l?h`cOOvDEP%*yb`_ab_xzm%a&C3-ED}x|*R2F%#Rth|1
zA_(J;isy<fD0jdUrh?EBH>*tFMeueX-ih|FzM|tWXq>NX4J_JtPPcCovmNH+>+C=J
z9p#@o+X*K9jD@8;gyRODlE2&<Ttad;x~RpI^hjgCmX#{9-X$T&eQ#ojGW%SdwY8>4
z^t}!GH!ET^z2WYnYO?uYSa1ehd!B<M5<wjt|3U=RZ$5I07k*2T#Yp{PIWJ&RGg9yT
zeyIn8EuNn+X^s3{c)=&{FJvl$36-}XHF6CjHn+(yv9QkfcVwPy@P+Z)E#rDky+`j|
z8mC&D2Yy}r2)J6L#+*CCOnTN1?MiWQuEB|T2D#MlQvusxIx~(t_?e3IpGk30n`{gT
z;Se5k-y-&`;kk!nUR_M>dkIE^-fSPui$q9hVb&5+*FA`1GF!wG#aJF=$|cJEVAf@R
zAIL~m$mtOgay;<23KQX_ei+V7`?vbzl4xVH^UZ?AmSZoKgty_A;v%$?#5nL~nwrf+
zW!pCg0t@arwcZ%lX<E>a&v&O&BQj-q;GL4aHb)I_uH9(?ohf~!GV!R#2|y2-{jANF
zOk#-|RuFEu`*bVs*p|dOUvx&&r?|2zm+hu<1$Blew=TG|$@QzeAGjx!wyXW!kEH6I
zwaX5sK6<gE5T4DbD^cbWq6gmZljDCG8#{wHRlVK`JUFSCPm68=Ekn%Kp-#Cr<HD#M
zTpZpj=FU2VbOrY1HrL|wB#<MrtwIv4U$04Mb5+=qmH}pNaOT_CZ{BDauzRKa`Q-o5
zj8se^MK_$ljIDNV`?Tgr->bS(Z;E?lbh6XojjCzMC>da7@}h)W(<1XUr7jN1?ku_v
ziC7=&qNTc2oog*U4pOUhiW2TM19uzyd|EL~;B9PAmvoI^i0u&=VApf1Tc!JRV0RJA
z(RIY>&L$09<G{R)FSHfs*plejOhpmgsu>L$*7usqu<ws2{OSF0h}=xVC;x}N_YTLh
z|NqBxMYmOvoidUU5eeB>R3tMqqNvD@gzT$Al+lpN-g{+b%S?-G3RziM+57uEyDRlB
z=Y4-azu)os9lzsv|8pPLo$ET!*Ld#ncqWqGwY!A70^PqES$3}+KZMT(V0ayT`x$dM
z4@U4DuB}Q?qfrs%yXKkF8Zk_1q7-P?+Wsm&<qc$DoexYQJwMu%O0zDi6V@;MnN_?B
zS12hP9rw1Sa~qDF4kx}cL@Lh~6d2s(8wW-7vEUZ$({;@mPc{*B@S&Q1=@Fr!4i+#!
z0MICn*hFTzk%%&@hnqs~c*?>djdK&lfU?g1-Kypzk)+O&1rTnL9^cs*s#4pB6u?eT
z9hly`ntMaSZ@NQ9U)otF@l?!9#V`ev%GlxCM|zRYFwK20F2)}lWDPvCW8a-5vaGY0
zn)^SPAt)Gsr`yT&AjfCVKl`hhLLA_Bot;>_)q}czO4A^H^!`|hD|Dml#dz4{;?-Y`
zC-!N-9s;ET5eH6=URKku5;Pm7G+T;Wv<o>)CDG`>*&uynr@Kp|^}g}678#$N{qJ|M
z^HoE(s|h;l4|A(B09;BQ#5nFV=*(5HhdCKQpB`5lr%I0R(PN%aOnhy;TmG+2ln%<z
zqR&_R-c7>ao7uFd1hZS;Q@c67{Z!|Yn8``@z$>Quhua*7gRi{h13y*Xfv~07i)X+R
za3pB3$|O(KJj%rLJ<HS-8<w=yAAZ@!9i7y1udD7FMCB<Dt7o55mF2d)e2S`RdOrsQ
zsyG9UDKQO^4o_dIrbavkV1R1@?;<rS39sOZC;>9h8z_6BFw70^vIUi`*Axta!Dv?O
zrOl_(WyS!eCEmP2YjqWHA)WyHh&?>dDk}Sc(>^ga9;>`sdf~wd&gfad=yEXMdNP2M
zs-H}BR;5(2>tY$H&a)xNxXp$}C2S!qf4wgw#f<V&%N%*f=A4J-IdbP~K83fOs#O!d
zNk<%-ut@%I<l$9<_XldJ3YBBR%yyel@j>r<tLuqW0xR|GcbKAfwxheXc{M7%!iQ;0
z-c3LVT}J#^@)A<vtn*A>GblW~hD~Qbv58E?#C4a^@%ArV`yaH6Bot{KBq@qDF^c3A
z$iYgrP95*iBf;fs&WyOa^BuB#za%=GS2h->p`=at8M4)_y7w(goEEI1z45^$q-GsS
z9y?faUb+Gn+cDug4H7OSQ=pawTh3hZ8Rw(qRk@0<YxE#u<kU%)?5f+QgL=?XUOW|y
zlmV!j?-^GfIauPY>nA?@NTpy`ON4(lgog`F7_D+1tGW$>i_&#*r@jP9ONx%s^v;pY
zeexVH``gvUlR*pVahysC@?KOX*pbb0Dkj}#i<<hV^7x7DFRG$h^4kNO<yAgKrw>Cy
zfo9)wLe6!&j;nz*ZoU9;6XPvPEDr$}o|nw`#->0`c@Vn|uO!amaw2M-vb|KV=E!d^
zQZyNf9~Xr%(L|Z~g6G()%G)$S5D1m5pANO}AxhivXoObO)(n@!Y7@e~Zcw@-UFZ4^
z0(sOw0t<-W$P`%$U2PWt*<vQz(RA|I?U146e)|gJ@uxu;d`71UoC@|L#EkA@I01EJ
zGI2WIM;>abdY?SeG{)*vmPB0klagpV2+r97h%sD9>yZC&yCKI^O*9-a>{{hZ_P2~g
z5WFaqZaSof$J#+<46UjLfS{4_Dl6MRHXXwITlm`|PY!5oQkbfDrBa5L%6#F$@EfO6
z&lxy6pIRx9+5@6X;H=9+42^#~G>8|?nmj4#yaJ9|0gX4IHgB`Q$km#F2-A0tC#0J~
z)vn3w3+iOk5xkz3P~(s@A3Wnx_E(}AWlZ~{SCdr4>X)ji8|%d=Bjq$s8$zo7xW0QK
zl?7$6a!ec;)A{l&qhNhTZqn#kh^s9_(&#p<qU!1OMTr9~P-azB8<$>7+%Zl(b?8!E
zDM@qybVQ7{ge9@pIUXOU64ya+P}17HLK~npOMrNGsDwBsUhdBI9gAIyg$=G$4^J^!
zg9Y0;SwhvRcIi~DmUs}c_26SVJ1#kA-(kA@bNoUt1vp>Cb808n^&h14S8rlcf6-us
z74hi2k~n;kgsWt_{-oaRL|*CY&wztbl_WEiZPHnoXLkT{doFR|cjaevC7MbTweKYm
zJSvuXfBcAMXx1CQ^@yRn3PJ%+>UnIxQb4quwyv>0Q=9VDUG7}gR+pG9-$fqlGzg=X
z&T@<M-n|${Aikn~hQwhp?wFXmne0H%Q;5$6jxBgn=dde0c@R)xVl)kU{1HSNUy@wB
zV6eAffRd9I+EH?4i;=1iV4EjU`N4F)$+%JEXv*a;zJ*O<YRBRMN+e@)FGdU;ipesh
zicb(}j8}QZvcfg!{;8FxaR3VlFMNrgdW0J9DUjsm9i$XdRz4MAIIVr0j@W(hbZq*q
z)uEyUqd-o9_&vU36zU^2YV*?1iHZ(neGC>#k1TadJjqddJ}EgSk&f7qQ|!J^_aSFR
z$l%wKs^9h=nL1+r3V`WU57<XKvP`x`uW@Hq4BRVIAr72Z7xT7e%uuo*;UJ6BP)OKT
zG@#~n{7AW)jzNFO?a-4ZItfMroHc{O5oD=Sst$f!q)F8FkBNOo#FrtDHa@@V|0pR{
zlAQO{vqQtm@n_7nW)7%wFHGw~o?S#<y>oi!ut-nx^=NEVqA;7OX1s~iqMjCH%;If6
zd#-Co<hG6;w*ApT_SOC*4svVPiFZ)wB=6UJ<Nk(qY~=0~U@R4!2Zvis&HZ?Uj~Szh
zI|6zz1zz_P$bIb9o%ZV8sYb+PLb%6w3bNwVorqf0or4yFLe8CwGsRLuUgT;-Tx)z(
zA63p_TM}G}Taij!(d%L{Cj%8*V)`nqvX$u?pUB~Pow2X&!?*Rmujan9h&}6>GxVZ}
zWapijeDK*9dzWe+5NLd^hpCkBUHq&f8xQ)^I4Ssk$^qZ&WyA~#a`#E?0I;YvHc{?|
zNmz`@oy5leQ!8vQYpAwpa)_x)cS~ops1_U{xINOC(7U2qD*X1GR&1*0)Q{Rcccjfk
z-<~RO%dhmyhR027JSn~Qm$UlTTT_9hd<q%LyN>IVtAA8hXI9_(vdLus+Cr`CYrq)n
z<G$n!eQA1*S@E}hy=Z3d$}sRHcHX-!O%>G`3ElB}{E(oTq28`;E*tUJifScBUFh9`
z&f5-d?1MAvI=kINmOaYUi7WXK4<M3**Q)2}m7$nGk9?c#aHQr=Wkl>cUBuuX^W<^6
z#QK=T8&d<dbL}@%^om64Gj@n3yh*w~2hv(;=<w(Do=pGuNUg@w%=y`0977<43jw77
zZD5=AN%`a+^(B?tP|rpzY%*HT(T*{xw1pP``8|LR(P^4ehL}*^iP%*Wx0+p)9)-h+
zN%Fl_K0fV5e8d{qQNlnT-QjGWhB+4ZNGgmq9mM($N_&ucWe@wmy4JQ1L&CHK2K0q4
z)3Q6i_&T5_IMgtUP5Ii63C`}?Tti{2T{-6O4ZmVE6d*ZQ&2A4I7EH-EP&ewtoF$tN
zB?5Q|1|p>~Ubg!aQM*(cY8B<9qulDwlXv*v@<MxWYl8cQyB;rlZm%H?v~srqd@zE>
zquDMBNjCr#R}e_b_et|{<1%p}w%XKZ34Q{Gei02dn(`)Bcz&USZzb<%@zmX*e&F7C
zSCau!UvlItu;RBre)|E3H5|G3hm&s!*3&6J$gFK&IQ8B{jgLYI_~A|glLsO9jWquA
zYM>ipCxSFU9w_~XM-V<Ef!31_1Y}}p8`!FG3*WG=I<~*~wIb&$yJMNj0T)76PnKuh
z%=)q57Z-8cCFV<mA%$EV82(aXyZE#+&)Gd!eugCZ769G@a2;Oq)xN(kb`!>m$Y!wa
zIaHBW&0R+DF@&Lvu~%ew^^qSh`eBA!%K@J$7M+o&{`TCTsDZ-?^9^Gxcieq9Ku;P%
zDA#?wEN{>6cToxw%#VdKXuVPF#~Bkw%U}Z>)t*;d#tQ^$+aUb(AMe6NLKSsfeDRkv
zLf`#`b?^XxRtx}A@&UTnbjTY7XBD~>9<gR#-1+;?v7g|YE_4R-{|a&S_ZOFy&OxC=
zEEHE2GtlVSM<8ga6F+(<))(cS1q8Qe<;#`HKfK^M@$~1m2Bg-@ZU@v`Mglb1vFD2S
zGJnv8preJ>_gaAU-xB~Jd7RtXO^SX$ME;KrQmW&i<n}T2fIWgLDOT22$X_EEZb;Rw
zHz#?E&+k962a0}1zP{x3PsB$*Lier!V!nyb`3QAh^4PXpo9Rk#^!i%9f5Hk90jN~0
z=8e4doBHwTnCy3msAA1!1Va$)WNKz|9;*MiIrv{$0C(^d{R1&$nR+n@jtvAEr^UxX
z`Wi#Y&BMuSR#Lx{4--9dbG=Dr!RveZ{s||=!wcoP4G>xCO4<eO1Vd14$*lSD%l9w*
zv?d`4yIClj{SL`W$V3)}of&R=^QHflv)?n5CeAAWGg57TeA^$s+n0*eg*%Tv68-}@
z-p7ET7m5qE_8$LGD>^PV^(n~mQ(rcp@gH`z8!oCo;w9gD*kXOnP6Xf5AY)5&(9QY?
za?Y3|zdvIWZVm3KTjMvTfnC3NR0>D>8v5xk0Zdh&qvA>ml<Cj~QLz7kuAk}xr{|PN
zz5WG{=I`}z9K{?coVcF%`ROaCw82N`efds4+`sRr=IfXr6!D#GHenJFpr8(amOuQ7
z@GvoAE+eh}b|YMAh4qNan!IH8Yj?!gA5w0Gt2)+u^%Ljshl9h=K*g~;)YQv;6a&*n
zq5*LIFkxI2e>jAvh^r%5wDUE=Ph{}z7&$610QSL$bf$i!@}(Dgn+5bp#&y|veKX%b
zVeMf#-EUg|NXOT|Q_kkOU}$MEhBV&Is7#XIi9;F5-cok9JmmV-lKXnfPvt)POK;<Y
zt!xX18uZ)p(Chuwq*H!fC;yO(9G74<1S`asU4AEs@LPmPqm2_Zso*pVj2qZgQ_tk>
z*;97$4?@oZ=<h>t>+h(*;0qqZAyp1yP@hmNBmB{A^J|bfs$wJsKSbI0qXGOpB{<`5
zM}<ELekLx0#o19)hq<v)(Ol>xXefj#iPOS%Y=7V@N<^1mZyflr{^;+Yu;e=c4*8*R
zT{v!>Di?x)cj$#u3B4{SeE-6_6m$!MaV0|RIe(yWGFcS_qV^!-U%)vlIz%IQeE>BT
zXHRYW2POJv{M5fiGq#x+k@Fz$uLLgW0tlfYl=&j4;-^LJ*#1P&50K5>8z}sJ#aMF6
z)*SPoYY^Ic(^Gl-4FJHrL9o1`Nbn+36okOW?cpe6{ln4Q0wLJ4<oto~*Col%pTbL#
zGHqJ%pHMFU;kTs-IFCgI_3z7GKk{EbQ9go=Vxk5r{`D_ffh$^lS~0(K=%3ey!$4`7
z>bdD)+JE~CJm{NaEFjnUlT7;KzbT&s)*!wUO7Vv?|MR}i0iM@+Ap?@%Ir0bF{<<4t
zdEgq{yDJ9&`WK&JO`ngOt*4QG7zuye12pM^YS_o?pt$Z}{KGHYKneJD+~WS3WHwm-
z-(TV2MI_PjTCd-w^agj64(cI;K7-(Q>A1n#2-v_lDjgK&``5qtFH<0tw#w*e$6&r>
z-?e=??-p4_o#4ZLG1omjB9HXG6qixpQ*F$`KSuhF%!_T0@PJ(XjuU%d?%5$7nMdAU
zx5tA^d|)T1Y_x^+4g3^l>GQYf4G0HvLqobGI+h%Z9XpqtJXaP|FLnoyPh8Ur8(5yx
z3-g2_PWXgmOjx%+en=s}chaWMyYa_A{>SgIh8ReWzFPv)`j5Z-;n%ok_;QNRha-P}
z@2?MF;>W`adp+n366&8HjWvZYHy<C{XiL95+W#>yv*EOa{J&X7cs6|5l*Mkt)B5?*
zFXSQj&+ov<w!z8%{9UFxZQzycyP_Lk#7|$pzy|#XbVb`K{>=_AaWj2vXn(Q$r{8b9
z{DTzWI^5~>B-;2Z>nqHnCYvxe@7wmzHv+fzm<arU<KyN3Y8l!jg!5T~)_?tr%ie^0
zjSs2as5$+AwT%Cg^G`JMze3K+oVc2rn)FNWBJ%|xBy`};ojVUqLa3!t`3?7!kZ}G%
z%>8DN4!A9dW0tnew#BhW(kFFta*|h2u<GtVNCInwM>0J)5b<^bM_XrnNT1z&r;<QL
z!~WL|vckfZaf@q{OHHq$86CmCL!+HcE~bzC1)asmVJSKIiK=!zY9Ye`!RZv^2`kPz
z2garpvh~B}yPBB%$&|w)ihR+YhKm56p&Opn5yQh=qqs!f?iKOa<TBgHIVDkxGRaYn
zynio^H&e+5H8;HaE2%r|c(sFimo7b<cWom>xq6Wi#4ZBXlEuf8f}%?9jU;%uCaSv1
zFAv4^hO(l`qVUfkkuQ#?&yX|upTxPiJD9{R-tqBiO3B_?k>Fu-k8E<T$<c%z!@60_
zr%Z^COpSh1q6E$OxFEiou3uEKb)GrZF|=F79*5`CJzv_7hbd>TGmVei>5{Ou{%Ga!
znjL*&jU&5>v?SX~Uy;r*3)(8rk-<}LD9<Ruoh`fK^rXcn1+t%qKJ~B{Y2$73SGsC&
zw@!)OVGMn%x8NbiK0kRvMD`*Zn;Q{oFRwAGa=M7;so;G53Bp(hF^!p}H@@?7Nqo1-
z?(LH-8l$d7mA3vypmW6RWX$miDUXK-DJu!QRt-5yaf2@^DvG=1%+^db%j7NAB1Dr}
z%d%k>@mrS~x44_QO5DQwmnwzr927tmi+xg{NodTK@$oRnC<pg8s?xtq*y3={RFP!9
zQG(`%*frA0=_I?Ra3ujkwUsp&d^eNLk{O)`MQ4*3J5l)z>jJX#O9M#&8I%7N9PJ^m
z?)&rOwo>dhMB<}72a0tzU5wSR-l?317oKRfY!@2GlwDx?K?e1XuYtYr^1Tih3pRW^
zCPaQ{NU*NK^PuM{)e7mF+6~F36DP|wq)Uo6p(&UbXJEfm)EvNYg+zoieJLzOO{!bh
zDvERXuk9N(&6aT@TAps+u7r9cb$Otx2Mj}*u;Di`QsS&;CO23s{gtx2itt@}MtUDG
z;=;PyL^b44cEiL4mzvz?90*E=;HJYt%Ojr%rqAaqow-RQ`8f9!9@?eJf!b9ap4Hxk
zhcU+Rwem|ybW$r89#Ay4+UkkDny-|5-#Jl^Z{mD@?d8vONqbQ!o|hhIH8bKfJwD+D
zo^#qtpSU^B;FU?Zn?AmdrlucgfJKp&k`2=VBC!Px4v8}RA2*-xiYA(7`AjjVRVY83
zQN6Cq|8O0HYVZJ6zBvjeWkQ_o4e_R^e0)dZcg<%g+Z0>b$CB*NMnx6x>JGTA!Fe4;
z=3(`L#-PN~eFsyWFP8l|Lqw?U(T)b$aP_)45h}U5YKA>1o5zN0!X4-lh%32adGOnb
zW*;@*zpWAQ^p>H1*l<?pvx#R)4Y7-l23230>#LyM$0F#11|hivd$3GgxOeZ~dB`wV
zKEZc<I{bRVIX5I+iFE1^aWv5)X;)~50h%@#1;vyV)yIIrx87H0VxmMk3v<*CRMg$P
zU1~;>MbTqyTU$|4VHWt=5p6YueQ>z!aXu>4c$hZ?Puv|1(v8f_NSSeG2&Z>UdoeTO
zWuM7XAxC?}K`JoS9&}X`kqxSaw~@<U7Y{MXcNHYa-@4e^lG(=Eg?6m|N^q)9v&Yo&
z38M)gwV&5{a3uVlMCz7n2Di*@Y_cp8ti1hL3SQPZoBm`j{KhGDA;4mG7NL8v=N=Mh
zyYDf~+ZD!Amx(2vtl(7|rMe+DX`eSg-Z$><<iIwEK|57<*vvk&5lY2#_W1SJ$$YtY
zhOP!)i}CRdP8u7(kgo6|?sdWQb~JK51d25S%;y)`ZkLr(-$y-@Ac|IqPxQEUuY|&4
zgd)G?4;21mZyf%x5!{m)HB31h(+7UD-MO7v;*xxP<kZ~Slm_W>4nzyBTxgf!aS~f<
z^lZ^Q8=siPpH$@VlX%_~)k}UD;|8b1ZU*Ks1}kyJ-;Qsc?8C!>#R<%VpAtPdjeQpP
zx+6Qs7N3osU3ytGzUZb?``s(sQA`w^H8B`VnQ0ow!qYKj;_l`lWq&dGFFW314x?T5
zTB~@aqPrP8Cx>?a!jwQv`l=6(p3@?@r|&Vy3w!UGJi~)2=kynQ%H`p(t4Sca;CViZ
z!0}LrBhV7H?7+8X^C#u2_;S<Hx;gq@;1s|@c&KZfW=&2nBhJ$?Zts#YzpJXMnwNC?
z2-=YnPAh@@aedf%)D0VmnS2&;xF~LR$Jo%0hV!~ayRN+E>wa~1)mqCqCZG%{PMZY?
zw$x7k7oJbtonW!D*T&KOF8?C9@L6$XJTS;)N#+Q-NM{~CnyIWF5<1D9t=3_O^m?YZ
zn<Tw);7*Zf-QRcKTZv1&gnj+Se!EQNGr&<jvx@!ra?E7xa~1wdGdt<%5_Y}m-@_Y(
z0koxrr4LZswX1VBC-;pZk-2M<$_p9ZRC6hz9Y4tM_YWMwu+DDn*mIbEa#<42!%vdL
zR=Dfs?%{<&-I0(Ply2wgN<Hl~iHM!=%ehF5W;oT|hes(nF|c97b?{`aT;<YS9)P)Y
z8R?h1%hI86(*tmZL}*ClwirS?ZF4$|g6xmX$=V}7Sz~TW3AXQs1bB`Kh<CI8%qa}M
zHqqYARKA}~ST%gRV8Y#$KBVHdLB=6r|0QU5-VVJ?x>^8#$*+6PTC(a1OwP-dH#M1=
z%<}guiPA7}Q9^l3_#=2r<>#P8B1-MjcW2x!&MH;9_H&&jf`R~>4$;)SPfyYzVLdb5
zsmI`3t<9a?c<O4}Gr(mnhbpzMSEfe&ZCO$IyBY+zrX{3M-g8a^3>d?J_n;eZVPfY<
z$0ZI!lC>Ogz%~Dxs8Mn`h~#h|M-&3!g?`(`z?7xc>jWfaNB-LV;IzW1%VF;QLJ|@T
zRaI5(&hATS@`{YZ4ekT)N%T?7#T~pwaq=CXowxzGV5{8+E18ZPu2Vv;6vgwE2W(fL
zN{COwpoW`NnrAY`%wag$)mFItrD^x)#Z{DH&lep;H@q4!pu|95^+k}FF!Aoi+`oTc
zACd0aEG#Vg1_lPQ_#TdrCkNx9Em|0`hNHLj0IzO?s?P!ba-_$t+$1SQV{Qd-@t|L`
zLGf`@V~wZ#Cy{s72hiSV0Mdvu(=w4wSr$h-=EWS=&gJ1~@S<AX1&6>6CucWn^HcBi
z8x?^J1^;H*BrZidhn9lQfYS;gE{BCHtNC<8i1T6o*@p58TMSq9rGv4FoWrM-S7AI4
zre~GGfH+#gwhQ+!11h9uZ1Xdzd6+;kmp+>F{%!c-iPFvGLdBCluWuoP7So~7t;Kl_
z+RU~FSr%V26u7$BeZl_ig=9s74m1*E>H-p681ow@d~lq}+(C&h?hZl*-E%JEbFqFA
z4zF}NcN8zYvCIiRMH%;E7w}d<RHR_1eeY#tbO|)7W*%DcBSA)vl&i?>7C5iKH;teG
z809oW`%L(bs>C8>6ioW_E%X;ORO1*GMPs=bygs3+acuZ5SW^FBlsegqBJ3B!2(8O>
zn+c6oO*FKPV<yTOS9uF(QY_6|<2G+Q%!JHaYAEr09*#^kGRvr>&Kox`AYcG`3#d6>
zV+xfhq)?-9t{`n_$=OP~_Mwi&>A6mrytZfNa7M9fq2B@dRUhHiRKI6>XkH%Ed60v5
z=s!;3dmK6U$ag>lT4l;*Bp2V&nRfsal|%0OQTrH7;<zbPvharAg+^M!>Yn^*-+6CU
z>z=IOJ}P$V4wxwNcx7J&OH)ppVNm|KBs@YNkR;zE$681P2j5e#B0Ami%pNelHYO1|
zFgjn>9WzDznylWumm<1-kfS3aq24b$`ogYwDkObn_I>Sq2O^StD1)*_@<&oh3Py8u
z-im(6FFc%LoDlO+u841hC4UTN2J+gCeaLUua~O)5s1hp9NYg$h`n?ulrp($#=F20f
zyx%1tvky(7Ie?BGBCqSkC(NRzjdiOv3^Jz%^rT>KYZy@F0kw7Ks>Wd!&L=YV<!D`i
zcThY$6n$TtWnkGC5H|#qbJ|T3)W-m%xm%}eQNxf+Hi%cfqPO?n2rV5Q5ufY;I-SfT
z(sMk53hbj&-rcn`Fz>??COhTpOCqE~=I5eq`1s=RSl(dd?XsoLx#GqFUqGb347tdG
z{OVE47BjhbP#4AsW0IC__4xfAE%N#wrbA1FVu|SN5Y)vmzR2COQV*jw8=WI>9f>(Y
ze@$}Z+u4NMgUE;sQxi62CS2B#_{EU@UjPu+9YjNtv7H6rDv|j`Cl)~p?bc-wvN005
zYpYA1kavEH-K}(B<}b<lBdrFvxlg$+Bq%v^I(<IR_{z+u#;q1+qU?sT3X`vghx*HF
z+>}oHm5o%s$V9spZw~O&DpC_3V{!N%L-wiAyUzl1F>;OVpx2`Da`R~;g{~=RK=qWI
zzadEoy6l9r1ymCgt}UjmX@~p91v-FMb2ravW19&KJY#+07IR#3DnL0(e1=NWhOjPZ
zA1DWm$VckJ;?6<~b8H_QDB@7Vt9{48GZ0DvpUawfFS~bz`oo#laDZ~;-36SgK>s*+
z+Z7lC;xSh^Aig!M%{;fG3`US>&cDfk2pbu@P>e1V)R2mPPM0<`MM}uZV3-{5$_njK
z1eTgVDK2n(ZJ4Cw9X~&%5~<QS8iiUk^W%jd=qMM5h#i<1N`zz~w@FdF+<zeVb1zIC
zrV;G0X?W@U*KQl-XFc9rJ0(;f5=j8Iz?D{Dbk&vxm_2e+T~jwZ=6$%uGbnp4hf)#Y
zl|?NhQa!549TWQuS{l)!?(5OXx|q_wz6-ZxwlwKkd7ljCJXK{1@Lt=I!JV&SoWI-;
zBLx&T2bh*@)oPac=1FIlY3TsKdv23lYO5_f;4*nT{q@0DH**X7x%HtRaC=hnQ@K|n
zOV`RvmS2_Jgc3E!Dz1X94TWGVFc(h&4Q+*LBv-)QU@!CLN~P^woz3-xJ{hrHAf5H2
zd<0EQi3URK^borsZrrEA>8s42G3&ID+v3!7C@do*qc9)=97GiLFM`Mdve%;^3l8wg
z<Ap&N?*8Q5P6zYuiXm|;@7hquvC~BHhYIqABS$n#7S$0$hO6Nwj1miNY^e^f1=^yA
z{;I%qP<<Xkh22}tn>t4o_;{}Xa>HBu<?AFogh*rW`!oaxtsLfxogtE~tUHxC0$l{D
z9|{2VQlMyDHEdCB9){saIM+fY&~Bdm!N?rbmkmBOCBhKPx+=qswu5{c*g@yp`>+Et
zS(5n&0+$g^b!YKCY~UeFS|3fIQas^);3kaUNa$y12WBmU_VAU5bCvXZFvM=6Da|GO
zODCv-tQq-2W5i5kTxzW@gQl9!gQTiU_3teTxc6(NGa=KhyWwBA!89tTR+y|+UU+Tl
ztAt9g+G#4^wVIvg%w0HFG#;X&8eEv!tZkaDKBDBh*zi)6n&(Vw^U7o_pzQHEjp%eG
z%Fh|wgupnOZWx~ux;g^`gLg;(h2*UGh;?v$ydP%~?~bNY)<C5!8VxDn6Yj;oS4~iP
zHc;SWCIV7N7+P^zNzB*mzJlP8gJp77y<9g)M9T#yQapIK)mHY|ET<!c`a$M73A-Gr
zMl5VAC+c7j4;fF%@)wWX%KYBOP=uQU9E+!84!n8PNKs-3^p&Hk&AK-m%y4(4KO|+z
z)Y4nPY3Y5J#ry|gL)FK&YDI4k_uk3(lGyFaTmBd1Kq}^p(THv{&>S68swM@kEiTJV
ziL?-j#Cx+TMw|w$EW_lX@3vpLhK_lY>cl6Ynq>Lv69v#<=}xwJU)aKHqeX+AaSI6i
zZuh9AtK;yJ%ft=Fo1jP!Akz!DOF)<3ra7<;Xwu>q#qCI$(U~D7R}o}*A$QjlK<Ily
zgG=$3RExBNvHYZZ&#aNO)!{TwnJ4OLF!M1kWk{~0`QVEIaBk*vsn<T8K_fpxN*(YH
zTUS)Yac4+KPnHfOXTBw7TzDa4IK78KUjXzqw5QSB?Ddl@zL;h}+Y7pmxNBw1Ze(m_
z($95QGfW0<9B5$sE2iTaFpAoBbwKWLm<Rj*)Z^4Cm~0kZ%)jLd;#A1R$IdzcEEreA
zEfL{Wt}Tud6S(a+{amJ_x<S4rJDw$#WR;ezxegqgH$me%Nz)B>r9z58@!ZO|tE(0a
zSeGVctW1#N3%_4zuxS`<vdhDwvj8D`8HiqSjYn}xcsM>vdzQ;jXz$vmmm+*zezPWM
zx|~-Q46Gx=4njC)8d?vQd>5bz4@7ZV-&7l=l0JxxY10B_UbeP6zLq04%PB!0_xg-g
zB^5u0bJ9LxSVgmjTL94b9u|D2@0vf3*v3X5Xg14+%~!BX1ux#SbsiAydc19+aJJo)
z*~4pfv}=`nrqe#T#nNReh^{-VWObxuTiDWv0Zt0vd(XpdF%nSlFRPV)?)e?ds&NF@
z#w>p*mfyt=90*Tn^UNL_KwPwEU|F0p^}~y7zgjuI8^)e->rUC<a8qI^h-5>f+9F5=
zPFoCa#V0&OCd)uWY035e=v}xhk<ZXQ!jG>A;3t<sEGn)+zy9{^XIF+b>Bj<wRpbC9
z^)#3dEW=~BaRJ+%2c;8g1xWxlEXUy7CGb1V^JRL;75O$jAl+vU1ADtcpeEgX(4!W-
z9cY@rlj4TFm53r~-TtEKdS=B1uiE878OcO>mUfu`#U*pJ@`Py3h}-6E@$x%NY${J2
zcU>KDmFR8BNCj{?z8S5ou+`}}eb3#tkN863+lo&zL{P`kz;)}nFBd3oae8ULSC5%K
z>Jr+v#lb+DCLFZF;6EYRyP4x6$^0E(-&n;`FBv&=un`*$^UZ&0&MnSn)*vMw1r^o}
z$X8pTwJfP%ylC9n^49Y(D2dax+bRV8i+9eNKkN2jR{RLQm~E=P88}-rsdUU?z^6P=
z7|0XAl70^_e0~h@Vpz?x-W}g6If*bUjL=N$&d`JUK`Hx&pw2_R3keAcNL;`jY^3v5
z<8Z%W6GpDHMga-OLAIUNGD(OjgRVj?B7PqqA9wH^46676RZsOIMx~v~Z+DV-YI0lA
zsFvY1KR2a^i^U3PUif9OS8rs_7-I0H2rq7%->+BrYzOZNz4vCB3eUihF<M4?G`0h_
z<C4*`7=zFZ2rmt`enbrEgT_lMU}un~itCBiwQ5&7xJ(f@Tu~}l&>cGLc>a;$*oVBU
zVP5y`5GJI%Z_^_*27@^p_laa5I1HmW%Ak_HbHb9@<Gz6#$2Fupj9GEsc)&Y*uisjt
zGta8TmShdRYyFKamA&OA(np}b$njNz%xGgkrFc#0V<L&uHqV7nqGW{AXF+bQ#dgZN
zVSTU%FDu6qNyzNJs)pcn8BH%Su1zwQGkfkj*gKF`G^-8ZVB&`I9~SBZiz$7DKRtT6
zf}pc$f$?o9wggSr8W?O17xzaB>7#2C+t=zQCGMzR*3i;-&mS--8n>?&E$dq8=xSOK
zOI-pyAhSiE?{=xe_z8xK)Rur+*;2lR-b|TDbf(wjYF~3u*Yc-)w>mH?1z83g{X`6V
zcYjYkY#cr{gdG{pzHz==s!X)kj})R_L{^Ei@6xi2g{%~;v|rc4Z0B^CwHC)S32JdW
zXn)Oqt+c~=0~BHNM-O-3q!=Dk5S<-C<VqQ`|5|$^?a~2&Zrcj8bnUAvzFU+dp`tL*
zkh8(<UbWlGY;MU`7#e0%ZYc(sz4{|5xuI6cnqP2(Ga!{4-2->FRn^EM0$)_Sr3+-3
z2UIX-4)I$VoHBnj2qvC6o&0e8t<#0V!hBo|SO3Au{>gd5l0F4e8daaJdk!`^Sx?4k
zV+uowFL(WJmaHw6Shn8>xDwXo+O=h+evZ@&Oe_-i;6b?0hqjM=v&lZ+nTsWGN`;2i
zbTB;avJUjz3`foz0TsVm>x@LStoEpUK0niLQ7~EtF}e1M6~vC^gh|R3ytM_eCo=Y>
zM!j!Ve)+T?U}yD#spS+g7D`Jg3P?Iq!L~M4bVgZJyS!xrM?p+-@4iRjl)qIrd%4cd
zQg$^ZfK^`ToM0MiIFk^xj8xW%yZGMR{-ha{3y)ImUpE6L&aPCjlN)2j76+sBJFd2j
zCg}5B$ZXQkX)0n-Y|evLaHB$@(MRcs;NP*sWDd-&IL>Nwa1<2y8Lj~~HrGhX*42tN
zWJq}E;^N_=OVAlY3|2$v<dUW9(tG{PH-aRlDcKqw<2|{iej-7lb{4l(olMdW_ZvY7
zAZ2+F(d6L9$2Tq^f=)y>3*M=D{n$Gb2&IG-mjp;3{7A7`yT(mD$*!+^?qbNpfWLGE
zHl|+47(nR_`^dI=2^=$-?3V4`yCxRuJS8Y4=JIM|Y)7Gea~qAWjY{oqpX#$I;DqbL
zwDYUu4oG;6Gb#DhTqbmhbMCraipO6}=Zdt-38b*P41}!?g_VJw#^0+m`5b5+f)5Gu
z9eEIritVEEqh5OF=W>Vz3)Vr)&GK!c`)72|pFht$E3OB(wRC5nM9E@PniOe!g(>(e
zX+bc?#C2Mxn_Y6@M0s{0NQ&;QmiE&9VHT-<bKe!9?Np+C*a?PFYQM7Es!so4J0UV?
zUOWqfaVLup=~3;Ax^a)$TYLHN=qGbaN1d~aR)B5f4pqdFW4cx*br3*3JC}L*4ciur
zd-~z_4W5$A-DF);ajuJT0a2v};p~d7dElQ*i&b$ccAs$Mk<g*n@WzXzrA;jAA-^~o
zpX?*p9&i0Yi*z$g&h1Yy^`p27(~a{Hr%cUd4S1!Nzf9(<M+L%wvqVWWXTKty_IV$H
zMS~!x8i+&Dje&+6VyxbW1#!ECEf#nY$ku`Hv?XD*1I<P>)C9p~TjcChT1UM12H&xe
zo(7JOSAYgRL2_hSt`vdtD<hZLr7~A6y<Pl}z|pC;T9=$1pNwr&z=q#iE><(pGpUB?
z^72zBxz!h%jN?M6dot8J&hWjaVqZ!IMriOW0nZFkIolB!w&~iQ2cclDb0BsDKihmo
zki^vE+N?9@g=Q*^Y=K=M8X(jtz6x@dZGY|u>yBdU=CGjPU{Wi~W<AJ>Wa@ctNAt&-
zcxizxpFeZp!ov{Zp6+V6`pw9?dfc`5uDaC|g_2an?E7khcZ?Xw4g{7gU(^Cmam0mz
zeX7F$WtlsP<D!$3tJo7xj{uW2=wtJIBlteK*ABXbpLgFo$wOr6)Jtz9Mjy9Q(OYPe
zrnR<B^s_QiA|mvt;wYWR9lJ<Ik~0;nuVy)Xgvb~Pblm87m^by-Xp090cCo+l7Gvc)
z5Ig*PDr5a${Mlg>VppZP@#T2@#iZhazxAN%eZtqSkOk4;4ILt@^S$MuEV|~#%(I&`
zy1}?kaI_??syHE0GXm{k3HVo1i)Pz>N9@5?rTc1;Bfh-Z;cP>&`-l*u<NY{vg*Hf&
zUR96V#LDOd)>j#1J5+>(jh<g82w<t~d3Ry5tHu;UDXF_cvLR5CqbnmSR5c{d$E5C9
z$?^ia6N+saYL`PHJmov^vXGm2eFMLzf$ESfb+NSr9i(&d@2NWV6de;g&ap)jT%Wj?
zdqWVTGT^d&6w}EAqCL0IE*l`By<t6a_O^=4?})jQ{TGLU3)V=j7GHDce%OJKyGNMc
zcpJp|kx(&}L1|pm3>Z9~fM0RBN=2h)0F3jrmy>!yW-u#4(*wdEB85vG<J*vlKK-4r
z^nz0-KTv=@NCJ>L@9)@|L1;5zh85u37F$6yQ@PBSGkPF_e&*DErPV$q&aPXU!ERY#
z^3_f_05wY=e9$~BdU{!6!S14i61DDx0-CXU1G2Yt$!WP1Py)qKP8rr_G7PB3bj{%D
z(}kJZeR&56aw8KEdhh`7&%7N5Ue6yxGJ_8ZY3f6+X~cvM(E6O`+gpmMU6y+pJprqX
zCpn{PXZLn6Yq(^LD!s(`+!|_T07LsG1OX3Ll&X*_sHf4Yd8^v&)HMpWI*yDwE(_|M
zj&0NdeTtZc-O#<W4U#1@cP13YeD`m0PSg}L;AUu|DnE{B>a#@r&hV=73;P~-1T6<0
zEMm{pm`AHy#o?fUk@QaUZ6Y#R3fZ6@7XLc?wCHP<b+zDQ>Md@iX2+|HDt@ACrTqqx
zSc0wRZoD*y0mzqFqbdASw4Yu$(O`7@usCO4I1-LdPp!AKAVI78sEdHmo98(Tc5JvE
zaC%cy!O;<+iu5X&S*7||L$zodpw~{!q`4N&l(RnTg#hJt@He!&G<UFXsqD@7cJcA`
z)hLs1O6nL=Ja#sB5zQne4RUj<O~3lVfv6A51lA;b2Jl&`4bHx?8M;}@z}eb~*!t<8
zo|tai+5|v+y27g?G$r%L&NlCw(<YUr>QGirJ@EedNtA#WAtlr>#gUnx+Kc-_6zxtS
z&$#bw*iv@n+0`$B-H2%fF>c#eQMvWSd6m>UH$!aO_f>caK~C2h@nsj)wb{l`h(3AM
zrNovpp@K$>nR}pGkLxw$_^?k(p_A2cSxff0o-x}5%zD&_9j}NgSnZWujnZ#5YETf`
zI_@+eTJ9@4M%z;n3jK9OYUwpmc|AQnRu(*HFTsDg52D6zi?yk6QiRb8lV0v53zU)0
zc^qFLKu3by`aE3C?uWXzZRu}CMs5l;Yq3PTS6i!GX7rX_D)6d)U&(a;HcGH*(@!~l
zUW}B4yK#Wj6{e4uQg<CXLWHLC4w6A4e^UoL5m`Kqtf&P?Cpr7QikiK!8y9x}VEI1F
zinnNTj2Fo#mA^Ae+UxI#J1)aX#M@VVl{lCcm&Ftum1juQWrTLe7xqKVrPXGq&`r25
zqC+ByT&p}DPLK=bT(c#oJ&wz|TsuE+-#=c4mOpaCE+7?TUFW&>D%0W&nZuqD;ylW-
zPDMCVLZbeWS(D7UG_>%D@+OePm_sML@I6k1tG|#**MDStBb#`xbu-$r5gx_EV_brC
z=?9`LiZ50dG{@Rw!zD-v=cefzN6@xRsRd+R`bJd*caS97o&Mg))eD-B?Mg4FrMz1W
zd-RCV$~yr4p1XH|>`Zmh;l@c3?S(F~R$gJ@)43k={0&aTO&XPg<hPbY$Ck2AKC5nK
z{$20<_!r(n<l^2ta<N0=;KB=b#ZfQjTNmo84}lS{dRGDe$!-Pq59deF>|k08IJxez
zk|qj{n^^4^?9>d>;x#A9BFVVbFGyjZRX;={9Beq$(fh3&al<m>*Gowtnum|JyERoR
zOZo0r{F{zuTT%H*8-#1tjxUZ-RR#4kQV^NEc{qc8ixa-HbpxAw*1o#^trT;!t*u(O
zfRs?z0X<I)Q1YjYF^;%3oN>knLovybM*HwG`HS1w%*%c&PXc)o(3%;3Dv7L%#cCgp
z-06Mk&>Qw*?aPr0=W@5ohqtX}$^2C{R*6PV#0Nk;%u{P`!gW*LI^qT8X8DyZ45oVr
z`Gc4C)FxR7X0tbQyFDBjOYlJv0@xh5)3(JW5j>1IF%*=LPBrRfAMw&S5(Yym+-XUz
z$!Q;8lM=+O)+*2cL$s9a(q3qCvLs_tw#Q}FaAY?)^<aYB_ureNW~jhevz#+gve1qo
zL%Sk(sIS^*vqTAXc<SLQZZc4}S0fd!p3dIfC!4xoq&EodJiHNc`MNma2~y1MbDgS&
z_n0V!a}WEju8T=m_b|0Kcs+DP(@<oWz?6^E-8V<hgb0Q15XI{fR?nM=uuW?`q}Qd4
zbi>u3Rab<bw)8?ZpE-O$ry6S;GeZcU;0b{<8ynkj>{JB|G1XQR7EE}@`WXgB6J-(4
zRv6uM{0u$5D4g#=4*8E`wUAq+lnPg=W8Y%iBMDR=L_X^k;WK|G>SSviSqkH&yVu1x
z_s;-Q0m`FHAhBTf7Y5(sczD~<Fq3oas|B+rv+^^=!~EbCUOf^Kf!iJFUZt#5_(Z`W
zaaCgMFB+8GD*S1Y8EPH39~=M52Tem_#uN@a37mT{?}?TwmY_B`d+jX8-pXbWb)OSF
zT<{&u(34e*NC%0f$~ctt@?ugjqAP?;M@eyEWoBOaQoXA%<wJ$FwdwOS!RMg{vzkq4
zJM{)CRE@Ss2-t+f5oS?Ldofzr;A<8f@o(=*iU^w@2f9{w2@qp2W4b+4?}2HOsG`^2
z{an4MI*Wnc8--Ig;o3m&J>Gh;3%1OQDe`dckt!fsR>R@D8X6@vdR)_17)%&d5b$Tc
zBOvPhg$ngLT-bqC*g;Y<9YbnV7iBZg7becB`Psq5B$s}fH@=Q0&3D?BPt24gHMsUX
zp{gqUUJGzVihV6kQ@~fotZ6nvb?K7!Xoe&u>bnRpu!71AC>-B`5Vm5ugq40unP4{F
zgJD;@of5|QkErnzJ9X#@6IYYfQ?6Z7tB)0YMloV)N32v}>Lp_%?2@F*4NUPv4gCIr
z_Yz3;BP5QHj*r<94+j(Zguprxx(;NcJILNJ-H;496xrd>(AhTFX1iH_w6mqgp*>H?
z!=Z<@wZZqHBMR9Pl08y@XSX%Za%1r2SnDiW*?IEcmnv~<nNP%sJdf>aaFuHc7o=us
zY}j&72lYWe6?`8gRJ$Pv{T`MgZE?0KeA5j>H^Xd`>#t**2I^ybF9`6(g=DLZpHL+5
zUh8_3ab_}aUP&x5IdI>xkj61R^s}Dk1UgO%9{@??zo*XpBYSb;PHe)e9d&8ZO_#(5
z_j^~Xl@&xawI|q`RYkMo?=Q*^8OwBtqY*S=-)RUQ)pn@R!|{?2vQ4P!ItfhN^ZZyF
zzVp<~(9B@Q@Xb)IxP9U`h9AzMd-m@^*k^K&A!Wma`irz__R*S!B9CQNNwnx!y(H5@
z?%nrfiPzpu(>$8rtL=Vmc^~?<??>!`dgGD3Sf)8lwyIg;TF1#vvH2g59|_9}t9Db4
zV##@e*WBFZ_vC)pj!!i!v+7K!Lgh#H5IBqxlU?no-X(O4OhM9Qw10OtbLc#SjM+@&
z&}q~gc!{uotc5lS9!8p7S#*Y-r#S4s>mu&7C@Y6BpR8LPV_7F^J76g<RI`yv2d3QJ
z1cjTI4vn&pa-`Z@MzIt<VXjJ7oEiSK5Mkk8iP{%{4R;4ig&VB~@JC{TsTxjE*uu9S
zm3qIkAM`?_8=2p&FKfh3?)}7pru=_MJ6Npv?`-{tfBElX!Fv4f$^MrN{!0dk@$&yy
z(dGZO*b9mpYo6kFq1Et%2*i<Hck?s<Dhv7XKh=gYR#jJX;e~m55lNwHZ7=|k@S4ZG
z1SFwxg~TvO%P2n!(%Xd(6VL-cgruN7YL&Cv_J9>p_ZSPwCiX?>0II#K8^V_l048V(
z&>8U=NTLGadcx0Ovhl*~<y)Yz?IJFA;XzWZ270R=A9xAg@7UGBihvsRodizcy~s~K
zZc?15MSUs06g0+=APVf92j~(7R0X>Uw~W+_$I);?umNiUue5F;cK@!o{;UN{@fjVz
zYRq8*Xjv&A9K6);2?gc>uGL|cugG<6LEEEUhT+*#{IE8mxnuCHJ|q>_GO7Yib0|NN
zhLTPGn+mxX;7f;~VLe0g&kOsm#{Ca~nd&qnjWk)VTbih^xC(k-5ZW#A2^%3Y(MpE0
z8swpYKupDp5j)}g7kEIoUn8OXgM$9P9)xg3Al^j(wQX(~@%{+W_6z3%?M3WoSCj!-
z3I8Az-qjQmpr#|>?w~cxZ|m2)Uf;{c_xJ^IL(*L0k(%nqP;qmi6GO1^w~^xxSA<B5
z2V(l*L~sjJ1D?>A{j&D>@#CCpz7b*e3i9&uYouTmqn_MrUtp%shnI-qd&Dft^b=l3
z!f40HC-%uNQKB_wx?zbRNGR8;b`nB?*aL7`MT6HKqBGwLQm|N(fVA6$*TFB`Ey8+G
zV-6gMh#(3wIRRT3JO=Rx8gbNOS$OUmJa<KC<~Zz{5&?9m8&!oyZ<Hfog_^v5`l{Ss
zu=XP0yy?gRbQAig`jJgIDYz-hd}nWVG-8A9SBwO!St2X=3*j`t33}6FCx0Tv-`9s*
zMQX0P8tLJ9lo9t)DpzR|WfS;>50qiEHx;t?!d|B#0N-#s)Cl!#nd)5OxvN1AEXd76
zg~2>hk(_2^Vv<;A`VVJ@|D$9Z4(%tm1Z7X*B=9z#Ix#T%Ax;a;a(M;KXe~`{SaP5*
z?^Xc;a3#*U!2xcrm@z@SNPI#YB<Mo$L<X2c*$`fL(5?lwVUdg@66lM(AQozz2uYRC
z+1MPCS5(v}{@QxCrkMuX2+yT03H=zcObWn;#ecKm50SF5I9z~ebB(5?8mJm*X@;6n
zOB`n7ZzFfEj6_4E5n1q)NZF7Wa84jD`;r(F5%uv2&h2q77PU>9kSrdN;LOM()<w$i
zH2M8su*D!Qzcg)cg6}h3fn$EH6TS)kV%8=2`d&5`6-;Mg)eFO(Ca@xZq=ozwH>4C$
z->@#h*Y~n9G5%K@ebcN6P5t>~|DS22|FXd!sr7%A4F>H>jjXK?LLNu?pG<fj5JUMl
zrw}9d3n#DZ>N1(M0EE4D2}~F1CcLsiFKDiM%{)VwQrK`r7QB5UhTz4VApv_n?$p`f
zC;WVT@2pZ*i03Q(Rh%>rWZRr6{0sd-%2M#09fD2w2+3#_al?I~w8;rbWzO#NYMZ4d
zpU*peS$ls?-~1VSXP0?ve2C>g0k&KOt7D&|XNn&LtjOcH_NdWxym_a!LO9>3i07SM
zaYBl(uNN(yiP%?f;e<qlrVw+~dY?b_#E_x2Vj5mp-*o7$rZ;;d&g|hrjg8}B=)khL
z;5G)m8f3ILv0{;2il0yDg?N}Q?w%N8yKsn*aU87@0VKH!q7B=HuYxv&pg3c-&CJdT
zE3v?o6B{t_-)D?lMT(eP8r~zKGy&UA8Fsh{P!5tu^BuO%@0C6tqDW>^pp4r84BA%~
z1inY~yN4&-N+*z9`_SGC4JW@+R|Szo)2qod<(cEjs1QE5=Sb<s+vXt9rT$*nG*u^{
ztjCa?{TF9HhEQkj)ZV|({$eS+;<y)-_drn$Pp)SqDlKt@gZ97+ibdR%tQ~{>k5cv6
z0I!AXV?*Le)K+J>@>$q)`xR>>&qK^w_lh#iVzY#mwNmg$)uLUs>hauBT7Ts6?;`c*
zFjx`c-njWJQI)v&ZhpOa4nq(B$ufhCb~S>XhCM-0A3HCvppbpWJQpvleL#NPxaRW~
z)E5(GA-N*KfUl;b7_N^p>~FZHW*cTG*n_1P7Db@KYv6mcK#Vw8F9EsH-6S^rm2pAH
zx<xE}{(*Dra^Se%t62akV<++J0b4G1-i89mRC99H!yDC%;s7^9$;u%HI@k$qSnGzM
z19^+nz3M9p3JOp@=81aCz!AD|U`SO3wuV4NJT|ETS+86%Zol83wc9u5_{sd@)xs(I
z4Gkaq*lVBqe75#I>SCF|9@`EVeU1=cL^zg<uGNdep-150!5v68Gt0zpKyGP;tWJ9m
zkPgbiw}6|<f7yT$S@tX@WLHDeiPE@P`)}x44_6G-H&V{umVXa;>OpeEL&TCigj4h#
zG7<WyDq6(u!ml_79D-he3Zav;AZJ${91suDn#*4-HtF46?a?hxCUJ9zL%DLzHPM3_
ztdQ>8cZ7l)2S+@LNQDzY`3)AO?AOAE#7AwOqvveM1_56v?h3yWoMN~?W|X0!NfFsR
zPlF?I-fl{?z63z#1kNOG+#iFuk(#(ZKHW)YP!Ctr%-KJxI*PSGfeS4?{Yds9bhs!u
z_03?X=pdB{(tGRP^692vB(fVE_vp6sos~Xnl7M!1AjO}7-(2eDjELU_@=F7cQZ;}<
zzZNPgS*oL^Hn8geO>Wpk?3YKW7i8m3m$EZdQf&rtI?WJt2aRm;FgHM>(G_MKfWu?b
z#6XknehU*5ll&?}yfE4=I~f_%KA1WvrM4&1qSBun1nO0LK56fS`>)5>-rdl&|0BP!
zu<*)D+G!Yz^-tu(s|Doq4f?>Ez0SOpJ<q>AK{8J#`Z@~H!=IG*Ao^XP;S_wXOT}iM
z&)=yqk>H%$o7Wy-%ZSc14oC!{qmCYV%wb&CJ_!m63g#_#?L_lx<J=$0sL+~OflDw*
z&vUx592i8{G|U^|7^(@rqBu?Ge;}1`1pD%B;RhsCSx>mvZwL-~J6K5E?DLO0RiAL5
zgK~!Ng8vU6D31cMGx?ligUu~+^NNV<geYCyS)GvA4QAK<hQwFKuA^s=vFnYv*zgA!
z<7yj_i%kj>UaHPLl1)h>{c#VNs8LJ0HsOL{?|oT(FTvC)4JYWC4n15MHw*2u_XH}s
z3jVO@pZ9al7Id&-%lj7io<A3^z4w`cKAb$ilw>vm$$Vd;vo~5{)XxS4BB_95YgqE`
z)k0KH2KC&?9y!Cge-BK4Z^}lckZg61$nUiP_&Z_Ct-IhAwAQ%x^ZD`<bYI?_TR?qf
z-zf<51)8f+!dncLzDEQkI6m?_jujt8x3!Q%Y7ekRNLUUqdt#H?4#vgC$|asVjrvNm
z7ipmTTpXhg0=vHU#%8+JzIgFs+6Dj(;fym+-S_+XD*mv9(lsPN3hjBwK0yZ9U(hT}
zl(B;ju&sFy`_VJ8=Vp<c?fO827-n3^uc7Emam)%evkly13od$F+tNYUt2IPj?%x*V
zKtmt{dA1c^WVQ{qND3}}t-s}_SEw;FNSR|K@+|LSU}3!W2wWIc<q7vngQUN)_6!ky
ze>jOBViT*1xS%(Y|Bodwc<~jEF&&Oo7BoW?YLFj(LWe|OKWqv^UElY&*8#+A^(y*m
zzAeii!R6Q`Bx8(D4&R;yaMiH-e_%{x6~a&!R%3PL0cteu+Y+#_Sg6hWzIc=m|6S~V
zPZp8(|DVz9#9C_s4ev;t!}gD+9PYl#Jp1nIR1QUw4A&LNNcElKxEZA;(yu0W!oV<p
zP>!|#RNe!Nqe84+j{|ok?5f<?AF4*2_u$m)Yc6Y7W`m37K8rg-FGp>1(VUUE-@we<
z<h8}M)vzvCz458fY1!*sMmTT(4^GZEp831b`rmV%C<?Ew|LJe&o}j2h|BCZ55}p4}
zMEot$cE$XIRR3E@kWEZ(`v>v*_ZMG)o67!wQA1<*FT@V0mMReZbpKvT7XG_}pM!{$
zd?qZ1hR^on9zdZjh(Z=XMeP|3rN|F>C>QfO414Y0NaGL)^w>nU^qPX|uZKkJd%Fah
z#e|h7rONM<dH#ju-R}+BF)}^ZwxPAi$jPZNyGA^?N$y;{ljHLU2EnrWe_TVUGFUwq
zKFofZJfmXp`I9O`z*?x)0(U_3;H?aYvzC+$-t@wW$*dOl*NJ|E+r~*iYOE&9*y@m`
z@C)z^POD$+&(IC%(#f|m=1bGtbje@p$Y@S-V0W0lt$FVDPq}P$ujaC|i5(=lT8x6Z
zj<%WxoKbLC`Ou82NW%p{D>esHn`z)5=iDqtVtZZ)iN@h@)LJjdRD{NxMn9?xnU2iS
z7#1@(S%=#f*Pb0OeisN3FAF?FA{+8aX(^(`n|lsFG<9+k%S?RmP3t#OXuU42bJ{GD
zXM1qJsU7$0Y>lfMvjsJz5RD=#b)KCye-SrZUjB0XG-pZfBgTV*j>#6|rFEZOQe0e9
zbqSw1UvB@rl+t`o&k@t%P+O3#Xg(8CJQ6Z?=iSiC`;n?;L%YR0Qw63|N$m@dEsQLt
zM1!DC|1^`-TiY*_5*wU9@U|hciA9YHKHdK9{=I94X{0O=(0#2YE-oH2L_unII>l+e
zYkyze%bgJ)h6}sLZp9&IjdKCte8`h0VMBy&EmAo2N@dILP>cjO)8qQ5*aGHo!|te;
z^tlDLXGFtHwOzC*!1AgCq(V`<tEM6CyK~zYGIM9iMOUt5H;fd_)Dz9yoU9kwh#X79
z!6v=X{_Z#+iEQF2reTpdcc;Ud6jQ#pE@gX!=mJyGUAtV9k?1ht#<~+LE7s<@7=|MI
zPWRLl&FV7l=_}KbH04hX<6Oro-bMEdyu=Ip($e#cXS^)E@I>hS@5ao3%9BS(j=o_|
z8vawo@vY}$FcEw6+MZSl78jp7E#cQ@GGVEwbT;&_2N88AAEddYuer?lneX$*`^M!D
zzByK}PlgLN?z-KYa#`;I&xQfxEr?KT_(Tkwr@i=%uym>7x6W6Pdk2UvK9GK^Ta-|_
zG4DAepy|ftkQX$+^Yrhp;ZlL4Z2}q$Hau)Mgyq|{VCnQx`~PN%@FWnUy8T%j261@9
z;R}xiHGVxT@I*9)xi>T=N#o%Qu|8BF7QT9t$W7!UHNEPJw>RXX&G3c$#^(us-4a5_
z$wENKJ6CqcCkbV5xKM0(Ed+cHnm<C`jG*-*$55w^Yj;x`^_7S@m5?--iSlDc@H-BU
zYDc;GttE*!c<lQ|7muJ|q_(j2%!WQ4DRk2)#crkfe_^Zt>h@Y8+k8kwL|aXhGoT(y
zk|xJ}2VZ9CBmO#a%AcS-g#Umb&4#Be{WszKddftNOG}FQ!M!7Vdp_<uVzhDB4KD&b
z|F{?i(mWPC{>@*-w!pdPA2(*+NXVWmMb6gz@_(M~C7tljE;|#y&nAv4G#d-IEl{$+
z-VpmMt3WDxXlIQ3Mht?(g9uPf*7N_#I@5n4AVlq?s%YQZNSM%GMnnPqeuQ+tivSx)
z^~ae$px&7K5!m$JEq;iM2HMtLUIG)qdwaP7)D^^@N_a3y@uT9nPV1b3!5$w!KN^aj
z7SLvD<KvQ~8;70Ck)ozadic+}t7o$x>UB;-Un{wxFFErkLVmy}-|)Ei5zSv%rfwve
zpD$_Ww|Mleo*Kp}K+vM+;NuOC!2pioPSAm0-wPbi7eTPm%W%w4Wmog;-OqL$7Ac0@
zO`-jtGW&-~M7sld5k2>>FM?p%li1i;@2e0oyM4CTlus_E{&kB;@!btyj)!qZj>>8?
zaKewWahxwust8-wM!Q6yS&`Lh|7_3>YK04a%(NM*KO`ZcmrP&c>*I6PX*<b=Cyc-4
z2y**8&458iNK&nF;REk?i`yKuVOHfuBwX^xL*uRZ`ueU6s&&ive|Td5dslPt;K9tQ
z>1mt9l>(&%wc&XN>K~W++XvF?AWf@ur(Ls+KK$@Qc7t_eBM<ZV{(LBZ#|;l!08(1j
z8x<r>4QE%E0S~Iax$&z!AlS;@dz=Jp;Qy5uDap%6LxwG~Z2l^gPR+Adf$ZiCkfYN~
z`|iY*en1LM+q<Sve2;k~8FQx#AK|s{-d{LkjQA39|8PtG^8=|Da;QGkKRY%eJ8tO`
z^&YOL&EV4ita$0KFke~S4Wr_u>Bs}#fI46Ij6|S5Y3Nq%hRv0w{)lDw;qoslWkFUt
ze`ezsA<DE0QKmn?2&^I)Zt_OtX;KvZvyJCU@nXL+N2cIM7~vqHItnCL6~OhqbAvY*
z$}EMxhWh@<T6OvG>JbeR3Dr`g=QSPve~SPuCRj1SbzAxkX^#mpIxM1op}o8CC>bZ7
z4eKd9ka<(R9xW~tR3!(t%h+7}a5#qi1@@z#>+<=00Lf|aptpF0U{{fHp27e<CxTQe
z+<3$Qt{MKWovS_ZIDpYfy|Yk_GY01P5TsRag^2^(aRBfjyc<EisVtt&PE$aDQK4JE
zC|$`lOF01FLn_=huFQTAXtSpcfYCBCA}&w(Z@_8bGibnmwN<pEg)tJh@yB!U7tfsY
z5oneL@Cu!dxYmn-y7d47@)+isdjc{G*UZ9w&M<H^z^fjD9@>zN1gjQOyy#AC+3SLl
zFhBQL&aL!Lsms!6AOh_pK!sqZ)&p1{U=0|@O#|dl){qkg!sJL-n}vz?hJD+#Ht>Zz
zqM)1+ZOC*UhhS15I3+`{NaKW;03;w@v@&X`v@~Ags(mp)T|2)Dsf@HiY}JJDk&W~$
zP9Lexd-ncs&6SDIVE++!2y9e_FaDl&HQ0{;iKSq*OEW4ee`kO_47rWFu}y>n5TYFD
z#N2m4{QUxAl`a>tuX^zb3qJ;3#Uxp5v@zRffl=2$-BPA#K{-Lk$fyJ<B@&%G1zWR9
zC1KLK&AVeXF$nTEH=tBjS}DwR*qvQ$2wQ!yI^ii9jUa0sQ{L<13ZP4IfKrmHf4R@f
z&+&{y=TvAQf>f3T=$xckI_%r+07f+y5Zexn7EZmbd@Ec;YuDv$CjzL-^-EoAs}Jp_
z_O)aH1Y`94#0tPKS|5<hL^4(0lIMHI5ApSA;xQf|*ic4?Bl5V)(i7p6p@#ZBFDUdw
z0Mc8y1Asz@fZSyf<gEJon3L~2hhpfC>{_xx(1ZamH@B@ZVtF;^y+sg$Xx!6Q@+BZG
z8jw;_jJA<Civaj?J)nc$dn-QoA!lqVC%1^ZsCW0cte;aK%lrJdNI}OT3fkSfQ`$4w
zF_N|u#<8E4Hlpp)bi_UCT)2Dp?rEy_=j>Cl7u7le7F6Y?S<kcB!()AbOQBactO!78
zHUQqrGbp~)Z;0StWZY1?6DKi~Fa)S4XZ-8W)5J#kif}sju`n7hxWFO-fU5za-iN0R
zA3sdvN4)JTARO&cis4g0<pKIT2mc6x)1G)0&H(Bd2B9}PAK!|O%S+DxANJlmp6mAg
z<BljPMOMm+iin1tkq9MwW*3F*y?2xmTC%dUK31}_qJfgVBO|hR_P&o(_5EJg_qslH
z-~Zl^-#;G8XS~mKoX2^b$MJl<Ip9x2tZN_E5oK`p-WhehjcO0snpF^4c!ZF0(FLp`
zI>AmY2^hqnUVDy^F4DoXi0s+~V5*O~AR(|=BEtiT2{T|gc9m5{|Fm}h*p7)uqG8yU
zPC37P1d0ULb{jSF+KmI@UJ4;bo2U$)J#5X10?PjA;AwY96|O;r8pI&7)l`{ynY%I+
zxl~eOoK?z=n1;$rd}|Xq%^)P8>g^(Pe)M@bqt#*ZIYb>rwn;Fnl?c{_$@aQ<E#Mu#
zyLc%Yt3H<@qGGw`;FpS&8(b`0o6=X+v=sA?x(*z+{LJ>v34;s#lQ2_n1I9@j&4gxv
z1BcOWj7TK&W{&D?4(c?C!b-in1R$l|it8Nh&oU{$80?(_u(()kk^tg#%}2DCBn``2
z0$?Y3p<SnJObC%`n&a-#o}bGjz^RZCw5AzY7~(18bM4zn*Dw%mUqeK&FSA?|CuVeX
z^o^^9EY`Czao9aLQVKLsd_M>#uVt#tc@n6Gqd@I9PMaYZF#*hum<5y9`A_V{CQblx
z;vaWNg5OU8xLV&d!_;aV7jgMk0W`>I{RXKS2yM;=7rZVPFz5kvo7gm~gts%pH%0z_
z--7niDcg!{^My*_ri2NBsi4F;gU)u7Fwt{a^@Xsl)-yG*lZM;tfV9&tHY-P<P2PGp
zBUYG8FpI<mM)6NOT=YMn*bX#WHD#cnO^pK56|=eu$fE~nj*GdyeN(!@sy*v(15CL(
zf$Q_b0K+q@r&|Wz&J~yucCD_!8fk(IjBe*D87Y!}$Bl8F3#@n)ln82BA+HCJK<#>C
z`<=LWyO3PVi`CKF|GH_TAXX1FX1=XppcR%1wMOIF@$yqI;GvS5WkCyU4zZSGPP=^F
z+YA<QUN<zlu=3ZwQY+;r*8q&1?j@PoeIyZlpn}ni$f@&Py_@;G3M{znv$Ea_4C}do
z`9HOsQ%Ry4B5_@e4M}6vkG}!{Q?nkBkTGl2uJes2$HI-j084um=#5fUU^LQi2RnZm
zXo4Mm02$c%5;wHoEbv%%3Xz;k-*<SmxUO&h^JA=sgl<%T6r@+LLNYT-H7R@wVX-KR
zzrW8sHAoT{&1*IaICY~>^~-1)L^%$HbIeh)m>lFO0a9TznyURVu%%r<6ujF&u9E`t
zol*1kCd~s5@UQ0vU~Fyeqf$$!5OA>xmP%yw?0tjWn-s;JnpPi=k0S8ZYPIU?M`%U$
zzseyXs>s&Qa-B4823C{Mu=+H}KqyQ+Q>5KXj#=%Vb_)fXa|Wg}=)&f5{E$i|vNF0n
zF`b}Xpz$05(b3LC6k6Elwbx(ogBuKXav0rst`Tj(<^bK+Wk5|i&dO4AApjORz=If_
zmI}%?*aYX}4zO*WRi}If+~R<kW`|xzW=cU{xppwKUsf6P52g?slFgh9k-TkX^@tqX
zT!xzhr~{K7$^wGz5K(n($%TzkSuwe9dt@C%vi0gu0GI2`N$X#^08Mtvz_T{Z{v_>q
zX`~-v{JO?-o!+2rb{;^={8297KaP&;Y&pIAgfvldfvcaSW#7$}@v@pFgFOa`0x*0b
zz36RM3ADP0Ju7wgNrO){GQZyH1ysKRb(S&}2E)VG=0_}B8uK-53$X=hzzQjcvcC@^
zM+DgPfm{X~Yk>OYN!h}F+(v}AZ5^H{%A^*{Oi<51XA(?hu^^920<zx$>$_gn0b<M}
z;q@*<>Ei(JjA<9N%gpyhU^$uWfmTG<?$@xKb_fClTiW1oCePWRmjN+vMyyK`?ob>C
zdsn<|#9{A}C4Yl)V)PO^?fCI(6Dah=AP>%yV`K#1a`qWm18wZc5#e$8dxi#Ebxrkw
zvTFQOmk6%Q)3!-dO4ECh4)%$XWy4ZQ?l4WH9EcBKmjrC+)<Xu1#yW&>Z33=wWm1)J
z#eT+Y#(FTcy>;I?tQil@Z$mP(wMe`j-yP{`_+FIdSZmzGBsA-%TkyDZHapn$ROzU+
zIcC)gFD8N}x$S%H5vlcx({2xuh6iTaZA6q#60wdouJRDByq7nJt*1Uc54pdIK*OrY
zc}<HGL*UDK1n_Il=K+$ZFTGY{AoN&QHJ%{uy_rxpY2zpD#g`aI>&YQ9+erZL)|%-o
zY;^Fk+I>7fT{5SSU&_EcA66xBBBN|`^<>XySZA1wJZG+zXV!IB4!AGBYqpM_dzK5R
zeeP0eyigfnefHCo0xT_}?zF7$K<EAo>oa6GvR=hpyv`082-Dc*Q)6wxgKf*8xTSAe
zUX@KA&~(?c?6x-v%g(cw^NmOyBU8xU`wYXSgI~i}f(X1W$sHBp4HTcCw*t~~-4X>y
zd&S{w#w+&;dfr<qFh$uYbd7Tp*0=DFjh%L1xvGA{hkUGO0K8Q;JN_aNND1hBKL@P2
zd5mf)W>Q!VuaKcZnXd;tNJ@UnJpTOxGv!4mvb*XKh&B(g%^&xCe_EjAY6_@*NfKTp
zOfghqyUf!`HOvTv@ARk#x}ovNhr7msd&yE$E1~-t!-Kgroec~xMdoMUfp7g(osI|8
ziOz5p+fOdNyE)pFI0~J875SYcMxC&o8L*LFj3O9Cf(jhH0o^&e1zF;ZcGLdE+6gqy
z>Zx2o*8`UFLS|H0$HO_oU>e@}tSXB&!D${+gOYhNwKSBsqjM3Wv7tCI9hPIDXTfII
zd)t8NLE&LL;HR~B6Ahsa%iltwl;mhI!9^s5BX;fB+r!yAKLO*v^=?7Y%-g-q*1*>8
zH%tazt+n=agVT6MD8r1x5naE8bnTUDRh0}!*a&LQ`=9cnM%-Y_eHxR#Q3kKUe{JbL
znj)W#G$^v)$q(|*H;4w%-0q|SKTRj#h4Q4C6nK;q-xT>6=wYE(lT<~C$)$b{i-?e%
zHMmhq;+YN5!QwFu65N1<Zd$M&a0ybHa=VW;h$)~ueCTi)9B7)sQPj+9R?aCJSR=9V
z>yF?C9<xePeU0CBxrx{uU=mVR=?G5TI1ss_T~aLT+}W%@sjvXZ^A~W1WsW?|x}qfz
zfZ-iC%pnM?$hKM-Do|eEk`C&2={NWe-v4(OcKcdl0wvlTyh1|J{V|7TjY}`g-Sv3S
zh@G8*@~m!dS|KMp7U!gSVu%Wcu(zZS|1`5u*mR|D-(_m$788L1s(R)i$xh&JDpKc=
z9}Ohv;5@uwSLwL&Nk`lMUTh)mDG~*~m2Z!m`Fm!A)Nq~?e0AAc>y@?Kr<7n>j<cJL
zKY4wn-d!eUW5OVy`?`??-gP6<{W{<;H38&MnTt#07>O`TFoEU-oGHYJQyUHKm!b$-
zFS)ZyP#fC<KAy)|zI(cAR#n!i>9M|R7Ps)h;-jrXRd51Y%H39DF!z2JB)Hh2Hiy7|
z{mdx@#Y#cn&)y&*_AQspAt_!)egKiZsK0>=9I@IhC}XP^^fH_-<|BEF+x3D=CMwl6
zOM1JbNM(5AmR+e$DI^|*1Un(l>=W)@mpL`#R@w~8H$|!?{p7aORO}MemX}(W@?meZ
z*<2as$*=?ruVi-Qm&3;C4-uSo9(?b@uMeHSl2pZtneOsbM+RlPAogFKi~krD5DAHY
zaiCk8^}@}-d(-;m2H@DR1%V%PNsiHu=;ZE0)z*vxGq0CQvu>0po%`|~Tp32eG5O#K
z@=RYEeWWMADbSzll63*x&x%sr<Y;gc6PK;<uIes@I5FO5xP0*Fz3;xlGI$|IARP2f
z-lf#a`$~sW5IPZ5m`R>xI73!ke`&I|68T7KRc2g8Nj>Zmr(H*{20W;Ak+^jzv7{38
zbJ{Rb*P{V=uTuCvND`3Ex0*F7Nfb$(F!t|N=1;9^Q~CSjGaA~0$Cw08!=!ZkX<35d
zj*L{h8uBSW8Y6AM%u}`tE@lTy&#n>Vwv3y)2nMBU0AybSlgdp@u5yEdU)K~+#<uUw
zEx)O?9AWG!G!PtDBbp|cFc6DP|89`ESnugX$wSDZ1?1Ch|1P^rq0|yI3kX(nTUNeC
z9P8uI+*r)TcB~XO0J{{mvamkZ=@G2Q>LerdgePU=dYK*50~tBHUe_fX;7)UC_9KlC
zr`9xWu3@%*-?a9UAuii@JHaIoRk!c94bXeR&?m>wUOe$KKUl(F%eDVi`mn$-AIFWD
zYFkF)IhgUUry8%3JW1_NP3&pBvf#z7s%5L(dGoP!6Zj{}-swof;_1i|aLRS>g{naG
z@mqSVSn@`YMGZLRYPM?x#p`gK313m4<fq`De!u@pZKW8c)|02F)Xo{JduKD0lNK}j
zs15qgNR|rA)vUZ{5|Y<cE&e1k(KinwL<L4s9j#1uOqhbU-2i#D`Ec}J&3Pv_3SV2~
z1Jz+xkeULL-D;Y67%<%qk#FYwj@pww#k{x~J<PjZ7?#OAUs}m!G!hy2ek#NjYOL?D
z#Vk4$bmk3g_%D^UnE?Hj9ak&w=<BCvqz<E$Vo%eI-B=?gPLO)vytBoO{bXOWw6rnT
z!4$!G<Ft%?Ns{!DO5cQBB<<I##bURXLfMnT!zaR%Js>?vB9d+A%{E3O*!ofC5=^`T
ze;OMjiHUe{h(kHQ_zePFSr#VmqEz|Ow<kzM{JMQuBue`vC*G4vJb?g}-6-s%rU6@^
z*CZ?J@Rl?}Tk-ULQp!9hO_kCk?=#OCgGNQVoWz7-?ps3E=O_}>@j1z=#@0mL5pE5p
zbtl=yZybJ;dCeJi!3s<HjQhN1Y`8276R`DF^<R5coee-`@?y@`AYLn$pvT6X$f6g0
zO+VwSWo^VZmZ5lLW(5>twDAl;U!b%|!lU9a$;=91l2yFl=}JWDGX^E43A{B1^yp>q
zL#EO{>2`N`GKmOf-0yH?wd2ANt$3UB9>J@|4S}n8KKAJy<QKD5+%s=uH|KYu(wb>b
z1_ib|@{$GM65R3KYHkTcwkGl|frf2AcpC{bVTdm}Hm<mI(D7s7n0Q_p=N_-uILM62
zW9(yi&yPKhS^riN&4|lvX>x7Q_j)D5&061AN$_FpBgTmW#J><yQK*=e&SSJ2_M^W%
zxC}sft%*&A>1&w;YL`9?0O?!`9q<e?>>)asNbsEV2%-vS56Dj%+FVl3S?FK0oL8DW
zt2Vp9E5vGNvL~2`yCc>pwUe=AteeR$!>`-VNP_xJ9-J8|bcDcD42Uv~2qY+ZVo&AY
z*8t~}Wekm(ZdD|28%AJk(uS-zVDr7%g+Ot}-tX)B?V-c1yIdIN$uFiB2AKrdAB#Tg
z#h-ECiZ@e$=dIAJwGv-{RIrD7+Slmsc?(`Q5%~2y5)nCB`wU%vQ9TlOmckZVj%5YI
zp><=Tn;{Q}ZcEj9I{lvGAn-{Oj0yrfjt+vGDXMyT4Bxz3Vo&YxE=v=Y$tiA<j%AR)
zh(J3N=J?j!g)I&m5VxBvlm-ZKsXwD1OVt&os3*uM)*F|01~4#_#XV9Hr<SGo8ja2=
zVYCtvu0tF#sYRSa&p=j5IyAte0X#B(;~pe9U;~PMJf*MgM5t}^fREnRZOOcPZ~*aT
z{TpELv}r)=lV}>ew<U7^rWRTba($8!mJ6cPvlR70?Y>$uNF)w(5@D;;dICG`g@Sr9
z3l6gZI?otvase^US)x^9jSkPx7QTQtHF`GKvF4ez(M};#o}_-Sjy&cdi-3Vjmf#az
zfcI`fDuLnQ3%Ei^)oOMsk6*Vc0|_sVz3g1eD^6Ou1Le>h4uT7puo~uR2xU7NbcKZA
zyA-`Oe0TyEL>Jur8KYSw_?|VM)Gqa?hb12)!#DUOKlC*^E1~$Ftrm5-?O=#11;18B
z{nf!RbHn3pDfXI;u<5IMs}-F+OcSD9yg07CRK##yo{D8<%Dzm4dO(@4bB}Zd`{_LX
zH}7TG&`E80Usg-oA_P{2CLw}iYafz2F$+yUJMQ~3WsAReMMkUIVX0^4E>8<lb<17q
z-nr<d=OEb>f8%O@6fdB+4P485>MmH8bc)igHt0^lIaXmpQ<2s(w}leqoXCo`xDF#>
z6%_T0cukw(on1se<HuO1Ymg!#&`(k^B^2TstPUp?y1EvP(|x2Dk{dtSNtD{pvza?y
zU4_8$mnnsz190zCzk{be0c$pa7Amwjf7_mtY-Thwxpi!>z^tFF6k_6IUQ<=!v`&?u
zr7Ls7iP0}E;4G5GvqSm|O0N7tM*fVEW3E|`g)ic%0(LhY<17~T4P;?r4!lb@M!{T$
zOPuY#G<xxOs}H=<+eFUSJUo<3TOzt>P=^qHL|eOIXgIHtF$6SBoLncS$Xu7pcMCOG
zm{BW}S6vU0nB;ih&Pc52ETslJe$Xo18gLp<@<yC`%MzlONn<UUsNmo2+#mL$(-PDQ
z5|af6Pb%wdLOe18X{GfW=PgvqgGZo|U<n}j=fi^2jyZPJMx0wn{GzEjdNKFzIM?1~
zqy$!B-MaZjh5b-fJ7TU!?m3PEn#xu7VkzT*CXX(Bu3bJ1XC9+YI0;3*CJiC&H_Y$g
zVh)N+qejGf#g^;DVdM3nJkd>9%7|^U0yhz?L3kNC{NOPZt`3NOQ}4EzgGnxgIBPSk
z?(SLAXo$p~6yXhF^xBQi7TtYfrAI0)r4gA@l+JR#0jTN^b(15J($d&>I79He!aR&=
z=V;;I#Y(aIWmqT_!J{2B)Ewohx5%qJxDKNjmVcC=3W%!|DQd$xBhlP-Ttm5c4FKRz
zk*Zlz9`cIB4kg;vw9buyPoybFEpH2UxYjY4x03}|HDmyS6Z-x;qz8RHuHZQ`%#+**
zK?ju42GIki4(aON@eIofe0DvK!}GP~Zq_P-*^6*8{gev<rPhqR9ufs4cI3arEN)tE
z06rdjbXkw@8d5+V3oF)__D!hN-JKxrnGOXBiC04@B$`?yo=zXBcZW%#M1#M$xm0$6
z1|k{Ci4{|lNK9@v&uQAlJ7b8@s#VB<`obc_xXx9Wspbu|mu=d1UcW;W(YU-xyr*UF
zX2pQL@$SN-@$)IzhNPSr73=nV1Gnx#={fUjF;7)byUfCd!CTSJPlFn%=&fud;Z55A
zD!Mu$d90h&&e5-2c+m37pktQXLRvGJVx{%`o+-xcfDcG=%~4RKl<{u$URUs~rI3|u
zw)(`zla)6RpRxu*S0)aw)QR&jOa)(;f<f4athik%2riaVc<a(W`(BXfA4!7h6_`;M
z$R%zg?W|LFcl|srd#maeL=v>;o@3Z|Fz!Jo$;CABH!0|7CkR2U)3{gjBq~DM{mQFB
z@k_|sTV-W@At*FpI?pOsX)-Uw^M%u!JLsT-G01)_@4^;zFO3JO@h}im@b|Uc&DM3B
zcuUHEuwuV@o?fPhtoH+k`%2Tp`6YEA@){;CA(8epaL2U?LS;S%W~KekC{Q_?2rhoQ
zdjEKKLaXKg1RD%UHvI>xkY1jeBIC%?_t3JE5Y-RQj=~KaRlL5l64veU$F;X}F+}&w
zT{AYza2LFAA)Lh11mTtz^7zn_Z^ustt)|jsyW!npFgbW5d^erT3j)ha9~iw3kPsS@
zEjx@U#;8x=DhY|7vfJao5~2rAi!908uzFv0ZV1KQT?)1k_kO|<_Wc{vg~CaV_cAQm
z36iBGI>AvULx-~!tGC0yC*_-AUAHT+kqr6mP#PqWon7nrn5r$D@Hw8O|2!z(7~o&7
zZWr5e%<EL(>SvP;URu_mj~JQ-T-<EY7Enm(<P@O5NwX3@)fTuk2(!R7+qoVh)T-Xr
zMiS|za3_%jRQDpYk#vCzUYv-*9w4kheTRJ7VUk_#ERWY!l4!!U>PF^1Qm#rFUe=6U
zg?AqHNEUP++$2|(NjN*Gsyhybq`js+TFhI`E<O=UHr_tIg-%wwRjJ7k4fT<o=Wxq^
zd(<tTvAlJpG<9D;*LcqKf#C9%mKG@jwxH&K-q3qz=4MKEM`JDCHiXLBWZN{c@RNNx
z%oC#a%}x?F6`98@PpR;TPrS{6IB-p&5+02@(NhyJ;3tD#VfL}UpQ6`UeoLv}J&+V>
zCW%gg!wZM9_e$b9fcsVOUS7W5vCg^)HnvC65cW(2SJ18QlWzMHUY0bw_q)4MM70Tg
zldg&U)Sk0ipwV%aWZf?+;H{vYoc|eFZz&`eHUB|E=v;-R1_b?1W8Ml%o3~vTnP=;H
z49k6C^eebf&px&AXJ)jnRnA_slv0^Yp`5v$zb5Y(=(KR@jNx1@&uLtd#`=}}yyw{8
zUOeF3@lKz83+&Y6=!J9xE>KZ<D^KEm@(4HK@}-$ch*{pqo&v4dr|7JOIXGazIK<gW
zm9}^}r&Di&VTSZZbfGx^g)|aQbgk3HX~IQk&^})ve5*Y9`<4Fsa=5eKbmMKsUWoOd
zQ;izc%xs)0&#esQlf-dPIvHOuTq3^cX%=W-I<BR)XJ6?`tvS2hYriOYmMRp-L=r<a
zEIcBda6%s}p=0!-MKSJAUlYCu>fVdv)n$FlkS(e(@qO;WJ%O7^K`E(=zUj;QX<wg&
zX-pr=9#O-)s=nt$?UCb&Wc8HArz;-H1UtNERM#Fu(l%GCe3jV`c|HPrlu(_uz5$zy
zP{l*#K@$GqXReE%*;+~Y8`43XPuY6gg8>oWl4bP-Etes60WV7w+UOR(I#ELES_b~)
z7(2<uYzG~^d$Gu7w`CimICeBTLxdZ@yWn+kkfdD)4(_86DY1EXkkNsxUYWMup^~Es
z!cmXId#maMET`w{FO(gBC;m9zK)wsfi8r2Sr$u`2ZzeOCupbwplD+vzdu9epUo)5F
z-rA#br==QX<l@P{@P22pn_C9op{05r!$+d&d;PLEKBz9{6}8jBwa+Ms1%)q^@;+q_
z;uyP$WSc9dSV$H}Z@%Bp$rR&ar4D+H{uWZVcp2MF+XbXzzCTG;^2veQxL3MM0dN>f
zd#o_7eWXeX1t3n^`T6xVxrn!{BW;@zgEW5bQk1Vj^6fS!y5gf87fU&2m+O~Mu1<p#
z+x!<kF}9r_iM`|BTUjSDn~v@^BH1ac5?n!Km(@-=!gzgs_`>O+0i*eetD)@D*~N-@
zfnbJdXH9r|9Q4%TVGUkWRQZWrV|vS*tv_Kzeu33G5Z!cC1gQ+JRTaS-27S*cucy1t
zMfRGke#s;4C&+Oqu!nn(B>wYxrLXqxl7!2-*v`=#STE=BA;Vm4$2S=>`4yZn8kh4A
zZ6-Bwv-LiLREjd!aS<X&Te#i+6&7`e&oR3gKgJhBhatpq)d|G4)Fzk}3fH7djTt`7
zWo@lyoeWB03Fc)<?a36Hb4-C-R$YUzg3?LTQ8;kN)E}(if}@+PrAxC82v(6@XdYyI
zs>>9z>=;}w#-FgboYT?G>_yM?%tD0!lTvNqVcmPt%S?9m{@wTbc~k7{#%|lq5ky=n
zKN-YWjBcCc>co$jKB*I!t5JI$Ql-G9&J$D)yHqEUwOAiBE?2g-!aRKtZkoXAB+`(y
zG-*XbpFB@;AbBU52d*Z>=#%VJQ^oTt>)XJ)=QnOof3=0~1yBT$eA;qiU8!f1=`DBd
zThZE@PrYSGadF+`eJ9EJ0E0{(hWlhyVPe5UgnStwK6V@8a?)848B)M=*@v3}K1Xi+
z&HY%fyZW}^PXC<y5%^)=;PHf#KjeMTsX+)+yf}#Ct&OZK-Drey<`t57Uw3y%V`u_F
z5*<7)q=J$Z7u~t4n-OK-!v_b%G8V6M;50&RMn`Lv?|H^&WEY=+@>#)G%d3HA>QpJd
zati6>_0%_FR(2UOv{=J+5qZ5!9pCWtDuJQ1V%L%yeH~F4#9vN&{jh{cMJ9W8P6PO<
zTFa-~eCg%+=E=A&Wz4#Xk0h!xzEzpsFD{qZ=$nv+BAWfKfpT)!e0o7{=S<d1-BRQ%
zRZQ{E%79V|MO>r<Wi_M7t?Wn{>O$^b`viGQqydaHlbazV+NiMcm9!rGzAZtqNqLPH
zGJe7e{+mCk^k?U=C-k6v;dLU5OR7Uehhx9@@7(H0WE9zN`9-cQh(HI4a*7EX9`#~r
z*LF6wv{CJ-aAj2DY;RNq-5OheEuA94{z68ZY>=s<xVi-{jFirLKV`r`sEXRH5QbNz
z{NFG2rFLtX6=@kKEa<Anq~uw>Z`81y54q@3eti!oA!4c}L+aHVr)#p<%T7m7awH|O
zbD_6~>}lH!>DV<~l9zr_+OM}ZrjvR5nS<1eAc)~#-gxvL^&@m6JJo9=ta5)K*2q4{
ze@1D7{T9^7UPHGet+Lp^W?mb%@})|e?u+Hgj6#&8^bHcYD9dI0;)^9H>bdT{=a3>J
z<U|N$g$5`?l4X?~`qxnNb`I|*<Z3X@72E)5TE)HVke90v@$6YmiS8hrDPvtdQ4UZB
z3JX!bhBMX`q$9W%Q0TS8a{_@g`=|M4muRr3qYK9(4CNe;2+CAA3FMwkf=gk;zKlUB
z>a2FtYVI2g#V=or0rnA9eEfta{;NCxVf!g*A@{k59-gcr8~km72W=2KwfV$WP?HWR
zV;YG8>_sU9d5~u*91Wf5xG1Llub%~3G{Udp1uKh5DN?^4EC`?dRi9TbD0nGlSfe-<
zSedN<j~`}o^XAQ@w~!tVNzNQu*HE3Aq3m^twn!sU?N+gEU+;EeNT<dJ6CenP$I6!*
zI5`!WM_-J?_MkgFeV}+t?ee7KmoxyRhOYG))B3U>6{gql<ex6#$S4}aC@=iFf&Q4U
z?*n~<P;0|?RsNYQ^z(o2qRyUkgUs3=efh8)#hlRPA%?5}Lnfrb6(!h0vZ-(n>cu4q
zTWk*}?1p-A;bv026P?@VH;^%TGH{1yK0b<k+N=zQ)@v5Pon<nFJGt0k*D%c*#D!){
z!1Q(hL5TU+SxJ8Y+yQ}AxdOspsj25FUV*AulVbGuC&R-g`M+`jFt$OUamEQ?s@YD+
zBc?a^baJoyk#2Bq`mM4A))e(?$KV^pyvzsl;a;3MT+~Cjj1Vc3kKUnMzplJOspDeV
z2`9U?g@-ND9>M|8NH&ph?D;~=7vSUW>)`+AaSy%%4Zr3Dx{VPVxK4uk1UqsfOwf#k
zJk+h5sMnS*`(W)>ZPf1z3781pBz-6y95ZfiZl+RAM|sG(Cy+#M(zgHg4bDqv0`K3?
z&G&^$kNFH3_TGG%uh>4eoB?v$P91b%Eg+w{E%uQbhC;p%a?-3iVb@5L>@`dXY~pIy
zYz_`i0FGiO8MxYha&sUQ6UiDbgS9UY?MIDY6M+2w{+%zXZ|sKL|0H-)bqctnbbF8w
z`zdAx{3$(@3aGCA?h<@0QXp}6PR1(baorLEe-u@viN%D=x$kctgi-)z3y3>Dmw@{%
zWyqnOc{3WO7c}}VSspU+|ClU=N@A$Wpd{u;PQJ~lnU|rZr=lz?sW`?pK}K;@t=>tc
zY8*-$JlFBZfCkLOq?oEyUFJZJJ+@8(B?1N{4H~719<pf)L$K<CaZz4p5oj1TBh$qF
z`R3zX_{Y+KUP|~5Pe*VIYIfEh$2vH2s6z0|CdzGWm6nw?(9Hf>*#R<m;4nlS3CB(w
zPC@mZ%dRkBmM&tIM2f#s>>t65xGHr5Yj6-xALT~7QK1CsUtoSu$GuQn(vfJ<^NbxU
z%Iq%i2bh{qw6FECO6J{x>MF%w<y<e}D?cfnz}|6$FkPf{AV2;uC2oJrkN=kvuJb5k
zIjjL&@WJCpdVA-LpPwI;eF&wUhb->SGq-;B1>4@1T`sWIS?){4-mUa<Xwzxt`FkBA
zQmN7t!CpP`Z}nS$tXjDVQ#730WqS|S$$Lkl70xa7t9t7b{Eba9{4&-h$c#qOl>~nL
zx708QAHwnj0|U1K5;tIrz4U<ikP;dYe%8O0M*&gj_jN%~-Ssi|E1;=4LP8`H!v~6}
z8$<qEVVR5SnP!(FpN>jlpF&EgZ!2LF`b&K!Ny>V05L7N+X2S|z+AeiSLZuFWsa!;y
zTypV%f89<{w~4|F9^Xh{4bJc#DAax5uDg8U3x`SAqaVj2*s-8?>I7g9vCO6)RXyC0
zZuu^W8S#Z-6+=Dht?k`A%@}(bD*YKlWP9wC{{Bu_Bfl-q1SFO0U83l)u*)!lVz*|E
z@!X()ZzMeVS>#xw0mi4t<~QpTRph-(g3j$AJN|RZYk{)lC`)n$)}!EdMc#`y2?+_x
z{EL_Rs`d*}aZ4g%Gq?@qC3q~sjDh$`$;rccK>^r40|)orc26fm&6bJ(ukOu+ea9xS
zNd&9ydw}A!9fmkD`y%5}c0YV$tTA+)RzfZ@)-l;G^x}p>ufL2*1aw?GGaL3=P*iAh
zH;|T!Dm>N5$?1ISE;ld&ca>Rx4k~>^MEU#omk@eubxp=tH3Dm@5+E&M-RxUqhk2#=
zYh3<VR=8c$lq0T6t46}V0HuVG#GXTY^4`xGB^7IB|MuERyHlLl`xoyp+9ww6;71$r
z;Li#F<B}&8AM4?1duY>9u_f0fz8}LN+>R_nw%&~iO+&2f2AJ*W*fmiLKhlp@|Aa#o
zL0w+6t-@ZcMt&zM+*7;5bm9a`A(mHOKhMqml;r~6ock+-wllw;+H^0J4l8L5O!JtM
z7GX7Rht5Gi*Bpj^RulbI`)`ASnd8qXuo{<LsC+YLS9@0m%;IpaE5F(-0h5Ymwh(KH
z(jkN_t%>;imnmO|dN5wMTr1xAw6ymWSc6pCWw1G@4E8T0Lk^qr&x#PO(qOEM4_I&@
z=<`^Ohc!4oEIOJUleUysQ?2qViV%Aop6h9t6xMoG0wq+;wJ7rUY4N8Ozg^y$(ilSL
z07je>zz|9X;J!sXC;Pbu89ib5B$KdG-Y;Qedua?SZ4&s2u=yCh^pF*=l|uk}n7<9A
z@W|0G)UigSx9f86ABNH&d5$MP&HX$AtTCli9RJ@6xnDDMdteiGsk-$I`exogobc;O
zjd8(jofN4Jfvr<(AP>9MK*jP@T>0nv6jv^`1D;DN1o_s==|4ZnL)aM?ZmdJz3@xf~
zV3(lS*7l@YDEz)N3xdLAbaoZ6KvJ$q{#+n`F1<sEb}h@*2TF8z#~i=<l&rO){nI-z
zxLi^ve=bj?VP*Qb5bL_2e*j(Xum@^tF(LvPPnCMUCfm7{W`({J$*3`1*aVOCM;~^Q
zr+f*2;gZ2v)TzuF^TX2o^JI948%As2d7)kp*(^y)>amb?TPRIS6kb;|s?mvUjM<pe
zX$-*|1KZ=kf!#PwqM{h{w#TfHP!>X$yT9pPTdk0A^!Qq!-BFCm<(viQCV_zcxxzf|
zn@`!bI!V*a+IUqx_u9rp)2rp4#;5pRKX2BX7P6(E*Zjn+y}hoqSWvY*xhuX6>zccZ
zDs>asVLos{m9~-!)q1Mn>-ffdlM5nvfkXS7cQ0nR@OIuVXLf3<c0K1bsm^{&Xj(Ga
z^XbZ$oOR(S&8~C)e$e3~ik{jV1(*=qs;a7SjI@l*{t0){wRtXiZfx7n15`qIqSlb?
zTR2c?)0+5+_U}>2Z4Xxw+Lxi3ZRb|pGg)UoeY@tw3kTfE%~NdJiH<#=V~89VZ*^e&
zXX-mTI$j9o4`ZF@1e~yTZ%gK$Asr(8{=|MUwfl|14VLjKtq4~Sf%c%&ft$fwQ!0jB
zxZR{FM9=0~JenpGJPz-f$81jdY20qWi`d{v(5UYkHPc%d8?l@0*3Qzf>(-X|eP`}M
z`9hCn>d0rPXlNkQaBQ%VJj2DJqi%O>MmptuM%}09($(}2N`=e3Ev&Mv<3tqaT;r^k
zBB!kj&A#F}E%f&4yQ})HJ)NeRw!B62R&Z~-yFFHmwJM9gEE>cuo`2uZ6+&lG(S9UQ
zMd+FFEtwN|JJrOC>E0Ibmr<z~Z<yP=<*cniEzBLJhRF-bMm|n?Zl^^xN->TzJ<(f4
zi{ci1;&`E-wrI;@qN8cwODszLSVI3Y*CG4_u82?eoERMQs`-~}UawwHZy4L1l2ON+
zmOB?`UTl2euFINiV{NQlPB*=JMrI_}pi8}qF$aXMD#*j?-DNvVlcKg>`-m5h+l#(%
z*>^krcwaM-yzwss^^e7I2PzdGUol{PCoq`sibeE7DSJ-Hfd%z&#mt=61LR$5IqkdJ
zn9dO329Vs02-Ya@(wHl>XKnvB73S=kaVWBR3wLhC++mU{M8Yn_{@lLsW#{)UZPV_`
zve+ynr~rUs>f?`*@{lx=Sg~6tP>8_%_@#k&*;0qo_kn5N=PR7`POOyBec|PlcBUYG
zR&5Dw!uJnNj%<AlKkWX$*qrhmXn$#{0St^(wc@UyCSJ^Q#oMcs=t7=zu-&lz!-OdX
zP!FNV$6B`3APYt<+y(+!98imSMY;NyA!?xvlci6IdeGF-amWelL+tR}Lg$VVW#0qU
z)TtrzqQABctep=N;R8CV-Z{aH@U~dz#jigJL;W+|^v~vD?_u=Vp7&80|3Am(U!Q>G
zTmBD62KP+=ha>A7p!qfBe>Z38A8Z%n{)d(D`<aNQES72a_b>UsDf^2F`X8Xg?=8ar
zO<73O{=X?(UiJTT=wkpIFcKd^CBY*g9q5MkZ~lH+9D)j#m^jnY-N2{=CJM<E9B@yc
zsK|mm#t;ASfByba*GW)hAE%oQLKO`Mz#G_TIL?JNj)2~KW*@xjUBj(<h=2YuivGh4
zj(CD^en5^5BsDxQ5~H##I2>{WAFQ&eXouMT?ys`~JAc1lucAUUbM-_ya2ZaOl*2hO
zJyN^@1UF2+23Sh`-jCRq3OTtCxO?;<PbiigdP?O<&2OpN9>V6}N;r)h+rkIW=Lu6c
zl2^fzn}2!U_A0sTzc^$6G4{fAsI>Y<^Aq5;c={ZKP{WVgFRM673%Wl%Whp#mLmBQ4
z=;9v)tBN4O5Yh6gBiOIEx7{8eIGCtiGq;B$nXnExh4aeAWWUuwD@RbfeTI61=!&4|
z=ov1qkp8etbWvpXLH##3$Ee{yC=0g`#p13^tadpQF$5s*)xF(`JqHU-!0Xv)mR;>}
zMYD_v7sAvzEaWqzkB3fTMGkJa%@dpyDqn6VBq2BtqI}8r`6^f$G21g;7?l8QUm@=B
zJW2r^3u#Lw)xj5c>WRyUX|cjow!aJ?B5P_ERy+{I5_aDM8h?mBgo(#@Sxvi#!G17=
z=Eau5d|54a$H4j%uK(|i)!0gD_wS|)nl}muXJ#poTTuRoqT%gHb}AEQN&jjwKfLoL
z<<FI*Yztw_wfvJ<nH1a2!$IkxvHDIZure<KeMyOF!y2#q(U;?>FEzr^vW+7L)zVZa
zJO1T;lWBc#umv{Zu?UE-;lUyk6SlvKz9~`|@{j=(3efb0F1?h{|E!gN{l)bpL^4B7
z6*=L9C{z$7zCvDW@MvGoi_IzQ_r0k1EFjde&wfi#5Z<YNFbqQVBx5HJK#;EEH$gho
z@4~7uH=&w=1fF&u=&aOVUutju6|C9rhyQ)j2LMKr?b(p0L4yD@^kY1cKkDM`#{b(d
z*Du5TxSKmmivn1a1Ry(;*x1+@ewDhhH7L69?*{pQKV}5TN68!)RAGT#H^c)CO8w}(
zAGo48N4o>R6_SJ#GD=mDa$Q2Q40Omg$mcxy?OV!);Z^&T_yzYtW1OHKq?uoX^$%<N
zzxCK3Rwkl<<pSg$`$d>53>q==tND42@W0Lb)m=77gp&0>1sLY?zP&#eU;jmXfUWdm
zhpnJ@d%D2*eINCagt2;%2P4z=CymveaGS<aUb|ojJwK8Jw?<5DQPI&;L*Lu521kC(
z1J7NsAhe0l95=iIL1-38ud1to%X`1tKQ-^a7kFm`kp1I0z}m+l;JWnux1;~p<-Q83
zgUMN)iN9Az+64hy1L4D?XW@xo5TnvGh=(cx4W~bU>Ngjws2}6t0+(uR(b)momy3oz
zI`s2zpN@FsE_62!Sq6MCkO@<^g)UK2)=@{RU@#h3*8{w~0s;qW0?zhmX7=v69~6`X
zl+JiFoS4_fF<!s5E+e2o5F=#z>?OSF#UU7wfVipX-#+I)NO-S*QHB$ZqOGO`O4}FB
zkW*zcV5IoluYx5f1Ya;GZ;ixh?<&PY!VeyVr$t%5X8-<;FD&4FdkrRK;K4?m;elV9
z$L;<#*OArvN9V*IKzqm!36*l)J#ZV|@0-bs`SfqE{mZ*Od(g73YpaGYI>d{v2hipE
zoF=Gxw(B0idhfPv`0m0pQgMr+8^sUGKx8Dn5d(v#`AHc`bN#F3`|;enVD~Vf?;JtG
zmM_krh7L@y|LzSC0s}+z1IH}<$J=w%V$Z)^Q66$ZjH6T~$;Xc$-`VQ>y*0qW4FCn@
z9N%R#Bq`$t!Pw<X8!%)Wi-*R3yNCbHd4kX)OixXBdjx&P+8Nh*ky+1~j*fuW%pq^j
zkFlsfh5Zjc6rR&|^0aCixY^g~@!`S%;mlie_AKwanI;JOC|mD(Ts=N1hZs}q9CBhJ
z_NR~aI3(<3enrYql7xUn&Xbi*X;k^q^*A2}WqJbD694&w&kgDKc+C?(uOPULi}PIU
zq`2^-wUV_STg{xhPFwx@iSU)I;g6&rJ}(q3J>eLSnik!GgGVfk?MDhY5;25Y`}X6*
zt5S-DL8HZ}g_*<Z+XoEY`Ko(6a0%!=e)l6c1KyusYD3KnuWWFEJfB(~gWx~B2|f~s
zZfg6nJZo?vlBbwW|EDjaZ#V-VPLyy{6h2&dc&|6i`rZ3E2daXn*B=k>Bf;9jpMUVA
z*aJa*5`khd=!#Bxda{{DM<lOEBYRIz99!@F=R-S1!1`%up4fj7byHBergd%W_;rG1
zrQubJV}ELGgCn%@{B(qsIC}I{_=n`+t${zi3-q9&!C%O>Uss4Hf)g`4P!;L9bN5i^
z{=JO99hB`MyGaU>=vu$T?Xlm~fd*4?89kxmvpR+z<9B0<{(77MgvoG1q#5A%s8e$K
z<-_yMvOGI??TbAu$J8tQ?8u+S>l+1pedK*|_<GMN3TD^=Z@k6hFl>k=2z{#4?HX*k
z_@}3kV-+sH$B&Dq34JJ<m==BI%uf{S<z7h~r6a->TDbg@{ZCI}1Isl=ra1oD0eGd{
zOJOw}I{RYi|JS}IwA=IVH$OTh(U@zo@27wlF|%8&jwVMwWR#%jE#Jx-Rq3?k;Wnsy
z_O{Ee%bF|xu-w<3H^hFk&VJ0KST7iS76*CWWAM`0MqD%I<Y@Y!pfq?a{6}Q#sRawF
z32dKf|I{k_1{1W<C=A@84~6&QeOXjZ5D&zpJ(ATdPma!4pW@D=IWk3}EZAj=!&Q9g
zkF)zQJGdKPpPLXvLsOzXa5)GU4VBLJ4eWYAHX;wXD7Zqc#WB1Le`=m#CUo|YQw|Sm
z+$~%)-=t`6Ewj8Rn6u)hxt#vH?&ZEFnm2T2ln6>+aaXs=n|OMo>W{O41e!`-NKCy8
zjbOe<uX>u{t!FQH3R%5~i;J(imVdlU;*SlCI0+35@0{2R4K$4N=t}xTEB;VYC~nfw
z8S`54d1ug(Lki^dU6N8=$NHMz9=WUar}ZU#3lA3mkb(t08i5sDvq<*IjLRBqJbeTC
zBStpm$s<radGo-Z8s{NwjZd)5*&bPX6xJCV`AnBhD<p5RPLZV&e7hU;rJT6cS=}k^
zh4r7h{%>ZSStFLjwCIaz%aNJzGdH~CN0>SNy?v?|_16CUJgm^JT#+bOwC>r+;CTi!
zo-}{(kY%<wcDU0zZ)Vqx_CI}y93PBWNbxahVl-lz!lkA~>mI?FnUP@eo)-_?Pc7EW
znI&tU%@6vM4I;=@ffZ=H@tGe!sKLh5smo>shc3!P-S%^tL=biO%Cml4q1@qWsSjR%
zdZq?^Sa?N!4A%beN{`ch1)~bWOvY!D9|Ajo*XCL7aK9Vc7qIV>X#E<&SXoFQ>!`)t
zr#kzQbVi<^UAd=G?vRALztW6l!GJbyr?u@@qK_y4IDg_{2Q(u_3*{>Q`WT;Dmo79-
zdDC*|1;UY=RB<Xd)K8CF-u%4#)1OUs6g^CC(oBY7crXGoVrKP<WO*DqAFo_C`-bSW
za-W;ze`pu+hy#omjk_P;MR+CsDPm0Q<3y(*x*7Qj+JeJ!PfbUBMR4dktt|`CtoVIb
zm!dvW-R`4P(94Hm%aQ+-@w_p`Be=|b@-b9s^xl7V{tq8Q@Sc!(=IapZ^P+39m&s$~
z%BwOeA0K}{VD|2Nb97pO`6U^eKaJKw^dxt-pM?EEfj@)?*voG4y8fvz42!^sB|Xj&
zMPs9mD~4Hq6&K5AJ#@(ah+3yhA-Q|H_|xhOt&au_e?M8!zBdG$=<Og5=Znw~7B+YY
zSt>B}K8Jqah1-AqZ$A>WDD`DiU-67|+9~7~YuH!rDNgS4|M4U6b_!j|$Y1xU_ufN#
zztimYb06C;x7b{t=4)%7_)~{~cnS2R&3MRg339Sz%d;>EbXCf;R|uDnWhBQSz^S+=
zt$m79u-C<XfBY~YQo>4n#k$>j44!f`Co*5cOlw??bKl#DX$pq!W=y3y6uSFQ!%IA3
z2a~qjZMVfHTGEWfn0``BCNfp(S5LDe)ARee+AxVM!p8o-b<`>edSi0B0>WUT9X^}2
zRmy|3Y{RhYefgtZ@eqEATv@b`cX%H5lq07rQ&wf%DHKO|QS;#ci_X}B8S>Q(O?2WE
z5(4GfMqKNmno%`&vKh<vxt$m7#Owvv%%Q(<{_Nl0gzorqbi3JHxRy9{Jh{*r!AGe3
z75>o^!uEUCfn>+YVNl2U>2#TkeH5=jz~5H(H!|?vNbgs`*Ls=}6XcCHId+|qdH7=j
zwOLn{d+rQmSw8cRB@nxYmn0RgySg+@7qtuwNCUHyS@E0?3;{NDcn0_X_7yjY;VW?b
z#8A(fkzP*gCna2dg%d4j0TkqdZn0;l|91#K;u^%b=JSm{f9yoj<Q@an@-NV?`Imvl
zgU!oAW4pO{`2CpW6y?yj=T{SMXxJy>yL^#*&>e?aV&3<DySxAT!*h!Ckf2j8-&b!^
zCmENrjYW=jO>60;ix!oC`Ec0hAvZyY>H&m2@rXynJzud15q=v9_7XhI-IuUsCwehI
zDm0I=uQ|}3Wccq*g}0WJuMEeSBQjbvz8zGZTSMWL@ZG;XJv}k>$U9R{^q<6JzMwI&
z*I>`Gcc=Ajem4T&tK0woxAt~3T731Pi~yW6X4tz7s$s*|ri!zickn9WbxQT!IUsZf
zP(P2=pva=u17~NA%UrE$Du=^pVrUH%8k=al%!<uHeP%dNI_uJ3_4|jM7vI0j6GP|%
zoQolVT=L`?)N`-CoO*PXU9DAg@RcNm1604%0N40NWe|<={_`XB)<Ce0IR5ln{I}AV
zj(Ef$SRmaajqDQ|7T6+HzR9=Jc0K)+Xt7hsEQGdTLxs67Ve&Q=^5p*35CWUza-eZ4
zS_466>DfAzteQsyN;K%b18`mA{h9%_)p9DoLs!3W@F@pyE~3MorwaMMyYQvfk#ztL
zpcZf)N20!jzMemB4fUYm04^^bWLg_b%Zh+r==kEg)Xi1(5<s(uq?I^ehtui<66IM`
zCS31@NlC7Dx<5sF4e*RLP%PDI-GB9TX)YCy*|QZW^Xzmz^z1AyIV=aoHvw;tqLwL>
zSfnj8f6(i)Dy#}U-bZRwFIJw+q8dK%*C~a}S1i|mXRKXjxd)4dO@9Xvy=$QiT-(|x
z;M7asl}eu1H^=~TH5ig`0mB{d$GLP>HEQGyUtUc4laq>4Tbmok`!#U{^E!<O!#Hn0
zY8<|gVgD8!$ldut0O$!B0s@PH!d(Q+9%Dj51RwHh6OETWS8?bjg{ReMokos|2obgG
z-y!%$2}`Z_J{MYQnIz!U`G^Ral~*0W3#8^d_ma?+c~E6}^st-^**}b~6Q0|A0$n%>
zqxk&#7F3*GSMp0Btzt<_*VTq;Rg;|5P#;>hKA3X88gQvcl-Jat!0udDy%zzedSB%c
zbIx~niPAosq?MH~!F-GW5GacSCi$6_WJ`QN{<*-{q#fE!ge9A+<MQKz?(0B~#b~m=
zmYL7$<THKoW|#QaEAGtmiIM?Ow(j1#KYZX~Y#OX^o7Taho5(P&<C^hSsmxfit~|$g
z5t%HsA%ozs&fd0vjKgC*fL?IU0es}u@RRd}>cTx9vLolmyXN>OUflVt7c6*R{Wx36
z%-bj%ehYST%9%2Ply}MmllAVK)>nfxC-qv`D@_>_cFN1me7#2(w5bp4ncEWx!oB-w
z1zxqkHGjR5B+o@DZolV!?dWANA~I(%<saYy+fc|MO3)d_7dl1gl|rfxY;;Hp#@!Ny
zhwTSTe>-4qnEoH8J(e0r`d}cfV3EN9ssQIa+v}${(hxTUbYZiR)Jb50G+v6tteqid
z)}XgJP{8sext`qq^DRmayNQm7W)%Xn@83R(Pi}&b@IVW}_PJ2qXcjqH?09>Y&LUCX
z%A+l)eGg>0Gdhj4bC-=R{u$Sc8q*x>#EI-@b$ZO^jzqmPcLq~u>b{3>dmd}zT*%1K
ziQO=t=y(xSoX*yG1WLn-T+9n~P5$YIy(FNvd`~h2R%2AnCek$jY#BhNU#553eM+hQ
z7<N{#254~lK(vV(S%zBlvWNph%du0|hy-=SRb>t^wb@gqd2_oqH>TZ>C`XQT6=bPd
zyut5UQDSkNFrwtK9cziuGgiir(Sa)LA%ucDjcOcQU7!p<{W;xkAK?)E^>#hybn}wg
zyJ|q;Npl$?;^+2p;_ftLoLir(m+1Ne)Tf&Fex%YdLavANgYCK+fWmy8?A!dk&B<_s
z=(7MW9qycb!mEvU9)*1*G{t356oOf`JgXyNzQ~K;u7g|4tz06+^~>PIcRZKsQtkig
z2gBW-vO+7z2zg7uA`gMJ<%H^3Lo&Ho@=N@hrIpkJ0ADcAg!1+1z)4s*LjW1PT~pb0
zleMU~+q{Z3g+s6M{z&kNw(Uh`U<D|laMAsI$L79A<@P}}>tSV6M`RhBodAMd1H9RW
zE6E*A*76QU(T+o-w)-$)bMOT`3mrD62#>4;2POmybv|?7T#*)}uxoS0E$DSC4hLhR
z1|_k9+Buxs?i;pq5Z;lPzSVMGKA7A`l;YZT&wLz(t||oLYo8Tb`eOaYL!#^1wV<8K
z!Zu)q)6`(sm^uUGUd5mco1XZHwMJ)Bcf|G_0%)=LaMT$CZ($G^pSgV};Qdi_Y{;OP
z(J+TI4oyr(|G+)Z`CSck+{7(obGlpqvl-NqhVfY^IgZ5Li*MTv#p`8`fE~TV-|jv8
z;HYQ~Yy~$F#(#*0-r1<A9H6DQ07~9FKcO08m<b%`Bsz&SyODb*L|WxwZxfsYdTltc
zCeAs1@jh|Ad*H%Qm|lFoLHA;Yld|?T!#E!&X3oOQ;ltv3c$=^$BC}iuS>FVHd>wxA
zE+Eb7CSYTModbZd62T8)-4&l@`r@6z{3L0>_QF-uw`QNkygZ^G*YghSOIXwT8U>V&
zx*DhqZafV5i-&xGPB!Y49NX|0PdOMlbw$wNbm6y0rZ=YUNl#u3KEhLntOr|NT+baP
zIb$f;&H8*_Td^MA?!?pGHqQDgznUd&-l(<pqp5P&s+Z-A)=@a;v9fb8`GJSfa=LHx
zWh!l&+b!B7otB}BiL}ZKv%3Y=$IEC<aQ<e-4R_;6ulK+fC_bswa{hSzHK)<U1mOlu
zT4sa+*HcqQoV+G^E<AoES8-*-qFdf4NGB;%`uCA7uK+pqHexe(ENX+v_Fn<6Z4=x5
zMn~9RHo5@2BYFA7MSD*#2f}YYcQMk4mSx@${fH-_>YkUlTiOf#f}tio)vrW>fZ#g4
zs&TXprX9Z#X>z_erY4cl7Gp7wn5m1iMPUAqDAN(<Q5zE)MNV8|d%g*99x8lxvQvih
za+eK)1s5+9$q#*=esSYn4M4DC3__lsN*k_5D8yION6cOwwHm!kVr3X=d~I<-KA;gX
zpW4mBhEHllPcjli6!Klvw(IoZ3jI|JiK8)c6||G(L~ApZr%Czt{@sBLp={vkq;p%o
z!2z^;iHp>(pHwHQVM$SOIDWrr4ZNFn+x@#~9byZiv~;|47ptmjAEKR;?Z_EGMjzZ$
zJKF=V-bf^IpLRQPQZR`^)8b8wU@5>C<UifN%3gTv)dbp8=1QksPuucGz4TDu{2c2e
znCwT&6oCrA6=kv$hvVcb02D?nu*-*??1M?A!?%YLHGs`g$w>pcA){06$poX2w|vdX
zvOLX6(i~YO-zAkkiv?@=$Q}_aIvW<+D))xn*;ScKqCK|pYcJH%*8^*z?1R(g@X$yu
zW0iT@)N0X-8u(_SqNW_CtS?z&L>YsmP4!yG8fl<PJfINf@Zs#0{GPkoddd}5`EPQW
za^CId4+HQ-T1I`~9f?>PI1T0wC=Z08EtrB}*|g{-p5w%g)ex7a*MV(b&3d8#v!MC#
z*-K_btXt$I1~plG`;Fekk6pMAjvE{gY7vY-`f;<t0Y>rrBQ=kSp0%30uW96facNsW
zW`0dXCw>)RY80G`_z&GSR&4`^0g|q+czh6M?>VGO)BemooC&k0v}9~}lPHm^e_3g8
z9FQGZpIfAPx^E(WY!S~EVB5lnW88ZLKx#zZxd1uI?KkbZ`LA?%c(G#UML%Vpc{FYX
zRwmNAjxRYbbYQQ}vdGDHjW@n<#z>B<>KqA7#c#E&qM`hfuAR`zop+=%;Ph)l1iFb(
zY%RH~Z9sJbL;DKUMCRke&fW4?b6sdXDzd}#1Vk}zGjJD+pzU8x7!Kh1nrih`858d!
z`_qk1{rz%9Nkh2zn=6=sy)@5HoNLv;m74hUTmW1CwfR~X*H5lrURiRH&f~cd&gR!j
zVluzqyRaTN>;MJsn${E9Rr$5E?ar|Pu1SzzJ790<B_Et2FaVHiPGq37l*yhd7-BRk
z0c=8r&p9!Y?=N>t+dw%&q4i0FVH@R&h-t^`^}*8;?Ip4?LWQ#KixZ!Rpw9xM?~o16
z>bP5(zrbP570nCX^-Mob`V~$LTYd)v;SDaUWj4kD6OC!pryr^8Qh((FXqfQ<uwpz6
zaZ)h;IY0uZ{H&zd&Nus>RBxvurZQ!2^lluXmXrC|el10xTgpCHa5R;_E9@1@8&!NI
z|3*P;z=uMenXqtvy0=U(RcK?rIpys`or+<Qj^HHR0Pj5(-KIns)J7w3HlNaWZPyhA
zWA><Ag(+_8OXbOv<~wE3ar(bbpf4Vh-7suo9G5Gm-!OXx0x#k?cN?a|kl#in;9yaI
zY{R~6P0Jd#RQ+!JPkG@U>XRvhK{QBor)K8syRL~-{&Z8oF*6|3wU%i7*aEOm9(`cl
zVm`M4XOk#lMz-KS0OESSz;-E^oYv3`sLDpRWSud`O2J;^mrDRLW3lbE8;GOtLZ>Gg
zp_Ol%YwhpXx7O>o!dNq3#amqFYTwVne(Eazz9GQR-Z!>s*WGm9<wJD%_5wi2KAOOm
z%XL%_{{z`XL;ZSO-#v_k>q44NV9$Y&Muto^)oaRXN>Rr-^=razAR!Z;aXIxOIv;#;
zG9n_aX>hRU7nWNxB~8=L+iJqxd{ga!PW~{59g?HDhkb+xQ80Vzk%cg<Twz=MFWg#F
z@?(+bS0AV-yRF9w2XA&f)n0Y}_Tgcx;FX|#f9qUf`p3dg356oAt6rA4cn1vEnPxjc
z@tRtL&mu;O59c%kPRq&>DVz&^341E1U55Fi9AA3oC^(qH+RRw_2)+@uPv7_b(spR)
z<4U`2rS9ozM&*Vdl-I7^B>=<UZMB*hZSW;@vxQcs)jZ-XI(KqFOCz*KSI5p9&>(8;
z{Hj8wTt$kqsumZaXbn%Vi$0<fqjTJ7SAqHbmL(3cYbIv#`l=)Uy-U?+Yzszv@4(u5
zrluFjb-1YOz=ib&sYvcx=9+8}C0I1!CySkUPIw2@Jt7N1p5e~Z$p+o@5$PGQR$Wpt
zLfvpw*elPK1&w3kEqiFNhPjI<)q`q@8BaHs2Z9r-?$XVaE%yi1!f~NTW%%tIhK4$r
zw4EzsmJ_IpALBp56|%gbxInv6bJu--i*WsaBPpF4^r^Q3PPn$3-3G-Q<LSrWMy`_2
zOveEb=CfgZwnin|N2~$p-tx>$FeY+YS%ve;2KapTdktFl55wVony=T4f9u?78@Lft
zQ*I;klINEw^SXN?a^t$jvpedNL#FP@rY9{50LdhAktXk}<M)qDfw**2Bl$*Zw|a~%
zyHr?@Ev=?id~JvmjkIFMEI)VdrWua1j|NRFy@;1vFYUiCWVv^vqjjlWf=5KBf^D_j
z0|`@M@_r`(1#&2V`{+CsWux;xs<>|U)j&|%O7>ej!tS%Me)-V)9jMVp>o<smU_->*
z*>vqFU02Yjj48$X-O4m~KKT7TK0rf%M)y6z956jnhNL*-GV?_vCT`L{IMJu-so<Ol
z=Lv4pyr7kp5LNBli0vCO*LW;VU3!oPP?<XSSr%U<bg^_S<It&f)Te)Udb#e7C(;@>
zAuEf{gb3<y!>&vsPfqRrt?=~$ff)~~GuMU3vNJ)uLLaafoP-#&mmg2a2b{1^(P{=5
ztKytFz%+vRHkSt!3u<vI@Yi<?z^UOjl5OfgQj?w3DFH|CI3mZ)y6mnbw!^C7I$yb!
zL&m*l!50+^-U6wj@6WPC<eY>?ddtc_LsX^~(4&pBA+G>q<{pM%&84U4+}AC0zTQ1D
z^i@v{P9DtQ`O#CSA#bIdh1luc^j*6zJ01`%zuI~_OgU}FHMAX?{g2V}v?GQv15dol
zv8fI)TeWNjUz+yct3RtETpsg+tuq>fo2)QG4CfD{9AKrir>7Be!s4^eoGb;$Mjxg<
zhhnl!hMH{GYaqW^nWjZiW-|3{$BHEATw6c@id0_gkaUX~aDOIi8eizCb}{(yh$a})
zvrWHA4MAM6X5&187!Hh9w<;()^kv#+1OqH4Wo%jDDqDzyL);{Dx;@afh6FRJ*|q?C
z6%JqDEC5OmL*U@?Y@eqGsq?-ydcVfl05ytTel1%+7~xR_z%dgnrV(;^B|noyCCV0x
zY-xm6DJ(W077?xihq)<HatvAB%)Ppm`y6#2iVG##-QSZ)gn{?<d`hp`c#=ZHgv4h{
z$>#-muunQu9HARJp4K2gBt^IJJ{3m|@H=S)pDqkv)meQc&#IM2QNu4DME#ld%gF&a
z#4%4NU4xrrLUcXuQOSZ<GtXXcb;}BTmv7HwPLe&pvGYRrL{;J6FMy(s^%c}K;3c0p
zuW^>Mk>9xOR1T>9aN8`Q5L?8$Y~;M@hzx5<HgFfS^sAqo6xSJQe4cfp<^l-(k)PKs
zlbit=4Lt;{66tEXIgYdv3?C`9nvh9egMdr?o>4&395w1C!|YHnG$2wRg_ukt?BK3O
z`A-Wb@2lg}*fd;xgTd3Q3JPR-nETEFVrcOevFcx$HDj}Zw|+^P1mM$05Tz_pC_ezt
zrRAmAq<9N^Egj*yVL$&kI*=8@A*dD(&4D8Hpz$IzUlMoS>x6gNT+}qD&ADs!ju?|U
z_{PM}n}YWmtj-66w}#I%3s$Q6)QfW51rwpUqBu;yQ{58l#4*!`m?tV0SGJNYWSND@
zI-f=-E75$S!Dto$dR<>AG=A^c!sWOldD2e6bg2bBH50NYEiq0yYxB!pU~V{QB07~*
zjT&KEMBZKz7kCYjVdFXE4;i>axQU}($K5q)qrLcnx}_Eb4!9!aZ?XCuucju<AEgc6
z+reej>*i2xrdnS?$gg5bRv0s{p4R?_K{PG|R>?Uh_eTqk4C8L|ylr~w!_U{QkbBNM
zK%}%H$Bg0CGwnMNL`aY#513zlZmqQNGJODc_A&B(P1&sLI`e`z+FWy}aS0BImftJh
zc0uSv%i+Ss376ka?8wZX4XHTVjpAK6F`6x>{vIDz(k?%+{B0$F2}ok5w2t2%8Zx+a
z=H71fPCSveAn{l|BDWRqo;sZifH66)p31(Ie(%I(C1v5q@<wU#4WeJXZ4P5jOmbdy
zhZ9O95}Xj9yv7M)2<NQ-AHKdjkm~RMKT2`SD3?lEDVZf1*&;h!+-qDbAzXXs+H_M2
zm5_D0My_iWu9-wb_BF0OQrRQfGrx27o}bU>`}>cs?rWUqIp=vE&&T8O1dRqh6c&E#
zxlzGeJ>Hq4n^tO0B)5}CCXOg<TyH&;r=!R$4Y_m{I5O*<;+dsIe1~0N!+0zmE68l@
zr5H!Ki)%kYCDw%>nd}e9ZJv=I20rPiwWk;O(i<WN63y?9VeWKhY1CPF-x~%%tMSH0
zvbTaDSH@yn@ZNgxMUK3B`EQ@1u7a~7dd|b-pgfplqpkFkLrfTVrx$#s9r;vYG4flV
zr;fU3s>SH6_K1S@)@;T@<LJ}|%Z&c1mS_N6NJ9iLN+RC#Eidph;XAue$vGptgR9Du
zFGCE?5ZH>{8FxENL3k98tBC3>s(6&0fHt_}s43Sz9mIb1&SSl}RMIQ5H*}`?OcN}Q
z(d7GSv<@O&lE1I}b3?g6GJmOk#GCPv$3H>$s=*zAMlJNm4RFuYXq{HBlOxug5l6w>
zCsyC8Qso1W*8Iojs&Nk}kW7;E?3TdL*^{@E<O9*AZv>>`*}aO77`E?v0X#+(S9*>|
zH~W>)pwT12n#0pS&k|CyoTqBYpj#L#agK5ote|n0W2Re-lCs}@<fWNFETPZ#G@4_@
zQu(4Tb|?>H74Q7|7&Yy83WKNpQT9GS_;`BNDwKZX!|E*2e(U3n(MYR*?~$}#_d`G(
z?R(?T22M^{DVoF37XYyE&(M%AC#xpi=S^Taj0CqH{s?vA1O448)6ZKkgRu`y180q6
zf1n&zK%an>fp!3yI0HLDOY%;2m?EXxLP-n!wX^I!ZVEGrHD-)PfF2{`rCmb1lN9W(
z1st-V+7F7r;V_YRTnzWp^{wFIa|x(4>5v}Zu`fk=sEdKyYXzHjG{l`?t$`dR?!8sZ
zy*M9ZafQpB)PlVX9zyw!l%|>8`8Ww*ND2RkOn-Z@sXV#~4KwJw85@e*?i!s323aQ&
zmz%C81uEp(K2VZ!=f1GlH;!vh?`FLq)=4dos?FAvqjX8*WhFZ6zY=!9H^pGj>@@z&
z@}}&`_<3!<S02CwED?N>Bt)9v9jO$%@V!#Nz@c6edXr*8oatS13{ly+oTy~5<w`H-
zB5nm*Uv2lmJ!laq(h{U8VAgcBskr*S5HpG=YnkjP>A3?Sih0Cot#ysl!o%IS)z%{|
z?3Nyb*CCdKZ$BXop^0Cb(^`K-2%g#vk!cDF$SY0=(`Y>v^aN)wqc0Z`bLvVx{sa~&
zJ3X*Prcx${c|}8<31t2FNE5U$!aEaKcJZJC1YtG6z$+2B;t4^NXE%n&%A32}q4tKa
zq{O<}=7B}{{N~ezt8!^jIxhaX<O^KzAG=xWqUfk6IChj?_N41>^~lNTYw$!8{ii8!
zwV3y+#}K9l*>#lmn_aUN!#UVwUAa|w2}swo4TTKExrPP(EMkW|yZj9?{l%s8RV$rZ
zlnmBFx$6OwBTs(+Vd_*0QK+j(WRh}rea`jeh~LUs<bCIlEah6|aYl3zA9?=+%uW*e
zawMR;Oqt9tCY8|$4V*O@Jx~h{;En`ssMJlGh_7!03%H^z$%@R}l#<&_A)H>hKyoE&
zGYh97iTF%{$?y?*N?07bd@|B8)F*9^VdTUbHz`<UD3x~EJ;KRKM6}mgf@1o^nJf)}
z*O+l1EEm>{_k^7}hFH~LgXC}EE@Ie<%W7%0g>0oHgbm`#cpz1|0xFpez`c-#c%fMu
z>8>E?lu3Y^)N3yI*%u?2U*HxkK%wfPGp0*S&pFRMyYjOR0>5gk38544!qY9E<AL6X
zufDpFU{?I;L!gj@hcWXsVFW5E<|k+K`AJ_|-p4LVYq@S%OAIKU@Ns3}CJSM+vU0wx
ziKNS4v%QsAuB{Ahfu3ve`jvbO*H}^(^Lk`=7T>Gll2jSy)e7D(8IHNdSZyqh>22^X
z_#xVVqIX-iudTFq1<Ub&`&kgF=RziA3D}~4;0|Pm89^%e&gP2@fPPJ;Qztg5PGMFk
z3E0peo3xj`wz1FN?s|;UM<4rSO4()3hb3LxzbIq<;DBzYIC-{8$azq6uO@|oYoYRd
zp>10IjRBV)9O<!=JKDl<N}cTVRp7G3OpyKyb!RS??;W1y0w*_>$sQ|Ma<nQ<0Q8WN
zy+Gk@T~<lC2EMHP8F}~j3crK~>4kX?Vl-k^r--=3IASFB%q%yF4&?mOV*(xwzmGp~
zpT2lHBEqkeus+2^w;{Hv^jhYx?3Hr9OurFYL13_*7tLkA3C7|ZjL1M-htOj*peOXQ
z`%4>JDV-^&&deqS=nwK}r)L#>`(c1!2m<!4b_ADig?WvawWsjok)swfV0n4OyF6^&
zAz4Ey8RKgb*$G3L<2wY-t+gs%1xkiR6SY%z-mkELx1%&@h*tZUt#dezPLC!(GMaEx
zk;UOrtWTQ>GN5wMer-2DFar4|CpBT)6LE6UG<kXwN6{b<W)gVky<Kf`cd{FifvoZC
ze(yMR`S4k<FxwU6VhAzCyT_yhMfjRweUnY2AROwW|51=;yZ9JFB{sLs8+1^}H&GeO
z2h-wm^BuOb$JB=))EmlFmk{obJunfaBDDp38++KiH=O;#thc6&D{(~Mm=F3%AHjrp
zN=HvK41MsL<qjNibNz;ysmyQ};bvxUTIuCbUUtu+Z>DkIp?6EH0kjbvR`3gkHx}km
zc0Th69-wdRhr_TE`n5a#A7yzOtX?U<orv{a+35U?)X<1?QbH_2nYy<V^Xqtst1=+#
z&J2toJPZUNS)HLauE}#ib;$eCGHE$U;CVj|5<+P)TlBFRUGqTZ5)FCv*GG0VqwRZ`
zLyY9!Bx(!~GwfKq^=r**eszSq`-J$k9ERgEq%t=<i8`2YIOmnUzJC{`A?MH6`Rh)U
z&aT?DdOrkLx2X<N9cp3$rPspCrWBH&wEnw5>jGCX{)*V_2HzgW8R-*Iu@rRePqZiW
zC2im3^L?X;q~fki!;JugN}L@WaJZRNO1g(Bn8^@eC%D~K(|5Z*DjJ=KI-YbFVXqo{
z&4EP@Uia<Qdxl3yNqc-?^yt93g*Pkoj<ri+yo!ib9^KjVf$xXPkg#lSqR%U377YAK
zBPZm9!`B`L?U}q(g#DE74k)CpY&~@XFhFW)bNt3Kz@>3UM+K_G-aaidWb(1ujeo;C
z;x(_N9Gjgq)cwseHa4l^nckGNqo|mc>65Ah`i6zRUpq(Nq~0=Oc_Ea^vR;B*E&$Hp
zuOo6A?$^6P`lmSbWSE2R4U9V^W>BSpwm=v4RFd<VE2`4Ei!VYKI=)$i(h)7Xr0m1s
z<Zx)`vKoHok)_6a!;R0(hYeKDj2g!Y!iAOF6-*-P#j4ccjn{FizIgozYRT5K5I&56
zk-d^1ICv!H1(W?-g8a`p7g0E5RNb2VT5dPj?+?Vcf+N~QZVomQ`SK%TyW^W?T#X^p
zAsgFeBz|#Tur%6Y^xJ@cZYqFkF2CLQYVo|v=ouk`yu4w!%3NM!!Ka>%vVol3WZ4}$
z*t$*A4LgOHKL!%_*`e}hmKgOP{?I->yjD?!2|>K^3n&#Y5M42QOJyEiv*`;`3dZ=T
zhQR2eSCL^ieyLa4DZ!RJ^A6r0Irw}NecbnF^tG?Fo&fLZ-ZAb~yS{WznOmw{*my`{
zC55nliNKm2@>t2wy}-#fd-4vA@bK}IO+MinzkadYss{F`nN51XmZJ%`2yAyc`vAi0
zk2$`I2H1MqIkWum1JtJh)@^g|_7zHif!1-7q)f}lbCOqARN=u+7|v=|&O*;4|12RW
z_EQJh>Y2Ki&h7flu8BJoENVT)HPRu>VE*SY>SFF08k)gVrd?ZQ+mvI?%KWzqT;oaN
z6FTiklx327@4a&55a0PsA4?78SZ-8QhO6`-?v=V(94%2BHg%&Rolb^$P|sM=xu#UX
z_j~Fo-{8HTN_<nqFKyxSx4^>xaJxy}d6`YNq{lunY7K#4&aZtXRq_Uwm<rmTGn&L}
zGNZPc*&xaRAj6wq<<Xw2WX|%*xH?_+fe7i8Ca8*_SEejtHIv?SO1A9!e*<tzZsOb}
z56<4|HqhruEmPuZ)L9c`kwb7nMH)g}YrVcZ*C0G6AHswR{64Y_dC8wX!Ei?Qr{puq
zFPV|Ylkh&AlMcPNd-6^6{HtLhid1Yge(}b)A6g{S5w+<Ml}G?ZItIW)Z_Y35JRHjU
z2Dd#X7g5RMqHobW<!2j8!#c%s=~YZ4>5%ux<ZH3sd<COd0`Tj+UTr4jiN2MfWc!NO
zK6e#Cmv=nIwq_)^Nvjj~*#Y3>HCQ9@+`=a@3l3)>-sPFAaPx-ySxRw<rEdnKMW?To
z3KKo5L1v7`TC%7|<&Eh5_%;|UNA|-Lp0Gp8a+}PN>Z-Gi4Mk1NzJ<6DQ!|#-QvXKA
zm?>i~c4#rZc?oF^{#0XPfn>L=(S97Fj43M$?vi6h9c4YaxCxxKIf;Ru5m;p};T73u
zwtVMV(IejB+_tqAAJ}t*b_CR`dX=3aB`J>amsJ{Q`c~x~K0G-7dj1wdrPX2`_DGFA
zEzBC-cGEGKa3H(Ql%;8I)--;`o?^_tGSUnVG%cT3tCBuGwFNsQ;Knlg9->fb6uCny
z;?1@&O3I^22)tTt`jja>&L$YgUUK4JOdBR9H&ePn&SjNw57)RSew`fRPqQDQw2wKS
z6LT)+{?U;q`VkbGSk1Jz4ILS^E$}GKAs7oICU}J&<@>?eG!}_6!qF{~NH&=QcQ?Lu
zsBuOKT(&v=>-!VlPf%mMYgR2UuGDGwferD%L(5iG?)#+@Duf&98vKJJTHM?Si=Pqo
zOi5>#51Iw3oIv=%LM}SoQ#ygPa8`vb(nVEpB@tM&Qmx}>4cbLx6n@>E*yd^v^0E{-
zz~mc4oC#vUc+dHLl;edqb=8dB4}*ey^+d1@{l?S?c{!n=C9c6xI7o@=y0ZcK{zO{v
zFN@@^Z!pBCohOMhQjSJTf*VuFPYBNrDLm@_0*|k~lF|J394Xip0DE;^-QQHx`f~$g
zk!NzppK?S%P80%PXT1$ks9A?6f!96RoJK3{$_!CXj8_+H=1YDpv>t|(nUU&YPkLaq
zfggElD5X7FI607oI)|3;wbm{={tGsHnZ5Qt!E-}c$h)?NLT|xKv{OKoO>$9?E%X*$
z6o4&ueQvvaJc;q|{v@h3&cmK|%;u@>y%hK^pO)ME(YMH>0<dq<6^l_=gO!S7HEqp<
zhQl4fvz0<nC;e0EWcw9k!#+fev)m_^36f)RVJ$%#Go%#v&T9zwWEhkzG9_M`sF_u?
z&~H571#4tJ=!%K#)*-XIvzoL?PRr{(&|g<ZUV|+QLu$8rjf<kHuxJj$YDCTGk{WhB
z*~ffXu<BxWyhI@;&)jg#Wp6hDs|;s<4C<)kfYQR>9xqJ6x7dZ5)~{c3u8`g;_VfU8
z_$QkUuFqB2sy!lcEKar(!U^y3VM<!;`f!ghl%);r_OBv<6^_a;LP;%Y@u+qZDnl8e
zw&Pms=%Ghu6?120OY)ipnEj%iI~9gh4}xng_hACeRJi&rZyLEut0C34-eBw@#n0Jp
z9krg}W8}``a^jj;N2c>Vr0>Q&&MO$-sXwHS(#vYRfH4tEbjZuG&oCp$Dit*D9Jn#E
zovqglDkQ-S_iw@x8n`RkZzx2Q7+z}x`}I*mnuIg5$mbO9+ry)qrAtAEwkL*bf6*d}
z-4iG!Uq@soA3kT2GuQ^u@(7kTHP|Gx)%U?Z=GtzvKVO!qmeD_}znhd2v)!4jBnDaO
z!`)KFKQo!ySTQ!ZKrh#Rk9TVLfRfoDtK4A(J-^G;)wkTr_YWyYB6IX`@%g`eq~ywT
zSFs9WpkK0!N#29+!$sy6!q2)dHL*iH24*Hy%JWvcks{C&UG0>*Cf|)%Vw~B~ERIk+
zP1IP=7LcjrS$G{co&rvrf?l{qjEq-|Kxu}XYkaSCwM%@YyN?NRaT~ZolS&3ldkN?B
zDlWHd&Eh|%6%q%|QaWtZaD+qV`&kpv(R7%~m9x1%p&C@oh+F!Y7bKEjguwO&;!~;1
z$eJ0hOvh#Xx^v0gHvfV&W@PXa;#|v8#e<`~f{!zw<Scv?QjU8Ky}hg~-(!KZ7#3V0
z?akI0B=SJ-!dF}K+YJC;VNf+amO?3BB99MIHr5j~8$M1CE1cN=PUNT#dC=VpQozX>
zU8+|)`lF;>UJsE1HcMsgJ|SYaKBmmlD;xA{xsXKLDhdrFmA8pgaY%ke4)PhtLyPsi
zlPUeXe5y?@HtaS^B@*=2J8fPHT;Ny#3HU#DR#W0qIkSY>Gm_90DbQ{(Oo2iBPUH31
zxP`^QXS^9J1wllQX0*HpLf7k9tJ7qx7u8p49*5B&Bc>Y&Q|N8@VDhz-+FYom!GhvW
z!nV|<#g}gJ0wNdaF$l4Txo*z{8a@y+H2ignw_zW{v*Lq6f6DS+4e5zee-Wsd{sS#?
zW#@g%*E1mp8AK@UL>vtb#5pj-VjXOdH#Uh+7!jA;eLVo`qDB>VhcPu);DkfxFek)u
z1a$m;&7G)vb`})5Y#D%Ee3JIGH>E4B^M{E{rbLB@MG#KNaa;P3{)2kA8W<h$<Ie>&
zb5~P2L*U|1DSgQ*+cQtcxDKxqiYFQ5YWdxm(Q@!7hSx<>V>xX)B`9pr%=p_5$_45O
zx;BQIwao&_t}?4&4oJhs?0{QLfd~FTmQvb+8)q%9*NiyTHEY&Mn!S%q_ym^j{Fa?}
zB_pnsc5jKN7C^P<)+^Gm>eYpuig2>yvr!MR_RY;fdbzfhEv~)%F^+0&;=qz1rhjq_
z_K}C?_^`iZ7fn;;lhMwWg8Vp710f&`0iK~7-{0NV`y=t%Q&xQoAmXq;R~X7#Q^2%%
zQ2w5Ok|)cK1bvc-(QMjuSB_di%roSJ=YjWduF&mmI81RAYvEWxJi(||cOv-QCSVUl
zI;&(v@rS$y#g)Ss)92g7{CJB>392dsAs<qo@bX45un#zfeP*2d0MG{?ald6iy8BdC
zt}l2LA?mJ%M}|?;L3sOcr2B<x_V;+NJsMFa+&sUoTK+)LLl$xg6Rj-V+PR=rt$?wF
zKU9emR=UoK5d-D8TI_pA^6uHBn=W3-Iaw&^!m1vVa_D1OW#$2s`U56qxeZUS28IAz
zOZsRDP*ps;Nhi!rzsP-o7Q>bU2-Q$tTEts@kosYt-5+pW{V}R=MIC=2kpLs!p4#yW
zz~KZmDQ#;U;s6hlb~Rw4yjo!91Z0D+H0z7n$nmOQYnwr!cqai000sS7NS!<MpQ{8V
zzWogR`SD~jPmFBw#NBUUUJI+F<$R8#L3}D>>cke!GeIG$!bQ+6e$gG)fL{_T(iFNu
z_nS-i!h+X!;A5?Z1to*}&6jH!E+UGbH3BvSOARY;R3>(w>rXBKtPlQU9<JTSzP8ER
zdV^cr{Ah4sLC$iwW7vLhSg#&5N46EbfFgAfuf;BB%$P#jD{%ur>``R+eBw^<rCLA*
zz~6Dlu(2dT5HL6&;)hZ5*<U}XtM=f|4=9@vAXxDCCKUYMjJ^bY#&eL+Jk5WX{wY5p
zxzkYs%olYY`>^*iWV1Ao8@E?35@t3H{WDWnqQ<l6u4yas-TbQiYP$i~qwZ7R`0M7p
z5L7^mjlh=v<mSL<(Yii-NIU3Y!;H8}tId?V`=I`~kA2t-z)I;kg5rC4adapQc1Ytg
z0^xSih=s|(HjV_VCQzC(iYw?VY9gGLj+r+D+7MULt^6ih?G09kbxNs3C6vGRT*R4r
z`|;iUhc?6xcQv^J7p30I(8Y&G56FTR-VdFpcIG1laD6jJag8%s+q0l`tvKr91mkeB
z2BJ=Ev#23^`&y@j64ikk4i``*zOypbF#&i;_<qDdWu8*Bl4uoUmnv2SIZ(FrOE&jH
zroegrjjUG=nGt(K+4E_jdR;?kh7P+<Z7x&V-8;#A&f4YM$mua$fSppr9SFW{(L4mm
z)v9%AZ@|j&gJdn@m4b^8&cUKlMJNLkW!4}vJEj5pyj7+Xxj0>^845@WriHyPcdbdD
zsTSqR3uS?poj)<MO3iuoxB<p0<$00a-jVz|k=lVr)#iwK6Hr#k3`Tw0g-$X{s}gJ3
z02RZHrB*wG^T1J>fSoVSJ^uXAJ_CfH+#cV?bgd-u$9A!c!VeD$K?}O&69F=1VQY~6
zRsw9PY1MX?k_oFFtr&fljn-AOMCma6b338PD+2MH1fdQ{S(b*SzbphTyYVIb4nUi=
zWIuqirWk5(i~%vSDA--0&|Ch4%yzz+<h}!H+}bJkbdGM=TcRqTiHSYUb#)(f-emq9
zbhH2#chYwoA|?ddnZ2%tzfu(bJY;2hxIeh-!x%1tW1!@L5EQ-`L7>@W#wNj@CC3|m
znE{K5&bT3&mDoe!CCn7JnsId(Ul6;VI{xCT0OAcO#_P@I7>=}~ho;_oMASrss-;bQ
zmzKF|=oQT6AsmxA9)T4Y#}7YDh3O1dAp191o~@X?2y;LgWG|WxkmS7s+}k-J49Ct$
z2vsCzH3k=aIr7md?lo*mgS)n3hNCLN>llQ4Q#pFcOD+DLb&{5x0wb}xc+kL95P|<G
zn9^OMP;iFu;(NsT*?u(uyTzODZml*G`yIMos;noCzHKrzr5o;&*ok#i`*edP7i1b~
zUY!~f0Ls*nEv<0V;uFbe$V3KQh=M(;K1X$PUYYFn%tA6&={zd=rGSplekFHYf@X6i
z+tfFUJx$;m|Hc?&jp|m6zuf&5i$?Zeucz#gI=JYp$a|=++%=p#rN6M+itNjd#~|6b
zwzJNQyp$W(1A5)LbVHgM6!OnPHa=ivV;59tK75~x$Xhu}>HVSB+wID3_g=w|v)v?t
z5fkaOAv7yt45A^DlDkfPLhh%kdzWv;js+;ko9+vY|H6=_I6^@OlGjGpCO}2es+(6`
zIrcUnbLy?wQO>dCQ`!;)l}`W)v0jOq1mx6FdnGkt-qOcCdj1Es7xy&^*>Q%E0tHhs
z1mp%nP-$R`)vrf_*F;Gkr+AORJvI0Phto&~?dpYlqteSRQE;=l%?mUEV$%u^omn5!
zpYiUAx%UA80{d-A@GIUqj)3kvo^;DcSCf1NK5Ye>J<oa<Rz+9&Bt7woD12R5#QrPm
z#V_+Q>h)`@u7pxCfIufXr-J6h6QG^+%%-Gvrv*4>>xHP`=;{0UQnJ0vL7}$<G)`PQ
zl;UBY{_N!1V6D9`0gh{&JEtrx1;74|T@=2W0Adg@3nMoHb+B*tIsBmp!eNHqYKo)R
zdo2P<#-~Op{LCG<$#649e!d7_Z3=Fg3hKR)$NPS@;l6-Z&_E2wRlrp7TfAZ&Bjxhd
zT)~J&aL@e<7BH4n+QXBcuL*ijx#K2BNE_c{bz4B`X($i!Jf6IZ-|l40%8h)A9qdBV
zvsNaQ6f!5-l+6;t;;?yuM^I!l461{WI=)+Arb_9Fq#x($rk(~gx5oqlTXslk!1mzi
z0y$oD>>%+>r)N2`ZPPjL)8o?yUD8)(e}Q0~y#~!7&8iN;5a-NiNsf2SR#Qs9Jh#wY
z>GJ>>O3e71Zx1hGMbPrJQKjiA!7;P$`Rh99kuJ|iGTk$m7lL~I>1)r$^dQJA&RTIt
z$dFccRCyL5!e^?+!k$BMmjC8l4A2SQXy<njo-^ycGo<U@tGoWh(n=WK__;8cbp6O_
zIhVP`G;4&3ZqfAx7=l>(DyjNSz17lDgkdSU2Kr(xEqrIuXT)E^VY9;Vx!0b!$DY&C
zP!}NH=$iLEFd6gD?B~zpmZDB545pa;z%e2g@Jd;G55*I@KVZzoLbF^KBHqSfOI<Mm
zaCbYfJsiEnn)IYgil7~4+ONhP2B?k=JD?ZWF&DMiXDI)S9#))>d{PC%_Sp8kw=a(d
z6{q$W!F=YEFw&+rR<_cJCWRzf<9xslkOOO!Byh2vhsp_0D{$=E83M-XbfUpiY?a>t
zlS3S!*siU${hC)76a*?VU302=rUzG0i0oS?pNmJfUp-z`j`c|TY?xg6MwyBq(XlAl
z+84=Q@HUN{p;dP<m`^B^?n_pau>&j4&<?2<m&s~zcZ~1sSh-(U`|7C<&xQuh;IoK(
zot~i_1`Dvkv%&Rdz4Jig5G5ysu`eB!txF))q_a=Ce=WCI_RWvKw;-1}$xp+F(5_2+
z9K10W;g8_$@_a$fO-LDI#2m8Cc`^=US)+L@an48LeWIGSDzT<szci@Mm|N4FQNn$A
z)4)PFk>mC-Od;wN?g_Rq$HI4e{YP5GQJQABwm@N2R^TO)s?zl~w7}1}&5fR)Aoxv=
z$wjZeNr2Pe95by|(neMn#Olgd^4=uuPRr}RfE5`LL~?3bp!#1_sf-X>gCMQX>{VJv
zn}cp#cR+qM=e-NX>Zy|hg;!5P5bK?rGg6ME^y}h}@`Tey&#i#6N0h)sB_d=_v#Q+7
zG60aSV+il30wZO1B86}v<>nUcjD*EZ=SHRWb;xA^w<<~gq!ff}Vu~{<2e#0x14EQ%
z*vr-maHn8=FFhj6Bitiswy~ng$zdJ4hU@jB>18>$B<Map=|Zf9Wv9bvUaHEf@f#Oz
z4>Hn9Ecv}U-q0w)G(}MxrUrORQMCSlEuMe<q{&1Z_`dSqaJSPE;6+Mk2@~N(!}ho5
zC?rml)^RqJ4DR7|%nc>x^GLN=F2aedwc`u-Qh1HJ_1_n%W_pX&<<M1vJVW5r!0U#u
zI%N(g<Su<kZ-PLsINWYw2inHe1ms8i6AG8W7Ik{-&RYc~vW}Qv{zEy0QPaRDO=eT&
z2!^Nu==9)Fk}Q$MYbbE$l$X!~t@g7B<a7Ia-B`zkYm{>zi%HYlBeTPA7Zfn^LyiR3
zpW9?K{HF@Scj*{SGW@z0!f*%IB(&UeRaeZUuW5S|C^e3jH5WJJHQ*N<<ygm`d*2h7
zvo|k)7PjE1UA4{p3>VlJskUaPGPrFc7gTM=7m}XV`r=Av?{Zk}{F4IS`PDa5W%VVO
z&}P%_xs!}?3tt61x4G+J$v7~qa!L1G#$h6QWaVc{@@TBSDKWp@)qRo~y|6U6)NPIH
zg<?i_aN8#HU!bh!py6I7=j@2lyBl@KFp^3I<T;<6o?sk|lCRkng0g{9gk{gSL5{~3
zX*+8-*6bipHsXACVrdZ@%i(CqBYskjfEi@(j>F!mi1EXArH=*j_}`4j2$2(C>ZWlZ
zK6yqE>hb5EEXV~<i5px$Lr)_^R=Ttzud1+airz$2t(;BYtsbj@(YDIn&#(gJC^L$D
zO*<}7;iGvuX!%E+{Aru;&|ll|t_BsK?*@W{#p{{X)NYQFHaZw3cx*--i4(3VB3qj{
zw|*Td=eYYNbOQ8_pX*-*EZkwfMF4yC<fw3bkuJ+M$WgtnN;9g&)BNq#=eH(66pTI!
z1ha*I1=_!YFVu`W=?2M0spS5^J3M{pHodQQABA!fQ;?KtZGbr~)0pj=nim<t%lGyE
zK&8D$J3C|wpQ`lq_!e)VnIN~bW$YC~_iI3zU%#ga1n<6IVR1^`oh_USK>E<apObPz
z)`><Q8t-v2T1+pCj)w@N=&4%K9vU86C3*-Zg+3@m``$nrMqqbp2O(tz`hDLV^llC=
z{1U1pDb47Jcij-=dfD(q$uh{aoD}%N>)gQoLUX)yG@*#E`<9yE54&MsKwvs*cz3B?
zSAHuq@`ht!<sInF$TRMBQZ)CwC7K!J%EY-jB&|>h%e@AVk^mAw>|IcfbDOCggASdN
z@o{S9?T&0?tiY@tM<rx9)&Rm-B4CE$NLJIE>g&;A0SkSA)Pa`!A#jZaRcC15Kx5Qt
ztz8$MHDgrui<$Q(ezsD%SG~81lW?{NKbZxal@AW~Q-VSW8|_zdAu$$?EN*<FP3VAG
zW9T!W5TXf-$X-LZQ;GZ1U~m9s58TN^M{|rqR&$Xlh0X)Z${c49;jf)5;VLsznK6YX
z=43jS^G%{VJ<ez|!2|lbw}jy;s#KS2Q{y=q3`Af*a+E?xgeEh%9$@Wu*I6LB1ezDB
ziR%WPS3V6pqO4Kjz7S~(uMU+u2~+gWl>7H+z~|`YYl#(GlEtz>di-U}l~M#=<(|k=
z=vL}DOkHA@OcI5h$rXU>D;gH{5?mSq)bvk9{=Z^pzvygzsEKyNK0`Z}iEt}(@gdT=
zuNL7@U*MG4I0raT4LTC`88?J&cO;HNXr>1{*GK$a=ZLU(ADx$RJ&2_9?fRSVhw*R6
z^L=RCV=w%&$*WFzD_p3dfH0>6`5je`=24%G(Td(6S3t1*u%?(~)^+bCdl`)iNVGMn
zL=o3`et;TIkQJ16F#;LqP~XX3bwq5emy!ww8*X!A;5%bFhbH7+-nmp1@~~dgU737R
z$@yvbmNn4Q%@dP&bHyw1+j_BiNiTGgY*xchQufqO+Upi(GS$~E4hI7|7g2AJRr`f2
zP*%#t7--)bATTa$5!P@7xdnAdU;lVF`h&A=rsHu`T4qBo!39%j@YKJ>H4q;XC4jV{
zkJ@0-nftkE^;W3i<@OS2L<%{arzDJqH=ZjLhR5%UqfTQ2DDr0fJaPyo&RjdAirsBM
zymGYpq7~y#eDo-<>e=dD7S`@rB7I`T>_;N8%Njo47x`F}V^li2f`q}ITtPk~UdXhN
zgr4_~Ksxf%rmVb8e}D;gA}HZ|MawZ6NcN9J+SPSGpgC4$Ndc#!$0J?e3&gUl4K5I1
z7zJ}>;(4#p<$FlEX+J$O^RR%ZgAUrFV;BGK2WYo$503)Votc1+Vz(XEb!a%f-uEQD
z(Aw;a3DqSc|8fLjl&|z8iaF6oXrbME7%Jp3y4RmJ%fUdrm6t<^#_F=nRRYH53C6A>
z!??*5V_*sNuhYbz>DZqsnpZ&xZ<}~xim$!L4Q81nS$#eg7EX!`p^w$TeQ<9=vNq9}
z=0jOBq@d^_E22!#`w{8xkDB$C6Iw!g%RIyil8s-y>zX6=50Kl;BUbB2jgab^*BKB(
zr^M`GaQAwKz;Bi~YRQe|28CZ&AW!YTE~bYwMDn&Ng&tZzRT^Cw!7kCA-Y7_ObO<+d
zl^~JnYUj9Ta{p~zVLV^Z=O%L_ZhCFED&@K?bD#d~$uzOhTz_6MoWz}$Qxs0QjZ{j!
zaHi`(9=(yAa;#m|$t9h&2*TB=Ye+u@$Gs%6?i)nj!P(Q0d-R<0vV_O!nhGv3fn4Y3
zSWbB8g|@Ey!$a0G-=8SAfOFc&tlMe@j4WLKOi6C?yZohLhz$U{mJ`C+%c3mt^k$uF
z@I>Xf>q>ETb7+Fl_cDi$<u)MfdO9wWT>N<9GR--qOss(T0G9JZ@%9T@KaZ&3nu~-;
zZire{=#%`x?%iNOSyc&rRv{5X>2e5Na*H55;>W|=`2bI*#{>dSU*I(7-kl^oOG~2#
z#?`BFX$xD%xj8MaUo?oGU$%@u_~{7KuA*N~KF>lK{CW%L*8Gv`#lVI(hh5WXKaES*
z8G1eAXW(2n<dgNQ7=q!0IjD77tF}LqPRPQo#sD__6t*-}-Q1wYRWGD38n#*`x{y;k
zi?`N3O57MENzGVz9Q?W<igwMt6>XU<Ja4?>NwWfftsgU6vF-yPISc5gpAiIMTl#mK
zvv&g9C(DT~9y&sx^t=iVtGVql#Qz{v{a&=DG^qj|&6r6V%FO0lnQo7HCx@DZR?{yK
zFE>hexb3d?sxhx6-5`I!@-}x*D3dvrlo8?GkQ*!xO3KW+NiYsk8}$i;*ficxyAM?Y
z^M-XURWb`nt7?CZAjH{><d&rKo)jN@iHl?<I30&<m;sq6B{h8<Z(vI&#b?IMR@D>1
zfwqTsYUEKLC)~sN7(zZ)C_9Ih4wFdk_*8Xl2u8^QP|Kbt$_G~Y0TOTKyGDSPncqDj
z)M&v<QPo76VXJ!d#3L!xeV;Q0RcXTqJ8QDGFF{ipJ(mn#-h4=qw_h?)j-4Zf#6e+t
z6>4K#M2?>k^re-1g-wP@rx-B@#3}<pJLMN(YF}%&I>TO$WSqH@qnL(V&SiPAwo1X*
z#v0xz$9dTho?r<N_-)c3ddW#nzm5}w%PAMw6ql}aTHgU>lqiC`JsyP_4dVOpS`zJN
zZfgDw_C$+UB|?cMh_<_E3+^@e3oWf97ta#kJt{GDeBf>UaWvKtlP@Ifr*z#MW3ZY+
ze$WBPq||jmkL1h;&$Sm+84dSt0=)0tF?MM&qY(WfHFSQH2_Q5d@7~L)nP@~@XRlen
zhnbsg2(*nBD85uo9Z|3cL_cA@yXX0{vep?P6Vtya;w1gcug?tWAr$4(KF|**Kuo7M
zoa~L-FG7ya$US|dB!L$0sVV4nAC8f}SW|v1N_A9M;{e+ARi~z4Hd{Z<N<iK^%H^G?
z%eWoZj~6}SRiM<2N`{mOemF~bH(r#<6*FR2-F|#k{xXIjRJn~%HzRK#hp|_dF}JTD
zl2!-0=OgAfbttZY|CMZo-l4O<ts18&){)nEB=D$dfkzMETvI>=Rn85&#5n<vH|{t(
z$p`v2)h0b1H{S5Fk>4tfX7yx^0OJ7t4Ks0*eAH<m2vCQL=!tw{yz?13|8Zqmh}qMm
z?fs=?z&K<h*1clx0T~DH7pJ@Rg*j_y2`|&f$<%nu+8{ldG}-HCoVA<-gLY>vlL6|@
zLy*8Vok_}CQqxbrxjqVbdC!aynhm*&2?Gqdnokst6B=6pJNX);`Ab`Is$sz36Ow_G
zSrJYq&O~yt5C@GGV3uwlc{xch=Yv3;%N;fzqH_qdU=i}VRMkddBTY0sBv_`n)zftZ
ztqx_ALU`fI%s)^n%)T-9qK2_JRO|^!hoW!(i*-jb7oQRYH?8Q2?+g}SVR4g@wY&4K
zdRoeR*RX+W{rH+k9Oa_DR~%26215^Z+PIA{z2qel73Acb(Qk_yl-r9SH5>B@GjtnM
zce|-u4kvkyPF30hh`wkNcC{ey)w6-JY3bP*tSgI9muHcspK$ZxRWrQ6b+ml9A0Dln
ziEQiNm7MumFwg%2-%vebi8r@=>aXL1&gVbyl7tly&7DuN@E;a=K`nrGHo6OI7bIl7
zWXh)&H5r_+tvu|Pphr?Es{G~Mc}3t<{<`hunic((UpI6hC;0DWapfz?``yQ0g}17;
zI|5E`YYEDsSz;+jQ(7ItXTT-kJmsf2wwR^p0BASAw2Erc3fbHG?OJHr<|=i2)j@r4
zX~ct0qG5>KtbZu!3ZXthAaGN;s2n{E@00)G|5i*l2o!yQ1MP;5=WYJ~5pp05@ee}k
zMUsX*8-NakIWRm`FAaDCW@^eLiJ``Ut$->#@j`AGhS!n!nEFl<mqFYiZLeHp%qXNq
z;iUFsf`@))M7=?hqHdlKqCH6b={jJ1%b+sLfH=9gBsab&kEsRtK(K08<ssvE^OWcN
z=R#4%yFg}&YI1j^HjGUTGX5}Jsl*Ct#_y&moh-T28cA5;JMRYQl@~eeka3S+rq~==
zxY4T}7v)5gnGt6hz((T?bwRq;tco6KB7DyCUzyHHlK`|ZE8M%ASWiOdH`8?twX+1R
zoiZpkBKR^ouII?=2qkN$w`@=Eu8+nDY;QWxdD)=TaJ{B9X<v~Zwld+LE@zW!Mwa;k
zEY5p<=Xfg6IPGUZPF}f7noSyMpK=ApD*v=h`>P=TM`=EUCmqEBEF2#Q_=zy7pMVJ_
zc4VrljrtgpPimKn%$%g934oiZ6i6WD;O5s0;BdY4j?B4`7u1@u7j>frRPe~B10tZX
z>aXf74i?eiya9F4HK+{{+XlRB%*1}?yRPMFC;#d>j~#gbz2Hr!eXVGDj=&MrXa-6K
zg7~aX=7(dH<Fc2^Dsv}|S^A_wp)f=9S`-I_Z${qcIZt}Si-lCQ0P?w~&2SA_K<;hz
zwx-voe#xaEePh))?8+9U6sKw>Uvsi^_7WflGTCPXr!K>p64(C`FQMwU>hICr0mz9Q
z8?Z)<)EuEGM=PmuA?@I-?OX;p&l4Q#Eh9iZ6pI&cvrRDXdG$@tUy67+w~3VXm7>Gp
ztH|Kex_VFuaRtBS&iA0}v;PEWo&JFOQIt!ak+)hj<6lc77w{TR3$PH1`ZN3cn+m~2
zx<#chWG$%xtRfHpc-YDvqL>sS(7Xapj(1)I+I;gIpdC8oNvuv%gc5|W&$}=2FNDD9
z)h4grUA$ESaN@)`Lu77lLtC4)@)xk#iJ;~|n)X8JBBfdS3qT;avCb{53prl=7uAOZ
zIvE61Ej;tA|Iefp)JMDzeXW;6b%+P;Y|NbHQwvi}pxxOX-IJhz9IgF^*(yd;Cw8WF
zgGCd}JRT-qbI(YW_C_rV2-mE1>N(OA_Y7X#2HAxqIC=RdnCa=1I8hPJs&|%}qc|?c
z@@hjCba2HK-KNv)kE|VX_}Y_5dt5PN)coISC@;|dF^!=*J%!<4)TI4TCFxM3nzNt8
zC_oqVK1YE1UhB!fTzN{q?fWqRNkgl`lN5jWwP|pjtAG$CLO9S<?rK>A_!-F*`RQc5
z@!8PZ)KxzyE&_?5a*?R3uqy`nImKai`{9L1=O@H}Y-ny^#c`AQ$8h~MWlC>w6{6em
zdYLVspYoV`+?)g6IFI|CN?Kb$xwKRH3eY2Z0z*(`kg5IiHItM8E++iJ?thph0-YG2
zNV-Uj4DGttGJ9R6k;<*US5_0iC1*}5CDmu3aR~4a5mK_zl-T}z&jqO-wY<4szm&s2
z|MmCh>1@<4pCjCK#S7`wLf?TBrS+wXFJWvCz#NT{P#$9Y=bZtcP>OJ$DeeFCwnQ*j
z09|XVo*FyC(>TH0Cs7I{@dO|IS&Db+UoVD>Bu<I134PCufBx&=pNA+`y#w#)Hl+`J
z5>2nMHT^P>)MXd7ocNEw&!%YTFmX};m#N@9{~~}clL0_m+gqmq#DRB!zR9&c_ACBh
zIdP~21t6a|`Q-io<patlyx8g*)I|Zu#FK}E_jX$WazdUme=Ya#R~iaBv;~NAy{7%w
z^Zxy0ByH+_u^=!mF!F2?%zqW;2c)jm4?{iw?Hwt=15^^C31J5*zJLE1%D)Dcz%Amx
z=2S9(%1Z>gmCDJGw||;<u^suW_m6iCu5Hc>zO4DuwVZz&wEg`@HplM*+pN}?$u0vW
zQ|&=(SFZ+aEDGfn0|C+Be*~s}I2eAhQ_sN?^zV@U^VigKi$GVA{Mx!9>3aC#L?H6d
zFwl9l{MSwIKgfU%NDK$fMHx>0?<f2B(_XJ3J@}Ig0A56TGMH{&UE~QfFo2<FsR80N
zMO*=z!f?}5Ky%w(%Xs#hNP~s(f7?*~`)k2hacP75NMcf;4EccBJNlUoD)x0doTBbi
z1EY6#K&jqAa1Tw--)^chd-|=VN=N?||AL*%m&Q>>Dt!LZe^@X4^ELhPZYbo+b!zYi
zNvhQDR;i+4(#{_`%OYkX0*Ch2m6Z;K53GrLe);nSD;m+J02{<R-FUr@%9)zkji2AD
zzrK;u`0v3_Hvu7*bfGl}JW*)KflzjrM;X*EUmAW%7Ia3UivM>0Rg|N4sc3Lfv*I-E
zFTpyO{4b~DcYENBIHp4xAx)-Nb?oITal7RLrKtjA&--muXm+@mxYHvo_!uazxBJ6r
zZx9O1lXEg4@^G9}*P|w@zt`nI-htu;5GC-XVmBDDZbeXq$slS(&l=TG`z)yh4>B<^
zi>?xi<^K#UQa|<9E^u_dUZeAj)jTzDSnA&*4odXfM}hUer=r9NBycYjFVgrvQ#$Q;
z@Qe%b<odH|(<+?i4T1MX-b)KPx6761#_qNC8#;Id8~ViYZr*>~EPc3MVcsLcBjv9b
z?MVY&psQI=y=tf9$#`$yHTTcY{unuPiW}<DOQJas6w8l(!1y*XSbeD6u6tY|a?`Bh
zW!zX|#nQHfn<y|icc~plr`GzX#hpH^iTY^EsJZQGL;HC+r)I6-jY8$--!YdYC<4Su
zs{`Bxl}Y~eegAEJ?G*zaa-p&(g`b+8pA@+m&6%}mHGE4a=N|i4%^$Nv2x7!7-Cf4Z
z3bfsyf!*_^T!`+_+ei1m;5Q8RcHgo7c_kp*UaUQ!SdqhC$ky=Szh>z?aM#N*FFM;0
z1$J#Q!FS&dpAd4sc-T8~L;}~jeaJr&#cJ9v&-!&BlVuR)v8XoaWV6vA--cf^SV6u1
zE3#m6R5h0HY;~nQa<#<S<A09jpQ`|^S3G!@wkm82J+(9cEd{2sBSm|cky7Uw-jzKm
zRwy4!)yaKju!ixiP1Jgo<23($zS>G~{Y+=jskUFi9S!>N0<sJ8j}~G`x{g0CuKoWl
zen}i)G7Nl^c7IM=S`%{SD?i&IbtSHM2ET+eW>vWe0h6V2eXkJMbOcY>wFUQVjK|LI
zIQ{8tXNA@WY5x_u|9)L02k_O>^N}9%w5720{<ynvOvF2lqnEE$yCf>|yf=)3?Aa7-
zWUkSxC<RG;X!!Qlwwzt&|8E0tj{`4;^=pYE@N#)q^f|U%No}CAn#J!0Nw@3kB(0c5
zdj6Lf&Zd%WJ==3p`Ry0rn+`kTc8~reJpbcYGDY1o?D+mEH;j%dOg66K*x5tWamVd#
z7S*`fX*R@$zQhEMbc~uDRPZ?iY_sQxHBDdKDW22+e%ycm3Z=LeGDVY_ro=*v!)2E;
z1axzT#eoI--f{aIix&$8bmb61v^#DI|JTm@`}@7Zzys@4M5Y|1mZG6u{(i{X9as?v
z!NgLmeP3Kyi6u70l&O5lS<JTi{~Mo&vQm6-xaF%=(@#^k(k@TQsz$M~q(%?SR}}ap
zxkt}|gl9l2eFUYjCiSUoqjS{d$%A*g{=<xCh5su$|M}pG6sKM<o_NJ{=R-*>6Yi$Y
z{isvU;U7ZJg(%w6%z3_`pR%E!nYy<_ijiyV(p8Y#YA<!Jj0>aXH(X;Kd-vz8`*U&p
z=SCM=2*#z0n1E&eafr(ItF-e{6?XXo$smSf7JRuJ^=j#chq4TXR3`BF_(tv|>ec^P
zpBFhpv)Z#(O&%-kj5lxHl$mZeU1(o1d8)AS^`tMQT>@p);3i-riz{MixF2#6CPuOz
z?5%KidgeyA;x-G;3UM@S$G2W{_Sl=6$lvT-D!YA}a|OP+@TKpMn@@ibvTGh2->kA?
z{M#@$bdAMr5MOQAk<X<=9^YvXncLg0<B}I7Y{h-w`2C-z&QeOZp2AYG49sqW&QWaQ
z0?YmVZqYrjgF6k&W*4nolAvH;1xBD^=T@xSg@qTDWr_$RG-pr#am&!-u!@G=RD(Tj
z!HILvC^8xFDBxtFRF`^AxFgR~b>98F3H9}Rp#$HL9?$=HOaypLMA5MZvNy0)UapAq
z8G1Nf+UOwcsPJw)e6X;Ze9h`mm1?^ER6~JJeD>05a$wa&{MQ@5Grg&6ltWao(f9Qa
z(be54`{uri5p)%n4CGt{_Zx#dZWo#-hv$7BnqS_PoUb34hj_gCece)3a9zitenIM&
z>gf&lJ)E-HtD}Sfw725cMM;GA)<YA}@9dmg<9Z)Q+WUQ^csuOBCgcSs$oiD=rx;Gx
zrj&F?@@BE+?+*sFxu9Pid`KVPUUVBh1Ah$(NO~lHbV+SU+F=ki9{9a1h0**Bdon%b
zXHa3BQDorng$jlRvAX2cFG?*=t->2OOSdA@*g{WP<Fk%J3zlNv+IC&N^3wI)GES5A
zO8g&rBf#0F<iPeCj-HQRGi$A)p8I2v_h>7wV~gKDfiAfuEb6c<#KEQ-=;<**BQxnw
z`~!z)6a!Qd7@0`x-yxk|0j~YkC;y>hM#%OflX!@XOSP~mgPbnk14Ty3^I-Hyr$S}h
zYy8&iP5pje*W?h7|0ACb6+Hx9_<1pPg+qnx|EJ|NELyQ7Ty|lyRhmbj`njihiHo?d
zI)2XW$*xmVoAM$so5}n9hZ>;N0=)g%A}Te~b!tE49SpTx#^)0H0T*vl>-$^~GqyBM
zv^z{|xpU(ae@w?I2vx1z$uVqU#as&I&O#KKX_kUEOR&nG3U-CJxAW0$1*W~JpfH*g
zG;=Q~NvDnBcZl$v2b&|)8A0yFU~0dMfy$vNvV-ZGjJ}k8kAx0YEnhu1*R|!?bo}=U
zLW-h>Y)%D61zzrqI-GKqa-{>*QB1Ea`-G}%Sg&JM%?j4~?$_iDWkxxcj2mw24T@~<
zMG>f>s8yAZX=@cR*EA{N5ZVTo+OEXjxDo-q^y4f(Y_FqE=swlnAM@`_ZU+@sgYJIS
zdG*suBl^+1jNc*o_Yx*{J$+6@7hk}d#wa#c@3VNR>DYnezox&Oi7?o2+Jesq1J`81
z#*nW`ZE{e`oy7;=X^@@Q_?q$+;}9`{MP=JGk0R!MZR8&-!v2YaOCx<w!W8~}>{nh`
z$nM$Zf?t{62$Nb)hk^-iwo-@p?_7tn!sQypFB44P#S7~C^PE9dXQ$fH(>U~3IClr0
zaN7T-^l@rGC9L}qt-hS`gr^7pxkiGCGUXPky7LRW;$K5AhbNyv_(|{FnF;CY`ep^9
zmR^S0zdh!x)U)kcD}oa(WnQJ90okS3H^gL$HO<mJ^p53+r5);1M$-{m`-=!^lNw#)
zriGrAvX(#AnEA2zKVR5~qu)1WC1ZHv{FdxrJ0WP;*cX4c8^pwd#wbRsD^iz!&rku-
zHzcPx;r>C#c*S^48TZf>q1l1ubWPooLcVRv)EZDr$NE^nR~^!3Tb};M9$0GeZnsz)
zNnXoW(|SSiGA3q*oC)jtDLLbAExh5B$NmEneR}^{$RNW&2GaBeYaJ>+<T>M9qY{UW
zH$(bFf=OLU3@KQMT|dYy&*U)M+*bD=*~)V+*q!tU3DXN^A`<<q)N5DsfCJ##p`fLp
z=y5E5!m7LM5kNoopZ*U_G?#(tm4oM*A-Yt3zV6r@xF7;8{Y7;QN{PWpTB!55N?*=7
za0LF30kS73V%yzII9}|&Nl?@^yb?ca7>W*4aM40<_V4{L)M`7kzubkM2V24rC;R1e
zSJZh78Z45M7qHp&F@c%6tD7`M-&}cgD6K*4a}}Q&S{w>Qq8t09e&61oBc#ZDs5mG(
zc^o$6V+w~hp8F(9S=i`D`rC<fvNyM|#=lvXp$ov`x{JNHa<)yh`La*Ybc4_=?K1&6
zb6*+vdhlJqiEOEK>D-h-Wpe}7e;ibyP(C(R+WEG_=#8a2b&u*$v^GqiJ59KquexA3
z3ENo?Kli(2gz8Y>fN7(O<~Ch!_8P0lvzs#!p0ul8{q*(Il)+}_dvb9uv-i4u#oW8(
z5QBfphk?S_m}}Oq!r{;%bmzSdDe5Q|lAU4rVfq=Oo-dZSeG=)T3}1Jc^S<r4|KRf%
zK%~_ao>V)-^Q8N}Aaq6pLgjFBr?{bD-`SfKO>Lq8#SwI6EX;og&4JKYAwitH@0y4A
z#}{O&4wm0rJl+O;N>+Jj%fzP(d|<M?)jn@k#s_~0XhV#M+cfWAi43ej3~C(fkK01k
zw<qqKW^*-PzPEN#eoN5q%>kkPtHDgDOdH$Y0_im$(0r_&@VNDRaBNPHBs1n`-&*uG
z#BioL$g%!@;&c>s6sjhUl?y-M=P66;RTUEGG@0nzxK(NRIl8?{dN=y(&i<ML5HiNn
zRK@sPF(X1Rg>#rEPrfay-Z?8m2>`%86yl6r98=SQv;Ib3(y1xuR*q<Dd9=CRA8RcG
zwa?elAXU@vFE#_pAF$1Gk>sh%2r<!pL7(DQr#9aR{$UwTTM9aarukGM&4cnaLfQ8c
z-ep-rHnWSGUC--qg7}-d9a^g25U|+sQSwO6{?USDNR4xg(Ye)@BIWGOaaL&mp4RDX
zHudi$LDGJ_qysS_ZNN3}S&&sSakZ%J&Bbei|5%4yOFVUsuJxhhG;kaPuEc(oNw@vo
zM3Y5bFtBa*PVSD=+20eSih`SNrC!sW4X<Jdi~GIjeCCF^;v0TNOW%5TK!7~%X4mlv
zvIEp)l7Jsvk|^s}+z;xTEf0?Kv;b%F>`x2#3T#=q=3)~gY4-oW^i8lkMwpXyq*F_8
z#mMzm++vK4P4$v=yR<)Wwscv6T=h0qCQD8yf8WFoHG0_aT?6MEi^*~u`u$bhgJ#CL
z7A!m-MU1BHm)$;{wtHcqD@U~Z$YN+*N3p7F0o9UC$xm$b%<9`_g3fC2H4SEPTA_>#
zf)Z0i!TO&sgzB)RtY!hp;b-W6Unzjvf%Acg`-}G2-{vz+e9DTomm`pHgUXy^&Sew>
zS}xcPt`%gQx6D0svF2+t!9!(39$L%5l!QEoq)MZ>m=#<*yLOsMecMTpE-Ia|9J)U^
z)f7+Q1VQqJhQyVl=|Odmj;<{>TP6%~rSR^5Zg8m6A|~;MQ<Lqji@)Rd5X}&)`60JT
zjs0jOaHWrOi@z>Z@KeiF=-s98hk?BiN`7mKXBp^hk`u4mu@McEfViQR;!;ilA*X6j
z(|$kx;;LP=3BaXWj&KIRN(KN#Gg#<sCMl~rL$@_W&AisQJar3Lt%3FQ4xpGhlwI3C
z3W4t4Il_6zL;3eg1QMTt2=0JBy=WQJ-w6`eqY#UnRb<N_c@~0N1JTzNovVk#_LtZt
zYsB&<Qp&&-+tE9~lKaj~ckaV#OPi-EEr8-+1^l#%O5cB<q+OwX1Aea4l%tdY!jCW(
z=0jtSGkZVgV^jQIQq(juaP|~<H6+S@Q?<2J{_FdrmD?`emqQ8^{G<0b=>wrNN3&=G
zX$KAW*gWq2ertbZ8H$;QW;QckM&m{zUtO-<@qkYFfih()@TatbzTp!E#&cInI#kcF
zn8bNk!M?oRUbw1!|Jbzm1mf7n;wie(bcZz8{>1>Z3!?S%<{R%OpM$^+jReQJ+$qfT
z{r64V&0Ne6hTV+ZqJKvei~2TK7JIY45jpAGzZX`*d6+8^n}NduZEqj&C%LL>m7=gS
z#yCMSss=?eLuLE6Kg>)eY)qZ9mTAEL3@lJ;bl}`203FY@Js_*P)RhzXR)uu2mfvyz
z`9O6&MAx&XKlzHD)CT}s+Lm$k{WZOq_{_Pc!<pW;!le~t{=F|Ior7g9Rmq72Vr6c<
z3mKX}KiQIK4Jx!I``rU52q@q;;7dK(DNP|jnM!j|{>*|9)vzFiuu&9S1qvRm6xMoa
z?D&;V<C(1X3a5(?E7mLm4)^pq*7fWMYB7kSMwgGz=$SqnP~e|~@A~(}+Mu->-T9iA
zv&syJmH)JGDos%AK*{-tzOg5A5Qn!$*5BG;fo*sFEeh7$e=D56h0QVFD&EXPqvXR0
ztrtKY^*I0(A?{4HBu)UE&|JOkb5VmX)vXkv1!nzY<nV=8K$3I#pdI?n<5O2M_)p2I
zHd*UzizdrYf;|?iV7iZZe7*>}Hl&@@>g9eSKBW9Ahr!11cX+2iq;{~4^3!(x`tg?0
zzr%_h>}=Jmz8w!nw|}kVC?o=dlZ)#ef~2cnQ?Px%;3&zFJ|SQP@o?~kxMO@Js7SP)
zdn9!GFe68be{eo2l^XJZ;wVuUt1w^M)xsFO@xIe)VE2kBz{M%8F<yY?4E0mB<S{E?
z+$0{jwZcDqshxp#>wCxT-;>l3K%YSEmN0MR+P5;n=OFeqq3)9PxYzHM8Gla^Db5)f
z-nAxD^nuEg>0dX0fcmN8(_6YVWOTq5g$17i)Dnq>4}!`m_T{Yr+&_t!u>!5e{%pVy
zUQKcG79Fi#J!>bUplX4hzHrs4Pvk1lYp{g@gZ{=jFmG00Juka-J+0He|M$jt5J25x
z2cBwwIBK&=^Yj*@@BX3n0QLN3Me($?+WRR~l;!S^B#+iBA|xm{8I;>zUr)`q8=krq
zGyG%UDOPT^yYIcR|2(lZ^azLj8eklDmAj7SM>KoTiroMXiX0-Ko=fZE`2V>2%784l
zrfWq|Kt&`KDG6y1X^@mIX`}^}4!H@XLr_9WQd&y7yG2mCI~8f9BqhJOIOlo4_j&&v
zIdI3``<gwoX3bhKdcOb@rKYX!AGrJTL&BuUooKQLw-Hq@{<=Fhwd~oA>hobcTG5F&
z91Kixn0MwKeQ*A4(upi!{f=Z0r3G;F?$tR)3Zt3?BOAm?DgXI+*NJ=Ikz`L2lN`Q$
zc6_jPeTMGN|FM&kChlD<Ba!eHjLl@V=#1wZpl{xW$hJG~JAU{Z7}GD-V%Mlb8Ixr>
zTz(fxy{v*+NWpA+^*z(QXLHbs<--0^i-=C<l?^|bIUKI&gxC<i%9-T6l0|<3AA8^Q
z(Hib%5nVjNPEnFw3x?Z)w@d!STi^3-egJn#n`blCUyS}H!P=}goA!S@UZvq8x-N?K
zUj`L5!=C+H%l>n*DdJ{+HhTQ=3;D<%tPLk9IICaZhc1)NV62~)2onY}`k(2oW|}G1
z1vL5oh_4pI&z{5NkMWel2p5QAuJIMXtX#)=)IywdGVIiKO5m*Xa!;0kTvi;6Crlz<
zDj&-=BYkJRh4xBV$xP^%zM2@SS9z0OLE`vr1eyHBud%|6xv#c@dU^k!b5JTzxh#0m
za3)HR$njhqQ0XqXKrCJI_}483Ne47mH!5<ub<Gw(Gg2Ui0$oOQ3HBfOAes(n%r9a_
zFunwHmspV0eOMU+olh>7wEo=ht%CcL+YP@Hu2d38jF-!?Q=={T7~T|R_x29>QIRZ-
zM@VgS!A_E(<Z6wi_|2OoRK^l&@~tT;!q`Slq>aA=F5(CHUF<z`c5V<dc>H)*en0+W
z;`(w;eutc(s^0Ff-jJQ5rtofUhRgoC)2}H|GYt}#REXndY1NUZSqFB7P3|uLt946o
zjJb2CLY6sL#aHOJueuSpDLxvmrwEupZnjOwYuY#a_M<YNjS&_*Y4tHA+hU^Qg|cMF
zjXdG&aKLVFutgKX6oH!eo=AH>3O`*C`3G($2HJFVURq(U@9h`Ya6K=ee~_ZgE|j9q
zT<BP=Y`)qS$3P{^?*+RwIVM#WIpc9Q#auCoD!O(z-{tvivwwzWf&MUi+Y1ZiYHlIa
zEOoDc`BUt>|0=l0l)1Y@n_IwI8YG16nL8P$WP0~=kM`@<>?U0q1(`~3T*MSf-ov8Y
z%sSx<w({|FF?ysRg-h=*63wbd%A{U_IABN;l@K7GGxDAiy(S+U7n4HL-rWoEhm>aM
zmOb4=uAgE#g_<wH4}k1P9m;JOBA^}j8mXKY0+{kMDMHcp>P-8nzF&L+BR9b{ZQdAH
z&%-~$wW8Zqo+!6cK-^Wxw?Mu4X@+8gIT4%w@8+F~hLgM%cKGxL=US@#jl8ecbvbON
zA1?k#8C^~q&eMs5VEviO76xm47m)cn{RwysBAq}mma+eHUSH*USV8?^doHjOync-%
z-BRFuSVU=?XVvmUu(^-OZBF-qG~)Bb*Rxniuy6jx)E(^IKe*ahBdavdy1&i)ES>Fq
zi=>4wq6;l2eHDLy!S*p#h8HCzuCz#1xF!e(#rN#KJOvFA(d)=LutfLfe#l!#FhS6J
z6P*%PN$sOZZwZHWg?i$x>US5=DU~jvWAn+FzCI+IW0S#)8E?LdI;@0w$(#oQD5@}y
zjt3pgs!^wrAI|;Jele?o1C>ogZ0?JWs29sT=G34`l!9dAE3*-l;H`06IgY{N1aie#
zu#D(HY+#?($TB&!GklK8*Q<#tJ42}Dx-g&WWVMob2iTHYM~s6{;`dvDM`jB4NJ)(^
z!E8d?4qEo}uI_yj#2-Q9pKT)+9<_l<80kU6SwN6F7e97*x!jS_bd&oNF5>_pWc$;S
zE}$jt^cYe}HeRjyqwwZb=?nzxyC6B6jh{P!P}Ft&cRNUJo!>t>{2~FSIPE4pLx?*7
z|3)#d)$`PEq-senX}UCKSI~y;DlKs`u<_(@(CuKKpQ9~Ydi(p5pZgh*$AR*KmkK6&
zom$jusk2ep4?cp{uH0MR@nYYC?r361wT{$68iV*ez0#GOhHb81MHKiD(cxIc$3j{J
zl872{MW+IGQkc*PGa>=e-LN-dnons)Vv#T>z1g3qQ&D_X3uZ>F9f5=%&zr-tiO@aC
zMxu?H=xuuOWZM2_K^zK;xuPxv*L<#`O6)sstMQ2Rko!T+ZXt;1U2q{|CjIphP_MIq
z_B@Z&l068A&#On?UxBGo9FE%9EY&3eCvd+Bjte=f;q#wq<Knn`4IMc%*YQMtsJ;m&
z*$)kvza1oy$O%&E?k_sBJz$KU@OyvBs0?~lzI@F6i=Q6H-s8Vm)XrUIGgI#rw#2ET
z)IEppv-Xe@6+&Zp&+j7IJgw3a1|3z>pW+Oze-18=m)P(Le;v4PogYVW+NTrILmLEf
zIw9kVf8?IWy%tq`fHz6Nh{_xO!i^;wakv5|MXyf(&IYn|g3*PAcA5FSA1-Uv99{`%
zC@*l)evb`T<ZqMyfnScaii7Wj^qUhhK?i}e&Uu$nqwCP=doWoS2$q>~k3PLbN81mN
zMgp}xA1JToWu?k4{|3(}q?l1i4E+{$Pa&&iPMXHJdL4u@?Bc)9_aYW00%qTl8VphK
z9kQ5Fw%Pz8hVHwgnG#L8h*y!{r#x5KoCTOoegFw~l(H;i%S?*I)d85YskKn#{fSQf
zoy102yAUx48s=12es*1_fn?ur`q9qvz;}|AY1P&#Q&21FK3QaYHEH{f55}jz;uI{b
zV7Lj(&7~UIZQp&N*~N0{GCC3*!7TPb4c)?#WFAAWv`S-oR!y!&@jqOESJO`4i@4kn
zH!)CnC0X9_nuwcptp}l;+6cM6luwRPn5LJcZV;ZPD1<^GdzYQQQi@T4uw7LMl#@;D
z<nFM{7k|vR94yt(K|i>YqG1wcD+(8arwFcBdXzTf2DMLKF7f$3())_^dda~=ThKZ#
z_f`{-B+Ta+2hwE1s|DXDxgWNT82~TSqzk%3R*2)65~z_A`bz?_m=uwiR~ngMl$oQd
zh)JgU61MkrMYZ(CL-*s?h+?MJepts(w9FG;EJGxadVAz#PQ%5qR$1m_J2(b1#a#_&
zCmSE$vW?jCjk!!k$eNiVY@be!p4=Mywhy$y1M+t@KPC#g#3JTJxnKp<?EumB<>`!0
zH`vM8!9^a0r_we8k4dGM`AM4slWZ8}8l-rBC>e!embnFLp8IamE*KxverLD6XiP?8
zW<&ynmSXBIjW?onxZ$oOG~KuRFMZBs+<i$S+5_LYeYj3By3}GM-*T-_$qK*s(hz*k
zXr9)MhO=pBX9iZ1N$Rc8s6r_^^V%8F1c6uFh#Go{jHH)H87AeQEFz~v-}9(q4sGi9
z5|gSyTFH-*RbHmUjhL38E=ix0?GVaSsQr<^f~BX{gehlP!W^BLlV9B~jeA~M!5OZf
zE3aKkf&<VkcL?Ii`~?%!buI!|9$LLPTB~>!?+Y3jFSfy~gI_Fzo>j~`Zhmy423SdX
ziX!^9?<@CLN7!CI6nzyH)8S+aUKt_w+hAeV9}n5$u*F<r32?-Q(|;Upe!X$^n*(U^
z#sa}n;EFMAYb?wT^gWp)Pj@T5e4T_>jPR^J8{JrRT}0vzTalRVz8_1i2Z%>XuF}X?
z!L5Rjpa?U}jaLXz+rWThWT<&if_IA^38{Z@A?lC^f3&1sN{?6jrd5WgTWT8SbozHO
zLAMpWYGQ_Bi**AfQa<giO+@GLjc3`SA)`0XG=H<%G@O*1O;tNsrY@5EUoRWz1C{rm
zPxvSxSr`tc{qym`3)x!HzB}ly=-B?Q?Y4L~fBYN9X>Rsz2wKh73?It`1U*5vT7+^8
z?n6=fKb+Njul<Sf*#=D<X@)By-%3!^gyjD@{Rny0l{UjiO(~#<J!N|poAKPZ2FgKH
zzcNFe+V(=%HmkA=KW26vzsFzX210m|5zb7xp)!hHEIQ@Pw|gM@c|P?FnEQ1J1e$oP
zT?qms_G{xq)fKlgenA-LHM*VNg{@jjoLm(Pd<^|vr1T_3wcd5gDr%Nn?zL}J_+2NF
zm3AJEZ?+B_E6J~4oFEr;o+F!7H!nb3L^@x#YtZGpG!qr>-HK#Yc=z!tVHNdtutl<b
zmY@FCyZ-CcE3?xGJ+qr8!~LY}M95(?yX)CV$HEcW$opo}rPvu;nXPZt<<B#qhVfWU
zlsk~0&Sm$NnyD^+*#v5pE$R@_Yj(Kt%J~*n_#&T|+`CYtdxena4HOzavM3mj-xPkR
z29$*`*|4&ToQJdg2^yxq#NyP812ZC`J$)?suV);u;VCNz;A7(bE&Qk)PsVk={7AGp
ze-_5+aOcLUzhthJaIW`6$<_AOxx6|rfPtRbX$ksb!3PpOi`GTyojV1|s|5|yq;{R`
ze8N#j9l&3-0J<L9s`qN|r#IAS+SXO`@jXM|fCU(>si}Bhwn9BqS}AqWuuM63Q0m7v
z4ASyb5vDV36JZ${cgHXPhQ1~qKSp}{2kLGpS{E@<bGZY#S{(1f4LW~UFA>}c5gPFl
zPmkIsUaK>(>*B(QJAjvn=QPp}i`Z)*NNrYCZU-&yAG{c2KH6qP2Lv@z-nW*YmJB;7
zWl4?~({F!_s1}`BERWFp9Z36P*zXb&V4)?mDKqSvc8^5Ad0tVy_mWN#*3hyql10%|
z6e;577B*o^P^>39uP}h5)I{@2wTdOFYoa9@%B3Ics*Z9ZN-$HSI_-o!tW~k9T=?iW
z$9I({ci5{<!k>U~!zIMxU5VkMgZCOt;!DroS0M|xEQ4TSVsfpRZFreo))wv9N;&@K
z|Jj7!H|cjCR7R0f$Pym35+buIoa4WGg<Dj@i17|u^+4=zon6pn@0zYDUW+c+Um3$S
z=Hw7@SyqgQ5X)eW`xlG3X8wj@psp(Lf$>d*Wskm}Qi%o$ZGXOU1gvdR%|*m|sW7J{
zkoXSYfT>Ndf$mlS&UMDfQP7Ew(_93p1ysMYQ;VuKt&S{{+aCN)EG%u|(q`Ee9zdAl
zonzx5r)J#5O>Iz47aaJ+Llzq;*Jt)Q!FWv*EdZh0{oP(fW*NTPW`<n~1v5r}*9cU*
zL3Y?%#(WnVQ}5<9rEISBX9I?hU<tT+D0n#Do(WKdJ+@68M9yI-7aFE(G54<*Bn4e{
zbRAh$)efsfjIRnd<!<%nXtGJE6~S;ul$oUIWwbBo&qYZi_MeFPAgJ2xLBDIrIRhAR
zQ%dA2<|Z5lPql!Os}o*|^m>p1d(pL7t|3w>l2IKIfAB(w96u{2tdI;bR1mRfM^uE9
zSap@#=6x#usGN%5UHIP*hc~+Sq*KH?`x-;l@ueF~`k}p#uG}lD$QER&+7V~q4^2I}
z?-BMyl5NV$nx%YiUd65ni*C*}ZPfkX=arJq2OYduJdRH)8In?(SBLdpM5_PU%cy-9
zKr<S4Uy|X2Zbib+ukYgE>Yf^(0&BQj7wPG`6hOa+-h-><hBM#KowbRP@qFDNO&u#P
zi-02F_YWhpOcL)3%JRxKSWx9R@RkY8@6%63FUW4^U8)qFhin8}*We17eP|V2B>}<d
z@r1M0QkLu%ZVZG|mu@`7<)!goqY1`FePE*t)HdrlOH|kxopYZ*3-0+-Q=$FQ;@d?e
z`H6?*CnDTfavYoAFiCQC1f`$_g(3EW;V@Ppbue$!&iYhMa(kWbRxb{EE&5Jj0X|px
zABeLbHS)C8NuOwM+)U{2TX>U4&vu9yj@UXH@`J$zTPNU57IVV{<pksJu_i9>{zgAW
zO9Hf#pKhQ2CnmbTYbk@&zxgTz%JTpfN{Du9l1lmN%2uZDqno0$80mo@Ohw)m%;8W)
zufR<hQF{%CZw@z=^UpL@(Yr|<?&;URq}RsL?-w+jR*yNa=5>MRL}c`#EKVQy_qh8B
z<LE-Ov)^17KAMnVIUmnMzRHdWeRPLJ4o^A$kTc7h6cqOli&*J2C}pL^!DfV6E`}u{
z*2QI`6y~@i<t)~@Sk0=RM~gZPt=MGW`pJFvcC2uu%;hW=;Qa9*!!|L?OmYd8>o+|L
z@%Bj_M-4xz#LFC{9Wlu>))tFoc}aTfj!$klzVh>PHGf(RSPERp4{pMhd@oeX{GRlW
z@OfezsHKX{M)LKhAM{>Y0xGf?e;)G`B^GH8w{V2ZxCX4Ez@nN$y4>Wx-=7d+K<5{r
zp5eqJc37X}`-5v!HBTfJL#+0JYBMZGoT$gS^@ghb0~v;a=SA{a*Sru$c`%v4Fc&#(
zkLwF6S%VzWxI6qU;{Rvil~&P@^KtS$>7;j5uRgSdg_nvJNsVv0w;9H344Gh9P1vg4
z{`!<%ehrwNAsQ%_guJb8RZ0P=ViylEgz#eM^CANF=I0-rbn(p0as8x-!&x5RYn$Wu
zfH~1&n6ANQO{dx>N*4?N8X`;^QxP1$N~aiqi<e;64#wIDpIce{LIzGwBkT?DbvkZt
zLF)SeNesSpT;)rtAzl$|1BVE_d+&<}{a4AiuDF1*^E;G2s(@*ZRGkQyYt(NoA#SyQ
z!NKCu928kSq2~#v$OW&*SiQyWSj$zvnMAT|0kWTrzd?Jy_YxKc+7L}|fk8tfK9O7$
zGsA(Bes|@sYAKp^gfx#58_A-vQbF*SPj5LPg<gFAM5#}*6_Qf13gYYmJVrgZp&Y8F
z-muQt4=5xYFF-DlM=d^R{605@A(A?Ia(xYqKvzYK7*F2g1$nNOYO=)321$C2|8otW
z7r$^l4PzciJQSTx)OHan_9tfCdT51z02SaDnIpoo)~#<IxDse2YYn6Q1Z`O9YWsH^
z8l|oTBDg=@ob9h~e9PgMEjm3720w?RLsx)y-s4{dDa8V$^?4R5E{1sfoYs@PxZy%G
zFhn0<G-$|`e*Y?C`Hpc*K9+foR0w%Dd7=)@`+l!RF{<=eGj2SCUQ?j`G4`H>D|_j6
zhdSU+&36h|2or>&FZ>1<Pi%ryKc@=osdla%`BLhzkovP-n+nQ5ZOwxm(nzKg2KbWy
zd*%t^&_t%!r&e^_<Q~UcwW_3J`l9daNzfyz+oCA6s#=7hL>v_hSS9BJTcS>6`i+Ox
z>IXfs)2A!1^>&)^AM+B`ib~wrFmF(kn@3$ZgCx8Y?7PY65|dFt9FKbz%$M<$bqm!j
zcIPj;<z7#_Xh<RBxH%SooloFn&fvkf|5U*LMwI?XlsU8HU7-rK$pp$!Bhjy-LrlPH
zf1U99t33iW(Y1unC~F1x;BC=MfDH{SBkgC_KUZDBX5q$nK6k*fGSqW4yzbU`6Zva6
zj+=e1eZ5=NAJFDMt#ka&dajy!KOs3q?Wt@M(Kkcw^?uY5&UmWDRCSqh?eqh;LwmZ@
z%odL)TETMTB{vwMg3Gcq?bvvm=B9?FUHdCgKtU67`3}i^8I<IHG;*(kyIPZ}he{J$
zS5Sjr&_X2uLdu_+w|A`+tx)S*{C@{y8#%VaaV{VWeiK=6Oy<=gE4GP3jN^@LKe^h_
zU9Q%HRQ$~(HCr9Oqv@bka;$c~eA^`3Z?~QHn)Fk~3Hp}rPZ;}<&%G#NwjN@Z&B?e<
zxM7B4^Kd+rgu__=%fs2B_rh1*CUnz;P7j`yKyhJWc<_RZI;*-pApF~Z-<Rn4n;SL8
zEmhwW@Bj%!Qd+#=h!Rq8lj=SVu4uihW#-pJv_^rr`S9WJRBI^mB0g>gZo834bLda(
zKw`IfP99NtT)eZpR|sh=6daCHYQKoT9tiXb^@Cb-9z!kC@}t|WX@D35&S^1rFS|A1
z!snt>SHEQ&;&OuctJ+KVXvFf^SeJ&(vH-YPxI9(!>bc|WHfUB4>V0+^cAaR4scJ)h
zhn0wtij{g0%m~JnT^^~WRs0+akvr(Iiw-K~PfymK0__swz)N<`=pLQJ#AJuME)_+d
zo-8MtpKHoi3O5qFb0Kzzm)$N{d~L-xK1+l(XopEyGkH({HTb{A+w+#G%!#mDjdLzO
z_fuqNep1+&Tl9@YvgsYTvh}o94;c&J**|~?PQCb3)cQ8csL$WaJledtBz*`!<UA9&
z#ZJY|X*)&DU#GTI4~XV!C-?!x@mS}XB=i<p1uQ)e`}2+|$%!kFyX?7*fJry0#jV&L
zRYkt^OgZJM@5%V`#mQ0B5%qUUr*{(y;m`T+Xv^M+qTMIydJH3gsQb~eO!bp?k>B=?
z;{xJUWPdT^ah&5t|G9sa-h1wkeV?+qj@n@u)*unC@Fy<)%;Rx-T#O?{6$zs8b7fgO
z>UeAfjIu=zfWXG$EAoEPR!G6*i*g0IqTx51sN+b?%!m|6z^_lL<!BRo>*abc7T$M*
zYW3{CEQc{udFhxtd*GgDM3c<wpayIGff1d<C`bib5qh@L>bPi~3Q|{5{3Wwb2`HR=
zO*5+O%vK@InKAebX<sO|{`wxmBSmWK!_^IC-U#9Zv;cJzJ5xa|s@nn28QLCthA&5I
z6ydS*F&(Gg4b0{Agk~OF4?AP)veYj8r;Q!VKz^^+)Up}HGQAM?810@HE^@4CA^?r>
zQIuEtT8xG9x^Hi03_|icSVs-OEs-t1*yMY)3x=LG%cEC7meN55jZnb9uffBE?ut#>
zZ=T5FCB!xEnL7ZBAFr}jCNtXoaMZSP<hQ!}CA>j8^ycR~3cCBP1|BCb=4KJIL5%Ji
zMOB&7Ur_+gbkRqC6n}V{!j*RY#?<2>=62udPmkU;;?h`t5VVVbfxs`xOW+RAI=Tj)
zm}bE+ij0ig4V0P<@IGR$VjDAt#oo_=2EN1R4^7+|T~Z#3&gpHdAPhfxFLUwkQ+(p@
zCnqgF|Jh}vAV(S!G#AejbN0B4Y1VJ*0757n4f*c14)wth9NMuxUc{=^#6vnM$CSL3
zhf=f^tf5HnO7%`>zM^>Y?t6rByFmnrEJKPc1OFzM)1NsN)-rwSpT7Op7-X&e<Zr+I
zW*c|ucQt*lZ$j!JxYl+34tM9@EyRa3!n#Xia2jsna68k%4UM$b+BFHU<iCT7?<-7&
z)r`oosw2$T-rljk4|h^UnU;Sj(680^#{giovaVjvh_B{fn$Q18UnG%tXGAcCJ%n74
zpRE@E@x#7DJM}@K-1q@239$kI8U<EfX4%M3Z(PjN>vt&P{3mpI-LRm<yBOE_F52=;
z!W5qkozGX?o4mLfowWFc8z3+!MZ2YzLwI*nfOrKKFRD0ZDO<6B54y7$U+$;!_6CJ@
zM99cdDE&|a`SaagY8}n7S5YNPhH^%iY_iH5KkNQp`6~1I`84#E;_uCN-_#3o5f>`~
zU}7*p(W#}3kNW0Y%kVPtB*5-!tH*hl$IC3-HlydY6OKPnww3#*x#-em+M~(OT*TX#
zd7V0-7~nRfuE`>I?NqX~E&a-=(Jc9fL%=TBZx?9`YtpphkqnvfTctSj6xlN>(iNBf
zC0s~LXcoOr!r8j4XNTBnMd03y_7fTAF8}CozUR{17xyEo6Z2*XI_cuD`~HC;v(S%w
z1ja4Y`-F0JieEN9kgla1iABn#fDz|}*O#lhm(bP>mXb9?-xN0}W4(yD+`%HdSPIz8
z!meJD)8H3|^yh3ErHe1@^tbdYAQQSxMxbNpONbW7G}o@ed^zscBGSA(`dt#TtFl(s
zswIQd%gF5&f>YO~C1lP}D*PmuW$l+KMMq6To?EI_`PjY1ewY*`(0$x;h;`&dy5EX;
zh{qV6LL-BVOu(`9ZH}8*op5j6QzqMt2Uj~dSax6jk?D}XNryz}11S0ZLmU)#o|i0$
z|Ja_%D#LO`aP9A18A^X)SU>)_uDD|GQqG^@6^eaP`SMSI96F-#jtp+)=p}gaE~ka(
z;;@LGB%Y(*k7L3yP@Ky{Ihr!7njV|0I{_*?OJ5XIC~U%#jX=zpCClneLiC1zIj5N{
zby_%4ao5nJKy-3EQEA*y`1FSr+^ef``gyv)Ej=Udt-%TAqHx{xvil&tCD%HTE*JeU
zPEBXpoR6hVi@sYW$Yri{K#>C^7JVIJ@GO4AKY|I140uz6NLyB|nVZ|%W}+vb+NZQy
zeEmJ#xBq5Vjh}wv3GLj6p(0dPM|oAe#Ei^gFjG4HOqJqZZfU<P23xdXdHo~%B*xvu
zC#Kf+AuBVoAMXXup*jbE)v{FS(dJmC)K}ftmI`$oVeaCY8^hHNa9heI(AC9%Y1B5g
za~b!a+ksLM%}?=Hnt}n6zM3a__A`z7Je+<MWnG=G{rS@+IppEKld-qfp77uV@H>aS
zQ_J8HWnMXXYE6m;fgqf5v!rf=mGExJYhf7w`nAU{BHBQ-Zm1u79<G3v@RlZ`&x`wx
zg~u;o4n#Y_MUqNsyLYE-!<0#Mxo@&M|IvGeAFAfn+*Pkve^?BrxUp3=!MK0FQ?xl-
zJJWMz%kf_Y_RfdA>tV?!mU#|q3ggp{E8&8XcgZ__lX0l~eg{W<O(_MxQ1H!?`+mk!
zcxwRKc?cY%1aR)owr;sH)h3pt89rO<iYi;tJl?F(HCJd!E+8|z&_#D3eKbdfmvP|B
zSp0eDjbu#t9nm#n=O6uCcwq!C{(8)gCtIwS!4}L?U%ie#`u^v732(Bo>2Yxq=j6~^
zA`&;W`5B&-+$w9K+>fTT=rdxE4opB1xR5=EBSuGUD;{c<JNqGK1;n1SU49hL=edJJ
zve7GL|0D97R|Q9<JBb{-r>-b!U6Xql?t^FFFC8p|_+aV50I_N~kGs_-{YEACWtW|$
zo3cNMIo<uOS5_sd>+!P2$bZi`Es~j#O{ipyA=YozC0<JAefwFY-A(QkITTn-wB%RE
zYq!qo2T2AcBlDQ`Ye$_ZN3a?&t`hOadB&!S#obE}=IR>x%fm5RDZOopCxAR(_IQQq
zFfSx?Y_CFTejZck=`O~FeqUG$C;VO+#iFx7_VN$}0hm5;pTM2XiAb_KiIO{7M{3CS
z9Pn3~;(IQ6e%LcSc8P=up-wxB8xx<!WyXJ%R0&v|sk?NoU!-3<8tbpLU|R$^15|(5
zI|d_~3X1e!j2fknX?czu*TxBRox8Y3(APsuL!__NmJ+YL8XtZ=$FC+^^n#XtP_gKd
zHh(;miSvf8sW9Dp)9i?Yx)HTQaxA>hK@`H%G6tHtOTERU!IrCPIAh~qbwF=4kn8Ir
z5gK<`NTS4KlO0yaWdu8PM0)+*x+5-Wr}(tiZ)%?&v0=r%{&H2%8s8cBYWsk$r8mt^
zXi(Pc)#rZV<mu(}YjO;^JKsO88J51<nC`y%S|=>kkDLS?=w77Sa~}SBxROOYR?t3N
zDkR~yDymvubovJdjXwe=G4{6zoyXZwzcXcc;w^cw1IZ)4{M0B^@c<TC!ZpyU(~S3c
zcl(WEjH$J{<!62S*=NxHf6t%mJQRYI!VFtJ+)bsDUkPljp~)VTO;~_WLoje@3(P@L
z{WfC@RZl#}UBkYHb1Q)xBz39++KGHhb8^ZCSps@E{WlGoh%qo<C5to3<4C`LO}nJ}
z`>|1WvO=cVv%dX4XhCbJj0@-8=&D$=S7vuFCa;I`KbMN)%$0tlR$T7%55#1?I{~M=
z48e!fYzrtg=5N}E4?E%}%Q!>cV<5|@jWyaVNOCo}+$=$P7Aek@+|W+zhI2C({&f4`
z<W?PN9aiB8My`;PI=-DJ&#luSOJSsW-Z$L_OgozhMmZOTmH^VoD?r3jRXCy^=U^xs
z-IFg}eE6cBpafhF=QwlhH}LM7*+{0u3Ar6c*z?K;e;*&{6_wWMOMJg4U>`bA-e6fH
ziAGYiwP7Xans!OONBFEZY#_7(R$ovNJ5!~-)@hs6Ho)p^bRrRFm2oO+aw(7yE1QPD
z{}#<c|Ec2W7O_Q5@1pG_b*u#slh^PFd}lJ5n2g<9bK?cODNelt7_#<gqCGU$a!@F6
z%wr=@g&7TmmY@=i^q89m1eXouYn77B!t=6JWZM|1EbCujU3kv5I&MJZ?AiA<><^X;
z28HvFQo-D8K`awX?h^i?yC%1F+NmcFc0KRETAw^jW(QS>Vxgyqa$&GtU;sKg|6k>7
zO4;N22!g%4d6U2A+|zCyJ2td5vS^pKW)%aYZb6M@sK5M)M~e`glA@NaHo7FAVo#O`
zx$4ma#;q!DB72*|K$XM4Q)QM}qq0FVHzs81XhlEzFium6c0kiLcBIrtwD@&-LQ@z=
z7X-r9__;sC8<)F)sj-pP##pxGqB0Yv9KgMWSQyhs&kYWhp&o4u{5?o0vAi)lUaQX7
zn7nbX%Zs~?{!FTI^?h>&1`~qGUqGLw!rftxkBnfjwauVF)=G=|solnw(Dw7K{r5OO
zL566M4>xm13HpnELsJ??L|D(jy!b}R^ENya8Ckz%0z=@pDW=h8KZ->eMDt3{L!rkB
zpt*G#w`JsfT1p*3(TCJ3pIg2-=0i1K&#Xj_-XquUcF3MIty=%-Hj>4w5JTRbN1dyJ
z&9ZiL13KV*E=l}5_HgwGxZLBP%yMffbSWwK<fM@0x#t-BjBlMj;RMMFW4SCA{N6Ql
z$oA=1jw5c;kJbbv%>**7N%#GpEMIs&kfD(9$UU84?c$1Oah^0ikM(4wO6)v|9Er<X
zh*N?qZ)nd01bS0C0?c*rlr-APM>H-r{>d@hEXt3VrZ)7C0bBd7XPU=O|Mb!9Xw<zf
z;WlR(_fGcjbH`7zzU+AGb+eYIsNTf8SasutDBJkFY3h}HF^Lqi6Xh&!?8rlG*SXg}
z`FCC)^*!>|wP|j5oc)@YbNWJo>64G5@lvTT`I7VA?`6N3TH4&}S@{e+0$8rtRmte=
z8nM*rZ?*%8`<XYf>_X7DHWP)4hLFCB*#5izqA})h2ejYd2i;Y}nR~$2&W15X4b$+g
zd_U)VIM&NQUr#i;aWbAfkj0ASH91=yPFQRjH4&|-ZMH4FpSiA%<!^~7zn<|rG!a-S
zE}yb31f*Z5b(8X|Q+S^`z`_4>p2fZ6dC<hvplV%m{K9Fao{-_Ua9{rEa9OR%-`!25
z9FoW}k)e4ix7xVziKd%?9Z6iJZ(NLX2>}y0J3{xKG!xc92}aO@Yc);s1juc2)1g&?
z^^f6Rb84C5nk;_kodE#Gtkw_%SlX%uO)$79-4MPYm7QPONop$wFGmjwDMIr<WBfI6
z4+s($R4{~`ZS&Nai}QXr8NX8c4;KJXi9XdEzU0O<?!64?SZdw@ic-=`$;3ma55SJ9
znzR-5Ub+2_jdBg1s1FPO8r5P1LOx-XR-!=FZxUnqL_|sc!eRtvF^Sfsd!~Syydrh6
z&qHT*chRm8<hw3|IS2!E@vLbtN3w{|<Xrvpo)4@R*aQ~D|85q`r$Ks4`kExakVaU%
z>4)pN&G>jbOFMqE2swt#@Ef4gNw3yG({w&wD4?X-Ap|{$K^k{q*aL(gR#B1P+h{Wj
zfGgZu#Ir}#^*5f+?vJj6+9FaxgQR?ExJH3m1)~o0ifSAKSS}}S7tMT>6uyc_h7qLj
zIW0m}k8e4Wd=*%nh1F<nr`j`S<;QEH`xT^~G{)Cdz1<6(3Wg+`vFPG|982$ioNTn*
zj~M@85v#LvGN*8qu5evTA!l@fn_k~--_OHMG0WF5EQ%k~wf})?GZxZ?D91d=+jLsi
z*4gebMT#}GHzMr&<Tz>?&oq{Rd<R)Z)%9iE;;sbHi}<+!+7E0duRfXws_A`<wZ426
zUV#!6rzwaeXwbNAu^1kl2HwwoEy|V)M8CE8=CA{!Ag*CQ&MT<gFr_aBbEcn&I}K4H
ztJ-xJUKwcDSS?GO0(A6)_wv}G6DU~av5f~_e+uBA)3T3!5D(c0#p0zKE+=0$0vx^j
zX_tSB4Ck(A-LpBytMv1V=Zol<O}TE<cdNw)sKBLf^H%4RpEDWS2}XzODVZ|_DTv|d
z@YC=(`g_kQJ!g_VO5G8*jRN3lB8JAazm@=HuAEJS#r8pkruUEVQu`lG(yNgGk`5qL
z>EK2Mhe(y2dT)v;8&sIe^xoU+cNV5pXIIaho&@7{S(f%mIt`V5qs56JE6kG)eGNt|
zI>J}u2J~miuxz{ox*j~S1mXT9I@X<@pfeYd<zegkzsB1jYs_m_l|o<}iW^}Rzu{)(
zQ-d*KsDW{w>^Dm^y4~#9w$W`~zJlD}R`{FiMY5e74l9Gz1WcGR7B1_z$<T|%MUJ>-
z6<&dl-Q2Y!ki+G?bOomqFQ7PNi0#i^b<MGx1C&U%*bJnn@X;(8<ENOe-ix%-0K@VW
z<45NB^w$oK1gm>rZ-@&NXF}O1AI8#4n7M;<`A54ps0`M5e~s$$&*4&;BxJro2(8eb
z?WmIXrb+L&9f{C=cx*)K|7qI&$f{&EfN4S3B2b4g1AIq-y7_xBGPU)`mp9vDyTe66
zk329}L!Rj<KP2e@ewYrfDBsvvKatGACQM@R^EJL{2asWL&K(EOk)AN#=koc!oW;y(
zr+2aQ-6v!k!WF`3u}8*Kpx48^`0EW0e{0)g@^pHWJz%0Gz4Gt>!w{J;SvZHg4a%2%
zROE32L97~rd7q(ro-0jv%ut9UqD_mh2GSI&U)?FNo`<$vm31c>kH%5WO;cW#In&6j
z%f4ZvzT{232hh(0t7GPOQjZ2%WFtSju9Fnx(Gw>52;&j{-O$1Mh~OZWydMupy&m7o
z_$Z3Rtn;Qm^6+F}y-u>wWe+jfOifZno}NMMi&eddH%l-_6=_FV!}82_L1fW4KAeKq
z?`)etpNhqEh@O{ov$T+?=uc;DT=#msNdQDe@ZO3Izn6vdeh{i|SE1oos#nhfwX7+e
zZ(7BzB6t<zArW>IR+!?jI?OlXl)x?o)VPOzo;q3#Uuqe?b9N)}3)^KI5;8Ya{A+}A
z)bO+l-%kI!_4(wF<Gn>{|1#`IRPnYpca2*;K%-t+-zWK@PT-Jzq>&lHI2urU;dI{{
zo)8;Xl2nzMzb(&vzfl$ih%*hba)-tGu%SBqWAm30OhJ}xkuX^L`98q*i`Tc229jR2
zecMr0zbO|`uX<W?0ICbVqsbuAtp~gw^XYVG0lXxw^bTm2M)fhM(KxGNH;c6-Np|Q|
z#KdGjP7*Q(87{uA)w9S4k6YMu`zOchb$0CBda9^$5#H0aTlXnH>jgdf_36ituO$Cj
z{D1bL|8Zl@NGfq5<9dVbhRoUV12p8$vc1{ZLdV*_gYaf4VHDti#zF{>B1oY!Iawym
z@Nu65A_@uI=*--0bNqwX28>e6zrpm02Nc1(s>I71*T&0AGA5MGbD{YuzTA|>L`?1>
z%Y3rM_q-K%aT|Ig(@2YW*s*R`P0Ha!A}eFAc9tm4RK%Jd?+Qkqp&pY?FMRRr9|v<$
zX%S+gW^XJR-VZ*_`hC1Lu5$VBrC}^g|Iz?LEN&VVznoEYp~y<9$t`+y^_hCNu5+?g
zGeVM)CfEbU&7xZdpN2D&zgD(Z1gkuq_OS#SsZBsiAv@xrv;bOPty%SuqUAg>ja1iY
zvkR*NpuG$F!N1bJZxdA+x82T(pjsqvNbQ2+(B_<!x?s;H)3^_OQKH}d3QR<5GHV2=
zU;SBBT9{V|<8Y<SzA8T45?{oN<L#M=Y8EUuv>D1)pOV5O;Wq21DVP8@QXh1<DhWEd
z9IoRCad9PFL+ZBwY5#nr`QfHNC=|a}Kgqds=rQT?pbF$@k)ZSe^T(_z9yhBu{BEc@
zP!UuF-L_!#py|WA$2E<MwcFhs#JVp<wdmTR{ps>nmop`z%5ITApYK%dzJdGSbn02<
zSO$-<rzxQ#_cA@|{GwIgR_KDpIilaD7(nTN-#Y*u2StH}m)E*G3ld-*wnQKNM2#-n
zY-My8?Q%g^j?PUn1_kjw4k!USIqjcP0U|;}Pq!>iC%1d^k<;wMGBCmPEm4oQ0tIMN
zLSv}%jqnfDX31^8r@HhIhB<xD+)Z;xC$QDAEb`?F(u(7@ig!?#m90i?MI42)T-S-&
zQ9r%4L$;PtU?&`|GxU3`ByYb;p_T5+M=-Fbe@vhuFRkkLZn&iRm9?@q-|kJd9Wr}h
z6W_uzYl=Y3W$x2PsZ{vCP*SdD_RsZh2d-$+;!fJ`LdF5p9L`+%?YVC+<a9FZ4WLC~
z5o0)FPlhhUC0KV2=fBpU^2g)v^W?K%>gezXdMFVxaYUrZU1ExU`$$5rpz{pw8l+tQ
z9|P>sXV0E}35pda_>D$^^@Qd*g9#r9Xa)6~I(5K6;4_DEtQAeR-AMOuN#)XJPZ2s%
z+J$kXhs1ieUbE4Obdj8Ro&DDyM^b-|2a#af#|O_WjzPt{+jhxzwbe{wUF7tM727nB
zR#(&LiwWcKrHVu^S7p-AD_4z<p_8Lh$DsL?`}41XYfU8gjH}%?@E2l{mdqsdclkz6
zfuiSSE#7iV{ABXpW>&LcN~Jftj=SwJ;Le6%Z})0dY0i7DVI(EEs(#mMzh-<eej2Jv
z2~{Pt>sg7q`N#91o5!JjbV~IuJBWLneq%9^hy{gi=q{@7)-R<893RC0q<wi62;2AL
zEQ@9I&+uv!5VM-Y)q)#A>SnDwZcpu=LG=-Z^rx}QX}v;xgvzP%56O3<093DJ^-Vrh
z)89&l+NNSIeHOI}EVjtt3;j9=f=31<rwxx@;$CwTP`dfQHwahy!o>|}5f1*U-rt;z
z_Qaw>M?w|@jc4FvY`UOOWXA?!(mlz!v12w}6pf;&^-zvG?)4`;G%rf_c7$#>XRMmJ
zgi)oUl>l<1!{*LEr;1<5A(E*4>q4hgS8g;Y(RI2}e(8n5nmE-mlb%FHXBxig2yL*Y
zw5)bqi+xqq^+k&9u*|R@YVhhqaIw!17h8iZEq_Znf{#UW%-}QGj^}ZpY8V+l1-wsd
zjVFe_@uR^4>%4a07rSu$8<5QVApR$#SMPZCRQF3W^hEx|C(<17{b{9^OYoK+X>kY{
zT`=%!CzH&oIF`4{TAB2cG5s`XHN@BwpFU?yeKDYarOis3Dju2itAz$p@X>pfFbu*f
zLAcylHl9u4qcr{yjjXlpolt+80<ycD*xaj4Wol?L@<2DLKn_21hDG+GBXCR>{PENc
zSRW!>l-pe95n)eT4R)%Ar=gP&x7`P<<Sy`j(dAr63IhK&ojKVIixV8v>0Ki^yM^+*
zhM~G`kg@H`&uknf_Bd||U>Tp-gya@n`G!ny{O7LsD!sY6o_B~L#GCfnirb$U_EY2m
zgtPGXPL$`kgDo^D@rjp7mcO6jt|Z>?+bllCH3wkx#UpQT6j7a$K5(gX&3t!IPC&bG
z5Yp@m>_PanP7FeZP3Vih#b>^ycP0>C_u=m9;&Fiw8<6I^uTJcx2y~7{F>4tiqTu({
zAC25*H}d%<{fr6k8`9!*dKQT*`s+i<+XabV*BJk8!d7e-^<PYYR>yEQ-aF}QKgXG(
z;}TeC$8SoJ{!EMca7!*F7usCfzBjO+1T*28WYX7#`RL$zR!Uf?u}Y_>>0cGpI@4hf
zb`iHwpyHgEl;uo0`f6YwB>n(s=%qMjFrJzL;H>gHov6d=ckrGsa|VJNF6C_`A59p%
zf`-^D-C($*?)v{cZmI#+&zR|y4CQO2lGfXYi!&DH3Z%$O#NQw+^=SvZXJ)B()3r{L
zV)2xBlbZz&b%TXtrK+x)C0ezxnW`MJ?vt|-Rg#9z@eZfS;4kVUZp?Y?RSj2KQh{PK
zY|K?YqVLireL*dA>C_$G+%DViolz@l%U#pZAZb3barAta=HG1$RYaS9bk+H-{eYbr
zC+Sn^#dtg}<YJ=1#gsj3-H4{?#9>FaoSqFSN#{TM57&CoxAJS8Wj>#h9KNUD+p!22
zQk0vXUs<z;sKf~?5LGY+zP>Kl{p|MP$1{ilofh>%cDl@ZLHw$d4X1||%3)2wIl~S%
zup&goF>>F%A#{*ToxDZl?++=rS)H0SDHMTK;0EqMS|ramR%SK9tyHUONeVrV1U}$G
z*kv;$OaRF-X1jzg$+aJulxvC2*+)Pzq^GKn&A_t>xba|B)4>&L(YH@rS{_@IVVTg}
z`mmW!BGJU0PQ;{HEu8L-Ol7FszXI~~o%HoT#BL$Wr)tj3RnmV0NgW}1*To~}`zf7<
zUuG(Va5ANq7NUHw634RYRo|-O9Sexhrx^~P2wS()JYE=%{E-*BS)lY@HsSMz3Ss)D
zDrq@P<`wW-<UQ$gO#-S0-;gY!7SSpbOVZH*F4d+?N9TEm=>I=WKkzoOU2BiBAI=m?
zp0@{+h0w9nm&HW1DT;N^D@rFMW3y4jqfoy4Iph#i1W$OA>f(yTwsp-~l&Zym0uNV-
zNe}B$FO7FtoR5m`(C1bev3wzc--VA9Im&nLXDc!1$$xr6gB{9CnMr^0USGcf(p?N+
zG(0+=n@~xJ=aVa|c*-<V5=X-f3oDzvKXb6qp((=UZ)B$*iYE0g2hZ_Z{`BH-rtoj1
zW%PpKx@O1o09SD!%DgzlJSQrHLMg)`zsQwI==?R0CDu7mpBU7<C!_6yh8H0VkBgpW
zVtjKrsCk%G(#@YUch5eFDJZhvX-u|Pumrr11Q_zwBFMX=W($O}rr};St6RjrUJBXg
z0y3-#R+Jk0Tvd~%&(vm~m8Tr(S}13&0*S9j=%`Z$<KCwy5h=TOr6_q4k?~i*xCZgR
z<ba@2Aop?9Kr^I-$dL3T{ih){y@r<@tKD8)Tvh*_q}{&zgYaN!Y{)@M>PL7OM&o_t
zF;VmLH29G^!?wN=!`x7jgKHg&5c!qVAydl^5Kc%qI#cSUf4RlJ0R4qrxuE%OOSsp`
zltpL*4pn7YY+_3bPoRZD9vm2`s}lwDueBWuJxLptkp}}OtzQ-*R28u%6ch!kG?r*9
z^WeVt=Jaztqx3IW6BzDLO&B1QgvI$97Gs?nK=U{Rop<P0Fw-pN>+f#8(S7aRKHCyV
zwvaFPt|4&}Mbku2Cx8_}qlO*lECZxD;_bll+M>gjE7<}tw)J6wt6c(~5K*)UNV{64
zdo^;UZ_U55K@})#Gx=ANfHy+!#{f6hivi*Whjf;Fn~j&cJTG2C+jv!xK*|1*M=v4X
zajx<go1vbWx6yDN@m|e*yj{0#e}7v#bLLWg-+qB`OQg;jLKjA;i2Ci^{^k+<a;(DI
za)trxE^a*eKyP*!nnxdk+qJf1#$`irAX`1>eiy&PWgJpdWO%gt@Z@Np2dbp&Zf<$%
zJJvoHmhnQ_Ww(tV{;y0LHxnTHMUq#|q)#(Xs!V*4|HUZNxa!K%mwEcLPLI}q;G7(@
zf)E=8_p;W;lBgcBjUJoT?EK*hlwi<`h;3o9O;dG(DG^d)a;)dr=}}>oqG-N|Htpf;
zn6Lr0<oo*|CD8r1NMG9qd<AJA=S@n{cy1BZ`FPBQNy&p)FUp<uW;-D7Ynaw^yeOdC
z#3G3lY6hjGB3nSI9Nes^U0|rPkhv{<dhpeDu&C}h`oIus*$4B+baWqJa)n5t$`MB2
z_?mpcDZYxH&gT6)3isEdCMhXF5}8WNF5*<V2rt~$NM#JpFX@&<#EvpdcGQ>Lvy^D6
zM5L#0*}eehwkT4|Ik8j^PV4jp^U2C0JGN#0u$PL~DtNg*4CS2mcsXv6*z=`i=+`+9
zZK>Cw7+qitKbN7QX=in~?^R5*0Jq5ltdX6Q^bH?itMI6m7`SfvZrA-hQ5^O?WF6ek
zaF1RCo;IJAjP&BPaDbF|6X1DVu+`K>Gca$rfw^fp4~YclJ5rSNloT?;VMigwfTI3D
z+3G+Q*kQ#a``&hlR!~zn?Hs`QpLz)4dq9PkCO3iXU$H{tQKDj#@=3Sy7h2YdRqwt>
z7tqQ;NA2-iml-qlpT$6p&-o-z&Ju=HcY(`OM_s67OU7aJG4yP04xyidtK}zZF|Fd~
z{@bc`xj@M(yUIT>YBWf?^5O^(gZPk+FcL}jHeV^W&H1K*JRKgpL1n3yS-;|^U(h(9
zHmAN40a#Wn^b7g3vYV<e{=F^JdmosVQo2QB#&LXXM|xYTw4)4V!vJ+YMWCHI$<&mC
zIDp?V%S)$+pF9ha?tlzX-)<oVrsOh|QzgUlS;|mA7C?wP2!iyoDE%cPQB8l5zf35-
zeAYrRm|qxmrLEgxnU-eMNd#NrwK@fg-@iio(i5q;wtpcPo9z`w&A$NckQL1$$WU5l
z|A~70!|E;ROzJPu%@0sH{gbkov|bSqpLX8q(~*~b;<ayBy{NTi2{tvKHll%O*7rBq
zbf&(}tS?1E#*fA^LP?s6!tDyZ`ctAn4}4OKpUnAa->xB;%_;h^b*YwMjEcvHQiytR
z;in@bf{c9mZ;(y%GD5Fycvg!>p%*>$%CRM8ix2rBCjzK_(~=YEEDuuFb;Y~SPKYVW
z7P)Jr>wOEDYK4K{m}JUm@jlc<bk#}NB4fe3cnLISxz^0GCW`@%lSyrY=I1CS)6W1O
zLp<D!A}ITz7|E`{S5AK-O{vwj^D>e|Gse+&1M87L|9KPuR@j;^W87p*=J{uUIxY^X
zg8vz8ehEB8Llz`oX}t_5Y%FFm;C(;SQs1{s&7D*nK0Sk*zd2#vvsqjOEf1LFIub$%
z3N!iCJgZe1nPzr)q_V=2UYRolHSXpIwvQ!iGZ#j%Xtb`L16lC8hFGKe8AhrUQRKV-
zT+{y$*6&%J(^ovB0rnlfge;CF3_FDfIpw#~Td@kAzEjhW(m)ps0-%ijrV@WPjLv<R
zc{I<0kPxlLOZ^!FURNUwoNitVF2av;_+F16u4dQQ9khI{eV@msoR7@$h6<vc&d{=6
z#7TQ@!*bsU*(4cC+#MJHH`ioh$czxQA=Db~rG7yVaU@TtWQG+mlu>32@jUYybi>`{
zKiZ@U_67JSEuj%9V327!4#(e5Oz6SqB7<*#(r!Q~xAp)?()u8kV6eDI#&7?!SqONb
zMqNPzWWB(;0$m(8)VFu^Tv`KbkGs;BMt2m4NpD_9zHl0T;R&m9r=QCQA-m=)$f0tC
z&B&Undhs0Dxoib^p36);Pmx=-`4Rl%VZ22-aa>@C2YzJNKx~5h<;zgx-SUF$xfj8k
zGxTa4jRC&ND(aiZF&Qs4%YdtaPgoZob1f)REu1IKNaAM*ar<7z8izbVUX|vJ=Nc_z
z&*!$7J1OU@p^oPFxP*WE{T-w<na6s)-@h}N>Y+BOn)A@b8Tk!dI#?px{`|iB+F-$T
z<17#ANQFU?1lPA1$oi$D(-@n>qocS3BVQZ}v79o?e&$S;4V6-<U+X@XFU0gA2hA_g
zbIpk>zy3cF?kV+;lc_oP?r8=Kr0IMCeMPSE?Dw0-&-LO}<mWXKEQ`!SZ5ktEL(F#t
zTklpn^#SQZ<ToEkJpJO0fdmk~XYN0JiRF*>);L=3`oC$hy6ayeCXf71zk-N07Jr%S
zzodB@FV8iRH?ve$_ISqcZXri9E!=bFu40ey;YVy7q2+kh6r|COD}Dj!W~t2R-ii{3
z6i@hqREQ=y{~?~dp!^{WX5C#paWG$za*bs`JPX~@arbyliBaL7r~f=2An*M_y3sLH
z)E4sBS<9RF$fn!lZmwrly}P69bt8XtIp$G{5AtSF@JJS>Wom|s+^rxN7h)zqFB+w(
zypR!kgi6O++MOT}$Do=)!#*hA{Cr|=!AJRCM&f_r$dsz+-snQ4)~$mNo}RBxyKLhT
zxuQ4uNrMh<Xw<*3O(s9=pn8(`I2Q0H*E(sbk#z~hKm*+qcBCzMdnH$^M7|@6rR4sx
zcu*LAu750!_kRx@Fv>S>(C4hf(Csb$F1UGcGY!$EXPET8ZoFGGf8BqZ1YS@Jo~s7}
zyOFnF!#9d|W&9eiab83vbD|+6DLxp{MzIS03sK8jgyzUn;nJ*s5vrt#*F}bU;Z<S%
z&bP<&!aGux`!YV5$PuH3B!5xtVeAJP8hPZmZX%4a3xU}`F%NtnP7~Yvxi+z<5uN{`
zT4Li>w%VtTFgPC{OadAL3*8A88i4h+UlF-+*}@rowaz2bzi$<)j#hc_++pE153)~x
z@HXpTg+B{H7q<R+LC;BR0KODb<3Wt_(aQXCE5*;X5Ht4+M&tRgLYik|ibP;1;5}fa
zvk(_=3izh&K%L7z!F$T7yHx8k`^Ps0|Lfh9gwe`PsP<%5zenVJ#Xq080nEu{Sl`8J
zAYV_Q1e|qz8)9PNF*L$_J>^0Wq50$MMntnbz8A7I^h~gcFIr@tv0gl*AUv!Z!R}?d
z&`&`Exxd5dLGr(=H%s*r@mz=3<@sE5lUfpy>*C7CeK7JkwD9{`h1f#l{ipwXrA6Qb
zNo&SuEh~0DMe0sF7&MK8*`$2d3;^;59-izBWB{(>?;kD#Yw;5O<p$Pwe{E!+%t56q
zNjtX7C#dD#pDk=B%Ns>=mlQ^6@Nke-BcV$U%0pCpOd`I%yTJp5K<A*4TK=z(CRNGS
z|5X$C&u-I%Uoflwz{gfbG6bLY1>F0$wjps^yspg90gW}gflAw7+GIk%yhXZhHr9P!
z$cVUzTqKwh5FhE~)v|9~Y=E(Ol<&iqc=we7wvm4uEls#Fpm+mZz-Uzms9$-&FukD*
zC`u#?Dl0xTO46gL_gb%z<IxQL?-OysD;C%exjVp$8ye)jE$@I;3!A$DV-EHzC_+Yl
z^fE9#ROcBbS=C$qN%uJMcbdrRWR+b|Q1DN+xC`roLx~p;&-En8kmBd7pj{nNI|EA?
z%t|&)iM8i<vfR9Uo=aq-m--_uj}j1eh<g#Gm62UEsJ-atVs0gQ`^JBt8bvsx5#jf&
zoUl6MczSeSe!(yX0s2pk%(|7mh!)JJ6%__jujPTy0xk-S#QLud^<xiS<|$o8`wP2e
zsE!<rR#rGWV+lv2Qr7r?=rKpS-bLsYJ;oyYccmC5lVb{{$bL<#vLADvf4l|E%6DRK
zamdW=DSiHlfVi22K$%foPc+MWA8y+$_anb%{lRy#SMh(Lm-EQsp=mFr9nhS7vgOh@
z(dS~J?f$v?D(F7A4Zu^fd0hz~6dEl*a!h9bt|+oZR(y~fK2w=sgFmU>r*|3{x$iV>
zzI4%r4M;2fdoGx~?l+NhXKVda+2BTbvgC1?$u;$-lAo_nny<*+{q55CUcBL=KadI=
z(!t`Y<%~G7D``7=K)-o_h^s8wr2ESJhYMhlc`FzyRpdG!Xz)0DNvwJmJ{7jV%Z`u&
z8U^<G%MKz-RVGm3Pp%aj7vk;s_?fe+<MLwdHTj(gJks-Dc^)rv%kV}+4@bT9Eg|@c
zmmj+2EIHQ~|3N>>ES&Q|mPIL&ov$XZn(`(dbm3rcLY!cn{87tTrZTl1!o$CW!`nDI
zY>+9LsTSG@HLW~?bgxLub2;yrW{M_1M>{>?se8O#FZW9;>v^f}c^u>da3jq)+nl3U
zOpIc}&vrYj%b_vz{Pa@Rp*LZ>aEfu??ic70M?M3kHJa!wd5hPZ$CD+Z9_LGnd&Mi+
zgvouQMO662t(74I;9U$=R36y5X294A@|YWgb);Rh2<Yo5oc+Lm=pC)^+*S(dI%^Y*
z_D(OE5MV}#HJ-iqsr*b3pnKjTJ3&5^2abgiI;NYs5y@@ze^$s(guw6ytLts;Th|DY
z>--}o`dBwL5exp)*z<eEOB}CI&}8djWNVhJni9{9YY87X=Au1YnXaqh`{{$<36Qci
z0$qx?=&yc3jO&f#tjgPNFTi1nd8eIpfF0DcS~^d>9~zq@)@w$DyayVRGW<w;&X<-c
zMnV3vq*nVDLV$!sN<k#9i}CQ}PWfgHKQqJ1KUt_8WozBL8HU`O=i#sw?P6SO<cybA
z%-^MFy0x#1_<p_TK)hM<hyflx1b~M<1C$N=?kolr<06t{=VNE%hX=nxy1-&ZH)_v}
zMgaY|;m$D{#pHBx3DGSE<h{{_a^b6r0E5-+F8x1mMDHbLEjVskj^!?$^Rcg`sSvp%
zBm6x*F2xbYy+D)WINz&KY&8Ao%REori8G8=jIN>p>CAwQKy?0wg>LXKQQCdI&JzQL
zF<*JiVA!M?G_H+}dlIeSD!NkwC0ofi5{p~@K`^HwI8bh-1LNplpFGPpHnCgiWbcaQ
z>cIMmG+z&OqS3qx(F6QlUh-X+`{0bcwK>v37?1fLONJqq6q}%pWSkH~u;`RJ^XMRG
z8WwHEZ?@4sM1IIX`9tWc?EUNq_Dj74;tGEMJh`A@fQc&kMQiPJkWoj7Wy0_H+Z2fe
zt4eRL-JX3IJ>g>%NqRI|^sEc!#JYgpr&}0zdU6y8fLBMU<yi5DpAltNSlLkq?oP;^
zC}IBU^!RTEGC?=;5Fi)OgAocF|AcbHLQiLP^e?;;JyrIl_3j0)-$<!7RrPqPHPz4U
zAoZTZfL!B0i;0WoJLyc2vO4_CZtds3jo7`g`{$3%ziGe)uR&z~KJ+c^uBuMKMI^A=
zFZZ|9f-uCeBS44Tb_jd{8oIII^)qNY2+C~!%P>@rl#F$UtoB=QL3INDb}Q=y85u_A
z!XTvI>vynKKZP-nPOyJ6hT$I8??KWpfC$S8I9>sDZ`h_0;47@Rx|9ANYwsOT^&38n
zM@OMylT`MI3aMnDjFilfG9oFIk(F#m6v`-+O)4`QR>-KVipWT|keQXepX)y7D1E*?
z&+mC&uirntPR@DX@9Vzyy06<~#T;@=!J7RL$cmygf&TmWIC?s4TS)f@PJ4uC2Ix*q
z^15c8e-JZuEiHj5c+_L_^b@9lf}gMM*r>Uk;o;E5&&~kPHVV5VcTa$NF{IjKA|KvA
zRczWGsz1ncY>Ge}JTcM7`*J1MH^|kTxBd0VBm`URzFY+p8e&A}plcxl4sw<{9ce0t
zXs70lIHW^jHHI>v={&5lUC}rJ5+z}W*ljuT!FE+|VzE#kUjQPup~~B3X#EJhcCtGP
z-GK89Y^5rt(6XKjV~&mDHsyq9D4bVTQlzNk6E-C3aE5lv*GY-J?a<KLm;5-;e<$O#
z$Z;6uRL&Fq9oZ&dESPxASuK@4Bu$Nq_ZVmwUkmP`S>&?b?njPy7E|To!;=*?@&YA&
zhPqocW9A?0AawoSDjN&Nf`c{o{VU<w6%EzMLm#FFzHnNP)6NbEeTFj6u<H>xmm^|a
z>We**+L@xs+s~Sv3-+GLA%WtG1)7Uxlq{_<(FVtL&OEAupsVYr%3V0WqQwN7F$Jjz
z*~V;bT~HUCMR|}ZmdpK;XJ=aR!xS<dIp=$r=FIj%rZpiuW1-0<5NuQ{I$~x~IQ{kW
zu6YAb+0BY&TC{vTqsIm!KALlzA;>Rr0--JV%#&L%2}c)i7;L&Wn=(`NMTVfr6xg06
zD^8$>OPf!c?7MaJMdW3E@0GCaUleS=E@&@7Gj1StKa@>A-<*IXdw7mz>(}KanC~J4
zGkOfU%wVoyO31r3r^&v{V2+MJ3DXbCA_3>1IblmqeSxF(Bot$Hhb@YiX95Szpr!eU
zZ5jT$O)z5{TMM9u5Mb<hr&NwM_gnd{2H#TIoi1?Hlp~-sUGiEN)(%d=C={Jpf=s6v
zug}BcXRki;_d#whV{rl$QB{O#Y=q7jWV=Od&1iY82SMp`dcw@EFpi2cvO!MI*Sv0a
z$4k-1x_O}4#j1OXX$vmdm=*H15x#E55qrc5Mq~%YfMde`^ar9C5C8qGeJ}OmLK1{k
z_ZZ+6JRy=m%rIvN2LfLe(nH_Fb|lGt$JC)MLiWuM=z|aeN6+n?XFBNkz{e`9E6bC0
zW+!iO9t$o$VZ!<HPMay*tsq!zfBJ3=49XNsU}}_r;z~8-0sHlTK^yDQK<V=rltW4O
zb_icAm$&7C!`?x=E@-AGknS6K%XcX}m(T4RSU1gC&ekITy3xIbXKQa9-mEmDkYYfp
zD?vL;FW|8}t@D!KmEblURt0_!P!oE;Xgzd5Y6Fwf_a8sJ3#ifveG+x5Y<D$-$_F~N
zWWR;XMDV)*#p>}7fbuCnn)gfQKsivXruiw@_dZCge=1N<h?Cnc_T=XauASn2;TMNm
z8dpy#_X^t9q9Quwj11IqOx4~A$cw{anq9NeK&TE-=x?11ciA2~@|3I<3io$0C%|lo
z7_)2K%e@O-`9C4!`A7`<YvRqW94N~Meu#p}i|xo+cCr*OXzKkCQjKP&&7Om>vKrbP
z4QE$v$L`$;(&JH$HgldH(mte`HwQDgq!orW*xfLcBNcg%+4t)#^o98%gQx{#&f}Es
zJ#Ck#_2r-W!bZKsWy?=NiI;3wOoFd7ocnyZt4Z8jY9H@kXn=7+Y(_5bJ!RJ7eHjgW
zz`uG|hmzn#^;i3!-@ddw+hq%f(s&@H6n(lN`@ATq^86_s+YfnA&#@gM^9%Iy0+TLN
z1<s@~*PKRV8Y;G;>*l@=*{(E$9#DaC-L}#XAD@J2x|40z4ITIq9i3n2z)jFSMD`WT
zfLZSou8Gemt%;7^^C*W0H?_glaYm2?ev=>f=|iQ!CR-on)NvSQEavq{NTJ6GvPm3y
zv_9q!#T&UjUnk`+4emxB%}(Ue^gN@nE2IeJn9@kHq7-~O`D?|aaAN{djbkY=Ljprz
z#@@5tmN;z_Cd95!^wvRk2#(<<;C*Ay==cNYT_+UK<d|NCtQ!;E3Eg1;a0`tOpF?ix
zUZphcxP&RaIJcPy^OmfXmwA+XzrwkVu!b`5x1w!edQ_TNQ(6u9u~GYT+v-|IuRbrl
z4B4iO%Rf;ugSL-ez^A1)+m<p?%H?_>`2cdqq28W5ng(sIf({PL(Sb+1$?Z`71eunT
zpBSD>LW{U+Txbhhe5ihhn^>VoolHxJ-IhhWY~iN6q-ghIu=~BdDlJOfxYK3QJD=%{
z_pCVbvs0s%Ah2^@p6;gRni;`D6$K<lc0*tKI|4{oOPY_KC_BxDKi~daF|zewy!2UP
zo%$knReZgBG1c$8z_unhr+iO@(eFG|npn+UQSJ&R!NzF!ax(4qJI!SrA!e881PQik
zluoc*Ma4CgEr9~t17LA$+o2mi`|o*)l*8QHcU$*J9vyoHiJBcxizxY9Dy~ss=ZBZz
z0#Fe+!+Q`_dfs01e|nfj<nmoH=)0S23GS%{pE0_BPp?W+WA;^@Noa2TY9P?h3hY;B
zyBh|X297IQDWKR_H4cR}(4C;AmDQlOgJfj*WLwXE=i*)&?}|MJ?|1^QA!KlQgS2=n
zV=idLc?wKva2>PSRF)t=u6dU_4&`b*v09`T44m6ErQJ1x76dNO4pX}8-_B1KKFx|3
zsCB+jH?NV)$!C1j^QAIOVLvBegw{crj8HS&z1HaRB!j!F8T8-eX-8ByxeRV-mHDZ`
z!j>G%DBf-x1)fv;v$f85+Gd}F!AP8Cy=TrCy3g+Mb615?vn4a=1Ql=gLqoIP!Y-KN
zdKdyo{kP%xiDV$p=Z2p_FwzIzOQMkSj-H-`(c3M2BM)-7!U0Ltu=P2Js~2vKG;XiB
zQp5P%{eW;2cuZprU{Li?A<nj?9V$UpFppM)*6@JRCY-U0y;UeATZIkf(cly|KU5yJ
zIpdo$g!#yh*D6|W{1kW$@l$v+Ok*hQ-W^?j?ZqZ9m7>pq*@m9c#!R_TRMcy!Fu%oz
z%UFMMOkT*O?NNo0vs_w!$9f<12)0P<fFX{(xS>cmO3c=u&n$BJY`y37a8nIBav<jL
z!$bt)I^=}EDSH28TPE2)$4B5H9fr<IE2yemwvgq>?MBWp{82j?Cz^g%z09gTKjK3O
zv<E+M3D)=a`Uu4@GhG-3HaP*4TrBsvjvwncT{)zjbDeAS-bK5#GwDOp4{oHunPR6#
znB$;`k|vdtpUE0|L7^<zVKGP?uHaD4CS0LhU4t|uDrjYnzp=Su8}(Z?g(UxSq;{V?
zz@9x8v}p&|5Jl^)jNH7XCyaY=1Qs%;m=k@m3hA!WawQ{zY~ix#Uigt`F{Q}uCbz7Y
zE-0U?m(1gYVQERqFpox-N?rAk#7LHA?9CczRxtkx-3o86L(}~Q9+k+@$w|0mr5u@&
zGhe=d2YgOFn2tWV4mScuS&Y>Kl2~Xa?zUa6u#adhoeLi!Ua^AnynD`OL!jCRzG}An
z0bDKkWJVAtG7tB|RFDsg&mbFy6qD*QOtfr4)`;xAE40Q}G3r^|)2SF+c}%G^%gw~2
z<V^0(Ff&k~yWIjmL={AY9TGNYS;6Iuk2Yp?xYNZ;>Y9oOo-9myflRFIv^B*YfdvX#
zdCH8PdXii5t=5#}-;TVrp*``ol!k+#lp?XUr%c>8CpqiJbU{RawKuv)kT#AlOcP21
zl-o~#7H?^Nz!@YX720+YGLun`g{93b7B7y)?ihejPB{N~-25j<{hFSEiK&$(fE6cB
z4LD_81vTH?oJ*$+wV%~PCauM${w3u6&y_7f=P6EwPbIw0r}?7?oY;6Xg(CPUCpZ`#
zJ%qx%ivyPZaQgVesj?sL?%>7ke7g9HakSr9B<5Modw-YkCX;Fwe1iVOL5NjP#_l#g
zC5f|r`pb6n5W~`dX>`WPC0aHtGO&+8KBki2%6dh=7-1WtkUFe|ve+;jhmO3Z^sBj6
ztj))*QGG;BB2C)zJj-v#waJsSr{M-%wW?0IEw$v>)X(PnT@DQzfhR7*`36zi{z?*4
z<Yf2UDLv?ZA*l5ml#4B)oZtN>Ck{MEG&wRl?v(lydtLvExCcrMoi}>6Wj3DCQ@CYm
zH+~L0r`;N-L30ncgnar8gC#;A9;r%?FBbKf|G~uB&!U${oO}~Kd9NB*aXtNXRG!Ph
z(SD`J+`={B)J5BU0OMu8$9aIg-_)m6Zfz_Rsndsf#qHRRSI`d|1%CeCgBK6@KE$5A
z5R@R>MmN@0%YycuMF+v=3^UhCDK%eiLC~gKUA)5er`V8SaM0C`k^N9>Dt`au#xv`o
z{f=^;Q+oH3{I}q-6Mq*&S*TTNRdhGroZ!!}se5|5@O_!t_wi2e)Pa~68HZ`V)7w6t
zpba(~KowJYzLh#CHQ9le(6@B9dxxJ1R~9rRcsRjK9Tk}I!zs4{ExOe(yZs(hZh3-x
zuNMSnW0=G*r*DIJpz#{H^|J$C9#_MWGK+v7)UJ#M%^UCwKkcL(7Ju?`1T|m0J~@GI
zFh%2Ns)rT3w^MkzBQ*lE?ym0g`Q{>M*8J+J6Ka%d39d{9HFHnQ?wBOzoxjVRbfBs9
z04n2P(@{ggRgsT3Xyw?3?zF$aF)20D=4>NmT(FnOp@6P2R)&$f;C*aKHMvT%jvp^J
zp{m$Gl9h*k3vNEIlk5}q(eV<HJiaiHrps-vO5D23!|0~087OSO!qh>CB?3LU6%!SP
z(8n&9CKse{-m8kgo&mg*gJby%<c#n9IV^5^pt}e_TgFV@tGj{*Iet|7EVkA+6+LAF
zUd(k>DL-qNoYHgu=qINWpHB*;XWfSxrO?21wpAlF6T&9Oksg?rECihix~hezeY6xL
z9?KSz%b$Bcou@Ozz{Nz+jzh>*Tc$tgnMKX#C@$#O_Il~bB{FI_;jVrMlR)n5ReDr|
z?9r2s)UvSBaKr}<+eumzkEG;XF?DCAASjO@Y;ZyBaj#llx5*S=m{7hK=0U{O9*|nm
z$N;m5ZvR9x*puyEPBdvOqPoq-ESNNi4pi*a8{*SG@W3{e+9q$iB}@gxrIcDUUD=1`
zzk*w1&`|-=OOhBwC)IG3IcY&5WFP{(Ro!tlFm%jF8vG19=q8re<?`=`(eA=v@f|Z<
zK#8C<Tx`>EdetMhAAuuO8x8kf+%tY``-A+oi_2)N!J}B}87-?lX3U8AXVCfEkIEvS
zOKE%;WJN!2xMuGwdnX@B#5+3Qg?3B2p4SQSLffVM?~YB4Ps*v_iJUA=@*Ji++-i)Y
z;(%Ww)V#)69B>&meREB<Q#?Y(qbLv5cr|z;J9Vk+UtUy1jR_H`Hf-~df*&0DEXvEh
zz7WHc_aB@VOxo$eFC^9{&ro)wC;W(IB<joNo2UcE?k5lHeMhGSQ>-nZRnio8CVe#w
z&1{2k`q1+L=%Q|z8fyG>Qt>*3h_`gvP+fP|JxHJx<}cWiT!92!$23S_tT{BbCtue#
z?<&FdK{X*YtrLcdKuY;rZIk9pTg<!snb(Uhy4GuE#SO5MLM4^qgJq_8ePm|gX6*yk
z0%?Qqzcg8$QO}x51UAJh7FwXoX!>3^-C_5N-S?cU9G}5;0nekxZTp9g0cK%5Vz6ux
z7|OhPK4Wj`57D`@8p~jfkDF)qCZj?8N8hpdIiiwR<RSC!qiEJhIdmQ$A@dRHkZy>T
zX%CZLh2Zq=X`MU;<G8msH}`w<9{Yq+mx94c)`v|hgRvT)!72XkYy0j=?rApgA#O86
zQYsH7|4w$J)94~R#V5?*G+v@!u9d1=nH?Mn(AynQdTv@(e2|NiBrF}>Z_pPkw>@QX
z02GP{Ohmj}Y6f@<WE+5XPMeI76T^^}2x{0A_URPcvlq9=zW%aJBFsY;+JvJ@gCrw&
z7sC>tyu1uW4BtEG_P^qf)jAf`X;2|BKymTY@H@_tGb;NzH=)cka@y8noG%C~T_q0(
zQtoiSM)!E{0_eruPQLjC0VebuL!@&MhF1kLAF&9ZoU}~}(0yTqi-08VJbbSVQ_fXT
zkcy6rqVBW%N8vz$#AUdSAdvbBV9+kwjqCUVpn4J`4fEz~PCGi?sS7DYV~GlCA-y-M
z|2S0t!bj-p9+*mNFn|NS<aQ^|?|!;?A390mpzrVPNtxq*aU0&n&PCL7;tAOcnacP6
zp5RK#yG`tJFdL=NTW)oT^KIEQjrYR*VB3Mvk+EZ1O3YzM%k=eyb|<Z-)=fEY%ysvd
z5@vD#-hTXba(cVTV*{EJcDo(8@E2=CV#@YF$YM3j`ZTpv@}d)~I^h{o4(%Ezs_1(w
zHAjRGoO@Mz2(H#`xnPpvLE)nHZr6b3?vCdsR(^!?;7in%%yLEC3p-Dt)$&7;W>G<=
zBQDHCJ_}VM;$LABD4p9jJ6cEH<1^kfc2kCGCM-rz7Ztuhet**jH-LHfS|d1LK)mUY
zp|I=RH9ORFz4bewN(jhg6?vOsM}bP$;|2rdCaMHJJf^@)-RDwQu(=XfG;lW$g@H(U
z04%W#huI+~ZsY5Yc^CDde0f!C0|!KTP`zxcKdd1y*7DkZ)z3C&wm@UG`n_x7gPk{t
z`Or0Bf^|uqXBsCn3!d*dM6K_3TB7V8sm{lANI(>fMuPe5yvrhb;53VQss-H~B>Uh0
zgaZKPJ2$;N<B3Wci=GVO8zS)@4+*Nn?%PUk;)#t;VEt~d>6*71LsGgGTJ=wm<4$9w
ztOgcF`Ew!$Sre~*x+9?`%edjf8W3y9!G6F{FNEIBNIc*QDg|{gpftDzl3p|{s~+0_
zB6mJ^6jBn-fpX)IyHq%}YE{QCI0cn5pBG7dr}^;<Vk1z5ETSHH6RS&J%Dq5CRp<Kj
zk~SEfI)(U9d%bC9cH`KE+8*6M6d@4j#1;~7Y4eDbk7Z&oxF3IX0yQ06u8!2sc#O$<
zb7R6d6P-wxKM13->Y?WwBd<sfjq2a%n9LUlska5x`DEy&PlembL7g<9_>Fp{p|<-1
zA2nQa;nwDqaH~I&y}=>6)K4jB;WwueWmbI)=3Qcv&FuWt($CS^#&>hfa@WZ#-zus~
zTRA}=sB%rLROUvu_nZpfLcV`69YBwtR)WNs5^vxOc4{WvYXzCnTb_&7PYWkj_GQ@A
zL%mepfYip{=~`dPsD3@6=(7&xTu~cj<4kI(zBI;R3$Bk;f$yi^REv`Ga9xc4>Me8j
zmza0%&+`^<?b@LHQ;T*7a51h5Bp@VjuW1Q&vJCEuuX|gv&2q}>(ic+9{o`yIT6*Vo
z_V{fU(+<xH!~WHJiIh^X3F!q*N2l3|Yttux-KC8>WdyzZhhf?^c6Ttu(chpuVfTjA
zW2741Qohbtu8|0FAqpMF!XCFNbI5sOX~|zF#DVv_bGEqMqk<-GR-tf(r9bXya{XpZ
z0v)!BE{0iW-3Eyt@5{$)Jqju>xe&!w&Gqs%29BVaWWenwyVxfCYeeB9R?^eksL7*l
zZOM(@8E$b4F(cbXh=@&$?lW^>Qcsq+MHp`T5+q?&D$}S>G8J=)bo71Zjsb~y+5{PW
zr&n*@5$kDM_dQ)+X%}A=b22<xmxY34AnXZ29F*Mx8kSq%EssqF##UbSD?tai!8kmL
zW)EN%Dc=QSSF;aQRRxgc##X6_ZA2|M!4#-xM7Vk@E$Igi^=f*<BeXeLh8738u2xg4
zyrQ^a;6(k4%O%5?cu<M@mZt}&KJU4gNmO!y;sm~evQKR%mBuWFn%2~@kjt`nZ9J#y
zS!fy)1_wb|H@Q^P8~bgHqs{rO68@>+Gw|Lzal6uOxSgEpBlL#Hj@qoIy!0u*Hl(nD
zU9E83X3gR<U@VgKr$r}3Y9vGa=O}22W(PM!d(FU&UV)z1uvEG+CimHF5nQH`*=GCc
z$AudZXeq;JkIX>ci5^}uo$e&x4M?qH!S#p1tu#C;kf8p-tG+jv)pDnNvkCc5X43dy
zHDdeE#CaN4(m7~md?SC8aW1t0Qb$J_+lih8bcaF7^4*@zaYLWC6ym*3XmnEA@KR0J
z^gxu&EDWP<gBfdLH)UYj0)9@T3L6D$c7eyB_yor1j;7i>-uvTt^~{E1y?gg;pch8n
zlJtk|q750@6*Z56MKYN%`VD^UUbn71e_}CfXigJUP+mB^i(M_JY1d=Ca>tp_)pl<3
zSY7O#^pbB2`RT+dPEanOfo@PuC^XyE7kRB*P-EXO64P-s35t?X{0kTHM1$yI;Lg6n
zo2f4=ZH7`_Qd~NUCxkjJz`}i@hXZ)g5A%_G>C3mJ&yu4z+NBdL<mrts4J)!RuI#v9
z85MqCjcq#sbf!JK>XzQGCNdg)Hf96H0%I{M<?C`*fWQurI`+b*1h$WMysv#E*DR4%
zT3VYJ5{PbHNzdKmN4wSw!1)CxX-`6~S1wTr(^jamT=30ZQdLSjk}IDuh5j2TSHIn2
zs7JzvjxT9k@#SLu7N-hLE0W+`nK6$74z|^Po6M)hL)7CL=AvFo&!ol%+UV>Sm^reB
zsw4`D(DADnr96F*I!tP#_SC!{bqnCdWkqURQ_SOUcU)K9eO6e%Pz|T-!l3z$rkz*e
zu0hn?29`3`MqVa#4uYmZ4S8SZ?kFZ_bO3S#t`~JAMdz{60?saz{$O0;bA3LiNOcn`
zeh$ZftT=Kj>~I%llgA1RCLeUr*)|c#BF+(eP=%wKs3%S<l?<=+7jc~$R3}GWfA~d1
zfw0_9$=%;9+KqbHsqg)9hrMqc6(>YsDrP1Mh2i{7MVre^{+;eRPI21lF1FZ0hj;j7
zJ+^BVyxa?*z(}cIP$zrORph+w8f$Tp-TTq}DPPC{1F6$RS!#M8<E;E6UjlWLpLBg<
zeAMaYfBU=H<O$3FS|@-9%6f45!irhpqVf4-nI6T@#2-J=A#Z?C;hJF8O0EmrJ-4bB
zLzcVL1P1{*D}i*`2)7M1$)R%<BPC;{7v)%QQzxI5Jbx@W-75V6v!F}UqYt~Uwt4Mt
z<q30Kz9yS~Mu)uzk5PgK!{Y%Q@+eQ8ak}GN)DIqX#z>rUIe&p!gNjsxTprRNmZz;1
z*p&_N5LfO5&OH}pQw~nNR;Z`S(&O!)^1i5BTs-DxE|Ng&cHGBQV%dZ)-rF))Iw{3M
za*1@e=ys2Gu9x=AY#kH@SCUct=Q=MIeZ;2IC95et=d-XFtt^`vO2|?(Ab%3!Df;8N
zKpln19)0I`>Ita53ZCpL23_)k7JV)~Q8Kndn?$E}_x+Q!VDN<xUyxv-<!B`G+L~P8
zU*;Kv)%Rj$xgn3;SFw+J+}~-cQ-kV6cU_2&_PZSfB}EGhYkBco;*(#SNuF;Bjy#6Z
zq6!x_9O^8wDVR+ZNekMdQT6_j#Q%UFX>@1O4*T;fg*jKY9XNq0Ez-!o?CpMshL5}P
zL9Pl$s*HRRwMD}Q*x>3*>F-B1wxE4?lh3Is+^ARK7KV!G2F6ydy)$__$&V-K)8f2^
z=}@om|3p^FDh;3Z_+vLV1A#|gD<dn#x`9LIPg_V^WYj#lDtJ7g=F_qGqR*06TQlM}
zKm27|MQKSy_X?UjaR%Rje3ynV=X)0eC4;QyvzX+wrx-rXDTof`HsmCfE4%$`dmC_5
zbk`KNJ$<sXPC9hu%bu{A#<oI!$DrzVxwMA!kp)z@Ht;PE3w@@!;UhVe8=5^<KHRZ!
zVHJF+a<Q!oL~b*I%SGeXOyECuh6EB}$QPS(K*q|YgI?eLdw|Y>=1CfA^OCcj9O|m<
zL8s^)_k1NZJ!0dX5y^t2q=;7GU)VyVA%&h2Rv$foPBr%LK}<M9Oz-mQ>4og?3L8Sb
z6qwjcmB^hM3Y%hX+w0Nrc&8E-10~Aou)jEj&R6}1U5RkwId_EiTG<iX?i6eO15tB|
zy<Vrkv@V=+!(g-XI{3M}ChBCXG^4KG*1(r$v6g5}y~z*wxgqOUg;LGR^m7kp)uzj?
zG(xUxZ%kE+kABCxtyJLG7mot-;-xsJ&AT6Ns0?6NR@ROQ^>!SM5fs&0ex~oWe|5b5
zklw;PsD3)t+XQn5;Qi@%Cki?7omV_PkF27;!2Y|v5G1DJ?g`_M)Z(|U&Vtc@c?;8Q
zwfDB?d%+E-ThG4@Sm_+YXql4)7poNuePGnItg;>D^YWHR)eg)PEZ?dVYwSI@i=a`1
z4<HZYu%Z0_#M=wn;MWza_q#yDQz#E_z|YDdBHM&|d}Uj|%IwpDkNZg{wz##e&Pt4z
zvTv{*yzO_=;bvV}nbe(?`Cl`Jom!Sp@0JO4f0=&m^g#CBRU*j;(SW?&lk%Z|2|c0{
z4hv&ef{rY3k3U+6aOf5Z-jLkZ&C2prC?WD__jm;(y-sb-a^Uff8Lm{A9~GV|Togay
zOm^-<A5XhL+yGuh07l0lD0?6IH#QLrPr^M_+8%#jg6TnrAQ!JPQAv(UnQI)ql_cu1
zavg^MLixQP;zyYFOkhhxerUg4J8U_RbQ=HvWZ%8q@l2w$b8}m|V?$N>F8pS593>sO
zlAsS;@={k}_pV~~+lO?2^sC$<B6A<IlRoDz-~Ii5BLAvqV$gkK(J$FYQ<_sQ6RR0W
z;;xe(8%$X;Pxl()W-koevOkqY_1i+nl;MTver_Ct1akwpj07|%@g&|mwhoo}kJvL5
zOT&h%wskfsY@;8B_>J0rzng6tO0}D0^+Q$;myoY*kxv;~&IQnQQ|n~u|KlSC>WO$8
zB4_sh@(`fnzXLGVVEEs>8ta8pn&Z9Hs0;f4DEoiG3k2Ns|MGjbU?UI|T1hc)LZ8|H
z?WUl!?*ASr1Q1s^C|!)&_DBEe4*B0~e{EB6B7n#>|CdPk|C2mi-X*Jx9;-D=HLFW%
zvI*>4*A(J^G0e6XB!}w~5B?UFHPG=~b?{V|;p)P}`F@$z!Mlc}ad3)-Cv)G9qyIai
z2CD%_rV%nP>))cTKziOLh~-wPG>CP&v6ekC?)#tU<3f$$T-BRiB8jLZCpoQVVv@kj
zv-Uy-ih;3&Vz|7@X}Y&dE;S=zF8P0Ac5d}&JH?yqi-j^P!*3sHDAL^96t=hV(Mj%_
z$sGw7mA<gWoJiWLv2&|WAuGRTifU+x>}{ow>dDC#_d~xYRqmDDNW;y0pJQfq+F*6Y
zYQ*Jvc2nU@<mh*|!t>_`n`hm&jSaj1ENohMHm2aYfrK=h25ua#KBLD+LW=Qcr?xgy
zA#>d&rEynEcnc|poSF@5bcl51os@9YhYugVD1S`G{|!C3?gRa4xT`(5)%Z00EjgnE
zZc|KT>JLa88qEx&VZ)*y$B-p@)ieuej}2P%Q?7p$UVy%LpaUpc1N8)9-Q)Y%x<tCS
z2}u&(p}qsx7k4L-hr3NPd_<V$9}oM`QjhO#xE{V09!M2r>(X47%8@1fjxDa_KKH4*
zxw*NivUgJRZTk2<uLB+4A0J3cD(%qbaFqw^Swc>RaXs#9ZAN%J{u(KhfxSjPoL8v-
z_U-KVzRR+IKTALay&~-YsLKT&wEs8QH@Ix*GAS2lf~myannOGP!WD7D>z1k7&(_sL
z!47!9J)+3kGN-&-jYd&xZfc=6=RJ1a@7A7F;-!AheSA{fcrfR)HwFH$xIF;2Gsa#5
zy{ebVjA`62ReB=ZEF>gkUc#RHUjSs&gfGM!?fZ6{>}uu+qY_~wOLEwfipO}0$kD+n
z=Uy1So!W(Qy_K_j69@j!f5D^9kL#nNR8>}fqHu>X?8*wyDZ(<;0}6l;CaGDdpL$|-
z0KEn_dEtNa41yb*PS!nL+D9Gasg};V`3^7s4cSb<3d@`u&{FS#Etq4o?;$=UF1_|4
z^j!RWc4<m#C#m$M4h=iPvlwRv>hVGC+GPZ0upYLAS2S!30Y!h|pg}iYJ6y3O>1&X8
z868>?*Rq8iMjX;?K%c);kk4(cX*u!NUJ392xUZJK3VMva2Dw#~m8*^xrBz)$>_}i1
zPwXBNk~m4F3_K}|$oHLl#T5_~B)Z_g3iq>KT`ExPnDWu#{l}{OoV9zqL>w%1E-_lL
z6Tio82ej?Mv-^%nPtTg#<Zbf*K>;8oz5#yqv_m?Cxc^OTAUhkh4Dg7*N<zY_wu!KD
zbp_lUGj9ttX6>d?FWZ*GLxgz)?jAY$++FYXc*1+JIN&78H)4s$0THQy2rWy+^xFi|
znXSz89EuELe=7F}kdh_%eV=u|Dq+3&NPwrYO(zYw1<Q_>%G=LFl({NRHkMI8E0{JG
z%EQCc*ZjD39a+O^D>u1z*YwmZwT&<%1OnJY{2<FsF}YT-_FXKt#1u1di;M8pCXQPE
zHoA=puX$yq*UMKSKs?pgkDfr-aYoRzY6>7|E+A+M4)egq5K8RXufYLp6%X6j^G6z8
z^0D@Ne{;w>)B|snC?hmt(6?Jv;v4gtg>Mwdu(p(LT_eYUZ2Y|G!<1HUm;4RuSJ_2c
zE#nvz6qE#0A;c2|ZbzLW>=!pg321e+PQMF)_yC9RZF>+-O?bNy74GMR{yV8jto@et
z@HAiPvT?$=CCoFo^$@Y&paC)#SYTBRTitOFIK;PW8&UqI42*`VZ{4_g!NTI%*z#~N
zfqbwRY&%&W<=mXun^Cpf(*(wM24EWOI@2TJN1R{_m?;X5{X=4)OkI*>uXp(!flW$0
z0h^n)K662co=fud3`)ZNe#@T^I7uLYQ>gwL$R}YD)dI_O1oBG+vS`UTVZlGoz_yh*
zg161-+ka8myW)35Soj{WjMn|Ia2_H?;$>jfWt#>(hI*;tqH0_(Edm5d@^7rchx?vE
zx7Yq6qzS5x?il*79Nk%@*Uo}=tY7UIt+{<fcq7qH{k4cXAIP(;<?Ro|g~{MmdrNr2
zNnz<B4&cp)F*!uOUwaE4fI^B=8+{EL$(9Bi{lI~t>V|g0ve-O8+Q&kV%W5E7r-4&L
zuM{Xh_zPnE9!e@PV1$la8xi&t9}n9lYu{0ixR?Q!NPV65b&dJfZ8?6GwAr&pYdUs^
zxvlN1)r~{<h)b(qgMD~e2M@v~<CS1B?G}}*L>XI;G@+yLySL3l5rpUXfUIg_v(?Xl
z9n_WJtJ79dC-F?P9>+k*e59T}Y&$_f0=+&Exr@cb88)4KAC?HaUayOP4fZewrt;ip
zJfI(rC>lU^j_mhJI7*5`#j)Ss>{GPYHVfJBLja)?Ok+sYM<0a0+B?`8I#D`0y88C)
zEh|$8V_nwRGzJ5O?dWdPP_v$%9&_ON`u#<!e<)Hy4LG~yYsp7M=Qv=%r;~#YR0%)?
z=zJM;2G19mRF2wRqY=6xY9u<B9Mf;EQ6`?BG@Vph!!2|E{Y~w2X%1_ACJrRDr~T!C
z)*ZM5J2l+lAWL}f)UkN&*A-D1H_yg(dz1zvGsUdsNMwf;fJ>>+jR&#AFVMoFrq8QF
z@h{an2aof99rn%OAIs1F&6}axfZAsgrX7R@O4tDUO^;I55CGxa;6*$lvcvKHwM&yH
zCiT5tju(oQ6Kw;+p)LqP2w}5-$?Yv*XA%0#H;G%bL&ocgMoLm1#q14}KRiOOQ!<fX
z*3KfrNszQ#;BASx+Hr!piGKiS&2GDX2fQfQgAbjV`%UcYq-(7gWeL!tja??sLh7eO
zj)<lPw4n?MYa}2$UI7~gzul%tz+e`GZ`>(h6M5e?G60sm0D<(bH<g13f7_7?kErKu
zL$+#x@(`=vz2#7q$#X3r*wYHHYzg%K!wBf1M8LV^Rw6_=4^VzLcB1_m@d5a97mkqs
z=E&b4a39GJ6(=6ePWJPoU=S0?e9uUP-ky`Kt3KetRm6uz9q{~Qq8;b&$9{jn=^_<t
z+qxgASa`D;Jl69CQ4046sJ_*wne7jTe&d=Sex(_ao<&XAMLZR(?}lx_bCW$IcsUlm
z|8k@;D{wBU$9zpKEFrB|kEe(nQtp(Q$9})z&j+pJK!$^Q=0x8E%L;t8;F*jM00#EG
z0B06-G1K7#(c$4IVEn?BC6}dhuKRW29~B}a{8<Ze5H`)Trni@C2ay2S{ebi<c_WM}
zu;uSS!mXWczW>-VI9E&GOHv#_5}XvPV+h##%aybOE!3uUfanDL^-?y?Wfw|$@Lq}B
z{eNIBB9r4Gi1#G5`S}&xB6L(!{Ie=qT%h~`BKEFu0SO*g=V-w`?m~!T8oE$U;O<6R
zUr|g#*`4Ky#1e=LvKh?||Je^eKefy!u=}EC9wyO(YpcMD`9yS{`#?xV1doU!iN@vv
zMce)JVu@VA-9_eJvrbvqQkr-?2#QyWeX)WMsCJR8`uX|kZxB9@@pt8eDNJ_=`^V8D
z)nOE(jMTk0n5dL@2Xr^$Ic-NeHtm{P72o-;8qE%mtV~hyKTPSo#l$1FhUII7bxG3v
zbfsjY*HEFv8Bj&&C!xoHA+{En1j4S8JwdaP9r;3we;ou!Ej}*4iuMY<OJN~0G(s27
zgZ$8Khwh+X80$W|@eeC2!KS;S){GP%SO~Q*j+INWggOnb&#*Mg<B`?MrB)VuhGvUT
z1axntHru{M)bY7<&4w2aCx1(=5th`(<jk(>!;Qao)gO-YRGS>5I&ZhD_VN(Xa;`H!
zxb@BqhJ5N2+v3U20iD-LhWQJB?48gq+y#v5t45cl4x#A6!;}P^2;D%&kA(&A#++6H
z-+Lob@|bWxSxC-58B?e){6pm9ytO032GvddDRYmVYX)Y%RuaA|F#=xSG@OQy3FdBt
zW28uQj0UvXr2Sl=w>RnK%a`X1Y-N7a0vs>e%00yrVq#+b%8}2<=*Kk*P35DmoFxG8
zmudoiwTVFYb&tcV2WG-Etbm_x-Tq1R__6WIz+jhJ_S<m-v-u%g1Lr8$*+l~1f#y~t
zZRVGVN&R)`_VqsTRMFNk5QWqF&)3XODspwgVzRaaIQ07Q2EZ?L5;5)(f^)PgL`6&#
z7y@88aD1OLl>e~XT^}ae6_VZ|qO*gqg5H3rHL3Zcfg8T(lzJm>kizw;)tJbLadv4I
zF6VF%cKn<8=z|@FNzh=Vz}=8p-;BOtt+9zEB4FC|!0X)^x^#dEN*;jm#WvyH|5$*N
z(JZ4|MCa0&BuSiA(DKfSx@|-@rw)Gz2IS4RHegiT(3VocvWb%q-4Bt(x$_PEw*G&B
z!8%|f)8?wQZFtp6Dkk(Boen(nCLqxa>|oCzUJlNYYR1J5>g?4cVWndJn??jIRg*Sf
z%;(|d)g%3pGF9)YQhdHXOOMD{xCC%1=q(kiP{c(>T{(HmPPBfPLP=5_gKz%1G`ywx
z0}tWdmAt|I4wb4+CPDp?;o$>%mwvgXU1K1=+2|<nz<Zk`h{=_>gZBrq#5pswutd%7
zytR8gB63yV)Un1z*y>*jHC_<ND3{~1|GB0^ECeMZ{3O$t-ifct(Oq>=h~9IzKp|r9
zkwtpg1Uk*oK489HlH@e%{z&kb=4C75rq&_BfwA&Tr}?z*c~KJ93cJR6^H<mL1XFhd
z;Tsxz$pDUo(RtElkryr$b5+wPqb1|&2CtJedj|F7=QG$#;g{MGjdeAIM(I2Cpti%R
zS$Ac|u6cr>xsDrUaK3&;5^v!NBzoF>nGU!gA!-34<ND3=2B7@oQ-R^7C)00<RC%h+
zJX}aA9ORDIYFy4ffCom%0%zfQ!`N81()3%Dk>nqaBBL`~k$R-*$-6k6H@g4ZK^0<L
z%vPHT99`or4g|(JkS$E}wCR(?^@+S%q<Kd(o-#UVb_t<E-3s@!hT&zTyX%->u7C47
zPda$n;L*I?tr-5fL&Z4*8!@rmgqOw-0rj*yFARf~#Zr+jizf<zwq`B%&UKy?B47C8
zNEK>D*j_f{_;TLYR_lpBI2WhPl1NNzygi>tiVf@^s)V)hU&b4gdc=#~U+V=Rm1byF
z{{&RU7LzS#^j&WtiL)5&aAz4Linf_xZAO0VSWWECKZw92Vp_B(A_ThX^$RP;n0|wc
zQWg&1E?q^QH5I8{SY>tVh|osq>f&!|)oRnRZzO(*;}@y*z@6!gDxB-^FqJ}?3IZeZ
zWnv4Iy3<o}$I0R>#w#X@FLk)B<#m363)LHDyBi33WnXQEyw&;p8k=1sYeqgxm^Kmh
z!7@!{5u0FJJ*!UP9}AxxiA`%_YbA4i#mY5KQk*Ka?JF@D&4R4qgg)L~QN*1O8oaXN
zBeD2tM&EGw=u1YXL&en8hBG|fuJrAM-!Je|$2|*(<L>rwOU?1fa3)%GMAySO3o~Bn
ztS0_0y61xB*_zeE9@;Ln&(^Xgyr3JJ@wy?9v~W-T;b)#|X(D-U&PhEp`TrGW_*qFM
zneOg-h7f-$%0s#2Q>DY6nkprxy?UlZ{FelScXkV7iM*>&gZPIR^KUVX+h9&K#~qyf
z<LR#<H8~AC^WKH#r=QH#?UPxrCP<3&rG>wOBL<ip0?0IrrE-m7rK(f$+J0?sc5f5t
z>iIeHMxvWQskLPYL@EOT2)<BtKrfwF;$owVf9~zeL=(Ri(gUW~a-5Mic#3(|Xp=v0
zQpHCq?Vk6%p~n;oi+?CWSn4o!WznW)^AGKBjyI<W8m<9OZvc+RfP{%VJi+ISO{2}b
zc=2N1Wl_MHV1@4wU0<TpgI5HEmEe~<!m+s|agd<KRV!2w-}SdDWvqNqS-rSr3dyDP
z<V=%49}D)^U<|3ui9}*PXgz@O0oa+tmDknzZ@739Ajjo{Vs5akNPh^3lOi{fIVK`4
z9s5fQgrKzkxu=?)Z>8X@l*KE(Kbq+Gf_K%~tV=33FCBk<5@}0>`Qs?uqP>0-#ZA$q
zlTwLSzmc#Ow>_MkKX{c$W`ax*w&)Z%_=`lQhJvtbbwyB?;-mS8L4(9^#?6CUG~lUr
z<VM3M4MVBg*J10QTtmhlvR@e2H!Dq?uUZKZ;!cA^h1#|%BOmTDU3iVK!OiV+M`N$r
z*WA4Kk4;QzvMpXHvgk~mozm1ZBrJ=K08advrPl}9V;m`YReyNKF%Zf{R#sLuARAfm
zK{EViz{t;uvFW~v&scLQ5K@iQK?`?R_b4kX&&8O)IeMa^ri7SZb6`YR44$RejBmY|
zzlnML!=}I`FT*O+@@MGL_$AV6?Ip~WPsMR({%~k~C22K7LFVMtg5f>K&uv7)9fN&t
zJ+0FTASWkdb1Uq(9qbBOzSKG2%HTGz?+%_C{Ix@64xM(2`P`e<7!kzU9QSiL=;hhB
zu}ao?WkkruV8^y>d3dvbnbIV!ugYQ6VKTdUxNkvs&!kn~y(F0?BH=U1bbVa<7CLJF
zq~G9Wg_JkipX(f(C2IfmKFlq);90=nCf8SSy%T$5*)86%<2QSTbmU|AVys?TMTzkr
z%G{NQ{I^W;&ATRBwi2`V^vWPSMl8nSuvfl2*bV~biUtRmmS0Iq_2GT$OVpL_#@DZW
zCT2>uo)S`A$8bsqsGlD@J}GhHgJ|6pkx_~uyqNOQKzZxrIBdtkyqlDBy}OIuB`%iX
z*#NR}%|g~w{GfRC)+yfK!U^Qk1BSIK)5m*V^`y;|;t-%`+HYbQ{}4E`O_7R_*BdCt
zPTq?yYMSZP3BoV@U)@+K0LdVoj}GWdp;K&#V=2L4e&0c7C;4zQA#t+xI9t&}K{zTX
z6f^uUK>!J|1xfBTwxvPBltov$$?}!9>n56U8_9?LYYDSIJhcu697^{znCGUWbvIHE
zXqA~#lf;2zq@m5mKWFw&kbff~IRQKb2FJ!o1iL&=71Ypklp2OmV6eHGVZ}Vx+hRKQ
z_2&tFxwdO^>&0Xp1O)LHvIlHF3SmKP)kqgU9G;O!DsAVM9&<VkyqU8vhA8y#d;6__
z{`<iJvIi-Xqk5=Vw7?~#!w~1b*>zV<W0Fe!Ho`0ZxqlMUr;tI|TYV-1Un&$3dInxu
zp8w+3@PE3wzwvPs)o`NIk&mV>fV*B{Kb`5NmgB6pK_WKoOi)DG9|zz!wtjy|ehO&Y
zpWB`Q4};TiUE6g5biplOVhv%Tc*i>bVeKjUGi<4nvn}W>-T8v>$rbmimz{L`x}z5g
z%iU+0p36MQ4!AP9h3UX9%I4&+qM>EX1SdWVb~j;$kcI8U=xR?4>L4fIt(9l@f5~x}
zzLI4$Ty31&x>NIK`+39nBd<(Ar}GIVkE_cpj+h3g#!c@0p2Lh(Yc>kV%Z})A0cLM;
z2206ab#;|2+j2xxuth#Qrq7Tu@_CK>eu;|}cH2y;0@gyLwXIAk8f8$Q>Y!GMWmAs5
zx{%315HQ>fX*<TJyu3WIQ^1#sLeVHCYH<XG{Oc<wTe0zt!UoHfNU3jNNj!zbJlO~9
zDi|ml-0cx%h)pZ&-hUrJgunBpmuQ)By9W-MhIw5MR%MUM^AeFg{M0Qgkym;^^#11d
zRoClmHJwj6KD$U?9chxfv{KM9SZuR=@Z*fgzRU{&B7D5diHF3*ns20YNVl=1*WdPx
zKv=^<936}II|tIRX>6*`84#m2?XSbLOv8!z#@X5%B)^kL^cqI|;ky`chXE$~b69GA
z?fe#{-0zRx)X3#7f1Y-Xk8=y?9)mNy@kvqqtB=Ezgb`C60(nRn@tp_K#rGo5zWP(d
z7>z+-KmWYkg$2ZiLbSLu-($ml)k*_QB-XU0J$3l-H0B0@PqEXOL@$jIX*T^r2E6z<
zLlttcU7ZSjUCUKF20oi8Z&KZI7f~yIMMd$V+s?-DJ%|e~fii31np$yRWbP~3;_`Yb
z+`%oOh5RQXMyg|==;F$jGOFeY6_;SB6!Qk%zlxrU7r^}#HqaEUN%B%lW13rv?`*vO
z8eb*7!dE-G?O><e)j6TF2MGkahh5rQzpLC_I`!&MC({A_Z?+1ulH4t3n%wFnB*&6N
z5w}zRgp5&_)4<Oi$D79*CsuoEw8sa3;_L0opu27`;@>((I+8k9#oJZSzpuCYLy>jF
zcC;mk3W_|=Sjg=sc&!auP}AD^oUCDr{RLe10UT0T(r`OiSDV3V<-ClT*tvylUBo(@
zz*vizU06o0$Su;5K(Y@jo*jd?9U-_I|NauB0%gXrM=XZ7y)1e$M2wdI&h&0#6KY!A
z5YxfAqr~1GY!vP^WC%l@7nM<Btc=0T;n0a(6fH71dQ_u|8OC3b`dBkxg!GXkXO=!h
zvg7dsulIpmONAGnpF<6eGRFEzrGIo93V@^W8pXiVM^K&c3TN#QK2b7~x*-Lt9;H|*
zOCadlD&?fjO=NHOsQ27{TZhL>18FsDujPt+$5LTY<AlM}^I0EYvyhb{mEP#pxF0?D
zq#?S9DrjP2f>|5Wr_?VI`-yHX^F<WoA%S2w1GdFTS+`5iq6r|LiWlpuUs~3Odzcr^
za)95V^q#b1(TEXHmiz>pbTJ<)bw^{<?gx1*{H=g+`ta_&aDBo%C<SZvM+{4n##=9W
zq5KT2LM*;OJe77aKhnd~nhC6%PK0B7+h2S2#E!v2g)Bpa1Qk8X`H>eGZO4#nq4GVy
zg!yhn_+%`kmcsjbdrczKmM*6aNqz3xKd>o^k%%#(ecAz8!kP{0#Khfkw%P+-^`kY4
zLJD%o$2|ludfLp7VsWe(nJ$@Yf`1JJ@0u8=giG%w(nll0!_D-Q{@x~7H`EERvPTb3
z;s&|+_S<e`u0t<Z&z$tcdO$vFVK_w;ahMscnPbBleOg@18N?_~-UONa10V@&4Ib&#
z$+xft;H-16&Z&<>-u5t~1p_?4BObwS<ABoB`CF<O#{I5HPE4CsBW0yMn^eB=AG9e|
z#|=?YSuZ*9<6nezB8yC$QBzet!dSO--jj0WN4uT~@84|4Eu>SD#B9vp_1IZ0eGk<g
zqOZFKs3q)1au<LCxDp3kH8!6TV8<F|!N<qb_^d7e{#whDB<<<g4fi{qPhB*&!T<QT
zb71rUxc@eR`?VC~wX^pyxTa<npON*10xN`zrj}fQsRmVuL`+kwuG{&0P^Qh#Wwem!
z3L!K#oM-F!=AU!z`@8ac57z4G@dx1b4$bDnC_97`htkbtR_c)E<}LX6UJ$-49LPiQ
z5T=iQfTxSjTECoXzL}@1haD9KaQT!sG}DN+9b^i~=gw8y5j=}j&e|on!p6<+yu7?=
znP*|Eo+TRq1)F@+Q25he5Fh2Krq8l!64jG8v_gLlrAmkP2D`}#&?lbXke1}DO?02v
zw+)eNodd4@Sa8)63iVL-Fb!`hM-|g*Oyovs4ZkF9Pu>$PZYtBUky1hr@P+bi$STa)
zJ<_2fR(;fYQDilelZ|{5Y$Mq#aMS!(IyNUSjOwwHkS;^Q5ylP|Au<`=!p_{DxvuTp
z@^b!d$3}dMg9DwJ&hqEk(tIh3uf$6H?{qEX4x!SR`4CaW47PfF*xDYps-?@eDCe^8
zNFdXPRxSQY0YZW#*}nt^_nT_Jp&L#4II6puz{#}%;0%fCW)OI%3{q)}tSl_2CU%}U
z^xTEsWvnU?#&Cn4r@tyw1qYl+!)OQWGpv%zR8>{4=a)9&nHUA<)+SE((oVNtB}U~~
zCer3EJ69vVj;##@e#HHvFcDy^xa~sb`Y|`?va)B%0D`!9^JPTWkCaoi;}f?oUt-;|
z1y5)(OgGu{xfSNsVU>MB#B92{DZ=DfQ8pqJ6s;NeofRs4cBA@YYD;mGOzP`aC}5Yw
z14=e=rx^kssLkFSi&$ot(u;`S9nzC0iT#_s<BNdXrrTWvP<V#B0wJB2ong8^RFhD2
z*VKx<@8mEjJE`T+n@UiF^%>H5Nlc%w$%v2~8_T%HVADo|ll2jWnWM`h(81u(4j=Ej
z388wI*6Dcd5?;oZrJJs4J}zG}?g!oculj8RSn^Wz3FU4j_Tc3X>BHuV_F=}0%J-o}
z3E>jG38ym3A8XT_t>6mhE7QTw(+v1|BnD2}N`k+=>FS^C5ndwZ0l&ZW@(A8o=rPE#
zS1giT`X*(^g^fprNrh&LNrXr;vR|8A6Wzh_E<gtV5SYMY+fMywG0eJT8R+lZ*qNKf
znwgB_dn;BVxWS~NE{1aHa8zpI-wS%0?pW-c?x6}BZQJ4f{scV<@4|nWf(lqYU0f7_
zcfJF?D08`a<UO$z0I<?)^F(DJ$18GZS}R>*+IP10>@9oubJy!7|5ljmRS-C1TY)t$
z@Uh&4{vxpST5h+~QL2%XK3t%SuHK(@V0XpgciRpgA}~D>5HLf){KrJgv+yihA_?#B
zqv$^Z6gz}mn_YOBG$M@wtITqCEeEu=1a@7e9sYKi0^eIwRBQ@f<Q_&?qZcgA2kRe7
zh!Y1;Gvg_theQ?gn-RA`8xR$0{Qy>b0A>U#oF>_fS5V>aZvz5R;xfexclcq09^3iX
z5R7UR(mW%U`PhWiVEaz*@&fP(ray}YdS_nd?b7VpCIWshCCbUY-Y?7!k87c}LU9*!
zZUjSZ40bZ_7cp!v*sf1V;Q=gpfS7954WIC$p@c-kRuAZiV@IMv)2fc(jlxbwMV+2=
zXmuM#CfJSamGyBS@3EWHZhMIS2z7WI7{R5sHbS$OKlET~_O=Uyef<cDbIn@@2T`6N
z^lZrl7k7uGCjMm^D8tbmo&jaaj73{Q$q@|LXJA-|4!5iL-=tE`scUIFe$59gfsLwp
z@g#tle+4B9xk57C!V?Tz0<WogyB<QQMVyCQf)(b|^$A75j~rc#F6XX)1g7GHdcT^6
zWBD7x@9?zX3&3hUyTg?7$`S&rfPjFo%e#4q`4DP8f9wo7l{U?tS0a6cvTz@2Sn!Z=
z1O<<pCnPr$Ux{C8K*Q+6eeS*QB&$_kCn;*>5;-6n#mmz(diZ`+EiJai1-}i}3ZgBq
zP~s7`RAS};Vt`)(Z7Bv5+38I?no|E+Pk``8A#x!3vE-o;{EzOQ0o#<da-0_nX)NTU
z%7Ksi`}ue_QX4HD*9B#yS_x3802heLIO3tfaR626ZFH*qVEPO&eyDkF)xYx`hf(19
zmPd~REz*Fxx3241y@jGEvXV9N`M=j+_i{je2Vli{(oPirYQ6-lmh^cz5<S(h%jSHk
z!9iY7mVY>79TnCVX@IIw(yb_~4+yX7PO$Hf)-TCu*@TSXBgw|$H7Quz9Pzn)$Po;F
z{?v%4M*LPl%cXDJNcmO5M}>~vcK?JHyA8C!mDht0kQS2O@}^&E>||1`^A+9K*Di>6
zd2XzO`JA==H1?vbxW2K8z=6o1coz@`RIG_{>}>g4Nf8OCjs-wTt^D_bGa4^Xif(Ib
zfeD?aRdcz;%a*zhjoPQO{I95Lmn20lUeN|m17y_n=;z~WWidPh6UdG@IU3*>d!&!g
zG4Cfsw~tS~rE<Sk-t<{|@n=^1^;Q~|m)ObKIS`>Nvo8rWHI=9zjxPv6bej37{BIDq
z0Ek1@lX6s2p@DFLj6Do1NkmQM0ix|N9XCD;j<28bm>@g<9R}|OZ<kpbw}&|>PanSN
z<sS6q3_Owc?3EJE)Zz+8cZ&r~Od5o1dLkyu9tVo^7mm4&$sI9<<!P)vzk|GG0~{&2
zHNF3aLkht$Vt!o#MjU$hi?&9!nkU6PI6s@;qz$h(_zt&8M&-Z-Da@IzFl(ePNqRcP
z>+`oP7@Bp~ZMv}@rc{adcu{%Gy&o*uNL6~o7*}?ODGA*#D!hfk_4pg+TOcC#fWW7%
z2kChJ!!dZ?!)5~mq!t##q@QaGO&-fl_ApW4^VtD(<CXMcbIlnSz=DBynySq56tjry
zuaLNCtGpi%WUCio0*dOG`)onIW6@;HR4K2ta14yd8%$AiyYi^d?BHVt-_;y+acm|J
zdzVOyTf|PNdJ}@PP;f1iq8BfJU8gy=Vv*tqINuXu0(tMxI9AV>2i8#L1OoMoFKjjv
zhCUpCEjwp2zJRL(6E6meCHDJB&2`-zgz*|`!k^VV7V7Qc3bUvE(zAKyFau6m&84N@
zd9>Jnch(kzbhy28TLnjUGG7+l-KL^uo|c}Yq~UggQ;7Q-jMaStyzi8WO>hQ1n)L;A
z#NDr*1E8;=1v;s1(R5}Vuu~`5-bsNS5pj-+j{d1Rmj9b|2ad<jR~NswhRMDG7bUZ<
zkh}i8$a+=F=tB;b`}obljjVH6ojY&fIBfmir0Dw`@wPBYDG>$=NTNF-Ef?XMo5OW5
zlP<0U23{qZN)4Yg2+;m<=Q*6hpL)ksbnPpxXtPPkRQs5FuKKCfUS639bg?S4-NJj;
zmEhQMZ(jN<u{<K;I2eU2P`J?eJoYdSM#FHUn<XDdh2LW#t?umUaR{ID48x1lJVLBk
z-a3y3N4X`7a<<eqhRpE0N-oiy#Mc4``E<m_@;+?uOxZ58JgAlg1JtA(S8e1q-}uEz
z4GCCpe(pAMwH~Jbh>fR8Q%Oy!!vU$Mby}BFfw5(lCxSeb9ouXa{f(5CTy{w?4EsJW
zld*Bjnw#Cc=LVb9;7K(lboUr%bXL{DPrU}lqr=<bQZ@TZ_S;qY(szrNehBuDn(koA
zq4b6ATr!8DfxF7b%lU96Ft{d~Lnz!5E=E;Y_?la!?Zu$w;t$bZhhZ8g*60NQesj7J
z-_uW2D`ZT*J+HgVo?1-qo$#`bMh%_FN`xT`H9B)V{?gO1GgtCT-~wYQG)pdzu``2G
zJtInlsqCH<>}AzH`LeXzp-#GEF))-y%m6Mz8iXk@s;1F4$DbRhcCP+<Sp0_OwO@&v
zjNUF&`ih;GT47391l(Zx+I@SYogwRdrDEM;$7~VvvV7R1<RedL$Z;y7?S~BD>Sp&E
z!GRv|6lw#6_d~A%a;g0-51w4J>7Wufao3L4zfYars-#~`<VPIx6=DKDIxA>3`VQ?(
zZviDck_9TmL0>nGGg7>x;k#F|mL1#_o}?)K>t0@g|ByIbFy*iSN;KFV<oOBAI}|p6
z!M&3ie4&nA<ea;0N@<(~$8NOG7R?sIh^TiDniRC-l5glfVbp%O{b-jbv!*zV!>NPW
zMg~vJBR`Ro!Km%|2Od<p@YX3fe(B~dB4Q6d{Eh9PVuU<E#m%?o5iB>sIk~?jE3JdG
zP4id4VK>6=;-`?3>U{TL%UpfjO6VCwx9J>`a3mOo<357Xb`vlM*|^0rw_Ti6YQj%V
z8AhSn^_)$Ety&MF!8_hI2MVX^a%RgXr21oBtXer=Bpj>>a-Vukj|)%Ueo)77H}ALi
zWXsZ*lMS*%!n+k^7QRf4Iu;k)Zk}2FO64{nHyD3&d*m0Jw4QbjPX>m^-6t~1UFC0F
z9U917sZk12pz(F{Xn4G*(N^5r>62a^(k}_h**TMDD|?<hr-~QdTI&U{wPfA61u_?q
zvX!3nwqyBi#~>F2^;##cHq8LHY+~DCJc@&UbQE9D1Pk-Nez9|PCFjoKe2qs3d2iNL
zn%ZWVoGacf?-R9yS9W*vb+~-}$t;-YK^Q1z)~vQ`*&aq&sd$VaO^oJ?Er5y;dWBWH
zo&fXzv^jknf*9GK!mVk^mO3=IeAVS)>iO+Gk&yGgtUUNZ&E2*IY)u_Z=8x?b^Y`Eg
z4zmn$`*~4w*VqGIt}{;#N1bxV=~?A*Oa0iZX72PJMwo_opOCi9BHhBcFFY2UHnZ7x
zNI$B9esp!YT)tjEo-<1Ab)8Y@mZo|vvBthcL#|}s{M>yyZE7k>#_cn6=k-=gkbdXW
zpRTSf^~)r|F|+vqB}u#NqxpNi<n6L=?{QA1*>Gk1uS15bPX*ofqCv}h;U-txk9*y8
z0);+i#x4KMYPpj3^v46e1L>e9_wsK~hf6I64Oe~`*1;X^?K?8(RmP#NJq1_#CgDvy
zzv2k#)^_C?&>hNxH{NckLpPPo<UDu%Az*%c_w|9qR-6G#4Hn~QJl1_Q_q^_nD~<HN
z{qs&WF6Td$l})&h-KdaA%^RmpS1L)8kGhl!DMRENUehy{U5A4|UQBW>xB$~LtDURM
zo%N!{^VMQZaE0(}wv;Z66`nac(dSyg7<LW~B%Zn#<jS=lhO*~v=sBANS1D%x$fj}4
zgKIABrt4ibsyJnpf?V2q>YjTn58H#~_&mVLPpwc~ZGFvA>O`E?Q5eux52MONO|5C3
zpbG{T4jC6&aSXscDyD~jcE=@uM3?9$BE4m}SuBGVn*O!wPN8c7k6%mIUL4XXjyyjZ
z*%WGjsWBqQQ_W$fP6_<;A|Um6bpWw7{m-@tdeaW%Tx)p3mouYd{FbP_zE(kPnV|c`
zovgmdt2u8xKvOFNPR}BL^6`R7?*7#2(l4r$mRFy>Jnlb!S~64PoGHw5j2j(VT`pd2
zN)~TuOUPH(V&3Uc0GBq@XU0xH3#|8jT-D;kE#4s+v;}Sr$<G2>C&FAchdqvtMcgw3
zT{o#z(WS2acMLR?aCw0-fWS|umqg`Q(y>Rn_^yezEDJ%`<_~nlt;szCpKt!}HZc4U
z)o8}-F9H{1r=nduOhVbs)Htfr(Nr{(y*JSRV@`4rowge~_8aI{<M5WHbFO>}HY)7M
z3Yz_*bD6;TFXYpbJ<K&`E}hoylTlL*NzxAGO=^-8rCG3M-=3?~Ot&V{7}4SIwB#e9
z+xBiVR@HQaVbY0%FbMpolW+19dY7sD58rSpG(I60Ne8M($p}cPSyb+ppME?BGgAh^
zh@3zi%)<WqD93>nKhxa$mrioAR))70KRXtMz&+NJ58Uv!R^ZtSS;O)#AJ&~L`kh82
zdg*_h!FYAhLOv~lu{?Aa9)ze#I~`|5`a>56=GG&5WcKLb@3KP^H%WIkGcue-es>Y(
zNVm!=Vyv$=y3N9m{ne|)tjU65Ec{-n$%vZhFDjU@>)$Tij#YLkocCDi_o$2NT<T+P
zmqf}NY`#$onp3(2)8Gv|l9dlVV`K>K(=6mAm3o=TR6JJ};PFd~cj&sxgsU{%3vLeA
zAq>uZePrkX!{?Kl3P&Aw%jr+-8-pov<{$;5pE{pD63i;|gwN+&oQKm*-MD3C(2edi
ziDwtZAeVqYlm_ioq|#DddH;^}Fblu{^&a&aQ#-5^6`zJ%+_@MIce4(hdmDF#Bn~c$
z6mT2TjEnrD;H{AmL{2fChQ=(Lqk(>3+F?L`w6Q8&FmleHUzTR)S#ag-sOCfBmbjK%
zi<evP4#RM+rQ??(G_qT*3ihYNVwwE>Zhazg=30&)&M^63EdOfgvHVEIF+KaMk|-3f
zn|c(E4uVT}t!cK}K~5`197gi&Y>er4ilW@35YETnDNhzn9#AUN%%)?=TX)G6&2D9>
zh%VF6c4~#j(>ha`#bz;q|3%qfhIO@mQKPUTD%c=cq_i~BDIy&LQql@acQ=9<G=g*r
z(xr5RNT+mncXz#W;a308d7lsGeAw6KLWSR2cg#7*9CM6kQ%m}tJ&~q0cWItjAWF#|
zDeC?IU`W8+fqGJGt@VH5vqIU0z#yNUm7Vt*)e}xLV^@+D`;mioX(7D8VMw(SS{;|o
zJ1%?ws)BopQF9EDry{Nj;2ze$A!kwsL7OQhgegD21pN1=dzUg0xX4?xBuv?jR&BpM
z*lIvdJ>z6x<T!|^ZVh=-L1*bN<kMSIbk6Lha}vg3Yq_BBOU6CJ-m1%fjD+hNwHkaR
z_xE0{g0_c!*g}5UR{CaxLYbL7fho0;tf5_hM^q3koc)}W<Ac_MX%Ju&15H*bNlnQi
z2M``jM55bi2EKf(cDx?UTI2golBLA2IDkiGx2OD=*$^r@JgS`>9Lf~V*U>JExkycf
zWIFe{Oq`Au)Dw4$@o8MGLE`h-L^K-jQdT3a<K?mH>;sVM<~*7edL#<A%wj*;e9Arb
zBrqyGP5Nayw>o8%Gw%B|d0vwK|M=?dl~+LASnwlh<^ATR*L3!Zz)aOCxqGsmU(PMi
z6_VTyRJkt!5Z!eI2~1USle<<l{fU^c<m|!*(8(iyp%FFbS7T^^sQ!`Hzz0=~MBCLg
zTL&7hlO5+1R_?ue$*1bD4DC-jmzXrmV7s*5!*<R}m$OOk!nexubc$VnDl2vmMlNM5
zk?@0q9YsWf`6JYub+~|`HF+BrE{|bk`j#U<cC?x@?n(|g1b>CNVymVqP(kkc47q<4
zc235wJL|Y9WI)W`gxrP)xzoJoCD#tEZ{E?%tyPsEi5H}wfqc)xu8J6+VWAuCg+Kx2
zfpu}$o$uP#!?xOuPDP<IsP04-SdO`R$bOSS-6VQ-i^lbBUwJ-;hUdg|jt1JR=(*jp
z8)HvS(Lj#NSC~d`J_=EFSgU?;%z6y!_kVDe|MSXsc?!2dq?^almJK6MiVjo0P4*+c
zv(hf4O0xFuH-5-^&J#evQtM?8ZzyN;J!{F!n&_pngJs6$h2Sdqe>l(ans4hYEeZz!
z14AVrbmY7dIZsiLWByTc5V7KIdf1NKdo4xWJ~4J(scK!j8)yk7piUBP+KdFqpjye&
zgP|28*~0p~q?{~7FE30>1;-^SIWBYj?#FrR$mNqp^<#g0*jcT2h__clG=1usjZ!_0
zz>{DT?t4aoU#UHL5M~pQc%$ph+&$YXhnu+NF6j_!xb<ar0gR7D=<M1&IkvG|yDzJ)
zy5{wiPBUlfq9!9@Oz`>iW{0G)y`kG$8GMce2&x&T61Z|ZU#>e+;5n@Rz?*4v_`x~6
z=3@bZrdD`19y}l#{sXDWz9J@u6CoJ8`%u?KAZ6!QHV>*ALnCQj%9Ibf|AUTCVcpTG
z(1yZZ*TJFW{>>Ad_`9U^je9X6mOCgI2Kq)(N6S}-e#43vpbLOM)XG3KIK0x`=rGaY
z>vUmm$7l90J^sZVSh1f{_)T}nBQ2hJ#HDA^r2J(1AhZn%y%wPR_f-9s@h_o*xI%$9
zB(5aQG<=US*5|Z6S1V<r)<4h1)XR<lGN7sih0YAOEUxwDG;6tS&5?BEH00aIuw<>?
zNMWmtpB;ErEH_1CGgbX4#T4sVo`77W7;WLJ?&uG%hi{cX(RS88s$nf_TmP705Z(8N
zmrHrgt2t5Nd>k0pnsiH>r0;uN;+B3uma^!uq_3`TFNxY8yDq*tkW$T{z%XJ{HXxUv
zTSg~2l?cccO{C4Zd-i)zN1xRWVyF!7s(ZeXTYQXa600PiCooFOr+`SM;f#GlqB2j3
zH%}#z&*DPD5?pO_UW_euj*aAx_UZ+{%|LD|iKGbHgSTlRL(*R^Q(<Fsk?$&(1>R6T
z@P?A2Dt@8x$XH-(y0hM!Z!dCuqIJXg(mTuijgZ#kE1F(=O^w`Jm`U^DA#$n;sfpP=
z=0yuzRmz4K=PzuPI{4|+RPL-2HNEt!iCZiVzpWs|E~)ts+}DBH`N7P_jmPdt;K?Q<
zz;+~BY#)TyUy>|l?M>!<`9L<Hhv{N7xxcB2KHf~s@DXaD<?`9J8yFa7nefeogq=M#
zZPR`xuD=<OoM@whvG;dXKUlj7hEa{dxLcL!b5j-BJZ?y@LM*-HaD}W8Ay73w+~mOY
z7R<BX*@#kPl)*1R+fAPZ@Dai4(7l|bn2ikUQvDX?xcO(Oc4593JV(W3bv<@UDzoG`
zPYyRasMOXx?y7rmyE9wK&?E&k0v6br<d3Lix7?xf$@GXTSHjC@wtQJ)bfYUgwz654
zIeu3O_uXyh>qK9-A@%z{wV&6k;fZxp>T{zPbX<p5-Ff>E!>oz@RZkP;;fWe0yzDMr
zW6db(WIc%NVOs{5b}ia*hI`Kc*Gx`%K)?A$lD=g};yst~;iRJsjih?B^6=7DmxD^$
zUr+{SP%JO^1nV0AL#baFxlGTmn!a|}eZVu8iLF4$cbU#%?6QoTLzptE2vH}di`Vq6
zE~YRlSN`~jq>zq5Bp!bSTNm4!Xn`&yn@Kgkf#LOW!Z+)zx0<hm2YMdItp%{!FlM`&
zTDp~PEP)YJmlu6FN&cXcpqXZySb6GIiig;SC1U<#*QANhnA(a~fgqoS!QCl*Vu^%}
zvNNWgvgT9az`ayE=<i8rb(m(HGxfDsY0Ry*SA!t>bfVDUT3uB&hK;*sO9BcD<bT#4
zQsRe^vk3yvska~Y??7r><9t0X?3btd;kcmAG>8j}x7u}QYgzU+Z`@k=7}wwg$q387
z!<?9UKUz|CZ{IEbk2A&Wt4x*>sV~$VmOwejt{;rvIB1K9l*ViDtOptJY$rc?>+xfd
z6oCf`&PlWT$uWNgHJ<_oMN=MHfaC5@f@QY6%NLNAhK(jM;O&MkpHLl?<YI5cByYc!
zukCsrZQg2w7mb1_TR`Ug6C;G<8mpjZ#epny4u{+u9G@KNQydk^s^24{Y1DJa-W{}P
zt5BIzpQd^j_95i%Y&IJ^neVqI`mDMCs+SG)MPuMMfvajN0L^7HHTP03|HvZ<D8)R_
zC^!@49o3;4H1su`?+HuT*Twep$czM6ggI|xhl#W?0C9YoQVN*zL3@_d5U^NX_%Hl;
z0`hBC8sDWN$bM$husQ~fW%c#8RQM}if!6<=a9iso9#aQdRC%Rb@bV$4cJ^J*NwUIf
z)z4FUhMG3sTf`Q$AeU(;M|`C-MIbjKi3CeX5kXeHJ>;eDQz&t5j0L8%U^%Og+oUJ8
zX9LjFJZpvErwG~-m7HHfhQf{n!nDjKfULc3sk56N(yVySb3;x(oR(RwLhP5_`An>t
zJ<;k*&eO3>1ole@lrlSIr|Lzz&zT5TFC2SJ9qiS$v$;{iL*fYMKoD~^IlwgEIK{%=
zQLXVMt}SCy;f!kB0-77I+*?`O)Bw4$2TAkz@-L0&%ifo?CJ>qrMSt#IXx%V>7NsA%
zv)F3cyu)0M{%|BE)aoCBm=^;;Pmgbx_hwcRKmkl;pJp4ef#PvHT=?^Ik9;X{Fe2RN
zNiPyOtfo!4jaQ<v^T&|*jC&APOc0#x-aQh$F$>mTVu(BBF?wF%`iDn{FYW5Z)E+LR
zp6o@R^d<GS>w6MP<b?^R5{KL2C3YPxp)egRW=j%BFm%O^i@1Fo-~H<Ad^j-|rk|n9
z9tm-S{s^{+b=-VwGwVpiILv5u8@?ghW}^;M=6Zr5+Ry<vX>j@B)EwmBYEjDCsvRx!
z00CR?L(bVR7oDEc%8#8Q^OT(w5?lhYHtkgSe8tR%%5s8XMaNJviz2wV_ED*OgMxH2
zxw2z&@GpMct*s_)9&^FKuOULRSND;K#xi&l+at?aJG|$hE7OVjG*{5S*Qz-f&8@eH
z<O;GYp{aOwvtPU?+QCLUvy=tN$5odSJr7Q6CmA4;O3?w@=elS;%xcjuk>7I=H{$f+
z%|1h-tAVdBBw#V?D7HVThIls_L%MzUw&Hw^x>x$g&?k>ViB~L#4pllOqJQ3SH(yDN
z+S>i9-!{#9<(%f<yiFFNcX1uuaq&xHYe&K<*vm<J5J$$Qh-}|UE0nhdG>kc=p5KoI
zg|wNb60<L^xCq<R);On4FxwRsgC0ulaPR|6?=%d~&OmV52(^>_o>s`qp1eLun{j!F
zcQ_~MtcS}PlumLpxO4(<MF&@N6Dr_B(+FAC%;1vsz)S--&hffrHoNpfLok4!gN&~s
z_cNMP^_of_QtrmTy~aVIKp-O*l|bur=9#QeSZcMM^H!>Q{8o?pBkuh+->9-2)e7=T
z%9H)flQ4ar=G)aHAEskup9_4P2N<k9twMb8cC8Qqk_2HHDqThuNZY{sg-TvEhMe@(
z1?W(;deI4t#&>Pm-oiQ(2uz>8bw4Ai0uk}gTN(#}_NQP~t3HsK`Mu_zH*szl6(6&@
z1A-eu=5AASWb3MKeq^nak{vPtduf=f(YpB`X-Mxrz5g4@ukR-wjSd}-J_rAWI`;GL
zk1lk5t8Jt$8*xb`;!=xap>h53ZAPl=q-C3tt$iWtN{3B|=GOf9@jAN2!N{yp=JiiT
zW%<pCC+o>O&Na<+CMi%=tN)`IUjevK+orqJ3&F1vPT{^7;YZ4-L@it!O`D6_kBTuK
z>hLsB)zsf)3eo3TqP$kkO&(#w=Rxec-!^WO3UH;9pn`O$4uql=p&gn78^})YY36ZW
z&?`G9v^odEaSm}Sy*uECVk@s;nkHMU8J<9Eq;Qw4j0bzZfw9)$e6)=gQBp?+ga=b^
z6PcXb5FEWdSY^`XDzw2eMnd{-;=rx36axwhpA(OrY9%>OJ4nJt&~_~JFdCE$bGwu-
zxE=5Jl(X!Ad?Rbli2%1}FXH}6UI<e6J@DaLubayhpeIjR;b%)xJ%D;PbiC*(4dvZx
z=~wAWvf>TzO*9BJ9`)y<VF;d2wCYMsSY9A=>(;$m#`8Zw`RPi~8M8FaG&dWKphdOb
zabX`h;PUv5FY>OPw?mKEG&5%X67ry0t%mCOX~>ugWXDn0|AMV42#3a*94Qx>eHPq3
zaE|m$59Qskh9OEwD(rsL{GxPTgXq^9(n`YEt{*sF{qTq>1trp9y`ILn?@zC>WXQ5)
z8RYE60B+mzTHrYxyUt3PcO~WZ+q*}1$ft3H#3ba5+;?oocXJFDWFsk1cu#{sMYrsu
zT3mB+j}%E;CDcaPg#u?DxZ<bx|GfTa5N(Kw-~emq={q?fF^>0@jk;c57k^#D491mp
zX)zL9=4<;;w-rC)AkdkjkpAn7-X@Z=GkckEViY&rSAU(>{ZRjyclQ*di@)Y<xv)Tl
zy9Ic5R(dTUtS9kFt5~(emU`b6qytmsNMq#XJf<)C3^|`1UF`uA3X|?{7?(Ws9yc$+
z=Cf6TwiAa$ThB!b2;ICYsy7LxF1xg!cStFsDt-U9=qf|5*`npOKAwXib-A5kU|<kR
zp}IUq<=gL@ap0p>>~U&IM6=hrhRR1cy!o-5K&CbSjql8F4k2>gQNp5B57(pNE*33h
z%yr60E-moKjZ2{Nh7G{>`y{XUIxuL0Uuv{h(^taSt|NS!YnY-^7^!b2W@D!Pgv=8W
zba8+Dc6sl26baVLK5v9k=^@8^9V=Q-s3d!_AB?&W`Yln7h-?B3`!tir7chPyMcE*m
z2jxI)mIP`pQoiPG*f%N<`$Ny7bt>!x33C<z+%*@lL^XmfhF$j&2AO-@aG3H&HUcXA
z_`bU##62%vIDJ#Iwk$6t#C*AMCLy?NGky+!@?l!p_+6i!XR$@%soPMx=U@!$V!v=-
zAfC*tCky|IG@s@eqxYJ;Xo;R<-H+bOK`d1e^TUq(_d>YF-3>cPK+3eKrYhoar8_RG
zc77BDJ-xdfp2XEelZ;){DNR^`p5q0@NIx+<=#F!+`l{>gCJxbzV9mzqiP@+=L`Y<<
zNFJB@bh?@sVac@!W>9>r_;&8e5t_snAM4`Q_QRf%9eGm;+a8{jg_L_b6zxU(7+m!S
z4p_QFwlj+`n0J93(PWsr<S&E$@Vvu?ne;!ifqw{b#o=wS8Z|h8Y6lso<j<4y5tuTy
znMWTC)-Z4`q(fT(I)6An-d=YYJ;!LItMG#~W%yO5VHtxOqkE<?gqn`W(Foe73UmkW
zxH7OP^UC`XHB0Uj>A_n;jKB8Lh=M~6XUwyiWx$}u67erUx;MYphK=YxPK1mXBy!R4
zk(!FjxmjpD(oolMj!lsO>!U%03vk+Elh2Rdts+;#&&G1`LOK`TPH_#As43&eXe`xj
zYUa5%@<jaWbwWH@6AzIVu_E?sz#P=SYV&M_W-nh|`CSXB+fD^~PfW$;avoDsia2g(
zM=#yk_b?@1wE}0^mU{q#??h#!-E_X*FIV8cSM5|a;*BJ3L*UeUwU#QQAU0iKu?y)E
z*7R6I`=SOzuX!mq)6sArg6m*gR@eD-pBm43B|zbRZBDI?xX!{M+%Yc9KW$I_!an;|
ze&Hjfi0R2Csx<l`?d|fTwRg~rG?nLFfO^VkQ8$lIQiY&ZkSn?kV6M2k2be-1hfIE6
zrFkp5;c6z+1BXjBpb(*6Ga|{NJ5~bg18(<!J(y{_U%0EG`+G_i+Q(7Le+RTFf_S$j
zhJkC2aR1Uox=mQ5-%Ua8+qL=OY8ZNKWi4#$O|07cr{JS3j<>UtoVoh};-MoL^Q90;
zi$vfEDQMRVo{#zsMhjr5C?a$WWUp(-B-*|~yDq@{rR$SSsgWt@cr>bQXKfy>IG6Wi
zB(7XvNxEmg*(Q{~$ViEXfb|BrVpwvohzv09M@)|pgc~;<+lJJvi77@SHoq8xh%c<N
z56({Yuuw>B_^kySKw$^gzy*xUOg=Y4UdZhT^k<FTyu*G9Fyymkq!-$W7vE6a6)>S(
zL=3QNO2%!y2%pL7c)Sr!eIcoGTh3M^GZ95>ZKdywT<yCS(;r}D5X3TQ^0NjTqo`3{
z$m2#2fbu+<_hTQ*y9H#?M2+ko903sZ@M!enBb|dC>Zok<7WPExvvbzv2dczU-Sdj>
z+G_v^TG=(oM77_Uho-rhDeiM~PpPM}LLNLC1jO{WMs6zQu72zhPN=k#E~KJ-!knL!
zgf&KN$OKCmXUa+m%~1?MiW0k=zrKA6guUy2fq&CJFHly|W?B~f%l8Ho_BX925yf{&
zT`^4!BxP*@Ks#QKTyY>z&uJGeEH(i4`6XIY5{1-lXbmNt;w;cFEUXP#sJuf(S^SD>
zaWj6y*`j^^W5({YwAYl>?Tt3lwq!n99BWlKwJmSn4??>mKoJ|B2kw)tmke`Ky>7Y$
zCT+Ga_huK?c*pgs?KLQ{_$=ueeW#OlOzx7FYVJYergsW^GtIu&o9DfP@dE?^+j(n~
zDNm86{NiU0o4!>mj`StXp9y@V;WM)B#`zJ0cEYu6>Kopsov0^kyF*#|MEZ+fiols=
zNVTIa?NPt4<1=<UYyi<>XkGX_*M?^Um#IU49Q@$9#!LWt1W7aHt=7^6*{Gu8p)kbJ
zg<GkGu_;ZzauRwiS`$eH$=eX(*b&r~vBrC^dPL&?c;CQxXi-!8Zy**(civDJ>E-kV
z{bBwkeSx;+cNXqp65k0gB{@xZkGpFL4jT-dwps+W*Q+u3`fhb;^>1x~4JmU<a@Cqz
zX-<+j=2>5Z-Pm9r<)H!EQKZ|*lG3o*e?0N%IPUf61C*N>W$vSLtTe97P&@XgDaT@p
zLB9}k<LEqCrVPq0{^~D)!{lHT*#Mf)LbZue%U(Kqo-V|Eegt|B_s)JX@YJJ&ZphP#
z*%YhC5lO=vZsc4qyGu<Ylx087=Pm(c?ya38_mPT{wz}sU*nLO6cfZ65S9OmC!J74h
z<EO6&0PWil+)CKXv8w&n+|^S&yp<uyoGh>J>*(0#sn)r@&9+DE)1kj5ZV~3pbr~bc
zL;KNyQhMC&;HTzmTZsG^7ozbq`@fXJ(<c8Nci5J~Pe?$c`V8S25&p-ry;uLdOG0`Q
z2vheT&)^~kN1<|SBu3t%cQ8C`080tTPIOl5&S*`#k#oGi(AqZwRs%z5!VwTN3v0ar
z!oCxiH2YAYj?nr8JFS<kpxh^e1)%~$O0sv(c>;l<kkZftPpGwl;b&-Y&ca6d@j|&0
zeh-tSwQR2KM)=6!8d@&ftvRGgioni^7mopz?@a=HF3dMdFaIOdm2|oQ5!A~`Bo_Lf
z9Iu=_HMSoZ=<j!cewQBZQDCB|q&c}XGg&2#DQ+jUx(3^L_^z!{4?Ao$<7FCJKHk2|
z0XZ<vSpM{_RtMDSX=NHpN`9oQrg{6MkG*;y!8KTqeVV&y3n%?^psSl-N<#Ur8sEa4
zWCC$f9lvXs-vVJJ##H46m%a=Q%8Mf|T0^z)=N;j&A&b_6B#>!B7*!EWCjOJ&BC?hs
zcBnDAo2|2!oX>Vk;UBwEN*-Rd#?l~!kSHUt_PiSaiPiAur)*<xIpD9+te=IP%?_mg
zQQQGn+iLR*XTpp>ic(A0lMd7S><vK&{rBCry9P~M=IWBTd%}l4SlAyC*xa_+vW!&#
zEq*T~c9wh)0RCg`GNAq37BWW~dZBHSvWx61B^%G&9Hu*Hf&F%OEIgSfwqF`ozbjeb
z+bkGH7~|qGW2?9~P$EgtJ?uA8drjJTi$p%g9AGeguS<Dc_&b8^C{U16Rxd)e6Pdt#
zv{kTON9?<3a|FG%0K^Ja@O4Td<yiqCr0?AiH~77i1G<xSI=q7xwN=Q$(oIPexysih
zWgG3<1EM;iwR~`=_Ps!S@^poCYTV+ua5@+RazZK;+3{o%0Gq2lYD%)r9y10G4i17N
zGLC}PUl9Z-eTw@MwOZ!Kd6~YGghi+Wygg*4r)C{j?9~w?=;;W%!&0B4rklN<--(1-
zn5Pk5!sh2huK3&cc3P16kth=*SeS}OuzXyX{sm@H$hKuFB^Y1-Sx*;)2veAR!RaN-
zS4_Fbv6Vnw91-($ir$ErX-av4iOqu~0BS|FouvqVEQ3lDD_}#)N57G5^I28a4Lgj=
zf3f`J*!xBel3GP|kIeUGBCMY5l%JbSz>M(Go`urhu5nak<A$`r10UV6R&(sL!3gt7
z;0@U^cS^sTshd&MnwNGHRcA-@y5?XlY+FVn_l8G6Fwxew+$cXeDEE$$c1tBf-Gt8W
zR}8BKA<tg`>Ord?f%=Ys_hg9b*9}A}+eeLUwO}_IMf58l+Bz&g%APl#o1@WDz7XJj
zkb*5>L+2`q(eLSuK-CDB!74Y$Mops$jjy`RwO<D{Yh)x*i-Db;Jq-OW(yJy#<SqN&
zhtmF~61e~`c@6h(fRt4vJU~A|qqQ)9B^GG-zBJFKgL&A&cj6o5uVC3YnB>#pYXt;H
zCcfJ;T_UkC;jf_5X7jULbSX&LNCctf*$r<*_2lyk%XR@a8{5!`wBXC|ST_N;bk^;V
zV3jbH1y?eQ>hk>5GRAqwQ|egWh|_`M{pb*D*=~2@ga+Jc9m}Qxq)OO?+PI9=5!IBa
zHyd%E%1C0I=Vo#$g*`BzJ0dUkhW7+6(S75T|CyRU2g3t**}k9s3#AGo^%&nF%M+m~
zf2dJ7Xr6zI$qevpz=e86*KU%Spbsq>A(uKXl{Z8Rgb*J?s`GlksELh?lJY_I349DX
zLs|U?W}ceKWzcg=1hgv8l%yXZ<-w%YXYXQq70UO!a=_rTW%LBvx8d@TrxO{w7h>qZ
zg#5%9^bC28VM~_o6^0|o?&;^M5Qaw(tTdAPyKC<Qef$b~KRqyjlt$rzieJPvso2Er
zQ890}INIvDZP!93^nw<K9$Jz4BRhnl-P}6#?;QM(hQ)8D`V~<3)E55r{yr~oE|F`v
z9eTO++_ebX(s92dnszfn!l&YIslN%5O&#IU#eayhps~geKY(QM!oOt6AEJ_9%vErO
z`f`4};5!?wVah3WA+vOw^8fkM0RL&-1vNh$oCU+_<$ru(DNVRi?-`fH|LOVYAp;F>
zCGz-DP&qCk>oAAdWR@)hlSO>Z?u}qf8oa#ML+l%5$ned%tGiW1Fxx_}52c$xpVP+Q
zmMldo<QR(9EhT@S01<JVls1DYdbwMWNpiMRrlvjAbccKJAnj3VmP@W2!Z)+C*AoEZ
zxB@(E!LZ=JyZ_g17lVqKaZkFZ7+-zKq(*LQME*S{|J0U14Gif%XhvYmKmRgi^Jw?S
z^>!pjpUdRl{_ZOQuwtv!wAKxjx0s;c40Dx;VBfvYZ-}}E?`zltl-YrQ^a}wG_}_sy
zWUu^_gt5dZfZllr!vD;G+^@zTZ0=V!p+Bq9?S2O#?AiUq8K!T}0)NUxFn*4q74KG*
zhBxhDl=d2fkRBPz{vwU@PrQ-$1b+Fxb~20*{5~iy<LIx>@$mGimvBFX*jRLX7a;18
zsO_A@c&Od1YnlO55%w5;V$Qi*ajpYH2$$hV2&xLOwhz%PBIv|Jy@bf~#Wd4n1Kkuv
z4`mt*8Fl~u@S@Aq&%eD0djU$QYkV=pMf1n9&;KZ{{m+QyXf%qni?%0Co{orFB|7<6
z1fS_pR|a6E{S?|h@#Ws`MHts(BvN|tNY9L;Vf<I>419@V;QHY2`rMd>sg6WthMqs0
z4DB<VhD7n=*BLNfgv|y~6``CCZLdRy$W*#hx*~xw-3R;zEtMT8I%yF4hV2J4JQqG>
zzQase90qtWk@*gtZ1I1bpNh{g<*EIjo*W=F4X{z=hE?^6rPN*E)zBd=n>qM3OZUdW
zU|a&J^m}#+U3fpitM|ykndW=Ldl%u&6zEU?omi2whiB=3gmUK3^>s=n-rQEgby2$P
zFvB?_f>0wAE&!uNs|a$u;-cDWR1YKuR;e<;Vm2VFkTCR6;m@l-i*kZo@f3gGjqJa<
z%t6SzbOHR7L_CqrU*eiKG3WxDu-P)*6B+ogNUST?1lh3vl^^`?xIIL)ZDA-<rUV_+
zP(y<VoF+g1)17c>M+iSw3*r%HTxd}dyaM5^CBQz)Us6S=GaUdSheQGBVAYl_hK_}B
z_(Kjf<l$Edy%>clPu6=0&&NSBx#RJ_^T_+c3EuC9x%ursmnbhH|I+YXQVRx<S5V5~
zZtg(1bQS<O<Sj13$&*09ft7P}F1%;qZ(jO_o&cxr{~SJp*DR#~ue@lv#PZK(nMc^t
zzK(gKD5%HIE(l9*?rXqPQG@_g1a_Q>iau=P9}#AdCXv)#__qxP@$Z0~OaxzI{J{CA
zAk6!}L%GO$#>~PK2#nm{06O^%6od}dzE(>R4TBFO@px9f_e*C5=<Zd~g9^w)7kTpX
zs#mzIkFsbOU?%wgP2k2^4ULFMTQ^JpJ<Ha~2XFP(fWarY+z@fNHw8uiZee;vOZjpA
z{OP>#=|@9DK3ea2^WO`6gb251CJw`V4h*769&HbL{JmBwkqg^JA_w))7IVo%0mpL@
z7-Hy9SNKu;^S_BsgavubodbFkf)EH9m7L@LykL;W{ipQzdo{s)?QFEa!Dro5{SnwT
zs18LHW{3#565Qf8)W~Q29oIff!Qg29@d+kv{pY`*2~8Yk7j5pG{w)xXfH9Q(M0uEL
zHG%W>egSILzk$*bbVcSa$36Y~11}-6=D%t3@V#Ge;c>6qAUbZmqPnmb7wI&<fS+D0
z69dD`bASI4JV7D^L$0yGzj{?tm*5GW83c&@eu50-3C7JB;0d&lkw}Vv|MoL7mE{($
z)$sR2A*G<1OyTd&;%5LU5}A}=$3iL~3tz_IXe#~vo2b)>PMX8tw*k_`?4xl+4uRJF
z{m5^KLAXek{f7Fa$QC0;->u>Ual?z>@f}Y;*6GVmdI!%)lh;S|=k?`c>o}d0sz}W9
zb$N%Z5pOkoc*?vlK~LUP{6gN~QzKp&D<@us&LQcL<Ia$7ZkK#`>(cV?S(QLYMv@1E
zl>QS<hkqY{h(G+k7-j!&G4=&M3XnltJAjPXQD6@Dy|6mqAnlq22fenNlgJ1t97{ZS
zpYx0H|InKz;D)Tpo<6$EJiJp`W6AW7e=t^L=(y?0zsxCeZRG=j(=L~t*>C0cBp+V$
z3T?QmsxOnY;i^kTa_9XKDd&v@!NlP$Mx)4R)Zb&0l*mulWleu~#zh*opBLXy2uReE
zngG_82m?39`-n^O=_riq))=DK1|OFhK5oiV($?RlVM4Yf{^_#+-?!xSeUm1J+EW^S
zhc!aQnrmWgjS!MF`u$qFV5j~cuu6piB7Of`lQs!h^FRde^Z-_MMn%Or9x%pe+cZ7f
zECyplxXYs6f^T?r&+`C9_N#U2f@;Uc$g2Q}c7WN2+B0ga&~MB>9R&ivs5<Z~+L2Dm
zZs<6=qz#`nUO^bI04zCW(f6R=v#0O?ep#6hIAa^1{AO3BAlIX}W6mWw*f1W;$YD<m
zN-8p=kT2LERNTbOVVE^&ms3y;+*r0#w?g=mGLqcPG|<OCrEwr^V3}R+X$Ie9R<EvL
z(b1$hA5ml;W-}trRY7*W9V7=}-~X0q`F-kBs6c|=>$gJFJZ*2o@3$%e+f*;BElA`|
zMFo6$K`>PlG?Yk9CO-)!oV&*MqNtd;`#CWJF(1vBwI7whO)iOSiZo`Uvb|~DIByow
zmwQl`=dQIv_q87lMHkhB>>D6;iAc(0rvyREsA#!p!GlF&<Np5&J)IuM7s7DVp186g
zJ}0mWSuot%vKvK!ghPR9p3_Ts8riXKusLj8eqQ6G#xtwC-?8y`@U617813dD(jP;j
zfITG=)0MO%E!&}!6bN}ksbFsOuJw=TK{wjP><VB+1j$1{Q!GVn$s}MRO(j~W1J4B!
z(E6-j-t;%_5SGK%aH3N9eGmN`Oa0$r)w?LbNif+57?$lwJH>j^nYZZbZq9;jfCP&O
zOiMlKhnc;5flf>)KQRkdD=&8>gd1__7mOS&TkC?OD1JAS*OM4Oc>H&Rw*vmoyU)Hw
z++CjGHZphNwsl>URsDTbQ9!Rds*5WU1}9LOTCk9_;s9r5eC5+!%%su^fJlN7cC_vX
zB*jB!vU08K&zS&WGTypm0?(GV*-_$iJ(BwLp&jHLyyUXhrX~6HIA(~^0craN$}71O
zKyC!oY`X)RKLf3x7KcnPCKGdNd_my}95(Vq5($@k@UA*_vS|X(xVXT%zTzgA2qU;M
zFQ7T`6g)u}ErOBkstDSM`|SMj2MAZX9qB9AeAhBfYeN7N1O<xf!F`;Hre+9a0PY>R
z?Q5}N;<&eSD=$%fc6{(I+<ANDq!Sf#$?cp5%2X_)GBW-bAt=psAFFWiWui^Cr)%l0
ze>^|kK}bM*djr5HFyMIRFe?0CvhzP$2!eF<S#NyR8wAVuwEWru5D0`Jm;4+cASD87
z{}5VhzLn2-U3%kgN^+UnX#qvokz=EY@iJS-y0|WI6eY-`9o9G9;x@Mkperl5Is*$S
z3N@v~a&^@)e_$Y5LFad7Wqj}`pM%oX9`Ejjyy-t(wi5z-Z4{%ff|E4`;m*JdObt?)
z4RY;nv{!4|11b2Yv5c7Rh!$#jh>KuiJp9TZ+90L#j41Z8fSU*Z+oVnBvIXZJ+io@c
z^wREO?G?xE{(}wss3^7r)s<D-kz7gb%#}T-iGB3~lP}UL1m|7&z)398&pJBypP||z
z*{IWQt#<uh=}6jd)7?uS&Y(nB_UX%m?f9$ryfx=}!t#2;nIC<At@UJ%IqHoH{`>0$
zM#GLfs@WEE&tx)G*uf<IZlmsm@1tXScoFWF(mTsT=~lAK7_;s1p0>w(GoEDKx%vtl
zfRoP*^Aj`0<rxlyJ&;MjYPRW0kxNj@)e|@=x82a%*qezC-`ki8_vP8I19kx;x<cYe
zY=F!!@W{r#e`zjMFI%g*DNQlkgCoG?n?|+A++2iO6L5R=v1+F>z&<%lK3(ZC8p?SI
zFjjtS{K)g4Ke;XPEj430&$ozKmJ15<BHpk@$HR-zy$|+kG<W7mZlfCv<TVdM%PYZn
zr0h$=4m3}HrYYyQ047BHrN4v15(VnYhYLNKtZ=^-kQQ7@xrWCP;(n9-7IQK1IhtUz
zU|QVGBqx;0H&V@hRqr2rYh${Z@hT3>X6ZPL6flA1V9{JhqOW3^L!maOy8{Brx|*+>
zUUsPQ!PDokUT)eGJ9oCa0pUzUahHPJZvuF{>fU)rOL#*ST+?EUt(*R{8$>L$Dxxdk
zLN~^0o1%77{CxA=Z-2D9>+m<%H3<~YgWazg*o1w?wHIt5bfGKTtyD4jab94V>*a?^
z3?k~}&i)4|DX$xYreSO^hr=&v9=xjQg>X&zE3ZkiPho8U_=1%?*OM8Ioh%stwBpED
zr*JRFXo%z6PaudlniV1ZoTm3zamM>^#}@nZSag2ywHktP4y}qeXI-duZ;10&@}M7<
zh%z{OEW{B?)xl@MT3^n#Li^*Q=ZAQjPOc1RJv{vug#k*-%G&<Bmoqc-oK}kz25f2g
z;AzGy3=hQT8mnl5R_D%teXz9Z;^S1d?Ox61&AE=ZVQY3CsI(=ke2jLRvmvSaok=gN
zdF3)x`tlX5BRYX%tIlXLT7jkg<F%ROGyceIe{3;ixInOld4E6^ND@J9QP`q|#FH(Y
zD_o3PO(A@WPJ%tQa91g1NULGXe-3IxZ)ZmpMa<+dhtItKx1|uPGhn!HG_%%jdcTzv
zl<If@TMO2JFcr0gza;EU70<q!<f-`_FFjbF{_xVlxQ}8!)546<WB5#cRWvc(ow1%+
zz@(>;PFGuK3TH!Ilx;YRVbf%NKr?jR=L<s+z{_m^yMCSD{RHck;WkAZpXN>2ee_+a
z3cjIT5b<BPz^uwwts;ZIhhQUb4ksimWC8RyUh)9Q|8?KPYyzOP?kjj4IXdWYZhrYu
z^0+|AZGp;-I||Qne=gBwS-m>?EH^J<J{SPjFCD$Ky7ABCEg8dvb1LPwQ|cq;AeHm~
z?J!bU5HnrYkjtoqH{_6gS!CYC>GP)Qf)Kv#*F+nk*eD2mzdQ*2HJc*X&BeikecTOw
zKIie<3qf(>$<N|rBD7F$t2=zRg(-e6`=##m@U1X~;}qpPUq2=-4;3fcTD|KSY6zr~
zNLR|$xCPH~Jsb?B$*w*&wZYsg5HKIeH>L}*7e>KCmyX!l8F99cn)-V7_U?Ji$i2e*
z@NXJqQ{~eu(7zw@k;6IvC3$Ay-?HtAz)j$^OqKedH!I=l*cD_yQt-eURci}hXXkwZ
zr#8WKq6Rq^Ug6$@FE`)VK*jEDD`5z$GyedMXrGiZoO0$)bqtMq*`WCuc6w&LPKssb
z{0gCi<U1sk%qW;YyE0Tcr^lhaHL$tRLl=KdIad#~op**p^mBXUQspI*%ac?$K?5K}
zt<<hs{2*A}i2<8R-mCwanNb<cCch7*f9X0?0^kFkdQWDAGTKd7-uGa-Tz}*FZ@=o_
zOF<!xqE>$R?yaP|5&GNzZuNFPctR!mc62!ZqsW=tJzEz%H3A;=3C<_Srj61nJ+LLT
z-~e53jrYOl?pJ|PX46cJvw?*H-(NT0B3_2;r+Dx9U`6x7J!ZYng|o4q;}?QwXMoiH
zV1FN8ioxb+w5T_t*VKIf11_fd_w}WV*Kk9geNdyN!u$nWlP>-^*6LGmg*J@UCm0PC
zNh^!CH1EKC1A}qNn&4mwFdPW7?Y8D4!s+0-@{oPUVp?;0H~+WAn9i18bOx(<iKyfI
zzwH7;VJ*owyZ`{XQRj@Jh_P$amr@1*-89%8DjeU`dUuzcFwt+mE49|U<J-$8an95f
zQH`0CS<;1OQw71;q+8I+(RNsV$5jF0rwJ0j*IKPnhEDr)<k`Uc2rY9w2!pDF?povH
zi+ETQ`*92Xd4|e$yB@h#?#9S)dk7D(+%FJ+QOU}+T#z>(b-7sMo_neOM~&F%1~d=C
zV8((-F(StrV|9P$ZCIV7;dE2zsw?jgFiw?O6*_a~@1Y`GL7Tnpha|@ZmiJfdi_^6p
zpZ=ND8`MKop8erRI=^C2NGko2=OzJnM#=T20-{<p72wNH!vvjYXy72;baS}2US|>p
z+d!NzPrt+AS}ns8)(*3FOGJgft{)zXhk|mpmPFeNyKk8n4-21}5rr}7`tJeV&<j-?
zlNzD?20c%Pt6~~y1}8>46(!3iKYJ?P*Z5-wP&8(Et;Wwb26A)*3L#DN!VQdw2dr(S
zOe`gnf^{2n7cAWYJ-?#TP2ha^a=JCz7=G02>hml*zbYcE`gQpwk9;F~4zkrm<@|0H
z+-G{jTLoL!z4txCZ!7K7>&1l^#i@MOrGoqh-qq84^9%|*^0Y0+7&qO{pc3)I*QhM+
z5B6_9FSbhm6jG~N&sH)Bnb|a?R~~bGu3q3Vmf&^_B~2bGzK8E8PZ3yYFI39?iQCvg
z?YyKWpH!?#t|75DmzI6OhJE>~z25TT76jr!T&LZ!2Uj;1GRwo@;rqqIS(=p^;SUT7
z+3MR8z}RmZ;n0bGWOO>T>G-Z$D`h^{pPLGA&0ai^QjRaTRhD=udb>ptIE(SNz-Q#U
z{@|=wFs*76q~_xppKb<p-+k9#Vw-IX(V>~Kl}j_>X{4UJ!DiI6RCyY6n0)~zaUq!$
zMTkd*(2DBM57${e&UJ>g`2P}hP5lPlb>~pXDwIs!>_+@Y#di{k+DRU^hd#Ad2^qdj
z@>F!v@0QiEd;eV7=ZVk7BnkS%CAJ*zW3Hp|he41MHYN!8ciY`qB^*7zA}le9LdR-z
z=|9_97aa`0m|8_U+KH*o4{s%NwgdaX`<qk@UU>bw_NE5r9WI-15ck!^>Fg*TIqq-%
z;%=Oqf%?%yH`9jBkvnvMI^39gtgh}8@x-{tqLTsnZFUR(xv?DlX<t0f?m^IwG1U*4
zsAHdr&_sI3U)>m9_USD`V$GaT_+0GGC-HhiL6%ky#>eq2@YB)tw1%Ay49y`JT*xzL
z@gwB(k%{EAemY-d!4g#Lqa4yzYHuxO;-5sq|H-}`k}M5$(Gt;xfH!qi`?O8>@{C7}
z+%q;^)ZSgfemMOqZsZV$!y;*JsMvZmC#QRRY2YC5GS#YdL?1K{Ukdu-8<L5Kj1ENK
z$@nj`L!TjI7)z3lYkG5*QM^(44;SE^25v?PZ98{L-s^_X=qj1&=NO8i_ov}>hI7pA
zrWe1_RAaDY;RyfjqKUmu7V2m(@sCgM`)MKxP49bpyuUdP=+j}W9;7=#aH(vl{(T9!
zxS<Xb;TC_dVpE&y8qW0nHcL9$wPXK#eGnU7bS-{QT(Z%v)g2@yLztPfiVI(I2VX?c
zVgQsJJvE4t;B<0iZ4OhSA#g4^#W!MEJVM&v?jaaJ@`6VPl38~fb200-S9K=KkjpAi
z$)|lIdRwBx6oEJU#s1{@s2RZy<P(Y5mu1tQGiYU)37R?6v}quhqPrgn7rHWR!_wSL
zwQ}|Zo=HOgCU$e*RGrk*z`ovKbp*F5lDRYd6zxpd(kUmUW`~xA#MXQl&3A9y04!os
z?T{G%n=No=a5|}{w;=WHAGGXI6>L#nX0XUekx$oSzxE(V1f^^|C_)=+i4{_>U#i@D
zlK~0J`9@JoZ&7)u6f$!7xqrUDdKqiH_uTB@3beq>HahU<uSkSGB;CB%pQEekZaTR<
zk#YX=tq_-U7nKe&cP>Uc<_n8u@L>uRinS<&)xxcX&vtvLn66~i)fPsgo7>~Xyd%7F
z%A3QOvo#ivf#p@ZIp#Cd;i;fv`i}Hr99%^=H@HO!Tzd%;R==Pnp8N06Ac>s^tS-N4
zPYB~_qW9`8Zx#`5^Mjjw5#s5AtyvERf}HN3ad9A#(m)|t>|S4Edu=V@QDG*8C-V85
zL0{Haa-!fnN18%>@AtPB!l6{zyTln0x$qc1l{()3m(rf9$q1q^9|q#d7KtWn)*&Gc
zftyH?3oCbW>hQixjR<+9ERzDaaPWj*&rc_NgX*KYFPiWeFB3JZaq9b~LFWj~Q6ly|
zbl%eYpo-B9qwuGo&Q#3SQXF@Y6=jg|2si6N+An(DX?Jg@e_p%;i}@{-%Qh=nwb&{v
z_*QD^;7dxm6xlSM6560MLY02sa!n@$8y^e#2)-2CDOI3W?_D6J_>{B^rQS7r5oyjy
zQg`*iDbm$F$k1rk8sOVTOQs?IQi7G`kkycSo$olJ_!6MNrSyvpO5`kCXt7R&A)-Um
zz*wW*-><%{ZqPiG0oouPW~=K|)wm5q<kEAFk0%O<>mzz{^m|%LHrvIt@`tRl%c##r
z6qi=<rz_`QQ=-g#5<`f_CXj^Ya7m+4k>q#Yjkpi%qtp^OJ9z1+o<xhi%jTtBx?t{d
z2Gz`)mt}-lInN+8Zz@r9cn)(!N4nawQUoW2$#Sx~N8cNq+fgHopfGeLgoKeWkNwf@
z<fL$9LPN_GQ&af!r#XLOAzyrMzDEVu!+bTONN%lr6ow)p6<U{R9|;8<*caM_2yzFh
zBS>$1K~x}});F8-8?8Bt;gY8A2%=G@4vwbJpjqCkw-mU>iYc}i=*Ef3!=%?)C&Tno
zo&3`$+@J{3o_PBB5N5|apM|CTk_gf#AT)WHllaSgiuMyg@qn!5R$;WTUo0Eh+A5{@
zm!o>O(Hdhz%x;}Qiy*hxm_?r{<tSzI%>vp(k0%2p`Ev7dM9##SBB;;T3>nSH*X}#d
zFluJ^8)=K1yzVd%8L7~@NcPt25|BjA^?uikc+r-6oqbf;g1k0TR?J7<$f@Xhr2UxJ
zgDVU2q~Bk9-(wz6h}X%29A0tZij0|LfM-}k<6~q}o+(}|_*w_HAxQZnx8KpNY~H`O
zpOh5pAPNDEn(4yc_P6X9mcVL=G*Hr!pxL0C9#%=Jp`?D}d*G-ZeAX?S-tA)*#!G+r
z(`y)Y9u*&07jgU7xWXFz?mrnHl*<}h{HEJMM)i!$ONXEKj-6k|hc>fc2)f`nWI{6h
zDO(T{e-A4WKBAeEg~#@g!h5PISOf)g)<Ko%8S6zd?YHnzy((x>r$i$b%cUb*yY;iA
zm)m7b%udKxGp@S_O?JKvo}TV{n%Nr3*~4TsSdf&yFGsB;C35CV-GfX=|8rib9FZKB
zSyh~%WtD-w$7niURbT`umu_K7FH(jmAFNF2eYM;Z$awHf1my!rp->oI5x{=kQ7v>$
zIPZz*joNRS`q3nq=rW$S=~$2fe+q4irTiXzABdV3%Y*DqCh|B#`9-&nmd|;|*{}-)
zi#U+2McAv9X9uTL2z^rbAd|Tw7ot?|R7xtfep8_Su?Qbk`r{Q=55f(P&ULc{1WKcx
z!7V1~{PaR-<#EwTWr2-*Xl(0Xd_nOm;wz+=+M{f%KflA?efLrn2aP`x(uJfI!j1n)
z7fdisjt{!`vZG~o&oTU2#b@L%E}Qyetgs1PrkY#}|FU8G-HjXbGIoAzr>+Sp96lEv
zOy1F(&5oMgH6c)(_SzAyQ0Lzvz8SM=!Yy1!c(xuw&;+gl)_Jl|zJt_3bn5u5+^UHo
z^qj6bHC}fGzjIrO_^rc9QaYi+@Qucf$M2-NBt-CD81<$4MD>I_c@+I^HJg`BV&SaD
zEWp^{9er|yl(3kX0If9DN!P1-#Q1yqa9(J_+lh0O6%SQ#T>Wt|=uQ=_vW=gP@yYJQ
z^|<e)gCCgG(QPSZlWz39!~HV2U;`tmO%R@xJLMPH=ab^bGoX{K-(0#gjAJvIQx20-
zsYv?fiArLIeu-)w4VRGrT%_l>r=0hTsPu?j!f?GX<%mpux3@vGCYRp;s-FD87bGcd
zPL{L3?k6Zas>UW~y>5ITHclF>q9N#gZ>mpz=J_M>&j5VQTB5rkB}Kuu+PMF#^E_sw
zpX$hF9jmwtTE?I(pV=$BZ>stB!qss#oc}Vhrk?6vJkJDssg-1_Ic{NX)fEP{6*)D+
zDgFfWC2GY~y;7(6=K`6>Tg;^jcGa%;SJ5peb!7>W>aR{PTl~*TLKh!(vRzZiZ0>Qi
zQ&dxKr3gijmD?WFzmHK9B*KHfiG=Lr(i^%?|5(_QApFEzdlc3Z;Fw^tT)$%eV;&Z|
z34?bufIV+B;w4I)kgw0p1Yud?51!&d5ihZ<rxf-tHi8V7A;g!t0l{UguyP{j!T4D^
zbS+sU7hirVOks*x!Rs4GvF{CBe1jW5*_X)TH^8Mc+`IMy8jVY}H20)9=@@<hHmczs
zvFLwh6j<+TlUvXW!f9@!WygDvM<u<Nu1rfR$-O1(<@a-0C{2=M?kii*t>(rDQZ@jj
zGz;8T#bPX}A_IM<ClP)RW^ti{&J^_F?em(+nwi9Q#9|tu&s?~o(^@LJz&+{rgzKHA
znGJ%;C9clG?_oW6;mXH_X9t7D)+-gdb5U&tYNXPh(K3O{)nh+D)T+qiy7)$zOyYJG
zSuRwt7vnUAF*h>lcT+EO3|+_UvMC)AJ`?|aFhfA?5u4>)do^Y!Tce~y+<~yEE3uG|
zSGezXuho!9&PY*WwOWfDCf0n^F=&xB!Ehc;E#r`;8Qq4N@v~Q7f_Eu(lT^;Qa6vR8
zzs~ZB^bcSL72$K#2owT$`&~n)YZp)n{T5IwFR8oov$R;gpJNmcSJ4jk-rI8|K;J(e
z-x2Ey%iidh-M-(Xh&Eu{Yt{EcHG<*B9@Yo^9=L_yi9){rW!V8g?Es>2o{C;X{KZZX
zII<E{AUE~ynET28yw(S=8K{jo9LK_IN*;Y?J#^)V8kF*BcgR<zInnt@Fjq0TdXI!h
ziM4OCx1$C|aL8w>;dO{JJOjP5+?9Bb9y`cwG}VIZgsvZxi-+8>j>PX*$WSSqUBEP_
z$g9oZ&)@>3JdZt?Bv$Q|H&ry#Tc;90-zyxmSQ#ENlY8bBgy%?x;pTSf>5os@#@243
z{UR}w)a;n-JvXbkZmZoUwqEof^q$<c*d6<zye7=?fddDCpR7T@&2z#bNo4LCRsE_n
z(avSqI8(GsYfz*wdkVBELCtD9xPU3IYqZZPUH?VMhkbb+Gn##%&ai&Yqa#5AhxkWm
zaV5vDa9tJ@z2v1wnAo}ivd+*B;h25)H}`k@V5Zg1mM0Su?Ae9D2!IpdbjUp~HZS}*
z$87}d5|;5VHk{hcp^S+X&m?yOgFi~|cntr<bZ?q8V>TJ(m6XSHuw_3Wxf*nt(C;D=
zGJlX|q5eIhXIzL-?4BpL;vWLjKK5DFhl4S#Urz=IJKgc;FR1F5s|~Wd;eD{WQTG~U
z4in!E<B5%_@asd_jQT0=y7<ELrF0ic@#Red#PnO<6{1G!mLE{f7RB@gsM==~ZIc#X
z>qk(cIx+ye`o1<l*CnO5ULK6SSZ`IURvD0UZNI#wqzc2Mz5S<Ap(KgSasj1kY6wMP
zeF2`sW*EL*@p-c4war!K>%V?>!wJwJN(D(~jksCba!mV7oBQrBCrQSwhD?f!C@4if
ziL~@*z4lC`*IVrUJ`IAK5A^2Av%cXJ#J$I4@T0G4ykAvT(>Nq-s%55bCePyb`sL0;
z15Wmsf=E?^8S-)-5~e+11gBm9MY8HIRh0=jl-A@K&0uv|8NcTf(1T$H?7Mu0?Qzcn
zU!<JC0=t+IhezSY&4oV0sff7T*|fT=+CXIb815OI8qnhwnW6A<M`S674zu-Latvd7
zPx-+zXQNB4qUA#OmL+QTK;L*26sTSM{^JWpR!jFfTEB_}P@j!#pjv;bkqYd6s<CH>
z<86zcIo6n%(l_3`amQ$wJBM-b<d)M!Xvf*#s1<^1__d-|9{Y=azKhDe+OpX%YIi7}
zNf&$|x$|>_UZ2LD;b)U4OEvio3%ZM?8Fx|L34U%$9|rz<LVHtq6MvkPBx_({an`}~
zy_v0bC=UEIPh@|LXV51~p-#S%$<^y(6T(2}oBXQz$n9E}qQ7?uDh~krv2Ev8Up3-A
zj$in24Nnh%wW$ZIB6-f*_UO_iA-|qjX&0X7G((Zb3RSRmSBqG}cie1?ouxfkFiQLk
zJg?>);}M<&dx1>~qdHY12B}!KOek){VKg?0?W0FV;0?YjgX5UZ${W_*1*Q{ejJM<0
z_vxEjZEKBHIasrPeZCo344XlV#NoTxnlb_%i8B_qevDMnKBV+|A-1MtN4HAp?oXm+
zdbyB<C|My>O({?O(O0*<`}{_VP?oM@q3=oXub5lTcQSAc%uzhb|B5ISed(3hiHxm?
zSV=XrPI(qD+LU8aPN}AUw7<m=`<U5(RBTPjUBzn>l`NkKJrz?fmRs^7`TiSTS+vp7
zn}LxUQGfBaGbnQ;ggf|-_>U1Vo<aoWCC~NF!^%9F0YV0etfN2@f|~b8R|u)_7~yrl
zdQVlV<z4{|djpa)Ccy)`L_-~Gn7h=fMV*}h>dt)u4KOlu=nz(`kXvIFV%W0pl5kg?
z(B66ERDx~EY&kDWrBtifQWv!}kRJ}M)@tjJnCHZ%Y%rX!nPFYL+!Vz<K%v*D2>r5x
zPv(BbB%<?Bl(q$9hi<~)mu*XH<5iJ0b;``7Sbl13=I$>~J`|845P*todzJgm2~@G<
zWtqH;n=;=mgLnuxJOxCV=;(yVaupopVSA*B4f#Aav$cg0Krc5^rzy*)CH<5a^r}8$
zlpWa!JT@FGX!H0kCS;55a2R0$`Y?}Mcc{y4DT2hbTU3jjUHR#k^$!Ql^NfbVe%Xra
zE0)-NPyE=hbQq^YQ+-j-2&zrb(8{Bo5n~XrIw9d7E_XhWTGp)hXlwu8x_Q<`g#-fD
zZR>aJr)`hlFZI>iH^IuhDW*Kz4jp3}{ysceKYSSo{X|ge(UdToiMyOCKfW5ANZ^ns
zV{d2uTgj~wy{Y(Mj-iX<Vd+0`3afRSLVU+@7c_8r%yE03bQS7a?>?Xq1l$ue;@Dc~
z@ilLGo}=@r?d=9V3PHNeCpCA`3_-w=^1^Hi2ib^o06x7fQ)u>jWSL~R?XZOZW|6S{
zxYevq@3$|Ztj226(Q4*Wovo0+;IU@lPwvdN#mz35J^p0=QNgzQ>4OCRSVDoILcCWc
z`+lB&Zgl;@-!yw&)4sBlyWxoG-2d*L?OLVmo{(0@jP4jQWx-+=9UN0Y$v%V&bnP$i
z6r-K?leg4H_q>e~Bgf0qY-ri5zs6adAeb(L>eYKpb!K*ELT{6$V+)IB%z^rS2VFCQ
z^nQ@WE{WbVmFQu6%s^BQk3ddPJ>UwjznTaqyi8RXt2afCs<j8CoD7I}9u|Y*uLW^1
zAzfPl(UmFODFrjyVn}Ew)Pm@xT{*QMV6MTA7O&lIyNF7JbVI-9`_}*0F*x1(6~VkB
zm*06%1!?HZko$SCk#4mepNLk`RFl?i{XI*B_NZAzt8LdwmOt2KNq3!-x`a|qO8BV>
zir=L@KjrIrQ}bpm>@!u>)mYktVa)n9=-0_4!d^{^V&+Ynn8K#45m*uSdZ^uki-7Lu
zFF9a^$6Xu;YHircw`F??47TOnET^OWP^51%|Kuqi!iTMVO;Y3xO)hgWsR`<xpElK`
zg^|jKZk%eL)Rle$N`lK+1912RuHs-N5&C_SeSdmh=f&QH8gCq&@3M!_p%SFoc8W)%
zw^nHf#L^p;RyyU!+lx3$#o2>T#p=_9H1MtYBR3)n&hxm(c9Rfxe7O6#5_{$_r^mx@
z9kb7NDAUgP*Ikq7u2<b@d1mhcb(bDgu{0t@{mfRa%CUfXN2KbuteTJXLU%fq-J^h*
zr%#2=#-6<jb7Ucv?Ql$Fr0K?{lwHLW?U}=0rDW$2k(yNK*PK}xPqCqOcV)`UF591{
z8riw!9#~|(g3s#3|9oBy!x$0dZCYpyqRG}Mr$EDc2;ur&$w3<4KQIa@qyM|Let&t%
z8_GGpfi*7G74XhZyMv?rZ7jD5sFbZ13*F88-l&OelSs7=ovcV~a_GfB3WZzsF87;E
znM9w=17D3FSl6eixqIcmN|oAeVLt1|og$go`88;@B2E7I8joEC&Epo(KptmLK*a$x
z&rW4BXkboIeA42kmbg73dLp747GgSu8hdfw=&Tq1rkKMRHQhzRM3WyzR0`2RmnNL<
zht#VL8V!Jss8y1%pIm|R&H51nWFj;I{nnpB!S(@F9@33Y&SURxd#FS%=892&VGI=A
zTI}nn3xrFxmpSt&2(4z$fmoZ&vDr6XB?$^hO@Ty-C@yxRu(0x$Vr8}o?6--YmaU?=
z0m6j0tN2=ETDdmFRTv9*lGNE3cij~&ulHdna46XMn$=DewkrR^7nvnH8?9Uwtb?bf
z{dkEPaYhG(@wEe;Gmu=H6tyF<H};`Fr(0-H_^uQ)P$X%Bn*IG8k7d!BzD#}RO(%-m
zd2pBaVw!O@`kcGmPptNVmj}8&0j<T2{ijFk|8K4S`|fv$HYP;t@849>n>C`D9r@WL
ziUg1eYWM@9LT0dHk!R`}_Q<*Hljg_bO<pBwDF`U6AQut6m4#m*<`;Xh)9&MvPnhZU
z{V#7+92Z4NR;5e)-2^>m)-iW=BF#G5Mwb1lKH&Rs>cid)D&Vbj|K@rT{o_-PIT9J@
zL+Dex(+!T}d*5dWsIds5F)(p5gzQ%fNt$$Lm<clL2-%64p|+V;Dhv^h@oVwl@KY8h
z8|#HOU}($B_Wt0O7jy0LnpMP3+G4J(xe^JJYS8p7E)bg>)*84J4dZa#dnUmVn%g7J
z&H#H{S)&>nz>F0xAbJ_(_6l>`cIa&p`J|d=59^47Ca4sj87Z4GF^6?mVvt1|q*`;`
zp&fnZ!QF8iI{>bP)2bHPTzWp(W5cd#iylYvRm>{N{TAu>*kQ3j?3zG>sfNJrqZMj_
zTwV3j!B5aV=ZJew7Z~yO!j(It2l$IVn`Y(Ov!xOf<Yt4SdW^bPT+k5A;)jH0!~f$P
z{U0mlf(2;22nKHO6qyT1K6b&N2=jDuVw7kj>JWxx%Tv;nPkw~bBC7#)a&@*#NNW@!
zVuvgr^56OYG4>|lRBzGycu7U6IEcz{aE^J5WQZj5EL6raWQYt0T|<P(6d5v4Wr}2$
zoRmt&%oHl6%<~)~^<P_cuY14W-}m`HPtSes<#vYs*?aA^-u1rkQcn_nG!QT(Hw4Af
z*=6Xyj*35XUmQt5rt>4mf_a9dMVtoYnpkS}J+x-=*<?G2020JDMHp6eLE}*vx(@OD
zSEqWZda~jiN5LRQcw>F#whhl{0#u9j#wVRe)YweMxdnW0cXQI^81WGE8uaR0uf%<p
zGmC|eQ&)%zp%QEJqcKR)`^Pt{uE!1+5J93MYjUdad~vnz1KmcC7F*_53ul(%YEEge
z1iDR9tdp<dd@2EP4ehg{IW>AUdExK{D`OI=kL#O!7Yah<gjnIC<bja=WhwV?MN!ys
z*b2%O7KP43!}P9gQY+{Cy7b^tJ^xxY&MVNv_k6yO%6b#{g?_JkZ3wMSYI<uyYi=nn
zy%KuLW>q5x<+D#<>146#a1i8N2IEElzW@In+cuvGb|+8`&iK^fiJCG42WzWv1XA+S
zJyLQ#uUrbFC<jnO(uHEr;nB}G4wSQc54nBem-luL*0B=`e{ai#_!2~bmFnNg$Q~A_
z@B#{m{Dre+kxb2=<kv(}2Vzls^X_oXQ~tnxUq3)QsA95D$jZCpbYcc6Y3rE}c-7b%
z`FwYIa^-ebD&3L-5%0ZUF)lgz(>`}-?ox<$r*eg~6uNvG;vVKFccKXxx*FcGMq|oh
z8uq@yxR!^#X~v<i!YSLlK3wSII&l?OjXifn-p;)?r_oon<C-@sGr1+qN!ZcEqIB%q
zs^g>=%e$x}5p`6;DVHiwejPmcc+B>RW}n|7`E4p9cN$F9(m9FL;;KUD%4M2<JelDk
z_E*e?TJ&6cd;vP8w8mrCp5LKYo^>!)F_RRJ6NfJ6kCNn2oRL~~jm)TuALn1=6e{r}
zF63RGl%|wU-PM9E&h;k6>f`@$5VV5e3DFD`&^08|hOtrk4Orm}l(Uz%Jp)ooHe~eV
z0W5^-NNW~amdKg|Ql86YOb<P4=d7zj9&$s$-~8q)b^~0$gTB27BBo_9_6I<6Z1<zU
zk$}~ah~ekruP-otbyI&J5TNtg5DDklJ}2A*BDCZ4l~M%preXf7-rBOr++dj9ya$fo
zWg7A!J&az(M@<owu}NhUyEmY@UFX^XJhP@tb*eawC10y(h=B86L+GUIXz}YE)wcCs
zjO{gp_SxfBz4IL0U-3?%arA28CkZ#n2_@Rh5)Q}_^`cR7`WM{7TvFPHMwQVKuvim_
z71T5uD$iQiSV?!z_n3`vj{H!pVtYn6Z6FsXvA``*Uj!J#C(P>#ReLo|xo(aY3tzp)
z0AFpcve^A~rB+Q-)ma_Pr(VBP^ZJ0?M(#+6)2c7LFirflV+c2L&TQ0$y-9E7VbPTJ
zZr=gS-1^FmI5rcf`T>uB5P^TH%U@Io^q^E?;ku6+ZX45QErKr1q-iPXNe|9~RfhYX
zVN9<lPn9LB;tob!JUxHTROLX&Cyz-!wtCFQ^i+@6vpx@?n%SYsz3CK(+6JI6#|T2&
zHuji0<a%PtuimR*4*>f~;I!nW^fvI-sD5VaIVh!cu2EdB8vE$4IHsw`(3@#U;ov-|
zO2Qj_X)0d0b{1tB&aEZ|vd9S3Xef#lfNr;7sj{=diG6;#P{{&!jt~;r6r3LWKrip$
zsG50bhTq!jkxA5=bL}oM#`u1O&T-YN2@{5+7D4!s`S%>;=Wq|Z^vOJ1@$^waO?+k1
z7Zs{`9(zVFYkOV=r4Kvah#;U+a2Au)%P%YO^RPgqbJR>Fc%avETDBU8q?ODnt{;2$
zT*++^pzn2GITsDZdxGnHJ%xx1_t50odjiE7nuNLttx>q%i<5o7%kA*q&`gndTV~T$
z=iv@T(+E)iN`H8MU!CtZVaEgoe*lc&(lY@G^WN`uQ9*47g(fyepnbQm)b70uRsiu>
znyy>|c;H`|*}ra-?V5<YVr{XqgN*#(u6>7V%4P1>kd3`08nFW_mo8&>^EkKsT*FKA
z59L?7nTd5uy6N<M<iTgS#B;{JtOFDBQp@p_N5I0Hyyx+*<Wj!<Gg4^+6OgJ@nu~xA
zo1_n9PxyIgt`V^Wf32gxu^VNBV;sRdFu{9pE*_AB^U~1}nHn3Fs!9K9)RW)OXM+Dh
z2#5>~=|7LTu0*;F&HY4BZE|T8A-ib(aBhNql=VwZxhUR1m&bHHi0AV0<Jxl1-BUYM
z%evrMYMD=biV%JSbfB;r6R1ItB22}lioEOIS`x6BAe9QCjbkr1xUZ4IyPNfS=b_kb
z@;wM_H@z8bV;-6yQZ6U_<RtBvgW14|Iigyz0=_%?=^`z9{6@=OV^gR3<p(CuVJI4%
znwyR(_1N`SeQdZg1suup?un~f3n9DqmhSrNh)}zykD(UcsjvOsX+EdB%SPHB&?@l3
z<DwRC`>Qur?N+^u>xiBi4x#1AK5AUxD`P@A7c$>f?KsyMI(>=ie>}4Hd$Z*KZ~?ZQ
zKVNzhJo{Y74iE82E$CF)Q_TSz=Fojue&R9hw0BB&ELK<|7qs`J)e}X8LViTesgQ6J
zmz0WzhSi3q>aS9C90L#e1k`Zo<bR?<XDvRwH2vYY@kjUNj2!V`UK)Mc?yuY#)GEc!
zDM!d{hF&0Uk%3>loVFR^aH6nP_<bx#Ef?-7uiA;!s=^7io?M;!302wC#KX@(E9I*p
zAGG~wvF#GMZ!^k3-Rod_=>HK%8fX!d?-ED+Zc>83Puz=9==neDQ%*laa7386JG4={
zq%YO(<I>>e#>a7!N1|e-X+vqhPKfClk!S|HrN<lnCPFHNvs@Rdo@zf$HafSY7^wv!
zu83jo!O_Fiw9>&NTphQQBcKb(29#J#>E*ZFO;?m6jifx58rI&|L-sDNm{o~$(=nn0
zdZFBHh-zcghlKwiGXHDpoQa_82SXM6N!(RlKyccwR6->|^ih;$sQsCl2mbin!ftaH
zc{+J7s7Bgiimk)6uMRc#Y(&~d7-iz#s3<mm9R;N25$iH1Kt3KQ_i$*77y3VQ4@7f3
zgRgg`Y|tU0=FkvflyC{sPf0de1|FqS!THu>rvvvN1B_qI=Es%yWkS%V7KMIf$-r$+
zvO6N%<wy20OZQ=t+==`<TMFT-%f9lVT%p>BT0@M*ifzQN`{g7+ip4YLwR6PKUMY>;
z|L14<iFkm|r7p=+avt1XOU32apbrVYFqD~`XP@J}iU)E5;L{ei>BKF9oZL-AkzYO}
z<#I=Q0Di2aANam}<GI4pq$kMjK0SR$=+}1XfkyKL&vHis^gN3r&#Q+^0q!jz2V8~|
zh#Rdb8g*$)1LA-Ft1KmC(j*vltd<fff@v@J6fl+!#qgX<1O#AY-!AbZDArzs%Eb~D
zB0*M=eBFK;#N{9w6p$%*5fD8~NDeIr>%y{K#yh*EfGVk1+NB|x^@f=afFvSfh0V-T
z2?Hza-=TEt&dv;#=_Zw)*-c5Ha)V5v^U?k_==^ueU4)t@+lWXIdImLJZAfnVpO<K0
zM0?*#ARc&<|L;SB?Sk!Lktn!v8tX@j9(H_18X$6sz;BRu#H$_pRfYeD$o&5Z|2|3E
zNSIqmfi$c5861IyVF=5mjvm8bT;0=3e=xI%)9{u<?UsVHTfg6bmtE#2x@`n6J_(ee
zA$J*Tf2o!%CS*AQvpxh77&buo%~JXQe=U(+6QhTa#^4L+ND_cBwbwQPfER_VXj|Ez
z6>6YDi)O(V551><FPg=j7l>@V2((c{a|%GTzJt5#>oGT%{eK4_LJS;kqmPG1cKTv)
zg64G;8Rz+}HbBuN2ot?BxPYHQ-g@iaZ@MRC5q!Tdy#M2CS79P@6kQSPnpgX$K;(hn
zoMc$?J(2M8vVE@I6u#R?;KhW%!}In#pn|G30fjHIHAV8(f#0BsF9Y^7ey+no2oXrd
z1tqip{aw(TDy91POWAAR{ut^#;u_-VxS*UWaPH5ip}az(PrX|4Q@id}pfxjtW!g{D
zwUt>1@B;~t510t#c+Om~kVeMyd&3SL1o(#dh`D*!e=(agY@bKF%~!HVmCNJnzkMt7
zez`MJ^KvusAEw@)#yVw#n6E?Cw>t1E*e*1P`ePK#(kBMZh5tSiFJae^3a10>|KDF`
zh}{=zvVPz=@_iAw^WKc<(Oc%ffOp#R?9$mkx1L2fEM0SPmKB*?>Wd=NSIywYs9zQ?
z?mQ9?Fq)#bG4Jm<9B}~c%~$*-yZ*B`1LU#Vq;X10Zw9{)Dk@${b}|l-k9@mH_JB{Q
zl>>i`W}>y!p3EO72!W~e;!Bb)Ksm$|(;|%9DgHbb3Wy}p+jCsV|K2mdzl;qBCC$sM
z?bG!9R-|#Btjo4!o}E?lRg9Ns?N)G392h>&sYB$yUgfj=Hs1rIBwLr73c=ETNp*Mr
z?5*Dil!nr`$X+TqqPLn^PAWl4tURKgYap^nF-=NLb=|j}qtx@%?5`#Am4h$fD7|%Q
zDNn&sV6CwE69fLNx=bo{Q2=ZCWT+=m&T;tDcE`Q$cG`Lnb;G(|){8q^9WnAEIA|tN
zZ9b<V0bdODD%m(NKo;V7dA2a1ePni_1CBL%^53h_x1F?F&M{Zmuy3$Oh|1_*<o##&
zdaNi8*Bo3>A|2v*U;UP2`opQf-_P~}d`dBf-`|3S6&<@pV3;NWWSRs>0$uxSeQweP
zbnSlspmUb=-wTgbA@yUez<r;i4(OefdlP*`vcPM>dD5x1{p#fW`*Z%%#NxQ-UrW7^
zgyOG>eu~)Y(8?Dx*Ufi~FeJsz*Z>K<BNLIQ5nXYp8<(&TLLlO*w&z%r{re3;j>Aeh
z)J?7M#=No`q6nvYX?l_ih77a*c_Lcl{nGE#u7vjMkXy&$Ur(7*p9HA61*qv^HuC)H
zv;pA!!woQ_6ZCVih+IP-t)_o3$DeImqA25c+omg9pUSY7%lLNAeKFEfkWtMB4-ow<
z{qr;_SxkP@VsMIZ{Scm7@B8&hgLr+R*_DX!faVL#tq9+bG^VBwlwpB{OQ)f-?fri?
znMJA(<2xFf(im%Y%-1rzcb&Z*8VPB~*ev{H=+7~SP5yP(u7|l*mi=m=^yc5Z8v)ll
zSLR2{{LPQ7RYWT4@X79VEZ>pGKcGx`mkjekb~XhBoW{O}Jt^nD5-<7hBk$XXgTq2=
z*<Idsz+TGKTE0*{qL;&3zG97(YQXi^0{EPSkd(at*MG=trv%>!U-5w;wt=$}rq!ff
zaKn*c=YFd86`KbpNO_hcF+dB<inG}dvljjICHJ<Rgl)CppsD|B8~=%m+kB%~v8I|%
ztQC}B@4RV^>8l{~QVEx0+w3#Qe1Tx5Q{mb6`q%gLAw%rGk%y(NA%^qNSX2YL!QPuS
zzv(iU@>jsUIZz*3I=R4=eEBdfV1#aa;eiWZ4ISZrH}U4X^Y-tz{k@4~6v$|f`!$6d
zSLbrDeW1?Opjwx^Tp7>1xju48!b9f3q_g>4eV3?LfqHSvEDupp`JarjZmIzKumAkm
zoOZGwL}qos)TvuDfb1k+R67?nB#f?ae0Az#6f*NCf+Q@x<d#`mDTo^^VO)az#?s8k
z4-|)vX7+A$ZYNdogZGNM+Zz9$U6-zoHKsD+Gb|UutZGp@Vj|iVFNM}eWu3luuB68L
zz~<KoP{7)fF#CRlSzl(W-0U}yP9>s{6zG4JDly!bHtrDuUo5trbQm(ous$N*19-#B
z#6F-ox2(<&s`nHahqCho(!E8ptb=!ZvegpI5cOX&R8^2Ve=@-;HQ4rEe-U^11_5_)
zHa?S;2y85qr5Gqg>4TqelF6Yvlyl~=^M8Vl)=|=+&)kPVJ#pE|$cu7~W}(sa*O7_{
zLP^bL!g_!N<S|E)rfRqkQ71N=pKG&^Y<m&`eoI0<u(9Vo`G!_#<dr5>&=Y{XZnVH>
zvcY9?8b1+v>P6O|tk4YV9tSvLu~Zs>cpgV~NAK4Pv@>oF@5&I4njAh@-n;k=Y+72t
zRpicOm*$7>eGIraozK)y{fD3X_wMkWqT(ztyuG1nD(85~p>4qxS0nxHN$JDAo6(~L
zHN(n-g_&TPbx6zY-kDYX{NcF>^`*h}SbpQnME9rQfo79t1pr1fa0#83=2@UNH*BI(
z#0N4GCo@YCsT>q&)h<_UzJ3MAnBk|4^GHkpgDxD}9<5BgZ36j~$g~mR(JM%Z4OX6(
zI?^=1{b*)#3J&l;XY%ixJU~%KHUwf*)>QAUocnXFYNF@t_IinAGmg?;y=LT99hJ`b
z>+%qWL;US&<->g>ltFA*sy5iQ&9@E-KV-(j?C}UamA!QQBEUklK3jk5^+hnLjT-)(
z@mkKV6a2?46~Y-3t}+N)wTb{xt?v1Je9EykS-eWyY{%~XWk-wmW-Y_$fK4^oFzoTS
zgz<DN<uN$)>E-6lbXzZ1NEt`SXgK^-tH}*dL=q2m3X__V)6~CJlr4i|VC1`iNy+H1
zW8mw8YEt|3rQO@fhG@2vdmzYDwV7=`=mU}ML$nuAXPUu2DPr1`O(<38a|SsN`xWR$
zD1GRoJ?#oiBD%bjwc;@X7d`=Fvn?4698v5zisVLT)819MEeth|fQ#KC!^D?BW>Ekk
zo9jdfW)uRxZGA*%^$dluhdBixv5E(6`pdP9RjHJhKr`rCtGxX7L2P^;w57z053+AZ
zaq+)*Qh+*^k;*6tw`ZMV`1OPiExL?&BpXg_rUO_*c#0OGB7?Bs^w|&=Ck!D}*sO+Z
z5xSz6uJj4+0OOX1H~EIg)w=v_onLd^uaV1Ww8@25kImp5Fj*2n<;P$5C>QX5O8^Io
z)8C2O^->;>Ww;KbZrz|&ldmNC^aN1jN76(27c3CxI8h@-A}h?f;4fTpc#nzd)8VhK
zD<075xQL|FMi<x@c_`so?Xk$vXaD!*uzmM-)(Wxx-_*9<)0Cd&xBaDubwPR8{*Wc$
zu?$i83Ypi%LxefWwirn!OA-ltHW8UxP67Yk5<+kJkFlQ!<Ld{j|9*SaqOgfnQn)K2
zG2MReMuM1=S;skC`<aJlB~-Q5V8U5ghd0na?_0NlP*<`P$mz8d_?LkU#tZj`I-TF@
zopa~Ew6-)p0*=ZpftTv?Pp$NUhN}e_WS6SGFQ^K8F56|S1P$nd4xr_!`(jAf&K_Cp
z6<G4jeR#${CDZTRtHGUQ?U2^M?hWN~8mr91BGy_Zah*&gc!h1nn!)$1E^c0T4W=ae
zw5ts}Uw*r-DdmRv{$!caKIgt`X#GTd_?RBUEEQUvayxz1H_){CtHy}EIMAO2v!Ip0
z(w?PCxA=Xrjn35@1p0?-FJH>Qt%<|bWwV<SFgU6OC_^Pe*Xru&0lyso@VIbP3@lrU
zYc#%T5p3W~#SM36id8{NEz?W?n+t#E{=ogE8h~G99JuoBh3KT=OP~K-7e{69iow3~
zI9h^tF_0xaL$yxXuP2*STeSKAMQFn#J{l6&3Wd5Irnme`(mN1X><HB5jqWoucx-<~
zd;x*cGl&1W)do0#oe5lDFRE-JBz(bwvi%m5)RUYWqF3I>gW6B>>;CA91$gGifr}7C
z96(q-wFkcAlEyK;%r`H5cSwz9DoK)mNH;N|vKv1iUE?Kj1X}Y!;tLIIe1U>qVSY0F
zXbbJFe8I3aM@DO#=F`*WX{r!;_vOK3{x+JxI}Ve1Aj>=xnlml37_b@iP%<HQ1OkZ(
zmE&ILsoH5E9gYAS^Rf4fe2^x5PW-4L0~1NY-Jy@Pqp<1gE&@lCwdLzOTnfcV=~-g-
zibgW~B(kKmfeVmRVccXA_Nwj8{fq{0@<&QQd*O0m@LfNIMl}fHW+t8!x%ZDAT#1d5
z7vDAB|NQy=^SL1Rw1oKgE>%OJ&hvNMS8uASR&JJBk=d)P<9ZJOoxNU`4|d46dI8<g
z5=9&6tcU{9^u{|7%DD>g5HzkozWnwz@IjlWAP^-a?%IgOLOj8frTtyi{;oVS6J(s&
zDjc2F7jL#3w^A$VO=Wn>r~Ukm9Q$&jvai<IycJL(f$dUb(qPYNW-{B65!i8{0f3eL
z>aBb<6Q7<{9+JN1HZ~dBH3ylY5KMfIrWL@=ht()!7{Ec*q;|ZU*W_t0hS#pQh^2@*
z*ccIhKpgq%Mfv49C<>1RTOi>%lz3Jx#{3)@dL>R4Z@?{;dwYO`i=pPpn8Yi+R!~nC
z>maxix?v$K5|2Bic6>nxpZsW9x7r@Tz5y;zZJ5y%M+6U#qfhE|CHns>d`r>)xEu3<
z0p{<)@T&qL!T_NgoD@u<0WqL^v$XX->wwFrrWl^MGtAy}AQ8DR0ve988JqkJkob3R
zJpAhK#IQJdZgixET2n;g{we&~5F1IT?QbM5c>FKA!p-2#Pf{E9`SQCGyzEX5TGCrW
zjUiq^vje@GDS?bG)C9p3twIhUp^tm<{o7&-ED5jE&Gbi*3ntR=1@q$`I>pLSVoqB;
zpb=O?;Wm;@F%k9no$1L*pu<by+^Niwfb=0;PA@>H%;$95|9ImYm<w!yg0M;)|FmZU
zDSb%K8YIrtSM$Ab1M}1+$XX!Q@s&Pk+m$Sza6R?{*=7rvB_4Ku)~6LEC27imJ=Fns
z^Vu?H3|QV300v`Psq`R4&fbLNn0z!Nh!#`5HX!~ejGmzY`R?t^I@L$T%ZB_K&pcQ5
zPsvh1=5*mrt7;!zZ{yFya6fXK{SHyMRVf5|K@@Lxlf$6+2YCLygS?h)vxo|UgrDxv
zM~O--DR2i>BCiuw3^Rn*p?c|(J%1{t&GUSBjx6)=O9Z<Z7CBCGJr#Zx@`_67>{x~g
z)w;jAD$lRTx)07V!JIiBFbDv#yf?l=FYc8-$aj*oo3)>)f@RAo5Qy>IDGisZB4tU`
zd~E+w4Y~su4b@f_!VoAm-Br5c1jm4szes~U0fQt*0D#*6u~3c=bAh;1cLFrGOhsa1
zwvc{VLb9;`<Mjvoo=NLQ@@SV7NJvN`!5VFn?tKkLB>5Mk%iJuNKyKuu6BBydpWW1-
zi4cH9U}0vtu=n=z*k3`{8L2EFw#CVI$)mE+%u-K|;Uu}z{ZJo4wDML9NFa+kRE6}M
zRDVt1>+!BE{02g&X5dWQ1NVg5;|O?OC{HpKH8IMwXCuB@dL^i)EkI~+vABmx%=Sxl
zfP$@>;<)rAAnKs%mOb;4YnPT4htd!})gan)pXu)C3dLKGp6f25ZTW+#gyi2V??{$8
z(S0N5zGEvHkGg=bMXfZMS7%e&r(em(AHpIaEmZ79kjhl7rlDE|iO%!$&}k)tHN`bX
z&uWV9G8-K>j%c}X2Ew}gVA%0$m$;&^wFu7$71kRduQ#yAHg<o0d4q4ee<hRq5LKS0
z71zy%HYo*GntbB{ieP<E`&~kE&O4;1fZ!viPPnLCjx%`zsdy-VG^OgmkxC>XXcyI&
zeF)8iBxs3z+mDX3p|&b!$G5jIz1MD<D0hPUQQYY}r9-VaMSP(Il$_1bpE5Iv$_qD3
zjp8B?qe;Qf-;%?7M>^Q{6g*0{``(v_3Cnf<eCxVSSr1%Qw&G`xS+`>~j53d*#!0qn
za3+#cSi~Oexe6eQf|X%<!N482@e^w?J>PW>X{OgA(^wlSLvC{?^$~=eGZ=dOr@D#Q
zPlBNzTf`b>QkQFz@3SjD)oW`FNva!lyH`%<DChEZ4zqyv!02D-2vH52N5Z^vI>=7G
zaA;wd;Bhj!7#{|?CBYvMw?sg_cnmbcB@A*nk$0f8zct{slxU>&I!?d=YIxg6kr@gF
zap~DegD0$B>&%tS!-($)@ux+zaU`CU)+m8OuN;wNjjpe)f%IieJiO)r#vAUrmXP)L
z6;2tn-=Z7bhae$)l->qR<;%qS%rlr2V5f>`)UY0p4ds8rncV~(IQ;^i)~Jdxm_a*N
z#(I{t11fA;@j9{f6W}G;453_i37<a;%_R*Fcf!R;`euE?*Wy|7U?o4ZPF&6So80Kp
zUP4Etr)qpuyyf!LyMue>^-<4GC=Q8h6eh36%^0ST|JCSa18q{%o8|AD<Aq)k!b_SQ
z`dyzZCqDxFlXfr3BkYoe+8GM^vVvCQM!^1>=L#zfb-C(tRmb>-=jS8#6)%;11`m3S
zPffw5O<jEBS=e`Lh#_mmu-47yC*;EV7yc4_c0hSdUOo;A)3F&w;Voh%xJv<b0<a;d
z|MhI?%L!<S7f9oq(mH<O3#$;xXwq1uWbS?@139;3HYoVGq?%)2SV22C8&*iXX&O?V
z$H^y?T_Ktn%!`k6$`3+7!WZRToqn(XJU3(a8lvE_Mxji!ZL;Poz@kFPy`y;&;Z3xK
z!~DT9ID>@;ElI}>3wQ`Cy$Efl4tAtDiI4QZ(D-N=NGT#<81q=}s7d<n{8xNR9w1m#
z2c@{#lbEC^9XnPhT|hnfSfVccZJcr~W#49>;Zb&H;MyXA#h}<M0NT4D_!9Ec*^uZ?
zlS<p)m5%CVV?uCL`zSCU!kz0QWoD5?dlEw0r%*}7%edV<kKMRBEO1Ot*Rfxr-&NaZ
z_+GPxuY*`M@gVP^F!o$3>;jP)wTUU{9M0Y&Ds3A%s1?R2DO%?ES}MY@Vdyv&xg)Yg
zSp*06t}&))o#>utHWvz=@2<CuUpz(_Qerz5U5U%=TweeqtAi_vq)tyR&}5B<k5__e
zz(D#3*&CUr8YnHK{_q0#pS18iX$bYpw;+)6FKENUaqQH@9x6UiAU!^+dHYv(dl)h&
zNXxj%Xn41a>$wrVaXaJ=Jb{Ce(u~)?k~ang-Qn6>T=n$C%b6KqI&lAm{ddyc1Y>Q|
zh_sj-riBlz%pdd{`8u^-BA2xbq=Hq7l`*1?xr+Qe4}QYr@)5X8=AJ%;;`b~?beE|d
z$$oORK2~N!w}B}oCqvZU8#i>)MV-vA$H;Hu9GxVlS6i;n^bwsr$9k3_5S21qO^$i+
z`IYJk*7=au@NT+t#A@NW+RXn^fMDq~o3-(z&#7-<-?QuuF(?sE%<7-?y-n(d(I%?e
z3t|TS^8w?io38wDOi4q++e|39-QvRZ5H{9C+n`a5g!_qFv1{vL>1!3^pPqU4X4RH;
zZM5f|EA;OH1FhojAC(_-T(Y5%HEn8XvUPA30Mf!EPKBnXU}X}HV2thC2QiNO3C7Vo
zqUFw%-&ILxe!eb_r#HIe3Kw)qc=GoDK$JC<O2+^;o2t$a+_BR)iX%eJpKjhA|NKHT
zg8gOgA!Kum!{_U1tTKs**tJ^Pb2xXsYl{|44RoGGILU~nAS2OaSLp8<hKr8p$dNGD
zUTJKsJ9K5}ShmqwnzuvYo)U1Ghm91TXRp18MwL3naqzs++D%~4HaZI!Lr9HuYxuBe
zIwv-4lE3hLZi28?BHSf)B40zO*CGAlJryV?hM`oxgUKFhPtVB{9IjPn6(CaL4FMjA
zCxVxpIe!<E8<Bo|l3)&Zk7l1B!6>pT+SsIe|1);e+eXGX(ny`q?(~hg4=n(d<u|Zz
z%oFx^KATF-TYIxxZP_9D-JmT)aDH`AmrGa;4l?X;>U^<)b4aHNb<Q$H!Q3A%KoEyn
zu!+a=)cCi=zE`*gr?W?AIZQ^IZ_CGTFpJv7qv5+=NfXD!nZ;Aw`od@j*}tTAPxD(@
zB-J?2czXZ#@iEOdc%xt?FBF1da3}u2y*(7BddElFhfv=%&NQ-P0Q2$Su7Ah5Jw86t
zk{xXp-p~C<1GZ^vZfBr~rFSQ(Sj)V=w^p~B^H;;D^YmT0ozfmMElD08qERN;2tJa<
z(YD77`XM^e?CM7${k!b+q+#q4Cgx4yg_D*P8|?+V=f*jSferFC#+YjjzKX1Af?2%<
z&>N_intS~1^|#aM^j70P@s3Ja-&xaXtg_?N;mCSi<tbJ~Qh*Vz-5laUz$N6$e#RPV
zp*Y10(i9_P<d6vLeEhL1;_n-tvF8kNJA=_&t;h$sy=FZyU+-HN3XaNo<}=$N>=j~%
zs55w$*SY|ie<cyF%<i@KsQkzE5#p8?AkPvdIL_w*9`?;UErwHRw8c22hOk#XDr~g#
z`^Rd0zB}|6<XKt7ml}w5#z0)jjnsNw*hl5jm}*n#{LuQ5Za}C+9Hz=$uVmmz3S$b3
z)%eEQ@N<}V#95)~4Hy}H5fXShmu7b<P1JDd3cIAgwG3B^+;%`)hm>xiL>Hmj{{E`M
zWJ6frA~WWJ%J0Y9egtSIMM@WzKR-9W_p-R#SONxm8{}}zSc}!wFeF5InRGwYF9wSO
z8gZ9~FQ>It2!`TdX_Oz&UQ5AeRMD;^SV|aHQLRYdy#`2WYyM86B^#A2P_WIa9*{BR
zi!-_cCpNJoL$Tf^Yxh^oGe{MB={)7Lx%R6>njJoEF0RVDEBMeEE`d`wNBBD87&C@@
z3Tu=wvQvP!WCO;O4PpT&_2JgRa32py$HMxepPCVI^vJfP8Q|_K8q(i}Dt@GTOmI2F
z-L#uukII68Em1W)obo|eh05}P^y<ZG)sz9IV@7Zh4c83v-oI-(_xYv5&skpa6Uxca
z4PNqol;m%prdLi?4KWOcb95bw%m^zjV<*2}eP^H#H%5r7JW%_&k*iTZ<aTs>)1ju}
zW7(}yt#?{402_e%$zVlc8nA@KLEB(f_0aiFLjXs|lVLj?Rbr*fb7UD|8bSLgk~*@+
z&b}Tm@6L6!O)9)KOiU{P`IUT`k>c5UQhbtp7{7$(S$VW6a8nt{t!DmpU~C@=ULBi2
zCh%<cWZZe7<EuZ7oh7S7LtCO}&*8mF4lx6G$_;mo_DAs%=<!(G@?navlve`W;g}{U
zDh%qDFsn*N*vL~Q!^0CDAOKo~q9hT-XZ?r_EM%;bNrjV{PdW{k`omR0t4167aGvoT
zi@tj_SMVm#;w-;eH58@V43tu{?0ceY3b)9T6OGQUEl(T1DF8mGCGbLhd)$wSK?2x<
z#J`^TeWAF^PJ5Fly7P}v4-TJjz(^4@8gViUgs)EfufZcU?9d8UnSA4sGF-a=rk2OS
zO3R6p9i*G(Rwv#=7oY{)lpXR8kn2Oz$xQbJADLumM0dFzcbK>nG)&|1LvP<AM}jYr
zW)r{#&!pGBllH>uw-cXo^VH9z*DcBg8m&>$F~Cv^5Bc__#w<i4V-L!}sy}+3{imT1
zcRI&*`IYwv)UG%MoAkWWF<R1*<jGZV{Gt?j*rC7=iLoZ4`!wDS=seF`l2Yxz4}=@I
zQpgaa!|iMt^vCxA?Py0ZYb}jeOgPZi5z2`SK$QO~tl$&(7Rv*EsX{ufSCF~k?hl1L
z%gSaqbzk!sOu-<_qfnozXH{0%KJ5#7Dlr4t2!4hO+$%boOc<&|nEN7ib3^h)4vp@<
zY~ma@8J=F;H+=Pn(~paTKllQHGy3r<L~8N`9ukdpT3rIa2OKi9L_|Owft2UgH-q!0
zy<k&0KAxS~+ShSy#x+;{L5KCJNtddbV~Sp@mWGy|U$cbmD<_mPam7)_CM3+{L{Pt`
zG?#_E=bmX|R9pcv_<0R|0uLc}U|t&FiaT{*(!V1?@5$vzy-gtEH{gF)jMPs$^z#=R
zw<l=nx~Woy_*fz|NvneZvp5>NO|L)id&sQLH<xoo2Z6721i{TWj+(<uM6%x9y$D}9
zIKdfME-lRo!d)f%fe~(YL&9Y&LiZ=CMM5qUDl2>Kc`%C==CyY7nrC)w6wafb3r*Vy
zuXxzfZ2!ranSlawHkHT<S!c)4(?7m1YPOxf4=`msWRj9!zN5MiIhUo2t_fOwe&NfN
z-#A?}IE9NG&S5R43A+@RexD{SYgOn&R)aU6wYRq{SMVv+t&)E!-o)62@5um5xB9Y<
zYp9tbKl?2veHF@5rqS2tkI(fb3ahjLfeTaux1~h!%ccTr0B?ky7vqV{QJp;VCZUuj
zP7gF&ld2{pX}ad0*fo;rSERAaFgm1i-eVv5R~WbTUxiNOUk>_*wgTPNlw_EdoCB;{
ze#*PV)haxTt}By0PQ6%O<|D&GNp|FmUV|C4oOFOAN09<7=@tY(k_G#iV*SfAsc|Uc
z<H6{GpU-wKL}miqkS-=iInSZyfsspxyuyBonJCKBROW;y2!m^G`h!pE7zFk_ecKT)
zWZn$ZTH?;6wZg>dEPBD`olFh+nSf}es>6ss5n=a*q4=I57q<E%&(FZ5DzK?pEa|hQ
zafq=~zEuj<il+&CM~IwU<y1-N=p<p;sj}2MA0eEPFJ9>b&#a8K@#$p%EYF@iB5D`p
zuJ5Ik!zvW-Z_s*XDVg=<)WW)H9#}_Og7h}kHd)C0Chc?lshA5yA^;m*4SY@diKD>j
zr{{Q~d{qGR=qDI&4$56H(NMU`{NzI4s3+X03TFIofh+mYixAg65^km{QV3Vjk>H<w
zS?g|4=EQcL8Es7&bW{F*nwB4;XF%o3!!6d7+el6gF!@Ubw*fN`0PKoU@D30pb^JPi
zpQdIPN17nVEm3sI-fDdCqJD$Y@B4*tEMP{Vq(jyNc-D8t=86W{>)@+!h<s09R$*l=
z-Ha&V;-TaA$*A#qBIQh~`^X_Q<<KC`U3_Qbydp^mM1@^VGf<Vy2!$jNDqC>Et%zz4
z6MYE%szYjQf%MNk0JaTF&rGJJc|1IP7^eW%Q$iR&B_+Hf$42CwEz-WgjE$p6>kxUj
z;m~7XvU0OgMV*5|ap%w!sQHx2I;YT6HG_V8`2}Hifnl?&iIbBA9k?++J|uY2XOuo9
zE^1EEkG)n=e5cF|b24(K0`bdFvH38Gv$v9KG(%2yZD2bhYB&;YwT{{Kwl}9pK5p;=
z7ft)41Rp}2oHS@Cl53AsXSqubeiXSm0*(I@ZC{?wuvy)SU+enBuU~>2@kn7KZ-u~~
z=6DFRf8tYQXk6tGDNef^>H<?-O{cqK4-&)4T~7F{kVZsahxRjjOMiPE{(?WV$#Iw!
z?9}}IG&@qd^i;Jpi?u)*Z)!3&>kg~ZfP;&1W1jS^Y2T|j&O@6qf(de&&R3Bp@BmM+
ze&(2i{nsYNo-phg%sVx?zC3M9?EMZ}&@z8YjJ=XkiQ^Yp)os}0RF5S?y`~Yi$yOG@
z6H0m#por5OS!W12&<YAio@}GT1R<EvXM2ma5Vb~@D3A9!-XMal$BDwO9yBh^-^(M|
zLgYU16>E*|U&ZWg)aIaS6Wr@qr_llpdQ3=#>f*9tTxtG?4uy3N?Rt3fhaQg-nzTui
zS~F@+V^%6M-cMD16uOMO2vD+Pmd*7TYh@QBXF-@!{tKC_%5QiBpgH2RWB(kZ>;PA=
ztxGBbfV*=UT-r}Ljh-LiBezK&#$WDwI#e;lE8vu15=S`!BY1>3Qr*(=6gtSHJl0Kx
z)rxp_oYy19Ujs^+qnK|w$n0ah&qDHQ7A<9p{7twv8|-MQf3-S(VS6&~$yTN4<$DKj
zizSfpOBt1$yL6V@xB$LHBjPJaWfE|(pLR$#@RZ!C{-eC$fl3DVMj#srK-(|mFMfRD
znhoHZlhKk^|9I~i2FKGkDTFYKIBu!WE7Xe>h0%G)K$dAxjg^C0{9eW6rdA$?3<I{x
zso)oSH!4~ko|`&+*k;o}MoaPJij1N@FGOS+0k^TBNrRJpYQTX%{^PZ~?>V)&8XACY
z0^I?7y+wM&{(j8eM>vlS=yE^u7v&az0W&VOg<2j360FGyP95?yWFp2qLb2XAUTHq1
z7FmJXH3`^+bi|RI4WNe)_WAb>2tJBF66Md6`v%E~tH7S4ViITrm!bQj|0c^{*>s>C
zC*A2D<)bpAjM??AHYFfoR;ZqVn&V-If#>p+jWK^9Q*iQA*3zY~K$-#$;WA_{s&3EO
zolj2w(EgsvH`8%3#4_4BuF~Z;M@aeJS2sjE@Nnxs@vLv&TJ}cpZa*2hr<%zjD!1Y*
zVj8`>a!(4b5Z_EPQsP=psVvnxX|Mp*H=JvkH3d8uN(gd0%ii(Q!qZTc=}?1W<u4=i
zTCJVIgzB)1yjH2u4XEXm$Q8qPynto2mggW&0>MO-pym`OCxQ{nG$7F00B~(RA1M_s
zcQ~Z?a%X0+&6}La;e|mO8@FJ7mn&K#an6h;;XH(Q-{2P9!RT}5%a`P&WnC&24egj@
z0uA-S?588rUTZGEp7U$>607eZIx5D#E4%<R(MA9&3`*z*XR$$lrBlT=Gp^$&q@gFN
zu>Uj$&j@;dCsia40#yK^qbCpyoIwmmwM$#=i2TLzW-vSImCy{K-D~K*a<esuwc|Uf
zo`<o9`b>~O7|-^*X6+${As3?!Q3v-(*V`Z}HZ^c~9()p4Gz%8NyfZ~c%%~N~`ou}?
z$W~^6GG@+$ETo8oDcxHkQS?e;amFJ*!93%8@`I#8Ux0`NVq(7<X^rDT8-Gckjc5p*
zb^TT``4DJ)oajdRr8}1SL=~ZB4kp`ShH+hA*cd}&bt3%5Ax;}~epG%lm-h2}Jpluj
zI=4o@65=*Xf7=kUGgCTF6QKfCf&{O~-uq&BMI5<0K;xJ46`_Vz<~-{CeYO2Gr@}*M
zupp0D-X{v{*$4hI@fgo)-&j4SI}|k#J7nep4Usze^6{w#aph|6;8S4dX-=+C9czeu
zX41~vs}cxM<f`}D^IDpXo|{ljPWD<8msGBCsXJsv8eZroN}T81)mreUnQnO)+H*R|
zZqPV35rW2q`)B&#ZAwO&?kab5pSX<sKZM-I8qzk(#QPt$W=<1WzzStR`5{fWd(%73
zLX&5_flSZckmK55AO1cw`AkkVkQ(m`uB7N~5pmP(-)`htJ?+}Q!^}z`$fH^)ixxt-
z6v-)B<`!I<v6tVcS5)6DW_gdIDiw8W(u*~lG2jPsbi4R%G2;_732Yrfa-`$@#3m*B
zyY$@T65my-$%8R8!#H0}E#-y`F*bSKcW=Q^Q1e{c-KwW1{Xh`I<B&%*>fUf0pYKTq
zYJh@l1VThl=JfW_)@L%?M_rPJqf1X!htxB~&u+Cb-_#Avc?{IFwV>qX4>|M!?rM@A
zuE;iI`U}W{RDfBqM8v8*D=WDi?uQ2~c>c%*udgxM-_KK8)dnbA3=Yc-GxdzuSk(J|
ztIgr^Q*n+|nt-hS(a8P9UZ~NU9x2%CdZ!DCD@Z|$8Z~RX6Z=^CJx$*K1onLb*tQ(Q
z``-H4hAiubwC?1ftD3whS+?kI*xN&I_4jetzI+Cj{Fa@W%4cy#ti`vDg5HdRpCz@5
z3&-{@e{$-O?7LFup&%xA=D#u0GT8wSgClXh?%~!DLwR%CKx)mMsNl~fj9x*X_c$+D
zgGcuroUbe`#FlGIeKL{8G@Dr~{f!raI*7#Z;)air>J&XsFf-UUpH`)e7>|lDn;)nh
zfp%S)*;8CP*SUh^VH&n#<&vzb_dt}2<<&PMzXVsl)dWEHcE`y-H~?D*b5!X&kd_>9
z)77kW%(X{yaVlpnU=qierwu&0@!D-L1mI%6zU<y$*CE0Hjh1p5o-PSdw4huyc;zzv
z{w)Q);0xV+gJUtKU4*dY6dL1^gU#Qemt#595UCK(_x@Z=H#U&DR)>p;HzA(exUk|O
zG|=_0z9Epm4(L1-;ix7XtJ{}ft#Eeyp-McBE=CCY6<)FJJnuR;c(h~1CcQ!0w0GQS
z!dqkcn#23DM&SKt1%)Q3<KF+TbtdrZF;YXkE_6365C}6gD^>j(t3^f{MYR^BC@tp}
zK=n;2C*goCpTBVPVW2}aQ#F8LT2n7@=t9o2J;FQGBz!PJU7rx3R)vDAVgllD0-cVw
zuoUyupjZ#VWTbn*^`vc84elgc9_Usc4&%|*W~GY;n^{XaqjXo*i^Y(a)8*@HNMmq~
zcn*E5ySHwb`-qWXY8t+%beso3QD0?<fRBtye~9RsAtOMX;H$^}r7VW0!v@>9{2Vn<
zj>sN69$d*uCie3R-{F8Cz9%{U#Z=a2_o=A&T=8Q92y9&c2BSIku=9&Pw5fVYtI53H
zh+bfQ`qVq2#p|$JdGiL)b!a3|^6gD>{v^7)D5xVEEJ6JhyEm%la0rFYtw>Tul;*+}
z*)&HnCY?s6w8VBV)~I|)Z8N(Sieq>T7eS<p(OV!#@b2acwsc{q6^A0Bl>QV!dfuGx
z8)CoynC9Zyl0tHxAUSnFWUqDuv#8%*ZOtzq1s>9aozz{e8tGI0)Q^4!ta=q|wMk9F
zdJh1k_C<GVwfnzoeSauo8&a|(n9SIZyv)iuh~P|_d#J(8CatZeBM!)VSog%afTfM%
z_Exb;NTo1<Nxvcog~#801sOTiKOL(Ry0W!@N^efN^8Kd}<!-4O^(E}N=_SF(kWofn
z(`xZWxKV^XNn;Re#^EotxP^`<H1FgQqlgm+k#UdBV||`2ua2lG*BV&b@}@5dZNUfK
z)v#mtE>%fV`-gqT?54Gsa=#{ABtj;07oOLXHS4=M!CrnfMF!Mqfwb#n9or%7vY@N7
zOkWKWiQi-pSRM`dq1b^fKcEacxLwLWrh-FzxU_<!`G$d<REI=Y^FfH!j3l{aULxy8
zx99e(KieT>J(DbN38~>f9R4lKZE**(&szgo&Rf?cGe+i~^j!+-mjO8(`HEq!XbI03
z2QIHWP=<3>tkF!YwBLO;2p2ST{eg=7RLbhNZ|x?v!X4R*r`q9~{`vHj4W1+c%NLKT
zk;XA`emD@b_+wkqrUxWm4twa{?d<p_voyhe)nnJ?uWmA27LX9++DHBFf5aRAn|UNM
zK%*S-&16y-@sLw#KRd><0K&{$({+PA(rIiBe)RJaXwyfrMm%U&Tx`jj)ih1#ch=9;
zJ+t0c0UPKC>OGzGY*hJsj{x3u3*MUKgJbO8%`daL;*{dZHS{TM^uW1qpG7DYUf{Gf
zq3`EB!QwQupPCvqizC=caznfb@?+ErLu|U=u&L@@v0Xr8k5D8n;h%H*A@k~pEmI0E
z7NzKaq!RyIvxPE5CO+`C_Q4gUHyOv+F#;?T<gU8bsILqnP#n)vhn#aQW4-N6V$%9L
zroah*^(j3)JaUwL9wJeFymf;#h`<jM*wub+{h?B-%z4EmFzLkiyOW#TbIrxbJH9KF
zzj6g16tl@ex+CUWL>1Z^{9cyXD<7BQ)u-XUY?LgU#<};h>UT#$_#oI()FdbJnaXZ`
zVKNj4#5BsbgaIF!a<ZYM3XWSjiU%}O4tSK-COU1dG<JhDeQbjI>=4_$jdDn2=X1j)
zp!w6VqDTb^-fztmy6&(7WECf0U`dV(l*h>bSvdHG+#Q*!AkXR`k1}~VNBO0@M>o$j
z3t1-XlJ;Xf?U&&nOm?A~9}AqXT|H>FXalQ0gjPfRnUvbEAtt|{_z%F!93{3qer&`M
z(sceA@*{1sr049Dk6R+=E^HE_*4>fwkLy`6Yk7632XDV-qHxDTRFl#i;=wq&+fziw
zf6;}1znDJyVQA5zKS~{VnQbJ}fZ2HxI)tzfa{>eJF?en61xjCCf9WUuNd?j-{n4Pn
z>r0UZ=qcF--uD4~JUQ3V_~TpR%>VR@2WxJ%7aI<Q!ttR*SZ%EGB-Ss_{{Y=9l=wAh
zljQF4DoeHt@)9}eb^U5AnC&@ttXOYG&^z!XgFEeo&i%b9VQt0WhhN0DEVj(uAu`fn
zPr~@idva3tD<lF0ImehjcQXt0x3qB)4|+MeWSLJ!-lqT1Ke^POkg7<taz`BAoQRmy
z_Ok~{x5`$4uSP1(@LO5%D?Hz!@Wnc5^2=QZb&z+&xFQeTh~ZJm#+=Rl1>6hx0wP!0
z<wa*^d&(>7xfVJY2rX9x7(Hc~;ShTR6)rk${q=(_PjQbAI>fyev0Kz5z}<ezu)U9(
zSGicNB%P_ntNrdN@!I&t%`YlbM(*z#wrFsPy+(Sbt!+x^soB%jRQl=C`Y9GjBY-DR
z&0h*g;P0cm&G#vKyaJD7Ht&Pqfo_P?*NAonJJHy&<JG1hjz-Mai14*zMWxW1-|&`&
zgPwAT%iix5Mc3fb;$Rh6PIdfIHf?SXq5`Z}!8F<8mTRlw9Ldb?8zz&@Bd1?l$VnyI
zijKw>TKTwcGFxRXlZLP@bp(wQzLN7Sbhw2~ZK@ZCSbZ_Ht8=I2`I*Q^wkUQeX;DUB
zqSH+M&(#7V5!jPUSgix3!ulRE5<qpL-|Z(i_)cSIO_&AY+}N**Es8~pq4@TO*V+D_
zGaIRsU!{!uUw!Pq(1qt#xw-EO%0K*(6qEx09<9+F%N!)miA3koMBol4foJrK-{PwX
zJ(8IN_>y@omb|?dg8wWiq`iV@6Nq{TAG6-CA?Ja$IeCZI)P>@g=7Dk!uf=fhwgg}J
zwt5~y*TGr|^QR*F(;hhmK{98=2JigarZkeJ$g<_I&U0_CjOTjDJ5qdOsf$hcCm6;g
zfFvg#=20af?>aDsr6-R?&L^cH2Py^6<9G?ziFoM!CIM=YxG>zLXn94AW;4xzTSA9Z
zvPEJ`>aOA^6H&w-4dyucyCc@9vyY0$2@ZDsmhVOE@hzHh!7QzMX{yOj-Du#5`8aSG
z9yuL~n*qib_rtEmk%Jbb1_(-zt!I!7aZ^A8rZkDslkcDl>JBaxEl@=?1B28Ar?yA<
z9SAVa_nd`Js+#lW68p%+qL;&1-`uiUIK@ctEHldTZ!*GI{PI4BbL`!`o8m}2zf7<d
zqdV)gq8lT?HA2-a<d&|_s%Cio%iP_E&eM>xA}A2|I7}IxTba-C=HWujAQEoY1>D63
z73X`qO8eZ9`MU_mJ0)*mCe|h*5~vxN-DR77efcW`h%kX2*4Y&Oi+uL`v|vesYot6J
zbK3Mf^$x2Jhm+1vrjZr6MBZoKeEto3sQg^_G-bVCX<_6A9E)`WLpNJz*`eq>j+^eM
z?iRrp<$Abb`3TnSFgaq*Iz5B(u}#FTS1>HYfATV*RhBRPc3J}Y!V*%`*a4$r!|{h;
z9#VY*io5ZtsH)A_a5o+uWNK{OmVy}YGHhf8hw*{sDB7>qA>|o9OcG~)-D|$GED=gm
z)Cyy48m15xW`71<+cWH6tnn`mI>F}!)K=<{QV&{|pmA4~0y-j>{%`@VPCsmR1pdxQ
z%klm)*2k&Wf4G-Id3h`R157LszP37dEBw?i?L!0u#Dm89^ke@-ddOj^9DnY5UV6o;
zRw_5dLu5A)^MBp&T4SW0;}dMLCVu(?k%dO<jtg8%UG-}$Ag2RzX1BNC5!i05q^M8<
zkUu5<U3ZT5tA|F!PB-!OOiz6n?;MKeRUneznTa(dqXOh9-=_VpZ|;TXRgYo;*GEUd
zusREL>IQ!I0MD%EOjN}SX@gms&f(jko9p?v8R8!mCqw}nrtK%4G~@f%Rx+{zBLOns
z%lJIY8>B%`OY1{64-oc*ub?cmA^qgR5i!O2^&c}F3r^QhYW`(m<wmmE@Jo^D-YMOB
zyl&t}_vw2MOlUMuh$0e!cRR?0F`xW|d_<rt)erJ3cHR3hL^#|A5KpxaFfpU9530y}
z1p@YLVXmMd=Xe{I{t6(Ii^$Q6N5KwJLR~kY^|aa71`;mg1F5iIU&Rv^&f@klw#~5m
z>sA`#*e=C+r=mXZ%(w3nwA#1p*!_4;w$c5X3j`G;8QXFsrQ-BbePN#&bK&(PW9wxb
z8}Da!M{15N$ELz}y^eBrM%c73F%`c%ydmBF>GFGp0ix4+pqneh9#Vs>9n~rUpsES>
zCXrMrJ_UUBX5hLVadU+y*9--!WlJ7(*nFpLV0j_9c7X9AhrxQS-Ij1f2=(c482=!w
zcD64%N8q{MhXZ?qKflg*erfmX+ZmwRFv|7oJU}2r8CB0%pQp=5qM07rC}?v_)}DYc
zT83GLXw!_ST0K_hBU?fIVh$1MN&F<xlK;vW%d@*<m^{R=2s4R92&>{?CEg5e%jmFi
z!CzYMJ17aolct((`CzP?SjGV+$u+h}9v`|Ki`vtXfh`~$ygrpe-g~y|)6X~xc@~S1
z0ZOV%4asgTi@N?AEY24EyAQPrX<iD-l=Z(AbIZMhIYAbutvjNI7Y_bj*yi28@pRo}
z<$agbf&PA1qQoc>97^-s-nTgeG2}0=>#SH3_=9Jq#4pcxT||m&lRYh*R?Pix>opdK
zd%lEveYaPS7d*w9h6yXSXCwKlAw!bpah>GNb*da4lFSqXLL<J^qs?L(MxWoxl6{be
zW>(!Zf*DoKP+XW5^62`a5p-y>8(=OcGG4<l>{lm%u#pW6lz=BO6I9!&D7RDPTqMD)
z>dt-6vHA6Ze*KeH6|2@4jeEPR2eVvfVq_KL#}ghQO}F_sD1ED`I5YEZ^9_BiJ$W@m
zGx*w`<K7RHN#(G~KBUqf5^^%66R|JMLiu9f0=in6<T*p+h-yri4^@Lh*#cCAH(L82
zCnM9_OkQ9zUIFfBqEj0@z^0|Cccq!r?Je*M@aMJSi&pX<!0wML6nauh7Uc8KAKmS~
zf=-%u@-(E=nIbrMo})cMe>r{g*D~sui_;X3Qw2>61iHWUi)lyf3z8Tf(uuwuCV?lR
zRfjbUVPZO_K`M9mJgMUgeHboU`k<fC#D_F5tPYZ(DlUDw>xji2(|dKP4ai+<1Q4HN
zjFMMHR0W>7ysR{Hm>}#RzqTh_pEd+Ok&HZb#Ebjsfr??&zZU1$q6(7O)JPFSJjP1S
zIO_Gj$P@p)xEIwP`qa~{Z)?(Mq1~m$$}D97zsO?8`fBC|PkO_@)1LZ_h(ch`|N3$h
zc=p($8>~T;H61BbgkRq}a=dj4ID4Pk99I!oc}UFh?#j<|`j==IVcbyyNF^jktJ=WJ
zo0_Sk3fTaQz}%FiTkj7>KvyIXnY9qAG_x*{eE?VZCe$2JOOyUct%V#VY?r}cd^U`0
z>PseU3rb%PEPN`dTP16pXrKQ66CvPRME$LtlH-DQ8xV4>dl_{9elEKFzGC-N%L=5c
z)-{XjReJP@gN3odGLO&#y;szlhTYQ;ylo(<v{j<XeHsFw)!Fl=0qq_@eLIg_XfIt0
zH*5o+s@V;@)wwSta3Gpi8RieL-Zdg|(uB^+VSf1<zpNpE!{hODhd`Mm4)bdWMD<#e
zMewaUOsWqgl&_uly`U8qc_m;2>~nZZPJ(+M5h((i;ptf*<Qo~kl=cyt61wvZ(d?20
zV8|xG_}3Q?%Q{tP-+3QXfvvGXru13I-8}tZ@YYRt&{wP{wc-t<_Otnq$uWL;C3Qym
zzYY<lZzkzLjr6sib&^jNCCgIy1!ajn+#DpV5jWvT*jVR2gq_ktiN;7C61KcmdNLZ!
zNRqCU>sw!dj({;}!=`SwNrIP^2~04%`SbRTRxpfDaNPh#dGi@nzL>->REi9yNa^Ur
zSa6Ii(r$Mex<wFg{HqlAydOBHHur%T74eyh(^D_5_vl+O-r(f!$A%rTR}acnY7+fx
z#;6NIdTEEk5qFJKU|OSuuQ?>;Q3m&`ZMr4$r8I*6uqBLYR@F5LKxa4(KKfV5<q6_H
z=PpR*#P+y~`u?0juYQfkOcw@!-qG4qkeR6q=V|lIKDZ&1?8cFwce~#<_}T;W?{8J8
za@bLT>9Wqvhds2D#fPmqgr~q7jVrVAx_x3H2u*lNN%NYorLzRT*yhzK__ZLA5Hw!3
z?FaKv6vi!#e}V!rOQTDk8%?UTcq@PT0kF$ORX-z}MGvBB*Df{6q8FI}h@s%t$}J5Y
zirLXS`ewoS&QI*+C#LB|tRw1Y!Qd*4Z<f30+elIpm^BLDw<V522V<U|H8vzP)9(Qq
zp#^(|$?5H-YZ{3{$w7#auiKMb1W(V43bt`oo#8XuE$3WG0o(kD<K5etGla>;z|?}@
zzw@2IewJLH7x?*(ie_z2KT14MPWr9T^5rZt-M!n%uRXYw`9=dym>-oJwEmm<E2l69
z{V?O+S_6hvj*bGeN)T9cuU9EpUc+49k8{G(_mdej-@Jzhtn+p7^Nj<dK=yO2J0|u9
zrV@5J#}+eQ1L5HyxkdE=8eJ+zgXrzM=G);KIB}d+a+;Pk8%;a1D{T?or}q(J-3Sb*
zzr6Yr<@0ezn>5v9v(3rejYwFMZD*h}CZcyKhVj^C-PUya3h?KUdUcPhKq#mZF%U|7
z%|gXPqr!QnD|YipKt$TthRc0QgM!0FIOd9N{#*8n519o<3Fa-|NsyCLR?bcS{i`9x
z7x?UATMJE`a>N-PZpv|7aAtwDve9<peS!Td1cw-Oq<jTo(~P}C_md<^kS=AOf9)Zw
zWYb;y8OFZ%;w!)#%=G5@oiE$KPBO{W%is(7KoTsVSY9e>ia-yhTj%x%66Q0C#SsCg
zJ5+e`Ee`S-3}xl9X3<q=14MRW-i;ajM-Z#T#J$h!tcm;ZL)&^#k3l2+d5tTew6|~G
zu|z4`J@2)F9@WIsis0kNtKW^FoC0jwvfeqq+9%~pH9+UxiUu&L_N88YS>&X8=FoT<
zbS~rcHb+~6Z_*yuP?a;sFLhtI?&UBAbDpA?FfnjZ<bcc2>YGNHyrkc8bu+*glkVZ0
zLPHA8W^?}pTr5cf>9u<rTDQzY|B4E7S7~{BaAWmnnpCLR_S+Xkm7fod=uAYu-H^Mx
zj6#;D`2(h48D+B1q>B*DlVHk0C)V+lnxB?=jNVxxs6|#tS+3oDLArexR~qLHvW#S^
zxYW?{odJ9=%YOiM-V*eJt=8V>-=P`bC`CPVscXdQB+|Bk;jr)o)|hVmzT<Nzxc(Xs
zvb=#}VM^~C=!z0f?-$4AtwATbr@bfB2|9y`KmdGvZxu*=4j|+`JNpy0cG)k_-?G}x
z(t?J$`<+x2l;@?5E$1t^UN)UqSar0Cw^U7`<HyFj=ZthgFR*8uflFi$13<rcp@vNo
z^dIRNt{c6SQd+E=jg&<J=Nj51`slX$`;V{1xX^%DV?M_}>B{!OY1N7vXlU{cSgF1D
zVinrFsgqEy!VWKThT09(n9p?8nL$%HVN1mYg3|ZMKQOnDuV$yyII&{4Jr@jAZFs`C
zpBu~o9O~Lo*VY*<<u1i+z`&?y9)87W@FGMH3GL(AnrSn{u3jp&MLk~M1Jl4Sc(yP1
z{#j#Jy+U%+fh3q4-5l|N$7TwAlWt0{zFhRlJ0NuN#WwIY?Qeg8<Vv@%gCHR46Th~2
zpTE+*VPLqNKYjN`1^N8`wl+{T?YmMu2%XnPC7YWSSAfvdrgFW`RlAR}&KOdeR$bc@
z=Lqx4A`iK7B(Cz<c-A?XDp-(Ll~=lTqkUD!^1CXWk+3WOR#2_nbrqLyIQ%Pq%iWD4
z|Lilmr=NJUZMBN8+=rua#Ubjbw`WE?U8R82AYY^=1B(|w2`i<%3<<LoGrVdp*=K$~
z;MvHC_!=sUrvuN?63F4jk2eQ`fJ`z$+a9v0<1h-@x~`2f5G?vX>G#7l`O8dhpM7t&
zO}Us9?EMMTK^?RkuM)=JmEK3(nZx{C{x4v-dACp|Bilzkpc}Nn6G<ADeYH;$NN3Ja
z5L$}RIqe^`t~?y@Qof=S*xV0qJLws~HhcEOkrNH}OV)FI9wz#}p${gyBg4{S^Z3jc
zaBYl`7fb*AzW8t;6LY$u6b5lyc-`DK0j==n)o+mNO%wNi^W|wjXlNJdI{p6<_7+fC
zuG#yrB8_xODBTDM($WpmAR-|k1}WW$bci(4AzdONNK1pHQc{uv(jX<G@a;#>Idf*#
zZ+-u@&aB0mnHSzC?)%>R+E)O-a5Xe(!3<`S%P}fk*O0sx|L>Ryw0E6IT^o^txPS&(
z%7eFy^Z$K^&lEDF*lMjyTX5S=!XgSBl)9KLwXb~*nJePXz*X|p9#M-;cN)HKA))){
z<jnkrMPe37H*5Uj<b)+-d(7)csf@?*hv6!Ejak$l_MQ1ifm(D@_zR;l=j<=iCwXO4
z$C7JV({W*oRM9TbeNMc}PG>{1n*EcOpU5oH1W6o2e3D2y5C#Y0L9r{G^&HtDiD)d!
z;joC+U4}ZUo1puP$Z@AVh9I4m7F~f7QAnM2%SRw!JhH7_dcm0BdA}8vyQw*t7jajB
zk`$Zt_`BfZ478`|7BGohIH|{8Xa)xUyOPLDZ;|jbs5TkbD-pHg252AkBP4_jHbJq3
zyR==ii08QfQQ@Isq@o3|{qlfuP&g^(bLz$YIc>->w>7VoUPBO-ai=il<t+ud-?hY_
zHxdQaZ_mV+jrzP#_UMe=bLC>=n)+hq-z(l%i>TW<@@1S&$qP@kcY?7YLE|*sZ)*zg
zj2ETBtxAGl_mP+mim<R|B;G^-H2gZB#si1*oO{+E_&x)?!wRv1310(!^;)PBl0O`t
zF}3=JAT~;X?ULzg?N46lSojdrgBo<-5pZ-Mn9QB1eCWv~;$jF&;brh1AN+1V^j<Kw
z9gf`I6i(afi3bnfGbP-7PU1=<%49HU&)7(2G2rJL1(<~tg+$vbMnVeWNmreQ>8-fF
zWJczCZSl%-kW8ZxA6?KLt5;PGMUOs#i1+5W-x$K^2a;B8(3Doh$b*(S3i^Yj-`gIx
z5_B=rLpnzPe&a@^pZcpP34AEZm584i$lSp=qOnoF^&_Ov8~J}lg`b<vmBJ8~xU5&|
zvtqScFMM;K(y0Y!(LA4!>1x7YkQHO9>Y-Or98@|2P@?Q3Wql6w4s35!s4O}8-#**f
zy_GVT!e*kjBv6Yw#SBh?<KdGRq%Irx6vOrnXWKCIff$ax6p|haq*2}?L^9kDuE}Ns
z6Xvlc2HzuubJi^Z0U28{T>0T%mbvK#1<6z*;*>jaIn|ngY=YQCOAfku#|NQIFS=_6
zn08v|f}9MHq-hAjJfb#7&gq8TYt<o<e%IW;nXB`}F(Q(0@Z5e{lBMY!g@QxQr1yif
zJ`1d=dl1_;UMlM{shDNA`$%QQ{aG*Osnv($_f3d3_>PtZS~iHV9)RkPLPF+^?3zFz
zQlXH<z`+ohQ&{)chWMwf4Sm|^zWi3qvhr2(554y`_p`N9V&iW}Tc#i%(GmAmCb+P~
zZmz5>$hxNF@w^9V8zjLP#B1LR{`V2I^9d<<r2^_Vb_3{)E=gZDP+Sn*N63aiEc1Mf
zo}a^pZg1W4i{%mo(A(#AihzQk501PHO)^s~6t;w@Zl;~TH%4snyqwmGW29}H+aM+f
ztNLQS0j7~-IPM$YSeWht3+U2NE*+~1{l3&lfDGQ2s3zfZ9LXp0Pb=YsW^a)Oad8m{
zYqK|ge9fki@B}{LO|TkptbJbkyYA9Jj@0@8oTsqCJw$=?+UKGf@hQVln3`eAGbxq}
z<Wo6RF-pCpo<A1^xEE11OP{b-DV8g!9z6S3!}PCLX#~v|5%bbM!9w4<ia2Q>Wy49I
z0^jz=vkp*-FMB`ef|kc3LTDl6wu{h+)5iF<>}jCO-3!8}N#Cs$-KuGGoGNEfCRiZ)
zUb7ZqWA2AyU{e*3Br)<SGm{qA45b{CB7Dm0KsBvKtD!;=IF*%-clJ{30z;*Nxv`E|
zuk8})V=zPgj=D(KppfgcjM<!#^#$0Zo%W|DJf{#zIxHxn6qAco{%U9bdIy^2n#qs9
zFXX?GX4m|8`46|LI|E*qV?G^RSY^7BbBK=%_|Eh_T19jfQA4Z1@5d(fuc=e0%Y$g)
zjzceCB&R>^P}^=d>xS-P&gcv}-V`uFxj*U*+M*<o*aXdF$Dz;+DT_f4DTU3;enhkl
z{5s7?RNJ*HSB4?3?8BhX_jwGYi|uf@NJe{q5-Krka0@LOUss%RhW#fIyr+1xISaoa
z<wtmi{kIZu^-TX@1$p-6&WPo4lmkcK)oy2et$1KqJ}GxD{{7f;=%D4Nyu;4%pX&49
zTg*Q{k0Kk;8ns~69RPip&QJNG(_s`=_?3fk0;QH<huro|V?UHCi(&j;h`$I@3<?(C
zu*fICLNYV;Q8MA>o}D`#hCSoMNe!wvYzlmU^Y|=OGhKCqMo{!>sr?ATfxE?@BSr-_
zRm1VDx4#1kmlt3HqoD`paSV;*7Z(G>K|&b3k~&;te{b7ED`>*d{)ans|5R&daQxp$
z$c5}-kF#TxR{;MHZNNDcBx3RQZhrBF(iIC}e*boADSTI_h&=38D4(+*%KZ0p|JO0}
z=S{Fgo_3W)=h{om5wY__lCNqO!YHT-KF@6tUDTUoCcmMjYg67q7FrA1gTXv#NA>B`
z3$Q_RVB~+{rVd=`mz!Ooc^|HU=Y1tOSqm3;P+$eq{P+$pOA63?N09*YWe5`uw~>GM
z<X0c`t7*D;30F>K606#YRAk2xGpA&bfqygvir>p62->i<Bc_MM*^hJ6U+&t0mxFt$
zr0Fiqv^>m8Sl`m0OhPv9C@l##`r+u=>CwA;wFQ4YXabV&JC~bXeSaS)|K95U_vg%N
zq_HqqES!NAB7})xMk<XY;h?x3L8Q(U5)4L&bqRf-5Z6iu;|IdXnp9iEPZ89@yu7dC
zeV(h#RRYtUvYoXf1mTq;wb)Bt$H$_i`Y|qEP!K!-dk<UIuoHYP;r&Pv*db*H#aMg#
zL!J>1N8RaVy0qZv(?>{a11xn4Rff-?{2z|FE@)4o6*+MuLS&5<G2&w^l|e%)23joG
zaAP#IDE`$B{L>fw>xTkc*yD7Lz84<A3yp<o=($;io}XxP1KQ#AkoE{D6-J5U0y-|>
z-!Q=w{t|`i+Rs@5E{}uHmtltrZ8!|)8{iRIV8lvj4qit>`E=Cyl(yt3P&JT#n(oyt
zF5m1yWamOm!NPmFYvxd7uz4ixzUbeN*X8Yrs@Q<OMj)p<3}-qB2_JG>@^7g^RE-Gf
z8m5}>i<S{)<&<@$s^LMXge#WUn4zQUV3HzKGPEdxjIc!MEQ+}PUl(~3xya?30$unJ
z8si5TCU9!LN0OsOTs9J&6QI@^wgy9spzDS%8_-2BgMfaZ_C5#~jm;;o25@0&`%zVC
z3Z@9zXRU(klzg<A$ag!20A<8^gF?jVkr`gi6}oDR*`~(Y`5>x3_;kActIhP^0rFL2
zcY5{Dp!eGj4N#0;R!*pT^pPH!B_O&yq>nQn^%W)XegvGvv4@yV>0moiIijV)v}K?p
z1vm8@xP67Iir$JsFCYRWpbpuTK3}X7VgLW#l{C7L>T$6}?~`2S70kH@m9SvUok___
zr*#Z>K^Ee<K%*soT#i6La1A{rpqg?wbqsYV*a3Rs^_*s9VkA#S-y%1jmkW|ooNojt
zn{`2A>rL3q7#?4kMWQGWt1jFu3REQWG_`iKdWy2NBkduWXW*Xsoc=t10D9u^O%yJP
z=?m!Le65$MW3q+<G||uF+vMZB0Jc30X|-9jc#1hsJpt6^v-OoBP?N$&9waLxyt>#*
z+Rp#R8<7*~fBuoYPXs6wZ<pLFBKQ#P`KcC`(B6KCMX-dem&<>B02F;&p*nWvY_cH#
z??6(S(w`OpB$m!F0B>Eo_9z0oyTiJpD!nZ1kh2O>7&YhIMohYqOez09xW;?T>ggLs
z5mfjV4KAOZ^|;ZJRSW}FhGg)?%)yCoS(quBY!iyY!5T5M{1n?dHk(-bUW^WYq(%#Q
zX=p~<E`i7tEMU4>bh1dj4+dtGTH(7t5xYpBWAbN5fEH~c)oDnna81_~xpeS*^Zl!*
z{0)NqbIAP952NbHnL{IUfUNYOW;ob3bB(_(BKE`3FBf6i)_hI{@qBtFs_7so;%^iG
z$|45IXC`ut1jTBJr^y7onRt-GuOpRQ4sO*g`b=ArImDtM=a$jVYoS(PPEC8n;g@fM
zOznudF59;grbl6tvrc<r$D?r4kk#D+Soq&t;r~9G|9Vo-5O9GDwo=RVX0U)-7w#bt
z>O*jaxwY&9FBFi{G$Q(H<d9_0ek^x@s_+t#3tN}9pi^|%5z#*%1NEZ}x5C%TgE0e7
zwyCk(_i*<BnsVn7gHa_PsA~)vn?bcPh|B}((Ld!~tsZY(eWY`ook<2$2_0VFv!k=q
zj0u>^Q6>w4i`(AavxB?w)8<vG|Bn++mKFo$KI!+LC$nb)<VH@Kc>QBc$dibQXhxg>
z!MO)RJ?CYCI7)~)L2m;H)|a7}JVNEm5pqP!q7E?3!DKj~KuIgR4etGjb672~cX9BR
zKz_-K7$CN+>jomfj)2hw$LspLvAy4$+W$Q2fB$P0Qff?6>m2|rkKE34tM6f$$2Ult
z4-qsoJ_u-}_Denl=`B~BUae;#hYTWT%>x=QTCq0i?*;YG<M`jh?cWD{rZ{rY(JYo2
zBXgannC%436kh>!-~+?9-q2APH!kP;O+l^N5NG^n(W1ekRhH5}`cJj&`-7?3tMc~C
zk%|ubVvm2fvB;N?M~VlLinWB_>snF`^|vpac{jlxSRf_6k7W^EzYWm|s=;`dZ^BIV
zh7|r8*qU9>KUt+mR!Vm;QfeZYz&{H`1B3^DsWANzFZOR4e8x8OKCtBKcdXO1%dXSc
zij}bf9A1`K{m(@gCBd&5z2RY5|L1FHV*UK}^F#ZuzqtTMv%l@Yzzir9iC|z^(1!!E
zADg2@W8|dB0r?_|%J5A71^UAuC`x$B-HwIr@-ss>SuvkF&iiHL*=ug}m~#cx*)Ki+
z+|Ykk3`s0}gN~fyy8pUCnIW)&v(X#sUj5HL&qPbRtF}vr5h(t1JCiC(R<dK|wdg;;
z=g$u$5%Ajr?-eHh*L7y*Ais?be3}09!aFcGH}t#Au~nT!Y&zX@?Tkn7<&M3DNjxT)
zfJ1o1%Kyw{*5B~UY3v?DDb9a=HAz%uNw{Pp$+4^d{YNe#e*|53l<hxXxt16c+x`1&
zE<tHcfaoZ@4%WAC9g-~>@qYOdKcqB=A=dZHFQHc-*p66~^#13*{HtI3&oxTMBU`>Q
zhFU+_-!)7D*6)t2;R%UBw|@a6D`(-YUA33!Vu@oV(+f0MKG^(4MGF79Y~(Bd-?dSJ
zBuaI2@be5TVrCr^l~G3I;r{stY9y(L>EMGtE4O}Aqt8D|A>!Jq#uT39=^1z6UZlh=
zqF*^weDVK$r9YoiL`KwKN_4dT>yNXC!2=@Yr?w3L`#h6;PQWUcd=<-G`K+ua)hF!g
zPpmxkSd-%Yd!?d(-4n@LtoDBXpK&?4*7@7$N&D$UsVq#n4)hw7P}jr-Rc_J$=a$2I
zKx2VZq$RpY`tM8d6=9mopJIO_8X0gS{!B9d&Uc27>Pr&DmM{Oj^_%dR>Q-M}ZJ$4x
zBySz3r>?h74aq;WNWT3^DM67{`4OY0rq~VswR@%Th~LMjmi~LZOW@0k3^n^MWKR9l
z@9JFjHN)}vGNx4(DKFFZt6$_ypKW<LZOBviY%lA|pXaG_<$OX&e&Xy(yh3L69TMgr
zSf|tExJtA-2KQdin8R~cQ2*}%iVhoWF-1;uBi8?1R|NrUP(hdL&*;p2PMJI3QYLLB
zI;I)T@2w%0nm-Cui(>GrZ2ydGHJDI0MvAW2C(aJ)`Z29Ms9UBU?HkTex8RE3V|a<f
z5heK^O<Z*IjdmN&E#s%Z65_ySlli|YkpE-4&B}M)r!CkAr~De>&9}My{k-_LXrG&O
z*qXk4)0Xlt&HHnuadDEhX>NY2{Qj4@a}2_*N+}a4!>dWmBGxHyislyFWxmNuJc?IO
z^TbH=6G`^nmTDDQqoKQ|pG%sN6~&{W3$xRIb~%`k)5GH}R9qMT=f514Klk>^tP#tn
z>fSUKJ%_r1S-X2Oy|bC@`|`RPJK5T+9?O50s-^GK%}(vF+iI6;+eNspxm6|Upz0s{
zCC?3FzvmU&p7SBs`f4uGmAjUp@*(F~_v%mG!hUf$2vQb{jQ@QvM$Vr%n3G9B9EL=`
z*4F)NDE<-Ek843m{}@k9`E7h$Voxre#>T#J^&CcYO8U}uhMbB&UpkjwgPkt&?DM5V
z?@P>>uLbPYC+#cc_yWmLa-GkVrcTSgJ(0NdNg6BpOXlr@kD6ltccsrtha5VX25+GN
z{A;OFqMT+ui3#+c%~fD~TlB!qs_13Qb5@ZTUy}OKp0-=nY7F!L`3A>CtbtL%0_a9m
zv0(;A3+Z>r4;e2Hnqfb+#>kbfJChZ5Q%mG|`SRpggj0Dxc3(-W)(jbX-Zs~LjX847
z7a-dt1gt}bj$Xj~%X>WAgt$*R-();0Jp(Ls8Bl#*tCI2{*R>)zerxseIOH;KqLOhL
zJ_3Z*(zan&emF;om=DGsm4NZTDCDnug*ie77i4L5W0dNjS9}_)Ec5Z>9mc}v9Zx2<
zS?F?k;of0?#2;+nwjAEOTNFL&jXc{-7pyqt?nRtibV)khi&<Ru>Q4sE63?;Ted$nI
z_^#%0qLsSV$6HN%+@Xbgo_S8rB1mDgQ-(q6xSCu#1h*t!eh!K=KJdZva|dM>U-B3T
zuK1t@Q?|0zee)2itm}(ikw=qt10w5*2(|1DI5n_Omdb}cRW||H5XlRpko@hU2$SnX
z=-Ul5;fsAkh!a<KoC^Qh?na3f7f0^7zUVmh5(p}~k}jg5i~94q_G!QCUf-xa){~+&
zdyRX@iIZ7^s?4;!Yvdkw?XH1!c?absX40o|q#p5~A>mVbZ!hf~YKMf>is}>Nn=Q9W
zTWM;4>P*NO-0zQX&hJ6L*-z@L%)|r0=yp3*1fPpFVEBmcD}Hi6%;pmqKKFwgYT`$6
zSwcXlO2!3>P3h<8dBB3s5nMzccsuo7>q>-dc${Pa!mc2@-i36of6%T1Wt8I(kFR9{
z_C*fkqmQ0RmZqpGNumqpeA$W(NbEsYz2qhlYkXo?M)vMwwxPTqC^QtBai<f`p+1k~
znowxpu{1rZO>|5&4E#i-lW^)@JSwU@r(MiY{3KKo3kxzoK00kpPq=r1GHFodV-<{L
z;>zTz?2D1fHPqx+v-y(k#yZEx47k&u{IIBmIhs(t7zBGcj++LLQV|{Kp<>H7J4-LY
zyo>AhucF_e&YvBiN45E`<dM4l%FvawNiFM3OGDyD3@CxLbNTt1rP4MrntE&kJ=h<8
zM*q~JQ4;4RmMqGt7yYhsuKeuFxNB*fc*dvQO&Laax(kKKGpaa=w`}tD{_iC04^%WR
zD=20tz%o%742D}^jEW>b4L5mvw!Le)4RvTg$Lj{O?|RWx=Rf6b0$A`Ez?x`-z&d5r
z9$6E>0+;7toPVz~{Jmuhuzl8Zcfj&WH|b9PRG6eQa>mqkslWP*c6FW*lVvV>C1<Il
z)qN+&dU9RY7_}%#vNXRp`v+&aI==+rpOVg;2(xZ4hg3^D{p`E3B0Kxl{7i*=cYmQ`
zryUlLw`RQA<2h^0y~MU!<g-6|;h!<61M^dv=JOcTCP=1sxX5^!57=P!i>#N#I)1ME
z;N}+sb^<~d<bYmw<C<(^JJNI7g)H1<7I5$@nu$IG($Yn?WoQv!jD`YCGV$}s=j<(l
zS{}G)TqXDAMK+R~bIu0;Eb=UZ9reb%&*tB#7P-Su&hF+{JmbAt^aceBF@||C)8LNS
zRz3hQ(6zcO%A(O}tpSAwg&$aElBi~wIi%FzN00Y-$Ds5*b>Hr?cdVLAy}5TyEV+YO
zLvdHVJV2tw{Wg(i>X19>NnzpE4;8MR-91V_)xrlwC-a)?a&w5k0+QT>;;YgH?INqB
zJCTVdA{*~6$W5pAA~--+&ocOi>|JtN8+q^Pi)m2ML;RMDo6r{$bCL)d<jp_Z?0@~Z
zoav9puF2s<MObhdh0AvkxX=ym)&D}(ihL!T*lNBEY2#*>2$048J&AJMCMzzf2m>0E
z;=<9obk4yLy3REPJ!hnzklaFbu>We0<#^juIi^AdPpmdFzLfKduXL!7-@=1CDch4)
zlr3oiqaB{&MO6==KydytVoS)%=Zk<kqNLY>O%(oO?!z^^xn|zCr9U3y&mwSeF+&Z*
zFCaFI&bFY6eFvbR5B`&`=>WDp4<K(%#jYw$hus&b@E%*nh{9K%SU=ng=Bzh0Fbxa6
zHR@qBj5J6}*)OaAsZMj~qMRzbb@OG#EEJ2aUl9vIJKYb!S=2|>nc5V7s~2_a76K`)
zPXz5PyIqGloZFmz|7?m1w6CblKBRS>O{K|u^&cp+<pjT`#3GUYey{%57rrwkhPqol
zBbHtZ*i&$Q*uC0Sp^~2oF7aE&Aue+AKBB;to$=aJML?br++Z_jCjem{_~Q!L-HNR2
z2fq!Y5(B-PjgHHG*Wf&om-ifi<Xkv+o`zXuQ^rvFB7TjjKta+ckAyJ+VlY*!TU0EY
zsfOVH`4?W>{#DjN@vT_DO0tos7O1zHSv(=fQU;cT$&g>eyeJNs-8aMNz2|RM8U2oi
zm$jSrog0y>YbRD@=zLjScC0fboGd%GdoPP<RF!r%E-#SaS5A_TMU(F5L4(p)3C&2|
zqK3Mb*bI2UW~lnczNT`&ZcXq4d>uZ-yrZ%eiT4BOq{2?H_(L+%_YT7>H+&2Z-G0P_
zSNN^lLVIX>8%f~+SL41`=tl?Uq}f0uISCH2Ysk`&mjfeL0hl02KZ#$rBrM>A>Um`1
zX>t8ug_K4vs-fcxMx*Eg7A&Ut!iOk-4?LxJU}H*#^-weS$YEk%>rSX2zCSw&jDs9e
zlsbe=!hmJ{N@1^D#Aj~AJ5rL%k(S1&DJvsqOPuIrWU!6hSN&Ibl?OWt8p96aGG+7j
zvp{weLbU_ijAZbYAl(o@FeFst2l`tYaSIsUZZ7G(KKKp}kPSL&Er`!x-+1{w$E_cL
zg<1f`*KqrXwz}Bp9Ah>szArKjqK0isUvOo+7bsFj>GG?k^csP1Xce|fX~+K6yk)9k
zGJd{ZVL!6YOoTyaCu<Dkfb}m}nu0h|bH9fPkKcLU((NGnJ}#(SKfzS4PA6~V3=eRG
z!8fN+b#Yl!Y2~5)!N}sJf<gMI?HjCa-gav3gRyx>G2;q@bg0TOmVY^gKrG#RWl%Xu
zF~9+$aUa#=Xv-$(B1O!{HHa+e2Gpyn$3v^O9}y1s12Vs5JqNrTvlsdgl6q{seD4VS
zH4L~v@?!g_6Y%=`kUO3lBqF?84TiU51sz4qxV|9W^-wdZd^k8zQv#e>M7?ZLq@_<-
zV1+!;gPr2NzaMBc>>cZcE2@l!9`5+x0*N3qI(!093w8(V(SjZZJJ{7z?nMYQ7n|lH
z7p|6~<+sjV_cg0M<xadHie)tN29rcezHAp#8ge@>eCq<J%`a81nW&;=5!$F*KdEIG
zM0a*78T~SH_PB&kMrQn3eAPFf7kV(F$T95zF&i;Fd@$|rqySiQx>^!xlot%z`@g)8
zP2-QDe|5zSR%-e=#!H}NDM`?40$(?DmTbh4+Us~J0W=)K=<~xsQF{nx{mWw6a0c+h
zVm0kNsBTuk$zSr0kq^IQ3Py18qahwjGJ3{8>O5Z&z}a$3UhCIi-K*~!OVq9L*vNv1
zaK0is3xvUYFddMunzDR;8oZ49Ve_{bgK2xwc@c=$bA;MY@C|vF@el<!xB}}AWAMN6
zLbYS-d^L_|=ol%oR`88~!)JRVgZD@<vz3zZVMYahJSB7|8g!`GOr1(v_(l)JgFa{>
zPyYr=ZP=5czDUMHwwdp`X8nM{FBBB0(rdJ!g0IcBN=Oa3|7Hh9evcJBby`$k7v;Tz
zv0Zvp2cFig@(_J|-$&Nj&jlP@NM8;DQJUo0JdiPX0DsYwsYM8%sosYZ6ld?C@Gd_n
zkhI5~TWI6!UvTCbzf=Pv|7VWcqwFaB^37Gzc^yfVROZfqqX6uwl9Z<ETv_SJ2{G-l
z{%WQLZ-#E>$%&1O<!phg^b8+NItYII9m0gTJ_`#Ea?e!gE{DSe4M}Lwb*dk&>7hkp
z1AqyYSkj5ipYnXpPUAmzpV`P!f~|lzyXDuBOMurC5rn9;EQxNkhuSZ!z2xZY`ZA*J
zCpCIime%hnckAM;5Z#-*s#-1TMDU^dqLbaYzlt2+7-AtH-XRv|yj!C8dq|M*o|w0}
zFCNz*ti$>)i+02rj=6pyYZItemNPzQ-rFS0uq(v7nGvIZxqJ3QuTx3DbAJDCOjs#>
zbcl^?P2UM09i<eAKVx7IQgf&7rAwIP1ag!K?AMk5RilrH`R25y8_J4_Pa|AfYQQ;{
zVT<TVIs)RU4S{G0(1wLSY4+iOp^u2;a?-;G(|&JMxZVz%Rm(W*vG|r<Fhk_&CB8al
zjHEw4xa7tav=i#2K(T_YL~AP^|AjzLOA~)Hf*6yu;xRH_rWN)zG<*^h;uI6+%~+}?
zek)!>3nfGO5?YlMYEi(s!j~@~RIfza0olw|u2$*9?`Mf5Q!xL-11QQSzV=GC8)2a(
zsf!=Cmpw{pk7lvy!nk-Ts$2uKBpQybSS;6dWITkmeA#*wp1$!ZepP*E?0Y1j(1<P2
zl2zgsCyK5#AU5n2to+)4K{l2llGE}eM*RvSi`hut&II@t;vAN`fv@HLoYtZ^x?13t
zs={!RH#zpPfdqtMf<?LH9%vBU)`fEHvFoO8NzYnP{bFpKgxBn$-SpC3IR})Q`D(}$
za6gZ+FQlRUKKj;InY(X}b>1@+RYQxNLr1`$q0Ae(@MA8u*Lj=qAjhJzCrna|&<z~j
z`@n5`!@1V;XlusH8%i<`HorGq5=}hG@;_`PTF_aNJe3TMhUf;49}s>260SxbCi#mH
ziX4g52ue&Jl0b&1OKyoKYlzVmVlFsudn+$Fk=pdaokyC3?8PMrHHEkm23W~3%o(UO
ze+&hy3^$>|W%n=96pcYgL@Jz`%tWL1Q;0$k<Qx5tdS<)vXTb5ZJ>8A;NyHTxZ$|Qx
zgiUWw*29k0mkMF=6dg)vLhz>L=oX}6j$HLkbD3o8-Q;|~T3Uye7x52Z^yMb4+IZ}B
zXW1C^o9Q<uJiA!;MH#X%WSLaBYRzLF#O3t;eHMg!?pB(3tX{{RC+K*PV)Nk7VDYy|
zs|`;yr?BYJefv$D*(PuKUk!urpn*bHpp@f9f!88u0_Oi|f%bt5AGG-c-nlP5mhGz2
zr&6~M*aud(BolE6@u95n2IH|Oo2EKbWYgp=Ra7LrcMtL*`L7=%;<~nAd~Pf9A<QQW
zL+%}+-q&Z@*EPCk8gfu*PM$przSkOnRsLKDun6BCT1jlCPxOfZ#$L{V+G8RrtDBT_
z1|fn4HkwjciY~=zH^_~80Xz4Q2gqc+Flen>9WT#DW^VMw)S@odFE}n*w6v-<EKN0O
z84VUTB4o+RR~Zl%dkjUy4Ssj<*=%e=%tev(z;Z9Y&0nyUHi{t3&Ar*PU$f;#$1t1%
zD`Xq9-dbz~q!_41nB~YQUU;<dnQW#pvhcYXUOo96XLQG86uWHP?Svm9AygJEJKa`~
zxBjf*Xoss^DgZ`V{NL`(Jip@c^<|jw*Ljg0J0Lt!X}M19LdB8<_8W58af<8D_o6SC
z@Yy}O_nd!>b+C=3h9f2Nw35{N;=N-)MfBp%6>pT$Q}UR#hkVen&B<M#edog@I6EQm
z=|uH53pN{jiHL?JLNOmD>{b}pR*rTa_0{#4%fv)Q;-xD4czex98W0Eq02tDF+giQV
zXHEqz>K&x~kH5`M_-%<R&&P|NIT*9~std2K&meV>VPW?73M=USNe`KgVhMTY&D#Bv
ze=Vh#=6xNi$wg+L1JbGO2C#|axYwty`=QYjv@>!xlmeY$MoCodb4OaxjlWpGj&(-}
zHNATFiDxCua=K=^n$BkpL5nB6*u{C}@tcqG!c|NO&z1)>wsZ9t_Ze4PE^PzZ+Ra9?
zWCW-xI?~iy`pz)<)6`bnbpgJrCtJCpZ^$-epY!@-;0uJjA8H3jQN+OO#d1t|1?wDa
z)Q902pnL5@f;NOd72+TRU3hBv*(KtCKebh?b{j_0btfSVW1$Aq`oVEjg5qi5WCywS
z0t|CwaS?XUjuTGh=@0qpG<=UfWDfZ|PZK5>8_;%m@^j@kSUD#8ohd%<pU0D69WK%5
zizLqv{S|j!XxSF}_kpHe$C+m;X~xaA7crg)eP0ZNT%!S%tE4aL8tBnqNmI@{_<T#b
z)xSM#eSJ4Qmf5N22JY2(fJ#`Id<_G|mI(9{VxDa=p~T-jr>Dd#H(rjoTy0P-ZK)Hz
zjF{~#(=m29U^`-Ee)kTfl=_7^a0Xo)KqFIo8l9J)=v#nJhl+E9qohpQ(o{j*<^%vo
zT!9&(r0)=y_t5QDSI2F;)+%#EDq-mWxf6Xbf6GwU!*<vN0n^3&c7(b4;YT7ZSayjc
zbakh?I)n-LEr7XJRy;xzT9>>D&}Q-4s`yj`fygj9W{r|8$^BYASlF4p&b|rT$h}_s
z3PSSi>n+mXK}5sXF2s2bsFVGO(YT_M@|d&4^pi__hKs<By@t34b}k|2;!@3sYzPx@
z_{em;f4Y2W!h(S3j0rolqwk>b_XrlxLQ$a6I*ZxrB@MZW81_of_ZFsupZ-S`cP}Uy
zmz^4YG<oE#QWe8=#u`rI`7XT!!AF3AuMCTwrYte-bAxfpw(Ll8O9=IvEs{%o)*U>c
z{8hV3rZni8WIrJ$@nL&tPA^kMY3+H9U|Cvx6xaeX5SQXg%YvK>-|auEBoA<qmq#ue
z*zZ3(tA#Pt!xGUDNmase%i>pvh&laZJ~D|-g~V{Q?G-R0?ME7r&PbXJd*$a;bXvDT
zB!f_qt&pB%7><S1E%hGe$w(?(9G?+{q;q|wqxBM7xuUl@hWGL$V~a&S=-;|%&E|ZF
zL)`5C$hg!#neMG9JGbX70C`^3p*|}P$tGkG0d3IDq966Z>VX!38c?K@p!Gydx>KyY
zC#?%n6+fw4E-gxi6(Y}YUZgV(?Sr9zpplv&`XapZ`<*WncR>T@?(J_n^mGF7m@Dk`
zi%EU?pKrL<(Csz^rfuDvyhZlluKQAaL+UF#{8qG{>iO5^x?Wo1pj7HWcqP|Xi){Z!
zmGM#RsgyJ@u@%GQm_kL_wgVcv5R1If<Q7q}ZL^TehMHj)Xc~sqQUv-%-?aPG%+cGK
z+Aw|XjxCU(3ncKETVB98sCYog{AK=}^0l<k*!@5qOCVlYj!_bSs<3!E97Uhp487Cx
zFYy|`m0s~V9h)1&IqMZCdC&7LscfqBP%VQcxmvH3n(~QrcBrMY7Px#Reaiv_$=&wP
zHI;~#Wf^iD3lpueIyP%9)m3OY)2#U)@`XCC?5@nYP`)l$zdoycVoqAx@Yi%OlMAyZ
zRj=D!Jvm#tcB1TDmoy0%gVG{A^Fu9}+)$*$Dva{;Lm85frVnlA$5bFfl{fX3?1-P5
z;eH1M7DvFPy09}LID}~MNs7t*pfeF6%S*V=;;Q60T8F@2J5mcU0s@!&sKSq20EzfV
zd@~n;b#WMQ1LOg(SjH;u!5coHNE-jTx$FL`OZ-8Q3z1X{=XIl!L>6P}liQM3fP0yd
zao!nvXNZ!J4X$qYjedX@BN23JOB>aTdNsf|@{G8R8MWZo6gzX#R|k~xV$qkYw|@3I
zQjm0ZjFKJ-qX>AOr+fm3t8=ODnh0TmJ4&hCW1EnXpJm1Ve>I6wg1(HlV$(S!tyi}E
z%QF1AO#R4*&c{Z8#02=X`D+r}={er5p<{S>Gdp<3C2_hnxekw5-Rzij7l}#o@U1m!
z@odX(`JW4LQf--=ySVMJgh%b+&hqdxV<PjNy$N#$|EV`()u-__GJ9Lb{e)hhcG2#h
zGP@-|iC^2koh<NM4Yx_C$g!csOJ1Dw?k<21dxuZIN7fQM3gTrfaNWH+D|k5RXs4cc
zn67xhN`q4BjlH&b^i-y2^h#}r_d;o5&O5zPadD-|yIQ`HRu||qbU2hm3V7l20-1Ph
zr-d8=$u!sf0kuF1mr>mpZ$2_w#J^MnXS0kdv!)3OzKeJ=N*AI7u8v4H2Urmh#`jLY
z!$1jz1t|3R7)W*=elHN2kY=o1XG>#eynL8kA2c5VFqPp9Q0YT@0;CA@(|IN{1Z{#P
zSnS3XG(q-^2Ysdqt&nnk=K9@8PojHd;XG58YLO1?BQ9|*7MNT}32seuNu}vzL&S{x
z=PU|mjYHkAxc<aDAD_X`U8K<IB}{-ru_~l{HAE7=+qJGWEQopIKYN73RYTi*%=*5{
zq8IX#rw!cj(fw#RzmUQuBXqW}@BeCkVLC5E8)|@m5bxppBCZ9R5_^|GSJRo$JuEzH
zYoMnOxK1GqH<ezauXE`Qg3CX|zgpsBlpt+Bq76){HgPn?`I`%1Gv$GC->%6BfD?w4
z!Si__;^?MzfykF6zN>C8Pco{-)nvh?p=)NI5bxe->$1`vbc^2lv70|dghp<K_zPJ~
z>2!ZPo#i;$)|^I<EyfiwHSUhqM0o20=Rf^q?{j9c<+_DsD+Yi_h3nU(Uk1~&cVPu7
zev(>dDVgQXC^9b}MB;YFyS_14K2}h7_a+`Uk5z4unRTJHyLZJ{CEDEh{$O_2X@tdq
zZgRd1O2&}gn2~Je15DaG_QFfF9*N@Kb^0KbcHH8SPP*(ePWSP>KmdNJTm@vgP^cX;
z;i|2otqVht%td;CXeuYQGgz=bC5SwGkb~Ri&1gZJLwo;H{+L}U7W!%%fnGdsVz=ws
z2;1R{8~8OKTwcvf@5LoPg&JFMzYfABlK_}vh`#)tEkE_Q-9&gDB19Z)WKNj7Uf}&o
zMbgiJ5c{-j9MG@^`>;E*_ffjp&_6k1^P!3~e5aj3Vt@O964a=j;1cx%`+N#bjdXPS
zfHTTIXcG4f^9^@lw@F6qA`qL;Ya;2Us2aHr5v^BdUDmN2uPIMsk&^gQB9rjv_+@o}
zPf($@dz<uSHb|8Tm65aW1-skBJ5Z>mZmTtLT$^Q+4X7<%xYbjy`0~P0y4NsWC0F@p
zmnrvkoDtn3*GlQi?ziGIDhU7EV;v8K3$Kr6dLNj*OwDn$p|O$(iMgFmyE#!*Txxln
zW)FY*%bmwV>l@FN&qRiXqM4h<7_4P_&P10TPCn*ozmgzb%fO6#AsJ^CnXw#1)=_(~
z^Y;DOiO%E|^BG(z9<@{Ip?-b>>pj;?<+izK!Nr>gM|TL?Z&9vTKF0Sngqq&jB|B=H
zIli0i)i?{6cY@Xy+2c5-9kwe5qns%GD(61>_2QzgP*K}CgtFS>yA0QOlofPj(VWrv
z;>TWNzTYypf=SWZ^mMb@e&Om@HGczKv-s=z7mR*EfYvirJEF&L?Gg1G(W4LCTGI1P
zesaz{J#;%QgsV`(M^dYiQBYtE-9YnqGQ)HvcvyqVwc`yafEoEBk}IPgM-&d_jfi$4
zll#{4K$;aIR0z+?nQL83le{70A+*u5h?3%ns#rSrg^uN%;o&#azJe|_rOrV+9y7XO
z{Dd~*41^i|0QlMV{Q^5O4f`Y7>P#X^J=-MjK8*}I8uTSf_&gb7Q#HmO4v7hy5FfpY
z60ZoYyD~Lt)$f@IAPwxnl3J$tSGlUJBGp4lWVkRc<H>!gZ23ZoHZ$yg_pN{6&w`^g
z!5OzU(Op$#Y(q_?gLPZ2OM1o+8gb#dV@xlC%l6|814;P~>2DUQ19U!uN_n6T%Pbi%
zp(z87)@x6F>{S;uE@tdLz9uUf;$ios)xD=``Qu9&4DLoj>6EgSY=U+I(OSdo?fPwT
z8Ou-)tWUGGbMmz+jI1x!8*f}*%f}=M1|%wSyL-`6NjPBCR93OJ!!>yQoaLRc5*EkA
zc<&d=*q@FNlgLyz?)O_b7+&1<8@z2&s|crb3da<`m<Z*Fsr?R|w(m?|2YFv#9;<yI
zUl1T3UxaHKj8Y?U;?i(oM5Z0g>KqFTDwaIDN)E2YH0O^g-2O%EOHbei3{)BQy++{9
zp`X-UZ^=7Fr?Q@NVx&{O1%+=KTUz3cG-%=V-_`-lAr>tkQ+C;_HPyG^S*R!;8_c{s
z<VO!JBoc5!eOf?|I*;We<O~(&Y!dilxZwok;DGt|^A)Z|hyIq`!2SEVZ}cK66!Rr8
zxi8U(dg~D2-FrqAvDqq<#8{Z1h%2Z3+T_bU|L%aaYj~Z&G({p~=kz0nSd7JEQ11aK
znIS~J?uLGX=t*(6<S@ewal9HZe+#Rg`Bz^S+fu;%=qHr{qV`4oSNT|-7@EYJ2in7d
zMuQ8i+-waONgSgmjJ@KUS^7hYti=!((G>+F@4qNJzHTU^V4dwbFCyFSDX^^=_yl8_
zq|iGv*i}@Ep+11N!RMQr%H(31$*=iT$AR|_HuJxTO_gu=Sf!d0V($t+TWAsbn(2Sh
zBAfI~=BbW3f=pv!3@GGi%@N=QH{AB5Gcw?6Ve^?ub7zi0rJ}Kpt2m5fzHK#k?ounC
zR{Z>}FLylc8*%Jt#!@!?Z(D5LeSW`9;Kk2=mN8zVlntUEI*X5r`{^}|QA}5ByfE5@
z-li)nw%jNFa)N8GKH|kfzE-k1Jd4;<JGi&J22K1g7_FvJa4gtfsm@1|i<7_@lxOQ(
zmXA0L!Ait<@jl{T7J2jasPvF3YW~8l(&q1(TC#?@Us828WKs<A!hWV6m1)(rzlc#6
z-eQ72x?nw>UgH8P!RMN3>Dx|o<_47c_|w{m0ADW9NjK*e_On;jlk6(iGWzX!A*R-J
z!Bz`(2!)vvm)c0kyQ3YKzuH?GCpzHWrI4vG4mn*zAA-cpQ*b{eg^^BH!nZ;l!dx9S
zhODf{(l|ZNig|frQqCB9dTjKb?_lk;5^(;+TJ$-R8M5;H>RGOHSovq?M<j)nWlJs{
zNR(RDWl{Td29z!~wUQuVak^7%F7!IJ)CC9BVlNc_Gw7`;Q|Ke*G>!Wdxo>^#aG89}
zUBpLsv(7D9{ml6lrVa<fD*8P%{8Kfs;`zR5P1nj_tngH|WH|fG`h5cUr(&*C(!m^;
zT?f;}`oT&f-kD=adJ_T*LQ|RxAI)(1OJBq@Lo8NRX8S48wi;5eJoJCB{`LU+RZo-%
z6CKAerWt#{pv+(U?8jc$I<EVQc%x>fL?~Q#j)NxEK&y7W58RQEG~NN{f`VYwa!R(Q
zMXmU<(f|&Eb#*SyOM`eB_TM`rk>L>s|BZEAYPI4|CKq~^#uqcLsIS#&zWzD#7?3AB
z6fovlST>1YGiyml-NT!nUGUq!erWZbtCh_M#EuVONEUSPqpbNn4a-kuOuc1PY83z0
zyG@bD0}7nIsSy{d%%7DVMH@OQ|LC3+#%S(MyTxz4ebHAv(Jhy#vw&!_+1Ih%qSRz2
zLSL#;p6De${gs((@$6MK>Pkvz;-v|~O*aq;{HVelY)gU&tmXKvGwZ?@r;>GuQT|G%
zuPFX%Qmfza&2gS!x#<JLU=gbI<r>M&@Vy8cfVfH!COI$Hp1MLy2a~OteW2Wn0z2VK
z)ZTll1AeB3X0X5?Kp1jw-R0Hfg=yoBXWtFp!N<&Vue4p^wXDE#6*YPxwyM_KcFHA9
z-BvlL+}nY|>Ahsmw7}jj+G3&^Wsif-eU+<=UQaZe8pdaQ|D}B6i2EggBJgC}jz0{?
z_>0{K=?^b-SjqKR%<>Q~UXnJYH82YZtJleRCixlJLu`YMFaPZWa}mWpMOmO!dBU9D
znQz;mWlk(qHCwvzA?o__Q;gMMf2kfWPr?k>5@=y=9B2nIaGOQU`7+=o?{HDP;Ober
zZV++(vuRrHubL0xl>9vnu;nhpBxq;GU3lNL3#?lkwEJ6JAEN3s2(B!mK1;zps2eiL
zzmjv-_*9%$R+*@Q!;!W(o|*IgLCwzOx?;=vT*nWTK{Z3v;TMaUHjj&1lk3*R7KU6b
zh6`sh2oCO|#t>speZZ^p3(TD_l6`xne#M^aH146XjPRJRjo_8BU-p*~5s1v^!;N3@
zU-oefhN^;s9$A1-$L?f=8$;hU0`!etM7V97{rwcT*Sux=MMXzqFEf*0bMk&gX+i#V
zZ6$ZKUb)Tfod_b9$$_QxQ8!hsa{MLhw_3ui%MuF!Q4ebHhro@Le0lkSU`XN;6u05d
z-|U6Yo67wHEI{s#ATyz+omK?en=-q!n9AOT7iQ`olFA%VxG$A8uL<{CG|wFf;k69K
z*MH^?{^1F;G$zlPR#fpf@zaMEZSl4^Z&^5d+x$8NMDLSfyD`t0`nFz~zl6f4Wjw*f
zr=*VOfX}=zUPWAUP{u<M&i~19xU&DypZr`&8xCQ^^_?seoCzVR9;G3Zof9*O*yA_8
zat6i?1Zy2=7YTmMwa0hX_UdwP=|Yr;N~Z!RJ&nt_seS*46?%=gsIAt=Oj)m|6r}%p
zIbhPPng8A>MKzv&V>dXo4Tstc6MfHT=0X^ub<bP`)x}gpJA(s;zFI*GPl97R4F(JB
z`^z?m5A=z;rWiNpb;Pz^sPhmvDrs%(xn*UKM?;QC!i!A!A|=DtuO5!qZd7Jcwm*Fm
zPHx6A5c`o(-**r0YGLjpB!5Ud6FBR!es%jwj(R2jJfrDC!u$tpznmJAv>@WM8lGA|
zS6A_sb{`!};xTr`xA!rus4?qe4s$^vXI$sn8ehjr^J!6tZ1m85B=5$$w|$15fzPNh
zY%{IIn8Y}fDWivJ95zj)@<H1=%hdL}kmc*f<)VoQFWpdRWm`%m+k5VK7y7%ColxWq
znAu3I9=$3~!*m@#qa(Pn*b{QQleDIpC!bxfMs~lYn&+L8h~8B_njYubvWu7BtU!rT
zlo;o@v~X`$F!V#Pm7$Ly)BbyJ5~iK^X=$Y}e&^}D0YkDGS@<oHv_bYuSVz6F7-{e0
zX)yg2ACtysQ143F5wcm+B7y6V?O8d&PzG&~or}{2a?ac6<E|3e3!g>xrUbSk<3#OZ
z4FBson#A`FeZIQhwk$O<YKgtm@#LHl3QarLzC?nZzC!R-yW@jJX8$%dp2XBidV(`Y
zPr5Pj=q^-Fr8{ixVmHV9ZFy^OYaPZh(A3|Ix}-~=lgSaY=x#+4ZW9A3S?C-Z(f!Hu
zuaE62bwvs|_Cz^(X)f@sElx<bcf4;em_07=tT#@MPvKvERA|^pgh_lASYfZ#1AhEr
zfopK14q<z;JMYH13gGRa`d{xq^mL}{JOC|(jZiaT$u{)r146!=05Kh&nyzD+9+aUI
zd9C!Ud=_<!G-(_;Mw06rU)x8qCR#q-A8a|Yn>N`3C16P&hIYzsscH}2Scu_ANVHdP
zzt;ugvOW+4P4C>p^D2*sW1j20bIyvi<Q{FiQVe$cQli!D5r`%CQk`uy6Yt_hV@7D4
zQj%>Al*rUl7yfe2o)dc{xG`)i;E?%ze-UV34rE1IrBt~T`3^6fb2qV1Rl`pjQf)pp
z0z08cftK7=_D-*>3_mKKx6lXFqX!!<vC{j{YpACVg^C;-Q$yxE847ASb&WA}Yhgjb
z!so2s_Z=sa+wmm08&B7{456c;i5u<#eC079A5yUCiO}cD&U6Oc0H<m^@Hnj5FyZcr
zAF4>*d==*zTuU<%+{lsJ7(WuhT8{6?-O99INTzAdRL+&u%))v0N`Qua)n3CumR1l2
zCO-ISU+f*86W`)8MRlzdWVW|j1p)RWncLygim^JJrraZF9G#{ia?E->^XAbUedoK4
zz3%OX$SE;9a-yUAhZ-kVV#OLy&WS9^Z5gzywPe~Fb14&YK_litHaCQhLSj&YvIwB>
zwF-7Mt;y9SyLfwjUv0v|T)#o}9doLTdM(KbT$8NMP{B`p^&pXcsQ&;XP^jAEd7XP<
z=H(nc8CGI)o=!Z#M-K<GN<9$~jOeaj38FbUqHo5rOl1<zr?n@+UglC8;xcX!@_zL0
zqN$#G`aMM9{hRT|`-d7Rn|X>U8|s;9Dte^z&9AVRjh`z`vy>s?QHYFii{!rF?N)V~
zz`53?3bWZ*{&O>8`r8{|;pDi88I7^p7AuQHwvP#8mp`x^B!z>1bNEE+Ar<x0@k|G~
z4;F0inOUi9nq*4v(qLp&!w5TkO`iobPe`Se`*x5MiFIGQT+BWRqL!jlt>wr{#WdXB
zEUmgY)VzDTjV*B>(~8IX1KMo7{my)23O}ppH&v@EU9XZQTx&DX&%`^ujr+azrr=eI
zAy)jBMpp7FW<_7Ku~RBiIDtb*$Qx2bDaHyNv6@7W@v7R$y1!_-j`P<@RAgv{o1C`J
z-$(sW8ctSnw=$B?W#e|+bkK-lpPVAD0anZOQ<;Xob;Km=wfBZTtcX0KWIcNuyUp_-
z6;avkwA+b891pOHrGgt5Cw?$7?kEi?Gr_l@$K;8|B~FjJgYB!ZBYV4z=4A(2Z$Z4a
z?YMNk-}T!<TRFlL0giZM<E7fFH_J5Q>gjD?cDZOu*|nPCRzJfO{ux44ZnEeGBkM~W
zrJFnIebI*;j~+k&{v}pjoGlcW){LF(a6{5CUfzP{!OC+xj6hGFngFS4Z;u@hV{u>|
zlJ#U%SK3jMR`<J=aiUxv>Olhy{lE_AAfdyAG|AMhw1?zU)Ug()d*okl*DOw~n9sRP
z**v5aw56VuO+XX$7Kaq_XQzT;Xb(8`kBuJsUSrSV95?z@J5;`_<pM-tlE%ArzL+|E
zNa5_hKjQKiO0K9tt32LT&zmvDLeJqn{C){LyzT003=RdPYrQnJX~h%BEQMiP5KZNk
za$r2m&rG_6E!kb%eo{I-Z_k$Zi)sQTgAv0lW~Y25tn|o2&jWAf*X_!=Ir;JTPwvjO
zy)0#nsGzQ>2o%KB75MBl9A5fmP14^JkLv;iXNAiVSC!7)-Eerk%qMV6a8#3;tTfya
z#5x#qc|X2Q;)yI)v`|C4XcPC1Y-26I!8FlN2KZ*LI;#fcK3j`d<mK4GlUxryF*Jn7
zXPF|YWgp3%S4#Hwis#7fuG21w)A{L#Qf|sU172oeSvqr+V};i;Xb8?DTM8a!(zz+p
zBkkw+K;1rhG9!;l9B)IP;iA`51v?aeh0j~ih3P3*_Ke{r%q4V|B_w`1=Q=@Y>?`=d
zIPh66hqKddN2VNQfheNyjEgy1P&!{jz5UbFKV2@S3_-XX8xEy*{jBcA1YQi_eo|J@
zI@sETdRxcrJVx-Pgv-})XhMhxdQmU<h88y>o`>2_#0_I)3}xS!SKhp3Ipk`=tX+7&
z6sjG5*~^!Xhd!R9H5x4EII=@}$7(O(v)=2;;bP@k5U=5F5w@)eMq|b?;BuvB?!lB@
z&=M}c)MGAk=@6yljDj0)q&F*pCda}59ZiFzT?`keP`?F^o8NsSoD5E)#Y6L{WJjaa
zkkHuB2Cp*7gzI4OGg^|sonCuAHmA(9JQ25zXQgP)IfrMl>2^jC65fqSZ!b;o8`BoB
z)D&#pjr#H<XBwQ-{zn&v*k;+;2pun^6%4YcWv`2c9a*(gDh+Y4tkMkB=8(DOvdo$E
z!ldW32JSi;^#Udi-oq$?>2!a4-pPY4P&?8*OvbK%^2yVfj@rxm$n?|6n{}}xh!o+6
z!^`uFC$rTxNEfq}_oyt`ZMInkq52@JbP?Xv<#YDr^sle`7H`^jT_*RW4R^XH!~qZ3
zqkJ^@6@@(=)f4N}j{}gm=wU@gy<n<JcaxyQ@hgr?L7THk>cb2fOX{DY*khr$xgF33
zS*_T@9--LVirHs8dkvFq8$!i3ftG0;&8iH`ygO{jq&A&(46n%`Ygx+T$9nQzyGASV
zxgzq9c%|RKr7E)VDn^$7UhOyz3R+@tl&LA;=HCwKVuXbA3g1!o@GkfHVx`+0PIPmX
zO<ERHEEn)#8TfF@pCw+1y%PWHH=B(3Bj2pqY3Gq+>Akc!uWsO!w)HzCC@Ft1;#!^6
zKU-6lD2SnX{`2*VW)`2IA}LhaIUg*$tdNdk)@b}^v|fZih;tbtgj(I}!qBjq3d-pt
zdM>Cuarssg>q%XXI`)z4<Hcu;sb$is3m5{ET)XdqTQaYDDXaDVaY!3Mt;%Mc)-;uh
z@?;R^R~V9F^>_3t@eG9>qu`XZ+PqCIHLCZx)t8H@mKN>NhbMa#K3u*cw5VRON`)sJ
z&aHd~(Xa0(WZh`7mCSZS6$^mvNpTQRdn3B3uR6bJe@I14k4eG}{Wdu48+&dpr4c<R
zo*FMz9_ip~Lt(v@*3!+Nt3NV6iSs7zel(H6s29q7;+aFwHrB0g4Q29Zu{LBh!k54P
zXk5%ek2NLNCfm?QeP}<*ZyMpS*OQ+CI#<i*px3Gr$Xbj1iyUQ66ymTc#d^1Y;fhC(
z2+DR`^$Bh;z8g!oV8IzsMC`9D%%=q(MVsuejdFZzR@b@1JR<?Jcfar-F=6qB+wOq`
z-^UBH4A-$rafR@DWiKyh#+FPWbiV<TwGx~0tzNF+TRmm=Bp0XX*w!c#1s|usC8SuH
zY@0+|=as@pEASAGBYhztn(%PQ_4MF1t_n@M`C^ED`KK!z0!;d!dU9#1T%NCQv#itq
zv{{<9JUh8#wU~PvB3X{vUVFH6)O<FjQ6dmmXPhA8HB_~G-KJx=T}_)bKI{M~=BBp^
zQ&91DW5j%gFZ$U`No5=c)NV0ZSaG#3Fdp2}rbzfOZ7Y$)Wan-6I?QK+ruB13+rh?t
zQt5N;4Zpy2_S-b(rp!vqA;L!qa;iOS(tMftxZ*vtzUSq>)g;#n{KD=?Ny?Y%cE&Ge
zjjXoUVz9hgn;3C+EADu<Wr0=O0nwbwC8=%z{bfX3=m~t<h3DhUP3cSMnO!&+P%rxu
z1Vp_P<9tqj_h;%FlC$-Y?s>cuj~yXa@9Qg^!48R`H+h5#tALk5#oShlietjLJl;y4
zfEk8W>V0OoMd6Bje2-AHwxsiyRnoMq_mu_=VktFMX2K)8&!CgAeE~^PB@beH$KJNd
zqjuC7JCEw>FS%W*$)^d4F&r!0KX2gNNy}wes*IWJwq@vUm96!p<8$vpt<`*j_|PY>
zR%eu`;Y3c2`c7ImZZ-OE^&ua+PCvJaaeNnTSs4*_`ud@JIctc8G#%*hw8`*Nc}V|V
zcDZ2XIByn?p=qRQB%Wbh@O=;MQ|hM&=kuF{)3)wpnDF^skSoDOZ6j+(l~_bFm;=S_
zO5gO8sKqQ#^5@_4X0ohA?YcDWtZ(o#qv6wJIgBPe@mikL{2*{?75_BL+0NOJ@fr+0
zDXh?3u&vfsjD}`dTaJa-<`QR1LGzvhEOLdq!?NRcX46S$O6mj{%+44Rnj0!mnNU0N
zFdI?Wo^MffunT^!u9Ylj<Dtod@w7mlP@UE~_C2{lt@x+VZ*HaRbSR_Jw93=kn-(>#
zoll*e$I3V7cG%fua41Th_mLqRHO$8J-%^OWlq8JOS>FxDxn2Lzn<hF+%tx}uMifme
zecQ?K?rn*80DM$7&Ehbzb2GkWbe{KnfBFG&{mgMQPhv#Lr~zPIK>)Mj9vEqG*>_Zm
zcj?C3R}JF^>$bZ&jEjeIFgQ0qYm^*1q#tQ{dX5g?CiioT9oAe-->{Zz-2k^D81GMW
zoR~&8=MVV!j(8KYqMs*wNr3ac`bnuL&&0RSw^|LovL-ueoMchB9uhR=5TXaDq91>I
zxJz(f@LA9kp80$9gb4yY+>Z?o?CYY2y}%Dkk<)E=eSw5R8guGZ@E~FOI5Fw8rQ3Nn
zIiB~Q0kqs}A{`qAa|wg7)rOidBR{gzmix=lvf&zySEzT}tB$J}BvT&<4VGz8T$D8(
zvEjzk?)2fpzBZB9#B+tyc;%kyAWoc$QyU6T7Hj9=*4x3S!@)Tl738ePUSOk@UP<oT
zJBZjnTcsHHtcY$M4+8|Tit3jlZoPKQ-KwN^cFSXSMEq|i!$Xu(SMsxy#xX;t4U+NG
zntXTl$_!F1{wQhozQE3^q_#WqJb^Jr4~c?r31E#|W1`!XX{%2;*9#Cef!goG$aqMs
zkjWQ?anU%d#tt%SQmr{+FZ_o_%wIRCTzEQWbi#<W=GEF-coP^Lqc~E+ZWvh1cGw5I
z<T5@S#xhkKx}f$Zu&FG`oW?==T*vt&lbz|Z^Auz^GOW2eYDw+Iwm&yybHosboYY(9
zZw$$6J5;l?!E6V(fr}!vcS3)4=c*zhYe9=*&UR5hpsViAt?Rb3Q#KbTzexRA5NtyJ
zkfqpuU=^ose?rWb?>!E#F!yMV#+XMR*bjN;?<iV9|KXEKSp?aN*<CvxI#S*)y81Tg
zzve?@jXZHT&LhSNxMq(6KGrbS_AuKBo;dX};gI&}B8f^hBJF~hr`hdBr9A|1D`yGf
zE1r4d@Y>2^y`7knqFt8Y;A)9YO((*CxaOx6C_d_u$iyENKpuB<Zn4Keg=%Ro)-Ffq
zCTdaCQ>%)vt_(G0d|*V&JxUx~Ut{u3KH~PqrEJ7p4;bOr^MOvXHz|b(zvgeY1}*^_
z_S{OkZRO^bM9v>lbv$ZOwPcoA>~caa0V7pUzuXP3B(P<FlIftZ`Tx=N=HXPgUHdrO
zmZ{jLB=bCmka?cREutu7Oft01V`UpMl?Wws#z-QOF*6}U$dt5^j2S93`K?{|^Ssad
zyua^p?86^!x<5YGb**cybDirvt&6MS_%}qKPmK_6bW7*XYrr$c(Y-zv0Uc&-KieI~
zPUeXzjRhPFdo^}bd*jVoC->=5cIDtSFEkY;qd52T0L7rKU#?TLgLm3DWBUWPWb+x1
zhu@GPo}(2OskMv`^$~aO@J!Sd5j|0VUTh;yFSnDT;wUuazJGeJ{K>R>*Ez0>-H~CX
z^w=ACEsM(gDHHz^P})GQ81kKzpx#E6ZdTs-a=qXt^fSyce4A#2+eFN^?v01Oe}o^m
zN5%XSL9JUlkCuulzk2Aq$UMWKcV~u!FIkvO2wjzJ^B%!6@^*N%9Zvg3vCWtZn=VqR
zeyFpi?1vICv!tb~DOrFlH^q@<gHy4;sp13jSuTq2<^q2KPcgt*e(YTgxn<fe%a03L
zoktyxk1#dM7{Lrb_o+MM8m-N;BIYFQpHD@4uV88@yhzV<n~hmVt}lO8s+OrP6#W+$
zK;e_~U~++wd-yA5KQ}Srqu*MGZs<GGR(RSwC1O)}Wh0<&<fS(BWXN(USW4IYHO88U
z^-4GQ^1kaVr>~`Yy`pqmW8iLRyc)MV4>BZcFOv5yNLJ+DnF4iPGzH(_gZWCSS^KR{
zl%tpb<O7?N$B&#=3KP*=&|`gHt%T0jJgh#N7JN7DEsvs>@T(ds%AdnMc4~~M(S7KB
zvD)4pLQYY;;_rsB3cKHnG5A*Jr!|U#Ac|#LPQEXM-9AmH3;U{tNosx!`W&gV7Y!6n
ze7Qw&Zpx8iXTjU!>0Oh}C=gh|Y0$@tifp=_$7u`l3)9Hj&e0@bCdi(W7A8M5IG&BQ
ziHe}S65PZc+m&@ECYFqnLeqC@n_GigWPPrm^L@3)Z2g6RU(bOXZ?WWEr?jV$m1wRh
z(bY1AdU}G858aV0ZXWg1Y&{(rOS8|Os+Rv592Vy7E~ffcgmkw_JByusEz19v+`30s
z`Q$h$Qiv4&u;BR5W!q_nWv!!{Eu+p2;qtO2yVf3BisyLY*$MiWlQ7z;xU4f7lcMub
zy!ZB^4NZ@OR2ELD+GLJ%CZ<-@cl|AqsilZ@N5t*6<evY(6i=mrmeecy*_`BA*O;;o
z)KxoivuAQxq@r8!rC+8OB?*JfJ=&A?iJQx#{k!EoilZX$u2HNimwKDmzx3ezjLu4O
z_!(VE8C1cWk~-nu3H`T>ax#q|OlEm&^6Azo(rDV9^|uGm(sp$17~h^AAWOU0ym@+b
znop{6MUu2sUAUSQX*8C|n{n>gl{Okz5;v9bVE}jPf8VQaldj^wx9v6YuHysn8|#zC
zf5ke{tCEJk;KwHPd|-c{{P+u%RJtXJ=Yqr?CKONkZ~A-LY3|l4OY1X|lsDuIts|+T
zYAutI+$+0{wx3{TO3|;zrj0Dci_ts0fK)NGIhfcl_-ZwaG>L)d7d*rJ$`G1ng<<@s
z?nti3LC?9dH`761{DygZpXuKV2Ax=G((rLwPNKVPD~=d~Day9VsLI1AIrBctwr{;Y
z&9&oiLR808ojSr~8ICyG>|Ta#%v}Ap_}(yam1Rs#gyCwYJa{qq)`foggja8AN~xiA
z_WQmc*^3Pj2_2Vzcu&;pY|GT0<8|5(<;-}c5yp2aij04X+QTsJ_whk9sFVdrZG`S4
zf)92!7%$Z@*QaP?PqaUJAnJbgL%O8fqtC)i>JH)WLPW2h)VJUkOqzJKC+iro&~lOo
zM;_Kgf#BkmBsE%~I^sa)uo!QB5ot4cSN#PvdNOyA_AyWOER)LZO*(T<9^avArb}e;
zY%HKJZ~Hu^lyT}J>j&DC#m4RmQ8ni24&qU<wDynnG31FuPojcDLxoP#>H5u)XA&|$
zJ0BMrZEjAN3A$0ETD+xO$j6fA_mqoo@7Yj{crM1?InjSIPS^NRvh$KtLMm<ATd#!W
z;bZ;BbA+t(JXvBT-S9I_=kPO46H&#=@b|)JQ!m7jP+sxD9l5h2J_cvRW)EY+XpON=
zVcT~Nr9Tf}5mku~vkCVr#1E+Ywi1rgqxm8=2k#R|abnC)y!Csi_f-F}h=u#}NeaL5
zG9v!lS7f-uDLlpe*%W}nRgt<6Dr*6JWnoOrucNnh<Yf~2o6~H{ut*8z8SsCKu~p}P
zZ*|vC&g3rMXmD03Pu%7L?<tW;f=jj6rOL)gxnwr_>Ku~1gtt{OiAI6sjTs{4W)CG@
zoNrTk<a2%x7DJVL2r8c{x&MLnF|B*NeB?LR4~%wPnTjmgx)S0?m!y`~+>;3S-_$J%
zLXW|YI%<NOQqG%x`FFCps29aI!)BXXL7<^0a@~vCjHF$_G|p=u2IH7ky<l=KATl*K
zbeCjP>{3;Pu4U&o+n2|>>H9JoJ<NiqRJV662{8O})qa{msJ1ZyH8<+U{`6ZXDLW?w
zteq)ltWHtea<Vt}=B22qxRR{V{`&9$u;eNIjb#0Si(7RvQEc=uI=B`7#kkp;W$~Ju
zt->h#vw7L_ZSoYNx{d%KDl&?szmH7`;`3M~t4i1v{%I%Q0ObD~g)<Q~yy%4s{k-Bd
zJ9OU|FP!-U@Ss9)zO(a|n(g;eEjSRmq!^k6O<nnO^rJa#*Ivb5??87Y=dA|!th<tY
zpV>W1S9wkB*1W%N7y2$%v-{kLdFwg=yT=lBMje#=&rm$B*7WEFUzPWu(rb(<XNX-+
z$1DFbc(t_VJ^tcz@<y0z3cG~BtOns>foN1R`5Drlq7NYSR~)Z`;fjwCXi+|E+Ak_&
zCX;mI)t@3)C4*L{F2VA7)mP@bQJ=)#%^6Q0hKDJ8IKM(QaC?m9pQs<&P-?|eOx?`%
zpCK#sLh%)id&dCrHQHhSmrR;Pv!vzZV<I7$S|t!gXXIZ=#Qlz}zdCWaH59w=n1fWR
zW+_+goX!-(8VyvQ8+!)qbe?6a2kNrS@nm<Gcd2UotI?l6^%J_;OOPf|A2AtIaFziR
zGl)qEc;KPVQ}s35jUE#o`Mp1uy=9r$->HV1J!7A%UG`S)SMH~eGnG-EKE-lU%I}h%
z<kShE@^z}6t6N>t<*F~a^i<`uBHa?^BP!|>UFpXf(}#qd-@oB!tYUsx#ZuTl4OP#>
zvl1`*R1G|CePg`bALW5r`#Gc+Av+eol)R>S6z^=VtaB&k`gmq!+0A4Z_Ufv6`jVe-
zK<+hH?+PxA;26hoMkZGVr6aYgIQ2T}5{;VqrWn~mmMRza7Ta{Pp<_X;M8}S=Pm1S;
z21t9QfFd*hD-fFRA$}j`kH#A|`82p#>vPE8dZe#_mYwKeUIEtnFUqeKZ(0=mVIFWw
zHTc_59)(wQ99}MIULxaBZuh*Fiiqp@q#p13hsBvm*2`8rh)OLS1b}sBBbWzmxV)UY
zcfkQ6+*i)O56lpz;fZ>0JR#m9agI}(%XpK2K$bBkr(qyrnOotpFXr^gZ=?@ob{Tx_
z9(J8w3gB%LW|a^T&K87fHAWv;Rx>}$zW<FyphJOhZda@nnPE9=r|~FOS82BCE>+qo
zupf$hahHgPd4%*BlV}ZVa5gKN&FiwFh>Y&dx=nUd+pzHa{NakOD(sTw?TW*_TB-UW
zt@0Te4=0o`^t|iSZ^8a(vrRYs-j?)e=q(N((<D-2lFR8)2bhe7Yl#WH9X3d50$sia
zV6Jv>7;MTKU=%s?N#e9rw>eBw#z_WIHxf!YUr>0N=e9^ad+6ITw;CS%IwSM_NiUuk
zSLc2|2oX}u5&c$9tg<P5YCwVuk*o#<hPwRnq2I^k7#fl-pw!pev@Xz?m)*Xn)573h
zHo~^u(5--c{Me@2l4_Tk1LZPgj3wwm(_g+$!nSH8cj;Spa~ZLc43)`US5n&+7Nimw
zDpdG2$#!=QG*b1Qs}k$*T_^Lpb#JC)ia;c6rALGKCuMs0%r`f1sNH^LS-p23#r#3)
z))p0IA_uat)#kE9BTLkGU6yZ=`fcq_!;wTuE=tQ+nRu!=U0MN<yj1pu+AJwE#0C9r
z)Z~Gh3t8}>vU)853!s{z0tQqz95wNx+Ig>9E@obcnj=lxd-h2-x~m7l$?Cc|Yu(ZP
z;o|2D$R%vZne}PMy9|CgQw$|bwg(@z{}hQZF=}&DZNm!PZTs8j*4^qt*3fppK@RN<
zlepO?-a)u!zGg(BQ*%IvS2e0xK0ZTEVD)z}y8FzaAzw6J?V2{dI@DC(^<wx2Za6?}
zQ>}~=E%}K_aZ76iRIz6bIIhKZ$waeET5{%}?$cBqy;ygMpG%mZSc(*C$M*2>_}?7N
zqTgpZWlY7CS&i@tWSZxcpE0TDVRULdzBW=l;YlE3*hGadp&NdRXZlGMhC;Br%Fp6^
z`W=U3>!7D1=$W5bvk9aUBDikGfsm!q;avH!ws6$s)9VK+vts?{@=;h60{eiaDvIAA
zYWgaJff0R4tfVoieK3j3k;F*vEaj)#wlWX9Ic|fMngV?>M~dK-AhDOP_NnXH7*C`e
z)tDsKOw8#~xX3$jl4_sI<W6}hCK{90FV6Zt5%r0i%rXRHA4C34Au4N!VdJ9t*w4#I
zV;~=Q@yAt@L>4|1M&y~PPqIpV*{TraUd-yId#sW)P&h)>itK`Z@8rBe<=?ruVOc`7
zfi-ArTZ|OymX}@EW3^8A&{@p?!&AbGuLhp{7laf@HkNZ2SEL?9l+7AZm%cKG6id@^
z4p&Ow`3^lgJg4_P&;-(#2tC~Vr>DYF5PYm?9zHTDG=@_zbv0IQTm|nX_Cztr_u&=E
zBn}p$!lGOgbV6V1g_+yz1ga`?B$Tg7uY{(<=%JfV2I?L@IV@w;oFGdN(8;X?7*@OL
zv7D|kHQlSR!k!vsO4(-d>N(7-PY=qHn2T)M>8?%6j6dKf-?%TdGR(O<`K6MO-i$2`
zt&m&wQ)T`1WSs6awL@1?(5H5REZ@<el{CA<KNp^_CF`*lmGSZCln%d5Dz+k3-kqup
zd#$m1%?cMpV;sZsz8}MLoR#JlxRlg;8`xrZc;1tB%qhL|WBC$bJYCzDO!Kjn$aVl&
zGz+pvHQ~L`baL-t?lUV?Z0vWcr*@b|G13`dVEfsCcWlVLamBSaQf3BoU0M)h#F<F2
z_^BndhAc;L@*we8kz1mS3vZF=g4E(;KM!A7)$vUU>9^2~<+|DPbiVp^o$ka7@dyr_
z9JfmE=8siN8tj8X61s1;X87wKJ|Fc7C<Zyy@1C-@ZsG0VAC%4L>M}$s9xQiDUumEl
z1|d1D8lF!SFKPBy4=;o?DNZG&y0cQpLX;xO_80Zr`n{#+nb7ryA)Itsuliq<NX~nB
zD(a^^JEEvoYW%g;N3=+@gIUA(p`It6Ae;0=tyPC^OAlU6xbYlD%ZolGj1WG$t^D)7
zM5Y+Opw5-1Yb16|Q?lbt6|x@ub0B#6t5{VMRO|D`?q}t(gma|vMm-)2P7`<38~HZ2
zKHbVR$XZ7B?(n-xN;pm^E6Lfz?KVha)Q6o@W!*XlO}JlsiNlOB%N|U)b*h<X_!(ob
z_PLBKZx<O74dNEA$p}os<r_u$GqpCDunCYsX6i_rmvW~Stump%&W_shOGUg*D^~v2
z!uw}-{H>SO`gY%uqW9tPNR9R*l#2QCo|0-FHe0s)iRe8JpB9#FwM$|Q<)5u)+UkZG
zKR$v9(Q`dy57s_C!{je`MyH1Pu;-|9tK+3DrkI;4+i<Fn8JX=-lEk^^5p3!AbuMiJ
zS(Wh=GK?<wy=K_79);ZP1OEy@PvniE9=VxWhfa~tyYnH0kVv60o!cuBcOSDS3zR5{
z)m}?JhD+)uPtV>wepgXhJ?S=hI%q2~fLQwjyol&Z7OBaSXS|BB1=3QjKM>^ur`n?b
z>1D1zGG6CNkpAOM<h0NItS_QA%ES!2h5l%*bMpXRVq$_fO8T>BlGVX=Z1!?WiJym2
zk#;Z7x`&(2A@B*U^@yc7@lGJJ#2i&!(mrSHcqcD~I+wMaO81)zz|Wya8&Qjzmo>%f
zk=jy4>RGBbq6p93oSGDdd4A&d{-Cw`fS<>iWbU6n=Q+|l?|QeKa10D12KHym)GwbM
zJfAmy9I}MQdQ+$L)%jYyTiPtED~dxa=T;uE3rO7Z-Ss546Jfq51!&uVFXv1ExDLIC
z&h(<2@VJ|1{1(tQf=X2ZYH67EmPN2<3QWd=6P}~gl)t^v7;Umi%SVFMO&sWH^kv9L
z#`&p|RQCx|(C#8NiB&IE$+hOL#;ANC3x{S!kW-=jpk=orKtnv$Ew~oi`2mKK<N%7k
z=-|j&1fc|4%$Hr*H`?*QvsBi+j1noBXGN}+-byYK8kG@-oi!X-6x(6^(nFdR)~wL>
zEi5X0?XwMThqnGwdQ;@CQPLGkA1}5x{<rI7B%RqrxjnK`yAg`pb+}Lpg^e<n_gBEt
zt9oFUY7Xz!;RS5?R49!U3%~e$t9?Hkij|4k<!fR4HtV5!2bG)9B6R9~{&!hd%NVn7
z&M*PtH*$vBdA#2n#N>NHEtYOn;g<C=gspSkja_+2p~Mxg*XIAzPx>n#^A(?21(PiW
z{)5x<D-s5I+<|%G%=H!}x4!=IXC_H;A^NzmPX;OM1*gt%`Lw}S$l_6&wvN)1sqFn0
zM;S?XSBVJ=+OwO?J~|B4bc1E%ZP5qMni8VCSzNpE;@sBV6dI~|KX7fqrec}2d}v_<
zZBzX&vTCV^t{$pqZVDzdSA$qyhkqRBRxp{^fh!36{+e70efp(&y|d%o{DwZ)bv!%M
zu91lPn>84$9OHXj7DaQZEnj0Mh=!v?h{weI#)aFgeAQK&XN^M*khOWO<nUL^C(q+&
z+wzOakIm!)ceZj`+XFaNid8+vWlBn|`7c2p?S%3*O10vP$~t`VfV1B7#k?Eyb$>n2
z{{AX{v`}<k#k`43RUDCFN#v1~@xlg7H>6}XcA70u@o|2$RsA5}KL@hSE+Cjsy2jGQ
zbIM7{DK^K-x#u7#BH!Qcw)Tz78rSj4r3)f`x~Y-#dq}#Y+Ca~r0(DK=S6{W?$@U@n
z=e^^vR_p{Gx%{i*fj^4pKOrAR1uZqB>rJ%BE5{=~QN$kPAD5y8>J+;57)hVJ>orWy
zlv2KBx75ew1{qs)H_fN-g(wAJQBeH^oQFO~@T0L7Ob}L0?DNm!7Dmpgv-dq|u6!!=
z)casPu`j{Y&=$Mg2Ne1b?A}@mKS3)xs7T!7dGju}ASS9=fLGX(Qo+~atq^qyQd3E0
zHt}xE>kQwv{DqtYEhe`l4(4Vn6!kqJysLv3UYQ{Hs~(FlRLOyeCpPSNRd;5!hqoiv
zp3y$~P0PprE#l8DsEFC6D2VgfY@fTK&Z8s*U6cXeM)E0Arwka}7cTte)*NUlFsO5f
z=@I2|Ra9xXj6AO{aYy|)I&oRA`R)a-8Og^s@mn*K!v%&+Mp5^f?8Hz9_zC5WMjJt6
zDTHmyi_72%p@E}42;ID68+s&o<T#$`_1XNN6cwMgWb5Jeh4P6Cul<wl@L=<1Sf%pl
z<~VRPJ9tkAyuzHqi9su&Py$NjytSx_9lcNIp%oC9O^b8xGP|}blVJ4l#C-S4lgm7a
zIX=|fMC;oea2k9-J!6*s09<cbEXZ=G2l*oMU%VF8x~F!!2;#cvE9*<eyraZ20y8;7
z<@Va(=E5uiQ*wW#JaS0S>YSx#$g_Xdu8UV72fIvkM}GAj&YSkQMkCDXn~$79jk8yg
zb>K|~z@RkmE?dN{OE`ltd+rwSLZ-UqyP<JCqQYJbKS)zzTKK%Q_J&sR`Hm<$J0zuw
z45215<->K3GYp`6t(gA;`h4}<uIp@Gr!$kENxl^EYg41hv^uWK(K#6_tLPQ<+c!L4
zor{>Es5%P9>z{!9vFNNHa#|UE-`1Xf`(W|sAc~ZBcX-hDG{_+S6kZhhdvpBr_568+
zG3OqC>~yG<TJpcu8{25gq_b99Z<=PWt(xD{6osw+f%?+-6EZLEz<E!B2Nv|@k}OGd
z#g(6}hRtwFilOqXY2|~kifLoctjqFet*>wuPML}mKKAF^*>|{Ot<Ur(|1?9o3udsn
zycDH+x(II2ep<~oV!PPv>!co}?q_PW245Ffzhr-Pq622Orf^MRbv;mHEs^=c)m(Vb
z^B@@1#(QsY7PX;@kwlar6m-4yLCae{m>Z&p(biA?bCzC2gBlU}s_J=R?|4t)twY9P
zFj@TeU8b*iyOni`Ho2%&(sXQ&NmxD$Rrp14M##VqcGpYZZ2n~<Lc+GB?QoI5P%qrZ
zb|$0xZT#uue$Q;!hnIjL($T;2%f(}J>E{ppyBtWpoRaou-?lo=pL|IiH5VZ1bLl^W
zLm)4RHHXU1SL&Mg8{ORl_a}Ua&_!~u7Ug}2a*tiklj8H%oy$fMl49&K68G~ygC4VO
z?!#WCLF+Q9w=h^vZg(FL=F?9<d2SSKG+){#*K69WbBOh`oMEkD|0Z=DFNlp-_QD74
ziJPDksV<kt^T?dD^x1aJnRVICVIYEo4oI}Eo2{W`1P~OZt6y`InW$O8XrG1Vn@Lao
zu!>$tujWhc_FoP9ey)(z30U(MIR&5(H-%?xpYY*WuUzIO1}Z-hQ}>v24+b@p`s|k}
z39ElLXnkm(USiK5gNd8;4<v&x5`KStfJ=R<&z%%*KPVn$k9qI>At!=k?jmS!SBT8*
z*1>X9aNfQacWm_aS>Vr!@|3c`TtuA(`1gpnImsO0e&JQVZ3S4J11gs<fGE|$ywMpT
z@BW>(_!Dr2@k5(W|M>>Co7X1mbdr~m<AYH!R!B<D#J(m@SGgD!Vi@xJL>~Ec4PHn>
zk&r|{PzKwF7rnlh{gDG(?XBh$rD1gvYl@t)40>iTlWD3$%fJezjx}(lzlTft@y_YV
z#ai#h8M2_uZ`?C><j5xwSuj@o1iyv6lBNl+&?HdVU7m0&9E`a^idF_icjw|N6Ms;S
zIF?{!J|@r!Bl?NQd5^LhBnTUCEPXl}Jui8k^}4@~fEPKVV8czp6X6=+IdtNMsu5F<
z^-MUT-<6zy=+GU1hcQ><?sd*&UV2^U^GD0p{dIRh2EF7~pVB<wFs!R<K0X1NU&ez@
zWL5T!FQ@yw+^48gzk6JumZjxtUhFFhYrKqN4K%!ya%l#`)T+ai%!?3<9k}Dy+acE%
zgv{HZnpoiN%1km0-FCyUN^B`se*iCqaF+Isw2BPzHf!F_V7t~uM!=p>G6Hj_G1%e@
zg>lb!!)nDM+2dbXCK+rlQyNyE?9;!uj>32jkXO8mL}abMSFTDz>V*8yCx-MquC+@A
z!Tsmc%xiPRLCxp5(ND9RJ!kt{_3YT!3YUyU(+%iO09#$8t5ju56TywrN1!>`YIoW^
z<;sQ_QDSaQOD;T#5}cE&r>3Y?mUyGI*lo<~d|#AYuGEu#sgmviDwYD;a-{sY^Ygs)
zk1zCiWBXyNIxY-6UBWY>p1)4naU##dE`5Cdn1&(W^NmL_lXF#k>PNSF=$?538c<lz
zOeqH2{cY_rHFna_6+%_9aE^<o#XnDXx?M3C#eM>&bg6Y9jFp)Mo()==YPfQXY8NjD
zn6)Ws?XT3lJCApOkZUlj-D)MIUNvwTDM`5Tv6xY`bAKNajh^cU30m|yWywO9x<TN}
z0m4@@$n046o_PP+l+Ya$CumjcUHk31q%&0g#-qSlIwR0~Wdh-^+Ds-B9o(`Mx)$5i
znJJB8quaZNWNRYM8g(Tgi~sjW<sH0d+E;}3Ub=r?zMW>1ckN{6VB}1u4VonVVO7?t
z?IlSMv7nxd2?S(cV?&r29tMBx_0(l|MS6Ftzy9(XJt*x|^N9SD4qw$JhMklS+7>#Q
z#uDp>%Yqk6_~EfHBDkVtJjwEG6v1nG@Fd|VQ{I3aC_@Zc$5H^Lo=Ua*QuS_%hVU3$
zBx$}zG)<=bA;-s)-J(~jO6;$I6S;n~4@y#ocyAq&VzSBb$!h+m%)|Qt$$*qMsoQ$v
z84g-&66GYP8YU%BLDVz1C$CnU_Q@0Sih-BW1&6UPTn}zsog4GOs?CG%b>}a@hGmEv
zLFrVQRa`SHvZ|}1QMFZaq(RX%f2N(eQ%!TWMVmeO9m(Zn)3y8An~xA#+4o65ssFPV
z|L#Z;cyF@^8c_~H8g}?#5vT#_;&pY7c17Y-QNqQ;Lqd3^ydO~aIU3X4grAC%CzN*Q
z_Hb(Zok-BGxRF6r+eEVX{Md=pUzt#OjZ09>ncPC<!eEz~oZo6mdd#VDY8AmDh|pHi
znSy4;N$gSBWlE$B<O-g`%taOPfiG0Uj_E}aT`FvkWe|$w$P?42cV~Y5YMFj520_SI
zsQK$V$Ow|&f5*25nn6SpuViz=jOeBiD@6af{2XK{Bg3^`*El|P*8bfAI_#y$m45pS
zRr51Vv*M4we|iHtTi>yC^g2aNfBsSfvIR~yB_eU2cgwj3P28(z-AV^*_);rW7<OPp
z-g0kyZOz4LS_bqL*7zT@%haXAgzexnV=mpwN86QH>QdMFz6?;Bd-D!@whC)=U}hr(
z)Ry&n%af;A5JehFwjPl}QB9rPd(nNbZH)diCg`vqAy<?=EtErzafx?UOR)ch3lnP0
ztHTb($*~i^CmE$&%?B4ZzyO;99Uy`KZFVNao|cEXRNSjYp*0S|H%R_f4}HPf(M3cS
z$cgH}{Aund+pYIHILL+{<Fy3TNSrHxp~V9v?E8Fk<?+{Pbv7FmkE4|WV1J`eUck|Q
zmnII<{*Ubte+NOg^EK<N3;qS+cY^_Giq!L!?X}N!(up(vZiJ-oGZjBw?|BR8y};GC
z!GxFF1mSJBpqrAasF}2n(?kgo#stmC{fItP0ssw3DR*%acJAQItI936e%*NgFD^hd
z2cZl<-k<6R2sU1u+z0)GcOY8G1(Ay30DZaKuKivQ?!*R9eD@0>SS0e`8Tp&zUVk3x
zp9e$HW;gM*g>@Dr{s{pZaVl~iMgw=)?@fj{j6VfiGkvhV&+y)c(J^7X8SX37DJ=-A
zeuzDZP!T1MEb5K#jRI%LhdkT8l-OB7<gX$|j*6k3kmy(&?XjR2q$OA0pm7Uf{o#Pg
zTpW3MHA~|g1iQx|K6Me41q+ah+RPrv#RQcR<lwDHsvm?I^%TJdOjO?MU4f+=TL1B+
zgf!=o{Oi*i|9Yc16lq<%A4ccoTOL3J-3G~KVDP_-)=0s4r=y%0&g4^`Yzij+A7tOp
zznZq75v#E{!KxtBC2Mr`!bZbGp7oF~zF-sc4y;x#40nSES`VO1L{m4U`i&5%sh4ma
zY^{uQfm~HGAaoigvZXuYQX}*%2;S+Uk9F*YV+Booa$fO@|D5(eFEfgokUZE<a%@HL
z&w*0OCd-rC?KCV>p~@)txO(TXG9A-GM)vZY>)3fvX<~cZSJn=XiRHkJ)NqTPxTdbt
zWRKYTB9<;?D6qP5DljCL3Cxn4Dm!DqeBvG0wJ_pNRyf0N81uyIeWe{<`w81`?4$Ri
zWNMb+oV}AajsJNx4mee?53c4}Q2leb^LTMzr)Y=Sh-5Pt$zT9f6SNm=awq^)Jp;Ba
zdUv>9{@JH=Nbh+pd)D!Jm4_jZk-vH`6wLj2=q}Tvwl@U8%-d%hX2qrnApAA>lqbEl
z>tdqH0$I4>S9Pq>LLoE23x7i){wqow!Ha#cc+<n7Rq5~6iVwMpOrH#@c3vJO{aTm_
ziP{x6IRF$dfbX(+Gu2y&TKz_j!pU$zrH}?`+H_$Z*LLhb8Q*&0kowcWmZ5R+ESPXC
zf)bKiCkO&@ffB)ew&PnV@bf5S3-^z%=S_Uc7anwt^PzG(wf^TugMF<+ioeuCRqbg1
zhBQv2rms`#9!6gyGg!Fs$FqcNK}3l^iZiRk<3~pbZFV&uj>xp>+ZppRm3YKQR%k8M
z;3pD^UFo0m{)yZ}a=mtTj>Y5%F7n_|F>=@cjG+W%4mu%-tmD`X&wuvsdP+!Q&;x<*
z;hkm`GJgX?JPk&Ug*DGGH$KO)f=H+7cU^=of#G~60cEO@;Y35w8#<wj!bMdezonC}
zmmGY9_&I*7VKor89ve@aA|)RGPi6$muBOxUHL8owii254>dBw~!u}d*u&U^7OZpx2
zY8t;FI10kT`&>%O!Oi`8!Q9#XK2`*V9uFRbQ4bVPRJjS=1b1My<<-Wc+cjX#L^NS1
z6$>wx5@`}09Lo8w?|D-vrOMUp32E-^Nlo*kbB1x?qL-g8QM7u$)xn-kx9J37y2x=g
zh-&hry*5m0X!g%u9HgITD+EY$o224ZyvfYdVzw?|=yo*x>c=Jiz5^R-v6^j+mz24E
z+emzfrVZH7z4DmVD!TMCa|JlTNOiE2z$N|0_9ui=onXRIW4()N_*XYbz2C*+t>nLg
z79Dz_9jXOkyu|LippI{_)54#GBwi5Rqf4s1Y)!0GBz4N9DC@`Bg+(a~FKULT4U66V
z+F3K^*QyZoFNt}^tGGU<%;mn9>7(9)_lEm|B;T<7k&9Lix4Qi+>}L|Po;{?JXk<90
z0yNStYMS6?i7g(f{<F;&$re@>3trPM>QB?d8{XwD19vhow0l}ypqDI&kI3ENca^nv
z!$OHHj~n+|y2Xi%4{b`Cr`Ma`f2VK%*}cGThdV#}mGi+PN>_SyqEa6(j`N=4E6bk+
zqsC^}?a&$Eh~A?0d>)LEyh<^UI`P?Zxn0XiYUZ(PJOB6;!@FjO+<%TeI|af$bIpP-
z4WqBLyl(eBq4dtK>z&%#*584GB)w(Z_rLS~6Z5lCVi^lVLuh3F3s(}dwhR?8xpDaM
zwTtXj*;8|<QmS)pk0^9hj;8Jo;VvTQ9Q+A4%9i432S#<b&OQ8nx?Pcj&6O$@)bs8z
zLi1HkSKfCeex^M(=)7F)pCGbAh8*`O&qlDZS5hdMkQok=sSh?}9<FboX`(2x*{&(a
z@>xDGj{9$MAJ5viy>74IL=|L+xQ2+htrT$hq?zrm?9jhb4`rnHHmj1?&(<gT4?`G`
z#TQ=)E|0!6;wLXsjz@&6A|g0#^sYKE?5=#Sdvo5<zJWVlf{<^UH73YYi>V7yb@NMU
z`X-3k7VT}re5ek{v~iAK@{NI=ZHY9g*`eDDF@jCv>v(}T)<imu$=%AS<ke&nW2tld
z$g%q%tslCze~8g8``J_ZU6qNVDiVItSY?@kYI!`p-^pF6XDgO=nS-#Alx{Ef?$tvF
zMf|$q&zdyW)S2(VU4Lu|9Fx<8t!hod(m)7fYyrie>r}l3KF<{}&IuoWape*?wsO%6
znvz}_9h)Mc8LRflH7zz3C~^det%Nuh`D116MQLr@Mhka^9mBo{@$IbMb&92@i_>x`
znH^!e<$fiIs#Ybn(W1@mn#?xCeka~92(O+=src)B=%1L;gu>A5(gnWwR@X$r<TXa{
zT4<0+LXY}Jlx|tG;Rn(anDAOPj|9IkMu$fS2VKGGUBX`1>cikqO;F)Q1ij+=yK4w(
zFxu>96J&loZOg?x`j6g0p7xx%e!f3TwCAy?AF&OHu;a71>e@#P2_dvtXnmREg-d^V
znj@8hk2HeZmz`?|uGH>m1ihafkl(MbD*TdD%}1Hruk;WfqJNR*d2R@Ny}1@ZyEArY
zy-fMt1uW^0vbu2@Kd<`@51v%NcDU>-B3$v=)OWeq;pUYppQraMS9>F<)_DH?fV{Z*
z)b{>baf>&O^2)C)^kqMu^IylgrZw+-a%Uf%?=OEx&3_Yr3a}y6*+E82gcg-IlHX^@
zUt%v&DUJXpzS1)F*RJ0sA2vFUxYl9jO>cYkbD}@e4A;7CR9{f4OWM3fvyn<<yz-)}
zLHi4>@att&x|#~3zstVMWpk%$+(qE)_Q!g%-~rX=t-e?)4MzX$LK>&)@e&WO6BD>=
z5*=<9yJ@F|$8REbQ6vF10*@bT<=d_J-MalidKq5z`eT)W%;&1tAEa&SywwsQJ?y{q
z#ax68nGkYYJVDv|m`BnWFFty{izE0PL{sw}>PIqvu+jG7SqK2~JQ6|t(hfW|#lhR-
zWsJS-S?-WsiQt{}yi8>PZ#*I6p-UqDqD<V*meT#5NlS*fl!AjiKwLpAek^_fCC0K>
zSj{Kd+IbhhBm`tKC{nxB(t7gX5^qa|%I<9>S$qgBVN7g&8k2R*`+Ap;jrxLl)y>Oq
zcg$Ed8E+?C`*Q$rX9$Oi;?y!1)~o!4@8fTy8ABBYj}DJn9}u4S0sXBWK+p<IZq7;Y
zuXpi%_D}i@$~8}<+%B6qMILA53_Et>2Mgkh@>?#d;E1jH7$32w-?u^|i4n@-uG#lP
zZYj;miy`h*g#LUo6!G`2D?u7GZcm(WKUlp4$N9G3-A$-X|91uebq(5w?gzzH4vR8(
z{0oetA^hCH=brT)B)%R3R>NPIX_*5lq$kWi>VPb2o@I@v$=l!n{4qU%`tX(eXSsm&
zW)jE4z0KknFbV64V6aZn_W|iHd|0|HnYF@S3c@r2;N%;e+q(VW<}>w62?9Ki1$-_o
z$aNg+d>FE-)Bd`kJpI^;B>v$v9)={w8a<^vgw&+VxdZ)Qk4zU6vS><TBy{qIq`4^m
z=l#`H{MMoq<v;zH>&XWNcMGg5+%@>d+hDPokSr9_c@A9<1$xHo11bQ*wh}ab#ty|=
ze=&MW1Hm5R?Qza!M6zW8jNR{n);_~|<A5VvgDe<@k3-0znFDV~wbPQ<aTKlmmu9%X
zmEXyV8v4oGZO^j#c5$(0;A)+h^1R2e6T!YVqK4MI%mxeAz{d2$HC|45dpY2M@otA&
zAjtex*snX+@~>~$oVS4O^1qkC9Jw&lTJtAHe&9s;ZL`W`O#08>FD0QbmB(vJ(-44#
zY47H(`gmkL_c{=WbRm4+*W6nDd<CyD<LDFTs5yQ36AorKfZ)O)5agt$?6)$;0lDLL
zguSWIhb++%0Zd|rktt)@)er31b(?KwlSIb&CQ!lTY_Vw(C)k*jf&8>N=(q}R`W@Vd
z%iM++0}@i*_|nB1@R&M-6ftbI4!amKG^rpKk=0~CAr=0=vMn8UL8NyAm65QPqr*5A
zL=zdBfzLrRk204bsZPKWkrXiR0SJ02e?9csd<YR`)U2d!7LgVXfSpG$k3Kk74E^|b
zVDuR5?2+<C8Ehhq3+F*i-pwImI54<zUg`d?*IN7^baxVi+=Q{nAL>o77o$^WzB%l}
zkYEqs*qJmxpf(lVT>6*@b(v+paQmGNs!}hIl)rlWSXBN)l_oORFK-pQ7HQXFp@bo>
zpL4ZNCq)VvG7u|yqz0@A{qKbdxrtnFRd{<~%LUQ0`R&J^n7=7SJy-V4S2<s+qoqbm
zEOMoCytgUKLatCT>%o3sV)1^YfnY^d0}}z?0F!w13+lQaXoSzHo>`3LZ`*f*wG)R9
zhXufE?1rpTi51ZRYwChj#;m^(vC{Z?!?&FxoR!K~SMEGW(`jLPCEPBGD5#`aic4cZ
zean|?UH6>GnJZOE;3TMdh!<p>7%sSd|LigcHkcMaVGe%x%0;PSwZt>>Lm<~6c;|5Q
zvFFb{II8d`pYa?}CFuh9*Tl_Q;V<@0KZfvit|FK9zmm6qZtB1H^`EcAJlS>I*=Idy
zce48nytM%sN`@ODv#^&!PTiu077K7~7f1p+1qZ;NAb3Ch>h%2nJlt~Ub6E4D%X6Un
zDcG(VO~Yg9$2jw=Znd^&BQWYNo+|hD$conI?SKFa!;p6X!DG!GwWik>W0nIKfjX1~
zBGYXWt2+m87jk0rprkQ~USKpkNvDB8udzfulCWsp`79lkf!Fumyb15%2RJtO6q`La
zks|8T02n-XpmVMHN4655std<$w(PJ6Bt+eu|0IO}0iOQ-PL6g{lBXT`sD4;}7R$OQ
z3}e?Nx8GljS3!~Gd1md-#EO0ejhP>n<F^~`z<J{V1W|YC6ETA&`<McrvOToLBv9iH
zg+k|Y*@pv=rq*qbp&xqGPBBP}q4_Z;oiMWy{xFmIB`G;!ViR9!=>;&Uc?gB81yCsG
z7uQu#V!=C@W~W{7)mU~O7ruuKRUyb?^k-ed4qHJ3zy_82Y@#e^+kF?rI={l8Y%-Pd
z!Fl%6(lzM?E@G3xxYy>d9J;(8|JV$n7CTfrALYbKtVEc0%9ZkeZ|8r<9F=J@nT&zl
zrui@SP@zuG^IyRkWIOd|hMHOym^TDg;)Bhl;M-)u=_VxE6@YF$#8a0Y8?WBlG?N7;
z*Zol&i6xlyiHAdLVvxr47IFW)V^O$*Z=c7lEsg8XxIWPz{tO9~On`OkilG;B03a)r
zPTjsUE*W&lzABJ_u&x86^6h#_Fn5{<N*sX_5qL@pLbX!Ur0MxCo&!sBJN&-Wz{IDb
z3w%!$0z6GBq}(U@Asgg3_nN*(+Svjk_y%Nwxb6ire3}CA&Q~)J;Z8n;Ds?3kBSRq-
zr~i~v|H^>y>G$6cb{!=~7Cck$oPQH%n@E->Y<%36^Myr)7GBG`xdVY7RYQ5gb*2v(
z5#r+<gi`G)*Ktk~W{e6BO=xdV%Pj3WmOWt(H(-0Y=UjJyAA2=Slg~j#BOOQ2d=f}i
z{;~tGQSdHMJWU?tu2Fcef}J05BKYys;2V|jw9kHp`z#G_&`|X3DI?Hotoyts9D~Z3
z5FK_F!lBF2Ugw_R8$ojrHU$C+DcSLgR5S>V464DsCaYB-b{N}*dJ~@g2GZqetFLq#
z;(Rp5UpdC^g_}Oa^x9~IslZ+AINcJVo3RU7#T?AQZoEh|=bE{-1HR+k09cd`QqK#q
zpOyp3^B-mkJ-xJqi7A<sN&%t0cK=iu|L2hCu+t(6h63nz4kFpnM(QY3ev{TG;OTw^
zeNhu8_X3qh4%QlAo#n%wPyGFNTQqIH-`dox+lPT4<M25z8a#VKtH_e=!|>)Hp(k_-
zUlU_OmL@8n<1b9IJN1-F^6*ab?BxAcg>m2f7Jz)wa3{Vy2cp&u5t;UX;7=WYMm4GT
z8E~=jef9;KOE=d8g_b1CfOJVTg01kERe&pbEHCyRDg3(3%S03GGH~tm+^)9+O33Gw
z8BnlsQbGDlz&wl3Axx#P#A;4xR%f>HG~r84vd?G2ZKOr&4_D^FjXKY?$fQHisW+8p
z2mMW3<rL+cR#y{ui|_J#3%PSpG$l<`y6D$<&JCY_17Q3QAlGz>HXh`RbY9%uDK%~g
z^ii(ukW=8c0gn01xHbtJ6-lz@f2A`2&H^;TLIj8n(~a_%74Ua4sL22HDZ8_`$mf&e
zJJ+5316Hi~H3Q}&xP7aCg~79Ai9Q*wWw!1OdwHo(n9wP=NeIAr5LCdr-hc4SbuQOK
zd}I7gyQ4SDpb^K3WX1T~WK6UX_n>86H1y70T04Gg5meY7f@*8i&^3_x9sdEsGTg})
z&rTRCe^I{j#y#1qPgc8Z!<}3#Qif`csSW<xxsr-L@yztjF-sttWX7GK&<E^nSRG5{
zq(4BmV0XRXp~QofzC}p?;$ubM%6wn%dm^FjUOjzmc3#50V;O>h;T1HUNK?g$e$gP|
zD^&)R$_MNG-l8M7LK1r+6S?MKL&GyqGy;8SK|FNXp$SLrJYK)DGENRELqM5Z{rE@^
z&@$6Vql{>F_Rv@^6QY)nv@@=+Bc~Oy*?AiUO`ZGYj?w@368rld4qn`^Ru*EV=&;l-
zttrqCn}debMLWbnm<q{ae%aX}$UTyjJ!iT1Vcb?oraet$-ChbwcFD@eY)~tHr~#vh
zWp6Xw4P<;q6{mdh2G?sJ5X96JkXjEr0{@sqbVlK_cJZi3%PA4Hz+mX3dxCKt?;#7{
z-RbwCi7jL1D&6Je^n(1_j{pbh0!8_P-)Sx?ArX6Lw4WMuhgDD_^<=)M3zm5hZqT8F
za;anl?*Cq}^~AND!#H<G+Mlm|n~DtE4L-8r!qYH*;JvHUM(P8+cpiZ+?`R%imb#(B
zt~oR3(sizV%?e3u>ijJ%*tWQSu$_J`JR{`R>Pb#bUGtshUi;l2bDF&oDL6oid_M-4
zBx696=W3R>qSAA)4E!G0Fk9RbVjb@E4Q`~gf!Ze$PI@3Eg8=ZW=L{CiSarap<KE-_
zIrMm~k4tl~CoQTU7eq9PN(ihP*ud8@;3Av!`jL97C3CoqzY+VWPXoi5oTg5S7-6WN
z|8Ho)Hy^~21|y#qm|=0$?+xYfdbC4_;VX2n%w|GCweD&@g&yCYSpM^-t`*?VpL2{M
zFVu}Rd<jWq@qR45&^H<B4<n$&?qC+#FaTDFYWXXn7&WTuC6)Ra%_a_#oRb$0;K9h8
zlWv1UG#JD`Q$Zz(9HU|t9!u|)l7f)lrnq(Dy+Kd0aqeA+1w6WW;b%|76{P@8gbq3r
z%=2cKtqTV-WO209Th~WRGwMMH<cU;I;<6zV;qL7t@W9fH4mQbO7WbiC%K&~_6knqv
zV}M^op68N>-j}&j2Ww5lE?0K!)7g40J#EIK{FOdmF0Fg<Fh*8TcV^m0f#hY)x756#
z_4Lj_K1yTgUWL;Y<8Msh28mEwJ@973mw;eV9a;XseETxzUa(q7tdrnVW#m3e<IT-4
z*0NubJn2dd14_&VWs|4F2B)6A4nt%;yT`r%|4Qvpj0p59o5xhW_+flBZOND`c3&T^
z5n?{gMrdwt64iX7GaSjoF5?|<hL1kicop7bmNkJ!T++{UIcU2(f^4B@8@2YS9<}ZQ
z$k({%1J5@>E-wO5i>`?U>QH4N$%UQA0&w)f0+SX2Ki5ZicOyo#8iy_#kfTLwZJRT2
zy&z;hRv!giSo-^9;kFnDaAkQjc=CRYQ@+OYtBtp53e^hIe*8SNCy9{s&|yo6UMFwi
z&$F=K(GEed9hq0~u8}Ud<jcHF7&QKvMd+_Idn?P?#x0K`jJ-4>>?(5({KI59W_hAa
zAYIQ~%ets!$ILMz9wB1g&p?{D(0=*~RaEEnb${@d^uK&gV+R_b@%UC^KB*K7O6Srv
z+_gm-+Gtg*kzD9_BN}V8wc^kB|46H#?HdttYlz(07^BBRJ>M%dMD@hFCDLAJw&T7d
z<fwy{E|+$JBG@2o?MA8^Vo#OV5K{TJ%Zjw<4TwPEaeFg?jDb7V!%c7Rz{oLU{?AZ}
z6N7p{4H&P1UGOlbIr=wb5E_Svkbhrvrkt$aS|Dz&nS*_O>2PGoChvxPxGJRQ=d3+J
zsJU2@8#S379QELlh*fdAH+#L>lGp|I<WnZK%^7?d`-wiv9V>9JTs*pD6dBrh#<Kc6
zo?U4f$?<&uFz_hl^fqKPbthL;(j)to?>0h<Y2&f9OHW3KK(aHhGl|zt)oQ(gEJC2R
z#DZ_U+;jATh?IX-OZgc}!b@^>I0hGFl<28xUJS4_$=f~`G+kCqJm)OJjTul;wj`jY
z9P|iaL~Y{*`^hxHC}uZQodm9QKQnq}(S%1m0_>++g4*2zN(JXMHSygohyO|7{+-GF
zlh1`vmmOW##`ZUobh=;>NDkB$$SPF;Jj}J!LdkF}YfXlo@NoUr^n?U~lKG0`7DD6X
zh{Gu{zsg${U%Mv@nwiFwcP9(UKO!z<O_%u%el0_N)}m5h;#*1}^U(EqsthiD3E&xc
z9#B(}G4>Er0kaTAq?nGt-TUQc&oA;=QYQ36_d28M>Im^zo`#^LVtgMC%5d+=6xF3O
zt_wJ8t2UyuXn<2JV}nX+JuWi}j2qZH`GMJk>;sWs%ZIbWmU(YOo?j?#vTV90(28%7
z?)0ah`=-_O)msPq;E6!>0)n`D>OPaS2moLoi<9et-$S<jFI>E&n_|cib-5lyHZ%=8
zCOm#0?Ak6NKHXp+04g0%=4h)Bw*JZjv>B`R&~{WM_M440XZjcA@`md`X6K<bRAN1I
zaB9&)*8_pWiF9!-tcHKeY_~wNyQV5*`zTEXm5JS7nY-}#XTtv@c!wbIER;kAMo)Tv
zm8j4pYm}Qx+pKraYlx)ILqWFo6muyE<vy;>2;Bvu={}$~=RvD&>Ee*C)q$ef?v|WK
z@ebHn7`rP0Z6iY%^c&*!RH*7I45V&Wz$lt#e&*m%rm76`LpM113PSyy2%-K|yW+w&
zjK1Wd_teJh*?Nw4vyRRdkae|<uoV$EHL+f2nhc->75vv$T8){vwk)2=`<3EMe?p$m
zceFR!hzIsNg3dmUe~5g4w$E1tvmOA7FZU5VQRVFXLK-`pzde?iOjMepy9GSQT}6=n
z&ptZ)sbinfxqDn3o2Pu^nIrXM`nwTolA=Y8YKG}nZ`g364$~9E1xh7;dw@rm#u+yU
zh?R7*!hKk{CXwDwyw9cgeu{l*_@$2<`r9H1FPDczJrPpV(emfYzjt!wN^JBZj;js>
zNWzEfcqa;d(;q#L^Z)4+{!jh(j1>3F9j=I-2nEyJjt&4NH4+GjmAZ0Mi&O~HL}VN(
zV|O4$RLA2HH;Qp`g-HfL+j@;hzaX$Az7lo~h5y~G;JGAL{}DBU@C8Vc_c$h5WvFA>
zb}FJly!E{~AG3RWTx$%>SdyDORK*|?1^~n0p%=O+d9lX6lQNn2J-2zN3-t~sj#tum
zWh_s8T~7EYV=^jT$_xw!@y%1uh-S%aW9d!2no`&clQO+-G4K$k8?U6qrxCA%cRhJZ
zLe&x9GA>;8NZrLVcnWl7zs!#AcNRSbQ!#$eE^(_cLk`!6;>qfL>KSH)?ntHEH@&ZY
zq@JRqKotD4T#zBY{%yL`%i3?W2AG|5kaxcGe>?X1ZAltO*yxKi0s?jdT@6)J|3l!W
zNr(vDlZKkWfW;a*4$s?~QEa1->)4cP`%l?fMFwDf_%uF5P?CNqlZpsIF0aG9T=6E|
zDaeMb&<YzG0?^*IPIacy&(1Lz$+4Dv&%GW1UO~i**BXw?@D>QtI{PKuzC?Ae3$Lv2
zEf<kSKlx&kbh%+g-?AqMN`9{X+yA|YAzCWRhRR^?C(ijNlJOG|-pZvnZ2E%2m_;oY
z=}03YfQ_#YwE#U=x>!H2K@L+=7+dLd$%UtC#G~yk-+sgNkU=%8yA=G$g*#xAKJ*nP
z2=)CH8H9d=EnYWxDn1%k8jcKnQ8NrP{5*Q)2n6f<vx*TU){U#R%lDiW*uOQqkXFw(
zX=+nN1(jJ~lgtWq=9fZ;t;U9R=>aynl)&}yuokc;hXz|m8W!816t`sIWnQQvS$xa>
zOq>|4z-4%aqvhrHgQA-{(j@K$LsMJJ9;x+U{}<6jyDulO9(_nhN>5PO>rWn>9%+2%
z7<&dUGr{K$AZQKWJpyd>p?jC)i-J#bKRrG}HclK$R#I4gOyE*!`emnd??GL?^>~q9
zb16g?>{fMNgOuzMldVdIaAVZ#o$tZOr{b1BOGsOGnhnX(4;DwrFqxs}PlL30E_f$s
zanybJQfGk3f&||h?>#D7_q;70vr)75cAvRNb4ga{)>)A~P<cq+`J9>4rxY0<lg#O6
z6e-|n3(>bT_1u4#YAQAWl&f+&9@A99vWQFFgN(HWZl<$TrMlS!ktfr6BV6>Ljb>6n
z;3@Bjt0m%NRs_;V;>V(4(97<*$0$J)(|XM4#pTMR=kcp2e*sKcNS`3HNt{LiP4s}4
z9@Kb~GYOXMS3)3p3!@_DAsgS`cWf`eew+PK5M<jlr#R-uArH>Nh@vf9Io@^k9%INw
zN4-jKF`VEn)0hxdeD!&uU5i=j;1{N%i>Umh#+4#Fz(Gx3Tt4?^=tITwLEzECi4NUh
znd`DOc8jB*tez??D^Td^H|gRM^>z}g&ReS>0)QJ~^v!0|sR_@O!EER-FiE+_L)ENB
zMre>Vi)#E0Cn-)OWU+gD*+(IkT}6y6sYD9jtAhpQ#khErzd^8-sFr00!L3iH%r}jP
z8brWfRH&HJ<uqwSIeHn&ef*`L$r%V3%N(E)X#R=;1e@64Xx5?}xGba#43WJWOtx>Z
zZoGZVq8ek{GMDm+N!s0;uBh&eeB!bJf{jl)!=T%z_N1Lap||CZdxTFgK56}e6=U-l
zP<;TkieRVdzY~^!H|hQb|1{Bvxg9vv$(|fTzp}+wg(w8vw<#oDztX$g^V^*TVMLt4
z5`(RWYtf>YHP0(>CYq0CwNgW&K&g3}7%e&ugq&vVJJvCRa%RV}0IcEh3o6%%NZ5ta
zCqwYSAyOT%qL5lT<-Ihd&|cTohCXOE^ngJ|8YcDga4C2rX<5zFp^LI!S+&~yT+4|S
z(R)V8*WppCV3w_nw|W-_LC`rLev-Le7J!jf4%H@JLC$@O*b#l$d3OTzIP*1vjvwJ!
zb!;vpN%Dkz+OJ%dx4mu}uj}(eZ#48BvX6OeGE#<wQXF;E(sc~2h+1h0_zp{frcqo_
zWPzgQ*UGpA7lHY_N<0AP%Fl=InfEk1qH>%L7zA$1`7GsSvRNZFX#GnIy&_eu{EsyN
zMGfJ{av_g)Cxor72i(AQ>sVEGeLYSvo3UG#lOXBwyfj3Zr)x{#MXC$r&w$NXY+3=;
zeb{Nfl1-dV^x04h(~;(lZOaSV%_oi-RFG90*|FNLrXq4gNLL;UTjYexT~{7y!lWy}
zcsi{+_Wa~(X*k2@lEYWde#Jqs=+cW;XV=PnVa6&U)en_8oAr?|YJ`1<O$KB0(B4#V
zURMSdWdj7y^a7!K6SEb#skE8uINX!l#J9xt{`Y^M<`8|v(SGBR5axS1iAg%HP3~`5
z+N2B&I3cCj7ya`xb<dwj{ff^lg5FUkWt=Wf!gWlHQOriAV{-)v2{?z`tiVn!^&*q}
zuP_d7AV^#2SaDcBnM8BE=+r>r=Q_V03#W|yLFXrkP;%GT9GK!Subso#esa*Xbi~o~
zs=fzAmESN`Qn!o3i_ztQH&)7?s9B&4-CPjCI_~~!hFd~P@jKzQ9gfcez_-ek3S}?T
zV}M9(nihFG^6DDf^Q!=pnf~n66TQAB+U6O_>=~sZe2v^yqZf2EQv!Q$x_cD$>*DN>
zZlmVsjV!;Zp4uzwH`{kIr~+oef?uiIFtJk2qsaZB%_SZ{lgpNKpwyR7KKU+brqfhM
zB<}9T0Ejk?zrZ`vhv^0MLh<$5;4NNkKOW>VaZ`l)FVrU`ttpUPz-&6@UbzR2jZ>;#
zZ;?BxFRtoYZ2B)BmT=CUk~`4#oAg$%X>2l-NDCp*W|(leOo_BGzc9w)fNP7Y!S~rZ
zXT+!g{=LlYdsg<w(yHE*sxxWJh6uL2j}c)f)4fYfYh6Z5=}9L6p?+cMv;Mi+Fs03M
zXHp|j@{N65U=vGea?xbz<m<>$2P1Y;63OlHo<jF6u6OePjbt{dBML;L56*s3ZxRNo
zLkQ9mmEF{Nyh{`ts*4Ld!Ks*!jeTm0IBBI2%hv93`4+f7ElHNME~ei!`AFe+Do?N?
za_$s(^|52m3z*^gdl?g*i`OpTIuD=~!!w-^P`ygNZds&{$wZMzB1*ulF1|Des=5yS
zGQu!Zu0V(VL<zLOvVdMQaheSpd!IUeWRz%e5EFPhEoqBiBUPkvO8KI4g7t|RamAG)
zUV|dFlI3Y}v49_6b9#lP*QBiz6$$yU4-Uht(<FUB$i?qrh&;qD=x<(iq!DwtG}yRS
z3uNo$<*R^QThh<|Dna{5Odd~|5=@gcGbd_#;jptlT{=;C75P*&{lYIuPW@SyiKLzR
z6A;G*(9^tmb$v2q&jG9TpUX?qR|!AvQLc+~V{lN%3;`bcx-Vv{%EmC(2`bDN67D$y
z5iiV0J6P?u2lMqQCLZ_<<25GDI|2E7W|;+-dExKpbsH(}_*x}7jHZP<1kd<L;y31;
zy*=j{23u2<%Iid2g8Sq%gYO}yy1&kZs}sna&v|(%V-L{>2#%eI)m(!t&A{kpgiN|F
z)+jFQ-KYPvqC$v*`#R*o^*KGBcvz`Pk#PTk@0e-jq&oS=Y{ve@`E9*K6+%Aq@9yH(
zuW<lz8pyTBND`bP$t%auVotVv8vw16M3F}DPfYr#I6=uktLQ0v)>ZGrIolQ_5K!wk
zNNzQ_SD%1;yL+r1hT_OHm;&L3Knuv1O&{a9muODThfO%GLFIa#{Y;90+30Z5g?vD5
zbhlMh%t|d~rCXCRC!~;@hKe$0@zxZ&j<)K^G<BKgQj)<S8?Muo+NixTqQX4r)9u~+
z^r+_qMEn7*ym0ky62)JpekoLAQAAu9jx&mDup8L_f%LTRoSbI-|LFP?XsX}teH_nm
z<RBb#r8s0JLuD*Nlvx=<A(T{PipYEnQA%dw7&1o4P#HqUkg<#zGe?MIp5wn?yx-67
z^E~VSUCU~T;+*?+-}~PC+ShgMS{!JT+mx*yx`;n*BV4MB^yQ0j@J}YoiW@-JCT6Vo
z#k)ID>nL*!-sLf;pXvWoZ`DV4>Da68a|oMFr`^y+e&9`@>d21_6Z=xUYj2}-hwA$5
zHyTOjYdum=?iP6Y$sZvhku<h1WC|gE#!lZk8Tiuax8DKs2RgKv$2gf&=@AHC7$x|E
z-{c#Z^rsRS6Lj84zbBtL>o2TVRJ%UCIl3}44PGnX$mLu`UUnDpu}p6cq+LpYNvR|5
zbgB7}Iq?g5cWeE_82b-iiwA_vWBtP2k;g&QkQN7#>^)C(&#w0&ESGaT*JXJ%KV;UW
z?lElmJD7Qw^#!cd6h_7))W%7x$R(OaQMY7dpG7LU9zUP=*04Ykyan>p+z7Q|wlYt?
zX{!X^dI4U{Wgq5TvPti95jwQ1!pO3)vv3q`bP!GQuhtP?MwCk6h?C6W6s+iDLn{Dg
zR5iuY&56qPSoy7Gd4W(H9dda+6kW7z=))gK2ENX!Rd%xo9dF^5ocsC{u>T63XRbyy
z5!s}|c7ch{UH*3PnVoPC)RR>g`HvmY&|nrZ6@qTNeYLuMQMIaP`R%8AbDE{|BNb23
zaM5iI+dF^AGrXmVGNp$Tx;}R=eR^!|s#S+d>n-}W+h-+QLxA<~UzLU8FDIaL`AA#0
zLCTvo$=w8<gYYYa;4_9Lb(*o=irt>BL-IiuH_nbpdx6_FH`D#iFU@)*evH?#)KL0}
zSaxboYdKuVgN&_IgI{-)6Wzm_Lk)5ohWf~tuh}^*(>i=-uyvRpZFweVxoZo^H_hmc
z<7MW!Hho#K1Fv+*{0*f1Z@W<V6Ul$f)s@zpN_5{-aZZ1GqDE+5;rpn~la=$vsq?fJ
z0=(ug5(4=(bN{Xw({PkL`m=b{1--m-WKN*AN<(J7gjEEP(f@3K{ZA_Ch%6V!muz8<
zTj4Lu{Wi(Ex)1NSh)3~gN?QrkspF+cXYTJ|+kUP);65W0;041+RiN$Kr8)_Gjt0<3
z3$3@aavsj?Z0+Sza3c{sSp$KpYuZ8fAWWc^h>F%t3N>xEu3}J9?LFfl;~c`!)%C_e
zQi1}-Y6BQXl)$XkNo=jzUu2}&QM&@25F-(p1h>F8uJ}kgtf6_T3T+9EdCe^t9lr?R
z38l_iDY9CPhhc{a<oU1e(B`~v)BaTbVyzWQIDG-8PWMS@W9aeEXcWMvy(GpgkuAys
zmG}>qY{Tvdtq}&RwGu<EpaU+~Zm^G@eG^4lgk>d=T#RZvlh*0rmUsI*d5(3yoM>ro
zvPQ%k+zw4lD2u790`w6h^~BLEJjHT36(d}+!l5k`Gq{M@Y3M-fuf%l-zw9Xb$vIzT
z#p5q)d)$&xJAtLYIhabWBy1Qnjk4HswtrgNJ2hRCFX8vPk^V%yYwEF*33*H!)%U7&
zz_V3nOmY&c2A_ZD-V9i_UiG<yvA$c37biEiBBEN(zyf=8<W8OFb_UQWk-q!H;1RT}
z;m4E&kUr^II7KuZo^dycoc(aW&on02kG9rP#WMIkZ)GZ@i0QdEJW>?ZIo0z=e81g&
zR!yHw%*@|5I{W`bRS#424t~ow4`5%+Vff`RNIXTssL>)|s==SBnPt-~k>GTr@YWN0
z`e_n{uZTEJUmwOR>r|&Y;Td`@n<7=+<JdF33#KE5ySkuNnJwX4NMYl{k?>MBf5EYd
zKT|4$%8&Jcj->eyGS(Zwv{lcRiAv_uKZ|t!p!B&dVRB_HRx3XqwtLqu9Qa!%+8L6Y
z3p)-+sopV#C~L(NU61E!_WGPAlQ;p>>mMLcLB`<E6=4dvCpI5orY#S1UA_bN^esu$
zL))tq*eRj|8S5(cts=(PThJg+ubM(K<UIEB$P(!cnY&aeqvAZuN+OL$oJ_}%Q$L#@
z#}@EiJKLVV$*ck#1I}GiION~3YQ>m%7Rx$4T33|`S&^DvxGIqeD=h8S+sVPPG8II3
z0&RUm{oRj9VTzU~kTnWz&Y498FZdG4LWEVc0~Xo>Zj>BUq~)!Wn(L|mF2Ih;MjCea
zdqxG)&dR`+?cv*0Gl<o_$G?0ZI2}z)P5d~@j?UNDdgMSr-2qdRox0pN%dP{9&k>Qm
z1TNq%JH`u(=-=eHQ(cngsNkzI2cd7}U8|7L1E9NWa~zwU8j0BGU7-254gbF=E{35^
z@@jimNz!FW3dRhr1cBo+PV*i-BTkMn!Y0o$@uzch5F*KQAFrnVyQg`3oDOAZGPuVV
z@cS{W^?s?j9Uk!mD^Zib5q|GxtpH0rKKk?(9*=xgpRXugGd~jk)5{T)tJVbt0lGc*
z4eg{U?1SR<n&0#J4_->qTZwC`TDR!zh@>R#xk;*e-lWoP<zCj++_?DL6CTgm|FC1;
zhYl&wekle2%go=Taz6;hUh{IO^Sb9dn7>eU&;y;oiG$2Uw$oyqOb#+i-m?w%7We~Q
zZ1%PI(;>PreUe>(n!*1q)?DWP_JuR$aJ0}p*a+;w8P=c+I<jjzx#ndqe}ox$G*l=K
z@JozR-l090G@2YMtjVlYj9nRV3)0h=JINE`=4XH2zbRck=Ao=1Qd&6@i^xkj#b2%r
zoGukrm?EJom{a@q(Zw$6>Q!1y^2o&}hao;dlbd%!JcB09?P{v*6hdmyS*-xsoa#&t
zq#VudCr{~{XPmLet`RlGx(*{PvL?-@yT=dp)FE~<D~H`2UPTbe|0KX;ezPXnT|E8M
zG2?z;zPTSgRVm#Hlj3^hUGEgHioY(XH*bN%?)k0n{Qv6_6B$&v=bu<-GnQnVMKrQ)
zXgomdG)n=BLh~mVQV<ArJQ4h4WKCOu${_Bln*Afh40<hjD^~DDh(w0hyME-JD=_u~
zu;DhN3Np#|)a@^^&vY2%b4U%2O5(qAM|wjaC$0QgR{Y5Lk9Qls42!+BV$wAu;)E0c
z(BhS|nyL=ABS!b<JnaUp)S~dul}ga-bYA7&BLY6uhO)D)PqQ|kF1yBFkaDnl3e87Z
z%X??eOI>_V^Y^9SvIADf&|FQQL7#cnAfE#8?!k3%6jx^heXf6e*gg8BH@2uT1B}0Y
z1)$s<r#p-}M$Q0O7m#KVHgrnANkHoT=*uL5Pz#E@AT5VR?wAHRbg}cNbQBcc{Z>A7
zOzLv_kQxj2<LXS0$e^Y2+Y5%T-af(czzp=5Z{k{aO%{%JT;r&SVghwBOPYJv&Dq|9
zdnVQas-oz2+FAuIl6DsKNS0FDwEs=M?_AxV%hrKmXaDs}S=+eQ3bZ2*>sItpx%`lO
znLEYzLu-4O8frO^s%GmvnGkHdl(y2u6~IooI4ae|cZj5ukV(uGy;*hbe-rNiv~H7w
zd~7S8v;Q8e?90yF#rqr#;fKfOZE8i+p<(s_lPBpWLsBsY0v*0fM+wG(R2pSIHP7bc
zQtiH_310B_Y)zE))#NB)?Mnsi*e`TtA~R|HMU}t;NS(MZDt<3={zYu`AS4F%erCk3
znmrJTU^G;TCs||N?qQSa3;9X}?NRMih+pTD>&fqjWt{HxKi4L_u7oDpIFgdt3)^Qe
zLFX>4JNLR^6}{Ga&Bkjub$t?IsaxBwyJm9=$+t_D<k$TpcC@BNG>)$WH?HfT2;rp$
ze)*(Id)<7rk8qsNNWB`>5rZpP#r{Td2I%F|r;a}=<s56e$IE<Mbv!a|(_=T~52OqS
zPICC?U~SE)ect=antE+mBHHF-Zxl<hgsE1yzXx;v=CeI`3ZxkRNoIGmTze3>T6JjF
zTQ7WZiO3V}v?}M1;*|>wvt=}=zhm_L-3@X&mb@;~tc;OzivU%BG6nitg)G9NZP_#r
zcbgW|)Y<M9t-HOrWXbe_+GD-Y)j7q+TT$`QJW|&@t9E|XeQ|>E*EeA}vop7IRF~)s
zixn6pp@K{sTNory5iPkfjTd{YQUPh|#vSHnjSIwOwv+zYc>Xde{r~5)mOo*(yP(Aa
zJQir|9K`S%n#a7LCP3m{CF$Vb<&ySZ@!rWG^L#pzdb%`afS{g15mX=5aAEQeOSo$k
z&4Ek@hbA@}DJTTrCL5RZ-70bj|5{=P4fC$&-{^rcua{Y24PCAGhp7mo*fS4)YA6f?
zS;(oXaR&HRSwBa=N$N;dg<-_pmhT{A`3mp4Rm&cpL8@(gdXL6%ZUK`c#xeP?96<Ua
zV6M%>FLhZ;tKz}|(kguc7GDCZ9Ut+<8m*4X2&2?t%ZLxuUoU3I0H7u%BWb?$0SGZ0
z@{He4=>jKgv;4{LTNqbRru~?mNmsOSJb3Koy~M_sG`_daT|Z47gc@0Y6`U9rN0H?R
zM7&D$+R-4{h0-}jzCVCtK0GPm+Z->9D{PYcQ@#`#+s6h|SqP~`<{6%n(r5*#n~sM7
zH+my)Jj3&z%U`pM_0{`m{0X{W&XiKa92Kz<cgV(g(ykSK{DGLb3Q%XucBE4W#AB7p
zZBqbFKV?yi`H^l&&Y>K`7E5kx*!QRAnz1(d;|(h{+1rC_|1g3heg%GHEa@dpITO^m
zO$k)Pq`3h5g+irUt8x}{=BDPyU##Rcsxssy#~K(;p58u@%#Kq08bEKMJMv?=q-c*m
zA9Rvrrpn-TpovXY(U1Sh%8eLQqB5v>3kC_emNLhoDM37_u=q$QB(9$t9GHLqisbE~
zLj`>gzFoOQR-SHSDuaTdn-87Becc)AHW7@>;Lr^hRibblafxg<C)uo;Tgmh%6^ZA4
z|M#JC-lcMCY?QicT0t6NivsaJezXI-vm`9ov~?x7Y?w}da#Y;o_61BXd14>d=EUci
z@5IvwLR@*)LSfplVyW=2QDrrb2)=_W@`p(!FJdk&A0Wq+a(vKY!qU&A+>n~UnWXSj
zXPGjiFFp)pn{k%PB_3B{8SqU?w4kuynm<j}(WUr}po+`Q8#HMyW0?!!A(@Ax#dDR?
zsCY$wIhQIo_t71J8m<tcvq}u8j*Ta@R)E#ZJ5UBX)k~6`#lM~06I4e?t7K|TO#>s$
z)LE>m#K5Se<<!*|kwk2p*!BstxSD`X6NIx+m$27dkhh`eW%uZQ!MM&^Id5q#%XU3>
zR_ecVU>_0mY`o$6kgxZE(+ZiaAgER_2S^Aw7qw|$-3blq(Lr2aK!9tvsBDajjZt_p
zQa^SMm*<Lar-RT{gkO`Xmfe1!nMXrQ0(-l<?~=>B!iI`D<(iD3ID@|*kO(SaWZjL@
z%4vM$4d8Z0A%3%JqPN_RTVP5m`ba~R?AgSW%g&fjOA(z9&YgBcfKO=xk)$rtls+j7
z>=3Tpl@3d!Oi2K87%wCABacGbbgg<A{+iiw;^i-mo~y9d?A0qev=N3B)%+f--@mUy
ztQEwAe?!DG{pwL_PFCJa+!5~dIyGz#wREZz3O3PMMH~ci=Yl`rq$UW=33WbiL<bG0
ziOb_f;SDp2HXPcoCPfyv-rhM~Hjy;#3<;*3(itra@sj%-E72`y^zX??nQ%I@*07@<
zD!kFZ`HH~IKfhr0#fS=7-r6{y9c9{J{h4~oM5U)I-^?ceEAZjk(97=_R(Ef_?sUT+
ztg8d$Sm~)x=iM#neBMex3F-Kpn|j7@lhNe0<_X%e&zziUM7bL$Gas5@pFd)q%=eo4
z@b-u%ieA*$57(g$cSI90>k_TtX9oYHfbA=G%wTEFzDU>&<#f19QyAdps%GrB6`pDZ
zJ;BkOA~SvfCKx8m^vCeo25;vK!Fd<l6da=)(}uA>4ad0L&(2QQhjuee=W9$qn_ZJP
z46rf;B3JOvGcV95?z}v7q2C(p&C`$F)gUp~0N!9k|I-i0AAAv0lc#z`4xklY?+%i#
zi^LyG$l1ctD8%tw%cW?1dsPoy<Sc))mya7a3V}qXHWd)*pmIF$!VnL~kza*WpF(*K
zjipXB$FPW#&dJ#;EY_HJa?LCJJxui$^?+JYptE<(v%BDCeOk6Xt<GizkO%5+8;;4n
z?Qd`-AVp9>|8N>8GeN+L@Vx$@o$$2U6+e$l3;qwOW8TLn@gYk2+OQRnO%8}6nmNol
z5*ub0Fqybh^Bd?XSewJA$~i{3=FQ{W?bZt!|4>YgS$#9xj9(Kc|EQGqNKqgOaB6|k
zgAxm)uYtfg*#P5hve4XR>*#6{3}yhW!^h!C+aW%E1=g9{m+xv$tXmc^gw)a^?h=)M
z0_jh;`j8+lPho3R#)49+2xnpl_e)pv+ltjV`V(nXK@KD5{C0gwIxb2-ks>JZyKQp4
zs{oeBbawPVeZ~&|8C*7~r}UfLIlTHSJ>9R-wI{d?@%PfIGOhj%e@ERd>{F9C7l{^Y
zhl5}zlnqtQDaUaYgxUG+93oZ;&ud<F|B4LJ<r`A%d5aK!@_-rU-^}K<Ze9wkGlrd2
zxPH`tPtw5@voynlr>*mVx>Y*nKGQr<)dkuK{2y!w?=cpR_9ILtV!yCgVUv&Op+9ux
z-bXgLlf5r8Ip$u(YkSql{lhE>dQF!5_mVxOhtx^r5&fT!YI*rxVbbf~L0&n0-=0*6
z=HFz6(~@%GK$0_*4{sc$<l`70yP!2PtFE|xh|e8*|BO4#=cSKl%Qa!$Rn1l@8SSb^
z=rIajgJT#&jgaG7{DhHS5*PNgs7+`*>C2o9G9V1xg@NRP;Cv@=G#@IgW>wyV$aY5C
zfsz7x-j{MeMmPPQ&jwJj9BuZ;pnezmpRf&(SgpqC$j9Y&YMs@5;&l{!3?e0FVEv;3
z2;=Ue&H%QwUiC1?QM~%)Y@Z;dH_x)7{^bIEI+&Ht=DB>O{(Bow$$)%A7qBPmXPnx@
zIs*=?mvC8zqYe>9$1ILs|563?!zNJ8kM>HWZTh-vBSeinJmwB?-i~}dV*HHp%G84l
zypt^tM=z}MNyH!t#gax9(5YQnbH<J%(eG@e0sfsUQ2bN`X!0&0)t%Avh&F0Tb1$I9
z^72>282r91opLI?*PHwR11$ZX1%nToQxhc8DzqYJ|F3O=JSvPkc{emr(>7!J>8yRx
zP2^@Cfz}_MmMpE3mQ2t|Rem>tW!go<`+<=$m+*HWgGZjcE-_OnKcJJML;<wmlG%aM
z(<j==jj!8+bxm3syT`o3k73Q{$E56&)l&U86HtngP*^pAUiEp*zn(2D6$i~bfSB|0
z)K@(jGweJ00kPX(4g-@QI4!SajPeoCTc0V*zrLmBe9L4T917Jz*g!v*00WZfRbx>W
zCxXuByfd{gWIc6Dr6V28-WV`at3Z`<GSP*Z%k7sskhVgId<pa2KG6(pxHOqgTGh)t
z2rox{B>DlU_5?CnOoV=(dC}pFUP39RJ7u-!IErSdh#VA)(S;$IXC1qZte7gESp&7a
z(@C>&!x{53%%X)Yb_v_!$^>^u1VWz7DuEM9q0^F1g-LZ<<C#ee7A{l4jrGF?-@#PV
zQ-m|S_LfPfO^rNB=wMj!KG=BcB0PmJt_%16yO_m~VB5%K?;PoLBD3=7rANMr5hTf6
zHg@_$F%wjmItZ9|GFB2Fr+Rp#-o_QwQ)yuB9-^-hiCSoZ{z#Y;wLdc!edw@gasxlF
zNj%U?a(~5(TVJhC$MVRx=|Xw#|LKA~s)|G>*m6uSwU~5_BZ#@p6m5JO*8#v%<jn|g
zps#ja)g4BfrVxMz9)7>CcHky0=3MB*98l&_cn4z0W^pBWcluv4U@X<aq2YFXPE=)j
zf~!g4sbtXG(8NYfdECKE$1$qQ^8Ft`O=9w|q{a0}R+5IWs_ifTA_lbXv<`v#xQt%<
z13@s(>9_cOe-^4KHLH|&0hab89qoOSlE(4CvWrll$P)SNJ_e6NS4#tvw&+(P5G5FC
z%Gr@-p%sgo{>tVy#nIZ|?7MS;7^xVFVsF9jttnoGA%E8lPbRhViOFHBnq?Y)dvZVW
zP^uj`|Blo7P~!VT`>X#4(<@ncS-`8|W}NV!^+GP>tKb6yo8Mb7c;*@A(kIQi$9EnP
zj?;wcP|XS3nrJv4zZ;~5qqG;?lrIx08Af0YHGjW%S4-2b``cqsWNCB0ADg&ijKe#s
zFn43@WjlmMKfQERJo*|DV!@`&*t7?FLZ%aI;h(b_KwGDPxc{xtQvC=RPizj>G9JEE
zGB3}FW=Vu4Q3rO0hl3I@q2I_yzO(pD$!!d`ccPm9L^rj{pH`P)PhRNC(*?a09kO37
zK%aB1lV2AHy^k~NKoF5ThIse|`+3v!2tQV8)bF=OMH%1nOs<zoBh8~!0cIh?<T}(6
zm9T+VEog<JFcf%+;ydltOZUWHe1Z{}pSk1LI64aOPa^mZW&O;n+`QNN{rDuxFFH|O
z_Q9It`@@scjg<)T2r@~yQ^`&vDI~Ib{3L&ad9=6doH+BpLo9j3oosi_h(_*pWYWh+
z%B|H%MuaOwYfQ7PYdo2?zm$EGM@)O5)2V8OG=8pBsw*vy3oC^)(Fc8e<UCgn{^Z@D
z`=O2$D4Ol9`5M~eAN}4Lx&8(C0b?CQ`rY=6e$YD{kA;8m0VPH3rYErM>K?x}`6wZ{
zaJhtUz2{n|8+D+d!+hptVA;NEC^Fr9h5zjt;saTO6~!4JR?<pZQbDk$2$CZ4>}=Z1
zd0+N*=>c8Rfmt>rw;RX%@DGwE@jg26l~b12jDAT<ddt=Hkq_0td+Qj4R#-X#heT(D
zKza!q#5dB0nv6pvN{|T)e=oMvwlBo{WP+GOCrgKW;+^k11C<SbZxvoLmS~I5PsvKw
z`Xs4CxK#857<I6_IHtoX&<}L%(l;n|oN26lIHZai<vM_8?g)ITB!_LQG6^1~#wVvw
zD!zr4zinegxtH6^Qvwof@3tRhLgjZbkRQE%9#6V)vn3o>eN2_OQ^O{A^9SK6XX0s}
z3NY2ck^I5LcF0~%`*Zw8bfh<GvkW8NZ$cxL)cJy>J7PWN{?UwO@p(#6Tk<jZ+`+b(
z0>jBxQ4-OgLwWYhvBz`f`sL0nP(^)bG*d<{4(tth4Ye+_{~F$SW7>v>*;{fzG>jVp
zB`U+81*<>E79gslGFHx#ZJN^GHrn7Zb6j}h9+&5F+G+#&Daj1bIt^T&>CsMf`^CSa
zv%B-x5{6%)C+P6Fj7h~Fc-MD)j1KL+YoF7=WRyDlPiBGpg!Xky;rdi@dwTC46CoKC
zqh>7_XjpVPWXOvA2PM4E-QPRlsMp~Ax5r!gmHX@k4NPy3eC)^AntPoNNuEGAea9Tp
zNVMIP$fGHw{&pYr7iT4tqa)bQ(jz|t{Tv993aoU9HvzJnpQ1Ig%6#Y3Pe0A+<Audq
zr*G5M_Cf1hHmrJ{v))3zN<jYI0DYUb-}ldSw#_5XTv#-UM~wD!tHR+UANC!Ta0aN1
zVWTUzv6V}@BM~!Gi(bfH#o^z<=~4|2V)^H>`tYI&T36h3q^S0fn~MExf!Dx~i_E+S
z_m|^3%IS}T`qQ2OynNXi6NNquHL3cCeR3_%5IxIz^^K(hohA4u<&;);BR^E0i((4J
z?nLi>q0I;vexc<bQ8Z)dfy4r+vZJf~$XW84a^jHpkfQD*EZ2Vg`YLeLe9()UppM2<
z9qakMk<qfaq)d#gWOTl=O%NFq+P6v}(t-*=R-vyojadJ}h$yqST0h|pJ=s8;ts&ds
zKOZ<n^fLQa4x`EHu49peB56w=^4feZffq$$G6A*r+Fghz1KB{Gc&p;Q7*dV)ogE^b
z1!UNDvA}w7^D)BaLtAq7{%<Ql1Tz(H#Meo`ZnzuI$u6Q&v={w@Xb?3zUnVj2LaZ!5
znxt%<(N|^$@umT(KMNRvtR)YFqm=csk?`Kq$|G?@>d621n)hCO%0;4AwzkWH!VMti
zvc4BCI*+^*c@$!Q^yvyFj$31hSqkN}Krs-Suki1;+Q|C&c<$r{L+3`iq<l}Wj}Qzf
zJqC)#qW1sMfBqcBnaLF71$&h|Vht&{CAjzAlhCnof=NzB53!iG&%AqfFMAB$aW(lH
zMMe{&^4g)5-;r&kvb4XANDRQV%jv@Jr{vte4{P)07dZEJ*WP>G+hzx#BP<I;l^6)?
zRKIBPBuWiD2(d|ioI+IGU_3DPI3ry|FfH-&_Lu2>%?Fkl#k0otPWH}RIpey&<;8*o
z<P~Eb;Su<I9O(1-pP$c#+I-@C%q&*c{evO@M|)CQar?2o-RD6Oo5&!wUI#ITK7V(|
zI@g=vfBwqs`7Ncr&Mnq-Zwslnl2yYGc1bLl*!wjC$gjCq(rftNzlJ0RJ<Dp){la@S
z!dqcmq0O?#^~nCSqd_}?$5zML@iy@xHPl5^hIm+i(12A^%FKq`$oT$Q!f=vo6*Uc2
z;@Em$5qX172i-^Lvv>@25qW+6nuL)B`9B}8muuXv31|EotKC~cSBADEYDV`uce&cE
zH6F1S!VjsgWk*Oor`=N9dnch9^4?lT259|AHITK5?Vr~FzB`ehv-bgn-3WEkICJeT
zrf=-zR6fD_d%<X&qBmintq+&D6S9rV_aoyNiHvU9NgRoLjk{2zwtt=up{pa~n(xVx
z$q}0N_fRE5MPm7q^FM<6WO=BqIuj&HDZeRQRo{DwBc$Mf=Hc^20HsNc%x)fcl>%|P
z{F>kMeX3)S9Es=P?G>qpF&j#cl=C8e7m$~WLBqa9i5;@}AL{{mHj3B|ovGmfFy!-t
z=x6xJs{&z(gJ_or^ajz$jM?A0<soR5<O67p5L-quMl+nDWfet<Y8TV^*G^y3TcrDE
zZBdKJFK*DL`$lIZVlAtRa{sIH0C=^K{`d38zh)3k(6WQx{)k>OlfrI+=&zfM(K~;k
zPj<>_>N3L6rH?yz?;9U7)7`MF>S4Hi4(9Rz!<*6DR-IV;i0oi%POTGrKNDCm`G2nW
zf2VUI5g?NbAc*J0BjL(I39|RY)Dj=}A1YP=wLZ$k+9751jNS0a{(6_viQb=2sYNsc
z`~tD8*Gk1gHV2^=k3#q)U=&Ps(*RVFMy0?4O%1{SJ%PVW2j*9LHJ|G?m3*?wJWyZn
z?Z3lQJR^~DRKwr><|<Y2+I2NwS@*wQ$N^khiEpm|`izao`0Q9p9J*n#r0$=-my$`M
zm!E^B>>8z{cZE6Ygbt$98F&O8t+2v)Xt73Lw=Lyv;-efYMyg`S{wBb;I_i{V6Jt4y
zY00OUUy|+pQPa&7;?vlNjnB}UlMArsx86$_=z|zY{%+j*bGGs$xh&s2#U`e}bMHjb
z!HHD5h3li<Up*B%QKR;QgO4S2NQo_QYLoyPJB~?uJAe*Z4z(IA1(9K6L)ZzEB*t_<
z?e5nm!*?pYfr~2y{>+g&>1w5wF)Ve#2hR8W5FeVo{}W9ZI>F<T^hJI0`2HGmg-V0x
zP-6Zzj~dX5Z?^n%{}<w5WtiJs_s}@_t>Z6!c5Vu0-|UY)cfo1_{%Zk8vUlAzco~O6
zzB1Z(wS%<QB(Gxi8e&TGW`<T~3TO1e=A~YqabMR;Ip0~t8*V-k;iwh~2DvE5`E*Wt
zq@DfR5+=`#CV&BU(tD?fuP7nJS`tn|L#%wP?gOKQbRzE$^GWLcSK5cPZCUS?@8=3|
zX@U>hSPi?uQfq=Zyuw;PpLpFFTvh9!>lbo&5trZn+<&*$VxZwQ>^%7f1iQ!;3_9eF
z71?r8cs(n0#vXDY6P~3UGYR9CchJ5S7=65%fyDDd)U^5%I_~p7E;96@$DL|Fo1dLk
z!PD|e|Njj(_-7PSo~6_mTBe_!ZQk2ja1i+x`424n7?gsxdK%;$+n;vVc8!adf%Xvv
zFq|4-DaOM!R7JeWRLL%L1N^{>X|9wXI>L`CyUbw|uNZdp&_f<QbZq4rWIyi(tu)NY
zN0x4TH-CrYm+;zY89ZnAOgyW&v8#mWD(>d!f5NK#99jdhOLUxUqDS^Qk#mG=vHcTo
zdn51WgP+jP3<U|ob=;yV3t|(205rno^NM%DbHA*4qbv}hzt-vis)>>xAxrrOnVBHO
zKYt}^OBoO*`~4U-YSX>p+M1~6-7|GD6-bQ5;}}GxV^b>ttH3+XQ|N*rOh&uK^}XA*
z7BUoyg^_~26cY9f&TRKe6zi9Q!;TNb;dNP|7&q1ro%89c0BH!AFvm!Jm`(Ph3AfHG
zy@+(%ff`R`MmyG-1_!DEd<TrajRFpse(d9V=o33_O<&i%mp{(3_b$y+l$g*4IMj%{
z3zQeNp?5YW+21#QdbD&K@iT)`?PHs!#b4M&Z1nd1Nrz^xg`n7JdC7zKB^~xW@^Fig
z|8pUslMZO<J^uYoZ9c0o0NHo*>?m8w_fr_UTq8ft6WgxmZZ%-*LskHi2N;kB;gP(P
zVec5y0}6kY<y~+fxp(LYoDT=>bM*-oE$k%4GaO}2KkW_~E_!1$5@hrb(j4OpW)TtI
zya`sFmcaEaL_#SOBnS;)Ug|KV6Z`1_<dlo5jffvysJtO!{udo)@xMgbMO7mI$(Ws*
zo3Y;ZUwA*|M^!*u795I#p%nhw>y$R?d-e<9lZK&rzXwWnLFI4)kqH3JL<P|3bO7OW
zKo)OkcdtsJv9jHzU74j(U8F)URLmkh067fi8%3*A^rBBZz~_?3kk_c{{bi^6^BCFh
zV7K$^u$bcC(`5dA;B`c)sM~iCHh~aaQEyUS=byg;u1JoG^Ah{P>kFbdJ0fDLxpyn>
zrE|!lVogvm@^9prqRV^xmH?o1sRQp6@j|GhIb%0uj5KJuvlq^`WrUr$)C4ZmaiLF}
zhC(2`s>3ElsGDSWQ-?KXSH$mjD9M3FD&>5DCqr_mP>pf%ba768h9hQ4D*tyox7jt?
zSH%zO&s0vPoZSid)d!Abx4y)Rfe8Ooq$VPun}G5~NbmHC>wkb>ja%;lV^?kgt5Eq$
zj1pMym?BAQcfRrddqG27wpaYWc?<G6(Hm33v@sGwE4S{$nG~wYRx*0#e*d>C0}l{`
zPVkhrglx7DI20?xu!~gysa_iDgoY*}LK@?nZD+Mdyk2y*Ww}AnpfL13+L{b3b;6FE
zzTE(!O9YUqC{W@xg<WFfzu*5ZEJ91B@x9=ct<~T9(pQnm(1@qXWdFG#c6yXn*0FV?
zW8oyXPG3+aK5q!*Y`#o=H<@({CTTXpJmr{?j>&H$<~5HSVKNJ9FiBJ*50sIM2uuyZ
z2$3a4fH8Dn0jtde3?_D!D|PW4ByVVjP|ZD;Nj@a}Ql>V@{m%-x0;-vK>+i@Nnfnb4
zv!>y#c1GA4dE6qjJC6vKK|fi)DG*S+t0Un{0RJ+ZxIk~c8RBqwaij@Af)NC_Us+&4
zvH6U{X>Py>!OszC(Hr;nPteT45S{@&A+1&dB%tA-%74vhhFEgwwJifpxB=WZjw|c;
zs#ISzfkmDGEJLL)P%A;E5p@2+c?tsWVLEZU{-;M*N+@D$W!G&o;;)s|N9EE=kUaJ9
zm6f}4XArU8K}L>&1oE@K3(nRw-wY1{<2&404YGK|UKMeQwDr+N99@3z+%{E32~#K;
z4UnRN(oMjW?^SOqI7!$V<8?1w{zpEh6ya0(k^pw6Q4E3Y#~OE+DYM>guKu>3{RxXL
zi1>24{eH}98w&Tq(YJnW<oxCHH_}93!=Un(-hH(@nA5eE2?gECmwxO>KxO{;12M;0
z;H<vc(k$4DM1TZQa%Od;4ic0maOM`4y8DNxEfCgR8#mSmo>b3@p&%V7f)4$=&oG7h
zPZKxBR^ypHolib}QvLc{+4WM^JcnD>HR`Z+=R-`|u7^a%1E<UWq;1{iI5<4em#m4E
z^O~`&gUyCAz-W`6+(KgAXu>MzLIQ^XAVGQ;!~gDG0$@8z1l!TtpZ}BEGd}ZC9Vacu
z_*af<@ILXSj(GFbL2-b;>p-ngjLAczE7G9hiv~GRnZO!YPDUda6L|l%rFbLi&r|f_
z#$cL&m{FExiwr79@MwZLdMVfFDJeRI@-yJ@<5bL2L1RS#xq}Y4HB4->{jJth%+d=O
z0%NX4XlEZ*mQ#7P3KN<b2SdKfrJ0=m3?bm#)-ps!zW#Hy891#pNze++cr9Gq#Ym-(
zr#y)BzDbNzL9Xx7bMtXTaCO#G6E1c@0wnZ;M7px|AGlM!Z4UJ@8S%S5x~wvEL~(o~
z1U*(rtBR)`M(?L8mDr$Wzb=;8|E&~oj%W?mFUU<P-*(gOIcsYb0R>~DBkS3{&t*C6
z^=eCJN0()#BN7!{9l@L8jd!nTmA7Wz8TDu(x*%v7k|#hoy!NZ-tq31&=5?Um3BVRE
z(;!G)0miaTIW2FV!{X5bm@x|1dBRP%96e9pvo!vfO3rr@-O*Z?`YwWjk|j7Ssur&S
zD=J;BC~*3lh*Z^Ya!zhtWCpY&@H1|IKQ}b_Zvmr%BZXV~)`8#{8T#Hz&yOQUiNb#w
z2=ox|%qC)kN_)~&8Fap$Cc$)vUG1VwdlxX3pnxi+59(y&0`W=<^N{v_LG;w;@90zZ
z*C(5A?(qGlWEa`Le38Ag_W>kw5AA9fUU$Rcml|)yoHg=!K7&3XT`_=C^|=gm>BoT`
zrI!^y`|4w@YddR@Ybaau9HYM@7>}JS9&&p#^TCK^uCM6QNVlJR4;VGu??QEi7FOJg
zFb;J`)W&}>Vrk&%so(3#%iRfHog$G;g+92YKMAXYN=2A@tXGV@_lksXfXTV{h!ZMQ
zKE%=n{&IW1rgXdV9`I$&CwG}I7HBy}&nH9@QN+=G&eiT(mH1`zdcKDDfC*nrA9qw^
zdMd7(B<Gn1-Ylpvv6>sG!|}kKV$zoZYDoRJ4?7>8y$^UPPpFkW%uDG}eeQbSL*)Hq
z5IjWjjG@iG_ws)#WBDqK6vd<1!Z&6Y#>&P(@T(y`I)%*aYe4#M%WKea6X+p$)IvPJ
z1-6OG$}`j0d6Y!SNQ+;@%ruHK8bZKTq;ZEZ5fM|`Q|_xaQCF)}2Rif3FGMl_AUtRZ
z16e(<o;m%`k<CX-KaU&_()jHTQ@PUNisFwCk+WdP0svJYpc}3kyK++&76mS{9&UFc
z^$46UJ@4uaj#`zhTG{Rj#^AJglbugmJ#4!#i!RbPMEXp7-aCKh=_21^W8}T%w}I=-
zwxVTxeZMw9i*N#&_!TK=AiODx>aRf(k0<rHQ~U099Lc$sMb+|P#r|Y-36Dw={i3mt
zx^Y6CnuIyT`SCw##QrBEtR$VZQPG!u_&<8EK7J-p_AC2MWKb<zW};g*;_nhN;Q?<5
zac&f>YS`XbvV;WaNY~3>79|JY-|f6;q5~s&)4t|`z}sMRWhTbi6A^9kpTy~PkY>HT
zyn~F3TJDFi@znp+W-tssi2<$X$9oJzw5}p)lqSXji>+t>l&4+Cp|62gT%?eZPK5+g
zA;ZWVhCIIu3=(LutC<N8<K9S&aYj;eFzz(N{wZ5!j6QG8i`Sq?b8Ee3Y0kjHd1pI@
z&JG0vt|<6jM>$50Yy(R!jp^W|-v~S+OPFDLFQLG^<{;80)C0xy-#0Lc;uz2xH(pER
zTbe?g6cB&Ig!of<9Oeq8#_J8gEaDdtTKhlgBW@h+DkCsX+OU^e{%4yVpjg);PA`{D
z{7lVU8smxib0P9F2f;=T#v1s4vUC$TayJh=uB4AZ6SCkmgsO5i(u3xqozqS;bztTH
z`fD1LpsJ91Ml?Z7Q(Y^CA@k2hY|`o(0k_Y<vI@c7f63o=o>1A-`SN960QL@$l|%fz
zVA4#M?xq{KIhxx0-gp6)V~qC?4S`P^DlA+!bnyq=j#uLSYzgkOYjR-z=^JVDteQ@R
zn-SPAw0tyt18w=Y=~YyT<2HB7@V0js5>*fz7KHAG8HMhKL6*);*PBT}#FVZP@qml1
z$wFBtnO3+_bU))UfA{wJL!%=~Lxz_uG=JUxdRv59I06`(BAW~yFCm?LA#DBA63P4>
znAUs%q4wb2&w17r0V%m5TC~?w{7>k@;e+Ui83!u0r{hU`8}32i_+cH5;j@rJ4IVEk
zBMyFMGx_*1Dl^GT=EuWfFquW}C9c6W=&e)fu;T0~5p}0QR-rw8KbMSxOfGaWR6&g!
z(bNw#;kw_2wuWH4EB2nH!-0dFS;tkHtRoh9@w(+uGIaot;@$nzrSyGvz6aE_sc%=<
zsu?}jEP=H%GC?IuncLE0e9D|>ZjXB)CR|`Y#mlW0c{y`orGV&J_EEQ=W)Y>0X?tJK
z@Jz@bO{qnE7xqT-DQA)|t**k+d9WQqETHXjSoviNgsbAnTNmK2;!|k)w6h{I;TS8T
zK;2DZXiZdYyH8wM03++j)DW4b-zO_g$sc2{UO0FE#-D0D`HaLf1vp?GA1G*EV9l?S
zD9Ch29iq1fN%Q)VOe<JiTPv!!`>0sM6Zot2=#Tp=sloqp0lw~EsL;tmtt&pKR^E<@
zp2D-TNd!T+z!x0ZJx0!jHBATd3vS0B>K%v$7dpiN6^a5d;^r=F<U%UJ2^=z^%Ky7)
zH<*oD>0EK;iLX0qD_?)?!Lu`<7M)HsM$2KV<-b|b5<)M<iKP`dQcBUjlFdqij%%Uo
zf(3W9!_#4}t*=+g{oF^|mKsjHkQ&+r7)%|>M*2V5^qpFf#DwCM92xAXOMoZ+R9L)p
z*XOzZhxk*EH0RJc@3SM~^#~Gp$IM2072IL;K*nr!e-Irl%Kq=^T!*$TsK*T{Zs#Yy
z1|J<AwJTDShgc;vaF^jgJhLJqZYGGg=$ANg!8fNkPQewIiHsl)S-Y8l8Kv(iuQss<
z<Nydkws^yXs3}N-R1x{Q=tk%Tn#SzotMQ0pv~;mau#++xW?6`MZmmYa%_!3?jSw|q
zFjlZ>{k0>u^6*G55T~WFl9CqzJO9FLx<4NT45<M}xY&T&$IDc3m(j}%kpo;5-{$kh
z;L*@=%S$Gi(F%1?<9_?F^<`N3F40Qp&f=-VpmO8sS^zz>8e;UhBj>s46!rKGTrsL|
zCphQv$(92p4$78_L|^u92hJgW`+IDhT(Zw!PP^sT@`UM*j7uGhAfRYedgUFT0&VI$
zAFlfG5v|o1N5$1Z?G^E`+MGd)1*OnH%EB%ydePLTF#SHGi0D(c--cr!C~El?V1n0(
zgeA-H<dvh&4{qe=`OG|T<R+ia!*zc?eLSmBheGS}X8R@fT!PqWStk@&1oU~tK&&Ot
z6MN<6am1=It>qIGh=^ZXRGzqr8FkP~AiutKnR{-}DDz8IH=S=31*hwXjP|c<Wxldy
zb*CYjhU1}Yg`jc~g$=KkKX<G4g?u)th(jtQ$^#b(e-W!^Jc<sKH1Z=xC;tH)#-I31
zpD_|<4R~~qCwT8)sL-oS^pf(j3>KhLD9dsuyal!)MErX9j$iKYvhI6|35m+TtJmK*
z`k&EVbp;;k1i0a73;cwcXpumL5u5%4dX>nxm78jzGU_jk6Tc&Dvb#rfi2Uo2D)tw+
zV_W1g5vp<7pujrZ>GO)-#&*=HTo*A2u<t*o|BD&~uex_3q-#=m&wC-BMmjVsXca8y
zOO&wkW*w_e5CMiS*k&Gh+w3bZ<3dZ>QRLHfT?OZZCZJp0bT<-dM@Am?a+NSXobKSx
zss*76Rk!-pvdcMHh;`AkdyiGftXVOt^MCSdXd;kF8c`33B;-=jWCh>|@%-7_RE;#U
zvk>LGHqlvE3nWC-4V6jkd3^c$={L*<AHu}rH{xnMBrNPX285323<1HeehCv+r@#v{
zn+rec1v8c1<#W<^>-={VRR+O3BrQAtpyqoTjUI_gHaX~3m=eza;Gd@$K6X2J521=g
z5UPp%g7$yn%!rE5jXY^-WV<KlqX`nlV83y`Jj8o7Xz4|=*Yjw8GW4&y3>q*MX!PwI
z*CM&n4ECBS1Ro7nIc&RGjVi6pE$83;;-jr9&Sd!3{|HUNHeK3MCRgtZ(CYF6k`MBZ
ztoYo?faLWQOpQ(JA(HtXHEHmt3On@!2$iQlw2#}Sv3P-e^9`dF&y~f0Jd;&_7f8ec
z$2=o#`Oa#vHrR7+KxLHXx-fQ8L*?z`&Y4w}NrCa6W6v#Qr#l>8wQg?*k2Mr&JMe~T
zeXC#da<ans*I)->Sd_r42*W`6TQ^-QtHfa!EFU_k)T#2S3vLQ74l!fvP-cx6;BkF5
z7yOjwiT*L6SE=q^;1G~VU#^y+nUOXQ6z5XX>_c~72l7v1>Uu+##xgQ{u{8sX{u`&;
z-rZKdcKR%<)Jes1aR9>W%UDic;{t}<t?v6jo{!>2#`~GqoWb93H1?AXH*yO66Cv)u
zVpn|lZx1sZ=FNN=+u@Gy-0QS#gCU>^6(_sD@Zzg@ZLdq9R%Z(bfk=A+J>~-9gitGJ
zzl9I7f)vD8^C?sV>R_5b?J817!+%Kw-#O;?loDgymiVvTic@?9uAcf85RCZlcEWhw
z<yU})Wd}ha+60+snm^O#L4rQH(kYnj;_t|Q%7m1cEix<9LO!k#v4OiUpd?<>Eu5_=
ztMbP(yVX&=Z0xDvI4~H+qF-p9!sHSL;_N#?l#;78{hicv1J<SU!%LQ17jc6J&-Y}!
zTo~`-!b+fQ2RuqvDZanXuoI~jwU_xuI-Qo^T}WOl2Emdc$Q}G@PAe+i&vnPi2*7}y
zaCz35)Y1o&JWt%0KM=o?&-ow-q8ml^ABgw-Yjr?cYVY;PktCw->TukuKA$5PcJB)t
z_B{B7<u~;49j8jGt?uVGAzrxybSqRnn{GLP0Tfvl9Q{=?9c&NVH(<WY9{kiKE;dKj
zgGi{HtTlQbR<hZk&KXXflY?fk`&^+jaz?<94%!p9`TFSosoeNDi6Y+#hxWdA43#BO
zdh!yxyLD+_R8m3nivN4vH#mU}xU!znNP|>gz5k1%^wmuiZ!s&z%6{`)yuBltqEWA5
z%|-39=Vu69#S)YGbj~37J8TjRQrQ$8#{_VgQPw;TxhEj!vUzbK)F*AcstP874SOZt
zfu|m4BcTxLFG`V-(}ci=?-wEkz+N1&K+^ct3V2qq1j<A_kp+gC^UjW19pj~?sD-g#
zuV~4<+g~#9WR^pbNma|0u?{|sXknSMd07Ka=&4<`<eXqef5)DjF{eMCp3Rc=G5@je
z)M3sP81vW^JGKn0aUENQXkvqD20t~kX`l*5@qr|X!7Ha}BF?x_CmipI7_&dEgYzP?
z&9VTjJKpq=_Z@|b1AgMHFa-U{KNjp$KXptQPNB9d#8!N+z~nNDuRr&oenU_$F?5IB
z{J7tWRKbap{5_J?Wa1;V|Aj?$;@6+iLnzyakIwU`EPcz=PD*3c?CS;v=7ovq)Q`P}
z6V6j@3cwYN);p(jljCm}AdKl9GE)v91f@*;(TBw4E((W;%XONM5H3mwPiO?$Cw(@p
z$Eoa7o%f3edB7UmPEo&iN^Qa?GSB4d(L+HG2s(DJc0l#dX>#S4vo7oDKEnt4s)t0>
zcxOrIz4AWWTjV!$&3l9O%EimuEYGumIe2|y-ikJd3aP6K{b16@Wg2Pwu?t7r^!Sqp
z!hJ^$w|Ybn(;+=pYYV?m?3#poPP^>5@N*S1O7k#&#8mX9oTpody8SrbFSXNim6gyP
z(=DE{eEl%Pv*&D(u^Y@OVKC*+F0YcjHdQHnh=L4s%-s$43+76JKGUZON#7fPsN=Pr
zex7qzL|_O!VEx+3fr0<DVR3i@W_!(hw|JB8@r|+v)eZ5;>Jm?*`Y@iMn~iES6Gj&-
zOtBJi`J)l{iSH)?ERGa5DU-_3wWt5=^k>KfaXZR>j3dkZw9=rIWAV$cGeb9fM={O+
zc^;AryB;C9wa9JR)N|VJ@+Nt3A`L?r(#2&Q7-=TezT12Ag{A*7!e~<`Sl~Oe8^=Oo
zc3X&)^>X0xhp}%K9dd=6sy*yy&W&e>$C^ai{z$A&jMc<Ej6om6lo*jOEO8EWW&&_3
zFc>L2zx@YBu3bD1jX#1h488|z1HNydA2%wn2%xVu`@H~bc{t$fRZ6Xhn-;<r_tG44
zp%w~f#np;|*G)#MiWRty>wK^=x<rF0V#J)^c#--KXz^dQ{`Q&!zIB>1MYHAn>X~#v
zR109zIoB(hAg53_rmUW2;x$UtE9nP6A(zNC`FIuF%;MThmc~Zth(~FY2Fr-SNnJk+
zz+TJsLF~yr7Jv%E0tl~Z`gdjkutC&EE^zH%z*QV0xBZ%ju2G6tuz!TGqz~0%#JEI6
zb_VQa0R<W@Ex^2FEK@_rn%*6;&?d)Ii`w{4<Cm$SH2IR%$rE!l69pcfM`3VUY^2By
zIm-ko%@<BaVC3-ubizC!|1fWXlvog27mg=Wsv7OmCSanYmZEb8qe-u8hqChY2nY4S
zj!TC4GxV#wNpoJ6J9emPf7z19tn!4D@ET?CPBXYX_v*7&xPy=RdXd_WDahv3bI5G~
zzdI6c&+4yOW_FmrXbw)*&3=I(4IV85A6!b-iqfuL3gZ!)u><zF93udlVNyzCfl5&F
z%EMlf$GS914Ir~}61oXtQU~0_>U(sbWR2jAz|)mWjM11KvO1v~9t+A>u8F)sEjM%V
zZ0^nuQbnFLFyfuRhrwd$Fr>cpjG@Wx`KW^_9G#_TfAlG_pb*k_l5y@6oYqJ54z2Q4
ziv|UiAE&!QWKjzhLq8h4i}q6pRXTKpZPeUV<DxvheBt3Kqv5Kus|G#ND^mK-zZTxS
zpW}4ux%p>!XzihM*RID|0jAFt26}eSeudFtTSZ7%qhetgK@LxC$dMo)m%tTrMPg><
z%M%zZPCgm$!`?yNc1+Rd00|0>e8mTtcjzWl`w5ZJ00XM2QeBVLq9hZcRHkM{k9j|K
zjmqr_5nfP-FOM;FOMcq;8ezBd(@qPl-D;JwnJt;2!Ez>^a5r$-d2fARdp}a|HCrEU
zP`MbjRmnV>`h95igSCsqrZW_Myifse;2$VE6AXDc6(wOluknPnvU0*}QN|?GR$D8A
zL)}<_DLQ!3Jl=I7+V+pf<XM(xr|qsAfA!7d=iG<4c(h%s8JP9oeE9HzW_GFlh1Z$V
zo{Bl=@d>?hDYviIrn+}kgJ&D7*YjK|X2QIbwtncYeYl76mF*(TD*F)c;S0BLBvlJv
zlVA6;sOwD>mkbL~(NDhE7-RY5io1Ng*{gDt+D^)T2Xs*>`IP2*MNj4(^xihwYI|>X
zMr59w;D7FOQ}xYkg_d67ldaT}8^#}$Kfe-)I@sLL#1uT{<=DsE<-{3UY2#jdg3SSE
zvy!-0Zq>dj_0Zk@<2SCl^}bfG)yl1RDWcK8g3=@X2CL52{Jd~tq;sGp40#ThFA!;G
zjv7AXh6nQRlaVJ!S&Eb_#w=IA@l&|75J`9E$y4!--^Jcr#k4|=O!$;Qx#d^Ep;v4c
zrssHNr&UjW;87jxoS|FX_+lkj<XXy{R2Y)Z9cNomTg$#(#V({ML-*K?NXixp5TkT(
zsOC<(f$<5}dDXWYt-IT;jax%&mCDJ#!1G+y*5%`OeVWUkc^gw@bdE8I6xceYvWovw
z)8l%|-N~85D00I4r<GSm`=g&um!r(IUDqY=X%#8npvV!)|0SnV^2l>ouJ!V}FS^E`
zY&Nn}Brf)H`!@BR6sSM9yU}_+-yI$F>CaZB-Ehx|FWVcf-d+V7QxB5120Vv@Ev4gy
zd)$Ej&2!~5@1R9vK1=0%fcnc+YFdvG(un-5LeDXVB*TFpjQ1^@^DmxHTCe35uFz;d
z_1?~{Gd>(+Sg(_7$>hSeGatzNY3t)B_mS-@uTG(Q`*QhuP1AQ*`v}CsiIRcu`s@rO
zNx!bHav|Ga9F6wqCO+W%ciEBkzs87NzDxJOFS88L1LmrjZDxgdOMRadx64A7Or!B)
zb{jt0<;KDCZI_EN>?c{@zWVTp;WzzkpMRM(eQc6Xa_f5iIj_8nD>a6z{GUDpF5_B7
z6>j+SUe{`tw&%So9K7sJPi}G=({n^@GdJ5OK=HO!dCWY@kmnb#_abjZO!+wXRwc`;
zq#L^{H|)@CRX;y%H-9?kXd`r!N!?nxP5P5^iA*-SZqY!$_>PMZ>oq5ljgFYgGfym=
zqVGElGci3{R3~I?onI|@A)8-oq!;r{R4dGURmP>X*(BWM)7D@>^S3(LS!!E*Khm#H
zvCe&kAM{OTdvbl&{C}@guD|;+CA7CGIY-Eg`Tr|cm{VXj4_Tl4^?@h5V_>~ba>&Kd
z!}0HPJCB=#y3cKQ3&=b!k`FxaBXf5<=pv0mGovim6tXuDxu+;8$FqD|{~YlA*?}gJ
zza1jn0KT~`$__8d24B_?`n>Xr){}4JBRsV67dulgRs*aSd5xH(Lza}CKg}|mr|d4?
z*lFg(GB!I|7GqeIgEAVU-;OX$HW=h>#dvSTjKA1jf58$(9bBzd@Xb(2?5O#x6if9~
zMRSkctHotql{^<GUKOT)=CptJ@nd{#hk4)0X#eNw=1bT8AI^ELeH5t^s<Z8T2!e4Q
z5SvwZ{5e0)L94nm+qz>jymkV!LVA9lR5SK?X9Tg(U3Rxcg7pAN(rw!iK)(;*<4`C$
z@v2#^|L@!$z$flS)Od3vLgK-U@l!W8J7U7^)*svT78?jvt@xc^_p7;M7MP2jpT-u~
z>r$;Q2mRJTV}zLaKGH6t++rgpk>g;|nE9GZ_k;2XZ+5v}AE(b^NxkeGF|XjzG!!u2
zBQplZ4$6)L4)xr-)93$YD%<mYR8HphJ*oM@Fhnq#ve^!@n?=rd%SfPn!d(-+BKY$+
zGGJ(6oJUf1XF6%;Zo#`Qkf=Nw@t|vaV!xg+=ahM3c&jhzZ=YIJV6}L)Oo5G&Ue6N&
zS!Jd$i%)m{oV0t?{5jmRz~7*qnJF}7vo-r<wJhCYqpd>=hP&@EZ~iaS)<2P7>(z;w
zr3oh}klW_5{C0{Hh|owct#g=?w&(qG<LsTZDZ}JhZiCfDucR4QQ8lgIuD4#!1;=XQ
zhn6!wCoXXE?tHv6{rKyLcYohdt-rJDpFBe5zWMFpNfbJpQm>Spel;r_LTb|P_9RQy
z&%+-jJ{XjbF_A})sI_hdT{_44XruW?<)2f-D<;EH+coF6YsRd17tZh0SKpJ>jCt|&
zfG*l1&StH&)ieD3?$Y_;VjF#-PPZ5C1F51|MiRpC?%!eUof_|T+Qt>QPR5^Tdsr-%
zPs-X2r>9!9f}Lwunsj5Z>2k_v`r8GA5ID7l7D8N(Nwq?QH=%s>{o9kW-spjwk3QJs
z-%Y^jPm}#_Q<2Zlq`N-hhxQQA?F&O<Og9~N-sn<m!GA4^L2{FN-aBPqxxXQrtM__L
zTuc_T4Jrhl#2i1r^W%Kt!0c_w@5VRX${jrx&&uT{*e+Q7inGp}E+7-`QcBj!=WY2^
zRMa_sDuhGchj<O8KZi$qc5~XUj2LG8I=?%)ODNy$dTZ-3WT+z`K=f!X_D~;{wlI2&
z2^%jiVSlt3kyH-zHntmHFn-_=*!7rn@v=(&APYfT+iNq^yR5t5%F3j4>mw~kKh}W}
zqC@+O-@kvS>wBljT-IAc=9{dMdCWP+eB=tGW!JLc@14?4_zl}{nR6uKU|VXxwjNuP
z9bM#8qF&T*Bn&RT`Ly|F>)~_vG1j5^$NDtI+JPv`s>E3Dg+H11Wl2bqR<@66BFBn}
z3|39C@A-djM)-u3{`F$G;QhShR-c2mW15d91aG{oZ7!LG#rHhf*WVxK+FIXNkzJKI
zmEe7;PP*=zrrA?ebE4JcmcZ=kXYDWA8cru|d{Lf=6S(g0$G-CV5;v{)dQAj_mX!Gf
z$CbXH-aD<kkIa&^J%8y2-#b;HEI-As{iq4=38~H7GkJN3n@weB&1nU54n=-y%eC{I
z`CNDnT%qThD*bi+ZKCh9@2=NJ+}L`5L!Hw&(suL#5?Ye}&RRu~%gy5hND@+(m`=Jd
ztGj*fDf+IxU8~I{;yJDU=FoSGq}N<dTi=JhhIg*Knrd;Wd@%8#x4;BnXr^tfY5!67
z0ZG!TP{I>rC9YyH>#=PAclR7FmhZ~fE>B*Y{^IarrS1CePsvY@7Wfxu;U4f5k_HQ!
zz@cTM_uBbZS8H;M&^&^dCx=g&Cw<lS`vj(r+VBu0x!5<`0>eurgSazo;Yt<GK2{fQ
z011^z1cGBqLnB7ez~hPMQk3=Lt4Ko~!4^+&^JQ0X9r(fKg8QOlz!!b%@9*KA-^x;)
z?sLoMT*`kOkHCDbr8kr;ZPXM<*!c~4V#=>}h&!cryO4(PK7!1x==IS^>UG4o7Wihy
zg}suBmqgtAn!{hG=zdFly}A@ze~#zT_zCux17pm^Qf-MM9-~wUS0~)~VV`C2p9Em5
z1ZOKilIoQHA@i(q?6j5ls@02Q3?ylV$yal!dP~FQx3>;o-`+8JM`3GVky8=jqkaDr
zIg<U#Gr&IMxrdyW|LiW}JbG;HWA7N=c5_%ccRAeJ^`>ML^XF4E!+!^dE8d=EF>$)G
z=tF+aOu*#W{qtkP{)2;q=^}26$Ct*e-d)HYMc?MNhoBd=Ci@7xrqKD4la8irwcmQp
zc?vd)HT?t5kII-SKr61Htx{52AhR7B|4>|UUl8l=?jB71|M7L*@l@~sKPNIuDTzp=
zva*WEI!;*?I>aGcNeC?~+es-hDmSAtvNu_Ua4KbHZ;47Adt}ew>vN9orF*}>f9|7u
zobw*9_v`h1&G+lwu-Lu2(pzP`a_d%MXV<k?u{bTRt63kPXunF;UHVeIGMy3mWp!m1
z3r_6tN00u^=$++0>go<Eos1`wY1ENh%N6XSH}0Q(nM5P6n8;)BL!JWXb*6ntxsEqj
zu@G8p?AZ>nsR`@gWSe7OZ8(Bf_P-f+JUQ!V^KeWq(i8i-xQ3+4R{1%5O!-L;0tv1;
zF(%F^T=GJ!@SEt_iEeS!5S?qEXyln|_9^-!(w~DOreb>m8fpp>99-q`S=bxq*yrw*
zo#*N4U$HBB5<XETf0^xN3;DWOTH-Y7B`$7fWq6{5(&FybOZmy*mk$!0Oe^XVxottA
z9j8lAZLSO6UPJ9V3X1R$=pB7MDVM?o+Ofoj%R|ahsnKC3Sh*jZJ&l3cDX>J78PLJm
zk*G~1hlYU0zmXWf(HINIOUR*2emk{mCafj&da}es(!^5Li`(rp16U8|9N|WBv;0we
zDwh@}I~fX;>F&f^wVF-B7bA}bI6Udyj7HpzC;hTxPCtfK?`Z>}HM)3$WwSr)U^2}w
zm<soZrcTtcf)k~xjUkw4$)}i-L@_E4lb4f^oo(u~MHp&}u(~;I#_f8hxcPf%YPnP|
zE9B9(h|)=x_7SFx+lt781KZQnG_BR!A4XV5c8#gs+(KleKpm(PBL)rXezKbTmB_2L
zR6JWGj0oTT?m2J4YHHV-6F>ZVr!zd<K4eQ;SA&t|GCCnHT7W`7zv+mBHC3O3H95$q
ze;}_(J&G971J6+#4L=ixPeL;>?OxxA1=t+yZIoS8Zi4KHWd&lCjYuy-&hUkS>%>QP
z!+Oghtsz=LVYlkND9S#r%xiT#Ma=Ma_ioqyE!950vWQK)$6pH4$xqB>hjc$&HK|JT
zF~fr8(!XCag$^yaIk3kr#oZ&jX|Ru4onZdUG>QtD{GO+0t$?>K4`LyBVv;cR@^^0E
z-g|(94dF9Y{iwkzI0MUCuRP~jL@YFx`AQ6-mrn&P&x^Vivll#Pp70+i68Os`&{E!$
zU_ezlYM=EkJm>qx$4^~ee)}BdSo|xTR9c4=u~7%pvKv{oTGfk2O!)OOdrU9~zUT#m
z&v^2aMDfj{ASvf@l-bGAHzk@*_ADQGdqn+i*3lu$Far8b%BOmf>6e9pn9hm1hf$TT
z4&XBoh*S{R{3#tOWW3XO)@P=EhldsWTt{nhuxpv4tMaX%PjyD6`^Q#S#(3mrOyycS
z?E9RfvqEH?m?QflhE(gXqbP%EQ|XjHEvpu&S`NSF9}fDus$kDCd$LcT110P+UlpV*
z^L?kXko|07x2zNVr>iW^&~!5_^;+$axCrmzklo{ZzlK`u1d?&{arTH-Tx{lpWixQr
zoi=Iz;B}$%s}h1|zji+8W^{CPWERe?G_p3sH4p+r41R?jsd8R;&s;1rpA@-}<kr2C
zAGw&{8&>Pe{lXhOPN&I)>4t;@IUnGx{0Ye>w&JDxIB_4)uQCX=*753N?EY6}UkT_q
zELQ3iKS6p@7rp*SRk$;8SFb~0tG^qYIn%hMe7z(hi&0t_Lc(9=+euV3(MBl>2VM#j
z61FlwPHz_s>r<v2)Sf=GBj@hNw><TbWqIy8HtT+U5)a-UP3e%)Y(X!;<TDA?&tDUS
z|4P|1qBWF6YwZXdJygL}{g$5)L@$!CxW%Bo<TF3}Gb5X=h(4(38g+Q>mQVyKfG5!L
zGJQlv(zq6(-0W@H9j8B3(awq8wPitgt%;FgJgMIdP6O5|LXDTdzkS5z0$-2leT`OS
z`0%f-e-&-gx8EY1U+I>RS5_vsGV$@)E+i#6*_41jE(rxQ2`wMbEoe$K1qtymG99!g
zoq$2qFq+3WDf8S%`FH(&KfafHFuqwBObP1z(kH%n+I?Eb<M{kX<0VDagS*w!%<p<>
zH(aKO9Pm%tzY~xw6Q!@OPu&q;-EDGT1x^4F5n<GTm-m}cW~HJ>g*32mrNinoyHZm{
zQF3Odisg9l1ur8qC%#-cC#ql!P4e_%`UDHzhJt3dhVEQk)rgB-nM+ZI+lW;K0LHQ#
z`<v;s>*228M`x<Gp7zeYnrQCgqJMbgv7cpL7z%xGpO_jqu=oYAxjsqV>L3|lr1Jw`
z=QCZCQ&_G)<a~bD0wl*-GVR)tEg>C|{xuTQsE%DY^`<1aU>7X2?}++wBIBv{sB;Cb
zt;umES;_oM)s6wpBlBMSr=oh_Zx#OH*k-)+(fAPJ+z&-muE^4A(M~I9-hJ)D`e?f_
z1V;{}Yce7z?oL6$A$sR;1&#cpg()#xOR+td(!KJotA{m8ZPmTBDxytGYhULBD`5(h
zCZh%G4DI~_xy_XB@!~EWL$w*p9T|tQ?vp{Y;p4zoXMw$1a~*e@>*+_GODuEy+VRCp
z67N}S;(baU;RiP`3<d&U?cW6p_&swJ7GDr)AZ(>CKF+wPR}-yTJu4xoK5KTZ<Evfz
z)+uXeO2bI_?nhv^$SL&UWyiO~h`7<(2i84hZgp?X`;eD5kpf|A1~m8{hmm+%zkl|(
zuR3WpM5F5;HaCC!N#!b4YJU}B$u?ooz)A=Q{GQ$u6N0i4{Q1--hF{JwAk?;%>9ldn
zr<<0|;mQ5#aS6$PEh7sTz>B{}*T-<ro*DJ>KF+*g{Tb0XZ{gIt94nAgP04}Spatts
z!x=b<MCMtS;DZq}IXlP0X66Rjfv{T+1D+&)t79RF#oFYmdQl^TXg=+Qftfku9ko2N
z_qNpM+oNQSDz{&vIgXr68eI<4jE~ZB`lZrWDg4yA9T#~zLkWn(DMLKDqr-6-YAQ}h
zLi1~hmiT;wH-ioON%t9M4X%7@T?C2Cm}t?5`K$pi%wv{>HaOF|Tg9Awa7@)t_Tb`%
zbrhpURl=uUSVr-zl*GMGKHYgACjYS~k##1C^=!I6-=W1{Jd4?7yApmx=(M*e-#Anx
z>@*d_5&m}Wqwu+OkL4k3g4gDOGgtR{P&e-2`|G$`VuGqgK{xYw*l63}US$6~u>Us)
z>N(f%zxq6<eizTCWc29_ucsMPjAhQ>aLP9p7O$lIYI1rRR7o?X>JdOZ06z!sly)(8
zmJ63MmTFrP>ub>bm~F)!VxLa8y6jsIL#F2YIGgm7J>s-eu^t{j#)3I#Pj4xRxk5K*
zM>&*(2-*#G452J_{jqPtzoZjeZgTEgp;>8Sdj4r;$lK}Ef#uk|xNq@Cd8QEZ8`3t>
zE-J8b38){x`$cRjrsJYc%*b0)1`AeV#n)BZX}^oHi<U8-Sh<>XQatCCP;W~M#r@kS
zets&Q`eQEI%f=@58sB)jjo(GS{Os8`VtKGSMha`|p^V4^p@g<cH#!HH&i+)xs7G~6
z5^c^+4vYD;q|d$%iy*Y=3E6;To&-7&+D5Oxc^CP|C;qxA#-X7ZiN)_nYM7e!FQqJ;
zS+vg>o4bU!Ll2LR3OQJ0m1;ZpS%)gy?-va~ep0Y2k<F-Pi~Hagmz<lmhvpia1#Wy5
zr|O{j73=Z(Qcd9wYg@Z`7a@?<fV2n%L2rO9=*6~+9tqCC?@17f1o^TK1WuJKCe8%e
z+^XN2DJA@^g}4&C<1_W`=<K1px3h$=e=jK_giTG=L)ePO?^M*8pW$lW?pd=EmMrgO
zYkA{44zcFf9Mf0alsu(+NzXc#5rt;E=%@i-uGR#ujrft?x;7bU*3jn|^jxDOk!QL_
zd;ZL-blN7ZoLYlVNxW|v-Has&LDQN~&4seh$bl6k!j6}!a{B4f_i?qmy<OKe|LW6s
z!^;}tGW7~t9}(xIVr!1>%cCBn@hUxg52KubuC^pFbOr@B;NNkNv`0cL)pxu~Gqrb+
zcO9|?29|KlOwgp|4+*9fKIef)%X%7|77ye+bne*~hzseVZ5}AaEIy2Whb%htCf@!-
z<v{I+f|rp?wWIc4ayN@zdnDN$W^7%4Xr|3>xg65$`|$PN@frv2w7Qt9repFl^IPfW
zRNWugKLRR$fKbNXnhO@Rq^+~WU}WqhA@1`yo&)WAq0ZcIbJAyInj08Sw#oLcF2lDL
zr;She`J_~;WDTQn@6~WK1C_iSJw8LX&on;G`$b~2I;s}>WrfuiLp?DYdA4_vsjnI8
z>dcL&XP9u`27VOlMI|h-wA-!^6Ci=yt0wJF$;W<zli^#$l1gz=ku1d407gwFQS;1K
zE$>V0bq_-oY^vR@q`^V@yb(1|cW>xRq!|4srmiDtjR9|y%&-!&RmDb@Jb5%9`p52d
z?i<RAxDs;F->s1!XI3_k#^=FY9Z&4`BZTH<;&tX@PP!t2iL-N9lcgtze6Ym0$OmUn
zeo+Z!CC%T{ZFD<!&%ndL;i>?%v8(khRbL~5S!WEYmRq?j7qNy-gitoXa#yO}=PNG8
z8tZHG9WCx|Ue+j1zBsdL9zyd_Tr0`*M6OeiW39JC){lnfIj!kxwAZa2y138ohFfXo
z>;#9GNQ71Z2)Vqrl$M?4!u3xhRLafSab+aZ?xUI%F7VwJT%<lQs92R8qCY=oX6*NQ
zBCc&;e=3@!ap$PK_vQF_p7~<jE8k$)HJi%%8TTz}_~v|+`pOH#u7gZry_==#gO5^u
z&c-=WI(=Hb{<+BofOC3hiyY8o+i4I9jMqy(GUa1Yia6KkBf*2L1D;pK|CUP9Kjw5O
zs-8Tfv)S4k(&YPV>qJi4u&Wy#&D>Pdh#vepG#}iBY97x~WLcZ=!IZwI&=6jVFw-3h
zJ{|7;kBB3KZ8PFz+jN%gc?NH`PqMVv{pTi|bp%^maMRJW#|9f{z@$++Vew2A#(4!Z
z_}Ta7JSPpX4Qpa$HFP@W`#UvCSCSRC0X(!gIc6$^4j*P|8k5o8Q^2?2>^^@KAMRdO
z+3YO{n;xF1mx9?<&%&n9e@GG~Z#o7w!71b>@;W;^yFTyDiH*DF6oHsdGN!#p-SA$P
zZH830)VmPZ2iz{<C&rQ;(rmj5&@((3lM+wQz`Z9C94n#)iYSeoA`QI`*fxx+ictNO
zSJDyq7E1xptxoy$BEsVT>jw-h(eNz>WyR<4EfEc{czm{}UPI7(BVE@GQJ3WDiA{KW
zj-;Y>0g8U%|L1o}!%PF!2cJul0E-o<{@eOqH|ECaviC5IhfR+Uflp4kzyt`_+S&a7
zZX(nhHX-&&Lm1fvXq`xnZ9z)w&9;B-MnnX@g{ObehX`FIC=AX0hHeKpE@LC5qmIBY
z(l3XHAe-HeMBa99{KB7eB!cy(*<HO8`Boih^n?R1$mab2qE=`w>>~BPaX%nIT?RpT
zbyApTb-HAw)s5MI6^ZIP$*;7oE*L+wdg0Jv8E#xf_5Tuuvo0PE>iM%8Md0?GGzxWx
z`3Ds0enJqSDDSo=MPyEGeY@IJq0VLI|99Uew}3i9Z0FfQok9-Px;#E(P}1_DIMaBA
z5IuL#kL!p-0#cfR`SP$duc#<3x!W}XA3+k<4ZO*~5(oR^y{{I7?9YIcBdfi$v(qsv
z(V?;VnD%qJIcpmm_5o8Z!O9t78_n*E34I5Nr;(w-i}b;WkF%BakKv!aXgvC#WGAWs
zg@r9!k+3(ssfQA7H%py75SsU_*vAt)^K5=l*6d<J-ol6s`PYOiR9Ul<mMV8$B&501
z*IS#v*Nj0Fi?h)T;tCz2^bH;uG#y9>a0teYRg@63?OG4T=ba<5Fg^~{2l`daqd;2u
zKbZgoiEsfl#iwKEJI=s4?4g?c7JMO^YcV(dy^tHVYw+}$HwhmrdbLQ)GC71&oIOxK
zmCnx`a92?j(FlJI$l?P;JiX;C^2fS7%#7yn#K1HJ;SUw>>C{4@+tF8yzy6eaH}=t}
zGrMY@#2^F_s)g-rP(7aVU;M%n0|T-st35{syn~sc(xG&AaB$#e8s|$szuS7A+2y^g
zpelC)c_{~Xl?!$6{XELwCd0K$ZGEo#H&>fb0>2z&Iiv<*5{wS2o!MZg-I*hErPG(%
z)AL@V<&H;YsPCqTUz(1&rQzB|3Q~XprOe5J+A|Yg+cw1a`v86q2vj=?n$6E?DkmI1
zv!c7aS$L;|kKBNGpC?vhI%28yu1m`cne99zS{-D5;Nq3|)$~7kn!th_g~-{C(=ccN
zr7!<rY{=mE#$h+yc!l=ZnUF5t1wG}|b1HuleZw)}wE@rce%c)a-t@zA7OLvxO&e;x
z893OZnKS%pbab?MP{HtS$R~+OJ<v1%-p%h%Ht<CNWiW1HuN48p2Il7GJU$t5`pnCN
z78Wm%Ajy9+Jb6u90e=}u`1`~R=Q=Ypqt`qTa|fFztdYc%z8B|qNhM_HYOdzFH9G$X
zLm}hf17J&1TO6E`EuA#cWs1xT$r_aE2{Md0@?K_whkqtkzY`yK)GrNg5dVuM|M^4&
z-1Nwaz?b`B9uph9wFtLMR$fRpGLmt`k+d(2ZdavKbHoc6|HnGkwi*RG39HCkh#2#O
zsP-GG9j$pGIsA0Ha`$rS<uzUCTn=3@s6(ub|2>X3;>51SXRTFD+K(C92w63o5zKX&
zqBr4+%uO3B@BNp2{k<8BlK?1G3JBj5w18E5a&#{yQ~}3PMYAwByTl)pp5Ex%Y+MY%
z6-e4AJ>v}6g5QiX)R`FI@5xSf#y)3ypbFvCjo;pQ$kwMoi!%<UBY;7%bvJIp(#j_#
z9fht+p3Tv9vmnUFTmHYlS2cj&f8U(y0^dqJqOSEZ0bak*p@@uDKR=~E9Z?W--Q)jV
zfHyDDu2iF*8@^?&9#v&lP(U*$pdXQvrrxmS{`)vG-S8B&rLqk(GY+@ljTHJs+_Uv%
z0>7D-U;q-=cdTIn*$XR5k?Cd!O|{+5aos$bkU`nwTfVGigb8Qy!^=vPu9xP|lcK=?
zI@pEXU<YV+<^tVBcHz^L=U|Wlem}7%FB??Jn~5^T3ym5}UCV0xQ}=A%bGrLoDfI&_
zy8YuAOIADu7$MtksEZ*<G<gOh*NMiNt#I4GHtDPA-Mz8&B-)xtp$xw;t0^mE|2Qf9
zF+9z5>7JCsmnFtTD|=ldH{nykNSh~##XZE#Z83~b8S%u{PZ|V&7J0cwtURiM<ZEpX
z(&2#{&zgZH7$c+k!13S&Pf6NhjoAg+W=JV4Fu-o*+m9o=J%txCx}3XrTvxkD`ApT$
zbK*#J4hA{Mz)QEsg+byDzEJ_8KwBtg>1%}hd0Ez1+1aMFzW)B+Th}PzwpGBTY}d%?
zl)2Jn4c1M!GMZ*-^7rF!A||lymz-xXS`kf7(LTji3XWzM=de9O<g>hT<WY&lt+At>
zNl`x#s$?o~x{n1$2EJmF-TYw8$;oVC?VL3C@NS)neHkHAF4X!{UO_X0X+bFpQJ`z7
z${<FILSO82U_oqs%2&~RzC?L4mDN%6E(d4#d$I2KM-p&&db-OV-L>AggJpFXrrLW<
z^2NxUR9Aq}%N1d3t1vo5KbG22E9%{H|06A}>UrSwCm)hEVKfVD)MtLQ4V7!KU~TT}
z*R4in<SZ$yTb+|(_Q0#SW?IfQ%Qhut`YyI$O5y9-y0YQ5%-<&gb?r2sg<$PrU8a>Y
zvuxS-@)$Ym0s)H1ArpK(U5@TYfPlcm$CRW$ioMuykldWeuM2pIqIi4@6`3PVZeleL
zXKLmd-}OLTio}Cp5Gq#{!#_h`60+FpxQE{5F^#{HY2C&XEwq7Y3A~h)Zc0a0SwU<N
z0*EmrvbH4^7<+M_xVs4-2I{cd6-7kDpOR^I3pO&9H5=|q@duuh)EY}V<~y@KnF#|d
zL(R^V`N&xk%{?)WsGC^E&SYO%%ftKMMT~xW7e>o42i`(p7>TjsP&V2V75DEAYNPn+
z=JugoPxw3v+>m;Qaae+(#wqiYycJnI@K;d-4bc<lxJu$Ka=eSU8a%^=*}wpMc$2?t
zO1xmi3~h1s4en3YKV#9bInI{6qTiG>@kHH0Z{D|CZ7p=Oo#rpxH=oU1ZQ03LSXWNT
zl~@YSYWRr-V(bZ?=|_ycQmT&R@7d=FWs?Ssk(UdUH>lhoW?mOdFiMMVU(}wmoS#w*
z<1Y@1<c3ad$dt~^yoJgY+Cx5{m;~FOW(+sF@EAPWgs%Yd{*G&m&}=*2W9pRd@Tpao
zyi+nif^8FtGOIF*jr0?CcH$;B_Mgt*L;6!_Iw6@~rhaIce~a0y(5n((*~aiA&Ppb{
zU>3>mr5!=^2@PUT`d(?(<+2K=LkIV>zB^P;lHtF#50nFP@=pt-v;`Gw9nt@)q8Q2A
z^Sd-qTPTA`X5Iv=(Q~4NLmRkb>Hs7{?`K(oFq&|oY_ujUSrM*M;C_5<BaY%xc>6nO
zqs%aJJj4^jgwHa2^L<}?+#5=l<iw8qoA78*TjVMm8dZ!*7YNiASAp8&c-pwF8_$aK
zU?x8?Na!<DVF+#+e|C-Rr>QFM$hODj4i2k?0*T##lv9GE&IrTg6AEyy&q$+9`ozp{
z94*S*GYU&DP`GAZ+(X&Fm$IMv2)PHm8aTezXTRD5kcyng<0SGSNgh{pbr)<=hcAVF
z<Eao1LC|O+7U%KZ!-hNqVT$TIyOK37q;=PD*vgAKd6ecZa4#Dy2Dc(Cu<Jses?HY?
za~0h^#}m#F5)z#Qf95;0T8FHOP|L2<9J{siv3f(Nwv%y}k(psCY;M!ge$sJY&@{ni
z^3X5FjtLJA7hcKUhv@rt7Gkx$NjM3WlC`!zE3+UTCi9Po7T%TV)PcyC0e?9lm_!H#
zf&V0?*Fh#^0W)t3fl-iQc))$10e%zS9kl4eO$R18Y}Ip7mQ0Za6dYByYS#vd3}mMB
zha{@ffynl4B>@NtF=Ze~?3<-!Nq=Q|gVD|8DeS<s7lX&;^HYWz-SbS@lK^H$nF7Lc
zcqNo#)s>Gvl}38a^m7*?QtHP~JRl1tizgwA`av)&JE6w{g^UIj$#v$(<dY7P$Q$(p
zqkAC%%8m-J$35&s6e=4y@9pE8-XPBVC;mr8J^`0qWYkq&<Al>XXkcus)NG|fl2Rlr
zQTpze`BbF&MYw$pMP`5s9ocq>;Q^;DhM((_%A51NSs%RGb2A*Ge+$X7@Rti<+nKr}
ztc@|nTVLS07S;HC2#0eWqy$!NzHPCOEZz`<in6fO#Q&tAe-w{=n-B}bb02>pMRL<b
znJ+L6pXpXkY!m%;CVG+b3O86i*%YmYFnJDNX0mT>R*1Ge3dHlOZO)5_WOB6qg{kME
z7HrH-s=tAJks<UU;Glt$9umgIK!K4N)9vWmJF=SloZJ8bRRO0|%V$Y{OTw&h7oKI}
z0E69^JXmN9W~SH@m-|7ogD6TmDWaZ9yUP0u;-V|o*RS)u!aB@P<fu(7-^R17TCI6B
z1x%o7FU!kL*t$46zMjl7?8^Jn4il``R2ASONl#=RSrkc4^u;!N0!wpCo7wr1r$tU2
znax@OO3gmtE^?lX^xyXUA;LF=ZIsDNe&_ZM3Uwaej5a)kF;N285@oeE1XmBkjGD*B
zBn*B1l<SKQHzMJpkW!T-vb%Mti^o+t#?>Ahd~0!N4HcSz(oKBMC7f-V-E9yLQN@DR
zb~a5`2Ko_koW_<s;6#ZXGX9uwmB3SC{9*`V6lX8ZN)-tG;kLvdS;HI^k5YHVDo0%h
z+X7qpT7-F`r11qO!?Wsa1ad9;K97E?0-Xjr%Ja955;=>vX5rx&uaFx~MxdjWHvzA-
zLFaD5Ye42^L80tBm8nJ@5B+AR{1cVmS}l-MCZ35BNM!u{;HOAqn)T{%v8??&Gnh5=
zCYU(>JMdeOvtL5Zn-xSW51coI(qQ+2v}kN0lVB~=j*93z|I(hEb@u@8n3HQr`rVc}
zk*z7IaP^DZG+2{PfOpkaZ>JH@FP!%Bkzb`*2J@?tMsEaqG_{I&0&-<^%;oz_VO8tN
zjsdYOvm<{pw6%$Tu&K*qd5_d)L<*67R`JosOR$q|=Q-KFfUoQG=}O9!82y}rMfo2z
zhRv#A>efOjoA60ArL{Fs-gJ;@<<~NQGEulq^l5$Tj*TYjX8f?)ZGi+PkA^6$og6*!
zFDgW5cvVC{+_*{;1%Rc4tOBIK0p6dcbP?Z{HAb05UqS8ij1n*2aFiMNj6eXi+ak{S
z48#;%|5UyBBSm;)lA#o;Y}~n#-3U2Uly32|#m*$fKTXBe#wf?}1M9Bx0WB>ptz4YT
zPotFslpuv48<D``T9(t9=|Zlmj@FbUG?dQbCy`dl#3|kAe*MRdl--19f>p;;BO+66
z7-g-S2H_UPr>l01!(oel_25c%fI_iUX>dY*$64{UDS^o)3+DzkkK7BI>DHDt!^H!$
zjV93Qb0al833ZggMLnt^!hy^$=FJbZoA1|X=3GSCSu8h_(h8i9NV1+XU3$}gHi49f
zPXYB@&43JKf#WPu)j>}M3ActJO0{`V<OGtvT)+6b7)<+C)svC?sXxRapGVo_vm$Np
z$fgBJBbiD%7)*8BKZD5-d?D`O$}d>B|2iw;qUmTYPn^lqzF%IkepEzZ1W;-Vc0WaL
z=1XDP+;!)7Ex<yM$C(Zd&2VziB6atL+*Iv-p=mXd5sB9r${7%bT^Cl{gx`gaHY*6e
z=h$G3nmiCSwJW&)SQmLFyyVu{DyLG}$INcF5wr`W0G;XX3rtgIJo3gMO=uLb$3{_D
zt&A8&$}5SW@wj&pQ~kldvOm~#1i3LLtbkjMEXqYNCBcMpQpE-i?P+@UbK|@tBYsGZ
zABW`lk*!2fO;T72&pYqm6hekrK`264D2;0{xxSJ$N}<O}lYrO)`1AOzLFt>QcM<26
zq!C(bg6yAj3P;?4U%T5nI;0vVrcP?jSm{2UTgk$v=tfvY{8G*!b1VK3UxDM~o!V-$
zfc{{*AVCb$UVh~_f4ko^7+}aG5h;9iP2_QpPdX*hRNo|oqKp>;xm`=ASYx6r!qRa=
zRb|4IiuzisuhtwH%0AxA;jb0T_4~}d$a$N;K*z<4JzD?|4u2DXU`9;0!0O8>JuLkH
zH-MIH*Hv>6-5$ft3xXw;o9c`0_-1_^%n5&YRw79;0I<bl+{4mOfaPKpEk$;8wkd$#
z=iK~qFK}4sOHY&9ZPbP#v<#tsfU%do&np8pq42;ul*jO@RkQH+`URTl*k%L<QYV^_
zA~}R)9(Y%1NQUP}F9+T`LNfL6>9E{_C-x-SD5pzJE@oV0atn>U#sF=~;a7GLL~ks5
zbp(EG5{$AB9-xb|eglbrC?HGlKevY8lK7>~INWx%oup|*)F@6yS_Y&T0f|f+QEI`>
za4lmhF(2`kg25gQWk(iM=YyFE@f?1bXjWyX8<7%p@y2=bB27>z#dia=b}Gzk{Bamn
zWt95q*VDP|qN4Cpv?WvTu;uqf0GEf$q(qU3F+S_jx$@H5hr`+qs_v`vgM*SVl{lbo
zwxo*=5<l*x%DNa~$L9m}MeMgXJw@A{<xpjMx;b1X7+N{6*PZaYV^DylN9Cr<j4Y>o
z>tD;mymSL%lHQ0tMPsc5np1Q%9TY2Gm*(;Z8B2=<-X6xPn;%;nKTjHDhwhhJ^B76I
zjr>H5fmgryh~bQZVkBLt1L83$P(8bA_iUf22D%H0>01EWDv4^bB4bX_?iS>Eykjml
zE~e2F>mZpiF@53TM!*FQ>7x0I$}O7z!|3R>T^w1J*Mmos#kmKv1&o^CA{VXts0O|5
zzMEHMMKaa1HBU>(#p@yRg9C}|BcZ4r_oL;XT=b(*VmYx@e(C&U_exblLiV>Ch=%%m
z;mDDrP(ze%Z{HfqQcTvSX@x})j9Cmr{V+0C0UoPd)hxA-`rqDDTo`kDEU$6ey!oi5
zNRGvPx}@}wH0*qR6if9qDZO_c0;GFXZvIqJN{EX!Ro-!qTWqf2&#;vpH&iVJEt}x-
zj@qU-JlaC_T~f>8Nb=opx;S&qHXmB8T@O4yGz^XnP}nZJv>%FX><vA;*^~!adfNjK
z2BkK`zYt8=S-!>j`#TMdFk-6GUC)2z6e&>6l*yEC-&HLW4|}x0{p_b1C&DwfoZrnI
zYRi(`A&|q2zeef1g8uZhZ_hZNzV{K;$!5pO0FAVS=aNIIFd0z~j7zJF{SfyiR8wg$
zb7@Rr_*N^D!hQ`PtiFMeQBT0!I2^BNHjs6dQ!)4yy4iI^F-j0`_ANqk+0c4dgXJw%
zebYWta-^;ZLLRf9(vdUM-P%MuS3MHMoE^a;1qiZXTstmKqGpZW)Z1iUu}8*N-Os(d
zb4$^Mfq2ej^jie?fpG3JJBd3rq!_p%0FT|_p|{AYZYaQ6q^KV}izySf8sP3RZCHQg
zq&YfvGEC`?T*y5e_y{Zgt6rBziz(FhO~RSe2^B|iW$)dQVMJsAf4}+r>_No*7HXdQ
zfF<?8x}ac%fK0^Pm6(!J_+!1>Fj2!);}EdiwRl<X!BcVRi1y*(;X@N8nqPpsd?Pe|
zE^AGOeOIkZcuBKcRwZJxJAZEWa30aLJ}-65Y&E^Cq*ENBYiI<V^3*4-Kx7ps@Ma~}
zvG@oukv@8H{X2FE!4}I03B2V&#RdUmp9P7UQ^WkmmU`t0g-!$2l;`o1_gu88TpBzA
zf7Mq1=<d56Y$UPmvNy>kv;)D67Y%ksko@G<QepFfCk6xZ(Gblz`C9kKhVw;ss<|5x
z^wn)VRwB*5=u>FT7Xu4o-1z*vE|vUarJ+B)WYykdOj%uQkgN@U5Tvm=ZJ2}>g0!Xb
zu}Q)9*L=z?|21s*R+@!jp6!8llJis2^FMD+9=OXV%-%FlX-9!t<rles1FM^vkodmH
zB(Q`FIhRMw4C6<6HV#9$OPRGJpy<Z4I>I0wiHC^lQ36Cp;fVe$0oBKfy5&42FLdt9
z|7@b<<`J2`fo(JiG<(A>(QJ=ssx(Np=Da5!LPq}wEKOLDLdiu|u3jkFtw&0B;WV}Y
z^n0+yWu29sAKN<bchTGGM;&GSzR{h-dpoR2MwO~OASmmPMCLz6JMd`7Txl!d1M&AH
z{96$-pyE4~!|?NNE|a%VMzV0Y<Cieev|^rSGFE@3xJp*%C*Kui)We_SldJB2%O7Y{
zf}4B~7`K#gU*yyzInaGbZEo$&R;vvH8JdWx|M2aV9B(91eSjP`VtGp_I2lP(S~_Vo
zQDWK!H4c`M?|Dwq4fKJ>9FyARwpCT|OE!O7kWenu>a|{pcM-+r5km;?FTqLv;E<PH
zM;nA47Abv`=Wc(y{A6uEYE)e1S%rm#iRj1OTyGgYqA%5E`JzpCEbK12iw)_m^r)59
zY1Hj|lTV|{_eFD0ozGFG;jpUe7A&lM3L=+lmG<jo3jdW-YvP~kqhYx6W`R9UJlHE*
zn3Vp3oksQW7tNV56OBZ3ocgiEdz+<0Z=3^u!_z|8S@njxJ?Re1KW7S>XJ@3?YlQU?
zoarFxt>l?fG(^VzbEC2DXVQr}3+K@uV^6i`EI)Ke(x{#^p#!9O7ZG3OsCKb>nv@6Y
z0RFS=ZrZz1YY{jp8}1V)zVo5gID{L2gE3y5s&CJfF4JTMH47YlN34SoDu#)dZq|>F
zlBZ3yBPrn<uu13(koR71t^5D1^*4`@KEafrZ0QzuPajX~&CboWr$oPR=I5P}-eG8S
z8#RBU_5%Oy<A&!`&ahbAj)_+!v8fh7*UJDuQsf8O>mLK-AJbOz5rcLJDU|c%O?fn~
zer<2x+s=Dc2=74Y`(9RlM-dFDl9>=L`9lEtZ6;`fvTbVtt3Sqq(e=QKSWqCEK^eF)
z3%3nMYwmQ4YfeYv_H*35jTFh6=T4bee~X|YmX_!PspDQ2MQ28s)E<gYLDJhMV!&Q8
zd->@R)cE&Oq4ILq1s)<J+^H*Oy_NAxc&^d&eBP7VuV24DDvYwB)MI>5@@wa4#WndA
zmoH*bjZ<%0qQXg%aRrf#s*A0b|Jpuzmbyqxb5w0I9V>*a9#*2|zko=zpXIC{eJQLa
zrl4cP8bSztFRQj$N-dDB_4HLi<X78pAl!)uV+{YK=-R?f?tpPsey}`9C?sgiy}+)O
z!3&+Tz_iSAiz$fqoT%ZTu8F|6h%66}V8IVhSJ;IvO*<C$SM=S2;X+>l6ZlaZw{GOC
zzez%fr_?U8z=q`OY1f-4p5!<XsrH$>wJR1=|CU3K7&l^qEub`p`nw?0Zym_$L~MWu
zECVyX{^s{T&J7U%FTrl6ixEeu+$v9@A?E3|e~14;b>%3F;<8(I@`+hYHV*~qfAlWq
z3f#>ZJn=3<D*FSHT7aqm(BGZWH5()0>nPH+C-i(>LaLMoC8sszCn1q0z+2CALHu&2
z9P%Sb79D>R)U}`0+noQEd{)&n;Xlc75q}2T*zS1k0g$Z3_NaxT%GIar=6loIv5uQy
zIu^t;SgWP}Pu{_6nb4h#Bz|})f1i%pzI_sN1<G8vmc9tc5APDs7tn!4HU98MqdQ1D
zX@o+c-xa!l(28`rEJcJiI873%VmLV1Ljws}`EQd*du#mBC)(glibl!9Fnb5fEtuC>
z;$KXCLjloDRYWL8<<{8CQxXofHfNUFo6bvD-Pu_mPovyMv(3;%m-h<ew}^O(@c1%%
zH|rcyQm+qCQZ({ai{?g-&OFb#Na%dBlN*UQBPV|upT#^~KNB5=U6@v*bPK-c`o)pn
zDc(|_W*dT-v&F<Q`+eEN-Ck<ctJUBK!f(`dz}8C-UEaui1WGDjqZ2mY8{aU-HXkO>
z8YPd%2`@E<%-)P-^qADAw90$^n)#{QLdRX#g(nW-d6eSpqq<CWK4_t><!Mqv^s6>p
z#MYKKCw3r<`_DR7z<@|hy_buMQs|lrhRTJ?;4$CDXHevBH;`U@2VK=s@cOmHo@*m7
zd<fhg$`-73K0mbYEnFf}x?DMiw+{j_VT1&3SWna-axq}t`B`vf0%rDzs3?g&w%bPj
zTP7bKxPCjkR8B_LqZ`xlSNw<e(DwG54>y+Wvtg~X%yJEROs@7Yh5FaMoewOb+<*O`
z#M35?KWC)}gB#b}E79F7Ds?;@(G2Mk-!8LX2ni89*lPbK!4i`&aC5V1>ps%2)z1Kt
zcLjy7TOH^kf(%5dAGy?v1-@!kA$?Co5r34@?XGM5iC>@E6U;7#34K_@q5g7<<D`|W
z%ciYc*Dzyw$z>_mx$)?ICBO4&BHH+u)G*j5sqFa1(c-m{p4XfnAeGWq>jI+H#AJ-E
z8wNpTAI!p&Gt>Ju?~(y16vFUzJkY~Bq?>E}EtHDsj_a12pB1$+a2pW)(ntc{oBnrO
z$Ub*7K>8JO+14LM12T(<G#-b?=Qv;NJ{g&omge?@(^|IBvfbr=$g-@3?dZqu7_fB@
z)#&C37Ha1R=0*Q#$WSEKkfVrwTp$7=k4Z+u1|9huCwn|GPm|G#CnHa2JU{T@gt10j
z^oUg`3JhV_=ESKXo6q?qncAyMrnEhPWKv+}%KrdK3RE0Q9^UAf|7-hGc0C5-F=VTn
zdE3B_b1+xw&BY9oYxJUQVVoKt!+#0;Z(L$|h|mvs+O5E#80usNvdvsa{G8anMMT?l
zRU)TrR;KH1<@!sN<PAr$!FbmU!q*Co|4l@eSA=*kv?7ys3%!Zo?Gs`}RQU8eVrsRW
zQ*ZY0z93aX()b~{>%Tt4*Ka)9f7XcMKs~2h5J<`4>TN7b#E!8V-nD!dkv(v;k&Q`~
zoFSbC7h%w;om|8E9|VUyGdD=*9fnhA(HEXpcKIV1wZ>n^Wv}_COCwI|pCzmRO#b`8
zJTa}Plsk0LE4SnO<W8E(>k~!c;o*n663$BBb4})LHxI)+fBB5N-`DigLv_7+?$j%z
zWF}khVEpF}PjF@Ok7xQOU8_?rFN?L}syp;Z(=MF5Ul+16AJ97;NOuTe8O;!-OxABl
zFyueSfueGg=1IepH-m^!s=k#%L6n6*m5ZrA@%(WRah`yOB|8+EQYl>q{i57R2ue5!
zWmDPYWB3oMK^+_+8ElWZ!tV{ZQ_NMT*l+<hFI~<Z-G#S*FFQo(BJkn(6WZu7lC<an
zE7Wff_uiOGShrUB-4N_UVn?y~DA(1yuHGlUf3VQ3E)3<U%EO(^%&L^G>bo5Dv(Wox
z9204SCa#D@&n60C%nj$^wfg}xU6qjnebzfxsOcn40X-BzTg{%U3KrGNgG*Kn_s^o)
zcMv_X#(PGQ$o97)XObG-qMAsq88R7%!M!-R=%GAsJw-q4DKAmIFf(b=R#b<SG9zRG
z-_-`erUJH<qY+NHHV9|id3+^-$<czV<ct-|GtlyNN(wH2pk_IwxP96>JK1!F#YiQy
z@b|hR45h7uqhnBFj%H3=GlOZ#Qg8<Gphf-iZU1j{_JDnTYVB`vl2^BhDzYI5WkLsw
z)nrk0`$NLmcJn%)={c~vg}Rhnj0BqSyL(IxWcuAqW?u>(W_8oclW9#ZzT$;ed(UdW
zZ$#P}F9oiG?crq?o!zHZO7gO^MXe8|kpsSrNJ6;IWX(1Gd#<*>2rVrwc$Ka5)>p+0
z$eU1Ai6Ww7{P$nkEH~g`?sd|Q>J9|LbCV-2>&$VAoU^7j)qh=#FvXeh^eG8cF?BNB
zf#w-W&k|>*9kb4K?jzTSuB+}@cL-3)5L|R3Q>(R<+Gv(c|0<N?NzV)0?Sk~tG)*`f
zi|{DyoWNEMntAJ6Gj_Hk{9t&2`Ct|-$kD2X^>%4ldUE@NO;Rv|mV_4o4SixA3u`&J
z-)t@gr1N2j+&6Qkb9;R_QuzZIpb%x9^zx-+_GbolfWdU!YcAhMwRPs|(=$((4K0t!
zB^$dLxh%N+zNF;`rE%ulVq*O_$|F>-wjUPDQrYYmJO6Ag3~7CuGkL0sKFvBY$zQ1R
zvScO>?!~#<SKnn~rSFQ`&cAhbTk@;0PpNcEtzjSBW^3;=E$nfWMfi^wR68Hzq#Lq6
z1I}eC{+2<byg0M_Zj;X6gHDnO1gjUMi4~4+crqjSqJfXhg802Ov9k=gLaAPLOf!1-
zh|X?G-(_vV=UP0&2Ya(?h}J7@=sP#eYCS{TN`}WSx6dv%sC7n;D9fx04LKT**s_zj
zKNCv%%a#yAmO_3YVs2H<BP{vXCX@E#+EhLrtiHoiM6(@dI||}|{if_c0$_bEim0nK
zBdImBs21sOlkPZTlwTuS?2s{RJw#Xf#s5*DO$i~(<i(EkXQb;9BF7Om=byL&%2yvh
z_fyEt;xRgw`2C_{eZswhw?^HY^=5-u=dsS0JN0J`+-94n|CwuTc3b9hv|N>({?_M0
z5gGLF49d-EV(L=q-JlkvCF_bZldQbpg5HdLctoLiw!?1Z(EW_z5SL67sln`1<>crM
z8<h7>qPJ>KINh+a@YC+<!3?>L$BUFfR=&s7*B+Hkk#?QSH1kmsmwX5=Sa3DJ%NE_4
zWiD}Cv$A*?S=^Nn_pb6{BcxCJIO}kac-$q~8oj(F{p>a1!T{&b%l%S}Q-A6$mTb(5
z<d-b74_R)*zuFCDynLeqZT6v%)<-X5nOZ_{7b{1D#&G)>Yxh2<y#47w>{HSbO*#N3
z8uuF)9mGKOaXTkg?6n1L>Q1{DZN?B>c11+&Fe691Tj$)p9P99zH%9enH}nnNmj0a8
z%BGfQ-VT4jERaSiszx-C=5|xN$Mx&i#kR99Oq>6uSSw$3M;5~Vb8Bq&7;*`IuA7cS
z%JwF$9qy?)Bi(@T;}-c9iDjwC^5kES%(^B%2Wn_=@}}1-sbB5NlZsdmxa5_);cw4q
z?V0j1KRKa1Zt6Cs7I8AuJZK@EeChU|%MYQCG2+$P1IaSDke_;cI(#ZcUKQIG%xhr_
z8S^ccXWpEe8FJflC#y(>W0GbyN4js#9$UvJL=_X<rqb-lVx^bMLo_ov*9ZN`OCSRA
z<T(zE0P%LCCaScocfN1C%y5wHeR)RQ3u8Kgj^{!HT8H1h%FWFs#@Ode$Q^!&Iyul<
z@XX_s;d2g0!N2kH_dik(W+o>Gn!68`&v;lbq>6j|oQq2GRb39ezn1eU2516j1O$!y
zU}l8~R!>|)WazGK2LUU$^Lt3CxLBdAGAnXTOm^1^eV(}=K?Yp`A=&>@unR~&aN<A)
zS~+??Hw!K_IKMT)+G0s}6ybgkL=FASVsnBvL83VpcYPtfI%vnFR#Ijowfc!|ae{7B
zj#pbx%cI+WwhYc3WnZ1b<LA|N>Q??>`c3$YG`p$XC|$jeisSSiyEEW26MTc+NQ5Pb
zArac60nR5qF<SxqQX8<cIX5~QG%sg7HP%j<GLdFT<YV~d8V-edKN~R5=JjnwHnfQ%
zsc;rg%)(0%j$T%R3Aw@sQGEBH8gJmFC&3z?kiBY^h;zCkO>yPR1kGG(_GO*DhIy5N
z)beI}#eat6*JDu>Q~@V^Dc=wHiHRB;l(EF^XHw%9apLZ{C)N{0Utr&it8%tq3(8}l
z-t{K-`&TEaA4xoal7Ip=0=!P<7zm=*4$1`cuo672PFbPcC_O0M_1J`X$o6Ou!vmMu
z;jCN;R|FA5$k!uCoBYii3>I)zGC@tPp9^DRigKfJ&Z@E=86LJyEfG?Ux{ep}k_(<1
zbP+QoSIKTufmDH&u@~3y4<nZgrm@HWy|U*J?rUFSv7n$p(NiwaOhHa8c&mlT>Fr@}
zF$GhGOV?R9U5#m+a1?e+3i0;nIFVtyMxzhA%XP+mwCU!z*{0^K73BQAN;m;q5R2#8
zj%Fyzt2B1(#GQRfM_C*WLj^4TJaSJJC1-QjU2FCzIbRJ$X1*wN{SY)i<#}J+$l)RT
zhj|4HPlWm+&CeA^5Ka}S!<jwGVGHgUbix>3%R|`+kHi(!S|>aSK$sjfj@HCK6=v%e
z<w-!0&WceAc56c$i*t_N=h3O}d``tCS8&Lca*+1&X1O8lIgPG_h@v90rCUdZbW0Sw
z2|z~t+=3?Qwsl4aAW{(s%GCQ|E@(f-_&^=2uQGo*<M9^c4oD9pr-%tY4<+{=y?Os-
zHL|h9L^X+X{od~)&}c^fVBMJvo!h+kcP#*rnez-3E1;mrj_GvB*Q?oQ1Apc@v!7x<
zSeJxh!*g7WJO_mZq*Y=PP&b7cPDT!Ay$Cm|BY(CLEB)X;qxqrDabc971nL|TdFZDu
zT(hdqKGjL9#dz9y^5a*vU}*A^)MqXHA3I!IA^sw8@irINoB%>1)#Txs=MG9y2}v)0
z(120rpNSRd4Doi4kBgC5x1sg&gqX{kNCC+tagYs9Pot2?lV=~?vy0eHAmkr{m~Cp|
zUpfWD_cLqDw;0)y_O)>jYlDd*fvf?-kT^u2Tj-~0rg9`%YZOf)P?q+Q1y#3c7piTG
ziR3WC`lJ6nmWeg6fGOM2k;5Vk_}f0kg@v7O&SUky@dQ^PMFCN26c;1KkB6_-tNrH)
zGzj9eP}y*baN!eM7=>$xv?~~hd+Rx^`0=!~Vzi5igQ72KqmWC4|Mhza5c&Yms9xr@
z_X6*5=kOVMDO~!~(Ly9YUUdt}fr;}y^4`_>=+e_XnguP5EK+dkzsCY67ln4?E!={C
zOjr7Fye)Z)i`Ky20gRP6J`6bAExHqz{8$mnu@<ZS&N}{Ys#0!2LJyDdb=y$#yAt)%
zc-#ik&elFs!s;W1*XoY688pX6hliW;?~+}>-{joj1X0qY3PRRi{3Rnps5hqhsw~{}
zHhnvNXA^!IwQ~x&x2!dlbiP@+u523_@BWiQL<HinHmzJDB6z8n`*F@&#BW0_6UH26
zpS(b`U?;xq-2Zbi=n>qcKFSU@8Pb#Dn3(AEeFZ9>1T@7-4#xlOuwwoxFRpuu30KF8
z$!8{fcy|NZNGS2CA0D=aD;k0;%YLi5Ymt;?+R`y;{(zbO-$mi;>G2$v(C$~5vWsZI
zX`}?@Hr%gVRok!lgdIE3jbxi!sgk$2ZRbGytrZj!Pa@g%avay?A@dtMh(e_MsvAg?
zxf3j0|I9*O{3zx3#*_(^)XHBrcVwY^2~w7F2@lca)lbi4Nxz8#6WGZW+Gn{jPyBkV
zZ);8t^{pST4avy#4I74GtWda+p0mQlmyT9b=M<1cc;y{*@W``y%6i0eV1OOyzbhM)
z4!12_zeDn1ge?$S5PvY%0KG&Wm|&kCej`IP(&?Chzg`ej-!ru{Rm_Vd0vHKYl{-&I
zXXjAgoti%!D&z|$^1#GwcN}$5Lo_rqsWJT*U~W4e7y|LZcj4UD5>6Nw)GJjTcb`k&
zI%Zt|zMb$7979eMh5o=ooyMuoukN@v{lv@r9dW71jdBbRL}5aAj{8LYt|hck<|H>?
zbpM)XIz-9?{WB24)qh{f$DhI%G&_GkbVrt(5&vx~(g4tC$xs$v%6x)!w5f0#bLsvj
zp|{nFHZ{4AQJCG)!9nX+(&4M-Bq+@^;9PYHN_M)k7QS9L384wGf5f8~ME6p>i>TNZ
z7r=0WbDY#l_Ep*b=g2E@%EhzLf=*I$d8*M;P2Hrqq#qT=xbKD`B0hM3+E^&Q*=}np
z-}8R`@sdG@9nyq$4~Cj3p8-w9G;FtlnQdTKuUG+X{qr;G;n_m>$(#gP8w2+Gj!Kjn
z)Ec$p!WsfL45WUNGva9cw^m%?YbL+0uV2&ak{1ednu`~7uFXE+H+?ne3mMT7@x=G-
zEjG6IRNBlgqcrCiPwbgFu*_cV9{S$4`c*aO2$aDl!@+2>>n^i~Bc;^IGNqBjdnB3$
zjtR#MWo}^%eMQKI-hRDrf7NvI+^CW4Ka>Go?XILA3rx?TTexuc@_t*4@Otd3%f&2>
zIsuQT3yb>v(xDaDHgI_HP@O#>jmFke(P|p@PbL2E(J}ka`oksgSeX1E`-#sH0T@b`
zOI7zBvyAm;kTBc;hDeLwODnVVZ3osJX`EynSPOlhd+jq^y8L$cbv2v4Jg4ID*~N%R
zE(Bjg^Nw9mu6A`utA-qo2%u3AIYy6m4z|1myd32LqPp_g@G)U0WD;(h^1Rz0bdMlL
zHF`79<<OW@t;>x4IfD5!)1qXZDzrwJ_LY=LzW4J<14lR5d6%gg;@(V_$bTAp!CxCn
z(j;|pke3Q6s6lG_FG2iG)Qhl%vXWn7w_CS=!<H?{aVqYEnvrMGuo<3NXtf!H^n?@)
zZ?^ozpZWTAh{2e1t90EGbj&~Q)Y<t}yycqZEGtSwyomTR<#1sO#(qQl-Yc9U({XRK
z-Nxf;?q7lw2=#HcOP0?MSex$f^e<YL58d2>F)_r?y>2^ZIK)qK%zlSbp=suXit<9z
zvtgfx@pHQ4=4lV5-)Q1#Y?h#_q0KFLKlG6C(s$+{c!<$?qJQRfiQ8nkGWys&fg1gY
z)RxZ(9!X^OmX@-45wzoMPf+C2<w?QrNkQrkyE5soH+NEIf7Uc<FIhc>{`F#qGuh80
z>sdTMnA5H;^zEs1txdE8S`(R|Hy5eyMpZu$qU4GVO-CQ-X}p(d3-MJGxGEmF_f!-Q
zv=ERn>mExA6KcftKtm<fh{?Ya4k3>?4%8zJxq~#%NygcgB#n+kn`LzVRD%lgx{LUH
z-u1=2BnMlw5}^WObT2%`DDZ{n&ZWG0IvS@M$KmM>cbn7@6Ea}mv*H{6!o905x^+tB
z@kwuy0u&J}eppnMBGRdbq6lg(vw=^aPdiSB(WKIi-F_+k^$f$)3;U7InmgMP3zpvt
zFP)n7Gna`O91ZFmgKlG2c5QiOc&p-Y1w5Q5mX8j$C+fKW)OiAo{l1IRwQ?$cbvf(o
z^+sm6v$0;P6AwP3pSv7+<Z`5BeDjB)7p<hUCnoB2Pu)xTGkWZmwam6iUSt`}y{YS?
zzM{K$>BbbICEfBP1<gXU-W*^pqXa(w8Uzi;_f|CP7#pU%>z?!P4pk5##!D1gru~S|
z`qdb@GSv$&9~j%2r>mTooqM<AoY0$|JDcTMZGS+&)v!wSp|WSFcH_U6T`w>;B4q-9
zn@XXXpq=hg>$D;i!hkeUZ1?5f$ti*~5YVN6T{FX{{`xAGbxC8->spPg4r%a4kPOln
zDF~i-$*ZU3>fNQ{{P||p7T4%ij>WdJJyrIfEeoLyVc6Rt!!fD4pZQxN`ow1J7YhUq
zIm1)M1qoSl8)Q!v>7f+8IYl29#gd3|w=6en9;EJY{S2TBjq9M=PwuW1T&{@ziOc#H
zj)N9I(~2(J>0A-%<BGc3jMHJ7lmT&9DTW^1!uik?J@f^<(sgf9F)ldJwfA7f(XxG_
z(%)SP{m^OE<`z75R9pE?{B^$C#h(d(i|3K8;eyNzlYA0cq>{hi)aes=lordtL32h<
z#=<9EvX;&bv^%KRiIpSnqp{DK_zG@1AXl1FAtZZLjwXlqQAxg!rup?7lPc+n9gSS>
zI}(t_b&p8bBV=^kAKv%`jWQ=cW@!%WxfsRr18F2iJrih_3>~c!rt1bT8h%Imu5-<a
z^&4->io8)CP=j=r=JuGo?{OlPqVd^D8k+fSmsrF$*j8SUtK8e0X@LaFxcMG=PzTco
zpUe6ng>+K*eR<UK1<F38_lfoX?h~Ly1F=Q`3LF-5lKn4&mMS8b&fIF!P-dSq5;!?+
z(>-HDT?{=3$({UU(RYvQFWq5&%jxu<G2%wAfJPBd@?5p0{+@B4p2qBm=~PJ#q7^;G
zfylqR!81{P@3*qv`Te!lEk!ZpTC^vj2UPDCi9@VV<c6=H>y6ZaUOWzo_I}8XVeoQp
z-w?Brfo^+7jcjZxxBSib>EupjRSBs;XS_o8wx6U1hy|@hLXz;-ddU8jNB3n8&1cIk
zW^e9Z9n>j#hJfw!E$SbQmI9$^@{=<3%q!)Y0AFPe=&OEYzwAm^df{k6ob69&pL}hy
zlm~rtDZI(61}ct9KUS5`IsLwUKS;|fXk1(M6_E}R1H75aP2Fg#H>UAs2xq2X+@I1>
zM7|1$bpSu}3JAy1z=*sZr#Lx!DufDi`R7C*e6mt7!^0cDvQ9(#-38ZI3%~pns_0(&
z+<gdXoTd{2?TPyxpD4=?YEaM26P4wjefhTCG4YEx-!O}cy6n`;9bK(YGhBZjxz`*R
zU)Sx*k)Gf8toM2Gz;n978N@8{ak_F40=?y=KrbN12+bU)l5{`~sU&n=WC5CAKFT-;
z4L?I3w?aLX_aSgKV(aUI>uGo2bNTKaPozpcoUbJDR+)7P>CKPenO)Roj-?*zAnES;
znmG1{oVCawINF5YpG0(<N+3tac8}Frk4S?X@2D!2(UXrA8?V_u?>vOOig0Unv0E;`
zYvkJNtbC-^>@8@AKib(3?TS@zRq21tdVS3u+M?>;&4o_FI}QO*B?Pf`&wZ*u8hH1*
z6!*J4ap@W|s>nK=(_zuGx?Hj<kq^D!3~whRjU_Amp_RY5(2sqL%;Tb0)^3S&);+{%
z5O{>%!oAt5X+v^WIR~y;H3r9$0^FD|P~KsW#vncs<pM`F>Th^48j3F8TY8o~Y#pGV
zBeyVyp1m{n!~O1~{ZGu%zrvnMvr&Z7{B`*?vgZmPc=&vN-}5g=jqKh|#R;n!jo)ED
z^va+;YXo@$U{X_VT2rNDwbf&}6>068HMWSL-xE+*dA~-Nf5@$vPNtg6TGih-)7LU{
zUTB0Ks##YJt?Rv`i<NLOIz@vxLxToM9{wKl&kEGquMt3w(xJe)I{$Pyis%m_^u|dz
zcp-sQ{^*8MtUZVuKY9kfRtHUXu&aIPCn%sEcK?nCJR)!Mg^))u)B+^?LN~KNJSzYl
z<kR>aPsm9WPq397^)aauyms?pW&deMzs%1=#;Xg5g6({5OC#G>x=I!ul~=w%FL2cw
z+1W1*KR+ZJEU6eToS$qfLyIFls&&m~5j#=;ZY1(d1$)~nJf5tpRZ($mr|Uw%*j&Kb
z{%cv9w~OG>4QvEFZou7Y<YE)2DFc?5O2qhG9<2@u2j5I>sr?#0{m)D0p9hUCe;qRv
z$hXga-ZfnA1n)uYV49i6u1;ZtI2K-VB)iH=@1mhN!10Uh=%xy*YZcYY4^A6~ld9zw
z7|U0#-3IUQ@Pw^Ep4q#|6}>*vHDQs%aLRoC@x>syzh(2f0;dOKkoG7a-Ry0Vw~r58
zosL#iIjeZ@S)!Jv`NO(0j^%oe!$_~*C#$ooE5?(tJhDk}3w|e;&)~_OynLOP_!6<u
zcCrG15tldAFy;a6##P<NsXauTS_{A8hW7oo=|>1h6|2G(6{9ZfPqBE2se#9JU%Xvi
z9>WHp7l^&^N`>RCrHQiEt5t}v!Gm!4tX^3uJPpWJ@_4DbcB#7ZNBY~z%HE4RTrNyP
z3+?P1B`jWvD^p;j-I!Ru@OAhubotI4gEw!36o^$yQR$wKu!ct$NfBmn_1yzS4<hVi
zG7JWdDaT2qD2i}`a%og7;AX>FxR?Q#5=57ewtRqYF2d&<u^)S{TW3WETTV=#%cZ~@
z&p$Oj1JSb?L&@ZR#Kb6qrxNNEPXA<f|HgdPP$GFiW;zLLvwQ#i_<gw-r3S_DGVWDF
zQHOxc$^^OP0cg&jUr}t@JzG4>PV-9v7YUEc4-k%{!2;pl;aumnUBE^=Z`*|#1;rA(
zrU7{8<5yU3@awl`-wtHBPp1T3C{8<zi=q+l`mulZ0%hDN7&@cU-M_oir!@*J?rP(S
za(+QecG6oI%U1>bU9OW0msuWy<#IvktTgy4BG-_1sDk{y<`NvYIA%kyf(rq!HIMJ@
ze`7iAO_}wx^PY>i7>%s4IEtIu==Qhb6!(dT#xe{B16jHn6aC;V7)(l+{F3$;`jytb
zkXC^pR4?+u5sp=)wYsj>vwUcZx;4;e6ZY<l??1~BbN~ILDtH$*GjS!T%QrZ&qpM48
z1v%WsnTKZ;cesyxg?O7M-%Y&~mT87hgc|Mn3hX&jyLpda5*|VDWQH(bs_*Ut5%*<!
z4^VGIG%`=1$I_1@E-U4MOQ$ExJeJDtsqoA89SuC=KJ?l>Wio&Gbku1Ak45!Xl7*QW
zfssiJSZr{8<583?MNW=R9em_=C2R{$u)aC4h}3IZ2)M#_ujMkhg}-wrY<`%PzQa!{
zCN`Z2CZ2t5m!JMlOi#XLPdT;grHVM3lBrz9H~o(TT`y1i>b#8}e?oe9V9xfw^p^wq
z%Olq#6+-(PRD{*siTD4HtFMlWvTNQK1OpUM!2$sTX)p+v4h8A%5|A!skysi<6a*}~
z8<vvpu#oPOR&r@!>H5u$@ALki_xleYS=l?znKN_EHP<=gVLrN_7Hu!R0+~gXazPE8
zW{)(=*!Je~z&ues>;I%l4mzED4>Q`z)AX4|&YiDbm854Lh(&s(m>WCpG(oj&;<(4y
z@gO&xFQ=Q~*{1mHfoqQ_4HGT~C$jsX&S04bpl`tic0!e(eg=8a<zPaA!$%1EVO$EB
ztjiySP6@&{kaN!U8tx1;Zvz0>3~oSv&|Q=vh|~V)sDCv+G8$?V|DR9sc7asA89lUy
zjM%?UZqtMfOva8B>GbqNDg+jZ6#{TxzRdm@`Tp3$rE8k@E!=(h-oeCnDc#*eJc9Hu
zL5`>=FZ_!uz<uCq`$qApybK#vOYSm*`_rEL)5lqsl8>|VODfGUV2AF%y;<S@kEeDw
z2$;+;ioo`Jz#n?#QEG*-htdWxbgvACRpV18O2>m(b+zX?_kVMqH%#7LN6y!3AU{|p
zr%Ug}M^6iw#Z5lz(<{+5nggK3|0HN>J7-B-^>-P7?nvFI->DjqbK1Yh&7f5kFq>cL
zs?#ri9F-Sb>z=T&bJYpq81#datc6N%I%`Xt9{+c~@<t!vI9@#cC<1yaMfS^#x%$uQ
zJ*VRV3C1HIu%&Y}s(O|h^c6wRqEXrXN%jgAt>S_@O0_q!v$~K=E>$+1k$^291!TI6
z19%9eoT%CKB$B~pIDnr1HTK@t!o&v!d5-GM-<^&>?6o}Mc;RP{Y7+ege#Rt{C=hvd
zvKvG&R@459I_y8<w1`ZwtOs7wlgp)1CHDPp$-(-l^kCQ`g<&RgHm)Sg$rOWNFGV8X
z6&+P>%#1i)<o!oW=Sxbzzq9*sRu4MW=^Mt4LV&x0^vLjhJc>l#hB9b<Fg6689+Ez!
z3phY?h({JX0oaXXeo2$0K{$^&WXD#Lu!(3zxS%smCzSfO8RB36F`-*}5NrN9z4HV_
zW||~DgrUA-13uUmS#8u$_p;gFW{+GSQi-y|AD;-Td`(sb2jMFq$Dqsatb?InE$fO|
z{HP1v2zGMLg}DCSZu$b!3;TRVCaWFfU}>ar27h}=`nPjbF6uUPTlJDbY{Jh<Dux#x
zs>ROVHrSihATQi=^4xIx0L?@1yq^5zUyPh;9xz#W?Zt=D6)In$J)8n#N!5al%#SGu
zAP;RJ(s5wtDAmSapS$nH6lz@Y2io9FxaG}jupNX98=>+}=u)}+Tpd1aoYFWeWk^Bg
zVT}mex1E|e2W{&4{wFnh$^6#cBVvC)Y;RccS)XB0lsW+YM{Gz4y2OldP-MtRTr_dF
zW_#-?b-CtL=1^BS`s49xFi>r_hL6M7XwJ&8xOefotC+b%Rv+;GZkqslbgO1fr-u7)
zV;!UuPUoBiadi5N0_y;VL?cwT2|>a}&B2}&8+XX?qVo^_n}-M0CQ%QkhwM*1ztxco
zy%lF`%M?(PA14r35u{q|0$stP5!?!HnW5Y$BgbI}n#tfM5EKWSE3JOy>*3T@-I)gV
z!F3<V+uDDKVyiNJQDpQwClU71#z51D1}pv5NcDk*8$@FC2di_1UGh3^iFN<G6S)fn
zAQ2<sv&a$W{~+%3pe8_eMQ&V*Kae5z51nd!=~~Ga^J4;zl9O2k+Kb@~zQI;$c=7=!
zM(LNmoK5|=vnqMrs^t&rrw%a@^ssn}!wDlC6LL(v4Clf7u*w@Q^Tu>KrK3o%CB`#Z
z4YttS*>MF<b1%kMsyS{Vr<@N#XAHJsXnC%nr^-@}10Vc&^uz|}^DBmtRt!kv47O_g
zB&R0|f!Z<msS@Xl*+dGlXC6=maFk8F|E^nai%dj_00bW`3^rUi7}K5fKCEiqB*lGy
zcUss$Q!oQB!cU=jO_5T$NHq+1n1djR6T+0@K#(nKC%c9y=f%%5VK34ysBpVIOiE@4
z^~MA6FC5pi<BEv$Yc5WcpI^*qLCqdT`qA6~5K%?CL(wE5IDV;3Q~OMs#g8P6g6bcs
zl62Ws1U||rAfbrk^(PsjSE>#oFFpuCF3X5ySLD!Dj%avq(P?yJEq!JcKo@V>=hL?B
zh5V91JJ96rcxkul{=uK?=kISZ`}qG>$P1{7_sQ~MIzR8Pspz;sMe1XuRmQ}^v-wj`
zk2T=5Lw5g~e|$a*lluKnKK{fZMMn~M;}jgx2N8E0Wbb^4Xt=SoDj>wdi$b}?=q}1l
zeOO{Hei6>o{RdXL=j01l{51AOQr@6k!d0~!n2C3iE1|?68i0ed1fVCKG%Mk_`D5vY
zj~G980D2~uMC(y*|1^!VA=CphmImp)GUcwP7C)cz8TyBtknzud$`8aHA;|U8yDgJ&
z_cGxLdaon@{UK6LbYe5vN(e<wL?J?NK`0s*c4k>>K=w8*5k)=l1aWN5ccw3D(Wei^
z^IaP{7jc)0p9!CU<?zThYl}4nMyB<S?9-xQr_o^nFc`Vv44*bXIuG+mZmUqIS9Udf
zy(c$hV+Bz7UpwmybE+YEVY8jL3*Uqd`xKMA*B;nzLIZ~IqF(LHF<Wi1X8oJZ!=INV
zey{Zz@Lu6}<a&OboCk1O3r^=(7Si?7lgDXM7v02Oa?hL2J-9+HAt6Vvf=?!K0X%Xl
z^cAljciqONT`{xU`5nO7jLpoMt#EI*x7gcWyL(;Y%`Ge&9=Y;D=G;TI0P_>ym_?}K
z?s4Nt3E<U_{PW{sD!laLR?;Lf_^9)mNG&7#-3AwWFOkd0KpCD#`SlZ%s4ZQ6N5k(l
z?2yhdfH!neoEs`DD9}~O)vsC!)z*f_ib@ynAM8R`?AE?MP2<xAjVzg)j%LT*3g591
zXIy(T2#u9B(4+1f5u(qJO%gpA%C)k{hIZwb_1Zjs#Z_&L45nMYsr${Nw$cCigIu%g
zQdJml&T}%pdR@=GHB(?^*<4yxi%eHrc*%zh5|XY53*1?r(=L{63Ui6tpNG@l?~q)@
zxmDWi?`~Dg`0POgEz_yyh}J%zsao1^t8Kz|jw3G9(ZaNXm_poeg;Q%Bb79i0@KDNv
zDD0kW_^r>VoT^h%#W6647JWH-4b}6nyyCp}?#g+F<*8oiBl)lKAZGLyj)oD!MaZ-g
z!3aJ~^wMJ(x^lhe%ZGb9Z%)Dha=-vC+-=@^aGpuE3ii)EZdF}~H&Cbgq!gd5sWM~y
z=lcaGAKtE1IrNts9&$3=8H9mVyFC!YtP0t}h8_D2j~74>^r&Fr{r*ZAP;VLejLtWN
zH`%odpNL)8c5KpNuUpXg(Wm@qM6scTchovac;9*S4|X8XhZ*ZofJ=?qR;OfnoMfD#
z9Aq%pl@^S26DR3x4Zr<03s@c0;EXQx4U=MYOQHxWv1rK8E*UBBcktaVH@GK7i|tPO
z^pQ{_@<W$Ae~XiTVfMBrx`jJtylQWKP-1^XV)<IP>L;c91Uu7i?3|_lXaO$wY26p$
zk^HYgA*OYfV=Q}Qc4T#HPwR`q^Iuny1$aXhGI24+mxfopL~atF*nDh1fWnCpiuSl}
zihP`_qf&e|0ThPJBXdS<H+1oCIYw>$wqS5$)YDG8#P&)NtC=hNn!~}~MwCgYR;tAT
zvr6hk-{<>#OP<qfWh;%W{GTIXe(xEm_+C|`xGe19_Icc@rfPklNvl|1&6CezW{k+6
zgxj@@-bsgDBPFG*vgd`*c89(8dOu|jLX@?@nvN+L1RlfOw)EG__cRMJU_-_Og%&Ex
z1|%CZ<#}Hv9GRpBTVe$((v78_KT4A&{iLdqnilur-|z1Ik#y77rIOQi=4YV3Zt~%q
z(<_1mB~H1!|2UXTI@DF;8cRFja3~_RrC&Eo<rxcozN(*?g4M%W`dB0UD{FhgSp8WU
zws$34(2LAudt-?%H7eBx(9o&0FK-Pzu^g+AOEP~FCVozfnYn7V#S{}4I&qob(K>nD
zuCPPGbZ4Q2H-_(-{TH@v&n3-P;R82x$Ab%#Yg*P897|+aSqJF;K7s!;6=fHBoYuMa
z)D=FiTsyTErJ{s;1JHKJnARQN@lY=t25%}OEw(XJAXy$ds+t!M+EzsGQ!4Y}+r_V6
zc^-J?#Pvs`uT;c%-&`L2w~IYO*l>Y<?87d~=mof5kt&jYZ>7DOLFa1C+=9l^Ga~V^
zF#FxB^S9UgiG8pEpSU#UyEAah(7~I#WYnVqXgsTH8u*So`XtHyJ+8XZzN8l-#mP|L
zPUcWrVN=o2k&1utf+aM~)m2HMty!3Ajj^KSQ`P}5g!Jraf%9bou2hfpQN68zCLz(a
zZQx&`F6b+rm<i+^dE)cHFjDvNdWR$(x6OW==YH4LU3LRg-k;`|3ZYmaD7!&iqC`cj
zHhglBGF_S8A|*tfWaxoRo`n=oiJr@3h-s}Cfg$;4#}hBk$NMXov+&$MwEOsYFIw|p
zYTnkF(QUnwgam@ye_gC43(!)jyEHV2uPB0vS5@F-^gBZMj0rv`V+_2iAzy3f!Vx-!
zjdmKq?2kPsZ-zz+=CSVUTC31MIJ`~xK)_5Y3pz*h_h=UnJL(j`a(=1pGY89pmv`9V
zg9`J^Ty9P4bscb&fuRX}&_qa_a}JO1Qiy`q=~HV?&~OR$BYzN&>uPbCjxm!6rqx^d
zXj9aqZrYhF)w=(K2K}aNtyd@JKz(xx27r5Euq3#<<cjJxRF4hGgm<5yp#;EB^=3Ym
zq9gpaakRgf(g38amLsl9g5Zxw@1ZkHS_?mF#tFHTI3{5Q!}Kv3xSq@~i{31)&o<L-
zadzv_)p`|*Q1=F_phA@@6UL$RvY{9T@RPVurO7#HFMj4Q8kwK{v_xHIHu3xLw{&E*
zwA+sSKo(agD|po^bW;Sl4tEl0>(b_8+zm<_o!EesGL|DTc^uO%QPXTrMML&YTO3hT
zq2u5`Z^gN<N4S8)2qe6^Fi?d4!osdod}{DPtK;P^_l@CcqW9W`=7#G-XoZ3ww65Xt
zMGi%);CEt31mMf1E(=9{*RE)Y#{MiY<KIaq3=YlOhF~gJVmnNtUc&6~_Ve@O{8Vqd
zOIYUMR52C`aHN=OL#DM9hUP+9P3R!(R<XZOG8$d`t}d-n3OKbWLZLf5vzfVXLKUL~
zT=FAs8^7#|6O+cCEu0M1iTp~&-^>P3G9i$whFo~}(|FCO$Byqv@aZ$JSRNgBOIN(0
z_IBa#INhqQxx0Edg!L#A!Q(>-;Z*#LwmkShZwJzTVPm|{^Fe2qTf^wKpD<c?V5!|e
z>t9qmO|*;MSZ<O!ebYz0*VoI3>l~xJLuhs69TWdmyQTM9>r&~k*^*Y!NSJrSwVPTN
z+)36?s9BNFT1D9u-v)Pb2kVvW&h1DX4lTIfsyu60yuZVI<i0qp=E_k{Rhsnah{yAX
zP^;+dvd%z6n7Ac3HO&R&Aw}RBOUVu~2MYrjzH&Z300P3toA7Tg-gW5&FTJuC<bo>q
z8lsgS5PLvl_|*2cEUkh!`<}4=?xe+Huv*Yzn-ybSf%bh;I3F^aZ#sGEzPr#${q;$Q
z&($Zob-hf=Erp0r9WF+CVnUA)&N4sQ&)@7x5DyeX(%+Ppo*z9L*Mq~|wnmnw_hk}i
zgP=Y^aw<2=dzWY~!(-m6qgeiRN2E49np*$Di&;V4%O|=^YN45$A!coM@<q^Zb7&sw
zbLqCxi-P1!r@b*3zyVaX(T^W`|MVg+11h?}*E{vvZKn=xHlG=6Hfpt1>pBF#9*X5-
zZj|&r2dgM;?g`W}z!77dBDH=niE&=!cKY^O^l!<$iAm3gLr?K2!>=zRpcg#nid2SC
z8%M>wQTekuQ=UQVf_kXo5K2)kvyR4g=iTA>>to)tJo0nXOl8fu`@Ix-GdwB9veW?$
z`X%z}*#P3YrDMmh!9(mL({KmgQ;KFzqTi<SS%Zl;z+t@L-VnMi0^nGr)2MqNf?k)1
z5MsLE*GuZipey{5F-{{t9cH^5xa>v*^eCcF6CO}~5`Nn~$h)oh{^ft5bq7(Uu*#Op
z<(8oALo@w-j(+T19E0Ft0Gxm|`^z6$*Y9Ee8MRhFn_J#^lzVlcy<IWs-4jiSiZ8tg
zg$>--LL&H`a<nWnQkpF@v!`~WUW$Yg!Mn*6w3H#QaA;r1dqsj>C%ZVJc#$<wOgLip
z3X@}d;AtcIqXSt9^Cu~m8%Cnhb)jAx$21KRG2~$<kS~jzN3KR%(2V+@tEm&slk4t@
z32Wtz3H;~jm-pS2dX%ef<<_13{^wHHIruik0{}3T?8R~^UK>Z{J+4YkQlga)pGb&s
z{f-`yZkg=Aa5X%|HWi3MYtTRHkw?0LOv}prEgr}-62Z5zwen3zUmw$p4e%nraQyJ$
z)M2t7o1~%?{!Z-Qi~Tn#c?GYps!^MBtgpRYgg9<io<LF@Msx+f^hD{gey!^&|Koa!
zBm+hBzoV3_SF4}+KF-@B?rAGIvJ4Lw2mI({gE?p^XevYI1~g-1D%4Iv{&025v-D$M
z*WOtW2n{vk@#qy~z@~^#l$q|IJ@U_&4=<yPqzMe?8Z3~oe-gg+A->@e6a0CtC(O$e
z=EvH#6#`-Om#L`(Lve|JG0<CLf~BL6LNl7byg>*3eZs$>;jKCVYKs`HRwOS;hf~1g
z27aO+!}HQV^ztI_lSqY2KE`aLT#%1e2}k0f$?6?X)RBLg0|Zofr?eo(-~?o~ouq<d
zK56sYtK!iexSxjN;?Fx)?{6HebqbtD#v#5O76NvzrQ__)v8q2k?dfA5KAkkWclb<H
z$doDR^Zgsr5fEV!Pi5Ina{|6z|Hg}VEGb?A0s%ai={iaYLOT|oI=S`KIJVTQq%hv~
z7NHQ8+4PaaEB<?U{$;>$O0-*Jt+B@8yciJhk0k{T`3r5Auxz&<4lx!&URwDG4ZA4(
z;aRRrr-%<99S%=)v;5L3dT6EpOsRK24AD1vX+O9jGGUvK|4!IhKpBkQAHKZBUWbeE
zwbC43gT4%|@t&Xe%_-!H|DD%Y*C5W!lbdAZQ#|-!b9nt5WA2XWRJZB^sU?Z&I8Gx&
zSBKnRfPkjj5&4H%+dF`B#f0do=G)+~axH%5r=T{+p3ALHwOH}f2kFB$G9m7y>DqtK
zx0gs3eAj3)`R?KE359fzBF}vdp8JNq_C;0Dfp4mcM;OCrXNkAh7px$wpFEX~o^F+}
zGhhP@Aw|PtjQs>7PI9zM@UKe<^;#5P>&@KHmL}m}EWIxA_kQ0HGd(ZCCyKn}@(IPy
zeoX}MuZOTzBW|MQ<aO+OFN0dM`nOkG#nlm-;H{~*P04U?C3N$YOWOp_J_3^P<gJgW
z5@2S8ARZhVEUdwu0u9M2+QRaE8pNO;7Qpqm+Bl{WU)U`HPQqpg#RAt;JTzcQQ)EUQ
zXHidy?r?q@b{vd&eU6a_B><QxRWg`1!l~h!*T&5l@{0w|#7|le@6mI9*<*8-*0m$?
zK@aV*AILJNc$ffpp_FGydx6|V6Ydho==BvbchG&im9pJIFj9d^+w1FZUAzH#jHFuz
z57Ii15ugF>KOSKSI5Kj)RvQ~=B9`w=_0N4Mk%bbC6&#VMFOIxhn+NvbjVD>YYK@r=
zy^Gw|lHrk`<U$;`fJ|0T^AlY#|9p51wl<+@+xZekmrsokypIrSQOpb!WnO}ut1Dx~
zV`pLN)%BqIFB#|{S_1wV=~}<0L#T4ZyODIW$~-a2o>SY!H3(V0$$Ib_17_i?$gQvt
z5Yx7&XwJaPG9e2toWek%$m`0o(tsx2lTPH(v`N@>OWS2*_<A7c=SzJMa6XUxyME%_
zfK@IxkRAbh_Z-5l33iT3gdUe#ce~Ap7??Ny1a!I=L^nhrvnPc(tK^@L*kGJR_`uSV
zjcHM&FuJe<FwUWc(d?X3t?T&QV&n1XGiQDRY&W6`7jiG}7?rzg*2)FkalXs`)^tl9
zwH?jeZ9d0&^P38>L*3IUVfvQGPG6>+yFJ6U1un{)NcV%V!5a#A6%l$pLdSZ)xbL_~
zgVtm4LAo#8N;@4bi~Qd(%w4BoP4pF*(=_;eJDOMG$_KB}s1IyPXU}mA!&6ETuCHkA
zELP;Y%p~|p&$61Qo#yGUm5t)b#x+5caNkS!o;=F>5nnKY1ekrA+hT`r4Oo!w8_R_F
zq3uAYMlj%%#yq;iI0HSM=yH(dOz#4>f6vSKCf%R#qeSjC9zU@o-im}kI7M#(aXegT
z0}M~|-o_}M<>)7B3tVon(001TDz#l@2lP=tY3O8D@$rPSUHi$B)W^yw<cdeU%g?pH
zkAUsCwgVS$A#3U(-1&9%`D4C_*&?96JXYvf@DW00@U$N~Ur!c(IXPp9we_g7frXfw
zkZ@J^xHbGOH<cpA)O<l3Pn*zA7O*lg2$2!(*=MCw`XoUMbY<w5p1)r+MpE0BUhP$}
z94yY>+Mdge+NMF-C*uZ7?2r+8OzP~ppLgS7{<MF`-ynn-9byKCF!L{zj1Ug+i1#N#
zk2dlQvCHbg3&^jHfE_ZT*#-Mvd+~Z_U0oG5i;<=IdcXD}dT*^S3Oa>yYvC7%2j9>N
zmfPA?AMCq!0X&QT!}=tB%rLqo=PbVVD0%o%vU~Kc;_Vt=h&5wbf1Ufb3|B^s;qMRc
zS1|6^eDY9-(;<pD7kb1X|BKeCh<8JAke(<}N!BL(rVIyA<N|qbj3+EFn^nluB2eh5
za3DhJvLNp;{UFD4TgRp-<R_$ffmas&(KL9qE8alvH{zE@g{L2pG!$C)VEK-|+G2ug
zV}=&6I%rJQV)<M%(2+EA@8+!zfcLmnu~hB)cicV+Xyi}({hvWo0zY=2%GmXA(8LHm
z2<TiJIEW4-#XoEJwjq2RS63&I_MSXjh$vMd&UUe4NoaNXD|r+^O7`yf9X!t17F4-i
zl`gA_g9S==h+mS^*&ZXK3BOy~XW%*PFyDVW;wZ-*EWB5fb@kR1N@p1Oz$r+LgBIL&
zLkXybJt}{KlQHH$c7;KwRzLQ=qE-HM!2fyH|1-fvZeOJHV2#l$>cL?5tWBApr23Sp
zWsag@4-l70t>vuQH4@q!JGu%aOcei8<z0vLA3{+CM~{Ug<)R42hIFx$x*l7TrtR^f
zsKFpe^c@xkxV!hI2F<^`IBDv4#W8(mYi7snr%6}~sxDdTLv*8hq4fzEMxLpNg1;QW
zf2Q6Vfu_A~=wf8TQ(#C{wZr#e5WPFWCO=Nq=`)}El(`8xC3dsjlJ%Bp^F=hIqI`z2
z>?K2;Y&E+pQ;>CY0f87`@`!(-@PiAq7&gKM$Xi~&W!--Z4Zls%fPR9GSI~7mtHrBP
zV75QcxJBP}A*3rH$BCWuf=1UJ$8G(U;tZ9vNbn@|IpXaKKE(60f~r|E%a3n!{nttl
zofkrhpU|{M;32&t!r&=g<3X(05@KuG&3PKG*MyE#bfIh+3v7aazMV4-7F<uCp@+Bn
zSi%xReuc056s^F$X-$~TOr)}Dzc1UQ+!As32a@bU7xrn-W?;IMLjF7utWQ@;k{zvf
zM?0#}OBx4dHYCVJ@t}s?fqyUs+Yg?jz<+o|fZe#^Y=7~8Nqt4=6Bc)8xu5Fri4_1f
zULc0YeJcxd-4K02LhUHQlH4b<wmW*<;Y4j$_k75{OB6x{;gYXoxdYCq*TB=g0GI<*
zh&1J_CeTO`_ns!AfaGd{+L>J%E1}T$`yURTRcwb!AkcvSYDC0cv&(9!TVT~KV6!n!
zcaXqP2J1<7X{72LIuUk?@g6-+)_h4}$WUCX&}NWl`XaF3F&4w+&x~j;J~SapA5poa
zfqC#Rp8&ZLJw))61+^C`nB>NwsPIHza~Fu5nAs5^T*|c!eW|49-c(V%lB(UAAKt5M
z-br*DYRfjh%mY#42w?`oxotB~5q%};M?8w*Bul}t9PjXB`Fp=!B*q}b;M7b~Fwb#1
zw@+F?0QuVrTXfz)w~HQ+fao<4?J^aC#BHW>a_Sojk47q8T9s|gVxEP%%x0uX)5f!m
zr;Eue55$W;Z-zFfBV07ZJ%#u6q8C8l$_4C76K(}+i?@NMezkfFA0k4)U*9O_Ovw29
z>i;@k2FL`N&jbxqJoKu4a6#`ekdwQq?{=YM_nmV$WgphoUla>P;F8Na$_b(}0yb{6
zkr`)}A8B$>ixF7%0){Ub*OjI)*5~xo<N_Kx%7+8>!iRcpdLxgtIaY8`07wB~HmjfR
zZy`kySj@)2RaNx|g(SayPG|;NSF^6N1>iEL!1ujb(Cz5}rlA>{IUh{nBW^s`R0wU2
zB!bldI5Nq?+WJ0%`Tym+>ES3IH|0C;q9tw;|0n)OqZhGaqtIhS7W1UDf#TCqA(-a7
z+geF=1>YYxpGktXZ4Iy8Fja}}B_M?QTx$ySL$!;5bu3^ff;G1XQEo)6pzBSuPj8a1
z5n^?_k|fE^GP0$)*l|1#Gl~Te0#y@t-fs^9ZG~nzo58gNudJ<XAj;oR1mz+*19@$#
zq`wcIKJxYZKSAy=(w#`wfkj(B9abQdTl>YAv;`Dt5l%jK)my1ctAFs)PfVpCQ~ywS
z$xs0PzSa|*&=UswM<)R@p~7D5c^68oBPFZ#g+WHMw{Mb-2%(CQ3ASR^xM)CdzDlV*
zMdm_5kGf6-$SeA?^6D;|zGetzs#q%J?{^*#L}SlcoCL*11zYb|^9Vk>NmjMhEf8KC
z3m%JMkg3fX`Fi$aLnNKV^TW6-LP@Zv+qYEw9~Wee#Km)fi)s<}5;K<_#=8kYBye_Z
z4V98{iJKs<Tb6;*2cQJ<2Kgr^9U2i%X6xnCwRma4y`7CV97jNkl;_R@uNiX;!fj-j
zCQW|q)wW`vu2Ueli{!P_zq17tt2M3b{O!B8L-(!^s%L9QI4%rC*cAV)n+GLd9a&|F
zTl`r^U04dul63BrE(*jLGC{mzR~g!@D?^1bO<S(OwB_1{O2Cy+9b1$c?NF-84Ado~
zHT@FH);*Y#r6p8SWtq1wMkWC@+RzMz7`KgKr&RE5@~@=VJ;)iQg(&z#*~Z-KWwwUi
zU;2r_U$B@+z%IOupLAAnkB~f4<Tp&r4O{7p6od&xbwO9Gz!BwE+7wVOA9^9!HDfX2
zdEh?HfcUJX*KVl0-7iwjuM-L}N6N}W+NX^0E^1v34c{1QpLdnsXquY_E|Lcv3r3!i
zq~wLo>U5hE;We!M3V@4Cdj~$mq;DEyIf}ZKB@F#f^A5FJmraHeKW7{8SRVT<(e(0)
z^O!f{E>7?hjV`GA&H#LP!ct^jmRxKz7zs?i;8H5M36I+_U)cy{hJZ8o9p&%`YWl5E
zVtB|aXkWJcg|r2zrAy8nB=tiuJ@Q3S+J*nYw(Khj+#2Ou=_RT3VL38(75m$BEg-z5
zoXcQpFM?E)cdZO4({NztZm--%iW3}Fcx41;9AB&ea|>t9DfO}c^6uIA;AgUI7Pfkw
zgFjoH2#EuWqvz}=))ic-(qbT%TR!N}^c3MtS)rnW>H3~?n*euPhRV<}VU$)|Mn=(Q
z^BeCox7#}JMgyj+6+p7vtPtm$m#3pmQaA~Pv?458&k+#K%E2nDYPmR`?eDec%|tGr
z`6bqgf`6<4DpOHFAs;83LMUmw%0Dk%pky0E9sj9`pLsr>UMAo?yW5U>?QcjathXaW
zWtG!!N_PdOi3tGX8P+j)!4YAL&vDrgf~U2HbeOu_r{$_2&F>e_=Dl8xUNk9Ks@|DW
z^*_mXA;M}WkY|8t*?~$H1^d66!Eem5#T9Q&MF9=T-<Cn7Do=`WePBEa3W#Xq1}Se^
zJ8B;5@N>@o6}TnuA9S8Ws5S-Rz3weZ8=ze+x6J@|@IV|YU447ivMY)O3Jo5uwiuns
zaO0rY18ScamZ~=N)4seLbl4LHr<)L*Fc#4a%{yr=>3rKM&39mz1*N4ckH13gKK<KS
z!9PF$I|1iEEh#;7^C;GEJ%5iDiPXqaZ-}LnxFA2avsB}x|Ee3UTwz%j`yCe%um7d{
z{i<2?)0gL*0rr)|n7t6*U4FhQYYT;c9;{>O!TydPzj<H@-{eP?X7EB`y#5G@I#T7@
z=j_mfs+?MbVvh}9IJT(<B#Aw!dy*$9f#>=TiR+0QDllz#rhS!QdF)T7K=wh9)YwZg
zUvhzxvOO@;M?kyop`CU5wP2RlOwI{Wv`Z@zN`**4EiEz7VAYlSzPB%w4Xb9ZPPIgt
z*_KV7NATBqnkej6dgR+kTT=&>b4ARXEA8UsR6`4}WhV*+80q>m_6-1E$BsF|{+w&q
zh3!=8C+QA0C6mWzT284wSGv>-RdSR!Ys&&GA=_SEv|LxV4(~Lbo%0k-DcVP9B-Q^m
zz>7By^G4LH-AT8ZU#Vp=Q*vANW<8_|=}wz!^;nK??~Oayoi_4&gI|MS_!DVMF1K?(
zg4Nd*sdyxfJ?u_3spKjtoIUAGw>X=wg;~wSG}1r?8*x6fCpU+~pt!lVi#m%;51a-o
zbP#b>kW=;H{hm)Xd+R!o4bx1!{04J*T&P+V+rthDHThc;fQFv2xaXG7Vq4EHVS2+t
zvA3QY<l67`)$C6S<gQjHUz~nQIn;qpLlBf;a5rh-c1h%vWp}#LyVyM;5LB4<={W_W
zUs2&Qjei7Hx6$O5O<hh}RyZFS%GA{qJ5ndpR}|+0OEvB&=!pnVYyz16RW1U+$wU18
z@wA>A)%ix$WpCSA>C=BG`JRiZ$?n?Da|c1tXMD27B1lsTO19`n!M4(*MIbL3I7yHp
z2zzr<LE6D!oN1{9^$)iV4X=#uc<lu218jd1^%J{*ZJxY!4Po1^JLXtt)hyg+*``o-
zG>-~}q>SYE4u|U*XyNH-^tESc+(}dzdpfZfKef=te$BS!xSME@vhc|?<9>bIpcB-5
znt9yf5_eS@WopJs5bu$mTmxY<=29a$^4j%5JK-e1>nU7n%MrH>#h5Gd6;2Yaw|z<<
z{lGx;UhhBfBmN7ahnkziL!sUDTLQxf@$>B3+Yqm|-L2IYSiJM9HXTjM;6_XCiZ9*`
zw~wWsokR%+YXfYYecYSLE2~!jH1lp|=hjKaXo|Vd+;LFW(aLhi)<7Zs+kn|n40CB=
z^PQ$mN1fMU7sqKmb@!Nc&4kgaJ(~=nn%9sd0eL2>X9>b8<66ci<L4ppp7?e{q;}*=
z(9wUi0E{;umP@4_k$KkRsK4iPeKP2w3=^(y3|v?9NfPGTrvTSB9Y>38bf&ZEDl$~l
zSt9y*4Oy;ddii79C@o3WXH1>vvP&F!RGtOGnPd%47u0-}>a&3QYwc8oQl_8ELY=Ls
z;b~%h5?|HsTx+PlW%tX;I8*^W;^LXp?l;<LiW!`O{i&padLF;dS7{Z^=Qo|77pF7j
zvg&0sqh`0^v7e}K|3NmOc$K}S?JtZX88R=sS8_)sPyX1ci;pDUTvW0ysLi#58fj~k
z-0>?8iu!8Tm*f#$j#e+5EWLC{KAFIBZM1UPh!AA6<3#U3BQS2AWGU-0de^}1kMF8F
zO?E!vdZ+H<C};Vf)c*ir5qA&<xH(@+sh<AWfz001lgaF3N#q-AV&Sp5%*nTn>rUOZ
z87NHT><pQ60htIt=X{4faa}N-T{$2UydR?5_&lwkw1<zz1AGlA*ZI6awH?XtWSiw)
zH~bSw8AH5mTdvx*hg2Jz<Fg7Oq4?j}lD*CGU{i?Q0gyha_=pU6<a`vV<v<X{e+gG*
zqV(S?gAq(=H*##pKt6SAy5&7cl$y$ZseTe+a2{UY-&x{Mh-3)U?L0X#GwLDN^mGA+
zxY|HvG7ixX{M3-~!_~=LjS|cDJlK~w$g19+m5Txmfuw27v)(_$3+YKg_AFWbqKSaR
z`)DvT38<Y-2G|a;?@|DPX=go~x*kC-DBkxZDf94Zs6p08T(0Z$l|rg9^{%vw$FzC3
zK?Y0aPO#-$R=68#FzT3A-mzL8laiNtRv9Jti?-}I``uLf(EW~QDYsdgll<hWFl&t{
z>yKD&ZGYC;d{fcsbbm$6t>1l=_!aegoug<_T5;4<6`n7xZYvZRfdv>>P!qZ`0L;V+
z3x6OrFUPc@A`1Pj`ZF?*ZZki65Nn4)D<_)Yxe@wHwr|&-2l+jNaBW{SizT8G0Lm2X
zWZ=ShJo4-}z{+&*VKx?Rpxl1tSxL8@-ElWnRD%bH7TFgqvltWAyqh`208{SCbPu-8
z{+<=i@gPw{_dQFWyG`2S5t@RI<$H$x`RMs+dwB!SD3;Bn=?Tp3ZjV2AN!5D<eqavD
zl;v215JF8AiC^4BD0Mv&ij|C(!VHYe$H_phMyOPeX52|^55&$ne-)UF^{vW!S0e2D
zT$Kt5-pr+L@5BVlwe(Cv)K*@imp>oh5fW%WzDi=S=^KN@B(qCQb_HRVReYn^i~Fx}
znkTpyW{Zh=m^@neLsa!A(T@WXoF;xcHM@=#CBsM`B@Z+e&zP}>rvLRZG(yVNfb8An
zYu{G_U$ep+p~E0DWnnBeDG_T*btzG!Z((P>QkUJ&Lqd85YP~K|$RVr}R?Za*jjXv`
zSX#<%AL$#JzduM%Bm2-?Dl8W%ZS%v@V83}^B#Sx~wXgxFoYxZt3)^{Q1YiyXPo`_f
zEvgL9x}|V@)lryk);#2-{7IeTuHanWZbX<W8|*4RqpG@>X0m8SHIrwec4@16XOWdG
zHjjPYD>&9WMCJ9sluF9YDz@_}7CEYy^f*UxTd7}YF;Y*fpsWyO(u?1f=iJ-kt}4WF
z_)&gOp)KiYdhV#^BIwt{ecOH8fpLEImml3!<peG^jioL4(%rh5(L!fPWWUWx*&)xU
z3t8ju_sMapr2W7AlSe4iEY5<uXq;_BB`d?;--jDRZ44wHbFqt-0q2_B*<ovz`)5=G
z>ndIrIeC%v3_baJT2AfNP_D%#cC|-;NaJ;#fBB*k;^IweC|@e~UrTS!5WGp8st@{T
z0bxO}juqLF)F@6h0+PYP;xg={a}*1yTqGB>>V>Bo%N?CYB*H)C66lDHeR@6Cby&6C
z&9@3P^j%VB%7}a`cO6xMp;aZ*TD{l}W5j<*ZgS4;e6N_-%(Jf%olbcYA$@o0(^s-o
zm({7(+p1d+WgILl?$$7rPl4DYRNFEm@w~NSq}P>)ChYT_G(Z1WZI~|UjUm~bIl2QO
zVJL&;ijzvMxSBA2=rMpfM?yLb${N&u4lGYtiHwBj)aa!-<8fs#W6_&c{Zt$G=CW%%
z_vQ>*?{btp$k4vBu1jU4zIL-E80&5|qFrolfzOhs%Badd`-7an^<GH@puHAcTHNB`
zBoLcX+wF}YtayazRtf+z?g+`8vXZ;SnWb6?8w2mJ8d(AIiPwf{!ZNfxoz<@0Z&zhZ
zq88SkBtnS6E;z@y_9#<)Vtfe8IuUy|)y53hG$KssSWz*IB;*5xk%oSLwi~}J!u!Kl
z?DBv{k8<I+@W?%$=OuPk3!Q-XQiYY*=+rLcM;W|Uaiy*jO)P48MhkoP45FhfD*|ay
zom-J5%tHz755rTB_=hU4a^$7#O`ONC`w!>tDN4wg!HzSuAz55WAd93$1{Z&HV*C5h
zL=ut>O44MDQJ@db(|l;WoSFFEpo``d&As-F`s<uas5w(lkip#6o@%ir`Nf2vv~f(K
zZ!rXx%{^Gh+=Xw6jO1~VWlv+7{ybK2(b09WENxZfrTQfTL85}u+27NuCRw;cR_(t;
z)x$&)L1H(@;b(pCS=MK^>VYs_s%*wX);K9x(?M!UlDMh%MgDhmPgnSe^Z4ncY2lG|
zfb<$BuW$Hz3^n>L{%I3fm1Vr4z9ct&%e<X_w5xr!j!IoKVML+h(t(a$l?lj89s%!5
zJ-xFI(oNp-RG--E5`q?0yRM++ksJ<O2si<t?<`W(dyi-=B38eopJl4eXY7gj2$@YU
zlj<*<B#A)#FQ?1T%ycCA6|~wGMG>Mf*F`3ulX4TkrM#o$n;<5)$COMd2_(;WGmvB-
zyFgQ-;7fQ#>BrMSo013shkRxwfVKxuQS-2=YQUDy_jVWS$ylgnn?20WO`VR_DlomI
zCN^N-#4ZOJTx9tMX`wmUv#%MvRX&leFjH-(cgu5*6F>DK6dSm0{+fgMl$xNlT?{d|
zyF&WMY(k!qVjv(R0DDdBB5bx9Ex5u|VN&fznO_uY-#c}<o^pkBtm(9}22~e~_!*Q+
z_EgOK8E85E-aT|~|4>riT%`R3bUV*~-Wb0A^7L5xc2KPGj4tI!2kDc=Y7ZH{({w^Z
z$Tl^L*r~{vs918F%0G3e9~$?J0o`P7hX2DPM2~;lax3`Fs5o{6bDPt=tf#QDG{tfh
zgUUNAmmpjv;0hE~O7(ye!><1t^ihP$98w^+M9RuMi!?=K#JfTwCK)-6a>>$omQTNJ
zkovDSNxHox#|S!snsXsLie%V%K<4_^S|w<$F8WGW%6Y#}x8t0-`U5tVhis_(jZ!OV
zZ#R03r*3NP*!=;vKN2aSOb7f^`0{tmV5q6=d0EL10+pX9M_*X1pVjk7IBzFB(pkb%
zu?t)ukGs~dWQeL+xP!H^hjyNnmz&;Px+_MkQJ6dcggBkEG*&bDQ(y^Bw0Hhz!Pi^&
z=DyO|*I#uRWcfJd{>1e7Ylow5uvv0)!7NPQk+(g3H*}PR>!I(egRb^2;jXp+=ttWB
z9eWIAdYV`89@~$(Y?H1c-BK1VtBmaUTW?2K!wUsG+XHl=tdPGd`|u!pvUROe(eQGW
zKo1DGu?C5VG|d85Ve8W$!E{TNPt<X#{(V@t1sf%OvMl^&BhEl3SPz3sanXC>;++>g
zu5LAI;IYL}9Okel7$5)JanQyLzpJ6vuc2iU@=a~Nv!EZTow%G)X;bvv-!e1H5D7FG
zt-vJYYqOsH$lnH11FFZKh=YT^=58$G(z(o3%R1PAw+bdVb$S%%S#C)=TZx7!;HMN5
zdeao>%wA`R^=KE?*tV}VRh8hfj}G6;+zvi9@6YD(qi01LJ2aksf85qg!>S_=Ql<eY
zhw+EL?&>rGfvAnXfi{P9PpD5Mp)5$+Po45lx1LLlvb}8iM*D-#U`4hflY~_UtMjDZ
zSy?-J8&KM|E;^1$%j7dtP<A8Lv;w#Ym7bB@P6awMC2My8b_@Tm*d2r{C+0Znd!)}p
zk5z0@mRj|I4d!9$t?)sI7(KT?-(7~uFW#zsbMf}I={714KA<CFp0(LQ@HP^+x7A&v
zkRWF;4>|KGvk52ahu!!qvG`{V5fJ4wC@K)p)tF^r8l>C0z8sVI$K^*$c|Z>LhEQy6
z0DnrdsbKE=XEC{reXReLwXG3E8ivo0Wl`hQ{8XEv&c1<&^Sxh-EkHKr5NftjZ0TYE
zvdmP28V}TuUUZLGPx0L3WyS9q2Rk(fP1K7a8u}J^YYq#!Hg&4`PHyb7KT8mjI?J1z
zfVh>biLF4k^=%Pl-({;TrA~|VYvuYkdJ4>ZF)T{#(s8^mvYF0L-N4hRaw?z4pp@HQ
z1*4`e-hDDw8~I}GOSE4_;FaZ$(P7L_#lYELN;iI<hs^$!2B5{tSbQh}!OmCJm^)YF
zHW^29OcM><rdwm8(D5KSY?)y{LQG${xDJ$SO#7Q&#@^yZVd;js)FN<r^7UeNCz+k&
zqlosB=wDZ>XAu)K?XGvVY=YcC|JKBK$*Bl2IqbEhRO*|<4w={hY=*PlDOGRDrjHE`
z9Kmx}f(_IzUYhCm_#AC?Y`qVm5ehn`Stl2I3(ORfqTFiUewdQtb@Tlqqa^l_<quXl
zcf?d9`)SS8#_}{$(7K!(dBvkcoQ$1)(#d^hh-wi8%b(;7+1Rb7rtM*C{k>n8D)fLg
zn}S?{!})|eC)53;+K^#9_tKwviVvZh6(F=+S?>xg7<swE<6cuHdWZd~BT`%AMe83W
z&YDOZukWGeAesB-_>1Vo#tXC&D?*0+m;R?GC0wIi?r>u&Ot|n0&n6)n0!(4eOq2bH
zi>mJ_wZipcY0t5&>1&v+rC8U++nBlup~21q3bhiY2k-P8e}3OGL#kK~KQrvqetQ7i
z5nD=|=D^@``|~Ce!-^|8-F_bLSiLIvoJ9MYe%ru#$RwkO7XgA*Hu5DLr!y26bk+D&
zYvTa|4+jkg6<;pAY_GehV%sF1t>Em!w{r?y*ln8<&;<)~)~dfA_gheD>s#}Q*WV;1
z$(`8|p{lEDBEXV_VCO$}#=1;hn%;7XUI6y>X>8G)t5_S)FuKExXq$EfX!e-tNDRm2
zDPNhTe$@_r&tPAh@}M;%+!;xlGgkFX^MAftOV%>JIeV!2?t!Y4iaR<eoRFYKmG^3#
zGh;8AcDhrQ)C!#JfVxU+M;vM&oW<*kCv{GE8ytFjcC^MbS)6$yDav}Mg}CJ0Fa>k=
zk_~Y1l?6Ga23@E9LYcb;N!Cw4Yp>mhEoYJXRhUV4EnbT9lO=t*K@L6#!~L9(k~nd@
zYs<Cl)Vn~C@rk|%DeeWqH|s@VnIr>Jygzw|o`|=G2?McHng4m9mh=92_W22?VdOBH
z)}xFidq73Ew#-5$%gxSw&GdCIL7%|3uct#*#^XXC(~)&BHGVs}EpFm)pDAhbk9gBp
zqvDqZ6)`~DqRYGATVfvm;t`B@p3nQSLS_}#R2+79bus=(%t(DllnN2XUw<Uedh7yE
zHvW0D<}rCj0RM+s%71=Pok=h8QYbs&!P&};Px9}0%3G!D40~n60bjoKxS8?R3wtw@
zPd+bopX!t|q*DGiksHqh$Ge&yL#7?SRIL<i!4jTemMq%lIB)iwFK4vL@|{9rJsHMh
zHv5L&?1~zOS+z}k?M=Q)n!H3C*Fx%Keq|fMaBkCsw%spWYW$rA#YiCpr4A%k%;Xf;
zB2<;{q>=jatClqmQu1`H3e}{t$O7r{wrdGY!3^r7zXg)4p&S^V5K<WEb1Q#EU6Hf<
zo&NhZp<tn<%GEcqxtOS&K}5goFxT7Saz=wHfHHQK$9LGl<7pw_n{Y6Ms*G*fJRxqO
zsu!6@dP!Kb6s^(ICfb62>?t_c11Q$XKYqXh`jlp-9b~jZ5Swx_4tgs_5ONhyYDls2
z+Ad!$L_&s9Et6R{54HzPU6JoGEtFMhKV5A5g|hJkNp>kOtMncc%~1`W!~HYIehebA
z0@X<FoYYzNaw!r_ffzY8vR7uh#s3yBtH*ff>CW>y)@SZLuBu9))?y!|N~r>(T%yU2
z%E}|igUSnE_sZ6->esD68xXR~*}clR7C1F%OHwiu+5Ie$yZt$#>58hegqm%4E(`nC
zPu8qdt4961dw^RU0lHh2Cv;y#SRq}>@^^hlkJJFT-p{Sxwj3F>UXK#>kcr#_PQ}Hp
z1*aXFFsl6I`_!x4AFLv!jQR?gzk;2A+)gL`+aj-EgA^OJX(m-%`9~Lzl?KL`#zv($
ztFj-;O!qnSaAnepUG_S>BhJRmU#$it-I<yTCf>{Nd8UTt;hPmWD;`+H@H_G3R<8Ij
zaX8jWat`589}Kz*7}k=eE3op(W_}}$97=V`jS|?Y-^KBk-QTwb5L7%0TOU(f=Kolc
zO<e4DIGqKIkhejfy#k33^TV^<s>3=V?}siNQ1)KBYbx71g;l4-XZ>7?c&nfOQK~=H
zO1#N4!_oyd1w@nl5tVBbT@!oU9cHsOq_R^c*O!2;n&VSU|B>@H39ETIpd=%|)JJo&
zaH-~Cgo}&oo?5+(4T%*Mb}YG#9N%0PojP5>7o-=G6)iO0ALgY%<BvzE`)?o@WAsqL
zG{5p1)$ZVX#=`UsO#r3#;xX-_KQxoVRWrwQdL%3D(q`mHhi9Be$^Ou))Hux5ebXCY
zmOUw-oi<<fxvQlQIm-^sf)O{K?as&xom3sWucqVAUG*T*Bt+F;m0UbA%PI@a${n7}
z%cY8wuxoc7gaa<9W|oHK?MQ#}=CLef?H6)|*ya>{izv<$H-^I!5>N-bt5M}$at3p~
z*#Syh9(sK~v>Yz1Jj}bvPu`BoeacAyUV}BQJ1a-LD&h`kC*RkN4NJ*<WA-xT6{K&c
zHXBJDXE`xt<48>BF2~?cC6?GDA@WOkNxN^6X|qfvYHKY0f4qvwd!qdF%N>5du<z$>
zU<+dLPuN6f6mh<4BH2uDeHE+B*B77<bb5>HdSW$_luZ-o#iydX&q`vavTwU&-Wl9U
z_jwhXsNp64`OKAqkExG>e1see;;4wL($hgal;2OKMLE{Ks>=I<($QMmr+Ft!R?Shh
z=%!_X0aM63tV1S_xEzF&#gD=wNqtp!2VfVE6`oB+tu%mJu%!Fxnqa2?@VWpn5DgoV
z?YGsnEmA5MIbjoXW!OW~z|UlNhy}$}A4<0Q%UEDeD*r0|Bu7<W6kI|@+Dkd7(`G@{
zN{abY)W1KH%Szu3r1_}cyd`b~DqZH+%{bOpGWEY*L6nD`9v(3nVikloQWjb}W$!7S
z>JTfTgwn8z?5o`CiXw0f$Rr2(PF?oUb!ORwBhSTH%g=NQlh~U}T@7hOa6=T?A2><_
z#lSuMtZ>0NGZ*k(2XpqUP5ITnsFR|+r!mHoqVy%iA&aNVGgy9z@T%HgWVRM25wZa!
z(@IT+aTQ?*+Oxejyzg*yPGf*)#(CDju7&B=Z?iLwf;PiMJM0E2jnbf3OSr4yK|sBh
zTV~sXRcob4`pDtv)P0nWT{2bV-7r2i;ERf=+-l<77rj2MrT}J2Rrz|!ob?sE#Xm*X
zLqOiYy-VYpMQOgu(^4_=@ej}N$JwjFRh63kR{jgp7fxVlBumpARM!8kIReS|jkiVQ
zMr+Ekd-Go_PPwAcnPE$m>nBu-9z*YsI~e`M<-ph);)SH?*W2Y)_ditae&n}Ri|QbK
zHgcf>&#m?q5j_U#Yp-U`?n1pQ;w!mu<(RzzRwD7jm7?T`(bcy7y`88t*Ci&-3~)Z4
zg{{i4u@^O%v0W6~MXYApmoNGC<!Mqu!J7STt61+UBGvxZd1;YFd;F-|YT!9@5|tFG
z!C9C565InN-+N!?mlmhhqdL(RzCx99GDg`TDsV}o!durbmFZtSC%DwJqG5}^Qn@qZ
zf@N7C;udrYiOAWASNu@T=rHVOfjO38_HHC~>8Wq3v<1m}x|m8{Y8$C)iYaKbxJlHh
zupd1dnbYo#UKp9KDIYO0<P~)CfTEon>q@%)do1QB=OZjRaUbO)8R@2hwvgn~yJSe-
z6b?cZBQdut-q#3UxUryL_w(u+1|`n&vaDv5+@`EnGB<+^`I#8A(#R>LhYT}y_jwsW
z&vJsTB7^!um9ODwxWz8`Ufy&}X8qEXf{B?c9uA!@6t_#FW~gD|N#MU#S7z8zUW@2C
z?Lra&#f_G|=5!R#T&cnw8PiLY=1nq-bun#yqUxTexwADL7vn;<+Wvh|L-DQUuP?r>
z_Lk%V%OZK={Ja?&2JS@#?g^>X<zlNSr^TUY<(9jq%}*N7OP=0hdYGI-NP0zv{N()P
zi~}w9_xQ^<4jdjK@9@(kHMrB4mx>=G^vAOic(i`}{^PF{`5#Xsa+4^3T{KImvd?dG
zBvrZ*8>(VUs{LhnFqiNO#!!Znxg6Pj+j{q$ngSU~Dbu|K5#PwrJf!-x5^I5@Q4NG_
zHjGSwY9aHj7a_q&*lA?{Uee_JUL<ywNq1a*p%=|7D5<PjKUg?i|L*!~xh5zs>|?iZ
zny?I?b~$2bvCE2iW)2wn)JLk}>z|L)g+mXi3F~cS7bBviX<kGP_RBah$0uGmzLq7^
z^ZhGV1gprT4L05#j<tCCv&te$DM<pQ*1dr4Iwnd$r`9V360g;ubCcx<AccsYP9I5~
zuELzh@z^9y&x>-(&JHMYUsnrJ?z;ORCSb2K)5m_DG+*=~l-q6f*VwCMU3#_59@szQ
zEe`vzIyV8J(kV22)i%>?0Mnb9(z_cOyL7M`cfi#W$&Id_-?)~tFCa_hP&MHK;*fiT
z<t_WMEVv2dA!aLK4Wr~FB-MydM3VDHnQ5&vV_*9=%?3D&?C2iY{a}*KhcXF&)8&kA
z6B;=+XH~MUT;DQ7<x9FD5^9&Ut5C_`s%_WRe0uf4z_#B6=}BKwX1|>hr91p;4-^sd
zZ@=o4N&cc5el=?oVc+-xJ7lN$fL^e<y}W@hXhfppDJ8v9gxc^et5*E-m)s0|rtKU2
z&G)n~fkeF3{c*PC*f52!4atSADu!Q+xqde;`&F0hMLT@Rq!!6G8A*H)py;QTp(RM4
zx6}2N_)3#4i5D)i<9VnNI;E`rbo@x|$*x^mf#yhVR>mg`N2^fd8&m#b)-2P5pn$pM
zhc4cNt$J^4d)G_@q)F!ZP|LMPN!5KxhF$!5C#Gfm8N*cm7Y`Xx)(;%dTfZZocxrWs
z<5Zc@m@u3^dz7~aeMU_|or<O=)r^INz+8%MCn6}88RrtAFA)v;?IoEP0Kc$yljJqV
zIF|r@3YA-;A&-%|>*(z*wZV>;7@;J~+apggVL2;~sfvNI&&peLaFQ|_%)1n+%$i9g
z9fkv?PsgLf#wz7Le^8D+n(4m@++tL@^4fX90zBykQ+o1TgU42mMy|dD)r%n>kZqcw
zxZr7?l?UP9sJB_E?dgeL)}w=}SG5w=Q^B1A9mOJ3LAzd$5g;+@E){1;2Y$IXVhVMl
z4Beij`J0@+rSlh#7sVK~Gc>+;!CEFQowmzVt(9z>T`Hf;sbtu%{Go0x)i$B}px#!y
zxb3v|mY{J@QPT9bVN{OUo?iT{Vx$6lL)b^t1Kqp>VdnA2G)Nhu7gR)#ZR${?{wImt
zPh^r$Q(wN!;yA?RlvV3J(%J9OCD#JW?N=o#(EzX3sQxQf*>>WKGjrin*y=fLgKd8;
z?N~QVqIRnDw72bgrOd0JKQzq6s5~i|lrz*TG8}TRh$%VW=e=BhRV_WWKZ@HiE<{rz
zi$+83Pe+S+9{n3ab<EMPc2}O(jL%(t9r{Pv<eK2J;`apm9Kjaf-u~V}?Z|U2xxM)u
zc=gvonSP9z&7`H{+`}`t$ad6R>HtTdZnLuJEj6^Vv*eB?4K{wF+Wq%pu5ybbQ_EhY
z?Mt-=sl82hxzq*Kh==4x4uKih-U|wEyQ+wCMAjVnMVoW#Kk57{6_E{+_2ZA9t}7TE
zy0;>#dhJ?C0_rNRNgEQ*^nCd;IZg}Q1MZqsFJHYsH)othn?6+#5Y+g$DSagvF}Bk6
zXiIp`vQoM;z}bpp;HzN~S+`%*Ce#;Y6*N|T5XK}Yb06p1VkFnrQ5jM|bw_D4@#9np
zO{jq|$vo4M#3yaqR@os1uL(u5V~adh-4=dcB<yVyi_;nr^?Q*%hS+N+yftRMk!ehv
zf=CYRsNaa^S;|+$NK_)*TUXALv~Q_t2It=oW6{rGsVTvgclS$MXdAUHr999RCQ+oq
z^y<&MHd4RJ599BuRwBftMC)*GTpd%9vpK98zJT=V+=xH>873BrLb4p%#Mujy+U_Ti
z(*`7DuDC=Il3?dmUZ?usB#EXZNn`3yjxuM*rHH(w_UP0(_g##AeMMm%l}?ju%d$50
zW{2nX=vm`M5A2sD5+Tuq#iWaE|7Zb%q%Zp7RLGB!-5c`~ARks`eUZq^>VTFExEX&$
zKEYCQrDtt0&sZ`|Ipm8mX70Wn&be=Oo!_$kXp;GL+0kCn&$J$)$#yxY>wl*Wd-c(1
z+OaAs{lL5@hcgL6{l`#14{HnJn@y)_FV%1D>J$TbWjOLUBfkN5fnS$P)W6)-WHjOk
zxjG-`1uUlNog{6<YLUJE%lxDFW;Llq@y)sIO@^4zB$3zDo$Y8YwTt;t*z~4;8?1if
ztvCAmt5zhBaHzzo29GkuwkU_3?~yDG^}{xzJ-+^HNx4>K=@wn7R70vg5tOy|ajvv*
zd_(JjE6CsWrrh*6Cw|KK;!qJj48EbMous~)x^wtlwX4_E%t{PkCxns4t`gNVyCc~4
z2AuC#+u`y_o5X2nZ4bEx0HM#Onr#h(l2(=O$;&i>`OB0WMq#Oc{t{4Q<AbF8i{rb3
zn9)iLTl~BZK@3w1CuPdAJ{&kAH&|@58v9_*q$MR(o2BvU*$Q@oxNq{OkBT4hegoS>
z8a9b*aX&cY{nE-&d8`>TVUAUzpQ%6miPA`v4Ef!oJ%T-(%)(&U=gaYot=yu^{=s*`
zGfF#sfdM0-Co0~29IUbBTDv7a-XRyu!i2l<|JpnAXsWyK@gts)DXvNpD&ZC(nF<#r
z^L))irb0sI5+M|22ubFdD-FslLkh){p~y@`h7?bf%)fmu^;A#KcYW4xt?ydjKR*4@
zy14K2K4+hO_Sxs0y<g`%Uf`vXwBn|@lDDr*f!&v`R&vxa-bU3Wv}-E$s7~X4$EmrN
z{dDw_8N525B*HA)jMxLbCeGAtJAjUD5hb8y>dBX7g6s-0;M30)_2)smHl9(I8^yK;
zd1!lWGMJY7s|_JD#k!Q5t&J|Gq!AmsO|XV$n*hhD4rf`~^IJ}Cv@O`9(}t64DV+q2
z#>vUfd{r}DhT5rGx#iNN=F$V*#67`iZMEm)B9qE0Uz{~Z$WxXsJi4UyIRk>B1xCfw
zOk@;#-izP$XUk2p+^B)+)$+ZqF=J%f)Nd+-IecVzt8lAwEQHhVKmTH1bV2PFqiy(x
z{P?*!*{TM8YUlB;6I}7R{g5EbJIY|LE8jBzXS`^$xTvGGbCs_;;lTD>vPyrm{w$!J
z()-}M`bhM-DstOJNt5jqU06Z4IRLDdY-4I4inAe&(WO$Y4O(T^C|d;@(PxvLd6+$T
z&X+yRxw*1(DaNGuqL*?HESS@4b{?lcsMB!k3%g&6Q%>cUcYc3_efQKNd<QCctIV7#
zkdVijz_y)&uv4|q+Z@Xw?)ejTC|c+hByWgDGZj{$G!l_B-NTbb;Mc@m-g*9+K~eAB
zxC`Jc-+~mYeG}%_OkW$p2K&sHs|VZOZ{1!*pW_sct@Yg3$;0pv;D+x)`BXBa01(oA
zWNG?|>f#E7xg9jj#o8ergDvbEy>ZxHT(j$o@pxuEgbDMVlJ+2A4^L-o<zptB`9pPb
zLnZh&Cts<G@=08%8fXXDg(HLu<R*B3UWKKJVe86LjnzlB{?kuQ@2d6$NXoa$U^=Ut
zGxcltPZgSPtXd+I3P18mVK{SG*ou$IeSYNT=rx~D!_S)zRYK&C?yT2c)>mR*xZW^b
z-sO$N@fVDOV04gU!$?UzKP_<MYDR>w(!1Ne_vS)l?^{rRo`l&|EtBWQNT#!@zIf55
z-P5q(5hpExoa~W`&ARp%C5KxW?XaVH8)Xe=;*k84nHRQ8A1G1!7N%wFmy($=FPUy^
z@C6-`MRK7X99i}S-+>;)t$Y+BmicES<Q*3GxgS)QnztJM2obUN=`CT!iEwuF4mjVg
z{wy63^=Ci&Mo)DZ&OpOIuw277*cjx>et^onFp5;^@qqLbM@20D3<4ohe>tTmeg2K#
z#)MI2&3H47`ku?Vu^!_uYm1G_KZVCkB49`2s8K&QIG1F$(=wcbPPIMSm1c279Iuc3
zXXm8C@n#)--{EW@PBy;PEIaAIB+`w1<(YmH=lUQNHoh(|vwM~-w>Z)DrQ$pAbi<W)
z+QOmkYJY)4pTsG4{MdHLx^k5M5@||pV;j~N5pPzV-NL$&)*bfFnw-7et@rbt%9b`l
z;U?~(2X(em<&*7TnkJB7O;8%PAe`q0lV0a%;Mt>L^PHbT+=h~zYAi_KJVY^SuefOQ
zm(TzQV^^kZ(JKW3u@2FC`sNh4%{e8$uTSa)TU4CMDjZ&ZeW0VT6u3EuWex_O7lHi`
ze@6lQg%&DFlEI_^HJT@gZ3H9Dn#`xlCBS6+Zfi^e9SUqT^vk2TFl&*3&G9rMC{K58
ztp$s40$?!_Gxg^+6)#poL|i~w&=!7Uo4X}L)WqzXGphizh()ZK8#c3(%{@ANNAWr^
z0gE)yg^$f&OH#jA@KPz$_144i+<@i;@q_o1AqU}=4GV8oR%SiM{!@{Ka`Ag(SGE2u
zl2CtP%^rX(ye_bct#HiGRfMUdLR0MN8ANcRt{J=4@idqze#N*kqRWqO?kcq*Z?EMd
z?}_wNRdATE@b=6^Pr3b#n80p0_Wq={3`d|}Dw+xgVZ-$dMr~gk#scNdJq(T+f(bJH
zlgNQWXRYXS4>s5to>x(hm@kCG_I6g6KLy3j7GTdiv_{{~r}38Uyr5UI>|5<)+!Upg
z8u6z069vru=OGbK1?HETG7WtK{<f~Bnq+<t<Fw*rn}7hMgqDB-yJkj`;PaV0=bg6X
z`+}W{q8rQd+v6$EA5O7ujIou(H{EU`pKVd1=g~1V=*Vo~--<nMd@5V-`XJ*ZfpTBs
zdNw}r)T;)hXI}>UORY@_3tN$A*6xZSnJ(HLCRRhm6WueLAlE3=WzIsuj$$kyK4nzK
z90~BpL2$mS8qd={^J-sG`5QrPOR|)gS)H#ScA=64MO6rHLV+_G0)r19u(<JRQc0x0
zk{WN+>TbAT@|e<bpDyz_L>mb~w6kZE6m^G<tq9-Z$V&QzsNr_yMyK&xI`v^}i5%BV
z!(8J{b)xR{%?@Rdi3GlzMdvQmB_*m!=LKKUt=Pk<y4suiyIJX$>|T@kmF@TO!Ah3+
z_DN<vW5Fq7<~`2wrX|{Mp8HVgnCZE(n<PUN%p*+-7JRB!j#7J2NTHEKX>F-#J2hTH
z>zE^7JcQpXGrxb3aqk}3;&%v$)~%N+L6N@H4UUX0?9zs19q77orNB7ZORawy|3TqK
z4)r1L)d9%7ICtY(L3r2<EO8D$PJ+OfmOR*eIcGwF&j$o#-C>B99IKSU-){5NOsQjf
zGPoI~)}y?7DiRG<KiT#R+qZA`yl?p8`5S$-ZlEQnrtlJGjcLevoZ+Of=8y?NdbmRK
zrFk0FBiWE1c8dYJY80grLd%ag^?ro354GAzvAEo{C7`^$c(0V22ByB5kT(=0J6i*n
zj_|l<2wX1izFNG2@Ax5~B~M5RbQulDvkgOx&I@sQIFoz<c~x33d2@yoQW?VivDEcr
z+msJfxMo=L;Fxs8jT;qt#Zsr`G!|hEnV=D1^7tgoH>x|k^$v((_sl>HhIo7EQq6wz
zKkh<^^q{VB*?L4!Qt|C~ax#gsXCUrao^-F{B1kEbTE$T4&mLD=tWA393PiOeX5N_;
zcP}o!eaP0K?ix$ftAPY@7w2B)Bl-K5Jv~~^twL(nxxoV2(QJ-36gzJ<;$c#@xUa<-
zwgtqQbCz;D0&c~poY5g&g6*2-`3J>i5sMUO($7@fQBRhV_Y)|#?Gg-3T<V78iW6T3
zF0N*c6k@KtbejY78V^ROWt@AgO#9Fbr2KS@i;OLP01}fB>>HmJ#{_r5F1SUOzaN^=
z#G%(pI2dyLeEp>yy1TqQB~NgTX<womj6nSsNP(Gb9a!!!?D`<b_>nNZr)521o>Gi#
z!P~00<ZM#~HxY`!!3GeGPzPV_=AC_BEyB9?Ge&93q)Zf=Lpj(ylH>W-chUlQ3-Y3|
zGoEq+P`~@y2K5e*yyY^MSv~2!k&e{{4qQxUYjhQ4ILYw`HIK!mOo#PcH_NG6?jC*~
z^#YP(3}$0td)&N#irsKRp;3+?6dA*>ZFcrh;N60}*f<rdw`Zu^cp~t5FTTOb;!#u-
zB;gz;vayQFr#kQ^@}qf59~`1;Ax<>{?%loC1=ElNgj#+e*903(tzy2dT;fJkJ*^;t
z3=b@~t3d&nFRgAFnGLSA1Tyvf3EjZDNxfEZT<!|xs$^zSt4cUlDXA=!-!&?B5bry!
zD25lLwxbpyFqRhEX{7#T8!SBaz&CCcVuXYpx3Xz-RtpOQts?eV6OaU6BECZC!v;_@
z-#Pi7!#4g>ZU+NtFfiBfU^VjW=pCE4a#?Ly8!q3DlKtS$v<oZJVKIBVA;8n3)bwN3
zst>n{rh|4Z6Q2CR6M7$JBiH?W%a$9J(yl!|prCDkk?(RFWLa_NrA{0)vd<4RaPP8?
zch|Pw!$+_#worCCS3GC5G7f%=OKxTY@HcCNwvE4+V5bo69eJQbC7;Ud7`r><p}z)%
zL+HNRChP$_iCLD+h2!GaYq<NT1A6KqMb|-xwC;>gqYhzvr0gEenr`YhIgHo+^}hiL
zS&HtuICiS2<;r73mWJH3tUF!Vo*z@ryaT5~m80XQ+#yLb9;JCrDNktQ?@dlfeGHPh
zQ4SfiTTj&npv>P1%!vIxj;LILE=~~>dY?w`W#DN(H;~H9yVT)%3MdgtBy^A!P6<%V
z>A{x>G$GUIZmUo3ZFfYRbtjUjT#Ekv<0jgL*)%8Ie@>iaa3X)O4GuuQp!k!1<`lbq
zxp_=_uNal7laHEUmD*(xIt>NmZ(lApD~H3ePgw7qn(&xSpLE@MM~uOa(z!sEqx{a|
zIemzVzr`YO!dKIp88&9S-Qg7PB!rP{l016O#@^x80|OMc+^?mVdU)18M<bX_=sZEU
zF{03>byxNd{H&(_(Z&E?6(Rf1`z@DSKdJT$gkz^+)y}78qtJLE@C@Rci4R@bO(qIQ
zV`W{yxN&}`s;n?y2oq1|B>a_v{?6%p5Jlbe$?D?+`%PQ<h0(ch>xzRm*G@iZ*q*Rr
zgE)Bk_6JI5%908bZsq|S&*Xl5%G9_;{cA1zc>4#zQzxn2ZJ+1!xD2~zSU*w&5Az}L
z{N{tA3ut1=^uG+yQs1$y`mGjc;_~VJsTSKt3WRK6T2H~{S2Z6?I}xt5@%@zh&ndTH
zc}6I$1R;LC1Jb<aS&M+=|GM|W%!?6?g^#x_2@B7`CYuUR-iKVR9f8sXd-U>0LNLSQ
zaEVKkmI88bKq=1bA59r)C=kY{T8GrIjzOM^AmnYO%>#ODFu=QaZ#YCX292lLq6DhL
zejK70&V9?p16tNHY^PcF;qeeaRq?hMd41k+1r{~!7>c_g)9LQG=dYIPjSn6V*>~4T
zmQBKguQ{U<6GN5D|Mp9E$~Exai@NQORXl2JCfL1<QkP;W36F^^2iVb{s1Z5@c@b&}
zTPl&|x1&M;JQ|IwM0_-?q@k3W%$BOGDl^xQBRVxK!Zs1;#+|E4<#o3+yK6)OqhSmr
zr^Tx$h_#|bO*OYYfiL~zCUp|WA>{h(-Rl-8zS$Kkey#`c(_ZUahFX-ruI-@u+Eyzp
z2y^PQ6mX+x)FiSKjL`(#tI0I0_;E>42>w!pi#-t@kkcLY;eEivpV2f!Tc$UC^^}gK
z%_U)+6U*&O_7}+2O3!N8Zzykf%Q}WICg8KW?2(S$N9{gVg{M7sLlYxn2wUs2fhvzS
zT!0L&YqB!b20UVEN`i298v2<P)1vA74l<@*7_9J4$~;mw;N?l3(F;t_y17}KN6zo&
zy{eUNDwCX=t7N=U-0BY7BmK7++NqicBVr|?E?r4D_gY%6K%q_C>CjS>a(D5|9dkEx
z2W(ilyHGN7lQDKU^`ef9yi5j<>gXMY>)7|5^;ci28~4~5+^1sN8w>G%%~Y(%Ov;q;
zS9L0K^u6AOiY8FY*T%_;QFoxsf?AOmi)ED_`0I9${VAH}?yuZ!J5&g~(W5K<uX=CS
z+$=AM&6Ooc=ooH?OJo1sGYY4GwkSWeh-Ku07AM0yISYR&Ladz{86-t=Fua?xPu%-B
z38NyC4J!R@2@Rk2XmH#nWmgT1G12n1hTYEthJxIF+M_gzgTWBgAjZkyWk4vY&(pXh
zi$Ocf=in=exK?nfnLW2j&S;F>QW8$|fXvXiG9Ng8xJJfAO^5GJWgLyllq$T?0e;_P
zzynt{6qTLLc%wtMF<?Ay(+-_7&}Nk97+p_jx#=%Ut#Suwh`~d;)5`6sY5VnV<}gui
zqajT`AInt-I4A?iV~eUa7)^LD%b{|JeeNjbVq0yr5a^P^b>Ee+ouUp2Ox?V%DZgNt
zhTi|sHsm+J=oNpX9kyd}<@gCWxKSf{3*l&jkYf_e)TV*o#aG{6;pGv_K*=TG@=j$2
zSU4;cg|h?HQM?82aC{aEd9r~MPx=(Rk1S3;Y;rcFp2=y@j%|>-PFGV$K{1)UkCv8J
z^&<P&Ob);x))*0XkSmleKh3%YfE$|rQWH`4)%6XE(Dr`Xrm+`?3I!hT^f=r{^6J|}
z`7Q7&CmP0u(({*S*C;|2GSRvcJemO5HX#SmY~Ft|r5~-1W}MaR%!V0UZR|1;NTyMS
z>$EQE-K|lW?Rxr`vrvXB0GHv8+iHZ8a40cb?t?3QaPM5;aqL&8&qy?dhyi7EF)R#c
z<(DJF&Q^VA;_ZsZ@TCVEPz_}Kcp9BbkC1x_d8Rrw{3KOeC`A0Xd6J18L0=T|lO<(O
zx|n7EK%5PzFDS#*P1KrZo8Ky>=f%e%7HRkCZ+NF&$>$@Zv{L1m5HkMM8p_A~>VbQv
zQ{~Ua3ZdLz%bnYZP=hT{j_obQCSdA(_dl$aQldko_#?_)_2lF>G<sT3<PKVG##O7M
zVka(%>XyX(B2_y+=#19~tN%je30KIXQ|!2aGFl?|q4@DQ^`^Z~$VSM$pUQV~KiNv3
z9fk{{P$InE&F?SCQfVXR_LED)VGbS*qX;Jr5pU{(+9~Bn!w%yMUc%k}s`0tCW{O5j
zy-AXC&segoYL8YI_f<WH(Q7yna#Sdbzhqa5l$i2*p<h?Y7Ak@>6-swOo&Bs3)#l5R
z0Al&1z+FRM+j_1Enu1M?1g+TrLS!_K*H1u(T{5HnN%wtocG9vNK0<-d0jY$;lJ{0k
zg)AqIdORTa(mG4!#JDb12F(<&xsT#=S0Qjaq;o?&Q4e&GHXVv4qWL1&pbcHOPB(!k
zHvu>a_2m~VyX83|arv3$y-Jc3Z6>){kZZPoggH1l6aZLtWSRVrGtD6tGt2JecYX%P
zX|n#y3m;3F&Np@AtLLFc72w+tZr-v;fQsPYmg#9i_&JN{AGBKH5}ElWDeCDG(qx`?
z@0`h~$5%tN<vAV`%hw*dbemj#|2ol!#4ne+llj6|6GHA1D<XMOB4G<N?;tLv0~lbB
zpDb7vNpL&#E@AVL%g2BfsaYen{Z#&kJ{Qer$5(B}50FV0CYXnX;zTGKO#GGYblyLI
zSNMcTEQBihR^c>k%JTECdr9$c2k~?Kh$2&f8)lL*D!Y)`Bp{SeH(}%{0}sE({*t#Q
zm5sEMC5t(8vdZVi-}@|XWoI@*18b0?!N9+fmWy$MYU)MIdT&HXl2%2~Gt)oxU~{Si
zS$s<c21+VrTfFbGOYP!02J_YP@v3zIjr?xP8F(}~>^tj?ZgEd&nQo7hQphe-4~}~3
z0ANMRXr_tQl?7aglBKcA`dfT)o_?@#yZ@AB->dP*Qfo^d#9)gQq`wAJ0fPeEsr!At
zKz6mxU?hR)bhaEfDZg=UGKH?)yTSJq0NZ$Y7>?&0-<V=ZzxBIQ#9&gr=xgRR^V-2x
z?h6VU#X@RAc4XOXC^W+g>Cl&V2?ynY35a-m(k`_Mn{&!=rRMCk`s@u`jT)4D@)Pvh
zvu|$L72!!LD;b=;>kq{HLsDa5beejP7;H8Yax+Z?(4!wz=O?9#I0kxakl@zD0)Qd&
zm?7ue^W=PwnG4IpgT{2bPbl8I!c_N;vMrw^HAy1rNKn}N3)V%X!30kqG9OKWpK*3z
z$uivS#DQb`cTeLEQOsUV?C)%=+Jt{cWmu<dNz1hDQ&N=Zkql|>Y3&y*qxzIOroI~L
zBcbACYnA^l{`06n4+n7seM<q-{{_3$CP8F;oZxefcs?^E^HK)fY0$Qx=OdFgHbp|y
zSQpy2nSdCK`;DKx57}5eYH;GYrX4U)WkAT1e==GgO`#)C3-HE#9e;ht=mghJGHEw~
zL7L4-B)2b8xZSQ24O#Bnd{)8!m-ZuzF-dPxIO#q2u`ki(RfVgir6sm0Dr)VuVW1xq
zVlEg|f4DkMDl;da9;Ocr<)Tcb$M_xDMu)%;&4+nvNQ^Tm+f042lHYQaQ6H|0A;Je9
z98b1vLSV2Q1YWnZbJ>rE*0(bN60*${X*zESu5XUZb?i{F3nV+XS^fG8u;sfo)N&B3
zPE9=!d%+K>)ei;3D-hMM1er`y@*&V)rtH1mDD(6$AdduhMD;_hulSZn-`ugpTW~JA
zI(*^mB?`@jlh4`t4aggqrhNti=lf@NhAJ72bUhUPwQR(NbR#x{0YS>v8S>x4$zjIb
zOrFMHF+#gECLn|Ne1yN$dzsZdh`DH0z&oFo%$E7eg7<qjrwx(|2aDr$4HLx<K$D@K
zfB%<@u&{<NS<wvg4AFe-|2%jqn1t*z8bwm0p_;oRl>D^(H}q3;$ns(qSirNHk=KsK
z?hf$LI!8FGwV7OLzS6ZkTUMRiM&y^pAui=8NS&)De+O!o?>&H@@uLI8zd4A0e{PE2
zzMqn*fw!<Bd@q;E>aFA>BokWI%So@^qz4TaG$WGa%txe7op<@ow+a6$AF3^2Fy1^j
zw}BVn*r+?D^&TBiJz!XnL)UzT6HPDUFzTa{N{3Y#6e@guf*&F3-pn<vz~enVsjO4U
zrA}u9d|X)rcPuTaFO#43cjMA<h`l^gG+|f9vj69o;X=?r@^ltI1Wic!0}UkKCQdYv
zBH**EY(5M4uIkma!zId&wgH)cTr9X}CRT5BEGJ<F&iYxaQ@EZ+W;b4W$?=BvuaXYO
zwyDi?Wp4LOa%VCI8{V+~-6O<LFTrKwR(<%q6scMv3d>y-KS7Qf`3)hbkccZyWj%0Z
zhuw4#3IlkS+GZy^F>ueUMw(EL+2<qM9}!RYZsj5`=gz4UeVIkQCk$)V12+7vx*2NK
zUR4<b`$`8C`!FuDFs22Nq0X~C8X6j!1@m;%>ia3PWoe?tMy$34lYVdcyo=?lca{y(
z_T}yBL0WzTSyduX$uEGav(eGkb0i4vC2G4ADYod$UU|PwuZBMjOJ7enRdTr_x!$d!
zv?FeS(H$>kLWky=wdzFdBAwb%^>HfULP<gz)jJ(fk?SB+MsYqGYBXMzL5<wR&(MJH
zDTK(9yP00t?t)p;SCmJyNz1H#`m^qG>tB>4csa>R%g&Cto`-YIZ)rY$4K?myP(D*D
znmvoqjce~>xR5-e8|GBpVpOGbWjF*SvthYG*PV-IufdnBRKb|NKAA9cjKuFej6QAk
zhqaK^nleGwkHxmrSVTLg^+(S2GaOH?<cTuV?>^v^&Jo);`2ow;M&;#QlBv!7hvnTT
zxkTg{aoqaiSjqb7<N8)8=bv&avqLR>s}Kcn?IgVf{Hmv1iqi0F+!8c-NHp^}48#au
zwqd6y?27aoZU`MaOMY}{?1|%KCb^wbS2m{DZc@%V$&SZ`SORe@`|-dlC>|{W1d2yV
z)c#p3-hhOJ(M<8Ej1Iy*b1-N@qZD{Vh!H(JSZFQJ<{d~Jhft8_!Si&`asv#dVV2?+
zAHOK!K#d1t180*D$|RK>8~+<%n-WQ8nAKRBof@LfRXYxb{{h0;Ha=`FMFZ_`b4TJQ
z1|#JuK@r7xz(!~dKTD>0Mt+D!;JMTN6-I;4jK;5w6xvDgtwE!S<|u`R)QfNzRxB~$
zd{3fG(5T{oPB-2GMWBRzFr~_oJ}@-=zM`@nmBLZk78L$5m{-13<amc}fB(Z07MU@8
zw|l!<4u0!wcy^FtGP6uSZQmm{BjliIg20cqS{NY{=qC#jFOfo@5GPsjW3K?L`EeYZ
z$;{<s`X6VVwH2)52pdQy425lfoOsl@k_P!JYxa|aldSCc8;{`tl|P=q^E&AHkpgE#
zCB~^DXE`A85_(@Md=9X%Acj)-iNR+6MQ|v#)^qNf8~nrh(cw>z<*el0e>TaeKHH%@
z0;kkvd6144S~XcSHH<^ixlmk&orE~cw|e|!%+Zs1RIY$Rvz`CUDHpVDIX&xUEWm&t
z);$I{7*rve6FQ&f2yq(^i^#n-)vjpsZ*6R#Z~XWP#|6UH(YjfQ`r`+Do`x9+ZsK(1
zHdsOZ`28OrU*8@_O&fx7#B_7dGWzQGW8i<Z`I~5S(2Sg`>B51!;xoKROS(3J16<O4
z`2BCZrynZ~<7#a{{C-^h+dFd+^mLsvtWW{3X;T;E1~2d@w(;*<`es?c9WtPV4?LRo
zzm&G>U=MfFuT`N1stzHroFF0%H_&A~_o(mh%Rnu73O0yaO8GoGFMWWnGI_5@G73GT
zh$gt<E_3^Te-r~I_z4#glfZyrswI2I(QJOj3<mvD*6cTL_MZjd+CiIiLgO|dL;js&
z2gWW8y1nhMVL{h_(=t+c(217V^-mGsieGU78PXvX;8evcNmR986eF7FjZez`w_%6l
z>gwtkz05^X_ZpEQxs3xcs9$l$!?x(|h9+XV99j~tXB+4stgp54FUinrJ;{;gyG;5S
zd;}FaIk~5%o#?WMXNWO8`9k~@vgSHU@NlNpID+BxAbHFO&Hkt_-_)1klJDFT^5Vm?
zQi9blVX>cK3g4Z`j%7l@+&^l^6=d>+SPuWvIO2~wvK(U@?D*O<txqC%goNxd*;T$G
z{r;^6X;=3l{w$6I(W-MS?9qtRK}!?3lo(-teytL2Gieo8>H_YCHDY98(DdJXg^y@?
zLN>wh?&I;imz7enGH?bK(%61d;u?=qM+Hi(wHe=rUIq33@jq~!tLO=YA?=Voi(p1#
zp#Q9Fi^O3~g-dY;Ma7;{Ziaf`kT1KQvHD91B!2o7OSxac5Z{Pxf~Ewbkts$<K=s$y
z0Q!9hH%dJDLfBc()Xtw7*K74jIZxIffqXjDU!Uokt|NmbftmK<cu$wiMfcum7mE-o
z&Jq>3ek_531@)5=lu_^xF%f@<ON4z2!VHzwS5$vlXxJXBUE7G9e@K$P-1xW2{zro(
zkYGodo=qe3767{fiqDP;pjoy0IFKxFqD`M)duF{FZv1adx?3cfW5#l46s?JGriC}R
zktE6?-RFey6A)m5R$RZnbiG?5VG`yh9DG8&H(-Gg$b}2@pCAB5jJvh7(+0CM6Aj6q
zkk<cFlYBHahDL@VNvfbgvo(BtgcFF6Dw!kbZzt>50&s%xmD+%>5~!KDZSaG6bSDv$
z912?6UE-qi@71?pYWc*wpz*q=;95wGb}F&591OakfBL+{jm2+R+BK%$&$_rK5>O|Y
z$Qu+-p|}4H>SoA6y5uIHNb^5DYC#-WO?!2wdH2?JK<|-~uOkbgEN3WpkB7rHaL50x
z`%+Q=WLe0e2EuL%_!)>W{+~Vhh`qs%8w-zHu06gAGxeYrszM~<fS0;FdG@cp{_|GR
z`hnYj`Xt;8xHT3gb>TDW6v-{F_C&irZH8)3yldlsx!wQ&5`$Z8Dbndr_SG?(q>gtI
z$=q+Mz<QKmkobS}yw)`1AKD@ddX1v<i2X@Q?E6=GTlfrz$&Y`YId&&Mzm?~R64AHK
z`43cg>$H!+?WW5#ER2hSLu2=0@A|B(wy5{CpJ$&6^M+#(jp1HhU%r&?lLaYpR{fMD
zpd~4~P+vQ)=Cedy`)&VTF{q~d`m|TFkCWjxF7A@LqIj`V(0HVCSiWhr>q(d{HzRF)
zjyCLiOU<qRRX5>!o@reJ$_ggpe&Qhv&!IgSx2pRx**VLvBko#U-up&d&`<*Tg$>{u
zXp3;N{hOA9A@u@kZ<`Mt3~tH{vlM;GoR*GF6rmxXjra@z#vJ)h45>Y0%(cGgTtmhG
zL-?qVo>FN%=02-k<I9kf>&wP)ZZL4Gl7vrZiXvh8rf?3;l#Y$m<UI<CD>Y4oSgNm9
zUoGCYL0HRQATvhkV>HAb|83X87F2&kxKzs-dwZH<%PV*6md#fn*kAUVP^D__xytq<
zx))?;18|kNXpZ17MfaBPFM7>wdKf-17dzl%uwXiS_~LRG%qDSrb*1HIU25H0J0N(b
zWR}=T-w!>0*COKY{7NXWEYrbWd9!*Zeo7m+VmDXkK8@YiddlroWIE?}FeQk{7r8tC
zgPdXg>T!-4Z%V1%wd+`R6M<uTwY&V%jsSC#KYSM*L$QAi-4y1_ci9mA^pubOh@0Py
zOR43%33v~K&*JxP4p!Gw=vJ|O{A<%;U$8~iWPeCm2ZqR@&+GS8{_x?{CS&O)Gda?W
z>CRu~aOcTZ86Zt6j!o=ElK5h;w6TZ8E29hUc27(w8N^hpt49m!M!0alFNs+83H=l9
z7-1pPIC0B&w+5#NHa=o=&~0*ozm*&~@y2DKh5Lv5WPel82`|4^!Imkp=RJc(_c|%4
z7<je_O!M;V3IC?*zc$A20_le2B|0vS-<11nmH%5TNbkQw_^%MwjKco~P4J_SO^c7{
UHm7GLfq#m})Q;YjJ>mcV0BBC8K>z>%

diff --git a/docs/source/_static/img/dynamo/torchinductor_backend.png b/docs/source/_static/img/dynamo/torchinductor_backend.png
deleted file mode 100644
index 84e37aa7c4b63e7120ff77efa0b5a996e0d87ed7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 122529
zcmeFZbySpJ_cksF5(79x4Bah5D<L5bBPk$_bhoq&ARR+@2nZseAkqvqbV(^lrywDn
zlEUxi`8?0_{XVgN>;3Ef>s{+!i<x`YiGB99_u2cLeG#RhrbviSg@5PH9YVO0oaUW7
zSW0*9U=!fp1Ku!q7gGR#Fx)j2W$u&@)BL(~hv5!fPFmaN#dZ$%cb0C-a?rpsU9ea?
z9x+XoG_i--T^O;rh$C+Myp@|Vl#oj4YuobfE*>#U`@Avc+|hB%pT(@Kk9EH?e`N-2
zy^&zK@zB+k7}q?RV7ItBTZw-{MFs(5{FkSMKZu_t;k(cAxaWWQ6&w`#8jE-ymx%%M
zKc8B5hM>aV^{?+lzQ+B}M~UH^#D8nWq~V82tYjYu!u@Zp{x;m*{~QZ=^Z$nXFU#`(
z7W2Ph?EeHR{whnI_n$m`a((tL(>$8<0+qg*_3tpQQIo=iZxK8X!PQP`3nPDFE41LV
zQsU&d+fXdJ9LfA(TJ6zOEIFxvc|J-3ZO9a)oxnjtU_IZ})S+161rd!S&q-q&9et}*
zAtU(n=zoL0)E-HK)aa!}30>st^tO+s<@V*R*B3#H!=mdn)zjA5QEC4^v&h#Gw}uI2
z`dW;oW8&}WzEe9%biDgBSzV|z;{M*)!bih@N6S|#3_O-k2V>Jr8v2;B5k!x4jXlSu
zE{P<LHL6D>FSYAh>@#q`=d=y*wmLea&#kN5FhLpJ<R;CUA^~SVaE<^klHE(^H`si(
zjI_Qw=oN0-sWUt_0sKmk2WFoC@63kns)0d|F;$y;_vsw(5mKcDB*kqgV>-T|bsywk
z%e2Ym$Dv15Oq(OQ2d6CJ=hOY47t6gR$5UeX)%{E>Tb|)=(4z^x&KNq0S4({ozo$B5
zDEOH=Gh4j-?%o#%b1$&vc#Li=mQ@eiuf1^}Uf7B>Ffzo<3%K}@8F2B75Xtf7aDdmP
zykU(>5Kxk5PQU-@yp7Nz!u{y#XhgWisyN3>pqD-nF*#d&FH(vZT(J^-5X;1qf!1o_
zd%0S7@Z{z^KGUggn(BeE*R<6;!cYBd*=2Lc4?=)^GFU7Lq#Z@ca&IrN<`_QBy0o@X
zioUcwq0X%iD-`(sp5MAs)UppOxYvA?m1&*TKPa*i^X&;1DH)`MDyVw+@qlCPxDMga
zMembqvk4>jp>m;<Irov+E{Msr-(Jg*Sp@CG`0roKOkY7*xqiE%f~Rxt!VgbI0<Wt@
zcTp>^^3UxpqOdC3vMu67v+Up4<oXVV+?TXvd*j+c{JH2O0KjN)f|O=x)-WW9j-g`i
z<v^y`F3RHVdqbC&TG2npV}5T}&$G@hwmxT5T4yjG!AVpfKKt3xHcWd1ec@wP{Z$6!
zE1x!)x+w**O>CpTIa751!dd+A`^!GYhzwvE>$|g%c>&8&>{MlPbHQgmZ5TLm(s@?U
z7jHffm)#Fi=56_2*0kG5NPB(qInc?u_0R7MDGhaI%zRbafLX6ur=gN(@8keOc{>2C
zqlfPu4_}Bac@S&5(dg1E;`D&|PhYqaLVYX_Ro3!qCtu9Mss{xO&T6-Mv|C<a&bgm1
z__=ht00O;}F5FOP*eY)-5W#b9f3TIL%)`GDC*~}i_1+;`N}X6_<88=b5Dwkc*WY<J
zSL<twH`m_kzZGMhW7~d2(9^}>ML4Y5mwuX<!S<-bjg(RbCQ}ot&W@Dw#JgH1Z{<^$
zIQcGhcRI>iv2pRsdR`!1CJ%E;%JT|=_Ia~OWA8^}DuSx|;~(iD`^0Alz4SvY$tnWi
zKo-do!-6$MdVkBcEC-&W-KIUE8t*99G`{+IFGJSlf(;$wpchOP!;iNz0LyC5??-8i
z11f3x9-B6MF)X$x=ghJaBN%tS;}Yo5L*-DGwzb`j@3X#g7_<7;Fe7j&TyR`l>%@HW
zyESA31nis_<`m<25L>p_myxvE2a(Kie1JQ$F)4U;G$cF(XV0<RZCI!4T#l3RI1|7Z
z3q(<>h#z!WX-`&rww;uIMhlFoi8+n@&i_cbe$84PW|$%FZsOgz6z2^XsE|<_Tr7BN
zP;g}K*V@J_rmsg>b9?->AG5Lh0qan&J|0!RkK6vI(Jp!2ny}kP^Uk(7sb?8(iaOx&
z&Q-XcF=v96DEkZ60Q-nt<^n%z+C?@$B~Xi?I;hV3Q)Y&9_7C^;b-XE%Qdm<QBo2K8
zQ&{v_iS7HtF-(y(VKChCBb<6rQ<*!}%hu%DS$#R+=5jH+6#$n1^oX-(G>jicGgoze
zv6YSFMWfyg9a`zV#dwZOG5ZYMsMm{UYAjV@zC7dDe9)6fF>r-#wwtQ^BC3o~i7Pfk
zX@BPhC(zps6B4g}-<Z+wxiLQUq2B1!YRl<@pq84bl`;6?35hbX5#=8Ga<^!4A>h)f
zQ(R3R0PfK(aC3bP%lm!?&AVUE170p1Uj7+u<T6!Z_G~TaL1DZjh~PODM2esq`*rKp
zDsQ_Z>7U3!V-F@GM&jw!ab1(x(YsGBHr_ppmY8Mj?+u*-i9hk=y4-6G4Dp-zRKGy=
zKL0sqMBB7Yo>LuQz(rrcsy@ZuRgTp|LTfRC!?q~PdYFFn@5>1K0KUb*n-=e91~qCq
zlaGVg^AG}AeoN@Eua5^{)dbpK{4!!x>kP>;Wt6(umPFFjS~MaBW%B%20rZs(84Z$`
zr0wkjIQs_boT?KNxH((p4V>0)g*ncI8<8QU9srxfhsck9cSxeR%26F}ycc{4+eNBt
zGTv+nFZftENe=2~9MAGw#PBP~reFWAZ_PZJF#NpONVy#^Mrm<4#N(qWttWncwvt^H
zHMMT!J@09q`dsG&q2l9Be3s}Mliht6z%bTo#?%}>){L>8>FF>QG=;T#kJjRAT`AJ}
zc3u5sf5IPfk5FK{EpWdbCw$%-)^_&0<>vbG+szHI2Qg3bF3MzK|7JqM%;QAYGB#Qt
z1YkzG7G|#oh#h~Goh5#o6}JC1UsjnG6N_9R>3Pfk&b(;|Ve`=TarL;WjwDg@DcEZ3
zh0vU<tot{4@<(P2D?^sasyg1{o2|GWZ(d_1#W9ZrNDpplq~Q^Z0;Z|pjB#@@*-DVy
z3#`2ee%j>IhW!o@A+J^HHOmq{izhN>w}^wa?dPA<#XP$T-(jx>UmJJ882#b{0hdW2
z?YgCjyNgsJCVyWH^Zw~ZzTZ9g%LCl|E?#nG8^9T1j)N{6t;zsIq++=PlQ><2qC&6U
zf<0Bf(Rr^`d`=;T!tD2q!{?0@ub5L{pLRf{5AbwFR4po7O`7QVBb31l{@C>ejaVy|
z!%<r)#*vjjLD~k;%NHNmEv>r5GEEO20D(XbD-fzzHwzla&Q}cyFV5}r+u1PU=8cp!
z<xFbN=P<sGYM`d2lXhxYP1{nr`@oe9bCk~aXX0IM&Z;Svik$_&z4`CdT~H$*BgjY0
zd1qx3*M<u3^UXD}MQF**XU))H{$83fzbLiFsjqPWgj@P0jsQ=7WRUvPDmWF#nKUxx
zZ1za<3)NkYy;?-?XJE5f+Y7w$qkMj1Oxw)9b7w(sOGWbh=lhf740&p&nrC(=-DGBz
zH3K|NDKA{6$;#0en`^)#oakHv0g0U3^Gyu|1{|{J#`=uk8p&P9hmx7aK8P85wtQFu
zWFj7MfEI@<hY{}+GTkTet#dpl`Zh!Hh(MIkfjq=vR0H5Ys6Qr1C8gT#(JGpXsf_K&
z*tc{;4a-g+YgKe_Kh1ghh4ZeB;5XBH9v3s4R9c;SQ{$@TXs)$S<?QjNnY{zY^>6Z7
z61;5rFPawr@VtDtEq8z&-F>qDT}z$Y<NR~$_2Vj!W?-ip<gTAH2e$AynUfSxP4@1h
zs+qYDuZ)614-Cx0$euQdcT9DFc;__z2dyHrd#=LI6meg)n0H6`O<KfDtnEDMur$U;
z2Fl%OBA^uLGV(tee>V7b(j-u#ilZ}@2xNb)aB+mkmeHmn@+0)wo460fxs5JvEdV+b
zv$G9xotq{M98R2C$N3>8On_OF(WDl>et}-A+Mfm7%Amji%mQkHIC$H@rZ3hfM!z=$
zq1eA4cqWUDR|5Y;%&<?LjNi1B^=J`sRBPZl@@5Ytq;DzkMHI`&lRUbm%1^)N^}(**
z%YIhsk4y@o%a3c+gdcJit?kq=F2vA?R-GU%OE4y>1{-#Qn~=OsYMP!{kFAR{99eQ|
z!0@eQ9Yd#aQiRaXo5ri%Mx9iri4ieUhv$CYZ+t-q&U&P?LHZVJ&F++fWm&K0J359q
z$WI}a>+jQFd>n9sN(7>PnBna{P9ij?$7AZZK~26(rc+fv+gdP)$999B|1Ko!IS>{-
zRs?pYbv+xjrVfah2}OcOXV&FejB6ZYOn~|P&}_{pc<IA2Jw~3C%pFZJ2dSHr#!5xh
zhf0lVWk^uykv@k2yb{UFrPeC2+VUfsar>h3vUrp#sBqpB(|Q-M#ZSfzE7P#YdSyLf
z-MBh3`l?NCWNl&hOB1U=NV{C&1(QN7TJJfxA@#tEu?>0*0y~So`B<CW99wjvr(4v(
zK;HI?F1jdXu~B6~B9sF}+Z*s_wz%LYfO;M^ctPq}QqEajZW{d&8|Y5r@EFrS1{F+x
z2i@d^7H}ipxgJ?cXtlt`DZ?+<tibE@Y}b9l7q<CfRpz5^fW>nY0~rNVo7bB-vBAP*
zHL9{K0%9z-&R7L`ECg0YXk+3CzrGcFDNmTGt20fW3;Mh$fn#HKRnfGo9!=EEo?yzj
zl%+fwBtPxnbrPE)gdY9yeGbi#<xB3e{i^CRvzVcSVc%l9=cY&VwFqN*pC-G*C-JK<
zk0<7*_dI^H=)QSp(q%r-c%TK1@ERu}RytG20p_^V=-9hubrkEUgsMUV`-W2`Lup89
z1+Vpxn%E90!SU=t&;+*p+k3!ntL>4%Z_`^g&vUk)G+M9bOiV_b`$ZT;zwoOlyF5Q9
zFYwwoc~*5*Y*gkhyj00Rx(RXdtN5D?%tdtljeFMIH(a8=j!(COfETSIXuV2gPiJR(
z#rB%V63F^=ACs-G<wwv(&k61%318z)@LcZHn>;8@)iLPbkEbBF+;byuN0FLwo0DTU
zh07Iw2W`IA#)ORu1N(mX3gEMw1}9`ckSmFr0uJ6;PldN}qafU`iXNXvHmy;d7szs4
zpLhsXO?25;ZwJ>3Ldo1mB)lyvQQtL{Q~jg9j4Oh-O#W=@67HBU1anbMxZir{o!i5d
zKQSQVc5>2Zpd=X%2j8TT<sj1*PXYeZ^FuVS|5-<K7WH|mn_}*OdcMT@iU|n*v^R<%
z`i;kuvDyUEH*t4$X>)mlJrW26Iem@+z_uG@#Ht<B0IV2OdmA7$?xZl|2v-7l>_G|{
zp)xa;a_CKs4>4LQuJFbx==(rem_Yd}OTqS?I}qx6mNIthH*P%}t&s|Ro5y}xIe|U1
zJXizUa%DIR_sglt`NzTUFY_z-A;hbAZ(0w(aGlBu4*_}E^vhSyA_q=VSke{7ra{=6
zi{$g6ip{J>s^`2Tf*AE~I6oUQ?TRuB-c~iwA|t8vCgy<*h;Y8?Hg^O({QMBl)(n#d
zH4x}Nc>G-jciOlbw2MjYj55<Th-u&gZC6>OB{+roP{MiSjeoh4^Wa-x9xuOB?(({-
z$`m2zbT$gn8LJYgYz30kp=5PQyBEHzWSyu)A##Tzv(9k&1JTD4AJkWtl{0%;*h(F!
z{KapR1|U7yZPCtc9G#ijUCaYAF!z4ebU7aBkta6@4^gkRK1^vL-0LvSp_h)3AS`dT
zT}n;X-pQEDh8g?POo#FYHr;xCZFA{}OKeOA(0#W2xd@ey5k#~zDGi*H*l$977k|h;
z85UWQyP0{8Fc+CLH}~%(6t3B-B0RZ(DIMOs_ftZxD2&>r+3l@!Px!jc(4)!0x%g-A
z6y)3Ob~JT9{-{ydA6%h&Eyjdl-Om$h$b`(**~JJ2_lz0upUucFX~M=PfK;z$jp%A&
z^T;0nvtp*%whRo*n<%M4T9inxhAl0|*vDqdF2Y?q{anMmfNuf!p<RLT+EWRfiwko4
z9zj{_+y`U}ewH=M^FzTjrPyotVHf?u5y*N6CxuJdAltEvo@!43?F#0Y?zsZ%kmE2L
z%5-{7UBS-<IadyzgFu+LuEN7xF$)}X4AJ>*CvIK-bk{FSHguWsf43<AcK$Zm%64tP
z@W+Jb&fjBtSDj$`>;TBv6@$KI5bVdt-=oKoB;MAk1m%)O#&<(NV9Ati3i`mSgD_Q>
z6_3usa>VgBMI218_f}n|?5W-OGEYnC>w;xR>&Cze88O>H2u~<Y+`u5lfgqjE9I`l<
z>N;u<!MR_P2^U7jG>5%XN*^GCYv$*?Q;g?<ZTxi7O<RFKdo^QM{D>y2j_XTxP--Wz
zyHL%qqqp(3l?WKRk_*DRB}Ool*{@SYIcsL+O!nQ?4KpfzECdFzMs*KyS<WJvEg3f0
ztoZx0o=2UMCx7Gm+$;bQrV%X~KQ3(cGlLed7A}mbKar?<ZL7$*c2AmrK3Jp8ypSYp
zeO5K|X={Vx(Oh=(fZkld$+&JJ!|T2zmge(|p7UnVPRFpLn7juTlM$9F-2;>~<2GYg
zqCTsGG>05RRoFY&`b(lpX+5d%r}`tpi{{W0uOPf~o*kY@v2suF-lFG(eld5tHILkD
zzj8=AkhqPk0mm&vo}X=;j5szV3m%T3KQqi!JnInb6XzZg%&0X+uPE)sWXI5!O?%G)
z-i^B9-t^AXr&e#iv+cUWGQPkNg09qJM}vYP{?g5>GhLT`TXnLF_eGXo8>z0nLXcG7
zGdSsH8l)Bx)4l6i0c6&9Z-b2%Mg%52iC&1_-z$i3(JG(HVCuXPX~OP1;V)*nji?-R
zoa`->uw(kwbPr0Xg4lsa`O}a{YhHAR;qmjxBQ)mbZQ{(@M|Sz`qR-!Y+hZnrH1R>q
z38su0p9ZFNV5Q%GxRRrN#+NjNS;>Ty>e_A2F~h>RyA@xC?e01nkn7rkr>`Exc8L2R
zV@4;xE!wY-=V}K)2~uSi@<wlifa+l%AKWG}gY_;B3_>Gg9{hQ)5;xosfAVEmh{!7I
z+60Vfo^(jwhj=#FkHO*NbEvE(?tTNh5RawHt6e7S`hAq0n~5`U1jexi*W_HDjFZ1j
z-hX`hUC~K|oQbl!v&Zk9P6y%<Im_UW8nMR<8u%hlZcN%hX13G8!%t!<r#xyjqoaa8
zyZ<m(KAKvwq|ZNL^^Jz<zB87bW${VLyEuDWucl?Q3S291VO4}XC6j^0kEH=2Gx`Dd
z__OZQTBRQ?ZEA-uftRyIW3zt{?)NR$7I!=R+Q=Rut+$sbyD!3H85aEgc4~Dyt%*7w
z?&;;3zF{vS9eFt6gNuK)=c9(nYu*6n<>7JEImddmY;j%|3RZNzzxoG~KJPXA=~rf-
z0FPcj!-Wl=*Os*aU~P#}KGFwkRM~FLBTjB%-kdrkxJbRu!yWgqDFH#Y1KJMroj&Gx
z6^DS%y)I#y<NO@EFi}Xb!M9+rV%s>ZgP$q?a+g6Ue<Zlr*3TmsIMB=kqV(tmN;Itm
z)UTOBe~BOCEnm9@{~*h|)Zmg1nR)JQ6tf%(`^NMtbPS(fHO1@FVo!|y#rJ*b-nk^&
zi!zWHTmANC?K@LXX##x7Z7p>PPfO{Q>N@E==F4Ot_?dh4KHNZ#BpsuAP23zxVr>pZ
zB@@)|sVr=tQEUe=Em(4@Y-WD1NEkE(zeL&0G|Xn#s=^Q%F)8{R^hN8wMtKyxW%^Y`
z!j^~fY_D~qc}pDAE-FZ6S=^B73_;vtE=HgrL1jj<dpg*9a__O{kFmD~UlPWdVf@_1
zzgYNyn<6+O?yk*kJr!w^zn)mBShgwwe$ZTvYcx<Gq+Jq`W1**2q2|UdZyLg|(7!y3
zp`aoDt1$WTbWC4M57`p3Xr1U0jYAiZCO*CxuF)K=BB<1}ahtVgyu;IB*9_74eYE2z
zl!S?OWZd;M=E#-AjPa`Y$)68j{TA85Oj8ilBXwyx+2gu(La#R*p*YYsc$&DKbRvXW
zkAb;}B~_M4l*PbR>umTEbF2Epqkfj;htFkIKzU#!61T&7q9Yu4&%lS_@yq+?YdnZO
zUmGj`h}p??%4IQXa+rT*xYb?c>F2G--R|4?cj0Ld^)J7;HEY42=h#TDPK@|voErD#
z5V>KNZs~xIWOs*ZV!?~`(v&R*?>`^OGf)HGVOqrVs^n|Ac@x#k%@4&2NRj$bEaJG8
zyMw_Ux@SuKa98hr|6bz$oTvx%_L3SZ&4;9NcO~t85p^pUkaWHMs+B>a&&IAJu9;_p
z()yN+dZ=Kvr1yj*9%U+{D*FC3dZv|kRAyQ{o|J7D@gF?pQeldkPxL0(epPyi(Ywq>
z`h+D`Hc|{d?OUZpX8hB_soRnlS(`Ac6AJPh&8q*L!QNC*qR!B-#Vghj>8x)-l^`=!
z$1&i}9e2_#(k|~d{#@6oBzis1LjQ~b?5?6I&0tkLt_y}w9}F;s9uQQc<lY?YD4G9S
zZp{2?m2m0<^_Mi`8RF*N^Bj${sq8uJ<FUeLX%?##Kh|d}svSBNI~78zd;jPa=1bAZ
zae-13Anxk1{0I-JCF=09wYXCqfKJ`<LHkR$$Y!&ldPlwLbM&%0gy2%g2Nrj`hks&H
zcYSH@w!seL<1_DyEO}6dKf0X7@?O~50^i|znMZYs&q~iaIweJB*5<tLB=S$HVn**K
zZaX`Vqh^D1@SHb_Bpl{9cF~{VzGqn(9=Y{W5V{)c`LizSl%&9Z!VSJfQXUk^EqU-I
z-B_SJ-X@rW`#AoT?8BIwAz_P6m7~G=+{gyk5&rE(!0+3v;{`g*p|=IY6J6F3K|hI;
z;(yhtk2UYsOM7?xi0OBOf@xKsC>l;pujnOeOmC-xnW@C0|GGlTG$L$e%Bt-+Rla$?
zs1VI;jdJFMBgsvS%g<JJA78%wrgh($1@WD;hPpI20jXlk9)m5##i{wDH}rAJbkG@P
zms6y2+R_UcF(gz*431@CH3-JVwd}LZ4GR(*X5<R3+FdO>Aw3{b`<RVK$TqG&nED~2
z-$zHwB$w6OY^WSIB1@NumHv>eIltp){AJesM8az3)~<rFJa!Tg89XK?t-3IF|5PYx
z+8F24rN`G`1L+9<wmz}aXm#e*zd~#6od{j-b&t5r^SfX1F77OOd^7d}qA=xY6GY3q
zyu8JvU1Q#pb3$1ee`F=_!y8Krt5l85D`g(rP@fV77HMD~!(V&|+lLytw)e>ddj}+u
zu{ia7><rL}3Vcl+Y>1N7i9-Eu2vfy`(SQ}_Pch1=EywJ8k770UtjpSi6TH#JQ+=i@
z%-R7**^6>a78BkqCl!@uhFsSABH`tGJm?iaVNR_S!*pH|?v)N<dQfl~>w?MwUNxK=
zTIk)~($DQHU{Hy%axc&dW?ZTPJMWVud$7MUA6pB`_Cj|DED&fp*HNSWQH*4qs4vUq
z8LKVICSX3Lhp#hYky?E-q35h=DZdpkf+L!u6tc(4o=z*tmAG@KSK$Vs3DlownvhZQ
z<}hQn$N?F-yyAc`d~ik^i;#_WJ1t2wnU*%MV?-`*o)9_>NAGeRNuE;AbOvKj60)%&
z<T9sNGt-ZrneqqE(5bUQC~Zq!G~0S3@ECWwwsXG<58BEm{)l4yaW5O>=FUFw8>@rx
zrTruXC&429&doH7E@>W~5Xr8-btdaLX-fQqWC1S_LTonjMNHmEYy|DbR9AYP;IrP3
zEFV~QW-NrZKrFumMY*ZKLR)`wPEathaQ&_5_<`le-Tv)UVOVOLOD`<U`wBvht)Zl_
z3gdk45t+<0O7T5(N5)c*^<|Gc%}BljH!1Jj&Z|RwmSe)}LY6@idJfAtHA66g=_U`_
zC)cO9A=z@0_}()YAo_G_>nzN3l(Q2X_zf>l_`;G_N}@OMk6M8FnS^ZSJ#jg^dViHE
zh&KvnZt*b2Z^2K*LwqqL0k3t$>O<L9<H-_Fe~Udi#rpJnqFTaJ)vKFi{5_-cC)1Y6
zk7#;M36V<6@6Mo>de{tadrrl4qV&wB!uBZJqx_vqf4iZ+9-@1q`2*PZX$kxd4gJyn
zi&_hyu8y}18fIoGtZZZxZ}g-sxyv1N6De#CYUjFz(`0asox^hr>lk$ouj`bPriL?^
z6i~>WGqDYWml`ffQp?Ke*}_}tBdWc%25{m6NF!>}5ya7ji>5DE>Y(ZRQ_gZAUsz$`
zUz=pcJK!C4$~*Gj!40_)rqqG>ZgbM>D_(kV&y?f0lI9>+_FZb?X0c^o=cL*sf4YhO
z-}j^@j;bUBAFE->wcXx2$nO!D5FHm59R`{62Pd>gFpLzifPxF}yg;Zx6e=Wf8)Eho
z%}%4_gK|i6yIGR4^L$yDAV@FCx6VJ9JJpbehbk;aK`ht%6@sx_8hsB1ph{I|>6e<G
z7n%<m|5O;~CLu!IWcT$Es9jb&H*T3UnF^b#?($s2seBvEFLDDs+X+cf)NFTc0cWX%
zxI>|d=bzG*3t6`PZAt#jL+tINowH~3SNd%>c(_|6VkIG$SqJ;Ilj}0QEW@$c306#B
zYJG+)9~5xQj}|bcTy(Xuv@=-7c84x3d^^QP7q#RY%Z0*2Nxb=Ewm_JcMlz?A7C6ee
z&*W&!RHjtB4B`*gTKb#rfQai;R@3uNb8WU=S6o5ClO*cmKm5tpTNGvv3h}){iVlee
zEmFaa3RwfxI+3`3a2-QjIAQ>?`>t+0jCaQlaw>Zf_qtv17usRq6?(eRwY54@5Q}t;
znvi^QhqEig2HzWeDVFzeji<O76KP!UAT5(8Gl6Z%Xz^|H-dfMg_@ysy2k+bB0#};8
zU{ZS%z{Gvf5Z_9gRf+hsU)6fS5b62(42GI<azEA@X~^;>38vGta2Mf`8W(g=6V8@~
zNi(rY&!@tl80g;(&JRA}vPhOnU1kA6S-bHT$0daxm|)Ik$EvBC0osUZm3wZDP|an}
zwUMiZEqdzOrb;FK#{Ra)q(KBk6|1hW7O1Y(sqyQ07~9OYg3VCY=u=sJ3e$z$`U>jK
z!ddc!2c(X1^AEg`vNGzLhE)L-{%aH4DUkY(I<)I0QcA{T&gWh}|I0rg#tOPGz0xyS
z1l<m-4BLcfhQx#Y2toCzuX!wEwmlKuiH1|y7VAm#>|Q$jpa=-t0L|XOy(J!oqXk|o
z+V~Tq;5UYPwhU4NAFK*GTI&}B#0=W1d{mXAl>H-VQLU%~**;86FW8l=^fcIL`E8D~
zpMP+eA3^ew2Gzzx65FfMX*d$OBLltRE|nU6I+4sG=mTYHCV|kb<oL7o&aJ38&oA@5
zJj*Ggm&_mWZe`@CtT@n5noc%{QCB=Y*rBU-$V~Yb^%kj@6b5xIZ_}D^Pt?AT#J<=W
zSWD@~{}dzC$V2`;3R`Ug+gp=5DHyg!#*5fxi!9zQu^9g;x#in6toCFPVPYhmY=b<<
zzL9bBmMMe0j1*1LK}@QGQm8&A`;nhU!C32(WVbuAdj0P?rSn|;aR2;tz`k3CBE2NR
zDw9_kY>pbq5{~yHTqZ!;p5|C0y%|eQg~Bt)!_ZoKLX1LPo`N{5m&6yp&w4s)DeUDL
z%mlfPUzm1+^Ly97tU2gL!f9wG>+PoT1|IA3IX|*iPFt7l1&;SAn1c!nyn|d$Ih`NS
zlX~gvO<DS%r;iV5Fg!8&gDA}8sMI@1O#t$K1}`s!7{&L_u8{dm4L&(9YkkM}WO-5@
zg_A@S>!o)w7}WRGy<c_~<aGO$h#&Sbhr>e+GtSNi5B&L=r_QUnm2E$}6lXb9%#zz2
zDp~aG!JA%QEkwa()4Z72_!}ye)8@(924FGz8r4WDgSHE3jO2|QTjDm14V%}acfi>_
zZBQz|pB976?HBxkcm!;X6Sm<Wl<qH0<L`NKHCG)&24Yo-8XGm9-D_iMb<fKbhh;o0
zCiDBz(R6Q8(y#c@3oNa8m-Zna#uU~agO}wlilffd!Bpj8IXsqhAiUkk0=?qe>N99z
z<e+m|DU%cK$q><6X6=hzLiBSl29@D%GMsNSR#Q*Q!n~d=*38bwm<nR;O4d{Dy5;ht
zYhg6=-6Bv+$AChzJO&MqGErXT*LsswL*x<6*VYlR%jKnE_#fqghh3*rIf@LUmzb4n
zc$(YU@AzLhVs&9A@umv(g0n5^rmdA}gqFkh7dkP6&m4TU5-*M5N5>;hZ(Pb_`jC97
z&-svBKXbi9EPD6M7Cj#)M$gZ~xG!Q={h%jJz)=D7?NLE9Gond-`z@W%QqVUZVX?IG
zK#$_a_K27zi*lD*KYwj!Bf+H67EPSB5U+Ogpl(ae$6_#%iXx>4`fPmqcYj-14e*GY
zhitWO<I|O^e=u$xVeP%1PuK5Vw+<PPaEi-GA!Fv&Fr2t{H_GBt&uui2n5e(Z^RACJ
zMwnylZ=XYddbwJ7Wf>{cU9?#yMOl_4U`KUpX~GTL7{eK>rdX9t7s_I$MF$`BEJr7%
zh0nA&8@NuF?fewMBFJbGTh4!*={1j@ckueUqh0UiDq&K@5=^)JDor~Hh#J;CnY*14
zJB0^rkMhTlQE*4xxg7)Su;9q`23`>%mV~9<rJmw`&H9nUxKoMl*sxUjH-yI3^CQBO
zLz`&>;W&$vdKHKBT>2{-W^QXmmpqMnMA+YIGM9-uo91{Ve>_AlB6OS(hHkS;59ZXn
zyp)OE{7>(?C09JCR?c<N>08A@D$32GP&y%3^Y!17Rw?`Z2ry%s8d8PU#Os^KrYYR_
z$n#u(fcW0L&p{vbBt@g-(d!{79eue5*yL~86+=)B@w;MMF(6<6o(QDt_;HQAwYq6J
zOKKk2V_Lx*tavQA92td0R*SClQfHAX4%AO+Rc>U~-Z?L3BEYedoJ2C2m~e5)?XK!l
zYx070l9JeE<WpyQ>S#`4`jy%KQ~g2k{d@eZJgC&Ak3XP`I2jgT6hhiPd6$1U|FiS)
zmD*f7+TqMx&6TDpSfBjTt!0~or+)6G>a^zeAiRtG$8-fKabLDv@W3tK5$>Fq!4mFu
z>Z-_mlxjRW_wg`BIf=8{FrJspYo-<vM4%seZX}VkY5pyoZ_V8auX{3b3&efp58}XL
zUf@P&aM~RR=wUHtgqi1c?U<Q$`IA=_DqGBK?xD9IyXD}r?At54EgBoWBb(%Iij@pv
zQ5K5Qc;avO1oCt`s<)+s2Qdl0L8bt<X1DkrH6)%4W7>(m53@#43+c<Tdsn|BgK%<I
zJl^S5P1#Gn6hDWlSI~7)eeIRt=HOf;pWBxyfAsr|1navRHGQ;nk`?2y9}c-p9<s#g
zr02wckJ~R-^C#WVtGE&WH@-@?3roXeuBSEC1?O1AYjCuliJj9iUasTu0qk<ET5t8I
z8}HL5^rhJ8SA4Frulws>pzpr3B`fl8O@MN%4@-R)+K$kLU1Sxts(RML-kl%JLquAA
z@sA{j|0oq_5*<Q}ZE`%D0Y3%9?Ndznq|wimP=PwZj;-0Mxm!{xxXfd|larZ{Ds4nh
zB8OG=)j4<M=VZ>6MtU|jr{&?cyR6yO86OFGh`bO<QcZ^<_1Jf3?lr^?DATBET6OdS
zdJ|_?DfX+1{n+{I3n&Wr6<n;^R};c6L62Ta#<httqnv(%FekAdROYTt5eC%frxhu9
zV4^f`OFm_$-$_wL!S3l}(4HFkc9?NX5GH^Kdh{XPk8TUH_y_rQ(^@K${a2*5m;K?x
z8`lebVQ-w&>naBKkmuyFy$L-J2KFL<Gz-gQYT*vXHpx)7Ywi3-LB+0gCYFcT6HrZC
z`ctkk>fl)KOaIndri7Zy^EHW7%N=-NC4Re1JA3#UbPi!?)IMdkWo@YJ7m(D$)?bz4
z#yF6EnH#m`9@5kkWF9rUYM{)@y_ReqX7+8pN8kW2*4faO8Fb1rX#(TR*qX{_65<kO
zBRS$Y&6&YF6<wSQZDozJb5>z8iaeX>0j{hVBqM@LB6b<e0)0oCbe;_Mf~65B-3Rb0
zwBR94Zw3*aILhwb1L)r2_xBd^QdoyT@nBM({_IFwGlif<6Cd8XRljobYfrhH-mAPE
zV}XPUpA~F%ag1yJA5k}zv;7@Z?ZI&Hjb2yp!M&<NTZ<|cg$RvJP4cJE#4h0v%PT*T
z+o~p!?1@&oY|fx$rnvj>|EQ8EO}TY^VNF@i*fK3gPCFYRq}QjW*#$9KImsN1A`FX%
z(rDQyJ9^R|_2g)WE?2p7d$Ui~y;=&eaC<pzl`h|O?$#*EHn5#rb)-ITXU8>r{ndvD
zJVWNhsu0bKZ9yY7nm*n~zMoPwDl{j!e96Q`k9EgnX*xj}YYhb}H-Nx>0+;Z6{Y}Z`
zL=bi4My}GIIem3X>u(F(?CR$|IXlE@zF{i|`$VLyQ@cBOouY8;6q6`cm7AUhmPEi^
z`?@=9tjLg`cb#pM3`xxIk05iZ_}BUcBMg(aM&e6}c*N!(#7OKt5gt~Uqm$$8VRdrL
zDgJ||!1t~82`wjoX){2r+;$qfmgP19GO3FfFDG`soIOXlb6?~^1=%^z8hG7xq29`q
zFJTT>e2ZsnR@}-YRP%S<R8s6Ter1hi9Km;vL%;ntnB$|{xOgA^Zg6({w3;`L><?>p
z9B(f5k(5yRDCV!3v}DG5gD#Cxruu4BTBp%KZDHl-5r4bh*9HD{wp-W34f;l9I}S@|
z*==TBlyy0_H%y*_tA1(X(WH@kEq1#4KFI2cz4Be{54hiN3t8|N?&Kd=+O_i|y~6Gb
z)9-7T1-;&i;7#%yeFT9=4g_`VaiOsXin5)l&O3VAYf(bUsx#|?a2uSwcxpE$8EGY1
z-_0>BVmWrKSdBU=Hg}lgoUx5CE+vi9Ivn0)J=ibIhE-z`G4v<$aybnZWGY)UfR;*x
zH}T@Wgzb*a6-O^1t@757yw`H&q`nv;khp#w+u3gg+dHC}7aHVjyQm1B#a>S6f(uZ4
z5IO%a_i5i6_HnqrV}?f6Se~~cH(Chf!S>xbYQ7HPY;{nn$IwO<6k<cQ=q;><<ek+>
zx-WW4kHj>2JXpwoG7q9r=aZkwT%Xg!bt4y4F@+e;8G8x5*e^RbC!TVHAa->kJAE&+
zP8ZZ3=WZR9gBrvtT`3;fYZoNEDV9LrNphokEq-zqC%<ncy;9_=XxZ~BVo(PjMq#Rl
zI;)B@i^61Hi0vv_GF(ez*$OG#syjMgYh?89La92&Y&UL|x3ev4{&cP>Y#6B6@K}V4
zKPfDccWnEFh2*-f)EcP{F{EyHX`r9xs|;~vQgfqk40GF$g8p_B+pdt6rCOP~xDO)s
zyBq3`kb&q}!z%oji=A?M{3tNK@|4R<Z7k%gT3Y>RYP)<L%O~fs(24eF;lY_JSQ*MK
zj4?wkzl>iebj>=MK8~_ul<u@SSZ$2EfZa6<%TS}qDD04s_mkNTc7svnop7Ag4OCG}
zXSl=|etC7@nwE=}9n+HyxOfhJx2H8VJ3YukVlk&2Sq=U|5%P;Y`AcCx2|pfRx1wbw
zIZ^(@GORKdBq;gYS0-XQ8pSs;KI_7qQyfJnD#6=HEIM6mb;pbB34Dnblbz}M?djKv
zq@lQ+`oV%dPL{+<P*9m53fjbmcH2KMWa4O;z|*5`a901mXK>GQ{@i6<BD3B(6bB|+
zb^JM!lOHMPMK+(~VR(hFkHfFuNfp%%Id#`ulJu(@^r<fSO@@h0?D0MBeNq};1g&w`
z!!o)<l+5|A%}gi<oQ?hqPzEz|j5wc(ZBwLoQohgj)7jJ_czN3H<I%!>^osuk>vo?2
zn;le$$1v9q>KyrG`*<4r8G*Zx7t#c%u;I{q*{9k)G!-^2wzlVW(9-l}DY2MnS*2n&
zkCfzIA|9w%$JiNx8RejpSG==Zxo`Pw6$(9$p|&74?Ao8JK9gHg&?9_*XPt6ga@p`H
zUR1Qa1+#`tQ?AP<k^lM`p|BRT!fqsTq;c3Yr)`Ezt@-W}WtOfb<>J%>X2}WF2&x+i
zWf<vVa7s5nw4^+n$6`nf=}md62=lp|8T1Q&J6<WZJ1`?_%?*3F(qV*As+%`OOs{2G
z9pjfwA<}$#{(|XY*qLDcvR{y)cUE75RoS)$g?H%7rZaTE-iiQDl`(~AZ|#@kD0dnh
z<|?d82)WI@W$Kvy9zO!e0=q}gPL<a42=tDA*GU<|mRg2-ar9_d?DMaO7oANd+2+0z
z=nWCb2F~s6yhyZ_HCg|t#H3}S`_wb@4(cqhV5X%eWRHqfwPIOlsd{)<!$_}!*_nl#
zLUw2ULPf=~dMSZR$_NK3>J@7KjF~0Dv@UQ&;#7|0k49_Kl@#9cTH{Gua+E@!u!NmJ
z%ER#Og~J0c?#GDsRm3hGItHQ=SsC*ytb>O|457kTi7+D$YIumG>B5GJJ7;}qo|eWP
z4M(k4n=S{KD{}Ej)>p!4?1WG6@pEq{k-iHxc0mlUFa+X8;|kd!iORbv+_@3unIh~~
z-7n{Bmw&s_ime1Ufv7iyor$~*>=xrVo#b9~jA{0>Cs=ycA2x_oe6<*?`29eWf(xeX
zSc<)1rq?(n)&bM0@UeaF!!fFG`3A-mMO%Z~I04U2kJlncVk>O_=%dAap>LZ*Aek0!
zp<ZDEY-VA<c!;WhzcU5`{QE67OKm&>X`};^Te#3p*>HlWTnKs+PEDsT+SLoeCHuj~
zBZ<^DX>?iiL`++pg&#Vrf~sikg$KbC+n2!>{yN#aO)2MhmZc0BY!+)XsSnUHxZcPe
z`=6jYo2BuW)OD!u=Cnv`9j>4$|H=15gAL|LCZW&@TrUfO_M}ClN}d64;Ss@Djo2k<
zx<)w5T8blSjV8Ejia?6xX%>k2cW7CmVFP-Y3<I(cL>QXwV(%P-Lhs*|!Mss|hewG?
z7cbu&x={iJPB=;|nI<p6^qi7cyM<Br$vwjcCH$o~Wpr5JogJY_TKzoHhefBbI9W0O
z@6^3h`|l}!p6U%Q1zvg~Mh9d0SFC$G*1#rX><O+CUN16D*Esi!Uz9n`EziC`u<AR&
z&VN^470k5uOFhwo-(h&`qWe!sYRpJ9X0=t%n|WC|J`vQ~nVVQc9t7;9_4A>YrwP6O
z9YF}exxghf#!JF$kV+1F_og|RI@xqiJ_Cz+_glT*8&l%G;1M1^Igu(5i3F58--qm=
zvSx_r6~0XuW_=+x+n`^oX@JY#J(4FOmki1W*jH9EponJ9*ao__vF1y}bP6I(){dXF
z>Nr2f0u1LHQ-9T3cd?7BuhFC9V|Cm~ep;l|GPkFPL^&72vLbI*(h5Dc^vf6H*0Z<F
z!#0XqSNH68>%Fr5ou%~dLP}CkpH5?Q5zQ$tF9(dAOEUFXlj+a<$msp4l<pAc++!o!
zQ%4Uut1!-Nx>FxuJFJR@dcx_RMWHnG82DO4#EuA?P;O==)B_bve>xvzI2sK;nY!w?
zPmOGPpVPqZOn;=kxlBz(91c#YRPq9q^*L(EX;f`Rce4slD3=V)F*hqmL)WNr`O8yd
z3{5n_2L+Q8zCcM*|7{u8KGDHWSK*`Nco2P=-Vd9~?t+LY!wEbYL~Nl{L?m<bpzHy`
zKACmIV7=0zm0|n$<S;R0Sd9$5{zLvCwHej!?bQ3a9G6d<x8gIF?}eUxIPQ}B(TH+g
zs*|DmQnTDh*d<MS-d4?54Z8jWOPY_ohEKi?+$IhyIE+9{WGYRqhawyr>ax{%KaI?v
z<={O}gNfzWxv5uD_e`y2vIv~7iSB38uS5QzP8*7oU?QUeq*Kn4$H{Q=34=NMPo%lJ
zyYMoqIk?@%0)ql!uo{OU!H!wtj)5ve##tywD5c3&a={;_1V%>+?=EhW$4eUXy9Mcu
z6&d>tA6RSeBblLvjOp0sWem+z)71u*;v8Qdc+eFs;fn7rb4jAdjh~wk@br{!r}T=`
z<68UXUZUbrufL5XTw(;#>WeqNe|}IxJ0g7gnK$*(5su3B*mBjBzlR{WvaMH@nKRpH
z;xgHr2kE=Hl4E0--|ac2lkaPx$~ZS;ZUcLJ7@!vYZLN;}TZ|qfe6GT=@)L&42tm+u
z-`+xOduwTRsHm6&_=rKy#SrHG$;VF2V$ho-9=t*=xW<CG9L20Vl3;%8^#YRT&9wLC
z>lNvMahW?<?-07H1!HLVj5rb0wOsg#=9=^2@cq1{%(Xr7qcLf{B|MHC1Hz#IoU8LY
zJ4qn1v64%_4nHEd>FA5dm|y;CnQzN1OV7ZjvOOAFCDAq6>7A0p-gO76SwkirhJBBc
zeXawCBu`lqD!E0%reNuATF;TvG4_ch|K2GCGkLn(m||MYCd18wQkDUw18T~_<vi%R
zSFLG8g}}rs8gmOh2Fkx#qI;JHCS+$bS(ccBsWl7$@Ld?;%dCyN_I@W#PZbY?V+FKF
z^1~+VyqlO-A~0Rn>|7QsGAm4ZoSV{7o~As5p{W$2&QZOIml6(GSA}K5U2U!_I<Bg0
zF_s!%lF$2{xK8gdQKgx=nN(Slnz?lpF6L-x(oax0|7frX-r<U55x=qH2XDLk;<()>
zz8mU|Ap=9cx~lhG&};sAvRXt0r4g5nRNST^E)7QEZ+&hJuz&0-$A-l&w_PKX-Jl~r
zec!4=skGR*)&0pRx$tY&$#;3p>(S6WZ04qA{m@^{lX4WSxFY3LM#6B|kIObc7&?Bm
ze6^>m&*58wP9RXw!%yi`w{4p4tdd$&*VTH2UeAH<{aj%Sets9UU&b0I-E)~5D9-CR
zXiAp&e#xu<4<$ZPp0*;5OH^ftOq?&p5lPi_VZ-P*g{~GDk7%<pE*&*;$}!q(9`axi
z1kS+&Q;&e6f`U=qtr~1HJ$Juh3M*hKz{uudK1RusOi2eVO77oOme<{Z)1P={7=KF@
zbWIFPOr2zFyUZ^#5#H`l3H{NcDs?HM0N3<GR3amPEw5zOnn-fdUmBHtt<d{z<u&+<
z5Qr<8G-7}h#C&ch07~+RO$t%+tQ?OjdK^n^ZTsbZJ&SU@4%LRC2Q-Io)eeVV81rK#
z$J-kf*9;ogo$})Gk%GAX$3^`aroesuP#=#<(ln!KW%9Vn!=1*1J(Z_?@8UxuF0hE%
zN}RK&{<N1R)3T@_An)h4>Fu0Mi^&rsjUVBQx#(4)f>cD~a*n@RVp?zeoDYVr2$>2E
zVupl;w;a*l4D&os3o|pwn*)xjnxqAeCKA_uy12YZs*4_3T{Yx2+=xVU-=#rHHn}Pw
z+HzvoNgcRnjk_nw*(M{YRQQnF<p++YVc|>NM~SR-7&}XpY}X)UoF1w9vz^)VRi6u)
zMMM~_CR>SQ>&4j_z+vJg=H_!+7j-9Z>T+QGYQ+_16Gj1*ya$uii{w*?oNInJt=e4+
zF-ykw$_-v63c^~+ITBy@+SEfFO^??v(tg5cLEub%i@nQH^;(*R>tfK-s@+qc<|RF2
z#lihk(OqNcrY||CxbWOqrBYr>CZ_{6ke@O?*pFLkkba{)i;GuFc|b9#C`Syv)=T9J
z_hQSO%E2^ai*xT`q6o@Q#kS$15l!b;Y<s`A`=>`Z#+PDGL-Xj+=n!#qpD_$Oa6v>G
zA=p573ZHh5XM#;l<*OpJ&g0fS_4_wp21|AMuw(ld{c)2r!m4X_6wUj+VommBWX?sr
z7ZFk&(CZlI)piK8D;w)+BUj0D$L&!2GpK)CpWP@6_!@7N?!`yv*Xoj*U3gyy8c)oQ
z@L0BetmbJQTClIFk~C|3Y$}wyG0E~nG9JM)^cgCfa_Y9anC8j3-j83KGQQ^Y;Bpqa
z)q9lG`$$!|4zt~f_787p;I@naouzIY6U>fSMLfn!{{|%vR?K|JzH?F8G{~P}YoQwc
z9$W0A9<p%$?$gAnT(N{xa%YXo0aYmI9gN#dMgMDssdK$Z(t8Q{ct&?ZCf_vU&4S6u
zk<!tlp@Q!(j*rUMvfIUKB0UvL9X$d74oVAZVc&Vx*K&-f|4XzgtuTkv#>cMn4*VHm
zZz;>wrjUwl<XB28`)W@6X%b<)K&Uv{#Idn0jgYh~1|A1CBu$^RA7{a2N=OInrn$@B
zRlKP%W3~#}7AK4$H*iMr{f@Ccxj!bJb%j1Ru_o`=4h)q5*Y|x*4k@ciD&dD+etnuY
z2m-Oz?;}L?P4e4)l^UkxdHpmz;42reYgvB_VLUE6P5LTa8{LGPQS0m35$qR{Y};!;
z0x-F{a27Cjz_m$w0+J}uxlytKZHR!DK;2AQfOuXv@B5Nf=_djoWb!^9BXolzW;EX1
zSu`->abxU`hVWB^<^aNnM6^We>Gb+#43fJ10j@pPk^IA7zr&xo$voBq*i$CB0LL>u
zQTxfO;MeL5W*G6QYGn9g%rK4QD(#<k>>YQ-2K3E~etxJ=waAQwsHDsfecVy29sL|2
zN<K+Eh>Q|@ro#vYyQygZ3)Pk(>XrdZ&-7Iq6aBEYi@KkhuPR5?I_T*HLGUOmHa<};
z#*v0*&~m&=0{VeoZ%uOgrt^J_Idmjks-mSCyFJk~s%xGx!PCGxtodZi=YO%^gRWVC
zocKxa^Olbs7%;)PGCr(BM^>;?Q{z*HN6=zR`i?(rq|{8qkHN(zj_F*gO&1?tm8!n>
zNWijXCDK%?=+um&fCI&+adMF5Rc*!%v)cZTo<KnL&xZ%N^=M?5dmDnazu0Xu$%FNo
zc}#vBgzk~sV@KmaR->k+m!!Gn{E5{O|EL8hm3;YUOmglWcoTOc`iXd@W?!^y8ZA(B
z^S?;_wy6MebnrG2kUr_x-{YMd`?6io(`BgTXi@ub`TrdKpC7Dl!+N1-<LsE6Sdqf!
z#b2~8p3MvV^XI>R`05X&dyP--*-H=M4gwYQw@hBjjgwJjUfTCVT9E%W{$F$1paZ0v
zxA@F8nbHREu-<*I_hJOE0wi1$Ev8L?c=@l~XOG(|z}@&D0}Vk;fJ9j{4e(8=U$p?d
z&Me>_yV9EFyYaus2#is3d)GTo392n*!vg4*k|Bn%#%x)(Oh-TwbS6NDdCNxnUG+a2
zL2nyT-LEaWhlReraJXevKEkqpKTB?csMHVqbK)Q+WlZ{S!USzXfE8-te}=S|G6g8k
z`~U;YM^+sueZAwp6ofPJ_H?u0-v!c+2L#%TjWW=_i^<yxRHa*`=~KOG+^L(*0xG)4
z|NoniU@|MlJp>rs^cy>}_a!W$#vb3Yj$h3G$HeS!C-&(;!@*u*Da)apzwqpq<dy|{
zY9B5$4`gir8y4W8^4qek=U0uWf1m8FGNi}=^Zd}kQ9}6dn*SSOjc=)ArDvXQ|JvtY
zLj*mzB_4-8)1dwr)BFb`zVRvG-*peRmj7$d&l!N8@!!1_`Zq}Y_0{7R%;n3I-M|U=
ze|G*qEc<_x^>1G{Zdr`R-Jg-je+AoLJ!=O6JwJ#Ep7~!E`9Bl<`urB|(I(_TEc`F4
z__wbmm_X0ZDX?p$|I4K0-j?3-M9Ertc>c%M|EFgSY@p|leE9Yu|5XjTw@mm40merE
zwP*a>WXz28`@R1%ssEb_ev6F>Wc-Sq(?G?)jh53cKRo>sV13hG`R*^ZFDxbv+vPhv
ze*QlqJibjQdNsCW{#pl5_5}I9*UyqN5*0T#*j(~V&2NUt3|J~W+-p3X{wW#&$Z!7t
z&<;0oq&#T$CG)l6XIv_7CXN#iLs-;JJ$H$c3MX0p1CoEJ=`0Xfq79wD>n!LF1|I!*
z>!YMDQasLa`X{v-f1eXTm&b&sD#$wh<+(OQFyz3KhwrF$=wWOApbL$`cme5i^R%gx
z({}^$12et)n97;nnVZCa4HL9^>ofoGYl8lQr2)&5LGb>K7-w)7$VvE~;uGFBe}pk3
z+rxV~G}+?yI^r*IXk4W{44%ZB(D6Lh`THC$!T<yYP{L1S|Mr4`SYlb;r`Y`U_h(ld
z$Dh*J>dpf83j53#@9GsOiTpaIE!c|s3&>KUfQ%`f&uaf}BN?x`!)btPbElR@u3%fp
zoX*^PU1vjZ?d6LoYTj>!$7b4Lf5+cfF93WqaeCx`tq)K*Q``j>CW;JA&`~TmSf}^+
zD&0e-gphu*qLc0v_A{EzNO6K~yixHx)!(!d^y*gby#w97Te;u9#M0~1oS|gzg37Q2
zYwtpB9YRo_qoaw@dO`;+GxqOTb8DVztpDBmq_%)b*9>bs`fJjpEMJ}Z<z$rFgTC2?
z;Z9*j)82hQ?!!T>j!zh&_K#(r1DgC)C?fgG@}r4uiy4^s3pYwOn$^G+^2E6zI`Skl
zjyB_*_GDvG=q;*$NXZ4XJN}~dA?NnR*AOgo>B8-{3}V?=`_>xCIJjr#`6)pW>%pX5
z#L8dZet7&>_nS%pBxn_l{h@%8f&bcJtIO~_iqVMz_I&y|rZ|_Ce&X3<ru@Rv&eFd-
zlw&@?3rL9j*f-u<vPZI16ISBXj9j}`lW1e3S26C{a$3rd0NG1`zS67aDLeW1O)?05
z>+yKQpXJ;tA!r!PF>BRRN7VjE3gDP8-4JugiX^f|?Xvyd4*oy({yZAW_k94zGmQ+Q
zvF|k)OSZC$Fm|#e!q_6A>?uo@Vr<#7WlbSuU$aCcA&QbL5h|r@iAsx#@Ab^o=l%Zt
z&gZ}1U*B`S=kz*f#%rGYx$kS=?(6DDZZE2xP_{vc1j=KYnfHs&drQRjD;Ha~q$=AA
zkcdk4JNKS0pJiNlv>tDs7M+_+c~couW9`r5yJ3YqYKF;h6A}pIn(-DGK7rEN*(tWu
zB;=t^69qkAh2ggyhrHdJkh5gy78r@A60xc6Ms5#o#%tsB9TTPd*0%c2XHP7N2Rtm)
zw9Z;6rSM#tIPw<0pbcA~h;j6z2If?p+44*M{7(W@H|8-<Z;G3+4`d52r%5qZNlB6=
z;{Ot^q$f<V&*PiZ__d^D^LY-~8=%uLJtza-36X>`OQw2!vuNCMmVex;A%y}jA&H6`
zdR7<V?Pc;L8N1W0$D77mQj{HhQHS5^P}9c=vF)>yKT2k)0)!C!sN1E)koSJ%$0i8W
zm*}Nd_Is6L<dHg5104I2GZ#-R%k)Yv4oO>8QrNj0o?2KXV`s03sqSGWZKjghkLx^z
zm-E8zQ*&hV1-!UF2zQ;Ig(<%Dl$Cv-ed^oUE5VJ6I@L!Os(aFAdCxbIv87!A)e_7z
zOn<4+ripccrKNyoy=*{L`!(DNl865~ey%h=-DQ*H*zio3XT$WsW^K7ktdP!3*yfee
zVDJMF0{&m*?3lRjQqxbTTO(_2u-IzC+zK|uAd=rdJK)taGoHvTUy(jE+=N2gFdZqA
zX_iPO7TFq6puvGy;L*p-c;J}s#sybbWL@F~pvytc&xM;;io!nK+wS}cck7417&=*c
z?c9+@;E!zNrseqC;wKz}2hY9Jmks@~7L%T3A7FhRnqjb2)+L-iN&V%0o{#QR>ZkNi
zG3+wXRU|bB)WQ&oUSha+yy?=SWe+p0Q1ut9zm~<{JC6R;2txZ|8?6E^u>$)-*ECq`
z)}ir<^wGoXQ9*Fei^mQZ%4g$Na&-`{dwUA!c{WV{8Kd10s)L&;IncnOnyL#iKW9QL
zq#VNj_>bytsY3bI`vH*~a&Y%L*eM;#7c9@uU$q*y5v5A$NfUcD>7Iv>_h=pn-ry_u
zf)7A^f*)1g!{%ne`V+392{OHdn@_S(sn>pIy^n`Jl}pAs_Ri)~C4GKn`jUYoSaH%w
zBa(UQwCz8i!rm|5x}os>RoJ~da2a%H;n_`d9j#ZfPR5_Vz4Xb0-XQlr(A>Qmm>HFJ
zecvU2vi-7|25;`<we&~&pk!fK{R`cMhQ+*htON_jP6vI+e-DjIL>VTpsBaN%>?_K)
zX_2-a#+o~8M2XIRjiH|hMBdjw+?#Nn0i6m_r_S5&5C3=_=~~~*E?(U!c0A+VC6;@Y
zN_|;Jl(y`1ZdV*kqZk?gecoXqRCU|%J9~XB%^R?kld4o|;2yrHnr3@EIxQge<Go<@
z-)ma3y{zVP{=3ELS0_y}fkLF1cL%8<U3K98*VEe{KgUWz<Bbb1mp-}nR%3*2^LVuG
zWK=51vmb7vaFDwZ2(2U<2YWD|77mk&tk1-^4Tr)FxA(6j_b3SJCY~v8M4z-jd!PEI
zUsbw^a(Rw(ngP-eh>!YakG#keC7Gv(zi4}7!~{~J!w*CnhwhbDLwX67<Yrz$`YsFs
zgdO@_ggceKZy_xpT=Tq|7r6M@pF$H64pqV0&dmyF3UO-Jl8B|jJsNqFbNAvTbsaex
z#8OR1K6_QuQ6xScVz%zcH9^EB=`JvguFa&ey-&x+AatckYq^8;tdNFoDZv?+{J%|}
zRy>s?Z|TGbU6QJTe!RIn1Ra{VU7C(DktEQ`%1fe+&=%>!9v$L>_BBRS53{I(kM)H&
z(L><;Fx-#xrR4^LiKX(whx|aK9RM?JlJo8TmAGcOEPH5IRzZ1b?jz6J&vad(8eFn+
z(`?Bgc$+`Uk%^<7yNx5m)a((qjF}<;6G1pRqe~tjN5CB*6?x0)RO?KC`?L07PkFa)
zC?kDk)3}W|v(k&1-d+tI^pvq9+%lEpu=h*-E;D@(>DbYp=V%ERVh1(iE;I13(_V%S
zA5IfG7`w@2wao=#(&7j8?K?le-`k!E@Ya#S6&904C0P&|fc2nkU^@Z&xr~}*8I@{T
zrDN9$tpSXeYQ6yuT$^7w(QDa^nMmk*`0gxA&!LroGOgKy{QN|Xh^wqMYK0Nen5X?n
z-x2Ao8*<^!qm$<zOAZ0yPxzg=N}bkPZ{T~Rzep!l$6(67;1z|IrZj+-RLUw$07*8?
z3z<B|%qwuGcX9<9Di~j`YMyd`kMwqeKLtrXJd$6tiv*XypC8`-{lz$G(V3mc?m;M*
z3)1~Tw9GeW^`M_z@(>9ZN6)+aGpmFL*7J$#ce_t+9k`TYS|MV6uamF7sD}O<rGGxP
zXgYALj9Y~i`R$?ti5c^{>AQGkLj$_ty8QbXt%1FoSOYeu1z{?Oc!w#i-MkG0eigjX
zE6WvJMYq;z5DS8VxiXKbIqV!2pc+3$?8@QK`i&qlPInI8*!n7Re#Ah8`idSg_SJzL
z@sFQL{0V7th4?LdO@k~N(=5dQAgKv}(z~+D+#V=(L4s7&t)Js9XB~VOWKo`AvQp6-
zQ<QRpk_8jVn;CP*->YL|{5^C;_(Xi}-TEd*Kck)=mGrB{k|(N!j$PdQ&~jCyKxNA9
zh&876wV%n+Tr0s%QE*cYM@)>0Vx>O!ag52T9RYnoFp%1DAEQwL*g`zZ0^3P^pD3w#
zPVsnB9tX3&zH!H^OTYXRjc%ctM|P-xdvR*^(wir;&iyHlH@3IF-^<&WaCr4g1=a1a
z^_fE!y3+V1TB{sjjW=eY=kE-xt1_56Bv1iH0|C8+1OPAg*{FicdM1@nODkxA8h{Jo
z6IP2C>SIq{{s~<~WXXC89DSs>7>5gYj&C2=kS4vPb06F@X8#mEel%>cBujuEH5IXJ
z&elMEe+g7coXm}`)h|!g71jSN9vuRZ?2~prrhXsIa(g!D)-F_1vg{M3+2Go?r3z6|
z_gnEtaX<g{<}%BxgSI2t5P*;(et_F|?)v*yGQ<8XJGgHB{P6v{Uco#(D%jsJ%IH_u
zU^(qQN2ulVtdSC;C~Oe`+}8m@1rKn|M?ox&#o*fs2cdMI8`J~Pf5+1`*RAxM2K}g!
zQ3k{YMiQa>T#570&#hmdkPZmWCWq!TdOB13va9xiX!e+Js=XF4<>m^vOnJ_H0R7~_
zc<5b9@)gp>`ye|WVxP}J6AD95nuImg@6)n@glEM^n4C0GOdoUL_Nz)?pGb-w$e|dy
z9vAk_=(Ptby;hjr3DI!Z&q!wN3rII^mz>}~8xuSxB))F_{)*zkfWXEYd?A`&dmKN*
z%V$qU(Iyhl*y1tY&<DEEMg>*tR^StRI73n7o=~zN0<P*ma$p;$McM;pNMD{=hu&N2
zJ87H>!wq+ouHXCf(PwBb;lOnd4&M6c-KuVLu`@DJ%yvxubEi0_bOveJ3-i3&%5$KJ
zm7&Xd<c7IsvVEIm3jU5~lycbUF})n*26GAVZ74JYIw3AxrBFA!;Ji+Yzp0xf^U}NB
zYjFs!`X4BIUCW>9JmWthpU5zus<&7A-mjTYxjjrWW5e$+`IZl$xK6arwhBFQeS7|K
z`P?rhLkIsvvKLPHj+jP!L+q{Lc-?5tZhcRkIh*vgZJIrM?FMeAtC8M8_ZxXAMLa%-
zxeI*`cJa?6t?UphU<j`4pZxI>+Ff~F`ka0v+X32$rDh-Wd{I{#j6HxfN75<U;mu=B
zH`fao_NlY%pBs6zY?Zlub`Z3f4<dB?(_F5oemf}f`=jRg4K^z~X%GLnhxWd`Y_8DI
z&s(|gLpLd57~bcG^o5JO`V<P@*h-wXCZAqnztgeiu-a&8uU^yfZK|8gCKF7vNPp@Y
zLvMcSkI;<B#~Zr&EL^{Da|W(hDkMm?oq1>0!})L?`u`-_T>#-6^g-Swc)tsdSq#95
zR_JpVE>B@tiW9kh5EPwfMosqVsIYph66oXX`7DRov$e_4pKqtdN!7-sipt#fPn|Dq
zRUd{SsvS&6pG`gwX1kQqY^V5uQ{~Qo)cAWC=a%9{*L|+8Byv-@!}Jhn5iv!)dkRFI
zT4;atq^2Cyw5{wkTx4OcD<Zszo#8xdq4E2|LgD4X{J>c^FgtQf?U#pJTi1=Q2QQWD
zhZCaqn-R574QZ?-KrtIsEc0-bUwX~UUK@+Ak*B4p^SDy?%w1zG8?4YgJK}=iHSA$g
ziQ9Lsm_TzOqpZO^?(v!oqTu-<d8^XpD8}xNw+)Y_5+uUrG&_ju!v`FE?KNW`%KTbs
zROH(Pj0J|Se`3+^7kluB)c!d8R5Di+i|-m(cUP`6Y2P;OvS=|^linT3^Fwv5YiG_w
zRN*3+Rn^)nxr0gG4@mqic}m|fpU%G2)T6R{c@R@!h6udP4B#d;KBB)N=x}uszLH7!
zWsO1TSafPOA-9A&VWY}&uUf@QHp^}{57Glhl|e+G{CpGg1VcUGd~%>$dDPXcaW42E
z^c2$dY6A<M+b(3q5IRg=cxKXJ%T9Ep`T|YXnn(9k?ME&GW&?**m+zIJ$%G4V3!gCb
zM3xt;dTt!Lo~`fF3<n>HgMc~n{8LXCr)u!+?}yo6qi^dd-TV`4*LnN)`T&s@8j}Ts
zO18W&77%v<8m#0^->qvV>{##viV5Jkpxb(M#HlH2Dk=M2qd<53u6i-U4ovX0B9XQH
zw*$&xT@PCynt3VTq_L)+;@B@p0tEzn+#R(6#iGE}e@edF+xf2GWa;}18?Rh~*;gkC
z?(#g`4Nxy$87?~&BX36}?4b8;$j0q`-OVBfc2?CS*bF?EI$uZ7-<9HY-iEu=8RjOF
z$N8)YCaN39wd~>~qB}dFXvvz9Vei`nmG1i@*UoD8Bt~rPutZz4_jl62N%jMiZVyqZ
zSe}!~6)rG%DDS=>-K*$r?tPLe3|fM5nSE;eh}Wf|4^`|uVy05};=bnMeVTo8h7$GK
z4+;Z*C*AJM_!*K*7<tshDL7rKX`P`dMf@gDJA0(G5L%m*UM>APf5&YcU-Q_w7W4Ar
zA>vX7B5J-uL<+BZP~ROw#)My(DB%f%dfi|znugt^sPGDOrZZL2#;ydXIX!T=IiJ0L
z?Vj?E`GK@Vw{>h={n6O-n!B-8*wAQBez2iAf6yjSSBXr2hUQTh>=jnt*3lKJMP(F^
zR1hNObv*f<9B8O^RMNN31ird>EP&bJ%_h;gD~Gsm?Z(L%E`dOtqIWM`X>}jBKAms0
zCoa!GM2+@(Cs8J#1fzSHW0Wb2LE%N0UFdpmP7U9_o{MYX?BoI^C^IwKsriPF*j;VD
zaFz>2`-fY`SwEvR3s>=5EAdg<32wuJVO4{U=@x&wej8^cEdfR8kWeYbBh+?L4Nu6H
z_7>9QX8R#g72??#EXKHui4aQKacUy`#7>*zxVXm0br)>GGhliCnK_FTeX{IzL$^RE
zG(p|HH&Te!?E?6Ao293)j%WWAU|Sye*REUV;*(f3Wa3Eov(v8jO0I0Cqb4L{(dYn%
zl3Vi^=8og=cf5LZPzg*=HtuBgOmy4q<nLu%8t_4P3hj;Y=)JGW8?d_Gx%K=&M8T9$
z^Fn{H!zoGa9hM`IK~tOdkne%cpzr$oD`LYN(K7R|pnE7Uty-~NpIEom!s%Ov$4F7X
zxU-~Cn_Ej(_u?ZsIQF-ed<2a=YktR}_0w4za|7YmEO+1Hs#kxVxNqyPNQvS7AO|=3
z!T&Tw1=+a=&)`)m2@x;pEd`wS^Uq6Z62`&TyO9l@97jWx==GfrI+yQh;gNZZV}2pT
zn0v&CN$~*#y?#T7^FZiZg@dehU%`uuS26`%a<jiXx~sW<;pDdWgu$;kmASsk!lYPz
zB5TSf=LEB2m&ilMpop^9;sye&mb^&m!qVB3V{^Nn_KvxLA8DvMp^9S-M7Qun3<+DC
z;8I?5zY~vDb-872vTSEbLLa-#DJ4-;A0^OeS}El0<vbAr)@zRX*7}v|6BP;{E~C8_
zrQ)>Hrly&f$`EpQhHwpDR*X*+!Wak8p>vo_^dIigry?mm8Y1=jFCfixv)8{ep#$_y
z>p&Y7|Bhy~(U46j3N=SHPSZ0C;`R`CMYnPEkKH<S;|!}#yn*ahf==K0?JnC%M@Gx|
z8c*9_oGbZZ*2sJzV@EM-m3>`?Z^FgI+5;H>>Gn>|kcG$4gf%_({^^TfAc)}1Tss{<
zVdEi*L<JOn=ja@+w{i;T?88aD`GysH$QO8RqT*Kn4_8$Yd`i#L0ccu!dTZ^nI^Krh
z(vf+Q*ZL+_u#xp;yx8M)e|6zXOwDdZ|9j@6VTfaP29kx^fl<KJ)tIQkDy<^g^baQv
zfZecES(`j3lwl3!EsMeRxB=TxFI`@K>MD<q;UzQ)s{+&>)FW$Cy^O4zV8AHQ%B<I;
zxtq+<EFvzo9pBvD`}W_u<`nSa<Ym&mUvMv-gI~h$Y>B}m1#PSi`)7k*5xY>Z-CTzY
zH14eZlnBDQ#fqdm@9zmB_Tybv7W-;CkK&;4-X4ior6(9(k7T;tn$@^#L8mnY0xS$g
zNk<e=#B)C*Zl7z;dsAv<FWQ(od<=2*a42g?L<}4`@(}3t>tQM@be#QZ&9F~D8Mp5-
z{^;3-(o9n=8JuI$Ck}Je8}VO{<Vnnst0cz1?sm$uG<)DgJ=b>otoq8m?X71*%^=4e
z-}C%ZYIu9L$`<@J@$-FLr+5G63glBX7H2;kD{f%P;oNb5NTA%&<qK$|Y-qYW7?OQ-
z^fh5RqM_$Nz4(Sp0sn<%g_oWzo&xncXYbVqMKKDWHxZF-{()w`&NE`!a`WuI-=DMY
zc{0aiHR=f6HoYR(aJ!t`GF<nMCM$e*&o6An`&R95cVIRM_RkpeMo=b-*tDOC6%c79
z(Xo_ODs`v$nd9!;3mdZU2~1^fzu6P^FgW4@f2-f?sgoS&bi_g(p7{>l{p3rUlk5-o
zD`;hAn#R2?`VFZMd8A?C^OHS(8t>|>Wicadpjj1ofA#VBWV_wUSGd5}(B>R>K$9A2
zm&|&g*9u#Cw%+&g9`)ZTqECJ-eZDg=+=35G{qybRQ{Lc(!V@O&f6hL5WnY=68JYJM
ze}m8f{!cE_uR~9Al2Iqc#(&IW2<hQ?7KP&QmUE74gZ7X&FAzF49qpX%mc0{v-MIH_
z9DWAFe63c55<Au|$OpINc6!Yc1h~u4R1<@3QkjC%p*IK-!w3BqhRq&6<aibFwb*&H
z-=^PDi`yaadnmL^jt>~WSVX;XK3{-33)^y$=#CYPG?H_RKHj93fMIVZ-b$(OmFgVr
zbbid=o>+SgEO=LFdwJAD_rWJ<@oAfGOGR@R_GbL?&8JpLW9eSjJ2fO+)5sBV`D<UG
z7qMRXjf>RJV!l4|Mp{A(T<W=6KjV$fihcU59H^_`JBnx-aDRp_P_zYFLbqwK-jCmp
zA5PhuLue8EiE(B!XnQkvwU*=mKH*D${*2G{6S8A?60}EyXdo1P*|@HkqzE-hx-Pr<
zvc%SF{ARSXnAYS&d~#Oom<T6M7il3E-KP1CXCgSVRe&gm@BO9UPVA2!IIR(3OD(XM
z%X?1KlXY5U+EZ=)(?OwD_x?v<hDe=haHs7x9}stXwyWlo&Cg-WiMd@dtz)hC?aFta
zq}!WVd0+@p3m(uPxJGWY<o7CzejQ!Jshs&J7L)#KmfVIc+M?(K4hD@c_BYP_Ii7VC
z|EAsXA%5}0Ck^$al1%|}1iOpF=J3B-03?gRciiF@h#VJ8>*C&vlkS@rWmZ`9r|MPq
z9<kznXn}iA>w830Vhe2ie9_h=Q1(mqgqKbl*W7k_>~y$BFEgEzD&m9qo#)$kl-LXI
zf4!vmDwL(y*;mpT8-!t3d9YhH`__$1Q@E6<M;2E0Y$drRuifjxDGuJF3V}xkZ-IKk
z0=@p@I33jDrd@g&z4*SVZ_xTN7dqO$!m9Lg7(NIPd2=_*;nF11Xp-R%PSC_=`s(fa
z{>}EIIF0O65|70A>Gv+}xnm7YnL=FOL~|$*KeSRmOZ8p5V|;b&{iSL>0fT$VRhwkj
zp&GJJBg$H1AfHT+p;qynzEMu6<re~dqqa8DX<+`oxCZS7ORh{KO)arUG=-Y+15Wv*
zx`@X^p+J+Sd?dof9Geldcf$Ux$GTf~G(C-b-cjnpdtXu@BVMuQX_oZ<Wr7{JT`@02
z0%%Lv-)FyqK*lj)cGUGv)W|ub)YC|__`#2vEG=BVXbwIz!b=y3D@nuof*!nTH4$C#
zv~;yjhRUVHLjotPqkm=onvitUh>0`Qd@ty5Y<PU`=Gzd)di?!daVeeaNpq4TITTs@
z-#PFM6VSgs&7$QAwTDQUdf__5_J9u#!HYsjgKifSdv5Lvs9J&7M$o33;WD=UzDW+x
z_KPD%F~i;RHF*~}iPqTK9)IzAo~$u3n$&2*HEZpn)=TWm=vDleI9p7^5s3_-LX{u9
zZ?+<$;}6u(3DA*TzpzW_#O9^m!UfMjJkAwtEl=ekjy<mL`cr3D2thSD^Terlu6pr1
zcb-0^{#d&6>yv9A&R;&a+>W%sEh?>b?)cPjyWaa@)X0v<Go{{v5u4%&xo$&M2lEUP
z_P=P*B@*>R;t1AYv%?M=tGhzOWQLjB=A)T2VKOb)D?N;DIUC%uY&}&N>Hvm2>)4z4
z+mEjYcs<cUud_|_5D!1J++AoS(KC$0EP_pt6L4oCaD**=06!!UeTu6cYDA=^KT1j&
zLW{t{J0N-QPF!9D|KANOI_)vXu7#bx+sknoU(P6@8>=%Hf8yIAZ^w}Umo(p(1~w-Z
zyClZBTEnkejN1p!^yd#swXS1#3LBQ*!W$Nv&>FbZmzk-mN#}o?6o`{~dEBjMl%1^a
z4}cy2alUd2B4urauKcx4JC7f>d_7Ple<RXUY`t44<7CNTq0{|K(8rI{P7_1Sb(Xx}
z%_Y7o!pYV57VhBwaaP}xw{8%btl4u4)u3JX_0(`j4)%U4hl@=)ej^4vTgf39Di>c{
zYtQ=LLCAhMxn2)?Km9}BHW7zqmXF!Z<y5ydAod&keWwS?A%-=oq$Jbk5q6&o<Ym0h
zh-+r@Yzumv9x>xL-%~XB4DP^SF(YXFDo=7kWSf%?Ys0nvslKX*)pgx;3{=MwdRz_K
z*jJS?Ve5_I@*^NGIMxTuG#5#4&)>OX$1i8tq1>nKEnl--Ql;;A8tl4Qf+}){^h0aq
zguw6w+HRSNJ;y&mWsdP+`zTQ<#l-)|+M5#`mjh;5(S`UYG#baO7x-OqqqymJ3^nn?
zT4IiK>89lI0?<#lc{Fq?nhi7QLF}o8*otdqb+Zf-YB$_)J@gZa-of>dYcN8=Y7ot0
zgiRVUk{!wr6OLNamFPADiE-i}x}~=P&rWzjD<V7;%OqIze9zx$2F`A-YuS*^d6|+Q
zP2_4N@{2M}MAwtk4$FJE<wPgsv0bkth)$;qA6XbYiBt!fMhb5!w)Eb({@MO|l>y`t
z+*z$I*FUi(8`eW&&iNZXEJvM>9L@jU=C7nCsgN?4I{HS+bRQBg?c#)bE_%ehj}Lzu
z92Muod!bg!S8vnZbZ5TC4uZ46xyiDfmjy2N#FKiP53uW}Zu*&gz&Hj)8D>9KO}M8H
zeXH|5jBeNuuo+-=-nl<N9a*(X1~na!TAL1eVM6YPe!iW@rn}=97nm(ns8}4_Ow(rW
zeSGGlZoOn?##pI*rZPEX7mQrhKEW&B1Tn2HL&CUVn=}$@)!bpotj(;b(}YeNU)|y0
z%G0iUXX<vJ)oCOVOc+D3x``>j2uzzp+aLVTcBeu2cWT85j7X-_<rdWTJU!-?L`bVl
znwGOYRoO@m7B3;e;)_wEYOt+Av?mE|#ofDiAkl<|GUkGa_Dt<b(h(46o<SO&3Jl*v
zVx*s5RGbztRY5g>QbNV$-_fnta}QK0dw~4FG@Fpd7KcvjCi;`BtBzlX<2>4v$_=gr
zB!xaICezAo&#ml~iF=}HiZq0Pe=pFU&t{qgQWt7MAeRn^K!W#MyjB}W(C&I&vmdpv
zg+1jL<x{I?J{B;N&-)ndtTQ5`X+bu5RgqNtQ-&OF36#KV!PkBJ>SsJM#x%bFjS(`T
z3@q`R&8++t=V5K_f+^(0A)%wnaN<O?=DOay6X{F^jFhza8G+ZkJm$k+Tr{uO_j^D_
z<2F=ljf9=5*c(TXq>T#r2?KXN2bkU>Lyr{|5L!v(O(mGo*;tTu{Z*v;FzrE+E*RkN
zgdFMdu>$exX;}qGbv_ucYfSo&QiK(GW@O3GbU>gvaiKy{z|<D?4Lu%Hu%o)ELuiy-
zuO%8C&iD4wkSlb|VC|{UMj#`Ggxp<XG#S7nKNC%WN}G30Y!%AnN&)SzB&~>OC(n{H
zC48GY`Kb4h@M!J&aUbo1X0#KjC!m%p2AwuF%e+J`Cd<(5O4jODj0ua^`p^CaVgKi}
zigv;NS^44f|D=%sdom?1g~g}HOMgcSp8iN5-vi$grJ42pcqq8Moj?#Z&$aWM(y`k>
zNQe-K?13@R9?FUeoKf)_W-_K0P+$(2I|##8;=IW<QDrp1D;u1rH_XtR2*GsjbMhzG
z{InqzNE5@$-DGebLlRl>wj6+~GaOVZkHarAP`i0?_>PhF)&{a&wNusC;l2NrJH4od
z!>3GN`t|WxAJzY)hcpS+p7Ey750RJ6L$L6^wR{Y|F;az~JhqE>1h*kI&hXEMANs_P
z_P!Ejeq+mDd7g~+|IV;PphJE2VZumM`E$wx0;tE5Eq0N(?(HiH(@Os7WFiVfh{)k>
z`7Ah^v@g<$JW;}RO`VdXL1d*AM-bh@tmY9@$|ykmx(C4VM>caQ4l{Z^9Mv?w#g5%U
z_I;pbPDqjjtD#QyBjjxb%m^_%agJhum}M}cj0Rk+7MBs8iX|9zH5(=Fyd=IAySfK?
zcEJIlw;AX;Cd*VNM0JBLZm_^3NWAPh83Zy^wz&IFQDMHRzg!<*upl8z97j5_KVpdE
zhI)o!I#Nnrl2M^;Va54I<{w~t-poetyZAMgoGgWo<e3}C$@d1NQ8g9<P9dJE0&hB;
zB1_(rw_*Uh%s`usA&XuPZ<suOT4HH54RR{eNblG8KR?{qx9|=6ak?T6^ZR(u4=NX$
zI_MM}z_hN)*DGm$$e3+85JRLC-U!1f{LEpcuY6S09v|EJtNkvk{dm3eJfs_RwnOKK
zwf4wE5cp`xuYx|HVL9fMjo4KH4>(UE2$AsyZrC71R;9mrDYkv~KBNQtu7Toq{XP?T
z)bnw&%IX;t$llkqw7lY8gX}>LlF=GeUaoQ_QL`_!td^_W3*(kkrS^nNK^+1PWsfBJ
z3}5?O0|uaj9-BeP&+(3Lw%&JoSD>3H!>Sy=FchA)PI}aKEG1Wc-P-5d!<1^sG7sTq
zd4NFZ;)H4R1|C)nPHqn-7urC2yXY5!Tp!y0_+p>S%^rLq$b1ka$t2bBb;QUk`%F>X
zAj}1;ajeccGq17$FOr4Kq>taoadTrAWc0<`koRB4DsJj@aavzlkb=WwNt;kdYWAw$
z>Zwk_C)%hgeO9~LPnSC-`FiqCRX2^xfTQYNy2+%6Odz=v=3I1j8)u7qv%+!W!u>(D
zUn@VwN4NwWnd*9(wyFl@+f>QacLMM_>T!3>mZ^-7>V|&N*TklF2AkD2xWvnSXVh5|
zn&{UYf^iZ<Ffk%;gOx8UeMgO9z}ibgsXINpc@!j0ODfx;x74Y=hn!3h<lTT1(PdGH
z_XgrKeArpF3Mk~Po%{m@7H1fta`z1rNgD+>eXmXeLl1%Vm3LvfC_&1-^m?%SgRn``
z3D+DK?dBnLvg$WtBfOr`E?Vh8tgx7XX%xY1nrSJ0{v=a{vL3?N!AKlomPfXU2+@2I
zDr(OmTBFN)=-J@q>i!a>qGOHuesm1==Dw{p8d(_nl3;V07JB24?B1sq`nsaBsv-W)
zYTL5%G@X5o1LnoLhuvc)+ND1-kCw>%R}2S8d!nz05GoY{8v~zxyFK&T2##Yg)#MPT
zkMm2f7J27mh|TzN6lIa8`x>Sr$=G@|jdh)jJq|%6n<=+Vdvq>FbT!tS4xwKxDH}1&
z>4E5)_;IPl3R*K}dXR}c2SbXMFD!Stzl#^IIf#@67*2?u`wX*1m~(y#qOhXhAl)PT
z+*r|`^@B5naOCJEZ>MmDtkfqQpGp7l8)nzY@_5#1Xub4}L#iD@8r0qW0x`wCe5oGB
zfqgnwNJTQq8xdfVpWA9cvRGkSRifoIGx7FZ@kdU4g%G}U*zdRdI=u3reX$@%G9>aF
zZ+bz~^G`QNyKhUDrz{14NKBl*gcJiMLy)t#^UpWiFGyq>qNUS`Q1fx80-5W8)MiYb
zWh}oG_O0>8e1#s(L>8g{kBo4UkJiOc3>#2fThNJ+T;R&sd8RT+s+SXoV+!oMr)C*<
z@@*qI;7i}zUj=DM0F%6jfK*}w(Jpwg<UR0TEc;iYH}+SNeI}2c4@3-&*z{I(1v6mQ
zOPDkmt~Nq6SJjgCsw$s5*6_An0vejTLN-334`=x81~AssgHi!Okf+|QnHi5MW=c^?
z)78I6HbnkoPMdn5yqhC9W6YGszc=IF_d|4lg}M$7r5Eu4%3DrfT2FAuzI8pg;!P_x
zI}!KjdN&FgF(Sn_2SSFAmR5&uG7yrAXJmn2XpTNqrhfAwhR6FLQWHZqqQRwAkL-^S
z>H~1Q0hMQpVn50~H!?5gV)%f}BtpVBib(j$$sSO9c@nD123gIGm<}f^v)g*s$Go}`
z^qvbDPSrAPVln&*Ju>Z{LDzZjp+g5^`H=94SPcVe9Zd`O`EHs%XERHeD&Nm+DDlSr
zwhx4ukwG@GRXS?F*B<mxt9*rv1bbyKtT{Gz(}`!l`u$!}l2#aGMd2iz)s~xeDzYi&
zX~i5gg_j57Yu7lgFWz$^2$5nCH^~k-v-bV+-eA%Ana^<l#HAwAV=q)&#z?PSg<Axp
z<bdFhMWkZNss6_!WI7WtT20wKnCD`W5uf90Rn9>bC%D1wvq7%O{S1OFEh`(buoiew
zbW>A`T9amaQmwC1jS!KP8(W95L9<AR!L_QsFaFZ6iF;cv?_mak-Go!>m1tTN^)|R@
zu1GRmX9HSGixN2mx}myn?_g1+?Ve0NH2x3Nm<<^#&Ny#<xv_m=`6fea11{6FE<^cU
ziq;?%4R1>20b(i)JnR1b^_KdO|6ZDVzdt`d7Lvf~REy{U<6BNiD8u$?P$aJ9C7JHZ
z>?fVUkv2XIHE3ZELI@>$(P<511cE@2>grCgfIv2-A$f*BNS?t1#cbfN`g<1#i*^DG
z59y#nJE5d_n?vXx4jz24yM&4dO5G+wWZm@Z_THVIawd{z7$Pd1cRE|CsgQ{#@{nqG
z-~5OKsaxsZs6`HI{L!Iv0(P7pc{i12$?@kf)P_B0q~E#(B}~W&wbD0%b4fUX$^TJM
z4_<*VeIa_*tIV;y5e7W5d*_*~FrYej%y>U$OQ=xnGpj~!IFJGW4I?IPNNq@g2&R9g
zR3xTcGU+bVISK@Fw!CI+SSU(R_|4@@+O5EEm{i2jGZZ=69pGpdarFeuAu+b-gs6mZ
zQFzEO()mC34Jm~ghUK&mRzV&)^)ch3??viLRwDKSJAZ5#lHz_R=FHI}I9Jz`Dev~D
zbO<Rlc+Qw0VFR{6q$9z`>UY=tWx0m|@U#ba#B(S9h$bVOlST<Pt;J*2ui6c9tmhwj
zUF}I-O-`>N7yMfw)}xqUVF2&|Twl<J>kFTzdLbA6LG($qJ#@I0UKK|?+DWieb|=)x
z&`%P}WSjB)ge4@IewJ2(|BTQ0XE0@WswDJpw)YDpXiuJf0-Ce*xfN7Hiejn{oF4>{
z=iLl-kfrCb)>z+$444F`G?xDX5|q4kM@XvmN2WDjzX$^|JGR4`PFhEQk0(b|0hH0G
zY_hCdV9fGLK>18xgMvb9#xZGCZJeYzgMF&4@g9b<Tym`}9}->Mnc`0i_?X4Le(C1?
zdGLwCw#A}4s<NhyEMs|;1Ky?T=_V`}e}+JGHjGM}b*%P<LIAz;Fk4CI%1M}aaWh+h
z#ps}hmVayWU6}M?9QHJL$dv9>6cxJ2G<U90>vAr#+M*0G3cC|;82lC~U3-+lBBu78
zt1qhFmtNYmF1hfB(i>$cjpNbb<$8D+8PoNpO=`Ye>!Iakg8tX$UgJ2yF|GqmWVfb5
z6Rdm-UCob(m4+sz9BLk{6_-9-qS&3)oK+<cO3|XmgXz};4t>_bXutO2r#hysV6PN~
zSJe1pIJEqJK-rtuiO_W&*@wL6<?Qy+r$anj#42wtlJ-ZuczmAfu8ABl2BbvKIR?o<
z2@SfID}l)^soUaV(mL}EM7ir(N~!-GIGS6rZhgHcQ7L;xt0wO$WLVkm5K;pst?If9
zkAg|(^89J*A^!JeU)87_4qEo4z-+^_1?PW%x=V@=xse!5_C^I#5SN9U{|`_VxXApe
z+AK?3?QF=LN3Ay7c2CIrzlZsOLo<{AvIYzv_;2G{UkL3_7(}X3PCy@%Qd}9(zGp^J
zv;MmEDVz~9lPiv$1xa-wd7+4`p0W9R+kbqtaJJtaZ}22uG)m))j>h<E*h$B2?>jxh
z1qFlLQU-aUb1WzpOEqNpo!@UL_nnpN`?>*<xcw2t567bj;}-;jv#gxEMt76#V;tDW
zY`m=U2Ef6;&~VAJ=wn~*a8KK>Q|7Mpq0;(g+sM$(bYlhU8pDe)%Eq;uR>V{3@{hfW
zFptYzwMPai1j^e=mZxUNdOMC5w~D$Y^$5*FJA7`XM85|;OGFrP6LHIFT{Gau?Q@VE
z%K@rTxxmRT8_QA{u@LzKD2V?pBKN0EhNq!YOr$#Q*q>O(*8RQI;(Po0q|oz1O0`FP
znERQ1qOLFU&E~zWv?QzHTZog%>ShmyU4aPg4>K(>DTAYoB`wb&JY5Ub6E3-z49}~t
zTf`9SfNm6iv(jz-?Ao~}8wy4n-VoW>kik;?Ct{bEG(w1@9Q(bmvDsbHP3GmQidR4?
z6?Z9EKZ0_hHw<f!?i9Wd@$G=JgovNdZ)~yN-R-L#E8>&|cl8d@&E>l0cOPnfcNo&S
z#@%r7_mT%xzmEFZPp8<Wiqr4pd#;M&m(Kfs8@;-klu!fV3LL5fxn)(Gd~FI$|6yrV
zVZOmf?Ep{AQ~>|9Jz~_>+V<6*d{C6p=sTWaDu}rrAH6H#Vhd7!>I&24JT`whz<e{&
z>@s_ZEHYDv3TX}Q;_)0_dkJOSSn4QAGo)lRCgWb~#B1)i^mRva?MMC~Dm>?aPP>7@
zVW>n=$k#GTDZU^Tt4WMP4E@<<FId=E>PU~%eKDbgtMt}BKsahXGagm-F<h#_eh+-S
z#oi2J{T!_@{qDu$FZ_>@y-QG3`kTQQ#twn#X~0B<zH~7*V}G+c7OB$<6D034I8s4Y
z@mC(3CGq~C`u6YCvT$(*(($-Aq-@`}Ns7C8jv6tUeQS%}M8-12s0;RC7q0#gY0Z2Q
z9e|Z<(qDI3^6l*aW9ZRusNK3E)APgW!t3O7y=+#TJ(-AwQO?y`zxE5MCRyXj2}-j&
ztHgQ#JxpknYu$P=54M;bGNI*pQym6zSZ`+B!{`;B0~;ndvcB)(DV@mZEB0r^GYMj2
z!q^7x^(Gt5fTo9L)nv=40=cc%#B?kK^s<;C&dsbwLh&kJ^JvVQ3ew|UnY-Pn3%iGQ
zu_)6#a{-gyYx5^kVFooUKF?pp8Op#wBVBeDqDkvKcsC}4Vl3{Lk78cG5XvM9Fkd`z
z>8DHN<D+E`vEe;yoL~YCK?cSLnd>EL5^sxDI=eF$H1e)XQ!to?A?MTx9ZYaLt0K^h
zu@q2C;7z{SfmF|CT%XLM%l4>-qQlhJ>BnawXCm5u#h&#t-mAE6JH)b*$wth#_}AMK
zcF=6Ggj&Rq_Yf@o22m2fe8QQCCz{NK+9lsH*uYZ@!-dAu^zq;D`>AUVtSx|Pz*0h0
z`mc=9qs{Nw_bU?h%9?-hN-L098pCG2MFP1A5)H9drW##JP`t~qDh|niS5}92Fu2OS
zY;QuU>C`K^*<$nL$*AjEc17304l{B}JCJZJbw`KW$MH||*tOoM&tnoNi+N`&$l<^L
zzR`0ghhU~`VEn|k@8wmxv$FiX2NoR@*zGjWiu+2k%x#{1d|$Ok5-m7`R3xYiqF%ed
z&6LXm*9aJTz!k*h;F#p}QT>hO$E;fFFxi&(w(@;tY7gd!@hKFohB*@xt#X2p@pVlJ
zmKQzs=Amv^%Ma}LuM1IiP{_MIs2ns^b?Pxct}^wUQUzoWNZSu)JGx$FKH1pm@DP`F
z@|;&jA?qEvGj>O=olkTY2;8MLpQf*wX7qFW$$sfVCw#i?J4@DG148GYq@H`4xJyt*
zp4<Hh-N~Xu+Y^DO`IcKQd3<Vu6zj*1@!;9zjogXBYp<p}RG<C|x^()}#^lrJ9i(6?
zDua$xSqv6;(8_UEF?U>;%jZ_yqA=ulr##<jrwi&DA6_}RCr`4k?Tm3^tL0dC!TN{#
zVGjv*o`-ky{nge;dj@sAZy$1+f|0P-WNeL^R%4ksr6o)6#(Mtrw@_JUeR>Cypbe)F
zzvZ%ylKq)_K!N9`oYSpqt-9$q-P_EJ3h%u7v)b};o~f(Hso8?Q&vNDi3~tRamKBSU
z|H3}*WoGihPaMa&IR1PLHc2QrK_Hm<yxWaJV<cFKa=F^3GP6%u62d#@C4F)v`?*z0
zdoogApuWMqmFv|c-$<TmT>IpZw}S1_cMVw`o;TzB?#!pM?76t0)%fCL$%TwtmwF<@
zjP93D1(iNaYLoyw{?983-9k?<FAJ$2Zjaa=Wt?=h&^V9VxgHyS>7~-U!UL+SzE>uT
zs-@jaI8#_Be#8#H?>%tImt9b!f;}clS^f8$Jx}+3PPyz1hJulc@7#xDx0D3R;?KTo
zo9OUPFWPtL7Vp=03G-6^Od^3|*!%jt(%iMH2c2q7`k&Im0Uf!hs0L(go(K`byHF&Z
zFK?-Z+DtIot!5{7%Lg>Zb($ZZyN2*&GuDk4neK9$ejvcNXS|Q;N9f9_eUCdg$7)@n
z4$hO^qd>cB_<bu>zn*@cCLnrvsqALMi}m;3f$c9>?meb5@O;Wscx1i*F^k4=w5ipd
zM-~l`zO6a7z{+p9BJdTw)<X%e>MQL(x_ecm^s?Rx{hmu)7K-3~=!(uw=8pUJWc$R7
zk-^N>9aJcEsVEMY<5XF}LWSaC0i@vRghVHa&v(!DwFi8hvP{MZIkgSkP35o0xx?P?
zacK)>ifSLi#pyT*hI7s_X&C=z5sc<t<~YNhs6CL#Z5{TjRS=dnW5T8skBlrCgm>9c
zF9q0QGpd>n?K<Chj_`RwPd)gd)PczTo;1|^Y4Fl*_)^J3n0KfMri*a16NQe!<8WMN
z9-{PAsC4GoXZZCu>RYOxuo*@b100s6?ta}6*HN*3;9Q}12zLm^sx&Cx`#EG@T@x(H
zpRzsEy71}Y?{m6|5>(5_fzYr-0|KFe`346ILVpDYrp#9BSW}-v-(;P7^d(T-4_U6S
zMXfBt7{3>lx9fgi37LM|a9VZQTJMQ;g!9PIOc^XC=X8!4_J#4(ho$&0v<MM!s-)6A
zyL7J_wfpqc?60p|^zsRLIZzw8K6*<{{@0kppD{s(&710*Hz!}tertA9tedi+ll$}Q
zD6wiM6Ap+&m>P}G8x8b9q0?B;Y!=`|4H`m>Dn<?j4dmvIJo{G*VAw#T{^P#hk{?ui
z9{%;3MqQ~~xnj_X<g(S-D4sN{9S+dSW5=d=woy;t*g?Wm=bE$+_(lmC1f)N+Qr1oA
zxt%+=-c)<hYqdu8W^ElEFB2fCXS7?Xxw&}(*~bo25<Ny=VHQnC2q!#3MWkiehHd(9
zLvhqKK2SF=#e%`VaRG;T<CeouZI4!?g95Aux6{iM02;d_083}zigHt-+-M~lE4BJL
zEz&HvIZt^ZBlA|cCtCZOi`3`$ADwKL8sJ<m8GK1S_o+td^E2F+&_woU=MjP6xPU^X
z6}F^&X2PXhBjg<uBC)>e(>*hXJIG3cARt}^=)Rfl1C-ze2S83R*Htz{*WEh#N5~S~
zziBGe-+PTya)f#6F?p?OSgYw=I~VLNmtKM=qvvWu0xz9muB}R=I;nknUTX5j_RkwW
z?f3I$zm~2%3G6%{rszLEi_Xl^Os8q-VC#!USU8;U5Y|c7uulfy+S6;BGV7^P-&)*=
z7BVimHgT&<_WF9_N><p<td^NMkxO$T4T@m*U7Q_nQeiz9U8-f2C4hic1Hig#<}jTe
zeEx{)W>$6bMD6>tpx4MVv!JUI|L5n^Ftzs2S8ES9FX7E+OU=(ziKiK5LHPq3gXhQL
z42p6r(sq#c(^WBN_AI2$v<KQYI>+;!3inrD?_p_#fS!l#!0h+V?=CH^y^_0#Q+^4f
za;$Op)Z+z5N7&5IBXpmg8y@cLE?J=lOYW~bLS%MizhK<(vZ{KdewRU&j@jYaKdXVK
zTRJzYCoFd<R<NH@zNjl&M1zn8fet{xYGchui`6$NNa(^3OMK0>n^>&wmSz(S<J``<
z@iS*7=xNAoGM{&F`yKjyO#OOs(9h_e$oqASjvp`bdVOO8A<ZVC@&FaefR1%%`i`=Y
zgJJ^n{03^0dS}y|IhKk?>h}&n_1?9a6<5XOcfnF4Mg@zasja#4I=MN@31rc<A4HRD
zdj%Vyl%4J!)d0Ab$x%X&qlAC<MvEwOlTOmU6V#4Bsb^0s$XYLaEI;iAaQT04A2&=W
z_;~wTP*x4y(K);40o-yi<`_BFGc2*$%FnSAZm?0Hyzp>5jQKkW+m>UArKt@)b1Cnc
zkbfXk!6@y|L!^9^7b>+izFamKL?k>=#(_;4<vA1yI~<^|YKSqgy*sC$KDR02t9sLP
zsv>wM8y$aAKV8Qn!T<42CiByMqjQgr{BKsJLJ6`i1S^}^qWF*49;)RjB7hcXxY_*K
zO%|2P-&`d_77|1>8h^@d|2!T9MmTjF#^`h1sY#j#%Iy3{@>6d3XgXd!h<}tvBe4F;
z*48$!EFy%N%+dq27=7W1Vlfa8q#^p-`Dx~MS@>F=P{9+^ULsp}Yq7n@uMd6CFTd(|
z)lEi+UQoMqTz2#_0`C}51zb**qR2UgMN`y8rP?z!O$T3^w6@y&9#UGA(XKC8L8~}F
zv{(|u3?#TPq$cz9&m+ti4s+qC40hsgkw1S&dFkb_8k#XmRj~Wm>%|HX7CXasZ^YZv
z@CWV4{?7m0C1&dLBOL@L`!N`OvokLK$QktkZn|5qymTWZSct5(uO^OPcX=e6R+{T6
z{ejxuYx5+VneRs|eT7LgPdO)oip1jtg3L&axhH%s#+x#lSOyXETh1&ORYz7v<gGcB
z@b-QaukB#o^W^7A!?&v;!M?2ev&r^NE_dvcb}x4Ru}eixjrl5s!z~()`y=dM5J-|L
zUCf}Lbe+iarxranem67-_CGlrG0%kS#Z_Lx;(gMc+RvoVhHcCs2Y|-~;PL1&9kKyn
zj`~tM==bxi-K8B;m)U!9H-CiBPOx78{HG!!<F0Do=2~am>ziVSSxmi_6klGX2%K%;
zwymTYvhLf3uS?!hvUI;qtf0T;;FB;*M&GvG{_I!FdFxnj{a|&N&}lXMcHqRlAB}3u
zCnor+S<FP<eL$#~;Ft@`xHVVWkU7|p<i8U9yx~mbMeFnp+ZeMSpMwt^lX`wNW<>ds
z26^8K;yB!7)xCCvW4bWZk8bifEgA7@aZyQYjy#D!d|=0-(^tdO(SH1JL}d9CSYAUR
zL6c0H8R+-M;yH>*&gst$W8R%Oaj=D`a8oU2)a;QadGS&ZVmZ1h71RiTjO+@;<OcL=
zB+NGy-Wbj-Bm{Y69=}?-YV&xs_YQe+pj0W)i1zMn?8pNM5s^U)hV2oss6_LO3(27R
z`$t5={(rwG1M2^s@Zr4E1p_=ARYSfd-er$J`WM+ma1R>p=2(%@PcooenQ2!{2zP$f
zd2w=ld>p@AX7-O*f>0>ti--V*KV;LcOf@aX_Plv;x%?mDq%mVt@MF_rhUAssgEIcg
zNTDm73&^QQ)wQe0QYlqkTYr9?e(vTZX50y>X#C`A!5rlsgv3%+&)wC@6hcg6j&;P3
z^TY`{_!R#*&6LfnbQZUt<tmu@<UPRh6tYdpCRlI^;0&r*%A_cNsna45ip(h|>H&Ek
zC*Qh67SmLyB>0+?eT{6zp-TmUVN7f*Xeo~k=ZA+Es~B1##sn4WCbo=D-|5*n@d}>N
zBPFqji!F_E_5~t=Fd|*{7~#nsq(b^pYW&acYOUJKU{F!^*2aoW@he}nA)m?-kD8af
zm>{T+#tXV>_kzqtmQiMdpFCQ0qWpvx*xdJay4l}NF0)`$s-8~elX<es5&ej2V74Qq
zF*}&$A!#R#>vD?HKW^O*YRAmo%bx<2S6FPy0v!dGhzW#ZR*fL<84o(fpzfJP4&?+y
z`1(wPyapixUXA9lzxtQesp8Zq8^m~m#$90gx7f&sn@AI-7>cC96bqR$>Ob1P0mD1~
z?it-M$jZ|}K@DEibhb%&OI<xQFwOC@sSAY)04IAWIAH~`v%k4b8IgjpqXcv!?wR`k
zwH;D4oPjUu?-@*Y>wo+BGOJhJcV)6sMCI&jLWDqt5#^P|QgFf!nlg0+m3S~=3>qda
za{nF|2gg<YxBjp1pI#}rzk7W@QkGg!pLu2Xza<8NG?Y<fQqVUAJIe89r3{5I>fmcp
zBxUbt*gLI4o7msI14~dgfPEna`-@kP>$PcVZa({u6w+7@M*OpLfc9Nb%zt5o#nsuv
zs(SzDs`T&@Wz7Gz6*vhE1^IyA`sz<W7lC!a8IH|kwSRdNmy#RKgfDMu-K3Z%5|7dN
zHY&<V2#)RH;o4UyaHpli=reozSqM`w&JJJwuw-1N6o7ShU)SpY7jsl76>9KLo8IoJ
z)+i#I9RT5SI!=u=_s5lr$Sd#%OxaR+)k2kmICfBIhk&R^xtI3U6um2qdqzt+02S&L
z@W}!L`DeIeAR>wwYN!4_R07+4NZDo_vQ4iGV}G}a#&1&4id!=#og*}yetKPk-OZm2
zN7#WGD*(LL(&{wjATE;#=e$@|R0klC6(eAw0ZunRZWW09MBfHb?f0efFa<!SAjB4!
z|Nq(2pI_fk|N8b(<ueW0M2!7+SOWx1&W)9F$cv1Q;r1NP-NlDW+y4#aPZ2oiDLClV
zqaZ%=PB^m(zAtBi=O!|}$HMUeU2*w7ycFPoj{*i<>2UlK8F2&j84r?o(lCz3aCq%Z
z!{|EwNmI&AwlwG@?f46nNMPmNopG!wWWbHm-=<ZWiV{1PHn)t%Z+N{$LPPLN$NwdT
zfc1Kh@%cI#U5q=1Pe^X2GVm2Unw~0_i@0dW?+@+*BZ1S#fQy3Hog}BTOt&VgwageT
za-;a>O-okyGQ@vmrK}y^g#+%2JhLmBg40TR36-HzV}jZ4levjz)*Y5Lf29tVcm+Ht
z)hDYIK@-fepn`K#5xVs~zWpBw7a&$IO{^5fqKXly8<flYj7e)-l+nGiyX5NCzihAo
zN?_!2g$IQR7-%I_9^K>&mY~z0GI}_PCB6J7w`@R!+SCUzQJ`Mh)$rUicNae7`|6MY
ze&C8QeH){>e)&Hnjv+FY@&Y5H0?y!-mn{cgPEF<7J`kod$dil`(XCLXAc=rN1S_{S
z8v-7jB&pPPUBN!G=WFim%_|g{NjeT(Ufz5+7!f61&JagvbmLCRxHT~<`Ci$Y$R<bu
ziLfLnDZeijyvPQN#fF`8jnh;4ODBq_B1iMcxF{9@aaTJumW_O4#-5QuAnW8ij#j=R
zMmfDUyeUrMt~>cnU#a77>Q_W0!H#`{hpwxJG_)rF6CDS20I96D78KP($aLvFhPP;+
z%+Sj@9xdw6Pk-hgI*$W7&D{((DCiU-wYe$yQyDzhvAh3QFrrIA>gH&QSL`GMHe0an
z3|fE2Td{Ov!SXMJjsS!*Y;5hx5IRJ1>OXef)tn<G`$W23;IWhBH>KPLR$Y8=6@+}t
zZtE&?Q|Mj8^TtyhS@&l@oe$=<I%P))d2LcCB)g6f#Fy%XX%NLpcer-vYm&pDpsK;D
zjZ3Cbjj)vIHL>S-oLj@6Pdx+2uTW_Ee~JBKp@TlT#zANUwX3DSKY;Etp*Qj((>fCL
zLE{Y++K(V-B^*J0VLw$(rw{R+h96&k*KO$^QPm+IDx5F{N4)*g;UoEoOr$xvyu)8b
zx2zBL+<3@j?)mv`7ip*7)?p^GvA<>kGnfVVskI7YM5(yZ9ye};Se=CcV0-N7XeI^Q
z5CSr|voV&Bd`n|0%XMX;Z<bOvsx>GLDHA0F8|Rqk@K-Gr^PA4*lp0hU*SZzGBgY7U
z(Zx`@DMlqiV-&29V&rnn(jB@Nm}cVs!r?z*1d!<Brq^8?F#@HpdXg3X8g$_VF*w+9
z8yi*f!GcJj*Ag0qU)fV64(T0WV?h269T`54P{yT`oT_Q(_8cQiTwF>GAa$nMiX1V~
zH!YCD)qm5E$k+U9gOGAS_Ulmyi<WlOxUv4cv1Y!8#b4NaAtu3Jvj$?824(dY<d7-^
z9gl!1%bJx={$W=uHH`Ji{s>_lZy|{%%B$Zjf+?g@&=DS9+>viZVU1&S$E&rN=^IyP
zJ5&<n_5Vl2{<n=n7(va4_teN6!000p3RiZ5ul0<LEEWh(sCj>ViNRbVjj8?LSD1ln
zKSh?lBsBHsl%D2JT7^!rzpU{eo8d<(LE?q`5Yt~X4U1a~1Jo?gQW%7S41~l!1GL|@
z@7O_E{jb4+Bfv{278M2Sf3*QdKkXB0$3KbnZ>j$nc7K%uqN@RXo%A~>p#Jwm6v9dW
z^Xk7JQtkrEeoF25pI86=0LpUbe;%ZO<G(hWsP!7`ES>`k5JOTp5mV0L_<eQeX5}0R
z9bGF|>c1L20QBBhIyl?z<BpS1i+B0E%O|Ve@m~2|Q@rsyj%QKJbgr9iNB5m1I=JxJ
zy%onI$Nf(LfU}J-6Zu=_q}EK+IT-TIFk|u)Hj)xg$RZhXOt^E4`TN!$pCTfpYKj;f
zd(h16u{7U|Lt|}NS><Sj**i{Pq-PjA;yP`{c{RC8Uv_U>HA-(WUa226|D;zw;Y0C-
zEStl&w<NYywzo&^M{fXJ`~fM-{OTeHrAa^PGQzSy$>P?#4%R0>uTw@v#h9Kqd8_!U
zc2V&|WmDIhgZ+x&@@7c0!Pyzznk%2b`wl7izx?X)nczH8W3Lx~4z73@b3Fb7TReCH
zB=GAzeJa{2sg%nCYDzWUi)54jATOy?tGB+>C&n-lH`yLo8}6($V_|^7a*{w}qLYea
z(UxqYmm+U3HQl{VK^x}{Fa+Ov$O;J)9RI*>H}cvANlo-Uiuwti)Op_->}C|t=W%~N
zVO7(#^ZSJd^_M~>1qX@`%uF1Q*<bL}{>MGbcL$?q?`|pg{CW?L$9UMOKqO?;4-_Kk
zaj;it*s`E?R{wX)$rdl64zgVkBajm8$Xe*m*`r>+s{b<~;djRgf=tHVO5m&AVDHL&
zWj9kpsbbz$PJPHySZStl&Z0;@ReIKYYoecyoBz%UFa8j776iU9;++|pS71o*VejuP
zpHP0O#YSX}V12brjo0$hm=_>8o`L^c*cAI2ON<0SF!Xy<M0_y&8PKMBSzqlWvVdv3
z&VbCm>7`69G7KK?$m|nwikL`2$E%F7-5O*kujnrA!cTC7QKD)5Ey)c`3y{VbVX1md
z5f$?EgY^3GDld8&750DJ(ehsStP1QBWKfRjsR}w$DLhqb_JpFgA3%m+cl*VR<Q*hm
zFfjVYP8=~TU?I)59P!m?f8p1?BtT{x$B(Ely78PE4c|)7YQbWN&Ljv{phzyT+sE;T
z5IO_MnFHho1CEKI(0$<AaGh@7FHgD(SP)+ik|y&$XJXggUg8Ji4lbknY>2<K6DWOm
z-7(fU!YkNTXS6+Kr7ArTk$X=E@)K|jU`26%tja_;!~Xm*@ERaK_~U@w)G<k*su%z!
z=f?t8^tc!fXD!~6NAY)O(TG^>{T0kxf7Qox(RSl>2U`~s#6QRn&qi^ZTVUkaL}~Er
zC&ko`AEHR1|9~L8s}2tKz<Xbb;BW%L_f2yvDZH4&m4o8HdN7F|=yyjC*u0pD+CicP
zu^{}StU|lMgh2RD#O*MgXZ`loG-ljEPk3Y9OTESa5(Xg4x^&U;pO&S?zCyZUvh(6E
zSC_ajA1^PjVmoh$2$Sj&bc{PaGW_3m<SqeIJ6j`yOp5L?u!nUKFuOGFmm`e<m@9t<
z$y30bMv%tEF$qF2|2NBJ|2|6Wt!o(0q~*guFE|VoSWI*LFHQ3>zql8+-Mn8OmBt_+
z1qSCq$Q=(X<e5@qBtZ09&kl-{;nQV`JnzMY5VI$ouD&VzGxGvS_YtZhLy8h1fx*9P
z1RJjr-n9TfzUiaMhN}aa>>MOpOA+5%Y_a?0>lqDrL%yvXYS_~My$!LM%Ql{{Xn5Ca
zI7FleGc{Zlf(-&2Q=VA{XQ#?=>LOxV!CFg_RhQ6W@YBDJQg$ZcXNPKSJ`?`A`{^cr
zLV*L2%)QPU6A;P0J~^MeCsAk54iyt9l)V4{Veh@;v26eN;kr^Ob(<kFLiUJ^va`um
z_Q<Aep^WS(*)v<pO1bQr>|_hs*%6YNz4?94%RRoo=fCH_=a1+9>wf7v&+{0c<2XL!
z{eB<NHx=Exu&0Y32XEnVaAp*o(ZrsKauN_`N!`d|IEfO0%#iCDCl{0?q&Y6+NJnAQ
z;JDeG5INTS<ZvtWVi#~bG^G9YYiUCs7?Kv=cY((vL60g3LRU7C;hzln3t|NCY4};2
z%dN!!#zh=Cn}p!cf=Ill0kKc^Bgl%#K*IU5`V$s*>`a<7<imDu4O{zg2cJ(mVbUv@
z3Ogw{QEw2)DMDMdo@ZgRNeus0hTg!)i=3(u_D%s4df89cDv3P>|NR0kl72abLj>Jo
z=>LJi=E3A~)lM^TPn%j<u?6fRy~insUpb`AQsBtI&T(1UgoX9ELCi`Uhn%z){=ax(
zDY_#WEVlo*Eh#7bUSc&+ig)>TcSu$ioPnV*14*SCs;C=?>x&&2oD4ASzw0~y=9w<Q
z?C|)TDjn|v-#&>$jTh1@r4U<pEdTU@368^nfnYeY^l@gu6df)eZx?9qDz~i;nZOyp
zXAcq<v%#@nhgjryXMgm$3o5mAkDgnT`W^?jY%u8>j1JIMBA5_{v6E_IB4ACfHo1+6
z`Gmu6!w}-6aR-I73&Xc=Hs6YK23dm-ikL~T<D;c*cyxH6y`cMwCgXdy;v6)90k%u|
z?9b}4;4jh~TXYX>dx}MReSs&uH(-+b%@*(CoPim<EOmn!2m78`-Q+7sf}(8At&A4_
zk^$~?{Ns5HrT2LlqvNw9>pX(x8aze)=h?#J*a}^zqrGu4x85}BXuZSLSc8p1{hCx5
z4ry^NCUII9=*Ftv32{p>)86~K9<UXDtEvSNjyT{mYs+%(mo<rS?j?=yBrVxa1>S*H
z#Y{d%5=&0VK%Pehz+Z^0?D}8}ges`PZg*@UoBm4RaIWhY%CYNv&9b81D@nSKw7pl|
zDp~m3&I>}(xfd)kbL+&$J=hr|WnR4nL@Z=$HJm_($84Z#$+2jBJ7m21k$sR(y7G>9
z|0+PF-8ZA&3tw|<(;17T@>uUGVmXKJ{^Qiy`}Z5@t{eP#PP1?51jxaKn-hQ~X@MM%
z@1y6?p^K9hu=BkFlT}Q;tnMTJ1Uma9tKiK}*wJ>_ApU+W{sTm#D@<*oqGqq6=pLwc
zQS0;fen0?lMKp&KG#z7rk+)?7saM@_j{#?oftL!?r8q<9Dgou+NxhzYcJb-T(c$uu
zX>57#yCJ>aEMNeJJJik6!So`5PT7jO2pV>l_2C1+x;InrG|Mi1m<2@dTfJX`1M#A@
zC#;?VJH}MtQw!QDik+JG9w@%uj^1|6a~rizZ<8j|ixQb63~Tn>(DZUtnpTZ#$CG;d
zbWZhcc^a)uxOKHnDo%s4aK`rwY3L}Q6W%MhxVo24?VmV7dqQ3c0P>AJ8VU-@#L#P|
z;OEuNz>IiZ1$dD$<;JH!s)qV<OCJxKJZO~_+wB^3J+;y~P`z2QQ2MdT#>r1;<f;(J
zW@=O)l5qVw>-FbsQGSZM6uCW+Av~1SLw}1kbjdD5Ik!+}bWY<rYg(PP#t$eiJ8znK
zZJG(4U-9K~H)<MUNaV0^br$epsN&Z&fxQFC8qfc>#5n`!$7Ofo9JN}gt{=7Jy7px~
za;t6UBEQ}Ej?3fsyWAshe0SQtwwnDm_G;gKKd)`Sh0$ftE%qz%&h>&!TUE;*MSj9l
zR|h|O?S15O%wu`$K2R~n=k}9wyeutn+5MN0*+h4E=qd$K7_dl+%tY7XC0SDot+js0
z2$U;NkDuZWS$3=b<;JgDUTn)nD_?QY_vVCmADCqpWjV@Q8EjgnPxlVlCUORYx9?QR
z>Gay));W=yTMf}#bJ`55*$zS*6#lGKy(hxA7j3W;JyxR}x*@XqgQ@kv;Aq33==ar?
z&R?QipV&!dL&Wx$!Acg|3=SWY{_<-3=zeFjFVB;2uT`@;Qg5>`Hc6y+6y)JLAK5*p
zEY4l7By04jSsbp;S#9##a`uWH-EI_E+JBR!n|~!7;gt<00y{#T^LGg`KwYG{TR+Tg
z&(|Ez58f9FZHK1wn=N-hjt=%<o)9NBP<(`bg=EUOOmZD`b3Lp%7_O;GTPTpT3O)fK
zC%IQzWIy@xT+pw2$lg5J$TARdo4_-OJE*_6U*9?qh*wV+{`O$O%xql0sW&XPdi~wt
zd(n-|ySY?kE?q(Typ#vs(}eW0#yIkQ_JFp)B9UT0>g?Q~em2Gb@e_J8*SX4SdcM4t
z7blhLE9C~s$;CxLBw46Aq?a()G|}9&OXlIFvDhF%azRzd(aPj>3iSm}_6u1e<?QWV
zhrhf`5xZo2KfvqJ9MW}(FjOtas8B(hzF6CdS7D)X4QqCt2#(9_?RGtr>Cw{OHnP6&
z4yZt!VM|)$tS%eOxzlfZ&|jTZ+`9o#s0DuqP&Qz9yvTG9KyY;5lz+xWZUO20c~8zh
zt|cSw<=gZubSF{I5ihvhg(Osb3pSG_wBO@~@9M^5q4dtxYe_`{Rlm`AK@{`B`@Q!L
z)L<5?vP%^ZSW!`#=|bR|?a`(!qD>at;7SoB^Q69md99UfEA$V2brNuLkX6PnZAvVg
zUG#WfXT1EC4%cp}$fN7URE%`A6<;567Ue8)i?C&|P&on#@lZH}M>?EDHD>acW?9`O
zQ`P7zJ!IcM+8Q81U29+?a|wCQ%y+;fJb1FXr1%U58;7CCT(*Sv8&d3vvs7bhq@Wz%
z&>L63<;uXintWD-c@IGv`8p<i>`_>eUb()XZ6;zxnodF!_w)7BGS6u?gLsjNbNAvX
zAJsd&LVz$ucVuo77)dF<aslbfnga3k;MMA_Dz2ag(~6r*Pn(2H5o{V8*QK7Y!wrL@
zlAvJ+rx2|vfqpX<8vLiVrwW;uflReBD7NO4ALmMFW8pp4k?_EdERdt#gham2h%B*P
zKyyb#6qN)2wUa3#N%reRuye{$C~%*1K&tHukF^GNd=0jJ7jAXBbc{$G>~|jR*S+=H
z{zB6c3c{H~3Ju0d1?Im}>18PE`I8aJ=V=R?z3)wh&Xqr@27xCr))o~{KKi(7UXQDy
zJ35zFM{yN_94)-Xb*AeLU}0#mU?A*;(672>n$!8y&~6`{$Ilnv&+uKDp&H-v7#F~!
zd}&wjACAXaX2|cuR9CDXI({%Z-cAkQ$bEb+k1Ah$L50uOBNbG(D6X5>N)v~#%gooo
zQ2^C``ScX^SK?6SiPtyqZ&2`>_4710YB(G}+Vgti*U;1)rX@ft5$Pg(Au;T<;m_fp
zjIzN}+k<Zp1{baY!Yj1LB)#oDXqPg^4%*+1qIY@`p#0ae*U_Hye&tPxcYPNHU6o%*
z70`F2T1cP=e1CtCljU4{DNuPeD4eyGv?tA&S8hSHyRXZ%Dn55<QSboo>u{l~oVUTz
z`7c$^pFJqENqYd$2xP|yby874C7lDePvg^au=65WIRFZa$hiqC%Gm`ye>r<H0osjP
z(?vnCDuCsNdegn5^?RK%r+6>xsd#$0C+<HH+j{cE@29}$-2$4uf`z-z<vr{t0(t6j
z!}iNSg7Zz1%S`+*C-G-Rmy02cTfcW-OZO10x;1o&5^7}6`zzDkbM1ari1f5-WL2nU
zO{mB`4_D53J=7!tP!b{1!!-!*)AXIJ8r#Vhe~LTG=LpEt!S2J`Q<ZRbFEKBxT&4Xz
zP|=xnG-G>cx@lFjX;tzuPGIpW>0srOW1h~f?yLR8pFphBz->{Qsh*Rg08gPO=jl(K
zS+}vZH-l75c7EE-h30;-<SDBlTYA*I=vSE01lT+SzM(mXQOFTL=Ck;NJMFYQmNdE=
z^YQDWr(Y;d$Ja#0Z|HpBEBQ%Jvm>{eg70CzxksG#(pW(gM|z;VzoZbsa1E;GK8|jQ
z*l!Nomz(xagEPaR<;`aCdvtEKNmlr?2i5?hg=iVFmpsL!%G4+;@LRG^x?KF|zrkiq
zA-m(RXWlw5_cd|zk6X=_+h7`i!>9(HHr{x{ov6~&up;2KCZMgzlqp1gwB2MzxUI%P
zbbn(AK?4qF{JCJ{f23Kn+czi|yXsUwaimkTp)>6Bfl&yrFP>v<O|xoFU7NjhXr>J0
zS>GPah8>zaW$BDsk_N65zN}Zva}SB*f2jM*ijD+9?`l@O?7)Izn{%`(omgcRoA!p`
zSfT0*u(&U_|3yqV+A_41y5NfvWEE*3plg}DLvR*xUwOef=6&**)LR~osmAfwE7vdM
z=Ww$oKdnDy=PrIYwtO&V<3){E)+muP{c{&EgrR<w<g7G71^RO;v2zz#WqgrI9CoTQ
z2@~P3nBmKdABEGT4ME?2t#&b(VK%r?IuoL2&+k&t-&ipEc*Wn<W0Lr7Gcd5rY);Ax
z3)gIx<mu_y|J2_CaPIxl<sfkXtQTWB!*DOteGNM8)(JEMdZFvSn!%QW6EBEOd$Nq{
z>rx}6^kOnwO6cN(^+M~?e-o-Ayy@aWiO(itn8OZ#*M`zg<Ya8puj-055a-!tRD;g)
zga0>ZcdAUp8x5W^f6CaRh>3%2umUsnO8BZXoa@@ze1t*qq+!U(HC|9_DV*G>7O=X4
zET};MU@qRp+Xqmku-j)m{rts*g3T6Ip^T9wr;o#=1U9Z0Uue<tljgJ26m-Zbz7J@y
zKpd%^=CI`_Z}%SLd2VE}CH<_#eI~j$*xBJU6<|AK!rBKA-<|$NKoGiL3sRcNvGtfQ
zwW@DURK0uaVX2ZWjz&T0or2f)tH^2FLG1~4gKrw<^3{Zb>jLG0pp235oN`!#4PRx;
zPnx3@8nZNWQjTh&Jogo$ohGmSrV<@FubCTfTlBZ<$U2;E297t(p5j(#n)58l(z7Y0
zR}DVz|K~#%eMYVH(ZLjx*eyIk<HVUb3bL!#if>t|r#-KTxz*ECM~Hw^Igh!STjT?a
z`m`_?5>~`OdVcg7w$yJgu!}?&3*a992vwNU`P1l>!+k&HsnpGtd_NKE;kOO0CxH+<
zS1P8n{;fLjzT(e--@*;_PAZuRbiM1M1&xyS_Cw?Wt6*EzzuSaFxaNqg?4!b%L{|ce
zB2L*y(Xg8o-z$FDr~H@K<ZIA%HYvTK)I%>3f=icb4dmz5g_0GsuBD?t_($Ol{)tb&
zamEVC2u0C2ncz*S%aV93J$akxE36z=DvP7gB^xV}T1|-i2BI5n_9;rM7xfgZZR<5p
zN70WSzZ$aOb9@*y{v6k>RW{pbm9l1qQahWhcPdVRHM_-X?Md|?U#_n*F%5EWTx1mo
z{Pprzw?eec-U+)hN_^C+>??q<tz~9d9F9ryk1$dri|^NQj!S_TC#4H6+4hC<tXy<k
zP$O;vHbt39c`Fjlm=@5ZS3V3`T<g#DFS%4M+G-@DD^?l@Yd>#jSEGPb#y)EteVSn<
zpS<UG>wIYa?CZGICxdPJP8v<yjMZgYHpQ~VcXn`NF%d*7KZx-$=rG&{<Eguz!dss@
zlNMRkoB>7u5FF8(>cwL9VOE*DQra=`F+6#%xqVqkaX5}dG#ou&_$t4}EN1-N2d#gL
z%_2s6)i>Z?b~-s=1SdBL33PW_HQyZEl<}%WHljOv7dj@>$v+WVbw}-eODzp_lQAT@
zeJ_z7mF70SHx@#w<Owzx-#~HOUhPxUkPv#{FH2R8JPSBxEgnrP!j7W`8_hZ%>Kr8F
z8V6^Q@YJ-2wiREQ8};K9&30|XZ!&HvvFfW_)UY<%%zBsuWfji`7;QArXUvWuUN;G|
zg!k(8gE;;b#HkOVe*EhaV+IAud%<is&A3Wkv7J|9+B+BBi&gD8;#P+)imlxcE3zSb
zv+t^|W8c(i>^8<EJaH~$*c6Zs+PZ9Fbol|lIUHqiY<9lbc|Ka`SK*V$Sh~Nm#;8Nx
z<idUWnoNh(xWBZxNmr*!lO)aVH};H>=?T@H%I`@m)N}M#GKGu-m2qsid5pW1`AQf%
z!ot~h#a&*YgK-1K^R!|!gdei7f8h-UkZ)DdX}jT<vfQXC&^kAmb7q;NJZaKHKg2Lc
z+_Tj>BRMF7vCzx#)CW-hZez?mwlHQ62?>jtDRw~66NCpB-TImG2Z#<Up>?s6A(&_B
zmX;EIb(w^w^{99~YHpik;Z>uq^~j7ZPW+8=DRClI;>n3JcPXI;PN9ahE8+FeZ4_TI
z({K-^TDnbx&RE6cb){&{KIW(l{KA4m=@$YG^rWXk8&rm`pk-X8z7x)Ck#rsa*I6Vn
z=F(@SKEf0UMK(6B3l#G&>dk{E*R#qWH_??B^tV<11g2vWI*)~hn~q6lhFy>@JK!73
zT!bHOL3I<T1iY&HC9)iEW2WaQ<tF{%^2xivK@k%>{3#~trnmC6Cvl87>&YvV!JLn4
zRxg!<{aLrRG7d2EgRhH_6r!V~CRDp^AYL9?%-ak=jAq~uH!9uocF)tekRRz!>+KF!
zIr$5o%d>ye7RHd?)tk9m8DOlgMVuEzWn;|!O+6GK9ZNP(Ohkf^h;Llk*ZjTKvr_jc
z*~)r{5X=T62K8>=yF62~Pc0h;f_pms%F;UI4Vz!Y^_R>}h?25|Sny5rS=XhG9GMKs
zDc*t@N2Z$kZgvTgk@5O)Tw$~yqdKGV=FPm{OQHp9YV73{pYO51q)g*Bp409l{H&ws
z$mS=uk`ULZnrjgJSD^A$bbmm+Kq}K5@on{9A&CrJJ7b1>+Xgc{u`O>lvd!8_<2ddT
zx3y8vx7F9>hPFch#y@^W0dlTo5H$XLB=(N!8<Zux18q#W61HeXZ3AU442GS)86>O7
zw~hGXHu&5H9a1jmCHn4P%Tg1jE^eC_tAFk)%&fokI96x6yJ%E6P}REsQxF;s0KbM+
zW?x*MNuABr(qiuVs{KC{DdNKG29OFFoFNFx6a~Wd_@-ISPGZd`liQ=xPt}(i94$$!
zN+GE(G``b;%}r!&1`-lY6XM&aeDBzT0yLiNBNAJ3Rnu10;8d;yBxPVdB`=&lzMdg4
z%AItrMn=&p;5xTN*!Rq966spHymj^DCBz5Z5*0T<$h_I+=l&mmUxlBai*(<ts-<U$
z6UNkcNHy>|OqLL;RL<)yCoIycI<hBF^e{LC|46fZp|w6OcC;q;b}{nCvj4Z`_nvd_
zS-q}!-|egGLW{f}_!omm3uzzugvH$c02tH8o|7{o{!ORQ!o{SLFG0x6tF%!{c;Nv2
zd+mqUOp^Sbeh;;MZ=&U?c!!00O-8JkFevjZcSyRsT8>}$%<eSb)w(H<WjBMwI~)fI
z7y^3gM<zIu_EphdM_XQRJ@9C5i&rP2cT88pX@o8`JVytUe@IZ+faF5q;~}*AXT%j|
ztZ27+RP<Apa7HcUDa%{(G~CvT?AMUt)6TZ&p79sGWTP)!$120jE|ipIxh{$0&+U<E
ziB8&oQ^0l)Cr}om&=FQC%7)U?>Bo(CxeAh(?!aDZBj(m5G*<9)KSNY~zOvqGKKJr`
ze7$)Bh8~rS>p|LAFz3wUX`AXn7V;v1mdEFv3;mNN*tbo+R++rIi+4NbmBF;=bMFv?
zsw5;(157pKQNxJc%@T6Y(N-7a4Efy1Xmf9Jc5hnnzE*uPEdN2g#cBIRt(!a&FINdw
z9a@7hoNWBIHlgIcc0e?IM;i6od3iPWI^kixQ?3V1LA9LAebtd0OEhT`sZ^71aqnO<
z=)bZV1Pcd5h^Yrv^Xr+=$uQPSRIeoKiW&!vuSSJn3hUJZ$!%1G=SuBEtYkgtx0(H}
zt3$>>f_V5MmmBalY~#=;S*Y_|+O16KWm|h%SDzS5$-Q85lyy-az9}|Ge6mFkOn(qJ
zXp#``N9pKo#vx{(qRAD+K_28foaE}@FlCRV7$zc<MMvMur99~biX9zw;gektjApcm
znS2JM{gSNbT)(?|>i31w?+0;IQ6VX2B}kcN3OqjJ-GaEzUFn_R_!k-+P8%#$<|BDa
zv*EHvNt-@S+LaT#@4a^4hfrs<`36}vZg>GJFspC`P2HHFQC8__cX)kxeZ!_BC`o;Y
zQ=@#=(}<d$B-h~`TYFsqE=!Ed(~iy%X#*2h^}a3#rl8f$TKgRG2$G;(4XT0)l^1k7
zcR32|EUn)r(x{(onTI?n)os~VoBq`ske+=woJq*uK*6G3!&cC@goIK<cRCXrN$C}g
z)nxm4Q)cnF(WZm+KB23wRNRito@S&R=6gWwDt}BBGkIXbtAg`-r9#OuDvyS~#rR^J
zbW`O<SZf~3$Gt+%j%67zm5?-~D)2Y_V(kT3$`{-H?NY(&lBHFDqdx^Ra`qMdSgsuN
z!=18DkEI(r<___W62|YtkZhq)zj|{X6~6I`cz>Vr28VlavKDWHVYFLQIZMH-79M)X
zxh1k5%1Q5|P)Ha=sG3alugdv+Z(*o==Q2Fh7-aH5JeVNPnfIa`%p%-1l%x_Un;oDs
zHpqCiL$m*x=Brb3OBnemNf7FnthzYSytNwo)cvVTc&UxJfrVbd$$@q$tWiOP(%gk+
zU4%l1BlG%O3BK&p^cm73D*^XjqbFEw^OC~0Gp&BEc4j%{29!7W=eq#sG6d5RHol^D
z<DGl`vJ89aePLt0Sn2r>U-#UYcWqn_qO_i;TNV!&igp-4<XD*SRuga<SZD=cQ}$1`
z{uwrM^ub7LXmS|6pre5Vw3Du>G68WOlD;c8`8=>HZEXKlYarcHHfDc>d{C)`x`lP4
z9HQ(`Kd*B1@0Ycz=N7g;WHZ>yJKD`F+F*<s6fBj$IyteAfuONeHLL$+0I{)+_o`FN
z4&Ko!*AY*UjV&1!y3XgRhhlVqo-%k;egPKT>as5FQI8F>5_lXyFX%x*=$#&Vr}zN7
z9AQUh>Rx&~f2H@VO2G|xe=dK-P-z%9C6H4QO?S%HM70r5w1H;zjl@6=(O)69?j~6T
zMV29Pxb#_0nSrZni^;0LH2BJ|m2Pq5-B2kxaQ0lp@M%w)?MT!U21cKXz|Bionp4$0
zD`pF&9gANx>adua!E$y{0Br#SvRP-uFK_5>IyKq;$f;<s`DNUU9o*8_nbD}bH#9J|
zf_CepAmPT)I(Y?T;;w~0eHLs%vlElPf_7MVNjP5Cw2QxDS3c`+ySW#~N}{Sac{~3p
zub#g{JT(EmEZr50@;7q9n>UeA(uT<Bwy;uTD96eP8Y>r)ux(P&4i4&q7)izG4-}cJ
z;QDx!CR+&?%MMH15lpDDPu_Q~54|<rpqZslTu~P#d!wH2T|u?<j=!iQq`99C&jxC3
z3-q6F345v+%+5+vkZgC$pXDvF2ZzjuuXqEi8;^~X!sFhkIZkbM{Z+tht!N+MN%Z+k
zr6EJE>V@=HA;yMIlZk=J=6FH4o<*Hr&N%AM{t_8kP6$t9Oy^qvq5pUd?$Qz*I+|mG
zo&rH)L7t8~Z3dphpOg<j$gXT7w75Zu{AoS6$=8!G)SS`pxdz)aEVyQug-^<RmF6lw
z6Q@7{a&}0Y<D4~w{TGTP>jm-GsGW=*#DjbNZ3|7Nq!i!z`?alx?^K#XmMF<dIJhbI
zs`{{>=Wfs2Hyl4S$xhTip8>tn(WfNR!io(YqzxO)<xzR(<>s7DYHuDAioZbj)x}NF
zsOqP_Ss5oxtxq6XtG*p(N1tIu66TU3?J}hBX=Fv5D(zlO{1WkWusH3IT^AueTWip#
z4xJMM)afQ>(bA5AJRKEgy<!<tS3u*a=#JDZaT-6JaDa`K!AhX!*Bld<bxB1(m+n_@
zimiz?-x!nzuEys4)L(Oo(pK|Fu0AVGdbHze--&ybekAy}WsnTpC9=hwInAAeHdS*>
z|IGkCD>~+YoJ<o6O~o!2F4{zcyfP=n^;hN}CzoaRsilT*@iQTn<EX6Fn59Iv6d_o>
zhTmc2j6Q~S6V6J*wud*bKo*k8&drlN=P!NZ*MapI+N3IC`^=}O>_ZcDG8gORA~B=)
zhSb<)O6wEovzA^K&5o(#nf2#n<}2zMJILZFt6!B0{^0LqQOKs~*@;<v@+;)4zhCF~
zTj?V&(QP#JKGgd^&MuQ*EimVZm@LVXQLJzFL+Uz3J;o$5y}t%L7YCe3GA)Oc6fXvL
z5v+xha}yi&q!Lm!49Glvj8ue%r|a0A2wB6rEi5&JxqPxGOTQv#W;02JY{{)j?-I)w
zGOw%}W?!#vj;IgQ{2B=C65^I-4bJLcw0PWk^F3x+%|>&?)FVFQp`AVgZyRgA49ipt
zD$D?^N6`P5HPSj0eF)wKXA-X@S1u@*e0Z=BML5piDKwcPuv9X4Y6kKiBZIp{9vm?V
zu{D+#Beir(ExQIE9vBy+t*LYz>m9<(#P$XU-*Gx-zT?y>!IKZ34RBMZiq(DGV!ivd
z(oq(pPu$~{+FAeHj{T6LqM9UZbv@SQ(ucy3CngL%NExC-&)<0rm0|!9s;SOo4B4jc
z`Ri?k9}I$7S?=9w%_CNY0ANskuNJxfJTW&04oG>NzqlLTwG&PU+XRr=4C+)r8sW+)
zQb*qjv}3L-{cdkMMu_*T(V<|Kv;*BLW?L1F1ffM1w;NpR@3R(U1e)9cJ7_PfU^|KG
zfx4pNdyc*r;wIo%nqbE61zX%$d8W*#Fah`QeO+=LsNmFliD@AJNmmeUeIX)G2>h9+
z-}FpxcR#fYNfI{>39{0gzwh{QcIsEbqIEC>Gqv7gx-y^OwI$*nR=Q%XVygMd*7ge1
z(IuW0yZZz9lM_4Kko{Sh5HK*K>tBWxK6Sngi9#@uE7V|B=4noTGlAUt&{`l<J@4zd
z1a4c7@b8(^AePYr0GqO2$F*w8&uWS$7`%+S8!{iIbU1FDX5muPYV^k#Q#Nr*PCL3L
z9E!GQS!Z7*qSsG|;+A<Kov6E(F!0)nZ2oh7uG;i?!2anj0ppZWaKc?fmM-Vv1;up`
z)7Q;VO-uW(<wV3jdcZXch==ZTX+hu><bwk_(WtrMfNbg(dvOwk_+kz8{dE3*p~Is~
zPC4fVZ6i9h?$A4Bn7h`a;#NW&6?6R0HpU$ijFUH^-VA)|a5TjWletcW$JZ!1P#zXn
zs7$R#XeBTe)KbdPiPV3oK*&x=-xD<9&A88}FA@e7H|E}+PP1GYS-nM=-e8=tvim)R
znHobM?tgZH9D*^~KGNEPG3j9D02W=OGF6Bq3AMBDsXEth(9c?~{-GWStR^@qVf{tN
zxt<fveV##vHTq)*M>YShMs^EvPg`J6Y`<R3_8RjMByn!eSM01OX$jExuLkTkyp7(w
z(^vc+lVnD^c%r8`m<GxDrn%3c0cgTl*Q(|JsfDm+uOLHC@rd;(iKu3eyO24fOS(pz
zH;Lu&FTssoRkMSG^O|K6i#b77T~gtfDl~5K17dA3xS|4(K2(riau`xU<MKiYj5lNM
zpneq<wDN-pUtn5KDuzR@uQ(~wqli+~=aI*}4vkB5=&0x<<+^&<>RpMIWNKA1Qjegq
zsW;qo?^!DY2+~+b%pbRlb_R&g{PJA-Wob(|?^u7I!KTzXgg<!E&*WEB93zpeF4OK&
zY8O$7ujNyTKK_9<C*uGDdMAfKxTZ1_sj8KvX;YBW?{&vk^A1Na*L}ieFb>px;oz9|
za7Ir|BAomjX=y>5QgC*l60;bcT!8O06Xl;bnL^79_K831WE-fCd6wsLLdxfCVO#=9
z&w9cOR>y(0H78hMPLK3~SL&?`P7@aa0;1-5EfeOq630q}ueVZ5G|yBVU#*<0ta!se
ziN)BcVV~J{@75LH&1(S`43MJnGsW*|I9t1^`PDS-A>D-l+wVVz|9sE5>9jpN>7RV9
z-@G|&`=zK%#)TFC2#$hnvtUscngVf)UV53Q^)X*eM2yoVgUgty301z%-{JO|t6b|y
z44Ivs+;Yf%)GDG2^I$hD(*kKeKgds9Sxk@mn|n3b(lBY{Fcwo~V@ojJ+_WBtjVh~b
z`$%hS@{;Q2jnyP+l=x3){(}Xe7cyp$&+a1}{?u4n+qUqc=-d`i!r6jY7`fH2?J8Lh
zUs27@u!|z=2<ejGUTnB+m<=OSQ2P2YF%ava6-xIdZsBGtgGY&Z6w%B_+MWYJNj|@~
zR@l2TE7f4efCeQhoJPRfUAAgt0WOUsY*-iGU5k2NO93m=s@{nW=>Kd4suqDk(bbS}
zY+Z7$*}GU;d0r;@ZZa#awv+c#x!d-w{|nM`3$W69&k?+caZ0!QovD#Y0Jd;n;@Ak~
zUfso$lNoykJiyvUct52r>NIV>C$w|7eKg_(ki6@2xe=EA$4jiFMjU>OUsy}J3A7+3
z-zAB(8{xV0iRCf=4;6A&=?5Zv-~`Bv`Q}R;--fQ{-mw8FRUx;l002<wU^oJJ(=7?K
zAQCAIj(Gxwr!;Zv0e84PrE@fO2jocG-mIrHQDxrG79p|%j$5!cSkg?tsuPf;0T_8%
zf{po<Hy>mJ`Y$Xw;oxSYX(d9NL^lft0p-D4@vBXLv+y{ty(hLB5%#qMuB)N<Is|bD
zM;*h$Qd9+@_g~OcD&ulM6v;zsD$?31`YwP{BVF;8yih$PH{mwNLeRgrmz_3d8y6Z|
z2^~J%kNH^na1*XzJZgShb-0n&3H5^tG_g!hBo=EGT0Ydk7(b&48Ulg{uIm|ahZ-RZ
zcv&Zi1xWa4oHzSZOlJ37T2)!^9e|cZ+$g9r0vw>|?SZw^LP66bp=D4t#8zJ!KidE0
z{rNo94VAwG?FYZ~>v<ViR97Kdec4OVDq)C*0pz<~aq%R|>e5pv%EXDlO^k2dI<?|9
zK-Q<#7({70g`W(&AaI&Ddef|$+Ey;3!Qun(Sgy2Q7+3)mwXw9Y=0ZjBR7$81OM3&(
ziK*f@+P2)GFZXE!o2da*AK-#9G6HgBhRr!rg`up86x2#dRrOI+P>2x}cQ^-@$nvN7
zy$FX8?1Qna!$>7<0h=e~yUTc83ap4gM>-zX+$(sAV$Qp496=)HGYq7*^SFHv;m^U4
zb~;<=C$eCK1w+U@3~ac651f3NUxER(;V?;(kEkBfiyK~mwsRLp@(|5xA?PnGnp5*q
zxI^&#W2o~<{ID|EyHBdKHRCnV&em)5=t<%N9CZV<t=R;Cg@)Q6>+A+_X8xuIdPM@Z
zo?&o-6-_JeE$R2yLo5m=2(IyyJuq2}0Es<;3aD-+rH3foy9Yh4;#a3f;q(JzTRZP#
z1tB?PSRN3xCli3>$^QUE)xNcJB3zwpGfXuO&=WJ~vM*HqYUe**{SRYlh|oVQKIRP(
zen5cd<j@voI|Sqb<@YCra22MY5FGf3iVUap8OJwp0zaq=$+CnGfY(|`YASH`Iy;PI
z{*a^vtT`y;kiq*nNstvC1%t%$vAUcInkQfSTm3blKX4dfdB)rQ#DF@873?0_zJqAo
z3wNL^*_%ofkjn>TnlCtqc+vxqr2U7J+k&hF8`nz+vBCQstS-ipGK>re$bhw2895OH
zmIS5_7;SApaCxaN{_g5AjxYueHg@#PX?X2-u=uX4SU>Y$mMJ5oTjJe`vjAiFAK;UO
z+4C8X&jWvsbVZ~!;)X~SX4O<50H5A~sFU)d{lA!paR|b76`kz^xI*aK@5|q(&s~6&
z7f-#0GUIW7^bvxAwqi<%N#`38?R_s#1lo7tTZM3S&%x3mvb0ve77o2yKaG9hItP5Q
zcu3U-S{)2;zp=EWU}%DBz*kbnwF<Y+W*Nb<sBj&iOTUA=eSshd2#`p-pINT|%VYd+
zW@+=O;f=1ZM}3cpx(Gq{Ni;U&1WDkhFik&ggsw4(8{UMun<(~;B+v?i?{a*7bj)z;
zXI1j_>a?Xc@Trfzimf92BXv=mj<dNuE~ewmAQns;ueOG{fK-4P;DnGbo={Ib;e8ig
zp7gk!+<OB0_i|iDK@N*YjRDJNXPa?{A1@$gcL@-w-w_-SP3r0w+{k*tXIL_HEdxCM
z%332DP{-}iHooT-RNiQK6aOe`>6jf}4p2=8+qpon+rKGhH|@cBo}?1rK)~~ck?@#2
zf0hCvZN4cCLc-_K(Ao?vK{N*!2vcJY5ulKRG@3vQX04DeITm6w+BP1*S&Gwg!<&pR
z=9VLd@L%>4#}&+K{lda32ofo6Otj!eUM&ePJcj-{p&In&HWvn;Jh;8Ny{r{Jip2?U
zKqFk~%?RR<>EuCY(j8k?BAAmOteM(5`1ihxdfv6>IDs^8RwXEJ^CZluDj}@`1ut+}
zo*-Rug!a$9%oR7}MNLy#U#Or$_plt3sk8u?drhv=@R$WcDuO&{4#$zjvndjkDiT%v
z%cT03GOY!tN?~~l8v%KUxXW2edO4XMhx1rD5|C8+n`N5~+miO^w@bLe63Ld{8X&lk
zaQ7-Qtk1-AYAb>9nIEv_hAsN{!lcZFyuNKngp4<Dn!@pL{6lB^AJ`qPnui^%FB0_o
zaO@}~Ya~~8XdR;H<t%@{M%FhyI0Q-F;&do=n%CLZZ{>bwF(UJ$2$Tq(9Y&4Yu-`$~
zv`-c45Rm(byLk7*oy`w$Inpj159&Xo1a^k-iD*xHV>V=g56xSNB^rwJpw>{4aCkA9
zB&Wg?TIw(inB)QC|0pWRWEYE9YQ~1a4y>M_mIMTX72}ig-J=TA=LocbH&w_G*K;md
zIe9xr1<(&({yYt)!8w^<kp|?mc)MJcBD*bP;7Am{er+CXM-kZ$8Z_Ycf@|I`C*@yU
zTj^8^1I`zIiFqsHlZKqTu(P}vlH&feu`RgHKlUr0@XWzZ#{9s{Au5vJQeX0!I0i;l
zO-H}#6(RGzyZ~kTm9S%896Ql0oxk=5MVsQa%WhU0+U;vuyC@?#hJ;XWpL$rY&BIik
z@HT~~P49NlUBB;bEFS5l6et!nQR@@g#h>K^#{R7K8BXABnU$I>pI~*#G?zZkP}27C
zabSp_Nl#~nK9N_0)o$CQzrfKTV)im*_0(#89vS`U+s~l)y!D@C8Dqh7dVo5FJ8z#m
zYK=%w$n|xSqWL)9)>sYX(qH4t$@n7kgavqjP;f-F7ZOv&FMMjRkq<N~_3(HSotgt2
z`|IxB8AkP$()?|F){m<ge_b%h9VY5<Th82F<M*`D@3Fb)=g1e7kDnzH^%9GK{TuZf
zE$(g}9L^y6g8#Cm(6VHP;k!#UWQ1`q#1s8GGrT1ePtfW2WGKEe4-RE5P&8DooaUjY
z5M;G{A8RpWOS&;$Zq<8G*>j$wEH3<kUiXsZv)U*)QB*C_j#u-+w73T@ajNfhbE0B6
zb0f$e^$1NlXvgLbTb>^no;sy4Y-kwarn&KZPEd80J~8)8p(;n~bk3g8*KzK|M-y{<
zT)R}+yA>wUdQQO<RHigaz>IkbJlV@md(2ecCFj<u!n1Se9wA~pKH@#e$-iCfEhb)Y
z@~e3ZD`{tl;g@h?jpG<svh)V$469O!w4uJ*xG;Wk7fDbp1<sJLbqPQ^|6I*`G(4Z@
z-3E*p>~u%K@|78~3q%7`Ll1{t@2=i#s&H2E=ZxhVg~6sjLugUqE&JtiLOk?vdRQS}
z)yK8t2f?35NZ3`rQGatyk+)&D##z`ax`hhB9sB1m$N%&H|Nj3=<Fn3=<%?KZW&Nrs
z`&%pEf6|Ot+tA0MBMC5pb3!)P3%RptX&TuCvH!#kj&tKTh1TN}T9Du-sI)sl!ObQ`
zZlw?v`B%8!F{71$JO}oJV)XkT*e8?Y#e+!$>=+I+1%b0Y`8w`V5l}Kr$(;XD?D;RM
z!hR-IypJTIP>DKN&6i2$)$G0F&;KtqLENw#MHoyX`$El^Ze?x%Eo<<0ytEF}C|d1r
zUjGLOV`fRQ4;oIP9Mhy%fAg#?m;Yg4enCI~o1p7ah4V6hl7+~~-wb_A@FGg2?jGCh
z7vc{(c6<s~jG9wRWLKZOpqsZHH;)C52=yoX;^_Pb2a7*Kk2{7akkDWUr`8`QfxV0Z
zf0+CJgR#_Xm+Qvqaz_^pE=Z)JS5MphLi@75>GAqKf)#4|n_+`IXDEeoR5f44u;15Y
zz?TkI(X#!bdHO1$56P))4DB7I!ZD}E0?1KKOMozH&mMOu8RdwV%g^Ejx-(sL=;Ga5
zOc~u@Dce;q+#yXenV<F?YCL8ci^HjsDt0_|CxL>TVxiKzpQ6LuEoCM0OKLmfqs_1o
zg?hI}$2_(R;y|yFzrkeo4?`~DZc0I}oGm)PA8)|-noKl^FpdOct@#T03$uS8rjjT<
zH2kjlC52tX@Qe^)-y(1EEI<7EJFfm+?B;MDPU~ZZDkO36+dM*Ax-hqyg3=oO(UJ$l
z5NMnf$;anEjI>i6@x>St_1<9}vn14xoVW0I^Cz|`a+D?Z)ft@ZnOV`w*Bm80kKc@e
zS#v#ey#L4>_MO(`C>>J$5<FGt6B)r9WlRKbtbG96^(za7ONhVkeHGV%&$sa}UP<%z
zi72DKd<Gd>?JU!qi<u0_$X-_j_v!kubS8FY9-y)~%Ph~gH&eJvHnb1WT2wsD|1Cgi
zc@s1o;Z@487G$~<&&wlylqAt|JRhR;z&}Uu4Nnda2GmL4IBBRLGpF?ymV!tB5>~5+
znBe$#s3gN@>I5-ZY9lhDGT@kA)_cWr9qEfTsI#%~lIn~~1-*0WbZ<)E-m2vA{?!5+
ziKcIrt&o>)XMkU5du**waLUO!jExjWtyDofEgE;7zt4CH%i8)+-w|GW=32Pdskirh
zwM*@s@U%6)?#vTtU1JZRil*bWKQ@s}aL_Ww>gGUr5K$P?h?g+Yi;$_mOYuycI!a18
zCf>(nxBUdgJ-DJ75jz88S+0MC-R*(Q`Vi^<ha8M9`t(pJE4KnBGaq|q{6_^uCPM+0
zrI=E{|3zV4U)a!_-zY%&+NnoM(g@kqTLY$E!&)CZPA0@)*)~p_{;rJzGX;M=e_jh*
zp=+wPMjP0lN`plmVBH%=w){B~3ZZX`rB)xbPMtLRM@J5#4w!b!yV2NbCor%0fpL^x
zr$P0h+FoAE7@w8L&bJKAH@OZv9&2rW;=?Sw5`;+s;!nK!Z_881@YV#~z<RQ^rtq>v
zduL5#=x0F{kc1*A$LZNJQ4460+%e{OyM2Io--vy|x%%=uYb&-Bc&CRHQwRfR)qo79
z#mLTmkWu4ght;Cd`maL!tTq8>D3mk+nQ%Q-`cLRpR2$AU@9C`@JDjJGiFl51ATYN`
zULtl9h>{>Y@P>FS=G>iN*2}g+Y}ia06iDJ#tS}&Yio;0v&GdXog9cOgHijFni&U6w
zCR5Q@yM4Q5D!~BaK-%l9(CH5ecZ{HG*Y&*l)1+U}^(^9B@}p0tDU@3c2-~r8x>Ht@
zeqQYO9u09L<PakD6SIJzd*}QV{?f|T-mV;R_c$@QAJ&@6C&RBnBT?a27!YC;Ucw%+
zVBM5I40~$?4XFA2Xg>1O+^@k-R1EkH?6xzPAoOEUdOb&ozd{&U|5khbNy*)BPUl0y
zE%}@w4JYNDp=$RcZCJ+^2<1-_Rpof<-{eA&9ef-7<ChvLGJvP0Q}Xl=!+*VVc86KX
zUDAyC$y$bmkgp7R-^llPoJLMVHF>zaGK`L;Mn$fBq&ps_?6}SggTsoG5NrnujLu2?
zh-{|H^4tYb5#&90N9C=jsu=@={Dgos)j5J$@ZY=NWmpl?UO#toqIs@8j*D9Biha>6
zMk1oG9bzywP&xS=P$R2CWluUE@pW&He}ljRQ$sc<{Y+d8%-B;`FZzUc#Q9!d`W-<A
z`W*ed27TzHQu@G*ojvLm$p=bq!sxqv1X?7KFM^cAUhPLKf%hv2F{8tUImihGYZ@xw
zq$7_X&vi^uS&1gqzOuaR7fDQ>Ag-qsdF1h``$J@TO_(o(<a`6LH*3Fvc}iLlP6OwA
z5fvo*08a*!Xc7;`98uzIAprMdiN9jG$r8evi6>u~cWq-nr4U4vSs*|n`uZDUk2{8A
zXXR;VolBo2vhcEaJQ2znzff}Z`d#mG<!j1WBy~hrTW;><sSwlV|A#jH{;akVzso0L
z&XB=?D?K+;MW3OdqY;FRwep~iJR2-GB_cOA1)L%Kg_M6Siqq~x;Pc{{GW~{DTQF(C
zcgw2xHz0ZOT3k<w=vv|&bSS)@zcz6`7U8X2hkgBvHQ75l@#M27twPvkE(NZC`>hMI
zrk`=^-D*LGp|G6LUKQ_7>R7>-_DU1W4Jt)HRS<ppm+*WN)pq`SifOghs6g-zTL_7E
z=>V|My;IKfQmGS7d?x8v!QjcKCN0?A=pDo8LaIs?Lbkg$TaZPdN7Y3nEudwCzI&U}
z>-D3dPp`yZB{Kx__$rg4=Papm`5c8m!dT}+$3F9i_(0wOBR(o%n(0AcEm$c$qFSU~
zlIJ3F67`eJoQPKODto|h{+<42^_-dIqm`|Pe6V@`tKD(k5Hx4#Sv*c;7d0n#{l$A~
z>02OVt%h)7&M7nso_^2CQ1Enqs63GpZ1FlQ+*%4LdpJ|{k~s<Fu3oVa^ldDs%i0nE
zxFTpcqMt{8I3R!jqqj;^rYidmPkVI^a9O{?%A18uj-o>Hl1>4OA3gg0H(Y7PzG-!c
zllX8I`PeOpKv~@5$wj>;h!4m?x8%Wtiy37An#D`szsfa7O*1w>6XiG3-yg$?-Pkxz
z58+(1&muSC>7}3E>P#sOcno833z|2xFdBctI}}G{-)`W+`4B2|!8t=t;1_#GiuVw@
z?X<lFJ=~VP503Kt6c9lJwwu9MLqirtOlrQG@`&}z^F&YDw+bFiBR1+Gki|R4egngR
z5$VlOfAs6))mRUSQF(Q+86?5@<>HS`9Nc0U;7~;)?@x>y#i!7Fo>S4t1}%|nqZD}-
zUk?s=E8d*vw4Uk~d62pK0MnIH8jLhmDftn1e~PE%K(TgWE?Xg}B;D?`vAu8b3tS&V
z1sKPdl^v39ovbxYGCQsFy65CGdQJL3H2t_V1Y4+ghU7DospjvYjPrqDCe}F^U~Htb
z=UXcgX-J;CgQ@msK6|bIn*1l{EFo_TI21Alb1kJI(}b>zDDX(@V<0K`KF3f|1UaYh
z@LE<lZ4#_~vV<FMjEB6xnec$wyMxhv<kbo9-;n#u7|5`cLLMEMApgQ&ik65luP^Ww
zdA0DA6-f<*p4K>bhaavmeHG5&pOHVK*@Rw1wh*L#RVSnR0F`-8@#+<A)3XWY<J>qR
znH!QD2$((=W#bcBcEl)StyYCE>;LxN-d8=GJ-tnW0ayaWFjVLd6-?j;oNl}l%6m~-
zF~wqp>NWWNfIFL7zO=HuD)AwRHSz2<7xQib#XjBv42yt7WVKEmq||0%Ep@-owSo;J
zvR3O$1mpBy<T`7)y&{!JQt}ls`R<2E43MYWS|>sqbeIPC)cqmlf{J{Qa9{dd@SSA&
z=IiM@9q7{OcFBnCk1V`D=fV`gpYe7)Uwly^HQ3|c=S)@|(Y_4fX;Zo7o_T9AWYt!}
z3sy>#B8lU&#ryhw8e0FFPRz^*%1My8F_?=nM(jc_*+#3|SxLn+?+saG+~XcMGW%1o
z>q2tT!T+}k>;atD*X`0Rn2CrozGaWm=wf~K!+T&&M0tyB?3qIoj52bKZZ0bXk+Tgj
z!jRqOiHT9Uiy|g7U;S*WN|PURQ93nKF(v+~pE{CYvzAM4*_%#Hb)<?>^JTCPTP<fu
zbj2T6y77qvRD2=78C*O;k_ZU}45<g(fbA$vcFZ~H)a-fA@u(pKJ*=0NRLg24AA4KA
zuiU8MP_9~3Q@<ibF~a8kBH)+OA;%@365SU83coK&LU~7H1b*@Qn$}X?>4ImE?drNC
zA@XhY`A7%*ycY=?5Ey6`!TM?c@ksVrtpKR2W3&hSK9oNu8D>AqH#;Bl2CGCUNn&1}
zHB?!cw>l<dOZlTLmg>>V?>bQdC%i#)PU<bJ5_*(liqyEOM$Ey9L1!2KZd^o_0%rHp
z_udbYF2g4P=srA|^J>wCRmM;d=H_?ty9CH1)@Ia2@TbTnQ*iJk<i1OWX`xjjGN*`|
z!pqV)S$bJrrR1XB3x;V&dH0hkpjUF80XCKJ`;ywY47B-6cIoZLSrQRAf~iQJj`p^<
zw2}M^TKoE2YoYtEw<l?jmZgK$g+}TpbJQmSxaf%%!ZC`5&*0-H`2*<)<1n?UcIj=l
z;V?^e6>WR>N)CFDrJ}DY31ArN!Z10mK_#|=m~>W#fa?IJu~@%WLVKMLTF`r6%>!h8
zl9%n$ZJ3F4t4k=%-z&3Yy+L)_@hXl(J-)~3AP;{!8}H7H%_(;zBo6vGbf#=z#>pOq
zLjj=z1??@^K>BcY&&$TZL>&>tdWGxcXy{`lvOvP~Ti=Jsr(q!_GOen12}^eL4Dom0
zst}`0z&e8q`l8|?zHD6!UP~SD80O6jw2_D?i(%oVpW_Y04m%~WIZ@OUJy%=LgFxS0
z*S7|Y3>M`bEse=g2LLrxsD=z?vq~GTLBiB$Ri)${+Z?HL4#s|j=`U4t{&AWpWa@Al
znUp?$u(CHAC~wdU@|NwR%h`#&Ed5UFTS<#?{?AwH#*m#t3cHqV%w*n#cn+6B*(*3~
z!>8BY`O?uZi3eh1vgbszCTjAx7+Pds*T2P+Khw8=UXm6>HRM&=a|&^G$+UR8(^>>>
z+mPzC*EnJl)<(SkRxid)zxwTg^1wb`pw=_TvB|t0GK!Vu$>vx4cS7B9%yWlKrz|Kq
zQe$v%9Uw#Y`+n~RFzc^#UCG=tEK*=58j%TN82PP+RTJZ<Lk1I5*YxY<<1&>kZDV@G
zIBO9#uUA$om+0~@dq4aVQJUh<!kcJe;EY6bCsFFi>G||1SLowYkE;QPa0W9XPQQ}#
zJrptVhx4)+jXYb*KpahXB_3Z>V=7rkWO*8@pd3%i9S+j6R+*oDeMMT1Cvz!<2S)~x
zYLLFc!r*Om{;N)K$;}I;i=~w=%tS~0zNT@Dd1{E+izh22TPSruP?Pt(yCRw~JnIbQ
zA%x$Gs7Wj88g@k`kpCtODvBo4wo;gUeovJJIh|k|G{WN@WW8C9+d^h#!&As`MASgW
z{N<6`JNoPYhBMY$hGS6)+c9R*O7$cO7X`A1ktFU3?>9K*%J_$z|6l>Gs7I1@k+F*K
z#k3LqQy&rp%^p&liHe1iRa>vg@#>S7X}|ce?KO0VRE2GjRUF^Wh;qry;I>*67*`6|
z_RDUf*4u_ZP-xL~-W(rW_XG$C5QD;HCvAS8QLoKiRpL(KRsFSdW|R*VP@d*~H<*Bf
zdZ#lnnAJj1kgo&LK-KHoM~Z*{NQUB_E<Z1`Ad!}K+~*mTOTO|2kySAr5?9`uFV{P=
z@4qOqY55DTEZcmik0|B;t+r;ETZ@^HtEPr{N)S==2g!A4!|VEcKWpBTcxG3$hbZZN
zEZZZDdjp%Gs|`j5M}t(0kDP^!mPUc{sKalN|8i=@V`&QA<RJW1$;>;?=LCI`uJki}
z+i<f;G$tGO%F;J@v)esIt={vrHE;OAH+lw(?!XT42jP`c<6snt;_e@SalSmHj*)oz
z(8n+LmsTE1iPM>zJaC@$mp+$jlX~k^2{NYn`rc2u7U^CX*k-=B7f`jy=nA#G{qO1P
zeG>1&1ZCdIH3$7-_l*N%yY3ydE>`@qpEEy`>q@_VSE#KP^#)OzZs2@W{jVtL&#wp>
zs094m2>;c7@n%Hr;1Z3y^L5Q_UZGhi-*DiALAfAL3u?ss-)a;uCo0;v<b7<}T9Kw_
z2vT($3Qhm&T-#D(-aJ3F>DpH84fuvQKw3X2x5+ZaCimB(xyl4>s~`8|oEF$Ivbh${
zSN;9l4?g1xL}?$fYnTjMFbW5#RL-2A8h6NXBTLA*+R8JYy3zI3F=&`$mo%%)!^D1A
zhF)&hoMnIkyZ(?z|G~EZ`=aCjxenF3mU>F9jpMif`>+3Vht0pQK)#MJEf~jYG1xrD
z-%Uau`lkSe-R%B*mm+v|JS%53uq831`G0@le{P(?_Ui9H8&1N35+IO2hqcUq8*>u%
zPX+IPUV*%e7~F5gN^kZ5XTtoS`shEO`}@0bWiZ&27qeaer{T!w#s9-Y{_h8B>mgJk
zwl(b22}5MD!$#ofkt`A5cw^x9xQ!U8GEPN<DULZsPj=qdAH&{u@lOx0BU(r4391h@
zE;T<7!i1-hTV~V$&?@0<tE|2fSARq%dC9w#Cjq&U=AT9y21Dn$OA~lL5`8!?wkno*
z|3}yP51)eqZwC;(I~k|?->AY_4;|q}ze`5r(9Jf>dR@4>|AEp)d20<L*opV#CDH1X
zBkLJvo4G*{F@y-=-+}bbg?(2~zPw`yC-AD=2=Ad+(uBD1f@aP#cRE3`B{~XykC5nf
z_M&Ee$p3uJ9JX8w@0n}!gj!qX;ghG_^}o1Ng<kcg(VL5X9)%7rCw%QNn3@X?^FJRF
z*MUO`E>M`)MKBpfR+}Pp={HdfsPN2}G<v*B_M<V5eoWIpq=X#1Qgg8-+T$6i6^5c=
zbpfbg&W3yJZEmQXxh=<)Ez4OR@TcDVmFIA8A2w%%{Z2ZR`$6P#Kx<1>)-P%RXK-lf
zzDfHChCM&?M}2DkE@(@@OUd7e_X*ms$@s=}wv&zJVfs=xe)WIi*Ha&iq1w3b>?u0!
zra#|b<@vNqA^`US)OvYFN-sOMtP;|BD8|A$8%KR3mow5!i{<4*kA7PGj`dff(R#<4
zs{7;Xv&M1CTRU7{$>R>;+#98dWUti@b}9*Ews|)6Q#}Db5g^(XFMjxAcseS6a+ZKz
zQqT(XS=T%%4ZSL`T}k8eg+@_839F|}?p<3>)=6_x{;_nisP1F*n+Lw9K|j>ZIeJNu
zWF^PT+2+O!+a+0AX_4LSm-2=@Gt{XI$e4Zet5uQ+$0VKB%(U(M<u59#(;HnN%ppuu
zQee)i{HpVI;)gr>1CwT3+cylv`!-NwH05AAvmCs&tJ2KfeawVC56^tBJ;l*2m@!Kt
zNo@P))FN&mb|%?C4wuR_I<djcqVJ2_%9_E-ZMSH*oZTVJ<C)LWD`t9rgbkgP4I91X
zxRf)|d~+qCg4#5X`t*NqBtm__TEb&$Yox?VbDm<#)<2`Im~p+%M|?abfDm0yiOfu}
zda^J@`uy2F<NOG*sbBPTtA-5KYHZ^IqIh>Fo){MBdz@ieLq%0rH)m#P7VoioWD29-
z2ho~m*QB+I9OsfBetUPHk#^<Tt=Wv%4<>eXX>Trfu1s(q4!bhGT9Cj7aLVVbS#h=3
zpXu`r-Siq?nE&kgAw=V^E2~Ea*B&sxRBbroJfr6E^ed$__1hqg%vg&~kK!1sQivXk
zqwg#@+VoT0a^JQyR=@~Jd9O<3>$%gi4D?X$5$Naxa*6i3i1+@QRxkTl=dSBLwoqJc
z{%;>#{ok~Hh!Az{6+}g;zb|J$FN1NyWB1x@-J9p}6xHEKn;zkWT}99G?!tX=G++M!
zm=mc*f|ggDieo=FA5&%U^>~#$Wil9{?0be(X%c~WFEvD_N;7Su+`#RZ)u-6<?|c!u
ziBdO`Osq8;ixqO5kS{jt<ISA>^*W<OhwmmgPgQ2|ng#;yNfr1!F!Mxu*pq&O(&wC0
zpcJNetM~Q(^ZLT^`fGK031C2S8irIxazAo^P88nkt~IMWtZ?AAV}8kKBi8im8fPxC
z%9YICf%`r-HDx@l=|`e5lW?ytOQ~yShu#Z18<hv^I4{}1ZMK)oC#fX)jn+5bf>*|s
zi;GHp%wyqc@EdV=s4cJC`Jj6EsoQ+AeRE1QV~J!>**;}dbXPNxD>u+vp620@bx`?U
zIc!Yxk<{NjR%`4uMlk~dr%qX_N&nVM&a0Ku>UQh-gb4TYGb&x7DqQ%L>&_IH;cL)@
z8<FwQek}j=?uC1E84JbH7HhY@`s~`2^mrycV@TOFa7>q52^m6VpL4vTp2YTTW;jIp
zbsbSs2<#s}NzH`>8Sw8-dR!aBOL)P8im=~`PKl`|e-lM^`jP6)q(f4!^Pu+TlepS+
zG+xFQ1)P$@?$N#)&l$be+WoqwZAXaBR7Fs-IHGX>sA=q{uMe?TeQCUF|46yU$;zHN
z?Su%$^K*Wid7?q>Ifa+TUnR5SNSRlOVqnUjqsx-c7wyMydH=M*W3Jzbn~Lev(r{TF
zDzZ`Sr=b0qlP=m)`7h~&N*K`$=084Um9%c4!6KS9!~v3B3({P!%1*BmN7C)pD{$ce
z6qs>BwmVlcEvFZ9_V42+W6^$pQCSwFZ4H8qH~*SG({j#XC;oj<$&<k8OvvmNxN}GA
z#wO_)fgIzmM2{^k2E$GguOzD!eHW)1AdF^MJq^IU%{Fg5avKI}HOM__mLKwrKZ>%Q
zYInQl$^{~RwcRM2mwz-xM)>rO+Kc1gB{t)o;rym<S2#Ds*oMO>YJ0I#bwd}ZDxJ0S
zW$wdWbk)&Xvh!t4TBkt@;NAm7?&V|JHsR|lAitBOZZ(KMz7taA;@ppeWWu~{P;KrL
z#mxVQr?(7?>U+b5f8r>E<N(q!LrRHs_W(n83kXPeskDSJGz<+QH6Yz0jUduUcO#wB
zjqm3Fo^w9Jb-~PDYp-WLamTQYH+5FG#}DEsy11$lZ9XgC_saQH+A6D^6ueus0Gk<p
z=j#Ha@-G4_@%;7X6~@cB&L3m|0Oo>Yx1YpG+R)pjBkH9QBd_GEP#OJEBc6I-3>1#V
zd-qyLpd)YF)0al6G`Pr#KJcYXf+a>mK~_r$V|2RfDKi#AZZgDCE)spubo-#@&;+QF
zNmM86&Z}UH?%S&MtBReCf5T*0r{wn=BT05>?5+oV1uTfRSO);ZCZ@9?3yWRaJ`tS2
z<Bm#|Ng@`1rig#Pr~cUzrt{v-GqRJ*{oBeT#z2W{`_yk#w<85s*l!->rS_Ns-e!%O
z-;UPIdq4=r>JT6Y)c`^qbzzgPy)4h*OJ0CfK;6+tH0vvtmxT2_i@cv%&XqlZ9?kPW
z%Z@1Dl(ga-z6R9=n*^Pmm2V(T5}T%s-_;_U?3Bs8SZ_3)zE_PKOLG5BHe@2(26RRq
zWM^XiPxEHv#gF6n9_($+BgfbP_Qg!*fydXqP9n8|7{KK{Fw(E)!~dX=Q_UNE^7qS+
z@IOeNN9Z-1^RDX+4?dfJ5NUZ(aXWkgFI9Qa{&BAPbkntPa#iEGR4|QYO)!`bU?LsV
zvky+R$A0bn5k7i<klXhD&&hPIi)Ap5lggWF49BmV>_l(*ujbeLi;muNJ?fR5t2~Gn
z<veH2>3Puj+G`l{b0W#<Aq8X`0On^gW(S>ygXcA%vt+~1{YO!+aa8i`zCs^BJTKHd
zpj?f}EgA<OG=`*7j8^zLU3wW)27=lFu6?A)Goq&HPA7gD;dI}Gw3I2zi#S771E7gz
z^zi+BWBS6DhS(z2(TBQjei(P)S_5#~_IO$&K#X0^&zfzr`(JX?@Wn*$LXPIUF(Ywr
z<Q<^E#nuGK>V0#>9<wYteQ=xe0h|Lxrq9G4Xd_n6zJDmc>+C-8)oTC&mP6yZ;@@h@
zZx-?R+y4SCYQjwqL`wbmMu9!^_D_t#j&e_c=Khnw#^L(j%|fQ#1m1?<&nEtA0xcjU
z%RsF6c%D^O@1G@!yD^DO;|JN6mR3i(G+r#_9dVzl<_Vv%cixX5DSZ9B&#6lq#Y=rX
zg9Z%&_S68RcqYry&2C@lK~pKSW%&tbj}ss%HT9+GfXM&0J3#d2L37RVyBJ`9T1|Ty
zNNZ!9_#iejU~*QhX9@Vkuay3V5E=udEeF3AaTCE%?sJX9mlgjLCz$~ZUjOwewF25F
zpKoej_E+wF+gU8_mr}S5@Qt*)-zz>g0f6KMAd*iMDjhgEJq3op{j>;?8oO2eZ(I|d
zKuix}g907H9!*OxUN0oBGd&zdO52TmCjhw~Q8Jg1(F&Quy?b4t{ji5MVdAGb{mtl+
z22S4<Y@Y_Fyo@0smPr+_2cfV1uYitSQ;pHm^<>%08&pa&gU{b!+UEUl%$XjVBy;U1
zfJB9p=U}-hZz%UW<4y7M3pBGB3m`W$#>z=OR%h-psvpDTJh@&8jGB`tbyP7x(%F%3
z81U7L;Z3(F#YhOkc)7IZcY2w<z9`KRQvNMqxpu;@I_3e`Qe{WVrHuRC6DB$|H*Ass
zgO80mTZ)L&s(QUBUw{OkLB%D!8bYL+TE;`z^y4$2TT)|eMnXO0yTMcQAl{|&vMt$t
zm2>G*ss2MV05u*2d@+6Y>i<1>{mo=?JQMa`llMoDF88YWyPZEo4ErLj1(h!Ku0~;;
zl})bk6F}R~)>0$1{;2{{&4;{20MwbFWxe^~kCXB`9Jr1c)Jdb*uY-yXPEyt1%0%zf
zC6FS|@E@pG{zhUJj^8hR`D!iFe!N}WLNQI*RFjMjI~2U80f3u^-<<+0m}DgMSnT!?
z?hkj*IU(2G8Xyy<`mSki!Ku69AD|T`N*P3(r@lcgW0(co`KuRjrZ*Qsktc5+2nhO}
z*UUUT;oj}Z(9r#M*1heVA^U?|k~Ox+7r^$l#yKC*l;msbdXT5t1H>7PEhhwocq{d@
zxe}}Di13y}ibv$VssJ#5tffGGJML5UzIU=cHz(W`b@V^~=k1y894I@6A{Db4FH<30
zKff?uD%-p+!WR9%NPkD1?+t^F)%vuqTk>Puf{`7l+qBB!^48B3a`*tEbQ({_iyjc~
zZ2O<{UB7wYsH}V`^YIFPfM8R{^YY5S&14FMT1v(5E|H|IKw$1S|DK^jlXW#<Z6y-j
zxs{^xKxw9ypVTW#Z`Eu*U%KC4`u$nJD=H@{<J8vs5bzQ9KL@DSz%je+F05;MIb}8z
z&+u!?=a18D@MrmtlC&6+tqz97QR8JNlNTx4e)>4u>0HkQS|ua5CtW*#I#1Xvz>*9M
z04gKjtQY$u1&y-ao;bla75&wd$&G94c>~GO;D802`1*C>Fio)yuGn5xiM!KJe9^y9
z<{Z%l_?KA#(Vzwp45KZ<fvy@Jxp)n5b)RP63@XWy)BP8|AMxN1Inz;Oo#prL{L4kN
zwbO?hhd{uSQNZ#+i!Zi$FZO;bw)#PAC8_COCvOG-HQKhov7E2fSh%YQxZ54VCAPp`
z=FfYzbI|ZRi?lxgNN#5kMYqwd3PR1U0vc|3fOF%s0c@cSzc0lZ(lrM|=8^0#>zDu4
zyr)r5AfA}3zz<=(>jkr;KnzO6Ajofsy1t|?H@gt>t*)D4tITf&Hv*uvU`K@hGdM=Y
zv(pJRyzK@1Zch20X==HbG<g?TH~L&Jx-SMFc<AYg8T06z&HH5CRC^uwG7kdF|D{w6
z5X31m@LEqwsO>4nYFGO6H!pFUXQZKJ5tBryt+wnQ_6k7T*X@smzxf&`oyno(&HztF
zFMPczbF&H|WO%arWj&`dc86Ig=v@!+8q-p(fAKj;OJp}T4`+0vy<YUc*@wSXIIbOI
z+U9Sm#^!7ucBt-UYOxcyO#vsCRJ6`213`vk=n5co;3O%liXL?)SLuC%=G1s@S*V$s
zq(l!1@0NxpaxWpPz3;C!n;cqh=QyM~I<TdJezEWQ4|%S<{AX;3PaG&EpG{=WpyKjv
z&jxb3Je}cpwVv|SJaB1I9scr=WJtKEJx?yWFX}Ic{mV*xhU3w1{hV#{r7h>PmYxb(
zMj9YwN2XKb_N+H%k(_6SR8^{260JclT)}g(6?n?#NLE7nclt1u2!OL8uka}aNCEQL
zK!D^EOATN<9O1Z(Yw#xUNu!-<V;VT#GTESD_}$FMzhQNt=>fi}*H#5-R(-x*K8GzS
z@N#ADUIr@w>1GA2Lx1zvCDQSn*t;%<T`$!8V4um??t>w`B~3G@5(+<Fo%&djy0MJH
z!%>Yx;Qcl_bup0iNIfcnh0PG{pb7GjxLkgGGPiVJXR!&v?2*A28&}`A$PH&I9>=Et
zMJAtNf9kp9cTuJ-@r`Gl9;#DW)zZ-*;4oczEMnk-Z{AC2B=u_d+Fy7UJ2aPnkVR^A
zrKHa=YQd^My$4432r?>9<HaB}Olk)B{FcrDzSMK+=4NjL*=Gsj7XXDx5vSxMEk-05
zf+Xp*$2`Im!p;VC%;uxWxahDp8I2=GzfpO>F#GsD+z~m_N*lmlbHhCdTDX1@Yn|rX
z=Mbj@M6iB`ma|E}ZJ8N-SnV9MjDrs5HItW3(hHWGvh9=Sf$fr#bawg0+y;2X)#s-7
zMjzIb^iXHtjEs3_e}%h`$dWQ9JuNZE7ybLuz^*9Yu(9UBmpvF?ygu)x|H+`h2H?)j
zpJL)&<n{0FVz4Kke$Xt8S~snQ0>ZQBeAuu_!*jD&laUuA+{fOuS4|!L%~l|A!o+8v
z5$^^#8|CP{s-E|Wvi^VBzkB?huCnlK(^<ntIw)bG5ZQZ{T-S8xv(-wSkQfD4q_b=a
zi|psf*9QMyB{9}_;=_u#k&T3(xK0Lmm|^LF-@F`QJEYnPOz58&E4YgM?Ckf7vu!1m
zk}#-K0L<X=7RqRqW|<s<__OfWIy}Newoh_L<38F8M(+JeWUo?&j&NQH8^VBI_^6=8
zUK=;}O9XAKnIxm2wU`j21tB8_nUC65pPT6*NnDrYHFA&cMOrt~?FqZ(o-4cc>^JU~
zZ(RO~JuLJAY{Wm$IDTI+JXo;Ga2x>a-Dfm>WEbJuFGsp;mKx;Xo`#A(@qUHwMmaTD
z78Ic9@1loHk?gLSss=Y)l<OIrSKf|fj~Eej*7C_I9nJ-Nk1=vSCl;TT>46?yQ~KA7
zQ^3HX*K%tZiktI)IA5I`7B7En@E}NbGv4<L$-ZADZ9|aVu~jrztH@)x{i<l85qk3U
zfcrRGVAdutrt3+d^crqP2%yx+d}ZW&^lR?a(43N1!r%AU#N5-Kpik_$hu(dALvk&L
z7K6N-4re8pZ1+KPD%r^`lel%@bI5>wZa%dkvP&};9bO~dT?!@+lhhSfanO-uajdL2
z*>Dd3c?!5Ej}5Q})kc|M#xRchKpbViu3hg|;2$K^hw%{&ey@&3Ji9(fPedttY#9OE
zXm}`Lnc@T-WXqiTCMek_Tn@(iO8`uWL~-wLWH_tlI{DE9l&lVIEfpQB3)X)QEW$~u
z_~}m-7D*vW$-^(nt_vW|UJ$i56D$;t=m~msSbL@uN&TBK<+VBgyvxtJk+Wj>HV`CJ
zLL?gB0qU@+9N*`w*O$bvx9Edk_5nW-@+Dk#xD%Yq2pB}Mz%bjTcb>!(uy;{S?yV$y
zfRFu*OFC95_*_=#5RCjX=%-TRo+Y$0>Ce$}8eOY$6Itsirs8@D_~Msur0w#<#}#mo
zoz%q6w0Qt+hyro^5zY;!Da9c&F>epIAK#NOkPawKF9hQakho^qI|O^UCSL?qDZ;~F
z&Ia>hd$31H9)Z!2O3~=0FX~BU#j=t0TX9SnUar5M>!E`}bt!M2v&cS{4lGpE)RDGd
zTX?@@?Cp&i-@Di5E5o@yrY6A$*OR;q>H1i+@o-hR=>r*~u+;ptM0xF~;xDz%9+I)o
z9&oK1{PYj2;q*rKi<UX8fx+s(3XtE)l@e`sR3J8CbLv41A5~1u(R&^g0b~|kQMUbe
z44$7fTU%arh*kjMJh|Ws)r8H2siXg1n(HKejd2Cq-yw2$Iu<#oz%F5yL4_=NllVjA
zPZZBU+4seC2P!BQG>O5}xet@ptp_Hvu<>f>@~^U|g+pU1n=JXZP!#saESW`K)x2B(
zI+MJ^9Q;MT4o?usJTEEfV(_Wwe7!3cm-LK{t*Y3?yt@^ksvSc<DQQY%*vujH`RqzE
zI4@WWL>y=Xsz%0_D)Q<?k{n_FzQ&V^vnpLL{b!Kw0+)D5u!yI)XFmWOZzY9)AwRqT
zMe;#_`#tO`9Q|vvyy<`<LfXs}G(*6RO!l`$UHJh;D(ApXNd=9^X_ekj+d?LK0>XA?
z^Up|@Z-)d(n9^v^pW=UVx>v?*QgqFk&5`ePa^>c89IAeiYyd=GdvfZ?Z?qa3oohH=
z^RxlCt~JBb*tlZSia&c#e4AgvvpUlgiIfa1=rkL-G(+_NGZ-MJ6{_=3f4WaZUUhdq
zM~QfhME)W%DqXL7&?+?AVh4>z3oK3prx5GslQ{Z|MV|`6#A$Ii9!f~Fgv03cR`cHu
zY-%EsTKUsa;H+g~anUe#5gfmW?~)-~0mLk94w_F?tHkKu(0Fgbj%t96IM%A9tgde_
ztmWetK`y+9^ukjvo#vV)kc?2HmUYh{OTLY)MIE9JdWsiV>!*t(D{`wv{tMFoz5y_G
z$1g&)LYDfc3zd3&=q$s@j-q=si-Qq9@pMf1zY77v;#Bid7iDkjCth*{IeamaI|Z7f
zqUqp*r{Ovx(8ZchPCSuP`filY=L6|p(fC`CG==kLHNmfi-{|9H6VYb6zj*AZ7pn*v
zej8DM%scVqSr7Zo8t0IYZ8eFKI&$|ZFcsaL9j|}RIk!1lrJ9A7w-K=?e$X&iW|g8E
zl?}HYGWp~4NxG3qv_B#`2=i`bI94O)$pT*Qw9E9+G#FwA;m!)_o9k9@GWEZ*%%vI%
zje2eK#eG6YFPxI6Ejrc3wAn0#NQr{?jVyu-_Wj-cN75xVg=QUA>?s8no8vGT65T`3
zh*(QQJ?B83d3`y|G${eUzz7bML<B>58-<<z3M_bxF%(O4f9Ca#nM9Zb7ecG0=@jrJ
zZe41Co!fNeKJh+xgF}^u2AYLdF*Ml-4&X-0!lGKzsauBNM1dt=9tG2@2lY@zxCe?~
zl`90e*@ayvhXrzVMV^0+{#bIJ5&|atTu5MUw+;4?PPQp_zc<JS_5SNtrjy^x{d{iV
z-*e-RnU63~f8DPrzlF92JLcebU~Eu?8TAbtJXXr6BlgClO9k8sTt20iE(IL)6DhKK
zxe!P--5&RgknaqOF@JWER!UbzG><VC5y&pQ3C6|o>V-#yc<2LVlY&eiW;_l&GbF@e
zn}GGOo622z?t}Ra88!%xrJ>+YkrWW1yH4kG$68WK=ly9oAuv0xnCz3LDS@%)&xCM<
zfyan!uq~g>6hX^Rt1i;OKMRrw1<Lk<qLS|4xaIv3B)!+7Lp7&{vXYQOW(=>+gI!{d
z)wbxkR?-{N9-qkzC-${BFvFy?%B>fuTuoPUiCwlE(7B2rV_{TEKK&%Fu1-BECM<@D
zE+(T~g^;j)Q%t9bRqOeNiY~1%OmPR0QC}G@8KP8+Lp#ntGJdua+)(Z<#0Y+B7v0n)
zHOrYXx!>6#6U5vpa135}#+xJawIw!~i-l!OnP*U43;mM{uqkkK?6xaHfUtsWtCBRO
zrwJC8bvv+RNiRX+X9po5kNp!U;>Tl1HG6DywD;c|in;F83w#!gm1Mc`VZ01+c%F3n
zt8)W1fx<*M(rMLT^bUqkGEKC-%DQ|AM32xynkD3-6ey<B${tMzsd`+*4Ri^ZbYi8w
zhPdLnxIW<R@EMHI8D!zJz7&25tWunUHdcz%^p9AlgiWshhXoMg7TUYgeM@|3`!zEe
z{r9?yf-0f<XS%S%^edj3w+WW2YMQi&v;9u~CEt?}g7)~QQp<WJ+vq{zg<y+M5dm-l
zi`}yCCe{C>hN8f9pZRp5t)_p>IpulzZ72BYFuxKXhWS~KfHFj0(m6^0;n316FrB_w
z70M)F^c3tq$0!kX<WGk7&X!Jb*>(CIq3(mU-8hEc9#yP|dB)3<*op|ctNQ~?^HBe{
z2GD9zka$i<O?@oy^A1wKlf0*BS7<R-;4;O?<HUdaT?{G&{FMF%mRV4#(&)eKrG;_K
zUp~-<KRe=kEA5x8sce3e8v6Ko?7IcesW+*WEu^cD@QGm6nqIFj$Uoqx^1Q2AmY_w2
z3_fOf8uBD3wYdT)$P^fCZiR4HU|5}m9oSqlwo7p%-}6&U8aXKXOw_ix8Bz>|F<_%<
zPsmGdoJ(2*fJH{iLz(Q*zeniA3FG$QvX}2VA@U6IRkGwa^yh@&%%)@#mMDL)Ds!$+
z+0WoYBH*p4SSCmWM;X7HVKfLnF-(`!U##2l_A?azCc<=H3@B&%<OYNWNAcR<UoMsZ
zsidSLDEXS3MV#q4XDJ@HQ@vj|_wJ-{xg`g&oIAuUw(W~cF({L~yalMHQQ(vv8apsZ
z5zc07xqiap0aJbeKO4pIWh)-w4Eaz0Dw1bNMty8zu*VQ8&B9tM8^B_{i|zH$@30D^
zeuud0kmUU_-$X*`hnjq4NY$#({=psmUS%|Jg#wG`LVWaNbst9Klnnm>pqb1u6tiz(
z^LeY!p=OZ3NQ3^vBd8qI|1bQPKEj26!(^MHEByO!LxcoqFo7MH3c=!*EkhD9xm|l0
z>}zThUuS~7eIoUj3jbJSqKXB>3(`X;|MW=Y@ay*=@?CM3Oj)0eBJ}b1w!5<RZ_~ua
z7ql=fJh9W^fe6o2B8*>CUOqrILOA%Uhjt2C7xFsCfo;3Uzthv9&vDUfVt3b7m`gX8
zRoa`p%I{-hoMx!S;imAp;ofHW-)W$fA{u3gJ#qrN5guHtFUEh1-|Qs}q@^D5v>{`p
z|I)?>fE>bO%$(JPh0u|F3ZSeZ<UvyI$Vrq|#T{OtD7+VXk4MTyN+BPIlRk;CAq)}S
zzBy=eKF*tpj;57ZRG-viol+8HRNO3w*zM4m-&q?u#;9lgogH(Ys+)5Xao;KWT7W0h
z_c*z@vp$+Y_D4<ztq%edfw9D}qHT*qC^gD*!2arXdEwohqt(~UcXqaN5v}3-c}uj5
z922h@xU??}uRmV5YNG^s<tl%PppuG-BI9|dwn*2~?76TxS4)A~%XA~}+qg3moP+dj
z{NVlO&PY#Kg-ID$8}g`is#H&fU7}$%f{kh7?Q4sQjxYH(^p3x#Mx+555Dc>gOIjkz
zctzUqWrn`a=kb=#l5ZbjS{2tWt9#(L&SPq<>V7L4JroN2a2+axl_|2A@K$@XUEvqa
zGxMN3>tvpnyDb~f8CC6L#j7NYUD}`2(1@H0`82A+Ei7`aW<=uoiQR5TmVkCR&&bkG
z7u|T1HA-O)uk(bFVCTzERq*hL`Z~2lIS9k3)q*!~4gay{ZOm0<619v#gZ0|?V~ZEm
zZU*e(=)XxbM~G}@LT-s?KiXoD#1dcm3rvJ<1?%jh9C_H*#$sJ05lNqwr_kfn;gu{A
z&_{ZTT}R?D#O;0d;~F6Sct0-hLqzkGAHK#L)^{))O+>>j!OgsJ)86cUCb1`G6S{`G
z^l3|4$*Pa#tAkf9!@VN;POYeYZVvLpv;4by{4JfNk7Tzy-qE#^O_5b722a15S!dbo
z=YhQDA&1R(i}>75aBKp;WeSW;%~=%z!Aht2&>e~!>RhqcyS!qdNQH9~aV*rN^||f5
zMOk5?65dc8Wi9Eb;GcBYKV6I(*#23tc$dO@d(q#-0ZT{in6wP>-1jEX=;fe~x$|5}
z|E5h9yPD{pT!m6PZEYmsFZBC!YQuU1{`>vQNi*2gdz+DG&Y>p!t}9-S!Z&Kh9TTDo
z@2Kx;jcX}01Ts`cMc(waruV~SK@5&WFRPjs;=nB#t5=4h``!JV<Y7u6B@0}b@+b=+
zpe}esqHj(jcgpCS0J2!HM~q0iu)k#&Z{6`x-YP&Rb2vihH?L{Pi=0QhLgM68U8kzS
zgs=Wc2FzHdJn260!3LY2{$xv=dngoa>FZ+WJFW1{K_lgHN6PSkqf!q$O%KdYaz^~{
zb2EtA&kk|Y4%}E|BHO<6aH;@1P!7C`dS%8!5pi_gX`(^ds>)S*;%D1dW4q>=*<I?V
z2TK1aX~;SV-Hz;RR+3<pnx*WlP4L9EXZ+1N<x3qF_g%ci@kVHu8MCDHRNJS8K{fIu
zpVEEIsvQ*(4*z&r*GZ^^vmLggPS^Kq_Mrx}alYJmRzc+T*x$d8@S^g7-e#xGC|45+
zuZq{)vzXxtjT$F9ce#!4cw8hyAv7WcH@_T_%z0ORUUlAD%k!U)&G`mgQ49KLGn@7b
zE&_Bd>t^gq%4Qqg0I^Rd3eT##S-X<7-SVcFu&Uv7_h>KQ?fjH3{`@J=KuH{p+!2N)
zjCpWS=TfRssKfBda!swor3M`ZRHYsj%x~#h<exxd5LfV7)oZ%+uPSR(Y1{qv>Xb=w
zVnPzD1*J4>KK30J$25DKe@tcPxO<z2;Ke748Zcp}wE@AHG_8TUzJ93!I@VM(&EDb$
z@r{&DaCC3lB34CO*}_v!+VN`b$<!4KTO2w1t=RQeo&p&Q4K=?wazyHF<#w8R1WVAu
z8P>~k>ly}fN}>+&e7F8O-)ilb8MqX88vU`7i^Sbh(BhLk^!pbWXOd7Q3v~FB=|2K$
zpVYydu6!E>I=Xq{`P3FtOv}z)$or&GhQ9}YM8ikaq+~i`9iR(M8KZT3Q)$+d(=~+8
z|4@uj9`L;G1~Y&<BWeTkQy-C;2@=7Gr8n$%T^GJ)`Ogw&M#w~NdI~RcwqDXHn5wz!
z9=+~m*|-oXGh8LN{v3}szH--z;IHz5efQf4U&B$!_*plbrwviSLNzblA5_G`6w@zn
z_US~1L0CjTNTaLQdY}d42-CDuT0j3<!CR+>^<*oc7obc6N!{zF4G<M7RCH*(p*r~5
zhs-)(G2t=qL>3N9vlKs(GwORDT_#7rzC}mK7%z?bVf|u0%EiF6a)5IsMJ`UCUyMV5
zIG(+Cx4mP{>8&(PajCdf66FT9Vn&c^KssafGkoPJu>KxT{);DcalU>K)>((T87uQ*
zGD<m;r>w__ZR0%oSUfq+U*|QwQ~7r<m9qH!;+>9Pk5uUq1+tr?6=#X=<kSt{N1ANv
zw;_1W2UWuU;vW*?34)=#Pzj{d4;Z|!#u<)(*!J%}O<fW1lM@!<zu}j*u~#50>Q4@D
za8s_|Vwz(S!nea|6lwsevyo+Wx9Gd-tc|%MRdPH^WET6KfF_ba<}7RXJo0_7zlq|f
z6yDu>QZvJiVAC(N%oeG7N-rDGsVL=$5tgr?QRp{BuEBYc5_H1~V=KlXAgiWm-oDSD
zy#tYwBpRb@i5Y>6h+}V7Oi#i}L>xW%O1<{T0^xxr(P_&RPn@%Rq+8jr+6{CO4@s47
z-8duWxh#9$6x`L~+6HEylCiFcZ%YY6?=El$pLOsUpU`8Ef(eg9j@Ul|F$Y>Lbv#>X
zaHM6Y559XfT`I1)Qh6^3!*<wi01U81Sz2&%+HYY0(vIB|1?uv*(LEG%1@Mr|oXUmY
z8o6<|4S3<*1rZIhAUw3`i`?*e%SHM>?PQ#PE;gTFw-Rs6IsVJ=Xj8MAkx7E1wl<0s
zcx1W_b%3S<<XWuIr)c^;oh)Ef1D7cUTc~wE8h<X6Z)}>PLRj=ZYv}&5b!y4bFId_;
z2G<344$j$RI)qJ4#+H^u5L~e$8B9p$t-y7Bw)+|fihxp?^G+bJ+p#v%;nF$zFx#^H
zmYdY#!1*D6NG@y1UWNAb#~arH$NH7f@UfAJXU&^md~kwF%l)JFIY-!?bn@a3z(ukI
z2-|o$+qX&D(N)O9$T~=vB!$MPc83oXc>mXWvVC3%5fi?{DpF5uFUX^gXY#krTHgpg
zcBaw~U;ayaIo&EXAo;p8#1|#LT62%yRF_M6EP+iWfs?3u@~L6851+nXHX_QtmB0uI
zg-s=V!l3sZneu!@n@h6?Tj-X3=FBkK6`S0*lXvR1ClXhlw-m?iz;I~+`Qp-n74|Zz
zbC<zcUBC}H^CN6FjOvlRDfJf@ZHbChp<nMM%S1q<weqi}%eKSH57Z}{o(CQfAn%MP
z4vQ2#sD~@uqWMC`gPfvy-%KjA^!?yG%wWv+0?sKz{Lj`E_Ue(&18c#b)x13nm`{XF
z_r3}9neGO8rYrK4D}b<y1mVjCI)fr>aWW{ss>A=V&U7L`$J7$A*ZKVI>Bw>RcAaxt
zOe&X;<l?>`n4+AUkL1QZ;cn`Dp}ZTlErPb^g6*XWH-^7So4?lDy!mXVE&h!B=1rwK
z8)+bK?%Uy^KJnwn@^R=-HUbTX7&Q7Mk0p+whkEhzs(6<#{JPWC;<B-&4DpHts&ci@
z4Mb_{#Olg#A@Wpa2%V$opQ0Hb)or=UQ&^{`-!UAqEosroRxBH28&ldo!n-e^?|QtF
zfQyRL)uo*gg_=yV$xKJd4%rokXqFu`?!FAxD+EV!OzleS0<%n4)Kex$Nid{<J+moX
zvucuVaq_!k2-s1)1|#-7APoLBayPmT4KyTPVkpL}W#A)KW>E#_s(5*VV!{&z+0=NO
zamT1^F*J@3^%Mb_{P1gT#M4}TnLDAYwFIMDWgB{?Z`u)xgAsLWA5|%yk|`wlqqX0(
zsf0s&FD4BuLSMs-d#A!qJU8n5mm(DorFXE4*yF5hL3qDKd6$n9Pm?<Tiu_syP#r2@
zA$#<}<y+Fj=)_xIYnBU0{vW;fZZ{L-FFvU*sZ<<d-yH~qgJ@p_bu!D(bG}9tew#`t
zwcB_97!n>+M6!CGDf4oGOQ_h3)GG^G#n{rb6hajT=VqKIp#QWVHjm`<kF)3{{;r~M
zIevEw*c#ek^(Tf*GbA~OxG&sSZo>`}YY95#4dO<nr5$N+adFI2(2Hx^^pRpiibl2O
zuBuJHBZ^^yO<ffBwZ?rn40K?=Gdgx9s;PjdejRhk$zzdVt~Tb$(q(<f-2}ff;V;H*
z*}!NNgRl&7;5|KoQk2cF;K}#&I@`+;PKupc>mV)-1tPraU~NLv=GTO--^2!4OIWdP
zySG{uSoj4=q(>@a#m@Cnvkoe(VeH$(SGKgWgbe(ClpG}IgymA8OGdjpZ7Y`N-HwlO
zMT^_e`|U95X#s_s{902Cy4(@(lP9*eoQLB0(fkt64btZfwhsiY!#@fE(+{42PU-89
zX#zV>#H66aDx--@w}CK(wAZd}&_}T}4L;k%N_i5R9<0=3ps_SYkdBG1{Pqc^KRgLx
zy|O&2f<&vu)i+cVfZ`qlGPz-78$GLP3{R!2TZgUK$47HB-*IMy*;lmi^Z51XAX7j(
z8azPmnIA?{klW#~En=(n!2_yTsXnt&DqigssNZnCQ=%nCUZt-zRn<du7MwGTSJ%q5
zgz!O~nv8x?5$A81bSMSueapW_l%YB|=If9Y?LRj*rV7NckGL{)^A=;Atl*a>;XNC6
zDGauTsn*VMle*6zkq7b)!ERXevT_v`JEyNYW^rIp*{2@TD9MPpXY}7NHPk>%_G!*=
z>Py?7=M)Dr#Jq_G>^i3jwGy}1it8TXqqw%dmp(kBb>Y6QxSAcbw5sCYSCw(3r29N{
zJF7E<+48xSu01u;f5T|~r#E^mDk3+qgz0Rb!_w#pUL+`UE9@mwHmVY4Fg|;-No+xn
z!LvV82Rj#@bNp^>)EAY=<POXL8E@v}j>?fH7b@Pgd)70%B|6xF1RC61ZT}w2#9dO_
z^N)P+TZsO)U}K;9E}+d<Tq?*=5YuhxO{~GN1`V?SVGz?`39U#3si`lQ79zP0Y9aWF
z-j349;K?tyV*!{B6M!PAQgVO~uWo%RWWLIAa2w6FGrvZ3w}-3iNe9ljbW*I8H-87<
zGBDW8?UQK^ov5#o9!YVI-zIc>&3B$5uclA|Q4t~;&wEJzdA%d*<3k6-;xHe9v1GuU
zClS1yxhbP?%Y5-_gFo*-sX$I-%RYb@P@Q6_EJ8Y>8?(7|Ki;#~T4PW%%CByMgHou@
z8R;of9=}*!vh8a8#BXTG*v=d8ey(a8W`?(ec1DE$Y(HG;W2<l|JWa6zjC<_n9;J8@
zjehw%rX`p-7HVI5aqs_w*&Zk*Chvj%w4lL-%!Xc3JnJ|D^8?*SmWsXftS>5S{Fa`K
z5N#h?4U*3)qvzrVJnF@U+rJ^2#z;-37EJS>zwyswOG^Yf6+Yn&suYb{7={oSilV8m
zy2N9VC}7I<m9d&p<$}|G@7n7S64!Gg6~jah4q`~A(ipNhYy&K`eS_WtO{0AUoy^*i
ze5~XVT1~s@ddc5~o?-vd@FFRZu)nAMjS%5<#xB`Z)=0*cQaE3gf_cKxunaz<j@4+C
zN|0LAI{~_urX?`qxzo7&!r}MqA^T?-wi@(QPWh(-p1hx|F?VUh26%&l_@3cD1MPB+
z(!12TfMws)+h<}Y+2?bxV8C}c(mk4eriMX*I7fLL(lHztCRC>A)q1Awydxn@dsBCK
zM5UPoY`=fL&&mLV74ifxT9(8B?rXcp2PM2M#2wX{@1dsiA~5q*8EvSs{q;KRPwdua
zZNrZnx#c>epADPXmjybIT<15tdXhM2!90nkIbr5cw7Xn!2cW*_7bdxZP-~YLiozcR
zw}hif4oM_hBq$%Tumv|g<w{4NgA|kz=TM&4Gh+?6Y`#IDM?BNF{*)ScFRJorF)V&}
z#h%8NqHOODtckubW6eD0b7E2cURd`n^BxwLANUH1?e=eYf|&)Q`aHI2W`cPAx3$6+
zbmhAZHMoib0uCX16!#9<DI`Vg6b0e?Ar5vHPja~jG}D2kzOCDU?jK6-6Pz{9NxHF&
zmip(NW_p>L_9ev|TM*XBvp8POKAg*8lB_{MM&8jS{$NF}7O}dw<q_AnM*)!r#shxG
zYTya2vR8xk$+he~+`RYD03;66PU;<t&f7~)1~o$up6!m-f+b*%mRV9LZ73l_MrYN3
z!?2#F*#n<Igb?V<+|Il(XKAF2H;|xXwxZePyOV*Q&hgt=?;8>^+&`W5jy6853bnX6
zR3h?5sl`E0({(0GZ%yp+y`%q|g|Qb`F`k=W%a|Ou-W<$nnh0))w-ls<-^MePlUAkE
z51X)HOu(VqVtsO<=%nTH4P@~Y8{RG4qH{~U^Ylv#QP^SWxCIMGK2eV99J8i2a#^`V
zyW<;r<{FYmwIif2<2G$iki>lagYWbJNs#0K47e3?@k)<yMm@`bg!zj~G2xo@mJ9w&
zr#PH@)Wa*F9Afrol`}uH9s;uo$PU*Hnb9WWl|XvKCVD0?6X_7_Qrev-Cj`#-CtD)=
z5bC{I02z|7HPq1MKJIGJNx2K8E9)OIe|hN`1;Kmc7gcU^jdX+1HgL?ZVj&JkBm}70
zZ6=fo#+QT%JyS8rb6rYcZiPTe86Yutqql8}V?Ltj?|fvobT#hz`K94I<e(?shS~+;
zT;b3a0!3R>kmLy&!{gN~Tj|erEwK{!^=0XJS+0Tw<}}+xFsf0_&=-6@073=dbMC!w
z=%@?g3dRS{bB5d0-$a$7nh7}JtMwP^Ufs{5lUVPw*DXpQiy!_(hu6t-;=I0kNwh7!
z7LJzca1Wo=2j+Cowxtm^5@oI%oNvc~p8MRpG=`2{xn<%}TSs7Aa%n(yE)`rUP{Js+
z8g^l`h8WuLhvD9=ZxGZo{E}9IRD#uY!f^>!N@44MJ{)G{11zE<$nrDkTiqq)Zv+T>
z1>Oj>;i00BmhnRY;eBA5r%}Jqrse^40YM{Jb@H&p9v#zi9RWZAeje|ij*m6mN*vAq
zQA@dh2Ez|E#hAK2i2Hm~PctdPRs3V66L)gh&kEbqvzBW5RQ$30N`pz0RUqFJVzEvQ
zeX4fW&Yttm)Fq|$a{c`XKTP@;MG_1W+zquVWRNLVs)#y#0Rja9(_o2!ISXh5(h4<-
zJU&+#g(=tvBOx0y>6h$9M%r4nav||I|4z;|Dh@d@U?}u~y*vjN;f3X2jLWgw^m~8U
zv8W?34{(Cix%Ibh*veNpKE+t8%FT`ails(IN!H<xRbn{?VO5#N@kkfx<Ao_ak8(I@
z9**yZWv?NqR9l*#?c<IjkjavVkK>hD+p0DO0Z*s!O18y!oxR}^QO}1?Z`dIW0>Hpt
z9rSFQJMJ-Iyp%Qu9=H}dTHG4qB0aA&3c)hQ$C6nZWj%txI>eVyIGVLspg5JJKmsUt
z4pPxw+aDJ6#O=wS7LLjv7#u%cf7#|MvM1iH-*kH}zzOpQRWg!0d}IYD1*nv4{(e$?
z88+*`|7{AzRYPh))oe7H;cKS*&wqnU2j+^jidSzoEw4F&P*aV58s7<)v<c9)Hb%`t
z7&PsoQS=k{gjZ$(w=RS`MFl{n9U(~rl8&sDi(iw{sBomfK!pue>@e?_C4MN~RE`Iu
z=A4?!$}q6P0^qv}urxquJfSdw4)FwUXv+mnJ_J+zFFFc~`SYu1Hkz`6X!k8tRPg!D
zVOy3>NSvD{Ivy`-5pzP1K_+7jz<nvOuH(>(i(c9wxK5`K0(764yP%W9cEP9Q0E$Dp
zxWQJUyqxx54A}04W;ih$9ltaM?VHQxV0<QT*jCNuL?Sc{s+?+pmOBv&!IG}lX6y+6
zwv`*1u8}8~X7)4m7mo851>7f`DU@A-C~t70&}3c!G)m{Ut1nxp3kpv(%v(b&ZTNTN
z(-EUkrsY*V`Cd4}{;7+Fz#=@3mW*Qd&3-i8&VnIC5XX9V)ftAO7y7TPJ)clYntV=`
zPFk4!CFZ5)l(C*N8_d!sFPdL*)GvaGOL*Jf_%QG_0C;+jIvy6JUI2pc+5^bR(+1v{
z%_j2}3qk|lqqFH3RTv9CdE&Mkk#*iA0LDRCy)FY2rf7sng}$oNy?Fg~QF+3yc4sqo
zlub{xgra5K<Y8-!35K6hdU|Qi4pkE%IYvs>5C+bL?}w7WD7yZZuG7ztZ<F*)(%?!2
zMIC0JWtw!E!=);2><@r;3Y?8Gc0*-AYlJu_u_B$0K>Ce30yRs(0E3X3#67$OxhVIh
zHI!1F2ceGhF=B%F5ilm&0tJ!{ePG_-5sQ#qfT+8n51cET?v+4TVTNs;cJI|jno45+
z+ZKiI)ia`+wt0#;wYB<Kc@_qFP8ZuH-w+^Bh6D_@k~~Q}sdKz!rh)Yu*~>529Z}yA
z^<UEIvVnEZgf1sAG?I7(<u39dz=$>EY0Gd=yp;EDnFY-5KQs5KL@4?EMJ7Kt=wlH)
z#Km6slVlNf@sr~)`t({VK3OP9U}FM95;Fq*Ytc`r5f+nKyl%oQ`=2dKh1*iNxG{tb
z1iCG@D#fykkR*Ee{^8Coegj~~#&qJ>u2aVE$yvJ!`G_8c3^+Heleh!Xn7O0b$-q1k
z+|hs>c-eZn8lHXV*3Xer((J(#KcVpz7MFadadzUfj)f8qE-}qGB*@;leS}FhjDba0
z_9D1MJ>@s?!?F=SItzIp@p(}I0e&IGjc~5HsUnbC-BEk}e!<fU$YFO|%^c031linp
z=8f8SX87X$>@_dQxe1)>yn_)X2<`P}AxTV#(OmgLlRU<);tN^u*%mO+<8~k5lAr4a
zAkWqvnk6h45g+M>SupM~N_S<xB6j>dWuZ(9%DtV9tqZqzCIG^1;4!}FD~~^bIf>&t
zAgRB_-KGZqBpd7zC^>7;+DDNKe2DZ2edE=>c|r1rT|@ga%ksvZ;$)grBZHRCmEFAO
zk7Tpu!8SS>G>=LKCovuc+?Q-1I82PxL>hhz@20Yq*PCyVQ#v<SH2|na?`vhp*8s?Z
z_z_m1@J_sTG8BtTRWWA#qF??u{Z({%3ovssJTGgl(Do?@Yd?J?5rOXe65jW*fVxNw
zsP4z_pP#?|59lBA|8HDBN9%u?1N&y6^6(!6{pr;OQ0Qt<II&nR<)XjZ=4uw^(YK#?
zA`IxBs%{$lWE1c0RoYw*R^W{MA&4U}WgMpzz@^}OvlP!T_+F*Nc07@cVh?DnIlP;-
z{hHjK1xRvR^>ci8V1f$U0Xqf=S_;QABr!ZJhmKw|hu?-R=KoHB%?pg2PkIWdP*vnp
z9FdZs1Qb1PDku<^D-nbjZm-SkwUrZ=jvW#6@ZgH_P=;B<Z!Lz5ScELzXuBUt;y-I2
z|8K=#U~qs#NETVy@^{&^LBdi{wr_I1@v7?7pM6?ykCm#Vfv6^KldNwZ-cDW^0|)+@
z-6u)oTm-9p9AbnXSRA_pduw1ffN%QbQ8XRPlXreLS&$yZvZFT4^e^+9q}uV_4_&qm
zl+sJv?ZHK#Lq`Ci@i*dBy(tGcW-35l7V?C>cy+j#N@_K@0NaPCO;~tC82Tf28c@i#
zvjPCiJiu6fGN*sG`;k1f?DOXW8@$VY=i$Gjn)bJc9}jXUc)6N$7J+!T0r(Aj^|zG+
z!+{L|Dro=mg|OTIzk;<bhB`rXtg>;4kJ|-xr(UJh>L55sE}Fvf9xy3p&T_15f4VWQ
zr8W@(7uHjh|DjT|;?mO?q+>b4K@HuPEQ9>LP&lAb=#4jQoBiVP#UmXkU_TV@>*yIz
z5SRtp{9i1<>wvp6F|TC)V&K6dB-t@+i}-brbVbBctMaj1q5SXCw>={y10Vn<jCr5~
zn{)mc7|nM@k&i9KY>g2=!hiIB#1eO9D7~CT*mx&K((n|ih9~`{kdHDD0AQP1eR{6@
zs-pGUPIEOyD2IY!@C%W!w`?{mSlgiTn<gmy<$+FG^(?+DlsXWA27NDkogI3xQ(A<9
zAuc7I^%{kK_9dKk8SPg{i(xxL-x{ot=e&F2?=$g7P2z5R9vI|qFiTv8r`Z)3d7qIv
z>izHCeht<Zia>LN6lOcJu=hhvZh_rP3%LD5y8&eKd_XI_)$3;ed&0jvKn<N)!6&<!
zlCn^NDGi+i-mZ*<zxlrKjt2m0zE(<*xE_fe07mgfYZlj|B$yiliywa80vhVdt(<?l
zV+8;-hU&4{HIO_FM)CZ%Nd^EzoxA+)AfnTJwd=S80+-%*ixoHj_#)6E4rd3)1Yp_N
z7ebngA|uRf50M548!?hPc?eb+plula4)5m8;?>!dX$Ek)FU<qfs10EH_9|BN7<mQg
z_-6{~cd$V=S1Z33fIWhL0FE(A?!a*yM?u>&p2GTRP5AE39Ri<_Ai6|-FaM_YpQRH*
zT&9R9#d0+m7+VSaJf^@QpAm+?zuVh<dwA;~VgQPu(C(~Vk$^#5kOE^tSP2IIJFxVA
z$?vI-8zP2Gnt^vJoIlNbr%36Vb;|<t(W6|}TOs`kFk!A_VV(P7%LP-&>cgva#FkC>
zpJSm8<s*r~jUSKkmFc#$E5FSis`!w~NxIRo2<7s~J$xPz`VPP$xrn~C$E;Fuh3AOf
z@p}LkoA7v{-t`aaCs{mQ$UUiL96R9GGZbN`I%i9U@es07yKezB<Wlq5q)~pJRKx&q
zu2X#jx?-qBJYKvkwlMWOODdQpeqt3n>xv&!H&8@)B!*#>v5$>$Al@5MA)T&2HeS`*
z{L~Nmzxy#lvK_3=6@m5$>cq?s?=YvT2QY6r=+oJ6QzmnosIq0DRNk*31x|HG)eN*p
zjxGHR&~h<jO1H^v4U+1^f%iP9=C6r_*adK^V4+NMjb6{S;$d-=kSMrb+9}=1{7|W=
z3dt&9yuc6eL;asmJS0#Lg8!H#C1vCJT=2d^e4ab3@d0I8l1Og2`J-{#JTi?IaAH!p
z#t8u2Z7*;}ULe0W2BmX0{|!pBh!N@|D;IbF56GFOr_BI-rM=(S)+NAevR4&c0o4G|
zaF0){v!dY2m$k#vp)XPaIM1ZA(0Zam2HeS2uwk#dAN5a-Vg#7CkGTyW?5udZ4-vRb
zktj7Q6$yy#e&h#0x*;FcLWfILNdE)~j8th8)XqlH4U>U@xX%1=gJ)Q#7*7K10C9yY
zYrg*-T#mtlaWJ8&$AGt7@o}~H0zRJc0~Sa=*1&T@XApozI{k=}p?r%-7yEa6vQd!X
z#I3~H;^9W7tpyU6ebcmG#{&%U8b90szkLh9pzBmKetRzAiR`_3ATYkZaN!E+Nw+Vz
zeW5fUXEteIJ;)#mpgO<6ajpPA7?l;!Fc2rK%bSn;(B{8A##E*ftk<5}{^taahiSg;
z{SQYk?9~tOrEQN2qx|`tvxNJ!Sr}v63eUi+U;u9Ig%h;GR#uBjb8fvF0JwyrKWZey
zfD;w-U;H1N!UnnZ)`pGrk`uWzRoQGwkhBJarO<zhjEHoTbf>kRcmUH~9-eOUp>E31
zp5tK%7(#3ooLvA;4E&*~#9>MEVfoR~NMg!H+!8)^)!=ANk>t+{>m(gr1w;qCbQSF#
zGqo*nZ1{nRqHq77bUEUSXS)PazyWQMem%GU1f!L{*AI|fmxtB>FZBd;0O;nuO7}Tv
zNC>74khopJPiQMDNG~R>ke|~A*#&;|xNHV+sDn=_9n>T5?U^$IKh{=o+GnywL*q&A
zs`YcL3o^A?C-$)!db%i`Kcklk&f+&E^(RwL8-im$YPWaATmuY@J?yKGcnFf-b}KKS
zU#Ze6qoc%<hawJ%-o>gTv&;W%-YSD~ddyLxZ_jmU$95<6<2}R7SN?d!ARS_K5!{Id
z2`0EjlSrlm$&-rIn@oT(uJ_tiw%^47yE}?pO9Wh)-9r;0@!1WCy8d*`uK*ZuMi$V(
zu}U_uDq2Xb{kcS!4GQPtU?klK3`FmH07S5zu8EiBk&o*`9x#$8pP|5<sof?JA!!K|
z&P>qLLYFblU>&Q#i)jDbZM6VyfP_|oUBQ0>fWgz74qNAZ0qkzC^Yq`9pYHe<z*Vf+
zdXc!Ftc-=4k9+D2&L6W$S8*k?S89=k!sB=cb3lFY5Mss-Tqz8%R$!RI1Yx#|_yMJd
z`n|^!{VVxXytwcDx5yzw41;D!;emowWNGmifFD+YGr|P`{#25}3j6?*JG#t1x2K7O
zQgpJhiBBYFc!+b*^f7|D`IJDDsRcNTT73uU;-(NAng6<IAX>ftz*Us-!0B)<gpKwc
zJ`MR~4;X-%qEwtm<hGX|C{Y}PG+t}*RzqSJQU1i8&pHPmaOlmL=Ht(GVw_%cI>`Ls
zjPCPopO>*VSjo6<Bo1T;NpJJ1Fv$OS#!;xkKZAaS>RkN?5L5V%&?TEf_HsL$p73SB
zB9rUECevuDr2c$F4?)nnd;2>lb2_}Ns*7R+pi9{^f8P;fq$3!GvCw~%#zGM?W5l)s
z!&o8}(3E~5<Ism*<B4uIj16FyxOoP76OeFn?9qp8@gW&bzLyk-&g#{|B9tdsmI~1|
z{V$u|isMORYHXC0H?n@%2Z$Y(NSX{_3dl1zhFuK-Ltw=+sOU@+4RMSeWxD`O(}25g
zQq$K!fJca<-6*yr$|16;>z2ujFKN#1E|w&C{AzB~G<@+9T0hJ#fbKEbq}!<0#M8%t
zfJHlZ(dZ^nNbg%ckb^ZbuK<`c<!3blI*aZYT)D}2qrKdC6Pc%4hgW!kaa!3??+u<w
z(S3^21+ekmXrS18j3@Ym%41mS?nAS<45a7Ne>!&3-ZYkQ@v{V|0RRV>&DCN5z-4aT
z_V4a+4rH)@hXkWYP66^N`_#>F<QN1Av2(OL&HbIZL!WKvICBW5>rA%Xw1YEWiQNMZ
zGq{FNhQx15y)IY6h^&c6LT`=IHDG@qQcGK&CepLERRG3=i1<&R{)sOD<uCpB!vnN2
zYhR&4r|0+6%pCxUoIi(+RyqTrlcDhs@MuG~;_t>9fJa(5)jPZ+d881Ajw<Zc1fz;^
zTAw9o&>;#PT!GKh%klYi@d%a=<Sq198t4ej2`{TJUK@%V&otj9sgMZ2`WcGLMiuEq
z!HzA@pOL7Q$j*<-&2YWBPy}H`zRZ}<bp(K90-IR@{*^Ppt1PCl5Eq4e0esgN{&J=)
zR~(;ykcfpAd`lr=$rWZ$$-q6MtT<G*E2d}JAB3m`{a3<=usVGveGXjm%oC6Q{SA`<
z;aj6Mt6|Qg&c_dc^=hYN2*j|>*+$HpO3`TR>ms?2Emd19TG9jrF*62mz_@g`T54jR
znwihq{m(nm8=rwoNg=@o2O^ZGD*-B|8=rk21b4SHVlIKl?EzLy8-S%y@lvVkHWv=R
z#On_%(xFbBg&KJS00>9d4#_5s4oRm0@VN?lFjwf)ORkr5eO*VuRoZ0mHeMD^_66v@
zK)puU<{5(4j{7+UfWg+gl~^Ft{+9;-5F#2E;wb@4)Eqm?Q18O`sVJXr80yM?>A)O|
zdn%LUKnBp%aMM71GzDsLh4Qs`mW!_4)CxKj_V7Os*I=46wHiN#CIGNZ{lEcg2bu~q
zsY^i2YMp^P*+%=}vwX8VlYax;6_9VDP|Aari^WzPm%FnmfvunGd{hXu0MTGbs+;aL
z*Uc~{%?D_BXl}8JMwh`paNF_=sw(K2izcY>8)yvLi{N1p=IDjGXCHzTl0f>!ujm<r
zKx${e1v)o%5KSrYyK)5D82s4;&jFkaS>OW9uu}nwO6#9EE$PeSaYIvp_mgl$f4xmf
z3HSf+YpVIP2O!z=7>F<gVB>fiq5oUW(+qfaD5)4MyE<*#mVc#aRFyHRnnl^DMu$ih
z@eT7fBe3)(m1e-&rm)G@?+8}pOFTJ0S())Q0cv%}+=`Yh_pv#ghqFI=nx?UTbn_d{
zEge-7J^a1pB=6){S1XSd?}i9Zr?sjzj$#I6ahoXlE;_<qB_fx6{6hIzI=c5UOm0oR
z0(T*s-TdcyU<)K&nt*EW>!o;#ExZ7_`mkRO`INz1Juiu@FJ__{5e#?nOq(B-Fn<F~
zKprzz3*7j;6jOg8vm2P;B_|6kB(sAcOfg>iDZpJ2Qur11P${Iv-u&Y9H+ukU^0=+_
z*i<~rO|ERqB(s$R#-Q@YKNU@Zg&Tl3PtvGDYS$=Os7ewxUGdkzsh3af0SYD2_q!V)
zxiA}E|ETSKvG6{@MMB@6wPoPdPaLd2#-RS!#4)hf`k=p!8ePc^kFR(EZhM{afzR&V
z6S!1}sB30xWa<q=5QADV{z=3|_j_d3iGcs@2OfYpz_>))Ll6+0DvNq050S`oM&4h^
zZ82w{46HNxtdQO+Dc~1Uql_QDeOSPk8SrSDc+<!6MO&r~$)s$G;m1b*fNJqu?a!5{
z7M<t0@;+i%<eNZhw|Jb&Dj4c5b|B^a56B6KFUq5Qfc_gdsj-K~5bVuA(O!lGv^eJ1
zNnInyB=|N-!`DMAmp#mI8~L9--vQ7Nm%^A98%<zw;`P1{D4h2w32Iwro<MRgm!_WT
z->h_d)HXIgYp~oqlIhw-Ze+!EaZvda&yg}HwMe!0hlV_LQd-wXR?qLb2?MzPKY&2#
za8zmVR$&VJW&IC_#_a+}g1A_m_;m^lE95pIEY50KttrkV(=cA}`YpLFJ=Ln;yU(p1
zYqY+9+k0~@X96T}red&sZwc!&V+4l$2&GU=ZpvxqA=iJqj;Z5F#4oOLsEqkXebr8M
z6`{fr4ZuY<wnkoHGWP^SG0x-BmqkMgjB=i$ULSNR=?|DC&~|ic!0C{vu)si34ft7W
zR21i0MwcvQ<v~^~CQCdf{A0^{hkn{pxS<HCf6)}vGWn@_y##?U&IF%Fa-ZTBkz4{S
zNG9v7LH;@Ri!YHabZt%OB&7n7f`)TROJ=dQ^Ddhd;pqQI(|N~J`Tl=AI_cQwAY`6%
zke%%8b#M-mSw<x@GnAFR9V^Gm3gHkUBT7Pch(cC2NoHj4e%I;y`TfyfdfeyS*L|+*
zevjAl_2zWgn&;Fqoz>IgCJnk)5gyXI$OXw5?LS3agty5(_0CfBd)JvjW5)kFw84db
zBvZaIy(^_G&`s~<qnMipZZpUzHY*VBek{8prgU)z1hLu;H2Tu!sQB}Lu4FkU;Tk$v
z5}|DEa^%LsESgZ7SIE#oC5~{R$A(RM9Q)lI)3mXh5c(VVVWYWRP>qCeI0&ORVdSAj
zRQjCuAvSQ7!NqRlv{re}CkycNb8x5L^X=!C)6D+D+{o#qbyN(UkM?PHu!peCeJ!17
zE?XpTImy~%*#XWv03IdpPD!F}zx!^NRrZjo{a4fZ<fVBK*wMC+Wy(T51{T5V#r<Gu
zZ5=eh0zx8-uNwFw*zR&|)!RqGf4wBKNNNd|f5|NA%u}O>s=>i2Gfd+9O-GvN_(ttK
zTrae}rKUSJ2#4Tz?0JvOXTz1a<c_Xpk1){$5bhqp1;F#m`EC^}+-`1J&M%@tagl=(
zb8hIRjQ``@A*De@=(p|o$!3626$sW{SN<VWzx_pSV%+`nyC$GjAfI5jO8F)mGQYR{
zGB;;3_ft$~;ttKx&C(&T-7ZaCf+Bm`^<5&of3eKhX9EURN&cs*!{a;X4{xa%v6w<U
zd(g=reVLVpp#1p$UHw%N($~BILp-ypxVcJ2qIBs!_>BocN@7%EUOJTxXP=DF-Nndw
zkK*ztgZ`ersTYOi9z)0-dZ%u&Uxwayl*dU}2B4zUhIf5M7eL|TUAgM7i=6)bRZV$A
znjhER`BBqU11L<$)Bd?Q%ziJL&5zzs=F5e%4YbInqhpp9{<>=uSSS&o2t^;l5kEsj
zWPRzDl6ob+-N=}c&L`<lLP$ogTKLRG5F(F&l#h;~XrW)hYi*1%fx4b+$?ia7;>@nI
zk!*1<-TEoEM&nMFeSS0@s^cAalh>`<GY=)6{`$k#>U9&Ge+!AT%1^~uEAT2BDo3#B
z)}TL42hY+xud-+cPX2+y)l)t>Uq=qeHjO~T$w{kiO$H$38~uSnm^_jo4_j5T_Zk;2
zNR1$W$aVZ$8Ytf71jOr6!h6N4$B@!$nKggNX(Mn-?M)XIrseUJV1Q?i$nl2~u&U*X
zDqA0BP9GLKI|j{^Dwgv>_?nyVX@~_nO;YHoD~J#2DnaQCtW}p~*EVn`I>#=`zO&#w
zycE@}$ULPMgD&2s2`McL3DBu>SIb1IpCd60B^ZZsZ`5!JSc1$3%d9g<Nid3%VEnF_
z;D??26H$M_b}QAMObTUif%diuV|EJ2Rcc(Amt<yJ;`QGI{^?*!Za^Q!M}k<j!Rlac
zyL^JJ2Y<j1ftw(DGc}&#ty*XW&?&jHEytU*U+&y2)r^B0i?}@pxv4>c0<H_ml%YPX
zlaTLxHBr`JY|AxpJBz?(b$JBf_o3Zm%;zc2M7i^ozTmIMHa-=gObIs>ycnV<5MMNw
zvaPsNvJ5T82Q*zKwcL35T@k3a$Uf<CP?m2$$!2+yV_NHU8i37#-ZkKZ`99dt_^IZ=
z`90xJwJWML;}li$U0Y;o>a4z4@l8~r#Vu$~BagoB$(+2U>j)(!`RaC=uHk^}`b#u^
zC!dy(7=>z-`30-GN-pwJ-I<Gw+Ol}W55jU2KU@3jCVA5F<G;c5msPK_vzCB5f>caj
zdqHddC9(W-nD_V{^rYW)dBbU~(jC$~<=lQheBfHiI}WSrH%*U#R{&iE8eVeuQ8!Q2
z!w34@#$=6*gMoKK^#5L7dMX5LA*{%6nb?Jw!~95Hmp?H+TI}LVU=Tbalz2^eq?U;?
z#d8&oFy8aA)K5>NP=_HNrko#V#T&*&;Ft}<X=IlWRTJ5`dE0oU#cf@nA-#WDq+KXH
zRk)jVTYj0Pff{dvTun3Ze|gHea|}8rJJRwdKa!zzds+tSWi2dE9uy$wQyjftBfr?!
zzO^z_&poFj$={epA+1h<>D+nw&1nn-h1vont5Pg?%BIldvoCt8!N=fu1U_YhgkJby
z9RzqT%M7MpH})Q*3I`(G_MO(qKbNV7carHcufk~J6T6H{MXMgHd)=H@<XUJZC-3(@
zrWyR52Bm}6=66eY+)m$45>GqBI?RD`WtOQ!Vd{8&Lfkoi$tOFDzR{M3<9k7|Jz#L7
zaA-RZ#%J!y=W6O6?r3Zk63s~(#u*qmBzsyFbqybQ4TcC~kB?qE=S?#{`6CALX9Jf7
z-$kAxv{UbIHK@s7Zn7hzq(I{&X-b89eqV{gG4Tg1f+P}KnSz#^iV-q+&!6+jZkW!0
z@cLjr#ga7S#<OCFf$Nc~!dS!kyGfl4Vhx-G$A~qMv`B+A&@tz>Sy!zngggh$h-Oq;
zMEXjpUmie;$}~*j!_;?cOV&a7G9+#B)m?iaLJJ>0!{bo-;di~$=L=|ut5xlECG^1x
zyivDGos{vITTGMid8M6xHA6#%=q+N9ZD^=rO-`3DVX{3?SqJcz(Cca@l|dbt#G+8v
z<uNwd@7^$V{S+j_S7eU}6^zbQWaYiXdL>gwMl_emh#RV!lal;Z=UeNF<nY}?Dg&pq
zyI$QBcW#erL!!+gd9OMfv)q5fJJ?a!NgPbTR*Vyn=jk;c8ZK4EuxT1jID$sNuh8wk
z<@_pJQ=`F^K`DiYEm0RD0(9jXIs-%OzzHHnO&u5)UDx&5gdR&eH=hJwGIzs2>-jlf
z`w<{1)*r^brbf(Nrx<JJZ&hMnsj7|fT6&K3$IpJkx0So@Or7aPdYEHgTB@aL@F62(
zj{>@^2Csrw($>5Ci!6x@IQ_yzNq_0fZwUO>69TUmGSK+u9r2m;EW+CztsAj2l_O?O
z>U|DU(NY5ZD`btu*C478EGCOkdC^HNMp+l4ntHbKNbjr6r0Uf))ItB>Q0N2d=DhI5
zbHnGN3594mIN1qD6wOVKIEC}0HtNImVfS5I>YMNV0_lU4NaIe@A;_e%@Q}4Mj6P4k
zH7HI?5IwI(e>`xx5+kWI+7|$I;%bi+DRr&rMq!#0+Kxf@h>d5@^;@!=&D_G{Zb~+R
z+ZF+yNN(~0F|^?^jmUE@f_dtnr(;|NmLu4o*dy)st4~#;uNAs1GV6*gG9OE`nXLUP
zF<F)k;kFDI-!OXlG7BpRTpIGWe-FlqsuDJ*c&wH|U01tPr=E_Rb-)T8JO_Oo&*xlo
zUTx;_H9L#gFZr%(4)D3Jeo`xBoQ?bD5S9$90tohaCk=(EkADjd=;jU$kGLwVfLNy#
ztc^-LXU7;~INFbW4@yO{XJ^N@AM9}*!_S_pqn~j~bmQbh?zfET!8*HW3tXG5j={oA
z?}cSO`)EJ$L({+I0@Dq4K^wZP3)oj3(|p768)TnAQ~O=t2UR}Q-yAzS{T~(<w@NIc
z5teV3D7(Gn|By;ggHmTS-krmGJCux;)wA{t`2vFdTRFn(8Kul;rB9x8?RD7(8+WY4
zb0eSAHzOMR8EiFslaFtTMhtJTt^+<~ReyOP&mG>;0wxqZmDkuIBZPQtf`bg@aa}p|
zadCCjUw<6AK~=OwXJWWU4ub=op)t3e^V-x?*_GnA+NTugQ-94QLlHZn-;!(x108N$
z%*^NYU*$QbG+3~?y)~oMpTb{!t&)=0HsA`b3YG43h`5Qoj8wYL3U_ILk1W3)Do2L>
zx&Xo_T^p(`qm2b2_dzVLy2FJ%*0wA1AY<$?IWL%O_2Zd^b1S6MIt5BcLrfEI&t1cW
zlYQ;>>o2ets-zUI(bML1>7kUp6xihPPBkZm$}Z1s9jJOSIH(kh5E=5>n6VY_g^LNX
zu0G;Wbq{IkrcB8N=(dFLaQF^at9)Tn%b7TT?x$9cYLt68^Hu@c>XF{}W}66siL*~W
zq%k=|bSiXOuLM@Uqp_kJCl{EKx~?t->NW&O<)~plzASEXkXwfWrh5Nc{}5CoTVz<Z
z*%S3%b<ftB=2gUvI%!cz;B?i(ccwoREh_RA8dOdse&;ME6^bg#6wr*H=okaD@={Fr
z%}4-X=GN_RdxKcBoEY>`&CghSkJDZ6G<p2wt?TvVe!s96%lmSnchq+7if!6_j7G>G
z>(K08KUQT{vm4`c?a|=`#fuELDkh|3)PonozhYzS2SzjEFZqO^@pXOQHfoArm%WCW
zhI+klHB~+Oy^Fc<;w8PH41SpRR9*>WHzzZHv%&{V=JJakp5q2p0rAWW>RD)f`@+i6
zDZ<7B;FJ&Pv&R9=O()9Lz;~8VXa2J&+1-Vhg8lUG7DZ1+vwf9NC7q89Q*6goc-^$X
zE#4RX6ESA8G}SL>vE9yju~)i8@(E2dK|ey^B|~%pH;eXrixP$^cBTOyp{>DI>Sv1Y
z!#+Vr8gs9JeM%S6Zu4QC)v6}kS!-N!gtgNyuFr|DBD?E_UX9_q({INUa6FXGo8+q6
ziS|q~3W1#E&@=8*eE3w)DyLCX|1H2HFMRgb@<U=sFqyh>ZDRFod2KaASB@*-TpENV
zt}-P8W1<%KvW0Wg;bJ$k|6Tfu@0mgzi2~(wle}ZKBk#4;<_bf#bC<R{60W)Jd&~}R
zcyp`Ei)$=0`dw9uNl~1?Z#9D|T)@BXIHPFg9f65V14&jKx7D3n>d%BixxJjvJYj-0
z*+nI-4@{2@zwi^dFCiWJmrZsQlb1A(SQ2a@`Y9|bt>{VwAMu?K3X^&CD@KaH16A>k
zy~BY~6`RQFN~W4l7<GesluzIl9a`qj-8^$2N#Hm|EF=#xbwedn7)k>|!~N8Lo`(nU
zJU627wmq*R5Z@k0-x0#3PB$$~r-`9aO3UO#PJgsP|5BF62G}KDdB^yuXDa=gF2mpB
z_jQ0nuY)t*sZj~8`jaBXA6UxLb7*v1l~uD^EO0Hhv*7AFzluSc^zd5X(LM1%t}@-M
zB%itWA^X|TVU%!W`=Va#JJUh20ueNMWh9+@%;OlP)>HSZ?>na<D6SD}VTb7Ndk95h
zZ=CTZr7VTf<YVE5${ua}Y&%lGJJX>E8^KA<>>#`riLg|#V@s0C^t_ew=jjrY&%sk3
zHfQ@to5C@%TzAFyzQWhp)bvY`vq);>@3H;kvC#}f$cfcoa#6d_T!sQNPQOtjp8@X7
zj#`#!BNR3=9kLd<M8NI2Rp$NHO?I;Gt=x`5%)RJNdvAZ<sU)9LpVO2hJwfrPYqq4y
zj3aPnGU=f~N5;cX+%6Qjn)e$`$MvDhp!A`;<-?<9L?!iIl2L}?hYNFV#|3A#CrRO}
z238qHmzG(eqT?PBHc#vlujf<$5bS&-YTfeoeXe|R2m2$EuVFT?bty{^!~N?QS6V%G
ztx&Fj<|zp`w|QKWmgl01sc{u5^PY6qG^e1~$EGAci`zJF!<uDRJB>oXxgQFs8Fn8|
z=f5uhrLG*~I4Du+%pik!d|zN3w`DC%MV=ldaRmm|ysbj9n>NV4<Q-=(H=3y0jJzo}
zV%@ORR8{omlN)U}UYhXz;F)Jv&FGs4t?!ebJzRSmuO|zqYl2_sMUG15-^I$2$A7!0
zoe4cwox#P0@U`~%fblQYGuwUTPnpGUw*-4fUK(>g{C_O~$3?5{q$<q_c}v<T9Djc6
z^><~;Li^4&0~#*xyb5fYzU}u*W^3TxZV_U5No&?htbUcu&_X}s?H|6=|2;LEzyB76
zNUaxXD;XPw(jnY<->xF(-dtG>iRw9te6(DDhywG>LhEhVI+57x?KeSQg5^H9%;_Uf
z?B_}4H@k#!iaa`TUI->IweC)(n5!A3ZnmTPA~_?+(MDA6qTR5;3k7=N!HrRyrhzF{
z5><Svknc-y&->-}Y|)56-yciq3cbLY<Uzl5UQKz8?qE&%w(h5)Gs)|Dw7v}9E=TB}
z`KWl@>%+yYo6dt8x>!yF6pe_rJx!ugIY%sN#>mgy6oS1>>FTOkRz3;`iKnce6aJgg
z@iujSXuMRaHPKIPp}#LD@;on`RkWT3MuKMv^w8vU<axE#0UWM6pzRWKM&5UjyiBO)
zO&VE#xgu7aS{3m!K6E^+hU3!#icScpJf}5Y(5rZx7c@WHbFg-H-i6NX8MG8%6fXC=
zdKzwxN*b?y2#tnyt99EMd{x`)V6sS$V7gbQlb*Z!nQl0N2Qy}&=1XVUH(i4h`(VJG
zns8S?za{%LY56WANi>2#jFT@FVcD#+j|VTU)>UrC`iTlXKI!UjUZw|p4^|_owS18^
zxnB5QUqv$np~2>|QW;^|#w8FDj<`E01rfUDb0BvqKB<#c;D`Mu^-{9KYPQVSPSJ&q
z?$(sA-+Psr4@;DNq=>dTH4+0~)iqEUlN0`dJA{a7^&4jJxjvyUNq+*kW3c7kP|Q#N
ztft>PG%hahSs`l;HRv~AzBx-E<)H7pg$1or8+qaWWuMsI7K-J*Rd`NzU9ha`9fB}3
zV~v+pGtwv);-kq2Jx_t_h(V+jkn0YSh`i|)bJ2fut6h8@hK2AS*X+*(De?dL?7OI$
zvUR^ZdfvM@x$O+p{Kp3qrv60c2NR$c&pB~hFXt>4<wA=e0kQi}9ou*5$*6Ft9Hm>R
zEVVCkeiaOJt4-Q94tJvN>`9j&3xc5Ulm#y0FOB)$sy`9!=5DrO#a;$WecnCT^m3};
zg|H2>-vn^FW%E;Sd>M6i7T|Qe%NDbJw8N!eGw(R$25q)nj7Fqq&5SwtEz~DHnoFii
zdKdga4_K4vd|f$z`piRi^7y**mKszv1ZUcXOMf9ED)TXWL7Kir*`fOISFXjH>MhK*
zeM1c|P&{_suJ;JzI&8{gD3Lh%UQ?xiS<2aD!WP|uFa2Sr)M189@yT&&?B7IIYth-+
zs-%^o-k(|~he?YwjCVg*pS)cz4Lxw8xvB3GHGRLl+$$>hCLXhJ&-*5G!v{47-s{(`
zb}Mm_HcD#z<}=ru;Q>xur<kbSeI`O!Wg3?mj}7i|q`~d?QNQ{VU;`v2Z7qFDLdk<6
zotp5@X+1j6-H9UHIsPwI8tu^)!>U@i@^I#uXS=nlKBMxZH;wJSqCb}R2byj`<q7-G
zi}s&I<}t8$obBevUe$`okzmc&9Hde);aBJ1%l3Mdjm9(Rck`8t^nQ?s-$Y+UR&qX8
zT!Nf$<YguKd*k~?m&Zg~lP3P){fq~pTeTeLmozwUp3|Y=S6_RtB~&X#^#;FLiAy`i
zvRzFkANtY&HTn+U?BM*DM~B8w<VTB|RnU;?Pyd157aLWqyyLXBJiA$s96>|Iq{#j<
z$GI&{HM>T0^?7J_dnEVUBKd&xf;~);Tb1uS_l?FJHL5mhvsTmFj8fgpnUsnQAmsP7
znzWsr-^6V`l=4Haso65U86W=o2EN(2>ATe1d82~wB1tZSJhk}hOYdQSf$0a)MoZi}
z8ESHw28!f!8i}(EAi^$)Y@T%s1U`wp`qW6S@&}-I*#)^q;A)L0wnBaE1;uur*Gkw~
zU}~rmrK_%}X9j}icJ#Lh9LyrB-6c&yY=ObSpn<jf{jUw-&%T;FS*J-B>LTlnMV5TV
zFG5aCfcrw&J5ldw#w*nVeZF&;o=jRgl}E(d=@F{&S}72w{$^#7harh=>I0K6<SD;z
zttyckqy&iU?TB)%%BjqY9C%cBL&$&Z(^_O#_lb&3Hj;%HhTk?_QoVGqE+-7}{tcmS
z6R<gIe{w&ElKHK#Ja)Uj@JZmxCrJq!38hW9gL+PozP~~aw<@(4{B2PsB5Az)Ehz8I
zw~&I~gre}|jwrfTzQ|83SpqorAo0!wY5gnCCm^ZqgKmhf2Yd@EC9N!u%D+D;TJ6py
z)^%&s((+G-xwUAwAN#=NrIot&uvf=P?;87e1C3c4OwbiDQ#vPy{PLQ_Q9-8T$s&ui
zBERAG;*O%PQ344wyxS>=Xg~A8ru=V4H%Dkc4pObM>c3<j;6n46{J=+aYIeEZz&}Gp
zr)cdH;Btf4ztcw4{G9JU{w=`Y5h)O26uB*Y^8yQYlfx>i5#^xf5cOo5i<`lr$$){^
zPkis?>(s-i{VI8DRZeK$Is4B}NigAMgFK0=7~aK9hH|YC63z#E+$Vk?%W@&zs}&z}
z@XrEc<o%>QtPsZEp63#tavfBRSU|W#31tChGvstH_M}O4zP>pnYNU>){PVg6(IIku
z)-$ooJt!LSn+p_Cx{M+ybEv6}dylp=OdzH!NdafrePQpTx!UgY^Vj(?C(X#1MM^8r
z<G7mRCs(aQ%kolKA*6?0paj?rqx5m<cULK7ebjn2xUo5LKXy>=$tW(N8k=V$@?-X+
zQ+e&J*PEttUXVK(ZO&J=(MujN*^RV=QJAddqO7Y}Wc4!r)Z5jWud3SZ&<*)lWtL0d
z9r(>*F8&uEfUqzB!}Qpm|D1M%U*^4UW6OGnA;*45Y+dbE7++qP&rrbN$E^N<ppQR%
zCrk_e_?mMiJ@Yh^t*O4*JO@vHYc<jKrf=?sz(wux+MwA2*Is%ym8tZU*FDp;I%I?P
zJi>2H;?y8mzS>T9(A3*8AjCvGG*;Np5u1T`KrmUJ)a+u<eV+7HOs8K_rj=1Wq7F}V
z#rr{Xj?gTrhd7_#X;?@!+Rbx<^NSH$?J%6hPYC7ayvwuK3UkdXF45!+sL`1zo@iP=
zU4|MJhEg{QWg|i^Kk~%XEX>dP2Ovzxm=PZ1>+=fNix)Vf=hHdg2+V{?V+B58a_`=>
zP5q>o)^vFB#?h2b->dqDcSU=Kj~FGVLKwuEa4|a(x1v{|5&bV~$p_|NUaLb+2YQgY
z93J1wSBn1f^PC8r(%wLy9Us=<as9`&&%c_Au$WEOB#1!=)2sNFr%tMc4lacmQ5?Q}
zd`sj=<xA{4T^RSpy9cPWrErzzx=1@VeZFj#4M?L*bh^Mc``j#{HO>i*I<VRk{c*+k
zqQQ*hVC~6P&(h3`OT6l2+kqIrPbXsU2p8$A7-BHd%howg+@WZ?`Mz&OLBjC9>xZvF
zBk8T(BwhcWnt-{!tjST1Y9O7hbuCB<)23OYl#L-~%P#zo=%VlKIt-0kK;=h9O9B;9
zm79|HlIyvb!p9*G`JuV+TJr3!`r_|T`xr0_s}*{{;vzMV#WrqSI|l9R@tW&|^FGBX
zF5@SZg>7{X5`o=!COGnr3GJvQts1rBB#w)I2KLdx>bJwhiUP;4xl}Xa&(<g^4e~^U
z*Dno{P*)NQ%eNhjN+g5>+&*O&X4$z1pCB`bB*S~<u%hIRf`T@j0ycV!a!C+}fU56f
zGs+HZaoIJGw9M&AR8-F8qN~;mK8k`y2QtL6{qCt2mf%bDF|4|~vLYLeSYGhU93d|i
z*18mkfFu79miz~2`dt#^px01^N9rig3vPX`uanL&JWx8>yq|$iimz&qfw&XTN`hVI
zNaz5S|D+GXjVc<-TN-Y*ruh~}ISe5Z5bi$sF!InV0r$EM1b5HTh4Y7tn^Bt2Ye;FH
z5GLxM-4<d?5Lewy;?0mSF(*H-P^zoYgSOdx&R>*nRs4!TU0}-t7a&0J94bbU3S%8H
zodxP4&nTwdQC5QjYX~<HV*@T8k3Pv!wv|^51Bku{7!-iM?t<Qq{v|4zb~|H}g{cHq
z>8r*&Z+pGD$p=HvZ;S?Ph<*$=y+r$MT-iL_=$f=`nup_{^Q4UKiX8vu=`|2mia9cF
zrFdL(pvn6cs(Zg{P>bZ1y_1sNr*{L5fieCWezgWD44vKL-;-QqyXlDseSa=_@lAao
zG+jY7cfv)or`Uc_TUo!lMMa7XB?t{vSZNfNQwfkJE#f{%3p{>fa%GX&tW&8#*$e_A
zpC-J5N39^Z-&elJXAC<GC6CA_554%kmZ*fwcXMFTfL+JWCn+-V)1e{Z`MS)M1KhAH
z%=K`GD1{PE7$xC%KjQ)&>9bpI7N~28)q#E0zilkbl%TKRDENI<=_}J;fAHcphtih>
z5tQ^Q&kcUoxN$ft8VIds7Ciw$xV}7k;y~j_J8r4bZw5$U9umC-XB(h>()m7^<wOdB
z=b)Dz=@|QZSG&x(bW?4INwrzsghiCmkcO4|gWmXUks5;-=50i+KF=xKO*kHb)qdl!
za^*h$O8rxHuu<k=AdCh=DlIATz~5eXD1B4IgBRjwW$#*eG#Onc8Adt;$>FdZ?(J~)
zk5V)vCvUgRV-+zt+$JZCPMN2cBp;!1o79RlL|3V{C5X{M#>j*GGWU7kha<qfIu!j}
z-b={fgU<x=A*O2cn{od@aOGxv)tjJCT%2Mwr9@55=TAwxagEKxk-;f*(&ObfK6&D0
zVYWLmojtyqc%7t96SLinnm>9e_d%acR899al7gZ!4X>wI2Qytr(rXl|UG;j75&0-~
zBhgq6nbQB@PeF6~515^4WWVtIYoxwyQvsp9WS0zNP-jZ7lWwCsx_rf#YmEO%k252c
zafpGX#G<}%-}q4x?LJHW!7C-B#r3Pg=-^A4Yp)9rK1A0jA4sNTs$TyKdabETN*D(j
zdvzE)4s*qqOH`xueCW%jE99x`Fqd8Gz7*gVtcVl>Zkbj@18ah?kP&-l0NKD(wDy5@
z6fAUeKmGxJZ;oD;pp!`RljR306-OZpY)fnm9Kypqr&pfZHgI+2ft;}70uxF~7Z_qc
zy$9Eq1D%=&c9+=l!}=wyr|m8e7}`G^d!ms!3hWEAJ<UgR0R(hR9%A_`*uuDI^&)dJ
zjzjIy{hFH>cmm}nF~v2%!h6n7?4>6+Skzj<q3fePLotHFp%DA2_;UiY>BlqpO}(lL
zb#vfON)%)>gG0eJTqA*M+?=D~PcaSlSWf5L#ib6a7lUJ0KdgyyKg`&plG-$4MBcqf
zy94Bn$$S^HXp-Lr`LtjMW}B6bnFo3LN#=s0sH$$~n_@0J(uRFf%o31<MuWck2eNF6
zK_{y?CmH{I9kFFXa%So+vu-w^d{zm7^}MFjX(P!Pf}_HY+^=oy%b~I(hlHR|C|6I1
zmw9)h1am$Dwh}S>qS4eYg9m%ps$A`cL;@V!As^O7NuILkAhKuHNkSqG6fxWb+M>$_
zRW1{&IRSg$7e3(jbb04XiRQO-Ros_nsGC-w?AmNS&2t#N?f&tWfJ)LcyS4?sWEsct
zg`qzHH3>#t!$9jG4eV)PAqH7FdaJ@EG&9*OwS7qkC<v)LJ=bB0j;a|WdY&ow6h3r$
zwNU8L%TKZO9?rn%k%ILxm_+N6$L8ojWisV+?HaJ*Ck<gE`!%IHFu$d9C6}U<hX<XQ
zO-mryL6Vzg!wCWdd`a_HUi`De05}t@IFrSf{@SQ0V^lCYvKZfpHIU{2?yPEB=l=Sg
z9Q=u_3@{K=JlRwWI}rGr8?<jIv$^nNw`2S1=yOL7pP54u<o1;BL9E|#$Gh-73U2cJ
zT0(sT_)*!7Ra#<ga~cvacI@k#8$n8=o4-HEbpbv<S#Q7m$$k&2^sl}zv+R||EK8;$
zFn|q>d}I2d>>rE~0WZ#%&Tn;d7!`<AzLgnY6w}doO@eRGz;NDd;1y9)Ujo&8<_EON
z!fm`i<LSm6I}myuJDELjZVy}y3jkZ}1|0<d!3Gj=lAxC1^S1O4R02M&(Q$iOq&SZ~
z$C>Lez1lb)&H+e*asW(O{<01#zY@k$v>>8yPrTT$$I_sobMWYG`(HC<Ca*ro@GTnK
z|L=;x)ifrU;dMz=w@Sc7nB3KFeS(8j*m0Am&Hp}-Iz-?$YZ*iZ^+X%ol!FC5d&NIR
z{1n%V0j??QV&Rko;yuYG^aN>+pZ^a^_yR9>^rhx1g{UNXw3D;%YM-rWyx8u3>HY8L
z(a%8KCAZX^dL}r_+>aiA|6=XQK|SI(G62vO0B@lorECfY{GsRqOy<FEzvw^ELXVBK
zNt(3!u3+A!sEW5k7Le*e=b<M<Gy8cr|J!_gWF&wu(sDAh1#hso8U>h`4A1eHe_xhb
zt$AOg_?lDOmF9Ca(i&yP*k%&{KeF8azAhPah~5q+b<ZUg1)iHt@JDJ{iGP3a#RYG~
z>sgL)gqG{<Qv<+gnRVz0@!u^fB49BbC4)`nAdTE;XS(uZW->-<MR>WbcUbe^ZRl}9
z{2$#Njb>?-Uw0HBWl_(kClXBHao^X26|3z)O>RF$U}XzSq{6lo`rZF`>xK}C+M<`6
zjz2CbH}AeR-4a1RH+u15{KEe}6dW<=W{VGN>c-%sg;erOq>jMH|2uhbD$0gVR(gX@
zD{;y=szfc4(I?Nxz5n~JEgH?#p_N?aNqe|;Lmn;+-QpFl7_?gP#bZiB!N1i*P?G$e
zHIbx6-}<}z7yyJM(EwmbGTGUY{qGXh@M7&9Oskqd5R=ZNP^a(sJN%F%>Pi&orl@ZJ
z>2t{a_cfLjNK54g{o;1XhgA@0GDikr{hzq^-+d$Ef*BNAhm8p+0d4%bbXHlHHdjK>
zw{9X@N__5Y5F>Ab%!Zlf5j*MC%U1hnErt~;5PA?#)?!&w&*D2jtV|q0$p(^q>rE?#
zAa{GTq)u(g(QVb^m;W|e74(48<x(SU$5Rt<A~&fke?D8e@o&k&1?WjVOYRu5hK4`Y
znued<)&4uO{3<WNn0WaY%n+lLT4uJ|R9_TKt2R+!YUpwJxYK{PuA=(l;Q&{(cz!7h
z_wROmc$d+0SA3JKEaShwSHt@~LY?DwVp=qnQECL-1fUDuHe34P|9<K(phk=#_-js=
z1V`%-BcTXvI@`<A_zS~Ke)<C=J%LIBqoS%?t{+dmKuY~J9dr^$i6;8Qp$4z+DAlTg
zRoN5<6yOrLizXm;kF+;t%|`y)n0%zhDu@&_wZEl-b`{hrbOoBEvzf8?QY+LwkKF;u
zkqM&T&+;dmZED$fz)*!=An>s#wy#&b?)Vi!B^XF}G+B=3J;ziasx~-^@2iCWUuaU&
zyrE3lrKpie1o;zKz`#gp)S8iHt5hxR-@~s#y+o&3JXcb{-{To+*T&sjhI0e%f~2YF
zW+mu~t3QuLvR5etA~9S9>EBKK?~7(YhxF5j*^L0n8_kVGKUf>963#EL5Bc)^U3~p4
zaijrJt=UHuej~7hH@NV=a*|EpK;^D=XeH;B@V}`hIp8-z>C$)=aHF8hQFtnJK4i=&
zt29b1EhJmf@PE7Ng*WkIYDXaC+55D{Y>twU58lZQUxY!3{Kfg>>7MYiMBX2J2zFye
z!a+w{s0zLHf=DL(cq;dLpLUpchz_iX<KcMCw*U=9&Az@ulHx~b!1Mp;pHSs&L#H*t
zvb(@>ZNKq~loc9FcWP$_nSSz%{SD|tWD>0~SyF)NeQXUx6a%BH5r=S}*^rC9fGLsm
zDDaiqwjK+YOJQ?pH6IaWj8L}U;EJAL+P&eT`M+<k4BvzoI}Ni0vDI>8n9=s<%ekWi
zO`{D=9`{N1fhA(0?&R=WvwSMQh0!_0X_UR83<$LWQC+t;U_mRvy5v7rfp;YYZ&W1@
zCafaj(t;LzqyGEymte&oe&b>-UC6=uSH*rz;x<0WO!*e||I*9>+P}Gf4>=!$aVEd&
zdF$O9E2%-*>u|UtFx80Gz!0L`th7*)lk$a99l=4uZ-w4h`*X0I`rpIN=7J4gDzmQf
zDJM^o=Z=0@BbU4BS+@-os5j6iM7g+)7n8vPk?%VMKc9zQppzmF0FnpO1xyTK&0qwX
z9U~Bl=XLrp0qtjq#&3r6t}f;+tIaA;|C)W9SsQQCUH8F4+hg83a$~U~#VrW|0O;?{
z_MUdI0~HGHJ}<CWoUXCRdZ^#ugfT;=;QQr6ISBnEo6RPhCz|2YU4WWSGtwj{4hj-~
z^G#PSBaurp6M*nP1H$iFz(_&B6PFQmc4%(q2PFEvzytSIKYdL`JrWUNvqJLvp{xF}
z`6Y;zy7}tTz$=Q*@dO{pwEwow<9?Nk8Q^LzOIIL&zE=o?c>NFczeUkAg^;Ox+ss0f
z8o0YdE^?%&FdM&8A_emTQcSJmC3`{NSj7F>KPnOExt*Q?^WY;|uX7sBegI5*Z>I5x
zxOp!D0Oy>>OSP`p;F@9qnItCl#(QvkZ~`P@WO?eD2ETt3gqTbK)sVS+kjBmuQJPki
z=~#MC;obx|(Lxjv*D&>5=dt(&P_DVv$Qy7lvU30j&I@6eMXk`A{6>WD;Lz#iPQEet
zBd4n`=;Y6*MQ}h|1EbbH_7PdR&CFgPjV7)zf1#4G58pKRHMe!ZK-`qlckEMD?oPC8
zbY*u5g7^1GopzY#?y-{wfde6V$JW4cFz&P&kd(#!m*URVCD)(CTAZjFW>9qA%5nm4
zzHYdN42)mu_v)4c_Xul=<z(1yt%j2g$Qfy9Rsc&O3TkSB{9#Q1kxO<nfIi2UgVLkj
zP_aG{5Emzsby~|0F4LYaB;V`>|E9O>TU}|d&jyRt7km{0!(m;+9|9{6auGF7nSLmS
zyPHC=439&a8<yvW<d6oiPlD+&4`@Ir`xxA^rb60jGk^_PJj?GfL^Xpuun-?W)AKM@
z0Nk61iCsSW>n^-x9HT5xikwPRfvPfJ4-Fq>5uh*O#-0!=wo<_ujND#8zg_W4tkfyZ
zKLMsTl0qXynla;kD0wdtY-2+914M4f8Bo3ho5BE|$5Qksfb(w2ucscCC|Mb_GoVR5
zAi5NZuSYX{1^5H&l(5CmOuEmS?4CwDr@oc}aOlqxnpP{bnRodLoHj(qCb}f8B&w~f
zjg2udNfGr1w9qsD0>!*OEr)_{jm+=5ANj9`W9Q*O*8AHk=sk#DKS{s1EkK$Gr4o}N
zg@B($IEJo9bM1kdFmj7kuj;ev+#L`4#>9`O>#otA9RT9%M^V_#$VL6R_e98N50rPN
z;Z}e;uFG?!*S&7JcnVVjO+Uhu-QnBRgbU&4As2>y4mO0|^jZO8dau{ZN`zS{ut5e4
z8tDUE5o?1j*s;2}Zsvhnph&H@rB~;Bcw;wj>}<tz7_E%b+`RYdijm|@+Mc5uOfSjC
zZQ6ENV+#b=JOd36vORBkjGAl{<*4mX3U3Jkpg2_g4={5P+(V9={eUxjJhFK~Jl5uB
z%3clzy|34=INQ&Du7!a~DEJpRizITvAQd0ad{yZ%8sX#)!bY{GK#Q21^hVu21!RB<
z*sLk$$1sszkcxHK6_2zPg~CRJj{95#t-mP!^>6d}PzAghuXbBU#6?$?za1CArXIA=
z2ErO9c`<cF(Y>zA8Nfe8VUm;P?Z+{fWA7PKk%m1lFr!ihk!Qaj#0pEtBjg9B!Tuy)
zqXg0|;cdXyqN$@#y8y?SK~OqQB<NEwsb@f8lIuk-ZypeP1L7OsC>cUbAeqWfgFBiv
z@MBEEPlScB`KXX8izvrC@4$Qk-bAV<pocT7zogWseG5<b1}L<%fZ2+8bOXZE2Fyu{
z86i6a^h=*ArLYYUUk7<mGpN1>=5TL_kc`>*3b*WqP#e%gSg*SY))t+lO3y0h#Myqu
z9!%l@GLiLPxa#-ZqZ+I(DKNDp<b0$z#n$INYfy{2(O?WHOtnweU|emOGGC#_uEtT=
zM@4nG8ypADntO)R9@93+U>M(*qMB2Z;|vheNJk5GATMML+9Vzc-H1J!<jr-vI)4Ye
zSL)SY@GoGvpmbyzMGB~ZqI}z_z=)WSe?jj|8ax|s^ERPozM>CWOMw3pCV){R$HZZS
z++qu0Xq{)iHR|$QthsYl#KCu<S@U}{?Yqje@YyC=#|~i#aA4#%_+OIbekoE+ltPut
zNvhVu+IT7IbpYXs!EdbExZeRi8*$?$wXchflYsK+UhI-KcrjVs3*#UP*d{z->Z@gM
zp5ovei*r$&59$4d0~Z})vr)k6=4ID%K#cMR3@<T0wykVdU&M{wm{yrGzcT+AfUx8=
z=iT)Np9k%}vQU@HTzQI3J=&n>1`o|O|MNfYTlt&B0-0iHyDnMDqfnh!b1e57Dzx+S
zil+QiV;a*QH4?j7<^@CnmAT_*rN4(aL2R+jFhOG73m^cir(p8YTLxwiP2A>Y{Mm1S
z^tJ^0gEM52DCl0yaU#5I*y5}yp!k0+K#yRzQbZvBWEGhZ2qwL<jGuhAI2#<~fY46o
z$Ro?q3uT~bf7cR<CB3}xastMFzQi_OhXIayT}b|Cd=<u_7JQ@J3`*!TRq!xgz3bqb
z8LiCcdkdRn_a$mfzrdBaKgeDURgMt7BNo#>p4d7k``)F6ZGKA^shcGwx^(Z)V#jR^
zVfHLCt$r?4DptHz3>-FXjgmi$GA%qAZ6(zG*Apd2Ot_-kH>n#bP`tj#8|p`0#A*<C
zXpC|O_mS2Dc~lGYz%bD_uY~5jL-69o@XnH#Ihj5?{#)%&Q)}oA42dQYjI@0~+GBUi
z%%M=~2A{%nO=U0O0W1z+c<+_);1`Jc3wb78T>P|`*&c#;@R{OO(1u<Fg%wuEtUszA
zrq*jdxmIz1@)PrKDU!l^(XNM4LcT@(l0QFku7{vzQ7}3kFLw1i6@;QMtT1J^z&2`W
z2`~mE845j41nfcE4KS1|;6mIfN9ryBKjGUz1<G?8jtDNBOW4T)+x>k-fWWb3V9M2*
zT~X`KlqunWKpi-={8AZpm~%WA#wWvJnBjnpu0vAiJ^#KS*wG@~1MsFQy(sxhxj|>|
zLl~437J#SjSNXMHmys8Jr4pf*$UsRj*`ao7+6D6i2o)cW-1VM%>t(u$j9@f(7Onb>
zCg@RrlJ0@N3Bl>d{{`<$F~i$S6JTtT*bv8lS#^(ql@?9m3E%Pq7eFlDz|cF{cflM&
ze<C&O?!_o}Z(rRVw}N3}S@a(fnB!_MslTv?43@UO|2nwgHqbTtO5wTTbvad8C$^>x
zt|$3MvC6>#Z9qLCuD*ah{0=-ZoGyJQ9S{eT(zqq%Duio?41{R}*&LxK05g69w|_LN
z%tcOj1())u?c+ArN086e6i#kt_!?{l*2cW&m@MQY!6gF@lxqbTnZe*yfJCh!c-Kik
z9t->tFCCiuVbqMo$V05|tE9k+a*v0jj|v99wTs1=s(5o5<wgBE9Ju_D6DU%&)I76<
ztiOf4Fv}d!Jo>>xDf{I0*_EE#`kG%aZ1rTGvTb}LPW3GNT^pb~6i>LB`~lgBlqSua
z(PtU#x&MA{h?OD400t%Bc%QAGL`b=9c%<eYI6LH4pjg-lbLd33RgY!@e}!bs*gb;m
z!~><>Api$+CE(=sE3P}23NU;IPHP1Ty69Dxc8O!5nlV`2Pd}C)3$3O<HI88VIxESU
zDE!5cnz&f`^`YJIPCD+rh{6b(LiNnb!`rL0q;%vh$#OEx1vgdOLq8VR*K-nf5J?ny
zp*~CCBpZ~{+mh*CUHDqT@Xf{OeCuoIt0Rq|VgdSUFtCk3^_uF*E_effX3){5&;I#+
zsTAXT%|G&*Af&)mK8=qtjxOl6F2#U~5#Q0eYmgw#dhmG~%vKJO!Fs!N^&vtbnt`+S
z4D~{vix}e-L3ijvm>UJ-i`IX#7TZmLBD@$lj&mnuHS0wO%6iGqV#n%Tf$v3762*-a
zm&5@cT*D8lt#7ZbVc~8F)ihkXgxXi?cyWWMCmhFu`7(S^<y5uOCH+%3Fm{6eTbcEd
zRRxjw?aPbe$jl&X<Dekr(f-jf+M)f5A^l~puZuYF-e8o9=XRA>Y4xCeqT&*mDl*%n
zpp(UCdnTm!@L5Ec^o|I~FFsA)rF_ECPmd%FNN$-A?6?6aa*|HKAmXyt4A>u@?~iR=
zUu?y4Mz{mh8(WRr%0tRygv<IYNL1@I6Uu(=7qsiB_0?~2(}n<pXb(0VJWk&3EGX`<
zL@RKI+y&-Is_1yCf>=cqevChTLVLtO+!j`b4<$0++AqBNK20q>d0!)rd0<4Q3%*n2
zT)Of5!^Xp>3VUO-h`O9opOXQOFPZ5WXo<?;OC*KrvSJkmRrhi9o?q?Vr(tVgi|Tg8
zq%WmBWVez}h*5qbv5hOf(gVrY5DUshFI<DSBe@*Y8Ju?(`b9$SBPlTDPoNU|agsaF
zQW5TWY%FtP`$fs{1|=m9qz;vV_<`kDK?<_vmezScPGYJ`bhCL&P0^xPBYMX=<ch%j
zi!d57Be_SyPNG_B>%h0fRXeZcO(jTZi$dAy9_+eW6>Nbp;^oyq=lf&K5mb|#IpJ|<
z^D>X9D=ud72^sBvYRm~Vz((CM8ztb_l_Fj$y{XQ)iRIGV*{{ratg5MC0womoz2`DU
zc~nk+P-4MBaa8%d{#=>#AwEulM)p)<WfO#1Hw5wJkt|`F?-xbOnd2)rP<!2-yUjgJ
z?Q>B-OmwYi`)Ggmep|OKwQ4vnqsEW$E5~CH8~vl(yRu_ed0cw`m*i%FY+mll4Y{*p
zXyWy?a_<Qj>BnFY*gN^0;(G#=dt&MW8$P>7(_H;Vq9d?sExb=xxYR}Gv%?>cEq>mW
zrfB>-Uw!c0bHvnzHZ-md*lpyOcDRY>#jy>YGB7e;_K@UOgRFssz~Qh<Ll}S5GB+cs
zs)OxYZs$IwZM|JGMla@pP0tdYrWr?M|J-b&rG5LUGfGO$3&2BJ)KX)J1DTEP1cMNC
z^{|eWzyd5pjr*x9UOc?+NQy{=f^^uM^!+q_NU%mcI+&0KjMl=HN4#_LR;C{W->Z{m
z=)T;-16*G%RZ(?By=*OewVXOxQW%b6N0n&3@yyQc;U8C`J&*bJQfs2)>dUQ45mioG
z@RD=l@<Wj3fb^CiycdN2XrNO`A2!)i#SLf7+w<{~K5~GWJlFJ(8V0Y60lLKb+Y|&P
zG--jcD!SIDM#op57gd!#`*J@eR{7hm2KP=Xg1oo8f3cZk>%`#Mio*JOnq10{9z%UT
zC18(0CxY%G@^;}aN?<JBco)s@Jb~C(v6d5aisAQ9FMJ<K)nuRCRd&lYPvsF2pi?2e
zOu~=6oT!86rIS%j(YCL8P-c+ImP104_-sPk;}fux=>?`dB6|)XAAU#=kpt-jJv;n{
z7V;5mYk`Vn4qpH5SBoKHs)p{nS8f8Sros9u3oyF6=)jBVNJR&@>-S3rSp{E<Byn-2
zC{j5LQnFt|u@cuWPbFfo{M}68u?=my9YHTyR&A(aO62>}HxM9;59m@Bt}Blq?$FAd
z4-p3;;1&Ijz@19Y#H3v*f0SB1$D{L_2s5X_z^5Lp5Yv)UdlxX<MXS@XZzh?}v@OI0
zl{u-{ugg0iZ2Dv5>#HzeDK?rrGra5><eLw{QhYJJ#8PKmfISk|H|@Va{*C6hkgm`p
zA)Cr+LXuu&+N2gVzQ#{YW;pz^YV(1{>nnLL3~VSNtwhga_%&dcd4NwZnK&1#ly#0u
zqz4#dWD4EMYcS3?c@0RJa9DMTa9~tdOGJ&E!qpn~WY)MeNbjat-Ic=8O_^B95-k%J
z61592Ktlcb21uw>PB7A2T=%yg-jMZ`p{|dfk>oG$j{vznnGTfQu>g)6xH2#k){=oj
z+h`@iaFtQ*I!F#2N1v?6FZJkRDir%Qta8pN<FLh*T>q`&qG(i6!WwOKL-5-D8Kk-a
z%q14nz@QG-63kEf3Nodf?`gp8NkUxq{4Zv|+g-GSmV7k9;$81R$*hj2{Ck44*Ejrk
z`hpAef^L+$_X$!yP>J{9uDugRq1>c}H8rdc_px+mI?|Nz=Ea6^vB%-cokdCQQn_Ar
z&(JF6XVCR1`ExLCp~)xIp`;z&-N_-ptQIpKNP)=th5(t*6NIL-qO;k7U~V=rz_J6w
zUA5{d-Wux4*m}GJNAEh}IcKaw7Vo>7=M&Hz+HkKKz-|uF`T0n7VRgmF&4@YERd+Sp
z0=WvN7^wXaPL~SQe(|ubBbR>rq?UAPG|oVER2rfVJla}^ROsUWfKmxBL@vq?+r~|+
zwN_A!2zw1Zc3gbngwCS#950DLFPBV)y=bsW5z7Jda@zw;%IxKz%Aw>`I|FIO+ISw|
zLR&0(vt<;4w^E+dpA$6`(>LlD#w@BVVtF*{IFaVh9UQ~Ll$UP2`ZY4*T?WBEmEoj+
zxFYp3D}rrS<DM4oXN=(M^wo=s<;yi4k~P8Xo<sU}z?>YDNp1WzFNV^}GJ6bU*x3uS
zURL2&`41IAH<4!Qi`AEp$x(urk(u_@$|o2IS*%ay73?h~%>)h?U}IIcAU61X^B(5e
zOArGmhu*da_AlKlPXjrW5v!YRF7?4HJNl5!c;#M2Oc3vif@c4@ZO^Z=(QyI5E++|h
zFPi(wTm%zUlrQF-L2L;Fj{b%in<7!5l&>8+f^(W$t5rqAoRG%Os?T_0{(|J()-l$m
zkSF{9J?U0Sfh>r3pFJYwYNurc+;WxTc>DL*SI}0io}bQm(?gv8gcl$N`=7;)As?q_
zY(MThbYRjiI?fP*n~QIdS0;yCZbc{2DC@qDh`7`kt=my!pBVwk59>PL4Q&g@C8qm^
z*RBODOjY+EKKoU<F%z3+2Wo;}bhKtA@9U=-*g(J9wEajE^vb0+Z4X_$y)}0+wMYC(
z_<ffTOV6K!Mzh*n1>q|OSufNI>LWJV94_N!iBe3X7D0{aG{$RdtPLY6%$FOyOSudn
zzVXT<ikN+c#=!H9PQ+0eH}?=t9c(A3eh6hZ#={b60|AQB*^^B!wtio(LF3ZkPdQ`?
z@w#ygEIR{txD+yy>KsZsO;Zn4GrfL4<_-xumRZ)p6-OGxV@)+U+a*Wg5o$dNrFlC1
zkHSwukJ9BC!HnJ#W&HY|X&UZw?GfsQ8Q0-VHXZk&C~G~};{7?PihHS7oA(|@G$T-<
zliDe7t*IE<N%S%CL>_tfnI0ib#2`{HZFt0ub)j-&^>O>oz&-kLBaDmk$XK7+xcKIE
zd_Ke0Abz9A$3L|F+2t?ac#lGu!ZvT*{t+(LNq`+rCzNR^oYnEE;O=LBnEmE$5IHmt
ztIX?Znym?XclNMjuH-iGA3`IXI0+v)ce5_#Il7xBUdl_Hud}RJ90#nWV9_5vWYiAh
zsM{n=A4&eH`u2|*IZ-9LRC}|oM54lYj4gKp6JPVdZkO_Is-BkEDME|K2AIsITnlKU
zky*f_<#;|%7SsE!i+vXQ_EyvAmmy=)@Tok3_a($@PtGuuo3)@<kg$bukY0{N(huHY
zdJ*6b?0dM^q&c7_-jEMPU{v)&f}>#U)q49ai&wRYyXQ(f|H<mpra+k)YZEPr67L^z
zQ^R1fPk?-QqNmSR;oINm)sOM?fvIeM>=3HMH-y9Xr?L3kgWgcnUFrej8U_DQus*kz
zQt>8(?zpDJn)^72rTClSTjm!&|5kOL3#nQx68hN!{qq(8F&<O8IS*)BYqjF2TFr+<
z`ACOV5!ypJP)+K&*CTS;tJGf@25<Xs?<<>fq3A+~Xp}-B56CkeiJ-yT4|eJ+g%n<d
z@cuyF#WaZStKXW(*P9S9D=?S&;(P<8dOpUw%DOpY9|WvqT#hBLmu{8MDZWc_HS>rG
zAWP&T2pxKXX?}0XkJa1`$3?Pd%lCJfNa9(RX<T_Q^=%bDe+%7=@vEx#zCbz?Rzy$_
zg{d=oS`gw=IB8C9UC5T8V|<k#XHSkcCwH|~dAW8?pZWC{%=7~lzBw1=Jpa51Mll~3
zm}Z3XA96Z@IS_+g6;8Mx5{VCGR+W!eI<k$hd~>9xC=|4mgD;pY`5u3cl>(QBZw-98
zPCFPNn(Dw@L01>HGvW#6>juH~@(CY(_E#L(Uhq&;jhK~ozaUStBT$GljhP>mu&))m
zojZ)gC?w`7W`wpy+tcpxHRW^zV-aiEw~co2JdBd{Y?b!}jj(Y@O=k<|o$?{+_RVkO
zt%n_oMq+y|frZbB-Y=7&8QT5Rz*GGldwwzYd>WMkbI{GjB(O4N#FgREh-on=PqK1z
zcHq%y@11^u#?szXCecCN`Z;>e`>}js1iLHA&wlO-49VRUZF)MD`De_(wRs;YhQ~86
z`WZcv67Ns@gnuVB+?l(7o;r?37j%kzLxbsf6hdlU<L3{^9cl*12jinPes0Qx5R$bk
z9=ytg;dgnqAfY@)!WUHfffHz%i;#h%csSKD9h>35IkQ<)Yd66i(!95gIQ$zIvNNRz
zx$py_kg;SQy{W8pA-`=Z7XwEO#-^gA6~SQi^OxseJ}bz61Zus!eP!>aeHX!lq5nm(
z0ZLAs)VEA+lp>OQIVnlc97;3;veJ)UAV_=l1HEgj9YvGT#G7X@*Uu+J&F<AXEGvK4
z;d_wq!Q5mU2$(xBKj7vduRTJ-)Z(rQ#Iuyld>G~sB=uA#?tk6<l_vFk{<pwYua`#E
zs$Jujn=&s;7?fg$@0DLSPg>2%Hqj0(g|d@S9Q&}RpY8&2wrlO~d6nbKcIn7y3L1yE
z?#ah%dg^Si3ls+5Voy6UxzU#t3d=<HxpK5(XcIXNj8$%P0_(d67rPrxS^V*`fsXH1
z@HT<eS|1R?E@t~)*&i7JIP&T0(U&sn&<XrQ#(c~5<%mG`Cp(IKqC!vDe8&?NrLV%d
zQ`j{O(dCLzK2d{Q??MG*;y5$n(XKfB`~n&UOVkTTDuQtuX)#AOc@zR@Ab)}bE*1Ot
z1?kgQ{Amy3=tPsR%xcMshSM)VmeqlgiQ(rTh^K@{U7`U;pw^5>|Gpvh-DVLCe%3h?
z%t*N^d{@(>2XqyOxAdjaM!n2CnO|U-iB*1?!OKN^SBX1OWOS;E<(x!mR>Ylt#=xqN
zZggT=CG{%OJeZZ!$W-Q%{o`-bf}~+jm3v)Neg~CGQo`(b#!*sQ?HdZ2Tt?&Oz)3Vr
zw@6RoqUymdSDhFH()VW@t<oBECFq>7c-+`B!OW&eRp<Zf<q?O)zk_o_e{tD=05j1R
zmt#oGOV5p0_d=I%f_j*XiyQkmPnE2Dpk!8AkL-KdawhNiU^=e5YU$sD>9{jsyNjVh
zt}y|n4;g&Jf^NL}*F_R@4i~JxF401m90PYf<qRMj9p*scPt4}2ku4>fdCw|L<jSE`
zY@}K#L#wH(sxTejg+xuVTJ;k+GV`e}xvjgJKJ_FQdDzxQlU|9DY19rqG*p**E<i$|
ztSit=&ZtNR<iX?lf+v3c>LJj?I1GHSHlVzoI0FTK%$tqt2^ZDa0sd5=KKQx+Cy?=b
z8dSk8gGTMsJPJ*=3oKPz>#Nq6Jv$ID+A*TnJ8M8>Po<^Mpb?$K5&BB!|JZx$sHnC+
zU>L_iK@>p{MCnFAkPzt+q`PAzR0IL(Zbm^=kXE_`K^loc8UtwtNy$+J>1K$5fo~tY
z_qq3ZzV-h5t@W+-{&zT>z4xzk?>$~2L~<1+FAhLa*@ohsK_Lyy=Q$<yZdDgJGnTcj
zNcm6*_I{s9P}fFB)cOn$eNMv<#G7))FYQP6tLDv)CS<k27)rSAh4^Qz8tw6CNhv7V
zEms|E@6;pg?6l7qax0oVj+OJvXrs7I$u4@arQSL@_ICVF>p0ijw=Qhbs60$&zhK8X
zJT86oVDmb5$;95eSk%6x$tQiYSsdF{zq;=?JLcCN)Q}LuMAz=pDmm#QEW~>2Q^RV&
zY7g1aY+#h{K(^z<9|D9vfixudjk?uAFOsfNMiX{iA{$wBV2hD|3+I8Y#EMtbzQfQJ
z6^3euF=W%qm8SJ8Vq-HyN)uA6J{VpZLcc8GRLa6sS<<z*&_^jlO3ydtnee?+&$axg
zhRpVb7^IL)2@jYgq&ZO<o~QYu_`WP&QbGs{)z<}Ed2beTc<z<`kpC1S@hHo{r}qhC
z<=)e5sd9upjh%<OyydX!4DRx1lJ(_)TpBpZBigx*kv!ij%&kt}WTC6SNVU8_)hIsL
zYUedye{tn9J^;HDkg^=MxsegWBoHvKCK*QKV7KZ!Tz4Z&>rjQlwYqMicz5mm(c09F
z>Kcu)w=ZRbHdC;7{AaxaXRSlj6)o8>`81zo)UKb`Z~|RnpOH>;&(F#;>H@0YW6;)l
znGq800k!ld81{@`<sa;asOB#(>93}lJ!d>z3v>EZW`<<;%8zm*r=p0U|2@=d0E?BO
z)^d)z#&pSF`R0jwq~mYWpw*qg)uh{v(tFP{c1eZU$lU2-%?-clC&x&tt;0!{_<LvE
zv<SH}r<c2>YRDQlc$><1rvwhC+;AO3wV-pKfi)hExVN_l9=jjsx!t+h#KWv|DPH^G
z3#D9ho0JEw&M=3M%675c?DToyqQlf4bb_SUUb8lN3PvEDWvijA6whg~rC_0NWvYSO
z#aSHTuFzaWiuMJ+q>VWHte6`HR|GV!$;*%Z6#K1iCG(8bq6ubH{cb)ZU4!#*-6Sq^
zUxV-_H<&$JT8RJad&d(W+uZRM2%)sbhs7!h=g)nLHB{uPB(dWCfKO)IBiaPJ0g@&V
zu>B%|*sFBcg|u6_pAw-y!fXA1j~LS3ZMU3ocfVzF#AfdpuY|tXGnK6-oeYabI@UZ8
zCVYJ_gF<8ca#f=?)k&1Uy2#cJTw|gS;ZPDT-8dzS&zAN5tc^K;(0rb6a`&s)Wow@)
zu6r5ZSRcnwc|9GYViqJtUCZ$DG^jkQ<{GJkj+^Uqd)4Ni^hhtI>P9)qyuHQ#=#;$d
zmVBPkw{*KLNT^%PR$LR@yr`nLLL{@t+({MHv*}9ZiP?XR?C5UDFwy&G_mS;QImL#4
zoZJDD_SDn5^Vg$_**PQTrq9DAW@8bsdDrQk4|r5lr)UogBDi^+xYmENJ>#Xg)7a1O
zY+D&(r+`IdxKYD`L5uW?OBAc*JpZ2eP94^)e#ftAr-t))cs`z7TJ46h+t_F$i7>18
zWv#GatILF5HU3(EZ*H=Um#PBO;SKF?)e;<cqxbSN@c9ac(Oz3@Wf(^ho^OJkf#VaQ
zL?|D!C47<IskV`!`U7E4Bl`dskyV?!B~=)b*a&u>DB7momKHX~^=CCVlo}*SNE9WQ
zHNbuC?I9A`r3Fh=@j2?)cUv4ysr&wz9e=HLlMI}Rm<V@aD|W@R8mTFN+OfScTcN}9
zi(n^8Qq=`cqHdnKT%m~z&n&sCQlj$V-qoJzx}c$pT|;%UdvUTpLr<5cH*>h8ZHF9m
z_pT7vs%N(w$o$e+ou`aeFV*rb)DiEh>^%L%E^E-q(qDP5Skl@f+pGQ;LrdDK|7>^a
zkA?N^>ZXPd%MMPGg=qDx@~{Y+b(VlL(s|@yX|qgwZC%YK9t%@XjTV)Zmz8P07)*6w
z#<p#Sq5siN(2>bW#?8c5YV6@MRt9@2e@`5BEnC6=Ys$7f8*`H9(UxHy`aDJ5nGKe#
zYRAOEmyI}NQ<eAlFGerJ*{-N=DHA0zl)PNb_=p(kgnybsFcGR2F=mst5TERN+v2cV
z%~<zvZz-G&Gw~ia!Xqeo$}kGMmxdkT(_Ku=10TvPwssd<5VLPCl$zU(ZJ&6<&zIl$
zWiPW=EkS*o6BDrF7H$J>Pi{HMwA(x-j`7iS0V?KE?vW<fGJ5#<Q9(9wO>VE-0uZHG
z_^8aRNU^l8IaSy-)bsh9YQFcLI&^}&tq+b3ZH=78+GE<(eei=vmwA>cP-q^Jnmmkm
zSX*S!%V4VmnM+lt9GF-~uNs*(vCp1SAN_LRZwm{1$~u_oV#_%y4-m^IsSM~Dp?RHi
zXqLyFk}UOVGF?f!{7`v_C7+QAr^94W|68;)N$I5Foz83a?I!v`hv%OKTO9^`S0me|
zv#Tvcpi~4VesWULQoK;Ff1)2nbOI)dx^_#2?#^)2(#Vc*Z1b02)oMS?R(*|JQpU!A
zKh-lQEEv)50B`hDLp`%oIYwXn<FhIS&LN-EhDr!m=J8koNq3iF2hJfula2oJNVEr`
z^L@e3lN%E0=Ze`8mVvO=5DwdAnSK}7#-{VXcOp`2zy8I9JgFI;7UA^u2tC^6Lk4(l
z;7SEr`E(auIxBP1-Mn*VGXb(Bm4g`WxEn08OvubdiwK<WWkYdK3{M6CdxCiMajejI
z+mZE&<y_k@g95KQ)^dtZ$(*ht<v*G=**XNF+FH$1nJLOBZu5gFT7jl@Du<sk*-wi3
zg}!Mn<(tA{)0u}|QyP^L;vEN_j0%{Pho%bBex1>;4tRu2)#4U%wBSht3*}MFvGL`f
zVmGU%a>UbHNmuJ_t!=F&&5Ow5^kh8@--k&r-EC1b--w+zaL*E5C{I_ens0<7=brTT
z@||e+#pBuemp4#Ty^qhSNbC1_IKcLMxJoB6ZX>yivv3lM#A-wW=Ek57a~<rZykg>Q
zfUl3NyqVq6Q?dHH71k}KhYO{S=-mciT!E-^Z8L+n=H7@vLc(38vEmR`2f5CJnqaFW
zf`n=zIf0_Ls(dCa)YZOW;q<eT9p_02;z-il9ZmH7^;JDKN`<5p#?{S@cU2NFoMDG)
z>>?(U6}#x_hT`4#O#APrK+Da-B}{VsuFpC8_X=Tf*1<LQ{ssC)YKrKh3hZfHT>i<p
za>~bXw`$#JLiLY?x{k13wd&#$>@RV&zxZnJaF9|k3nn5G%5pA8KSEaLnE$*lBS1HJ
z(|CvX;S?<F6zk0DBM-gj&&3RIntaU6;@8W@GH7pZA>-uf1NvQ`jrx~}-#D9p7|KnN
zq}5c&a{>m>W$qQ_%Re-$z$kx_m#4D0tnqVRJB$SmkkXayw|P4E*5@?~bCOLHf9{a{
zDNc^$^XE7H{5x7!3WBnh$6hSVMF+^_tRN#d0b~eEGuZw$OAA7-Zw9eR6rpQ6JljJ$
zLCwyA$>Ig6!t1Tusc3#!Y(KtsKOHs`7GXbJlBN;$-7EP^x<a~S1CJC>6G8lUW61=+
zHX&TVhEYg5*YvUA7b>M-AR_lk<FnN|nlPgaE)Shu=D6d01IYQCip>Hwe^8+AD?Jqn
zVBBi?&LaJO(jLwpZvEWx@4Wzv%ljr(^^B=!>5}8M2c3Kh$Xd>5k21Z@CCXVm-D>7V
z8&WwY`2?&0|9bJem|%Wj3HR{l1&O{Kj6ib42Zo&*6W{rOmuK=NlO!s(xMsJOsiIRS
zjY+wv-;s+g{|epzUN9uv9DkHPUCZ-O99@6Cr1YsC-MpT3f$XC2mkS{g-}*b)5x}f8
z5^twP>hT{X#xUlQXm&8$sARa{S5~XArK^;mJhZ|WO9`Zs8vDfTc`hEql*9`HYcowd
zd!HtaHqpdaHFin^iAD?qeVWkdp(ihM_tUN}eK*)C2>TfwApKwkn5`2qJ(`=pJJavy
zo!X&GjsJ?HtlnFlu}LgpyPG?1vEXU`V}M0jn!V_BDE<io3xmZrDoi+Sz%-an>CU6I
z?{1t+@g^?hX*#b)waPch1_7)CBwGVbd~rWw>!yOUZqT>lA-9a$m!sEtuh!$w>ZeFg
z(?%&iE1E;4XzLNPQ!Sh|ptH!d#VbqSq|6c7i6gQ)m^C^_0*OTpEP%gblv;nO0&PlY
z+0zH-q!mYolm;&+^$}{E&d6vEhMCzvxawiVGkcDOnVC+`gf5)=-D!@Q%2!!=xm(Z2
zMEKCYLD;pe{AF{X&di7(H&HB#Eh|^_AEhj$z9wJi=$FB1r2W2BA`*~g6cC$>H_@kX
z9bc_045OroKKrYP8a4~}3_#44%P1T5pgq)n?`CD%kEI<87JP_LuqyM-Wkq!yapPfq
zWq_vL{$3--qJQ=?i`?6Q#hS%~h1#3qm}J+0h1y{XKslE9SGOR7AA@2k2fKfoo9d1B
zD{>g_NA_9Sf{#XFV>d&ar-|Y0ajUEsA?E-2Rxq*|&C`<q(qr`NeB6E2<?<(Qcm`WZ
z>O5D#q%}L4cq-<<zfDX;^bbTk^gkQRjj1H~-_yX>mZ<c{r`nB?r<GSYsN%AAQj+uV
z6qj@n2%#KwIYgAdmbs*nsrH@K_)@VSwYsJIi4*N}W|@CL{TmM8MZbd;-}@o<j#2p6
z`#JFbTOES<U-Ll|quv8#?(YO|KwAFK=R~M);C;3i>db$?2HHvn0@alhGyh=pXIwUT
zueb`k_+PU?sz*Z+9WimTgX`b3gZGNG;Qb`=)&F@-62T9|r0#UQ*1v#(h#YYZyl)`W
z|DV?smym!?p69tCul8@8FPVY&zO?WD=QXn+Iyl&BiBVhTzdvUdtOf74u4(_zYlP(^
zK<Mw%ng0JZI}rW<%YZ+m{r@uq=G1=7fS)b-ZAdF%zO?unSleb*IGmk#{$ck&%3WrK
z0-GXhlyB;uVmB(e!*k7|v3{D<{>llMG?01UQy!B45{;)+%7U?iIe{d~PZv*eP#V7g
zQpF>k&rnF!7KS3Dy>>~awl;m(^3Mq7?As+~bwigTZY$lKqGLh)pa8PQVpqZ<SdJj1
zTwsDJu28i6rD#t`riHcXf<-PG2XYn!8P*08@XR<pk`u5^pkQ5fM+1Qzu$k1+uI%o|
z_n*d=<opti`@^5_B#-u&^v<bmh}(|&7?qlfE(}%N4n`CP2j{$MBL%MU-H}2@aLyg)
zqO+O{qTqL<$7S>+{%ApO6ap!zt10FviMdxy*b1woFov{bUb-ltg0_u*OfBKsL@)WR
z_N6+*lYi_A>J%zqh?X+KJA{N{P7i_T%ggnnycvU#;eK?5zmJ2P2h)Be@wW-mqn@j6
zY?C?BK@EP`9b`I2Kw~Ewb%bGJBqXX*)PJ<Oxeq?7h{|c0!L#lijiI;rG05boZ4fM_
z0!VJ!b5PD{Ahi3gyuCvC$MUd|RuZ!f;uPR)Y#y=qNQ=}Fi1Kps;qeEgB&x3mGfa;i
zDtw5!mnK4=HrwekzwT2G8gsMH{cm2vpI!t$WwBTU#zjdHpBgf#@y95l(&4qB?><#~
zid)*C+W~;@%*o8WE=XY<9h_-(D=nYr7*xv>uo>3Py$D^|j&CfA5qz21qNrv$AWmfS
z%H7P`7b)Vs8ULcvbz%r7$Hy_({gg-GBqF3F%2Kp&8a`%j6X#jG%g5AD%!G(<CT5l4
z#`Rs_fE8hh0w$c;vw)NG;f3$pEYnjh;j@h&Z7;)+>2XIMf7|H-*xr)!@@Uo9WiL1L
z+&+D!@c5(Rl0*#MAci9z3NkAJ-5$EkP+l@YXbZWX=1xBr6TaJ)*ifTtVm4frGR>!K
zIygy}HeknRO}z)O01|h)p;4OLOFDf@S|3jTfi3~$e$Og%7xAovZnJMQ(Qpzz9YE?Y
z=7XFtKagj&wVzHkyY)P)%+FHt)$tNt#+l2)pF_!LZ|GUMR;D4%!mYlt7ib1Ie}ZVV
zsNK!t{I0o#*sQbWv&<5G!J@>mk_cL$0uB$cv*UX&-ie=psR0)8RoPU9H=|&!EB<<B
z&z767eu(%Z>X%F`GjIyJ1|Qqn7;?Q~a9<E3Y<&Vp8eVhtDrxI7*wsm~@UtF!oK8@{
z;w#mvdPanTeDsQ84||y^>i&}pNO<YS`)?1SwG)#NV<gccd9{wpV|zoFl15Hu)`9{Z
zfR+&sgG&H`lDIDZ{10xbmKmA%hvzOJaXkfN2Cs=w{4m&ss8c6ke&Dn1DOaICRLcpr
zw2Ga0qYL~Qh~nvLPEu=GmcSmz)@-NxLW<sOO1$B<Q)NH=!nIOqBCE027y0a7@%t(n
zl~D~4Y{6n@wu@g=Ub+ofM^YhoEKSWHVKnB_i9MlklD~!h4lM5i5Mb4zfMWq}z6*0;
zL%14|l`>i{UWLJ5^Dr};SU1^SM16z9IkLa;CLZJP2tN8a20xI9-dtAx^%2M$Kq0nj
ztJ3qrrCXL`r%PNHu^aMt<X++L`u(7kP>+FxUeF#D@FX$-lOY<}0pQPk0<6l(SUnPo
zo0^%Or_0cS9H{5V5jY72_+7wGY{g*VwsZ#IBYcPz0HLO3&xIl-nLcTv7%uYi5|2^O
zPL>0V;q95@xqVRbH0B!F$)06AM=ALtajTvn<ofXiKt*-3PWl8)?2Nlfo>hhe3Ei=c
zL{$N?8=(jTyWJkgR7o+6ssuN`2Or-jHq#0QR5E)UJ6?YvXe{uK-ow?I_>S^%C}GXo
zd*f3%&n4SqOMnYCwfU5p43@u{9~?ZLQ}3u5jG%?&f~?k4$CDi5VwX-P|D}Ab@R6MZ
z@#J^H;FTr^$jDw5?O$Lns|FUqNr)EG#%4L<?Q;^_(m`2|lA*FnHV_=8^_oN?lpe5e
znpq3!$5?ux0%*-O{NQYQs|GkWgrz0`SseGe_JVN|nCk&UNe%eSIKR2X(rS!c_;JiD
z8NOtV;ARyNpK;B2xWV9f3X1rxtU1Qx*+fz3Xf|AN$@SQ#^UE7Is?7D70K?e<t#zI3
z$WWnVB&sx%ACp0RjCw@^ETsln>C;?Wqhtp*bwj!QBrwOE0Rqv-o0me#EEta%3+sGj
z@xj5*vTpM1pjrp0_z|HB5Mx-g$CMZ{fcaOTjuDHc?l5>H^~3!#9&Ygq-)A5|oGZas
z2K$bKDNSP+wk07+&9ndqTAI>V6oZ?UVP?75M=8|~x7(G|a*Y8|`HY(*MaWj}XaEX8
zOl$EheoASa*L?k!$`_%8x<idg3682{NKVNq%h<XhCkZGl0!Z3Fce)BpZi6bhrP9+W
zi<8&xa|`>x0KTNT)_db6XE)aC>dWT9pXXox1}KRG4?A6ZI}~zHnQGdx8+>!4d9{AP
zd00wAWo5lKmp0Kw%0XQKQv?@KQH*&zoa8JwGdCRtQU~G$s=1Lx>5y?0O=I{W;?A9q
z^4ZU!0wFM#d)IVt0d`OWcI3}P>5OqkbK}Q07W>#DErW2#_MQW)i{NkZlTchT1}ZH0
zVG7cr`zU#@9F;g>p`5_@qwdRW!hziSX+$W6GbGA1BgK~$f)Pe1ZmAgM|HAMN`?QnY
zLK}Jla#}9H!XgeIUb*Y~^AmB<Db0pogHc4NK`_X040jobtyyMgm}z(o<Kg%?IJsj(
zYti64NcQPxd<lDR9pclyk28M8V}~+rNuD|LgzN6|5W@e7u~GS?l>vf1)j^MAmmAQ_
z3Wb-Wr|rltXfC%KWr&^`Q2JI(*ePGe1WEQ4tykYp0kx4h;8rvmii<&kpQHB%ffI}V
zoqC`ryHO#k$-!fBEqRnF4}=S8*{1uQ-_6hG#F3^Vn498ovY$&VngSok<_4&JY^&T?
zT80P`59Z>wodHis6grdrRvWtb$}mm*fCV}^w{HML8QZmOB;R6lt(Q8oXn99RP-rvE
z7zgOvTLH|EjdnGVp&D2#oI}UtbGXv54y#=3%whV&5h?f(edLm<w`i6p-6JDEp&+ya
z*{9)+*?p<>M^zx+Yjy#G_v_#e(5^GYwWm17a<P2U+_oF-WJ-#0`cZhXYK_^Vwe>zL
z7#FRv{Y}@oNRbCbx~rFvxby=Z14uScfu!cFW(fkQ9BVHJw+eE&Hq)NKQDj@IBh7B&
z>s=XdV<GvQx9qEPj>Bb*eso<}2+wA~)qzxKu<D5u&uviPN$CuUD*EJ&hQxtk4$4^i
zJ)+uFgc7PMl+6JMlVqPrC=bJ4nj&PqgegGmml>xY11jRcrX=s@&vU$$(Y>uK6lHUV
zf0nl_*Lj*62z8<mHt$w9<P0oRf`h+QfFu`11*|0kQyp!e?A!lf-H@f7l<{^oyEb*o
za@8Vo@0yDQax9}REaDPn#O_7lGn;FHt}QM&6>}ggEm7$6OZH@{qn@Rdv1bp0uR5;7
z)Xpn;7GI3*nV^AJ5k~kfnH-cpJ#_&nCWs2AK0@Y@2<Fi5R31E!Hgig*5o5^g;NpE_
zyFRo)E+!Wn_pNTd1OA#mlD_s?fu+@hCU~+MfKeJ4Ilsr^P$97u;CF~9bqL4e!_5Nj
z4&TQ&SAj6DY`{5(|A#VmNGWI<GXd&O4vKLi3~H&4l}~N%Tmcw6p8-0@qdx&f!~DS0
zUXhQDHh2<^C!54A4oe%sN!%%n@7?U9iFkyxV85gBQF#xm{&s#1Asz|_@LmAryLT0$
zuZU3dAg<)tTWg$lk{LsvXax67^+@pRy)4jk$sJCyyKU)@dK=32IA%SAM-)Of8Boi9
z!NCDgs~l!euIjn$tap@@wH6l;V(<tCQ#+$XehxHWlA52qgPdLBdm!>()AAO8CDelK
zlA(Y@_zyFd=L*P#om6|BBbXYA(voujk=xHm$h?+ifbV4mY=Gml1eNei_I{1m9~9Qm
z4#A=9PozB0B;_}ce+Jiub!~mrbAZgNHi*rdH3!*&WCHaZ@VC9yj*kBFrnu9$q$-yQ
zmFsMEtxg6ojwZ_?yfj(0AG7)E8nlozFl<xLA&k`F9k!x$X1cOMZ`hBfZN>uAX<w!a
zhuudi-vgV&x6&oZIRu9RGcJvwE4s3V&kZvDaTj|aN&1ZqGX+z0z-!f;Oysn6gXmO0
z#L?!}Q@;*VKpJhY*{>uar0J7BU(*VM)9+P0YM1o!$vGBRQ=pQzxUf;ke^fMI28JS<
zUEuNHPn=wAPgDdIEvsO)^K`W3bgzdBO<zm$arK4`gwu+Tjrf30;V&5bYWH!l*8~!L
zEmBwHE&E_&xfvSo`yR*eY`>6za)#z>K<{RuAcc(5*j2laI#z$!H0kr!<4`H<<m+#m
z=|9DovB`E+#q!llOv&dQM~2@)1hpOvsS^mTZmF1S{=S(PG+g~{UQW+}W}Df*CkYXl
zS7^VgzV0LIuG+<ttuA#Lq9)>pKEOw>y;cCgrbuIaPn(sUZZAvs;0H_ta}fWj86C_9
zcH5sl{LUQ>0D_Z)@-aQhtPtQ}mrCX`$L~*}WC}Geog2(%R<7%fuzr{y+p<e5RuanN
z(lhfSRTrnX-4|{#Y)FTAPLY@mge0;dw+B4m)XrRf%UPZFndW1i+=ay$`d#M_k91*r
z=afbd&r(LHFI?B9gT{Q<1A@Yp*bj(b21lXi#YpLL8RM7EsN{+e-zCL_-#E)F*aA`9
zYPgo266Yte<n~qs8u$>HzRD}&5FNuWCi(%yOqQ~1MxPFP6_9=zEHnzgp8H{DXEB;M
z?Yvo;M^#$MPcVpCkPc|4Rjr%?^qk(D`XtuOqnfeZ8xtS9L-n0z`z2*0{bgr*--$ro
z9?s*y{ZGqxC=CE8D2Wn3&7py<M@CtBM*X<TTknAH=>6H2b)uD}-^p;~+hq;NNX7xk
z)EmR~A&Pz`>k{R*J)=_OCU~%!{bdEFN1`+ei(GOY6luv@u5}<odk(vB0?GpqZ3tX?
zQFR{#881rw6o*`_iS)g~lBMs*D`X89*H(xgCr{R(vYDhHmd_5xeWiWq$D?{q*fe|o
zk}tKAp7K5E*!2t+QSYec#GlGztL4SM()kePWrhHAQ$O=M$VO#0mn0OrUJ6u9k6?%&
zkcas5{pYnsvrL|5HI_Vcx7&h!enzz{5L|pvLR&^YNDrRRatMaai{WU$Whs@P9i;6@
z3TsBE*=aINGF*sQGA$Fub<2XZrPVF(HMC+csEGAVbU-m4b1$<XE;HxQm79J2QWT4z
z`%Tw}%`S7AH|{51{ZlsB0%f>)<NyJJR2Ri_8esoQCZ5W68%;@{cS#(49<#V1IW8hy
zatGP=EathCcEPdkaX}sVh@qE|{Ai-`PjTdq6>a*^*A2=8_tpP!s+Zci(u|9mZl6^5
zxB_<;XcC<+xZoB<OEJyVPhwNR2I4#C6CEqwoOTV#CI6fMBoQzERAin2<vx{dNC9Xu
z#8bx0{k*Lt&8Z`lxzjE3;v>`3a>}kn1&b+gO@=>$lV<)$Z0v!jEXqo#iUC4EMS0Ub
z{D;hw*jxz`70!)dzMRWr==J4a;*ugiZf@0vx<9uDstVi>6pCxAYeE?#N`|;L;;OAF
zpODARK{acpo#p<vhTmLe4tdEk>0|P+$bs3>D*F^{QtoGA*(MuxhyaKmm0$wU6p@4&
zw?y#}na7@xewEu6{~Ak{-HOu&*H*HHouUt0v2uGjX7;CM{Tj-PRJH7&G~1f5mnPoo
zMK_vXIVveo&!SQ^L@t(2R+-I!5dDpzuD};!ng`o^p4R{?5QJ(1iLJS&!vda*j4bCe
zt;VVk<zjI%PxjIE80YEU;Ot^@+G}@iBG39@x5fWdOp8>2JmpRf8G=Dww=LVipw73^
zR-ETMpA9}D;{?U7@qAB*oe>(qx8bS-%~$RhptV3842%nLS`i3-DH012NNrtop>0}h
zK<*@o%5Qu~gwd+Zr|0sk0KQ+gJW0p!b;$$xq5^f*AoxJn0tqKmF+6(aaq2tzrITat
zgXwQTbF+p$922P^>H&WN+&^y+kq88-8&#Xy6q*?6RrR*>*#(yN@${!Z7yZ=#2(HKH
zMQ(_*00pQVyn3kT2d!afIlPr}|8cB?K(m?RsUoznVecAx)?s+lMY;LC{l}q@fCoRS
zVZIvwFUcZFmCNju;Lxy-&$%{cZkK{Sn(#Q}8J$V*r~Hl{17W2DMhE**xugB=`ry3O
z6+1b|&UAucfv-@u>M3|Gyf`P!J%W>a#NJ-Z_#W*%y)alP_9&|kct_EVY<_jhh@!=s
zep85$V51O1D6&rXSYU1+yA$2&;=8(LdAF5a-;gJNbMOkKu~ihM9}(rS9&U?5TYh;M
zLnkd*pb9No0gPK$Bbg}l97e_t|KPbiYF*GSqx3iguXE%Caa;23{6Rp6<n%aNl&wQz
zd6d;V;bjba*j9&s$^7L=62;&Bd3MKh!AC*R;vg5O3B;Y4tWtw%u5NxXGw0w)9?s(+
zP&YPO!+X13R4%qLBieSnblIo9EhfZfMp(PlBRyPd7-w0g#ymn}=!n$H)2h)sm7E{6
zf}QTSxiTIOV1tv;1Hl%*-7N~qOA?|+)KRZ0%=_nPn7z?6*QIde*t^((1(S4McJbQ|
zwZoO6Y%|ow^72*lX#d@0(PFHSKY#q6gng0?_+AkpAC|<hG0z7{_Wg@29gW%AcyiSc
zcDI1op5Z!vdJ{$7y#6yM+THPT8o-1hew+uWrGu&sNfF{l-&?hmP?ceKcTyevSukib
zu=Z)PBif!?zaO*ERBhJ7<9F{&)8>PKlm&+qqdrGH??ABu{+b&o=DkuoD)4+6z8zq4
zb=bJAIJsnECs1!W9?e6Q@0H-@njURc&@gUZngxs9PVV{@k(=+F?<1bP_<i1aQ(GI7
zd<x@apgFeE1=0}nZ)IM79sf~P8eyWE+4q%~-IgkO1v7j(_7Rz6NBRX{^_?UGVQA1C
zD311(=G8$F3nCl=(vma!zZr9ZuI<?iw&XuFQ@r;`GOzZAR+UoixLdmPyCE<LYNr7&
z;}Tc`89@{knLDYzZfJh!RUguB*7(Rhk^x%bYt`UDlI3uT_E<@~G@NSFhh`ui;@1#V
zB(iC;7X{BW0%iPy`6b1=tV>iSOKv4G?3=z_9Y5G^DfYwO)Opr?8{S;7ui`lN&l;fa
zg&_umKrA6|nUUrc!8&r%^GzNnb$bj^PGC+@btn&&L!{kb<yuL=7sm6@*UZkrz>3_v
z{qe**s#+@}K~R@P=cj3h<)1m8Z0;5yz{xkrW5eJ}4Cm-v+;<4CI-~e{c-ZtdB>wOM
zXz7!GzE-guzI@T7Y>+ig`c}r;4GkpW@;P@o=juP+fC{Lp`a5BLD8`4sX6Y3v_~OgH
zIX@P`+<rG8W^XGg8DuVy(Ff24aM%7Rc|N3gcZx%i#rrlySZKk1P<dCc3mS-oX8HRi
zc({ZB8ckHr`ma~Vv)O`3ICnzU?vJ7W=f7u}fe$1yq5St8U_f&vFn|3kOjS_N_TMX<
z0>12Qa1qIWEeiBu6nGokov6{jNc*4ZPLcqZi7<r!dk(O=paYPNzf$ghuYfuMus+Gm
z`}n^W1zHPr&PEB@p8uKn&jU#8V6i8nFaFax{R6H#WQvy$b^hHRfsnQ&0_zgBz4$}K
ze<sTXAqUv6^bRWB{6ic7XE0co$O`;m*1w?y<ll=)I`{Wpfd9*iW5xLYj1^^4C~!N%
zOxhKX^{JMgSqT?9*uE4~=fn4gsiw^tY^|hP^IjVuA`%mUa+XjTnUW|o5RLy!qY5cM
zfU8`>d5uaU`X==Pyk!Oh)N=gSN|`hSNcP`suYc-qmEJC{hPI!SY5c(B78MTfNZ>L$
zeUGIp8R$m(Ss*A&EY9<NZ@a^c{wV1jlKF)Hp$ftkw=me~v>FIWQZMj4>(3y`3r8r1
z4}?i`(W@v={gI|TYGuSd_I94oi4-92VxW2^brFntNI?<uD2VU(8ft=@sR`K9E~uTn
z!>AtwgZ~DOAnAT-KB)Y1(Oz;ZjzNE|BBpkbbuy*O;j364Mc3~6R5a_tQe=iAP5On)
z;{z@9GG0y$EeOZu-$)R!V60Pxy5Y^>Vlno@1;>{hh|EI7aIh6JvzJDMgr{~pWD@{g
z#)yJK;iC77(M5UKs(4U=0BW4uPYZ&L<`8(n2^9jee&O`@rJFzgU=l#GyZ`pQ!VdVW
z`6tlG&&zi}0sJwb)1R_xa5J2E!QO)O2iJF%g4@B(Q{X@0tM|@<k~#%y;q)NhGraMq
z<m~~-K{J=BfwX@QnratJwZ4xy2lPJ^=;7nEH>%(R<1$3RGB2Al&G<+%sOMUNx#E^D
zUj&6Wb)ejd_zqxaoAK~#X0`hIUhgTRi&p7+??KrSC>LW-+kt=HCPr(#z20K|h9TD!
zO!)5th$4tc6H5jtK>7~UdW(PIn2Ti^nRqYFp6idlRDf^`^%E@lIfWNgzLEf4tow8d
zRKVLLvH{wM7o)rC<qa~3P$Phy)?*%5L60dPe4a<+3D`$M@eGu}+<+}X*+LZkcHZt5
zM4N)mkEP}frdD-W`XS~dOPhQs&b8*##(w0~0C0X~C;-KoE+{D+E75ZSMSozf#H-5o
zq22=mzCc!a2I}GcJDy8~LgZ6vIWHiBK}PIE+B;H^!N~#qo;eNx`>n~M&9LD=)A4F*
zW2Q6Zp89+7P_!R#J(dkH9K$2lyDz$ZGfwk!AuOc;RlT=WIT4mqurXMy1DhJ>B>kVw
zJkJ4=oTJwZ6oQ)<Ac<OZJ;w<oDmXZ&Fdh;l#1Cy$F2kd8Zf}~?UdKkF46yWW!z;kI
z2rxi+fzJTA?q+pBA>rklgJ6vq@PUG7p&})7Zx{k6Z;u<30>7r(>v8O-pCS5bniQkG
zZ-BW4BLP<QUT-fj3!VfJo|RC!2-H>34O$S)wyHk}U`K=5Mm*DAP%%5(fsgV-Cyp}@
zpudozSBJviv_@?z7@QMyRg;Xv-a%a!8YtSb%(DYbr~o~NF7%>)DiMkWUVs!ZZwd_1
z%Mn$68f|n6d^yC@TiH^|b7OJN585*63s#%QJ--ZU{VZU$5D9q$NS+bk8765E2PCZ+
zz>GG_n*#asUt5wO{^UVu#RxvOO2a%c1Y4^c!B+<|+9v^6pi`IRJo^r6%UXj9d6LiR
z0+73ggG`sNL`@WQ5M==A0_<vOA$;_~bNXorF`z<1#fREg{s%F3XK?2vE!apfQpw$K
zcOfzU=m;bSWOn(vkcDCd;QMUFnF5N&0tnc>TOe^D25&FAmve%*ya17+TD1qH6qigh
zZPhHs4#+3SIzWifEJR!olwrlL;vw>!lD%`utr)cM{s{>L%fWQh<l0gYaREkuHJU3x
zg{iB2Oh1_jl?8-BUWGRd*uh1BTTv;d3iPV}30j)l02DEjh;@iuWeAV7LiPN1thji&
zWg*WEvK4QrbyE&LX$L=z4X95H0HyZ5#Q;$sG73Wb28Dq7r2ve^fV|&;`pnuBcbUUy
z%^LlEHqTkqT)f}-vr{a99MBF7DMg4f=U?kHJq)N=C8Qqj25JSVf8?ech)BtR9pS5g
zA`0!mWg}&OSgM(?^4)yp9vd@Vz8<w2&;-DR!XP-&zbx5t+fDhew%bjRD!Zm;K$|*0
z0Dyin6#0NvFM(WG=tBKH$mM`sb!r_lfaIV&*m~T<1loGE&+#Dfp4%DQ9gs63yg)<C
zi`NFU{p0|GXC%$yAnE@nT(Q&wuhT6xffASj!g%1ShahfpgSXm!P<8bGGy@`_bN3+N
zKH1)hA5d6@Vt|^-kc0CC7E9{5)A#SX_Wy}6|0#q1XV`ZZZS2!^Aa4o;`Q?^yTDm`R
zZUbPwpNlw@rau8IoVsfWHkP<SdRH|&06E5gLS2*xus1^^tEL*k2ql2O5_cXHZ#J95
zz~w>xBYVT87Ge@E0@7T-F8&Y!SoeRZ<%MdifMYpqq7WgefOM%B$?HRsrl125It6I|
z9q8h^_pyEA0np@WduRbcpa(5D@gB?|u$p&()m$SVk^*ld*tO3qFC59}RsmiUp$tG=
zZ8$-=tQCv^-HjZLHVj|`!qNhj+k}L(?_ZUG><s~YgctKr0@4q4L$z(}puB<;ga%2<
zO79>;Vh0f!3X=arE#}^8FrSmo4p1+IEWjm@9y)vw3MfD``u@17O9=#xZHgD_O#Ua8
zSpxC7w-gX!M5Ys<5cbUq)NTy~SP^d4g03kUAS_?yeNiYcW&?96elZgH_p<+x0&M35
zmpHh&01wgau-OE%d2ZlREBAK7X$5G&UY_MTWe7GR6gdEnTv?L^@*J>O+;+oji2VO5
zHb(-Q)6dS@0rY_hGx0uc&j^-O1dv7qmnZ|8x`Rh|gexG*A^@+bdbJ8_N`ZX@(SPkD
za0VN0rf<p`Q8AY}0|EXdX$*N6!N)*-;O+`Qj{a%>8$(Lo5Kszrf^WV8!}QBnAV&QH
zBxv2M0k#J8p_TvcLsP=b?`s?ew(G1I-=|HXl#q)7!`=a1m%8v@a`E@xR!HdyKuYm0
z?z8}%s{oyct#&jJI$&!lJ2<AoTXLtSJpUVzZ;axr2xA2WKn%-QfG`H0b&6EXy-<hS
zpaar)@N8*>h|Qb^{@2WD1`;lcIUVqCwp=77a~b4zH}Lx5LmX?h!wn_g;9ggYad1Y2
zmys~1M4PcGo}o96AF4K|eLST3uj1R^TSP$3?^qJLkePVa<k!<Fd#+bM*CTVc-9TK^
zRkJeyy1X2K60gVWMu9i}1|ZBPod|Az2x}sYHJOdCM|w#a9iC)d3!Zw7bfhcRJi`fg
zh&`W$BH(^txYa8x23`=Mo&#cV_E8qx`ay((PTnZMV(=~mJnrPm?+Y*j>2dd0taH;F
z_??WGkIGM}1of<dj2RUGWOx4sr~{%5NJWU*ap%#6uy4jN;S1hB$t;Yew%<pANa`JC
zNY?;Vs(no*NhuP6epxw6=QaZC4d_l6kGu=_y|^1IuopXJg7F=)I~j{TB47p5H&bV+
zu{blVLH+*3>b1sg`zEw^n>=G*;rkQ@PDpw69LonwkFDxcV9U=smD~o-6)0wsj!pi8
z4y<Xvt_fK`?AKtlF0;BRqcmZoN*lDn738`Ooo5o$pJ!P0R)Wr0SgRUHtC$7i^MYzW
zK<6wxS7q@^M-SJrhhsrVeWnb!V{pouzq4O*zz+IgKos~8C?c6?SqL<|^)U%-0!Ndk
z9KGts&S?+|O7R6#Dq6mMzng+qW&<5vO8q7Zy`d9y2j8Oz#_JY`ok1fZq$8YS0%LTc
zYbTi2PE`D;1xPmL2N8CwEcUCWO1v5t`9_`9IV?7lhpCwe-<SecXpXi4L{nMk1Lm$*
zdpMj2oMxn$fHG_@08=W_AV2U#Aefq#4%qAQYOL_?L{91+e04fdTu*7*OKCa6TO1X(
zU1Op1Sp&zY{1ia*ZT{>GrfGIJ`utw>pVJ56Hc>C}^9mULz11T^lT+hh)2#HWN&V4&
zQxL{W;09rRAWx>s`?qjLP0h{|rh^|E8S$-{gDwoU?7?T*@tW~FM{KeiY%Rs$1Xs^+
zgv*o@VaUk=cK{v1nv&V6!CV@=T-lYe`#ght1)OWm;1$+$Y+NI2a_;rzUQz^NnFpcs
zUt(Fr!@P$*+7EJSbO>6uw?O*ucm?hJ=GZ-d^i$S<h-z;yt97O`Av{ld*|w$F(W6k_
z>HYUbnV(cLcd4}fcf*3b*E@_F%Rls|tqj;BE0k?%c$YWZDqm?Ge9>CkeBFR8#MdF4
z_M(n<qp%e8TjnUzalvJXP2b%_eZW<wjig=lzkq%HiZg;XG@>Dsa^={hlzj<iztvi3
zL0Etkizo0WZD5u+VbQ|kU<q^h$VHu~Z-xaQb+j%^bKbZ2No{Ivf!pCHi(Rr5!K<ya
z0h6;A!J$G6EOH;a_Ee0T37^F@I!5TY60j*2v}t{{Cq8Nx+;iAMm9hKf>s9N4E~UU}
zCCwUq!vMct&O(!$mQRP?3md6lbPeNJ4RRp;^p3Nt!0@E%f|x|rYzGA3+o0X6EOe2R
zjS3w4G@=&p8|i$ZE^z7BZf)s)?bN31(Pk^RZ{GvMld;E#1BGUCOMd~r*_SScTE$=@
zlvUQ&xDLN$%oQec)nF0==Mx;y!G_9DwL=@r@Q&O;%cX>>sf?q|punC3D6>v~wd5P8
zbHBP_*+Hw1%Ms5Au@l;g@L%lg5@l{z+xGHr^`ZucP_bgB%(|<eoOYi&aWfw6wrb_r
z<mtV%L&Tasa2b%@P&!n0y4a<k7_{khs4=@w$d|?Cy-<=~&}mWf|DyEh)2kxUjj+RB
zrmHe@Ph{ThmYv@(>(&%=yD9W(2H7}+yj+L}XW{}bvLF;WXZ8x1=oDf-9tW<|2Du17
z=3{%gp!7JFOX??SN_0M6NH(l7L!N<{-EEG;17rX%fpUp%*WLhz-o(=Jgv!GHl8vli
zDeY9rB_Ed_vJL3a-4rx=%s2SnY$P~{_nSjPpa*xJP_qLR=cu|(Y#?KUC*z*qw^K`D
z;HO;_JZX5MA~yCuMD2elndtMKTs`PnrN(YMU=6JXZs(3oXTSwb?ucdITBxhvG7GdP
z)^h*E#<bTqr6Ay|^D8_^{k8wJ*EO=E#vjB2+>KMbjZ^P(OCCt{=_sGRbLXg{d>+$t
z(s2=3CEzU~8DP)s1l9oEoC=GrU$b89lOMX!UfuFwZ5{Zam8Y6jjvwdKH=7O;j$GGI
z%C4QXA@=Vc_SabVI$FU>o1tDwTGYDdw+Wj|3vnz`*Ug{p%T#3<>Da~+>VmxF-v_N=
z2sNxoMuDH+;2@sCL3q%*z>8yXj*#jg*OT3GlQlY+);hFu>JB&Pq9>4c&9A;XG+hQ8
zbSV<Ufca-6<bZZ!1iN6~`SMcV!NM1(rGA`9%h~v}Sq2VsJ?b2efb&GzNH3bSB>1-{
zq}FMn$%-A%jpb8td0f~n!r+Uh{2FKddV0Dg_)SV5JYyRZFdEKjcUzM^nmwX_GZ&Yz
z9vvo?)Ap$u$HtZvxOO^dZF(l7HW;3ju}+ua;<%-g@xc0vaV`(xNzmGP_4o5sXzm+X
zR-fAGP1t^e!1iVKs_(TSXdBUg1tiU>GHuq=tqm(4rC)nGNMf`I16us$Zy(7-11&#n
zt@gB|afXzC$xSxs*A=X>KeTWWQwSd-X~IwFEjzXnTq?d9bVDzsT+mK&(#j|f9CF@a
ztbb{eFasT|#dLIUC9Av9O|`kFEa0g~yTOsRgtIzZ?VY=cQG1DBW2i@5@g1*txde^|
zh^-dXKZVOgo|fJHm@?sb;jwkyd<dI~m$hB%*yJ{vRdUnn<=X>to2%gFiA~8=QP&l4
zx=gFM<0USsAIn=oWS{~>WeX(8^OXI?AQuCDbD6f^4}J<9wlsNvj(>h^5Q`L|X`fwF
z>LFYS`sI`=iQ{+jQ7f&>a?e-Fn6(N~%q^aNYd01{!LuCL7nfe?Gr!;qx$4OX{WiIG
zHAl<99cmE0YM!Bt^1i)T%bV1=)ny~IS@eM=FL;0!d*Bzex_Z%qFlaGRH6JIl%DD5P
z)w))N(574%!lfs9*m|_fl#*{)8nA31?$x-Nmf~^mg1!~IdOvkSgh<Dy)-d{@nbLz-
zQ>gR)mM-9??WWy#QTTVziS=%l>JcA}A%+Eem*H{VCN4Vgdv3}fHmeVZu%mp}BwK>>
ziuO|v&F2XpouM6T>L4o+S|`_)hPEOmj-4Ly|D2rg<Y<lQP(Eftt^BQxdC0c-aWoQi
z1iYeK0yuTMdb3?k-=r;Naw~VL4g?xv0?$nDY#0UMjKpL+$nK%zx;s}a@M=M}!g!AG
zPb+R#5(=IV=6%_@eOBBukp4Zh{yjak^%01@4S}N#jonIq?{<<`MU*;NyDHNJymD}3
zHGX1fMT!^1Evs%Tt&nb6_)pFTm>U%|^mGoS1#N(X`s<xd2c0P%u7^t*go>1)s2!e2
z{9Iyu%wd}^gWl+Ku72JN(W1o(y0umX$0uMlb`2Rx57Kw)XO}g0W{(as1Om5|g4|aZ
zJ<Xz>Zy?VE?N<lwTXz@+hV{43kJU{qJEnZIA@vn{ybO(*{HP+=2_l=)g*&rw4lKSH
zTf37HkVbZM<4X;p!PUym4C?ur00(J3Xm%Ck&%H4F1LhPSlA}2$Lblt$-l-jGCG+Hr
z?aI81*ewn0z}~hG>gbIC{>_xu(Y)-0ug2py5t-uC%I{+H1vHQ=zd4TLz{%+FTw;fm
zjt<Yxq^SY-=@tQj;F;?K63j_AUL_1KtbYb^M~aT}K4YnO3-61B#;vz4X)Eq|-L;oU
zeF~I3d)vue10+Y(m)ksWnbyd?CU8-JZvAeHz_r80AVO&i5{n<|zE@6X7E}Caku3+D
zYp)xpxFfy(Tx<ODRsZGTz-8+YxdW=U8Syc(JLALrWE<84nh9(y$WM)u<3Gt$`h(^Z
z0&Vr0d~YUrJQI?e#vCNc>RC|^7uD!WZ|tuDX=z&3waRnT%{!2};c#)>{YePzUK+-&
z?|~}vkgp>{C5%2v75Bhm`$38sI@i5uef1U3ULmw8))NXenEnJBU(c8OJnyO=oQZKl
z?N8E(_Uj&w$nM*uh*Fb}cyM{tffBfwfa%!N^;r&yu?tLY(+|W2rU!}bEcQB#Xhv8b
zV0~?Pv3HDQ7uY;U>#1&7eH*V7(DS)KzGSLrjohzzEkBg@j?t&RIE>RF5SPheZNl~@
z6&+*dBC~y8cKhjaNaz0HaucCCJcOz)a;Z0-k_x-Qk1fYlLh(n6?WLJ@V8sH(w!TFj
zv_+-zN18O~z|z4nmDD;fq^yVmd-g5peg;=z1zrL0tEyEG&S(s!`+)4=`&Yk}SyEX?
za3OKU(Q8|AQX(N%&bxX;rF(HeywmGAJ-C&uW=;nb3vY%u?S?BH7Ae2;Y>UZcZ$j1u
zDhcBkgk?8{&({z}MCq<?5<t$N0ISHS$2A#k6b;Ziq^X(>Lb-uGNLq`ETdNX?Pz~^*
zOVR$o|F8e%D3v1j(HRd-6{RU3Qu5e|Z;*KFx^W*Ig72P-TNrq<(TZDkD|}$(t2AW(
z0o(!KwD(%t{+ZSKkb_dLtd%c9E)sI;rwzpPLWE4;1X?jXFXGaXM!)>tF2R)~VIbWn
zySM6dYjSosa&<Sdo4RO$GCZKk0aN+T*eEJvFDYZB=wqy>)wH_Dm0!6M*2vMQ!?`HO
zqCmxYe~^XXdk%R7)XR2=E!K!rf{IVuxTqTU4>U#uKQ#4wz!X&tE~QV&XP<c%8S$oF
zKhD<&$~~&-e#Xd1PAen&JgN)EWq%Gp20Y{9tJy=*QlD-X1*ejB_qyXl^m-ZifhVgv
z<{uVEx~b9X@5gQg@Uz*;Ov}qyZPsPDZA#WIk8df!KKgpQyPqVlM+HwKZYF^8%s(wR
z%SM>}S>=ts)P&guHFXO%<#J;?bd`hT##Fr1=gsAvMT;YR+M@9Em8r9vrJAAb;kM=_
z3rZ^~4EK`Je464W@;~Rbg7D{tAB_Eiv*C134N~cuo8J*GpAHM03%i_>$fGdG|H@5Q
zkrGNDrM&|C!+p$EGY+P?#3sd-J09oY^7-phg<*NMDQb=1Fz<o`q9Wd@FzH7{<g}4J
z>i4~&v7?2E*&d9lZ*wBkOkSZ1gRV803^9j7Q%qX9y+eWn61FN_Rb(q+6!1lLXkj-;
z3!jsQ+_b)nRnO+}x>Hc?h7EP!9OwMb8+AX|xdk`5)9CeUsv76Z`K<lk2hzxUdJHYP
zvSLkMq>_b7dcg@9vyF5<x=f-M$KVchFSpD<+A1c+IFiPbZIkN^`UAaW3s8l{8id4L
zkWfm(3j|_(lDf3#7R37Z>k-|=5?IHh&Y(R3Ou^VJNvx90Mode%`PQ2>Y7=@~#$kVu
zucO(EBC!}|mO~9a<ONMYTiNZ_!|%T*lN$RgzdwW}7X_3pw#CP&6WVKLbTn5VUX_#?
zxhyWwSY10<d?Uf?g$g*FhJASSN#NkqIb_TY4K`(Qx`|jefke8yeK#1`028&XhtuR2
zZW#~V*)uv?uvjJ}T1&t7kF${8a0tu|8hDc7Ag1LVgHtH*z$l0BkI%}aDMsqArW~pX
zu77BiUb*8^2aC-mHWS|9uy1vjPNuog{ps@{4{O~}zjBaOvNjDzPmC;b!$vbeIul4v
z@nXy@0m?%$X@PSYNIJr+;CsX0M6R;WJlM(P>!{<G>N1rYY{^PlNOPsKLKn-%D?D^7
zg>%m6M8XyqJUcc)eQ&_S3Ha~-uJ3J}X7HaQGa#%#BvTgh{iW8lrB=Nn6kEkz-mjW_
zVle8_YLX?plH6;jeYBILmuh1YMhs&X6+&$?`wRTW!KU@OCw$*cWjFfL=EZTr%(J4Q
zA~OSV3jrIS5{WGel{&hfC8{&D#V6Nmdx&qFSc%#*0uu00X=K}sb`VA~2qWF@$qO-|
zx2lQn`h3JZL*duKwLkr{=!)XfP`&gF+BNmJD$K70*@qwGs3^`_eZw5^kyYP#Y1}p!
zTcn{Yeecdwox!8S(j#2cWc)XrGSiU7Vc`iCShmo?JFM`EhMu8N8b`le!7U`LE5yz$
zs_Ag}2!C!e>`=zWbUa|y4_mCz3Vs6-_I@m~C?jr-6=h9mTSa~<pw|dr8j_E{@#Mn;
zv$TVn7#&T$5xfziZ^-R<r_MSks{I<_O(H_kkrZ8L_Swp23iwQclec)cV3tbEvB~e+
zAG5~FqL{!|=WL}Bg^!{AU34<0MQKsM(&xH;vT47e`Quo;num`D;ccUDX}!12WrW9o
zN<Vc<Zay6yY6B{9q|k0fJe~7>gq+Qp$K&5USGQHhuKHO?XynR!i1%@g_`J0ZALVN{
zjl5}l`Bm(G-qA+tKpu}lXU}#C?i<?-MmUyrF$pO7opxb8+46I){i8QDmJg0^_(8Ye
zvd`9TV5?s;+Fqg5#h#MLPC*mMMF;uyMhmjDTzj&bFm&y&X3OW>DiWC2q3e&3OrhU%
znHCny;#q7I&kwG~S@I#TTw|spB4VgexqVAt#H%LAo>)=l7UJ*oZT~9Gf(oI@lY8@Z
z=R^X2rP?y$L>b6^oE!I2nre1af;OEdooyAb3@GVTPBhIceDH5~!f%UXza~f)R!-F6
zHyx0~!+BaF6T;bQ7k*DGNZQ}B>QE-Mxd-E6g{0;}53+5;q9aV(^&`fsZP^t)e0$KI
zzlp^qGxVc!zww);3Y~k)C?R5&;Q8AAl~utvC=pJT<QE|L>|xeDEOtC>!i!&XanQHp
zulwjsda?(1lyHxEUcy6Z$(7uz59-G8@h{a!I)YYRzX&1OhMyEg-0KiFWyH1UZeoR}
zSi?(JMt|SDD>pRAeX<%u7{@GEN;{f3_TOvS=aWH-Jsf4=wqMO5>__w}lel91CO5~o
zn`cDc<ubh6kc^#p`Wjff*S+DRpqQ@<DuZgvCd;gdOj?xaDfW@OzljCBKZ-Zy#+(l$
z5+i2!S?`{U42beiNnO@2RHC^aFjSE~_<_7NOCWTFgA4n}C<@=X+UQZ5o`J@KU*VDU
z3a{tBp+VXH`h%^7`R&ceN&G&Gi>cEy_QUtedAQlGEMyCA(_PmoUP9x{3uZ+I^JN~-
ztRAE_=}CdY)5M+E#o>!6`2?c8#z4&mi)8R4NuMflJS_WCabxH=-J05@H67VJ#>yo9
zm5FQDF?xhi^=}1H0}mG$`8p1uv~3cugqzLDXBb15Iod1rixe!0#0RsJ)f+}d=&B;n
zA@Ydw5c7dkU@s6<O}4|>fmU7vq)oQIw(T?@_N))TXY(+Rpr5E9O*D{`GSH=98|VD`
z1rOx&E)L|~%rUpv6t$>G;_PdU2{owrr8Y*HSMP2_E7TpynXiUZ4yah{Damg9{`)%F
z`iiNyh{wCr#19%4FeZvw@!`*hXp_6;^61`IEVg}mOUKgZIn7Zg!NGk(Pmz8w3S3M!
zVH3XMSZ6aN(!L_AG1mFpME#1}!Qtvr;I9Vq0M4;dV}eiry*yARY%&?2*YvWh*FeL2
z9wyNHTq|eO$BpL(PlM#Ib0}Y|ZXp?`zAzSGBbFR;pXq3wDQKlHAUA6Hm+;pbTrhjn
z(M+W7%)KA?2h@%?32RfsE`Sae)DQurXotNRNw4y?I{gAKRvRXDQ{#k60xdo_Vfbtf
zrt=Q2Ek6G2n1EMB{^3$@!Z&rNwz;PZ>mgiHcfpTTF)8)47UT9gx1O639-1CyciyYS
z*<KHGTu;IKf28&QXsxQ=A5&}fz8~+No|@9)`OBDbX6pr<%H&a%ZzX?El7~t%jefUs
z^nxa5jZLn%0qwX$vSa*a@(^$S&^%Pmu{JgKRI-)07dsKwk!|ahRU}%08R`m?ax&??
z$zp)}e$jg<Z~9At1b+z+v-q?ZZ?>k;a*?l9fo3lK1F^SDtoqWmQo~`p5iZ~&p|c(j
zpk@ZtNlJ;U0O@sp^W~GV5i)Hekqa(!6nOmoxX_!M<rdP13l@aJ7I3YamuKLoa&0U0
zTCtN^vQypWR08dAu7a4`1SnFs_RPd{IU=_<uQdL=@}gDtaNQ4E8hC3+qg^g-EHZ?a
zZv9)OW(&H7Z}VlKwAH)GCYcR(WVJO{qI_g74}FbRuC-NI%F@?<cah@d4g02T`xb^3
z=ddfDuT(Pig6>{hP5O2!DOOLmFSqDK+`Kl<p-}iiQLXt`sLnCv0czJZ$ehDHCyi`1
zAy@YPk_C(<?o<-_E9YXV<e`@KPbEGkAUj|6Cke>27y86aFq6K(H<ah`O1AVgGfv@d
z%4m#&o!9amz0!wr6|GXguZtC3+=Pm^d>b~7iexjG&;^$+;Zah%=D3BR(wv8E%BPly
z6rgw$6pFt5`qQ}t;bwWaBh86o#t|X0@M=ask>Z_bKSM9U5d%dUPL0G~k+)gnuW`82
zAcFO%JfW@Aw_lUZ!C|yY+CEii>UVUDMW<^D4SVrWzQgj%Fqy3^v50$Z)oXh<8t2<c
zUe;j=E3-7M>L21MSo^PllwQoildN4N;vf#kUH<=+cb#EPWm{V#Dxi*KMi3N4DFcKi
zy$IqcO+Y|E3<^lE5k-oCL86W|NKrtF0R$47AT3D8aS#GhBy<LBfDqb99r|4xr)2Ik
z_j|tI_xi)<JSXSuv-jF-?Y7?cMH5!eb}8HaYmYVd(kh~K=U<JwM7(A1c|ao9a?#oZ
z1vm50KEK`jywll-5HY#LW^1!DU{le>M+r(%=TOR9_F|Z2v56q1Eni=p>7B6sgZ^}0
zK{)GBS2T^w`^}x4Kkd(uJM&CO`<6pU_+*z$cQI8$@Th*Bay4k_;iH(j_?oAbYLgK`
z+gPqkjduAw4F&kyae1R?2i~1kf6C^Q=#eIugi)_Y=X1m!9d_dF6gwAMObzbequv2d
z$bui2<33}r8R1Ei8%qr^^V_3B)6d^xT%nM&A7?L^te;UDw=_Pnav_}0X!2C7iHPTw
zS9^KyH*ImJ*!KTcoUmB@I=aE&$KP%m{dV(4#=8P@H~-|Dhhkob{hH|7f8s3h>xs0+
zR7?ofk=Q8_TRH-}m1GUPnEUykh1KZ&KUUH!pOIl2Z=<Smrnxe8+e399xA|Ue-b`pN
zu6C&p@+fiE-Ues8q2jx0`(tWdZY(V9cJyzPOwL#KY;R5*_juUrZD+#YbwET=AeQTO
zgNnaOQ&a;_QbE%f_(NZ-9!g|<lcRs7L5*Hm`1#C{*`KVKEd~0g2;TbR58q3STy5p9
zF)YuleAuN&aozb|LNV^$%V|RKCSyzyshzH>#^Dz9esfS_ZrzuLMnjypvJPcNKBTcO
zVY7bcAS{#y=spNi@)}amq)*vqkZ3yi{B>~3@Q>=!lzHFEP(kTN|D8R){f=U$-{djs
zXk(U{9ZoI*)=~A*%+ZQ)A6+KIZO!V7-}0mv^KQh3g-i3Oyj64)u%%P4)=@pUir);~
zD0>6P5k-Bt!5jS3i497hX9P`8$Yvyu)q006v?j?W)+$zUY?0N<xbJutH}8M_Ue^9o
z{D!vLJ!#RC-8CAsxu0ph4Qcn0|MihXI=O-h+HzgxPhlzo-c>HN<w)h}y7+qpe(}Ug
zA2G6VOlt5lX`hTO_-qk0V9`i!Ady9Hlv&L+$UTygHoZ2vuN>ukqTS@l40$)Kt+ua`
zIJZrMZZunRTe@yBuKQ8=fth62`j1SNffDf^43$}&UXLWW8rV}OYo`mD3zBUb8J>3u
zGH>p1bN-Q?wuEIS|4vx4kM00ZY$@2J*KS{{^pe0ABWq%n@F8qbM3;VlA$kAu(B)`n
zH!W+45glJ;bX0If*CfteHbD60E1HQ!&xQE6oZGLy-=GAmo&!xq{cXm!J@ro7dS*7f
ze&%JaN^xk1T>jv=XnoW49oU&%a&Ds!Mc;HA2v1NjOmfM%#=sPEyx8^BQJ!9*V_hJp
zMGb=eiFJ480h9I}bgt?)!9kAiXL-aXQL^jB)p@V*G1q*_3+i|3rp{~|?e<-Eb-T<Y
zJBv1*ASVYx#_WmCDF;h^)2fJf32Xu}0kpB0MqVk7Uzs|Tj}C@m_Q#4O%3|Nq7gQB`
zC{G-jhV74H7C|&CaC^oP&?0~=u0*k4VF+-$H=^T-8H;N^;lomG$-mh4yELy}yI~Mm
ztMs##O-km#;KoDi^(zW<qzZ6k#<XmFf;m-fU-yG7oe1ucEcZ)8;oJ;^8oF6z77*C4
z+FPqLljXbX+${Zs-z_<-GIDi(%57Czb#T+F_>!|9HsB1U<SQ+-vWw14gN{$(p;sNg
zJmoGQ8lCpQ#sxLdTC8lUOlximw(UGHGdZF*=h3rUnD|3`m+lo<Vq~ir1$HwOiCX?b
z#%VckBf@21iA8Aa6;IM6f5;wl&(LXyeTQpwA(1ATQkqHNIJisKLroShV%u3kv>lHQ
zSM+~f^o*lbUQ)DUnYuci!27BxyQ$l(F>|#%Ld<iZplGSjw`(>~9B+c~ViJac+bG~~
z;|Dp0-D{pxT1(U73$1SSzE>vU6xc^`6&YBb!o+j8cqo<N=R(}9$jhV7+(to@51Y)T
z9ox7!su!&I$IwdzXCxiwify!+4$s2Mgg&2L@Zydv+gCoFD%~qVDrqtyoW5*@xv`*_
zQ92&ILxP6ulutgVF)4V&YYNljb33w3)LBT=_h{GW+D3(eM<m!wOCRpBQ|0G4<ZH;#
z6?nLFAI>z{H98?cU6v1fQl*;~-8gaddgiFWBU)+}sLN_Ssye`+#y)6OJFB_+e*4P#
z@VCceXkM9TdzrUPzx~;v#E=riciBO_oA%^--D1ULWX><GsUzWvA_Z=o0-1BQCBnG<
zqWA{hBc+tY)Fskv-MD;LdfHfReq6fkK)CZEnqd@vF1EXAqmb#y&KUp)RRu!GkzrB#
zHRQY?qyJG(<C9!tyI}2f2uWOJCS2?QPWl7hGqA_$Wvzrn>_#5Bx`WbVhVE7#nWo)g
z3cKs7jq-a8e0X*D<Mo#7w2Z!W8C^6CnAyAFaWvjl*#FWS(GVIpgR)Qwr{bLwcCpcQ
zB)3|nC0wZBhok=e!bcc#yX=I1x$Co{PHd^U`le?3rB}0hp5!G=QqQ}U+0WTV%d*=<
zw%m4lLfVI8S6UTHtmPFMQ02AS64aJyZSKHpWZk}aZfl|Y{LIv@iw@jzq%(T8&nJ-K
zx*5I>I>0qeiRuu!^Cmw`^2*#i0h?p=II;?RKY!3qd1RtLrhdY6#V<kdrR~Z-@naFu
z<)e<(HH2Sk#xCEdld%FZ9vx|WU|i>1fA=;*M5v)7!bkgXE^^Y!ZF~}gv(NkYjzOb_
zd8rz*KuB+OKA}ayAc@DWg^&R2<8rbeAXV=^(RSkEl=Z}nwey_eVm?iqQXc=btZ(o0
zk9(a)J|5=AX(}`sb45t%a4F>WH&G4nOt1d=H{KMAi4i&sEao8PIePaz{mH71ITfGf
z7jbbw%CA^Ip@?tB8WUkMzZ)H_KLKRXc0g32{v>|O(Z6@bfUoY(k4vj%>zvi#IMMi>
z|KM@+i#;)092|^)cnI3CsaGHg^@5{U77k=WM@wHDkSj?C>C2O=a;f4;y3u0Na?bH)
zGaH2-GQWvLt^%?KcQ>Q_J1`*Ne}HEImV$@f@({`?0v*9?U93JVWkaiCshwHWXn+#~
z7ZQ%CZ{?pxFxOdt{ET7;uVUCvgOml%+|M16XHNoBNJS+V1G%(|&jFioUX>f5Zkvd2
zzBEN;t7Kc3m(0KO>bpj>mt5`Pl_*#vU_C(jY|o)@(0Bht{!|@=mfs)J4naUM2e^JX
z%n6y@;HquNFo-O|0=c3F2d5Yr&H?F8vl247LF&CQzWxYx<!VogMJk@*8=33rkywjH
zH*W}uaCa&wy|8Bi*&uIxr6T~tX2}M>qfGbYt1JcG!x~T(fD=OI(&o@-1O!5qn2YTv
z03CS?1IJ!$!hp#p+h&@rk<?xT;E5!ha}J6u0Ax;gNIZap@4<E$JATy=p|&}o`rm8%
zASTZ=*m#(u1rLU$K=Col8Rqb<JqY(1Rptd4v}Q1%tCAOh%dlK&*TS}G6>qvLricxa
zkw=X7%>YgK=H_J{X#QV}jzKJ7U_9TYBZ(A>sPX=o$JeB{mv$QD*0kPTM^peaw72$j
zp9rq%1KX6-&#h4+{yg6JOG1eau(Sd(n%}2kpdc2++kZf<Lc(GL)ctWbw4;QO>;#!L
zL}DSHG~NgemC#M(4Rgqy@cJ<EJMhfPSYH8=XQtqU{!LC(D*#2;gON7aVei3REMQu)
z5+5o6IGY86{g=QVaRv}e?TijfXvY%Rb819R0Z<PJqZ{0L4~Y$M!d>i;12FSg;KMRE
zp?2(!jtJbR4$YsLE{p<1>=AfvX#p`V_ZUN+1!zxEhhyL_rz#GZg#Q8X8w!P~6P~34
zsM`o$tJE;HhmHVL)mM#ZAc}!UT|e25fJvxR@Ez&O>oLcm=C_PbuK|z0Vu;*9Z1aE6
z3}s6L($bT*iZy_;Q(BE%jrs)1cjs07of6SCSl6I5*zDSVKzw8B5#F)Lvyilo$PfLh
z4u824oRU2rrOCGbUtR|tkl5Jz9XK$+Kyl}y-GKE=fT&q$^_GSL5V&p=^tFr?^tJb*
zb^|yt0^eeuByz&9pw1QiyE>Nyfb;TRNVN&9$+Lw0F3$oH@?|S>rV?<b;n|Q_KrOF_
zqS#cA?nW*o0N2-5O--RFBsO-*D&r^wJ@Dd&O&R6z0654;9yFwRgX=TK_9_4X`rsN2
z7H24BU3NxDo_+~vC<j-^;UNeMD|pN*H6DRMS=b9;1*dk~41|y5*kZPNQm>g@C8UFy
zTPj_%<7<d8l5>s&+O>d1^>orjz<5pq#>;Ae3dL%z(8E{Dug*_ywM<;&8K4L*CS*FY
z4o`5L0j6UAH&X!(!r>ra5)vk0WhVf`A+IyLO$Lc4q)DY&lvA}V7kBxBj1YSkVD<CJ
z744^h1Oq)1W0R16Xgy{FjFX!-S?ogt4GXXwCW5cTRH7+7a8hz~1%2qBqYjSP2IHE1
zdD6J>_Zq7Bl9bV)gW@mvWGj$1xgh><9vyBl+Y`WC3lZL^HviQ9|Bp5SqeOUkZBqD$
z?%xSreq8kj1=RiPhyZ3P%*xww<pCP?=Ka<fMt_L{y*a)n9@w?H(=Af6MfKq~W`<BJ
z16>$Qk{XS7!*wfoJ3j|p6u_u)_#jkSpgB4?I7M1-4rV{iRdBl%!(JFAS*JG^L@@|(
z?FvQf232;mBh69NI6KDrMln>(MR4ek#DDX>AD}!ApIaHorC-A}j~_V$(7IQkO&?3o
zY~a{21fceo+fM`G0f>{I<xpO07~+Z2qvQto)0<FTB=(ACW)k?S-1uR0fUbXmqN@FJ
zLCmH3b`|Kd-;-#!RL?rvayttZ#DZ*>k%ljoE$HK6BxXU<Lrl?sQZ>5_y^n-_OnTE{
zRT6*V{a~D@6aaO-#%HwwW88zZ%ATKtXT26r*Fz*%!m}%cw2g=;@Fzd=%nMD5o6~W-
zwH|?aTVrm5qmK_+M1~B{9}W}->V_Z`LQ>T?M__ytxIV6?T?kM7x(aRLNAsEYj<ldO
zNh76d%7%D|Za8Q$NaPRep9p6@iW}+3VXP7U|Lh0iJoMfm!vc|{iRs8(-EvGM=ye2n
z;4XN-*(7m6(njjg^%TK)1}g77;wTHC3l;(f3mrw~7*uPknqeg1ykM2&VBY&I7m{Iu
zVNW(sexMyD38-i@3ZYeJkjx8#kX;Rv6k1l~teiTFV9qck+{r;oB&<OKd?cLroEyX;
zfwQ$}LE(Cekri4-RZT*>>nH3bVKPOl2LHK)Zm?1!6s_s8c3StRu%N(m#7%fwVrmb|
zw^U7a0To9awPX&4cq5+?dY@ki{=x_e-~k0sWHS>+KH+>iGO$<yoX?y)&`Mauh+Nqv
zV3&L=#S(#&#E!+<PY_%Ld_9WmM+QEf6%tVCYeuw?i=cQ!7xMw;9kCi5yj6-~!i%U~
z{2;z0j{oe#;oUv{);>R=JTf@N;-nG+;ZH#Q`}BIl05{FKNByhXATYC^V-LG7bkQ^i
zzdJw*cDQR4uL$66At*f}!1OFMC0&#79a=J;JY`+SGqtC5=}=F$2gZsm);fM}xxDN1
zKxxnfA=@x8qhsRoSOuPGS9J3-jpy|0Vz!A)m#l_PW77o}L~r1!2hpHt&6I7>H!RBv
z(t)t;v;qAa_@KpMVv@~INYvA)y6BgI)-jctWutqa2J^XGwph^A<7P$<znbvhYG*nF
zqvL|b@d0oEG-Z8dhHd@c_=y{ESQ@SLsJoS`x40)9HA|QonX;yi?e?=t*3=fU2Chtq
zVTTiRy#dh3Wba-uhVf<aU|4kKO`-YrrrVvx)3E`&hAa|8!v-F&8-6?)T1>Oi@^jxM
zO6_fiz+OkZ1X2F!1@@~ZBSZ+6c<faq;Z@?XI+gy|^pAU=D7TSKV0vwbsKXtTb{9l*
zHMnwcRZP_ZVx7j(CXKlqvirS|+h`NPg0EeB2yU%KBg;IxHurrk0=^Eu_-O%)*1x~s
zTDxGKx4%x7zkl-uboiy^NZ3XF?FH7=DXQBazn(_}$cvQ|uu}iy*4hQ@YWQ#UMAPND
zYpJj=_}dHLLTDQ#ertv>u&njmgEUIBD_ao3gyjX$Fa!vGGe-vNP4p%SW-gu%n_<>6
z;LR#FG}I2Ung6X6c;PQSJ=R)kp2vV0IJ~cm|NXQ71?I0>-H>Zirre;sJhu-1>1mxj
Kp8do5>;D4rHz)l7

diff --git a/docs/source/dynamo/custom-backends.rst b/docs/source/dynamo/custom-backends.rst
deleted file mode 100644
index 2c8b338045e62..0000000000000
--- a/docs/source/dynamo/custom-backends.rst
+++ /dev/null
@@ -1,154 +0,0 @@
-Custom Backends
-===============
-
-Debugging Backend
------------------
-
-Suppose you wanted to better understand what is going on during a
-compilation you can create a custom compiler which we’ll refer to as a
-backend that will print pretty print the fx ``GraphModule`` extracted
-from dynamo’s bytecode analysis and return a ``forward()`` callable.
-
-.. code-block:: python
-
-   from typing import List
-   import torch
-   import torch._dynamo as dynamo
-   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-       print("my_compiler() called with FX graph:")
-       gm.graph.print_tabular()
-       return gm.forward  # return a python callable
-   @dynamo.optimize(my_compiler)
-   def fn(x, y):
-       a = torch.cos(x)
-       b = torch.sin(y)
-       return a + b
-   fn(torch.randn(10), torch.randn(10))
-
-Running the above example produces the following output:
-
-::
-
-   my_compiler() called with FX graph:
-   opcode         name    target                                                  args        kwargs
-   -------------  ------  ------------------------------------------------------  ----------  --------
-   placeholder    x       x                                                       ()          {}
-   placeholder    y       y                                                       ()          {}
-   call_function  cos     <built-in method cos of type object at 0x7f1a894649a8>  (x,)        {}
-   call_function  sin     <built-in method sin of type object at 0x7f1a894649a8>  (y,)        {}
-   call_function  add     <built-in function add>                                 (cos, sin)  {}
-   output         output  output                                                  ((add,),)   {}
-
-This works for ``torch.nn.Module`` as well as shown below
-
-.. code-block:: python
-
-   import torch
-   import torch._dynamo as dynamo
-   class MockModule(torch.nn.Module):
-       def __init__(self):
-           super().__init__()
-           self.relu = torch.nn.ReLU()
-       def forward(self, x):
-           return self.relu(torch.cos(x))
-   mod = MockModule()
-   optimized_mod = dynamo.optimize(my_compiler)(mod)
-   optimized_mod(torch.randn(10))
-
-Let’s take a look at one more example with control flow.
-
-.. code-block:: python
-
-   from typing import List
-   import torch
-   import torch._dynamo as dynamo
-   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-       print("my_compiler() called with FX graph:")
-       gm.graph.print_tabular()
-       return gm.forward  # return a python callable
-   @dynamo.optimize(my_compiler)
-   def toy_example(a, b):
-       x = a / (torch.abs(a) + 1)
-       if b.sum() < 0:
-           b = b * -1
-       return x * b
-   for _ in range(100):
-       toy_example(torch.randn(10), torch.randn(10))
-
-Running this example produces the following output:
-
-::
-
-   my_compiler() called with FX graph:
-   opcode         name     target                                                  args              kwargs
-   -------------  -------  ------------------------------------------------------  ----------------  --------
-   placeholder    a        a                                                       ()                {}
-   placeholder    b        b                                                       ()                {}
-   call_function  abs_1    <built-in method abs of type object at 0x7f8d259298a0>  (a,)              {}
-   call_function  add      <built-in function add>                                 (abs_1, 1)        {}
-   call_function  truediv  <built-in function truediv>                             (a, add)          {}
-   call_method    sum_1    sum                                                     (b,)              {}
-   call_function  lt       <built-in function lt>                                  (sum_1, 0)        {}
-   output         output   output                                                  ((truediv, lt),)  {}
-
-   my_compiler() called with FX graph:
-   opcode         name    target                   args         kwargs
-   -------------  ------  -----------------------  -----------  --------
-   placeholder    b       b                        ()           {}
-   placeholder    x       x                        ()           {}
-   call_function  mul     <built-in function mul>  (b, -1)      {}
-   call_function  mul_1   <built-in function mul>  (x, mul)     {}
-   output         output  output                   ((mul_1,),)  {}
-
-   my_compiler() called with FX graph:
-   opcode         name    target                   args       kwargs
-   -------------  ------  -----------------------  ---------  --------
-   placeholder    b       b                        ()         {}
-   placeholder    x       x                        ()         {}
-   call_function  mul     <built-in function mul>  (x, b)     {}
-   output         output  output                   ((mul,),)  {}
-
-The order of the last two graphs is nondeterministic depending
-on which one is encountered first by the just-in-time compiler.
-
-Speedy Backend
---------------
-
-Integrating a custom backend that offers superior performance is also
-easy and we’ll integrate a real one
-with `optimize_for_inference <https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html>`__:
-
-.. code-block :: python
-
-   def optimize_for_inference_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-       scripted = torch.jit.trace(gm, example_inputs)
-       return torch.jit.optimize_for_inference(scripted)
-
-And then you should be able to optimize any existing code with
-
-.. code-block:: python
-
-   @dynamo.optimize(optimize_for_inference_compiler)
-   def code_to_accelerate():
-       ...
-
-Composable Backends
--------------------
-
-TorchDynamo includes many backends, which can be found in
-`backends.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py>`__
-or ``torchdynamo.list_backends()``. You can combine these backends
-together with the following code:
-
-.. code-block:: python
-
-   from torch._dynamo.optimizations import BACKENDS
-   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-       trt_compiled = BACKENDS["tensorrt"](gm, example_inputs)
-       if trt_compiled is not None:
-           return trt_compiled
-       # first backend failed, try something else...
-       cudagraphs_compiled = BACKENDS["cudagraphs"](gm, example_inputs)
-       if cudagraphs_compiled is not None:
-           return cudagraphs_compiled
-       return gm.forward
diff --git a/docs/source/dynamo/deep-dive.rst b/docs/source/dynamo/deep-dive.rst
deleted file mode 100644
index c60047c2a3d8d..0000000000000
--- a/docs/source/dynamo/deep-dive.rst
+++ /dev/null
@@ -1,145 +0,0 @@
-TorchDynamo Deeper Dive
-=======================
-**Author**: `Jason Ansel <https://github.com/jansel>`_
-
-What is a guard?
-----------------
-
-TorchDynamo operates just-in-time and specializes graphs based on
-dynamic properties. For example, the first graph above has the following
-guards:
-
-::
-
-   GUARDS:
-    - local 'a' TENSOR_MATCH
-    - local 'b' TENSOR_MATCH
-    - global 'torch' FUNCTION_MATCH
-
-If any of those guards fail, the graph will be recaptured and
-recompiled. The interesting guard type there is ``TENSOR_MATCH``, which
-checks the following torch.Tensor properties:
-
-- Python class of the tensor (tensor subclassing, etc)
-- dtype
-- device
-- requires_grad
-- dispatch_key (with thread-local includes/excludes applied)
-- ndim
-- sizes\* (optional)
-- strides\* (optional)
-
-For sizes/strides you can disable this specialization by setting the
-following parameter:
-
-.. code-block:: python
-
-torch._dynamo.config.dynamic_shapes = True
-
-The full specialization mode allows the backend compiler to assume an
-entirely static graph. Unfortunately, most backends require this.
-Operators which return dynamic shapes will trigger a graph break when
-not in dynamic shape mode.
-
-What is dynamo doing?
----------------------
-
-If you want to understand better what TorchDynamo is doing, you can set:
-
-.. code-block:: python
-
-   torchdynamo.config.debug = True
-
-which triggers useful (but spammy) printouts.
-
-For example, the printouts for the first graph in the ``toy_example``
-above are:
-
-::
-
-   __compiled_fn_0 <eval_with_key>.1
-   opcode         name     target                                                  args              kwargs
-   -------------  -------  ------------------------------------------------------  ----------------  --------
-   placeholder    a        a                                                       ()                {}
-   placeholder    b        b                                                       ()                {}
-   call_function  abs_1    <built-in method abs of type object at 0x7f9ca082f8a0>  (a,)              {}
-   call_function  add      <built-in function add>                                 (abs_1, 1)        {}
-   call_function  truediv  <built-in function truediv>                             (a, add)          {}
-   call_method    sum_1    sum                                                     (b,)              {}
-   call_function  lt       <built-in function lt>                                  (sum_1, 0)        {}
-   output         output   output                                                  ((truediv, lt),)  {}
-
-   ORIGINAL BYTECODE toy_example example.py 9
-    10           0 LOAD_FAST                0 (a)
-                 2 LOAD_GLOBAL              0 (torch)
-                 4 LOAD_METHOD              1 (abs)
-                 6 LOAD_FAST                0 (a)
-                 8 CALL_METHOD              1
-                10 LOAD_CONST               1 (1)
-                12 BINARY_ADD
-                14 BINARY_TRUE_DIVIDE
-                16 STORE_FAST               2 (x)
-
-    11          18 LOAD_FAST                1 (b)
-                20 LOAD_METHOD              2 (sum)
-                22 CALL_METHOD              0
-                24 LOAD_CONST               2 (0)
-                26 COMPARE_OP               0 (<)
-                28 POP_JUMP_IF_FALSE       38
-
-    12          30 LOAD_FAST                1 (b)
-                32 LOAD_CONST               3 (-1)
-                34 BINARY_MULTIPLY
-                36 STORE_FAST               1 (b)
-
-    13     >>   38 LOAD_FAST                2 (x)
-                40 LOAD_FAST                1 (b)
-                42 BINARY_MULTIPLY
-                44 RETURN_VALUE
-
-   MODIFIED BYTECODE
-     9           0 LOAD_GLOBAL              3 (__compiled_fn_0)
-                 2 LOAD_FAST                0 (a)
-                 4 LOAD_FAST                1 (b)
-                 6 CALL_FUNCTION            2
-                 8 UNPACK_SEQUENCE          2
-                10 STORE_FAST               2 (x)
-                12 POP_JUMP_IF_FALSE       24
-                14 LOAD_GLOBAL              4 (__resume_at_30_1)
-                16 LOAD_FAST                1 (b)
-                18 LOAD_FAST                2 (x)
-                20 CALL_FUNCTION            2
-                22 RETURN_VALUE
-           >>   24 LOAD_GLOBAL              5 (__resume_at_38_2)
-                26 LOAD_FAST                1 (b)
-                28 LOAD_FAST                2 (x)
-                30 CALL_FUNCTION            2
-                32 RETURN_VALUE
-
-   GUARDS:
-    - local 'a' TENSOR_MATCH
-    - local 'b' TENSOR_MATCH
-    - global 'torch' FUNCTION_MATCH
-
-At the top you can see the FX graph (which we already shared above).
-Next you see the original bytecode of the function, followed by the
-modified bytecode generated by TorchDynamo. Finally, you see the guards
-which we covered above.
-
-In the modified bytecode ``__compiled_fn_0`` is the return value of
-``my_compiler()`` (the compiled graph). ``__resume_at_30_1`` and
-``__resume_at_38_2`` are both generated continuation functions that pick
-up execution after a graph break (at bytecode offsets 30 and 38). Each
-of these functions take the form:
-
-::
-
-   __resume_at_<offset>:
-       ... restore stack state if needed ...
-       JUMP_ABSOLUTE <offset> into toy_example
-       ... original bytecode of toy_example ...
-
-By generating this `resume_at` function we force the remainder of the
-function to be executed in a new Python frame which recursively
-triggers TorchDynamo to restart its capture once execution reaches that
-point for the first time.
diff --git a/docs/source/dynamo/faq.rst b/docs/source/dynamo/faq.rst
deleted file mode 100644
index decb3e2024de2..0000000000000
--- a/docs/source/dynamo/faq.rst
+++ /dev/null
@@ -1,376 +0,0 @@
-Frequently Asked Questions
-==========================
-
-At a high level, the TorchDynamo stack consists of a graph capture from
-Python code using dynamo and a backend compiler. In this example the
-backend compiler consists of backward graph tracing using AOTAutograd
-and graph lowering using TorchInductor. There are of course many more
-compilers available `here <https://github.com/pytorch/torchdynamo/blob/0b8aaf340dad4777a080ef24bf09623f1aa6f3dd/README.md#existing-backend>`__
-but for this document we will focus on inductor as a motivating example.
-
-Torchdynamo supports training, using AotAutograd to capture backwards:
-
-   1. the ``.forward()`` graph and ``optimizer.step()`` is captured by torchdynamo’s python evalframe frontend
-   2. for each segment of ``.forward()`` that torchdynamo captures, it uses AotAutograd to generate a backward graph segment
-   3. each pair of forward, backward graph are (optionally) min-cut partitioned to save the minimal state between forward/backward
-   4. the forward, backward pairs are wrapped in autograd.function modules 5. usercode calling\ ``.backward()`` still triggers eager’s autograd engine, which runs each ‘compiled backward’ graph as if it were one op, also running any non-compiled eager ops’ .backward() functions
-
-Do you support Distributed code?
---------------------------------
-
-DDP has been tested and works, support for other distributed training
-libraries is under discussion.
-
-The main reason why Distributed code is challenging with dynamo is
-because AOTAutograd unrolls both the forward and backward pass and
-provides 2 graphs for backends to optimize. This is a problem for
-distributed code because we’d like to ideally overlap communication
-operations with computations. Eager pytorch accomplishes this in
-different ways for DDP/FSDP- using autograd hooks, module hooks, and
-modifications/mutations of module states. In a naive application of
-dynamo, hooks that should run directly after an operation during
-backwards may be delayed until after the entire compiled region of
-backwards ops, due to how AOTAutograd compiled functions interact with
-dispatcher hooks.
-
-The basic strategy for optimizing DDP with Dynamo is outlined in
-`distributed.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/distributed.py>`__
-where the main idea will be to graph break on `DDP bucket
-boundaries <https://pytorch.org/docs/stable/notes/ddp.html#internal-design>`__.
-
-When each node in DDP needs to synchronize its weights with the other
-nodes it organizes its gradients and parameters into buckets which
-reduces communication times and allows a node to broadcast a fraction of
-its gradients to other waiting nodes.
-
-Graph breaks in distributed code means you can expect dynamo and its
-backends to optimize the compute overhead of a distributed program but
-not its communication overhead. Graph-breaks may interfere with
-compilation speedups, if the reduced graph-size robs the compiler of
-fusion opportunities. However, there are diminishing returns with
-increasing graph size since most of the current compute optimizations
-are local fusions. So in practice this approach may be sufficient.
-
-Do I still need to export whole graphs?
----------------------------------------
-
-For the vast majority of models you probably don’t and you can use
-``torch._dynamo()`` optimize as is but there are a few situations where
-full graphs are necessary and you can can ensure a full graph by simply
-running ``torch.dynamo(..., nopython=True)`` \* Large scale training
-runs, think $250K+ that require pipeline parallelism and other advanced
-sharding strategies \* Inference optimizers like
-`TensorRT <https://github.com/pytorch/TensorRT>`__ or
-`AITemplate <https://github.com/facebookincubator/AITemplate>`__ that rely
-on fusing much more aggressively than training optimizers \* Mobile training or
-inference.
-
-Future work will include tracing communication operations into graphs,
-coordinating these operations with compute optimizations, and optimizing
-the communciation operations.
-
-Why is my code crashing?
-------------------------
-
-If your code ran just fine without dynamo and started to crash with it
-enabled then the most important first step is figuring out which part of
-the stack your failure occurred in so try running things in the below
-order and only try the next step if the previous step succeeded.
-
-1. ``dynamo.optimize("eager")`` which only runs torchdynamo forward graph
-   capture and then runs the captured graph with PyTorch. If this fails
-   then there’s an issue with TorchDynamo.
-
-2. ``dynamo.optimize("aot_eager")``
-   which runs torchdynamo to capture a forward graph, and then AOTAutograd
-   to trace the backward graph without any additional backend compiler
-   steps. PyTorch eager will then be used to run the forward and backward
-   graphs. If this fails then there’s an issue with AOTAutograd.
-
-3. ``dynamo.optimize("inductor")`` which runs torchdynamo to capture a
-   forward graph, and then AOTAutograd to trace the backward graph with the
-   TorchInductor compiler. If this fails then there’s an issue with TorchInductor
-
-TorchDynamo Errors
-~~~~~~~~~~~~~~~~~~
-
-If the error that is generated occurs with the ``"eager"`` backend, then
-torchdynamo is the most likely source of the error.
-
-To debug these issues we recommend setting
-``torch._dynamo.config.verbose=True`` to get a full stack trace to both
-the error in torchdynamo and the user code. In addition to this flag,
-you can also set the ``log_level`` of torchdynamo through
-``torch._dynamo.config.log_level``. The available levels are the
-following: - ``logging.DEBUG``: Print every instruction that is
-encountered in addition to all below log levels - ``logging.INFO``:
-Print each function that is compiled (original and modified bytecode)
-and the graph that is captured in addition to all below log levels -
-``logging.WARNING`` (default): Print graph breaks in addition to all
-below log levels - ``logging.ERROR``: Print errors only
-
-If a model is sufficiently large, the logs can become overwhelming. If
-an error occurs deep within a model’s python code, it can be useful to
-execute only the frame in which the error occurs to enable easier
-debugging. There are 2 tools available to enable this:
-
-* ``env TORCHDYNAMO_DEBUG_FUNCTION=<desired_function_name>`` will only run TorchDynamo on functions with that name.
-
-* ``env torch._dynamo.config.replay_record_enabled = True``) which dumps an execution record when an error is encountered. This record can then be replayed to run only the frame where an error occurred.
-
-TorchInductor Errors
---------------------
-
-With TorchInductor as the chosen backend, AOTAutograd is used to
-generate the backward graph from the forward graph captured by
-torchdynamo. It’s important to note that errors can occur during this
-tracing and also while TorchInductor lowers the forward and backward
-graphs to GPU code or C++.
-
-A model can often consist of hundreds or thousands of FX nodes, so
-narrowing the exact nodes where this problem occurred can be very
-difficult which is why we highly recommend you use our minifier to
-create tiny reproducible examples of failures you’re seeing. We can
-minify errors that occur either at the AOTAutograd layer or Inductor
-layer which you should try in the following order.
-
-1. ``env TORCHDYNAMO_REPRO_AFTER="aot" python your_model.py``
-2.  ``env TORCHDYNAMO_REPRO_AFTER="dynamo" python your_model.py``
-
-Minifying your error is the quickest path to getting it fixed.
-
-The minifier will actually create a ``repro.py`` for you at the location
-set by ``env TORCHDYNAMO_REPRO_DIR`` so make you have right access to
-that directory. You can then run ``python repro.py`` and confirm that
-you are getting the same error.
-
-.. note::
-   For other compilers such as nvfuser, the process is similar but
-   instead you would leverage ``env TORCHDYNAMO_REPRO_AFTER="dynamo" python your_model.py``.
-
-Why is compilation slow?
-------------------------
-
-Dynamo Compilation
-~~~~~~~~~~~~~~~~~~
-
-TorchDynamo has a builtin stats function for collecting and displaying
-the time spent in each compilation phase. These stats can be accessed by
-calling ``torch._dynamo.utils.compile_times()`` after executing
-``torch._dynamo``. By default, this returns a string representation of
-the compile times spent in each TorchDynamo function by name.
-
-Inductor Compilation
-~~~~~~~~~~~~~~~~~~~~
-
-TorchInductor has a builtin stats and trace function for displaying time
-spent in each compilation phase, output code, output graph visualization
-and IR dump. ``env TORCHINDUCTOR_TRACE=1 python repro.py``. This is a
-debugging tool designed to make it easier to debug/understand the
-internals of TorchInductor with an output that will look something like
-`this <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
-
-Each file in that debug trace can be enabled/disabled via
-``torch._inductor.config.trace.*``. The profile and the diagram are both
-disabled by default since they are expensive to generate. See the
-`example debug directory
-output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
-for more examples.
-
-Excessive Recompilation
-~~~~~~~~~~~~~~~~~~~~~~~
-
-When TorchDynamo compiles a function (or part of one), it makes certain
-assumptions about locals and globals in order to allow compiler
-optimizations, and expresses these assumptions as guards that check
-particular values at runtime. If any of these guards fail, Dynamo will
-recompile that function (or part) up to
-``torch._dynamo.config.cache_size_limit`` times. If your program is
-hitting the cache limit, you will first need to determine which guard is
-failing and what part of your program is triggering it.
-
-The `recompilation profiler <#recompilation-profiler>`__ automates the
-process of setting TorchDynamo’s cache limit to 1 and running your
-program under an observation-only ‘compiler’ that records the causes of
-any guard failures. You should be sure to run your program for at least
-as long (as many iterations) as you were running when you ran into
-trouble, and the profiler will accumulate statistics over this duration.
-
-.. code-block:: python
-
-   prof = dynamo.utils.CompilationProfiler()
-   @dynamo.optimize(prof)
-   def my_model():
-       ...
-   my_model()
-   print(prof.report())
-
-Many of the reasons for graph breaks and excessive recompilation will be
-fixed with upcoming support for `tracing dynamic tensor
-shapes <https://docs.google.com/document/d/1QJB-GOnbv-9PygGlOMXwiO9K6vVNm8sNg_olixJ9koc/edit?usp=sharing>`__,
-more careful choices for guards and better tuned heuristics.
-
-Why are you recompiling in production?
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In some cases, you may not want unexpected compiles after a program has
-warmed up. For example, if you are serving production traffic in a
-latency critical application. For this, TorchDynamo provides an
-alternate mode where prior compiled graphs are used, but no new ones are
-generated:
-
-.. code-block:: python
-
-   frozen_toy_example = dynamo.run(toy_example)
-   frozen_toy_example(torch.randn(10), torch.randn(10))
-
-How are you speeding up my code?
---------------------------------
-
-There are 3 major ways to accelerat PyTorch code:
-
-1. Kernel fusion via vertical fusions which fuse sequential operations to avoid
-   excessive read/writes. For example, fuse 2 subsequent cosines means you
-   can can do 1 read 1 write instead 2 reads 2 writes 2. Horizontal fusion:
-   the simplest example being batching where a single matrix is multiplied
-   with a batch of examples but the more general scenario is a grouped GEMM
-   where a group of matrix multiplications are scheduled together
-
-2. Out of order execution: A general optimization for compilers, by looking ahead
-   at the exact data dependencies within a graph we can decide on the most
-   opportune time to execute a node and which buffers can be reused
-
-3. Automatic work placement: Similar of the out of order execution point,
-   but by matching nodes of a graph to resources like physical hardware or
-   memory we can design an appropriate schedule
-
-The above are general principles for accelerating PyTorch code but
-different backends will each make different tradeoffs on what to
-optimize. For example Inductor first takes care of fusing whatever it
-can and only then generates `Triton <https://openai.com/blog/triton/>`__
-kernels. It can also
-
-Triton in addition offers speedups because of automatic memory
-coalescing, memory management and scheduling within each Streaming
-Multiprocessor and has been designed to handle tiled computations.
-
-However, regardless of the backend you use it’s best to use a benchmark
-and see approach so try out the PyTorch profiler, visually inspect the
-generated kernels and try to see what’s going on for yourself.
-
-Why am I not seeing speedups?
------------------------------
-
-Graph Breaks
-~~~~~~~~~~~~
-
-The main reason you won’t see the speedups you’d like to by using dynamo
-is excessive graph breaks. So what’s a graph break?
-
-Given a program like:
-
-.. code-block:: python
-
-   @dynamo.optimize(...)
-   def some_fun(x):
-       ...
-   some_fun(x)
-   ...
-
-Torchdynamo will attempt to compile all of the torch/tensor operations
-within ``some_fun()`` into a single FX graph, but it may fail to capture
-everything into one graph.
-
-Some graph break reasons are insurmountable to TorchDynamo like calling
-into a C extension other than torch is invisible to torchdynamo, and
-could do arbitrary things without TorchDynamo being able to introduce
-necessary guards to ensure that the compiled program would be safe to reuse.
-
-   To maximize performance, it’s important to have as few graph breaks
-   as possible.
-
-Identifying the cause of a graph break
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-To identify all graph breaks in a program and the associated reasons for
-the breaks, ``torch._dynamo.explain`` can be used. This tool runs
-TorchDynamo on the supplied function and aggregates the graph breaks
-that are encountered. Here is an example usage:
-
-.. code-block:: python
-
-   import torch
-   import torch._dynamo as dynamo
-   def toy_example(a, b):
-       x = a / (torch.abs(a) + 1)
-       print("woo")
-       if b.sum() < 0:
-           b = b * -1
-       return x * b
-   explanation, out_guards, graphs, ops_per_graph = dynamo.explain(toy_example, torch.randn(10), torch.randn(10))
-   print(explanation)
-   """
-   Dynamo produced 3 graphs, with 2 graph break and 6 ops.
-    Break reasons:
-   1. call_function BuiltinVariable(print) [ConstantVariable(str)] {}
-      File "t2.py", line 16, in toy_example
-       print("woo")
-
-   2. generic_jump
-      File "t2.py", line 17, in toy_example
-       if b.sum() < 0:
-    """
-
-To throw an error on the first graph break encountered you can use
-disable python fallback by using ``nopython=True``, this should be
-familiar if you’ve worked with export based compilers.
-
-.. code-block:: python
-
-   @dynamo.optimize(<compiler>, nopython=True)
-   def toy_example(a, b):
-      ...
-
-Why didn’t my code recompile when I changed it?
------------------------------------------------
-
-If you went ahead and enabled dynamic shapes via
-``env TORCHDYNAMO_DYNAMIC_SHAPES=1 python model.py`` then your code
-won’t recompile on shape changes. We’ve added support for dynamic shapes
-which avoids recompilations in the case when shapes vary by less than a
-factor of 2. This is especially useful in scenarios like varying image
-sizes in CV or variable sequence length in NLP. In inference scenarios
-it’s often not possible to know what a batch size will be beforehand
-because you take what you can get from different client apps.
-
-In general, TorchDynamo tries very hard not to recompile things
-unnecessarily so if for example torchdynamo finds 3 graphs and your
-change only modified one graph then only that graph will recompile. So
-another tip to avoid potentially slow compilation times is to warmup a
-model by compiling it once after which subsequent compilations will be
-much faster. Cold start compile times is still a metric we track
-visibly.
-
-Why am I getting incorrect results?
------------------------------------
-
-Accuracy issues can also be minified if you set the environment variable
-``TORCHDYNAMO_REPRO_LEVEL=4``, it operates with a similar git bisect
-model and a full repro might be something like
-``TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4`` the reason
-we need this is downstream compilers will codegen code whether it’s
-Triton code or the C++ backend, the numerics from those downstream
-compilers can be different in subtle ways yet have dramatic impact on
-your training stability. So the accuracy debugger is very useful for us
-to detect bugs in our codegen or with a backend compiler.
-
-Why am I getting OOMs?
-----------------------
-
-Dynamo is still an alpha product so there’s a few sources of OOMs and if
-you’re seeing an OOM try disabling the following configurations in this
-order and then open an issue on Github so we can solve the root problem
-1. If you’re using dynamic shapes try disabling them, we’ve disabled
-them by default: ``env TORCHDYNAMO_DYNAMIC_SHAPES=0 python model.py`` 2.
-CUDA graphs with Triton are enabled by default in inductor but removing
-them may alleviate some OOM issues: ``torch._inductor.config = False``.
diff --git a/docs/source/dynamo/get-started.rst b/docs/source/dynamo/get-started.rst
deleted file mode 100644
index 44434d49e525d..0000000000000
--- a/docs/source/dynamo/get-started.rst
+++ /dev/null
@@ -1,181 +0,0 @@
-Getting Started
-===============
-
-Let’s start with a simple example and make things more complicated step
-by step. Please note that you’re likely to see more significant speedups
-the newer your GPU is.
-
-.. code:: python
-
-   from torch._dynamo import optimize
-   import torch
-   def fn(x, y):
-       a = torch.cos(x).cuda()
-       b = torch.sin(y).cuda()
-       return a + b
-   new_fn = optimize("inductor")(fn)
-   input_tensor = torch.randn(10000).to(device="cuda:0")
-   a = new_fn()
-
-This example will not actually run faster. Its purpose is to demonstrate
-the ``torch.cos()`` and ``torch.sin()`` features which are
-examples of pointwise ops as in they operate element by element on a
-vector. A more famous pointwise op you might actually want to use would
-be something like ``torch.relu()``. Pointwise ops in eager mode are
-suboptimal because each one would need to need to read a tensor from
-memory, make some changes and then write back those changes. The single
-most important optimization that inductor does is fusion. So back to our
-example we can turn 2 reads and 2 writes into 1 read and 1 write which
-is crucial especially for newer GPUs where the bottleneck is memory
-bandwidth (how quickly you can send data to a GPU) instead of compute
-(how quickly your GPU can crunch floating point operations)
-
-Another major optimization that inductor makes available is automatic
-support for CUDA graphs.
-CUDA graphs help eliminate the overhead from launching individual
-kernels from a python program which is especially relevant for newer GPUs.
-
-dynamo supports many different backends but inductor specifically works
-by generating `Triton <https://github.com/openai/triton>`__ kernels and
-we can inspect them by running ``TORCHINDUCTOR_TRACE=1 python trig.py``
-with the actual generated kernel being
-
-.. code:: python
-
-   @pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
-   @triton.jit
-   def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
-       xnumel = 10000
-       xoffset = tl.program_id(0) * XBLOCK
-       xindex = xoffset + tl.reshape(tl.arange(0, XBLOCK), [XBLOCK])
-       xmask = xindex < xnumel
-       x0 = xindex
-       tmp0 = tl.load(in_ptr0 + (x0), xmask)
-       tmp1 = tl.sin(tmp0)
-       tmp2 = tl.sin(tmp1)
-       tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
-
-And you can verify that fusing the two ``sins`` did actually occur
-because the two ``sin`` operations occur within a single Triton kernel
-and the temporary variables are held in registers with very fast access.
-
-You can read up a lot more on Triton’s performance
-`here <https://openai.com/blog/triton/>`__ but the key is it’s in python
-so you can easily understand it even if you haven’t written all that
-many CUDA kernels.
-
-As a next step let’s try a real model like resnet50 from the PyTorch
-hub.
-
-.. code:: python
-
-   import torch
-   import torch._dynamo as dynamo
-   model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
-   opt_model = dynamo.optimize("inductor")(model)
-   model(torch.randn(1,3,64,64))
-
-And that’s not the only available backend, you can run in a REPL
-``dynamo.list_backends()`` to see all the available ones. Try out the
-``aot_cudagraphs`` or ``nvfuser`` next as inspiration.
-
-Let’s do something a bit more interesting now, our community frequently
-uses pretrained models from
-`transformers <https://github.com/huggingface/transformers>`__ or
-`TIMM <https://github.com/rwightman/pytorch-image-models>`__ and one of
-our design goals is for dynamo and inductor to work out of the box with
-any model that people would like to author.
-
-So we’re going to directly download a pretrained model from the
-HuggingFace hub and optimize it:
-
-.. code:: python
-
-   import torch
-   from transformers import BertTokenizer, BertModel
-   import torch._dynamo as dynamo
-   # Copy pasted from here https://huggingface.co/bert-base-uncased
-   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-   model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0")
-   model = dynamo.optimize("inductor")(model) # This is the only line of code that we changed
-   text = "Replace me by any text you'd like."
-   encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0")
-   output = model(**encoded_input)
-
-If you remove the ``to(device="cuda:0")`` from the model and
-encoded_input then triton will generate C++ kernels that will be
-optimized for running on your CPU. You can inspect both Triton or C++
-kernels for BERT, they’re obviously more complex than the trigonometry
-example we had above but you can similarly skim it and understand if you
-understand PyTorch.
-
-Similarly let’s try out a TIMM example
-
-.. code:: python
-
-   import timm
-   import torch._dynamo as dynamo
-   import torch
-   model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
-   opt_model = dynamo.optimize("inductor")(model)
-   opt_model(torch.randn(64,3,7,7))
-
-Our goal with dynamo and inductor was to build the highest coverage ML compiler which should work with any model you throw at it.
-
-Existing Backends
-~~~~~~~~~~~~~~~~~
-
-TorchDynamo has a growing list of backends, which can be found in
-`backends.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py>`__
-or ``torchdynamo.list_backends()`` each of which with its optional dependencies.
-
-Some of the most commonly used backend include:
-
-* **Debugging backends**: \* ``dynamo.optimize("eager")`` - Uses PyTorch
-  to run the extracted GraphModule. This is quite useful in debugging
-  TorchDynamo issues. \* ``dynamo.optimize("aot_eager")`` - Uses
-  AotAutograd with no compiler, i.e, just using PyTorch eager for the
-  AotAutograd’s extracted forward and backward graphs. This is useful for
-  debugging, and unlikely to give speedups.
-
-* **Training & inference backends**: \* ``dynamo.optimize("inductor")`` -
-  Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging
-  codegened Triton kernels `Read
-  more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
-
-  * ``dynamo.optimize("nvfuser")`` - nvFuser with TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
-
-  * ``dynamo.optimize("aot_nvfuser")`` - nvFuser with AotAutograd. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
-
-  * ``dynamo.optimize("aot_cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
-
-* **Inference-only backend**\ s: \* ``dynamo.optimize("ofi")`` - Uses
-  Torchscript optimize_for_inference. `Read
-  more <https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html>`__
-
-  * ``dynamo.optimize("fx2trt")`` - Uses Nvidia TensorRT for inferenc optimizations. `Read more <https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst>`__
-
-  * ``dynamo.optimize("onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__ \* ``dynamo.optimize("ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
-
-Why do you need another way of optimizing PyTorch code?
--------------------------------------------------------
-
-While a number of other code optimization tools exist in the PyTorch
-ecosystem, each of them has its own flow. Here is a few examples of
-existing methods and their limitations:
-
--  ``torch.jit.trace()`` is silently wrong if it cannot trace e.g:
-   during control flow
--  ``torch.jit.script()`` requires modifications to user or library code
-   by adding type annotations and removing non PyTorch code
--  ``torch.fx.symbolic_trace()`` either traces correctly or gives a hard
-   error but it’s limited to traceable code so still can’t handle
-   control flow
--  ``torch._dynamo`` works out of the box and produces partial graphs.
-   It still has the option of producing a single graph with
-   ``nopython=True`` which are needed for `some
-   situations <./documentation/FAQ.md#do-i-still-need-to-export-whole-graphs>`__
-   but allows a smoother transition where partial graphs can be
-   optimized without code modification
-
-.. |image0| image:: ../_static/img/dynamo/TorchDynamo.png
diff --git a/docs/source/dynamo/guards-overview.rst b/docs/source/dynamo/guards-overview.rst
deleted file mode 100644
index 4991a831940a4..0000000000000
--- a/docs/source/dynamo/guards-overview.rst
+++ /dev/null
@@ -1,513 +0,0 @@
-Guards Overview
-===============
-
-From a UX perspective, TorchDynamo is very easy to use. The user invokes
-``torchdynamo.optimize`` as an annotation:
-
-.. code-block:: python
-
-   @torchdynamo.optimize(my_compiler)
-   def fn_foo(bar):
-
-Where a complete example looks like this:
-
-.. code-block:: python
-
-   from typing import List
-   import torch
-   import torchdynamo
-   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-       print("my_compiler() called with FX graph:")
-       gm.graph.print_tabular()
-       return gm.forward  # return a python callable
-   @torchdynamo.optimize(my_compiler)
-   def toy_example(a, b):
-       x = a / (torch.abs(a) + 1)
-       if b.sum() < 0:
-           b = b * -1
-       return x * b
-   for _ in range(100):
-       toy_example(torch.randn(10), torch.randn(10))
-
-This allows TorchDynamo to capture the interpreted Python frames, grab
-any and all relevant information, and speed things up wherever it can.
-The speedup comes from a few places, and can be rather dependent on the
-backend (my_compiler above) provided, but the one speedup we care about
-most for today’s overview is **caching**. Caching itself is not a direct
-speedup, so much as a critical enablement to allow us to prevent
-recompilation. We dig a hole with dynamo, and caching allows us to get
-out. Its a speedup from that perspective, but relatively neutral when
-all things are considered - however, it enables us to hold perf
-neutrality while then enabling backends - the true source of our
-speedups.
-
-With even a pass-through no-op backend provided:
-
-.. code-block:: python
-
-   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
-       return gm.forward
-
-We can see TorchDynamo speeding up Python execution quite a bit, even on
-regular Python, not just PyTorch.
-
-Caching and Guards Overview
----------------------------
-
-TorchDynamo operates through caching transformed (by TorchDynamo) user
-bytecode. When we receive a frame for evaluation, we check if the
-**objects referenced in the frame have changed** in certain ways, and if
-not, we read the previously transformed user bytecode to evaluate it.
-The details of how we do this will be saved for a later writeup.
-Instead, we will focus on how we can identify whether or not the
-**objects referenced in the frame have changed**. This is a critical
-piece of functionality in TorchDynamo, because it drives the entire
-invalidation lifecycle. We refer to this functionality as **guards**.
-
-At a very high level, the vastly oversimplified TLDR flow is this:
-
-1) We receive a python frame
-2) We convert the given frame from (1), passing it through instruction
-   translation
-3) For the objects captured in (2), we create tracking objects that are
-   (a) tracked on an output graph, which is an internal specialization
-   of a torch.fx.Tracer (and the topic of a later writeup), and (b)
-   guards, the topic of this document.
-4) We process the guard objects created in (3), turning them into a
-   generated python function, check_fn, associated with a piece of code.
-5) The check_fn is evaluated whenever we encounter this code a
-   subsequent time - if a check_fn passes and evaluates to True, we know
-   the code in the cache and the code encountered here is the same, and
-   can be safely used. If it fails and evaluates to False, we know the
-   code in the cache is not valid, and can be thrown out in favor of a
-   new entry, through recompilation or a graph break.
-
-Python Frame Evaluation and PEP 523
------------------------------------
-
-The functionality of TorchDynamo is based on 
-`PEP 523 <https://peps.python.org/pep-0523/>`__.
-
-TorchDynamo installs a frame evaluation function on Python, via
-`_PyInterpreterState_SetEvalFrameFunc`. The overview of function
-selection, thread management, and cleanup is out of scope for this
-writeup, but the important part is that TorchDynamo has a hook where
-Python can hand control back to us during evaluation.
-
-The function we have installed is ``convert_frame`` or
-``convert_frame_assert`` in the ``nopython=True`` case, but glossing
-over that nuance for now, let’s take a look at ``convert_frame_assert``,
-as ``convert_frame`` proxies to it anyway.
-
-We can find it on `line 20 of convert_frame.py
-<https://github.com/pytorch/torchdynamo/blob/main/torchdynamo/convert_frame.py#L200>`__,
-with a signature as follows:
-
-.. code-block:: python
-
-   def  convert_frame_assert(compiler_fn: Callable, one_graph=True):
-
-This function wraps the entry point of where Python invokes TorchDynamo
-with a frame, glossing over the nuances of ``wrap_convert_context`` for
-now:
-
-.. code-block:: python
-
-   def  _convert_frame_assert(frame: types.FrameType, cache_size: int):
-
-Here is what this function does:
-
-1) Checks if it has seen this ``code``\ (see: f_code `here
-   <https://docs.python.org/3/library/inspect.html>`__) before and exits
-   early if it did.
-2) Checks if the code is an unsupported case.
-3) Checks if the ``cache_size`` (second arg above) crosses the limit
-   defined in the config, ``cache_size_limit``. If it has, the function
-   drops the frame and logs warnings. This helps to avoid constant
-   recompilation of a frame as it generally means that the frame is hot
-   in an unexpected way and caching it produces needless overhead,
-   as it is likely to get evicted the next time it is encountered.
-4) Passes the frame, alongside a function that creates an
-   ``InstructionTranslator`` through bytecode
-   transformation, via ``transform_code_object``. A few crucial things
-   happen under the hood here:
-
-   1) New code is produced through ``transform_code_object``.
-
-   2) An FX tracer named ``output`` is produced through
-      ``InstructionTranslator``. 
-
-      This can be a bit confusing,
-      as ``InstructionTranslator`` is not an `fx` tracer, but its stored
-      in a variable named tracer, and its output*\ **is**\ *an `fx`tracer.*
-
-   3) The function produces guards and stores them on ``output`` above.
-
-   4) The function produces ``output_instructions`` and stores them on
-      ``output`` above.
-
-   5) The function maps the newly produced transformed code to the initial code it
-      read off the frame. This mapping is worth remembering, we will
-      refer to it much later on below where we cover guard failures.
-
-5) Using the transformed code from 4.1 and the guards from 4.3
-   the function produces a `GuardedCode`.
-
-Now that we have learned about frame evoluation, let’s review
-``InstructionTranslator``, and see how it turns the frame we handed
-it over into TorchDynamo internal types.
-
-InstructionTranslator
----------------------
-
-`InstructionTranslator` does a lot! We won’t cover the details of
-everything it does, but most importantly for this document, it produces
-a mapping of ``symbolic_locals`` which maintains a mapping from the
-frame’s f_locals to TorchDynamo internal Variable objects (more on these
-in a moment. ``symbolic_locals`` is filled via traversing the frame’s
-locals:
-
-.. code-block:: python
-
-   self.symbolic_locals = collections.OrderedDict(
-       (k, VariableBuilder(self, LocalSource(k))(f_locals[k]))
-       for k in vars
-       if k in f_locals
-   )
-
-We will get to how this works later, from a few other examples that lead
-us to understanding ``VariableTracker`` and ``VariableBuilder``. The
-important component here, for us, for now, is the invocation of a call
-into ``VariableBuilder``. ``VariableBuilder``\ ’s call implementation
-proxies into a function called ``_wrap``, which in turn both constructs
-instances of ``VariableTracker`` and calls ``make_guards`` on them. More
-on that later.
-
-This mapping, in turn, is critical as each Variable has associated
-guards, which are then passed to ``self.output``, the instance of
-``OutputGraph``, an fx tracer, mentioned in 4.2 of the section above. If
-you recall, this ``OutputGraph``, stored in a variable called ``output``
-is where our guards are stored before being passed on to become
-``GuardedCode``
-
-How does ``InstructionTranslator`` do this? At the heart of it, there is
-a loop that is pumped, which drives a function ``step``.
-
-``step`` is just that - a single processing step, taking exactly one
-instruction and doing *something* with it. Note: These are real
-instructions processed by TorchDynamo’s ``transform_code_object``, and
-it’s pretty cool. 
-
-.. note:: This section purposly skips the details of
-   `dis.get_instructions <https://docs.python.org/3/library/dis.html>`__,
-   and how we set up the ``Instruction`` class.
-
-For the toy example above, here is a snippet of a what a few
-``Instruction``\'s may look like:
-
-.. code-block:: python
-
-   Instruction(opcode=124, opname='LOAD_FAST', arg=0, argval='b', offset=32, starts_line=8, is_jump_target=True, target=None)
-   Instruction(opcode=100, opname='LOAD_CONST', arg=3, argval=-1, offset=34, starts_line=None, is_jump_target=False, target=None)
-   Instruction(opcode=20, opname='BINARY_MULTIPLY', arg=None, argval=None, offset=36, starts_line=None, is_jump_target=False, target=None)
-
-This is the core functionality of this function. Take a look at the ``opname``,
-and then take a look at this little snippet from inside ``step``;
-
-.. code-block:: python
-
-   if not hasattr(self, inst.opname):
-       unimplemented(f"missing: {inst.opname}")
-   getattr(self, inst.opname)(inst)
-
-As we can see, we check if the current class, the
-``InstructionTranslator`` has a attribute set matching the operator name
-(ex: LOAD_CONST). If it does, we invoke it, passing the whole
-instruction object in. If it does not, we drop the frame as
-unimplemented.
-
-For the LOAD_CONST example, we can see that we do indeed support it,
-with a relatively straightforward definition:
-
-::
-
-   def  LOAD_CONST(self, inst):
-   self.push(ConstantVariable(value=inst.argval))
-
-Passing over, for now, on the other details of ``InstructionTranslator``
-we can see that this function creates a new instance of the class
-``ConstantVariable`` , with a value, in our example case, -1, and then
-pushes it onto the stack.
-
-There are dozens of such methods - see symbolic_convert.py for all of
-them. Generally, we implement as many matching methods to python
-bytecode instructions as possible.
-
-Across both the logic downstream of ``step`` and the logic from invoking
-``VariableBuilder`` - we now have a lot of ``VariableTracker``\ s and of
-course, we’ve spoken about creating guards quiet a bit. Let’s dig into
-what Variables are, and get a little closer to understanding guards.
-
-Variables
----------
-
-A ``ConstantVariable`` is an instance of\ ``VariableTracker``.
-``VariableTracker`` represents a tracked python local or stack value.
-
-When it comes to representing an object inside TorchDynamo, a
-VariableTracker does exactly what it says - it tracks a given variable.
-Its an extremely flexible class, but there are a few points to keep in
-mind:
-
--  It manages the ``guard`` relationship around the underlying object
-   through:
-
-   -  `make_guard`
-   -  `replace_guards`
-   -  `add_guard(s)`
-   -  `propagate` - ``propagate(*vars: List[List["VariableTracker"]])`` -
-      Perhaps the most important of all, in that it combines guards from
-      all the provided VariableTracker instances passed in. It visits
-      the guards and combines the guards from these onto itself.
-
--  It acts as a proxy on behalf of the underlying object, implementing
-   methods for the rest of TorchDynamo to get information about the
-   tracked object:
-
-   -  `call_method`
-   -  `call_function`
-   -  `python_type`
-   -  `as_proxy`
-   -  `is/as_python_proxy`
-
--  It stores the variable ``source`` of type ``Source``, from
-   torchdynamo/source.py. This source type is a relatively self
-   contained class to help us organize and bookeep where the original
-   source came from, and helps provide convenience methods for things
-   like getting the name, and importantly for us, producing guards.
-
-And this class (``VariableTracker``) is built around subclassing,
-somewhere between a full Abstract Base Class and fully fleshed out class
-- it leaves many methods raising NotImplementedError - with reliance on
-subclasses (see: torchdynamo/variables/ for all subclasses) to fulfill
-contracts and custom behaviors.
-
-Knowing what we know now, we can see an example of how an instruction
-from ``dis``, ``BUILD_TUPLE``
-
-   BUILD_TUPLE(count) Creates a tuple consuming count items from the
-   stack, and pushes the resulting tuple onto the stack.
-
-In our case, our signature will be a *little* different due to the way
-we create ``Instruction`` objects, but the gist of it will be the same.
-Instead of passing in ``count``, we pass in an object with a little
-extra bookkeeping, and of course, we deal with turning regular old
-python objects into TorchDynamo notions:
-
-::
-
-   def BUILD_TUPLE(self, inst):
-       items = self.popn(inst.argval)
-       options = VariableTracker.propagate(items)
-       self.push(TupleVariable(items, **options))
-
-What is happening here? 1) We read argval, which in this case, is
-analogous to ``counts`` in the pydoc for the equivalent instruction.
-
-2) We ``popn`` the items, in this case, the signature is
-   ``def  popn(self, n: int) -> List[TensorVariable]:`` this hints at an
-   underlying contract - we are returning ``TensorVariables``. If we
-   take a closer look at sybmolic_convert.py and
-   ``InstructionTranslatorBase``/``InstructionTranslator``\ we see that
-   the only thing pushed onto and popped from our stack are
-   ``VariableTracker``\ s.
-
-3) We call ``VariableTracker.propogate`` (remember it, from above?) This
-   takes the guards from every single item popped off the stack in 2,
-   and recursively traverses it and combines all the guards into
-   ``options``: ``py  return {      "guards": guards,  }``
-
-4) We then make a new instance of a ``VariableTracker``,
-   ``TupleVariable``\ out of the ``items`` and ``options``. This then
-   allows us to install all the appropriate guards from the ``items``
-   that make up the new ``TupleVariable``
-
-Note: You may wonder - where did the first guards come from? Propagation
-is good and all, but don’t we need something created before it can be
-propagated. Yes! Remember that ``VariableBuilder`` above? It calls
-``make_guards`` as it creates ``VariableTracker`` instances, from
-``f_locals``. This in turn calls into the ``source``, to have it create
-guards.
-
-After all this, bytecode translation is done and we are one step closer
-to producing ``GuardedCode``. We now understand how locals become
-``VariableTracker``\ s, how instructions are handled, and where guards
-are called on for creation. Before we can go into seeing how code and
-guards are combined into a GuardedCode object, we need to dig a little
-bit into those ``make_guard`` and ``source.make_guard`` calls above. We
-can then understand, really, what was going on when we made guards
-alongside, and on, ``VariableTracker`` instances.
-
-Making Guards
--------------
-
-Guards are just python objects, of the class ``Guard``, however, theres
-a good amount of detail around this little class.
-
-Looking at the definition of the dataclass (and therefore, ctor
-signature), we see that it has a name, a source, and a create function.
-
-::
-
-   @dataclasses.dataclass
-   class Guard:
-       name: str
-       source: GuardSource
-       create_fn: Callable
-
-The name should be the name of the variable.
-
-The source here is an enum indicating what *kind* of source the guard
-belongs to [Note: not to be confused with ``Source`` and the other types
-in source.py, as stored on ``VariableTracker``, as discussed above]
-
-And create_fn is the heart of how we go from having this simple
-dataclass to actually producing valid python code to be invoked for
-knowing whether or not things have changed in between invocations, and
-whether we can safely read from the code cache or not (In case you
-forgot what all this was for!)
-
-The most common code paths for getting an instance of a guard are
-through ``make_guards`` on ``VariableTracker``.
-``make_guards``->``source.make_guard``->``return Guard(self.name(), self.guard_source(), fn)``
-
-Or, in a concrete example:
-
-.. code-block:: python
-   
-   ...
-   elif istype(value, range):
-       guards = self.make_guards(GuardBuilder.EQUALS_MATCH)
-       return RangeVariable(value=value, guards=guards)
-
-Since ``source`` was set at the construction time of this
-``VariableTracker``, all that was needed here was to provide the fn,
-``GuardBuilder.EQUALS_MATCH`` to the ``create_fn`` field.
-
-This ``create_fn`` must be a method on ``GuardBuilder``. The reason for
-this becomes apparent in our next step. Once we have all the guards
-created for a frame, we move on to ``CheckFunctionManager`` and
-``compile_check_fn``.
-
-Remember that ``convert_frame`` function way above, in the first
-section? Before it can produce a ``GuardedCode``, it needs to run the
-``CheckFunctionManager``, with all the guards, to produce a ``check_fn``
-which will then, in turn get passed in alongside the code into
-``GuardedCode``. This is the same ``check_fn`` that we store in our
-cache entry, and the same one we run to know whether or not to retrieve
-the code stored alongside. For reference, here is that code:
-
-.. code-block:: cpp
-   
-   static CacheEntry *create_cache_entry(CacheEntry *next,
-                                         PyObject *guarded_code) {
-     CacheEntry *e = (CacheEntry *)malloc(sizeof(CacheEntry));
-     DEBUG_NULL_CHECK(e);
-     e->check_fn = PyObject_GetAttrString(guarded_code, "check_fn");
-     NULL_CHECK(e->check_fn);
-     e->code = (PyCodeObject *)PyObject_GetAttrString(guarded_code, "code");
-     NULL_CHECK(e->code);
-     e->next = next;
-     return e;
-   }
-   
-We now know how a ``check_fn`` function is used, and who makes it, and
-what it is composed of, but what we do not yet know is how. How does a
-list of ``Guard`` objects become a function we can run later on?
-
-First, we iterate these guards:
-
-.. code-block:: python
-
-   for guard in sorted(guards or [], key=Guard.sort_key):
-       if not config.guard_nn_modules and guard.is_nn_module():
-           continue
-       guard.create(local_builder, global_builder)
-
-Calling ``guard.create`` runs that ``create_fn`` we set on the ``Guard``
-class above (don’t confuse it with the ``check_fn`` we are working on
-producing, the names are similar, so it can get a little confusing). In
-our example above, our ``create_fn`` is ``GuardBuilder.EQUALS_MATCH``.
-So we are now invoking it, passing in the ``self``, the guard itself,
-in.
-
-The signature is: ``def EQUALS_MATCH(self, guard: Guard):``
-
-And internally to that function, we can use the ``name`` on the guard to
-get back our original object, querying it for data and type information,
-which in turn gets us to the most important bit: appending code.
-
-At its simplest, ``EQUALS_MATCH`` appends just one line of code:
-``self.code.append(f"{ref} == {val!r}")``. Where ``ref`` is the name of
-the variable, and val is the value. It might produce code like this:
-
-.. code-block::
-
-   y == 2
-
-Pretty simple, but if we append a few other kinds of ``GuardBuilder``
-functions on (For a more complex case), and then combine them all with
-``and`` in between each statement (as we do), we might get something
-like this:
-
-.. code-block::
-
-   ___guarded_code.valid and ___check_type_id(y, 94367738391392) and y == 2 and ___check_tensors(x)
-
-Now we’re talking! Let’s see what we have here: 1) A check for
-``.valid`` (we will come back to invalidation later on) 2) A type id
-check 3) A value check 4) A tensor check
-
-This becomes the heart of the code our ``check_fn``, which in turn, as
-you recall, is evaluated the **next** time we encounter this code. It
-will then check:
-
-1) Is this code still valid?
-2) If (1), Does ``y`` still have a type of ``94367738391392``?
-3) If (2), is ``y`` still 2?
-4) If (3), let’s check on if tensor ``x`` changed in some specific ways
-
-If all of these are still true, then we can use the code cached
-alongside this ``check_fn``! Joyous day! [Note: a deeper dive for how
-and where this happens if saved for a later writeup, but reading
-``static PyCodeObject *lookup(CacheEntry *e, PyObject *f_locals) {`` of
-``_eval_frame.c`` is a good place to start for the inquisitive reader
-who has made it thus far].
-
-If not, then, we can move on to recompiling the code anew, and storing
-that in the cache alongside this code, and a whole new ``check_fn``,
-again to be checked on yet another subsequent frame.
-
-There are lots of other such functions on ``GuardBuilder`` which get
-coalesced into, at times massive, strings which then get evaluated as
-python code and stored into ``check_fn``. Our example above is
-illustrative of a simple case, but I urge you to read the other
-functions on ``GuardBuilder``, or better yet, dump the ``code`` variable
-in ``compile_check_fn`` to really see what’s getting produced,
-especially on larger, real models!
-
-Summary
--------
-
-In this, we have glossed over: - The role of ``.valid`` and invalidation
-around weak references (and potentially soon to be NN Module
-invalidations) - How the C++ side of guard functions
-(``___check_type_id``, ``___check_tensors``, etc) operate - What happens
-when guards fail? - What happens if we produce invalid guard code?
-
-Despite all that, I hope this has been a useful read. We covered how
-user provided code, wrapped in a TorchDynamo context goes on to get
-traced and tracked internally, organized into ``VariableTracker``\ s
-``Source``\ s and subsequently ``Guard``\ s, and how those ``Guards`` in
-turn guide cache entry selection and invalidation when handing Python
-code.
diff --git a/docs/source/dynamo/index.rst b/docs/source/dynamo/index.rst
deleted file mode 100644
index d34f6a7d27552..0000000000000
--- a/docs/source/dynamo/index.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-TorchDynamo Documentation
-=========================
-
-**TorchDynamo** is a Python-level JIT compiler designed to make unmodified
-PyTorch programs faster. TorchDynamo hooks into the frame evaluation API
-in CPython (`PEP 523 <https://peps.python.org/pep-0523/>`__) to
-dynamically modify Python bytecode right before it is executed. It
-rewrites Python bytecode in order to extract sequences of PyTorch
-operations into an `FX Graph <https://pytorch.org/docs/stable/fx.html>`__
-which is then just-in-time compiled with a customizable backend.
-It creates this FX Graph through bytecode analysis and is designed to
-mix Python execution with compiled backends to get the best of both
-worlds: usability and performance.
-
-TorchDynamo makes it easy to experiment with different compiler
-backends to make PyTorch code faster with a single line decorator
-``torch._dynamo.optimize()``
-
-.. image:: ../_static/img/dynamo/TorchDynamo.png
-
-For more information about `TorchInductor`, one of the backends
-supported by `TorchDynamo Graph <https://pytorch.org/docs/stable/fx.html>`__
-into `Triton <https://github.com/openai/triton>`__ for GPUs or
-`C++/OpenMP <https://www.openmp.org/>`__ for CPUs. We have a
-`training performance dashboard <https://github.com/pytorch/torchdynamo/issues/681#issuecomment-1233828468>`__
-that provides performance comparison for different training backends. You can read
-more in the `TorchInductor post on PyTorch
-dev-discuss <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__.
-
-.. seealso::
-
-   * `TorchDynamo deep-dive video <https://www.youtube.com/watch?v=egZB5Uxki0I>`__
-   * `dev-discuss topics <https://dev-discuss.pytorch.org/search?q=TorchDynamo%20order%3Alatest>`__
-
-.. toctree::
-   :hidden:
-
-   installation
-   get-started
-   guards-overview
-   custom-backends
-   deep-dive
-   troubleshooting
-   faq
diff --git a/docs/source/dynamo/installation.rst b/docs/source/dynamo/installation.rst
deleted file mode 100644
index 21f010951820a..0000000000000
--- a/docs/source/dynamo/installation.rst
+++ /dev/null
@@ -1,83 +0,0 @@
-Installing TorchDynamo
-======================
-
-This section describes how to install TorchDynamo.
-
-Requirements and Setup
-----------------------
-
-Python 3.8 is recommended. Python 3.7 through 3.10 are supported and
-tested. Make sure to have a development version of Python installed
-locally as well.
-
-TorchDynamo is included in the nightly binaries of PyTorch. You can
-find more information `here <https://pytorch.org/get-started/locally/>`__
-
-Install GPU/CUDA version requirements
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-To use GPU back ends (and in particular Triton), please make sure that
-the CUDA that you have installed locally matches the PyTorch version you
-are running.
-
-The following command installs GPU PyTorch+TorchDynamo along with GPU
-TorchDynamo dependencies (for CUDA 11.7):
-
-.. code-block:: python
- 
-   pip3 install numpy --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
-
-CPU requirements
-~~~~~~~~~~~~~~~~
-
-There are no additional requirements for CPU TorchDynamo. CPU
-TorchDynamo is included in the nightly versions of PyTorch, which, for
-reference, can be installed with the following command:
-
-.. code-block:: shell
-
-   pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-
-
-Install from local source
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Build PyTorch from source:
-https://github.com/pytorch/pytorch#from-source, which has TorchDynamo
-included.
-
-To install GPU TorchDynamo dependencies, run ``make triton`` in the
-PyTorch repo root directory.
-
-Verify Installation
-~~~~~~~~~~~~~~~~~~~
-
-If you built PyTorch from source, then you can run the following
-commands (from the PyTorch repo root directory) that run minimal
-examples to check that TorchDynamo is installed correctly:
-
-.. code:: shell
-
-   cd tools/dynamo
-   python verify_dynamo.py
-
-If you do not have the PyTorch source locally, you can alternatively
-copy the script (``tools/dynamo/verify_dynamo.py``) from the PyTorch
-repo and run it locally.
-
-Docker installation
--------------------
-
-We also provide all the required dependencies in the PyTorch nightly
-binaries which you can download with
-
-.. code-block::
-   
-   docker pull ghcr.io/pytorch/pytorch-nightly
-
-And for ad hoc experiments just make sure that your container has access
-to all your GPUs
-
-.. code-block:: bash
-   
-   docker run --gpus all -it ghcr.io/pytorch/pytorch-nightly:latest /bin/bash
diff --git a/docs/source/dynamo/troubleshooting.rst b/docs/source/dynamo/troubleshooting.rst
deleted file mode 100644
index da73f90269279..0000000000000
--- a/docs/source/dynamo/troubleshooting.rst
+++ /dev/null
@@ -1,665 +0,0 @@
-TorchDynamo Troubleshooting
-===========================
-
-**Author**: `Michael Lazos <https://github.com/mlazos>`_
-
-TorchDynamo is still in active development, and many of the reasons for
-graph breaks and excessive recompilation will be fixed with upcoming
-support for `tracing dynamic tensor
-shapes <https://docs.google.com/document/d/1QJB-GOnbv-9PygGlOMXwiO9K6vVNm8sNg_olixJ9koc/edit?usp=sharing>`__,
-more careful choices for guards and better tuned heuristics.
-
-In the mean time, you may need to diagnose a particular issue and
-determine if it is easy to work around with a change to your model, or
-file an issue for support.
-
-Also, we are actively developing debug tools, profilers, and improving our
-errors/warnings. Please give us feedback if you have an issue with this
-infra, or an idea for an improvement. Below is a table of the available
-tools and their typical usage. For additional help see
-`Diagnosing Runtime Errors <#diagnosing-runtime-errors>`__.
-
-.. list-table:: Title
-   :widths: 25 25 50
-   :header-rows: 1
-
-   * - Tool
-     - Purpose
-     - Usage
-   * - Info logging
-     - View summarized steps of compilation
-     - ``torch._dynamo.config.log_level = logging.INFO``
-   * - Debug logging
-     - View detailed steps of compilation (print every instruction traced)
-     - ``torch._dynamo.config.log_level = logging.DEBUG`` and
-       ``torch._dynamo.config.verbose = True``
-   * - Minifier for any backend
-     - Find smallest subgraph which reproduces errors for any backend
-     - set environment variable ``TORCHDYNAMO_REPRO_AFTER="dynamo"``
-   * - Minifier for ``TorchInductor``
-     - If the error is known to occur after `AOTAutograd`` find
-       smallest subgraph wich reproduces errors during TorchInductor lowering
-     - set environment variable ``TORCHDYNAMO_REPRO_AFTER="aot"``
-   * - Accuracy minifier
-     - Finds the smallest subgraph which reproduces an accuracy issue
-       between an eager model model and optimized model
-     - ``TORCHDYNAMO_REPRO_AFTER=<"aot"/"dynamo"> TORCHDYNAMO_REPRO_LEVEL=4``
-   * - ``torch._dynamo.explain``
-     - Find graph breaks and display reasoning for them
-     - ``torch._dynamo.explain(fn, *inputs)``
-   * - Record/Replay
-     - Record and replay frames which to reproduce errors during graph capture
-     - ``torch._dynamo.config.replay_record_enabled = True``
-   * - TorchDynamo function name filtering
-     - Only compile functions with the given name to reduce noise when
-       debugging an issue
-     - set environment variable ``TORCHDYNAMO_DEBUG_FUNCTION=<name>``
-   * - TorchInductor Debug logging
-     - Print general TorchInductor debug info and generated Triton/C++ code
-     - ``torch._inductor.config.debug = True``
-   * - TorchInductor Tracing
-     - Show time taken in each TorchInductor stage + output code and graph
-       visualization
-     - set the environment variable TORCHINDUCTOR_TRACE=1 or
-       ``torch._inductor.config.trace.enabled = True``
-
-Diagnosing Runtime Errors
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Below is the TorchDynamo compiler stack.
-
-At a high level, the TorchDynamo stack consists of a graph capture from
-Python code (TorchDynamo) and a backend compiler. In this example the
-backend compiler consists of backward graph tracing (AOTAutograd) and
-graph lowering (TorchInductor)*. Errors can occur in any component of
-the stack and will provide full stack traces.
-
-You may use info logging
-(``torch._dynamo.config.log_level = logging.INFO``) and look for
-``Step #: ...`` outputs in order to determine in which component the
-error occurred in. Logs are made at the beginning and end of each step,
-so the step that an error should correspond to is the most recent logged
-step whose end has not yet been logged. The steps correspond to the
-following parts of the stack (according to the image above):
-
-==== ================
-Step Component
-==== ================
-1    TorchDynamo
-2    Compiler Backend
-3    TorchInductor
-==== ================
-
-The beginning and end of AOTAutograd is currently not logged, but we
-plan to add it soon.
-
-If info logging is insufficient, then there are also some backend
-options which can enable you to determine which component is causing the
-error if you’re unable to understand the error message that is
-generated. These are the following:
-
--  ``"eager"``: only runs torchdynamo forward graph capture and then
-   runs the captured graph with PyTorch. This provides an indication as
-   to whether TorchDynamo is raising the error.
-
--  ``"aot_eager"``: runs torchdynamo to capture a forward graph, and
-   then AOTAutograd to trace the backward graph without any additional
-   backend compiler steps. PyTorch eager will then be used to run the
-   forward and backward graphs. This is useful to narrow down the issue
-   to AOTAutograd.
-
-The general procedure to narrow down an issue is the following: 1. Run
-your program with the ``"eager"`` backend. If the error no longer
-occurs, the issue is in the backend compiler that is being used (if
-using TorchInductor, proceed to step 2, if not, see `this
-section <#minifying-backend-compiler-errors>`__). If the error still
-occurs with the ``"eager"`` backend, it is an `error while running
-torchdynamo <#torchdynamo-errors>`__.
-
-2. This step is only necessary if TorchInductor is used as the backend
-   compiler. Run the model with the ``"aot_eager"`` backend. If this
-   backend raises an error then the error is occurring during
-   AOTAutograd tracing. If the error no longer occurs with this backend,
-   then `the error is in
-   TorchInductor\* <#minifying-torchinductor-errors>`__.
-
-Each of these cases are analyzed in the following sections.
-
-\*Note on TorchInductor naming: The TorchInductor backend consists of
-both AOTAutograd tracing and the TorchInductor compiler itself. We will
-disambiguate by referring to TorchInductor as the backend, and
-TorchInductor lowering as the phase which lowers the graph traced by
-AOTAutograd.
-
-Torchdynamo Errors
-------------------
-
-If the error that is generated occurs with the ``"eager"`` backend, then
-torchdynamo is the most likely source of the error. Here is example code
-which will generate an error.
-
-.. code:: py
-
-   import torch
-
-   import torch._dynamo as dynamo
-
-
-   @dynamo.optimize("eager")
-   def test_assertion_error():
-       y = torch.ones(200, 200)
-       z = {y: 5}
-       return z
-
-
-   test_assertion_error()
-
-Which will generate the following error:
-
-::
-
-   torch._dynamo.convert_frame: [ERROR] WON'T CONVERT test_assertion_error /scratch/mlazos/torchdynamo/../test/errors.py line 26 
-   due to: 
-   Traceback (most recent call last):
-     File "/scratch/mlazos/torchdynamo/torchdynamo/symbolic_convert.py", line 837, in BUILD_MAP
-       assert isinstance(k, ConstantVariable) or (
-   AssertionError
-
-   from user code:
-      File "/scratch/mlazos/torchdynamo/../test/errors.py", line 34, in test_assertion_error
-       z = {y: 5}
-
-   Set torch._dynamo.config.verbose=True for more information
-   ==========
-
-As the message suggests you can set
-``torch._dynamo.config.verbose=True`` to get a full stack trace to both
-the error in torchdynamo and the user code. In addition to this flag,
-you can also set the ``log_level`` of torchdynamo through
-``torch._dynamo.config.log_level``. The available levels are the
-following: - ``logging.DEBUG``: Print every instruction that is
-encountered in addition to all below log levels - ``logging.INFO``:
-Print each function that is compiled (original and modified bytecode)
-and the graph that is captured in addition to all below log levels -
-``logging.WARNING`` (default): Print graph breaks in addition to all
-below log levels - ``logging.ERROR``: Print errors only
-
-If a model is sufficiently large, the logs can become overwhelming. If
-an error occurs deep within a model’s python code, it can be useful to
-execute only the frame in which the error occurs to enable easier
-debugging. There are two tools available to enable this: - Setting the
-environment variable TORCHDYNAMO_DEBUG_FUNCTION to the desired function
-name will only run torchdynamo on functions with that name. - There is a
-record/replay tool (set
-``torch._dynamo.config.replay_record_enabled = True``) which dumps an
-execution record when an error is encountered. This record can then be
-replayed to run only the frame where an error occurred.
-
-TorchInductor Errors
---------------------
-
-If the error doesn’t occur with the ``"eager"`` backend, then the
-backend compiler is the source of the error (`example
-error <https://gist.github.com/mlazos/2f13681e3cc6c43b3911f336327032de%5D>`__).
-There are `different
-choices <https://github.com/pytorch/torchdynamo/blob/0b8aaf340dad4777a080ef24bf09623f1aa6f3dd/README.md#existing-backends>`__
-for backend compilers for torchdynamo, with TorchInductor or nvfuser
-fitting the needs of most users. This section focuses on TorchInductor
-as the motivating example, but some tools will be usable with other
-backend compilers.
-
-Below is the portion of the stack which we are focusing on:
-
-With TorchInductor as the chosen backend, AOTAutograd is used to
-generate the backward graph from the forward graph captured by
-torchdynamo. It’s important to note that errors can occur during this
-tracing and also while TorchInductor lowers the forward and backward
-graphs to GPU code or C++. A model can often consist of hundreds or
-thousands of FX nodes, so narrowing the exact nodes where this problem
-occurred can be very difficult. Fortunately, there are tools availabe to
-automatically minify these input graphs to the nodes which are causing
-the issue. The first step is to determine whether the error occurs
-during tracing of the backward graph with AOTAutograd or during
-TorchInductor lowering. As mentioned above in step 2, the
-``"aot_eager"`` backend can be used to run only AOTAutograd in isolation
-without lowering. If the error still occurs with this backend, this
-indicates that the error is occurring during AOTAutograd tracing.
-
-Here’s an example:
-
-.. code:: py
-
-   import torch
-
-   import torch._dynamo as dynamo
-
-   model = torch.nn.Sequential(*[torch.nn.Linear(200, 200) for _ in range(5)])
-   @dynamo.optimize("inductor")
-   def test_backend_error():
-
-       y = torch.ones(200, 200)
-       x = torch.ones(200, 200)
-       z = x + y
-       a = torch.ops.aten._foobar(z)  # dummy function which errors
-       return model(a)
-
-
-   test_backend_error()
-
-Running this should give you this error (with a longer stack trace below
-it)
-
-::
-
-   Traceback (most recent call last):
-     File "/scratch/mlazos/torchdynamo/torchinductor/graph.py", line 246, in call_function
-       return lowerings[target](*args, **kwargs)
-     File "/scratch/mlazos/torchdynamo/torchinductor/lowering.py", line 185, in wrapped
-       return decomp_fn(*args, **kwargs)
-     File "/scratch/mlazos/torchdynamo/torchinductor/lowering.py", line 810, in _foobar
-       assert False
-   AssertionError
-   ... 
-
-`error with full stack
-trace <https://gist.github.com/mlazos/d6947854aa56d686800259a164c62100>`__
-
-If you then change ``@dynamo.optimize("inductor")`` to
-``@dynamo.optimize("aot_eager")``, it will run without error, because
-`the
-issue <https://github.com/pytorch/torchdynamo/blob/d09e50fbee388d466b5252a63045643166006f77/torchinductor/lowering.py#:~:text=%23%20This%20shouldn%27t%20be,assert%20False>`__
-is in the TorchInductor lowering process, not in AOTAutograd.
-
-Minifying TorchInductor Errors
-------------------------------
-
-From here, let’s run the minifier to get a minimal repro. Setting the
-environment variable TORCHDYNAMO_REPRO_AFTER=“aot” (or setting
-``torch._dynamo.config.repro_after="aot"`` directly) will generate a
-python program which reduces the graph produced by AOTAutograd to the
-smallest subgraph which reproduces the error. (See below for an example
-where we minify the graph produced by torchdynamo) Running the program
-with this environment variable should show nearly `identical
-output <https://gist.github.com/mlazos/0458ab828aa403c779fe73c012aa5982>`__,
-with an additional line indicating where ``minifier_launcher.py`` has
-been written to. The output directory is configurable by setting
-``torch._dynamo.config.base_dir`` to a valid directory name. The final
-step is to run the minifier and check that it runs successfully. A
-successful run looks like
-`this <https://gist.github.com/mlazos/e6ea41ccce68a7b1b8a7a09acb1b206a>`__.
-If the minifier runs successfully, it generates runnable python code
-which reproduces the exact error. For our example this is the following
-code:
-
-.. code:: py
-
-   import torch
-   from torch import tensor, device
-   import torch.fx as fx
-   from torch._dynamo.testing import rand_strided
-   from math import inf
-   from torch.fx.experimental.proxy_tensor import make_fx
-
-   # torch version: 1.13.0a0+gitfddfc44
-   # torch cuda version: 11.6
-   # torch git version: fddfc4488afb207971c54ad4bf58130fdc8a4dc5
-
-
-   # CUDA Info: 
-   # nvcc: NVIDIA (R) Cuda compiler driver 
-   # Copyright (c) 2005-2022 NVIDIA Corporation 
-   # Built on Thu_Feb_10_18:23:41_PST_2022 
-   # Cuda compilation tools, release 11.6, V11.6.112 
-   # Build cuda_11.6.r11.6/compiler.30978841_0 
-
-   # GPU Hardware Info: 
-   # NVIDIA A100-SXM4-40GB : 8 
-
-
-   from torch.nn import *
-   class Repro(torch.nn.Module):
-       def __init__(self):
-           super().__init__()
-
-
-
-       def forward(self, add):
-           _foobar = torch.ops.aten._foobar.default(add);  add = None
-           return (_foobar,)
-
-   args = [((200, 200), (200, 1), torch.float32, 'cpu')]
-   args = [rand_strided(shape, stride, dtype, device) for shape, stride, dtype, device in args]
-   mod = make_fx(Repro())(*args)
-   from torch._inductor.compile_fx import compile_fx_inner
-
-   compiled = compile_fx_inner(mod, args)
-   compiled(*args)
-
-The ``forward`` method of the ``Repro`` module contains the exact op
-which causes the issue. When filing an issue, please include any
-minified repros to aid in debugging.
-
-Minifying Backend Compiler Errors
----------------------------------
-
-With backend compilers other than TorchInductor the process for finding
-the subgraph causing the error is nearly identical to the procedure in
-`errors in TorchInductor <#torchinductor-errors>`__ with one important
-caveat. Namely, that the minifier will now be run on the graph that is
-traced by TorchDynamo, not the output graph of AOTAutograd. Let’s walk
-through an example.
-
-.. code:: py
-
-   import torch
-
-   import torch._dynamo as dynamo
-
-   model = torch.nn.Sequential(*[torch.nn.Linear(200, 200) for _ in range(5)])
-   # toy compiler which fails if graph contains relu
-   def toy_compiler(gm: torch.fx.GraphModule, _):
-       for node in gm.graph.nodes:
-           if node.target == torch.relu:
-               assert False
-
-       return gm
-
-
-   @dynamo.optimize(toy_compiler)
-   def test_backend_error():
-       y = torch.ones(200, 200)
-       x = torch.ones(200, 200)
-       z = x + y
-       a = torch.relu(z)
-       return model(a)
-
-
-   test_backend_error()
-
-In order to run the code after TorchDynamo has traced the forward graph,
-the TORCHDYNAMO_REPRO_AFTER enviornment variable can be used. Running
-this program with TORCHDYNAMO_REPRO_AFTER=“dynamo” (or
-``torch._dynamo.config.repro_after="dynamo"``) should produce `this
-output <https://gist.github.com/mlazos/244e3d5b53667e44078e194762c0c92b>`__\ and
-the following code in ``{torch._dynamo.config.base_dir}/repro.py``.
-Note: the other option for TORCHDYNAMO_REPRO_AFTER are ``"aot"``, which
-will run the minifier after the backward graph has been generated.
-
-.. code:: py
-
-   import torch
-   import torch._dynamo as dynamo
-   from torch import tensor, device
-   import torch.fx as fx
-   from torch._dynamo.testing import rand_strided
-   from math import inf
-   from torch._dynamo.debug_utils import run_fwd_maybe_bwd
-
-
-   from torch.nn import *
-   class Repro(torch.nn.Module):
-       def __init__(self):
-           super().__init__()
-
-
-
-       def forward(self, add):
-           relu = torch.relu(add);  add = None
-           return (relu,)
-
-
-   mod = Repro().cuda()
-   opt_mod = dynamo.optimize("None")(mod)
-
-
-   args = [((200, 200), (200, 1), torch.float32, 'cpu', False)]
-   args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
-
-
-   with torch.cuda.amp.autocast(enabled=False):
-       ref = run_fwd_maybe_bwd(mod, args)
-       res = run_fwd_maybe_bwd(opt_mod, args)
-
-The minifier successfully reduced the graph to the op that raises the
-error in ``toy_compiler``. The other difference from the procedure in
-`TorhInductor Errors <#torchinductor-errors>`__ is that the minifier is
-automatically run after encountering a backend compiler error. After a
-successful run, the minifier writes ``repro.py`` to
-``torch._dynamo.config.base_dir``.
-
-Performance Profiling
-~~~~~~~~~~~~~~~~~~~~~
-
-Accessing TorchDynamo Profiler
-------------------------------
-
-TorchDynamo has a builtin stats function for collecting and displaying
-the time spent in each compilation phase. These stats can be accessed by
-calling ``torch._dynamo.utils.compile_times()`` after executing
-Torch._Dynamo. By default, this returns a string representation of the
-compile times spent in each TorchDynamo function by name.
-
-TorchInductor Debug Tracing
----------------------------
-
-TorchInductor has a builtin stats and trace function for displaying time
-spent in each compilation phase, output code, output graph visualization
-and IR dump. This is a debugging tool designed to make it easier to
-debug/understand the internals of TorchInductor.
-
-Setting the environment variable ``TORCHINDUCTOR_TRACE=1`` will cause a
-debug trace directory to be created and printed:
-
-::
-
-   $ env TORCHINDUCTOR_TRACE=1 python repro.py
-   torch._inductor.debug: [WARNING] model_forward_0 debug trace: /tmp/torchinductor_jansel/rh/crhwqgmbqtchqt3v3wdeeszjb352m4vbjbvdovaaeqpzi7tdjxqr.debug
-
-Here is an `example debug directory
-output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
-for the test program:
-
-::
-
-   torch.nn.Sequential(
-           torch.nn.Linear(10, 10),
-           torch.nn.LayerNorm(10),
-           torch.nn.ReLU(),
-       )
-
-Note each file in that debug trace can be enabled/disabled via
-``torch._inductor.config.trace.*``. The profile and the diagram are both
-disabled by default since they are expensive to generate.
-
-A single node in this new debug format looks like:
-
-::
-
-   buf1: SchedulerNode(ComputedBuffer)
-   buf1.writes = 
-       {   MemoryDep(name='buf1', index=0, size=()),
-           MemoryDep(name='buf1', index=0, size=(s0,))}
-   buf1.unmet_dependencies = {MemoryDep(name='buf0', index=c0, size=(s0,))}
-   buf1.met_dependencies = {MemoryDep(name='primals_2', index=c0, size=(s0,))}
-   buf1.group.device = cuda:0
-   buf1.group.iteration = (1, s0)
-   buf1.sizes = ([], [s0])
-   class buf1_loop_body:
-       var_ranges = {z0: s0}
-       index0 = z0
-       index1 = 0
-       def body(self, ops):
-           get_index = self.get_index('index0')
-           load = ops.load('buf0', get_index, False)
-           get_index_1 = self.get_index('index0')
-           load_1 = ops.load('primals_2', get_index_1, False)
-           add = ops.add(load, load_1)
-           get_index_2 = self.get_index('index1')
-           reduction = ops.reduction('buf1', torch.float32, torch.float32, 'sum', get_index_2, add)
-           return reduction
-
-See the `example debug directory
-output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
-for more examples.
-
-Memory Profiling
-----------------
-
-TBD
-
-Graph Breaks
-------------
-
-Given a program like this:
-
-.. code-block:: python
-   
-   @dynamo.optimize(...)
-   def some_fun(x):
-       ...
-   some_fun(x)
-   ...
-
-TorchDynamo will attempt to compile all of the torch/tensor operations
-within some_fun into a single FX graph, but it may fail to capture
-everything into one graph.
-
-Some graph break reasons are insurmountable to TorchDynamo, and can’t be
-easily fixed. - calling into a C extension other than torch is invisible
-to torchdynamo, and could do arbitrary things without TorchDynamo being
-able to introduce necessary `guards <./GuardsOverviewPt1.md>`__ to
-ensure that the compiled program would be safe to reuse. Graph breaks
-can hinder performance if the resulting fragments are small. To maximize
-performance, it’s important to have as few graph breaks as possible.
-
-Identifying the cause of a graph break
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-To identify all graph breaks in a program and the associated reasons for
-the breaks, ``torch._dynamo.explain`` can be used. This tool runs
-TorchDynamo on the supplied function and aggregates the graph breaks
-that are encountered. Here is an example usage:
-
-.. code-block:: python
-
-   import torch
-   import torch._dynamo as dynamo
-   def toy_example(a, b):
-       x = a / (torch.abs(a) + 1)
-       print("woo")
-       if b.sum() < 0:
-           b = b * -1
-       return x * b
-   explanation, out_guards, graphs, ops_per_graph = dynamo.explain(toy_example, torch.randn(10), torch.randn(10))
-   print(explanation)
-   """
-   Dynamo produced 3 graphs, with 2 graph break and 6 ops. 
-    Break reasons: 
-   1. call_function BuiltinVariable(print) [ConstantVariable(str)] {} 
-      File "t2.py", line 16, in toy_example
-       print("woo")
-    
-   2. generic_jump 
-      File "t2.py", line 17, in toy_example
-       if b.sum() < 0:
-    """
-
-Note on other outputs: - ``out_guards`` - a list of lists where each
-sublist contains the guards that must pass to ensure the traced graphs
-are valid - ``graphs`` - a list of graph modules which were successfully
-traced - ``ops_per_graph`` - a list of lists where each sublist contains
-the ops thatare run in the graph
-
-To throw an error on the first graph break encountered, ``nopython``
-mode can be used. This disables TorchDynamo’s python fallback, and only
-succeeds if the entire program is convertible to a single graph. Example
-usage:
-
-.. code-block:: python
-
-   @dynamo.optimize(<compiler>, nopython=True)
-   def toy_example(a, b):
-      ...
-
-Excessive Recompilation
------------------------
-
-When TorchDynamo compiles a function (or part of one), it makes certain
-assumptions about locals and globals in order to allow compiler
-optimizations, and expresses these assumptions as guards that check
-particular values at runtime. If any of these guards fail, Dynamo will
-recompile that function (or part) up to
-``torch._dynamo.config.cache_size_limit`` times. If your program is
-hitting the cache limit, you will first need to determine which guard is
-failing and what part of your program is triggering it.
-
-The `recompilation profiler <#recompilation-profiler>`__ automates the
-process of setting TorchDynamo’s cache limit to 1 and running your
-program under an observation-only ‘compiler’ that records the causes of
-any guard failures. You should be sure to run your program for at least
-as long (as many iterations) as you were running when you ran into
-trouble, and the profiler will accumulate statistics over this duration.
-
-If your program exhibits a bounded amount of dynamism, you may be able
-to tune the TorchDynamo cache limit to allow for each variation to be
-compiled and cached, but if the cache limit is too high you may find the
-cost of recompilation outweighs any optimization benefits.
-
-::
-
-   torch._dynamo.config.cache_size_limit = <your desired cache limit>
-
-Torchdynamo plans to support many common cases of dynamic tensor shapes,
-such as varying batch size or sequence length. It does not plan to
-support rank-dynamism. In the mean time, setting a specific cache limit
-can be used in coordination with bucketing techniques to achieve an
-acceptable number of recompilations for some dynamic models.
-
-.. code-block:: python
-
-   prof = dynamo.utils.CompilationProfiler()
-   @dynamo.optimize(prof)
-   def my_model():
-       ...
-   my_model()
-   print(prof.report())
-
-Accuracy Debugging
-~~~~~~~~~~~~~~~~~~
-
-Accuracy issues can also be minified if you set the environment variable
-``TORCHDYNAMO_REPRO_LEVEL=4``, it operates with a similar git bisect
-model and a full repro might be something like
-``TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4`` the reason
-we need this is downstream compilers will codegen code whether it’s
-Triton code or the C++ backend, the numerics from those downstream
-compilers can be different in subtle ways yet have dramatic impact on
-your training stability. So the accuracy debugger is very useful for us
-to detect bugs in our codegen or with a backend compiler.
-
-File an Issue
-~~~~~~~~~~~~~
-
-You should feel encouraged to `file a github
-issue <https://github.com/pytorch/torchdynamo/issues>`__ and expect a
-timely response.
-
-Before filing an issue, read over the `README <../README.md>`__,
-`TROUBLESHOOTING <./TROUBLESHOOTING.md>`__, and search for similar
-issues.
-
-When filing an issue, please include - your
-OS/python/pytorch/CUDA/triton info by running:
-
-.. code-block:: sh
-
-   python tools/verify_install.py
-
--  A minimal repro script if possible, which can be generated by running
-   Minifier
--  A description of the error
--  the expected behavior
--  A log (set ``torch._dynamo.config.log_file`` to a valid file name to
-   dump the logs to a file and
-   ``torch._dynamo.config.log_level = logging.DEBUG`` and
-   ``torch._dynamo.config.verbose = True``)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e43160f668fc7..e4b6a124d6bdc 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -42,13 +42,6 @@ Features described in this documentation are classified by release status:
 
    notes/*
 
-.. toctree::
-   :glob:
-   :maxdepth: 1
-   :caption: torch.compile
-
-   dynamo/*
-
 .. toctree::
    :maxdepth: 1
    :caption: Language Bindings

From 7340f98a3557abb58d39b3afd86c8ef6cc59952e Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 28 Nov 2022 21:15:21 +0000
Subject: [PATCH 1338/1922] Fix comparison of batched_prop vs unbatched_prob in
 test_distributions (#87977)

When using SciPy >= 1.7 wishart_log_prob runs into singular samples which means there are `inf`s in `batched_prop` and `unbatched_prop`.
The difference of 2 `inf`s is `nan` which will fail the `equal(0` check.
However passing the tensors directly to `assertEqual` is not only supported but the correct way as it will handle `inf` values etc.

Change the same code in 2 more tests:
  - test_multivariate_normal_log_prob
  - test_lowrank_multivariate_normal_log_prob
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87977
Approved by: https://github.com/soulitzer
---
 test/distributions/test_distributions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py
index 127018516e123..219eacf4790b0 100644
--- a/test/distributions/test_distributions.py
+++ b/test/distributions/test_distributions.py
@@ -2036,7 +2036,7 @@ def test_lowrank_multivariate_normal_log_prob(self):
         unbatched_prob = torch.stack([dist_unbatched[i].log_prob(x[:, i]) for i in range(5)]).t()
 
         self.assertEqual(batched_prob.shape, unbatched_prob.shape)
-        self.assertEqual(0.0, (batched_prob - unbatched_prob).abs().max(), atol=1e-3, rtol=0)
+        self.assertEqual(batched_prob, unbatched_prob, atol=1e-3, rtol=0)
 
     @unittest.skipIf(not TEST_NUMPY, "NumPy not found")
     def test_lowrank_multivariate_normal_sample(self):
@@ -2176,7 +2176,7 @@ def test_multivariate_normal_log_prob(self):
         unbatched_prob = torch.stack([dist_unbatched[i].log_prob(x[:, i]) for i in range(5)]).t()
 
         self.assertEqual(batched_prob.shape, unbatched_prob.shape)
-        self.assertEqual(0.0, (batched_prob - unbatched_prob).abs().max(), atol=1e-3, rtol=0)
+        self.assertEqual(batched_prob, unbatched_prob, atol=1e-3, rtol=0)
 
     @unittest.skipIf(not TEST_NUMPY, "NumPy not found")
     def test_multivariate_normal_sample(self):
@@ -2331,7 +2331,7 @@ def test_wishart_log_prob(self):
         unbatched_prob = torch.stack([dist_unbatched[i].log_prob(x[:, i]) for i in range(5)]).t()
 
         self.assertEqual(batched_prob.shape, unbatched_prob.shape)
-        self.assertEqual(0.0, (batched_prob - unbatched_prob).abs().max(), atol=1e-3, rtol=0)
+        self.assertEqual(batched_prob, unbatched_prob, atol=1e-3, rtol=0)
 
     @unittest.skipIf(not TEST_NUMPY, "NumPy not found")
     def test_wishart_sample(self):

From 20b4834bfea089f01d110d42b0c73793c1348bc8 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 28 Nov 2022 18:03:41 +0000
Subject: [PATCH 1339/1922] Add meta implementation for _efficientzerotensor
 (#88936)

`_efficientzerotensor` is used in several backwards formulas, so its
lack of meta implementation makes those functions untracable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88936
Approved by: https://github.com/anjali411
---
 aten/src/ATen/native/TensorFactories.cpp   | 13 +++++++++++++
 aten/src/ATen/native/native_functions.yaml |  1 +
 test/inductor/test_torchinductor_opinfo.py |  2 --
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 7245cb77b1c50..037ba84181de0 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -1161,6 +1161,19 @@ Tensor _efficientzerotensor(IntArrayRef size,
     return out;
 }
 
+Tensor _efficientzerotensor_meta(IntArrayRef size,
+                                 c10::optional<ScalarType> dtype,
+                                 c10::optional<Layout> layout,
+                                 c10::optional<Device> device,
+                                 c10::optional<bool> pin_memory) {
+  auto device_ = device_or_default(device);
+  auto allocator = at::native::ZeroTensorAllocator(device_);
+  auto dtype_ = dtype_or_default(dtype);
+  auto zero_ks = at::DispatchKeySet(c10::DispatchKey::Meta) | at::DispatchKeySet(c10::DispatchKey::ZeroTensor);
+  auto out = at::detail::empty_generic(size, &allocator, zero_ks, dtype_, c10::nullopt);
+  return out;
+}
+
 Tensor& zeros_sparse_out(IntArrayRef size, Tensor& result) {
   result.sparse_resize_and_clear_(size, size.size(), 0.);
   return result;
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 2c32062e9a8c3..8cab3667e142b 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -5772,6 +5772,7 @@
   dispatch:
     CPU: _efficientzerotensor
     CUDA: _efficientzerotensor_cuda
+    Meta: _efficientzerotensor_meta
   autogen: _efficientzerotensor.out
 
 - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index c9a9147830e66..f7ac61fafa5a8 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -233,7 +233,6 @@ def process(device_type):
     "scatter_reduce.sum": {f16},
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
-    "sgn": {f16, f32, f64},
     "sparse.sampled_addmm": {f32, f64},
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
@@ -332,7 +331,6 @@ def process(device_type):
     "round.decimals_3": {f16},
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
-    "sgn": {f16, f32, f64},
     "sparse.sampled_addmm": {f32, f64},
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},

From 3b6cb5e8d8926a0e341d9c497d6948398d80adb7 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Mon, 28 Nov 2022 21:36:01 +0000
Subject: [PATCH 1340/1922] [Vulkan][TCC] Fix conv2d pack biases (#89568)

Summary: Fixed bug on pack_biases, where the weight scale and zero point were being assigned to the bias.

Test Plan:
On Mac
```
cd ~/fbsource
buck1 run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck1 build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_quantized_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_quantized_api_test
adb shell "/data/local/tmp/vulkan_quantized_api_test"
```

Reviewed By: SS-JIA

Differential Revision: D41350358

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89568
Approved by: https://github.com/salilsdesai
---
 aten/src/ATen/native/vulkan/ops/Convolution.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
index 63fb00d6ee0a3..8a8f63424df05 100644
--- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
@@ -542,14 +542,14 @@ vTensor pack_biases(
   vTensor v_bias{
       api::context(),
       bias_rearranged.sizes(),
-      weight.options(),
+      bias_rearranged.options(),
       quantized ? api::StorageType::TEXTURE_3D : api::StorageType::TEXTURE_2D,
   };
 
   if (quantized) {
     v_bias.set_is_quantized();
-    v_bias.set_scale(weight.q_scale());
-    v_bias.set_zero_point(weight.q_zero_point());
+    v_bias.set_scale(bias_rearranged.q_scale());
+    v_bias.set_zero_point(bias_rearranged.q_zero_point());
   }
 
   pack_cpu_to_vulkan(bias_rearranged, v_bias);

From 66d6ece96b9003e096410a0762f73a36fbff20ea Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Mon, 28 Nov 2022 21:44:09 +0000
Subject: [PATCH 1341/1922] [LTC] Metrics can be reset too (#89606)

Summary:
This change allow MetricsArena to ResetMetrics too. And then rename Reset to ResetCounters given that's what it does for real.

This matches pytorch/xla#4109, and is paired with pytorch/xla#4245.

Test Plan:
CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89606
Approved by: https://github.com/JackCaoG
---
 .github/ci_commit_pins/xla.txt   |  2 +-
 torch/csrc/lazy/core/metrics.cpp | 47 ++++++++++++++++++++++++--------
 torch/csrc/lazy/core/metrics.h   |  9 +++++-
 torch/csrc/lazy/python/init.cpp  |  6 ++--
 4 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 43527ab040fc0..6e8049d330f4d 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-b969cba3410799d74981ade37a5f07c2c12d33ff
+640a5b8a8abba13be7fad286a5bbd30a5e024164
diff --git a/torch/csrc/lazy/core/metrics.cpp b/torch/csrc/lazy/core/metrics.cpp
index 78aa7f15a5260..7f12793a66001 100644
--- a/torch/csrc/lazy/core/metrics.cpp
+++ b/torch/csrc/lazy/core/metrics.cpp
@@ -106,7 +106,7 @@ MetricsArena* MetricsArena::Get() {
   return arena;
 }
 
-void MetricsArena::Reset() {
+void MetricsArena::ResetCounters() {
   for (auto& pair : counters_) {
     if (pair.second) {
       pair.second->Reset();
@@ -114,6 +114,14 @@ void MetricsArena::Reset() {
   }
 }
 
+void MetricsArena::ResetMetrics() {
+  for (auto& pair : metrics_) {
+    if (pair.second) {
+      pair.second->Reset();
+    }
+  }
+}
+
 void MetricsArena::RegisterMetric(
     const std::string& name,
     MetricReprFn repr_fn,
@@ -141,6 +149,9 @@ void MetricsArena::ForEachMetric(
     const std::function<void(const std::string&, MetricData*)>& metric_func) {
   std::lock_guard<std::mutex> lock(lock_);
   for (auto& name_data : metrics_) {
+    if (!name_data.second->IsValid()) {
+      continue;
+    }
     metric_func(name_data.first, name_data.second.get());
   }
 }
@@ -157,17 +168,19 @@ void MetricsArena::ForEachCounter(
 
 std::vector<std::string> MetricsArena::GetMetricNames() {
   std::vector<std::string> names;
-  std::lock_guard<std::mutex> lock(lock_);
-  for (auto& name_data : metrics_) {
-    names.push_back(name_data.first);
-  }
+  ForEachMetric([&names](const std::string& name, MetricData* data) {
+    names.push_back(name);
+  });
   return names;
 }
 
 MetricData* MetricsArena::GetMetric(const std::string& name) {
   std::lock_guard<std::mutex> lock(lock_);
   auto it = metrics_.find(name);
-  return it != metrics_.end() ? it->second.get() : nullptr;
+  if (it == metrics_.end()) {
+    return nullptr;
+  }
+  return it->second->IsValid() ? it->second.get() : nullptr;
 }
 
 std::vector<std::string> MetricsArena::GetCounterNames() {
@@ -230,6 +243,14 @@ std::vector<Sample> MetricData::Samples(
   return samples;
 }
 
+void MetricData::Reset() {
+  std::lock_guard<std::mutex> lock(lock_);
+  count_ = 0;
+  // Don't clear. samples_ are init with placeholders.
+  samples_ = std::vector<Sample>(samples_.size());
+  accumulator_ = 0.0;
+}
+
 Metric::Metric(std::string name, MetricReprFn repr_fn, size_t max_samples)
     : name_(std::move(name)),
       repr_fn_(std::move(repr_fn)),
@@ -362,12 +383,14 @@ std::string CreateMetricReport(
     const std::vector<std::string>& metric_names) {
   MetricsArena* arena = MetricsArena::Get();
   std::stringstream ss;
-  for (const std::string& metric_name : metric_names) {
-    MetricData* data = arena->GetMetric(metric_name);
-    if (data && data->TotalSamples() > 0) {
-      EmitMetricInfo(metric_name, data, &ss);
-    }
-  }
+  std::set<std::string> metric_name_set(
+      metric_names.begin(), metric_names.end());
+  arena->ForEachMetric(
+      [&ss, &metric_name_set](const std::string& name, MetricData* data) {
+        if (metric_name_set.find(name) != metric_name_set.end()) {
+          EmitMetricInfo(name, data, &ss);
+        }
+      });
   std::set<std::string> counter_name_set(
       counter_names.begin(), counter_names.end());
   arena->ForEachCounter(
diff --git a/torch/csrc/lazy/core/metrics.h b/torch/csrc/lazy/core/metrics.h
index 2e263cc7f00b3..40bc606326eea 100644
--- a/torch/csrc/lazy/core/metrics.h
+++ b/torch/csrc/lazy/core/metrics.h
@@ -55,6 +55,12 @@ class TORCH_API MetricData {
     return repr_fn_(value);
   }
 
+  void Reset();
+
+  bool IsValid() const {
+    return TotalSamples() > 0;
+  }
+
  private:
   mutable std::mutex lock_;
   MetricReprFn repr_fn_;
@@ -93,7 +99,8 @@ class TORCH_API MetricsArena {
  public:
   static MetricsArena* Get();
 
-  void Reset();
+  void ResetCounters();
+  void ResetMetrics();
 
   // Registers a new metric in the global arena.
   void RegisterMetric(
diff --git a/torch/csrc/lazy/python/init.cpp b/torch/csrc/lazy/python/init.cpp
index aa876a33622c5..fe74d29d87ac1 100644
--- a/torch/csrc/lazy/python/init.cpp
+++ b/torch/csrc/lazy/python/init.cpp
@@ -126,8 +126,10 @@ void initLazyBindings(PyObject* module) {
         torch::lazy::LazyGraphExecutor::Get()->WaitDeviceOps({});
       },
       py::arg("devices"));
-  lazy.def(
-      "_reset_metrics", []() { torch::lazy::MetricsArena::Get()->Reset(); });
+  lazy.def("_reset_metrics", []() {
+    torch::lazy::MetricsArena::Get()->ResetCounters();
+    torch::lazy::MetricsArena::Get()->ResetMetrics();
+  });
   lazy.def("_counter_names", []() { return torch::lazy::GetCounterNames(); });
   lazy.def(
       "_metrics_report", []() { return torch::lazy::CreateMetricReport(); });

From c881e871544d0397b6178f328c837e2d0e7487eb Mon Sep 17 00:00:00 2001
From: WeberXie <3942083+weberxie@users.noreply.github.com>
Date: Mon, 28 Nov 2022 21:49:26 +0000
Subject: [PATCH 1342/1922] =?UTF-8?q?[skip=20ci]=20documentation=20update?=
 =?UTF-8?q?=20for=20the=20kwargs=20defaults=20section=20of=20fun=E2=80=A6?=
 =?UTF-8?q?=20(#89719)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In this doc, it's better to multiply the scale instead of the constant 4.0 to illustrate the default of kwargs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89719
Approved by: https://github.com/kit1980, https://github.com/malfet
---
 functorch/_src/eager_transforms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/functorch/_src/eager_transforms.py b/functorch/_src/eager_transforms.py
index 6144318edf3a3..3f02b7fa3a1ed 100644
--- a/functorch/_src/eager_transforms.py
+++ b/functorch/_src/eager_transforms.py
@@ -233,7 +233,7 @@ def vjp(func: Callable, *primals, has_aux: bool = False):
 
         >>> x = torch.randn([5])
         >>> def f(x, scale=4.):
-        >>>   return x * 4.
+        >>>   return x * scale
         >>>
         >>> (_, vjpfunc) = functorch.vjp(f, x)
         >>> vjps = vjpfunc(torch.ones_like(x))

From c1690602947cbc7b4ac51ef163c286b98605869c Mon Sep 17 00:00:00 2001
From: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>
Date: Mon, 28 Nov 2022 23:20:16 +0000
Subject: [PATCH 1343/1922] [CUDA graphs] Add warning if captured graph is
 empty (#88754)

Fixes #87894

This PR adds a warning if captured graph is empty (consists of zero nodes).
The example snippet where would it be useful:

```python
import torch

x = torch.randn(10)
z = torch.zeros(10)

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    z = x * x
# Warn user
```

and in #87894

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88754
Approved by: https://github.com/ezyang
---
 aten/src/ATen/cuda/CUDAGraph.cpp |  7 +++++++
 test/test_cuda.py                | 13 +++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp
index 24ee0b19ab90c..2d989d884ee34 100644
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@@ -179,6 +179,13 @@ void CUDAGraph::capture_end() {
               "when capture began");
   wholegraph_increment_ = gen->capture_epilogue();
 
+  size_t numCUDAGraphNodes = 0;
+  AT_CUDA_CHECK(cudaGraphGetNodes(graph_, NULL, &numCUDAGraphNodes));
+  if (numCUDAGraphNodes == 0) {
+      TORCH_WARN("The CUDA Graph is empty. This ususally means that the graph was ",
+                 "attempted to be captured on wrong device or stream.");
+  }
+
   // Now that we've instantiated graph_ into graph_exec_,
   // we don't need graph_ anymore.
   AT_CUDA_CHECK(cudaGraphDestroy(graph_));
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 59f379487c43b..40eaaa97a3b7e 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -15,6 +15,7 @@
 import tempfile
 import threading
 import unittest
+import warnings
 from random import randint
 
 import torch
@@ -3291,6 +3292,18 @@ def test_graph_capture_simple(self):
 
         self.assertTrue(b.sum().item() == 11000.)
 
+    @unittest.skipIf((not TEST_CUDA) or
+                     TEST_WITH_ROCM or
+                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    def test_graph_warn_if_has_zero_nodes(self):
+        with warnings.catch_warnings(record=True) as caught:
+            g = torch.cuda.CUDAGraph()
+            s = torch.cuda.Stream()
+            with torch.cuda.stream(s):
+                g.capture_begin()
+                g.capture_end()
+        self.assertTrue(any("The CUDA Graph is empty" in str(w.message) for w in caught))
+
     @unittest.skipIf((not TEST_CUDA) or
                      TEST_WITH_ROCM or
                      int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")

From 14c26e016fc2a64ee023e338379fd2463647969a Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Mon, 28 Nov 2022 23:21:35 +0000
Subject: [PATCH 1344/1922] [Checkpoint][2D][3/N] Add nested_tensors for
 distributed checkpoint to core distributed  (#89501)

This PR moves nested_tensors to torch.distributed.checkpoint. This is a pre-req for enabling 2D checkpoint.

This flattens sharded tensors in state_dict. It is used when saving and loading FSDP SHARDED_STATE_DICT.

Docstring, individual and integration test will be added in the following PRs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89501
Approved by: https://github.com/wanchaol
---
 torch/distributed/checkpoint/nested_tensor.py | 117 ++++++++++++++++++
 torch/distributed/checkpoint/utils.py         |  20 ++-
 2 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 torch/distributed/checkpoint/nested_tensor.py

diff --git a/torch/distributed/checkpoint/nested_tensor.py b/torch/distributed/checkpoint/nested_tensor.py
new file mode 100644
index 0000000000000..4ab68c81b1a9a
--- /dev/null
+++ b/torch/distributed/checkpoint/nested_tensor.py
@@ -0,0 +1,117 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+import copy
+
+import torch.distributed as dist
+from torch.distributed.remote_device import _remote_device
+
+from torch.distributed.checkpoint.metadata import (
+    STATE_DICT_TYPE,
+)
+from torch.distributed._shard.sharded_tensor import (
+    Shard,
+    ShardMetadata,
+    ShardedTensor,
+)
+
+from torch.distributed._shard.sharded_tensor.metadata import (
+    ShardedTensorMetadata,
+)
+
+
+from .traverse import (
+    OBJ_PATH,
+    traverse_state_dict,
+    set_element,
+    STATE_DICT_ITEM,
+)
+
+from .utils import _element_wise_add
+
+
+# TODO: update docstring for nested_tensor.py
+def flatten_sharded_tensors(state_dict: STATE_DICT_TYPE) -> STATE_DICT_TYPE:
+    """
+    Transform ``state_dict`` by flattening all nested ShardedTensor instances found.
+    The resulting ShardedTensor instances are only correct regarding the local shard and
+    MUST not be used for any other purpose but checkpointing, no operator will work with them.
+    This function should be used in conjunction with a state_dict produced by FSDP's
+    StateDictType.SHARDED_STATE_DICT methods.
+    """
+    new_state_dict: STATE_DICT_TYPE = {}
+
+    def rewrite_dict(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
+        if not isinstance(value, ShardedTensor):
+            set_element(new_state_dict, path, value)
+            return
+        shards = value.local_shards()
+        if len(shards) == 0:
+            return
+        if len(shards) != 1:
+            raise ValueError(
+                f"Cannot handle outer tensor with more than 1 shard {path} -- {len(shards)}"
+            )
+        outer_shard = shards[0]
+
+        inner_st = outer_shard.tensor
+        if not isinstance(inner_st, ShardedTensor):
+            set_element(new_state_dict, path, value)
+            return
+
+        if len(inner_st.local_shards()) != 1:
+            raise ValueError(
+                "Cannot handle inner tensor with more than 1 shard"
+            )
+        inner_shard = inner_st.local_shards()[0]
+
+        local_shards = [
+            Shard(
+                tensor=inner_shard.tensor,
+                metadata=ShardMetadata(
+                    shard_offsets=_element_wise_add(
+                        outer_shard.metadata.shard_offsets,
+                        inner_shard.metadata.shard_offsets,
+                    ),
+                    shard_sizes=inner_shard.metadata.shard_sizes,
+                    placement=f"rank:{dist.get_rank()}/{inner_shard.tensor.device}",
+                ),
+            )
+        ]
+
+        st_meta: ShardedTensorMetadata = copy.deepcopy(value.metadata())
+        other_rank = 0 if dist.get_rank() > 0 else 1
+        # Remove the outer ST shard the inner ST covers
+        for i, shard_md in enumerate(st_meta.shards_metadata):
+            if shard_md.shard_offsets == outer_shard.metadata.shard_offsets:
+                st_meta.shards_metadata.pop(i)
+                break
+
+        # blame other rank for the other shards
+        for shard_md in st_meta.shards_metadata:
+            shard_md.placement = _remote_device(f"rank:{other_rank}/cuda:0")
+
+        # Add other inner shards from the inner tensor
+        for inner_md in inner_st.metadata().shards_metadata:
+            if inner_md.shard_offsets != inner_shard.metadata.shard_offsets:
+                st_meta.shards_metadata.append(
+                    ShardMetadata(
+                        shard_offsets=_element_wise_add(
+                            outer_shard.metadata.shard_offsets,
+                            inner_md.shard_offsets,
+                        ),
+                        shard_sizes=inner_md.shard_sizes,
+                        placement=f"rank:{other_rank}/cuda:0",
+                    )
+                )
+
+        # Finally add this shard
+        st_meta.shards_metadata.append(local_shards[0].metadata)
+
+        st = ShardedTensor._init_from_local_shards_and_global_metadata(
+            local_shards=local_shards,
+            sharded_tensor_metadata=st_meta,
+        )
+        set_element(new_state_dict, path, st)
+
+    traverse_state_dict(state_dict, rewrite_dict)
+    return new_state_dict
diff --git a/torch/distributed/checkpoint/utils.py b/torch/distributed/checkpoint/utils.py
index a8d2a42d0fca6..7a3c259474b5b 100644
--- a/torch/distributed/checkpoint/utils.py
+++ b/torch/distributed/checkpoint/utils.py
@@ -1,4 +1,14 @@
-from typing import List, Callable, Optional, Union, TypeVar, Dict, Any, cast
+from typing import (
+    List,
+    Callable,
+    Optional,
+    Union,
+    TypeVar,
+    Dict,
+    Any,
+    cast,
+    Sequence,
+)
 import torch.distributed as dist
 from .api import (
     CheckpointException,
@@ -331,3 +341,11 @@ def find_state_dict_object(
             f"FQN: '{index.fqn}' is not a ShardedTensor, can't find by offset: '{index.offset}'"
         )
     return obj
+
+
+def _element_wise_add(a: Sequence[int], b: Sequence[int]) -> List[int]:
+    return [i_a + i_b for i_a, i_b in zip(a, b)]
+
+
+def _element_wise_sub(a: Sequence[int], b: Sequence[int]) -> List[int]:
+    return [i_a - i_b for i_a, i_b in zip(a, b)]

From 30d83fb071ae3aa6ee686c39127ed123b1706825 Mon Sep 17 00:00:00 2001
From: Angela Yi <yiangela7@gmail.com>
Date: Mon, 28 Nov 2022 23:39:57 +0000
Subject: [PATCH 1345/1922] Add bits tensor types (#88594)

TODO (in later PRs)
- [ ] the other bits8, 4x2, 2x4, 1x8
- [ ] bits printer function
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88594
Approved by: https://github.com/ezyang
---
 aten/src/ATen/DLConvertor.cpp      |  7 ++++
 c10/core/ScalarType.h              | 64 +++++++++++++++++++++---------
 c10/test/util/bits16_test.py       | 43 ++++++++++++++++++++
 c10/test/util/bits_test.py         | 56 ++++++++++++++++++++++++++
 c10/util/bits.h                    | 61 ++++++++++++++++++++++++++++
 torch/csrc/utils/tensor_dtypes.cpp | 10 +++++
 6 files changed, 223 insertions(+), 18 deletions(-)
 create mode 100644 c10/test/util/bits16_test.py
 create mode 100644 c10/test/util/bits_test.py
 create mode 100644 c10/util/bits.h

diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp
index 614dc46158e8f..542adb9698176 100644
--- a/aten/src/ATen/DLConvertor.cpp
+++ b/aten/src/ATen/DLConvertor.cpp
@@ -60,6 +60,13 @@ DLDataType getDLDataType(const Tensor& t) {
     case ScalarType::QUInt2x4:
       TORCH_CHECK(false, "QUInt/QInt types are not supported by dlpack");
       break;
+    case ScalarType::Bits1x8:
+    case ScalarType::Bits2x4:
+    case ScalarType::Bits4x2:
+    case ScalarType::Bits8:
+    case ScalarType::Bits16:
+      TORCH_CHECK(false, "Bit types are not supported by dlpack");
+      break;
     case ScalarType::Undefined:
       TORCH_CHECK(false, "Undefined is not a valid ScalarType");
     case ScalarType::NumOptions:
diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h
index 51de905def9c1..2fa3c9ceb4ea4 100644
--- a/c10/core/ScalarType.h
+++ b/c10/core/ScalarType.h
@@ -3,6 +3,7 @@
 #include <c10/util/BFloat16.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Half.h>
+#include <c10/util/bits.h>
 #include <c10/util/complex.h>
 #include <c10/util/qint32.h>
 #include <c10/util/qint8.h>
@@ -43,7 +44,12 @@ namespace c10 {
   _(c10::qint32, QInt32) /* 14 */                        \
   _(at::BFloat16, BFloat16) /* 15 */                     \
   _(c10::quint4x2, QUInt4x2) /* 16 */                    \
-  _(c10::quint2x4, QUInt2x4) /* 17 */
+  _(c10::quint2x4, QUInt2x4) /* 17 */                    \
+  _(c10::bits1x8, Bits1x8) /* 18 */                      \
+  _(c10::bits2x4, Bits2x4) /* 19 */                      \
+  _(c10::bits4x2, Bits4x2) /* 20 */                      \
+  _(c10::bits8, Bits8) /* 21 */                          \
+  _(c10::bits16, Bits16) /* 22 */
 
 // If you want to support ComplexHalf for real, add ComplexHalf
 // into this macro (and change the name).  But beware: convert()
@@ -272,6 +278,12 @@ static inline bool isQIntType(ScalarType t) {
       t == ScalarType::QUInt2x4;
 }
 
+static inline bool isBitsType(ScalarType t) {
+  return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 ||
+      t == ScalarType::Bits4x2 || t == ScalarType::Bits8 ||
+      t == ScalarType::Bits16;
+}
+
 static inline ScalarType toQIntType(ScalarType t) {
   switch (t) {
     case ScalarType::Byte:
@@ -309,6 +321,12 @@ static inline bool isSignedType(ScalarType t) {
     return std::numeric_limits<ctype>::is_signed;
 
   switch (t) {
+    case ScalarType::Bits1x8:
+    case ScalarType::Bits2x4:
+    case ScalarType::Bits4x2:
+    case ScalarType::Bits8:
+    case ScalarType::Bits16:
+      TORCH_CHECK(false, "Bits types are undefined");
     case ScalarType::ComplexHalf:
     case ScalarType::ComplexFloat:
     case ScalarType::ComplexDouble:
@@ -423,28 +441,38 @@ static inline ScalarType promoteTypes(ScalarType a, ScalarType b) {
         toString(b));
   }
 
+  if (isBitsType(a) && a == b) {
+    return a;
+  } else if (isBitsType(a) || isBitsType(b)) {
+    return ScalarType::Undefined;
+  }
+
   // this matrix has to be consistent with
   // AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS undefined is used where we
   // are not sure about the correct value for type promotion.
   static constexpr ScalarType _promoteTypesLookup[static_cast<int>(
       ScalarType::NumOptions)][static_cast<int>(ScalarType::NumOptions)] = {
-      /*        u1  i1  i2  i4  i8  f2  f4  f8  c2  c4  c8  b1  q1  q2  q3  bf*/
-      /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, ud, ud, ud, bf},
-      /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, ud, ud, ud, bf},
-      /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, ud, ud, ud, bf},
-      /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, ud, ud, ud, bf},
-      /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, ud, ud, ud, bf},
-      /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, ud, ud, ud, f4},
-      /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, ud, ud, ud, f4},
-      /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, ud, ud, ud, f8},
-      /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, ud, ud, ud, c4},
-      /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, ud, ud, ud, c4},
-      /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, ud, ud, ud, c8},
-      /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, ud, ud, ud, bf},
-      /* q1 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
-      /* q2 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
-      /* q3 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
-      /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, ud, ud, ud, bf},
+      // clang-format off
+      /*        u1  i1  i2  i4  i8  f2  f4  f8  c2  c4  c8  b1  q1  q2  q3  bf  q4  q5*/
+      /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, ud, ud, ud, bf, ud, ud},
+      /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, ud, ud, ud, bf, ud, ud},
+      /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, ud, ud, ud, bf, ud, ud},
+      /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, ud, ud, ud, bf, ud, ud},
+      /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, ud, ud, ud, bf, ud, ud},
+      /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, ud, ud, ud, f4, ud, ud},
+      /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, ud, ud, ud, f4, ud, ud},
+      /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, ud, ud, ud, f8, ud, ud},
+      /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, ud, ud, ud, c4, ud, ud},
+      /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, ud, ud, ud, c4, ud, ud},
+      /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, ud, ud, ud, c8, ud, ud},
+      /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, ud, ud, ud, bf, ud, ud},
+      /* q1 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
+      /* q2 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
+      /* q3 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
+      /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, ud, ud, ud, bf, ud, ud},
+      /* q4 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
+      /* q5 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
+      // clang-format on
   };
   return _promoteTypesLookup[static_cast<int>(a)][static_cast<int>(b)];
 }
diff --git a/c10/test/util/bits16_test.py b/c10/test/util/bits16_test.py
new file mode 100644
index 0000000000000..97a8220f16fc8
--- /dev/null
+++ b/c10/test/util/bits16_test.py
@@ -0,0 +1,43 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, TestCase
+from torch.utils._mode_utils import no_dispatch
+from torch.utils._pytree import tree_map
+
+class TensorSubclassDemo(torch.Tensor):
+    def __new__(cls, elem):
+        assert elem.dtype == torch.bits16
+        return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
+
+    def __init__(self, elem):
+        super().__init__()
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        def unwrap(t):
+            if isinstance(t, torch.Tensor):
+                with no_dispatch():
+                    return t.view(torch.int16)
+            return t
+
+        args = tree_map(unwrap, args)
+        kwargs = tree_map(unwrap, kwargs)
+        with no_dispatch():
+            out = func(*args, **kwargs)
+        return out.view(torch.bits16)
+
+    def __repr__(self) -> str:
+        with no_dispatch():
+            return f"TensorSubclassDemo{self.view(torch.int16)}"
+
+
+class TestBits16(TestCase):
+    def test(self):
+        t = torch.zeros(20, dtype=torch.int16).view(torch.bits16)
+        _ = torch.empty(20, dtype=torch.bits16)
+
+        s = TensorSubclassDemo(t)
+        s = s + 1
+
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/c10/test/util/bits_test.py b/c10/test/util/bits_test.py
new file mode 100644
index 0000000000000..c87c8428b29a1
--- /dev/null
+++ b/c10/test/util/bits_test.py
@@ -0,0 +1,56 @@
+import torch
+from torch.testing._internal.common_utils import run_tests, TestCase
+from torch.utils._mode_utils import no_dispatch
+from torch.utils._pytree import tree_map
+
+class Int16Tensor(torch.Tensor):
+    def __new__(cls, elem):
+        assert elem.dtype == torch.bits16
+        return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
+
+    def __init__(self, elem):
+        super().__init__()
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        def unwrap(t):
+            if isinstance(t, torch.Tensor):
+                with no_dispatch():
+                    return t.view(torch.int16)
+            return t
+        args = tree_map(unwrap, args)
+        kwargs = tree_map(unwrap, kwargs)
+
+        with no_dispatch():
+            out = func(*args, **kwargs)
+
+        def wrap(t):
+            if isinstance(t, torch.Tensor):
+                with no_dispatch():
+                    return t.view(torch.bits16)
+            return t
+        out = tree_map(wrap, out)
+        return out
+
+    def __repr__(self) -> str:
+        with no_dispatch():
+            t16 = self.view(torch.int16)
+            return f"TensorSubclassDemo{self.view(torch.int16)}"
+
+
+class TestBits(TestCase):
+    def test_types(self):
+        bits_types = [torch.bits1x8, torch.bits2x4, torch.bits4x2, torch.bits8, torch.bits16]
+        for bits_type in bits_types:
+            _ = torch.zeros(20, dtype=torch.int32).view(bits_type)
+            _ = torch.empty(20, dtype=bits_type)
+
+    def test_subclass(self):
+        t = torch.zeros(20, dtype=torch.int16).view(torch.bits16)
+        s = Int16Tensor(t)
+        s = s + 1 - 1
+        self.assertTrue(torch.allclose(s, torch.zeros(20, dtype=torch.bits16)))
+
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/c10/util/bits.h b/c10/util/bits.h
new file mode 100644
index 0000000000000..89abf454791ef
--- /dev/null
+++ b/c10/util/bits.h
@@ -0,0 +1,61 @@
+#pragma once
+#include <cstdint>
+
+#include <c10/macros/Macros.h>
+
+namespace c10 {
+
+/**
+ * bits1x8 is an uninterpreted dtype of a tensor with 1 bit (packed to byte
+ * boundary), without any semantics defined.
+ */
+struct alignas(1) bits1x8 {
+  using underlying = uint8_t;
+  uint8_t val_;
+  bits1x8() = default;
+  C10_HOST_DEVICE explicit bits1x8(uint8_t val) : val_(val) {}
+};
+
+/**
+ * bits2x4 is an uninterpreted dtype of a tensor with 2 bits (packed to byte
+ * boundary), without any semantics defined.
+ */
+struct alignas(1) bits2x4 {
+  using underlying = uint8_t;
+  uint8_t val_;
+  bits2x4() = default;
+  C10_HOST_DEVICE explicit bits2x4(uint8_t val) : val_(val) {}
+};
+
+/**
+ * bits4x2 is an uninterpreted dtype of a tensor with 4 bits (packed to byte
+ * boundary), without any semantics defined.
+ */
+struct alignas(1) bits4x2 {
+  using underlying = uint8_t;
+  uint8_t val_;
+  bits4x2() = default;
+  C10_HOST_DEVICE explicit bits4x2(uint8_t val) : val_(val) {}
+};
+
+/**
+ * bits8 is an uninterpreted dtype of a tensor with 8 bits, without any
+ * semantics defined.
+ */
+struct alignas(1) bits8 {
+  uint8_t val_;
+  bits8() = default;
+  C10_HOST_DEVICE explicit bits8(uint8_t val) : val_(val) {}
+};
+
+/**
+ * bits16 is an uninterpreted dtype of a tensor with 16 bits, without any
+ * semantics defined.
+ */
+struct alignas(2) bits16 {
+  uint16_t val_;
+  bits16() = default;
+  C10_HOST_DEVICE explicit bits16(uint16_t val) : val_(val) {}
+};
+
+} // namespace c10
diff --git a/torch/csrc/utils/tensor_dtypes.cpp b/torch/csrc/utils/tensor_dtypes.cpp
index 3e0e3acf38c29..07ed3297d557d 100644
--- a/torch/csrc/utils/tensor_dtypes.cpp
+++ b/torch/csrc/utils/tensor_dtypes.cpp
@@ -52,6 +52,16 @@ std::pair<std::string, std::string> getDtypeNames(at::ScalarType scalarType) {
       return std::make_pair("quint4x2", "");
     case at::ScalarType::QUInt2x4:
       return std::make_pair("quint2x4", "");
+    case at::ScalarType::Bits1x8:
+      return std::make_pair("bits1x8", "");
+    case at::ScalarType::Bits2x4:
+      return std::make_pair("bits2x4", "");
+    case at::ScalarType::Bits4x2:
+      return std::make_pair("bits4x2", "");
+    case at::ScalarType::Bits8:
+      return std::make_pair("bits8", "");
+    case at::ScalarType::Bits16:
+      return std::make_pair("bits16", "");
     default:
       throw std::runtime_error("Unimplemented scalar type");
   }

From 5296648d394b76cd318f8134c10745e8f0ca72b6 Mon Sep 17 00:00:00 2001
From: mantaionut <ionut@janeasystems.com>
Date: Mon, 28 Nov 2022 23:41:52 +0000
Subject: [PATCH 1346/1922] Re-enabled 3 reductions tests on Windows (#89567)

With PR #88089 the test_ref_small_input_masked_prod with int8,int16 and int32 tests no longer overflows on Windows so they can be re-enable.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89567
Approved by: https://github.com/cpuhrsch
---
 torch/testing/_internal/opinfo/definitions/_masked.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py
index 5a5ce8bc7e164..10ada2cba7436 100644
--- a/torch/testing/_internal/opinfo/definitions/_masked.py
+++ b/torch/testing/_internal/opinfo/definitions/_masked.py
@@ -465,9 +465,7 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
                 "test_reference_masked",
                 dtypes=(torch.bool, torch.int8, torch.int16, torch.int32),
             ),
-            # integer overflow
             DecorateInfo(
-                unittest.skip("Skipped!"),
                 "TestReductions",
                 "test_ref_small_input",
                 dtypes=(torch.int8, torch.int16, torch.int32),

From 1afca44a731e308b4663ff23d586e4ac5748eea6 Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Mon, 28 Nov 2022 23:49:14 +0000
Subject: [PATCH 1347/1922] [Checkpoint][2D][4/N] Add nested_dict for
 distributed checkpoint to core distributed  (#89537)

This PR moves nested_dict and its test to torch.distributed.checkpoint. This is a pre-req for enabling 2D checkpoint.

This provides the functionality to flatten a nested dict and unflatten a flattened dict.

Docstring will be added in the following PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89537
Approved by: https://github.com/fduwjj, https://github.com/wanchaol
---
 .../checkpoint/test_nested_dict.py            | 42 +++++++++++++
 torch/distributed/checkpoint/nested_dict.py   | 61 +++++++++++++++++++
 2 files changed, 103 insertions(+)
 create mode 100644 test/distributed/checkpoint/test_nested_dict.py
 create mode 100644 torch/distributed/checkpoint/nested_dict.py

diff --git a/test/distributed/checkpoint/test_nested_dict.py b/test/distributed/checkpoint/test_nested_dict.py
new file mode 100644
index 0000000000000..676c7c64de3fa
--- /dev/null
+++ b/test/distributed/checkpoint/test_nested_dict.py
@@ -0,0 +1,42 @@
+# Owner(s): ["oncall: distributed"]
+
+import torch
+from torch.testing._internal.common_utils import run_tests, TestCase
+from torch.distributed.checkpoint.nested_dict import (
+    flatten_state_dict,
+    unflatten_state_dict,
+)
+
+
+class TestFlattening(TestCase):
+    def test_flattening_round_trip(self) -> None:
+        state_dict = {
+            "key0": 1,
+            "key1": [1, 2],
+            "key2": {1: 2, 2: 3},
+            "key3": torch.tensor([1]),
+            "key4": [[torch.tensor(2), "x"], [1, 2, 3], {"key6": [44]}],
+        }
+
+        flatten_dict, mapping = flatten_state_dict(state_dict)
+        restored = unflatten_state_dict(flatten_dict, mapping)
+
+        self.assertEqual(state_dict, restored)
+
+    def test_mapping(self) -> None:
+        state_dict = {
+            "k0": [1],
+            "k2": [torch.tensor([1]), 99, [{"k3": torch.tensor(1)}]],
+            "k3": ["x", 99, [{"k3": "y"}]],
+        }
+
+        _, mapping = flatten_state_dict(state_dict)
+        self.assertIn(("k0",), mapping.values())
+        self.assertIn(("k2", 0), mapping.values())
+        self.assertIn(("k2", 1), mapping.values())
+        self.assertIn(("k2", 2, 0, "k3"), mapping.values())
+        self.assertIn(("k3",), mapping.values())
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/checkpoint/nested_dict.py b/torch/distributed/checkpoint/nested_dict.py
new file mode 100644
index 0000000000000..91c34fe39298c
--- /dev/null
+++ b/torch/distributed/checkpoint/nested_dict.py
@@ -0,0 +1,61 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import Dict, Tuple
+
+from torch.distributed.checkpoint.metadata import (
+    STATE_DICT_TYPE,
+)
+
+from .traverse import (
+    traverse_state_dict,
+    set_element,
+    OBJ_PATH,
+    STATE_DICT_ITEM,
+)
+
+"""
+TODO:
+Need to add ability to handle tuple, OrderedDict, NamedTuple.
+Update mappings from dict to a class.
+Change set_element to recreate the right type for tuple, OrderedDict, and NamedTuple.
+"""
+
+
+FLATTEN_MAPPING = Dict[str, OBJ_PATH]
+
+
+# TODO: Update Docstring for nested_dict.py
+def flatten_state_dict(
+    state_dict: STATE_DICT_TYPE,
+) -> Tuple[STATE_DICT_TYPE, FLATTEN_MAPPING]:
+    """
+    Flatten ``state_dict`` made of nested dicts and lists into a top level dictionary.
+    Use ``unflatten_state_dict`` to revert this process.
+    Returns:
+        A tuple with the flaten state_dict and a mapping from original to new state_dict.
+    N.B. The new keys are derived from the object paths, joined by dot.
+        For example: ``{ 'a': {'b':...}}`` results in the key `a.b`.
+    """
+    flattened: STATE_DICT_TYPE = {}
+    mappings: FLATTEN_MAPPING = {}
+
+    def flat_copy(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
+        new_fqn = ".".join(map(str, path))
+        if new_fqn in flattened:
+            raise ValueError(f"duplicated flatten key {new_fqn}")
+        flattened[new_fqn] = value
+        mappings[new_fqn] = path
+
+    traverse_state_dict(state_dict, flat_copy)
+    return flattened, mappings
+
+
+def unflatten_state_dict(
+    state_dict: STATE_DICT_TYPE, mapping: FLATTEN_MAPPING
+) -> STATE_DICT_TYPE:
+    """
+    Restore the original nested state_dict according to ``mapping`` and the flattened ``state_dict``
+    """
+    nested: STATE_DICT_TYPE = {}
+    for key, value in state_dict.items():
+        set_element(nested, mapping[key], value)
+    return nested

From 403a557decac79d635f5b886607c59e872dd9b5e Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 28 Nov 2022 20:53:08 +0000
Subject: [PATCH 1348/1922] [FSDP] Fix `nn.Parameter` usage for 2D and
 `use_orig_params=True` (#89782)

This ensures that all elements of `FlatParameter._params` and `FlatParameter._shared_params` are `nn.Parameter`s (as expected). This was violated by the local tensor of a `DTensor` when using 2D parallelism. To fix the breakage, we simply wrap with `nn.Parameter` if needed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89782
Approved by: https://github.com/fduwjj
---
 torch/distributed/fsdp/flat_param.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index b5892bca683a2..3ffb5bd123df3 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -373,8 +373,8 @@ def _init_flat_param(
         prefixed_param_names: List[str] = []
         shared_param_infos: List[SharedParamInfo] = []
         shared_param_memo: Dict[nn.Parameter, Tuple[nn.Module, str, str]] = {}
-        params_to_flatten: List[nn.Parameter] = []
-        shared_params: List[nn.Parameter] = []
+        params_to_flatten: List[Union[torch.Tensor, nn.Parameter]] = []
+        shared_params: List[Union[torch.Tensor, nn.Parameter]] = []
         param_extensions: List[Any] = []
         dtype: Optional[torch.dtype] = None
         requires_grad: Optional[bool] = None
@@ -436,6 +436,16 @@ def _init_flat_param(
         self.flat_param = FlatParamHandle.flatten_params(
             params_to_flatten, requires_grad
         )
+        # For `use_orig_params=True`, ensure that the logical parameters are
+        # `nn.Parameter`s (and not plain `torch.Tensor`)
+
+        def convert_to_params(
+            tensors: List[Union[torch.Tensor, nn.Parameter]]
+        ) -> List[nn.Parameter]:
+            return [
+                t if isinstance(t, nn.Parameter) else nn.Parameter(t) for t in tensors
+            ]
+
         self.flat_param._init_metadata(
             param_infos,
             numels,
@@ -443,8 +453,8 @@ def _init_flat_param(
             prefixed_param_names,
             shared_param_infos,
             param_extensions,
-            params_to_flatten if use_orig_params else None,
-            shared_params if use_orig_params else None,
+            convert_to_params(params_to_flatten) if use_orig_params else None,
+            convert_to_params(shared_params) if use_orig_params else None,
         )
 
     @staticmethod
@@ -1307,7 +1317,10 @@ def _use_unsharded_views(self, as_params: bool) -> None:
                         assert tensor is not None  # mypy
                         param_var = tensor
                 setattr(module, param_name, param_var)
-                if self._use_orig_params and self._training_state == HandleTrainingState.FORWARD:
+                if (
+                    self._use_orig_params
+                    and self._training_state == HandleTrainingState.FORWARD
+                ):
                     module._parameters[param_name] = param_var  # type: ignore[assignment]
         for i, (
             param_name,
@@ -1339,7 +1352,10 @@ def _use_unsharded_views(self, as_params: bool) -> None:
                 module.register_parameter(param_name, prim_param)
             else:
                 setattr(module, param_name, prim_param)
-                if self._use_orig_params and self._training_state == HandleTrainingState.FORWARD:
+                if (
+                    self._use_orig_params
+                    and self._training_state == HandleTrainingState.FORWARD
+                ):
                     module._parameters[param_name] = prim_param  # type: ignore[assignment]
 
     def _use_unsharded_grad_views(self) -> None:

From d40fbfc99b1d1b1ba0564d9f8f29ff1e2e07d79f Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Tue, 29 Nov 2022 00:55:06 +0000
Subject: [PATCH 1349/1922] Fix archive issue impacting summary stat diff
 (#89789)

Summary stat diff was reporting diff between previous day and the day before that, instead of today and previous day. Issue was because summary stats were not uploaded to the archive before the summary stat differ was run.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89789
Approved by: https://github.com/anijain2305
---
 benchmarks/dynamo/runner.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 38bfa3160625d..90702c50bea76 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -1340,6 +1340,13 @@ def extract(key):
             parse_logs(
                 args, dtypes, suites, devices, compilers, flag_compilers, output_dir
             )
+            if not args.no_update_archive:
+                archive(
+                    output_dir,
+                    args.dashboard_archive_path,
+                    args.archive_name,
+                    dtypes[0],
+                )
 
     if args.update_dashboard:
         DashboardUpdater(args).update()

From a8b363d98d525262b81498ebf205579f1c09f2ac Mon Sep 17 00:00:00 2001
From: cyy <cyyever@outlook.com>
Date: Tue, 29 Nov 2022 00:55:32 +0000
Subject: [PATCH 1350/1922] [functorch] fix possible overflow (#83389)

Fix some errors detected by static analysis.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/83389
Approved by: https://github.com/zou3519
---
 functorch/csrc/dim/dim.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp
index c43a6c7a9cff7..6fc0038bfc958 100644
--- a/functorch/csrc/dim/dim.cpp
+++ b/functorch/csrc/dim/dim.cpp
@@ -1158,7 +1158,7 @@ struct EnableAllLayers {
         }
     }
 private:
-    int64_t levels_start_;
+    int64_t levels_start_{};
     Slice<py::hdl<Dim>> levels_to_dim_;
 };
 
@@ -2687,7 +2687,7 @@ static PyObject* py_stack(PyObject *_,
         auto d = _wrap_dim(dim, ndim, false);
         auto idx = result_levels.index(d);
         if (!idx) {
-            py::raise_error(PyExc_TypeError, "Dimension %R does not exist in inputs", dim);
+            py::raise_error(PyExc_TypeError, "Dimension %R does not exist in inputs", dim.ptr());
         }
         rawdim = *idx;
     }

From e966945b02fe99dc33e08ef4d0a4c82b4162266e Mon Sep 17 00:00:00 2001
From: Jiong Gong <jiong.gong@intel.com>
Date: Tue, 29 Nov 2022 00:58:46 +0000
Subject: [PATCH 1351/1922] [Inductor] Add an option to mark wrapper call in
 PyTorch profiler (#89674)

This PR adds an option `config.profiler_mark_wrapper_call` (disabled by default) to mark the duration of wrapper call in the PyTorch profiler. This makes it easy to identify the duration and start/end of each wrapper call in the profiler output.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89674
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 16 ++++++++++++++++
 torch/_inductor/codegen/wrapper.py  |  8 +++++++-
 torch/_inductor/config.py           |  2 ++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index ea112162c8a92..a95e4056284b6 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4677,6 +4677,22 @@ def fn(x, y):
             [torch.randn((4, 2)), torch.randn((4))],
         )
 
+    @patch.object(config, "profiler_mark_wrapper_call", True)
+    def test_profiler_mark_wrapper_call(self):
+        from torch.profiler import profile
+
+        @torch._dynamo.optimize("inductor", nopython=True)
+        def fn(a, b):
+            return a + b
+
+        a = torch.rand((100,))
+        b = torch.rand((100,))
+        with profile() as prof:
+            fn(a, b)
+        assert "inductor_wrapper_call" in (
+            e.name for e in prof.profiler.function_events
+        )
+
 
 if HAS_CPU:
 
diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py
index cf8fb46c84bdc..251c21e1364e9 100644
--- a/torch/_inductor/codegen/wrapper.py
+++ b/torch/_inductor/codegen/wrapper.py
@@ -1,4 +1,5 @@
 import collections
+import contextlib
 import dataclasses
 import functools
 import hashlib
@@ -330,7 +331,12 @@ def generate(self):
         result.splice(self.prefix)
 
         out_names = V.graph.get_output_names()
-        with result.indent():
+        with contextlib.ExitStack() as stack:
+            stack.enter_context(result.indent())
+            if config.profiler_mark_wrapper_call:
+                result.writeline("from torch.profiler import record_function")
+                result.writeline("with record_function('inductor_wrapper_call'):")
+                stack.enter_context(result.indent())
             while (
                 self.lines
                 and isinstance(self.lines[-1], MemoryPlanningLine)
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index ac97ddf563f19..1639fc6aa860a 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -87,6 +87,8 @@
 # Fx-based linear/matmul/bmm + permute/transpose vertical fusion
 permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"
 
+# Mark the wrapper call in PyTorch profiler
+profiler_mark_wrapper_call = False
 
 # config specific to codegen/cpp.pp
 class cpp:

From 637432c9511fa9f1ac95e49998a79cc8dc62b8c2 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Mon, 28 Nov 2022 21:08:39 +0000
Subject: [PATCH 1352/1922] Special-case fsdp wrapped modules to be
 Unspecialized (#89330)

### Summary
Making dynamo treat the nn.Modules inside FSDP wrappers as 'Unspecialized'
results in dynamo-produced graphs where nn.module parameters are inputs
to the graph rather than attributes of the outer graphmodule.

This helps in FSDP since it forces dynamo to pick the latest copy
of the parameters off the user's nn.Module (which FSDP mutates every pre_forward),
solving the ordering issue in backward.

### Details
Imagine this toy model
```
class MyModule(torch.nn.Module):
    def __init__(self, a, b):
        super(MyModule, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(a, b),
            nn.ReLU(),
        )
    def forward(self, x):
        return self.net(x)

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net = nn.Sequential(
            *[MyModule(10, 10000)]
            + [MyModule(10000, 1000)]
            + [MyModule(1000, 5)]
        )

    def forward(self, x):
        return self.net(x)
```
Where FSDP is recursively wrapped around each `MyModule`, then dynamo-compiled, with dynamo already configured to skip/break in FSDP code.  You'd expect to get 3 compiled AOT functions, corresponding to the contents of `MyModule`, and then see FSDP's communication ops happen inbetween them (eagerly).  This almost happens (everything works out fine in forward), but in backward there is an ordering issue.

FSDP creates a flat buffer for all the parameters that are bucketed together, and then creates views into this buffer to replace the original parameters.  On each iteration of forward, it creates a new view after 'filling' the flatbuffer with data from an all-gather operation, to 'unshard' the parameters from remote devices.  Dynamo traces the first such view and stores it in a compiled graphmodule.

During  tracing, we see (1) view created for first MyModule, (2) compile first MyModule, (3) ... for the rest of layers

Then during runtime,  we see (A)  view created for first MyModule (and orphaned), (B) execute first compiled MyModule, using old view, ...

This is a problem, because we want backward hooks to run right after each compiled-backward, but autograd executes those hooks in an order mirroring their execution order during forward.  Since we are forever using the views created during steps (1, 3, ..  N), which all happen before the steps (A, B, ...),  this means that all the hooks will happen after all the compiled backwards.  An illustration of the problem - a torchviz graph showing the 2 possible orderings of autograd, and a profile showing the view-backwards ops happening after all the compiled backwards, and before all the backward hooks.

<img width="2069" alt="image" src="https://user-images.githubusercontent.com/4984825/202828002-32dbbd15-8fc3-4281-93e9-227ab5e32683.png">
<img width="2069" alt="image" src="https://user-images.githubusercontent.com/4984825/202828632-33e40729-9a7f-4e68-9ce1-571e3a8dd2dd.png">

A solution is to make dynamo not specialize on these nn modules.  It is worth pointing out that this nn.module specialization is de-facto failing, as we are modifying .parameters and this bypasses dynamo's __setattr__ monkeypatch, which should have automatically kicked us out to Unspecialized and forced a recompile.

After unspecializing, the new views (created during steps A,  C, ...) are actually _used_ at runtime by the module, making their creation order interleaved, making autograd execute their backwards interleaved.

The new torchviz graph (this time with names added for the view tensors):
<img width="2043" alt="image" src="https://user-images.githubusercontent.com/4984825/202828480-d30005ba-0d20-45d8-b647-30b7ff5e91d3.png">

And a new profile showing the interleaving of compiled backwards and hooks, allowing overlapping of reduce-scatter.
<img width="2293" alt="image" src="https://user-images.githubusercontent.com/4984825/202828533-bb20a041-19b8-499c-b3cf-02808933df47.png">

@jansel @davidberard98 @aazzolini @mrshenli @awgu @ezyang @soumith @voznesenskym @anijain2305

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89330
Approved by: https://github.com/davidberard98
---
 benchmarks/dynamo/distributed.py              |  5 ++++
 torch/_dynamo/variables/builder.py            |  4 ++-
 torch/_dynamo/variables/nn_module.py          |  7 ++++-
 .../fsdp/fully_sharded_data_parallel.py       | 28 ++++++++++++++++++-
 4 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py
index dee44210e93c8..194b3906de03f 100644
--- a/benchmarks/dynamo/distributed.py
+++ b/benchmarks/dynamo/distributed.py
@@ -17,6 +17,8 @@
     from common import timed
     from dist_util import apply_fsdp, cleanup, get_model, model_iter_fn, setup
 
+log = logging.getLogger(__name__)
+
 
 def torchviz_model(args, model, inputs, rank):
     from torchviz import make_dot
@@ -81,6 +83,9 @@ def move_tensor(maybe_tensor):
             dynamo.config.log_level = logging.DEBUG
         if args.dynamo_optimize_ddp:
             dynamo.config.optimize_ddp = True
+        if args.dynamo == "inductor" and args.fsdp:
+            torch._inductor.config.triton.cudagraphs = False
+            log.warn("disabling inductor cudagraphs for compatibility with FSDP")
 
         def print_compile(gm, ex):
             print(
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 843e3d1edbbb4..ed88b515ff579 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -307,9 +307,11 @@ def index_source(key):
                 return self.tx.output.side_effects.track_object_existing(
                     self.source, value, result
                 )
-            elif issubclass(
+            elif getattr(value, "_is_fsdp_managed_module", False) or issubclass(
                 value.__class__, torch.nn.parallel.distributed.DistributedDataParallel
             ):
+                # See note [Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
+                # in fully_sharded_data_parallel.py for more information
                 return UnspecializedNNModuleVariable(
                     value, guards=make_guards(GuardBuilder.TYPE_MATCH)
                 )
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 48557f41d0b23..7dbd0ba331f9d 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -567,7 +567,12 @@ def call_method(
                 return variables.ListIteratorVariable(
                     items, mutable_local=MutableLocal(), **options
                 )
-
+            elif isinstance(method, staticmethod):
+                return tx.inline_user_function_return(
+                    variables.UserFunctionVariable(method.__func__, **options),
+                    args,
+                    kwargs,
+                )
             if id(method.__code__) in self._nn_module_method_ids():
                 unimplemented(f"UnspecializedNNModuleVariable missing {name}")
 
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 64d65c67ecb97..c9605ee882868 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -330,8 +330,34 @@ def __init__(
     ):
         torch._C._log_api_usage_once("torch.distributed.fsdp")
         super().__init__()
-
         _init_ignored_module_states(self, module, ignored_modules)
+
+        # Add module annotations for Dynamo support
+        for submodule in module.modules():
+            if submodule not in self._ignored_modules:
+                """[note: Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
+
+                Dynamo doesn't get to see this instance (FullyShardedDataParallel) during tracing, since
+                it skips tracing all the torch.distributed.fsdp code.
+                 - Why? Running the FSDP code eagerly avoids lots of issues trying to trace complex hooks, and also
+                   gets us graph-breaks on FSDP module boundaries which we want anyway for comm ops.
+                 - However, we _also_ want dynamo to treat the wrapped module inside FSDP 'unspecially' (*),
+                   and we need a way to indicate to dynamo which modules are wrapped by FSDP.
+
+                (*) UnspecializedNNModules in dynamo are traced-through without any assumptions, and with thorough
+                guards.  NNModules otherwise are 'specialized', meaning there is less overhead due to assuming
+                their code is well-behaved.
+
+                One particular issue with specialized NNModules for FSDP is that the
+                views created for orig_params are captured into the compiled graph on the first iteration, and while
+                they are always going to point to the correct flatparameter and give correct results, their order
+                of creation influences the order of backward execution, preventing overlap of comm and computation
+                during backward.  We need to _use_ the new parameter views created on each forward iteration, in
+                order for backward to interleave hooks with compute per layer.  UnspecializedNNModule lets us achieve
+                this by capturing the module code more 'functionally' and passing parameters in as inputs each time.
+                """
+                submodule._is_fsdp_managed_module = True
+
         if auto_wrap_policy is not None:
             auto_wrap_kwargs = {
                 "module": module,

From 42fbc094cdf45bfee7cbd5f0827b4e68885c582d Mon Sep 17 00:00:00 2001
From: David Berard <dberard@meta.com>
Date: Tue, 29 Nov 2022 02:03:59 +0000
Subject: [PATCH 1353/1922] [nvfuser] avoid out of bounds error (#89584)

Summary: update OOB check (https://github.com/csarofeen/pytorch/pull/2218) and skip tests that OOM on internal machines.

Test Plan:
```
buck2 test mode/dev-nosan //caffe2/torch/csrc/jit/codegen/cuda/test:nvfuser
```

Differential Revision: D41502369

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89584
Approved by: https://github.com/jjsjann123
---
 torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp              | 6 ++++++
 torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp              | 6 ++++++
 torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp              | 3 +++
 .../csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp | 3 +++
 torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu            | 3 +++
 torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp         | 3 +++
 torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp    | 3 +++
 torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp     | 6 ++++++
 torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp          | 3 +++
 torch/csrc/jit/codegen/cuda/transform_view.cpp              | 2 +-
 10 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
index 2a14695b53ff2..42dfc93780bea 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
@@ -7177,6 +7177,9 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -9791,6 +9794,9 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
index d154b454281e1..8f9afb40c859a 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
@@ -2704,6 +2704,9 @@ TEST_F(NVFuserTest, FusionWelfordOp_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -6336,6 +6339,9 @@ TEST_F(NVFuserTest, FusionWelfordOuterPersistence_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
index 8d24cc3803747..0467680b83f58 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
@@ -5945,6 +5945,9 @@ TEST_F(NVFuserTest, AsyncCompilation_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
   auto fusion = fusion_ptr.get();
   FusionGuard fg(fusion);
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
index e827de56e56bd..3981b2c2b4497 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
@@ -1561,6 +1561,9 @@ TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) {
 // Channels-last batch norm with vectorization. Relies on re-entrant
 // GroupedGridReduction
 TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu b/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu
index 3e5968c3e0840..a1ff6562e6bda 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu
@@ -167,6 +167,9 @@ TEST_F(NVFuserTest, FusionRNGManualScheduleValidateWithCURand_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionRNGManualScheduleValidateWithCURand2_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "Fails accuracy on V100 32gb";
+#endif
   auto dtype = kFloat;
   std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
   auto fusion = fusion_ptr.get();
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
index d1f185011826e..3ee3bdd293835 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
@@ -2621,6 +2621,9 @@ TEST_F(NVFuserTest, FusionGather4_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionGather5_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
index c00d02c8a40dd..1ea40a136f8c7 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
@@ -2815,6 +2815,9 @@ TEST_F(NVFuserTest, FusionAmpereMatmulLargeLoad_CUDA) {
 
 // Matmul test for Turing MMA: across supported layouts
 TEST_F(NVFuserTest, FusionTuringMatmulLargeLoad_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   // Keep multiples of 8 to keep vectorizable.
   int M = 504, N = 136, K = 248;
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
index b10360f00315e..229369977343a 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
@@ -335,6 +335,9 @@ TEST_F(NVFuserTest, FusionScheduleTransposeMultipleOutput_CUDA) {
  * t1
  */
 TEST_F(NVFuserTest, FusionScheduleTransposeMultipleInputOutput_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -994,6 +997,9 @@ TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize3_CUDA) {
 
 // x->sin->transpose->cos->y
 TEST_F(NVFuserTest, FusionScheduleTranspose2DSmallInnerSize_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "OOM on V100 32gb";
+#endif
   std::array<std::vector<int64_t>, 2> shapes{
       std::vector<int64_t>{1024 * 1024 * 128, 2},
       std::vector<int64_t>{2, 1024 * 1024 * 128}};
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp
index 3892762298e14..9785e089052af 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp
@@ -1272,6 +1272,9 @@ TEST_F(NVFuserTest, FusionViewVectorize_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionExpandFlatten_CUDA) {
+#ifdef FBCODE_CAFFE2
+  GTEST_SKIP() << "Fails accuracy on V100 32gb";
+#endif
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
diff --git a/torch/csrc/jit/codegen/cuda/transform_view.cpp b/torch/csrc/jit/codegen/cuda/transform_view.cpp
index a543c6d0f79cf..c617f548649ec 100644
--- a/torch/csrc/jit/codegen/cuda/transform_view.cpp
+++ b/torch/csrc/jit/codegen/cuda/transform_view.cpp
@@ -509,7 +509,7 @@ class AnalyzeViewTransformation {
             "View is complete, but there's still some elements to distribute.");
       }
 
-      if ((new_view_index == new_view_.size() ||
+      if ((new_view_index + 1 >= new_view_.size() ||
            (new_view_[new_view_index + 1] != 1)) &&
           original_view_index + 1 < original_view_.size() &&
           original_view_[original_view_index + 1] == 1 &&

From c9c4c925a6d08d4b19eaf59f197db00059d9cc60 Mon Sep 17 00:00:00 2001
From: Luis Montero <luismontero@hotmail.fr>
Date: Tue, 29 Nov 2022 02:09:47 +0000
Subject: [PATCH 1354/1922] Add type hint to torch.norm and Tensor.norm
 (#89728)

Fixes #89727

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89728
Approved by: https://github.com/kit1980
---
 torch/_tensor.py    | 8 +++++++-
 torch/functional.py | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/torch/_tensor.py b/torch/_tensor.py
index bf94639e2dbb0..6c441a04e92b5 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -625,7 +625,13 @@ def __reversed__(self):
         else:
             return self.flip(0)
 
-    def norm(self, p="fro", dim=None, keepdim=False, dtype=None):
+    def norm(
+        self,
+        p: Optional[Union[float, str]] = "fro",
+        dim=None,
+        keepdim=False,
+        dtype=None,
+    ):
         r"""See :func:`torch.norm`"""
         if has_torch_function_unary(self):
             return handle_torch_function(
diff --git a/torch/functional.py b/torch/functional.py
index ee04cb250c2ce..7bf9c8f304188 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -1384,7 +1384,7 @@ def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa
         pass
 
 
-def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
+def norm(input, p: Optional[Union[float, str]] = "fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
     r"""Returns the matrix norm or vector norm of a given tensor.
 
     .. warning::

From f0aab3bca9350f8e45b4f924bd9730125c254218 Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Mon, 28 Nov 2022 14:32:47 -0800
Subject: [PATCH 1355/1922] fixes for inductor <> batch norm (#89603)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89603
Approved by: https://github.com/albanD
---
 functorch/_src/aot_autograd.py      |  7 ++--
 test/dynamo/test_repros.py          | 56 +++++++++++++++++++++++++++++
 test/functorch/test_aotdispatch.py  | 38 +++++++++++++++++---
 test/inductor/test_torchinductor.py | 20 ++++++++++-
 torch/_decomp/decompositions.py     |  3 ++
 torch/_inductor/decomposition.py    |  1 +
 6 files changed, 117 insertions(+), 8 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index 992648a432c84..bfd5602fb0ad1 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -1196,9 +1196,10 @@ def describe_input(i, aot_config):
 # the same storage, so long as they have separate TensorImpls.)
 def aot_dispatch_deduplicated_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
 
-    _fw_metadata, out, _num_aliasing_metadata_outs = run_functionalized_fw_and_collect_metadata(
-        flat_fn
-    )(*flat_args)
+    with enable_python_dispatcher():
+        _fw_metadata, out, _num_aliasing_metadata_outs = run_functionalized_fw_and_collect_metadata(
+            flat_fn
+        )(*flat_args)
 
     # pre-compute, so we can bail out quickly in the hotpath
     _num_outputs_aliased_to_inputs = len([
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 7bd258cbb3c8d..ec5ea4ac1fb55 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -2059,6 +2059,62 @@ def f(x):
         with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
             torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
 
+    @patch.object(functorch._src.config, "use_dynamic_shapes", True)
+    def test_batchnorm_e2e(self):
+        class Repro(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.bn = torch.nn.BatchNorm2d(
+                    64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
+                )
+                self.conv1 = torch.nn.Conv2d(
+                    64,
+                    64,
+                    kernel_size=(3, 3),
+                    stride=(1, 1),
+                    padding=(1, 1),
+                    bias=False,
+                )
+
+            def forward(self, x):
+                x1 = self.bn(x)
+                x2 = self.conv1(x1)
+                out = torch.nn.functional.relu(x2)
+                return (out,)
+
+        torch.manual_seed(1337)
+
+        m_ref = Repro()
+        m_test = deepcopy(m_ref)
+
+        @torch._dynamo.optimize("aot_inductor_debug")
+        def compiled_fn(x):
+            return m_test(x)
+
+        x_ref = torch.randn(2, 64, 32, 32, requires_grad=True)
+        x_test = x_ref.clone()
+
+        # Loop multiple times: each iteration the running_mean/var on batchnorm will update,
+        # which changes the output of the next iteration
+        for _ in range(3):
+            ref = m_ref(x_ref)
+            res = compiled_fn(x_test)
+
+            self.assertTrue(same(ref, res))
+
+            for r in ref:
+                if r.requires_grad:
+                    r.sum().backward()
+            for r in res:
+                if r.requires_grad:
+                    r.sum().backward()
+
+            for param_ref, param_test in zip(m_ref.parameters(), m_test.parameters()):
+                self.assertTrue(same(param_ref, param_test))
+            # Assert running_mean/var
+            for buffer_ref, buffer_test in zip(m_ref.buffers(), m_test.buffers()):
+                self.assertTrue(same(buffer_ref, buffer_test))
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index c3ac0f19a3f58..3a604281ca956 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -6,7 +6,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Union, Callable, List, Any
+from typing import Union, Callable, List, Any, Optional, Dict
 from unittest.mock import patch
 from torch.testing._internal.common_utils import TestCase, run_tests, IS_ARM64, IS_WINDOWS
 import torch
@@ -255,6 +255,7 @@ def verify_aot_autograd(
         *,
         test_mutation: bool = False,
         return_fw_graph: bool = False,
+        decompositions: Optional[Dict] = None,
     ):
         # Some tests pass in a callable for inp, to generate the inputs
         # (useful if we want to generate complicated aliasing inputs)
@@ -292,9 +293,11 @@ def verify_aot_autograd(
 
         fw_graph_cell = [None]
         if isinstance(f, nn.Module):
-            compiled_f = aot_module(f, fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell), bw_compiler=nop)
+            compiled_f = aot_module(
+                f, fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell), bw_compiler=nop, decompositions=decompositions)
         else:
-            compiled_f = aot_function(f, fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell), bw_compiler=nop)
+            compiled_f = aot_function(
+                f, fw_compiler=partial(extract_graph, graph_cell=fw_graph_cell), bw_compiler=nop, decompositions=decompositions)
         ref_out, ref_grad = _outs_and_grads(f, graph_inps, inp)
         test_out, test_grad = _outs_and_grads(compiled_f, graph_inps_copy, inp_copy)
         self.assertEqual(ref_grad, test_grad)
@@ -306,7 +309,10 @@ def verify_aot_autograd(
             if isinstance(ref_o, torch.Tensor):
                 self.assertEqual(ref_o.requires_grad, test_o.requires_grad)
                 self.assertEqual(ref_o.is_leaf, test_o.is_leaf)
-                self.assertEqual(ref_o._is_view(), test_o._is_view())
+                if ref_o.requires_grad:
+                    # _is_view() should probably unconditionally be the same,
+                    # but in practice I don't think this matters for tensors that don't require grad
+                    self.assertEqual(ref_o._is_view(), test_o._is_view())
                 self.assertEqual(ref_o, test_o)
                 if test_mutation:
                     # This tests that autograd meta is set properly on the output we can
@@ -426,6 +432,29 @@ def f(a, b):
 
         self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True)
 
+    def test_input_mutation_batchnorm(self):
+        def f(inpt, weight, bias, running_mean, running_var):
+            # This is additionally a good test, because the input tensors that we mutate
+            # are *also* saved for backwards.
+            # This tests that what we save for the backward is actually cloned inputs,
+            # and not the original inputs that got mutated.
+            return torch._native_batch_norm_legit(inpt, weight, bias, running_mean, running_var, True, 0.5, 1e-5)
+        inp = [
+            torch.ones(2, 5, 5, 5, requires_grad=True),
+            torch.ones(5, requires_grad=True),
+            torch.ones(5, requires_grad=True),
+            torch.ones(5),
+            torch.ones(5),
+        ]
+
+        from torch._decomp import get_decompositions
+        # This simulates what inductor does (running the fw + bw decompositions)
+        decompositions = get_decompositions([
+            torch.ops.aten._native_batch_norm_legit_functional,
+            torch.ops.aten.native_batch_norm_backward,
+        ])
+        self.verify_aot_autograd(f, inp, test_mutation=True, return_fw_graph=True, decompositions=decompositions)
+
     def test_input_output_view_simple(self):
         def f(a):
             return a.view(-1)
@@ -526,6 +555,7 @@ def forward(self, primals_1, primals_2, primals_3, primals_4):
     add_2 = torch.ops.aten.add.Tensor(primals_1, add);  primals_1 = None
     return [add, add_1, add_2, 2, 2, 1, 2, 0, 2, 3, 0]""")
 
+
     def test_input_data_and_metadata_mutation(self):
         def f(a):
             a.t_()
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index a95e4056284b6..209a5bd0a7a33 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -226,6 +226,14 @@ def gather_leaf_tensors(args, kwargs):
     )
 
 
+def clone_preserve_strides(x):
+    if not isinstance(x, torch.Tensor):
+        return x
+    buffer = torch.as_strided(x, (x.storage().size(),), (1,), 0).clone()
+    out = torch.as_strided(buffer, x.size(), x.stride(), x.storage_offset())
+    return out
+
+
 @patch.object(torch._inductor.config.triton, "cudagraphs", False)
 def check_model(
     self: TestCase,
@@ -246,7 +254,7 @@ def check_model(
     kwargs = kwargs or {}
     torch._dynamo.reset()
 
-    ref_inputs = example_inputs
+    ref_inputs = [clone_preserve_strides(x) for x in example_inputs]
     ref_kwargs = kwargs
     has_lowp_args = False
     original_lowp_dtype = torch.half
@@ -327,6 +335,16 @@ def run(*ex, **kwargs):
             equal_nan=True,
             exact_dtype=exact_dtype,
         )
+        # In case of input mutations, check that inputs are the same
+        self.assertEqual(
+            ref_inputs,
+            example_inputs,
+            atol=atol,
+            rtol=rtol,
+            equal_nan=True,
+            # our testing sometimes uses higher precision inputs for the reference
+            exact_dtype=False,
+        )
     else:
         for correct_val, actual_val in zip(correct_flat, actual_flat):
             if isinstance(correct_val, torch.Tensor):
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index b9c9225871362..9df8e3b00fbdd 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1571,6 +1571,9 @@ def nop_decomposition(x):
     return aten.alias(x)
 
 
+# Also register to the Autograd dispatch key, so this decomp can run above autograd.
+# native_batch_norm needs to decompose into other ops before autograd.
+@torch.ops.aten.cudnn_batch_norm.default.py_impl(DispatchKey.Autograd)
 @register_decomposition(aten.cudnn_batch_norm)
 def cudnn_batch_norm(
     input: Tensor,
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 6cddc0f489c55..4f094aabce91b 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -66,6 +66,7 @@
         aten.mv,
         aten.narrow,
         aten.native_batch_norm,
+        aten._native_batch_norm_legit_functional,
         aten.native_batch_norm_backward,
         aten.native_dropout_backward,
         aten.native_group_norm,

From 7a09f61b4d45b24487a83e6127e4d3ebe9293c6e Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Tue, 29 Nov 2022 03:02:06 +0000
Subject: [PATCH 1356/1922] Call _sdp_attention  in nn.functional.mha (#89470)

# Summary
Replaces the the inline block of code in nn.funcitonal.mha with `_scaled_dot_product_attention`. This function allows the fused kernels to be called if all the required input conditions are met.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89470
Approved by: https://github.com/cpuhrsch, https://github.com/mikekgfb
---
 .../ATen/native/transformers/attention.cpp    |  9 +++----
 .../ATen/native/transformers/cuda/sdp_utils.h | 26 +++++++++++++++++--
 c10/core/SymFloat.cpp                         | 10 +++++++
 c10/core/SymFloat.h                           |  3 +++
 test/onnx/test_models_onnxruntime.py          |  1 +
 torch/nn/functional.py                        | 21 ++++++++-------
 6 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 9c5be12ef24db..06ea49bb516c4 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -9,7 +9,6 @@
 #include <ATen/native/transformers/attention.h>
 #include <ATen/native/transformers/sdp_utils_cpp.h>
 
-
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/NativeFunctions.h>
 #else
@@ -741,10 +740,10 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
   }
     auto attn_mask = attn_mask_;
     // Naive, composite implementation defined here.
-    const auto embed_size = query_.size(-1);
 
     // Scale q,k before matmul for stability see https://tinyurl.com/sudb9s96 for math
-    const double scaling_factor = ::sqrt(::sqrt(static_cast<double>(embed_size)));
+    const auto embed_size = SymFloat(query_.sym_size(-1));
+    const auto scaling_factor = embed_size.sqrt().sqrt();
     const auto query = query_ / scaling_factor;
     if (is_causal) {
         TORCH_CHECK(!attn_mask.has_value(),
@@ -753,8 +752,8 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
                 "_scaled_dot_product_attention: Nested tensors for query / key are not supported when is_causal=True");
 
         // Replace attn_mask with causal mask; lower triangular elements take part in attention.
-        const auto L = query.size(-2), S = key.size(-2);
-        attn_mask = at::ones({L, S}, query.options().dtype(at::kBool)).tril();
+        const auto L = query.sym_size(-2), S = key.sym_size(-2);
+        attn_mask = at::ones_symint({L, S}, query.options().dtype(at::kBool)).tril();
     }
     if (attn_mask.has_value()) {
         TORCH_CHECK(!query.is_nested() && !key.is_nested(),
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 55e9aeb184a22..2b57ef6dd6f6c 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -40,7 +40,9 @@ inline bool check_tensor_dtype(
          allowed_dtypes.end()))) {
     TORCH_CHECK(
         !debug,
-        "Expected query, key and value to be of dtype float16 or bfloat16 but got Query dtype: ",
+        "Expected query, key and value to all be of dtype: {",
+        c10::Join(", ", allowed_dtypes), "}. Got ",
+        "Query dtype: ",
         params.query.dtype(),
         ", Key dtype: ",
         params.key.dtype(),
@@ -162,6 +164,25 @@ inline bool check_head_dim_size(sdp_params params, bool debug) {
   return true;
 }
 
+inline bool check_head_dim_size_mem_efficient(sdp_params params, bool debug) {
+  const int64_t query_size_last = params.query.size(-1);
+  if (!(query_size_last == params.key.size(-1) &&
+        query_size_last == params.value.size(-1) && query_size_last >= 8)) {
+    TORCH_CHECK(
+        !debug,
+        "Mem efficient attention requires last dimension of inputs to be >= 8.",
+        "Got Query.size(-1): ",
+        query_size_last,
+        ", Key.size(-1): ",
+        params.key.size(-1),
+        ", Value.size(-1): ",
+        params.value.size(-1),
+        " instead.");
+    return false;
+  }
+  return true;
+}
+
 inline bool check_runtime_disabled_flash(sdp_params params, bool debug) {
   // We check the global context to see if user has explicitly turned of flash
   // sdp kernels
@@ -259,13 +280,14 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       at::kHalf, at::kFloat, at::kBFloat16};
 
   //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 8> constraints{{
+  constexpr std::array<bool(*)(sdp_params, bool), 9> constraints{{
       check_gpu_sm50_or_greater,
       check_runtime_disabled_mem_efficient,
       check_requires_grad_and_nested,
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask,
+      check_head_dim_size_mem_efficient,
       check_for_seq_len_1_nested_tensor,
       check_for_non_zero_dropout}};
   for (auto& constraint : constraints) {
diff --git a/c10/core/SymFloat.cpp b/c10/core/SymFloat.cpp
index 511c50e3398ee..161313c777dda 100644
--- a/c10/core/SymFloat.cpp
+++ b/c10/core/SymFloat.cpp
@@ -1,6 +1,7 @@
 #include <c10/core/SymFloat.h>
 #include <c10/core/SymNodeImpl.h>
 #include <array>
+#include <cmath>
 #include <utility>
 
 namespace c10 {
@@ -70,6 +71,15 @@ std::ostream& operator<<(std::ostream& os, const SymFloat& s) {
   return os;
 }
 
+SymFloat SymFloat::sqrt() const {
+  if (!is_symbolic()) {
+    return SymFloat(std::sqrt(data_));
+  }
+  auto other = SymFloat(-0.5);
+  auto res = normalize_symfloats(*this, other);
+  return SymFloat(res[0]->pow(res[1]));
+}
+
 double SymFloat::guard_float(const char* file, int64_t line) const {
   if (!is_symbolic()) {
     return data_;
diff --git a/c10/core/SymFloat.h b/c10/core/SymFloat.h
index ff9e101e31afb..50512dc6fb206 100644
--- a/c10/core/SymFloat.h
+++ b/c10/core/SymFloat.h
@@ -40,6 +40,9 @@ class C10_API SymFloat {
   SymFloat operator*(const SymFloat&) const;
   SymFloat operator/(const SymFloat&) const;
 
+  // Need guidance on where to put this code
+  SymFloat sqrt() const;
+
   // Insert a guard for the float to be its concrete value, and then return
   // that value.  This operation always works, even if the float is symbolic,
   // so long as we know what the underlying value is. Don't blindly put this
diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py
index de1003ce449e0..4b7bdb58ae514 100644
--- a/test/onnx/test_models_onnxruntime.py
+++ b/test/onnx/test_models_onnxruntime.py
@@ -394,6 +394,7 @@ def forward(self, images, features: Mapping[str, torch.Tensor]):
         )
 
     @skipScriptTest()  # TODO: #75625
+    @skipIfUnsupportedMinOpsetVersion(20)
     def test_transformer_encoder(self):
         class MyModule(torch.nn.Module):
             def __init__(self, ninp, nhead, nhid, dropout, nlayers):
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index a1a102d786f16..5da45046332b8 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -5173,19 +5173,20 @@ def multi_head_attention_forward(
     # (deep breath) calculate attention and out projection
     #
 
-    B, Nt, E = q.shape
-    q_scaled = q / math.sqrt(E)
     if attn_mask is not None:
-        attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
-    else:
-        attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
-    attn_output_weights = softmax(attn_output_weights, dim=-1)
-    if dropout_p > 0.0:
-        attn_output_weights = dropout(attn_output_weights, p=dropout_p)
+        if attn_mask.size(0) == 1:
+            attn_mask = attn_mask.unsqueeze(0)
+        else:
+            attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
+
+    q = q.view(bsz, num_heads, tgt_len, head_dim)
+    k = k.view(bsz, num_heads, src_len, head_dim)
+    v = v.view(bsz, num_heads, src_len, head_dim)
 
-    attn_output = torch.bmm(attn_output_weights, v)
+    attn_output, attn_output_weights = _scaled_dot_product_attention(
+        q, k, v, attn_mask, dropout_p, need_weights, False)
+    attn_output = attn_output.transpose(1, 2).transpose(0, 1).contiguous().view(bsz * tgt_len, embed_dim)
 
-    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
     attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
     attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
 

From d024852b547845bad4993f03ad28f6f5ad982c7c Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 29 Nov 2022 03:15:16 +0000
Subject: [PATCH 1357/1922] [decomp] Fix native_batch_norm_backward dtype of
 dweight and dbias (#89740)

Discovered while debugging an accuracy issue for Inductor.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89740
Approved by: https://github.com/soumith, https://github.com/ngimel
---
 test/test_decomp.py             | 42 +++++++++++++++++++++++++++++++++
 torch/_decomp/decompositions.py |  8 +++++--
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/test/test_decomp.py b/test/test_decomp.py
index 73f8c7a126ea9..264b62069e6c0 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -21,6 +21,7 @@
     onlyNativeDeviceTypes,
     ops,
     instantiate_device_type_tests,
+    onlyCUDA,
 )
 from torch.testing._internal.common_methods_invocations import op_db
 from torch._dispatch.python import enable_python_dispatcher
@@ -577,6 +578,47 @@ def test_contiguous_log_softmax(self, device):
 
 instantiate_device_type_tests(DecompContiguousTests, globals())
 
+class DecompAmpTests(TestCase):
+    @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN")
+    @skipIfCrossRef
+    @onlyCUDA
+    def test_amp_batch_norm_backward(self):
+        device = "cuda"
+        grad_out = torch.randn((1, 2, 16, 16), dtype=torch.float16, device=device)
+        x = torch.randn((1, 2, 16, 16), dtype=torch.float16, device=device)
+        weight = torch.randn((2,), dtype=torch.float32, device=device)
+        rmean = torch.randn((2,), dtype=torch.float32, device=device)
+        rvar = torch.randn((2,), dtype=torch.float32, device=device)
+        mean = torch.randn((0,), dtype=torch.float32, device=device)
+
+        ref = torch.ops.aten.native_batch_norm_backward(
+            grad_out,
+            x,
+            weight,
+            rmean,
+            rvar,
+            mean,
+            mean,
+            False,
+            1e-05,
+            [True, True, True])
+        res = torch._decomp.decompositions.native_batch_norm_backward(
+            grad_out,
+            x,
+            weight,
+            rmean,
+            rvar,
+            mean,
+            mean,
+            False,
+            1e-05,
+            [True, True, True])
+        for (a, b) in zip(ref, res):
+            self.assertEqual(a.stride(), b.stride())
+            self.assertEqual(a.dtype, b.dtype)
+
+
+instantiate_device_type_tests(DecompAmpTests, globals())
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 9df8e3b00fbdd..1de7aabd703f8 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1627,6 +1627,10 @@ def native_batch_norm_backward(
     output_mask: List[bool],
 ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
     input_dtype = input.dtype
+    if weight is not None:
+        weight_dtype = weight.dtype
+    else:
+        weight_dtype = input_dtype
     computation_dtype = utils.get_computation_dtype(input.dtype)
     (
         grad_out_cast,
@@ -1704,8 +1708,8 @@ def native_batch_norm_backward(
 
     return (
         grad_input.to(input_dtype),
-        _maybe_cast(grad_weight, input_dtype),
-        _maybe_cast(grad_bias, input_dtype),
+        _maybe_cast(grad_weight, weight_dtype),
+        _maybe_cast(grad_bias, weight_dtype),
     )
 
 
From 88b14602e2bbc8716416219ced48c9528b689dde Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 29 Nov 2022 04:29:42 +0000
Subject: [PATCH 1358/1922] [dynamo] Minifier fixes for reproducing segfault
 (#89712)

Helped with minifying the segfault in https://github.com/pytorch/torchdynamo/issues/1928

Tests not really needed. It improves quality of life as segfault can fail anywhere (when CUDA_LAUNCH_BLOCKING is off)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89712
Approved by: https://github.com/mlazos, https://github.com/ngimel
---
 torch/_dynamo/debug_utils.py | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 29d830167b109..36dd15e047351 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -222,6 +222,12 @@ def dump_compiler_graph_state(gm, args, compiler_name):
 
 
 def save_graph_repro(fd, gm, args, compiler_name):
+    sync_line = ""
+    for arg in args:
+        if arg.is_cuda:
+            sync_line = "torch.cuda.synchronize() # Ensures that segfaults are surfaced"
+            break
+
     if "inductor" in compiler_name:
         fd.write(f"import {config.inductor_import}.overrides\n")
     fd.write(generate_compiler_repro_string(gm, args))
@@ -243,7 +249,8 @@ class AccuracyError(Exception):
             textwrap.dedent(
                 f"""
                 compiled = {COMPILER_REPRO_OPTIONS[compiler_name][1]}(mod, args)
-                compiled(args)
+                ref = compiled(args)
+                {sync_line}
                 """
             )
         )
@@ -296,27 +303,41 @@ def isolate_fails(fx_g, args, compiler_name: str, env=None, patch_code=None):
         stderr.seek(0)
         print(textwrap.indent(stdout.read().decode("utf-8"), prefix=">>  "))
         print(textwrap.indent(stderr.read().decode("utf-8"), prefix=">>  "))
+        # print(f"Isolated test failed - {file_name}")
         return True
     return False
 
 
 def inductor_fails(fx_g, args, check_str=None):
+    has_cuda = False
+    for arg in args:
+        if arg.is_cuda:
+            has_cuda = True
+            break
+
+    def sync():
+        if has_cuda:
+            # Ensures that segfaults are surfaced
+            torch.cuda.synchronize()
+
     compile_fx_inner = import_module(
         f"{config.inductor_import}.compile_fx"
     ).compile_fx_inner
 
-    import_module(f"{config.inductor_import}.config").triton.autotune = False
-
     try:
         result = fx_g(*args)
         assert isinstance(result, (tuple, list))
         assert not any([isinstance(x, (tuple, list)) for x in result])
     except Exception:
         return False
+    result = None
+
+    sync()
 
     try:
         compile_mod = compile_fx_inner(fx_g, args)
         compile_mod(args)
+        sync()
     except Exception as e:
         if check_str is not None and check_str not in repr(e):
             return False

From 867bdde817cf7893e374a35ddd56ef866d3172c6 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Tue, 29 Nov 2022 04:38:53 +0000
Subject: [PATCH 1359/1922] Move Dynamo docs back to core (#89769)

With contributions from @svekars and @malfet

Waiting for doc build job to complete
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89769
Approved by: https://github.com/soumith, https://github.com/malfet
---
 .../source/_static/img/dynamo/TorchDynamo.png | Bin 0 -> 349490 bytes
 docs/source/_static/img/dynamo/td_stack.png   | Bin 0 -> 308321 bytes
 .../img/dynamo/torchinductor_backend.png      | Bin 0 -> 122529 bytes
 docs/source/dynamo/custom-backends.rst        | 154 ++++
 docs/source/dynamo/deep-dive.rst              | 145 ++++
 docs/source/dynamo/faq.rst                    | 376 ++++++++++
 docs/source/dynamo/get-started.rst            | 181 +++++
 docs/source/dynamo/guards-overview.rst        | 513 ++++++++++++++
 docs/source/dynamo/index.rst                  |  44 ++
 docs/source/dynamo/installation.rst           |  83 +++
 docs/source/dynamo/troubleshooting.rst        | 665 ++++++++++++++++++
 docs/source/index.rst                         |   7 +
 12 files changed, 2168 insertions(+)
 create mode 100644 docs/source/_static/img/dynamo/TorchDynamo.png
 create mode 100644 docs/source/_static/img/dynamo/td_stack.png
 create mode 100644 docs/source/_static/img/dynamo/torchinductor_backend.png
 create mode 100644 docs/source/dynamo/custom-backends.rst
 create mode 100644 docs/source/dynamo/deep-dive.rst
 create mode 100644 docs/source/dynamo/faq.rst
 create mode 100644 docs/source/dynamo/get-started.rst
 create mode 100644 docs/source/dynamo/guards-overview.rst
 create mode 100644 docs/source/dynamo/index.rst
 create mode 100644 docs/source/dynamo/installation.rst
 create mode 100644 docs/source/dynamo/troubleshooting.rst

diff --git a/docs/source/_static/img/dynamo/TorchDynamo.png b/docs/source/_static/img/dynamo/TorchDynamo.png
new file mode 100644
index 0000000000000000000000000000000000000000..351689d80dc925ba5d1fbdc53c5f0ade693855e9
GIT binary patch
literal 349490
zcmZU*2{@JQ`aX<gh)_u?nJSfJ4jCg!rBX>KGb?2*WF}N9B(Dkyl}sV?RGFuQB=bC#
zA#-Jl_|ALp{XdTV+uwUQ-req9)_T_c-1l{z)AiifJbi5ax^3&IsHoN-S64kpMMW<`
zMMe9WfewFjx}C`b|3ho9a$Ji6|F|=l-NXOi>Tp!Y;k+%y!O8fBIn@;#TWj+J_NF(?
z&28*0Y#pZ76e;3G2gr+#+%PwGxN2*|t98}doJz~loL54OSK09juehkV7_X?9tfZLi
zK?z>XQ@on0T5-SMcvDgFQXN-S)^dJ6-tD4u-f2#0+G%4>+OA~d4b-|?n(SRk0}me7
z$TYWH>vVC5%RV1DDwUF=Z7H2S;SiqF(s0%CSc-O1b4zwYLf*N<s%-7^k-OM7GPLLR
z@_6Pe9igFlb=%dTzHD&kSY4gI;=H}$FxS&+GjVS=6$ZZB|NSp3=J%=6xBmNY{`G?7
zmqr5r_k|36<pO6#L;m-bCLw#nj{nblGNd18uif>3J_fI7+qI6e@&EVn8`nxO{_h7#
z-S-Lq-`{47P%Y8;-`~D>>Xngz%g8-d{>z2#TQ}aP6$nX8mG=9eYjn6}{>p;I<=);W
zCz5s2X^tK}%D~KA?lx86w_&H+t+M{#vvC{#?>j%bIsV6W4TH<0YuB#XO?DlUl#=3?
zliO}!V9@r?asTDZm;bmn{?B(-m(!v=mDkGBJK!L@gX(Z<>(2fAnOqABgK{7I&*kVG
zDbe_;dEL&A%D=*7rT6Ps6_5X20xPZ5r{T?`4l?#b%s<A)#I7~%4d6Po@#amf>YP0%
zPMokCs@?i+c(}|$vb3>rlP^7MCC|z<Jp(OG<}I$#L;06aW2v~$ol9gtb?TJejT<zf
zHEW~m<5%+kdr?hHT@QRsXug*2@{7Wy<jHm5z=7$RnUbAB@;(**TTDxDGqN|0-+lg^
zD_Q?NM}B_3#kFfb)n+~+A)C6oyVJc^J%tQDaGp_^WuJ8!r;cf`nDhEnyX#y3uF#s9
zpEV*eZ{Do4lG=Xr=1n_C$D_{9G6H9!0$v{RD{XDv;;tCY5%k}SqRwLy?z-=vOs6S>
zWWyrv?Cfj}yL<J+JB5UpO)CRRevdY8*s_J*%gZaRU8_51r(Oz2z2cG(3y&gmpqlB7
zqf|uTqepDzRaK_l?;IJ{t$X#ief^<Bhr;At#Gge(P*VwUFdaO2aC&||{?=MP*5Hf`
z@c<qr?!dsn+F=<k7H4^G8XB6u$w`(gSFTJu%P;)-LoRY<Wd*y$Oc7sp^qum5-|Tta
z)Yy}gow=+kFTV)Yq~zysmA2_ReD>@HVf}ZvZ{4CQFE2M5D8J9C;410k>${Jw_465}
zzqiZFRV}jhk_@G{1@7Iyzbr^;<=&q^u2wxCwwB#pAM^E|ev{&1EAj9bT7pX!mJECz
zM`IdvE?-$sx90V^efarxh{bOWE^w%<x%tWB^zYxQc6OrtJ9lo_y!lRlnO|9Z`=W9Y
z`+x7^Mg6gCqqxjWd%vxMDJdy?g@ja%jrqhSB(DEx+J~(|OHaR3K!8q6OpH^^g!aRS
z58OY5wuEckO)dQMVBdkxt@)Q9N?iGTSU7a>?|g4n`Tb3ak8QVDNN)4@I7#iO!mTLJ
z%_=M`>@*Q!Day$kTEiDwbD)r7Ys)GvE$!^=T#S9`>FLRN@EW_osb{yVL-zRAeDA3b
zJ^bG*$-4G4PT{<P?K1zxix;0OdTbMX@}_a>d{VQ8m6fWw`R)e~9@zaFI(50gbu%L)
zqx$jV%8rf)xwyFcCMN!VSsQ*jO8$PNoRdn5c7#VyMa9~Nh6X!(dtVI!Q-L0=+?%AN
z;K<0pFJCl2IZXKc^Up>u8M|VkBldOA4&r5Q32R1g%UeBv{=9E&jOnE&!+lkn0)yU(
zUSr8^FV7|NJ~?sr;zhr&U(bCT8XCy$^(<>nx`;2iVdF-cJ$v@ND<m8ETj7_nSqmE*
zwe#mUG1IN#+Ogwmm-0ssGrU9dwU>>fMv~h~Owuj5@BZt>)FatZ;p^_u7<&CZsJPI#
zCiB)Mr@VLXN^5GE$Zlh0C98~`?)UNIN55AAKO$@k?;p676Sh|3Sm3rht%hDZUcP*Z
z5B&J)Q)#H7=Y4Es&OkM)+$F`z^76jMGfEXriRbqn$=nowCc5&U-6z9@_H+2JP1j+z
zwVhJz$tZlOCD{M%n<}f}x0g*F9S?kbXi(S0OEj|e0<jgFT3aQ&Pls#JC2Fm^uj*4-
zS;<m!tGBwOgnA7<i>ihObCk=_*4ikA?Q?T;^Gmx)#r($is&Q0POY3&Xo-_PPN<7%f
zWdU0y*tocuSy_Yp2DUzXM31KPvapbw91%;){m&yKO9p59`unLBqgnAI9LnV8<|e!P
zuU+A)o4Qtl`bB#BhKp}6E4#YNmYaxa32w%146h!TT;6x=*s+DhMLu3$DpK)Y9}i*U
z;aPqES@ClIjopQ-D{caokE#ZZdka181qJz`PEoyxja{3VnAnzgedpJ&U&9!kIsET5
z#2gzWOZBZfWZ#Q+@0epjCW}A6Ylt>hS2LpU;AD*t9&k%Lcm8}j%F*M{(6X|!HD}~!
zXoo&WG95csRoEk`^5?NZs^7W9+bS<_;oo{GchzFN@q<5{i1ktB_g3Gf;^J~>-MV$y
zymc%!zH%}$M$>&I&I9-NOxJs@Y?9|rWi{ALx}K+PfZ9&HaC)YS#}rpC%E{bkC&R$R
zR2FhZ@qWJjut{IB_w@8MRcKAD)hQKM?-Q}5D}R?rg*}p}MU&T^Q;FS!BB^}&G8aET
zf8X#hJ$Dd2R!T-jrntKL{-T|o-HXF}$Y-gu{=O!<DX%~_3o9)6WSwgPH|oNp6R{ea
z5+Wk3i;Ig12?--Nbd_*Zl$4a1xn$Tj31}2QKV&a@eW1Kozwdj)F`kOV^YJelN2#d1
zmIry#uhetntcQ6-8>AkiPz0oGd#vXEqU>W;E30_C0`l@?)>-IX?<=NWp0M>gf(<}b
z+}*t+Ha3>^f@tRJ*R=Ht^IVzFntFP;`}_OD#Ld@nh!_N*SgcNdKA`fh#zJzSCFPQx
zgM+H7>h1R1zF6dQiK-`0GNB@F+rIt!Sj&N3d-e?e{9dEIWeSDcq~X<3a<@Me6kJsa
zCWmfbRXEgiezMznFi^Ty&w?s9nrY3NH5+(%g2E-Pn3Uh!F#Yo<O;FXE(3-#9!Yl&c
zJS}Z(DskHV4_dZ3{PEyop`V<b1RyNFMZJcNo&7c%&F$N_c|}FpxVgFQZr;3`l(he{
zzW&t`>VGZqmfx;fHTX6-&m7UVWBXg?_%_}T>!aA%2&0zCj^`$xOVl(lc=+?@jc?z+
z@rsFUK>gLpve-bWd$v|Mw6jwr=8V>b3nfOPqN~xej&GV<DOaxeiJ4aRm9Z+_4h`Kh
zH_=I-qRkZ3a4#T$IaxQ8j>mnPno=94OkH=g=~}k#qkiLYbElbsqnC?3cc9i#y=cA`
z92)BP<Hxnvad9Ttg`84WoOG<*0r&1PsH&-zwzua8<-S{0KXb;fpg`eOOw82^Yo(jt
zK5lTIZN4ZPuCeef<c#Reah;*R4h{~tV`6s7%gd7kPQ@ufVaA#8$5-K!yUFaw$|F_N
z!hT=1t;kCWn|ft7)vKhtyPNV>!6cm%o9o`ghj;z_{OYVOdbPG3xj%<J`DjU5@X4#J
ztS0_Pj9rEF7cN{d9{wEJ_TEW^ZvD2~R~zGuzrH?x@4*9~`1n0&(_LL%78`c*br+kY
z7x&v0Q!>MIlv+5M>C|^^#81~d$C`0V6el(YT25Jqy=o7+?w^fO;}aBI*VEI3Gjz+}
zpDB>%Utlx2@56kL+mymRC9{8bEYc6xt~Dmhqv$SE8T9Y%v9i3#DR)z7`S0I`uU~zJ
zhA2t9mr#&ytMHM=G+4R*b!==&L&JuWl9Io&cJa;EW=iOJM7R1<O)vkdcWM)PTw$Wj
z%VyM;W^Cu=R5d%+imnjO%>rz~z`~-YuFkMlYx2>vXAgi41R|QNs_2unQ%pWBC^B!{
zwCOtTp?|Ff?)6K}djkU|WV`I%y>_@h`k0CejZIe`b+9@uYNxZS>(^_OEWoz%OXHa`
zuD|!v(omO{mX0is#p65%;Jvl9w0Kc8H|;q+{+oO7+c#QUTie@r?(oUVZo>h;r6F+S
z=+U(j4KEukG|!!*xufEZV;eF0G9UHN{m*E8p35%*pf`>EWm664fdns(x23N^lTov<
zAfGmI9tW-9K^-OL?OT>m5yQ!>=-dX2+p7G^YHD;3cgWo}^jus^8i2L+0rqukfsdQ6
z9n=xx7@wH9`uvRg9{drN-lpdR;~f>_9L|CyY@cs84t=Vr3I69FAJeLb{R=aLAM5M=
zP+d{h$H&L}Mn=>JkKt2qKa!JAGorYm=6wGA8E09=$cVSJtc<{-bki!6Kfgy4l9G;G
zxUf0zvM~0~3+<#jPlNbIG*aq{ii(f5wW<aN-0mJ8{lmkpB0R-Ee*Bn@UR|ydlAFD=
z#o(}}re=I<6`)LTaBx~~ZdJBrYZ%oNlz04+fJPXV_u(%h^VNUtW`A6GuH+TPNTH*n
zGpTs68P)ClUiJZ8_Kwr8*r9vb*I|b?wY0oK&BIy1X*+iA+{S<Y`A1V%ml4R0hlgif
zOSdv=)!#p}d-v^A#s{oS`zdM0T~f+6E~AUPoSd2}vhq7#345*otx}aqIuPr_v@}tm
z&0BSp!J)c{T2()_7-is4e19>sk8BY#UY>tO_YiD<=gytJ8~L0{p6*2kTuq^1HPb;}
z%WLl6zyGnO<_KUR_Oqz@C-&(@_jep9o1+IGK75GZU|V+wVAV9rc2X@-3kQ`Qcek=$
zX*Gb{6jWw352JJ)(QSc7neAAg_$3a$2|F1s(mIa#vE0#4x|5kHvAn$8H#NnIuZXuN
zc&EL+{go2s*p4U-flav2KY#uteCF~8XVR6TqM}e%<HoQ4>+Y^LUMZ=a!v-NRZFKZL
z-jxqU6U+vc`BJV;1?UP5x}Bt?WZ09hVw}uu{uMyF<+xEVH21P?-P%5-o&MzMQxmiR
zO6o**tATH7>Xg2rAIkP!mL2N=eQIkpC|@rNal`<n*bmh%c0V2{xQOp5Rda$FeK^Cs
zhWiaYEv}wa)F)4#L_V#|57A2@kPpb@EYJ_GqyN*B{dso%G^+d?P^jb;6qcF~IVs^#
zT6MqMd@Xw$N+4;{T3U}!(I%_6f{yeL1}WV=mq<%ROG|4s`ZZR69VN#&vweDY_NbN?
zJE}}ZW@fl$=80I9Q>WHH71A}?#I%o;g^7-Xcw2nmzErQjV@DiMjh{Le#9e`77G`=Z
zwKeA5yKNwSVTbGmCN7d=`KhyWJ0Bn4BlTThEF|r&U-w281cEja51)K?kRrK_R5l>N
zT>=6FZRw^T>+0@Y7QRyWx!-PY+VPJ&@JhA~8@$oWxk#&y@>+2r6#C>-p%OG7(W_r~
z1#l~D+o={%{D|)e8=$fGVJaslCxC)m^}`asF3|aw2z_jqDssdLc^w~L29S={sO9>U
z^k&-mmscpZq}-o>b|$8wxUOzJZa~QgmtUG^&w8JaKi&T^cqg`CZSfi5(47hjJ7#BR
z0ga23QVSonwY7O}QcZihyET$JDLR_Rz+?8p$L}}ByWfY7zH|X)-?eL(QT0E&yPw@#
zEx1+`-QCsYx~yp=%K0cXl>W(yJ7HmAo|#@A?(Wn5emo{s4|lZX+VV)~He3L?#NA0q
zOjOp^<^T#L;3a@lj3!9&&)&E*(LJ_vM<W2d?^T#EZx#O*<+e3PKLq<2_%h9T#w23j
zo)holUcD+E*J~(jwQ&N;0`TZ5u8WWke)jB>hG0+Lb>Dqs*G|qSzIw%vyI2f5C!pba
zIQE<_*M59<(eh6w()<8F+El3`*i=4Ft%<0s#I2c`Z8h8}ExiR>(BkS<Wz^CeH*R2|
zs6o_PTC}rra=?R}IQt~SHNc+CP8h7Cr75qdFrMrp2Vb7lxXH=W&D$B$OFw^RQB+g}
z+gEmQkVGxUU9RQG$UfkllANsS>8VIaBA``Mb2Iw0ig2jJaJ+WXQ9ZqFf`WpNA|g6t
z-{}8KNhorXTHN)ut?i>(z}7%DGYZ87A4Ybtnb|HNiFCVumFKdK>z3xH{B(pU0<4x$
zQ-CZUKY7yMe9^F?^bXU~QKO2E*Vmnvo~}jv1$`$p^~8yFAXAS|?EWVtWTwqDXx9I)
zT~YSbruIgoIPIj0_fv)b`!BvdoYZVGGf?5Y_&s!~ZkMB#l~7Bvu5VVBWM)9R6zTXs
zfBLArJnZ4&;TOU!8_5E6k?`h?%FUZncv~^gWoP-tZy|u3CyK(x+w$%A%uV$Mp~b5n
zJ2u$6x*`pj<NEu32aln&ZBNz$SGos4D@C^^a`xqq2LOS!(Mmj9w{E3uVn-Ws{uX@f
zk9=5S>sYv30%|ap5NGO7#iFI<z~G>mRhww0AS2ZmoV3)Um2J2-=gIux5l;;UmlxPN
zzg-pZ;~*PZ;Rjk#Vsxv)!fa)&jZ$>hlO1cR(Be@xon2hcHgDOF*P$|?90GFkjP%|F
zssIfl$8gAAEoK8GjgQsUC1kUK_mT}p_}!`4b5|XcuZf5AZoF@|Fl|g40wGXXelfQ>
zD?n?I^^i&P+d3eK<i4AkoAdGW(*Q4+4Kbkb65RXZ1ur%eH5GvO*T%-{bK|;1al@LR
z1=KoT+_ka4)3)f2Du2nJ*|7~A925yAmd>{>Xv@$DJPHnx?c&1F_D#clDVZ)<y_|Un
z$_vyHF}uFQK%pj;0b5BOIQ?9v<adNtqL%u}lV#*S<D`9981QFZ_Re_Uk(y?juA}g2
zXdM&N5!`NENBDiMaE+GlH(nJvO#~g_4r-0@^Ts_~UKk9*y@p(+qOZRL=(xc)!&_H~
z0|(12J9=kORT+BM22M_f0)wEqH~|8!4j$YLrueAZOgpm)JqgA4thV;u-@hHmjTR|d
z7I$}dw;J-Ud4`o6_!@f>G<@&gy~U6BouB^QcxK+hC+@FjbxqCKja7aVxi@d#)V>Wb
z*zky3$||fP`(=#%sIp}d(WJmk<Bce+pb&kiVx@RvpJ>lNjHo!Z=~{+{LD03BAq~mU
z(s-XT+``D0a|&1mEEeJ%rPVH^nl`54qdNe_$gC@*yN_n#>aph}4<3A(o*uktVPOF*
z=lgG7dLq^&qZZ{GSQ&H;)xK|V(D(kPJp+mHQKgUxtXCF)C?}<+R<&nd=^q)nx3J(0
z{xml_m)33tFkBlZwnl<dOGSmU@+u*LL04DzKUfKJBvcT-kZSbTWNl9DH)W8Un-d*(
z?l5g3{EOiApdi-3Ee!btSx8xT;M>w-^YmxdE0&><$t`=bva%8-VP<A#?|}mqAhg#>
zS$O*NRd)8~;)$czNfiXlO9S!&ELTVfZA^UkPWHKi8^`I>r^60fa?sK!8`_>LiqjJ0
zV_%1h!Pn$vV*vYY6&VU8u#mn&AX>hIz9(6MH@)48mX5BpzMhqEO#E_=RlCpfvU}Xr
zVP$3N9GmV3PoHvpSR9Jzn{j6SvD@L2EsG|w!L_+15NZpm-(DT`fo{*j6B|drbamCr
zkGHn8@B@RC`LRx2T<YksG)yz12K=geE<)!^4JD-V)lq*<NJjt`pj`Y14y@-7{%BGf
zgYSlAOh`#NijOt$`s*05L#~A~UlSSL2OcitFv14%+ZAI(SsFYp5b_dO8q`y3V*<|x
z{|d_d%N_<<-LZq_`@JXUs;mtRdi33_W-rqD_7@uX<17J<-ohEJjgV%&^7(nm;;&(H
zl8Ibr>i=lzXzV%VmzuX~zMsocRWgS<)j0a1!GioirqWKrxj&1|_wAe6<@Y!1l3j0|
zj`!6|p(vu(=;`Tw>FDT)u?$jqDZ1nZ22WslNQiIXcIkmLUVj=Diaz&-eE1-5)s?pa
zRTd1U6g&HxJ+&buy?|{8*Ttv>(%8M{KLR=Lt^WJFuV3%FoM%UkE(Xj9t%Fh(ierjy
zmSf#{H`;4uE7|^0u0QrM?~r8&$Chl_O`E%AVEEg&;(seiI7N|k5}nwms9b$1xjm%-
z4AbZ*{n*xkIaD?xaVy_L4GBJ8zg?=Wyh|G!RLZ7{oc5yCzf%jtN~>4xTR*IzUk(Dd
z5r+Hd`EzFBQ2#1!H)h+yMFAk_s~dKK(f{<a5)K1r7vj!IF%Qa4&&(|Q^705u$31A-
zCLpUa&eOc;RJS^lDhih-<#05Kg6*!jT3B>9%7Q2DV7P`t)JT-HEq9J>??dc594;Zf
ztb3oR{zm6K`EETKdxhfG@+rGR7Us@|0QaHrD}CeRwad|E{fBwk@ME+v-A|}Chma0c
zu=;JRG=%?{ygW{<14XxK{rdGXE<bk>L=Sa_PgwY^VRBbt=+`vskV@;VPG}0K(VXID
z45y}l??iu~3?H@qEr06d$xNBIkiD%2N}8&ws>T<DgoRhS*LHpS^eN4{Q|j^K$Gi~V
z$68W+b93iC+E*c=uc2c;eE2ZcUFNMmWSL;LV7r5ELov&`KOebVxOwwtzJ2=`A;O}{
zO^wy@>7(+XN<bWbsks+tCxEDoEiK|aXvLrvNA&geqo1aW2n$z2i`g62{t)&;iP2${
zY(g4m2CGcx#@js#bR%y#IF$F5_-@#??ZNfI$`atj&Kzss_wVIYjvT>T^9HN)q8>F^
zghg*!cV}VBYt<jhwGm`Joe$1pDk>@o%c3p`Kqx4dP&wD+Ddg@BseU&rhY~}i4I;wh
z+ZE$ezce+`QehJ<0Bni5{<aX(&3Ij36=Gfg-pSw>G@9o4&L%_bU1RVG^%Ey(?A%^f
z2k$(FTNVJ(C?dlE^1`E4Q~Thxmb=e(p2^Z1{PE+#@eqNM-d-LYgwo2u?ct20ckj}T
zcja%oEWE|C^=)>wpOB@l97qDpt4`l0oMJ*6%+1Y-%5P@&x{gwzU@>uW&umt0@<FG$
zoDq=@dTUS&G53XO7T<@sHSnc?c1lq$MJctMq$Nm8AKA-pvbyq@l$Fe|!!Zqg9od%7
zzv`sZ9mlkR**k`UjwWh#41SG9kH+tVC7XI4b8&Ggk`v~>x$%DaxWfBT-VY^4#jHxp
zw>*~S!p)`a@-GK$m;Nk3vpnJJT9j+54T?*^Eeb#GE;DpT93d>jQq&Sx{Oi|7OLG&N
zd)bM*VBj`!kP<8JU!j5>gl*?KXf7`*5W;vOcH@nFE>u6TNK{{ZiC0Ir{#7Ss8Px#q
zdrq`b?P|7OR#uki@H5T5s=(yp;^L1`e;_+i<jwu9UTVBMfV=+j^XHsJBT7VNrP_FV
zCVgkFEkRP)B2PsOA8wVP5UK6>@#A@pV;tMIZ3_#~hk^!2z<r_LFVA6gqk-$_ZjEP>
zj93W|ZZp2uBH&nn_>|h-mh%~V094YiH47$a2}1brT~ye;d$+o}dhzn&FKa=j4Dd!U
zTK3v;zI2@coX^bq<FV%m$O2W@fiw--;g7`_QMfxB<}(u}beJeN5B8Otq_c5w+%>Ox
zikDGum$EYQIf(PQFce`6Y4PFilTW+s<6faOLRWojPym9y4i|*W%FEB^!`U3H3_JoO
zGW4XIZq0J=iC8E~lXGRf;0nMbqJX#H;WDzaH^+v(>c8Ds^kAZN{Jb4<kN}C@+SI&K
zQY}LJf7yVnN?ccRU$DD<_pS-VwuSM`dLjbFG%$oz-yUmCHHBHymTkEOr>J%=AKT%{
z{)@cq>(EKP$x-a?E~pn!Zn`$BVBVH<pfFG3$m~2jrz>ZduFAH&9kt`tGmt)Jps3LM
zvFuRM)eUdIF6y=7j(*&8-cB9664Gtn^+7ryrm$b_P})K8i8K$#GvPg*M(4b(m6g?a
zV?cDzO}JLi<(ydX1*mL-ByLW2-5-wj;#O8xRw!)2{xE{njuYtvqH^PgI4*Y@Wj_C6
zh6_&?lv3g?!NUYt{c<fEKiF>&#>xJ8_3D+V%TF_naPjX^Yf5g*2LNNkHEd{Wy9e<3
z`t@sG5KoX3^zDh}pMyAUI4~%C96T&6EVV_DA*`g{Uuso>Kmb><ur+6PzSK-yY>+-t
z>jBIo&cI5~bT(DUoXN=n`KwHKRM^?s`-rv#mLKIYr`xH{J^TN%05CGvDy=N+Ja_IK
zuzJ{wIK7lgbn>&8F9%$Sa-s8DtT7Y?V2;p9JNnTVMr(MoM&HfiRfEbVFnpa1Ge)q`
zM^B&9_06e-efo5Qean`eQc`nKxr+wfE_Ie}J)^9x@|Vhne-YFTSL<2bQE1Jil))RV
z0&018@Zu5d(`D}Vu1$BdllNNc`bYNvTGV;xun#<?CDpKg`83o>R>cJ>d|;TUu}Tn6
zn(*bkTSS(D-3=J~sOGz-;1i%@EH71beHFaP4O_R~$F4E?^mreK*BB~bTcL;CvuDpl
z!Hd##gff$NP-#?uj$)=e#Kp4SA=!ebcDznJvxR0RM+lz#wd{^@duRa@GrQy?>eS#Q
zLr)U5_`;iPQ1EPP!1h2jH~_>tasE5oS|?DF3vdU+>IH0BU`5jT6qm*&ph%_FOa<aN
zL3aE6{E#n}2dBeis5Ttm?rQFyu0WzWx&&Q(lB+ZY(8KLKrvk(Rh3*lk6dD%}zs#XS
zCGG87X}qbty}co4tHBgoa9{CY;!4sI6xe<`shM9~Jj2jzI7dn3om(ogdEwyLjkk$G
z`nY6vn4Xrl@5x1j(V2sKNzDV;y_M*3!O_vIs2gDL_=GSG0rDZRNa$I(iXk7)4ET!x
zg}y3tV+^f1C`Z?!9X$V9{MG@%y?aIX2hw*%y-tC=X=G#+1P_CrrSC^`Qtb>shyM)N
zm@!nPf{nG0tIcpOR3OEIy`vo)LGgyeBO)TQu(Y%j>Ues3`uydL1^9~<MJux+=>3QJ
zgLT#?pnwpL2W_M*Ow4rp>q!w~_$89Y=B7v>XzJ;qBt0nXVZT8isOF1~Mp!*4B?{wV
zZSAKOZAI~IR#K>aa93++1t%cd4h+!XQ=Q%1BzUY-)<8H0rz#%)MLfyj8zL6_^BHc9
zS__8OSm)GLM@}-A)8r<#Gu%;uZH`;z18Wa@e3+E=HqWIA>G6(iCb)#K+Ds<50#AS>
zilA`$v>LdrKYR9U*7O~DE$}$#alcmG)>CM16%{GL5vZn>@a}BglBnf+rYKh$?y10Z
zOYd8k914_wokI6R(4q0=2+r?116`89$`TL<zzMj5yTM}rJgr+R5%KJqkid$Yc*Dox
zUo$|DDgq(KC_fg~)@!NQIXDI)oq9@ub>Bq>H#9Zf2B4%Pga)*=p`+tkdJ0-Se)xkK
zAj|F*6fCt&Ew~$dGW40buT5vpdK60N7NpMNPT7iiN5Q=rZH!k+yO4tRoaeqM{`1!_
zlqZJB(^a&G7u*$LcJ+hY?K>aGyKC3Cs67hkkZI;M93u|0#M8h%BgW&TrC0sjl&&qE
zMADl#kDffalaR0%%u#tS`zu5>0BP9ntD-z=uepRC=nNPNRpB-LJD)`mdJfi$*RNln
zmuH*;nn7l!xV5!4S?hs8Y8AfabSaaF=q8-gZM`1zJMbmLOI*SGz-5@dyjF#6YjE*1
zD@#)`ppj_yD46^9?SrLPZW^S}GGY3oHe5m{&yEiu8aE3#mS}$+9eUulKrxrb<q&$P
zjg)JB_4v`bM0cYS4MY?Q+~$e4zE?!V4{j@5Sc6o0zcOPQzPE4R64?jzlazJa9(k1Y
z*o&EY7X&vGF9gQ>lhHfl=w0wJ2Ggs!Ge>K)=L*a=uUq>$T8R|_fWA2~MWbS>g@uK~
z5WHa$sJvtY65A^*tcK*luZ9(w{u}wu^W9EvTuldX>z*FC^yu;9+W;f}y!xGUg6!*v
z$e)(RhFi7J=gUgqWs+B8$4q<UKRWaLuvtTe^;OWc;^$w$N#&Q3$*Y-v@sK0@Uc~@+
zbOi9w>Qe7&Sn^}3f(D+8@r}g!Aj%|`6-FXK(v71qzKpM2p+^WIFE5YeETH9aN?NcH
zsmZqIgUv-Q0#4+nKKLdlC!gN(4V(l*;|qKlPyyH|Rp`0tR_z(W7hm$u_redK!7c}+
z2sp&zDZ;er&wlVE#D4IKwV;yTIzp3Q>#xgj0sWC58LlCPrcnn-kYY@?e>%&9%rLAj
zR4K8su~Ff4w{>=&zx9*aG_##DuZvst=8Yf}wfTN+NBt$YS0zS9KH=7YEuxk!LWCUd
zITKYr+5H}JRh<yG&DE<VpjDcnqK;!N#QT$KAGiaPF!r3--viV&M`;vLE6g@<S;<K+
z>3J0mIkx`gI+LH5XY_B`p|sFr$?nhZtAu0t`<y&^l6YwU%AC23v|kp@^k_f%;z|ix
z=HXxMSEBFe2vx#O4d;GbU0uB}U$hzsi8rBUlU;Ljv-8qKF44Fc?vx6f_W^)H5;Ja$
zJH;tu#|ta0mNx0FVG+w6737}o0dXcNt=38R^6uQZ#@N`HVg34}$mY<%WyX?h*%E57
zT|GN1s}E_85;)suFI-rgsP&1Kr-OzU2;$%AtfO$P&|ZT8RDa7X2!vEmw_LLAhs=aL
z07<mK_(j%7?8di`+y%o*e}8WR2^aHPS!P+fox@6*qlD{11je2&L+7}b{cQA5%hUWM
zu=~bQA#MvOcwk7PuD?g<JLdWK?5Rq=eLp__O3<DG^t}IA7~7?7cn=-g3Vnj(yWXs8
zLHo*E^c4?}g7(GNjic|H_ROEAXXRF<P~vpG&=WUeJ^H~nC+5Z$VRc>~`ND^4g9b+A
zBFI%Rp0@?k>7}#`Y%_v?4FCq(n^}K}Ny<?*zOlqY@}^vZg(xSvS?;TU=b`o;Idv+q
zVs3QSRbcRKVj?3dIv`r!&53OUUZCXkRqu%|hvgu<Ub5$%BNO3&mxYf3LutRgd^Nb@
zP2*@9@g>2kAhwK$dEs4=)bW9H-ez@|vNSJ*qenpYiQ}D}oybsazAPNt9*^@@S62ts
zZ^vn2?Cj-V(W~X~WqM*X?qH)4NduxFhwjjhtJs<-(P=i_vV>ls|LeaSa?HMx3CgOS
zR1kms7i7mFZY_QNKnRKn%|BSCl;1l|Ro-2{UDUcGOSs!tb{!W4k$IVjm9BhyNQa{D
zD_z{&OwrQj*%RvFYOJozD_wr)zz98=W9vWQ*7(69;`+eN)=<G}=EEH#xWH#E({$7M
zGmcl?L!Lgp>x8oV*1L8N&MNE*$YH?c>q%zMmu~&5Vn#k~IFDS1ZqT~TPfq+^f_99Q
zQvc8p9q5{&p&`Uqey|C7?x6e}Ep6?Ak%kxwQzdaAz(9;31J2K8Kl*$~MkZ`l(h|0V
zqoZR{SetFMa-~T+`crSea|Hq@HK8Izh@S3t>g|U<0{6uD#A8Ez8FU5PqQ6pcrz1~L
zpZck(nbfRtX-(gpi$iL?9KPI*{H}5P3Ez7ju@h0vae!<<kbeiBwO&9r1s)&#9vlGp
z`M{?|BBln=!kKv#xgw;jOHsqFFE3mt$uN{?n1(oZ6SGAmiTYbwYD?{7^<4#eZyMpL
zr+)ak?uNa6@Z-m{k*ATJ@c+9sZwj9RB1O?yx#{uajO64s#-F)VSNVHxP7P=Gnx!Ql
zhyQ_`BFHSbEk#5Lpro}viE_uyA;NM*#C9}Z0&WHdN*@QTV_+cof{vN<miHQZ64^mw
zWJLQ`dQJ`pPL?VRdl<{fbcZ|6jE6pXQi6CP#DE{y_kGSOWVLQjfMJJpr3zelxF+uv
zWQyP<D*hSS3BtGhXDoFA3UdnGN!JdEMkCR~yEeW=!-m)b6FK+eYY0s6#70_fuF2(b
zoismJ5D$1z_wWePk}`5~PVbb^Sn++%Ub^HD33Z70gV2ny<>8b<F;4raK4>T7jI2kU
zfCK=d(C0gAnK+2(r0;ncw@dx>=^Q<d2E8*X$b=z8f-J!c_`tvm9K#ORuZunK_kaI;
zz7u-1*2RmaNXc+?w(5RC=&K)nohVa+`}Z4*auO49i=q1l)JLaoRxq)JxlUVBo&lF`
z?sq2UL`XG$NSKW1rm?a86<Z8%=jP@%n;Jt$SU^4sLS_HxC?oQP$(QrL3Q;W~mq?;L
z9NsAD)%B+$q>SsM6i9&UXO5!!M14h$hl2>DQ6fGh>tz#Xxx3IN3L-Kxhum-vQn<&^
ziLt%3ljzVS36H=V5FT;B=X;BQV5ebk$0q*@LVro#BL6Z;hzjYXQ6bOikn7`uxR^8Q
zOz@*eM(JtPBMaQWvH$s`q;gI!i7P5{!y~UiO|E+@4ZT3&N06m>c$#Iau)kt8j?0vG
z<wzvZCgHI#TKZr}Xjg5EJS+HwIAB+=?t{4pDt8xeU<z!y!zKvc3aZJY@bLDSr63R)
zB+NXQXHWvQjNqHW$a-)}<E=vN-ZhPLq}{)GQT(G72pr88ak~-?t%u0Gl04VF4Lc!?
z+RR9hy_}RJ!9@AF=nkORo!k25V#lYPAG0K}0AkkG-rk-2g9S1TDv?0Q{n6bWJ0>I&
zfU0k+7~9$oc>s&lzFCi#l{7bJH+s}T(SUN1+vCavae`PZ*Z=-;L`Nt!5+(l1RMDzp
zkIV3m+UJLU6>9hLm2*Iz&NLb-pxXfemXVQBd!jZ&M+kwIj`D-O9S5gAi@*OH2|*C&
zf-IB%`<u!!4My<%bc#F`!0;zWoh-BUC_Uu}T)|X>@n}A2kE8~oQRPs(mM+{0%vnk>
zNWG#*fk1PX)tva@l|z$zTQW{fW=kH^)MUd&`Xwg|#U($-@&e3}q}yrBKe<mn9#>aS
z3(I3h@(b$46<qTsh|!R<hz)}CjI6I2M{Pb1Dxleh?c14H{CssYDKB5Y{`S@zjA#v#
z$-xDqeSK8eGXi;|D7Yy9B%=27TQy7w?f8(e;t8u5RPQZU>Z2yIQhEF<NV)}Rp1Lc_
zUD0h$h`8<Y^Ii5%b>A|E1}glu_tdz_wynajegx8p&j1(ie($tXR<`5Sp2%$ROsC16
zkcgqMLv(>sgu+jD0`w`;<Z8YX+xYzX^SM51EfA52B#GnbJlB4OXXJr7Vn0B!nT+Gl
zo^6x=`@50A4um%le-1agaOB7uAah<0e^@`^;!mx4AlXIEt?9RuSpZzMIct-jk@3}J
z<KO(@C|^InW1;lp@J2~>k@3vGEDXzNW+Jy2ED1i<2JoJZ_itB+S=-p;bj0V%0iJ*&
z`jf&57>A6nSm)g5PmWU3(xy20M85<h3mcR`^ZWk&ds~k6CBJY7^p0MyRR!QTzX)mD
zo31Q!hYs-}%?ueA<^{Zl*rPnku=#;wQACO4g0_y*o>tNO89^dY2B?FYd&`k@PdBMp
zCvf;_RMbO}!UY;+x_``N%}OhHF3UuD{<#|QyzLzit3Qrt`I`$#@0yu%PQeH7@m#XL
zT;$o!zaV8&&LCu1$c1|DgJcCliG+Ihqm2S-U->K&dD>6GZLSZ+EQ^7XGZoJ0z<s;O
zbyl!&`R7T9O9Yj}H32CC^nz-llVhbD+SA(Ab@{`u&vG(Olb7Se<E-a-p)Nri0I-Gg
z)G*H7)V?MoH&>ceHi7950TGeZW6B?ZSe8*$DN`hFALTZADAVH-SUT}k(1!PGCy%71
zeh2B8hPFKjR+oSzjGYpE6z1(x%m_rmj{ygSp(obqg#HWKqfr!zs@flXj62=cwmaR_
zuVM_^n>DP6Tc8a`c-cgik{U;etvvd5Hv$);Ub!&2AhSY2CnO1UME+SC?>81G-Kf6A
z?HP{n^<4vV9H>YV9fF?Z`-{ETOl>=CfAc1#@|)AjdPIkpGXAnY!^ua|bJs>j;z7YZ
z0K&q)#`U5?kc2)0=s1#nzZT~o^EI~Y*XIvDjI@G4h}Z>F*y9L0upsv=9M~B7{CV-<
zOxSAflUGx$lH1ldK3z-IxF>Tz+IJcf3Q#@=5VkEr8HUrf4u~G2d_Vri+s}_4YE^5$
zTP(gLs29R}%oi`&96x;T;6&zLw{5kOoXnuiP*?uN1H;3s-RpoXA%-2*(a{OYZM2Y7
z0gD8jMXm`u7G)Ik5QtUbn(^bbqh3x{_E}x*KK2Km_H|?HK$l=D4%UbinMI8T@hE++
z6`6CG`~=$zI3(KrCnL)aSz4rPgf8VUBS;_%`9!F&w`EN@b`T(h#--Str5uEiQAHZx
zN7N6K<T4ns)ao;!qBh}(2<wa%!?%0FH%*)h6Qw4B7LuB#(5%7)BiVX)_YckYrBMi+
zdR*uc@5^B@tE{RTgk+^`WtFTOe!s0vhZI8K=wdiFVa(g`+VOEK5|6<S7Ag3>5&7;J
zBvo||FuF)v?{@FF!qnc`xf8`7?4caOblQ#YCz3YpL^&p21OWh<MofNXC7`(|qW;LU
zA>mzuln|&97D~phj|$c@2-5=?tFN#ONcmBAJG?K|QwuoETMWm;RdVkKiA&WMk}z89
zuFZXoWF4id{WSYPm7ic>;C95GaIgsSWUO|pe$LK7i=_QARaII7a+Q7Q2pU630XGD(
zYquAy<CqUZou>@z+1pE;j+CuPer}KCD+wM>OoX3Ha;j;`5J3KyIGqTR*Ck6?o!8c;
zoQ+F=UqE{%TG9GoS~A#(ZLcDgQO~g_C(LK_Mg+No{P5jiSn^6rvICfxB2FB}n2L{o
znU?m2KkVuO88U4E^JM$K7TySoHnA*#9?OB7jI)=gu|KC5hofQr<u#g02n!RdRRv!I
zdp-R4d+3?O9><gf#rqia#KEedGJvpO^-5Lvs9J*{WTb!weib%@9D5n=Wfw0vDc320
z63OGaK<&Rp#M8njocJ0V8s!r8@{K9trPfcVp(NFQ-Yh34M?zm-E5D-QIoajCfA3pi
z0upD^lyEjo!A(j#S;re%`A`8APhZWwSs&GNp9Xg~TwlUY=u=u<WF@`*vz%|qEePW7
zzZ;HWp^0}9KRkn|O8U(Sy*Ot5?P#Ekn*?;ad^`RWtRTR_G0(eY%NCe2iH$|l_Cp#P
z5mM{pPe+<?7?@qTawMtwW>Rm=>G(3TIynqJ6(;@Ir^Y@3);%8c`p6Onhrj5~x*Bqn
z!wO>5`JwDg%kxO9t}NDjnfxAU(1?4AFew@UNZs^UYSF6~FK)S{Ub=MY`r=P>600_J
zpSg21mIg->utQfU2ixOcN$BeXs%YZR*1qc$C`26WAqhRe21X4q5sw2(^hT%XI+&-y
zot)#TFL|}eiI4)Q!2HPWVtF;RPl2-nXx!Zs<^1YWY*osvk)_hhu1=(|B#yUh(|t&B
zABg;$2WxaHaWx0AdceGg+zJnns3+82sD?4mqi%M3`Y-}8l=`SZ#5XV|Rh!&od=U)-
z^^wT1zzQS?j2Kt=mHkMULPsR42iPjf?eZ-cJ16U^AqC@udcU=Eka%kuKUDqs^V>_N
zk{G253*#hV2!uKbd~ZCr8vzEa##+VY=_5(hkW78iR4E$rkjcoKBER0PH{Pzzat`$u
zA-?})%jV`LwTn9~B*py?o)XxEfEk3|eR&;B9hDo;B|c^}$j#o2c2N8LNN45+ukB~D
zQuluC7SBZ6fkX)MFLQl>miXzQq~eCdb}|U9>jZ6i2oDzr<3&cs#?jGH89*eAP9SPQ
z4<!b)0Ap;Bl^{2jb#}flYeh)dV(xiI^jjDGv**wIK7P!eIg_Ejs}wnGz9%ORaBw(Y
zzzHXrO2`+KhXaTYa7x>7aeFP#xb@ihmQVrkV;1I)N{PH_micj6Sy?1kx1#Hy2F+*j
zs;6l0gg*g7e%Qo>|HR3Y2ZjQ`HsDT7f7}^l1e1Yi)sTUvu71GO)3<(e`<Nq^CI9p=
zQ1Dg*;a8$cN_^?VspHNimLsanwq?r|N6|b?RB$P{Hd!48zfN;syzv3@>b?8-)14Ep
zJ30Bmue!_1^U!|ybNH##XbhBBVyGv;E&LGw2-+q-7fE1CzE9bYoHcAU%?lUqAm53M
zlMzf33Tq31M(wVjk-if!n&F{r3zGi{afTXv7Dp0lO_hA!RBuu5xb1iR3K9SaA86$M
zQPA7}QX0;kfaX^yOdYR2*e)I3F}=E`*gD$Q&FEU@)kXnacUB{F?pwrnD~VeV%Ex!)
zE*X3Q8#a2l9)bfBTJgTQKx}#nsc@8Vg^l|&;$cF9_maUUNX1S%|KxdyN=Qm(UHWU}
zD7ym{*gX5#OM_Htw}mlbl%m^yeiF(f&<P=)whee#@i1H~`r9Jj1xYMCVCvzg<JTm&
zaVffU!ZWIXf_C8TL1;K+vPIP5BPDYNm1PhyU<AmLD`up~VC4=uj+3Fn74TxNPxmP!
zG6(-o8PO2fkjTrHfFjg=p1@pRrhV4LEF$WpuWS)9$QsF~)uFV-$Hu<Iegy51YP~Y$
z`M4TE?W#S|?ml4KSl)LhLq4PL^LQ<b5!)iu;|HPw*nxx4v$96c<BE_xr$=fQ6E_$(
zk}zx>o2{Kq@es`{m>VvtHY-IaVH#-`Xv%42gKt5-iTZ()Obf9!sW${dzI2cC8j@F*
zbMp1^nXcS!YqY#DqtS9|$?Xfg`(pPZWF}KC<q|K$7=n(r;j!%~tjn0ssDQZ`?$Hjb
z6JCdn`7C6j@>f4gv#vI#Sv2tbAMb^izaEMY#z5LKudo`peBbO}L5=)6QsZNij}AfH
zCNLG;a~;4`t!EFqFhDrd7U6QdGXVlGkat&$2@XGOkV#oj&_<@l7~s{*xVXyk4`%5)
zIt8u=0Vj<PZuhIB5LxcXU1ntrMj(In08MEC_W+UPqvy|u)J4{?N~|;JV1e#UfFl6H
z!sPp@;_o*|h>`q_r>AH4iifp_{Ztc&=D0P&wEV&M66G7E=HN&oo$d$K1{)oj5eTX3
zB)1F~dwOmTQz^%CeUwmpz=**ww%;6YYcO8D3IiNk9&-KM(Co>Q-zGCpVG>tZkfnX^
zz=}yuP#he76F=P_)&~SwxaIEwgOf3gh0fj;%Jm&ZP!Gi4v*a8I7-#LEm*^~d`u4%Y
zhn3)_z>;JPlcL;@-T>+R7LrOj0jyv;(};C5VVH)_Zpl*u-Nt=+rYcP4x=HBDTL5CD
z`;mz$``tJ9`*%5ZH)08IghOuwh(dEKK{FuYAEa2IDC3?FF82ZhC(hQ}_s!)Q02edS
zbbJ!9m*I+Y$N&fjZkRgM*`~ek$qBf?e86bvt|V;$wFt~U>_+T=h7!SL2sQ&jD}|hf
z*)_xQO_=u;i^H5I2*a0AhyS?g7!Cr&fPao5X!NsxJ38>P|Kos0=!Zgm#4t^|Q3;Jk
zv|>SLXB<XOP@j+wBz|75O?L)oJ-k#<HgF>&BF13;WTwl$9(mN#AMWAVmOTZve7~Fi
z$`;A4`<%1(o7=`(<ejlaDlnB`m6PhaldUVq0+s+LPG~vS7IAMsn3HH*P{RlyhIC7&
zgDM~H@JD|jXcNui(l3|d2~#<&0n(WJHg4P~yQ~Pf0vn1{608e36VtF5O}g@~cfa{~
z6o8sainI2EL|c=Z;FGx%N+oEU5K;J~r9&4zK<Kldg`b5TInkLLlbBd-<ranE$qy@k
z#yI7j?fh74T2l>4^6~&3(j$eGHG81e;Zi;ZAHMPbvH-^4J_eJpB1xtoM)>U1KMyeh
zPWUZG7Z8s&6W>Z?JzZUwRs(nBb@<ze#0T-EhX2tan=Y9Tvn`j%!CY|n^jq@)u@wJX
zV`M!rQA4Wi)jJr4IePtidb7cO1;%Op0@r4fFY+jd{?=FbJ2l;{`7S=?hr$8WIeF+&
zJ~%QFzHr1ewyp;SiMZr=6ON}F%X91}r2P?8D~6tc#l_IHDufk8X8I>40!i5vzcZ07
zBGIX+c~hDsro*okm?H|{ys|h<W@+(eLj@l$ojFUz5psVbs17+7(v{#mrGH$oIU^9#
z4?a{5(H$PAeXG?wxuP6O-8Rgg=am_~#5TrpCPF_t4KFY;`jzS1t)N><tIHBF_k9sn
z3kZK?6`kIyJ$VCHEOX=Ay>?3yD1%kb$HovAm1C{ciD_6=C9=teG9*&gHX(^X*ulq-
z>d0Qlm1iJRi$M&iAW**iLQ;z-c;}>h-Pb^DK@3ubm(9UdK2qLg<DS#cMc-?K)D4i4
zX-tmDF1w(LATmsb+{U|5gmT{{6zhez%k^Hte<0fO0!BFxc*Y-u2}Lj);z~AR!TP`}
zu)uU_#jApN>brbp$IPEcL|9&*BA5p!ig*gA_oRa=uy6-eeaOkNDjgXa5!QEtqNupK
zY!CC4(hH5pc%mblLwESrqKXj4fYWuX&R)yOq!RM4D8<DQE{18{${#MJluR+eIc82#
zDs*i!H&2G+pof+ji58=#LwX@I%j7R98|P4VIF?Yo5*KO=SH8xM3t;|+Je>eJCW3g7
z?7-IAz|L-FKmXou>nWCY7R16_ehnD|63u@wQcG%n_C(C8F=8qg1q}!cgJ1WFql=Md
zWqbR~_IcDV%Fn;(yD~S&IYGKe?hm4LA4ZmTB7R5%TK`^=2S*wWa}bG6k;1=oox&Fz
zK=UAl8wn;sAKnNSKJmNbnkXlU;*z(=HX1*7yd3hD6<l`kWik>fJAR?HW$@SMqUQFz
z;*R<VjW|Wdj;yKK)<z=%tP<u=W+1@FEUu={cb`40-?<tcZhTk#$W|lT$8U_3H>}<8
zMQ6iy!KWjc<s5D`{zxrx2Bs?nZacWvEkI%8C6kNnXZM*<KY7JlP{!;?3A>3W7)*Lx
zo@8cDxZgV;QLmRQv?n*p?X%+WczsW_{#kE$@+W+zvA|?r5H!T%wGpzb0|NsIUY3&E
z*f($PhaT)25)LC*#^aA9LUL+#4bbArLLn%XW>&osf;zl!nz*>QwCrs2CYx3ZNiq|F
z>YWB``Jn5&oea@OA#6lw`B*Su01Bi`-sLAPO42cmgMh1Po<F|^2@#BEGchq;>upN8
zBuywjYT0U^xH7!-qnzOp5oMs9saLcFpRg<61a1Pi6|Iy>GzpG~@W((B3FC@NND$`|
zALU1Qc16W^-#v;S^}KhXcA;yMIapYUc;o_}-+|bRcm41YjA6ZaK@}bz4&OmdP3`V!
z@h_h~t%Ep)wtG5r{xnP>BoWAbG>F%4!O-(MI`=@V5oR*M;M_=4;-*1PYpK*jg6gvu
zPV4#n85~2i1%7UjEkwEiG~`Qxn-lWrBMOB9fq~g?{lDKhbcFAe;Uh=Q!NI}AUi2?W
z7$`N3uPkXCJr3;ILyy75VxY}Oj~=}^m{;`S0~d-IGVY}S@5bilcMJ+#jnNcFy5CFd
z<=UidNle7t84pkIuHbeyxLJ8OzON(mCLksn&t*0slEm}8K%$sx4YO!pSq1$uid3yE
z)h$eQ79E_SX*#L~9kbze8yP=1fBwkv-_(x&Z$Ln=KCxx@uOPZb5Vr!<(vu-6F0(&2
zlLt<K3-kkQNgjKdpMMA*Z%JEi$hq?^EiEG*+55Q_Tvgp#q2Uop1tJ`+@(Wl}xU2*W
z`kX$^0uyBB&#WclNihIwi1bqR{rIsCP!u`Ohx`{Ae2R&RKR>D8LDPbu!KEB4<*A;6
z?T#uZc5O^4KZ}4AnJ~lz+sJp?-8<WVh`a>@dgnCNW0lVyuA!D>Ox2V<3S*FS>T8<7
zB@cQ<B)F`tCoZ*K*?|W`Z1z~?+bJfNuDi9~#>R$B=t6KNa`4Cq1BP3%|DaZfg@>R0
zX@a>jk*}-?Ay==ei%MUSu>bk(I33?!Hij(9kQ9A^>#PcpSZ2&7>LIzNJ$C?R)6lSx
zWe-e03$9FsYzW(53_Ir853>Yn(*B@f%L?3LZ*OnamPQS}-gF{Ok~ZH3P--(aMLG_V
zpeaH!>xh}p$oO*d6vg(Rgtu=iA8g)lw=`$1_cSp)d~3^FeSe6l=Q8YTo*sztNJalw
z+tJKQ-N8_Vzc%T}x~e4?ytVdqqt9a#&%b{Tf}mmQJoTUj$!?=?-$k|$*xrF+@u0$l
znBW0(QsUw&*RBb|sCn2^;1&Y{!@d}=B{;J%V+ztkrrePhdKKA;YDNhT--YL&1YmE7
zjTha&&xkwkjd%UZf8Wy5lFY1Wgo%2$T&X^^S6uv`p{|Lpd<in?fZckqW6QP(Ut%5M
zzDWvhwZtYyjOX0fgv0vN4bbSGbsgSB>%Bwnrl{3$e6;6-;^M<8romah-=ukCUX985
zVWJApaNrDXLl)AX<3K5bgU^z^PN~3{G%WHY;l8KB`WVpD!GsEaKnvM_4cEtKNNj(5
zmWo6*WXD{8(1a)26#Mz*>eq8lPc~f6SJwaXv|#4#^CenksJLWk!fR!5kLTani`blF
z9)%eD$I{^P@8E_R;lLr1MkfoTp5P&j91bCGBVS%}JXu2qN|4B&{`vi+kp4S%045{0
z3x^{I<@WDqLbR$B;{_idrCBpW-X<j!gaCIk>4~Vd4$m2$szRtjIznnNHW7@#qdSZ&
zEl*Y_uj<1+nYvv|`=f)T!C>DV)79My<v^K@7804=Ksh5)ZG$mEiZq(7!*D#2*wl!$
zH8Z1y=Y#(;=5gj9wr1_OaX5S;hK1#j`TQ{yNB9ngY>EMHuJ)!Q9z_N@k%qZyssU{Q
zXvS@B+#9+hxYci00cfolZhK?f>UC`6x1LB<i2ReiML~G}38oc}OwHRes+;&~1!Vwf
zY`y&k+Xe@l$N>;&$VeW<FIY>Clsjl7@C^G<Gl;(PIZA;kT_<_|7c$G`m6ZcnNE!D9
z(YR9)ccI4N2@<2eopO($p-LK_h7?66IDnK2njH@a35gQ#!{CFC<*ABFO(Y*M2WyOA
z29lAKWiK=kQ&Usif3>s^T7nh8NnhSMTH?fc_m5q@U$@hDJFH_;<l5!w>HXB!oZaTb
zWk|AysI^&iaU<Id<!+9Dv2@S2>D~qhr5v33svDjvB#16J`Zswt7={f#uwnT+Y8?cI
zTZpt|rsBAqh3A3IM}Rv@_ZR<Ux*y}IaQD8~gc3;tCGlU6N<mDa;mw8pk@mOLVwAD9
zJQu%jL7E`Dy<UTj7SzcXPhRN*QYOK|FBm5-a{T<Cvb`#-C?nq3&=8z{%O)(P?Huy<
zpTgCl^o#RTcog05K}@(B_!@^)7)=fvh$&cI)t)9aG<2tg#3m$(nXimN?vALHZG0Ly
zFgBKS-si!tgVhYjGo0n9PRv!~jrGQlsi_ShUY>O)EHZKjqHA5df*GmMp<BcAIq@Z+
zrjH~m>G5=A*si7V*5-e|4^Qc!!{kMPmA19&nIqxxe{OGwtd3~49mK85fqF0>Sfr0H
zCkF<u*U->N!xR7>nAFHs`t<43siUuc>c7I?Of@Jlk;w=Q2*BddlW-Z5>39wdv`^Am
z(Mjp|@8AE%lkvDoVc@TTfIq3dTb_j&qg&rETKVHIb-LY3ifkNc&k|f(usX<uDaaG*
z`919vy^`8mW^$<U)Slx@lS4y82i`Vdg<d_Wtwkt}6xZ*cHg|AlWn_?v9+D15BnMtc
zfHu$BM6Cgcnq-t9(5^x41a&bUZ-asL#{y%rSKTIw%miB8mSN6x&;V#@8e1R7&bN*t
zu`BY|&!3oCJ;_9e0RtVV5R}i)kDIe%(r^QbTBA27{yKl|T+jW0h4*)nCT+X6Y11Yo
zW2N6!3oy{&E>=$Wmk}z55EB_VLc}$T>fwLncJQ4rJf%b1ua>Cu<Lwe^OBg~%O#R^-
zo_*}=P$m;x|D~#*ITLTrO5=FC8WilBqs;aE%Xho^mnM*nxeHh&p{^?KmXEvuX;3If
zCo|Lk{kS7<4E;Ahg|-AIkZS@ARt~AW*yH@05|V;CMfct*$1l(Hm8=2t9L*|0J;tC$
z?D2PSH&8^S_SG+7aJvs0O4gP6xw%T@h8*Y=bS_<LtTcj&d4!`D*|o8ENR6sm?)V&s
zTp?UNR-%3*C-CBVI^wDK?%gW}k}LUsW2^2A6&ok#LP@<L9j8&Tmjr+BLCl9C18g_^
zc?X`XL^VIv+nO(oGzoBDjMbif!Rj5_KD>B@0vR9SLm&c9<RLtHg!!fB$GoSPg{$CQ
zlTOv)maZd&Heb@x!hwHa`#e5%A*mS=71)SKsc0r@LBMgI?xTj<OB}lsCAaX<9l%36
zJm+E>xL&ja5B@nT$aT2eGxpUhAAtfmh7UmAz)H?pRvMLU<TzzcIT3sAgZtt}G)WR7
zM<R$kh-hjm$P0-}C;}sH!K^k6FxBz&w1~m1W{Ncn9vR^}`{NEOC7FVF+_6EICKkmH
zN*sBT1&m2kYdi@hz^41%sG~?mmolQ|gR8K%P{CN7X9m`S?u&eKv87P%f(HK{Z{Kuy
zN54Jh_s2VP&m=XO;yt+Kop0f`8-M@w<kVyYuqdR5X?Qk+So6S*@@6~aSp1G0A6Z3w
zmT>v(J(j|VUx4JrzHYP&ya1MnuR~4-AiXLAsxVfk5p5Jx=LW7vw-^UlgO-HH5|v`7
z5Fke~Z3v2+jUU1rp?6Iq_T7&i4*#!VF&PjM>2`#{wsne!2V^LLNF%J<_Q6GhoGO^u
zFVLdAYT6dqj7&^?h=7b-4UmFWOFqH6o_}RAXLk?shuB+7L{!1;v#dKgkX3SC{J9(T
z9;vEC1$Fch(z%FYLQmiLwqaw${6Ga0sbKicqj>U#`7c6ZF<-BmI(6b<Dw&YQ_q1tQ
zIH94@<mTSq(P8rCg)*7tK{h_27ext=xM}@nFGZ`%BO@sI;7{Ghj||vdRgkW(z5z-%
zhfSM?#wd8@WQ&I<ZVgoBJHS_?THByqJ+`gF>+xOTOsE;un2D<%JxYj{^Pk`QhU%kJ
z&l^Cu8YHF)*!NE5lCCZ;DJd!BhTvdm8O38O;t>rn03gON!X80Niin1o&(ldaSqmW~
zLVvY`0RG}ztE<x`!Qt>^nXB~id@I231}L)}aviuCWKi~EszHO)&@Qp_Uetwn!rn73
zeyVF51-|b>#$>k82|iOg_LacSI(0uu#+^H3)EGAdB;$vkp1vT82-aCbJn+$w9p~S>
zH=+8}IXw7;g-hl|=HyBI>E=`q1esd`nUYx>gxnR2+8c_nTglvYAR8vD!<4)VwHcq?
z4G*0yq6hf}A7?}hClB|bP$(q&>N+=mADjabv=Xo~cSRln0RgN5+3Ns9tHJNsY<;0L
zsYR-hkQP)NnEqOkvQiQ2fd*b`E*MDpW19r<u6=SMHgr%GVID|)S_-lou_Mc|z76T7
z$4G<|2GB@jJSzsp5{ju_YN`jP<FMmF5FkLUaz5$%B*&|eFh#_g>`Yvcs{JD}%mI0a
zTqS7OdMYw_g<hm4Hwi$6Qj%qH)OzqEDhvSEm*WmnYf<HY|7iB7IANk4r|wI>J$d+_
zTAWeYN<FL>htYG}f*G5FbC9iyY45AN=D89AhGOJi^=DVIFw#*vdU~Z_zivbVfuJko
zcQL~53oyoy{rBS+IC)!vn>0d7oLjT7|79>HTGsFh!`RwMi3xK<R<8hc{4pFeR(rUK
zBH{&{yF6XS%KXHKJeCTfP{Q8Or$NUaz%KbJ&W+-L2yjET<;82B2!6?6Qq#iz3o<E~
z2r3r1UkhY3-dkiiER*u)O(~h#L(NH<<vxzqU<_5PsYpUXV#B6QzUY-B&u#JT(=j4|
zV2oz{=iy)QU^~>REALXT?a1JBBtnf|>ybZru1t(=Z(%?{J(N;19f!<pdL?z3Hz(b4
z@7?}OD^n_$^zHv~c8!>df)9ZMHwa{Fs{ZL894b;JfwwVsz6NFBM;D$$WM*de0#i^K
z3W(j3$6kSpU}KAU{BfvM<inH|_V*Et{!KXHK%+94A}KY}p1B9aJs=<;VY~s;_z2zX
zfU>KK5wFdXzlZ9!6Wy3rxera?b;lk&9Rsks3R1mlXFb3JT3s33edup-AARsRgswu5
z?X7S1k04&>=njl;Q0Sf*Qkz^9|Mh#@9tIlRC(`eb5VV`>QTQZK^ZiEGd#511`Sq9{
zs-V94M+HIjq1N2p65j6)cT=ySGs0{IkzQi|Y}m2mRB$Ph;{nWjV6Ob0=)5?!q9JGs
zVdAT&w3L(@EQBvf7e&YwSV=`Z^CLqT2ywlzN#Vv#!4-=CG4^=mt!$?DN*L-LOic8J
z9eGF!Sw|y!b)xh9jr_}aXi0HL2Og5R4l;m>wl)W$ba+^kJD!0tf`XM9YaOlNb{Fjj
zT8b%Lpy^+~Vp6hkHp%#6SCLorpA@i9JbzQ*<QY0MZk%k?^Vk*_gi~-5%K%5{kO}U{
z(@xZKo*Umv2IZ43rvBLV3^%~Y)YSKLq#TTBZt_ey@v>+ouZIXKA!YW)R{P?`LC98Q
zx&XqcFVcq`t)d==*auVYE1TEd!FIOHv=j=-F~hf#<PWK%U_O-}52ZrxwhAc@4DaK~
zm%#L=*G@OHvXG|=;DL6&wUXlE*PhnB_gInwzVn4VcDF91<~soCsi(rO7YD}xSV&|l
zWUvPE1W7mI^7h_$LRJ;C50%dhEw_U{k|FQ^kFht8$8zn$ziFQ5k>)|AG@>FxrI02=
zBB@A*(140W^E@dHB%~5T<`5d$g;J)F)K<o%fd<0+J!?Pj`_J$5`Mvwo^X%HW@B6y1
z>pa)F)^Qx`SV$8BcW>XkSx834A6Z41MkdFG84m6$nkNRx89%zHtaa79+wFq(L_J}j
zhjR|C6`ULUDG52;$rUI3vfH|8VE_K>XsxB?Gz}Tmi|aZAjhm1%&T(t(2W%a1!Y-=*
zG3BY~{Sdw#4O$9P-6;wYE_Pi_>$_NiJUa?Z6s>|;5%|obNA1x5I2a~Lt@V)XFz9pZ
z$iqFmQ7ew&dvubBNd*nY^(baR!8z^gkGvIZI}!2`J-i542D85!73Ha<sG#6QTg|nh
z6>s0F4N-DDENBjqxINE*Wkwb*>XP0_R&`Y?5FIYO=<U6t!u#Sy(cU8GQgM(HZ$8ha
zv!ai#sXO)ce1JzB!Mx|VBe6A3)oGqf01;g`x+%5@GS}JJ=_(gIc(9j8UQn~@pC6(R
zz`C}*M`dyR6#M}K8<YU*_Fum~)@c)8_z2p<qG_BqB4a&5YDR^B0uU<q*J4^g#!!XV
zwrv0OdL(e-n!3+R-{GgQA3Uoz;ewor=Atp4sSUH?vO94WxSpgNpcf~E?v*C8+o@?2
z{GN{sU!9<V;8A1J0O8d2h%@ytkC&Fq#TDB#Q4H1BQh)ZDe(BbNI!d|8#;J5ZjU^S$
zteglU_lPp@2&cAY64H*BX|g_9GN0Zg>(rQtf~u~vqGBOT&3%G(XKq1}ll#wi9Wlk}
zOQQvVzC&&gfW0#S&)G$95G)3onvCf6yS91BaMr`Qb%Csqpp~V$9j>p?0%hGVN|<#P
z7V}MW{`u$dPgIdKxT!0*U~zlA<z<A~T*UatkuYCt=<e16)xTtp>-SsBi*u<|i)O>e
z&8NP${;@vNNc2J+M$&+XLC&+eTi<tjIPB2={mYt70$&Jc!kB4ID5h)deu#@DOTPMK
zb|vY-@$a`z0f1ixG>rM-EpOe20z9zc=Yt0t>!F*DC%DrxsQbNS_O|j~B!Nmn6xhb;
zL6nTOMMHWV;!!(9=}OI`S`JLcBX4PLZf-AfCdGlC?jX_*PD|UWb`wyR5K9R{0bpVu
z?Uhv>QzuXJ-w6ymQmER~e%f1I@M+5pp-KAkR63EZ%^_S~01pTSsK9y3q<BApa7@FL
z?5bUHTs!QGee3Wq!nf{4z6!T~mV3*bb)YlK`MJ60Xhm%U-ch6JXA)-Emz4DpTfXzs
zrAr?a6?s$l@$zxd%xXi&ommv~X#*f79T1wk=<=BlqgvJoq$~0doY?01ma%=PvKG{Q
z<GktxmwheE2h8bO);6gV2SJy{ayt)$yjO2>n0!G8V0<sRv-Q`eWH~a4FC0YKpBy%9
zSO5p{H7p`bU-k&igW358)VqH^0y&A*42k)`F(ktcDGNEt#Fs1*hIk^B{QINSeO#x#
zBPOr8yT}z9T&Wkkax~2y0tQU`mDqwx<1l5ZV$pfvjS<jP^OaAfm-PlgeFV~am{h74
zhwFR}QpCE!+agSpG%;O!*;sLNdEjB9iDhiU{81x0IkZiMviuR)TyX1@vri7Au(L7E
zp^EdPf6V_RRKIA=8f@GD>+|AbAJHXW+d@hBh>D@aN}1L#yr<6!KWwUF4P`TKJkpX!
z@UV*!lpeP#NmQ^?d2AVd1pM${v7c82Z!#>t>p%!p-)NGY<p&)f>DJ_o^hzzlDcU$p
zMxfP{Y@X}B0RkTeujxw;ZTvheJY44crxG2LXdiFyK0FYMAx)h8>+kPvAG2j9z1s(~
zCQg{}BBT5S<zCHi4GmeZK0qZ7j=P4=*>PH|I<gGwvKk1#<6rtqNre>rbou(evyieC
zC7hMfwK?w|m6+%)d@JpSO;=Qu5!RKxiB-8<NaG%H<^`K8ZU3~mFtQPK+!*q_9IMo6
z1Q5&x=izU4^j$H{;bS>vj{s8j1ryjG7A9vGPOum)b_qOM&a=>)H%AElLs~LC&EftN
z4dttj0V+PC8RZB@m%XV8SlaE0*P_AUUvSr#Nh;l^3Zmd=>p;1Q`@al(S6v-q;MYtK
zS=M=PgYo@Dxh3pU1-*Ft_USX1gg(c3ekq}D{NN#dx(=9hPk6AhdxWu;OJ2lu1%<~f
zk=yt>c9W7{6p`2ia3^MwDiJ6&{)Zx^cDExXbWD-oD-2c{ghwh#oJ`GA04~xj9ow{p
zZl?!dZ>Tc?>%%=Xx#qp-D@tu`QI?A;dhnnPxM;zuakm5RhQ%cw70uTq<Bi{IkC9v|
z$hw8fHR<{6|8(deK#HdGG7eA^{o!Bj*4o%St?Y=LOZWR8N>!1>$4r_aA>6Y?{=X@M
zrap?9PQFS`>;F;7KNMc~GaNB4P@ekP3DI$JX>3ISL~7b~+N0(HK>T8Q<v*SNxv^>O
z!|$6WgHZ7N(s#H6#F7v~7}jJy>mY%gT5yC2xs3fpLKE}Pvqn$H;eC{xs`m>ECjJ67
zv{=Hz^JCPdDQ{E1s^f+bRJa<&Bst5Bq9)q7V!CxEkrvuW_Kv2J(I8@s|H|)nC^Ln2
zYg19ok=_Qyy}Cy<)K}>pjQNPYb0q&g>{4h^c>>I?UKyrTGooDB;h`6XeC0|6nvtD$
zT5-|!u@-%d{-*_4A^bK`4=UA8z2syz_unQ$U5s&&;L$`={zab-J%${H5L5J)>>`0T
zLCkg?UEK}2)ABk@kNR!*xv4I7M%1Q%7B98i`ejIu^bymV=C+xqS>CD|lQ(_Lo9V~s
z>Gux_(XMDblwKx!&&A3SiA{=|YuYp_U7?B-NDIJ(IbNnQiC)NRg@zNGw}x9B#q{q5
zZcyEd5`d02T-fx>gC?WXcC5^n<ff$q#|zU3>iKqVjrDt=1U*{A15h<;PxvU|RBPqn
zaEjK6b@;W&YZ;4&A~F}c)T}G2iaIK0W{&C9M%L@7@pf1VxXR13wzLtlSjr;D6d!m2
zl#mc9@Le^gY9_wQCp2@eHeT<}A=?g{DL9U0sazuZ7O$t7!4W$M>g{L^4fA|dq}Hn)
zY-cZNX`QALut}i|!7jk5<lwf66Mo%cmjIu8H~Qkl1GNEKaJ+Fp@a>MT?~mli1!Lm`
zTPhN8fkIz-3@?CMN)D06UW6SF5kv4G^xa2#&8*&dFeJ}L7%Y+@XUS$POAVhqA!4zm
zy|uiQ@Ryh~XLaaHebAuTH=}CCw$SVqvpA|T+b~pr@`ao<<9R@10jyC0CP1t_J)Lb4
zT_`kmz<HbNE0^y=Xb=AFukVGnT~Ng?pN99DHRNReEsZEM(d-C_8)h84ms6bRKmJHb
zT6rB?a8qf@P?Cw9#0w6GxvRfv3W#wzh>RO8v@j(814G)5>9p5l_2eL%Ri4=!en?qb
z+C=YU9l8|X1-$W9j0pZB%t=KQh_=kyOA8#1I^ILf054rmOd*cdbC}q)1&hMKHaU6*
zdGJ^Vos-=M96P|lCRiNWU$36DyEws*2H@KcY9*=*3T6RC+y(;%gBD#Q>#@>5KFJdC
zAc8j`y<^{`L;5>7etdQ2t2(iXn$UNJ#^~ix!SJX#;Ttw>IxdPcdi->sKB6Nd5JFN^
z<AU>A?5osYh-kRhA|gj+0$`>{xFvRZOpg{Y$JDhFEDEc$JLVn9P802D1SNs6iFyco
z4Q$~DM@8a)sZPmncfT;(y6XM={W$z!^m~s^NFI7+Y3f|`j9Y+3MM=Q@0wdvz3K!!!
zm?vW%1!mz7b?2hfR$<I)9vh!XGi7W3k~H9x4%W+35Bf|7o-e~mO9P}f>qQM_R=nsf
zX^njPLiDDfIvzOnYX{9Z@3?|8KKq&tXXG)0<%1V5B40fIEjnAw(|a7$`&o*cAxBl9
zo@V5`QL(pfeGD5j`GCeq^W}biey!?HE%O(W>X!|6>V2=SFVCL@eIb(Ryq<#D!Y+^o
z??s`TmQjuwMyxr3RIBKR{IqHJVJCdX42%KPKi*}`h$cN;1CFJd&-1+eI`_<o>k9KU
zG#m#c{GcW4_^*R?x9o5$mo@n)wlb_L4AG4eY<17NT%juWtzB<zeMGn<0H5Snz5n+4
zvw!b3l%zDX?^!nfxBuaO6lB8LOgy;xqxMi#Bs$f6&!6tVxi*i=nu}W;`-S=mGpL&>
zP*GVo`wqplkj&$vkB@l#LdZ$uTQ6*ZDL%%G9UJghBLSqNM$-NQ^oK~{PhL!IYb7rc
z6c=~?-97uC4wS>f2;2C`V*e$h;AVOgOZ116VbVD$Icd_QH#R+tfto<_tlb33!dFNm
zbFtp$+&p{Hz+ZDvVzE8;y9iA_V)rsuH3en6Wiy8f%e>E@H+#R1pl=A8z2nT8@iP0o
zg(-462!N1euoYu=hP-o<m&!89^W@CEN4;xZN;8b;H)5(sp0;I5&sFkmTXqggscMyx
z>SI~ghBF=I#{snw{ucGst*M~Xe~kC`ckb(bxoi72lET1?zA1BveBp96^2{QM+`Ol%
zaG_|GX&)B+S#GX=(Jy#4)M`_~l<Rlz7I4|e!m}anM#JgCBB@%Jdn`I8<`D?1Q~p^f
zwrw0@LWspl*yiRoo9&g^>NQ`w4{cH_X$gc{Vqpo!)v3Cl$UL`&L>F$-u1-psvxELy
zimhV|X+Qv1RWL}-L(v37zh#DoluW_f9Ij7UwEi>8KGy+tCYd&lA6L&5qC#AykRys}
zcwQ;32iPIw4vhG5x$8bud*1Wbz@Q@M!wO^;V3|T4hZFP(-A03dq5u}X{lY4C)VF`W
z_9n@cPgmCJbCA-)rsk5j_f<eIqgb2%<SwD#RH)N~rRp1mqo(jf0`g&^NnVm^b>NK~
z2R*;aO_`F;gS9TzkS0lI-+vMKJz$K|sk3xJq0-=#UZ9AjP|c)$qkDA{W|}Mzu_$#_
zlaQk$!Fv1Q!vV53ktG9F(yFrK_s?m<rN$uCFnRj~%RhX3ld`JcDUzAd^qyc0m}`8C
zR$lZlJPr!8h14-9p!vRgKms+*aI8p!hoEII(xt%$g<fU1w|F|Ip-r%|b>N*uS~X@&
zJKg|PFRmlQfrOtW)$wM;JVl8n<NG~${P@6@h=U3<_OxneDaPz<c=9c6(;Civ%@a}!
zqIWi=7qHlGmOl9s%O_C4F09!8LLr5|Rh-dJ$fRFjo6<eF&e~clYa`EoM{{HB8mGlN
zOs^pye4Xriq4C7I6+IDH49&2NwszfjFzy&B@PYg-NZ;ux6CPFq*wQWPPFDi4S}3@L
z1-{~@S012=xDAMgb(9i!Feq1T`DtElD)p?GTQdl|Tkxg=eZ6HIBpS$x%bNeD(o_8>
z>>3oB(&3`2o02H7Mu&0;gys~=omnJoet1`WDo2hT+a7G0M$c>gV`c86jiU|k=<g8&
z_s~Z7+;47>Tq7cXs~%ctRQN@eDXa=9Sb25;Uho1y)#8Im1HF^K<dkn-LU)M8jM#P-
zUuqi#lc)OYec!Miy|1r00tMMtNEO$cn_E9ykK9U}aDYy{)g3^uJXotfkiGk3W1Rw@
z?IEE5%L<5z+YK`z3{60eth3Ia)xIwJC3d&`+K5v0<>h42eoF;WR9PPXt|7**k9S+U
z!rcH}4j=ZnC_~8I8*$D1Xa_elk?yc!t)0FLYJhbkOgWJj1}GW{qi>>wb^6&6T0ZBG
z_XWcTl{MDf!t*^gql2`ipTZ+Zs%bmFzSo?w4sU_(#AvA_FQ+OeQ*cl>_~*^lr7@0N
z`2|J@ma)QaL-=#Bx}@%76)ZZN^(^A!Qs%9PZRgB%4|HD28A>lN?V*X!NQU2p4Ex%&
zm#GBAM4%{R{rF|l2($v-(Jy2#3c{Fo>Y@~sWy??Yo^D)1Z00z42<6S97YD&j2d-E^
zz_6SV8!s#xbE1gnydR}CrYX@cHl3pbu2Y0r>Ww#tZWX@&^eMcd2`P9PoqsPvR)a?N
z9yV-AmU=^zs@CQ0HTm5EIt)Ygg7t-~9uN4U^7in2|HIU*{%qx#gc>qtlH;~wObGcU
zf*oN>C4apG8m3z&@jcKm<c6l^M-0~a7bK})@S>|wiFXhQG;}8c*xlN^J=0r3XNp0`
zK2RI*U_uD;g)-N8xG~un(vAD?Hpj@SVlB~AwO2Z2;mv(<-Dd0^0ij?r=K|sFRZ)@X
zPXRN$*8GWpA}qJ)bjmWGn1~P*V1+d$x@nPR!!`kZi*`P-aw&54OEmiM*dEh~Sz4Gl
z0vsJvD0%6PkbEMO;>E`^lMafclYPF<#-<DSDTgbs)~|_79MS}=D11!;Y($MKb><1K
zPINqzu^|8<&D(e0j=D}-px-Jk^*bDFomeeKt5z+K&GJb66@%oD4JRk_Vv&{YK)P-)
z;$bUzH!j^)It?Wd5Bi#$88?*=??}PiaoNlW*xaJTL_}*KuIf;RC;q{WKv;(hf|!1m
zU0<J!6S8nrdcoT=o<2Pdv^;vo*gl<s3ir}kup;#fqn=R7gLs|jb4kr~s&jMXLJb`0
zpnC<@F<|oQSEF8@%#1si@nOf$&m&PS6k5gcxeq9FMWcwI1{!^{EcJG`jk1qmG70kz
z$O#r$u;c<s?qXWn<%R`{$_(r_fF0s8cdk#?K^T3s5DS85OFG#N7&`O>mY^I=HKil!
zdUJB?uN~4Ex<qhih#r-mkM7il_%3o!K@|h_-`(9TMMfFprib`Wm=>|T5Y?!MHbV%Z
zO&B<6Obo8qL}@lEW0Fo#5eIMqu<H8BnP5F-q#7~Pp3rv!0+tajL!ERjul&h}twOj!
z2O+8(|K67x?5^-l#kr<91!jh7TiyrkY%|?)X+NtWWm3?^Xp$6#N90CG8n>=`^06W<
zZQ!LRbsWP4EQy~}f@d;bMYHxi<0a@)q(alIRThjo93l{gp1Q;s{!}}YAc&ZP4=+mL
z`)f|zNe|mJ46Q}hl@$_g!xDtDQVUqNHZQ~g%L^WU#|87}uLZ9JI@IJY1jgN_FNzy|
z^G7lyt6*B++I?2{yd)T462;rDKkPA-Sl-*4&aRFi0-}}OOM5~e@)xoAg%zm~e1(KK
zdUmGV6{exWutckq=ZMi|QkDkRG&Sa$v}l@Hjo#F^Pappk!QLOTb34HE;C2)U_HNwm
z?AJ*|x*)}1U?XvOPc}<A4r#{I@-BZ*D|<SS`yuM!2k@p8uk*=z<ZLG~GD&&+Cn3aS
zNnH1;m!7?Kj7aQx&ee0!dx0DZy(UY3AppM6PuY}_vbhT}1*)5v6UzxPk;vgu&ni^k
zad;JX?e$?_U(;j%bYWSfxvK2=ETMh<YuMN}rQ^WB1}~{`m-Ti6Oc2$OT;&MVtz_eO
z+*?&i{oc`-ME#^U*s5qz8hPdn#Wq>$59_?}O^GymeKYg+Wcl-7iqQfKNzTZ*g}um5
zp*|mTmxF(}p>V`^1VO!Ur?Rs<fi}Q<aWJwgF82g^!c{hau2LIlFEKYoxMPY4Gr=CV
z8Vk&I_>JhdW_v|evwSF@XAahagcY3ZU6+QViAylrL$>oPU-6UMJJa5mB!mp}U^7+I
z)#N0_2S&wn9$7|Dc+g<?Vvyu>%V^QjUr(;4r7VnF1jwxU>`l`4o*XF$|4u)!dapdY
zBZudSqgV|*y_%&CpmBV*73ZeF=O@X@MbghEdWsN@>L-1us1RhFkWqsnA}3fN?Ev{r
z6iEYn8$Tfy?m(y<fnCLtgxDW_%nh~&OPW08(b_&1yD^>LH)|Ma$uQMji)MF_d<!IV
zh=N>Ti56@7O~_~vrU9ff-NybITSDooFR`Gx>GX&{tmN^y=`3s2IrwjB`ZF|QzdmqU
zg1*mjfAU9|CU0E}PpjfMIWh8vY@#p0RH8b+5BAyuN%3l4Ib6uWBO|T9xXKG}k%z%R
z=_fIa^9R+?y|PrvH+F-YVmqHK%S<oGMUpVFCP=x&_&<qOFUf9Qx;&r;{vs@fM~`j`
zmMuz$;Do)eH=-N#Jvz*x>*&#=X?nl+`t{}WlKL1{l0c;Nzl0cCxR?lK>h6ty{GR2t
z^!-l`$(ebFOX@(Adm5`dxKzl7ooa;VuiiH&pb*{JZEkjz32hBW(?qA25PL4LXyYWS
zi`Mr6z3<v#Po%ZW`n1~v8T=MWr+>a0ww#o%>0(?JS|QC+D$Y@3#~v7^B}L<GM?ty)
zYe!G_t4^J`vw8HwlT)M4)y$*Kk02-FD_lizDKrGa-L?pzMT%je`afk!^v)eSglf=k
zz!|N*G^)W2ijF=W)4Z^;2#LYw%X%Knl%Fxf{O4iP@Zc>GUAc0ci6~hy#c8{Y-W+<v
zgn=pbs;KD|mQcM2K_Yas7*j*X<{#nsDUO#JpMdTmvi5+}h%=KQkz3<?D=h2<3{jZk
z?P)%w94cXoQ)bN?gaTKCQck1_)VgKVW9PCaVpj)7u;JT3Ukp<Fb(x(aI)&f9eXHr`
zH^i)=cD@_kpl``nbWct}Qs!9o@_<wj)s@h@#O}^_ssn8jHcoI_{zbxV8A0D6JoU<k
zaB9oG+k5WxxTe#G4t1=3b#R*^if*W5VMaEfwzn>dg^G$p6dnS>ZnLanG)~j==qW>A
zB|$H3q|svK=O6|$6CDKT0`gMVcX+o;!G@ufp~82TAJCHWx^RI;01P`?gtagU4&(U9
z$AZVtbE__<0aG+1uptp$_~#$*Y}a6kwtBg#X>)#TWxW`0vdPYlI!Ik5o$D^u(wz-1
z%vu))vFAodv+B?Yi2i?mOc<Zje+D{5Px*|fnTtlDEPrzDq414D>@M;NtP&Rt1)e&=
zk`c~FWNjN3eH9WU&ZCR?dY@1`vuKP^2GMqbGwwP|%MJiuP-@)dsnaL8q979lZ$?I_
z*1tE_>8|jftQ`SEFE?sZjPKdgr#(Pa5TexV%a9)(5O$+#?gC4ZuQ7h}CvBfrrE%zj
zIq+c~_N$Fh!$J4ii1w6kJk5B|dp578;MpYHH-JYXR;{qVX5pnc?l?q3%CB<i<VNWW
z8IQ&25F~;&zY#W}@Lh4{89N0pJJCU5r8t!#P-2=360jGk?(Xyq0c0(^bVc|}@#t7Z
zd@Z4;gh$T_CkOqEy|y%CS^~~UbJiW-j^58GW!G$2sd}k}La2+%Piuqg%*@25Bq0iB
z3B$}D?YHZ~qYd-s?EXDSxI_ki|4gcB?Y7wD;CuGJur2(d`BP@>*s9FTA)w8SjJ&q;
zwi<d=A%J1yg!qwe^&~bCdUA)%KiL3_NwN5j5{0-HFzOU#rQ<VofeXWMtQ9q<I=Jj(
zKzcz&JN0P9SDE+HmaY(|DGh3z(2E;Vu`QUh>*unXr0yibWJeAI3WMaV?!(P~DTXf-
zv!FV1(&m0k-TAEpGMa}3{He@Va;(nnjB-|VxL`wvwu-YzNX}7?370lHj^z(cl@Mbd
zu>BKj(#gy;su`LQ9I$b1#bk06{~m~%x=q)UIjcVEOr_(^xPBa{u;&iPey9;U;9tMr
zZHS(RoPY4QQR=<w<n```gd9E7tQGR{_fWVRF&kF26jMvJxoJC4T3T96HDxf$wKgtP
zwZ|ZG-qL#0Pm+8^bU{KELvN?VSB$x?Uek9MXT*;B%HA-HMOPCT4j|knK*An^L)<LY
zTERe3Va67tqW17>w?y88o|aQ!J1=c)j?Et>bbE&fnod8_untuSbBN4D(xj_$00Ke4
z%_i}h9C;o}6}#Q~Tp6<o@F=*H?KTZ}a&7O(X{RkV7#n-%w=T^uR%YXfT+%Y4BaKDC
z*QjEIGO&94iuEZiJ0_y-r|SJ&`C@dn^68gJ|IG*bjCn3zyPn1ww$kQBZ_CT`EjpN*
zkBEL*bD_hF^CitoAEc#qfCiU;5X*jN`(!2dsOnnt@_JVkS;FA{_1&FkJtf`0m3J5Q
z00r5@XU|S55e=o|9ZGMMh(Z)2gXx#LVFr@5;!M=-+d?=p)jX}6Z*%y&-&B~QeHUrm
z@sJ5xMR$?PuSLVz@7L>aQ)SUvEJ~KbyQpI*$VJyIH64}a?$9q{5Rz=p?UB@csLq66
z77i5EyOt7%g<?6qj7Nd>=qRu&^b^Ky2k?V{i(4=e_LdI^Qo#dE9Fw-mJr6yU^m~3N
zICz4W?qnQ)IRH9bD(ppKg6hsitF)T2hUV>D&X(OmeygvW*SMoNvE)CrJA268nH>UE
zt8OkEFZyrkxdckl7<EPjIDiZ>fQ$^4ih?{DFb=*=Kyt`U`N{EKPX`R#zMX38zmZP>
zAfkd<?JqREPoEx@nRtO`h}uLU(ZhA+R0V|tSQW89Zf4ylW#S!8^t;|3$y62wTW+4?
z-g3DpMl`KKVUBE@(+m1XsI=*g>B^F}dGQogGJy#$t+UV%%RM|aCV)~<WW5LF&DhBF
zfb1b7@ID`g`(oI(87qgg1ci+{;@*gXgEv}QrWtKYrEG#U-@Vad4Kmp!eyZKg%u3^*
z_S@wLqrYKSO=Faf`#6#isy-xOo$l`VviJLH`2MJ#Cq|-D0y(QYcX(8%HVaP<xlb#w
z_X5lFUS3#U>(t0>q0JPg)Bnx28X|^RT*jyGiJ%64RV^hcDTy<C8V`BA562i}(=q(*
zgbavCCai!q&l39P@%?ZpAi57mPd%u|Xg2+>FOLTb$C2@u%=&bnQ6Twe4g}+5-qhu7
zz6+ct@-Q&>I<LjIPEl~ZJv4l`|Fy@NnVDz;Cvv2>=e)|DI7`D!m|qbF1@y`}Y3=q9
z)gnp5)rLqEr&EjGfiocBJdkf3Qr8Cin!KWowY9aR&_`uv9MK}BTPx}Xy5c`v95Z8x
zvSXzO3dK=#<_saf+w^_N`+xuaGt>xVHDE*2*=^Zim|uiV4MM-_K(13Xt+AF;S{iLB
zx0lzB>(xncqYPK-1etci{vHMAqTYM6sshRLj@KEuw~WDD?i58p83TX<pQK#JB@07c
zQYBCqv>a2XPv2`Cc4O};WY7r02klV>z<fH{dRAO@Mq0tb)vI539D3MXEo1HGsm*oj
zKADJ`gPznUPyGDrx6B43X%Vl9O>z^DJbbszwgYkuqkU0Ex3=3HVyBshnXg~};Pl>I
z1G5d?e^RFIX!$)m>(@j|Qh`QNlExgLzT(-msZ&qqTM9f{TiYeV;s)E2u0e(sYBU>y
zo`c>%FI-)<Y9@do2dVzh8j8c5@*OAw#FL>B<M6R#foDoM&(I{jF!tr=grNx$UAKc*
z#s@L0Mo93pT(>6NT|WIRV!nS)kA#&wWB6h0^wG!0%<$44tNX#9W<*g2{)*+O1JX=1
zOlWEpjcU|eR;x}IiBO7sPYnGuj!y6vqpuo%tCh@R{_7sX#)4UrPBmW}Tg(5jQx8a-
zMgJ(+3GJAGNh?;I)s3C9Y^EX>=O!B*&RlxX(6qwM_3KD~(iy>Vi5eIB2Ibq|{>|7*
z7fv)`B~L2PK%UJlzjr>p!Q&f&Bz9DNr@d-aMrc^AD>^bI5w;hE^^ur>M6k39^Lz6-
z1#39`AMQhw;@%bBFw0JH3Y9KwVmF_e>mIcGbN}N;)F^al*Ul<-CorjT`byMk4a3Y>
z9l-i?rBqH1F=P8Hr4J-DlA>aA5WK0C9E8>le*$n!^APo3Vn86z7OTI`A<qRr$g;mU
zdz)ooiwY-|fTg~f0|+4ZV~pZy&C-2XY31j~y)gIIfEb(Sc_GV^K)pOJ;@uTppD2u4
z|4@i`r`+woPjA(#Bb1q=n1jOFkD1hEdWd1*i)ANq)sm#3hnhHgd(0CZyI47sE^Kmk
z#7J^@JOCi5X!X)NyWi`~ncl$5z|p#L=W$UGx<|K&CdL100U&B(X6=iZEd<ich0rcZ
zg$BTq!}vqB_c3<#+hbEw?rrpQOMA6hzI^^Cpc*k2t);oi)~n?GmmOjtH=PTD-5os#
zYGVuIwTZqD;HycVt|=aY1Ep>~;N{gH_2<>t0aq1S3!*@|h+GU9vO#9lf>FK83K5!q
ze`dQ{bHRct#{sQ5kQT->r;y-OzPz3>H>#6eLAdB!I!==La(ZZe`&Fs$Oh<E9IE(dx
z--&s~HMffj3x(EL_v!8Yg(O#^&zol6YHteHm}w9G$dZk@xKT1P2dgvQcL+&oXQ)TO
zWW>8zt=SYqT$@eS2<oFD{db`u6)g%@Dux4@W}b~vpzKYJfMsIDMCGf<GS^=xyOZxn
z?rwGw(B1hHPd{uGqY0?DMib&m><;p!!KC<w84i-f{u4|}w`lZgx1i54JOr6;L&#+@
zikKh<Eu1|&FFKlBnl}jQy-BTPhnRr*B$9Sc!L8#9w4-9xOf}O-cjC6pq*ckX;3^sx
zS@0dTyu0C`rR3J|BVzcLKtPyPsBba*NLu*_ydXpm4*kqIN*xDFOOKY9Kj*nBd~WKQ
zZFr;i76uxi6HoWlQfY}-(&MkxR*3*4=4=Elm_Oa``9hfeFSH|7H^-dGBbfw9norG1
zXXz53xzwjVy32%AgZfnf1Z<MQKXMc*$aTbJWF=k@3e+F-h@Fg5|0gARVvM6ZllQRD
z-A_DuSzZiWwft+i_I1xQ?<fi01h4OODt#Y}V@VLSDk151bhK;Gqu{t(E4H@k>?M4G
zj8i2f7+No8_}l4JMyQ^f|E{Tx)~gZ{u{ghRI!W>iG<vs_yuP9{+FedarulXS+7%gt
zl^ca<ClFOp%B-`u??L$^F5tZfo(y1T_)*7nNDR)ip@tLnuTWK571-z8=nmESV8j}W
z>6e}*oIBTB3}K>`GG>@L2+oG*Z88@>r;1=mhF+^ohPwQMwLlrzA^%I$f6QB5A_3kH
zvf0u%ZA*d0%Bb)mi>>mnXh<8!+6a{90h(RWqv+CYLUL18kl)MchR~~U6%(fGAr!aS
zTwOG<SGyNi7uT`v=O6}^amb*f6hCal5AsbebzK2GD+?lqV_|)SbSVI+k5!mP7V)M!
zQ4q=Kg+$Me?8I3h7~p07E=@T5L?GEZZMee^g~<|=m<cWDnFIiZf>z{h01LrxGKF<4
zVOE9MRp=8GWLIi8HdfE`$&yyxH4B@GaHqw@ZB3Z~qe(Y5t$CQ4Uw%9SrlwjY1tlA?
z@rY*n1$3O?wOWM}VXvv0%ZQpnYSQS80vPc56q)7nAM8jreOD4XNt_up{Yr9k`)LH6
zeyTK?G?H14`mzRRzm*mL)Cc?%Vw;S&<y#Z1cs#7T*k^?n5}pBz5Bv?H?gQW01pV>A
zLQ=x{DyE5q=DEAp$6vZMSak2FxVfVH=q>t6+!is7-Lk;l8wWZO2#fI=x+bh}8R;{e
z^7!+ban+DRVlLtJA))QmM@*bp<7**DuNjVyS2;Q<@B=PqB?_t$0Ve&@lAI_f`gErQ
zibAIY9WX#7)`Pvt-NC6eAsL9I^O|IYN(99?4|)so8v!q#AFQ)C<AWe{&dK}3P||@?
za!avymjmdrsR-Naw!sNHZo#T^<a{JC$2TbqAMWSAI4*Vq3R<hN$AMLRyi9*ghHLB%
zJuDh2#B>n;-5|5C^uVJ@5+Rc}x^>j2KJytGyHrWJy5L6p>W|v&a-TdiG-RsH^?E|A
z<k>;AnnO6tdFCKSNph}zKy5>YIykKG?1}pnMLe+u5Tdk;zZf5kwD}O2nR#K<hJRoW
zpw4L5$eFZo1cO_|kN|voj%I5wS+cuAy?vsXu7Os2>iZ`oAEc>58-q?j;8U#QbJrp`
zxE5@Jh@y>#U$RDrGF`^Db^1r?S;K~hTB4PcNda`<tj$d&$q=cn)z(`Qn=vgDJsf6k
z4SC8DPikswvwj{#%PP*}=+2f=szpH2!HS}13;z}5krQ5Sq2Zni#d>RxM&9J+?3>e^
z+O5#`eY{@C9AkFwzcu_E9u)AuE1ax$%o#=(PzRQ;=w~0Y1p^x++D>T7GP#xsRE=ZD
zc)9{T8_!cM8rOPYC?ixZXfuSuKdZ+_QNExRLW+G-(1oUnj(gNbWGAk%w)VAZ+fsM`
ztx;oXJNyM+m2X$K^xUJ|pzbN3<#lWH2FazDXYB!uJ=<79oM2_08{?<g=e_C1zL>J2
z(dkXJ&2`{75m>y)j)4@ae8Y;L^ckOsT`9#&>G&;{*4CX-E)J5G*4ws_g+vuqzttkX
zpXjaq@#E04@z)wTn}iS~aR5*;X(S~p3YC6GcA`F7owDoF`FC0mh@PAg4~ZthO50h8
zR;4K_VOL6`H$E4ZAlfHA$JFsi#n4yQvzVPJAEX4V2jw7|aMRqh2A3a%*cS93DiSHL
zxv}*2a9#(yuQW6i;3}Sdbb=kP3&+k06&6T?V{MVw78O`RxE(Ry6Zkd8_|lj{<sF~b
zGO;@Q{c5N<Vf+Grx)vKr`Pr|_@1eniK?Cd63><80XlN*u8Z^AEmr5|Vw_h^)4h+p8
z$#d6KMiw+;WI@_Pik0<r$mq@uKY#AfOkRo|G7$R1=g*hc)ie)9$a4?OIubt6O5S_>
zbW6ZpCb!c~w7}9cRy)r|fqD<aEvxIpJQAJ8Sv%~U^Jfq^)G3{lkd|$c|6_C(n6{TK
zo#<TKg(#bBGg-FZQ5m~6%rT?V35xz6KH292xmx=!1wHrXTFE1#plKbug|mhl;>MHf
zi(mo2?{qQ(0?1~l8O3BJZXpadP)pv9l{6SuNB+S_Tbuw4wYd-Sxv2eYn#?FQJD^a0
zdF&P49oO66#_|tT(H}?<mR@q%%(TV2y4KZ-(66-0!tZ7kO~L(6wD$`(S_qnZ2<4N%
z`o8QNy3)d5u6&bdL=>vxq%VdOL#cIb&U{fNTVK0woh1I3%vE02+W*LW;g`l{0JMSA
z0Z;23u&fgaYsh&G9U)*__weA0F2_UjY{bYav^{`&rqNoRg--%^Ksa_AJPhI6@bv|i
z0O=Mc@iWPTLJ4N=DBrjY{g`O!g`RBBp-%)P?+ze3alYIG8x)GHuuEf#KvdBfSa4Da
zCw$TJnRk#V$|5<Kznxhum2Z-jxk%2zKL-?|0{yTRAQFx+_CN{Zm?2P`^jnbiZgVy`
zg5Z;6_6+LZKX25iQzRy&pMtgTzomedcQNfx>&aR7E)CJi)q%V1(%jpyuJN86MBeZ}
z-U@7wDwNG|5F;)!=)upcDk_4uw{GeNekdkVEbtIOJf@-tmyO?)|7r%RYo3M(rm{xD
zhB{41Gp^oxNDt~djIe%^ItY4zT!aTSZ%m)%b(w!%0TC-8ixP?y5@$a4kIc4&Q7nRJ
zdGe$)=fffBWLz*hVZj6jW9zj(yeEl^7?MC@ib{=C-tNrk4Unaxl}xA%kj=0;8ywEY
zjoF|iy-&4pvqeAIU^V|K&<kJa9bN|kqO{K(y6p%7f<op<qBHm=(~)~3eWqNCD%_)#
zDCSP1uKjO<6mf@VBAl$K+unSPeEe>+C?$W-{r+i=Qg!2-)}OYP({*H?j2YQ^Rc9sv
zpT0f5Nk%8*MjOL+VNo~7SIVuavvVz8Q`g)OR}oWS?rJ|UqDU|H*wL`!urAs!Tqpi}
za8lIhUX0xubNaN~^UU8ZZ+E3;S?p=2A=B-7`s{80WR98P*X`G{U8$u770oSof4p&3
z+WGa`@7<{YJ$5bK(;hvtdVI9M)O3qQ_cosXvU6bHzJYMcRy4?Y&)#+eqFhO3%UDH4
zLrB`~{KS^MG;wP4E)G5(6rgKbKlf}MkyH}Sb2Fw#@ob5G0VQfI2vi9kQY-bR?P4~1
zC6h>B!sYZEI@ImOlgT8(E}!prR3Ni(1IZ~$+8%*4n6%@rM`w)z#}>fuR2RobAf>tv
zP&<CyxP?%V)upMeV_Ox8#0r$VZoFQyE{K*&*9e=u4*qV2p|M$UF0*Z;V$WZ`e2a6j
z2C3Q1QU~$IE6v;tNFG1wXMDjY+jp+}8q7mpfYrM;KX3hgD&e`^<@sykHeEv0ImUEW
znvF`r&A`b6FH9Ueb^-p8fgE@ncHf|t#_Hk0?l$)J`OL#{+puw?*_P`*j$dL2hZ%-)
zRk6E%eLdD;BV96dnSdCcXCrS^{RAMSZ)CKiuvT`;WBYhp7F%L%TyZ`7m5fiDCK*)U
z%jeD82U_#K!6i#;HStQ8jt#y+EZn>s9)C(SWB9Z#-Ir4(Z}Kd{IQHr5IvqOFNxB<?
z<~Bc>b0y^F&1~jm-M}!4Aq#F&s=MxdbWXy>L5Le-SB=w4>@`cOWUg_|_>ZR!AI^rm
zb*k{)zNS9*$noRyCh5`<>dTj(q5cVnn|fO793Q+oD<dO^ed%@n{0*4;RbhtF3wEtr
z7tr#<y>(@3bFD)I^o#f0zRa?2Iq{l#GL7`-I?<Wy(W57LTkdoBmTkY9cf%e=WZ0QJ
zeDJ{WVfUGpSfnY;JM*0qQrAz_ltFoGee#naqwF4(kW+>P1l)hUQJXJDnbNjM0-seg
z&e0s?jmiq^Vgdrx$$1+&^&0SxRD2r=C+U=<JJT9yIoa8Ipt**{c6^Sq@tSC3+&+Hw
zw_MBU*reT7^a=a?=yvE~08!bRXy$^&a)i}trl;8Dt0)et+llvkHpV_mySfGC6n<h3
zU`~*Mk<s}0V!A=^c=-LQ4vm<c=$4GlfD6+AlwYS{tf?CP)Cpp*{JRqt4b2}n&IN_C
z-QIMRn@gT3&$@1=O~%Xd-S4MB((k|ha;yo9522R_H`?|0_MhaQ${F2OgOB*Af2U;4
zL`U<I#pX#3JLa$r52LKKT_S_<A_ptvdh&G}0EP>5#%)+U08^T4!opbv8%<Wl?Vav8
zwc00q7$CPQYW|8Qzd}uemdD2X+Dc^mhL6v@%-9PAr9~flJ(?9jCvD%&M^2nD;1z?7
zurBwJYhFEh<-M_^Mrqi`Clwd>Ni4B<t8*>Q!N!JIJ8*M$6NdRU`0pgL+q2I8^i{d^
zvBAX&Xdl&06`YvTj?eyLk)e%KP*m)oAB&lRHMxOUj%WcC7*lDl@?yBObSSo+HBX;C
zGh_)-U}yQREwk9Ve*M22V%HqSevIG{!pgHDgIC)8)&p`!^VF~~Y5P0xu6C5B($~}0
zKANDoH*%iEZ(YaqXD_hlt1q&RXdj~{B#=ol=L7EB#M-b9WT#9iVAXDjG^#Hf-g?KR
zpt#0<=mCQg(<-OgmzM;g00<^~(e+g_;2T60ZaRAW___457kuHOm{q2CHm{3smk#Nl
zhnJptGQ5k@w({dC4Pg$}U;(BrKI9zKQ*pDq%1@Y}1xjqg{FRNbYMsMOlME>U3px(p
z<xdxKk8!q`1*(BrCVokiW>@Tlvkkp^Rf9njk|PhZiwo{)FIr?3Z&&en(EV=+1^Y`&
zyRJ&Tz@=<YZ88NE<l*bU=2S)Q^FxQ$9j^yaoV!<TmZg@KUVp9{loKqhQm9P4dBm)j
z;&(My*sAzBE(U|>F-x8L;~F@ZA(ev{IeQq&R_7`wTlNRFIZrnTlC_EwS$5JSeTGZ1
z_w8GJ8UI@&1b$-2HNTRB_0unHdh?_H;qaDaKvUK9joqsQ5)1`@o~W#BLXtjow;+(5
ze$`Esk<p|zCL1CMR}00Vvo_!B)*Gt4<dduE)~!(#hk<l*1;@o165nZsE1AB<no&S5
zgtu7S;2)=_PgGH9zFKUep|RIKu}X$^wU@U(Dp;z=zdm`Q@D-pJx2*DI90orDbu3dQ
ztSSf3VkVd-2l1y^)Aka-O<n6o(qwL#k)Gk@?HvNET}hNuY|HqK>B&5aE*K;v|9BIM
z&{CE~kGRKD;c(<ZLjwaZv5N8aUl@`6-CIluO7GFNE57yT0Uv_lC=2hF)HKBXniLih
z;ShiH(^goZsv5J_M6nh7_v?2to~h?HEL1wA&Iz#+NI)pCDR#E!Gd?8V`q9de@E#K_
z%KrT~hJ^duwa<?sjBQD8%8$5JJy5+zVxFXuDgS*VH`fa*J%`=jd}QV1^4R~hvv8K&
zpjYknW&M=~2G?mv0V&hw5-wcM#;*{FCTMw}W)&_d1vI_1`EL5!K3=SJ9(0HOlsg74
zbaFNjMr3EsT*-YDTMQGpBDX782IyX%6<N}R=1ZRz?#(`VHU{?g;WW~?JQ>ikBuI~G
z72+UH@I~lU$&Qy&a=a;o-yBYmk56xnPzG>qVZhx}HXtMEwkDO(40svVDS()<6u!1+
zr}oD~@)T_ItxyD%7rB)FdbYj3uC=9(#Oz{W+74=;#(4LpnZBxS#$bgj@jMFp@d1IB
zQ~J>DRs=rDyr*(#Zmg+;fR+q=&ZoaTaAYS(?77G5r%iMu3Ww5wDmKi^1I>QSh^s`3
zq7eTD4Mf$<)L$oP%|&$@$~I!y3UBmeedI-TEjzm~0+AE;Phw?=JJeQYa3Tkjx4c$Q
zmHN|^v~Pk-lavQ%=rASbADh^xRgP=&$}ZlxAw#vOgMO-~(SVU8LS&xh+oaESGoBt|
zS!~iZPI>jMiE67?U$#%GRgCWb=`l?{W*lzz->$%&s%BlBHjn0nF0WEJ?(3Ni>A17W
zr83}<BYnXcRgN~KmyMscrxt&xMO>ppH)k$smCdjq&f#j%+g#vlYUvp=9ne4~>W(k&
zZJw2R-Z&!9cK>4$x5xo3-?Z?9mGn2S#4c21P@FZ<#it$LZ7bL@cM6FG8tDe6|E?Oo
z>$Cbj+ww0v95dwND_I|kt-FsunG`lS>~4g`2|Ec1rJAG9j72`tNY_;Z?-&{!tTpV^
zLa;W=sfCSZWwvE)^X|}0<x!)#);dPF(wfm#e;%e=Q*mhzt9={mOXjj8-s~vro^^S)
z9-Ny#k(|tAw(_%@hLCI5=0gVMv01WWyHaar9`&Py?-BZb(tu;T8$!({%$~i9gsbMI
zx0~;V=tB9m_dI$OyRBx{>vfb@8!RknkzY)@#r0L`-$Oukj>L}&(0=%ZF-X-k{wy^v
zJ(P2oEJJF}j?WtoonlEFCpm|~Wsa}XNJ`7FUieB6KI)vWZx|xq_U&eGf4yblsaq>w
z4TfT_{Z#2rz93nAEWIq@*1bp-`*@Sm-{mtE=v-k|gbm6($*|Q)jYi^8(L$be>DtPu
z<{m`<rE|XQPid^4m{9WR*p=J2jl(P#?Nk_Cm$wo~U}5XeQ>`~SDRd0?JC7rG9^PEJ
zu<^qb_h7ETDM8L*rJs2GENl5<b12$1Ut^<<CM7a=pn){2=0fW44Qk4MwBu9(=H`Q8
zDWK#NDHY8cM^BtkD3};&92OE0QNSTVxqA&2?NAjLePO`6&%SKi%@6940X#}}XX?k>
zGbc@2Oy~>Z)@R1bnxsxC+N3BfJsv8$u6*#bn-t_NJ4Lx+u*KrHXC}L=1bsIOGh{@H
zIuARlxYoC2*HoFErgE(De7&VA*DtJb(c)YXSyqzc^nS?~BBvCsSUN>=PiScJNr_qm
zO((C1@3ILc3e32%dv$x{TF(#Sbr{N7amxM)F!fcV$=5KZj#hFYCuqbxp;-U0lJ637
z?wb1@hv&NeDA|}YuxXd70&PLROu4?cA*aI6@y6&|VsP_lZY)UcG3$%{t~a%X(F-Xz
zydeN(mRz=)tnk)3DN85g?ERs$-COR`W?5A3Jo;iH*`2b&o~b1K-1S_YDajUB?#{(s
zfXZ%ouMY9)RynqpFFF+VD7|@QT7Hb}rcLq>4<n57RdF>WrZAWb|0cS;t}w#OSLbBB
z!m-DrSf)r-xnx@u;>gsOAM)}*6qHzYQaJ{_r=r0c*iT70t77w5<2;J*6N6{mC~2yx
zuFlNAd=Shg>(Y#cGfN!xf_9HHH=R2=Q$GLp*~yE$44q+locc4E%BT3XGl9H5qO~!C
zWXAG?kt@Lt<Az%uhwV@!4^WI>M;6N>2#JZ&pZn{{(l*8k<rNh*sVz0Bs1R?#iW`xo
zT$m$6-1Y`E_>@>Pc~w}x^V})Nodf@@b65uF;p3E)j47WC5kvp_vC7yugg(3(nTr=K
z61Q6(haC@zMig<41@6r&!^~1~v(H+AJuP`0ZZ6_%;j_p$tjXpZF$`>D+}UK(0A$Km
z;OJtZWEnnI366|h4&_~x>YgepA#BdBTnWdfVp|*M?M_x-U6R4KUP9D0uo!9xN(yx}
z#j~Y1-z#-+mHTJul{vxmCNG8|O^~g~!*5D?*K>vecz@!=Uy@vL{l*O&RAXSzzQ1M~
za6KF#R@5uC_GYLSs_MTZb-pfco2W|eeOK7{Y=}7WD6g!OiVMhh8nWm0Y`@8j$`oe^
z&y?@65mltEiIrbGr%ijC()3Ah$&zPd%{uOjw6v@#4Rl&3bdB=(+5$yA*;TiRqLIT~
zBwi72wYK6dFsjZ}{Hoa=d7^6r(1OUuB+nkbWf;2Z%&(HV-7C4{ighzo<4yQq=NXa`
zD({Xy$m9;xh|%pp%v2Rc`#r-Vqz8`vSrf*cCZyeLY;;li{>of(&yD95Vm*?~G;ki!
z@U#RLxcphbi|U6)Gs6-Sjahslv^&-xdB<AAlvx{kGP%(yv0vHS)R|k9e2pX5AD}@$
z0{L!X?e`^6m1n)ZU0x`vBp=DL%o%-Z&Q%Wei_xDPs6Mj%=B5_Jx3?~wzQU8aFK^`J
zEXC-Pp;y{f$+80L>+9`zPO8#aq)H(h9cw+F_Gk5CdpMB*&``LXB~Y{ilt5F1+x~Sv
z(qQGv(5R>tJe6|06DLlTA5p`kBB{iFN#Zow_i$R7M<4J~GUc1Jqm?&=-+d01P-Rlu
zc>eC1h%59rc&J6Eq=+(qId?%AYfwBssdDu88<=*@o9PooZ&sAHxC_C-RgOBZne08W
z@izrh9{gf><)cO~syVCZ7L$#uPdqQl7@VHRqLgQv!Ujvrknr#&<gof1+7z75m!w@I
z5Tq=SK_cCy<SV0wZ%0#*$moYyjyI23SGKfBiHYrDIj1<t071f}9quk?hvs7s+vfYa
z<F~_NML1^(TxeE~8AlkRG(@rED{`x9N-)XiC9huHWJy%>GcF9EhOIt#=~6J``9yWV
z#YjzE>j9A3pZpj`&{?@1nO_w>A$C{Xv4ay)N2%Gp%uz{!XF%4soH37Au5}IAgCdNx
zm;eD@%wFwM`Pp1QL(0oM&E-7*MTsDZ3u-8_j$0Lc=Z+DHXGwm3fqi@i{*ByxDP%YH
z$qj~CjG?|fDfbNN27Z7#$dM8__)j>GGd8WRIMiMG2D53L>$WTTmcL({S~`VOkA_5R
z8ccj!KI$x9tiNQ*NhtA2sihHQxOh`6BbBE<6F|zRJ7qoGaKyLEKP7LR6PI4=Yzspp
z@}?MzC!UJw=42#^xSTt+bSk&a{GS%!_s-_(m8o4il^r{F>`&AH_ArjNJV9y~&XV7@
z-b6=pY_;k3k_{btq}8mCdG0k?-}H@hv7K?suAPU{hooJr6i<mx`ty$Kl2k;!5l7tm
zP+ndQ0+2t&{a1#^)2CSX*uB299C^Uq+zfjTM3FCxl?}xrntOa(3O|W$-}?J`Yc3tj
zHp4Cy)jErCMApKoa7Od^pmiN$(jrYVj`bcReZ1N*P$MxV<xMwdy5G#3tCqUIq&#i&
z=0_7v%A$<M-st>J6IZzn-@bi|-BPBRbwwrVVQrcApc?16^Or8|TvfLdbxbxI`Y97W
zt2iKyF-EYp!Om>v>&^Ryxm1oU*^)lgE-nJ8gcUM;9`G)oCzIJ7E}s%A90eJ+Cb{T!
z%C)Y2r+!LlvgiB^Dw(=6@wTsU#UMj30Gn2BP#T0*B}?mITC1yTp0|jKb9P4Ar~I-^
zCQB)I4JxfG0QeFaZsmc2Ybxu%ky|&A3R5*tAEy;#vG}xgEz7zQRMZKldFQGhxNC1+
z{o>NZV~?*=&Nvkqxp?7dC2$S$yBCzD*5T`{83g;1d@;$XeDnPI`>~(6P9vA|o%2%R
zt9R}Cy`aFZlM^^|K~MLe^TbmjMHy(C6cImOR`w-={Wfv;l|S7HEC|<iQRpS_IGi#X
zyQD>)gTD{%-$gETfN_hh{sYtvAXgw>qE?`Bg7j$eg^B)qAJ#f|T<n-IgoNMa{h5X0
zcw&&v3JPq4drt}=u}QIF9`38mICJshO`@d@plZp>-HGw~y8`7{?O>|yTnF7b>hi5Z
zX|i95y~B)E*_0J^KRW79Gwnf?<d6wv8@4u=wuX_+JIyv?H35$-hien(QJ7KWDF3Ly
zvD_d!2tAsUkQ<6hP}MKOWa5#=A9WG#jX~}!D7M9D%%O94-_1%q_VU2ldu-gIgl$1g
z3AKJRjPQVffayg7RVBODb8$&rRp@YQLy^l@?HxPr*x&u~XG!UlVH25+kR%2tyu5;f
zQ1CbMIt0}(#cAkM`dgC%Vh?GLfr*Lhg0OE>4lQqA?wr(r@L=okz*!))fPYI-`+Qmv
z@68xIFp~URk3MQw9EXIp<|rQe29KLiapa0rnKS<!00x5)CMYW><&+T-a-rlJ1WbDS
z_A-tDrQN^ph`Ep_hAJC^n(EzH`~nvC&&Fm&4jk%kWONaQtUP>(z+=#q)qQq7d-g1_
z>{by9#hOsdc+E{~X3SYW6mq!xuM6%k*iY}7wKX;CXVO&J<r_0Eu4~!k=oljqC=kO5
zbLX1zX<||FgOd|(e1I+zr+{5aq5&A9h>xWCp^9bX07(W^>IG|Db-}@FZo7o5XdZCz
zQ+1Pv)|0~tT8KW|UcZw6`t>c&ybkRyy*%(ugOjfje8nGQI6aYi@})mGCsEgP+Muer
z&K}s-d93-?)a6x%dV1!Di~MG7(Lg?vEkG-~;lub5dh%>3t|ikSK$uF+-g=G1Yt{9F
z+}vwOjjP#y+3|o_8K36l_9m_7aPT;~D{w_Zs8_T|jAv{-$?+KI@#8M&F<6c@u!J#K
z`0gJ`rA{tQ0$@T9<@%z_N0jEQK@H+ES(AHXj$dZ2Z9{cld7X+=wSL&DXwA8;zn&Mz
zY;vA8<SA<h_eDnz8RwnT87LBi71K4o%ePiA_NbDl>D=2YMrSSud=d@69P~~uUqc+%
zwcf9F_Hmyp0uB8IMJmtN4{42q?#vIXSRg6Eh{Ka+`@zYWD1Qs#n%Lw(s#kG)e|_C|
z-$@!vLlLr31R3C+jC?yp95Tc}kts&l8yvOu;l;Qqi`0ymM~OLmeC_2VF_NU`EGufm
z7qE(*er|s3V%~nI4y=Q?6@=&jI<-t0maVN8?=cal$Utz!Z>5x>D8iR;8zCp$Bizj-
zHe1JB?5?7jx0bKWUSWvSIl=+Rm(zsHRf~4*x+}nAGKgx1!RF9@X|QTlFx(c|NQcXh
zW<FRHlr!N}dLR<->i|TY5TfYI3hz@p&A1Hjskv)coJ3n^Y+PjUtDCTU6(}pRah2BJ
z-*|?n9O>LAb29t-GyKL4O~G-%xr{OG^bIw245pm|-qj;OfRP3fMr$|`ZLF+LL)jqf
ziG@sV0Q}Z5KLIAoI&>14Jz$ELW5CyMu#QEEPN7UrO$d#1lg_i~ap>WuGxoo2Cw1;i
zKEv%0l@9&?PqVTNTwEGA)cJ5&QqyRP?JXh_#hyADen?oD7k#0G8W36VF7AMJ%O}H7
z11gb$wc&k$vywg~>Uh?FjDBtd36=Qayh)<Gs=2R_30E^1Zd$>4+rf{x8)g`J-bUUk
z2EB^pgrt_iCFOD0j!xz#GStHc7p%mZ@Fb<YU<(p*QTM|dG;k{2ymRN($D(!ZGSU>w
zzU+{9tX#&CTulYPRms<T)|RVOLg5}y<wNtt0;*a#xA9C1C8ex_f&u*?%PlR-$ENq?
zN(J0RfsrFnn(*y~#+H^hhPi&6MtBv_V?OuT0NJ&8z<GJ?_kY%0#<hi$Aw5n@Lx-e8
z&X;WCb-?sRcD6qm{jm;FzQIbdMw8HqP|(?I+SKjlt&~Ar1PelhQ;|fl?2mgimDtB4
z0uSZ$+r^m6L90`;UZX?btxUSKGOl5Uo2nmrW`PTUPcr>V3+F<C&ZNyu*-}`ylenA@
zJS<aHC%62przg;gMDxSBzmfl5ScBcFj=h-xG{}vvlNRf!#z(pVo(V*JU`a~1G^vL~
zK7rh39d+m666oWej>EE{{ifJ{jh*&xGi^dHY<-LPVv4BTPb=o|cxg8(n6mpz*JNh)
z72Vkq_$$HC#3Zz_vC-rC%+NepA`^5wDuWr1?()PPVRlR%N6pjMKPMI|vU86odbI}a
z+e&PXJ>72g-kd7%z(8?i_}C|EiIb@1dwgmYl;27_JG&s~Iip9765oogCOcz>9z#V}
z2+2z9JZ_xGl9@qLW8yYSo&-54qvN3a|NdxdfavKEbI+UB^+L_gotzLjcw=AAh)RDV
zYz+~j08v9M&1KF7a@N4RZs*RNS+~1~_y`hq5T^25Hf3&mdi9lXnZnl*6eI*nfgpgk
zCI|wkn!8jP_vAE{nD<PFD+1;os=CWa$mCe8j(|xXAzo-axeMEpHO!NrJk$TcCfcU*
zSdgoXnv0_3W!B>bldnz(GCsUyxhgQMt@c&qJT(4^P(2~Ua1Og*+6VRQRwl2BU{%#a
zyIG5k#72;(J4Zxn;yg`fr&s9d#iC``3Ex(D1{lc}LP6l_h}WiIDM(HOtwW1}lgsdX
zYxkdXPBP^euz7hw+T)vJj-5PdNQZ6P$+UrL>*^ZNw8HtD=vLL11<Q#KrZx8^F+9e;
zL<zcO0+2f;CY5;(Usmupm+oKc{n}Kvk{lvOgk(-tG7Lg;LPnX+(xnA}Tj6LP7*DuJ
z7}3qx<L}<u(s;k;Zhz7QQGLHeoo}#W1?kY#$FSJY31aJVTW?IBqJ>>*xO+>5dp@D9
z0n|YhPRw$LMaUL>Q?Y%L?#lzM6@nFnIV!rnZr`erg>NhlvDHc_0?MB|KNhd7b#ON!
z0qEYnyA)2|Nu?Kn=P@3!CUq`Kc8$jxA`B>@63Vi?@$kz7KjOfAr~(&>92|ny>&lfk
zw%>Jz_BaHv%`#|n-n~**kU06z;6WI%r_C1?*F!NK(|LM#3OUmC$T)}6aAc_$<J}>+
zTo`g^uzYy{T0K)vDAVdI>yIZ8-|0Z25Y1<CIyKf?6Tf7^X?U`LIa1TsKiV**1d(<i
zBG?I2raa&E+;;FV>Eb`x`|A|ZLdKzFn*aUJ7ZlwP1dx8W-2^Ksn5}O!&_{r`d*OhQ
zHJ}jmh72P#{7zpZXXiM<iXdG|D6t20On&%e6kml$+vwg}*J@2hP?X}9gx^vq<*Ij$
zwRCC4)`ZOULS|WYP}+VOYere&2Af@Ea-?YmXwY(Bzn;XFAscU}@5UZiSoM2x;V0_+
zK7tqE)DUa|UGg`m4Fxoikg#e=P|oGvg6ik!87m{hcQ3k&plyMsW}inAzGA{Km5~W&
z{MV*t@HrQZz#3vB+)g}Qym~tr0Uh5?6gwh;hNUvty*ot^lY^r)Gm4=Kw&|As{Q1-4
zxj%!$?5#(4_22&M@s!;GW-VVYNXc8B=EWi-7sRi5)at`ue(e&wNw7g;9fAcESQzZ5
z1QJktj$eyD9S8#T1hxtj&~IgA>Z*u>P#!RS$#!BHiMvJichT!+T4N5$uANMz(vThk
zWu5!;eouKUR<s7nHUQ=b#sVM3rPw*8UTw|;*==S_T7hd_au9q^wOPvtGe;<C6|w=o
z53F>Og_`wiB&Fa5Fhv`eteu{65~vcE4I%5FFqrRhezJZVsp68Jb3S{lvX;j$uo>7P
zAHxJ!i*Cpf#LZ_+y%&Pln6Xm;Y3e!LLV#Cz!1mjlez+uyEF#Y)IiyVfyv^pMl4O}3
zca23j5`?6HnX7Ak*#f}qnbqlJ5DP&F6W_OrdF%DLdP}Pi`Sj>??0VOC4b+50q%PzV
z6IslYFJ%07w=Hapqxj&+A$3qFb{{@#vzlOA!Ogt<{7UV|D-+ZN=tZhlxS{!t$f&{3
zg0KoTqh~z_u|{)-_pZ%jxo}6UDHP=89q+isrq})O$vETKkwC{<=6s$oa%8~D%S{Y^
z7b~*ful1Los5ua`*E6zKXk_jWlahF=q<(SdB}}3bH;Lp8z-^nZf_byNdDc1BG{@v#
zoww7QKOdT#+jr@lb<aG8LYb-leA~96w)jGAmx5RXD3l2Tdm^S-9ys`A_IpRAzo^Zk
zz1`j_y_lXki6V;|(eSA6(%ZWvUF5(-fY+;8J`6QDE5H^2`l(iBUtE#}+xAyFGv_~2
z{^|4QGfy?K!r@uK^aZnIi*9{o!<gvj6*K1TqpeIJn<6yUQ+<c>7&*F#!CgDan79ue
zIB?vl^qh%H4w{97XEpyjFg4q~xxig3@pkGqafFe8;b^+z^uj|m>Zn!((<dN*9>_{-
z>)?c*Z`jEZ1-85GR&$T|HP}&WV903{G6~P^V>T;J79nrRl6=JKVmIp28CCUi%p<~l
z6$FzY?~B`cd-j847KuCfuz<Bl6oEHkg_xeF5B$u=8Bbk?_gp@UUVqJ+H^CJ+6};@Z
z`*Yi{)d^8KWp-f=uESg|z}7)Ts|lz}=-{nhT(vhl)nCHn<m#k_Gt@0(jX!+Z@iV(a
z_*D?cX84!RM*B99GUDFa?$Vu4005?Xa)FW(R$1#KZ*uks#A&ZeKdpgnqCloatps1^
z*)nW10~^G29zBW~k#*4x1$1WGFbyo-qi-Xet57>NyxrGxO+&>eU36Dr6oB3Z@WJwA
zXnPme4+4rHS+pf_Vv*;eVL|XZ;Xl~`wGl{dw>xXFBgO=TupgX|P`IgKFnKQN7wP2K
zapTY`J>FDynS_YIm0|yZiHM;HoSMK88lkF53_Lw#U&4d~NNWL%SBGEF<R}-FlD++>
z{;5;qc0Zq2w?4iY)y8<>VA)smBqdfK>NG>L-WUihpG(R#T*3JtU*as?Uq(VgqO!MK
z+a>K5vLc0Y2)Uh4e0PX1po{MR-rTIRjZ=uFPYovZF(76l-1A(}EH5q#O;Qy{BPtQA
zoXJgFF)f<yy9E6yzlh%O-n!()d1RByt8@BFNW9hVZP}(WwxvF3Zg6<`v%9Kb3*Cp!
z`5Cn_xQ#@wr^egm)+%(C2=E7+B$HaUV8H>ft08og31F$iP7Ad<FHg^_0*=E_!sXq8
zf%WdKKip*}Pc|gmP^$<$772IKKe)Ko5J-vKkRw|V58|G3(Xh2EP^uWToRE;X*Kl%b
zU$qEFTBF>cEy|TVJRQ+dAto4gIIxYx+aeAB&Toy&e>CADGR+O8eYqXJQc|>R>!e6r
zpc6o-psYyq1nkxDOu4QS!DBu_P=DphAn2~0$!mV?N>QxYfz#4N6nC7dh-UJkR=i**
zVe4o6K3m&r!!1#of3dUwN|CgXPi39G&Hl*)tFfBal?StW9J5b_FiCJ<FAsF7$EC=M
zs$PBA;LWw00brF55qf0Dk7vf<6^=3z#_Oyrar@`bACHcMqn~;<RS^BRr!J9hBjNF8
zPjVlL<dX$8c@rb<Ugc81#W8adoq4csR}~3~>;ZlkBsA1HM1gQ`vFo%vNl)p3MQbko
z{)}XS4C|YA18b1z`@_0ONT^*f+Q;jx?ASFzMMSvTS<KWTtBq~{d;$IpH5al}_1B}j
zax4h)zpEzVyKVLznb|J8=I-|*TXG-82u4B40%R%r$cZPwmFRIpz|xn9yus5V)|ig5
z05XsEJ1zKI2^Xn#G9GKL-@Ut9T5;1<6D_S3z+bQJXUctJJ-1VxBDsGrb&4%!2<1+c
zkV29SmVaNmAFuWBAHhe5(rc@QW5;=dsROOHXY-yG_m_|`efOj3!;hwYD%~V1qqskR
z^7`$c<Q+0VLMSpW-InAnRv%i_QKDXU`0k(U)C_Joj?BFFc#?#K?>rBd&31A)A@+lg
zI5VcO)|m(*U@{(>`0TZRJ{x29*x8+Z=yBP=4!n>ZWqjo6Scw7>$_OEMV8q&V2cZn4
zFhEiy{pgJN&`twhN=hU@<!Kb@uVW1J#ixt;uWyy;ZN5$?ZVk(;+zwPz<hQ*Rw&(9g
zTol`0mR)E-nZC26$`R?vg{gfdJeFLQ?Rzg4YaTNJi;G)u41m(b#pQo5e6UwM)(O<M
zLRJUCcvCQ$kpCpf*RuGs621P#?T^S71YMVTm&gh@s||3aKIHEg9VKou;&YM02omK3
zR3K^|D!{cpyGnT29*q1RQ(ET`-9tg5-jW!Mm)!*z3fng%IpW-4#hURk5(N`ac95u7
z0#%4PBUVQE4B{w@zSEtoTIY$h94s3BXTeBk$M@#1J7G9}z-fl$>a*0q<mzqgk3l|&
zlTNc8f9*e$2f)82B-9InmJTQW7y*FXm1DP{`|0}cP4QRlq}PeS$eShNqbVc^B6=)d
ze4JEE&@Py(O?ov?tngm*Z(ZvA=SN)*2qX+eW1MQ!#~J^jK2H(*f4?pMzFqJC-m+4(
z|3i250TCY~Vl!)9C);uW|L!!6{Cjt*+3wW2fB!;4;?Ai3m-rjCPDs}74-?;$Kg5sz
z?<<HuWltokir@VCr_zfj{=Yv+)NB8J^<h%y|GoNNe>NUJp>h22`p(~b{r%a*_Wxa-
z_@l>~BP<avM?L1ESVa;N0Um$;!=K+vs6`O7`0f5d`Tt!;z}~+<FIn6EVvP6SmlP;H
z5UcaQf0FzB|9SsT%iDSXdBI!bh5voTvB!Nrj{N&3j$Qxn?>O@Rz2g1<T@rWn&rSV*
z?`Wmse|MC7($44a$9ezvalHaH#207d^y~G1y{Vnn`TqTey|n*N_q-D=Se}#htyWzB
zyFS95{(BbF+UWlI3=i9(|EHTGsc-)0ae6HKdz=6Ktc3qcw$tC=^8f#P<|F^9`;h_h
z;5z8<U#<M_TTi>?_xD3J{!h>K@6Z14G5-D2|K4BYTxng;|E-GfPJb@2SDQ%#bdt=(
zawqa-{2#yURT2OAr&Ies-}2AziPB^LJhBmwZd?AhpZfoI6Rzg{{rNlj|M%6~>HmB4
zb@zweYq$ZQ%uz=z2#g|r6>cgo2%92TC$2DYuiD>7xVom(3|Gc<Hq`?MX?v1-@{5r0
z%c({uqiNEBjQ*x$oT|Qm|MakQ@Sd12MUM4OH3eCy0}~*QzO7&Vqp4Z2mh_XZqbs?x
z&-da#UrO!fh|G`cSW=AZ3I_Ts-fyJo!GTfy`qQ^xlUF?{Y;-Pj$#6fRQ&Z!ca7t|k
znCwaj4UEMzBWj(K$mKUFn%pbsK%@tj$;-{X85fs-7in6%)nKT3;D<s$OWU|NnuiAu
zUakJ$Zob!W!0}o~UQj8BV{#Ib|7%py|4}-kstOKX?&g+UU0p4zaOzSjl>djS^MLDl
z58HoMk~9cqL`HT>$w;Vd*<_@YsH}t(MM>MrI8hRn*-0u%k(5G44k1O7WJJqoIj!gY
z`Q<$S=YL*aXVLHb`+mkfuKT*L8{CKg9a*p^{_O0>VJ|Cd1$`C!d9-QbqQXmRy7n7(
zYP((eWX+7E2VZ`*E3YmcAvXKYX4h5D1Eutuo*taNbLR~@InRl@2BOqUH*PFj()2`1
zDlOr_Ok2s4^i@`}sR;v#{H9;{`_Szl2RD~I9CYR-GzrIsx-Hb;bS7VpiCMaRdjiW(
z>Zq)wWMx{bx@1rE$j}>`LGwi7OdEr0l4@<@E%a_4u20}gKa^W=aMqpft9C8m-CCu-
z_;fw4{86pN!`khipWhr<cD^D$znp17kU>x6m+Ph4i<noxu~y9bAPb0J@rz{i6}%R}
zJ@tz(cHVP$|8pvFDLvA&)JUX3^%pN%)StZ|4KIz$R9i&hJivQN?{<5;CqGS^2=7_1
z*8H-;rln?YEiP9_D_wPRba!pF>9JPg*?i4GDix8{!*Zj|E>a2mdb_^j|Cr_6fjoh-
zPgg}7)B>1@RpL9TA)JB7F)lM~7WcTGc-p44TV!wZ&5fh4oTeVLsqnVBfDZvCNoDpL
z3?M@4r2T4jdm}^EA5})xGOLx_4it~E?)UeWlwl{1PqLjLzbRTAj;aAX>gD(>WSaki
zD?({PkVa#*n)JUri~bCfJ^C+EXt`ji(4IeEvVa70MXFnhh0E80;(S1&$TPnA*>U^V
zfLDpz9SdyY9~L=0fEU2u`SsONdgU0+=*=uQmks0%!^z*vEi|19&)<GpSu1IUplIg#
z{cmvtOYG0t8i*gXi4V52p&x<dk+Mb5>wS9Ejs}E0;hSjeOnE2yHY@wD@R3hTxD08C
z5P(?z_yDM2QN(o<&&(s#SPb{EYS{Q!|8W5bIX=-`vD>Ej=9X2DE7$D|{rl6f7gtN?
zm416tb0zWilV?}=-b#rXSv!%ak4|PaSHc*qmbq83K~}j_6tra=ERQ;R4t#IL^@xMi
zZbIC<6G!t}r-HDtC#KBsoG&UvXp3)P{yXm~X;uK=5Jcw{Ne<x`OoBVl54LT)JbH6!
z$;6o4VKKQqzi)TV%}-ug78i5BdiL&^xPrvX5xQ^Edo){=)eQT6`c>iPgcVQLo1A#O
z)^rmLIC4_%<xGw?kc&)`F8dG{mtAJEEpj$WYssrl{Pou*G#4wdJ$Qe0Y!1o2B&6i_
zvOhuPwjRf6R)K~xz`xBF@s7@r#stRBX(YdzOP8~t>5JeWvXLNKEUl{R?Y-ChhDyMT
zK=*?&Be$nS1+25%$rzZ?8>cs4Z)$o~_#!9Y%;)5^(H9shn)^mK{lhDdt6NfZ%Gc_z
zA7LhnqEydde-*0``x9!hs$d+mM7vO-1UDe<>M$x>^X!w$j^X#Z1roc2>Ex}NJQ$D#
z4l?71zP-%L8!WD4z`Dzgey7*A4Owc8+?SYy7+<Z$hwgbcdGiZ3*sP1q%!YyKP_c!Q
z?$a&aZYjVtuXwJb@Jf?yuO%Mp<q#be74-t(bl^;KV61_mnvzTZ;>{Ufevw*_%gYcK
z<k_4V*)Gc{K-`!(10l`8;G|2W+Vm60kHS@i^)Fl0?=R|hoozlG#XK_{QhaTx$$nzd
zVaa0ap~`SNzM(}C7+f-+ir{6_VUl-yu_DH!z1LoD5W6z-rlO70^pNFVr;i5)xA?h|
zzEexBMr4@q9vrv1df~>fscb~dFXfOD@nrb;3@%%_6UxvnZcyDzE8drER#-r)U~Z{}
za#4F~uF2`F(;;cvQ`*%^V|>QyIB_}c69tRcB4$jUu_XNMgu4q`5KO8$1kPe&{+u9x
zdDRUzdUzJs!Qy?MU6bcy_8(HQO3W=?!Xb<G71uSar=m?wEV?{<v|fC5>}(TZJZ-HF
zt#Fd}TXbMIp6Y2+8zTmqkKpD1I&P&AZNsXvO+l|SR`>?@@+w{pw!AiB#S#k(%`;EC
zmlg558}`tse6^R+ETARex-9#Fi)}uvmMrfHw3}Muihg|MWu<!Y;<pay#giy4Hsu?=
zzsc4|;o7#jlj5vApyaej5@UPGW9V!a+fXc6Y7w(k^Td9wF1<&kSAzP9FvDSH+0uT>
z0#p$m)jn?7nuM)`_t&UeCDH)-M1*R6r+Bq<mCEgTw%;#=la3FtQ7(=+p!LEol92V8
z(q0p<p+-X-+ij3Ue&N1fFB=R85O_bO%O;yG+S3MZUbLu-WKkaHxlbPay}t2spMbd=
zmIu#l300-}a<9f`$tfrd%9`Q}b$<X%NbLvx#R=T&`N+hvUCXg55yxuU*tJ?iEBEYQ
zw{hdSt8AZRkNOibEK$TT0tHwMG>`==i!0_O!^d`0Qc{nqt{j;h=cD1E-OH<u)SfG=
z`zafFKmBqnx3p7@mcv2P1HvHKmBO0TIPjBFj&1l+aw!@QvP$`Aol7EuOsj^e(#0m?
z#R0AO8#jVmqGTnnU_;Br)+;1r&bF>&lcZx`i6s5XQm^0=*{GaZEw309YpL+w<)c$F
zeK`o0Xq<RFb*+Y`zeq!$b-mv-u$~@)Yr#{OGUEx_O-tzKH9uIjEp;-YwQE6BFUCm%
z!7EEK!V}Mo%*#j{oqj2+$|a@Obg@qAd$n|A#Ey*VxKef#$1Lr9>?;5HJmn^TY(QJ#
zzKq&i*&y8~iEl$QPV)UY?jcXQm09tdQYF*Gt9S+5C+bU5sL3Mrxg6|&c(V8mJ8_Go
z`Z_AwNP8%)hQ2jak}SRNXV-n-6|FZ};v&iPmIRrD^ERwKO@HB;-8I3q<y@rnOHF;c
zV0O?QzE|(UV<vH2f>XZ~Vzc1hFk0^%6<sygCZdns=rT*tI#KVXRRAHv(zq{t{Suij
zpA~1>>FmD}){;cK$mn>c+3EC@6I2jTq$}{J2?<R(;J}TT&EyN|S=@ybxGDd>e(I(H
zPzWV#70`~scporeQGp&ga-R5}0>dR=EpW)_WH4S{yy;&`bHj(VytS8_OH5mj{BrqO
zLCYz%QF|zb`L>_XB9bpG-i-_!rew*FA1ki_h6~v1-<%k?^FWQFz*GBnTL5il0f#VI
zLqKYH^+c}6wVl6g41Rw+21nvs%@;t*f~FvzoR-iRzSZ1|I%13W>c#WFCYM+#Op@8V
zBzQSYcB&MAQ~BZ*S1^qnt^2Sqr(NgrjClb8U$&M5!vYzd6JL%}lIUru`0=-iXMXuB
z(eelqE?7)}pU<fi7nW~|ouH;r{()yWm;-~A{_x+Q<g}p0W~jZ%k~WMTs)L^p&44pS
z*%F!&I9x$-aZ+d8u*<I}V-gq?E@0rIbnhNIHYW2uZ=+W!3iW4p{qz?F>2)xyL`@RH
z_UlbgmAZDFDz3!MyZ_~1X<bV@S=kV(363Rag{YU}uPEhBm@wf|@wdNeZu-{3(wI-{
z1R$=<wT`s4i-xT${Q0TVr$+%y7c}))_~(14XZ4ja4f<N`LQc^4tI>8lu#A9!qFFmq
zECd`VT9iT2MU7{K`AyxD5f>pG3r6I?*~d%bpH6%sG9S?$Lgi!;(|{5uw3is&D@*Ul
z>jcnBE2=7#vz8Dd5PQ=dEGdqF>lyZ3x)fNaRPY{9R_xlX+Xd0}gZ`UA4;v<6mgX#7
zB^sI_6r2;pb8xoGSJo}*e%~stmZ4VeMO~XOX2XxqO?daCzERXR1%YD00`-}KT@P<%
zg4(<jr}Lq!66hhou#nsU-ylL2K6P&ITc^iun<loG9}A>1$lRnM1_mj!t&>(@SV4b9
z3)(u~OcW?5HhecO*!Am4!OuEs62kAkk62MLEVgjZd@=$!Se`%MB5jik=W>H6juHLB
z4dJBECs`tc^Bi^IvEXfp8$rroYG~*obYtj+SDU1V)ay$iZo$tK{YAur^n#f(^(!&K
zr@SmqlQXwxgRn?dYu4Hj$5`zA)V1;Rm{dWBf(kcaCgLQ|(Pses5^P}+Qqp$x`6*f3
z2BIcGV;3c2$ZXqsKloGrro_MEremarNsvRf=RLu5qjvp5;I(Y}LdtlUdCV!?62Tne
zP$fW~-G4m=&G}2TE%XL`7KLj9r^?aoBMDz1O-SnZ=qaH;vQ$)1M<b$RuWG$}x51{C
zd&80xy!vr2N)-D<UobXhsi3^k+#zas+S&^Qw5R&w69pgN(=OkJz)M^yT9rH%CaPTX
z4+m)#G<Ej38s9p!e}ChE2|xXS-S&=BW09m%`mjjIlT|GXdJS(58ZxJN(EA$BzJOSt
zd(V2zW{1>@1}SmC#BK@b7k_?qz=_8LQEj%EL~T}Z%toxby%Nr~ePZ<@kZSQ|Qhq)f
zMz6I<e<?7LW|DJ&o%6D*)`<l3>e~Hq-=cuJukGB;o<shy5HA4VRnVS^qm!!>ZY~^=
zx&7k<CBa!nyy!W-EirX*G<OTcWAerMf&TN6ffY&JUtiwQ-auV;;7mbZoag@UB6=)!
z2V37*DfqXrx<&1F;D7-oG`TEB#^l)w8hK0$wmmfveG^qQ{!DeR%jlyE>4QRjnAy;&
zjc%#CCaxNDKmgQ}HyZ}2wl!ED{Po9<Ymm7_|E8#sVeDdXK!fSnWa!31)E=ogf{Uwv
zHF|J^6WaK3(4=hGx_?1!5Y(%}y8B~*T!#{UQUXk=er4vuiimeNz|vA26aBz#PS<mU
zB-BM=f_Xc^`Gb);8W~WW3u(y$@o;heL}w+6&~fKF7JZ}By0x01bW-_VM3#n0)3tw_
zq-LIfUWW=F)mHv<tk_$s{$dDKSMT6yqlf=7LUenv_T-ZWY70emjE5W*MU}x5C*GcZ
zrg_%yT@wWXStMT21TMmTSVvc);fDMTegOfweEqZNT&{<KAnHK6K(e1cHn`VR=qEkY
zf7++c%>3%`?l&H#SoO3v(Y9<8KHzZM&db*KI(pc9j?<X<PxiG(Zv=;iY&sVD#Qd|S
zzM1_Pvj=PA75fhOtF27i&Yk~W*DB;)+(75I(H8FR-(SRE)wVnkJ>9Q%!!+yVNBitn
z^vPaY`Q_K|ZiUVdzE!&vIVJmET8Jm?*H5;coX%9mEG@=oFz!(bN+Mr26WS0+Sc;lZ
zCqvi_S-OuuLsDhCYpzVwue{|+je*gL#aP?>qU>2x?Shc7)MO<1NaOvH9=e6yN!szF
zpXhxs3DKKVa=dEC1p3DZh`a9S<RoUA!bPZ4m{9$45i_TJOf-=4Gv))J8HicfY_ft-
z^n_!mjpCI>y&dW}AJ}v$p~?)Ip#7)n<fR@71xZ_sFYSzNP}j&X;ca>hzHJ-k(ErE`
zAx)jLKAX|QFTd0<4>NU0wCS+RAr*JO{6fk}{3>@cM&I{r_T;nKZ@10=Qte_Dx6Cpz
zzo^;;g>QZqNl@u)7j%P7@{+dnhk$V8<jH+g_jmKo8k-C8&5|S=X4@SmvHO$q_tnwq
z0%faq`${L9&P0w%-hF(6`^kJ>9@`YQQ1gT_Ix72ZQP~ch)$ZW5{u5rUPl!Myha>8P
z4nOprAJpXf!#&E?Zjt->fqc?xnsf5vZ55b?<0s~c$>@ZfJSiqb0=!MW_`(X8vW%|r
zm=t@qhj$k$P}pi~no=)CMP*RgbD&me0t}P(<er_GSx@wITfeta)MxmE)$RgmdT3?Y
zZdANVzJ9%fCt(4k4KyEa)FTr`J5FxPd2F9@1*wnc9NC^IVvY~h96abZRInmyJSAmi
zcbS&t{ZAHQ!Uvd|-lkDg(OAL#`a)tU`hxZh$dsjSk1YsEUAulg-6r1f{6ckg^;R%j
znb159Uafhqmw|ynl2dt)1Jn9r{IcJ@gN-gmSgOcL1ON16-U*OH`pk=suVL;W%p=8b
z(^S@P(j@oE9vijLOc}m1Ns)Lq10xcPx8S&S0@u48Q+!qD7=;_Z!3|#bhkjP?lJ!rP
zYxbKr`w$GEZ658Ga3pn!9~&o{4h`(b210YV4`r1ZUI+MB*;=OLNwno$QnXa$g2r!O
zAJbRoFcUlwwj(058WsHP<x4L<O+D*ce!1PcbsjnEyH1o*-0<{N@{xyY7{q0Wjv()K
z^XAQG&!6vQ0H0y+r+p;nqtjng`gM3%dvZ!`e}_Uc?e+ngZGuhohCdY|#G<~))SxLb
z`OVO4iYcV)O9>^W9r!>l!xX78nb{Tji3(H^#YA*IyEUJ{aF)dUPBFRrxgS|U+HD|w
z-?I)dpqa1EtvUKtPL37b&O%TJ#>U<&S2P*V$FHOUd!~dFYgs}ZVj%Dk65oH#H&j<u
zSt8$QXK(Lg8g&mk$mVeimTbgtP-dMd)J4f(g;Fkbmy`=tZ-@PVr#?q3{_NQ{Oc{`t
zLe-p|YChCQk$;ozeYD&Db{rqAkSBfz4oq=%O?DWbVi7?4s6Wwp<HpO+a%fTX$4koA
zl8XtQr-^42xJ)xlOm0=~UhH$sG^%U5$^LtLk+d=N4ZfpxmYOj~bRUzqwaV8dWtfD{
z^+oWxFUsjlV=SgxTlWVU*}83;m6l2v;|>A?y<ve%?O{v+vEX-nzL@*ycktjn(j!YP
z6-8Cm4$Of8f?j!RhZ3<|I&#iGj-6>}i-$=Wuiq-{{P}k7v=@KRO>OFmpu!Av*irSP
z!iKeY_Tq&nISA5Ts&wb95(?{QAZZD?w3tYJDe>RP(B1j!xSH)@08UEGVTZAH2`gyO
zVNywFh*z`}-^E28K6!F{iM30L#f}N5)h{0vXV~xVeXT`p*C&i0kYVC(C-ZjY;&<hj
z%^A?bO=FG^=b64(>?+FJKTijIQW)Y9(np#*CtC}<!O_{dP13l151PXbFUelkj;*=#
z-Xd5m+>|l7YE2c!F}eDeLj~+wcW<OyE8h$cVDm1pT3e0R*9XA!Aex&peYzOhfaI%`
z1JOnUZQ>c&nj?th!{qG-VO}T+qlrp6Tc{5r_GC@W$mF4WLg?n>=g&f?G(Bh;Vw7T>
znmE&B0t~#`BZeVC5klo|uTeM%%pI}j1RL#umL~&!y>qMIgkXU+7a-+TFg`)))e=w_
zqm=ObtIXJ8*!KsRhgUDF(`YqDm|i7s1#B-wFm=`WynWCU8>unVQ{Qds)1Bc(Ojq;@
z);!UXgJ==c9gm)8X45xQcGrQ&pZ0QnD<F>0A1EbtW2GBici|eb)3{X01rtlGw`2_$
zAHJYDG&tk;N>W*=Js@&cLroaJNG#^MeG$FOLLC&DFa{3Yfxz|yo~b$n;2j56YGw*6
zFOCw&Chyw1F0k`_DMAYs^Ym@7!ro^nOqxPyOJ~!MN5f()VSGI+txk`-wqgVgUFu%$
z<k~_!)`w;9PnSyW2FHrG;~ckdZ<REz+ZYWEn>>s7deVED6bl7(IuYlLNJ;5!5##*q
z#g4q-nPRt<-He<~ZB>%bIBZDz{nX+1iIn3F0l$9lN?ch{y{xIY?df&B>r$rlQ@y@r
zq4B`j`}O8C;w2WeqJ81?`AITzI>tjT-$bj51m8+8Bs{k?4o*|PQ&$3ttTU5V$&5^;
z<s6nk2k7yseWsmNQVVF0CB6?c;LWfHa=LJPCj5NRpZn(h;_F@T-R$Sj`@P*9{~8}*
zm9T;<#0?#I%Pm|3!F#x-^M?j9fOa|!jt;BKEbB7tEBcs7-eaPQ&Av4I#9G;Uqp5|5
zjC?b;<evF~lP_j^7H1HVMGS&sNxO-k=VtyGxvbu46*H+AUL-YUQWAq&YMXpJMx^yv
zJ1Tg7y=9%g&JCGC{rW|?GUvjmB1I#+5y@}E(yUwEq2<BAl#(La5CaRh{w{Cdmr%c-
zrl!1<!;cSBuO%dO7-WxvM~6&$YM+~DrZaay*6It6=-KPnL*C}{dY1>n$s+Y0T{1V*
za>mS=O0V_++o!KNedoJx!NSO@L+;(hD^00-FH?|XqVlV~Yq*}hX|!p_{YAIyO>5{j
zJbUrt^b*}AQ90=k1`<20?DHTF0WmZ^?8nJ}e|q?3^QqeS*iJs-z4n(@X5Bh786u#c
zn*4;{{^`r#pAHUJw&8Lxgi%V0blI!NX4*U3jSrRY?B7;6-YvyG4hwD3qlhgR(%X*X
z3wj5L;JocEjbk$}wUJt3X6C)!7QSZeY1t(JABY1>*}rm|^x{CnvE3?vLLG74zB}8X
z;1f6d>p*NTJYvwkE1HadZ9EjfKpO$X-m(~pj5nlU?{?0Pj=GeUT1kb5hT{9*m<K>D
z+%~&+TD1BGSJ(Q!O^$cEEVC}Otg!&_rm{(J={8w%gcYLa_tweJ&Q|yG!9L1-w4Y8y
zF>2I8nb$bk_m1O<-m-IY2Ab^f-@o5;VdSmiIKX}HFQxe2(RUbLC{y|*?Mufcgz@Pe
zPT9o&zWwTvM`-YbA!5<|22aq+QufFw+Vk{F#~WKBVV0}tI}IneVzGr5Khmc86K}`G
z31yjf4!=y^z!0`_Jkq>ucOU1A)~HyT)V)SAOr{pEBxW{9SthLDh<9GtJ9_u;4Q-`}
zKwnPGv;HlKRQpBvF0r(=8SEVltxcs%)Z(G-bsvrXzD3@*4Ti}+u_wImWnbhzF54Z@
zrS4>1o9C{PvrUEu&blP*_jl)t|AL!c-$PSm8v1WD+LX1C(PAiBXpx|aX*QIb^KR|b
zx&8ae=664r#lJp4y|7peYGfK3d(GgP5pT~NJ-U}B1YxT=>S^FF!o2?#7+A3NGQMmV
zlprz79FahP0WD^K6}h36O@uLEzyLAA!fwL`V<2WZY2#FP`HL^$>CJslw`41+k6*qN
z(^sCd_U#6Y8gRiOl4fE6;RX_4j(KMN8?q`*xnT(jUD2pS?E=B}Z?$o4iWGEqPSX~a
z)*x%~&_Jzr_a{x$Rb~j8)!u!X#j4N#)pY1x(!7&cI@m;`%vt|%LPUOfHw(g_cf(@k
zPJQb<Yr{YJs*id!S|y^89Mod?G>q!2va~++(doH9G?=-X!YZL=B7*}(Or;pzquv5A
z>)>kPGC_^P<9o8>F7I~znuC2+H!$dpHJwJ*?l7G3G4Q#H;GudjPlMO}{&-bWXFS!O
zob?=uvLeF424u>{2s$=WV+VST>nwD(5^@hP9hZ$gRQT@QA>3tQ%~K>29cY=lMepzq
zBw*s8(Ye*W-neev9Wfgog@v9QUR}O}F_GRqJoV5ad7<aXxu6(>^ks5P-{;LMhA7J(
z3PwXK!nLNam&%kjVvIr8uI>6>8?STr>{;*H`htQ2XY*pX@`MLg2mYRW(GwrLQq$Ko
z%G*SvjTF7{Z>yJK2-3Gu<93PGXT=?cCl{!Vipec+`|47*o!6W88HHaTX~#>fRFUV!
zA^Xx&2X7xA+HdLBVZ?5<MMXqLc5rX{b!H13vWA8)lQPlQY3_#^;B{$-vD>t1s;jF$
z7yn1}=G`56_tdrFK}_vgYO%8&zn^z=`!F#d8R(}!@dZ~C2b>-UtxL$*E?u>|D?oJe
zz$C`Vq=Vy#`F^?t=42q~>bs=6OB4GNv;6bVT&d=_&R0hgNpsfuPsB`RTnpqrFLP^p
z(=a-q=a>rln3{q(iA$6A9Avm4X#tZ|Z|`f;jtK9IRpiJ?NgaFGcK^MtnVE^#uSX;&
z_v95&P{i=1-#*b~z1`V4W6db!zGMq7HAcq7bf4>{t0I?jA+xPdNShAR?sZdXTi@p0
zoex3FMP?oGJ;S-C-}g%I&g-phJVFYBF^fCEG8XV!BE3ZsWil9q*7XOa)%`Ryq*;JW
z4y7MEj*F={Zq4o<=s8Ay^Q|A!;A(rw(25CHwSr9!&pQ9zB?Vp48FS}OL(miG?+{T_
zt$zAw<S3ylrWZEVj1j9Oq%Jdx6uUKrr+xVqe*3l>lt)kQKZ#kY_4f9Uh%nIdb+!3v
zJX3sSmt`{tbS9oe7uIKS5SX_YwSbqeUVQ+4W*@zCOH<0#0>xa%>C@Zq2^r+BF<^j<
zxgSI`Bj&)}OimV~SFR-{7H)aGFU|V5`>T<dK^8FWiN9`B^jxDQmIpl&6r?k={^ZEF
z<GXt_A9!JT2IE<_Y!~Q#Y<@Y7$us86DX-mvT_983L1(xY-8od7BbXS&iRSg07dDyM
zRp2*@-Mc&IH}>}kDLyE$hdxc;eZ{;EKsAT^KsD30ZXE>?oe?<-DHiVeIu4Ao0d5y*
zRIl-lGR!EuORJPNyks-Ao{anw(JUaK%$^Y2f$cTR8fjtrE*QJist8ATtV&g!l(2kb
zchzaFSh3<T5ini&pdp1d2{%I07R;V?e(SSLb++Gk(WOw`>5i!lA-|)u{^J6iwJ~0o
zwUH49cw>j6^mfA7fY}M{|0<Wp5drky3Dsd@V<{<7faz)~cofo}tw@{3x>OAoVI$7W
z%`*i;La(t7QcMKliv>;CIf_8TyB=1zV(;Faf=zsCYMdM$9n;F43iaM@yRlM;jAq&B
zNQudUC>A81n*W;Yl~e~SMReaE{j=l$O1oT6%N(PpCsK9I6a5Dc{B&qTR&s}4I*+wl
zQt1i8+rUYRwYAZjwC4Hxgp`_uTU{{5p>x}ld(qs@iJe=Q#!yH`<-DueSbUpktjfgz
zH+&={q$3@sj%$YoK5A9oeh7nonSDezRXEuX>DoP7Y_t&gGyBE{Gliox(M)$f!@$Vj
zD(iwg)H+Jhb?h^5Qlm0)CGCVkliZtjmsfx2qup_jwT(Y|8S_=_7LHvImE*T>AIkQg
zxuso`dr3{^WWrGK1@|q2uDOFk%n|0^0ne)w$RWzir$t9c=OsIj6mE)TV<M#$YCtLJ
zSsaaG`h@R*b7X@GP*Uj2MWwrHGC&7`!Kt}}di3ZKOOF{^<Gq}cdy|*|6bz-d>}@?H
za29zF8Chp@bMvIFpJWA7wP(-#$}?x9qB;sF)qx%$sWj1-2xw7$eES24Lms3@5z*0|
zK>X{-{sR~rW7zv~3^tbbQi4~Y2!7RWA7%SKX!(`1B5v_ZZ<Ar7RHr|R_QD5&atBN_
zy3}~w=tGDn>%IC}$4A^@lQR>0gK3DFoQMhq;Q!D&ZMeN5d6jr|KqeG#y#;V^a&$)t
zcw#MGk(eQ|`i^5fD!bO*_DDaHs-yE#D1}q}NM|L*U2i~jr@H{CSSjPP*=a4f78GN3
zbl>;U5_);%jm{XPZ#g-GPdtv>`e_!{WHm(uVf8tVr``Tt$7}#&%@g>mHi&oL6~2#C
z`SH`IqV|*OTG9Qy<#B2~kru&21nb7bjcjedY|1bp+`!w4Tb%YJE~c^FeT`!sU^Jl8
zuos-5rmR<1Tth=bt1)c~@^i+{Z~DiafinLHkg#8b)*UP}WOS*$lWb}X2L;Yrz*xlY
z7CTxIGmT~3I)hk!iR9kqszlU7-SJi!NnT6K>wXSGVXfO~Vn!wq^_eBiShn#TVo=;2
zO|E4NUC?m37vNUapE>bDJRQs>10yV-u%Z(fV8|2c6OY?*Q5eob%G3ulod`BzKh6}3
z;MP#whc$$*(?;k+GkU8d=m6-9)xogbQj}y4kt%<Pym<QCn2#eEj0(5}VkU&avGo1X
z>^k?VD|iV@Kv*v=^>&n|vsQ!E1;oGpi-PL@v1r@$<EnjP$l272BXciiP`f>Z7J(Yb
zd^!vTF<)M&|3&0DetOiEuhX1{e;2cA=(72GrFcVo5($~qKKSU`-4i4$**=(nV%k~5
zdGi!{$xokudYizhh#E-{kDNH+oIgYdNT&|XgF{qG1T-naNj9G1C&`-Q_}<)Z3D`+9
zi*uP49dScRriq1|dQs8HoA><7FEIg4Pp=zA3U2&F0^ENK7Il%rhzmz5ut#wJvF=S1
z5KZV5ZR+~i8vSr|fHV)JFyXM&+3X0_SWE7+04ynsg}?YpK=uB!R|rMGv+?m=@a_os
zl{%+pyDBBwYW(M%+P7pi|JH_`^I%-On25uJwlL67963O$X#nH+WyLBN@`G54u^Zpr
zQUx;?z#E;XJ|E>Qo7;Cwd!nWD*FRwzVzp%TKp-%d7hiOUf8Fb^FKD@?j(gu_5NAJ*
z2KnI@w^Tp}h^akELReIM7nX$iUeb=D(eM@pqq<j#x4kj_IN1!s28;_xgwe&%p`^D0
zOBZ9LHO+5&2{Vbso9gD)G$J6Uw7L$d;it*3sJeAVO8euX0M(MXJt3|Rh<tY?sH7z>
z-A}21h3kl}EP=ib)>N34!q@xLzB*p*|7#*2Z*pp^(xiX01BBW(!Z;riQyWLPF<-7f
z-7a?92{6TZ7iO*Rkr}Z_YVV@ozvorWM`=Hn%e8!<J>@^JW~Jl<`A0$QROA{D9Z6Zr
z9CZjlC_|{cd^(9vIu%-0;9PyD4i*kb4niAY|Dq_DxS2|5M0j{Bl7(flmaVAMRn#Xn
zX~&NWJ`mc1XIbsFx**dK8mzmc$3bbN+Q*W$0L!%ro$Fla8ufM<@jlVjYU<*sp_6bJ
zZQc6wFe4<zPdiQm=$;pLHkSI7F`Srz;xX4Zs!|-^Pcs5U`A!H~(Y6$Rsk9?7D@{~l
z9tqPEmR(&V2C^wCDQOSLw6WC?$;*NrMf$Ev;Bs!Ux^2O9v`NZ}669)DQ|ANdU!K?T
zvr~PVT(_30#qisk!yLMO3k?qc>_6x<2B7`3ygWaiAkrlf<N=P_jW%bmNC^zh&3jWN
z`9V>x--r>LW*jR2_1nupNA*uO?i)U5mEB~nSVormhr%(_hX(HZ8E*dmm%?aCs?e|B
z&93@7m2+^cw||9JT5|7*C6^_v?+TD6mt3GZ%b<7f-aF_la-ff>#BsbhbxdOOUKrWy
zXfYyKV?*bH8~^z%*)CL%Sm6w<V-@@UY(nB{+s`S(1`kdXa<U?T{jX}oT}}VAwOxWd
z80)-=zuWfNU8O{k02YRr3|><Guez%|<`E;w+!WC8HJPL~{Pn}(^FA3ctTuOAqL7j^
z4T~12*vNxeZ+rIYl@Bshuy+8Stf(p(mRpoj&cIY^(cMM~;R7UAAc9doRpW5i;K3N3
zT;dA;k{lQD!9{&^h8=qtmb3nbqJV<*kIopdD|NEf9129?^BuCv`Vto3LONgvRnqsr
zeos-&=+V8q?-{zn7(~NSLf_xGsrmSeuc(6<nwVrARy!dskl)=5P}{Psa~{AW-yCnM
z=TGwC|Ej4mCZ}@Tkjr%$W1MW{Pm>vP2g-l19rZf)klS_;$Q99i=$}iIE>oMO+$zJQ
zhm3nra8Qs;<X_E;MX+O;7e2G-QDHD_3o9Try^wpF{zeD*3inof5cfj(%lpKU#6c-q
z8TD)39JJi+!4&5YeRM(&A5LpoNHrG^>jmXb`d}whuJ{=p<cD8x^ub~LIm0UA2i<Ki
ztkkp(o0<5jG7Htc_mv&^%uXtU_RLTE>1@+_OyMwRC#QWYmAp^#seg_@k*Af-ValOw
z3-ApIMcm_X#ruDf$Yx~~8)!^E!Hyl>Gd2ISH66Pg?YnJ!DC0?_q|PsVcu4=|2!t8D
zf*TBor3t3Z3$^3rbRtK(2SqY+7W`1tSF*%DG8tKYq4}n7cd~gND!|^iEuiJW<^h8l
zmU1Ls+4S7PAw;246~Qwbe>LL*ue7X-XF!LUhiP^yG9DN>>mE;xuBfnws`4kLzF!-p
ze5D~ZpZ#m1B?u_Mkn?~EF_#z1s?<qIV}KzO+Pq3vuFMVvWT$MV&RAnBVE_u2SWeei
zkOTnH_`}(T$qzhS969A_L5JI_>g%iv<A&k^IhcQ7w05F(`TF<1t)-%J_L08whpA(%
zDBCJ|O*_8__cY&IQOoE2g=O0@pU|Q1yOmwS8`p`4z-+(YYu@4BSA#hKm5nFv>?$nW
zCRd|(liBgqIWzcsk%K^_zz%kW@_BY?1h2_DL&`t~xX*Eq!xYhh%q}Q6NPXnNr-md?
z@zB95zZdKl3nU`6sGNu!H&lhHDPO?emG0lhcWNq{kJemui>Xbit338_4S9(8MN(az
zR@qgy9n*zkD(fd|e&6Ad+h%ieLHX{@r9F?o)0|RXSu<b$-v)R0#@*GcYJE@OfJkHT
z&r_V8$8bqWy7B@g%k|PXv-oR*cY7s=Umxd_*@w;@<TOOA;}l&l6&vZ`718%&+^UC(
z>SFPaw_CR)7bLD!Xy~z{In=FYZtZVRU0iK)f!pjfy=Rr1zsxP#SQz%Iw3^XllcP<E
zwfK9-T)yeYtvh2Z7i%1=zB%IM)9auc1ls&cxq`hb+72GUjpP@H(`d)K_}7PngFApe
z{XDamRZ`P!sxWb~TbKklfumx9MdLnEZUTuJ=rr!nmo?JwAgd#C*T<hI{Y-_gqxhg*
za{RL|zd971Y~C7Em|Reiua=yun2;*JD7iqMUjzz-j8u#hR*YEbOynQ(ge;W5r*HSo
zP(Pr6k2GQe&HPE1O7nM)Kb)4+yoADrc?>0+bWqq8NmopxfFJilf9?(RP?j}-i`4I%
zr<F!E1ng(;-f3P=5ZmlGc$Zd|Tm)hW#VAe=0jH(rz@Nk%8sAQK7)%7p4OS1U`n?MP
z8{5)t{M^Q3(@lCk35wx1EAEiC0Cg7tlJ`}qit(pwET+G%*=+(uKpHDX+%hhk@CB9t
z$JN69OjFG6&ADG4jT|Dg?J87OBjeW_vNFrE_K)4^qp&FG>m)1P?;oGv9(39z<(=!q
zD?pGJKGw|}UcE}8y35cMi^>tD_sG?J{y6@j4U4WRb_V-my(kc#KCLFfwruSapRO*9
z3D1&T0Wc85p+Z!m4HzKsuF<703N;P6o1H^=shd3+FiEWI{&?b<#_}Jp3pX`o)}*{E
ztOWxS@i7V%^H;43fD#Se%ktf=og!z)v-?(!b}fs`)Ite~V>qg1m0OGYz{dRRoe2N7
z5%p48U6(qjo3tB4mz6bFL|@O{etzM&bEE&AmvoJzwQP8brt{4w_KAhsmmP%}Xv8O{
zTvt@hI{D0fq3wGwRSp9M-t!BGc!liF_^0W1z<~o^pm}pImTotX$}t)mNXmYPlE}Z`
zQmax7zL%D3>V<TY_booYX*sZrNO;jER&R7|@u*w0*>F-eYcjFgDxId35%m=}RCXXw
zrFvnX?|rIwKhSM4j5f8Vb63oKm6LAMRK2RXEA`&=);CkV9o-Y#xeg!cpmR7>$5x&v
zh{2-8#K$}6+z^4oE#!KF^fvVX_VX>vP-NT?CxFDd)59N<GpFWAga!^Ff*W!lDjEl-
zS*Mz<Zfjr5)p6PM`+X`~9u5GRwutxRsxsxKd-KdAP5YsW<d^%r-Z*XEynxuQ(=6wI
zbr|b1=L~nBb;O8*9vFT9oQmTK@8&*MV3@PltohhGS9M|JVj1N|!`1JX-dK5v@`P2$
zAq>7S-1dnm_O#|iP-*sB7<pH)I^c@d9^byKaJIYI=C-%zdc-75>G!qk_?ncr6;6_B
zy$^XcXPUfJpmzcz{3YGg`*)i>@(t#HPVh@@Ih(p~Rcw65Hm5I+Hr;RjC=6IYxRR(G
zF^LjvjEsTXN%^yL@~u`Nl0jp)5Y~^HZ))m;#FV1Qi7E0s{QkL?LE;K1BTZSiZpic=
z_?RPu|J5%V_Hxh;#e$9pr?ml%Mfj#6Kee$Z(mLLgwX~Ff&7-@s2ytq`RDgW+0o2sU
zHe%A-D&I5C`EKcjyT4WjmAT%?JrdK|e~L`o&CWOSuN_Z&c71c$;3G5a$BldSX=?|o
z@JSN_XPvj(xUsENc=3x*TRVNLoP`CA$?Xq=WbVZ-*zR<v{)t_>cJ&rRyPN8YA3S_$
z2ttC>1yFw0;qQf!%qgy{b3V5)(r4G!7We7D-%6uhAi>JxPd$zwnS19jBMb$a%mgLc
zdr7wS9R#5KBf$MGsiS}n1c(73sNg6?^g2aBPg`TTVy8rQVHdhndmeu>qVJ8BvsYcR
zUJ?+AT<z4@Ev=`Wja&NV%o7{qZQFGAxK~X5I?3Z@YU*x^P}nT5+5D4`P`Cgshp-cz
z2m!S7TEze9sq@^m#=LBJS~`e{KYKWdHQ&O@dhUWk(U}PL<0kzPm5yeGm1LaCvfi<&
z=P4oABy6N;X=au`A~%Ks)-^RX76q5cn8olV$P;W0l~aDfa>AW;|5@)U0YkGJi4_cU
zNLFxJplKAUoyK{ZlHwvYD<<I4!IB~BGh8k}LraE_{>eK42W})u^2;9QuzO|HUPbAA
z>C)~r=ViZs-2uZ9I4h?zXra69Irm-2M_<;8c-cH!>snR&T^{|{)*m{tWUw-~Cb7)y
z>#yGxF8Ms>JFcJuMB^YWtJVUYhx<5uS*+AhnwVp(Ep6Ai7hGtQ(rj_N@^PQ_bOxcr
zZgwSqqaH?Aa3nxtz$;-P8B_cbq#c3#*^27b;-SFueJQE1y41I6;=co_%k$?+$s#QH
zu9bBz4A|Dl_OoX%g3G6J?y%B95`0CvMP=y@-t3@2&jkq8apbK%0>^_S1XHnWD`9bo
zdWKTGvOEzVDA=o*RRIJvR+}F~ld$S@@Aio_ho5s#q#a5xx(1Jm=oAeva;Q#Z-9gMM
zq(y4f<)=M|V$X_x>;S79R@TNvzbWlSszPM*hr)ns;Bu6;VY6!WI4Ia(985&61v6|j
zw2=P-cEt=6*`b&S3KQN&TXJ`=Qd0)5-0}7N2`y0W%d1k=R=NEiC}<LbXaQaWV4Ws-
zABPWrPivpV*{-s*3Y)1g9;*pFst!}W<iRiJPu0%mk&ESB$L<_SfZ@R5rYZ(aio^#R
zTVK@S%AW5{|LXVwq1U|+*QCF_v5mRMTQkeYo?G~JQ=55c7`F|ziC-==7D1HQk?R*O
zeD2V+MPiL-3<!^Zbk=vDd&B&sXLh;*!UTm8kdr_ofoH{ygmooGN3mV)!D61lV>vOK
z?`oxa>YZ(N#vC|QqeoYJbZQ0>rJ&EFTN8}}hTO5#vP7D1^@(oEoC}zj8AzAzrTO2Q
zj?uv*NA`K?)C;T<Q^+rlMG8%+y8!3X{j5a5(oc4zoX$+;qHZ?-5+zun5+)q4p8Z=t
zL7#vmoeAdOo_X(bjC$%NMhbe~QM}gp=f!$)4nYKsxOs%UgiOLu11|cTub5B$9wHz>
zw>4u6#c*bHz_d`LCEUdwte$atdcPwMyIUwm_f>3;4o=s8P(_?8s={1jZKOap=Eb%W
z#0&x4qQW~_<5&@N?9Z!1Bym+1ov&dOT@WE`%Za3105*S+Zil$p9fcxQQFZRBOqfVs
zTE7mCd6z~LZ7lLbCZXfI;!V^?_Xv_~j*-GrxdHne%CqF)$nJp-EM{)Qg1<+L$B|>l
zJh_?x`1{RKSLLrx{gVq0flj4^4so){<q62~vhwu_^TY5DyHM7=2S6C>QoVi`vc{s?
zAc_i*e8dHZj`H@9B>MNW$3Xjku#(Yf1Vw=zr3Ez>4<61ygR&M<l3ft3nCW)9>6TiP
zqjCfq!j)Ed;`e$o@syyB+S_ZQd+*0o>LQT0I)ssYMG+dE{&bEJ?4h&{dXfOyv7=A$
zfPKD0vfA$9!h%B2E*r(mm#BaE^_zU-uBK;6_;zT7FLQDj?cR~_jeE$YQdk@G^~CAZ
z0vTP+Fmw^B2qlAF6L+Zy<dw1&9F=|t`b<)TovD!HazwJC6$d15)=`2=?e(vW_q%JW
z+XFuR9xgPj&~Qu`XN?*}mt@@qjIV{F)t?E&0RL&z`|S2YvFtru?_q*^M5v%D_Lm4r
zx5vS}Pc3###xiJT#$3|Hp@FTXh^T><TB`p2WH)dLs1!iAB9tf06Cnq-D*4*s!xZBM
z2#qY!9ol6AD2l!34R$nE=Pp>j+@B^1OC)+&;$2T9uc5Y;9=klMge^gE#(~OcKcnf?
zS_ND6@L?$e0=i^aU*@aME_-gY&nP^NorhkwkIJ<`!Nxkgmx5!yAN`IZgxc>i3&H#C
z+m)36_L*x+j+4Kal}0ChoYAS7kF|ciQ#m*dmKpo~{lFS=HzV3l0gb}X90?3$SfD@c
z;cM2ewQ9NUN8!Z@{uto>W%J~2NO5I?pZ|Dewz-I-BCs_ylttAcZgxvps)Y5o>YgVk
z<nKY<E5Fu&tbQjFEu!#CwRb$6g@Z&_9JYPtA_5}M_IS9)2o@OkUZd>9IOyM4M6TCz
zEDGCnR}547&wHZT6^A&BsOb}WF*Ed1-df3a9g$J7c^+FF^!m;Xn5d{@iDA|6IkBSR
zc6#S$RdFS0CEzBlJ~4v@RnLMU`{kFvxpcPiLn^|n;{*X%5^=ZT>XHRx?S7(}A#l(G
zJx0q4swR)uMoW#)!ca>#aK-?!f$a_jGkZ%;r~P1x&@c^rm}xFd0ZmGR#)0JL9e}R+
zD_6>i>{rB;%B!(ug|<rm4@mCUi~^Iu#=fUEEKtpP^SUQrYd?#bxVM`dE3HwKq8zqb
zQNo<6QY6U8m>aqHs?Ih=OxO=xDAGkL;_Fdd69q|0z$Kilm|qPzi0SYFol0IQEU{AA
z{Q;eG@d?kkE6}{j3*q>PpbSa^N4Wi5if#1#a9<@aIOV?9F&}eFTaAn?*D)Ag;JDSu
zYSke|<v<<3>|ILF6Po7fpygVd-%a3}p5DJpa;y~bAmTpK0sz=3pgq)RfZ&yrb;a*<
z?mB~rIZ?f1pGU#bPWHF@oZ=629oii%I6w66t#CDha?VU7Zwf^dXdi%!sR79!{N%2t
zK<K92tz9<d!>Aqy1<!2SG-<F0Msia~{XJ2@BmgCc#mV?png6k{*vsnkdm4)bQyAkQ
zOpOSP1Q9mbsS}i-d*acirqU@EBM;O_4s^u($mr;H4%_l&&SuL18PhX=B+7-MHS&K@
zvDpIK>KCC7a<+FGh(y05T3NT|3G8~}zk{ia#4&?UR23kK-$wyj1jG{<z7{lkLpNq`
zfMTgT-QYY7=-fO-uAi(Lya&2^MSw*%0}auT5vF{~J<alP_;m}+%{u|9ZOpurFi2dT
zY|n#4;0M{<bQAYPjZPLUfQ8%dwanw~22u4M1tCG{;L7)6<nz-BzvPII_HtbTh%N$8
zDOWU9MmJO@_zkYZ;4C0F=0pG%Ku;GQYUj?Kt54L-2QwM#)-aZ;QlTiD@4)5!Kw6G)
zC2)|7<n|e)1v$)r#t%}7Mnlo6&zzg?EHDi5`=l#zRvFTif)w$tcSQt3on!*7dw-Je
zdzyh_n+9K2Jva@5z-lIaL5nB>8QHaKLgD`Q{uxD+w{IWKl@<(a!S#WLNisf!J{V$f
zN-K9YC6#Tz3MHbp9>m^JIkEwnls=73VsfpS5TIh<qN}&*jTh6Hi_uc5Ypiq+fYS4x
z@`RXreoylAd-qhv`Q5eF?NQ`7UL@NzNbokOwEZYtdzc-vWC{yJ`@CyrH67pm#|2o|
zg(SxZTE|(+9E2bl!u+D@?DLqPAi+`b30BsH55|;hHi_)DY-{>LL&H`=a#+y%@$&aE
zDwNvj=eVZ{Yo3)TV<jlokd=ObS3r~|$t*AX&LTN7y}Zl=o=7ojvYGh&Efo1~(O@I7
za=UAmi~LGhFJa|GWH)6>8*wo2gB^|=r#e3LkgbM7|34)ZEPURLZG-T71P6CD_w(qz
z^j1{6hWhy|ojtpwa6zX|$-zTlDTueWq?#x};}h{V!9Qj4bf;b-hJk!0Tm;?lXG#3{
z*Z&^v4A{RH#vUOR_AO3FM@HBy+9dXyOLKVJ9bsCqrraq1y^rm5MNx=`9d=%jjUXap
zdGB)Q1A~IFeX<8sb)-f1M7$(YLWzYFB;$r(PpvsHkUAJw-(umYj&u^MPS}KZ2G;)*
z(+|EZJRgpOC;-k9MIWkH{I)pZ6bA<nAKv0=D3Ni<G6jA>%Z99=mk3*mBL8YS<Y;vH
ztlV<t74PzT7ho8{4d8qK?_)^qK&ga(0-AxrIE60VytUcQ-Fx=jFQ^iKvSU#tr@Uu{
zE-IY@Z3O{n(CHnm;feG(I8BlWaTKID(X!329}sOw{Q;R5<ndZ0FM=V#hZ`HWfgE!<
zBxIP=x|h=l5=AQ@d$bxI)jF1k&YKRxz>7{C+zZythfK;TM{;kyWmqX?NIG<e(AORs
ztp1OkE=ZKXHDtYwB*3-Zp4QX*Z#C(+zyUIC_?0X2G;@e;TDQ({`Epe`Y0td#3nq#3
z*tX9mkR>8e8Wgm9gcW0*%2zOsr3hRpQ)Dusp%izANz5Hr!$3S*V*S0ub2sXVGd+F7
zhGC+yPWXGmz11f~=<dm+n!B1FCE;R_FG6#GcWHFEtd2_~XpLM<98qBC;xb-b4W7Ch
zB4i!f-y*vb)mcN1^6lD`O7%g5rk9P9R7KTX<aCA%2D)!?7Wz8IdDIkRH{@#Wg4`wQ
zuPD^|i?o!3PRd*MQ`}2GqlIu_sXpO;_$D3CJoN<-i+dc=3QBE|?z+wZ7lL0YI8ZRW
z01GS+w-gNesn{%>Po{_0z@(5s`vj{$;!46urfe#9A2OFqdC#-CrQ@z2V%xpqx*Qu^
z2Q~{kKoA09HE{=t+=-kC154B(D~Y-CDmpfDf`h+u<!~p-qo!0fYm<HxM8Lu#GR0uQ
z`F_duA^MjTxB)a_t)(Sb-ri4SIegXaUexra5#{5>?n+E?#ojv_3mkCRch-zj$7&F%
z!!QwrQ`E`x&Sx7?_!YLB*>D5@Ao<FBa6`}$MbR5`jnB%+QaldnKoA6pOz+|Ogrcqp
zhEkiVV?-=0f)iq4adlVz(`+QxcV=M+Svg&QpH*|_bi(>*ldX{8w?wP-c=`Pa7#A$F
z>}+UQRGD>c*>I^eEuFilJ_@%X+zlQtj)4=x-HF;Hz{MUBbW>;iUw0#-hJW8L+aYo7
zQz=(GfANA&AR5O=c>6gmmzd2dR)Fl1SE8%niH+ytp=*PF?*SCML(P1XFh_#fDNw!t
zV}UJp6txrdcoA^1YLbAfUql#!8LdPc%?CQ+M8z6=kTg}eZ~7dR#4Tvrat?%=8o1Tx
zHdDx5Zr$p|xgf93q|w=r@TBbRjV@wzIW1ZTF(FpM5c&Hn{r7-%n>YIsL*p-%W!nj+
zGtm<{K-}!JeQDSgtUn9|T$44Ug7XHpJhyAh#a}6-)h6bJCp58kq5%XLR{$>N$xcNy
z4+~1`5As`)<TDannsvG{iL-b_X6%aNgX_q6zzGhMx&c{<P9J)UicIAclxPrvL~Mxx
z5`8(SGK%j*P8c_voSYD}r?`EsleMYZWMs9^!vm9g2;5QNDgrDebQV{g7Z+S5JPm1F
zvFbyyG0Ly##zKupn~T=ckVf46{o`KW8GvPo6Z%0JP{H(%-SNGzD2~5JUCEfyl!?Uz
zR5Ta1Zn^n4V#92~r$&PMqE>AL4FuuI14NufOjAX>xXDA9W^}!m-P)lIOQGdv_}S$J
znP`ZpKtn@NRz!ao;xKt@7bpflu_1#$d+uD@Rql<W5w7xrEG_s!v>mBTa+^TKq9x4m
zEAhBL#eeVK(BPs-nl*7TL&dB@E5a1`6KQzSE%fm6YWoBlkZnA>-x0itBvX-HkcZR^
zrUiT_-7T2tOcEIA%0@8BVppkIVY(=!i6B{kw<M8GP<e|JaxF3#EV~D%hJ=E@`A~5H
zvfE(nh53bcVKpmq@rUf`eWsFcOl+)LBpBmClmsMUrz9e=%Y&YD77vXYX$h?WBx^KW
zD<Spkv`%>qkPl2Z@#RsyilrY|FiXTTA{*f&Fd_^UXL<U@jl**@B$un!kGnP1a*wLY
z#)WlcJDD{3?nP`uQeIjSH(TsfzaDyI*>qGP?`TBlBehndhZ~z4jE8er5r>=%hE85-
z0A|pJZfz0F6C(=rMFf|uK#_~Y*q5l0l7`9j(&@P)H2o=EVz?4v&F%6Cv!GY@1!5Uv
z|8_FW``X6?q$Ud**Uw{T!ti4v;(*8@aw^dcE6$V{L?PM}0lEHPkfr?dkAV;G`Cs|G
zDoiKX1ZDwSB|=$<uH2q`n){8W&}+ne)P16KQ*d*8UQ0{l9^^OP)1xO~L{ce1n-&-5
zu_OnPUAHmdY?w^heOhAvL>VO>YxP1FEL{pwb})c#8kU-zhV5B>Vw(B|&Xpi=A6nJ?
zZ!W#qw>^mJ1xpnrYXi5M_%l{SsvIL+#Of2XB)94+6beJ7yn_$5j{UnHYl6LCSYO_G
zZo79o7*F7!V2@aPE7WC_*cIr#Q`r^xkcW>Sm!_!2N!Ux#@$`-l9h8-oEqOv7EmHIu
zNy<t@9)kJBggHOLw*Gp0$4l3WO^%o-9-Qs`)I{^22Qf4fh|obqMkGP>HU2pYH~SxZ
z=p`KB#fucei6sd|Z$`mXZFrp8&=Dc(ruJMjE`t%}0g)>TopnKp%r%yrw?@*}c2jAk
zp@2FGqPe)|jCSpW`$s;-Ycs_M;Yn)`N_Yx~*GhVsL<=(&rgP`caei7InRQ-L)zQ(>
zW>}3dRHTdE7~KsUq8vJFNsMY^7ZI&F*(`LYfP7_DDgvLvZ8y-Ca8HCo9L=m#<GK((
z^~xcxaSW)NxZkdhDm3{q+$&6rNT01V3XTcZ<+8ZeSkb%GEc*t|(uO?^9q~U4I!;(I
zSYFkGDBqCDc-^r~1B4WXg)+KInX@bct~h%5O-|}|fHEN2XXT#_l`%>4G_t=FRtQ)h
zd9-JA)527sd$h&D8tf5tY0(j35p$wcO|nP%AYq1dRD1tpg4aL>py$Yxs?M<ZzB4me
zw4+SHU$f$=VSoQE%gs!qTWlY@<2ISq*PYzi-z!`rTdTv{kuTOK7-<}f8+NRsV`84f
z9^cdveu$d8{sI`#qt?oDOa5TiAJCN*X{R7aB3eJ;SC)>jB3z&8BU$bOLrY(cHgh7v
zZE50#*vkS(aDKP~f3Aq1k4Oh)A#Jo_l*1w5T_rH@WvOlkv@3bz?W`FyH`cWVA+e#~
z>GwZN!uwwd0@QL1iJ((*k;7P#b)w@-glk-ORTvIfb#T69_uUQ+Zn8XfX%c!M#)O}Y
zMDSwxqm*!9q(oLUP1A=n-0ZrXAo1aSxxFpeVzzO5kpov2$Ulq*Kavq_;1_=<Wk(!W
zoqJiqPREzk)pgCbov$9*e)Y(1>xV{t6+3rs-RejzMV>wF66QxGL~oBWH5;~K@QQk+
zMA?ePla*T;wr#tnQ{N6=1BZJcDX``$w2jBL3U0xDjQ5!I?dVZIHgnsAj9Y~<^`}in
zRqiavO^M-b_UhgHll~D!BP7+5N@tE*<k%rHOSzo!Gr~-HczA4ln{<TRLSU4RV|{>k
z3Z^2=YxWHf55JmuK=P9%Le7TJ|HJ3cf}Kn*p`f7f+pYi2#W)Hw#7|%VB)h@5_0^j<
z7ZzA`Q1{{ov)2s>9&+<;PW_rkr-LI5q~&}0ojKD7EtOrw9Sx+3D{ngb^VA?np1)Ag
z7i6%v1qE@ju_YA_-r~u3%}pW~A3bwso}Jxb3;<Bp>ye8bZ*cUWf&W9a^*UigcR^<)
zIzPH=P-pR)iq0cu+U(QZvMFRa|Ewt~NFD>ZxnJMDd!Q!%giX7ZOT-D8%9e8f)_|W)
zK^Se{8h*63W9<qG{~`UAK2$_39=g42Fm<H5-@nDe-5^>yaPZ)d@81RBE8GQ;$KmN}
zJH03t^y<}1OI!PEg_h)NdzEM%%~CcudXu&9(EBf6GIgF#Fy66aEH|ew-7_$!U}nc%
zyCx<M@#iF8I&>P^4ms=5MLwI96vT0S)OfLED<#RKq$J^TK{xKA7Es4@D3MMC7}Tf`
zjfKO@SY~GC?0iY`Ax4fe;?>mygCqJ=Bq;jwWjcjZ;)3bXraD-)hQ`KZeO=x&B7er{
z3yPs=cJ2ks;4jFvXDNb=962&_PnUx9r%wYWH54hx_0&7S+?bO{w{N4iP5)4{fCORc
zi%!`sTeqfXWnE6r=qh>mhwC#~CI6^lIs=1;j5k8#B}m^{ler*>X`c!U3#*WyDGp)(
z84DNr{X?I@Y@!jm^+cVNyEDROY9eW_XuGbfuh&6%!>z8UJrduKz+VJ&80w2HL`%HH
zYFpdff+AVT5!>>(<%k6aKesV;Jcok#c4wKEucH>Wc<95&kF8N2V#F~-8NOD9jD>|o
z1C)IjgnPlc<E=C0Y4Q+Y6T&PX;B~@cKF;7n%XRBs|M!e+d$E}x-sr`TSaPyuon^AS
z0F;QtY-k{jc-kCXE?TnwaP>XI3ga`<)4x|$tpR?a)!2s>5d81IW(~w}4A9g(ghyw^
z&ly~8xmycb-e8l>`B96+@r~FvGitf|_~WzA_rnk)r$-ki9_0=)8H*OZsF0r{W1y+2
zDY6hb)lptdnu%j{D0Jexj{=8{0}D^3mrx|Ds7`0Rd2{o>?Md(Z=Sa3JoVv^3@Y{(1
zw`|{jp1{w{%nU)Po4kSj;IU(aXz2>1a~fBB4;{-J4n?g2y+32HvrZd?5ZM%$l^qBU
zUheEWOk8yJyK}@1@!yc2HH&{TDSDLnWCw>Zh9Fga8|W_Ea+?n(l$}sjQ4wN^DK6DV
z8%N#i{q)tV<Et3M>E-1mS~T1meoQ9{d`(XzoN1itF={z~fn-<6FA<Al<QH=EV8^K^
zN^+u3*w2&4mKDrAo6UgXR-zNIslKpzE3SkvQM?IjnK5%eYdQDv#9*)|!j|{N#hxJ{
zX3o9?y$l-~8g~4+*L^374RoSr<mZP#2yy%O>mYK;7X?LsY?#K7>7C6v=O&4^7vREF
zG<%n<r^|y#<=9_;-Nky%Ao+lMyp}L^$BrGA_V(`z)WsSn&Z)^#ju0ItL`{V9K%XBz
zeHvJMFk-s)=+SG1xJrFJlhK|at*JWJp|2QpQSnT&oL+Xv2Ix#Zp{1j9?2*Zq=~{Yv
z8YoqAQyC`ogw=oe;DKmJLLNh#ivp|R)^y1Nx0M3HfkX8E{rkyueA4NBb=8HEI25OL
zP@(tVw=V)YoHqkIN)}Sh@Mv(vZ0_vaA@r}H^>8;ML(KjlKb%K?&)CYYgXW$7NZzoP
z+^>q^B;c6N&Pn11-1*ovtK|}dE1;Clcmivp{|<e#i>Vbk{<(riDkkC)K_^dsO9&2p
zsFz(Z!N6cAJS-V``OnNMT1)wlYQs;8O^n2RAe5PCCGlX;NQ%qDGqWV$(lu~l<N&ng
zp*v8s*c%_O^X}cdg5Jj^cSq6at<D-B3yDv2{u`Am-c&|G!IrR_-d84}Poj?Y$XLPP
zZX1OOX<>$*v2k%<E9OhqYRVYyu1e3I$BrMLhAJgvq3=9;^mlSbiw`k+YfS=#isF>0
zoNsFJ2?_W9`RB?LYc!igTzodW1m?^8$(}z)w;0z_13ktCnk$*v9$Ryvr1R22Iz-UP
zWt}v=)cHyx*TU5bV*o>9E6IkC7K5LXP8@~QmRv<eMNmk{pS4wt*pHvXTrkrR!v&vh
zWc}KGuBL=@L@(Ei#hXv@h3SLP?6Q&Cw)Xai;^X5DA0}`<IB9gp0Lh_^(?MST{=0X#
zpI9$iw1_FdFMH-=K}L>@(aXeq*dSiz;%{JUYb(YLdj3=s?LYuwm2ng|?}SzCfmqdB
zh@0)NXC#>@X>K|c8W?zwX~b3c<vVmJ66uFrk9tbtVW@>x#xdfpsG{OZ?MZ~0fMH);
znG&fk44W6G>q@^xLv`+jt}l5OzQfhEi;~hE&?1p*F@La#!~rbO1K$Q<>0uOp*y|Rz
zT%U=tO1y4m$3uz{qQOr9Gn`LyZmh2O;3wsC>%^qp=)1ndjyd`CiZYEFx9zi#Y9J<k
z|MBC1LVxbk+iL2bk$?Z4{_tU>`NM>$ar*k!2)aItqvIv<ua(KyKU^7jqdRl@hH2dl
zE?x4LafCjY|7+uFn07W36}03`U|1SuZ$5GLrZO+Il2Se*a@;6d9_40TVQgm*tx%LI
zn4vX2D(5Gv86iwx)ho7ut`89R%r)gpR&a)B+0k1SPgMGgiKQ=4Ds#<x^5md_YmKFi
z%`2O?0wzFoz$L(dFj(6B7r{Q=ZZYpl%S%cgAcdCqIgT8h?v2?aD~{h=?*m_AePCAB
z<Hs2V1?H7;kW`K#xZ*T$Rwwm`CfNfKjKq3m00bWm{&46spFu6ZgPhzNt_GCHgW#HC
zR4UBBos6m#WQnGxrsozWnDXA!E8@ILmd!eU6my8ilxSY<)~y>7HIB~9KBysZ_yokK
zN+S$ay~hZT`9?;r3&~7Hrx^?P&7cNlGL@IRZGe<6n<XBiVYUkvf5r@%)cS%-F24P*
z_(4O4_>%yo%v?Bc-aLd4*FF3AiU+M&G7^iX>R3ib4z~I%<Xlh6I#;e-DL<{^Gb}o9
zucv4JsZ%NM;vCj)%{@Hed-aM3s^eANwr_uwvnBt{8z-0NW`4=br#D;lxT*5VrQ`Cn
z?}d*`mz+^ual=XfVM2cN%6vMugf64C^VF>)1`YD9Ol`L4?hL*T{{(@eXJJp@9RK(x
z=QL%O)@|D4AE>9v1O2}-E-}I34-@EMFmK)A*}z)L*f#S=5eqfgM!9-*9cKFe!-wZK
zZ;?f|LFUorOioNn%}K*EnEaiByy4Q}D)cuh)WLez(l9PZt{B*E6$;tcR%{B{(r1#}
zDb7Gi$~+P=xmjiFbjA6ua(e~cP<l|ivk?*e=}zVYoVbrgA3vTOV>)AopP{>gFr@gd
z>UY*|Zf?>6ji`}ao8^*x<3@tZI}lO~*T$t0FBOH$zi>VW73{dTZTIV2unkyiW}wEy
zEOGn&sk<m{VPLhji8IFnVS##Rm)J~`xZ0`-oB@a{qPb?)`PDc9={ZduvQ_D8`AzUn
zZ*{JL9(p07KhX2`F%qEzT+ftDAs^;>G*~zgYNFF1r7s@(nmQEn+zm9D-GB0={DS{I
z-dc2&q|8jlbkN(Y>)H7z#L06AOjpKX4SG110l>{<7+=Vx29);YPRcDE8YoI(E?a!P
zyuLFyF>(3!Eh*Nm2#wdw{~+WMz|X(lYebwfn0QlfP_Hgsy1Zs2>AN+>3@Z(0POV{=
zH@UN2+`;dqmz>;H^V*h9$8U31-oJORsI)X6JBAI#|HX_s^6s?PZ-yfknm+W+_fIyM
zdF-@`^Yr2Kfgql{y7%l+Ti(kxc}olyotu<WTPq@bgmOZx2MrhyOFWhm*C~2pvc;z!
zD}Vp)`7_a-8qlc;I~P`tt_PxdLL8Ofu)J{Y)^c7kdc{KNwr#aoNYAFz!1cG57;WY9
zNDs_~KA5q$T}S&NH;x}UatFrP-?h%q($fW5ydW&M{>{^Md&on(C@Ma)c?+B@F%I^|
z$2N@U>YVFV@pkULf`pD{_IF>rI0hesBX%n#rPImg#Ds)_fC{ihXMgYcb9wG+1URFc
z<a=DTjQo_RZtPQ%GL2sGQFbsSH&bc~G>thD-qnvY{bod;`sBjKD>=K5gqyc*Y!p+H
zo2;LW&HaUfdBM-P-XfrQjJKb)_Cm=5PN1NuwQAK0{*?zq4g4V+iGmT)RR)5C^T<Et
z<>d={|BXXyrP8!^i65jWVtwc?{D08&&OLg#%>MH5(WCT7kM?1UP$p4RQ<EOp#;Zi`
zpMU-#IE>G^8yjBvkQB)@r=!o@Q<#2W<=KF}B#YEH1h##4VI<&k{{aKq_Usu_s#f~-
zYh=i!%YXk3WA1s_!;)?|>ck1-q83JCQJE7t-?<!XAU#hGVK=kN(WtxX*ln$K$!s0{
zx2pX|rfjZ8a?9oE8D0&SzDX_Q)Tt~nDSFI}sQH6;FvP2qOUy1tGKzy`a2<Tc^QZzg
z@FH`&E8`9yR%8$IdnFJ2?XdkVt1Mc_Hr<{_bDmj38G~}POP1|%r>8So!>4_x9$o<f
z0e8C2T$<SkL&g*QRd3L7{^)O_+$$$^VL}&$`>NeMTYURnT)BTeFwGhQDwovRKKFO)
zXTGz4J0kVO(f|W-wT4z;>3>r*t2(+Eut;1(`)y^cDGjXqP4~+QnAP*>2C$Xr4{q3%
z{0Z_`ZcX1j`iAMMRr8=6pzB9~V!dX~%fBD}vRTpOwyJkx_`}f(3IX|Zub7AQ-7s%!
z`J1Q1IeT~R+!6iB8#Zh(ADzM<!NLpR=-R)hw5zr?Ax3Dw{c>t*s^OLZHG>|Y2q|Zx
zCI4ApSN9#qcsAQE_S&_(ct(QFp1U^31I?ONMsBFxeCtU5%q-YsKWG&zIvx;k@L)=G
zT;&pB;XZ$RWb)d~xM~20?N-(E*>Zg*o+2+&>eFW)zKab0Xi#?%wI;Rc+P!-`ZL4_R
zuZ5}2BL)ifUS9eR`7YT#%tP|FGA}!CtphPJBRh5*{w$=Gf0u3laRIKy6JjPU%9_L!
zYtt@wd<I;S5~(n(P$;s3JSQ*>5`%d752zl9m|GRk%5?g5OTMatHYO+QQ~J%V`6nkQ
z!Mw}m+_^)sB!I%I<F*dKTil$bYMdXr)5YZ{l%JA0AL@VqR#mR8KJ?__L+!egnFmx6
z26<M$yytjTHD%?ygp*Yo=Q$>8zB}h~I80|QGaeLZY;0`SdjnVZS>A|;jJl5J%z3(*
z^6wj>>^g<IGJ0fV;NVf$WD8%t>c-}zB$bzy$=$T!T;WryKRDq4>{cfIJZWC<(sSYO
zj)~o!AGh|7Dzx^wKGthkWwx;JwTa;|Ee5cC`}U<v)NsDnhs~@e-Yz}ZTSdk0S*7V?
zqvIQf>2zvLffc0zl7tIY54`Mfenl;;t(Wy>b{rVyj;FnT9n3~2>Q^u_UWcg|rX8+c
znQ`UGnMtWRU(Sc0Nyv4Z>sj<L0qk!SGaSpx%k$00W`}llCR!nTDtsO%>;2(+XV=8>
zS4vin=-3($rjO*-jA4%xr*_|og6Fc^yT$>#yyv2Pt8nm28VIEivQ99_Sb^*WGK>#%
zV{M-F`26(DJR;(_>(?*bzI_@6No~gxQqDPknu*jJsb{($2@ZZp@Vnz%<i$7U3xbay
zzfZ^bTuaNo0+w1lRM0(U+U;{aZ5oB#vo}=7tDG|c4QdDi8Ke44cx&ab@T0}i;NW!*
zt<@(@N?mf#e0IVtvlX51FP&ElMUX_!(7N(<{KX~P|A7N2_is;^6l=f)2S>+^+sa5m
z$@0&gn9N54%v^rbcFNA3<CvV1dfs}oa&4?thJ%BHmfh0?G3e`{!Nxo-CRykA(w{Wx
z47G=ngXNz;OZu=pqwHelILrwibuTPMdGnwON=tn1%WnoYC8fKWt3sD-)uGK#OLs?Q
zIyt_T_1;T3_U>opc_JsnnO9giSWPPiK9HMJ)<J#B(hZT)wY9YuMSoCD;w<3+L@%m{
z1t7M$b2|-A(j<L;p%oa<ru!ly*S<2>=1pE6hWarJDub*xS#QbK)P|oOxp9_k!!7QS
zq@z`NpHrx4e-WftpeaIm)rq^ty3^0x>55F35B5t}{tr#(0oU{Xw(+clB0Hl9#|#Of
zM9RtP*krXx8IhzyWmd|_9v!kGBT3SvNRm;KPLfK?YKs&yp7+iFd0t-s|9PIrso(GW
z{e13mUDtixi%j7}A!>1~+zRSoWMm}TA0dI(!22#cVEG3$h@5zQe)>|{+;eePq6g6R
zJ?{0pgS@A1dS<$no!yQFA-yRG88vX}GZc4El7b8+v}n;neBJ7{7%fWTq{7u60}YYd
z2{+?M2u8l(G59ylGD9HRs2R}oY|jb)A2MZdVE&(8vS^UzKW3iMcM}pC=<Dk%fQ<Ch
z(z4I0hE_wU!^%|B-HVqmiwX<VJDq8sUfe{HNBnrOtP;>!`ly|$sp%JB_zegbzEV9h
z-gSmb>Sp2}+|b~d)}FTJCSZz<kN@e3&t^o-wrz-<CqiEw(Z{10VH+LFwNG~%wJNBn
z$m5qdn$Zat(%E(I+t-JsV}dY3f{O3PsI{+^5Df6Whm0IGYLZv@@1MGCgaC_y467|0
zwrp5u=ArCo4y^14-j0&2;Qjl)_4j$w?}x43{MSA7cO7^VS_20Hzo8hqXr4Ua@MKO6
zMZLnEw6s<<p)SeDh-7x2CRbTwwAz#@k6lg~`><6}^r*p^quEzaCE?;aam4Ggvd2;{
zfMmisi9G>G%XW$hZG@yjQv)8cw#a#x${*lF+f#s(8JA}BA<buI+`D(~rg59IQ)s9W
z432_qfLK6#S5RKA<@SIV^}ZcjbPoL2u5L8iVe{bZyLXK!Z`pEkpZ_^s%uyiaD2&w8
zE2^yY^YZ%pX0h;jn70e(0^CzjQj%Tiq!ebCrUCUe5zs}_ch!5F-iDzTacV;rH>N&^
zHx8%pp}4t^bqykm-g1P2V@m&r7msh%V`$0Q$2V^HV<Go!_1s*n%-4$O8}t26Vl71Z
z)Q_8f)ME)>b6tN#Hg$`NPCkFOVs3Ee%<0o%N-rfONAlX~(sG||^2cgbIDHO5;hQ&J
zMrW4Qjq-f|aFMzB1~5)hb_z60F*S1JNFGIQ-kINEmyd=?f1cj*OAij&1Vv{ZI&|pi
zO;-mh7Vsu3)>)&u@MK@_ZrT}sRMnTvOtR`m#YtAptA9DIS##&rD~;%9A8+H>YH69T
zyBAdilAIkRRt13PoV>-lL`M4DogkvEyL6et@DXpnB3@CmQW1Mz8&t5^7wjh=WJEdS
z|AC8DjcR^F>2E<_jdBOjC3*)34*(W-=cq!`Uq>s91mq5EwD1<R_KHykZTVPNC%k;z
z{7hPU`i!F~TK)Q=xX3+S%&X-{8i&knq^hR&;_B-upjX1?%e+HE!d6^KwXN=KskPjq
z>e|}P%}rx{!Pd%_#=k?zy6b=76)GY1*lDqH%3}2V`?YUWWKmpLDB@pwTU&Y%lpF`?
z31EO9A>fwhEJkKPFHTOMDqEU?-L&f6>>tgmYu2uP7U43Gk9Ors%IRW2-iK*vP4B#X
z`7ii`+M+r0EiKdNG-*}30nRT$5VBjtV8Labq*VYrwuoWKI$)^Fy9!LE<oIqQZUD%X
zMt``|NOXrwiv6-HW>#+V8V7iCjl2d3j~ob$_oeyY&kx)y_p!lsq&GVK2@{fArdGB{
z&o(kPc64>^GkWw1R-qo)w@*|)v=u^G(i}pMG5Ds_Y2fc)*9ExS`C$s(;r0PTLthQf
z;0rS6x;=UK-o19{LO{pa$#f4b=K1qM(W}5Vtyz<aRSGdUR{zgu)?ED_E3MPw)i+^+
zhWm$vminPQ*W)O5A48D%k8sdH&rN%L1pjb9Fcp`8cP=?8&$F_^-M_9yB2xY7(-K}C
z@#7d$u0jrS<)>!p<|hII{1LNA=kfA}FXWaeWWoG@$$yK+bq!~y7;c0T|EB&To}a<=
z$J^9OyfrJA>gwupCxB!G6GWGSEFvn#>}%#>e<}{Zt1iP{fxmvG9~Bmczl~I2570-e
z7A<z=#CLFXnANvk>^%@HIW%=W2%5S8a-f?Nu@vvuzdyP=UlO@uD~;T%9YzeCl(H}+
zWi<L{f!pXO;G*||gn~QdoY+lmG8~Z6-Q8VH*Iq4uQ~f}uSPD4lAiVOoC;%nE&<X{-
zoO+qT?<Gr?kSz38Da(=YxoQhu8W`A<6U$A;c-jcqed^SyBSr_^xN!*e5CW_QO3j<=
zT)eJhU=Sc|2vLk0SaeiOQSAHss8O!?>>?h;WB5KeQwPQ0cOf@XFKB*9bTC-kkY!K1
ziz5@ihlc@0IMd85>*}YAvoGhY7<v<BwD=CO;uSa}c5VP4rDDucRoCXU2h5y2pt=3R
z>fuL_Ntc3yg{~#jZ$USPER`J=|6E9d!x_cn;Wu5(ES=hAg8cro!RC1@BKZ!S?Bip`
z5@J|#ZmvI#->f^J2-bRD)N1vGk@jcKo^7~!^JX&HxMj<h?Wkzg^IlwE{x$=QI!q98
z%%%`G*LD5<vZ5zUS{k(MAi<@l$!;sOIuhK=%T1B;N<Y}A&G61D)b2-9>>ZBJ-(!-9
z%=trYZEqMFQZH}f!{^V3n>BPa`%qW+j_IP_T3Wj)0^4-!mWCJXS3tqXaC$PoM%Q;Y
zGqXL>pryJ1OoGNeeL8(H`>k5GJo0x#?sCU{%`7$VJbXCm=x(h616B_8_h-3p&fpCG
zHNNBVP5Ed}m>@g_X1kRhj&7zCb}X>eiTCdZ6%QKk<2IcfgNs}-sS()W0D6lPLb7mx
zT`}#Ux)HTL!LR<96f3TLnI~b4jlr7;G><1_g=5_};NN#l^WngK=*}HOL3yC9^p;+^
z{r$~gf{!eN%V~I_Ck%p`4NJhjQ~yh5=6I7G5Y-R=^5w`6ovjNmr;G-e1xq3x3io;?
zB%~8Nr2knszFB$(wHO@89(ZAS?u@%&+~WgLyY}bz4H_HYYiex%?g$vCZ=Zr-1e>*o
z82(ENW}Ubl_z2#OAN`K?k_fm`?_-PYss?x+-Ffn)r5L`kkHO*F_u#t6HJ?5uK7L#l
zls0Yh<i$WSkPc_lm#OL4_zcSK;TdfZA0IDqlZPT9LYO4kD@XjHrX%;el)7}yqV~CT
z@nQi$5*v!zXlP6|G&DSVzdJWA*O$q9s$(dJeVkjS*ysrebx8I2!OH4-hp_F`h-xp+
z`D1^6>4|P0d7<}(lN&E5|21^iu3gBdT2N*%gR)c1rUH(n2tls@<`(6%VlySB?m9XL
zz$#C2XGz8sC%Rw01h+jFHPb(4^RG=p4_>@@`r<`&3rY<eHnh2I>S#uhSuEZ^q}n&f
z9pbGiqJ?Z{)h=#GPYWNWG-#xVllDVqsj{gv0O{o;3D-t)zw}IM%kg`cIyi*hJPIX!
zbGgo}^D~k6{zG^8w04s}en0L^JRwEd`|aCCp*Wa!pdy-i>96rk<q7~;^yFnO;bAjC
za-1nCvC(8SDD}qj3?*gddoYua7lUETKpLJ($3Ry_dBu-<t$&c{<u(_k4|VHX2zy&(
zS`3wyG(W!JV*~0n&P+zI+2BQs7RkKPG=tBx_s&;6m$_m&2o4@>%+cl*=C;{mBD2Iq
z5n_1VgQ-*HVInOkpq94daUh7<#}}G2=Ws%@Wx$<VhaaUd`q8{a3*jCKRAP$5ZF31Q
z_Bq*m*f2^OOHTYlm<w8#o+C#_8z+;(i{!5N@4r(TKIE?qWPXKEfmwosLBV)Vw{9m*
zfTLZhujO>l_Y;v^5BA>>t{YC_3>(SBcjGNPb~FVch^ce}ivX=0F!tB?@Im7`w{QP@
z0x&A7svuak=+)9yt5VTXmgGc<Hbzt%locPr+Qnf+E(x0kwugs19o6a~01D~Y)zy_+
zT0Rc3X(>xCuKSDw-Z!{FB~TS#?-%uI<1VJY@;q+%`ZmLVn35fN1FUQMn>I*tkl0OQ
z%fvf|X1m;`1HSpG4Q1*>Q|oImBSY7{pDxPVnV0@M?Aztp{qmQOQff`Fm`AVK%JrvD
zb9P(Wvg=4y%KC-z9W*yKr}4q=Ybp&YX(Bo%#`%jHFOoJ^ndi#LNir%Z_>RFOe2_C3
z;6zFXw?!>}=NlzzvF$l2@EbGn&fWqC2q`e4ys;CsI^gekUbCN{UvzA&4t0h`c<dsd
zllNhB^Ks@EsGDNJr80LC)eOg0@-?8Te1W_wH@nWmmZ{3O8-+{=%S>y6XpQJ7JD&U<
z7_U>{z{K2#tmGi6@R1RN$bcB-%D8WJRiQa<T=A0<J}L0yfBkf$dlqv#(VU*2S^Yk6
zugj-QpOc0M{O?*ZWuA*+fv83x(`V9}vBPfMq$GfnoDADq2;EqZl|UR4%n?RJxwJN_
znchfI`F+h0sx2vR5ih)W@j`~qBoZRvCCV-ZPaSb(>YpUm)*qCPcGk<Q31kuzk+aOq
zT=P5<K7IVS?x87Fd6!M^qFMPN4pkv@i;?q?1VmW@xPQH1=DMBe2#K2MD<Y7@l=N1t
z8V*knGtrRl8)`No*11f?^U{_8?Z0IkYp*|giFMzq7E{mThFJ`!DO)ih^b6={|9UuE
zQua-V=`iH?V!4ZCP{c3Hn0*JyyiY?2LrLMXE-flNJn`W}Ur;u*R`*th#+j^G(Vx3o
zj4Ya^H=gJKR^RH8-j<YeV+wpTBXQk(_S_DFfy7Js+kvOg`gD0|>AwNfLuvyBHNACx
z34j%jUCNuMA7uaQZ_9dCva=TKj-+KW*7DV>ZSkEu@U)$gkA~5Ll;)1-8+paW8E{Nv
zZETk=edB!4-^|SR)w0_u%%?@#XWylbQyaZr8%ACVPbP#P8XR~(Ov*EmMA}YHB){r!
z*B$?bkOvWCY%-oVPYn&94P6-po{SrEF{MPM2j<qr`Tpnkno<KmoCAvT@FewjQB_qM
zs;cvOR|tkTEf@?EwL28xF@JxWr^irmARM;~&j#ZTXKiPf3`gk0YsXV=7#NpS%`j7<
z-^;(&=-h&kpmXOuAs=U<%V&EaH6sT(!YzNr;mLmkm$L^VC^E7Gw-iyt%gp^(|C&np
z1U%(Oaf+5KUFsDY>Tpx9EaHUN+oJznY-{UHUB~GkQQpSMU|=Xa-t{<hBlY!*D=Iw9
zR&5m?a?tA4t7*4Iwg`*F;(6b3w@;Y$(GHzzANdCEgG4-&l?wzQX_AtYlTSa7H1b(E
zV*LjznR27vm>0A5FN#Mk^VoJsRG=y{D!Vk{L<Llp)(0N>b<{IR)hzZV@X5?Yi>^$u
zwWwD&2~5Yc^uogNB~?{>j~+D(c2cKrW#`&Z?A_R9OE)Ecq1F%fY1Y|CoOtifYZF&I
zY@;Wi2PCXiqZxhO?bny4WPhRm$m`cuWcC*=63ShX*3i?>n?K+A(HCGR(Wv150D2;O
zJ7_xS-$l3<a{2O3;0z(yzyyI=M4t%Yz5Q)@YDx+!g?E%$OKFuPB1v(UWe|Dg%5FYq
zI5JXx9L-#8RrJ7J9Z3C~ej-`Nfp0^GBO77Fm-E!-pQG&3PG$x?e(*r4OP4MfE2iRg
z)(4Tq>Wvz70>tu3Q>MtsjUDHXbX4+$e&%n~2D119QaiSA17SWJPE^nZ#IEP#<1JiQ
zfh|$`3bumM*SBob;N|JLxuzgB<cvWyAg2)vVUofjqL&z5^VC>$n<L_aP8tFJ<xy-W
zP@xfQj2A8RU%pJ|6LDOIhUNx$BGgJfHF9L`+H#}ZDGTOK$SXbSI(F<>8EB$dVu+UM
zmt7$Dg+)bofzyFwD+_#3Yvc}^|KYFkD+fL_jb6K%uaa3hiX!Sy0`&|D8OSFIxtxz#
zLmKIM4T2`i+ney+-GZ8hy82E0M4#2@64Ur7@*$X;d_zUtYv8~=?46Qf_fG_Zp?@@7
zIdk%?l)!>$cm46}<>*PJO@Y3A1X8D>t4EF=O=c_)hzi}E$kZ5;$-G*1mIO5VM@`0P
z<|etJ=5<5$5bxj*ejJ~k42EPe!Qw?hNFvc#5%Djr_}oy7RHV|v<~-@Scl6r4YT#m-
zMgXsV8c?=^h$*o^w2=@uA8RtE<*zxzbCgAC%pIJ<D)t#Nf+!yn-@lCe9)Or<?^EK%
z8G#D|dV>05e&We!j0{>#K7aSW|Jq?cO4lqBLv(*NHS46&rZ{{^`?WCO!lg^TxX*lK
zqU@l*|0HrM2-v)N^Q`RcQ|Jw)92iL%kJPoI`csqZ>t^3LPSFk$^Iuk$69Jb9{Brc$
zmyb-1gBx>tnI=-ihu51rbKRGYs+yH1B+&2taOrm_WceTtC8MdcWST*q97Tb64G`Bv
z8N&Kv^{x?wKEJ0Kw@)jjr-O|T3Y!jQ$RQ4Xw<vG%^T&@{P~yaG4!wBMiY0Mk?(1d5
zv_@xSPk4m9@pfC;Tr+kmefI`jJ><c*GN@4nn`AAq7dqEZ6;b&!7c6+$uQ)a)MrPaY
z@9j2?Sjh1ZVG?2nmq+WR4t5<daNrx-ZiVQZH-|E@1pI56dIEh41qp#?v9)#j<$>-D
zD^NJtF;PMRQ&drr%1Fi?8Y{*Qd+F;R=f3*0k_sVn3RK5e-sUKeZ|j1?C^+ZGyN?3d
zWc*?)eHOqGEo^f#fqh==VSWHNnt>uc$+_gaXzOxfvQXDF^<soE^~6Zo_e0%w>(di`
z@IUz#@aHfe8-mYXW#}UQ>)S5B^DZ;<A3rVz$YV;&kEDqdJDXODSBe5~E15Ek)Wq!D
zf}{iR8*Wu`6ZpNH0)~x`Fj*ToyMN5Ok~V}(d1r*~k6=^J&~IKnl@S1Pm^4so6oyl!
z;)k(`%y#=)y28|Wz))=AM>?0b<JHlD4#i)E#HinElV+t09~;X11YIoSpb}b%s8Kia
zuUxv+kD}^4O$}KLWAMEI3?wZSkLNOK&$JeJ<-L;kY}>bQ-jq}v%?`C38@G8Z$hEZn
zTz<L*A#CU!MEHKQkAvs3(1HRh`VCs=wN)N%d3Dn8u+ch<YDx+>(WUo49MWG~?-@cd
zp^TY+=Fx;f4^io%KwfrXPe0ndbxp~c^g)cHMqgj1-K*ER#AHZnv1*`!1aD136q78m
z7zo3;{Xsr9foBIm3KK}CJkhZ~YT9mZ+Uj}01eA;Pn=+C^$=s|(3#pxYj2_*UdI$l}
z2mD2#tvTesg}2DY;(ydGq%vBUx8XnfCHqyYHdMvc$y2knwWTS-SA1Jk)ChjzcO03{
zQJBI!J253GmFli{9O3VvVDhd{pYG0FgzPawC6qDH)Njw8%!*=YYs&-h*?`pd;W{qv
z3JePJ#-Kq4`v_YIx3Q)5+_FU>udXdV%@EE)lQJqW^Jx1ZnV5)J@DCL}uTVX*V1XQn
zW-2O4d3gbn2fL0ka(4dw&#BGmvW@!>&3N`K+Wo5zL4ith2m;QzAtyamvs`KWfuQ97
z#Xi6x4soA5Ffh^s>)#`Do863!{fGeJw@dNLm5-Utlavc_8lV~{_3!xPu<-DVf5zwb
z_4hw`&=}}u&0qf6Q4W@Qw@DZBBEg~MF;l`w?S?ukd%~i1zS8!Nc(s1JEJz@DJG+i_
z9ZjajIdp6_M;tyYRPrI%sotv5yc<G(l1#l{zup7dNu&*s8&kW8diQ-xz7dMxk`B%0
zFI-3^lvh|>Q<--hU4=}Jl8yX%U7Ws+A(n2ROTBL29t+f_a5eYcg>&cL2VCI5)1xCj
zbX>R220H~Q!=Eg*u^A7WAeWam4&D>_m?;PkIXiB(Jq#Vdiq3|QHs<<f(bta#kr2m?
z8`oG-5j{Cz0T}NTS8=oFW<m&9lL(>5M4KTWquFih$MoC4WY~7>-P;PZ5}Tgg9GtR~
zv*yivz-G{GWDnRcp4PT)+iE&2;H(Bt@5^YMH6KvX;fL$ZSL=&agaJjKns;W&qc$@g
zr%-6AA!-#?Psj-Hp~j6H=YIH_73n|x^*`<(Ur!QTIy`&{oyW8INj@$zNr<+E(swNq
za~igXROpg|k-2MXX(_W#$t7er7@4^53fjRcjxZF%?j9Z+0LgQ%Jn8T|Dh#5{inV$B
z`DM!G+5J&b0?Z&?C`J)WQ?hly!5)|t^rFn@H!Dac8(HQ~(~_8#6%{=A@7zGvoW<&U
zYAgq=mOgMw?@Ccx@$)6}P2G3w@&9W9<PwVk1AV5r6(bifDHdI=`I(rQ`DXtSbv)5W
zUnp14=h?p6{L2IN3hm>F?^+QEPu|E-FPp^1qa<y!8*~#F0wkNLuQXYA9BV#^>kmAo
zBwYmu3V}<mc1P}C0#%Vibo-Ns54WeXKB4El6PxKcuS0yvuIMhQ%a9nbfcKY58hYH_
zM~`--;==3&w~Ytj*3P4jL6XuiHd=IPCWC*`m19ovp~!0#CW*A+q55+tP*8%&@241P
zX#gIdG;r7M-8MWU0p>^=sFht4kBmT@c{F9%Ih}COL9~3NF<&2_?!$)NnDtY;xntB?
zM@QLx$UBFZR+M24o|3r9ASt%t1IT=>(10E>8fYJwW;k}W<M-=J9|Nh#8IPSbmS>rl
z#$-j+&nZ4`jA7tnq$|MBix(mvr_+eNIv7niK2ZFhWUd5NYwVI%-NvNQ0(ru!0$=ul
zaDuv^HXhi1tx=7F`B(L&yjcV)yP#;OQs%{N98QOA_2on1^;1%A+}ySx<ax!-WxPIL
z-$@tT@B4#?)pS}7l+H<6^VctSR#N-Lp0nUX;2hBS&i(s}n>YXJa`N&0`?58=;8BG6
z;2}y@OpRqFy<rFSXVgp0qo2*KEtUPS1DrDtv$j=PO}ii{%g(`J+VttF3fwtVBV|8B
zSX`Y|nf3Bz=Wg9j9k*CW=`uuDxAJ;v%9HL7O&uR|{RDDi@Jm2l(7qEVPP`obWZ$C{
z8W7+ZL!^uXFX!YeT>aB4x(R*EJ)$q|wA7j~*Qn{FUnZn$Q+%-08Qi{DW$d|7r~>YC
zTY&~Zrn(wzJOrQx<vDfz`sngbe+iwSKa~)~g-v|+>;MizaHU6V-clGoOigW4Q}_55
zBLb8sn6<WT)8;M+OLa{R*udP_nM#@qhLB_;j$`2VhEfw!rWv3Q%BTN{?4v45U!O4v
zJj0=5_CIfldI%Xx#A0xJ92H|17hU|<C^M0AK|}<h5Ul&YvSW?zuwjf*HeOURtmOUs
z^@(8%4ur2N$qdf=XBU6N)1u2X_KOTU2_6MAdvS`p#WWVV4C=Iyt*<#98gayDWIc-y
zSvhpbkOZDM;YA})(b6^`X`qv<YjRfBQTl>|z_~MK>>u~gG-5%Xt<EPZZ)rNX(LjOA
zpKkC{&AKqXuS~TuY#|0^->K^m4s4@`k%=k<=%&CFM5?%wrpsEOwx)qKhLD#^5upRu
zqoA_#RX>l9za{Cbj}p57A9y%_wc$|06s)5Hss=S}D+PE6!PgN<N;U$krz9z$lRYE*
zy*KpZ6}^b4uw~NKhpg=MHYGjcri8?emyZnWv;dkO6M@{qoJ1t1Jk><B7h(=VbcsE2
zH=^0yF3vq!6+#7*{^EtwqDk)~v}|vw6C9#uuXA2CngD~0?@X#wMENwFBH0<3Y4H5T
zi;MLmjIV|SUd>k(C$aN0TeoTR>`aptK&??J1anFS@T^fVWx(4cq{#Vs=VxA8{qmz0
z?FmHzg|2AhX_`lrs}J<lUWA@T^r>E6VKm$Th<CuJsmh3sL(td%(WfpCzpKMA7yfi;
ztlBWhm4y1j5Jk4lFH;h*$w13C=Z<@+hM@4aUb2KLVCz+#@FY&E^FveiVg&@K12Re%
zLa`0%#JoP%pkkSk{cEbv&3R0;aUUrI099P4yo-*_vtO~|^3{W>gkJrSwSP;JZ%!9M
zOlC-E5~_UAeuSsa>ve__ZjjH*>Mfjb0d>XxV7J<khLmtuu3XU`K0N8a{|@oJMGV|h
z{dV}GALe;vR@9klx17BY^t~EWT9G$1=)8)9nMg}yTkDzWYX3ERI$a(qD1s7Hsqo!g
zux71VO*k_>x8_)6|Dn?Y^pZ%IVG$8Bt3f<>UH8H$HyjCufQe9-a&V+aWMl;lj~Rxg
zoJ2-PeG&IW#!OoUc1I5-T_Yg4aHZ0M!*wh@FilA_*0-0AS9JUlL=7m8___-89x!uo
zc7Y7|@FW0SWF0X)vn{a7z2xL2z}5Wg=a=k;T>^IsM-wZ&4}<a~u8OIKkvERZ9tk42
zlp7Ll3JZVJbW=-3i(sA#A5WEPe&IgZg|aEfLj$M91tF{F+?zdf=3SP5jG|3s!3Svc
zCs3}T$ip^LP@!~wRriWCUUo8{hMQ^)##zh-&(FmDD8F)~{^Tj6DST+}bgb?ui13kS
zL}ko`Db+mxW>QFZ>O?ADSdqHHz<k4x5jn?eTpp+&iXJEvB4CwpfAlu1za$s0AfS!-
z$5w-Ai3JwFQmI+9A<M!oHOJQju8B@@D@K5_7D@UC3V#5O!qgG@HazLRhac&VRT5-J
z?A0z@=!1JpA-%Zf@Zk}U=YIPQd#e?d+StNS4)V`_wiuyDn|AHSlM+S&z%n?Iz~B@u
z(*9GY(kSfpZ~b#Xwdl!AL<4-fFF@bP=+mCyc18W$jmE3u>L!^<1L{O$k4z$g)sFL7
z693A=Oe*JTuX*1rmr$T@m)Q`J57mzbU|Ncave;Q6a4@M`4}--J8y8(_5&lmU1m|3<
zUU6H}A!N8s3_(%=`BOY}@pJraalov~EZdx8=)BMz8fF&$Q-tz_U*>Ul9ub($e4agh
zx+WtFeb|gSbN(h4y|b|D!w60EKt^H+1BC?z5&~3WhuogOG-&3f*F<1uji^?(@7OV6
z!VWFZFTZ$SLl)mt>Yz364v2Bk!oMR}^C6PGYDSk}-Y;f(S1cgs3Q$K+t_nziW{j@l
zz~B;QR8tqsT^8y35%qEAo&){_4YuMnGOK-ze)%;`kQL*DFLH7Y!r<c{!0A$~7&yrD
zeaxSMkK`v${2;(Erb(DGIPM<alrpMFpsvufB(sCS^a8)RKpQ3e9YU+=^XF$zG85Kt
z-56FE13B<O+H;Hn;gO9#bMdHST)9YoF)2l4bRZS8EG$|Y;!Hyjy>obAcf<p`eSIG~
zr_F-XXE}ySGWT8}F9o16M@A63jEREd0ZIJC@eTv702y=m@L8?2biGibou@4%8m)^t
zNE(Mcboi2cBy8ii?^~$#R>9#hBHs`x2MdH#h%!01hTP6R>5us69bJ=5fr$=z;q&I2
zt4~3ebCj20n<W$Q0?;!dBP($j8$6$5Txue<IU-np{Aqc&_zNCk{^`}QX8rm8O6iAb
zV4VjXUD&1r-=92}*F%eGN1$<pFv*;50Is4x!!IZZ7P@*5u5_e0LHt10|F5{unMxZD
z34Y6a5#oJ(YTT-@q+}Ae4C<u_->)~Qvc)Ii@Z_Z*Mqx|xTaAm30v;Mj;Tk#FrH=_|
z*GJzAuwlER%3|o`JBPit2|Nhj5Vw59{SL%B!QdE&-a`LQppb@y24l<5SAJ5>m+uCU
z8^tT9srb+Bd#T%Befso4cG&=_TLA`BF@<qp<sInKHJGE-gZAk?a9~L4%H@NZIimh*
zX~+<)TU}*bNt`)oJ7N~u2KN?Zd}CTfR8*ht-4&MUUjk@LBTnnR?;pEYUJ`Yd8dWO}
zi=F>lG57pT^SV};qrZhor8<#80&4UNWsL@UzIP{RS^xb0Rb&<1BLN6@?AS4GS99-{
z>W>5+ftdhK5(tJcOJjGl`&yzeQ<mXjVd;MjJq}zraG(tEg3yqyuko2EAZ2RF-rf{p
zI4=Z8GJN=yzFPE%%|x~%Ljj4fc`px7Y+GSR0;T|>WO*$-B86w;0-)F5|BFOJaw^xk
z%epZju4H$A)KXyUcprVjQp$-w5dRsyPc!8cO)V}imR(3s7@mNiS`_c@FYv0!?P4!K
zTDd%80p#oa=rx)$Nks;Kk)1t+o>;@9W`;SK*0Tf1suVEn4$9gV>17gFNOlt58FgBo
zt}~H*V^VuqMIq>**w_P|O6PkwZS1eyQg<1=g6XCp6eKq;E^biCi~X?AVkrUcD*>rn
zxBUK{a@fA$PkOg-um1Ztcz}oA`@iOeuC7~1|AluW=7{#rCviJNjF^qIZinSGv!U}L
zgtk?-G^(*j4JWMhfdfW}kDxLWl9MM?jp7l51hz*iN_$Q;afpch%2gC2E6&zOW-o@I
z_4Oad+~E9q^HfoWgI~c7y%VCF33EmDGI&198X?jUo6|(K?b>zs!Gj0O2UFRbi_70{
z+ws=~Tv5x5x8SsCX(E4KZAW9k(W6Le*nQ+kHFV_|v`v5w_}}C<#1qGk3oE}VB|yKU
zt83kAKa1S5pPYm1wC7L^8*0*J;<ci3U*$&bj!Z3s#pP{cH0CQ^>oHm}&;nG=^yyKf
z9MGls!7PCQ3yFm{F<w%W$a{`XPKht(l`3->YP3Kl1yo}ybS>C3-6ufDTTT;2>bNL%
zb4qLgRC#Fvf)gJQ;!MK?Rq$qXD!0DAJ<j<kc@mPjg8t<VG8te@*`Wm)52bbI@T$^y
z_9T;2#F<cr?dj3mwQtX7n@XjLNalHxyg-iUNcS(-u5O#bPB2EIC!ZHz9QW_l$q-hD
z(8{CN_$q>A@|N=_e`vP1n8sdU(jMA_%zu8$X^UVwXfPi*Z^3*h)Ra39o7j>029*UN
zM`+<Vv-bgbpo0K+t(Gj=&R{rqkmGqkD107}(0?@4V3rwqBjznwu-L{%9F|DsLD#Nf
z+1`%8<>=yq1XzH28DW&!KR9L5ne6rJ)~y@pYmG(SNkg5#iz3ndwbsz^Bf8@m3Ss$u
zY;C1M{KJnwyZtJA60`heFN0^F_D3Gl=zGYx-$Z{?!=-vnyiFaOcXKTLZts*|a^k|n
zLoxr1c=uv<yTko<I8JZf`pRLaQO5lab=Vj*&`a0gTx!^;A3qCR^G^-ywW|4~jKun{
z>vQ+!#vQDGa(LK32kV{VLR*lA5s9qZ=w^1<UsiDP>QKaj*R=E<UsMs))@YGzb$55a
z^>|20Ky_VRvTSn`q7wf83zK1hzrrHJL9Y*skYuCj(RNo7UVy62z3|21SxTNBaqr61
z3&z+=q4t?>X4aIAKW>#VJM@<uDRuI|x)AxL^aTQ%p+Ff4yi7~71?nbu!@GcJ_%KZQ
zT6h_y+x6v<`R>rzs3faWfh2*pRJmf*ud97V^2=>Tw%(QWC85=(EqI11iJ*7PukTKn
zA#v)2Ps6$Ut+#K)f)FwO^9K%`Hvco7q)2#kZ`EGmo8&rYnM=DSF3>>wqE`Cw>C^V<
zeLZOr*y~31`2i(eXDY%Zis?1Ls1yT*cw;3lbcpkU1}f-y`%4o9?jK$(*(*C1Utr!N
z#wc)0Ma*P0PSj>I1B<^3{k}>rrLU}BU~PhHg~o>0t#MH*$aBsA8_+{kI#(7%G$&58
zZ6$XJi=4KyIUU~SL$KwK%UK&1cw4g~+{^Cd21W5BMH|KA5S7Z+K-naaZGT@}q!^L8
z_3LN-#n8~I;fJc$80FpenSlDYH7{^`=87be6>CwRm2KRWbazilxmL=5CF{fj6I1nz
z+qUe}H0rTmIXcS~%3K0GrUYG3)6-Qa6Th48F)2PjS7bNd{fnBF?6Fp(&%TqN-|fyX
zmHA0i_lw_N=g!UX6@31DE*aAbF|Dfa_;`C=O&cYdgHPL3JM@Z0TxU`Bb1l<ZUnPt@
zd~Ff~U^}^SY=)@aaDtmf=Ilxhv^{BYg4m0TnMv7fWHz44BYNyt8Z~OvdfWxX&{zm;
zLb~kWW03)9I(Q)epyn-Wp2g9KoLT^~X=r6tpptLCVYCJkE`h8is46Go-6LC=$x}S^
zFNLi(`>^NI3bJXUsi~~IqYY?<;WEospgtVm_2A#<@qIXY`+F0I+dp*qXUuxfAm7i<
z_GNnlzE$$f5XjsXd*SFSu{Djot%YY1t896WMAsM>OO`CK^z(17NaKbL=oq*2cspWg
z`u1(Rh&~4&Uc<MWA>Woj@a*3aCoI+j+1d5+G}06YEcwe~8Mkf8t|T3;3yOh{Op96r
zO$%co8!l<z#g-RBDfe@2gP?HjPZ+B(N+tC<G{@`fmk+Ay)G*{-gwE+69sG=$Gq+*I
zDl07jBm*_WxLT^&_PSSxcG7H2!0-f4*VEUR7sf@P5$Sa2^Q%JI&k0n|d?>y_agApK
z&3NAbPJlfU-VD3;-DpMb$(jPn-6gO%#7*(h|2>X|@3KgFAlR5bHs4ew=)I_|u%)-A
zP5J^WiowecYMQPzRolP&@ap$1On&#Nb-Q++%%fZJD=`BHwlV=J;GJWh3!EcQ8cx|~
zd=DkByJIE{Q{9y$QZ})uLnv~Zn2dEgJc@wg81rpZ`8<X(J~0WDRkFdE$aag~<~6?B
z^j{<-B9YkiE#s8zej<IdnpZ5QO-;ws)Zbeimhr1lz!^#k7P#;bvc^60r^$CGE3c<8
z#d|KmZ_L9nefXYtx#KD<1R%t5<F{C=;kE+B(SV`8K{<`TTkc4up_Q!`(omy^!l{qI
z!*<^tgec5`U)bx4ERZOss?^w`g*}7~*-MdCa(IcL16g1GX@Hmiy(QwLM9H{1{uT`b
zu2%|KeXc~b8md0+iaKi>`3}O?QX?&YqGaS9Lvy|QeI$$-y9?TbXERLx!8wChe{b=Q
z51O7MJF2SQ@fgd@2Qo`TEX%f(_32C0p@G>ACUVojZe*uLwE@G*2iL!RmA|P`+`zrb
z#%tDGbM1C<J8`l5*s)?s^y5j|#9J)HQ<CK^l<cy*fJ*YWJ1j$fj%<OI)sL=u&K+S2
z&0U_IMDh4CC&%$~=^Xf;LYTX4v->CUUptvS9=B;cS?58>5;et!TNU^N1j9SK%T(7!
zk4;6@>l=NGe}skX(Z{V-RkzTwvl%F6w4Y)3Eq_*jr`kGw%R6SR=99FTKZxl8m>i>)
z?k#KX=I)wPQB{?1SMV@$Lf>{qecRz|q(CAc@!35r8LW(VpIf}vMLAj1BTkk{vleL^
z(Fp?#5mwJQHp%-W+##ZV4Nw7LWjUdV0b1{ab;8-)jKE_HUjws%)}1?-9;|JvvuOTr
z<FeIS&g|gbKd~`?wi;?L>k~P!Lkk)P&BK!c{kEO@{<;sKpsYKnhD8*ge}*TT0unVn
zZy*V{)3?>b=XHMYv%HzI!XlsLI{sXNR`3<D&YJrAU1&licF@x2)TQS~CNzIJ+T+UB
z@~Vm@4YJ18y&QW%_jpKF<j<>B4sM&1+w4lBzf!BwhVZWH`d@v|A1I3wXfPJaMtgdM
z`_zRZdywW}q)WM)Snnc@$S9d)>}|aUZ7sy6qC@=G-cXI=I)ey{&p)=)p|*SYtUe<?
zcWk+_Y+j|yhtD?kqqnMTcgUQ+evvxXU5F{Ibo{lM%Bpm_)o1##Nuy0ze9CGQdN}bk
zmSxkd|2yrvH0YsC>F@-h-9QSY93fd$Hg2SOFM=jeypXxzdI2r55vTd>^3H&l2n@=C
z7eWEai2vlDDOmV}nv{RmsPB^n(Lqj9#d(yRlSoZjjcZl`HdrqJdB%JxXw<k#lR`e@
zTRlA<dG)RIc>&9Hk6VY$vkIFR?d&pa-@<0cPOR#(yZgJjq1`U;4GzuUepB;Mv2Lxt
zxBk|-RJx>BRaV}FXQ!mjdPiJiBH-?dh#tgSdd`BE?@HFB1Vpy<S~6;N(~Y;IMxN}u
zc2U&rR<+LpvL;xD4z#Nq@#5$oyLTLi$>fbX<)J6jwtLbQwZLL{z!#CLx=fxj#gS@&
zcipp3pX8<5{n$Coob-(wH<(KrBM76oqY#>9zw<ha-__f_284~k7iVlAKYBEM)~xNI
z1>B3HhyAf41Dclc1a98Tm0Bx}d{)XtkNgP9EjTw*P)&76_AA!lrUChiffTcCh@}k~
zNhHWftt+*nmlyC}RnSXDS+>Lb7t)A}*%`90olFcYF8%M4L?w>ABT8nbRhrTT+~=YM
z_;ms!^^8truQEF+_(a%R!Z+tCH8Ba?zwpZ3;0fDKEL6!|{nze~ufuX<LW1`_sy<*|
z<#4k&-Y=(O$JpzZuF7w+4TEp5^6;9$`Z?k|ub*lx_b0ZOI58s@n;;t;I(K&OaAedN
zv&vVm9P={BBskPZDK*~KWz*a0^||K_>az}qdl|$R7sqENwM`Bm%l@tVC_z3jNNB|B
z4^Z8>tdkB!>Xb^DP&fCw8)$8f9!!+|fMacesK(Hwdebh|>MwGpZ%6v7NIDSfN{E}^
zh%`^i6ySEY%hVq7W*oilnikEVE1duzEds31?k!bRD(382Zh3wK+6I&+vPnud6un*>
zGx+$l2iIDwsXdqzfG{_al5FOsL&tCV8~Co=JTnUEHj4&I+q-*vd5JGGF@c8pb#HI{
zX2yZ0ag=huwOK{zAh&V?u==)U8c-ZHJ?ipyK<E}w`tN|Prej6i?h73!an5~A{>rr0
znx4CDd;VyA@u5wT-a}TVBKa!|^6S~IITEI=;Moo4G>rY0oww!Qm>WxWcvcNC?WCog
zSkwICi6&-#n$Ha*!^`a7XJ*CMZawDVY+;%6BDp}wby3OefTcwDNyuwNabu;XNI5hw
zY;hCl3B|51VXK^-OMHJ+3|8+J8vJf)_c{A2uiOj|9kKUa{|9yVoGZ(*<=GA-z=$GS
z&hkIp>~3sHBtb8+jV#S$V%Lc#tv3eqB$23^JWp#iDU^xp;+jhS&3!o-$8%I;hInh!
z3d=AeUevd~l?F2`+Nt_injq>r*CAkf3?AIdHoOz*Lc>5;t`zeE63t|qPUvXX0d}-~
z*=p$HO`UQP26EVkaCyP=-E8ca8JTQHuT~7Ku)`u$jO75jB=$35t{|JN2p&)K@=`;h
zqN0k4NyPKln^xoXP7EY1G*v)zXv8UjW)@p*erQi*^^_J$Dk{$NV??b<rWfu2)<k?#
z)-a5G>ql*lI1^J0i-j<9Y}rQvfy3o90S2@(Sjt!HKdObBNki|LF}vUJn__-#_V8-$
zmuP;yj2fI%B$FIh<_=(MgjwpttNJ-_OE>B&-)N<!-ck3WlWVoj`F{sDe74o<)C9Ah
zQ*-;*Mso_cv5{`(rN*byCg42CsFITJ_-*LQHtj9$zopB>dGGt1Rg5zDe=Wepva;Ip
zvdG(6Uk$h2+FIT!`s#b<sBK^7hDQCm>Ga*Lyv*T9TzPQFlgg~>A>o0_GmecK9YM5{
zL`8w3xql*KBfA0EfHu3c?w&^?A^-6ksc2<wsGBVxg6X`bH$=iHo4CaY6G%20hZ0L|
z<?Xn&i7)1uQ}sKg*S%+skU*@>*g4C-s)F>iA&0uQ-V5a28RS617s0msUw@s?dV1ad
z*nBofF)}{xLWVUlZrD97%|_5UJ8s>&b^N7S$lO4gs5pEW)vPgY@Dx?bY;Pne;2tNh
zg4zbmm~%D`sWz}bzi$f%f@rEqG1X)L#jG%ZyJ`fMte|cy3nGvw?4_7Hi<Kx>56)y9
zQ?n?tWvP&qSj45a(>SsFX2_!69{UZoIs}5_f<=t*?BG}E9bC4@q$i4kN*Akav!~gX
z$BHW#cRLq7D|5s?&EeTOdrUas0|yRVq{;%#JH^#j;n~-gXY>X_i^yN{qUoaAoNvts
z<YlkkY-$(fSgRlTYs2ZX5pL$2%NoUOI-P9Qv3}nv_2<tWZx*N3mepq7u2olFw8}1v
zVn<qUzI5}UJNm_0v<jQI8^n|;A5r^3v*hMmu&ufGw3EA~x?*;OrvjwKD;#fVsK7Pr
zj?nvAq^t)B4sHQ>qXQH#+B5vmW!4=$Be3-<;eB+psQ1V#$peyuv=6Rd1lR`laSa~=
zkzJ#jAK~fWVV75nk)ReVIrSx{b&EQ&FP$A(#_X*oaH7s@#buO4$rKbGVg-Ca48uHG
zx=6?n;SC`Ij!F}EY}pI*;DI4nE<!`(ks@NCUtZJaDAPB70)M=HE2&$$Z@+G1TW;RG
znYIPYQw(=Fr{em{4ngJ~o~Ea7MOCWgWps3o)@u`wfWEF5X5O7u9~0a1y}A1H{Meg0
zqa$vIx4rmeP+m-GYGrZhy{pB!j;}60EnOMbt@P=9{btqey{+k|nr=0e%{Ah2D9h(o
zMdf)@R5wt6ydvT*djV{8Vs5R=m0SFDT?hA;HjcT;)yFC=-j!~wuS-ZrO3{?`E8!>7
zi)7E_tVEcyxYVCN(&x;3p6~3@WR*unPX4AJA+2n>naxS?P|pb{tG{Wf;9*fURkft_
z<J_?mE@!T|_48X9>dkZ7PRACo%3K73bLPmF8s5WMwt<l8f0u8H`R<=oT*_ZGP~e4&
zJsczie|ZQxN0GTmS@+N;x9CN}CtKXY)VMKHmqOGu5nQ*!t|nr(%W8pTy#{c3J$J9D
zndGzn#P}Ww$n)ey$+EzqwkhwO<JhN39YBcN0ZCy^A4Y$8e&06LZa910L%*TTwlf9x
zWXPna3|^8}#QKsO%x9HdGc<vG(5>+DQ0ihR4bUc?BJl;O6o2S)Wy`&!O+YCV_&#(7
zEH6yZR%YbHSL|;IPY)nyq+y2B=q%sm%=Et9+xT|xu*+ed;@R5=?SBN=e;R)+!?NIA
z<-RqwWA!T&PtLP@H)HAGijI*DA7;BGdz=W-uPm$d$c}2i#=?2=?bJEWyMkB!n3EXo
zloP)xzf#e+QK#n5s^``Au?|S>@@|Dj?9Z!Cb-LxB{f<7VjkukfI;P_fm6+`Ov?o`4
z#)YMQp4jq=+S`rCAOX^`JLZ;($7eFdjAB<?R{nBe_Xh0lZEWV|rjHb64@8`FvGUDH
zNN=A6Oi>>&>9(5p)``8&THEdz0@jX1nS<X@LCol}`J6E$p~-Pr1uD(AL8WpiY1$N|
zv!i!Z!{j9j(js4V4NgwV@7;E>p*z89i(z*uOqv2Tp)PDBOBvv#TC(|87KriWYi|Dz
z*MS|#!~-}0f=bX2v4Z1jmbNKXB+!2}x*Ezn1cu9e<{Npdx>OEvQOkjhk{8Pw0rs{F
zM2(zUzB$*OQl<cdnpOzvDQ@?(uoFgt4=2l01$*JTa?!=GAASv1JYhw|VvvkQ3jL2h
z<}C|uzqXLH6LaR@bD@tE>sK7XJ(}`n(0c2R$*0t#RC?uxS-Q4g_HxIb9|P^*#Cp8h
z8=RB9!#ipP7cF<qZI3xCV;7w8GUd_q*#BVpmZn?k)o(VBUNpxmde)mEPggy;yEwVv
zMds**mATLJmV~8^?b0}WtlP(DKV4I<>aR4jTxFZ39_-rnU9)wOdpuTA<*)X68voqf
zdBwHcQ(5(wsI2FGitd64Pmv7dZER&_bt<5h_sySrX1jaUTH8{?up2Vw^j1|f;*!9F
zV0=i?6?s!&rh(lLMtj{&*}p@5tuD4_InOgQ1@8bn4wT0^*kU~C2>5^i$_gqpGrOS*
z8$|!~XD~e4?2bSC->CNn71#_$qma$YR*K)AbZ-D%2`IDZhh=%1HG~EISyN2yH{l|-
z1Wj9`$^tRKRn=)j-*`(Fu#o;lQa|<7nACvo?49v82}~e6wqg>60*N?A>Gw4{$y3#g
z5iE?_x1t<pDT++YBW{~OI;OXb`KpWmj_g7ax}?QqZZM>`IB8T~4-DL{|J8n2X8|B-
z4IK-UnYfnts<D^F>`M4FQV5g7_|HLVt(6*sj`L^H#bw%R(c<YWpN?~wvo|w-@{`OD
zHjT_*TuaH^x4F@!CpL+TSoS;SRF#XD-R8{sXrGEVUeupE_j=irO}0^^yr<fG9h|52
z*2vZ@ZELXhA(ezhUT30WE6bl{#s-%;)T%vMJ@o12MP_N|^^exphZV-?Pr-~RRC9Pm
z)MxDOM;!|K@$|U;XBJt<pV^odAAsI3PR}@|hpN)XCa23WloM_bA%7E4H(3k-RBz%m
zb&qOo!ubUuJjXNI2WzJEYGaKWMs8<T;kJES`D%ctnDrOw)hc^4V|*5Pgm*CS#<JEb
zBTD~f{$AE9z#VNVQ@_1n;X(<~i*BvoA6#&2<LCY(hK2BczzD_lyV5BN2YsAh5fMi3
z8>*+*nSYxuT|jxXhzeGSO=Yybxx&17_feI%K20T>2+lqpi4)>hUp8>Ck*+{C=Md(*
z@1N-Ysqt@F`xhmThP^FVdL?^E!PJoYwSmi0*n%cNO<)GfX$33J6Vbi%@Hve=m0ej}
zNWsTlNHllNYl4$Lh-V<%4X9($To1pxC}vw^q%MfwX|`&U8pXN*kn9xhCTzR}5@gW_
zC!SroZ5c*VQCNe8E{^rwJsj&<)%Si~ti`zV>C0YEHPQ>dzc?!}Ft9#g?8Bck`?gzq
zhS*vXajo#gkMsIDfW#fYy=ZFaR~9w%d`YQGxwHMg*Zpg9nWzO%d+#!`R7=bGO6`)t
z0V)-xm3sC6I?nN3*-h<c?7H0NS<@3d=6JlUDz^U>@3(UE55vl2$4)z6v}{tI>jl@q
zv(25PXIDm4WLJ&sCUSL+%J{1E&z}vl(2<*RtaOjZwDql}?-D~`YTVNxqRqV)KNz$3
z)qj?D-i5m<DEqc!GvXTSpy_>UoWmbz8;(<r>14m7+3{3%!nN9-aK~%Lt6D_gSi#A5
zv}o4YmJA>515OWBPP2u{pF+X62Lt*a-g(?L9H5q@5WV(IbF(-WrBA1nM#$k55YEUt
zQ+m57pXjNa-Wk!B$!Su!gNxrzh~#3tcMO@mlvPN)WP9wB(DZBts1rv`4q2<0WqYNT
zx9NL&+MRnVp&?qsD1pcHU?f06705Hd<^X%@)6y<z+BWNSpSL*TidHZ$82I$!<;(Ko
zM`*1g7y?oPnr@|gvC#Bl9r=O~#YFAgW+m)&x!Q2nrP)Tk+5mi{y}5SCzTm;t9Lwa&
ztgOFo{+ehLv7qT_&k%j%qL*eXUYZqrHuKNQTBW$sGWqF$*=w$p`}T3^YiVX2`6}g`
zVO^K0-!7Zd^c<!+rP()(a&<AM*r;=3w{)ORO4o-ktj2|AIa$|E^yt>d^UFz(&m;8z
z)AE`W*miGnal<EkVW%7xG&33+vqqjO_dU0!Bjpv8sWeG$do&VVuGn07BnXns)ah47
zH;T%0=_0F3;aR(~eNH?y=<zny=j4lXHcboc(ge5!EJjOqUD*EnaZZ=u+neCa_y+Q(
zzmd$cAB{<<U0j+*3e;_E3WgghEk4r5X!Il=&!yQ?bmvAw7D3moX<!GJzzUwHFeyil
zbbv{Lvp<dJ4Z!_95<p>9)n?<vz2@ZRcGcKB!KrMd)%Lsbmrb)(Q!-b5yZI__NljJ<
zSMLSt%}>mUj$Kd{ywRn5lxuEB!_ZR{XuiS^QW4VWx3l%rw!da&=|9Q3x8}WtT@y{w
zzk$AVi`uFD7YPvS`tr{PJ}TPTYr+}b<bkmtaR=|$BJ{TP5PwtZeUbWos7~%?%(tL9
zVDAvi2nZowSR`Orxt*gb?pjh30ED{N6_{I1$msOouJ_F+n_Pz9ow_+I-OkV`*9;_8
zU|NU(%FqwaPGm7->)zG>qH}~{WzWGGDonskI#Q5<rV4~Q64u@1kY1k3-q0KZxzSqY
zMzU=~+9g;XND4?;u#$j)07~Zb8?1cH!ukwb7Jp-Z2n*Nld@ih-kTXfk+uD1TbLdSf
zvw2!U*0#+wx8I$=w72Gu{SR$qqe9RtYiCzi=}?gu@rBDHUZtlupgBcHOvwUCb0#EY
zPmWo4)$U|ykVyjr1Hpq4B{Z!+6O)wvU$gCDBX2rPTYKWwxzK`$+fO4xon0NOE^oK8
zoaMN_xVF)#pgw!kv#O4&pGn>{{>;zk>YFttox0S?d#hphf}j?yeaDZuG6DlET6q(A
zJTHIGeGP`Psgi|EtTJD=?Bay9s+n@&xU`Hqp%u^uBF2k8M)M8@nq)15`OP{d5iW37
z*aI7)a0iO9xdPow*BO=_lr-Z5GfHS-fxL#5-Zgzbansm{Znon3#ACP@T%NzgUN=X@
zX}DVk9weacsE&=)!%Tdr@MW3Uz=0LsWt?wWf{QG^o%pd*-emP#q}8D(mmu6W)rcf^
zZ}AYQDQR9QP~-FKBeLxUeq7hOyj>fQknmZ9q7xn8Z+3a9>QeaWYuu*D{QKI<ke31$
z(ukY^i~vsrYvIr0uwx6OEW1YSN?Od-;3deWNeUKttJO_?loQuvtRtz&MjLzGK>kIB
z?#Fp}c-&4Xv1)ST$<^>*249MvnbpS@t!NyuRja2;eN@Q`e(5synM|3oEA;qerB2?$
z?4bSW!bOpBXUNUbE@Qj_B6ZCY3r6MK9|XPyd=gF4rv7rhg1Um18n}>|_nTd3re$lp
z`{k5J)eZmeRCueN<vZpiw|a)NVu_23$BmP!6)oz&=WH%?{H|lOxn%v1)RuwEHg~O$
z`T1h7hM&{-JcxL@uFoF=*%i=X7Zvk=D?VTAG#94wM%XC>lN@*dz&Byn6O5oi>u%D(
zs}}uZ|2Zmig;#==+KqY-MoHLdx%{Qq3@B{DD`lgfbmTa&wd>f?`20+{XMlNkxFsxw
ziuvV6i38n1H(>}am^I_S{lWcOeOur<BE{NarDeA5Sj)KL*y6vsbT9q9kz~yuAcaBp
z>#lg0m)Ut+!&jWnDB%wTaMN~OUq<<w)6<X?+zjS^+^l(SN&8E|`TK*N>sS3y8CqUy
zmu&blyDEKF&V-LE<<jszj|U%pTl|-&viK~>Bn$6PWKvvP^-<O|Px`W^h;?&a7%+-o
z{bb+b@N6<M>rv7(GK$y6_?Km`>V2@pyUbiYZH(2r=X32~zX_mx9=BDYSaWlEo_Mr&
ziuh$%S!TOB$82z9{Bw1;j<Kan-+u|8-uISuOT(~vDWALAY#&wM-Ot?hdv@lBjw8bJ
zW_7#MwUM1&nTPr20d-~N$DTx<KX-2Wu~Fm0ob;}%cLfKVMKL^K&Yckpu2~1sk4?Q$
zJjS&({LaX^wtAbAKllFk<kEGx8{I_<ukBbgUM3UtUCQrJxQpBzA8}%vWBwVE3KbH?
z4Nq+bi<`cE`!;T6u*c~)Bb6@(JDcUaC`QQ?b8D>b^wJh?_N>zcl8M?DxplQuhqHP&
z<2+uthELg?xz4Zd&3ZS7O(yH24y@}Q73ZJy(=pa*(~8q&S0ncflP#3DfBkT{5fw!|
zdzh6<r{Fj3c10DOuSf?Wji1t~@RKcdlLNInJXUF-sXyB5W}#-6?zNdtPcn;Edleb_
zy_wW`kL~mwZ>qh2de^LpsI=EmsDHJ2OPRHH*U%-KYwx`=go!x|oQBvr*L}!>klG`c
zOJ20f^_cTeFZkFSh{@Pfj>Y%$)^+i9oz$z1wz6?z3VCq#4dtQwoize`Qu7B)-9ej4
z;Qtak%5z`qlp!fyQy!V>#NvBd`0K~*Upqi?Ie|<aH82c)WU8AY?mK`JAMp2Yu^_}{
zdQOS2_M<Kpj?Z<ct#YzE6~C$Wo^zw@9jdPUTU3*kxw^<F+K*8YRlBri^fes_TpDZo
z4mq@;zg0~WGlPeU&wB1LS13ToLOsv=zD9}KfGk{0b`&}V+xdI6(sgw^``LM+Yt&6A
zeOHH!{I0F9UR^T4Nky^IZj<6I&-w3~jUJ&%*}5*BCaTWo1k(^u3gPK|X1WiLnTP=`
zc;P6ic+s5x3-^~s586L5+QPHEEG)Fuo61<`!H^ml=#^+!Yt`-=N$6Ld)}ndy&NEI8
zedt_n*YM!O#*9NAp5mz*dRF`3mje@fp(5B6x@xq>-m(vmz5VZePBWmuj9=DU)26YN
z{$%PBrA{l{JJ^o-`K4GU{Q2pQc)At&=~u!Ezjy9Z|7Zw=Dcv}gycL2Z_}0&pZ$bmH
zXc}%LdP@D@{SU`R3~J-MTIuJBJ>9BSm~Kd}&TqPMnbMIT9wS@$DX#q?TQCuoNTm6*
zIak^f2DOTxJ373HcUYR&C0Tu0sGePes3m=0<#+9P_3imJxwgYP=QSUbUBSY5Bx%9d
zGgm~QX&`?yfU<>qm2Dqi_&H@vWPJbGOLgXlcAtG_y5XRsbqkLEhOo;cc&SG>W2lfk
zR!CW~TY(E-yRl2r!gBW;?QA(;t8UD;ec5Hz-j1ig>NL}Qm85MNx_oBelwQUSSOk!{
z;{4OCszt5sk3F2~%=|^8))+>KyomTu-=kqLqA*;W6)DY7Uny*1A6+pTJQmUqa&Q=%
ze99tp$D%>5okr^A=<d@#pdR~dj7>>s|1`(E`Jd-mCM>iB4$kr3PKppz4+LOtaC?zU
zkj(?utljSM;@v7g>kInh`s)7*3Qk%!?3L<zTHFq`-%!zrdJ8a@qGE*S_b0u#9-Y{0
zQNcsSfbuseoYqvl96n4}*XTmVh95D#q7dID+UwTGcH+v|4D<HiV)ypE#-MUzMR;1p
z%Y4)Q9!=VNwd4>LRIf)pG24HVQcOQ?DI;nTW_}eJ*0%E*mLTZHY5J50>l_Fva8z!a
zm-jqNulg4LXMd3B9!#KYOVRXEf0A{uJaJn6`P-_8DxzV3Sq>`U!_TkfRlS@|apB=@
zDZC&WqyOwukG^w2W%Mt6n$Z-Wjf)G2FHmV@f7)O{VpAiZcTJi$-Fvz2V0z8G`SP+9
z8+rNLT(llAv|mau&-Kk0&KpO8YICtu8*5u+GDo!LZ<Fw6JW1ykvw1z?GvHw9RX;jA
zJ?L)T=-I+%ld_vnPfi+Av+fqVXxC~kn>@L>s4aM)01~Tz&ed6WDRad)gVsi)ZHGDg
zsSVtl(Wz8xf>L$w`bVZLevY%KD(=wQ=r9+qDr`YWO+4a)ncm}%bpk!3*aX)88r$#Q
z6tEw*VTK+b=dI~&-Cb3wyQ*?|#iKEem%0W2YucN_<XEch<yUG&FG$fwqsLvE?N_?b
zVc6+Mrba$dqgoDKIjplsZV#i4Y5tSWz}g=)$vD*Bb6;VCopo<k<sp{Kwg>G6f{I_!
zoC~~GdE8%ZngP^NN0BQ6K6EVtgeox}s(EjTNse}SU~35w4omXwzPSH9-D;>C%^-{)
zKznz*_e_F&&@7;?xVLA~_}>`O%&%H0?R>{`isd-$N~*Cwp;|PDvY!%*YiJvuO@{cv
ze&gTQ$9C9UTffNU7F+J}>9fKT(yey*i(df9(jKCD;k3=)+NuQqGqU!ue~rOW>4SRu
zPfIlASNe|EDJbmOOw;Bx;4fvMmw#eomHp$DsvAen@|@VK^Wnl*%XqH+hf;cP^#~nu
zJav0#+mmBFI>Wl7_nPf7VWYL0qH^NX6MQ{hM78_emJjFJe*5|rrHN9to|jPr1S4@4
z)2!cWlkg{`Hd}AxQ+KWEuh|s_Yr`~$6SgPz8naqO)8^cgcd1>=dHmmu<<;`DN(hEK
zlNL9Z-|ZbD3c#Ab9DJ^a{##m7Qf)njFTs3N69r_H-J)#P><Q{WJF@dZYc)9l(ZfCW
z?SC>R<y`-fI)@~(LMQEi@-8uD^7)za^DHjR?yuR#w{J?<yX8BYo|7uA8O^5dag~j6
z8oI|<&GvE+-1_6`b%OPuM{51n3V>>dN#HmD<MpFYeV)|ESnYSr@AS{CIUJx+YFu$V
zMr?5nH3Rnb8_O5%<{^(c)r$3s0A<0pszxTweuHK_4b|R<C&z4FP3)*C^jBT`n3|}g
zt)F78rK*YXn#bD8S^W<De&`8i)^Eky2jXblnd#H*))ck-{cFRr?Q1p}Da9;DA|R_s
zM3w^jkH|{-m-ejgs%s1`mL!hUyeF89ns=vkC)>`-iZn#y=!Hg)&do~gq|Ml>1y|QI
zx4&*m*MFQSi6*I)>^<)7PpzkG>FJMpar>rIJ$*A*{JnBz*|qO}uLsGqJGuC`p2R&*
znP0y(ZRYse@h$jB`x0Qd5{j!$)-AO~>IR{wz(QX>w#a)XpMRE>y7u_&{%@XN(mANQ
zHkdm?6}qJ?zyOWJ&oL{;rfB{C8CJjIjlF?kwn(=^Zy*ED0`>Lk-Ma^d9QRJ_W#4Vj
zTDL)(HfzEHda8M!NpIIo(|Bjv*a$gros{i|M_Yt;OE<tSFjjmEm@ZQ$ZVCm=uuUs}
z)3;?opLfZlJ$Y~L(uP~omvya9c(v^HNV_193u+RKi54ApdE_bU7$Upuv0qjmBK`)R
zH9>=bfOa2xf#n%N)<AHH-(082VV!23KaIN|5Z=dAx!;)06G-@~I%+SpoqCzsvRgtX
zFlcz`3X@=Jz3AMf@K0ubiuFXL2d%?$-~7g8n{JWM6~(#jx=Ov5Nus;I7Ma0Qx%cIi
zt|z&im2q_z!pz)sskn>2TZUheEQ1NiNEaVRXc@pnlvPz_y;~9Ax}*oVaCvs93`Gz6
z`_s*X>FF@oLODEpH$30olczp>)jxo0GHys4qgzw^w%cZXthKuB<X&wQWP#NGN?uf#
z08d4{is)iLeQm5#;1lYj&p)aD>JE6Nt(>T$!WA1aB{*{}0G+Si#N9l*HTgZs;Ekyi
zzPuT{x!Lvv1&9y~cisiXuUr@+tBnfS9;AR65AKnP$WDWAtp8%IW;@ZfWv?gI>qldK
zAfrgM%>u$XR_2ak0$oWuSP)PlV-aAZ3~461YVGpN!{uNV^^<tGS}eIOJU-7sk2v1M
z#P5pstlkG_q?t%;JnYYv>lvH80%bFz_cdU-vSs)Ws-6?&kc?nP@K0G{0PHG@o{i3Y
za6af}y+f(%syVYgRnsz-4O_GY1)F>BzX>BcUmogF{I7oW|7!sZ?1EBl6EXd%W(MiA
z+d^V24FQZeD3KMqBMXO9nJbb2@>$#Ndwh)d^uG5D>T<Uu6>k-}`x@I;F>2igKqv||
zdS;gKdV?oJ{_j6>#9IWa%xUMoZ|?!Z^8(vQx^S?AYNS0sf~Fc>53OM8YkS%|+Q=El
zzWxf-c3NLty&g?7y-Z_RWYEhkf|jv7n+JIx4hv>$D=&9#=D$=u_b8lNw{2To_O{Mf
z;@$-boiXj(_%@~xu^1JZmcO4@>Zz=#n=<TpYR_0bJ2;>9c+rSFQV1#TiZt{ff3Vw*
ztPB+H5{6myKUj6|gdQd`w)CxA?DERAv+WTgfypQB@LH2^ZkGlplHGSw??)MFcw67m
zR;IE=gwIq@_1IsZ#v;B7%i3ng!NsE+0eWfBUIEDwuQVTOa0SdrRLipT97lw!;Q=(<
zskX%y1E8T6hpA1PG-*9U8PzD;KfBGu!~8Cl+()~9loNsJuoz>n=y`Hqw3QnXs3Yo6
zYdd{e^2<&{Yp8leuFNEeIErE_r67}<mxu4#xf7+I4?1Dd)NrM1oIeFzUl7tWHiOvo
zfmY_<p#*_mU%#u%qkLdRO$XeCHUaP^E48{a6gSoc+}s3!5Q<lDc^SV)l0SXHf}Ol1
zQPRpX6c`4f`5`2wo<ai4I)yPSZ+l2qi@HmWgYxQKZ*9vJH_ID)jAnIM=hQ8Avg}Cm
zKh~R8F!dHI-I(e5CnD|D#GkptULc2oI|{>T?8Mm38hQbKZkzt@(VBJ{J*ooy6MO+c
z0#y>b*^A6>e`_z&WYi}4BEDUr&+ig>`sQDocx2~imSom{YRsEKYTiNA*63%USBnqV
zpLqn)mRv0U9iSMpzn2mb^lb*T8k!Tw%zydX{Kz{H<zh)7(z1!pHNUH{3)6wPY%YHM
z6QCsNg|=<(6?j%B<D|zmX71dXRmVG9`G4XRbc3MB5+>Sp=umt4x#{l#k<lnG6eCiR
znMr8Xu<ll%eBtuhg+n@fA921n*K&w*Vyb$@0C-WBP_Nw>khEB~YNKP@D;tDyWJFSY
z2Vs8$#&tyUXR)fwsWh3j5pj5{Wn<ydAle2;tXLN|PfNtKQvUJ;VCf8DpKvsv$7;zS
zQ~!q40E7Y#IAyMY)#AAZDS#wD%(cqmJH63;)ON!8ghl|Oqu-Lby7~<lhaOLrCAq+c
z=qm~%>Z|7*%YOM%j_v-5Vu)y6t>4M-#pL&Zu^{gaSfOAD`!9etmU=}P43BiJ>?#X!
zDFm$<{C|J}Is@&Y$EOOeyrewQRL^djQm4u88n%Kd+GqpY@lp>y9M*q;a@PU6xIXMe
zvRcH*nas?vD#BBG@=;ETEW8qbMv*#f`o12QyzWDWI8N`cyZniSj@{3#ZL{0vDktu~
zJXO<JGv#&ldfq?JIMsGL5fXt*b;hZm?Zj11tp1=NnaNjyM@&TS@irndEYlekruv8Y
z%9PBB-za#cp&DKLV#&qe;6@6B^`f}CSMqlid9-JyzZh`3Ig&7DbSq=a2FY-otm*)1
zoiTT=H=_efSleR}ViFnZuF(?oWMFEmvO)DR8=Epma9m;_VjgiJTh}t_@gIqP^aek?
zN_I2ddv(DlxKIR8iOAoCTLs{d^g#DMX5~<op;MHq+tvQ?lmlkG=l?@w(Ec$gSM*7s
zj5SKXib1*Hq?o#0WS1v`Qu(D=8VkJ##2`dAeGtzw#&DRP-pc`Vj+d{w^ScS0Gc2Y2
zy-vHkrY<KF<CPjWB;(5da}KQN9HR|0j$8N68RL-Bb6I%yQzjo6Pidfl%Wr{>%}_OE
ze%rVyQ(Axla}!jdq>ddsc0AqzjGVVf(<llRq+_%gBXZ}4W(C5%V1BlJcK^R8PkyvT
zZw1p80!fjL&_6@Mi58WJbo;g|)g3y6G&vLu77#kZkSJk2pZ&(w`gaYb*<qImd+Bkk
z9v=9Mp#ZrcKXIlih<SI?>1G8oR2~Pc#RMy>)5-dm^B#RFen2ATBQSI#tG6juPz%1R
zsatV7jl8`2vvtj5Cc>zHj8LYaiTJ>T<<n7z#`QwRiSUdr9@t3~&?b;HatF{joMncC
zC?nHTGIt;^4@KeA3zdJO%(bdM;4)G+c(*$h*K@vB2dW0x&y7n+bpNIg`|}q6N5Io)
z10xqQa3j<b_lzJa=%<t-@~PObhF5`zF1T%)^u8d`Oo{HEFttZ`A6t`I?K3By&s<TB
zAonaSID~|NA*g5E^R6tcWp7z0dG_3ns!eq_kB^y=iM)WxqqBev$jJn%A=w+`zD)l|
zDy_h!$L2p@)^*dGzMlS#WR8I0i~=q^(7jmDDj0$#lbdM2d2e5^-W!5p)0n+Wwbf)&
zt2xEaj)}eQ$|A=icBSu0&an@hCv#d_3vMVGol%vmI3V=aHcGap68X=hFJn)67<NA5
zpOdpUppWc2)F+6g=jEx8Nsp)ApA(aWeluxjdp>JS#+?Y#G7gHgso)k#EOwA*B@;&C
zYn0V&XWBa%<q-@Yg?`I|Z*B?rUyu0CgV;LxJW~sW7K1UOO77xtEB&3@DE*Gbi1>xI
zatd_6R=9?{F6-v;(SB6&eFME41X=>NAPsjV5pK7!G@vD-499x6YfLo2%!zQa2-81r
zi_m0|?VkGe{T(!cM`aFEy+<uJHlwG_?HVvzqZJi{aElgf>7(9`&l*#EdnJ{2&gz>g
zh$z>7{mvL|Q~vF-4(m6)Nb{G;8I01i-+kZ4r;s@TzJV{5n8;vr%RH&a&P*5I>14J*
zc0n5HSRv|K)=M-PG&^cRh{!C^(l%IBHTnns5GXT~Lf*xW9cK6TeUi2^u0JAp-o0B{
zWWRW~OPSD8W3&%NhgDvz^9yW~8Jd?EEW>QnhuhoOLpl+aewQG#zm}^{x{yJcvEBOE
zj170aR3||C8B^X<1Tg>K8e7p>o4df@F%Tlmm1%F;b`Ict>uFJ;Z~vj@OWsXjL`tMm
z)Tgp7{r_k>@3@}#{{NRKv`EM*B^Ad?W*R6JhwM!n6b{KqL@H5sMja9=nTL?QQYj@X
zlwBzqQK@LC#P9xeeSg>O{Bf@9qCTJZ`!$}=$9i%o_R5jOA813#b_L)QIX@a3>O@Lr
zJA8St@GW$8*G)`$8t-z<I7o$}0)aWD#+l4FRQGi0^s}7@4{nD%-07tA(y36@Tu(um
z!i2eJdw|5@bim*wUQZ_{T^>a6`xXIo=ps$y^UImD?8Cibnq~|{k%E&{F~Q_uIg!yL
z_W+?G!a5zRH{#)E9+yHA<XYz@1HB>K)R>v;%O;jR6W1ng$TQq@$jZgJdwaBIR&iDA
zvVGI-V~;9n&b%{VR{1ImiZz5lPDI!!h@nRuTMP@-<Q^tkrMiRr!b>wGSEjyXAl1p)
zOOY-*e3PLj+V7am{E|WC181G#^1~QXWu3EVKp}A-$#5S#sI_{nm^*eyyniGWzm(Hb
zScyMt;D!+_D1Zq*A*t&2n7G#=w<Er@kzH*3i^W6C&C7F^XQF9%C##G2iE|9J)=>-Y
zV5Q^<nIRj3PU+yG(^(5Itzs5s1&Pf=Sqsn_$@~9s#pKDT5Q|ujY3CiO2a-AN+X;oG
z3zARpK}uF+Q>fYR1dP?FtFJHQ2c(jsp#TyM`z?+%>xR!`59a0JemHYRL)=qDs%MSw
zEjbW!xd{%%9s9i4yn1<O^wb_6$Fu5-%4$EO&-|}H`EY;Be!o^w>+)s>@=8hQU>!R8
zPh}s8vPUuxC0melgts<pNNL1Fch5+tvo${+N0t=tdE~aui5<)FNFbePcSxWOWJr~W
zp*jRt<LrKU!9&y4)m1tLVnLhmbDUHMEZ8@l8s(*?S@Hel7#ZH(^>8_67hAEQcP+9A
zDJ%(N&wLYa=?SU_QTu*bZFJ?R&tJdV!GTr!HjL2|IUP#K%FuLGZa=^VuMhWR5ExhL
zSKBSqdIZnvWlxLAy@B59({s}+p1E7ZPX=Z!%6(hg)G<+==P;;(tUm_Tg}!b0>Z9%c
zI;KyS-=b;lC>Z{S>Gpbg8Z8IW5WFMN!^^?5?Aig<0Ysqi*D_1>balTzf7@ut9POD2
zWzF!1t}4Cs!i!j8t0n0gns+r6_n|Gv(edt9{nsES?b*M6w}Y}no~Za_P&pBH<I*?c
z>VK{vRbX`|CpAa@DDnz%6u7#++lkZX)goy`QC5vJ>6o2(gJ&ec$l`wVs(8m}j)gP}
z^iO9DLK-aV>+8EP<(`$0iYd4*&Wm=7mBh=tvf{Sf@7!iw{q48AzijMS@_nGT(hB?P
zs2IDM^FGe+m($&B@qi9B?dRzPcldX*p;m#1b$vV8WT=`TXx?f1{|LID?mi$g9&Crl
zO|eSP(3%$0X~6|IXUsMgj6G22DNE+WfAv&f*U|}e*d&KN;?3ayX);3~+1rCG0ItF;
zr?5vVUjK78hC@fDmdXIrKdHt!1M98NAgK|7E(LgbT8AahU^#5XbTlay$E_RW&Y*(%
z6ZhGxd?v-dc+HUgP&CPG8hIPYXGJn5Pi)#bWH9yp+HaTOIpI=4ytxkwL<WZUQX;8%
zj3VLW9!P~`{bTlKV3SM$Thq`R{8d^)Rveu%J?Z1J-|eSn`g0S)!)_&`m7N=N?%d;f
z`@kp4x$<M#lM-LB?%SRyqP286JhAvmP);x{CYfr5CyyVxO*CBYue?UH&NjX5zmA)z
zf|SiiAHFZgDIf{Yl$zHIDtzb!5k{o}Lcn@Mo0e~#^HU=Asq9R8_PirLbFQRlTCz}!
zZ%t+2qB(TA7uF3CwJo(0dD6`|MM)HJX3~p-f>l(ecgOO^m-{Y1_emT5ieyuxurjgj
z<odEqotFLX!-sNjliL(Za)?Ykm!g*(!)s+DP@*J3DgwqOPj0d1cHFuFWLdPSUpJht
z=Dj<Rj`#~WJo$XOJr|WP*!TIq(>FK|vFCHmWPK`q858<)u(fDpiGd_<F|tnU%NgQ8
zgarmv9q6}BY+rcLf_$~s9l0<g4z^1ec!8b}RtvX7pu*%i+);GoG_gauHMvi@YMLCK
z@aj|BOgV<|Khg7ZmJRfCeKWt`VG3!AQ7R%f)UA_+Y6Ge(eCzKIRySMoop4h5C}1Qo
z?5u5hJ5-uEJ1_EZ<eO7DZ6-N`-Upaq%Z-bg1L@f}40k!JpZ@Y5Kyw)Y2~{&k6q3Kb
z(GAiM6Zjy*L}a~zT_S1!iL<6$N=;D*a`?AZ;8<VYdY>4JPflW&UL|{#dmVIf18NXm
zLF~?%N0h0{*@CA;+SOV3!tpQic!5_SCCJn{b%X<b1ELqOU%UX4-6tsl1S2?E4~w8S
zN7$-|N@Ud0Wz*D>*MJxO{5UW%kje~^lCoc~6M=no)N<-A6KX=&ru2MpQE$e-;0C6q
zEt9H#`?IV>-Y(D>=5v|ADv;no-E{6LHW?q9%brgA<kYd|@#&YTf9ZK4>noVf@s1oW
z{hC`D%7*~{<n8lpXvIUoQzU=@gT{N$4!D?bD9duCc7@lNc4KAFb;P9Sl@FpDWV=De
z%J>4wT^2hMVC>Wi9gc0f5;W9OQpgjQy-)Ptu<qgOGe5bB%Mi;G3NvvdazY9)(Oha3
zERYo-z$wmsBvfVQ70(e!0e$=x#G2X#xLl-Uf)s-|$-v_uDSE$>I^Cya0D{f=zIX3l
zpR6b`+0y|LWV8~ni6i6c&A2D?|F~;JZ7$gb@WM2pe0oknkrx409gd4rC)7xwL*Ryw
zQ-%N+)%+CRO=Fx#=2(E6DX7p$X%bJ%k2i0B1y6NaeRS%O7jZK~=%v=X=;W-cFa1Gi
z1QJ9A$PB#Q*wjQe$_xGjII%ZKdTNttS<?|t3GLmh;vr{88x{9gi8KCfRFd)_xufSw
z8bZ)&xgphS2ilsd1tC}x*9?Pb_6=HU@by-`XV}00iZES#CxT<K20C-+QV#ATgNgqD
zie)tDU7bE4a=TSA49t2VE^@YDEpi3#Lg_1S1k4j{&@<yQhc;X9{FRcRMqb7pu=8`s
z;1o*<Z<JKtpGEf8w<SSDSev8fk{f`QaCV~0KXXK02v|FD-)DUaWCC~ZAU@<T3->GH
zkc8|zuBOz{QybhayvoPIMgb{V9USzEX*)U<SM^Sy%Y%S`@gB}GEmS*T*#>GpL4nwG
zG7}0X&&`!{N-V72KgT`edFCPf0)8w>#8e%S2^z&^md{Zj4YYLfN=nzpnSrSR%+g@W
z>>a7L+q8N9*MrI_H|QaYE6aeiB{Tw`3Re`722wCf>IgTc#NXhbs~&YjA)koKR0`3}
z9DLk7z)sTr`Q$8B;U`|*Q_0!IS@@C>K}@`qcnKDo*Z0`%ABT$QP&zkBY4m<C)PvyI
zvL9goJGV$$cu72`)GI%jWhtr|JPT|92`OOuNHPrh&AK~ddhK=NG*$@C{5kWu<H6g=
z6A-b5j5r$WM>TN3X5q!jU(Zc0PEnj0Pho1O@Z!_iLu5qxB!G#_2o^p~G<F|q%!vDa
zF3@d)Nuh3~?w5F$o3B?^uL*^|;;XaTgXxb4`F%yaH{oO|;E_yx_#K*1wqwVCGv2Oq
zH=3BWrnljdaam1I8dfSaifPhV_41n5R}a596>YG^dheF0|90-&X6Vo(-4dD_K0UHw
z#Z%R;37Ml`&-JbSl|Q8Zk)he1;ReILpZ0zpb|KTM{^q9xv-3Xn1UdV<+iaKP@f7r+
zkQGwVvT>zv`u)o<Xs|{*HD$KQW}x7{OB8hPJ;}&uCPNZ<(I~RH2mT1IA|rhUe?s6V
z3y+Hd^+xdlyk2gI%<+mN4_pP_MIZHK-Yj@PuN5m&31eh(auaY2fvhEetlieu&6{9l
zad^Yz?ls-N_CgY7IQp-7b5Bm{UbSN?M1qXqAk+J_S?}mgzsaO@CVu7zlkPh2$?!tH
zsk<j4VOJ=@y5iO2dui3)WboaQG1?LQ1Q&PrVDIARpC4Q~VY>U?#yUbphIS8shig~@
zP&QY@S6@?k`p>teyCRSr)@42FzrkJwAyH6}5&@y6j9!4al*&QUM!>;jdt>(mqDXo2
zq$SC|Y@gd4ysU9$QXJF$n%S&U<DBN$zv}<nO2ETKi*zI2>#7OKmV3hoT-9aXye9lv
ztA`UOcW*5op6Sz@Q8mAdx7zS4WaYkB`hAx-wb2>AglyzZ9AP|HF`M#3#}0_VlddxP
zhbp)@DIp1sn}^pQ1-v5%ELG_k;wc38qGnNETYjWzc3yC28E4iiHoIL(wDt%wgQ3>O
z2U1W}BsxqPazy+fLF-*yl!}Xs#{=)POy7ZQA?uThg7`qPE~6C_dJ~T&uadt^ix#!1
z<06>HAD}E-xe3Uv(o`Ddt}*>|wL0}P#|ozh)oYgm)0s17-1PvsM$OO7gtpFM5Le9K
z<QCe*K4CKoq(%vXTTj*o`9O?;RVRUt{Wv`KiJDHq=-4MMTDNXOB6F|$6XUv5(vZ%U
zOthwiIf$!W%%+rLD%(tMu50*{Q$h+6k4%bJ5x?SUAl@^%z-xu_Ivj%7mfc9zk#{d&
zmruVI;iJ^eWK15~)F--OGg1^m)I~dQZf@>O=oiT@ZqNo^D^~`Sbf_`XapMhXu|@4E
zsRx`x8+7u(O-STc+1A4M>CL+eo&Wml)Zd=hleaxO6yb0Gb9S1=y>lfaJA3iQ#q}WA
z8Pp1E8%9kciX7TOPXhHyLYI{7JPM-T#@xO#foCJY1yHG^So(aO6|~Fk#JS6;`x6%S
z4oNXB+W649kE9j_eV@^N?pNf~N|$CVzL;6)HHciz4FoY^yK{4k9oBbG##~p<G;>>{
zjgOqpa9{bpIXl~tqnKcDk3^)`T*G&3$;VkZF2eqJRzY#`Rw&$@>)xuAd8nGxsdOB2
zvv1;J@z!6lCg_n&&jn#g$ZLQ#2M-%);=`TCFo}H31cb9Q9l|%XwK@jz#D?k)0U<TD
zls#YftUWO+f>IL`lh?9k+&xF~eEDkP&jmqaQsBq<5h~GlCn}7cbSO_>t>gP;gI_UZ
zBl#^Af#mdIQzMO-Wi*}vJ>rJ@@PTmB*5uuJEngnQF)*m3#?@K9@P;@Z?5Lpv^>aDd
z>by1i*a#~etUTPJ?`rhgtIC|i^#6hmtF@_JYsJFkEa8C9ut<LJK#iGWOd)O;79$gv
z+41G%bq4bAuz#B$`sLq-Pb9XsoE)|IX`@$M&GuvF;|B1r9WgsiIRa%|q-g#&ZAt=P
zsI%R<`Hrg;rqb4uCnu>B;^0g;<IyxX$iVDU%1=!fHvDbpBsYDtd3|3+8<=tUN}N@X
zSy#{9x7u!}1VIgZC?O(<Z^^tHPBF`hR3adaYjOV21ijg_W}SmH<Fu;$GKiV!yZt?y
z`h{?xKrzH)GoI`2zTWY87iKsxmQI`M!C|<@#8lAC;Q-QdssRo23S((bLNq!_vc5o?
z<74gAb)F<S_t70W^4lOUj&<FhVZnXm9%Gdn+s<Qe>v!vRSPWeodLA3FrOzj?LAEDt
zHh#E#<w^+Yu@tFHfrMg1{N4opBHuzl4E#;(8fcf>v@(MGxW+p;><L7hn0-p&Xwrve
zE{hj$f8n%7oibf;d`jk}^nBo4RYMfJ?s6!X`uS__4H^RpyC4T=<cR@(+&An|$={Q|
zE-<T5aD*lS@K%u2q@%tu50H@llE~MfWM^jDws&%JO1L$EPez?_*rBStJmA{3)ltnh
zQc;jo27tA@MJgU1%L!Wq;Yj@bYX}D!b*%)|$@FgyP+vLBn9QooV@D2h`{`3HK~gBM
z5rE6hDDzJ&2Q(!Nd?*Aol`lYkPEJmKe*WXLehsmAkDJN$_tgs?jvPY<IJH~6#L@AQ
z+nAP*XF{qBf3JRQ^{1#n<G}3Vu`3>Z_;K}dUfvE)h%dRv3C`jh;VVTXs8zNU&<x~j
zqP@NYpJ>UFP`Ec4ZNptW=)dIM$Ns+-fXY>7NrF!_Z-RB&aha(V)@<dn1YSGq1w&6x
zA<A4GS;7=fm0vyjM(7OmJn+Q)>MYODZ+#YiQg7m%?Ztv}dw+MAaxpRLO6BJoFcjOR
zD>Q%Y7l|B+6^cXGVeq{UPg|Hwms12|3(B1h55h(roqeymb=^%x_0{)af;g(%o&NjH
z{fYmJ5+kJ63n9hp;9K{MiS6IImdI+V<xRVD-oAYsy*{$<&PW=aq7`*+#h-NddVFlL
zjmKZqwbhzBy{a4EJ$&YI#*VM6agUv#JI=4uJQ+K)jd+H~pI4lBkNM+SAn&@XtyWZ~
z-nhQ#FZ8=~Q6f9I-;5mw<`*ZHj*AQmN(s1h){GuOE@uc~czq5Tp>#2KA`3&a$k5*a
zvLJ=J>ex{Y!ID*GtV(lkKW<=J1y*8tdHIj5M|3{oK4-WA2LmbAdfc1RuCs4I?>MPS
zN=o{Wk}1)alsUm9X&wL<tIohn2D%L0wJ|(=a+LaBz7%6|#R9~L+Q!Pt)m6)WulPBA
zL3o|24bI>`8}!wtsO^gQt;d)wyYG_+4;kWyLVyqhsVS*mxlOji|AG#_+jYB%w+aOZ
z=u_oC%T_fHdvS8KQ4kTEjD@M{-nX0TK+v{`(aQ7{q*OC$sd6<I{`*hz@-*V7so`^D
zcw(<S`|N8p`J9}b<6M#&W@Z<h)K6$n&%)@EA5(RONpqpvb13nDVD#CwXO9vtA!?SJ
z6=@yllg>N+-H)c5{PC+dZpaL29#;~!4N|#+Y3FXgc+r;O^Om=w(8V1u4f~d0ZLA$?
z5cTN)lC(I1DM1Mf=BU{2_?nZzzG;Adr5^8$J9|Y_hb`xh+#U7h)>c~sE~sxa34s5{
zStG<I&xaUXCc)Ws52qUlxc@<>PjRoXG$}3mMr`X*O$t5L3XwM42<WXc?0WuU{?zc8
z&b8;1gN%2PeknjM?h=}5t+V~E&DtRcO-<R`5GHAH-V;-lRXThXB$YIX92QU@5*p8Q
z%%@N>4C}o2{vC2~iLrVeTN9X4Ak8D7w1IF8$V7wkV!_n~AP4+f`sq<8zQFMIk^A<^
zX+<f$ikVS;h?S1@8;!C+Mkvin@C{#8Fk#esYIV{9uTL#oj&MoL_(T<;tZS*dkhd+<
zqk+6u6>Myq|MBB7pKS{lEJ&tLA~Z{ZS0h8<h^FiBFn48eL_|9&{e=$`2!OT9T@xh2
zW>UzNn^_SI1;j4sjhxNs%bHm~D)3@*`KA|h{5xS2nVoiz9VbbJYzOY_RGJ)T@bwxC
zP_G}H-P`Vc!g9I+EPH(42_%#PQ@o^rLb?Grn_&=s|APS$EJ9?Ndgcszng=&3y`kap
z{g_+HcLN+786g4^Y_vh1F{j3K{yBS@`_uUsjqOH0jne6H=n+*YzD3ikQ4`b<?E-6Y
zGs-kB-|(ejQwxdDRU>kr5L1zQb*|YvVJ!<Ih;E4JA0D#Z8{@tA_3`wswWO_OFZ$L1
z7Bf?;`fl`|g~b#~GNOf6oeKhOK7`TGKUTI_hBtiDv59u*X%k&H3?w<b^N1UX*;7tW
z&|iUz5ZMX~f~wl3vi-@9k=ZrWpHGz+@17F-#A1Tg8o%@o*HvEU9j_buq2|K5a}pUh
z{;-d|dv(ZK)Hio)AjY5OeSH*H+PZd-sjYi^<tnScw3~N&I=0@&=Y5QR-zP1S9jdxm
z>bE?(?&=l8BQfVE&AIbo8T|9m=U+G$IQE)^q$ma@7o8v9GU?(6hO}*?SkAfOqe%l{
zTQfZFI4hsFhjhpNie38#h7^uK6(h5AP8Cv7F<nT;lXbxc#DA2&KI__M>n;LWB$G;X
z2;I9kDz{Ei;(%Zd)lJoIK{1qq!muOr+HmPoRlwz?=Ysm)O$5dhqa=XY?)c#5g=3Fr
zWqcpiZhUQBohul&2qO7S{qvWbvTEgYK#$gE;J}d3(9D2CXRX;jqMYD^^Z#`*F)3O)
z@TC1?4YMKdBL;H_#$2Hav>{-STZ8kVZnC|pw)^G1O`A4lUbzK1Fd||C3&HLuhYW39
zSN%pb5uv7TX)A{8Yl|ZvoQpRuqs<X!E*Te5Ya+cHQ=bB<5twOb(6_G^0LQ>3dA5-K
z{};!~-s`m9u}7O$#h1aEP(!%BA|k&TdF3Ld?$f6Q<1>8q9!ZEIv_H_-9-}eqw{FFl
z(Og7SEChCHAPoDRJ52%u1Lfdmv<#wcU5iyi=jJl{;?fLt#wq@&tCQ3-U0q$f@Z;<*
zG{u6UuqVX7$5G4<_3-eJKgKU*{-CeQe9kD&R0%0$SIBlG1shRQrS9|yf)L+@Pj$jI
zm_#%kk&8=VP0{4ZQDGPU2ndy^hr5p+*}?QQ^JmttIu!~RHVM$jRb8U}&c>ZSjd<gq
zKHA@6?}bR&2+SoZJv5SsQ6BT0_^x~m{O2}^5D5IuXpoaf9w&(<!#C~QzyBScxh~Cj
zkY2EjnFS8xA2*xiIpyBg^Je2muAX!mX)r-MX#ZP;hq)+RiaLKjTbT}>LyDzJ4!?qm
ziVDgCJTdHYx@4dt4B}Lo{IxxGpkSblXzqEGT(?wSwR11LqN&29Lax6$uP-1Z(;EdX
z_c8Sv!92<vhvzXp$>br|fAoZt4!0ho3fR8teq$At4t|bOGLgK-(TWjAGT3k4kXcgW
z+b|@Q+x75NTP_ZH!siyf>Z@gba`{Ytx!_wrft&C@8w9H|<$(`CA;J4%*ASK8SAbZQ
zNA^psr&4gpF-km?aNd}r_uj`kpUQKk{<fr!e+Q-i`T3VAaZ^yLbxO^*;Q)$#l63oa
zBRVY}ICBq1Fk(!`AdiV0fW+<eRs%EmY&N+7>F3TRF%p0;6XIlPGpkoy0SwB4SI~v+
z%6Ud|y>jb2K<Yo}>Lo+6y!L0W<sBBP)dRQ)CV8;FJGoFKQbX49z!UQBhOLz;#$hvp
z-!F%AgQXZ5vMD>5ei9&IDN*TCNB3ysqp!+R4uByS6jan_^ah-U-@bo`M;UuBV{Tvi
zn>}yC;-5GNUH16+w*P~Yw@rdJJ~;AZzVfGvuqWi3TSBa}Pt{Q1dN&Ik6PWBQ7l7m_
zaC1loE4a*_76-V^NgoMtFhp?Cx4m+>c_L*MP$sAO3iDj&B7ZK}i4(It>!-6b^w#}Q
zL_4v5oJn@Y+{1a9kD_v82I^<l9S%TFi=AKp?VC4m;Jtr&hQ_4HO$kE!bP5BLyasjD
zs=H1k%FMBLZl7Wt=KSYr^N-e+)BU7y!eL#vbm6+kvpbwvt?C_@5I*)(VK=q87i05=
z8|^8eLrJ~6<24HdS|(;vcR?Wu^VsFF*Z0i!#o0M`r@cB9)^3ye8eb=y55R4&)OH0l
z4D0;muYu#bb@ydfcOVxc<N^tTtbG1KJJn*WYw6fgPOlrEn$*2XRA$IHlahBm9a~<3
zgGOvpJ8@#hIJZcBt>77v{C%yM|Bf7y**DW5dVzk;oENv+Y%87{W~2tA9Nc|sHIxLP
z%!Y#pJ5d)(kWP!H;~=~^1BL%V&O-*?AB1+q);8_DJ%9X#oBJKZJ4E(@tgi{ItwxA+
zs0`3d&dbxq=MM%*+Zcjjl%|vJKvch!-sno2AlUN9>X@qgIis#QEC`Azjv3x?k?k~{
z(`Z!X$I6G3GUToQ?}kFp38B%Y5rb{rHRzs;dLnNs`1nzoW_ihyGUKQR9k(5dNC*Uu
z<&*&G88y7uINjbN@1g&aEE!pac5yD;%5oWzxv5MCFRUoV)h0!KvnGucC6Ne#Ta+uS
zFud*DvnL1y4<*{}_whic`dgjQ-UtGYFH?fXKyx!<hxeSu^OB4uy7v|+a2RZ0zgMM_
z=>QUj$<UW_q&~MnT{$At=;G*4Mz`??serYgc4=X2nY%*2AQPKJe}jC)TYDGF44tv5
z%(6?!Na^2RT_*8Kj3IJIULy^jZ89%jk~=<ql7}aOWyCsv9^TnvRk;Er|8PDXHM8i2
zDWaXmJeG?d*hq9~xD^HQ`=7W*CWO`KurnmUowOKnD_0CSoDXOS`d<L<mImA7-Q!l|
z{{2Go5ZEN+XV`jW!`-X*t_=t|)SJ68RR;9&qB?<8nZMV;Il(5BvXYKpt|q=Pu+mG$
zeD2<}XXGnF45XlZUn8Pcri>-3M~v(_X-KQE*O3F7jbxVk(UtAur{xqbXXZ#pjgUnv
z%nz<AYqoLtaJz-A_E6ja*eFImG;-R1u-Rxxc2*urD~eFgONqo{#}lzn58F{>kK_V_
zbuik&?Xk%Pllv3%{ipokeh}6`KS4MhsXXoCwu6h1oS}j20B*O{w+le6bg+i0*>F%m
z47tUDdCasddKu4P7s;F>;X;mAA)G0FcCMX>j8Lrg^w>K@9L}7P_i;~+(r91gz$bpB
z73a0w$mp>)8!vz$l~bFc2d>CJZQCtsir6KE;#tCq*%bZ_#dCy~(Kjf|y%EA6GvHXK
zq!vO+qmy}KHd`mRo#@T?eDb4eVlLyCti$(nb35TiL~lt}K^pCS1uLuAPZx|ys1tcb
z!qrfAF%<<Zo!s7{mxdb>>;x7RB~`rBr~GW=D1xJt^73wNUCT`=O5@^iO$B*vqG3b`
zB9H?PgD#}bu-j20rW2Pg;-{R1HEm@~II}54?!vr)d=&9A$6GLa2$(_U=I;nfHJDN&
zcUsui9B@;cAH}Zcc1D24U0wcZM{+}$)|=GS4JJ3Y2j2hiC-a<{$_W3KVu~2<zA!{g
z!a}t^xqf2jVNzj>p7n3H=E@D#*q-FLvQ9;}#LDk{vOU4VaxMY1?CP_HJCn_C1j69^
z>{j6V%Ra}C9_@-S`e~sXPlyS9W7b`m)`Rh|u>`M>R>r+d|GF&dHEOH7W_BMA_Zx7g
zGS%%??|V(Y?7MY09S}DpXH|}2m_Ci0z`3*UuEcM<tfGNF7>%=;!-L}blEo2$C%%u*
zyzkuS?u|FAV=W3UTA#e6u=UQ<&l4}kHmiI;v3~k9pG9NuR~>nDE4#|XTS@c9ZBcZ1
zdVbY<9z~z>>{)ASx?fq%I?|2NHVVu$yL!CtvEea0bw(H&8%xfP+=5h*R3G0J^_$P@
z+YW)OKxeoyn^2Z9(M(imbfrX}o+w<}ZI^?P>uile>Ul&CBdBkg8sE6r-mS<=G@ECZ
zS+`;*v7-Ua^+eJ|u?3hf-VaeVQwgf}?%liZzX)%D*!E27fmlK$7~tLHuz@4;_QMos
zwgrq~rbf-6sh^w`TASC6lndW^>n$@1c$e}NE?g7iYLiuq=6-aZeX)`XGIGVoNiX{<
z2L)1Q6MWPXcAs%_No+$IF%qN-KWay~$MDWAc9ySWODa8eH`+7ONSpdqa#~pQSTZ(E
zKzw&_bRN5pneSVvjbcumXbONQye+AcU<q7v%v%U~@L)6{6$1G1Z9z?p?=3A|p7nVJ
ziQsacP{6aX_mNsjhrtRIq6P>K{YZ=AJG3?L7y?Gak;Fr!FcWJrKahsQ75voD%HQLD
zDo4L7>s%%`P__#_%K?+}>Qy_j%>)LlGW0}p&hL<G=eo&UGtQy$;}z)usJ<zFs=C!k
z$g=dt!hKM!ffc1p9BX-WuaryF>uI~X*o%-GVYvKQBO@aiDG8ARGmw)4X`tlQOLEwj
zO}}=Bf?e?6QBlmv2TaXhb`3do$lkqesh6pcB_xR2b1ke1BKH*b9m3DX%tQi^Wiyi9
zDd9f+EjeJDnYM1xqCB;iM4j`f|JYsIHB0=jf#uayk%QjO!f42zJqHuIpFFqdHDsT;
z?GaFWeeJ5n2RDx_-Fg0ChC}JtHe2Ua-ng~dljK+`L=g!<g&`viu#43nFrW=oq)ayi
ze{-HUZ(~@Pwu0aHBRWmBTeogGZ#|<%uie79B%m5A1wc{mumcAVs(<Weg#e?%0#Quo
z4f80HD}wXA23czKe$zoDsuJ287bn%WC|Xd)b0r7P%7hw{@5=!UU=WSQQM6dpyeG4c
z&$;TQ$Yr+yobJK}Gx4GjU$)zL;?W5%Afp0mA?R8dHjZ79PW2+U30>`41{%wJ!Hyj}
z-d$71h0O+*aRv0DlDe^u-PLRGFyQ27`W+!pJ+?YS`0;~flsG2GkojNLpon2h3hCXF
zAzPXQq%F&%G#1&07+o0pO-U^7r<;{e|F$&xavxI-x0FE?%f98|D;L%Zf6GM3+tF)9
z^a0x*jRJ`AfpQ}w(pLaxq`arALe+Eo(W5^qa{8sqi(aiqLe7x4Hl%6@$B9L{DHrII
z>;o8^G^gnn*(-i%(&T5uGWSMf9UTw-N^;JKK<2C7{=cdw@I(-_X-p-6I@tzr;6fNT
z>jpcqsC$Ir;(*YN)7Mc;i*oP5T9&L(SZg)x#hh4rYs(dxNipJfuZ69GAr6((A6s{L
zt-GDe8~#U-#x;{es(*$jJ9nl@=D-xf9aS<?p@UYJ#!#Sg(JJWfUe9dfJ~NyQcIhO1
zVvBPLC;h&;_=b%~?V!PftEO8WOt3$>sZ)?5MT!&!2v;@Yk5H$x(J3L-Sr!Or4C*g#
zAKR_V9Yn`u$?S|{FqQM6ti1Kjf1eoBzI}aet-oJ=W%%cx{-1W94EwYrX3UyIE3LBw
z6IH%U^K*0ezm)RQipqj`-JDx~9l(Xji+;RX6K;Y1Wa|B8lC;^mxef6Tu1)Ll?~{s$
z>|=(g4oAYw@Sb>72Z(rOw<2L5v3Bj+#LM=vt99=Dg9wRYRFU~4@Y3fPvPLO5t?U3`
zrzCWN+z>i?M_RLnI&ak8dYnaKAO&-oSAHN&<v&{kAY+EiHp@`WjU2oeZKbV&ezD<y
zJ8qrvr&=EQiI6);;57%%UWb1bW>1(X@M6dZ5ht#~9>&D8S4S=SXEGVe9&rg`Q_=BC
z_?6GsoMbivJnwc`+?o@mVONGcTS>)6L$>ajqUDyg=0W#-{vP2Uv+c(AU48)WWLmBc
z`};>0(@~<wnLy)(SeX66yUo>ByH_i6TyDsZxvi`?6jKIYV#M&p^w$!usvP%NJbL-b
zO7Ow-yKbqQS}M6uJ`>kRQrc*`8S@s;pb*qmz_3Z&4Lyv?-X@05Sy$Tx)B|2zW|5v-
zTf1yq`~J=8qbL<n01slkqc}KdggOE>vw&03aeF@+({mcf9aHZEIeXt#y_l1ZX;G+l
z2xYodr8#RWnuui%mo8%OK)4eo7?Sj9I3=A|mcm2vsw^6h%trn&=+JGMUscQLD;Y^t
zMPiWV<P)fPupZ?!22F&gqGuCwkr2-SMyz-2)X96<CDbQ7$2npi^K-JGN0clq77y11
zXv5PC)$Hj`B#dq3m=uGPMOF4h!|ZwFw@U3Ph7tTj!Z8A(aVs*|DJ%AJ<?*#RN-AK~
z<#I!vD%pgR&n`9{Km>mF)z#mPWL~|jW)gbE9ZI<e$rXTNthLA@X)`vbzXW{P0IAeS
zAtUy+UbK;0E_Wp8DFa#`KaV(2@>B29<If{Iy{>iP{?KpVZT-$~>yE7Zr`4|4wy~yK
z=a;who9W*wopm81OyunjR~K+NB9i+*p+EX}bH5|ZNIrWzUEPpok2f+*yE!H9+$Sjo
zUd%LWBmtHv)9GS3gm*ZNIE}G@<p3A~$|<3%J^uS=;lfSnlqek_Z0imN2$Y|(xIYs!
zC6As0!~7F|dM|J9KjKo3w8ID)a4TK)Dbq!6kkP0VFvqZ?x2vlGJMFmJXrlc9z(W{x
zAyBC#g1Jt2EZ`2uy~W(D+wbmdl|XYQmKI&y%3{pQY9(SM?qrGfMI<2J727G)pEOOj
zJ1_=?zJJKLjj?%Rm+DkzwHRy}VnZ$jn%<~KFGaY3ySK;m^Nq!o0G2F84|7E-_c=WN
zHxtbPMD&4PHQRS*@?PXHc;|9s*@saFJwEfX0IC=`oyNzaSvtOFY#3}QB`3v=jQkQx
zi!MWQNr5$wls&E<5Uti(+3eSj!Dd5j-rV?jc3U#N;J_0%o#>Ng!$1<C+bU-!TmfKT
z5GP7rWYEKC-n)1Zs7<H?1jotqE$$h#q3d?LtY#h!gOHj@PoMAaWMZLvHuF$3?hib{
zX%I!5Jlr8Uc(^Z4`Tma}1d(_Af<W+0@;7)mE`7T$17T0KzJ>edUAOVSM@B+R11ECB
z0Ulm>t7-GDrJd%FtgZWeoGP_R`MHB&0EYd#g#UHJtiZlqE70|Q5fKu8_~gm1Ba0u~
zYqdAfQg0a+w)<6!h?ku*Ue0cJuZz8&{@@L_g6<88gc1N2qzn{s8~ipNo>WycE+VQG
z!N9LvsR^(x|FT3XW<)4gN$R2ZWvuZg{u-{zPK3n~c^$y<OVtk6b)%~9V)DhPQ-C6!
z7cOj$+Y{6qz^{OBE?O+!x`l;BIU4bT2#-UbS6bj^lya6w0$oV*F1r4W@D!iBnqe3K
zOAzdw6(b^H#GnvhT<vh+R4M@sV0>~d9)bntnu<>jPWfX%i=a1f;6S*=;md>AEYtyC
zIVkx#KtXV6>(dJb;mOO(LvoUBdiNQMd@;kXtNUzBAHZa8EqQI!jlOe}mbkbC(nq2J
z{Qr}?XtTxcPJ8@3e;@z5!;X)1%MyHxln-8xszg2?2MX8R_0y3cwen2}yyV`{)YR-W
zJ6gXMlo7Ra;NA4(WJP8rnRVA&%&QeMKbRCF1c=Nx&Wxv|<!1=v`pts3E7ow@I-2^_
zVW^NBac(IqK=W`@&K-B=_=$e~6oT(Qpebn@q+v7Hy|;7`ZQCi(zwR7~@Jf`+o4j9e
z3T>)!@!*!hm@V>d%54&#`tRFw%sSdK=G4iP`faA7TY6HF7fl7|*ycYDDVaHssQm8j
zjVv!%CxKz5^+uf_Iu9fZESc=_CgbPQ(gjTGVMla8o1Qalm3_e;<rKmtfPNi;kL`>Z
z?L{1mAa+E$!J^8dC)1cV3DrUFg@~NcTpYaoiReSX0=7kYRh5aw1nd8=1-OIc{`tna
z;3vr_8~~66<U{Ixz}&kz(pJNJu3(<L>33uLGs94~Rd8Lj-&){1{l@0Km8l0XYGhRX
z(|ZQurwY;a-@0{Zk+7HKJ^1EaMxO&dWO;tALT6I_V8%hcrRT&E1YhV-4Tj`e@lHm{
z2p*5*Ozt3r{L-}0nX!ZBHbRTUAVGU=!_Yrt#8`2V_5%m*HH^=E@F2Tlf5~Xug`fPg
zd%dXDv3YO!!S4~-`YtX{<^k7G0X6|~qqA$abmXq^G^?tX`?XsRo~EtJg+)80Z}<^J
zFWLQqWTG}}Wkdo=N5(Arwmqx8x$<iGd7!mF10SGaLHjvh-EdBuFLuNjE}D~HFW_xh
zE-NESj<f4|^BmATG*o6P-LjrO9qzEQLx)Jk=_WsMNu++G_PM6~y@OH`h5-pprEr9)
z;5HJfo&m*#3fu86^*0?G8|67p5!Fl}j{;}?v(6E#5ly=4gfTVP=8Sh5rGX5%K*vv8
zDP<M-AesXRgJ*+7;^IoXSX%S+%y+4Ar^Syd-3A~CarGsashR8fcbTdo!G(5q!NXi9
z>G8~metko&1N8SMZf*Mx9X7A!>b^UTL2|Zjjv$Qe>#%>#>eVig+Po2gxcCNo^*4T)
z=-i8_tx+6d__H++Q6{$KV{9_KSR&&f;1uZAlDRhMRQz)umNcSi5zaLBNkGP21HxW9
z?;05nf%J3F%#(VQ@G$-;KZdcmGCD0XnKR2L`nhixyKO_qk8dfOd7fb*?QTL}Lq;b4
zD2ff;WIt~7=xtACt>CHirj_rr1DA!)zZg<Gklx_N;t;D^x;n_WbLlUJ3t7NpHZO%-
zR(|UNI!N^-^uGWAv~U6KlqMEy@jqjAK^eL2;$J*gC=;oc!5KE|J1rTRe5l3tN!bQ|
z^&#bRhp`A@)L7C2n@c(ZoeD^ng64~1VOQQz5*8E|Dlv7`y1%u%e-ix?w4S7Pkhg{k
zh_4Ivca95}GpI8YDPY?Bf0^xgG_6+VtIMngicYB+b8>S<>Ba%diG3S+8l?<a@Y2vv
zaVQr!^Hu@Ci4{eR0xBv#gA=I?g<cbzJ=$#KCY{TM9L-wc<u#PvL4p84Q4)OHK_kGT
zN)QIg5dsXHB&9V4m4O6qkoNncOJ*r#htzYTFUPBeh(x`8smp7f?wK*vMzHYlb2GFO
z+`NzoP-=_8gjx(~sWhDlTWsAIE*y2Hf&LT$Vi%E1dPMn6IqP9uZr%4c)t4v%0Gy7*
z>J1n;Q0BHsIm$ga`%`Th@dm7YNi0w16UkT#QCuZ_15lEL`)&6;pe=X|@yUa?*u@SL
zYYCyBK&hf7pq(_IN9BeVBZ!jA<@Ln{UVM8#rIhCkC@cy~petk&LDT%0m`Yl_x#Ju|
z32|6|x>I1{)QfxnocMa78y|$-|1j1tqMLSeDUif7L-|GZi={76_IN9f4feYTgmHQa
z(#6ia(PI*7T9|PaYzSOWv9+yG@30f~U40v2Gi~*4{E!1%aX7P>aXvRrur(?l7t`?c
z@7}yjMF==9+N>ey(5@xlzpqN)ft(iDrAy3_N`1~J{?Kggd}Ik!h3s`19|(>h+)1x%
zx{66JUhHkL_O;V_ZIiz36o@uiIB9iXehE>xlCDS=qA-qxu5c_uR8q+e(~h3>?Z>U#
zd3lFA_SV|W|AJQU=!@oyvQLcX7JK%OU&e=J5_KfN8^dn}7sR?9`bHj!kPAt%8OQOW
z@c;xoq3^u7Agl6YDNwQmlnChu4$oBEh6Ssw(ACIQFQ^rWQEJlycqICN<&2T{Y$oWL
zgc>5t?DZb-tt+$HMK%b?L<zR*fJV0e>$f*lM}9kk<bo3~a_2rxU5m~~rZ5c(sU3UG
zZPI7ks&J)wRdEwe%3zfQzbF<4Mj}4J(9^vjg5hPvXUfFQIrdBEf>pz3#BgYXEl6zr
z(4j-yZcIVj;qdz)=j?LN#pgHP%Xk$<wQfsj_4JD%;_a%O4u4xa`}&GzQ0eqZ=sb~C
z@H{%92o-HN2P}PUhsn2<`NNa!;M(L7!AJrnuptMGvW2SyrP;T^KJ)(l6dqNQ`vvde
zn;V7pN`I7o|HHDNLyP&|b{__#@mV&TK8O7qaO%`QSDN2_^bJ;Fbkv6~Et)#qngWh=
zaDX>dxWs-a1iVl2-QD)A|EB{HW54+otC}=<Hm4<vgdqUwh}|ic0-*Jzvf$!^hokZA
z7&fJ0(h=3>X?2z)0WnLf=(MKZ6?;?OAf0OkrdwoRg)C^hjM2=dZ~BpALvE20=EJ9>
zV2X#}=EM~iLwXgoPrkOJ$mXO>P{it?RQ~t^hqP7Sx*T-EKd}`}J3n`Gas$PIVtM&5
z(2c4sP3c5<T+FuzsZIP<_fOkeK&Kz|^(iSSN{BaQej0oc$xaV?d5?37i(31w*?8OD
z0%+9MnH^P+d}v3DrQhP{gl<&Aj>K|e571d;<6p>k22Tk<VS)bYci3``f+s?>LImFI
zLyot|;UKWVsrh}YR<D+gh3-dCPYyONzeCe*zlLonvOJOf7oaR9@;-C#?r34QGn`(o
zbK3d=FUvrLh1+0d@Jw~A)(!%@VHwDQ&4I4C>ck$3-YI9lo0tzCTtIJTdNmQyg#3<F
znUJIvXT<LXx#CEnFoG~K9{&(ZxPtM4&YzziHTUh(z4r(BNw|1xxl3saDIL#UzHF42
z2LLWJ>8K&MUA}>{oRjh8)umnd>n;lxjGWL4FFSdhEkaYNaKyU>(XG#k?VTod$E?J4
zomiAq)+kLrSLf6Ty+>F6j_X^JIk=%bbFa_ad4xkz7%avbY0R8CbM*dx&0ybjlJ+uL
zlICzj04Y7|&Tji_O_wMJ2S}nCePNk_UcP=eU%<z;Si7Ys25I5x4txHc_y$~FA{(Q_
zAVs(W(;wEr+i+AkrDdmqT;fOr$3TCss(=fwqjqgxRgBpmbq{rTGn+#Zs|F>5MwmGm
zbq^mUj!L*}Ne;lPHht3=?vD_!Zxt28kefhL37$_x0rIaE&Dx5M5yE+B+!V<Kpwd%T
znGc7{#xy{%pZ)?ebfdt&s9Wxf%nea`udRy~qo&zC?DCLtF+-syP^fL6v3t9Y>Fww{
zXIDCm@80&9RRqJWwT^C|)cx7`FqQ=POF-tQR)5bCQCVpLxOo4C+QLIq{O{;*W=8+}
z<XRhzknav#&OfSce6nWfoP(e><^{8cAV7=S5V#PDD15DU^AT5`=W@~jN7-6YKluGS
zmN!EK%h8?x<x6u$Qv}UD%Zimwmc59W=GTI`$LJO~kqB>Qnu^1(fnfm$?{%Gr{?5j6
z;SwZ#thNpNvW<){CG=J)PF>L%W*<|_2!{tiTAb68^9*5#vt|9>y;DQ;q5?I{PE9yj
z{+*F1B}-0rGy(?$G^H|2VrF<KF&F=-en!P5yka5Jl@Xk|B9Re;-qxxj_AgT;@(T(A
zDNqi2kRnp7v%k{EYQa6TcB_hN7tEXYtkm`K!-v~*PO&5VI{g!}>=I;dXu8>`ee+*r
zNy}nx_#P7U`sAV98zHNliD=t-dH($tt4zMzk6-U&<uv5xLJw-T2lEsEu3~uc!3`59
zoSc+v_4eS03*`KHe%N#H;Fz=?i)$G{OxZ^LSy^;T-EP8>v-BNwthV#!eTd%gqv7IQ
ze&plJugy-3J1^-!U`L{ohT`G|lJ<Ia&ciIMH;&u1xX($%sjBwp*n;>@CkVMYQZKxB
zEtc#{Z${58dP<pDiVWtE<(fX^E=XX<-_yyrVT8)h`nSJ>5#6GY_k4RR&TI+Qe=_{d
zM$udX$OgmIiW-)xjydtE&g)DmohjDjXccU2c#J|wN;1^!b@hp-#+i5#E=B7}Nj=Mh
zOHYwj_gb$Ts?*=J3rjCY<%nkPM)+9h{av9nz<`jO)*`;)kp|PcO58WbFdGhwwuOCw
z<v|!5Q9l#TO88G!MAo?F8ed+kjhI~2(?oMj>07{+_S=uWT{fq>u|UTFXM$HR$-9|&
zS}}bNK3XlXY<jC?uo&@5(a_SQ?5=E0wMt|IU2Q5JJJy3DPb?_y<5(>yqR%Ph+gyXT
zG$yF09rdUv(y)y`dHmS9&&7q5j~qu$0GTldv6Yk3(j@kW6JCMu8uQ<!<u9sLfCVT*
zh2@YV8IX=UE|o<)c3U6r2SGIWFNGX%SJ%|^LffGTnZ|F6$VSHqbKswks~B%f;Mudi
z#rOn~3mHKEs&w>XGvcBWyL?dTgzn)2KC;xeQ>Q~+yB6HuC?Pl)D^>9x(?k%MU-IUP
zCLEv5lqqc>O&ambw&|Fz&+bYh5Hqp!V|*5nycbm`3H7iyT!%3bFwj+i5zyER_MWFj
zKvtX%-qwN<J2IMynJ}FF;srW0Cvk_EVJUWH<`d(+9h&|+1e|7IY^*Z#@*-0cJ-jLJ
zmy^sOiZMopenmZ<3C5!$uN}!w@|!mu*u@ltKftPmM3*u3+}m=CiB1F2xXdFRwECN&
zOmIbR1lw0YRnVl|Yx}FH)z9l&+06aeBDl|n<Zb$WTWPl4ENuil`d&YNE*k@NBLDC&
zQ>ILTF1Zc$V(R`tr_<4B$OR$}v78`24iLG5#Z+yi+s#aYUAHi8FF|*cHkINzlx$A3
zv(CAs<QEs8+WTNGkn75oUt7iq6~nO(DuHQ~b-S}3BLvqj4akU_dMAFXK~FvJTrgr|
zo}zB|d+On_-+~l0+or!B<?$!wRFm*mj7t1<=y^T6v*U+Yj|I}o=%7^OJ7C<4oT=ls
zIyFZr7CrnGb%^ts$@wvB=_c^0P(q*z<~X}P`F+*#wMZTunmWOI#BNz>elYLUPtArV
zEMdW1`2e&78(B`EyL8K}{9#Q;0o?#92LbSc_>s_N(evb3h!11)5?BAgFCC!ln3J$x
zM4}*H^z^J?TY5Iu)v3a3d9CKJao5m(x#)6U-N^(0@KnXOhAB((RS_t3Fbp6Yrw@-9
z5`inIGqfztZR*T2O1=#(4j>%i4)Xr;L}S5g^4ZGV+#pv!U!f83HJrL-t&;AH*a_*(
z$lOzAxmxpL@n^Nt^6R`kPO+lPR`-@)|L#3~+B9gS-R1efq(}+<j0SK<Uen}Cu{#_7
zB}5%kqu64IW!D*cldY&+-?zV5R`53NH{*EzjA#YT{vEf>{a5Zyw8bxePBk)%7;Pl|
z=<oW(-(B>R&|8r4Kwwn))yU&*0nr4>M6<-kmb3$$^_uAJH`4PU>BP!dK{AC@A^Z4d
z_-W2Dv=F{hBvGXOgvMpM_R$eS+G+KN)_)L>g$r-9Un)r9k_`*ur9W6VlyRm20odZX
znA|Xy`l{RDSQ3Li!tMg9Uop?D`#DJ3OX4#m>WCMEAlaU$N89g37bMjbfD+h<SYBzz
zE6~Ptm}EZDuhWN&N`i1)KqTRu15TdoR^hc4*Q?O^$OPxinS+1p#ME!iR8@-&qU0Nr
zllOR+6vgq2-wa@RBQvx>zhVB3nVBLFlbaO~g$Ww!V%sGTfc8h$eq`iUcqQCe%{VxC
z@2iUAt~hj>R5Y`Dr5op`aMP02+iP!7dSTT}6YEjd<sN$?vURLCBsYVHB-TZ`GEN_n
z`{LeV0}`p!NWpNp9TSF-LWjcpw(R{N@!rcgD%_bf|5mH$*`vp%T^4)qcRktigmx$Y
zpA{iaWlO;eIrBxC|338qBz-{0Kf4jMDQMOYmaU@v+HaV^0gM3z9pq}2_8cAJK}RPS
zjgTNX3ydku)|3e_a#9QZ#hKumG=rjAENV<n(fN6cA+m-Ows4Z*C=Qm8EY{1k#B~YD
zE6Elh23GnUq3~#q&aG?k?G&PjiiMwp4*~<Q82Az6q;PR>VRqn9kAwCZIH}I3KT@l6
zMz;;nuAr&Df)`wQY|>Nwyqj$W_7dNQme%m<S0MQKHDndw)*3Zxlq5iiUXBH1xlpBR
z6vcnLFij$4K{cpl!Hv%morFY1oW~-%+@(q65O*gqtutmdj9NE&`{uscq~F2<4UHM+
zOn}7yW%~&mOoh2V*|~+A4u>mEvf`ND&*$hE{ErEBSKAnI=eN-)wxw?kjw;S-&Vv++
zYHB%g{5MikstVqVk5ljo(d@%I1FHjbZ<rP}W$ILMCb40gs;Z{&`L+l5{;=#CkbBYO
zGmW`$gQobQu%k`S?XCjwE=LsKPsB5j9B`@V%7Wo6NJitI2`5WTGtyn-)w*Y0GB!`U
z=iTd6+yUzUuBrdwM2fKq5&L)SVzM^7)Z99HbYsV{ANn5Aeh<b$FP)$<XZSF5$RkXc
zQa}ZGV#7IB&X-sA8$F2vGVFLJ=H`mmkBy^P1|2NZcCN*Kbk_%nAU39rURzKQ*hUd9
z2&4ws4OSzeRi>zxI~{)d;6X|6$R($&so$4kai5Fs5swTa)(w;z+%e^3M;MTd2gm-!
zZ1tFpKRTNHvCOhklr6JP=Ge~awRhFCb`yKCBgOfL5DeZ*u17+g?$!_8xUWA&98lp6
zon7<SoF0J9kqezNq+w;lYro}V!wp}5xENZtcy{h~)k$w3f;r;=`fJHYR2wbct#4j9
z5se>^9Q*LWA<G+?c~2(Y>XjekvNQfxoUzNaOX)Kp#oUI+CBBcfd|P+Q<eslYQ@H;4
z?h{aL`Rhh*S>pM<S|hR_930Y?1YINj6MK8k956qn#naC##ON)_9TY6;)OVCsV$c^)
zg@d6%!7uJONl3)v*0S$+k+PB}PGJgPr+dG@$n(WV4S$GkXajFUf<I&hb0dmHYL|n<
zEpN=rt^b<h{NY}>?_29HR_jO<qH{}yq<TW|BmE{ylSb_4E-uyGlq{~z(%>J~F{&5|
zF5zVCQ#4}i#u`Ss2@fF&BN$DD@|45W#mx-^Y;!!Cm@<Xtq~M~kN^m^xLb<@m%xH9(
z62Ht={6*4vVMEQ^7k}xs*?M%URhY*!d<`PS7bb|$!{KJ+7BS(Z;b&{si?HXNVrTPF
zaHUI1spLB{;M88vZv!5ojz~_JXORL$xyIWs7`eVfYS5w6iFK-Ivl?`h-I|6fOD2`i
z*A|V;jEzHD><fF+8*f5vY!}*cQs#u01~R#Q=g!f$v8~59q!q3wev8j3Tr6FwfDgXD
z2Il7b!(zS-nxHJBwvunI{(k+z73ezgI1{Cf?g$^Q3%rhku2%h|L*sET+uNtFi~3H#
zA#kUH0tF;7r$sFyUgLgAfyb$!;?r7^4n5(zC>w&2o@2rbf*LBnbb_B2wgV9L9Y>^Z
z=46glpdlVCA4SL*<t|aH`8Svj(I3Oy(xPk!*@o!YJ!y)z%bmgs-!QXREodS1!#*q<
zEXEUt);S%=8%L1k5u3Ra07FCo2f8H(BLozsZsooQ+5ggiOuh0g!rK3qQ|&nK@qY?Z
z823{(2fddr<u|H2G`0q3*mURJu#<Yl-TpuM)-Lp+*`9mX`s-LUCQ^awbfl5}L9n*r
z%AeQ-;6)v}UL<IN+3F*gwK_(Q6YAI03EeC$JAvRUNS?9OM~kC)F+|~b_R}2c;dO0R
zP{EdqmYvmOApW>f@4yqnjw-9FssaePQ2H8X>V==S;Ao^sHotaK%_X|^KVZb8^Z$4R
z7hJh!$G<e2i3eq9n};kFxNy{D0_4QSb$!0VPdu;-wt{nx4?Xnt?kolKvaueTlWmH6
z=xw%rTDn>-!~RCE?){3K?;zBX@N!`fL4ajwJpzG3&*!Z+;<FQfJD0skxWPuKg@|LK
z-LjD7N)KN2y?$8W1Ut>_m%^wL*G^W#x>IKK@U#Jfi%V#*6c&6sk%`B&Y1xw0;KW{s
zY|sL6yo#4RV|M>#@9$c*8)KrqrB@lz1l?+L2VXg&KmSujHtqF+Q}GtNwQ*Leb<e$M
z*|^)-@VplJ9X0GdOa~)D1g*sYmrr@Ws&M<jx{k-#GBf8^3?TP0R(1TShy|yA$KF#Y
z+NIy}@nkC!QKp~m@#xZrSU=4>t`%<`Tf$STDhE|u(Kj0sJFs!xo^6TOJ&ei|&mCe*
zBj*na4{wcVosw{#_9Q2J4ttqN$YsTIHta{pit(W}Uk6L16B!k9N55H;xSfg)vP9en
zp;fidjPzvEw**-J^_MDOFI`DeiqAIcJzP%Ht}W{-d|SU+1CJK1<qJr!E^69AYpQ#Y
zV^pc-Hfh>*@x7Uvwxq1G+g=8_&TC*H%2r}9HPno2LtT|9*eIGD5{G`c`TIv<sMx5D
zYdNjH+%+CD<W2NilxrJ-oIi(ahIgo@Zv8=(C@dL47)4<P6|1CQ1upEvtLRnOEi^sF
z|F_<nPo18pg@65Bc4F#Ktug&3F4|`5+~?)7SIRYQu3R-*`uf=EtO1d8YerV@9diH)
zdX3SmD$kWm?xme{^f~Y@^~KV^JHEMEppiC7$*xjk_qa|Czsk*)H*x4ZG$7#nYF*3S
zcCly^=DR<ty;WA@_n~zT@uM=*iv&hWd9#2Oefqc>1+;8!#-(bUT9K1ws-u>hdY|fn
zmY%g5^X8@!xzijtrf6g6G{u{SI?hJ-ufgk^H7?EU`i}{Lb3=NrvCZnoTm=aWB*?WS
z_eNW}GHOdc#nbow-wa7)Dk27<pZ(mw?Np!-F#<0daN*K`(x48NzLpB-hMoADN}MAU
zbI&n_Wo2)V#|@;!LcGQeOcG2tu)ueSS%jS-Fq<M1r0iwkZ^+1lp7(M4q0rH|GkX6q
zVavE@$7##hs*vY-YODxAci4^*$S)-WhoYWu!*91ao)urb>{^af*U8F_%slS>Z)8o!
zLuc8AYo<rNiDk6i&YcSRQ-%*k&A!dfE%9CFF<GLLMe`h-V?BBfmcoF;Cd7)yFaFZ3
z@O?ShF43Fk1HZYoGjP{xx*_H%Y8o_RNNo{nU7c*E)7o@T#(Ot|vGyG(S<!0>@9O#H
zBhe2PSVu&sK$R|~AyT4O`@4fvZhu_dyAhD6Ds&__+{(Y-)D<r$nUM0fy=3fo=H1-f
zZQ?O!Zd4KaOZa1H>BB9HQEnL(ttK_Jv%*N*VUxJXU^*`S?fTKnt#>0voPakayQJ<l
zW5q|t-?1HvWP+IC7>Fc@poT4(kDLweD|E&W$sCQ(*UzX8FKIG*s*+O9E9>g0!8=QG
zp4`qViA!(vzpy3|;Yx>#jfPi^4=TRD6GcX!)!*JL&f8J5G2!89;{$rL^cn6Zwi(D9
z-k=b6v;=?5bL4A}$&wo1rg%OW3wIi+2gA?4C<MMr>Ex567%_T4blDOD2Eoh4$t&~b
z@p~}*L*|k?=9{j}ZrHp8K(WMsWBeke$8mYb5?Y@!V^tnF(@IC@?&VViEdsFHocEuJ
z+eRd^5;G5pDq0;feFx<)Dy9UYmHzkOky)TLQA+iY@Mt!VN4fQlxc=(qf0MfJ-SO;e
ziC+EN$ItwnEl`vX9Uqn+72DJ%=<td084unsyGDU!&!HPopqi@qxTh+^UtNgHZBSl}
zZB*2FtBPVm6rAr)Zv<zA$hM@$`)DS%LpkSu*1bqfV`b65EAR*qIU@&;-(jjf0$|gH
zs_4hVCF`X9;zwN$zB$@RCO!<y>NiBcq$vfIgn2Hij=HeO#%^?J*^P19O{h3we)2EB
z@wDM-Q*ltRr8#;z5=e?YfHSsNln>oDPqBi$Iz<T&kzsSlO1PScXG6fi_3K773Ro)l
zOTt3iA;n8R4q19_7R{^ZRHFQU4dz%BXFo-Ra4nQOJuUmbRGVX@w1T?=_e#~yx)7d%
z_|rdxPc4TOkTJK^?|s^D7w0G>-;j|dMTuzzUUE^&?35R0=ST;R<PyvvcgEM-R%Xi$
z7rPI{Pr@`$Fv3-f?5bC7h`%%U%N(a$Yum}6N7%&NMOv6w-VuKL;&+W9SjWq&j|@sY
z6iYb+nlXRB?!Ohi#$=HjR_0#1D3o)A-$F;t?T0;<OybR8rV?6Od^vI1ppH`xNIOqu
z2(7ux)77Z(z`tZ2P_p4>r53+3HFBSKZqIKW)I_OqKJYE|gJs*nC|Kf;#xh!m14<#M
zxz@b2De%1Lr_d|=d8SW~XcLs2q<q)xoi~R{q)9<j_Y#eq_7hp|24-fh6nv}RjH`e0
ztOiX+)!^esZMNdyxcfSoAWTxsuDyNh+^1pU^ywK+#xH9$FZ6^V{N?HQ><YOXj(bL?
z8Q4BBY|+$4#%PP@e*XS6>)*9wuQ;h}|8L*tZV?eF$^1W(6iysK3<^I{gOYRne+)Y%
z_=70xsDobBhB`x<;I@N0<hFf>XyH=Uu+AYV6*{|UthZizcj46{aR}0$D_bszB;AL_
zDj~=m9stDLbsfDb^~{nVy}U%{UcK5hE!)WZ-+4o``f=0`3HW!^(OzQ?RhC4-$G(F*
z)SLAC<;?plSFBjIwo}_Z8Ma0jX4`_tG2RxFi%12QGH4vrZ~T-VWX4Ep{j_QO-k7=Q
zym=JgZ=?u6X>sJ<0d=XbDNCnXk)&2qaS`?a*rmpp*p1f~Llkq`7(rJ^tG|jjE0RTZ
zhtBITVw5L>Q;{hu=a|^HSod8pvLQa94Mt`wZYhxMItwRS>+`KXLTHs-=UORoL#+$g
zh&W@2We5{>hFJFEI?(6u%3nle->n;D)!pd0LuiI13Li5Z9N0}Dj*^Nu&k7^4g}?p%
z%V)rM7eo(;yFV}gmWT<34%G3@Y+iIVag`&64h{Iber4vbZrH))r~O(2VT9Og1bP})
zJ#0{2j*7%{^HGRjlR*KfjN4EAoqnj#fGw@IZ#J0n#%xAJ1a~1vV9Kw-oZ3Qz1k5z}
zKu1;fw5-Q2hv2ZqPg?1Q4$LpRe^CzVDW}3MGl!U8<_Gc*bm}<mJ3CrOm~TFlv?$j4
z>Z~*O-;N+6fq%%<?%Q{_<r%L*2ZpIQn|g=;FTfb^isPw+SKob$&0Za;rYc8VmjpHq
zUl_e$f9~ZQY45&%(ZkON_|HG#qwZ9zOjM3eER{d{C#(BhL(x}-!GydkD{nutuNDzf
zz_A5%5KE&rTfOen)m@M#JQ0OdTw;~y5TI74L{%Y43a(?l`Wf+JU<wNY3U#@#%UgM3
zn+1awyEDCh+NnuvOt!cxZQ%$6TcyAF)8~>Sp`mmY@te3Q(Wb~$XaNF17Oe&hwP#uZ
zrx%<ZE^jgrr(Ds}XI@w7OfF=$&y{skW1rl8KDc$$#^n_ipKE698~m$k5pgVc7h%#k
zm07r}VP(flr=W?G;{5v2eIz!NE~#I%mwASEu`X{j=iW3+Y)g*An5;!ZRA#Qpp4&G*
zB7>Vk2E`tTa4`HbQRltaAm%1;Amevq2;$cli`9#At=Y&*Ur>%GRfi^on?_}p)%bK+
z(!zAl(9UY%NSOa5&_7#vhm}?n-`jovL-)u1)R{ozd53vyArKuMt=8(GlSn6aQmB84
z#Gw>{v0xqA+_P=hOZwG0b2RrK<J|>PF%N0o*W0`kNYC`h&?k0eWDvQMUdXs@VlrAP
z2e~|c-UXmm23QfIj(%ckev%@{9i#a?hqlGBJ{Xoo27`keQDN5K95>hC-U0Oh?jMTQ
zWV;`xE3Y<izmnUP`_!7ne){DEyjJ);tuFWx^~%f%W%vbxX$Q=7KhilOq9F^ssHf_l
zmX|OK6LQgB3RZ@9IaKwl3mTlfy?+H-MVZ_vvIF{K8SEjJ8yFe{4yf~1`rcyLS~_j2
zcz9*{fmjDw!a_RjHHuCojYXop<}X~>h)M+&X1x{kGKC6o{7?=RhyVn>Uft)i@e8Qu
zxZW0}wOqc)#lVp)6qf;!;@F{+A7tV9q~CUCTMzEkn44569L^&~b%}-8J@ZiR@qMdd
zY$#4d)JYwlTg}Ml^~Q%*H6jp#Uw>P~Q+pGCHX$K;_uqTo!b;)k93XKqjHMGynMoZ$
zG$r2$Rl$n6aSb~fKP>I}51o?!E{9p>UrifQUcUE3C@32FloBUpkA{@O3PjeAzDiuu
zitb4>;-=iJe*Mo?a`#NO+!iVry4>k^dc2{A<0RANhfO(5x@=@ci4z#-6z^UH0=)9r
z+|!EuuvxzjT2z@kU+uBo-c?W6;Z14WB$cNFe5y0Y)wvC--+gSs{7bJ}o{b=g-_$j#
z|9B?ZNGPB9n(8V)d$!vL{w=m^i9S9sSPAS{N?eSya_z{#sE!>=E}nVkb$hP6#@>A+
z$7`dgmbo7xXO~}_C3Q<}tw7KGh^6~yH_f?sb1U^fauUZw7MltC2ds5{a(IM-=fjN>
z7f<NFfT`!_jyzjq*088wdzO0HZuOI=ro~OE{*hkbHOu_itF!6ZLzI7Iu7B}tv1wq3
zhD-O|FGyS}Hj1~Gd5>bm$jl7Q9T1ux1QAQ^!dhnP^7x4pQ~phBZ!%iByVu`4DXqQh
zX1A+I=v`93<=BF~jqHP8TUU1tFY&jTS`%2Y;nkw*rk^Yu1rGH}Ju%Py^`gtB^STVu
zfz_3(gSG}qhA!1kI!gcz7J7?~R71mFRJM04xllseQcqSQ<t!J%_1XQl8PO0E5P;`L
zbl);+fK6hxxCa@{lK6CV0%<PKN{4rqsvPSU+gfHMp*`z@8HKX{!?q|GJegv{;i^1h
zyv$Mp5RYnP;5M>&w5@xO!}|s>b@AGYz^+=GsrBbxsT>QRfVl%?30b>WVVXMY*!+P}
zF`7T7WUL3=l?+j=&lJjfPktvN7Ik&k<j|JN6N4)4Cx=e~S2%GWkS;RA1J-9JxHKzZ
z+ue$LdG0HFkoI@ZGuM0;4n3w6{P-UAdmDixkNa(>eyg6YG!fdHaW5jS$6yH093TuU
z;*>>GZg$bXNFyNA>}2wBSa^8gu>}$@OeBKMz?LgG{!cs9U}$p1{PSmf1b!3LfwG$)
zPr<Tsxn{IU^H;B4iB=BPp~!|%9j-W)aTiY0iH{VMIr5gw;n>>yRsg}js09Lv0+98l
zG2$|d?<Nh8crehsi|&y`mKF6mm^bpz-1vGMDvJ3*B4vvvH96DbyBfNlgN|xcnC~Q>
zoj5G$Q56!9RdiP=KueSkba)T0FNg*+_Hs_;QUn3Sw4(qG_|@~DPF0yu-x>gyf_Z`z
zuUL3+Cb*rjDnO%{O=MI7W%jFIuTU6aeo{?v3bkBQ{bx+Lot@p;?`iT4gtz2mr~gMs
z<ukt_=k;rI_l4se=bA3JnBz9Y_2!m{eq&;(|DdY}Vda4LKjc^aYH7V+vqM~Ge*%R3
zr`*^uv(M$&E<N3~f@Lff5T?*39F~%uO{*s1cW9W!W<g{xn`M!+C)TgX&F!?bP#9az
z_i5>a;5+1*`@e@x$m4T`)AoGhjv9z)%PmNeV{vC7XdSU~<p7f{!%_C38*73gmUJ?1
zAD=a3Lc1Nbp=9Fb61xO+l;V}3YjzhC-Cg_Mi+C~n^ThgFSNyHNA9yvz-*LQOGoR|i
z?LljR8RB8s*1bs+g1E_W(^qwvkA<xl2Wxovhi}y-3rG@z235>iro^DO_Sq|SO6u=D
z3)ml19XhG|y!!OKZVLyQ9kOh_J7Q$#78dPX&4z5FJ_TVbKv0Z}A(g<<0Kz}mF|OFh
zWPI)2%|=Wicj|!~aL~i+r%#+nLS-9_l!yL{<mAv6ggiYhr-F+GTckrwfuevf?6f5W
z1%<p=0SGam<}6L`w9m@V_e#Suq-P@0YTuy)4w5aQR&`COFFFkLv?Z)FEAa2BQ`^dj
zBhh0+Cvhu@G!A|-r|eEH=Y#M|?8C~1S8z6<Df9aNag4&1^M1+`REz*Jt$&s)y|ISo
z)S2`fRJIF*C7;TM_pk#l7EqiwTUj@UtN~?7KR|XY@&~9*?+fZ~T$m~DFXr}ZzxiPg
z&r)j$FXES#rSL%DDAsGPN&XCsf~3z7m$!wt=lT3_7ezPy0c|qX;g9cF|Eka=Y)K!b
z1ePGKd(3WMu4o<~6B#U1kN$Mf_)i2}Cmbw^#kvVUgcK%J?AQ$-MmI1PLA;>2&e`xV
zANlNHTZEcdennDBK0h;is|1CRaw)Z&=%R#K=A`0O;DizF`Z+?|d5MBjLNy1UJ-g(}
z?;@lb<rwQ_1QUwFRo^m<E0`nY%<V=w$5|)0zc{4m`D2bAy|Z8vdVYhiR|_?SlYxj!
zMxC)8?n0aE+^w`pBqbGSoBW?wnTdlM6kQ9dmoleMV@VX1LXVgCgOA5azeAjV^io4{
zAfRo6oNA6_g<=8Gl#uLF-%3%1rD*>#mWoe4il{&accd)8*pOsS=r@Ol1=yL4@kWXU
zxCT=4v+AtB43(s`Y00c0l9;c}T{Hi!HR1inZ>$AP(DkbvFEL6FbxL1Z^FBC4N}gGR
zV@V>lRqgv$cx&@F8y`yQ5^AxY6A*!OTM!%xASfq*IP#LmYaGiM!C#j0TdQA3BSjXe
zBevwb?n?ArYv#7NZQI7oSSuyG=fL)Ts)H#kf$9Hm`kb_|6p8k_s~aOSZCC29m8Q=j
zh2#e+Mnv(ys;?v}UhNu2Rkm$ev4x6}S*xT?uODdTY<;yTjc@ixRD_uQ#F;a8ASm!Y
zeg1jgL)=IgF6@a5)n)CXq(NS&O8J1M?n{=bn_;_Vk<)!*C!E~L>`dC?NieVj2e#!b
zV~t1>3k;F`Apc)Kost6>`h@U@Vog-Y9DpLkbE9xk+CPULn^l(tAjHI=^`!NZIq`go
z_SH*KG`7zVy=nKrSK)d>RQ&QLp4(IEm-oAH<!EU$w~4Ab*S~H?hqc7N{>`w=Zgqr(
zu?yv}LUbr&Bw#JAg1kD3$C1rTz+mg7xMl4uo6KLc@7A(5?=z;YGIMPomFcrU?b!u?
z07>OsYc@uyO4`TEiGW_iQb#um>@$h7r$zhr_x{ycY8>0$eTPTn^k9yMLaxdYOx>t1
zd#lN?KmaIqnqsqYT}BeRKP1qPdO(V9`N|p^6cl1qA&24?>R=Jz(_27qi$;~70rm9q
z-KSVF^U9nN6mrCD79u)!jceoQ=a)>7Y#*I*I4GHF*o-MpR-D8_ej(4oXOQrwnleNp
z{hzy6b6g000Zf(l+;Z~CN$j}j1?{;g>FszWe~tli<K*A~@6@!c+?k@fEPBnP&Tcn~
zK782734!*6!kSI}CtRLh5J!ll(D0o#@17Y%YeA!gC}_m^@lSTvdBC&{T3MpQAu9<f
z5+8$`4a|UFTvSpb(Q<if$}B}n3Nl7vkh>*z>mAoAMJ|nZ6NP}vlwU^AeP#4G-}R5J
zo`ON5go>tZgc|{Ww!t+c!UQL$9h}_Jq)h`<;29I}apA)50D7~}{Qbdm-<|`;%D$CM
zy6cR)OeQG8YbYc^L(Doq=_8{NsikFjDPqnhD1nA@n^7MMt4I9I`A?%6X-3^ZTg&g~
z(^xPPM&L`a@jI>Gjf@c^Nr<FH(n$!2%KW&uB7_!?d9w3Rur-bhIaCN7l7UV8&=dD=
zxssv5GCaDRc`q8cA|)t;sz({xSR*9mxog4qT8W&|^)1q8rzJ1ERIL?;58{yTmu}Dv
zg&cz;upj;^wEm^EB%!JemcN&Z{_?wui#dtnDGR0laWoF){mU>lyu*b!@Kof+pEXMj
z)`_?^Pzwx_=P$WXCh-r$6yPd!{eLc%d64M295)XBILJV2b9uV;ua@Fak-R90ZsS4I
zE%329vU!yocnWB^|KQx4{i++jQ@CBtwx()Mm*H%GZJp7hO(FAxlw>K;0^?cAr?^am
z!X-ych4H04KS?Y;w!or1N$KU~#WSyLRyr2!j>Z>GMZ&Bk5>9c%_^ZqfsVJWF+c#;C
zUYEeX`mangV`g~H)@#*O|BtFS0qZ$!-?)E5LQ=9MTVk@4vJ*)Olfu}E7Rpk|sH{Z_
zWyv;F_UwCMw9-x}(j-X{86qX2EGh5j`Zdr0eP2h%bIdcN-*SKN`?{~|yw3ByVsF={
z282@oB-XRnhF=FK6;-T~*<#pCe_#IDw($`u)VQpbV>>QQXA?wrd|`TZ&h6nR78?KV
z-W*p)B<6`oW&B}F3nh}efdUMGi4t9!9U=;*)~1A(X`Zw+RS#}@x7g5k;ed%dLtofZ
zp5b62o>*ey0H6v}PzKe~?W`X?@B>|o50&$_9+!!70{Ic+Ect#uKPxOR&W&loa`%l2
zBeaW)y^x)0C@Tlokr$Ha&rF~EMmOf>&6}C(o}{MiVnvjZUg#dUt!fZbTUuaPAX6??
z{gTdAZ<~I>f>B^8xoGRTH()>84LV4oQ}U>7dMoN9A&CnImXSnl`prwlLWL<~TkGw(
z^`6EM5`Cie7_|6wx;ReOxy~CW9L~Au6n`+uPGK?G#r9pTyYnw~k4|<jUQAR0IGWmX
zU%tyu%ldWQo;?f(Qk@+${H^!YTUNfNh6dd(HcCb;DF)miu2&kK#B{g6>(c?jOWFsi
z5(S0bCc{nXqDzn<9J!pF1E$9!`w{S@^C^NYvFoM)kzAShmkjO65KeAATdUZncpbSs
zi)}?CS^!^JQts4<<DGV<Ia*u;o^Ol%jqIhnjPa)Jof4Kfk}FI`nnKBuu}&O|3Ovwd
z?yL+m12QLd=R8O~HfvTO-Zzi88URGBeCF8(zo38reohap^yoT597RmB%ub|b(i)rs
z>bdmB{-XiW^2#_kxH%ST{!`AicmGAIaiu03x&mo*v*`73M4uqWtW?S_NG7D402v;0
zPN`%B)&uEBXfLU0>GUxIs@I&!S#rfWvm>QYa#~s=97b-TYb-EYbZA^?C=@;AFS3}3
zIA|7L-Dt@TkAfA*L$E!#Btn{`O$hVw(*~oo&5WB(9qSl05bH=G&`CdyD}fuCT=PTr
z`TUMFRfxuzZfnzSPa0gf&JU{w@i&Libn~&Zfr9n$r(6Ln9^On75^{=+xZ<{uX9*%l
zh+s`F_Su7Z1DbHZ`86do2+=D;YT%tOhy5~!;2>ra+#+!~`dVrYLYOlmQ^|$tL!^O;
z0)(v5uoVn(9C7u~Bue@tw>WfqK+?za0<ivI#Fh+C0ruR2tM|x$n;BY2FljC=aY?f=
z8=m!h*%AIeUP{6O5oRGUS>6jROArnU2_6<7iRAnR*w<}&rec3Vp6W7ngvDfXXc;vb
zNENGb$KmnhUou01^4IgJGS?#|e*gD+8FLD2S)VFEsN-PcQ6s~;Hz5W8mJ{x?i6Ejs
zWJvU!QG!oZmtKQLaf?{+kPShm4YIuT>ybO=85!-%b01ed*%tC@{EHLao1jGle8(YB
zK7wEKkpY~9>@I$)YOkaMu4}B)4U>Q&Svt`8qQUna?Ur2Gk@G$$a4X^yyq84nBz6ou
z4o1RB`+!V{r)57t7ipOv1QNcu9Y!ajikN+=<0H?4aY=$J-7j^g`4Yec7o~d;mS@C^
zmm~yiJ(2IB%E72;Ysy#jV8RX@;_+6Kti`?d=V~GR$?$L6^K;HKi4Cf@{2ThZV<@m>
z>Fum}ri4_eOH%4m$BuDwI-TMQG{}0887QJ(KWwe7UH`@Y?xz`Vux|n*^Mr84c^Gm%
z)z!@nVD(5!E}bpnoY0YqiX0+cc3tzT7iAtv-;!nNzVDu!%0c>skb?~A;Y({xthKmp
z1+3v>jPo$bBeWCs9q>m|2%g$|etf~XNsSAxo`eAEaiF4*@_|sKh*B}~+qiMAu1(yv
z^`JYj4n?I*1z)u?bnvnxDP@L@n{#{+utsBvFZ_sQ2TfXSyJfaS8sVLHA39XvyuWsm
zPR@bQSI?YUM|+wb9<_d?5I_kLk_Ji18xWvVc6z7<1X0izfjXJ*w3GMvnfdEybKdpf
z<aT^hrd_?n^52}iW@$qR27cn52Ohr)YMP}*MuQC`<weA_O@PZly<%u8(RbF^jKn`+
z(Zd{)QW1jG;U+s;T688V0D}<xk_61;>J|`TRLz&YDV20)>0(RxrqwE*F4orqL+&m+
zaGK|F1@jmP%whv44Cq^t!CL>4M7+s7x-+fgT6MsPP9IYjauY_?-i3`yInk(K6ty<d
zWi$7&xKN238{UzGVbds29-|nT{4AM~U`8bn<<bvmUSJ`36B62Fwg?53Lh<3oXKOk<
z>3tkd=eZm)bZ`<)4&}8pD`+#)8sKmVJDn#_1m&9X1>r9_8e#oEE&%n89a&gMN9Y}E
zKWz6n3KyI1k=nJqjIp0I+N>LSv}RP_nGqW<6&LlScgb0nMlHpoTngZsPT%Wos2erO
zl?rbjIahvpdD_h5&zT-5q#IHIp=WGpiX;YJt~eI!=^e`)H-$`42HJ|h0j^n~WVrT=
zUl7b#090|fLPaKBWrkbQThIQr90rJlS-yeXARC8s`3$Ph=3^2V6vN=l?V#)6v>|PY
zwI5;{NrHUwqB$K+UICK?dI;idOZi9&&w=o{rAf9z6s<?rLpzLnOPH~QRMGXxh&{}O
zQK6#Uqk|^1mAkgMd#s_s%F5;zlYzd{BnlDC#1sbn448$+IJy_PV@WtEi`qvHhM*;h
z5F+r0j~<C1oAmN6EqAWb^EWm)5HfWCwfsig&Kyc8z_WWa0|<q}!F5N0N67?wy|bBr
z*X^?3We6ojEIp+UE4I&R^0IYT;}tFr*JlerA-1MG8JYG5;b84y5k#}eaK7|wWoUA1
zY~-krjEpM_jGGqRv)fM46Z9JqCBz#D@iKCo9h$}kdT3rI1&25l<eZ=8;`IiEQDPjG
z+~H=b<&7t1Q>3LJRFOfHdQ`HQcF={Dp>}acFg0L6?|%Y$!HtIw&9h&81TFxW?dJQR
zSWt&sR$Gsw@z@4C3TKBlWGVtTpk98ubo<%tcQV2h^>NnXqj<T?y$2yr;1IGIY<=51
zb)s1%(pHG7n;G0ke`eR=!_UfA@I99^?(G;%WqC<h{E|9KVnfr(>zlXloL^^%r0q@C
zv|M(hLREBi!Rh_se~A^z;_7<G1i``F8C>S^g=a`cE`>HJxSD)D{BvHXjPmIeK_JS(
z)s|QR3?Cjh=NEGsByzwx1PyTd>U+eoBjadU!&gl8@&hlxbteK^zv5SMN0kkps=7^;
z?YC7%+L2DDX$ZK*l??*h@lAO-WXHEZ8*wU8StvFD&VfE%%RH<_U=vGc-UGSdn4pr~
z<GL~ew;>&<yugp~l5@)}ODhI@$h<0QH9&KO=WIH*KPGzjXJ<b5{`!ohUFsts&f(+L
z6~Nh!uB9Yfhpb!U0prO3r%7cxWy+pxmoZ=?V%Q4$&oku%-F(c2jpe9FyT!K!GWX*j
zS<F;pvKk2&e?uxpLW=Fxqb%}m9!U^tBWZSZXWAwyJ*8Byzu3lLe~ttg`OwnQa=zVC
zkV!F3pk@~F5u6fv7oYF1+dzD{I#7p$V+a$z_nb&|QU<>RPaJ-9>)qSsPE8l+^)lm~
z?=2R3l+NUIT-^vvi9G$ie|U%;E;q90%;AQ#plImvP=pTBed|X2Y^G0tyy2>KwS#_O
zC>a$I*AmcP8X%lZ_zhy}bLLDJG9~F<EooGyZeYAOW@RJNKt6{XDDXNw&n~`=2Hu+I
zb@Z}6gbeOIXiFSu2A^W80t0N&k2Qv~#u0o!R#lBffKTa$B96q8Gwg2`maf8@jK*^d
z;OB{6Q#ii6$31f0FoG`}9N7(`_1?P@qyV(Nnyx8{nK<Jm1D+_pP^&FEFcxq7wKYi#
zRs@<KuV=euQp=q5t?i@teb>ch1kHFvg7zOP(mF(6TTdY52maTSnyo1>qaTTbF&?d5
zn6^N>pPNU;H}QmVkp!SD9h>V$W%gIuL;3zKPB%W$T>GS1l}WK?{}nF!nrgzS9;8`H
zp}Lu!i^v*;dlQK+h+EuWImL@5^P~$C7^)Cxiak`srnq?Q_S|KsPo1j#{+wtoBV40#
z4U>>VR29C%ML*_b=-_WRD{KahGls<CRCWsw#jUh4a1rl9>WE%L56tO_>xG;pvKN?F
zWCrpk-eXcB0J2<ky;#TplPgbp8i{?150MdgrgvpNtQm?tDITZ+)>Y=rz>WsGZEq?X
z){!G4brsj-)y(qtBzEH1g*H%zPQdvYF7qVGMa)f26=YSEK+uAOEGRd;n03x<bCKG`
zv)y1)Wf@oq2|#9zNqb33O4`0se<PKLH{KZ7et%2O4xvoKotVS}UtM!}huXW;aCBmP
z>1Z-R`kf+lvS*Nj%egM#Ejz}h>|av!XAGH;`+;Wf)JPIBI2B<ds8|wo_d``SbhSp-
zese5+dOh9#R|8T{f%MUF12G8g1tni+W8=MtGwF=o;;elXTLi|4sdU#)P2aw=?ZKCn
z(C7hEFm?tuSU0k;OZ^<eg_d#S1{>3idS0{He%A1S!)df_z_LdAN2k^K3@M`gZ^Ak7
zN%%@Lq=R*cMr$tfEuDN;nU)9c*l`O!IBwS&kF)mNUBKQgYZcatQKj@L$6Uy{I<P<|
zYY8bGB6i1wqiy&PFhP8B5umSn;oE5L4-NWCG9HxsZYP&V<n2S_x^86mGXi@<x4YPN
zGnXaFdUh0gaJpi6#a^m-3$^&%)RIqKDa>W=qC3+wJ?F%W>$S7<wd<c15wpxcgpAN=
z+<5u#zl5%K#H1Z$Uc?S_qDs3lE-v!O5o2azh)FR)>`GyFTAK5ugq_AZT8{dO3)}%^
z1SzH_6=O869nDUiMqLaI9CORziXt)O@3L4Z!H7&6fk{v$5=7!oO;-gstzKZVGi^nB
z|I}x$ul|flad>s>rTd|=t>d=&Mn0Ptv9fYV@q^m7_m*{Q*(A-pgU;6D27iYP)$$Lh
z=;>r+<zKk#Md&W?%%R)6t8Bkd3DYDvT_zOI`QX32ZDS1$7;PUT=6BQRW_9e<%=C-~
zZueb<p?zsOW0GXVH8;n_MNit0vr*@+lG*Fx9)|WykYv(+p*HBlbtVa5R(K<5CW{Zz
zaRT2XI}#=W$h#QB#ktPuxlib5Xn}5tV1m~3&x)PvW!Q^mn>LuA1Uz_PEv6yjx=yzl
z>&7eCQ36<;&yOT21XXDwvqq_cMJ^19bDLCvgd)-|X<#FQqd6`)v^cbBygy=G^qz}!
z8_k18P{=6P8qtTM?W+PV2*--D=LM%g?s=GeXn~h=B8RE>Zr-?Y(wOEZ4Gx^H$eA%;
zNGpP5!t^D15X)kYB4Ln95iC)<%9LRB6~fTsszBWhTzWpca9iU&jhi-wRi;z>KB2a$
zP}nf@A!Iu{cO!w10CWIzCqb3zlzs1OJ^g9}%nF9tkwe%Ndm<U!^P^13XA2ts;qBv5
zT&R>BQe#|UNZ8ysuV(9izUm~r6iV=o$GZ!ZEox;@Q<wtyI>;P1uXYe_;AcS{5@I5F
zb_IhOM7M;ADNap)zP}M32iK*6O48o8t@>wk`z*OmGPiSW`T<}>8M=vwrAy?andgTl
zKXD-Iu8QlsC8e!LMa9d_=hnW^h?}yO$oBK+&&>lj{sGKoXt=A7eqkqwUN_&qb|2Dr
z;#jT`0994jyT*Tgtx7Y$mvORt6B;+f37O4EiUX|owyI|R5Ig;XXQ81(TL757+SF#A
zDS$os2v!ne5oFYMocjdOm*I^|&U^PodjEah+sYqVZnw`G?+-fVXFp<_e235Q?cSD@
zICNr`6sEfbTujQPmm0YtGq8chWC|HjdN#zF^jAx5wp<DAuq`x{Ym8K1=r=;sn_fDb
ze*5p`Z34}O`^-0!9-Fv7f=!tb@P!ix-0L;xDC8t+Khk<hUL#%GQP;J18%}FW=7vL*
z1hx_yE6wsF%}PK6H{Npao!eLZS7i1cx`dm#hjd`1lSUuG8&gmr1|7<N!~tI8#bciy
zWCk5N?1=ji+Z4`){Vztm_UAlGfRmA7(xhc~N~t<4t))}^G<b7N`QS#9zafJI&(!FU
z17bz6xuz7FfJ|bbF+5~>k$HG!sH>&9x!^g#a5IhujR1qDqrf?rBpys~Yr@ah{X`WC
zd7yL4Cgex_@DJkcBksenr876AjU&vi0=ubacc&($v_3C_4g*<ylK8~Ab|4?5%>)Is
zlbj6Hjn%r>*7xHU21FVIDk+W$3K9OuDDOPSTMY0V^7K9!YlAJ@w}#~9<P7YbYDGAN
zW%-J0T?Ty%CWV9Z&b}RObgF^xuhlP2nmswl(qK<W51c9X@4r3!x7EiFUnJ^gXLotl
zOtn66XzJs_?=&u#<9tAYExrhBzP8y~&;4IguGQnbk%Hf*pWBbFh(dutp#9hh+M73<
z>LG4u_tjyZjlmSnTdOjK3wxyD#O#OfD=#jr8TT{;x<xwwM~FojOllc#!D*rGU^4^5
z5lU*LY%JXNAv^(?6kN5nq-q2+t)F<w1=(hFy?tu*CZzq0u46l;_zQR!@L?UF9cS;(
znUX~JQc_R22Y?dQ9=c@HklAE@xzym4M{I!O`Rh)ztpJ51u-8JUIP_BIa!@FU1JZ&<
zt7L%^Xt;Bi-Qy%B1dS2WhB-VasReJ&E037hwXP7QLAsV(N2Ybqcv@)tkB+78?N1&=
zD2?BZD^vL}AWr?@glrcJ6e=%D7rfa^bq;eB^mpr#_LK6`lM(mMABR?4&2yp3Phq+Z
zTxj{g85vfRw#nRWcZS8#{F3p`)W$fPLE=wFjZxhv^R?QHYD+v<`t-S!S!-VsQvT#h
zeLB42s!GlEFej(sfzz`_dm!=_h4Y*x=|^Z4jG*^TFN|R9m_CeIg*jb{7l%AOvfZ(@
zuc_WD-4n<Eq&YpPr}K!3(EyZ`<Y8QH)Y@Xrz%?c<DsoIV_JdQ~)Ku(`!Kjt{n%qF_
zCB~kp=brv4C9h$nxubqwhKl>p6Hl}4%&ly~M?YdNmP{F;9m_@lVcMAIGM1dNw(wW=
zdhkEp8O4J)t`*lepI_C5C&*ESsDAK1u}FH>L#H>dZ{_sHT$`4&l4{*0@aC=3ag7T1
z1qYMoaG>{o)(vY1Bea$A;sdG^v&UHL!n2Pqi_CY=ca4fUx9<TH>&*#`8|KXNlwJKO
z#+I9wqK~=@sFQoz6zqcffyx0i37D<X+@7Pc9H(+a9$^Bj3d~du2`#col+3|t1&p?9
zWztSp_sJBx5ls+&x!5^kC^@p`OSVr`G2n`@kMgcAlx8b68Wikr>wLmpV<orX$`fKM
z(ZNI-lXWE=yII-VkcNq^##y`6kK|XE^FQI5k(!ov&2_MjI&}^{pkC?GcMjLQ!sHwZ
zhkU})WzJV-zF<4U1RtqAotv7IAX9y+*4}&lC)e)k1$i0<z4jh*R%@{FjM0+NP`uy^
z7CM!943l(Srn7_1k`+U46X$%olX02j`l#4<r;jWktmv@f3j2E*I3~^Z=o$j>@7UI$
zBs<&HY4HK^#X%>-JYa!CIZ&_oYGcWs0{3?tlIk)@(m9?x`~WcE6m6RkzC@-y7NgV+
zLqoOO+Kdc}ZK`bJ?B1wkC%R=m!=YVmjIWk)y@=`892XuP5NahO$03cV5h|e+(coBi
zCm*se-LmCN1G|kpTE73E5g**z1*dPCa7uS-YsdM(-4w$z_TW<fb?0NQ&JEVS*gvjh
z!pDf=)y~}$4rAwIZl1a;->_iU8o3m++g{JGx|TQML#h>751w}sT9(^a*1ASL+sqp8
zI5#`g|B3}Wgbw>o-04Jp^zEB~*Mb!QA&!k(-LamQahj@?f83e;3{_QC2j;x+Z1G6l
zg8es1HBLs+!@{)+eh1JK)bFr$BjyB>%!5*a1ns#yT-|Ew!Mr2ei_2|{C<>Ct{qsl0
zf~&?R{o^(pE_d(h6tro@2?m@Kr{<jKKVrn8jQN^BQlTm@92TM*(%`FE$-)+D4Q!lK
zZ&|TT!4!X7+IeRDd@6r7B1gpUcy|J;aBjMEakP4B(&DxTn^t7U0OLG8$tB%ino$i0
z1^Q-;GfmC7Qtyw0N!hoo$mA#}t_|E9plRXWkWY-Xp4P?H!tIJwC$FDll+=Vh+fRpt
zGE0+-h^1LRaEqDFFfK~Eg>bI_`D5v`JeNVl3ofISCD%nJtCZ?d>t1brrHlR2-*x5?
za_Est#~pBqHemzzUfEfhnalo~Gq|=c{pxUNq_-}!<6#2JY>a}Ed4r1_oG9pqk2iU4
z#Gv1{9ap`mM_P>vhtZS<JsWp1<ER~3_h~%c7WM=c^wsh_*zv#wp$|LOH%!=_v!sJ|
zLhh<zTeQZi>N^CrSMfcTf6~}~U2;HZSGz!n+=HtI8iy8lnO0l_x6v<W$tH&GTshs%
zBW1GF>28b;5K6NQY@uZ&lpto^G~0SyW*I`?J^I(3&R@*VWD7A$<kS!dd}_Zt^YN&J
zg#P07`F01U$(!m$`$;zd)>g#qORI_kyWNfqIhXrBZ`_p%11F<hNe%4>9P7UIiOUeW
zuDi!S=&zEug|ab@8~|wYP|5-)qdcTGzJ`asWMnijbZ~e><uvx+6>5&n`vwm;NM&np
zhvNCOcztcRMF%LCoS$WUNciP*b*Faw<GJreKSmQtCEC-w6Foc7Z#x=iCkkRKw&X0S
zTov4#SH9AAUxwA%{@I7`_2#k|vT<fcPzU${cV}ar=jNMPyq5+;dxKr+yA#n~s(rW}
zdcU`?;Q>QmlZ@{V|4{PX<?3L5B?%N3D2TAc-UpDwGH!K)<`!LB$P5aG{BFGO<i`rb
zml96Fods93+@44KEg#6+G_cvUK)M>scf!|wGP*<9@VcdIiUrNhd85dQ)VpjC$;<w+
zn~$)${+pR83sgP5yjt|%`$Egm)7T`&=`?e)$P?Io@F1IvJZiIXX5$)|o_^?Ugn3Hh
z3+bkGaEjVpWft?`>{f*Bf6ctgyV`Gg!dJfY0&j!prpPAzk_^L)jMsJ?<vJ)oy0%vN
zQMPv-FcEnpb4*3O3UQ<P*bJxrrf-{`c01k8*~y6gSM+eKj;2*rfBSEpl_f<7X$}&@
zdG2mD3)&-84E)5k_piS>-=PC+MK8OOn|$}=Nfniqhj(e{Y4_3dS1!Zujps6PHw0=Y
z(G3V0|4}U@!%E*=6aAxUP-lW#2TF30*bX{)zbtTjm%haNShvtn<qewB>~2Q;;k#A{
z=eZ58ZQ9mP4lbe}=yZHnhA@o^IN<0K!A+4EC=^#-by?NGv4`x`Ree+WN_x@tH<=~P
z>lJrUtNDy>4hFuaI*S8BhI--OV7`tG;N}|;`l{2|jIQz}?YqkPwb_(eJVt+aojjs(
z9Ub$qRvY6oS0_&HIiY86?m*`%HKYGVQ$2-Yg-Z+ecy(uoH{a6Yot&JVix&;vVyd@&
z#i9dF#xh5o%IB_K=T!~pU9h9Tk^|(u9ui07LTRvb^OFqaZed5=cWPJWq&b&;5wSPh
z+@B9P#{T%@j6>r%ifjXAybUls5E@EzfvQXFyFY!>&%NvKMr#0pUDBcfdzSa(OAZ(?
zF$D>0*rdet^re3vgtee{Q)hYq?2*QM<=t7$2B9Z0G#YaY@!au_HU?&fc$U=X&L>PQ
zI$)qOh!+{eCnp~Os9a!_7qB%n{AQIdxrf+D0jV+mCFo9Zi`%M{rQjgsU#aId5QuaQ
zjHQo;kM{rGmphA*aOb8@aB;$+#rI*|MH=SN&&jCuWB-aq7L)0cu%`iMTLki3Y^mEl
z_<Hnr6^UB|yDT)W=B_p)gTfny43?eY!8gJw2D#P=rgkge=yrE2C!Vj~s-9gs#$L=>
z0^Ntf5e)LR{P9ZO)=*VX`TTqK+_$yafxYbA5F=}naK<KGT4>1vrPLJzr7NZw?|sqZ
z<EjzL`@XZ|_1V1C=*cY`A7H^c-VHco!HDyeMFYe`MHVyZ5^|XnSwcyua$<CL&I29u
z?k$77v^fnrmN@s4Y=gFu>6vG=Kl8-dsibnv*{SEV+*+{4XACP>GflaA-ANY5KOX;D
zGG7vLl26^THyo1wO}aRJ`o0c`_|G7MOUE=Ggb-<I_%SrS0ZE?BizB3)m5;4h#R=pk
z__6NP{BHvoQ}vi3$%6%X`t~Gihm{X0OB%}RqT?3^J9)2N{*O<)ph#wXBfeX7U^BY-
zawd&h#x+%|f9&Q>?}FUbyk&|F7&EC8T}{il9$i!{CTo9)-FG+R(fi6*!;G_Q58Tdj
z)eB$UvudE*+Q)I83-cz;FOS)7xctiOOQrW6u0-tQB|&rI0z(NtFB((sW6!QUSDPEG
zxDUx0Ulz!E=&Q%Z&EjNiC%uEY*N`4Q`kAPp@Uc@f7?us9-iZ59Kw30i!+#oO=>1ds
zX3=-k<$rgmE`HE)#gO(@w_pDm_bk8P%D*ob$5y46-g<Pz%yoWx`<o96KCc?#m0#sr
z5|fejr)6fNo;upMRjpJ{WGz}?kr;nuc-+drj~FNYv*fzhL96#Y2E|_8ep+yBR=uid
z0wbgKcZ)I&poN{z&z~`eZU(2n7A6|xc(IwmA9L~F>s8D~DdR@8A9y(a%10MC1vwJ#
zAj)AK<yLQERc^<OPre1J)$vD)!i>X~>#3Zqi3}}&{r1hhrAM~bJh@ODotb6iG<1W#
z_s*KDzBk-P^(`IR+T+WO@Z3W?ibJjxn(JK4+qTujV(g3~U*}q9|DEj_qPTk`3W`wG
z5YjHo0mecy^Uqq83a%Ge*(t1zRqIv)-X>2Xbi+c%!jbL;^l1`-6VK$zy#0;z<|BS+
z(9C|KpJw;C<yW!IxaN`@by~LpmXFv22x+XH$?5EOPXd-)%Tuq*-kz5pHQrg)=!AOJ
z-ogyWH%IP%YuRKI(UpzJn{lG)`pY0LGBfxIk1v#&oba-><=Y0S=G8Qgsk!w2L3N<Q
zc)Z=HiWe>|j)zySiBHQLRN8I$p{(50R-d0_8S!lQ57kybJn=}g1t!jB3v6f(ZdDQn
z8N)?58a_TT;6#R_<;Lmr6Ly@qD}q6$nA820&nyg#IbS<%thTFarHr;K4PBr9-Pky*
zT+JpkwOwz}D@ehL4Y*zNtWh<K;0IDv>2Ib_c6QRKpr-xu{Yh=i3D5R7oqpZu7xt%N
zRBh@|XUF=^;j4A4s?XPWuTP)5y7>x|jP{M@)*OoeaedU}zg8dCY8QI!k2kkRl*bhC
z4Ln2j9v>Ned-0vnCwaM-iicHY9=<tt@}mv@V^-d{y}eoc!0lD9)ZQdKUt#-ZNMgao
z=}W`CAJ%>@O>392+W-5q>Tln~W*W4pi-A+^JDc^oyE8j)pQv+i7C{F*9Pm+@e^4Fk
zIC877Z>MKajl6pLog_eAMm*C1_!!Y3J80E?D+OK$M1&L`2qfRwg`SlepGw~MH&$Ej
zx#xDC=@G3t-ks)6{T~+q=J=O2H&eb=wSTduNloCe8jGK?`YM&(Vk@`QBtG=9d%8mZ
zPTp1DpSQ;(?fzI$xO9y7&J)Gz-i<#8?QiB$lXV(Q3*z7abN$Ij^BSA9b*aYoU$aAp
zW;~v#vHqF`+t4=NcN)Xs$+Xf7oc+?{Q_h)AZi!IaWLTFShK3MkK4PeGE%UyU@vz~;
z1@ZLKZoT$7cM@$&n%c(4GGDn5xwz=xo<aV*eCMuhUcB_kxcPbK6l)r$SI&JrBl11-
z>+tsy=X2_c;ODP;Pl^b)GqZf(yn$1J;{e^YHy`KZNZ)(drA?NTrw+Ey-u)u)^T0R#
zwf}wp_VMR|VUzCcU;D0yM(mpR<=!Uq8*M<|y`?ba`Q)?3(IM%t&A5bmw;o<<Oxy0<
z)O*jnwirLSbBuFaT?`y(FbkQ9Hq#bB9l8Npty&E{Rv2okqXE!Lcb5B=FDNW!+TT08
zuE~Yr7h?dly%A%rJQuHFHr8BBCzv@E%KZugKq;97X$cRueCGy65953cR&bM9cbU}4
z;Y7*X)tS{v@sERdz0J7%c;DBj>%B7HmiRga#eRCyAuIoZ{*mf6#N%VY7R>Pz&c=cT
z|L)2k{kEBL-aj+Ee6OW-={yKC&Mj@9R%M0`3-zX76mP$*m;GDn9r7#~7FsX4VAb0i
zqz~`cZ&_lpO4152FX`iwCP<UaYc&w_(Q-OBQbgmPuq7_A=AUfuQ^iKr)$#M%S+CrG
z<?Z95MeD90hN8<vjaMH~tkcJklL^&H7l%)RLb9Af<&48=?xOlWTg=w$-|#*iccFFK
z7z>aPtb2*Uz~>}Vpc(v-_8#sR(Pn;`@r1QO->dIgm#towP@X+t2e_CFz5+EvaFR%!
zvH53>T7#8|3!GG<RD8J30z=5;RMoX-;>$U!|6a*e8`wCQ-^w5s2vGne;G*ZYYiDhI
zO&_KQtvGpUTBG4%h9O_$V&kvnrgh;)SI)ad;8pnZ&0uK>&<*uBd%0+^YeD}>?ZS6w
zguL(fA}qYO^;)PNV~@x9cTS$$eej?`dn9vx*stnC^D3{BjHPfBp|z6%y{e5}tPfdc
z8z-hEz!GJ7>F3#D#h0ivK{adU{h|J~nR{k*YKQXWo`<ja&kLtT!|y?>^dsK4TXv%!
zJMgk&`9H=X<>M#+m0rH^@xqRAMz_mitnKXd9xb_^oohSmdEv0_1KsAgj;`8&VY}yw
z%sKP-Kk$ng1&%zpsWaUaP9_0l=B3AOqcf}a-^f_uJnLLl-npt%i$c<>^25%~nN<tv
zl(x0b$67Nz^=>3xTT<+A@s&(<x^KTUTnQv}R%Ti)(LKOuq&V|jd-o__GPqwC^ohHi
zFuIk!0}v)qP<>bu%s&=2Kc;uU=S9lp)2dw|R7M8%07onuyy;;hD7keRA!|jS(G)Xe
zmqBxmw$7_6Ztk4keqzDU*(0YebyZ2yUcTVe+Xrjz`g!lXP`+X_I6WZ-zBzeqb%@r0
zrcIjIU7uZ1_^YOnrR$RO%wLPF@&!CgXXC_>n%h~IikH^+{!p^?a{Pz$FW&Wi`RMrp
z&tv%=em5+xAV@%-uTSTnt5ifc!2ie`o-PXv@_k-vtWy>~X72Uc4sX}gpS9)q*?QiQ
zapCJ19r!R^|8#hD*u~sz)sWR0kA7U8|9r-d2S>hI?KkMMM%*^~-T0XN%enl)i4*6`
zYkd=(?_LbOyP#X=UAxp8wD6A@w@Uwx)93~kCqMkN<kF9s*Z+BZ++qEe+2_(uHIKZ0
z=F+r?M@w3bo@+hYSJ9+i&(3=OznPkiGc&7NP*wBQa!o(uy?0fULdQQ3N8;jD{9vWg
z-dP{2eMZh7a`=Hy&2TN-9)ad6u&^*ILqv8=+<nIAA6M(@ABtOry@8f(Bpd=gI`8ws
zqYBRq{cfFNHT}`#`D&exh+c@!I9fiaE0L3$Z_|GSS8a?*gA(QdsL)qKz(?L7x(%_R
zxiN29@aLh8*FJ9aA+S7du#ep)-$}DHXL`O^yV~YbeB9H39?=InsHo0vxFd1piJ0=u
zKR;Y9fB!DscF@O9cJ^Z;7XFA`w=ybWPqtfBe)yEr7nU5j+|y;vigxxfiRas&$<8}>
zYPC(H!q@ax{9owJ+~zYH{X;@|GoA7JeYTf~_v63+{_DR+r{!jWGoequphYoM#99cg
zU!ECN(<6VQPuj^Ti=#eIuWA&Y`+jgAi<ONEd-Pma{HC<$fa|j_6(J~jNw-wgrk9^O
zc8<mOriIYNv7q3B!RT8rc>S4!KLffa$IR_}#c1yerxsbS?`34%Sd!iF)4^t&26hgJ
z$S-5_qEQR$*h0&ekvHQ7FJ$^#1YDY3LARn_hfFwC^65tT;zE<4AG7;BElk_3A7N$_
zeepucsl<hfFX#U`7nT0y@A$-+B>!WEGb{ILrhY&2p~sEV4_6b1-u>|@>ixNAb}Jj{
zts1-|zCCU;-$oS;51wW7Ik<9akFLS#R71l$&|~<y|Mv+Kwx0d9t=4FlpVjdX5Ed$G
zY@USr{%T%-FU<Ir4qbFJK$kJU2N?rWmiZ=MzI@>!2j~YZJ2;DJq<Byp?hQz8l48|l
zDYnpvSFo_(^2}|^-33i5cLlud8T<Zn9Fx6LV=k|2cwxc&lGr1c%gbE-Y~TDDgGJos
zZ#CTwTQ>D-ziO79VQx-^>0Y26^m^7i2S%Lg@}iUeSFfOm54DlLXDZj+EPN7s{&`F5
zMyhL)-gJKNddkqf$D&rV8+o--RK59BJJKV^VY2>ikG481bGKfLO82;a(fSOA@NNlF
z0~<U0g}IKHyi6@;TyphqT3M!N;Cz;!BjUm@bPgGB?yc{Uv-|D>h`*G30Qq4ZTWjqO
zus7H=(EfP_=7rOpS~P1xYbFgQLjVA`;%&>G4dZwoaq$I_ah?sIM~)&BS?-f?L}U7e
zX)e3p<R@o;eG=7hk&V4`#H;ie+{<$R@U*u{Z@2;#jF6s+e$MbMEAr1aOYG*qGCs-9
zE~4AqzKBL$&govfd0sQ9VbdGlnN8E;zqTFHZ&h?;bB&*!G;90X))cDcoJ)GsURxu+
z*n3WvYUGd4uAht7s?8lc#3T39pgQYk-{}`>tbh5Kw_IZu@~&dlPp`9j`nqTR2VH~N
zaGT+K9yw3rHZ={;ZRgR%|MF>zw`4Dt`!2_ZH~3)H=E;>)7Y<jfIWRIT;oFC~tA-9A
zQkC{aeag8<%NpH}Z||?+n*9Aj|1+QMTDy!~`qIO~V$-*{*&&y=So~(*(ptx9%1rHK
zD}}moplTpivEL7OvYIxnCh6-I?}!NFk5_Alsg3&SePKfV=1D!R>xVaQ?FFT=ErV*7
zhi}X}GCQ@(W3|nhdLB)Fi#y+-?EUL!eHwk+a`0KhqrRUnS2@ipbljQt>gzhygT+(S
z?$<BAURvheVC3z@B@YtsJxEOVUws{jg<ZYz!3q4(ztxT&Fz)+wCyh5lS1f*^97yRg
z4A<?Q@EGDRpVKb9mx1-{q5%hXZBHrlN^z30R{?-*x*qdp7iQaT>Sb%&uXIsv?%^Tz
zSKcdK5_;Jyw)DQINuRH_Z{Po=>6>8lW2yQx?=$7^)<vZIM@$={;bm)6yy=x)hZTL#
z?6rQe`sLRE!&W=oXZH<GdHrYL@Lo;Rk~F;HOcvew5ck=2<#12c{Fg)Bi;ENcPBPn8
zLey37G;;3G!?(&J{#(YT>c%manp%^;`rqu{s)@zfE&n(tKhH?gNK(0e`EvRB77=mp
zV}FFT`IZt9SMuoj)azB1e-`eF7(D9Brjkb=25Efpu#L!H>G^D5?|!A89ZFxsbv7N+
zp;fEjYR|Ur`|9>w{U*0pKCrSs(Yu#%k3Gl@5@Y~_urHARjQ#iPh}c;Fdl|69LBGHD
z$oR5qL{U|1Wtmsn#vGz0wME5^a!=p1K?nQ1UFlv}lA5SEbs=5RpxL&K4JRcpd}f=Q
zpYGRip>3IaZqC*C+=fR!Tq2pGYe4P(^?RkJOIns)!kKl!pS>^t81rI9{<9U&pIs?0
z4@<27Q0v&7-r_2ip!w{pXUU&2!^#%4{nqaMnOD#Iw$nNJW8_!w?>F1^wkz*>sJvvz
zlZ8;dL=6F>kU*vQ=9x0NdrK6p4r)uU<^6bFX=?tclkCdR&5VaS8)#)E8Z?+|X3<Q`
zcBtx*0Xl@|0M&M#GcLjcpxK7DHPwBJTKBkNku!UbUioj+$T7#t7l-HOpBp3Qwos*+
zgi4`k)9Av~rXxD6nguKW;nvv1ckO=YPN+2ayX1WOoqcn=9s1DhOm<`P8MYcau}hf=
zJ1?I+IB~uA%F^>w20rm<l@!~k;f1~}=dGnerMb>4@`I#BI&Tm<U>g;<MGcvT)Gy|^
zskPdP`t#42U0)cHzUh5z{_^Ro`l!9U^6`epw1w~HOk4dMdNydj-6LDIzWrjY^dE68
z`+N4KZ&A}P<jn9$FgLI9ozt|(MOd>~Kq%gx&2U+^tj(yIfl$W+0#v?l4mJC&fuFa&
z?j8Rn=g|g?sQvcp*@H$sxFzsYCrZAU1Z^lhx4Pl3B<t!UGtN!*`JC10LHgUZvsSiR
zYTIysP0oh_Ex76w9gaGas}ezrf@+h#ZMHLOH>I$6)XcMw3%A&P?bhD^-u#32AHVw?
zJ@^mr21~U|t3&(`2d|qyhU-%#iNF8u$Bet#ms-#N8lRTb_p?dsTASFWkK@+VR7E!Y
zb6L1{$EfUgUoM<3y}x|^{+vYviq=A2#Ms7(;BD_Kfx7UX)s_@r%3wC_%Ie=3mdqd3
zeqp-FWnL?~jnmg{$-*)nb`$M_fNAq>C-%~OQZj!?zrWqf%8~}3RDZlg`>on}arix)
zU#1c3GJp0e)#;N}-hMt5TI1P%=J?#~zgpe2ec*5<vCQig+%}oHJ7Gc-M5p;ZPJU_D
ze66<1ybpyZqjDREIm`%e6_z|%Ez7&t8ov%-t6~#-KDRsL*I`}C*AE&mTV8BrJ?+n-
zTaG*i?1Bup9daq9I~)c&EdD>jv?u%1-_`6@gca0}N|V2oU(Qhb?b!zdbaxP1`C4ld
zCk)WN2qg#{?2vW0UdD?}rNLwul`l-!D}D5K_oFR{B1MmuaruqU8se4hmVap8U7W}Y
zgBSjAoHakb&&0KB*A0zoJVImoK%LTcGt%OBPE=d6DWA$j<!#8Q&oBmTBXe7A9QqP(
zSGcFKhuVCZppo>rN5OhF{+ZS0YtP><+xb4Zhs%wp{*h5Ls;}uGG5S^ht7a4@oyOh#
zoNE2!zOUhHr4UzMRIaH&CV6H4nLQ36<LxId^3>Oy{RhH+t;v}>FCN;FnA}>@YP!=u
zrXLR${xo<r=9uA`N!>*sxb$+2Dl{nz?((BgXH>n3wQFTtW0G0)`tAFIQ2xQm?)ChL
z35iGga>}ukg7_<{saIqkDX`RKzLpRae3EQmv_I<YeR0P7*rb-dH5LF!A=GMCX&BI5
z#TybF<Ya%QSfaUnV7OPcd6;Efgyr=~q2qDhFt3fS7kJk1;RB86sW(&Xjs~du;g?&m
z?(==cn#&_f%io*5TQY59)k53&qr>CF=D%2&q~QyBxSnFnX7zdxD{eq<-M25#+<d^<
z{uw7Dx^cLs_QcGc=QXSD_04TRH8Ylf_6sx&*0UG{SgU}tuc8=vhIh9M8Lx%4s~%M^
zU4wn0@`=nJT}r%{F)7NF(aI%F2{<pNg!E{yt);aoj*kZ2FX??{)!O3jn?qYxp|b$L
zf$(~VTuuJO-nwU{ufKJH@4iskP~E2#Kna)fo%B2*%94!8-saNu6gxwO0%#&XqXkb%
zYjW@qLa3I75y_C&<daaUUQU}cc;z3bBDzg=VhB6`O>1($E7uL0O~07YRsZ<7+wvyI
zAACy>N<OkIWbyy!%eF|i>YU*mVPkH-yK98eA7PXH{~W?sQXXUzq_;H96~R4?WBPAY
ze%y^($yPhlx<rQl#)occw(zv3!&O3eK)?#8DGi7?4)WrlimJfer>3+m+ASY<QybO6
z=E2EUQTcu5-9Fs8#~$x5S(ff`&vcBP)S(XKjSOA1;q#@8zs7Y{o~cB`z52wT%@os$
z=x1)+K>Tc0F7-&A1uZ<)E@mt^IBTm1MVbnyPCN^!`Zn7eWGUKQyx%*am?`acX@-0G
zstXO=&*iva7N%xtp?p*??+ni9s;h<of|OV2uXaZkX=*;rXku^BOU^UvYN5bInmk~6
zzX8DA_}FUh+^YQU$&(`t_txW>STx}whv)Ws5UPC4&79PYd+3w0U%7PgzfEfExOAfK
zMHS`avS<-@|Ki-|S(A1uw~LGM?^-g()y_~*9|~xLy*;S#_|s&#x?4csO`&d4?G%3A
zTi0P86*03n)p~LDh%CLtLD$U9Tb>j4D|RAcLL$V)+?eNWou-(psVldve8nX92Ufa^
zuyUEj+A-ur#VEA*qm<6aFKnlH)Ur5_DpDKhLR9o{OXYlZ`tLFG$&_nk`|42WfQQg;
zhd&ylcf5SpL#OE>z4pp)YZYglRO>8T?BFc<#QG(Bs67JBll9mF`V;Kf4oldebqj+3
zT>p%}^UGuh^5^)mb-j!?eQ(%yk!)Gbp2nLp4Q2akUAf=eLXG<6=Dg+`>JHS+mz8VU
zzIb>nb*P+(J&pH;#<x0P7Qyjv`9OxDKq8<&`L)`)O^Vf4HDCMFS&RI0-@gtXD(|EJ
zDdqou%`D`ua@BftT=#J_sS;=-u&j^mVEMhe8c6}8%omG4TK^baY3|%!_o&C)g+TJ>
zOUXafy#}66m!HOxyI3^ff_pK=Qx-(a(=M&aag9Ut@B;KhH63D~xha%iScP22ABh`%
zqoUht7jo_?&wF=8{fWnaw?7@Br9&j)%%!~ZNPb_r^{AurmmJvEnxJVk1i;ZqNZ1}o
zSEU5vK+kmCFvd?SKjR<MKN$TOo{R4f{IbjrGXhkHC=X-_kJv)p{<&4%VKI@vKg}+r
z@yB1j`uQ*hRpXuoL5R0!-o~!<m>yde>1mtQ{oUY??F=nY+Sli<La3^IHF;zUjAor-
zHTCll7d68%kC_{|?d#R;C*{LBy!?UHRvojD#tP!0a*wbkMZ=v>YdR?dgT4j3!YoYf
zwL1%&tir<U#M_^)t4b@BYaP(Nzyo_AU)h~1ol`yHo+*a)G|pSRft+D@Way~?Nq=(R
z*NafbnSg+e+%1d{>e?c3e#H2ei}BMzG0y$1vq-a<igFL{J`Z*U7|7bgKP$^Ezsv!3
z;NjP;i?>|a+f{lQd(XLVq;`DVmu>TZs(=+keMVv()=@*tb|HC;9Nx(%83bJGTW3I4
zB=U;6&(~{pm2W}7{@;P|ty?B8joqb|qkc?2yT#Q0I78spR+ajax4{X;=AvCm=d5Wj
zSeV@R3nf5XY$wOVn=%={gz5)5>>&Nk%FqlCG-k3y+qVgjx-L8jP>ehEPhW{n`upgq
zBNY-ltLX5N-K{>m#e7C%W#}5_$lWxiZkU>BTGzK@Pwc+rBq>U4;)~6y#)Z=l$u3n@
zfzTn!Y0>6DQp8rbW>WLO%PrW`ikhd#cLs0=Rb=ndmpmK0KNv{4?^_k$Z>eIfJX~p}
zdsTbcr-U!NW%R2PhZAm&3@0{C(|Ata+_|HcZ6E71+VjF^xCB3V(2f(^13oq^14Flh
zXrEs{$v3o^QEhf^s@bV2kfFE!w`F$sBXNB&Q0^&RH7+_@iuI2<7OQ_a*5AFctJ5>L
z!NuuZWC2$Zx`=H79E$Kc*6ZauwonGs#3k2RNmkvYE|}^cH|!Fe&YX&4;IQe@FOPCO
zVGhE2-VpTFnD~7r`)<Ti#izr@do%m-H1;5vn|IxF1-rRByZOIJWZv>G27?h^QeJAF
zIj5(;bt;7dx)EBzDi5`so|bvJX~(F%{29Z8HAs;=R^wvE$$;ayhz&Jz9n?<wU;+Zp
zYRAld-XlZ3xLK~b`7@fZu#ls5{;U^#-uPTO?FP|H0a_INncfGGzG>yIpWikE_vxpe
z=*+dyoZWkzXbQtp(P93Zm^S(nc(fX}c$zV+y@o=hxWp2zbhjvUuusdcf~`^9ZCPZ#
z`vQ`=@c17+{~Ji>LWwVvtpS0|Oa@GTaVWG!VAs?xJl7^Ba?oh>q#Ew6i@~;zzv(t}
zO6tkspN!LusNuzRM0no?ZY$Y!onLqiyVd%dsbw!$NUnqX8CC|NsiP56&%<Q~Ow%?@
z!EQVK*BoVNJZntyGnos3HL)`IBLHr+{CRmh%4n~vQ;$8<rcRZ46+(%_^;LJeZ&UA;
z<p1-X&+}z4vAE(;E@|v5PL~#O7)vkwBsXKfrHL10<I0ET9A8+KvYB=_m$xF&)CI8;
z{%-W-y>Amp(V<D=vCT8-UKI1p?Maze#SM)1z$MizTgH%?i6t3_@&;3;bxhU_07%tn
zKSZ-6ERe(=({27&`G|iN9pW93@xfzO%xz+Pb6&r}1<FUVdmJFctBQbP$sc!TysBWF
zXkhU`-DVU!za3c__l%i}sYpKQrn}!CnCmh8=xG2SSnu>qf%c#@g_nFw*@2FMaz-4c
zrB`*cb{$9<6g2FJcBO|uH<-rm<-TMxsN5-U6S!&2D=uYt3~3LL=|1c*w#A)0;?u~V
zO-*SZoSWKndT--9U5}5`D_zxj-;cplR>VDX2<fqVoTT)W#uY!f0Dodtk}Nh|Xs0{X
ze%u0M-~M4WFDGg!5%l6`Aybf{_jkia_)XLyK03<045w>Ps;1&6N^yE2XWsIV;>tDl
z2ww5LBXvd&1<qT(uG-q>aGx9V`iWx_2Et+rBTiH>kc96x{6lnojKFph#eaBOnT0IA
zmKS|<o1_#7WlOz=11PXI)z{UyVJupLWFNNf=C%vOpD#=+uX*J|<xSGvO3nVX<|6JL
zeI!u2442S=*9ly1YOr_HnsJXZd4l4vCI~a?M!^?#)MXkZ-95qtGj&o2T_Y{Xowv3R
zdFd@r@$iGSJ)A9qo92c?hYs~k$)K*i0r^F|u@wq{NK2@@KHpM0sk}Wq(uEELLYd_s
zUyK$joCQXKhp%qd=&%(O--@=vgi}}Z)84Q?Mxipj8Mh?}t<3sRY#>M>WC+5&OtM0F
zqN(Ccfh~l!z1xiURXqQ=XGLq{`$gpoq5+32X0Feyc7Jf3ygn2JqBs+&Lf_!uFaB7D
zT`G;_D@-{0`C(-OW5vfF%`I%jZ$TR+4^fsI-Nk-MlzynR#pJx^$FqI1w7l?+v0Wnb
z0mT<BTPAQU5q!hTX3d;9UHd`@=t)9}yh@|e@NLijx;viSTsv6y2KIx;5GZpBp%);g
zND|StRQF{Q>*>MHBbD{(9E<}U#u?aq7%sS@S%ATl?Q>_?!&Jn*<RrF?Iw#$w88gI7
z;9K_5T7Pti%ZCP<>qpM$F!ax^5`(a^WAdBl>iNWBTq|yi4;ey`BFrRGzs7$xA<5%T
zEy8T|H@2EWMl65B^=t>bwh=Rti-!51pURiqH=*INF7B@Aq%nw?OtY5<#O9u$-kYYj
zXku{Gh|(GE)!V0H+kCxl-Ol7@Zi@)Y99gX{E@=Qg!@Yw@ukoL<$K8#rLF9(1$Eqf9
zi;-v0s`HuCjWT5AOR>&D0!zE`<G*`tj51CpO?=XBlA7_3v5o{$<sbXhsQbMW-H!$&
z?_?W5<r34j_Cwqg%T3PXG(oaXn<B)}UWM(8?C+nEzN7!g1yGnmG=KXrtmrCB4~9+$
z%-1X|5hFlMYyLj=^_|Bs@r0qcuJ<-%?TTdo#xhWX9sn}K2(Pl);t011btY^gc;BdD
znz1tM6Icv%ibfzp5#KwRE8c&B`oHc&r0sy>GGJN|>LxSN2N~zq|4z%>URd$mz(xWu
zfm7dVZN=iD_VZDc%Ti=$u+c&8(P*d&Uk&hw86QD7MRY>+#&>(dZ6Q@0-Rv-?$AVpn
zSe@qRs`XGl5$&)YF++?kjGkzqElNu9ivpta%YDCTUU-_AWq`H7(-m<SV-x(@XgpUL
zqd0ecO{Mqh#vbDr8RtKG#PMduf8C<M4AOLh`?A$lnIQ!PG)Tflj&MmaKy<Lbe_){K
z*|{@>4v+s2cu?X3qn9Y{#B+yEC)!PPq<YVvmw9bF!!cyHVI4yh*%ca^x3HBtL0z<l
zK()}QahNH<Xh9q+MBgbERbl|F&_mP4a;Az#J~UM78aCBBieq6harbNx(@z;IV>ANw
z5keM)`?&++Zh8w#un451QgmLuhRqoMR<eo`@-9uMpRkRFu-vFPImu8^x9FFu()Sai
z1{z5GNF3zU+F>q(Rcz}yxhfr!>~OlnHtS{Xny)kt1<XZME}OByBYR>|VHNr}xuHRI
z>M1~bSH0ZSMl>-%OmHVeg|yS8N@we~ZHS>Mt=G*8-(=@+_y;S0^z#lbBMb8Hmdm*}
zVJ0X7{%_?aK}eoOpqJ7XXJVPQR{G?Mh^!(OW$9z}A}fz%op=wbDt?Wru1MfoVSn8w
zcVSfi@`@r0h3V9(e?{Nyx-0j+1wBwYc&ZB0e|F6<w;L5>#kNo^#l$}limixiMpfr!
zy5pSq?fNwtKQD;$XE+!l5Eh&&;k>19ei^;$dgOUWu)h>hDcWOoNbDHa36V*Z0%#y4
zm@~oz>nxdI^Ks2h3DpG$VO@y$9$&|mIOyHT_CCziEo8^~_+!>HUH5AlR|RLJQ~wOu
zJiHX=V3DgUEyzF}r^mQm&j0lzSZ6bKEA4wtznFM<<@3fdKmT!hw7ZLCk*Fjn1L}+f
z)~<WLHa+V8#d2?IX(i&sgt3eBN407A)BI$>t9Er|SisvBA7X4CB@t;P|2f1ygX<yJ
za6^S)2~f?_OuiO$QQb)4iIW(oD@Ax_m>2~_wb?inIEOi%T<Cl6K6+IXdHwoJz&VlN
zfktBQumfc_GcUsM!IzEAMbFOp_{4GNOFk<u5UrM#SFD<ZA2`H<@0V6!o+HQ0>HdM7
zcOyG!G@_xY@c29D8M~eYjZ=rSzTf<xMQy}G4S(syqD&Q!T*9A_4Y>sr8@|=LUY{*S
zJ?uX`E1#G9@U{Ru;_2X$ceNpPG(Aw#1Mx@~^F-8X%U5$6NhyiZ<l6%V4ceMJaL}Nl
z3*qq_**FaM6#q`~bvtZkK`G1%Xc{|Z2v>n?bZ6%1WkU{Iv)AcX$oB2NM`Snx){&F`
z@Qk<Gz#XcGDZbpX`o|UOMuB}_9tt&Yd6~>pJ~svxz{^<5$vpAHJ=<-X{<ka_6#&Z!
zw40hggG89wVm{@ra+&+9-^i3h%0Lo!h*&BSDPYU7od?)43QwE``OxAONJf&`?@oga
zWO5P(QN;QL&zL^lfnP=O=9gsm_KS}H^h@)?+i+8G*K&S^MbB-cs*p93A@h08N0TTb
z;^XJd-SY99)FsSngbaE`y19w(JeV<5PmFj-FK$(a&%C07+9ef35}-6LhAyh-Of6e2
zMZHgKTwYZ=Yt_wpuu1{b!Ki6>^bIvx8#&+eyeY3`XDCC#px+*@ofn+`&O_zxlih`G
zQH^dcW8K_6u?3oc-ScZ^w?0`b1M2D#6US9!{Mr_0bUp6A@LC>Qsj0B$ZZRE0OI!O@
z)OPMPoP$0_CDb&-5Z#{w%rn=v+w@V7b92QGTQSV*TSub>c*8UXGDH9x1jlJx`C9H5
znKpK`_DeT$)Q3mvczb&iCqahXZM&r(d1)f6&JBaFea&sSX=3Ce5Smw92U;X(z{s}g
zQj2x3_@nj@&~EVHxBpUWNo%>~5>~}v%EvaL6d}j_8TnalC!J0azlo}C(Ez#INf1&i
z8ka4*cJ=DV3-fE5;AyEzD~(LsK=s_9gA?a2!;J!d9aWdaJ3KhZH%(i4`M+Ci*&oJ8
zsMX_TqYWNvjjOMj;;O_sI#3s+HYU3MS7LvsY+)-B`;p{Q1fK`vOn8qgo=9#7KVMb<
z;2>8Sh==_@rCXr5Rl}x$l`7wj4Ghnw|B^>y;+*VsP3BF8-}uyoVannrFGlaH%C6@|
zV7;#}MZqP6P(sz#uRZ*igJ8+@-8`4g+}CwJts<Vj@#UZ2(aJWE*IoHxuGlCgGqHn{
zKyJJv>iL?c2*c?swa%r1`15>k$|$T!cJQdq__)r&=n3@&W<;s?#=SJje^v3V#v9iN
zu|m!;&eZGM*ZRg>G?VaTxRgR>R-EHz-oM`<=+x=cW9QCOGxiNSpH1>!e}m2=l|%xB
z42iA2uf67aW8VnP?#6;ymH@fReRVxLS6>Q0sgnhg>^}GI0wW@bqQeU7)~#CEvy-I*
zMZZw4Y;5c7klzZtP5#)?=bOFUo+3LousO0FE91ZR?=M0v4-K}7^uWY~6-NmR5@9h@
z;4jF8NC=_`qFa=&zq6~e&*wS1>Z+Km%ivl#9cdz@71IdZ<-|si99%}FKy3se5JMjF
z)ZWxDG8$A03cxTKvLX}BMM+8jW)u7?9Iar<)bnU?T6VnT(a7bk*9WsJ#8&`!K?oU^
zOq%3+Dy)v8CdD|Y<LZC4Fg5&P+Xn9<%%F?b|N6Rq_@zI-&B2kX1v(_vhgz+h2q2qE
zkf^6XR~r59^g(w#Rf)F}m_*`AmJ&2)+09n7FIC-}r!dh>wz9umheQ5v&t@+guysW{
z=}2j50Wj)z7ZM_&o@)yQZs(yxLleBa9GZRUHhBsRPMOTNjzpF8S9HR}A1QyZiXRP+
zOejL~^+=mK5co*%ZuN;O*ofv?^o4S(7+4<)kRk$$*IP1ag1O>+TJFaaH*OS^$1_zC
zO_2z*D~z}l$lx3>{1pQo5@312M7b+Hr{Ft;t1NysX*TPB{4sZJ*>%MRYESXCPoWCf
zNiicHeR7+Nlu=x@I_|WbYf0{ns%Q&;wLwjsNP2#Ls}Oe(;;$H7UfFRm;*YJQx!7Ne
zo1Qk4JGK2r(xYI79ZB(g`uxdAVaZwe98@Gi+~f50+VFeMF;mNjJ^i=v6u!M|zCM65
zz#d9pQNbyRF47{vr^%7=4LYNIMqp{MJ0xV)!hQh?ZU_LpbuP+o%rHX$tuB9u?i<&Y
zJx!=14I^fSb<|ZpolW%~gpRL&i(M>s5@0GaADMYV{#FJ}D<j@E0k;CHcck+8_U7O+
zaSy~Z>3bh|@j?NS3pVPR*TEs{k0;qBLgbQxQtQ6I4wm$QAtZdF02-D>xLYU1dcJoj
zqenJ82J{da&7uLxpyzd0iO;^>-1oC)%@Pkkx$Nr(&?UdBph6OT82>5y`<sIm;<7k&
z=&T#AsO8;a-x+}WOG{EL$lx8wP(YrOfUB~EaGpirh8!;;-EbtX%p?P1WsVh*KR}Tb
z7eM?tE0HHAUC*7jO?>1gPo69nDysVk$21K@2#tlb&ql3zd@lRlF8tVWE*1w)sFz==
z+coX8Xj;#GIuj;zl_3{JJJ<j0GSBDxLV*|nUs!D70ANL%kKE48ixz;Fpa$>pHn?(s
zIHKSybF-?>CEl}o84?h%C_-RhOt40u$O4i{p-E1(zIpSo@)nq^fNU{Sd7|Wc=krl8
zBUvBuC?(8?>xpTH<Air0#ssqYxV$MbpexvY`}k6f<=t*pPRG|Ldzst#!WUWZJeYVg
z>(F{*+MzO!g0Q9>gmC=0nR-1a``fAd#G`U^JGZw34B%l%_koD^jZ)aXD77)Cj5ub`
zh8Hsk7Pifo3Y4B=elN)mD58vDQEzM108C-@lqoGl=fwC`{8P5|ndcXXrB$=R1slY{
zLB`#inQa`>{0#|-JpBTIZJbQ%DR_sM;*!F@KZfMx;;xJb4{nP)rf`IP@^|1nA}N&w
zRK5oLq{x#n!QkO!QrsXpabic-1p7&p(tpPAL#}$pb;%N$04h`I<PAu3h0gWp^{diA
zO{cLI7NhldD;pMd@LY@Y#(i2h<q`$x+pONUMTx!g@J!YLgQcm3j8<l$$jDJLXmE7#
zh$FEZ8`M4!L#XO$raz`!$dN=77>s)3XXGZF$&iLxs{iCZ7pr5Nk>90llsk9Dj``bW
z%_oZ&yH&Ef>YuBC79~aGmJ@xvs^4grmj{5e*i=<ZOeilCoA$dQZJ+)l?z1$Q_QRbV
zHm%2!0lEpAMmorHw{%>ihSAfp`O~=eZebmD1;cy};>sU!#GK_p4_T>>z~bF|_Dsg|
zXc99?NUu)N0-=}556O~IsFiwfzhKS3enf^YT)LRMlwvT<-|*Vy%hrbmgJZIDU)}Co
z*Gfx1tohe{=Cx%;<t({wyLnGMP%H(Slvyw|l-`moSwH9Kn;mD^ymVLzqCf_Y_e{~s
z7%PLg<5l+nw%?Xb>p3mB)tUIOm*_;Yu?4#lQ(RyaazL3D4@@VskQHLwAmDqjiFIOv
zk#=2ovmRZZajYXrkfIJaF!!5fx9hGnQ$WjonRO_Wx&flF&@5QI>BqkuQ@M;4cEA@h
zYAOI&*?)gH0RiVYs!?&G@NcZ0x&)YrSuU$BzG`2TrdIbUKar5&UoslhQp_T0`1s6`
zCgr%~HkJyyf|7ga>u*3b|AKyS8U~Y0D3!yyIrPy|`YmT{j*3DU3>$OnZ3P7dlbSCz
zjrnJ5mK{L2k|fY|7A}vNg$q*B+cp*|%Wm@5|Doji`t#{q_)0LPP6_L%YH^HoqGhXA
zGBsO{0vYh-=q0-@$CDG)Q3@Qc<<SnU?$`Ra9~!Xxp1KSQu{y0u`-t!J)9^-E(gZbQ
z8xTfbr?#JF*0G|>H7tZL^RBT5#_viSb8hdmTP$uW8{6hd)KvOYI1Y8Rnqip-#}Am<
z*!~SCULnnx0cXK?L@X^n6+C<qcwwfoWzEb-O`#z?oVQ;B+~h{=s@=5q@|^Z7A>jsQ
zAwBGNLFO0_X5&@%v>`1}!Y|G3gTqHys7a6D__!*@(X{8%6Qm{gyHP#and^%G-`;hT
z#E+X*ui4of{OT64$Pg<25bqxY7`!4Z=I!lKB6qV^uZ%bwOznDOUbv1!lU4_SnReS%
zmgLNEeMSbZKPBRxmD?dbp5dpoa?1fTm2|fYD8!_&tuqY<so9)lrzWpMpM>!|tE^zD
zJb|SWaB@<25$I5n0RN&sj(gVe{{o1RJ(^7?CrH6&L!p34!)+hdL9?<kwGs(y5G83>
zev{rVz|gT1!}iuS1q0NSzd7$e8)k*+g&f8X^hoV;Jf1J8sCy{&q=cBl-hE%G=zIMD
zoDcpqQ8zZ}*u4J4t>Z}S<(`(6AOVXHI%kOn7VtnJRtwXe#)iP;P*!gp{tfG>$a>vm
z&@!Hp{DCHQOop0F51-eJCn8?05OEKU+8Mi#Aq)rIVjHtL<+2UX>SAWnwWPk1LUHBp
z{<@b_eyj^QK6Oo6*dDic<eLWz@uB<w?^-()?pTsVCR>4@yp8~Eyc|6bos!OzJE`pp
zH4EtBFHg#^!zisD4)RO_0`44LcI+;X+n7_6WygFFfyl`@H?8UJ(boT7U03N7>=OOy
zFawl@zP`T7O!_@D9#A`TTtKF%ix<RY)=T~RS+8K&9GVh_W8**Y;>uhK^ijDrH^Oq}
z(A5){BW`qB{Y$P7dz8|401;iLyE6zflZ-2xjQNUP(nhG>KYSrY$!s4+AO1=ql6>5I
z^k)O?k-|FeAE#zalT@KR>kGak86-eRduP@)r|&feaZ{uN4%0%p`kv@^kWU6o3soP`
zqXh+vx|k8Y^oqmFDJQ(0|JkK{2dZcp7s<xkK~v8-qrDz9DjBUN6Q6PU+enFx^^BQG
z@7;#Zr`<E;Tj}5B-yqTvbEE?IlxCde4MB!~gy+_OyXaT)e~Z;Z?UM9!{j9dmWW&jP
z0yC5CqtjSCz;0Fn&_?|))m<D8K`a%*(|Phl=V`r$q(h)<4oW&f-1KN3oNbSO{b0DK
z0S;DsKPx!3Q=dtdW@L8Cb>O@19#yzn$&B%S=`qYcD!<;h@^KS)hx0}px}xvyO`B}`
zUp#Bwu3dil(JQh}J@!pV_+KA1SVCD}L(Wo3A3!ifmI|_8kle-<0cR6Lp%EF`+BEHF
zXvdx1QoDR=qvO=G<<?0W_<DDDiGbPCyEDA~Q-p9h-d<~Sl<S|hQm8;4G2{b8WQB)+
zsH$IQPG0Q`uehv&g5Ht)J!9sgEQ9^Sdv)$;+lU;SfZ{^g!6N94T(>GA{2&LrHwT+j
zjg%EmFS&OeiDhJXIy~xE6(4_BHj1+pNvwSAT%ZmPtU_^Vdb;6?4b+jAvu1_PtWoor
z`f~G%kRHJ+vipG12GNG<$e;u8p`vA1YLjiuw=U|rZ-TmXZYje0^~xB*x#jZQwQpZT
z#uv6W;yJWj4z4nMs^)izDYkv#ZbvF>K>Wht8tx7a?O~lT#^keW|6Zia{B+ix>>Tgv
z1Va$!gt8g@Xf0}(m?iJu&|1v`*d%4y5Y(yN*b`SW-*X7QeM;{qU5F8%?sRz)@4a??
z^SUz>FT9!K>96O6wndCL-Tn<*wSvF<3aSPdB1{bJ$<m-4sT2LEqSql6`_38gOG-Op
z;qtHFzR;Fai25WaO*R!>jXw#`NRfIkBOWL}nR@2d(80z=Mn7(7)J461%6D2g=-{og
zrbJop{LUS-egowdyCqN=22V2!YGZoIq=7n+BFLO8ZZI|3#l>Z&XXHm$W0>$}BXohc
zKJqx4w2jK&L4x*DrUOuV^5Cm)XwZ9s(sS%{@ITLEfusd7L?S6hs#I=epNC1<NN$$W
zPPbIzO>jE7{h|S8oMmOIzh%uh{g<DgL4sgM(hmshxTkwT!dhBt3Ez!UE^RV=QRuY)
z^FmN!+hiMyCT*efh<61Z!ZLA_j_)W*^1WwU+AVaNpP#Yj=DhwHR?H{y@$s2iGY%pG
zpFKV0o|Uee1OodMRj^PpiI&`&Y8QH5_2s^&35gfXt*?9H@unKG;*C0*K0zTc6!OUb
z=Mh3jJ{tRwwh#Gz$=}<{*M?@l^S|#DckxQS4Uk)K4(&TPTBN^xY;sN9iyUch(WW(t
ze-16AuIr-Mc1wkkNgx%pW%N3C4x3>c4fYGHMX#9XNjix706g`F=Yw%HKI5MCJ+hOw
z4gLCrO9`Wh$IslDf5R8*#BpE1zm|X{*$$dsNIQc6WuNi<_8@fi(0!qVB%bnFMtKot
zmIHECy{lr+v+IxW80wE^C@^@Lh^x?9;0AnieoBXu4A4^WJ-+LNQ=hh~oFW~5)o=ZG
z=?QINkG=}q(lMEGcVpjtcnn}Z>tMao%og<#4YME`MhUOi6=u?Dl%Y}m*A3L&LFxU!
z2)Gz05}_2R?gh^Up`(Ly8$;|0By+-tCUi)v2~$0><$<uJkY}X(hDS=$?g()F<i=`c
zn-f4<VG7rPY1(40DwP-!XLP4)X|H@?%U~f#w_<telm2XH6sPJPhP7lXTC7<!4Ac%5
zL*L<3TeA%k?q$&R?t<-dTK@IF|F$C?!8o*wc*Qe}hKFPapdXF=2*=<@y*eH&*V%M$
z4kjRn+pwc+w?V6ZM9Xq{_Qb)`{{y>@e!YLrj{z026k@?q`DNZ3#!Hth5fd@a1LdbS
ztz7R|6J;>n)J<;c*8=^xx?UR6>cKw$sWKcTfnkAifsmhrvfB2Y!|WX_ro>QLu>q7u
z9E3^jx#@_C((Zr&l)Ac=4f7@i3`J2W)?TatdydL4wIYo&&v{gJTN&F${>}l0hv`xf
zt%U)-<>JNX3ad{r8dzN7@h=pA-EVoBK974$s$4ZfrfkqK+9FpvsM^=Kcns?q_f+T(
zrZwVohA}75`poxEb$vu(1Id^$$#gL$D>Q<b&-lWbu^v6d9FBYhtBPawj)JCgc;Mcr
zzupK<%Pfo@SW7Tam~pF26!?Q~Z#P&2Qk(6_ZiKn{>uUxIupqWwvX54c;4)R(4l&f7
zKv2|e)ORJC#f$gN&6$(DV(InSN>22@emtPRnA76pGm?pi!{AyEeiFB`AE;IC`}_-d
zI%nKCH0T=jpP-yquU>V3=9ydhF0$^B#Pf-0Ii1av@sb!+D>@`ip>P(TL6Syptp1vN
z@BTkjy$4*+ef!3pi_6Sqg{+M1t&ECf@0FF&AQYk`EoCHok1I+GA%u{$BuToo2o)-&
zw6s)`Bt7qw`~SS2=YIXK|NXiz;`jT0Kc90P$8jF#S~;&2YNy=YkKke#vB7L_Y&4w%
zON;<TjN#P`K9teF%26zL<fVw{fXDoVO)@M~lFcweN~GMQoE3Mdx)j;xb@HTwbEE><
z5$6Y?5sRw8gP5r7iW7U^U%#p&d>}NW3qr)fB~Zwsfy1b98g5;(UqtRHO9|R%&ZpkR
z8czuGhhRuD+^awI0OB}9{{vX_fsxYI%StT45RmzEMcP|rvaFA-OxbelY0!H<*y%Fs
z3S<HQ$Fqm+lQl|Wi3~B^asIgSMx~)eL-_-0{yxv!f{FajkbOD0f|z4MUtHLQ52PiY
zxjjdWNU}(b7b*Uk!;RNH`#&u}zxky~_nH$V#b;COk}zv-AGKNmmL>bU<Pr?A1u~Wu
zlod54LsmOtBg#x<n>ICBzP<LP-&q|s95_kVa&rS+HLd#oJ8PZ4xkI1T_)7pc{7qiG
zgIjO;kx^W{qzRD<2mVyaxqwv0sr`IkAA+kAi$ww%-w(!$bH?8eKGq@s33qX*`J)<>
zyE$iY7Xa;s%OmnWeXOpG+JIbmX!8oE<FWDFQQNUgDQtjRY4*tgtRVhJGRExLQGCXe
ztPunVUrmoKC<$%d2mAbPVwuUiSBBydg)pL>T9gh73dQdNnuvqvT}VMOl$B5Co{+7i
zBS!{&sT_}dS?7W3hbtD${}f1#kR~b;*inJy6TP%$nVF^gYz~G@sKV)18?RSXpuiAk
zO0EUrjpa0Z(TRfYOW-GCh?}o;yclbC=Hn}9?TJTI@e>9*njleFZMciW{PRa@g!o+l
z)@w7RT44!1<~eX4CS(m|BY^;tgc+zbeXE2~%N}g%BnX_sV=>_tg@uT-WZ@!<Sb=j4
zEv>Iw{OL*#kA#G9j>%~gvFod8*fv_)trXTmI)Fd*`Za@{I$QnCi_%OU_*<DQg($?U
z4?=A!mKNt6awM`&{R?5Yy~{yI0AQycBSy55uMM@&3y7_*DMNPH#_tPhw{z#tFJHb?
zz?xlj+3VH;JpPZchxLgjstTS?^ZqldiOvMTZc1MH{b&?W6NBwtjyoGjpC#9!*d{FV
z;aTmOV1KMmJ<9le8@e0uzemFV(tZizoL_ZiyTrg>P7nl;z-Ho2Y5cN)PX@LMH_xQk
z+=o<sEc=Ddl&!9h!L%r>6#>ohMIMLG;`a~LkNSPEfa{VA^F6B}%)*EXjkMkumg3Sq
z?7zjpZLo6U|HeN0De2>}94hy&|Jg(~m_7QQ-2#it)o`nzI(FD0GoB@K3{eh>RV~#p
z1<2DhhjA(*NWwZ8@<9lq-{pRzNCWNe;P6q%?3zI2QNbDi`dK3@)?~}e-4&Q6v(~+d
zSekM3ICLm3My&nsq>6V?DZ<e7dCW4L%0u$B7y64@#hKI%PT*uJAgzXFFVqkFIcjrU
zhhNky8Tn^ym_pk-FJ5VdUsuhojX()ovi@h=l~zu_ThS*o5Yj3g61B(1<bihKAIzhM
zf7Wpts{{E9yeyayP^tj1Q2D~2b{o4NQ^jQ-MiscDPoWxN_qxKF=e--7B5{j5tW9!+
z*zSPgI$G|=6feuJXrlRI?N1L-bUaoiBU1MYqDSSh20Qg-cUDb2aiU1XEhHAPl2w=h
zm&~V^APG82E|7<g(VZ;GY#`($hlA2kK`ZacPh3cZ;71D}Sx=|t`Fu8Rjf+UlyQb^I
z>PAP%pw$rT^iIKcp3Dy#`d5$D;dk7G#wPewbqRn?5-=Ghm;@6cH>DIMGxh@IBcw<F
zPY*Ex{RsO)<TjL}MKLB236z0G5G!B6**o}0v(D<Tf~FhxeS7eT5|~A_CQP`Ke}vm%
zYjrb{<kKy8$y{)*Pq6E5gqoiV#DxHB|N4F0*S!*$b$hpa9|MLKn7PYYrj>?oD_9^h
zLaeo)g7%0y22Ar(OxnyV3#Uw<F3wv>f#_#cPWZ-jx1+A<KVSe=(GZCl5yMg>AvbLB
zzy6`26O--97Hd8mJzCPJUEevsNhumOP@v~!p{PQkqKIU3H;wthUCeUGtW}}9#t7%7
z^$#t!+X(1PWTHq1OzTo#sZ%dLgq-DGOg@K%gvhPsw#$-p%oJMCZ<bZ%Y_PD1p->g8
z5#AIPr3iDXBkHyPsA*aFA<Ivx85O$uhYP5Tj7e#7Ae4-V%oT7lKOi(7P)_mvmaH*p
zPBM$eRf+q@uk|zZeO>r-QUo7f?zrckIYt~31U=o!-oLjOQW%PW9+2n@*XMCgxe&xy
zSH(aO{zWIPpghLBD?CIKl37sZ&MsqZid(1J!$HU<CiyJGDnTHQ=zmm~?eTWScVlGD
zH*WG&TVi?(n5{S&3u<SKtp}NpXn{Hy-2XQcR}o3m)dD<Oo=sfbWVBX77au)()X=3T
zMHCMMG~o|fV~}okXAtWFg)%$o;c*l(TXkxbz=zETDO#)_zDC;Y<Btc%wo#qdLr2HR
z!zbw+8Wp0MR9}#-l<T-rL;fJPI9N6j8^2+1mBjsBjt)HtSkDOabd7YIw9U)zuWtjq
zjyHi(dsPKVw;b->yJn}CitiHQh<!=xr%>>Vj0&({Hl=}`aDUsKbD?g7=|X66fMI-P
zGyv`V&6u1eVktnfbpk`6*`HXpi2l?ksZsbmyZPWVkd2Q1S3AF2Ka$`{c|f(mss!2%
zS>TEw_zodmaz$(BIhnQZsWT-!b50JPOHE0+M2BV)u>9d%k*eYI!eli1Sr|qJX|hS~
z!FJ{K0k)YR=VsW}*L$D7xq9W<YOl9%cL%jpR*zR1;jk{S=7a5dlWpnYWmh-9x<A*y
zZ+O_&9bu<O&V1c_^Njy`uRihDh(U_Z4c6Rg5TJeMSoMRov$Z00zRxu3qUfsdpjlar
zb>Y2dNlE!lmyh>IwU5v_=h0wmb-T5!39&=A2Q{3ydBc@{n@#q$o^<fTOxFR8kAV>r
zSHIV0)+a)Z$bxdKJvBv^d)>C!oBCh5k{i1EYVyWh+65Nh{EMFfbd-78=%zjQww~0q
zXX7T<<CP{kw#Q?hRUXu|Eb@iFw}d+*jc;A@_oZlK*%km--SAw5EvN~F44Z=SW!JsF
z82R#6R#+10pF%(KFQ|aT=O6xk9%Lt58OYA3*%UI*w@5ePt!Lz_E&Fe=k126hY>}q*
zXC=x&uJ^MO-CxMw(6F$v_;Rtk(1q@44NDqHeGU%lNJ1c&rclbA*gelLSt}}z?w$Ka
zL&gg3E(c1j9L+{{-lk>XZ{=u{zdpKXeH~rxkrlVDA{d^kOU<!P$YHGS?Yu7ar+lhF
zixvRCM5<x+fXl3@fCBIvSFc@TYm=e^y9+%01^wSy<!AKq<HtxWhvN|D6>q9*kyTon
zh=k-jrTq&GySG?dmsC1z(LO%>^`_CY-Iw!BZ9Oqe$F<60c2U)!-|y$bg&y(^fxQx)
z8a5F;n_fNQW2L@<f#=vn3euwMw{Q0yGv*vzCSp)H$45hQI`VFKL5eC1-3ILaK9JiE
zp24z&e_4HoxG0g&RJ5t)b9VM5780BrYLLzIXVpUNyX~X;_7w{~{&b$7C_Qisq_7qv
zx5Jkt{ikPSXxe@0LGu4y`0qV%Kprth!MDebzqPdJ=G!n<5QxW#@m>9ZNv})-L=FT|
zt?QS}?`7%~%ZXeW+l7$5iU$K-)HrQzQ27v(sq5H|MP(-^jdxfnTO{-IV?Ye>R4rVz
za<Q$gEumoO(4kZ<Kzz7;tBxD@^TwUG=0UHMk|LoZ?ZH2!wO?`TEdE#{*2OBxqLi$P
zpWeqeCYQdz7fH6InqXedHg}>1v}A7t%tEn)uUJwEnz}Auu_6|HEouEo2v0RgENH8U
zngomQNhfwn8^`m0mYOO;6jslSP*w~YwGD`rY*w%>G;LwY03Q6;bNOBl|F|bi+rItB
zX73_{Cv!$zqeT&xk8vZ}%ngs5n%>j4z|d{OXm{X(kkSe?HH7h_9DwVwO21!Vw%H8_
z8CdP}QfboHLreYEA?0CF+Yts!P9mg3RU=CgDB@A=0oJnVnlyBn2g^pUX02M~gW?!}
zesi4&#8)8N9ql&uEd9$OL&HRjw%)N41B*fK=IfU)vm7(f*NmU*?6cI*vw<GfE02ZK
z-WJ%or~-^B;9arebM6gvzbrincHCn(XU-h&5AHx7M0E~kMCFunhA?2F-C%+-aw5ue
zS}Zx`qdrwuRiSm?$CsmOd5l&9!|%M6nLj!ZIua+mF$rK>S^|ZYJ=Q<6=~&D2IB-=t
z5=~0%=MaicaR#9%E($+?p2U`epfm{p(mB=0e)MI7_wL(QPD5H*VzJAej`bXi2VrCd
zq5yw_v*%8G4%stSON-wd92VyDx{w{&Pct*SPu2ApvbSu?%K}&R03V;68)rj!^iNk+
z^M_^LwVq9gbirSK+6k2emVLZK!wf*9I%-r^=!?Qz3B`e&M5M|u2voFMw7%sJjvM_6
zjw3X9?!UuhA37UH(3Gnz`99WV%tAfAm|u0Z?;>{9FNwpboNZh7^v?26vlcC^#&_u6
z{YAr8<0gPYP)y><3iRyem+~<$kIhYv-dfW&_bd$lnPG!8x@f-}w*gn`tir;X#39fW
zx{G6I<Fc-<^f^DL@7cl2Covu+&RBBIX*YioeDVR}`t|JT#6_n^VcAX}9Qf&S38AH>
zrQ&_xxh1Nr)wGSWnG@Z+fYt$oW`27)p8gvH7aAJQqf+6>@h!)V8z;-RMAyZ+Moy`b
z|8g7>p<O>-dxKJ~0wP5oR{pEDY|(-RyV(Pjndx1|a%X|d`Eg_fGZAk|NFff12PR@8
zjTqB_ci@fvh7FUYQw`6h<$So-t+a#%0Q)ZW$4h(0?|^x!OEdN83&tZmcVbQkUh5Ot
z21w@gzyEH_=r)bXuW>6&rn$IL67UJ=vE~4zAX5t~N$VjQY=rU*JJnL_j6v!XclO)c
zdPjbC_7f^{yF52j?Ac7{f{-AwW4EFHKi%to{sb!M%PDTwuJ3T0;$^_;+i%7sBpf5J
zGEyfiCY~oGT)BHU$$pXzZv!*hDbuE%hUH3H|M25mLKmF{yIqM@6H!mFdLmWd5QqQM
zrzI4zBEvU0*p<G;l^t&I<Z1bzBQS^+X_>8UzU-&GwKM=>B9V{zzDw7yACj%wl&V=<
z3py(+&!C#cAWC8upN)J+>dL(vqXKQ^h6VPkl$2*I)}jEp6i}hdcAwND-lVFU+G%Pd
zSNVq(WD3M_fx0p>Jc%!uz(;v$JKTQ%4TI~QUwyjE4I7)5-?v{sbvBJbR}17u3C8~J
z0YF1-I(CfVB1m{cMTW*PvF3bTr0&E)Og&9%nsjEboAFBqKB;Msp~=+nQ}+*Avr~Pu
zNnBjqfWd=%WBg7tmp0x;QrAJJZR^6+n^`;!FOD4@FsZ4fI%-qC9y#QUi;GBnH?V;S
zQjV{IpYh<qM`_pEwr;&8!Tus!2A1P(!fRIjlNULkoFvnd_kq-DtX;~?U!$a=qC#2r
zIwfV=<jKxf8L4Mj9Rh$ZVTC}<2jQSaLx0Rdum|W691_BuudY&bC;bNx9=zfpfr@aY
zF5SQXFYB*pI=P7BeiZnyS3p{Ac}W{KutwU2RYCOdAPC$Gsp^?-l$N}PP29(yuVp9F
z83q^ku|(`NZ)q!{CXyv1nCu5>-L-2xhf$y|T9;>OX<;#=LxO`P3sWP<z<zpQ9HjYl
zo24bvJ`qpu$H?Pr)5gggN3l=i5)}z5u_NSs7!fxvC_^#XWK$Mz$eb$e(u;AGbG&bu
zw4cfnVlRrr1-iNunVF;V;sKZ~O*Fjj=NI=bZz(1Wds}DMpBlbMQHzIxumP#DCTJ?c
zP1#MKs{hDen22L^J|di=qNGzv@75_os>G)gg4<P&Ze}^=OzfO;YoypwH0aFCpglW8
zc~kSK%oJN{@koG(am=+wCW@eO=KT3hhR%1YWL9b)-qCcfs94tlKkYCyqSNOpVK7O4
zvH0%x<;#~<>{68nEHP%ulBw)#qPds90JxT&7c5RA;e&9{wRq_4@}h>aLbuPO_FUq;
zB_zD`#Llm(JHG{Z<J-(xynNxpZV(Du!RbtEP;$!fj?LKd<HwUEHS(YCjZ1c6*@moH
z8&h-Ka5NRruj*a(=Xy@we}@|i<V4p18}YBERUSK7_o!^%71p9<%UraU<PH~Z+8$@d
zc1Jqqbji#@1O-fgin8>>&f3lFVR!*(5l^T-boA)9M0e1r$6#lIT~9xua)nO@jK?N9
z535n^w0l&WQC0!Nr#1g+gbExFnCb9EB%t6ua@(lkXp;$sQ@E%!cAR)4tDy%54N_-o
znS;UY2M^8yIDvedzuiCePujAy{r#2%3OBU-_%{bzTw&O!P^V_$9UQ<DlPkg_L0NJd
z0MxX_n@69`9F`rR9#X-di^>sBzoluS$Hj*aJy2!s=AW$nFKjI>2`k<7^z?{>uop*F
zwBOr1P@M>W7_4uwOjkD&ajS-HX;<PVGan;Z(dPakvCE+oC-OygDEqD~euS_{MkWFi
zQyy&5>`@mzY^T$u{xUf1mpqF-c1(@uvpaVbZHCLN<eai2P#$<Z1+KXJZj<&ruV&=@
z)YPCCk!$gGr1oiR(wu{>can2Gi`A#HIFrhr%OxsvWHZIq%Yk`mFt02$2vS4)ei~t;
zn&bB^f==~2FicWN&%Mg4D6=2^17OU^kFSr9Go~C)D#wH8a?EAon9c@xpQXQVhWRsS
zA17xGdQ9XAfWQ}S+&GMw!1V7cEI;*q%`&~i<YB@St|IhFd}Ii!y>wpm{nPF2?EIHp
zq`JjYwwJ1E?Yc1YCDogeZ=s2<U~sjPH1UP5d@J(}yt)oYa`zWieEHIYKY^c+Dh8#L
zfWrQ%nO{N<U&7dOgxeejO=h`D-MUS6^;PA6yWTPh4-ePmO@Ps=0`_^%vrWDSsSJdC
z<Vf78-(%8z)qIeDO3I}x;^iq$YFa+ZcHO$Gd(o{#UbuDZ7)oYcly(sjI@Gsy?=@rB
zs)>-k?;O_&yeUc*4{e9D>{Vp1E~FmCBKeesxc%rr0GdEl!OrSNCpukT>c^(Kyh2~^
zJZ^VBc<Cx?Fs4)%EMGp8{sj!h$jFGNap=eqzyU(nTF^}{VB&(etA?j9Ueb5BpWh%V
zfIn|LV+J+dmC{=#w#9EFc|$TsM*E>R#4l+*Kg1IT{++=6IMJc!mXPRTx-P&WJ{8vA
zZ@t}B|Ge`8(~PJ&&D8d_c8*2>tU*Tvr44=v`uu*4%JW~@xw+A_dY7$|N719|>+91T
zexjlI-TF{Yhh>zGn>g*tLXaW8JVg-ct@E+GK`+dpwHkQ59!&Gc-}F<bt;ykuehz-`
zh|aP$Zhx!pABukVV2t(5^C=9a;oW6_)?+fYBFoju!6K3IcK`i4M)XjcdP-q>`fKbZ
za;qn%A0sb@FtEhJAWqb$K?iypZ-ED)E|R{7p*N8QAwg=WJ=n6_@dW`of0<5Sch%>w
zrC1S^JtQTE#mXEC$&C5eH6VpSRb{RcWta^Sg%CEdfQ)Y+?HCJHL7p}WyoqQ!Dyrdb
zQXr6)$8RKo`iyxq55h9jB0sFh{&}3O>*o6;%~*FexQM2{{7H*p7y36R(P-y6wYT+#
z1%|4f1I0rjO?^WVs{S&nt@GZ+a(#o->asgV&~ASDS@}X8*w3)V>qi6Mwdmk{cAl@W
z^L$xy`BbJ{;m!-IChP@AcyN4_djfctMt48@8L9+6A`(r!k_&@2|Gw0%v(4ETPh+oA
zusU6hI^L)WX^{XgiX(UTv<a=Ir(Xie6ZAKEV;^m8Pg16n%4y{(U0vLc4}2-v)W}HZ
z<m9%5h{i3YO6}0O^DJZIUMv%k?v_6E)tfillDBe@c+T<)5efIy&~Sx5s!DqcMANkA
z5BK}M+67J0Rbp&$d6>-*61F07yrK5xmA$O^_Kn-I4kyyN$p9F37CBH!E?)d*LaW(|
zz3>-FJY%6Wbv5fc99cb;kEHnGjT;*;9H7pfq}YFh`k=;vWM6)UQxDI{7kTm_+<_@5
zn-jUWZ^myjH*Y-DJK2Uv0;WOk?W3g?ni(|lcbYgDd6OHCQV}d)aIxz*Z(e@%=)=3b
zQu9%+gVZS~2>!>9A0Iy~lgngclGjbet#$T1O@LWut%*1Em2oF~PA>gXw%oE7@VlI#
zWgL_y?cJ4yBR0JqP8&&wqysa>I$l@q7)0QRmr?EIp?GrY7FbzZuj90mDRe>U37O1W
zn^1^q+x)27zJqcEjtex<9bAOK!u9KVvgFU3HNK;(w2vez!@-QF&*WrydU`f$)JWFQ
zxq(?~oI~S$<>Eyr{6*8o3*HM#ZV?w68uZ^S)QgCe#p8${eNJNj@z9I@*S&cci_so8
z{rs#RC>6(5@bg?oE;mK^)GG^tF{V@TBgzpVX@ddft<2!_9UEY(eI`V0YAcffH%J7f
zsUL+-1!WiXT>b_gurPIGLDQs5Q`sKpOriVFmurIHdF>=8!u+@mU;SirSyz})<*%%F
zW(!+na<U8NUO<ouoQ~(Qv0I0}Ub*gSPSzY*_Bwt%CX!t(EXthTEB+1@9MG6Y6-<G=
zfODkyq(Y>nvZo6YHh{+L609_9QP~2-xHC{+<~K`z&xZ$^kxEl90+a=(U%7%bVmDK!
zmGIluA<bv$${aee=3(j9fiAK3!$Zio^W!q-^tidyk57m?`~X$$Jv9Do8LP~UjEZum
zY3Jj+jWX*1ZtgN`SkK1Ssjc&EJ@HWR^73jpuGa`xID#+#sh*IH!o1e?=H@1GXNR!Q
zMeOWYmHUh%47$^T86<yc$lMU_3&IY9)KJ@zFZ<VSnVq-Ylp%q27`KhbBtj5U1VxR^
zb`ZsBj-0$)+;}Hw0I0Qu#sr)1L7PDYQ2b=_`x2z<>UmXZynVm%aHNmJ$q<GSiH$a4
zo<OQQ>3p~1-|CaJUWN(ix_aSBhq~qU`}eafE$>=&8?y)%1fc*#1U<AzHE>JSfPYzo
zA}8e9tgfRsDQ^kM4xoXcH~_#C#}A}VsRrKe?p>%=#*F&r-qOy(#2y5a1;fB7QZG0<
z;)~$RI0cbO5Hi97{V50L)VXsp{yVD%U4Nxy+?qw8QE_ocIBz^c^TO)Ix=JKr?9qaQ
z|GOCO-w;VXPn-`&qq~JN89lS+wu073E_+kG(tA~A9~pMq+xrD2i9MT2nX2z24;kpU
z&!|z>7sgEH`2??7dt$occ5Hrl!hA*J{`X`5PYVFY&79u79kn)$%$20;78$w~u2)~V
zbm=s6^P!@VsI3KgAx<6i4^PW@Hq}u(@WF$Mw9H>;#p8gIEWt!oOY6Lh;L02Y+yvEm
z=8u#h)_<~;s=>KdUbgi|lT%V&k&1i*{e0Hlywo4J$3-mUoOK1-=olcGq17BfJDQ06
z5{{eZeEn^h+k}0#hXZkrd|9-HUzFTPAxs0|25(9BN!?-5waFQa0}!Q#gXVQ}@kupX
zmlj@i+eeMA@GRXr?EZ5`YnU`JhHnu0O}OFwEn9|55lu~J`}OGho=^48czGo<Dl7Mi
zJkfjj@Gpb>i~b-N#%gZA=dv^FM;~9TsDzwa|DQ)o2DS(D8_v-a!v=&C+6c2xQs~HE
zC)ZZmZafKO5(L8snVDjh+zBQeFC}kmVrTYf=RB`*Gr9WMhXSJJzbH-hPq%GDTU|>v
zRN5RU0)h@0P$W83W=z5zL6ibsPjS_%RV!|2KteIWj(U};opx>8_OfdBClNS!C76hu
zF^~L$Gp@q$?O)f+=~rV?%5=;1rGDH%L+_M#;eme9#<ZD_rq?b5bE_V!uQ}gDs;sn(
zh6@kCpaIu$EVR!q@)M}WVARmCjCth5!<V*@ff#+$#+b+ZLE+K?hsBYG0Gn!XHE8Jr
z{zULRqxKkQ45VfR=N37RAf2G`)X&Rt1OT}<iB~2f?2`E^<}kKSe=xUcc`{MZqs%dX
zEZL8pETAm)u|>k@MAVg7lz;h>H725_B|UTKG1c}Bqeq;$M~vW7JbpZZQOP;j7SL^&
zqED2kg%g7KBQP=^J%49~OoJ>VT|?cOat!8pXq*C3MlkNW@`uMEFBBVbhzBCUW46#1
zMMwXAY+BC4(qI~2)BoP3U2$l-J$DDMie~A0aOpH%B~JHf_ggW!v9-5@j%Jv5e|dZq
zr}Wsg*G19gnxP-2&Rf<#Tdl@K$#x$hv~9a~%O5#v(y{>zqDX1Xu&dM^yh;jqMtwJp
zmKK?L3|eH$ZlEIbw+m<ed7b|1Q*~va77H#d;vwU4LeIR|^~;L?{_2ui)bw4~AFtX(
z8{b`^r<eaQgZq_ZQ@#6sm|4Q?cPqcYOH%_9)HckwL6#rB^q`^HyHYDtUX5uk<@M9`
zBfkZ|D1SXdr>}nLJatZrm<TCgoH7~w4}liU_Pi(Y{>|8xMc;k+lB{fuT%}yl^N{}x
zI+iycA*m8TS?fa|Pt}ld*7~qhs&;%MkA7tb_U_Fh&GJQw)T?mFKnU*163+TzbepVV
zKPGP)5>$33qwb19s2<5c*m5@ilC@%3devi0iJ6Rl$@~dHmo8rXkX8-X2-DRVdi=O$
zha=JUB(%${4t1GjozVJ?C9s0B;(nFYzV7bsmD!&^zt0KH`4)H^LJ2sYHa5{a!{4YV
zKTxIU*LRm{gBpFm<n2+9@A^;tbjz^ud9>iSmO`4De+E7Dym&QdyG4V7z)89>rn6XQ
zpS+RJjYdSX%ywK#-M3r%y&c=zp5E)0eLE{Rck0HC16fzW5=wOjEBQLpK|J{VL_)wW
z@Q@34@1CN871P*voBEC$r~khu?Vy7pw+HcI86_KlVkPi-#nX24#TKM&jb&GllA!|!
zejemsw&G*Hi}~Uq_JiQLsKj>|ely8`|1f1c4M-yX&$9{CBEdHgvO?9d(lAK5D`#Fz
zhX}#hu!-ZzxYv?DJE@#iUUl3#ooCl+_o1*%m(?cKOYeg2Q+krE;Ac%`SN;3@hfp$9
zppxM9PG*0qEK#nn@veW4k|-82fjEvZ=wl4ngoY<$p+k;?C+HbmFku3+72;UF4c~+_
zld1Z!=+|q@yI8lBd@Fh+I#H~{N8k+*KJe>9wL76|4?|Nn$IR3{o@%Qf@vheV{!whr
z2+!<86T1w*WQrTM2YP<=gSO@sOx6xuk)fJZRI)Sce9r=x)z?*jHoD!sQImxh$ydUU
zh1Z(q#cuw*Q!s{K-9np0=oyAaG|9+JHOqLEypB}>0kOdj(U#S-Evhw3Qwz7j8RXAY
z$e7<0F2Xe-&#>$SB;%WFx($Dn4K5u=r(mE;p#h!(Y}bj=SC%ZZv)F0onZBnGm;LUk
zoavj1B1PkJwEFkI_QG^Vi$5K>_1b`U%eed^-vPw6*ip1nxa#lki~AWZ@3wsyEF6-I
zJ@r5Q>$7+L%HIWY0Knbq>z${Y7cO7^9Pz?jE*I^I>F-9c_lTw%|D=q)5ngOlySw|%
zjzNhdOie6SUv-}6E6atucke#!sB$E-#CLRk>@GE7)f#OOhv_+t@p7np1SBHeLbZ1e
zFDdAVC%|3M97U;{Yqj{>r9uZ&Y1_T~e73FrVRUIlff5D!e*fMh`E^Eq!?G4c1guBY
zpME|DO5&F+FDE(dxcBz(56i>zuLWC;D|MZ9!N(_R`Na20Ns#q6hPL{HJH*Y^Q`dTy
zYd3_26&dE!eUc<XfRlgt@BzvUWPJm|+D)UOlN=U*(;V0+tbW`5vJ#aP8~<g+x3=D`
ztqjvisWY{xHZbcnefZBMe*e@i)PA2j@QvA!RPDEoJn9>V@4K4n_pi0S(o|$+SwJ`R
zmcQZ?5^^y_5jP&a&AvWAf`bm{m@QGYjo?10d@OTWz1gC==Eo1Yb~9Y<TY)~MkD~6K
zId5KG&Iui1iKN5=>tMw}6eB?%+Jzr0c711&^`~(;H8z1;N!-*|D;3Y4on@RwJ|s;F
z6_!*~_4BK*PVTAcDm~TIwq7{U*SuZeiUbygzd*w?g&T`(v!QnHo=>}dy7nF{+kUv-
zllU-iAGOy>^WL`^U&VvX&d)!<QW`oHnL)!2<*sGUG@Nf9(}$pNM3sC5w&NZeB5bG<
zB`+WdSO(foa_d2ZX!M|DX>w|+RpF~XrYFZPfI8No=L58(&Vnf8C3^<!^F@hVJ>jl{
z>)X2R)e84&s|HyZh1^tC3ROEAR<Yfpx<UBTzYXof{QUf;SGm92RGKupIySCKWsG|Q
zATBTfUxNz@qcII_D`gsfR)&7+7T<oj4AHNB^{RbvVa)C7N4Bk&__sf36VXzC$>zfE
zh7pD*ZNp=>*Vbp=x}P7u^=Ic}`zd-=tir~QFA6sZtv63E7(Z1Pr8Od@eZT?v`Ijla
zL1j5cOOg`^rk?=UdPDIY|Cx9sa_W;gmZ^}Ked+uI%~BTvUvdv>;K|;>m2;$RmYO`t
z)rn{140MX};>xl8zuV7X(<OR(m0%TC=)pw5_o&jdEDOsnAdR3?f5<UZf*2%tS{?l0
z*Ux&zt);?VS{-XvZv7heE5XF-!PhfADvM-?HKo+*t4HFFwMq*O4$fjXv#)PI2w};0
zAiI)TK{5r#0~muws|@h2J4$Z=bsP(j_Fq45$0rZ%D}&j`+<0@cEvbRxavk^%FyVg2
zY7ds}D03*N`q7}<In_<EiHUg;%y!jw?Q2-{sOaxidP8joB|Gh~Ugwfo^3y(d$MG*;
zV;?-!Gc^DDJ@9OyN?XI_$#&t*W6U!0?RItbPp;Dr`%v|vv}j20@TEl=rKw>-pG$6O
zm)&xFbldS^P{>GsMb$7J-=NJQDRp`u^6iYmUW9xO)|j*<=HE7e2BLaHi~xMN<wa@b
z2eaPsO13U^PhfpirK?aVN^^{nn#`|@K~K{9MnlWur^3LUJlVt4<9CES=x~;D6`^E5
zv6_PH6d_jIwL9l-7}*&u^LP<$Xy&&L*K1XbK&g_s?Cg?~gkZ~@rY&1WCnRjXaG=VC
zDh)jiwQ6F@nMoS|eAQYpY?f<H)yo};bEdCK(gS*?Vn%Td0}d*?aM`l|4!0j9H5Fe1
zG+qtrOysm+Wzsk8!kSP1n?ee2jB`ZKv4QquZe6{)pXJ6|X}(Zf!qDb2cJXYjS~>i-
z!-SZ;ruX;=aIUX*AJ3gf7)@N|7i1F?^!nJOXCH27#^2N1e<-y@M}ON)Pdmln)qU1m
zRA<KS2+!CRp8KQiv=4<_GRnQITlr~^8%G7y1UM3#mD(B3|A}lj!(6Cg5QwxNH4>VZ
z00-;TeO*L)mkE0?G+E4T2V)sEhxb9bA)X&>Gu}OYh=6&P`BgEydPQeU)m5VDc<yfT
zXeMAgZO%i}s9O%G`iw~}d-v~0tOAPHY53twFJHZyMESzwnd3Fi4K#^ZqD(Dc8tQC-
zf<{axi5`@xz3;buoNHeNfDrKamG9dNR7@t7S($t|$fW`A;4FY}5=F1xxpR^}@hE&+
z1z<$GcI~3@BCv;a+puwCHk@8IUI=So1Lt^;_msLES%ztsc&CL%M)L`h%uD!!$#VyR
zZN)VP9NMwcJ+tebyVl<47NnmqP2M+2e|B|<Mgx&IP{5l9RbM;VzG2zEj)oVA-g0LU
zs%54`^Z{fDJM5EOjKEhDRfD`_56v?Sa5%%=YT0IY8#0<K)CUn2G@16~zvd=+az2yw
zj08<NHZ6)7oW|Q{WdHNJsH)o0_4>l=O`A5&s{iR-?+mQYwHEInB$xZ-p+Q3N&>BCS
zy4vXPCQT~gYFm9B{<O#IckeW>Ma?0B5-d9(duIQ5dAe)P?w_~zx@8nrZRbb6pv@RC
z;1E@Cz}DPZg6L5PB!JF|+E5fShcDf6tl27F0;A2-|HIR0`JGh_5Vtn8X**->+TNLU
zrMA>QX!_4nHe=L*C`J@D2nk5|YCYD3-HX1~`qD1Vfd{(XIvkxP;#tTEeDs2LWZWCm
zvs4Q_16)Uh*zilBS$`Srtm#7^nmF@uw_8hDW&o)mDbLi}Vs-D*A%`z9!XIzyy`!<d
zd7CQhP0oF9+XU_1W_$X;fhJ%O(Z~y!qD8(qf!8UpV8th?M-D(&$rD=Fy7yB-JU5L_
zyP&8(cI;SK``ox;5l>&gK7;Z=WUS3b)@IopaQkO_xECE2ZRE@uGsF~2z9WUI(5E2H
zktq74$Pv2|{MTsyMv49m;ff440gfE-_08YhX}(b>#m4wRf6pDZi#f?g^*{4<Hf-9o
z7TGe7NQPdhc=+5Usg_Pa+9O_ewb~wAmK{FzTkod^F-aT0QE&Tjrs==cAW(qYffxQ7
z8k9#2qJMk%@L^-Sr}BId9u1qStFa|ZfhO#6?#4jYo;6T$X<aAhS>95k#*NjCpEjlN
z7MLDY$VK%x*iD#F!Z-T+Z%W_B+y9&N)eY|*lK{+4OTRnXjPbjCd9Uow#KTO56Nybv
zbgNz8Co^DG7CQQLha>yEe}3wROnDF8Y=sU&Tig2vVLl09{|c1O87a5B{Qs(b3iz7_
zVT{c`_`=rFFCYg(Pg3#2SQ}%!hP7T)CmS{pxH3q@eYQw*HJ{>_Oj{5oe=Rgm4NkDJ
zlDI%hg&1<tGh~UentjEmon6}XNM1EbaX60gM~)m>JTr5RWQlIm{oaZ0X2F8Tr0D1W
z{kIB5AINFFf{a#xpKaN4=g$Fao$d7p^e?q!H^P*>_+nGgD7-F3b`5VSq8KUEwQoA?
ziCo2&h5ZPxMeWFOhLoeH;Wpc2P9n`e6~kH%9&jQ?K~Ct4C$us+*yUbYN&mzAu<|w+
zHz*@|0iulbOq6N;di2=s>$M$(;jxp-hq>G-kn)8HqiBF!b$0EH*zaaL@LEdYHr^v7
zh}4FH&#nwIUx(O1nkaM?!=Wbuq(x{CUo5`>m3-j;Q+&5<8LpZWb{akTix<DGGZ19T
zArvSaPy^6YoRnd6h|sm`#`Qogs=U6f$v=jz#(hzssm1R&<J*QU@?@`|RN+Rw1*iwV
z1d6B7i)4~)IwAtlgG5iyU*^ugI}?O9N8UNOU+Vwc(1OMR(wUwve{T1^n>TFuh$;a&
z?`fDGW(n>t8%&PnaRW;sK@??2c3z%MztOq3Q(nD#0xNHiohcc4s()(rY!S%41GGV2
zCXw*YojYTHop+wg`oajfafHB_iX5k1qI`Y|89M+E5Y&4Jb0wCc=@Q`(3aize2{5pB
z%E||)`E)8sXnjeKyf{_&M|i?bA5U+3b)TfK+fS;E7;%bws8zocDajYIi#m@RW9Ik1
zpXW@~g-&BUt$}~uVx_;S%*tv!-|*wJu7u_+3F3V0-mSpZV@b9$8FUSYotox2CkqyG
z8m44ur+yL|e1$hh;pbgfYzlWJ7M1`<k;0Y~Xmlgrx@tR&23FNG+Tj9a*IGqo9t_5=
znm8l7^5~9-4j=w}?k>aD3LCr!KUFxTsoumNsH4qPUG{#5mh6hk?V^|NA~_w|GIRXI
z-9}*g2pk0Di@q5VQNO$F`yWC#DHGkhJR1TV^vKIMZ>B)1czc-zrGXx-1=*rpVeD}d
zL?nEZum%C|pfh=UGI93kk%$jGJrj_2TvneohNcd4Vj?`S2}V#ENvVsJ_E8vMzy2N4
ze&15RKBGr}^HNmDN)3(z^y+KVa;T(`5I06pAxk8MLUT%G7CXrQ3#tHTf{q#p2srka
z>S{oo7Q=@5r|q-}X>GIwYM5CAJ-yVio$Z$2TFQaD2CD-u!iSI{pQ0+h6Z>GTl4Yw}
z7SU)exTzGr6p$z#S*Ce8AmH_K>$jfF@NXK8RtO)Zi??ni)M(K_%E$Zh@YhT2#}8j;
zHaH$TCx-Pu?H7kphNB;*hXO7Zu_i`d0ZiseHbO9mY(=n1k%z+E2?&U}&13Yz@raL}
z6N-UxZ5?ky7Kk&G=e(Ff{tur}xC2LuUkBk{geE~o-zInc_<WUF<>05`fJkG<D$H=z
z3{HREPG7ENAMO|VM$AwJvmZVBEV&(FC*8jQ8%gUwO*s6g7wN~gCx$lTar~M{&hYj&
zy}L~GTQY%&y&c1d9*K$bfPntsz>w7Z2Z@tLs|uCzj;@?tzL$(<AZSbXd_U5%8|_%)
z%O5oPG}!QlVA)->PPJ44uAD!Az9^ZA+AvIr{ZG+{z`Vq>d8dYUNWv!5K4LG)sTcDY
z#AuWz(P?QbZ&)@ud`WS}V;nUJ4itl^OdJ{+BJI=<-r~e3qzQmX3(ITv;rObEyLa0#
zR0)H@ho+-LD~HY)z7N_%GCfI~sV1%ZzA$v<2XuZqA=3HUxq!=G31KAv7*$p=%0=YY
z58ZZ%?<ge^G63|(j4WJOcpd++Pgl+v8l%gsGaOQKt+jQ5zi7A{))`g#CawSS_3KmE
zEJSmH6N9=#pczxBJ;W+QuwsDeM*whmj7c1${{Qt@<lTg6H?#<d+-$dbK$gj%P{)oP
z#S>PVaE`1XJD^nJn;quk-wHhgwSv%8%iqU;n7p=ZWqKep7_b;V7cMCHB>gAnh6e}(
z#C#x+>`L@adF`beUTysL!DM9T1~T>Xx<MA*F0y1hm>dmMo4i3dD0KHz$;zBpk-jaw
zF1G=SgMf3#Du13tEC6H?z(J-+1;FFWES#&9L2^spxUMukMAZHGT$wJp7Cf4-A^MD?
zN7wQTqVi{iSk%j?lB5rV!Ba(?&Qv+NuTEHLQ}utZsrg7dYv|vdSX>`LW)L+NT2@fu
zygXT)E5E6>HgoG2_G`RjV4Qlf|KTTO5;FYbjEuU_7e#z#DjzsRv^nVWAPUx~xluxI
zS(;sCtqAG+ICo%R>1q?Y51@Kq8h@m-yQd!M@DUKPXgjQL4bwk!`IrdQWKezAu3d~X
zYBCE*((Qw8VE_L8e&6~*+w%5A&jRP|zS9VmPY5#j1FgE7aA^gI7Q)EL7eyYNr<9I6
zbykNHNjeR_vG~SV6%%@VsCo!q0LunrH;BY}WM>G8e*OC&#8AwVnHeTKXY)Bl$}2tC
zr~gh#Q$n9{K$3+}781Ioer22tNv(ZM=oRWt>4ce$)>Rr`TddBkxIG?4NV0R0)#Y}x
zH)t}w%i3n@`;?WBm#y8P?qGq$FFlY}OG<xHb^--}nWM{7M6o8~Y_dqN{{7cqI3V;K
zP0Mb&Pr^K%YGIe5B!=fNUgXP6F)9TGk%&n>LqXi8bLYd)*M7EK@XGiuQ}&CIddHA_
z?cqo<1X>M?LQNw=whwO_Y_~Kc;&9MqFiXV8@V$y(PnK$J48#|#wODbJacA)`j2z4V
zroO)#8hRd^7_owuQHS~S`ynw!+zf9iYUP-i>3PSyAv+x+Plz#NT~Zbm%9fD1zEC=_
zs-n&|!M$Y6PeWEL3V9$(eB@{{eFCCF)gwhKbFpr4sTcb<$6zMACf~cCx_agL$k)^{
zZ$qsI@h>mkyZ5S-N#|qJ7!7zHAK!cISof@~tnr)8`0FrRDClIeVsHmxA7}`Pf+XMJ
zh-c0AyMayTipq`235B6ej`B@KsK0b+-pv_r3b)B+AgP3fg{eOz6EM6~b5*C3@=-j^
zW#*gd<wX0|E5IPR8xX`yjZ*_(>QBBvCYm>HqOVT!n>R1tzi&Q)6(d5BP{!I0UTnR%
zBaT=8M){M%JAjc=(~cae#385L7Crhqz7=zX5zE83kK%=*`&4)Mwn<zz#pawM{uxb<
z+c<kQ7H0QQRn0kmzgbFAmArPEvko}92<`~_E>o$~e9F%+i41aHQ~EaYOIh!y;s5Ds
zmbu0)PwyfrYsi=}V|ZP7_S5pp1d7V`f}Vsd0hT3aa<2b(<dTN{@G+i{Zom-%7~j!c
zefI3x!t}o*ZBbzmM7-EL1Lq~@txA&)jzG-eZAQK5lk|mwYW!Oh(D;#U`|_D>EK){c
zh$7{|jT^EShn)~XL1UbpoQwvnqFPT3nMOTADH^Uar>vlWJp!lca-l|{+*3D~*lE)O
zQC{-%NLq=NXgyeI`}uWQ5UX0brBqh@8iP@Izq+d&2-bi1a8=}A5Vte`hs+U0kfX9Q
zDS-$2mOzH9%=9~M__g@^52J{(v$IRD&;Spl$#2Csl#+*G)tcK8GF>jsr1&Y3#Ez;L
zILA%&;JhLvy2z*{6$atZzsoe<z4$ZApeI5i(O_dy$~64%5{eV8X+$5>P3Tab6_MKE
z=5$>p$<0_xz7Uz>uwk~1_mab5Bq<5QY)UR67#4gG6N4^0ZIR=DUE#J|Pj5OPK9&_U
z4`>Z{B(!_T#Dz>wgV;XKEjvybvS`JMSu^k7nx^`HT7a2{tTbb`3vr5pAOd@v2R`9R
z`7ccZWYm(qywES_GI=UMl;$TQ=8_mi=0;Ts4$*0ZaU)&>0Oi=F?`-=_)s+g5D2}oL
z*WmvIbwQb@0MZ~Jk_@x=C7?p!($RO)+SVVKR*t~gFlTEE7mc?Ae{Iw9v~oYtvdc5r
zd4cSNzTnBoOadJgVub|J=1R8aTD?VQ!Z$Mg_+%fsy7YXlQjpqhew&wlUMJ26SuF<&
zK&~g!3@^LvLuo&@|8qPuH}abt|0iwM&T_T?9dMJ8!b!hIaV_i&;nI@K2_*oKf=F$K
zpNxi0N<VWs`sVI9tAxyh2_`<I5@6?7j1mX}^ZnTvp5C*TEBigk9qmcwDWesM&M9tg
zZhGC`Vo@SIi@0bZMx;5sy>b-N%y=>C<vetvA;E=snZK3g=+UEpWBZ%lJpty&@cjw`
z@a4J)>k)DFWhwBZJbf?nLN3U^`kx^y@jb$3`3W~(1`JSe<}-D$hG?3YFToqhyO7Yp
zhm;U-bUBQwK+?i7^&2_z>;5t$&_S$5*P>GsP3Ov$v;P$CcxXcK(PGOW7d`g)CTowE
z!<U#56-14}Oz;IHTI}W(F+8%L5JWbpJd4vq&Vc+G4_B6%HxdC60g;<W<kb@=Ug$Mn
zC&e7l_YD3PN-YL7zSTZ-a$XLHBFi7T@A!&gZ+nP9j@VpSR=7{FP87s7$WKIFge1P<
zDn=tx$32f6(L@hTxlI4jV{(PcQ&w7_v`JMR9Jp+-aCW$)Vma8<bd@PhD(4B`hfLuI
zs9Y(S&YV0sVZnmdj1eN{q)ud<_xBnAB1--D6ZJh9ZVOFI*Nz{lteANCr8LJYR)`yV
zzdn6(&u4Z-V=ZMzVuu@i5y`=jE;Mmg-6;b1&{B957Z<z#lRu1sD*%M1En2JrAx~`W
zM+-!mq7ugz(Ju&j)L6x_(;{tC&~N0z4)+w)gf@xInxCs5ul1otOjmCp0aSFLxrabn
zqoen+faVxj+D=+dwGFQ>`=_he3<oTu!lH0Q`8pMgGjtOSrhAiZEt&C@Mw6G$cz}>7
zpKz{NhY1f@k2A$E^O>@Qj8Zjx^O{$M+mHrTh;tnILx4!0n|)nXstmz&;h=AJ7}8|=
zy1Z3&wr#VE65AO6zn6$_4M;McMpP<iKn)NC$x9j<%layGkXl81?q3NG?z67+g5usE
zhdfBePsN_XFPS<jV13f1@qhpR8vL(H=k7o8B0IvNS<>V?Dz}?AkKzMJ7iIDG^IZ;}
zOl$z+Qp_#Gp2#j)joT}(xB`XIY~(vR2CgXCq;n;Cqo5P?VV|!1di5GKsOUzORKQ#b
zth=p_-;qImQN+S`3b#)B^N7P{UnUtUp(M7VKQc|5fg~%zWKHIy9niH_xU3l1c1e5x
z9@m497Q1S@0KQOezvA?-=|uVh24&62+Q>bB%O;XBqdfPker+8zjk{M=6vtnH`DX`)
zT!?C!%kM5XFrbiVN%f5E2Gv0)wY3M;)0x2>iGn0C*1*lQGDY$SSVtu?97_(Fe9ko|
zv?(psQOp9qLC4Bez%-wv<;T7=A1j~;C4_^4|2$u-#?MGv^bIl?LZFz$_%-w*=M`#X
z2Gl(z&jkjb7$P6rFoJHmQfAF)5Wv*1)&0nGpv>Z`v}R5eic@^qg-#}CR&w1~T{nBe
zgvZu8-Me+`mRQy-+%Qx1k9M{Dpb*2V4Isc|gvYrycLlxzvWQx%mV}0cGKh~w0;u2*
zVd|q31uro44)c!i1l|?k3jiyLFh!;?P704{Xgwpz!RqMy<f6-xD8^@DLFf$XMjNfA
zGl4kB$0usX2aBzX8%E}~TZnS-x0Z$#XB3&t061Q`yG^Hu&fCdqRA(&Grc0DruVY>E
z$Bi)MLg4rt(R2KGFA%1R@Uk5z48m+=AX;ReGM{w1&uko0W_hO#$~?$4D5&jE)}=k;
zMv9<A4iPuAC5<0gf<;`BhQ_a3jcX&$v^LT^9-`JK{$Y@ay*46a#^-nh<`VfkFy2}v
zOuxp!o=-C->Ha(3pxDOYg!)#Cylgvjix&||+Z`NkbDWsUtxGYV$sNkLHO%TL45k&6
zO2{3_zAPq^1*Gmfhfp0rOFfDm9&s9c4ZHXk9Bx~5h>{c#zl+F0g6Z958l09cqc`=S
z_92Ur{w7VC@^r}(AILNA+@GOw0B0}yR@0TyK=7u4i=i)&HRRwkegQ}ttd)hzag0&Z
zZeOMe6;_ipbZ=eU?X&?hjV;wW4-iya+?zQ{J&`N~cO99ZpI=E*61D+}4?q10fy6}b
z>c4N_+}$=Pk*A4(6UacE2Ylyw8skUCMB!F^&d_5J{c|`A9?aRo&t;R#7%1d$_f}PP
zX2JrcV0m?Q_md$CWj34??CmxDm<aKyiEOp=@vAGdX-A5Z{tD!{Gbx*2!upbv?(VZ0
zT=Fm5VP@tR@Z%n>40b8Hj8{@h0p8|}&8(5_!|PDBiibSl7l>RZ7-1TL3j`%${vFA(
zAZpU4$*M29mgJn?r@%#|XJ(dH_Duamlp|y{XsV%A8YD6jY3%v=u?FWF#+v<oi73GL
zL<Z9AF@&dB@i4amL<XD{#{e!#;VO@L*rV@zFr+#|0Y-5H+eQ{?1jttO;CxAO@l%rk
zO}~}h*eMn(rp;h5@FI+PXV<SOdzj-hu(Gb+>-6)>9bY;0+`D+idZ$Jk{%Uij>GJb-
zL#ji+-7B&_@=L$^bEs`{c+t|q{UJJ<zB<)o>UPDMGbXj2G_Ww-w69zI&tF-IA%$)e
z`ah}M8J=j?*UGEXBi|$5ga2RY_{a!|O42C#L_#`tF_ivy0|Uc98lSS<<3v5Q&2XKX
zTW>8OiP^o%VP<h@x&*?AnrZ>1DP;hWaG~A%qL4hBZYT8yy67A_`Rdup=LVG<pT)!=
zc3LfUF}B;zd^u{6ERGVu_dW=Q{|g$-R*Q<>zqTTfwH$(H3=99-*2Ki*C+w{%Oq(?8
z%T}&bXF1?%jcmVd2?GYTZfZXM*$8!?YXgGEDld*7nRxoII{^W;Gt+1M{g=~(3Byj^
z^sm0UGbQtf+49@AwI6SUnfdw#1TdMCT4J#%)!@a0)P-f-dVAc{=6bam(z<D*Jz*W)
z=9rugUPgS!0cVT#f-!>!7x!!POpS?J4j$iD291Qf5_>NYM`}Z?2RPxX6v+%x>%}_Q
zam!V9e*b80{XBQ8QuBSEru-!vw`9DMAt!#|c`ozgF>D-}3<f=K-aN^ifO$E&x%+_3
zeP2s>M*b>m9dV&?^e$nqknBeQ_610j*&A5^Ntbgvi?z(p&>{%(AYe8oBv9y1gu=GU
z${GclYRZb_KBE^KIw<ZL@o=@Bod!osrg8bBn5~*(VsiE0m^&<oX+GT#p0s&$(SgDU
zOl_+fy>`^==W%g4kDVGa<-gk0G)A(Q?0^dhunO7=(B}vaak@pD+xriGfDq@ku1XKK
z4F7Dte*J~Oz$F7$vhjm%j_@V3XX44j@ev@1{UF>S=HzE3enI|uTt$1<<jD;YU<i=k
zZ}jK~ddt!%dPH3}_va7NdGDYVyGreRW;D(4bOVqG_sEc(CTIf+wNnh;!Ujo&r=g*&
z0AMO5R@?UN#d4^Iyc)7Kw>fTsa^0mRk?BI*NQ7tv55pyE$MN=^EA3ev#$*Xf&ol2o
z<DVck6m6gk$}`1rg-R7bjBSRyzAy$((bG#*BJ0=px3RHtb#u!&IbtT(hjQVuS0Fy4
zMH2j)zeHFaS)m2|#4;sI9$5f9sAA*EL8kYKzJKOtP##sBPpeqeRDEn~?aGbgSytpR
ze#d!;sXNHMc|g3CMY08<0cKRggkG1H7df^G6Pyj3*IebwG;7-QWTOOeBNS3d+LmR@
zm$TsF(?^ZPZA_N-2##)QnpYCXC*#>+@{v<gqJ6WvbYgTooSV38k+BGwE+&mTbxEoU
z^ncNK#h3*(?jfENdIXQ2(>{960QEN|w|#$oT%We3YwDJRhZ^1*lUH4EUdiFYmHg4p
zn{`e$HH}qU*3G%u4g-TD&ultpy1Thq`}q|4m4_SpPrtYT?@pX!FxA19=Du=S7lZoJ
zqU0H=4zrA^-JDWd+4wIU=z_<R@#f7y2|>yQZJRdw%jvmmFOvuN?=xfDK;QH6_93_9
zS2`i)i;Q$4-;g+#-&;E~=)oYR<{lm%3}MSHVd{^*iAnU*J|<6+9c~-?`@&d)?xP!e
zZkRvcKlO~edmQ3#g{J@fV<I40qee5<;Yr19mW^&$d!6y{NH?u@KYu=$vx91y{L6uw
zx@_69pz+QwF6}AuWT6z_1qhflK3ucW9kwy$9c-Jvi~_ay0&MoNY?uHu$jBd^hkzZ(
zx**n6$$6CT<8j!K$qU-B+IY*ccy7p6!k6#f&73e{cR`b1!?0!HgNQ>1_#5bSQJ=Le
zjFGMlfqL(19}%j`{z_06B1lHcsF5RQg!Ddkrq<KL!|waX1I)f)Zu$3a@@wG^rnNui
zjYCsLN+X7%Sco0JFP_gNYZ}qYTrt&<0a@{NqCTAr!seZ`^jW@}6U}Uii%ra`;U}3N
zr-SFO`ltCmaqMK<--m>YHx^zOFKajQp0uNs+KigP8BaiPCgzlah5~XD;M5tw24}o;
z|9)|b;hO##<XOlm=Ftnz_QPVpLjW4Hl);Z`OUH`qBKHE<#;btmGTRUFEzVy8Vp3yl
z*|H_LXLb^XaZ;CGuHRQ~?|<XQtRMey6PY)!-Ik4h358uR?UjaubgW0bKFAK-q@*?u
z4i0eRGAX0;>$@s=Og78!B#Lm9L}UqO^PLjao@P!kJVH?Kp+k=|*l6}4V$#LOfW{pS
z?HZ`7tN+5P+ns#AX2XW(tn{3=Y}s4u?mpPI8eg5Tr9~S}bqk{pj+Jw%WS0eQY%dDh
z5MvWc9zpaZeNh~RH?|G?rvh7P!-f&sI$bu_c%>PZJ#si(sI^70uGP3!_e)ni=zd&l
zf548^g{jBV^Aj!=O`e=uof=?%p(-k{)uNP^L3iG!nocXfrnCB{&Z$ioUNrVI>rtNY
z#@ab~hfmO`C4(QP*jTqT*shAX&Djdws-NS(Ds?<?FLBjy*O8i2xsCV^U0>`<Ayvkp
z1cO2f;ZI+9^<)Fi;Lss?d3oWb9S#0|;Wc}HYV3s6u-mhZj5ionU%Op>7aVWE)8d5K
z?y8b6y+^CL#b18t2&@4^nDsfUV!~=&i+)gx-xDl$PNw`M!q#cEpMF%ir=~WoIC$ab
zl+4}rVR~H@8xyx(M{i!?J!uQ|7Fa1H8K0}EqGECCzMQPA$6|+$--7Jw{BgTBb^S=N
zktFm$`8z(KU83Y8{p}A7tU6iY*4!Mc_4CMm#YTvF240w!Sa2Ci4}1rXyg4^`$dGju
z!=eKyEPM{5bwv=gQ8zEPDO1{LX4a?r^08xchxNRL8c8Crd4^VDH49I~7#l|JG6(?Y
zP#PR=WT?TQqR*duk7#%MumiK&Jzzr4O_~t3q$72!>edhsvIV4kA<!qaDSF7_k`jiJ
zi(SI*AyOlIAEwC0z2(@MGwsOZ7*mR3ft!6f{k-<Rr5DmF4XJFEIwwa*Pi;Gw8EP4C
zNlWVjY>VuL*dR+jg_NTb{g|5@%^nn}+_K89^tbmwO7mWL&j%Y3rvdIHYKjF*mp)^t
zO%rby0lHWRgH&~;R(Ay|l7%DPm@K7E4?E;GjSnhga}>_{j~v}ZUyf58>0vr;9Bvz?
zWgo(2N`_eA1yk0lbN06t1!k9aL#?OZHt=5VP`YyHxmL4P(#KAXA88z4vk4$OH+c>w
z1axYW(+FEKS3>wmM7%L<-<1q%2XL)?T2@oAjns!0q;Nr_iO6zc2qDo=p6p?>L^qvw
zTMxHl67}S>-i`i>Oz<sI_O^U=DC=7mzHKn45QpQ6b6z_i-;Y6zV&jHFw9(SgU5%Yp
zQmUn4-gVsQU)ulvT@V<4S9Ro@(A$B>y*3wUE&Gre`f+_x)%{x$b(u4^H{Ygx&MBoz
zsbc%tZkr5J5B#&}o8^$A+f#dtT(YS+<W^QG?Pg8O4jnpx-(pK~IrVl~6Ni!_Hb!Tb
zeR6T?8~F9-&0!6!d$b9pKn4MV#0nkqZ1_Cwv9oqGyKJzlUtL1$q5+BDs+Jr1OSTq%
z=(o28Z&Z75t2ipx9>JA`afQk4og;f>9B6Q+ao+#tI+@>Sc($^-TEJ5c4GqOxNqvpu
zCu%5DC7=H8n_9$iE6^Lc*Kf}R;BW6D8Z|UetTbVjlrwpP&%EQTOI)tFWGE}Y^>sza
zeZ<?lJO3VCh73x3czVjPvou_zH^dff9pQ4jt`&yx>>{4~!6vxSUnj+=dx^<|7Q;g}
zr`G;lGhBK2m@zTWo;7ABi7NQ+y?bmMS+IEVQ`t)v7&uDMT-4br$0tAZv9z%{yEhD5
zM%n*F4NV$BEhtPlrhIW!tS>hP%E=<eIx#>vpm-ZNf!LwqFK7yB{Pz9(zZNW6(kItz
zuiqIBJv>w+B6bChe~2U}Ik`O-gNFnw7RkC3Hajq|#^VQj6{rcZ7egt%0Yrf91<wbQ
zu-vkx2cUsegG^I0^EqVSw!k}gRw2fRkFZX&&`Y{?_pWYR^{6LLw!kT(;bwG97#toe
zg~L^>WD?xRi1oqqTiqzwSp+jI=_WKfy*OW;rX2G}1c$O*SZpr{`maKUP##jBbKRs!
zy~lQ~=RM6VEQV<Z!0q2XH(MW%nb(E#P~+p-N}7n1$#?pnI3`Tr;X%r@KpqV-BaxLz
zTLEaFV2bNN?%H$gSVGl~Lvt?pxxNG4Cf9)U;IdP)zqo8(JN082GZOu-=toke!#O#F
z^=^e8Wg22OMJxc1Dwwf=L)=0hnu>t-#1z`R&Rd&0vK|$B?1fJKIO)eA(sK}~M#je<
zt*@^Cm1&({vvi<~?VI|UJ-!bZ@iN#wD{<X|5wGoDKL`pEUjpib`RD6Qn%8OS{#dcS
zxqi@jr@aXm9mgw=slB*)`1S|(7aG_uN-;9)cJ<#;ug;m>iLk!9bM(Jz+L@W9+v*ZJ
z6_zcVqiu2ZY^PZ}u6kq^)$KI2Sia~{eUN(U);Z6rX%ZJk6wSHiA7P!kwXVeMuc>;b
zS4JG&WK<UR>(=6dcY}gtDC1SWcG;r3@P8iM_A7W_HD>)RZMYS&O*b>EJ@eyuvqpPX
z#+0PQD5X1MY`xQ2qt#k9w{62l?jJgG|E%H@R7L{9F_67G7rG5$arkQEtGwsgkWrw>
zAn-FL?X{eA8q2-2!z}J(?kA5#G)$&&Wbt)Fy~@a{-1DCgySOYF(@#(DXks|!layK%
zUNvl~5iN%pR)9ZKtP!mmXUv};8X5~5!f4{NlMlKNJ=*Jn?u60vOo}W*K1B9k^kcC>
zW|wHkMdvOW|I`i)3YtAs1pss3zQ3tpsZlvWY_I+^H~ZfA%aDQu@~E=9i6Q|};APp$
zF3VIy^ymG-Fj02&I`dqX^NNR)2$tEI`3FdzJ8fEHxEvpnALE8{(gFh1d3dja&yl)t
zELsITzGL2Qa`sb-au%p@4KZm!3O!?hJG8+x=r&ksG<l~_of5ZMT0a7ij3Rq&|2zQ3
z>lncRXdnFw%N(CUp3Ns`$Y$QmOr!S^3$Op{rFwMeg5b|fy;=M#xF=*H7_ulJ;Q@$=
z3v07(q;10$1*t-{)>T?ctaPN8W&O;;@v9UV-jvD_f3F5oKf@$*>lFogh6oN04ZTM@
zNZCVKdGcvIFMELRwNqf6TWc!Tq0<u)dt~IqwtleF_Vkd;Dd~)MRJCV#>JwlNy>U6)
z2v=KJyny~P4?YQ|oKh0x?dCR<+Iemp^|CkX#}N!jYD8CI5s4(Qh=Qd3!p^9zlF}5A
z|N1JUdYMmS5LG&EfL{4j&)v-ARhXKZdhPtNK>_!#%yM+j7Z(o{aKAQuy8E6<7h6l!
zd8A-N-`45fTzi|)B|scJ6=4j9>K9rC85DN%@;5VA%Q1UVHQ956mFn>EZ@M%Is_m>W
zrYgYu{e{X2y{vZa+4iDAjZ=q3ajpxyzuerxV%Nt(4vDu*LJfB1HhKRjqCzLRL6ld_
zrk^+SLpslLd(>f3>_1DV`KPw32}@3`-f*Yv?UrQgu-D#iQ!*V2b`(d2Z#GJO{pfap
zc51bLs-Eiw@3z2V$f)omz|(#e5Z!;d72b0Sbetw^sjRu4?$Bt=x!d+He?%2@P~X4w
zroVspJ8WI@1x9E-H_gg`9vVve$B!Rfw1OEk7nukoqj)KT0{(FxaqsPAaKvxkm4#If
zA4DB~Dk?tyrk={fN1s0r>wR-hUx8<2^jSko>mFL1+xj7sP?o_4MLvJtj0X%({y(V=
zdV2f+k~&dm@pp87H9BfvNQman`XaSa-;$1Q{#>|oq`G=kT3Xo3h+TK?|NU$SfX>XU
zLM{9Fg7bwn8y$D4{y6fiY$t!XsP0lydYy}dg^7uYX|iw9M2N3HrdBgP&WMVNx*HJC
z<*dHBZ9}LftR%&Su!4Mf`s`Uu5Vr@JZUr$xvnt*|h(u2-wC&hYkV!G-jiO;fQYBR>
ztdg*&(p?1wi4YPo={dOnuW3!XGYJRzs~@s;AK6P7Q^+3h*-I#V5bV*t_Ac*wdwjcn
z`}bF5Ik&2KIk{CE#GLv=O%dVBm<J8{EaL(}r0J<;<N}z64${_YTwZ?u?3xLzY!x9A
zk`2zZXdUrZge3`oUy_qE+0HKb)3i~Lif<7lp#7Q36f{14Kz*IGx8LtQFE)4UfRV)n
zC+QWUXuVmvFhE*HWTgU-<0>&DU;oxp+jcuB=e&3R7$7|*4TaRN;)WklZ|&E`zhh!z
zqCiX{O@o3ECox2HGNge3VPVtqF8j*0);@jmBzyniqXuPewxKhwF1$`om&~wt-#+1D
zr3~Tkk3k_l2@$MNC<vz;vI<xTNg@i$ZJ>r`WXhtdx^7MT_3I~7ND`tE{fY3Na)t?o
zH8W1N3hm|L_E+Ss$r`kQGDXa6Ai~Tj!=Ra~O+MaGw*qmH^$<}Xno{Y}LuJ^1duNKY
z{S%)U?+jU|8M&%;;vD%^#j%jfgtbMe1kj6eq8L)Tm097Odw1IZX#p6S5s_TDK5(U4
zA}*0k_5?c(c{B?pnwSY9P4xcP)|?lL4_8(}!9jYpzVZ0ciC3uGVvE#6%d|&aHCDdq
zah>FWD@G$Bi#aD$>^X!H=^hp@o`Q0X)lP!p^Ej{ue;iSl7SU2m<E8qP63P^s$<wt-
z-``uezE+gkFzH<ITcZ`tsmerC!t~+qncdKRqf38m+`M@%ecv$OtL1@yVXwcB(NCDW
z(bV(+=CM9LlUE!)9^-W;^XQrD{(Z8aP1rF!$C43e@e>dQKUzXs0Z1>#QzkE4rc6+g
zo&q3)+X^Tn!$7#X%@`U2OwdJ5VOzhRXPzUE$RlXVu)@E6M}iTcftyIk<(Unm7lO66
zZg&CJ*ADH@2zB)wX7YQ2H>0G-t_6T1i+O97pu2K`0R7Z2L}}yyhuIR!$m`dx0d=}S
ztEe~>k3d`_8WLI#%uo1PhIxAq9C#R^lT0A+pU`fvX&o=$lY|Pg0s5Vdxh-AX-3JeF
zs_Vk3qUkv|%!rqB03ns}>eUnZmNHF3Qt7I>RTh-4z310+{K^$8D3=+8;9Qn}{aRe<
z+O=y}MsQ_R8%qtbt7UeBfcEs=yG|`yv|#-@fGtjDXS}_K9qV@X-+yPcGvGB-ls_X*
zt{gvKmC_D83yQ=ZR|;{F0smM_2BjKay<x+lNf%{0nt#Vx=xwme%F3#Vr=5CNEQ<sU
z1=fX$&gDQwr>0i#FDWzMV)6|Gu#ef<Pzepu@15OhS_=9~$(&PGHlN2Y)@0b#<>SPV
z&&ii@e?l4-H)dPE{^g@_`*q=WrVaiBLe(AULLJm)a@%Rgl>5a1I=G)`&D}EGQMKgy
ziLvSJnjBCv`Q}@?wT%FVS-wa>8iI4J4}5QKd=<H3Yjt4*hTjyG2&5QO5;H)jC?r<p
z#=6vvuRq4WcXf3gp9*9k!VAUOVT}(vE?l`Xj)wUiBh+v_khc59Dp2qRW<lO;v<})*
zewN+SN8DcSdan?oKeVO(=c*qQC!hGdEp(!4m+OlMiV(M=Lhy9Jene^F(m}B(N-g9U
za7;09VHl6(GtHqge1-FhTczKsWk?)9U*;c=O%w10QCxGU8_65P!n+U7KB_yXcu>df
zsV}OwYg4^A=Up24sBhoC|0uUHy_RI%_;BzVC9NihdnH#c>3gQCaNsDf6F@Scj`TiL
z`ktXeIRQ2S5-uSPT?CMaZhs{WP0cn4Rd7SmyRH8bp$;?JNyN>E5Hk&bwQ8j2KL+Zo
zC)ty;H{it;S{uk)(Q$V<zLfwfbN`%w;mr8F{2((i@`yxviF*&4@9>@T4WteSYQY(S
z$I^Qu$XI7`E(-mLFd#CFx@?&+W+G_#_N{c$&4vAyL^;ZZz|%91b3tZ9{#|<NX@}|4
zu`4hZ7Z98nIMIuYjD~eHV-f&DQO+xq7Be%WX>de_Nam*FEKy&4V@}^Sw01%SvYrQx
zY9HIeRYE6$MKF%tvu+o}=xkyf+RdWA8l9>aA@u=zoI$!}Xp0!A7}_1^Pqw#)hu2+L
z=il+l!sl#<?cHfCqstcSM}~|YWqNghgqKgB_J)URqk$nI$%1PsI;n)IFMALUWGo2o
zs~l;WC{#J$F_0?%ab4jimA7=K-b2<m^QEI)L+nXrT{$gkhrav#v1Wgzo%rv#98USO
zmw`HcgTt9>_}0=dt+(k_<#FwQAOE~;t{0s%2jo=`*DWT^<xZ?%Eg=c@#tUnvH;;DL
z@#68kdixfwPje1EbA=yXj&3>nJB>qV?#|*_tiZ;l!<R)m3}vb~ewj!Og8HCvNlO1$
zEt?obJ=++#0#$62!I?}Gv!Lm@wS%2q*2j-5U;t`&BLbhq=SED0Q0*Wjkgq~RAQ0}E
z&wLB=5ElDp3u`_;E%-Kal&Ohc%=-DJCcFF%I(p9Qcj(BDTip*g42;Mb@gN{P92M{i
z`gJ6-aKa5Kqv?>uuZP(#4l&I*N8{X1Q|%^?0OUgJHgCJze2!!Z##S)PDClWn26Y9M
zLqVj7wtdi`K{Hov1I*GKW{e1o;)t9(qvwrdsp-FNhc#TSs5zD+!QFuWjE;@Hk*5Aw
zI4=rF7(f<=8Qz{Q{RF0s@{fY>Su_(+q`-=8J9T=OU)5F?V{r>)Jq?1o88nHcFxs)%
ztDhdzTy-oJ5z}<ywmq8$n1w$3c?EM%NaA85BO5S?VO}eieE7PeWfgRRkylW7mx^v(
zcKJLz)Pyk2Eh4MNg0Wy?^!Vh9lE76SU$&?ZKOXrcqn}A<!X-tnF)0=FfxG*Wqa{kV
zvWR*w0&&~Uonv8QC$pAT!~~4yFas*8RIGHGY<q#>Af>R(ed2=&iL^_+&`w{_d}%*M
zs|sNvX@|?29$4ScDQl@}S2x>!lbhp;HeX_Fk6=y2<vt7gW1$4Yy%)bP7R|fr_ZFns
z>}rIQeUm0l#LAGqO?uH$qg=2|InU%&bnkO}t}9!ujH=)FiBw#inhO~>g|q2hboAqv
zkDo2)o*#KV)2Rknl~GCmdWGQ6<J9)=!)~oqy`#adlCI6ShI9pl0&YY~5Cb^OshRl_
z+*f^h{6iPzsL&5H6T#Ca@ZzDWg&_l!1tylD!SDnn*6X)#|AXegeCHBHo|pmgnvVR;
z59(QGFm?`rx#H)hVI2tk5mi5G{vV#s1Fq-2?c;W~D615aoN!V?$SNyIcBm+6AQe&}
z$!J(vA)(ATom6I}NwyY3Xc{RhMYiy~FV6kkuh;!N=XTKl|NDKv*Z6!s*XR2B?*7Dn
zq1Ll#XX!sL&%8Ni^8B;CXmf{sE^vqJaL0XTcW`jespnZ6gbIl;B%%?hP#Qa!q=$WG
zoMNUIJHQ`r=?ns8w$=C7XJ1jmNI^sG!Nd>n+;Qk?()XXvQO32oMg_{VUPT`{rC;m1
z_U-h)lpjBNG9P>oVx3r40-Y@aG2Ek=3ZMXaCdQs}1$m)F9ho5z<ru%MS@Y&^&btEb
zU|XfRP*(7ianMH{og!A4pYpX9BrIE?+z?T|1~DXlQ}~de#$F0qfuR@qNb?>4yzU8T
ztTjS1P&v>G;*Zml%6Gl3NlWEp)4P>#F+SRjN~!0>eKl~nf^QQDsS+G0fZ-hu&X`O^
zOKknod<`*=*h>9GDxl*a_ilw`KI}1<ftNRL-i+X8KdpSTuX6Le*j}_66riuqSA2DO
zoS#oIvBO$Nh7g!`(zbD$j0-nJ0@(GXoJ28FV2cYKm-vIJCeDhS@w}p9BG-QD$IdD6
z1_z$@OYo$%fK(QV7$deapo(^mL(~gAhN~?SVA&TBJ4&G~f_6Mc2tlqMwT->>n9^jn
zVEwss{n-h*AAEv%qUBQY>XmFJM+<YWmCJEhX3-1q-@B~}bE?c|h3rzi9Yb3vRo|UE
z&55znCJ?j1*UJ#-ied%8j>o}^13?nwEAq!|Om_&RHivg<QO=<uk}d)enzFU4kQMx*
z4DWsG($HW_=-k?VTYm+YI3f-0Q`*aO#R_cOf5qI6%I}&Uy2`72dZy%sQA<_rl=UZ?
znlh|SW^DMOxn57MY6Bx*DoZr3lX~;u@9<5az)L|TWU}XTvF~ASAsIoT<rAt>tyW#$
z5?r`$5TDAPXZ!GLl#DE|(kv~j44pOhRPL@rhcbEf-?Ju&uxyCC;L?5j+9P%?yu;L+
zY|>zcL8w-ECvbeaDsifM+2O*OGna9jN0kVugdZQyHS19aedv8KEbQ;n50zU1kY!OA
z|5nz<=jC<m+V${=j6nkjGOW;GT||V#;pdU0%9+=2-G2$CUrM0~)LruYc{<A9FF5DO
z`GA0tR#)sF@NR3lwM8ws4zhi0vgZK&%^kT?yVFqAg78T)?al%tH1t}Nr?oAxe#>st
zjr14{P+n$^j+zD`IG^;n{4jjk?`s~W7l1%QW8D+{nY`Oyh)kd;F)NQY2NR&*%o+Z<
z3r9&JJ}(js5>>5bY(@0c%q5E!foQV>W9{^xlfgichy`DL^jQiUFoW5amJFt`X$2z{
zlU?1mU2>hoKZzWeT8v9&WtBg_B7~QzJ;oPh6xHONE1;rCH>Y{KELrka<;6^)CDVQk
z0)P|m5}O}yW~Cge@lqW}MbR}WN%u*p_W#aGE#<7SP6&$-wv*=xYy_rd{W@@7uI$vE
znTd!NU9e1}V4o(#K3whJEEP8Cw(MY_Q+R>b3(aSf1LX4w_@V$rGbqN?ASt{TR4Z!e
zxLJRVLJ@~773zYmLxDUKXjYj3=Wqa2irF`~))=IN^l`kU&x>R6Awq>Hl{Nm;Sge?n
zU#R2QNe)_Z1y)T4O(=*m505+y$W0G48JI#0c86*x@RgMC3@7`2W%^$xaU=|C1?%C(
zMpGp=w&X}ra4}#3uZev0^~{j4Fg2L7E<;yr`u1%#H;+)uh@upynVFg7>ry}t3B<B7
z3m$;z45rX|d4A*33%a#c+Sx^Zx*$IEktfmHRIaKM-(#C!-)2!83li(MU%OL(G^=Vj
zJk%qDD@>l8V^z@I$hWD}$wh?E7nEBduzYXHH~3<*v74LBX8Uj^(`4raT7y5Zj4tCR
z%qyDBn6XFs`Mk&({V_cdo9h2C6<J9KW696XE7Tf+WQniC!=f1k{dC>3p9_6Mtn%n%
z2j(&3mBZ~|Q3mCq03+sXngr1!mG!lXYdL=WO`{452i^-TdNHz%_29)q?@_#8dKGu&
zVf9)LA_OKbAG?WbFiW!AoVg384?(aw)btjL9Xhbhw-MJ&<~ZIRa|EZRcY^(W_Toiu
zwS8JrGyDwa#a0{4^)Jc3X76;8nq19yz67uJC7u|0whuMaD!oHF&|ZUFb>+w=XTKx8
z$SztT7tfvRK4?&fKLaZ=ltF6`v`qsO$n&J9<BP*(k$i&>H6SqvCW3Z>fG2es#!-3o
zHbR=ocOq5`apwDLEeBf0DyY?=uBh0~oIN}L`j+-&++E?}+mVJ0x>W(^%n^kqIzs6x
z#TYtGyfEm&_+HCO@3iNq3ev|3Jw>aH6O9lA!lL6ZC@zsYI<6*V4>B^g(hu!Ea6s(o
zFrJA3@Z*w9e4E3!pemP0SX2~_9%kMpAG)pCf5wo+rkjEk&Pd$z($gETH_swUgiRz)
zu)R)9bW55d$&Fro-gY7G%FN!JSbA<;+v!KAl+e@gA`#?r6vefip;X_i?>9Vp_z>wn
z8mFXNt$KFqPZSZ9f)BzM6H!R()~(|c=WJM8-h0ZX%`G<K+s6K$>-X;MqB=hhz%4)p
z+Y-d?2z&w~HfdLZT(~`AAi(VqVFu4R;(HZ1$*O1m9qY-aI2qpAl~I~p_dbJaxa8NN
zP|N7J{9*$l+*Qm;5O7b9xwM^AYpRq>W=3};`*9%Yl{$0?h=}OPb=_E$L5h<OMW%rO
zeppHAY7$}Bc*<$2z!SjqCYL8}rE0NwE<Czf8YjDFCOXh-cMn+j#|lD++j55DaR
z)H;2CXq=gx)M;9_ncm>;avHB>&8JDGE1WuK&Sqdeu=;MDl$ZG^e!wGdDFB3q%K{gd
zEZi>zEC64E1`+pAAv6E^r+j7+JRt2LeB31R4oAa@>8KyAmmq==yA0r(Cv5KFJ~7N@
zZt3RiJP~Mx*+yOJ19+)=vz#iu+B^Q;x9@%yBFO$bnbiRMDB(Tc0)AC41Vk6-L!2bi
zv$MBzAKl?S7<U=$^KAfOfP%iw>$=M`$D8mMi|kMY+q?>U2M27kdxixw7(}n1fk}h}
z{iPmvBg*s&*e%7HK`~6CUdVymrNX#hFA^P?P6!h4^qjSy^yxXq@tt6wN4v(`E+y^(
zTy~S50C@!J8k{xPfi`d;kzX+~p{Zql=oa?i81*GQoYbLd6Kth+!K700QFXWBY}%E4
z9`wq*jXoCtP*7almKBW@rD@;RX+0e|(`k|c(1)>w<2nyXR7#{>2A>w$<+8Fc6y^(8
zQwJ&&*k(}>H9J6$R3iHjwSuMQ7bMI~tHSZar8#zvvq;qR)1$*5D0k`7_hS|etB8vz
zKa(NRIYYHfJCa)nyfe~nKA^u4K#5dDjAmp^6od%42X>tdD<D`a5x64($vEnv9mG6y
zZ2aZGGxJ?s3jcoK3N#L?C87+`t;-1auezLH)FiJ0CiQJsu(lJXKAZDQjea%Nn=Y{O
zhJ3{ZI%jCXD5t=aXq<m1>(Mau1#h&Px@sV4hdDs9lLvfj_B0z27E`mLCPAyM4O>iL
zp|GV|LsoE^bBO5+mb6RV-LE1nqX{NEHc6g!+rGarV~7)pJX$o#{26{E*%4&5ZI>?M
zW#2z^8+Ho25bQiFDH+8Dk5Amzz=jCV!A0dZ@greghS!m2Eu&Pm)s>}yJOCgmwt=*#
ztfHlEXzkHg>^dl@_!wv6;?!%pa1G#ax)A1WWoMsc%>(Kr>SlOQX~&h7C1{aSWY3yK
zl(&C3nwJHPA@RS@!@U!z>FCAJgls}opm;|ZIdRB91*QJ2e(Oh6&TK3BrTeOke99Ec
zRvBL*=OL9LfXpYqQ`G2FCy1Ez@ZsWU*K?z6+W|R>(JYUFsXxec<Ov@w3?}N<o7+RN
zU~_gRqUjnuvX+Ak$wb}AwaF`j)V9~$Z0_HHCrq>g6maDGl0(lUO%VAKv()^p<QidE
z0I2|w7f@0O_9i?}y8$D=mG>ZRQ%73&dw1?s!YX|jV0z8>bY86&87@f663GQ;yC)?9
z!>iKjjQILeiB|vnp?^eWDC4+)&>HfuI2WOw^~$;_vvQ~y(Wt0H%#5XY;5Eq%fzRi{
zn}m*tO*5CyOzz#UUl{>K${`*Pc|*n**>Fv}FNq?ns%_l`-7TRT@>@WfLkQ!<FQw`2
zuElH;+{PTYU+5q7ssp9DF}wwkE6G6btcjfD(MVV(#*BJl_VcJMiaSoLBr7#FHCjb#
z&7SeR!LI`^Ap-T+TxX7*2|1YBhI*0u*z@50;cM%@tW`M4ZRIHZSLK^Du(he$ecN8(
zPsAH=G^}bMR4lUL>;OI5-YV+_21A<4Wo-@oBX6fgi><D!Vj4DVC`zt)DEBPdb^aNr
zo%??bR{37~`se3;Y!z_D78{5)CO4PI&(92$X(0+u4y)9tD4?)`<-BwSQ|oJ;u#;K6
z!fEw$2S;N4(`<a0tqE5`5Cl43$mgE865idcvcW1NtUR-_yj&tTaE@xkG?@s*6kI|9
zFovjJkfy?g3B1<1^8}a!;GP>#o_sy<NjoGXVl@NVGO#12ONhcm3(b@t#0d`@bZYvv
zY3Vq|i%x(%iE!iO!6PD{?h$;O62haOUaiGoed?jWv2|+9If>V(ASsORv$LC6nT|X>
z05k+!HQgz=1|itWLTn7hxqz7}<loN|jg#g<M9L-NkC+ZIfzexJ5o3*@Vp8mT0314D
znNcSx2-|u71E?K31%F-wEx?fSRB&$w)sVblt-$?%JN+|hg+e9L&9U;A<(xTlYyHHA
zo3e(P3k0T-zYO<$+g5A{?!z18v){KL4gLpSJ1HjP@l6Ajr`vv%=ld9s&H2!3L;gmZ
zha&8{qzuEV3#y}w#iXf&f3pYO5+*VK&rY&ktlnE>5uDJm1(oJy<!u^{K}RRYhJ@>X
z3v39MLc=~`qd6#IA7|6@nH2t}5h{;#?YRox-rp-?##2UEL<J5n3pUeAdb`HLtHF)+
z&d1k<VgLlze2ZFx`FCg&+2hFs!Th&rzt+N^xOoKEmkm?7xk@SX`lXjO3%EwM!jy-L
zAy&3s)YSMbjW#arwS_upA5nb-@qp_vK>N;Lny5AE&+0Xl<T0T;?u-M0sX!-%nPHBf
zkayqP$*IhBgO#s{L7$m|oyt~vv0=c2RM2!|bEJfjy8Lt74jq0Y8&IM34*PuSOffJ%
z>Y-<gj=p`aWj-52c%W1kE(}~$8-qc(VbEm}_D)zwCnf4ZqGm(1@*-$2KN?8YllaPq
z^&RGiM`TfhQ)^wuX=uFqDnI(h$C?`1w{LGIZ^)bjQ5GKICRNJGlP6t&nEnyqaH|Bz
zr>8->h>L}6$|CEL5}06`7oWO&+;L&=Xf!GJwEOi#RM6hvcs;BX+yvE)lA_{Wx2`ns
zVhygH=7b)KCnoI)Z6XB%O@rV}I7bn95LtG;Fw(gW`RPN-pRwQ>JXgkz2~65!hcLEc
zpEhK7ettebs259Iw=^d*G`F7g^*{R!iWCGVMf;i<pzXVsnJ-yl1gqUCewHp@{v7qQ
z>Z5OgFEi`3ZI1kg*uF%{ivr<_jGVAZz_nYO6wiAZkh`e`CvJ45lt8K?+wp)lVx8-m
z*5m@=Ev0dkoj5p^M7w5B8Xubnx@iJ0#!xpsv@~ASWP<8Ywh?H#5x{|VZwoHTHZGD{
z@tc!gS3!&pEKh3w)|Cp2;~={|>w(1t`*pn%U23kZ&&DjOyOEStsCzzHQj1C}hu)Pj
zE$VnM9;|?ZsnE>BhWdwwcJ96R-~6b1YGqK+Jbk3(yGw9}78hZH7J+!e)UE)X2=fFe
zmB2`(l*P*&jY$g@9M`nvy$BOb6E2=N<nb=p>A<3Ty1)7L;JR5oz{<BNpXZ@!m>ut`
z23svi8fChykOb>R_rUZ->Jb#U3`P=oZd2&QI;U}IG1%HbJH~Se1;it{k6x}!95_!N
zWZqExFp7)Q@~TDjC)KIowZIDRsdFLEE+1yYi+o2`r!b0&dfvp&k%!ICvqR5AMUV4d
z-%|a+D4!*KM46JqWQh1u+5SKtT>wfT(I-(Y>=^OwuYSoj4Xomph+iZn<V1~W4*fgR
zMrC4wD7QjrjPhe48Kv?bckaA=BdEwU`quxn0IJNEVMuFQLB%gKK}40oRQ7KM?&3>|
z1Oy!M27444?5Xgq(tFgFxXlrm!-WS;iJ!Zo>9fpa!_Lf7ji}pFf5vZtf-57+I^dCu
z0uF83E%~^?beX;xwsK1I>y{imhWh0sg*t`L#~w01aq|XMKQpbXPo9Kw#?CQ`Dej|r
z{`+XgOL^dut{^~lY*Nh}93uAgnsoeLx8U38$LzrnPFr7^w!EZiBd-P^fIzYnbJ|h_
zUw!+nL(7qK8~f)Ezji#5#)5(qO%1bL!9;o1(8lM1c`O2VAQ_M~HJ6>IucfSq#)*uM
z-di+_XSHwNqrX=qPpuq~YOp07RmFW7G$CP9n=)(;7cSN#08ZdJoE17B#OO;|!wQj4
z=#L;ifN{Ql>^e3(Q*}Lf&dBdiaws|_l}c|p>ENJu90z6)v+)=k3p2o>X3f0c^{pLk
zJ}>L2tSqa{OWu6c#_%`CI&7mD^~A?}rB$@2Y*~t8`M;hEoMW2$a>iA(->)BcoI-JJ
z2}sIIgcu<GdTZ-eQvAa7b89I>#hQsCgdya6v=>-P-TQk$jG5Nx$Qj3`{6*P*gpwHD
z<{=~mjUZEr=s|y0IfziNd2?ch0`ZGRkZLcldPoc*n%Yvx0@`YJ*Q--Z<N3?gkfM(H
zQ)&hHvKq8PrY}sWNg4NeFLqD}=4}@&pr2{PVd2keZyAdMihU6;(W~+zYcT@&6t{Wy
z{4p;%V0yPhVaNmb93TDrJYD(t0(`+*CuHbZyFnL46k|44EBG0J3AK)_)FRwr5y4Ez
zwT-{jwog6b%>)$9qHF;p)Pr8o-M*UJb~%$mdt`!@cSufLwtRVa2ngoGu(890ttV8l
z1E@@2%M0Z(gL`dAYJollvI%tH{*O_cLgPmc4-56WGx)T%PS<Jsm(UaZcG8&8z{$5d
zDBWr7SpN&03G_Ka3QN(%2C(_YL93c0f+E9TvM&0&(q$InGnU<+OifisUx{A=A@}|_
z!z-bSWLM)&b#HMJ#w75+MY0IQ&O!1aKa}}$hC?pBz0i=L&i+B3__`>&#lBzf#dbts
zFr+z-DjW`uxG*@wKNoEL)7+q@tY>n`h-AGrD5w=c(eJCsW#UD#z5ah<Idm4nnNs{q
z$$*v06PMy~XOoT}-$+ADe?x&G_ZN0p9U_gJBkjC2`;hY73z_~!xW9kx^+KvfZivkL
zLFtR*8o!@gC(HIkzQ}nLqCi{~8D3m8jX6LyTz+ItsZ68+MvCxLS|)mOs`x#7_AI3r
zO09c85je=)*tmhn4?uR}_Zm(gtY2+zw%oCKuT8(TQU-@b6SnGdJ@A)=t}PmE(9kf>
zDoWu1i@y%~HW|19NxFYzS5rOjGW5N|_JC}rF&QZ>6Tukj**of>=X6Gq#92#>cMvcj
z(|?@eZBV@S``<F%1+^lV7j2<c;f(bq0Y})$frB88G+&kkG$#Au;d-3^X~k9M&Gfzh
zCO+PtJ7o6HQO9dUfDIt_vZnG)UFDmm17F=55OB$5;liC=4)e+C?Y3fTaR$?mo=Txf
zpQ5WZ^`=wsv4}@<fGjfv^%K7-Kziy5-DS_mh^vn{!xVmJzdB(b5~7W@+UJC+&mJTt
zI#pCu03jBj`Ah!!t->du1Roh*v?8dHtgidda^S>A+95KNtN~((^$xRVn!W5N>*Xf+
zQXTV#$h9wLrG0CUTj#Dv-m6!eKJ(a$6CLxUATpE~=hl3EKVnBDxsxVAw#vj={T`wh
z?=Nl`Y$uU60__uUBJ|k0sIM(^b8`vFOxK;E*G9qL;~|^iMJNnWL#eOayfJv;eYY}C
z1nPNa8jFiY(0z-M0q(mK%*-;+*z}mhtN#zqI5>Hh-gK?tR@%OP5Y2{deLUB<4K=$#
zQN~~Sa&xcn5)6uwRdt|?1ATP!sBL6WpV<Y#QrSdA<A4rlft#Ce)IpEwsQ=2M<BMcQ
zK=?ZuGr(y8TPbCc^-*^JH0(a$$SB?44z-9%z$Z#xyl{^?*h6={p`O#Lpp#KGAoxQT
zt~y?GZ)VAW<^yls61zW2Ql2H1M`eEe4k^s0PQB+`Y>-s#qU;u=abgk;bnBr<!L@&z
zoY0TI!#9Cxy!Ivkx-Vzurf^Jb1q==`rm6MtK#+HXzQ)FRzjlo~euaEWD*1eHVnKfX
zL8L>(5FsVGhUMoE!7<9XJIwVTEP1c%s)63mD=RYz6S<Ra4LCL}QW`rNPEv(9c~PIG
z*<tR|1}kPC^II$)k&#f0{?T3&+BjvVb|-11A|l52&bRr{53vg<>{o2}MiJ8(7mF_(
zc)L7Pxph{43#_k-R#!x;TBJ?LHUq2C)BOW|PP%dL-r0luwS^Q0_3RDFKqW6~>H4!2
zJdmrpx*Tt0&^~3}WJgCWlz3i2`cb5G;-lbmc(H6Vjd;$Bm6?p+HR-EnjSK*uKC2E|
z6+zq*Zx<^okvj|?O!DcW)y%pV@asG7BE8Rq){NxH79oj<P!?h`4E<)hN+0py5KaX8
zzv@X@lfoDV<u(LdkZztO+&7Ngtb!y)Py`(t(r7x@iHa>Cx($w68vY*!Zg+3FuSXo2
zhc3Cz>~Z8NAPe+UX%3({3N06@vQ^mDwq)J+^-*oR0g=_yjE{w>_+0l5%~YRl4R$Lj
zi;|86Os4ogtc)(Cq54M9C%}nm`H2CO<IbMtWp`f3?*RJ|_>A9@qScTtr1(rtTHe-W
z(~R?OH`|c3k=b49pI!a_;C)u>evh6dTkW62<1XKk`mLgS=4p`YkInmd>njvAitn(X
z>oLkeFfXfu0|x`K4n^mP2V4o5pKskwUuhdcUp_@g*HEq?@B-s<j5BwnYK5gT=9!>s
zvTONnsG{AgDt~L8n@^vrI6I5E5T3&0kPaXKtUqh??hYM0woz&BHJCb4q!wI+){Bxj
zv+dN>hX1pF4oCwWJ2Uga)<IwhQpHn(O}MP8vu*xth2;Yq0A_CoKkyl)5DvG5!=E@~
z|A^_TXXiH|co^s+DmISj{ogw$556oK*qpt=i>9r5;q)WS1`cX%;gO{m-QLnUAzS1~
z(X^AGpdPAb(ng-l$q8EF@eIA5YhHVBOy<TiDPZvH5;5W_d@}fS;%N&XL^Z?dDtMIF
z?S)s9F>f=Ig6dz-jIr_jT^Tnr=(u426WH*UO*V?k@XC=+3XK|Vh@p{ILe_6?o*nJF
zr9^ANp-7#OjPIGHD^A%tIE+OV$J96+IbD8&RiWnklI^nBO{iCahp1%hNiLM>VA(c6
z2}IZ1al@;DcOT9juz&ibrS5vn2Nf2k9PHM<ZCfJ~lUtK=dMtEuGO*9P7#Mi-NcL;W
zADW}90PH(6O2ImTaF`#o2V<rgKeyub{fZ8kNT35Q{ll|C2Qxq=Yu&M>1TOT>K3got
zIfYM)>~ePkd*Ze;SG4X|KT-q?oXCHHdtkoUNInVz7DR>pU|tzF7x$hUcE|@M$k_`Q
z5L$0QcgW$6SbAo7=f3teqn_;``d;F`2_(Rg0^d5%6c$YFb^;ez55#dh9C;BT!o}ZD
zcu+NFmchxTjzx1@u5?=B?5w)Nk(e)-w(7>{=)Ymv&}w(Y4F;$^p_d>rwzs(Sq#eKg
zt3vSLzIE#s=f0K7?n?7(j~?wqkXG=<(b@pPAWHbgDf4nbA6&m3xisqTUeILO1Y~J>
zK-0eWUQ?Pr@h3oNEk}+LK+5K}o}QbU>eYVYvU_|_6MnL7+hx=9yM<&3p7!f-ivErr
z{|^22%iHms*(b$w)y<w5>{jend8G2MBi*A)EKV$re7a@sgV>9YEJGgcv$!&MiqWX`
z4Z7daYP8<3QF^;u;btAq?egglkglIL@=NI;4Z}uFmlnl_#CCjD#Hy#y;oh$fmAQwP
zv6XA=;h5TY?_PiUe4I`c&`)$9h#6(q0am;x=)I*irL~lQ0dfQ+pL%l8g#aU$3jf&_
z7LQz18nIXdN#7Oo>zP0z6kJ6weA+i3yTf=12qk^zGuUq+lvb3+q7P=9QsfNumOv1I
zSCToH;ob4sKO_7R1w~<a4}isQIN~qbR^<ldvXY)b-1eE`>4gmf=!V$YW@R~GyDyXq
zhU6%qMJPiKY`ECn$I30!B6jwbFr(OOW_oi(On|m=0Y49d9qQ#W%dZB11snDnWOsh!
zkv?GD@aXgT+i-{DuO8|b-o9WS{|sLSWr0jf7p$r6ye}Pkq7@O+9xV9h_u^kgcb5h<
zxUyG}RqU(CepCKZZrQniBl;}<2YuWtQbZlDjpNxD537e-+waH|g)#e|tPe8Na?U%P
zm6=JOV36DzVhwEKOz|BI#*l^nHe)|@Xq&i#;FUzPEhARs95JgBdQ&Xkx?<(wdiBsJ
zKV%yqZbTGW8jg(<7p`zisHARSu`qS3w6()QbW;+&yec9HTB00fTB{9GbKb<Wn54JQ
z0rZj9aj6vmD@d9~L|yvBnoRwi4{uV37pm&y6h0mIAR$~;g+_5U%$`uk*doyU%Ptw%
z1Csily?d9H-_@X_lHG3zvam5+bNRGEe)Ij^Gb_W!#@qNa(VPCX)t>C$((zvj^8>O$
zitzmN>~4x5w)DB#wC!?>dLga43-tz~D;vzf%$n=f4gzfw?!R70t?u7*6F+F>nJm3X
z)o~U=Rr^BNsHOU7EZ8bkFJyR%l)EIBbZWGfNSnSgeUFw*)cV&K4v=A#XO)$G8)eP2
z)$Q~&E8lPbkeY=4Yn-A2i=t)Ao$OA+4_ulG%u(?ZW6qzofICr~F-bO#795*XEj)7R
zp}!m$l!wSqscqXUkZ{DIZlu1Xw}}-eSO03?p~E?5M(DS2j|qP7<~hg-w(WKB&gk6N
zkim0D=%cPdIEX*!!me9BuSV~))YH?EiWX2t38A*9=O9+D3iHjC+{ykSC~bZRwuwyz
z_O(1!bl>8#vfV45>-o(VIcDz~{&}cbTmmD@x;7`ysQ*?^UZgAy@w{;A)T!c^Pn*W{
zVw>Gw)9cKnkj6w*I%e$H!pg-=E9S4`X5D@~0x1zYv5_V<Mw2aK8wVI=f|q~+QL(9a
z=!GXNS`?`};PZyKd7%J~>rI?4HFfilm4e|SK@c#DU=4(GgFZc?;K(0y2Wh4R5#i!8
zss%Xq{6MNRR|OG--oSwoI77qFejaLH@$nehl7!GO<myvkx|ED8U$m|=%bMqqmzOu4
z^0#yuMKRffrw2d|{!H@oalSmz4pk4r2JuFbJ&&+r*I0E9sY3FNaMD*yxUWBb$)`m3
z@~N4P{QlSeHKP)FZ7{n|oz}0YTe4o`Fj|V0Zz50WGPsdcof<sMxlw#qssni#J$v<1
zAbJDzX%8C1Q(OCtI}X#)P_bRR``H;Lku%;OEUpWXaYzynOwuYgv3<da2k^FIZLoT|
zS*Egc+WPCPUo%uPaO-*C;6d6F`@%#In02X{P*IWqtdnX15oBso&`_eHlso_~ezb{E
zsqw&@Xpv;|7an8uP9k5>##Ctf5I*LQL8+h!=p9>FyPiLI(CSoK=b&+-jxse}{5k$D
zm0CAL!=#KC@G6X%)jNIqv=lL3Ic9&FHn4A*&Y3y$Vd*6>LQ!_pAJ_Zko!zkUF8$i<
zh*{yG-Ctk$(>e#nDb3!cPp!D(=H4cIt)3)wFZYELBlw^!*$!274mHWaxjl{gVJOQt
zT*?7=&C&1FlFmpRV9%f(1q_q<M}m>TKPNX-tS1|D@*sK(->Z+>V09RvXhJ3gO_goG
zJsG9LKHMEGjXcFWELtX388^Tm;Q`V*&rEet3=eI1IQ!Mp_&00vF&N?60GwS1t=dNH
z#|hC6Z=0|;^`ch|MKQoV6hR1JFIczN%+COzNMU8)uV}RX{nw6^KxPR5o4DODxepLD
zI<04@@EpLi$Par=O9z%J7(n2pCcz$<=i}r}!f*h+ljREj#`Ue&h7S#k$vD@ddGo=z
zBrLv|cvX+548o7|Z#|u|^DW!7NqghDVk}+~118sH*!)ZfU485lmjB6Z`t;oTd(T_>
zg{mdcWS(D?t0j99@N4?!(<As+sP^v=S`V@3zhO=YK=_}twsL7@G@OL-oM_3}52LV{
zH<vYt=m!%|erLpIi*?xCBli}fglljFwH5pn8v-~A0<Qq8lk@QBWy7=zJ<97jg9Z#>
zk@_8um<&e%#Uo&@C5DP=ot|&PbwQ4SsbG+b9QskqJ;VJ3)LoB_lAJ47&%LgyaQ!Ei
zuH}O6*7#*grWpYyExob8j0~JR;_t+}Z1LT$qjhJkse|{m9Ol|fua?NM2EbxTeq5{R
z)29pQz^^b_^!CG=P&3)lAY382L%Bl-F_RDn4T{>>f74hcCi{MaF~j0q@HL;Kw9rIC
z02z&c)vY5&8_BtVJ5>C_Tp)#|S))`JyF~7g)OI5CXAMQvS}iUBPf$=+3Iida5&!Z1
zh}OWR+#a!(p|Ql8`qY3~Q7{#-LYFUJHqT%kytu#=CVA<Dwg4QWEFw%GR{Fv?JdoR!
zYuEf>48$M&VO4o^z7*IdW|1dAnQ21#6X@ZP?;9OhfP59Tr~GtY08>RMH@D+cke2s)
zaohQh3OYOnnXCc54Tt=MTSz+8$GTPF_5t1hI~WEXsk|1gw|8}2o1wFh%o}ud;Q&*^
zZevXhjI5;m0mB5xsw0XdV+JK>9*|c(_91)hFuhMX`5oIOy_Gd3>N&OV-|z0Obm7yp
zN5vzL-;-{JN2C@LeA~Y2Tk;=22njyabg(P-OOoCfur;U^*`(^NSLtxeMZ+!T%$WcX
zzN6|*^WPcpyz<+LkPD^kszXi@85=7aK%B&CaPq*eWgR5~qEpslb?(4$s{{(!K8aw-
zxGu33_@V}46~|dG_Q_>kgJ6U=D}*cMrJwE!wKIf`N29)SVt4J?a}66}8Tr7gkD0Cc
z3o08t3msCksFi;H(7=TLm6g#l<%J@lR0thj12YVuq7N2mD$tL<#r0TL2Z2%#20a#{
zk}bfO7Eey+Ey}1Yek5OcwbUApKqg|0i3Jehnh_n?GZ`qR7)HN}$Z0BCbjk{r%rJH+
zi$D7Vf=2zTU>)v@FV_L`pb+OYtF0~raB^~FARMdKU6?bVF_sO~fT*XKS>MJ61Nu5v
z!utL3sII#6iCKlGPY>V(7fj6#X}_7MKmwJwTZf50-rH>E%z+1|zfL9i_(w)+i@zbz
zg@8T6;S+Lc6~~#FY})huAK*FIU%clOsAeE5%ecgBtcfkISaGpwPUiZFAlEca<tq}M
z-|u<5Mg6*&;*dmYZV<IO7h37}X}P<>`Xh!h1!_8eW=2jR4z;q~eP)z}e5H3Xq9A_P
zyb**W((4ho8Fc41kUkvC?;fQKLo}+Z5=1T7z$75qf*2Js(B8_@EBf=e8aa8MfnJ^k
zzy_+S2R%I&T7^9FZKFP{6PO*4GS&dxAGF#-hsS^PX5mTUE5$dJs?P1ZcyFZ$k9W7x
z3T~^cthpBrGs99lw__oMY*uUY0Z5F?r9+HCdq_tF7s>l#)v#knn$C@eOsZ%QRTy{%
z1weq=jlztydT?%ZeJ)!Ia61CHx_@Zg%}Hivi5J^51H&JFa?!aX;xth7g5BvkLB<dS
zO%N$i#LSCL>p5JX04=~GZx^doC4M0J@+vfq_rngbq&V>8P|3(3B}1VyM1$P7IoQOA
zJT!1h*wZmQHU_d8#h2)u0sO^%o8r{a`oI1M|4<>%@uw7gDk;hPaCLD9evWwOh+GHn
zb|z%%FcG)`gs+lB&(eu`Z!soh;FyR9k}Q#1%x>wrUjQ{aLVr}<Y2B)o0}EtjFBk5-
zP{W%zR@7%WmvH_---Ud|?sRVa+5Qavp$W`=@ZcH)siKz0=d009zt!k0@Wg_PM~UDr
zu6DLpHq^hUU2d9Vrs3Gy40P|C&o<@o-_W08OWUWb9k*nS{e93Zr0BS@?OCkvk3M2(
zQW2Y`#Y$QOgLu#Gc8s2J{)fH0ro^r6NYp}V&fNjoZ(WMG6Ur;1z;M=<*U>{Btv)vM
z%aUm4S<ez-o_n@EV+e_Iiy;BpFX-WUofE&QjLQIloVX)5+qi;vN=2LhBaJF6q%{7*
z1@Y!}$1xmp7z9^Zcn5-G(lT`#wo(@j5U;&y+v(lv-TsFyhf=Q%>h(BGDD*$w)70tH
zM+2B38bY&0<4e<9zI^nAsI#g%Za-0_6s~s4W{(Y$aVTx9`O5RAxW}skm{hGbRM@?=
z+n25!C4b{(qpzEB_PH~-I>so!{kWJB2_f2H8Dt=lSx~%Vt~`1aH!Cm0zQ0h%%;1xt
zp$1@0|H2Hhx8C{o;+RA_k;Ud5ATribm>xz2s934xNJIDois21WjQS9z-7VL$DU6OH
zl9SP79^C2=-PG`Qr(^C8{Q=*@#Cae_dj$nu@F1Yc=()-qX9;BF+gSTU;Y$W12FSGL
zlo$>TY3aJ`&Q1L)dkA<TKz~BiCMRR?rNU&A?cF0rYB8VL$ip>3$;QU58drlvq!L&H
z?BM(%ji`Eci2eg&ovqM!L>3IA1cf+;<Wqg0PlACPm<F~MD&!TFDPQ{^J??a}slIpF
z-9CELJN<72&AKyghl))TU)&isqJfbUAqnB=CPFUvY0Zaw<0)rNZpssfa06MTO5*G`
zm@s=r)I+soMXRybv)|A6>^0c4mqnbqoAbKYl`FlP^i5b&PwXy5E=;4^mDpSHx$te_
z=1tW5VFA~wT0l|CvN#%RB#t1l$2CXLqhv2??@v+nt@5l`$8&=~rj9WB4Jsj{YHX!o
zB9y7KV>5ckj=vatpwIBq*b|TvV7~-#esc+^J0EywSQtBe_jTzkF@_Z&`9aY<7+#YC
z6&D4E=J91LiG6wH*A~--|I-5GiWW>PPKH0SIhA`4<sh++QwRLbjKb;<Ka<~grp$>~
zinU59C+LERG+gbvBEspUqP~Jwv`=$a>ufXnEt&Pl+9c*uUxKAb{|YX-$0oL9`a{GP
z{I|$CX0nx~|4o#@$nM12bMM{>msVT(n-T8j?YKe^^;sOtG#CUZvwr@ROeil|B?Y0)
zu&n0TChIeN=94|(rzH0R<-^Zc9do{T9giW7L;od@b-DSpWr&UXFTC50NSB!0TDBf-
zQy-o<%SAFByM^AoPe6Epe8}eK`w;Pn5V8!Qg`~&#=1OrF6;_Pd7}vltU%3xs<PfNs
zosWvmV=`s+g5m7Y3aZIe{tZHDEq4I>Nv_wisaTUNuWQyQj7kzWx);=*PAoiMg9D2Y
z|I8pkmdg4ZfKM^-V5-QoPv1|Py*)in_d0tn4ZZT^2M?6Fqd5Go&WLs^UA`)|)oc>&
zk7`@zqJE6h!(eO1);`+#nDU8u!rFsoR3lg_h&MWvrO7-ak`zx|O3kKCWt>LFqit-8
zUHy|$+n{WMbH0J6IF?6DmEF5H^-g`S=qfYJ{QLB25oSIfO&xK)YM3V>LC6mVR;A{}
zIh^V{&uH+*j181w-Bp5478V=OyI8rU$czz#zN<a4>c7Eo6SFNSoX?&)Q|^Ay(!J5)
zk-p(*keF-_K-(j-Lw|q7FzX<7;jId{cylA5a>Ox9>Jfwl^t*p1_Hr1fmmPfM>V_7Y
z%JUuFT3HU)e|BSYs8@GLBBPB>Jl;gC?7@Mge&>Wdh;j!&_O2>iY-e}B;#{l&9h}Ih
zc79#y+r3fddeKTtn?kGuRxdoHd*06W50v-{Rpdh*SP(#~#%FISeejzWDT8N-f_AZQ
zn=C55o>Y2{j{R9-$z;)r^HHlNT~JZ|xH{#F7ElW}+>t9x4*~fvd)ugX=m<weJ!~L;
zML1zKTZeZA#3O(-_OUk_@}3y(+REdT$ttld;Z9WD*a|F9U=&YX3c5o)K435Qu=%xC
zH-_;?5_!(6dLGSxPrdU4-H=Yn+zew;F|&5fk~v~Tz_2gu_A^Vg;Z@1v1-6Z$8Py|%
zix~njP4leao*0@3Q4!DtpXiT#4TA{#nvyHKn`&c*fh9yyQEK52{r>&YVIxvaE-Wtg
zt{yJbJJ|+wlUYf*V<d$WzmU{HHdNcr0_@~6Eys=CB6eOD5$oy0Sm91&$sSFc8Nv^;
zl3H>_(6}BHm?9%XnjTl1DYj#%GN=#VQ?v^Cr-v#4m2;Cr(Y1GUHydYm*>A1wxMkQG
zTAlHJ`_vLh8&nT5R$qs2`BObKbyat{nz_Y)H>}p?^^W_M3RDQ{WCDs}N3Mcd*wDS$
zHZhwU>lNC@-qJn!;)M$zh7_EDblktBAYm6ShM)%H34ch9+YMq&>cp@VWGy67&pe0<
zF<yp<fa?*-I`h$(sC;1xR#QVC^bo23gfG*g9ttVNV6M26a+u)~7$&f3J}~;;LU9e1
zZ8}3x=NPUxKRQLEFhsH5d$-6YCmIJYRUIZ0-c!<tA&{09Sx`MNV+8QB>jZ3LRH8J&
z63K9$+4wT+1J-T_84I8ITn9V9!Ek7f5YZHbBuDWQ3z@f<EZ7}yS)c$8sOGMC#6H}(
z*TbO2X`LnR;5zhY{j>ZPBs*pzC>z8KkCJns^P?J|Ryv$MKwOSm!^JOx$oz%rnJ>`r
z@C}V~hSj#Dn5=qZ$s`@d-<lds1DQNqWR>EDa+q4EuTAP9q#gngu}QnID)}}&0oGj5
zLm*2o9p|;PmD-n`jbFa3T+8yU6`_Wg;;k3$@A{AW39q3BJ>+sOoSc6sbc5kmSKrPD
z4;;9bJ@Yrc=Gwimbdo?P>yM<tT+e6ApFxt!57VflQ;|YmsWMu;xc47B>xi(9^CZ#J
z6ry8H&&g@Sw<bx@&q>`uEyqHsQL^}lG3TEB?l4!hmhDxzQE6n|B*h8P`RnHo*)zw7
zMxraTOGGhQn#c@?SUFQmgSLz2U+jWEGlee8kn0#3gmH)Y5+E>M{Qyq{^^<Jw;QiA(
z%OpsZpIO3^g1JZ%WwQowjlc1|MF)Pn8_oCd?y(<~Tp2uHR*52h;r2{(CDTqMNs}W*
znN6+9X9t~YTr%?4SE1~s_2<~PC)7ySP`&oEA%4Sm%`cz-Gf63u>EAS%>?~p&i@u&G
z^@{F9tb&9R49Or`*0B|~`;Z}>nZ3jEiRWA3`|Agq*aeFhizT<LLPfzVn>W~{M@Rjb
zStI6gXe|c06=ljMixw?{!*7cdFKqx~4=LR1UQhPjy}W%an+QVYt=`Pdq(=*&Mg-zb
zKH0?`f+2c+Q~f@Q1#6&J3`6FXl5)Rou!!1g9C;zJL-FZOW${Yf3z2IKP$Au2zI}Uw
z*?~qs#yQ)4B!*IBP#!Q7WNNrgK_$=qkSLr$Ex<D5YD|kfk(}IxSPbxr)H%g`P;u7_
zA0?}Acw64r>E!;57LC$@K@$pZ<>d6jaACZLl6d=yF9#1sY-a(+P!*`X%EPo<Fk^<4
z30S|q@eI9O1B3jbEXB_PqX?e*I?Ckx_uH{{l1<=uWIYh!foSdsZ!LpR6xq)y2|>Oo
z|HXxz9+#K43G9@V3@PC?v@GPdJ%<#BY!)v*=O@Y}g|JtAUGPoXnG#-|<R9dPgcAba
z4RC{A;VU<wazuQ8!g@aj{-e-elMCn)U?1KNYOrnd=4Kc`%tw^^dlMUP1GyYEgp|-c
z8>D1Z9+)+V&oXoqEja%Hz&MK}4ZdL`FO3dO%fMh(yZ)uihY%oTP`d8TthB|wn!YM6
z74^z{2_ds*|JCd$>a{C>AbcD>Co<e$4+IXgnfkuIzdUo?={RMfRPv{%&fKDnv<c8!
zQq3S5lNcy50zdb*6iPA+%j^IWsCr^nQ_PAFI5X_7FT-k>LSaEpQ9_Sk!#X7QqiVa8
zvlUtodn);&<dw#wOI}(jcS2W1;b$WAJv`-`?p9v^JbC=s*2TqV^x92a2Vm5&SM|$V
zFsIZ~Wx&GA0f+uMefp*BIY$P<;S!0{KGY!AA>JQL!@L5y*cblk{ErMEp(0^-%PKZM
zCNS3ebkdTHj10sKNpIX9(WKCzlRCde-SN`=qqdG~@gi4dfO#31cMyT)L*#l{zk2!d
z3<;m_DsUnd04g+5A7KsRwzx2nt$4ecQQOI`$Id(wi)hR1Xq{p%0iIaOU>)2bo34h{
z)m-Jm3Y`wQx|xH*9x2Q`Uj?MZKU!)=!+lrv+DZ+@6(=6LgJ5j1+C@rnM{!K_W?Jab
zt)tlvMtWp9R165R2(z+0SYvbmMqwtzSC}&msu$`u%v2T*8#XNQZ-zi7SzZldAuEJS
zQbtp4@(SW1e0V04VW|Q*s*+n60$GP35hg?vsf-8~!luZG0jl2t9jZXC2(`=j3@aAx
zU^<?)#K;uH2tY)sD7{;%58F?H^xKw>%0b*zcy08yfD?*<e5f|IA>Wnh<z~&A5t4As
zknvfT2HC?Lh@TW_A3eYKg&==FKa+|hxND+Q`kP;hc){2oa5_B}o?@e;E%?#<2Q1os
z;p>Cjw<8OEm_w5-qvA~+=v2Zw0HH1@+9V@F*~+#og|2G^A`uH8?Y?Jtw>X&$rJxPV
z4isB@h+Iw>(ndu^#cr$i0$t$8ftlD5czYq7s+Rl31^k^&Er`Fw*obKt&!_wWY?isZ
zC%y4FR+E`LP;piN#wpiXcLQ?IQSLN!#Xsc+4U9%EwJOboh;cw6erSYG;!StBX~to0
zWM#D??EywmL{uv_OdAO)IiJfV8z80+K{8=LAD-}PWnG?(VUh4DW0BDQJNM_tpC|iO
zCmSN5kkNDb<EIXM90^uUnUH(`zVX<xV-@Z#XB#WMi1o$w?DPmk+fCfL-}Pb2f<-^F
zXZD_X>Lw8aet`j-CT-NWojTPE6h%a3&z>#2di^EmO9sy5zX%pY!GiS#+$r2utLTX~
z<yC`ozlD9ckx&d#gmaL*Ny_qJV7N5sqNc$gV`N=TL%t7l{g_!zMr75hZL4cd8scLy
zu)tKcUE>WE(!Lek5vB+(X)onWIIxI@<MLHi-(<_!ALn7tE1Fqb402FiLBX#1S{kzZ
z*29BCyr)jx_40M+cI`G$U+u`d@N@?y^z0<hX!{CL$m4c;IV&r=FePZ*@jYed+rw3|
zs_`1;(}r2k$odE%uiNTUU(M$2bD!yBWD94FtoC5ef$0Eh3}P;&M9+7vbjp%jB6=sk
zia8JP?-VvW@g-%A$78lTh;t8we|LZ>M*nsEFU0v)_r1mhLZotbe4nRVP9lSmrU&<c
zi2T&y<f66gNvq`ov5n?7JJW2y3C-88P5n?LNa*IxuivS59=39+g14pfE#9=TY}AA-
z8oKi5BQ{l~x9CSZ6EH89`I^?qjZ+R&>w>a8bhCRk%ELAx7m+Dx^KKt8<l-x@&nQ06
zX}ogL-D;I14rSV2MmADzF}fkE5yYnOs^9svg{`8o|5!7rRAzM0JJs*wSY$#MB#S3e
zn-vb+D~kb`5YRnxJu_UIG4RxC-TH6;CKU0jMw>$OOe(y>ckjL*RlC=;C+|+Y*u;m8
zi67<w-1hH!&QyWCav;<utyHebMnt+CW>k4@s?!21#-eEy&r6aV?_J$%0Y<V1B(#!(
z{@co>Rca^3*KD_ZMGCM3-k}S6|9HU;y3ZAb`5MDecNw|0UEn|J?vUTbLyE4Ro(8lj
z+>3b+7izW%z7<tkKTvc|m_PZuO3``9C6IVBDzZUpWKgt_)(#*!XNc6=D;CM#Q(13}
zX~OA^n=GeK@3DC?bU&ynP#Aq*h@TlW6J0xn#7PEmiBzo@gdy;ga0=z}5_#lHUIQZ{
z!OS6{K#*P&#f&Xu#UciV(Z=Qe$+IVMa}Qcid~nvnY5_eO+(E^|Z+YMY|B4cXLkfLx
zyNMla)6DS6E>+=D@AGZco^mFxF;5_Kp;$f*HZ-gi!!o9C=K22ckjaDW?6)(^U&O%T
z#y63JVFiSA?PR0%M-IRKv~{toYih)`%s6UtK#!*p?-Fr%xM(>Eco(Q%{J>;S1jGqX
zd&|Lt`5w`SsgEcvXZ18;J8<*KVdf05f(OB$6_q&yh{tHhUm3Lk+y##B!=m^32X2nO
z>noC?4jmr7HFV30WD8rwAuVDSt^7IWW(BE&hh$IV<5st4h4PRX=Bg*PGmJDWMqQll
zCL3l^WL^MffEo7u=>2o(a%lb##iXOrrTA^Dq}1c@X;go++1`W3Q0}UTCP*z@uk8$;
zdbk93s^)aq@PXqi>*jV#n0F!D8T$^!f)Tgu&*|iWE`Pc|EcL{RF;D*~E>t=7Fsb#=
zvaCWYIxWq~CTxN7v|B9e5MRNtYQZF+TIIb{JFq7n{r()>!FX*zVsT;blBn%A-Wtd-
zJboHO@OU_}+S1BObh`}AGLwesp?Q8Kvua4p@viAiw2(#0)DWRC<I>w<oFGo(<c<aC
zjTsS+Q&Lgu(&dDc)$G|TXFi{6rSVU^ssW>2!f)aL$0@{3g~EmE^Fl_-s(}@cEn>|Q
zk^`3f_}pX1u<oTVShdT!Mxui>LFhPJsyoT^)lm<XzARqkGQN-W_?v#KfS1;mTVz)D
zF3XQzXMD6ow7qy*sclE}K%@oDE(Pusob}JlR3^fmO%Rm`t1f#C>=yrOcNe;>!?73R
z;!-ub-;54G7K1ieniii_sl6B~u9y;aP*|8%Aidu;%@44g$0EnJTM5G?bV*5`#g`b(
zB?(hSQKsYnD=TBrMqvwii=_tJksQ<UIWazwH}@9jJlHcVATZFzAm_tDBW@YCWpCaM
zLsm}wz!6D7h8iiaiBw0gepUoJ;74O^BfhCp<U&hQ4aCk;T9;9j6rW*;wAMgfV@dNy
zzx0<@d|WNoT3+>BYIO*pyMu$p$<tHm7{%?4z*N^tw?5$~F%W^%XSEn$7=^hoqp*d)
zy>e+VL9}JvGp?WS;?sTv5wZC!S1HRfXw{(}FKBi{iW{%g(#OX~=vXjzwdAV-Ejw=@
z%D}V0$Um>D`n#D6?GvyItp-1acc;(59Ht3ZsKLdrfLPU?C1?b<vuulIO7o_wTk@4n
zZF^|W&%$bjj1Q&5^#URCyw&2V5+hr?JIx@SQP&c`-agnk0mmUm#Ux#dCW+n>8Ju|b
za)TLX=$8_oonG`QAg(&`V=>cr@}+*a)|gU-<T0|eo5{FMz~|tbSaXHEnLB05U89|q
zE&jikY}#lpjKxi&733ot1%&>^v8A8?axA*jgle(Eb$Hj|I6QCNx&Y8vq@5CjsJK2=
z7w7Ornl^8q=<1<Aa3J0!XK$VfG+;?sAn(Cb@4){3)a%2Kw5r%ha=Du`gIRR>Ke*g6
zKyvN={c|=~X9&Omf;`vQmd^B;MHIP`Us`d=Y6`$Rh&CO3+Q+U`JACnR$EV&geDXx|
z956o=4H*I=hTr|+iFXX!1O9-BMZy05V-FWiuh4(9qzkcMmXlHv(P!PF;Su91lZg|h
zoM6J8=5iDPYKKxOEm5S+6^Hpx5t__8y85ZgM`3n+vVzKQ=9#j&M-K0@@fC~_h^;9t
zwuq4F+H^*nPd+>%3@{jB?`7=hLkBEsM=e{`QtL-hL`3O2HTOS$Z|xO_=r@k<5AGzF
z1fk#7HK8eEbV4S^^+nvoCzDN!@y_XwJo;GA>-xdrG<f*}{7<AKpyCj@A2vl>sg{Ay
z7$=r>vGfeWXp>zW5G@N=1)xUrHIDp*0k^b4cz)?JAcIPUUUlCX9T*ImF-DDMSk;1F
zoV|c{15{+Ko{mmy{v~+4Y}G(JBEvo4PQaKw-}M_h^Z?E^0%Nd)ShNE&*~B{6QzL(D
zM#?k$P8maKv{0iWrjbmB;7LPq=wsPtQfU9iOw))?6n7{vyy0S`0xGsYG!*2AE{u+z
z?n`P`xLzX=*_Ac6KT%_}RZ|Pvym>;>XA2t}--m-T<ML2!y@U8{A>}|>3Q7<(Tk{sj
zf<DH@#W57L9D-+U3;jNg&adAp7zyVZyQ&=e4@$E#);M+}57@XI7#<Lsq9jwW{C=7M
zaSUW21fEiq3XpNT4j<mZV)DLyx2;>ZPJQeyw7xVTG}OHEwg_;*2fw8hkz~Zs8Hnaj
z??-N3k#B^^h`fPh^Jrd$2#j=frz{S>x9|kAKN2NTldC@Maci(AS}AU;1|=#}8Tqby
z4g*vm7wiuguOb~mF;MJ65;kkm!nxyn!5H`y(sF?;Lvg}3LmV&$Op_pLdq8}6vAm?v
z6g<>6@`e60GlEuxV_}i*1jf++n}nr7HOWKPMJoYz*AvPYhIqth5knB3#sfYAWaERt
zSrQVB5!d_a%RvHZl_z=t<N*n+=PwmOEC#e_R|$G7M4N<>&z|WKeb6|gMiN?{Ps}Jg
z*e&lGG3qv66!XqfgGAjt!T(76_AnMhrDtJYMY2{{a#u^Wp~`M<)#e?iE*;`)Dvo!U
zoV0I0b>>X}dkaapO_+URR78MEf=!`)67m&FMUWV5kWRl04P`o;FM)Bw)f+bgII+^O
zR#<*t=IR=Ap+$VZACH+C28d^V6L~+COc#Ed&f$q%a5^TaRa(wbl;S&z#WI{X$+3FR
zV8e8jEEHLQdV7zxqC7NB+E-tE&k6F7&H|*PFZCD_cPTNkKRPHO1W0&L46=|t@1fy&
z52JpZ$KlH^sy73D-Hxs;VXs7rV691sPaY<eE^KAP-ZNBnRqyiM`e;U+Jo#zD9;RBK
zgA!>mmCw{Kw?8A*=lj!U_PlB~M{Fb`h=qxX-D6=9kcnb-n<m0MoC5A2SPg^*^soRQ
zY-)I^q~|3kWDL9qM1wtR1SbkKUFPNp4X|ZvFj5CKq#U}5IqG%nNyw;(#5xdvkxs#$
zh(9L~`g;&=F{I?9uqO_KmYL_avj2Fy&-YK<{GPkw`T;l9phS9yeg+2d3LWWJA|A!$
z{+AcA^Y|@N{g?(mMw6TT<5RwDQ~Gx+^iR0Er{|}cPxNAyN7<}?-4lfu@u6ya7RE!A
z>|I2^>PEoFjJm;IQz=OqfPv+jyLXlY!on3J+uC`S{IO>=%A@2Gb1~F!Q5Ml}^fG;I
zE=3R5|85LVL;4hP63aJHA;7djahxMJQ?Pu+v-P&@5j}j^CZzPj+i%~I@Bo`M6BVBC
zm#bZ4w(gI#h+4%-1$1+xY8CK}%?N4b7N<Ea&k(CC5P0#$t3@Iq+z3n-xsU>qiH&UV
zDmkZnb{x6r0$s^CfyfRpiQKe#b7$S>R=!k%LWoG+N<}J3UX(9zl{&GA0;Jg%P+T;O
z#C7RAnU<$e5Udrwrg&_C6_of6i6#ya9R$cBV<FAEw$|%Ps*22pQQ=JFkI>{6<8l6y
zk%gV<r;+pfty@=bLT07&FWI%pf7HfYfufX=LU%5Vx|sH&Y$lZ4MqWml7VKw`P&qdR
zBQ0FKwqu9E!mQth<aAa%e3LTmy{7c0mx<WU{Qq^p0)d~<S<3ZG{qf0)gtd{4oorZw
zkXzL+F(E;h(p(+KWU;-OIm9g~9R{0Wh{K$HvBcnDqgba}tncJh=Gidq-a_0y&a!X^
zM9&n#I+5u9hDdrP++AR71Oq^}h<JEPyK<ydt7P+IyqitZP{fu&F8Oiw6_u5*p$C96
zbQw8fZC%2LHHH;8W!+G^pW32;XaCazG*kO_c3!O0$2SiJk0nJ)^AFCPv_?ywJJCyn
z7|Mi}z@8m&(R4k$fFmPVglH`z(pX+x%V{zF^$Vh>2JtI~Cf=w+@r4){F%4U++tjFu
z?Pu4Q#|4kY%!l}2%05Nj%ge;(+f%Z7%Eq`^%THyNHs0`d;g6sWb}TL_JG0{TG=b$P
zA<=@*-VwQ)t{#Ct#OB<YZ?A)P%WhPeC_vpR8>k_Fxt&-9-rz^>*LP|S%3S5A17jlS
zDThW@gP>56w20IIjlT|+!W`UF!3j@WUuxH;%?_PBHKGBVK0+CPTZ-4t1bfIp%<@A4
z$VpLLlb@GGzQ-O~5<TF>MCa>|R+zcu6^cGG{gl`Yv8J7ZY*T3HlbO-e*RoL<YfQNw
z0LikJgINHRTG707eP{ux)cRu;74G}<YwL=bxX&^8GcsC7|2JauX1JzlL-+$PnfKqf
zqzTxEh!WB2)BsyOD=&Y1G(}!2XNOvU8LB17`WVa1C1W>*He@(_M7KezXyFio5Cu_H
zE^7EQ%+}yPE`f-6{I^~%L1))%*GA0T$`lM6$)RSY<CuQuOdN??Gz6zlC%-xu(*(W1
zpGIA>A_CU)=WAOfrIlolTIu-H_i4Z^I=<bT#EL-s+6|Z{^vdaZdGQ%t19{w3AsE+P
zCUgBU6Z4q+rjh|kRKRo*LT%71>bg7~09BWiti^0GX_x?qk0se>!Z9myJWJ7hg=yk4
zfWqKs-mlab`W+Zkl)|jSk{OX5zI8pQ8&wrYzWgBul;BQMWfD5DIHXRiw#e*@#u+Uj
zT?uB`H>vVZOo<1}7DiYs|ET`Kvjr+*HzXP2wZdTP4p#iJT^hOJ0%QVt1D80s>GB~x
z5AH?R&1MCe$VBGK`~)09oKj=d;$Nr;vk-tzA`;VJ34P||l;`kEmIX1G<&x)X>E)m9
z@Xm*x8^MS&YrO!hBIX{5o_9dtb)*xJK(%5C#!KO-BCI??hsuK?#k0z@>xr(pcsY2w
zJj9V(biYCvk6hb@z6-7(!hbIM7El%O3qyI1pJE1^EV&rwhHVO*Wp^GrMA^Z7+xb0(
zkodqCo?I^gAhvzPL3v$_6pQSN0tc%ZrpT!Cq&C7Mob>cNY5M!icpn+Jpsu#IR_<ri
z!{=pXy<A-3nRi}eeBKhI<ub@qua7TaUDUt7<6F9|9Em4`=Ac3QFlLjDs3bfTQ>0+i
zU@O4tMaxzKDN!SER_#y#F}w0H!RTsE&JF+;@MS5u<ui93nQHSreXn(OK_nE<r=uRh
zn@6Fc`whj_uB6qEn6!n|K6%HzeMEAdvB|4&MgE?=C?8WhK!pAD;oA3i@=~b*MW@oS
zW1d4Omy&R>n_994bt|kdh3ZwHZ6tKHq$L@>M>+P`#SkV|2HO}jNYBr=vfTRh`}bPF
zu`?bO7TCUw<-N<<CfpUqO<*4Q#f#C{G8-=Y8@-e4Nd#tOg#T!iWfU>sw+4VV^&E*^
z{;;|E_C2w<K<q<-EV^aJp}I$V#1eJF**A^fn+#sji}`#ya1g<ojLLx8#TTuP3e!)Q
zfeEGz<P2wL$4s{>nd?<}ooslE*BYyB8o-*2R!ufIouqhYPKV)%bF34lA_vWW?Qwo&
zn*l@&Sl@<VJ=Zcaw#t*@1%3`FOk+L>+L&!5OeN<jUp&Jf7atM!ZuPgnr6`${5BoMs
zr-F?Y(uYEd*KB3KQouM*;6i8vfeVl3z7~-kk?YdY4gZ#W8ZYsK5(OTFfeO_nVIOD%
z#ZrpzV8SX8gk_l7V%m_uXh^iu8QNLkuBMD##EdJ4iaW#JCV^?N^%Uw>Q}aab?O>W5
zDhA9_>xrRqc9g65m(bfYETBZ60}jHDgh7y<*t)_H4Ov;!P0Z}rZ{P&{3QCNlZE>vn
zVuOQhWdp0}siBS09|gY7*ibwXWHIu+h4vgxx(SmO+mO{jeKVMz&dPi6!Lv-D@TZ`Q
zApoSXAlc)@;cYTu!hAHhV#U^_%bFes3nl3;I=d4F!ZO;CmRQ((CaU(BMkrgJzxWGW
z(dx}e+1e~~vW6cHON2r}BMVHysAYyAscT$Zl5Ni6k>aP>3Ab6Og9W@jf^;y$GxCFR
zap5-0i0uH#@Ru*IQ3qaT))jB5W87+Xj=szB{Ta+e;4<QY+;AY)q_EvADK3M=e*yI-
zCNo%c(94fwP?@}h06`4M@#_$$Q6Nx;M->QuqFh=Vb_<_gE^0byfF_9$L=Mzx_J$-Q
zOex8g#iIgaOYteAf;b28BJg9m26*-b^@MCBWtAH5T><NW315DQ;VsY>TXRdfmCTwM
zH}`(fds<v-2+tln?)x!a`^`(9-F#}aMHFT2GiDFUIJn&q!cYVgGnUVFZ{rmJ`=2x7
zdtVW2il~)|gR6*@VdxLAwzOf0dqv##460pHVM9eVwZf=FNk~s&2vGn_#|YVHoxJj0
z;?_{F^wtw$m|Qk3GwxR2MFyx6Y6*GBog3Fnd9TjMwicQ-+EwNcR?PJ5f>Q!@3Fb=z
zRH9np_d(J46aNsTobWhFm5C|j+nGZIyu+~rYYCHQQ{c{!N7$3aDA0^dqtRT91uQ~e
zj13K<Mg)WuI3Q4d;GqC<Jx7d4-Ix$(e|z_c_;;q%PSCmnTF#n#LmDY`B(mdp^2i@6
z0{<D8G9TPr(B!-dh&MN{6b7tNV85mS1qYNW`fJ*+?<4u!9~@s=)ZQh<1`CMm$*$}A
zD-P3-ii&!|`Yz-nCn;(GQ#g)AM{_Pb{ood#N|8K<4wv`9ePh~h+30HDpLMH7e6Q3O
z7Bpe${Mbs9xr`&4&`R>Vs52lOV-ou!6Wc3-I)^!VQ8tuU3}&uJ4keqkhzUYBGqrTq
z=vV7>U|q(WRtzolShlpSomR;3DON%EmOiOCIrESzQf(=ksMbL%-qGU<nMBMOx+k}H
zhks}E2GwBpPQ1Hvy>VC~+R@g(nLZbDDN3weS{DJ2(0oX9AVYVPC(8!fD!VW7P-xn#
zf=FupS!IKp$oh=x#3M)Q!QykSWbgX3bpNV?54hKa8n~(BjYlzonnA_`#i?{$=4sX(
z*Xqygy%&Jf)Nm1}iuULkA9UKLy&OO<m3gx<#z!Yby~Du++Klrq12Lfr#R4;(vSk(S
zcp|Uaj&4<-S_|PO(!m~#t)ivd9TMV|dp$GG5T<0P__eP3`LzsmvqE=S?&6Dln8;sZ
zUqhgHUS9rJSng9_;00<!{*BBeL!mw09O_bikc&;#^p#;rCLgw~Fvl~J!&S<!;aR)1
z^06LDyf1~-Yi|;oBd*;4<3Yl9bTeKj_%S7!tlz1MV-bfV%q(#q$&Og4ku26EeTIq|
z$BZX<Fwrsf%kvhXWJ6YzGd|z;hsq#EmrWYPjyrA?F^yHoe{T*Y3`tP_gGy(`uKn0Y
zY^ky*KEf~>Z$$BdM+YT~#<|Wy12Vwdg33+1&8d`>lmX$QE-fv6s4?DY+_)xO9hB=V
zc)kt`l)q}7JVuO<s5Rk;rj5+b%1U`==jfboS$iS9)agxSWx<uJPZthgYf#TgYcTor
zRy&~i*I%;fjRghRCjfsPvx@!VOv#NPEw-SU`^xo4;xA<Pvzo4ew#aZ}>Lydf55icZ
z2=jQo-m6DBewJ}8ZY9g;Az_wyOOFRg10n<Jy#sqGsBC;Yi3vQC-%ReAxQ6vX)_yN$
z@82IPK5g-iag3+v#8yrE&bjq&=}@5Lay>Ju+4MdMdId{zGKnnPy9`Me@ez87{lp`J
zt%+gM49Y_0LtegmB`F&O+11j=Z9Jz`whD3mFCA4sIvjIhKriu#q;B<E97Ru1O6-FS
z0EGKq=-i)qYSJ7)Z59eI5jn~R4{k~NNsmATe_36B)+nGQq453jvnZ*~1EHvo`E#E>
zBDlnk9*$QHrTR7MFmkHrxN7=2akUluiD!Jz3f)Ty$hY87p7ENXXL0_`Gg4mY?HaZ2
zen;V&0cm?`+@PP}=Znuh?tWl4j#TGi=JS}nI#KAzR^Ofj2Fz&Z!v{epdE@HUhH&$8
z$eT@#{zE}-!77LM&e1X*(#>s9lZ{4qJRWyMWKEMdE@{{*?vx|`7`zzMqfGni<9#Hf
zW1MjTOjT9$-`?!bE?e0kNoa1{wQI41PAb?O#T2u;Z$g~@&3(Oa$&x+ELEfV)UZ!~L
zTx9r|pDYwSH|dnpsz0U^H6K2ViMG)9`S!uQu`3q!7iY4t^05UELUv&>niRNzzO*X*
zBTErC1qD@|sldujULVxViNPPhYwsJ(u(n3mz2!q8m44F1`q^9#AUt>WB1ru~BF2Q;
zBJJinFBUEj(xkLx)6-vShHJj)0zTK1o|jr`#es&(w&+Pk@`(08l*|N+O*?i}zpsno
zC*!1G@*^SYZcmrtstWfX-)&DuAXgs$>04R2YQ_ww@R2`1NBdi@vr|&snX+?|V*RnB
zW{kSt^Vqdc7Y$!O)*C*-yz=(z*Wh8fx$dv>bKQrY$}Ka?(Hx%Za;j>O*1bVfd#DUp
zJj1%J;?yU*?21GFwbTBm>gUgzSq6<~oE>dcmz;fj*X3w$x0%B|s;B3aeO(itI)cjN
z3Nz)4AN??4M8*14Y)p)+<@qwL^hXY-nrXC9xT<?~!$9Q+ntuKDn`tOreete@4yqz|
z|IC(G*+bKgP#-s9Z)b;!+a>zz?_}z<jKdOTPKO;`N8TX%PMI;oj|!Z?=QQKYIV?hD
zb7Bq9#|vDgu2K{`1=s%2fm>p>r=<34!FL_SrYJ}hSS+{X{<eDy{q8Ni@WkE^>w%Jw
zc{Vqh?P|n?+0=(uXYad?<=Qe1F126@Mb>Cz<MeIe4X^5o3FN#r?}RlwMs5-2fcdqM
zDFan!?`xcKPFXih0d;l_8!a=R{)}Fmn0dZ`$LAeK<Tm~ZL(}1xqi(0Wd3k|U>1&An
z6vPS{=fwStXmthy3)k;=?@mFa*sWi`HdJHeJ!4bTJ7-+#p;%bgqJvh?p7USdkFaoC
ziiw%by!ZN%3>Ae^B=BL)WHYldd=v-|G+gyCOQqhK-_;h@(;?{n%3(zLHW}S;ei=`g
z<;UF^u8`ZHvfka|;*oBTESRBS^x6pXn-l6GEI$C`YuRD|c8^^E-GTmPLnt)>mGI!+
zN0ydlt5F54bC@IRJmLSZ7S%$=@)LG=w?=MQ;5>Bj;7v!5YIF*^q3yMk4T3vRjZIqY
zC>sLc<`?MQy$F5;<PlYW@9ns*({B{4ta3tWNZ^d0y&dIfGZqM2srK#O-R^<Y{rh@&
z=#c?F$3qm5YWcYU;pxT2zM0ztwQaqsi-t41g5|hJLD{f5?Kb;;e@_6s92QHY#;pUX
z_>~v$&e&bwqL(p8aA_EHEE~qt08(1qeez_;Yui^d^3PgCg;^MeyWqDBY{8j<7+2dD
zF;pEAiI`Bca@Yzluh9|HmZ=py85J7pw{!E6ibfjyLPw}7XKMN>_P3d;>puUCgX2k7
zn=^aI_MwIp9#8~ZU&z}k&|KMnI6|u(>ccl+5ZPe)$!^S)uN&2k#VH+S^6K6__K5WZ
zPI>au+TZx-^=q0+a4mFG4CXJ(2*_#@Ib#d@JZTfYFwnZR>ghnV?X5@FJ!|9i_uYwC
zGEW6Q>|n2MJ9|^9BXt^xRseHdjt`1C%XjZ8QWf2O^eBLr0t{yhq_`4rj)*w&^gFD0
z5Sn&?XEdg+LGH62jIvdWLu^f;R5%_J)4JW}MwEf_c2=*Y|AxQ_HZL0#eQ}}AfC0+n
zzyzP~gJvxr)U&6<>;Q!6s_Mg5u8WD$q2K7Pqm#XO$n~E;*NOo&OQ_(qJE^EVPyM1A
zG!E8dsRJs47s-oMx$b@YW-cC;0d>27s>7<VLx&U@F{8X@@~Q*y9X-?4GxcitDkMW{
z)26#j{OB#Bs=x9}IPu;sD%xIDG>=*p;V&QkOy3sA?+rdZEavs+=>zv3c{_RSdnK}^
zRm&A$zkM@b8SrMz)5(~kt$UZB0}Ot;iT$c!>Wv!I@0iJ&x$`O~l$!fnJLRrW*Lv02
zZ_S9qR=*yoaFh0yu-Beda-X*;oHa9;Z~Nskhr)oLj^sn5-DU>%m>3)ODs{ED_oZxO
z8X_DF_UD~Eo9V&Hg39jHsZ+h06NiGe6%E^-oa5Skmn@CsRBYV1F}StzWM!Ywo_>nP
zC$~#5^vm>8&b02dq`d?@gsigK0UD1<jv!_SwmyvD$JW4>Oz1i~>f!PU3N5dW(rjnc
zd#`!q@B=rBzJLFIMYkoY;pn&HPvss>OX~`aZQH5Iv`FP?kt<A<USO5qq)8Lq!GoLb
zbI{mjV(aEMvUI{Hu>UVPzv?EF{#feJUi!J+D*RN}qU6nV-!Hhphz-6JeNGJ|ymW5%
z6c?el9?#yb+-Vw-d!QVLWq&jw^Ls|$q@>j2FK4xfL)ez<cAKt0*UJ$<4W!98sCcGe
zga=;5t`!dCO4G6tyJ`>R+*Y=KW%ieTZt}Hf##heur4e96V%(3Q`JK3aP5q4-^bk93
z?#k~^0FAVDjx9%I48@T8h&7uSz@rx6%x!Pc_rM1K!otE$TecXlSn*+hX>*@7Yh*`S
zcRjsj({eJdUw4>M?*XDY(B*`c@46$h5#=k3PTTZRwX>X_yKn$|tLhWmj=QGv<Uo;(
zSqaF}tgQ&vu&_4<KWFuL3(NgK!`!~C8G$5W2Z+;z;n|4Y>}>I@XiH7NI-k)@06}dB
zHr5xz(TNj5te|qY0}W2qu0uQqwuvd~DE@LjgRBt_4_DV2efPtGZ2LJ*%<3>QW&ZER
zizJ6AwHCd4om$q>IXvjog5LSM9dcDnUe(VyyRBsmjK^D*S=QFuWSa5Cz7J4B^=nRC
zjj)J9B|IQ<)Z>+Av7Kv9?70xR#DCT8IdrF#PTN&4SoGez(Al}6?y!{xonKXbfx!Gr
zTib}j^>qA;CywDC@z%QJ;oj>`QIR@TGIkzUKaFjw5jWtR*)R?JF9$+vR~>7w;kWZt
z$Hw{{n^^4)#e?s~jSdmhv?V3&ZK|fCl1_^Y)W4oZ(nljBo3*@(cC|UWz6LcX>f|&9
z232!}biT~7vB~-Sdc$FzH-2Ja6`o>Ld14f@wYVQ=U@0-4BY-F$g6jp<Xo6R@I=@cW
zwc*2u5A!DuJ==FH4~~A;{l#Y%w+~Yn&;o%77Fe?6VgF%;+)lrU6wuRCEi9Ul9^bxo
zg^R{kwkd)EEc?~s+Z`MTLkG7e6VuF!a4n9o`{vhAA0Z17Ca^_gBNADJ36Sik+Gkc)
zR|gs&6?l{pb<9Sul-06{!ygosvVqlN&JVnK-M9<meXb?07Jd7+<an?M^ClJqe2yD`
zdm%2^%r!g8J`3(92d-ehLB=^!5cdq7lFUCPCMEInh%fk&HvxWP4sOn*EzfGd%IQXY
z%HWFZ7>1l{#+f_+UhUx?TcgzPTFRLNvK(idbP6z{^1Obrx>EoXI^y@RX3ZMllOO=!
zsq6o8$IOHwv<5MAlLEkl>B(j8-P+QVWH8Hh1@kh*`vu{LO*A*e#>QU1SnrhF3`R3)
zpsgdH-~p)HvLV77Y@3deE4uHxW%uYG6sZXAWn+f02jx{&{*>~}A!jze#AV@w&_I%#
zL&ApqA60Jxj&<6G|I=nkMbw}YDs7U`N}IH6Q`4waghz{%s8m`Asi=&Khooqk_9UXs
zmXJqArlhE}kVGUYQoqk_-ueCh|2gJ(kD1;k&-4A>_qCkod7W1@LW2CKYnW^}z#qj=
zKgl2nD(q=2e{*ZvTvGy1AXn=&Wd6+fnFt4j0hWN#X{uoGeZW<KEO<0Y5mmf%TnUiv
z6b2n@Mh}Q+vs~%QF9mqRgv1svA4_zaxI?~ygm{xw?P6~)$Ec6prgB?@t-B*12LG)Q
z*!H-mCd$05YR~W|f0(sc6sB#T-=SY^-)z^d%c-+A8y<BoC@md(GRJ^(LVe)Be}C@b
zF$)q`XT|6JXt*99ANl3ujQV{4E#~@us!N8IcKK~_O4Om^DN?DE-rylaGI{9^X4-L}
zgOQgqCo4*qmZw=a9m`*y_&`X*g)coJnLgVG*vDaF-y<{p!GbhWSSJ-Qwg$Yw4#Ol<
zbwNL{V5!Zk9Iu@)KElJCnwf|VxWD1eW@b*w*Tl%D$c2PkAJe(>^2ftk3CAx?k3@0$
z%Kx`zseq3M`7ggTHj0{v#fNGmr>ry96nicZtG)u6nTi$E)xC{8S&psP$0me87)va=
z<eT}Xj-=<_ShG}^f{3;YtRp$aRAo23uOnUAsP(Vp7GAU+cFz6_cy6trD7*9?cMoZq
zs2@viZrq=1-%wYVMx~N;?+hv7K%}lirwn6`zKF<zvw?qgb5->+>Ci7Mw~lhx5#t}$
z98_Md`<n?GK}chACAPfRU0B|tj3+}nYN)!<=pXIKmnDhND_&UK$jc!s@v6FXLyWuO
zO^1K17N0KzL<hl5{_pXo<S(+>1Ug?K&?F}<GF*p4?|$kF_Fey)0H!ArAYL<bZq9gT
z6?8Fu>fKxSRU{EB%_7>59FSg7FV+l|VYYlMZu^;2r(Wd_O|e48;Z}OnA8w06_wqeg
z{vAz7-0H{Y7cKqgNmqOe(8vRH_28o8VksyxW|akZ06lAYZh=0v6>l+iMgWMIH;fX*
zvWlENwr}D^z&PPW-U&X}?bY2gXF9Y>S}5yAZ$IEM%dk5GU-F`$qilzP7K30QW|LwI
z=p*E43W;%a9gqJ?=!iO7JZS(q=8*I)EP9fkr?5MQVG2G?l5CRB!98df0o9L%hj<)B
zXC;xGd2EwzdWjz@wt**q4Lx*d7X+MP;8&>Pqyx6BoPt!6V@O?TJT&Z%M_St*1=N1=
zqSc%^U5G|VQ#X&ZJb9?fb8qGf&?fycW45L@fBo|1*q4OqZ2N?`ukw3dH~v@s_&KY_
z+OLt+zi@m}X*#spT<hoqd3DRcA;GtGhFl+Kp%?G-MAo-sF@6|zFMqYX{JYXq%%QX&
zt@!<bNp#PdQfYnI5ML=-mPIhQLsI|Hgkr|<?&?}D0yMl|!kVHO@)8YY0I?Bk-7s%8
zY(9Sf`KZzp-P(O|N4Pg#El;Y7G=WRv{UC<DF)$N3$tdRyhQ!)OtT+OC8p=fJf|8OT
zae08=h))wqWN27!_-W)`BbaCX`uXYC>!CZpckG~`kZd^q+yZ54e2kbu*YQ);BpwMz
zJz+w4p)uv!>-#UH|7iiPnP9;$&vzgg2kzhBw)WJRMZ%wL@nW%w0AGh6*kvF8=d%+Z
z6j%odpV$=)*wB$whsn_>mJ4%qz7^;1xZghT?4+}Wn5%T*2QtO;ZjjtH!`vf*Lk17d
zU_wTYxWg>iHp~1>bfOeUs{1qlx_mic@udL9o0;H6Uf+)4CO7L_7cO)ZK0RWFA@<g-
z9$>8H=jTM2*x$N!OIYxLC-$V_vD~?HJcCao2L-a#SpjYxY^scd@Ac!`FwrWo2yaev
zg7#h$(`X&X%hKdSrlu~FFj;{v7fcTPG$T38M6!!QnGhmQ5hLGzZ2uVj$&0c8#s$@v
z0uP6VIt%nM&b8QtOya&M_KAriyh9*NlS-#fNdG&3y4!3dY`#k0$~Jm>K6Q<&N>0S+
znk!02zu~lmN?a^`qsS9s5vz2Vtlk?7?ow}DUY>G*qCrf@Fl&gSBRVso|DJ$=?q9-e
zW>&kvYcG7B?|t6EZQ1?di0fW9zYyBu)uA2^i@M)%E43uw0d)}r)U>p6+MA-^PTF*z
z&J_9=B5B*gwyS_qGvNDNA05~@ru5>bDb^hm+rE0TDN1Gj_qQR<5uFaNrTX%-?&U1`
z)8G2s0?U(g3Hdh7{oa}fnTP`Hu(X$^rZ9b_q15ZXUk!a8LPCe<ynNv(K&5<7OfbV$
zbs<Fv8yAz>Q#J#`x)o7(BBs=&F!QxF6!(9}m)PEVbO0!&=!Gc~C!tispdm#-6F^d|
zfJsOA(+Pipnx|*vt}MJDvKuRY1X+e@k|@;-LfyNmAWG*H?&tq>T)$owCI3^*bx<`5
z4vt6MXZ-lyg3Ay?3cRv1_*lixoy*VniI|UB{MLp+ewKP-?h-hrkFKtqVQvidqd0Ry
zsVUZsfe(uI6E@nQMTgeU!mCy;+WWH--I18u_|ni&_Tb21LAD;`J2_m*Spd!ZieO8}
zB6{FDJUFpfkt0Y@(1=mCT9j`hN1y-%{!i<&qjtA@Ry+2d)-iaijn6_V-aJc6Qg-$R
z8xaWxNP}^v)ZZ_7e|fJa8gJZr+47|^zNz5gRbW%}5SF^CLZC`%0}u{4+Mk)w1C+*&
zSU=jrkNxF16xUiYamV{7Vedj2sih=2U_I@GOm;TQ4^N=bVomDAOwl``U!mFp>ut-H
zWERO%)}=-aU!$t2xiYkV|Ni|Mpt3ZgLC6I_67_K&<!)dc@JbRCbZkiaHA8rXr1n80
zxkRXk7OPmvzk2dun>N2oZ_ps+k?URnL$nug&*8&<Xes$Ke6!v-a2Yb5JTgNL^=?@q
zy%%`T5*0c5jNE-jW__N|0ahX@T}q7p{?#roHz_e8VdABQ&p+DH53Xn44ggkO4wZ+H
z0BsLpIfp;Ngoc}n=Wn`39_7HWtKo|En{IDy_~8CLZpP_rH;2}DK3cHy?UPH_p43Hr
z(2iDr$DD6M;oZCA7@1HSf7E%=<%^27)lJg)%zrv`e-v@bPxX1MlB%D@#58M5MZNl7
zm!!$YPxVdX0UiL$OSBJdNcLWk8kt`qmf=y=09MHz4@#pn57s{W1oN8#15%;2?aj17
zHGD+gqi=baDCz^wJHkUTe}?{B5VZ)n;2cHKaO2Xd(i!wr6!~Z*UZdO_4P`QQtZ>pr
z&!i2c1Sgci6zo3i0h;Z0Jw>}BW_I8f1#H3aRP0}WDTWPQo~S9qVa6XfjvXMGubEZI
zxbIATqo=CKe&E(IcVuSNHKCm)5!a@VXD&~8BMT~)?eLUVI<HDMzs!VH5VJ0<;c{NA
zu7+VBbD@EF47bI8;@Q=Tu048)rN`X9HxSFFwAq)J-Pwm4(>+WxbXwoqx;h_3vNz_$
z*~*6Wc1EO)8h#gYV}|;Xf0qH3_Fnk9h`~ih6-hL^@u*{hp)JiS@Zu7SL?}U$9(6vx
z%yl3XtF8=OJ?4-4abuk&<HlZ`-EUaF%r4Mb;*!h7<T=ZePjRlq)uJcm{zU2|iVx6x
z^T7$x{gGm}MHBcxzz!LLXePk{p{6LGmdJF?hI}*!XR!BL@6s0;K9Qc6v*BQik*V|T
z4(;066Ljbb7+jfvG@TNa1toiN5`e%B+Io?>k!8Xw{=$`WsC?;IsC`FX$ahY(zR~jZ
zW+x_4*MC}$dX26z%dUAY0|bCkD4a~Uu613#Ivw@RdelE(zJ5(-gc@Y7$I<CSLZ(qy
zwBz{#LWwOIK2z2Ziv-?5(=f1oI%vwslg_s{c{hJu%gtI`8g}lB;=!!l1uJ`YSGn#9
z_&4lAnlq^5<@ryFi*tr-7_UZBqWeUmp}N<KszUns#5KrZ%>b=U>kRx3g=hxIXBaPN
z6M0C@I=Yjxq^qT(^a`1#Xiuzt`48v)G8tLRO~x(TxL{%Vg0BK6;0#ae+g0f5pwqkG
zI5Mp-^uasdFzO_<L$QgDnjHv;V12Y<LaI1*ZERuXToNN8rS`iwkKU1SyVzxO0vvFL
z6cStq99D^B(#LYwh(^3wHf=e(`zCXe2vqWNaD5ARyE$SvkhHDQNO<ZApKhow!COGE
zp{3l1j8BZ(zyI(d;>+E<Ixh?Hb6^d-NAJf_AN8XGTc(lii**S(s;2DNo)B9=3bY)c
zVL9l?;_an?JVJXbETOX~vt}YtSW*Haqk|S_y{D%iUFXc1x#81Q)tzS?GXAym!<+HF
z?i-q}v2>QCyhAfbRI^LjA&lyU8>c{PNN7inv`@BkEA)v4<7RR%mee5nOo|%PEYR|y
zW5-<M?ipqn+wHZ2Flv9MZ`Yiw)%1&^{HKL6GM#_6Fj(u`cO#gZsmOo@RE1{s4bK%^
z2{?Env6B|ua9PP@eYM7I^}7y@o_}r^f`b_N?v9_smlgTXeYn}Ax`Ns!X-7-b?bYMy
zzfs?@>0=W2MN}pTVZa4~pFKJ3-vk5V@o6Vnd}P(yK}}6f;_!)Dj6wZjAQfPJ2azcO
z@NHaBYfPtrNyTKaP}p>__?Z4}iOQ)4qJwB-P}*}Hd6~mk`_$HjKgnnppzuCVdgBez
z?m~6AgC<jklMK5%(oS=DfZT|b+KJ=owFRni>QuWiQEurKs~?)JRI*sL?dUTpTs(14
zmulJO+~1|8oh~gjKdF*S5e&@@Xp7FC0lY<bzDC}a&2DJW^|RRi*QeRpH&*6ZUpq9}
z%E}(04h|a{uXNv<yE;h>X3dC<b`6oP((%Kn5T@?flb^Kk0^^b6-|k&=&YcomCawSa
zwR}b(7#50}XR9|wvB!}NYnZs4h01hj{Ae(K<WRQTjku*x<6XLl7KY;Hf_fF`J#r_>
z#%jEW9|RY#N_aN#_vnAB8nA1aLi7iJh{I%8ZqE7dU?m-S0*XpXS_|K87({9Hc0=-G
z`{B{~djKk^oayI$0LI1nUE?=9>|Du3VkZGXRKcM29hl1iW9spw1~B@@(f3|RL{K{d
zD<>zqE}p&S`~XvrpR8I=1p&h_|0)t>AnpA3Ewa_dh2F2i9>_X}{rRUaJv|;M!mp?#
zCWRT|+J=UxzsE6$Pf9;?EVJ^{r*(C^dw1*R57{IT{-h*L>IlSqzJN61u}D-+RNA5u
zhuJ+58{78Ax|hP28|1kY9Scwo_WR66l!<O8)mR0!0SBxtT?1PXD6<z089XKo$kUx^
zfPrH-)xFLndiQQ<Uw|DI)7ZkC7a#|6I+!LNe$ahU<Wasx2F(Vmc9M4f8b`}2np>%L
z2fPFYthZpnE8&L-;&;_4rz>AB_R5umgELoGwnVI(k{C^rc8|Ymn9y72paLJ8Zi*M`
zxoOk0>n#vIRBmk{i6(QB;aZgl_gDd_BL=0fp|%B_%A^vjV+%$Zl^O8Et2(dNNM>LD
z>Xj~h8sn_vJ~k2$aa#mi1S1zV#lE)@(&H=Oh5xkS(Z2<YN7feuBMfVlp|B%rx8-R<
zXsYnJX!IxxOKxvbM|LQP764AXLvxO74j)jz$qpuy&}uZP@OCdu-8CWq-X!8`VsEs~
z0!#i#`S<)ON1a(`UH87A&yVVLJ14;#!7m#e8MaEl`~Kwo4eQnoB|R3fxsaC>;Gi2a
zjXQ=;t_!J%l2_0uFc9iZE9@U^YQGB)-Bp|K-Je9El)Yfy+)&j)rZc-I(up}Tn`ysj
z)55Ufx_(c)ACWHAwC?wN#iP_zxiAg;yEX3?fHfm$miOwa(lfbd@ie4{-1yf|GCzL!
z;7idqsx#kt#YWeXA4w)s*=o72UZ2O@x_R?JvmLsdte)nq1t{@)lkIwU@1HYHQ$VF6
zdu8A!$;0Hxir~P&U1!f$T)(hUy6je5oALbFCE>p_gmBno)+Gnc4xpn*^k6AF3m{(j
z#Ub&qLZuAl%1i1@1p>QeJ!*op&RiWjA&8pm`A1AlD~8epi|rH0OOOyzvqVPPpV_jd
z?I2TgI~4n3j(|GX%@}7VEhuNlhfc$ao;};g{GuW)QN*Lsw;P@;LEZf3ub1XD(iH7U
z8{R2HA5tm8a)|E4Q4q7cH7SV%dIe<b-tgh~z|0lYz3%&mj`#kiicm&DP9Vuio2s>m
zQ9#3jC8YX6&u6Q0SJsUhl9PzQeDjQhFK=xgN>gLl;S5Eya1!K@ml6NLI9Q!z<>`6-
zaYI{2M@P}?0(-|^zpfz;7VI108X!U(L^J{?s22t9$9(pB#0H`r!u-q!6kB|05LW>3
zl{4~JUB;7GVp+Y%LGuS1#w=hF(FR#wTKI;t)pzcIw{6xY3abH1TDOITLO&uXB3KT>
zB_BzE8<fkt!B<5l5Vl2ajz59(6Y41zHRycZ#+RdThmH`do1s)xmHolz*-W7=Y|tRR
zZ9nRAP*xB%R=T?0qt6j5)%hdzxW8a8{C@7Hz+@2Oihx6=5ROXJ4^dxK(#e0Y!W*i;
ziISG5o4m=6as_`6ZQOAL77651M`9#$bfMB_JUHvB=rvZ0P7O#*KnH*yyQo3jMug##
zigt;yA{>EzsVQp~kI*{P`S^lCnOua~$3~@XOl<k_<0`{Dg5(4?gKdD)5e~%g&HX{%
z*LK|XJj+#kk>sePva6-xu>2YQ{k-@4f&?>bl-Lv%s`|OqVVv=tG}-rDrtSzW$d_nF
zI!!zMXjSz(xTPhJrs{_TZ953iPp^7tTHp6oRT)Gi$i*QYO#5|JVv^jjhNTSa`%TGQ
zF$}KnY=gzFva|=qPRf@LI6ZJWrc^$0{F-|!hJ~FTV`k<D3aT|aaXN+q43O=DD-|s)
z!%QOL9wWzTYwtrFL&?<#ngX1RK0s?^;9#)z5$2CSJ`FK+me_p!P-%QpX3f1hHg|v(
z*$+{uA5)n=`4xTo^=l9Pp#Ea&^5wmW;Ag!%zho}#($68B65bzuYMEpm5{9&wFJFFe
z6g(zIJe{cv;UQ81W9#09E%&80vo}tqKjN-?F?Gakl#jCw5+N0ggx5WEJxGDDnHAPg
zYey_O?7YZgD+M&R&X3sl`sUK*mp51dN+TA-J~jSyx#R6>4Ube07o&S<Fbd#XXr5mq
zwr1SLpE??ADxMKpOypDXCqu~6VhNUpdtLV`t*xk&v530DAa%sRv1gO7acl%+Bl{Z9
zULJ?)3RF&LHC{i<r$wwC`s?%15u<kk&EoXj+k0GZTK=~t5|Aqu?m7?!e+yX*OeTiz
zUQ7q!kz_3CzmUrYOcuK%-&lp_iO+-p@l{!w4`~2hu9D~&nX<{Ucqwq{Z&~95aF!6b
zG+$@K+Wiiipnw!ms1Cr_^dJhvQc|cMgO+FE&)gYV39t6za*5(*>(;H9BBncUIp~^2
zcl8w0<f0c>Ab_aZ+_OB^uTKR*5Vn642WYS?qFrjJ(uh#$#Qv&^jPvXxQIY<cJ?ahk
zBDag;-(5Z%%>yQ$zwzKoBGeE$_b}BYtk>3-P2vhq)DOWzZL3Vn-_%EN+o2O%HdzLk
zwr_%kZr{pv&n%Zpk3QJ^=l8VkYb1Eqx*6`uub8u~<o7E@9{uKh_x|bUTLBNhCafGq
zq4}C-HXM{B4L+C^yJ=jy)r;pLpLT!W=x9Js<ND&Z7Q%7iLdzi(a4c1cK$ajknPj3C
znha$o{KOHjY?UefEYF{LrD#d7ZP7uPOD>RntsDlzonx6JL*<QTEx<}&K03d*sn^J*
z6x<~W8>wfg<9m-Zb>W{7z)!`}=K8_$PgdlEVLU~2OLa?H5QehM8U>HOHEGb16^QLc
zHA+lDx)2B+OD|2T(govZeNP%WP_*b^KQS=%gr_NFcg_InMSfk;XTmO|9}@;0Nk^FV
z&sHzfoEI;Sn?Lacb(ei`)j{pZp0sW%8SB%apk>2m4l`ES&C$(3NP|XiA`NU)b8~Xn
zFadt!a>Fn+$4Z_O9b?z_!cxrDwO#HtFtWFjpU57;ZLqJ`H#T0s;A-ZNPh7#m<5G>D
zJ%f;H$K1KoixhhG#*G^{tFylrf(-Drrzb>n=F+&fEK^5qE__1i`mzjPYW41&e?+%e
zRlN=_qtNSpaMA+=WYSeLA((RWh1Z)-LeCFN2nn*s=G<GiYe;wo7u{w~1I}W}M@auI
zdSl0Km*sR)$q?SvgDXJj{fYMs;LQ1JzRXyK@g*F_{eYie(<HJ1D_}~jPn!L%xdZ&T
zB482VjmUPL8tQv@?;gODL7%tyhYC*#B?bM<lC@d1!D1iovpN*+QxY5tn<*mem?uf)
zmI4HVlKUgu`P!dx8a^1%*CY%hX@P;`ZSQ)2znY(18IOT@7jW_YY4`Y|BF~WVX(1Fg
ziva|;a`XbmJQVk;CqwA_#kE3I<M`0p5Hl<C2K70BH|*cAK1C<u<{F2pn#}9dtkCgf
zmpR$4vBUpWgdF}C(DFFb+*s~ob=0}PnR|(am)YZ@RI|99Z+~XrxW8>(-tJBDkGs=J
zQjP*m?xw#Hv}#gP_>#L~N)bJxP!NfMDS-(gw0|A$u25JXac+iXA5+tFvX6z1JeT>C
zmJE<eIUV0Jhwd2GKkNLJD;29Pt}Kf0z&&`=K@V=v2bq|xDH%Apbu9(^3r^|u56uyA
z6a$l9BwjMMz$`Xz-UC&7H8q%aXBh*wDE1Tx&7^b=!mC8-g+>RXm-QkYU@ca1N{2P*
zq7xP5OhUqujwMVtuVAV;qs`PdK62v1FhiEKp^K;wKOSM8rW@`q=N~%;H#X)xOLC9R
zw7LI8wxAiG>-%Cup7s0iR-dn(Ee-BBmN4PU43&A!rWJ9TAa7YDf`Y=r-9!nBBOeD%
znjT)dM~TfNkA~3-$J?!6zU5svQD}(#j@kz$8jT%o8HK%tTb2-@!n9U|KetZowT^2D
z7<c{Y0iXUp+2Bblh=c+>=fC4Ri+EmMe!qICfk6ktZuk5g&@G089)?#`6D}YxdH(9a
zg?5Wnu>(z*%Ef06$?ne9ZUDH#+*E{L)cpIK=|7Xaznf6;gAF2$3K%``%Ghf%Lp?uR
z%YG82xS$6zfdhS_V{|aimzf<&JXSp}Y0JRh>n}by*L8npN#9?44-XnX#6dFVG7WBK
zU!NSEqQl1~95)`?-y&qKB<5|cZQrg+1R&LJ-P%*T@qCbyiUdX4oD7!6TuvQBbiJ6Y
zC+gkGY$5$zu9huWWWSg$^>af*CU(T)zvM5ilyL{xZ~>f5Efq1uVG5~^VNZd6a3Z*~
zBPD0vTJ||IBg-&)^=!lLbGG@sIqSpqWfyvQ)!>MA<_kx>_HJCdYCMSTn<wX^hg#{Y
z!GW99#8ej!FRbRHrZIlDWT;1C(2bSe%=O~D=a0&ggD>(FA*y@PI*3;B+1IQQ8_9?n
zE_d!33MrqeWwvLy#HY%0jP`q7q<llA3Z?`34kr1es09-9Z>~%t{ysBuuP)tipFft)
z1<5~s+F4eu0CrtmtaNE%N1q9Dv{`PY_j!uE)6~$o2BvUWMMEutRDs09G><QyNyoyz
zMa|^sbC@(eA}NSHH*S>o`g4HsA8)cK6<Qgz*)J%}#JrOb-_t*oaYPsHmX^zVgiJG&
zyi3^|V-jEghrPY7aAp-U(`EA3BMb14NXyAtm3BBHYAvm&I31!m1N#HCb%6Ub{X9_!
z6&tDbfOA+VIe;#iNth0h<*{Qd`ESIoXw;frkk+vPtvg55!)C>O@39Lk6}37#yL2_t
zc5X!%Km8^9m`>;a+uX<tHs{2cEZPAo&x_|au%dO4^+s@(Yu7f&;sg^ypCIRDQa7G-
zGGnh2?~khAcUoUdszerkKCWPelL(1Fnp1w63x8LrybAM|d*(CuaOqM)L{!c-LlEq_
ztj~RwbFXD@-yXKli?H;cM3|p~{6YK~MN4#5?OBrW6a=<$*(Og<+u$DHSFynUsj~L=
zBBHG+F^LxnxJ5??4>wnD+qb2qCF4m<q!AbcqN+twCG=`^*U5|8f$gP=MgYJYs;>{c
z8N(G(b#HbQ8J=o9ruzAda13Esx>YPwUM_}~QASo=Gg@?XGHtyeyHITkMJ3Hp^yA?^
zdD6Qw8|fm;Y+LIV&zrMVO^#K@gWjw>v73W;+e#|!*fu)9!oaMDO2B9($(Yfj-5u3e
zCv25j&!``+)TzVnOm|ya9bretU_CgnkmHilNhMR)UI3M4<q)HQa$eU;LK2*}+~*dt
zAl{dv5iMK{pgPWAu;N*NO@B?~7b-$1st%l1AaL0;tIyyo;87E=6lIXU;p^uQiZx%#
zVZhi)Tywh5DVg?QR6-)hOW>p<LCSvjd3rC$i-`q~9{D2Pe+`>UCj3@+aHs2V+ttcC
z%I(@9uDURDgY@+v!V8EBXc^lunom6}ezs$vX?>-cn{?6qS&qR|OdS<2UA%aY5Q_G}
ztWPt2ZZGIvGl^E(Ic-hNRK(plChP-q63{FdBcr!ZX^sKt?Bg`_wA{|TtQWWiQi}M`
z$JH&5k0Y=H8c)JcL|h=@F99MabS47$1o;+Jd*i2<Qz7R?C&X18ox96qyR`b#r?a~4
z^_(oNi<EClR4rRWL-}~ZnS-WOI{LR3Wl(uYGI==;B(|>?-LQJm?Y<wOCGEA)?xCMD
z>cQvh%tsK$gQ$?ioVsYV5Ov(Al~!hcHqc7{s^NYbFbhxtngc72F-RX%a92@>3>+T@
zF2{zj6DTYYLJ5043yWDZ*OdOJ1+V~e@2sGpK)^+b_!^u~pw(U$I*ez>;k2-*$4-W=
zTcr#95E?omhmY5M8nqU|DcH7HbR-%+O-)6hB2Xopb7C7IOUruDhKkh{!1u}5?x-Va
zNDg`iU@3Zib{<sqXVH~F{`iqdA$AcBH+_21iMj`vZFg(yc7m;^s0u`!<$~jwLR-1I
zuBEcNdNOiYu;`=-aiO7w*Ow1|AoJcjYe0C;wU0qZrE6hRz@2%~WBN1$<WhJaPp~hC
zixb?T+#E>=4817rg^3S{tILn@my-d>Oq-5_+2}D}<%=X+OE}VyMPuQG|G2U&6Mlwi
zmtY>J7~y83#Sdm}*^ckdNPO~EZE>G3Ek@Tg+T1|T$$^;a7bJOLbHd(WQ6L@(oOn5V
zH0hpE{ok7J6ZE>y!I2ft60$Qh)v)WLXh0Gx;0_eE|0MFt8JV(s;*8)^2qFX^M9>$~
zI^GKia=>rPz10w!OhE|17NUJ@0p@e0!J{GBf>aVO@v>cVo4r=P;yi<M7Y<VYP*Jf*
z#9|SZXb(j53z%n5S%E{{6gUW)9QV4{lNtDA3R;1XlLZreroQWea{{?Vc7R&qw^7Fi
zmuvY$f{2Siaf*s97?NJ-Y3+?q+`OqRwgph#iwko2u!bNGZL&6O+I0WVelz9d#7I9g
zM1)5Ugc^MSrXr%@pfE=la-TWQKo~hpnUFko<S_`nEO-#A9CuXeqRjzTqbulsaMiMy
zrxetr<3g{5yUtA?lW*N(!n3p7tk>HYTi=v@v?e*EBRo4u_S7m~&eX+4=czOLcBSC<
zSguLYB`9fryisO!&`<`wXnm*P9~bm-9onbEYBLq3?)22}D=U4F@-PSu7^8P&Mgb!i
zR883U2z1F~;j$<9zBnfWA&*(<cgVe9qwk@z0j8mbPB7p12sZ@h+CAfc$RlNZa@I;w
z+P=?jshPXOj@NZwcFBdw$`bU<gn@}jQ5oC;*O<tF8*rLi-484}4^g<HY$m3bG1M(5
zC%T(oEh)eDa12F%ij@bVZ-LcO^ow&aXWaL8$T7f|`E}$wye45~__L)M6Hbdu73sz*
z5bJe#N~FPLA%#!!sOXGLf0%0zFb09wkfY#}ZcHyOO16Iyz*DM4#vsI)sJ2qYxOw}B
zlqw`_3eZCUTB6Xju<*9Y`i%Y#DMSdZ08Y|^O8(=rolq0d1&QUPVy7&Gh6or`nnd>>
zZJE@5WAf*$v_D{n6ZQiH?m>lm%}dtQw3WypFDI_NI=kWjSTpLVb6G{j$Uf5JPG#W>
zq{UKgP2tmJWpyv;<C?_LW5&GhKmNNC;ts_Cht!JtbCZU(r_~%Wy`aC}p&X=I3|0!y
zFT6oUeoF?@0oyn^chs6Hv!F~385-Qv#Oiy4W^P|Vto~f>GU-ZDp^LBKC=ji0Se>*1
zaA&n+G;zUus#GD9k_9f*-k|o>p9`oDMK*&`Cvn;?Z8Ff;Zv%JkXKgSE$0G0`s!LdR
zJOMJ+R$#AyAAn9-Ss6IyeK_{N3k%V^DzrlEuaNuF+!2|IS8uc=F^YY9)E0sb2pi}U
zjF<+rK@eylxYWMfU7^zSkzpGx!Wp+B_RmG&_^NR7r)>l|A%+yjcF!;-z)rxBOAha_
zO|p6_fat<ORN)5z9~c@!usj*~ZPUl6@(8NLn8d%Rs>rnN?!xq64BqdbI1z9_IQbA_
z!2gIDI$t+~5QzqrNd3~gz7sK-r=RR0cid^t-7k-33aJb@1w6YvQ_=lyDYa|Eu@$Az
zz=2A5mA;yxd+?u^FApM-LNW#KD&S9ALetMTrA_r%eQC(NA6TEK<_el&L4{N3;}Gs{
z{?a4ww#=|odRNQ)mZ3F@T6t!p**P#*Vt{;nOOAJXr{IYXmnRM|?PxL3^e(Wf&`p5K
zCT*!V5bOUY@9!S`Y+>K7LYgd=DKXB1`cw$fkQvT%>_uXOJ3|;Z2^s`Z9Aiyg#zl95
zE*3<;`CbD)JKc8Z)^RkDd|31tlL(&TcRqjKF~C8v2ej@NCT`~q^tA9&<v8?g9EC)L
zZ=6cEDf#llWd<ep3oNGRrHltprW4vDSj)$&A;dv}kpPr_XdFv^|MAgfe2;oBWO%It
zBQVPt#Bq?1Vx*|MsweOo6@47bp6NJGUb@r?QVHCI$cfrYaAl2SeuuFIi<=l*K;BiR
zO^vR57lB#^HB`f=XnpmEvST?UELB>(Q4hrY5kn(LAt~=)7DPxp?im<v7M5mB9goxj
z)7IC7)GThj;3OCx%w4@c_i))LHAY4ZLMIx8=Dl4tCy><mcEk5$y*J+topD-y$1j6P
zx~i%4G9UnsZdt9fBxkS7B4Mi`(tsV67)U}r#4Zcx^JiU;(5!)_0<yb!zjWI1dQo#m
z_Ha{|){a!w(%^td&pBdZDud7GgtXKNB+eiAxUf!Eir;I7b&=Rei@)butRGd>tvBr%
z2iu3aJdv9OgavDVtXK4*z4nW`i;@VJKOlaASPSL_9L{uyAwzwi23;6iO~`lu@y%V7
zeJj_ljjD@Ny0Cd_p$~lgF6mO!P#&3{y2~Kb3kxq)F0)pQdvJKW<iKSuKb7UCp`i08
zF(eGIH4MOSRNmb3ZU+5xnKV~s{VHd1C<n-kKUGN3kuP@?Ig_m)lcx3kbF_IE%lnT=
zs)BC-zPryzkuVw&b;g4@YxLhgX=XB0TArt1G+2fT$ebBe7tdd`b!Qe$fR-}MwYh1e
z=cl@&&td6wA@Ym*9wGG@{eD`uwHP@BY;mY6)b*2@)Hka;ON>-Jddb?!p&=GX3jLc<
z$_eEeD4ft~9lkTi)eYqfrGIPYhQ|mA@d2pKTl{^+1KF{iyeF2S5yrCmO<g?Le=K7~
z8)!d)5SeBdmr`{_k25kE<>QiQWj!aK@XDc`{0yDS7v<t9cC8hQQlz!_@sUSG1e!K`
z+j4{XIWjZnO^;4RB|8#lGmM3JPFNtaibd3nb9OeVgBetOu~5{S&PWohv(O_TMZ3D;
z-HhA+MxJ8{jy96=!mJMqdKU&4UYC?Zx3mJpsy$A~pen7x+l<i2?&~yazQmw*?YTt{
zmWhUaDO~}j*dAQHgdiT0mqz*Y_F0?rWMI`)j(~Y^YyhaUD=o6`73+s9SqvN*84-b(
z`EI1m3{IWI+mN#4ORrr|gBD(x%;IS3X>6aih=7haRm5XQ8{_&)Q@R$~{iDvSqKr65
z_ojf-%L&Pi!GX)&{Je9$?GHZ56W4G)C~&RFm%#pDt6nGrnTa{8F*o92gw!zi_4|TY
z=)?~AH`TE$T<zDFIlrvbH%p|Ke5G;#OF?+HtiU*me)YoDX@QrfowgAl1n5g1sn7)7
z@P;*qCg`3R9#El0G(UB!FPs8EX8yO0IjfC5oTSBcb=n%dMoU+|^#*5&IO|%u*+V;e
z=IP$s(E5f=mJ)76hSeFnwcSx?VTyl|vo^(78d^LR{27|dhyER;ntIT`%@U8)Cu`j6
zy>1V5X`^mZs2L!wH=Hh@9=u(%QhrvY_&P75D$4tICwsM5$zkcLrkJw}235}NEe=!L
zGxieV?Ij;e#khSPE-A*zyz{L8l`2hV2ga*Gl|uN#fYJPC`JN`GusNbUmYaZ^2@J+x
z%SS1!?B!<Zy8nUfp@4vZ-LnYFfN3M%G~dvcnU#)W=8^c=icjc`&<~j=XOY~$OXtpy
zTEbOL3~o+QHR<^;b}<A&1W0fD3}Zn%Q5+3FeK-93smD1NrRNGO*W=Pv^WrG$G2b~?
z1P;lSb^UDSxTrg_)0mhxR2GTjdYSCG_@dO>Sps#zJ7z%W{r|oquyqf=s1Mf5Mb49y
zssD$T%wjO%b(=$oY@%#-tvN51Qa2}Utm<xi)E^iGNB;(e3PlciT&LIq#XHNoa3)*t
zU1#Ye^+BdpA3iAKIT(MNi9ppJuz)2WpK<MO_Extvyk@q3a#2T1W5Y@Q#8*{I$up9Y
zP)^8+2_`{A-Z^-#gXvJaq|}B+5mC8{%94j_Jb12m!|LjfA6J$>Ju6E}V>PuL$r2@7
z=UZ_Ahk8zKLYYw|*NQ`T*S@XqWJaFpSd4VF_SVam&H|pgzWCCmIeS!Bq!I0eN{Ued
zx^yH`bDEaL7F8vG+M+(h6ojG{#UO+N+#Z$c{w80KY>l~$Hx4PZYQw-fldG(wvJP+%
zq{-{oqHg2sYfli9kmB(AI%uLzP}KUZp-4&VA;*UO&e3YuPKagUnL_m_#`~GUb<)Xw
z@Zfca$N9%bn^yX%)>rq}mdSE+Gq_2q*9>XHL_z!aR;foI3{+evzA&EMQt6zNN=Kv{
z-QC5#Vhhcpi~QKYPT{^%5r^XfB(K$1X#IO~){>BgQcW`;4dbw2_7zXxzRtAEzFF60
z7q{#!7U*@?y-pkY&pIRC#?yd-gCkJ1k0ZoW)Du~qC0YSK{0MpAG`)Krte0C444<y3
zwSS<!<IUNBH~1bZSR;9q+J}OuSk&)4E=pRV!8*2Jyz8|!64tsM8^2SBK7ndS*yK1n
zKOENjD)}YsrF_MAxg=fnK=!e+@}#_4>rVm04HF6F%}K*qmUk!r3u&aog)yksUQQ%Z
zpbLW^w)H2TT0M~MU_G9ogKdva=3G(&pZhg3%lfpDlAcE{&%;?#5zPd#;P5dS2O~1Q
zyOZP^XR8w6Aj+|iA7##xN5731=ci_#fV`~Y3}+T8S<1q%NT;V+U;7)@v<I*vX`FJ1
z%Ve3Y38g8BmewhIrXa=qF%Cr51F}a+W!C6_@Pg#<AfSGQm*?3DXF36UEY1xlAqW%{
z^&6q*Cz~AQ5l|V>e@VXlbQ&IB32heOz<BE@*F0-&O%VypkDrrWoDv&XC*_|9*MBs%
z%)TKTFOE*fJf5;^)qGt)>vU+{*2xpd_DT4qz?D%QXKgRZwlBBcev{=C$Hs2!&I3ov
zCHzIY#*bjjhryBUw4PdLOyUnqNB<IbFzDFDGz!Ej&A*FhQ2K|chZ?efR+>?rpKK*_
zRC~mb(Rfm;={>2F?e-oxa32$@)Hijk!%kKjqHglJxztN4rOuclCYAdaKUW_(bEZV6
zHCFr&KB@fwJ#(qAl;%SWq+4q)9jf^650YIX@lPbhfhR5B@%>#=V^m0$(SdHqa0Im5
zy9jn2qQb_xbB~ku26~Ci-D(D$4El2R(BsV|^qb7-3^rwwMqG>P@CPzX&u{*wZh7CT
zNYOH&2Z1tL9JKg>W5@PF3`N^Uxd!*9>Cqgrv-e=rdMJ0Hcd<Phb)@SKr`MEW;^V8$
z|Lztw+vSa^UOBxMJ!4r_RmV763!?d$$9uBgP?SQKsiLrAV)X3UJ(AX-G{T;<T~l*r
zqOE1%WKpIZmF_g6UE6+^U|~{qmqlT3SWJ5#B&mR5cj`ym(oIg1a@#Z+nm7PSa+&bX
zXxcFX*wyc8=V;--fUY)iT(=c$T<#)_+W7NY0VbW{5~Ir?T$D<{9{fG9wRntE+iR6R
z%P?l!Fg>T3_gg0_9uYI!CXHxykhXU~1&1tS*r%}~^ig&p@Hy%%9Hl8#=Swq3OJ3YK
zBOdc4bRReabrg7((7zytb#mJ+<qNl>)ZGQtdv1Z5F3;BQ>8=HX+RKSDKWML{Lfzz9
zwn_ZIrUTD;cmjb7Z~z$cS5h$bx77P%ock#u{9%9J;9)a;WXUdO+R2S5PyC)9s2nq$
zaGQ(ShI}e=J67G@{U%=D2zx@Tc*kq3)3jTYWoyy(RNMye6Zh|*of&jrQtIYpT-cRT
zfpzUXP1H>&=|T<Sig#gi{bJ~19?<iX%qtX<Q07a7X%;}*(o_FsE;>xfBV9v1Cku_g
z`@UULzWdI<-{KGI{o~ueE~1s<ay|Lbf=9W>_K|DStWjsOR>NbFl+sK?c3qy(&!?2N
zJuLn1dCktR<(JC?nOS>Nuu#yrwlr{qoF|m4<X*Eo`|!uh${!D(y_7u9c0BN7%f*!7
zhI((8pFi$5=Rf;e==>sQWQ)b)ooAahmzZ4A$~0C9J8c~KtKiOx^6)3JQ-6N)I}{gm
z`xil3l`m!J3YaKpBh145aCErvuD_CAeIa%se#o_iKq{Rfk+%C^hENkU#)Df5FdK`4
zanT2HZ_C?X=YKY2V0a}xX2>+F?aG#}ynyK0>dKNfa~{a>=D@fovdcbO<q5&vu0~|1
zm?q?THx(^xKAz)Yq8(ha(=a#P-guXD@$_GAw^Q;aN9t@bniA{&G#P>Uh3DZ93m$Ds
z{AqC@>q|uB!{=Y>vwz8cNmkveysD8J|MklEP3pyx+uNnT9(o!je*P7=xuRh05f|;$
z=nG4azTJM~UFiWw7xy+wPIao*>PHW$S}GZ@t(@Wz@YOR)rXg88W8jeL>;AJ>4g7nc
zhH_uk#Yg=v^&O(8H2v7Uhp+Fs_k6zh&F~j%7u~A)5t4k~X=dWP?4yR`Q#?HTmTueF
zF#YI|@8h<<oI{6c6n1cn%>a!zMv>Vgj^|oLo90)f7Hu)UA(iE?yuEhUt6!xJjm!Eh
z9jP76U}DXP_~UEW1_oMextrzJT+ueB^4^Gev(P)34h!jucn7qdCbGT}4hZlB`@u7I
zSh-RWSwd{0R<UH$TlcnMxnhK^N6df~+zVS6g_Kb}^=-ta5P3ZBb^QleX-Xc>c4P^v
zR7y3NiTtSaS8dcN_cp5&p0)VOpD{XAf8(L)(~!>J4%n4%>6}{ed9SN_c8hgIvWsH*
zC-2)ELzjNqqMdBL)o7^8or;b2uN|i7R<#U?T<sXVS^r?>%bVx4FQhc9cy2k-IN0Ce
zr19s3G2NQGJpS(4VpLsvtEehEF0RdrfgiuvjoWZ*?~D5$)Cjw5&V3mZHv4$)*rB=a
zKF_;Rcl(mfxt@)m)*JtNansI-w=H&N01dKrZ!!=T61IZp&-XKGekkCLmB#UQ6YUR8
zH*1q&9g!p4K2T-=<^J5>KjP$*E6WcGq)rS+;;egl-o#TCzw(tc-%m<Fq99~%ay+o-
zZnuIbN))F%NW8r*&<?=ubp4>GmN&*IEO-0G0}kaMO(d;P(BIMr{3v5ZTx!|+S4Am5
z6U?(!zUKU4JAw_h)0_`nsn1QWUQjdma7CZ6EgjnSxpCxtxy8Dw<n33l9PG9*C_z^A
z^Zu<nS9YrGPPV?;JK3&g_?g_L2HOLBec$$)J+W0k6ScPVe$#%|arwe*XUEd$l)B01
zbHD3nyxXCqe<)gdrHH?Mt~|fI<VI&okeaNXLA$&z)_Uvb-oG`jc<m|YKD(#y?7#E*
z_icIDF7}3N{U&o6w;O4{aI2w6ICE<z4@qCU<F6^IOJ9FKUj6--ike!1XHw;v$XYQx
zESB=LGEm`_7xF$b<ra)!2V)Vj)h@cua+<O!E{4sv$$1idq?H@~4wQ5{mn4^D=3;Pb
zp-hKEg=eh5p%B8Z$3kdS)NIiCTT2wS1`O_#-%UC5uiD9D{;tlcm#fK6&G=(~N=Vq)
zIX|CuXzEygf6T64K~MY&-D`%P>gd#B+c@l68}#0aNCRtfa<;e6Q(sg*^zY9JhKHWF
zGwGb)Wq)X?%ZY;-^$p`cZ%!*3v%L4*sdwE@x6J7~P3cL_@|h*}QKu@_>gUhX&374f
zx#Qc!vJsnh7U2Y2TGX~v`<5X+wBF8qeCuxd&yWhAlKVx@J2`V7J;yl27aMHXu4*h}
zc4z&VC|gXJg;*4E;N~L()I$2}Jk|f4SHF4p5vi4xSt&HUFYvz*d;r)%PJUTD{>d@v
zrdz!rBR2ox{7`DTC;Gc(wyDf4GVkj1>Z^+{shI9P6Jpl#=ND(H{T?6m3e2=jW$nZ&
zC~S`%8Km@Lj~klTYBy8!kmP?XuFSmD_2VXU{&pP^S>zm<*WtqB598z=Z=C40rPL!b
zL?OMc(za92z4rNCE!$vuSlSeRc1=oQ^26otO&99EUDxYX>86>=mrt*|9%MIi!}v!x
zl=^S`m1aA#AZvKR#*^N5#~mXp9UFgZn&nxXof|S+WAc?o%jNwis@4kO%RiI)#TbwL
zT;De3+XNAa;xB=wG8D@fb$TaXpkj~^!Q{(eVIO|wmq*rlnOK1m$0|l(hmuXt*T#;k
zD)_QtOjM0y@xlv_x4Ri@T8=e~S(H(>KGp3;euQIy>2$ybreBoL7Vb5#NotFlX5#jj
z+9PF$;cr*)@@Jnd{FJsPEVuow($0G_hLug2_+ed71p7x7-euh^byc2cR59#eW73(b
zVRMSsIGx}5&`M)VyNSD)ZMrdf?|=cC_FcC)hEF#wW-_c(e6b3CJMk@j)$&vj|Mc1M
zIW+neBh2ZW@7DLaGDAQAfpz|9)1^kgi4R0fK<+CxJSx;ol1bvzR>ZM&`_`42?MGXg
zjXgEA8+q%m#~<c)*<&FF;?fj)`<1#0gL32r-Ttv~Om`?}Q<U~~^M3f)@mECW<I9zu
z_$af$UH{rM!D`}FY@qm-!iD$gooyr8g7v21Mc+CVQ?QxZu#_K}r7Mn3l)Rr_?R<Kl
z@i~*wz%vK`S%FTHN%qtgaeiB;na%oUG!1CSN<-eFZI7Mv%~bQI9DmtC5{}IE<TVgh
zMc-ZMk_)TuWG{~_MlVM(sQ}Fas%wAavazneG-klfv-=&wr<>h5H6osZsNVUZiTY((
zU-5xsq&^C_0)M8(=UptUbjrqeRZr~FvH1GbQH0Xg8QF2;bas6*E?f~OoSACQ%SL;Y
z*VZr9G?rSw|MW@hauDNp>qoV^l8Nzxa|_T!)p%51BjDQ^HI8Xakkuax{NXVs>T0no
z3KcMrlRBMe^$Fl|@t2xa)0t6fOHDRBw9Ze?VRy)xTwGHGA<kIz?YjFtzNP<m7mLgC
z_`HwU&$_T5T*X)S)zKwx4u`Z1V3nz@n8gs3CFTN;9=&!m9~^aR<0BW<%947s4N}jg
zMq#%q+AKJ?peEF1?ihYy5rf4>Va;QwJNBqLHFp5RC-B4k@15C?OcE{5%>DCxO#6xU
z;aU=i8C-Q$IJQ*0CsH;K`}d*klRLgvviE)OQ7#5*YX18p-EJx|G0}>IGko5C(2>;(
zoX=#GrD9gAyH6jR759uzBaJPT)sfC;KX^d9w0m1ITj}S1nxs5pdz8f_o8?vam90;2
zKe$<2E4}HaQTyY4Ce6;T>eF_?@`DZ+8~TLJdi-m&<JgyJ<ANLOJHFjIYJIYuh%?2m
zgqmd)KaYODIW8o$VlC4n_9jyMI9p9gfRwo`vF>_);?Dm)Xe*7l|9xVlmakHNHK)|E
z!15#{8u04+Q9WxeYoq01u|_I;3_5Z(5O*>yJLq1l!E`7d5d{(Nq`Td?RZ$Z<`_ab8
zyb+a8zQznluk;Mq*ne-F<(z!yM~TP#FVFokq<6CG`QC-u&BwCe<+zP}k?6Q~sDI>f
zJ&wfs<zYt6=dbljj<Pe_d}D3HZ_{QJoi0kO81cwuO@3%$WzPgx{i?=AofrJ41@MmI
z1tUqLS3g;>QdhAJ?L03F3NyW_(a-mw11~{MZJwWMyR@nkO3(AU2D_Q&zU-*4uY81X
zyZJ&f99eHfyUP^b;AYDtavK5-2_ETPz^rcF<*ARpu{m}hK#RIjSjU(FTiGZTHT!J-
zA&;1%sYdk)JFcCz_l{d*(C%`Go6{kSh5f2+pUc;7ZRwyrrbCs-M(~pI{OE&lWVVLg
zZ7(ehj8Qx?qdgc8@*`3z3M4_zlgH`8xEn&AiaDLHNoVDCWC_lDpuXG^S=aC7T9O$G
z68_cF8GKvN&R`gI!Rx#;F63S0G%*bFTaulTFoI*_O>|v)AVdZIJLfBl9x!$E=g(GQ
zv&uhvRT*?WJfl8nW^|O~QL>Bb$QNIIb(02bOifIR%1%z%_~ufh-EZ&jwk<yE((g=4
zsI@_TQ}MQi4hOuK2H6?>zIy9jiN$hr^|90I+i!-3CpsI1DrID3a6n?88GtvYQxfI$
zkG4WNQ*)<r8FR|#em0ynPBjh{--U0}y+U?M`jL%k01Ug+>?f>RHE{6tK>buOvV&sR
zh`_|5<Zh?SP!Tc1)JDz?8@CQp({NqIn)L%qtj&5hZ2Z%$_r;BAFJ9VKQ3TGzl(ekO
z0j{BJWXgBJ=hepcZc&7-P+z^rT)HVa>{fnm=_jl6!?*u#cVOf8i)(+h*xfhZ5>Q|>
zVD*jX4kIG@dzgiV3dtcdul(m|gm6yI&a-kpDf##QFAVuA+zy=5nnt33e(B$H^oDl$
zQD-_H9NQsbaWAUG0_Vk0o`O|fx-=Mrf&2@z9LZ}|^Zabi-5YYFBz>sy--i_|PHY_X
zn9($p{@6*}XLsHK<}Ji_Yr@{Qvpbv4&h0Yvl*RLh0qXfj@7wU!;w{Ga$kQonRBp{7
zF(+P??6P|Ca)FD;BCzh`e$-A9YnQJqjZwVD!f(R8IMADJmEO4x=azapdH8RJwy8tU
ztF<q#tu+?Ij|q##{s$^LM7RN4ZQt(srvvA*4>TK)4#Ol3W8yMDeoXuKg$-ykP2m<9
zG}s{F?-e4cl-ZN*IjCaURfqkVCaWPNt1_<5*tgPv!F%C1h}n4{ydcWMG}jTaST+AM
zsZWg3t6Td2UPSv7)8F=ZF6*i`Z<JV1D&hve8({B6ReU7R1SOT0q-%RA`>B1~ceIZ$
z7Bn^22h@us8?beym6^QvAFb(0KPJ%NcgdKj*#T0sSVVqlh_*4*t~S`KF9RKCrzuUq
zs|0Vx5&_J+6#Nya9|lFjuqi@gu?>YWe?7DV5SJ(>lp1OSCAUG{g|tJ!65z+XfA(85
zd2S<i;ryjnR;4>~u-RUtkaD;u!o)lcEtP_x_0(!&B8oQ3-{zc3!}`Dhfy6~Z5#xp>
z(Z%g1rk<G<`AuiC*<_#<?jz^53#0`DSHj2%`RpVC@$Z>x7K?Ff*~vXRzElzHvzdzh
zeY=gn{tE+sx{4~s17lP=cgA}SiTd&<J5D{Ye%1Su<lTnve(M-2>2l$*m|($_Cs(`U
zJYBb&m1pL5@d-Q=h|?$3{`~o&%`-=Az4Kye+;pq7-%v*XuWUu2CiF^Z>ZnUF9gCUU
z<>1OG@wSFoChRJ8+f(e?x#_oQ-ygOX8dg>gXQ?5?;_iQS^<|!8`rjFvzcZ%4>AG@m
zaFEOKEgcxN*T&!1H!Uq~`ayrjtmII4p<QXWCqrHiV9@r?){PUEC+fwdqTg_2^RXxv
zuo=Q9mjs1VWA_gy)e1)0N;z`kFNd?x`{JcA*L^wZwXz*rW|l>C*4UT&E4blz>F9a0
z&#DVdUSLW8t31Dzk4#b8ou*nlinwF(jzLef`^0d<)wgFwckdgAu2eqvcr^w*JW#Q7
zBZ(#7W$Q<2O1kFBUHrev+@sBmqT&rlCD-UC`ArJ&uU?qR*Rf^@3BhKY=FdM`6wPF2
zQfuf|PJkb6(UjQakBK+<+;X*yRq7!uLr%b~B@+J%Z>iP%X{zlQ_!8<8#_PnI5z?19
z_4ovv)Fi#>T(YrfDd(wEQ?u6K|78Kiw$}SJQ0a>uBZBD!7kYzCMI;O)ifvvLy;~r`
zb3!;BcHYyLv}JqOdo=4A+v>EP*SCrpPiE&>Z2(w{p?ay1H!wv!F09gX3^96cm*@ef
zM4Y|NayQ=1@ZHuit5}I7JOQb*phfhD3<-8Hn5T*}*DJI^dRFtSm|UZc6n}jVR!AN2
zNX)at)C*%#@#=w!4Q;Q=nJluO6=1P*<gwW<6CBD>7YW@Mx6J#){Aj^e^8KceJp?_>
zR4+xEZdmS}hueqLTQsVS+(=@NiNRT}^vK`Z!S}1KRU}t9Gcdhc>Unv=ph+!eYTrFs
z3H1hJk})4QZA<;M&PpSRD}e97D@yNlew@(J6RI7MsS)2!>gW^%oi133_}|>QnJ6j_
z7qxSS_Cgn%Q%E?I$^v>^#h^?KJPLNe+F*Wa{qM_W3*lQYrW)~O*pKR8mr>Fa%?A5C
zM?@kj?IaF_o?@Ge3kMr^?|5<Hg(BzunQh1=8<#%2?{*`DzgH)$0MDf<?7qLfBwkAr
zUwr41NN5XgS`;{MNG|E+xs~a<Bl+(hjX@H+y2tVxAxBy4cO_TFW|ajSm83_T-ADCY
z*8f)NxDVGVu1$C`c<jzSYez&j-}GFB6(i!$i}%i4<<-txu%KOhaS(DO4t5NXH#5aZ
zSvwKkvb+4>S}&1xLF)@=jXa5)4Am(T{f?t#$37mDazBLLfN**FZj+b|JpA%$f9;dp
z_~lgE!jX`wh5f%O(Jdcc$~GkH?u*H<-)j^)=-laCE>`F`=wTZEaBBk&%>m`bl-Kza
z{;BuAxbbdcoyzZ1I~@DzJ7;Gus^yoVSQ7&9Bv{|PNOgU=`S~8t(Z{BZ|21JYJ;`$O
zN6z^kl?{~WSGIp!Jn}_T+}Dca7j%u4o;NcxLc+FRs<#`lA*$4^F?rduDK|fb^q1Qa
zHplOJb#}|PWpifFo`SjzO!~~*jg}f%XZmn^L6C&p8}e$bz>e4zY#cfu$iJw+M(Zh#
zFq^1)yYsj>%TWEv{KpXUB1bUmD-`q_lD*S!M7ugmP<3L7-X$_GfMbB0+U(UotU?)z
znoNC1HHT1dkF&Y^o(vxZWk>>r5tay4kjcaPHTF~Vy>-OZ&aNa}FD7N~JfpBls_iy^
zbBr&(J*dDU|Fm3ok@GwahF8UKJF|A~S>qP!j#*~^;ZxtS{nM6#LzS-G)lJ#Ic}qu6
zDW_-dL(`!8+aJFlJ5VxZ>+yca(;_RjoTxH#>7917%PX5kpSpBcG*$C7N{<}iy>jIR
zI-xaTUyLH3kCKVE)Ch_*i6WJvb4r5!(Hy5`-HzBKn90%FUJ2DWaL^)n>bV6m@$sP!
z2L5n{yU1D;WtL|PlezoK%l}JG6GrL0GEhEBS7GOoH++=$H)EkJMzw@AbyfO}MnYuY
zR*ye;bz=Pb$G`uf2M_itPJQVCy3$ve`;me&N8YXL^w)y(bGzgWou?sGi~Wy#wPEg0
zcD>|mim{u1)s%&&f;SfznG8&d44kldQ$R#Yh4J6cyM9#|9jX{>!)P|A6N8M#+gBfL
zl&i7lY9Q=JpFxCAZrg$}S?aqD6E8g0A2jH}_RBWsycY({=$7=Gk==yl%cE~uJ~e&c
zXc4Spbb4qTYU$0jmEYqVry7OzLB1()LN-hLP?Y|lA4sP*IrC<f!DmEVCs1+_1DH_v
z8?cWDlHT**uo$D5HX&5M`1Y21;Rz*%VbEyeCvvQI&5mnFq$GR(lY5N5VdJeacldyL
z#!SMPe<POdL&j3JW_dFpfvLenRaH_o+Se6q9bV>98+`lYmVNEOKPDC*)L1q>?UB>N
zsR1^98Y`>DS*)x0)Rg_)E5O#K<L^7G=TYxAt=>6i;jM)`!?G5SU%3D2-kP1abLkkb
zANg$i2HBm}&Ydaw+qWhz9U-+=om^1mVi5I1ZF0fVV2hCROPBjJIB^beh7dy<a>v<9
z|AU+={qi%==nq`(sCKX7{P3@OJ~l3H*gqNiF~f}impOGqmSs+eFXH?|0`2DO3(W_S
zD5|zZ8Y(p{tn_5Jd3Vyb@I+X=_?dHX^ngc7zH-q2a10^??pR_!d(*PJ#p%oa-&j2y
zT9N1&)%-l=?1aHXTM|5@tk)Ffxb?`sYjEvgu2z=S@!mW0E1jlQuRJ|e%j<ZvdHcmn
zOF)NN)u;hmIkIGZTQT&>NHo43h|8_FezZ!1vlRr@JdJr+J|ZIFWUsKd-&G&0?mzIq
z!awz;&*YeCOwWnE6k_?g0BSBLmuoW(H0OO{D8m&WK3IZt5TkeVTeuoapM2Y^AEm2E
znM@N-niYUXeD0-^*vgSF!c>eJ_baSjyH;$dP{2$AV~6MOYijQQarbjKp;f2r?({W7
zPdSj14Ir919rBhBl%8;+7DtuFhHBAj+<y;R)go|O2!gaNVk;!VRpDQW)N;U)2kz@e
zN*~Cu9*bq{4ajE#TOOScMy(vd6b0D8Nu3UAvr-o*{0(^wq6cKdXRT=t&3$)lcSJhA
z!Diw+(V)-`mGSbJI00BPn+_T)d?zUZXx>ou?7q44*Q|e9o7KR;^lSz~aQP&>`XLrY
z`M1sTM*Xqz))CQb4QUpId#AO5y6{5XeObc<mcm@@Cd&6NH&s`znKWIs9Z&wPL0E1G
zwu+GLgjKnVuTIwj$nS37vHJvK=7q=Ut|Q*CVe6z$_kti6viMEa+0u^P*2WB<^!PG+
z-v2G$ER3z<eOOmWogth%+bVQ^g^YOe>eU~TZgjkE63+n?R;x2-5&~`J-k)uzwY=$3
znB`bMrO;<GC>DLW8fEJXkCn;h1X56@hW348=9*}#aE|?R-wC&Y4C|O&3_132zyUuT
zfxcBI5Bn=?;J2EIt+j<-yNQ~^i`3@z4PjCeB$*#1j0o6Zt1OA%Qu4h07?*N(cZop~
zPS5NChW+#xO1g%Oq(h+VAq*{)*x1?(Izk7tXZG0){OkBAds##}x6A&(ANK!NFjiKk
zLUMPAi-zz?%y3s%SHF2-aL72G%q^t7nJa|fEJ#8yDQAF2nD$NqSM9Ykno97eOU+#2
zqpP^=Q<hbkF2i|D+o53J^LVwhv%x$~+FtQh_#smm@F6cE*4<B8$Y68uwjTLy&%W`O
zit!GHY!0!(7yAoJ$`e761O6l+Z+1Sjz-O{meB?iph03P#e4B45kkWX2!0gCW4>ALW
z4Ag(Ct(EDYSS+?~CGeXPtJL(9idgiFljU@J#uVcSx`K%Up?qR-ggYnGeM#PI&xh7z
zVxb^s*wXf@y*!WCcifJcX?^^|T}@|N9R~sAgzD{!Y2F@jM5^INp*?KpPj6z{sGgm}
z^CXE8{UT0XylBX>_O`9g-)tV~(DG^Qe<A$!=s5-;7Sn_QH6~rDP9_e_GQUZTh*}w0
zOo=PBvebaWLzo}^ch+o;y~ytQsr&&O)Z57;ZG?pL@_KqFhzzO@J<i+#_eow(CNW)-
z*L9W+Wi)Hrbi~q01Qo!;|32&ZC7?H=JEl=kVhn6$_!hVnAw9t=4Zw;fYGQBmd8gFc
z$zuzN9TM0rV3&N-mlr*lSPaX3*V%NDbb8se)x$f=QFJner>CQ)_C5W-#=bVP#_YM>
zmUl<g-G-Zoj5fvBtrq0w1hXU1=T_X2!@R;rioWbpkoUJtd+_fUqKc?_U|ktMX#5qM
zBrPq;fug$IftI`f#o5%<@<w&+ZYZqA47=whU6PFLa-9A=dPq1MfBXqV8SgeYh_P|T
zTdu{aPu2ztW6)Fl(RBSYa-B?c4xFjwBd(OZx~0$x7_=)pCN*0^D{RkD2JR<XHy(eq
zdM3j$NR?L}Zob(Fe67o!W#gS4`$znhf&@ZH`UEaAD(d0fE?Mvx;DI~|t7T8|qCiXL
z&u}m7J)j3_bTD+Qw{8D>+iANe@l4z^yVWejed@S>=8C%HC>Q55SB&FPi*t{x5TbAa
zo~}5WeqXGs6@GR?6YFr}`J}NuQ+V~jkqpkid^t%!Vmr=qjL{J7<Yw=mP&Sy+Z(bZJ
zK^T1fz3p<btJonYlnbjg_0;|!e5?>)8!|n9GC>P@B)4Qef?HBkQ^~{l(@}gBYIdfk
z{MaA4f06QFQVD(fgf(l{JpH-0@uCb>3+S(d`9l4OWxl!?ge46&J?(5&cK_c>g+dU8
z1$c*6;|Lz07v{xv0_`m{w$yJ$c<6_SS+C@sB7W!Q#Gy@PLE0Oj`yCmwI&{N$?7VpK
zuzS2!DC_~}j?F*-&JIf_&#)?-3fX<(@y!hDu-r`J(DXl-U75<pTe;NL3A0zekcs(p
zATO~PNaNsvK)L@??A|nkyzY$R_Y>d<$%|_>s^@`u&x3}$NQA4;`SW<UWs0BJH#7%H
zQy|!>+^#%H$GX740X;dOC{jan+Rf9@+ozvlt*jK9gZo8?lq$MM+8kUMfE4aWjr8#H
z(iAThw3MP`_gFjq&lA-?$TM*c`}yvTk6y$jG;OSY3&0In44>hLNxAmrnZ(KY4CTs|
z3z*<DAHhprBC~nANEjY<?TQ(tVzQmihq7<|_UY3A^8Z@Rb8_Y;YL-f#RC<0k9Xx9=
z90Bcspes}-tG#S`a^u9@paWJM6|x$+(t*!+QMjpRp<k>M@<Q$4faQLDns(B>3Af{8
zv-=BJ0BNB@!@6<*-_zP-m6^7%=jiQSE6<Xw0}S8Ib12UqZ94YC(PB>N|6{${p0?5Y
zP!^O^=_wN1P?R?CKqO?j=dwp%h*ZP~av>N-=I|oB-Pq4Gl`M=HSr0X}W3Xwj&I+@m
z#{~;R6H(k#!h@2ur0}K0>L|{eExU_pw|tvMM#FxzBE5=Bav!-~$FE8jW&Rs8RO7mg
z`yOc;Iz%$C1WG~J?;@CT=p_dv4k|<>*d=(AuIEfV=+#qC8yD7hoMKfd966x`g`kt6
z)AGw2I@8%)jNhVhWFBp6eex`3%Z0WR9W(3n+7obb<(NO(Tfb?iccZT8?Dol(c^`^g
zdEVK{$sTtn>YPyw`a%@ke$w3ff%f`lm94)Dy>@$=gYdj+eS!Xd8D$f;Z8N9o7j$ON
zQR`DHHrY6z;ntF<r(C-6yqeLcG(Hq33?>Jx1(Er%#~^P1L%Rp82(lP;Z_uoWSx1eQ
z?jxIvC4}77+Qk)DOn44^eJ7o74fseGb^(3zljXef7R@s2b!uT2GZDNTrWL&!9{)ir
zG;;M4SOW&DsK|+1NrU(G;~tC2BpgJqG9IMTrAwdw{UP@xH+4D}Tm;?{Zy^v&lSy77
zK&OHet{@PZ+eAz&tn{OT`JZbR4x|)UJimSmCiUM3<;n!EHYcNNK|frqdg)L9lyXsa
zs`4m}3}bD5@XUj<5ONd_?8w#TjNqUz02>l^lbqRQRn55W;lRZ7KKyBCG8A)Lg$>4m
z>Blje1d6Kb{j07wHeyOmFSq>{2dp$EFLZizQG-ekO_8>*u07!Bvg$O8Um-^ana)9u
zjbR!HH>7N0c)yz}t>n@*@!I~40o*bQ#)s1Bdcq)s86GhiL2Sa=EgiOPJ^wcE`F+OG
z_6@$A3K~uy|7n@gK7ANUVYNg--v>66Ng>Lg!tZkf(-&jaCKj~|!c*|w|I5v>TM`Qn
z8~<~vSb`@5#bbs*JtHEt$tZGnD`Xd0dLi0e4-aTcWdO(rC)M-3L^aZ9y2F{=cj;*Q
z2b&Ib;=shMemqESSB{eycEc9GX5Mr52s$uz$r<HSZy4HJM`sElhqHQ>ecu8vM(B#}
zpfK>iggaTRKH&8-k>9@e_AhF9ZOR2*mZ41+gL!-o(qsO4iqyV&(sMTF<gVr!sx5u3
zC1FqrYR*J<>JW1Z6Tv%pDSH4(SXQse2y(-RxAJn_C0?6A=dN7YJhXSup7((0u>NM<
z*`Bf6zA~BB9Rth4mbrnFA^*KZUxhs@6Zcc$LDLHCt<^(pPIN#<S?zhdO{01;ZJc^;
z{#lqv@r@paP1k5gxgHa8%6LAnUt%*2r4<X$Rsh;ViHV6OVCGFVC6x(Kf>Y^9sI8ze
zN<O)80)-_!03k@_dPZI-s{|vd(Phudh5w~8c`y6G_kRi?0)ZwiH@4NiX#}SvNXKsZ
z+?O{m93@zkpf_T~2&l}@!q%M^kx4E4YD2OmIUSH)bR?^Y=_Hra%uxy*4gC@~Ss?qh
z_}yyJ|1Wz^wF^tZ<rFoPoUyU-F6N>674R^+7~YZpYX{Z5#s6U@iWNDhirt80czOlE
z5?x+;i`GqHp2nSYnb6fCBnThBbDW8(DeUzmi`Dusg>{I<=j=XWuW$M|UC%L5;#4hj
zGLaS?lk$%zPuZL^-dtlRmS<z?PKak(QS1S$NQhc|2@k_RZ)q1w;>A&c^PmWu!gdWv
z35sd;n5mX#=Ks|os(re29`^K9akjm2YuQE7%`EZNbuUG!zQtc)7r4B|Y<tl+W<AEj
z{`6j|Omr$&K}gty4+O*&nOMKgtEJMM5%C(G$&>4mRwG-mX8_d}3lgzBogV1HonXXV
zFfyEQh@Pr(e+)VzOz{A2ss8=t{!4YnpHo%4XcV|60ziUmQc!COv>!^lr0i~C<vrSC
zSbAeBdoVXYuj^K(k-Cb&xun~L7mkouvhzG1S|>_RuLn#P>H#El!pa8>O7z}noT^_;
zQ8r~tR7^mqbn28`ksE}OH0PZqQ3oOT{~oIvv*>@fZeBabjRtvBn1^6pj1OH|S;n5y
zA?X&V!n*K_2psgZ`^|fcnt`7WDk?|i8E8bMPR6DF;#d9;Q)dFs<=Sp>8Z>FvOsOQz
zq9~#?C`BbjC@N8zB55Ecnu`XzB$_BGBqSnAlY}&ogwSLP6;h<m?`iM-?Q^d0T-ToB
z|9{`-xrepZy>4mM5u=5dFqS5J`13S?&PA?yrASm}F}CkHZ>0_Ta|%bX8H=$h#!xs%
zOwytQmBfyR;(zdVg_*IP{(U}x&m=8Ry;uvdB#~_hywcLrC5%V0bLsPIo4?_KF3;NU
zR}`~=KqAZuuN)xpcX<z)6`P+a%5cBC`i3&_p+(hU`={$g8e37Fia`++o!l}WUbI7j
z+ADiC+PW7Nq8dhA!6(GU^dP9szF>9`y<Ip}I{4ZPe8vQ_2i2*|nC{cV^OU?2)&Bsk
zLxPpXvNx{l`IMA_a&mHrtZH1z5tIcKT+*fO_dnd6w|uj(3+Gb;R-7N`Yq==URNTXK
zRT0}rM%Rv#j9gW>ja>h)7NF`SQ%uj7pZ)<g^Nw8?!u%9-S`<ItOpFN5mW)Y{HIs^=
zEJNX|xFcP)>buEyX-Db8U~rF|7FBAaKP5xT$IR8c6|-!}1){GMOClJ{h;kJwjqL2b
z7ExWFmG4vh$gh!Z=%zT+R@F`|Q1aj4YBsv}o&XuO3pZcnMVjvw<Fibn0j`KzuJ`Dr
z5&)@~=?VjC>;WY_dARtp_9wKjR6N?1LX63NYO0!X32p1vYgaY>T~F<blJMK9m}Sf%
z!YG|Jb`Qf`g-840!)207g5{6A&U7=j(UA2Oz`$M4D6s8iOnkpZ?>K`z#^{+?#3RBn
zv(mFUCRE$PyOB*u^wz<Em|~e9y2?YkR$+S=OsM(?UxH=b_@I}`Io#w%9)E1Nm6=@V
z)9DjN20t3QJAe->aqG24&u5<|s${+o3XqvkRm)6%G_L;Q`-xFM+@rsI%@~JZQkdt{
zEkmAgM^JK3hax9W;xjlyYg@4C-hWEReM&<<z%yM5zj_xu>*?#)l@|C5vQ)U+-J{)S
zbwQ*BAOrgfJ|MvaXLQ!+-4l7F0P<qNu`ncseHHVm;HZH2#mPlvdp-rXgpO*)q*cM6
zj|T-E5}FPw9Y{5u-|pE_cmH$(vm|u7ZE@Q7Xs;+ipwT48$}R5IMw02;pSoIx-J&Ts
zei|ytaJRX`_7`c5d^KyP;dwCA@IH0nHFlMARU6e7b#ldEjCqW1z9*EniLb*lpoc>b
zPr>)J)j!Gd{n-h#LnrJI07L%OhzM!X*4xI_JhdF_ADbm-cx2A;fHRZE4eYK*&Vw9E
z${I}uQWqW-{$n4YTI&d}0gNv+Pc(!ZH-F`b;lt;};y;Gy_s!OqIfJW~ur>DgS0836
z`(elWzrh8WXkMYP;0KH#JApERhU^(LQ%3>v?RD1I#)JD5=p7@ClL{8rWz3ZErB4xC
zIfMy|1eQNy6^tkXB~TLxK94C|Zu9O9VKZaVcl>jMNJ#ICandk^L<Bac<p$nVwLgFE
z*z^ss91E3T`etZdSohBKe}zelw|skXwHcg?raCAjb^V!jH8T^&51UVJg=hIx5Z{&e
zVo|+jgfd7Z#PMFG1H$u6>3~JX*kuWx;>9r#{=UzkwKD%!C3_;R*NnaNqQj*Zlh2<$
zudnDm{e1iH=g!?LZEyQjIp4X>*QMh&%M!W;yUKpOXmoj*f$G_EiQU@Hpa)>2az8kg
zKubi;+HfTFczqHd^3R2D@m?G-dD5guw>!(cgq{}8OE^FE0+dp)R~tb4ASr=q23}8h
z(Sh`MYq)1G^e44qW7(GMX+1nO!`h5(o4Uw)6U-9Y6*4C$wI~CZOIRRg?Y)s|@Z5t|
zD*F73-AtcTw|cIB1vXZKWQ_Z?p_LZx)9Lx@w!R5pgd6*66`z|12=>uIuZS;5E3!nP
z1zHboH{j_c;gZW{?xzl+t7L^63N;j9&Gkd?a1Dsb?)Z`4M*{rep#_h=xl`7DA<6?6
zUbrS$s84kUue02__`8AH($I{}AW~C)e!eH<NG{o0d&g}n?WB`mE1btk5xnq1=QKIV
z-$emk=$mA04g^1XoT*H;B0OOzBvLCjP7lvrMQY;bpC4#;-{s~q(V3w_K)WFd8G!m-
zZcAT~nb~6U=&^oBci}IKg{*VQ9Z9C|!GT*R)c>A|Med=dm}9xN*mz$m9@oW9%J*H-
zM=(YUOI>^{H&0cUDBO8rqvnXP#o@VRn`=u)ivYx^vwZt~zP`FdJ!BfaOqNN)wg7Ho
zt_Kdt0uPa(NK6d<?oe<4Z#A;7q-xL<LmAVH9ZIl`xli&=K5?3<D0CEluPrI1Q+|FJ
z$3_FCa|H}Mw+ujeK2^U`+rt#Do@E+?R?s#`7S+VxZW+VArJLt`-XT{Mdp24l-)g%t
zNx0H2nDV6!0+dVEvcFP_S`gc8%5>&wAQ^DDTe=$DJ*6o(O6un}G5?GS2)*MucT><=
zv;c~?4)ggpH!}>glZ%NQ#Pi|b@}?<Q+-!nVD<4%~IA^Ekkz~<SFyK(@^&$m|1_PwW
z9#;)wj6}`ZoAvHJ=A4|2Z7N%g&-qv}ku{~fLPutn@U+b}v3$T}3^PVj^j!QhG4X#7
zqHDU={e5Lllde%C2(SYJWJ}4;b~e^0*TtDhe%~=QG<}_;<3l<#5RY5a!;Y5rln|!`
zPXQgf-P#sQ)r^KID!%<zuwDp9xdD=Qw2JMx7vDbRs}5_dGEJS-eQ5aL*)d2{)~|0&
zvgjx~`%P$MWR~4lP;3r+VO~f`2=*KAwnr@q&$Xy~`|$pKkw#iczzZS})9KbGDr1pT
z2MN+~jRzx1A{*hAbdTwPb~M(!^j4mlITKK7U^0cjL~-q!fX_c=^#6oS83j<5)RTi(
zom)AN3k8SVdf;AUc2xX#;#+SsX~%EVANh*IEZKJTyMRxV81|y_ZFRNetH}~T*@>51
zU*IsIz-fv764cx+A;*|r(e7?@#-m3kb9NqwX_?Ye|3atJ<}!E}A%^25SWSy}r*!^O
z86K5ba79nCe}r%06UuVxGNBMzD@!%mci_N186W!B{Ro=#XrzPcaTilJUuU#=#^Do7
z<hO=?jmZ4#S8x66$1l+EBeJ2TeRl59tdM6V_}6*dIsWWurepj&rVG<p5i=A}oQqF!
zdOjqiHS0>x4}23rE9CU+=T>}{Dd7(_HTSQ$p!~vKjdif&w<YLv?rS9K(8dTHlAjO4
zVA6hA+a(ZH$lAKK5fv+Kc*vVh9&OL|9M5a^V~aGe0}i=lQ9~*IcVBqt3(`FhuxwqT
z>APn2#P^GFG2LSv?rLdrd9lS=XC5V~3<W%_)8fT{ndJVa<S+7P!eQ!158*pcB~P<>
zk0$Dth5w@R04!Z7=(tcTp-p+4(g`!9v-JP%z1no`@<VY+_>T~zAbx(Tr8~U*f!bNU
z>Y3)8F(zIf7mJbtN&i+kc<n^Bt|$9jX^-8Jg5a{`YEsRqoJrewy^cxMYDDI3)5o}K
z{RqD@hBcOit~>99K?hxF;TTVMH#Y(AI-9p@R?W}bo4Uejhq>0%n=?Jy-f~OczG$!u
z?wCT%gSp_}C>Xk0kvB>_B>nqb(f<9jj_SRy>d8}o-fO_k(M)=Al|RMKik=D7Dl@CG
z&VyMrX4A5-JRVhXXUn;jp-O1L%I}CI_?I(N@!~{Dt7?f_OavTQ2M|if<ObtEF!xK3
znMMX3RVVkl`ySzjFa{IGJiJ44kl1>Hx7l47^S=#IH8lrNjsq7sn>@J~8k*=GhGo)Q
z%9?=<{qC9U9_Z7%yieiX)JG#9r+@t!l%Jd1mhAK<YO(8>bpUl&-Uqe6T)6=bhlu+S
z41`u)@IQQa!0p7ONpKqi@DYFtt*;?O4#)-*`C{(;=YXE6AdUj6o#lD-_jSFztt(o6
z|B=1J7twbaL?+UCZ-mRc-1<_TpWPVNUmslBucJG-PwyOOAx*;9t3##s@OQtOYtvd9
ziw{Ggb;u4J`K8zAhiPdc*RCm-I79HPKwT@8si1sR<qsp@e3jOkec<q6ycIfbk%d(Q
z3c=HKZ*~Kf<NQmi=Z?6lxS0Ne7Ee7mE(M{9<mvfkE`gu~q6(CJ0s_ENnCvUcE-@}P
z9lgXz5?;pXWp3yjXzUo>l1y0KIfmNmcC%4<u92~aX3v|;#^P%hOqcE@FRY`cHs@zE
z4I)>0{@Lguy93VTt$T|rzu2rKjQl|>TzBM;q^T3b`~pHp-ib^@SU-Y!U^dIW7|^gR
z@8!$H@UkgXtP2X>b=aLQ<woqIFcBgIzUq;Hfc>McCJa%s`WkWa^xQMm%e?j6eA*sd
z6P{0iCtHhUQw$RcH%}@H40Yk)`U#gO_+I|uJ<hTCNL*_a$gKq7E^J1J9S(Sy$6E#-
zqspYF8du&c_TO75B#q<t+a#p#x|J%uMcw|Dvdz->2lMS=2>i02@;IcHb?zVm1XZ&z
z*PjxSCwaJElW)Y*2K%FGhHww`bJ{bO>=^9m;gM`IFD~N?ivb4kwFo#X%^k|~EsQ(%
z7;Y8-AhD38FP1HpCfa#P%f`|Q-j}nf+^BK8z*n|Y%TQ^lv08bg1ZkCZug7Nw1@U18
zhbk5-(nS;v4uwKyBNW6BuF=8{oo8!MzO|*WOnWav@h6GD`&^i|J|-#|vq^Ab3Giwm
zxq5t(>1oA=o2zf|23)d_kBDA6dJ?FUs3svOcy>3vvv3R$-X8*&qLmio6~Ot&T<Yh?
zqFO6GI(y(w_zJ7TT6>6h0{ov^DlS3ySz-B`&Ux*d4jI0R+^kbnwt9N*QQbWU=vO6c
zts)lY#%{<hbvlO&FD=^0<0IGR?gXWATKG*h_rhe4i+3g-x#_fI<niLkOPF@SU#lq3
zTE+BEU6dQiwdThI>WH6G_c`OtQ;YNhTD<7;E<A-(SDdW(m>1O@v9wM%Ac0o4des|l
z0(MDIjZ$G#h<c|g*AzxDtK*UH(k1o_38k?GR1Z>EZ?7b5D4B_$N^w8|HgI#Zs5iTG
zX_X+esyV!d%1l)B(9tATbg|P?Z%?=Zs!Kh|cx-t?ljoUHCC{^k1eL@GGyoLhcV@We
zbLC9wL%FsOL@udGs(pK}>r~O^!)${$;|ArFH8HOQwB<~WY!PJbsbIs|H`tnRt^Doy
z0w@%LdO}Z0IbGI@k+C1hJLFo;GBWD9{mVhi62Ar7spyi$8cEEAzScgOP=OUa0(@vc
zF?joTE6Q6rs=h<Cur}c|d3tS6o>A<7<qV3mVq`O4o@mxGlRV+A`M0fKBlav-l97xH
zINdQ=DHsDJCdKyZE8c^a`e&WNyLX4D>e3w3O!IBDId5Bwu0lZqw*Au(35O>JqQEvF
z7N}c-S%84}pzepZU3|fpOH*4qAOq#{f3LqZwKKxt8GE~k0eHZ9@qB?&sd+`8zB>Jw
zVearVR*7tF_`5UM#gU`|NG)}OJd`J{&}|6`8x2I^)OIMx*qw6@><sDZhT1BM>@6Py
z)<`UCGA(kt*V!U!2_+%wLxEKb11o+kI2JT0%)78%B{H#c;nA9IRU2*!tvsJg=lsHw
zQ+8JoYTRd9UaTJ^CkQD_bMbF&o&b>cXJi{UM{P90a~Uj5_{xjl9`87R&KyDJIm3}8
zjmm?<cA_!J$;lB2EWv@pA!KDpBiNgAZ2YJVvR(Ej-MthY-5m%50!b_oo<7~Hd5spE
zx11YYKD`<*#xY<C#hwE`H07J}nkTYiNyMVM$+LT8E4xJe*xP}mEarDUV<XL`y!@62
z2l3h!He+X-axS5P$Syn5-rwxS+}g7YU#Ws#?-#C&SnQznDHgm4*MD1$%h`K8y&vv9
zCI3xr?JsAI9;LE>U#JNl>`5Ck?bX^%C|jp+WTfgvWrtt~n@e6w2ZXhtM`KYts{3)A
zPVUej7bc+{12Y0Y_?~k~8}fga;Y=JS1!Vk3%ev`PU%F9<CYoO^vwZtGbLNQ6%{3{_
zj)lCqkdaOfY6DUlBG(J4EHPK0v=;8w6Q@vxyt=((s^E>x*QGqg$LPiPm+>hddPTJI
zKXVUzigzr2z0Xw8K~~)vMvc<xCNOmY#^4X0@Zp3x9x>?nKfkg%EVL6A&6<dZ#Hp5;
z1_}jrSK5Vv(ox#qOIiz$A1vu+l3+aN9gijaaEMVH2;mHVr=A9ZYCg7us~IQbHkG&+
zHxHyJ6uLA)QNx>#1*VghQ0^=JN09qR|L@sclhl>TCFB+AB*#BH3~|dI$?Yv`n8ZHe
zF9F%xq>6t$*$lY($A|;hqM>lkx2`N;cOyj#%8;%sXDckTyN~*VZb8`WYB!Y+U3P0r
zcaji4oRkx+#Oy&^WO=Wyuj}b}>x=GLm){z)U2gH~g+M@1h{!EBJJBAAJt7TNdGG3|
zT_UviJ$anD9~+DNFBVMo5muLIygZc-kU$>zr_>J<^cgXITg|?o;wWPe?oG;B#U@}e
zg0Y9YSaSK&ndyY8KRzIMkN%W9Rr>FN>YKged0+b;-3_!NjjL2Yo$TC3$7!!yX*Ziz
zpF7TnIAo}N+zP&uSfqtfC$e0Vj8}lAe>9Fp3$)sDnYpo)(SxDJ__A=4fRkdM$Uhdq
z!d#h}ZhNge)E|p$Ex$aS(G@nOch$?t8PGL=s~Vb`qF*O_zYAD%W+$sMIYUNKjkQ23
ze>z8$6cQ3j*uc}k5+;D@2pIy(u}bq5>-J`YiSy5$1%-g=Bzqae4g-E{G*0Uff?%oC
ziFdW0py8A7AE{Ej<F|+4rD4DXI*-Isn7yAo*_SLrvsS|J;`jG{3q<%~_GDYFdof?$
zD?4r60ao#9d=|kIJncRWt-!n&Dit30JyXDb-je(^XCF}J?IRbz16OS=!FLi%nSq}k
zk|g$18lXNEyD4<VlAgq>u_P^$n?QGkUD`jB<|vTmG)_OU+lC%oOiB#g{(WVT=H@Pt
zN@55bFCpPhO$;t2woTgdS)RExF9cWnf+t;>ZhmEY%bUiOy?^7jtFV2BKhgy@7n)|)
zJ=_L-pgZpRqjsiJ2d|RcE&(pDe2nqa^S5y8Kyz^+DfugYxCf)QeHg`AHF|31-*Z+X
znMq9RCh(bs$XocmZ+YjVE+N5|)Q2p?_@Vx1Sb}bSbG!X3{!xU<th90WTo=7mWPfB>
zfDIJA+dKL1TibD~vCW>utkDvULHkK@l(Iq~23m56gD>oCWM*x))8KFs7sa7)HhJNI
z9i(s=LJtMp2_WJ0{qxG7Y+St$nzmb*EVI2P4WpVVl`XSS0m{r0Jq_<_1emTfY|tvW
zedq(`FNplD#U|@1U+(uLCW<{OUryI<Ye)A|GhR+(WBYN*^*tp1V8!L<_lhB0PBGX3
zRc<MhY2u3uv!rw6iK)MTWIMB>M2v_87XMV_AxA@l0lTCXZD`y-{EURth!tc;0<(}P
z4BYy@JLw0H%eS#-TSlj_J0KWf7c!lvKu!|tcST7dwowTSBUbEvIUcp`LCdSQT+AlO
zAv72QDufC0!91=vM}luq^VGcK+GS(=P$3F$ze2#a?`1oFt=W9AT*O~I2qub+9#mRW
z8gh&volKUy_DcB{7gfrx^IfQL;DN;|^c}w&cg&_N0#EVgiMnOEl+oEEM-;50*kMlD
z_h)}g0@-9di4UFJtLjZbQ?~XHk4!{bVN7{=%GdT3!?#8n61RWxLE5pOW;Znj`6Q*e
zsCE@nE$}sfHUTb*&4!oGxOFs(RGT!err*Bf0ajInJHo8htsfr)p}P5RUtL?-`d5SH
z?_P**q%HO-iU=YWTnTGdv84lJK@HS3Vy3WgTV2-A<WobSo&~xDHo@&4P=CAe-?qXt
zYVtlCyRaX0dhh5?b(xbpv*m3Ep|CUhnzeAqki`3tff9qes%p)M`P*LUj}_}ja)Pb-
zh?2*T9x?Wm#(F|$)-i06D`(k&Sk9pv!ie`a>Rd0uchU<%y)?c)TW<j_Dm;kAakJQY
z3PKa<R!U3tdf_{L541}_<b<-7=)Wb#YVC0U_Gu?$RdX-i6$5}=2=}4|D1D8uF5AyU
z7Fuf{gH8gR{9~{%otYr?KOpeQH3e%YxK*bC*6;@(o}aN71l<|7<88aC-z5<WCl54&
zO4a7b-m{4<6+<zjf2Y0im_6J1bk{+@pQtrhEF9N<uzt^AtJdd}R@`aZ=YWB-+WerO
z%ZmH%ZBsC9RnG$Fpd+saKfIs3(%V3Fx%@MO?)wxM&X|7g&5`RKA6sUu8Qx>gbkAM8
zHcl;b>T)esb#qP4lP7P=w6iA$Y+(#%>-(d#+48bD-nBk3%LcpD{m`d%Fl5V~Er=N%
z?lCMw1;vm4X^rE&(Tf&p)=?J8TS|BL`Oa#4NSMLQ?h*ESk6kwBthPr^H7-N}Y7YIU
z{`}mUxbVOxmL#&%&0~F>nduGteiWsk8w1_S2U~#3yY1A)&5VbL1P2+4u=5tf8!10N
z4nWt??wpjTT)sA5*FQ_U?g%q^i4(!|kl9^DaRfW^gEFo>bEb0aN}JdK1bxMHU_B_A
zS<Mh@llAtZzr_$WHRJl@@`dNdTRn6taEKQx(%Foj&3iJG*{$cNxZl%eRD1R|BcQa8
zYdL75->nVtM5miZx7iXe#(#g-uk|U{cH2p5Y{=R=;D&CuT~8j`#Kvt^uC>{)di4$e
ziK=oAUaxEK009o%@$-vR>wtv=wxQVWD`wu<P%?xOpmk23Mj_9hURp4SO^Q2TyY^wN
z?*Iqy`IoG|zE5xPTIX6&_U;{$@msH-B-+i?cuk$+C||w-CP?f9U4yFRCxA)H$8=37
zaXdbHhjo6hg|elgRe=B57ffIfYjjRnBS^T1`r+y%^)z8}bY{n|AER0YbbeOVAfP0Z
zzjhy2*pH7MAkog(@};EGs<tX}z;iS9NE;qL8yFa_9jeq3CQEDp2&;~#qPg+xkRJZM
zB0@2#%`dOL{oA5tSwZzC;?O;mW)!XUpYn~y6uVExgpX@jur6bv_oGbv#|x&m+~+|@
zU2Sf9H1x1=rPgVq()N7Yw(Z*0J)bx33*NFD0KhQCxo~1=AIC96PL62rB`^Lreec_v
znz1V5+v+4#P$V8xK#Ve<Qqo+#Ec8l}jg7mKs=5FCOYg`N>*}2&jWhOjAG+mh`tYr2
zEBdP0FAXWsczJpw)c4x22|X3{*Va4x=+0ny>B0ePT_XI-wu~k~g?092Y19cCFjCNf
z!UecBA~9DOIdX1iCH1{S*Pc3dEcix#w6XOJ1w9t|dP8QmiqlCZ=cy+;@U!@f^~UCy
zQumkssqyWDQYwyF{j|4zUlMO5vzxUw4dYEuOa?HZ{sX^;vMUF552=1;8^>bT7Z>OC
ziIaAuWLoI{hU_Y3+kE@-#8<K(JmjKW*C==ebe^|hK^ZJzFoNM(hJCf%CNMKNe9Dw%
z^+)BeE3*JWtj|!kdMJ$v_hVWl%m*s}>bYaRb#P|HpGU}3l$*;QdhU%w5#7E<KVZ?q
zqN0Mte(K9-1O!wMJ2^tH2PT>gFE#8Bq^};LsTl~l$#To_0s1IjR-5V?%23Hc>sNmB
zwZHPv*CX!T%dCzON#l6l$k8ft!~0;-y#k}pKS%WE_b;|9MkC;0Zg-SQzo^|q-J=r|
z%{nh;0C&O<Ss&dHDM`5xD+6C&ot(Nv?-+XwS(+L5EKM#|S?^f;PR)Z31^uf9=#*5s
zY2ob!)}|jh*p-{`%%r!7Oy3lcWrN`!&SMPaai{uC95r#&sCilo(wf43*v-3<sx@8z
zjqj(gc?;T9o{Nb&M$+&<yW>|s(-Y#;B5v;1FTm^R0rm%M{q>_v&%nSM%u-bOlwCV2
z{%UlLAK5)q=I`6TtIIU$E*-q&uaAMC8qjg{#QV{{W^7%sdU(iCRlZ;VqKP8M;Y^pU
zZB~hO-Rka+Yh+-d?g?vRIX=7}neT-2p`Tnp3ZwE)D!Te>-AGSUe+4}ssj^DPcMH`u
z?{{y{@U*tje*)c#MOy-aVkLoFeYp-kqUHIq!(nSLk)iW?p^=@HHROas2R=`@)S!v|
zJF^n2vTokox&4c-I`+0Rl1+W^(XpVgvXg7`VndmUv5QVvqbhweD(cpT+_JI{A1-G*
z-@4svmZA4+-)Wz6Uw<l(eNz54ProqgE@R+7xQ(crStI+IyE%Tt`#I5j&dOhpVIcDt
zB{6!pdqh`D-9vp-XLQxdajcEn)ihJt=-*H&KYpP?doQOdbEKrqBudLpI?{cp?)8jR
zHN9g*2pCh~G|z86GjF|i^IKT(5;=Jt9x1y;I~2VX<NRRK1eX4|VlzL0b@Vx<jx>|@
zoBEXw_%RkKt|)R)28-==Kyc}Rp0qty2$BTY$L;1A_M5yxfoc44-1vl0&%~hxIi<E9
zA4-<~{_?d<{l;>aW%DeTFStGOfLFe~@o$GXv%t8}8?0u=TAUr~xt;1Yz75lRzQp^(
zor%YPSFfEJkl%#vLd>}X+(=}e(F#QRb|a%INq*x@4nZl6^J?f%(V!oa)4XA{VA9Rw
z8{e#7^e1<;(J5b{pqDtLlTJkIA>zSMas^+x!e;X$n>@7g7^03eo{y|PSgBg^WWFqg
zBQW<aV(UO|7b}qxe*bFjr@TU9dE!Q~WC716DSM`GN&{~wUNH{;(C7B|wqp7Q<KW?B
zUe@OT&pe=7Nzcqw(9$}MN*>@skCKigXuIj!oMRW<Bxi@`zK$rHd?V@SkIvP%lfwoq
ze6e;yeO2D!w1Mss`Pq2{@|q&^miVEQE-THAjd2a^*tBV*e>+u8_iuvcnY^*QTg>R5
z0|!2d+R<WX_;FD;#~4QZylCL4e$rVNBJ}7Fc6ph5wg)Z$JCL_~dPYjRtxmUHCwdJW
z_{Oodgm2~q(-XP+A>U$bKITq(>~oc!TzbdKpI_|6;KJ&{1n16<mT1fsbw*GV(AARH
zln#jX8j}GY5b>`@6^C7u_=az`)fwm>9%#Tdi~TlAuLm!3n!`e&#Yif8^WM|3=y^=m
zi0IcAIb#||>Is8nSgx|f#t)^X=YJ)^@uXK*ov5omG}GqeZ;EeRqOMBaq0yTU6LzVl
zr_S0==&^78{F)JAdHW2BZUb$~flry{)0%Jf?EQC)Xj7UPSN7?2puvd`<|Aj!>9)(<
zAKyT{)44_y+zoDjM3Zit=UwDGjcEkQ*9!vj!=nqT1p-Eeq8qaEj;ExLluEBzh^2za
zB!PvGn-uhh5JRcGRM)?mA{NX6?^33q9p(x~t+T}c9k1jNrH-H#m5E1uj#>vW?a(T&
z-zG>j^^Mx<2X#Y4{T-&Y_t2q3Rol;Mc%Aq4V8H2WdUckKDwS8BYlLr!ZNFQm^=xfN
zQqCI2s;tR6m3w(!w@%Z|=gyh4;``@PfV98xh$&L(8LgrE|3c#1uPU*v(AtIvOy>nh
zR!~#F2ko?BL(`%D6R*tr_5%Q6yhqbUK6QbsO{kxrU!r1ZB-8DV4}<t%8XAeu9I7{I
zvS;i9PNTU6)+zyTM&cgx7AGX0)(tto;H<XLj!bh{QElcGc}xL!%?yY5BB`7chYjY=
zwf#Do%Oc3GurYBOs~;V)85A`J5ZJAuayM%;Vn+L&IrC<9@$_oD?Mz+(8s$z)wzBj+
z7P~3wG@}6%+<sHZ@|G3Hjf?s)|6}i=DyqNN(1$~hwvICmz#yY4r)6+^2M2(8-~i-!
zZbm7^nu$!uzS-XFz);V0++PnKHn^Uxx$>W@-tB7oht`9A@7S!7cC?+)-}UM3>(>{{
zP3X}^XRQIpj$n9gR>oh+O_kw5)hBtk@da@~vZf{ng2N=wnh#6TIi}zmsWHnC_G|rT
zccT;5UF0<G)2%FA?;i5()mw)Rf4MJbDWO2VM5;i*X`r2E*kpW60g(M_#TJdzzWmqS
zXyYG(P8d+7dKbL5S7S6!AHf&n2rP4FBG1Vn&2H2&1y<)gB7aLJY^>;k=-4nM>ip;o
z@xuADrwJ;7G&SLeC%-gy{mVofi8@Jst<zNhvCH4`rc<o7dq$pjv<X$vJFT|GKQJ(Z
zPKz_mf!`OQud}wDu2272t5++~tq;+RZHutz3w7VlVrCDp$24xEN-X$uch)XlBPW{}
zboT67jaQ=ykfT6hMGpAh)O4f3ZW|V(=EGDenZrz@cym-{hyf#{+HrRHdcw>OgVcqF
zhUaaDJdV+aQ481a|J*hXebHfuf*W7p!|T9%oY8}V2Ii#1<=ev#GRpf1D&VMsSXCj_
zx&KCkwC7|NFhbOs%<8;i0VV^j@)Q+vIZI@ALt$d2cfa`8H}%z~53{m-7~t!%U_o~8
zslQ*YQtrE@^75tj($e>B<0w7PekLE}XquRwV11F|`0+;pkNvnp(W@SGgk1}*A^JY2
z{CDmA0&4qLQ{L?j?;mvsw9lQ^$RN>oOq0dljndLm2iMmQHQcl6i5a)zsKtfiHCX9T
z-|>Rt-sHZ#&#otT3km|!^aew1ZTdK7n*Q&9(r}yeGrq99E-NcCGH`Hvucsq&Ixj9f
zkeZ3g<;CqCJ6KmT2Q?ve(2B=@jo?XBs@y}Zn-3;RILX9uaF(F}NYw&;NJVE@`hwH{
zR{C_a3kJTQ9^(@2n`hzfgD*Wo$UD0_^aR1^N{>P_N2v&L_vux%H`vzO($Z8&9DUHW
zsGD86QV;PPy$@o#hT#)|&<JQj0A@FDA|^pIyaFd00p7C&J9Pf(#{_Eg_d)A>lMBNP
zL&TgIJA*>bo%3YQK@3zi)mL}}8*mXISCEoE!g`o>MXT=a>Oh`OdXqvh-9m-8l?GKw
zNvQ-von>M{G~I8jtA+Ird2<a`ef8zfyy;rCn?I?Ry!^54x`LriZ^CTuxgUJD*A6$5
z<u|Hnxz$fQrhqAi7(>`Xmwdl0_H>U_{eRQEME{YhqeRo9?roKYw$UqCDPBK-vWLZL
zc((o!rKRYOfj0qY*d+Fl`Pb4?9aba-OtkAweio&-WNRrjtao{--=RV05eyHuRh2}M
zrfQSP(BMAqq~D1X581+GSnJGCYmQVdMOzwAXA9o*^mlo#w>U<tR06((oQTR67zaR)
z2LVPydRw-9d9IZfbO6gx;$KZJ9*s%wEf8dU6}C533>P&O^LOj%w#a(4jxw{@uEeZB
z8fe867Q)X%MUyPnBq}aAqYr?KNKXM<5o(g1{J-S)Vz&!4k)>R^4<)JI5)CRRH9JMy
z*hKQRCmzvSb${Rd!1E*9R~yWnxyx;N6Q}*8hU=B~HaCp^>-yw&S)Km#SQAXmVlyKY
z^xCv-J02mKr;T-yrq>)O)mg_BpnOghT7>7SZ2h{L@01Vp1o<miBMk62Zyv6wxQB<R
zvgy-O+ErXtZ~bbj5-xynJV=ktrss6u6@esrh4O@ai?qap)T(?ZvQ?E<4TlzlGLw&=
z&zLwoFq^|xs0HS(2;9539eGz;PUEi*Jr$Q3u3COf0dc4}mjqYNp4#hyefsnfJ#tfH
zZMLYDa33K}(MQFNLy_E9W1|vN0crQ{y*uDRH%PL206I)BC`5KX0~Z`qV1;u|-sQ_G
z5oVDm@TnIQ)2#78E9z6Ds-|`${hj8UK~RKNU~R&;nDR#~NQ7<&t*oh;zi?qUV&r@D
zEv&x1mimro@`XCSd!IgHMwpNZf58J`ABsSV^-oRMukF#q2z>zGf%L)NP~Ggfp*>PF
zUpJ`4#>6a?+3o2mk;wL%5zRW?UWcF}XsE+neLMZ(faID^2A#TgEkhmt&@sMeuU@%t
z^Cs(c50U%3=JRsOh%KlVl-4%lVtsXiKWL>24TTsR;FT35ZdEVB%R{UN;op3OBN#q-
z@cGC<8i=PWKYaYyW59rnb2{Fz>Sz0VCgp<rGYOcp;sWWNeVv`VJj}OOqnw5xq+y7&
zJ#jo(X<6dN{X`}5z!B?yyb6{UnOkcunPmtDAg&Lh0Gd8_sybdD=V*N(m_b2Zh8t1R
zGsMr&rOdt=nkgHe`<NQ~G7#<x<r0>c-M+nJ)Qz7n1xg_(3}@$+fF3`ge(b9hixyjM
z`_fiO{dynVjS_G#vsHQKvln;Ub;HQQLc9>HV#F3@=f<MV2R~J8nzQ=fs`&5clxIMf
zlUKF=rb?_o6N)T=S9335->9*p;TWO^8<$_as0)?LE+}X|F?y(bq-?;cl#~?p>w_qa
zAwoLEU3urcZ0I=`8=IHQ$GN<Jbacp^A?>U8S;c^qbXWA-4_xibaov}_PE*06`~Sj~
zg5IKx%mZ&n{kDoPFpnraPo4gh4jkc|t+97PO-HYV{vkOL@3zOqUQV*Gp$XNn9@Kui
zdv7LSD9)HJ604smOI{lE=r8XT@XtGcDmht7Ps4qM>bP--(c|B6M4UN&?;7R4W-L7b
zeUM6fo#=aevm)GNrCWe#YbrK+$hWUv&SynB4+@9!<q^=;{bYSj@Q;6fto+7FSFc`W
ztJJ%<Z-+=7YNL~2jRUbi${TFFZ*J0>{NQN7)^PuTUTx01PEt|bIo~94yryQ-n+8o4
zl|_Yt9j9yhojy&>X=4<sPg8jfm$mMh_IvN2`u7(+m7K8nMz~hjZSgnI>;;W53N5Lv
zUBJaRSD)RD@?L=4L_<b}9#Z}I&d<}{>rCY~MAfL<u7?{xW@aWfz6WACeYI<wQe*b@
z8PKxu_mudSg&EgpRM(AN`cH1&RrR*-=!#F?3f@wkZL1V{>gkVS4#9UA$^gKbMSj~8
z^?>1eR;G?$`}Eu<t+0>nZH$*JkreT9x`(T(s%C+ft$TI^>HkUe8+FgvSXJ;+imDU9
zHbf(hqKV;JJ^#x~^7p%rq3^ok2cwNc^oViehL0WV3(dntC`dF__jv@;aeln3cfe@_
z6z~UvyLIa}jU&z%UptuN1DZ`vCrf<)@guHjbIRxpo&yarI+!`fc9DI_`;TJuF(x2}
zl0_&*om^b(a=#VYyEYf*=l27q=v`4q<03O_u(EPDnmmdHur5XRXi!;v8IUB`{lAYZ
z(ZBiVFdMH<Uh;*q7*z_1By;1+G(muvTajeE<}>eJDxQ1jq?q{gb&&xT?f*jAgVyZ%
z+h+B^EMh<h(-Y(SN+%>GSsLkWIQU>XnI3i*DSu#KV3Sz~9|5XiEhUWR0ORVQ5qk_8
zRBW3Tn{N*ebB0_+#G3^wM_6a9=roP-A)q|%mU<&{clpc60i^3y^hc2|&)i^F-V^5J
zCcD+IJ^1Yh9cpfNn1UYChoO2ssHVm|wLL#<|BBFm(Ks)s83IDyupvgnd+Ysu-Qx?*
zE`kk0Uhx`r2p2<j4`JUB8W}ij;|8(~DeZdm#Fv#)s*<7sJ3nx?a_squZ!{C(DW#>n
zPw9MvMpt(Es6WwZLkRP|Sumw&xkJ&ui11!gQerdpH%Ue$KUiQO%$tPdif18do#Vqp
z!o$K!9vvMjG-y00@<aH*aq8;+w;?{ZeJ|ZBETG)9c=>aV-M^schuQ*3NDzEAa|cft
zCpVANau8mXDB^@bBG=X-S{szO48t_*M@I&e;=}}{%&gP<<R{Rw4n`w0`-JtwL;YsZ
zN0`;08uz~&{!}NY<-EM!vF~4U4{+X4h76%IORtt~{S$DKCF(tU^mslvWB$jovRJSm
z@fwP6Pvw!tFH?>svZEUmie<OP8iOrc5&#|?YD8YLjXQ<~iE+3?)yD~syN7{n%{iv<
zIFsnMpeQr*N$<=|isbm3H&lj?N%;E@9kP#J@$vSrhH>zLeWz}pLNRmm<BE@$&Ti))
zbQH)iG(sEI0<gr%5a^UY2nuLT=fPXx@d9!(fCoVLK4nUfFe3*y`-8UT#~<P^Eq1iH
z0$n~^JAsbqyi!L1y;&zdZ1VV>`-ZvjI!JNa59WPE!XJ~k<NtNsp^n{*+DtH?jWWG>
zd-E7*g@no*Op$P?lae$!C!&H7B5d-<n_VYfrnHl-^?o_B+9rA_^KGSwF=^MW)B?>v
zOo~jmjl;3xF-NsS$Bv1^vUSS0*kqq6dTFp$ZT7k8hBAN7*_kJ@nX)RY+YcT*7-%g%
zGH^&?FnLuFjLc?>0#N_t9j0$)tX&(H=<{GzDTGkz?_agQQErIlZRNrBT0D@-Vj4Qi
z2zFO|F>mURW|*{qZ_S_5vU|Y4@mIdR1D{Hui-NYO*h=w<3zuI|P$mL~2h^V14XXMt
zKg?1U*2|7;`c|C*BwW>b8wL0EHnV#tdMi<U-~FIbbt*u##|MHxvn?*()o7RbU(ePq
z&Rbt!U#~EBY|Vk@t=_3m*S)N8bfB(dtzNgjeP2~5T3+A$|I5$s(Yb2GVxg%A;Jf*D
z{47I?X9ikDtA;0*SZPQC11hjm?n_O#HayPYg!Q-D;%u5d?_AptlwY=SRvo;aZq3a~
zY-QN1#CX+&{(!+!EUY-roMF84LC2sX$2DMBFthwBdZm=B5~qTd5NS9wkO9?@m*-D2
zaDDng&6FoW0TqwFS1Of7Ds?mt?*(QwI)kgl&?hr@r=C2E9)ww3uo5tZwA75`{p0hd
zpRh(FOvq!P%FpkGYxSkEQ2Xht7tQlwHFMEZg_)qxH=d2}z-;KUh9-rK7!5uCl2ufj
ziX0Mxf|aPXM)yAGmUZh)Ua4Dl^#<4GWtU!Du8j=5i2f<$*&HmC0w%kJDvd9AvhTmp
zcDM5ph5hg&DuD7*gW~h;c|t}D7fwT^CfezV6OW@cex9^>Jq0Cu2v>w(r!Ck;P~u#f
zS^p;2J6S8&x$&p`0s`_QtSTDZ#0IjuI=g2&MOB8&muE4Yg$(@u+6go`5_EcZU%c?6
zKqc)tChRct4p!oj3&4!~H(|nc1FwLX(MF3Gi<vGl|A_JjU&@M&35khR`ZJuIltIq3
zI-Z~{qL1}Ny2k6ls->!O2OtdHD6lTiS*q*6(Ov<0mxo>LKD~ft77PJn&?m-KM~obK
z*Ea6$r%(E9Rpg?G0;Q}hZ?!xXJx(&y$)|XZmBBFbD_CjTzC(w>Lxy<qiOG`vrFZr7
z(PdC+#HdmCii%DjIns>`322MM8R2z4jiKO?T-zOyOB1&a$c{@xBi^BF*Wn{aR-URU
zur>`%Ogu%wOGXjXoqS2;ej<Sr17B~3Fljhr4mk47P%uo*NrcqHAnvW%_&Ezs=s+;z
z^s^}Con2jrO`6mfG{n^sqyZ)35WOBe8^UgQu_ZkO|6~!0H?lnum((VBD@u6t4~1h*
zjEscjhwDU0D_VHAcpnWKp^fV2P(Rhs7uLs$!*IeDBlsx)b#!#bXz!o#Km6x>>q@2A
z+v|Jz?reA{uA_|1%%w~FB7g#>!Ik3?=~<+!z@Iy7`qqnu(5qFj)?-`SaC!NXE3Lix
zY$&+CoQl05+nNrf*PLTBc@-eQS*I>~{BV!r@dO7C;VTkyX(I{L?gp_DRW&pIU}mf*
zO{l!{=mX_UdUp0Dq?Pmx)EE@qGH3gopH<GPbFFgMp*uU8o=DHi`XFR;OX73aPJk!?
zp}`!g4QMMdKM>8MW#gA9S`ch21vCNH^AL|CaQF)lP!td6)H22^D<66EXb#a!WD9T)
zhk@IYgYxYahYf41cPuO-;@0IfVmc$$2Y@KW(rCyM&ie(-Vpt-Dn=Z$lW5Rf@sG-=1
z^x;N<qutk7#S9fP337g^u3ZmTS=?J<C_`5$W?#q<Q>Wgp(#j2-f5KWp&sM#7{prc7
zb;g&J#GtN#X&|NrbPxGi<q(o<+W`U`UEfEM!9SOziW(|93fU6J`8y-IVsAW0+NAhW
z%xH+NiZ{ucB1}R&T%;v1*%mJfQEo;Z+803{iLoiH@$R#OS@>2+2w_t{jg955*36(9
z$1rFKn!-?f4=^0A8gUPJjR?sd2(l^i2E7zVTX^rHFPMyqN;kwXd3$=yA(}OUrVUf?
zhh!(myfO1<P(=uy5D|v|Hd(eT6R2_XFF5noPV`-MsB3y1d~g?rRG@31OhVT2E~UA~
z=xqG<7PsvU;g$@r{$L$jR_*piO#y{K1U$)}`a5F&{hhCCqu6TTEhNB)2B=?6t&JR@
z6`zQ#5UG;=QSIA?9N5V1hkQq~Uz}JItke<D-2XDl^g5*jL*}-`slo_|G%Z#Ua%r(^
zL~gXryWr!8579gcY8upm;w=)a5!gL=up5pZAoT%T77->~Z7f7PDeNXV7D9tAfq^em
zE`umIh727VpzWbSeX{uMyOX)U&>;$mQ^$@Sso5mJ*BGd}$2-Y_A!mWEW4$cAA=ND(
z19&(SsDktTz+R1-6>OlsGWQ@z@(~;@)cZ?7E%1JYaC-Z%A8Vj8ay&Ty-DPC%9~nF&
z0oJsF^~-*(3&8zEkqO7Ji0K99_{5gb+HA_I02S*4-y_<q^RFG%$GDtJzPT+cvr2S&
zBzE_#w9cJ6SwZ<ssed|y=UcO<z<HwYw0sI#p<g9Q8buAF28HhcnpY$PhIOv*GC=K6
zyMbGTGW0M7ld*%9dzn&kBHI#}SO6t7&uJ(D#3BR&N|MXW92Wwvw6yf2rt3Q8vxI<Q
zUc881Qc!U+Q`NO{G5@Ux{vo;V;@WW6)6-X?dnKILm3Vi(;p(>-cRjsh8Vwf?oAa)R
ziIv0aZ+(%X5CVDtT-kf@V5egWE91;AQ+X#OocxgC??Mm|wjF5Es+J$daVej%ROVMo
znHJ|(y@)hd2kQX`422FPMqv@LRUo3AaRJOi2{pV?HH$PZio(H-T!5qXanK&36Fe%v
zb-78m2WuyM<wXk@V?e<@L}zG=d&R|ryi@N$R82xIOu#zZmnu#zF!Gs4dW_%!C@rla
zIv^hT{fH4X8Ox@DL+-?k4z?Jl1Sg9u^#ofW!ulQOI0&0#y&l91VjJ)pOK+F7_9p((
z?Tzj|&#&xbxjM}Q<-K?=LLE(+&D45uu5EO_J*zPmaUFRh$gk4U(<=^Gk2HH*IJUNq
zo1*rr40x=esqjnkYcrzogY{km*c=*h;-zEfAtxtnjgtCT3-GTbL~hr~ieFQG8!Tn*
z3K>S&^kOPkjThBAPW33>PFG(MOm{ONzYiY=7hqR6)xUbpLJ>nDfg0;0ga@GVUMh0~
zkL^4R2q@~z*)3CNcNpQB;?b-^7uPU8spttaBc=dT@L~*~|30A89Y~rjWm&-4nfwIN
zD1812l14CGgLpv9dwlxz$t^Z{n4Zw}iZ%(5_9)e?#rSLM)B?ra=-F*wJHYP;Glid2
z<V)jzAQ)T`#e&%929_Z}PF-Kw{1x5aacVH=J%o||QTR|Mw4FC>V9s_(h6;V%2LwBE
zt4>eSSvzXX^6Agh(=R`hKV&uLEri1J7cWGjLdROly%VT~e$b@o2d|wc^PNB_w?JMl
z#T)uSFvXrOA9O!-3nzly3)7BnZ&`xVPJ*xS(xIbE2Sa6J_{ZISEaaz5xt04m|ICO3
z(@FQ#b0vd*e{c7<ss&_U7u^gT7pU?(${~gt`B_vA8(?0*`oA;zD4d3n;>n5>5E!(~
zC4?(il(gAI^_<zN&M7Z#;tTR|R=_A$FaogQBUDt^FNZ%8OOS#0naoIKgytcA`&@*w
zfR$JXiLAtrUP@NN_@I=_Ficzf3^bQyTx|yVQLMjkMuOU^RjZRzwyL0<RUr1?OHb!(
zcNE`m@1X5-C{Kkz7A{3_H9!(^TE5M0VabQzByAo?P}2`k*)5jZZ~_RkQKQywCo>DK
zK)7iE32!cP?IXSzfX?)&JR>?Q$fX~j?{)ow*fnt3f@q4|akqaSBPa~bP~po74FL!f
z#SS{xp{lBWv@8esY`bWZgeLtT(Hsb{j<?hoxK&Z6Y5)3eB^>dn98ba*P-08k)@HYE
z)rx8B#oWP@bI|9%0<fk|M2O`lD0vd;J4!tePPYHPWniuhCora8Vn})9m5cLp)?IdS
z5k8=(V<ro6GX+LcP2xcWU*8UB|IXg=%OzKPeFtBa*!a5*DXRa`9o;m7C<+iA&85Z#
zE#kVrZ0`QXV#4B9V<|F`#U2u08SqHZ6Cec@NLYaIZ1-P9>al^@aRH{VGcoshtOQ9w
zbH~qjgY@+ESEJpbz8>#XGVP(ST=<DY)w$vRZkgg$!e_+dRLiNMDO(9;Lp{g-=X<a@
z!YvT-q1eL1D#-}b*qn3r%h3S>#p36sV{bp2np9M{gek~we#cNa3t)_=L}3i_I1P)=
zMk#d^Pbs<a%{$CK9_Hr<fJTnr{&S1#@(Z9>j4s@v$}?pT8?_3#$R$l~bB^6SfG^7(
zEfXz08hQ~wArQyINJvTd^+W+cS%AoRuYf6;yd`s*ea`K*>&eAo-T{Cbe*omq$io1d
zhXA^WFUdQKC2!+2Ef&(2=S5j2@2ow+wO_Yx-7T!`6%-XI&G~`W=mojjN4K{$<t8b@
zeX@3x8FInE3sw{jqL{@1<1@XsBGq6-K;9eLsyeDVM!MH8L`hDzKp?Z??fw0tBq7A-
z1fE=_7C70XX#%B`NaRT35Ywgs9T1{~ZZPtckDYQ~nwWKQE%6fwD3nKqFS8Q7+S|N2
zMem}iX+9=y3K3M;Q1O3>2GrvG65&QVXwV?zC<~t=$1<LXt#a(OYe%p{jzRkmH6>Io
zri_OG2OBnWVjmFxa#y^f?8|AsMPGBJHZU<inHB>g#pX#+I->rokD8KtgSeBdVwqUv
zR4|0}0m$d7$%FgZ*@xG?2o-_><{J4n>zpiOM#n;1q_li92VD~N-{#xR%ahZwn)BtY
z)=^!G-^(+f!~?K<8L=`a%y8H4-K9iH1|Y=ARScbfX{fxsCme~L#?;xrVIPQW<p|1{
zMGIbEoKmSq3#cEc*4Z!pKBd2St3aHbyvu2JT(kNg1k=ifAJmsgy3Ics+t#9JPY$h1
zG3I^|-rkz!cKHt|+$it5qkOXsgCr$B>TgN_px*NB{zPIWrLf4rw!*K3W=-Pz_wUOL
z75V)hsN;ONac<u}rAKXV7!|4?;DxRU1F-PiD^mS&)!e~tNad*t_l15FZDk}NHRW-%
zM2XWQl1@g*dO?O6o$p^bX6#rSw9aBBn&}Cl;T6G?c%6m}34sA?)j+9NBG-qb6t)9!
zvG-t0uTHux`d3Ojj&e2~pxB^ZR3r!BAZ6rFt&g}FT(^WA+TazysovIH9d9e|!tAdI
zi-hh{mPO}Cg_9^BGqYeyxlWmTZgcIMjf{v}q!)2bqPN!BaGRJxwgLVZJCNE)Nd>>C
zpw<=qICH=$znbi*sLa1!?7eHiCN3YuJv2&U#>XP*F@`qZzP9}M{(VAI!r8Nf=xC)W
zCgH)vlo%A#B)57cb|9?bM~`{gdoP^PuP=vpgze0PBK}qUyLpG;mQ&L~0>oN)F)e_G
zU{`#+x_H9l1G5BhdT5~5DFhzsTiO0Qhl&y59}E*!qPDhn-cM}-&@y04olleiKv%DK
z0~w>C$QivgO*`u4*rcNQq$QH|8I2CsJX)bYvVHn)17)y~jUph}(-jYUm~?KH0WoNO
za)X#NFwtMV?m)i1_~AHc(lKUagg@|poL4Z_A2})^8yN)SnWBHKsw#MYUy@slDqwxR
zjM=gxyP59Ez8y?r0B)8nS&7AmtLuZbv`f#@cLL;L;I2w>?tAUpHMbAfUp#*<tbYt;
zTrclT+?kmu5V^lb7*S#=VxCVMqD(dICb>Ax{n6S9keK(buY2LBbfiOEV;O;(Q1q^%
zBHdH6H6ZTqUSfEYh<S%JF&V<v+wLp9A>Ghjgm!!eYd9?*<IbPIeqF_1Ne|KvB841-
z1*cwvJ+AD0dVvwJcEBX(Rmsyry{W5+kHa)HPGM%nBZc^1=p8U-t0Qy-Ps7yX%&Aju
zMXZ6@Yqj6!<<eTi1n2)+HMw2y+WA|Z?h0i+h8nHNaebAIM4Wdic1t2~Ns#9(3~iq}
z#)rr5O!FC#_D4Fss=-Z*+DC+A9QBTH$N1nVaTj?9&^Lg=QT?0w;xb=STI&2@MK*EZ
z6{Q80ub`4eqXlU8(%pixhBsTr)dEU6dg_!woQZEHtq*k}lOQfU0kscP5%uH@B0W@)
zDl6}{5nbb>NAf&%%6Yf!VzxC?w5G0Bab~BD3d&$kk5SaJlN2hgrKDHzceKc{n`<pZ
z`NPN0T@jjPBesy(=0^&~eBJX4ZvE6aXTUtF{Bc1k5Y3`Wb7gkDY{cThN+lqn>T1%-
z3F~@jEN_;5xUNDaHtQmyX99{C(8Jd%%_de{QAFSe(^tP0+l*I5(>U-)j-5(=)0Z<x
zV$oZ$Kxkp2ET@WWNn)Tj+16BXNdkE8D{~L02zgCcE=tSw3lfkn@*@^@oF-VizzOLV
zK)cK*my;9(9>)~e-|z@)Y6QB7JiyPp5rUW20}M%T-aMl`op0a1DNdO17iF-@md_>O
z^-e7<G^@Ul7huvHlXJq9cldBdiis?OK;f#X_x}W{RzmB15qB+wD9mei0>*E!vDQZ8
z1&(NXeQkF-fOpi}Kv72pbt2t=Liv^2+gvvs%QJX-H=Rmbp&li52!snVRrF+_0VY9p
zfF+`)Aeeycpj$s<Ts5d2Pl#778aS#sx~{O)3xpaf6||t9P1}uS#0ah!rUX1<NPU(e
zj!|zJbLzU#z1E2-j(A(sH=+A7GtK=Nox69hHHv&T-BI9ugzTYoqLyuR%bPm;W_iYN
z#Yr!nwQ%e~5hf>BwBpX=R7xM_?D%b@&cPfk{4<!Lz541UN{|5q=FyC<|GrGF%C!|0
z7|m~9d4m>tk;nSBqC#)$)*rnNpM^!k%axXbdx5kxDm~i+H5hr<iXcukf4QnC{sa$J
z(Xut@MT6T~2GIKZR9bOOLDoqr5?Qq_{2N!F4=*wI7`lCGyw(3BQQMSCfp#~feTJq7
zKD9d@<DK`GLS-A@U9@6!LI9-#KJ8^4OiQH~?^wjoCrhX$UNu8|(*OPFQKLeLlCuo2
zZawmuw$RooZpZImV0lTY4Qm;U2tdw3K;@Mwqb`(cJ<7K(vHC_qf7v7LJ9RQ@w_?=D
zkwkIhLE#Y*@3}^NClc3Vxy^+-h!_VH9!Ay)114k-L_^_hN&6-A@<79t=W=MziEI}w
zZ*TXxTGy|8cb(tATJn*d^%-YjZ;{sFuE90*V#I0ze&@!?n@=}yqk8_E+f?io#2V9J
zB`$I6xcpwUZXW;?i07A0Jq^b9_Yp-R?PMoL0|bCYh@}}9#Rgyj4Y}T&Ibzh*_S-3Y
z-Zu~q1=3_{+>rd?)XVhD0=N_QfD=pBvA?9C+SbMztP%)qkZ>MI&fL^)wo{cjX`<+5
z5=iNQmqnCGp5?#^Yr7#pwRw6@$Hd__ZO!D#0r)e<fEppb-1)l0VCV=UDntJogFn*K
zES@(+V^XWT)T~mrw?|p9)7@@lY30&;U+FbVkc1Co#smqUGX<(tYdPN(AyX7Zw%9F4
z3VCIJ!>t*2M=JsvwZtWhO}9s9FxMucMt`p9$?;bGrEPA^{5^@9k<rV$s2Z~b2to1G
zm|z@!k@;}3idg@w_Gwftt4%+WeMFJla3d={-SBrdi2UFgd#;-*_xu(AF^mdVL)LFR
zRDeFCTvR?nY0^*CMwv#1xcm9@=R5cAg*@{#rt<|A5*#%7-XiJ};4c~#-NSY&0Xf-o
z@O%O=SpQkI2L6GbJEThKRF)0uzX6@+3$|nVa!PGmsUbNwGHOenJuuXg=i{cXuvxe6
za-w50<pW_NA|gV!ad?Ib6d7{vtMT@+`DPBS{8(d^Y*+LFKhY*PjTCoTv#$bUc^Knb
zy2E3jtZd<xRN>uyV1<pQ;{QljWEI>`+x^o)m*H;XtkPpL0j~T1EDcxcNO281IPry|
zy1GeLk3<ypQI8@Z3~i0|mJv@}S3I?e1@Ss+ee;>*={?ZY+|wC+K|Ufum>zaO;mhvI
zeG#wY#yfoYC8@)`WM%cmm-fb!*U?sQYIv?$Z5(49L)F#$(3!zVPz4*D_+3Yw#xa`t
z2Un+y+)+zQOBT}2K_c?EI)RN=A*qN0=P`Ot%OKPP30)kH2UL`>ojk|FMJ`W<4``El
zGL;4x2x=S70%C5!$6~0psUrP0t-@fv9#reLBdQ6~h&$Wlbor{y995pI3{p^mnLImp
z9fl4R5IN*oVi|bJLu7Ws_lA}Cd|Xfvph+QFMxY?9?ck7{X6drQay6PeA@@tQiR~^c
zn+bb4AFhdZ9w8~vtuTY$idrwwKAztImBxxKPbzD)Eq4jeVzWX)!GPF&(bA!lutDy{
zhJqs?7O<<o7PR;>WOi8$OyJx1XNpJ_S)E<wjM-D)!2fQneRG<RON)08Sav{Xo5wMk
zrzYAhB})mGm?OW%&23eD0ngn?^*e~2frJ>ov+F2-eev%}-{JtGfh`x1TB%EIA7F(e
zIA=zw72xlfFcx?`k+y@hw1Se-J^{h9gHz0<D(C_JsM7tNQ!A+c8uLxvFC^1`hTZN?
z)#iv{gg^>J8{gL$Hgk#DRZ4FxnWP6!ISI-k1ZoV`KpHhXUYeJa6DB!zmZ1RbOl~(V
z@k`qnk=s7bI!3Oluf-Flh0^W5P+@`k7=)x=hE{)gc+di#u%e${=WB@x3CkMlp65hU
zbmh7{Nl!1ON*<0zYnESg5=Is{Z<1Y!zf9=b#-+D&OHN6*f1EM(l52C+u<R`Ari$mo
zN**>ZE0_I$b1nN4r)p?6Pdj?>V1~hfrT*3dG|akrVKD{O9DHF0LBxaIVa)9;u<28%
z30+Iz$dRs3Rp{(#%Ah!5y(>poP;Iz2SB)6842qAso#}CKB{TtnpVUoE@<u&=Z1IGI
zt*#!gJBT9BR$1r72R8>`Y~yxw=v>Wj*kigLmHtkS-3jzmCl-$eRfQ|&CAfN0uUvX#
zHm@`MSfADk)AtJS1C1tW*G}bKscu~Tsen9cb~G6~Uys&I+a;ZL;i%A0L(q`Dg((RQ
zAtG;-)(F<b0+^}>F{g~GJ4|Q(Y(0CjirBs0d+J^g5wZRSiJ}v7IA|Ery#Ypu!5wsX
zpomm&`w>z^iJ(mGIAW3P_9e$?!R6*-)|50d{F@z!9Y}ERD0+Z&`ccrroKO{`h55p4
z$-y1J>dqS62uJTr!pZ(bTIKf(2^LG7Qps$9=f)Y~zd+i=j<bk}(L$Lol$Qkm%7$9L
zlIQEZkC5?+HI0W*Nr3s65#^!Wk@w`5-yp-Fb*-(hEj}x3g+vz!Q_1!>)F*Gj5`}ga
z%JmIS1d0ZyW`exMzJ7MB<=vyu<%~CR`R@Ujp#?w2>Y82j%@Fe7p#rEEKUP4%03<<t
zjITH}2n{X%8X0tjxhrfSIce?&E^O}SkY#r#cb`gZWSMSrY)y;Rj8q$Sb#*Zl%Y>+D
z%C^ab0OQ(dZrBst8j$DF1|J7KbSm~}d=_ppZlbbvoLR1Gdm<Z5UwTAlBB+`(3at8H
z#+P19+*<60Xn}d$SLOW=@|4e;L|H)VvaXCaa9VY8W$x7l1`hmfj>}a5i!oy&P5l&P
zd&+i}@_v2drPENqjwF(+m$J>13RA3xTzIPeA41XoZf9D>llq}jgI-iMa7MZf8pP0s
z2a}h5k<Gw#2d56I_LVGVU;Q5X7^9_2|89l#vQjp-ksL4}Xx&ws>o~fBCu4d@Ts40i
zKiHu28JffJYH;6Dgm{Z%&!2Lezhuerrp;-I+Hikz_RgC&r3`8PaRXPvq)4}$<1ly$
z5WhVFZbaW!cfaR?>t;8rZ)9eYC|AP-O><CoVd&U45CQH<<&~|3T$5US8#T>O`g(h(
zsi@qjD!PXy4;1K4nB2C_5nqP=A;3X^;>ssais{{T;FNvF;nw^dAtY{ysUId327f_^
zA}J8wAWTc%0>u((rlD0k%k4jg(XyeE0SZpz2AM?NzvS=Vr?JqbD+qw_weY!-;TRuF
zmn25(`O)Q1&*M-N_NEzu5<^MCkB5LEb~yrTer3?!+w$kzK9%o#1!jd_ofq+JE#P3W
zYa=4faXBg404a}R3QIO0&a$DZ^};iEJf&j8@8!@QbLPnNQDKzX-6^&xiI^WtM_e>@
zYD;}jG5M3{wfx!&on-xsARMf$L*Q9Tk$87iwAg$pE^R5&;w@Vy0dQ7+lkz_QY*B-{
zIdFkXhH9s*p?b%fv<ZZH%X}_4UW<<QJVpo3E<+(v4w~f5?wgrHGeaDL62R5m1@M<d
zC8y?C_(+TcAml-RfRkZZHOC0+@0?EWEqy3wLi`q9eaOTu;|{vsMieRyH7&A@VOclR
zQf~dUUGADanF*<9A+s%CG_I#7DX7+3H}8yj(YZ>WZ(HZ3KY7wSHFE%?A*sZuBh~?Q
zf$`sG8vZ{?Ri&FsgT|0Fw<VFY(2=8{12B7D@{R;S%TBlu0tgxgQw>+PqKgOvO%6u|
zSiFiBifh;l!9?7xa9*B3U5is%5(^yT3vzdQe_Pm;)je$Y>z%PBB`e~VIk_;BY9J?n
zIcZ#4e`hbJaFPAdPd0cB%yLCjOgsicH_UU#Kg7}7ZuE^^fpVD}SFaY4jaW#jg5Xy;
zA5^LVSJD{92%Ar}0pTW5ADM75$#|~uD25_<4Xbrfw>}k7LFtOpf&TK(*G>>ec#ASZ
zxUbY9t>U~}4r_f|8uH;Gi&bECt(hnnG!0X<La{(Ak?S$pD+wtsKTd6uQce3)fc&Sp
z!J*GKdEmiO=@tm0>9FysSyY~Y2N;Z+%Pa{tYXZXN{K(8Y&$#dMp{60v3Kt$RV5F(@
zYWh<&E~`~0E}eWQ`lWTzG@GIpG-4_auP4x|(V&Q!iBD#TY4zI6zjnTM@0C`CYYRMw
zvG=e(pev&PK=LMG8+%$`@kOuNif_8BZEg(oPCb{>=%Z^D8#_?4A+{pyK<TGXg{y~k
zB7jn<3k5$TNg@|9q#Y|5c0WG^8KPbLi7l1l=+TGp0w{U^{(|u*r#3D0UfYb`wBw>6
z2-RL!-+08|-@mC}gt=R;ZJe5h-m}!`foo<|o^c^~3;ibuao*-0)%x?!6nyZgX@5nX
zi4G$pBbViAwpYWGYo@>W2R)|(6+@kn>G9=jmj6%sqEmhZ{*bDW`8>CPj$?RKm{H#7
zJwo&3-{Vx!A!rMQeTA~!%h9ej;X{v9^qn&mV6AgoRpPTo>tfru8%_3Q;gExi3VQbM
z{}{1cZi&Q3E9wb)S4msT9kWMV`Pk<AnBMckuSSk+pC_!~^I8glXoZfBf`TX|<oQ%F
zpRq>4{^eC~d+UrfHZae?b_2HIB#)Gg%1xA@;4ngn0(C%LY(>PXyx|+Sod}`_QYDfm
zFp|mYwjkre%JIRfRERc!_{(J>N(anh`mg<R?r6xT@DTe1{0`8b_&ARHBR3FJ!pzkn
z4+3UegRIte#kiqETZhRhu_4MSq?v&(11Z&U%M<T|lX<?DPs@G%0c;=jfzV4J@qxWE
z?JSd1JtHA{`(Fx|Ehf95*z51pSxVJa<8<P;bu*CI36g2yfSf0f9s%=QF|BU?{i$f{
zmc_X(-@h~#2Bfc6Y);ubtiwu!Ir5`BBqewT+ncqObj~x2jLUub%gq1AYvZ#~k)6yg
z8lP`JWS_kJ<1QU`PJ1<dPOH_$jamVjn|AE@oH=HE2fJ0Rg7WG<PXJq+>|&$h;-Xzv
z_hVz@&sQ0_c5!Cb`lp>YZ!Y5=>f+nVQ4LR4j@s4U=9@GnAQUB&%=n8PqqJi9$<4G5
zW<(#5A6e65_nhr=t|M&YP5}~uZ6z>*C8}dwnmeJX9X@oZx8RStPeDOJ=sWoN>|YMG
z*G_o#Fs0@#P_Om4?;=aj9g%aep<-9v*~z#@BX8g6mPDQWH32^;FyG~+kD9O6Y;)}M
zE#2PMHaIjiG{b7##Iotd?6gSZGy`sPP~<aW{x{g`_4L{90M}LL+atmI{N+nU^~z;O
zX%dJf!ugKTs&_lv4HR~(l^avj-gwuUfu|@Py!~whlgteVRPQ7n#5^~t?R&n~FwM%j
z%a^KsyZbeE`Q@l6lgGIo@8;qv^S4?r@r`BazghqwUY98UML{HUvc8H?9SB<e`t`q-
zgd1ZIAtuxBqnSw@)Bbspma4g#nHZ)jsBnlG4Q@RhI+RUDbARNLy@mP2rtd%51}Ji}
z@VjG&g0K(ilA4LF6rJHNEMB07DczDXqFoz9ySndL=i(CEBMfV&<Z=%My;VRScYa6|
zGC)^wI>R<Dc+q76oHMQw^P~>Ntq)xvD<F=moS=!iRi1ztC1(G9o7fT7)=3)Q%zke1
zo@;V@J+GE#b@SGN?l&Tn51X@;kej({?_SILFG()jKb6_q+b@38x=+>m5qgUbTQ6;t
zIU11fLX+JOAdzwC3y4lAvCux=nrD3Dn*Y-`cu|75&<9iYyteLRKDsx~Zg^bjqRRR3
zA-AGxP1id(@16y*UTA5#fK~)W*NX}<$XHx_7C{Q=vydGkM=#%C&w?^Gw*w#%YAU~I
z?{`YMov3xh+{h@TepQV=8%RLymOGxnd`Z+u5+cph6CAX#EU;I*nCk^{p>g_Z_OCM=
z%Ur6SOzz{@<v++>T^T;635`g^r%Bv*Wxhu3hB7yj3hY*1{T8qSC@$b*3`#?I$5=#<
z?Q=UjXz~p78xjK3<~N?Uxw|Y9UFC=9jK`0`70Y9~zP^Y`W`yRaQKLp3?_s+f@nG*x
z3U?<8fg9e}9|XjN8qs9&kekE7F{4+cwgTi7vMRFuPzE<~S2+P3uWFm4B&>GnF{A$a
zbgOq!KmbPA>5m?jf8B&}Bg0w1hSGhNJt;F{*t<S|Y7%55`Qa@s%qV6egvbs&(poY%
z<=hVB5IMFJ8FzA2A2jm#+=?Fx85`o)>|VC?;mb4qTrcl-*E{xCmIkWT<;x>~+GpDv
z;=a9Q%d(9+g#&L@+b;W7W;<@ykEG(3nq%1)Z{1q<<7PyJ{>`l1++J~JKDo9_9@<-W
zyS=E7_JXyR-#(`;9=XJSn0qxVV(HODR!(<J4jR8P@8{3&T27bqs@<%+ZYtcD9jU&1
z*Dh-JY-&~XOJZmhoi%${evVa-06l|g6PEg<_(hNH76hnVEShKnAoV<BB3f2t2EiHG
z(#@g0!Mn0->>5u$Wx`ZU{{q(_;#&#6!N{P}Fx?$fZqR@U3SUSVRICNHQTwIQCn{<j
zS|Zji3q9Suc_Aq&7B^2^kM$q6VD8+?m`aAPrgEnt+>N-|kp0B`>J5G~0ov$*<hWC-
zD|0TxOm$%**Kvn$(Rt0Y1&7OiQdTa1A{Jf1HkLnoIoN-s%cy$o|CN7jO80EiYUtvC
z>FsJ2$uiYWn&$6!)^*YXeZl>9#P8yZXROxLF&QV+wsiL7r@yr|)f>;x{XeSC1FYx$
z{r}dn*Rf|fvXYGKWMmdXM$$AYGg+l&g%BbVhlC_abt-KMNfa`UQmK%H7A-2}|G0C0
z-*f)g^*h(^d!0jlKJWKy-1l?cr%Y+Z?xPYonzo)JBwmDTncW`UD6X*LL0`PM8^oMc
zu;_v*jU||iFV=;H&1%!KS_&mjHOJQ_2W$|W1Kz|JM)Ns;0ljbNZ|}T!uy<-cyL#5f
zHeNmHclWpIexF{qi>g^mxDL66;VdL;7`^#CF!dAt=l;>B&+g@M`?oEfcC$yOp;^ka
zXUEMkAzpIH{rp)AV3~^b4`t<oODh55dxd9Ob(a>1TSclanXNnar%e@MOH9~1sr;mc
zrt9TZS_ldN|J^<N=}yI;pdK|oe`DI27D;c?JDcpZ511X(-#zhajOFrFqj$wRI7Gnh
z2+1=&T41`|!QP%?Z{eQ$l(g;>jLoT2(a_wkOywzlxs)90SNyz%;r5?NL&jwpmNqna
zmT#pVtlQ>Oi<-!dBaa?EI_LDT-7x;aR43>ozcZKVAQTeW`U|T3{T3<FoHOtcaj;mp
zmRw&tg`)>E;A_G(=bWDqnHkA#CN^fd6CifkJ3{Jum+w~8MAHrjg2jd8;JpmJ<H7};
z+*Ps+rGhFqCsfUBZd;y_Ilw(a4)FYI=X|`q_knAdU}z0uKpWR<UBD12hIT)`iAW##
ziE7Ii76AT=Sm=XzA{KV_a>&W1;z*+C14HpxQfs+&E6$NqOjwgno@Kn0)?Q!@jM@k#
z`y>VH9NNOka}#Qwl_0r!1b2qE4xDcM%p<ulVapC|0pdb8w9ee;>^wVqTsA6*>3KKR
zco+s_cB)n^Mx;RZ#9PGwT3nHA_rJBixBK22m*1*;Egw_eYh}f6A>8ZvW<G`cOpRyH
zo-Hzk5hH>#hl7U5%3|gVSQQ#WB<nn{{wI75XSgV8CRP&w=$*O3Hxmv+E*+HDJto?p
zhK(1WTI>2p039VeS_T*rYUTRyxd#Uip~-zmjBur<mZ2)r3}Z!7GklZ$;K3V=bRT$T
zLg)&J!lQX(-^Lg0*mU<QR_*J}Nke*89Rrev=Y?$k92Ecil#d%4JMM<x*O)q0<Ol40
z5z~s2gdZ;%NW>7J&0B>79PPn=5PSnG<G3OfRfH>IIFBl{2cH(oRccuj!S2R8Xn|()
zeMgQNBZm{#0r~<C^RVbZMa`C7M@*}{?jIO90Q3{o#o*=A9R^2>6Ly<a?>{v=sLAUc
z_CH(1HdI-h_%dr<@ZgGMLuQk@yX(4p?^Hue%k8v!OaNUG901)sc0f;5k46=hDyu@5
zVF+2GFktlPic1L@>%(TBFTEQX_4<OQ^^@d%v6I}Z6(WodnwWiaK71InxdKy>T)~L9
zAU&&#e6s&;DX1p_RCExG)I;pR&x3q#LI@5>s%TsxhYIJnVv@@qm<n<eMEyx8*Plv#
z1;+oqd-uNi`0>EA)ruI;p?*TP<u9QlVc((ySv<~#OZy)>TGT+8;v{N~l&*WgYj`hM
z4-z;NgydG{GSdYkNumqQ8_?_v8CWD;NdTc9VngW+<T9j2INWQJ0@3T}l0~y6*n^gO
zFG4&%ez(>I&4;1*zyg6x`^wOvvs6xkMMDg5BciLLto%;?GaOFuv13p3rEcWcyzaa$
zbqg`$F!R#G(|Bxoyk&<!$M43fSkJ)4tfWdtLojNzud?!-f4i@UkwNpfPVB4c^Mb)T
z)_VPNs`f}CnudnJpaSeuykdDAZgRUGM;NP}87Xae5KJ%<p#rC)7~g25h<3_WQGN9A
zp{Aaw5jt?++(z(%wu3$i3>`z(=6wHfQ9_c{OnO_bBPSZBJ;jkDJ9s^N0D>VR7x`;u
zA;V$x*(KI#U8m{(fs_&MnCz4|PDBakTeEt#31t<i1R}7|&`^j6iJK7P8yph4`qk0F
z=cy#66k$Kmg-By2nF+ireh+pfY9kZC5~5c3Fhrd*kc`^Giaf=p4*QUtCnPkX6`__F
z>zjaBT>9^5mx1j!ll}_RCB!wYrHB!L?YHTbqz4Q&KjHgwEvqq^-7N7uBE>{3A<#a(
z0LM5JB$yu>KDC~%x6bd~x@qB?5V7I<(UeKufL6E0_xtO}jJ~4F1F1rV@b2^H^nkzp
zl#|F;Hw?D^y3gCRaQDdS<ceTd_Y;E#CXE2V^T@iQ<C|XYk+L>a?qA)^7EAn>sFFED
zJW>;4CgJZXk$zl?=@Z>;3kgV^7q=^We<DK0>RAqxc8C7CX?<~vriDq(jpwlET(TST
zL}SupqkImo`};hEL;}KvCm`5ELFly5@PJ8j{r(jfgsEcQym{iJ1o4Yjeo^@T{XG^w
zM05@b30@>jhyYpgC8_2PYyDF5!zVIOK*n~oR!??n+4Zgn30PU8;ir8Q02)f0MwL<<
zZ8rY{<+VI!uJZlJh$~*5t69o@&lb6Y-;v@8lhi|YiCwN0#UE>p&6rEXwkc7-oXlO$
zK2BL)dce!8FKhI2XsFw_?46!eG3bRJqf92jce=L@5I-p>tZ1N;%CzJi_xrj`c-}WP
zHPctkQ=eakUVTATjMb<->Gxq;6VK`Ur&ts(A7Y?}f}pj-!{`(Lwt|*Zo6|f1*dd_E
zQb;uIKGA+Gm%@9*eDsz<xHdVU8N<N5mLpIjD6ODPO(>tnjvYIH*)lEO5zFmTK!CWD
z(T>cXGv{!-9_ol7xU?mm&7NDA!bx&KITl>*n3#}rh&n>Ja?U?Vl*C{VPc;lZ2WiD$
zV;;pYvqxP(sS-)kvR&1h&nCIj+<GgZMWhsfIY|jL4lH}MX8n3L)Lz=}KhK^O#T(!L
z)BLp~Fm=$|op5)b$uw^vV8AAJ?%dh0uOCA0U7SK9mgc;1+c$1X!~&WF+A;v>48g(a
zf|j$aaZtMR?AiaKuViKRgF8cGVG)_zNp!Adf8LyLZEjZ42>A(E{R1}e>NRVYgQ5c8
z1)<hOzJ@mqwS%>-ag!!ZBtlhky;<f+m(9p-*1hq-B6L|A>Yt%Ezw8M%sg+oB9barC
zOGccd{bX@QueRBMa*yv)|Kwqv>SXFBo@T#c>3TndS|p3*?$ZVakVAo-^YB<h|IeGC
z_McthS#U`G<994HZou_`m2zB(iaT&%z11k&@n}GK^P@OC(@l5PoK{W(QdH`y?vZJv
zr{}xsDnlCJB!-SVT6C^gb!0(q`+PpnkRb?iBVObb*5puTQ2B8k8CZ4NW^5T+f79D{
z^wpT7g+*NsHAGtM7_ID?cKyQ@H38M}aUf(c;jD*-GbkGcEB%fFjk!bn;jJVj5*#ji
z%lNTl@pnF8V%J@cBeg{xcb68I^3-c9!ehUR&f%9h0o8=o&aLpAn6ITerrayK5g5SE
zVTJUiXoJ=obaq^nqoO8`YWjgT6Mz2L0&poQ?o8q&avW(Ir3gH3Jwh31d?@Pa-L0eE
zCa~=pTsnNky{5N*l+2do9U2#=Iq03jktV(zQb^HYoS9&=*~c^pQx07MR~h*<nV<T%
zWoH7Z9XxWR@W$OkK0X7`;n9XakKOsx+2Ry;u#_rn7QoKepR8wDtB3bWwm~<@BvW1W
zyt_x{{_nbb+ql10u7NR#lEkq8BaJ4L5ITTdW4%*fzkI>KxLf#3uAG&j%f`|AgFxV?
zEXDFLUr+DhFU4mcej^O$z?xyigP?Ik@vdrDu|9VbdCfKeiQEm#Go!W<;<M<XZBQxM
zH@qcboheO0TK`^I+O}&~U*+!`Uq>9L9VC{)U-{-M>!LPv&}=0c7>4jjt)PN*Eg5*p
znMj#`hXW8<=GD#7ycHe-OvwTMRSB1A`?0Z0S^0JamCZDa@dir2#^ze!HDfqNP?G8I
zP&J6Cmo?g<Qzuv4`uq#&$N2T;a1>(3VM-2dC{=1f>76-8X4E;Q#6U(R{2j>~60hSr
zD@K!>K;dp>lT9p{yGY!z;8WObXK~ta!(&5=c)t?BL+a488*fSM9#;xWv|e84?+h#8
zf*y$E7n_x6ycGyOR!L5M`t*YRDFl%Hu3ppeB`ZibaoyQD{Y3zd;eNcj3mqo+wMUj&
za>?OChXi3|dY8-@c0QVv)uKIxP=w(wuw!w6Nk9_nyHz7(0tHDW9AfFFruj?-!p@Sw
zB+Xr=Q$8R#Yu~o*(_9C7Dd~+zyy3_qDQNNivD7;KqRUrAZP~%t1g;;ECnBQc4^pXp
zdK&xk363M42k3+tA7Y7#?lN-qe5ahMccb3yQ#aZ;nw2-{VGirMPJKu{2Izz~O|sK}
zh4CVKqpC7oFgk`COPmvOJJB&y>h*Ir2u#I4OUM7y)Z+yrZYKrX1JuhpdU}*w&HVlS
zW$*v^A-2trHexIwUHd-&Sf%B}wxjN$av*YHI>VNE*)X^0Y0xPw8l6)A$ESHa--&&5
zdUD>IH@B%^rJ<(lMm#tuEw!kq2vRiMPx*OQI2{T1{^D%><5JQ}z_abV+9+wuHs<k9
z?8lr}t#~ZRVTxT7wk2$FEb`p)_I|(rX=*GT+nPp4nT4bsGmXsn@Bru#oZ!4nC)R8y
zh{vhY2At_as)<p63v4ROgmgRXq|~|mKj`RQ*igx%cI~-{Q8h2ua!@Y))ay!Wek8if
zrmb4#mVJ7`G4PJ!-P6-J`tToaTJfw4QP6W^FhJZ1_>TF?Cv|KE9s{4`Y~A{*nfmId
zz0rDfJa7AXkm-V3gLz`YT+`Mt-GMP0pDhoQ%{O91JCrv<c!Rt}?^6YX;6iKGs#QO>
zKDjh14F|!Dy=D)vvY9xf$$7`C!3+jrExV5&z_|tKnLxS^$kV5wM+>KX9Ly@>d2=BX
z3_ID-uqSm6zAdyXP3vJkrL?_p>(=?IDqHFTcr4M3bJ4*)z?mj*`~A@(-$jy9$>}6T
zqwEj-fedn-;`4$rlXq1@PW*ymMZ+Z=t+aDAH1y^+WzS~H5xV(n3}rBV3^K48oRY4J
z!@>6CVdQqPJD7oHg9uWw5|Op_@y2UkG7UHn2Ov%&n6^zD-3DHk6GMhkl+$kFln~V>
zM@aFPDv$+^6p<rGons0{o#}vy9_vS!!p>nX{98Mu4OVSx3(*CDsF7CrSg4w!O+9<`
zkO^3#!5&BNAcYR`fJ}};+PhaJDLOr5&Np%eSa)+RECO`2?78X<k7iFa=Gy;2Tli_~
zYw!>$j6hi40+U33th8Ob0|b)FqtB>OiwBubfc0;!XHI+1@{{Zgu3N5tc}9ta72p0+
zP9lMEYkp{Q=rL2hO6!ous->l+JD=AJ0r-xO_cpBFy$&**9pb+DP^C^m#|Gn*DvDd+
z)J06KuR1BZy8PqpsGP|1d*@n=>GU7W;ef-PM57f&X8ko?NAv;*WyKmT35(A~!hkwH
zYU-p5MYsxa@6pa6YMuv@V>!|JsmZVl&sL}R%s>}RJwfgQPj~Bh!|3M=5C?(roI$Mz
zZv{&mW0d{rZZx+d21ZQ4t{*;_ncjKESiNCgqLejr5t!pgP|2_KU8KZW$UK@`?>F%?
zP${)<-+n5;j@T!?Q=2R;Z!-?GNfW1}WtYp#%2qu(J)AfhnZgjiVv65OtFbfZ#a!8A
z8G5Vg^I(=EZQ)8P6T`+F^xawa$Q2gq>(6F*RlCatpFL}=U#}kx>W>~EaT1-b02MAS
z33z!ac|j1lJ6i&zXl>M^;Gj7p>M%wn_)O@sA-G7SB<0<gV8CuL(6rT1Xq+!ZW?RC}
za*IxdQNfqh6^)6O-gGKZfHzk~KIj}&ci^D!2{Cy`A&&>1R$6q(U7^pSw<LS0xg*7N
zB^j2<df&@l%oA-ZziMG{DH^4*<hlMN+7}uYaz-Y+gjQR;Ad(EiSws@xK;ouA>3jys
z_dnR>+O<HnIn+?`xDS(VovB!#!mt>Txr*W$eC0NX+)VmB3OmfDOg2HtEX%Y`ifR_J
zKwm!z70<IG`-;bt<fB%;@T{Ho3$1-4Gw$eCCQ(Ic-M{4M*pQ<_D=e7?wi2_SOc!PA
zViec`Ph)(eN1!3!pC)%DbAvfr@p0o=7Ezi-W+F}4g;fupoE=l_nwNjDGN_3v<A?6R
zZHpU`)XU=~3P3+}naA-HNSevIb%Oy^kPOWpG;>`WqqM;K@CV$E{r>o)L0wqur?C7B
zew#v?sOEqEY*Dpt3)xER`uuzPH;g{hc`k>JTRRJEV(hH4&wpgOxhgg@H_uD@?AT9J
z$H!rH0UQAO75AD~%lyOitun1UlOlcgtiHQLwu-x#SI+c2&9=5$bQ>6vb?sxpsLn7k
zNty+V*cvPTG9477)adz9!Y*iVnsb$+U{HNcyJOE4uKxJClP*mqwlDY_p#txzeYrQ;
z^{`%Dj@-9O&{OV2jv)fZDbYtdIk+wJ@6VwIR3?c7us%GHN9V@1@72rGIV)A6z?R_z
zpqHzVA-*Sk2V8;DS9%-nDi#bAcYOW)a`+`OVFJ^MQQ!b7Nr_An&MY&su<I#Wk_I~~
zlZ*jw#LPsnrVMhCE@Ry~xvrQoE=m9Vi%9NO?y==gd8>yGh(H}fUsZ{%CYhfvImq-U
za4A$XGg&M0V}$?=R<{1waecCx1F{Mm=6q0=zVooodP`4E1Ko*&ox4YLWdjDt$ERuH
ztp7kiv(4#w_Lo@K7z99s)YD0JV28?2psHz$NHqS%i}n=s92MXVf`PCR`|VQW-rj*U
zP1Q8<q_**3y;E*)qUc3gefHcmU{}NGI3;lL>|=Z!UvX{e7#q7VhyYCPcE=5?KRu~D
zu6Pif(`ftK-)ev4O+py;K*B}&<z0ubkSzm$CM6>6!U1&TfSKaHDI6Wl{qTFcb|O86
zz7?rAFm<1#5p>zB(r>JBulZqn_i+zWqNcPBn42B@{GSN(MsI`5_KjHKaL~uc+@)e0
z|2D#a<jXv0u$Xt%9mxg4ISObFiK;UqfXBmrOPqA#VWV?xI#3$|MRYqdqjDp?X>M$E
z;$Qnqv-@HygYRKbYe|Ts<J9PC>Rd)dPc}2_&mx<oH#DNiq|Gu97Q|T3H5C8fj8hyI
zD|O_smum-5buV*UMV-huLErWP-AT`*-%3g-xAw?*2L6Vs#p?*|WdE0IZD`kN(IkBt
zG)Zi*+-~?&sPCWxNgI;0pBm<ij6ROL-`?(L{3J$1)Dx<NUS2e_ftf4!IXRVbv@t5J
zT`kkpo}t5Cb4=6gnIwyGorsi%21k^_GV_2`C5A_0T3O;p0l5mLhd@liFeKUB_!iZJ
zl_=;KFrh4825{q&$$YyV90s-uGDM70WRwo{KBk7{x0CbqGtL6z5CbCMlo%m#tE?aW
zB0c?(qNeB%B`T`3(e{f;1$hBm^M1U_ZpgKsXlH^04@>^CaWh;MasGE1W6^_!y47jK
zH#BpiH4|8b8%T6y<TxCrJK{2o>GtSmnyTv1MS6NR3pb9H_ucJZzV&V^P5&i1A3p2{
ziQzHo-kglL-ZC^eBqR;an01BT3=P5@8Zh^SjF5{lH*d;7klOGI7f2SKftglZJ}gqL
zzn(MpuTszN<t2)Xl60g#m8J2cZ@qE}@|^9zBwQ^J%oS)$IwhRV-YvtFlg5r8Z}s5y
zO#}8ZY^`^J**uGB#}zdn-a4Upkf3;jO{33qq4OO_TEE`Z1Ubk6jJg=T8D+|iO5hxR
zmKU?HUH$w2v;Z$ozy^R2i1$$COxJ+ad>cGOlzS5CFJUD6_cvylaHhc^ibMu9Q^1+1
zN3X-b<%$gG3C@7IsMBa&z}>i0Nsj@hJ`d)~_mZ9#EaY>Eee2e(rR}80D8bqYT*9p|
zH(Ezx|AfMjH;55y_7ksV{$`--;sB)9q!*Nfoc+Oi^;W*ddrwMoCIyBNmo8v`{X%xp
z^n(`;WU4SicM#d{1WV4rbjsO|+QMXHHz`r*Be?%0+o%poDLQ{%QXW*3CrrgPMnXOq
zzP^p|r>*0;d*J)tA-Mn?xE@y%y3D`CZY$t(Zj3lw5ts*luH4r7_eVcG@etM3YY`g%
zT^*2;2PWZooKRg5LTN^Z@3V}l?Hax9k0VfO1u9R*HMgSYAe>YDDc_QTjR8boGgZP=
zBmE~KPpwg}Q-cs>ar@b`1<kaLzR&a4DPL-J>)|TW4~{uDY2$kO^sHa@EjoPR*F~^}
zqKE3Rr`CBR6*sNvR1cC%J;vo$UtlYq`>vm+cekZX{jNiIik6+bp0kfzR@{msyDjj0
zi4ErWwrwrry*V~UfkdE(UWm2<?Dkjh$R~8W<?ar6&L{YQ1Gp#e<yX2ZirKj2<UqrN
zFQDyEZkQKai~L4U?@YT#R>(QBvtwTb2fO1hzkV_v*4XR%^}_(@6oedws4Qqp1Fjwp
zx?}S3i&4qfuWvv3cwUK&lrn@4m@kDq)?mrFZ?if40e+;8#a$k1XWX@AW4f!M-3l0y
zOk@1@Ck|8j*%Y%qdiF#h|4G#}W%{raep%lo$=O{F+64=CD<`3!v#S1f^ndW{-B$1J
z#?M`_xx#+GD5Grcx)7nLSl~T`iGaxYn7`#l?<D&k<d;61<`Lu0j<6=fLYHmu@yrm3
z9p}DWm_WOp*N2)?3c=hG7qA&J(mQq)DT-JoWRxJ!6$s%%SqP7Hy6ffE4xOvpk(!d6
zY@VCwrWG64>s#Q>ZWWt@VLvj5CI0kyk(pWZXLaWcRZUQk=rybIRL#|rn%`W4nh|Vm
zTeZzl=Kb8Gg&kVG-8uf!5BtcD3Q_BJ1mrr<dC;q*PS{_hID*XZk4d}~Tf3&IF@#-;
zh?L8!E>eg>H#}hWzgc9jO$to?3Y=$<DW;LUZtdE49PVVRNv;6c$^pO&k~Xg;S0+rV
z{T+NhI=XtgouO3|yLzQ?7?M19>eLLD4Q&hUhb&DpyduE>xV8yl3Nf_w2?%KIza+!F
zVMnN`#f#&GCy+b{lx<9gcTjoLx2z}sMNhBUxi-q(TJASecvfBq$9+r!xbSI|6>#M_
z2cWk%!>2`a1O|BU@Zk>s3&NDN(7m@r)`Kq6QL@a@kbl9ZZoIH}dy$-Esz%5#LYOq!
zlmIz05s$C9VDVy^c`D5kxfQ%}RqS1KQlie<x?GEG7-@2aZFu=|BlZ}TGy1VSsMsa8
zS}J<Mm&2U~mJLTI_4nV?`2K7zL;@WrKYZUY;D{`>r@E2R4Cx$$$l=oS1-<@FXa)Az
zX~~$BvR0hCqGG#n;et_5r)R4#Gk`6|D6s6elcr}l$dO=DW4?n#28<(9Y&v7!D7Ymx
z&NQ(D3S13NK2BgDB*PNB4{L4bRdmoh<%<Oqszc3Hk-XP&LN`8KwE~u2rtX4mlvNDd
zH<4LPlJhRX90)G(VE5Wp>M+)4?%9+iujtx`!OB(B`TTjmkx(D&U)}q^oobDX%GU&U
z5;`p<n4<E1dl|{W8e!?tt#dPqPy@D)y9A9tXGedKW{7{LpiGI~QG?*_R>AW>D;_<%
zMe-iHj>_*jF0e#gSGdC2u5xc6HY=}ayKmyGQ>GG#!k^e(8Z*Gr(J?aTsM=}Z-=Qu=
zRs9o*rtIA`RpURvZ9iMd3;wZHQS<Y{?wb5v3Hd;YE!t{i416H`RiIYX2YBs)Z(R9g
ztT89_=3sbSd*S&`HscTjc_s>qLjNlwdF~awW?~{HKLidUBO|VT)j&;GDSeR$cOhnp
z4iyfFx?d*rggaShMyENR;RpaTE3X*F0=1n6|16WOxc^Z)SXXZg`p<cEY3PtklH<Bw
z+990)huvxGKTyUCHwC8jt0gz1v=VvH?zx<y9M94H+NQaWP^DG`V-i6<-RH<|k2^M5
zz3(h41j^mtyLWf4Ys?RQn>*&Ojep5!Lg^vtmLt2x@1OcM7@z^f@St+iFXf=uYqJq&
zuKN3X`})%5H^CPw8n*anMnCb&4czC@O04I2gz^;BV*mH)>vD$;yl#aZSWUzu17&EJ
zMbHjPNND?RAD=u^_oFfG9Q+?$H#AP_(lMY7FnZ{+%#65*i?1BdjM;Jl&3y+=S2~`E
zK90l?;+kF!r4kw0m8`~>YkwoBPsn$=0&1|;BmYp|G>*1vJ7EY2!)d=xNZ43n;}noQ
znW};<6==;IvJ48s*L|kFLyKn3*n}6uo%(E@V!u{`=lQ?8t!AxS)BNsZ35<fvA`u2k
zS_uFs@>=q4ur+*FQ)kR*445N_1EABl+XwcY|A3g5R{qE`|N8(w2@Q`pjFPb7Y;ISw
ztMyK8yteKoe|eQ&|K7dp?ZPYFUFr0ikTk@El655%I9_#1e_AY_z33kaV27Yb_aNzI
zoa-cRqdP3G_)>iXFO=-x_!Te-*E5T?0X49Z3PFa70ngsEB(IoSHYudqr!0m2Pk;O0
zMx~6^nBjFCx%BJ`_28jJY7}%t9U^wX!<CSrYiKx^rjR3B*MXFRVeqrlXDn=k^q(*k
zNN945BrTCk8!`>VhdEfZ8^UL^OWZ@>oIQE+9+s2C6#it;QEVA6Ub2AcyNGgD3K6$|
zf4dGHIPg*jd1$8w^vIkyMK<ZjL+abjL(=H=#=I<#Kl10nW_h#!?A)=V1iA@v0bzjA
z=IWwrR$ZUa2RwXU_VueUS|T?UjZsli*4cR%=WVgJHb+m*X+t$&XE%1wQ@?-RXBKTI
zwch?J$);ImAe0==Kal)W&I&#{zQ+_bo2_o#qzS2jx5awQVPw1Bo?BZq-Q3(p<>SOj
zR5!h3$moiZ1e<dsq}{Yp70oDsB+q=AzCNY(?Mjnl)TXev1RzPYD>|d;OP9Wy?)WF@
z0xbcECv|YtmEkEbU&<&7S`BW0U55Zq*P2poZ=KBfX%jh>xGq86siyK@B+r$kbIais
z>vZe}JwN-Y>QY!(Sh&;EzVutXa>=(sultN12*`kyMF}aJlR^x9h?hydLEB9UEb$^F
zJ&DamhEPnK93sM$0prIn`*dcI?f2$m{xh1U`{AZ0wYXs7Rs^9_4pzIVwx>RH7v%yS
zU%<y5^eu}0`v;Q+qhMDs9L1;YW%K#0L7tiJgj+4#_@Y5eqc$xX6V72TqXz-r;_z%C
z@Vdv3)NDth1E<$2y(_#5?lVBZe}*qfE(zV`m3f|7IqF?0bo7|j!orDTHDO^<aJX5-
z<oUaaZi5kznjkttWy7iZ_Xf$s2c40I2Wb2A3-$Ypl-ve(9@tdFUFGJ)(2Z|?4=}E1
zwM-H|r0JwjVCl*$65ep^rax$+6Th-CNQV<o4T3=ckUy}wT4&ib(R5wP$KYGXd_P>(
zA6|hVG-|^!t%&8G7_+IsEY&)2&;Jc$iCFv2<*!oGGRbD}2@5+#n?kh7--RvuK@|w;
z1-|SZhm4w|mLPH61`E-}P(MEfECr?$G(6?&kJC6i`67(rtvXcHnE?jKwc8x|{6VIS
zFj88CZ!(;F)d<nnFh=4Lg^SQMS?=__Kujmth}<TF2)?9_z=PbaTes8dc2+JbYk5n;
znt(+fSNs&UbSbKvI*guR2z&<R6_!yiC8?SEhtgBhkYm)(Ie$h*MJ+1=8^T&K>GS*I
zzh%tjCWhV?Y7i?tKHa*id8uyvtu+Xbe?!MbCkjw>>)IBH`q{EYC{g}or%v}d(V}hc
zbxuuB*TM}Bo6B<*)y~801$JgvEFA_99m*)BQ9u?7yHK`E_UU>15>2!R<9if;{P=(-
zOt?U*dr4@+;<JgmV~PK=cr*%F)HK{(@OR_C2K2kW#80R4-km#duZ47EP%Aky+#f=M
zM`;gk`N}RPEHsp3n%<OuGq`qQuqW&v{{nd2L;KqaGkp>?=kz~OqIUJ!YD-JYh|iC8
zDpIGN%2mlLwuMICxyj?1)4z_s*S*IN4%{AMzQ%0TsxOqrjNx2mgmYG+*Wuiw%ZJl>
z@kq>Yk$7ZT1TB>-fVv1_TB+!@z-J^?gp5ft65>)*!}Sxk_TgfeSuS|=*nWJ_F^ii;
zoF~6mG{vx$1N!!D2xr1fY#F4ENT&}+0mhymMSOI6rwE#L2}Q(CI*3C{%sV#$31Mxq
zA01XmdU`^!S&Y%o6J=I~D!^l$R29@FA5-3h|B4>@AogEZ#UKU*bL{r*)ODi<)e4ym
zUr>@UIp_4Gowf72J)$~=&E@w|C*Mg<F8=e08!jCDX?tnsDN=NMK4NZ&Xt_n#Lmk2K
z5#%}HG|_&Ql$zg3C*yU#m4UeK^?iFWiXgz4HwXeCygw%%wm&HYF>VyOc7F>rdz!pX
za9ja*Q5@(1h{I@)cPOUf5oz$yp+20?T>qw1FR-B~yJ!#JiYy?M%M<OH#vsq);$o5|
z2UDekIx=N<HrfKVx?EKX3P<@{RE9t{l-Qii{1Gzp_dHy67kxkIBR#?R!VN~0;HX~f
z#%s;yDiunU!hF%9%jd|)lr%gT46g5@vZ!coPo5FCC*Fo@8H->l{$XY%44WQ2beYHq
zxcG6sq45^#ugorGVjJ#>?Y6eup40dwO#GEG*?OlWF9vf##I>*G9aH1_-#UBd-|EXU
zj=rW!)AkDykIs#EN(wZQzW@^mP&l4w2YQn)0V@_W)A3A2LDcyNA5e6O@`^fza#Q-|
zkq;nYrS9Vpp__)Vf>Pw<Y}q(&v3eMd3ICoSXF4S*GczWx$cm7xN0E4zaQAaI4o>XN
zPh40vfxQnd{k60-fAgI@4K!fjgQuu@piISuvz^Em%I;5P>c~A&;8Jrcwb&|WH8QLm
zgo5(NrMn@OEvZ{H8QfKi_>o+6f!`~Q6GI;*yHK$#Ffs~IUQ|VV^rufY&#v#|x<Xhm
z?MsaNHgmHpD64<g)(CP+NW8?(N!<Ul!3FIK?p*3V<jvru5ma8Z0}a?m*j{!#Phbp!
zcw*U)=%|t;F0IO-uBqb#4G>Qa-Bo!ccNY>5`HNKgdU|QoT?!ALze_zzV1Tm-LYa48
zPh#Y#lcH|AsH+v4b!^vebLP*Zr`+6*0}&EXfSMNJu7drSJ4pc_FG3?bJIgHax3tkB
z-~}nL9vcAdiXxtn*#yDbaCYEKK^kY2CSB^;(MRC;CCX^RgbC>$=>N~l2zvRtXlP0$
z2?*hwv~{wRpo-mAdU$x8p{|W(GWTgNKpr4ZcS_m;prraI=X{jtLsSkqIXPbErk*mD
z<UY8j-9~|V%=&&qzK97V9)0&YK9u(nM|jSJ1Sc^A!L<=^lkwJWD|)vHwcm57#}t_B
zPD^-VaFq&Y1PHF=yhA-p7ske;$K&slgh89Yf(G$=BhpTW@5HSHN2vSw2=g|9XqT2G
z4GlqZBq@&2EX1hh;(U)Tay+Av{ck3RrDIfD$cSp9x`$Go$S5+D`n-2?%=Z#4&*XX&
zIx-H4qMN1KGG=%Aln)8=nF!$|E1JQzhdgTv%++1}*_;olHY8OM34n|@V~PV&5MvqU
z#!V~6eomyThWE7`9K`p4gVh6X=d)L@4lo&Wj{;PmXsCHGzcaJ()4MEwSlW#>?Lm_W
z4HyQjAftcr<4a}DTy)THZcJb|co!!Ma4C(@(NY456+S(E8UKR6i)?fg&JB_kzdJWJ
z<_4Bl{Z;JFNtg`b%LeAgvpHaECEFPyc3Zbu>o#mK<tO!E=sQRWNRC7<$<2KI_F25f
z%yhZPQAsg|2rw;ZWfSZ*l0(yZNbXB+=wI=kA}dA`02<FnAXr~4m1rFPUf!qXZfZI@
zepFh_#(Ubgbfq~dh=>l666Ug9{!U5ZvO2l1_HCTg7C^H0-Mjl0oaXHa*2S3CT`VHd
zX+XU?i6UB{x*4JHyk%N}gFv&CZH=5c5SJ}mCc6Am=WHdnYx*O)73f(?gfNv5)Z!kw
z1(*qH(Ixi?(f%dq2E3N|L4>{C=4P0gihzgu_Z4rrqx<?|O^Pwv3YkO6{U!4UoK_aB
zgr$-O)o{mo`cVPITMWH<n;{5vf_AXGwRcb?9Tc<O%IRhwY%s+7P>zH<ZJG<bA;H@9
zxS-grs)+dqk9XolN=`6p1MYcd7tERS9*4*(ei%9rxe-cA?$Sd6^KRQViUlgC9P+6z
zEGmj%SF2)maAyNYK~I8%#Z-l@LU;6bm`5a4qCtkQO$_}EsJj%^CRKsfv4EhU<7Avj
zOcJsS7sfi{LD%qK(l9T!1rRy?HNOq-o<z?Q(m55+_VzEJ7)M6bLaodBfO2Fz<Q)^;
zi&KiL`Ipjw38{;@jg}Xbz6a`N=u=3J#KuW%OJRLCqNi`&^}^cFWsybe5na;n*)za&
z<fu_hNaSZFw|aU05_=)s$<pu|pPUJ^G&wB${fKYEZ+TAt-}GCHww-G)K27Z$W9OK*
zzUb7OyP?gSHU+uwKK7^6W@=I@_Y)7N{X)IrYef{ui+Yi31B%K16KY)2zqzBZnb@v+
z1RN`2t$2OJPlf9k4qei%_-3p0%xD+6ft9{7gy0?T*&Kud6!j^)S2-!s!nRIzjAr>D
zs*h4Qe_F(+%Li<yLk7UVf9K9_bQGdT2Z$ma&CAzUbDXx<Gx423E8Fk@q}*ksTOU3d
zJ=hJq!i73IqKy~4hxV)Z#)MMtweLvq;_%i>Hvj68B1uxzgz2D8<nMv@IEliDBV^ms
zh{tp#xF~2jLAE0qmy0kW_EJj?cnr=N93Jo3qjVE80FSu9DCGgK($f>sD&HXpq!;8U
z{pZ^7NHG<|)GF-!u?d~T#N3*ud7ZDe?ltidjSq&N$v)}>3rkDE-<3*Vq@<)Q-vO>7
zaeLHlmOiBo=~W(qj!C#Pnu&rh%aDfvO3E-7{=_g!vHNH?DHvqO{k7b2^K}GRd~9@Z
z^`seE=g=QP#Y|Jv+)<u+i0|*MAB~*EOZjBQ;!L+QX(=i983rVS<0!-?kSOF~R?zh4
z%Atl;yaV+2c-l`jH8b*pzW3g|3pjR1?bmPDYoafX!9Z0Mk<+^0<g_&N>(s6U9j~B?
zm{9)r!c2undrMyqk~nd8vkXl0TjDQM4&_ic%*r!sWtJW?qO_z04F?^MA9=BIu~TO4
z;uRT{A^PdnmmUe&Ac(|O8uYqAPa=6Zt>N%ZMx24VG9eY*9txeyCIqV!Ocayt0yeAL
zET2L5qYBA1&F^Ii>qSD*rFlSN_KN#Jf=ony!eM!nbXm^uVO-|&e$Y7`(ejp;-e%(W
zT@nLA{V10PFIqaj`+)*s$SJ}E09r`|Wq(np)82ss2#72mOkw6JWO$1*13)A#KI$_A
zXVB5f2`$w<-&HK+5UyYNWJ;;W2%3;sQDOH!&@Tw1%Ylu3#oOxTLI$5*3q+tye3D!7
zS=6(XcSHX??hq5?UePf?YMQ{CC6a!HPHml1QdZZ00jRh8O|!eU%CkO>SEX%-4u2<k
zRrRWF;H;=Ad@uKd3>jm4i@%1J!Lac#Hkh>1a0SS9(6u&e+Vtb`P2>ie){jF4N)i$S
z7de#&lahNIjSzd%^jX~Mk-?a11c#;yCg!7;1i~XrqUQld6jQDQ6$7EmmxS|@uAooI
zBL5}a89+I*>sY^>A>T2+Oa>DYEjc+=l8z^WUU+{*GH6SPHuvP-BlqtJ!=$;kB|4aA
zjq_hpE;wO&X0;bLhhv+GCdk;?*kbcQTcOEm1Z)qoCZXu)G19oe;ieFpNQR+Rnrqjt
zU3ti?L}Vh8nmHmPOchKQ%qxhpQfLcwJoNJ(jaA7~!E?%v00nWGR6CXvYbHV3fF&@^
zGwHBpb)iBQV=@!=@(wjHt*9iMrv(ROXAe(+7vx0F08;#M>*z{J{NlmLbu5w;#EeC;
zKX=fdJ0UrxWD~5Bzq1KL3&;Wj0;z=X3I}cfx<XP_NiQB}|9uz?yVSwFl6r8pw#v7&
z-C9iA5;aw~>tKfk)~n2DPetg<ZpWl6-%xkUQ~iS7Kk`S7eeV7%P+xL{(TbsWXhi<8
zcX$wRFIp2+2~;X{crque(Cu+!E{4vX-JQ)m_5GKi%z2)(5{Oz9?LuPzU;K;kh1EM^
z&Zw#qJqW4^L^Am7jsRVLAS|P4+))>odEwvIZm_U0`;`UU3L!Il(gsaKhUpA7#PB_P
zK9*o$^b;cZ66Ax0v;yS_?OB6D$1}7M;;%pvDONAj7fy(x>9vOf!;v?L14|1`4ru2c
zvz{OtbZlPpp%9=wB6vq8o#4)<GuqnJ>>9p;FKT$yj2y}r=T<d(^7Dz5(~|5-5zL#F
z7;XTWh<6je83KpB!D@2_VwgHfV7c=S6=_!8T{wZZQyfZPjfh5q+b}2!T_Hg&MdQ*M
z-WxY=_vB9({!a@qj@m@v3WBL#uXK2AhfLXnv-US4c<DJn0e>cEolTgzIJ^31vZ}sW
z@Cmm9#l4efX;_6#UaFLMbl3VK|J8nr*ZGC7tyz-chx8)dPCq|xt^X|j->0l;(aCi2
z<fdwUnjLEttf5h%9+Lc{_RW~Q{;Q0S9kW0GbM<M}yqn49PpWn}WCeyB1twU8J15TB
zJ-K(=ri~ixp8Qm8M%C<BoJAW~PfDz{EdRWiyR04^!$|{8k+QMl<ObqF%k&r2!rRvP
zf1GuRLW3rNo+>}ZYHRPYRiEo|6<ZSM#1R?E`sR)$@P%l){@(@=TezI-f#vcpH#c6=
zmm9X3Fd0t+4IRP;JRqVA@DI;=2mB;L7fI}5Fyz*&Bdit$?4WQ}Hi)Q8Fa!00ih+mN
z{4xb+rUWKPA%Lz$R4^prbf84((EK-~J{WAjpo-o?W*Y40T#=NmT8FroA}WU=s+Xa_
z(FK0Nj3RT`aN3^ROIko*f_uawg(^g%8R=Qm`=xVM<{Hfr0W-va@zAp7`}TmlGWJaa
z?Kk*^VZ{vQiQ<hNhFMou2^B%E(Ur%K9|NSig=g71qbVXA_BWZ`2e=W~{x~1dp7Fo-
z<koO2J;*t6`t*V5XuIgIgaDZFn~-M(VfjQlpK)X!Dh)1_euD?MV6RH5GySSqLhAB>
zZgXSVQ2k5)!=WY7?TRD}qhgZ=8|?|DdINyqnb-->pI6as$RBF8cH`5mVK@-1VnSxL
zb6jwSL`@-h@E+YIVikql^?4nY$Xes%19(E}O8-hxUS+?5yajJ~Q?gdHyfu7Ypi{oO
zW5D~yg98F^k<v}wG~UEPCK^>Hzp?<{y?ef9pYg_2u(p|adZX<$%#%iq8IzQf!^`Jr
zEB7ldG;u){kt-Te4(Psk8E9NsTvvU4$tlDGnZQ+Q4o@<cvkxuIBg(|5_j)AU;?S>*
z$S=M%);8BLYOC+OY}K=mQE@-m<Hc!4S)F<R;{1UJ5Zq9#+@z|Lh(F{=J1Rmvw{=UM
z)n3(Up_@BNm_b1cYBMr39iDoIwrJ6UO@EtYXzV$N#G`I(Z{5843MvEXK}++JF<T{Y
zrkokG(a0wW@hs@w^VTo(A!;BChoRc0M&&Lw;cpb~og+LgX~pvH>BK<l3pN|0l0oIE
zt=8woZayZk4F^~$<GF$0HO$ODTsUwB6p(ILrbDolWdN<Dl$1^M*YTBb0@Sz?Xhgjt
znel{T^~?U2p7j$UA^up3@Xbq`pXt5c%t<M(banf>EiOBKPoG)TC3rmg4ISDhqQB+1
zds<*J1Y?Vzk{eMPVZNTVlG!rlPTT>JgqqULQCouNTgJV!7%a0s#S<hz`OWQ{)LJN!
zP<S4#RGO<5b{G(mr2P3xZjiVF1X*GVr-C4KH}bV^$c8^Mzv4TUm`XuSF*)ovsDEXV
zlHBDelEi(Ccrw5!CXr-FDS+vrPo+wf4q(x%&~@&w1Iwx!{S8Aj4oLx<N0<_P_Cjf>
zR5Bb&4iJFO0W#r%WIKuZW`GfO%>pGijw+5cIL-zE{cfeFM^<#4Iq8px{sU7}Qmn>j
zq*;XLAn|~sx<hzm#u0DWb@WZb0m(~%)W<ijjo~1${nu0%!7|VnuN$Coqo*gz8tsH8
zZvd2fs9CinTc7#{SMj#!%NKDgV9$a?I_}}|m`YcW4Q@;pcWdOfR?V!OI=(M=nEYFI
ztA_rMZu}bVN&C`ZpT-@|objQ}!qV1@th3;i0NY$_D*$*oKsO^srR3!UPC+R{hyu<G
z#K=O}=m!x7<%)BPMhd`U(UK+4A+TUhX&L}aLCVCcUe+ag!KQ6acy~aW*g!%gRc6;R
z?fo4B935XqX8TW#kAAlL<bZorWD9k5o$i<%iw{Bj*p`<s&YvTXiC?2q+f-R%O3O+q
zL8f;Qx!p9#v%yhi85+mlitCaG^otk`B_&4!Hz4IK#~T5vhY++X3}`gaO^akDN@5=0
zVHRJ~%6x_z@Qn!kuj!^gGdmj`)q_EnUl><XiuSq{Vr0AzWMGjQ(hXJ)WdW`=v?|{7
zGFH#d`0S71kzSkl<D$QY@{oGo>kP5Tc9TgWTHdaJ-c;+HRC1TGv&eQi%!qp5MIu4>
ztSuW5c`3xd)perkh<4--K;sLdd(3a`FPzuiB`u(+Guwg-P9{DQPb1MJLj9q%xNi|2
z?)PZq*~c{sA^N{N-Wj(3-W<AWww-0+cSf^0PTYj(Syom3ww*f7;35c5aw6XZKQH#l
zNZWEH8V!#_XSiMfTvV!)q9Y>SFeC*WgWHP^WNUPf-ntjIO(*Y;W5aMqQQH}B)At6!
zTPt@;+BN+y2s1<b2uhJxE;kh5SZ)bhsXxmOB8YS;338$sUg*?<vY)Hksk}7B^3Pqv
z&b3`}<^L*zVRPE5t41fR4b58A;uZy*02o|`8XGqFuYZ=l<R)%@v*(vJsbF1Yhd>Ys
zIX-2WvWkjC?4$gk6wltEac6Dlb2^p6>tFNPg;ZE!v$e+^9mFFcb(M!-wCF)20BQ;J
zyF`<(oo;P4Cko#o!)M6213CrZdXB2fbn$$Go~gBZHfpcw`rha=BpA6k$4{SrwZ<Eg
zeb>2x8__`?+dB~l1COwt+`6nVl@nW6CLHAt@00$HhJc)?neY%8y;zK8wlS}NK70Il
z!{k#w+!ZC?dLEoMv){{dp7R4Yzf><BUl|#|TID-FykxNvd+ZJ6`}6qGU>PcYzFf^b
zh@!3bRO*<0-V*OXi$QCSc5=PbHmnYkEvsJM0kkR}VI@D;C}HsTqAsgVuf(rd4$35o
zglDVinWy`tj$)ajq_BZVmI52igIjfjA~+o!?^a2K76=<j-!12(T4=*j-Hym)CiINK
z>^lJYUF#OB!uC1G?_m@Ont8CL6YNYJ#;1)EBsmu}bNszg2$^=2dG*@XRd*hbKrG)@
z4rch<e}3)tvrhu1g`Y;+C)Jb&Ou|MgK{9lX1YXU&I3MSW7+wmkKtRtx4*_*aQ{Gca
z`LUU*c!$K_i*Z2iGAiN{8RY_{!+D~v9^-L(4Wt@>K*mhr*MlOulsVo_tDpEK0lVp(
zd|s>_7-4m8-1@j|GEMo_Gp0<rbK!_T6c-7Goc5c!LM62hK*uyF0y+f=|5@;TZl~V5
zkK62pj1~1XMboL)34vU!?0$Q`kU6RDg+<Xc%?#D~;%8hVIr<`RmvdmL@()0K=^2i@
zg9;X8guBpwVTVJ*8GL$8(9*F~zS4+5au8;+b=JGVZdx3q-?P5mO5hzmrZH73aEhDW
z8!fQt#wiY%)FfQ;t_q7jef>vl8B3Dv`@BzKR@WRWix%kW4w!V7s)wk7^uiLa8An_V
zM}t+oU-!LytWtRN=RwzXeBU?I$y|Jn2qv3x80Cw)*{g$pqA1-~iNH+_0ouS0&C?<;
z2LxE$R%ydt=2mF%=JRJx`XWn*NCik(q+QgW>>iyZOH$VKnuA|LVdn|`p37A;CUIxU
z=sHRabR=zTwKWc$p$Vw#1?~*HRM=u_hTfOHUA=V<YFvB+s*eApRaF&2P2-(4+k7;9
z<Fxz%KXCfXfyvk?8cazL1a-eLJH^h<?!=#eCISSBodR<`e4W+luIIS}>X?-yR1-9;
zQTBR71Lw(@87jG=@`(&Q`SNsXo8g2qwf59>{X5{&4OMW(m-$)8#(QXYnmnif!tj^f
zkQ5av4awjK{Na3#199_;kG9+>xw$A^(XZ5ACrJQO5{8lwMKQKD!)cMXj<hgzYeXtv
zLJ@?R{vBgbmmB}RwtSaan(<GEkhIsPCCbFiWMm@@TB`k&t7-unEu+BV5s`Uss=jV2
zu+ue|Bmxouz&Ck$2^f|lHs>~{X`}W*6BFIxF7(jpF&h;+Ip^+Qw5w%*`g5uv)L74k
zCOy(Wv-{8)3MH0D!_-FSiq=>3$RPWXyOr$M`qbc4K$yVxPHk}80+UbYE4Bkp6Opn)
z`B)gX)_*1es~kJL=0u88EPn}caZ`)hxbtKYO#!|ludEA<`|Eb0N&pD|)&u$6orNO;
zvS0%RmgD2nq9%q#aL+R9t{IB3fOZKc!{GHBi(j9zmAoGb-4r2NL4hQvxeqlKkg5)>
zqFsliD5YJp`m4(}gPAM|LZ3t=z%k}}@+9*2zDi2hGP@LmVaUe?tK$jWA=F|9<SnXt
zZYLyyFf#oq(x5~nSYJDEC|w3|c1#|s3-_n&pT+bI|0N>xWYMrBx;}lyXyD(O)WT;t
z!+<efg*YgvNQk$N=XnU**|?u3LH6Hsdi{I=Mn8`WgWy>XWK{FG(qIzz2x7-6!2xEv
z{4ceWTK%Xp<n)2jq~2XHu!BTefEx7CbX7h%_WBmF1&O|x;2G*KSC?jfFGaoqqfB8|
zLa})Q8lIZLM|*l}B<Da+4?&v<B4pT+|I|msXL~&x`zEOU>mD!xKW?IULu~!Ayh>$2
zM0v-AaYcUxZ-a(@9-O{g+vkL*PpigL)w;h9yZZ98LeEA$@<O$|KbK6*3TtCN?}q=o
zf|l&ax~RINzrZM1vWA@`KM<#XlYnMhq(AK2*DIr#F8o2U7XO*;ZxwY8fs0G*Lw{}E
zgku-%gnJ?FW*tQrc$dbb4*%;8GW#}qRKcApw$n1(npQ!h7&tFgIsTok-yd0o<HG@}
zyXfsLR}Az$@(6+TaU<NyoPA#7q6r}bAWb^+=ijFA0Ki)_!0gXC{nJ*YK-IHyEzD<z
z-?)KY@XW-*g-A0Q9avC?Bh{HS-9XnRAVr)hVq&7#mPwWTA`DycD~=Co((|F~`9|A`
z9ajm@NNJvm;4>+=$Gg!6)rIL6$-}#6)p=pYG11j!=v!nKo9}vEyQZbxV~|^-)yv6d
zAvTi}vdi!KaXJR4KR`#XKRhiy{x_Ugl&)+FsulPz+YYKEKXBD;95YdUQ_c6WuB)g&
zN|y5Twq<zDb5#VGwtJm2-vnlj5Vnhpl++d#wX$yfhhS}7tt7uquk*n~b-X}h>eVe6
z+E0vT9FYwF-N${zNexYSz__-=J$Ty(jK1mqsTSb^C4xrG(bjH_<eZDp3AIY~uS5dH
zj57>+Iw7O~cwAptalrX;eaS)bixB>ZMuGSmPH36SE$EWSfd>ux)K+&R>HE^~0xihI
zU%*Sq@Si`De{!avyL!KFH_QS47dUSZcw_E_?e7i_kl8u-c+MGgJn89~4Av;=M3d24
z3mYiRCGWVzs^`n&Uv_p3byfVzUOP=IoOIA8p^I_k=6$+GHv5P=2cWS58>0~lozNcw
zPHxqom7YKQ;H{HNK}Hz7*x`)8L<(?{^Y})hJ(KyTd`#F_LYZ&NI)$KXJTlC(P@#hM
z_zS#=jBA1+vH9ffE|FPGJm^1eT%>XSwl`yIseC2TNz4M2iU86tUcHjElI;u|5(^n^
zhMZR1*ggplHRuHZ<%glzLYOwo%<PK6km~2Nht_{^VkOs!!1oxxKpZ9VBzZ@L>Oaza
z77U*c4R$<be<i$0B@k03hhqBT9pmTwE~5OQ8O0OEtDY66*z@7&kk(ud<Pt@C_gvIN
z<CMllqLd!0_id7YUbLBK=9DN-@zzPpd4D17t{2o?SV{QMm6N)}F5KAY?=ZLXOZ>LY
zS8^ZH$$7#WFp0zsU2HYHbsoO-Go1V9y6(l6(_cNRXoJ?@-{b4#-~U^{N4T}`JEon7
zwso9w#E?!c2*!?!on5B~AiIDV75L?Q5Lp5zAm8!0-?3{~P1tB~&kMhhQc1gr%wECd
zRNx{GehE~Tt9!010*&Eoo-5+0w~LvCO2Tm2iaICW2A6~kj7hKlQE<}eAXDV@MHlAx
zQmI8uYKude&z2l0I2@)vE?PfeFyhYKY}Tm=9`UHB6k22PS56594Rbibnad?ge;66;
z&Bn<{tIPyaQVz`E<^qpaamf$U&@iVELiwa@CMJe{PtuD#<AP;gPUen0uI<AcGlB?U
zio0@X_`rd)!lJsbX?fXP21>!w13IQPkBrU_%P|PAL#@t<FoQINTOOxB=j6mOY>!4M
z2bep4Z7A_lyRGuU96^8#i^iRX0|ct}v+DQ;$sH&WSmrKBF$5gEq7oz)1OTRM+x-~-
z&K9X_^x?sT-zP=b!o_g2zTjIJmjdem43RMquAj1M#8>_fF(wD8bva(@{FEyeD!EDC
zAd<m9$)6Bi2F`))EbfehXO0$*z5fNy{4KlUz0ahYHBQ;mk&Da?e!^>!l9;G}7XPe?
zwaOWd;ZwLs5(qf1ab<{TMua2uRq6i8&jytIBRpAc{iw34Dj1H7<LsY^cW&9gj92DI
zG!gQQm$8G9z5m1?N=8~>Ohq@@(9!<d#mxYBD2&8<E$-Fmb@22)_QizhU7%J+|0(1_
z?y9~`>hdJy#`E*(aY7>?Fx|RffI)QlkM{#6Ek{>flE)E1Wg0^0DO+qNJ?HC?8-5#_
zkm0YO@Ps~DQUM`{Hs#z8ZLd1>&&TUQ94!5RuEdkYoa2$*?9NRa*@1N>nl#V^{ws3@
z0KEdUS9~b%#JTmb{og%KytZB2{#i=Zl8f`p^1FVS@$BsObnOo(6n%}uay&Mq1}Shq
zpvo1#zWm!%<Y=#F_9*inQNDy7*Lv>W(}R9t=akJe8f|Fb)A1l>JFy$Nyvc&HE&DWf
z1G^CR1Q-q47d_XA9Rf3Om!&t35q=`e9WzF^lWNaSA^62@O5V5bbR!i}P#hW<3Q42D
zZx8$2whEo3-p`aX-3DY<>_AhXIA{gX?zD#%qsS-i3oR(W8f!i6VE+E4^}e}<-6gGs
zIg^1&)3nV=cuQ<%%OB!T17ZE0nz|)7u_HJ@m7-EO8D!@;{onlgY62*(N_)+tL4`-d
z_H5iZ#5Y4_%+s%HGUCEEEcSnsawXPHFiFn#H|bx8F^0{^A$fs~7)7k3Hq0E=?T2x>
zIVrv^L<nhFcr&v2R&~^k;3d9`UT;e^6fZE%IEm#0zKtqU3Sc&Gg2#PtKJ|G=?f?!I
zBO;Kgh$V>&3iwU>UZ-9)A7>iY8gT@Ow-y6aAaV_{Ruko;re7A8cOAYA-A3`EU13X=
zgC9pS7JxM8k2l&uQ*GhEFJUuuk2qyNrrVL8_vUE2BAt8-hE%+0g07OA<A^Y#8U^L2
z3O%K)cb8%t!-*`xx9DdJxN{MVlqFV#s!=0gjh^gtqBmPshLVzsA`%SwI7OKmUZ>MK
zQ%vEuEXg+P*`D)&QNZBX^kSzxHOEiAIR77T2jUMzKQ!cW&x0$jcqJyic(HJzwJ5E=
ze2M5+ypw87b~{Lo45a4L!n&aCIiO8SYAVsPyD9v5??CW2cim=q-Qgl|xEt*^%SyRn
zYJh6PPYWi#Vq>>#Jk|Y#LytU1uS3E9M~Wv~&arn~ccQPPBDUH2J+6MBwIY3uO_Huo
zn=$`cz%8eyKQUAIG<Moq6o;gXHF2onYe2^Vhz?(!Phs+ZSdX|V2>b&w20x<&W|aCY
zusV49p0;6zOcm>c3KG_)rPg}&n5W%aHkLS1I(aZJbj16luAy{-5ypu&m0L`;7~lcj
zY>##;uAmWbzlnziU`jwxu+dIqpBJICXSpEq;xG`WB)tHjCO|;N;4mo_!5-`3qkyX~
z*sSNYZ=<zle3<$eyNu*5!|o^a`4DF49%r@Ga-C{hQou5`xyN?B&IlaCjUm{S>GWf9
z!6`vU!oJnxKdxNqwLXs2?@GG*)WQl*Mp9nTZA}Z?P&nsNk68v|8aQ28gi&7OqDV-2
zJywx7`QOZr#TF}~Pha0YAoA2g)=OFS3=g=W;`gL`_wM767g=sNOtg2@>i}?N3aR}Q
z>VLHD^Wt~?ty{N-ugq_IIk5BRsy*a)of$#Jdym;JL!28xo=y1iC5~BLz8(=7#;~HK
zf9bn>f<AzFKM^kogb8SK!z?p>RwgbZ;gk9Ml$V`pZA9;GQe1e|sOa(dzJL6&Yh8j7
zDD2Z)t?t5lsQHyB4IL`+o~Jr!(Gwu~YtE76gIpQ|SuoFEVZf_Urm_jPRGYW6)(QAn
z?hFRpa?ssl%qrho#rW174p)fT%qRsUNx1udGl<syR<{0!A*dbdt8MP;{pa>?k8T&N
z36U!f9{v29talho0w&q=-<_0}PWgsq3yHh({q;@z-20c69Cm$E*&{UcbR4;6PA>){
z+U2DPi%8=v4rhc@U>pcZj?z7X<-kOC?=bk<oy#N;(zHlt2-&bSa&te_0r{I}#Ke8r
zI*=)seFj|TGXVi|jDAx)NLx9Tn}b=l7x;^ahfbV7pJncs#%7QJBz{Xh;PuUyB2;^v
z3Nu_>pUOcB7o}b2*RSabEC3u+qu?1<Gp~EMZWa-zxA>KibBG=JG$DP2^>o>sXQ$tI
zg=+rC*`Joq+vu;>PxEuwm>1>4LknM+hi4Eq2Xs9>>`g(y&tV4b*MN)dTKC6e6rxSk
zRU*vd43;fR-_V0YC1#=m5=FX^!itMN#n)c01%#5J_0VHve0rZwa~rlcB#m@q)Kbkf
zT`e)W1b@!{U4PyZx90mU&F;SSqs@9dXB3GtDpv?3DDqnC&}EN-6c0T%%MU&j9*$m=
z6o&AQ)u}BU9S!3d9wKRa7~N%39ANi@lyKlwH~)}^_<#(z`OF{qe#<M0S%<0_!H)|p
zg*gRi4<_*wJW>Wk3Ny=y2!rs&(XA<}(G!}=z%aCukby{{NzOX%F^6vwVP83dMgfC`
zwOZHf{5Bxd`l$t1qU~l>1#sVALY9}g?eY*R!ID*@@VKD(Z{j^Kf=dkv2kImsG!Ul?
z>_h#F$k*INW8&el$;wJW$0}w_LnrkcCLeqMyShwH_cl9XbyV9YI=AZUpbpJZ%t?3v
zoxMzif>>iu%Un!4GTLbsO6e2a6bxA5mI8<)fHT$mSQ=STe1q7*=8e10ykzL}0-Y_d
z#0Nc|{<@GN2}Iy%l$MVc^FY_k!T+ZPs3kOnfQ}Amrn2)J9cR!Q=%PFo$2*gW;HHDi
zTw=zUS#<Xq-m6({vtg}~B@^jRd~cv~TH`JDAX@MDniJ0jEzi6`m4sYE#bvbLFZ*J%
z7A?9zznf+9m6k?C5F)mu+}~d|vCX1KTg#SgYdRq&^UBl?#TI7*!aiMnPtqm@lEUDV
z{WJ-V)~$`ca>eXv4kMHR@C3@;T4@Msx}!QHyJGxXhnibf0|tg2N#1lcy|BXKTWI0w
zt>KwP1wqvs^Agfb)Y}Z|aLB6S>d|fA%zoH9t~69H-(zKvRA^OwVL&SGN${sUuHrmS
zx?F5R;#+|D+Cx(!0v`Bk(M{t_1x=_}8@$_VZD`2pLMyx2>1KDHPKKxfU<6*pqt*rm
zy(ydfaJNy1i1lepnXMAPkx&$RX%T*jkOV<FUNTE6g2dDt_+3QI0D%Z6Kn&nX#NAf5
zk5#+0lw$-4F0uU|w(Azx8qR0x7><Y249aNot%1|v-ElZSdhY_*Xp?A#!dqBNKDPnm
zTi}R9;D?TVyu&F3I}#)$Djr4)aayJo-aGzQi$wYfUvV{0i|Em6_<ye6rZZX|J%1p4
z&zJ81bYFN#cp`#TvD}c8i_{pRW$Lv7#~OM)8#rLlw(*AHnt8uMJ%$Y38PXno&LO;v
z&|nb^kYk7C!eOG!w5s$VLV+K_{Yc94CBuDQ6cOU%5^xboOMilu5f%P5ucIgA?++%P
zxm8~VUl%ok#cx&M4a}!~?fuFSK!>`%UQ>Qf@7@=5|41v(;rLA}{@PXDb|f%48a>f1
z@+evS#edHTz=|OPeJBSwv+X2asQG8LOGzK&zWddq2AjQymHB`BFgYM^gWp}d=(#iZ
zWLbn;>o3mAj-Zax4KR4if%JKER}KjDIWH$31UWUTjULn9UQu&xxIspCo^4e3=&+R|
zm$rAd*k`wW<Hy2AyTWqp`YLr}o|LpLgvG5u2g_L^S{|ZS`Ju~E9dSy)21~vM^9C1$
zyS+;-PAyMhjB;_s_HIYPFk44w*>Rdkpfrz8=0YJZ`B@$Nb2k5<W!87XgdT8cHm(Qa
zw97#e>H<I+A%Jq7vL4}r9Zs;Yn?)Bc^p^RK6g~q74O-6D=5>sKiv|_}V}y$aZjw3P
z=uo^MF=-JvenzkM@7UOpOGoHf=@bCZp;4z?oKI-Fh?4m^=t~|{JjqmFSnh_68hB<V
zRSlk%_2gCZOF2iGI!77{g*XMNtZxaM5rv6p0L?ueP7zwT`ss?u#fb4wdiG07N{Zi-
z+ji=I34m2&`^|2ANaNzM1U_5TOlR+l@JTCS=#jf`B0@`u83dQW*%UjV({>?K3r>br
z$mq7h;F-w;xK?Il*UsRBkv&0F&9l`vKQ+@%cxA*)QkE^E2yWYHJ~i&Z%_3OA;SZAS
zNI`Q>C<MPEYT^|%!NPPS+~LnVVlkm}#djR?;=HBhV<jtIrab$%bij8^h7IE+#h>=l
z%V8P&o`8%EKI)6P;rU-fAlw_)CN~eCl(U7gSPBUeCSZnKzvq1r%7bW$FqbFsnbXkV
za#@=o+$+2`j)h27x{6nOj#<GxYRuRM58r=`X`Xsb9gEHoRaX7*V~KMv7;a-kYWKs(
zY%lvqq-1CXg(bC`cC)bgOz#~0V$lXO{9ab*by?qi-NuUYs?U){VX2vwDHZmeQZ5y8
zk;IHo$*Aa1<o|M=G0lf~V%S5Q$?FC6dT*4>5e4HU`Mtub1M-s!zt*Q_Mb8_VLlYv^
zT{oEZY=3zFix^E;MWF`K2|B^ppr0ahr<B<QXJkYnOOF$eYojxqGG~eaS7<0uD<j>J
zA|EhKa;-8l8}^9VUJOnk$rYgCm5(>>hy%M6st;BVHMc8~&Meebe(Ch!Wc7XSW9ot_
zx(nE%$gV|;$8AWD0{@a0xn%&=e)bv*i~bbM#SX=Gj#c)P_|w3aQ_RfzBSWc^$8vi~
zS|bV>=+>)bb-^}BYCf&T8$$9pRik2>9O=+QTXoMx$F{1!{lir4Sx&)R$S^vsPu*yY
zHkWI0Lt^C$-+Q{5NVE|th=dbbWV1^E=fBanlhpy_WAs*3oThC+V#N^(T53ja2ZI<}
zHz|72TEX?p*M&d^uclp{iMWe<Xv~neo$2cK{MJHPUXE9ksP(C|aUsY5LSQ*EkD>BF
zlvRlg7;~s3UO{(T&AJd76(tkVP`KHkaOG-W&e_DJ$}Oga&XCFunVLy>hUAnz_ns|B
zKgFvhOP^C>7S@KK1O?>LF$?R*Z@LA5$05$7VYk;UHXynMo<!HeBWohTH%z0&LLisU
zfY~A3UUKHx*iWqb*q7Q14Mo9Ldm-{ccceB23@E%m0O2*uU#1_TEruVFkW`MgCJ0k`
zjpDuNKE~obi<#3?CV*Kvg!HJH*iVya-7QG{SX)S)j%^s?wBol?%T*WL&TQDA7wcSm
zKQ-ejUTj=<(f&GN3?fZQNz5)@_cJ5w&BryB**|NtGgV%kII=-sQPXumkFDliGfssi
zh1Tpa{+QgwWh4(Py5_Y-cv4bX{GP~I@J5MECI)GV-{OFe<2gE+%>03Rrw2nG1i_ZR
zOKEdPp;cuMtJfz2%r752qh9kQT#368dH-8PM4}%Pi&om_T3kwR_-A$a&7Aa6tVjg*
zda*kk+{1jJ6vg`fvD{Xae~e+3_{*bY+^@IZIIN3A8Z&G~*75{AS3R#^+&G#xO@xUe
z@Z&e+1;@v(Z0AWIOeKsJeluuYn`><cYqq1U$ip|2CkjU%uS7;uBXq`<q~m+iFOP&p
zxXs|gviT0l-&HV8v@3q@Ajz|+ajE+0KOA=?(<J|&KBIS+l(Mj1Z{`E$J7G7FLzSS@
zdQ_W8m^1(Se0Vn;r%baDuMnNhv3+VzG0W)H=rvIMh~yY-OQ<pw4z0m=Z~iDrlKJ*B
ztAx*&r{-ep_c9pAI6#qv`$tk+NJT}oubbWEP|TGJ7Vxx0)g!uioc$Q6)#hrDjqruy
zwinAs5K9`UzWO&%HFO3|1E^iEuSX*fh{aZgnX5x4c6@0_h;XK<&h6zF;l5=SFqape
zy(SD*m0YQ$km8<uf)q8Cjc%R>vEQoKGyUav8F>hDm4#FQ*x!cFCZUP+Vr&T;dlkc|
zY3Q%rb4ct>Ky5mYI{BY0d(jfXI~xY1#L#oVZ<3_;*Xn0~uy<1fjIS>7lQ2cDWWd%}
z9AcaW)Hh^EB+8GWWAAlz*Tr}&cEh?I;pw;W)jIK{%YoBry@EDeK=4sFO1Tqd{4db4
z@bAyBc0V@jJ84p{h*Kwf@ex`be2kPH%l#C6PcGRFK0T|Sp14X<d2Y4#&Tord6`_-J
zOCx#p$ZkN`)eN)f@XnrfzGE`k?`1n?W&7m6@ZXO-i6)6zDJtJy)KN$c461t~C%P5#
zo!%cX0N5`W$+%n$E<4Kl@)||Rz}GFh5&YA(&uj8gGaE%8U#*(aiSgIV7hm+P(@EyW
z*5=2GI^ZXAu(3OSjOAL#4F7O(6c#7mA%im{qnrHJPeFNB1h7cVAzme^!co*;O3lwz
z$e?M$!YdPaIV;r)%^8iuA%Q8DqAu0K?r1dj=uPjYojZ5#z~RFMCFid^TTO6qKTNaY
zG8<sW1DDi%z78H2@#J#(BwjLuGZf7UwSh-K0t+7{*YzdqoD@zSt)bvIdmXXK<ERfX
zAbbolzpj$(CiBCLjE-U}wg~r{KM>QeM9{FYgGofx&Y?+OiBSb8kG9()&Dt-EZ8P9o
zNvE>&K<=>9rc_dbadWRZ+Qzj1cLNT4)K+y?20;7QvNDj?c|{Ai_+6MkPy$WZtN<DW
zuM@%6QzvFuz)Wu)8Q?`5S1I9XR#u}aK*W7}JM^<3?ok<fz{wBy?hW%I$X4wS8(Iqa
zlkj7=2Ce9CK;WhQmnl4v?*$>rB7!9j(bYxfc)%$%YV?;y=6FeBVe3YFIP~byL9CPr
ztEBi84wud*EdO!N<css)6c;CRE=Uy&%Wlp?GwgqqdI#mu*~k4|HUb1O%hH5TC^m)L
zqx>|0#}FDLR?2~XH2?AatAq9kekJCfHf;$efHKMQ`-+i-D$Xm@mI*beccs(>p8tyf
z>I9`U1JYkvC{vz_pAG>AVzdB{BY|+nwLdDQ^Af=(>MZ)}-Rm?T7TI>oYJT(kJ7-E|
zxOyl1Q5lohn`Imt30Tarb*H8O!XAGl@c<>27+DWePaZ&&N-T<jdt3#(*QG`=f;;QW
z*y_0Q)DGS{ng#|vxb6g^TC`{yH@bTR@2Ou<pt$e*Q2W}t$IEA2xiSiZl^}7lOfbmO
zb_Y2Yx{&@~DiI0ZUOvRE0c%l8H1yqSHLur(CMKNI@m)oW<`}jiI#Xrduxo*V=I`zu
zG(0o{ksyUqd$fIoF9Vn&(AcJA7N+i$aXzE(OS7<x7wgO4_}siztDZ|%TxsZ`oxqhQ
z-eI_Ko(5W2!bfKME+TJ#u8E0>XYv}1LXyOQtplLrEsQBw6$zMt$1q9Oc00Fj+Vu8L
zlV0!nr(l!5%*CJ%QCH~<M<tQh{FCbE@buGG2A;Ts)aVdo0=`AmFsgDo$f%Bu-2uZH
zIW)coL5ypvk)3}+p<4eN61OLJp43WwG7<wQ2_=6IB~7844lWoL&Bsmpar>~=Qbonv
zZR-V<ZjP_#S7JbunDYOxdeg}0yzlL4#ZN=IMC46R_<FBB;)C+BEpCospxze(cTY;r
z?fK(7yck&MR@f*3Dv0&pWWh&Ux;Tcy>mu=#FdV?x)&m<;_(W!-x&5?hXASx$U@K;i
zy`yg7sV^UWmeH0H`opnA!S75pOy~l)8wx&(wR<MRc*B}hWA#w7_8By&86!G4>fKj7
zq>lVfAv_h$Aa{#6exuIeMsk|;qYqJhYTb@pN&4(V>vd|Vu{I(ebSCsIJk}!*S5aTT
zEiRTJNc5DF1D06w3=tp>CW%MkWE77LuaIh;s7<Bhqu)popuPYG;d??0eEfQ@_D-7}
zGMt#DgcpoAM5N(Le6JiZfUWy7E#w?zG7#MKED|(g2iud`CE{rgEKy#Ko7g&IIf+1@
zYyptr#91X<*w#|+@!pY$P}OnaNMWp`q!lnM<~#^TX)m;ZudOmRAS9PV!FiFxrzYb&
zFaY7v<JK2bTizJ)V!3s})hE>|(A{GB<oxFmOEZTT?BmuIdl2YCfe@xRU%O>^=!9Eo
zHwtHOkC_|jrLh^XS=bdMEFw6T+2dPnTdl@JjpKpcM-W(`($y#^ZW(AM{xfp95c__*
zQGyy(w9qp3@{WmsG1!tLjWY^_cN9|a>eb1=^cOeHU$3U%mG*-pxX{w$Mn<7z)&ga}
zNnZ-W5k4=!xF`ZUA_ApSUgd6_y;2fMP=LyBt(@ck_*Pu-q$I3x-1Yeb$G_Tf^hV;G
z7^9)z`Yu*-BRSLd)g+&FBfD`qM_<2N)CPYPLR!HiH+W?Kw!zPpct<wmn3s1K(_twn
zFq``OT<5n-B)*8_y!r4g2mS+dn{?@GbUPsCMjNYsUg)P_NnpyR7;ZF&EP5EWANzQp
zi4$)nD4r$|m7b`_jV`=N-Y$Gfd6gQi4>B{HBe1T{>wR*uT3k#xg)xf%Aanwu&`>Jv
z3*fCjzAVnm&v!=O;*vDTO-oc(vu1tVq{ld7_7gbMjKEZ{)O?cRsG!byc)*`J1eEcO
zg+*oV0jWi#pQq-dZn_;BME+{giB=w7cZYM7XT}nyjh?wbzM>2|SyJ8FXt{Yp(8;(i
zI-1EjT)wNmj>c;VRL{F<zOMoJ&W&Xo!dG%p^2_^Am|&1K4;@2bw)tP2mB>s_YUTbl
zZ&^5lfXZh-C3L8yJiabO&-YGi|Ai>xr50e(P;~^8Ml7VZY(lJmTHfurvye>7H7hxq
zAg405o^;zjFLvs=Dnjo>Ahm|i#Az-gG~rK3l%r<l*}6<9X$l?&Pg;rrZa8&{_$!<n
z@~2Teagi#JWvkr6LAb~N$JKkl^}PT6|3#G0AcV9y>DVe;S|3^2l!K6wWJTFhNeg9k
zjAWH9N~p+a%F0Lxk&+!LWJUe&&komh{r=~6uIqNbXVmBOe!s@^`B={-b2Peo%4c9C
zSVi9CPtfcO0U8^%XtCCQ9|gT1|N2J;VhRodwNF5rf4eoOFHvwZ$PRB}$ID-{&2K&4
zICoKC@x$!$bMM(1TwG=Zsj}BGpQ1OT$mVP{#Ig}!YeW1XnzjoCVbgwjyidYZ6-DkX
zXLp0T{y!g*pYFz#AgnoD69~JdG5cT#uSO*)dK&t?8Tm!ACKI?c;FSGlZ#^33erMD8
z2=cD3N|+4T|G5>L5C3SS7TAe52cwwn>Y3u)R>gnA|Lwfj4&3<o2gfRNKIUOWV7|>%
zX#1zLQjYl%UJ7G-;plk1fn)PR?YX*8rN+dR?2UDTW?#PY$B@a>lE!s_Lg{s}AlT2J
z^GW;TxvJ7BAl)+G5Z<v#lY7U1m3i20-oL6MXOyXqOtnGgFA{5#91;^T%f_|TdQpu;
zmoitR5;bG!|NXa&`1Q^7?1)&1I-$$K#>vgn`AT+nCgX{?O=5<`+>P{`w%MNF#1$(!
zvTU|1_ikufe%7K@#MxF(RZ~Xf10rsYrt_(s&p^Ex{UPvdsmBCSMo21sAl~u)!)rZ4
zwfkxJsvomtb_F-zGSFW!uOhcU?l^?D13?WVv}#j?j33`(IQLw`NVUQfIWjrQJ>f^6
zxD#_%gzjQkFnp8*M#8F^vF1>}&>{&K0Fqj}%ks}|SeDEC?P`o_34-;Cw+l07GeB{N
zali0g41N}>awN#SI}$}C>YDPm1G@FU7<j?rZ!?H=KM4_%v1{}u(&_$qy>Hi~Pvka1
zDj{sW5<PRoh>d-7RP1+eXJp!o?jqEneG<9(`wymCYV-Ob6e93>2d><P%hpv#1)^!=
zZ;k7)wdF^LPXCu&>9YB#k9{p@KY?;cL2q<n(k+S>y!<62?wM*eFoDMbaIKZs=)|+7
zC4F#FTS(Oj`u6j!Q_9=1P0BpdT(&&UyGr_%bnh@1(v1BU&{4k^>>XIGiR6-GCZ^nq
z{g?By9Q_3;FI4%Uf)Y)<c(Dm-@2vYzOf+fu_ohejnz(iMW9;B4mB9sP-<CxM>DP*_
z=&%awE>b$NCo>Cc5>jW{liTiSkxow`I*8E=2p1(IiJ7EcsZDNjBTMEO$C*GX0Cr+F
zM<>2{8xXI^C~<<~1B`F!{&dx;1!JBK*{tAb5;FiOUONMypIym$;Gn$T<SlX+e7vQ}
zC7p<K%RTg-<4D>p`d}G_3eXC&rg_HkA0Ihd2>7C?_EZvdnxg@oMeI-S0b365yZnyg
z{~ugg+ooQ`TmbON8OnJkSMuj?XuC{O!B`!D!?C*?GAYajOB`ag`cVo)-KkV9{Al}8
zqc$(CTdK;@B}uxJ-|*|ImRA4IePz7n#sp1V@uaizJ|y3p;}?B~&&Y`r9VoP6?k>Ej
ztgaVm0z1pk6`ua$w`cWvMoI)Fr4(&hkkOor>4xjMhb}7*!8u|vr{njoiXD7FLIuSB
zNhu%`5~P1rGt%&TDFqh*CN+rHxk&@|OuSsZB#;mePH0Jypv_~@;c)U0DP_S_z{mtn
zX>vGYx2NF(a;JO6#xB411*yy>F?q=Ylwns$dATbDGUJ5r+;&hSN<7(vXcWqsgN_hG
zB+zt5@NQ7|P@gkETKK1n9vy<@Uzk5PNz}3^MEFPVyV}FNNw5Hzn83-~IyvKMsfUIY
z!3Jd-ot}7qUA*UB=0Db0IwwEtX=Fu16V5y>EyEA8#^JY%67nV$L)N{bNn?i)Y{-Qw
z>k0K8iqZQ;-?)+?uCJH8%~mhqT-!Ejc?h%?L=RUt;46g)c~IeX?ny?cJj&l>l9)In
zZ=9T+4kykqC>hl#i27*loAZ)HCy4-$3x|gxzopdkvd@4~w;X&CjGyGn6UhNlBrS#i
z<90e9*=nn`;kRm3j#BQD#2Ay<Wc5uh9NmrGyoalG9sS!`Y^a;17V~gI5aphXhvj!5
zfTpYrKtd@}WIJY|^F@P>HH<-HgkVtS0LG<M?EtBUA0W3zq)yG7FFTcS?gh?5wuXof
zoSmQC(0JA7(#VL123<!Zx8TAQsTdF@-V#A*rBWX^kGF-7_?%M-N`o!!Z!CXQ#c(1?
zmjL8MYs5e<oHpqwahk45uB-|Tj!EJHio2F3t3M?_?pn!x?Ay0@hmv>ivT4#KHW8kh
z>VU8V$d;_jmk(V&IvN_C2!L$<@9`mB7*0tTw6t{IeTGmC{w)&{mxipHisA*Sx(IvX
z5_5AOeCbh@mg;h3rT0q8vqI28q8P#;fC$ay0^^-8`lr=?0I~B+y8;T7E?&W%{PU|%
zbq^vX=&HnzR#CAfz@1p!ZtgnFYMFs6T!7ZDJ&*1C_3PJ)(~U&h$P_`!kZZ3-=U@B0
z$=$sEl{`)A)i#5kp?T`jr%hVJOu7<8@09S&QEC$IFkiVJ-k)V=Tc*4={y&GhuF<5e
z^PNE>5#iw0Yj9?(+CL|zD9dnyus)@!moL9E8DOd_7ccGp6*vawNQjINY>mhHm+PUg
z{xnMT^UNGRx$F5rTbQk+d_<MI94i(DWVnCf+I@-M@&NQ9N0e{%kGSd<CvMD|%(7&9
zw$Y5%`4jXU_g*-Dc=65;=%g;DZCdm=76AYX)TeJ22le*;_3OuK=7r$UzWjocjvCl|
z)yot4?=b|IQJPIX^-x4$1FU|G+}30#&Kz@Qu3k(8wHq`jWV=Vt^$}J+_{Xx-z=|AW
zD!n@EJl4}_)jLzBUz|M@KgG@XW}82r`6=5AwaoHwu59+Wx8w#eykP--)pZ|L#sy)y
zJ8~e(y306(0_D&FblqRUhUn;E=<xMv<#bL+dGJRIa4W_DB$R`C@j35I=b-nC&+Syh
zK<GWJkeq`1vCaT56GTcO9wdxR$e#*=J?hj-Gjxd`Cx$8zSxRK4Oxa4_)|->9<E1@&
zOwtY|=;6a|nGZK^Xu}ajvEdqI^tv`(AiK!ORb#>b;h`JY#Ww3+$IgK9ykg8V8CvVx
zu=bl&TDRmpkbFd-5e6aXt*D*3;Md$)vzm{np#2nK$mOZ=TIgpWiaVC)b6@;(?Dg!<
z0|y)^G{4n1W7-Bwd+r;Vpd$jt`Bw)I@|)5I7sMHy#VO2ZdtP!+CQYU3FKLsX-5d5t
z6eLvH>t=fSO;IC+YiW^(SN$%Z_pJWI`_%HAqJpD)UcEHiF6RlDEv+!s)n&J^Z(}E(
zwe|M)PRS#TEK{`Ot8j0e<nc^6I6if1JZr6TZah#VcVt%?Kujg&^Rw81sn~-)$5Zke
zVnZSRN9ZpGc=^#pS$)4c{87cVcklYN9-FNqGuc@B;+Y840z<ORK3_*`{@;JBQ~#|D
z|26w&zO?SD+koyHCxrFfx$iq}`ZjO2d7q6(#~`4P_zD7V*XFp@L{`Bt`Zod9*KLDL
zTHY{@T-c6sqzwyMHD1Yc@@GF7SQ7E(`SUe3(M6<NQD0!}EohXth=-}ArRB?15AKjg
zjkuVN)}DxH^2gE<Snt_^AsobkO=)f-Z*-4tH|e-ziElgeqDQ|v&_BymSCkX#wD?YD
zAuD!Sck#jwCmKv5c@XC;gG;9ZYPEHCzTJa)NgH~0^#m?tyHGxJz48}K{n8qWuUtwS
zpk<0J7lmcglfnP@0tgJw42HINtoP_q#iWZXBY4F8?D-4FJY~f@Cq}e79J(JM5Xel%
z*r8;xSv$gz3XzwJg(yWS*12-b(cPnO%#CZ@`o*$E5YcQGZtp8n`n4HE+1C}Jy3r0b
z6+4y7iUuyd`gr|6I`~<A>aW^QW6J>5%;{!DFJ3U9{Q3=zLHv%<wD2<`|6x?u2lf8w
z*<|#|!#T_ws@gheX`eNh)82+0a*R=(Vacj1<FS)wQ{nzF@z#DC%9wkP+LwFwl*%0J
zSdenTi9x<OnbwB1sFkMGreD8F!4bbCcLd3m3hquSX_4jJjX&RQ_nQA%(KZ``x5dVa
zd6H6%VD5jZ_@BP>P^p#HCF|kC=Arhrf9|<4$48c5=L?@3$xI<MRCbgQuG@Pw7;xzx
z*eK^gS@Dm0i^HmtTL%cR$52A5A&PTufAROFX+<ABs($XtO~mnlb5@{U&&e8;Tc@p+
zmCx<Aj`jcd(ymc9PuVEA+wp69-B0~u(io@OFjR6R*Cr{Z-hi63G*^u=<DPGz`N$-p
zmE)<_Aw)KC>O!2i0m-l4aG}qJc`^NRRjb^#ogY-NWI|%`UyGUeI%vn5zuIgJnA2gd
zUsO=jVJ^vf+jHyPZaOeRX~y5RPag<~uzv7JXP}$fnh}dl+L**#eU>?-QQV+++m|LK
zloUKQ`0i|UeRIc$YiC@)^-8bRkgt=Rz2^?ux@b@3#ixX4gYd?$Q_tyn`^l3~ya#z#
z-7NZb%|`xWY^-kH*VI;_)0<{Dw)-iQ@L6{PmNzgDxndB0+$`6-AR7qDuG2OOxmL$+
zwrQb?SiKgdTXuGKy#*g1Ouy76oT4e%)ZxnXlgiC*MhsHuv{-mn&n%(=M-F&Jl>W`_
zn~xsXb#1h%&q$q3J9ey<cdlgOJi`@4y;JgQV>~dLP#GJqzj#1#nBfq;pk`>m&S7og
zV@<O4UWDg7-##Nkev4Zp7hitCc#dkcEa_aOIVj2B7Oii7<i;^5{+$sD3t!W<)I*q#
z7X3QDeU4;vNusb^2q4q(p09?{z2h_+p&9CS$9>+sEN&4)wOXAnd>-ZK8b-*3iC6-9
z?qX0;*52w4&CMG_L~G~lf9BPDS<}AS&LgNbAZK>|>ofm>>(<~v<6W=2G|$Ms7voe+
zOPwWGBCYdY+HsOZBfUN}Ruo@FkCIbRpfOS>9`eRiXYB~hS?SlVUQH8$A+3iVV=sB&
z@&&K-+1>x`9yrl`=n?N`7)EztTDIY~>uu-GFiGuxN@20xbS?L`R{sTe5+dp-`KS4c
zosc}=z<cfi=<2ac_5ItYsiN3s$&wz@tMNqy?&5g<>0lZ5=Z)L_)at&`!l#uI)l#BW
zo9OyzUNynasDr~h-SzCLcPzV2w9WprkK20AF!8w9+mrqasWPFuS#)^>)z%%>*hZ%o
zy;2&5jL}+SdCaC;c5YeFciQ$rs8|#_2lxBitKiiFK5ZvoWRUH?9K-J2n*sp<v)8PO
zN7*0uYa4%L0>scq*BlC0E_w+!b2~r3K0VOQo5N&|KzR4Zf2j+w-j{ij9zFGt4W*H2
z*x>BW1rL{IJ=KtRqa9#5tE#tgbbJ>k+RDEtGmit%iEeN+;Cv_6%a(oy$&N|p)xYPM
z4^b2hZ2RDB(Dv|N=O<9Kf{m5-ep5W(r?O>?c}UiX2@_gYp=`<@YxIlJRE<Eq-p#YP
z`RB<s{#n*vVbKpVSsWQmVGl@~qMOrC5{Q}3P)OTFxZKLFMjAhds6Knh0_l4;DPmXl
zmI$T2_uLmi#zvn$t;6QvFsRWZfByP4Xt|rh<rW#&96PBjl_Ia+5_>$+=U2rDQ=KUN
z77hJfOyqlx)SY@_kisG!l3A|9aVe`T(fzC1XxUoscNyf@K0mwBrp|XlG#VGFgw~%$
z7s04n)ro#DL7cBGKBbse$dmKh)mOad*gy?n=%e(Te)s$H!xw0F?KP8M6xw8@j<a73
z3vK*yBYpmN+hf^!?~+Smr72kh3KxCuTuW5*1`=aZRXxm0*=xy?V!E>oWb7*-tu$C|
zGOfESVV%3x`r4S<St`jvW2~<6#!gq$S4%szp^a9dN6(3#{RgYQdcRl<rw|TNW5TL#
z4;eM;&YGULUrw^T#(IAq=IOeed9!&@F){T8jFQ=Ecn2Gh8P=my%Fl&YUL3PCRu4qe
zEs3Q|F~5l{$Z6wue}kXCai{97`Tgy-N2sZu4++j{eJHoyq1-n|&Hi0I_)W5*TC9Kh
zYQw$z_SI&xy}gf*6}-GZJsAiA_d_i<<~K&rt|e$dJG4Iw*xbA^T`wlv#?PiiR_=^o
zpBT@l@^5c$`55WK>$h*ykqwn|ghca!1xH(e7f-(X-Hg^dP9&c@M`mysqKNv)QsnnF
z?J&+)@(65R*13KEejwOXgJ{!dM`L1b@dJ<8Ftp2`GdkMT>~@>zLD%1S*_wpcS?PN;
zDXAGYOi2TSr0m_N4_a1YR#dRKvpu~QEoykCbe)5VsI+4*e0iiD6u}vLhAqYNB2e1Q
zNaIZWjSkx1jd#5>{MB(<#g`?}Ws?4bsjX%nJ90JxDty6lR>BuB66SgJ7lkrMjm&hC
zh?AmIxB6xH)mDX?_H8-;ytdFj=yt7)s^WIIfl8K@WoTv;c`!DxKAYa=iB*Q^nT?~n
z6umc8op7^JsKbB(s?f+vhCqVjk9W^(tv<Q*-bSUmWBM)6zv+GUmACR<tgdW8F#P&%
zdJ->OPUu@yLc+b0<!2{q92oUReK5o9jKZ((9n+5Dx9-F|_XLgzn>lmlSWolVwt4ev
zjNaavUkzsO_H;EnY-qgX&eK^3M%811e~9<OL3oKN<AR+S_UY4-H<BfR!!()`7he5~
zgz<g=rF(PV*jkQ>UT}mNAYd`*YA>PMpvo0)L~S*5HOwC4w>*QR6e-Tl@Y<cUZ~^1z
zW(h%%PQj%ri$wltF^n%vTD0;`%^v&xU70l?Z*S+YL+!VhvAEW|-8@%NFo`4C>08%F
z=00RV(ERAA_qYD~sp;qSfh)E*E+x8dt)PZ1Okl8);{qcJb{mu)AN%V<ip~_s3%Yxo
z<BqBpVQdI7^Tp;J9`pUdlP8u_&V_|(I33vkoIXGHek}IKAn=iTNsku}u`VVCsz0Xi
zcZ<)JBpe)JI?-kZif=NdnE4$70GC$*HHTad?#YEJ6X{ih*9$W0@j6?q^}3@PljI!<
zH7f`RVsth9p_jN*PoNr;$?DrCF02QNB(+!A+v&}t%S-uK;E9tY9v40O*{U#VlOo_n
z+}Z<(GzJO@1cbEK>CZhXG`LXXqK8SmG3{_vl+o+9j^@8Ng`HW%oI6f405>n7hsfCb
zLezGX2*Xr%S<`u&^Wx@mEuj=t`TRf>fAPrgDUso}HvI4%je-r-Y5nKVpM6VTHE$H^
z2nvnDk}<Tkk|)of-+B3p6>JrMGyLmy{L{2NQhPq`DT%>Po-`!j3lmf!#KvWy0s%C6
zX-Xbr3kKy4XjBloZBJ0rtDqIL?-fCn7Q$S|hSvq-^Pq1jT=BGWl6A3U0&i@xsdRts
zqSCf%xCaSL9~txxWieN$3K-`^zv;`DC$~Smzv!&f4Kq1p#zpk%%XQCsZUx<#ijE+V
z&<>Gnr3M+S3MS_U>be<%C7?1JN5`f}_%?3Y((QTW6e<Bm=_#%3(e~5yp1nd%4&vub
ze>G?)2N^u4(#<_ePWX4WZr!T9$NPZRC(fR9*kOooE0f}b_1FEm9wM{-4q{H?<tu$?
z;s%OB1X=Ti2Cy!x0($QjLK1kYcVXps5%r<KYhBpo6Ka7BPKgqyfzLN!Un&B~u;)Lp
z{@OCud*)v3?B$3yrb62`TPYI~Erc$}n|nkZzH39z^DR4DZLqe@EaLfmuBeD2+_<n3
zSgCclZi_}t_~+}<fgz4D0Xc4ZT8CHk>1uTQm8lYRlMxh8VP(#e#}n$Ddb>9G@gUFS
z_B!fSp=G(G)ph|ZM3r{Ci96e}F|i%pfRxZ@iHwN+4YZ9M1o=(eiTZ>yurUc2%3Fr_
zikga2DsRP+&sA?bs)dGxkV|fG`muLPf%vy(^r){s(bMzO-doY7W4DxY%&U4m7Z<kZ
zr}*p*A9d&Epui$(X-!PK(K`LzELtGLo<=rUgRXXM_$@fu&Gj@?!4vgAH#VVMuBmM5
z?RK`j4b|eDwkFB(->U7sJWW0?_fFl=s#&ujDsbNquZBR0r{42X($d!UgG;wD8o5_7
za?O)@5RaL`<6rVkP>^mg)%`+TPrJhVDXHH*9F99y&YAGnja<|B`9&T`U#vTAJ2z?a
z`}gnN=FeC4)ec%xTI><`S?|cEs=k3MR;idJqEwm4wh~F)Qaxmamh9*kN6Y>_(Qe_h
z1@q0X?_C{y-erbsVPCr&7gl_YCJh+?95!g-+->3E{z&z%yjTbjf!x%ZXZqJ)bz~w6
zPT&C2I11%N^5tA@h;qZf-vInj*c&v)mtLGaAF?TMXWr<Tdm7CHlUsJ`*=m^S(czb;
zZ6}Z}boXxklwk|szj<Th<g_hsQ9=5kS!a`~|GJRSd{Nv{ue^s3L#QG1<_2Z>iL^U)
zrYgrW-k0AuI|w`thcIBE%N{xXn-y&y(#a*bIirfSB~Vx~QZZ-FyTgaZmuLQNaMpLZ
z{qttGyovs^W6zqj+3*YCU0JFi+m8=QV^gY@ul%s$Y0E|D25<pmYH_OLyq8)|b=R<F
zr@NR__i|z@Kkba%4aD-xvW+iMBg=uu^{ir(N)b&KS}2b>&SfFKrr`8GUl*KlS{`<H
zppSl!#~!souu49C@9Ox;idj!kXr-&IG@Qppn`)?;Hy@+5sm~F!a8K8;ojUI1&=sM2
z6<iFV)OyflaL?qI4OEmaO!}Z+8#$!};~|$XL<ys(<vRG&wI1EN+357=C{80Sg3=DV
zn`&X7%og>)M7s5D^**uSjMJX5Ek)^%M^N9`^0X(XWB@`aM@66Go{Ot2OX#bGp7;!s
zAl03UCDdTaFNef2V{quu(AmSX5kasiHIWx8U`DQ8yM}uryH{5`hnRcDZQH(D{eI7X
zM~wE!?a+jFh<gtQ`N6vCa6!%DFF80|$T{klNBYsPrLc&SLr<!j%ai0qXWkr)EMrvc
z2OG<b6=xQ#-%|eZ=iaikYYxUA?REM)Jupg$cu`NYbq|hcg0GtJDE;%MoH}yk>b*No
z2Syon?W#@JE34qZwn=XjI(kC@s@PbQN|x>V$#-zCBk4D%#-AC#W_e=$Ew&><;*{?4
zKWjPpfbw>@ZE$OUk1>nf92~NJ9e??iEwbHbRXs$*>8}Yy{tLwrJFGPZz@>N$7NE{>
z^8I_;^-Wv0$VK=J+fe3T+a`+ae79DQkUvFU<5B~yf1(EI^t91348Qtqym1R7X{hbk
zT4l@XcV~39<b<=jxG|$ZEoen)bssR1MFtydx(|n@S5`up+0S5zyLP)ek)`i#i9wmt
zJ<>VJa>Dw-p2o}T?z9{0z5M3Bo;$6RH2<~gI31Rg`jKI6zTLl91qQbWb$t+Pu>3+n
z$;yM9qmJ5nCum;09VpU`gxW{VQgC4O{q;E)GY`Xx(isXJj2uwpjpPR@zijgd49GrX
z;Gp3+#5DS9s%7@`k|0wLsevZ%|2pPo#ImA+Y4nuNZrt^F)9S8?gbi(7Rh@MzVETdu
z@lU^Qd9-bLzr>_vP5#>b$@E91ZZnzI<JJ4g-Vwu&x(*#YI2}Wiq*B)f4>h>GCG)A)
zNcZD?{zV4Fxe@lWWrPx1{6JXk`k2;YmBQQVHu_U}W@s)~w9dHZ&|)}l-g4ZCAlk*M
zR^OCi>n~S)%JIDE5ExO@bcA*EPb=Ipt?{=`kMFu!)NWUpwXknNyT(CBRd$&ohz%l4
zU>c0v>v!)m5e9XIW)WRbl>VbNFRC8@wfWB*H9uz3x<zwd=eFz4TOu}>>lu#x=I-W}
z3g88+z(6kbeL1?DYv1<s&@}&NQGZ2&$Ia~98`FoX_oD}foQS5%r`FU`ak0McO#ig(
zu&TAR*NU{X)BrcHaS?+odXL&(&v&eo!a}LN`QH=!+uKJUI;4sb3xH(x#7;q%rZz9z
zJy5al^6z7;!GfDs(W%So1Mv8idUi({-cv73YeR5wR(;l55toFz%n;<i4-V#6sStAi
zZt;r4LH0V1f_K#Xp;kM?;mvfVx{-=1&p2Cc6=g~ZKBW<E%+O}XviEy5Y1y(q73P;O
zi}Mym-l%)-iAJb><F`fHp%JB}^T23rb^1f+8#AN*@SX$zbJscks~sAa`{sE`a8@X(
z2-rUAY6R7WgWq`l@)FOEBXv~QL`0jW&*?7pU$c=qlxJvUa`N&x+G`zmZ28tjJ9IE8
zJ&5t6{Cvqqz2>mvG&P${2uXe8^=#{ed6K!5_1~ii4$SHjXzDR)#=(i53~on9#I;o?
zb2xZW$^C%ky2|BExq{Emo<BcmVkfNiwbA@{_5SgBgl34_n8z6og+KLn@U6Q?cFb(u
zy2S)%XJ?5ycS*{Zpsu~cmziH2UV^!}rtEKB@G^V)#zD`mzWO@~u(s>&Mdwc2z|@MV
z4ZqPPF-+~+ck@}cLk14CroQ6C%U8>N!|U}@{<D-Ck9uv~%3-TU3++ul=L1-!R*tTX
zaz=33+eTTVH=b_RT)1YLWz-svrk$llwD!6U8<vcf7#@^Mz+Qo$6*gIP)$)gny-9f2
z%r-5Q|ISvq*r&N~nh8JlMg(DWwmoT8I<r?t>hOIq0oPUnd07f!DshI~;d|)`GF$(p
zK9SGLvtGV@IYulxFs!{O0^}EWZGy7~(MTHYAM^fRSI+#63c2b|Zxp)TP>s8{L6;Ew
zE0#<>SEiOU(rXCLaHskZFP!8Ca#8XodNvfcfoQPSJ$f{wc0!e<Rz5JK)}g!-9uLuR
z{7S0v8@yyMoHjBsFF0ClUyj<@zGp&R1bg8RQ-`IIuQu@Is`p=WI6JXlVM2yS_!TR`
zx1t7aiNJ}q+ECOL_|v*0fD3|OQ$m=cR-qMQ52I9<^;YC@!Ytf`^T0PIHd$S2bg<sr
zR(5|Nq0tS@LTVMyz3EUlg8kt0RJ;GCExaD%gorKe7&T~(J74f~;@4RDGU47v`pBTc
zI$bVI(o7iNFepP=X{UMk>snL(dq@h!8ZepRbq&?^+Ot}keeE*(P%bw#HI#%n=Q%HF
zMXOGG9w?LIkTcY;@Cd1ez?_QFuc~Sl3ISGzv;55ep1>KWf8%$oH|@fAdE=$Av{gv-
zmApoq_9#Yr-RKj!bLV=*FPwTV*6M~pG13Gk$HkSl?KUvm^;AjnT~|k){`^Sj=4m`9
zfoNrzJ2_qrVOybK%g3#XRP2M=_h;@LNYJ^XX3bq2)oJ$MnQLsk<Fks5N1d(>y3Fe9
zHhXpk=$P`M+>rKh=NnP)6<3{Y&Zm~qHyDoSEESU{8{ChZ{Gs<C>*d~8Uo6a9^fkY<
zna_vfi=j#Hb<_h&qQ|tmFv-CE!o-^gU3GLpbKq|Z8495Kn1jrt_paEu))z`I8r<^M
zNSLH<;N9`l`8{LC_Ig~Wtltek)3vWlO6=%|1m}pN)Ilc|yU|k|Nr+4cEZrb&9dF2w
z+*22qqU}|s!S;jW@>(Mm5;QyieQchkb{)+>*GZNmZ(Lg??cT?*xvqRYh!h$-HiHDm
z@xaKqShIXwKQu$xd#nt^FhMq|L>JRaD3$fnJU+8$Nz6IxE~OtHUP$%KkGfx}ZgZhd
zbivjYb>q|n(q|jCyi#HPwd>t4c60lTdGK!b!*Od$M<wjlAwKzwjo;G|MY~gyF4Xbc
zZFJztyd?o!MpZ<wNNsx|<;>Hq7gfJ1-Tw4rM9I$!w><JS5*~kAwT}bk1j=4$XhgE&
zwfgoveE399b^8GW<jqQ=G;d`!7BEsYo{`o<NN4(CgFUZpb4x1ZHhDNF(80vtPz`-3
zz~RVoE`Qc*_-?Ng|MU*-yg;*CaMtEaQ*F4~DWJ~G%VTuuKWKy0&<n7w5jWyJksn3l
zX1{O2@$idpy?WCa<Sl~CDLkh>Yko7pId@gdyWLkCS_3!GADOmmC;jMvUzR@_FV$G`
z#M5f`^-iBF;tLjcSZ{Kr<&DKu{_}07m=5ww(iyhm+`y9O3wxoE^hHa~sfUc0Kk2se
z=7!oA(^Q03n$ge`t_G%oE=ZLF2N+E**vf~vFek2fpMa+{6%^vZysaiuRfq!*9BS}^
zJ7c4UD$+9k5FShQN9bv)<_4Rnv)w}1^Jw^1hHB(iY4z83Pk7X%)0djA^>Rgv@YMru
zm87Q5tF`-j;P_W(8SN)BBq&6i@A^URdj`E4<hR{xQ@b(0F0^{>os~TJp}F3_M@JrL
zJmqm-!-V52{)&mqPy6<E_+KaE;x2AIJNfa1UMtSMpSz2ji5r^!0dG=}6g;F3#_oxq
z*y%FgOt?yPGAEyO7;4J&OJ>T^lkx1Q&M?WTlluEhWH0S=W{AQr{OHBU6WR=pQq%t4
zl+F^*?Xq3woj11Yd1US(i@fcXb8pSKxOK>J`zOvAkEd1a)773_v~`8)OZ~8A_0s~!
zmp+_yKmX!@a}LQ$7bpDqF~0kb{AqI!yWcIkSoh@I6BomrAcXMuflM?%eG>s1t4%VW
z3s!y_nizSY9)TyK3gjAy4e!Y0#M%88M_-LTo4QkH*dHyx+2+xv80+lf@>CZ+{mqr6
zUs-D9+uveRyjgfwB$4ide^9#MGAaJ@;oPVrYk&qSRt3(^?{#<K*F}v-8m8rqD6wB&
zJ*8u7^+0-3@(uNvl99$lhKIcB^~jL8BLN#&HXL{mu6q~NGu)vj%B7xq8nQAZk9I_c
zoR&3L=xWu*A@(=Tv;SRlv{tYw5TdZ>^baT&A6RKVnLK;i6Wf1-vpD2KA!hVo&@zC}
zWnIZHV^IRvrM>Yo%icd|7Jxo@I~}$RqVZ3jf4^r9@dWeqs(p4F^(I_DSiGhbr3Vna
z^y<^*V3SZ1Udb0#0W4F1B+G>$z%oYnh84GiR}LOH&=->g25ls*&a_x{RR5e?nf?U|
zzfI{5v-Hn|Jc`}SEvYBn5z}dGQO?nT33w;w=o<d6#U>}=+d@K=J*rM^M|J*bi)rxh
zT9E6t?)E!<rz$M^&fh|D_AwKrKUxcdEEBH;K_da+$0*^7QH|My9)KG_*J(F24u}xb
z9LPByL%A-r|M$A+o8hoqjnY$RJW$cNWUs#&leR&yE}o$Rr72g<CiIo^W3&tmG*J)(
z*eek;Hg)@;@SAJ}ReF9Q!d|xx5a^6Q)Xf{X1lM{s`=&#LL)Keh=x8k}5J6KA-~@n|
z(@?pfCy`E2W<k<O1TGI&bRvTnSr$DO1rYlh5da5Y&i3(1^Pben@A2xtA8s5yXag6%
zLQ%YdQM|ZiParwre-gP-59gzmq<>#@$`!>EDdBaLUVr#NIi#cbvy9iw?ADwg^>I(`
zn>BEkraJzUhVY;3h}%}|8p4qRG~>j527!|aRv;yoGDT=&#9!*2_<uEDv2}w<!S-1b
zW=Q40H)fJo==DzL8nEdJr?i0qBXX#8hn0-Xy=&K|R+paH(`T5BETgumsp*4FEGK)q
zHVRmK&%Hk`b_gxnMmtMA<LHej=rUN8&hK6GA3cKCF*xz0Luh->6dsQfHLA{>KVL;F
zLpM70+Ya;5CgJ533ZtC6ch@k9q7iXYrT~QXgsEy{9Id^e_pWbv(Sm;VVaE6E++6eN
zT_Uz^d*$AvzKqh4BsT^LxBA?*aj1YW!R^gkX{hcT_9~!7i<~uo`WS~U9!4#CIlA6i
zibrHE5iC=1!oyCuzR=blIu6ZOe*j)AdsX#K!mpNeM-y+25ft}ot;icM7qvX{<Xyua
zk9~>lf`KcRDK&I;e4hnIDRqof)*rm_jy}DN=e-x{S*zy3=YJPxItP2cGT2nTG9aQs
zcNkmyR|tS!mzHKBI4(8ce+1A2@BpREE~5#a%M&BH?k>U6mGgrp9xiHp7Mz%+_PxQa
zCmGpjDt*(l$yAoCx=guxK}zb(IIk%BO<M6qmKQ*hPM1AZ_~z}~7GEwTFbLy8;-k{{
zuKh=5Mlq$Dol3ls+4hXTT9BS7*NT`|gK>T>eUfD<(@)vNtA1HM%+Fs3HusEZk^WrI
z^mHRG57W{3-{xzhcIZy8!ZA@J$`TaCtL^uR7ai2PhW}s{Xop|FWw5&R{rigMW7_Gc
zE9ovSa40;t_>`c5PzAh2e@gY@$<;raft>sISsGn^%k@-|JXxMi7!+>Y*evQT&JjP(
zW~zw%_p5IBFOv}eF=MQWhyjvC$bM1lX|0?(Uy&tCC>cj)pVh0Tn*KSt+5~hx)}i&s
z4N36DKWB#&nFQE^b&Ews?8)R3dIl&f6pcf5UQ%Lln#Wb|8Jz#Ev1_eY`8Tk>>JYFJ
zH2%26G{|vD6R&EKD7qWu@JlgH$@FsH(jnnP#*ERydK-#^n5+mLTDtP&T`Lxmkk~Tu
z+MOZ~QV(lOYCd3=OeDloTG#uhVwwvwv~L8qtwZ84{Xx4*>%aYzYsf!i^<H(iEBqR(
zD6cm3xarWM2f(HW!mps^x(da<mj&*lkEVLHU2DvFs2KS<xj+V3;2&zlZs<uQ3u;aO
zjuR5k9DVdF!KZqaP^?5C;8Ofrxys@@hJ&aRC$y+y7~^0&X;Q)7x%<1Ef__AdqY*Ui
ziudkbo^?eG2dRO!)gO~b{cbCx>xCYw5R6$npk81nMeA2yy;a8u>O9f|wVKi^e}_<e
z^Vi;v>SlhQ=Jx*OZ9rb67+mGRB)|oygghC+#;Il|S3=R*K_QXHunwPt(#7^W)(e3-
zVuUY?S`f)e6K<g2q?W3!^lowG373~#yOPZ+>I8redxUazn)k_3{LlW5w++AIBg?-V
zhoQxCn(b&)4!ZQ&BM1z1&Z}L|T$wzedPBB$NCVl2)CK}rCxUS9^zxD_%1u~DEfwVt
zDb@YNg1``u#~&^qy4oLgCEPgY`h%qT29oMFm#~hKC6NzfV>{u7Vv?yoz>)G;oyJ4E
z|2f!#y~={N%ux81H!%NeWk()bXc*P7>Dqq&gJ)T4E!|#E2}@xHWDsqsTuP)IN^?e|
zx9W+NrDZEPUD}E?4#U;EcklMq-(#{3=0+E616hT_+xyqviqb6+83_?j7|10?OAdTS
zV;gns_<nV0q+5r_Ub#chyx2OS3lPL!pD=DZ9*RQ8<h=K|DKnbWxsP=ntn0?x3w<1*
zVGbuM3IddN&p4ixv__nJ2%ZA4jzu#$YtZu{>=~%d+Z>$OpKWe+y>-oDvWqLN7r=o;
znFuvOR}@>RJ#>u75z7EjrsPET-+0}>rPc2{^?$CtW`Pzn5611i<DL-KE?ecM>Siud
zf2v`8*-&civzJci9#U^1sPQ!jH)+dj0{6rsK8>^zH#axk69typdhR!~kr>sSUtA5i
zQ8SyK(rL;X+6XuE;K75vA0q!SQ6D)1qN4`697G+597v&~7<MRkZ29Zsqb|SkN{-!N
zkv+y#2QjgR=n2u%pa`S!{c&$zcOxTZd}DrmGAE~IC_-wjTDfE5{<3(mR6xku_uo`3
zXNB6Cgn!>@p6i|veg3>2+X}sk#6ob&Wx0P-H2xnnG0PexVYQ)Bs&DsG(DM%{HCp`j
z_2~7e`$O{Af4<NI)SuNUzisZ^ju1^e>%h4^Vi=ycXU_)0k#<_HXvziAdily7eSdEj
zDjBM!eJLqV6FT9xCxsg)VZZ!rNsQnul;jTnO&O(xgem@K(_j<5|0zbJ>gQbf?ruIr
z2me+R_zg}2S~iUe53VT(b<dwiveyR(>)xyMK&4352o_s$U$HP&l1QA97|Gm(2Ig1i
z1LU1S>d#bFUj~HlizP~bImIVQN^#_>XsF`O3gt}!NWiQji16$XG~B}IHyFaA7_OzF
zwz|t+v(f1u;kHrKKpci`WR^PD)Sa=G4hjoB8!c*!%a@)u1Yt6ZFkBlQQ*n@Nuzq>t
zyyq5oW_{qJDZ17^PJvYoB8IunC*Hf7o12qhiry=L<3dAiT*!G6;`(!^qeNcANq+8S
zV4aJRJ#r^RGR@HuE2V_%0V>F=<NC1rjibpY+;QvrDYI}JlYqfo*0_RvD@yYA;R2eQ
z5#lzE7hV%?LG(FTmtntt`ux5uq#T;#j^E$7{_1qIv#YQ1H1@b0{M=*SX%`bE7(f@Y
z;i=rPYxbu?MWoVm`niYnj%=jXph(o@{TXd@ILo@IGhh}3t~F8sA;5UydA)hOMCtJ3
zZCqY5!+|%T3|U2k>aeF@E21l=S#n$tX#Q3il1~ok1-k{pO^le~!6DH2>K+n2WM<{-
z*ROpMTwK`j{%MMVs4?VKq1V!(uE{8k;YMH>WF|Inpl94<0fhWW5Iw|~->5g$MlWF0
z^2R|vpRd{g1leaw%lXIMYA=EtbNT-9#nO`FNkm-wqh%!Pw-pi}AhR!f-#1aOBbziL
z;*EQ^>%Jj@(T*{|+H2YG*y|0rz^JL$OzZ><`Q_`^4Cs^6T&OvWcTN)?kvQ)-qtW+}
z-y3`T#;en7YiI#aSqEMj4ruy3gv;Ebpx$a<B14{a_W534nAKk&+N#w;HzI>7<P-sh
zr-Cgzo6@6TX>B>J>M|no>U0WxVWfm%{?TM@tylhokC^2;jd-K)T6dq>=tsBkEG+o^
zdZdKQaSRKP4_~(l$;i8R?`*zucK`xq;*b*1lU&6O%k^{$gjfK_6YvX=FcpNF4AWOf
zv->V7eeAm|?(6&fAif!r0!UR!edqf4nfV)VRam^CQw<I_iS9Q0e&KMRA0kOJTx-n1
zATyywxVH59rU8=K1kwjhDXQPvM>8My+BNJaj^He2YKU}`ss@)Jf6|j8Lo))f_d$B$
zA=)yOF>zl9e{Fj5;-l>3pooi!qzqB)cp6%@0B@g4-yQq)8&UyDSP`eyj~_qk&fW7L
z^LRQbsTdW8vSrhu!;lJ*m*_w2P=?cFZ?U8(y~=MoV88&)!`bgZ(2%ZE;}P{21QLu&
zLvs24eOhbe2<o5f-P+DMVKuzwt4d>GVtK?}5jCPg@8D~19n=FjZvGe7x0!WrpaS9X
zw#iu%42$?hDHTY({L$<<B1Tp45C5lyN~inCVbvricF(Ka)rV{cNs7mCSmnp5foT8d
z>j730lirnwhjYvBxTCBJnJO$uoY&&T67Wpr*?i&pQkw(A4t;9wI=FZ(r{U=v?6-1`
z8fzv-wYB<Q4^6_aAKw81gwD;Nhb^NxFJz0@Or2_s@oFu6m&{m8{X#8^$W)x>5I`Jb
zfybu)Rm+!2T>xyP@93iN^Sa^Jym4c5!|cA$v0Sar%1pPh0T0H^9h_1-x^UUM_{+%e
z7hLa{RG64Qum0Otrd4en{=3>ATbXuQX{5eBsrxCV$wPE{Lri1ltI^nT@Rpq&sHn|S
z5Su6cM_oCDibRk?EyO9I48}zRDD?W;_OZT{yc|n9>X-I<KY>JzH5?t6CwjSWqzSCS
zFT8jE{>-skNTAUKj6U}yjAC;ynMqPTE=YEA1*A(!EdRNp?Y8UaSi^vaiX{FU?brFh
zlLexM<oF|GG=q^ivV$n3<`kT7j*rhju}6zgfO~5^LrfZcpEvFH^X`zbk^^$#0!{rp
zYnl<Z08VQnk#KHcf;d6LOmaU~McM*}of;qki_-5ma)dWi0lyI}&#0JYpRWm3Cx5uC
zrEmSdVXkXPu$wU(RucdNMK3U*<g=<abN15y_nKemH~t35@&;Un>XN=mTA6PJ*&)gW
zON$$#(Lgs<3=5o2fFct&8zL?{s{Y~iYcl>1%+?>uLpij2xjGn?U0Cj%?wS)K-W2?}
zpWhuRV9jkIcsAcA71u@p?e3Man|jOejAP^CYe8mUro;G@jt^Pd=QHV3LI9X~BchtY
z`TirpXUAjmJ4QG<21?7((?f|O*3VFlC+BM$<$Gyr8n}_<wQ!|K`Uq8%bh9W|?Br_r
zAAP9H3yqvBOJlNi`frN+n`&emB?s#Z!P#gO1oc5c7v+OIVQ*{9wc=lpCub<fm6DSF
z(n4iogdWVU^8%0EhaKBSf292?rdtfS;rgl9uvzOxz<S|g9ApwKOMq=r!a@s<9*nd#
z8amW7axKVRFR)(9%f!2Hyw1GbG1~(eshBBn+>Wt}Pq8WVPRylUg0O*@!z<J6`B(ck
zjG-2ENFE30Csk@X?|xrSYLSO5LlV(0QCe^M_g|H>6T{em)M!D=Un?5d{nHj2wbGmV
z%~q!;K&*C-qS2M=6=o6HCFjp^e4w)cyrHqoq5yx>Z6aw1srZ+bPCe{9<!tVoOqf`X
zmN%c`@fVd?-n+2tSf3+fj|0VAnp&P5%Em#VAw3V5rQ}yijA9u=?`Z0x*9J<y{B6PD
zII8f^W^#Cel00eOWl`@|d_%nWBG?0<YjAEpW(Z>{auLyy)GKOg+F<5H&Yanndm0u<
zMtgIti*5f>hkygnXD+DU_~aTtE&`Fs)gNF~cZb3c_$1cRd*v@J?VP1fBSsXu|Gfgu
ztAxd&&ro+W0Pwh1H<9Tk)@7zL;CJ4I3Q^iou?gUvLVg{>zIsZegy^4qa;$QN`-&Bn
zCp?`OdwG598+C=qkHD7maC`Ul#ZDZ}E){dd?qx5Vb89Sl(C!+l`)NcC34(+2vWw?S
z2l75^Bl~Nmb;Wh&%+`3isi81_eA`&rrq<1%7w|+3qU%3n#$QbI@q+;dk(V3j+~W7?
z#MrUhLTM7EheeDLrQdzzZA+SYnIS9bKyhtz*8k6oPS*-_n|>bY`^}Vt-;#kv%3XmC
zWK;tN3kA8Tp)CdXUdchRD#<<+3LEDm|A_0VoJe-YyEOsuN0uQ<Skko3kZ4+Ksff5q
z`hTVqjF((?Dt<agvUmDQtp76+-RUTBo#j=!O_&gw*Jbo^$8)hsh!<bJ_-sOZN9EC4
zpAhJu95`(W7>FmuKq@k<&$%1t&YU)NsxU8tv64<?OP>seS>wr$j!sW-W;<X_7&&^h
zKV_`u_T2C*+o+7Vu+y>Sup4DA8{a|nJXE&SBA5dj!$+C2Yfyh34sy`(!Xc9iO9JWL
z?DMS=f7;eHuY&4~)Opx0oN|gzx`Keo;w{|j^q#@o#8;-9QA9|o{2Lxyh3%`@r~0dp
zgw22c@@2XA@bIhC&(KwCvtmZ-NM^3QC-7kDuXe8}?ImEuYQw^l*E?lQn13Mv-~v8J
z9Q0?Zew0MzxXF3W(-5$oRZ+Xvb&_WCst>bUKDQXd#gf;(?XaD?X4xuI0^^x|CbzJh
zMiUoYYSNRGJ%QhUk~U|Ln&?&~3#^#(K(1W|YRDuGn#-C3rFJeY6`%PT!%4sa^{blM
zhD)$FBM<kEIwCV9NJXQp;&d@afd&kUt<E2u%icaP@4HLLl^%cYlQm<fD_Y|gQV*D#
zx-(*>CH9Y{-B1pSA>qukwe8`Zg@6XSL?10f9%ROn$Gx#mMl5|qOMi=Fop65rni632
zS@e*E1GInxWA4#~3m0CaSp%>K?BReYy;wO<E=%-bu}bzN<4BoH<Emc64EvG8F^y9|
zgzXdWttUk%wi1jY!#zRcN597tUT)>D=OtBE{Ax=NLU|{$`*JRI9acHH0jZ*A<~mY)
z^5ltsDLIld_*3{ej?{qMHww-+akQTQbh8iAzX613AcT>+?&#5Wi97yELG=49dR-?L
zSuK){ME#nRqs&P~37NO@I5n~;0%4#3w+jtcZZLAn{&tN-x1?ws07C@<Ay^wYVJku3
zIP^@{aw3DJgQ)W;(ojD7GajdoNct(kgXmmZ(!p|P)+oNY@An-$)_@b1KOwlz6o>UW
z!Sa+~q=60&dKT3Y%x#_>cOj6vS{x=YMTiVV$2sLC?HBTM=qGwXnaQA<kaj`dV2+!T
z5(xpF2>tB#?(vSzP}kCHNeJ*NWac$i9`d`wh_X--Um0@tx=-5D3)0Y|F7F_Fku(Gp
z0#nJVl>j-a3r>6hQw${QkcLxHgckNtgooaBZ`$*z*;%?T=uhc1(9nTDLdtNV={(!i
zXlUR6*{(@Sfpg75erEJCpLKo~WV!69(3U+U81d(dVTwr*;$V%>7)97m!YgIy8g`RR
zG%bOg%I9ot>8z9TrHYClSNHr=#6Zs&kDD&;a_TnZID*;9v4+gF-`<AD4ndp!EM1b4
z0g_zCPb$O&Cm<i|Qb{7wHVx1h7#P?MVTbB3%j-98T=MyO*+&%nbg9^dl&fzH89MY5
zmFEWO(GEt=v&BHO+ox)o=)}JLD2Y?OvEm;TaReocC4%KI`3>}OGQt^5PLGX9VxVB?
z_zG^{y{mJ3soTPKrl!6*Zd{KG1FFf_1I)vi{`&p<{;W4jMZ3PEqC@_-Auj9UWz?ei
zWEYc>hb>43Ko`jBl7u|SrCupbwZHGj9x$dIwujYR!Xnh-bO%r*qw&Fo6-l%ET?3-l
z7s2>l_k@zFQT<;I<cdNh$>_oxB~OoV4?g3`K<grpyIo9sCYP=3dZnFvX6Bw!ziR+e
zvz_s46mildi+y$}4Vg#^#2O*1h7c{^Ps*iO@Fk#1pm<_gq`HvsO@Znp$pNZ`=-E&X
zG-R8I>Xxclwc6nzBiI<mDuK)XNV*sv4OB-t9g49J3egA<CNr*E^wstMHPXTx>~=Y5
zy7t(beH%}X>wwOb|0eM2bak*Gx79Sp{26-!xT=;w8!Rs8;}f<Fo^gA47#(=OuLK@a
zAoP0v>mGvuJr?cf?qRi-Gw@1__4|zg-4MjuIuhAI6hZkugI{XE(fp?%(wC*JM&W>2
z)&-PRKyrAeupyn!ssO59VSF^HWWuWuGgt=<cAtLhfX<{yaH7z-74up6Y#a{)d-v|$
z8wtC_K8?@}ce9-u`Hv7?(Orzjh7X$CwRxu2_gfRT*E{hDYoJNLzeT5quOK}7S|1Jp
zr-=*dFo<Y!QL~h9b?_F&FDSPES@kvLvd}zu@hBuAc%&od7O<lR!(kw405V-P0PvFf
zoM(*J5i7@!5k3<vl;wQjknXvpR1dd}vJypvz;RdxCVILz8aV|btfq0Z*FCQlx}E~k
zP)*EcfV6nbN#a}pfRgTJF%_U#0y%kd@mvg#7hdm3JipL}G9aE{&z8Wab_iu<S)d^y
z;kFe=Axl-Pv?7rGH8-cH$g07IU9d^RwNekO()4WHx>b&W)^y`Ae92S5wz1}+7r)*9
z;hzGstK-Z@-)A1Cd5Y9gM^yE#NYh7+h8nP`pJqs~y(thW3gs*&naEHzB<uyLIGTjX
zlO!EbfeiZWawaR)<K9jMFrXO=az3}1FfM(59-T?59O&dV6L;9@?$bu0Mzh{3-&*%M
zWna<|zSg{yUyDUcL@z5DL}-E6Lnq)@3ivVM<F!-QH&kBo{O*zT-Rk=D3Jz@WtU~Vl
z-QP~m{G$bsA=~yA)$yHwADh8$cfvIWGO9q|H);XreoM%|0MoUo4eGs3Gc=CA%B|`G
znk{W8r4}>If@S&gM1@@y0~X*z3(~d+kW<G@vtXh&3^3>5in+uZk;(y?b6|v>hvv8a
zB~_h1&Aeb`GxbE$qTajE8?jbnOA6+#h+P*FV#Cpg)k6Aj)Q<v>0d(I@F^S`HK)W=%
z!1-(b#jTo4T9)ZA{Ly&L@^c+`dA+W>mse4AU-e|i@ALov8h}3qN&2S@{TS{Aa4)3>
z?&jC0gxmJ75l^?pS!t}^o)<oj@owDRQh3gJlH5dYPI)o3jmEuuw_v`TD5bj)lP0w~
z)qs+(ID|-w<c-!ZUsB%cZ(mQ%JAujXho|<Oc)RCaZ$!a3aQXjw8bP<#rR1NTeBQhB
zo%fqoRxD{%*k@(SdAm?km6npXnt>k*CMQj+?~uT@-t#^;SorIom$O?cO{#78=9T%x
z?XTYhdCO7my(ndACx&+c%GTam>1j3p`p<@nCh7rs>vYO~oa&^TG6b#YsfP{FcRB8v
z@rRl0O3PDKe&2O#oB^A^OvXFW<YNM90!k&S#Bv&uX@G5jP1YF2L^Oy}LPo42$dV2V
z^`=0rqTa*vxXNnxOr=-<Txxb@`K=k0Nt7C{r<yliH{-!N=)LaNIx1;Re9t)Lot?Z*
zKmU_)`Te67x7V#Gd!ettY!W9S(xQzePcjZ#-#=>=`|3wjyT%9B(MGaRgaQQ})ol=H
z*e~4w`h<-o%e>Fz7udSYPa5E9^@flQZsmzC$tzBjcAWF|RdQU#Ze>zTJ|{cu%K_-X
zO0Gc?dUpQ8<Boj_do{VzAC2gdF}7MNQlL}&h$8Z?`=Y~p#@qbSAg%WZYZSc+QVJ^C
zbS3^Tn#X6zn)`Fv$U0{U;-jL>mo-7hFKP-(o&d@_Dq+dMso8_$$M>F7h%<`Q0{KmQ
zhtEBheohR1Wk!!|X#IHrfE6@y``e!hZI{)Al!)t&!#lR*&ikTvS8COpcD{~X!PeRt
zPKqCoOmy=GoGi#QYtj734yWXSV=tB_&F!A=`(k&VWu9h2TvTFtV&_>adFriNwOX}n
zd*RuMe9r@;fM3paqC|F2SZ=NMcG(l}hyI%Vf@Rnh3<z~zCN`0%F~feEdOY({t-*a%
zd=0`v>@!e2q&8OJ<){C{{)O1^19uVZbSY8N3y883_>a>@DtC|aOU?FS&8A^t=}Rxa
z@!x+f=BUl;9!W)S_p76Z2VJ0X$Bqrz|CAdQG4)Y&fekA_?{(MT?(uc&?$WX0yPN!L
z=lr;RmkZ8c6AdO;YIseURo%Lt-GJnIhu_zJ=(@kS-u+9b20N*LTRGRD9Y0m~ZE|Hj
z&C+JK8XK{pIyAk$SL^Pn<UTw~BE&PL$XnMbS&c#xcUAWGpABjp=qQdD0&7$4{J+G-
ze=?JBVmx$Uo;Pma9?Y>VGsJ30VpK9Xuv%~!F#8cvpDHE0L<QZs^DLqF$Q{!<_wN1Z
zRdJ}r?M<b9S`gKP%uV-RKijLGVfK^rem%MQ)H`hP<kIWs13WVVmiWypj_Ds)*?YcG
z?5n(}mnZbxo%ePe6*PNK@6DC#zbtCHo|{dDdv!p?imh4GA!M8V{aCaH59j#D`Vg1R
zzzdGk%`ujr$Nlr4TILT{)96_3R+PnYUBcrKu0&sD^kx0(FY~Jd(PGFfJ<0K<X}o>!
z-ixCrPzU2Ikx51Hm9<ti@OhQ-DKYf)oRc5BEs3uD88~Y8<Xa_GlLGS#Jcsq4yrF0|
z;=o@Kw-%ha+-lygq!D9(yuElKY0<^f9(SuA#|6E8rhjz#&Dn2f7u=0X+)``u{Odu<
zx7<2k9N%R{zWt6<xl_6zp@@o&)!@SAPek23$=zjpX8$>hn!mAITd6wOxW$gUTd!WY
z(P6R`D=~bX#c)^ZQfj|4qVe`JwKcUn?KxGg9J=%xwics|tC!Y|F0jqD&ij6<Aph~k
zX^R$UE(~fB6F2wg**#WP>#SGB`pqexdu%tuVET~uD+x?YC;iWL8Svvx&vt*BgiM81
z?q<_}Rcw@eZE9pk7H8?WrE{hoPYZ0=66{k%bl#kWsQEqE<QZ-jf%En%R&8vjI%`h7
z)k-xWZ|RSA&UzY`_b%cL>>hceWZ~pIel;?iz3+*kE~hw6CmV))6TO+{x>rhn-d_y_
zv%gF3g_Hjz-^gncU^{%cCLh>Ht=50;$!`<e7`L#aQUXL_-;y@ou|}f~HI+)-uT(kQ
zYtOe($c{fAIR3c)r_24QYy>q#b*GdHWRf1;zuTW*msuOF8XDW#9#``k1q>#;gYdYn
zl79@oA_}fO8x)FZ4oAjNE$fnICA1F_&a5D|wz$;;moHT){CSYfb!x=T7o|UVMAzT4
zu*SPp&-M~M51O-!t4#00$VeNMI;VTFQ_7Df@}rG5Z`!2HfMGJVC3VHuexILX3?$!T
zED;Kefow%lzdU->;%$^llRqCi{b;M`P1-7cpJo8t<rvbw(1LRJz=fe`nsYb^T5CUd
z1r6Mq<+?WA%4hzn(Wyl%>t<=yK}^hfekJnd%j(MoyMhLc-SeS1Wu{3)n{Z2`N`18<
zi`5KWT{;@840)n_=|I9NcjuVs$y?eDJkasWoudhl#!vLHnZ0hDcD%9a1hsl?HYyou
zoeU3M6S1a4_`_8`WqooZvfK1(Y|`B9?Bu*l{as&nxYf&i=GIR;UoFccn0LXusm3k3
zv9XP|^mh93-+`l03=3{MK6uWw<2R1^UT)lLH4B241n&hgzPIv>v^V)nLcWp`Nim0p
zXlQNnghlbZQhARFo$64`gj?YRPL1u_<-Zd7d7EQnw`gzbiBlwIC>{cVk<gP67EzWk
zQ1N^UQ=(ROHi;gY_VnpfqpR5%GNQ1-i0c)!aQlC+E;_AwgSw~Qni;>{Gwjf*!`aWv
z?ZO~igFsDt^=@En{G*x`C?u49odM1(8SYmPIt1wHT=mvx=Q<0Px+%P<NwFev+6>87
zXt;*0WHDZC+E4d{@rTY@hhMGnsvEa;*%4e6($qMg&o}VEsF-OrIP8LKtj`_Stm`~}
z!h~Hfmoy}JL4IPdt~0+E*;+jQ5tFU5bb%6Ye%#%!D=ZN4yPekRbP@G?%N);IxR!#*
zy`wyNU$yG$^qORcO^u7{HI0sPsp)Xc4&=CHZ&o6t>*pkE#lF=Z<pF~atj2B{UvZHG
z2Je-}k|jY7CZ|eH@+zezTG*)XSz8M)@9+p_H*UGXnKW&?{l2c}-j|l%{E)A(Xs0#}
z4kuaLc|Ize6CERmUSxd*2k%_JaifcI;K05Dn`>`60{?dEL+-+}JD=7Mb{hj=KkVzO
zOMgD#f<%K3iq@CL*F3IJXZm>8q!$a%`WqV=pS&^9X?mE-Zz1BssSa1PLiXjpIp*kE
z>-<1py?sE)lrQK7$<`$6%fo9RA5gvKvD|s%HEc;I-#<?~Exf(L?sE52UI~I*(>M|p
zx=t_{_Rhw^52vgPcH95?c(xam=7=^5MIuqWVH#<rzt_Y<W7}r|yrG9#<0Jpkw|hPi
zl3Kzi0e>ZhVJhDBFbzl|2N%9XsN+pEtlK`zi_`AaJU6_9Llz%9uf%A};Eq|@&j$_R
za*7|cFK7D`S6N2&P5w)0d42Sfe8`!{&B8D1@QNQCaE$5Lj(=ufJM6#Zt#ImQGC4T#
z;C!=FDP1ji(r$BJ_jxH^4!6@Dceh6{X#zzlfonE{2d{O0YX18r8b@~>8kbkw_ue&!
z-KX0OxQY}NGN}%Y1pK01yBVGxjiWbB@4YIK?$p7=Q>g!EJ&)da2V^GT0Oym8s_<A4
zKUsbApXZsW(qPll^s9q&r*AryFe5r=rhw&rmc;I(Z7@F3?vwke*?FC$Hbkf-u*|&V
zHzo{3y7}#!D3v5=fTIgpa5{AL=%C3uck$igj@6O@Le#>-i!c}aW>nXl83x{;wJ{-9
zQ^^)lzht4V?DBTQk^u=39TD);BIM-ePWxJ6{^tl-)6OjSnwiP;;JR(~+SWA6iu)4>
znL$kM!#+%a+5iYH{DFY2%1THjg<h<2-^N{NN_x|m2KlqbIyz28?+A}!&vXg#;GViX
zoa!G31I6^S9kXLGX)jr7hJRQt2kI-C_5(y0czP$STQmv5?Y6iN{B-Wj8OeXZj$*gt
zY4a_l4)CvXdvRJ@EOu*tLvL(Eubscwn7pNdKO=Xa_6Q6|yQi3LKzx4pTMV`adJhti
z6IR0(eyY4yyWILm)>IQaUPJFPf;E+<`dzd0;byuGD4rcV7wVKY)=wG#p!GqFzl@{u
zfopuR@g>P}m*Fukk1-ql;MJ%jci(<pQMByrqe-2e(nn5n)oA;BA?{QQ@2p5H-Yz7l
zh1P`j=Hx{Pl_|5$X?!eNuE=GeIeFVkzKBd<_iHOjD96jl`BUMa#+`C)C~upLzDGXO
zcBT&P@nnsNVZVHtN|f2OqtW+j*w=<<Ki(Yqen6`i05kHRh!h*GZ6R+sar;XK@MM6Q
zDx+WFXbo~If8@WfojJ3`_BzIp^yL2a=}3qFh^t=pat-P`q-urC?Cok2E+6^swuzmZ
zS<d|}z!9+={eY4$Sbzv0g;0S<DSEc>`{|Vw7?c*moXU$q)|3v0bi(yU3tAR<g~xw4
zYX7CFuCAMd+J+Y&+y&AS@(N}y=M!f(X+;{-#yCNRwu4k=VSI_qY^bR@fMhnMSIaCL
zp1zPV3wyC}`Fr*lOgD>g-ge{np%?y)r~7Q&Z1eVGX3Gg5ay%P=P^P|=Lyn@dlQ>H8
z^a$s~klM6ownW!=&fd|jKC-jR@G&)n385I2+kyuQ)rGvI=<(^1*Z<i(%qcxTza?DC
zz`=u0&bk<X`h%3~w{LfO?R7J8xU9NGZ|}NH=w9=@xP&twu3DoVx~A$;w)$5-P0j<8
zM6*xiM_D+I8{DRmK$6nYkUE(AVTpQ(LW0C4nncO>>95iykVRQ&9fa5cXv<*0+1v9p
za$-e`jSb|2A#0YaM3_Ozl$<+z=FCG~mFXt~A&dzY7&qo0be9B177|NJ2QRP_-O9%M
zXu{V@WXk!a+5p_33YVGuWdCl(I7oU-lb{fo;8h9M6tPN|F1HWrg6j#W0(eQHpAldk
znR(sEetG16+N;sXc3UtaiClgEsJ~eOd{39$i}H74W4m_}Efo|*?XX!BV9PK<lkuKE
z&0I-|NLgD7&6|6y82MhGIz)b%sj1tQEqBwa2)JA0VAAqVRo54Xr@!==8rqwm%Kfjk
zDl;t5;FQ8C-0zXx;5iT4gkN3%v`*$2S7H;GF{>%F=#=Da;L;v2XU?(1A9O|?(Gh%1
z=HAOt<TQx~&!2a$p}xL;)7(is1i}hMlT3`fAi+S#QxQ&zuY-vRWhIlVTOb+uwK!6A
z!u82e<<S-j<~aC3x-^1>3)F=L74l3>#X>N!IK?gFdwtMmLT4o{gY}X=eAUOsydCLP
zMOwfu0F6B>{vwmAn!=J$220ljUMQCXOH`99RLW(+IT2?OKCcbwmkRKIEGq@ni0X?N
zh+;OGV(8Etdn@xKX;x?}ab6A@lqLv2jEBgG7!1R1xx`@?T3TAFyp53DU=9UCkV=#q
z<H*z!Pz{}#Fjo?FviG{jzi2%qJ&8J4@J)szrsQ_-(WCfcWhs#M0Lo~YigfBSfaW^T
z8$QZ3`QFS3mP95B$rhMFvb;T^;AP~Vu#cj~K}@?UZ&nvw5gH?dd$k=YJlsYYpA<EN
zE}vcGe^t0J_Ql8N^c)TcEtxPN_!4Ife${;)R@~x<nY(Q`iIfWY3%{TsRq#5I49CRv
z;%I}5ee6^HYrDm)MznYy@(Fkc3_MFC$p$BiKe2?x!1|6!FJv>8?hz*zsblG2LS^XK
zjex3nE<Cbs`Bns1U-GRyWOZ}9*e`0<VPhAR&WnE~M6~MYl-!tw`D@wIrMfAq9L*X(
zc1eDWq&bLKEd1&T2yxUKxd#`a!AD4y_0BlvVo67)-Ae2f*`D=bOQ^Ism#E(`m<ij2
z!lvp=Qi_1ZHOD@3fOT)XMcp|GAA|l>i0?8I2^b){qv2R$!pji%NX1QFjh>|OJgu(!
z+SBvp`b!&|3g%0oOMR+(zUB2}gPGY>r%{Lszzvphkp#pu+!As>&v+T6OF1cuLf(PM
zUXl{x)C?51bgNjOIJ+Z*?sE?c0wpRfc<q)cUzaUk4tpnQUzCi<74g~vygk?M`}ww=
zn!5VQagJz0#el!Ts!EsN%6DZ+vH64?;@_<z3_?@<COoe>YdzZ_dXD^p;tQfTzPHJ~
zH=c}|3>EV}h!Cpup#BSo0&1Z#lK%$PW4!ylb}{!-Bnyf=Sm2|WL0PC1Tu2bUa_iP*
zzWh39yHh<YvpQwEBp**d-Fw%~w~|sz3X$bc!oqC(_YWwqh#q0e>yorZQb>rUDME*G
zhFa+aHw9a5+&cr8W#9|)BU7q>&6lxfze@K=smu6;q-3AEX8#sHbH~6g^}TW1mvgM9
z1P9vr%yp$&9DtN8gHKlu52(-0veK%paSIo=Vphtf(_zOL_knvxNrFR>U9=i?eQbn7
zU8ZxM3e%PV6d4L27>mG_d=n`bq=Eu1fG?0D2BwY#;=O2}<IaCnXM^@xSE2UQldmY#
zd8ZL;D8b8&c&|@e3{%C7O>K_-AXr!tIDNd{xtNJy1FWr;Wb}mG7;KK3rGYPs3z^B0
z`b??IVtZuYZ@Xo1Xr<_{<=8}5C@KoQzDu>xFN+~nWVa5H-5YXK5leEJQ%<rpBEJ?}
zON>d)<EBi+(QpDTMFgktyx5)6K#1d6Q^-SsP@hRQ9Xi<QBaIH_xYg7VynR_PG@k;K
zVoo8=zL5E{4-)6L#G6q@j-PCo#ZSUwLf<TT1#w*rXT^XdZCQ-MqDop0m55dK_u(Y1
zDl0LWkHLZ(q{gLBT_*81Qi};yf9sZJ%8z?-ct(^VCq+^&d8ouQp{J7lQA}!H9|~P9
zef=;gtym77=bb64t0eKYLZqVh+e@h`<2Qc({Hd5rx?x4<^IQ-;SA6L}6;3|CzNlDv
zWM>%6jO43qTguNmocCPQQgw*_g73Gz<K@q~J$<VBqMb~^!>=n#o`}utFT-734=$*d
zHOPUxf9S3LK209(Yfi&b(<LgNhPUclKD13(!ZindtLJV6iKlU>6V$j42m`C!Bd`Ab
zw6&d^Q@BQUazDyzBQjK3W`nC~Y9_64plHj$TaOrI<EBl1wDT^^`^ZuWq2iFK=}1hQ
z(CkX52K^~kDQMS*(92mf>V>qWZ3oX4caKmU>;=P30}C0~BD!Xd2-JK><895kqDA@J
zg)6goIb;2TQ5hk5l3fSEVaKs=GIYoE#C|rALhPI%k>Cn52rE-wPT|yy3u!6fFcZT!
zKxP$*2n9{f`-PkS0dW$Igh)Cev}yzvnT>}VwrP`j<|mSsz7=;z^O#Zr-XwXD<Gq>C
zpdc0Nuft-eQgd;nYU%0*6T4L_h3TxJV^3fZ6)2u>BZ%P?wv>}z^}TEh<vsj<21Ut?
z0XZ!SU%sqMA;<g`Nf+EW_<%ss2rA@SCxH&pUGYh3EPm(YLQb=?j~~+!%9Tr%LH75?
zfo+JD&63ncczzM2pejrgMK9VPQ6`cfQ}+3@f}QGn{jyB?2Fg+j{xE$E)tNXK<@%K{
zbt<9a9x2~LA8wCjVw0pwYU${#k^DH>lyZIvIZBq?x&{9nEXlrgYrxW_J!R=08=r9P
zZIyP`t&W89aW2Nc9n&DU`@@^7Yk*5i+BI9ID-s(w+V)>Wp5O7S*@zU1$8Dm+W2*A$
z*?oDJsi;a3N7d#C>o6g)0gA(f!!4RLvZVg7L;=ZYaq&}dHLu#X3kDSL0Hf^g%!z!l
zuopxA*7J^(w&hQMNv+hj?d3TaxUJY4fSRgGSn;A5Ci1ibENLPslad0!ysKG>!;Xwu
z%NlNZIq%);*TPt@9~<ZNwlK|K3PK_;#2Yr_`c@+gi@VRxZoN8PBI<}f)QoTlq*{;+
z`0>Z@_q@S@fM`=giCTe3wypj9Ob$`329lhLi#KC);l=UXhT`mDEgZQN-)75T5(LBe
zZMCFR^C(KMbL#u2j~_ia@wmpw>u-t5%Xh<u4G&MWqYRZK3d9-I$xJ%+`1pJfJzmR{
z%aO5pkG%c6hPI~KVw$}zqf0KC@h@%d@Xu-rKgb|ZN7?>^51f4Z%TwYhd-O<p`m6Qn
zBMFH<bz7f4y~yNdQh!=B62!m{8m_tKkQ=_RrjZMsn`mNwE;)Jq+<}&T`({77`zQm#
z89f6Dc`^cAO0(LD@k6N>8CEV(uXtQwT5Au>U$WB5X37-9mEPXUN?aF=EJLFE-S_|F
z>d)hHZoBSrJS0i;KoTXH%FwJbqzSnSxhf5sIW-SRMU&7(WmXxBD>M&Eh0+{FQi_r$
z6loGFslIEU>3)8{k3XK*b6;|HzK{2D>|^h>*IujE!_yOd?V(4TP_BS-A$-BaH}}ya
z`Xrhn2FYvJWU8O}{B^7<=mfyDSy)IEQdjqzrzui>f_6b=z;B_Wj${WT46tJY=4YQ&
zb{z>J2qVdBwo056M<RwMSlO2z(tZG;1oz(DjggNYtw&s4bw1_STxf<PQ2Ghv5K=WX
zG|S5G8#|FwaY3+bMt2D=?5-XvD07f~C=+WRj%*O77K9+kk){YF%%D9?3ow~pFM=`<
zxw`~>u`z&)<cSl<G&wtyi<499=+WP2hpMfnfXu|pSr=A2UD8odeq!Nz?r~lBPu+be
zM#8PLmo0%@7yNDlDh+*Y%HBPoxlPRFvao=94qM2DZodJh!3|*=3-~(t3$#pTp3<<E
z?b&?k4w53|fGh#6tyZ>8FOSF9emPv`YpsVon~Y1|Z`IHRa<bsT>wlAQ;9%N74!fus
z5XrhckFlc0!Y?4B2V83q7CnRbfp7}tO(7_zK)$xHuy961p)iUwpH1KP15bc1NUGi-
zP=Z<Hdx?pVZzBI##;2+|Qog0*UTZrm=g(1R(3U~9fKm+vRnLTObi|uNxjN%%{=A=5
zIuXYQEoB2KZRpuRf_&DstzI`AQA1i<y2`r5w{HIxDI6hk%hoDriVF~P3$mQ()xob)
z)6Ff-{3g%;A3D;V`Xb3q?A#KX(7_A~@?qRaosQ>1>`sD8bv5DYUUT!d_V(!@py0x|
zDt(^~9653rW?sHNK0a*MJ~THk1R#xHZ~|oDW5<paJkzZ$(Xotgi7eT%{P2^motrlY
zq=#jy;fKne*Mj;E4HO>bCuFt-M59bt4QeSsrg@Zi!ig;Xdpd;faBl^8n^_~QdwF|v
z4V(0IcEVP;Px=GfzicN8lX{YcRj<2ft?@EkAvnW=*Ge^?VJUU#`S1qsW5>)MNc1}9
z<S&U{bO;e1HWv#ObnJd_@H~82*m8+43I|SOkiEwFiUAe*{TV1H!Um3FKd2N<iua#S
z0vp&}_VtZb8FCjyol#yPOPt99n56MZQE*5I+udC&wEV?@PacQjB9wVyT4ASK_V%=W
zw-Ew`--WRFlMXsrrp|v3guku-{mY?HZP#uC2wa8NGL<%Bq%B-`?u>L7Ly}LaOIn)S
zQ*T{?%*l?bVwvQh)yoE)O>-UA!rX%hBq)LwAH4x)dx!A7;xF4Rq~k}um497b?aui#
zO5pRn?Axhri(VG#J2O1N9tH)O7=4i&e+mDEMHFO0wTk9uYDplNvQSIm^v#<s0X<)V
zyAHoQUnOXoCUt2$Sydvv!<1^K*(%ME9j_m1e0LbJ=xA@3v&$$|IC5;MpYD|ofE|?4
zl$W2r0he12Ttb?r4|m*^_v?Ic1!1`zgczEyaEftNFc()ezSL)YT65L^P~X*U<s)wn
zo(}}rzhlP_J;rxqge8$CCmjZ9_M!QN6Gw8qRM3JR(|FB5W?^NM!k3TOdQSh?n+hqP
z1p;rYeLKpb;#^#f9b57PP@2l-Cj3fxDc>B7{GJ$R{Sc*o9lxKwI@De_KK<9$Z-*Fj
z+7#t>KYMm=dslS6QIFxJ^>xZ88%}>oSKzJfe0O;K+~<2rnZ@=$?zA%VxeCVZFJJW~
zw#w=;I8G{MzC6MY@Ghhzb7^!m8fLK2Vqw9g^hpp;NuEk6br9rzBFd1y5~x_hi6aQ%
z;kE<jIIu{9XRNu4V2bS-Weg<i_?0VHUJP)~`fs<r=;<^y{5_*Xh@<0x#q8SjEl`5O
z_})w;G|{@te&6grisa)Ys_lT|<y!=!A|85YiL5eF5w$Hewf$|1(TjlH_aAsW?K&p2
za3W)}hk#>&y=104`un%)x2Qt~`x$O-{X&K^Jk-9SKRZr_&XZ`Gz1$fW9<wWR?p(4n
z1M}OV7z17e0y46sSv183u6@nyaL}o#b44X6rL!1-qj?%43LyZ2^-5CNsBn<CbLS+1
z1f;NT-3*rL)2GAIk9;-t-8mm(A`4drr?MV6og}l$La7Todrte$qu$<&5&v;aF_=dK
zFsQHiL_{10GbjTM2w>SJ$&XqXpNSgfSCd43d_3b-XBFQ`$<;DmKf92BxFb)nj(pb8
zFb@bBWoihXxJN+7s_T1uE0{-Y3o2d=&mdEk#Ym{_>)+eqNr<cjY3+1Wmu1Qb_M_L!
zf8?A7T1QJ&E-Uy;H1YLZ;H}GvYYVUUN99kas!Z#nai?YrOAI6#GBThI&Bup;)AYkv
z93zWxe>tutp^h)`!Ee+V=4ILjie}&#P?h8J{n>KjG=n}2;KwTI7YP!yx3{yfV8@f<
z30o4|-wCMIpPz_OUXGnrc4wz0x(lEtfq^%}E%%hGf9NW+r2xF(q{|Y3q#w1E&C@S)
zI5?<)Axt4jq#kesGG8D8(^;-<@)f~+q0tre$^mjY`D{vA1|Xo!2wI}aGmIkPM=6NW
zgC+b8z}<PFNXJ4LDDsu$9gq+kz|>hgO)<ekGZe}NW*a{QHA<|Lm96pmMhP7D%ok77
zOWRL@nTQ&8v(zP<?;oFm50LGB{a5R~=T#7xC-kZq7iZ6h@u;bmi>;{%Nvxd^Xnbh0
zK>osPJ!&IWR##WY3w+0l(n&%Gf@N<?nHlhO*w{3=#?4^?oumJ1%O5tfKtodzTGsvO
zrI2HoqTf8YM?J;O3SRasw8LnA2?axD-eYV%VsoLNgiN1W%MK0+!(3w1XbuUT_M~wO
z6%dn&|8A5BC#?Xqh#qAn6JG>*CO1-1ko8IBrR<KqB|7I`7%U=x6&nn5p%}aHE+-TW
z@3MV^C}C()Kk-sHR0K*<$+-`-Yd>;2u@TBC@cZLsp~)ngvraZL)Ahtp(ubnZ{1Z}w
zI%^fCXjxc5fF$QIQ$0U8-!8zNxC8(Vd9(oVw+%E|c42;`WvozIazP}jW7CfG3jiWe
zh>_O!*9VrqKZS6aZAB9nA4Z%6Zb$j<Gq!(sDV8<dUZcIh^{NT;_AgUHR(Lgk4$c=T
zP~#xb?zFaEi~~48@C06YAub+SjIJPNK$4a?AM|X{B*5Wfg)SSpTYaxs!W2yJ?sm-n
zgLI#&N#@WYa(t@LXxbOtC?t;*Jn$cBOaU&s>!F!JR6D@4A_5!soB)7M93%`!VlJVK
z6vxpKrx_R+Uyy=q8Z;ePXnMmP1B;E$&8S6ba@D=PWHv5P(j!v`g^l7b4Q^qJqkV{c
zYb{YG{@`LqnNQ6-6eV29p*u3rn<wdg_*=13`6M<myp`Y$QuG382iuD*IhlqB(2;}X
zZU9;)jm~D@h{B2dm~ts#_aBaSuSVHHY-#q{vk4x}hx#YMMQA^71;Gpts`PE%%d(OZ
z5_EzUf;Q-#^qoLmZpX>cM8Zrr2_e$t4oP(k%TpW|S@<+`$aeSiI47m_VgP_DQdDVZ
zpiDzqgHSh?J##KNtsMu{M(0zhizrNmGs(vhDGZiekY>iuO(O#epp%}2vlJK4fc%j(
z6fWeQMi0nDBvgs+n5tuJg$2+n%dBS@r8W$D(Y4&+lw8HS#$U`5WjKN)(SvDz4g3&9
zqhjcn_kX#2Ge*x<M)-`m1h509j?Kg7BUj(ZPLSI$9af}wOfob9#E>1L+YA{K<Pa%?
zL}aI;ElPF<e84c=_VSfykqk945&8z8iqZ-#>@sTbqEbNDE$;U1Vqt&KQ~&B~JWBBg
zNJ3M|1S~+3fx5KR!Nh%uyA70L;Blx#5&fwcB})Vli{;HC@C^tMqOlx8p>SQK`;8qX
zlK%-U30c4LAA^I%@ZAa~t}{A0XmgfzzOO+ZOd8k(jM8u#X5=vW7d|mGF@E!%QVHHL
zIQ2wRMvaGP8|<+_l7BX}52@=z^AR8b4Ir4(w9>lM|Ae&toglMvTFl9UQUo|Wrf=2Z
zj74xFk(^m*Ac`h|LF*7!h+ql;PD<ILzUwDOCmyHl^g{HC5WT|ZEg=Mm29er&dNwlO
zuZa91W!b52a+KF%k(kNGm<lz5P!Xd9Jcp_Di_8BD2}wDUZN%Wfx&h!|;(u^|AMUyx
znTGZ^g1fO~03gJ(@TS2mY%8E}t;2_Z1kA33z!pVIg8Jdna00WvRAQGOy14y6A_wEA
z-Jn$jGoiMG(M@D%SIo7p6(bNuq<Di~O&od-!#U_VQ|*o3fG8d|?bsjGPqb430h0Mz
zJ4xmM|9e$aV|{Il)FlZpu<3fHJ_Wj1CgDq9$Utk1;xiI;=Ub=scw2Pe(*z8`NaHwM
zMJQ85!poA(lW*_Gh!hR7lA;j4L_1DgSvhb*6lLU5LXHGPTt|-s#E4+u1^9nRKnIbv
zHTmI(BE<Ka`g(@XUKGp-2w3v9v?rl>aJsV#-Qz6)mnk+Nq-;)nqaktbDQFf+&yS=j
z!w24ypgSCRCDJg#Pef%s`T6}jx$4kZjV(C;+>UFd-O8Tph8{oB#gsEpIfaQ%65zoA
z0TQRD$QNh~4~mH`HL-l=Z)iV|5epzfay$V#n+5}Hnj9gLGg(}14c=z^iU<p6#4IS+
z9-N|>yrg5CXgS6KO5|Hyh__a+Oy)}{$Js$utB)}ql)ivNP(hRY96IHUj?qwT?^rms
zjQ{aQEgy>(FY<2UQz9XR-65M(w5#bt<yU!JY%>n4UQb_IU~lGUc6o#jG}*V{s+7fu
zRg(E5YQmwrL6ST`A5liZ-;Xpk;FjT6J|>Vb{Z!O%i+<b6Y?LivzsG{~fIAo$6;)k8
zEZ~pL?ddtBesi9+%DlA}TKEuLUv0g;Gy@F(y!kihq`Pz7>C5K%;qz9*>&601(*hF2
zBcUg1ixdh$`#2d4H(C8QJ-1ErL`0Ka_aH)Z5E|hCzSQx0(M<)B7ja9}%@U({I0?dQ
z`RBlh>-C^S2~f*juxm$>fjGv>#s)e+_wnP$XSfWy^@uql^q@e8aBUc1DB*JLg)gqH
z-AJ-!S_(nY=*;xG>xNuAWiJ%(#u;to$giS-vrjTcsBqBv*&3{=rLc@EORo0f*#~G;
zMZxA57m?M#@Gdk6(4Di@D6ilDe4m`Y;e>|JRbiVe_IXYK*YIZX=+Vt;yo*CU|8r;W
z9{Tlbi;Yd3_Ik7Q>hQE{?j2<&lHs~nth>k48EGKjws}V&^M#`7yRmq-Mm94G=@Y9H
z`wpJ^IDsbhszm~sH;!jl)+-~WZ|&=gMDcgET{vaTHrznUBk!wjXQK%23Q#_Dho_Z(
zoO8}ODLne!kP_t^8v)@S8QUZy+OqoEr_r$64~s3jJ%1c|Tm0rR*B<_+H-4*{uI}>w
zx@w+sliI1Bv@TlQ@*;i)eCG5a&zKqacU13}rIaP&+F-f?SQ`z2g$u*X|0=Ro%$A*|
zDS5+wfBctp`B%#SY8AggA%J&`A(sv3c4&PR|CIvzm&i(u{qb9F&reU0$8AU<HZCrX
zx*?DJg7=f&1ek<6L|Ah_X{$Y)-aMpqZV4I|g}5%!R5_UYzNQU+JUS$ZTk{<Mkp+C!
zEl$^IIF95)zzKQpH`>EQ|InIT8XI<IPo%)Rc;s4utFm@qu#^1n?p+M2G}l<;bL@eH
z;b&{^Ng6PI{rVMQydVAc$!`ILX@%PDx}jP&Np^Ex(C4gq`w3yMi}xC}%W2vLfB{}T
z<Q%pJYkjN(aHPUuc1Zcmo1+32^H(3P(sg`De5!8;F)m`7{JEcNPX+wH@G2s>PuNu<
zahbt_yHr(~f7EK~(TyELpIum^b)vh8VWjLbDO-q$hu^&A@Ou@BIO$K57(3LiY&vIB
z;~F&)y5lvy1-GYoWxIjMg+ewQd*N#&x)wNylmde(6=7?KhXwdC%}Q%+3DxIl`Ej8%
zL|l2a=IT^YP_$i6qUmRH$v3n6G$dpT?n|(iL?Jq05R0k?C=<?6bgW^U5hDm>m8Oi5
zm&}v&QBwE98Y0asH?Z0M-)`HhCbdoY*9u2+p<Kj*CLjXULI@|HQ;xWYWasZYQjx{v
zriduB`qlS8`86IB^?#3?WEggn{M6Ef%8Y32dzA0Ie^M%$mAM7brRHKm{(7xl;=o^I
z<D+C0#_L3Cvom~M2XiF0s$eLWe5b%nL(qNwRtmC0TP}ZzA~U48nki^sK}xy!%@wUB
z>ZuJ2nx1o9iU!9k;upn9^t{R`13@TY#sS9cOVJeiy1H6Dog_qrPR?mVjv9Xbz!v!_
zFmGGS$2mK*dAW8d?>M=S(K5LTv)|~k5bTY|jnh~6>T_^%+Bdb;>>kkj&Z+*vJgj;e
z`<Jm?Q}qbL`@rYR%3og^{Y>_6dy7@Ng2;MH<NIxIGEszmrqM*p44(^TSK0Hbb6?*Y
zKwOOJ0u%7<o71E5%YS!tMQBuZg(MEvbKSP`>@UvC<e%vy_a4B+s^;C=`Q;lBI?gRO
zVC3cGP!}Jm7Iy@8uN&SLkxQU)#R}NmHGiZmDPwH=R4#Mp<VnsNW3)2@F|UkRz64_e
z$oWsatbNE|`{Nf)&)}wXb8p+}n$e65Zt@0H*HZY51;HhT<Zbh@W9tFZcoz!b{?wJ0
z-zpVNBbdO>AfVdhdtH>}B*g_VK)0YDW^t@XktCo%8WU8E#Cz26$lNF;Uw|8&{xc1{
zB`V{Gx2%lKSOHiJlmVcMwuvSZd0&ClYf7902<u&PJ{rhJakkUcJ7YDo;Wtmo*nl1q
z&Hv)o(N}vv3NF<p?y=>WaFQb#f7s^T&dG@~e2sm5`=Shk0Sp1~FwHiCQv_TZv<*JE
z)FAagF{zb+^coRr_}xw|?xlzCeL+7A$Z1HD{IX??w=Z?l8n_Uh@kb>b{4XN8kjYDg
zm|6s;<q8`bJfC{AImoZC6YIeE9tKNLIOF<pQp9;v*p7XuAvgF}+7ICey)NrNmas>O
z-4!*q`bs6#4$e%VA102Tb+dTVm~*9b+oP~Ao|liioQ{qNxpw02-Ma%(E>PPfsKq>K
zWf_J{;OoJKNIB#&&;YvV<X1z4AvIZ=ZYMu8H2Pd#?fbm>QncpA`<`I^<GB%0eT|z)
zX|AT9UrHWQ^7yTkVu#*nK#?Nx4sH@{*dV{EfrtR<FDu6vs~VrXNML-fOXWT}+rZi%
zedE0XE<>LF{?A_L5B>Dj{q|sN>CVw{=c6~H?0=7MJpcQ&W6q-6ZA<>Jb@kyteak%~
z9ezJ0<)zm-2I|dtDD!=LRs{(eSojfp7m^w&mBfX`p9-BHW7AxJ&j_|#6%FqvKxNc}
z0|r2u2l@zOJJyOWFOscWm#ZR(xle4Zk~r<tsP|<)enQrF&K4o82-g{sIwk*gRGx$6
zhtVn-?F0Vkvd*12dB>q;bf26_L8OV<%gt}%>&JB@`4Ln*9>n_(t$VRe&O9}1d((LD
zg6BR<a-D7XU(Q&1z4WEy&OI^v6qxi%Ds22`H23zorVoZjxBQqm9zWkM8Xz=FKv`KC
z-~4rC%7PYkamLh+4XL<KgNbN7(bOtUY`#!z5Qk@S%a_rJbMMyW+9x8rLRQ#@j2ij>
z!Uu8}u@os5z&&#?LC-gLF3qr@)E5z8*O-2%uP1-#^!5727Mr`7^T0|=HgbIUsisTb
zIPRV4!FQ%{-nT}2qF>?LYHpYN^`xZ+CzcN^#-yU&t9MwdE|k|x#f^@0u^{I{5-D<y
zm}`F*KQs;6AC#~l91-aOq7p@Dn{Tj^`&XH7#h<QuE<9OG0aLP+d-`-U5N#0ty0<F<
z-VYJALnp*I-$w;09LQj-fRqr4CrLSp6DP}26s%q(&?^uxYaWq26g}Q-*`#yqL-tQs
z+1hhsob`D}(k4<5e~=zIu=YfjWzVsKs4pcJQ8I58>ZR268ZIEi#jREZ<|!b^C(t@T
zKL>piXs-V_`o#U4d2NDSTYkPjD7RKnXaW~`4Q$!Bj^$3D3L7PM6Yd)Oc9N(u*cqt)
z-4Wu^N&_Xu_<A<Nw406fy>bII!Ep<dohVvUIXV4z*Wsktflbne>wMk1bzVo0s;Y|4
zi4PUHzT-f>Qf8m6ufW>u4V&uxHGW>R->b`L+}ScWtA1R}31wq5cQ$9VF5~CTUFj-s
zZGSn%HVtKMc-!4#T3MezG~{Wm;Xh9*>Vwjg{->sfb}bw=;L`#U{xzBtW!w1U2#BQU
zV^PWlI^#QMwe{2e^(PPdO5C8f?<feMFOA7_5<S7tqOh9Z<2UxnAbfp|0%Ly~2!TW;
z!LH&H74^|>QnzhDw}Fa&0A9@OD@=f|CSc3az1ne!=j6p^p$CVh#YGS|F>f60=LqhJ
zuz{;G^h?hPSbHq1*^dhYfMALL#Hb~DPq#BN__5WE{41^+*?=mVxi3Dsybe7GH)xw7
z;U-}cY_lWp7Kt1qoQ=pO$TD*<jWtP57b!+iM)}i<kl3|Lv`FROIs|MaGQ0xOTAzu;
zfl`_3B~V?_3ix{1S6X&-**a%JZcras0XYSIZg)Ylb12(w4B~@dfHwYo%L|jA!Z*cm
zJvm!1ZUWtTQuJ(vAu&{JZ{_EInEbr#>sf>_(V%lo%yV>nn17Bu#a+#Oa8RKY>dg<m
zP(+V!E6_e%B3x)w$%eaK!!vG^JimAkBjJq$dKj%(YT^X~$oZDZH~BexRDku;ol{r$
zc5E-}ti52!(`1D&Qm}pe)4)EYVJ5V1NN&(1lfmQ|#N<0Of>jv|aMDQm6y*n53IXvy
z(mYqh4C(`9_==+iDj#+uePG-)V9h)PXaczzhpbzpN0sN)-e*VB77e_+=$gu*vj9MW
zfiuww`B5XA7$b*9veyG)7yOa~r%m!~0O4$bhb{}qlIxWxE0yCy^X3t9g6K!~V^gGc
z34TX7xPtRgPLu?wOaTf;p54(AZ@VuhBHI2RDk0GGp0;LhP5=sUxoDj|pmFd_X4cGr
z-=;j7jGq&ro1e&KQJZ{K5kt5|l><QK%`BhgATMo%7By~LZosF&6P`9SG^8|bW-hRT
z(NuyX>9nvV(k=<N1U!Twl+B9KK8R*GI<1~#oE_bB%<K%DqT8uk2hpIzhaZ@mNzn^N
z(Y1dn#a#*6z_Fb`%b=q}x;en;_vRgbHf-#VE4=9){0l^&s1S$VFsOUl+qPAtInQE#
zM9mwdE_13k407J^{be$lq+bamZa1cPmMMg;nS-`BX88b`le;Ow$GD<c*6d-#M4YpC
zy!s9=5=6yfWXaeOpa_V))LTsN6}b>(;dtObz2NeGh)dF%q2f6SU6Q$=*${!3ruyDE
zO#8F;%i2p8<DpLYG}ukt%F7c0%#L>xjA~uRdD`Und|Ui?TVH4e2Ytp>;<fI$9)^Y$
zk<ZXu4)8`f@LX5cnQ;t&Ba#N}6k_0_GruH#<>a2WW&3-uP;BQsdbDKu@>$5ylzoO}
zz%OC>lPXuM5WVvKB3)n$;DNQaw+EsEW@TmNY+++UGkoFdQ!CE5c%&#@E;Q*XD8$Ql
zTnhmfMl_O#4;zyB1p@SJ8FR7TK+!QoL84_>uQs{4P7miJ@&kGhljTBZ7o<q>M>J@v
zE~;5~Zl&bDkS>2fu!Lw45CVB2WSDbp_kI0&G&4JM3@Kg-Sjup|s+gO(&l9bWVsKs4
zU7A%<>OJPy*geGs_<O=6g5BK@vrI04))KP8WuG&rn6yRSzO~a2efO}($se^7I!1{3
z)bB+Z=c3hqH=P`{z-?29O!6f4%}&>LilWO`JNJTJ(m((FQ;cY?j{}7J>DJx5<w^XR
zu1)X8X|o|w#d{={FjB{@c5|4I8dAp19InFlnq8~4tf`X!=UPcgR$^=yo|r{K8DNmG
zuwV=vW~?%9`{R-1LY@_5j<bMcffezczParc2pDKhW{!f80P5TNuF}<D`GW4pxH%6s
z3OHVKJ*;(|os&xsfZ%cQo;MA_fTSMt;^wt$oN8(X>YaP;8vvkJTy&R(xO4D1R9_3y
z7bN9P4e}WK_lL$jge@^dJQ&?K*10YMJA*B}sqrx)f^nMFw0&~#Zgr1?G75m=Meiew
zR=2gw4c~u)dSXH1t5gLS)J9@rVi?8PTbBpWc|VR{<|sUG(2i(O?}QG_PEdsQXIH~&
zIU*+~N4;Z{SO2Y2uS|d$a8x)|TV@V04w{3;k9U-mIXho1w|S3i60qz19~)yau6@(z
zd+58zAf?bx-?Kdm<-)m5j}clyKI9g8D03s@i(}Qh4yxCn-QcR^ig6%R^5S2|ni@x`
z?~k^t)K}kr@U<C(jTya=f2eCebO3o!+%HdDDZq0}JO3*vn7B1aTG<G(j6n=gqw+es
z?PgZMaP>cqEjs~sX`2t<-F+WwQdPZ!=yii<aBGmG#rfmG+^#-(g`-pBjanxY<RnDi
zVqiTynE|0Iry7P4gfsgjL)-!_*Ia=>+KJVSXT88_Aa4RIlBT}d)p&YREMWAF(sTh0
zPuWc;Opts_Uz}$ls)TLiMm9JY+4*3<!iihyQ6I!P=%n2m^!rh>6(J_%oz82tLxv!;
zBK)Jo7A<FsqhFS0x@%p)O@t1IzPsz&w8D1QZk$K$>sDigfO~WsWnOp<xF_Y_g*NJe
zA!0Euvr=$JD`m;olfOv*qq*7nNDF3!B&B2@dJi(7Q25lzpGVPUaxj)eOycUF;7B6*
zjcFdj=Z0<H{bXW%6t?SE-i_bDU5gZnt@*%T?r4=o3uJ$2PFJh+3BX>Q`{?GOx3-a?
z5u=a#^5vUaK=6t?zi|)0Cjc#C5}xkZ?exAr+j~{n7&c<q_P6Av1jJS5^m;@!!0CzD
z7wE9fEkBHFfqq%|D11@LXzj&^oit&3H{wx3!t?fA{tud^kB9ux20&yA(6+&O-1jQS
zpVA&+{fs{C?4(ifDIa+Gw$Nk!8X|jD+(<E{WcjFG*ZB$5)03Fk${dPY-<vao+^KJM
zQwfHJTFu{p?kjKB3sP4_Q@5&}Ud_3vC}q3%m}x6Vt84$<A(9n^NY~+44LZmiEPQ_&
z;#8p0;Cy;%dH2nuUsvkiTyWFx_e$ItFS5PSjs<UJ!Qrc^3MK^*O*-v;pekyy)U>@!
zR6_C29ZygX5Yb+&VI*R725%jy1#}P=OZlb$55dyV2ntz+18$8}3`M;dXpZoAbgW@y
z7l`#E2c;4Mi~}MdqHu;sJ>Q4v2an(-frNL`BDHU*qz#RM0ZghyU$<Jg&P5brBeTr?
zK#nEeJ9ZG@Zc^;iE*&D-7$jJXp(kLP$DNO+DNLv=nY>3^K5kPcpl|@ewwKQuB)*yZ
zX35wHzRjxhjyC0ztI?hZ->)jZueW!u&_j#VU~q5N>o`;z-}CL9S>kC8JRKA+#%RT5
zZ@q?|PoMVxn+2u*i`sEI-yS3OqKuS1-PF=zS@#E#1;U9+O$Trv#Wh@yj<!GN!+S_V
zH0RIA>miTD@nIapE!v#%sa)yqqoD9fOIOBi*Z;f0Rxg{)u{VXiZ-FQJ>0JUt2t|7Y
zJ&zp8HhFJzJ~Set3B?U!5u^OS)YQ~_-qR|F)YJP$nS%gp`c7o0Jj%#;-7X=L-o0jW
z@bN``Lw&tc>yUxEg~bi+Kqq8kO1||m7Rsz4Exm;P*ffJWJBR9RyZ5LCn7#LLZZLPT
zinDyLD=8_FDv{!8vz#=Is$<WgUTHP3cjNeQ2NstOZmCk`r-MkhIP+W_pZ%UM&#;1R
zQe4CfSZ&ofUDedo9DocL>A9ci?W5^U{0>IyY3AN-@qKu2&JFlgEX*u90Hsse`^O+t
z6t>;(^wi+*1uG0C51>dR?@Ew@XgU)BM47E!no=g%_LyMU3fUX7c*blmU-<R1AT%f_
zFe~f9w-G{LFLqojEh`IC9S{-~J@cvZ%GK+CVifGpcd-_M5a8lov{P3|$St?j9z<2F
zu<4EIS9Z3mfmO{|t^(mit?PEIYaXV2J9f~>={?4<$`MvyY_jd>=+N%zL;Z_1Hq)7#
zi%V$X5L%Yb=c)vc1-e0=5)(oZChrk9d9w-@&L2V(J#^_mmpNDVl6Z`i)bmv?YHDgV
z?rM--p8JfEo_YZ`ql?$AYgpBb9_Q(Z^+X}INOvzs?-PaU+-Gd*?q*G&(}f?t9opsS
zJ~}cI>c_M2w0q~rj|2~uUcYf;jZ`o3!9gIJ+ox^QG55nPz0&e>9u*g}`sN||IRRZm
zacD1EA*#;3x!$HdTd5CezE)o!hMOAWUhn#*1n+b)<Y$b*u(4}9F)#+9r56?lkY~oV
z&prRsckRCPr6w&y>jU3<W2h7|Vb1@?nZd3axAN*WUi_n1^Re;1Nt{4s6`Ps)ufgjk
z)1p7nP7=!M1vAwLQ;Ur1OZM9LpEUj1&bgsdol>Naf*2Q|=^H2>508%CM)F1SaA=%;
z?s^rHvK?E+3mV04n1ey<(-s0n)Pp76><uFyFVH+0u6gHkyBaY!!88B|Pm64z;jHb{
zkQfnKBDH3Xl93%d8d}=dyS{z-k}j-P)!b}u(5>J4{lJcvf4AF6$A~Ogu)0wY&^NL`
z+gIL%@bCLEmLwKh?T!0|5}D@C&d%it(~Y6>aPMA)gvr`?4ZfWBNAq0eb~|}|B)ut+
zvoc4)gVN4obphA}Zo;bbKOwuum(@GZ!mCiidK`+TDCuloc@*FY0Fufay8(KVbV6wl
z;7R5+n1k&`GTInG3iW#uQaVM8D()5vEs0ikmq0=1{<Ur;1apP`)8>?)&3ax1z9-Q*
zAazW-dB}__w3eyp62-R@#AdMv-w%oJk#n`P>pI&>YA5Gp>#BcCk6arX928KjxAh-<
ziAkf1?)>xK=<#j%=uL+e#2=rE!BhyURtzGzKGD1Zb@5zW0rHQkFVE?4`rpZ<>C_p6
zpoO8mnm;jq4E<K)Gtu_R8((9O>l+)h1KI)OJ3ESL5ELidxt+gSb8$)=Q~rR@{HJbW
zMO29-{{bouBO*Y%Ra~@AL$0<_sFYMn(dH%I13b}&fB%j+T=BQ0;Fzk_gF<yB2B;^E
zO^>$C-W<1SL^*S}{GA<*v0hD3(~!oJJqn67W>FK)GF9mE3dCr&&tzl63n%&#CU#^U
z<sB1H-eRC2SR<CHtS&10?N$hrvWP7Df4zv`M(3vKI`D@=dV&Xi7F6><2o_-o>n)@H
zfyW?b1u#JyfFLN2wmD!B?ucv5OuT}r27&*#LPz$Tj${OQ6hKVLp$f$#c0Y^EBV{5J
zlh`nZHc5C6GtFC1F*`9@3i1C-?!bJw5T&KWUsmxA6GTYXQA@3ol8=N`96BvnC2|)*
zW$3wYYRRUmupLDY!p?vYHgHl_4#No;c7e?i7w{=vQrIT0ENTdp1{!<!iee7=b}<Pr
zf~z7hb&K2J!MS#9V{F;71yb*r{1Rf)rH_s*KLo%QB^;WFXQAc}e0kPZri?h6MgRNa
zx{30k+eBkud_x2U&~yffJe<yK_kTuhv<t0c41*C6gbp5eL=<z)?7mI)ik`6>GLoK4
z@^4kP0EGfW!l4+6q5){kr$<#eftW*CwtzrKm)qYrCOGx3Pi7_PqOxXNE-7DJR3u~n
zS`w%h*iT?*5x$Rijxqz6^%g6uTepjuTUA2j->FBIK-*~F`q&pqARnSGN6vg^sIM_=
z&fT3i%aBH)LTas$`THv+=WYG#j{B=T44V^u8IasC>q45DsM~0;80!Q^J!0?)EYjBC
z?aW=R@Of&HB*)c$^=^24U(Qu**f(3UIrdb>Y=ddDcz3b!8MtkJ>dSkUudvXgb)i1r
z*o#}rQbhr+U5Op*!`h#B$s`3<h_eTV1ZGr7O?MG|emQ${lY-8LYoCU;wjBEKK(8_W
zj-E(wwZ|uiqX#c{UfZB2?wMa*oS++*CtK{VFIbbY|DOjAZ#dxDgM(iRmLieCOy9&z
z9=p1<zY;`CdDVH1FX;%zwm#QY=XGm4H7^cyC>S4|NJtP0x-q$YaY3ajlie-;i}PRc
z>i-PvB%jrJd)=m?uhcp;3gyw0z_CEqSY4LPln{|nRr@R1zhW9}VG}mW56>(8-Z%QI
zt7Zn=K1!(qPcW1|3=UV>>c9WO|F4YBc3y_zd_L{??=qQBj{ek0wRl3!01PJqPA|_3
zoqU4M)qmCOW(OHYYn`)yeP=d?$bw*E&rz9~nQ2?c-0Kn&a#J5FUU(@Q7N+-({2uCy
zglJhThDeiKA|%Wkk{e=M&F+25GB;}e-q>6I=lV-%(Jgj`W8`_sZoizL2~8g!Gh6Oc
z*6r#B+;aEb(b_DfaXpef+a*|<_s&TP9)D>C4IT@%^~EdYBy}=e3Jb0dNw1Zk5mZ(r
z!)X`8rdNBb`{=2gm#!$feh`HcmD{6c%AOON<KsWYW+#6+U_RuYQfQ@D`BL$)f@N@^
zWu!)PYEQ-?-wp4T3_L$OYV^x=wq1Vn*)L~5*HYze$)ZI;ri;F)#`cznd0&5#Z7{y^
z_05_O+3~(xGW49YE5YFezG3^z2k?vj&#6}~?6<8YC#Ts3eAww|ro2+%N=PtWH|RNd
zS0&Xpha)~_lNHZUp{vKkR*%Fv8;VcGF+7Z%qY90kPvkeA^!@Bte9B4jlv~!jW#>QD
zGzPB-JG(!(=mlqj^n?ZMhstqj);#t;)NOjzI&I|Gopt%#`VT_l?4CpzjL&z77H6;T
z*8FsJ<Y<Du$n@hZ8y4{pArHRQk>$2On_`OU-PD$t%wpMRvIUGlY%V~v99Lf$eex)X
z$Ev+~uwnRsb9G|g&Gg2t3JqUg#m->kS;Tp>IvDs7r>HV7Y1$#wtnm9Qa<*OkbMPTf
z{&q7=TB5?3#zKR*xXj(qiXn~ZIrBMAafEOd2Vlt2YeWChC-}cKr3R@Om!9>F33F5(
z^WGTUDYW%yl9W>Cy|sb^cXskFc^E~ZK)@7!P;uKm-Y$cbI0el!t-DZm3HuckZ13Xl
zT~>K8yxMbv!A<=?_uq>ZbIsUme#zCXh1<d<Wr0+VJSQ}=xCq?Zxih%9jU|Ai&{DJD
zNR<CSpY;q42@&i$avKl|9DaP7vzmHjIA<K+{4_7R)AdI1{I@}mqAkxZZiS$5875@u
z{9$U`;>pp{JYoYF%>FAI#mc8+rIl`oQFhXP2greLYap}*5e#U;PwRo0JUcu06-dab
zdVFPth*xm2zIBL6CV%Sp)j4@`A`hPnU2St=RO}CO=rrXL_DS4ZI6omFGw*6bLVaH0
zCB0aeH$C!oX~}JVTQU~y8ei8pG|I=$+IlcoignTFEQKbkl$NtX25pyXMx<XaKJGYZ
zoa3S4o2!0a(s8(_wJp;zVSD7Az$Z%G)7f*)`&*Jn{yXtC;oFO~N-M(nf2r>A?CEZN
zxve@qWi+&2_C>dG^RGpbv*`7KZl4AUB7p>kDT|Q_lyXl!Fu$nqH69^{@JlUy!K*5}
z-sj3gado=BpzPtUUC;q5<5<jeBtCYD_ciD06Ek@p3Ofjw#e6<=`V-f^w|j~W&7t;o
z=*ji&d|ky;vgd*n&n20e-!pCgZkKAymUvP5<<gIdspW}HYjT?3G)L}hw)?h3q&)FO
zzHRg6Ln?m04;ucA(K`WV#FN@^8Vwpcc0N<KwslX}Jnq+^!E>4lCRG9CLsqbU>uM?R
zuaL<&$R2Gc70fB<+qt`K?ibap<{SK5oi+cn3p&=xAHruCzk&m&WuHk<vrDAMA#Cz{
z{8JH7!r;<^(z~d$0mQXpyn=+>G5<_C2jZdex^RBM6APhri39i8Vl6FJobn|v%P-96
zt{Q2`y;9hrY@J>$b`t!?aO6a~KjW)kzcT%JfMN6(H)We<YBnTH`3UNtxtG9E(88mw
zKiswwk9dYPn8+Fu5@Bk8Rpt0OgQJ}{R6W<#IS=JD=0A@c<PRom$uQTcpJ5G-a`wT`
zFdrk`HYnOh4H-(<&Z)oO_G$BlcId3&SndK1X@!RVMdNk2o4X!=92!lwW-v8qOU0&E
zp(5$|J`ctpxx~#ZY5X80tO%-7kst9x-XeJOZ|LJI4T!hzAMs&f{_oP={W6)NE05zx
z$EV^2FQPs#tI>WX{+Rf}zM&aRn?^zDyp-a;Ax%TWd@&G>N>@(($}`6qe_kta9pB8t
zyeuj*M7ybZbltOR$hKH!@|XPYheH-K{>V$i!8;&s1uyI3dNBDVRQn7Ttvp^#Y9ask
zAVs2#%ObOIFD>rD)r_ydfVm>=VpH3^IH9VnP=>QBk6T}Gq>%XrqKd1gw;q$j%eeN}
z%<bZ4EG=8iz@+9{I&<_tZg|QLPu*aVO}z|fP>g#OT})+Cyx0`N9Nh9FH~xPYjM4*P
zD<%HC6nW<RO>ebkm=H<!e@nVloPNjT6Igt*W=#Hj|Eaqs<au$`ZXblZycA{^V?~g1
z6(&LS|JY7i&RyIbI79Dc^!yflSwA<VPlD?c5`b=gWIoHEsk0+EBgoqL5aZys_4t}^
zUwJgc6O@-VL_%#1?WlD=p3Kyi5{G5datp9_lY7J14Pu_wqu4X;e+^;dSc`aCMR+E+
z+`J73=yjDY|KIx*iWxZl_gSzv5KRB?IfV~JnXp9<gflrk=?9oQLI2OxVvDn?x5{LO
zhop4xGQ4DsPu1s~A2iLp!M2%`PR?J$2`n*w3LpFbzK-Kjm!d!OW{Nr}`MF0ymOAb&
zvsLha%OtK`;?!Yd#MpkGnS`v^!jy*=oS0f?VIx~5ZpKn}W~afX{LewEkV@B1J(y&U
z>V;T_kG!-xt9rnS?*$C{4o|H~7qDGyqMVuAUliAv8O^ECvBzyGdoVklM6RiIREgM+
ze}F%NGp^k}?NBMhN#pC};{iK0_{hm(w!nB@V$I;_7<p&-Y!btzLNF`$h_3!^dl4nq
z4`)bA6hkn_i`zDX5-#iXHqCh5YRO`mepu_aVt5Abhh25|1t8eacJG^+p`j0e&bK-4
zOJHlTC2~<XMtnOUmT@9(g|Iw>^XO3T!b`SqW==*F|37<}5<L3T`B<j~S*J;4^4W2U
z-4F>n4k|q~XY$HCwDq-s9$`}ERqmk!5A15<J~Z9!R;4QU+xy(XYrmH0@TcT{7J1;@
zq_lnY={3bNt%p?yfIMmnYVY-UWB79}1D}TeDJbTj@zY@vd56CQ;>y*&zrSW4J1hAh
zcx_ljoafxm8;QJ(goH4_L*l&bb@mg1;<w<+O6E0jZ$1v5Pw9ROF*+y+3mrkCcoqmL
z`GmU_&a{Jnp|13YO3dyMuNp%TNDq;nWvsHrE#bb9aYNPP`O)iNa9tP_>FU+?liCB6
zB~CT<lp*TZWI67E|C%kzZ(;(`nq#I2O}ftaJMVD+KgaHtuwB{1vR%>kMywb}&Nb>j
z9mj5RneupK6gC(G65izF_2d*sUWzTOt{_EUuhLWQ@(?}X4<Su50xNjxA-4q`&IZ7H
z9a_Jy1uv<T1M7P{hg*KyEhH{357&RZviXmn8i_+Z?HH)DZ#gg7cG=9uI?6jjL62H)
z7~A-k*3BNsqH`X$W!NyL5A2SE@Z15m{#Go-0)@8<O`||8$h!+`2%9LZ&_*!g79=e^
zCs$zAaWGe0XLr_Ee)izT1wqH5(W4uqKr-T+0mR6{6^dHME)Q){oe3*B2@_oyJO~@i
zK|o}?W>c1kV$hH+&f4I@sldwjwb&HlEK$~gG5yo=h}th_ydD9nx4{-CrJHdGzJool
z^3@pu&E?^v9X)dx51ZNsN8;)d%2HFRy=AoZk^aQh#l1zs2Ej<YKaWO>%lnbgZOt|8
zyqWjsx)=@)^&AQG$HUsfAr^Vcp^e$!C+;PLVV&Wf#fqf@#9EG%!HKl&>Wg}D=X+~o
z;wguV0&YZm514|{qx^>MQVgR}yOyX%c=JX-|65g=1E5^i`};mBzey38G^IeT%$WA_
zjs&xPSgJM^7)o&0v|HLHvFe}~Nh1Z{W%X)zf!v2)o(*76$Sp~u0UrSsghIT2n*Di_
zt%pMaHx)08I(9#B#%b=3H4L{z!VdnA;J}eU@e-$sir;XKpvfHa!pZuug_jlOXu={h
zkMVPov-g;L2q6J-S&el}NFXu29d`xDz6ZHoe@CQR9M%B;1%4+a)4X6`$R6_sYHqT8
ziX8{YyGXwv^gv8}8^$8a(DuZxHR#F1B}{N`f?*<d)0Q*B#}{h`84R}@;@FS(2dmL>
zq9G>>|1l5Fg{p!WKuPTT_x>$eW+1=rIu%IiI!jGAV6a_~oQ<77XC%sd(+M`NBKS2h
zg~74LZn=A@jY&}n8sG^Z8kez8k6Gs$fv?hFKiZnh|7BuD*zQh8DV^OH%>>pnjG?=H
zW$WXG+L7YyV9_LfSKXIC&N_V>4bf<VctkE@0GL|QPNc{%{^PD1EEqD?gT@qt?S!=X
z(1uDJ>QEHat4%}cY?^Ea1uif~3cG(w(CD(-xYs2X1721-`yiW<pNSh#MKS;!9x9kH
z7kj~zIQ5jn1tfVwSH0btOX7jz1`B?#44V!I7K9)syU}N0COoYJF+xFH&7UGT2Zibz
zcslHa*@#4mGv;WKhqn|5B7|)8@)RXj^^k5y4wf}n=Ft2o$}9yvO?1pC)m1<TKA~J+
z@IQ-`Qz_lU7=S`ht%=uLu=j9{_Tq*)Z~SqVYrr5;9TGz{!+R&;B-B-QP1&mv4EVmk
zCD|+dK<j`P;ZU#{QjU9STB|^}54>xT$<df1juR9N`1G$9KjDr}RRh3H@DF6c+~4iL
z(ndmwq$rQ=LlbzI`S1-tv<J}~`8g))X!DWJz=y{w7-(@&+_|rRUYNt6_UvN)A99zb
zxBxP{Oidfb$A<UTDOGW@P9Fx{mgJNxGR{~R2c1gZacTd29GESDD*CZMX*3Um!COl#
zMV05_fKh@u1w$agJb7*1h4z0X{Jo^ur{hMFL2-fs*qI2m7|3FIp6CIO71f-*Ysj+^
zJ<l$LiL6mvtzKX6-|dGg*(_WLphbFh#RkWHa4~O4Kh(K{T&7saejKta`DHAL2*H{`
zoa)$J2&^=jix^OM^nK?bS?kesAK)6+f)>|}L+2$)Pz=nd5}Jjx9Q(RQUoF*mj^9go
zv(i7r*sH;#@aoRuC6OBf5rftB1(}nK=oBTDE9w=~v>mK==E3kFxS=~MUlhj=qeyO8
zd+YZ(?m1|JcK`h2P>Ej+p*M1Hp<4#sk;0xM9c0Ok3+I--NXp~y+d+@H_54WN@5oh^
zLce$3d%Dfg@G+$3hM`Nv4NnO=BQyv`?M{f&qq{OMAwfe>8=g~~!ZIxA!YiODH3JKB
zY{-SgONTLD#LLclt6No2TODgF3zWzq*hP_Z1~v~FkfuSM3&!54W{PFRb@)-+*Cg&)
z3jzl>I#ZEff9`-i2E9{sJ*?sXLKAB^<OBHSWG2p&8?LhaBdP3xxlr6Ws9vjp+e};k
zRi1|<g&5R<{-@6B*D5$QKA6q3X;}QQ`BtFV0pC@Y;6edwVqZvH2<C{|v1L-yny*;;
zg`$;^`f_X-!=yRCza~y&(Th<iMDwJ`LI4;CDc~V&!RVZ%g=Fns-Ev61t@7X-;zn$@
zDj=<ycy#{>zr#o^HjlWalUfUq_oR-h>+7*n<WhwTBvGkO4}wA%Fbt+iO7w7PP?wo6
z>(87jz0R7<FKT0~G}6+4u<5N5^Dbj~-nr--02dG;@Qi6lp<siHG8de{7P1n-JO7$o
z^e?UkoNj_qKvBYi4!qf8X}BQBJ|+m$R3!HG%3~oKiSDO1Kg0Wv4PHbFLquJ_Sv~94
z<1pP?1(Y>FBBAv{LSUFqb~wR#;@w%KfsGGcBau{*Ej?!C-ULMlS+`=PNIi_$a=@E$
zS(G2$z{}o>Us{UW^9&foW|~L0pJxSjlOV;?iiumg(H0&1mVR`0!IJ$(>lVQr5g}*A
zs#PU4xlQ8G5SYUS42Hc>HY|ZlF=MdTcH$Y(J{0$hGA5!7{bAM`0LL&yCYrdjfmwvI
z?{5WE%ULPn?8NpZ>kO{}8y}4G<=~)ze0RY7KyA(Kt?qa(?AP}g?K6D2r@`YJuxB(C
z(>|xxDRAL?;N_xv53&{}bP>l-*5$pj?u`v937UgaVS$xv9&sWOF2_tNvW|XC;I<<F
zOa8ueCSC*`4+y1|1Kw%-d33Q3Kae<3i`ej2;DsTBoI%6l@NG0}b8UWHR>)cW6i7YG
z#i45G-<d^fCyFH|I4pdKo8^rK5AIv=MZwEpL2$%{mz5WRgy(rKXznJJP)SP8zi%%B
z&9>8cI_|g?acC0ee9NdzXf8ndLB`@pKadd9C_nx+noWs@qc1O|bl3kxVG%c6@@xUK
zBf!r&LpN;R=inxmBx@NV?ZPKRALiWpqdAPkb&qkXT;{<qAhEzF31>qE@UD@o^A5N}
z!I5p+#XTq2Smr-&{D}|*vl~Q^eI{ivu425HgM7SSZDLT3lp{gACQ@&~SdS}31F{0e
zmYIc4ULh8w?O<*{mdy6`9UZKC%EQ6aGZAN~lf8N5P`=N|cZbAVNLE4mpdG4n{?*-`
zV$uq7(w#(61!V)5xMSBVS^{Fa;0wB;D~d=!p^5sQkd$l`?=%&FHFoR7OLycGcFlQ;
z<VJ^gI-gi3SuWzdPoYW|8S=aOcH|}z+FtIAr^66s6WVz6$5Tk%rQ<vMKqWFt);37U
z%^BWqWI+$?2uCIuojK$L3&{b9W{qxaJch|Q(g!SnyHlcQwG1czhvB{K1qyb&WbuV+
zN1(-Lkc+AVG9b-Z)q&L{$wp|MJ-%l-2qq^fzruI{DJ;n|B0NhabqtLTAeUGO*@i;&
zv&obd+sz8HceJX^I}fU)6tmDL3?cFnCedd=WtZimKVO_@-v?5(!WXo9vgKXR5t?Gg
z^-NL+l%}(^jSgTcImz!ailUsZa%?9UT+<{f*@-%uAcm#ET9L%M&?VZ6KtAo_L^$HI
z2y#9=0Pci)>3PjbpZgk8Zz^8<4mphO(YjSJ@g~;Gw&tQq8@7{)PDw~?=l#QyGO)db
z#S|wG+hFs^tQNF$7YBS#xGkNMGaYff908fA(n)3yxk01&r(!J1IyR--Xob<uL(0!H
z&d}x{9Ybsjq?%;iX}Z{8zsth~(tA=^cP*-eL1*L9s*V9G$g&{)$2xjpe%h^kp{F?F
z?|Dh!HfVbAk*!}4k5~lBqO=yCpy6Sf{cUJcQBu%3Pw+I=I}}&_#zzj&gmc08&s5En
z)C4Iq^TTyjDM!{vE5fGV+R%Cobs4F}As->JP;$~jTP?{$^~kpnjLbjn8|uL2PNT3C
zuZ-^Xo)J0}@ockr04O>gVbUp2$?gu(M)Uj~4#QBFhfQDAjy&4vH1Vb8_AI2*)<188
zX<)3m^lH@@3R~iQz;CSVs%w5I7!&YL=*o)#s7|RAsWrWKSw~D{jq{SEnMzNJAzQde
z(6_-GwnixG9~~nFN5BT59EQPu1!BZtgjxbzAFvaGaNOtPx=7{iI`Xw{HtxJ)q*HKI
zp?ZRFU*vZYjf(SFr(vgQL6`%1F*m9wKn#svdGb;^M*9lNspvt9r(pk@m9*#F-@s21
zV}*G{Lh5>#xtx?CnPM&Y9^?>hjocYFZsZ6Eo;o5Zv0Tg=p18K;*UO}R9t66b?3c8H
z0|tw|BBSZR(Q!-~TE~oeSrr`XIZ0s}1CGwCK~*kCm28)6o4@9&<>qP6Eq9Lo?B0{u
zu_7$cZ|rv)SS9ew8HOK5e^bl2((?lXG1y+GGQ`>SlesB4b}=zWEv<_I{Ia6|804jx
z9lEm=Z=7b+R_)E<#ikubZkN7@I2{NI<nTQ_=7#}iHK!;Oc%>t^nzmx1-{)h+IqS%1
zw9Y6qipvg*3!x;CV<#}1h7|Lf%mX_q8lcVsR-Ew4?}CZ>C#GjlN=`e6<I8`5fgH>w
zyN}yYi^NGI2Y6g5$vr?Y0pj2)mp%A#8_kR0FNvo4{$%v=&9`<mJb=)K(B}zVRFoJ1
z55vUIEnKN!u(?;r)XH!o;YyT}h7(@G#hK4jO&|yF2FhD3GFGIKo|j@3g|yWvd2cZH
zY{RYvVoQ2D5~{(!ZGt58QkEdLf_<(IiRJ_!zW)wR=o>0JP2lQG{~g)VSB|Y?vb0KJ
z$W@pPSU0h<6VG$6m{{-v;|ug{h`k~5)e{@9tM+cBP};Ol{slPpTVX1bhCF)MXA?<a
z7)99&vr*lPp$3fx4|>61=$6c!w|7^p9x`UsT_hJsWhQ>29lMiGunatESM2QxEC2fX
zzd7)x)wka9&#lLi#0^A#@j$A47MF;@r#YOsQoE2zkq0hduxS>l^C+StDUn6W@Ay(5
z&~N8E&XKC;hcB!MvwN5E-<!z3yzZ2<V>x)WMruZeij1+h8y>c<Js(vmg~Gt-Q{t-1
ztQ(`d&rAO3ZC<c>#4%B&`k>~$z0uEPz#RWSbQ=|)vHG&kDko9S;%)b2{%!D}!h^O4
zk?hrZNnu+5_7;=ws5g;#-je{94&3xL`Y&V7iV4IDSjS=x0{1MbyCYl#Z|tSY3U}Vt
zItw^jH&`4?F1qFq0~jP6CE)iZ$1*feMKfl`nkcF|)UA;I0elXX-BSWK^lGhy*$|~P
zve6ORd+^y(7Avk3^Vhz2@hoYm<2;GQiRP%J8XNm(^CTQda^wjo*#Lw<Jr`-k+5PkJ
z7JN^2zGq~%k<>EUJ}KSEOtmXuDs9{r0~!a%;(zZ{rJ-Ev$ofzwsw%H6D6grF2ur0Q
zH2_WYU5y0l<0qn~J%0D{YvYPA39|V+#d#ixA*#=$mi;DWAT&v_MlVBVAFDYFXQozU
zZr#2h7Z4Hg4Q8-$ZpIB<oV^n9ot_)QD{Z(g;q$Jwu}WcL(edw&2FAw5u}^RU(mgDo
z#c<&5b|<vxlg-xs`1Vc!_gOJAN!opTy+2*z%PL2Li<C0!Yk6UHPwuad-|@)qZ)})-
zrE;b^?<@3H_c0jt4rZCIF{2|WL#&0joH(P*w>NraSe&6c@k<7#47>wdc{9WRwN0qP
zuoR@M4_1^PHV>}+RD|w3!LCog97bCj)7x#xm$jiNnUylXj+8;EK*Z%u!U&|3A@s|h
zJJdet6f%wE?3>4)d<ZF8c56e`DrxD&w5n^P-!Vq0jWh;{Ji=yju8ba5hVU|6t2M10
z8m$vji4J72w%OZCKX+sLo$jM04F>vxf<i)u#>)VudiJ?+t^I>9WCh^PFd-X#Dkxk`
zzxBr(Sg$T#ujtFnW!Sn6f-Cy|IlfukX`3O{LzFfxmaZvEZ|WiaZs4~uQjyVzRt0fz
z=w<b&U8>H?8wXOWih<K#IS3HD1ZN^XX^4x4nJje>-g3&@)x-Q@!dFQhq8KD_wN!cQ
z&s0zQ{Nv0KtL5w(r#e6M4VgJZX^9NW0=`>iA;){uTj}!fHu4#2;o)l{;xBZMnxE*m
zw*~1m@_TDxHhdnSEQhfT9DxBqN+d8ufg%gr6|#-Gr-mYa@6MIGz-$rxUwcC{d}Y;-
z5g)P`)7-=mW<#Zj(I5lEtfCm?<88PsQusAc037ShorSV45ElqC#8XgLGcZoYwF`Fd
zeZ`%Y9*0sx<3f-GwxcWv@46<blfZv(c3{AqH90KUr^H^yElSN_gL@xPa3FsxMh1ym
z0gywUL*ka9o~B?D4Ms`=FG8wBsyo31LHi9ES+d*e4swEnMpt-9_Ux22j?1XzNvs}s
z!3iJ)5HOM2d#qT7W{q$&qmlN&wPg{0F}pYCp=Oj;Z6IS4k&C87uu~iz*o+FB>MmIe
zLy_EAs|kgX*M=KsBk>H^u3a<j*@`R;BW^eS$469##N=fXBIa+>IWL$k^5FZfYk4Vq
za{N(Ev3zQ40-6k^UCSO>-%Vnm4v6!Tn>}j61#fuKtALl4sq?ymDBu9dQW!H$kZ=4O
zuY3Q9B~rxVdw_zVQ*-m*M;Ou}r4Za`1s_J;ednaSR_ts%b5iBKJEocso4cmTTwpeT
z1|sS0p~W+O_KkVrBEe-!RZci96`vTJspX20NS4y2I8))Ibdoe3P$C)trAx2u4Me_Y
z3$YkNhfb-~R;BH_y#I`dVnSnr@}V`Qd`q@y`Rphhd^&UO0!0Twh<Tkxjs}YB9(H*{
z!&2ZL2$_P~g}C$+48-R%A#c(Pe*4&49yE&Nv<6XC1ux?+uRANYUr)KeYR_1XIzC@i
zg&#sOpg3UMVJ3DY!+f2gA(@3<ymSd@4yV{AQJfCLU%=CeUPnnAHY|ysiu2<3edY1Q
z0v}dM9f9P8FS=oIw{XEP7878NE&6jMHcI^+QksQt#s$9S&Tc6Cn((#AQ0MBP^Yl;t
z<usRu8pyOKowGP?x;BtyXgJ}WT)SQ8<k{~NTWDAU;1Dpxk#J|Oly98e=a<lkAGYK=
zJGB(oAnT_+M`o-jc-i)0ZPDx$Ev^rHd}}?fA0iKsQk5wPJFvJ)X-D{dIeAFMAd}%3
z%`vGtkBy1RJE`IfqsypOWNph@$p;$@dQ!ZXt}-wbLW*(1S{R)GiZS6m@{tR<Q1zph
z*o=8MlzS&JOJ3YDK(xOgYZA<c|8-j6x#Kft8NK~*t)&;`&_-mlRC#0Qhv|9@m7Fj@
z;NoD1WUNBxh)Rr|n@08-G%nIBG<53RE~rfh0%c$^Y7`Pk!tma0SSPcLuN7x|Ui@)a
zbuNw(89RU!MCo)`Tq`bq%>uu%SHMlt&)BRlnE1R8095$)Ce+JN9eIVZJyX{u?9;)P
zDcWRVZe{c9Cax13q<DCRjs|1Py<A?Qh%<`jd8mr4{_7?8gajL$SQe?5n)9nTU;7PS
zZK*q2a9%PN@^E|)?;=;buuQKAMRk%N)twy_eO^b7`Kci_pF1FHvvhbYDZ9`+t#xfd
z)HGY)T7}7;sC&fUV-Ywz<+Gix(8A8d*04KX%XRowVZNI+_H43vA{PJ<8EAF~B_ypC
zaqp03u<4wav3kma724!bp3H@8Bsx0us!QH%D*BLjkZLwvPUwhXfM+&r9u0Gia|1!S
zBzz0E=!tmyuD9t+C@}#TX2>2J5N4<z`Vt+Wx_kQ6N~G71DIcIjvrr_^I0w!uOe)tD
z{5g0SoY<mlq!!>L)AffU2FWgy>Haien$a*W60}am*%8O}osVIGBo--Urr-*oXvi_v
z3FN|2aBO)_-j2)r35*q1Ik-=&DR@Cf?i!Se_)ZAC(=N{XgaGhx5fAq@aMYHF7@6>%
z4Ee0D*pvy%G@U!tk(gA$ZMNgBALcRmYsMY<H%Pkqvfrqd_geF&5jD9*>bj5lUr(#3
z2F@=!_>#CB!&AJp<7ujy1NOR?qH7r3T*q*i*ix@c^*UKslLS1KJNW*nNr$1lFLr_F
z3bqg$8?xwLI^KJSdJg;^G6bntjg?}5AqBz%_#o7byoF7=So)HIp*F<ip4{BI0f`R{
z87x9lkML>(JZ=~&;T&jDV1bcDBax*^W5b~3MutK(yn^*a;s~TO0+ia|BvF8{=my*0
zs|{1ljJ%(~f%FmWr^huv#E|1)uXlSEw20~NXFTwJVnN;-c<VTxPz~oY&32l7>oG^Q
zQ=+x-3h02tJMX#_hof|km~<Hb8^66*Z^<>b*`>9{?myAFGF+?weLQ1@#m&IZO4jG2
zAgT~yKy8XJusNy1X@f}9HLO_XJK}0jUy-^6x;MCbO98!1W5G3+B(rC~=K0B{6<P#I
zDjU$DrDBYTf54`|QRGmmx`Qhj$rxU-7dipl3(nfb1g*^<j<cv<YQY{z$oyXav7Z1x
zi4aVXHwyW+mfW`<?_N{4;X1Y>u3Q$YQ@h)Oo;rVPP{^6scE0Z3??_1<yq{SC-?PzD
zkY+f~W}Qws0*OuHoBlY4$|5;!JG1v=))tR+9pUL~mL7UzNL5)Jddor88h+0fJP!6)
zOm$|wZNsd=IZ~TPTDs8*m1x|=%TAGP^GKFGlIFT5<LG#MA*rkX55ps5mti!gVn@r!
z-sssE{jHiGx>_Fo@_HuD3MLttt!C#&fMYT?CG4;QDo^r-J{B$uWzOQwIJSf|p~4VE
zm4m(oPNJa{`|<AuPg)&rx)m?Q<*OOYZH-Q2ea7!aY;yI$c}b~^<&)ttRC$GzsRE(X
zyu6=tz8oux^TiJpPsGFo4Xs+a^1j?&J6lNB^{Sz#(xZli;EP)NrkzOcL3IWCfHwfM
zdsEHFK{hXWenY5{`YS@?E-z1^PJ~Bu7*Z<&d-1hOK(P8;@}Q}Q+7;6-2HSsroZJ>=
zn2Cs33fR57F1xjDPvU~@S!hfG>P${c`aUSFbKa2aezz*p294f-Y85C}dbk|gA8T>9
z%G>?Lk<Xt#z4`r9x4{GQrTm+2R&zw(R<v~0E9dGx6e!LVWm>^K&<wM2RA|T919=B|
zGFHlU9*dfNaaNFR*_q?m-RL)<-*W&!D!>Dp#FKTts*-wFoSmI@I@uSHK^hAS;8;x~
z-k7FnLv8es5ZMjuWjSY%Ha|$p(mr;Un5^YFz^_*2@IcAA(ReTt+ixy%AUyYvuA0Tg
zCtKV80g8&T1&!VUJRowTLLAa$Ybmbcr?`u|`|iZ8%9*f>G3p){;OSh6fSK1N!6~9p
z@tdD%ubKZ6hm<7ysYWJ50#M>1)qD^gppHwFen>dPVH6~8MLV)qMS!O!OIh~_&P0cj
z2VewB2S6#o(3T5o9>RC)ha>!af#o|&0OztrmJ8^Q1_ozy4M7VRb%hrY<-{Ky&r^92
z4uA`=?OOC8#UwNSd45$xJw#cq*Irp#_0{UoaMSZqe%5Jdpr|7oN8SX!0=L+vy@|fv
zc@fCQOgaxgTY5#I;?>9{X?bmZrg@IOX#2HE-QU?E#~B^&`8@J-o^rq$EM{h8`3bVG
z!xLa1ZH1MGV-`<l5ecXy|GNMwnI(7*QkAF~0ZZUU-bsiZg*|v)OX?KiKA3kzSPN%l
zUj?rZ4lVF65)7dJCr(pH9Q)ncQ23;@Lr9ZaMAW2)WC0t1mE7E1w5?MW4GygGe)2)p
z@?Vq7vPhN_cS{>16?cL64vD1W5d$kf1&Xuhv<A*elFHpRx<GDUh~m62B#MK2zy=$f
zS_>;zlA|lrv<<nx*T-{P`%$M7j*Aq30&=4SVW?jJM^nD~%{Mi4Pf}m`<1Rf#-wmMF
zz1RLARPqmwUqDiYD|;?M$yiwG385fEOo>C0gqVoKPGv12g!rd%z;=LUmg4Eejuar)
zA<~z$z(YtXa>Qp71SM=i_n<LyjyZGM2E5RTBaSxE1TI+bC1+em5Ujn+1K`c&F&bpl
zUm;0Ka%(Aa^&d4upSey%&I9AOR0a~`Q(M0u<2eRqi$%jm8q*Gc*$tpRQH%$PE?Bq7
zIFZUd1?hskg{bRI=>&E^<7WpH7YyIEU=M-{Ie^_2haE*O4wQ<3VoZ%Z0)I)ffie?d
zS7G;FZ1&#u4plkcqIrWp*tWnV7Gj1-9_$x1#a?AQo8k1Lx)nv}2lPjW0vXY+j`0(k
z#}KSah4yH@FUw;${D0A!U7QLFW*IRq`+L1dN0F@8sPBx%4=9x~M}Ksk)yxTs>sBRz
zm`r#)95it~Rmcdih;q;J8+z1YjcS@!RY5Paah{8^hPFOUOOOW_Sl`5jAan(N1kl{k
zZKlZ=1Vd_DfKz>j`jYt0S{P?0oElGTMcrJC*nT~{qXwD}MxBmd-IQg}g*<B=4_4O3
zqY5+zfzdEgn8zTtAnT|<8+}RV{7D3H>T#A~jfqx;xm;_|Uf=>o_9=e_St&(Ft9~=e
zUQ`TVmqs4A;mrnvip?mOWPN}CP`&=IroP}z*w%!+@duYjAY%e~2Dw;J=l$j;-+mNa
zg`W<Zj2!^llLKAZ1&_xjTL~l!v&g4k9rJN)sGf_cT!exOS{=l+?igsr5dz7SbUCpr
z+zV&UJK3B%pMivP&~OoxqEg)yoX^6-e?dn}-Jq!ZCweU19-kup|16=62xjpzr~fQX
zO+`pj0n`cVRJ8a{Y$TsBMDqEVpAJABUEV?cc=QEkm|S*m@f~VIk{{u<Z8uIdw!>pc
z{t&7_$BoQrP=zGEj_iXe1PR$fc$)qa%(@$XzwQ5^BcA;`U8S!dV=~M}9Yh>R!yYvN
zP7gqyrk<ekjzW?}17!oYDGASay;TJIhU}8qLFpK}?mo>;3_b*}iBbOv9<+utKeH=!
zs?lV=j6>Z$ks;R<M~mBNcyB#y-YsRYf7##UX{t$lQk-3sk$rCa4;6mLVjytj7mcKv
zodu-!Nad0_tkXRf#bxF3{Ctu73oC<n>T1Z#EuygEn-bm&#!{lx{5LT{P-xM5dwY{D
z8FFki5J8obu`XhzD<4=XMM{zt#(Qo@vkD1q4m1P04{xgBcQCGOJg6)xq!DO^k3{SN
zSz<k=(SuPpqrd*%yYssB&;g^MC-?{SF`HnX0O2+cAbS}_L~Up#1_G|ES4k%F)b}SR
zQnW*7As)B4x1Viucq}_wjn>OraH#+@dn0zsdi7f+P7p5UUh9ve7R0?U4257D;^R}a
zgs5}f{bYNC2eLkt|7{?!l~S1o!j{MhN$hJ}k_IMgbog;?*WCYiId$uaeF0Oc)A;#V
zjNiQJ1-bQsbJJXkH7U@qX6_BN_|VNnxq@*8YhfhuT@9&Xq_j$rL9ut;`}B;=Ovi8K
zJ2wol;$%Dj?7)p`)K!jtNBuYg4mEpdYM0P-9K&Sq?Mrnupq>p@ge1Au2z@^P^XL=4
ztc@tis7RxW8@U+?XdqsYr)2&`u>8mu75*2LINkL9H6NuMy8Okx=w*`#9jSBRN=M7i
z?aO~xw$x?k*^uUieuud5pF1a*xz1u7+sE)JBGa(5IEK^$!zENA2AUSQ>Kat3kvKI5
z*sQ=9iBk1&NV+O`9@`oC+O&)QD?Xr)byOt+eE}#zxFa1lE54AA)AQzKG~;EH(XL%?
zFU#j?*Ti_P5i~R$0BUV!ITx2-4t}TO`&8t9$mpmK|L4zOH7G=O^-1%IT7cIMD~L*w
z_}86-ZOELIA!-X)haeqlZsEPu52A}jAsrXQp-r7`fj0*a;eB9O)y#U+D=S=VNY+tw
zXVw?c?@B6rqre3gAYE8cwPCqZc?vWuL<=^&5+W{{n#N@H;$iSM=-znnKw+;}+bO#@
z2||a>%aJp)nm6F%F`Qu#TVUw-7^n*$&q@&xwke|oL}L_k<;D$*p8v0`YXOHc-Q%NF
z5)(xx<ksw2N*lS9)FzBet!kdklqP9Z(`A~`O&49Zh@I%73q?6)w$rhTYN*vxQA2D;
zI+BEttjjSh>bR8K{@(91_3X3r^z?j>nfbnVzVE&K|G&%sb$&G14AXCTUB{Vw!`SlX
z8a!~z{#}+>X7{GENYwbFkX1hPz$8ioeHwl@<L4Y6kJP${7nf+V;H?UY6kwqxb!uRw
zhQX~kcCkn@TmqiQIvSuU@O(t{sGf6bODzYi(B7P#;2Oy=${lz=0k%W1c0^eVaNLo$
zH3^k26bd-*OySD*I=4HY6gGNVK%3tIh9`a}sHvg>n%ogrFoA%0qsmSFj(KTf8?8}D
z?oQw)iOugGNb2ezgb_`Nl!gcr;r}9^HwxTy*6@$81!iwBxe<n+XiNyHDM;GNzd|;&
z#-k=f9jR$o)xxdKDd@kb0iWn}rH6#3C?h}Cg;a4YH&d%VAK7K@+-fTnd31d|L`rVn
z#=dfZA_)Tws3s;fQ|`s%q0_RWGy>w2t(UxRg<Opi95n6tp&Th4CTIMAjtwxaXjh9y
z3g+LRf7avJx6SXe<q}rP8$g<n#qV5QoqXLS5f+60LBknLiL>D4t*v+Z6_h98ZdKi{
z=#9v8<<|UPzd;HIxm#YWr3d=lSRK~UyW)x{-PGmSJ&1%7n84f-wIPD((OxV${1p5C
zk-hB*v(BIl9{bDthQ`o^Vl5LqdK3B{aJJc^TR}}kN{jiNI}W;)AlV8j-~YFawJJBz
zFwEVnC*bO?dp0$OqIC<iQij+|h830DPc<EgW!r4$?R4bim@hr&uNq96kW;fxVnU#k
zlPvpDBvHaB>BMwO7JlBVZbZ0Pb`k#~qr>d2_sgz(-?n6zB-cW7ZhWloQ?wbuQ1v;6
zqy3`yLCHg6LMR}#v#NEW!N3q*)xX#UIA0bXJW3giK_q_+)CDj#s5WM#ZD5MGmoywe
z4F5KFNR?}RhiGn7KS66JduK$~+mTI@S!mhmL|8T*@B)%d7q*LjI)wQN@n$LH8YykJ
zGGNywwa}4GREOjiV4pQos0l<mIOIjiQ5Vx^pK5pcTK*vh@^qx&gis_ZV`N!MG=-G>
z7f;KoK1DWTgQyHEeI5eaxnO9Vx47ym*He?<^gKetA<bK?InjP;vk_!kKuJh>5N_qw
zXCkFZ$bIhYNAVO4&LUwhWMqI90qnW!Y3i6b>kx*{l<EEQ%;hbPele}P!O#<A3Z4vk
z3fK=)El>e<3}J^6FfeaIG)49SLpR$GjDi$w2T4!Ldt=E6iWn9|QS?KDx>*{EIUfej
zqa@H>izVC(VM#Mf>~t{uLN<Vii%@ec*S)r|&Bb7^;V@UJ{6y1kX=bJI#v^W4^S`IT
z4bt(jgul+%G=b<>lL7XV$%W`6+{+w^qNESvVb=Nj0!nG-#8d|nTB#1tm*eJxfOrgk
z);Mkud<Eb#?@d9;J%>v<MndR|5u5?2h272zQ4wm2teXwXLWb?!#O5oqlAv%43)E~e
zKh&u~Vk_TsgvG>4DuiwTgubNe&SDsxzpJ9GFDX0K?7Bs<MOoU@-`^xOPXyw0-FGsS
z0_<0QKN$jv0(I$85h|0jR85Xc=;wdYIvVjI$t4*Va*v=!u^{N0tg)XSY)_7<83Aks
z@AMCyIVWT#=5awE8tUt82i_Tx^)GMO1*o;Y{#O=9?2Lx{G*E+#yZwLJOuw>V+9IN%
zo{sb<8Z!M4MhV?`<*Vmn$!xh_Kp8cR$Egu-Pd&|&gi0TbCW_vTqO{AOZ>cQ!s7E!f
zR4|2_2)?u}m?IjXZ6(`boPeJyzLAwh`lb7b&E^7JEGc%ywgq#kD=7?3;WyE|{A^0Y
z6r7K1t$t=JI`1cp7-I_fJ1*J$XAX6xc?EPgmsyXcT2#Pqv@4Tl72YG8zZqz&f1i-)
zAP(k<4p9yu13~sp7;AuHdL326$3j`<Jj#z1J&RJSJ27{qnb1|=N;mLDL#?ZF-NCq)
zBCSCCA5)Dt5IhpzVEa-nd}pA(l>UPJHnKkkq~Q6to&2XM?J#X)1ck1|uPzb85lpLd
zzokq!XoTjAn}MzXMaY7ukZSWH;4Owz#%O`#B5F?gGPM+4TO9FUFt}h(3$kteVU>aR
zk$#k)f@m{bkotc4pdfWbvDZ<WJ>tY~qDA;NOb5Cez{@;GKl;I#)>HPNt{D1W=yOLm
z(0_=eT}f&~kEbq#lGN7?ykQAc*2_X2@RO`HN4kio?n(M$k<>_6+mt+C=Fv~aC?MX2
zc%MGI#qbW=Ks;Fn-Ub}>;=jHJoO^-U^0dpO$j9)AL+VRgY_Yhv?=8%pig*8?OJCv3
zyTu*7w0Ueg{dA1zr(-1_`-JMg4~sH%a%#*}BI1_R8o6gOV-}rl%>LV#NEr=zpWIrT
z{=mvF-=H569LWd)Rv3Ksij>lm>rw4N|8=^J+!V_=BBcCm8#J|fAr=~_MuEMlZv6G)
zmkJw>AzedYvZ2NRC;9Nt8Y%WDK_E)*fUrm*W#Eh`5&F@yA{16?K#d@NVBNqB0(pJ3
zYvK)*PFHR2oV7o|uh*l!@P5d{ar9kK+aG1dv2bqy8GE~99?3qSSDvkqVvcYc1zK`$
zV8zS<GikteRa98o`;N1k(N2*kng(}RUKg?uYeWhC??tYGU%k}~N}5Q>aZtuG?P@Tt
ztvy2iqwnm>sR4xOhMJCvPU5i)q9!9dEztQnAXSLRe9v98UJ|*fX>dhug(zu7wg0*j
z7p2S^hClB;4x{HN?b7-~AB^cx>ZHvuZ&=$C^l^1)&G!=b{KWA59hON#7Gp(wLuf$O
zjc%*)1|05jwr?yO+}Em)TWfFKU&}R64gU0gIX7+o!FUTT!`kZbUh{1USj548k$rdR
zVPOBc&lZ7hg9895AJ`K%R`sC?h;m6G6e3;3qR2I{)N)V%!#hKIy+8HYPAkdMz$$4i
z+gH0kGrRC{y5RC*|EzC3>RM7HJRaL{_1(a)Uo<4Nr?)Gn2P|4s9aSB~w%ict{_t({
z9OGV<IA`}le&8!ZO^s+dY(h%>6J<UUWd`&{Ty!x#eOsj&Vs6^TGwS20LYKglZD?2z
z=7_jQp<=Aq6Qrk^+WJDrS=_&FMcMr1<6(8<d)J|K+eo2J>PPK7`FR|JKp-+B#MmI=
ze?X{Dl)veMtS2elA0PA6FKMi4tW%~7YMcR2k&V<5h&=wKY#{dD`{zF9snS88o~Z+m
zLK`Qhc+85~Sd%SXUb-o2(b?LNmUgR}eIC;?Z}gWkb(Klt!>!1Gy-u43q}tiOD4a3-
zR=d%HZJFQizWF=wE5JOeDFZnaIDq~={@_rPYFSFN)yt03LsDoC5T@33WIgVA>&)9%
zw|!OT-Z=9ZLr8%U?h^@=^W|BhcIYYPts`y*t8H#XF9%Qv04wrYLTMtCAU>sXe(cL8
zezgQd3Szlb$!6PdZG4ebn?wcIw5|!0&K^pzS`&0XMQXwToLyd9EZnFZBQGD5?v%cA
zWp|h)wS02-Qd5#*U|BzZyV)C(I4#YHVFxfcctuYICJVg{dQ2`q&bj;J!F&C>T?Zv+
z%vF2Kn>#$+y5ovUhaOm-=9bwvDEt%;nP=N(8aEsJnoKSo4(T|}WgI_dtsQOj+u%gj
zi>iS9qZVF^Hr^0$GveQCi_-;S$;^LU8O|8X$dDy47z|6VtD6V!R(=rl7Qb$Y*}bwh
zDYUz^MSA3Yaod47hs|vaMqb>d@%SKKo>jx^=p)8@&tgsb(CP#^4PI_+`^of?j4c{P
zADzwrjy|wB_xU7~95krE9rX#u7JB5R=ll1QT_4SV#SbWS^XHRK_;Qwc-gI9d{wEFH
BZj}H4

literal 0
HcmV?d00001

diff --git a/docs/source/_static/img/dynamo/td_stack.png b/docs/source/_static/img/dynamo/td_stack.png
new file mode 100644
index 0000000000000000000000000000000000000000..d20b3250453c503aa77b7967862c1390f05c4c4d
GIT binary patch
literal 308321
zcmeEu_g|CQx3!>A3}Qiw(yc(0BE47XNKF7k69Iuhr1y>jDos?R_a3AZdQ&M%?<I8U
zz4!K>%zI~?dB68_-}wXX598$H4CI{i?6ddUYpwmfRZ)^5fl@)woH;`xC;M3K%o(CR
z@DC-?dGI&6t1KSiUj(*lG7@KU+Gys_oMAX4_gEb6q_;G3aR4!Sa=KpeR^I3hy|2no
zl@Ez6a!MR2;;(KobHJHYB~tEE5izH}HDY@A`u+XaO;#C7P1j%~KSI@}>P`>7oDS+L
zE4yYN>Y3+O#aG48y6bC(+aAYr4=-Z-^;}OqcTGHN>Q7D%S8GVl5}dm#cIIFGQJ|J^
zgCPEkn{EUdArXVunSb*~+!+G;m*@WdS7>>8DO1$&DU$rF72#J-C^CHZ->!zpDE|yW
z{6i1x2&#Xx_&<K_Ik(aa|MrBg{&)`7^YrK!JNLidGSQyZm4A7dL=5li3D3C|?A05f
z{`Hoxu1(PW%LjZ_j3EF*v}d#)@V)0>Z<%5IyYRn@1+Zi>3gWA4KYvmBXWsg^KO<J_
z@A~iJ6D-;LAqB(u$9lHQ>XQHVXS}?w?Eb4z{^P`-J`}5E4A7@Z^ZVC7LqNasuY>tq
z!T(->|M6h|_X7OSvGl(e;Qx~t|F;7C&x!GWr~nKl=lZ@pHPpyP8e<SyFLp*8$A+VX
z3Z?#=U-$!wuY-qO?2rDV4*nqbW~<)&X1CDuvn86>vhh=h;0oX7_^T^X=Sip{MjVD%
zG+BL}Z~r&t!6*@5yxmkINdKdFPX*WJ)|{YF{W)ljFIp3V^WDm#BRW=JAu~w<tQ^o2
z_W!n=5#0gFJdk4OzxlHLb6_<_E?aXGOEzm?FGM%9)eLjJ93F_VX<|+@`uU;lu;A8z
ztOC@xPBMmZbuN9!9IF4gr_V{9j<@@(nuJsv2rke(!<ATCvQ{i{sAp+pGx<NG#~+k8
z;nltU+4xp)ye(1O<}Ky+i!Hljo|Yba6Blf|<QSdXx71SQ>p8MY#yp0EPLDQUeD#5(
z+0A#Q=~cPB7_aqyEos;}&G5G?F!dRn*|1Ah1j~P{)+n;JFI)S`V1coET<@bs8pE9G
z15|gpy?HWB^wJ-zSt8Y{bjpw9Fn;H}jO}mv=1c6(m~OHsygO`|Q@J6}_UXF))?$DA
zu>0;v#gfNjc1hINR~KZ2A0Ga_xQhitLCNo=3^u#-|2|W(9U>juR_+%|1Nr$+GK`x)
zr<wQX%oKGi1uRF<5aADyp|Vz4^w{YBlPIHxz?+65$6MVh7x=)&TZ_!P72lHHlUNQP
zaLTRP;fUrz%a^ap(7PKg^kq}emoCBi&fQmvnS-=?>`zmw7ruDXU5wT)Y!^8s3l5ls
z{f>tZV0jY_{|!P?BOAywc*TFRH`%(-o24$))m9Pjw(&DfCWJ1{tUIm9@Gau=EBsDR
z2tALt!<~0W99uI~lUr=tg~Lh1FvmJh6TYFrs$aAojUD5#<|Yk6)RbUdpAHG{NC&dh
z3OWqWODOS8oa1!eu}FS)<2MjbumVeBj>&zX>H8nG@Fs;;nN6zMaB2SjuK>2^qJ_As
zVTU2KG;#TKb9l({9?ez!vDHR!oA*VS#Jl$~3FSYpf2So&=eqRBw)+XMlz@gxtx3D^
zE|=)(5$8m^Y(ZH~H%rOzb2{g#P+is(<#-W2SHAOq1&i1OKA`#ZHFKc<5ztVvXPYyv
zY5FxDFKWFA%miI`S4OJzm+3*Ynh9RGAPw@uyLYv#HK#Arl;dB<c^sH=+As7Bu(_^{
zf1T;mwhXcxPVfr3T<Wy`+;%CyQCfgUaJtw_zgFgJ?rQa6u!e5g@dk%0J`(@flO{lF
zychmj*&jzEX6jvYdNOo0D|-59u|Jnr^kgr2e?C3FQa|)`qnT}{U*D4(L<Vl%a<MOa
z2HYA>&y)T1evW4)BMzKM#cqd#Xu@mv6h3cJ+=gl9SSX1e+jS>N5!==hTzDNvDM%(l
zFd`W5z7x5>>UnA&%=e4Pb}rF>rcR1gx4dU-b!<#Ag8deH6Wxmm>m(dwPLKIeY-{xl
ztT=%kI_KJQw7*$}5{B-~xhj<VVZMNSdar8&O%Ts-I}_ouy&(11hBdiuretkL?E9y8
zAisBf#ybU|{nAIjWA%?um20hntCfavo+r+QFGmE_i!GiOHqyB|8N`6x;OtEUal)sI
zLWgB!A^4J_tkjKw-6G}S>qd$RlKz)pv_w1-Y}*}mU5(^y<GW)xtGxYq%xyF1rhr|>
zO<@;19L4)~{#hDM(+&pPZx<O;1ax;~!1+V5Ua;uOwQs~TKN;f#vmxOmOW7r(93<2{
zMd;$@F>(o*UZvBxhiV9|3&+xXdqQ!7_?IK)k^XcpOp7xuuYYerjS>L#@>??hyl^)1
z;~;-;NC-85X4ymvx*g1?zu1`iX+}(+;$sUgR&=%dVVVumPOG)|!S*6Gf=TugHcsV7
zI;chb_Qn{EZJc?ov%x)<A8CdlF*LZ=bd*1!EvLCoaLxuHHs;#nv)Cy^&&~Ow<;^z7
zR;1m|ph>iR7%cMI9qA<-3&aJ~P%%2i7Go1D(I;+*D>B1Zjdyl#4ozqRth?*pg#Smq
zI$uh?IrrITzgsmB9E2jitK8G?e(w(dsSrKHlT0B#=5e?j?5Xd0jHO(7TJ7enbkY4}
ze|DyHGPpUDpL1~U92J-8vlp#V*lvOlp$*wWqo1U?SZ&r|{LR(Wckfk<;j>O(Q`711
zs&aMgE;Lat1ec=0UN-m@i2SJ-FU%DTvi<B8p{s|E@gP6mKtvoiik=uajJZ3uzv#sv
zUV!YSXUyPkX*O7pnhENI5wCl=0?p7WLwPmG`!XUI1ZQ%*B-V}(wiR3mF3`K>If`=p
z-U&1w6pnAZQ8rS4t0oL8=OFlWTKMcM;U+lI$THj671B39j7hOKRI&)9JIQ1z)<PfW
zU>7!Aw|h2noHu7iU3uFQF{I!9ukdQ>So<=A@*LwoqXmAcSRZla-mid}25QSKWIzKy
zYuOa3ZNIK#BuJKx&I2ipxB}yVepYVNjNSo2TSV`aS@|qoS1FabMPIW%WLe|P;&}7-
z+I$rNb1>l|Tt@Y0!oX(@CPG5g;99Sy{!K*j{@V8|RZs?fw{@w+sts@;k%uc4xgC1+
zloSugDc5dt7`>SXU#+w`U-d;^5Sl+t51q5X6){gQs@uP1jC1-1&s4iNZD@T4d;!A{
zWb1e?E&AoVD)L|4{W^x_cJt3seP1FU`Lss!QoB=}BSA2B%@QTgU7RbdQcqW^rmQyY
zyse%r9h7w8JNaLg&{qen`j)46nb1F0O+$#v=5{}@<3!_%BoMKU)4gSr!GemXC&#)x
z#?cZ?V7X;S8_lhtoJDtQ3cVcWv)%61WUM;Ra{mey*N@8e25RXrrZlD4s4nb+7BYqv
zfq~068kqdC@^d}!=9Wvwc21r|aWNOqGwaw6p-Uncb1K?soW{NLY5TXusCi#<xvo|@
zj?c74adWDtDaw_%f|zLqcdr#-l0c*+=DE&oE;tBlHlY-#_(8fnn+%QhYi7-yn8mGu
z952YI^DJ$yRLq#ubeLu!^5@@j7&IIoSlcc(+`k_l5rfK`bJd3{f}1Uz?KgbSfFMzQ
zlk;#mbr`H7&2e>f!r}R%#2v70{=M;6K@|43&KX!gq{Uroq|-3~OizA(xK|}=p13<5
zuD>H8dV0`bb6a=kB4f-ri-&qEBPqS$EkHe7%pk<QzIa{D<^F+ga9mi)uuXtjXOb_o
zZn>Se?zZR2CO!U&A=`6dtR`{JX`0*1Aj-HE!dbbADOtI)G;@nrqCL;T2z~EA$fMml
zRpwjD`|j2*YBes_UFEt<7){TJFM0p>3dx3mU2c`y!2kYS#TY*PQ4aMykL@{8JsAT-
z3Xh{y>SDJZSh2jD*9UN>GI_4%?P=_3-cX6<?n(TX{&DxW*b_vK&Y*U!?bGphd<chT
z8}Gt4f#yn(!sSfL48HrtRPz??IUFvR999#ezB)EBr!Q^o;;SOa^xrSh2@M=t4kV6u
z07Q(2Z+m;=vqzE_?nAM~VAn(aYN5XQ5>#&!@y?XzN`uRXs}~N}8|hmCK)w1r#EA!y
zSE*a!uuc=0obp6$?BgrUMkeX_TxW8(gU98-zrcwKJE&98hS<V?0%^gEZkw&V_&kI!
z8e1OTu|bnR?u2mlW@+*wvNX1Rcu|^(wMqbAwg4(}^*Bo<(WigI+gnVB&?Ls%FOt_%
zuj_kjBxhq!mL^+{y73VoK8wT(I_?PPqC>Yq@NfEv04Sp~S7v@qJB$jXzTJ+)g0lc3
zsHhN%9&df@mxtx7${)0K<r~VZ>Z@>nQq<4I>@vl4Cd-5<#^ZGCdl564>2Xba&in>^
z-Zdu&&AfcMcVcbZ_+QMlL{408>iZp|PuW1Tu&96g{+qw{4X~fDuGr~?gOAE0gZP^D
z)HSn_#x}5%ZRQxZX0%)vsnte&4zcD8&`rw|-jPp*{d^urPL{WNElZ30_x2`3o|L>C
zaeF|!PpjV^`|~4XwB%*}yCx)&e1t?~EZ(7aWTL2Fmfo5UcnR>H*-&wI*|-m}A|9}d
z==Gen7%ZqiIvhLQL9($vEvRz?)#b&>(f*7-y~i7uAD?zB{>%^?)_h_v5DNl&$YwZ;
z_UBPKE4#m<(mN1O$no(rKK?6o@s-(u!U+UatMhDZxW?yKS40M49B^hYYL0g}U6--p
z_-nqKtCjy;`qo+PQdx2d(8m-MwS~UA^IkClK;st+J(+UWHdV*3zj3o5i`LvHdaD<|
zUZA%tf7DaH7^bQB2*9uCr}@u)yG3xa%lb2IvF5=73rw@_19<q##6%!WN$wHIu~k_C
z(q2nyxZVQdR!ffxcU+>RKh^OrKy}+E6)xMTO>o~EG~P0`na}rRe$+&K2pu6T_6otM
z&wCjDtx&V#3-#jm6zX?*11bChWTpD>Wp}QAjh2ZAfI1P<K~$0I%EG-dPQTvHB)q!3
zA$(G?T5XQDEwY}JX1h@*#?Va&T@R!c<e4`EU5VW$h*}dD`TSUA?-)?X_vzRaYZ<NI
z0}T3IJECg4Pdj!tQ>u2b(8RSJibvG%TM3tkO0JBE$lsbBt#ZZV(ul591(U4}jT{}b
zHqleJTnS~?z|Sn&F4d5yy-bub1qQ)jJ2R#3hYsqMPU{vCthT>#(;GQZI~3p3oc}ZD
z{PCaOz&b#%b3U(olTqP%IIlU6v9nnzn~eguWoQXxg|<fR8V~nzw!wW!9_KykG;uL<
zGdh>K#1xsfZP(3uN`0<uBz6O#zP6fEvGOhj0~0|RavC;Vnms1p=-hmTPaj5EVgOCI
zrpSGic%qEX4HjG}RXME7&qqd|Owyk+bvS5?_gmLplZgN=EEU!OA;{?ao%kE`<;5hY
zOW|JappbCVp&q=;h4lZGLX6bFUhZT0DgOcQ-#-?sfvtj6Xc`TgCs%~aLVxZmwqLa(
zn@KX;-N$1jODZ08jZ*wvhjk1I5E|^}K#|$ULB}!o0G)gHe0AW{O_oPn-O754WjZw1
z?<=xv>N%&b_RxDCS$6~E@*_NLcL$#qUsmtGnB`%&tlIu;2C~(ZZNVM<<5D6JzV3g8
zo^F8*GtOx_Dul=WemJ(?pmx5=ORJTG>`Eg0M#=jb1%^RX+=1VOL9n>;C<gIA1})R|
zYr!LbMw>Sc9`jDJF?8dR^k08FOc}w0jw%BXD3;BrCqq?_i6O@~^7Lf4CJi8s>RC;x
zD~O$EDf-U<CVl~k7wPP1k;5fUJgC?8*w4jq`_tIG;XhjY{$#o-tjd#8$I83pndS|D
z+>;Tf$)M`+H0VW)*!uuUv`Wv*2FpNtQ4#htQ!cHb#pVVVsH(HD!T2GvD5GOknYF*2
z2kf_suy{6YtYv#}f2pvY$~T$wkwx#dZvG1hdpL13zUK5~syEVj9<3J*%QHF=&{i-I
zir|+UgggXTw?ul^DGY|F+=;`Q_wPu|{Xrr8v0d*ikYnU(9+~`(UHm6P2;6w`8!;PK
zce?VGVI>wac+h0ZN2PZN`1B#z@}g@Co^;VBGfYt)$KQB~hwVa}ic=pmvKs)PL-$lq
z)t<c)=tdJx>v?+YSZFmNk!;oKM?&2)(;8jXByqayd1?ZKXN+SHGmuyevue`o<_W@*
zd_<Y5{p)TeL6+%FvO0n|w+%Tqr%B#4#mMI%Tq&A+$fndIWaEkxQIsG~@^@X*1d48C
z?C?R80H%Jxt{AaUHb4*hp&BorJm__N0qv^Cup}Yr(<;cr;@rXi6u{M(Cpgck*>4X(
zP^4jqa351k9n%eOaQCA@tLZcrO6E0KYMtDFTOsiQf*OuctkOI1RIoRKjpJK~T$a)I
zw<KHA<2|D6aOJbr2MbwR)x0SS-%kFlMPddZp412Q4RWsh4kys*NQe<@fOIQMwV|r*
z3tGhP!(L|^@3%(`s8h17tRbaXftv``pWBHPnn;VMB`;%DwHaA*XzV(~N0XXCDzlXT
z{O~D5IFlz!li4GtoJV$ZDz@KAc<WrAK;L8(yq91G?UPm10;pqC*!~ZeJ?o(7f@tXc
znEtHh65AuvY^fM4zzw%&i_X{kewkLlNnCu0Z~BvZB}&b>=~F&BYHJnrId7cDy)QUq
zilxp*tR&jK?kFP4Je-I>Jxbp{Sj??D)K5$T;hpOKNR*URkldM|iV=jH%&M(iyfRdI
z!8y|^_T*ZRtVh5XowBaEiq0#dM_2dh%BMakR3?OclbF8|3Ei6$3>5l>S%|nCkYv>o
zB69F!j#D>mW6Qmphiu6cP<iT@&*3XxAvu!dg*~v|>8idQT|rKOpoda79e-a=GN6N{
z5dB#B^UVLK`2d(8=cw7AdCm7`g_l^R7VCokcMJC*#dd%qvD{&K;TXl+@8==+MeH4g
zHt4X07q_soY5GIFsGc9Tn$6yJ=P(;ekBRS}?lp>5%JW%Y!Oq^`d=ZFm&gA|32tN?s
zLXLx~C^ctj=8ZJCf}niIv4zWQeKY2?UQbzNrNereWo5*=k+xdBpKV6F*y845YG!$@
zeZQ_rLoltma1O>X^OrK<d%g_KjEIrffJSz*A_QiVQ7^N<F)c~UYdVRC6e}YYhpb;l
z!Ii}$>(H|bKijhVe3gaPU-x6C@DvHAF7>@CV0Nm?Xh0gz*QLJzqag&(!Jl%<eKLr*
z@)SdojL!HaGg59}r4dL%#U;zkZYHS7daR33YXjUt`nY(9J-1m`-LT7I_6$Juf!iB^
zb}`9)u29|xK>0I~{i%<3@qUwsL@lv`{lpv<ZtaQ=Z?s>Q)e|2q6%V~k4C=nN)_JP`
zIKKVhWoo-kc%ItmTTeg(%ydXzA#rC}`b{K`vx?O=j=W>J_V?W49Y~a6tT_PLhTl)q
zr;T@6$gK7Upw*vJs3|NlujP1mY!!Q5)|e?G$=3x!-Uj$g_~kG=eJobZ_*#v_1?yO=
zy|oEF?s%r%9y}8Z`nyS;d`8nYu;I;><h-yCJurE1-kSAc+xEz2){2L$Ie~Pu;CX|V
zZEIaP9a#*47$+%d+=>sT6^v3%mVSCA@l3!<JiFus-;jIu_=psxF4;_8^-wiHXrr#Z
zJY*=f=mNNZT&K&3kDj1tx!AKmy#T_ZAcw>jUEtc(51=sV&(U4PwyMu%IIaJB0cfdp
zOi^Z#-lthHjNX!EWs?w8XJ>hccUtUSoPYTkB1nPdTEK>iSeeK1fs#0L^hX~K=uBpT
za;93UMYJGzSW9%E5T>KCp>fnG{!Eo{<UJJK!O|rv=zZq*q2`jk#N<9*w(_eXHq%Yc
z?aactHi=ZBFiY5bdYLmPphEC`WK_QVB`tDkJBV{U8KwfYUG%We?3;9~A)(>xaHaoS
z&)3BWu(NwH&CS2(tE)dQ5+JUr7@w(0+<69VF*@v!2!+|beyV6&b)V&E1kk8@SxhVu
z=>lYVD#jF$>l}~ZiDCx9?AZmQZktcK!ISBz8NFOir&|d7_If|{08kju;6Xnu>&S6G
z>K5Oh(m>lmZ2s8oacM1j6Ca%D`?W1rFd9%kT32#AbU<U9J!Ez^93q3>J=$!KR$!~@
znU5$AB2$5@bn+Bgopvd(IkSZJ$lp`tUj#^f7v`AB5{YIKvjrr+AVRF``Eim|fFdZp
zX#b!iN8ccdhg#17;cN<*`$q9aEtLZ+qkfo<Cz(Df3H&4$4r9^g*`OJCq$L)g&pYa5
zm;yEB(Pl|;obRf4WeHf=14y<KD#VKTsS~(Gr^+s~>_=80XFTVe!JdjEJs^pd%%<us
zn4e1y8NK&9z5R^u_ai@~+rp;6kUya?XwV;_g!w&Bj%u&Ijf*{~{$QHyGGO2z`5>qC
zTh^Bd63O06?EqdDy6tZaq$R6_W4<B2Xm##Ku*R2Ym%Q9YcHO5wV8wjSI-pDFzZaZn
z5-XzrH)>`6I)E=C+QEOJwLhTs)N3yazgHAE@+j5RPzze56+Q3En37t`;xMrR^De?b
zBr}(J-@~OGLG)OmNqeBCR98|ej-?cSN6owtp4)jvw$XjUmoi3Dq!_A>QAvUH=+G1|
zT!v;bzy6q$xU1Bi_M~+&w}#H?onbJ4!$Y0Y4(4+OFIuV6^S)|ZR!w8SDcGBKY9Z)#
z{9y4wD{0n#tlUR6?|p%e-No`e$&mSHPyJZ+TiCp?ES($^(9ER)5{8+LV{tHHHwBhR
zhE04Q9w+DGPG*O=M}mD4cxNqTfdf;WghI^Ixtzx&!6UAkl+S*epUjf_fTU}S=4C5Z
zUmR6|aAKtca0TkOgxr;w_uk)dVP#@S+8w@W5_4_OP34B#GU>4&VdX(fTVG<IFUnwh
zz%V3Ls2;d*`m!SMByEb@my8Qq#TJqMJX~QgUXNJ~Z5f224at%e$)4Nk@vbyEp+5;V
zg~E1L-FC)t8ex!uguJT$g`L0L4DSj6uaI=KV7vT}%H;KEl<oT~%<BxBCn^u3d>;2m
z#8H%9598zD4$sOdZ_@sv)*PKD*~^31MjT28idV;KYABztnfE69p4gRCLa$SGn`>uj
zu#7Ew0+9QKMe)P(mv_ogy@#N&tnlm!UIH88#^X8|cczV4xj!x*f{48#<djFLqcGd?
zD+^gUFe(YXY8r~;u^<mGs-bv<*k@SqaUSJoikZSKD3E9N;Fy>?_5g5;1Q+*yRZDuD
z3rsD5&6m)!=GcKcc5|%PBMU%NZ-j8nm7N+A#3RXFiJVI;DxMoZ8AlH_0*;{pzPj+8
z3P(H71^yd7RO~<GSp2u5b{G5El4GDKvrduVaUE$!y3L=UvZ34cyyHv#RxY|<=_<T~
z(ptq6Ie2{ls<?1GBLKqT`i*J@_<dVpz56BV(GrN!x15(gwlz8MP$P`gWd+(6e|jn*
z9^#(}T7CV5?*4ePSgsFtBOnVIvSR3ad;1H#`KfR3uMC@cyLF4#e-Z*@2B6WD+N+=K
z`bRk;xHtgYn+xxEKip}BJ!Hw53qP>NPQ9_QMrEx75k~;KEk~mt@CaFHZN~V?LJ3)m
zaLqyw6+3$dm;XpXox!+#szH3ly~#LMx$g(7Dt&DVha@AhsA}~ti4|SP5pq-$@=Wzc
z(*yV8BGXhzo}DI_*+@AylFRJF>M%Cpi<fSBV^i3JC!k6BLY}wP?1n9XQejAY)1E}v
zqKoFT-rcYc#MfS-jURWE{HgeDQdH~X6JWwnhdKM7q87V}XhQL?w}z^nPji|iQPYOR
zM!x(no)48`U{d@g%_5kNG%<{X`%f5q6@aRGeC@^qBD-6cI&N3M)L!4jq$HV`_L$g4
z*zTjVc3<OApPAL~2Y;VldbGtaXf<A|rEP$D7jHpM)&hS&0eVzU)b#~kyY|J=_TogS
z9ma`bSH+jbUBu+zL__;gtl&u4?dd<b{&wdftBZR=Ko69T7d`NxY_yeYmv(O`K@as2
zXC3<1qittbf8UT!faV^T7%KmT@9=dfL51iNs1I9no!@tn0CgI#fClbMS>MI$_ni=~
z9Qi+f*>nQIWF&K2x<kYuYqAIsW?5;;3332TIX;!$-)t9c1?_v4JXcg>&B-=fIJ4`}
zRPK)?-wwX4z=8W5&+CZN*+a9Z)gul!Lqt#3kiK(qjyExW2E<vVG>w+g0c_QLdF~;&
zL#CGZZzo7wRHxs9BrTNZ<4dT?u|thXO16@skqn52zAa-#o`5!XrffEjlXuLmaCQD8
zq`_`1qbU^kNs1Q5k__pCH-Ls=0lRHoHOe$wzS$;NH5;k0$J(@x4M9@00+dy`KNGE&
zr<k2@f$qao`1ocP8GneEe)s{8Ntph^{4jjrX}~`8Z22YO^sO>7P2)@9w&&5wyBHt4
zc{f`1Si`iPev_2AWT<~VH`$TPR;NtW!o7LA@^pmw56jnO%H6`6wEAJs$vD(&8FeMS
z86oZ4OFtghKF+3{gr=59lzJ$YpL5e`n=KE*)n}qdz;UmX6A%5?ic`b^2|%U7fA^mx
zfC3?)IxU|Ov$E_hP%c7*dv+dY)BQq<As}?v>ng64ED0=nm3$lD$|wp199JyvVbs%u
zZDqx>lZ#{*1&T~XY-bd6Q$H%$hg8-al=xQq%p~|K9VcmrseU^bs6&}olve{p;XS3V
z+~<<}OQixRq;I`tm)!M5I;I+$epMm%_mE_%ln$Yb4OV9qPGhl3iG}ZH0#dZv7ByB~
zrTXAOO(I4ccRzT7s;8rj6kPznc`Ktyq2MqR5XX87)o{PjJ1AmGY*k2{>yv1nC&tjW
zLsSl1&5;d`q9`47ojn>%t14@-4DN*NcXGe9FiL58QDb}Ha1`+nMLe<Y1cOVQb?8vZ
zVrv!7>B#(M<T7J1koWbWc5%zRbDV_-h9{W^fjMem*Xw1h3ei0q$NB;)^T6~&TkWfr
zS5R`;o|4Q&JW$tg#^q*X(&&?(2F2b{jz?P_n$4L7#?5KVb}m<9?fagF2IsC?p{6mI
z08G;4L0|``)|k5mB8h{1P4x}<LIdw!vO5I%*Z+DrVs}Bd3mq7^_zzIWBt(d^?5Xke
z93xhH@Fk(fL|$S!j|?hjY})yl*bU>e?g<)mL0Wl&NC+{-4Hl6H30I5d{U{;@?B>P=
ztF{;Wqk*Q?VRlelzmo83u75MC#s?}MAcqlR2QqnX(<=(gZ<knT7Wbe!VGlH4Hs)aL
zZi^y%@m<sSEoH#Rv=8yXQ<*#;<X0XJTYV>)0m==LP)oSJhZzv@DG#`N#qG19lqrS;
z>pkkqBo_T?P$hAeN(%=AAg=6=QA5Uj{7!N{>v<rqlSKi0O5EzLEhIcwPk5>4!^>O^
z?X%}E<<TkFYN$x$CboHkD{#1y410+Dp%{2e($8tvg+`Jkz9)%oiAa0@!QeXiM#rX1
zdleo9CI<_^@(_{1K$@|1YHup_A9K(3`o%vR^Kr`#`dQa~yH`nFRS*k9V3rD&0R>no
zuV^6Q!MZ9w(%u}-<`@+{CM^&mVvgv{0OXJ$I_+5ml6_u8g&NB)NucOUv<oU6dj<Id
zdO8zo13fM~&*~#)^rd~t4kEr?h6L(VZU19}IXgFL-?D2wFNf!_R)IJlE^AiEXSlO`
z`@>_)suEBSW(3KF)d7pHYkuQ?3-)d?!Z-&gV*Up!39FGH<hg-5H=-1ueh{4L3}Y7s
ztl**@+2^i&AL;V5A{+Do4rpTi@eJYr<zC+l<ISqG*k|{DXJYV4%$vwmRh3xRLo#6Q
ziqM*vcMHp^9C@5vYCUM3vmG8Fhl_*8JWofm;7JS820kL<EU`VvrKShDMX|W7;3V60
z0l2CRv784>T+#a+6wCng)Km$115uHfp#RxiRgu6ap<bQk9cJWXGZE^mbuU5_q7kUd
zCo>TTr?BZp$j#PWSH5(Kn(Xl+x$jdl)I}<BadO0iwZgvVsTAImrkP3gYFG<3#O0wM
zht)kvnYfYtpvvq;>0lZ&GKK*242WJwA<v+5(rZ_^xa3zFUwj`jgRPN&LAQl8V|96f
zq0$06&ac~R!O8Dc3uZACq2yOvMD5z=kUH`8E;)1iA3O&z3j4Y#E3=j24V6RgfH|81
z%B1p7nZ)ksm*qd_Zv;@VY#?92))oYM+GKCVCmfD$Yx$Z9LQLSD3ahr^wZ6-^?c+M`
z*DnM$PHKkYA`Kp#_zKscT@lfpoQ#U4GOK3pjA~xexA-BCK0+eN;xx4Go6a34K;5k`
zBSt!RQxa-XC!1z*2S=Bn+RU%By|BxrC}uO_nbZg4$?Yh2&z|j+LIFffE@h*{pM2%J
z4|v$Qg|f~5=c6Pg(1ye_1$51OY5b7z{=_u-NnPm?Dmlq_h20Qv#H0MZKHu%FpCVLX
z8edAA&EU(@H5*r`4(iJ>$^#x)+8L^zQOO#r{@igy)noK+vNlzPN7#ak2gYuro2E*b
zxkM-nt7I60?D8fxa#$Tf9)6ANF%c#&vAzNDd}(AWaxJSsdU=Fo3bDmQEH?qm`~Juq
zv<?FX-sxLeQAWh=uP@pU<~&7ck-aY<@r9G^$rd#rD;&^WyC%Q*-g<>JClxFY@1N2Y
zhqZxnZrS70U_2*K;kdejUB?0GF}tEvgG{|P!+x=^Z_;<LmTf$FHhw0|a~KF)FXlQD
z+tPvFeN)7(RGqrsh|w8nor)jtKlu=+AJ+5%L2MJdw6{JP>l42pl5W1#ogUIJ-)fv;
zpA55R|3Exvy9<0OJRoJIY?`<i+?f~Y=DfRNKs>qci~AKPQ!vRqoZLR>OH3&ctF&qY
zaa9tSzni|Si@dvV_!{}T;imK@?W$HRB8&9^$TN8ww(2U#us|Pa0YVX=Mee|^mbnG<
zReAl!rVLjrz#|-x<7fEudj3J8yw@hgs3rSK)P%AS#DjioRNIYMaHVonDH#%wz@uDj
zh`~00zanaZmr91x8gg*~3e=Pc(!67}5dnfm68K;PcV3cJNb!}z_t{Vhac|f$(i3o?
zN`>3R=S*jb*(y7-zP<M}&Cdtyqf(>epsYACB<U9o$n(6mGcD`FMgwB9I<|$a&8t{-
z`%t4Wb<#}~5WD(OOu>n}MLv$jp*5|aU|oQRX?Emg5y?9EgtRZagpG<8aWB8gXO*zq
z&O^bbx75=l<6RfPTK>a<Z*>*(JoTA)1cfm4II|!OZ91XTD^5zlJha2=dBsIE%e@P*
z=!KK)s-unj-!HQU%!_YcMxD)Yum|<Tp=>F(I5dk;C#U^c&2^@^w7l2fNQx7!Ou&}V
z(d6?rS*U=xn+Kpjwf~gpj+ka`Ga%lZ8_e2y8A+GZYs$su9@4R?-)sKu($K8xFD)0*
zOqZuLl4JYu$Y$;VVfzKREg<sz1+dw%so#n*YLr~RF6|B5G_&A6Wo_9;lDZG6|FQ-U
z)xiL1qM>y8pHB0sI4_(#AfAM)ulJvR)$i4Ux(vX+-HndNP-i4b>3qjx{3&#M9d19;
zp)i0~Nd(kKqWn7mb3Mz$hIy?H<i?IbB)iL0sLY2I%KMczKW_Q&6+iEKKCv`d=qTVX
zL4y=hNM+G1kjrY4jClW%<O`s$W`NTNI;@DW{`F<fDEi|`jyyf){B<rK%TbFR&=<z?
zsCLNTqN_=iT4lObb=25>yK?2?(6209R!a=OZTiR<umU9anOI-)<+Q7O%pldSDy;m<
z&&S6_{~QVS1wZ%|+5PE;uQRDacVi4-xQwa<*!y~JA3_o6F|$}&;+j{Spi7+JP3N;B
zn2fakpANa_X!UR90Q7Ko<gvKd(dyYD9eaHYOhspP5is_hF|I3RdV<&l>!0_+Bf|Au
ze)0&PoQQf`R_`}tEhO(+478JC2jLa1rGD8`;pSHCIs12mlXtSAszZLGlI6P=-eE_L
zosNu%57z7bj!*<7@J`^3nf;7~=YX|I4tf?|vv_C*$Wiy*&8+j0ju*mLKNr8v%#QIn
zIZ!nItvOElf!j9h6dQ5xpVy7Y`{ekL8auy1jWJyZY);zpQO*U8Z32X_6)jXL1O0e-
z5(v^9c%4wl;snKp!m0qiuGSsoyBQtsyS?TGSmq{IH$c%F@HKTL83K&>aYewa>u&N6
zlJ!o;K`1WQ9r%?5FDFApSniiUBr1;)T;;K@p@b(&g&}=_iITIj{!sC9_&df^;F@#H
ze4XqdiMc<?3y+mYVAN=#F&U`0=&36>GgpKGsyi(Rt+_b1>F-+f9hBr{h?)<ww{%!C
zT-PB|)%@X*Z0d7gh34A2{Yf3Z{ztC}MO@exg|ib)o`3S)d|WFt5zW~y#AzqS6j?#h
zbu2*vnp$q4;RG&EZ=B?u8LoSK{Rt5NV~QO|WWz$pbP9bF;q$ZLP#lCG4&9uDSQ=;X
zgc~$~JUlzx)#ae0U7OAT+Z(_n-pi3|Zvp~d6ksL?S#85>*nk}`<2Yfl<B&agwnCv^
zs`clz^9t{7LINhvl@)z)l4!5ac;<jApZ2e(A+`X(i5l+|(LW2C5^+X$AkJ!^1S`}b
zVIzJ8!K@NS9iG@ng$e$Ulc=E!+)~6sl@#IE14=PowhNi*=U?V7Frs+F-ac~PFv5EW
z7c+rF<^_;cZ!J2AQ!`!VHu`ZD@6%fi-vjejHS=Am0k)+wjUwxH*WD>|dP(Jp52&Et
znKmKj*vl+5BH%hmhdKB7HS)l%C9@mlKQS2$^~|pVyC@g1jS>xO>_#a(fTdp-csbQo
zy$m2LFKGM@lw^t^97ni*ex>FiMdB!E&>)gD)5Q;CIz^A<>P@nMN$$pAr0<!Fn`CXI
zsziT7dwk!n%6v9M>o|OEp2Gv^XRA-C?w_MHk@}F}rJVF65jOKUlj{Ce;VV5kvhljW
zu>JgQ@{!<l_QOx7mM%&QH%+K9!^u0!^O&wQhD=;Dq$&qf+67E%L-2i95rRm}Mqk7u
z!x;20r_p&MAmEm0r2ZWH+~)k@8ZVqrN*t-(%Y>MW67{Uk1TKOUPG5Ju2Z4ABGzwT4
zr3B&~x*Wq^T^9IJUX!CA|0=V{6LAB+8&+MxbOsA-b&<rZrL|iNz0C(v=04CckpG7T
z`V&aVY7eG9Dys>c7yO+=jWj?@`J#%S`mg7XN2kD_h)N{u;kf^lIK`)wsNX9DH?=k2
z&6uqYm|)Q=--ra?9$;OI1S(-`w0}1Ft72zhuR1$KUGFRo9bg(D<tmy>mS>$u241!m
zhRQJC4YSC=e8lXSs^$VIM`?l%VqetZ4tU~Kt;g#1WC*4FAV2^N5YyB@B7>^%F_=!}
zs*p80qZ?H!E)TgfJ-zoHM8=qQ#1^$jBK45l#)*5=CuSw`RJi?}r_G_jDv(YVp;l@F
zG~m%g(ZLS)8}M6FiA*d|Mrz>5h%U*qARbM@Ehu_FLEI$fEK<ppnghIq{~*hJu{Z%e
zqO|i+g5@&$vd4AR0r9VN+C-fBH53c4Fsx>RSQm9M#z8BRy^QUsG53!g3#XUY06adY
z;iG;p3P?eMIo;%Clb~@Yl8)yD{F!nY@*|;h@jK<8n6T8%dexx<z{PT}k(GNOrqn9F
zGD#Y=p?Wcq<PS7n<J<wUB0Yi0_`c0~f8}$m>a2FU67Aw<v%`{+x9*V05$#`XW5W13
zqCK)0VUyqXDZH9fAn^>yFln~4ZPm7N=a`@_1vv!tXw5_k-y-Fb|8kjB&#G1=J*K{e
zs3`^H)7jaSrpvSOo*uZ_?CYjqTQvxz#39(sAAYL9^1x}*7L%gh$+ojCz@#`$O7uP-
z1HG4Ze!7d}K^yc22A3lC3hf5Kcp6A5#jVfxD=_+*K3?dICT|}q=f+sa(RoPvk>BVQ
z7yZYy)I1hzIP3t73*vjEgSo%os@yIC+4YHILw?doh1IyKir+-MlH<|w;my<t+5_I<
zMM#fD81IKx<=#pQZ?5ou9;Wf+WAaN@&TNOVi<O&goT$z@4!CtD;G5Dw<mp}QwmgSb
zvAej3XQwXRV6QcnXGR-lRj1CzNiYV^5keg@(5cO?vhhzS_1&zuKHfC+INsJ&uXfw7
zHJn1D!@GXcIB4`nfZhc8iq}PS-c_w<9du9;z^39ms=HgNcSl$z=~DQvE#_7d)8@vI
zOg7s#je;>35bb_M|E+_g21(Lp(*F79e`ZPCxgwRscUM6NcNh5#Ha9`KrW)|T_dds?
z9;kf4v4><>D4bd5`*i;bkc`=9<6K|fPg{XFerD6Bb7F;%zaxL}Idp$OiPR`otIqb~
zIVQ#ACOQwqTlk>Fgjd2fxOcq+?fxHRW2PbXtMeorc()L`@Y89|;nO}=$=(-DpIIup
z(@^))p!^=mfK-`@G>Nn3ck4pK4dlqVeEiVXn%VO`=N~j_6`6+iAN=G&mb|1L@KYe3
z`swF`fNDWV5S^XBTzU-5lPT<Us`{%>4$N~~^<dvA*5sxK40aF+^5{w~Am)@#dNK<7
zVTR}V<19+g5%#3XhVgU><Gqc^Fk$iUt0xM(!_e;aUW{rl3Hnw%ALjEh$RK@9${=XD
zO=9ge?mkC+R;$(e`u)Q_1((E_1gl(y=1ywM%Fc``jh=kx%@?_(IKWHXp}if}_nCLZ
z?xXY3dgIyl==pbY;vZG9!iW(C`?<l+1o2n!E8qVz0GuJbc@;=91V&1~9~B<W8x?v{
z3<E1OHDA4j)>RNacB`W!muUFdmxqy83t4QkP$3k%Tsk<y);QrIZT6PBJcwgY+#+b9
ze%8Tb+9uSje>gB82vzuuz=>~;z<ZwnI>IRX>SiOo=Q+AYp76S{63embWGSr<5eyRV
zSnmHWjbjBZ=Vz25=xdR|sa-FZ22J9*p+tkyqr18VX!KX1n#mAa!M-BPu~7$?)zxBo
zA?SsSCgqcSFp0dck)vArV(Lez5_;<Bei@K=R!@|SQ_8J!ZA~O=O}bL#(ty85kg3N-
zxi|Dky2HNWoEur!02pjgOLW9(ii;tT8`SVM>*sbjS#MG75;0YII?&<9?SyKAw#wib
zcxJ8PWAu9FDgqc_2dB#l!KlGlAF`Oo3i=iHFk18s)%iq|_BdwE%6G{9w$u+}fR}ro
zjUk(HQ6KE1_+p>^CJk!uGJ@Vk!fcK8|6-55oj}5>oBW~rr<doCROTH+t}-Vc_Q<CJ
z&^L+z*g&_abO}W=k5Gaw^-m($4P*{kYL0<STt(ZfqT_K5KYvZl5}4{*`da?i7q#n+
z+bL8;DuQ9)zO*raO1|okbOM2j&Ujk*f+l=s<c@e)0x6?veJaSX^)_vl7dn6C#}T}B
zuwiBFaO)*yIkhsAqP#aZt`_##R+arp!2CV=Pis|yct@cf;!R$5y#@LMS)ZEegVj9Y
zF*>(3aa)k=Dk_DlhX!E8WpqNO>toY5GArY+XK!1eaXvGND&{A75!v$c9m3<JYjPUv
zP4%!A9!z<fNgcSrZjOSpF}5u&k(}RYcpel3awvPgKR13Kgg@RX95mX9a?NCyUjJ23
zY!O_n5o?VQD?*GELa2|uj;f<G3c`He{gm!F&RImf&(kTh3Dv*uUfl=Ul#P?Ow0Lqf
z`I+fJB8;c-MJsfe<@%~gzjIc31WfvX1acEW+jcJ@s$?T&aewggUqysr6Qn8*%eI^U
zkY2pMkgD9f3}Nu`qlkY1&rnI&EN1aCw?Urimv9pB_bAJ@l2KVCVzDv%s<q6@1;6#P
zFG?2822`@>woKPX4et9xJ{c69y*YO>a8GcIQM2cY-${zws`~l@=SfFdl$V0j155LG
zIPJ@-ZH=)$QRk3jFxL`^CnsFWX%8O^Y~oeq-c*Za+(ulH+{gPf!2BpDaK#sAfqwq&
zJVIn|{Os&_G;TBViGBx>D_bVU8&h!?`L(R_49ZLkxT{h577l-U0mhyHpf|6JNn_@^
zPDZ6}&X1)G5P%=g`cc1pQ+}v)7z;cWdJ6Fwcf!0vkk^<$Ju?rQ;w^b(*x>Z_z0sWv
zpAR^_Oqg5&NY?9FII>>$K@Ze}yU_{jpoM!{^43<#nnu9^WgPLaCw2DFi{Pvx@T&jf
zL(CcC?U2&TQAg4`<R2{%#^<llS!_KeeeuH{5QW<#QtkOh?Ntd0331^l;hXk&W|}y}
zZ(k8A5H50v;Ad?q1qL_#v=*V`({j5^b=Ow**`V_56?b>hHR`ubpP^66gm{x9q+$jC
zHZ}GbD9+bggR1^{za;J?NUZ}S;03)97$FT9a)G$CvJ%Aqmy@3@d(-aX;*|cSmw!y-
zxOe=3L+D$@S@?Z0LZ;Dt$o0B)iP9p*ivsO<G8lebYoC$rzpPgSw31f5BgExL?Xr%x
z;XPvC$B|+^&qCT#>-C(05&hA8rY^GItnnu2iwMwm_Y1gJF}49C+M)wajJP>UTg6Le
zqB0Nt0sg42&yR8N+$)KPfF_OYfB9flB|rqBxlWC$NMVrsG=Up^*pi>F98bp&zs5b@
zoh);Ot!1KICHB;1E3VhI-|`%y8;Ys>MY=Xv>0CGnd}H6P`s4a2t}4(x{20%f$jb1{
zaO4mrOu^Mdmj{$m>)5V&zUz!iTo}9ST=>zX&Fy}DL0UG`JKE&6z>@hL%qNNVIB(Hu
z7*p0vP)5Mts2eENT*LZKgCh*87G%*WW$&>+hyG#}B4UV6oT+S_8!jsZWR}x74V*>k
zd}h(DfXUnFTP(46)<><J>CIe)7kiSpY01w{mc9u7U3i{d0t4=Zf8|E}g-m!ujF+3F
z$#44j!L;VW7a;r=EO4%p1Q>B<+c6NixbQ00%}38)Z>#J}60p_~g;Yo=blk;eq}K}%
z%?%c&<6UAF(W%?3ti)O#*CT+Gsp|SJ?|WYZa~lsYCL?8mtoR+|Ck|Ss7v7c#5kxF@
zS^Hp=A++iUZHsnj_*z6;;%=x9tA}?Ca1M@`*GvlN5V8Ya6-wUMNdvSHmwx7@qF4+I
z^N&v!ceS@le68?a&fH_I0en*~aDo0{Ykm@2ls^&>^&J?Omk<qIp5KF+2<Wj~-NHJ6
zPM0O1uQEY+GiJCC;m1FK|7y{dAGFBsT~^WG7|Tz6q(?so8B|#eEmo_@_V*P(L^a<Y
z+V4Kz#alAW^G&HS@FW;dc}<tF4zRW{f!T)&W@VYLJnnRdL3q1>5U@xkwk5akB-BQa
zIg$>|CO6MMK6$&qDc9St18ZSw$)s@I9kvbW-iGa^ocXk<!hLr{p*Yc`DJ!=+s&70F
z4IFrO>==0pF48gD9u38>cf)5ZM3T4Ml#3o0wZ5wP7cJ6i<z_Ht;6*A`3X3l5M4;DW
zR3Bs|BsSfEUasTE4*~;eV7B5}{H_l0j@7YdTlr8NUO&X>e#k~3b^5n}(?0?)E6-a+
znz~YOg#vml3dqoct=y~9nQ#WRjPJ!-7!<oWI^vt6R!v?B!sOymlG)+TvY>=86#bZF
z`)<y6>_QtLAD4z<GkA=QDNk);ew$d#UOubiW9J-%aQ^8KZ~91sr9nsfQ_~gakfBrs
z%47#w4@1zU3$n!FFlKXn0QO({vhi_O2ji3`g+h4P+(hbV)?enwAQD&}CK;mEH3>Sw
zMk->SH}a?wilAP7;!_@gibSRRoGl*r$2CGA1V^waCcYF`ep9AT2@Fa)?U!N|z6w{B
z<bn~{tQv|XQDDkV<}oVc(Gd~d6g}KN$pYg1m+%7+c{hC>w;wu?RJZ-t5W}>y&~}e{
z8ZZJ>H>%buw3o1dr?|YH<zZs3kf!nxIbT*R86#ki*^O<Tj!a%uDB?u${oJ<i*p2l*
zdtt>inrbSAVM2~EcIvS+m|$~lZ?)5ymFKyJ_@y4fGRJv8Uw?G=bKENz4<O+3+K&xN
zRIxTbr~d0^dOHCXyl(Ml=RdP*%=M0>{A}H}QtQb|F9q={L$@`bd*m3L8`{PX7%*MJ
zk7Nz<Y@dvNdCJM@yfLLqQS5`PxqDd%ObD7&lcvk@S)}+@B!`IL44O}C-oms^Z=r#2
z!f})_heTJqeM&eU<DRhq2YBfAlkfK|5~bu2FWwo>iI{gJh(B|Bf?)8$A%Nw)iRoNo
zy&Q|z0V3orhyGC6CS5o7iRP#~93j?!n(KMooWUVIU=3z0sIhbfLF^NEd>_HN85aPo
zl6bcE99SI%5ljbQ>;;W}n*`CUY5-Q`jo8mHJ?5U|1qxubw~4AgBHvOhV(aGr2Dc!l
zR=PyJ)>+7Cx&0vz-0C_0<)ZE{6vYl<fIa8IPe-ygsz$8jeZO4jH&>)`0%l30!MOW~
z0OinmlwnGTPfn|`scV0SPxCrQ*%J_MK|UM?{wwCb=46-pT=#)Z(H_;CLZR+*f3YeB
z#duH&&yx{1>A(7J2D4y|xlP%a=YBu)!)KARI{rDz*$-a%#AEh8e1U}dNFT6*1P?<@
zv?i>B#N&jn|0cA#=|PDYm!IGXUr>tjmv90Jv=aLyIoP#B`~rVXAM{l)6?YRbkbe&o
zonGELZrI*r_LP$IH4$`JniIN!Q0OvAmr#V-;b&Iy?n*sw2KSeozk`4SKiIN46Lak^
zEQMEne!#o|@O*k2vW2loz)(Sii=Hhby~T+60i|j-$82t`TqDvi@db0Il=XwT_mi4n
zrH3D8g{@-+>;oyZUindA8bavlU8+?bO?xszRf#Y6=L1|z#ey?4kPQ%wqym;OhaBXG
zh>sHDz!qgO#f+gH0*+M5J)9$WF~B8io`)O!8>^)T-_NtXe*eYY5966p)rTKJ19vqD
z2{C)5c?!2S2BR=-0ig!>-G;3jotH(woj;3r0QdpV{tTQMeF$uO)Un+>76^ON!>p*+
zFz-|j(CRJLLT~@Xjo~G4X~5h1jrt=k5T+{R^6x0wUV^Wf71k%~3YCgT?AMAXC?DGl
z22%s|#1G4Z*$L{|3Fukknp40F00h%Wfh%Z13m{+n@=H}00J<EnK&ue{0B*}jU?drG
zq4T}86%*2RAiZi2Ax4Nda1SJ3ocFNK9(YmZMK~a4!0o<M_h?ElM~8paL;k4Sniuft
z@a?AU%~2l8DrNF7e8Ad|juto`xz~|cwul2pH^jVa?3gTkOMb_(zkLeKzB-RlZh7?X
z-gft3rb+5khTg(eU!N8&!7OBwKOIdp#2`Lr7Z^ULcmE9Qd1=mKYX0_fixlFleeR3f
zL@dPpJE7$zn;JLNrr8hyJ&wb0e+Bavc>CACwV4(8+4!8lTWi5ccc~Y~b^s}hSJ$Pp
z(+dokOj9(7StW@@@$;Si&&6tfDDm_xh}AS{9+D!GR3LGA0d7bxcr-;#--Y7O6j9+6
zab!I|dWgC)w^n0{*wggw%-8k5UV&Wd;)e^Hf_DJ`Rjuzfm^~7rMv3e6MgybcC{x|F
zS-dBrGC@A#-FfRcQR{~z?Z|QX+?U61NfVV5choP?IX#;@PFhKTk%=SFq>c8OLW#cn
zfQdSHURY=LCYh0{l@#%av%e2DlB~U_gHvYWW=)>lfYi%=9zFv?k)OD8W#`@C;>N$~
z-nM1?OyxXFfXVHXxa-|@ReoJIYacSti6F1Fqn{#Sn-<Q+_0C5oFr*P?VZM+#Fxjo)
ziTq=JS_eIY;{L@@`DdIhWxsGdIej{B_uT+<IO_O1A<-*dL5e)wD;RcV{`jK>6N62U
zs?0r-<c8_+Q$A=n3?6O4G@gOt$w4!1(@Kl;BNEoBF;*#78WHcFyfvH`3?Ssf2q6IS
zD0*K6Oin^zQnJtR&|O*89_IV_Rh)g_t5eIfz8|uH8ePu+H&aa=1GwGiA7B0r*zioj
zRDl<T6N3M?X~uLXpe71_arA$AI$0DE?^%F>C%zAXAu=GZF!83H51R;;MhzEBQOkcw
zMF*v<va^<{?_aLv5ra2XBtH`aQb-W;u2~|IT<RAc>qJH!KNuceVi^V}|8UwYqc<$u
zCWwN9Ttk+}F2u}RkJ#)W{WKeO7IPJJ<jkuga4ZCJwt9btI+@}LlyfdF(6EyQCKfN^
zt_Y;6m<Qr}u_oYc0cqgX0;=f~mGQXG%qepmWIKgeTh-}A_GJa1HgoOQh<j<EC0*fr
zy+VTUB%)@wE8965VtSZx)kl9#b;$hI=|Wa+aq;9&R|_!w!S`&nYIi60vw5w|a$XO`
zgHtQ#xP`%#UG;)@C~-YJ)uf7m)?9<rwopJ79Ta`tgGEoA*i_OM>`>O}v*I-EXCPz<
zEQUQ0jHKVMqAXa)={Ze6z<V<mwbIj?cn|0(xVDBbmpfu}73!s{9N{dmjQUhUUTBE%
z^U~o)Rl?@tJ|E1K6DZ*6c5k5;3x`<DX787$-MlT7J9l%)Zy;1<<L7_+VF+MT$ztX(
zFsVqV#}M)R@xA8<wwuVp8O486avIP%%`xOBY)}w2uRuv-0quj{nyjiPJP(;Nf_w?j
z78e^7W3FBdDaEwEUdOVS`tyIL$ihB@11&sx{9}%`xDOoZg2>eyDN_?GK+zk_#CvM%
zDyTe&0Yqyg5LQhaUm}Hia~CKdYF%w6bcfkqDa_R?!H;<WMtPC6fAZx}acg1ZBSXJM
z*3e>GbbIP6^F;N;Yc+`wJKv=v80K{>es0v!jQQj4jh{HAuUQqTgUH+E?cpeV1t((6
zM}O*9UEQ~yY;X?1{q$3)l?h`cOOvDEP%*yb`_ab_xzm%a&C3-ED}x|*R2F%#Rth|1
zA_(J;isy<fD0jdUrh?EBH>*tFMeueX-ih|FzM|tWXq>NX4J_JtPPcCovmNH+>+C=J
z9p#@o+X*K9jD@8;gyRODlE2&<Ttad;x~RpI^hjgCmX#{9-X$T&eQ#ojGW%SdwY8>4
z^t}!GH!ET^z2WYnYO?uYSa1ehd!B<M5<wjt|3U=RZ$5I07k*2T#Yp{PIWJ&RGg9yT
zeyIn8EuNn+X^s3{c)=&{FJvl$36-}XHF6CjHn+(yv9QkfcVwPy@P+Z)E#rDky+`j|
z8mC&D2Yy}r2)J6L#+*CCOnTN1?MiWQuEB|T2D#MlQvusxIx~(t_?e3IpGk30n`{gT
z;Se5k-y-&`;kk!nUR_M>dkIE^-fSPui$q9hVb&5+*FA`1GF!wG#aJF=$|cJEVAf@R
zAIL~m$mtOgay;<23KQX_ei+V7`?vbzl4xVH^UZ?AmSZoKgty_A;v%$?#5nL~nwrf+
zW!pCg0t@arwcZ%lX<E>a&v&O&BQj-q;GL4aHb)I_uH9(?ohf~!GV!R#2|y2-{jANF
zOk#-|RuFEu`*bVs*p|dOUvx&&r?|2zm+hu<1$Blew=TG|$@QzeAGjx!wyXW!kEH6I
zwaX5sK6<gE5T4DbD^cbWq6gmZljDCG8#{wHRlVK`JUFSCPm68=Ekn%Kp-#Cr<HD#M
zTpZpj=FU2VbOrY1HrL|wB#<MrtwIv4U$04Mb5+=qmH}pNaOT_CZ{BDauzRKa`Q-o5
zj8se^MK_$ljIDNV`?Tgr->bS(Z;E?lbh6XojjCzMC>da7@}h)W(<1XUr7jN1?ku_v
ziC7=&qNTc2oog*U4pOUhiW2TM19uzyd|EL~;B9PAmvoI^i0u&=VApf1Tc!JRV0RJA
z(RIY>&L$09<G{R)FSHfs*plejOhpmgsu>L$*7usqu<ws2{OSF0h}=xVC;x}N_YTLh
z|NqBxMYmOvoidUU5eeB>R3tMqqNvD@gzT$Al+lpN-g{+b%S?-G3RziM+57uEyDRlB
z=Y4-azu)os9lzsv|8pPLo$ET!*Ld#ncqWqGwY!A70^PqES$3}+KZMT(V0ayT`x$dM
z4@U4DuB}Q?qfrs%yXKkF8Zk_1q7-P?+Wsm&<qc$DoexYQJwMu%O0zDi6V@;MnN_?B
zS12hP9rw1Sa~qDF4kx}cL@Lh~6d2s(8wW-7vEUZ$({;@mPc{*B@S&Q1=@Fr!4i+#!
z0MICn*hFTzk%%&@hnqs~c*?>djdK&lfU?g1-Kypzk)+O&1rTnL9^cs*s#4pB6u?eT
z9hly`ntMaSZ@NQ9U)otF@l?!9#V`ev%GlxCM|zRYFwK20F2)}lWDPvCW8a-5vaGY0
zn)^SPAt)Gsr`yT&AjfCVKl`hhLLA_Bot;>_)q}czO4A^H^!`|hD|Dml#dz4{;?-Y`
zC-!N-9s;ET5eH6=URKku5;Pm7G+T;Wv<o>)CDG`>*&uynr@Kp|^}g}678#$N{qJ|M
z^HoE(s|h;l4|A(B09;BQ#5nFV=*(5HhdCKQpB`5lr%I0R(PN%aOnhy;TmG+2ln%<z
zqR&_R-c7>ao7uFd1hZS;Q@c67{Z!|Yn8``@z$>Quhua*7gRi{h13y*Xfv~07i)X+R
za3pB3$|O(KJj%rLJ<HS-8<w=yAAZ@!9i7y1udD7FMCB<Dt7o55mF2d)e2S`RdOrsQ
zsyG9UDKQO^4o_dIrbavkV1R1@?;<rS39sOZC;>9h8z_6BFw70^vIUi`*Axta!Dv?O
zrOl_(WyS!eCEmP2YjqWHA)WyHh&?>dDk}Sc(>^ga9;>`sdf~wd&gfad=yEXMdNP2M
zs-H}BR;5(2>tY$H&a)xNxXp$}C2S!qf4wgw#f<V&%N%*f=A4J-IdbP~K83fOs#O!d
zNk<%-ut@%I<l$9<_XldJ3YBBR%yyel@j>r<tLuqW0xR|GcbKAfwxheXc{M7%!iQ;0
z-c3LVT}J#^@)A<vtn*A>GblW~hD~Qbv58E?#C4a^@%ArV`yaH6Bot{KBq@qDF^c3A
z$iYgrP95*iBf;fs&WyOa^BuB#za%=GS2h->p`=at8M4)_y7w(goEEI1z45^$q-GsS
z9y?faUb+Gn+cDug4H7OSQ=pawTh3hZ8Rw(qRk@0<YxE#u<kU%)?5f+QgL=?XUOW|y
zlmV!j?-^GfIauPY>nA?@NTpy`ON4(lgog`F7_D+1tGW$>i_&#*r@jP9ONx%s^v;pY
zeexVH``gvUlR*pVahysC@?KOX*pbb0Dkj}#i<<hV^7x7DFRG$h^4kNO<yAgKrw>Cy
zfo9)wLe6!&j;nz*ZoU9;6XPvPEDr$}o|nw`#->0`c@Vn|uO!amaw2M-vb|KV=E!d^
zQZyNf9~Xr%(L|Z~g6G()%G)$S5D1m5pANO}AxhivXoObO)(n@!Y7@e~Zcw@-UFZ4^
z0(sOw0t<-W$P`%$U2PWt*<vQz(RA|I?U146e)|gJ@uxu;d`71UoC@|L#EkA@I01EJ
zGI2WIM;>abdY?SeG{)*vmPB0klagpV2+r97h%sD9>yZC&yCKI^O*9-a>{{hZ_P2~g
z5WFaqZaSof$J#+<46UjLfS{4_Dl6MRHXXwITlm`|PY!5oQkbfDrBa5L%6#F$@EfO6
z&lxy6pIRx9+5@6X;H=9+42^#~G>8|?nmj4#yaJ9|0gX4IHgB`Q$km#F2-A0tC#0J~
z)vn3w3+iOk5xkz3P~(s@A3Wnx_E(}AWlZ~{SCdr4>X)ji8|%d=Bjq$s8$zo7xW0QK
zl?7$6a!ec;)A{l&qhNhTZqn#kh^s9_(&#p<qU!1OMTr9~P-azB8<$>7+%Zl(b?8!E
zDM@qybVQ7{ge9@pIUXOU64ya+P}17HLK~npOMrNGsDwBsUhdBI9gAIyg$=G$4^J^!
zg9Y0;SwhvRcIi~DmUs}c_26SVJ1#kA-(kA@bNoUt1vp>Cb808n^&h14S8rlcf6-us
z74hi2k~n;kgsWt_{-oaRL|*CY&wztbl_WEiZPHnoXLkT{doFR|cjaevC7MbTweKYm
zJSvuXfBcAMXx1CQ^@yRn3PJ%+>UnIxQb4quwyv>0Q=9VDUG7}gR+pG9-$fqlGzg=X
z&T@<M-n|${Aikn~hQwhp?wFXmne0H%Q;5$6jxBgn=dde0c@R)xVl)kU{1HSNUy@wB
zV6eAffRd9I+EH?4i;=1iV4EjU`N4F)$+%JEXv*a;zJ*O<YRBRMN+e@)FGdU;ipesh
zicb(}j8}QZvcfg!{;8FxaR3VlFMNrgdW0J9DUjsm9i$XdRz4MAIIVr0j@W(hbZq*q
z)uEyUqd-o9_&vU36zU^2YV*?1iHZ(neGC>#k1TadJjqddJ}EgSk&f7qQ|!J^_aSFR
z$l%wKs^9h=nL1+r3V`WU57<XKvP`x`uW@Hq4BRVIAr72Z7xT7e%uuo*;UJ6BP)OKT
zG@#~n{7AW)jzNFO?a-4ZItfMroHc{O5oD=Sst$f!q)F8FkBNOo#FrtDHa@@V|0pR{
zlAQO{vqQtm@n_7nW)7%wFHGw~o?S#<y>oi!ut-nx^=NEVqA;7OX1s~iqMjCH%;If6
zd#-Co<hG6;w*ApT_SOC*4svVPiFZ)wB=6UJ<Nk(qY~=0~U@R4!2Zvis&HZ?Uj~Szh
zI|6zz1zz_P$bIb9o%ZV8sYb+PLb%6w3bNwVorqf0or4yFLe8CwGsRLuUgT;-Tx)z(
zA63p_TM}G}Taij!(d%L{Cj%8*V)`nqvX$u?pUB~Pow2X&!?*Rmujan9h&}6>GxVZ}
zWapijeDK*9dzWe+5NLd^hpCkBUHq&f8xQ)^I4Ssk$^qZ&WyA~#a`#E?0I;YvHc{?|
zNmz`@oy5leQ!8vQYpAwpa)_x)cS~ops1_U{xINOC(7U2qD*X1GR&1*0)Q{Rcccjfk
z-<~RO%dhmyhR027JSn~Qm$UlTTT_9hd<q%LyN>IVtAA8hXI9_(vdLus+Cr`CYrq)n
z<G$n!eQA1*S@E}hy=Z3d$}sRHcHX-!O%>G`3ElB}{E(oTq28`;E*tUJifScBUFh9`
z&f5-d?1MAvI=kINmOaYUi7WXK4<M3**Q)2}m7$nGk9?c#aHQr=Wkl>cUBuuX^W<^6
z#QK=T8&d<dbL}@%^om64Gj@n3yh*w~2hv(;=<w(Do=pGuNUg@w%=y`0977<43jw77
zZD5=AN%`a+^(B?tP|rpzY%*HT(T*{xw1pP``8|LR(P^4ehL}*^iP%*Wx0+p)9)-h+
zN%Fl_K0fV5e8d{qQNlnT-QjGWhB+4ZNGgmq9mM($N_&ucWe@wmy4JQ1L&CHK2K0q4
z)3Q6i_&T5_IMgtUP5Ii63C`}?Tti{2T{-6O4ZmVE6d*ZQ&2A4I7EH-EP&ewtoF$tN
zB?5Q|1|p>~Ubg!aQM*(cY8B<9qulDwlXv*v@<MxWYl8cQyB;rlZm%H?v~srqd@zE>
zquDMBNjCr#R}e_b_et|{<1%p}w%XKZ34Q{Gei02dn(`)Bcz&USZzb<%@zmX*e&F7C
zSCau!UvlItu;RBre)|E3H5|G3hm&s!*3&6J$gFK&IQ8B{jgLYI_~A|glLsO9jWquA
zYM>ipCxSFU9w_~XM-V<Ef!31_1Y}}p8`!FG3*WG=I<~*~wIb&$yJMNj0T)76PnKuh
z%=)q57Z-8cCFV<mA%$EV82(aXyZE#+&)Gd!eugCZ769G@a2;Oq)xN(kb`!>m$Y!wa
zIaHBW&0R+DF@&Lvu~%ew^^qSh`eBA!%K@J$7M+o&{`TCTsDZ-?^9^Gxcieq9Ku;P%
zDA#?wEN{>6cToxw%#VdKXuVPF#~Bkw%U}Z>)t*;d#tQ^$+aUb(AMe6NLKSsfeDRkv
zLf`#`b?^XxRtx}A@&UTnbjTY7XBD~>9<gR#-1+;?v7g|YE_4R-{|a&S_ZOFy&OxC=
zEEHE2GtlVSM<8ga6F+(<))(cS1q8Qe<;#`HKfK^M@$~1m2Bg-@ZU@v`Mglb1vFD2S
zGJnv8preJ>_gaAU-xB~Jd7RtXO^SX$ME;KrQmW&i<n}T2fIWgLDOT22$X_EEZb;Rw
zHz#?E&+k962a0}1zP{x3PsB$*Lier!V!nyb`3QAh^4PXpo9Rk#^!i%9f5Hk90jN~0
z=8e4doBHwTnCy3msAA1!1Va$)WNKz|9;*MiIrv{$0C(^d{R1&$nR+n@jtvAEr^UxX
z`Wi#Y&BMuSR#Lx{4--9dbG=Dr!RveZ{s||=!wcoP4G>xCO4<eO1Vd14$*lSD%l9w*
zv?d`4yIClj{SL`W$V3)}of&R=^QHflv)?n5CeAAWGg57TeA^$s+n0*eg*%Tv68-}@
z-p7ET7m5qE_8$LGD>^PV^(n~mQ(rcp@gH`z8!oCo;w9gD*kXOnP6Xf5AY)5&(9QY?
za?Y3|zdvIWZVm3KTjMvTfnC3NR0>D>8v5xk0Zdh&qvA>ml<Cj~QLz7kuAk}xr{|PN
zz5WG{=I`}z9K{?coVcF%`ROaCw82N`efds4+`sRr=IfXr6!D#GHenJFpr8(amOuQ7
z@GvoAE+eh}b|YMAh4qNan!IH8Yj?!gA5w0Gt2)+u^%Ljshl9h=K*g~;)YQv;6a&*n
zq5*LIFkxI2e>jAvh^r%5wDUE=Ph{}z7&$610QSL$bf$i!@}(Dgn+5bp#&y|veKX%b
zVeMf#-EUg|NXOT|Q_kkOU}$MEhBV&Is7#XIi9;F5-cok9JmmV-lKXnfPvt)POK;<Y
zt!xX18uZ)p(Chuwq*H!fC;yO(9G74<1S`asU4AEs@LPmPqm2_Zso*pVj2qZgQ_tk>
z*;97$4?@oZ=<h>t>+h(*;0qqZAyp1yP@hmNBmB{A^J|bfs$wJsKSbI0qXGOpB{<`5
zM}<ELekLx0#o19)hq<v)(Ol>xXefj#iPOS%Y=7V@N<^1mZyflr{^;+Yu;e=c4*8*R
zT{v!>Di?x)cj$#u3B4{SeE-6_6m$!MaV0|RIe(yWGFcS_qV^!-U%)vlIz%IQeE>BT
zXHRYW2POJv{M5fiGq#x+k@Fz$uLLgW0tlfYl=&j4;-^LJ*#1P&50K5>8z}sJ#aMF6
z)*SPoYY^Ic(^Gl-4FJHrL9o1`Nbn+36okOW?cpe6{ln4Q0wLJ4<oto~*Col%pTbL#
zGHqJ%pHMFU;kTs-IFCgI_3z7GKk{EbQ9go=Vxk5r{`D_ffh$^lS~0(K=%3ey!$4`7
z>bdD)+JE~CJm{NaEFjnUlT7;KzbT&s)*!wUO7Vv?|MR}i0iM@+Ap?@%Ir0bF{<<4t
zdEgq{yDJ9&`WK&JO`ngOt*4QG7zuye12pM^YS_o?pt$Z}{KGHYKneJD+~WS3WHwm-
z-(TV2MI_PjTCd-w^agj64(cI;K7-(Q>A1n#2-v_lDjgK&``5qtFH<0tw#w*e$6&r>
z-?e=??-p4_o#4ZLG1omjB9HXG6qixpQ*F$`KSuhF%!_T0@PJ(XjuU%d?%5$7nMdAU
zx5tA^d|)T1Y_x^+4g3^l>GQYf4G0HvLqobGI+h%Z9XpqtJXaP|FLnoyPh8Ur8(5yx
z3-g2_PWXgmOjx%+en=s}chaWMyYa_A{>SgIh8ReWzFPv)`j5Z-;n%ok_;QNRha-P}
z@2?MF;>W`adp+n366&8HjWvZYHy<C{XiL95+W#>yv*EOa{J&X7cs6|5l*Mkt)B5?*
zFXSQj&+ov<w!z8%{9UFxZQzycyP_Lk#7|$pzy|#XbVb`K{>=_AaWj2vXn(Q$r{8b9
z{DTzWI^5~>B-;2Z>nqHnCYvxe@7wmzHv+fzm<arU<KyN3Y8l!jg!5T~)_?tr%ie^0
zjSs2as5$+AwT%Cg^G`JMze3K+oVc2rn)FNWBJ%|xBy`};ojVUqLa3!t`3?7!kZ}G%
z%>8DN4!A9dW0tnew#BhW(kFFta*|h2u<GtVNCInwM>0J)5b<^bM_XrnNT1z&r;<QL
z!~WL|vckfZaf@q{OHHq$86CmCL!+HcE~bzC1)asmVJSKIiK=!zY9Ye`!RZv^2`kPz
z2garpvh~B}yPBB%$&|w)ihR+YhKm56p&Opn5yQh=qqs!f?iKOa<TBgHIVDkxGRaYn
zynio^H&e+5H8;HaE2%r|c(sFimo7b<cWom>xq6Wi#4ZBXlEuf8f}%?9jU;%uCaSv1
zFAv4^hO(l`qVUfkkuQ#?&yX|upTxPiJD9{R-tqBiO3B_?k>Fu-k8E<T$<c%z!@60_
zr%Z^COpSh1q6E$OxFEiou3uEKb)GrZF|=F79*5`CJzv_7hbd>TGmVei>5{Ou{%Ga!
znjL*&jU&5>v?SX~Uy;r*3)(8rk-<}LD9<Ruoh`fK^rXcn1+t%qKJ~B{Y2$73SGsC&
zw@!)OVGMn%x8NbiK0kRvMD`*Zn;Q{oFRwAGa=M7;so;G53Bp(hF^!p}H@@?7Nqo1-
z?(LH-8l$d7mA3vypmW6RWX$miDUXK-DJu!QRt-5yaf2@^DvG=1%+^db%j7NAB1Dr}
z%d%k>@mrS~x44_QO5DQwmnwzr927tmi+xg{NodTK@$oRnC<pg8s?xtq*y3={RFP!9
zQG(`%*frA0=_I?Ra3ujkwUsp&d^eNLk{O)`MQ4*3J5l)z>jJX#O9M#&8I%7N9PJ^m
z?)&rOwo>dhMB<}72a0tzU5wSR-l?317oKRfY!@2GlwDx?K?e1XuYtYr^1Tih3pRW^
zCPaQ{NU*NK^PuM{)e7mF+6~F36DP|wq)Uo6p(&UbXJEfm)EvNYg+zoieJLzOO{!bh
zDvERXuk9N(&6aT@TAps+u7r9cb$Otx2Mj}*u;Di`QsS&;CO23s{gtx2itt@}MtUDG
z;=;PyL^b44cEiL4mzvz?90*E=;HJYt%Ojr%rqAaqow-RQ`8f9!9@?eJf!b9ap4Hxk
zhcU+Rwem|ybW$r89#Ay4+UkkDny-|5-#Jl^Z{mD@?d8vONqbQ!o|hhIH8bKfJwD+D
zo^#qtpSU^B;FU?Zn?AmdrlucgfJKp&k`2=VBC!Px4v8}RA2*-xiYA(7`AjjVRVY83
zQN6Cq|8O0HYVZJ6zBvjeWkQ_o4e_R^e0)dZcg<%g+Z0>b$CB*NMnx6x>JGTA!Fe4;
z=3(`L#-PN~eFsyWFP8l|Lqw?U(T)b$aP_)45h}U5YKA>1o5zN0!X4-lh%32adGOnb
zW*;@*zpWAQ^p>H1*l<?pvx#R)4Y7-l23230>#LyM$0F#11|hivd$3GgxOeZ~dB`wV
zKEZc<I{bRVIX5I+iFE1^aWv5)X;)~50h%@#1;vyV)yIIrx87H0VxmMk3v<*CRMg$P
zU1~;>MbTqyTU$|4VHWt=5p6YueQ>z!aXu>4c$hZ?Puv|1(v8f_NSSeG2&Z>UdoeTO
zWuM7XAxC?}K`JoS9&}X`kqxSaw~@<U7Y{MXcNHYa-@4e^lG(=Eg?6m|N^q)9v&Yo&
z38M)gwV&5{a3uVlMCz7n2Di*@Y_cp8ti1hL3SQPZoBm`j{KhGDA;4mG7NL8v=N=Mh
zyYDf~+ZD!Amx(2vtl(7|rMe+DX`eSg-Z$><<iIwEK|57<*vvk&5lY2#_W1SJ$$YtY
zhOP!)i}CRdP8u7(kgo6|?sdWQb~JK51d25S%;y)`ZkLr(-$y-@Ac|IqPxQEUuY|&4
zgd)G?4;21mZyf%x5!{m)HB31h(+7UD-MO7v;*xxP<kZ~Slm_W>4nzyBTxgf!aS~f<
z^lZ^Q8=siPpH$@VlX%_~)k}UD;|8b1ZU*Ks1}kyJ-;Qsc?8C!>#R<%VpAtPdjeQpP
zx+6Qs7N3osU3ytGzUZb?``s(sQA`w^H8B`VnQ0ow!qYKj;_l`lWq&dGFFW314x?T5
zTB~@aqPrP8Cx>?a!jwQv`l=6(p3@?@r|&Vy3w!UGJi~)2=kynQ%H`p(t4Sca;CViZ
z!0}LrBhV7H?7+8X^C#u2_;S<Hx;gq@;1s|@c&KZfW=&2nBhJ$?Zts#YzpJXMnwNC?
z2-=YnPAh@@aedf%)D0VmnS2&;xF~LR$Jo%0hV!~ayRN+E>wa~1)mqCqCZG%{PMZY?
zw$x7k7oJbtonW!D*T&KOF8?C9@L6$XJTS;)N#+Q-NM{~CnyIWF5<1D9t=3_O^m?YZ
zn<Tw);7*Zf-QRcKTZv1&gnj+Se!EQNGr&<jvx@!ra?E7xa~1wdGdt<%5_Y}m-@_Y(
z0koxrr4LZswX1VBC-;pZk-2M<$_p9ZRC6hz9Y4tM_YWMwu+DDn*mIbEa#<42!%vdL
zR=Dfs?%{<&-I0(Ply2wgN<Hl~iHM!=%ehF5W;oT|hes(nF|c97b?{`aT;<YS9)P)Y
z8R?h1%hI86(*tmZL}*ClwirS?ZF4$|g6xmX$=V}7Sz~TW3AXQs1bB`Kh<CI8%qa}M
zHqqYARKA}~ST%gRV8Y#$KBVHdLB=6r|0QU5-VVJ?x>^8#$*+6PTC(a1OwP-dH#M1=
z%<}guiPA7}Q9^l3_#=2r<>#P8B1-MjcW2x!&MH;9_H&&jf`R~>4$;)SPfyYzVLdb5
zsmI`3t<9a?c<O4}Gr(mnhbpzMSEfe&ZCO$IyBY+zrX{3M-g8a^3>d?J_n;eZVPfY<
z$0ZI!lC>Ogz%~Dxs8Mn`h~#h|M-&3!g?`(`z?7xc>jWfaNB-LV;IzW1%VF;QLJ|@T
zRaI5(&hATS@`{YZ4ekT)N%T?7#T~pwaq=CXowxzGV5{8+E18ZPu2Vv;6vgwE2W(fL
zN{COwpoW`NnrAY`%wag$)mFItrD^x)#Z{DH&lep;H@q4!pu|95^+k}FF!Aoi+`oTc
zACd0aEG#Vg1_lPQ_#TdrCkNx9Em|0`hNHLj0IzO?s?P!ba-_$t+$1SQV{Qd-@t|L`
zLGf`@V~wZ#Cy{s72hiSV0Mdvu(=w4wSr$h-=EWS=&gJ1~@S<AX1&6>6CucWn^HcBi
z8x?^J1^;H*BrZidhn9lQfYS;gE{BCHtNC<8i1T6o*@p58TMSq9rGv4FoWrM-S7AI4
zre~GGfH+#gwhQ+!11h9uZ1Xdzd6+;kmp+>F{%!c-iPFvGLdBCluWuoP7So~7t;Kl_
z+RU~FSr%V26u7$BeZl_ig=9s74m1*E>H-p681ow@d~lq}+(C&h?hZl*-E%JEbFqFA
z4zF}NcN8zYvCIiRMH%;E7w}d<RHR_1eeY#tbO|)7W*%DcBSA)vl&i?>7C5iKH;teG
z809oW`%L(bs>C8>6ioW_E%X;ORO1*GMPs=bygs3+acuZ5SW^FBlsegqBJ3B!2(8O>
zn+c6oO*FKPV<yTOS9uF(QY_6|<2G+Q%!JHaYAEr09*#^kGRvr>&Kox`AYcG`3#d6>
zV+xfhq)?-9t{`n_$=OP~_Mwi&>A6mrytZfNa7M9fq2B@dRUhHiRKI6>XkH%Ed60v5
z=s!;3dmK6U$ag>lT4l;*Bp2V&nRfsal|%0OQTrH7;<zbPvharAg+^M!>Yn^*-+6CU
z>z=IOJ}P$V4wxwNcx7J&OH)ppVNm|KBs@YNkR;zE$681P2j5e#B0Ami%pNelHYO1|
zFgjn>9WzDznylWumm<1-kfS3aq24b$`ogYwDkObn_I>Sq2O^StD1)*_@<&oh3Py8u
z-im(6FFc%LoDlO+u841hC4UTN2J+gCeaLUua~O)5s1hp9NYg$h`n?ulrp($#=F20f
zyx%1tvky(7Ie?BGBCqSkC(NRzjdiOv3^Jz%^rT>KYZy@F0kw7Ks>Wd!&L=YV<!D`i
zcThY$6n$TtWnkGC5H|#qbJ|T3)W-m%xm%}eQNxf+Hi%cfqPO?n2rV5Q5ufY;I-SfT
z(sMk53hbj&-rcn`Fz>??COhTpOCqE~=I5eq`1s=RSl(dd?XsoLx#GqFUqGb347tdG
z{OVE47BjhbP#4AsW0IC__4xfAE%N#wrbA1FVu|SN5Y)vmzR2COQV*jw8=WI>9f>(Y
ze@$}Z+u4NMgUE;sQxi62CS2B#_{EU@UjPu+9YjNtv7H6rDv|j`Cl)~p?bc-wvN005
zYpYA1kavEH-K}(B<}b<lBdrFvxlg$+Bq%v^I(<IR_{z+u#;q1+qU?sT3X`vghx*HF
z+>}oHm5o%s$V9spZw~O&DpC_3V{!N%L-wiAyUzl1F>;OVpx2`Da`R~;g{~=RK=qWI
zzadEoy6l9r1ymCgt}UjmX@~p91v-FMb2ravW19&KJY#+07IR#3DnL0(e1=NWhOjPZ
zA1DWm$VckJ;?6<~b8H_QDB@7Vt9{48GZ0DvpUawfFS~bz`oo#laDZ~;-36SgK>s*+
z+Z7lC;xSh^Aig!M%{;fG3`US>&cDfk2pbu@P>e1V)R2mPPM0<`MM}uZV3-{5$_njK
z1eTgVDK2n(ZJ4Cw9X~&%5~<QS8iiUk^W%jd=qMM5h#i<1N`zz~w@FdF+<zeVb1zIC
zrV;G0X?W@U*KQl-XFc9rJ0(;f5=j8Iz?D{Dbk&vxm_2e+T~jwZ=6$%uGbnp4hf)#Y
zl|?NhQa!549TWQuS{l)!?(5OXx|q_wz6-ZxwlwKkd7ljCJXK{1@Lt=I!JV&SoWI-;
zBLx&T2bh*@)oPac=1FIlY3TsKdv23lYO5_f;4*nT{q@0DH**X7x%HtRaC=hnQ@K|n
zOV`RvmS2_Jgc3E!Dz1X94TWGVFc(h&4Q+*LBv-)QU@!CLN~P^woz3-xJ{hrHAf5H2
zd<0EQi3URK^borsZrrEA>8s42G3&ID+v3!7C@do*qc9)=97GiLFM`Mdve%;^3l8wg
z<Ap&N?*8Q5P6zYuiXm|;@7hquvC~BHhYIqABS$n#7S$0$hO6Nwj1miNY^e^f1=^yA
z{;I%qP<<Xkh22}tn>t4o_;{}Xa>HBu<?AFogh*rW`!oaxtsLfxogtE~tUHxC0$l{D
z9|{2VQlMyDHEdCB9){saIM+fY&~Bdm!N?rbmkmBOCBhKPx+=qswu5{c*g@yp`>+Et
zS(5n&0+$g^b!YKCY~UeFS|3fIQas^);3kaUNa$y12WBmU_VAU5bCvXZFvM=6Da|GO
zODCv-tQq-2W5i5kTxzW@gQl9!gQTiU_3teTxc6(NGa=KhyWwBA!89tTR+y|+UU+Tl
ztAt9g+G#4^wVIvg%w0HFG#;X&8eEv!tZkaDKBDBh*zi)6n&(Vw^U7o_pzQHEjp%eG
z%Fh|wgupnOZWx~ux;g^`gLg;(h2*UGh;?v$ydP%~?~bNY)<C5!8VxDn6Yj;oS4~iP
zHc;SWCIV7N7+P^zNzB*mzJlP8gJp77y<9g)M9T#yQapIK)mHY|ET<!c`a$M73A-Gr
zMl5VAC+c7j4;fF%@)wWX%KYBOP=uQU9E+!84!n8PNKs-3^p&Hk&AK-m%y4(4KO|+z
z)Y4nPY3Y5J#ry|gL)FK&YDI4k_uk3(lGyFaTmBd1Kq}^p(THv{&>S68swM@kEiTJV
ziL?-j#Cx+TMw|w$EW_lX@3vpLhK_lY>cl6Ynq>Lv69v#<=}xwJU)aKHqeX+AaSI6i
zZuh9AtK;yJ%ft=Fo1jP!Akz!DOF)<3ra7<;Xwu>q#qCI$(U~D7R}o}*A$QjlK<Ily
zgG=$3RExBNvHYZZ&#aNO)!{TwnJ4OLF!M1kWk{~0`QVEIaBk*vsn<T8K_fpxN*(YH
zTUS)Yac4+KPnHfOXTBw7TzDa4IK78KUjXzqw5QSB?Ddl@zL;h}+Y7pmxNBw1Ze(m_
z($95QGfW0<9B5$sE2iTaFpAoBbwKWLm<Rj*)Z^4Cm~0kZ%)jLd;#A1R$IdzcEEreA
zEfL{Wt}Tud6S(a+{amJ_x<S4rJDw$#WR;ezxegqgH$me%Nz)B>r9z58@!ZO|tE(0a
zSeGVctW1#N3%_4zuxS`<vdhDwvj8D`8HiqSjYn}xcsM>vdzQ;jXz$vmmm+*zezPWM
zx|~-Q46Gx=4njC)8d?vQd>5bz4@7ZV-&7l=l0JxxY10B_UbeP6zLq04%PB!0_xg-g
zB^5u0bJ9LxSVgmjTL94b9u|D2@0vf3*v3X5Xg14+%~!BX1ux#SbsiAydc19+aJJo)
z*~4pfv}=`nrqe#T#nNReh^{-VWObxuTiDWv0Zt0vd(XpdF%nSlFRPV)?)e?ds&NF@
z#w>p*mfyt=90*Tn^UNL_KwPwEU|F0p^}~y7zgjuI8^)e->rUC<a8qI^h-5>f+9F5=
zPFoCa#V0&OCd)uWY035e=v}xhk<ZXQ!jG>A;3t<sEGn)+zy9{^XIF+b>Bj<wRpbC9
z^)#3dEW=~BaRJ+%2c;8g1xWxlEXUy7CGb1V^JRL;75O$jAl+vU1ADtcpeEgX(4!W-
z9cY@rlj4TFm53r~-TtEKdS=B1uiE878OcO>mUfu`#U*pJ@`Py3h}-6E@$x%NY${J2
zcU>KDmFR8BNCj{?z8S5ou+`}}eb3#tkN863+lo&zL{P`kz;)}nFBd3oae8ULSC5%K
z>Jr+v#lb+DCLFZF;6EYRyP4x6$^0E(-&n;`FBv&=un`*$^UZ&0&MnSn)*vMw1r^o}
z$X8pTwJfP%ylC9n^49Y(D2dax+bRV8i+9eNKkN2jR{RLQm~E=P88}-rsdUU?z^6P=
z7|0XAl70^_e0~h@Vpz?x-W}g6If*bUjL=N$&d`JUK`Hx&pw2_R3keAcNL;`jY^3v5
z<8Z%W6GpDHMga-OLAIUNGD(OjgRVj?B7PqqA9wH^46676RZsOIMx~v~Z+DV-YI0lA
zsFvY1KR2a^i^U3PUif9OS8rs_7-I0H2rq7%->+BrYzOZNz4vCB3eUihF<M4?G`0h_
z<C4*`7=zFZ2rmt`enbrEgT_lMU}un~itCBiwQ5&7xJ(f@Tu~}l&>cGLc>a;$*oVBU
zVP5y`5GJI%Z_^_*27@^p_laa5I1HmW%Ak_HbHb9@<Gz6#$2Fupj9GEsc)&Y*uisjt
zGta8TmShdRYyFKamA&OA(np}b$njNz%xGgkrFc#0V<L&uHqV7nqGW{AXF+bQ#dgZN
zVSTU%FDu6qNyzNJs)pcn8BH%Su1zwQGkfkj*gKF`G^-8ZVB&`I9~SBZiz$7DKRtT6
zf}pc$f$?o9wggSr8W?O17xzaB>7#2C+t=zQCGMzR*3i;-&mS--8n>?&E$dq8=xSOK
zOI-pyAhSiE?{=xe_z8xK)Rur+*;2lR-b|TDbf(wjYF~3u*Yc-)w>mH?1z83g{X`6V
zcYjYkY#cr{gdG{pzHz==s!X)kj})R_L{^Ei@6xi2g{%~;v|rc4Z0B^CwHC)S32JdW
zXn)Oqt+c~=0~BHNM-O-3q!=Dk5S<-C<VqQ`|5|$^?a~2&Zrcj8bnUAvzFU+dp`tL*
zkh8(<UbWlGY;MU`7#e0%ZYc(sz4{|5xuI6cnqP2(Ga!{4-2->FRn^EM0$)_Sr3+-3
z2UIX-4)I$VoHBnj2qvC6o&0e8t<#0V!hBo|SO3Au{>gd5l0F4e8daaJdk!`^Sx?4k
zV+uowFL(WJmaHw6Shn8>xDwXo+O=h+evZ@&Oe_-i;6b?0hqjM=v&lZ+nTsWGN`;2i
zbTB;avJUjz3`foz0TsVm>x@LStoEpUK0niLQ7~EtF}e1M6~vC^gh|R3ytM_eCo=Y>
zM!j!Ve)+T?U}yD#spS+g7D`Jg3P?Iq!L~M4bVgZJyS!xrM?p+-@4iRjl)qIrd%4cd
zQg$^ZfK^`ToM0MiIFk^xj8xW%yZGMR{-ha{3y)ImUpE6L&aPCjlN)2j76+sBJFd2j
zCg}5B$ZXQkX)0n-Y|evLaHB$@(MRcs;NP*sWDd-&IL>Nwa1<2y8Lj~~HrGhX*42tN
zWJq}E;^N_=OVAlY3|2$v<dUW9(tG{PH-aRlDcKqw<2|{iej-7lb{4l(olMdW_ZvY7
zAZ2+F(d6L9$2Tq^f=)y>3*M=D{n$Gb2&IG-mjp;3{7A7`yT(mD$*!+^?qbNpfWLGE
zHl|+47(nR_`^dI=2^=$-?3V4`yCxRuJS8Y4=JIM|Y)7Gea~qAWjY{oqpX#$I;DqbL
zwDYUu4oG;6Gb#DhTqbmhbMCraipO6}=Zdt-38b*P41}!?g_VJw#^0+m`5b5+f)5Gu
z9eEIritVEEqh5OF=W>Vz3)Vr)&GK!c`)72|pFht$E3OB(wRC5nM9E@PniOe!g(>(e
zX+bc?#C2Mxn_Y6@M0s{0NQ&;QmiE&9VHT-<bKe!9?Np+C*a?PFYQM7Es!so4J0UV?
zUOWqfaVLup=~3;Ax^a)$TYLHN=qGbaN1d~aR)B5f4pqdFW4cx*br3*3JC}L*4ciur
zd-~z_4W5$A-DF);ajuJT0a2v};p~d7dElQ*i&b$ccAs$Mk<g*n@WzXzrA;jAA-^~o
zpX?*p9&i0Yi*z$g&h1Yy^`p27(~a{Hr%cUd4S1!Nzf9(<M+L%wvqVWWXTKty_IV$H
zMS~!x8i+&Dje&+6VyxbW1#!ECEf#nY$ku`Hv?XD*1I<P>)C9p~TjcChT1UM12H&xe
zo(7JOSAYgRL2_hSt`vdtD<hZLr7~A6y<Pl}z|pC;T9=$1pNwr&z=q#iE><(pGpUB?
z^72zBxz!h%jN?M6dot8J&hWjaVqZ!IMriOW0nZFkIolB!w&~iQ2cclDb0BsDKihmo
zki^vE+N?9@g=Q*^Y=K=M8X(jtz6x@dZGY|u>yBdU=CGjPU{Wi~W<AJ>Wa@ctNAt&-
zcxizxpFeZp!ov{Zp6+V6`pw9?dfc`5uDaC|g_2an?E7khcZ?Xw4g{7gU(^Cmam0mz
zeX7F$WtlsP<D!$3tJo7xj{uW2=wtJIBlteK*ABXbpLgFo$wOr6)Jtz9Mjy9Q(OYPe
zrnR<B^s_QiA|mvt;wYWR9lJ<Ik~0;nuVy)Xgvb~Pblm87m^by-Xp090cCo+l7Gvc)
z5Ig*PDr5a${Mlg>VppZP@#T2@#iZhazxAN%eZtqSkOk4;4ILt@^S$MuEV|~#%(I&`
zy1}?kaI_??syHE0GXm{k3HVo1i)Pz>N9@5?rTc1;Bfh-Z;cP>&`-l*u<NY{vg*Hf&
zUR96V#LDOd)>j#1J5+>(jh<g82w<t~d3Ry5tHu;UDXF_cvLR5CqbnmSR5c{d$E5C9
z$?^ia6N+saYL`PHJmov^vXGm2eFMLzf$ESfb+NSr9i(&d@2NWV6de;g&ap)jT%Wj?
zdqWVTGT^d&6w}EAqCL0IE*l`By<t6a_O^=4?})jQ{TGLU3)V=j7GHDce%OJKyGNMc
zcpJp|kx(&}L1|pm3>Z9~fM0RBN=2h)0F3jrmy>!yW-u#4(*wdEB85vG<J*vlKK-4r
z^nz0-KTv=@NCJ>L@9)@|L1;5zh85u37F$6yQ@PBSGkPF_e&*DErPV$q&aPXU!ERY#
z^3_f_05wY=e9$~BdU{!6!S14i61DDx0-CXU1G2Yt$!WP1Py)qKP8rr_G7PB3bj{%D
z(}kJZeR&56aw8KEdhh`7&%7N5Ue6yxGJ_8ZY3f6+X~cvM(E6O`+gpmMU6y+pJprqX
zCpn{PXZLn6Yq(^LD!s(`+!|_T07LsG1OX3Ll&X*_sHf4Yd8^v&)HMpWI*yDwE(_|M
zj&0NdeTtZc-O#<W4U#1@cP13YeD`m0PSg}L;AUu|DnE{B>a#@r&hV=73;P~-1T6<0
zEMm{pm`AHy#o?fUk@QaUZ6Y#R3fZ6@7XLc?wCHP<b+zDQ>Md@iX2+|HDt@ACrTqqx
zSc0wRZoD*y0mzqFqbdASw4Yu$(O`7@usCO4I1-LdPp!AKAVI78sEdHmo98(Tc5JvE
zaC%cy!O;<+iu5X&S*7||L$zodpw~{!q`4N&l(RnTg#hJt@He!&G<UFXsqD@7cJcA`
z)hLs1O6nL=Ja#sB5zQne4RUj<O~3lVfv6A51lA;b2Jl&`4bHx?8M;}@z}eb~*!t<8
zo|tai+5|v+y27g?G$r%L&NlCw(<YUr>QGirJ@EedNtA#WAtlr>#gUnx+Kc-_6zxtS
z&$#bw*iv@n+0`$B-H2%fF>c#eQMvWSd6m>UH$!aO_f>caK~C2h@nsj)wb{l`h(3AM
zrNovpp@K$>nR}pGkLxw$_^?k(p_A2cSxff0o-x}5%zD&_9j}NgSnZWujnZ#5YETf`
zI_@+eTJ9@4M%z;n3jK9OYUwpmc|AQnRu(*HFTsDg52D6zi?yk6QiRb8lV0v53zU)0
zc^qFLKu3by`aE3C?uWXzZRu}CMs5l;Yq3PTS6i!GX7rX_D)6d)U&(a;HcGH*(@!~l
zUW}B4yK#Wj6{e4uQg<CXLWHLC4w6A4e^UoL5m`Kqtf&P?Cpr7QikiK!8y9x}VEI1F
zinnNTj2Fo#mA^Ae+UxI#J1)aX#M@VVl{lCcm&Ftum1juQWrTLe7xqKVrPXGq&`r25
zqC+ByT&p}DPLK=bT(c#oJ&wz|TsuE+-#=c4mOpaCE+7?TUFW&>D%0W&nZuqD;ylW-
zPDMCVLZbeWS(D7UG_>%D@+OePm_sML@I6k1tG|#**MDStBb#`xbu-$r5gx_EV_brC
z=?9`LiZ50dG{@Rw!zD-v=cefzN6@xRsRd+R`bJd*caS97o&Mg))eD-B?Mg4FrMz1W
zd-RCV$~yr4p1XH|>`Zmh;l@c3?S(F~R$gJ@)43k={0&aTO&XPg<hPbY$Ck2AKC5nK
z{$20<_!r(n<l^2ta<N0=;KB=b#ZfQjTNmo84}lS{dRGDe$!-Pq59deF>|k08IJxez
zk|qj{n^^4^?9>d>;x#A9BFVVbFGyjZRX;={9Beq$(fh3&al<m>*Gowtnum|JyERoR
zOZo0r{F{zuTT%H*8-#1tjxUZ-RR#4kQV^NEc{qc8ixa-HbpxAw*1o#^trT;!t*u(O
zfRs?z0X<I)Q1YjYF^;%3oN>knLovybM*HwG`HS1w%*%c&PXc)o(3%;3Dv7L%#cCgp
z-06Mk&>Qw*?aPr0=W@5ohqtX}$^2C{R*6PV#0Nk;%u{P`!gW*LI^qT8X8DyZ45oVr
z`Gc4C)FxR7X0tbQyFDBjOYlJv0@xh5)3(JW5j>1IF%*=LPBrRfAMw&S5(Yym+-XUz
z$!Q;8lM=+O)+*2cL$s9a(q3qCvLs_tw#Q}FaAY?)^<aYB_ureNW~jhevz#+gve1qo
zL%Sk(sIS^*vqTAXc<SLQZZc4}S0fd!p3dIfC!4xoq&EodJiHNc`MNma2~y1MbDgS&
z_n0V!a}WEju8T=m_b|0Kcs+DP(@<oWz?6^E-8V<hgb0Q15XI{fR?nM=uuW?`q}Qd4
zbi>u3Rab<bw)8?ZpE-O$ry6S;GeZcU;0b{<8ynkj>{JB|G1XQR7EE}@`WXgB6J-(4
zRv6uM{0u$5D4g#=4*8E`wUAq+lnPg=W8Y%iBMDR=L_X^k;WK|G>SSviSqkH&yVu1x
z_s;-Q0m`FHAhBTf7Y5(sczD~<Fq3oas|B+rv+^^=!~EbCUOf^Kf!iJFUZt#5_(Z`W
zaaCgMFB+8GD*S1Y8EPH39~=M52Tem_#uN@a37mT{?}?TwmY_B`d+jX8-pXbWb)OSF
zT<{&u(34e*NC%0f$~ctt@?ugjqAP?;M@eyEWoBOaQoXA%<wJ$FwdwOS!RMg{vzkq4
zJM{)CRE@Ss2-t+f5oS?Ldofzr;A<8f@o(=*iU^w@2f9{w2@qp2W4b+4?}2HOsG`^2
z{an4MI*Wnc8--Ig;o3m&J>Gh;3%1OQDe`dckt!fsR>R@D8X6@vdR)_17)%&d5b$Tc
zBOvPhg$ngLT-bqC*g;Y<9YbnV7iBZg7becB`Psq5B$s}fH@=Q0&3D?BPt24gHMsUX
zp{gqUUJGzVihV6kQ@~fotZ6nvb?K7!Xoe&u>bnRpu!71AC>-B`5Vm5ugq40unP4{F
zgJD;@of5|QkErnzJ9X#@6IYYfQ?6Z7tB)0YMloV)N32v}>Lp_%?2@F*4NUPv4gCIr
z_Yz3;BP5QHj*r<94+j(Zguprxx(;NcJILNJ-H;496xrd>(AhTFX1iH_w6mqgp*>H?
z!=Z<@wZZqHBMR9Pl08y@XSX%Za%1r2SnDiW*?IEcmnv~<nNP%sJdf>aaFuHc7o=us
zY}j&72lYWe6?`8gRJ$Pv{T`MgZE?0KeA5j>H^Xd`>#t**2I^ybF9`6(g=DLZpHL+5
zUh8_3ab_}aUP&x5IdI>xkj61R^s}Dk1UgO%9{@??zo*XpBYSb;PHe)e9d&8ZO_#(5
z_j^~Xl@&xawI|q`RYkMo?=Q*^8OwBtqY*S=-)RUQ)pn@R!|{?2vQ4P!ItfhN^ZZyF
zzVp<~(9B@Q@Xb)IxP9U`h9AzMd-m@^*k^K&A!Wma`irz__R*S!B9CQNNwnx!y(H5@
z?%nrfiPzpu(>$8rtL=Vmc^~?<??>!`dgGD3Sf)8lwyIg;TF1#vvH2g59|_9}t9Db4
zV##@e*WBFZ_vC)pj!!i!v+7K!Lgh#H5IBqxlU?no-X(O4OhM9Qw10OtbLc#SjM+@&
z&}q~gc!{uotc5lS9!8p7S#*Y-r#S4s>mu&7C@Y6BpR8LPV_7F^J76g<RI`yv2d3QJ
z1cjTI4vn&pa-`Z@MzIt<VXjJ7oEiSK5Mkk8iP{%{4R;4ig&VB~@JC{TsTxjE*uu9S
zm3qIkAM`?_8=2p&FKfh3?)}7pru=_MJ6Npv?`-{tfBElX!Fv4f$^MrN{!0dk@$&yy
z(dGZO*b9mpYo6kFq1Et%2*i<Hck?s<Dhv7XKh=gYR#jJX;e~m55lNwHZ7=|k@S4ZG
z1SFwxg~TvO%P2n!(%Xd(6VL-cgruN7YL&Cv_J9>p_ZSPwCiX?>0II#K8^V_l048V(
z&>8U=NTLGadcx0Ovhl*~<y)Yz?IJFA;XzWZ270R=A9xAg@7UGBihvsRodizcy~s~K
zZc?15MSUs06g0+=APVf92j~(7R0X>Uw~W+_$I);?umNiUue5F;cK@!o{;UN{@fjVz
zYRq8*Xjv&A9K6);2?gc>uGL|cugG<6LEEEUhT+*#{IE8mxnuCHJ|q>_GO7Yib0|NN
zhLTPGn+mxX;7f;~VLe0g&kOsm#{Ca~nd&qnjWk)VTbih^xC(k-5ZW#A2^%3Y(MpE0
z8swpYKupDp5j)}g7kEIoUn8OXgM$9P9)xg3Al^j(wQX(~@%{+W_6z3%?M3WoSCj!-
z3I8Az-qjQmpr#|>?w~cxZ|m2)Uf;{c_xJ^IL(*L0k(%nqP;qmi6GO1^w~^xxSA<B5
z2V(l*L~sjJ1D?>A{j&D>@#CCpz7b*e3i9&uYouTmqn_MrUtp%shnI-qd&Dft^b=l3
z!f40HC-%uNQKB_wx?zbRNGR8;b`nB?*aL7`MT6HKqBGwLQm|N(fVA6$*TFB`Ey8+G
zV-6gMh#(3wIRRT3JO=Rx8gbNOS$OUmJa<KC<~Zz{5&?9m8&!oyZ<Hfog_^v5`l{Ss
zu=XP0yy?gRbQAig`jJgIDYz-hd}nWVG-8A9SBwO!St2X=3*j`t33}6FCx0Tv-`9s*
zMQX0P8tLJ9lo9t)DpzR|WfS;>50qiEHx;t?!d|B#0N-#s)Cl!#nd)5OxvN1AEXd76
zg~2>hk(_2^Vv<;A`VVJ@|D$9Z4(%tm1Z7X*B=9z#Ix#T%Ax;a;a(M;KXe~`{SaP5*
z?^Xc;a3#*U!2xcrm@z@SNPI#YB<Mo$L<X2c*$`fL(5?lwVUdg@66lM(AQozz2uYRC
z+1MPCS5(v}{@QxCrkMuX2+yT03H=zcObWn;#ecKm50SF5I9z~ebB(5?8mJm*X@;6n
zOB`n7ZzFfEj6_4E5n1q)NZF7Wa84jD`;r(F5%uv2&h2q77PU>9kSrdN;LOM()<w$i
zH2M8su*D!Qzcg)cg6}h3fn$EH6TS)kV%8=2`d&5`6-;Mg)eFO(Ca@xZq=ozwH>4C$
z->@#h*Y~n9G5%K@ebcN6P5t>~|DS22|FXd!sr7%A4F>H>jjXK?LLNu?pG<fj5JUMl
zrw}9d3n#DZ>N1(M0EE4D2}~F1CcLsiFKDiM%{)VwQrK`r7QB5UhTz4VApv_n?$p`f
zC;WVT@2pZ*i03Q(Rh%>rWZRr6{0sd-%2M#09fD2w2+3#_al?I~w8;rbWzO#NYMZ4d
zpU*peS$ls?-~1VSXP0?ve2C>g0k&KOt7D&|XNn&LtjOcH_NdWxym_a!LO9>3i07SM
zaYBl(uNN(yiP%?f;e<qlrVw+~dY?b_#E_x2Vj5mp-*o7$rZ;;d&g|hrjg8}B=)khL
z;5G)m8f3ILv0{;2il0yDg?N}Q?w%N8yKsn*aU87@0VKH!q7B=HuYxv&pg3c-&CJdT
zE3v?o6B{t_-)D?lMT(eP8r~zKGy&UA8Fsh{P!5tu^BuO%@0C6tqDW>^pp4r84BA%~
z1inY~yN4&-N+*z9`_SGC4JW@+R|Szo)2qod<(cEjs1QE5=Sb<s+vXt9rT$*nG*u^{
ztjCa?{TF9HhEQkj)ZV|({$eS+;<y)-_drn$Pp)SqDlKt@gZ97+ibdR%tQ~{>k5cv6
z0I!AXV?*Le)K+J>@>$q)`xR>>&qK^w_lh#iVzY#mwNmg$)uLUs>hauBT7Ts6?;`c*
zFjx`c-njWJQI)v&ZhpOa4nq(B$ufhCb~S>XhCM-0A3HCvppbpWJQpvleL#NPxaRW~
z)E5(GA-N*KfUl;b7_N^p>~FZHW*cTG*n_1P7Db@KYv6mcK#Vw8F9EsH-6S^rm2pAH
zx<xE}{(*Dra^Se%t62akV<++J0b4G1-i89mRC99H!yDC%;s7^9$;u%HI@k$qSnGzM
z19^+nz3M9p3JOp@=81aCz!AD|U`SO3wuV4NJT|ETS+86%Zol83wc9u5_{sd@)xs(I
z4Gkaq*lVBqe75#I>SCF|9@`EVeU1=cL^zg<uGNdep-150!5v68Gt0zpKyGP;tWJ9m
zkPgbiw}6|<f7yT$S@tX@WLHDeiPE@P`)}x44_6G-H&V{umVXa;>OpeEL&TCigj4h#
zG7<WyDq6(u!ml_79D-he3Zav;AZJ${91suDn#*4-HtF46?a?hxCUJ9zL%DLzHPM3_
ztdQ>8cZ7l)2S+@LNQDzY`3)AO?AOAE#7AwOqvveM1_56v?h3yWoMN~?W|X0!NfFsR
zPlF?I-fl{?z63z#1kNOG+#iFuk(#(ZKHW)YP!Ctr%-KJxI*PSGfeS4?{Yds9bhs!u
z_03?X=pdB{(tGRP^692vB(fVE_vp6sos~Xnl7M!1AjO}7-(2eDjELU_@=F7cQZ;}<
zzZNPgS*oL^Hn8geO>Wpk?3YKW7i8m3m$EZdQf&rtI?WJt2aRm;FgHM>(G_MKfWu?b
z#6XknehU*5ll&?}yfE4=I~f_%KA1WvrM4&1qSBun1nO0LK56fS`>)5>-rdl&|0BP!
zu<*)D+G!Yz^-tu(s|Doq4f?>Ez0SOpJ<q>AK{8J#`Z@~H!=IG*Ao^XP;S_wXOT}iM
z&)=yqk>H%$o7Wy-%ZSc14oC!{qmCYV%wb&CJ_!m63g#_#?L_lx<J=$0sL+~OflDw*
z&vUx592i8{G|U^|7^(@rqBu?Ge;}1`1pD%B;RhsCSx>mvZwL-~J6K5E?DLO0RiAL5
zgK~!Ng8vU6D31cMGx?ligUu~+^NNV<geYCyS)GvA4QAK<hQwFKuA^s=vFnYv*zgA!
z<7yj_i%kj>UaHPLl1)h>{c#VNs8LJ0HsOL{?|oT(FTvC)4JYWC4n15MHw*2u_XH}s
z3jVO@pZ9al7Id&-%lj7io<A3^z4w`cKAb$ilw>vm$$Vd;vo~5{)XxS4BB_95YgqE`
z)k0KH2KC&?9y!Cge-BK4Z^}lckZg61$nUiP_&Z_Ct-IhAwAQ%x^ZD`<bYI?_TR?qf
z-zf<51)8f+!dncLzDEQkI6m?_jujt8x3!Q%Y7ekRNLUUqdt#H?4#vgC$|asVjrvNm
z7ipmTTpXhg0=vHU#%8+JzIgFs+6Dj(;fym+-S_+XD*mv9(lsPN3hjBwK0yZ9U(hT}
zl(B;ju&sFy`_VJ8=Vp<c?fO827-n3^uc7Emam)%evkly13od$F+tNYUt2IPj?%x*V
zKtmt{dA1c^WVQ{qND3}}t-s}_SEw;FNSR|K@+|LSU}3!W2wWIc<q7vngQUN)_6!ky
ze>jOBViT*1xS%(Y|Bodwc<~jEF&&Oo7BoW?YLFj(LWe|OKWqv^UElY&*8#+A^(y*m
zzAeii!R6Q`Bx8(D4&R;yaMiH-e_%{x6~a&!R%3PL0cteu+Y+#_Sg6hWzIc=m|6S~V
zPZp8(|DVz9#9C_s4ev;t!}gD+9PYl#Jp1nIR1QUw4A&LNNcElKxEZA;(yu0W!oV<p
zP>!|#RNe!Nqe84+j{|ok?5f<?AF4*2_u$m)Yc6Y7W`m37K8rg-FGp>1(VUUE-@we<
z<h8}M)vzvCz458fY1!*sMmTT(4^GZEp831b`rmV%C<?Ew|LJe&o}j2h|BCZ55}p4}
zMEot$cE$XIRR3E@kWEZ(`v>v*_ZMG)o67!wQA1<*FT@V0mMReZbpKvT7XG_}pM!{$
zd?qZ1hR^on9zdZjh(Z=XMeP|3rN|F>C>QfO414Y0NaGL)^w>nU^qPX|uZKkJd%Fah
z#e|h7rONM<dH#ju-R}+BF)}^ZwxPAi$jPZNyGA^?N$y;{ljHLU2EnrWe_TVUGFUwq
zKFofZJfmXp`I9O`z*?x)0(U_3;H?aYvzC+$-t@wW$*dOl*NJ|E+r~*iYOE&9*y@m`
z@C)z^POD$+&(IC%(#f|m=1bGtbje@p$Y@S-V0W0lt$FVDPq}P$ujaC|i5(=lT8x6Z
zj<%WxoKbLC`Ou82NW%p{D>esHn`z)5=iDqtVtZZ)iN@h@)LJjdRD{NxMn9?xnU2iS
z7#1@(S%=#f*Pb0OeisN3FAF?FA{+8aX(^(`n|lsFG<9+k%S?RmP3t#OXuU42bJ{GD
zXM1qJsU7$0Y>lfMvjsJz5RD=#b)KCye-SrZUjB0XG-pZfBgTV*j>#6|rFEZOQe0e9
zbqSw1UvB@rl+t`o&k@t%P+O3#Xg(8CJQ6Z?=iSiC`;n?;L%YR0Qw63|N$m@dEsQLt
zM1!DC|1^`-TiY*_5*wU9@U|hciA9YHKHdK9{=I94X{0O=(0#2YE-oH2L_unII>l+e
zYkyze%bgJ)h6}sLZp9&IjdKCte8`h0VMBy&EmAo2N@dILP>cjO)8qQ5*aGHo!|te;
z^tlDLXGFtHwOzC*!1AgCq(V`<tEM6CyK~zYGIM9iMOUt5H;fd_)Dz9yoU9kwh#X79
z!6v=X{_Z#+iEQF2reTpdcc;Ud6jQ#pE@gX!=mJyGUAtV9k?1ht#<~+LE7s<@7=|MI
zPWRLl&FV7l=_}KbH04hX<6Oro-bMEdyu=Ip($e#cXS^)E@I>hS@5ao3%9BS(j=o_|
z8vawo@vY}$FcEw6+MZSl78jp7E#cQ@GGVEwbT;&_2N88AAEddYuer?lneX$*`^M!D
zzByK}PlgLN?z-KYa#`;I&xQfxEr?KT_(Tkwr@i=%uym>7x6W6Pdk2UvK9GK^Ta-|_
zG4DAepy|ftkQX$+^Yrhp;ZlL4Z2}q$Hau)Mgyq|{VCnQx`~PN%@FWnUy8T%j261@9
z;R}xiHGVxT@I*9)xi>T=N#o%Qu|8BF7QT9t$W7!UHNEPJw>RXX&G3c$#^(us-4a5_
z$wENKJ6CqcCkbV5xKM0(Ed+cHnm<C`jG*-*$55w^Yj;x`^_7S@m5?--iSlDc@H-BU
zYDc;GttE*!c<lQ|7muJ|q_(j2%!WQ4DRk2)#crkfe_^Zt>h@Y8+k8kwL|aXhGoT(y
zk|xJ}2VZ9CBmO#a%AcS-g#Umb&4#Be{WszKddftNOG}FQ!M!7Vdp_<uVzhDB4KD&b
z|F{?i(mWPC{>@*-w!pdPA2(*+NXVWmMb6gz@_(M~C7tljE;|#y&nAv4G#d-IEl{$+
z-VpmMt3WDxXlIQ3Mht?(g9uPf*7N_#I@5n4AVlq?s%YQZNSM%GMnnPqeuQ+tivSx)
z^~ae$px&7K5!m$JEq;iM2HMtLUIG)qdwaP7)D^^@N_a3y@uT9nPV1b3!5$w!KN^aj
z7SLvD<KvQ~8;70Ck)ozadic+}t7o$x>UB;-Un{wxFFErkLVmy}-|)Ei5zSv%rfwve
zpD$_Ww|Mleo*Kp}K+vM+;NuOC!2pioPSAm0-wPbi7eTPm%W%w4Wmog;-OqL$7Ac0@
zO`-jtGW&-~M7sld5k2>>FM?p%li1i;@2e0oyM4CTlus_E{&kB;@!btyj)!qZj>>8?
zaKewWahxwust8-wM!Q6yS&`Lh|7_3>YK04a%(NM*KO`ZcmrP&c>*I6PX*<b=Cyc-4
z2y**8&458iNK&nF;REk?i`yKuVOHfuBwX^xL*uRZ`ueU6s&&ive|Td5dslPt;K9tQ
z>1mt9l>(&%wc&XN>K~W++XvF?AWf@ur(Ls+KK$@Qc7t_eBM<ZV{(LBZ#|;l!08(1j
z8x<r>4QE%E0S~Iax$&z!AlS;@dz=Jp;Qy5uDap%6LxwG~Z2l^gPR+Adf$ZiCkfYN~
z`|iY*en1LM+q<Sve2;k~8FQx#AK|s{-d{LkjQA39|8PtG^8=|Da;QGkKRY%eJ8tO`
z^&YOL&EV4ita$0KFke~S4Wr_u>Bs}#fI46Ij6|S5Y3Nq%hRv0w{)lDw;qoslWkFUt
ze`ezsA<DE0QKmn?2&^I)Zt_OtX;KvZvyJCU@nXL+N2cIM7~vqHItnCL6~OhqbAvY*
z$}EMxhWh@<T6OvG>JbeR3Dr`g=QSPve~SPuCRj1SbzAxkX^#mpIxM1op}o8CC>bZ7
z4eKd9ka<(R9xW~tR3!(t%h+7}a5#qi1@@z#>+<=00Lf|aptpF0U{{fHp27e<CxTQe
z+<3$Qt{MKWovS_ZIDpYfy|Yk_GY01P5TsRag^2^(aRBfjyc<EisVtt&PE$aDQK4JE
zC|$`lOF01FLn_=huFQTAXtSpcfYCBCA}&w(Z@_8bGibnmwN<pEg)tJh@yB!U7tfsY
z5oneL@Cu!dxYmn-y7d47@)+isdjc{G*UZ9w&M<H^z^fjD9@>zN1gjQOyy#AC+3SLl
zFhBQL&aL!Lsms!6AOh_pK!sqZ)&p1{U=0|@O#|dl){qkg!sJL-n}vz?hJD+#Ht>Zz
zqM)1+ZOC*UhhS15I3+`{NaKW;03;w@v@&X`v@~Ags(mp)T|2)Dsf@HiY}JJDk&W~$
zP9Lexd-ncs&6SDIVE++!2y9e_FaDl&HQ0{;iKSq*OEW4ee`kO_47rWFu}y>n5TYFD
z#N2m4{QUxAl`a>tuX^zb3qJ;3#Uxp5v@zRffl=2$-BPA#K{-Lk$fyJ<B@&%G1zWR9
zC1KLK&AVeXF$nTEH=tBjS}DwR*qvQ$2wQ!yI^ii9jUa0sQ{L<13ZP4IfKrmHf4R@f
z&+&{y=TvAQf>f3T=$xckI_%r+07f+y5Zexn7EZmbd@Ec;YuDv$CjzL-^-EoAs}Jp_
z_O)aH1Y`94#0tPKS|5<hL^4(0lIMHI5ApSA;xQf|*ic4?Bl5V)(i7p6p@#ZBFDUdw
z0Mc8y1Asz@fZSyf<gEJon3L~2hhpfC>{_xx(1ZamH@B@ZVtF;^y+sg$Xx!6Q@+BZG
z8jw;_jJA<Civaj?J)nc$dn-QoA!lqVC%1^ZsCW0cte;aK%lrJdNI}OT3fkSfQ`$4w
zF_N|u#<8E4Hlpp)bi_UCT)2Dp?rEy_=j>Cl7u7le7F6Y?S<kcB!()AbOQBactO!78
zHUQqrGbp~)Z;0StWZY1?6DKi~Fa)S4XZ-8W)5J#kif}sju`n7hxWFO-fU5za-iN0R
zA3sdvN4)JTARO&cis4g0<pKIT2mc6x)1G)0&H(Bd2B9}PAK!|O%S+DxANJlmp6mAg
z<BljPMOMm+iin1tkq9MwW*3F*y?2xmTC%dUK31}_qJfgVBO|hR_P&o(_5EJg_qslH
z-~Zl^-#;G8XS~mKoX2^b$MJl<Ip9x2tZN_E5oK`p-WhehjcO0snpF^4c!ZF0(FLp`
zI>AmY2^hqnUVDy^F4DoXi0s+~V5*O~AR(|=BEtiT2{T|gc9m5{|Fm}h*p7)uqG8yU
zPC37P1d0ULb{jSF+KmI@UJ4;bo2U$)J#5X10?PjA;AwY96|O;r8pI&7)l`{ynY%I+
zxl~eOoK?z=n1;$rd}|Xq%^)P8>g^(Pe)M@bqt#*ZIYb>rwn;Fnl?c{_$@aQ<E#Mu#
zyLc%Yt3H<@qGGw`;FpS&8(b`0o6=X+v=sA?x(*z+{LJ>v34;s#lQ2_n1I9@j&4gxv
z1BcOWj7TK&W{&D?4(c?C!b-in1R$l|it8Nh&oU{$80?(_u(()kk^tg#%}2DCBn``2
z0$?Y3p<SnJObC%`n&a-#o}bGjz^RZCw5AzY7~(18bM4zn*Dw%mUqeK&FSA?|CuVeX
z^o^^9EY`Czao9aLQVKLsd_M>#uVt#tc@n6Gqd@I9PMaYZF#*hum<5y9`A_V{CQblx
z;vaWNg5OU8xLV&d!_;aV7jgMk0W`>I{RXKS2yM;=7rZVPFz5kvo7gm~gts%pH%0z_
z--7niDcg!{^My*_ri2NBsi4F;gU)u7Fwt{a^@Xsl)-yG*lZM;tfV9&tHY-P<P2PGp
zBUYG8FpI<mM)6NOT=YMn*bX#WHD#cnO^pK56|=eu$fE~nj*GdyeN(!@sy*v(15CL(
zf$Q_b0K+q@r&|Wz&J~yucCD_!8fk(IjBe*D87Y!}$Bl8F3#@n)ln82BA+HCJK<#>C
z`<=LWyO3PVi`CKF|GH_TAXX1FX1=XppcR%1wMOIF@$yqI;GvS5WkCyU4zZSGPP=^F
z+YA<QUN<zlu=3ZwQY+;r*8q&1?j@PoeIyZlpn}ni$f@&Py_@;G3M{znv$Ea_4C}do
z`9HOsQ%Ry4B5_@e4M}6vkG}!{Q?nkBkTGl2uJes2$HI-j084um=#5fUU^LQi2RnZm
zXo4Mm02$c%5;wHoEbv%%3Xz;k-*<SmxUO&h^JA=sgl<%T6r@+LLNYT-H7R@wVX-KR
zzrW8sHAoT{&1*IaICY~>^~-1)L^%$HbIeh)m>lFO0a9TznyURVu%%r<6ujF&u9E`t
zol*1kCd~s5@UQ0vU~Fyeqf$$!5OA>xmP%yw?0tjWn-s;JnpPi=k0S8ZYPIU?M`%U$
zzseyXs>s&Qa-B4823C{Mu=+H}KqyQ+Q>5KXj#=%Vb_)fXa|Wg}=)&f5{E$i|vNF0n
zF`b}Xpz$05(b3LC6k6Elwbx(ogBuKXav0rst`Tj(<^bK+Wk5|i&dO4AApjORz=If_
zmI}%?*aYX}4zO*WRi}If+~R<kW`|xzW=cU{xppwKUsf6P52g?slFgh9k-TkX^@tqX
zT!xzhr~{K7$^wGz5K(n($%TzkSuwe9dt@C%vi0gu0GI2`N$X#^08Mtvz_T{Z{v_>q
zX`~-v{JO?-o!+2rb{;^={8297KaP&;Y&pIAgfvldfvcaSW#7$}@v@pFgFOa`0x*0b
zz36RM3ADP0Ju7wgNrO){GQZyH1ysKRb(S&}2E)VG=0_}B8uK-53$X=hzzQjcvcC@^
zM+DgPfm{X~Yk>OYN!h}F+(v}AZ5^H{%A^*{Oi<51XA(?hu^^920<zx$>$_gn0b<M}
z;q@*<>Ei(JjA<9N%gpyhU^$uWfmTG<?$@xKb_fClTiW1oCePWRmjN+vMyyK`?ob>C
zdsn<|#9{A}C4Yl)V)PO^?fCI(6Dah=AP>%yV`K#1a`qWm18wZc5#e$8dxi#Ebxrkw
zvTFQOmk6%Q)3!-dO4ECh4)%$XWy4ZQ?l4WH9EcBKmjrC+)<Xu1#yW&>Z33=wWm1)J
z#eT+Y#(FTcy>;I?tQil@Z$mP(wMe`j-yP{`_+FIdSZmzGBsA-%TkyDZHapn$ROzU+
zIcC)gFD8N}x$S%H5vlcx({2xuh6iTaZA6q#60wdouJRDByq7nJt*1Uc54pdIK*OrY
zc}<HGL*UDK1n_Il=K+$ZFTGY{AoN&QHJ%{uy_rxpY2zpD#g`aI>&YQ9+erZL)|%-o
zY;^Fk+I>7fT{5SSU&_EcA66xBBBN|`^<>XySZA1wJZG+zXV!IB4!AGBYqpM_dzK5R
zeeP0eyigfnefHCo0xT_}?zF7$K<EAo>oa6GvR=hpyv`082-Dc*Q)6wxgKf*8xTSAe
zUX@KA&~(?c?6x-v%g(cw^NmOyBU8xU`wYXSgI~i}f(X1W$sHBp4HTcCw*t~~-4X>y
zd&S{w#w+&;dfr<qFh$uYbd7Tp*0=DFjh%L1xvGA{hkUGO0K8Q;JN_aNND1hBKL@P2
zd5mf)W>Q!VuaKcZnXd;tNJ@UnJpTOxGv!4mvb*XKh&B(g%^&xCe_EjAY6_@*NfKTp
zOfghqyUf!`HOvTv@ARk#x}ovNhr7msd&yE$E1~-t!-Kgroec~xMdoMUfp7g(osI|8
ziOz5p+fOdNyE)pFI0~J875SYcMxC&o8L*LFj3O9Cf(jhH0o^&e1zF;ZcGLdE+6gqy
z>Zx2o*8`UFLS|H0$HO_oU>e@}tSXB&!D${+gOYhNwKSBsqjM3Wv7tCI9hPIDXTfII
zd)t8NLE&LL;HR~B6Ahsa%iltwl;mhI!9^s5BX;fB+r!yAKLO*v^=?7Y%-g-q*1*>8
zH%tazt+n=agVT6MD8r1x5naE8bnTUDRh0}!*a&LQ`=9cnM%-Y_eHxR#Q3kKUe{JbL
znj)W#G$^v)$q(|*H;4w%-0q|SKTRj#h4Q4C6nK;q-xT>6=wYE(lT<~C$)$b{i-?e%
zHMmhq;+YN5!QwFu65N1<Zd$M&a0ybHa=VW;h$)~ueCTi)9B7)sQPj+9R?aCJSR=9V
z>yF?C9<xePeU0CBxrx{uU=mVR=?G5TI1ss_T~aLT+}W%@sjvXZ^A~W1WsW?|x}qfz
zfZ-iC%pnM?$hKM-Do|eEk`C&2={NWe-v4(OcKcdl0wvlTyh1|J{V|7TjY}`g-Sv3S
zh@G8*@~m!dS|KMp7U!gSVu%Wcu(zZS|1`5u*mR|D-(_m$788L1s(R)i$xh&JDpKc=
z9}Ohv;5@uwSLwL&Nk`lMUTh)mDG~*~m2Z!m`Fm!A)Nq~?e0AAc>y@?Kr<7n>j<cJL
zKY4wn-d!eUW5OVy`?`??-gP6<{W{<;H38&MnTt#07>O`TFoEU-oGHYJQyUHKm!b$-
zFS)ZyP#fC<KAy)|zI(cAR#n!i>9M|R7Ps)h;-jrXRd51Y%H39DF!z2JB)Hh2Hiy7|
z{mdx@#Y#cn&)y&*_AQspAt_!)egKiZsK0>=9I@IhC}XP^^fH_-<|BEF+x3D=CMwl6
zOM1JbNM(5AmR+e$DI^|*1Un(l>=W)@mpL`#R@w~8H$|!?{p7aORO}MemX}(W@?meZ
z*<2as$*=?ruVi-Qm&3;C4-uSo9(?b@uMeHSl2pZtneOsbM+RlPAogFKi~krD5DAHY
zaiCk8^}@}-d(-;m2H@DR1%V%PNsiHu=;ZE0)z*vxGq0CQvu>0po%`|~Tp32eG5O#K
z@=RYEeWWMADbSzll63*x&x%sr<Y;gc6PK;<uIes@I5FO5xP0*Fz3;xlGI$|IARP2f
z-lf#a`$~sW5IPZ5m`R>xI73!ke`&I|68T7KRc2g8Nj>Zmr(H*{20W;Ak+^jzv7{38
zbJ{Rb*P{V=uTuCvND`3Ex0*F7Nfb$(F!t|N=1;9^Q~CSjGaA~0$Cw08!=!ZkX<35d
zj*L{h8uBSW8Y6AM%u}`tE@lTy&#n>Vwv3y)2nMBU0AybSlgdp@u5yEdU)K~+#<uUw
zEx)O?9AWG!G!PtDBbp|cFc6DP|89`ESnugX$wSDZ1?1Ch|1P^rq0|yI3kX(nTUNeC
z9P8uI+*r)TcB~XO0J{{mvamkZ=@G2Q>LerdgePU=dYK*50~tBHUe_fX;7)UC_9KlC
zr`9xWu3@%*-?a9UAuii@JHaIoRk!c94bXeR&?m>wUOe$KKUl(F%eDVi`mn$-AIFWD
zYFkF)IhgUUry8%3JW1_NP3&pBvf#z7s%5L(dGoP!6Zj{}-swof;_1i|aLRS>g{naG
z@mqSVSn@`YMGZLRYPM?x#p`gK313m4<fq`De!u@pZKW8c)|02F)Xo{JduKD0lNK}j
zs15qgNR|rA)vUZ{5|Y<cE&e1k(KinwL<L4s9j#1uOqhbU-2i#D`Ec}J&3Pv_3SV2~
z1Jz+xkeULL-D;Y67%<%qk#FYwj@pww#k{x~J<PjZ7?#OAUs}m!G!hy2ek#NjYOL?D
z#Vk4$bmk3g_%D^UnE?Hj9ak&w=<BCvqz<E$Vo%eI-B=?gPLO)vytBoO{bXOWw6rnT
z!4$!G<Ft%?Ns{!DO5cQBB<<I##bURXLfMnT!zaR%Js>?vB9d+A%{E3O*!ofC5=^`T
ze;OMjiHUe{h(kHQ_zePFSr#VmqEz|Ow<kzM{JMQuBue`vC*G4vJb?g}-6-s%rU6@^
z*CZ?J@Rl?}Tk-ULQp!9hO_kCk?=#OCgGNQVoWz7-?ps3E=O_}>@j1z=#@0mL5pE5p
zbtl=yZybJ;dCeJi!3s<HjQhN1Y`8276R`DF^<R5coee-`@?y@`AYLn$pvT6X$f6g0
zO+VwSWo^VZmZ5lLW(5>twDAl;U!b%|!lU9a$;=91l2yFl=}JWDGX^E43A{B1^yp>q
zL#EO{>2`N`GKmOf-0yH?wd2ANt$3UB9>J@|4S}n8KKAJy<QKD5+%s=uH|KYu(wb>b
z1_ib|@{$GM65R3KYHkTcwkGl|frf2AcpC{bVTdm}Hm<mI(D7s7n0Q_p=N_-uILM62
zW9(yi&yPKhS^riN&4|lvX>x7Q_j)D5&061AN$_FpBgTmW#J><yQK*=e&SSJ2_M^W%
zxC}sft%*&A>1&w;YL`9?0O?!`9q<e?>>)asNbsEV2%-vS56Dj%+FVl3S?FK0oL8DW
zt2Vp9E5vGNvL~2`yCc>pwUe=AteeR$!>`-VNP_xJ9-J8|bcDcD42Uv~2qY+ZVo&AY
z*8t~}Wekm(ZdD|28%AJk(uS-zVDr7%g+Ot}-tX)B?V-c1yIdIN$uFiB2AKrdAB#Tg
z#h-ECiZ@e$=dIAJwGv-{RIrD7+Slmsc?(`Q5%~2y5)nCB`wU%vQ9TlOmckZVj%5YI
zp><=Tn;{Q}ZcEj9I{lvGAn-{Oj0yrfjt+vGDXMyT4Bxz3Vo&YxE=v=Y$tiA<j%AR)
zh(J3N=J?j!g)I&m5VxBvlm-ZKsXwD1OVt&os3*uM)*F|01~4#_#XV9Hr<SGo8ja2=
zVYCtvu0tF#sYRSa&p=j5IyAte0X#B(;~pe9U;~PMJf*MgM5t}^fREnRZOOcPZ~*aT
z{TpELv}r)=lV}>ew<U7^rWRTba($8!mJ6cPvlR70?Y>$uNF)w(5@D;;dICG`g@Sr9
z3l6gZI?otvase^US)x^9jSkPx7QTQtHF`GKvF4ez(M};#o}_-Sjy&cdi-3Vjmf#az
zfcI`fDuLnQ3%Ei^)oOMsk6*Vc0|_sVz3g1eD^6Ou1Le>h4uT7puo~uR2xU7NbcKZA
zyA-`Oe0TyEL>Jur8KYSw_?|VM)Gqa?hb12)!#DUOKlC*^E1~$Ftrm5-?O=#11;18B
z{nf!RbHn3pDfXI;u<5IMs}-F+OcSD9yg07CRK##yo{D8<%Dzm4dO(@4bB}Zd`{_LX
zH}7TG&`E80Usg-oA_P{2CLw}iYafz2F$+yUJMQ~3WsAReMMkUIVX0^4E>8<lb<17q
z-nr<d=OEb>f8%O@6fdB+4P485>MmH8bc)igHt0^lIaXmpQ<2s(w}leqoXCo`xDF#>
z6%_T0cukw(on1se<HuO1Ymg!#&`(k^B^2TstPUp?y1EvP(|x2Dk{dtSNtD{pvza?y
zU4_8$mnnsz190zCzk{be0c$pa7Amwjf7_mtY-Thwxpi!>z^tFF6k_6IUQ<=!v`&?u
zr7Ls7iP0}E;4G5GvqSm|O0N7tM*fVEW3E|`g)ic%0(LhY<17~T4P;?r4!lb@M!{T$
zOPuY#G<xxOs}H=<+eFUSJUo<3TOzt>P=^qHL|eOIXgIHtF$6SBoLncS$Xu7pcMCOG
zm{BW}S6vU0nB;ih&Pc52ETslJe$Xo18gLp<@<yC`%MzlONn<UUsNmo2+#mL$(-PDQ
z5|af6Pb%wdLOe18X{GfW=PgvqgGZo|U<n}j=fi^2jyZPJMx0wn{GzEjdNKFzIM?1~
zqy$!B-MaZjh5b-fJ7TU!?m3PEn#xu7VkzT*CXX(Bu3bJ1XC9+YI0;3*CJiC&H_Y$g
zVh)N+qejGf#g^;DVdM3nJkd>9%7|^U0yhz?L3kNC{NOPZt`3NOQ}4EzgGnxgIBPSk
z?(SLAXo$p~6yXhF^xBQi7TtYfrAI0)r4gA@l+JR#0jTN^b(15J($d&>I79He!aR&=
z=V;;I#Y(aIWmqT_!J{2B)Ewohx5%qJxDKNjmVcC=3W%!|DQd$xBhlP-Ttm5c4FKRz
zk*Zlz9`cIB4kg;vw9buyPoybFEpH2UxYjY4x03}|HDmyS6Z-x;qz8RHuHZQ`%#+**
zK?ju42GIki4(aON@eIofe0DvK!}GP~Zq_P-*^6*8{gev<rPhqR9ufs4cI3arEN)tE
z06rdjbXkw@8d5+V3oF)__D!hN-JKxrnGOXBiC04@B$`?yo=zXBcZW%#M1#M$xm0$6
z1|k{Ci4{|lNK9@v&uQAlJ7b8@s#VB<`obc_xXx9Wspbu|mu=d1UcW;W(YU-xyr*UF
zX2pQL@$SN-@$)IzhNPSr73=nV1Gnx#={fUjF;7)byUfCd!CTSJPlFn%=&fud;Z55A
zD!Mu$d90h&&e5-2c+m37pktQXLRvGJVx{%`o+-xcfDcG=%~4RKl<{u$URUs~rI3|u
zw)(`zla)6RpRxu*S0)aw)QR&jOa)(;f<f4athik%2riaVc<a(W`(BXfA4!7h6_`;M
z$R%zg?W|LFcl|srd#maeL=v>;o@3Z|Fz!Jo$;CABH!0|7CkR2U)3{gjBq~DM{mQFB
z@k_|sTV-W@At*FpI?pOsX)-Uw^M%u!JLsT-G01)_@4^;zFO3JO@h}im@b|Uc&DM3B
zcuUHEuwuV@o?fPhtoH+k`%2Tp`6YEA@){;CA(8epaL2U?LS;S%W~KekC{Q_?2rhoQ
zdjEKKLaXKg1RD%UHvI>xkY1jeBIC%?_t3JE5Y-RQj=~KaRlL5l64veU$F;X}F+}&w
zT{AYza2LFAA)Lh11mTtz^7zn_Z^ustt)|jsyW!npFgbW5d^erT3j)ha9~iw3kPsS@
zEjx@U#;8x=DhY|7vfJao5~2rAi!908uzFv0ZV1KQT?)1k_kO|<_Wc{vg~CaV_cAQm
z36iBGI>AvULx-~!tGC0yC*_-AUAHT+kqr6mP#PqWon7nrn5r$D@Hw8O|2!z(7~o&7
zZWr5e%<EL(>SvP;URu_mj~JQ-T-<EY7Enm(<P@O5NwX3@)fTuk2(!R7+qoVh)T-Xr
zMiS|za3_%jRQDpYk#vCzUYv-*9w4kheTRJ7VUk_#ERWY!l4!!U>PF^1Qm#rFUe=6U
zg?AqHNEUP++$2|(NjN*Gsyhybq`js+TFhI`E<O=UHr_tIg-%wwRjJ7k4fT<o=Wxq^
zd(<tTvAlJpG<9D;*LcqKf#C9%mKG@jwxH&K-q3qz=4MKEM`JDCHiXLBWZN{c@RNNx
z%oC#a%}x?F6`98@PpR;TPrS{6IB-p&5+02@(NhyJ;3tD#VfL}UpQ6`UeoLv}J&+V>
zCW%gg!wZM9_e$b9fcsVOUS7W5vCg^)HnvC65cW(2SJ18QlWzMHUY0bw_q)4MM70Tg
zldg&U)Sk0ipwV%aWZf?+;H{vYoc|eFZz&`eHUB|E=v;-R1_b?1W8Ml%o3~vTnP=;H
z49k6C^eebf&px&AXJ)jnRnA_slv0^Yp`5v$zb5Y(=(KR@jNx1@&uLtd#`=}}yyw{8
zUOeF3@lKz83+&Y6=!J9xE>KZ<D^KEm@(4HK@}-$ch*{pqo&v4dr|7JOIXGazIK<gW
zm9}^}r&Di&VTSZZbfGx^g)|aQbgk3HX~IQk&^})ve5*Y9`<4Fsa=5eKbmMKsUWoOd
zQ;izc%xs)0&#esQlf-dPIvHOuTq3^cX%=W-I<BR)XJ6?`tvS2hYriOYmMRp-L=r<a
zEIcBda6%s}p=0!-MKSJAUlYCu>fVdv)n$FlkS(e(@qO;WJ%O7^K`E(=zUj;QX<wg&
zX-pr=9#O-)s=nt$?UCb&Wc8HArz;-H1UtNERM#Fu(l%GCe3jV`c|HPrlu(_uz5$zy
zP{l*#K@$GqXReE%*;+~Y8`43XPuY6gg8>oWl4bP-Etes60WV7w+UOR(I#ELES_b~)
z7(2<uYzG~^d$Gu7w`CimICeBTLxdZ@yWn+kkfdD)4(_86DY1EXkkNsxUYWMup^~Es
z!cmXId#maMET`w{FO(gBC;m9zK)wsfi8r2Sr$u`2ZzeOCupbwplD+vzdu9epUo)5F
z-rA#br==QX<l@P{@P22pn_C9op{05r!$+d&d;PLEKBz9{6}8jBwa+Ms1%)q^@;+q_
z;uyP$WSc9dSV$H}Z@%Bp$rR&ar4D+H{uWZVcp2MF+XbXzzCTG;^2veQxL3MM0dN>f
zd#o_7eWXeX1t3n^`T6xVxrn!{BW;@zgEW5bQk1Vj^6fS!y5gf87fU&2m+O~Mu1<p#
z+x!<kF}9r_iM`|BTUjSDn~v@^BH1ac5?n!Km(@-=!gzgs_`>O+0i*eetD)@D*~N-@
zfnbJdXH9r|9Q4%TVGUkWRQZWrV|vS*tv_Kzeu33G5Z!cC1gQ+JRTaS-27S*cucy1t
zMfRGke#s;4C&+Oqu!nn(B>wYxrLXqxl7!2-*v`=#STE=BA;Vm4$2S=>`4yZn8kh4A
zZ6-Bwv-LiLREjd!aS<X&Te#i+6&7`e&oR3gKgJhBhatpq)d|G4)Fzk}3fH7djTt`7
zWo@lyoeWB03Fc)<?a36Hb4-C-R$YUzg3?LTQ8;kN)E}(if}@+PrAxC82v(6@XdYyI
zs>>9z>=;}w#-FgboYT?G>_yM?%tD0!lTvNqVcmPt%S?9m{@wTbc~k7{#%|lq5ky=n
zKN-YWjBcCc>co$jKB*I!t5JI$Ql-G9&J$D)yHqEUwOAiBE?2g-!aRKtZkoXAB+`(y
zG-*XbpFB@;AbBU52d*Z>=#%VJQ^oTt>)XJ)=QnOof3=0~1yBT$eA;qiU8!f1=`DBd
zThZE@PrYSGadF+`eJ9EJ0E0{(hWlhyVPe5UgnStwK6V@8a?)848B)M=*@v3}K1Xi+
z&HY%fyZW}^PXC<y5%^)=;PHf#KjeMTsX+)+yf}#Ct&OZK-Drey<`t57Uw3y%V`u_F
z5*<7)q=J$Z7u~t4n-OK-!v_b%G8V6M;50&RMn`Lv?|H^&WEY=+@>#)G%d3HA>QpJd
zati6>_0%_FR(2UOv{=J+5qZ5!9pCWtDuJQ1V%L%yeH~F4#9vN&{jh{cMJ9W8P6PO<
zTFa-~eCg%+=E=A&Wz4#Xk0h!xzEzpsFD{qZ=$nv+BAWfKfpT)!e0o7{=S<d1-BRQ%
zRZQ{E%79V|MO>r<Wi_M7t?Wn{>O$^b`viGQqydaHlbazV+NiMcm9!rGzAZtqNqLPH
zGJe7e{+mCk^k?U=C-k6v;dLU5OR7Uehhx9@@7(H0WE9zN`9-cQh(HI4a*7EX9`#~r
z*LF6wv{CJ-aAj2DY;RNq-5OheEuA94{z68ZY>=s<xVi-{jFirLKV`r`sEXRH5QbNz
z{NFG2rFLtX6=@kKEa<Anq~uw>Z`81y54q@3eti!oA!4c}L+aHVr)#p<%T7m7awH|O
zbD_6~>}lH!>DV<~l9zr_+OM}ZrjvR5nS<1eAc)~#-gxvL^&@m6JJo9=ta5)K*2q4{
ze@1D7{T9^7UPHGet+Lp^W?mb%@})|e?u+Hgj6#&8^bHcYD9dI0;)^9H>bdT{=a3>J
z<U|N$g$5`?l4X?~`qxnNb`I|*<Z3X@72E)5TE)HVke90v@$6YmiS8hrDPvtdQ4UZB
z3JX!bhBMX`q$9W%Q0TS8a{_@g`=|M4muRr3qYK9(4CNe;2+CAA3FMwkf=gk;zKlUB
z>a2FtYVI2g#V=or0rnA9eEfta{;NCxVf!g*A@{k59-gcr8~km72W=2KwfV$WP?HWR
zV;YG8>_sU9d5~u*91Wf5xG1Llub%~3G{Udp1uKh5DN?^4EC`?dRi9TbD0nGlSfe-<
zSedN<j~`}o^XAQ@w~!tVNzNQu*HE3Aq3m^twn!sU?N+gEU+;EeNT<dJ6CenP$I6!*
zI5`!WM_-J?_MkgFeV}+t?ee7KmoxyRhOYG))B3U>6{gql<ex6#$S4}aC@=iFf&Q4U
z?*n~<P;0|?RsNYQ^z(o2qRyUkgUs3=efh8)#hlRPA%?5}Lnfrb6(!h0vZ-(n>cu4q
zTWk*}?1p-A;bv026P?@VH;^%TGH{1yK0b<k+N=zQ)@v5Pon<nFJGt0k*D%c*#D!){
z!1Q(hL5TU+SxJ8Y+yQ}AxdOspsj25FUV*AulVbGuC&R-g`M+`jFt$OUamEQ?s@YD+
zBc?a^baJoyk#2Bq`mM4A))e(?$KV^pyvzsl;a;3MT+~Cjj1Vc3kKUnMzplJOspDeV
z2`9U?g@-ND9>M|8NH&ph?D;~=7vSUW>)`+AaSy%%4Zr3Dx{VPVxK4uk1UqsfOwf#k
zJk+h5sMnS*`(W)>ZPf1z3781pBz-6y95ZfiZl+RAM|sG(Cy+#M(zgHg4bDqv0`K3?
z&G&^$kNFH3_TGG%uh>4eoB?v$P91b%Eg+w{E%uQbhC;p%a?-3iVb@5L>@`dXY~pIy
zYz_`i0FGiO8MxYha&sUQ6UiDbgS9UY?MIDY6M+2w{+%zXZ|sKL|0H-)bqctnbbF8w
z`zdAx{3$(@3aGCA?h<@0QXp}6PR1(baorLEe-u@viN%D=x$kctgi-)z3y3>Dmw@{%
zWyqnOc{3WO7c}}VSspU+|ClU=N@A$Wpd{u;PQJ~lnU|rZr=lz?sW`?pK}K;@t=>tc
zY8*-$JlFBZfCkLOq?oEyUFJZJJ+@8(B?1N{4H~719<pf)L$K<CaZz4p5oj1TBh$qF
z`R3zX_{Y+KUP|~5Pe*VIYIfEh$2vH2s6z0|CdzGWm6nw?(9Hf>*#R<m;4nlS3CB(w
zPC@mZ%dRkBmM&tIM2f#s>>t65xGHr5Yj6-xALT~7QK1CsUtoSu$GuQn(vfJ<^NbxU
z%Iq%i2bh{qw6FECO6J{x>MF%w<y<e}D?cfnz}|6$FkPf{AV2;uC2oJrkN=kvuJb5k
zIjjL&@WJCpdVA-LpPwI;eF&wUhb->SGq-;B1>4@1T`sWIS?){4-mUa<Xwzxt`FkBA
zQmN7t!CpP`Z}nS$tXjDVQ#730WqS|S$$Lkl70xa7t9t7b{Eba9{4&-h$c#qOl>~nL
zx708QAHwnj0|U1K5;tIrz4U<ikP;dYe%8O0M*&gj_jN%~-Ssi|E1;=4LP8`H!v~6}
z8$<qEVVR5SnP!(FpN>jlpF&EgZ!2LF`b&K!Ny>V05L7N+X2S|z+AeiSLZuFWsa!;y
zTypV%f89<{w~4|F9^Xh{4bJc#DAax5uDg8U3x`SAqaVj2*s-8?>I7g9vCO6)RXyC0
zZuu^W8S#Z-6+=Dht?k`A%@}(bD*YKlWP9wC{{Bu_Bfl-q1SFO0U83l)u*)!lVz*|E
z@!X()ZzMeVS>#xw0mi4t<~QpTRph-(g3j$AJN|RZYk{)lC`)n$)}!EdMc#`y2?+_x
z{EL_Rs`d*}aZ4g%Gq?@qC3q~sjDh$`$;rccK>^r40|)orc26fm&6bJ(ukOu+ea9xS
zNd&9ydw}A!9fmkD`y%5}c0YV$tTA+)RzfZ@)-l;G^x}p>ufL2*1aw?GGaL3=P*iAh
zH;|T!Dm>N5$?1ISE;ld&ca>Rx4k~>^MEU#omk@eubxp=tH3Dm@5+E&M-RxUqhk2#=
zYh3<VR=8c$lq0T6t46}V0HuVG#GXTY^4`xGB^7IB|MuERyHlLl`xoyp+9ww6;71$r
z;Li#F<B}&8AM4?1duY>9u_f0fz8}LN+>R_nw%&~iO+&2f2AJ*W*fmiLKhlp@|Aa#o
zL0w+6t-@ZcMt&zM+*7;5bm9a`A(mHOKhMqml;r~6ock+-wllw;+H^0J4l8L5O!JtM
z7GX7Rht5Gi*Bpj^RulbI`)`ASnd8qXuo{<LsC+YLS9@0m%;IpaE5F(-0h5Ymwh(KH
z(jkN_t%>;imnmO|dN5wMTr1xAw6ymWSc6pCWw1G@4E8T0Lk^qr&x#PO(qOEM4_I&@
z=<`^Ohc!4oEIOJUleUysQ?2qViV%Aop6h9t6xMoG0wq+;wJ7rUY4N8Ozg^y$(ilSL
z07je>zz|9X;J!sXC;Pbu89ib5B$KdG-Y;Qedua?SZ4&s2u=yCh^pF*=l|uk}n7<9A
z@W|0G)UigSx9f86ABNH&d5$MP&HX$AtTCli9RJ@6xnDDMdteiGsk-$I`exogobc;O
zjd8(jofN4Jfvr<(AP>9MK*jP@T>0nv6jv^`1D;DN1o_s==|4ZnL)aM?ZmdJz3@xf~
zV3(lS*7l@YDEz)N3xdLAbaoZ6KvJ$q{#+n`F1<sEb}h@*2TF8z#~i=<l&rO){nI-z
zxLi^ve=bj?VP*Qb5bL_2e*j(Xum@^tF(LvPPnCMUCfm7{W`({J$*3`1*aVOCM;~^Q
zr+f*2;gZ2v)TzuF^TX2o^JI948%As2d7)kp*(^y)>amb?TPRIS6kb;|s?mvUjM<pe
zX$-*|1KZ=kf!#PwqM{h{w#TfHP!>X$yT9pPTdk0A^!Qq!-BFCm<(viQCV_zcxxzf|
zn@`!bI!V*a+IUqx_u9rp)2rp4#;5pRKX2BX7P6(E*Zjn+y}hoqSWvY*xhuX6>zccZ
zDs>asVLos{m9~-!)q1Mn>-ffdlM5nvfkXS7cQ0nR@OIuVXLf3<c0K1bsm^{&Xj(Ga
z^XbZ$oOR(S&8~C)e$e3~ik{jV1(*=qs;a7SjI@l*{t0){wRtXiZfx7n15`qIqSlb?
zTR2c?)0+5+_U}>2Z4Xxw+Lxi3ZRb|pGg)UoeY@tw3kTfE%~NdJiH<#=V~89VZ*^e&
zXX-mTI$j9o4`ZF@1e~yTZ%gK$Asr(8{=|MUwfl|14VLjKtq4~Sf%c%&ft$fwQ!0jB
zxZR{FM9=0~JenpGJPz-f$81jdY20qWi`d{v(5UYkHPc%d8?l@0*3Qzf>(-X|eP`}M
z`9hCn>d0rPXlNkQaBQ%VJj2DJqi%O>MmptuM%}09($(}2N`=e3Ev&Mv<3tqaT;r^k
zBB!kj&A#F}E%f&4yQ})HJ)NeRw!B62R&Z~-yFFHmwJM9gEE>cuo`2uZ6+&lG(S9UQ
zMd+FFEtwN|JJrOC>E0Ibmr<z~Z<yP=<*cniEzBLJhRF-bMm|n?Zl^^xN->TzJ<(f4
zi{ci1;&`E-wrI;@qN8cwODszLSVI3Y*CG4_u82?eoERMQs`-~}UawwHZy4L1l2ON+
zmOB?`UTl2euFINiV{NQlPB*=JMrI_}pi8}qF$aXMD#*j?-DNvVlcKg>`-m5h+l#(%
z*>^krcwaM-yzwss^^e7I2PzdGUol{PCoq`sibeE7DSJ-Hfd%z&#mt=61LR$5IqkdJ
zn9dO329Vs02-Ya@(wHl>XKnvB73S=kaVWBR3wLhC++mU{M8Yn_{@lLsW#{)UZPV_`
zve+ynr~rUs>f?`*@{lx=Sg~6tP>8_%_@#k&*;0qo_kn5N=PR7`POOyBec|PlcBUYG
zR&5Dw!uJnNj%<AlKkWX$*qrhmXn$#{0St^(wc@UyCSJ^Q#oMcs=t7=zu-&lz!-OdX
zP!FNV$6B`3APYt<+y(+!98imSMY;NyA!?xvlci6IdeGF-amWelL+tR}Lg$VVW#0qU
z)TtrzqQABctep=N;R8CV-Z{aH@U~dz#jigJL;W+|^v~vD?_u=Vp7&80|3Am(U!Q>G
zTmBD62KP+=ha>A7p!qfBe>Z38A8Z%n{)d(D`<aNQES72a_b>UsDf^2F`X8Xg?=8ar
zO<73O{=X?(UiJTT=wkpIFcKd^CBY*g9q5MkZ~lH+9D)j#m^jnY-N2{=CJM<E9B@yc
zsK|mm#t;ASfByba*GW)hAE%oQLKO`Mz#G_TIL?JNj)2~KW*@xjUBj(<h=2YuivGh4
zj(CD^en5^5BsDxQ5~H##I2>{WAFQ&eXouMT?ys`~JAc1lucAUUbM-_ya2ZaOl*2hO
zJyN^@1UF2+23Sh`-jCRq3OTtCxO?;<PbiigdP?O<&2OpN9>V6}N;r)h+rkIW=Lu6c
zl2^fzn}2!U_A0sTzc^$6G4{fAsI>Y<^Aq5;c={ZKP{WVgFRM673%Wl%Whp#mLmBQ4
z=;9v)tBN4O5Yh6gBiOIEx7{8eIGCtiGq;B$nXnExh4aeAWWUuwD@RbfeTI61=!&4|
z=ov1qkp8etbWvpXLH##3$Ee{yC=0g`#p13^tadpQF$5s*)xF(`JqHU-!0Xv)mR;>}
zMYD_v7sAvzEaWqzkB3fTMGkJa%@dpyDqn6VBq2BtqI}8r`6^f$G21g;7?l8QUm@=B
zJW2r^3u#Lw)xj5c>WRyUX|cjow!aJ?B5P_ERy+{I5_aDM8h?mBgo(#@Sxvi#!G17=
z=Eau5d|54a$H4j%uK(|i)!0gD_wS|)nl}muXJ#poTTuRoqT%gHb}AEQN&jjwKfLoL
z<<FI*Yztw_wfvJ<nH1a2!$IkxvHDIZure<KeMyOF!y2#q(U;?>FEzr^vW+7L)zVZa
zJO1T;lWBc#umv{Zu?UE-;lUyk6SlvKz9~`|@{j=(3efb0F1?h{|E!gN{l)bpL^4B7
z6*=L9C{z$7zCvDW@MvGoi_IzQ_r0k1EFjde&wfi#5Z<YNFbqQVBx5HJK#;EEH$gho
z@4~7uH=&w=1fF&u=&aOVUutju6|C9rhyQ)j2LMKr?b(p0L4yD@^kY1cKkDM`#{b(d
z*Du5TxSKmmivn1a1Ry(;*x1+@ewDhhH7L69?*{pQKV}5TN68!)RAGT#H^c)CO8w}(
zAGo48N4o>R6_SJ#GD=mDa$Q2Q40Omg$mcxy?OV!);Z^&T_yzYtW1OHKq?uoX^$%<N
zzxCK3Rwkl<<pSg$`$d>53>q==tND42@W0Lb)m=77gp&0>1sLY?zP&#eU;jmXfUWdm
zhpnJ@d%D2*eINCagt2;%2P4z=CymveaGS<aUb|ojJwK8Jw?<5DQPI&;L*Lu521kC(
z1J7NsAhe0l95=iIL1-38ud1to%X`1tKQ-^a7kFm`kp1I0z}m+l;JWnux1;~p<-Q83
zgUMN)iN9Az+64hy1L4D?XW@xo5TnvGh=(cx4W~bU>Ngjws2}6t0+(uR(b)momy3oz
zI`s2zpN@FsE_62!Sq6MCkO@<^g)UK2)=@{RU@#h3*8{w~0s;qW0?zhmX7=v69~6`X
zl+JiFoS4_fF<!s5E+e2o5F=#z>?OSF#UU7wfVipX-#+I)NO-S*QHB$ZqOGO`O4}FB
zkW*zcV5IoluYx5f1Ya;GZ;ixh?<&PY!VeyVr$t%5X8-<;FD&4FdkrRK;K4?m;elV9
z$L;<#*OArvN9V*IKzqm!36*l)J#ZV|@0-bs`SfqE{mZ*Od(g73YpaGYI>d{v2hipE
zoF=Gxw(B0idhfPv`0m0pQgMr+8^sUGKx8Dn5d(v#`AHc`bN#F3`|;enVD~Vf?;JtG
zmM_krh7L@y|LzSC0s}+z1IH}<$J=w%V$Z)^Q66$ZjH6T~$;Xc$-`VQ>y*0qW4FCn@
z9N%R#Bq`$t!Pw<X8!%)Wi-*R3yNCbHd4kX)OixXBdjx&P+8Nh*ky+1~j*fuW%pq^j
zkFlsfh5Zjc6rR&|^0aCixY^g~@!`S%;mlie_AKwanI;JOC|mD(Ts=N1hZs}q9CBhJ
z_NR~aI3(<3enrYql7xUn&Xbi*X;k^q^*A2}WqJbD694&w&kgDKc+C?(uOPULi}PIU
zq`2^-wUV_STg{xhPFwx@iSU)I;g6&rJ}(q3J>eLSnik!GgGVfk?MDhY5;25Y`}X6*
zt5S-DL8HZ}g_*<Z+XoEY`Ko(6a0%!=e)l6c1KyusYD3KnuWWFEJfB(~gWx~B2|f~s
zZfg6nJZo?vlBbwW|EDjaZ#V-VPLyy{6h2&dc&|6i`rZ3E2daXn*B=k>Bf;9jpMUVA
z*aJa*5`khd=!#Bxda{{DM<lOEBYRIz99!@F=R-S1!1`%up4fj7byHBergd%W_;rG1
zrQubJV}ELGgCn%@{B(qsIC}I{_=n`+t${zi3-q9&!C%O>Uss4Hf)g`4P!;L9bN5i^
z{=JO99hB`MyGaU>=vu$T?Xlm~fd*4?89kxmvpR+z<9B0<{(77MgvoG1q#5A%s8e$K
z<-_yMvOGI??TbAu$J8tQ?8u+S>l+1pedK*|_<GMN3TD^=Z@k6hFl>k=2z{#4?HX*k
z_@}3kV-+sH$B&Dq34JJ<m==BI%uf{S<z7h~r6a->TDbg@{ZCI}1Isl=ra1oD0eGd{
zOJOw}I{RYi|JS}IwA=IVH$OTh(U@zo@27wlF|%8&jwVMwWR#%jE#Jx-Rq3?k;Wnsy
z_O{Ee%bF|xu-w<3H^hFk&VJ0KST7iS76*CWWAM`0MqD%I<Y@Y!pfq?a{6}Q#sRawF
z32dKf|I{k_1{1W<C=A@84~6&QeOXjZ5D&zpJ(ATdPma!4pW@D=IWk3}EZAj=!&Q9g
zkF)zQJGdKPpPLXvLsOzXa5)GU4VBLJ4eWYAHX;wXD7Zqc#WB1Le`=m#CUo|YQw|Sm
z+$~%)-=t`6Ewj8Rn6u)hxt#vH?&ZEFnm2T2ln6>+aaXs=n|OMo>W{O41e!`-NKCy8
zjbOe<uX>u{t!FQH3R%5~i;J(imVdlU;*SlCI0+35@0{2R4K$4N=t}xTEB;VYC~nfw
z8S`54d1ug(Lki^dU6N8=$NHMz9=WUar}ZU#3lA3mkb(t08i5sDvq<*IjLRBqJbeTC
zBStpm$s<radGo-Z8s{NwjZd)5*&bPX6xJCV`AnBhD<p5RPLZV&e7hU;rJT6cS=}k^
zh4r7h{%>ZSStFLjwCIaz%aNJzGdH~CN0>SNy?v?|_16CUJgm^JT#+bOwC>r+;CTi!
zo-}{(kY%<wcDU0zZ)Vqx_CI}y93PBWNbxahVl-lz!lkA~>mI?FnUP@eo)-_?Pc7EW
znI&tU%@6vM4I;=@ffZ=H@tGe!sKLh5smo>shc3!P-S%^tL=biO%Cml4q1@qWsSjR%
zdZq?^Sa?N!4A%beN{`ch1)~bWOvY!D9|Ajo*XCL7aK9Vc7qIV>X#E<&SXoFQ>!`)t
zr#kzQbVi<^UAd=G?vRALztW6l!GJbyr?u@@qK_y4IDg_{2Q(u_3*{>Q`WT;Dmo79-
zdDC*|1;UY=RB<Xd)K8CF-u%4#)1OUs6g^CC(oBY7crXGoVrKP<WO*DqAFo_C`-bSW
za-W;ze`pu+hy#omjk_P;MR+CsDPm0Q<3y(*x*7Qj+JeJ!PfbUBMR4dktt|`CtoVIb
zm!dvW-R`4P(94Hm%aQ+-@w_p`Be=|b@-b9s^xl7V{tq8Q@Sc!(=IapZ^P+39m&s$~
z%BwOeA0K}{VD|2Nb97pO`6U^eKaJKw^dxt-pM?EEfj@)?*voG4y8fvz42!^sB|Xj&
zMPs9mD~4Hq6&K5AJ#@(ah+3yhA-Q|H_|xhOt&au_e?M8!zBdG$=<Og5=Znw~7B+YY
zSt>B}K8Jqah1-AqZ$A>WDD`DiU-67|+9~7~YuH!rDNgS4|M4U6b_!j|$Y1xU_ufN#
zztimYb06C;x7b{t=4)%7_)~{~cnS2R&3MRg339Sz%d;>EbXCf;R|uDnWhBQSz^S+=
zt$m79u-C<XfBY~YQo>4n#k$>j44!f`Co*5cOlw??bKl#DX$pq!W=y3y6uSFQ!%IA3
z2a~qjZMVfHTGEWfn0``BCNfp(S5LDe)ARee+AxVM!p8o-b<`>edSi0B0>WUT9X^}2
zRmy|3Y{RhYefgtZ@eqEATv@b`cX%H5lq07rQ&wf%DHKO|QS;#ci_X}B8S>Q(O?2WE
z5(4GfMqKNmno%`&vKh<vxt$m7#Owvv%%Q(<{_Nl0gzorqbi3JHxRy9{Jh{*r!AGe3
z75>o^!uEUCfn>+YVNl2U>2#TkeH5=jz~5H(H!|?vNbgs`*Ls=}6XcCHId+|qdH7=j
zwOLn{d+rQmSw8cRB@nxYmn0RgySg+@7qtuwNCUHyS@E0?3;{NDcn0_X_7yjY;VW?b
z#8A(fkzP*gCna2dg%d4j0TkqdZn0;l|91#K;u^%b=JSm{f9yoj<Q@an@-NV?`Imvl
zgU!oAW4pO{`2CpW6y?yj=T{SMXxJy>yL^#*&>e?aV&3<DySxAT!*h!Ckf2j8-&b!^
zCmENrjYW=jO>60;ix!oC`Ec0hAvZyY>H&m2@rXynJzud15q=v9_7XhI-IuUsCwehI
zDm0I=uQ|}3Wccq*g}0WJuMEeSBQjbvz8zGZTSMWL@ZG;XJv}k>$U9R{^q<6JzMwI&
z*I>`Gcc=Ajem4T&tK0woxAt~3T731Pi~yW6X4tz7s$s*|ri!zickn9WbxQT!IUsZf
zP(P2=pva=u17~NA%UrE$Du=^pVrUH%8k=al%!<uHeP%dNI_uJ3_4|jM7vI0j6GP|%
zoQolVT=L`?)N`-CoO*PXU9DAg@RcNm1604%0N40NWe|<={_`XB)<Ce0IR5ln{I}AV
zj(Ef$SRmaajqDQ|7T6+HzR9=Jc0K)+Xt7hsEQGdTLxs67Ve&Q=^5p*35CWUza-eZ4
zS_466>DfAzteQsyN;K%b18`mA{h9%_)p9DoLs!3W@F@pyE~3MorwaMMyYQvfk#ztL
zpcZf)N20!jzMemB4fUYm04^^bWLg_b%Zh+r==kEg)Xi1(5<s(uq?I^ehtui<66IM`
zCS31@NlC7Dx<5sF4e*RLP%PDI-GB9TX)YCy*|QZW^Xzmz^z1AyIV=aoHvw;tqLwL>
zSfnj8f6(i)Dy#}U-bZRwFIJw+q8dK%*C~a}S1i|mXRKXjxd)4dO@9Xvy=$QiT-(|x
z;M7asl}eu1H^=~TH5ig`0mB{d$GLP>HEQGyUtUc4laq>4Tbmok`!#U{^E!<O!#Hn0
zY8<|gVgD8!$ldut0O$!B0s@PH!d(Q+9%Dj51RwHh6OETWS8?bjg{ReMokos|2obgG
z-y!%$2}`Z_J{MYQnIz!U`G^Ral~*0W3#8^d_ma?+c~E6}^st-^**}b~6Q0|A0$n%>
zqxk&#7F3*GSMp0Btzt<_*VTq;Rg;|5P#;>hKA3X88gQvcl-Jat!0udDy%zzedSB%c
zbIx~niPAosq?MH~!F-GW5GacSCi$6_WJ`QN{<*-{q#fE!ge9A+<MQKz?(0B~#b~m=
zmYL7$<THKoW|#QaEAGtmiIM?Ow(j1#KYZX~Y#OX^o7Taho5(P&<C^hSsmxfit~|$g
z5t%HsA%ozs&fd0vjKgC*fL?IU0es}u@RRd}>cTx9vLolmyXN>OUflVt7c6*R{Wx36
z%-bj%ehYST%9%2Ply}MmllAVK)>nfxC-qv`D@_>_cFN1me7#2(w5bp4ncEWx!oB-w
z1zxqkHGjR5B+o@DZolV!?dWANA~I(%<saYy+fc|MO3)d_7dl1gl|rfxY;;Hp#@!Ny
zhwTSTe>-4qnEoH8J(e0r`d}cfV3EN9ssQIa+v}${(hxTUbYZiR)Jb50G+v6tteqid
z)}XgJP{8sext`qq^DRmayNQm7W)%Xn@83R(Pi}&b@IVW}_PJ2qXcjqH?09>Y&LUCX
z%A+l)eGg>0Gdhj4bC-=R{u$Sc8q*x>#EI-@b$ZO^jzqmPcLq~u>b{3>dmd}zT*%1K
ziQO=t=y(xSoX*yG1WLn-T+9n~P5$YIy(FNvd`~h2R%2AnCek$jY#BhNU#553eM+hQ
z7<N{#254~lK(vV(S%zBlvWNph%du0|hy-=SRb>t^wb@gqd2_oqH>TZ>C`XQT6=bPd
zyut5UQDSkNFrwtK9cziuGgiir(Sa)LA%ucDjcOcQU7!p<{W;xkAK?)E^>#hybn}wg
zyJ|q;Npl$?;^+2p;_ftLoLir(m+1Ne)Tf&Fex%YdLavANgYCK+fWmy8?A!dk&B<_s
z=(7MW9qycb!mEvU9)*1*G{t356oOf`JgXyNzQ~K;u7g|4tz06+^~>PIcRZKsQtkig
z2gBW-vO+7z2zg7uA`gMJ<%H^3Lo&Ho@=N@hrIpkJ0ADcAg!1+1z)4s*LjW1PT~pb0
zleMU~+q{Z3g+s6M{z&kNw(Uh`U<D|laMAsI$L79A<@P}}>tSV6M`RhBodAMd1H9RW
zE6E*A*76QU(T+o-w)-$)bMOT`3mrD62#>4;2POmybv|?7T#*)}uxoS0E$DSC4hLhR
z1|_k9+Buxs?i;pq5Z;lPzSVMGKA7A`l;YZT&wLz(t||oLYo8Tb`eOaYL!#^1wV<8K
z!Zu)q)6`(sm^uUGUd5mco1XZHwMJ)Bcf|G_0%)=LaMT$CZ($G^pSgV};Qdi_Y{;OP
z(J+TI4oyr(|G+)Z`CSck+{7(obGlpqvl-NqhVfY^IgZ5Li*MTv#p`8`fE~TV-|jv8
z;HYQ~Yy~$F#(#*0-r1<A9H6DQ07~9FKcO08m<b%`Bsz&SyODb*L|WxwZxfsYdTltc
zCeAs1@jh|Ad*H%Qm|lFoLHA;Yld|?T!#E!&X3oOQ;ltv3c$=^$BC}iuS>FVHd>wxA
zE+Eb7CSYTModbZd62T8)-4&l@`r@6z{3L0>_QF-uw`QNkygZ^G*YghSOIXwT8U>V&
zx*DhqZafV5i-&xGPB!Y49NX|0PdOMlbw$wNbm6y0rZ=YUNl#u3KEhLntOr|NT+baP
zIb$f;&H8*_Td^MA?!?pGHqQDgznUd&-l(<pqp5P&s+Z-A)=@a;v9fb8`GJSfa=LHx
zWh!l&+b!B7otB}BiL}ZKv%3Y=$IEC<aQ<e-4R_;6ulK+fC_bswa{hSzHK)<U1mOlu
zT4sa+*HcqQoV+G^E<AoES8-*-qFdf4NGB;%`uCA7uK+pqHexe(ENX+v_Fn<6Z4=x5
zMn~9RHo5@2BYFA7MSD*#2f}YYcQMk4mSx@${fH-_>YkUlTiOf#f}tio)vrW>fZ#g4
zs&TXprX9Z#X>z_erY4cl7Gp7wn5m1iMPUAqDAN(<Q5zE)MNV8|d%g*99x8lxvQvih
za+eK)1s5+9$q#*=esSYn4M4DC3__lsN*k_5D8yION6cOwwHm!kVr3X=d~I<-KA;gX
zpW4mBhEHllPcjli6!Klvw(IoZ3jI|JiK8)c6||G(L~ApZr%Czt{@sBLp={vkq;p%o
z!2z^;iHp>(pHwHQVM$SOIDWrr4ZNFn+x@#~9byZiv~;|47ptmjAEKR;?Z_EGMjzZ$
zJKF=V-bf^IpLRQPQZR`^)8b8wU@5>C<UifN%3gTv)dbp8=1QksPuucGz4TDu{2c2e
znCwT&6oCrA6=kv$hvVcb02D?nu*-*??1M?A!?%YLHGs`g$w>pcA){06$poX2w|vdX
zvOLX6(i~YO-zAkkiv?@=$Q}_aIvW<+D))xn*;ScKqCK|pYcJH%*8^*z?1R(g@X$yu
zW0iT@)N0X-8u(_SqNW_CtS?z&L>YsmP4!yG8fl<PJfINf@Zs#0{GPkoddd}5`EPQW
za^CId4+HQ-T1I`~9f?>PI1T0wC=Z08EtrB}*|g{-p5w%g)ex7a*MV(b&3d8#v!MC#
z*-K_btXt$I1~plG`;Fekk6pMAjvE{gY7vY-`f;<t0Y>rrBQ=kSp0%30uW96facNsW
zW`0dXCw>)RY80G`_z&GSR&4`^0g|q+czh6M?>VGO)BemooC&k0v}9~}lPHm^e_3g8
z9FQGZpIfAPx^E(WY!S~EVB5lnW88ZLKx#zZxd1uI?KkbZ`LA?%c(G#UML%Vpc{FYX
zRwmNAjxRYbbYQQ}vdGDHjW@n<#z>B<>KqA7#c#E&qM`hfuAR`zop+=%;Ph)l1iFb(
zY%RH~Z9sJbL;DKUMCRke&fW4?b6sdXDzd}#1Vk}zGjJD+pzU8x7!Kh1nrih`858d!
z`_qk1{rz%9Nkh2zn=6=sy)@5HoNLv;m74hUTmW1CwfR~X*H5lrURiRH&f~cd&gR!j
zVluzqyRaTN>;MJsn${E9Rr$5E?ar|Pu1SzzJ790<B_Et2FaVHiPGq37l*yhd7-BRk
z0c=8r&p9!Y?=N>t+dw%&q4i0FVH@R&h-t^`^}*8;?Ip4?LWQ#KixZ!Rpw9xM?~o16
z>bP5(zrbP570nCX^-Mob`V~$LTYd)v;SDaUWj4kD6OC!pryr^8Qh((FXqfQ<uwpz6
zaZ)h;IY0uZ{H&zd&Nus>RBxvurZQ!2^lluXmXrC|el10xTgpCHa5R;_E9@1@8&!NI
z|3*P;z=uMenXqtvy0=U(RcK?rIpys`or+<Qj^HHR0Pj5(-KIns)J7w3HlNaWZPyhA
zWA><Ag(+_8OXbOv<~wE3ar(bbpf4Vh-7suo9G5Gm-!OXx0x#k?cN?a|kl#in;9yaI
zY{R~6P0Jd#RQ+!JPkG@U>XRvhK{QBor)K8syRL~-{&Z8oF*6|3wU%i7*aEOm9(`cl
zVm`M4XOk#lMz-KS0OESSz;-E^oYv3`sLDpRWSud`O2J;^mrDRLW3lbE8;GOtLZ>Gg
zp_Ol%YwhpXx7O>o!dNq3#amqFYTwVne(Eazz9GQR-Z!>s*WGm9<wJD%_5wi2KAOOm
z%XL%_{{z`XL;ZSO-#v_k>q44NV9$Y&Muto^)oaRXN>Rr-^=razAR!Z;aXIxOIv;#;
zG9n_aX>hRU7nWNxB~8=L+iJqxd{ga!PW~{59g?HDhkb+xQ80Vzk%cg<Twz=MFWg#F
z@?(+bS0AV-yRF9w2XA&f)n0Y}_Tgcx;FX|#f9qUf`p3dg356oAt6rA4cn1vEnPxjc
z@tRtL&mu;O59c%kPRq&>DVz&^341E1U55Fi9AA3oC^(qH+RRw_2)+@uPv7_b(spR)
z<4U`2rS9ozM&*Vdl-I7^B>=<UZMB*hZSW;@vxQcs)jZ-XI(KqFOCz*KSI5p9&>(8;
z{Hj8wTt$kqsumZaXbn%Vi$0<fqjTJ7SAqHbmL(3cYbIv#`l=)Uy-U?+Yzszv@4(u5
zrluFjb-1YOz=ib&sYvcx=9+8}C0I1!CySkUPIw2@Jt7N1p5e~Z$p+o@5$PGQR$Wpt
zLfvpw*elPK1&w3kEqiFNhPjI<)q`q@8BaHs2Z9r-?$XVaE%yi1!f~NTW%%tIhK4$r
zw4EzsmJ_IpALBp56|%gbxInv6bJu--i*WsaBPpF4^r^Q3PPn$3-3G-Q<LSrWMy`_2
zOveEb=CfgZwnin|N2~$p-tx>$FeY+YS%ve;2KapTdktFl55wVony=T4f9u?78@Lft
zQ*I;klINEw^SXN?a^t$jvpedNL#FP@rY9{50LdhAktXk}<M)qDfw**2Bl$*Zw|a~%
zyHr?@Ev=?id~JvmjkIFMEI)VdrWua1j|NRFy@;1vFYUiCWVv^vqjjlWf=5KBf^D_j
z0|`@M@_r`(1#&2V`{+CsWux;xs<>|U)j&|%O7>ej!tS%Me)-V)9jMVp>o<smU_->*
z*>vqFU02Yjj48$X-O4m~KKT7TK0rf%M)y6z956jnhNL*-GV?_vCT`L{IMJu-so<Ol
z=Lv4pyr7kp5LNBli0vCO*LW;VU3!oPP?<XSSr%U<bg^_S<It&f)Te)Udb#e7C(;@>
zAuEf{gb3<y!>&vsPfqRrt?=~$ff)~~GuMU3vNJ)uLLaafoP-#&mmg2a2b{1^(P{=5
ztKytFz%+vRHkSt!3u<vI@Yi<?z^UOjl5OfgQj?w3DFH|CI3mZ)y6mnbw!^C7I$yb!
zL&m*l!50+^-U6wj@6WPC<eY>?ddtc_LsX^~(4&pBA+G>q<{pM%&84U4+}AC0zTQ1D
z^i@v{P9DtQ`O#CSA#bIdh1luc^j*6zJ01`%zuI~_OgU}FHMAX?{g2V}v?GQv15dol
zv8fI)TeWNjUz+yct3RtETpsg+tuq>fo2)QG4CfD{9AKrir>7Be!s4^eoGb;$Mjxg<
zhhnl!hMH{GYaqW^nWjZiW-|3{$BHEATw6c@id0_gkaUX~aDOIi8eizCb}{(yh$a})
zvrWHA4MAM6X5&187!Hh9w<;()^kv#+1OqH4Wo%jDDqDzyL);{Dx;@afh6FRJ*|q?C
z6%JqDEC5OmL*U@?Y@eqGsq?-ydcVfl05ytTel1%+7~xR_z%dgnrV(;^B|noyCCV0x
zY-xm6DJ(W077?xihq)<HatvAB%)Ppm`y6#2iVG##-QSZ)gn{?<d`hp`c#=ZHgv4h{
z$>#-muunQu9HARJp4K2gBt^IJJ{3m|@H=S)pDqkv)meQc&#IM2QNu4DME#ld%gF&a
z#4%4NU4xrrLUcXuQOSZ<GtXXcb;}BTmv7HwPLe&pvGYRrL{;J6FMy(s^%c}K;3c0p
zuW^>Mk>9xOR1T>9aN8`Q5L?8$Y~;M@hzx5<HgFfS^sAqo6xSJQe4cfp<^l-(k)PKs
zlbit=4Lt;{66tEXIgYdv3?C`9nvh9egMdr?o>4&395w1C!|YHnG$2wRg_ukt?BK3O
z`A-Wb@2lg}*fd;xgTd3Q3JPR-nETEFVrcOevFcx$HDj}Zw|+^P1mM$05Tz_pC_ezt
zrRAmAq<9N^Egj*yVL$&kI*=8@A*dD(&4D8Hpz$IzUlMoS>x6gNT+}qD&ADs!ju?|U
z_{PM}n}YWmtj-66w}#I%3s$Q6)QfW51rwpUqBu;yQ{58l#4*!`m?tV0SGJNYWSND@
zI-f=-E75$S!Dto$dR<>AG=A^c!sWOldD2e6bg2bBH50NYEiq0yYxB!pU~V{QB07~*
zjT&KEMBZKz7kCYjVdFXE4;i>axQU}($K5q)qrLcnx}_Eb4!9!aZ?XCuucju<AEgc6
z+reej>*i2xrdnS?$gg5bRv0s{p4R?_K{PG|R>?Uh_eTqk4C8L|ylr~w!_U{QkbBNM
zK%}%H$Bg0CGwnMNL`aY#513zlZmqQNGJODc_A&B(P1&sLI`e`z+FWy}aS0BImftJh
zc0uSv%i+Ss376ka?8wZX4XHTVjpAK6F`6x>{vIDz(k?%+{B0$F2}ok5w2t2%8Zx+a
z=H71fPCSveAn{l|BDWRqo;sZifH66)p31(Ie(%I(C1v5q@<wU#4WeJXZ4P5jOmbdy
zhZ9O95}Xj9yv7M)2<NQ-AHKdjkm~RMKT2`SD3?lEDVZf1*&;h!+-qDbAzXXs+H_M2
zm5_D0My_iWu9-wb_BF0OQrRQfGrx27o}bU>`}>cs?rWUqIp=vE&&T8O1dRqh6c&E#
zxlzGeJ>Hq4n^tO0B)5}CCXOg<TyH&;r=!R$4Y_m{I5O*<;+dsIe1~0N!+0zmE68l@
zr5H!Ki)%kYCDw%>nd}e9ZJv=I20rPiwWk;O(i<WN63y?9VeWKhY1CPF-x~%%tMSH0
zvbTaDSH@yn@ZNgxMUK3B`EQ@1u7a~7dd|b-pgfplqpkFkLrfTVrx$#s9r;vYG4flV
zr;fU3s>SH6_K1S@)@;T@<LJ}|%Z&c1mS_N6NJ9iLN+RC#Eidph;XAue$vGptgR9Du
zFGCE?5ZH>{8FxENL3k98tBC3>s(6&0fHt_}s43Sz9mIb1&SSl}RMIQ5H*}`?OcN}Q
z(d7GSv<@O&lE1I}b3?g6GJmOk#GCPv$3H>$s=*zAMlJNm4RFuYXq{HBlOxug5l6w>
zCsyC8Qso1W*8Iojs&Nk}kW7;E?3TdL*^{@E<O9*AZv>>`*}aO77`E?v0X#+(S9*>|
zH~W>)pwT12n#0pS&k|CyoTqBYpj#L#agK5ote|n0W2Re-lCs}@<fWNFETPZ#G@4_@
zQu(4Tb|?>H74Q7|7&Yy83WKNpQT9GS_;`BNDwKZX!|E*2e(U3n(MYR*?~$}#_d`G(
z?R(?T22M^{DVoF37XYyE&(M%AC#xpi=S^Taj0CqH{s?vA1O448)6ZKkgRu`y180q6
zf1n&zK%an>fp!3yI0HLDOY%;2m?EXxLP-n!wX^I!ZVEGrHD-)PfF2{`rCmb1lN9W(
z1st-V+7F7r;V_YRTnzWp^{wFIa|x(4>5v}Zu`fk=sEdKyYXzHjG{l`?t$`dR?!8sZ
zy*M9ZafQpB)PlVX9zyw!l%|>8`8Ww*ND2RkOn-Z@sXV#~4KwJw85@e*?i!s323aQ&
zmz%C81uEp(K2VZ!=f1GlH;!vh?`FLq)=4dos?FAvqjX8*WhFZ6zY=!9H^pGj>@@z&
z@}}&`_<3!<S02CwED?N>Bt)9v9jO$%@V!#Nz@c6edXr*8oatS13{ly+oTy~5<w`H-
zB5nm*Uv2lmJ!laq(h{U8VAgcBskr*S5HpG=YnkjP>A3?Sih0Cot#ysl!o%IS)z%{|
z?3Nyb*CCdKZ$BXop^0Cb(^`K-2%g#vk!cDF$SY0=(`Y>v^aN)wqc0Z`bLvVx{sa~&
zJ3X*Prcx${c|}8<31t2FNE5U$!aEaKcJZJC1YtG6z$+2B;t4^NXE%n&%A32}q4tKa
zq{O<}=7B}{{N~ezt8!^jIxhaX<O^KzAG=xWqUfk6IChj?_N41>^~lNTYw$!8{ii8!
zwV3y+#}K9l*>#lmn_aUN!#UVwUAa|w2}swo4TTKExrPP(EMkW|yZj9?{l%s8RV$rZ
zlnmBFx$6OwBTs(+Vd_*0QK+j(WRh}rea`jeh~LUs<bCIlEah6|aYl3zA9?=+%uW*e
zawMR;Oqt9tCY8|$4V*O@Jx~h{;En`ssMJlGh_7!03%H^z$%@R}l#<&_A)H>hKyoE&
zGYh97iTF%{$?y?*N?07bd@|B8)F*9^VdTUbHz`<UD3x~EJ;KRKM6}mgf@1o^nJf)}
z*O+l1EEm>{_k^7}hFH~LgXC}EE@Ie<%W7%0g>0oHgbm`#cpz1|0xFpez`c-#c%fMu
z>8>E?lu3Y^)N3yI*%u?2U*HxkK%wfPGp0*S&pFRMyYjOR0>5gk38544!qY9E<AL6X
zufDpFU{?I;L!gj@hcWXsVFW5E<|k+K`AJ_|-p4LVYq@S%OAIKU@Ns3}CJSM+vU0wx
ziKNS4v%QsAuB{Ahfu3ve`jvbO*H}^(^Lk`=7T>Gll2jSy)e7D(8IHNdSZyqh>22^X
z_#xVVqIX-iudTFq1<Ub&`&kgF=RziA3D}~4;0|Pm89^%e&gP2@fPPJ;Qztg5PGMFk
z3E0peo3xj`wz1FN?s|;UM<4rSO4()3hb3LxzbIq<;DBzYIC-{8$azq6uO@|oYoYRd
zp>10IjRBV)9O<!=JKDl<N}cTVRp7G3OpyKyb!RS??;W1y0w*_>$sQ|Ma<nQ<0Q8WN
zy+Gk@T~<lC2EMHP8F}~j3crK~>4kX?Vl-k^r--=3IASFB%q%yF4&?mOV*(xwzmGp~
zpT2lHBEqkeus+2^w;{Hv^jhYx?3Hr9OurFYL13_*7tLkA3C7|ZjL1M-htOj*peOXQ
z`%4>JDV-^&&deqS=nwK}r)L#>`(c1!2m<!4b_ADig?WvawWsjok)swfV0n4OyF6^&
zAz4Ey8RKgb*$G3L<2wY-t+gs%1xkiR6SY%z-mkELx1%&@h*tZUt#dezPLC!(GMaEx
zk;UOrtWTQ>GN5wMer-2DFar4|CpBT)6LE6UG<kXwN6{b<W)gVky<Kf`cd{FifvoZC
ze(yMR`S4k<FxwU6VhAzCyT_yhMfjRweUnY2AROwW|51=;yZ9JFB{sLs8+1^}H&GeO
z2h-wm^BuOb$JB=))EmlFmk{obJunfaBDDp38++KiH=O;#thc6&D{(~Mm=F3%AHjrp
zN=HvK41MsL<qjNibNz;ysmyQ};bvxUTIuCbUUtu+Z>DkIp?6EH0kjbvR`3gkHx}km
zc0Th69-wdRhr_TE`n5a#A7yzOtX?U<orv{a+35U?)X<1?QbH_2nYy<V^Xqtst1=+#
z&J2toJPZUNS)HLauE}#ib;$eCGHE$U;CVj|5<+P)TlBFRUGqTZ5)FCv*GG0VqwRZ`
zLyY9!Bx(!~GwfKq^=r**eszSq`-J$k9ERgEq%t=<i8`2YIOmnUzJC{`A?MH6`Rh)U
z&aT?DdOrkLx2X<N9cp3$rPspCrWBH&wEnw5>jGCX{)*V_2HzgW8R-*Iu@rRePqZiW
zC2im3^L?X;q~fki!;JugN}L@WaJZRNO1g(Bn8^@eC%D~K(|5Z*DjJ=KI-YbFVXqo{
z&4EP@Uia<Qdxl3yNqc-?^yt93g*Pkoj<ri+yo!ib9^KjVf$xXPkg#lSqR%U377YAK
zBPZm9!`B`L?U}q(g#DE74k)CpY&~@XFhFW)bNt3Kz@>3UM+K_G-aaidWb(1ujeo;C
z;x(_N9Gjgq)cwseHa4l^nckGNqo|mc>65Ah`i6zRUpq(Nq~0=Oc_Ea^vR;B*E&$Hp
zuOo6A?$^6P`lmSbWSE2R4U9V^W>BSpwm=v4RFd<VE2`4Ei!VYKI=)$i(h)7Xr0m1s
z<Zx)`vKoHok)_6a!;R0(hYeKDj2g!Y!iAOF6-*-P#j4ccjn{FizIgozYRT5K5I&56
zk-d^1ICv!H1(W?-g8a`p7g0E5RNb2VT5dPj?+?Vcf+N~QZVomQ`SK%TyW^W?T#X^p
zAsgFeBz|#Tur%6Y^xJ@cZYqFkF2CLQYVo|v=ouk`yu4w!%3NM!!Ka>%vVol3WZ4}$
z*t$*A4LgOHKL!%_*`e}hmKgOP{?I->yjD?!2|>K^3n&#Y5M42QOJyEiv*`;`3dZ=T
zhQR2eSCL^ieyLa4DZ!RJ^A6r0Irw}NecbnF^tG?Fo&fLZ-ZAb~yS{WznOmw{*my`{
zC55nliNKm2@>t2wy}-#fd-4vA@bK}IO+MinzkadYss{F`nN51XmZJ%`2yAyc`vAi0
zk2$`I2H1MqIkWum1JtJh)@^g|_7zHif!1-7q)f}lbCOqARN=u+7|v=|&O*;4|12RW
z_EQJh>Y2Ki&h7flu8BJoENVT)HPRu>VE*SY>SFF08k)gVrd?ZQ+mvI?%KWzqT;oaN
z6FTiklx327@4a&55a0PsA4?78SZ-8QhO6`-?v=V(94%2BHg%&Rolb^$P|sM=xu#UX
z_j~Fo-{8HTN_<nqFKyxSx4^>xaJxy}d6`YNq{lunY7K#4&aZtXRq_Uwm<rmTGn&L}
zGNZPc*&xaRAj6wq<<Xw2WX|%*xH?_+fe7i8Ca8*_SEejtHIv?SO1A9!e*<tzZsOb}
z56<4|HqhruEmPuZ)L9c`kwb7nMH)g}YrVcZ*C0G6AHswR{64Y_dC8wX!Ei?Qr{puq
zFPV|Ylkh&AlMcPNd-6^6{HtLhid1Yge(}b)A6g{S5w+<Ml}G?ZItIW)Z_Y35JRHjU
z2Dd#X7g5RMqHobW<!2j8!#c%s=~YZ4>5%ux<ZH3sd<COd0`Tj+UTr4jiN2MfWc!NO
zK6e#Cmv=nIwq_)^Nvjj~*#Y3>HCQ9@+`=a@3l3)>-sPFAaPx-ySxRw<rEdnKMW?To
z3KKo5L1v7`TC%7|<&Eh5_%;|UNA|-Lp0Gp8a+}PN>Z-Gi4Mk1NzJ<6DQ!|#-QvXKA
zm?>i~c4#rZc?oF^{#0XPfn>L=(S97Fj43M$?vi6h9c4YaxCxxKIf;Ru5m;p};T73u
zwtVMV(IejB+_tqAAJ}t*b_CR`dX=3aB`J>amsJ{Q`c~x~K0G-7dj1wdrPX2`_DGFA
zEzBC-cGEGKa3H(Ql%;8I)--;`o?^_tGSUnVG%cT3tCBuGwFNsQ;Knlg9->fb6uCny
z;?1@&O3I^22)tTt`jja>&L$YgUUK4JOdBR9H&ePn&SjNw57)RSew`fRPqQDQw2wKS
z6LT)+{?U;q`VkbGSk1Jz4ILS^E$}GKAs7oICU}J&<@>?eG!}_6!qF{~NH&=QcQ?Lu
zsBuOKT(&v=>-!VlPf%mMYgR2UuGDGwferD%L(5iG?)#+@Duf&98vKJJTHM?Si=Pqo
zOi5>#51Iw3oIv=%LM}SoQ#ygPa8`vb(nVEpB@tM&Qmx}>4cbLx6n@>E*yd^v^0E{-
zz~mc4oC#vUc+dHLl;edqb=8dB4}*ey^+d1@{l?S?c{!n=C9c6xI7o@=y0ZcK{zO{v
zFN@@^Z!pBCohOMhQjSJTf*VuFPYBNrDLm@_0*|k~lF|J394Xip0DE;^-QQHx`f~$g
zk!NzppK?S%P80%PXT1$ks9A?6f!96RoJK3{$_!CXj8_+H=1YDpv>t|(nUU&YPkLaq
zfggElD5X7FI607oI)|3;wbm{={tGsHnZ5Qt!E-}c$h)?NLT|xKv{OKoO>$9?E%X*$
z6o4&ueQvvaJc;q|{v@h3&cmK|%;u@>y%hK^pO)ME(YMH>0<dq<6^l_=gO!S7HEqp<
zhQl4fvz0<nC;e0EWcw9k!#+fev)m_^36f)RVJ$%#Go%#v&T9zwWEhkzG9_M`sF_u?
z&~H571#4tJ=!%K#)*-XIvzoL?PRr{(&|g<ZUV|+QLu$8rjf<kHuxJj$YDCTGk{WhB
z*~ffXu<BxWyhI@;&)jg#Wp6hDs|;s<4C<)kfYQR>9xqJ6x7dZ5)~{c3u8`g;_VfU8
z_$QkUuFqB2sy!lcEKar(!U^y3VM<!;`f!ghl%);r_OBv<6^_a;LP;%Y@u+qZDnl8e
zw&Pms=%Ghu6?120OY)ipnEj%iI~9gh4}xng_hACeRJi&rZyLEut0C34-eBw@#n0Jp
z9krg}W8}``a^jj;N2c>Vr0>Q&&MO$-sXwHS(#vYRfH4tEbjZuG&oCp$Dit*D9Jn#E
zovqglDkQ-S_iw@x8n`RkZzx2Q7+z}x`}I*mnuIg5$mbO9+ry)qrAtAEwkL*bf6*d}
z-4iG!Uq@soA3kT2GuQ^u@(7kTHP|Gx)%U?Z=GtzvKVO!qmeD_}znhd2v)!4jBnDaO
z!`)KFKQo!ySTQ!ZKrh#Rk9TVLfRfoDtK4A(J-^G;)wkTr_YWyYB6IX`@%g`eq~ywT
zSFs9WpkK0!N#29+!$sy6!q2)dHL*iH24*Hy%JWvcks{C&UG0>*Cf|)%Vw~B~ERIk+
zP1IP=7LcjrS$G{co&rvrf?l{qjEq-|Kxu}XYkaSCwM%@YyN?NRaT~ZolS&3ldkN?B
zDlWHd&Eh|%6%q%|QaWtZaD+qV`&kpv(R7%~m9x1%p&C@oh+F!Y7bKEjguwO&;!~;1
z$eJ0hOvh#Xx^v0gHvfV&W@PXa;#|v8#e<`~f{!zw<Scv?QjU8Ky}hg~-(!KZ7#3V0
z?akI0B=SJ-!dF}K+YJC;VNf+amO?3BB99MIHr5j~8$M1CE1cN=PUNT#dC=VpQozX>
zU8+|)`lF;>UJsE1HcMsgJ|SYaKBmmlD;xA{xsXKLDhdrFmA8pgaY%ke4)PhtLyPsi
zlPUeXe5y?@HtaS^B@*=2J8fPHT;Ny#3HU#DR#W0qIkSY>Gm_90DbQ{(Oo2iBPUH31
zxP`^QXS^9J1wllQX0*HpLf7k9tJ7qx7u8p49*5B&Bc>Y&Q|N8@VDhz-+FYom!GhvW
z!nV|<#g}gJ0wNdaF$l4Txo*z{8a@y+H2ignw_zW{v*Lq6f6DS+4e5zee-Wsd{sS#?
zW#@g%*E1mp8AK@UL>vtb#5pj-VjXOdH#Uh+7!jA;eLVo`qDB>VhcPu);DkfxFek)u
z1a$m;&7G)vb`})5Y#D%Ee3JIGH>E4B^M{E{rbLB@MG#KNaa;P3{)2kA8W<h$<Ie>&
zb5~P2L*U|1DSgQ*+cQtcxDKxqiYFQ5YWdxm(Q@!7hSx<>V>xX)B`9pr%=p_5$_45O
zx;BQIwao&_t}?4&4oJhs?0{QLfd~FTmQvb+8)q%9*NiyTHEY&Mn!S%q_ym^j{Fa?}
zB_pnsc5jKN7C^P<)+^Gm>eYpuig2>yvr!MR_RY;fdbzfhEv~)%F^+0&;=qz1rhjq_
z_K}C?_^`iZ7fn;;lhMwWg8Vp710f&`0iK~7-{0NV`y=t%Q&xQoAmXq;R~X7#Q^2%%
zQ2w5Ok|)cK1bvc-(QMjuSB_di%roSJ=YjWduF&mmI81RAYvEWxJi(||cOv-QCSVUl
zI;&(v@rS$y#g)Ss)92g7{CJB>392dsAs<qo@bX45un#zfeP*2d0MG{?ald6iy8BdC
zt}l2LA?mJ%M}|?;L3sOcr2B<x_V;+NJsMFa+&sUoTK+)LLl$xg6Rj-V+PR=rt$?wF
zKU9emR=UoK5d-D8TI_pA^6uHBn=W3-Iaw&^!m1vVa_D1OW#$2s`U56qxeZUS28IAz
zOZsRDP*ps;Nhi!rzsP-o7Q>bU2-Q$tTEts@kosYt-5+pW{V}R=MIC=2kpLs!p4#yW
zz~KZmDQ#;U;s6hlb~Rw4yjo!91Z0D+H0z7n$nmOQYnwr!cqai000sS7NS!<MpQ{8V
zzWogR`SD~jPmFBw#NBUUUJI+F<$R8#L3}D>>cke!GeIG$!bQ+6e$gG)fL{_T(iFNu
z_nS-i!h+X!;A5?Z1to*}&6jH!E+UGbH3BvSOARY;R3>(w>rXBKtPlQU9<JTSzP8ER
zdV^cr{Ah4sLC$iwW7vLhSg#&5N46EbfFgAfuf;BB%$P#jD{%ur>``R+eBw^<rCLA*
zz~6Dlu(2dT5HL6&;)hZ5*<U}XtM=f|4=9@vAXxDCCKUYMjJ^bY#&eL+Jk5WX{wY5p
zxzkYs%olYY`>^*iWV1Ao8@E?35@t3H{WDWnqQ<l6u4yas-TbQiYP$i~qwZ7R`0M7p
z5L7^mjlh=v<mSL<(Yii-NIU3Y!;H8}tId?V`=I`~kA2t-z)I;kg5rC4adapQc1Ytg
z0^xSih=s|(HjV_VCQzC(iYw?VY9gGLj+r+D+7MULt^6ih?G09kbxNs3C6vGRT*R4r
z`|;iUhc?6xcQv^J7p30I(8Y&G56FTR-VdFpcIG1laD6jJag8%s+q0l`tvKr91mkeB
z2BJ=Ev#23^`&y@j64ikk4i``*zOypbF#&i;_<qDdWu8*Bl4uoUmnv2SIZ(FrOE&jH
zroegrjjUG=nGt(K+4E_jdR;?kh7P+<Z7x&V-8;#A&f4YM$mua$fSppr9SFW{(L4mm
z)v9%AZ@|j&gJdn@m4b^8&cUKlMJNLkW!4}vJEj5pyj7+Xxj0>^845@WriHyPcdbdD
zsTSqR3uS?poj)<MO3iuoxB<p0<$00a-jVz|k=lVr)#iwK6Hr#k3`Tw0g-$X{s}gJ3
z02RZHrB*wG^T1J>fSoVSJ^uXAJ_CfH+#cV?bgd-u$9A!c!VeD$K?}O&69F=1VQY~6
zRsw9PY1MX?k_oFFtr&fljn-AOMCma6b338PD+2MH1fdQ{S(b*SzbphTyYVIb4nUi=
zWIuqirWk5(i~%vSDA--0&|Ch4%yzz+<h}!H+}bJkbdGM=TcRqTiHSYUb#)(f-emq9
zbhH2#chYwoA|?ddnZ2%tzfu(bJY;2hxIeh-!x%1tW1!@L5EQ-`L7>@W#wNj@CC3|m
znE{K5&bT3&mDoe!CCn7JnsId(Ul6;VI{xCT0OAcO#_P@I7>=}~ho;_oMASrss-;bQ
zmzKF|=oQT6AsmxA9)T4Y#}7YDh3O1dAp191o~@X?2y;LgWG|WxkmS7s+}k-J49Ct$
z2vsCzH3k=aIr7md?lo*mgS)n3hNCLN>llQ4Q#pFcOD+DLb&{5x0wb}xc+kL95P|<G
zn9^OMP;iFu;(NsT*?u(uyTzODZml*G`yIMos;noCzHKrzr5o;&*ok#i`*edP7i1b~
zUY!~f0Ls*nEv<0V;uFbe$V3KQh=M(;K1X$PUYYFn%tA6&={zd=rGSplekFHYf@X6i
z+tfFUJx$;m|Hc?&jp|m6zuf&5i$?Zeucz#gI=JYp$a|=++%=p#rN6M+itNjd#~|6b
zwzJNQyp$W(1A5)LbVHgM6!OnPHa=ivV;59tK75~x$Xhu}>HVSB+wID3_g=w|v)v?t
z5fkaOAv7yt45A^DlDkfPLhh%kdzWv;js+;ko9+vY|H6=_I6^@OlGjGpCO}2es+(6`
zIrcUnbLy?wQO>dCQ`!;)l}`W)v0jOq1mx6FdnGkt-qOcCdj1Es7xy&^*>Q%E0tHhs
z1mp%nP-$R`)vrf_*F;Gkr+AORJvI0Phto&~?dpYlqteSRQE;=l%?mUEV$%u^omn5!
zpYiUAx%UA80{d-A@GIUqj)3kvo^;DcSCf1NK5Ye>J<oa<Rz+9&Bt7woD12R5#QrPm
z#V_+Q>h)`@u7pxCfIufXr-J6h6QG^+%%-Gvrv*4>>xHP`=;{0UQnJ0vL7}$<G)`PQ
zl;UBY{_N!1V6D9`0gh{&JEtrx1;74|T@=2W0Adg@3nMoHb+B*tIsBmp!eNHqYKo)R
zdo2P<#-~Op{LCG<$#649e!d7_Z3=Fg3hKR)$NPS@;l6-Z&_E2wRlrp7TfAZ&Bjxhd
zT)~J&aL@e<7BH4n+QXBcuL*ijx#K2BNE_c{bz4B`X($i!Jf6IZ-|l40%8h)A9qdBV
zvsNaQ6f!5-l+6;t;;?yuM^I!l461{WI=)+Arb_9Fq#x($rk(~gx5oqlTXslk!1mzi
z0y$oD>>%+>r)N2`ZPPjL)8o?yUD8)(e}Q0~y#~!7&8iN;5a-NiNsf2SR#Qs9Jh#wY
z>GJ>>O3e71Zx1hGMbPrJQKjiA!7;P$`Rh99kuJ|iGTk$m7lL~I>1)r$^dQJA&RTIt
z$dFccRCyL5!e^?+!k$BMmjC8l4A2SQXy<njo-^ycGo<U@tGoWh(n=WK__;8cbp6O_
zIhVP`G;4&3ZqfAx7=l>(DyjNSz17lDgkdSU2Kr(xEqrIuXT)E^VY9;Vx!0b!$DY&C
zP!}NH=$iLEFd6gD?B~zpmZDB545pa;z%e2g@Jd;G55*I@KVZzoLbF^KBHqSfOI<Mm
zaCbYfJsiEnn)IYgil7~4+ONhP2B?k=JD?ZWF&DMiXDI)S9#))>d{PC%_Sp8kw=a(d
z6{q$W!F=YEFw&+rR<_cJCWRzf<9xslkOOO!Byh2vhsp_0D{$=E83M-XbfUpiY?a>t
zlS3S!*siU${hC)76a*?VU302=rUzG0i0oS?pNmJfUp-z`j`c|TY?xg6MwyBq(XlAl
z+84=Q@HUN{p;dP<m`^B^?n_pau>&j4&<?2<m&s~zcZ~1sSh-(U`|7C<&xQuh;IoK(
zot~i_1`Dvkv%&Rdz4Jig5G5ysu`eB!txF))q_a=Ce=WCI_RWvKw;-1}$xp+F(5_2+
z9K10W;g8_$@_a$fO-LDI#2m8Cc`^=US)+L@an48LeWIGSDzT<szci@Mm|N4FQNn$A
z)4)PFk>mC-Od;wN?g_Rq$HI4e{YP5GQJQABwm@N2R^TO)s?zl~w7}1}&5fR)Aoxv=
z$wjZeNr2Pe95by|(neMn#Olgd^4=uuPRr}RfE5`LL~?3bp!#1_sf-X>gCMQX>{VJv
zn}cp#cR+qM=e-NX>Zy|hg;!5P5bK?rGg6ME^y}h}@`Tey&#i#6N0h)sB_d=_v#Q+7
zG60aSV+il30wZO1B86}v<>nUcjD*EZ=SHRWb;xA^w<<~gq!ff}Vu~{<2e#0x14EQ%
z*vr-maHn8=FFhj6Bitiswy~ng$zdJ4hU@jB>18>$B<Map=|Zf9Wv9bvUaHEf@f#Oz
z4>Hn9Ecv}U-q0w)G(}MxrUrORQMCSlEuMe<q{&1Z_`dSqaJSPE;6+Mk2@~N(!}ho5
zC?rml)^RqJ4DR7|%nc>x^GLN=F2aedwc`u-Qh1HJ_1_n%W_pX&<<M1vJVW5r!0U#u
zI%N(g<Su<kZ-PLsINWYw2inHe1ms8i6AG8W7Ik{-&RYc~vW}Qv{zEy0QPaRDO=eT&
z2!^Nu==9)Fk}Q$MYbbE$l$X!~t@g7B<a7Ia-B`zkYm{>zi%HYlBeTPA7Zfn^LyiR3
zpW9?K{HF@Scj*{SGW@z0!f*%IB(&UeRaeZUuW5S|C^e3jH5WJJHQ*N<<ygm`d*2h7
zvo|k)7PjE1UA4{p3>VlJskUaPGPrFc7gTM=7m}XV`r=Av?{Zk}{F4IS`PDa5W%VVO
z&}P%_xs!}?3tt61x4G+J$v7~qa!L1G#$h6QWaVc{@@TBSDKWp@)qRo~y|6U6)NPIH
zg<?i_aN8#HU!bh!py6I7=j@2lyBl@KFp^3I<T;<6o?sk|lCRkng0g{9gk{gSL5{~3
zX*+8-*6bipHsXACVrdZ@%i(CqBYskjfEi@(j>F!mi1EXArH=*j_}`4j2$2(C>ZWlZ
zK6yqE>hb5EEXV~<i5px$Lr)_^R=Ttzud1+airz$2t(;BYtsbj@(YDIn&#(gJC^L$D
zO*<}7;iGvuX!%E+{Aru;&|ll|t_BsK?*@W{#p{{X)NYQFHaZw3cx*--i4(3VB3qj{
zw|*Td=eYYNbOQ8_pX*-*EZkwfMF4yC<fw3bkuJ+M$WgtnN;9g&)BNq#=eH(66pTI!
z1ha*I1=_!YFVu`W=?2M0spS5^J3M{pHodQQABA!fQ;?KtZGbr~)0pj=nim<t%lGyE
zK&8D$J3C|wpQ`lq_!e)VnIN~bW$YC~_iI3zU%#ga1n<6IVR1^`oh_USK>E<apObPz
z)`><Q8t-v2T1+pCj)w@N=&4%K9vU86C3*-Zg+3@m``$nrMqqbp2O(tz`hDLV^llC=
z{1U1pDb47Jcij-=dfD(q$uh{aoD}%N>)gQoLUX)yG@*#E`<9yE54&MsKwvs*cz3B?
zSAHuq@`ht!<sInF$TRMBQZ)CwC7K!J%EY-jB&|>h%e@AVk^mAw>|IcfbDOCggASdN
z@o{S9?T&0?tiY@tM<rx9)&Rm-B4CE$NLJIE>g&;A0SkSA)Pa`!A#jZaRcC15Kx5Qt
ztz8$MHDgrui<$Q(ezsD%SG~81lW?{NKbZxal@AW~Q-VSW8|_zdAu$$?EN*<FP3VAG
zW9T!W5TXf-$X-LZQ;GZ1U~m9s58TN^M{|rqR&$Xlh0X)Z${c49;jf)5;VLsznK6YX
z=43jS^G%{VJ<ez|!2|lbw}jy;s#KS2Q{y=q3`Af*a+E?xgeEh%9$@Wu*I6LB1ezDB
ziR%WPS3V6pqO4Kjz7S~(uMU+u2~+gWl>7H+z~|`YYl#(GlEtz>di-U}l~M#=<(|k=
z=vL}DOkHA@OcI5h$rXU>D;gH{5?mSq)bvk9{=Z^pzvygzsEKyNK0`Z}iEt}(@gdT=
zuNL7@U*MG4I0raT4LTC`88?J&cO;HNXr>1{*GK$a=ZLU(ADx$RJ&2_9?fRSVhw*R6
z^L=RCV=w%&$*WFzD_p3dfH0>6`5je`=24%G(Td(6S3t1*u%?(~)^+bCdl`)iNVGMn
zL=o3`et;TIkQJ16F#;LqP~XX3bwq5emy!ww8*X!A;5%bFhbH7+-nmp1@~~dgU737R
z$@yvbmNn4Q%@dP&bHyw1+j_BiNiTGgY*xchQufqO+Upi(GS$~E4hI7|7g2AJRr`f2
zP*%#t7--)bATTa$5!P@7xdnAdU;lVF`h&A=rsHu`T4qBo!39%j@YKJ>H4q;XC4jV{
zkJ@0-nftkE^;W3i<@OS2L<%{arzDJqH=ZjLhR5%UqfTQ2DDr0fJaPyo&RjdAirsBM
zymGYpq7~y#eDo-<>e=dD7S`@rB7I`T>_;N8%Njo47x`F}V^li2f`q}ITtPk~UdXhN
zgr4_~Ksxf%rmVb8e}D;gA}HZ|MawZ6NcN9J+SPSGpgC4$Ndc#!$0J?e3&gUl4K5I1
z7zJ}>;(4#p<$FlEX+J$O^RR%ZgAUrFV;BGK2WYo$503)Votc1+Vz(XEb!a%f-uEQD
z(Aw;a3DqSc|8fLjl&|z8iaF6oXrbME7%Jp3y4RmJ%fUdrm6t<^#_F=nRRYH53C6A>
z!??*5V_*sNuhYbz>DZqsnpZ&xZ<}~xim$!L4Q81nS$#eg7EX!`p^w$TeQ<9=vNq9}
z=0jOBq@d^_E22!#`w{8xkDB$C6Iw!g%RIyil8s-y>zX6=50Kl;BUbB2jgab^*BKB(
zr^M`GaQAwKz;Bi~YRQe|28CZ&AW!YTE~bYwMDn&Ng&tZzRT^Cw!7kCA-Y7_ObO<+d
zl^~JnYUj9Ta{p~zVLV^Z=O%L_ZhCFED&@K?bD#d~$uzOhTz_6MoWz}$Qxs0QjZ{j!
zaHi`(9=(yAa;#m|$t9h&2*TB=Ye+u@$Gs%6?i)nj!P(Q0d-R<0vV_O!nhGv3fn4Y3
zSWbB8g|@Ey!$a0G-=8SAfOFc&tlMe@j4WLKOi6C?yZohLhz$U{mJ`C+%c3mt^k$uF
z@I>Xf>q>ETb7+Fl_cDi$<u)MfdO9wWT>N<9GR--qOss(T0G9JZ@%9T@KaZ&3nu~-;
zZire{=#%`x?%iNOSyc&rRv{5X>2e5Na*H55;>W|=`2bI*#{>dSU*I(7-kl^oOG~2#
z#?`BFX$xD%xj8MaUo?oGU$%@u_~{7KuA*N~KF>lK{CW%L*8Gv`#lVI(hh5WXKaES*
z8G1eAXW(2n<dgNQ7=q!0IjD77tF}LqPRPQo#sD__6t*-}-Q1wYRWGD38n#*`x{y;k
zi?`N3O57MENzGVz9Q?W<igwMt6>XU<Ja4?>NwWfftsgU6vF-yPISc5gpAiIMTl#mK
zvv&g9C(DT~9y&sx^t=iVtGVql#Qz{v{a&=DG^qj|&6r6V%FO0lnQo7HCx@DZR?{yK
zFE>hexb3d?sxhx6-5`I!@-}x*D3dvrlo8?GkQ*!xO3KW+NiYsk8}$i;*ficxyAM?Y
z^M-XURWb`nt7?CZAjH{><d&rKo)jN@iHl?<I30&<m;sq6B{h8<Z(vI&#b?IMR@D>1
zfwqTsYUEKLC)~sN7(zZ)C_9Ih4wFdk_*8Xl2u8^QP|Kbt$_G~Y0TOTKyGDSPncqDj
z)M&v<QPo76VXJ!d#3L!xeV;Q0RcXTqJ8QDGFF{ipJ(mn#-h4=qw_h?)j-4Zf#6e+t
z6>4K#M2?>k^re-1g-wP@rx-B@#3}<pJLMN(YF}%&I>TO$WSqH@qnL(V&SiPAwo1X*
z#v0xz$9dTho?r<N_-)c3ddW#nzm5}w%PAMw6ql}aTHgU>lqiC`JsyP_4dVOpS`zJN
zZfgDw_C$+UB|?cMh_<_E3+^@e3oWf97ta#kJt{GDeBf>UaWvKtlP@Ifr*z#MW3ZY+
ze$WBPq||jmkL1h;&$Sm+84dSt0=)0tF?MM&qY(WfHFSQH2_Q5d@7~L)nP@~@XRlen
zhnbsg2(*nBD85uo9Z|3cL_cA@yXX0{vep?P6Vtya;w1gcug?tWAr$4(KF|**Kuo7M
zoa~L-FG7ya$US|dB!L$0sVV4nAC8f}SW|v1N_A9M;{e+ARi~z4Hd{Z<N<iK^%H^G?
z%eWoZj~6}SRiM<2N`{mOemF~bH(r#<6*FR2-F|#k{xXIjRJn~%HzRK#hp|_dF}JTD
zl2!-0=OgAfbttZY|CMZo-l4O<ts18&){)nEB=D$dfkzMETvI>=Rn85&#5n<vH|{t(
z$p`v2)h0b1H{S5Fk>4tfX7yx^0OJ7t4Ks0*eAH<m2vCQL=!tw{yz?13|8Zqmh}qMm
z?fs=?z&K<h*1clx0T~DH7pJ@Rg*j_y2`|&f$<%nu+8{ldG}-HCoVA<-gLY>vlL6|@
zLy*8Vok_}CQqxbrxjqVbdC!aynhm*&2?Gqdnokst6B=6pJNX);`Ab`Is$sz36Ow_G
zSrJYq&O~yt5C@GGV3uwlc{xch=Yv3;%N;fzqH_qdU=i}VRMkddBTY0sBv_`n)zftZ
ztqx_ALU`fI%s)^n%)T-9qK2_JRO|^!hoW!(i*-jb7oQRYH?8Q2?+g}SVR4g@wY&4K
zdRoeR*RX+W{rH+k9Oa_DR~%26215^Z+PIA{z2qel73Acb(Qk_yl-r9SH5>B@GjtnM
zce|-u4kvkyPF30hh`wkNcC{ey)w6-JY3bP*tSgI9muHcspK$ZxRWrQ6b+ml9A0Dln
ziEQiNm7MumFwg%2-%vebi8r@=>aXL1&gVbyl7tly&7DuN@E;a=K`nrGHo6OI7bIl7
zWXh)&H5r_+tvu|Pphr?Es{G~Mc}3t<{<`hunic((UpI6hC;0DWapfz?``yQ0g}17;
zI|5E`YYEDsSz;+jQ(7ItXTT-kJmsf2wwR^p0BASAw2Erc3fbHG?OJHr<|=i2)j@r4
zX~ct0qG5>KtbZu!3ZXthAaGN;s2n{E@00)G|5i*l2o!yQ1MP;5=WYJ~5pp05@ee}k
zMUsX*8-NakIWRm`FAaDCW@^eLiJ``Ut$->#@j`AGhS!n!nEFl<mqFYiZLeHp%qXNq
z;iUFsf`@))M7=?hqHdlKqCH6b={jJ1%b+sLfH=9gBsab&kEsRtK(K08<ssvE^OWcN
z=R#4%yFg}&YI1j^HjGUTGX5}Jsl*Ct#_y&moh-T28cA5;JMRYQl@~eeka3S+rq~==
zxY4T}7v)5gnGt6hz((T?bwRq;tco6KB7DyCUzyHHlK`|ZE8M%ASWiOdH`8?twX+1R
zoiZpkBKR^ouII?=2qkN$w`@=Eu8+nDY;QWxdD)=TaJ{B9X<v~Zwld+LE@zW!Mwa;k
zEY5p<=Xfg6IPGUZPF}f7noSyMpK=ApD*v=h`>P=TM`=EUCmqEBEF2#Q_=zy7pMVJ_
zc4VrljrtgpPimKn%$%g934oiZ6i6WD;O5s0;BdY4j?B4`7u1@u7j>frRPe~B10tZX
z>aXf74i?eiya9F4HK+{{+XlRB%*1}?yRPMFC;#d>j~#gbz2Hr!eXVGDj=&MrXa-6K
zg7~aX=7(dH<Fc2^Dsv}|S^A_wp)f=9S`-I_Z${qcIZt}Si-lCQ0P?w~&2SA_K<;hz
zwx-voe#xaEePh))?8+9U6sKw>Uvsi^_7WflGTCPXr!K>p64(C`FQMwU>hICr0mz9Q
z8?Z)<)EuEGM=PmuA?@I-?OX;p&l4Q#Eh9iZ6pI&cvrRDXdG$@tUy67+w~3VXm7>Gp
ztH|Kex_VFuaRtBS&iA0}v;PEWo&JFOQIt!ak+)hj<6lc77w{TR3$PH1`ZN3cn+m~2
zx<#chWG$%xtRfHpc-YDvqL>sS(7Xapj(1)I+I;gIpdC8oNvuv%gc5|W&$}=2FNDD9
z)h4grUA$ESaN@)`Lu77lLtC4)@)xk#iJ;~|n)X8JBBfdS3qT;avCb{53prl=7uAOZ
zIvE61Ej;tA|Iefp)JMDzeXW;6b%+P;Y|NbHQwvi}pxxOX-IJhz9IgF^*(yd;Cw8WF
zgGCd}JRT-qbI(YW_C_rV2-mE1>N(OA_Y7X#2HAxqIC=RdnCa=1I8hPJs&|%}qc|?c
z@@hjCba2HK-KNv)kE|VX_}Y_5dt5PN)coISC@;|dF^!=*J%!<4)TI4TCFxM3nzNt8
zC_oqVK1YE1UhB!fTzN{q?fWqRNkgl`lN5jWwP|pjtAG$CLO9S<?rK>A_!-F*`RQc5
z@!8PZ)KxzyE&_?5a*?R3uqy`nImKai`{9L1=O@H}Y-ny^#c`AQ$8h~MWlC>w6{6em
zdYLVspYoV`+?)g6IFI|CN?Kb$xwKRH3eY2Z0z*(`kg5IiHItM8E++iJ?thph0-YG2
zNV-Uj4DGttGJ9R6k;<*US5_0iC1*}5CDmu3aR~4a5mK_zl-T}z&jqO-wY<4szm&s2
z|MmCh>1@<4pCjCK#S7`wLf?TBrS+wXFJWvCz#NT{P#$9Y=bZtcP>OJ$DeeFCwnQ*j
z09|XVo*FyC(>TH0Cs7I{@dO|IS&Db+UoVD>Bu<I134PCufBx&=pNA+`y#w#)Hl+`J
z5>2nMHT^P>)MXd7ocNEw&!%YTFmX};m#N@9{~~}clL0_m+gqmq#DRB!zR9&c_ACBh
zIdP~21t6a|`Q-io<patlyx8g*)I|Zu#FK}E_jX$WazdUme=Ya#R~iaBv;~NAy{7%w
z^Zxy0ByH+_u^=!mF!F2?%zqW;2c)jm4?{iw?Hwt=15^^C31J5*zJLE1%D)Dcz%Amx
z=2S9(%1Z>gmCDJGw||;<u^suW_m6iCu5Hc>zO4DuwVZz&wEg`@HplM*+pN}?$u0vW
zQ|&=(SFZ+aEDGfn0|C+Be*~s}I2eAhQ_sN?^zV@U^VigKi$GVA{Mx!9>3aC#L?H6d
zFwl9l{MSwIKgfU%NDK$fMHx>0?<f2B(_XJ3J@}Ig0A56TGMH{&UE~QfFo2<FsR80N
zMO*=z!f?}5Ky%w(%Xs#hNP~s(f7?*~`)k2hacP75NMcf;4EccBJNlUoD)x0doTBbi
z1EY6#K&jqAa1Tw--)^chd-|=VN=N?||AL*%m&Q>>Dt!LZe^@X4^ELhPZYbo+b!zYi
zNvhQDR;i+4(#{_`%OYkX0*Ch2m6Z;K53GrLe);nSD;m+J02{<R-FUr@%9)zkji2AD
zzrK;u`0v3_Hvu7*bfGl}JW*)KflzjrM;X*EUmAW%7Ia3UivM>0Rg|N4sc3Lfv*I-E
zFTpyO{4b~DcYENBIHp4xAx)-Nb?oITal7RLrKtjA&--muXm+@mxYHvo_!uazxBJ6r
zZx9O1lXEg4@^G9}*P|w@zt`nI-htu;5GC-XVmBDDZbeXq$slS(&l=TG`z)yh4>B<^
zi>?xi<^K#UQa|<9E^u_dUZeAj)jTzDSnA&*4odXfM}hUer=r9NBycYjFVgrvQ#$Q;
z@Qe%b<odH|(<+?i4T1MX-b)KPx6761#_qNC8#;Id8~ViYZr*>~EPc3MVcsLcBjv9b
z?MVY&psQI=y=tf9$#`$yHTTcY{unuPiW}<DOQJas6w8l(!1y*XSbeD6u6tY|a?`Bh
zW!zX|#nQHfn<y|icc~plr`GzX#hpH^iTY^EsJZQGL;HC+r)I6-jY8$--!YdYC<4Su
zs{`Bxl}Y~eegAEJ?G*zaa-p&(g`b+8pA@+m&6%}mHGE4a=N|i4%^$Nv2x7!7-Cf4Z
z3bfsyf!*_^T!`+_+ei1m;5Q8RcHgo7c_kp*UaUQ!SdqhC$ky=Szh>z?aM#N*FFM;0
z1$J#Q!FS&dpAd4sc-T8~L;}~jeaJr&#cJ9v&-!&BlVuR)v8XoaWV6vA--cf^SV6u1
zE3#m6R5h0HY;~nQa<#<S<A09jpQ`|^S3G!@wkm82J+(9cEd{2sBSm|cky7Uw-jzKm
zRwy4!)yaKju!ixiP1Jgo<23($zS>G~{Y+=jskUFi9S!>N0<sJ8j}~G`x{g0CuKoWl
zen}i)G7Nl^c7IM=S`%{SD?i&IbtSHM2ET+eW>vWe0h6V2eXkJMbOcY>wFUQVjK|LI
zIQ{8tXNA@WY5x_u|9)L02k_O>^N}9%w5720{<ynvOvF2lqnEE$yCf>|yf=)3?Aa7-
zWUkSxC<RG;X!!Qlwwzt&|8E0tj{`4;^=pYE@N#)q^f|U%No}CAn#J!0Nw@3kB(0c5
zdj6Lf&Zd%WJ==3p`Ry0rn+`kTc8~reJpbcYGDY1o?D+mEH;j%dOg66K*x5tWamVd#
z7S*`fX*R@$zQhEMbc~uDRPZ?iY_sQxHBDdKDW22+e%ycm3Z=LeGDVY_ro=*v!)2E;
z1axzT#eoI--f{aIix&$8bmb61v^#DI|JTm@`}@7Zzys@4M5Y|1mZG6u{(i{X9as?v
z!NgLmeP3Kyi6u70l&O5lS<JTi{~Mo&vQm6-xaF%=(@#^k(k@TQsz$M~q(%?SR}}ap
zxkt}|gl9l2eFUYjCiSUoqjS{d$%A*g{=<xCh5su$|M}pG6sKM<o_NJ{=R-*>6Yi$Y
z{isvU;U7ZJg(%w6%z3_`pR%E!nYy<_ijiyV(p8Y#YA<!Jj0>aXH(X;Kd-vz8`*U&p
z=SCM=2*#z0n1E&eafr(ItF-e{6?XXo$smSf7JRuJ^=j#chq4TXR3`BF_(tv|>ec^P
zpBFhpv)Z#(O&%-kj5lxHl$mZeU1(o1d8)AS^`tMQT>@p);3i-riz{MixF2#6CPuOz
z?5%KidgeyA;x-G;3UM@S$G2W{_Sl=6$lvT-D!YA}a|OP+@TKpMn@@ibvTGh2->kA?
z{M#@$bdAMr5MOQAk<X<=9^YvXncLg0<B}I7Y{h-w`2C-z&QeOZp2AYG49sqW&QWaQ
z0?YmVZqYrjgF6k&W*4nolAvH;1xBD^=T@xSg@qTDWr_$RG-pr#am&!-u!@G=RD(Tj
z!HILvC^8xFDBxtFRF`^AxFgR~b>98F3H9}Rp#$HL9?$=HOaypLMA5MZvNy0)UapAq
z8G1Nf+UOwcsPJw)e6X;Ze9h`mm1?^ER6~JJeD>05a$wa&{MQ@5Grg&6ltWao(f9Qa
z(be54`{uri5p)%n4CGt{_Zx#dZWo#-hv$7BnqS_PoUb34hj_gCece)3a9zitenIM&
z>gf&lJ)E-HtD}Sfw725cMM;GA)<YA}@9dmg<9Z)Q+WUQ^csuOBCgcSs$oiD=rx;Gx
zrj&F?@@BE+?+*sFxu9Pid`KVPUUVBh1Ah$(NO~lHbV+SU+F=ki9{9a1h0**Bdon%b
zXHa3BQDorng$jlRvAX2cFG?*=t->2OOSdA@*g{WP<Fk%J3zlNv+IC&N^3wI)GES5A
zO8g&rBf#0F<iPeCj-HQRGi$A)p8I2v_h>7wV~gKDfiAfuEb6c<#KEQ-=;<**BQxnw
z`~!z)6a!Qd7@0`x-yxk|0j~YkC;y>hM#%OflX!@XOSP~mgPbnk14Ty3^I-Hyr$S}h
zYy8&iP5pje*W?h7|0ACb6+Hx9_<1pPg+qnx|EJ|NELyQ7Ty|lyRhmbj`njihiHo?d
zI)2XW$*xmVoAM$so5}n9hZ>;N0=)g%A}Te~b!tE49SpTx#^)0H0T*vl>-$^~GqyBM
zv^z{|xpU(ae@w?I2vx1z$uVqU#as&I&O#KKX_kUEOR&nG3U-CJxAW0$1*W~JpfH*g
zG;=Q~NvDnBcZl$v2b&|)8A0yFU~0dMfy$vNvV-ZGjJ}k8kAx0YEnhu1*R|!?bo}=U
zLW-h>Y)%D61zzrqI-GKqa-{>*QB1Ea`-G}%Sg&JM%?j4~?$_iDWkxxcj2mw24T@~<
zMG>f>s8yAZX=@cR*EA{N5ZVTo+OEXjxDo-q^y4f(Y_FqE=swlnAM@`_ZU+@sgYJIS
zdG*suBl^+1jNc*o_Yx*{J$+6@7hk}d#wa#c@3VNR>DYnezox&Oi7?o2+Jesq1J`81
z#*nW`ZE{e`oy7;=X^@@Q_?q$+;}9`{MP=JGk0R!MZR8&-!v2YaOCx<w!W8~}>{nh`
z$nM$Zf?t{62$Nb)hk^-iwo-@p?_7tn!sQypFB44P#S7~C^PE9dXQ$fH(>U~3IClr0
zaN7T-^l@rGC9L}qt-hS`gr^7pxkiGCGUXPky7LRW;$K5AhbNyv_(|{FnF;CY`ep^9
zmR^S0zdh!x)U)kcD}oa(WnQJ90okS3H^gL$HO<mJ^p53+r5);1M$-{m`-=!^lNw#)
zriGrAvX(#AnEA2zKVR5~qu)1WC1ZHv{FdxrJ0WP;*cX4c8^pwd#wbRsD^iz!&rku-
zHzcPx;r>C#c*S^48TZf>q1l1ubWPooLcVRv)EZDr$NE^nR~^!3Tb};M9$0GeZnsz)
zNnXoW(|SSiGA3q*oC)jtDLLbAExh5B$NmEneR}^{$RNW&2GaBeYaJ>+<T>M9qY{UW
zH$(bFf=OLU3@KQMT|dYy&*U)M+*bD=*~)V+*q!tU3DXN^A`<<q)N5DsfCJ##p`fLp
z=y5E5!m7LM5kNoopZ*U_G?#(tm4oM*A-Yt3zV6r@xF7;8{Y7;QN{PWpTB!55N?*=7
za0LF30kS73V%yzII9}|&Nl?@^yb?ca7>W*4aM40<_V4{L)M`7kzubkM2V24rC;R1e
zSJZh78Z45M7qHp&F@c%6tD7`M-&}cgD6K*4a}}Q&S{w>Qq8t09e&61oBc#ZDs5mG(
zc^o$6V+w~hp8F(9S=i`D`rC<fvNyM|#=lvXp$ov`x{JNHa<)yh`La*Ybc4_=?K1&6
zb6*+vdhlJqiEOEK>D-h-Wpe}7e;ibyP(C(R+WEG_=#8a2b&u*$v^GqiJ59KquexA3
z3ENo?Kli(2gz8Y>fN7(O<~Ch!_8P0lvzs#!p0ul8{q*(Il)+}_dvb9uv-i4u#oW8(
z5QBfphk?S_m}}Oq!r{;%bmzSdDe5Q|lAU4rVfq=Oo-dZSeG=)T3}1Jc^S<r4|KRf%
zK%~_ao>V)-^Q8N}Aaq6pLgjFBr?{bD-`SfKO>Lq8#SwI6EX;og&4JKYAwitH@0y4A
z#}{O&4wm0rJl+O;N>+Jj%fzP(d|<M?)jn@k#s_~0XhV#M+cfWAi43ej3~C(fkK01k
zw<qqKW^*-PzPEN#eoN5q%>kkPtHDgDOdH$Y0_im$(0r_&@VNDRaBNPHBs1n`-&*uG
z#BioL$g%!@;&c>s6sjhUl?y-M=P66;RTUEGG@0nzxK(NRIl8?{dN=y(&i<ML5HiNn
zRK@sPF(X1Rg>#rEPrfay-Z?8m2>`%86yl6r98=SQv;Ib3(y1xuR*q<Dd9=CRA8RcG
zwa?elAXU@vFE#_pAF$1Gk>sh%2r<!pL7(DQr#9aR{$UwTTM9aarukGM&4cnaLfQ8c
z-ep-rHnWSGUC--qg7}-d9a^g25U|+sQSwO6{?USDNR4xg(Ye)@BIWGOaaL&mp4RDX
zHudi$LDGJ_qysS_ZNN3}S&&sSakZ%J&Bbei|5%4yOFVUsuJxhhG;kaPuEc(oNw@vo
zM3Y5bFtBa*PVSD=+20eSih`SNrC!sW4X<Jdi~GIjeCCF^;v0TNOW%5TK!7~%X4mlv
zvIEp)l7Jsvk|^s}+z;xTEf0?Kv;b%F>`x2#3T#=q=3)~gY4-oW^i8lkMwpXyq*F_8
z#mMzm++vK4P4$v=yR<)Wwscv6T=h0qCQD8yf8WFoHG0_aT?6MEi^*~u`u$bhgJ#CL
z7A!m-MU1BHm)$;{wtHcqD@U~Z$YN+*N3p7F0o9UC$xm$b%<9`_g3fC2H4SEPTA_>#
zf)Z0i!TO&sgzB)RtY!hp;b-W6Unzjvf%Acg`-}G2-{vz+e9DTomm`pHgUXy^&Sew>
zS}xcPt`%gQx6D0svF2+t!9!(39$L%5l!QEoq)MZ>m=#<*yLOsMecMTpE-Ia|9J)U^
z)f7+Q1VQqJhQyVl=|Odmj;<{>TP6%~rSR^5Zg8m6A|~;MQ<Lqji@)Rd5X}&)`60JT
zjs0jOaHWrOi@z>Z@KeiF=-s98hk?BiN`7mKXBp^hk`u4mu@McEfViQR;!;ilA*X6j
z(|$kx;;LP=3BaXWj&KIRN(KN#Gg#<sCMl~rL$@_W&AisQJar3Lt%3FQ4xpGhlwI3C
z3W4t4Il_6zL;3eg1QMTt2=0JBy=WQJ-w6`eqY#UnRb<N_c@~0N1JTzNovVk#_LtZt
zYsB&<Qp&&-+tE9~lKaj~ckaV#OPi-EEr8-+1^l#%O5cB<q+OwX1Aea4l%tdY!jCW(
z=0jtSGkZVgV^jQIQq(juaP|~<H6+S@Q?<2J{_FdrmD?`emqQ8^{G<0b=>wrNN3&=G
zX$KAW*gWq2ertbZ8H$;QW;QckM&m{zUtO-<@qkYFfih()@TatbzTp!E#&cInI#kcF
zn8bNk!M?oRUbw1!|Jbzm1mf7n;wie(bcZz8{>1>Z3!?S%<{R%OpM$^+jReQJ+$qfT
z{r64V&0Ne6hTV+ZqJKvei~2TK7JIY45jpAGzZX`*d6+8^n}NduZEqj&C%LL>m7=gS
z#yCMSss=?eLuLE6Kg>)eY)qZ9mTAEL3@lJ;bl}`203FY@Js_*P)RhzXR)uu2mfvyz
z`9O6&MAx&XKlzHD)CT}s+Lm$k{WZOq_{_Pc!<pW;!le~t{=F|Ior7g9Rmq72Vr6c<
z3mKX}KiQIK4Jx!I``rU52q@q;;7dK(DNP|jnM!j|{>*|9)vzFiuu&9S1qvRm6xMoa
z?D&;V<C(1X3a5(?E7mLm4)^pq*7fWMYB7kSMwgGz=$SqnP~e|~@A~(}+Mu->-T9iA
zv&syJmH)JGDos%AK*{-tzOg5A5Qn!$*5BG;fo*sFEeh7$e=D56h0QVFD&EXPqvXR0
ztrtKY^*I0(A?{4HBu)UE&|JOkb5VmX)vXkv1!nzY<nV=8K$3I#pdI?n<5O2M_)p2I
zHd*UzizdrYf;|?iV7iZZe7*>}Hl&@@>g9eSKBW9Ahr!11cX+2iq;{~4^3!(x`tg?0
zzr%_h>}=Jmz8w!nw|}kVC?o=dlZ)#ef~2cnQ?Px%;3&zFJ|SQP@o?~kxMO@Js7SP)
zdn9!GFe68be{eo2l^XJZ;wVuUt1w^M)xsFO@xIe)VE2kBz{M%8F<yY?4E0mB<S{E?
z+$0{jwZcDqshxp#>wCxT-;>l3K%YSEmN0MR+P5;n=OFeqq3)9PxYzHM8Gla^Db5)f
z-nAxD^nuEg>0dX0fcmN8(_6YVWOTq5g$17i)Dnq>4}!`m_T{Yr+&_t!u>!5e{%pVy
zUQKcG79Fi#J!>bUplX4hzHrs4Pvk1lYp{g@gZ{=jFmG00Juka-J+0He|M$jt5J25x
z2cBwwIBK&=^Yj*@@BX3n0QLN3Me($?+WRR~l;!S^B#+iBA|xm{8I;>zUr)`q8=krq
zGyG%UDOPT^yYIcR|2(lZ^azLj8eklDmAj7SM>KoTiroMXiX0-Ko=fZE`2V>2%784l
zrfWq|Kt&`KDG6y1X^@mIX`}^}4!H@XLr_9WQd&y7yG2mCI~8f9BqhJOIOlo4_j&&v
zIdI3``<gwoX3bhKdcOb@rKYX!AGrJTL&BuUooKQLw-Hq@{<=Fhwd~oA>hobcTG5F&
z91Kixn0MwKeQ*A4(upi!{f=Z0r3G;F?$tR)3Zt3?BOAm?DgXI+*NJ=Ikz`L2lN`Q$
zc6_jPeTMGN|FM&kChlD<Ba!eHjLl@V=#1wZpl{xW$hJG~JAU{Z7}GD-V%Mlb8Ixr>
zTz(fxy{v*+NWpA+^*z(QXLHbs<--0^i-=C<l?^|bIUKI&gxC<i%9-T6l0|<3AA8^Q
z(Hib%5nVjNPEnFw3x?Z)w@d!STi^3-egJn#n`blCUyS}H!P=}goA!S@UZvq8x-N?K
zUj`L5!=C+H%l>n*DdJ{+HhTQ=3;D<%tPLk9IICaZhc1)NV62~)2onY}`k(2oW|}G1
z1vL5oh_4pI&z{5NkMWel2p5QAuJIMXtX#)=)IywdGVIiKO5m*Xa!;0kTvi;6Crlz<
zDj&-=BYkJRh4xBV$xP^%zM2@SS9z0OLE`vr1eyHBud%|6xv#c@dU^k!b5JTzxh#0m
za3)HR$njhqQ0XqXKrCJI_}483Ne47mH!5<ub<Gw(Gg2Ui0$oOQ3HBfOAes(n%r9a_
zFunwHmspV0eOMU+olh>7wEo=ht%CcL+YP@Hu2d38jF-!?Q=={T7~T|R_x29>QIRZ-
zM@VgS!A_E(<Z6wi_|2OoRK^l&@~tT;!q`Slq>aA=F5(CHUF<z`c5V<dc>H)*en0+W
z;`(w;eutc(s^0Ff-jJQ5rtofUhRgoC)2}H|GYt}#REXndY1NUZSqFB7P3|uLt946o
zjJb2CLY6sL#aHOJueuSpDLxvmrwEupZnjOwYuY#a_M<YNjS&_*Y4tHA+hU^Qg|cMF
zjXdG&aKLVFutgKX6oH!eo=AH>3O`*C`3G($2HJFVURq(U@9h`Ya6K=ee~_ZgE|j9q
zT<BP=Y`)qS$3P{^?*+RwIVM#WIpc9Q#auCoD!O(z-{tvivwwzWf&MUi+Y1ZiYHlIa
zEOoDc`BUt>|0=l0l)1Y@n_IwI8YG16nL8P$WP0~=kM`@<>?U0q1(`~3T*MSf-ov8Y
z%sSx<w({|FF?ysRg-h=*63wbd%A{U_IABN;l@K7GGxDAiy(S+U7n4HL-rWoEhm>aM
zmOb4=uAgE#g_<wH4}k1P9m;JOBA^}j8mXKY0+{kMDMHcp>P-8nzF&L+BR9b{ZQdAH
z&%-~$wW8Zqo+!6cK-^Wxw?Mu4X@+8gIT4%w@8+F~hLgM%cKGxL=US@#jl8ecbvbON
zA1?k#8C^~q&eMs5VEviO76xm47m)cn{RwysBAq}mma+eHUSH*USV8?^doHjOync-%
z-BRFuSVU=?XVvmUu(^-OZBF-qG~)Bb*Rxniuy6jx)E(^IKe*ahBdavdy1&i)ES>Fq
zi=>4wq6;l2eHDLy!S*p#h8HCzuCz#1xF!e(#rN#KJOvFA(d)=LutfLfe#l!#FhS6J
z6P*%PN$sOZZwZHWg?i$x>US5=DU~jvWAn+FzCI+IW0S#)8E?LdI;@0w$(#oQD5@}y
zjt3pgs!^wrAI|;Jele?o1C>ogZ0?JWs29sT=G34`l!9dAE3*-l;H`06IgY{N1aie#
zu#D(HY+#?($TB&!GklK8*Q<#tJ42}Dx-g&WWVMob2iTHYM~s6{;`dvDM`jB4NJ)(^
z!E8d?4qEo}uI_yj#2-Q9pKT)+9<_l<80kU6SwN6F7e97*x!jS_bd&oNF5>_pWc$;S
zE}$jt^cYe}HeRjyqwwZb=?nzxyC6B6jh{P!P}Ft&cRNUJo!>t>{2~FSIPE4pLx?*7
z|3)#d)$`PEq-senX}UCKSI~y;DlKs`u<_(@(CuKKpQ9~Ydi(p5pZgh*$AR*KmkK6&
zom$jusk2ep4?cp{uH0MR@nYYC?r361wT{$68iV*ez0#GOhHb81MHKiD(cxIc$3j{J
zl872{MW+IGQkc*PGa>=e-LN-dnons)Vv#T>z1g3qQ&D_X3uZ>F9f5=%&zr-tiO@aC
zMxu?H=xuuOWZM2_K^zK;xuPxv*L<#`O6)sstMQ2Rko!T+ZXt;1U2q{|CjIphP_MIq
z_B@Z&l068A&#On?UxBGo9FE%9EY&3eCvd+Bjte=f;q#wq<Knn`4IMc%*YQMtsJ;m&
z*$)kvza1oy$O%&E?k_sBJz$KU@OyvBs0?~lzI@F6i=Q6H-s8Vm)XrUIGgI#rw#2ET
z)IEppv-Xe@6+&Zp&+j7IJgw3a1|3z>pW+Oze-18=m)P(Le;v4PogYVW+NTrILmLEf
zIw9kVf8?IWy%tq`fHz6Nh{_xO!i^;wakv5|MXyf(&IYn|g3*PAcA5FSA1-Uv99{`%
zC@*l)evb`T<ZqMyfnScaii7Wj^qUhhK?i}e&Uu$nqwCP=doWoS2$q>~k3PLbN81mN
zMgp}xA1JToWu?k4{|3(}q?l1i4E+{$Pa&&iPMXHJdL4u@?Bc)9_aYW00%qTl8VphK
z9kQ5Fw%Pz8hVHwgnG#L8h*y!{r#x5KoCTOoegFw~l(H;i%S?*I)d85YskKn#{fSQf
zoy102yAUx48s=12es*1_fn?ur`q9qvz;}|AY1P&#Q&21FK3QaYHEH{f55}jz;uI{b
zV7Lj(&7~UIZQp&N*~N0{GCC3*!7TPb4c)?#WFAAWv`S-oR!y!&@jqOESJO`4i@4kn
zH!)CnC0X9_nuwcptp}l;+6cM6luwRPn5LJcZV;ZPD1<^GdzYQQQi@T4uw7LMl#@;D
z<nFM{7k|vR94yt(K|i>YqG1wcD+(8arwFcBdXzTf2DMLKF7f$3())_^dda~=ThKZ#
z_f`{-B+Ta+2hwE1s|DXDxgWNT82~TSqzk%3R*2)65~z_A`bz?_m=uwiR~ngMl$oQd
zh)JgU61MkrMYZ(CL-*s?h+?MJepts(w9FG;EJGxadVAz#PQ%5qR$1m_J2(b1#a#_&
zCmSE$vW?jCjk!!k$eNiVY@be!p4=Mywhy$y1M+t@KPC#g#3JTJxnKp<?EumB<>`!0
zH`vM8!9^a0r_we8k4dGM`AM4slWZ8}8l-rBC>e!embnFLp8IamE*KxverLD6XiP?8
zW<&ynmSXBIjW?onxZ$oOG~KuRFMZBs+<i$S+5_LYeYj3By3}GM-*T-_$qK*s(hz*k
zXr9)MhO=pBX9iZ1N$Rc8s6r_^^V%8F1c6uFh#Go{jHH)H87AeQEFz~v-}9(q4sGi9
z5|gSyTFH-*RbHmUjhL38E=ix0?GVaSsQr<^f~BX{gehlP!W^BLlV9B~jeA~M!5OZf
zE3aKkf&<VkcL?Ii`~?%!buI!|9$LLPTB~>!?+Y3jFSfy~gI_Fzo>j~`Zhmy423SdX
ziX!^9?<@CLN7!CI6nzyH)8S+aUKt_w+hAeV9}n5$u*F<r32?-Q(|;Upe!X$^n*(U^
z#sa}n;EFMAYb?wT^gWp)Pj@T5e4T_>jPR^J8{JrRT}0vzTalRVz8_1i2Z%>XuF}X?
z!L5Rjpa?U}jaLXz+rWThWT<&if_IA^38{Z@A?lC^f3&1sN{?6jrd5WgTWT8SbozHO
zLAMpWYGQ_Bi**AfQa<giO+@GLjc3`SA)`0XG=H<%G@O*1O;tNsrY@5EUoRWz1C{rm
zPxvSxSr`tc{qym`3)x!HzB}ly=-B?Q?Y4L~fBYN9X>Rsz2wKh73?It`1U*5vT7+^8
z?n6=fKb+Njul<Sf*#=D<X@)By-%3!^gyjD@{Rny0l{UjiO(~#<J!N|poAKPZ2FgKH
zzcNFe+V(=%HmkA=KW26vzsFzX210m|5zb7xp)!hHEIQ@Pw|gM@c|P?FnEQ1J1e$oP
zT?qms_G{xq)fKlgenA-LHM*VNg{@jjoLm(Pd<^|vr1T_3wcd5gDr%Nn?zL}J_+2NF
zm3AJEZ?+B_E6J~4oFEr;o+F!7H!nb3L^@x#YtZGpG!qr>-HK#Yc=z!tVHNdtutl<b
zmY@FCyZ-CcE3?xGJ+qr8!~LY}M95(?yX)CV$HEcW$opo}rPvu;nXPZt<<B#qhVfWU
zlsk~0&Sm$NnyD^+*#v5pE$R@_Yj(Kt%J~*n_#&T|+`CYtdxena4HOzavM3mj-xPkR
z29$*`*|4&ToQJdg2^yxq#NyP812ZC`J$)?suV);u;VCNz;A7(bE&Qk)PsVk={7AGp
ze-_5+aOcLUzhthJaIW`6$<_AOxx6|rfPtRbX$ksb!3PpOi`GTyojV1|s|5|yq;{R`
ze8N#j9l&3-0J<L9s`qN|r#IAS+SXO`@jXM|fCU(>si}Bhwn9BqS}AqWuuM63Q0m7v
z4ASyb5vDV36JZ${cgHXPhQ1~qKSp}{2kLGpS{E@<bGZY#S{(1f4LW~UFA>}c5gPFl
zPmkIsUaK>(>*B(QJAjvn=QPp}i`Z)*NNrYCZU-&yAG{c2KH6qP2Lv@z-nW*YmJB;7
zWl4?~({F!_s1}`BERWFp9Z36P*zXb&V4)?mDKqSvc8^5Ad0tVy_mWN#*3hyql10%|
z6e;577B*o^P^>39uP}h5)I{@2wTdOFYoa9@%B3Ics*Z9ZN-$HSI_-o!tW~k9T=?iW
z$9I({ci5{<!k>U~!zIMxU5VkMgZCOt;!DroS0M|xEQ4TSVsfpRZFreo))wv9N;&@K
z|Jj7!H|cjCR7R0f$Pym35+buIoa4WGg<Dj@i17|u^+4=zon6pn@0zYDUW+c+Um3$S
z=Hw7@SyqgQ5X)eW`xlG3X8wj@psp(Lf$>d*Wskm}Qi%o$ZGXOU1gvdR%|*m|sW7J{
zkoXSYfT>Ndf$mlS&UMDfQP7Ew(_93p1ysMYQ;VuKt&S{{+aCN)EG%u|(q`Ee9zdAl
zonzx5r)J#5O>Iz47aaJ+Llzq;*Jt)Q!FWv*EdZh0{oP(fW*NTPW`<n~1v5r}*9cU*
zL3Y?%#(WnVQ}5<9rEISBX9I?hU<tT+D0n#Do(WKdJ+@68M9yI-7aFE(G54<*Bn4e{
zbRAh$)efsfjIRnd<!<%nXtGJE6~S;ul$oUIWwbBo&qYZi_MeFPAgJ2xLBDIrIRhAR
zQ%dA2<|Z5lPql!Os}o*|^m>p1d(pL7t|3w>l2IKIfAB(w96u{2tdI;bR1mRfM^uE9
zSap@#=6x#usGN%5UHIP*hc~+Sq*KH?`x-;l@ueF~`k}p#uG}lD$QER&+7V~q4^2I}
z?-BMyl5NV$nx%YiUd65ni*C*}ZPfkX=arJq2OYduJdRH)8In?(SBLdpM5_PU%cy-9
zKr<S4Uy|X2Zbib+ukYgE>Yf^(0&BQj7wPG`6hOa+-h-><hBM#KowbRP@qFDNO&u#P
zi-02F_YWhpOcL)3%JRxKSWx9R@RkY8@6%63FUW4^U8)qFhin8}*We17eP|V2B>}<d
z@r1M0QkLu%ZVZG|mu@`7<)!goqY1`FePE*t)HdrlOH|kxopYZ*3-0+-Q=$FQ;@d?e
z`H6?*CnDTfavYoAFiCQC1f`$_g(3EW;V@Ppbue$!&iYhMa(kWbRxb{EE&5Jj0X|px
zABeLbHS)C8NuOwM+)U{2TX>U4&vu9yj@UXH@`J$zTPNU57IVV{<pksJu_i9>{zgAW
zO9Hf#pKhQ2CnmbTYbk@&zxgTz%JTpfN{Du9l1lmN%2uZDqno0$80mo@Ohw)m%;8W)
zufR<hQF{%CZw@z=^UpL@(Yr|<?&;URq}RsL?-w+jR*yNa=5>MRL}c`#EKVQy_qh8B
z<LE-Ov)^17KAMnVIUmnMzRHdWeRPLJ4o^A$kTc7h6cqOli&*J2C}pL^!DfV6E`}u{
z*2QI`6y~@i<t)~@Sk0=RM~gZPt=MGW`pJFvcC2uu%;hW=;Qa9*!!|L?OmYd8>o+|L
z@%Bj_M-4xz#LFC{9Wlu>))tFoc}aTfj!$klzVh>PHGf(RSPERp4{pMhd@oeX{GRlW
z@OfezsHKX{M)LKhAM{>Y0xGf?e;)G`B^GH8w{V2ZxCX4Ez@nN$y4>Wx-=7d+K<5{r
zp5eqJc37X}`-5v!HBTfJL#+0JYBMZGoT$gS^@ghb0~v;a=SA{a*Sru$c`%v4Fc&#(
zkLwF6S%VzWxI6qU;{Rvil~&P@^KtS$>7;j5uRgSdg_nvJNsVv0w;9H344Gh9P1vg4
z{`!<%ehrwNAsQ%_guJb8RZ0P=ViylEgz#eM^CANF=I0-rbn(p0as8x-!&x5RYn$Wu
zfH~1&n6ANQO{dx>N*4?N8X`;^QxP1$N~aiqi<e;64#wIDpIce{LIzGwBkT?DbvkZt
zLF)SeNesSpT;)rtAzl$|1BVE_d+&<}{a4AiuDF1*^E;G2s(@*ZRGkQyYt(NoA#SyQ
z!NKCu928kSq2~#v$OW&*SiQyWSj$zvnMAT|0kWTrzd?Jy_YxKc+7L}|fk8tfK9O7$
zGsA(Bes|@sYAKp^gfx#58_A-vQbF*SPj5LPg<gFAM5#}*6_Qf13gYYmJVrgZp&Y8F
z-muQt4=5xYFF-DlM=d^R{605@A(A?Ia(xYqKvzYK7*F2g1$nNOYO=)321$C2|8otW
z7r$^l4PzciJQSTx)OHan_9tfCdT51z02SaDnIpoo)~#<IxDse2YYn6Q1Z`O9YWsH^
z8l|oTBDg=@ob9h~e9PgMEjm3720w?RLsx)y-s4{dDa8V$^?4R5E{1sfoYs@PxZy%G
zFhn0<G-$|`e*Y?C`Hpc*K9+foR0w%Dd7=)@`+l!RF{<=eGj2SCUQ?j`G4`H>D|_j6
zhdSU+&36h|2or>&FZ>1<Pi%ryKc@=osdla%`BLhzkovP-n+nQ5ZOwxm(nzKg2KbWy
zd*%t^&_t%!r&e^_<Q~UcwW_3J`l9daNzfyz+oCA6s#=7hL>v_hSS9BJTcS>6`i+Ox
z>IXfs)2A!1^>&)^AM+B`ib~wrFmF(kn@3$ZgCx8Y?7PY65|dFt9FKbz%$M<$bqm!j
zcIPj;<z7#_Xh<RBxH%SooloFn&fvkf|5U*LMwI?XlsU8HU7-rK$pp$!Bhjy-LrlPH
zf1U99t33iW(Y1unC~F1x;BC=MfDH{SBkgC_KUZDBX5q$nK6k*fGSqW4yzbU`6Zva6
zj+=e1eZ5=NAJFDMt#ka&dajy!KOs3q?Wt@M(Kkcw^?uY5&UmWDRCSqh?eqh;LwmZ@
z%odL)TETMTB{vwMg3Gcq?bvvm=B9?FUHdCgKtU67`3}i^8I<IHG;*(kyIPZ}he{J$
zS5Sjr&_X2uLdu_+w|A`+tx)S*{C@{y8#%VaaV{VWeiK=6Oy<=gE4GP3jN^@LKe^h_
zU9Q%HRQ$~(HCr9Oqv@bka;$c~eA^`3Z?~QHn)Fk~3Hp}rPZ;}<&%G#NwjN@Z&B?e<
zxM7B4^Kd+rgu__=%fs2B_rh1*CUnz;P7j`yKyhJWc<_RZI;*-pApF~Z-<Rn4n;SL8
zEmhwW@Bj%!Qd+#=h!Rq8lj=SVu4uihW#-pJv_^rr`S9WJRBI^mB0g>gZo834bLda(
zKw`IfP99NtT)eZpR|sh=6daCHYQKoT9tiXb^@Cb-9z!kC@}t|WX@D35&S^1rFS|A1
z!snt>SHEQ&;&OuctJ+KVXvFf^SeJ&(vH-YPxI9(!>bc|WHfUB4>V0+^cAaR4scJ)h
zhn0wtij{g0%m~JnT^^~WRs0+akvr(Iiw-K~PfymK0__swz)N<`=pLQJ#AJuME)_+d
zo-8MtpKHoi3O5qFb0Kzzm)$N{d~L-xK1+l(XopEyGkH({HTb{A+w+#G%!#mDjdLzO
z_fuqNep1+&Tl9@YvgsYTvh}o94;c&J**|~?PQCb3)cQ8csL$WaJledtBz*`!<UA9&
z#ZJY|X*)&DU#GTI4~XV!C-?!x@mS}XB=i<p1uQ)e`}2+|$%!kFyX?7*fJry0#jV&L
zRYkt^OgZJM@5%V`#mQ0B5%qUUr*{(y;m`T+Xv^M+qTMIydJH3gsQb~eO!bp?k>B=?
z;{xJUWPdT^ah&5t|G9sa-h1wkeV?+qj@n@u)*unC@Fy<)%;Rx-T#O?{6$zs8b7fgO
z>UeAfjIu=zfWXG$EAoEPR!G6*i*g0IqTx51sN+b?%!m|6z^_lL<!BRo>*abc7T$M*
zYW3{CEQc{udFhxtd*GgDM3c<wpayIGff1d<C`bib5qh@L>bPi~3Q|{5{3Wwb2`HR=
zO*5+O%vK@InKAebX<sO|{`wxmBSmWK!_^IC-U#9Zv;cJzJ5xa|s@nn28QLCthA&5I
z6ydS*F&(Gg4b0{Agk~OF4?AP)veYj8r;Q!VKz^^+)Up}HGQAM?810@HE^@4CA^?r>
zQIuEtT8xG9x^Hi03_|icSVs-OEs-t1*yMY)3x=LG%cEC7meN55jZnb9uffBE?ut#>
zZ=T5FCB!xEnL7ZBAFr}jCNtXoaMZSP<hQ!}CA>j8^ycR~3cCBP1|BCb=4KJIL5%Ji
zMOB&7Ur_+gbkRqC6n}V{!j*RY#?<2>=62udPmkU;;?h`t5VVVbfxs`xOW+RAI=Tj)
zm}bE+ij0ig4V0P<@IGR$VjDAt#oo_=2EN1R4^7+|T~Z#3&gpHdAPhfxFLUwkQ+(p@
zCnqgF|Jh}vAV(S!G#AejbN0B4Y1VJ*0757n4f*c14)wth9NMuxUc{=^#6vnM$CSL3
zhf=f^tf5HnO7%`>zM^>Y?t6rByFmnrEJKPc1OFzM)1NsN)-rwSpT7Op7-X&e<Zr+I
zW*c|ucQt*lZ$j!JxYl+34tM9@EyRa3!n#Xia2jsna68k%4UM$b+BFHU<iCT7?<-7&
z)r`oosw2$T-rljk4|h^UnU;Sj(680^#{giovaVjvh_B{fn$Q18UnG%tXGAcCJ%n74
zpRE@E@x#7DJM}@K-1q@239$kI8U<EfX4%M3Z(PjN>vt&P{3mpI-LRm<yBOE_F52=;
z!W5qkozGX?o4mLfowWFc8z3+!MZ2YzLwI*nfOrKKFRD0ZDO<6B54y7$U+$;!_6CJ@
zM99cdDE&|a`SaagY8}n7S5YNPhH^%iY_iH5KkNQp`6~1I`84#E;_uCN-_#3o5f>`~
zU}7*p(W#}3kNW0Y%kVPtB*5-!tH*hl$IC3-HlydY6OKPnww3#*x#-em+M~(OT*TX#
zd7V0-7~nRfuE`>I?NqX~E&a-=(Jc9fL%=TBZx?9`YtpphkqnvfTctSj6xlN>(iNBf
zC0s~LXcoOr!r8j4XNTBnMd03y_7fTAF8}CozUR{17xyEo6Z2*XI_cuD`~HC;v(S%w
z1ja4Y`-F0JieEN9kgla1iABn#fDz|}*O#lhm(bP>mXb9?-xN0}W4(yD+`%HdSPIz8
z!meJD)8H3|^yh3ErHe1@^tbdYAQQSxMxbNpONbW7G}o@ed^zscBGSA(`dt#TtFl(s
zswIQd%gF5&f>YO~C1lP}D*PmuW$l+KMMq6To?EI_`PjY1ewY*`(0$x;h;`&dy5EX;
zh{qV6LL-BVOu(`9ZH}8*op5j6QzqMt2Uj~dSax6jk?D}XNryz}11S0ZLmU)#o|i0$
z|Ja_%D#LO`aP9A18A^X)SU>)_uDD|GQqG^@6^eaP`SMSI96F-#jtp+)=p}gaE~ka(
z;;@LGB%Y(*k7L3yP@Ky{Ihr!7njV|0I{_*?OJ5XIC~U%#jX=zpCClneLiC1zIj5N{
zby_%4ao5nJKy-3EQEA*y`1FSr+^ef``gyv)Ej=Udt-%TAqHx{xvil&tCD%HTE*JeU
zPEBXpoR6hVi@sYW$Yri{K#>C^7JVIJ@GO4AKY|I140uz6NLyB|nVZ|%W}+vb+NZQy
zeEmJ#xBq5Vjh}wv3GLj6p(0dPM|oAe#Ei^gFjG4HOqJqZZfU<P23xdXdHo~%B*xvu
zC#Kf+AuBVoAMXXup*jbE)v{FS(dJmC)K}ftmI`$oVeaCY8^hHNa9heI(AC9%Y1B5g
za~b!a+ksLM%}?=Hnt}n6zM3a__A`z7Je+<MWnG=G{rS@+IppEKld-qfp77uV@H>aS
zQ_J8HWnMXXYE6m;fgqf5v!rf=mGExJYhf7w`nAU{BHBQ-Zm1u79<G3v@RlZ`&x`wx
zg~u;o4n#Y_MUqNsyLYE-!<0#Mxo@&M|IvGeAFAfn+*Pkve^?BrxUp3=!MK0FQ?xl-
zJJWMz%kf_Y_RfdA>tV?!mU#|q3ggp{E8&8XcgZ__lX0l~eg{W<O(_MxQ1H!?`+mk!
zcxwRKc?cY%1aR)owr;sH)h3pt89rO<iYi;tJl?F(HCJd!E+8|z&_#D3eKbdfmvP|B
zSp0eDjbu#t9nm#n=O6uCcwq!C{(8)gCtIwS!4}L?U%ie#`u^v732(Bo>2Yxq=j6~^
zA`&;W`5B&-+$w9K+>fTT=rdxE4opB1xR5=EBSuGUD;{c<JNqGK1;n1SU49hL=edJJ
zve7GL|0D97R|Q9<JBb{-r>-b!U6Xql?t^FFFC8p|_+aV50I_N~kGs_-{YEACWtW|$
zo3cNMIo<uOS5_sd>+!P2$bZi`Es~j#O{ipyA=YozC0<JAefwFY-A(QkITTn-wB%RE
zYq!qo2T2AcBlDQ`Ye$_ZN3a?&t`hOadB&!S#obE}=IR>x%fm5RDZOopCxAR(_IQQq
zFfSx?Y_CFTejZck=`O~FeqUG$C;VO+#iFx7_VN$}0hm5;pTM2XiAb_KiIO{7M{3CS
z9Pn3~;(IQ6e%LcSc8P=up-wxB8xx<!WyXJ%R0&v|sk?NoU!-3<8tbpLU|R$^15|(5
zI|d_~3X1e!j2fknX?czu*TxBRox8Y3(APsuL!__NmJ+YL8XtZ=$FC+^^n#XtP_gKd
zHh(;miSvf8sW9Dp)9i?Yx)HTQaxA>hK@`H%G6tHtOTERU!IrCPIAh~qbwF=4kn8Ir
z5gK<`NTS4KlO0yaWdu8PM0)+*x+5-Wr}(tiZ)%?&v0=r%{&H2%8s8cBYWsk$r8mt^
zXi(Pc)#rZV<mu(}YjO;^JKsO88J51<nC`y%S|=>kkDLS?=w77Sa~}SBxROOYR?t3N
zDkR~yDymvubovJdjXwe=G4{6zoyXZwzcXcc;w^cw1IZ)4{M0B^@c<TC!ZpyU(~S3c
zcl(WEjH$J{<!62S*=NxHf6t%mJQRYI!VFtJ+)bsDUkPljp~)VTO;~_WLoje@3(P@L
z{WfC@RZl#}UBkYHb1Q)xBz39++KGHhb8^ZCSps@E{WlGoh%qo<C5to3<4C`LO}nJ}
z`>|1WvO=cVv%dX4XhCbJj0@-8=&D$=S7vuFCa;I`KbMN)%$0tlR$T7%55#1?I{~M=
z48e!fYzrtg=5N}E4?E%}%Q!>cV<5|@jWyaVNOCo}+$=$P7Aek@+|W+zhI2C({&f4`
z<W?PN9aiB8My`;PI=-DJ&#luSOJSsW-Z$L_OgozhMmZOTmH^VoD?r3jRXCy^=U^xs
z-IFg}eE6cBpafhF=QwlhH}LM7*+{0u3Ar6c*z?K;e;*&{6_wWMOMJg4U>`bA-e6fH
ziAGYiwP7Xans!OONBFEZY#_7(R$ovNJ5!~-)@hs6Ho)p^bRrRFm2oO+aw(7yE1QPD
z{}#<c|Ec2W7O_Q5@1pG_b*u#slh^PFd}lJ5n2g<9bK?cODNelt7_#<gqCGU$a!@F6
z%wr=@g&7TmmY@=i^q89m1eXouYn77B!t=6JWZM|1EbCujU3kv5I&MJZ?AiA<><^X;
z28HvFQo-D8K`awX?h^i?yC%1F+NmcFc0KRETAw^jW(QS>Vxgyqa$&GtU;sKg|6k>7
zO4;N22!g%4d6U2A+|zCyJ2td5vS^pKW)%aYZb6M@sK5M)M~e`glA@NaHo7FAVo#O`
zx$4ma#;q!DB72*|K$XM4Q)QM}qq0FVHzs81XhlEzFium6c0kiLcBIrtwD@&-LQ@z=
z7X-r9__;sC8<)F)sj-pP##pxGqB0Yv9KgMWSQyhs&kYWhp&o4u{5?o0vAi)lUaQX7
zn7nbX%Zs~?{!FTI^?h>&1`~qGUqGLw!rftxkBnfjwauVF)=G=|solnw(Dw7K{r5OO
zL566M4>xm13HpnELsJ??L|D(jy!b}R^ENya8Ckz%0z=@pDW=h8KZ->eMDt3{L!rkB
zpt*G#w`JsfT1p*3(TCJ3pIg2-=0i1K&#Xj_-XquUcF3MIty=%-Hj>4w5JTRbN1dyJ
z&9ZiL13KV*E=l}5_HgwGxZLBP%yMffbSWwK<fM@0x#t-BjBlMj;RMMFW4SCA{N6Ql
z$oA=1jw5c;kJbbv%>**7N%#GpEMIs&kfD(9$UU84?c$1Oah^0ikM(4wO6)v|9Er<X
zh*N?qZ)nd01bS0C0?c*rlr-APM>H-r{>d@hEXt3VrZ)7C0bBd7XPU=O|Mb!9Xw<zf
z;WlR(_fGcjbH`7zzU+AGb+eYIsNTf8SasutDBJkFY3h}HF^Lqi6Xh&!?8rlG*SXg}
z`FCC)^*!>|wP|j5oc)@YbNWJo>64G5@lvTT`I7VA?`6N3TH4&}S@{e+0$8rtRmte=
z8nM*rZ?*%8`<XYf>_X7DHWP)4hLFCB*#5izqA})h2ejYd2i;Y}nR~$2&W15X4b$+g
zd_U)VIM&NQUr#i;aWbAfkj0ASH91=yPFQRjH4&|-ZMH4FpSiA%<!^~7zn<|rG!a-S
zE}yb31f*Z5b(8X|Q+S^`z`_4>p2fZ6dC<hvplV%m{K9Fao{-_Ua9{rEa9OR%-`!25
z9FoW}k)e4ix7xVziKd%?9Z6iJZ(NLX2>}y0J3{xKG!xc92}aO@Yc);s1juc2)1g&?
z^^f6Rb84C5nk;_kodE#Gtkw_%SlX%uO)$79-4MPYm7QPONop$wFGmjwDMIr<WBfI6
z4+s($R4{~`ZS&Nai}QXr8NX8c4;KJXi9XdEzU0O<?!64?SZdw@ic-=`$;3ma55SJ9
znzR-5Ub+2_jdBg1s1FPO8r5P1LOx-XR-!=FZxUnqL_|sc!eRtvF^Sfsd!~Syydrh6
z&qHT*chRm8<hw3|IS2!E@vLbtN3w{|<Xrvpo)4@R*aQ~D|85q`r$Ks4`kExakVaU%
z>4)pN&G>jbOFMqE2swt#@Ef4gNw3yG({w&wD4?X-Ap|{$K^k{q*aL(gR#B1P+h{Wj
zfGgZu#Ir}#^*5f+?vJj6+9FaxgQR?ExJH3m1)~o0ifSAKSS}}S7tMT>6uyc_h7qLj
zIW0m}k8e4Wd=*%nh1F<nr`j`S<;QEH`xT^~G{)Cdz1<6(3Wg+`vFPG|982$ioNTn*
zj~M@85v#LvGN*8qu5evTA!l@fn_k~--_OHMG0WF5EQ%k~wf})?GZxZ?D91d=+jLsi
z*4gebMT#}GHzMr&<Tz>?&oq{Rd<R)Z)%9iE;;sbHi}<+!+7E0duRfXws_A`<wZ426
zUV#!6rzwaeXwbNAu^1kl2HwwoEy|V)M8CE8=CA{!Ag*CQ&MT<gFr_aBbEcn&I}K4H
ztJ-xJUKwcDSS?GO0(A6)_wv}G6DU~av5f~_e+uBA)3T3!5D(c0#p0zKE+=0$0vx^j
zX_tSB4Ck(A-LpBytMv1V=Zol<O}TE<cdNw)sKBLf^H%4RpEDWS2}XzODVZ|_DTv|d
z@YC=(`g_kQJ!g_VO5G8*jRN3lB8JAazm@=HuAEJS#r8pkruUEVQu`lG(yNgGk`5qL
z>EK2Mhe(y2dT)v;8&sIe^xoU+cNV5pXIIaho&@7{S(f%mIt`V5qs56JE6kG)eGNt|
zI>J}u2J~miuxz{ox*j~S1mXT9I@X<@pfeYd<zegkzsB1jYs_m_l|o<}iW^}Rzu{)(
zQ-d*KsDW{w>^Dm^y4~#9w$W`~zJlD}R`{FiMY5e74l9Gz1WcGR7B1_z$<T|%MUJ>-
z6<&dl-Q2Y!ki+G?bOomqFQ7PNi0#i^b<MGx1C&U%*bJnn@X;(8<ENOe-ix%-0K@VW
z<45NB^w$oK1gm>rZ-@&NXF}O1AI8#4n7M;<`A54ps0`M5e~s$$&*4&;BxJro2(8eb
z?WmIXrb+L&9f{C=cx*)K|7qI&$f{&EfN4S3B2b4g1AIq-y7_xBGPU)`mp9vDyTe66
zk329}L!Rj<KP2e@ewYrfDBsvvKatGACQM@R^EJL{2asWL&K(EOk)AN#=koc!oW;y(
zr+2aQ-6v!k!WF`3u}8*Kpx48^`0EW0e{0)g@^pHWJz%0Gz4Gt>!w{J;SvZHg4a%2%
zROE32L97~rd7q(ro-0jv%ut9UqD_mh2GSI&U)?FNo`<$vm31c>kH%5WO;cW#In&6j
z%f4ZvzT{232hh(0t7GPOQjZ2%WFtSju9Fnx(Gw>52;&j{-O$1Mh~OZWydMupy&m7o
z_$Z3Rtn;Qm^6+F}y-u>wWe+jfOifZno}NMMi&eddH%l-_6=_FV!}82_L1fW4KAeKq
z?`)etpNhqEh@O{ov$T+?=uc;DT=#msNdQDe@ZO3Izn6vdeh{i|SE1oos#nhfwX7+e
zZ(7BzB6t<zArW>IR+!?jI?OlXl)x?o)VPOzo;q3#Uuqe?b9N)}3)^KI5;8Ya{A+}A
z)bO+l-%kI!_4(wF<Gn>{|1#`IRPnYpca2*;K%-t+-zWK@PT-Jzq>&lHI2urU;dI{{
zo)8;Xl2nzMzb(&vzfl$ih%*hba)-tGu%SBqWAm30OhJ}xkuX^L`98q*i`Tc229jR2
zecMr0zbO|`uX<W?0ICbVqsbuAtp~gw^XYVG0lXxw^bTm2M)fhM(KxGNH;c6-Np|Q|
z#KdGjP7*Q(87{uA)w9S4k6YMu`zOchb$0CBda9^$5#H0aTlXnH>jgdf_36ituO$Cj
z{D1bL|8Zl@NGfq5<9dVbhRoUV12p8$vc1{ZLdV*_gYaf4VHDti#zF{>B1oY!Iawym
z@Nu65A_@uI=*--0bNqwX28>e6zrpm02Nc1(s>I71*T&0AGA5MGbD{YuzTA|>L`?1>
z%Y3rM_q-K%aT|Ig(@2YW*s*R`P0Ha!A}eFAc9tm4RK%Jd?+Qkqp&pY?FMRRr9|v<$
zX%S+gW^XJR-VZ*_`hC1Lu5$VBrC}^g|Iz?LEN&VVznoEYp~y<9$t`+y^_hCNu5+?g
zGeVM)CfEbU&7xZdpN2D&zgD(Z1gkuq_OS#SsZBsiAv@xrv;bOPty%SuqUAg>ja1iY
zvkR*NpuG$F!N1bJZxdA+x82T(pjsqvNbQ2+(B_<!x?s;H)3^_OQKH}d3QR<5GHV2=
zU;SBBT9{V|<8Y<SzA8T45?{oN<L#M=Y8EUuv>D1)pOV5O;Wq21DVP8@QXh1<DhWEd
z9IoRCad9PFL+ZBwY5#nr`QfHNC=|a}Kgqds=rQT?pbF$@k)ZSe^T(_z9yhBu{BEc@
zP!UuF-L_!#py|WA$2E<MwcFhs#JVp<wdmTR{ps>nmop`z%5ITApYK%dzJdGSbn02<
zSO$-<rzxQ#_cA@|{GwIgR_KDpIilaD7(nTN-#Y*u2StH}m)E*G3ld-*wnQKNM2#-n
zY-My8?Q%g^j?PUn1_kjw4k!USIqjcP0U|;}Pq!>iC%1d^k<;wMGBCmPEm4oQ0tIMN
zLSv}%jqnfDX31^8r@HhIhB<xD+)Z;xC$QDAEb`?F(u(7@ig!?#m90i?MI42)T-S-&
zQ9r%4L$;PtU?&`|GxU3`ByYb;p_T5+M=-Fbe@vhuFRkkLZn&iRm9?@q-|kJd9Wr}h
z6W_uzYl=Y3W$x2PsZ{vCP*SdD_RsZh2d-$+;!fJ`LdF5p9L`+%?YVC+<a9FZ4WLC~
z5o0)FPlhhUC0KV2=fBpU^2g)v^W?K%>gezXdMFVxaYUrZU1ExU`$$5rpz{pw8l+tQ
z9|P>sXV0E}35pda_>D$^^@Qd*g9#r9Xa)6~I(5K6;4_DEtQAeR-AMOuN#)XJPZ2s%
z+J$kXhs1ieUbE4Obdj8Ro&DDyM^b-|2a#af#|O_WjzPt{+jhxzwbe{wUF7tM727nB
zR#(&LiwWcKrHVu^S7p-AD_4z<p_8Lh$DsL?`}41XYfU8gjH}%?@E2l{mdqsdclkz6
zfuiSSE#7iV{ABXpW>&LcN~Jftj=SwJ;Le6%Z})0dY0i7DVI(EEs(#mMzh-<eej2Jv
z2~{Pt>sg7q`N#91o5!JjbV~IuJBWLneq%9^hy{gi=q{@7)-R<893RC0q<wi62;2AL
zEQ@9I&+uv!5VM-Y)q)#A>SnDwZcpu=LG=-Z^rx}QX}v;xgvzP%56O3<093DJ^-Vrh
z)89&l+NNSIeHOI}EVjtt3;j9=f=31<rwxx@;$CwTP`dfQHwahy!o>|}5f1*U-rt;z
z_Qaw>M?w|@jc4FvY`UOOWXA?!(mlz!v12w}6pf;&^-zvG?)4`;G%rf_c7$#>XRMmJ
zgi)oUl>l<1!{*LEr;1<5A(E*4>q4hgS8g;Y(RI2}e(8n5nmE-mlb%FHXBxig2yL*Y
zw5)bqi+xqq^+k&9u*|R@YVhhqaIw!17h8iZEq_Znf{#UW%-}QGj^}ZpY8V+l1-wsd
zjVFe_@uR^4>%4a07rSu$8<5QVApR$#SMPZCRQF3W^hEx|C(<17{b{9^OYoK+X>kY{
zT`=%!CzH&oIF`4{TAB2cG5s`XHN@BwpFU?yeKDYarOis3Dju2itAz$p@X>pfFbu*f
zLAcylHl9u4qcr{yjjXlpolt+80<ycD*xaj4Wol?L@<2DLKn_21hDG+GBXCR>{PENc
zSRW!>l-pe95n)eT4R)%Ar=gP&x7`P<<Sy`j(dAr63IhK&ojKVIixV8v>0Ki^yM^+*
zhM~G`kg@H`&uknf_Bd||U>Tp-gya@n`G!ny{O7LsD!sY6o_B~L#GCfnirb$U_EY2m
zgtPGXPL$`kgDo^D@rjp7mcO6jt|Z>?+bllCH3wkx#UpQT6j7a$K5(gX&3t!IPC&bG
z5Yp@m>_PanP7FeZP3Vih#b>^ycP0>C_u=m9;&Fiw8<6I^uTJcx2y~7{F>4tiqTu({
zAC25*H}d%<{fr6k8`9!*dKQT*`s+i<+XabV*BJk8!d7e-^<PYYR>yEQ-aF}QKgXG(
z;}TeC$8SoJ{!EMca7!*F7usCfzBjO+1T*28WYX7#`RL$zR!Uf?u}Y_>>0cGpI@4hf
zb`iHwpyHgEl;uo0`f6YwB>n(s=%qMjFrJzL;H>gHov6d=ckrGsa|VJNF6C_`A59p%
zf`-^D-C($*?)v{cZmI#+&zR|y4CQO2lGfXYi!&DH3Z%$O#NQw+^=SvZXJ)B()3r{L
zV)2xBlbZz&b%TXtrK+x)C0ezxnW`MJ?vt|-Rg#9z@eZfS;4kVUZp?Y?RSj2KQh{PK
zY|K?YqVLireL*dA>C_$G+%DViolz@l%U#pZAZb3barAta=HG1$RYaS9bk+H-{eYbr
zC+Sn^#dtg}<YJ=1#gsj3-H4{?#9>FaoSqFSN#{TM57&CoxAJS8Wj>#h9KNUD+p!22
zQk0vXUs<z;sKf~?5LGY+zP>Kl{p|MP$1{ilofh>%cDl@ZLHw$d4X1||%3)2wIl~S%
zup&goF>>F%A#{*ToxDZl?++=rS)H0SDHMTK;0EqMS|ramR%SK9tyHUONeVrV1U}$G
z*kv;$OaRF-X1jzg$+aJulxvC2*+)Pzq^GKn&A_t>xba|B)4>&L(YH@rS{_@IVVTg}
z`mmW!BGJU0PQ;{HEu8L-Ol7FszXI~~o%HoT#BL$Wr)tj3RnmV0NgW}1*To~}`zf7<
zUuG(Va5ANq7NUHw634RYRo|-O9Sexhrx^~P2wS()JYE=%{E-*BS)lY@HsSMz3Ss)D
zDrq@P<`wW-<UQ$gO#-S0-;gY!7SSpbOVZH*F4d+?N9TEm=>I=WKkzoOU2BiBAI=m?
zp0@{+h0w9nm&HW1DT;N^D@rFMW3y4jqfoy4Iph#i1W$OA>f(yTwsp-~l&Zym0uNV-
zNe}B$FO7FtoR5m`(C1bev3wzc--VA9Im&nLXDc!1$$xr6gB{9CnMr^0USGcf(p?N+
zG(0+=n@~xJ=aVa|c*-<V5=X-f3oDzvKXb6qp((=UZ)B$*iYE0g2hZ_Z{`BH-rtoj1
zW%PpKx@O1o09SD!%DgzlJSQrHLMg)`zsQwI==?R0CDu7mpBU7<C!_6yh8H0VkBgpW
zVtjKrsCk%G(#@YUch5eFDJZhvX-u|Pumrr11Q_zwBFMX=W($O}rr};St6RjrUJBXg
z0y3-#R+Jk0Tvd~%&(vm~m8Tr(S}13&0*S9j=%`Z$<KCwy5h=TOr6_q4k?~i*xCZgR
z<ba@2Aop?9Kr^I-$dL3T{ih){y@r<@tKD8)Tvh*_q}{&zgYaN!Y{)@M>PL7OM&o_t
zF;VmLH29G^!?wN=!`x7jgKHg&5c!qVAydl^5Kc%qI#cSUf4RlJ0R4qrxuE%OOSsp`
zltpL*4pn7YY+_3bPoRZD9vm2`s}lwDueBWuJxLptkp}}OtzQ-*R28u%6ch!kG?r*9
z^WeVt=Jaztqx3IW6BzDLO&B1QgvI$97Gs?nK=U{Rop<P0Fw-pN>+f#8(S7aRKHCyV
zwvaFPt|4&}Mbku2Cx8_}qlO*lECZxD;_bll+M>gjE7<}tw)J6wt6c(~5K*)UNV{64
zdo^;UZ_U55K@})#Gx=ANfHy+!#{f6hivi*Whjf;Fn~j&cJTG2C+jv!xK*|1*M=v4X
zajx<go1vbWx6yDN@m|e*yj{0#e}7v#bLLWg-+qB`OQg;jLKjA;i2Ci^{^k+<a;(DI
za)trxE^a*eKyP*!nnxdk+qJf1#$`irAX`1>eiy&PWgJpdWO%gt@Z@Np2dbp&Zf<$%
zJJvoHmhnQ_Ww(tV{;y0LHxnTHMUq#|q)#(Xs!V*4|HUZNxa!K%mwEcLPLI}q;G7(@
zf)E=8_p;W;lBgcBjUJoT?EK*hlwi<`h;3o9O;dG(DG^d)a;)dr=}}>oqG-N|Htpf;
zn6Lr0<oo*|CD8r1NMG9qd<AJA=S@n{cy1BZ`FPBQNy&p)FUp<uW;-D7Ynaw^yeOdC
z#3G3lY6hjGB3nSI9Nes^U0|rPkhv{<dhpeDu&C}h`oIus*$4B+baWqJa)n5t$`MB2
z_?mpcDZYxH&gT6)3isEdCMhXF5}8WNF5*<V2rt~$NM#JpFX@&<#EvpdcGQ>Lvy^D6
zM5L#0*}eehwkT4|Ik8j^PV4jp^U2C0JGN#0u$PL~DtNg*4CS2mcsXv6*z=`i=+`+9
zZK>Cw7+qitKbN7QX=in~?^R5*0Jq5ltdX6Q^bH?itMI6m7`SfvZrA-hQ5^O?WF6ek
zaF1RCo;IJAjP&BPaDbF|6X1DVu+`K>Gca$rfw^fp4~YclJ5rSNloT?;VMigwfTI3D
z+3G+Q*kQ#a``&hlR!~zn?Hs`QpLz)4dq9PkCO3iXU$H{tQKDj#@=3Sy7h2YdRqwt>
z7tqQ;NA2-iml-qlpT$6p&-o-z&Ju=HcY(`OM_s67OU7aJG4yP04xyidtK}zZF|Fd~
z{@bc`xj@M(yUIT>YBWf?^5O^(gZPk+FcL}jHeV^W&H1K*JRKgpL1n3yS-;|^U(h(9
zHmAN40a#Wn^b7g3vYV<e{=F^JdmosVQo2QB#&LXXM|xYTw4)4V!vJ+YMWCHI$<&mC
zIDp?V%S)$+pF9ha?tlzX-)<oVrsOh|QzgUlS;|mA7C?wP2!iyoDE%cPQB8l5zf35-
zeAYrRm|qxmrLEgxnU-eMNd#NrwK@fg-@iio(i5q;wtpcPo9z`w&A$NckQL1$$WU5l
z|A~70!|E;ROzJPu%@0sH{gbkov|bSqpLX8q(~*~b;<ayBy{NTi2{tvKHll%O*7rBq
zbf&(}tS?1E#*fA^LP?s6!tDyZ`ctAn4}4OKpUnAa->xB;%_;h^b*YwMjEcvHQiytR
z;in@bf{c9mZ;(y%GD5Fycvg!>p%*>$%CRM8ix2rBCjzK_(~=YEEDuuFb;Y~SPKYVW
z7P)Jr>wOEDYK4K{m}JUm@jlc<bk#}NB4fe3cnLISxz^0GCW`@%lSyrY=I1CS)6W1O
zLp<D!A}ITz7|E`{S5AK-O{vwj^D>e|Gse+&1M87L|9KPuR@j;^W87p*=J{uUIxY^X
zg8vz8ehEB8Llz`oX}t_5Y%FFm;C(;SQs1{s&7D*nK0Sk*zd2#vvsqjOEf1LFIub$%
z3N!iCJgZe1nPzr)q_V=2UYRolHSXpIwvQ!iGZ#j%Xtb`L16lC8hFGKe8AhrUQRKV-
zT+{y$*6&%J(^ovB0rnlfge;CF3_FDfIpw#~Td@kAzEjhW(m)ps0-%ijrV@WPjLv<R
zc{I<0kPxlLOZ^!FURNUwoNitVF2av;_+F16u4dQQ9khI{eV@msoR7@$h6<vc&d{=6
z#7TQ@!*bsU*(4cC+#MJHH`ioh$czxQA=Db~rG7yVaU@TtWQG+mlu>32@jUYybi>`{
zKiZ@U_67JSEuj%9V327!4#(e5Oz6SqB7<*#(r!Q~xAp)?()u8kV6eDI#&7?!SqONb
zMqNPzWWB(;0$m(8)VFu^Tv`KbkGs;BMt2m4NpD_9zHl0T;R&m9r=QCQA-m=)$f0tC
z&B&Undhs0Dxoib^p36);Pmx=-`4Rl%VZ22-aa>@C2YzJNKx~5h<;zgx-SUF$xfj8k
zGxTa4jRC&ND(aiZF&Qs4%YdtaPgoZob1f)REu1IKNaAM*ar<7z8izbVUX|vJ=Nc_z
z&*!$7J1OU@p^oPFxP*WE{T-w<na6s)-@h}N>Y+BOn)A@b8Tk!dI#?px{`|iB+F-$T
z<17#ANQFU?1lPA1$oi$D(-@n>qocS3BVQZ}v79o?e&$S;4V6-<U+X@XFU0gA2hA_g
zbIpk>zy3cF?kV+;lc_oP?r8=Kr0IMCeMPSE?Dw0-&-LO}<mWXKEQ`!SZ5ktEL(F#t
zTklpn^#SQZ<ToEkJpJO0fdmk~XYN0JiRF*>);L=3`oC$hy6ayeCXf71zk-N07Jr%S
zzodB@FV8iRH?ve$_ISqcZXri9E!=bFu40ey;YVy7q2+kh6r|COD}Dj!W~t2R-ii{3
z6i@hqREQ=y{~?~dp!^{WX5C#paWG$za*bs`JPX~@arbyliBaL7r~f=2An*M_y3sLH
z)E4sBS<9RF$fn!lZmwrly}P69bt8XtIp$G{5AtSF@JJS>Wom|s+^rxN7h)zqFB+w(
zypR!kgi6O++MOT}$Do=)!#*hA{Cr|=!AJRCM&f_r$dsz+-snQ4)~$mNo}RBxyKLhT
zxuQ4uNrMh<Xw<*3O(s9=pn8(`I2Q0H*E(sbk#z~hKm*+qcBCzMdnH$^M7|@6rR4sx
zcu*LAu750!_kRx@Fv>S>(C4hf(Csb$F1UGcGY!$EXPET8ZoFGGf8BqZ1YS@Jo~s7}
zyOFnF!#9d|W&9eiab83vbD|+6DLxp{MzIS03sK8jgyzUn;nJ*s5vrt#*F}bU;Z<S%
z&bP<&!aGux`!YV5$PuH3B!5xtVeAJP8hPZmZX%4a3xU}`F%NtnP7~Yvxi+z<5uN{`
zT4Li>w%VtTFgPC{OadAL3*8A88i4h+UlF-+*}@rowaz2bzi$<)j#hc_++pE153)~x
z@HXpTg+B{H7q<R+LC;BR0KODb<3Wt_(aQXCE5*;X5Ht4+M&tRgLYik|ibP;1;5}fa
zvk(_=3izh&K%L7z!F$T7yHx8k`^Ps0|Lfh9gwe`PsP<%5zenVJ#Xq080nEu{Sl`8J
zAYV_Q1e|qz8)9PNF*L$_J>^0Wq50$MMntnbz8A7I^h~gcFIr@tv0gl*AUv!Z!R}?d
z&`&`Exxd5dLGr(=H%s*r@mz=3<@sE5lUfpy>*C7CeK7JkwD9{`h1f#l{ipwXrA6Qb
zNo&SuEh~0DMe0sF7&MK8*`$2d3;^;59-izBWB{(>?;kD#Yw;5O<p$Pwe{E!+%t56q
zNjtX7C#dD#pDk=B%Ns>=mlQ^6@Nke-BcV$U%0pCpOd`I%yTJp5K<A*4TK=z(CRNGS
z|5X$C&u-I%Uoflwz{gfbG6bLY1>F0$wjps^yspg90gW}gflAw7+GIk%yhXZhHr9P!
z$cVUzTqKwh5FhE~)v|9~Y=E(Ol<&iqc=we7wvm4uEls#Fpm+mZz-Uzms9$-&FukD*
zC`u#?Dl0xTO46gL_gb%z<IxQL?-OysD;C%exjVp$8ye)jE$@I;3!A$DV-EHzC_+Yl
z^fE9#ROcBbS=C$qN%uJMcbdrRWR+b|Q1DN+xC`roLx~p;&-En8kmBd7pj{nNI|EA?
z%t|&)iM8i<vfR9Uo=aq-m--_uj}j1eh<g#Gm62UEsJ-atVs0gQ`^JBt8bvsx5#jf&
zoUl6MczSeSe!(yX0s2pk%(|7mh!)JJ6%__jujPTy0xk-S#QLud^<xiS<|$o8`wP2e
zsE!<rR#rGWV+lv2Qr7r?=rKpS-bLsYJ;oyYccmC5lVb{{$bL<#vLADvf4l|E%6DRK
zamdW=DSiHlfVi22K$%foPc+MWA8y+$_anb%{lRy#SMh(Lm-EQsp=mFr9nhS7vgOh@
z(dS~J?f$v?D(F7A4Zu^fd0hz~6dEl*a!h9bt|+oZR(y~fK2w=sgFmU>r*|3{x$iV>
zzI4%r4M;2fdoGx~?l+NhXKVda+2BTbvgC1?$u;$-lAo_nny<*+{q55CUcBL=KadI=
z(!t`Y<%~G7D``7=K)-o_h^s8wr2ESJhYMhlc`FzyRpdG!Xz)0DNvwJmJ{7jV%Z`u&
z8U^<G%MKz-RVGm3Pp%aj7vk;s_?fe+<MLwdHTj(gJks-Dc^)rv%kV}+4@bT9Eg|@c
zmmj+2EIHQ~|3N>>ES&Q|mPIL&ov$XZn(`(dbm3rcLY!cn{87tTrZTl1!o$CW!`nDI
zY>+9LsTSG@HLW~?bgxLub2;yrW{M_1M>{>?se8O#FZW9;>v^f}c^u>da3jq)+nl3U
zOpIc}&vrYj%b_vz{Pa@Rp*LZ>aEfu??ic70M?M3kHJa!wd5hPZ$CD+Z9_LGnd&Mi+
zgvouQMO662t(74I;9U$=R36y5X294A@|YWgb);Rh2<Yo5oc+Lm=pC)^+*S(dI%^Y*
z_D(OE5MV}#HJ-iqsr*b3pnKjTJ3&5^2abgiI;NYs5y@@ze^$s(guw6ytLts;Th|DY
z>--}o`dBwL5exp)*z<eEOB}CI&}8djWNVhJni9{9YY87X=Au1YnXaqh`{{$<36Qci
z0$qx?=&yc3jO&f#tjgPNFTi1nd8eIpfF0DcS~^d>9~zq@)@w$DyayVRGW<w;&X<-c
zMnV3vq*nVDLV$!sN<k#9i}CQ}PWfgHKQqJ1KUt_8WozBL8HU`O=i#sw?P6SO<cybA
z%-^MFy0x#1_<p_TK)hM<hyflx1b~M<1C$N=?kolr<06t{=VNE%hX=nxy1-&ZH)_v}
zMgaY|;m$D{#pHBx3DGSE<h{{_a^b6r0E5-+F8x1mMDHbLEjVskj^!?$^Rcg`sSvp%
zBm6x*F2xbYy+D)WINz&KY&8Ao%REori8G8=jIN>p>CAwQKy?0wg>LXKQQCdI&JzQL
zF<*JiVA!M?G_H+}dlIeSD!NkwC0ofi5{p~@K`^HwI8bh-1LNplpFGPpHnCgiWbcaQ
z>cIMmG+z&OqS3qx(F6QlUh-X+`{0bcwK>v37?1fLONJqq6q}%pWSkH~u;`RJ^XMRG
z8WwHEZ?@4sM1IIX`9tWc?EUNq_Dj74;tGEMJh`A@fQc&kMQiPJkWoj7Wy0_H+Z2fe
zt4eRL-JX3IJ>g>%NqRI|^sEc!#JYgpr&}0zdU6y8fLBMU<yi5DpAltNSlLkq?oP;^
zC}IBU^!RTEGC?=;5Fi)OgAocF|AcbHLQiLP^e?;;JyrIl_3j0)-$<!7RrPqPHPz4U
zAoZTZfL!B0i;0WoJLyc2vO4_CZtds3jo7`g`{$3%ziGe)uR&z~KJ+c^uBuMKMI^A=
zFZZ|9f-uCeBS44Tb_jd{8oIII^)qNY2+C~!%P>@rl#F$UtoB=QL3INDb}Q=y85u_A
z!XTvI>vynKKZP-nPOyJ6hT$I8??KWpfC$S8I9>sDZ`h_0;47@Rx|9ANYwsOT^&38n
zM@OMylT`MI3aMnDjFilfG9oFIk(F#m6v`-+O)4`QR>-KVipWT|keQXepX)y7D1E*?
z&+mC&uirntPR@DX@9Vzyy06<~#T;@=!J7RL$cmygf&TmWIC?s4TS)f@PJ4uC2Ix*q
z^15c8e-JZuEiHj5c+_L_^b@9lf}gMM*r>Uk;o;E5&&~kPHVV5VcTa$NF{IjKA|KvA
zRczWGsz1ncY>Ge}JTcM7`*J1MH^|kTxBd0VBm`URzFY+p8e&A}plcxl4sw<{9ce0t
zXs70lIHW^jHHI>v={&5lUC}rJ5+z}W*ljuT!FE+|VzE#kUjQPup~~B3X#EJhcCtGP
z-GK89Y^5rt(6XKjV~&mDHsyq9D4bVTQlzNk6E-C3aE5lv*GY-J?a<KLm;5-;e<$O#
z$Z;6uRL&Fq9oZ&dESPxASuK@4Bu$Nq_ZVmwUkmP`S>&?b?njPy7E|To!;=*?@&YA&
zhPqocW9A?0AawoSDjN&Nf`c{o{VU<w6%EzMLm#FFzHnNP)6NbEeTFj6u<H>xmm^|a
z>We**+L@xs+s~Sv3-+GLA%WtG1)7Uxlq{_<(FVtL&OEAupsVYr%3V0WqQwN7F$Jjz
z*~V;bT~HUCMR|}ZmdpK;XJ=aR!xS<dIp=$r=FIj%rZpiuW1-0<5NuQ{I$~x~IQ{kW
zu6YAb+0BY&TC{vTqsIm!KALlzA;>Rr0--JV%#&L%2}c)i7;L&Wn=(`NMTVfr6xg06
zD^8$>OPf!c?7MaJMdW3E@0GCaUleS=E@&@7Gj1StKa@>A-<*IXdw7mz>(}KanC~J4
zGkOfU%wVoyO31r3r^&v{V2+MJ3DXbCA_3>1IblmqeSxF(Bot$Hhb@YiX95Szpr!eU
zZ5jT$O)z5{TMM9u5Mb<hr&NwM_gnd{2H#TIoi1?Hlp~-sUGiEN)(%d=C={Jpf=s6v
zug}BcXRki;_d#whV{rl$QB{O#Y=q7jWV=Od&1iY82SMp`dcw@EFpi2cvO!MI*Sv0a
z$4k-1x_O}4#j1OXX$vmdm=*H15x#E55qrc5Mq~%YfMde`^ar9C5C8qGeJ}OmLK1{k
z_ZZ+6JRy=m%rIvN2LfLe(nH_Fb|lGt$JC)MLiWuM=z|aeN6+n?XFBNkz{e`9E6bC0
zW+!iO9t$o$VZ!<HPMay*tsq!zfBJ3=49XNsU}}_r;z~8-0sHlTK^yDQK<V=rltW4O
zb_icAm$&7C!`?x=E@-AGknS6K%XcX}m(T4RSU1gC&ekITy3xIbXKQa9-mEmDkYYfp
zD?vL;FW|8}t@D!KmEblURt0_!P!oE;Xgzd5Y6Fwf_a8sJ3#ifveG+x5Y<D$-$_F~N
zWWR;XMDV)*#p>}7fbuCnn)gfQKsivXruiw@_dZCge=1N<h?Cnc_T=XauASn2;TMNm
z8dpy#_X^t9q9Quwj11IqOx4~A$cw{anq9NeK&TE-=x?11ciA2~@|3I<3io$0C%|lo
z7_)2K%e@O-`9C4!`A7`<YvRqW94N~Meu#p}i|xo+cCr*OXzKkCQjKP&&7Om>vKrbP
z4QE$v$L`$;(&JH$HgldH(mte`HwQDgq!orW*xfLcBNcg%+4t)#^o98%gQx{#&f}Es
zJ#Ck#_2r-W!bZKsWy?=NiI;3wOoFd7ocnyZt4Z8jY9H@kXn=7+Y(_5bJ!RJ7eHjgW
zz`uG|hmzn#^;i3!-@ddw+hq%f(s&@H6n(lN`@ATq^86_s+YfnA&#@gM^9%Iy0+TLN
z1<s@~*PKRV8Y;G;>*l@=*{(E$9#DaC-L}#XAD@J2x|40z4ITIq9i3n2z)jFSMD`WT
zfLZSou8Gemt%;7^^C*W0H?_glaYm2?ev=>f=|iQ!CR-on)NvSQEavq{NTJ6GvPm3y
zv_9q!#T&UjUnk`+4emxB%}(Ue^gN@nE2IeJn9@kHq7-~O`D?|aaAN{djbkY=Ljprz
z#@@5tmN;z_Cd95!^wvRk2#(<<;C*Ay==cNYT_+UK<d|NCtQ!;E3Eg1;a0`tOpF?ix
zUZphcxP&RaIJcPy^OmfXmwA+XzrwkVu!b`5x1w!edQ_TNQ(6u9u~GYT+v-|IuRbrl
z4B4iO%Rf;ugSL-ez^A1)+m<p?%H?_>`2cdqq28W5ng(sIf({PL(Sb+1$?Z`71eunT
zpBSD>LW{U+Txbhhe5ihhn^>VoolHxJ-IhhWY~iN6q-ghIu=~BdDlJOfxYK3QJD=%{
z_pCVbvs0s%Ah2^@p6;gRni;`D6$K<lc0*tKI|4{oOPY_KC_BxDKi~daF|zewy!2UP
zo%$knReZgBG1c$8z_unhr+iO@(eFG|npn+UQSJ&R!NzF!ax(4qJI!SrA!e881PQik
zluoc*Ma4CgEr9~t17LA$+o2mi`|o*)l*8QHcU$*J9vyoHiJBcxizxY9Dy~ss=ZBZz
z0#Fe+!+Q`_dfs01e|nfj<nmoH=)0S23GS%{pE0_BPp?W+WA;^@Noa2TY9P?h3hY;B
zyBh|X297IQDWKR_H4cR}(4C;AmDQlOgJfj*WLwXE=i*)&?}|MJ?|1^QA!KlQgS2=n
zV=idLc?wKva2>PSRF)t=u6dU_4&`b*v09`T44m6ErQJ1x76dNO4pX}8-_B1KKFx|3
zsCB+jH?NV)$!C1j^QAIOVLvBegw{crj8HS&z1HaRB!j!F8T8-eX-8ByxeRV-mHDZ`
z!j>G%DBf-x1)fv;v$f85+Gd}F!AP8Cy=TrCy3g+Mb615?vn4a=1Ql=gLqoIP!Y-KN
zdKdyo{kP%xiDV$p=Z2p_FwzIzOQMkSj-H-`(c3M2BM)-7!U0Ltu=P2Js~2vKG;XiB
zQp5P%{eW;2cuZprU{Li?A<nj?9V$UpFppM)*6@JRCY-U0y;UeATZIkf(cly|KU5yJ
zIpdo$g!#yh*D6|W{1kW$@l$v+Ok*hQ-W^?j?ZqZ9m7>pq*@m9c#!R_TRMcy!Fu%oz
z%UFMMOkT*O?NNo0vs_w!$9f<12)0P<fFX{(xS>cmO3c=u&n$BJY`y37a8nIBav<jL
z!$bt)I^=}EDSH28TPE2)$4B5H9fr<IE2yemwvgq>?MBWp{82j?Cz^g%z09gTKjK3O
zv<E+M3D)=a`Uu4@GhG-3HaP*4TrBsvjvwncT{)zjbDeAS-bK5#GwDOp4{oHunPR6#
znB$;`k|vdtpUE0|L7^<zVKGP?uHaD4CS0LhU4t|uDrjYnzp=Su8}(Z?g(UxSq;{V?
zz@9x8v}p&|5Jl^)jNH7XCyaY=1Qs%;m=k@m3hA!WawQ{zY~ix#Uigt`F{Q}uCbz7Y
zE-0U?m(1gYVQERqFpox-N?rAk#7LHA?9CczRxtkx-3o86L(}~Q9+k+@$w|0mr5u@&
zGhe=d2YgOFn2tWV4mScuS&Y>Kl2~Xa?zUa6u#adhoeLi!Ua^AnynD`OL!jCRzG}An
z0bDKkWJVAtG7tB|RFDsg&mbFy6qD*QOtfr4)`;xAE40Q}G3r^|)2SF+c}%G^%gw~2
z<V^0(Ff&k~yWIjmL={AY9TGNYS;6Iuk2Yp?xYNZ;>Y9oOo-9myflRFIv^B*YfdvX#
zdCH8PdXii5t=5#}-;TVrp*``ol!k+#lp?XUr%c>8CpqiJbU{RawKuv)kT#AlOcP21
zl-o~#7H?^Nz!@YX720+YGLun`g{93b7B7y)?ihejPB{N~-25j<{hFSEiK&$(fE6cB
z4LD_81vTH?oJ*$+wV%~PCauM${w3u6&y_7f=P6EwPbIw0r}?7?oY;6Xg(CPUCpZ`#
zJ%qx%ivyPZaQgVesj?sL?%>7ke7g9HakSr9B<5Modw-YkCX;Fwe1iVOL5NjP#_l#g
zC5f|r`pb6n5W~`dX>`WPC0aHtGO&+8KBki2%6dh=7-1WtkUFe|ve+;jhmO3Z^sBj6
ztj))*QGG;BB2C)zJj-v#waJsSr{M-%wW?0IEw$v>)X(PnT@DQzfhR7*`36zi{z?*4
z<Yf2UDLv?ZA*l5ml#4B)oZtN>Ck{MEG&wRl?v(lydtLvExCcrMoi}>6Wj3DCQ@CYm
zH+~L0r`;N-L30ncgnar8gC#;A9;r%?FBbKf|G~uB&!U${oO}~Kd9NB*aXtNXRG!Ph
z(SD`J+`={B)J5BU0OMu8$9aIg-_)m6Zfz_Rsndsf#qHRRSI`d|1%CeCgBK6@KE$5A
z5R@R>MmN@0%YycuMF+v=3^UhCDK%eiLC~gKUA)5er`V8SaM0C`k^N9>Dt`au#xv`o
z{f=^;Q+oH3{I}q-6Mq*&S*TTNRdhGroZ!!}se5|5@O_!t_wi2e)Pa~68HZ`V)7w6t
zpba(~KowJYzLh#CHQ9le(6@B9dxxJ1R~9rRcsRjK9Tk}I!zs4{ExOe(yZs(hZh3-x
zuNMSnW0=G*r*DIJpz#{H^|J$C9#_MWGK+v7)UJ#M%^UCwKkcL(7Ju?`1T|m0J~@GI
zFh%2Ns)rT3w^MkzBQ*lE?ym0g`Q{>M*8J+J6Ka%d39d{9HFHnQ?wBOzoxjVRbfBs9
z04n2P(@{ggRgsT3Xyw?3?zF$aF)20D=4>NmT(FnOp@6P2R)&$f;C*aKHMvT%jvp^J
zp{m$Gl9h*k3vNEIlk5}q(eV<HJiaiHrps-vO5D23!|0~087OSO!qh>CB?3LU6%!SP
z(8n&9CKse{-m8kgo&mg*gJby%<c#n9IV^5^pt}e_TgFV@tGj{*Iet|7EVkA+6+LAF
zUd(k>DL-qNoYHgu=qINWpHB*;XWfSxrO?21wpAlF6T&9Oksg?rECihix~hezeY6xL
z9?KSz%b$Bcou@Ozz{Nz+jzh>*Tc$tgnMKX#C@$#O_Il~bB{FI_;jVrMlR)n5ReDr|
z?9r2s)UvSBaKr}<+eumzkEG;XF?DCAASjO@Y;ZyBaj#llx5*S=m{7hK=0U{O9*|nm
z$N;m5ZvR9x*puyEPBdvOqPoq-ESNNi4pi*a8{*SG@W3{e+9q$iB}@gxrIcDUUD=1`
zzk*w1&`|-=OOhBwC)IG3IcY&5WFP{(Ro!tlFm%jF8vG19=q8re<?`=`(eA=v@f|Z<
zK#8C<Tx`>EdetMhAAuuO8x8kf+%tY``-A+oi_2)N!J}B}87-?lX3U8AXVCfEkIEvS
zOKE%;WJN!2xMuGwdnX@B#5+3Qg?3B2p4SQSLffVM?~YB4Ps*v_iJUA=@*Ji++-i)Y
z;(%Ww)V#)69B>&meREB<Q#?Y(qbLv5cr|z;J9Vk+UtUy1jR_H`Hf-~df*&0DEXvEh
zz7WHc_aB@VOxo$eFC^9{&ro)wC;W(IB<joNo2UcE?k5lHeMhGSQ>-nZRnio8CVe#w
z&1{2k`q1+L=%Q|z8fyG>Qt>*3h_`gvP+fP|JxHJx<}cWiT!92!$23S_tT{BbCtue#
z?<&FdK{X*YtrLcdKuY;rZIk9pTg<!snb(Uhy4GuE#SO5MLM4^qgJq_8ePm|gX6*yk
z0%?Qqzcg8$QO}x51UAJh7FwXoX!>3^-C_5N-S?cU9G}5;0nekxZTp9g0cK%5Vz6ux
z7|OhPK4Wj`57D`@8p~jfkDF)qCZj?8N8hpdIiiwR<RSC!qiEJhIdmQ$A@dRHkZy>T
zX%CZLh2Zq=X`MU;<G8msH}`w<9{Yq+mx94c)`v|hgRvT)!72XkYy0j=?rApgA#O86
zQYsH7|4w$J)94~R#V5?*G+v@!u9d1=nH?Mn(AynQdTv@(e2|NiBrF}>Z_pPkw>@QX
z02GP{Ohmj}Y6f@<WE+5XPMeI76T^^}2x{0A_URPcvlq9=zW%aJBFsY;+JvJ@gCrw&
z7sC>tyu1uW4BtEG_P^qf)jAf`X;2|BKymTY@H@_tGb;NzH=)cka@y8noG%C~T_q0(
zQtoiSM)!E{0_eruPQLjC0VebuL!@&MhF1kLAF&9ZoU}~}(0yTqi-08VJbbSVQ_fXT
zkcy6rqVBW%N8vz$#AUdSAdvbBV9+kwjqCUVpn4J`4fEz~PCGi?sS7DYV~GlCA-y-M
z|2S0t!bj-p9+*mNFn|NS<aQ^|?|!;?A390mpzrVPNtxq*aU0&n&PCL7;tAOcnacP6
zp5RK#yG`tJFdL=NTW)oT^KIEQjrYR*VB3Mvk+EZ1O3YzM%k=eyb|<Z-)=fEY%ysvd
z5@vD#-hTXba(cVTV*{EJcDo(8@E2=CV#@YF$YM3j`ZTpv@}d)~I^h{o4(%Ezs_1(w
zHAjRGoO@Mz2(H#`xnPpvLE)nHZr6b3?vCdsR(^!?;7in%%yLEC3p-Dt)$&7;W>G<=
zBQDHCJ_}VM;$LABD4p9jJ6cEH<1^kfc2kCGCM-rz7Ztuhet**jH-LHfS|d1LK)mUY
zp|I=RH9ORFz4bewN(jhg6?vOsM}bP$;|2rdCaMHJJf^@)-RDwQu(=XfG;lW$g@H(U
z04%W#huI+~ZsY5Yc^CDde0f!C0|!KTP`zxcKdd1y*7DkZ)z3C&wm@UG`n_x7gPk{t
z`Or0Bf^|uqXBsCn3!d*dM6K_3TB7V8sm{lANI(>fMuPe5yvrhb;53VQss-H~B>Uh0
zgaZKPJ2$;N<B3Wci=GVO8zS)@4+*Nn?%PUk;)#t;VEt~d>6*71LsGgGTJ=wm<4$9w
ztOgcF`Ew!$Sre~*x+9?`%edjf8W3y9!G6F{FNEIBNIc*QDg|{gpftDzl3p|{s~+0_
zB6mJ^6jBn-fpX)IyHq%}YE{QCI0cn5pBG7dr}^;<Vk1z5ETSHH6RS&J%Dq5CRp<Kj
zk~SEfI)(U9d%bC9cH`KE+8*6M6d@4j#1;~7Y4eDbk7Z&oxF3IX0yQ06u8!2sc#O$<
zb7R6d6P-wxKM13->Y?WwBd<sfjq2a%n9LUlska5x`DEy&PlembL7g<9_>Fp{p|<-1
zA2nQa;nwDqaH~I&y}=>6)K4jB;WwueWmbI)=3Qcv&FuWt($CS^#&>hfa@WZ#-zus~
zTRA}=sB%rLROUvu_nZpfLcV`69YBwtR)WNs5^vxOc4{WvYXzCnTb_&7PYWkj_GQ@A
zL%mepfYip{=~`dPsD3@6=(7&xTu~cj<4kI(zBI;R3$Bk;f$yi^REv`Ga9xc4>Me8j
zmza0%&+`^<?b@LHQ;T*7a51h5Bp@VjuW1Q&vJCEuuX|gv&2q}>(ic+9{o`yIT6*Vo
z_V{fU(+<xH!~WHJiIh^X3F!q*N2l3|Yttux-KC8>WdyzZhhf?^c6Ttu(chpuVfTjA
zW2741Qohbtu8|0FAqpMF!XCFNbI5sOX~|zF#DVv_bGEqMqk<-GR-tf(r9bXya{XpZ
z0v)!BE{0iW-3Eyt@5{$)Jqju>xe&!w&Gqs%29BVaWWenwyVxfCYeeB9R?^eksL7*l
zZOM(@8E$b4F(cbXh=@&$?lW^>Qcsq+MHp`T5+q?&D$}S>G8J=)bo71Zjsb~y+5{PW
zr&n*@5$kDM_dQ)+X%}A=b22<xmxY34AnXZ29F*Mx8kSq%EssqF##UbSD?tai!8kmL
zW)EN%Dc=QSSF;aQRRxgc##X6_ZA2|M!4#-xM7Vk@E$Igi^=f*<BeXeLh8738u2xg4
zyrQ^a;6(k4%O%5?cu<M@mZt}&KJU4gNmO!y;sm~evQKR%mBuWFn%2~@kjt`nZ9J#y
zS!fy)1_wb|H@Q^P8~bgHqs{rO68@>+Gw|Lzal6uOxSgEpBlL#Hj@qoIy!0u*Hl(nD
zU9E83X3gR<U@VgKr$r}3Y9vGa=O}22W(PM!d(FU&UV)z1uvEG+CimHF5nQH`*=GCc
z$AudZXeq;JkIX>ci5^}uo$e&x4M?qH!S#p1tu#C;kf8p-tG+jv)pDnNvkCc5X43dy
zHDdeE#CaN4(m7~md?SC8aW1t0Qb$J_+lih8bcaF7^4*@zaYLWC6ym*3XmnEA@KR0J
z^gxu&EDWP<gBfdLH)UYj0)9@T3L6D$c7eyB_yor1j;7i>-uvTt^~{E1y?gg;pch8n
zlJtk|q750@6*Z56MKYN%`VD^UUbn71e_}CfXigJUP+mB^i(M_JY1d=Ca>tp_)pl<3
zSY7O#^pbB2`RT+dPEanOfo@PuC^XyE7kRB*P-EXO64P-s35t?X{0kTHM1$yI;Lg6n
zo2f4=ZH7`_Qd~NUCxkjJz`}i@hXZ)g5A%_G>C3mJ&yu4z+NBdL<mrts4J)!RuI#v9
z85MqCjcq#sbf!JK>XzQGCNdg)Hf96H0%I{M<?C`*fWQurI`+b*1h$WMysv#E*DR4%
zT3VYJ5{PbHNzdKmN4wSw!1)CxX-`6~S1wTr(^jamT=30ZQdLSjk}IDuh5j2TSHIn2
zs7JzvjxT9k@#SLu7N-hLE0W+`nK6$74z|^Po6M)hL)7CL=AvFo&!ol%+UV>Sm^reB
zsw4`D(DADnr96F*I!tP#_SC!{bqnCdWkqURQ_SOUcU)K9eO6e%Pz|T-!l3z$rkz*e
zu0hn?29`3`MqVa#4uYmZ4S8SZ?kFZ_bO3S#t`~JAMdz{60?saz{$O0;bA3LiNOcn`
zeh$ZftT=Kj>~I%llgA1RCLeUr*)|c#BF+(eP=%wKs3%S<l?<=+7jc~$R3}GWfA~d1
zfw0_9$=%;9+KqbHsqg)9hrMqc6(>YsDrP1Mh2i{7MVre^{+;eRPI21lF1FZ0hj;j7
zJ+^BVyxa?*z(}cIP$zrORph+w8f$Tp-TTq}DPPC{1F6$RS!#M8<E;E6UjlWLpLBg<
zeAMaYfBU=H<O$3FS|@-9%6f45!irhpqVf4-nI6T@#2-J=A#Z?C;hJF8O0EmrJ-4bB
zLzcVL1P1{*D}i*`2)7M1$)R%<BPC;{7v)%QQzxI5Jbx@W-75V6v!F}UqYt~Uwt4Mt
z<q30Kz9yS~Mu)uzk5PgK!{Y%Q@+eQ8ak}GN)DIqX#z>rUIe&p!gNjsxTprRNmZz;1
z*p&_N5LfO5&OH}pQw~nNR;Z`S(&O!)^1i5BTs-DxE|Ng&cHGBQV%dZ)-rF))Iw{3M
za*1@e=ys2Gu9x=AY#kH@SCUct=Q=MIeZ;2IC95et=d-XFtt^`vO2|?(Ab%3!Df;8N
zKpln19)0I`>Ita53ZCpL23_)k7JV)~Q8Kndn?$E}_x+Q!VDN<xUyxv-<!B`G+L~P8
zU*;Kv)%Rj$xgn3;SFw+J+}~-cQ-kV6cU_2&_PZSfB}EGhYkBco;*(#SNuF;Bjy#6Z
zq6!x_9O^8wDVR+ZNekMdQT6_j#Q%UFX>@1O4*T;fg*jKY9XNq0Ez-!o?CpMshL5}P
zL9Pl$s*HRRwMD}Q*x>3*>F-B1wxE4?lh3Is+^ARK7KV!G2F6ydy)$__$&V-K)8f2^
z=}@om|3p^FDh;3Z_+vLV1A#|gD<dn#x`9LIPg_V^WYj#lDtJ7g=F_qGqR*06TQlM}
zKm27|MQKSy_X?UjaR%Rje3ynV=X)0eC4;QyvzX+wrx-rXDTof`HsmCfE4%$`dmC_5
zbk`KNJ$<sXPC9hu%bu{A#<oI!$DrzVxwMA!kp)z@Ht;PE3w@@!;UhVe8=5^<KHRZ!
zVHJF+a<Q!oL~b*I%SGeXOyECuh6EB}$QPS(K*q|YgI?eLdw|Y>=1CfA^OCcj9O|m<
zL8s^)_k1NZJ!0dX5y^t2q=;7GU)VyVA%&h2Rv$foPBr%LK}<M9Oz-mQ>4og?3L8Sb
z6qwjcmB^hM3Y%hX+w0Nrc&8E-10~Aou)jEj&R6}1U5RkwId_EiTG<iX?i6eO15tB|
zy<Vrkv@V=+!(g-XI{3M}ChBCXG^4KG*1(r$v6g5}y~z*wxgqOUg;LGR^m7kp)uzj?
zG(xUxZ%kE+kABCxtyJLG7mot-;-xsJ&AT6Ns0?6NR@ROQ^>!SM5fs&0ex~oWe|5b5
zklw;PsD3)t+XQn5;Qi@%Cki?7omV_PkF27;!2Y|v5G1DJ?g`_M)Z(|U&Vtc@c?;8Q
zwfDB?d%+E-ThG4@Sm_+YXql4)7poNuePGnItg;>D^YWHR)eg)PEZ?dVYwSI@i=a`1
z4<HZYu%Z0_#M=wn;MWza_q#yDQz#E_z|YDdBHM&|d}Uj|%IwpDkNZg{wz##e&Pt4z
zvTv{*yzO_=;bvV}nbe(?`Cl`Jom!Sp@0JO4f0=&m^g#CBRU*j;(SW?&lk%Z|2|c0{
z4hv&ef{rY3k3U+6aOf5Z-jLkZ&C2prC?WD__jm;(y-sb-a^Uff8Lm{A9~GV|Togay
zOm^-<A5XhL+yGuh07l0lD0?6IH#QLrPr^M_+8%#jg6TnrAQ!JPQAv(UnQI)ql_cu1
zavg^MLixQP;zyYFOkhhxerUg4J8U_RbQ=HvWZ%8q@l2w$b8}m|V?$N>F8pS593>sO
zlAsS;@={k}_pV~~+lO?2^sC$<B6A<IlRoDz-~Ii5BLAvqV$gkK(J$FYQ<_sQ6RR0W
z;;xe(8%$X;Pxl()W-koevOkqY_1i+nl;MTver_Ct1akwpj07|%@g&|mwhoo}kJvL5
zOT&h%wskfsY@;8B_>J0rzng6tO0}D0^+Q$;myoY*kxv;~&IQnQQ|n~u|KlSC>WO$8
zB4_sh@(`fnzXLGVVEEs>8ta8pn&Z9Hs0;f4DEoiG3k2Ns|MGjbU?UI|T1hc)LZ8|H
z?WUl!?*ASr1Q1s^C|!)&_DBEe4*B0~e{EB6B7n#>|CdPk|C2mi-X*Jx9;-D=HLFW%
zvI*>4*A(J^G0e6XB!}w~5B?UFHPG=~b?{V|;p)P}`F@$z!Mlc}ad3)-Cv)G9qyIai
z2CD%_rV%nP>))cTKziOLh~-wPG>CP&v6ekC?)#tU<3f$$T-BRiB8jLZCpoQVVv@kj
zv-Uy-ih;3&Vz|7@X}Y&dE;S=zF8P0Ac5d}&JH?yqi-j^P!*3sHDAL^96t=hV(Mj%_
z$sGw7mA<gWoJiWLv2&|WAuGRTifU+x>}{ow>dDC#_d~xYRqmDDNW;y0pJQfq+F*6Y
zYQ*Jvc2nU@<mh*|!t>_`n`hm&jSaj1ENohMHm2aYfrK=h25ua#KBLD+LW=Qcr?xgy
zA#>d&rEynEcnc|poSF@5bcl51os@9YhYugVD1S`G{|!C3?gRa4xT`(5)%Z00EjgnE
zZc|KT>JLa88qEx&VZ)*y$B-p@)ieuej}2P%Q?7p$UVy%LpaUpc1N8)9-Q)Y%x<tCS
z2}u&(p}qsx7k4L-hr3NPd_<V$9}oM`QjhO#xE{V09!M2r>(X47%8@1fjxDa_KKH4*
zxw*NivUgJRZTk2<uLB+4A0J3cD(%qbaFqw^Swc>RaXs#9ZAN%J{u(KhfxSjPoL8v-
z_U-KVzRR+IKTALay&~-YsLKT&wEs8QH@Ix*GAS2lf~myannOGP!WD7D>z1k7&(_sL
z!47!9J)+3kGN-&-jYd&xZfc=6=RJ1a@7A7F;-!AheSA{fcrfR)HwFH$xIF;2Gsa#5
zy{ebVjA`62ReB=ZEF>gkUc#RHUjSs&gfGM!?fZ6{>}uu+qY_~wOLEwfipO}0$kD+n
z=Uy1So!W(Qy_K_j69@j!f5D^9kL#nNR8>}fqHu>X?8*wyDZ(<;0}6l;CaGDdpL$|-
z0KEn_dEtNa41yb*PS!nL+D9Gasg};V`3^7s4cSb<3d@`u&{FS#Etq4o?;$=UF1_|4
z^j!RWc4<m#C#m$M4h=iPvlwRv>hVGC+GPZ0upYLAS2S!30Y!h|pg}iYJ6y3O>1&X8
z868>?*Rq8iMjX;?K%c);kk4(cX*u!NUJ392xUZJK3VMva2Dw#~m8*^xrBz)$>_}i1
zPwXBNk~m4F3_K}|$oHLl#T5_~B)Z_g3iq>KT`ExPnDWu#{l}{OoV9zqL>w%1E-_lL
z6Tio82ej?Mv-^%nPtTg#<Zbf*K>;8oz5#yqv_m?Cxc^OTAUhkh4Dg7*N<zY_wu!KD
zbp_lUGj9ttX6>d?FWZ*GLxgz)?jAY$++FYXc*1+JIN&78H)4s$0THQy2rWy+^xFi|
znXSz89EuELe=7F}kdh_%eV=u|Dq+3&NPwrYO(zYw1<Q_>%G=LFl({NRHkMI8E0{JG
z%EQCc*ZjD39a+O^D>u1z*YwmZwT&<%1OnJY{2<FsF}YT-_FXKt#1u1di;M8pCXQPE
zHoA=puX$yq*UMKSKs?pgkDfr-aYoRzY6>7|E+A+M4)egq5K8RXufYLp6%X6j^G6z8
z^0D@Ne{;w>)B|snC?hmt(6?Jv;v4gtg>Mwdu(p(LT_eYUZ2Y|G!<1HUm;4RuSJ_2c
zE#nvz6qE#0A;c2|ZbzLW>=!pg321e+PQMF)_yC9RZF>+-O?bNy74GMR{yV8jto@et
z@HAiPvT?$=CCoFo^$@Y&paC)#SYTBRTitOFIK;PW8&UqI42*`VZ{4_g!NTI%*z#~N
zfqbwRY&%&W<=mXun^Cpf(*(wM24EWOI@2TJN1R{_m?;X5{X=4)OkI*>uXp(!flW$0
z0h^n)K662co=fud3`)ZNe#@T^I7uLYQ>gwL$R}YD)dI_O1oBG+vS`UTVZlGoz_yh*
zg161-+ka8myW)35Soj{WjMn|Ia2_H?;$>jfWt#>(hI*;tqH0_(Edm5d@^7rchx?vE
zx7Yq6qzS5x?il*79Nk%@*Uo}=tY7UIt+{<fcq7qH{k4cXAIP(;<?Ro|g~{MmdrNr2
zNnz<B4&cp)F*!uOUwaE4fI^B=8+{EL$(9Bi{lI~t>V|g0ve-O8+Q&kV%W5E7r-4&L
zuM{Xh_zPnE9!e@PV1$la8xi&t9}n9lYu{0ixR?Q!NPV65b&dJfZ8?6GwAr&pYdUs^
zxvlN1)r~{<h)b(qgMD~e2M@v~<CS1B?G}}*L>XI;G@+yLySL3l5rpUXfUIg_v(?Xl
z9n_WJtJ79dC-F?P9>+k*e59T}Y&$_f0=+&Exr@cb88)4KAC?HaUayOP4fZewrt;ip
zJfI(rC>lU^j_mhJI7*5`#j)Ss>{GPYHVfJBLja)?Ok+sYM<0a0+B?`8I#D`0y88C)
zEh|$8V_nwRGzJ5O?dWdPP_v$%9&_ON`u#<!e<)Hy4LG~yYsp7M=Qv=%r;~#YR0%)?
z=zJM;2G19mRF2wRqY=6xY9u<B9Mf;EQ6`?BG@Vph!!2|E{Y~w2X%1_ACJrRDr~T!C
z)*ZM5J2l+lAWL}f)UkN&*A-D1H_yg(dz1zvGsUdsNMwf;fJ>>+jR&#AFVMoFrq8QF
z@h{an2aof99rn%OAIs1F&6}axfZAsgrX7R@O4tDUO^;I55CGxa;6*$lvcvKHwM&yH
zCiT5tju(oQ6Kw;+p)LqP2w}5-$?Yv*XA%0#H;G%bL&ocgMoLm1#q14}KRiOOQ!<fX
z*3KfrNszQ#;BASx+Hr!piGKiS&2GDX2fQfQgAbjV`%UcYq-(7gWeL!tja??sLh7eO
zj)<lPw4n?MYa}2$UI7~gzul%tz+e`GZ`>(h6M5e?G60sm0D<(bH<g13f7_7?kErKu
zL$+#x@(`=vz2#7q$#X3r*wYHHYzg%K!wBf1M8LV^Rw6_=4^VzLcB1_m@d5a97mkqs
z=E&b4a39GJ6(=6ePWJPoU=S0?e9uUP-ky`Kt3KetRm6uz9q{~Qq8;b&$9{jn=^_<t
z+qxgASa`D;Jl69CQ4046sJ_*wne7jTe&d=Sex(_ao<&XAMLZR(?}lx_bCW$IcsUlm
z|8k@;D{wBU$9zpKEFrB|kEe(nQtp(Q$9})z&j+pJK!$^Q=0x8E%L;t8;F*jM00#EG
z0B06-G1K7#(c$4IVEn?BC6}dhuKRW29~B}a{8<Ze5H`)Trni@C2ay2S{ebi<c_WM}
zu;uSS!mXWczW>-VI9E&GOHv#_5}XvPV+h##%aybOE!3uUfanDL^-?y?Wfw|$@Lq}B
z{eNIBB9r4Gi1#G5`S}&xB6L(!{Ie=qT%h~`BKEFu0SO*g=V-w`?m~!T8oE$U;O<6R
zUr|g#*`4Ky#1e=LvKh?||Je^eKefy!u=}EC9wyO(YpcMD`9yS{`#?xV1doU!iN@vv
zMce)JVu@VA-9_eJvrbvqQkr-?2#QyWeX)WMsCJR8`uX|kZxB9@@pt8eDNJ_=`^V8D
z)nOE(jMTk0n5dL@2Xr^$Ic-NeHtm{P72o-;8qE%mtV~hyKTPSo#l$1FhUII7bxG3v
zbfsjY*HEFv8Bj&&C!xoHA+{En1j4S8JwdaP9r;3we;ou!Ej}*4iuMY<OJN~0G(s27
zgZ$8Khwh+X80$W|@eeC2!KS;S){GP%SO~Q*j+INWggOnb&#*Mg<B`?MrB)VuhGvUT
z1axntHru{M)bY7<&4w2aCx1(=5th`(<jk(>!;Qao)gO-YRGS>5I&ZhD_VN(Xa;`H!
zxb@BqhJ5N2+v3U20iD-LhWQJB?48gq+y#v5t45cl4x#A6!;}P^2;D%&kA(&A#++6H
z-+Lob@|bWxSxC-58B?e){6pm9ytO032GvddDRYmVYX)Y%RuaA|F#=xSG@OQy3FdBt
zW28uQj0UvXr2Sl=w>RnK%a`X1Y-N7a0vs>e%00yrVq#+b%8}2<=*Kk*P35DmoFxG8
zmudoiwTVFYb&tcV2WG-Etbm_x-Tq1R__6WIz+jhJ_S<m-v-u%g1Lr8$*+l~1f#y~t
zZRVGVN&R)`_VqsTRMFNk5QWqF&)3XODspwgVzRaaIQ07Q2EZ?L5;5)(f^)PgL`6&#
z7y@88aD1OLl>e~XT^}ae6_VZ|qO*gqg5H3rHL3Zcfg8T(lzJm>kizw;)tJbLadv4I
zF6VF%cKn<8=z|@FNzh=Vz}=8p-;BOtt+9zEB4FC|!0X)^x^#dEN*;jm#WvyH|5$*N
z(JZ4|MCa0&BuSiA(DKfSx@|-@rw)Gz2IS4RHegiT(3VocvWb%q-4Bt(x$_PEw*G&B
z!8%|f)8?wQZFtp6Dkk(Boen(nCLqxa>|oCzUJlNYYR1J5>g?4cVWndJn??jIRg*Sf
z%;(|d)g%3pGF9)YQhdHXOOMD{xCC%1=q(kiP{c(>T{(HmPPBfPLP=5_gKz%1G`ywx
z0}tWdmAt|I4wb4+CPDp?;o$>%mwvgXU1K1=+2|<nz<Zk`h{=_>gZBrq#5pswutd%7
zytR8gB63yV)Un1z*y>*jHC_<ND3{~1|GB0^ECeMZ{3O$t-ifct(Oq>=h~9IzKp|r9
zkwtpg1Uk*oK489HlH@e%{z&kb=4C75rq&_BfwA&Tr}?z*c~KJ93cJR6^H<mL1XFhd
z;Tsxz$pDUo(RtElkryr$b5+wPqb1|&2CtJedj|F7=QG$#;g{MGjdeAIM(I2Cpti%R
zS$Ac|u6cr>xsDrUaK3&;5^v!NBzoF>nGU!gA!-34<ND3=2B7@oQ-R^7C)00<RC%h+
zJX}aA9ORDIYFy4ffCom%0%zfQ!`N81()3%Dk>nqaBBL`~k$R-*$-6k6H@g4ZK^0<L
z%vPHT99`or4g|(JkS$E}wCR(?^@+S%q<Kd(o-#UVb_t<E-3s@!hT&zTyX%->u7C47
zPda$n;L*I?tr-5fL&Z4*8!@rmgqOw-0rj*yFARf~#Zr+jizf<zwq`B%&UKy?B47C8
zNEK>D*j_f{_;TLYR_lpBI2WhPl1NNzygi>tiVf@^s)V)hU&b4gdc=#~U+V=Rm1byF
z{{&RU7LzS#^j&WtiL)5&aAz4Linf_xZAO0VSWWECKZw92Vp_B(A_ThX^$RP;n0|wc
zQWg&1E?q^QH5I8{SY>tVh|osq>f&!|)oRnRZzO(*;}@y*z@6!gDxB-^FqJ}?3IZeZ
zWnv4Iy3<o}$I0R>#w#X@FLk)B<#m363)LHDyBi33WnXQEyw&;p8k=1sYeqgxm^Kmh
z!7@!{5u0FJJ*!UP9}AxxiA`%_YbA4i#mY5KQk*Ka?JF@D&4R4qgg)L~QN*1O8oaXN
zBeD2tM&EGw=u1YXL&en8hBG|fuJrAM-!Je|$2|*(<L>rwOU?1fa3)%GMAySO3o~Bn
ztS0_0y61xB*_zeE9@;Ln&(^Xgyr3JJ@wy?9v~W-T;b)#|X(D-U&PhEp`TrGW_*qFM
zneOg-h7f-$%0s#2Q>DY6nkprxy?UlZ{FelScXkV7iM*>&gZPIR^KUVX+h9&K#~qyf
z<LR#<H8~AC^WKH#r=QH#?UPxrCP<3&rG>wOBL<ip0?0IrrE-m7rK(f$+J0?sc5f5t
z>iIeHMxvWQskLPYL@EOT2)<BtKrfwF;$owVf9~zeL=(Ri(gUW~a-5Mic#3(|Xp=v0
zQpHCq?Vk6%p~n;oi+?CWSn4o!WznW)^AGKBjyI<W8m<9OZvc+RfP{%VJi+ISO{2}b
zc=2N1Wl_MHV1@4wU0<TpgI5HEmEe~<!m+s|agd<KRV!2w-}SdDWvqNqS-rSr3dyDP
z<V=%49}D)^U<|3ui9}*PXgz@O0oa+tmDknzZ@739Ajjo{Vs5akNPh^3lOi{fIVK`4
z9s5fQgrKzkxu=?)Z>8X@l*KE(Kbq+Gf_K%~tV=33FCBk<5@}0>`Qs?uqP>0-#ZA$q
zlTwLSzmc#Ow>_MkKX{c$W`ax*w&)Z%_=`lQhJvtbbwyB?;-mS8L4(9^#?6CUG~lUr
z<VM3M4MVBg*J10QTtmhlvR@e2H!Dq?uUZKZ;!cA^h1#|%BOmTDU3iVK!OiV+M`N$r
z*WA4Kk4;QzvMpXHvgk~mozm1ZBrJ=K08advrPl}9V;m`YReyNKF%Zf{R#sLuARAfm
zK{EViz{t;uvFW~v&scLQ5K@iQK?`?R_b4kX&&8O)IeMa^ri7SZb6`YR44$RejBmY|
zzlnML!=}I`FT*O+@@MGL_$AV6?Ip~WPsMR({%~k~C22K7LFVMtg5f>K&uv7)9fN&t
zJ+0FTASWkdb1Uq(9qbBOzSKG2%HTGz?+%_C{Ix@64xM(2`P`e<7!kzU9QSiL=;hhB
zu}ao?WkkruV8^y>d3dvbnbIV!ugYQ6VKTdUxNkvs&!kn~y(F0?BH=U1bbVa<7CLJF
zq~G9Wg_JkipX(f(C2IfmKFlq);90=nCf8SSy%T$5*)86%<2QSTbmU|AVys?TMTzkr
z%G{NQ{I^W;&ATRBwi2`V^vWPSMl8nSuvfl2*bV~biUtRmmS0Iq_2GT$OVpL_#@DZW
zCT2>uo)S`A$8bsqsGlD@J}GhHgJ|6pkx_~uyqNOQKzZxrIBdtkyqlDBy}OIuB`%iX
z*#NR}%|g~w{GfRC)+yfK!U^Qk1BSIK)5m*V^`y;|;t-%`+HYbQ{}4E`O_7R_*BdCt
zPTq?yYMSZP3BoV@U)@+K0LdVoj}GWdp;K&#V=2L4e&0c7C;4zQA#t+xI9t&}K{zTX
z6f^uUK>!J|1xfBTwxvPBltov$$?}!9>n56U8_9?LYYDSIJhcu697^{znCGUWbvIHE
zXqA~#lf;2zq@m5mKWFw&kbff~IRQKb2FJ!o1iL&=71Ypklp2OmV6eHGVZ}Vx+hRKQ
z_2&tFxwdO^>&0Xp1O)LHvIlHF3SmKP)kqgU9G;O!DsAVM9&<VkyqU8vhA8y#d;6__
z{`<iJvIi-Xqk5=Vw7?~#!w~1b*>zV<W0Fe!Ho`0ZxqlMUr;tI|TYV-1Un&$3dInxu
zp8w+3@PE3wzwvPs)o`NIk&mV>fV*B{Kb`5NmgB6pK_WKoOi)DG9|zz!wtjy|ehO&Y
zpWB`Q4};TiUE6g5biplOVhv%Tc*i>bVeKjUGi<4nvn}W>-T8v>$rbmimz{L`x}z5g
z%iU+0p36MQ4!AP9h3UX9%I4&+qM>EX1SdWVb~j;$kcI8U=xR?4>L4fIt(9l@f5~x}
zzLI4$Ty31&x>NIK`+39nBd<(Ar}GIVkE_cpj+h3g#!c@0p2Lh(Yc>kV%Z})A0cLM;
z2206ab#;|2+j2xxuth#Qrq7Tu@_CK>eu;|}cH2y;0@gyLwXIAk8f8$Q>Y!GMWmAs5
zx{%315HQ>fX*<TJyu3WIQ^1#sLeVHCYH<XG{Oc<wTe0zt!UoHfNU3jNNj!zbJlO~9
zDi|ml-0cx%h)pZ&-hUrJgunBpmuQ)By9W-MhIw5MR%MUM^AeFg{M0Qgkym;^^#11d
zRoClmHJwj6KD$U?9chxfv{KM9SZuR=@Z*fgzRU{&B7D5diHF3*ns20YNVl=1*WdPx
zKv=^<936}II|tIRX>6*`84#m2?XSbLOv8!z#@X5%B)^kL^cqI|;ky`chXE$~b69GA
z?fe#{-0zRx)X3#7f1Y-Xk8=y?9)mNy@kvqqtB=Ezgb`C60(nRn@tp_K#rGo5zWP(d
z7>z+-KmWYkg$2ZiLbSLu-($ml)k*_QB-XU0J$3l-H0B0@PqEXOL@$jIX*T^r2E6z<
zLlttcU7ZSjUCUKF20oi8Z&KZI7f~yIMMd$V+s?-DJ%|e~fii31np$yRWbP~3;_`Yb
z+`%oOh5RQXMyg|==;F$jGOFeY6_;SB6!Qk%zlxrU7r^}#HqaEUN%B%lW13rv?`*vO
z8eb*7!dE-G?O><e)j6TF2MGkahh5rQzpLC_I`!&MC({A_Z?+1ulH4t3n%wFnB*&6N
z5w}zRgp5&_)4<Oi$D79*CsuoEw8sa3;_L0opu27`;@>((I+8k9#oJZSzpuCYLy>jF
zcC;mk3W_|=Sjg=sc&!auP}AD^oUCDr{RLe10UT0T(r`OiSDV3V<-ClT*tvylUBo(@
zz*vizU06o0$Su;5K(Y@jo*jd?9U-_I|NauB0%gXrM=XZ7y)1e$M2wdI&h&0#6KY!A
z5YxfAqr~1GY!vP^WC%l@7nM<Btc=0T;n0a(6fH71dQ_u|8OC3b`dBkxg!GXkXO=!h
zvg7dsulIpmONAGnpF<6eGRFEzrGIo93V@^W8pXiVM^K&c3TN#QK2b7~x*-Lt9;H|*
zOCadlD&?fjO=NHOsQ27{TZhL>18FsDujPt+$5LTY<AlM}^I0EYvyhb{mEP#pxF0?D
zq#?S9DrjP2f>|5Wr_?VI`-yHX^F<WoA%S2w1GdFTS+`5iq6r|LiWlpuUs~3Odzcr^
za)95V^q#b1(TEXHmiz>pbTJ<)bw^{<?gx1*{H=g+`ta_&aDBo%C<SZvM+{4n##=9W
zq5KT2LM*;OJe77aKhnd~nhC6%PK0B7+h2S2#E!v2g)Bpa1Qk8X`H>eGZO4#nq4GVy
zg!yhn_+%`kmcsjbdrczKmM*6aNqz3xKd>o^k%%#(ecAz8!kP{0#Khfkw%P+-^`kY4
zLJD%o$2|ludfLp7VsWe(nJ$@Yf`1JJ@0u8=giG%w(nll0!_D-Q{@x~7H`EERvPTb3
z;s&|+_S<e`u0t<Z&z$tcdO$vFVK_w;ahMscnPbBleOg@18N?_~-UONa10V@&4Ib&#
z$+xft;H-16&Z&<>-u5t~1p_?4BObwS<ABoB`CF<O#{I5HPE4CsBW0yMn^eB=AG9e|
z#|=?YSuZ*9<6nezB8yC$QBzet!dSO--jj0WN4uT~@84|4Eu>SD#B9vp_1IZ0eGk<g
zqOZFKs3q)1au<LCxDp3kH8!6TV8<F|!N<qb_^d7e{#whDB<<<g4fi{qPhB*&!T<QT
zb71rUxc@eR`?VC~wX^pyxTa<npON*10xN`zrj}fQsRmVuL`+kwuG{&0P^Qh#Wwem!
z3L!K#oM-F!=AU!z`@8ac57z4G@dx1b4$bDnC_97`htkbtR_c)E<}LX6UJ$-49LPiQ
z5T=iQfTxSjTECoXzL}@1haD9KaQT!sG}DN+9b^i~=gw8y5j=}j&e|on!p6<+yu7?=
znP*|Eo+TRq1)F@+Q25he5Fh2Krq8l!64jG8v_gLlrAmkP2D`}#&?lbXke1}DO?02v
zw+)eNodd4@Sa8)63iVL-Fb!`hM-|g*Oyovs4ZkF9Pu>$PZYtBUky1hr@P+bi$STa)
zJ<_2fR(;fYQDilelZ|{5Y$Mq#aMS!(IyNUSjOwwHkS;^Q5ylP|Au<`=!p_{DxvuTp
z@^b!d$3}dMg9DwJ&hqEk(tIh3uf$6H?{qEX4x!SR`4CaW47PfF*xDYps-?@eDCe^8
zNFdXPRxSQY0YZW#*}nt^_nT_Jp&L#4II6puz{#}%;0%fCW)OI%3{q)}tSl_2CU%}U
z^xTEsWvnU?#&Cn4r@tyw1qYl+!)OQWGpv%zR8>{4=a)9&nHUA<)+SE((oVNtB}U~~
zCer3EJ69vVj;##@e#HHvFcDy^xa~sb`Y|`?va)B%0D`!9^JPTWkCaoi;}f?oUt-;|
z1y5)(OgGu{xfSNsVU>MB#B92{DZ=DfQ8pqJ6s;NeofRs4cBA@YYD;mGOzP`aC}5Yw
z14=e=rx^kssLkFSi&$ot(u;`S9nzC0iT#_s<BNdXrrTWvP<V#B0wJB2ong8^RFhD2
z*VKx<@8mEjJE`T+n@UiF^%>H5Nlc%w$%v2~8_T%HVADo|ll2jWnWM`h(81u(4j=Ej
z388wI*6Dcd5?;oZrJJs4J}zG}?g!oculj8RSn^Wz3FU4j_Tc3X>BHuV_F=}0%J-o}
z3E>jG38ym3A8XT_t>6mhE7QTw(+v1|BnD2}N`k+=>FS^C5ndwZ0l&ZW@(A8o=rPE#
zS1giT`X*(^g^fprNrh&LNrXr;vR|8A6Wzh_E<gtV5SYMY+fMywG0eJT8R+lZ*qNKf
znwgB_dn;BVxWS~NE{1aHa8zpI-wS%0?pW-c?x6}BZQJ4f{scV<@4|nWf(lqYU0f7_
zcfJF?D08`a<UO$z0I<?)^F(DJ$18GZS}R>*+IP10>@9oubJy!7|5ljmRS-C1TY)t$
z@Uh&4{vxpST5h+~QL2%XK3t%SuHK(@V0XpgciRpgA}~D>5HLf){KrJgv+yihA_?#B
zqv$^Z6gz}mn_YOBG$M@wtITqCEeEu=1a@7e9sYKi0^eIwRBQ@f<Q_&?qZcgA2kRe7
zh!Y1;Gvg_theQ?gn-RA`8xR$0{Qy>b0A>U#oF>_fS5V>aZvz5R;xfexclcq09^3iX
z5R7UR(mW%U`PhWiVEaz*@&fP(ray}YdS_nd?b7VpCIWshCCbUY-Y?7!k87c}LU9*!
zZUjSZ40bZ_7cp!v*sf1V;Q=gpfS7954WIC$p@c-kRuAZiV@IMv)2fc(jlxbwMV+2=
zXmuM#CfJSamGyBS@3EWHZhMIS2z7WI7{R5sHbS$OKlET~_O=Uyef<cDbIn@@2T`6N
z^lZrl7k7uGCjMm^D8tbmo&jaaj73{Q$q@|LXJA-|4!5iL-=tE`scUIFe$59gfsLwp
z@g#tle+4B9xk57C!V?Tz0<WogyB<QQMVyCQf)(b|^$A75j~rc#F6XX)1g7GHdcT^6
zWBD7x@9?zX3&3hUyTg?7$`S&rfPjFo%e#4q`4DP8f9wo7l{U?tS0a6cvTz@2Sn!Z=
z1O<<pCnPr$Ux{C8K*Q+6eeS*QB&$_kCn;*>5;-6n#mmz(diZ`+EiJai1-}i}3ZgBq
zP~s7`RAS};Vt`)(Z7Bv5+38I?no|E+Pk``8A#x!3vE-o;{EzOQ0o#<da-0_nX)NTU
z%7Ksi`}ue_QX4HD*9B#yS_x3802heLIO3tfaR626ZFH*qVEPO&eyDkF)xYx`hf(19
zmPd~REz*Fxx3241y@jGEvXV9N`M=j+_i{je2Vli{(oPirYQ6-lmh^cz5<S(h%jSHk
z!9iY7mVY>79TnCVX@IIw(yb_~4+yX7PO$Hf)-TCu*@TSXBgw|$H7Quz9Pzn)$Po;F
z{?v%4M*LPl%cXDJNcmO5M}>~vcK?JHyA8C!mDht0kQS2O@}^&E>||1`^A+9K*Di>6
zd2XzO`JA==H1?vbxW2K8z=6o1coz@`RIG_{>}>g4Nf8OCjs-wTt^D_bGa4^Xif(Ib
zfeD?aRdcz;%a*zhjoPQO{I95Lmn20lUeN|m17y_n=;z~WWidPh6UdG@IU3*>d!&!g
zG4Cfsw~tS~rE<Sk-t<{|@n=^1^;Q~|m)ObKIS`>Nvo8rWHI=9zjxPv6bej37{BIDq
z0Ek1@lX6s2p@DFLj6Do1NkmQM0ix|N9XCD;j<28bm>@g<9R}|OZ<kpbw}&|>PanSN
z<sS6q3_Owc?3EJE)Zz+8cZ&r~Od5o1dLkyu9tVo^7mm4&$sI9<<!P)vzk|GG0~{&2
zHNF3aLkht$Vt!o#MjU$hi?&9!nkU6PI6s@;qz$h(_zt&8M&-Z-Da@IzFl(ePNqRcP
z>+`oP7@Bp~ZMv}@rc{adcu{%Gy&o*uNL6~o7*}?ODGA*#D!hfk_4pg+TOcC#fWW7%
z2kChJ!!dZ?!)5~mq!t##q@QaGO&-fl_ApW4^VtD(<CXMcbIlnSz=DBynySq56tjry
zuaLNCtGpi%WUCio0*dOG`)onIW6@;HR4K2ta14yd8%$AiyYi^d?BHVt-_;y+acm|J
zdzVOyTf|PNdJ}@PP;f1iq8BfJU8gy=Vv*tqINuXu0(tMxI9AV>2i8#L1OoMoFKjjv
zhCUpCEjwp2zJRL(6E6meCHDJB&2`-zgz*|`!k^VV7V7Qc3bUvE(zAKyFau6m&84N@
zd9>Jnch(kzbhy28TLnjUGG7+l-KL^uo|c}Yq~UggQ;7Q-jMaStyzi8WO>hQ1n)L;A
z#NDr*1E8;=1v;s1(R5}Vuu~`5-bsNS5pj-+j{d1Rmj9b|2ad<jR~NswhRMDG7bUZ<
zkh}i8$a+=F=tB;b`}obljjVH6ojY&fIBfmir0Dw`@wPBYDG>$=NTNF-Ef?XMo5OW5
zlP<0U23{qZN)4Yg2+;m<=Q*6hpL)ksbnPpxXtPPkRQs5FuKKCfUS639bg?S4-NJj;
zmEhQMZ(jN<u{<K;I2eU2P`J?eJoYdSM#FHUn<XDdh2LW#t?umUaR{ID48x1lJVLBk
z-a3y3N4X`7a<<eqhRpE0N-oiy#Mc4``E<m_@;+?uOxZ58JgAlg1JtA(S8e1q-}uEz
z4GCCpe(pAMwH~Jbh>fR8Q%Oy!!vU$Mby}BFfw5(lCxSeb9ouXa{f(5CTy{w?4EsJW
zld*Bjnw#Cc=LVb9;7K(lboUr%bXL{DPrU}lqr=<bQZ@TZ_S;qY(szrNehBuDn(koA
zq4b6ATr!8DfxF7b%lU96Ft{d~Lnz!5E=E;Y_?la!?Zu$w;t$bZhhZ8g*60NQesj7J
z-_uW2D`ZT*J+HgVo?1-qo$#`bMh%_FN`xT`H9B)V{?gO1GgtCT-~wYQG)pdzu``2G
zJtInlsqCH<>}AzH`LeXzp-#GEF))-y%m6Mz8iXk@s;1F4$DbRhcCP+<Sp0_OwO@&v
zjNUF&`ih;GT47391l(Zx+I@SYogwRdrDEM;$7~VvvV7R1<RedL$Z;y7?S~BD>Sp&E
z!GRv|6lw#6_d~A%a;g0-51w4J>7Wufao3L4zfYars-#~`<VPIx6=DKDIxA>3`VQ?(
zZviDck_9TmL0>nGGg7>x;k#F|mL1#_o}?)K>t0@g|ByIbFy*iSN;KFV<oOBAI}|p6
z!M&3ie4&nA<ea;0N@<(~$8NOG7R?sIh^TiDniRC-l5glfVbp%O{b-jbv!*zV!>NPW
zMg~vJBR`Ro!Km%|2Od<p@YX3fe(B~dB4Q6d{Eh9PVuU<E#m%?o5iB>sIk~?jE3JdG
zP4id4VK>6=;-`?3>U{TL%UpfjO6VCwx9J>`a3mOo<357Xb`vlM*|^0rw_Ti6YQj%V
z8AhSn^_)$Ety&MF!8_hI2MVX^a%RgXr21oBtXer=Bpj>>a-Vukj|)%Ueo)77H}ALi
zWXsZ*lMS*%!n+k^7QRf4Iu;k)Zk}2FO64{nHyD3&d*m0Jw4QbjPX>m^-6t~1UFC0F
z9U917sZk12pz(F{Xn4G*(N^5r>62a^(k}_h**TMDD|?<hr-~QdTI&U{wPfA61u_?q
zvX!3nwqyBi#~>F2^;##cHq8LHY+~DCJc@&UbQE9D1Pk-Nez9|PCFjoKe2qs3d2iNL
zn%ZWVoGacf?-R9yS9W*vb+~-}$t;-YK^Q1z)~vQ`*&aq&sd$VaO^oJ?Er5y;dWBWH
zo&fXzv^jknf*9GK!mVk^mO3=IeAVS)>iO+Gk&yGgtUUNZ&E2*IY)u_Z=8x?b^Y`Eg
z4zmn$`*~4w*VqGIt}{;#N1bxV=~?A*Oa0iZX72PJMwo_opOCi9BHhBcFFY2UHnZ7x
zNI$B9esp!YT)tjEo-<1Ab)8Y@mZo|vvBthcL#|}s{M>yyZE7k>#_cn6=k-=gkbdXW
zpRTSf^~)r|F|+vqB}u#NqxpNi<n6L=?{QA1*>Gk1uS15bPX*ofqCv}h;U-txk9*y8
z0);+i#x4KMYPpj3^v46e1L>e9_wsK~hf6I64Oe~`*1;X^?K?8(RmP#NJq1_#CgDvy
zzv2k#)^_C?&>hNxH{NckLpPPo<UDu%Az*%c_w|9qR-6G#4Hn~QJl1_Q_q^_nD~<HN
z{qs&WF6Td$l})&h-KdaA%^RmpS1L)8kGhl!DMRENUehy{U5A4|UQBW>xB$~LtDURM
zo%N!{^VMQZaE0(}wv;Z66`nac(dSyg7<LW~B%Zn#<jS=lhO*~v=sBANS1D%x$fj}4
zgKIABrt4ibsyJnpf?V2q>YjTn58H#~_&mVLPpwc~ZGFvA>O`E?Q5eux52MONO|5C3
zpbG{T4jC6&aSXscDyD~jcE=@uM3?9$BE4m}SuBGVn*O!wPN8c7k6%mIUL4XXjyyjZ
z*%WGjsWBqQQ_W$fP6_<;A|Um6bpWw7{m-@tdeaW%Tx)p3mouYd{FbP_zE(kPnV|c`
zovgmdt2u8xKvOFNPR}BL^6`R7?*7#2(l4r$mRFy>Jnlb!S~64PoGHw5j2j(VT`pd2
zN)~TuOUPH(V&3Uc0GBq@XU0xH3#|8jT-D;kE#4s+v;}Sr$<G2>C&FAchdqvtMcgw3
zT{o#z(WS2acMLR?aCw0-fWS|umqg`Q(y>Rn_^yezEDJ%`<_~nlt;szCpKt!}HZc4U
z)o8}-F9H{1r=nduOhVbs)Htfr(Nr{(y*JSRV@`4rowge~_8aI{<M5WHbFO>}HY)7M
z3Yz_*bD6;TFXYpbJ<K&`E}hoylTlL*NzxAGO=^-8rCG3M-=3?~Ot&V{7}4SIwB#e9
z+xBiVR@HQaVbY0%FbMpolW+19dY7sD58rSpG(I60Ne8M($p}cPSyb+ppME?BGgAh^
zh@3zi%)<WqD93>nKhxa$mrioAR))70KRXtMz&+NJ58Uv!R^ZtSS;O)#AJ&~L`kh82
zdg*_h!FYAhLOv~lu{?Aa9)ze#I~`|5`a>56=GG&5WcKLb@3KP^H%WIkGcue-es>Y(
zNVm!=Vyv$=y3N9m{ne|)tjU65Ec{-n$%vZhFDjU@>)$Tij#YLkocCDi_o$2NT<T+P
zmqf}NY`#$onp3(2)8Gv|l9dlVV`K>K(=6mAm3o=TR6JJ};PFd~cj&sxgsU{%3vLeA
zAq>uZePrkX!{?Kl3P&Aw%jr+-8-pov<{$;5pE{pD63i;|gwN+&oQKm*-MD3C(2edi
ziDwtZAeVqYlm_ioq|#DddH;^}Fblu{^&a&aQ#-5^6`zJ%+_@MIce4(hdmDF#Bn~c$
z6mT2TjEnrD;H{AmL{2fChQ=(Lqk(>3+F?L`w6Q8&FmleHUzTR)S#ag-sOCfBmbjK%
zi<evP4#RM+rQ??(G_qT*3ihYNVwwE>Zhazg=30&)&M^63EdOfgvHVEIF+KaMk|-3f
zn|c(E4uVT}t!cK}K~5`197gi&Y>er4ilW@35YETnDNhzn9#AUN%%)?=TX)G6&2D9>
zh%VF6c4~#j(>ha`#bz;q|3%qfhIO@mQKPUTD%c=cq_i~BDIy&LQql@acQ=9<G=g*r
z(xr5RNT+mncXz#W;a308d7lsGeAw6KLWSR2cg#7*9CM6kQ%m}tJ&~q0cWItjAWF#|
zDeC?IU`W8+fqGJGt@VH5vqIU0z#yNUm7Vt*)e}xLV^@+D`;mioX(7D8VMw(SS{;|o
zJ1%?ws)BopQF9EDry{Nj;2ze$A!kwsL7OQhgegD21pN1=dzUg0xX4?xBuv?jR&BpM
z*lIvdJ>z6x<T!|^ZVh=-L1*bN<kMSIbk6Lha}vg3Yq_BBOU6CJ-m1%fjD+hNwHkaR
z_xE0{g0_c!*g}5UR{CaxLYbL7fho0;tf5_hM^q3koc)}W<Ac_MX%Ju&15H*bNlnQi
z2M``jM55bi2EKf(cDx?UTI2golBLA2IDkiGx2OD=*$^r@JgS`>9Lf~V*U>JExkycf
zWIFe{Oq`Au)Dw4$@o8MGLE`h-L^K-jQdT3a<K?mH>;sVM<~*7edL#<A%wj*;e9Arb
zBrqyGP5Nayw>o8%Gw%B|d0vwK|M=?dl~+LASnwlh<^ATR*L3!Zz)aOCxqGsmU(PMi
z6_VTyRJkt!5Z!eI2~1USle<<l{fU^c<m|!*(8(iyp%FFbS7T^^sQ!`Hzz0=~MBCLg
zTL&7hlO5+1R_?ue$*1bD4DC-jmzXrmV7s*5!*<R}m$OOk!nexubc$VnDl2vmMlNM5
zk?@0q9YsWf`6JYub+~|`HF+BrE{|bk`j#U<cC?x@?n(|g1b>CNVymVqP(kkc47q<4
zc235wJL|Y9WI)W`gxrP)xzoJoCD#tEZ{E?%tyPsEi5H}wfqc)xu8J6+VWAuCg+Kx2
zfpu}$o$uP#!?xOuPDP<IsP04-SdO`R$bOSS-6VQ-i^lbBUwJ-;hUdg|jt1JR=(*jp
z8)HvS(Lj#NSC~d`J_=EFSgU?;%z6y!_kVDe|MSXsc?!2dq?^almJK6MiVjo0P4*+c
zv(hf4O0xFuH-5-^&J#evQtM?8ZzyN;J!{F!n&_pngJs6$h2Sdqe>l(ans4hYEeZz!
z14AVrbmY7dIZsiLWByTc5V7KIdf1NKdo4xWJ~4J(scK!j8)yk7piUBP+KdFqpjye&
zgP|28*~0p~q?{~7FE30>1;-^SIWBYj?#FrR$mNqp^<#g0*jcT2h__clG=1usjZ!_0
zz>{DT?t4aoU#UHL5M~pQc%$ph+&$YXhnu+NF6j_!xb<ar0gR7D=<M1&IkvG|yDzJ)
zy5{wiPBUlfq9!9@Oz`>iW{0G)y`kG$8GMce2&x&T61Z|ZU#>e+;5n@Rz?*4v_`x~6
z=3@bZrdD`19y}l#{sXDWz9J@u6CoJ8`%u?KAZ6!QHV>*ALnCQj%9Ibf|AUTCVcpTG
z(1yZZ*TJFW{>>Ad_`9U^je9X6mOCgI2Kq)(N6S}-e#43vpbLOM)XG3KIK0x`=rGaY
z>vUmm$7l90J^sZVSh1f{_)T}nBQ2hJ#HDA^r2J(1AhZn%y%wPR_f-9s@h_o*xI%$9
zB(5aQG<=US*5|Z6S1V<r)<4h1)XR<lGN7sih0YAOEUxwDG;6tS&5?BEH00aIuw<>?
zNMWmtpB;ErEH_1CGgbX4#T4sVo`77W7;WLJ?&uG%hi{cX(RS88s$nf_TmP705Z(8N
zmrHrgt2t5Nd>k0pnsiH>r0;uN;+B3uma^!uq_3`TFNxY8yDq*tkW$T{z%XJ{HXxUv
zTSg~2l?cccO{C4Zd-i)zN1xRWVyF!7s(ZeXTYQXa600PiCooFOr+`SM;f#GlqB2j3
zH%}#z&*DPD5?pO_UW_euj*aAx_UZ+{%|LD|iKGbHgSTlRL(*R^Q(<Fsk?$&(1>R6T
z@P?A2Dt@8x$XH-(y0hM!Z!dCuqIJXg(mTuijgZ#kE1F(=O^w`Jm`U^DA#$n;sfpP=
z=0yuzRmz4K=PzuPI{4|+RPL-2HNEt!iCZiVzpWs|E~)ts+}DBH`N7P_jmPdt;K?Q<
zz;+~BY#)TyUy>|l?M>!<`9L<Hhv{N7xxcB2KHf~s@DXaD<?`9J8yFa7nefeogq=M#
zZPR`xuD=<OoM@whvG;dXKUlj7hEa{dxLcL!b5j-BJZ?y@LM*-HaD}W8Ay73w+~mOY
z7R<BX*@#kPl)*1R+fAPZ@Dai4(7l|bn2ikUQvDX?xcO(Oc4593JV(W3bv<@UDzoG`
zPYyRasMOXx?y7rmyE9wK&?E&k0v6br<d3Lix7?xf$@GXTSHjC@wtQJ)bfYUgwz654
zIeu3O_uXyh>qK9-A@%z{wV&6k;fZxp>T{zPbX<p5-Ff>E!>oz@RZkP;;fWe0yzDMr
zW6db(WIc%NVOs{5b}ia*hI`Kc*Gx`%K)?A$lD=g};yst~;iRJsjih?B^6=7DmxD^$
zUr+{SP%JO^1nV0AL#baFxlGTmn!a|}eZVu8iLF4$cbU#%?6QoTLzptE2vH}di`Vq6
zE~YRlSN`~jq>zq5Bp!bSTNm4!Xn`&yn@Kgkf#LOW!Z+)zx0<hm2YMdItp%{!FlM`&
zTDp~PEP)YJmlu6FN&cXcpqXZySb6GIiig;SC1U<#*QANhnA(a~fgqoS!QCl*Vu^%}
zvNNWgvgT9az`ayE=<i8rb(m(HGxfDsY0Ry*SA!t>bfVDUT3uB&hK;*sO9BcD<bT#4
zQsRe^vk3yvska~Y??7r><9t0X?3btd;kcmAG>8j}x7u}QYgzU+Z`@k=7}wwg$q387
z!<?9UKUz|CZ{IEbk2A&Wt4x*>sV~$VmOwejt{;rvIB1K9l*ViDtOptJY$rc?>+xfd
z6oCf`&PlWT$uWNgHJ<_oMN=MHfaC5@f@QY6%NLNAhK(jM;O&MkpHLl?<YI5cByYc!
zukCsrZQg2w7mb1_TR`Ug6C;G<8mpjZ#epny4u{+u9G@KNQydk^s^24{Y1DJa-W{}P
zt5BIzpQd^j_95i%Y&IJ^neVqI`mDMCs+SG)MPuMMfvajN0L^7HHTP03|HvZ<D8)R_
zC^!@49o3;4H1su`?+HuT*Twep$czM6ggI|xhl#W?0C9YoQVN*zL3@_d5U^NX_%Hl;
z0`hBC8sDWN$bM$husQ~fW%c#8RQM}if!6<=a9iso9#aQdRC%Rb@bV$4cJ^J*NwUIf
z)z4FUhMG3sTf`Q$AeU(;M|`C-MIbjKi3CeX5kXeHJ>;eDQz&t5j0L8%U^%Og+oUJ8
zX9LjFJZpvErwG~-m7HHfhQf{n!nDjKfULc3sk56N(yVySb3;x(oR(RwLhP5_`An>t
zJ<;k*&eO3>1ole@lrlSIr|Lzz&zT5TFC2SJ9qiS$v$;{iL*fYMKoD~^IlwgEIK{%=
zQLXVMt}SCy;f!kB0-77I+*?`O)Bw4$2TAkz@-L0&%ifo?CJ>qrMSt#IXx%V>7NsA%
zv)F3cyu)0M{%|BE)aoCBm=^;;Pmgbx_hwcRKmkl;pJp4ef#PvHT=?^Ik9;X{Fe2RN
zNiPyOtfo!4jaQ<v^T&|*jC&APOc0#x-aQh$F$>mTVu(BBF?wF%`iDn{FYW5Z)E+LR
zp6o@R^d<GS>w6MP<b?^R5{KL2C3YPxp)egRW=j%BFm%O^i@1Fo-~H<Ad^j-|rk|n9
z9tm-S{s^{+b=-VwGwVpiILv5u8@?ghW}^;M=6Zr5+Ry<vX>j@B)EwmBYEjDCsvRx!
z00CR?L(bVR7oDEc%8#8Q^OT(w5?lhYHtkgSe8tR%%5s8XMaNJviz2wV_ED*OgMxH2
zxw2z&@GpMct*s_)9&^FKuOULRSND;K#xi&l+at?aJG|$hE7OVjG*{5S*Qz-f&8@eH
z<O;GYp{aOwvtPU?+QCLUvy=tN$5odSJr7Q6CmA4;O3?w@=elS;%xcjuk>7I=H{$f+
z%|1h-tAVdBBw#V?D7HVThIls_L%MzUw&Hw^x>x$g&?k>ViB~L#4pllOqJQ3SH(yDN
z+S>i9-!{#9<(%f<yiFFNcX1uuaq&xHYe&K<*vm<J5J$$Qh-}|UE0nhdG>kc=p5KoI
zg|wNb60<L^xCq<R);On4FxwRsgC0ulaPR|6?=%d~&OmV52(^>_o>s`qp1eLun{j!F
zcQ_~MtcS}PlumLpxO4(<MF&@N6Dr_B(+FAC%;1vsz)S--&hffrHoNpfLok4!gN&~s
z_cNMP^_of_QtrmTy~aVIKp-O*l|bur=9#QeSZcMM^H!>Q{8o?pBkuh+->9-2)e7=T
z%9H)flQ4ar=G)aHAEskup9_4P2N<k9twMb8cC8Qqk_2HHDqThuNZY{sg-TvEhMe@(
z1?W(;deI4t#&>Pm-oiQ(2uz>8bw4Ai0uk}gTN(#}_NQP~t3HsK`Mu_zH*szl6(6&@
z1A-eu=5AASWb3MKeq^nak{vPtduf=f(YpB`X-Mxrz5g4@ukR-wjSd}-J_rAWI`;GL
zk1lk5t8Jt$8*xb`;!=xap>h53ZAPl=q-C3tt$iWtN{3B|=GOf9@jAN2!N{yp=JiiT
zW%<pCC+o>O&Na<+CMi%=tN)`IUjevK+orqJ3&F1vPT{^7;YZ4-L@it!O`D6_kBTuK
z>hLsB)zsf)3eo3TqP$kkO&(#w=Rxec-!^WO3UH;9pn`O$4uql=p&gn78^})YY36ZW
z&?`G9v^odEaSm}Sy*uECVk@s;nkHMU8J<9Eq;Qw4j0bzZfw9)$e6)=gQBp?+ga=b^
z6PcXb5FEWdSY^`XDzw2eMnd{-;=rx36axwhpA(OrY9%>OJ4nJt&~_~JFdCE$bGwu-
zxE=5Jl(X!Ad?Rbli2%1}FXH}6UI<e6J@DaLubayhpeIjR;b%)xJ%D;PbiC*(4dvZx
z=~wAWvf>TzO*9BJ9`)y<VF;d2wCYMsSY9A=>(;$m#`8Zw`RPi~8M8FaG&dWKphdOb
zabX`h;PUv5FY>OPw?mKEG&5%X67ry0t%mCOX~>ugWXDn0|AMV42#3a*94Qx>eHPq3
zaE|m$59Qskh9OEwD(rsL{GxPTgXq^9(n`YEt{*sF{qTq>1trp9y`ILn?@zC>WXQ5)
z8RYE60B+mzTHrYxyUt3PcO~WZ+q*}1$ft3H#3ba5+;?oocXJFDWFsk1cu#{sMYrsu
zT3mB+j}%E;CDcaPg#u?DxZ<bx|GfTa5N(Kw-~emq={q?fF^>0@jk;c57k^#D491mp
zX)zL9=4<;;w-rC)AkdkjkpAn7-X@Z=GkckEViY&rSAU(>{ZRjyclQ*di@)Y<xv)Tl
zy9Ic5R(dTUtS9kFt5~(emU`b6qytmsNMq#XJf<)C3^|`1UF`uA3X|?{7?(Ws9yc$+
z=Cf6TwiAa$ThB!b2;ICYsy7LxF1xg!cStFsDt-U9=qf|5*`npOKAwXib-A5kU|<kR
zp}IUq<=gL@ap0p>>~U&IM6=hrhRR1cy!o-5K&CbSjql8F4k2>gQNp5B57(pNE*33h
z%yr60E-moKjZ2{Nh7G{>`y{XUIxuL0Uuv{h(^taSt|NS!YnY-^7^!b2W@D!Pgv=8W
zba8+Dc6sl26baVLK5v9k=^@8^9V=Q-s3d!_AB?&W`Yln7h-?B3`!tir7chPyMcE*m
z2jxI)mIP`pQoiPG*f%N<`$Ny7bt>!x33C<z+%*@lL^XmfhF$j&2AO-@aG3H&HUcXA
z_`bU##62%vIDJ#Iwk$6t#C*AMCLy?NGky+!@?l!p_+6i!XR$@%soPMx=U@!$V!v=-
zAfC*tCky|IG@s@eqxYJ;Xo;R<-H+bOK`d1e^TUq(_d>YF-3>cPK+3eKrYhoar8_RG
zc77BDJ-xdfp2XEelZ;){DNR^`p5q0@NIx+<=#F!+`l{>gCJxbzV9mzqiP@+=L`Y<<
zNFJB@bh?@sVac@!W>9>r_;&8e5t_snAM4`Q_QRf%9eGm;+a8{jg_L_b6zxU(7+m!S
z4p_QFwlj+`n0J93(PWsr<S&E$@Vvu?ne;!ifqw{b#o=wS8Z|h8Y6lso<j<4y5tuTy
znMWTC)-Z4`q(fT(I)6An-d=YYJ;!LItMG#~W%yO5VHtxOqkE<?gqn`W(Foe73UmkW
zxH7OP^UC`XHB0Uj>A_n;jKB8Lh=M~6XUwyiWx$}u67erUx;MYphK=YxPK1mXBy!R4
zk(!FjxmjpD(oolMj!lsO>!U%03vk+Elh2Rdts+;#&&G1`LOK`TPH_#As43&eXe`xj
zYUa5%@<jaWbwWH@6AzIVu_E?sz#P=SYV&M_W-nh|`CSXB+fD^~PfW$;avoDsia2g(
zM=#yk_b?@1wE}0^mU{q#??h#!-E_X*FIV8cSM5|a;*BJ3L*UeUwU#QQAU0iKu?y)E
z*7R6I`=SOzuX!mq)6sArg6m*gR@eD-pBm43B|zbRZBDI?xX!{M+%Yc9KW$I_!an;|
ze&Hjfi0R2Csx<l`?d|fTwRg~rG?nLFfO^VkQ8$lIQiY&ZkSn?kV6M2k2be-1hfIE6
zrFkp5;c6z+1BXjBpb(*6Ga|{NJ5~bg18(<!J(y{_U%0EG`+G_i+Q(7Le+RTFf_S$j
zhJkC2aR1Uox=mQ5-%Ua8+qL=OY8ZNKWi4#$O|07cr{JS3j<>UtoVoh};-MoL^Q90;
zi$vfEDQMRVo{#zsMhjr5C?a$WWUp(-B-*|~yDq@{rR$SSsgWt@cr>bQXKfy>IG6Wi
zB(7XvNxEmg*(Q{~$ViEXfb|BrVpwvohzv09M@)|pgc~;<+lJJvi77@SHoq8xh%c<N
z56({Yuuw>B_^kySKw$^gzy*xUOg=Y4UdZhT^k<FTyu*G9Fyymkq!-$W7vE6a6)>S(
zL=3QNO2%!y2%pL7c)Sr!eIcoGTh3M^GZ95>ZKdywT<yCS(;r}D5X3TQ^0NjTqo`3{
z$m2#2fbu+<_hTQ*y9H#?M2+ko903sZ@M!enBb|dC>Zok<7WPExvvbzv2dczU-Sdj>
z+G_v^TG=(oM77_Uho-rhDeiM~PpPM}LLNLC1jO{WMs6zQu72zhPN=k#E~KJ-!knL!
zgf&KN$OKCmXUa+m%~1?MiW0k=zrKA6guUy2fq&CJFHly|W?B~f%l8Ho_BX925yf{&
zT`^4!BxP*@Ks#QKTyY>z&uJGeEH(i4`6XIY5{1-lXbmNt;w;cFEUXP#sJuf(S^SD>
zaWj6y*`j^^W5({YwAYl>?Tt3lwq!n99BWlKwJmSn4??>mKoJ|B2kw)tmke`Ky>7Y$
zCT+Ga_huK?c*pgs?KLQ{_$=ueeW#OlOzx7FYVJYergsW^GtIu&o9DfP@dE?^+j(n~
zDNm86{NiU0o4!>mj`StXp9y@V;WM)B#`zJ0cEYu6>Kopsov0^kyF*#|MEZ+fiols=
zNVTIa?NPt4<1=<UYyi<>XkGX_*M?^Um#IU49Q@$9#!LWt1W7aHt=7^6*{Gu8p)kbJ
zg<GkGu_;ZzauRwiS`$eH$=eX(*b&r~vBrC^dPL&?c;CQxXi-!8Zy**(civDJ>E-kV
z{bBwkeSx;+cNXqp65k0gB{@xZkGpFL4jT-dwps+W*Q+u3`fhb;^>1x~4JmU<a@Cqz
zX-<+j=2>5Z-Pm9r<)H!EQKZ|*lG3o*e?0N%IPUf61C*N>W$vSLtTe97P&@XgDaT@p
zLB9}k<LEqCrVPq0{^~D)!{lHT*#Mf)LbZue%U(Kqo-V|Eegt|B_s)JX@YJJ&ZphP#
z*%YhC5lO=vZsc4qyGu<Ylx087=Pm(c?ya38_mPT{wz}sU*nLO6cfZ65S9OmC!J74h
z<EO6&0PWil+)CKXv8w&n+|^S&yp<uyoGh>J>*(0#sn)r@&9+DE)1kj5ZV~3pbr~bc
zL;KNyQhMC&;HTzmTZsG^7ozbq`@fXJ(<c8Nci5J~Pe?$c`V8S25&p-ry;uLdOG0`Q
z2vheT&)^~kN1<|SBu3t%cQ8C`080tTPIOl5&S*`#k#oGi(AqZwRs%z5!VwTN3v0ar
z!oCxiH2YAYj?nr8JFS<kpxh^e1)%~$O0sv(c>;l<kkZftPpGwl;b&-Y&ca6d@j|&0
zeh-tSwQR2KM)=6!8d@&ftvRGgioni^7mopz?@a=HF3dMdFaIOdm2|oQ5!A~`Bo_Lf
z9Iu=_HMSoZ=<j!cewQBZQDCB|q&c}XGg&2#DQ+jUx(3^L_^z!{4?Ao$<7FCJKHk2|
z0XZ<vSpM{_RtMDSX=NHpN`9oQrg{6MkG*;y!8KTqeVV&y3n%?^psSl-N<#Ur8sEa4
zWCC$f9lvXs-vVJJ##H46m%a=Q%8Mf|T0^z)=N;j&A&b_6B#>!B7*!EWCjOJ&BC?hs
zcBnDAo2|2!oX>Vk;UBwEN*-Rd#?l~!kSHUt_PiSaiPiAur)*<xIpD9+te=IP%?_mg
zQQQGn+iLR*XTpp>ic(A0lMd7S><vK&{rBCry9P~M=IWBTd%}l4SlAyC*xa_+vW!&#
zEq*T~c9wh)0RCg`GNAq37BWW~dZBHSvWx61B^%G&9Hu*Hf&F%OEIgSfwqF`ozbjeb
z+bkGH7~|qGW2?9~P$EgtJ?uA8drjJTi$p%g9AGeguS<Dc_&b8^C{U16Rxd)e6Pdt#
zv{kTON9?<3a|FG%0K^Ja@O4Td<yiqCr0?AiH~77i1G<xSI=q7xwN=Q$(oIPexysih
zWgG3<1EM;iwR~`=_Ps!S@^poCYTV+ua5@+RazZK;+3{o%0Gq2lYD%)r9y10G4i17N
zGLC}PUl9Z-eTw@MwOZ!Kd6~YGghi+Wygg*4r)C{j?9~w?=;;W%!&0B4rklN<--(1-
zn5Pk5!sh2huK3&cc3P16kth=*SeS}OuzXyX{sm@H$hKuFB^Y1-Sx*;)2veAR!RaN-
zS4_Fbv6Vnw91-($ir$ErX-av4iOqu~0BS|FouvqVEQ3lDD_}#)N57G5^I28a4Lgj=
zf3f`J*!xBel3GP|kIeUGBCMY5l%JbSz>M(Go`urhu5nak<A$`r10UV6R&(sL!3gt7
z;0@U^cS^sTshd&MnwNGHRcA-@y5?XlY+FVn_l8G6Fwxew+$cXeDEE$$c1tBf-Gt8W
zR}8BKA<tg`>Ord?f%=Ys_hg9b*9}A}+eeLUwO}_IMf58l+Bz&g%APl#o1@WDz7XJj
zkb*5>L+2`q(eLSuK-CDB!74Y$Mops$jjy`RwO<D{Yh)x*i-Db;Jq-OW(yJy#<SqN&
zhtmF~61e~`c@6h(fRt4vJU~A|qqQ)9B^GG-zBJFKgL&A&cj6o5uVC3YnB>#pYXt;H
zCcfJ;T_UkC;jf_5X7jULbSX&LNCctf*$r<*_2lyk%XR@a8{5!`wBXC|ST_N;bk^;V
zV3jbH1y?eQ>hk>5GRAqwQ|egWh|_`M{pb*D*=~2@ga+Jc9m}Qxq)OO?+PI9=5!IBa
zHyd%E%1C0I=Vo#$g*`BzJ0dUkhW7+6(S75T|CyRU2g3t**}k9s3#AGo^%&nF%M+m~
zf2dJ7Xr6zI$qevpz=e86*KU%Spbsq>A(uKXl{Z8Rgb*J?s`GlksELh?lJY_I349DX
zLs|U?W}ceKWzcg=1hgv8l%yXZ<-w%YXYXQq70UO!a=_rTW%LBvx8d@TrxO{w7h>qZ
zg#5%9^bC28VM~_o6^0|o?&;^M5Qaw(tTdAPyKC<Qef$b~KRqyjlt$rzieJPvso2Er
zQ890}INIvDZP!93^nw<K9$Jz4BRhnl-P}6#?;QM(hQ)8D`V~<3)E55r{yr~oE|F`v
z9eTO++_ebX(s92dnszfn!l&YIslN%5O&#IU#eayhps~geKY(QM!oOt6AEJ_9%vErO
z`f`4};5!?wVah3WA+vOw^8fkM0RL&-1vNh$oCU+_<$ru(DNVRi?-`fH|LOVYAp;F>
zCGz-DP&qCk>oAAdWR@)hlSO>Z?u}qf8oa#ML+l%5$ned%tGiW1Fxx_}52c$xpVP+Q
zmMldo<QR(9EhT@S01<JVls1DYdbwMWNpiMRrlvjAbccKJAnj3VmP@W2!Z)+C*AoEZ
zxB@(E!LZ=JyZ_g17lVqKaZkFZ7+-zKq(*LQME*S{|J0U14Gif%XhvYmKmRgi^Jw?S
z^>!pjpUdRl{_ZOQuwtv!wAKxjx0s;c40Dx;VBfvYZ-}}E?`zltl-YrQ^a}wG_}_sy
zWUu^_gt5dZfZllr!vD;G+^@zTZ0=V!p+Bq9?S2O#?AiUq8K!T}0)NUxFn*4q74KG*
zhBxhDl=d2fkRBPz{vwU@PrQ-$1b+Fxb~20*{5~iy<LIx>@$mGimvBFX*jRLX7a;18
zsO_A@c&Od1YnlO55%w5;V$Qi*ajpYH2$$hV2&xLOwhz%PBIv|Jy@bf~#Wd4n1Kkuv
z4`mt*8Fl~u@S@Aq&%eD0djU$QYkV=pMf1n9&;KZ{{m+QyXf%qni?%0Co{orFB|7<6
z1fS_pR|a6E{S?|h@#Ws`MHts(BvN|tNY9L;Vf<I>419@V;QHY2`rMd>sg6WthMqs0
z4DB<VhD7n=*BLNfgv|y~6``CCZLdRy$W*#hx*~xw-3R;zEtMT8I%yF4hV2J4JQqG>
zzQase90qtWk@*gtZ1I1bpNh{g<*EIjo*W=F4X{z=hE?^6rPN*E)zBd=n>qM3OZUdW
zU|a&J^m}#+U3fpitM|ykndW=Ldl%u&6zEU?omi2whiB=3gmUK3^>s=n-rQEgby2$P
zFvB?_f>0wAE&!uNs|a$u;-cDWR1YKuR;e<;Vm2VFkTCR6;m@l-i*kZo@f3gGjqJa<
z%t6SzbOHR7L_CqrU*eiKG3WxDu-P)*6B+ogNUST?1lh3vl^^`?xIIL)ZDA-<rUV_+
zP(y<VoF+g1)17c>M+iSw3*r%HTxd}dyaM5^CBQz)Us6S=GaUdSheQGBVAYl_hK_}B
z_(Kjf<l$Edy%>clPu6=0&&NSBx#RJ_^T_+c3EuC9x%ursmnbhH|I+YXQVRx<S5V5~
zZtg(1bQS<O<Sj13$&*09ft7P}F1%;qZ(jO_o&cxr{~SJp*DR#~ue@lv#PZK(nMc^t
zzK(gKD5%HIE(l9*?rXqPQG@_g1a_Q>iau=P9}#AdCXv)#__qxP@$Z0~OaxzI{J{CA
zAk6!}L%GO$#>~PK2#nm{06O^%6od}dzE(>R4TBFO@px9f_e*C5=<Zd~g9^w)7kTpX
zs#mzIkFsbOU?%wgP2k2^4ULFMTQ^JpJ<Ha~2XFP(fWarY+z@fNHw8uiZee;vOZjpA
z{OP>#=|@9DK3ea2^WO`6gb251CJw`V4h*769&HbL{JmBwkqg^JA_w))7IVo%0mpL@
z7-Hy9SNKu;^S_BsgavubodbFkf)EH9m7L@LykL;W{ipQzdo{s)?QFEa!Dro5{SnwT
zs18LHW{3#565Qf8)W~Q29oIff!Qg29@d+kv{pY`*2~8Yk7j5pG{w)xXfH9Q(M0uEL
zHG%W>egSILzk$*bbVcSa$36Y~11}-6=D%t3@V#Ge;c>6qAUbZmqPnmb7wI&<fS+D0
z69dD`bASI4JV7D^L$0yGzj{?tm*5GW83c&@eu50-3C7JB;0d&lkw}Vv|MoL7mE{($
z)$sR2A*G<1OyTd&;%5LU5}A}=$3iL~3tz_IXe#~vo2b)>PMX8tw*k_`?4xl+4uRJF
z{m5^KLAXek{f7Fa$QC0;->u>Ual?z>@f}Y;*6GVmdI!%)lh;S|=k?`c>o}d0sz}W9
zb$N%Z5pOkoc*?vlK~LUP{6gN~QzKp&D<@us&LQcL<Ia$7ZkK#`>(cV?S(QLYMv@1E
zl>QS<hkqY{h(G+k7-j!&G4=&M3XnltJAjPXQD6@Dy|6mqAnlq22fenNlgJ1t97{ZS
zpYx0H|InKz;D)Tpo<6$EJiJp`W6AW7e=t^L=(y?0zsxCeZRG=j(=L~t*>C0cBp+V$
z3T?QmsxOnY;i^kTa_9XKDd&v@!NlP$Mx)4R)Zb&0l*mulWleu~#zh*opBLXy2uReE
zngG_82m?39`-n^O=_riq))=DK1|OFhK5oiV($?RlVM4Yf{^_#+-?!xSeUm1J+EW^S
zhc!aQnrmWgjS!MF`u$qFV5j~cuu6piB7Of`lQs!h^FRde^Z-_MMn%Or9x%pe+cZ7f
zECyplxXYs6f^T?r&+`C9_N#U2f@;Uc$g2Q}c7WN2+B0ga&~MB>9R&ivs5<Z~+L2Dm
zZs<6=qz#`nUO^bI04zCW(f6R=v#0O?ep#6hIAa^1{AO3BAlIX}W6mWw*f1W;$YD<m
zN-8p=kT2LERNTbOVVE^&ms3y;+*r0#w?g=mGLqcPG|<OCrEwr^V3}R+X$Ie9R<EvL
z(b1$hA5ml;W-}trRY7*W9V7=}-~X0q`F-kBs6c|=>$gJFJZ*2o@3$%e+f*;BElA`|
zMFo6$K`>PlG?Yk9CO-)!oV&*MqNtd;`#CWJF(1vBwI7whO)iOSiZo`Uvb|~DIByow
zmwQl`=dQIv_q87lMHkhB>>D6;iAc(0rvyREsA#!p!GlF&<Np5&J)IuM7s7DVp186g
zJ}0mWSuot%vKvK!ghPR9p3_Ts8riXKusLj8eqQ6G#xtwC-?8y`@U617813dD(jP;j
zfITG=)0MO%E!&}!6bN}ksbFsOuJw=TK{wjP><VB+1j$1{Q!GVn$s}MRO(j~W1J4B!
z(E6-j-t;%_5SGK%aH3N9eGmN`Oa0$r)w?LbNif+57?$lwJH>j^nYZZbZq9;jfCP&O
zOiMlKhnc;5flf>)KQRkdD=&8>gd1__7mOS&TkC?OD1JAS*OM4Oc>H&Rw*vmoyU)Hw
z++CjGHZphNwsl>URsDTbQ9!Rds*5WU1}9LOTCk9_;s9r5eC5+!%%su^fJlN7cC_vX
zB*jB!vU08K&zS&WGTypm0?(GV*-_$iJ(BwLp&jHLyyUXhrX~6HIA(~^0craN$}71O
zKyC!oY`X)RKLf3x7KcnPCKGdNd_my}95(Vq5($@k@UA*_vS|X(xVXT%zTzgA2qU;M
zFQ7T`6g)u}ErOBkstDSM`|SMj2MAZX9qB9AeAhBfYeN7N1O<xf!F`;Hre+9a0PY>R
z?Q5}N;<&eSD=$%fc6{(I+<ANDq!Sf#$?cp5%2X_)GBW-bAt=psAFFWiWui^Cr)%l0
ze>^|kK}bM*djr5HFyMIRFe?0CvhzP$2!eF<S#NyR8wAVuwEWru5D0`Jm;4+cASD87
z{}5VhzLn2-U3%kgN^+UnX#qvokz=EY@iJS-y0|WI6eY-`9o9G9;x@Mkperl5Is*$S
z3N@v~a&^@)e_$Y5LFad7Wqj}`pM%oX9`Ejjyy-t(wi5z-Z4{%ff|E4`;m*JdObt?)
z4RY;nv{!4|11b2Yv5c7Rh!$#jh>KuiJp9TZ+90L#j41Z8fSU*Z+oVnBvIXZJ+io@c
z^wREO?G?xE{(}wss3^7r)s<D-kz7gb%#}T-iGB3~lP}UL1m|7&z)398&pJBypP||z
z*{IWQt#<uh=}6jd)7?uS&Y(nB_UX%m?f9$ryfx=}!t#2;nIC<At@UJ%IqHoH{`>0$
zM#GLfs@WEE&tx)G*uf<IZlmsm@1tXScoFWF(mTsT=~lAK7_;s1p0>w(GoEDKx%vtl
zfRoP*^Aj`0<rxlyJ&;MjYPRW0kxNj@)e|@=x82a%*qezC-`ki8_vP8I19kx;x<cYe
zY=F!!@W{r#e`zjMFI%g*DNQlkgCoG?n?|+A++2iO6L5R=v1+F>z&<%lK3(ZC8p?SI
zFjjtS{K)g4Ke;XPEj430&$ozKmJ15<BHpk@$HR-zy$|+kG<W7mZlfCv<TVdM%PYZn
zr0h$=4m3}HrYYyQ047BHrN4v15(VnYhYLNKtZ=^-kQQ7@xrWCP;(n9-7IQK1IhtUz
zU|QVGBqx;0H&V@hRqr2rYh${Z@hT3>X6ZPL6flA1V9{JhqOW3^L!maOy8{Brx|*+>
zUUsPQ!PDokUT)eGJ9oCa0pUzUahHPJZvuF{>fU)rOL#*ST+?EUt(*R{8$>L$Dxxdk
zLN~^0o1%77{CxA=Z-2D9>+m<%H3<~YgWazg*o1w?wHIt5bfGKTtyD4jab94V>*a?^
z3?k~}&i)4|DX$xYreSO^hr=&v9=xjQg>X&zE3ZkiPho8U_=1%?*OM8Ioh%stwBpED
zr*JRFXo%z6PaudlniV1ZoTm3zamM>^#}@nZSag2ywHktP4y}qeXI-duZ;10&@}M7<
zh%z{OEW{B?)xl@MT3^n#Li^*Q=ZAQjPOc1RJv{vug#k*-%G&<Bmoqc-oK}kz25f2g
z;AzGy3=hQT8mnl5R_D%teXz9Z;^S1d?Ox61&AE=ZVQY3CsI(=ke2jLRvmvSaok=gN
zdF3)x`tlX5BRYX%tIlXLT7jkg<F%ROGyceIe{3;ixInOld4E6^ND@J9QP`q|#FH(Y
zD_o3PO(A@WPJ%tQa91g1NULGXe-3IxZ)ZmpMa<+dhtItKx1|uPGhn!HG_%%jdcTzv
zl<If@TMO2JFcr0gza;EU70<q!<f-`_FFjbF{_xVlxQ}8!)546<WB5#cRWvc(ow1%+
zz@(>;PFGuK3TH!Ilx;YRVbf%NKr?jR=L<s+z{_m^yMCSD{RHck;WkAZpXN>2ee_+a
z3cjIT5b<BPz^uwwts;ZIhhQUb4ksimWC8RyUh)9Q|8?KPYyzOP?kjj4IXdWYZhrYu
z^0+|AZGp;-I||Qne=gBwS-m>?EH^J<J{SPjFCD$Ky7ABCEg8dvb1LPwQ|cq;AeHm~
z?J!bU5HnrYkjtoqH{_6gS!CYC>GP)Qf)Kv#*F+nk*eD2mzdQ*2HJc*X&BeikecTOw
zKIie<3qf(>$<N|rBD7F$t2=zRg(-e6`=##m@U1X~;}qpPUq2=-4;3fcTD|KSY6zr~
zNLR|$xCPH~Jsb?B$*w*&wZYsg5HKIeH>L}*7e>KCmyX!l8F99cn)-V7_U?Ji$i2e*
z@NXJqQ{~eu(7zw@k;6IvC3$Ay-?HtAz)j$^OqKedH!I=l*cD_yQt-eURci}hXXkwZ
zr#8WKq6Rq^Ug6$@FE`)VK*jEDD`5z$GyedMXrGiZoO0$)bqtMq*`WCuc6w&LPKssb
z{0gCi<U1sk%qW;YyE0Tcr^lhaHL$tRLl=KdIad#~op**p^mBXUQspI*%ac?$K?5K}
zt<<hs{2*A}i2<8R-mCwanNb<cCch7*f9X0?0^kFkdQWDAGTKd7-uGa-Tz}*FZ@=o_
zOF<!xqE>$R?yaP|5&GNzZuNFPctR!mc62!ZqsW=tJzEz%H3A;=3C<_Srj61nJ+LLT
z-~e53jrYOl?pJ|PX46cJvw?*H-(NT0B3_2;r+Dx9U`6x7J!ZYng|o4q;}?QwXMoiH
zV1FN8ioxb+w5T_t*VKIf11_fd_w}WV*Kk9geNdyN!u$nWlP>-^*6LGmg*J@UCm0PC
zNh^!CH1EKC1A}qNn&4mwFdPW7?Y8D4!s+0-@{oPUVp?;0H~+WAn9i18bOx(<iKyfI
zzwH7;VJ*owyZ`{XQRj@Jh_P$amr@1*-89%8DjeU`dUuzcFwt+mE49|U<J-$8an95f
zQH`0CS<;1OQw71;q+8I+(RNsV$5jF0rwJ0j*IKPnhEDr)<k`Uc2rY9w2!pDF?povH
zi+ETQ`*92Xd4|e$yB@h#?#9S)dk7D(+%FJ+QOU}+T#z>(b-7sMo_neOM~&F%1~d=C
zV8((-F(StrV|9P$ZCIV7;dE2zsw?jgFiw?O6*_a~@1Y`GL7Tnpha|@ZmiJfdi_^6p
zpZ=ND8`MKop8erRI=^C2NGko2=OzJnM#=T20-{<p72wNH!vvjYXy72;baS}2US|>p
z+d!NzPrt+AS}ns8)(*3FOGJgft{)zXhk|mpmPFeNyKk8n4-21}5rr}7`tJeV&<j-?
zlNzD?20c%Pt6~~y1}8>46(!3iKYJ?P*Z5-wP&8(Et;Wwb26A)*3L#DN!VQdw2dr(S
zOe`gnf^{2n7cAWYJ-?#TP2ha^a=JCz7=G02>hml*zbYcE`gQpwk9;F~4zkrm<@|0H
z+-G{jTLoL!z4txCZ!7K7>&1l^#i@MOrGoqh-qq84^9%|*^0Y0+7&qO{pc3)I*QhM+
z5B6_9FSbhm6jG~N&sH)Bnb|a?R~~bGu3q3Vmf&^_B~2bGzK8E8PZ3yYFI39?iQCvg
z?YyKWpH!?#t|75DmzI6OhJE>~z25TT76jr!T&LZ!2Uj;1GRwo@;rqqIS(=p^;SUT7
z+3MR8z}RmZ;n0bGWOO>T>G-Z$D`h^{pPLGA&0ai^QjRaTRhD=udb>ptIE(SNz-Q#U
z{@|=wFs*76q~_xppKb<p-+k9#Vw-IX(V>~Kl}j_>X{4UJ!DiI6RCyY6n0)~zaUq!$
zMTkd*(2DBM57${e&UJ>g`2P}hP5lPlb>~pXDwIs!>_+@Y#di{k+DRU^hd#Ad2^qdj
z@>F!v@0QiEd;eV7=ZVk7BnkS%CAJ*zW3Hp|he41MHYN!8ciY`qB^*7zA}le9LdR-z
z=|9_97aa`0m|8_U+KH*o4{s%NwgdaX`<qk@UU>bw_NE5r9WI-15ck!^>Fg*TIqq-%
z;%=Oqf%?%yH`9jBkvnvMI^39gtgh}8@x-{tqLTsnZFUR(xv?DlX<t0f?m^IwG1U*4
zsAHdr&_sI3U)>m9_USD`V$GaT_+0GGC-HhiL6%ky#>eq2@YB)tw1%Ay49y`JT*xzL
z@gwB(k%{EAemY-d!4g#Lqa4yzYHuxO;-5sq|H-}`k}M5$(Gt;xfH!qi`?O8>@{C7}
z+%q;^)ZSgfemMOqZsZV$!y;*JsMvZmC#QRRY2YC5GS#YdL?1K{Ukdu-8<L5Kj1ENK
z$@nj`L!TjI7)z3lYkG5*QM^(44;SE^25v?PZ98{L-s^_X=qj1&=NO8i_ov}>hI7pA
zrWe1_RAaDY;RyfjqKUmu7V2m(@sCgM`)MKxP49bpyuUdP=+j}W9;7=#aH(vl{(T9!
zxS<Xb;TC_dVpE&y8qW0nHcL9$wPXK#eGnU7bS-{QT(Z%v)g2@yLztPfiVI(I2VX?c
zVgQsJJvE4t;B<0iZ4OhSA#g4^#W!MEJVM&v?jaaJ@`6VPl38~fb200-S9K=KkjpAi
z$)|lIdRwBx6oEJU#s1{@s2RZy<P(Y5mu1tQGiYU)37R?6v}quhqPrgn7rHWR!_wSL
zwQ}|Zo=HOgCU$e*RGrk*z`ovKbp*F5lDRYd6zxpd(kUmUW`~xA#MXQl&3A9y04!os
z?T{G%n=No=a5|}{w;=WHAGGXI6>L#nX0XUekx$oSzxE(V1f^^|C_)=+i4{_>U#i@D
zlK~0J`9@JoZ&7)u6f$!7xqrUDdKqiH_uTB@3beq>HahU<uSkSGB;CB%pQEekZaTR<
zk#YX=tq_-U7nKe&cP>Uc<_n8u@L>uRinS<&)xxcX&vtvLn66~i)fPsgo7>~Xyd%7F
z%A3QOvo#ivf#p@ZIp#Cd;i;fv`i}Hr99%^=H@HO!Tzd%;R==Pnp8N06Ac>s^tS-N4
zPYB~_qW9`8Zx#`5^Mjjw5#s5AtyvERf}HN3ad9A#(m)|t>|S4Edu=V@QDG*8C-V85
zL0{Haa-!fnN18%>@AtPB!l6{zyTln0x$qc1l{()3m(rf9$q1q^9|q#d7KtWn)*&Gc
zftyH?3oCbW>hQixjR<+9ERzDaaPWj*&rc_NgX*KYFPiWeFB3JZaq9b~LFWj~Q6ly|
zbl%eYpo-B9qwuGo&Q#3SQXF@Y6=jg|2si6N+An(DX?Jg@e_p%;i}@{-%Qh=nwb&{v
z_*QD^;7dxm6xlSM6560MLY02sa!n@$8y^e#2)-2CDOI3W?_D6J_>{B^rQS7r5oyjy
zQg`*iDbm$F$k1rk8sOVTOQs?IQi7G`kkycSo$olJ_!6MNrSyvpO5`kCXt7R&A)-Um
zz*wW*-><%{ZqPiG0oouPW~=K|)wm5q<kEAFk0%O<>mzz{^m|%LHrvIt@`tRl%c##r
z6qi=<rz_`QQ=-g#5<`f_CXj^Ya7m+4k>q#Yjkpi%qtp^OJ9z1+o<xhi%jTtBx?t{d
z2Gz`)mt}-lInN+8Zz@r9cn)(!N4nawQUoW2$#Sx~N8cNq+fgHopfGeLgoKeWkNwf@
z<fL$9LPN_GQ&af!r#XLOAzyrMzDEVu!+bTONN%lr6ow)p6<U{R9|;8<*caM_2yzFh
zBS>$1K~x}});F8-8?8Bt;gY8A2%=G@4vwbJpjqCkw-mU>iYc}i=*Ef3!=%?)C&Tno
zo&3`$+@J{3o_PBB5N5|apM|CTk_gf#AT)WHllaSgiuMyg@qn!5R$;WTUo0Eh+A5{@
zm!o>O(Hdhz%x;}Qiy*hxm_?r{<tSzI%>vp(k0%2p`Ev7dM9##SBB;;T3>nSH*X}#d
zFluJ^8)=K1yzVd%8L7~@NcPt25|BjA^?uikc+r-6oqbf;g1k0TR?J7<$f@Xhr2UxJ
zgDVU2q~Bk9-(wz6h}X%29A0tZij0|LfM-}k<6~q}o+(}|_*w_HAxQZnx8KpNY~H`O
zpOh5pAPNDEn(4yc_P6X9mcVL=G*Hr!pxL0C9#%=Jp`?D}d*G-ZeAX?S-tA)*#!G+r
z(`y)Y9u*&07jgU7xWXFz?mrnHl*<}h{HEJMM)i!$ONXEKj-6k|hc>fc2)f`nWI{6h
zDO(T{e-A4WKBAeEg~#@g!h5PISOf)g)<Ko%8S6zd?YHnzy((x>r$i$b%cUb*yY;iA
zm)m7b%udKxGp@S_O?JKvo}TV{n%Nr3*~4TsSdf&yFGsB;C35CV-GfX=|8rib9FZKB
zSyh~%WtD-w$7niURbT`umu_K7FH(jmAFNF2eYM;Z$awHf1my!rp->oI5x{=kQ7v>$
zIPZz*joNRS`q3nq=rW$S=~$2fe+q4irTiXzABdV3%Y*DqCh|B#`9-&nmd|;|*{}-)
zi#U+2McAv9X9uTL2z^rbAd|Tw7ot?|R7xtfep8_Su?Qbk`r{Q=55f(P&ULc{1WKcx
z!7V1~{PaR-<#EwTWr2-*Xl(0Xd_nOm;wz+=+M{f%KflA?efLrn2aP`x(uJfI!j1n)
z7fdisjt{!`vZG~o&oTU2#b@L%E}Qyetgs1PrkY#}|FU8G-HjXbGIoAzr>+Sp96lEv
zOy1F(&5oMgH6c)(_SzAyQ0Lzvz8SM=!Yy1!c(xuw&;+gl)_Jl|zJt_3bn5u5+^UHo
z^qj6bHC}fGzjIrO_^rc9QaYi+@Qucf$M2-NBt-CD81<$4MD>I_c@+I^HJg`BV&SaD
zEWp^{9er|yl(3kX0If9DN!P1-#Q1yqa9(J_+lh0O6%SQ#T>Wt|=uQ=_vW=gP@yYJQ
z^|<e)gCCgG(QPSZlWz39!~HV2U;`tmO%R@xJLMPH=ab^bGoX{K-(0#gjAJvIQx20-
zsYv?fiArLIeu-)w4VRGrT%_l>r=0hTsPu?j!f?GX<%mpux3@vGCYRp;s-FD87bGcd
zPL{L3?k6Zas>UW~y>5ITHclF>q9N#gZ>mpz=J_M>&j5VQTB5rkB}Kuu+PMF#^E_sw
zpX$hF9jmwtTE?I(pV=$BZ>stB!qss#oc}Vhrk?6vJkJDssg-1_Ic{NX)fEP{6*)D+
zDgFfWC2GY~y;7(6=K`6>Tg;^jcGa%;SJ5peb!7>W>aR{PTl~*TLKh!(vRzZiZ0>Qi
zQ&dxKr3gijmD?WFzmHK9B*KHfiG=Lr(i^%?|5(_QApFEzdlc3Z;Fw^tT)$%eV;&Z|
z34?bufIV+B;w4I)kgw0p1Yud?51!&d5ihZ<rxf-tHi8V7A;g!t0l{UguyP{j!T4D^
zbS+sU7hirVOks*x!Rs4GvF{CBe1jW5*_X)TH^8Mc+`IMy8jVY}H20)9=@@<hHmczs
zvFLwh6j<+TlUvXW!f9@!WygDvM<u<Nu1rfR$-O1(<@a-0C{2=M?kii*t>(rDQZ@jj
zGz;8T#bPX}A_IM<ClP)RW^ti{&J^_F?em(+nwi9Q#9|tu&s?~o(^@LJz&+{rgzKHA
znGJ%;C9clG?_oW6;mXH_X9t7D)+-gdb5U&tYNXPh(K3O{)nh+D)T+qiy7)$zOyYJG
zSuRwt7vnUAF*h>lcT+EO3|+_UvMC)AJ`?|aFhfA?5u4>)do^Y!Tce~y+<~yEE3uG|
zSGezXuho!9&PY*WwOWfDCf0n^F=&xB!Ehc;E#r`;8Qq4N@v~Q7f_Eu(lT^;Qa6vR8
zzs~ZB^bcSL72$K#2owT$`&~n)YZp)n{T5IwFR8oov$R;gpJNmcSJ4jk-rI8|K;J(e
z-x2Ey%iidh-M-(Xh&Eu{Yt{EcHG<*B9@Yo^9=L_yi9){rW!V8g?Es>2o{C;X{KZZX
zII<E{AUE~ynET28yw(S=8K{jo9LK_IN*;Y?J#^)V8kF*BcgR<zInnt@Fjq0TdXI!h
ziM4OCx1$C|aL8w>;dO{JJOjP5+?9Bb9y`cwG}VIZgsvZxi-+8>j>PX*$WSSqUBEP_
z$g9oZ&)@>3JdZt?Bv$Q|H&ry#Tc;90-zyxmSQ#ENlY8bBgy%?x;pTSf>5os@#@243
z{UR}w)a;n-JvXbkZmZoUwqEof^q$<c*d6<zye7=?fddDCpR7T@&2z#bNo4LCRsE_n
z(avSqI8(GsYfz*wdkVBELCtD9xPU3IYqZZPUH?VMhkbb+Gn##%&ai&Yqa#5AhxkWm
zaV5vDa9tJ@z2v1wnAo}ivd+*B;h25)H}`k@V5Zg1mM0Su?Ae9D2!IpdbjUp~HZS}*
z$87}d5|;5VHk{hcp^S+X&m?yOgFi~|cntr<bZ?q8V>TJ(m6XSHuw_3Wxf*nt(C;D=
zGJlX|q5eIhXIzL-?4BpL;vWLjKK5DFhl4S#Urz=IJKgc;FR1F5s|~Wd;eD{WQTG~U
z4in!E<B5%_@asd_jQT0=y7<ELrF0ic@#Red#PnO<6{1G!mLE{f7RB@gsM==~ZIc#X
z>qk(cIx+ye`o1<l*CnO5ULK6SSZ`IURvD0UZNI#wqzc2Mz5S<Ap(KgSasj1kY6wMP
zeF2`sW*EL*@p-c4war!K>%V?>!wJwJN(D(~jksCba!mV7oBQrBCrQSwhD?f!C@4if
ziL~@*z4lC`*IVrUJ`IAK5A^2Av%cXJ#J$I4@T0G4ykAvT(>Nq-s%55bCePyb`sL0;
z15Wmsf=E?^8S-)-5~e+11gBm9MY8HIRh0=jl-A@K&0uv|8NcTf(1T$H?7Mu0?Qzcn
zU!<JC0=t+IhezSY&4oV0sff7T*|fT=+CXIb815OI8qnhwnW6A<M`S674zu-Latvd7
zPx-+zXQNB4qUA#OmL+QTK;L*26sTSM{^JWpR!jFfTEB_}P@j!#pjv;bkqYd6s<CH>
z<86zcIo6n%(l_3`amQ$wJBM-b<d)M!Xvf*#s1<^1__d-|9{Y=azKhDe+OpX%YIi7}
zNf&$|x$|>_UZ2LD;b)U4OEvio3%ZM?8Fx|L34U%$9|rz<LVHtq6MvkPBx_({an`}~
zy_v0bC=UEIPh@|LXV51~p-#S%$<^y(6T(2}oBXQz$n9E}qQ7?uDh~krv2Ev8Up3-A
zj$in24Nnh%wW$ZIB6-f*_UO_iA-|qjX&0X7G((Zb3RSRmSBqG}cie1?ouxfkFiQLk
zJg?>);}M<&dx1>~qdHY12B}!KOek){VKg?0?W0FV;0?YjgX5UZ${W_*1*Q{ejJM<0
z_vxEjZEKBHIasrPeZCo344XlV#NoTxnlb_%i8B_qevDMnKBV+|A-1MtN4HAp?oXm+
zdbyB<C|My>O({?O(O0*<`}{_VP?oM@q3=oXub5lTcQSAc%uzhb|B5ISed(3hiHxm?
zSV=XrPI(qD+LU8aPN}AUw7<m=`<U5(RBTPjUBzn>l`NkKJrz?fmRs^7`TiSTS+vp7
zn}LxUQGfBaGbnQ;ggf|-_>U1Vo<aoWCC~NF!^%9F0YV0etfN2@f|~b8R|u)_7~yrl
zdQVlV<z4{|djpa)Ccy)`L_-~Gn7h=fMV*}h>dt)u4KOlu=nz(`kXvIFV%W0pl5kg?
z(B66ERDx~EY&kDWrBtifQWv!}kRJ}M)@tjJnCHZ%Y%rX!nPFYL+!Vz<K%v*D2>r5x
zPv(BbB%<?Bl(q$9hi<~)mu*XH<5iJ0b;``7Sbl13=I$>~J`|845P*todzJgm2~@G<
zWtqH;n=;=mgLnuxJOxCV=;(yVaupopVSA*B4f#Aav$cg0Krc5^rzy*)CH<5a^r}8$
zlpWa!JT@FGX!H0kCS;55a2R0$`Y?}Mcc{y4DT2hbTU3jjUHR#k^$!Ql^NfbVe%Xra
zE0)-NPyE=hbQq^YQ+-j-2&zrb(8{Bo5n~XrIw9d7E_XhWTGp)hXlwu8x_Q<`g#-fD
zZR>aJr)`hlFZI>iH^IuhDW*Kz4jp3}{ysceKYSSo{X|ge(UdToiMyOCKfW5ANZ^ns
zV{d2uTgj~wy{Y(Mj-iX<Vd+0`3afRSLVU+@7c_8r%yE03bQS7a?>?Xq1l$ue;@Dc~
z@ilLGo}=@r?d=9V3PHNeCpCA`3_-w=^1^Hi2ib^o06x7fQ)u>jWSL~R?XZOZW|6S{
zxYevq@3$|Ztj226(Q4*Wovo0+;IU@lPwvdN#mz35J^p0=QNgzQ>4OCRSVDoILcCWc
z`+lB&Zgl;@-!yw&)4sBlyWxoG-2d*L?OLVmo{(0@jP4jQWx-+=9UN0Y$v%V&bnP$i
z6r-K?leg4H_q>e~Bgf0qY-ri5zs6adAeb(L>eYKpb!K*ELT{6$V+)IB%z^rS2VFCQ
z^nQ@WE{WbVmFQu6%s^BQk3ddPJ>UwjznTaqyi8RXt2afCs<j8CoD7I}9u|Y*uLW^1
zAzfPl(UmFODFrjyVn}Ew)Pm@xT{*QMV6MTA7O&lIyNF7JbVI-9`_}*0F*x1(6~VkB
zm*06%1!?HZko$SCk#4mepNLk`RFl?i{XI*B_NZAzt8LdwmOt2KNq3!-x`a|qO8BV>
zir=L@KjrIrQ}bpm>@!u>)mYktVa)n9=-0_4!d^{^V&+Ynn8K#45m*uSdZ^uki-7Lu
zFF9a^$6Xu;YHircw`F??47TOnET^OWP^51%|Kuqi!iTMVO;Y3xO)hgWsR`<xpElK`
zg^|jKZk%eL)Rle$N`lK+1912RuHs-N5&C_SeSdmh=f&QH8gCq&@3M!_p%SFoc8W)%
zw^nHf#L^p;RyyU!+lx3$#o2>T#p=_9H1MtYBR3)n&hxm(c9Rfxe7O6#5_{$_r^mx@
z9kb7NDAUgP*Ikq7u2<b@d1mhcb(bDgu{0t@{mfRa%CUfXN2KbuteTJXLU%fq-J^h*
zr%#2=#-6<jb7Ucv?Ql$Fr0K?{lwHLW?U}=0rDW$2k(yNK*PK}xPqCqOcV)`UF591{
z8riw!9#~|(g3s#3|9oBy!x$0dZCYpyqRG}Mr$EDc2;ur&$w3<4KQIa@qyM|Let&t%
z8_GGpfi*7G74XhZyMv?rZ7jD5sFbZ13*F88-l&OelSs7=ovcV~a_GfB3WZzsF87;E
znM9w=17D3FSl6eixqIcmN|oAeVLt1|og$go`88;@B2E7I8joEC&Epo(KptmLK*a$x
z&rW4BXkboIeA42kmbg73dLp747GgSu8hdfw=&Tq1rkKMRHQhzRM3WyzR0`2RmnNL<
zht#VL8V!Jss8y1%pIm|R&H51nWFj;I{nnpB!S(@F9@33Y&SURxd#FS%=892&VGI=A
zTI}nn3xrFxmpSt&2(4z$fmoZ&vDr6XB?$^hO@Ty-C@yxRu(0x$Vr8}o?6--YmaU?=
z0m6j0tN2=ETDdmFRTv9*lGNE3cij~&ulHdna46XMn$=DewkrR^7nvnH8?9Uwtb?bf
z{dkEPaYhG(@wEe;Gmu=H6tyF<H};`Fr(0-H_^uQ)P$X%Bn*IG8k7d!BzD#}RO(%-m
zd2pBaVw!O@`kcGmPptNVmj}8&0j<T2{ijFk|8K4S`|fv$HYP;t@849>n>C`D9r@WL
ziUg1eYWM@9LT0dHk!R`}_Q<*Hljg_bO<pBwDF`U6AQut6m4#m*<`;Xh)9&MvPnhZU
z{V#7+92Z4NR;5e)-2^>m)-iW=BF#G5Mwb1lKH&Rs>cid)D&Vbj|K@rT{o_-PIT9J@
zL+Dex(+!T}d*5dWsIds5F)(p5gzQ%fNt$$Lm<clL2-%64p|+V;Dhv^h@oVwl@KY8h
z8|#HOU}($B_Wt0O7jy0LnpMP3+G4J(xe^JJYS8p7E)bg>)*84J4dZa#dnUmVn%g7J
z&H#H{S)&>nz>F0xAbJ_(_6l>`cIa&p`J|d=59^47Ca4sj87Z4GF^6?mVvt1|q*`;`
zp&fnZ!QF8iI{>bP)2bHPTzWp(W5cd#iylYvRm>{N{TAu>*kQ3j?3zG>sfNJrqZMj_
zTwV3j!B5aV=ZJew7Z~yO!j(It2l$IVn`Y(Ov!xOf<Yt4SdW^bPT+k5A;)jH0!~f$P
z{U0mlf(2;22nKHO6qyT1K6b&N2=jDuVw7kj>JWxx%Tv;nPkw~bBC7#)a&@*#NNW@!
zVuvgr^56OYG4>|lRBzGycu7U6IEcz{aE^J5WQZj5EL6raWQYt0T|<P(6d5v4Wr}2$
zoRmt&%oHl6%<~)~^<P_cuY14W-}m`HPtSes<#vYs*?aA^-u1rkQcn_nG!QT(Hw4Af
z*=6Xyj*35XUmQt5rt>4mf_a9dMVtoYnpkS}J+x-=*<?G2020JDMHp6eLE}*vx(@OD
zSEqWZda~jiN5LRQcw>F#whhl{0#u9j#wVRe)YweMxdnW0cXQI^81WGE8uaR0uf%<p
zGmC|eQ&)%zp%QEJqcKR)`^Pt{uE!1+5J93MYjUdad~vnz1KmcC7F*_53ul(%YEEge
z1iDR9tdp<dd@2EP4ehg{IW>AUdExK{D`OI=kL#O!7Yah<gjnIC<bja=WhwV?MN!ys
z*b2%O7KP43!}P9gQY+{Cy7b^tJ^xxY&MVNv_k6yO%6b#{g?_JkZ3wMSYI<uyYi=nn
zy%KuLW>q5x<+D#<>146#a1i8N2IEElzW@In+cuvGb|+8`&iK^fiJCG42WzWv1XA+S
zJyLQ#uUrbFC<jnO(uHEr;nB}G4wSQc54nBem-luL*0B=`e{ai#_!2~bmFnNg$Q~A_
z@B#{m{Dre+kxb2=<kv(}2Vzls^X_oXQ~tnxUq3)QsA95D$jZCpbYcc6Y3rE}c-7b%
z`FwYIa^-ebD&3L-5%0ZUF)lgz(>`}-?ox<$r*eg~6uNvG;vVKFccKXxx*FcGMq|oh
z8uq@yxR!^#X~v<i!YSLlK3wSII&l?OjXifn-p;)?r_oon<C-@sGr1+qN!ZcEqIB%q
zs^g>=%e$x}5p`6;DVHiwejPmcc+B>RW}n|7`E4p9cN$F9(m9FL;;KUD%4M2<JelDk
z_E*e?TJ&6cd;vP8w8mrCp5LKYo^>!)F_RRJ6NfJ6kCNn2oRL~~jm)TuALn1=6e{r}
zF63RGl%|wU-PM9E&h;k6>f`@$5VV5e3DFD`&^08|hOtrk4Orm}l(Uz%Jp)ooHe~eV
z0W5^-NNW~amdKg|Ql86YOb<P4=d7zj9&$s$-~8q)b^~0$gTB27BBo_9_6I<6Z1<zU
zk$}~ah~ekruP-otbyI&J5TNtg5DDklJ}2A*BDCZ4l~M%preXf7-rBOr++dj9ya$fo
zWg7A!J&az(M@<owu}NhUyEmY@UFX^XJhP@tb*eawC10y(h=B86L+GUIXz}YE)wcCs
zjO{gp_SxfBz4IL0U-3?%arA28CkZ#n2_@Rh5)Q}_^`cR7`WM{7TvFPHMwQVKuvim_
z71T5uD$iQiSV?!z_n3`vj{H!pVtYn6Z6FsXvA``*Uj!J#C(P>#ReLo|xo(aY3tzp)
z0AFpcve^A~rB+Q-)ma_Pr(VBP^ZJ0?M(#+6)2c7LFirflV+c2L&TQ0$y-9E7VbPTJ
zZr=gS-1^FmI5rcf`T>uB5P^TH%U@Io^q^E?;ku6+ZX45QErKr1q-iPXNe|9~RfhYX
zVN9<lPn9LB;tob!JUxHTROLX&Cyz-!wtCFQ^i+@6vpx@?n%SYsz3CK(+6JI6#|T2&
zHuji0<a%PtuimR*4*>f~;I!nW^fvI-sD5VaIVh!cu2EdB8vE$4IHsw`(3@#U;ov-|
zO2Qj_X)0d0b{1tB&aEZ|vd9S3Xef#lfNr;7sj{=diG6;#P{{&!jt~;r6r3LWKrip$
zsG50bhTq!jkxA5=bL}oM#`u1O&T-YN2@{5+7D4!s`S%>;=Wq|Z^vOJ1@$^waO?+k1
z7Zs{`9(zVFYkOV=r4Kvah#;U+a2Au)%P%YO^RPgqbJR>Fc%avETDBU8q?ODnt{;2$
zT*++^pzn2GITsDZdxGnHJ%xx1_t50odjiE7nuNLttx>q%i<5o7%kA*q&`gndTV~T$
z=iv@T(+E)iN`H8MU!CtZVaEgoe*lc&(lY@G^WN`uQ9*47g(fyepnbQm)b70uRsiu>
znyy>|c;H`|*}ra-?V5<YVr{XqgN*#(u6>7V%4P1>kd3`08nFW_mo8&>^EkKsT*FKA
z59L?7nTd5uy6N<M<iTgS#B;{JtOFDBQp@p_N5I0Hyyx+*<Wj!<Gg4^+6OgJ@nu~xA
zo1_n9PxyIgt`V^Wf32gxu^VNBV;sRdFu{9pE*_AB^U~1}nHn3Fs!9K9)RW)OXM+Dh
z2#5>~=|7LTu0*;F&HY4BZE|T8A-ib(aBhNql=VwZxhUR1m&bHHi0AV0<Jxl1-BUYM
z%evrMYMD=biV%JSbfB;r6R1ItB22}lioEOIS`x6BAe9QCjbkr1xUZ4IyPNfS=b_kb
z@;wM_H@z8bV;-6yQZ6U_<RtBvgW14|Iigyz0=_%?=^`z9{6@=OV^gR3<p(CuVJI4%
znwyR(_1N`SeQdZg1suup?un~f3n9DqmhSrNh)}zykD(UcsjvOsX+EdB%SPHB&?@l3
z<DwRC`>Qur?N+^u>xiBi4x#1AK5AUxD`P@A7c$>f?KsyMI(>=ie>}4Hd$Z*KZ~?ZQ
zKVNzhJo{Y74iE82E$CF)Q_TSz=Fojue&R9hw0BB&ELK<|7qs`J)e}X8LViTesgQ6J
zmz0WzhSi3q>aS9C90L#e1k`Zo<bR?<XDvRwH2vYY@kjUNj2!V`UK)Mc?yuY#)GEc!
zDM!d{hF&0Uk%3>loVFR^aH6nP_<bx#Ef?-7uiA;!s=^7io?M;!302wC#KX@(E9I*p
zAGG~wvF#GMZ!^k3-Rod_=>HK%8fX!d?-ED+Zc>83Puz=9==neDQ%*laa7386JG4={
zq%YO(<I>>e#>a7!N1|e-X+vqhPKfClk!S|HrN<lnCPFHNvs@Rdo@zf$HafSY7^wv!
zu83jo!O_Fiw9>&NTphQQBcKb(29#J#>E*ZFO;?m6jifx58rI&|L-sDNm{o~$(=nn0
zdZFBHh-zcghlKwiGXHDpoQa_82SXM6N!(RlKyccwR6->|^ih;$sQsCl2mbin!ftaH
zc{+J7s7Bgiimk)6uMRc#Y(&~d7-iz#s3<mm9R;N25$iH1Kt3KQ_i$*77y3VQ4@7f3
zgRgg`Y|tU0=FkvflyC{sPf0de1|FqS!THu>rvvvN1B_qI=Es%yWkS%V7KMIf$-r$+
zvO6N%<wy20OZQ=t+==`<TMFT-%f9lVT%p>BT0@M*ifzQN`{g7+ip4YLwR6PKUMY>;
z|L14<iFkm|r7p=+avt1XOU32apbrVYFqD~`XP@J}iU)E5;L{ei>BKF9oZL-AkzYO}
z<#I=Q0Di2aANam}<GI4pq$kMjK0SR$=+}1XfkyKL&vHis^gN3r&#Q+^0q!jz2V8~|
zh#Rdb8g*$)1LA-Ft1KmC(j*vltd<fff@v@J6fl+!#qgX<1O#AY-!AbZDArzs%Eb~D
zB0*M=eBFK;#N{9w6p$%*5fD8~NDeIr>%y{K#yh*EfGVk1+NB|x^@f=afFvSfh0V-T
z2?Hza-=TEt&dv;#=_Zw)*-c5Ha)V5v^U?k_==^ueU4)t@+lWXIdImLJZAfnVpO<K0
zM0?*#ARc&<|L;SB?Sk!Lktn!v8tX@j9(H_18X$6sz;BRu#H$_pRfYeD$o&5Z|2|3E
zNSIqmfi$c5861IyVF=5mjvm8bT;0=3e=xI%)9{u<?UsVHTfg6bmtE#2x@`n6J_(ee
zA$J*Tf2o!%CS*AQvpxh77&buo%~JXQe=U(+6QhTa#^4L+ND_cBwbwQPfER_VXj|Ez
z6>6YDi)O(V551><FPg=j7l>@V2((c{a|%GTzJt5#>oGT%{eK4_LJS;kqmPG1cKTv)
zg64G;8Rz+}HbBuN2ot?BxPYHQ-g@iaZ@MRC5q!Tdy#M2CS79P@6kQSPnpgX$K;(hn
zoMc$?J(2M8vVE@I6u#R?;KhW%!}In#pn|G30fjHIHAV8(f#0BsF9Y^7ey+no2oXrd
z1tqip{aw(TDy91POWAAR{ut^#;u_-VxS*UWaPH5ip}az(PrX|4Q@id}pfxjtW!g{D
zwUt>1@B;~t510t#c+Om~kVeMyd&3SL1o(#dh`D*!e=(agY@bKF%~!HVmCNJnzkMt7
zez`MJ^KvusAEw@)#yVw#n6E?Cw>t1E*e*1P`ePK#(kBMZh5tSiFJae^3a10>|KDF`
zh}{=zvVPz=@_iAw^WKc<(Oc%ffOp#R?9$mkx1L2fEM0SPmKB*?>Wd=NSIywYs9zQ?
z?mQ9?Fq)#bG4Jm<9B}~c%~$*-yZ*B`1LU#Vq;X10Zw9{)Dk@${b}|l-k9@mH_JB{Q
zl>>i`W}>y!p3EO72!W~e;!Bb)Ksm$|(;|%9DgHbb3Wy}p+jCsV|K2mdzl;qBCC$sM
z?bG!9R-|#Btjo4!o}E?lRg9Ns?N)G392h>&sYB$yUgfj=Hs1rIBwLr73c=ETNp*Mr
z?5*Dil!nr`$X+TqqPLn^PAWl4tURKgYap^nF-=NLb=|j}qtx@%?5`#Am4h$fD7|%Q
zDNn&sV6CwE69fLNx=bo{Q2=ZCWT+=m&T;tDcE`Q$cG`Lnb;G(|){8q^9WnAEIA|tN
zZ9b<V0bdODD%m(NKo;V7dA2a1ePni_1CBL%^53h_x1F?F&M{Zmuy3$Oh|1_*<o##&
zdaNi8*Bo3>A|2v*U;UP2`opQf-_P~}d`dBf-`|3S6&<@pV3;NWWSRs>0$uxSeQweP
zbnSlspmUb=-wTgbA@yUez<r;i4(OefdlP*`vcPM>dD5x1{p#fW`*Z%%#NxQ-UrW7^
zgyOG>eu~)Y(8?Dx*Ufi~FeJsz*Z>K<BNLIQ5nXYp8<(&TLLlO*w&z%r{re3;j>Aeh
z)J?7M#=No`q6nvYX?l_ih77a*c_Lcl{nGE#u7vjMkXy&$Ur(7*p9HA61*qv^HuC)H
zv;pA!!woQ_6ZCVih+IP-t)_o3$DeImqA25c+omg9pUSY7%lLNAeKFEfkWtMB4-ow<
z{qr;_SxkP@VsMIZ{Scm7@B8&hgLr+R*_DX!faVL#tq9+bG^VBwlwpB{OQ)f-?fri?
znMJA(<2xFf(im%Y%-1rzcb&Z*8VPB~*ev{H=+7~SP5yP(u7|l*mi=m=^yc5Z8v)ll
zSLR2{{LPQ7RYWT4@X79VEZ>pGKcGx`mkjekb~XhBoW{O}Jt^nD5-<7hBk$XXgTq2=
z*<Idsz+TGKTE0*{qL;&3zG97(YQXi^0{EPSkd(at*MG=trv%>!U-5w;wt=$}rq!ff
zaKn*c=YFd86`KbpNO_hcF+dB<inG}dvljjICHJ<Rgl)CppsD|B8~=%m+kB%~v8I|%
ztQC}B@4RV^>8l{~QVEx0+w3#Qe1Tx5Q{mb6`q%gLAw%rGk%y(NA%^qNSX2YL!QPuS
zzv(iU@>jsUIZz*3I=R4=eEBdfV1#aa;eiWZ4ISZrH}U4X^Y-tz{k@4~6v$|f`!$6d
zSLbrDeW1?Opjwx^Tp7>1xju48!b9f3q_g>4eV3?LfqHSvEDupp`JarjZmIzKumAkm
zoOZGwL}qos)TvuDfb1k+R67?nB#f?ae0Az#6f*NCf+Q@x<d#`mDTo^^VO)az#?s8k
z4-|)vX7+A$ZYNdogZGNM+Zz9$U6-zoHKsD+Gb|UutZGp@Vj|iVFNM}eWu3luuB68L
zz~<KoP{7)fF#CRlSzl(W-0U}yP9>s{6zG4JDly!bHtrDuUo5trbQm(ous$N*19-#B
z#6F-ox2(<&s`nHahqCho(!E8ptb=!ZvegpI5cOX&R8^2Ve=@-;HQ4rEe-U^11_5_)
zHa?S;2y85qr5Gqg>4TqelF6Yvlyl~=^M8Vl)=|=+&)kPVJ#pE|$cu7~W}(sa*O7_{
zLP^bL!g_!N<S|E)rfRqkQ71N=pKG&^Y<m&`eoI0<u(9Vo`G!_#<dr5>&=Y{XZnVH>
zvcY9?8b1+v>P6O|tk4YV9tSvLu~Zs>cpgV~NAK4Pv@>oF@5&I4njAh@-n;k=Y+72t
zRpicOm*$7>eGIraozK)y{fD3X_wMkWqT(ztyuG1nD(85~p>4qxS0nxHN$JDAo6(~L
zHN(n-g_&TPbx6zY-kDYX{NcF>^`*h}SbpQnME9rQfo79t1pr1fa0#83=2@UNH*BI(
z#0N4GCo@YCsT>q&)h<_UzJ3MAnBk|4^GHkpgDxD}9<5BgZ36j~$g~mR(JM%Z4OX6(
zI?^=1{b*)#3J&l;XY%ixJU~%KHUwf*)>QAUocnXFYNF@t_IinAGmg?;y=LT99hJ`b
z>+%qWL;US&<->g>ltFA*sy5iQ&9@E-KV-(j?C}UamA!QQBEUklK3jk5^+hnLjT-)(
z@mkKV6a2?46~Y-3t}+N)wTb{xt?v1Je9EykS-eWyY{%~XWk-wmW-Y_$fK4^oFzoTS
zgz<DN<uN$)>E-6lbXzZ1NEt`SXgK^-tH}*dL=q2m3X__V)6~CJlr4i|VC1`iNy+H1
zW8mw8YEt|3rQO@fhG@2vdmzYDwV7=`=mU}ML$nuAXPUu2DPr1`O(<38a|SsN`xWR$
zD1GRoJ?#oiBD%bjwc;@X7d`=Fvn?4698v5zisVLT)819MEeth|fQ#KC!^D?BW>Ekk
zo9jdfW)uRxZGA*%^$dluhdBixv5E(6`pdP9RjHJhKr`rCtGxX7L2P^;w57z053+AZ
zaq+)*Qh+*^k;*6tw`ZMV`1OPiExL?&BpXg_rUO_*c#0OGB7?Bs^w|&=Ck!D}*sO+Z
z5xSz6uJj4+0OOX1H~EIg)w=v_onLd^uaV1Ww8@25kImp5Fj*2n<;P$5C>QX5O8^Io
z)8C2O^->;>Ww;KbZrz|&ldmNC^aN1jN76(27c3CxI8h@-A}h?f;4fTpc#nzd)8VhK
zD<075xQL|FMi<x@c_`so?Xk$vXaD!*uzmM-)(Wxx-_*9<)0Cd&xBaDubwPR8{*Wc$
zu?$i83Ypi%LxefWwirn!OA-ltHW8UxP67Yk5<+kJkFlQ!<Ld{j|9*SaqOgfnQn)K2
zG2MReMuM1=S;skC`<aJlB~-Q5V8U5ghd0na?_0NlP*<`P$mz8d_?LkU#tZj`I-TF@
zopa~Ew6-)p0*=ZpftTv?Pp$NUhN}e_WS6SGFQ^K8F56|S1P$nd4xr_!`(jAf&K_Cp
z6<G4jeR#${CDZTRtHGUQ?U2^M?hWN~8mr91BGy_Zah*&gc!h1nn!)$1E^c0T4W=ae
zw5ts}Uw*r-DdmRv{$!caKIgt`X#GTd_?RBUEEQUvayxz1H_){CtHy}EIMAO2v!Ip0
z(w?PCxA=Xrjn35@1p0?-FJH>Qt%<|bWwV<SFgU6OC_^Pe*Xru&0lyso@VIbP3@lrU
zYc#%T5p3W~#SM36id8{NEz?W?n+t#E{=ogE8h~G99JuoBh3KT=OP~K-7e{69iow3~
zI9h^tF_0xaL$yxXuP2*STeSKAMQFn#J{l6&3Wd5Irnme`(mN1X><HB5jqWoucx-<~
zd;x*cGl&1W)do0#oe5lDFRE-JBz(bwvi%m5)RUYWqF3I>gW6B>>;CA91$gGifr}7C
z96(q-wFkcAlEyK;%r`H5cSwz9DoK)mNH;N|vKv1iUE?Kj1X}Y!;tLIIe1U>qVSY0F
zXbbJFe8I3aM@DO#=F`*WX{r!;_vOK3{x+JxI}Ve1Aj>=xnlml37_b@iP%<HQ1OkZ(
zmE&ILsoH5E9gYAS^Rf4fe2^x5PW-4L0~1NY-Jy@Pqp<1gE&@lCwdLzOTnfcV=~-g-
zibgW~B(kKmfeVmRVccXA_Nwj8{fq{0@<&QQd*O0m@LfNIMl}fHW+t8!x%ZDAT#1d5
z7vDAB|NQy=^SL1Rw1oKgE>%OJ&hvNMS8uASR&JJBk=d)P<9ZJOoxNU`4|d46dI8<g
z5=9&6tcU{9^u{|7%DD>g5HzkozWnwz@IjlWAP^-a?%IgOLOj8frTtyi{;oVS6J(s&
zDjc2F7jL#3w^A$VO=Wn>r~Ukm9Q$&jvai<IycJL(f$dUb(qPYNW-{B65!i8{0f3eL
z>aBb<6Q7<{9+JN1HZ~dBH3ylY5KMfIrWL@=ht()!7{Ec*q;|ZU*W_t0hS#pQh^2@*
z*ccIhKpgq%Mfv49C<>1RTOi>%lz3Jx#{3)@dL>R4Z@?{;dwYO`i=pPpn8Yi+R!~nC
z>maxix?v$K5|2Bic6>nxpZsW9x7r@Tz5y;zZJ5y%M+6U#qfhE|CHns>d`r>)xEu3<
z0p{<)@T&qL!T_NgoD@u<0WqL^v$XX->wwFrrWl^MGtAy}AQ8DR0ve988JqkJkob3R
zJpAhK#IQJdZgixET2n;g{we&~5F1IT?QbM5c>FKA!p-2#Pf{E9`SQCGyzEX5TGCrW
zjUiq^vje@GDS?bG)C9p3twIhUp^tm<{o7&-ED5jE&Gbi*3ntR=1@q$`I>pLSVoqB;
zpb=O?;Wm;@F%k9no$1L*pu<by+^Niwfb=0;PA@>H%;$95|9ImYm<w!yg0M;)|FmZU
zDSb%K8YIrtSM$Ab1M}1+$XX!Q@s&Pk+m$Sza6R?{*=7rvB_4Ku)~6LEC27imJ=Fns
z^Vu?H3|QV300v`Psq`R4&fbLNn0z!Nh!#`5HX!~ejGmzY`R?t^I@L$T%ZB_K&pcQ5
zPsvh1=5*mrt7;!zZ{yFya6fXK{SHyMRVf5|K@@Lxlf$6+2YCLygS?h)vxo|UgrDxv
zM~O--DR2i>BCiuw3^Rn*p?c|(J%1{t&GUSBjx6)=O9Z<Z7CBCGJr#Zx@`_67>{x~g
z)w;jAD$lRTx)07V!JIiBFbDv#yf?l=FYc8-$aj*oo3)>)f@RAo5Qy>IDGisZB4tU`
zd~E+w4Y~su4b@f_!VoAm-Br5c1jm4szes~U0fQt*0D#*6u~3c=bAh;1cLFrGOhsa1
zwvc{VLb9;`<Mjvoo=NLQ@@SV7NJvN`!5VFn?tKkLB>5Mk%iJuNKyKuu6BBydpWW1-
zi4cH9U}0vtu=n=z*k3`{8L2EFw#CVI$)mE+%u-K|;Uu}z{ZJo4wDML9NFa+kRE6}M
zRDVt1>+!BE{02g&X5dWQ1NVg5;|O?OC{HpKH8IMwXCuB@dL^i)EkI~+vABmx%=Sxl
zfP$@>;<)rAAnKs%mOb;4YnPT4htd!})gan)pXu)C3dLKGp6f25ZTW+#gyi2V??{$8
z(S0N5zGEvHkGg=bMXfZMS7%e&r(em(AHpIaEmZ79kjhl7rlDE|iO%!$&}k)tHN`bX
z&uWV9G8-K>j%c}X2Ew}gVA%0$m$;&^wFu7$71kRduQ#yAHg<o0d4q4ee<hRq5LKS0
z71zy%HYo*GntbB{ieP<E`&~kE&O4;1fZ!viPPnLCjx%`zsdy-VG^OgmkxC>XXcyI&
zeF)8iBxs3z+mDX3p|&b!$G5jIz1MD<D0hPUQQYY}r9-VaMSP(Il$_1bpE5Iv$_qD3
zjp8B?qe;Qf-;%?7M>^Q{6g*0{``(v_3Cnf<eCxVSSr1%Qw&G`xS+`>~j53d*#!0qn
za3+#cSi~Oexe6eQf|X%<!N482@e^w?J>PW>X{OgA(^wlSLvC{?^$~=eGZ=dOr@D#Q
zPlBNzTf`b>QkQFz@3SjD)oW`FNva!lyH`%<DChEZ4zqyv!02D-2vH52N5Z^vI>=7G
zaA;wd;Bhj!7#{|?CBYvMw?sg_cnmbcB@A*nk$0f8zct{slxU>&I!?d=YIxg6kr@gF
zap~DegD0$B>&%tS!-($)@ux+zaU`CU)+m8OuN;wNjjpe)f%IieJiO)r#vAUrmXP)L
z6;2tn-=Z7bhae$)l->qR<;%qS%rlr2V5f>`)UY0p4ds8rncV~(IQ;^i)~Jdxm_a*N
z#(I{t11fA;@j9{f6W}G;453_i37<a;%_R*Fcf!R;`euE?*Wy|7U?o4ZPF&6So80Kp
zUP4Etr)qpuyyf!LyMue>^-<4GC=Q8h6eh36%^0ST|JCSa18q{%o8|AD<Aq)k!b_SQ
z`dyzZCqDxFlXfr3BkYoe+8GM^vVvCQM!^1>=L#zfb-C(tRmb>-=jS8#6)%;11`m3S
zPffw5O<jEBS=e`Lh#_mmu-47yC*;EV7yc4_c0hSdUOo;A)3F&w;Voh%xJv<b0<a;d
z|MhI?%L!<S7f9oq(mH<O3#$;xXwq1uWbS?@139;3HYoVGq?%)2SV22C8&*iXX&O?V
z$H^y?T_Ktn%!`k6$`3+7!WZRToqn(XJU3(a8lvE_Mxji!ZL;Poz@kFPy`y;&;Z3xK
z!~DT9ID>@;ElI}>3wQ`Cy$Efl4tAtDiI4QZ(D-N=NGT#<81q=}s7d<n{8xNR9w1m#
z2c@{#lbEC^9XnPhT|hnfSfVccZJcr~W#49>;Zb&H;MyXA#h}<M0NT4D_!9Ec*^uZ?
zlS<p)m5%CVV?uCL`zSCU!kz0QWoD5?dlEw0r%*}7%edV<kKMRBEO1Ot*Rfxr-&NaZ
z_+GPxuY*`M@gVP^F!o$3>;jP)wTUU{9M0Y&Ds3A%s1?R2DO%?ES}MY@Vdyv&xg)Yg
zSp*06t}&))o#>utHWvz=@2<CuUpz(_Qerz5U5U%=TweeqtAi_vq)tyR&}5B<k5__e
zz(D#3*&CUr8YnHK{_q0#pS18iX$bYpw;+)6FKENUaqQH@9x6UiAU!^+dHYv(dl)h&
zNXxj%Xn41a>$wrVaXaJ=Jb{Ce(u~)?k~ang-Qn6>T=n$C%b6KqI&lAm{ddyc1Y>Q|
zh_sj-riBlz%pdd{`8u^-BA2xbq=Hq7l`*1?xr+Qe4}QYr@)5X8=AJ%;;`b~?beE|d
z$$oORK2~N!w}B}oCqvZU8#i>)MV-vA$H;Hu9GxVlS6i;n^bwsr$9k3_5S21qO^$i+
z`IYJk*7=au@NT+t#A@NW+RXn^fMDq~o3-(z&#7-<-?QuuF(?sE%<7-?y-n(d(I%?e
z3t|TS^8w?io38wDOi4q++e|39-QvRZ5H{9C+n`a5g!_qFv1{vL>1!3^pPqU4X4RH;
zZM5f|EA;OH1FhojAC(_-T(Y5%HEn8XvUPA30Mf!EPKBnXU}X}HV2thC2QiNO3C7Vo
zqUFw%-&ILxe!eb_r#HIe3Kw)qc=GoDK$JC<O2+^;o2t$a+_BR)iX%eJpKjhA|NKHT
zg8gOgA!Kum!{_U1tTKs**tJ^Pb2xXsYl{|44RoGGILU~nAS2OaSLp8<hKr8p$dNGD
zUTJKsJ9K5}ShmqwnzuvYo)U1Ghm91TXRp18MwL3naqzs++D%~4HaZI!Lr9HuYxuBe
zIwv-4lE3hLZi28?BHSf)B40zO*CGAlJryV?hM`oxgUKFhPtVB{9IjPn6(CaL4FMjA
zCxVxpIe!<E8<Bo|l3)&Zk7l1B!6>pT+SsIe|1);e+eXGX(ny`q?(~hg4=n(d<u|Zz
z%oFx^KATF-TYIxxZP_9D-JmT)aDH`AmrGa;4l?X;>U^<)b4aHNb<Q$H!Q3A%KoEyn
zu!+a=)cCi=zE`*gr?W?AIZQ^IZ_CGTFpJv7qv5+=NfXD!nZ;Aw`od@j*}tTAPxD(@
zB-J?2czXZ#@iEOdc%xt?FBF1da3}u2y*(7BddElFhfv=%&NQ-P0Q2$Su7Ah5Jw86t
zk{xXp-p~C<1GZ^vZfBr~rFSQ(Sj)V=w^p~B^H;;D^YmT0ozfmMElD08qERN;2tJa<
z(YD77`XM^e?CM7${k!b+q+#q4Cgx4yg_D*P8|?+V=f*jSferFC#+YjjzKX1Af?2%<
z&>N_intS~1^|#aM^j70P@s3Ja-&xaXtg_?N;mCSi<tbJ~Qh*Vz-5laUz$N6$e#RPV
zp*Y10(i9_P<d6vLeEhL1;_n-tvF8kNJA=_&t;h$sy=FZyU+-HN3XaNo<}=$N>=j~%
zs55w$*SY|ie<cyF%<i@KsQkzE5#p8?AkPvdIL_w*9`?;UErwHRw8c22hOk#XDr~g#
z`^Rd0zB}|6<XKt7ml}w5#z0)jjnsNw*hl5jm}*n#{LuQ5Za}C+9Hz=$uVmmz3S$b3
z)%eEQ@N<}V#95)~4Hy}H5fXShmu7b<P1JDd3cIAgwG3B^+;%`)hm>xiL>Hmj{{E`M
zWJ6frA~WWJ%J0Y9egtSIMM@WzKR-9W_p-R#SONxm8{}}zSc}!wFeF5InRGwYF9wSO
z8gZ9~FQ>It2!`TdX_Oz&UQ5AeRMD;^SV|aHQLRYdy#`2WYyM86B^#A2P_WIa9*{BR
zi!-_cCpNJoL$Tf^Yxh^oGe{MB={)7Lx%R6>njJoEF0RVDEBMeEE`d`wNBBD87&C@@
z3Tu=wvQvP!WCO;O4PpT&_2JgRa32py$HMxepPCVI^vJfP8Q|_K8q(i}Dt@GTOmI2F
z-L#uukII68Em1W)obo|eh05}P^y<ZG)sz9IV@7Zh4c83v-oI-(_xYv5&skpa6Uxca
z4PNqol;m%prdLi?4KWOcb95bw%m^zjV<*2}eP^H#H%5r7JW%_&k*iTZ<aTs>)1ju}
zW7(}yt#?{402_e%$zVlc8nA@KLEB(f_0aiFLjXs|lVLj?Rbr*fb7UD|8bSLgk~*@+
z&b}Tm@6L6!O)9)KOiU{P`IUT`k>c5UQhbtp7{7$(S$VW6a8nt{t!DmpU~C@=ULBi2
zCh%<cWZZe7<EuZ7oh7S7LtCO}&*8mF4lx6G$_;mo_DAs%=<!(G@?navlve`W;g}{U
zDh%qDFsn*N*vL~Q!^0CDAOKo~q9hT-XZ?r_EM%;bNrjV{PdW{k`omR0t4167aGvoT
zi@tj_SMVm#;w-;eH58@V43tu{?0ceY3b)9T6OGQUEl(T1DF8mGCGbLhd)$wSK?2x<
z#J`^TeWAF^PJ5Fly7P}v4-TJjz(^4@8gViUgs)EfufZcU?9d8UnSA4sGF-a=rk2OS
zO3R6p9i*G(Rwv#=7oY{)lpXR8kn2Oz$xQbJADLumM0dFzcbK>nG)&|1LvP<AM}jYr
zW)r{#&!pGBllH>uw-cXo^VH9z*DcBg8m&>$F~Cv^5Bc__#w<i4V-L!}sy}+3{imT1
zcRI&*`IYwv)UG%MoAkWWF<R1*<jGZV{Gt?j*rC7=iLoZ4`!wDS=seF`l2Yxz4}=@I
zQpgaa!|iMt^vCxA?Py0ZYb}jeOgPZi5z2`SK$QO~tl$&(7Rv*EsX{ufSCF~k?hl1L
z%gSaqbzk!sOu-<_qfnozXH{0%KJ5#7Dlr4t2!4hO+$%boOc<&|nEN7ib3^h)4vp@<
zY~ma@8J=F;H+=Pn(~paTKllQHGy3r<L~8N`9ukdpT3rIa2OKi9L_|Owft2UgH-q!0
zy<k&0KAxS~+ShSy#x+;{L5KCJNtddbV~Sp@mWGy|U$cbmD<_mPam7)_CM3+{L{Pt`
zG?#_E=bmX|R9pcv_<0R|0uLc}U|t&FiaT{*(!V1?@5$vzy-gtEH{gF)jMPs$^z#=R
zw<l=nx~Woy_*fz|NvneZvp5>NO|L)id&sQLH<xoo2Z6721i{TWj+(<uM6%x9y$D}9
zIKdfME-lRo!d)f%fe~(YL&9Y&LiZ=CMM5qUDl2>Kc`%C==CyY7nrC)w6wafb3r*Vy
zuXxzfZ2!ranSlawHkHT<S!c)4(?7m1YPOxf4=`msWRj9!zN5MiIhUo2t_fOwe&NfN
z-#A?}IE9NG&S5R43A+@RexD{SYgOn&R)aU6wYRq{SMVv+t&)E!-o)62@5um5xB9Y<
zYp9tbKl?2veHF@5rqS2tkI(fb3ahjLfeTaux1~h!%ccTr0B?ky7vqV{QJp;VCZUuj
zP7gF&ld2{pX}ad0*fo;rSERAaFgm1i-eVv5R~WbTUxiNOUk>_*wgTPNlw_EdoCB;{
ze#*PV)haxTt}By0PQ6%O<|D&GNp|FmUV|C4oOFOAN09<7=@tY(k_G#iV*SfAsc|Uc
z<H6{GpU-wKL}miqkS-=iInSZyfsspxyuyBonJCKBROW;y2!m^G`h!pE7zFk_ecKT)
zWZn$ZTH?;6wZg>dEPBD`olFh+nSf}es>6ss5n=a*q4=I57q<E%&(FZ5DzK?pEa|hQ
zafq=~zEuj<il+&CM~IwU<y1-N=p<p;sj}2MA0eEPFJ9>b&#a8K@#$p%EYF@iB5D`p
zuJ5Ik!zvW-Z_s*XDVg=<)WW)H9#}_Og7h}kHd)C0Chc?lshA5yA^;m*4SY@diKD>j
zr{{Q~d{qGR=qDI&4$56H(NMU`{NzI4s3+X03TFIofh+mYixAg65^km{QV3Vjk>H<w
zS?g|4=EQcL8Es7&bW{F*nwB4;XF%o3!!6d7+el6gF!@Ubw*fN`0PKoU@D30pb^JPi
zpQdIPN17nVEm3sI-fDdCqJD$Y@B4*tEMP{Vq(jyNc-D8t=86W{>)@+!h<s09R$*l=
z-Ha&V;-TaA$*A#qBIQh~`^X_Q<<KC`U3_Qbydp^mM1@^VGf<Vy2!$jNDqC>Et%zz4
z6MYE%szYjQf%MNk0JaTF&rGJJc|1IP7^eW%Q$iR&B_+Hf$42CwEz-WgjE$p6>kxUj
z;m~7XvU0OgMV*5|ap%w!sQHx2I;YT6HG_V8`2}Hifnl?&iIbBA9k?++J|uY2XOuo9
zE^1EEkG)n=e5cF|b24(K0`bdFvH38Gv$v9KG(%2yZD2bhYB&;YwT{{Kwl}9pK5p;=
z7ft)41Rp}2oHS@Cl53AsXSqubeiXSm0*(I@ZC{?wuvy)SU+enBuU~>2@kn7KZ-u~~
z=6DFRf8tYQXk6tGDNef^>H<?-O{cqK4-&)4T~7F{kVZsahxRjjOMiPE{(?WV$#Iw!
z?9}}IG&@qd^i;Jpi?u)*Z)!3&>kg~ZfP;&1W1jS^Y2T|j&O@6qf(de&&R3Bp@BmM+
ze&(2i{nsYNo-phg%sVx?zC3M9?EMZ}&@z8YjJ=XkiQ^Yp)os}0RF5S?y`~Yi$yOG@
z6H0m#por5OS!W12&<YAio@}GT1R<EvXM2ma5Vb~@D3A9!-XMal$BDwO9yBh^-^(M|
zLgYU16>E*|U&ZWg)aIaS6Wr@qr_llpdQ3=#>f*9tTxtG?4uy3N?Rt3fhaQg-nzTui
zS~F@+V^%6M-cMD16uOMO2vD+Pmd*7TYh@QBXF-@!{tKC_%5QiBpgH2RWB(kZ>;PA=
ztxGBbfV*=UT-r}Ljh-LiBezK&#$WDwI#e;lE8vu15=S`!BY1>3Qr*(=6gtSHJl0Kx
z)rxp_oYy19Ujs^+qnK|w$n0ah&qDHQ7A<9p{7twv8|-MQf3-S(VS6&~$yTN4<$DKj
zizSfpOBt1$yL6V@xB$LHBjPJaWfE|(pLR$#@RZ!C{-eC$fl3DVMj#srK-(|mFMfRD
znhoHZlhKk^|9I~i2FKGkDTFYKIBu!WE7Xe>h0%G)K$dAxjg^C0{9eW6rdA$?3<I{x
zso)oSH!4~ko|`&+*k;o}MoaPJij1N@FGOS+0k^TBNrRJpYQTX%{^PZ~?>V)&8XACY
z0^I?7y+wM&{(j8eM>vlS=yE^u7v&az0W&VOg<2j360FGyP95?yWFp2qLb2XAUTHq1
z7FmJXH3`^+bi|RI4WNe)_WAb>2tJBF66Md6`v%E~tH7S4ViITrm!bQj|0c^{*>s>C
zC*A2D<)bpAjM??AHYFfoR;ZqVn&V-If#>p+jWK^9Q*iQA*3zY~K$-#$;WA_{s&3EO
zolj2w(EgsvH`8%3#4_4BuF~Z;M@aeJS2sjE@Nnxs@vLv&TJ}cpZa*2hr<%zjD!1Y*
zVj8`>a!(4b5Z_EPQsP=psVvnxX|Mp*H=JvkH3d8uN(gd0%ii(Q!qZTc=}?1W<u4=i
zTCJVIgzB)1yjH2u4XEXm$Q8qPynto2mggW&0>MO-pym`OCxQ{nG$7F00B~(RA1M_s
zcQ~Z?a%X0+&6}La;e|mO8@FJ7mn&K#an6h;;XH(Q-{2P9!RT}5%a`P&WnC&24egj@
z0uA-S?588rUTZGEp7U$>607eZIx5D#E4%<R(MA9&3`*z*XR$$lrBlT=Gp^$&q@gFN
zu>Uj$&j@;dCsia40#yK^qbCpyoIwmmwM$#=i2TLzW-vSImCy{K-D~K*a<esuwc|Uf
zo`<o9`b>~O7|-^*X6+${As3?!Q3v-(*V`Z}HZ^c~9()p4Gz%8NyfZ~c%%~N~`ou}?
z$W~^6GG@+$ETo8oDcxHkQS?e;amFJ*!93%8@`I#8Ux0`NVq(7<X^rDT8-Gckjc5p*
zb^TT``4DJ)oajdRr8}1SL=~ZB4kp`ShH+hA*cd}&bt3%5Ax;}~epG%lm-h2}Jpluj
zI=4o@65=*Xf7=kUGgCTF6QKfCf&{O~-uq&BMI5<0K;xJ46`_Vz<~-{CeYO2Gr@}*M
zupp0D-X{v{*$4hI@fgo)-&j4SI}|k#J7nep4Usze^6{w#aph|6;8S4dX-=+C9czeu
zX41~vs}cxM<f`}D^IDpXo|{ljPWD<8msGBCsXJsv8eZroN}T81)mreUnQnO)+H*R|
zZqPV35rW2q`)B&#ZAwO&?kab5pSX<sKZM-I8qzk(#QPt$W=<1WzzStR`5{fWd(%73
zLX&5_flSZckmK55AO1cw`AkkVkQ(m`uB7N~5pmP(-)`htJ?+}Q!^}z`$fH^)ixxt-
z6v-)B<`!I<v6tVcS5)6DW_gdIDiw8W(u*~lG2jPsbi4R%G2;_732Yrfa-`$@#3m*B
zyY$@T65my-$%8R8!#H0}E#-y`F*bSKcW=Q^Q1e{c-KwW1{Xh`I<B&%*>fUf0pYKTq
zYJh@l1VThl=JfW_)@L%?M_rPJqf1X!htxB~&u+Cb-_#Avc?{IFwV>qX4>|M!?rM@A
zuE;iI`U}W{RDfBqM8v8*D=WDi?uQ2~c>c%*udgxM-_KK8)dnbA3=Yc-GxdzuSk(J|
ztIgr^Q*n+|nt-hS(a8P9UZ~NU9x2%CdZ!DCD@Z|$8Z~RX6Z=^CJx$*K1onLb*tQ(Q
z``-H4hAiubwC?1ftD3whS+?kI*xN&I_4jetzI+Cj{Fa@W%4cy#ti`vDg5HdRpCz@5
z3&-{@e{$-O?7LFup&%xA=D#u0GT8wSgClXh?%~!DLwR%CKx)mMsNl~fj9x*X_c$+D
zgGcuroUbe`#FlGIeKL{8G@Dr~{f!raI*7#Z;)air>J&XsFf-UUpH`)e7>|lDn;)nh
zfp%S)*;8CP*SUh^VH&n#<&vzb_dt}2<<&PMzXVsl)dWEHcE`y-H~?D*b5!X&kd_>9
z)77kW%(X{yaVlpnU=qierwu&0@!D-L1mI%6zU<y$*CE0Hjh1p5o-PSdw4huyc;zzv
z{w)Q);0xV+gJUtKU4*dY6dL1^gU#Qemt#595UCK(_x@Z=H#U&DR)>p;HzA(exUk|O
zG|=_0z9Epm4(L1-;ix7XtJ{}ft#Eeyp-McBE=CCY6<)FJJnuR;c(h~1CcQ!0w0GQS
z!dqkcn#23DM&SKt1%)Q3<KF+TbtdrZF;YXkE_6365C}6gD^>j(t3^f{MYR^BC@tp}
zK=n;2C*goCpTBVPVW2}aQ#F8LT2n7@=t9o2J;FQGBz!PJU7rx3R)vDAVgllD0-cVw
zuoUyupjZ#VWTbn*^`vc84elgc9_Usc4&%|*W~GY;n^{XaqjXo*i^Y(a)8*@HNMmq~
zcn*E5ySHwb`-qWXY8t+%beso3QD0?<fRBtye~9RsAtOMX;H$^}r7VW0!v@>9{2Vn<
zj>sN69$d*uCie3R-{F8Cz9%{U#Z=a2_o=A&T=8Q92y9&c2BSIku=9&Pw5fVYtI53H
zh+bfQ`qVq2#p|$JdGiL)b!a3|^6gD>{v^7)D5xVEEJ6JhyEm%la0rFYtw>Tul;*+}
z*)&HnCY?s6w8VBV)~I|)Z8N(Sieq>T7eS<p(OV!#@b2acwsc{q6^A0Bl>QV!dfuGx
z8)CoynC9Zyl0tHxAUSnFWUqDuv#8%*ZOtzq1s>9aozz{e8tGI0)Q^4!ta=q|wMk9F
zdJh1k_C<GVwfnzoeSauo8&a|(n9SIZyv)iuh~P|_d#J(8CatZeBM!)VSog%afTfM%
z_Exb;NTo1<Nxvcog~#801sOTiKOL(Ry0W!@N^efN^8Kd}<!-4O^(E}N=_SF(kWofn
z(`xZWxKV^XNn;Re#^EotxP^`<H1FgQqlgm+k#UdBV||`2ua2lG*BV&b@}@5dZNUfK
z)v#mtE>%fV`-gqT?54Gsa=#{ABtj;07oOLXHS4=M!CrnfMF!Mqfwb#n9or%7vY@N7
zOkWKWiQi-pSRM`dq1b^fKcEacxLwLWrh-FzxU_<!`G$d<REI=Y^FfH!j3l{aULxy8
zx99e(KieT>J(DbN38~>f9R4lKZE**(&szgo&Rf?cGe+i~^j!+-mjO8(`HEq!XbI03
z2QIHWP=<3>tkF!YwBLO;2p2ST{eg=7RLbhNZ|x?v!X4R*r`q9~{`vHj4W1+c%NLKT
zk;XA`emD@b_+wkqrUxWm4twa{?d<p_voyhe)nnJ?uWmA27LX9++DHBFf5aRAn|UNM
zK%*S-&16y-@sLw#KRd><0K&{$({+PA(rIiBe)RJaXwyfrMm%U&Tx`jj)ih1#ch=9;
zJ+t0c0UPKC>OGzGY*hJsj{x3u3*MUKgJbO8%`daL;*{dZHS{TM^uW1qpG7DYUf{Gf
zq3`EB!QwQupPCvqizC=caznfb@?+ErLu|U=u&L@@v0Xr8k5D8n;h%H*A@k~pEmI0E
z7NzKaq!RyIvxPE5CO+`C_Q4gUHyOv+F#;?T<gU8bsILqnP#n)vhn#aQW4-N6V$%9L
zroah*^(j3)JaUwL9wJeFymf;#h`<jM*wub+{h?B-%z4EmFzLkiyOW#TbIrxbJH9KF
zzj6g16tl@ex+CUWL>1Z^{9cyXD<7BQ)u-XUY?LgU#<};h>UT#$_#oI()FdbJnaXZ`
zVKNj4#5BsbgaIF!a<ZYM3XWSjiU%}O4tSK-COU1dG<JhDeQbjI>=4_$jdDn2=X1j)
zp!w6VqDTb^-fztmy6&(7WECf0U`dV(l*h>bSvdHG+#Q*!AkXR`k1}~VNBO0@M>o$j
z3t1-XlJ;Xf?U&&nOm?A~9}AqXT|H>FXalQ0gjPfRnUvbEAtt|{_z%F!93{3qer&`M
z(sceA@*{1sr049Dk6R+=E^HE_*4>fwkLy`6Yk7632XDV-qHxDTRFl#i;=wq&+fziw
zf6;}1znDJyVQA5zKS~{VnQbJ}fZ2HxI)tzfa{>eJF?en61xjCCf9WUuNd?j-{n4Pn
z>r0UZ=qcF--uD4~JUQ3V_~TpR%>VR@2WxJ%7aI<Q!ttR*SZ%EGB-Ss_{{Y=9l=wAh
zljQF4DoeHt@)9}eb^U5AnC&@ttXOYG&^z!XgFEeo&i%b9VQt0WhhN0DEVj(uAu`fn
zPr~@idva3tD<lF0ImehjcQXt0x3qB)4|+MeWSLJ!-lqT1Ke^POkg7<taz`BAoQRmy
z_Ok~{x5`$4uSP1(@LO5%D?Hz!@Wnc5^2=QZb&z+&xFQeTh~ZJm#+=Rl1>6hx0wP!0
z<wa*^d&(>7xfVJY2rX9x7(Hc~;ShTR6)rk${q=(_PjQbAI>fyev0Kz5z}<ezu)U9(
zSGicNB%P_ntNrdN@!I&t%`YlbM(*z#wrFsPy+(Sbt!+x^soB%jRQl=C`Y9GjBY-DR
z&0h*g;P0cm&G#vKyaJD7Ht&Pqfo_P?*NAonJJHy&<JG1hjz-Mai14*zMWxW1-|&`&
zgPwAT%iix5Mc3fb;$Rh6PIdfIHf?SXq5`Z}!8F<8mTRlw9Ldb?8zz&@Bd1?l$VnyI
zijKw>TKTwcGFxRXlZLP@bp(wQzLN7Sbhw2~ZK@ZCSbZ_Ht8=I2`I*Q^wkUQeX;DUB
zqSH+M&(#7V5!jPUSgix3!ulRE5<qpL-|Z(i_)cSIO_&AY+}N**Es8~pq4@TO*V+D_
zGaIRsU!{!uUw!Pq(1qt#xw-EO%0K*(6qEx09<9+F%N!)miA3koMBol4foJrK-{PwX
zJ(8IN_>y@omb|?dg8wWiq`iV@6Nq{TAG6-CA?Ja$IeCZI)P>@g=7Dk!uf=fhwgg}J
zwt5~y*TGr|^QR*F(;hhmK{98=2JigarZkeJ$g<_I&U0_CjOTjDJ5qdOsf$hcCm6;g
zfFvg#=20af?>aDsr6-R?&L^cH2Py^6<9G?ziFoM!CIM=YxG>zLXn94AW;4xzTSA9Z
zvPEJ`>aOA^6H&w-4dyucyCc@9vyY0$2@ZDsmhVOE@hzHh!7QzMX{yOj-Du#5`8aSG
z9yuL~n*qib_rtEmk%Jbb1_(-zt!I!7aZ^A8rZkDslkcDl>JBaxEl@=?1B28Ar?yA<
z9SAVa_nd`Js+#lW68p%+qL;&1-`uiUIK@ctEHldTZ!*GI{PI4BbL`!`o8m}2zf7<d
zqdV)gq8lT?HA2-a<d&|_s%Cio%iP_E&eM>xA}A2|I7}IxTba-C=HWujAQEoY1>D63
z73X`qO8eZ9`MU_mJ0)*mCe|h*5~vxN-DR77efcW`h%kX2*4Y&Oi+uL`v|vesYot6J
zbK3Mf^$x2Jhm+1vrjZr6MBZoKeEto3sQg^_G-bVCX<_6A9E)`WLpNJz*`eq>j+^eM
z?iRrp<$Abb`3TnSFgaq*Iz5B(u}#FTS1>HYfATV*RhBRPc3J}Y!V*%`*a4$r!|{h;
z9#VY*io5ZtsH)A_a5o+uWNK{OmVy}YGHhf8hw*{sDB7>qA>|o9OcG~)-D|$GED=gm
z)Cyy48m15xW`71<+cWH6tnn`mI>F}!)K=<{QV&{|pmA4~0y-j>{%`@VPCsmR1pdxQ
z%klm)*2k&Wf4G-Id3h`R157LszP37dEBw?i?L!0u#Dm89^ke@-ddOj^9DnY5UV6o;
zRw_5dLu5A)^MBp&T4SW0;}dMLCVu(?k%dO<jtg8%UG-}$Ag2RzX1BNC5!i05q^M8<
zkUu5<U3ZT5tA|F!PB-!OOiz6n?;MKeRUneznTa(dqXOh9-=_VpZ|;TXRgYo;*GEUd
zusREL>IQ!I0MD%EOjN}SX@gms&f(jko9p?v8R8!mCqw}nrtK%4G~@f%Rx+{zBLOns
z%lJIY8>B%`OY1{64-oc*ub?cmA^qgR5i!O2^&c}F3r^QhYW`(m<wmmE@Jo^D-YMOB
zyl&t}_vw2MOlUMuh$0e!cRR?0F`xW|d_<rt)erJ3cHR3hL^#|A5KpxaFfpU9530y}
z1p@YLVXmMd=Xe{I{t6(Ii^$Q6N5KwJLR~kY^|aa71`;mg1F5iIU&Rv^&f@klw#~5m
z>sA`#*e=C+r=mXZ%(w3nwA#1p*!_4;w$c5X3j`G;8QXFsrQ-BbePN#&bK&(PW9wxb
z8}Da!M{15N$ELz}y^eBrM%c73F%`c%ydmBF>GFGp0ix4+pqneh9#Vs>9n~rUpsES>
zCXrMrJ_UUBX5hLVadU+y*9--!WlJ7(*nFpLV0j_9c7X9AhrxQS-Ij1f2=(c482=!w
zcD64%N8q{MhXZ?qKflg*erfmX+ZmwRFv|7oJU}2r8CB0%pQp=5qM07rC}?v_)}DYc
zT83GLXw!_ST0K_hBU?fIVh$1MN&F<xlK;vW%d@*<m^{R=2s4R92&>{?CEg5e%jmFi
z!CzYMJ17aolct((`CzP?SjGV+$u+h}9v`|Ki`vtXfh`~$ygrpe-g~y|)6X~xc@~S1
z0ZOV%4asgTi@N?AEY24EyAQPrX<iD-l=Z(AbIZMhIYAbutvjNI7Y_bj*yi28@pRo}
z<$agbf&PA1qQoc>97^-s-nTgeG2}0=>#SH3_=9Jq#4pcxT||m&lRYh*R?Pix>opdK
zd%lEveYaPS7d*w9h6yXSXCwKlAw!bpah>GNb*da4lFSqXLL<J^qs?L(MxWoxl6{be
zW>(!Zf*DoKP+XW5^62`a5p-y>8(=OcGG4<l>{lm%u#pW6lz=BO6I9!&D7RDPTqMD)
z>dt-6vHA6Ze*KeH6|2@4jeEPR2eVvfVq_KL#}ghQO}F_sD1ED`I5YEZ^9_BiJ$W@m
zGx*w`<K7RHN#(G~KBUqf5^^%66R|JMLiu9f0=in6<T*p+h-yri4^@Lh*#cCAH(L82
zCnM9_OkQ9zUIFfBqEj0@z^0|Cccq!r?Je*M@aMJSi&pX<!0wML6nauh7Uc8KAKmS~
zf=-%u@-(E=nIbrMo})cMe>r{g*D~sui_;X3Qw2>61iHWUi)lyf3z8Tf(uuwuCV?lR
zRfjbUVPZO_K`M9mJgMUgeHboU`k<fC#D_F5tPYZ(DlUDw>xji2(|dKP4ai+<1Q4HN
zjFMMHR0W>7ysR{Hm>}#RzqTh_pEd+Ok&HZb#Ebjsfr??&zZU1$q6(7O)JPFSJjP1S
zIO_Gj$P@p)xEIwP`qa~{Z)?(Mq1~m$$}D97zsO?8`fBC|PkO_@)1LZ_h(ch`|N3$h
zc=p($8>~T;H61BbgkRq}a=dj4ID4Pk99I!oc}UFh?#j<|`j==IVcbyyNF^jktJ=WJ
zo0_Sk3fTaQz}%FiTkj7>KvyIXnY9qAG_x*{eE?VZCe$2JOOyUct%V#VY?r}cd^U`0
z>PseU3rb%PEPN`dTP16pXrKQ66CvPRME$LtlH-DQ8xV4>dl_{9elEKFzGC-N%L=5c
z)-{XjReJP@gN3odGLO&#y;szlhTYQ;ylo(<v{j<XeHsFw)!Fl=0qq_@eLIg_XfIt0
zH*5o+s@V;@)wwSta3Gpi8RieL-Zdg|(uB^+VSf1<zpNpE!{hODhd`Mm4)bdWMD<#e
zMewaUOsWqgl&_uly`U8qc_m;2>~nZZPJ(+M5h((i;ptf*<Qo~kl=cyt61wvZ(d?20
zV8|xG_}3Q?%Q{tP-+3QXfvvGXru13I-8}tZ@YYRt&{wP{wc-t<_Otnq$uWL;C3Qym
zzYY<lZzkzLjr6sib&^jNCCgIy1!ajn+#DpV5jWvT*jVR2gq_ktiN;7C61KcmdNLZ!
zNRqCU>sw!dj({;}!=`SwNrIP^2~04%`SbRTRxpfDaNPh#dGi@nzL>->REi9yNa^Ur
zSa6Ii(r$Mex<wFg{HqlAydOBHHur%T74eyh(^D_5_vl+O-r(f!$A%rTR}acnY7+fx
z#;6NIdTEEk5qFJKU|OSuuQ?>;Q3m&`ZMr4$r8I*6uqBLYR@F5LKxa4(KKfV5<q6_H
z=PpR*#P+y~`u?0juYQfkOcw@!-qG4qkeR6q=V|lIKDZ&1?8cFwce~#<_}T;W?{8J8
za@bLT>9Wqvhds2D#fPmqgr~q7jVrVAx_x3H2u*lNN%NYorLzRT*yhzK__ZLA5Hw!3
z?FaKv6vi!#e}V!rOQTDk8%?UTcq@PT0kF$ORX-z}MGvBB*Df{6q8FI}h@s%t$}J5Y
zirLXS`ewoS&QI*+C#LB|tRw1Y!Qd*4Z<f30+elIpm^BLDw<V522V<U|H8vzP)9(Qq
zp#^(|$?5H-YZ{3{$w7#auiKMb1W(V43bt`oo#8XuE$3WG0o(kD<K5etGla>;z|?}@
zzw@2IewJLH7x?*(ie_z2KT14MPWr9T^5rZt-M!n%uRXYw`9=dym>-oJwEmm<E2l69
z{V?O+S_6hvj*bGeN)T9cuU9EpUc+49k8{G(_mdej-@Jzhtn+p7^Nj<dK=yO2J0|u9
zrV@5J#}+eQ1L5HyxkdE=8eJ+zgXrzM=G);KIB}d+a+;Pk8%;a1D{T?or}q(J-3Sb*
zzr6Yr<@0ezn>5v9v(3rejYwFMZD*h}CZcyKhVj^C-PUya3h?KUdUcPhKq#mZF%U|7
z%|gXPqr!QnD|YipKt$TthRc0QgM!0FIOd9N{#*8n519o<3Fa-|NsyCLR?bcS{i`9x
z7x?UATMJE`a>N-PZpv|7aAtwDve9<peS!Td1cw-Oq<jTo(~P}C_md<^kS=AOf9)Zw
zWYb;y8OFZ%;w!)#%=G5@oiE$KPBO{W%is(7KoTsVSY9e>ia-yhTj%x%66Q0C#SsCg
zJ5+e`Ee`S-3}xl9X3<q=14MRW-i;ajM-Z#T#J$h!tcm;ZL)&^#k3l2+d5tTew6|~G
zu|z4`J@2)F9@WIsis0kNtKW^FoC0jwvfeqq+9%~pH9+UxiUu&L_N88YS>&X8=FoT<
zbS~rcHb+~6Z_*yuP?a;sFLhtI?&UBAbDpA?FfnjZ<bcc2>YGNHyrkc8bu+*glkVZ0
zLPHA8W^?}pTr5cf>9u<rTDQzY|B4E7S7~{BaAWmnnpCLR_S+Xkm7fod=uAYu-H^Mx
zj6#;D`2(h48D+B1q>B*DlVHk0C)V+lnxB?=jNVxxs6|#tS+3oDLArexR~qLHvW#S^
zxYW?{odJ9=%YOiM-V*eJt=8V>-=P`bC`CPVscXdQB+|Bk;jr)o)|hVmzT<Nzxc(Xs
zvb=#}VM^~C=!z0f?-$4AtwATbr@bfB2|9y`KmdGvZxu*=4j|+`JNpy0cG)k_-?G}x
z(t?J$`<+x2l;@?5E$1t^UN)UqSar0Cw^U7`<HyFj=ZthgFR*8uflFi$13<rcp@vNo
z^dIRNt{c6SQd+E=jg&<J=Nj51`slX$`;V{1xX^%DV?M_}>B{!OY1N7vXlU{cSgF1D
zVinrFsgqEy!VWKThT09(n9p?8nL$%HVN1mYg3|ZMKQOnDuV$yyII&{4Jr@jAZFs`C
zpBu~o9O~Lo*VY*<<u1i+z`&?y9)87W@FGMH3GL(AnrSn{u3jp&MLk~M1Jl4Sc(yP1
z{#j#Jy+U%+fh3q4-5l|N$7TwAlWt0{zFhRlJ0NuN#WwIY?Qeg8<Vv@%gCHR46Th~2
zpTE+*VPLqNKYjN`1^N8`wl+{T?YmMu2%XnPC7YWSSAfvdrgFW`RlAR}&KOdeR$bc@
z=Lqx4A`iK7B(Cz<c-A?XDp-(Ll~=lTqkUD!^1CXWk+3WOR#2_nbrqLyIQ%Pq%iWD4
z|Lilmr=NJUZMBN8+=rua#Ubjbw`WE?U8R82AYY^=1B(|w2`i<%3<<LoGrVdp*=K$~
z;MvHC_!=sUrvuN?63F4jk2eQ`fJ`z$+a9v0<1h-@x~`2f5G?vX>G#7l`O8dhpM7t&
zO}Us9?EMMTK^?RkuM)=JmEK3(nZx{C{x4v-dACp|Bilzkpc}Nn6G<ADeYH;$NN3Ja
z5L$}RIqe^`t~?y@Qof=S*xV0qJLws~HhcEOkrNH}OV)FI9wz#}p${gyBg4{S^Z3jc
zaBYl`7fb*AzW8t;6LY$u6b5lyc-`DK0j==n)o+mNO%wNi^W|wjXlNJdI{p6<_7+fC
zuG#yrB8_xODBTDM($WpmAR-|k1}WW$bci(4AzdONNK1pHQc{uv(jX<G@a;#>Idf*#
zZ+-u@&aB0mnHSzC?)%>R+E)O-a5Xe(!3<`S%P}fk*O0sx|L>Ryw0E6IT^o^txPS&(
z%7eFy^Z$K^&lEDF*lMjyTX5S=!XgSBl)9KLwXb~*nJePXz*X|p9#M-;cN)HKA))){
z<jnkrMPe37H*5Uj<b)+-d(7)csf@?*hv6!Ejak$l_MQ1ifm(D@_zR;l=j<=iCwXO4
z$C7JV({W*oRM9TbeNMc}PG>{1n*EcOpU5oH1W6o2e3D2y5C#Y0L9r{G^&HtDiD)d!
z;joC+U4}ZUo1puP$Z@AVh9I4m7F~f7QAnM2%SRw!JhH7_dcm0BdA}8vyQw*t7jajB
zk`$Zt_`BfZ478`|7BGohIH|{8Xa)xUyOPLDZ;|jbs5TkbD-pHg252AkBP4_jHbJq3
zyR==ii08QfQQ@Isq@o3|{qlfuP&g^(bLz$YIc>->w>7VoUPBO-ai=il<t+ud-?hY_
zHxdQaZ_mV+jrzP#_UMe=bLC>=n)+hq-z(l%i>TW<@@1S&$qP@kcY?7YLE|*sZ)*zg
zj2ETBtxAGl_mP+mim<R|B;G^-H2gZB#si1*oO{+E_&x)?!wRv1310(!^;)PBl0O`t
zF}3=JAT~;X?ULzg?N46lSojdrgBo<-5pZ-Mn9QB1eCWv~;$jF&;brh1AN+1V^j<Kw
z9gf`I6i(afi3bnfGbP-7PU1=<%49HU&)7(2G2rJL1(<~tg+$vbMnVeWNmreQ>8-fF
zWJczCZSl%-kW8ZxA6?KLt5;PGMUOs#i1+5W-x$K^2a;B8(3Doh$b*(S3i^Yj-`gIx
z5_B=rLpnzPe&a@^pZcpP34AEZm584i$lSp=qOnoF^&_Ov8~J}lg`b<vmBJ8~xU5&|
zvtqScFMM;K(y0Y!(LA4!>1x7YkQHO9>Y-Or98@|2P@?Q3Wql6w4s35!s4O}8-#**f
zy_GVT!e*kjBv6Yw#SBh?<KdGRq%Irx6vOrnXWKCIff$ax6p|haq*2}?L^9kDuE}Ns
z6Xvlc2HzuubJi^Z0U28{T>0T%mbvK#1<6z*;*>jaIn|ngY=YQCOAfku#|NQIFS=_6
zn08v|f}9MHq-hAjJfb#7&gq8TYt<o<e%IW;nXB`}F(Q(0@Z5e{lBMY!g@QxQr1yif
zJ`1d=dl1_;UMlM{shDNA`$%QQ{aG*Osnv($_f3d3_>PtZS~iHV9)RkPLPF+^?3zFz
zQlXH<z`+ohQ&{)chWMwf4Sm|^zWi3qvhr2(554y`_p`N9V&iW}Tc#i%(GmAmCb+P~
zZmz5>$hxNF@w^9V8zjLP#B1LR{`V2I^9d<<r2^_Vb_3{)E=gZDP+Sn*N63aiEc1Mf
zo}a^pZg1W4i{%mo(A(#AihzQk501PHO)^s~6t;w@Zl;~TH%4snyqwmGW29}H+aM+f
ztNLQS0j7~-IPM$YSeWht3+U2NE*+~1{l3&lfDGQ2s3zfZ9LXp0Pb=YsW^a)Oad8m{
zYqK|ge9fki@B}{LO|TkptbJbkyYA9Jj@0@8oTsqCJw$=?+UKGf@hQVln3`eAGbxq}
z<Wo6RF-pCpo<A1^xEE11OP{b-DV8g!9z6S3!}PCLX#~v|5%bbM!9w4<ia2Q>Wy49I
z0^jz=vkp*-FMB`ef|kc3LTDl6wu{h+)5iF<>}jCO-3!8}N#Cs$-KuGGoGNEfCRiZ)
zUb7ZqWA2AyU{e*3Br)<SGm{qA45b{CB7Dm0KsBvKtD!;=IF*%-clJ{30z;*Nxv`E|
zuk8})V=zPgj=D(KppfgcjM<!#^#$0Zo%W|DJf{#zIxHxn6qAco{%U9bdIy^2n#qs9
zFXX?GX4m|8`46|LI|E*qV?G^RSY^7BbBK=%_|Eh_T19jfQA4Z1@5d(fuc=e0%Y$g)
zjzceCB&R>^P}^=d>xS-P&gcv}-V`uFxj*U*+M*<o*aXdF$Dz;+DT_f4DTU3;enhkl
z{5s7?RNJ*HSB4?3?8BhX_jwGYi|uf@NJe{q5-Krka0@LOUss%RhW#fIyr+1xISaoa
z<wtmi{kIZu^-TX@1$p-6&WPo4lmkcK)oy2et$1KqJ}GxD{{7f;=%D4Nyu;4%pX&49
zTg*Q{k0Kk;8ns~69RPip&QJNG(_s`=_?3fk0;QH<huro|V?UHCi(&j;h`$I@3<?(C
zu*fICLNYV;Q8MA>o}D`#hCSoMNe!wvYzlmU^Y|=OGhKCqMo{!>sr?ATfxE?@BSr-_
zRm1VDx4#1kmlt3HqoD`paSV;*7Z(G>K|&b3k~&;te{b7ED`>*d{)ans|5R&daQxp$
z$c5}-kF#TxR{;MHZNNDcBx3RQZhrBF(iIC}e*boADSTI_h&=38D4(+*%KZ0p|JO0}
z=S{Fgo_3W)=h{om5wY__lCNqO!YHT-KF@6tUDTUoCcmMjYg67q7FrA1gTXv#NA>B`
z3$Q_RVB~+{rVd=`mz!Ooc^|HU=Y1tOSqm3;P+$eq{P+$pOA63?N09*YWe5`uw~>GM
z<X0c`t7*D;30F>K606#YRAk2xGpA&bfqygvir>p62->i<Bc_MM*^hJ6U+&t0mxFt$
zr0Fiqv^>m8Sl`m0OhPv9C@l##`r+u=>CwA;wFQ4YXabV&JC~bXeSaS)|K95U_vg%N
zq_HqqES!NAB7})xMk<XY;h?x3L8Q(U5)4L&bqRf-5Z6iu;|IdXnp9iEPZ89@yu7dC
zeV(h#RRYtUvYoXf1mTq;wb)Bt$H$_i`Y|qEP!K!-dk<UIuoHYP;r&Pv*db*H#aMg#
zL!J>1N8RaVy0qZv(?>{a11xn4Rff-?{2z|FE@)4o6*+MuLS&5<G2&w^l|e%)23joG
zaAP#IDE`$B{L>fw>xTkc*yD7Lz84<A3yp<o=($;io}XxP1KQ#AkoE{D6-J5U0y-|>
z-!Q=w{t|`i+Rs@5E{}uHmtltrZ8!|)8{iRIV8lvj4qit>`E=Cyl(yt3P&JT#n(oyt
zF5m1yWamOm!NPmFYvxd7uz4ixzUbeN*X8Yrs@Q<OMj)p<3}-qB2_JG>@^7g^RE-Gf
z8m5}>i<S{)<&<@$s^LMXge#WUn4zQUV3HzKGPEdxjIc!MEQ+}PUl(~3xya?30$unJ
z8si5TCU9!LN0OsOTs9J&6QI@^wgy9spzDS%8_-2BgMfaZ_C5#~jm;;o25@0&`%zVC
z3Z@9zXRU(klzg<A$ag!20A<8^gF?jVkr`gi6}oDR*`~(Y`5>x3_;kActIhP^0rFL2
zcY5{Dp!eGj4N#0;R!*pT^pPH!B_O&yq>nQn^%W)XegvGvv4@yV>0moiIijV)v}K?p
z1vm8@xP67Iir$JsFCYRWpbpuTK3}X7VgLW#l{C7L>T$6}?~`2S70kH@m9SvUok___
zr*#Z>K^Ee<K%*soT#i6La1A{rpqg?wbqsYV*a3Rs^_*s9VkA#S-y%1jmkW|ooNojt
zn{`2A>rL3q7#?4kMWQGWt1jFu3REQWG_`iKdWy2NBkduWXW*Xsoc=t10D9u^O%yJP
z=?m!Le65$MW3q+<G||uF+vMZB0Jc30X|-9jc#1hsJpt6^v-OoBP?N$&9waLxyt>#*
z+Rp#R8<7*~fBuoYPXs6wZ<pLFBKQ#P`KcC`(B6KCMX-dem&<>B02F;&p*nWvY_cH#
z??6(S(w`OpB$m!F0B>Eo_9z0oyTiJpD!nZ1kh2O>7&YhIMohYqOez09xW;?T>ggLs
z5mfjV4KAOZ^|;ZJRSW}FhGg)?%)yCoS(quBY!iyY!5T5M{1n?dHk(-bUW^WYq(%#Q
zX=p~<E`i7tEMU4>bh1dj4+dtGTH(7t5xYpBWAbN5fEH~c)oDnna81_~xpeS*^Zl!*
z{0)NqbIAP952NbHnL{IUfUNYOW;ob3bB(_(BKE`3FBf6i)_hI{@qBtFs_7so;%^iG
z$|45IXC`ut1jTBJr^y7onRt-GuOpRQ4sO*g`b=ArImDtM=a$jVYoS(PPEC8n;g@fM
zOznudF59;grbl6tvrc<r$D?r4kk#D+Soq&t;r~9G|9Vo-5O9GDwo=RVX0U)-7w#bt
z>O*jaxwY&9FBFi{G$Q(H<d9_0ek^x@s_+t#3tN}9pi^|%5z#*%1NEZ}x5C%TgE0e7
zwyCk(_i*<BnsVn7gHa_PsA~)vn?bcPh|B}((Ld!~tsZY(eWY`ook<2$2_0VFv!k=q
zj0u>^Q6>w4i`(AavxB?w)8<vG|Bn++mKFo$KI!+LC$nb)<VH@Kc>QBc$dibQXhxg>
z!MO)RJ?CYCI7)~)L2m;H)|a7}JVNEm5pqP!q7E?3!DKj~KuIgR4etGjb672~cX9BR
zKz_-K7$CN+>jomfj)2hw$LspLvAy4$+W$Q2fB$P0Qff?6>m2|rkKE34tM6f$$2Ult
z4-qsoJ_u-}_Denl=`B~BUae;#hYTWT%>x=QTCq0i?*;YG<M`jh?cWD{rZ{rY(JYo2
zBXgannC%436kh>!-~+?9-q2APH!kP;O+l^N5NG^n(W1ekRhH5}`cJj&`-7?3tMc~C
zk%|ubVvm2fvB;N?M~VlLinWB_>snF`^|vpac{jlxSRf_6k7W^EzYWm|s=;`dZ^BIV
zh7|r8*qU9>KUt+mR!Vm;QfeZYz&{H`1B3^DsWANzFZOR4e8x8OKCtBKcdXO1%dXSc
zij}bf9A1`K{m(@gCBd&5z2RY5|L1FHV*UK}^F#ZuzqtTMv%l@Yzzir9iC|z^(1!!E
zADg2@W8|dB0r?_|%J5A71^UAuC`x$B-HwIr@-ss>SuvkF&iiHL*=ug}m~#cx*)Ki+
z+|Ykk3`s0}gN~fyy8pUCnIW)&v(X#sUj5HL&qPbRtF}vr5h(t1JCiC(R<dK|wdg;;
z=g$u$5%Ajr?-eHh*L7y*Ais?be3}09!aFcGH}t#Au~nT!Y&zX@?Tkn7<&M3DNjxT)
zfJ1o1%Kyw{*5B~UY3v?DDb9a=HAz%uNw{Pp$+4^d{YNe#e*|53l<hxXxt16c+x`1&
zE<tHcfaoZ@4%WAC9g-~>@qYOdKcqB=A=dZHFQHc-*p66~^#13*{HtI3&oxTMBU`>Q
zhFU+_-!)7D*6)t2;R%UBw|@a6D`(-YUA33!Vu@oV(+f0MKG^(4MGF79Y~(Bd-?dSJ
zBuaI2@be5TVrCr^l~G3I;r{stY9y(L>EMGtE4O}Aqt8D|A>!Jq#uT39=^1z6UZlh=
zqF*^weDVK$r9YoiL`KwKN_4dT>yNXC!2=@Yr?w3L`#h6;PQWUcd=<-G`K+ua)hF!g
zPpmxkSd-%Yd!?d(-4n@LtoDBXpK&?4*7@7$N&D$UsVq#n4)hw7P}jr-Rc_J$=a$2I
zKx2VZq$RpY`tM8d6=9mopJIO_8X0gS{!B9d&Uc27>Pr&DmM{Oj^_%dR>Q-M}ZJ$4x
zBySz3r>?h74aq;WNWT3^DM67{`4OY0rq~VswR@%Th~LMjmi~LZOW@0k3^n^MWKR9l
z@9JFjHN)}vGNx4(DKFFZt6$_ypKW<LZOBviY%lA|pXaG_<$OX&e&Xy(yh3L69TMgr
zSf|tExJtA-2KQdin8R~cQ2*}%iVhoWF-1;uBi8?1R|NrUP(hdL&*;p2PMJI3QYLLB
zI;I)T@2w%0nm-Cui(>GrZ2ydGHJDI0MvAW2C(aJ)`Z29Ms9UBU?HkTex8RE3V|a<f
z5heK^O<Z*IjdmN&E#s%Z65_ySlli|YkpE-4&B}M)r!CkAr~De>&9}My{k-_LXrG&O
z*qXk4)0Xlt&HHnuadDEhX>NY2{Qj4@a}2_*N+}a4!>dWmBGxHyislyFWxmNuJc?IO
z^TbH=6G`^nmTDDQqoKQ|pG%sN6~&{W3$xRIb~%`k)5GH}R9qMT=f514Klk>^tP#tn
z>fSUKJ%_r1S-X2Oy|bC@`|`RPJK5T+9?O50s-^GK%}(vF+iI6;+eNspxm6|Upz0s{
zCC?3FzvmU&p7SBs`f4uGmAjUp@*(F~_v%mG!hUf$2vQb{jQ@QvM$Vr%n3G9B9EL=`
z*4F)NDE<-Ek843m{}@k9`E7h$Voxre#>T#J^&CcYO8U}uhMbB&UpkjwgPkt&?DM5V
z?@P>>uLbPYC+#cc_yWmLa-GkVrcTSgJ(0NdNg6BpOXlr@kD6ltccsrtha5VX25+GN
z{A;OFqMT+ui3#+c%~fD~TlB!qs_13Qb5@ZTUy}OKp0-=nY7F!L`3A>CtbtL%0_a9m
zv0(;A3+Z>r4;e2Hnqfb+#>kbfJChZ5Q%mG|`SRpggj0Dxc3(-W)(jbX-Zs~LjX847
z7a-dt1gt}bj$Xj~%X>WAgt$*R-();0Jp(Ls8Bl#*tCI2{*R>)zerxseIOH;KqLOhL
zJ_3Z*(zan&emF;om=DGsm4NZTDCDnug*ie77i4L5W0dNjS9}_)Ec5Z>9mc}v9Zx2<
zS?F?k;of0?#2;+nwjAEOTNFL&jXc{-7pyqt?nRtibV)khi&<Ru>Q4sE63?;Ted$nI
z_^#%0qLsSV$6HN%+@Xbgo_S8rB1mDgQ-(q6xSCu#1h*t!eh!K=KJdZva|dM>U-B3T
zuK1t@Q?|0zee)2itm}(ikw=qt10w5*2(|1DI5n_Omdb}cRW||H5XlRpko@hU2$SnX
z=-Ul5;fsAkh!a<KoC^Qh?na3f7f0^7zUVmh5(p}~k}jg5i~94q_G!QCUf-xa){~+&
zdyRX@iIZ7^s?4;!Yvdkw?XH1!c?absX40o|q#p5~A>mVbZ!hf~YKMf>is}>Nn=Q9W
zTWM;4>P*NO-0zQX&hJ6L*-z@L%)|r0=yp3*1fPpFVEBmcD}Hi6%;pmqKKFwgYT`$6
zSwcXlO2!3>P3h<8dBB3s5nMzccsuo7>q>-dc${Pa!mc2@-i36of6%T1Wt8I(kFR9{
z_C*fkqmQ0RmZqpGNumqpeA$W(NbEsYz2qhlYkXo?M)vMwwxPTqC^QtBai<f`p+1k~
znowxpu{1rZO>|5&4E#i-lW^)@JSwU@r(MiY{3KKo3kxzoK00kpPq=r1GHFodV-<{L
z;>zTz?2D1fHPqx+v-y(k#yZEx47k&u{IIBmIhs(t7zBGcj++LLQV|{Kp<>H7J4-LY
zyo>AhucF_e&YvBiN45E`<dM4l%FvawNiFM3OGDyD3@CxLbNTt1rP4MrntE&kJ=h<8
zM*q~JQ4;4RmMqGt7yYhsuKeuFxNB*fc*dvQO&Laax(kKKGpaa=w`}tD{_iC04^%WR
zD=20tz%o%742D}^jEW>b4L5mvw!Le)4RvTg$Lj{O?|RWx=Rf6b0$A`Ez?x`-z&d5r
z9$6E>0+;7toPVz~{Jmuhuzl8Zcfj&WH|b9PRG6eQa>mqkslWP*c6FW*lVvV>C1<Il
z)qN+&dU9RY7_}%#vNXRp`v+&aI==+rpOVg;2(xZ4hg3^D{p`E3B0Kxl{7i*=cYmQ`
zryUlLw`RQA<2h^0y~MU!<g-6|;h!<61M^dv=JOcTCP=1sxX5^!57=P!i>#N#I)1ME
z;N}+sb^<~d<bYmw<C<(^JJNI7g)H1<7I5$@nu$IG($Yn?WoQv!jD`YCGV$}s=j<(l
zS{}G)TqXDAMK+R~bIu0;Eb=UZ9reb%&*tB#7P-Su&hF+{JmbAt^aceBF@||C)8LNS
zRz3hQ(6zcO%A(O}tpSAwg&$aElBi~wIi%FzN00Y-$Ds5*b>Hr?cdVLAy}5TyEV+YO
zLvdHVJV2tw{Wg(i>X19>NnzpE4;8MR-91V_)xrlwC-a)?a&w5k0+QT>;;YgH?INqB
zJCTVdA{*~6$W5pAA~--+&ocOi>|JtN8+q^Pi)m2ML;RMDo6r{$bCL)d<jp_Z?0@~Z
zoav9puF2s<MObhdh0AvkxX=ym)&D}(ihL!T*lNBEY2#*>2$048J&AJMCMzzf2m>0E
z;=<9obk4yLy3REPJ!hnzklaFbu>We0<#^juIi^AdPpmdFzLfKduXL!7-@=1CDch4)
zlr3oiqaB{&MO6==KydytVoS)%=Zk<kqNLY>O%(oO?!z^^xn|zCr9U3y&mwSeF+&Z*
zFCaFI&bFY6eFvbR5B`&`=>WDp4<K(%#jYw$hus&b@E%*nh{9K%SU=ng=Bzh0Fbxa6
zHR@qBj5J6}*)OaAsZMj~qMRzbb@OG#EEJ2aUl9vIJKYb!S=2|>nc5V7s~2_a76K`)
zPXz5PyIqGloZFmz|7?m1w6CblKBRS>O{K|u^&cp+<pjT`#3GUYey{%57rrwkhPqol
zBbHtZ*i&$Q*uC0Sp^~2oF7aE&Aue+AKBB;to$=aJML?br++Z_jCjem{_~Q!L-HNR2
z2fq!Y5(B-PjgHHG*Wf&om-ifi<Xkv+o`zXuQ^rvFB7TjjKta+ckAyJ+VlY*!TU0EY
zsfOVH`4?W>{#DjN@vT_DO0tos7O1zHSv(=fQU;cT$&g>eyeJNs-8aMNz2|RM8U2oi
zm$jSrog0y>YbRD@=zLjScC0fboGd%GdoPP<RF!r%E-#SaS5A_TMU(F5L4(p)3C&2|
zqK3Mb*bI2UW~lnczNT`&ZcXq4d>uZ-yrZ%eiT4BOq{2?H_(L+%_YT7>H+&2Z-G0P_
zSNN^lLVIX>8%f~+SL41`=tl?Uq}f0uISCH2Ysk`&mjfeL0hl02KZ#$rBrM>A>Um`1
zX>t8ug_K4vs-fcxMx*Eg7A&Ut!iOk-4?LxJU}H*#^-weS$YEk%>rSX2zCSw&jDs9e
zlsbe=!hmJ{N@1^D#Aj~AJ5rL%k(S1&DJvsqOPuIrWU!6hSN&Ibl?OWt8p96aGG+7j
zvp{weLbU_ijAZbYAl(o@FeFst2l`tYaSIsUZZ7G(KKKp}kPSL&Er`!x-+1{w$E_cL
zg<1f`*KqrXwz}Bp9Ah>szArKjqK0isUvOo+7bsFj>GG?k^csP1Xce|fX~+K6yk)9k
zGJd{ZVL!6YOoTyaCu<Dkfb}m}nu0h|bH9fPkKcLU((NGnJ}#(SKfzS4PA6~V3=eRG
z!8fN+b#Yl!Y2~5)!N}sJf<gMI?HjCa-gav3gRyx>G2;q@bg0TOmVY^gKrG#RWl%Xu
zF~9+$aUa#=Xv-$(B1O!{HHa+e2Gpyn$3v^O9}y1s12Vs5JqNrTvlsdgl6q{seD4VS
zH4L~v@?!g_6Y%=`kUO3lBqF?84TiU51sz4qxV|9W^-wdZd^k8zQv#e>M7?ZLq@_<-
zV1+!;gPr2NzaMBc>>cZcE2@l!9`5+x0*N3qI(!093w8(V(SjZZJJ{7z?nMYQ7n|lH
z7p|6~<+sjV_cg0M<xadHie)tN29rcezHAp#8ge@>eCq<J%`a81nW&;=5!$F*KdEIG
zM0a*78T~SH_PB&kMrQn3eAPFf7kV(F$T95zF&i;Fd@$|rqySiQx>^!xlot%z`@g)8
zP2-QDe|5zSR%-e=#!H}NDM`?40$(?DmTbh4+Us~J0W=)K=<~xsQF{nx{mWw6a0c+h
zVm0kNsBTuk$zSr0kq^IQ3Py18qahwjGJ3{8>O5Z&z}a$3UhCIi-K*~!OVq9L*vNv1
zaK0is3xvUYFddMunzDR;8oZ49Ve_{bgK2xwc@c=$bA;MY@C|vF@el<!xB}}AWAMN6
zLbYS-d^L_|=ol%oR`88~!)JRVgZD@<vz3zZVMYahJSB7|8g!`GOr1(v_(l)JgFa{>
zPyYr=ZP=5czDUMHwwdp`X8nM{FBBB0(rdJ!g0IcBN=Oa3|7Hh9evcJBby`$k7v;Tz
zv0Zvp2cFig@(_J|-$&Nj&jlP@NM8;DQJUo0JdiPX0DsYwsYM8%sosYZ6ld?C@Gd_n
zkhI5~TWI6!UvTCbzf=Pv|7VWcqwFaB^37Gzc^yfVROZfqqX6uwl9Z<ETv_SJ2{G-l
z{%WQLZ-#E>$%&1O<!phg^b8+NItYII9m0gTJ_`#Ea?e!gE{DSe4M}Lwb*dk&>7hkp
z1AqyYSkj5ipYnXpPUAmzpV`P!f~|lzyXDuBOMurC5rn9;EQxNkhuSZ!z2xZY`ZA*J
zCpCIime%hnckAM;5Z#-*s#-1TMDU^dqLbaYzlt2+7-AtH-XRv|yj!C8dq|M*o|w0}
zFCNz*ti$>)i+02rj=6pyYZItemNPzQ-rFS0uq(v7nGvIZxqJ3QuTx3DbAJDCOjs#>
zbcl^?P2UM09i<eAKVx7IQgf&7rAwIP1ag!K?AMk5RilrH`R25y8_J4_Pa|AfYQQ;{
zVT<TVIs)RU4S{G0(1wLSY4+iOp^u2;a?-;G(|&JMxZVz%Rm(W*vG|r<Fhk_&CB8al
zjHEw4xa7tav=i#2K(T_YL~AP^|AjzLOA~)Hf*6yu;xRH_rWN)zG<*^h;uI6+%~+}?
zek)!>3nfGO5?YlMYEi(s!j~@~RIfza0olw|u2$*9?`Mf5Q!xL-11QQSzV=GC8)2a(
zsf!=Cmpw{pk7lvy!nk-Ts$2uKBpQybSS;6dWITkmeA#*wp1$!ZepP*E?0Y1j(1<P2
zl2zgsCyK5#AU5n2to+)4K{l2llGE}eM*RvSi`hut&II@t;vAN`fv@HLoYtZ^x?13t
zs={!RH#zpPfdqtMf<?LH9%vBU)`fEHvFoO8NzYnP{bFpKgxBn$-SpC3IR})Q`D(}$
za6gZ+FQlRUKKj;InY(X}b>1@+RYQxNLr1`$q0Ae(@MA8u*Lj=qAjhJzCrna|&<z~j
z`@n5`!@1V;XlusH8%i<`HorGq5=}hG@;_`PTF_aNJe3TMhUf;49}s>260SxbCi#mH
ziX4g52ue&Jl0b&1OKyoKYlzVmVlFsudn+$Fk=pdaokyC3?8PMrHHEkm23W~3%o(UO
ze+&hy3^$>|W%n=96pcYgL@Jz`%tWL1Q;0$k<Qx5tdS<)vXTb5ZJ>8A;NyHTxZ$|Qx
zgiUWw*29k0mkMF=6dg)vLhz>L=oX}6j$HLkbD3o8-Q;|~T3Uye7x52Z^yMb4+IZ}B
zXW1C^o9Q<uJiA!;MH#X%WSLaBYRzLF#O3t;eHMg!?pB(3tX{{RC+K*PV)Nk7VDYy|
zs|`;yr?BYJefv$D*(PuKUk!urpn*bHpp@f9f!88u0_Oi|f%bt5AGG-c-nlP5mhGz2
zr&6~M*aud(BolE6@u95n2IH|Oo2EKbWYgp=Ra7LrcMtL*`L7=%;<~nAd~Pf9A<QQW
zL+%}+-q&Z@*EPCk8gfu*PM$przSkOnRsLKDun6BCT1jlCPxOfZ#$L{V+G8RrtDBT_
z1|fn4HkwjciY~=zH^_~80Xz4Q2gqc+Flen>9WT#DW^VMw)S@odFE}n*w6v-<EKN0O
z84VUTB4o+RR~Zl%dkjUy4Ssj<*=%e=%tev(z;Z9Y&0nyUHi{t3&Ar*PU$f;#$1t1%
zD`Xq9-dbz~q!_41nB~YQUU;<dnQW#pvhcYXUOo96XLQG86uWHP?Svm9AygJEJKa`~
zxBjf*Xoss^DgZ`V{NL`(Jip@c^<|jw*Ljg0J0Lt!X}M19LdB8<_8W58af<8D_o6SC
z@Yy}O_nd!>b+C=3h9f2Nw35{N;=N-)MfBp%6>pT$Q}UR#hkVen&B<M#edog@I6EQm
z=|uH53pN{jiHL?JLNOmD>{b}pR*rTa_0{#4%fv)Q;-xD4czex98W0Eq02tDF+giQV
zXHEqz>K&x~kH5`M_-%<R&&P|NIT*9~std2K&meV>VPW?73M=USNe`KgVhMTY&D#Bv
ze=Vh#=6xNi$wg+L1JbGO2C#|axYwty`=QYjv@>!xlmeY$MoCodb4OaxjlWpGj&(-}
zHNATFiDxCua=K=^n$BkpL5nB6*u{C}@tcqG!c|NO&z1)>wsZ9t_Ze4PE^PzZ+Ra9?
zWCW-xI?~iy`pz)<)6`bnbpgJrCtJCpZ^$-epY!@-;0uJjA8H3jQN+OO#d1t|1?wDa
z)Q902pnL5@f;NOd72+TRU3hBv*(KtCKebh?b{j_0btfSVW1$Aq`oVEjg5qi5WCywS
z0t|CwaS?XUjuTGh=@0qpG<=UfWDfZ|PZK5>8_;%m@^j@kSUD#8ohd%<pU0D69WK%5
zizLqv{S|j!XxSF}_kpHe$C+m;X~xaA7crg)eP0ZNT%!S%tE4aL8tBnqNmI@{_<T#b
z)xSM#eSJ4Qmf5N22JY2(fJ#`Id<_G|mI(9{VxDa=p~T-jr>Dd#H(rjoTy0P-ZK)Hz
zjF{~#(=m29U^`-Ee)kTfl=_7^a0Xo)KqFIo8l9J)=v#nJhl+E9qohpQ(o{j*<^%vo
zT!9&(r0)=y_t5QDSI2F;)+%#EDq-mWxf6Xbf6GwU!*<vN0n^3&c7(b4;YT7ZSayjc
zbakh?I)n-LEr7XJRy;xzT9>>D&}Q-4s`yj`fygj9W{r|8$^BYASlF4p&b|rT$h}_s
z3PSSi>n+mXK}5sXF2s2bsFVGO(YT_M@|d&4^pi__hKs<By@t34b}k|2;!@3sYzPx@
z_{em;f4Y2W!h(S3j0rolqwk>b_XrlxLQ$a6I*ZxrB@MZW81_of_ZFsupZ-S`cP}Uy
zmz^4YG<oE#QWe8=#u`rI`7XT!!AF3AuMCTwrYte-bAxfpw(Ll8O9=IvEs{%o)*U>c
z{8hV3rZni8WIrJ$@nL&tPA^kMY3+H9U|Cvx6xaeX5SQXg%YvK>-|auEBoA<qmq#ue
z*zZ3(tA#Pt!xGUDNmase%i>pvh&laZJ~D|-g~V{Q?G-R0?ME7r&PbXJd*$a;bXvDT
zB!f_qt&pB%7><S1E%hGe$w(?(9G?+{q;q|wqxBM7xuUl@hWGL$V~a&S=-;|%&E|ZF
zL)`5C$hg!#neMG9JGbX70C`^3p*|}P$tGkG0d3IDq966Z>VX!38c?K@p!Gydx>KyY
zC#?%n6+fw4E-gxi6(Y}YUZgV(?Sr9zpplv&`XapZ`<*WncR>T@?(J_n^mGF7m@Dk`
zi%EU?pKrL<(Csz^rfuDvyhZlluKQAaL+UF#{8qG{>iO5^x?Wo1pj7HWcqP|Xi){Z!
zmGM#RsgyJ@u@%GQm_kL_wgVcv5R1If<Q7q}ZL^TehMHj)Xc~sqQUv-%-?aPG%+cGK
z+Aw|XjxCU(3ncKETVB98sCYog{AK=}^0l<k*!@5qOCVlYj!_bSs<3!E97Uhp487Cx
zFYy|`m0s~V9h)1&IqMZCdC&7LscfqBP%VQcxmvH3n(~QrcBrMY7Px#Reaiv_$=&wP
zHI;~#Wf^iD3lpueIyP%9)m3OY)2#U)@`XCC?5@nYP`)l$zdoycVoqAx@Yi%OlMAyZ
zRj=D!Jvm#tcB1TDmoy0%gVG{A^Fu9}+)$*$Dva{;Lm85frVnlA$5bFfl{fX3?1-P5
z;eH1M7DvFPy09}LID}~MNs7t*pfeF6%S*V=;;Q60T8F@2J5mcU0s@!&sKSq20EzfV
zd@~n;b#WMQ1LOg(SjH;u!5coHNE-jTx$FL`OZ-8Q3z1X{=XIl!L>6P}liQM3fP0yd
zao!nvXNZ!J4X$qYjedX@BN23JOB>aTdNsf|@{G8R8MWZo6gzX#R|k~xV$qkYw|@3I
zQjm0ZjFKJ-qX>AOr+fm3t8=ODnh0TmJ4&hCW1EnXpJm1Ve>I6wg1(HlV$(S!tyi}E
z%QF1AO#R4*&c{Z8#02=X`D+r}={er5p<{S>Gdp<3C2_hnxekw5-Rzij7l}#o@U1m!
z@odX(`JW4LQf--=ySVMJgh%b+&hqdxV<PjNy$N#$|EV`()u-__GJ9Lb{e)hhcG2#h
zGP@-|iC^2koh<NM4Yx_C$g!csOJ1Dw?k<21dxuZIN7fQM3gTrfaNWH+D|k5RXs4cc
zn67xhN`q4BjlH&b^i-y2^h#}r_d;o5&O5zPadD-|yIQ`HRu||qbU2hm3V7l20-1Ph
zr-d8=$u!sf0kuF1mr>mpZ$2_w#J^MnXS0kdv!)3OzKeJ=N*AI7u8v4H2Urmh#`jLY
z!$1jz1t|3R7)W*=elHN2kY=o1XG>#eynL8kA2c5VFqPp9Q0YT@0;CA@(|IN{1Z{#P
zSnS3XG(q-^2Ysdqt&nnk=K9@8PojHd;XG58YLO1?BQ9|*7MNT}32seuNu}vzL&S{x
z=PU|mjYHkAxc<aDAD_X`U8K<IB}{-ru_~l{HAE7=+qJGWEQopIKYN73RYTi*%=*5{
zq8IX#rw!cj(fw#RzmUQuBXqW}@BeCkVLC5E8)|@m5bxppBCZ9R5_^|GSJRo$JuEzH
zYoMnOxK1GqH<ezauXE`Qg3CX|zgpsBlpt+Bq76){HgPn?`I`%1Gv$GC->%6BfD?w4
z!Si__;^?MzfykF6zN>C8Pco{-)nvh?p=)NI5bxe->$1`vbc^2lv70|dghp<K_zPJ~
z>2!ZPo#i;$)|^I<EyfiwHSUhqM0o20=Rf^q?{j9c<+_DsD+Yi_h3nU(Uk1~&cVPu7
zev(>dDVgQXC^9b}MB;YFyS_14K2}h7_a+`Uk5z4unRTJHyLZJ{CEDEh{$O_2X@tdq
zZgRd1O2&}gn2~Je15DaG_QFfF9*N@Kb^0KbcHH8SPP*(ePWSP>KmdNJTm@vgP^cX;
z;i|2otqVht%td;CXeuYQGgz=bC5SwGkb~Ri&1gZJLwo;H{+L}U7W!%%fnGdsVz=ws
z2;1R{8~8OKTwcvf@5LoPg&JFMzYfABlK_}vh`#)tEkE_Q-9&gDB19Z)WKNj7Uf}&o
zMbgiJ5c{-j9MG@^`>;E*_ffjp&_6k1^P!3~e5aj3Vt@O964a=j;1cx%`+N#bjdXPS
zfHTTIXcG4f^9^@lw@F6qA`qL;Ya;2Us2aHr5v^BdUDmN2uPIMsk&^gQB9rjv_+@o}
zPf($@dz<uSHb|8Tm65aW1-skBJ5Z>mZmTtLT$^Q+4X7<%xYbjy`0~P0y4NsWC0F@p
zmnrvkoDtn3*GlQi?ziGIDhU7EV;v8K3$Kr6dLNj*OwDn$p|O$(iMgFmyE#!*Txxln
zW)FY*%bmwV>l@FN&qRiXqM4h<7_4P_&P10TPCn*ozmgzb%fO6#AsJ^CnXw#1)=_(~
z^Y;DOiO%E|^BG(z9<@{Ip?-b>>pj;?<+izK!Nr>gM|TL?Z&9vTKF0Sngqq&jB|B=H
zIli0i)i?{6cY@Xy+2c5-9kwe5qns%GD(61>_2QzgP*K}CgtFS>yA0QOlofPj(VWrv
z;>TWNzTYypf=SWZ^mMb@e&Om@HGczKv-s=z7mR*EfYvirJEF&L?Gg1G(W4LCTGI1P
zesaz{J#;%QgsV`(M^dYiQBYtE-9YnqGQ)HvcvyqVwc`yafEoEBk}IPgM-&d_jfi$4
zll#{4K$;aIR0z+?nQL83le{70A+*u5h?3%ns#rSrg^uN%;o&#azJe|_rOrV+9y7XO
z{Dd~*41^i|0QlMV{Q^5O4f`Y7>P#X^J=-MjK8*}I8uTSf_&gb7Q#HmO4v7hy5FfpY
z60ZoYyD~Lt)$f@IAPwxnl3J$tSGlUJBGp4lWVkRc<H>!gZ23ZoHZ$yg_pN{6&w`^g
z!5OzU(Op$#Y(q_?gLPZ2OM1o+8gb#dV@xlC%l6|814;P~>2DUQ19U!uN_n6T%Pbi%
zp(z87)@x6F>{S;uE@tdLz9uUf;$ios)xD=``Qu9&4DLoj>6EgSY=U+I(OSdo?fPwT
z8Ou-)tWUGGbMmz+jI1x!8*f}*%f}=M1|%wSyL-`6NjPBCR93OJ!!>yQoaLRc5*EkA
zc<&d=*q@FNlgLyz?)O_b7+&1<8@z2&s|crb3da<`m<Z*Fsr?R|w(m?|2YFv#9;<yI
zUl1T3UxaHKj8Y?U;?i(oM5Z0g>KqFTDwaIDN)E2YH0O^g-2O%EOHbei3{)BQy++{9
zp`X-UZ^=7Fr?Q@NVx&{O1%+=KTUz3cG-%=V-_`-lAr>tkQ+C;_HPyG^S*R!;8_c{s
z<VO!JBoc5!eOf?|I*;We<O~(&Y!dilxZwok;DGt|^A)Z|hyIq`!2SEVZ}cK66!Rr8
zxi8U(dg~D2-FrqAvDqq<#8{Z1h%2Z3+T_bU|L%aaYj~Z&G({p~=kz0nSd7JEQ11aK
znIS~J?uLGX=t*(6<S@ewal9HZe+#Rg`Bz^S+fu;%=qHr{qV`4oSNT|-7@EYJ2in7d
zMuQ8i+-waONgSgmjJ@KUS^7hYti=!((G>+F@4qNJzHTU^V4dwbFCyFSDX^^=_yl8_
zq|iGv*i}@Ep+11N!RMQr%H(31$*=iT$AR|_HuJxTO_gu=Sf!d0V($t+TWAsbn(2Sh
zBAfI~=BbW3f=pv!3@GGi%@N=QH{AB5Gcw?6Ve^?ub7zi0rJ}Kpt2m5fzHK#k?ounC
zR{Z>}FLylc8*%Jt#!@!?Z(D5LeSW`9;Kk2=mN8zVlntUEI*X5r`{^}|QA}5ByfE5@
z-li)nw%jNFa)N8GKH|kfzE-k1Jd4;<JGi&J22K1g7_FvJa4gtfsm@1|i<7_@lxOQ(
zmXA0L!Ait<@jl{T7J2jasPvF3YW~8l(&q1(TC#?@Us828WKs<A!hWV6m1)(rzlc#6
z-eQ72x?nw>UgH8P!RMN3>Dx|o<_47c_|w{m0ADW9NjK*e_On;jlk6(iGWzX!A*R-J
z!Bz`(2!)vvm)c0kyQ3YKzuH?GCpzHWrI4vG4mn*zAA-cpQ*b{eg^^BH!nZ;l!dx9S
zhODf{(l|ZNig|frQqCB9dTjKb?_lk;5^(;+TJ$-R8M5;H>RGOHSovq?M<j)nWlJs{
zNR(RDWl{Td29z!~wUQuVak^7%F7!IJ)CC9BVlNc_Gw7`;Q|Ke*G>!Wdxo>^#aG89}
zUBpLsv(7D9{ml6lrVa<fD*8P%{8Kfs;`zR5P1nj_tngH|WH|fG`h5cUr(&*C(!m^;
zT?f;}`oT&f-kD=adJ_T*LQ|RxAI)(1OJBq@Lo8NRX8S48wi;5eJoJCB{`LU+RZo-%
z6CKAerWt#{pv+(U?8jc$I<EVQc%x>fL?~Q#j)NxEK&y7W58RQEG~NN{f`VYwa!R(Q
zMXmU<(f|&Eb#*SyOM`eB_TM`rk>L>s|BZEAYPI4|CKq~^#uqcLsIS#&zWzD#7?3AB
z6fovlST>1YGiyml-NT!nUGUq!erWZbtCh_M#EuVONEUSPqpbNn4a-kuOuc1PY83z0
zyG@bD0}7nIsSy{d%%7DVMH@OQ|LC3+#%S(MyTxz4ebHAv(Jhy#vw&!_+1Ih%qSRz2
zLSL#;p6De${gs((@$6MK>Pkvz;-v|~O*aq;{HVelY)gU&tmXKvGwZ?@r;>GuQT|G%
zuPFX%Qmfza&2gS!x#<JLU=gbI<r>M&@Vy8cfVfH!COI$Hp1MLy2a~OteW2Wn0z2VK
z)ZTll1AeB3X0X5?Kp1jw-R0Hfg=yoBXWtFp!N<&Vue4p^wXDE#6*YPxwyM_KcFHA9
z-BvlL+}nY|>Ahsmw7}jj+G3&^Wsif-eU+<=UQaZe8pdaQ|D}B6i2EggBJgC}jz0{?
z_>0{K=?^b-SjqKR%<>Q~UXnJYH82YZtJleRCixlJLu`YMFaPZWa}mWpMOmO!dBU9D
znQz;mWlk(qHCwvzA?o__Q;gMMf2kfWPr?k>5@=y=9B2nIaGOQU`7+=o?{HDP;Ober
zZV++(vuRrHubL0xl>9vnu;nhpBxq;GU3lNL3#?lkwEJ6JAEN3s2(B!mK1;zps2eiL
zzmjv-_*9%$R+*@Q!;!W(o|*IgLCwzOx?;=vT*nWTK{Z3v;TMaUHjj&1lk3*R7KU6b
zh6`sh2oCO|#t>speZZ^p3(TD_l6`xne#M^aH146XjPRJRjo_8BU-p*~5s1v^!;N3@
zU-oefhN^;s9$A1-$L?f=8$;hU0`!etM7V97{rwcT*Sux=MMXzqFEf*0bMk&gX+i#V
zZ6$ZKUb)Tfod_b9$$_QxQ8!hsa{MLhw_3ui%MuF!Q4ebHhro@Le0lkSU`XN;6u05d
z-|U6Yo67wHEI{s#ATyz+omK?en=-q!n9AOT7iQ`olFA%VxG$A8uL<{CG|wFf;k69K
z*MH^?{^1F;G$zlPR#fpf@zaMEZSl4^Z&^5d+x$8NMDLSfyD`t0`nFz~zl6f4Wjw*f
zr=*VOfX}=zUPWAUP{u<M&i~19xU&DypZr`&8xCQ^^_?seoCzVR9;G3Zof9*O*yA_8
zat6i?1Zy2=7YTmMwa0hX_UdwP=|Yr;N~Z!RJ&nt_seS*46?%=gsIAt=Oj)m|6r}%p
zIbhPPng8A>MKzv&V>dXo4Tstc6MfHT=0X^ub<bP`)x}gpJA(s;zFI*GPl97R4F(JB
z`^z?m5A=z;rWiNpb;Pz^sPhmvDrs%(xn*UKM?;QC!i!A!A|=DtuO5!qZd7Jcwm*Fm
zPHx6A5c`o(-**r0YGLjpB!5Ud6FBR!es%jwj(R2jJfrDC!u$tpznmJAv>@WM8lGA|
zS6A_sb{`!};xTr`xA!rus4?qe4s$^vXI$sn8ehjr^J!6tZ1m85B=5$$w|$15fzPNh
zY%{IIn8Y}fDWivJ95zj)@<H1=%hdL}kmc*f<)VoQFWpdRWm`%m+k5VK7y7%ColxWq
znAu3I9=$3~!*m@#qa(Pn*b{QQleDIpC!bxfMs~lYn&+L8h~8B_njYubvWu7BtU!rT
zlo;o@v~X`$F!V#Pm7$Ly)BbyJ5~iK^X=$Y}e&^}D0YkDGS@<oHv_bYuSVz6F7-{e0
zX)yg2ACtysQ143F5wcm+B7y6V?O8d&PzG&~or}{2a?ac6<E|3e3!g>xrUbSk<3#OZ
z4FBson#A`FeZIQhwk$O<YKgtm@#LHl3QarLzC?nZzC!R-yW@jJX8$%dp2XBidV(`Y
zPr5Pj=q^-Fr8{ixVmHV9ZFy^OYaPZh(A3|Ix}-~=lgSaY=x#+4ZW9A3S?C-Z(f!Hu
zuaE62bwvs|_Cz^(X)f@sElx<bcf4;em_07=tT#@MPvKvERA|^pgh_lASYfZ#1AhEr
zfopK14q<z;JMYH13gGRa`d{xq^mL}{JOC|(jZiaT$u{)r146!=05Kh&nyzD+9+aUI
zd9C!Ud=_<!G-(_;Mw06rU)x8qCR#q-A8a|Yn>N`3C16P&hIYzsscH}2Scu_ANVHdP
zzt;ugvOW+4P4C>p^D2*sW1j20bIyvi<Q{FiQVe$cQli!D5r`%CQk`uy6Yt_hV@7D4
zQj%>Al*rUl7yfe2o)dc{xG`)i;E?%ze-UV34rE1IrBt~T`3^6fb2qV1Rl`pjQf)pp
z0z08cftK7=_D-*>3_mKKx6lXFqX!!<vC{j{YpACVg^C;-Q$yxE847ASb&WA}Yhgjb
z!so2s_Z=sa+wmm08&B7{456c;i5u<#eC079A5yUCiO}cD&U6Oc0H<m^@Hnj5FyZcr
zAF4>*d==*zTuU<%+{lsJ7(WuhT8{6?-O99INTzAdRL+&u%))v0N`Qua)n3CumR1l2
zCO-ISU+f*86W`)8MRlzdWVW|j1p)RWncLygim^JJrraZF9G#{ia?E->^XAbUedoK4
zz3%OX$SE;9a-yUAhZ-kVV#OLy&WS9^Z5gzywPe~Fb14&YK_litHaCQhLSj&YvIwB>
zwF-7Mt;y9SyLfwjUv0v|T)#o}9doLTdM(KbT$8NMP{B`p^&pXcsQ&;XP^jAEd7XP<
z=H(nc8CGI)o=!Z#M-K<GN<9$~jOeaj38FbUqHo5rOl1<zr?n@+UglC8;xcX!@_zL0
zqN$#G`aMM9{hRT|`-d7Rn|X>U8|s;9Dte^z&9AVRjh`z`vy>s?QHYFii{!rF?N)V~
zz`53?3bWZ*{&O>8`r8{|;pDi88I7^p7AuQHwvP#8mp`x^B!z>1bNEE+Ar<x0@k|G~
z4;F0inOUi9nq*4v(qLp&!w5TkO`iobPe`Se`*x5MiFIGQT+BWRqL!jlt>wr{#WdXB
zEUmgY)VzDTjV*B>(~8IX1KMo7{my)23O}ppH&v@EU9XZQTx&DX&%`^ujr+azrr=eI
zAy)jBMpp7FW<_7Ku~RBiIDtb*$Qx2bDaHyNv6@7W@v7R$y1!_-j`P<@RAgv{o1C`J
z-$(sW8ctSnw=$B?W#e|+bkK-lpPVAD0anZOQ<;Xob;Km=wfBZTtcX0KWIcNuyUp_-
z6;avkwA+b891pOHrGgt5Cw?$7?kEi?Gr_l@$K;8|B~FjJgYB!ZBYV4z=4A(2Z$Z4a
z?YMNk-}T!<TRFlL0giZM<E7fFH_J5Q>gjD?cDZOu*|nPCRzJfO{ux44ZnEeGBkM~W
zrJFnIebI*;j~+k&{v}pjoGlcW){LF(a6{5CUfzP{!OC+xj6hGFngFS4Z;u@hV{u>|
zlJ#U%SK3jMR`<J=aiUxv>Olhy{lE_AAfdyAG|AMhw1?zU)Ug()d*okl*DOw~n9sRP
z**v5aw56VuO+XX$7Kaq_XQzT;Xb(8`kBuJsUSrSV95?z@J5;`_<pM-tlE%ArzL+|E
zNa5_hKjQKiO0K9tt32LT&zmvDLeJqn{C){LyzT003=RdPYrQnJX~h%BEQMiP5KZNk
za$r2m&rG_6E!kb%eo{I-Z_k$Zi)sQTgAv0lW~Y25tn|o2&jWAf*X_!=Ir;JTPwvjO
zy)0#nsGzQ>2o%KB75MBl9A5fmP14^JkLv;iXNAiVSC!7)-Eerk%qMV6a8#3;tTfya
z#5x#qc|X2Q;)yI)v`|C4XcPC1Y-26I!8FlN2KZ*LI;#fcK3j`d<mK4GlUxryF*Jn7
zXPF|YWgp3%S4#Hwis#7fuG21w)A{L#Qf|sU172oeSvqr+V};i;Xb8?DTM8a!(zz+p
zBkkw+K;1rhG9!;l9B)IP;iA`51v?aeh0j~ih3P3*_Ke{r%q4V|B_w`1=Q=@Y>?`=d
zIPh66hqKddN2VNQfheNyjEgy1P&!{jz5UbFKV2@S3_-XX8xEy*{jBcA1YQi_eo|J@
zI@sETdRxcrJVx-Pgv-})XhMhxdQmU<h88y>o`>2_#0_I)3}xS!SKhp3Ipk`=tX+7&
z6sjG5*~^!Xhd!R9H5x4EII=@}$7(O(v)=2;;bP@k5U=5F5w@)eMq|b?;BuvB?!lB@
z&=M}c)MGAk=@6yljDj0)q&F*pCda}59ZiFzT?`keP`?F^o8NsSoD5E)#Y6L{WJjaa
zkkHuB2Cp*7gzI4OGg^|sonCuAHmA(9JQ25zXQgP)IfrMl>2^jC65fqSZ!b;o8`BoB
z)D&#pjr#H<XBwQ-{zn&v*k;+;2pun^6%4YcWv`2c9a*(gDh+Y4tkMkB=8(DOvdo$E
z!ldW32JSi;^#Udi-oq$?>2!a4-pPY4P&?8*OvbK%^2yVfj@rxm$n?|6n{}}xh!o+6
z!^`uFC$rTxNEfq}_oyt`ZMInkq52@JbP?Xv<#YDr^sle`7H`^jT_*RW4R^XH!~qZ3
zqkJ^@6@@(=)f4N}j{}gm=wU@gy<n<JcaxyQ@hgr?L7THk>cb2fOX{DY*khr$xgF33
zS*_T@9--LVirHs8dkvFq8$!i3ftG0;&8iH`ygO{jq&A&(46n%`Ygx+T$9nQzyGASV
zxgzq9c%|RKr7E)VDn^$7UhOyz3R+@tl&LA;=HCwKVuXbA3g1!o@GkfHVx`+0PIPmX
zO<ERHEEn)#8TfF@pCw+1y%PWHH=B(3Bj2pqY3Gq+>Akc!uWsO!w)HzCC@Ft1;#!^6
zKU-6lD2SnX{`2*VW)`2IA}LhaIUg*$tdNdk)@b}^v|fZih;tbtgj(I}!qBjq3d-pt
zdM>Cuarssg>q%XXI`)z4<Hcu;sb$is3m5{ET)XdqTQaYDDXaDVaY!3Mt;%Mc)-;uh
z@?;R^R~V9F^>_3t@eG9>qu`XZ+PqCIHLCZx)t8H@mKN>NhbMa#K3u*cw5VRON`)sJ
z&aHd~(Xa0(WZh`7mCSZS6$^mvNpTQRdn3B3uR6bJe@I14k4eG}{Wdu48+&dpr4c<R
zo*FMz9_ip~Lt(v@*3!+Nt3NV6iSs7zel(H6s29q7;+aFwHrB0g4Q29Zu{LBh!k54P
zXk5%ek2NLNCfm?QeP}<*ZyMpS*OQ+CI#<i*px3Gr$Xbj1iyUQ66ymTc#d^1Y;fhC(
z2+DR`^$Bh;z8g!oV8IzsMC`9D%%=q(MVsuejdFZzR@b@1JR<?Jcfar-F=6qB+wOq`
z-^UBH4A-$rafR@DWiKyh#+FPWbiV<TwGx~0tzNF+TRmm=Bp0XX*w!c#1s|usC8SuH
zY@0+|=as@pEASAGBYhztn(%PQ_4MF1t_n@M`C^ED`KK!z0!;d!dU9#1T%NCQv#itq
zv{{<9JUh8#wU~PvB3X{vUVFH6)O<FjQ6dmmXPhA8HB_~G-KJx=T}_)bKI{M~=BBp^
zQ&91DW5j%gFZ$U`No5=c)NV0ZSaG#3Fdp2}rbzfOZ7Y$)Wan-6I?QK+ruB13+rh?t
zQt5N;4Zpy2_S-b(rp!vqA;L!qa;iOS(tMftxZ*vtzUSq>)g;#n{KD=?Ny?Y%cE&Ge
zjjXoUVz9hgn;3C+EADu<Wr0=O0nwbwC8=%z{bfX3=m~t<h3DhUP3cSMnO!&+P%rxu
z1Vp_P<9tqj_h;%FlC$-Y?s>cuj~yXa@9Qg^!48R`H+h5#tALk5#oShlietjLJl;y4
zfEk8W>V0OoMd6Bje2-AHwxsiyRnoMq_mu_=VktFMX2K)8&!CgAeE~^PB@beH$KJNd
zqjuC7JCEw>FS%W*$)^d4F&r!0KX2gNNy}wes*IWJwq@vUm96!p<8$vpt<`*j_|PY>
zR%eu`;Y3c2`c7ImZZ-OE^&ua+PCvJaaeNnTSs4*_`ud@JIctc8G#%*hw8`*Nc}V|V
zcDZ2XIByn?p=qRQB%Wbh@O=;MQ|hM&=kuF{)3)wpnDF^skSoDOZ6j+(l~_bFm;=S_
zO5gO8sKqQ#^5@_4X0ohA?YcDWtZ(o#qv6wJIgBPe@mikL{2*{?75_BL+0NOJ@fr+0
zDXh?3u&vfsjD}`dTaJa-<`QR1LGzvhEOLdq!?NRcX46S$O6mj{%+44Rnj0!mnNU0N
zFdI?Wo^MffunT^!u9Ylj<Dtod@w7mlP@UE~_C2{lt@x+VZ*HaRbSR_Jw93=kn-(>#
zoll*e$I3V7cG%fua41Th_mLqRHO$8J-%^OWlq8JOS>FxDxn2Lzn<hF+%tx}uMifme
zecQ?K?rn*80DM$7&Ehbzb2GkWbe{KnfBFG&{mgMQPhv#Lr~zPIK>)Mj9vEqG*>_Zm
zcj?C3R}JF^>$bZ&jEjeIFgQ0qYm^*1q#tQ{dX5g?CiioT9oAe-->{Zz-2k^D81GMW
zoR~&8=MVV!j(8KYqMs*wNr3ac`bnuL&&0RSw^|LovL-ueoMchB9uhR=5TXaDq91>I
zxJz(f@LA9kp80$9gb4yY+>Z?o?CYY2y}%Dkk<)E=eSw5R8guGZ@E~FOI5Fw8rQ3Nn
zIiB~Q0kqs}A{`qAa|wg7)rOidBR{gzmix=lvf&zySEzT}tB$J}BvT&<4VGz8T$D8(
zvEjzk?)2fpzBZB9#B+tyc;%kyAWoc$QyU6T7Hj9=*4x3S!@)Tl738ePUSOk@UP<oT
zJBZjnTcsHHtcY$M4+8|Tit3jlZoPKQ-KwN^cFSXSMEq|i!$Xu(SMsxy#xX;t4U+NG
zntXTl$_!F1{wQhozQE3^q_#WqJb^Jr4~c?r31E#|W1`!XX{%2;*9#Cef!goG$aqMs
zkjWQ?anU%d#tt%SQmr{+FZ_o_%wIRCTzEQWbi#<W=GEF-coP^Lqc~E+ZWvh1cGw5I
z<T5@S#xhkKx}f$Zu&FG`oW?==T*vt&lbz|Z^Auz^GOW2eYDw+Iwm&yybHosboYY(9
zZw$$6J5;l?!E6V(fr}!vcS3)4=c*zhYe9=*&UR5hpsViAt?Rb3Q#KbTzexRA5NtyJ
zkfqpuU=^ose?rWb?>!E#F!yMV#+XMR*bjN;?<iV9|KXEKSp?aN*<CvxI#S*)y81Tg
zzve?@jXZHT&LhSNxMq(6KGrbS_AuKBo;dX};gI&}B8f^hBJF~hr`hdBr9A|1D`yGf
zE1r4d@Y>2^y`7knqFt8Y;A)9YO((*CxaOx6C_d_u$iyENKpuB<Zn4Keg=%Ro)-Ffq
zCTdaCQ>%)vt_(G0d|*V&JxUx~Ut{u3KH~PqrEJ7p4;bOr^MOvXHz|b(zvgeY1}*^_
z_S{OkZRO^bM9v>lbv$ZOwPcoA>~caa0V7pUzuXP3B(P<FlIftZ`Tx=N=HXPgUHdrO
zmZ{jLB=bCmka?cREutu7Oft01V`UpMl?Wws#z-QOF*6}U$dt5^j2S93`K?{|^Ssad
zyua^p?86^!x<5YGb**cybDirvt&6MS_%}qKPmK_6bW7*XYrr$c(Y-zv0Uc&-KieI~
zPUeXzjRhPFdo^}bd*jVoC->=5cIDtSFEkY;qd52T0L7rKU#?TLgLm3DWBUWPWb+x1
zhu@GPo}(2OskMv`^$~aO@J!Sd5j|0VUTh;yFSnDT;wUuazJGeJ{K>R>*Ez0>-H~CX
z^w=ACEsM(gDHHz^P})GQ81kKzpx#E6ZdTs-a=qXt^fSyce4A#2+eFN^?v01Oe}o^m
zN5%XSL9JUlkCuulzk2Aq$UMWKcV~u!FIkvO2wjzJ^B%!6@^*N%9Zvg3vCWtZn=VqR
zeyFpi?1vICv!tb~DOrFlH^q@<gHy4;sp13jSuTq2<^q2KPcgt*e(YTgxn<fe%a03L
zoktyxk1#dM7{Lrb_o+MM8m-N;BIYFQpHD@4uV88@yhzV<n~hmVt}lO8s+OrP6#W+$
zK;e_~U~++wd-yA5KQ}Srqu*MGZs<GGR(RSwC1O)}Wh0<&<fS(BWXN(USW4IYHO88U
z^-4GQ^1kaVr>~`Yy`pqmW8iLRyc)MV4>BZcFOv5yNLJ+DnF4iPGzH(_gZWCSS^KR{
zl%tpb<O7?N$B&#=3KP*=&|`gHt%T0jJgh#N7JN7DEsvs>@T(ds%AdnMc4~~M(S7KB
zvD)4pLQYY;;_rsB3cKHnG5A*Jr!|U#Ac|#LPQEXM-9AmH3;U{tNosx!`W&gV7Y!6n
ze7Qw&Zpx8iXTjU!>0Oh}C=gh|Y0$@tifp=_$7u`l3)9Hj&e0@bCdi(W7A8M5IG&BQ
ziHe}S65PZc+m&@ECYFqnLeqC@n_GigWPPrm^L@3)Z2g6RU(bOXZ?WWEr?jV$m1wRh
z(bY1AdU}G858aV0ZXWg1Y&{(rOS8|Os+Rv592Vy7E~ffcgmkw_JByusEz19v+`30s
z`Q$h$Qiv4&u;BR5W!q_nWv!!{Eu+p2;qtO2yVf3BisyLY*$MiWlQ7z;xU4f7lcMub
zy!ZB^4NZ@OR2ELD+GLJ%CZ<-@cl|AqsilZ@N5t*6<evY(6i=mrmeecy*_`BA*O;;o
z)KxoivuAQxq@r8!rC+8OB?*JfJ=&A?iJQx#{k!EoilZX$u2HNimwKDmzx3ezjLu4O
z_!(VE8C1cWk~-nu3H`T>ax#q|OlEm&^6Azo(rDV9^|uGm(sp$17~h^AAWOU0ym@+b
znop{6MUu2sUAUSQX*8C|n{n>gl{Okz5;v9bVE}jPf8VQaldj^wx9v6YuHysn8|#zC
zf5ke{tCEJk;KwHPd|-c{{P+u%RJtXJ=Yqr?CKONkZ~A-LY3|l4OY1X|lsDuIts|+T
zYAutI+$+0{wx3{TO3|;zrj0Dci_ts0fK)NGIhfcl_-ZwaG>L)d7d*rJ$`G1ng<<@s
z?nti3LC?9dH`761{DygZpXuKV2Ax=G((rLwPNKVPD~=d~Day9VsLI1AIrBctwr{;Y
z&9&oiLR808ojSr~8ICyG>|Ta#%v}Ap_}(yam1Rs#gyCwYJa{qq)`foggja8AN~xiA
z_WQmc*^3Pj2_2Vzcu&;pY|GT0<8|5(<;-}c5yp2aij04X+QTsJ_whk9sFVdrZG`S4
zf)92!7%$Z@*QaP?PqaUJAnJbgL%O8fqtC)i>JH)WLPW2h)VJUkOqzJKC+iro&~lOo
zM;_Kgf#BkmBsE%~I^sa)uo!QB5ot4cSN#PvdNOyA_AyWOER)LZO*(T<9^avArb}e;
zY%HKJZ~Hu^lyT}J>j&DC#m4RmQ8ni24&qU<wDynnG31FuPojcDLxoP#>H5u)XA&|$
zJ0BMrZEjAN3A$0ETD+xO$j6fA_mqoo@7Yj{crM1?InjSIPS^NRvh$KtLMm<ATd#!W
z;bZ;BbA+t(JXvBT-S9I_=kPO46H&#=@b|)JQ!m7jP+sxD9l5h2J_cvRW)EY+XpON=
zVcT~Nr9Tf}5mku~vkCVr#1E+Ywi1rgqxm8=2k#R|abnC)y!Csi_f-F}h=u#}NeaL5
zG9v!lS7f-uDLlpe*%W}nRgt<6Dr*6JWnoOrucNnh<Yf~2o6~H{ut*8z8SsCKu~p}P
zZ*|vC&g3rMXmD03Pu%7L?<tW;f=jj6rOL)gxnwr_>Ku~1gtt{OiAI6sjTs{4W)CG@
zoNrTk<a2%x7DJVL2r8c{x&MLnF|B*NeB?LR4~%wPnTjmgx)S0?m!y`~+>;3S-_$J%
zLXW|YI%<NOQqG%x`FFCps29aI!)BXXL7<^0a@~vCjHF$_G|p=u2IH7ky<l=KATl*K
zbeCjP>{3;Pu4U&o+n2|>>H9JoJ<NiqRJV662{8O})qa{msJ1ZyH8<+U{`6ZXDLW?w
zteq)ltWHtea<Vt}=B22qxRR{V{`&9$u;eNIjb#0Si(7RvQEc=uI=B`7#kkp;W$~Ju
zt->h#vw7L_ZSoYNx{d%KDl&?szmH7`;`3M~t4i1v{%I%Q0ObD~g)<Q~yy%4s{k-Bd
zJ9OU|FP!-U@Ss9)zO(a|n(g;eEjSRmq!^k6O<nnO^rJa#*Ivb5??87Y=dA|!th<tY
zpV>W1S9wkB*1W%N7y2$%v-{kLdFwg=yT=lBMje#=&rm$B*7WEFUzPWu(rb(<XNX-+
z$1DFbc(t_VJ^tcz@<y0z3cG~BtOns>foN1R`5Drlq7NYSR~)Z`;fjwCXi+|E+Ak_&
zCX;mI)t@3)C4*L{F2VA7)mP@bQJ=)#%^6Q0hKDJ8IKM(QaC?m9pQs<&P-?|eOx?`%
zpCK#sLh%)id&dCrHQHhSmrR;Pv!vzZV<I7$S|t!gXXIZ=#Qlz}zdCWaH59w=n1fWR
zW+_+goX!-(8VyvQ8+!)qbe?6a2kNrS@nm<Gcd2UotI?l6^%J_;OOPf|A2AtIaFziR
zGl)qEc;KPVQ}s35jUE#o`Mp1uy=9r$->HV1J!7A%UG`S)SMH~eGnG-EKE-lU%I}h%
z<kShE@^z}6t6N>t<*F~a^i<`uBHa?^BP!|>UFpXf(}#qd-@oB!tYUsx#ZuTl4OP#>
zvl1`*R1G|CePg`bALW5r`#Gc+Av+eol)R>S6z^=VtaB&k`gmq!+0A4Z_Ufv6`jVe-
zK<+hH?+PxA;26hoMkZGVr6aYgIQ2T}5{;VqrWn~mmMRza7Ta{Pp<_X;M8}S=Pm1S;
z21t9QfFd*hD-fFRA$}j`kH#A|`82p#>vPE8dZe#_mYwKeUIEtnFUqeKZ(0=mVIFWw
zHTc_59)(wQ99}MIULxaBZuh*Fiiqp@q#p13hsBvm*2`8rh)OLS1b}sBBbWzmxV)UY
zcfkQ6+*i)O56lpz;fZ>0JR#m9agI}(%XpK2K$bBkr(qyrnOotpFXr^gZ=?@ob{Tx_
z9(J8w3gB%LW|a^T&K87fHAWv;Rx>}$zW<FyphJOhZda@nnPE9=r|~FOS82BCE>+qo
zupf$hahHgPd4%*BlV}ZVa5gKN&FiwFh>Y&dx=nUd+pzHa{NakOD(sTw?TW*_TB-UW
zt@0Te4=0o`^t|iSZ^8a(vrRYs-j?)e=q(N((<D-2lFR8)2bhe7Yl#WH9X3d50$sia
zV6Jv>7;MTKU=%s?N#e9rw>eBw#z_WIHxf!YUr>0N=e9^ad+6ITw;CS%IwSM_NiUuk
zSLc2|2oX}u5&c$9tg<P5YCwVuk*o#<hPwRnq2I^k7#fl-pw!pev@Xz?m)*Xn)573h
zHo~^u(5--c{Me@2l4_Tk1LZPgj3wwm(_g+$!nSH8cj;Spa~ZLc43)`US5n&+7Nimw
zDpdG2$#!=QG*b1Qs}k$*T_^Lpb#JC)ia;c6rALGKCuMs0%r`f1sNH^LS-p23#r#3)
z))p0IA_uat)#kE9BTLkGU6yZ=`fcq_!;wTuE=tQ+nRu!=U0MN<yj1pu+AJwE#0C9r
z)Z~Gh3t8}>vU)853!s{z0tQqz95wNx+Ig>9E@obcnj=lxd-h2-x~m7l$?Cc|Yu(ZP
z;o|2D$R%vZne}PMy9|CgQw$|bwg(@z{}hQZF=}&DZNm!PZTs8j*4^qt*3fppK@RN<
zlepO?-a)u!zGg(BQ*%IvS2e0xK0ZTEVD)z}y8FzaAzw6J?V2{dI@DC(^<wx2Za6?}
zQ>}~=E%}K_aZ76iRIz6bIIhKZ$waeET5{%}?$cBqy;ygMpG%mZSc(*C$M*2>_}?7N
zqTgpZWlY7CS&i@tWSZxcpE0TDVRULdzBW=l;YlE3*hGadp&NdRXZlGMhC;Br%Fp6^
z`W=U3>!7D1=$W5bvk9aUBDikGfsm!q;avH!ws6$s)9VK+vts?{@=;h60{eiaDvIAA
zYWgaJff0R4tfVoieK3j3k;F*vEaj)#wlWX9Ic|fMngV?>M~dK-AhDOP_NnXH7*C`e
z)tDsKOw8#~xX3$jl4_sI<W6}hCK{90FV6Zt5%r0i%rXRHA4C34Au4N!VdJ9t*w4#I
zV;~=Q@yAt@L>4|1M&y~PPqIpV*{TraUd-yId#sW)P&h)>itK`Z@8rBe<=?ruVOc`7
zfi-ArTZ|OymX}@EW3^8A&{@p?!&AbGuLhp{7laf@HkNZ2SEL?9l+7AZm%cKG6id@^
z4p&Ow`3^lgJg4_P&;-(#2tC~Vr>DYF5PYm?9zHTDG=@_zbv0IQTm|nX_Cztr_u&=E
zBn}p$!lGOgbV6V1g_+yz1ga`?B$Tg7uY{(<=%JfV2I?L@IV@w;oFGdN(8;X?7*@OL
zv7D|kHQlSR!k!vsO4(-d>N(7-PY=qHn2T)M>8?%6j6dKf-?%TdGR(O<`K6MO-i$2`
zt&m&wQ)T`1WSs6awL@1?(5H5REZ@<el{CA<KNp^_CF`*lmGSZCln%d5Dz+k3-kqup
zd#$m1%?cMpV;sZsz8}MLoR#JlxRlg;8`xrZc;1tB%qhL|WBC$bJYCzDO!Kjn$aVl&
zGz+pvHQ~L`baL-t?lUV?Z0vWcr*@b|G13`dVEfsCcWlVLamBSaQf3BoU0M)h#F<F2
z_^BndhAc;L@*we8kz1mS3vZF=g4E(;KM!A7)$vUU>9^2~<+|DPbiVp^o$ka7@dyr_
z9JfmE=8siN8tj8X61s1;X87wKJ|Fc7C<Zyy@1C-@ZsG0VAC%4L>M}$s9xQiDUumEl
z1|d1D8lF!SFKPBy4=;o?DNZG&y0cQpLX;xO_80Zr`n{#+nb7ryA)Itsuliq<NX~nB
zD(a^^JEEvoYW%g;N3=+@gIUA(p`It6Ae;0=tyPC^OAlU6xbYlD%ZolGj1WG$t^D)7
zM5Y+Opw5-1Yb16|Q?lbt6|x@ub0B#6t5{VMRO|D`?q}t(gma|vMm-)2P7`<38~HZ2
zKHbVR$XZ7B?(n-xN;pm^E6Lfz?KVha)Q6o@W!*XlO}JlsiNlOB%N|U)b*h<X_!(ob
z_PLBKZx<O74dNEA$p}os<r_u$GqpCDunCYsX6i_rmvW~Stump%&W_shOGUg*D^~v2
z!uw}-{H>SO`gY%uqW9tPNR9R*l#2QCo|0-FHe0s)iRe8JpB9#FwM$|Q<)5u)+UkZG
zKR$v9(Q`dy57s_C!{je`MyH1Pu;-|9tK+3DrkI;4+i<Fn8JX=-lEk^^5p3!AbuMiJ
zS(Wh=GK?<wy=K_79);ZP1OEy@PvniE9=VxWhfa~tyYnH0kVv60o!cuBcOSDS3zR5{
z)m}?JhD+)uPtV>wepgXhJ?S=hI%q2~fLQwjyol&Z7OBaSXS|BB1=3QjKM>^ur`n?b
z>1D1zGG6CNkpAOM<h0NItS_QA%ES!2h5l%*bMpXRVq$_fO8T>BlGVX=Z1!?WiJym2
zk#;Z7x`&(2A@B*U^@yc7@lGJJ#2i&!(mrSHcqcD~I+wMaO81)zz|Wya8&Qjzmo>%f
zk=jy4>RGBbq6p93oSGDdd4A&d{-Cw`fS<>iWbU6n=Q+|l?|QeKa10D12KHym)GwbM
zJfAmy9I}MQdQ+$L)%jYyTiPtED~dxa=T;uE3rO7Z-Ss546Jfq51!&uVFXv1ExDLIC
z&h(<2@VJ|1{1(tQf=X2ZYH67EmPN2<3QWd=6P}~gl)t^v7;Umi%SVFMO&sWH^kv9L
z#`&p|RQCx|(C#8NiB&IE$+hOL#;ANC3x{S!kW-=jpk=orKtnv$Ew~oi`2mKK<N%7k
z=-|j&1fc|4%$Hr*H`?*QvsBi+j1noBXGN}+-byYK8kG@-oi!X-6x(6^(nFdR)~wL>
zEi5X0?XwMThqnGwdQ;@CQPLGkA1}5x{<rI7B%RqrxjnK`yAg`pb+}Lpg^e<n_gBEt
zt9oFUY7Xz!;RS5?R49!U3%~e$t9?Hkij|4k<!fR4HtV5!2bG)9B6R9~{&!hd%NVn7
z&M*PtH*$vBdA#2n#N>NHEtYOn;g<C=gspSkja_+2p~Mxg*XIAzPx>n#^A(?21(PiW
z{)5x<D-s5I+<|%G%=H!}x4!=IXC_H;A^NzmPX;OM1*gt%`Lw}S$l_6&wvN)1sqFn0
zM;S?XSBVJ=+OwO?J~|B4bc1E%ZP5qMni8VCSzNpE;@sBV6dI~|KX7fqrec}2d}v_<
zZBzX&vTCV^t{$pqZVDzdSA$qyhkqRBRxp{^fh!36{+e70efp(&y|d%o{DwZ)bv!%M
zu91lPn>84$9OHXj7DaQZEnj0Mh=!v?h{weI#)aFgeAQK&XN^M*khOWO<nUL^C(q+&
z+wzOakIm!)ceZj`+XFaNid8+vWlBn|`7c2p?S%3*O10vP$~t`VfV1B7#k?Eyb$>n2
z{{AX{v`}<k#k`43RUDCFN#v1~@xlg7H>6}XcA70u@o|2$RsA5}KL@hSE+Cjsy2jGQ
zbIM7{DK^K-x#u7#BH!Qcw)Tz78rSj4r3)f`x~Y-#dq}#Y+Ca~r0(DK=S6{W?$@U@n
z=e^^vR_p{Gx%{i*fj^4pKOrAR1uZqB>rJ%BE5{=~QN$kPAD5y8>J+;57)hVJ>orWy
zlv2KBx75ew1{qs)H_fN-g(wAJQBeH^oQFO~@T0L7Ob}L0?DNm!7Dmpgv-dq|u6!!=
z)casPu`j{Y&=$Mg2Ne1b?A}@mKS3)xs7T!7dGju}ASS9=fLGX(Qo+~atq^qyQd3E0
zHt}xE>kQwv{DqtYEhe`l4(4Vn6!kqJysLv3UYQ{Hs~(FlRLOyeCpPSNRd;5!hqoiv
zp3y$~P0PprE#l8DsEFC6D2VgfY@fTK&Z8s*U6cXeM)E0Arwka}7cTte)*NUlFsO5f
z=@I2|Ra9xXj6AO{aYy|)I&oRA`R)a-8Og^s@mn*K!v%&+Mp5^f?8Hz9_zC5WMjJt6
zDTHmyi_72%p@E}42;ID68+s&o<T#$`_1XNN6cwMgWb5Jeh4P6Cul<wl@L=<1Sf%pl
z<~VRPJ9tkAyuzHqi9su&Py$NjytSx_9lcNIp%oC9O^b8xGP|}blVJ4l#C-S4lgm7a
zIX=|fMC;oea2k9-J!6*s09<cbEXZ=G2l*oMU%VF8x~F!!2;#cvE9*<eyraZ20y8;7
z<@Va(=E5uiQ*wW#JaS0S>YSx#$g_Xdu8UV72fIvkM}GAj&YSkQMkCDXn~$79jk8yg
zb>K|~z@RkmE?dN{OE`ltd+rwSLZ-UqyP<JCqQYJbKS)zzTKK%Q_J&sR`Hm<$J0zuw
z45215<->K3GYp`6t(gA;`h4}<uIp@Gr!$kENxl^EYg41hv^uWK(K#6_tLPQ<+c!L4
zor{>Es5%P9>z{!9vFNNHa#|UE-`1Xf`(W|sAc~ZBcX-hDG{_+S6kZhhdvpBr_568+
zG3OqC>~yG<TJpcu8{25gq_b99Z<=PWt(xD{6osw+f%?+-6EZLEz<E!B2Nv|@k}OGd
z#g(6}hRtwFilOqXY2|~kifLoctjqFet*>wuPML}mKKAF^*>|{Ot<Ur(|1?9o3udsn
zycDH+x(II2ep<~oV!PPv>!co}?q_PW245Ffzhr-Pq622Orf^MRbv;mHEs^=c)m(Vb
z^B@@1#(QsY7PX;@kwlar6m-4yLCae{m>Z&p(biA?bCzC2gBlU}s_J=R?|4t)twY9P
zFj@TeU8b*iyOni`Ho2%&(sXQ&NmxD$Rrp14M##VqcGpYZZ2n~<Lc+GB?QoI5P%qrZ
zb|$0xZT#uue$Q;!hnIjL($T;2%f(}J>E{ppyBtWpoRaou-?lo=pL|IiH5VZ1bLl^W
zLm)4RHHXU1SL&Mg8{ORl_a}Ua&_!~u7Ug}2a*tiklj8H%oy$fMl49&K68G~ygC4VO
z?!#WCLF+Q9w=h^vZg(FL=F?9<d2SSKG+){#*K69WbBOh`oMEkD|0Z=DFNlp-_QD74
ziJPDksV<kt^T?dD^x1aJnRVICVIYEo4oI}Eo2{W`1P~OZt6y`InW$O8XrG1Vn@Lao
zu!>$tujWhc_FoP9ey)(z30U(MIR&5(H-%?xpYY*WuUzIO1}Z-hQ}>v24+b@p`s|k}
z39ElLXnkm(USiK5gNd8;4<v&x5`KStfJ=R<&z%%*KPVn$k9qI>At!=k?jmS!SBT8*
z*1>X9aNfQacWm_aS>Vr!@|3c`TtuA(`1gpnImsO0e&JQVZ3S4J11gs<fGE|$ywMpT
z@BW>(_!Dr2@k5(W|M>>Co7X1mbdr~m<AYH!R!B<D#J(m@SGgD!Vi@xJL>~Ec4PHn>
zk&r|{PzKwF7rnlh{gDG(?XBh$rD1gvYl@t)40>iTlWD3$%fJezjx}(lzlTft@y_YV
z#ai#h8M2_uZ`?C><j5xwSuj@o1iyv6lBNl+&?HdVU7m0&9E`a^idF_icjw|N6Ms;S
zIF?{!J|@r!Bl?NQd5^LhBnTUCEPXl}Jui8k^}4@~fEPKVV8czp6X6=+IdtNMsu5F<
z^-MUT-<6zy=+GU1hcQ><?sd*&UV2^U^GD0p{dIRh2EF7~pVB<wFs!R<K0X1NU&ez@
zWL5T!FQ@yw+^48gzk6JumZjxtUhFFhYrKqN4K%!ya%l#`)T+ai%!?3<9k}Dy+acE%
zgv{HZnpoiN%1km0-FCyUN^B`se*iCqaF+Isw2BPzHf!F_V7t~uM!=p>G6Hj_G1%e@
zg>lb!!)nDM+2dbXCK+rlQyNyE?9;!uj>32jkXO8mL}abMSFTDz>V*8yCx-MquC+@A
z!Tsmc%xiPRLCxp5(ND9RJ!kt{_3YT!3YUyU(+%iO09#$8t5ju56TywrN1!>`YIoW^
z<;sQ_QDSaQOD;T#5}cE&r>3Y?mUyGI*lo<~d|#AYuGEu#sgmviDwYD;a-{sY^Ygs)
zk1zCiWBXyNIxY-6UBWY>p1)4naU##dE`5Cdn1&(W^NmL_lXF#k>PNSF=$?538c<lz
zOeqH2{cY_rHFna_6+%_9aE^<o#XnDXx?M3C#eM>&bg6Y9jFp)Mo()==YPfQXY8NjD
zn6)Ws?XT3lJCApOkZUlj-D)MIUNvwTDM`5Tv6xY`bAKNajh^cU30m|yWywO9x<TN}
z0m4@@$n046o_PP+l+Ya$CumjcUHk31q%&0g#-qSlIwR0~Wdh-^+Ds-B9o(`Mx)$5i
znJJB8quaZNWNRYM8g(Tgi~sjW<sH0d+E;}3Ub=r?zMW>1ckN{6VB}1u4VonVVO7?t
z?IlSMv7nxd2?S(cV?&r29tMBx_0(l|MS6Ftzy9(XJt*x|^N9SD4qw$JhMklS+7>#Q
z#uDp>%Yqk6_~EfHBDkVtJjwEG6v1nG@Fd|VQ{I3aC_@Zc$5H^Lo=Ua*QuS_%hVU3$
zBx$}zG)<=bA;-s)-J(~jO6;$I6S;n~4@y#ocyAq&VzSBb$!h+m%)|Qt$$*qMsoQ$v
z84g-&66GYP8YU%BLDVz1C$CnU_Q@0Sih-BW1&6UPTn}zsog4GOs?CG%b>}a@hGmEv
zLFrVQRa`SHvZ|}1QMFZaq(RX%f2N(eQ%!TWMVmeO9m(Zn)3y8An~xA#+4o65ssFPV
z|L#Z;cyF@^8c_~H8g}?#5vT#_;&pY7c17Y-QNqQ;Lqd3^ydO~aIU3X4grAC%CzN*Q
z_Hb(Zok-BGxRF6r+eEVX{Md=pUzt#OjZ09>ncPC<!eEz~oZo6mdd#VDY8AmDh|pHi
znSy4;N$gSBWlE$B<O-g`%taOPfiG0Uj_E}aT`FvkWe|$w$P?42cV~Y5YMFj520_SI
zsQK$V$Ow|&f5*25nn6SpuViz=jOeBiD@6af{2XK{Bg3^`*El|P*8bfAI_#y$m45pS
zRr51Vv*M4we|iHtTi>yC^g2aNfBsSfvIR~yB_eU2cgwj3P28(z-AV^*_);rW7<OPp
z-g0kyZOz4LS_bqL*7zT@%haXAgzexnV=mpwN86QH>QdMFz6?;Bd-D!@whC)=U}hr(
z)Ry&n%af;A5JehFwjPl}QB9rPd(nNbZH)diCg`vqAy<?=EtErzafx?UOR)ch3lnP0
ztHTb($*~i^CmE$&%?B4ZzyO;99Uy`KZFVNao|cEXRNSjYp*0S|H%R_f4}HPf(M3cS
z$cgH}{Aund+pYIHILL+{<Fy3TNSrHxp~V9v?E8Fk<?+{Pbv7FmkE4|WV1J`eUck|Q
zmnII<{*Ubte+NOg^EK<N3;qS+cY^_Giq!L!?X}N!(up(vZiJ-oGZjBw?|BR8y};GC
z!GxFF1mSJBpqrAasF}2n(?kgo#stmC{fItP0ssw3DR*%acJAQItI936e%*NgFD^hd
z2cZl<-k<6R2sU1u+z0)GcOY8G1(Ay30DZaKuKivQ?!*R9eD@0>SS0e`8Tp&zUVk3x
zp9e$HW;gM*g>@Dr{s{pZaVl~iMgw=)?@fj{j6VfiGkvhV&+y)c(J^7X8SX37DJ=-A
zeuzDZP!T1MEb5K#jRI%LhdkT8l-OB7<gX$|j*6k3kmy(&?XjR2q$OA0pm7Uf{o#Pg
zTpW3MHA~|g1iQx|K6Me41q+ah+RPrv#RQcR<lwDHsvm?I^%TJdOjO?MU4f+=TL1B+
zgf!=o{Oi*i|9Yc16lq<%A4ccoTOL3J-3G~KVDP_-)=0s4r=y%0&g4^`Yzij+A7tOp
zznZq75v#E{!KxtBC2Mr`!bZbGp7oF~zF-sc4y;x#40nSES`VO1L{m4U`i&5%sh4ma
zY^{uQfm~HGAaoigvZXuYQX}*%2;S+Uk9F*YV+Booa$fO@|D5(eFEfgokUZE<a%@HL
z&w*0OCd-rC?KCV>p~@)txO(TXG9A-GM)vZY>)3fvX<~cZSJn=XiRHkJ)NqTPxTdbt
zWRKYTB9<;?D6qP5DljCL3Cxn4Dm!DqeBvG0wJ_pNRyf0N81uyIeWe{<`w81`?4$Ri
zWNMb+oV}AajsJNx4mee?53c4}Q2leb^LTMzr)Y=Sh-5Pt$zT9f6SNm=awq^)Jp;Ba
zdUv>9{@JH=Nbh+pd)D!Jm4_jZk-vH`6wLj2=q}Tvwl@U8%-d%hX2qrnApAA>lqbEl
z>tdqH0$I4>S9Pq>LLoE23x7i){wqow!Ha#cc+<n7Rq5~6iVwMpOrH#@c3vJO{aTm_
ziP{x6IRF$dfbX(+Gu2y&TKz_j!pU$zrH}?`+H_$Z*LLhb8Q*&0kowcWmZ5R+ESPXC
zf)bKiCkO&@ffB)ew&PnV@bf5S3-^z%=S_Uc7anwt^PzG(wf^TugMF<+ioeuCRqbg1
zhBQv2rms`#9!6gyGg!Fs$FqcNK}3l^iZiRk<3~pbZFV&uj>xp>+ZppRm3YKQR%k8M
z;3pD^UFo0m{)yZ}a=mtTj>Y5%F7n_|F>=@cjG+W%4mu%-tmD`X&wuvsdP+!Q&;x<*
z;hkm`GJgX?JPk&Ug*DGGH$KO)f=H+7cU^=of#G~60cEO@;Y35w8#<wj!bMdezonC}
zmmGY9_&I*7VKor89ve@aA|)RGPi6$muBOxUHL8owii254>dBw~!u}d*u&U^7OZpx2
zY8t;FI10kT`&>%O!Oi`8!Q9#XK2`*V9uFRbQ4bVPRJjS=1b1My<<-Wc+cjX#L^NS1
z6$>wx5@`}09Lo8w?|D-vrOMUp32E-^Nlo*kbB1x?qL-g8QM7u$)xn-kx9J37y2x=g
zh-&hry*5m0X!g%u9HgITD+EY$o224ZyvfYdVzw?|=yo*x>c=Jiz5^R-v6^j+mz24E
z+emzfrVZH7z4DmVD!TMCa|JlTNOiE2z$N|0_9ui=onXRIW4()N_*XYbz2C*+t>nLg
z79Dz_9jXOkyu|LippI{_)54#GBwi5Rqf4s1Y)!0GBz4N9DC@`Bg+(a~FKULT4U66V
z+F3K^*QyZoFNt}^tGGU<%;mn9>7(9)_lEm|B;T<7k&9Lix4Qi+>}L|Po;{?JXk<90
z0yNStYMS6?i7g(f{<F;&$re@>3trPM>QB?d8{XwD19vhow0l}ypqDI&kI3ENca^nv
z!$OHHj~n+|y2Xi%4{b`Cr`Ma`f2VK%*}cGThdV#}mGi+PN>_SyqEa6(j`N=4E6bk+
zqsC^}?a&$Eh~A?0d>)LEyh<^UI`P?Zxn0XiYUZ(PJOB6;!@FjO+<%TeI|af$bIpP-
z4WqBLyl(eBq4dtK>z&%#*584GB)w(Z_rLS~6Z5lCVi^lVLuh3F3s(}dwhR?8xpDaM
zwTtXj*;8|<QmS)pk0^9hj;8Jo;VvTQ9Q+A4%9i432S#<b&OQ8nx?Pcj&6O$@)bs8z
zLi1HkSKfCeex^M(=)7F)pCGbAh8*`O&qlDZS5hdMkQok=sSh?}9<FboX`(2x*{&(a
z@>xDGj{9$MAJ5viy>74IL=|L+xQ2+htrT$hq?zrm?9jhb4`rnHHmj1?&(<gT4?`G`
z#TQ=)E|0!6;wLXsjz@&6A|g0#^sYKE?5=#Sdvo5<zJWVlf{<^UH73YYi>V7yb@NMU
z`X-3k7VT}re5ek{v~iAK@{NI=ZHY9g*`eDDF@jCv>v(}T)<imu$=%AS<ke&nW2tld
z$g%q%tslCze~8g8``J_ZU6qNVDiVItSY?@kYI!`p-^pF6XDgO=nS-#Alx{Ef?$tvF
zMf|$q&zdyW)S2(VU4Lu|9Fx<8t!hod(m)7fYyrie>r}l3KF<{}&IuoWape*?wsO%6
znvz}_9h)Mc8LRflH7zz3C~^det%Nuh`D116MQLr@Mhka^9mBo{@$IbMb&92@i_>x`
znH^!e<$fiIs#Ybn(W1@mn#?xCeka~92(O+=src)B=%1L;gu>A5(gnWwR@X$r<TXa{
zT4<0+LXY}Jlx|tG;Rn(anDAOPj|9IkMu$fS2VKGGUBX`1>cikqO;F)Q1ij+=yK4w(
zFxu>96J&loZOg?x`j6g0p7xx%e!f3TwCAy?AF&OHu;a71>e@#P2_dvtXnmREg-d^V
znj@8hk2HeZmz`?|uGH>m1ihafkl(MbD*TdD%}1Hruk;WfqJNR*d2R@Ny}1@ZyEArY
zy-fMt1uW^0vbu2@Kd<`@51v%NcDU>-B3$v=)OWeq;pUYppQraMS9>F<)_DH?fV{Z*
z)b{>baf>&O^2)C)^kqMu^IylgrZw+-a%Uf%?=OEx&3_Yr3a}y6*+E82gcg-IlHX^@
zUt%v&DUJXpzS1)F*RJ0sA2vFUxYl9jO>cYkbD}@e4A;7CR9{f4OWM3fvyn<<yz-)}
zLHi4>@att&x|#~3zstVMWpk%$+(qE)_Q!g%-~rX=t-e?)4MzX$LK>&)@e&WO6BD>=
z5*=<9yJ@F|$8REbQ6vF10*@bT<=d_J-MalidKq5z`eT)W%;&1tAEa&SywwsQJ?y{q
z#ax68nGkYYJVDv|m`BnWFFty{izE0PL{sw}>PIqvu+jG7SqK2~JQ6|t(hfW|#lhR-
zWsJS-S?-WsiQt{}yi8>PZ#*I6p-UqDqD<V*meT#5NlS*fl!AjiKwLpAek^_fCC0K>
zSj{Kd+IbhhBm`tKC{nxB(t7gX5^qa|%I<9>S$qgBVN7g&8k2R*`+Ap;jrxLl)y>Oq
zcg$Ed8E+?C`*Q$rX9$Oi;?y!1)~o!4@8fTy8ABBYj}DJn9}u4S0sXBWK+p<IZq7;Y
zuXpi%_D}i@$~8}<+%B6qMILA53_Et>2Mgkh@>?#d;E1jH7$32w-?u^|i4n@-uG#lP
zZYj;miy`h*g#LUo6!G`2D?u7GZcm(WKUlp4$N9G3-A$-X|91uebq(5w?gzzH4vR8(
z{0oetA^hCH=brT)B)%R3R>NPIX_*5lq$kWi>VPb2o@I@v$=l!n{4qU%`tX(eXSsm&
zW)jE4z0KknFbV64V6aZn_W|iHd|0|HnYF@S3c@r2;N%;e+q(VW<}>w62?9Ki1$-_o
z$aNg+d>FE-)Bd`kJpI^;B>v$v9)={w8a<^vgw&+VxdZ)Qk4zU6vS><TBy{qIq`4^m
z=l#`H{MMoq<v;zH>&XWNcMGg5+%@>d+hDPokSr9_c@A9<1$xHo11bQ*wh}ab#ty|=
ze=&MW1Hm5R?Qza!M6zW8jNR{n);_~|<A5VvgDe<@k3-0znFDV~wbPQ<aTKlmmu9%X
zmEXyV8v4oGZO^j#c5$(0;A)+h^1R2e6T!YVqK4MI%mxeAz{d2$HC|45dpY2M@otA&
zAjtex*snX+@~>~$oVS4O^1qkC9Jw&lTJtAHe&9s;ZL`W`O#08>FD0QbmB(vJ(-44#
zY47H(`gmkL_c{=WbRm4+*W6nDd<CyD<LDFTs5yQ36AorKfZ)O)5agt$?6)$;0lDLL
zguSWIhb++%0Zd|rktt)@)er31b(?KwlSIb&CQ!lTY_Vw(C)k*jf&8>N=(q}R`W@Vd
z%iM++0}@i*_|nB1@R&M-6ftbI4!amKG^rpKk=0~CAr=0=vMn8UL8NyAm65QPqr*5A
zL=zdBfzLrRk204bsZPKWkrXiR0SJ02e?9csd<YR`)U2d!7LgVXfSpG$k3Kk74E^|b
zVDuR5?2+<C8Ehhq3+F*i-pwImI54<zUg`d?*IN7^baxVi+=Q{nAL>o77o$^WzB%l}
zkYEqs*qJmxpf(lVT>6*@b(v+paQmGNs!}hIl)rlWSXBN)l_oORFK-pQ7HQXFp@bo>
zpL4ZNCq)VvG7u|yqz0@A{qKbdxrtnFRd{<~%LUQ0`R&J^n7=7SJy-V4S2<s+qoqbm
zEOMoCytgUKLatCT>%o3sV)1^YfnY^d0}}z?0F!w13+lQaXoSzHo>`3LZ`*f*wG)R9
zhXufE?1rpTi51ZRYwChj#;m^(vC{Z?!?&FxoR!K~SMEGW(`jLPCEPBGD5#`aic4cZ
zean|?UH6>GnJZOE;3TMdh!<p>7%sSd|LigcHkcMaVGe%x%0;PSwZt>>Lm<~6c;|5Q
zvFFb{II8d`pYa?}CFuh9*Tl_Q;V<@0KZfvit|FK9zmm6qZtB1H^`EcAJlS>I*=Idy
zce48nytM%sN`@ODv#^&!PTiu077K7~7f1p+1qZ;NAb3Ch>h%2nJlt~Ub6E4D%X6Un
zDcG(VO~Yg9$2jw=Znd^&BQWYNo+|hD$conI?SKFa!;p6X!DG!GwWik>W0nIKfjX1~
zBGYXWt2+m87jk0rprkQ~USKpkNvDB8udzfulCWsp`79lkf!Fumyb15%2RJtO6q`La
zks|8T02n-XpmVMHN4655std<$w(PJ6Bt+eu|0IO}0iOQ-PL6g{lBXT`sD4;}7R$OQ
z3}e?Nx8GljS3!~Gd1md-#EO0ejhP>n<F^~`z<J{V1W|YC6ETA&`<McrvOToLBv9iH
zg+k|Y*@pv=rq*qbp&xqGPBBP}q4_Z;oiMWy{xFmIB`G;!ViR9!=>;&Uc?gB81yCsG
z7uQu#V!=C@W~W{7)mU~O7ruuKRUyb?^k-ed4qHJ3zy_82Y@#e^+kF?rI={l8Y%-Pd
z!Fl%6(lzM?E@G3xxYy>d9J;(8|JV$n7CTfrALYbKtVEc0%9ZkeZ|8r<9F=J@nT&zl
zrui@SP@zuG^IyRkWIOd|hMHOym^TDg;)Bhl;M-)u=_VxE6@YF$#8a0Y8?WBlG?N7;
z*Zol&i6xlyiHAdLVvxr47IFW)V^O$*Z=c7lEsg8XxIWPz{tO9~On`OkilG;B03a)r
zPTjsUE*W&lzABJ_u&x86^6h#_Fn5{<N*sX_5qL@pLbX!Ur0MxCo&!sBJN&-Wz{IDb
z3w%!$0z6GBq}(U@Asgg3_nN*(+Svjk_y%Nwxb6ire3}CA&Q~)J;Z8n;Ds?3kBSRq-
zr~i~v|H^>y>G$6cb{!=~7Cck$oPQH%n@E->Y<%36^Myr)7GBG`xdVY7RYQ5gb*2v(
z5#r+<gi`G)*Ktk~W{e6BO=xdV%Pj3WmOWt(H(-0Y=UjJyAA2=Slg~j#BOOQ2d=f}i
z{;~tGQSdHMJWU?tu2Fcef}J05BKYys;2V|jw9kHp`z#G_&`|X3DI?Hotoyts9D~Z3
z5FK_F!lBF2Ugw_R8$ojrHU$C+DcSLgR5S>V464DsCaYB-b{N}*dJ~@g2GZqetFLq#
z;(Rp5UpdC^g_}Oa^x9~IslZ+AINcJVo3RU7#T?AQZoEh|=bE{-1HR+k09cd`QqK#q
zpOyp3^B-mkJ-xJqi7A<sN&%t0cK=iu|L2hCu+t(6h63nz4kFpnM(QY3ev{TG;OTw^
zeNhu8_X3qh4%QlAo#n%wPyGFNTQqIH-`dox+lPT4<M25z8a#VKtH_e=!|>)Hp(k_-
zUlU_OmL@8n<1b9IJN1-F^6*ab?BxAcg>m2f7Jz)wa3{Vy2cp&u5t;UX;7=WYMm4GT
z8E~=jef9;KOE=d8g_b1CfOJVTg01kERe&pbEHCyRDg3(3%S03GGH~tm+^)9+O33Gw
z8BnlsQbGDlz&wl3Axx#P#A;4xR%f>HG~r84vd?G2ZKOr&4_D^FjXKY?$fQHisW+8p
z2mMW3<rL+cR#y{ui|_J#3%PSpG$l<`y6D$<&JCY_17Q3QAlGz>HXh`RbY9%uDK%~g
z^ii(ukW=8c0gn01xHbtJ6-lz@f2A`2&H^;TLIj8n(~a_%74Ua4sL22HDZ8_`$mf&e
zJJ+5316Hi~H3Q}&xP7aCg~79Ai9Q*wWw!1OdwHo(n9wP=NeIAr5LCdr-hc4SbuQOK
zd}I7gyQ4SDpb^K3WX1T~WK6UX_n>86H1y70T04Gg5meY7f@*8i&^3_x9sdEsGTg})
z&rTRCe^I{j#y#1qPgc8Z!<}3#Qif`csSW<xxsr-L@yztjF-sttWX7GK&<E^nSRG5{
zq(4BmV0XRXp~QofzC}p?;$ubM%6wn%dm^FjUOjzmc3#50V;O>h;T1HUNK?g$e$gP|
zD^&)R$_MNG-l8M7LK1r+6S?MKL&GyqGy;8SK|FNXp$SLrJYK)DGENRELqM5Z{rE@^
z&@$6Vql{>F_Rv@^6QY)nv@@=+Bc~Oy*?AiUO`ZGYj?w@368rld4qn`^Ru*EV=&;l-
zttrqCn}debMLWbnm<q{ae%aX}$UTyjJ!iT1Vcb?oraet$-ChbwcFD@eY)~tHr~#vh
zWp6Xw4P<;q6{mdh2G?sJ5X96JkXjEr0{@sqbVlK_cJZi3%PA4Hz+mX3dxCKt?;#7{
z-RbwCi7jL1D&6Je^n(1_j{pbh0!8_P-)Sx?ArX6Lw4WMuhgDD_^<=)M3zm5hZqT8F
za;anl?*Cq}^~AND!#H<G+Mlm|n~DtE4L-8r!qYH*;JvHUM(P8+cpiZ+?`R%imb#(B
zt~oR3(sizV%?e3u>ijJ%*tWQSu$_J`JR{`R>Pb#bUGtshUi;l2bDF&oDL6oid_M-4
zBx696=W3R>qSAA)4E!G0Fk9RbVjb@E4Q`~gf!Ze$PI@3Eg8=ZW=L{CiSarap<KE-_
zIrMm~k4tl~CoQTU7eq9PN(ihP*ud8@;3Av!`jL97C3CoqzY+VWPXoi5oTg5S7-6WN
z|8Ho)Hy^~21|y#qm|=0$?+xYfdbC4_;VX2n%w|GCweD&@g&yCYSpM^-t`*?VpL2{M
zFVu}Rd<jWq@qR45&^H<B4<n$&?qC+#FaTDFYWXXn7&WTuC6)Ra%_a_#oRb$0;K9h8
zlWv1UG#JD`Q$Zz(9HU|t9!u|)l7f)lrnq(Dy+Kd0aqeA+1w6WW;b%|76{P@8gbq3r
z%=2cKtqTV-WO209Th~WRGwMMH<cU;I;<6zV;qL7t@W9fH4mQbO7WbiC%K&~_6knqv
zV}M^op68N>-j}&j2Ww5lE?0K!)7g40J#EIK{FOdmF0Fg<Fh*8TcV^m0f#hY)x756#
z_4Lj_K1yTgUWL;Y<8Msh28mEwJ@973mw;eV9a;XseETxzUa(q7tdrnVW#m3e<IT-4
z*0NubJn2dd14_&VWs|4F2B)6A4nt%;yT`r%|4Qvpj0p59o5xhW_+flBZOND`c3&T^
z5n?{gMrdwt64iX7GaSjoF5?|<hL1kicop7bmNkJ!T++{UIcU2(f^4B@8@2YS9<}ZQ
z$k({%1J5@>E-wO5i>`?U>QH4N$%UQA0&w)f0+SX2Ki5ZicOyo#8iy_#kfTLwZJRT2
zy&z;hRv!giSo-^9;kFnDaAkQjc=CRYQ@+OYtBtp53e^hIe*8SNCy9{s&|yo6UMFwi
z&$F=K(GEed9hq0~u8}Ud<jcHF7&QKvMd+_Idn?P?#x0K`jJ-4>>?(5({KI59W_hAa
zAYIQ~%ets!$ILMz9wB1g&p?{D(0=*~RaEEnb${@d^uK&gV+R_b@%UC^KB*K7O6Srv
z+_gm-+Gtg*kzD9_BN}V8wc^kB|46H#?HdttYlz(07^BBRJ>M%dMD@hFCDLAJw&T7d
z<fwy{E|+$JBG@2o?MA8^Vo#OV5K{TJ%Zjw<4TwPEaeFg?jDb7V!%c7Rz{oLU{?AZ}
z6N7p{4H&P1UGOlbIr=wb5E_Svkbhrvrkt$aS|Dz&nS*_O>2PGoChvxPxGJRQ=d3+J
zsJU2@8#S379QELlh*fdAH+#L>lGp|I<WnZK%^7?d`-wiv9V>9JTs*pD6dBrh#<Kc6
zo?U4f$?<&uFz_hl^fqKPbthL;(j)to?>0h<Y2&f9OHW3KK(aHhGl|zt)oQ(gEJC2R
z#DZ_U+;jATh?IX-OZgc}!b@^>I0hGFl<28xUJS4_$=f~`G+kCqJm)OJjTul;wj`jY
z9P|iaL~Y{*`^hxHC}uZQodm9QKQnq}(S%1m0_>++g4*2zN(JXMHSygohyO|7{+-GF
zlh1`vmmOW##`ZUobh=;>NDkB$$SPF;Jj}J!LdkF}YfXlo@NoUr^n?U~lKG0`7DD6X
zh{Gu{zsg${U%Mv@nwiFwcP9(UKO!z<O_%u%el0_N)}m5h;#*1}^U(EqsthiD3E&xc
z9#B(}G4>Er0kaTAq?nGt-TUQc&oA;=QYQ36_d28M>Im^zo`#^LVtgMC%5d+=6xF3O
zt_wJ8t2UyuXn<2JV}nX+JuWi}j2qZH`GMJk>;sWs%ZIbWmU(YOo?j?#vTV90(28%7
z?)0ah`=-_O)msPq;E6!>0)n`D>OPaS2moLoi<9et-$S<jFI>E&n_|cib-5lyHZ%=8
zCOm#0?Ak6NKHXp+04g0%=4h)Bw*JZjv>B`R&~{WM_M440XZjcA@`md`X6K<bRAN1I
zaB9&)*8_pWiF9!-tcHKeY_~wNyQV5*`zTEXm5JS7nY-}#XTtv@c!wbIER;kAMo)Tv
zm8j4pYm}Qx+pKraYlx)ILqWFo6muyE<vy;>2;Bvu={}$~=RvD&>Ee*C)q$ef?v|WK
z@ebHn7`rP0Z6iY%^c&*!RH*7I45V&Wz$lt#e&*m%rm76`LpM113PSyy2%-K|yW+w&
zjK1Wd_teJh*?Nw4vyRRdkae|<uoV$EHL+f2nhc->75vv$T8){vwk)2=`<3EMe?p$m
zceFR!hzIsNg3dmUe~5g4w$E1tvmOA7FZU5VQRVFXLK-`pzde?iOjMepy9GSQT}6=n
z&ptZ)sbinfxqDn3o2Pu^nIrXM`nwTolA=Y8YKG}nZ`g364$~9E1xh7;dw@rm#u+yU
zh?R7*!hKk{CXwDwyw9cgeu{l*_@$2<`r9H1FPDczJrPpV(emfYzjt!wN^JBZj;js>
zNWzEfcqa;d(;q#L^Z)4+{!jh(j1>3F9j=I-2nEyJjt&4NH4+GjmAZ0Mi&O~HL}VN(
zV|O4$RLA2HH;Qp`g-HfL+j@;hzaX$Az7lo~h5y~G;JGAL{}DBU@C8Vc_c$h5WvFA>
zb}FJly!E{~AG3RWTx$%>SdyDORK*|?1^~n0p%=O+d9lX6lQNn2J-2zN3-t~sj#tum
zWh_s8T~7EYV=^jT$_xw!@y%1uh-S%aW9d!2no`&clQO+-G4K$k8?U6qrxCA%cRhJZ
zLe&x9GA>;8NZrLVcnWl7zs!#AcNRSbQ!#$eE^(_cLk`!6;>qfL>KSH)?ntHEH@&ZY
zq@JRqKotD4T#zBY{%yL`%i3?W2AG|5kaxcGe>?X1ZAltO*yxKi0s?jdT@6)J|3l!W
zNr(vDlZKkWfW;a*4$s?~QEa1->)4cP`%l?fMFwDf_%uF5P?CNqlZpsIF0aG9T=6E|
zDaeMb&<YzG0?^*IPIacy&(1Lz$+4Dv&%GW1UO~i**BXw?@D>QtI{PKuzC?Ae3$Lv2
zEf<kSKlx&kbh%+g-?AqMN`9{X+yA|YAzCWRhRR^?C(ijNlJOG|-pZvnZ2E%2m_;oY
z=}03YfQ_#YwE#U=x>!H2K@L+=7+dLd$%UtC#G~yk-+sgNkU=%8yA=G$g*#xAKJ*nP
z2=)CH8H9d=EnYWxDn1%k8jcKnQ8NrP{5*Q)2n6f<vx*TU){U#R%lDiW*uOQqkXFw(
zX=+nN1(jJ~lgtWq=9fZ;t;U9R=>aynl)&}yuokc;hXz|m8W!816t`sIWnQQvS$xa>
zOq>|4z-4%aqvhrHgQA-{(j@K$LsMJJ9;x+U{}<6jyDulO9(_nhN>5PO>rWn>9%+2%
z7<&dUGr{K$AZQKWJpyd>p?jC)i-J#bKRrG}HclK$R#I4gOyE*!`emnd??GL?^>~q9
zb16g?>{fMNgOuzMldVdIaAVZ#o$tZOr{b1BOGsOGnhnX(4;DwrFqxs}PlL30E_f$s
zanybJQfGk3f&||h?>#D7_q;70vr)75cAvRNb4ga{)>)A~P<cq+`J9>4rxY0<lg#O6
z6e-|n3(>bT_1u4#YAQAWl&f+&9@A99vWQFFgN(HWZl<$TrMlS!ktfr6BV6>Ljb>6n
z;3@Bjt0m%NRs_;V;>V(4(97<*$0$J)(|XM4#pTMR=kcp2e*sKcNS`3HNt{LiP4s}4
z9@Kb~GYOXMS3)3p3!@_DAsgS`cWf`eew+PK5M<jlr#R-uArH>Nh@vf9Io@^k9%INw
zN4-jKF`VEn)0hxdeD!&uU5i=j;1{N%i>Umh#+4#Fz(Gx3Tt4?^=tITwLEzECi4NUh
znd`DOc8jB*tez??D^Td^H|gRM^>z}g&ReS>0)QJ~^v!0|sR_@O!EER-FiE+_L)ENB
zMre>Vi)#E0Cn-)OWU+gD*+(IkT}6y6sYD9jtAhpQ#khErzd^8-sFr00!L3iH%r}jP
z8brWfRH&HJ<uqwSIeHn&ef*`L$r%V3%N(E)X#R=;1e@64Xx5?}xGba#43WJWOtx>Z
zZoGZVq8ek{GMDm+N!s0;uBh&eeB!bJf{jl)!=T%z_N1Lap||CZdxTFgK56}e6=U-l
zP<;TkieRVdzY~^!H|hQb|1{Bvxg9vv$(|fTzp}+wg(w8vw<#oDztX$g^V^*TVMLt4
z5`(RWYtf>YHP0(>CYq0CwNgW&K&g3}7%e&ugq&vVJJvCRa%RV}0IcEh3o6%%NZ5ta
zCqwYSAyOT%qL5lT<-Ihd&|cTohCXOE^ngJ|8YcDga4C2rX<5zFp^LI!S+&~yT+4|S
z(R)V8*WppCV3w_nw|W-_LC`rLev-Le7J!jf4%H@JLC$@O*b#l$d3OTzIP*1vjvwJ!
zb!;vpN%Dkz+OJ%dx4mu}uj}(eZ#48BvX6OeGE#<wQXF;E(sc~2h+1h0_zp{frcqo_
zWPzgQ*UGpA7lHY_N<0AP%Fl=InfEk1qH>%L7zA$1`7GsSvRNZFX#GnIy&_eu{EsyN
zMGfJ{av_g)Cxor72i(AQ>sVEGeLYSvo3UG#lOXBwyfj3Zr)x{#MXC$r&w$NXY+3=;
zeb{Nfl1-dV^x04h(~;(lZOaSV%_oi-RFG90*|FNLrXq4gNLL;UTjYexT~{7y!lWy}
zcsi{+_Wa~(X*k2@lEYWde#Jqs=+cW;XV=PnVa6&U)en_8oAr?|YJ`1<O$KB0(B4#V
zURMSdWdj7y^a7!K6SEb#skE8uINX!l#J9xt{`Y^M<`8|v(SGBR5axS1iAg%HP3~`5
z+N2B&I3cCj7ya`xb<dwj{ff^lg5FUkWt=Wf!gWlHQOriAV{-)v2{?z`tiVn!^&*q}
zuP_d7AV^#2SaDcBnM8BE=+r>r=Q_V03#W|yLFXrkP;%GT9GK!Subso#esa*Xbi~o~
zs=fzAmESN`Qn!o3i_ztQH&)7?s9B&4-CPjCI_~~!hFd~P@jKzQ9gfcez_-ek3S}?T
zV}M9(nihFG^6DDf^Q!=pnf~n66TQAB+U6O_>=~sZe2v^yqZf2EQv!Q$x_cD$>*DN>
zZlmVsjV!;Zp4uzwH`{kIr~+oef?uiIFtJk2qsaZB%_SZ{lgpNKpwyR7KKU+brqfhM
zB<}9T0Ejk?zrZ`vhv^0MLh<$5;4NNkKOW>VaZ`l)FVrU`ttpUPz-&6@UbzR2jZ>;#
zZ;?BxFRtoYZ2B)BmT=CUk~`4#oAg$%X>2l-NDCp*W|(leOo_BGzc9w)fNP7Y!S~rZ
zXT+!g{=LlYdsg<w(yHE*sxxWJh6uL2j}c)f)4fYfYh6Z5=}9L6p?+cMv;Mi+Fs03M
zXHp|j@{N65U=vGea?xbz<m<>$2P1Y;63OlHo<jF6u6OePjbt{dBML;L56*s3ZxRNo
zLkQ9mmEF{Nyh{`ts*4Ld!Ks*!jeTm0IBBI2%hv93`4+f7ElHNME~ei!`AFe+Do?N?
za_$s(^|52m3z*^gdl?g*i`OpTIuD=~!!w-^P`ygNZds&{$wZMzB1*ulF1|Des=5yS
zGQu!Zu0V(VL<zLOvVdMQaheSpd!IUeWRz%e5EFPhEoqBiBUPkvO8KI4g7t|RamAG)
zUV|dFlI3Y}v49_6b9#lP*QBiz6$$yU4-Uht(<FUB$i?qrh&;qD=x<(iq!DwtG}yRS
z3uNo$<*R^QThh<|Dna{5Odd~|5=@gcGbd_#;jptlT{=;C75P*&{lYIuPW@SyiKLzR
z6A;G*(9^tmb$v2q&jG9TpUX?qR|!AvQLc+~V{lN%3;`bcx-Vv{%EmC(2`bDN67D$y
z5iiV0J6P?u2lMqQCLZ_<<25GDI|2E7W|;+-dExKpbsH(}_*x}7jHZP<1kd<L;y31;
zy*=j{23u2<%Iid2g8Sq%gYO}yy1&kZs}sna&v|(%V-L{>2#%eI)m(!t&A{kpgiN|F
z)+jFQ-KYPvqC$v*`#R*o^*KGBcvz`Pk#PTk@0e-jq&oS=Y{ve@`E9*K6+%Aq@9yH(
zuW<lz8pyTBND`bP$t%auVotVv8vw16M3F}DPfYr#I6=uktLQ0v)>ZGrIolQ_5K!wk
zNNzQ_SD%1;yL+r1hT_OHm;&L3Knuv1O&{a9muODThfO%GLFIa#{Y;90+30Z5g?vD5
zbhlMh%t|d~rCXCRC!~;@hKe$0@zxZ&j<)K^G<BKgQj)<S8?Muo+NixTqQX4r)9u~+
z^r+_qMEn7*ym0ky62)JpekoLAQAAu9jx&mDup8L_f%LTRoSbI-|LFP?XsX}teH_nm
z<RBb#r8s0JLuD*Nlvx=<A(T{PipYEnQA%dw7&1o4P#HqUkg<#zGe?MIp5wn?yx-67
z^E~VSUCU~T;+*?+-}~PC+ShgMS{!JT+mx*yx`;n*BV4MB^yQ0j@J}YoiW@-JCT6Vo
z#k)ID>nL*!-sLf;pXvWoZ`DV4>Da68a|oMFr`^y+e&9`@>d21_6Z=xUYj2}-hwA$5
zHyTOjYdum=?iP6Y$sZvhku<h1WC|gE#!lZk8Tiuax8DKs2RgKv$2gf&=@AHC7$x|E
z-{c#Z^rsRS6Lj84zbBtL>o2TVRJ%UCIl3}44PGnX$mLu`UUnDpu}p6cq+LpYNvR|5
zbgB7}Iq?g5cWeE_82b-iiwA_vWBtP2k;g&QkQN7#>^)C(&#w0&ESGaT*JXJ%KV;UW
z?lElmJD7Qw^#!cd6h_7))W%7x$R(OaQMY7dpG7LU9zUP=*04Ykyan>p+z7Q|wlYt?
zX{!X^dI4U{Wgq5TvPti95jwQ1!pO3)vv3q`bP!GQuhtP?MwCk6h?C6W6s+iDLn{Dg
zR5iuY&56qPSoy7Gd4W(H9dda+6kW7z=))gK2ENX!Rd%xo9dF^5ocsC{u>T63XRbyy
z5!s}|c7ch{UH*3PnVoPC)RR>g`HvmY&|nrZ6@qTNeYLuMQMIaP`R%8AbDE{|BNb23
zaM5iI+dF^AGrXmVGNp$Tx;}R=eR^!|s#S+d>n-}W+h-+QLxA<~UzLU8FDIaL`AA#0
zLCTvo$=w8<gYYYa;4_9Lb(*o=irt>BL-IiuH_nbpdx6_FH`D#iFU@)*evH?#)KL0}
zSaxboYdKuVgN&_IgI{-)6Wzm_Lk)5ohWf~tuh}^*(>i=-uyvRpZFweVxoZo^H_hmc
z<7MW!Hho#K1Fv+*{0*f1Z@W<V6Ul$f)s@zpN_5{-aZZ1GqDE+5;rpn~la=$vsq?fJ
z0=(ug5(4=(bN{Xw({PkL`m=b{1--m-WKN*AN<(J7gjEEP(f@3K{ZA_Ch%6V!muz8<
zTj4Lu{Wi(Ex)1NSh)3~gN?QrkspF+cXYTJ|+kUP);65W0;041+RiN$Kr8)_Gjt0<3
z3$3@aavsj?Z0+Sza3c{sSp$KpYuZ8fAWWc^h>F%t3N>xEu3}J9?LFfl;~c`!)%C_e
zQi1}-Y6BQXl)$XkNo=jzUu2}&QM&@25F-(p1h>F8uJ}kgtf6_T3T+9EdCe^t9lr?R
z38l_iDY9CPhhc{a<oU1e(B`~v)BaTbVyzWQIDG-8PWMS@W9aeEXcWMvy(GpgkuAys
zmG}>qY{Tvdtq}&RwGu<EpaU+~Zm^G@eG^4lgk>d=T#RZvlh*0rmUsI*d5(3yoM>ro
zvPQ%k+zw4lD2u790`w6h^~BLEJjHT36(d}+!l5k`Gq{M@Y3M-fuf%l-zw9Xb$vIzT
z#p5q)d)$&xJAtLYIhabWBy1Qnjk4HswtrgNJ2hRCFX8vPk^V%yYwEF*33*H!)%U7&
zz_V3nOmY&c2A_ZD-V9i_UiG<yvA$c37biEiBBEN(zyf=8<W8OFb_UQWk-q!H;1RT}
z;m4E&kUr^II7KuZo^dycoc(aW&on02kG9rP#WMIkZ)GZ@i0QdEJW>?ZIo0z=e81g&
zR!yHw%*@|5I{W`bRS#424t~ow4`5%+Vff`RNIXTssL>)|s==SBnPt-~k>GTr@YWN0
z`e_n{uZTEJUmwOR>r|&Y;Td`@n<7=+<JdF33#KE5ySkuNnJwX4NMYl{k?>MBf5EYd
zKT|4$%8&Jcj->eyGS(Zwv{lcRiAv_uKZ|t!p!B&dVRB_HRx3XqwtLqu9Qa!%+8L6Y
z3p)-+sopV#C~L(NU61E!_WGPAlQ;p>>mMLcLB`<E6=4dvCpI5orY#S1UA_bN^esu$
zL))tq*eRj|8S5(cts=(PThJg+ubM(K<UIEB$P(!cnY&aeqvAZuN+OL$oJ_}%Q$L#@
z#}@EiJKLVV$*ck#1I}GiION~3YQ>m%7Rx$4T33|`S&^DvxGIqeD=h8S+sVPPG8II3
z0&RUm{oRj9VTzU~kTnWz&Y498FZdG4LWEVc0~Xo>Zj>BUq~)!Wn(L|mF2Ih;MjCea
zdqxG)&dR`+?cv*0Gl<o_$G?0ZI2}z)P5d~@j?UNDdgMSr-2qdRox0pN%dP{9&k>Qm
z1TNq%JH`u(=-=eHQ(cngsNkzI2cd7}U8|7L1E9NWa~zwU8j0BGU7-254gbF=E{35^
z@@jimNz!FW3dRhr1cBo+PV*i-BTkMn!Y0o$@uzch5F*KQAFrnVyQg`3oDOAZGPuVV
z@cS{W^?s?j9Uk!mD^Zib5q|GxtpH0rKKk?(9*=xgpRXugGd~jk)5{T)tJVbt0lGc*
z4eg{U?1SR<n&0#J4_->qTZwC`TDR!zh@>R#xk;*e-lWoP<zCj++_?DL6CTgm|FC1;
zhYl&wekle2%go=Taz6;hUh{IO^Sb9dn7>eU&;y;oiG$2Uw$oyqOb#+i-m?w%7We~Q
zZ1%PI(;>PreUe>(n!*1q)?DWP_JuR$aJ0}p*a+;w8P=c+I<jjzx#ndqe}ox$G*l=K
z@JozR-l090G@2YMtjVlYj9nRV3)0h=JINE`=4XH2zbRck=Ao=1Qd&6@i^xkj#b2%r
zoGukrm?EJom{a@q(Zw$6>Q!1y^2o&}hao;dlbd%!JcB09?P{v*6hdmyS*-xsoa#&t
zq#VudCr{~{XPmLet`RlGx(*{PvL?-@yT=dp)FE~<D~H`2UPTbe|0KX;ezPXnT|E8M
zG2?z;zPTSgRVm#Hlj3^hUGEgHioY(XH*bN%?)k0n{Qv6_6B$&v=bu<-GnQnVMKrQ)
zXgomdG)n=BLh~mVQV<ArJQ4h4WKCOu${_Bln*Afh40<hjD^~DDh(w0hyME-JD=_u~
zu;DhN3Np#|)a@^^&vY2%b4U%2O5(qAM|wjaC$0QgR{Y5Lk9Qls42!+BV$wAu;)E0c
z(BhS|nyL=ABS!b<JnaUp)S~dul}ga-bYA7&BLY6uhO)D)PqQ|kF1yBFkaDnl3e87Z
z%X??eOI>_V^Y^9SvIADf&|FQQL7#cnAfE#8?!k3%6jx^heXf6e*gg8BH@2uT1B}0Y
z1)$s<r#p-}M$Q0O7m#KVHgrnANkHoT=*uL5Pz#E@AT5VR?wAHRbg}cNbQBcc{Z>A7
zOzLv_kQxj2<LXS0$e^Y2+Y5%T-af(czzp=5Z{k{aO%{%JT;r&SVghwBOPYJv&Dq|9
zdnVQas-oz2+FAuIl6DsKNS0FDwEs=M?_AxV%hrKmXaDs}S=+eQ3bZ2*>sItpx%`lO
znLEYzLu-4O8frO^s%GmvnGkHdl(y2u6~IooI4ae|cZj5ukV(uGy;*hbe-rNiv~H7w
zd~7S8v;Q8e?90yF#rqr#;fKfOZE8i+p<(s_lPBpWLsBsY0v*0fM+wG(R2pSIHP7bc
zQtiH_310B_Y)zE))#NB)?Mnsi*e`TtA~R|HMU}t;NS(MZDt<3={zYu`AS4F%erCk3
znmrJTU^G;TCs||N?qQSa3;9X}?NRMih+pTD>&fqjWt{HxKi4L_u7oDpIFgdt3)^Qe
zLFX>4JNLR^6}{Ga&Bkjub$t?IsaxBwyJm9=$+t_D<k$TpcC@BNG>)$WH?HfT2;rp$
ze)*(Id)<7rk8qsNNWB`>5rZpP#r{Td2I%F|r;a}=<s56e$IE<Mbv!a|(_=T~52OqS
zPICC?U~SE)ect=antE+mBHHF-Zxl<hgsE1yzXx;v=CeI`3ZxkRNoIGmTze3>T6JjF
zTQ7WZiO3V}v?}M1;*|>wvt=}=zhm_L-3@X&mb@;~tc;OzivU%BG6nitg)G9NZP_#r
zcbgW|)Y<M9t-HOrWXbe_+GD-Y)j7q+TT$`QJW|&@t9E|XeQ|>E*EeA}vop7IRF~)s
zixn6pp@K{sTNory5iPkfjTd{YQUPh|#vSHnjSIwOwv+zYc>Xde{r~5)mOo*(yP(Aa
zJQir|9K`S%n#a7LCP3m{CF$Vb<&ySZ@!rWG^L#pzdb%`afS{g15mX=5aAEQeOSo$k
z&4Ek@hbA@}DJTTrCL5RZ-70bj|5{=P4fC$&-{^rcua{Y24PCAGhp7mo*fS4)YA6f?
zS;(oXaR&HRSwBa=N$N;dg<-_pmhT{A`3mp4Rm&cpL8@(gdXL6%ZUK`c#xeP?96<Ua
zV6M%>FLhZ;tKz}|(kguc7GDCZ9Ut+<8m*4X2&2?t%ZLxuUoU3I0H7u%BWb?$0SGZ0
z@{He4=>jKgv;4{LTNqbRru~?mNmsOSJb3Koy~M_sG`_daT|Z47gc@0Y6`U9rN0H?R
zM7&D$+R-4{h0-}jzCVCtK0GPm+Z->9D{PYcQ@#`#+s6h|SqP~`<{6%n(r5*#n~sM7
zH+my)Jj3&z%U`pM_0{`m{0X{W&XiKa92Kz<cgV(g(ykSK{DGLb3Q%XucBE4W#AB7p
zZBqbFKV?yi`H^l&&Y>K`7E5kx*!QRAnz1(d;|(h{+1rC_|1g3heg%GHEa@dpITO^m
zO$k)Pq`3h5g+irUt8x}{=BDPyU##Rcsxssy#~K(;p58u@%#Kq08bEKMJMv?=q-c*m
zA9Rvrrpn-TpovXY(U1Sh%8eLQqB5v>3kC_emNLhoDM37_u=q$QB(9$t9GHLqisbE~
zLj`>gzFoOQR-SHSDuaTdn-87Becc)AHW7@>;Lr^hRibblafxg<C)uo;Tgmh%6^ZA4
z|M#JC-lcMCY?QicT0t6NivsaJezXI-vm`9ov~?x7Y?w}da#Y;o_61BXd14>d=EUci
z@5IvwLR@*)LSfplVyW=2QDrrb2)=_W@`p(!FJdk&A0Wq+a(vKY!qU&A+>n~UnWXSj
zXPGjiFFp)pn{k%PB_3B{8SqU?w4kuynm<j}(WUr}po+`Q8#HMyW0?!!A(@Ax#dDR?
zsCY$wIhQIo_t71J8m<tcvq}u8j*Ta@R)E#ZJ5UBX)k~6`#lM~06I4e?t7K|TO#>s$
z)LE>m#K5Se<<!*|kwk2p*!BstxSD`X6NIx+m$27dkhh`eW%uZQ!MM&^Id5q#%XU3>
zR_ecVU>_0mY`o$6kgxZE(+ZiaAgER_2S^Aw7qw|$-3blq(Lr2aK!9tvsBDajjZt_p
zQa^SMm*<Lar-RT{gkO`Xmfe1!nMXrQ0(-l<?~=>B!iI`D<(iD3ID@|*kO(SaWZjL@
z%4vM$4d8Z0A%3%JqPN_RTVP5m`ba~R?AgSW%g&fjOA(z9&YgBcfKO=xk)$rtls+j7
z>=3Tpl@3d!Oi2K87%wCABacGbbgg<A{+iiw;^i-mo~y9d?A0qev=N3B)%+f--@mUy
ztQEwAe?!DG{pwL_PFCJa+!5~dIyGz#wREZz3O3PMMH~ci=Yl`rq$UW=33WbiL<bG0
ziOb_f;SDp2HXPcoCPfyv-rhM~Hjy;#3<;*3(itra@sj%-E72`y^zX??nQ%I@*07@<
zD!kFZ`HH~IKfhr0#fS=7-r6{y9c9{J{h4~oM5U)I-^?ceEAZjk(97=_R(Ef_?sUT+
ztg8d$Sm~)x=iM#neBMex3F-Kpn|j7@lhNe0<_X%e&zziUM7bL$Gas5@pFd)q%=eo4
z@b-u%ieA*$57(g$cSI90>k_TtX9oYHfbA=G%wTEFzDU>&<#f19QyAdps%GrB6`pDZ
zJ;BkOA~SvfCKx8m^vCeo25;vK!Fd<l6da=)(}uA>4ad0L&(2QQhjuee=W9$qn_ZJP
z46rf;B3JOvGcV95?z}v7q2C(p&C`$F)gUp~0N!9k|I-i0AAAv0lc#z`4xklY?+%i#
zi^LyG$l1ctD8%tw%cW?1dsPoy<Sc))mya7a3V}qXHWd)*pmIF$!VnL~kza*WpF(*K
zjipXB$FPW#&dJ#;EY_HJa?LCJJxui$^?+JYptE<(v%BDCeOk6Xt<GizkO%5+8;;4n
z?Qd`-AVp9>|8N>8GeN+L@Vx$@o$$2U6+e$l3;qwOW8TLn@gYk2+OQRnO%8}6nmNol
z5*ub0Fqybh^Bd?XSewJA$~i{3=FQ{W?bZt!|4>YgS$#9xj9(Kc|EQGqNKqgOaB6|k
zgAxm)uYtfg*#P5hve4XR>*#6{3}yhW!^h!C+aW%E1=g9{m+xv$tXmc^gw)a^?h=)M
z0_jh;`j8+lPho3R#)49+2xnpl_e)pv+ltjV`V(nXK@KD5{C0gwIxb2-ks>JZyKQp4
zs{oeBbawPVeZ~&|8C*7~r}UfLIlTHSJ>9R-wI{d?@%PfIGOhj%e@ERd>{F9C7l{^Y
zhl5}zlnqtQDaUaYgxUG+93oZ;&ud<F|B4LJ<r`A%d5aK!@_-rU-^}K<Ze9wkGlrd2
zxPH`tPtw5@voynlr>*mVx>Y*nKGQr<)dkuK{2y!w?=cpR_9ILtV!yCgVUv&Op+9ux
z-bXgLlf5r8Ip$u(YkSql{lhE>dQF!5_mVxOhtx^r5&fT!YI*rxVbbf~L0&n0-=0*6
z=HFz6(~@%GK$0_*4{sc$<l`70yP!2PtFE|xh|e8*|BO4#=cSKl%Qa!$Rn1l@8SSb^
z=rIajgJT#&jgaG7{DhHS5*PNgs7+`*>C2o9G9V1xg@NRP;Cv@=G#@IgW>wyV$aY5C
zfsz7x-j{MeMmPPQ&jwJj9BuZ;pnezmpRf&(SgpqC$j9Y&YMs@5;&l{!3?e0FVEv;3
z2;=Ue&H%QwUiC1?QM~%)Y@Z;dH_x)7{^bIEI+&Ht=DB>O{(Bow$$)%A7qBPmXPnx@
zIs*=?mvC8zqYe>9$1ILs|563?!zNJ8kM>HWZTh-vBSeinJmwB?-i~}dV*HHp%G84l
zypt^tM=z}MNyH!t#gax9(5YQnbH<J%(eG@e0sfsUQ2bN`X!0&0)t%Avh&F0Tb1$I9
z^72>282r91opLI?*PHwR11$ZX1%nToQxhc8DzqYJ|F3O=JSvPkc{emr(>7!J>8yRx
zP2^@Cfz}_MmMpE3mQ2t|Rem>tW!go<`+<=$m+*HWgGZjcE-_OnKcJJML;<wmlG%aM
z(<j==jj!8+bxm3syT`o3k73Q{$E56&)l&U86HtngP*^pAUiEp*zn(2D6$i~bfSB|0
z)K@(jGweJ00kPX(4g-@QI4!SajPeoCTc0V*zrLmBe9L4T917Jz*g!v*00WZfRbx>W
zCxXuByfd{gWIc6Dr6V28-WV`at3Z`<GSP*Z%k7sskhVgId<pa2KG6(pxHOqgTGh)t
z2rox{B>DlU_5?CnOoV=(dC}pFUP39RJ7u-!IErSdh#VA)(S;$IXC1qZte7gESp&7a
z(@C>&!x{53%%X)Yb_v_!$^>^u1VWz7DuEM9q0^F1g-LZ<<C#ee7A{l4jrGF?-@#PV
zQ-m|S_LfPfO^rNB=wMj!KG=BcB0PmJt_%16yO_m~VB5%K?;PoLBD3=7rANMr5hTf6
zHg@_$F%wjmItZ9|GFB2Fr+Rp#-o_QwQ)yuB9-^-hiCSoZ{z#Y;wLdc!edw@gasxlF
zNj%U?a(~5(TVJhC$MVRx=|Xw#|LKA~s)|G>*m6uSwU~5_BZ#@p6m5JO*8#v%<jn|g
zps#ja)g4BfrVxMz9)7>CcHky0=3MB*98l&_cn4z0W^pBWcluv4U@X<aq2YFXPE=)j
zf~!g4sbtXG(8NYfdECKE$1$qQ^8Ft`O=9w|q{a0}R+5IWs_ifTA_lbXv<`v#xQt%<
z13@s(>9_cOe-^4KHLH|&0hab89qoOSlE(4CvWrll$P)SNJ_e6NS4#tvw&+(P5G5FC
z%Gr@-p%sgo{>tVy#nIZ|?7MS;7^xVFVsF9jttnoGA%E8lPbRhViOFHBnq?Y)dvZVW
zP^uj`|Blo7P~!VT`>X#4(<@ncS-`8|W}NV!^+GP>tKb6yo8Mb7c;*@A(kIQi$9EnP
zj?;wcP|XS3nrJv4zZ;~5qqG;?lrIx08Af0YHGjW%S4-2b``cqsWNCB0ADg&ijKe#s
zFn43@WjlmMKfQERJo*|DV!@`&*t7?FLZ%aI;h(b_KwGDPxc{xtQvC=RPizj>G9JEE
zGB3}FW=Vu4Q3rO0hl3I@q2I_yzO(pD$!!d`ccPm9L^rj{pH`P)PhRNC(*?a09kO37
zK%aB1lV2AHy^k~NKoF5ThIse|`+3v!2tQV8)bF=OMH%1nOs<zoBh8~!0cIh?<T}(6
zm9T+VEog<JFcf%+;ydltOZUWHe1Z{}pSk1LI64aOPa^mZW&O;n+`QNN{rDuxFFH|O
z_Q9It`@@scjg<)T2r@~yQ^`&vDI~Ib{3L&ad9=6doH+BpLo9j3oosi_h(_*pWYWh+
z%B|H%MuaOwYfQ7PYdo2?zm$EGM@)O5)2V8OG=8pBsw*vy3oC^)(Fc8e<UCgn{^Z@D
z`=O2$D4Ol9`5M~eAN}4Lx&8(C0b?CQ`rY=6e$YD{kA;8m0VPH3rYErM>K?x}`6wZ{
zaJhtUz2{n|8+D+d!+hptVA;NEC^Fr9h5zjt;saTO6~!4JR?<pZQbDk$2$CZ4>}=Z1
zd0+N*=>c8Rfmt>rw;RX%@DGwE@jg26l~b12jDAT<ddt=Hkq_0td+Qj4R#-X#heT(D
zKza!q#5dB0nv6pvN{|T)e=oMvwlBo{WP+GOCrgKW;+^k11C<SbZxvoLmS~I5PsvKw
z`Xs4CxK#857<I6_IHtoX&<}L%(l;n|oN26lIHZai<vM_8?g)ITB!_LQG6^1~#wVvw
zD!zr4zinegxtH6^Qvwof@3tRhLgjZbkRQE%9#6V)vn3o>eN2_OQ^O{A^9SK6XX0s}
z3NY2ck^I5LcF0~%`*Zw8bfh<GvkW8NZ$cxL)cJy>J7PWN{?UwO@p(#6Tk<jZ+`+b(
z0>jBxQ4-OgLwWYhvBz`f`sL0nP(^)bG*d<{4(tth4Ye+_{~F$SW7>v>*;{fzG>jVp
zB`U+81*<>E79gslGFHx#ZJN^GHrn7Zb6j}h9+&5F+G+#&Daj1bIt^T&>CsMf`^CSa
zv%B-x5{6%)C+P6Fj7h~Fc-MD)j1KL+YoF7=WRyDlPiBGpg!Xky;rdi@dwTC46CoKC
zqh>7_XjpVPWXOvA2PM4E-QPRlsMp~Ax5r!gmHX@k4NPy3eC)^AntPoNNuEGAea9Tp
zNVMIP$fGHw{&pYr7iT4tqa)bQ(jz|t{Tv993aoU9HvzJnpQ1Ig%6#Y3Pe0A+<Audq
zr*G5M_Cf1hHmrJ{v))3zN<jYI0DYUb-}ldSw#_5XTv#-UM~wD!tHR+UANC!Ta0aN1
zVWTUzv6V}@BM~!Gi(bfH#o^z<=~4|2V)^H>`tYI&T36h3q^S0fn~MExf!Dx~i_E+S
z_m|^3%IS}T`qQ2OynNXi6NNquHL3cCeR3_%5IxIz^^K(hohA4u<&;);BR^E0i((4J
z?nLi>q0I;vexc<bQ8Z)dfy4r+vZJf~$XW84a^jHpkfQD*EZ2Vg`YLeLe9()UppM2<
z9qakMk<qfaq)d#gWOTl=O%NFq+P6v}(t-*=R-vyojadJ}h$yqST0h|pJ=s8;ts&ds
zKOZ<n^fLQa4x`EHu49peB56w=^4feZffq$$G6A*r+Fghz1KB{Gc&p;Q7*dV)ogE^b
z1!UNDvA}w7^D)BaLtAq7{%<Ql1Tz(H#Meo`ZnzuI$u6Q&v={w@Xb?3zUnVj2LaZ!5
znxt%<(N|^$@umT(KMNRvtR)YFqm=csk?`Kq$|G?@>d621n)hCO%0;4AwzkWH!VMti
zvc4BCI*+^*c@$!Q^yvyFj$31hSqkN}Krs-Suki1;+Q|C&c<$r{L+3`iq<l}Wj}Qzf
zJqC)#qW1sMfBqcBnaLF71$&h|Vht&{CAjzAlhCnof=NzB53!iG&%AqfFMAB$aW(lH
zMMe{&^4g)5-;r&kvb4XANDRQV%jv@Jr{vte4{P)07dZEJ*WP>G+hzx#BP<I;l^6)?
zRKIBPBuWiD2(d|ioI+IGU_3DPI3ry|FfH-&_Lu2>%?Fkl#k0otPWH}RIpey&<;8*o
z<P~Eb;Su<I9O(1-pP$c#+I-@C%q&*c{evO@M|)CQar?2o-RD6Oo5&!wUI#ITK7V(|
zI@g=vfBwqs`7Ncr&Mnq-Zwslnl2yYGc1bLl*!wjC$gjCq(rftNzlJ0RJ<Dp){la@S
z!dqcmq0O?#^~nCSqd_}?$5zML@iy@xHPl5^hIm+i(12A^%FKq`$oT$Q!f=vo6*Uc2
z;@Em$5qX172i-^Lvv>@25qW+6nuL)B`9B}8muuXv31|EotKC~cSBADEYDV`uce&cE
zH6F1S!VjsgWk*Oor`=N9dnch9^4?lT259|AHITK5?Vr~FzB`ehv-bgn-3WEkICJeT
zrf=-zR6fD_d%<X&qBmintq+&D6S9rV_aoyNiHvU9NgRoLjk{2zwtt=up{pa~n(xVx
z$q}0N_fRE5MPm7q^FM<6WO=BqIuj&HDZeRQRo{DwBc$Mf=Hc^20HsNc%x)fcl>%|P
z{F>kMeX3)S9Es=P?G>qpF&j#cl=C8e7m$~WLBqa9i5;@}AL{{mHj3B|ovGmfFy!-t
z=x6xJs{&z(gJ_or^ajz$jM?A0<soR5<O67p5L-quMl+nDWfet<Y8TV^*G^y3TcrDE
zZBdKJFK*DL`$lIZVlAtRa{sIH0C=^K{`d38zh)3k(6WQx{)k>OlfrI+=&zfM(K~;k
zPj<>_>N3L6rH?yz?;9U7)7`MF>S4Hi4(9Rz!<*6DR-IV;i0oi%POTGrKNDCm`G2nW
zf2VUI5g?NbAc*J0BjL(I39|RY)Dj=}A1YP=wLZ$k+9751jNS0a{(6_viQb=2sYNsc
z`~tD8*Gk1gHV2^=k3#q)U=&Ps(*RVFMy0?4O%1{SJ%PVW2j*9LHJ|G?m3*?wJWyZn
z?Z3lQJR^~DRKwr><|<Y2+I2NwS@*wQ$N^khiEpm|`izao`0Q9p9J*n#r0$=-my$`M
zm!E^B>>8z{cZE6Ygbt$98F&O8t+2v)Xt73Lw=Lyv;-efYMyg`S{wBb;I_i{V6Jt4y
zY00OUUy|+pQPa&7;?vlNjnB}UlMArsx86$_=z|zY{%+j*bGGs$xh&s2#U`e}bMHjb
z!HHD5h3li<Up*B%QKR;QgO4S2NQo_QYLoyPJB~?uJAe*Z4z(IA1(9K6L)ZzEB*t_<
z?e5nm!*?pYfr~2y{>+g&>1w5wF)Ve#2hR8W5FeVo{}W9ZI>F<T^hJI0`2HGmg-V0x
zP-6Zzj~dX5Z?^n%{}<w5WtiJs_s}@_t>Z6!c5Vu0-|UY)cfo1_{%Zk8vUlAzco~O6
zzB1Z(wS%<QB(Gxi8e&TGW`<T~3TO1e=A~YqabMR;Ip0~t8*V-k;iwh~2DvE5`E*Wt
zq@DfR5+=`#CV&BU(tD?fuP7nJS`tn|L#%wP?gOKQbRzE$^GWLcSK5cPZCUS?@8=3|
zX@U>hSPi?uQfq=Zyuw;PpLpFFTvh9!>lbo&5trZn+<&*$VxZwQ>^%7f1iQ!;3_9eF
z71?r8cs(n0#vXDY6P~3UGYR9CchJ5S7=65%fyDDd)U^5%I_~p7E;96@$DL|Fo1dLk
z!PD|e|Njj(_-7PSo~6_mTBe_!ZQk2ja1i+x`424n7?gsxdK%;$+n;vVc8!adf%Xvv
zFq|4-DaOM!R7JeWRLL%L1N^{>X|9wXI>L`CyUbw|uNZdp&_f<QbZq4rWIyi(tu)NY
zN0x4TH-CrYm+;zY89ZnAOgyW&v8#mWD(>d!f5NK#99jdhOLUxUqDS^Qk#mG=vHcTo
zdn51WgP+jP3<U|ob=;yV3t|(205rno^NM%DbHA*4qbv}hzt-vis)>>xAxrrOnVBHO
zKYt}^OBoO*`~4U-YSX>p+M1~6-7|GD6-bQ5;}}GxV^b>ttH3+XQ|N*rOh&uK^}XA*
z7BUoyg^_~26cY9f&TRKe6zi9Q!;TNb;dNP|7&q1ro%89c0BH!AFvm!Jm`(Ph3AfHG
zy@+(%ff`R`MmyG-1_!DEd<TrajRFpse(d9V=o33_O<&i%mp{(3_b$y+l$g*4IMj%{
z3zQeNp?5YW+21#QdbD&K@iT)`?PHs!#b4M&Z1nd1Nrz^xg`n7JdC7zKB^~xW@^Fig
z|8pUslMZO<J^uYoZ9c0o0NHo*>?m8w_fr_UTq8ft6WgxmZZ%-*LskHi2N;kB;gP(P
zVec5y0}6kY<y~+fxp(LYoDT=>bM*-oE$k%4GaO}2KkW_~E_!1$5@hrb(j4OpW)TtI
zya`sFmcaEaL_#SOBnS;)Ug|KV6Z`1_<dlo5jffvysJtO!{udo)@xMgbMO7mI$(Ws*
zo3Y;ZUwA*|M^!*u795I#p%nhw>y$R?d-e<9lZK&rzXwWnLFI4)kqH3JL<P|3bO7OW
zKo)OkcdtsJv9jHzU74j(U8F)URLmkh067fi8%3*A^rBBZz~_?3kk_c{{bi^6^BCFh
zV7K$^u$bcC(`5dA;B`c)sM~iCHh~aaQEyUS=byg;u1JoG^Ah{P>kFbdJ0fDLxpyn>
zrE|!lVogvm@^9prqRV^xmH?o1sRQp6@j|GhIb%0uj5KJuvlq^`WrUr$)C4ZmaiLF}
zhC(2`s>3ElsGDSWQ-?KXSH$mjD9M3FD&>5DCqr_mP>pf%ba768h9hQ4D*tyox7jt?
zSH%zO&s0vPoZSid)d!Abx4y)Rfe8Ooq$VPun}G5~NbmHC>wkb>ja%;lV^?kgt5Eq$
zj1pMym?BAQcfRrddqG27wpaYWc?<G6(Hm33v@sGwE4S{$nG~wYRx*0#e*d>C0}l{`
zPVkhrglx7DI20?xu!~gysa_iDgoY*}LK@?nZD+Mdyk2y*Ww}AnpfL13+L{b3b;6FE
zzTE(!O9YUqC{W@xg<WFfzu*5ZEJ91B@x9=ct<~T9(pQnm(1@qXWdFG#c6yXn*0FV?
zW8oyXPG3+aK5q!*Y`#o=H<@({CTTXpJmr{?j>&H$<~5HSVKNJ9FiBJ*50sIM2uuyZ
z2$3a4fH8Dn0jtde3?_D!D|PW4ByVVjP|ZD;Nj@a}Ql>V@{m%-x0;-vK>+i@Nnfnb4
zv!>y#c1GA4dE6qjJC6vKK|fi)DG*S+t0Un{0RJ+ZxIk~c8RBqwaij@Af)NC_Us+&4
zvH6U{X>Py>!OszC(Hr;nPteT45S{@&A+1&dB%tA-%74vhhFEgwwJifpxB=WZjw|c;
zs#ISzfkmDGEJLL)P%A;E5p@2+c?tsWVLEZU{-;M*N+@D$W!G&o;;)s|N9EE=kUaJ9
zm6f}4XArU8K}L>&1oE@K3(nRw-wY1{<2&404YGK|UKMeQwDr+N99@3z+%{E32~#K;
z4UnRN(oMjW?^SOqI7!$V<8?1w{zpEh6ya0(k^pw6Q4E3Y#~OE+DYM>guKu>3{RxXL
zi1>24{eH}98w&Tq(YJnW<oxCHH_}93!=Un(-hH(@nA5eE2?gECmwxO>KxO{;12M;0
z;H<vc(k$4DM1TZQa%Od;4ic0maOM`4y8DNxEfCgR8#mSmo>b3@p&%V7f)4$=&oG7h
zPZKxBR^ypHolib}QvLc{+4WM^JcnD>HR`Z+=R-`|u7^a%1E<UWq;1{iI5<4em#m4E
z^O~`&gUyCAz-W`6+(KgAXu>MzLIQ^XAVGQ;!~gDG0$@8z1l!TtpZ}BEGd}ZC9Vacu
z_*af<@ILXSj(GFbL2-b;>p-ngjLAczE7G9hiv~GRnZO!YPDUda6L|l%rFbLi&r|f_
z#$cL&m{FExiwr79@MwZLdMVfFDJeRI@-yJ@<5bL2L1RS#xq}Y4HB4->{jJth%+d=O
z0%NX4XlEZ*mQ#7P3KN<b2SdKfrJ0=m3?bm#)-ps!zW#Hy891#pNze++cr9Gq#Ym-(
zr#y)BzDbNzL9Xx7bMtXTaCO#G6E1c@0wnZ;M7px|AGlM!Z4UJ@8S%S5x~wvEL~(o~
z1U*(rtBR)`M(?L8mDr$Wzb=;8|E&~oj%W?mFUU<P-*(gOIcsYb0R>~DBkS3{&t*C6
z^=eCJN0()#BN7!{9l@L8jd!nTmA7Wz8TDu(x*%v7k|#hoy!NZ-tq31&=5?Um3BVRE
z(;!G)0miaTIW2FV!{X5bm@x|1dBRP%96e9pvo!vfO3rr@-O*Z?`YwWjk|j7Ssur&S
zD=J;BC~*3lh*Z^Ya!zhtWCpY&@H1|IKQ}b_Zvmr%BZXV~)`8#{8T#Hz&yOQUiNb#w
z2=ox|%qC)kN_)~&8Fap$Cc$)vUG1VwdlxX3pnxi+59(y&0`W=<^N{v_LG;w;@90zZ
z*C(5A?(qGlWEa`Le38Ag_W>kw5AA9fUU$Rcml|)yoHg=!K7&3XT`_=C^|=gm>BoT`
zrI!^y`|4w@YddR@Ybaau9HYM@7>}JS9&&p#^TCK^uCM6QNVlJR4;VGu??QEi7FOJg
zFb;J`)W&}>Vrk&%so(3#%iRfHog$G;g+92YKMAXYN=2A@tXGV@_lksXfXTV{h!ZMQ
zKE%=n{&IW1rgXdV9`I$&CwG}I7HBy}&nH9@QN+=G&eiT(mH1`zdcKDDfC*nrA9qw^
zdMd7(B<Gn1-Ylpvv6>sG!|}kKV$zoZYDoRJ4?7>8y$^UPPpFkW%uDG}eeQbSL*)Hq
z5IjWjjG@iG_ws)#WBDqK6vd<1!Z&6Y#>&P(@T(y`I)%*aYe4#M%WKea6X+p$)IvPJ
z1-6OG$}`j0d6Y!SNQ+;@%ruHK8bZKTq;ZEZ5fM|`Q|_xaQCF)}2Rif3FGMl_AUtRZ
z16e(<o;m%`k<CX-KaU&_()jHTQ@PUNisFwCk+WdP0svJYpc}3kyK++&76mS{9&UFc
z^$46UJ@4uaj#`zhTG{Rj#^AJglbugmJ#4!#i!RbPMEXp7-aCKh=_21^W8}T%w}I=-
zwxVTxeZMw9i*N#&_!TK=AiODx>aRf(k0<rHQ~U099Lc$sMb+|P#r|Y-36Dw={i3mt
zx^Y6CnuIyT`SCw##QrBEtR$VZQPG!u_&<8EK7J-p_AC2MWKb<zW};g*;_nhN;Q?<5
zac&f>YS`XbvV;WaNY~3>79|JY-|f6;q5~s&)4t|`z}sMRWhTbi6A^9kpTy~PkY>HT
zyn~F3TJDFi@znp+W-tssi2<$X$9oJzw5}p)lqSXji>+t>l&4+Cp|62gT%?eZPK5+g
zA;ZWVhCIIu3=(LutC<N8<K9S&aYj;eFzz(N{wZ5!j6QG8i`Sq?b8Ee3Y0kjHd1pI@
z&JG0vt|<6jM>$50Yy(R!jp^W|-v~S+OPFDLFQLG^<{;80)C0xy-#0Lc;uz2xH(pER
zTbe?g6cB&Ig!of<9Oeq8#_J8gEaDdtTKhlgBW@h+DkCsX+OU^e{%4yVpjg);PA`{D
z{7lVU8smxib0P9F2f;=T#v1s4vUC$TayJh=uB4AZ6SCkmgsO5i(u3xqozqS;bztTH
z`fD1LpsJ91Ml?Z7Q(Y^CA@k2hY|`o(0k_Y<vI@c7f63o=o>1A-`SN960QL@$l|%fz
zVA4#M?xq{KIhxx0-gp6)V~qC?4S`P^DlA+!bnyq=j#uLSYzgkOYjR-z=^JVDteQ@R
zn-SPAw0tyt18w=Y=~YyT<2HB7@V0js5>*fz7KHAG8HMhKL6*);*PBT}#FVZP@qml1
z$wFBtnO3+_bU))UfA{wJL!%=~Lxz_uG=JUxdRv59I06`(BAW~yFCm?LA#DBA63P4>
znAUs%q4wb2&w17r0V%m5TC~?w{7>k@;e+Ui83!u0r{hU`8}32i_+cH5;j@rJ4IVEk
zBMyFMGx_*1Dl^GT=EuWfFquW}C9c6W=&e)fu;T0~5p}0QR-rw8KbMSxOfGaWR6&g!
z(bNw#;kw_2wuWH4EB2nH!-0dFS;tkHtRoh9@w(+uGIaot;@$nzrSyGvz6aE_sc%=<
zsu?}jEP=H%GC?IuncLE0e9D|>ZjXB)CR|`Y#mlW0c{y`orGV&J_EEQ=W)Y>0X?tJK
z@Jz@bO{qnE7xqT-DQA)|t**k+d9WQqETHXjSoviNgsbAnTNmK2;!|k)w6h{I;TS8T
zK;2DZXiZdYyH8wM03++j)DW4b-zO_g$sc2{UO0FE#-D0D`HaLf1vp?GA1G*EV9l?S
zD9Ch29iq1fN%Q)VOe<JiTPv!!`>0sM6Zot2=#Tp=sloqp0lw~EsL;tmtt&pKR^E<@
zp2D-TNd!T+z!x0ZJx0!jHBATd3vS0B>K%v$7dpiN6^a5d;^r=F<U%UJ2^=z^%Ky7)
zH<*oD>0EK;iLX0qD_?)?!Lu`<7M)HsM$2KV<-b|b5<)M<iKP`dQcBUjlFdqij%%Uo
zf(3W9!_#4}t*=+g{oF^|mKsjHkQ&+r7)%|>M*2V5^qpFf#DwCM92xAXOMoZ+R9L)p
z*XOzZhxk*EH0RJc@3SM~^#~Gp$IM2072IL;K*nr!e-Irl%Kq=^T!*$TsK*T{Zs#Yy
z1|J<AwJTDShgc;vaF^jgJhLJqZYGGg=$ANg!8fNkPQewIiHsl)S-Y8l8Kv(iuQss<
z<Nydkws^yXs3}N-R1x{Q=tk%Tn#SzotMQ0pv~;mau#++xW?6`MZmmYa%_!3?jSw|q
zFjlZ>{k0>u^6*G55T~WFl9CqzJO9FLx<4NT45<M}xY&T&$IDc3m(j}%kpo;5-{$kh
z;L*@=%S$Gi(F%1?<9_?F^<`N3F40Qp&f=-VpmO8sS^zz>8e;UhBj>s46!rKGTrsL|
zCphQv$(92p4$78_L|^u92hJgW`+IDhT(Zw!PP^sT@`UM*j7uGhAfRYedgUFT0&VI$
zAFlfG5v|o1N5$1Z?G^E`+MGd)1*OnH%EB%ydePLTF#SHGi0D(c--cr!C~El?V1n0(
zgeA-H<dvh&4{qe=`OG|T<R+ia!*zc?eLSmBheGS}X8R@fT!PqWStk@&1oU~tK&&Ot
z6MN<6am1=It>qIGh=^ZXRGzqr8FkP~AiutKnR{-}DDz8IH=S=31*hwXjP|c<Wxldy
zb*CYjhU1}Yg`jc~g$=KkKX<G4g?u)th(jtQ$^#b(e-W!^Jc<sKH1Z=xC;tH)#-I31
zpD_|<4R~~qCwT8)sL-oS^pf(j3>KhLD9dsuyal!)MErX9j$iKYvhI6|35m+TtJmK*
z`k&EVbp;;k1i0a73;cwcXpumL5u5%4dX>nxm78jzGU_jk6Tc&Dvb#rfi2Uo2D)tw+
zV_W1g5vp<7pujrZ>GO)-#&*=HTo*A2u<t*o|BD&~uex_3q-#=m&wC-BMmjVsXca8y
zOO&wkW*w_e5CMiS*k&Gh+w3bZ<3dZ>QRLHfT?OZZCZJp0bT<-dM@Am?a+NSXobKSx
zss*76Rk!-pvdcMHh;`AkdyiGftXVOt^MCSdXd;kF8c`33B;-=jWCh>|@%-7_RE;#U
zvk>LGHqlvE3nWC-4V6jkd3^c$={L*<AHu}rH{xnMBrNPX285323<1HeehCv+r@#v{
zn+rec1v8c1<#W<^>-={VRR+O3BrQAtpyqoTjUI_gHaX~3m=eza;Gd@$K6X2J521=g
z5UPp%g7$yn%!rE5jXY^-WV<KlqX`nlV83y`Jj8o7Xz4|=*Yjw8GW4&y3>q*MX!PwI
z*CM&n4ECBS1Ro7nIc&RGjVi6pE$83;;-jr9&Sd!3{|HUNHeK3MCRgtZ(CYF6k`MBZ
ztoYo?faLWQOpQ(JA(HtXHEHmt3On@!2$iQlw2#}Sv3P-e^9`dF&y~f0Jd;&_7f8ec
z$2=o#`Oa#vHrR7+KxLHXx-fQ8L*?z`&Y4w}NrCa6W6v#Qr#l>8wQg?*k2Mr&JMe~T
zeXC#da<ans*I)->Sd_r42*W`6TQ^-QtHfa!EFU_k)T#2S3vLQ74l!fvP-cx6;BkF5
z7yOjwiT*L6SE=q^;1G~VU#^y+nUOXQ6z5XX>_c~72l7v1>Uu+##xgQ{u{8sX{u`&;
z-rZKdcKR%<)Jes1aR9>W%UDic;{t}<t?v6jo{!>2#`~GqoWb93H1?AXH*yO66Cv)u
zVpn|lZx1sZ=FNN=+u@Gy-0QS#gCU>^6(_sD@Zzg@ZLdq9R%Z(bfk=A+J>~-9gitGJ
zzl9I7f)vD8^C?sV>R_5b?J817!+%Kw-#O;?loDgymiVvTic@?9uAcf85RCZlcEWhw
z<yU})Wd}ha+60+snm^O#L4rQH(kYnj;_t|Q%7m1cEix<9LO!k#v4OiUpd?<>Eu5_=
ztMbP(yVX&=Z0xDvI4~H+qF-p9!sHSL;_N#?l#;78{hicv1J<SU!%LQ17jc6J&-Y}!
zTo~`-!b+fQ2RuqvDZanXuoI~jwU_xuI-Qo^T}WOl2Emdc$Q}G@PAe+i&vnPi2*7}y
zaCz35)Y1o&JWt%0KM=o?&-ow-q8ml^ABgw-Yjr?cYVY;PktCw->TukuKA$5PcJB)t
z_B{B7<u~;49j8jGt?uVGAzrxybSqRnn{GLP0Tfvl9Q{=?9c&NVH(<WY9{kiKE;dKj
zgGi{HtTlQbR<hZk&KXXflY?fk`&^+jaz?<94%!p9`TFSosoeNDi6Y+#hxWdA43#BO
zdh!yxyLD+_R8m3nivN4vH#mU}xU!znNP|>gz5k1%^wmuiZ!s&z%6{`)yuBltqEWA5
z%|-39=Vu69#S)YGbj~37J8TjRQrQ$8#{_VgQPw;TxhEj!vUzbK)F*AcstP874SOZt
zfu|m4BcTxLFG`V-(}ci=?-wEkz+N1&K+^ct3V2qq1j<A_kp+gC^UjW19pj~?sD-g#
zuV~4<+g~#9WR^pbNma|0u?{|sXknSMd07Ka=&4<`<eXqef5)DjF{eMCp3Rc=G5@je
z)M3sP81vW^JGKn0aUENQXkvqD20t~kX`l*5@qr|X!7Ha}BF?x_CmipI7_&dEgYzP?
z&9VTjJKpq=_Z@|b1AgMHFa-U{KNjp$KXptQPNB9d#8!N+z~nNDuRr&oenU_$F?5IB
z{J7tWRKbap{5_J?Wa1;V|Aj?$;@6+iLnzyakIwU`EPcz=PD*3c?CS;v=7ovq)Q`P}
z6V6j@3cwYN);p(jljCm}AdKl9GE)v91f@*;(TBw4E((W;%XONM5H3mwPiO?$Cw(@p
z$Eoa7o%f3edB7UmPEo&iN^Qa?GSB4d(L+HG2s(DJc0l#dX>#S4vo7oDKEnt4s)t0>
zcxOrIz4AWWTjV!$&3l9O%EimuEYGumIe2|y-ikJd3aP6K{b16@Wg2Pwu?t7r^!Sqp
z!hJ^$w|Ybn(;+=pYYV?m?3#poPP^>5@N*S1O7k#&#8mX9oTpody8SrbFSXNim6gyP
z(=DE{eEl%Pv*&D(u^Y@OVKC*+F0YcjHdQHnh=L4s%-s$43+76JKGUZON#7fPsN=Pr
zex7qzL|_O!VEx+3fr0<DVR3i@W_!(hw|JB8@r|+v)eZ5;>Jm?*`Y@iMn~iES6Gj&-
zOtBJi`J)l{iSH)?ERGa5DU-_3wWt5=^k>KfaXZR>j3dkZw9=rIWAV$cGeb9fM={O+
zc^;AryB;C9wa9JR)N|VJ@+Nt3A`L?r(#2&Q7-=TezT12Ag{A*7!e~<`Sl~Oe8^=Oo
zc3X&)^>X0xhp}%K9dd=6sy*yy&W&e>$C^ai{z$A&jMc<Ej6om6lo*jOEO8EWW&&_3
zFc>L2zx@YBu3bD1jX#1h488|z1HNydA2%wn2%xVu`@H~bc{t$fRZ6Xhn-;<r_tG44
zp%w~f#np;|*G)#MiWRty>wK^=x<rF0V#J)^c#--KXz^dQ{`Q&!zIB>1MYHAn>X~#v
zR109zIoB(hAg53_rmUW2;x$UtE9nP6A(zNC`FIuF%;MThmc~Zth(~FY2Fr-SNnJk+
zz+TJsLF~yr7Jv%E0tl~Z`gdjkutC&EE^zH%z*QV0xBZ%ju2G6tuz!TGqz~0%#JEI6
zb_VQa0R<W@Ex^2FEK@_rn%*6;&?d)Ii`w{4<Cm$SH2IR%$rE!l69pcfM`3VUY^2By
zIm-ko%@<BaVC3-ubizC!|1fWXlvog27mg=Wsv7OmCSanYmZEb8qe-u8hqChY2nY4S
zj!TC4GxV#wNpoJ6J9emPf7z19tn!4D@ET?CPBXYX_v*7&xPy=RdXd_WDahv3bI5G~
zzdI6c&+4yOW_FmrXbw)*&3=I(4IV85A6!b-iqfuL3gZ!)u><zF93udlVNyzCfl5&F
z%EMlf$GS914Ir~}61oXtQU~0_>U(sbWR2jAz|)mWjM11KvO1v~9t+A>u8F)sEjM%V
zZ0^nuQbnFLFyfuRhrwd$Fr>cpjG@Wx`KW^_9G#_TfAlG_pb*k_l5y@6oYqJ54z2Q4
ziv|UiAE&!QWKjzhLq8h4i}q6pRXTKpZPeUV<DxvheBt3Kqv5Kus|G#ND^mK-zZTxS
zpW}4ux%p>!XzihM*RID|0jAFt26}eSeudFtTSZ7%qhetgK@LxC$dMo)m%tTrMPg><
z%M%zZPCgm$!`?yNc1+Rd00|0>e8mTtcjzWl`w5ZJ00XM2QeBVLq9hZcRHkM{k9j|K
zjmqr_5nfP-FOM;FOMcq;8ezBd(@qPl-D;JwnJt;2!Ez>^a5r$-d2fARdp}a|HCrEU
zP`MbjRmnV>`h95igSCsqrZW_Myifse;2$VE6AXDc6(wOluknPnvU0*}QN|?GR$D8A
zL)}<_DLQ!3Jl=I7+V+pf<XM(xr|qsAfA!7d=iG<4c(h%s8JP9oeE9HzW_GFlh1Z$V
zo{Bl=@d>?hDYviIrn+}kgJ&D7*YjK|X2QIbwtncYeYl76mF*(TD*F)c;S0BLBvlJv
zlVA6;sOwD>mkbL~(NDhE7-RY5io1Ng*{gDt+D^)T2Xs*>`IP2*MNj4(^xihwYI|>X
zMr59w;D7FOQ}xYkg_d67ldaT}8^#}$Kfe-)I@sLL#1uT{<=DsE<-{3UY2#jdg3SSE
zvy!-0Zq>dj_0Zk@<2SCl^}bfG)yl1RDWcK8g3=@X2CL52{Jd~tq;sGp40#ThFA!;G
zjv7AXh6nQRlaVJ!S&Eb_#w=IA@l&|75J`9E$y4!--^Jcr#k4|=O!$;Qx#d^Ep;v4c
zrssHNr&UjW;87jxoS|FX_+lkj<XXy{R2Y)Z9cNomTg$#(#V({ML-*K?NXixp5TkT(
zsOC<(f$<5}dDXWYt-IT;jax%&mCDJ#!1G+y*5%`OeVWUkc^gw@bdE8I6xceYvWovw
z)8l%|-N~85D00I4r<GSm`=g&um!r(IUDqY=X%#8npvV!)|0SnV^2l>ouJ!V}FS^E`
zY&Nn}Brf)H`!@BR6sSM9yU}_+-yI$F>CaZB-Ehx|FWVcf-d+V7QxB5120Vv@Ev4gy
zd)$Ej&2!~5@1R9vK1=0%fcnc+YFdvG(un-5LeDXVB*TFpjQ1^@^DmxHTCe35uFz;d
z_1?~{Gd>(+Sg(_7$>hSeGatzNY3t)B_mS-@uTG(Q`*QhuP1AQ*`v}CsiIRcu`s@rO
zNx!bHav|Ga9F6wqCO+W%ciEBkzs87NzDxJOFS88L1LmrjZDxgdOMRadx64A7Or!B)
zb{jt0<;KDCZI_EN>?c{@zWVTp;WzzkpMRM(eQc6Xa_f5iIj_8nD>a6z{GUDpF5_B7
z6>j+SUe{`tw&%So9K7sJPi}G=({n^@GdJ5OK=HO!dCWY@kmnb#_abjZO!+wXRwc`;
zq#L^{H|)@CRX;y%H-9?kXd`r!N!?nxP5P5^iA*-SZqY!$_>PMZ>oq5ljgFYgGfym=
zqVGElGci3{R3~I?onI|@A)8-oq!;r{R4dGURmP>X*(BWM)7D@>^S3(LS!!E*Khm#H
zvCe&kAM{OTdvbl&{C}@guD|;+CA7CGIY-Eg`Tr|cm{VXj4_Tl4^?@h5V_>~ba>&Kd
z!}0HPJCB=#y3cKQ3&=b!k`FxaBXf5<=pv0mGovim6tXuDxu+;8$FqD|{~YlA*?}gJ
zza1jn0KT~`$__8d24B_?`n>Xr){}4JBRsV67dulgRs*aSd5xH(Lza}CKg}|mr|d4?
z*lFg(GB!I|7GqeIgEAVU-;OX$HW=h>#dvSTjKA1jf58$(9bBzd@Xb(2?5O#x6if9~
zMRSkctHotql{^<GUKOT)=CptJ@nd{#hk4)0X#eNw=1bT8AI^ELeH5t^s<Z8T2!e4Q
z5SvwZ{5e0)L94nm+qz>jymkV!LVA9lR5SK?X9Tg(U3Rxcg7pAN(rw!iK)(;*<4`C$
z@v2#^|L@!$z$flS)Od3vLgK-U@l!W8J7U7^)*svT78?jvt@xc^_p7;M7MP2jpT-u~
z>r$;Q2mRJTV}zLaKGH6t++rgpk>g;|nE9GZ_k;2XZ+5v}AE(b^NxkeGF|XjzG!!u2
zBQplZ4$6)L4)xr-)93$YD%<mYR8HphJ*oM@Fhnq#ve^!@n?=rd%SfPn!d(-+BKY$+
zGGJ(6oJUf1XF6%;Zo#`Qkf=Nw@t|vaV!xg+=ahM3c&jhzZ=YIJV6}L)Oo5G&Ue6N&
zS!Jd$i%)m{oV0t?{5jmRz~7*qnJF}7vo-r<wJhCYqpd>=hP&@EZ~iaS)<2P7>(z;w
zr3oh}klW_5{C0{Hh|owct#g=?w&(qG<LsTZDZ}JhZiCfDucR4QQ8lgIuD4#!1;=XQ
zhn6!wCoXXE?tHv6{rKyLcYohdt-rJDpFBe5zWMFpNfbJpQm>Spel;r_LTb|P_9RQy
z&%+-jJ{XjbF_A})sI_hdT{_44XruW?<)2f-D<;EH+coF6YsRd17tZh0SKpJ>jCt|&
zfG*l1&StH&)ieD3?$Y_;VjF#-PPZ5C1F51|MiRpC?%!eUof_|T+Qt>QPR5^Tdsr-%
zPs-X2r>9!9f}Lwunsj5Z>2k_v`r8GA5ID7l7D8N(Nwq?QH=%s>{o9kW-spjwk3QJs
z-%Y^jPm}#_Q<2Zlq`N-hhxQQA?F&O<Og9~N-sn<m!GA4^L2{FN-aBPqxxXQrtM__L
zTuc_T4Jrhl#2i1r^W%Kt!0c_w@5VRX${jrx&&uT{*e+Q7inGp}E+7-`QcBj!=WY2^
zRMa_sDuhGchj<O8KZi$qc5~XUj2LG8I=?%)ODNy$dTZ-3WT+z`K=f!X_D~;{wlI2&
z2^%jiVSlt3kyH-zHntmHFn-_=*!7rn@v=(&APYfT+iNq^yR5t5%F3j4>mw~kKh}W}
zqC@+O-@kvS>wBljT-IAc=9{dMdCWP+eB=tGW!JLc@14?4_zl}{nR6uKU|VXxwjNuP
z9bM#8qF&T*Bn&RT`Ly|F>)~_vG1j5^$NDtI+JPv`s>E3Dg+H11Wl2bqR<@66BFBn}
z3|39C@A-djM)-u3{`F$G;QhShR-c2mW15d91aG{oZ7!LG#rHhf*WVxK+FIXNkzJKI
zmEe7;PP*=zrrA?ebE4JcmcZ=kXYDWA8cru|d{Lf=6S(g0$G-CV5;v{)dQAj_mX!Gf
z$CbXH-aD<kkIa&^J%8y2-#b;HEI-As{iq4=38~H7GkJN3n@weB&1nU54n=-y%eC{I
z`CNDnT%qThD*bi+ZKCh9@2=NJ+}L`5L!Hw&(suL#5?Ye}&RRu~%gy5hND@+(m`=Jd
ztGj*fDf+IxU8~I{;yJDU=FoSGq}N<dTi=JhhIg*Knrd;Wd@%8#x4;BnXr^tfY5!67
z0ZG!TP{I>rC9YyH>#=PAclR7FmhZ~fE>B*Y{^IarrS1CePsvY@7Wfxu;U4f5k_HQ!
zz@cTM_uBbZS8H;M&^&^dCx=g&Cw<lS`vj(r+VBu0x!5<`0>eurgSazo;Yt<GK2{fQ
z011^z1cGBqLnB7ez~hPMQk3=Lt4Ko~!4^+&^JQ0X9r(fKg8QOlz!!b%@9*KA-^x;)
z?sLoMT*`kOkHCDbr8kr;ZPXM<*!c~4V#=>}h&!cryO4(PK7!1x==IS^>UG4o7Wihy
zg}suBmqgtAn!{hG=zdFly}A@ze~#zT_zCux17pm^Qf-MM9-~wUS0~)~VV`C2p9Em5
z1ZOKilIoQHA@i(q?6j5ls@02Q3?ylV$yal!dP~FQx3>;o-`+8JM`3GVky8=jqkaDr
zIg<U#Gr&IMxrdyW|LiW}JbG;HWA7N=c5_%ccRAeJ^`>ML^XF4E!+!^dE8d=EF>$)G
z=tF+aOu*#W{qtkP{)2;q=^}26$Ct*e-d)HYMc?MNhoBd=Ci@7xrqKD4la8irwcmQp
zc?vd)HT?t5kII-SKr61Htx{52AhR7B|4>|UUl8l=?jB71|M7L*@l@~sKPNIuDTzp=
zva*WEI!;*?I>aGcNeC?~+es-hDmSAtvNu_Ua4KbHZ;47Adt}ew>vN9orF*}>f9|7u
zobw*9_v`h1&G+lwu-Lu2(pzP`a_d%MXV<k?u{bTRt63kPXunF;UHVeIGMy3mWp!m1
z3r_6tN00u^=$++0>go<Eos1`wY1ENh%N6XSH}0Q(nM5P6n8;)BL!JWXb*6ntxsEqj
zu@G8p?AZ>nsR`@gWSe7OZ8(Bf_P-f+JUQ!V^KeWq(i8i-xQ3+4R{1%5O!-L;0tv1;
zF(%F^T=GJ!@SEt_iEeS!5S?qEXyln|_9^-!(w~DOreb>m8fpp>99-q`S=bxq*yrw*
zo#*N4U$HBB5<XETf0^xN3;DWOTH-Y7B`$7fWq6{5(&FybOZmy*mk$!0Oe^XVxottA
z9j8lAZLSO6UPJ9V3X1R$=pB7MDVM?o+Ofoj%R|ahsnKC3Sh*jZJ&l3cDX>J78PLJm
zk*G~1hlYU0zmXWf(HINIOUR*2emk{mCafj&da}es(!^5Li`(rp16U8|9N|WBv;0we
zDwh@}I~fX;>F&f^wVF-B7bA}bI6Udyj7HpzC;hTxPCtfK?`Z>}HM)3$WwSr)U^2}w
zm<soZrcTtcf)k~xjUkw4$)}i-L@_E4lb4f^oo(u~MHp&}u(~;I#_f8hxcPf%YPnP|
zE9B9(h|)=x_7SFx+lt781KZQnG_BR!A4XV5c8#gs+(KleKpm(PBL)rXezKbTmB_2L
zR6JWGj0oTT?m2J4YHHV-6F>ZVr!zd<K4eQ;SA&t|GCCnHT7W`7zv+mBHC3O3H95$q
ze;}_(J&G971J6+#4L=ixPeL;>?OxxA1=t+yZIoS8Zi4KHWd&lCjYuy-&hUkS>%>QP
z!+Oghtsz=LVYlkND9S#r%xiT#Ma=Ma_ioqyE!950vWQK)$6pH4$xqB>hjc$&HK|JT
zF~fr8(!XCag$^yaIk3kr#oZ&jX|Ru4onZdUG>QtD{GO+0t$?>K4`LyBVv;cR@^^0E
z-g|(94dF9Y{iwkzI0MUCuRP~jL@YFx`AQ6-mrn&P&x^Vivll#Pp70+i68Os`&{E!$
zU_ezlYM=EkJm>qx$4^~ee)}BdSo|xTR9c4=u~7%pvKv{oTGfk2O!)OOdrU9~zUT#m
z&v^2aMDfj{ASvf@l-bGAHzk@*_ADQGdqn+i*3lu$Far8b%BOmf>6e9pn9hm1hf$TT
z4&XBoh*S{R{3#tOWW3XO)@P=EhldsWTt{nhuxpv4tMaX%PjyD6`^Q#S#(3mrOyycS
z?E9RfvqEH?m?QflhE(gXqbP%EQ|XjHEvpu&S`NSF9}fDus$kDCd$LcT110P+UlpV*
z^L?kXko|07x2zNVr>iW^&~!5_^;+$axCrmzklo{ZzlK`u1d?&{arTH-Tx{lpWixQr
zoi=Iz;B}$%s}h1|zji+8W^{CPWERe?G_p3sH4p+r41R?jsd8R;&s;1rpA@-}<kr2C
zAGw&{8&>Pe{lXhOPN&I)>4t;@IUnGx{0Ye>w&JDxIB_4)uQCX=*753N?EY6}UkT_q
zELQ3iKS6p@7rp*SRk$;8SFb~0tG^qYIn%hMe7z(hi&0t_Lc(9=+euV3(MBl>2VM#j
z61FlwPHz_s>r<v2)Sf=GBj@hNw><TbWqIy8HtT+U5)a-UP3e%)Y(X!;<TDA?&tDUS
z|4P|1qBWF6YwZXdJygL}{g$5)L@$!CxW%Bo<TF3}Gb5X=h(4(38g+Q>mQVyKfG5!L
zGJQlv(zq6(-0W@H9j8B3(awq8wPitgt%;FgJgMIdP6O5|LXDTdzkS5z0$-2leT`OS
z`0%f-e-&-gx8EY1U+I>RS5_vsGV$@)E+i#6*_41jE(rxQ2`wMbEoe$K1qtymG99!g
zoq$2qFq+3WDf8S%`FH(&KfafHFuqwBObP1z(kH%n+I?Eb<M{kX<0VDagS*w!%<p<>
zH(aKO9Pm%tzY~xw6Q!@OPu&q;-EDGT1x^4F5n<GTm-m}cW~HJ>g*32mrNinoyHZm{
zQF3Odisg9l1ur8qC%#-cC#ql!P4e_%`UDHzhJt3dhVEQk)rgB-nM+ZI+lW;K0LHQ#
z`<v;s>*228M`x<Gp7zeYnrQCgqJMbgv7cpL7z%xGpO_jqu=oYAxjsqV>L3|lr1Jw`
z=QCZCQ&_G)<a~bD0wl*-GVR)tEg>C|{xuTQsE%DY^`<1aU>7X2?}++wBIBv{sB;Cb
zt;umES;_oM)s6wpBlBMSr=oh_Zx#OH*k-)+(fAPJ+z&-muE^4A(M~I9-hJ)D`e?f_
z1V;{}Yce7z?oL6$A$sR;1&#cpg()#xOR+td(!KJotA{m8ZPmTBDxytGYhULBD`5(h
zCZh%G4DI~_xy_XB@!~EWL$w*p9T|tQ?vp{Y;p4zoXMw$1a~*e@>*+_GODuEy+VRCp
z67N}S;(baU;RiP`3<d&U?cW6p_&swJ7GDr)AZ(>CKF+wPR}-yTJu4xoK5KTZ<Evfz
z)+uXeO2bI_?nhv^$SL&UWyiO~h`7<(2i84hZgp?X`;eD5kpf|A1~m8{hmm+%zkl|(
zuR3WpM5F5;HaCC!N#!b4YJU}B$u?ooz)A=Q{GQ$u6N0i4{Q1--hF{JwAk?;%>9ldn
zr<<0|;mQ5#aS6$PEh7sTz>B{}*T-<ro*DJ>KF+*g{Tb0XZ{gIt94nAgP04}Spatts
z!x=b<MCMtS;DZq}IXlP0X66Rjfv{T+1D+&)t79RF#oFYmdQl^TXg=+Qftfku9ko2N
z_qNpM+oNQSDz{&vIgXr68eI<4jE~ZB`lZrWDg4yA9T#~zLkWn(DMLKDqr-6-YAQ}h
zLi1~hmiT;wH-ioON%t9M4X%7@T?C2Cm}t?5`K$pi%wv{>HaOF|Tg9Awa7@)t_Tb`%
zbrhpURl=uUSVr-zl*GMGKHYgACjYS~k##1C^=!I6-=W1{Jd4?7yApmx=(M*e-#Anx
z>@*d_5&m}Wqwu+OkL4k3g4gDOGgtR{P&e-2`|G$`VuGqgK{xYw*l63}US$6~u>Us)
z>N(f%zxq6<eizTCWc29_ucsMPjAhQ>aLP9p7O$lIYI1rRR7o?X>JdOZ06z!sly)(8
zmJ63MmTFrP>ub>bm~F)!VxLa8y6jsIL#F2YIGgm7J>s-eu^t{j#)3I#Pj4xRxk5K*
zM>&*(2-*#G452J_{jqPtzoZjeZgTEgp;>8Sdj4r;$lK}Ef#uk|xNq@Cd8QEZ8`3t>
zE-J8b38){x`$cRjrsJYc%*b0)1`AeV#n)BZX}^oHi<U8-Sh<>XQatCCP;W~M#r@kS
zets&Q`eQEI%f=@58sB)jjo(GS{Os8`VtKGSMha`|p^V4^p@g<cH#!HH&i+)xs7G~6
z5^c^+4vYD;q|d$%iy*Y=3E6;To&-7&+D5Oxc^CP|C;qxA#-X7ZiN)_nYM7e!FQqJ;
zS+vg>o4bU!Ll2LR3OQJ0m1;ZpS%)gy?-va~ep0Y2k<F-Pi~Hagmz<lmhvpia1#Wy5
zr|O{j73=Z(Qcd9wYg@Z`7a@?<fV2n%L2rO9=*6~+9tqCC?@17f1o^TK1WuJKCe8%e
z+^XN2DJA@^g}4&C<1_W`=<K1px3h$=e=jK_giTG=L)ePO?^M*8pW$lW?pd=EmMrgO
zYkA{44zcFf9Mf0alsu(+NzXc#5rt;E=%@i-uGR#ujrft?x;7bU*3jn|^jxDOk!QL_
zd;ZL-blN7ZoLYlVNxW|v-Has&LDQN~&4seh$bl6k!j6}!a{B4f_i?qmy<OKe|LW6s
z!^;}tGW7~t9}(xIVr!1>%cCBn@hUxg52KubuC^pFbOr@B;NNkNv`0cL)pxu~Gqrb+
zcO9|?29|KlOwgp|4+*9fKIef)%X%7|77ye+bne*~hzseVZ5}AaEIy2Whb%htCf@!-
z<v{I+f|rp?wWIc4ayN@zdnDN$W^7%4Xr|3>xg65$`|$PN@frv2w7Qt9repFl^IPfW
zRNWugKLRR$fKbNXnhO@Rq^+~WU}WqhA@1`yo&)WAq0ZcIbJAyInj08Sw#oLcF2lDL
zr;She`J_~;WDTQn@6~WK1C_iSJw8LX&on;G`$b~2I;s}>WrfuiLp?DYdA4_vsjnI8
z>dcL&XP9u`27VOlMI|h-wA-!^6Ci=yt0wJF$;W<zli^#$l1gz=ku1d407gwFQS;1K
zE$>V0bq_-oY^vR@q`^V@yb(1|cW>xRq!|4srmiDtjR9|y%&-!&RmDb@Jb5%9`p52d
z?i<RAxDs;F->s1!XI3_k#^=FY9Z&4`BZTH<;&tX@PP!t2iL-N9lcgtze6Ym0$OmUn
zeo+Z!CC%T{ZFD<!&%ndL;i>?%v8(khRbL~5S!WEYmRq?j7qNy-gitoXa#yO}=PNG8
z8tZHG9WCx|Ue+j1zBsdL9zyd_Tr0`*M6OeiW39JC){lnfIj!kxwAZa2y138ohFfXo
z>;#9GNQ71Z2)Vqrl$M?4!u3xhRLafSab+aZ?xUI%F7VwJT%<lQs92R8qCY=oX6*NQ
zBCc&;e=3@!ap$PK_vQF_p7~<jE8k$)HJi%%8TTz}_~v|+`pOH#u7gZry_==#gO5^u
z&c-=WI(=Hb{<+BofOC3hiyY8o+i4I9jMqy(GUa1Yia6KkBf*2L1D;pK|CUP9Kjw5O
zs-8Tfv)S4k(&YPV>qJi4u&Wy#&D>Pdh#vepG#}iBY97x~WLcZ=!IZwI&=6jVFw-3h
zJ{|7;kBB3KZ8PFz+jN%gc?NH`PqMVv{pTi|bp%^maMRJW#|9f{z@$++Vew2A#(4!Z
z_}Ta7JSPpX4Qpa$HFP@W`#UvCSCSRC0X(!gIc6$^4j*P|8k5o8Q^2?2>^^@KAMRdO
z+3YO{n;xF1mx9?<&%&n9e@GG~Z#o7w!71b>@;W;^yFTyDiH*DF6oHsdGN!#p-SA$P
zZH830)VmPZ2iz{<C&rQ;(rmj5&@((3lM+wQz`Z9C94n#)iYSeoA`QI`*fxx+ictNO
zSJDyq7E1xptxoy$BEsVT>jw-h(eNz>WyR<4EfEc{czm{}UPI7(BVE@GQJ3WDiA{KW
zj-;Y>0g8U%|L1o}!%PF!2cJul0E-o<{@eOqH|ECaviC5IhfR+Uflp4kzyt`_+S&a7
zZX(nhHX-&&Lm1fvXq`xnZ9z)w&9;B-MnnX@g{ObehX`FIC=AX0hHeKpE@LC5qmIBY
z(l3XHAe-HeMBa99{KB7eB!cy(*<HO8`Boih^n?R1$mab2qE=`w>>~BPaX%nIT?RpT
zbyApTb-HAw)s5MI6^ZIP$*;7oE*L+wdg0Jv8E#xf_5Tuuvo0PE>iM%8Md0?GGzxWx
z`3Ds0enJqSDDSo=MPyEGeY@IJq0VLI|99Uew}3i9Z0FfQok9-Px;#E(P}1_DIMaBA
z5IuL#kL!p-0#cfR`SP$duc#<3x!W}XA3+k<4ZO*~5(oR^y{{I7?9YIcBdfi$v(qsv
z(V?;VnD%qJIcpmm_5o8Z!O9t78_n*E34I5Nr;(w-i}b;WkF%BakKv!aXgvC#WGAWs
zg@r9!k+3(ssfQA7H%py75SsU_*vAt)^K5=l*6d<J-ol6s`PYOiR9Ul<mMV8$B&501
z*IS#v*Nj0Fi?h)T;tCz2^bH;uG#y9>a0teYRg@63?OG4T=ba<5Fg^~{2l`daqd;2u
zKbZgoiEsfl#iwKEJI=s4?4g?c7JMO^YcV(dy^tHVYw+}$HwhmrdbLQ)GC71&oIOxK
zmCnx`a92?j(FlJI$l?P;JiX;C^2fS7%#7yn#K1HJ;SUw>>C{4@+tF8yzy6eaH}=t}
zGrMY@#2^F_s)g-rP(7aVU;M%n0|T-st35{syn~sc(xG&AaB$#e8s|$szuS7A+2y^g
zpelC)c_{~Xl?!$6{XELwCd0K$ZGEo#H&>fb0>2z&Iiv<*5{wS2o!MZg-I*hErPG(%
z)AL@V<&H;YsPCqTUz(1&rQzB|3Q~XprOe5J+A|Yg+cw1a`v86q2vj=?n$6E?DkmI1
zv!c7aS$L;|kKBNGpC?vhI%28yu1m`cne99zS{-D5;Nq3|)$~7kn!th_g~-{C(=ccN
zr7!<rY{=mE#$h+yc!l=ZnUF5t1wG}|b1HuleZw)}wE@rce%c)a-t@zA7OLvxO&e;x
z893OZnKS%pbab?MP{HtS$R~+OJ<v1%-p%h%Ht<CNWiW1HuN48p2Il7GJU$t5`pnCN
z78Wm%Ajy9+Jb6u90e=}u`1`~R=Q=Ypqt`qTa|fFztdYc%z8B|qNhM_HYOdzFH9G$X
zLm}hf17J&1TO6E`EuA#cWs1xT$r_aE2{Md0@?K_whkqtkzY`yK)GrNg5dVuM|M^4&
z-1Nwaz?b`B9uph9wFtLMR$fRpGLmt`k+d(2ZdavKbHoc6|HnGkwi*RG39HCkh#2#O
zsP-GG9j$pGIsA0Ha`$rS<uzUCTn=3@s6(ub|2>X3;>51SXRTFD+K(C92w63o5zKX&
zqBr4+%uO3B@BNp2{k<8BlK?1G3JBj5w18E5a&#{yQ~}3PMYAwByTl)pp5Ex%Y+MY%
z6-e4AJ>v}6g5QiX)R`FI@5xSf#y)3ypbFvCjo;pQ$kwMoi!%<UBY;7%bvJIp(#j_#
z9fht+p3Tv9vmnUFTmHYlS2cj&f8U(y0^dqJqOSEZ0bak*p@@uDKR=~E9Z?W--Q)jV
zfHyDDu2iF*8@^?&9#v&lP(U*$pdXQvrrxmS{`)vG-S8B&rLqk(GY+@ljTHJs+_Uv%
z0>7D-U;q-=cdTIn*$XR5k?Cd!O|{+5aos$bkU`nwTfVGigb8Qy!^=vPu9xP|lcK=?
zI@pEXU<YV+<^tVBcHz^L=U|Wlem}7%FB??Jn~5^T3ym5}UCV0xQ}=A%bGrLoDfI&_
zy8YuAOIADu7$MtksEZ*<G<gOh*NMiNt#I4GHtDPA-Mz8&B-)xtp$xw;t0^mE|2Qf9
zF+9z5>7JCsmnFtTD|=ldH{nykNSh~##XZE#Z83~b8S%u{PZ|V&7J0cwtURiM<ZEpX
z(&2#{&zgZH7$c+k!13S&Pf6NhjoAg+W=JV4Fu-o*+m9o=J%txCx}3XrTvxkD`ApT$
zbK*#J4hA{Mz)QEsg+byDzEJ_8KwBtg>1%}hd0Ez1+1aMFzW)B+Th}PzwpGBTY}d%?
zl)2Jn4c1M!GMZ*-^7rF!A||lymz-xXS`kf7(LTji3XWzM=de9O<g>hT<WY&lt+At>
zNl`x#s$?o~x{n1$2EJmF-TYw8$;oVC?VL3C@NS)neHkHAF4X!{UO_X0X+bFpQJ`z7
z${<FILSO82U_oqs%2&~RzC?L4mDN%6E(d4#d$I2KM-p&&db-OV-L>AggJpFXrrLW<
z^2NxUR9Aq}%N1d3t1vo5KbG22E9%{H|06A}>UrSwCm)hEVKfVD)MtLQ4V7!KU~TT}
z*R4in<SZ$yTb+|(_Q0#SW?IfQ%Qhut`YyI$O5y9-y0YQ5%-<&gb?r2sg<$PrU8a>Y
zvuxS-@)$Ym0s)H1ArpK(U5@TYfPlcm$CRW$ioMuykldWeuM2pIqIi4@6`3PVZeleL
zXKLmd-}OLTio}Cp5Gq#{!#_h`60+FpxQE{5F^#{HY2C&XEwq7Y3A~h)Zc0a0SwU<N
z0*EmrvbH4^7<+M_xVs4-2I{cd6-7kDpOR^I3pO&9H5=|q@duuh)EY}V<~y@KnF#|d
zL(R^V`N&xk%{?)WsGC^E&SYO%%ftKMMT~xW7e>o42i`(p7>TjsP&V2V75DEAYNPn+
z=JugoPxw3v+>m;Qaae+(#wqiYycJnI@K;d-4bc<lxJu$Ka=eSU8a%^=*}wpMc$2?t
zO1xmi3~h1s4en3YKV#9bInI{6qTiG>@kHH0Z{D|CZ7p=Oo#rpxH=oU1ZQ03LSXWNT
zl~@YSYWRr-V(bZ?=|_ycQmT&R@7d=FWs?Ssk(UdUH>lhoW?mOdFiMMVU(}wmoS#w*
z<1Y@1<c3ad$dt~^yoJgY+Cx5{m;~FOW(+sF@EAPWgs%Yd{*G&m&}=*2W9pRd@Tpao
zyi+nif^8FtGOIF*jr0?CcH$;B_Mgt*L;6!_Iw6@~rhaIce~a0y(5n((*~aiA&Ppb{
zU>3>mr5!=^2@PUT`d(?(<+2K=LkIV>zB^P;lHtF#50nFP@=pt-v;`Gw9nt@)q8Q2A
z^Sd-qTPTA`X5Iv=(Q~4NLmRkb>Hs7{?`K(oFq&|oY_ujUSrM*M;C_5<BaY%xc>6nO
zqs%aJJj4^jgwHa2^L<}?+#5=l<iw8qoA78*TjVMm8dZ!*7YNiASAp8&c-pwF8_$aK
zU?x8?Na!<DVF+#+e|C-Rr>QFM$hODj4i2k?0*T##lv9GE&IrTg6AEyy&q$+9`ozp{
z94*S*GYU&DP`GAZ+(X&Fm$IMv2)PHm8aTezXTRD5kcyng<0SGSNgh{pbr)<=hcAVF
z<Eao1LC|O+7U%KZ!-hNqVT$TIyOK37q;=PD*vgAKd6ecZa4#Dy2Dc(Cu<Jses?HY?
za~0h^#}m#F5)z#Qf95;0T8FHOP|L2<9J{siv3f(Nwv%y}k(psCY;M!ge$sJY&@{ni
z^3X5FjtLJA7hcKUhv@rt7Gkx$NjM3WlC`!zE3+UTCi9Po7T%TV)PcyC0e?9lm_!H#
zf&V0?*Fh#^0W)t3fl-iQc))$10e%zS9kl4eO$R18Y}Ip7mQ0Za6dYByYS#vd3}mMB
zha{@ffynl4B>@NtF=Ze~?3<-!Nq=Q|gVD|8DeS<s7lX&;^HYWz-SbS@lK^H$nF7Lc
zcqNo#)s>Gvl}38a^m7*?QtHP~JRl1tizgwA`av)&JE6w{g^UIj$#v$(<dY7P$Q$(p
zqkAC%%8m-J$35&s6e=4y@9pE8-XPBVC;mr8J^`0qWYkq&<Al>XXkcus)NG|fl2Rlr
zQTpze`BbF&MYw$pMP`5s9ocq>;Q^;DhM((_%A51NSs%RGb2A*Ge+$X7@Rti<+nKr}
ztc@|nTVLS07S;HC2#0eWqy$!NzHPCOEZz`<in6fO#Q&tAe-w{=n-B}bb02>pMRL<b
znJ+L6pXpXkY!m%;CVG+b3O86i*%YmYFnJDNX0mT>R*1Ge3dHlOZO)5_WOB6qg{kME
z7HrH-s=tAJks<UU;Glt$9umgIK!K4N)9vWmJF=SloZJ8bRRO0|%V$Y{OTw&h7oKI}
z0E69^JXmN9W~SH@m-|7ogD6TmDWaZ9yUP0u;-V|o*RS)u!aB@P<fu(7-^R17TCI6B
z1x%o7FU!kL*t$46zMjl7?8^Jn4il``R2ASONl#=RSrkc4^u;!N0!wpCo7wr1r$tU2
znax@OO3gmtE^?lX^xyXUA;LF=ZIsDNe&_ZM3Uwaej5a)kF;N285@oeE1XmBkjGD*B
zBn*B1l<SKQHzMJpkW!T-vb%Mti^o+t#?>Ahd~0!N4HcSz(oKBMC7f-V-E9yLQN@DR
zb~a5`2Ko_koW_<s;6#ZXGX9uwmB3SC{9*`V6lX8ZN)-tG;kLvdS;HI^k5YHVDo0%h
z+X7qpT7-F`r11qO!?Wsa1ad9;K97E?0-Xjr%Ja955;=>vX5rx&uaFx~MxdjWHvzA-
zLFaD5Ye42^L80tBm8nJ@5B+AR{1cVmS}l-MCZ35BNM!u{;HOAqn)T{%v8??&Gnh5=
zCYU(>JMdeOvtL5Zn-xSW51coI(qQ+2v}kN0lVB~=j*93z|I(hEb@u@8n3HQr`rVc}
zk*z7IaP^DZG+2{PfOpkaZ>JH@FP!%Bkzb`*2J@?tMsEaqG_{I&0&-<^%;oz_VO8tN
zjsdYOvm<{pw6%$Tu&K*qd5_d)L<*67R`JosOR$q|=Q-KFfUoQG=}O9!82y}rMfo2z
zhRv#A>efOjoA60ArL{Fs-gJ;@<<~NQGEulq^l5$Tj*TYjX8f?)ZGi+PkA^6$og6*!
zFDgW5cvVC{+_*{;1%Rc4tOBIK0p6dcbP?Z{HAb05UqS8ij1n*2aFiMNj6eXi+ak{S
z48#;%|5UyBBSm;)lA#o;Y}~n#-3U2Uly32|#m*$fKTXBe#wf?}1M9Bx0WB>ptz4YT
zPotFslpuv48<D``T9(t9=|Zlmj@FbUG?dQbCy`dl#3|kAe*MRdl--19f>p;;BO+66
z7-g-S2H_UPr>l01!(oel_25c%fI_iUX>dY*$64{UDS^o)3+DzkkK7BI>DHDt!^H!$
zjV93Qb0al833ZggMLnt^!hy^$=FJbZoA1|X=3GSCSu8h_(h8i9NV1+XU3$}gHi49f
zPXYB@&43JKf#WPu)j>}M3ActJO0{`V<OGtvT)+6b7)<+C)svC?sXxRapGVo_vm$Np
z$fgBJBbiD%7)*8BKZD5-d?D`O$}d>B|2iw;qUmTYPn^lqzF%IkepEzZ1W;-Vc0WaL
z=1XDP+;!)7Ex<yM$C(Zd&2VziB6atL+*Iv-p=mXd5sB9r${7%bT^Cl{gx`gaHY*6e
z=h$G3nmiCSwJW&)SQmLFyyVu{DyLG}$INcF5wr`W0G;XX3rtgIJo3gMO=uLb$3{_D
zt&A8&$}5SW@wj&pQ~kldvOm~#1i3LLtbkjMEXqYNCBcMpQpE-i?P+@UbK|@tBYsGZ
zABW`lk*!2fO;T72&pYqm6hekrK`264D2;0{xxSJ$N}<O}lYrO)`1AOzLFt>QcM<26
zq!C(bg6yAj3P;?4U%T5nI;0vVrcP?jSm{2UTgk$v=tfvY{8G*!b1VK3UxDM~o!V-$
zfc{{*AVCb$UVh~_f4ko^7+}aG5h;9iP2_QpPdX*hRNo|oqKp>;xm`=ASYx6r!qRa=
zRb|4IiuzisuhtwH%0AxA;jb0T_4~}d$a$N;K*z<4JzD?|4u2DXU`9;0!0O8>JuLkH
zH-MIH*Hv>6-5$ft3xXw;o9c`0_-1_^%n5&YRw79;0I<bl+{4mOfaPKpEk$;8wkd$#
z=iK~qFK}4sOHY&9ZPbP#v<#tsfU%do&np8pq42;ul*jO@RkQH+`URTl*k%L<QYV^_
zA~}R)9(Y%1NQUP}F9+T`LNfL6>9E{_C-x-SD5pzJE@oV0atn>U#sF=~;a7GLL~ks5
zbp(EG5{$AB9-xb|eglbrC?HGlKevY8lK7>~INWx%oup|*)F@6yS_Y&T0f|f+QEI`>
za4lmhF(2`kg25gQWk(iM=YyFE@f?1bXjWyX8<7%p@y2=bB27>z#dia=b}Gzk{Bamn
zWt95q*VDP|qN4Cpv?WvTu;uqf0GEf$q(qU3F+S_jx$@H5hr`+qs_v`vgM*SVl{lbo
zwxo*=5<l*x%DNa~$L9m}MeMgXJw@A{<xpjMx;b1X7+N{6*PZaYV^DylN9Cr<j4Y>o
z>tD;mymSL%lHQ0tMPsc5np1Q%9TY2Gm*(;Z8B2=<-X6xPn;%;nKTjHDhwhhJ^B76I
zjr>H5fmgryh~bQZVkBLt1L83$P(8bA_iUf22D%H0>01EWDv4^bB4bX_?iS>Eykjml
zE~e2F>mZpiF@53TM!*FQ>7x0I$}O7z!|3R>T^w1J*Mmos#kmKv1&o^CA{VXts0O|5
zzMEHMMKaa1HBU>(#p@yRg9C}|BcZ4r_oL;XT=b(*VmYx@e(C&U_exblLiV>Ch=%%m
z;mDDrP(ze%Z{HfqQcTvSX@x})j9Cmr{V+0C0UoPd)hxA-`rqDDTo`kDEU$6ey!oi5
zNRGvPx}@}wH0*qR6if9qDZO_c0;GFXZvIqJN{EX!Ro-!qTWqf2&#;vpH&iVJEt}x-
zj@qU-JlaC_T~f>8Nb=opx;S&qHXmB8T@O4yGz^XnP}nZJv>%FX><vA;*^~!adfNjK
z2BkK`zYt8=S-!>j`#TMdFk-6GUC)2z6e&>6l*yEC-&HLW4|}x0{p_b1C&DwfoZrnI
zYRi(`A&|q2zeef1g8uZhZ_hZNzV{K;$!5pO0FAVS=aNIIFd0z~j7zJF{SfyiR8wg$
zb7@Rr_*N^D!hQ`PtiFMeQBT0!I2^BNHjs6dQ!)4yy4iI^F-j0`_ANqk+0c4dgXJw%
zebYWta-^;ZLLRf9(vdUM-P%MuS3MHMoE^a;1qiZXTstmKqGpZW)Z1iUu}8*N-Os(d
zb4$^Mfq2ej^jie?fpG3JJBd3rq!_p%0FT|_p|{AYZYaQ6q^KV}izySf8sP3RZCHQg
zq&YfvGEC`?T*y5e_y{Zgt6rBziz(FhO~RSe2^B|iW$)dQVMJsAf4}+r>_No*7HXdQ
zfF<?8x}ac%fK0^Pm6(!J_+!1>Fj2!);}EdiwRl<X!BcVRi1y*(;X@N8nqPpsd?Pe|
zE^AGOeOIkZcuBKcRwZJxJAZEWa30aLJ}-65Y&E^Cq*ENBYiI<V^3*4-Kx7ps@Ma~}
zvG@oukv@8H{X2FE!4}I03B2V&#RdUmp9P7UQ^WkmmU`t0g-!$2l;`o1_gu88TpBzA
zf7Mq1=<d56Y$UPmvNy>kv;)D67Y%ksko@G<QepFfCk6xZ(Gblz`C9kKhVw;ss<|5x
z^wn)VRwB*5=u>FT7Xu4o-1z*vE|vUarJ+B)WYykdOj%uQkgN@U5Tvm=ZJ2}>g0!Xb
zu}Q)9*L=z?|21s*R+@!jp6!8llJis2^FMD+9=OXV%-%FlX-9!t<rles1FM^vkodmH
zB(Q`FIhRMw4C6<6HV#9$OPRGJpy<Z4I>I0wiHC^lQ36Cp;fVe$0oBKfy5&42FLdt9
z|7@b<<`J2`fo(JiG<(A>(QJ=ssx(Np=Da5!LPq}wEKOLDLdiu|u3jkFtw&0B;WV}Y
z^n0+yWu29sAKN<bchTGGM;&GSzR{h-dpoR2MwO~OASmmPMCLz6JMd`7Txl!d1M&AH
z{96$-pyE4~!|?NNE|a%VMzV0Y<Cieev|^rSGFE@3xJp*%C*Kui)We_SldJB2%O7Y{
zf}4B~7`K#gU*yyzInaGbZEo$&R;vvH8JdWx|M2aV9B(91eSjP`VtGp_I2lP(S~_Vo
zQDWK!H4c`M?|Dwq4fKJ>9FyARwpCT|OE!O7kWenu>a|{pcM-+r5km;?FTqLv;E<PH
zM;nA47Abv`=Wc(y{A6uEYE)e1S%rm#iRj1OTyGgYqA%5E`JzpCEbK12iw)_m^r)59
zY1Hj|lTV|{_eFD0ozGFG;jpUe7A&lM3L=+lmG<jo3jdW-YvP~kqhYx6W`R9UJlHE*
zn3Vp3oksQW7tNV56OBZ3ocgiEdz+<0Z=3^u!_z|8S@njxJ?Re1KW7S>XJ@3?YlQU?
zoarFxt>l?fG(^VzbEC2DXVQr}3+K@uV^6i`EI)Ke(x{#^p#!9O7ZG3OsCKb>nv@6Y
z0RFS=ZrZz1YY{jp8}1V)zVo5gID{L2gE3y5s&CJfF4JTMH47YlN34SoDu#)dZq|>F
zlBZ3yBPrn<uu13(koR71t^5D1^*4`@KEafrZ0QzuPajX~&CboWr$oPR=I5P}-eG8S
z8#RBU_5%Oy<A&!`&ahbAj)_+!v8fh7*UJDuQsf8O>mLK-AJbOz5rcLJDU|c%O?fn~
zer<2x+s=Dc2=74Y`(9RlM-dFDl9>=L`9lEtZ6;`fvTbVtt3Sqq(e=QKSWqCEK^eF)
z3%3nMYwmQ4YfeYv_H*35jTFh6=T4bee~X|YmX_!PspDQ2MQ28s)E<gYLDJhMV!&Q8
zd->@R)cE&Oq4ILq1s)<J+^H*Oy_NAxc&^d&eBP7VuV24DDvYwB)MI>5@@wa4#WndA
zmoH*bjZ<%0qQXg%aRrf#s*A0b|Jpuzmbyqxb5w0I9V>*a9#*2|zko=zpXIC{eJQLa
zrl4cP8bSztFRQj$N-dDB_4HLi<X78pAl!)uV+{YK=-R?f?tpPsey}`9C?sgiy}+)O
z!3&+Tz_iSAiz$fqoT%ZTu8F|6h%66}V8IVhSJ;IvO*<C$SM=S2;X+>l6ZlaZw{GOC
zzez%fr_?U8z=q`OY1f-4p5!<XsrH$>wJR1=|CU3K7&l^qEub`p`nw?0Zym_$L~MWu
zECVyX{^s{T&J7U%FTrl6ixEeu+$v9@A?E3|e~14;b>%3F;<8(I@`+hYHV*~qfAlWq
z3f#>ZJn=3<D*FSHT7aqm(BGZWH5()0>nPH+C-i(>LaLMoC8sszCn1q0z+2CALHu&2
z9P%Sb79D>R)U}`0+noQEd{)&n;Xlc75q}2T*zS1k0g$Z3_NaxT%GIar=6loIv5uQy
zIu^t;SgWP}Pu{_6nb4h#Bz|})f1i%pzI_sN1<G8vmc9tc5APDs7tn!4HU98MqdQ1D
zX@o+c-xa!l(28`rEJcJiI873%VmLV1Ljws}`EQd*du#mBC)(glibl!9Fnb5fEtuC>
z;$KXCLjloDRYWL8<<{8CQxXofHfNUFo6bvD-Pu_mPovyMv(3;%m-h<ew}^O(@c1%%
zH|rcyQm+qCQZ({ai{?g-&OFb#Na%dBlN*UQBPV|upT#^~KNB5=U6@v*bPK-c`o)pn
zDc(|_W*dT-v&F<Q`+eEN-Ck<ctJUBK!f(`dz}8C-UEaui1WGDjqZ2mY8{aU-HXkO>
z8YPd%2`@E<%-)P-^qADAw90$^n)#{QLdRX#g(nW-d6eSpqq<CWK4_t><!Mqv^s6>p
z#MYKKCw3r<`_DR7z<@|hy_buMQs|lrhRTJ?;4$CDXHevBH;`U@2VK=s@cOmHo@*m7
zd<fhg$`-73K0mbYEnFf}x?DMiw+{j_VT1&3SWna-axq}t`B`vf0%rDzs3?g&w%bPj
zTP7bKxPCjkR8B_LqZ`xlSNw<e(DwG54>y+Wvtg~X%yJEROs@7Yh5FaMoewOb+<*O`
z#M35?KWC)}gB#b}E79F7Ds?;@(G2Mk-!8LX2ni89*lPbK!4i`&aC5V1>ps%2)z1Kt
zcLjy7TOH^kf(%5dAGy?v1-@!kA$?Co5r34@?XGM5iC>@E6U;7#34K_@q5g7<<D`|W
z%ciYc*Dzyw$z>_mx$)?ICBO4&BHH+u)G*j5sqFa1(c-m{p4XfnAeGWq>jI+H#AJ-E
z8wNpTAI!p&Gt>Ju?~(y16vFUzJkY~Bq?>E}EtHDsj_a12pB1$+a2pW)(ntc{oBnrO
z$Ub*7K>8JO+14LM12T(<G#-b?=Qv;NJ{g&omge?@(^|IBvfbr=$g-@3?dZqu7_fB@
z)#&C37Ha1R=0*Q#$WSEKkfVrwTp$7=k4Z+u1|9huCwn|GPm|G#CnHa2JU{T@gt10j
z^oUg`3JhV_=ESKXo6q?qncAyMrnEhPWKv+}%KrdK3RE0Q9^UAf|7-hGc0C5-F=VTn
zdE3B_b1+xw&BY9oYxJUQVVoKt!+#0;Z(L$|h|mvs+O5E#80usNvdvsa{G8anMMT?l
zRU)TrR;KH1<@!sN<PAr$!FbmU!q*Co|4l@eSA=*kv?7ys3%!Zo?Gs`}RQU8eVrsRW
zQ*ZY0z93aX()b~{>%Tt4*Ka)9f7XcMKs~2h5J<`4>TN7b#E!8V-nD!dkv(v;k&Q`~
zoFSbC7h%w;om|8E9|VUyGdD=*9fnhA(HEXpcKIV1wZ>n^Wv}_COCwI|pCzmRO#b`8
zJTa}Plsk0LE4SnO<W8E(>k~!c;o*n663$BBb4})LHxI)+fBB5N-`DigLv_7+?$j%z
zWF}khVEpF}PjF@Ok7xQOU8_?rFN?L}syp;Z(=MF5Ul+16AJ97;NOuTe8O;!-OxABl
zFyueSfueGg=1IepH-m^!s=k#%L6n6*m5ZrA@%(WRah`yOB|8+EQYl>q{i57R2ue5!
zWmDPYWB3oMK^+_+8ElWZ!tV{ZQ_NMT*l+<hFI~<Z-G#S*FFQo(BJkn(6WZu7lC<an
zE7Wff_uiOGShrUB-4N_UVn?y~DA(1yuHGlUf3VQ3E)3<U%EO(^%&L^G>bo5Dv(Wox
z9204SCa#D@&n60C%nj$^wfg}xU6qjnebzfxsOcn40X-BzTg{%U3KrGNgG*Kn_s^o)
zcMv_X#(PGQ$o97)XObG-qMAsq88R7%!M!-R=%GAsJw-q4DKAmIFf(b=R#b<SG9zRG
z-_-`erUJH<qY+NHHV9|id3+^-$<czV<ct-|GtlyNN(wH2pk_IwxP96>JK1!F#YiQy
z@b|hR45h7uqhnBFj%H3=GlOZ#Qg8<Gphf-iZU1j{_JDnTYVB`vl2^BhDzYI5WkLsw
z)nrk0`$NLmcJn%)={c~vg}Rhnj0BqSyL(IxWcuAqW?u>(W_8oclW9#ZzT$;ed(UdW
zZ$#P}F9oiG?crq?o!zHZO7gO^MXe8|kpsSrNJ6;IWX(1Gd#<*>2rVrwc$Ka5)>p+0
z$eU1Ai6Ww7{P$nkEH~g`?sd|Q>J9|LbCV-2>&$VAoU^7j)qh=#FvXeh^eG8cF?BNB
zf#w-W&k|>*9kb4K?jzTSuB+}@cL-3)5L|R3Q>(R<+Gv(c|0<N?NzV)0?Sk~tG)*`f
zi|{DyoWNEMntAJ6Gj_Hk{9t&2`Ct|-$kD2X^>%4ldUE@NO;Rv|mV_4o4SixA3u`&J
z-)t@gr1N2j+&6Qkb9;R_QuzZIpb%x9^zx-+_GbolfWdU!YcAhMwRPs|(=$((4K0t!
zB^$dLxh%N+zNF;`rE%ulVq*O_$|F>-wjUPDQrYYmJO6Ag3~7CuGkL0sKFvBY$zQ1R
zvScO>?!~#<SKnn~rSFQ`&cAhbTk@;0PpNcEtzjSBW^3;=E$nfWMfi^wR68Hzq#Lq6
z1I}eC{+2<byg0M_Zj;X6gHDnO1gjUMi4~4+crqjSqJfXhg802Ov9k=gLaAPLOf!1-
zh|X?G-(_vV=UP0&2Ya(?h}J7@=sP#eYCS{TN`}WSx6dv%sC7n;D9fx04LKT**s_zj
zKNCv%%a#yAmO_3YVs2H<BP{vXCX@E#+EhLrtiHoiM6(@dI||}|{if_c0$_bEim0nK
zBdImBs21sOlkPZTlwTuS?2s{RJw#Xf#s5*DO$i~(<i(EkXQb;9BF7Om=byL&%2yvh
z_fyEt;xRgw`2C_{eZswhw?^HY^=5-u=dsS0JN0J`+-94n|CwuTc3b9hv|N>({?_M0
z5gGLF49d-EV(L=q-JlkvCF_bZldQbpg5HdLctoLiw!?1Z(EW_z5SL67sln`1<>crM
z8<h7>qPJ>KINh+a@YC+<!3?>L$BUFfR=&s7*B+Hkk#?QSH1kmsmwX5=Sa3DJ%NE_4
zWiD}Cv$A*?S=^Nn_pb6{BcxCJIO}kac-$q~8oj(F{p>a1!T{&b%l%S}Q-A6$mTb(5
z<d-b74_R)*zuFCDynLeqZT6v%)<-X5nOZ_{7b{1D#&G)>Yxh2<y#47w>{HSbO*#N3
z8uuF)9mGKOaXTkg?6n1L>Q1{DZN?B>c11+&Fe691Tj$)p9P99zH%9enH}nnNmj0a8
z%BGfQ-VT4jERaSiszx-C=5|xN$Mx&i#kR99Oq>6uSSw$3M;5~Vb8Bq&7;*`IuA7cS
z%JwF$9qy?)Bi(@T;}-c9iDjwC^5kES%(^B%2Wn_=@}}1-sbB5NlZsdmxa5_);cw4q
z?V0j1KRKa1Zt6Cs7I8AuJZK@EeChU|%MYQCG2+$P1IaSDke_;cI(#ZcUKQIG%xhr_
z8S^ccXWpEe8FJflC#y(>W0GbyN4js#9$UvJL=_X<rqb-lVx^bMLo_ov*9ZN`OCSRA
z<T(zE0P%LCCaScocfN1C%y5wHeR)RQ3u8Kgj^{!HT8H1h%FWFs#@Ode$Q^!&Iyul<
z@XX_s;d2g0!N2kH_dik(W+o>Gn!68`&v;lbq>6j|oQq2GRb39ezn1eU2516j1O$!y
zU}l8~R!>|)WazGK2LUU$^Lt3CxLBdAGAnXTOm^1^eV(}=K?Yp`A=&>@unR~&aN<A)
zS~+??Hw!K_IKMT)+G0s}6ybgkL=FASVsnBvL83VpcYPtfI%vnFR#Ijowfc!|ae{7B
zj#pbx%cI+WwhYc3WnZ1b<LA|N>Q??>`c3$YG`p$XC|$jeisSSiyEEW26MTc+NQ5Pb
zArac60nR5qF<SxqQX8<cIX5~QG%sg7HP%j<GLdFT<YV~d8V-edKN~R5=JjnwHnfQ%
zsc;rg%)(0%j$T%R3Aw@sQGEBH8gJmFC&3z?kiBY^h;zCkO>yPR1kGG(_GO*DhIy5N
z)beI}#eat6*JDu>Q~@V^Dc=wHiHRB;l(EF^XHw%9apLZ{C)N{0Utr&it8%tq3(8}l
z-t{K-`&TEaA4xoal7Ip=0=!P<7zm=*4$1`cuo672PFbPcC_O0M_1J`X$o6Ou!vmMu
z;jCN;R|FA5$k!uCoBYii3>I)zGC@tPp9^DRigKfJ&Z@E=86LJyEfG?Ux{ep}k_(<1
zbP+QoSIKTufmDH&u@~3y4<nZgrm@HWy|U*J?rUFSv7n$p(NiwaOhHa8c&mlT>Fr@}
zF$GhGOV?R9U5#m+a1?e+3i0;nIFVtyMxzhA%XP+mwCU!z*{0^K73BQAN;m;q5R2#8
zj%Fyzt2B1(#GQRfM_C*WLj^4TJaSJJC1-QjU2FCzIbRJ$X1*wN{SY)i<#}J+$l)RT
zhj|4HPlWm+&CeA^5Ka}S!<jwGVGHgUbix>3%R|`+kHi(!S|>aSK$sjfj@HCK6=v%e
z<w-!0&WceAc56c$i*t_N=h3O}d``tCS8&Lca*+1&X1O8lIgPG_h@v90rCUdZbW0Sw
z2|z~t+=3?Qwsl4aAW{(s%GCQ|E@(f-_&^=2uQGo*<M9^c4oD9pr-%tY4<+{=y?Os-
zHL|h9L^X+X{od~)&}c^fVBMJvo!h+kcP#*rnez-3E1;mrj_GvB*Q?oQ1Apc@v!7x<
zSeJxh!*g7WJO_mZq*Y=PP&b7cPDT!Ay$Cm|BY(CLEB)X;qxqrDabc971nL|TdFZDu
zT(hdqKGjL9#dz9y^5a*vU}*A^)MqXHA3I!IA^sw8@irINoB%>1)#Txs=MG9y2}v)0
z(120rpNSRd4Doi4kBgC5x1sg&gqX{kNCC+tagYs9Pot2?lV=~?vy0eHAmkr{m~Cp|
zUpfWD_cLqDw;0)y_O)>jYlDd*fvf?-kT^u2Tj-~0rg9`%YZOf)P?q+Q1y#3c7piTG
ziR3WC`lJ6nmWeg6fGOM2k;5Vk_}f0kg@v7O&SUky@dQ^PMFCN26c;1KkB6_-tNrH)
zGzj9eP}y*baN!eM7=>$xv?~~hd+Rx^`0=!~Vzi5igQ72KqmWC4|Mhza5c&Yms9xr@
z_X6*5=kOVMDO~!~(Ly9YUUdt}fr;}y^4`_>=+e_XnguP5EK+dkzsCY67ln4?E!={C
zOjr7Fye)Z)i`Ky20gRP6J`6bAExHqz{8$mnu@<ZS&N}{Ys#0!2LJyDdb=y$#yAt)%
zc-#ik&elFs!s;W1*XoY688pX6hliW;?~+}>-{joj1X0qY3PRRi{3Rnps5hqhsw~{}
zHhnvNXA^!IwQ~x&x2!dlbiP@+u523_@BWiQL<HinHmzJDB6z8n`*F@&#BW0_6UH26
zpS(b`U?;xq-2Zbi=n>qcKFSU@8Pb#Dn3(AEeFZ9>1T@7-4#xlOuwwoxFRpuu30KF8
z$!8{fcy|NZNGS2CA0D=aD;k0;%YLi5Ymt;?+R`y;{(zbO-$mi;>G2$v(C$~5vWsZI
zX`}?@Hr%gVRok!lgdIE3jbxi!sgk$2ZRbGytrZj!Pa@g%avay?A@dtMh(e_MsvAg?
zxf3j0|I9*O{3zx3#*_(^)XHBrcVwY^2~w7F2@lca)lbi4Nxz8#6WGZW+Gn{jPyBkV
zZ);8t^{pST4avy#4I74GtWda+p0mQlmyT9b=M<1cc;y{*@W``y%6i0eV1OOyzbhM)
z4!12_zeDn1ge?$S5PvY%0KG&Wm|&kCej`IP(&?Chzg`ej-!ru{Rm_Vd0vHKYl{-&I
zXXjAgoti%!D&z|$^1#GwcN}$5Lo_rqsWJT*U~W4e7y|LZcj4UD5>6Nw)GJjTcb`k&
zI%Zt|zMb$7979eMh5o=ooyMuoukN@v{lv@r9dW71jdBbRL}5aAj{8LYt|hck<|H>?
zbpM)XIz-9?{WB24)qh{f$DhI%G&_GkbVrt(5&vx~(g4tC$xs$v%6x)!w5f0#bLsvj
zp|{nFHZ{4AQJCG)!9nX+(&4M-Bq+@^;9PYHN_M)k7QS9L384wGf5f8~ME6p>i>TNZ
z7r=0WbDY#l_Ep*b=g2E@%EhzLf=*I$d8*M;P2Hrqq#qT=xbKD`B0hM3+E^&Q*=}np
z-}8R`@sdG@9nyq$4~Cj3p8-w9G;FtlnQdTKuUG+X{qr;G;n_m>$(#gP8w2+Gj!Kjn
z)Ec$p!WsfL45WUNGva9cw^m%?YbL+0uV2&ak{1ednu`~7uFXE+H+?ne3mMT7@x=G-
zEjG6IRNBlgqcrCiPwbgFu*_cV9{S$4`c*aO2$aDl!@+2>>n^i~Bc;^IGNqBjdnB3$
zjtR#MWo}^%eMQKI-hRDrf7NvI+^CW4Ka>Go?XILA3rx?TTexuc@_t*4@Otd3%f&2>
zIsuQT3yb>v(xDaDHgI_HP@O#>jmFke(P|p@PbL2E(J}ka`oksgSeX1E`-#sH0T@b`
zOI7zBvyAm;kTBc;hDeLwODnVVZ3osJX`EynSPOlhd+jq^y8L$cbv2v4Jg4ID*~N%R
zE(Bjg^Nw9mu6A`utA-qo2%u3AIYy6m4z|1myd32LqPp_g@G)U0WD;(h^1Rz0bdMlL
zHF`79<<OW@t;>x4IfD5!)1qXZDzrwJ_LY=LzW4J<14lR5d6%gg;@(V_$bTAp!CxCn
z(j;|pke3Q6s6lG_FG2iG)Qhl%vXWn7w_CS=!<H?{aVqYEnvrMGuo<3NXtf!H^n?@)
zZ?^ozpZWTAh{2e1t90EGbj&~Q)Y<t}yycqZEGtSwyomTR<#1sO#(qQl-Yc9U({XRK
z-Nxf;?q7lw2=#HcOP0?MSex$f^e<YL58d2>F)_r?y>2^ZIK)qK%zlSbp=suXit<9z
zvtgfx@pHQ4=4lV5-)Q1#Y?h#_q0KFLKlG6C(s$+{c!<$?qJQRfiQ8nkGWys&fg1gY
z)RxZ(9!X^OmX@-45wzoMPf+C2<w?QrNkQrkyE5soH+NEIf7Uc<FIhc>{`F#qGuh80
z>sdTMnA5H;^zEs1txdE8S`(R|Hy5eyMpZu$qU4GVO-CQ-X}p(d3-MJGxGEmF_f!-Q
zv=ERn>mExA6KcftKtm<fh{?Ya4k3>?4%8zJxq~#%NygcgB#n+kn`LzVRD%lgx{LUH
z-u1=2BnMlw5}^WObT2%`DDZ{n&ZWG0IvS@M$KmM>cbn7@6Ea}mv*H{6!o905x^+tB
z@kwuy0u&J}eppnMBGRdbq6lg(vw=^aPdiSB(WKIi-F_+k^$f$)3;U7InmgMP3zpvt
zFP)n7Gna`O91ZFmgKlG2c5QiOc&p-Y1w5Q5mX8j$C+fKW)OiAo{l1IRwQ?$cbvf(o
z^+sm6v$0;P6AwP3pSv7+<Z`5BeDjB)7p<hUCnoB2Pu)xTGkWZmwam6iUSt`}y{YS?
zzM{K$>BbbICEfBP1<gXU-W*^pqXa(w8Uzi;_f|CP7#pU%>z?!P4pk5##!D1gru~S|
z`qdb@GSv$&9~j%2r>mTooqM<AoY0$|JDcTMZGS+&)v!wSp|WSFcH_U6T`w>;B4q-9
zn@XXXpq=hg>$D;i!hkeUZ1?5f$ti*~5YVN6T{FX{{`xAGbxC8->spPg4r%a4kPOln
zDF~i-$*ZU3>fNQ{{P||p7T4%ij>WdJJyrIfEeoLyVc6Rt!!fD4pZQxN`ow1J7YhUq
zIm1)M1qoSl8)Q!v>7f+8IYl29#gd3|w=6en9;EJY{S2TBjq9M=PwuW1T&{@ziOc#H
zj)N9I(~2(J>0A-%<BGc3jMHJ7lmT&9DTW^1!uik?J@f^<(sgf9F)ldJwfA7f(XxG_
z(%)SP{m^OE<`z75R9pE?{B^$C#h(d(i|3K8;eyNzlYA0cq>{hi)aes=lordtL32h<
z#=<9EvX;&bv^%KRiIpSnqp{DK_zG@1AXl1FAtZZLjwXlqQAxg!rup?7lPc+n9gSS>
zI}(t_b&p8bBV=^kAKv%`jWQ=cW@!%WxfsRr18F2iJrih_3>~c!rt1bT8h%Imu5-<a
z^&4->io8)CP=j=r=JuGo?{OlPqVd^D8k+fSmsrF$*j8SUtK8e0X@LaFxcMG=PzTco
zpUe6ng>+K*eR<UK1<F38_lfoX?h~Ly1F=Q`3LF-5lKn4&mMS8b&fIF!P-dSq5;!?+
z(>-HDT?{=3$({UU(RYvQFWq5&%jxu<G2%wAfJPBd@?5p0{+@B4p2qBm=~PJ#q7^;G
zfylqR!81{P@3*qv`Te!lEk!ZpTC^vj2UPDCi9@VV<c6=H>y6ZaUOWzo_I}8XVeoQp
z-w?Brfo^+7jcjZxxBSib>EupjRSBs;XS_o8wx6U1hy|@hLXz;-ddU8jNB3n8&1cIk
zW^e9Z9n>j#hJfw!E$SbQmI9$^@{=<3%q!)Y0AFPe=&OEYzwAm^df{k6ob69&pL}hy
zlm~rtDZI(61}ct9KUS5`IsLwUKS;|fXk1(M6_E}R1H75aP2Fg#H>UAs2xq2X+@I1>
zM7|1$bpSu}3JAy1z=*sZr#Lx!DufDi`R7C*e6mt7!^0cDvQ9(#-38ZI3%~pns_0(&
z+<gdXoTd{2?TPyxpD4=?YEaM26P4wjefhTCG4YEx-!O}cy6n`;9bK(YGhBZjxz`*R
zU)Sx*k)Gf8toM2Gz;n978N@8{ak_F40=?y=KrbN12+bU)l5{`~sU&n=WC5CAKFT-;
z4L?I3w?aLX_aSgKV(aUI>uGo2bNTKaPozpcoUbJDR+)7P>CKPenO)Roj-?*zAnES;
znmG1{oVCawINF5YpG0(<N+3tac8}Frk4S?X@2D!2(UXrA8?V_u?>vOOig0Unv0E;`
zYvkJNtbC-^>@8@AKib(3?TS@zRq21tdVS3u+M?>;&4o_FI}QO*B?Pf`&wZ*u8hH1*
z6!*J4ap@W|s>nK=(_zuGx?Hj<kq^D!3~whRjU_Amp_RY5(2sqL%;Tb0)^3S&);+{%
z5O{>%!oAt5X+v^WIR~y;H3r9$0^FD|P~KsW#vncs<pM`F>Th^48j3F8TY8o~Y#pGV
zBeyVyp1m{n!~O1~{ZGu%zrvnMvr&Z7{B`*?vgZmPc=&vN-}5g=jqKh|#R;n!jo)ED
z^va+;YXo@$U{X_VT2rNDwbf&}6>068HMWSL-xE+*dA~-Nf5@$vPNtg6TGih-)7LU{
zUTB0Ks##YJt?Rv`i<NLOIz@vxLxToM9{wKl&kEGquMt3w(xJe)I{$Pyis%m_^u|dz
zcp-sQ{^*8MtUZVuKY9kfRtHUXu&aIPCn%sEcK?nCJR)!Mg^))u)B+^?LN~KNJSzYl
z<kR>aPsm9WPq397^)aauyms?pW&deMzs%1=#;Xg5g6({5OC#G>x=I!ul~=w%FL2cw
z+1W1*KR+ZJEU6eToS$qfLyIFls&&m~5j#=;ZY1(d1$)~nJf5tpRZ($mr|Uw%*j&Kb
z{%cv9w~OG>4QvEFZou7Y<YE)2DFc?5O2qhG9<2@u2j5I>sr?#0{m)D0p9hUCe;qRv
z$hXga-ZfnA1n)uYV49i6u1;ZtI2K-VB)iH=@1mhN!10Uh=%xy*YZcYY4^A6~ld9zw
z7|U0#-3IUQ@Pw^Ep4q#|6}>*vHDQs%aLRoC@x>syzh(2f0;dOKkoG7a-Ry0Vw~r58
zosL#iIjeZ@S)!Jv`NO(0j^%oe!$_~*C#$ooE5?(tJhDk}3w|e;&)~_OynLOP_!6<u
zcCrG15tldAFy;a6##P<NsXauTS_{A8hW7oo=|>1h6|2G(6{9ZfPqBE2se#9JU%Xvi
z9>WHp7l^&^N`>RCrHQiEt5t}v!Gm!4tX^3uJPpWJ@_4DbcB#7ZNBY~z%HE4RTrNyP
z3+?P1B`jWvD^p;j-I!Ru@OAhubotI4gEw!36o^$yQR$wKu!ct$NfBmn_1yzS4<hVi
zG7JWdDaT2qD2i}`a%og7;AX>FxR?Q#5=57ewtRqYF2d&<u^)S{TW3WETTV=#%cZ~@
z&p$Oj1JSb?L&@ZR#Kb6qrxNNEPXA<f|HgdPP$GFiW;zLLvwQ#i_<gw-r3S_DGVWDF
zQHOxc$^^OP0cg&jUr}t@JzG4>PV-9v7YUEc4-k%{!2;pl;aumnUBE^=Z`*|#1;rA(
zrU7{8<5yU3@awl`-wtHBPp1T3C{8<zi=q+l`mulZ0%hDN7&@cU-M_oir!@*J?rP(S
za(+QecG6oI%U1>bU9OW0msuWy<#IvktTgy4BG-_1sDk{y<`NvYIA%kyf(rq!HIMJ@
ze`7iAO_}wx^PY>i7>%s4IEtIu==Qhb6!(dT#xe{B16jHn6aC;V7)(l+{F3$;`jytb
zkXC^pR4?+u5sp=)wYsj>vwUcZx;4;e6ZY<l??1~BbN~ILDtH$*GjS!T%QrZ&qpM48
z1v%WsnTKZ;cesyxg?O7M-%Y&~mT87hgc|Mn3hX&jyLpda5*|VDWQH(bs_*Ut5%*<!
z4^VGIG%`=1$I_1@E-U4MOQ$ExJeJDtsqoA89SuC=KJ?l>Wio&Gbku1Ak45!Xl7*QW
zfssiJSZr{8<583?MNW=R9em_=C2R{$u)aC4h}3IZ2)M#_ujMkhg}-wrY<`%PzQa!{
zCN`Z2CZ2t5m!JMlOi#XLPdT;grHVM3lBrz9H~o(TT`y1i>b#8}e?oe9V9xfw^p^wq
z%Olq#6+-(PRD{*siTD4HtFMlWvTNQK1OpUM!2$sTX)p+v4h8A%5|A!skysi<6a*}~
z8<vvpu#oPOR&r@!>H5u$@ALki_xleYS=l?znKN_EHP<=gVLrN_7Hu!R0+~gXazPE8
zW{)(=*!Je~z&ues>;I%l4mzED4>Q`z)AX4|&YiDbm854Lh(&s(m>WCpG(oj&;<(4y
z@gO&xFQ=Q~*{1mHfoqQ_4HGT~C$jsX&S04bpl`tic0!e(eg=8a<zPaA!$%1EVO$EB
ztjiySP6@&{kaN!U8tx1;Zvz0>3~oSv&|Q=vh|~V)sDCv+G8$?V|DR9sc7asA89lUy
zjM%?UZqtMfOva8B>GbqNDg+jZ6#{TxzRdm@`Tp3$rE8k@E!=(h-oeCnDc#*eJc9Hu
zL5`>=FZ_!uz<uCq`$qApybK#vOYSm*`_rEL)5lqsl8>|VODfGUV2AF%y;<S@kEeDw
z2$;+;ioo`Jz#n?#QEG*-htdWxbgvACRpV18O2>m(b+zX?_kVMqH%#7LN6y!3AU{|p
zr%Ug}M^6iw#Z5lz(<{+5nggK3|0HN>J7-B-^>-P7?nvFI->DjqbK1Yh&7f5kFq>cL
zs?#ri9F-Sb>z=T&bJYpq81#datc6N%I%`Xt9{+c~@<t!vI9@#cC<1yaMfS^#x%$uQ
zJ*VRV3C1HIu%&Y}s(O|h^c6wRqEXrXN%jgAt>S_@O0_q!v$~K=E>$+1k$^291!TI6
z19%9eoT%CKB$B~pIDnr1HTK@t!o&v!d5-GM-<^&>?6o}Mc;RP{Y7+ege#Rt{C=hvd
zvKvG&R@459I_y8<w1`ZwtOs7wlgp)1CHDPp$-(-l^kCQ`g<&RgHm)Sg$rOWNFGV8X
z6&+P>%#1i)<o!oW=Sxbzzq9*sRu4MW=^Mt4LV&x0^vLjhJc>l#hB9b<Fg6689+Ez!
z3phY?h({JX0oaXXeo2$0K{$^&WXD#Lu!(3zxS%smCzSfO8RB36F`-*}5NrN9z4HV_
zW||~DgrUA-13uUmS#8u$_p;gFW{+GSQi-y|AD;-Td`(sb2jMFq$Dqsatb?InE$fO|
z{HP1v2zGMLg}DCSZu$b!3;TRVCaWFfU}>ar27h}=`nPjbF6uUPTlJDbY{Jh<Dux#x
zs>ROVHrSihATQi=^4xIx0L?@1yq^5zUyPh;9xz#W?Zt=D6)In$J)8n#N!5al%#SGu
zAP;RJ(s5wtDAmSapS$nH6lz@Y2io9FxaG}jupNX98=>+}=u)}+Tpd1aoYFWeWk^Bg
zVT}mex1E|e2W{&4{wFnh$^6#cBVvC)Y;RccS)XB0lsW+YM{Gz4y2OldP-MtRTr_dF
zW_#-?b-CtL=1^BS`s49xFi>r_hL6M7XwJ&8xOefotC+b%Rv+;GZkqslbgO1fr-u7)
zV;!UuPUoBiadi5N0_y;VL?cwT2|>a}&B2}&8+XX?qVo^_n}-M0CQ%QkhwM*1ztxco
zy%lF`%M?(PA14r35u{q|0$stP5!?!HnW5Y$BgbI}n#tfM5EKWSE3JOy>*3T@-I)gV
z!F3<V+uDDKVyiNJQDpQwClU71#z51D1}pv5NcDk*8$@FC2di_1UGh3^iFN<G6S)fn
zAQ2<sv&a$W{~+%3pe8_eMQ&V*Kae5z51nd!=~~Ga^J4;zl9O2k+Kb@~zQI;$c=7=!
zM(LNmoK5|=vnqMrs^t&rrw%a@^ssn}!wDlC6LL(v4Clf7u*w@Q^Tu>KrK3o%CB`#Z
z4YttS*>MF<b1%kMsyS{Vr<@N#XAHJsXnC%nr^-@}10Vc&^uz|}^DBmtRt!kv47O_g
zB&R0|f!Z<msS@Xl*+dGlXC6=maFk8F|E^nai%dj_00bW`3^rUi7}K5fKCEiqB*lGy
zcUss$Q!oQB!cU=jO_5T$NHq+1n1djR6T+0@K#(nKC%c9y=f%%5VK34ysBpVIOiE@4
z^~MA6FC5pi<BEv$Yc5WcpI^*qLCqdT`qA6~5K%?CL(wE5IDV;3Q~OMs#g8P6g6bcs
zl62Ws1U||rAfbrk^(PsjSE>#oFFpuCF3X5ySLD!Dj%avq(P?yJEq!JcKo@V>=hL?B
zh5V91JJ96rcxkul{=uK?=kISZ`}qG>$P1{7_sQ~MIzR8Pspz;sMe1XuRmQ}^v-wj`
zk2T=5Lw5g~e|$a*lluKnKK{fZMMn~M;}jgx2N8E0Wbb^4Xt=SoDj>wdi$b}?=q}1l
zeOO{Hei6>o{RdXL=j01l{51AOQr@6k!d0~!n2C3iE1|?68i0ed1fVCKG%Mk_`D5vY
zj~G980D2~uMC(y*|1^!VA=CphmImp)GUcwP7C)cz8TyBtknzud$`8aHA;|U8yDgJ&
z_cGxLdaon@{UK6LbYe5vN(e<wL?J?NK`0s*c4k>>K=w8*5k)=l1aWN5ccw3D(Wei^
z^IaP{7jc)0p9!CU<?zThYl}4nMyB<S?9-xQr_o^nFc`Vv44*bXIuG+mZmUqIS9Udf
zy(c$hV+Bz7UpwmybE+YEVY8jL3*Uqd`xKMA*B;nzLIZ~IqF(LHF<Wi1X8oJZ!=INV
zey{Zz@Lu6}<a&OboCk1O3r^=(7Si?7lgDXM7v02Oa?hL2J-9+HAt6Vvf=?!K0X%Xl
z^cAljciqONT`{xU`5nO7jLpoMt#EI*x7gcWyL(;Y%`Ge&9=Y;D=G;TI0P_>ym_?}K
z?s4Nt3E<U_{PW{sD!laLR?;Lf_^9)mNG&7#-3AwWFOkd0KpCD#`SlZ%s4ZQ6N5k(l
z?2yhdfH!neoEs`DD9}~O)vsC!)z*f_ib@ynAM8R`?AE?MP2<xAjVzg)j%LT*3g591
zXIy(T2#u9B(4+1f5u(qJO%gpA%C)k{hIZwb_1Zjs#Z_&L45nMYsr${Nw$cCigIu%g
zQdJml&T}%pdR@=GHB(?^*<4yxi%eHrc*%zh5|XY53*1?r(=L{63Ui6tpNG@l?~q)@
zxmDWi?`~Dg`0POgEz_yyh}J%zsao1^t8Kz|jw3G9(ZaNXm_poeg;Q%Bb79i0@KDNv
zDD0kW_^r>VoT^h%#W6647JWH-4b}6nyyCp}?#g+F<*8oiBl)lKAZGLyj)oD!MaZ-g
z!3aJ~^wMJ(x^lhe%ZGb9Z%)Dha=-vC+-=@^aGpuE3ii)EZdF}~H&Cbgq!gd5sWM~y
z=lcaGAKtE1IrNts9&$3=8H9mVyFC!YtP0t}h8_D2j~74>^r&Fr{r*ZAP;VLejLtWN
zH`%odpNL)8c5KpNuUpXg(Wm@qM6scTchovac;9*S4|X8XhZ*ZofJ=?qR;OfnoMfD#
z9Aq%pl@^S26DR3x4Zr<03s@c0;EXQx4U=MYOQHxWv1rK8E*UBBcktaVH@GK7i|tPO
z^pQ{_@<W$Ae~XiTVfMBrx`jJtylQWKP-1^XV)<IP>L;c91Uu7i?3|_lXaO$wY26p$
zk^HYgA*OYfV=Q}Qc4T#HPwR`q^Iuny1$aXhGI24+mxfopL~atF*nDh1fWnCpiuSl}
zihP`_qf&e|0ThPJBXdS<H+1oCIYw>$wqS5$)YDG8#P&)NtC=hNn!~}~MwCgYR;tAT
zvr6hk-{<>#OP<qfWh;%W{GTIXe(xEm_+C|`xGe19_Icc@rfPklNvl|1&6CezW{k+6
zgxj@@-bsgDBPFG*vgd`*c89(8dOu|jLX@?@nvN+L1RlfOw)EG__cRMJU_-_Og%&Ex
z1|%CZ<#}Hv9GRpBTVe$((v78_KT4A&{iLdqnilur-|z1Ik#y77rIOQi=4YV3Zt~%q
z(<_1mB~H1!|2UXTI@DF;8cRFja3~_RrC&Eo<rxcozN(*?g4M%W`dB0UD{FhgSp8WU
zws$34(2LAudt-?%H7eBx(9o&0FK-Pzu^g+AOEP~FCVozfnYn7V#S{}4I&qob(K>nD
zuCPPGbZ4Q2H-_(-{TH@v&n3-P;R82x$Ab%#Yg*P897|+aSqJF;K7s!;6=fHBoYuMa
z)D=FiTsyTErJ{s;1JHKJnARQN@lY=t25%}OEw(XJAXy$ds+t!M+EzsGQ!4Y}+r_V6
zc^-J?#Pvs`uT;c%-&`L2w~IYO*l>Y<?87d~=mof5kt&jYZ>7DOLFa1C+=9l^Ga~V^
zF#FxB^S9UgiG8pEpSU#UyEAah(7~I#WYnVqXgsTH8u*So`XtHyJ+8XZzN8l-#mP|L
zPUcWrVN=o2k&1utf+aM~)m2HMty!3Ajj^KSQ`P}5g!Jraf%9bou2hfpQN68zCLz(a
zZQx&`F6b+rm<i+^dE)cHFjDvNdWR$(x6OW==YH4LU3LRg-k;`|3ZYmaD7!&iqC`cj
zHhglBGF_S8A|*tfWaxoRo`n=oiJr@3h-s}Cfg$;4#}hBk$NMXov+&$MwEOsYFIw|p
zYTnkF(QUnwgam@ye_gC43(!)jyEHV2uPB0vS5@F-^gBZMj0rv`V+_2iAzy3f!Vx-!
zjdmKq?2kPsZ-zz+=CSVUTC31MIJ`~xK)_5Y3pz*h_h=UnJL(j`a(=1pGY89pmv`9V
zg9`J^Ty9P4bscb&fuRX}&_qa_a}JO1Qiy`q=~HV?&~OR$BYzN&>uPbCjxm!6rqx^d
zXj9aqZrYhF)w=(K2K}aNtyd@JKz(xx27r5Euq3#<<cjJxRF4hGgm<5yp#;EB^=3Ym
zq9gpaakRgf(g38amLsl9g5Zxw@1ZkHS_?mF#tFHTI3{5Q!}Kv3xSq@~i{31)&o<L-
zadzv_)p`|*Q1=F_phA@@6UL$RvY{9T@RPVurO7#HFMj4Q8kwK{v_xHIHu3xLw{&E*
zwA+sSKo(agD|po^bW;Sl4tEl0>(b_8+zm<_o!EesGL|DTc^uO%QPXTrMML&YTO3hT
zq2u5`Z^gN<N4S8)2qe6^Fi?d4!osdod}{DPtK;P^_l@CcqW9W`=7#G-XoZ3ww65Xt
zMGi%);CEt31mMf1E(=9{*RE)Y#{MiY<KIaq3=YlOhF~gJVmnNtUc&6~_Ve@O{8Vqd
zOIYUMR52C`aHN=OL#DM9hUP+9P3R!(R<XZOG8$d`t}d-n3OKbWLZLf5vzfVXLKUL~
zT=FAs8^7#|6O+cCEu0M1iTp~&-^>P3G9i$whFo~}(|FCO$Byqv@aZ$JSRNgBOIN(0
z_IBa#INhqQxx0Edg!L#A!Q(>-;Z*#LwmkShZwJzTVPm|{^Fe2qTf^wKpD<c?V5!|e
z>t9qmO|*;MSZ<O!ebYz0*VoI3>l~xJLuhs69TWdmyQTM9>r&~k*^*Y!NSJrSwVPTN
z+)36?s9BNFT1D9u-v)Pb2kVvW&h1DX4lTIfsyu60yuZVI<i0qp=E_k{Rhsnah{yAX
zP^;+dvd%z6n7Ac3HO&R&Aw}RBOUVu~2MYrjzH&Z300P3toA7Tg-gW5&FTJuC<bo>q
z8lsgS5PLvl_|*2cEUkh!`<}4=?xe+Huv*Yzn-ybSf%bh;I3F^aZ#sGEzPr#${q;$Q
z&($Zob-hf=Erp0r9WF+CVnUA)&N4sQ&)@7x5DyeX(%+Ppo*z9L*Mq~|wnmnw_hk}i
zgP=Y^aw<2=dzWY~!(-m6qgeiRN2E49np*$Di&;V4%O|=^YN45$A!coM@<q^Zb7&sw
zbLqCxi-P1!r@b*3zyVaX(T^W`|MVg+11h?}*E{vvZKn=xHlG=6Hfpt1>pBF#9*X5-
zZj|&r2dgM;?g`W}z!77dBDH=niE&=!cKY^O^l!<$iAm3gLr?K2!>=zRpcg#nid2SC
z8%M>wQTekuQ=UQVf_kXo5K2)kvyR4g=iTA>>to)tJo0nXOl8fu`@Ix-GdwB9veW?$
z`X%z}*#P3YrDMmh!9(mL({KmgQ;KFzqTi<SS%Zl;z+t@L-VnMi0^nGr)2MqNf?k)1
z5MsLE*GuZipey{5F-{{t9cH^5xa>v*^eCcF6CO}~5`Nn~$h)oh{^ft5bq7(Uu*#Op
z<(8oALo@w-j(+T19E0Ft0Gxm|`^z6$*Y9Ee8MRhFn_J#^lzVlcy<IWs-4jiSiZ8tg
zg$>--LL&H`a<nWnQkpF@v!`~WUW$Yg!Mn*6w3H#QaA;r1dqsj>C%ZVJc#$<wOgLip
z3X@}d;AtcIqXSt9^Cu~m8%Cnhb)jAx$21KRG2~$<kS~jzN3KR%(2V+@tEm&slk4t@
z32Wtz3H;~jm-pS2dX%ef<<_13{^wHHIruik0{}3T?8R~^UK>Z{J+4YkQlga)pGb&s
z{f-`yZkg=Aa5X%|HWi3MYtTRHkw?0LOv}prEgr}-62Z5zwen3zUmw$p4e%nraQyJ$
z)M2t7o1~%?{!Z-Qi~Tn#c?GYps!^MBtgpRYgg9<io<LF@Msx+f^hD{gey!^&|Koa!
zBm+hBzoV3_SF4}+KF-@B?rAGIvJ4Lw2mI({gE?p^XevYI1~g-1D%4Iv{&025v-D$M
z*WOtW2n{vk@#qy~z@~^#l$q|IJ@U_&4=<yPqzMe?8Z3~oe-gg+A->@e6a0CtC(O$e
z=EvH#6#`-Om#L`(Lve|JG0<CLf~BL6LNl7byg>*3eZs$>;jKCVYKs`HRwOS;hf~1g
z27aO+!}HQV^ztI_lSqY2KE`aLT#%1e2}k0f$?6?X)RBLg0|Zofr?eo(-~?o~ouq<d
zK56sYtK!iexSxjN;?Fx)?{6HebqbtD#v#5O76NvzrQ__)v8q2k?dfA5KAkkWclb<H
z$doDR^Zgsr5fEV!Pi5Ina{|6z|Hg}VEGb?A0s%ai={iaYLOT|oI=S`KIJVTQq%hv~
z7NHQ8+4PaaEB<?U{$;>$O0-*Jt+B@8yciJhk0k{T`3r5Auxz&<4lx!&URwDG4ZA4(
z;aRRrr-%<99S%=)v;5L3dT6EpOsRK24AD1vX+O9jGGUvK|4!IhKpBkQAHKZBUWbeE
zwbC43gT4%|@t&Xe%_-!H|DD%Y*C5W!lbdAZQ#|-!b9nt5WA2XWRJZB^sU?Z&I8Gx&
zSBKnRfPkjj5&4H%+dF`B#f0do=G)+~axH%5r=T{+p3ALHwOH}f2kFB$G9m7y>DqtK
zx0gs3eAj3)`R?KE359fzBF}vdp8JNq_C;0Dfp4mcM;OCrXNkAh7px$wpFEX~o^F+}
zGhhP@Aw|PtjQs>7PI9zM@UKe<^;#5P>&@KHmL}m}EWIxA_kQ0HGd(ZCCyKn}@(IPy
zeoX}MuZOTzBW|MQ<aO+OFN0dM`nOkG#nlm-;H{~*P04U?C3N$YOWOp_J_3^P<gJgW
z5@2S8ARZhVEUdwu0u9M2+QRaE8pNO;7Qpqm+Bl{WU)U`HPQqpg#RAt;JTzcQQ)EUQ
zXHidy?r?q@b{vd&eU6a_B><QxRWg`1!l~h!*T&5l@{0w|#7|le@6mI9*<*8-*0m$?
zK@aV*AILJNc$ffpp_FGydx6|V6Ydho==BvbchG&im9pJIFj9d^+w1FZUAzH#jHFuz
z57Ii15ugF>KOSKSI5Kj)RvQ~=B9`w=_0N4Mk%bbC6&#VMFOIxhn+NvbjVD>YYK@r=
zy^Gw|lHrk`<U$;`fJ|0T^AlY#|9p51wl<+@+xZekmrsokypIrSQOpb!WnO}ut1Dx~
zV`pLN)%BqIFB#|{S_1wV=~}<0L#T4ZyODIW$~-a2o>SY!H3(V0$$Ib_17_i?$gQvt
z5Yx7&XwJaPG9e2toWek%$m`0o(tsx2lTPH(v`N@>OWS2*_<A7c=SzJMa6XUxyME%_
zfK@IxkRAbh_Z-5l33iT3gdUe#ce~Ap7??Ny1a!I=L^nhrvnPc(tK^@L*kGJR_`uSV
zjcHM&FuJe<FwUWc(d?X3t?T&QV&n1XGiQDRY&W6`7jiG}7?rzg*2)FkalXs`)^tl9
zwH?jeZ9d0&^P38>L*3IUVfvQGPG6>+yFJ6U1un{)NcV%V!5a#A6%l$pLdSZ)xbL_~
zgVtm4LAo#8N;@4bi~Qd(%w4BoP4pF*(=_;eJDOMG$_KB}s1IyPXU}mA!&6ETuCHkA
zELP;Y%p~|p&$61Qo#yGUm5t)b#x+5caNkS!o;=F>5nnKY1ekrA+hT`r4Oo!w8_R_F
zq3uAYMlj%%#yq;iI0HSM=yH(dOz#4>f6vSKCf%R#qeSjC9zU@o-im}kI7M#(aXegT
z0}M~|-o_}M<>)7B3tVon(001TDz#l@2lP=tY3O8D@$rPSUHi$B)W^yw<cdeU%g?pH
zkAUsCwgVS$A#3U(-1&9%`D4C_*&?96JXYvf@DW00@U$N~Ur!c(IXPp9we_g7frXfw
zkZ@J^xHbGOH<cpA)O<l3Pn*zA7O*lg2$2!(*=MCw`XoUMbY<w5p1)r+MpE0BUhP$}
z94yY>+Mdge+NMF-C*uZ7?2r+8OzP~ppLgS7{<MF`-ynn-9byKCF!L{zj1Ug+i1#N#
zk2dlQvCHbg3&^jHfE_ZT*#-Mvd+~Z_U0oG5i;<=IdcXD}dT*^S3Oa>yYvC7%2j9>N
zmfPA?AMCq!0X&QT!}=tB%rLqo=PbVVD0%o%vU~Kc;_Vt=h&5wbf1Ufb3|B^s;qMRc
zS1|6^eDY9-(;<pD7kb1X|BKeCh<8JAke(<}N!BL(rVIyA<N|qbj3+EFn^nluB2eh5
za3DhJvLNp;{UFD4TgRp-<R_$ffmas&(KL9qE8alvH{zE@g{L2pG!$C)VEK-|+G2ug
zV}=&6I%rJQV)<M%(2+EA@8+!zfcLmnu~hB)cicV+Xyi}({hvWo0zY=2%GmXA(8LHm
z2<TiJIEW4-#XoEJwjq2RS63&I_MSXjh$vMd&UUe4NoaNXD|r+^O7`yf9X!t17F4-i
zl`gA_g9S==h+mS^*&ZXK3BOy~XW%*PFyDVW;wZ-*EWB5fb@kR1N@p1Oz$r+LgBIL&
zLkXybJt}{KlQHH$c7;KwRzLQ=qE-HM!2fyH|1-fvZeOJHV2#l$>cL?5tWBApr23Sp
zWsag@4-l70t>vuQH4@q!JGu%aOcei8<z0vLA3{+CM~{Ug<)R42hIFx$x*l7TrtR^f
zsKFpe^c@xkxV!hI2F<^`IBDv4#W8(mYi7snr%6}~sxDdTLv*8hq4fzEMxLpNg1;QW
zf2Q6Vfu_A~=wf8TQ(#C{wZr#e5WPFWCO=Nq=`)}El(`8xC3dsjlJ%Bp^F=hIqI`z2
z>?K2;Y&E+pQ;>CY0f87`@`!(-@PiAq7&gKM$Xi~&W!--Z4Zls%fPR9GSI~7mtHrBP
zV75QcxJBP}A*3rH$BCWuf=1UJ$8G(U;tZ9vNbn@|IpXaKKE(60f~r|E%a3n!{nttl
zofkrhpU|{M;32&t!r&=g<3X(05@KuG&3PKG*MyE#bfIh+3v7aazMV4-7F<uCp@+Bn
zSi%xReuc056s^F$X-$~TOr)}Dzc1UQ+!As32a@bU7xrn-W?;IMLjF7utWQ@;k{zvf
zM?0#}OBx4dHYCVJ@t}s?fqyUs+Yg?jz<+o|fZe#^Y=7~8Nqt4=6Bc)8xu5Fri4_1f
zULc0YeJcxd-4K02LhUHQlH4b<wmW*<;Y4j$_k75{OB6x{;gYXoxdYCq*TB=g0GI<*
zh&1J_CeTO`_ns!AfaGd{+L>J%E1}T$`yURTRcwb!AkcvSYDC0cv&(9!TVT~KV6!n!
zcaXqP2J1<7X{72LIuUk?@g6-+)_h4}$WUCX&}NWl`XaF3F&4w+&x~j;J~SapA5poa
zfqC#Rp8&ZLJw))61+^C`nB>NwsPIHza~Fu5nAs5^T*|c!eW|49-c(V%lB(UAAKt5M
z-br*DYRfjh%mY#42w?`oxotB~5q%};M?8w*Bul}t9PjXB`Fp=!B*q}b;M7b~Fwb#1
zw@+F?0QuVrTXfz)w~HQ+fao<4?J^aC#BHW>a_Sojk47q8T9s|gVxEP%%x0uX)5f!m
zr;Eue55$W;Z-zFfBV07ZJ%#u6q8C8l$_4C76K(}+i?@NMezkfFA0k4)U*9O_Ovw29
z>i;@k2FL`N&jbxqJoKu4a6#`ekdwQq?{=YM_nmV$WgphoUla>P;F8Na$_b(}0yb{6
zkr`)}A8B$>ixF7%0){Ub*OjI)*5~xo<N_Kx%7+8>!iRcpdLxgtIaY8`07wB~HmjfR
zZy`kySj@)2RaNx|g(SayPG|;NSF^6N1>iEL!1ujb(Cz5}rlA>{IUh{nBW^s`R0wU2
zB!bldI5Nq?+WJ0%`Tym+>ES3IH|0C;q9tw;|0n)OqZhGaqtIhS7W1UDf#TCqA(-a7
z+geF=1>YYxpGktXZ4Iy8Fja}}B_M?QTx$ySL$!;5bu3^ff;G1XQEo)6pzBSuPj8a1
z5n^?_k|fE^GP0$)*l|1#Gl~Te0#y@t-fs^9ZG~nzo58gNudJ<XAj;oR1mz+*19@$#
zq`wcIKJxYZKSAy=(w#`wfkj(B9abQdTl>YAv;`Dt5l%jK)my1ctAFs)PfVpCQ~ywS
z$xs0PzSa|*&=UswM<)R@p~7D5c^68oBPFZ#g+WHMw{Mb-2%(CQ3ASR^xM)CdzDlV*
zMdm_5kGf6-$SeA?^6D;|zGetzs#q%J?{^*#L}SlcoCL*11zYb|^9Vk>NmjMhEf8KC
z3m%JMkg3fX`Fi$aLnNKV^TW6-LP@Zv+qYEw9~Wee#Km)fi)s<}5;K<_#=8kYBye_Z
z4V98{iJKs<Tb6;*2cQJ<2Kgr^9U2i%X6xnCwRma4y`7CV97jNkl;_R@uNiX;!fj-j
zCQW|q)wW`vu2Ueli{!P_zq17tt2M3b{O!B8L-(!^s%L9QI4%rC*cAV)n+GLd9a&|F
zTl`r^U04dul63BrE(*jLGC{mzR~g!@D?^1bO<S(OwB_1{O2Cy+9b1$c?NF-84Ado~
zHT@FH);*Y#r6p8SWtq1wMkWC@+RzMz7`KgKr&RE5@~@=VJ;)iQg(&z#*~Z-KWwwUi
zU;2r_U$B@+z%IOupLAAnkB~f4<Tp&r4O{7p6od&xbwO9Gz!BwE+7wVOA9^9!HDfX2
zdEh?HfcUJX*KVl0-7iwjuM-L}N6N}W+NX^0E^1v34c{1QpLdnsXquY_E|Lcv3r3!i
zq~wLo>U5hE;We!M3V@4Cdj~$mq;DEyIf}ZKB@F#f^A5FJmraHeKW7{8SRVT<(e(0)
z^O!f{E>7?hjV`GA&H#LP!ct^jmRxKz7zs?i;8H5M36I+_U)cy{hJZ8o9p&%`YWl5E
zVtB|aXkWJcg|r2zrAy8nB=tiuJ@Q3S+J*nYw(Khj+#2Ou=_RT3VL38(75m$BEg-z5
zoXcQpFM?E)cdZO4({NztZm--%iW3}Fcx41;9AB&ea|>t9DfO}c^6uIA;AgUI7Pfkw
zgFjoH2#EuWqvz}=))ic-(qbT%TR!N}^c3MtS)rnW>H3~?n*euPhRV<}VU$)|Mn=(Q
z^BeCox7#}JMgyj+6+p7vtPtm$m#3pmQaA~Pv?458&k+#K%E2nDYPmR`?eDec%|tGr
z`6bqgf`6<4DpOHFAs;83LMUmw%0Dk%pky0E9sj9`pLsr>UMAo?yW5U>?QcjathXaW
zWtG!!N_PdOi3tGX8P+j)!4YAL&vDrgf~U2HbeOu_r{$_2&F>e_=Dl8xUNk9Ks@|DW
z^*_mXA;M}WkY|8t*?~$H1^d66!Eem5#T9Q&MF9=T-<Cn7Do=`WePBEa3W#Xq1}Se^
zJ8B;5@N>@o6}TnuA9S8Ws5S-Rz3weZ8=ze+x6J@|@IV|YU447ivMY)O3Jo5uwiuns
zaO0rY18ScamZ~=N)4seLbl4LHr<)L*Fc#4a%{yr=>3rKM&39mz1*N4ckH13gKK<KS
z!9PF$I|1iEEh#;7^C;GEJ%5iDiPXqaZ-}LnxFA2avsB}x|Ee3UTwz%j`yCe%um7d{
z{i<2?)0gL*0rr)|n7t6*U4FhQYYT;c9;{>O!TydPzj<H@-{eP?X7EB`y#5G@I#T7@
z=j_mfs+?MbVvh}9IJT(<B#Aw!dy*$9f#>=TiR+0QDllz#rhS!QdF)T7K=wh9)YwZg
zUvhzxvOO@;M?kyop`CU5wP2RlOwI{Wv`Z@zN`**4EiEz7VAYlSzPB%w4Xb9ZPPIgt
z*_KV7NATBqnkej6dgR+kTT=&>b4ARXEA8UsR6`4}WhV*+80q>m_6-1E$BsF|{+w&q
zh3!=8C+QA0C6mWzT284wSGv>-RdSR!Ys&&GA=_SEv|LxV4(~Lbo%0k-DcVP9B-Q^m
zz>7By^G4LH-AT8ZU#Vp=Q*vANW<8_|=}wz!^;nK??~Oayoi_4&gI|MS_!DVMF1K?(
zg4Nd*sdyxfJ?u_3spKjtoIUAGw>X=wg;~wSG}1r?8*x6fCpU+~pt!lVi#m%;51a-o
zbP#b>kW=;H{hm)Xd+R!o4bx1!{04J*T&P+V+rthDHThc;fQFv2xaXG7Vq4EHVS2+t
zvA3QY<l67`)$C6S<gQjHUz~nQIn;qpLlBf;a5rh-c1h%vWp}#LyVyM;5LB4<={W_W
zUs2&Qjei7Hx6$O5O<hh}RyZFS%GA{qJ5ndpR}|+0OEvB&=!pnVYyz16RW1U+$wU18
z@wA>A)%ix$WpCSA>C=BG`JRiZ$?n?Da|c1tXMD27B1lsTO19`n!M4(*MIbL3I7yHp
z2zzr<LE6D!oN1{9^$)iV4X=#uc<lu218jd1^%J{*ZJxY!4Po1^JLXtt)hyg+*``o-
zG>-~}q>SYE4u|U*XyNH-^tESc+(}dzdpfZfKef=te$BS!xSME@vhc|?<9>bIpcB-5
znt9yf5_eS@WopJs5bu$mTmxY<=29a$^4j%5JK-e1>nU7n%MrH>#h5Gd6;2Yaw|z<<
z{lGx;UhhBfBmN7ahnkziL!sUDTLQxf@$>B3+Yqm|-L2IYSiJM9HXTjM;6_XCiZ9*`
zw~wWsokR%+YXfYYecYSLE2~!jH1lp|=hjKaXo|Vd+;LFW(aLhi)<7Zs+kn|n40CB=
z^PQ$mN1fMU7sqKmb@!Nc&4kgaJ(~=nn%9sd0eL2>X9>b8<66ci<L4ppp7?e{q;}*=
z(9wUi0E{;umP@4_k$KkRsK4iPeKP2w3=^(y3|v?9NfPGTrvTSB9Y>38bf&ZEDl$~l
zSt9y*4Oy;ddii79C@o3WXH1>vvP&F!RGtOGnPd%47u0-}>a&3QYwc8oQl_8ELY=Ls
z;b~%h5?|HsTx+PlW%tX;I8*^W;^LXp?l;<LiW!`O{i&padLF;dS7{Z^=Qo|77pF7j
zvg&0sqh`0^v7e}K|3NmOc$K}S?JtZX88R=sS8_)sPyX1ci;pDUTvW0ysLi#58fj~k
z-0>?8iu!8Tm*f#$j#e+5EWLC{KAFIBZM1UPh!AA6<3#U3BQS2AWGU-0de^}1kMF8F
zO?E!vdZ+H<C};Vf)c*ir5qA&<xH(@+sh<AWfz001lgaF3N#q-AV&Sp5%*nTn>rUOZ
z87NHT><pQ60htIt=X{4faa}N-T{$2UydR?5_&lwkw1<zz1AGlA*ZI6awH?XtWSiw)
zH~bSw8AH5mTdvx*hg2Jz<Fg7Oq4?j}lD*CGU{i?Q0gyha_=pU6<a`vV<v<X{e+gG*
zqV(S?gAq(=H*##pKt6SAy5&7cl$y$ZseTe+a2{UY-&x{Mh-3)U?L0X#GwLDN^mGA+
zxY|HvG7ixX{M3-~!_~=LjS|cDJlK~w$g19+m5Txmfuw27v)(_$3+YKg_AFWbqKSaR
z`)DvT38<Y-2G|a;?@|DPX=go~x*kC-DBkxZDf94Zs6p08T(0Z$l|rg9^{%vw$FzC3
zK?Y0aPO#-$R=68#FzT3A-mzL8laiNtRv9Jti?-}I``uLf(EW~QDYsdgll<hWFl&t{
z>yKD&ZGYC;d{fcsbbm$6t>1l=_!aegoug<_T5;4<6`n7xZYvZRfdv>>P!qZ`0L;V+
z3x6OrFUPc@A`1Pj`ZF?*ZZki65Nn4)D<_)Yxe@wHwr|&-2l+jNaBW{SizT8G0Lm2X
zWZ=ShJo4-}z{+&*VKx?Rpxl1tSxL8@-ElWnRD%bH7TFgqvltWAyqh`208{SCbPu-8
z{+<=i@gPw{_dQFWyG`2S5t@RI<$H$x`RMs+dwB!SD3;Bn=?Tp3ZjV2AN!5D<eqavD
zl;v215JF8AiC^4BD0Mv&ij|C(!VHYe$H_phMyOPeX52|^55&$ne-)UF^{vW!S0e2D
zT$Kt5-pr+L@5BVlwe(Cv)K*@imp>oh5fW%WzDi=S=^KN@B(qCQb_HRVReYn^i~Fx}
znkTpyW{Zh=m^@neLsa!A(T@WXoF;xcHM@=#CBsM`B@Z+e&zP}>rvLRZG(yVNfb8An
zYu{G_U$ep+p~E0DWnnBeDG_T*btzG!Z((P>QkUJ&Lqd85YP~K|$RVr}R?Za*jjXv`
zSX#<%AL$#JzduM%Bm2-?Dl8W%ZS%v@V83}^B#Sx~wXgxFoYxZt3)^{Q1YiyXPo`_f
zEvgL9x}|V@)lryk);#2-{7IeTuHanWZbX<W8|*4RqpG@>X0m8SHIrwec4@16XOWdG
zHjjPYD>&9WMCJ9sluF9YDz@_}7CEYy^f*UxTd7}YF;Y*fpsWyO(u?1f=iJ-kt}4WF
z_)&gOp)KiYdhV#^BIwt{ecOH8fpLEImml3!<peG^jioL4(%rh5(L!fPWWUWx*&)xU
z3t8ju_sMapr2W7AlSe4iEY5<uXq;_BB`d?;--jDRZ44wHbFqt-0q2_B*<ovz`)5=G
z>ndIrIeC%v3_baJT2AfNP_D%#cC|-;NaJ;#fBB*k;^IweC|@e~UrTS!5WGp8st@{T
z0bxO}juqLF)F@6h0+PYP;xg={a}*1yTqGB>>V>Bo%N?CYB*H)C66lDHeR@6Cby&6C
z&9@3P^j%VB%7}a`cO6xMp;aZ*TD{l}W5j<*ZgS4;e6N_-%(Jf%olbcYA$@o0(^s-o
zm({7(+p1d+WgILl?$$7rPl4DYRNFEm@w~NSq}P>)ChYT_G(Z1WZI~|UjUm~bIl2QO
zVJL&;ijzvMxSBA2=rMpfM?yLb${N&u4lGYtiHwBj)aa!-<8fs#W6_&c{Zt$G=CW%%
z_vQ>*?{btp$k4vBu1jU4zIL-E80&5|qFrolfzOhs%Badd`-7an^<GH@puHAcTHNB`
zBoLcX+wF}YtayazRtf+z?g+`8vXZ;SnWb6?8w2mJ8d(AIiPwf{!ZNfxoz<@0Z&zhZ
zq88SkBtnS6E;z@y_9#<)Vtfe8IuUy|)y53hG$KssSWz*IB;*5xk%oSLwi~}J!u!Kl
z?DBv{k8<I+@W?%$=OuPk3!Q-XQiYY*=+rLcM;W|Uaiy*jO)P48MhkoP45FhfD*|ay
zom-J5%tHz755rTB_=hU4a^$7#O`ONC`w!>tDN4wg!HzSuAz55WAd93$1{Z&HV*C5h
zL=ut>O44MDQJ@db(|l;WoSFFEpo``d&As-F`s<uas5w(lkip#6o@%ir`Nf2vv~f(K
zZ!rXx%{^Gh+=Xw6jO1~VWlv+7{ybK2(b09WENxZfrTQfTL85}u+27NuCRw;cR_(t;
z)x$&)L1H(@;b(pCS=MK^>VYs_s%*wX);K9x(?M!UlDMh%MgDhmPgnSe^Z4ncY2lG|
zfb<$BuW$Hz3^n>L{%I3fm1Vr4z9ct&%e<X_w5xr!j!IoKVML+h(t(a$l?lj89s%!5
zJ-xFI(oNp-RG--E5`q?0yRM++ksJ<O2si<t?<`W(dyi-=B38eopJl4eXY7gj2$@YU
zlj<*<B#A)#FQ?1T%ycCA6|~wGMG>Mf*F`3ulX4TkrM#o$n;<5)$COMd2_(;WGmvB-
zyFgQ-;7fQ#>BrMSo013shkRxwfVKxuQS-2=YQUDy_jVWS$ylgnn?20WO`VR_DlomI
zCN^N-#4ZOJTx9tMX`wmUv#%MvRX&leFjH-(cgu5*6F>DK6dSm0{+fgMl$xNlT?{d|
zyF&WMY(k!qVjv(R0DDdBB5bx9Ex5u|VN&fznO_uY-#c}<o^pkBtm(9}22~e~_!*Q+
z_EgOK8E85E-aT|~|4>riT%`R3bUV*~-Wb0A^7L5xc2KPGj4tI!2kDc=Y7ZH{({w^Z
z$Tl^L*r~{vs918F%0G3e9~$?J0o`P7hX2DPM2~;lax3`Fs5o{6bDPt=tf#QDG{tfh
zgUUNAmmpjv;0hE~O7(ye!><1t^ihP$98w^+M9RuMi!?=K#JfTwCK)-6a>>$omQTNJ
zkovDSNxHox#|S!snsXsLie%V%K<4_^S|w<$F8WGW%6Y#}x8t0-`U5tVhis_(jZ!OV
zZ#R03r*3NP*!=;vKN2aSOb7f^`0{tmV5q6=d0EL10+pX9M_*X1pVjk7IBzFB(pkb%
zu?t)ukGs~dWQeL+xP!H^hjyNnmz&;Px+_MkQJ6dcggBkEG*&bDQ(y^Bw0Hhz!Pi^&
z=DyO|*I#uRWcfJd{>1e7Ylow5uvv0)!7NPQk+(g3H*}PR>!I(egRb^2;jXp+=ttWB
z9eWIAdYV`89@~$(Y?H1c-BK1VtBmaUTW?2K!wUsG+XHl=tdPGd`|u!pvUROe(eQGW
zKo1DGu?C5VG|d85Ve8W$!E{TNPt<X#{(V@t1sf%OvMl^&BhEl3SPz3sanXC>;++>g
zu5LAI;IYL}9Okel7$5)JanQyLzpJ6vuc2iU@=a~Nv!EZTow%G)X;bvv-!e1H5D7FG
zt-vJYYqOsH$lnH11FFZKh=YT^=58$G(z(o3%R1PAw+bdVb$S%%S#C)=TZx7!;HMN5
zdeao>%wA`R^=KE?*tV}VRh8hfj}G6;+zvi9@6YD(qi01LJ2aksf85qg!>S_=Ql<eY
zhw+EL?&>rGfvAnXfi{P9PpD5Mp)5$+Po45lx1LLlvb}8iM*D-#U`4hflY~_UtMjDZ
zSy?-J8&KM|E;^1$%j7dtP<A8Lv;w#Ym7bB@P6awMC2My8b_@Tm*d2r{C+0Znd!)}p
zk5z0@mRj|I4d!9$t?)sI7(KT?-(7~uFW#zsbMf}I={714KA<CFp0(LQ@HP^+x7A&v
zkRWF;4>|KGvk52ahu!!qvG`{V5fJ4wC@K)p)tF^r8l>C0z8sVI$K^*$c|Z>LhEQy6
z0DnrdsbKE=XEC{reXReLwXG3E8ivo0Wl`hQ{8XEv&c1<&^Sxh-EkHKr5NftjZ0TYE
zvdmP28V}TuUUZLGPx0L3WyS9q2Rk(fP1K7a8u}J^YYq#!Hg&4`PHyb7KT8mjI?J1z
zfVh>biLF4k^=%Pl-({;TrA~|VYvuYkdJ4>ZF)T{#(s8^mvYF0L-N4hRaw?z4pp@HQ
z1*4`e-hDDw8~I}GOSE4_;FaZ$(P7L_#lYELN;iI<hs^$!2B5{tSbQh}!OmCJm^)YF
zHW^29OcM><rdwm8(D5KSY?)y{LQG${xDJ$SO#7Q&#@^yZVd;js)FN<r^7UeNCz+k&
zqlosB=wDZ>XAu)K?XGvVY=YcC|JKBK$*Bl2IqbEhRO*|<4w={hY=*PlDOGRDrjHE`
z9Kmx}f(_IzUYhCm_#AC?Y`qVm5ehn`Stl2I3(ORfqTFiUewdQtb@Tlqqa^l_<quXl
zcf?d9`)SS8#_}{$(7K!(dBvkcoQ$1)(#d^hh-wi8%b(;7+1Rb7rtM*C{k>n8D)fLg
zn}S?{!})|eC)53;+K^#9_tKwviVvZh6(F=+S?>xg7<swE<6cuHdWZd~BT`%AMe83W
z&YDOZukWGeAesB-_>1Vo#tXC&D?*0+m;R?GC0wIi?r>u&Ot|n0&n6)n0!(4eOq2bH
zi>mJ_wZipcY0t5&>1&v+rC8U++nBlup~21q3bhiY2k-P8e}3OGL#kK~KQrvqetQ7i
z5nD=|=D^@``|~Ce!-^|8-F_bLSiLIvoJ9MYe%ru#$RwkO7XgA*Hu5DLr!y26bk+D&
zYvTa|4+jkg6<;pAY_GehV%sF1t>Em!w{r?y*ln8<&;<)~)~dfA_gheD>s#}Q*WV;1
z$(`8|p{lEDBEXV_VCO$}#=1;hn%;7XUI6y>X>8G)t5_S)FuKExXq$EfX!e-tNDRm2
zDPNhTe$@_r&tPAh@}M;%+!;xlGgkFX^MAftOV%>JIeV!2?t!Y4iaR<eoRFYKmG^3#
zGh;8AcDhrQ)C!#JfVxU+M;vM&oW<*kCv{GE8ytFjcC^MbS)6$yDav}Mg}CJ0Fa>k=
zk_~Y1l?6Ga23@E9LYcb;N!Cw4Yp>mhEoYJXRhUV4EnbT9lO=t*K@L6#!~L9(k~nd@
zYs<Cl)Vn~C@rk|%DeeWqH|s@VnIr>Jygzw|o`|=G2?McHng4m9mh=92_W22?VdOBH
z)}xFidq73Ew#-5$%gxSw&GdCIL7%|3uct#*#^XXC(~)&BHGVs}EpFm)pDAhbk9gBp
zqvDqZ6)`~DqRYGATVfvm;t`B@p3nQSLS_}#R2+79bus=(%t(DllnN2XUw<Uedh7yE
zHvW0D<}rCj0RM+s%71=Pok=h8QYbs&!P&};Px9}0%3G!D40~n60bjoKxS8?R3wtw@
zPd+bopX!t|q*DGiksHqh$Ge&yL#7?SRIL<i!4jTemMq%lIB)iwFK4vL@|{9rJsHMh
zHv5L&?1~zOS+z}k?M=Q)n!H3C*Fx%Keq|fMaBkCsw%spWYW$rA#YiCpr4A%k%;Xf;
zB2<;{q>=jatClqmQu1`H3e}{t$O7r{wrdGY!3^r7zXg)4p&S^V5K<WEb1Q#EU6Hf<
zo&NhZp<tn<%GEcqxtOS&K}5goFxT7Saz=wHfHHQK$9LGl<7pw_n{Y6Ms*G*fJRxqO
zsu!6@dP!Kb6s^(ICfb62>?t_c11Q$XKYqXh`jlp-9b~jZ5Swx_4tgs_5ONhyYDls2
z+Ad!$L_&s9Et6R{54HzPU6JoGEtFMhKV5A5g|hJkNp>kOtMncc%~1`W!~HYIehebA
z0@X<FoYYzNaw!r_ffzY8vR7uh#s3yBtH*ff>CW>y)@SZLuBu9))?y!|N~r>(T%yU2
z%E}|igUSnE_sZ6->esD68xXR~*}clR7C1F%OHwiu+5Ie$yZt$#>58hegqm%4E(`nC
zPu8qdt4961dw^RU0lHh2Cv;y#SRq}>@^^hlkJJFT-p{Sxwj3F>UXK#>kcr#_PQ}Hp
z1*aXFFsl6I`_!x4AFLv!jQR?gzk;2A+)gL`+aj-EgA^OJX(m-%`9~Lzl?KL`#zv($
ztFj-;O!qnSaAnepUG_S>BhJRmU#$it-I<yTCf>{Nd8UTt;hPmWD;`+H@H_G3R<8Ij
zaX8jWat`589}Kz*7}k=eE3op(W_}}$97=V`jS|?Y-^KBk-QTwb5L7%0TOU(f=Kolc
zO<e4DIGqKIkhejfy#k33^TV^<s>3=V?}siNQ1)KBYbx71g;l4-XZ>7?c&nfOQK~=H
zO1#N4!_oyd1w@nl5tVBbT@!oU9cHsOq_R^c*O!2;n&VSU|B>@H39ETIpd=%|)JJo&
zaH-~Cgo}&oo?5+(4T%*Mb}YG#9N%0PojP5>7o-=G6)iO0ALgY%<BvzE`)?o@WAsqL
zG{5p1)$ZVX#=`UsO#r3#;xX-_KQxoVRWrwQdL%3D(q`mHhi9Be$^Ou))Hux5ebXCY
zmOUw-oi<<fxvQlQIm-^sf)O{K?as&xom3sWucqVAUG*T*Bt+F;m0UbA%PI@a${n7}
z%cY8wuxoc7gaa<9W|oHK?MQ#}=CLef?H6)|*ya>{izv<$H-^I!5>N-bt5M}$at3p~
z*#Syh9(sK~v>Yz1Jj}bvPu`BoeacAyUV}BQJ1a-LD&h`kC*RkN4NJ*<WA-xT6{K&c
zHXBJDXE`xt<48>BF2~?cC6?GDA@WOkNxN^6X|qfvYHKY0f4qvwd!qdF%N>5du<z$>
zU<+dLPuN6f6mh<4BH2uDeHE+B*B77<bb5>HdSW$_luZ-o#iydX&q`vavTwU&-Wl9U
z_jwhXsNp64`OKAqkExG>e1see;;4wL($hgal;2OKMLE{Ks>=I<($QMmr+Ft!R?Shh
z=%!_X0aM63tV1S_xEzF&#gD=wNqtp!2VfVE6`oB+tu%mJu%!Fxnqa2?@VWpn5DgoV
z?YGsnEmA5MIbjoXW!OW~z|UlNhy}$}A4<0Q%UEDeD*r0|Bu7<W6kI|@+Dkd7(`G@{
zN{abY)W1KH%Szu3r1_}cyd`b~DqZH+%{bOpGWEY*L6nD`9v(3nVikloQWjb}W$!7S
z>JTfTgwn8z?5o`CiXw0f$Rr2(PF?oUb!ORwBhSTH%g=NQlh~U}T@7hOa6=T?A2><_
z#lSuMtZ>0NGZ*k(2XpqUP5ITnsFR|+r!mHoqVy%iA&aNVGgy9z@T%HgWVRM25wZa!
z(@IT+aTQ?*+Oxejyzg*yPGf*)#(CDju7&B=Z?iLwf;PiMJM0E2jnbf3OSr4yK|sBh
zTV~sXRcob4`pDtv)P0nWT{2bV-7r2i;ERf=+-l<77rj2MrT}J2Rrz|!ob?sE#Xm*X
zLqOiYy-VYpMQOgu(^4_=@ej}N$JwjFRh63kR{jgp7fxVlBumpARM!8kIReS|jkiVQ
zMr+Ekd-Go_PPwAcnPE$m>nBu-9z*YsI~e`M<-ph);)SH?*W2Y)_ditae&n}Ri|QbK
zHgcf>&#m?q5j_U#Yp-U`?n1pQ;w!mu<(RzzRwD7jm7?T`(bcy7y`88t*Ci&-3~)Z4
zg{{i4u@^O%v0W6~MXYApmoNGC<!Mqu!J7STt61+UBGvxZd1;YFd;F-|YT!9@5|tFG
z!C9C565InN-+N!?mlmhhqdL(RzCx99GDg`TDsV}o!durbmFZtSC%DwJqG5}^Qn@qZ
zf@N7C;udrYiOAWASNu@T=rHVOfjO38_HHC~>8Wq3v<1m}x|m8{Y8$C)iYaKbxJlHh
zupd1dnbYo#UKp9KDIYO0<P~)CfTEon>q@%)do1QB=OZjRaUbO)8R@2hwvgn~yJSe-
z6b?cZBQdut-q#3UxUryL_w(u+1|`n&vaDv5+@`EnGB<+^`I#8A(#R>LhYT}y_jwsW
z&vJsTB7^!um9ODwxWz8`Ufy&}X8qEXf{B?c9uA!@6t_#FW~gD|N#MU#S7z8zUW@2C
z?Lra&#f_G|=5!R#T&cnw8PiLY=1nq-bun#yqUxTexwADL7vn;<+Wvh|L-DQUuP?r>
z_Lk%V%OZK={Ja?&2JS@#?g^>X<zlNSr^TUY<(9jq%}*N7OP=0hdYGI-NP0zv{N()P
zi~}w9_xQ^<4jdjK@9@(kHMrB4mx>=G^vAOic(i`}{^PF{`5#Xsa+4^3T{KImvd?dG
zBvrZ*8>(VUs{LhnFqiNO#!!Znxg6Pj+j{q$ngSU~Dbu|K5#PwrJf!-x5^I5@Q4NG_
zHjGSwY9aHj7a_q&*lA?{Uee_JUL<ywNq1a*p%=|7D5<PjKUg?i|L*!~xh5zs>|?iZ
zny?I?b~$2bvCE2iW)2wn)JLk}>z|L)g+mXi3F~cS7bBviX<kGP_RBah$0uGmzLq7^
z^ZhGV1gprT4L05#j<tCCv&te$DM<pQ*1dr4Iwnd$r`9V360g;ubCcx<AccsYP9I5~
zuELzh@z^9y&x>-(&JHMYUsnrJ?z;ORCSb2K)5m_DG+*=~l-q6f*VwCMU3#_59@szQ
zEe`vzIyV8J(kV22)i%>?0Mnb9(z_cOyL7M`cfi#W$&Id_-?)~tFCa_hP&MHK;*fiT
z<t_WMEVv2dA!aLK4Wr~FB-MydM3VDHnQ5&vV_*9=%?3D&?C2iY{a}*KhcXF&)8&kA
z6B;=+XH~MUT;DQ7<x9FD5^9&Ut5C_`s%_WRe0uf4z_#B6=}BKwX1|>hr91p;4-^sd
zZ@=o4N&cc5el=?oVc+-xJ7lN$fL^e<y}W@hXhfppDJ8v9gxc^et5*E-m)s0|rtKU2
z&G)n~fkeF3{c*PC*f52!4atSADu!Q+xqde;`&F0hMLT@Rq!!6G8A*H)py;QTp(RM4
zx6}2N_)3#4i5D)i<9VnNI;E`rbo@x|$*x^mf#yhVR>mg`N2^fd8&m#b)-2P5pn$pM
zhc4cNt$J^4d)G_@q)F!ZP|LMPN!5KxhF$!5C#Gfm8N*cm7Y`Xx)(;%dTfZZocxrWs
z<5Zc@m@u3^dz7~aeMU_|or<O=)r^INz+8%MCn6}88RrtAFA)v;?IoEP0Kc$yljJqV
zIF|r@3YA-;A&-%|>*(z*wZV>;7@;J~+apggVL2;~sfvNI&&peLaFQ|_%)1n+%$i9g
z9fkv?PsgLf#wz7Le^8D+n(4m@++tL@^4fX90zBykQ+o1TgU42mMy|dD)r%n>kZqcw
zxZr7?l?UP9sJB_E?dgeL)}w=}SG5w=Q^B1A9mOJ3LAzd$5g;+@E){1;2Y$IXVhVMl
z4Beij`J0@+rSlh#7sVK~Gc>+;!CEFQowmzVt(9z>T`Hf;sbtu%{Go0x)i$B}px#!y
zxb3v|mY{J@QPT9bVN{OUo?iT{Vx$6lL)b^t1Kqp>VdnA2G)Nhu7gR)#ZR${?{wImt
zPh^r$Q(wN!;yA?RlvV3J(%J9OCD#JW?N=o#(EzX3sQxQf*>>WKGjrin*y=fLgKd8;
z?N~QVqIRnDw72bgrOd0JKQzq6s5~i|lrz*TG8}TRh$%VW=e=BhRV_WWKZ@HiE<{rz
zi$+83Pe+S+9{n3ab<EMPc2}O(jL%(t9r{Pv<eK2J;`apm9Kjaf-u~V}?Z|U2xxM)u
zc=gvonSP9z&7`H{+`}`t$ad6R>HtTdZnLuJEj6^Vv*eB?4K{wF+Wq%pu5ybbQ_EhY
z?Mt-=sl82hxzq*Kh==4x4uKih-U|wEyQ+wCMAjVnMVoW#Kk57{6_E{+_2ZA9t}7TE
zy0;>#dhJ?C0_rNRNgEQ*^nCd;IZg}Q1MZqsFJHYsH)othn?6+#5Y+g$DSagvF}Bk6
zXiIp`vQoM;z}bpp;HzN~S+`%*Ce#;Y6*N|T5XK}Yb06p1VkFnrQ5jM|bw_D4@#9np
zO{jq|$vo4M#3yaqR@os1uL(u5V~adh-4=dcB<yVyi_;nr^?Q*%hS+N+yftRMk!ehv
zf=CYRsNaa^S;|+$NK_)*TUXALv~Q_t2It=oW6{rGsVTvgclS$MXdAUHr999RCQ+oq
z^y<&MHd4RJ599BuRwBftMC)*GTpd%9vpK98zJT=V+=xH>873BrLb4p%#Mujy+U_Ti
z(*`7DuDC=Il3?dmUZ?usB#EXZNn`3yjxuM*rHH(w_UP0(_g##AeMMm%l}?ju%d$50
zW{2nX=vm`M5A2sD5+Tuq#iWaE|7Zb%q%Zp7RLGB!-5c`~ARks`eUZq^>VTFExEX&$
zKEYCQrDtt0&sZ`|Ipm8mX70Wn&be=Oo!_$kXp;GL+0kCn&$J$)$#yxY>wl*Wd-c(1
z+OaAs{lL5@hcgL6{l`#14{HnJn@y)_FV%1D>J$TbWjOLUBfkN5fnS$P)W6)-WHjOk
zxjG-`1uUlNog{6<YLUJE%lxDFW;Llq@y)sIO@^4zB$3zDo$Y8YwTt;t*z~4;8?1if
ztvCAmt5zhBaHzzo29GkuwkU_3?~yDG^}{xzJ-+^HNx4>K=@wn7R70vg5tOy|ajvv*
zd_(JjE6CsWrrh*6Cw|KK;!qJj48EbMous~)x^wtlwX4_E%t{PkCxns4t`gNVyCc~4
z2AuC#+u`y_o5X2nZ4bEx0HM#Onr#h(l2(=O$;&i>`OB0WMq#Oc{t{4Q<AbF8i{rb3
zn9)iLTl~BZK@3w1CuPdAJ{&kAH&|@58v9_*q$MR(o2BvU*$Q@oxNq{OkBT4hegoS>
z8a9b*aX&cY{nE-&d8`>TVUAUzpQ%6miPA`v4Ef!oJ%T-(%)(&U=gaYot=yu^{=s*`
zGfF#sfdM0-Co0~29IUbBTDv7a-XRyu!i2l<|JpnAXsWyK@gts)DXvNpD&ZC(nF<#r
z^L))irb0sI5+M|22ubFdD-FslLkh){p~y@`h7?bf%)fmu^;A#KcYW4xt?ydjKR*4@
zy14K2K4+hO_Sxs0y<g`%Uf`vXwBn|@lDDr*f!&v`R&vxa-bU3Wv}-E$s7~X4$EmrN
z{dDw_8N525B*HA)jMxLbCeGAtJAjUD5hb8y>dBX7g6s-0;M30)_2)smHl9(I8^yK;
zd1!lWGMJY7s|_JD#k!Q5t&J|Gq!AmsO|XV$n*hhD4rf`~^IJ}Cv@O`9(}t64DV+q2
z#>vUfd{r}DhT5rGx#iNN=F$V*#67`iZMEm)B9qE0Uz{~Z$WxXsJi4UyIRk>B1xCfw
zOk@;#-izP$XUk2p+^B)+)$+ZqF=J%f)Nd+-IecVzt8lAwEQHhVKmTH1bV2PFqiy(x
z{P?*!*{TM8YUlB;6I}7R{g5EbJIY|LE8jBzXS`^$xTvGGbCs_;;lTD>vPyrm{w$!J
z()-}M`bhM-DstOJNt5jqU06Z4IRLDdY-4I4inAe&(WO$Y4O(T^C|d;@(PxvLd6+$T
z&X+yRxw*1(DaNGuqL*?HESS@4b{?lcsMB!k3%g&6Q%>cUcYc3_efQKNd<QCctIV7#
zkdVijz_y)&uv4|q+Z@Xw?)ejTC|c+hByWgDGZj{$G!l_B-NTbb;Mc@m-g*9+K~eAB
zxC`Jc-+~mYeG}%_OkW$p2K&sHs|VZOZ{1!*pW_sct@Yg3$;0pv;D+x)`BXBa01(oA
zWNG?|>f#E7xg9jj#o8ergDvbEy>ZxHT(j$o@pxuEgbDMVlJ+2A4^L-o<zptB`9pPb
zLnZh&Cts<G@=08%8fXXDg(HLu<R*B3UWKKJVe86LjnzlB{?kuQ@2d6$NXoa$U^=Ut
zGxcltPZgSPtXd+I3P18mVK{SG*ou$IeSYNT=rx~D!_S)zRYK&C?yT2c)>mR*xZW^b
z-sO$N@fVDOV04gU!$?UzKP_<MYDR>w(!1Ne_vS)l?^{rRo`l&|EtBWQNT#!@zIf55
z-P5q(5hpExoa~W`&ARp%C5KxW?XaVH8)Xe=;*k84nHRQ8A1G1!7N%wFmy($=FPUy^
z@C6-`MRK7X99i}S-+>;)t$Y+BmicES<Q*3GxgS)QnztJM2obUN=`CT!iEwuF4mjVg
z{wy63^=Ci&Mo)DZ&OpOIuw277*cjx>et^onFp5;^@qqLbM@20D3<4ohe>tTmeg2K#
z#)MI2&3H47`ku?Vu^!_uYm1G_KZVCkB49`2s8K&QIG1F$(=wcbPPIMSm1c279Iuc3
zXXm8C@n#)--{EW@PBy;PEIaAIB+`w1<(YmH=lUQNHoh(|vwM~-w>Z)DrQ$pAbi<W)
z+QOmkYJY)4pTsG4{MdHLx^k5M5@||pV;j~N5pPzV-NL$&)*bfFnw-7et@rbt%9b`l
z;U?~(2X(em<&*7TnkJB7O;8%PAe`q0lV0a%;Mt>L^PHbT+=h~zYAi_KJVY^SuefOQ
zm(TzQV^^kZ(JKW3u@2FC`sNh4%{e8$uTSa)TU4CMDjZ&ZeW0VT6u3EuWex_O7lHi`
ze@6lQg%&DFlEI_^HJT@gZ3H9Dn#`xlCBS6+Zfi^e9SUqT^vk2TFl&*3&G9rMC{K58
ztp$s40$?!_Gxg^+6)#poL|i~w&=!7Uo4X}L)WqzXGphizh()ZK8#c3(%{@ANNAWr^
z0gE)yg^$f&OH#jA@KPz$_144i+<@i;@q_o1AqU}=4GV8oR%SiM{!@{Ka`Ag(SGE2u
zl2CtP%^rX(ye_bct#HiGRfMUdLR0MN8ANcRt{J=4@idqze#N*kqRWqO?kcq*Z?EMd
z?}_wNRdATE@b=6^Pr3b#n80p0_Wq={3`d|}Dw+xgVZ-$dMr~gk#scNdJq(T+f(bJH
zlgNQWXRYXS4>s5to>x(hm@kCG_I6g6KLy3j7GTdiv_{{~r}38Uyr5UI>|5<)+!Upg
z8u6z069vru=OGbK1?HETG7WtK{<f~Bnq+<t<Fw*rn}7hMgqDB-yJkj`;PaV0=bg6X
z`+}W{q8rQd+v6$EA5O7ujIou(H{EU`pKVd1=g~1V=*Vo~--<nMd@5V-`XJ*ZfpTBs
zdNw}r)T;)hXI}>UORY@_3tN$A*6xZSnJ(HLCRRhm6WueLAlE3=WzIsuj$$kyK4nzK
z90~BpL2$mS8qd={^J-sG`5QrPOR|)gS)H#ScA=64MO6rHLV+_G0)r19u(<JRQc0x0
zk{WN+>TbAT@|e<bpDyz_L>mb~w6kZE6m^G<tq9-Z$V&QzsNr_yMyK&xI`v^}i5%BV
z!(8J{b)xR{%?@Rdi3GlzMdvQmB_*m!=LKKUt=Pk<y4suiyIJX$>|T@kmF@TO!Ah3+
z_DN<vW5Fq7<~`2wrX|{Mp8HVgnCZE(n<PUN%p*+-7JRB!j#7J2NTHEKX>F-#J2hTH
z>zE^7JcQpXGrxb3aqk}3;&%v$)~%N+L6N@H4UUX0?9zs19q77orNB7ZORawy|3TqK
z4)r1L)d9%7ICtY(L3r2<EO8D$PJ+OfmOR*eIcGwF&j$o#-C>B99IKSU-){5NOsQjf
zGPoI~)}y?7DiRG<KiT#R+qZA`yl?p8`5S$-ZlEQnrtlJGjcLevoZ+Of=8y?NdbmRK
zrFk0FBiWE1c8dYJY80grLd%ag^?ro354GAzvAEo{C7`^$c(0V22ByB5kT(=0J6i*n
zj_|l<2wX1izFNG2@Ax5~B~M5RbQulDvkgOx&I@sQIFoz<c~x33d2@yoQW?VivDEcr
z+msJfxMo=L;Fxs8jT;qt#Zsr`G!|hEnV=D1^7tgoH>x|k^$v((_sl>HhIo7EQq6wz
zKkh<^^q{VB*?L4!Qt|C~ax#gsXCUrao^-F{B1kEbTE$T4&mLD=tWA393PiOeX5N_;
zcP}o!eaP0K?ix$ftAPY@7w2B)Bl-K5Jv~~^twL(nxxoV2(QJ-36gzJ<;$c#@xUa<-
zwgtqQbCz;D0&c~poY5g&g6*2-`3J>i5sMUO($7@fQBRhV_Y)|#?Gg-3T<V78iW6T3
zF0N*c6k@KtbejY78V^ROWt@AgO#9Fbr2KS@i;OLP01}fB>>HmJ#{_r5F1SUOzaN^=
z#G%(pI2dyLeEp>yy1TqQB~NgTX<womj6nSsNP(Gb9a!!!?D`<b_>nNZr)521o>Gi#
z!P~00<ZM#~HxY`!!3GeGPzPV_=AC_BEyB9?Ge&93q)Zf=Lpj(ylH>W-chUlQ3-Y3|
zGoEq+P`~@y2K5e*yyY^MSv~2!k&e{{4qQxUYjhQ4ILYw`HIK!mOo#PcH_NG6?jC*~
z^#YP(3}$0td)&N#irsKRp;3+?6dA*>ZFcrh;N60}*f<rdw`Zu^cp~t5FTTOb;!#u-
zB;gz;vayQFr#kQ^@}qf59~`1;Ax<>{?%loC1=ElNgj#+e*903(tzy2dT;fJkJ*^;t
z3=b@~t3d&nFRgAFnGLSA1Tyvf3EjZDNxfEZT<!|xs$^zSt4cUlDXA=!-!&?B5bry!
zD25lLwxbpyFqRhEX{7#T8!SBaz&CCcVuXYpx3Xz-RtpOQts?eV6OaU6BECZC!v;_@
z-#Pi7!#4g>ZU+NtFfiBfU^VjW=pCE4a#?Ly8!q3DlKtS$v<oZJVKIBVA;8n3)bwN3
zst>n{rh|4Z6Q2CR6M7$JBiH?W%a$9J(yl!|prCDkk?(RFWLa_NrA{0)vd<4RaPP8?
zch|Pw!$+_#worCCS3GC5G7f%=OKxTY@HcCNwvE4+V5bo69eJQbC7;Ud7`r><p}z)%
zL+HNRChP$_iCLD+h2!GaYq<NT1A6KqMb|-xwC;>gqYhzvr0gEenr`YhIgHo+^}hiL
zS&HtuICiS2<;r73mWJH3tUF!Vo*z@ryaT5~m80XQ+#yLb9;JCrDNktQ?@dlfeGHPh
zQ4SfiTTj&npv>P1%!vIxj;LILE=~~>dY?w`W#DN(H;~H9yVT)%3MdgtBy^A!P6<%V
z>A{x>G$GUIZmUo3ZFfYRbtjUjT#Ekv<0jgL*)%8Ie@>iaa3X)O4GuuQp!k!1<`lbq
zxp_=_uNal7laHEUmD*(xIt>NmZ(lApD~H3ePgw7qn(&xSpLE@MM~uOa(z!sEqx{a|
zIemzVzr`YO!dKIp88&9S-Qg7PB!rP{l016O#@^x80|OMc+^?mVdU)18M<bX_=sZEU
zF{03>byxNd{H&(_(Z&E?6(Rf1`z@DSKdJT$gkz^+)y}78qtJLE@C@Rci4R@bO(qIQ
zV`W{yxN&}`s;n?y2oq1|B>a_v{?6%p5Jlbe$?D?+`%PQ<h0(ch>xzRm*G@iZ*q*Rr
zgE)Bk_6JI5%908bZsq|S&*Xl5%G9_;{cA1zc>4#zQzxn2ZJ+1!xD2~zSU*w&5Az}L
z{N{tA3ut1=^uG+yQs1$y`mGjc;_~VJsTSKt3WRK6T2H~{S2Z6?I}xt5@%@zh&ndTH
zc}6I$1R;LC1Jb<aS&M+=|GM|W%!?6?g^#x_2@B7`CYuUR-iKVR9f8sXd-U>0LNLSQ
zaEVKkmI88bKq=1bA59r)C=kY{T8GrIjzOM^AmnYO%>#ODFu=QaZ#YCX292lLq6DhL
zejK70&V9?p16tNHY^PcF;qeeaRq?hMd41k+1r{~!7>c_g)9LQG=dYIPjSn6V*>~4T
zmQBKguQ{U<6GN5D|Mp9E$~Exai@NQORXl2JCfL1<QkP;W36F^^2iVb{s1Z5@c@b&}
zTPl&|x1&M;JQ|IwM0_-?q@k3W%$BOGDl^xQBRVxK!Zs1;#+|E4<#o3+yK6)OqhSmr
zr^Tx$h_#|bO*OYYfiL~zCUp|WA>{h(-Rl-8zS$Kkey#`c(_ZUahFX-ruI-@u+Eyzp
z2y^PQ6mX+x)FiSKjL`(#tI0I0_;E>42>w!pi#-t@kkcLY;eEivpV2f!Tc$UC^^}gK
z%_U)+6U*&O_7}+2O3!N8Zzykf%Q}WICg8KW?2(S$N9{gVg{M7sLlYxn2wUs2fhvzS
zT!0L&YqB!b20UVEN`i298v2<P)1vA74l<@*7_9J4$~;mw;N?l3(F;t_y17}KN6zo&
zy{eUNDwCX=t7N=U-0BY7BmK7++NqicBVr|?E?r4D_gY%6K%q_C>CjS>a(D5|9dkEx
z2W(ilyHGN7lQDKU^`ef9yi5j<>gXMY>)7|5^;ci28~4~5+^1sN8w>G%%~Y(%Ov;q;
zS9L0K^u6AOiY8FY*T%_;QFoxsf?AOmi)ED_`0I9${VAH}?yuZ!J5&g~(W5K<uX=CS
z+$=AM&6Ooc=ooH?OJo1sGYY4GwkSWeh-Ku07AM0yISYR&Ladz{86-t=Fua?xPu%-B
z38NyC4J!R@2@Rk2XmH#nWmgT1G12n1hTYEthJxIF+M_gzgTWBgAjZkyWk4vY&(pXh
zi$Ocf=in=exK?nfnLW2j&S;F>QW8$|fXvXiG9Ng8xJJfAO^5GJWgLyllq$T?0e;_P
zzynt{6qTLLc%wtMF<?Ay(+-_7&}Nk97+p_jx#=%Ut#Suwh`~d;)5`6sY5VnV<}gui
zqajT`AInt-I4A?iV~eUa7)^LD%b{|JeeNjbVq0yr5a^P^b>Ee+ouUp2Ox?V%DZgNt
zhTi|sHsm+J=oNpX9kyd}<@gCWxKSf{3*l&jkYf_e)TV*o#aG{6;pGv_K*=TG@=j$2
zSU4;cg|h?HQM?82aC{aEd9r~MPx=(Rk1S3;Y;rcFp2=y@j%|>-PFGV$K{1)UkCv8J
z^&<P&Ob);x))*0XkSmleKh3%YfE$|rQWH`4)%6XE(Dr`Xrm+`?3I!hT^f=r{^6J|}
z`7Q7&CmP0u(({*S*C;|2GSRvcJemO5HX#SmY~Ft|r5~-1W}MaR%!V0UZR|1;NTyMS
z>$EQE-K|lW?Rxr`vrvXB0GHv8+iHZ8a40cb?t?3QaPM5;aqL&8&qy?dhyi7EF)R#c
z<(DJF&Q^VA;_ZsZ@TCVEPz_}Kcp9BbkC1x_d8Rrw{3KOeC`A0Xd6J18L0=T|lO<(O
zx|n7EK%5PzFDS#*P1KrZo8Ky>=f%e%7HRkCZ+NF&$>$@Zv{L1m5HkMM8p_A~>VbQv
zQ{~Ua3ZdLz%bnYZP=hT{j_obQCSdA(_dl$aQldko_#?_)_2lF>G<sT3<PKVG##O7M
zVka(%>XyX(B2_y+=#19~tN%je30KIXQ|!2aGFl?|q4@DQ^`^Z~$VSM$pUQV~KiNv3
z9fk{{P$InE&F?SCQfVXR_LED)VGbS*qX;Jr5pU{(+9~Bn!w%yMUc%k}s`0tCW{O5j
zy-AXC&segoYL8YI_f<WH(Q7yna#Sdbzhqa5l$i2*p<h?Y7Ak@>6-swOo&Bs3)#l5R
z0Al&1z+FRM+j_1Enu1M?1g+TrLS!_K*H1u(T{5HnN%wtocG9vNK0<-d0jY$;lJ{0k
zg)AqIdORTa(mG4!#JDb12F(<&xsT#=S0Qjaq;o?&Q4e&GHXVv4qWL1&pbcHOPB(!k
zHvu>a_2m~VyX83|arv3$y-Jc3Z6>){kZZPoggH1l6aZLtWSRVrGtD6tGt2JecYX%P
zX|n#y3m;3F&Np@AtLLFc72w+tZr-v;fQsPYmg#9i_&JN{AGBKH5}ElWDeCDG(qx`?
z@0`h~$5%tN<vAV`%hw*dbemj#|2ol!#4ne+llj6|6GHA1D<XMOB4G<N?;tLv0~lbB
zpDb7vNpL&#E@AVL%g2BfsaYen{Z#&kJ{Qer$5(B}50FV0CYXnX;zTGKO#GGYblyLI
zSNMcTEQBihR^c>k%JTECdr9$c2k~?Kh$2&f8)lL*D!Y)`Bp{SeH(}%{0}sE({*t#Q
zm5sEMC5t(8vdZVi-}@|XWoI@*18b0?!N9+fmWy$MYU)MIdT&HXl2%2~Gt)oxU~{Si
zS$s<c21+VrTfFbGOYP!02J_YP@v3zIjr?xP8F(}~>^tj?ZgEd&nQo7hQphe-4~}~3
z0ANMRXr_tQl?7aglBKcA`dfT)o_?@#yZ@AB->dP*Qfo^d#9)gQq`wAJ0fPeEsr!At
zKz6mxU?hR)bhaEfDZg=UGKH?)yTSJq0NZ$Y7>?&0-<V=ZzxBIQ#9&gr=xgRR^V-2x
z?h6VU#X@RAc4XOXC^W+g>Cl&V2?ynY35a-m(k`_Mn{&!=rRMCk`s@u`jT)4D@)Pvh
zvu|$L72!!LD;b=;>kq{HLsDa5beejP7;H8Yax+Z?(4!wz=O?9#I0kxakl@zD0)Qd&
zm?7ue^W=PwnG4IpgT{2bPbl8I!c_N;vMrw^HAy1rNKn}N3)V%X!30kqG9OKWpK*3z
z$uivS#DQb`cTeLEQOsUV?C)%=+Jt{cWmu<dNz1hDQ&N=Zkql|>Y3&y*qxzIOroI~L
zBcbACYnA^l{`06n4+n7seM<q-{{_3$CP8F;oZxefcs?^E^HK)fY0$Qx=OdFgHbp|y
zSQpy2nSdCK`;DKx57}5eYH;GYrX4U)WkAT1e==GgO`#)C3-HE#9e;ht=mghJGHEw~
zL7L4-B)2b8xZSQ24O#Bnd{)8!m-ZuzF-dPxIO#q2u`ki(RfVgir6sm0Dr)VuVW1xq
zVlEg|f4DkMDl;da9;Ocr<)Tcb$M_xDMu)%;&4+nvNQ^Tm+f042lHYQaQ6H|0A;Je9
z98b1vLSV2Q1YWnZbJ>rE*0(bN60*${X*zESu5XUZb?i{F3nV+XS^fG8u;sfo)N&B3
zPE9=!d%+K>)ei;3D-hMM1er`y@*&V)rtH1mDD(6$AdduhMD;_hulSZn-`ugpTW~JA
zI(*^mB?`@jlh4`t4aggqrhNti=lf@NhAJ72bUhUPwQR(NbR#x{0YS>v8S>x4$zjIb
zOrFMHF+#gECLn|Ne1yN$dzsZdh`DH0z&oFo%$E7eg7<qjrwx(|2aDr$4HLx<K$D@K
zfB%<@u&{<NS<wvg4AFe-|2%jqn1t*z8bwm0p_;oRl>D^(H}q3;$ns(qSirNHk=KsK
z?hf$LI!8FGwV7OLzS6ZkTUMRiM&y^pAui=8NS&)De+O!o?>&H@@uLI8zd4A0e{PE2
zzMqn*fw!<Bd@q;E>aFA>BokWI%So@^qz4TaG$WGa%txe7op<@ow+a6$AF3^2Fy1^j
zw}BVn*r+?D^&TBiJz!XnL)UzT6HPDUFzTa{N{3Y#6e@guf*&F3-pn<vz~enVsjO4U
zrA}u9d|X)rcPuTaFO#43cjMA<h`l^gG+|f9vj69o;X=?r@^ltI1Wic!0}UkKCQdYv
zBH**EY(5M4uIkma!zId&wgH)cTr9X}CRT5BEGJ<F&iYxaQ@EZ+W;b4W$?=BvuaXYO
zwyDi?Wp4LOa%VCI8{V+~-6O<LFTrKwR(<%q6scMv3d>y-KS7Qf`3)hbkccZyWj%0Z
zhuw4#3IlkS+GZy^F>ueUMw(EL+2<qM9}!RYZsj5`=gz4UeVIkQCk$)V12+7vx*2NK
zUR4<b`$`8C`!FuDFs22Nq0X~C8X6j!1@m;%>ia3PWoe?tMy$34lYVdcyo=?lca{y(
z_T}yBL0WzTSyduX$uEGav(eGkb0i4vC2G4ADYod$UU|PwuZBMjOJ7enRdTr_x!$d!
zv?FeS(H$>kLWky=wdzFdBAwb%^>HfULP<gz)jJ(fk?SB+MsYqGYBXMzL5<wR&(MJH
zDTK(9yP00t?t)p;SCmJyNz1H#`m^qG>tB>4csa>R%g&Cto`-YIZ)rY$4K?myP(D*D
znmvoqjce~>xR5-e8|GBpVpOGbWjF*SvthYG*PV-IufdnBRKb|NKAA9cjKuFej6QAk
zhqaK^nleGwkHxmrSVTLg^+(S2GaOH?<cTuV?>^v^&Jo);`2ow;M&;#QlBv!7hvnTT
zxkTg{aoqaiSjqb7<N8)8=bv&avqLR>s}Kcn?IgVf{Hmv1iqi0F+!8c-NHp^}48#au
zwqd6y?27aoZU`MaOMY}{?1|%KCb^wbS2m{DZc@%V$&SZ`SORe@`|-dlC>|{W1d2yV
z)c#p3-hhOJ(M<8Ej1Iy*b1-N@qZD{Vh!H(JSZFQJ<{d~Jhft8_!Si&`asv#dVV2?+
zAHOK!K#d1t180*D$|RK>8~+<%n-WQ8nAKRBof@LfRXYxb{{h0;Ha=`FMFZ_`b4TJQ
z1|#JuK@r7xz(!~dKTD>0Mt+D!;JMTN6-I;4jK;5w6xvDgtwE!S<|u`R)QfNzRxB~$
zd{3fG(5T{oPB-2GMWBRzFr~_oJ}@-=zM`@nmBLZk78L$5m{-13<amc}fB(Z07MU@8
zw|l!<4u0!wcy^FtGP6uSZQmm{BjliIg20cqS{NY{=qC#jFOfo@5GPsjW3K?L`EeYZ
z$;{<s`X6VVwH2)52pdQy425lfoOsl@k_P!JYxa|aldSCc8;{`tl|P=q^E&AHkpgE#
zCB~^DXE`A85_(@Md=9X%Acj)-iNR+6MQ|v#)^qNf8~nrh(cw>z<*el0e>TaeKHH%@
z0;kkvd6144S~XcSHH<^ixlmk&orE~cw|e|!%+Zs1RIY$Rvz`CUDHpVDIX&xUEWm&t
z);$I{7*rve6FQ&f2yq(^i^#n-)vjpsZ*6R#Z~XWP#|6UH(YjfQ`r`+Do`x9+ZsK(1
zHdsOZ`28OrU*8@_O&fx7#B_7dGWzQGW8i<Z`I~5S(2Sg`>B51!;xoKROS(3J16<O4
z`2BCZrynZ~<7#a{{C-^h+dFd+^mLsvtWW{3X;T;E1~2d@w(;*<`es?c9WtPV4?LRo
zzm&G>U=MfFuT`N1stzHroFF0%H_&A~_o(mh%Rnu73O0yaO8GoGFMWWnGI_5@G73GT
zh$gt<E_3^Te-r~I_z4#glfZyrswI2I(QJOj3<mvD*6cTL_MZjd+CiIiLgO|dL;js&
z2gWW8y1nhMVL{h_(=t+c(217V^-mGsieGU78PXvX;8evcNmR986eF7FjZez`w_%6l
z>gwtkz05^X_ZpEQxs3xcs9$l$!?x(|h9+XV99j~tXB+4stgp54FUinrJ;{;gyG;5S
zd;}FaIk~5%o#?WMXNWO8`9k~@vgSHU@NlNpID+BxAbHFO&Hkt_-_)1klJDFT^5Vm?
zQi9blVX>cK3g4Z`j%7l@+&^l^6=d>+SPuWvIO2~wvK(U@?D*O<txqC%goNxd*;T$G
z{r;^6X;=3l{w$6I(W-MS?9qtRK}!?3lo(-teytL2Gieo8>H_YCHDY98(DdJXg^y@?
zLN>wh?&I;imz7enGH?bK(%61d;u?=qM+Hi(wHe=rUIq33@jq~!tLO=YA?=Voi(p1#
zp#Q9Fi^O3~g-dY;Ma7;{Ziaf`kT1KQvHD91B!2o7OSxac5Z{Pxf~Ewbkts$<K=s$y
z0Q!9hH%dJDLfBc()Xtw7*K74jIZxIffqXjDU!Uokt|NmbftmK<cu$wiMfcum7mE-o
z&Jq>3ek_531@)5=lu_^xF%f@<ON4z2!VHzwS5$vlXxJXBUE7G9e@K$P-1xW2{zro(
zkYGodo=qe3767{fiqDP;pjoy0IFKxFqD`M)duF{FZv1adx?3cfW5#l46s?JGriC}R
zktE6?-RFey6A)m5R$RZnbiG?5VG`yh9DG8&H(-Gg$b}2@pCAB5jJvh7(+0CM6Aj6q
zkk<cFlYBHahDL@VNvfbgvo(BtgcFF6Dw!kbZzt>50&s%xmD+%>5~!KDZSaG6bSDv$
z912?6UE-qi@71?pYWc*wpz*q=;95wGb}F&591OakfBL+{jm2+R+BK%$&$_rK5>O|Y
z$Qu+-p|}4H>SoA6y5uIHNb^5DYC#-WO?!2wdH2?JK<|-~uOkbgEN3WpkB7rHaL50x
z`%+Q=WLe0e2EuL%_!)>W{+~Vhh`qs%8w-zHu06gAGxeYrszM~<fS0;FdG@cp{_|GR
z`hnYj`Xt;8xHT3gb>TDW6v-{F_C&irZH8)3yldlsx!wQ&5`$Z8Dbndr_SG?(q>gtI
z$=q+Mz<QKmkobS}yw)`1AKD@ddX1v<i2X@Q?E6=GTlfrz$&Y`YId&&Mzm?~R64AHK
z`43cg>$H!+?WW5#ER2hSLu2=0@A|B(wy5{CpJ$&6^M+#(jp1HhU%r&?lLaYpR{fMD
zpd~4~P+vQ)=Cedy`)&VTF{q~d`m|TFkCWjxF7A@LqIj`V(0HVCSiWhr>q(d{HzRF)
zjyCLiOU<qRRX5>!o@reJ$_ggpe&Qhv&!IgSx2pRx**VLvBko#U-up&d&`<*Tg$>{u
zXp3;N{hOA9A@u@kZ<`Mt3~tH{vlM;GoR*GF6rmxXjra@z#vJ)h45>Y0%(cGgTtmhG
zL-?qVo>FN%=02-k<I9kf>&wP)ZZL4Gl7vrZiXvh8rf?3;l#Y$m<UI<CD>Y4oSgNm9
zUoGCYL0HRQATvhkV>HAb|83X87F2&kxKzs-dwZH<%PV*6md#fn*kAUVP^D__xytq<
zx))?;18|kNXpZ17MfaBPFM7>wdKf-17dzl%uwXiS_~LRG%qDSrb*1HIU25H0J0N(b
zWR}=T-w!>0*COKY{7NXWEYrbWd9!*Zeo7m+VmDXkK8@YiddlroWIE?}FeQk{7r8tC
zgPdXg>T!-4Z%V1%wd+`R6M<uTwY&V%jsSC#KYSM*L$QAi-4y1_ci9mA^pubOh@0Py
zOR43%33v~K&*JxP4p!Gw=vJ|O{A<%;U$8~iWPeCm2ZqR@&+GS8{_x?{CS&O)Gda?W
z>CRu~aOcTZ86Zt6j!o=ElK5h;w6TZ8E29hUc27(w8N^hpt49m!M!0alFNs+83H=l9
z7-1pPIC0B&w+5#NHa=o=&~0*ozm*&~@y2DKh5Lv5WPel82`|4^!Imkp=RJc(_c|%4
z7<je_O!M;V3IC?*zc$A20_le2B|0vS-<11nmH%5TNbkQw_^%MwjKco~P4J_SO^c7{
UHm7GLfq#m})Q;YjJ>mcV0BBC8K>z>%

literal 0
HcmV?d00001

diff --git a/docs/source/_static/img/dynamo/torchinductor_backend.png b/docs/source/_static/img/dynamo/torchinductor_backend.png
new file mode 100644
index 0000000000000000000000000000000000000000..84e37aa7c4b63e7120ff77efa0b5a996e0d87ed7
GIT binary patch
literal 122529
zcmeFZbySpJ_cksF5(79x4Bah5D<L5bBPk$_bhoq&ARR+@2nZseAkqvqbV(^lrywDn
zlEUxi`8?0_{XVgN>;3Ef>s{+!i<x`YiGB99_u2cLeG#RhrbviSg@5PH9YVO0oaUW7
zSW0*9U=!fp1Ku!q7gGR#Fx)j2W$u&@)BL(~hv5!fPFmaN#dZ$%cb0C-a?rpsU9ea?
z9x+XoG_i--T^O;rh$C+Myp@|Vl#oj4YuobfE*>#U`@Avc+|hB%pT(@Kk9EH?e`N-2
zy^&zK@zB+k7}q?RV7ItBTZw-{MFs(5{FkSMKZu_t;k(cAxaWWQ6&w`#8jE-ymx%%M
zKc8B5hM>aV^{?+lzQ+B}M~UH^#D8nWq~V82tYjYu!u@Zp{x;m*{~QZ=^Z$nXFU#`(
z7W2Ph?EeHR{whnI_n$m`a((tL(>$8<0+qg*_3tpQQIo=iZxK8X!PQP`3nPDFE41LV
zQsU&d+fXdJ9LfA(TJ6zOEIFxvc|J-3ZO9a)oxnjtU_IZ})S+161rd!S&q-q&9et}*
zAtU(n=zoL0)E-HK)aa!}30>st^tO+s<@V*R*B3#H!=mdn)zjA5QEC4^v&h#Gw}uI2
z`dW;oW8&}WzEe9%biDgBSzV|z;{M*)!bih@N6S|#3_O-k2V>Jr8v2;B5k!x4jXlSu
zE{P<LHL6D>FSYAh>@#q`=d=y*wmLea&#kN5FhLpJ<R;CUA^~SVaE<^klHE(^H`si(
zjI_Qw=oN0-sWUt_0sKmk2WFoC@63kns)0d|F;$y;_vsw(5mKcDB*kqgV>-T|bsywk
z%e2Ym$Dv15Oq(OQ2d6CJ=hOY47t6gR$5UeX)%{E>Tb|)=(4z^x&KNq0S4({ozo$B5
zDEOH=Gh4j-?%o#%b1$&vc#Li=mQ@eiuf1^}Uf7B>Ffzo<3%K}@8F2B75Xtf7aDdmP
zykU(>5Kxk5PQU-@yp7Nz!u{y#XhgWisyN3>pqD-nF*#d&FH(vZT(J^-5X;1qf!1o_
zd%0S7@Z{z^KGUggn(BeE*R<6;!cYBd*=2Lc4?=)^GFU7Lq#Z@ca&IrN<`_QBy0o@X
zioUcwq0X%iD-`(sp5MAs)UppOxYvA?m1&*TKPa*i^X&;1DH)`MDyVw+@qlCPxDMga
zMembqvk4>jp>m;<Irov+E{Msr-(Jg*Sp@CG`0roKOkY7*xqiE%f~Rxt!VgbI0<Wt@
zcTp>^^3UxpqOdC3vMu67v+Up4<oXVV+?TXvd*j+c{JH2O0KjN)f|O=x)-WW9j-g`i
z<v^y`F3RHVdqbC&TG2npV}5T}&$G@hwmxT5T4yjG!AVpfKKt3xHcWd1ec@wP{Z$6!
zE1x!)x+w**O>CpTIa751!dd+A`^!GYhzwvE>$|g%c>&8&>{MlPbHQgmZ5TLm(s@?U
z7jHffm)#Fi=56_2*0kG5NPB(qInc?u_0R7MDGhaI%zRbafLX6ur=gN(@8keOc{>2C
zqlfPu4_}Bac@S&5(dg1E;`D&|PhYqaLVYX_Ro3!qCtu9Mss{xO&T6-Mv|C<a&bgm1
z__=ht00O;}F5FOP*eY)-5W#b9f3TIL%)`GDC*~}i_1+;`N}X6_<88=b5Dwkc*WY<J
zSL<twH`m_kzZGMhW7~d2(9^}>ML4Y5mwuX<!S<-bjg(RbCQ}ot&W@Dw#JgH1Z{<^$
zIQcGhcRI>iv2pRsdR`!1CJ%E;%JT|=_Ia~OWA8^}DuSx|;~(iD`^0Alz4SvY$tnWi
zKo-do!-6$MdVkBcEC-&W-KIUE8t*99G`{+IFGJSlf(;$wpchOP!;iNz0LyC5??-8i
z11f3x9-B6MF)X$x=ghJaBN%tS;}Yo5L*-DGwzb`j@3X#g7_<7;Fe7j&TyR`l>%@HW
zyESA31nis_<`m<25L>p_myxvE2a(Kie1JQ$F)4U;G$cF(XV0<RZCI!4T#l3RI1|7Z
z3q(<>h#z!WX-`&rww;uIMhlFoi8+n@&i_cbe$84PW|$%FZsOgz6z2^XsE|<_Tr7BN
zP;g}K*V@J_rmsg>b9?->AG5Lh0qan&J|0!RkK6vI(Jp!2ny}kP^Uk(7sb?8(iaOx&
z&Q-XcF=v96DEkZ60Q-nt<^n%z+C?@$B~Xi?I;hV3Q)Y&9_7C^;b-XE%Qdm<QBo2K8
zQ&{v_iS7HtF-(y(VKChCBb<6rQ<*!}%hu%DS$#R+=5jH+6#$n1^oX-(G>jicGgoze
zv6YSFMWfyg9a`zV#dwZOG5ZYMsMm{UYAjV@zC7dDe9)6fF>r-#wwtQ^BC3o~i7Pfk
zX@BPhC(zps6B4g}-<Z+wxiLQUq2B1!YRl<@pq84bl`;6?35hbX5#=8Ga<^!4A>h)f
zQ(R3R0PfK(aC3bP%lm!?&AVUE170p1Uj7+u<T6!Z_G~TaL1DZjh~PODM2esq`*rKp
zDsQ_Z>7U3!V-F@GM&jw!ab1(x(YsGBHr_ppmY8Mj?+u*-i9hk=y4-6G4Dp-zRKGy=
zKL0sqMBB7Yo>LuQz(rrcsy@ZuRgTp|LTfRC!?q~PdYFFn@5>1K0KUb*n-=e91~qCq
zlaGVg^AG}AeoN@Eua5^{)dbpK{4!!x>kP>;Wt6(umPFFjS~MaBW%B%20rZs(84Z$`
zr0wkjIQs_boT?KNxH((p4V>0)g*ncI8<8QU9srxfhsck9cSxeR%26F}ycc{4+eNBt
zGTv+nFZftENe=2~9MAGw#PBP~reFWAZ_PZJF#NpONVy#^Mrm<4#N(qWttWncwvt^H
zHMMT!J@09q`dsG&q2l9Be3s}Mliht6z%bTo#?%}>){L>8>FF>QG=;T#kJjRAT`AJ}
zc3u5sf5IPfk5FK{EpWdbCw$%-)^_&0<>vbG+szHI2Qg3bF3MzK|7JqM%;QAYGB#Qt
z1YkzG7G|#oh#h~Goh5#o6}JC1UsjnG6N_9R>3Pfk&b(;|Ve`=TarL;WjwDg@DcEZ3
zh0vU<tot{4@<(P2D?^sasyg1{o2|GWZ(d_1#W9ZrNDpplq~Q^Z0;Z|pjB#@@*-DVy
z3#`2ee%j>IhW!o@A+J^HHOmq{izhN>w}^wa?dPA<#XP$T-(jx>UmJJ882#b{0hdW2
z?YgCjyNgsJCVyWH^Zw~ZzTZ9g%LCl|E?#nG8^9T1j)N{6t;zsIq++=PlQ><2qC&6U
zf<0Bf(Rr^`d`=;T!tD2q!{?0@ub5L{pLRf{5AbwFR4po7O`7QVBb31l{@C>ejaVy|
z!%<r)#*vjjLD~k;%NHNmEv>r5GEEO20D(XbD-fzzHwzla&Q}cyFV5}r+u1PU=8cp!
z<xFbN=P<sGYM`d2lXhxYP1{nr`@oe9bCk~aXX0IM&Z;Svik$_&z4`CdT~H$*BgjY0
zd1qx3*M<u3^UXD}MQF**XU))H{$83fzbLiFsjqPWgj@P0jsQ=7WRUvPDmWF#nKUxx
zZ1za<3)NkYy;?-?XJE5f+Y7w$qkMj1Oxw)9b7w(sOGWbh=lhf740&p&nrC(=-DGBz
zH3K|NDKA{6$;#0en`^)#oakHv0g0U3^Gyu|1{|{J#`=uk8p&P9hmx7aK8P85wtQFu
zWFj7MfEI@<hY{}+GTkTet#dpl`Zh!Hh(MIkfjq=vR0H5Ys6Qr1C8gT#(JGpXsf_K&
z*tc{;4a-g+YgKe_Kh1ghh4ZeB;5XBH9v3s4R9c;SQ{$@TXs)$S<?QjNnY{zY^>6Z7
z61;5rFPawr@VtDtEq8z&-F>qDT}z$Y<NR~$_2Vj!W?-ip<gTAH2e$AynUfSxP4@1h
zs+qYDuZ)614-Cx0$euQdcT9DFc;__z2dyHrd#=LI6meg)n0H6`O<KfDtnEDMur$U;
z2Fl%OBA^uLGV(tee>V7b(j-u#ilZ}@2xNb)aB+mkmeHmn@+0)wo460fxs5JvEdV+b
zv$G9xotq{M98R2C$N3>8On_OF(WDl>et}-A+Mfm7%Amji%mQkHIC$H@rZ3hfM!z=$
zq1eA4cqWUDR|5Y;%&<?LjNi1B^=J`sRBPZl@@5Ytq;DzkMHI`&lRUbm%1^)N^}(**
z%YIhsk4y@o%a3c+gdcJit?kq=F2vA?R-GU%OE4y>1{-#Qn~=OsYMP!{kFAR{99eQ|
z!0@eQ9Yd#aQiRaXo5ri%Mx9iri4ieUhv$CYZ+t-q&U&P?LHZVJ&F++fWm&K0J359q
z$WI}a>+jQFd>n9sN(7>PnBna{P9ij?$7AZZK~26(rc+fv+gdP)$999B|1Ko!IS>{-
zRs?pYbv+xjrVfah2}OcOXV&FejB6ZYOn~|P&}_{pc<IA2Jw~3C%pFZJ2dSHr#!5xh
zhf0lVWk^uykv@k2yb{UFrPeC2+VUfsar>h3vUrp#sBqpB(|Q-M#ZSfzE7P#YdSyLf
z-MBh3`l?NCWNl&hOB1U=NV{C&1(QN7TJJfxA@#tEu?>0*0y~So`B<CW99wjvr(4v(
zK;HI?F1jdXu~B6~B9sF}+Z*s_wz%LYfO;M^ctPq}QqEajZW{d&8|Y5r@EFrS1{F+x
z2i@d^7H}ipxgJ?cXtlt`DZ?+<tibE@Y}b9l7q<CfRpz5^fW>nY0~rNVo7bB-vBAP*
zHL9{K0%9z-&R7L`ECg0YXk+3CzrGcFDNmTGt20fW3;Mh$fn#HKRnfGo9!=EEo?yzj
zl%+fwBtPxnbrPE)gdY9yeGbi#<xB3e{i^CRvzVcSVc%l9=cY&VwFqN*pC-G*C-JK<
zk0<7*_dI^H=)QSp(q%r-c%TK1@ERu}RytG20p_^V=-9hubrkEUgsMUV`-W2`Lup89
z1+Vpxn%E90!SU=t&;+*p+k3!ntL>4%Z_`^g&vUk)G+M9bOiV_b`$ZT;zwoOlyF5Q9
zFYwwoc~*5*Y*gkhyj00Rx(RXdtN5D?%tdtljeFMIH(a8=j!(COfETSIXuV2gPiJR(
z#rB%V63F^=ACs-G<wwv(&k61%318z)@LcZHn>;8@)iLPbkEbBF+;byuN0FLwo0DTU
zh07Iw2W`IA#)ORu1N(mX3gEMw1}9`ckSmFr0uJ6;PldN}qafU`iXNXvHmy;d7szs4
zpLhsXO?25;ZwJ>3Ldo1mB)lyvQQtL{Q~jg9j4Oh-O#W=@67HBU1anbMxZir{o!i5d
zKQSQVc5>2Zpd=X%2j8TT<sj1*PXYeZ^FuVS|5-<K7WH|mn_}*OdcMT@iU|n*v^R<%
z`i;kuvDyUEH*t4$X>)mlJrW26Iem@+z_uG@#Ht<B0IV2OdmA7$?xZl|2v-7l>_G|{
zp)xa;a_CKs4>4LQuJFbx==(rem_Yd}OTqS?I}qx6mNIthH*P%}t&s|Ro5y}xIe|U1
zJXizUa%DIR_sglt`NzTUFY_z-A;hbAZ(0w(aGlBu4*_}E^vhSyA_q=VSke{7ra{=6
zi{$g6ip{J>s^`2Tf*AE~I6oUQ?TRuB-c~iwA|t8vCgy<*h;Y8?Hg^O({QMBl)(n#d
zH4x}Nc>G-jciOlbw2MjYj55<Th-u&gZC6>OB{+roP{MiSjeoh4^Wa-x9xuOB?(({-
z$`m2zbT$gn8LJYgYz30kp=5PQyBEHzWSyu)A##Tzv(9k&1JTD4AJkWtl{0%;*h(F!
z{KapR1|U7yZPCtc9G#ijUCaYAF!z4ebU7aBkta6@4^gkRK1^vL-0LvSp_h)3AS`dT
zT}n;X-pQEDh8g?POo#FYHr;xCZFA{}OKeOA(0#W2xd@ey5k#~zDGi*H*l$977k|h;
z85UWQyP0{8Fc+CLH}~%(6t3B-B0RZ(DIMOs_ftZxD2&>r+3l@!Px!jc(4)!0x%g-A
z6y)3Ob~JT9{-{ydA6%h&Eyjdl-Om$h$b`(**~JJ2_lz0upUucFX~M=PfK;z$jp%A&
z^T;0nvtp*%whRo*n<%M4T9inxhAl0|*vDqdF2Y?q{anMmfNuf!p<RLT+EWRfiwko4
z9zj{_+y`U}ewH=M^FzTjrPyotVHf?u5y*N6CxuJdAltEvo@!43?F#0Y?zsZ%kmE2L
z%5-{7UBS-<IadyzgFu+LuEN7xF$)}X4AJ>*CvIK-bk{FSHguWsf43<AcK$Zm%64tP
z@W+Jb&fjBtSDj$`>;TBv6@$KI5bVdt-=oKoB;MAk1m%)O#&<(NV9Ati3i`mSgD_Q>
z6_3usa>VgBMI218_f}n|?5W-OGEYnC>w;xR>&Cze88O>H2u~<Y+`u5lfgqjE9I`l<
z>N;u<!MR_P2^U7jG>5%XN*^GCYv$*?Q;g?<ZTxi7O<RFKdo^QM{D>y2j_XTxP--Wz
zyHL%qqqp(3l?WKRk_*DRB}Ool*{@SYIcsL+O!nQ?4KpfzECdFzMs*KyS<WJvEg3f0
ztoZx0o=2UMCx7Gm+$;bQrV%X~KQ3(cGlLed7A}mbKar?<ZL7$*c2AmrK3Jp8ypSYp
zeO5K|X={Vx(Oh=(fZkld$+&JJ!|T2zmge(|p7UnVPRFpLn7juTlM$9F-2;>~<2GYg
zqCTsGG>05RRoFY&`b(lpX+5d%r}`tpi{{W0uOPf~o*kY@v2suF-lFG(eld5tHILkD
zzj8=AkhqPk0mm&vo}X=;j5szV3m%T3KQqi!JnInb6XzZg%&0X+uPE)sWXI5!O?%G)
z-i^B9-t^AXr&e#iv+cUWGQPkNg09qJM}vYP{?g5>GhLT`TXnLF_eGXo8>z0nLXcG7
zGdSsH8l)Bx)4l6i0c6&9Z-b2%Mg%52iC&1_-z$i3(JG(HVCuXPX~OP1;V)*nji?-R
zoa`->uw(kwbPr0Xg4lsa`O}a{YhHAR;qmjxBQ)mbZQ{(@M|Sz`qR-!Y+hZnrH1R>q
z38su0p9ZFNV5Q%GxRRrN#+NjNS;>Ty>e_A2F~h>RyA@xC?e01nkn7rkr>`Exc8L2R
zV@4;xE!wY-=V}K)2~uSi@<wlifa+l%AKWG}gY_;B3_>Gg9{hQ)5;xosfAVEmh{!7I
z+60Vfo^(jwhj=#FkHO*NbEvE(?tTNh5RawHt6e7S`hAq0n~5`U1jexi*W_HDjFZ1j
z-hX`hUC~K|oQbl!v&Zk9P6y%<Im_UW8nMR<8u%hlZcN%hX13G8!%t!<r#xyjqoaa8
zyZ<m(KAKvwq|ZNL^^Jz<zB87bW${VLyEuDWucl?Q3S291VO4}XC6j^0kEH=2Gx`Dd
z__OZQTBRQ?ZEA-uftRyIW3zt{?)NR$7I!=R+Q=Rut+$sbyD!3H85aEgc4~Dyt%*7w
z?&;;3zF{vS9eFt6gNuK)=c9(nYu*6n<>7JEImddmY;j%|3RZNzzxoG~KJPXA=~rf-
z0FPcj!-Wl=*Os*aU~P#}KGFwkRM~FLBTjB%-kdrkxJbRu!yWgqDFH#Y1KJMroj&Gx
z6^DS%y)I#y<NO@EFi}Xb!M9+rV%s>ZgP$q?a+g6Ue<Zlr*3TmsIMB=kqV(tmN;Itm
z)UTOBe~BOCEnm9@{~*h|)Zmg1nR)JQ6tf%(`^NMtbPS(fHO1@FVo!|y#rJ*b-nk^&
zi!zWHTmANC?K@LXX##x7Z7p>PPfO{Q>N@E==F4Ot_?dh4KHNZ#BpsuAP23zxVr>pZ
zB@@)|sVr=tQEUe=Em(4@Y-WD1NEkE(zeL&0G|Xn#s=^Q%F)8{R^hN8wMtKyxW%^Y`
z!j^~fY_D~qc}pDAE-FZ6S=^B73_;vtE=HgrL1jj<dpg*9a__O{kFmD~UlPWdVf@_1
zzgYNyn<6+O?yk*kJr!w^zn)mBShgwwe$ZTvYcx<Gq+Jq`W1**2q2|UdZyLg|(7!y3
zp`aoDt1$WTbWC4M57`p3Xr1U0jYAiZCO*CxuF)K=BB<1}ahtVgyu;IB*9_74eYE2z
zl!S?OWZd;M=E#-AjPa`Y$)68j{TA85Oj8ilBXwyx+2gu(La#R*p*YYsc$&DKbRvXW
zkAb;}B~_M4l*PbR>umTEbF2Epqkfj;htFkIKzU#!61T&7q9Yu4&%lS_@yq+?YdnZO
zUmGj`h}p??%4IQXa+rT*xYb?c>F2G--R|4?cj0Ld^)J7;HEY42=h#TDPK@|voErD#
z5V>KNZs~xIWOs*ZV!?~`(v&R*?>`^OGf)HGVOqrVs^n|Ac@x#k%@4&2NRj$bEaJG8
zyMw_Ux@SuKa98hr|6bz$oTvx%_L3SZ&4;9NcO~t85p^pUkaWHMs+B>a&&IAJu9;_p
z()yN+dZ=Kvr1yj*9%U+{D*FC3dZv|kRAyQ{o|J7D@gF?pQeldkPxL0(epPyi(Ywq>
z`h+D`Hc|{d?OUZpX8hB_soRnlS(`Ac6AJPh&8q*L!QNC*qR!B-#Vghj>8x)-l^`=!
z$1&i}9e2_#(k|~d{#@6oBzis1LjQ~b?5?6I&0tkLt_y}w9}F;s9uQQc<lY?YD4G9S
zZp{2?m2m0<^_Mi`8RF*N^Bj${sq8uJ<FUeLX%?##Kh|d}svSBNI~78zd;jPa=1bAZ
zae-13Anxk1{0I-JCF=09wYXCqfKJ`<LHkR$$Y!&ldPlwLbM&%0gy2%g2Nrj`hks&H
zcYSH@w!seL<1_DyEO}6dKf0X7@?O~50^i|znMZYs&q~iaIweJB*5<tLB=S$HVn**K
zZaX`Vqh^D1@SHb_Bpl{9cF~{VzGqn(9=Y{W5V{)c`LizSl%&9Z!VSJfQXUk^EqU-I
z-B_SJ-X@rW`#AoT?8BIwAz_P6m7~G=+{gyk5&rE(!0+3v;{`g*p|=IY6J6F3K|hI;
z;(yhtk2UYsOM7?xi0OBOf@xKsC>l;pujnOeOmC-xnW@C0|GGlTG$L$e%Bt-+Rla$?
zs1VI;jdJFMBgsvS%g<JJA78%wrgh($1@WD;hPpI20jXlk9)m5##i{wDH}rAJbkG@P
zms6y2+R_UcF(gz*431@CH3-JVwd}LZ4GR(*X5<R3+FdO>Aw3{b`<RVK$TqG&nED~2
z-$zHwB$w6OY^WSIB1@NumHv>eIltp){AJesM8az3)~<rFJa!Tg89XK?t-3IF|5PYx
z+8F24rN`G`1L+9<wmz}aXm#e*zd~#6od{j-b&t5r^SfX1F77OOd^7d}qA=xY6GY3q
zyu8JvU1Q#pb3$1ee`F=_!y8Krt5l85D`g(rP@fV77HMD~!(V&|+lLytw)e>ddj}+u
zu{ia7><rL}3Vcl+Y>1N7i9-Eu2vfy`(SQ}_Pch1=EywJ8k770UtjpSi6TH#JQ+=i@
z%-R7**^6>a78BkqCl!@uhFsSABH`tGJm?iaVNR_S!*pH|?v)N<dQfl~>w?MwUNxK=
zTIk)~($DQHU{Hy%axc&dW?ZTPJMWVud$7MUA6pB`_Cj|DED&fp*HNSWQH*4qs4vUq
z8LKVICSX3Lhp#hYky?E-q35h=DZdpkf+L!u6tc(4o=z*tmAG@KSK$Vs3DlownvhZQ
z<}hQn$N?F-yyAc`d~ik^i;#_WJ1t2wnU*%MV?-`*o)9_>NAGeRNuE;AbOvKj60)%&
z<T9sNGt-ZrneqqE(5bUQC~Zq!G~0S3@ECWwwsXG<58BEm{)l4yaW5O>=FUFw8>@rx
zrTruXC&429&doH7E@>W~5Xr8-btdaLX-fQqWC1S_LTonjMNHmEYy|DbR9AYP;IrP3
zEFV~QW-NrZKrFumMY*ZKLR)`wPEathaQ&_5_<`le-Tv)UVOVOLOD`<U`wBvht)Zl_
z3gdk45t+<0O7T5(N5)c*^<|Gc%}BljH!1Jj&Z|RwmSe)}LY6@idJfAtHA66g=_U`_
zC)cO9A=z@0_}()YAo_G_>nzN3l(Q2X_zf>l_`;G_N}@OMk6M8FnS^ZSJ#jg^dViHE
zh&KvnZt*b2Z^2K*LwqqL0k3t$>O<L9<H-_Fe~Udi#rpJnqFTaJ)vKFi{5_-cC)1Y6
zk7#;M36V<6@6Mo>de{tadrrl4qV&wB!uBZJqx_vqf4iZ+9-@1q`2*PZX$kxd4gJyn
zi&_hyu8y}18fIoGtZZZxZ}g-sxyv1N6De#CYUjFz(`0asox^hr>lk$ouj`bPriL?^
z6i~>WGqDYWml`ffQp?Ke*}_}tBdWc%25{m6NF!>}5ya7ji>5DE>Y(ZRQ_gZAUsz$`
zUz=pcJK!C4$~*Gj!40_)rqqG>ZgbM>D_(kV&y?f0lI9>+_FZb?X0c^o=cL*sf4YhO
z-}j^@j;bUBAFE->wcXx2$nO!D5FHm59R`{62Pd>gFpLzifPxF}yg;Zx6e=Wf8)Eho
z%}%4_gK|i6yIGR4^L$yDAV@FCx6VJ9JJpbehbk;aK`ht%6@sx_8hsB1ph{I|>6e<G
z7n%<m|5O;~CLu!IWcT$Es9jb&H*T3UnF^b#?($s2seBvEFLDDs+X+cf)NFTc0cWX%
zxI>|d=bzG*3t6`PZAt#jL+tINowH~3SNd%>c(_|6VkIG$SqJ;Ilj}0QEW@$c306#B
zYJG+)9~5xQj}|bcTy(Xuv@=-7c84x3d^^QP7q#RY%Z0*2Nxb=Ewm_JcMlz?A7C6ee
z&*W&!RHjtB4B`*gTKb#rfQai;R@3uNb8WU=S6o5ClO*cmKm5tpTNGvv3h}){iVlee
zEmFaa3RwfxI+3`3a2-QjIAQ>?`>t+0jCaQlaw>Zf_qtv17usRq6?(eRwY54@5Q}t;
znvi^QhqEig2HzWeDVFzeji<O76KP!UAT5(8Gl6Z%Xz^|H-dfMg_@ysy2k+bB0#};8
zU{ZS%z{Gvf5Z_9gRf+hsU)6fS5b62(42GI<azEA@X~^;>38vGta2Mf`8W(g=6V8@~
zNi(rY&!@tl80g;(&JRA}vPhOnU1kA6S-bHT$0daxm|)Ik$EvBC0osUZm3wZDP|an}
zwUMiZEqdzOrb;FK#{Ra)q(KBk6|1hW7O1Y(sqyQ07~9OYg3VCY=u=sJ3e$z$`U>jK
z!ddc!2c(X1^AEg`vNGzLhE)L-{%aH4DUkY(I<)I0QcA{T&gWh}|I0rg#tOPGz0xyS
z1l<m-4BLcfhQx#Y2toCzuX!wEwmlKuiH1|y7VAm#>|Q$jpa=-t0L|XOy(J!oqXk|o
z+V~Tq;5UYPwhU4NAFK*GTI&}B#0=W1d{mXAl>H-VQLU%~**;86FW8l=^fcIL`E8D~
zpMP+eA3^ew2Gzzx65FfMX*d$OBLltRE|nU6I+4sG=mTYHCV|kb<oL7o&aJ38&oA@5
zJj*Ggm&_mWZe`@CtT@n5noc%{QCB=Y*rBU-$V~Yb^%kj@6b5xIZ_}D^Pt?AT#J<=W
zSWD@~{}dzC$V2`;3R`Ug+gp=5DHyg!#*5fxi!9zQu^9g;x#in6toCFPVPYhmY=b<<
zzL9bBmMMe0j1*1LK}@QGQm8&A`;nhU!C32(WVbuAdj0P?rSn|;aR2;tz`k3CBE2NR
zDw9_kY>pbq5{~yHTqZ!;p5|C0y%|eQg~Bt)!_ZoKLX1LPo`N{5m&6yp&w4s)DeUDL
z%mlfPUzm1+^Ly97tU2gL!f9wG>+PoT1|IA3IX|*iPFt7l1&;SAn1c!nyn|d$Ih`NS
zlX~gvO<DS%r;iV5Fg!8&gDA}8sMI@1O#t$K1}`s!7{&L_u8{dm4L&(9YkkM}WO-5@
zg_A@S>!o)w7}WRGy<c_~<aGO$h#&Sbhr>e+GtSNi5B&L=r_QUnm2E$}6lXb9%#zz2
zDp~aG!JA%QEkwa()4Z72_!}ye)8@(924FGz8r4WDgSHE3jO2|QTjDm14V%}acfi>_
zZBQz|pB976?HBxkcm!;X6Sm<Wl<qH0<L`NKHCG)&24Yo-8XGm9-D_iMb<fKbhh;o0
zCiDBz(R6Q8(y#c@3oNa8m-Zna#uU~agO}wlilffd!Bpj8IXsqhAiUkk0=?qe>N99z
z<e+m|DU%cK$q><6X6=hzLiBSl29@D%GMsNSR#Q*Q!n~d=*38bwm<nR;O4d{Dy5;ht
zYhg6=-6Bv+$AChzJO&MqGErXT*LsswL*x<6*VYlR%jKnE_#fqghh3*rIf@LUmzb4n
zc$(YU@AzLhVs&9A@umv(g0n5^rmdA}gqFkh7dkP6&m4TU5-*M5N5>;hZ(Pb_`jC97
z&-svBKXbi9EPD6M7Cj#)M$gZ~xG!Q={h%jJz)=D7?NLE9Gond-`z@W%QqVUZVX?IG
zK#$_a_K27zi*lD*KYwj!Bf+H67EPSB5U+Ogpl(ae$6_#%iXx>4`fPmqcYj-14e*GY
zhitWO<I|O^e=u$xVeP%1PuK5Vw+<PPaEi-GA!Fv&Fr2t{H_GBt&uui2n5e(Z^RACJ
zMwnylZ=XYddbwJ7Wf>{cU9?#yMOl_4U`KUpX~GTL7{eK>rdX9t7s_I$MF$`BEJr7%
zh0nA&8@NuF?fewMBFJbGTh4!*={1j@ckueUqh0UiDq&K@5=^)JDor~Hh#J;CnY*14
zJB0^rkMhTlQE*4xxg7)Su;9q`23`>%mV~9<rJmw`&H9nUxKoMl*sxUjH-yI3^CQBO
zLz`&>;W&$vdKHKBT>2{-W^QXmmpqMnMA+YIGM9-uo91{Ve>_AlB6OS(hHkS;59ZXn
zyp)OE{7>(?C09JCR?c<N>08A@D$32GP&y%3^Y!17Rw?`Z2ry%s8d8PU#Os^KrYYR_
z$n#u(fcW0L&p{vbBt@g-(d!{79eue5*yL~86+=)B@w;MMF(6<6o(QDt_;HQAwYq6J
zOKKk2V_Lx*tavQA92td0R*SClQfHAX4%AO+Rc>U~-Z?L3BEYedoJ2C2m~e5)?XK!l
zYx070l9JeE<WpyQ>S#`4`jy%KQ~g2k{d@eZJgC&Ak3XP`I2jgT6hhiPd6$1U|FiS)
zmD*f7+TqMx&6TDpSfBjTt!0~or+)6G>a^zeAiRtG$8-fKabLDv@W3tK5$>Fq!4mFu
z>Z-_mlxjRW_wg`BIf=8{FrJspYo-<vM4%seZX}VkY5pyoZ_V8auX{3b3&efp58}XL
zUf@P&aM~RR=wUHtgqi1c?U<Q$`IA=_DqGBK?xD9IyXD}r?At54EgBoWBb(%Iij@pv
zQ5K5Qc;avO1oCt`s<)+s2Qdl0L8bt<X1DkrH6)%4W7>(m53@#43+c<Tdsn|BgK%<I
zJl^S5P1#Gn6hDWlSI~7)eeIRt=HOf;pWBxyfAsr|1navRHGQ;nk`?2y9}c-p9<s#g
zr02wckJ~R-^C#WVtGE&WH@-@?3roXeuBSEC1?O1AYjCuliJj9iUasTu0qk<ET5t8I
z8}HL5^rhJ8SA4Frulws>pzpr3B`fl8O@MN%4@-R)+K$kLU1Sxts(RML-kl%JLquAA
z@sA{j|0oq_5*<Q}ZE`%D0Y3%9?Ndznq|wimP=PwZj;-0Mxm!{xxXfd|larZ{Ds4nh
zB8OG=)j4<M=VZ>6MtU|jr{&?cyR6yO86OFGh`bO<QcZ^<_1Jf3?lr^?DATBET6OdS
zdJ|_?DfX+1{n+{I3n&Wr6<n;^R};c6L62Ta#<httqnv(%FekAdROYTt5eC%frxhu9
zV4^f`OFm_$-$_wL!S3l}(4HFkc9?NX5GH^Kdh{XPk8TUH_y_rQ(^@K${a2*5m;K?x
z8`lebVQ-w&>naBKkmuyFy$L-J2KFL<Gz-gQYT*vXHpx)7Ywi3-LB+0gCYFcT6HrZC
z`ctkk>fl)KOaIndri7Zy^EHW7%N=-NC4Re1JA3#UbPi!?)IMdkWo@YJ7m(D$)?bz4
z#yF6EnH#m`9@5kkWF9rUYM{)@y_ReqX7+8pN8kW2*4faO8Fb1rX#(TR*qX{_65<kO
zBRS$Y&6&YF6<wSQZDozJb5>z8iaeX>0j{hVBqM@LB6b<e0)0oCbe;_Mf~65B-3Rb0
zwBR94Zw3*aILhwb1L)r2_xBd^QdoyT@nBM({_IFwGlif<6Cd8XRljobYfrhH-mAPE
zV}XPUpA~F%ag1yJA5k}zv;7@Z?ZI&Hjb2yp!M&<NTZ<|cg$RvJP4cJE#4h0v%PT*T
z+o~p!?1@&oY|fx$rnvj>|EQ8EO}TY^VNF@i*fK3gPCFYRq}QjW*#$9KImsN1A`FX%
z(rDQyJ9^R|_2g)WE?2p7d$Ui~y;=&eaC<pzl`h|O?$#*EHn5#rb)-ITXU8>r{ndvD
zJVWNhsu0bKZ9yY7nm*n~zMoPwDl{j!e96Q`k9EgnX*xj}YYhb}H-Nx>0+;Z6{Y}Z`
zL=bi4My}GIIem3X>u(F(?CR$|IXlE@zF{i|`$VLyQ@cBOouY8;6q6`cm7AUhmPEi^
z`?@=9tjLg`cb#pM3`xxIk05iZ_}BUcBMg(aM&e6}c*N!(#7OKt5gt~Uqm$$8VRdrL
zDgJ||!1t~82`wjoX){2r+;$qfmgP19GO3FfFDG`soIOXlb6?~^1=%^z8hG7xq29`q
zFJTT>e2ZsnR@}-YRP%S<R8s6Ter1hi9Km;vL%;ntnB$|{xOgA^Zg6({w3;`L><?>p
z9B(f5k(5yRDCV!3v}DG5gD#Cxruu4BTBp%KZDHl-5r4bh*9HD{wp-W34f;l9I}S@|
z*==TBlyy0_H%y*_tA1(X(WH@kEq1#4KFI2cz4Be{54hiN3t8|N?&Kd=+O_i|y~6Gb
z)9-7T1-;&i;7#%yeFT9=4g_`VaiOsXin5)l&O3VAYf(bUsx#|?a2uSwcxpE$8EGY1
z-_0>BVmWrKSdBU=Hg}lgoUx5CE+vi9Ivn0)J=ibIhE-z`G4v<$aybnZWGY)UfR;*x
zH}T@Wgzb*a6-O^1t@757yw`H&q`nv;khp#w+u3gg+dHC}7aHVjyQm1B#a>S6f(uZ4
z5IO%a_i5i6_HnqrV}?f6Se~~cH(Chf!S>xbYQ7HPY;{nn$IwO<6k<cQ=q;><<ek+>
zx-WW4kHj>2JXpwoG7q9r=aZkwT%Xg!bt4y4F@+e;8G8x5*e^RbC!TVHAa->kJAE&+
zP8ZZ3=WZR9gBrvtT`3;fYZoNEDV9LrNphokEq-zqC%<ncy;9_=XxZ~BVo(PjMq#Rl
zI;)B@i^61Hi0vv_GF(ez*$OG#syjMgYh?89La92&Y&UL|x3ev4{&cP>Y#6B6@K}V4
zKPfDccWnEFh2*-f)EcP{F{EyHX`r9xs|;~vQgfqk40GF$g8p_B+pdt6rCOP~xDO)s
zyBq3`kb&q}!z%oji=A?M{3tNK@|4R<Z7k%gT3Y>RYP)<L%O~fs(24eF;lY_JSQ*MK
zj4?wkzl>iebj>=MK8~_ul<u@SSZ$2EfZa6<%TS}qDD04s_mkNTc7svnop7Ag4OCG}
zXSl=|etC7@nwE=}9n+HyxOfhJx2H8VJ3YukVlk&2Sq=U|5%P;Y`AcCx2|pfRx1wbw
zIZ^(@GORKdBq;gYS0-XQ8pSs;KI_7qQyfJnD#6=HEIM6mb;pbB34Dnblbz}M?djKv
zq@lQ+`oV%dPL{+<P*9m53fjbmcH2KMWa4O;z|*5`a901mXK>GQ{@i6<BD3B(6bB|+
zb^JM!lOHMPMK+(~VR(hFkHfFuNfp%%Id#`ulJu(@^r<fSO@@h0?D0MBeNq};1g&w`
z!!o)<l+5|A%}gi<oQ?hqPzEz|j5wc(ZBwLoQohgj)7jJ_czN3H<I%!>^osuk>vo?2
zn;le$$1v9q>KyrG`*<4r8G*Zx7t#c%u;I{q*{9k)G!-^2wzlVW(9-l}DY2MnS*2n&
zkCfzIA|9w%$JiNx8RejpSG==Zxo`Pw6$(9$p|&74?Ao8JK9gHg&?9_*XPt6ga@p`H
zUR1Qa1+#`tQ?AP<k^lM`p|BRT!fqsTq;c3Yr)`Ezt@-W}WtOfb<>J%>X2}WF2&x+i
zWf<vVa7s5nw4^+n$6`nf=}md62=lp|8T1Q&J6<WZJ1`?_%?*3F(qV*As+%`OOs{2G
z9pjfwA<}$#{(|XY*qLDcvR{y)cUE75RoS)$g?H%7rZaTE-iiQDl`(~AZ|#@kD0dnh
z<|?d82)WI@W$Kvy9zO!e0=q}gPL<a42=tDA*GU<|mRg2-ar9_d?DMaO7oANd+2+0z
z=nWCb2F~s6yhyZ_HCg|t#H3}S`_wb@4(cqhV5X%eWRHqfwPIOlsd{)<!$_}!*_nl#
zLUw2ULPf=~dMSZR$_NK3>J@7KjF~0Dv@UQ&;#7|0k49_Kl@#9cTH{Gua+E@!u!NmJ
z%ER#Og~J0c?#GDsRm3hGItHQ=SsC*ytb>O|457kTi7+D$YIumG>B5GJJ7;}qo|eWP
z4M(k4n=S{KD{}Ej)>p!4?1WG6@pEq{k-iHxc0mlUFa+X8;|kd!iORbv+_@3unIh~~
z-7n{Bmw&s_ime1Ufv7iyor$~*>=xrVo#b9~jA{0>Cs=ycA2x_oe6<*?`29eWf(xeX
zSc<)1rq?(n)&bM0@UeaF!!fFG`3A-mMO%Z~I04U2kJlncVk>O_=%dAap>LZ*Aek0!
zp<ZDEY-VA<c!;WhzcU5`{QE67OKm&>X`};^Te#3p*>HlWTnKs+PEDsT+SLoeCHuj~
zBZ<^DX>?iiL`++pg&#Vrf~sikg$KbC+n2!>{yN#aO)2MhmZc0BY!+)XsSnUHxZcPe
z`=6jYo2BuW)OD!u=Cnv`9j>4$|H=15gAL|LCZW&@TrUfO_M}ClN}d64;Ss@Djo2k<
zx<)w5T8blSjV8Ejia?6xX%>k2cW7CmVFP-Y3<I(cL>QXwV(%P-Lhs*|!Mss|hewG?
z7cbu&x={iJPB=;|nI<p6^qi7cyM<Br$vwjcCH$o~Wpr5JogJY_TKzoHhefBbI9W0O
z@6^3h`|l}!p6U%Q1zvg~Mh9d0SFC$G*1#rX><O+CUN16D*Esi!Uz9n`EziC`u<AR&
z&VN^470k5uOFhwo-(h&`qWe!sYRpJ9X0=t%n|WC|J`vQ~nVVQc9t7;9_4A>YrwP6O
z9YF}exxghf#!JF$kV+1F_og|RI@xqiJ_Cz+_glT*8&l%G;1M1^Igu(5i3F58--qm=
zvSx_r6~0XuW_=+x+n`^oX@JY#J(4FOmki1W*jH9EponJ9*ao__vF1y}bP6I(){dXF
z>Nr2f0u1LHQ-9T3cd?7BuhFC9V|Cm~ep;l|GPkFPL^&72vLbI*(h5Dc^vf6H*0Z<F
z!#0XqSNH68>%Fr5ou%~dLP}CkpH5?Q5zQ$tF9(dAOEUFXlj+a<$msp4l<pAc++!o!
zQ%4Uut1!-Nx>FxuJFJR@dcx_RMWHnG82DO4#EuA?P;O==)B_bve>xvzI2sK;nY!w?
zPmOGPpVPqZOn;=kxlBz(91c#YRPq9q^*L(EX;f`Rce4slD3=V)F*hqmL)WNr`O8yd
z3{5n_2L+Q8zCcM*|7{u8KGDHWSK*`Nco2P=-Vd9~?t+LY!wEbYL~Nl{L?m<bpzHy`
zKACmIV7=0zm0|n$<S;R0Sd9$5{zLvCwHej!?bQ3a9G6d<x8gIF?}eUxIPQ}B(TH+g
zs*|DmQnTDh*d<MS-d4?54Z8jWOPY_ohEKi?+$IhyIE+9{WGYRqhawyr>ax{%KaI?v
z<={O}gNfzWxv5uD_e`y2vIv~7iSB38uS5QzP8*7oU?QUeq*Kn4$H{Q=34=NMPo%lJ
zyYMoqIk?@%0)ql!uo{OU!H!wtj)5ve##tywD5c3&a={;_1V%>+?=EhW$4eUXy9Mcu
z6&d>tA6RSeBblLvjOp0sWem+z)71u*;v8Qdc+eFs;fn7rb4jAdjh~wk@br{!r}T=`
z<68UXUZUbrufL5XTw(;#>WeqNe|}IxJ0g7gnK$*(5su3B*mBjBzlR{WvaMH@nKRpH
z;xgHr2kE=Hl4E0--|ac2lkaPx$~ZS;ZUcLJ7@!vYZLN;}TZ|qfe6GT=@)L&42tm+u
z-`+xOduwTRsHm6&_=rKy#SrHG$;VF2V$ho-9=t*=xW<CG9L20Vl3;%8^#YRT&9wLC
z>lNvMahW?<?-07H1!HLVj5rb0wOsg#=9=^2@cq1{%(Xr7qcLf{B|MHC1Hz#IoU8LY
zJ4qn1v64%_4nHEd>FA5dm|y;CnQzN1OV7ZjvOOAFCDAq6>7A0p-gO76SwkirhJBBc
zeXawCBu`lqD!E0%reNuATF;TvG4_ch|K2GCGkLn(m||MYCd18wQkDUw18T~_<vi%R
zSFLG8g}}rs8gmOh2Fkx#qI;JHCS+$bS(ccBsWl7$@Ld?;%dCyN_I@W#PZbY?V+FKF
z^1~+VyqlO-A~0Rn>|7QsGAm4ZoSV{7o~As5p{W$2&QZOIml6(GSA}K5U2U!_I<Bg0
zF_s!%lF$2{xK8gdQKgx=nN(Slnz?lpF6L-x(oax0|7frX-r<U55x=qH2XDLk;<()>
zz8mU|Ap=9cx~lhG&};sAvRXt0r4g5nRNST^E)7QEZ+&hJuz&0-$A-l&w_PKX-Jl~r
zec!4=skGR*)&0pRx$tY&$#;3p>(S6WZ04qA{m@^{lX4WSxFY3LM#6B|kIObc7&?Bm
ze6^>m&*58wP9RXw!%yi`w{4p4tdd$&*VTH2UeAH<{aj%Sets9UU&b0I-E)~5D9-CR
zXiAp&e#xu<4<$ZPp0*;5OH^ftOq?&p5lPi_VZ-P*g{~GDk7%<pE*&*;$}!q(9`axi
z1kS+&Q;&e6f`U=qtr~1HJ$Juh3M*hKz{uudK1RusOi2eVO77oOme<{Z)1P={7=KF@
zbWIFPOr2zFyUZ^#5#H`l3H{NcDs?HM0N3<GR3amPEw5zOnn-fdUmBHtt<d{z<u&+<
z5Qr<8G-7}h#C&ch07~+RO$t%+tQ?OjdK^n^ZTsbZJ&SU@4%LRC2Q-Io)eeVV81rK#
z$J-kf*9;ogo$})Gk%GAX$3^`aroesuP#=#<(ln!KW%9Vn!=1*1J(Z_?@8UxuF0hE%
zN}RK&{<N1R)3T@_An)h4>Fu0Mi^&rsjUVBQx#(4)f>cD~a*n@RVp?zeoDYVr2$>2E
zVupl;w;a*l4D&os3o|pwn*)xjnxqAeCKA_uy12YZs*4_3T{Yx2+=xVU-=#rHHn}Pw
z+HzvoNgcRnjk_nw*(M{YRQQnF<p++YVc|>NM~SR-7&}XpY}X)UoF1w9vz^)VRi6u)
zMMM~_CR>SQ>&4j_z+vJg=H_!+7j-9Z>T+QGYQ+_16Gj1*ya$uii{w*?oNInJt=e4+
zF-ykw$_-v63c^~+ITBy@+SEfFO^??v(tg5cLEub%i@nQH^;(*R>tfK-s@+qc<|RF2
z#lihk(OqNcrY||CxbWOqrBYr>CZ_{6ke@O?*pFLkkba{)i;GuFc|b9#C`Syv)=T9J
z_hQSO%E2^ai*xT`q6o@Q#kS$15l!b;Y<s`A`=>`Z#+PDGL-Xj+=n!#qpD_$Oa6v>G
zA=p573ZHh5XM#;l<*OpJ&g0fS_4_wp21|AMuw(ld{c)2r!m4X_6wUj+VommBWX?sr
z7ZFk&(CZlI)piK8D;w)+BUj0D$L&!2GpK)CpWP@6_!@7N?!`yv*Xoj*U3gyy8c)oQ
z@L0BetmbJQTClIFk~C|3Y$}wyG0E~nG9JM)^cgCfa_Y9anC8j3-j83KGQQ^Y;Bpqa
z)q9lG`$$!|4zt~f_787p;I@naouzIY6U>fSMLfn!{{|%vR?K|JzH?F8G{~P}YoQwc
z9$W0A9<p%$?$gAnT(N{xa%YXo0aYmI9gN#dMgMDssdK$Z(t8Q{ct&?ZCf_vU&4S6u
zk<!tlp@Q!(j*rUMvfIUKB0UvL9X$d74oVAZVc&Vx*K&-f|4XzgtuTkv#>cMn4*VHm
zZz;>wrjUwl<XB28`)W@6X%b<)K&Uv{#Idn0jgYh~1|A1CBu$^RA7{a2N=OInrn$@B
zRlKP%W3~#}7AK4$H*iMr{f@Ccxj!bJb%j1Ru_o`=4h)q5*Y|x*4k@ciD&dD+etnuY
z2m-Oz?;}L?P4e4)l^UkxdHpmz;42reYgvB_VLUE6P5LTa8{LGPQS0m35$qR{Y};!;
z0x-F{a27Cjz_m$w0+J}uxlytKZHR!DK;2AQfOuXv@B5Nf=_djoWb!^9BXolzW;EX1
zSu`->abxU`hVWB^<^aNnM6^We>Gb+#43fJ10j@pPk^IA7zr&xo$voBq*i$CB0LL>u
zQTxfO;MeL5W*G6QYGn9g%rK4QD(#<k>>YQ-2K3E~etxJ=waAQwsHDsfecVy29sL|2
zN<K+Eh>Q|@ro#vYyQygZ3)Pk(>XrdZ&-7Iq6aBEYi@KkhuPR5?I_T*HLGUOmHa<};
z#*v0*&~m&=0{VeoZ%uOgrt^J_Idmjks-mSCyFJk~s%xGx!PCGxtodZi=YO%^gRWVC
zocKxa^Olbs7%;)PGCr(BM^>;?Q{z*HN6=zR`i?(rq|{8qkHN(zj_F*gO&1?tm8!n>
zNWijXCDK%?=+um&fCI&+adMF5Rc*!%v)cZTo<KnL&xZ%N^=M?5dmDnazu0Xu$%FNo
zc}#vBgzk~sV@KmaR->k+m!!Gn{E5{O|EL8hm3;YUOmglWcoTOc`iXd@W?!^y8ZA(B
z^S?;_wy6MebnrG2kUr_x-{YMd`?6io(`BgTXi@ub`TrdKpC7Dl!+N1-<LsE6Sdqf!
z#b2~8p3MvV^XI>R`05X&dyP--*-H=M4gwYQw@hBjjgwJjUfTCVT9E%W{$F$1paZ0v
zxA@F8nbHREu-<*I_hJOE0wi1$Ev8L?c=@l~XOG(|z}@&D0}Vk;fJ9j{4e(8=U$p?d
z&Me>_yV9EFyYaus2#is3d)GTo392n*!vg4*k|Bn%#%x)(Oh-TwbS6NDdCNxnUG+a2
zL2nyT-LEaWhlReraJXevKEkqpKTB?csMHVqbK)Q+WlZ{S!USzXfE8-te}=S|G6g8k
z`~U;YM^+sueZAwp6ofPJ_H?u0-v!c+2L#%TjWW=_i^<yxRHa*`=~KOG+^L(*0xG)4
z|NoniU@|MlJp>rs^cy>}_a!W$#vb3Yj$h3G$HeS!C-&(;!@*u*Da)apzwqpq<dy|{
zY9B5$4`gir8y4W8^4qek=U0uWf1m8FGNi}=^Zd}kQ9}6dn*SSOjc=)ArDvXQ|JvtY
zLj*mzB_4-8)1dwr)BFb`zVRvG-*peRmj7$d&l!N8@!!1_`Zq}Y_0{7R%;n3I-M|U=
ze|G*qEc<_x^>1G{Zdr`R-Jg-je+AoLJ!=O6JwJ#Ep7~!E`9Bl<`urB|(I(_TEc`F4
z__wbmm_X0ZDX?p$|I4K0-j?3-M9Ertc>c%M|EFgSY@p|leE9Yu|5XjTw@mm40merE
zwP*a>WXz28`@R1%ssEb_ev6F>Wc-Sq(?G?)jh53cKRo>sV13hG`R*^ZFDxbv+vPhv
ze*QlqJibjQdNsCW{#pl5_5}I9*UyqN5*0T#*j(~V&2NUt3|J~W+-p3X{wW#&$Z!7t
z&<;0oq&#T$CG)l6XIv_7CXN#iLs-;JJ$H$c3MX0p1CoEJ=`0Xfq79wD>n!LF1|I!*
z>!YMDQasLa`X{v-f1eXTm&b&sD#$wh<+(OQFyz3KhwrF$=wWOApbL$`cme5i^R%gx
z({}^$12et)n97;nnVZCa4HL9^>ofoGYl8lQr2)&5LGb>K7-w)7$VvE~;uGFBe}pk3
z+rxV~G}+?yI^r*IXk4W{44%ZB(D6Lh`THC$!T<yYP{L1S|Mr4`SYlb;r`Y`U_h(ld
z$Dh*J>dpf83j53#@9GsOiTpaIE!c|s3&>KUfQ%`f&uaf}BN?x`!)btPbElR@u3%fp
zoX*^PU1vjZ?d6LoYTj>!$7b4Lf5+cfF93WqaeCx`tq)K*Q``j>CW;JA&`~TmSf}^+
zD&0e-gphu*qLc0v_A{EzNO6K~yixHx)!(!d^y*gby#w97Te;u9#M0~1oS|gzg37Q2
zYwtpB9YRo_qoaw@dO`;+GxqOTb8DVztpDBmq_%)b*9>bs`fJjpEMJ}Z<z$rFgTC2?
z;Z9*j)82hQ?!!T>j!zh&_K#(r1DgC)C?fgG@}r4uiy4^s3pYwOn$^G+^2E6zI`Skl
zjyB_*_GDvG=q;*$NXZ4XJN}~dA?NnR*AOgo>B8-{3}V?=`_>xCIJjr#`6)pW>%pX5
z#L8dZet7&>_nS%pBxn_l{h@%8f&bcJtIO~_iqVMz_I&y|rZ|_Ce&X3<ru@Rv&eFd-
zlw&@?3rL9j*f-u<vPZI16ISBXj9j}`lW1e3S26C{a$3rd0NG1`zS67aDLeW1O)?05
z>+yKQpXJ;tA!r!PF>BRRN7VjE3gDP8-4JugiX^f|?Xvyd4*oy({yZAW_k94zGmQ+Q
zvF|k)OSZC$Fm|#e!q_6A>?uo@Vr<#7WlbSuU$aCcA&QbL5h|r@iAsx#@Ab^o=l%Zt
z&gZ}1U*B`S=kz*f#%rGYx$kS=?(6DDZZE2xP_{vc1j=KYnfHs&drQRjD;Ha~q$=AA
zkcdk4JNKS0pJiNlv>tDs7M+_+c~couW9`r5yJ3YqYKF;h6A}pIn(-DGK7rEN*(tWu
zB;=t^69qkAh2ggyhrHdJkh5gy78r@A60xc6Ms5#o#%tsB9TTPd*0%c2XHP7N2Rtm)
zw9Z;6rSM#tIPw<0pbcA~h;j6z2If?p+44*M{7(W@H|8-<Z;G3+4`d52r%5qZNlB6=
z;{Ot^q$f<V&*PiZ__d^D^LY-~8=%uLJtza-36X>`OQw2!vuNCMmVex;A%y}jA&H6`
zdR7<V?Pc;L8N1W0$D77mQj{HhQHS5^P}9c=vF)>yKT2k)0)!C!sN1E)koSJ%$0i8W
zm*}Nd_Is6L<dHg5104I2GZ#-R%k)Yv4oO>8QrNj0o?2KXV`s03sqSGWZKjghkLx^z
zm-E8zQ*&hV1-!UF2zQ;Ig(<%Dl$Cv-ed^oUE5VJ6I@L!Os(aFAdCxbIv87!A)e_7z
zOn<4+ripccrKNyoy=*{L`!(DNl865~ey%h=-DQ*H*zio3XT$WsW^K7ktdP!3*yfee
zVDJMF0{&m*?3lRjQqxbTTO(_2u-IzC+zK|uAd=rdJK)taGoHvTUy(jE+=N2gFdZqA
zX_iPO7TFq6puvGy;L*p-c;J}s#sybbWL@F~pvytc&xM;;io!nK+wS}cck7417&=*c
z?c9+@;E!zNrseqC;wKz}2hY9Jmks@~7L%T3A7FhRnqjb2)+L-iN&V%0o{#QR>ZkNi
zG3+wXRU|bB)WQ&oUSha+yy?=SWe+p0Q1ut9zm~<{JC6R;2txZ|8?6E^u>$)-*ECq`
z)}ir<^wGoXQ9*Fei^mQZ%4g$Na&-`{dwUA!c{WV{8Kd10s)L&;IncnOnyL#iKW9QL
zq#VNj_>bytsY3bI`vH*~a&Y%L*eM;#7c9@uU$q*y5v5A$NfUcD>7Iv>_h=pn-ry_u
zf)7A^f*)1g!{%ne`V+392{OHdn@_S(sn>pIy^n`Jl}pAs_Ri)~C4GKn`jUYoSaH%w
zBa(UQwCz8i!rm|5x}os>RoJ~da2a%H;n_`d9j#ZfPR5_Vz4Xb0-XQlr(A>Qmm>HFJ
zecvU2vi-7|25;`<we&~&pk!fK{R`cMhQ+*htON_jP6vI+e-DjIL>VTpsBaN%>?_K)
zX_2-a#+o~8M2XIRjiH|hMBdjw+?#Nn0i6m_r_S5&5C3=_=~~~*E?(U!c0A+VC6;@Y
zN_|;Jl(y`1ZdV*kqZk?gecoXqRCU|%J9~XB%^R?kld4o|;2yrHnr3@EIxQge<Go<@
z-)ma3y{zVP{=3ELS0_y}fkLF1cL%8<U3K98*VEe{KgUWz<Bbb1mp-}nR%3*2^LVuG
zWK=51vmb7vaFDwZ2(2U<2YWD|77mk&tk1-^4Tr)FxA(6j_b3SJCY~v8M4z-jd!PEI
zUsbw^a(Rw(ngP-eh>!YakG#keC7Gv(zi4}7!~{~J!w*CnhwhbDLwX67<Yrz$`YsFs
zgdO@_ggceKZy_xpT=Tq|7r6M@pF$H64pqV0&dmyF3UO-Jl8B|jJsNqFbNAvTbsaex
z#8OR1K6_QuQ6xScVz%zcH9^EB=`JvguFa&ey-&x+AatckYq^8;tdNFoDZv?+{J%|}
zRy>s?Z|TGbU6QJTe!RIn1Ra{VU7C(DktEQ`%1fe+&=%>!9v$L>_BBRS53{I(kM)H&
z(L><;Fx-#xrR4^LiKX(whx|aK9RM?JlJo8TmAGcOEPH5IRzZ1b?jz6J&vad(8eFn+
z(`?Bgc$+`Uk%^<7yNx5m)a((qjF}<;6G1pRqe~tjN5CB*6?x0)RO?KC`?L07PkFa)
zC?kDk)3}W|v(k&1-d+tI^pvq9+%lEpu=h*-E;D@(>DbYp=V%ERVh1(iE;I13(_V%S
zA5IfG7`w@2wao=#(&7j8?K?le-`k!E@Ya#S6&904C0P&|fc2nkU^@Z&xr~}*8I@{T
zrDN9$tpSXeYQ6yuT$^7w(QDa^nMmk*`0gxA&!LroGOgKy{QN|Xh^wqMYK0Nen5X?n
z-x2Ao8*<^!qm$<zOAZ0yPxzg=N}bkPZ{T~Rzep!l$6(67;1z|IrZj+-RLUw$07*8?
z3z<B|%qwuGcX9<9Di~j`YMyd`kMwqeKLtrXJd$6tiv*XypC8`-{lz$G(V3mc?m;M*
z3)1~Tw9GeW^`M_z@(>9ZN6)+aGpmFL*7J$#ce_t+9k`TYS|MV6uamF7sD}O<rGGxP
zXgYALj9Y~i`R$?ti5c^{>AQGkLj$_ty8QbXt%1FoSOYeu1z{?Oc!w#i-MkG0eigjX
zE6WvJMYq;z5DS8VxiXKbIqV!2pc+3$?8@QK`i&qlPInI8*!n7Re#Ah8`idSg_SJzL
z@sFQL{0V7th4?LdO@k~N(=5dQAgKv}(z~+D+#V=(L4s7&t)Js9XB~VOWKo`AvQp6-
zQ<QRpk_8jVn;CP*->YL|{5^C;_(Xi}-TEd*Kck)=mGrB{k|(N!j$PdQ&~jCyKxNA9
zh&876wV%n+Tr0s%QE*cYM@)>0Vx>O!ag52T9RYnoFp%1DAEQwL*g`zZ0^3P^pD3w#
zPVsnB9tX3&zH!H^OTYXRjc%ctM|P-xdvR*^(wir;&iyHlH@3IF-^<&WaCr4g1=a1a
z^_fE!y3+V1TB{sjjW=eY=kE-xt1_56Bv1iH0|C8+1OPAg*{FicdM1@nODkxA8h{Jo
z6IP2C>SIq{{s~<~WXXC89DSs>7>5gYj&C2=kS4vPb06F@X8#mEel%>cBujuEH5IXJ
z&elMEe+g7coXm}`)h|!g71jSN9vuRZ?2~prrhXsIa(g!D)-F_1vg{M3+2Go?r3z6|
z_gnEtaX<g{<}%BxgSI2t5P*;(et_F|?)v*yGQ<8XJGgHB{P6v{Uco#(D%jsJ%IH_u
zU^(qQN2ulVtdSC;C~Oe`+}8m@1rKn|M?ox&#o*fs2cdMI8`J~Pf5+1`*RAxM2K}g!
zQ3k{YMiQa>T#570&#hmdkPZmWCWq!TdOB13va9xiX!e+Js=XF4<>m^vOnJ_H0R7~_
zc<5b9@)gp>`ye|WVxP}J6AD95nuImg@6)n@glEM^n4C0GOdoUL_Nz)?pGb-w$e|dy
z9vAk_=(Ptby;hjr3DI!Z&q!wN3rII^mz>}~8xuSxB))F_{)*zkfWXEYd?A`&dmKN*
z%V$qU(Iyhl*y1tY&<DEEMg>*tR^StRI73n7o=~zN0<P*ma$p;$McM;pNMD{=hu&N2
zJ87H>!wq+ouHXCf(PwBb;lOnd4&M6c-KuVLu`@DJ%yvxubEi0_bOveJ3-i3&%5$KJ
zm7&Xd<c7IsvVEIm3jU5~lycbUF})n*26GAVZ74JYIw3AxrBFA!;Ji+Yzp0xf^U}NB
zYjFs!`X4BIUCW>9JmWthpU5zus<&7A-mjTYxjjrWW5e$+`IZl$xK6arwhBFQeS7|K
z`P?rhLkIsvvKLPHj+jP!L+q{Lc-?5tZhcRkIh*vgZJIrM?FMeAtC8M8_ZxXAMLa%-
zxeI*`cJa?6t?UphU<j`4pZxI>+Ff~F`ka0v+X32$rDh-Wd{I{#j6HxfN75<U;mu=B
zH`fao_NlY%pBs6zY?Zlub`Z3f4<dB?(_F5oemf}f`=jRg4K^z~X%GLnhxWd`Y_8DI
z&s(|gLpLd57~bcG^o5JO`V<P@*h-wXCZAqnztgeiu-a&8uU^yfZK|8gCKF7vNPp@Y
zLvMcSkI;<B#~Zr&EL^{Da|W(hDkMm?oq1>0!})L?`u`-_T>#-6^g-Swc)tsdSq#95
zR_JpVE>B@tiW9kh5EPwfMosqVsIYph66oXX`7DRov$e_4pKqtdN!7-sipt#fPn|Dq
zRUd{SsvS&6pG`gwX1kQqY^V5uQ{~Qo)cAWC=a%9{*L|+8Byv-@!}Jhn5iv!)dkRFI
zT4;atq^2Cyw5{wkTx4OcD<Zszo#8xdq4E2|LgD4X{J>c^FgtQf?U#pJTi1=Q2QQWD
zhZCaqn-R574QZ?-KrtIsEc0-bUwX~UUK@+Ak*B4p^SDy?%w1zG8?4YgJK}=iHSA$g
ziQ9Lsm_TzOqpZO^?(v!oqTu-<d8^XpD8}xNw+)Y_5+uUrG&_ju!v`FE?KNW`%KTbs
zROH(Pj0J|Se`3+^7kluB)c!d8R5Di+i|-m(cUP`6Y2P;OvS=|^linT3^Fwv5YiG_w
zRN*3+Rn^)nxr0gG4@mqic}m|fpU%G2)T6R{c@R@!h6udP4B#d;KBB)N=x}uszLH7!
zWsO1TSafPOA-9A&VWY}&uUf@QHp^}{57Glhl|e+G{CpGg1VcUGd~%>$dDPXcaW42E
z^c2$dY6A<M+b(3q5IRg=cxKXJ%T9Ep`T|YXnn(9k?ME&GW&?**m+zIJ$%G4V3!gCb
zM3xt;dTt!Lo~`fF3<n>HgMc~n{8LXCr)u!+?}yo6qi^dd-TV`4*LnN)`T&s@8j}Ts
zO18W&77%v<8m#0^->qvV>{##viV5Jkpxb(M#HlH2Dk=M2qd<53u6i-U4ovX0B9XQH
zw*$&xT@PCynt3VTq_L)+;@B@p0tEzn+#R(6#iGE}e@edF+xf2GWa;}18?Rh~*;gkC
z?(#g`4Nxy$87?~&BX36}?4b8;$j0q`-OVBfc2?CS*bF?EI$uZ7-<9HY-iEu=8RjOF
z$N8)YCaN39wd~>~qB}dFXvvz9Vei`nmG1i@*UoD8Bt~rPutZz4_jl62N%jMiZVyqZ
zSe}!~6)rG%DDS=>-K*$r?tPLe3|fM5nSE;eh}Wf|4^`|uVy05};=bnMeVTo8h7$GK
z4+;Z*C*AJM_!*K*7<tshDL7rKX`P`dMf@gDJA0(G5L%m*UM>APf5&YcU-Q_w7W4Ar
zA>vX7B5J-uL<+BZP~ROw#)My(DB%f%dfi|znugt^sPGDOrZZL2#;ydXIX!T=IiJ0L
z?Vj?E`GK@Vw{>h={n6O-n!B-8*wAQBez2iAf6yjSSBXr2hUQTh>=jnt*3lKJMP(F^
zR1hNObv*f<9B8O^RMNN31ird>EP&bJ%_h;gD~Gsm?Z(L%E`dOtqIWM`X>}jBKAms0
zCoa!GM2+@(Cs8J#1fzSHW0Wb2LE%N0UFdpmP7U9_o{MYX?BoI^C^IwKsriPF*j;VD
zaFz>2`-fY`SwEvR3s>=5EAdg<32wuJVO4{U=@x&wej8^cEdfR8kWeYbBh+?L4Nu6H
z_7>9QX8R#g72??#EXKHui4aQKacUy`#7>*zxVXm0br)>GGhliCnK_FTeX{IzL$^RE
zG(p|HH&Te!?E?6Ao293)j%WWAU|Sye*REUV;*(f3Wa3Eov(v8jO0I0Cqb4L{(dYn%
zl3Vi^=8og=cf5LZPzg*=HtuBgOmy4q<nLu%8t_4P3hj;Y=)JGW8?d_Gx%K=&M8T9$
z^Fn{H!zoGa9hM`IK~tOdkne%cpzr$oD`LYN(K7R|pnE7Uty-~NpIEom!s%Ov$4F7X
zxU-~Cn_Ej(_u?ZsIQF-ed<2a=YktR}_0w4za|7YmEO+1Hs#kxVxNqyPNQvS7AO|=3
z!T&Tw1=+a=&)`)m2@x;pEd`wS^Uq6Z62`&TyO9l@97jWx==GfrI+yQh;gNZZV}2pT
zn0v&CN$~*#y?#T7^FZiZg@dehU%`uuS26`%a<jiXx~sW<;pDdWgu$;kmASsk!lYPz
zB5TSf=LEB2m&ilMpop^9;sye&mb^&m!qVB3V{^Nn_KvxLA8DvMp^9S-M7Qun3<+DC
z;8I?5zY~vDb-872vTSEbLLa-#DJ4-;A0^OeS}El0<vbAr)@zRX*7}v|6BP;{E~C8_
zrQ)>Hrly&f$`EpQhHwpDR*X*+!Wak8p>vo_^dIigry?mm8Y1=jFCfixv)8{ep#$_y
z>p&Y7|Bhy~(U46j3N=SHPSZ0C;`R`CMYnPEkKH<S;|!}#yn*ahf==K0?JnC%M@Gx|
z8c*9_oGbZZ*2sJzV@EM-m3>`?Z^FgI+5;H>>Gn>|kcG$4gf%_({^^TfAc)}1Tss{<
zVdEi*L<JOn=ja@+w{i;T?88aD`GysH$QO8RqT*Kn4_8$Yd`i#L0ccu!dTZ^nI^Krh
z(vf+Q*ZL+_u#xp;yx8M)e|6zXOwDdZ|9j@6VTfaP29kx^fl<KJ)tIQkDy<^g^baQv
zfZecES(`j3lwl3!EsMeRxB=TxFI`@K>MD<q;UzQ)s{+&>)FW$Cy^O4zV8AHQ%B<I;
zxtq+<EFvzo9pBvD`}W_u<`nSa<Ym&mUvMv-gI~h$Y>B}m1#PSi`)7k*5xY>Z-CTzY
zH14eZlnBDQ#fqdm@9zmB_Tybv7W-;CkK&;4-X4ior6(9(k7T;tn$@^#L8mnY0xS$g
zNk<e=#B)C*Zl7z;dsAv<FWQ(od<=2*a42g?L<}4`@(}3t>tQM@be#QZ&9F~D8Mp5-
z{^;3-(o9n=8JuI$Ck}Je8}VO{<Vnnst0cz1?sm$uG<)DgJ=b>otoq8m?X71*%^=4e
z-}C%ZYIu9L$`<@J@$-FLr+5G63glBX7H2;kD{f%P;oNb5NTA%&<qK$|Y-qYW7?OQ-
z^fh5RqM_$Nz4(Sp0sn<%g_oWzo&xncXYbVqMKKDWHxZF-{()w`&NE`!a`WuI-=DMY
zc{0aiHR=f6HoYR(aJ!t`GF<nMCM$e*&o6An`&R95cVIRM_RkpeMo=b-*tDOC6%c79
z(Xo_ODs`v$nd9!;3mdZU2~1^fzu6P^FgW4@f2-f?sgoS&bi_g(p7{>l{p3rUlk5-o
zD`;hAn#R2?`VFZMd8A?C^OHS(8t>|>Wicadpjj1ofA#VBWV_wUSGd5}(B>R>K$9A2
zm&|&g*9u#Cw%+&g9`)ZTqECJ-eZDg=+=35G{qybRQ{Lc(!V@O&f6hL5WnY=68JYJM
ze}m8f{!cE_uR~9Al2Iqc#(&IW2<hQ?7KP&QmUE74gZ7X&FAzF49qpX%mc0{v-MIH_
z9DWAFe63c55<Au|$OpINc6!Yc1h~u4R1<@3QkjC%p*IK-!w3BqhRq&6<aibFwb*&H
z-=^PDi`yaadnmL^jt>~WSVX;XK3{-33)^y$=#CYPG?H_RKHj93fMIVZ-b$(OmFgVr
zbbid=o>+SgEO=LFdwJAD_rWJ<@oAfGOGR@R_GbL?&8JpLW9eSjJ2fO+)5sBV`D<UG
z7qMRXjf>RJV!l4|Mp{A(T<W=6KjV$fihcU59H^_`JBnx-aDRp_P_zYFLbqwK-jCmp
zA5PhuLue8EiE(B!XnQkvwU*=mKH*D${*2G{6S8A?60}EyXdo1P*|@HkqzE-hx-Pr<
zvc%SF{ARSXnAYS&d~#Oom<T6M7il3E-KP1CXCgSVRe&gm@BO9UPVA2!IIR(3OD(XM
z%X?1KlXY5U+EZ=)(?OwD_x?v<hDe=haHs7x9}stXwyWlo&Cg-WiMd@dtz)hC?aFta
zq}!WVd0+@p3m(uPxJGWY<o7CzejQ!Jshs&J7L)#KmfVIc+M?(K4hD@c_BYP_Ii7VC
z|EAsXA%5}0Ck^$al1%|}1iOpF=J3B-03?gRciiF@h#VJ8>*C&vlkS@rWmZ`9r|MPq
z9<kznXn}iA>w830Vhe2ie9_h=Q1(mqgqKbl*W7k_>~y$BFEgEzD&m9qo#)$kl-LXI
zf4!vmDwL(y*;mpT8-!t3d9YhH`__$1Q@E6<M;2E0Y$drRuifjxDGuJF3V}xkZ-IKk
z0=@p@I33jDrd@g&z4*SVZ_xTN7dqO$!m9Lg7(NIPd2=_*;nF11Xp-R%PSC_=`s(fa
z{>}EIIF0O65|70A>Gv+}xnm7YnL=FOL~|$*KeSRmOZ8p5V|;b&{iSL>0fT$VRhwkj
zp&GJJBg$H1AfHT+p;qynzEMu6<re~dqqa8DX<+`oxCZS7ORh{KO)arUG=-Y+15Wv*
zx`@X^p+J+Sd?dof9Geldcf$Ux$GTf~G(C-b-cjnpdtXu@BVMuQX_oZ<Wr7{JT`@02
z0%%Lv-)FyqK*lj)cGUGv)W|ub)YC|__`#2vEG=BVXbwIz!b=y3D@nuof*!nTH4$C#
zv~;yjhRUVHLjotPqkm=onvitUh>0`Qd@ty5Y<PU`=Gzd)di?!daVeeaNpq4TITTs@
z-#PFM6VSgs&7$QAwTDQUdf__5_J9u#!HYsjgKifSdv5Lvs9J&7M$o33;WD=UzDW+x
z_KPD%F~i;RHF*~}iPqTK9)IzAo~$u3n$&2*HEZpn)=TWm=vDleI9p7^5s3_-LX{u9
zZ?+<$;}6u(3DA*TzpzW_#O9^m!UfMjJkAwtEl=ekjy<mL`cr3D2thSD^Terlu6pr1
zcb-0^{#d&6>yv9A&R;&a+>W%sEh?>b?)cPjyWaa@)X0v<Go{{v5u4%&xo$&M2lEUP
z_P=P*B@*>R;t1AYv%?M=tGhzOWQLjB=A)T2VKOb)D?N;DIUC%uY&}&N>Hvm2>)4z4
z+mEjYcs<cUud_|_5D!1J++AoS(KC$0EP_pt6L4oCaD**=06!!UeTu6cYDA=^KT1j&
zLW{t{J0N-QPF!9D|KANOI_)vXu7#bx+sknoU(P6@8>=%Hf8yIAZ^w}Umo(p(1~w-Z
zyClZBTEnkejN1p!^yd#swXS1#3LBQ*!W$Nv&>FbZmzk-mN#}o?6o`{~dEBjMl%1^a
z4}cy2alUd2B4urauKcx4JC7f>d_7Ple<RXUY`t44<7CNTq0{|K(8rI{P7_1Sb(Xx}
z%_Y7o!pYV57VhBwaaP}xw{8%btl4u4)u3JX_0(`j4)%U4hl@=)ej^4vTgf39Di>c{
zYtQ=LLCAhMxn2)?Km9}BHW7zqmXF!Z<y5ydAod&keWwS?A%-=oq$Jbk5q6&o<Ym0h
zh-+r@Yzumv9x>xL-%~XB4DP^SF(YXFDo=7kWSf%?Ys0nvslKX*)pgx;3{=MwdRz_K
z*jJS?Ve5_I@*^NGIMxTuG#5#4&)>OX$1i8tq1>nKEnl--Ql;;A8tl4Qf+}){^h0aq
zguw6w+HRSNJ;y&mWsdP+`zTQ<#l-)|+M5#`mjh;5(S`UYG#baO7x-OqqqymJ3^nn?
zT4IiK>89lI0?<#lc{Fq?nhi7QLF}o8*otdqb+Zf-YB$_)J@gZa-of>dYcN8=Y7ot0
zgiRVUk{!wr6OLNamFPADiE-i}x}~=P&rWzjD<V7;%OqIze9zx$2F`A-YuS*^d6|+Q
zP2_4N@{2M}MAwtk4$FJE<wPgsv0bkth)$;qA6XbYiBt!fMhb5!w)Eb({@MO|l>y`t
z+*z$I*FUi(8`eW&&iNZXEJvM>9L@jU=C7nCsgN?4I{HS+bRQBg?c#)bE_%ehj}Lzu
z92Muod!bg!S8vnZbZ5TC4uZ46xyiDfmjy2N#FKiP53uW}Zu*&gz&Hj)8D>9KO}M8H
zeXH|5jBeNuuo+-=-nl<N9a*(X1~na!TAL1eVM6YPe!iW@rn}=97nm(ns8}4_Ow(rW
zeSGGlZoOn?##pI*rZPEX7mQrhKEW&B1Tn2HL&CUVn=}$@)!bpotj(;b(}YeNU)|y0
z%G0iUXX<vJ)oCOVOc+D3x``>j2uzzp+aLVTcBeu2cWT85j7X-_<rdWTJU!-?L`bVl
znwGOYRoO@m7B3;e;)_wEYOt+Av?mE|#ofDiAkl<|GUkGa_Dt<b(h(46o<SO&3Jl*v
zVx*s5RGbztRY5g>QbNV$-_fnta}QK0dw~4FG@Fpd7KcvjCi;`BtBzlX<2>4v$_=gr
zB!xaICezAo&#ml~iF=}HiZq0Pe=pFU&t{qgQWt7MAeRn^K!W#MyjB}W(C&I&vmdpv
zg+1jL<x{I?J{B;N&-)ndtTQ5`X+bu5RgqNtQ-&OF36#KV!PkBJ>SsJM#x%bFjS(`T
z3@q`R&8++t=V5K_f+^(0A)%wnaN<O?=DOay6X{F^jFhza8G+ZkJm$k+Tr{uO_j^D_
z<2F=ljf9=5*c(TXq>T#r2?KXN2bkU>Lyr{|5L!v(O(mGo*;tTu{Z*v;FzrE+E*RkN
zgdFMdu>$exX;}qGbv_ucYfSo&QiK(GW@O3GbU>gvaiKy{z|<D?4Lu%Hu%o)ELuiy-
zuO%8C&iD4wkSlb|VC|{UMj#`Ggxp<XG#S7nKNC%WN}G30Y!%AnN&)SzB&~>OC(n{H
zC48GY`Kb4h@M!J&aUbo1X0#KjC!m%p2AwuF%e+J`Cd<(5O4jODj0ua^`p^CaVgKi}
zigv;NS^44f|D=%sdom?1g~g}HOMgcSp8iN5-vi$grJ42pcqq8Moj?#Z&$aWM(y`k>
zNQe-K?13@R9?FUeoKf)_W-_K0P+$(2I|##8;=IW<QDrp1D;u1rH_XtR2*GsjbMhzG
z{InqzNE5@$-DGebLlRl>wj6+~GaOVZkHarAP`i0?_>PhF)&{a&wNusC;l2NrJH4od
z!>3GN`t|WxAJzY)hcpS+p7Ey750RJ6L$L6^wR{Y|F;az~JhqE>1h*kI&hXEMANs_P
z_P!Ejeq+mDd7g~+|IV;PphJE2VZumM`E$wx0;tE5Eq0N(?(HiH(@Os7WFiVfh{)k>
z`7Ah^v@g<$JW;}RO`VdXL1d*AM-bh@tmY9@$|ykmx(C4VM>caQ4l{Z^9Mv?w#g5%U
z_I;pbPDqjjtD#QyBjjxb%m^_%agJhum}M}cj0Rk+7MBs8iX|9zH5(=Fyd=IAySfK?
zcEJIlw;AX;Cd*VNM0JBLZm_^3NWAPh83Zy^wz&IFQDMHRzg!<*upl8z97j5_KVpdE
zhI)o!I#Nnrl2M^;Va54I<{w~t-poetyZAMgoGgWo<e3}C$@d1NQ8g9<P9dJE0&hB;
zB1_(rw_*Uh%s`usA&XuPZ<suOT4HH54RR{eNblG8KR?{qx9|=6ak?T6^ZR(u4=NX$
zI_MM}z_hN)*DGm$$e3+85JRLC-U!1f{LEpcuY6S09v|EJtNkvk{dm3eJfs_RwnOKK
zwf4wE5cp`xuYx|HVL9fMjo4KH4>(UE2$AsyZrC71R;9mrDYkv~KBNQtu7Toq{XP?T
z)bnw&%IX;t$llkqw7lY8gX}>LlF=GeUaoQ_QL`_!td^_W3*(kkrS^nNK^+1PWsfBJ
z3}5?O0|uaj9-BeP&+(3Lw%&JoSD>3H!>Sy=FchA)PI}aKEG1Wc-P-5d!<1^sG7sTq
zd4NFZ;)H4R1|C)nPHqn-7urC2yXY5!Tp!y0_+p>S%^rLq$b1ka$t2bBb;QUk`%F>X
zAj}1;ajeccGq17$FOr4Kq>taoadTrAWc0<`koRB4DsJj@aavzlkb=WwNt;kdYWAw$
z>Zwk_C)%hgeO9~LPnSC-`FiqCRX2^xfTQYNy2+%6Odz=v=3I1j8)u7qv%+!W!u>(D
zUn@VwN4NwWnd*9(wyFl@+f>QacLMM_>T!3>mZ^-7>V|&N*TklF2AkD2xWvnSXVh5|
zn&{UYf^iZ<Ffk%;gOx8UeMgO9z}ibgsXINpc@!j0ODfx;x74Y=hn!3h<lTT1(PdGH
z_XgrKeArpF3Mk~Po%{m@7H1fta`z1rNgD+>eXmXeLl1%Vm3LvfC_&1-^m?%SgRn``
z3D+DK?dBnLvg$WtBfOr`E?Vh8tgx7XX%xY1nrSJ0{v=a{vL3?N!AKlomPfXU2+@2I
zDr(OmTBFN)=-J@q>i!a>qGOHuesm1==Dw{p8d(_nl3;V07JB24?B1sq`nsaBsv-W)
zYTL5%G@X5o1LnoLhuvc)+ND1-kCw>%R}2S8d!nz05GoY{8v~zxyFK&T2##Yg)#MPT
zkMm2f7J27mh|TzN6lIa8`x>Sr$=G@|jdh)jJq|%6n<=+Vdvq>FbT!tS4xwKxDH}1&
z>4E5)_;IPl3R*K}dXR}c2SbXMFD!Stzl#^IIf#@67*2?u`wX*1m~(y#qOhXhAl)PT
z+*r|`^@B5naOCJEZ>MmDtkfqQpGp7l8)nzY@_5#1Xub4}L#iD@8r0qW0x`wCe5oGB
zfqgnwNJTQq8xdfVpWA9cvRGkSRifoIGx7FZ@kdU4g%G}U*zdRdI=u3reX$@%G9>aF
zZ+bz~^G`QNyKhUDrz{14NKBl*gcJiMLy)t#^UpWiFGyq>qNUS`Q1fx80-5W8)MiYb
zWh}oG_O0>8e1#s(L>8g{kBo4UkJiOc3>#2fThNJ+T;R&sd8RT+s+SXoV+!oMr)C*<
z@@*qI;7i}zUj=DM0F%6jfK*}w(Jpwg<UR0TEc;iYH}+SNeI}2c4@3-&*z{I(1v6mQ
zOPDkmt~Nq6SJjgCsw$s5*6_An0vejTLN-334`=x81~AssgHi!Okf+|QnHi5MW=c^?
z)78I6HbnkoPMdn5yqhC9W6YGszc=IF_d|4lg}M$7r5Eu4%3DrfT2FAuzI8pg;!P_x
zI}!KjdN&FgF(Sn_2SSFAmR5&uG7yrAXJmn2XpTNqrhfAwhR6FLQWHZqqQRwAkL-^S
z>H~1Q0hMQpVn50~H!?5gV)%f}BtpVBib(j$$sSO9c@nD123gIGm<}f^v)g*s$Go}`
z^qvbDPSrAPVln&*Ju>Z{LDzZjp+g5^`H=94SPcVe9Zd`O`EHs%XERHeD&Nm+DDlSr
zwhx4ukwG@GRXS?F*B<mxt9*rv1bbyKtT{Gz(}`!l`u$!}l2#aGMd2iz)s~xeDzYi&
zX~i5gg_j57Yu7lgFWz$^2$5nCH^~k-v-bV+-eA%Ana^<l#HAwAV=q)&#z?PSg<Axp
z<bdFhMWkZNss6_!WI7WtT20wKnCD`W5uf90Rn9>bC%D1wvq7%O{S1OFEh`(buoiew
zbW>A`T9amaQmwC1jS!KP8(W95L9<AR!L_QsFaFZ6iF;cv?_mak-Go!>m1tTN^)|R@
zu1GRmX9HSGixN2mx}myn?_g1+?Ve0NH2x3Nm<<^#&Ny#<xv_m=`6fea11{6FE<^cU
ziq;?%4R1>20b(i)JnR1b^_KdO|6ZDVzdt`d7Lvf~REy{U<6BNiD8u$?P$aJ9C7JHZ
z>?fVUkv2XIHE3ZELI@>$(P<511cE@2>grCgfIv2-A$f*BNS?t1#cbfN`g<1#i*^DG
z59y#nJE5d_n?vXx4jz24yM&4dO5G+wWZm@Z_THVIawd{z7$Pd1cRE|CsgQ{#@{nqG
z-~5OKsaxsZs6`HI{L!Iv0(P7pc{i12$?@kf)P_B0q~E#(B}~W&wbD0%b4fUX$^TJM
z4_<*VeIa_*tIV;y5e7W5d*_*~FrYej%y>U$OQ=xnGpj~!IFJGW4I?IPNNq@g2&R9g
zR3xTcGU+bVISK@Fw!CI+SSU(R_|4@@+O5EEm{i2jGZZ=69pGpdarFeuAu+b-gs6mZ
zQFzEO()mC34Jm~ghUK&mRzV&)^)ch3??viLRwDKSJAZ5#lHz_R=FHI}I9Jz`Dev~D
zbO<Rlc+Qw0VFR{6q$9z`>UY=tWx0m|@U#ba#B(S9h$bVOlST<Pt;J*2ui6c9tmhwj
zUF}I-O-`>N7yMfw)}xqUVF2&|Twl<J>kFTzdLbA6LG($qJ#@I0UKK|?+DWieb|=)x
z&`%P}WSjB)ge4@IewJ2(|BTQ0XE0@WswDJpw)YDpXiuJf0-Ce*xfN7Hiejn{oF4>{
z=iLl-kfrCb)>z+$444F`G?xDX5|q4kM@XvmN2WDjzX$^|JGR4`PFhEQk0(b|0hH0G
zY_hCdV9fGLK>18xgMvb9#xZGCZJeYzgMF&4@g9b<Tym`}9}->Mnc`0i_?X4Le(C1?
zdGLwCw#A}4s<NhyEMs|;1Ky?T=_V`}e}+JGHjGM}b*%P<LIAz;Fk4CI%1M}aaWh+h
z#ps}hmVayWU6}M?9QHJL$dv9>6cxJ2G<U90>vAr#+M*0G3cC|;82lC~U3-+lBBu78
zt1qhFmtNYmF1hfB(i>$cjpNbb<$8D+8PoNpO=`Ye>!Iakg8tX$UgJ2yF|GqmWVfb5
z6Rdm-UCob(m4+sz9BLk{6_-9-qS&3)oK+<cO3|XmgXz};4t>_bXutO2r#hysV6PN~
zSJe1pIJEqJK-rtuiO_W&*@wL6<?Qy+r$anj#42wtlJ-ZuczmAfu8ABl2BbvKIR?o<
z2@SfID}l)^soUaV(mL}EM7ir(N~!-GIGS6rZhgHcQ7L;xt0wO$WLVkm5K;pst?If9
zkAg|(^89J*A^!JeU)87_4qEo4z-+^_1?PW%x=V@=xse!5_C^I#5SN9U{|`_VxXApe
z+AK?3?QF=LN3Ay7c2CIrzlZsOLo<{AvIYzv_;2G{UkL3_7(}X3PCy@%Qd}9(zGp^J
zv;MmEDVz~9lPiv$1xa-wd7+4`p0W9R+kbqtaJJtaZ}22uG)m))j>h<E*h$B2?>jxh
z1qFlLQU-aUb1WzpOEqNpo!@UL_nnpN`?>*<xcw2t567bj;}-;jv#gxEMt76#V;tDW
zY`m=U2Ef6;&~VAJ=wn~*a8KK>Q|7Mpq0;(g+sM$(bYlhU8pDe)%Eq;uR>V{3@{hfW
zFptYzwMPai1j^e=mZxUNdOMC5w~D$Y^$5*FJA7`XM85|;OGFrP6LHIFT{Gau?Q@VE
z%K@rTxxmRT8_QA{u@LzKD2V?pBKN0EhNq!YOr$#Q*q>O(*8RQI;(Po0q|oz1O0`FP
znERQ1qOLFU&E~zWv?QzHTZog%>ShmyU4aPg4>K(>DTAYoB`wb&JY5Ub6E3-z49}~t
zTf`9SfNm6iv(jz-?Ao~}8wy4n-VoW>kik;?Ct{bEG(w1@9Q(bmvDsbHP3GmQidR4?
z6?Z9EKZ0_hHw<f!?i9Wd@$G=JgovNdZ)~yN-R-L#E8>&|cl8d@&E>l0cOPnfcNo&S
z#@%r7_mT%xzmEFZPp8<Wiqr4pd#;M&m(Kfs8@;-klu!fV3LL5fxn)(Gd~FI$|6yrV
zVZOmf?Ep{AQ~>|9Jz~_>+V<6*d{C6p=sTWaDu}rrAH6H#Vhd7!>I&24JT`whz<e{&
z>@s_ZEHYDv3TX}Q;_)0_dkJOSSn4QAGo)lRCgWb~#B1)i^mRva?MMC~Dm>?aPP>7@
zVW>n=$k#GTDZU^Tt4WMP4E@<<FId=E>PU~%eKDbgtMt}BKsahXGagm-F<h#_eh+-S
z#oi2J{T!_@{qDu$FZ_>@y-QG3`kTQQ#twn#X~0B<zH~7*V}G+c7OB$<6D034I8s4Y
z@mC(3CGq~C`u6YCvT$(*(($-Aq-@`}Ns7C8jv6tUeQS%}M8-12s0;RC7q0#gY0Z2Q
z9e|Z<(qDI3^6l*aW9ZRusNK3E)APgW!t3O7y=+#TJ(-AwQO?y`zxE5MCRyXj2}-j&
ztHgQ#JxpknYu$P=54M;bGNI*pQym6zSZ`+B!{`;B0~;ndvcB)(DV@mZEB0r^GYMj2
z!q^7x^(Gt5fTo9L)nv=40=cc%#B?kK^s<;C&dsbwLh&kJ^JvVQ3ew|UnY-Pn3%iGQ
zu_)6#a{-gyYx5^kVFooUKF?pp8Op#wBVBeDqDkvKcsC}4Vl3{Lk78cG5XvM9Fkd`z
z>8DHN<D+E`vEe;yoL~YCK?cSLnd>EL5^sxDI=eF$H1e)XQ!to?A?MTx9ZYaLt0K^h
zu@q2C;7z{SfmF|CT%XLM%l4>-qQlhJ>BnawXCm5u#h&#t-mAE6JH)b*$wth#_}AMK
zcF=6Ggj&Rq_Yf@o22m2fe8QQCCz{NK+9lsH*uYZ@!-dAu^zq;D`>AUVtSx|Pz*0h0
z`mc=9qs{Nw_bU?h%9?-hN-L098pCG2MFP1A5)H9drW##JP`t~qDh|niS5}92Fu2OS
zY;QuU>C`K^*<$nL$*AjEc17304l{B}JCJZJbw`KW$MH||*tOoM&tnoNi+N`&$l<^L
zzR`0ghhU~`VEn|k@8wmxv$FiX2NoR@*zGjWiu+2k%x#{1d|$Ok5-m7`R3xYiqF%ed
z&6LXm*9aJTz!k*h;F#p}QT>hO$E;fFFxi&(w(@;tY7gd!@hKFohB*@xt#X2p@pVlJ
zmKQzs=Amv^%Ma}LuM1IiP{_MIs2ns^b?Pxct}^wUQUzoWNZSu)JGx$FKH1pm@DP`F
z@|;&jA?qEvGj>O=olkTY2;8MLpQf*wX7qFW$$sfVCw#i?J4@DG148GYq@H`4xJyt*
zp4<Hh-N~Xu+Y^DO`IcKQd3<Vu6zj*1@!;9zjogXBYp<p}RG<C|x^()}#^lrJ9i(6?
zDua$xSqv6;(8_UEF?U>;%jZ_yqA=ulr##<jrwi&DA6_}RCr`4k?Tm3^tL0dC!TN{#
zVGjv*o`-ky{nge;dj@sAZy$1+f|0P-WNeL^R%4ksr6o)6#(Mtrw@_JUeR>Cypbe)F
zzvZ%ylKq)_K!N9`oYSpqt-9$q-P_EJ3h%u7v)b};o~f(Hso8?Q&vNDi3~tRamKBSU
z|H3}*WoGihPaMa&IR1PLHc2QrK_Hm<yxWaJV<cFKa=F^3GP6%u62d#@C4F)v`?*z0
zdoogApuWMqmFv|c-$<TmT>IpZw}S1_cMVw`o;TzB?#!pM?76t0)%fCL$%TwtmwF<@
zjP93D1(iNaYLoyw{?983-9k?<FAJ$2Zjaa=Wt?=h&^V9VxgHyS>7~-U!UL+SzE>uT
zs-@jaI8#_Be#8#H?>%tImt9b!f;}clS^f8$Jx}+3PPyz1hJulc@7#xDx0D3R;?KTo
zo9OUPFWPtL7Vp=03G-6^Od^3|*!%jt(%iMH2c2q7`k&Im0Uf!hs0L(go(K`byHF&Z
zFK?-Z+DtIot!5{7%Lg>Zb($ZZyN2*&GuDk4neK9$ejvcNXS|Q;N9f9_eUCdg$7)@n
z4$hO^qd>cB_<bu>zn*@cCLnrvsqALMi}m;3f$c9>?meb5@O;Wscx1i*F^k4=w5ipd
zM-~l`zO6a7z{+p9BJdTw)<X%e>MQL(x_ecm^s?Rx{hmu)7K-3~=!(uw=8pUJWc$R7
zk-^N>9aJcEsVEMY<5XF}LWSaC0i@vRghVHa&v(!DwFi8hvP{MZIkgSkP35o0xx?P?
zacK)>ifSLi#pyT*hI7s_X&C=z5sc<t<~YNhs6CL#Z5{TjRS=dnW5T8skBlrCgm>9c
zF9q0QGpd>n?K<Chj_`RwPd)gd)PczTo;1|^Y4Fl*_)^J3n0KfMri*a16NQe!<8WMN
z9-{PAsC4GoXZZCu>RYOxuo*@b100s6?ta}6*HN*3;9Q}12zLm^sx&Cx`#EG@T@x(H
zpRzsEy71}Y?{m6|5>(5_fzYr-0|KFe`346ILVpDYrp#9BSW}-v-(;P7^d(T-4_U6S
zMXfBt7{3>lx9fgi37LM|a9VZQTJMQ;g!9PIOc^XC=X8!4_J#4(ho$&0v<MM!s-)6A
zyL7J_wfpqc?60p|^zsRLIZzw8K6*<{{@0kppD{s(&710*Hz!}tertA9tedi+ll$}Q
zD6wiM6Ap+&m>P}G8x8b9q0?B;Y!=`|4H`m>Dn<?j4dmvIJo{G*VAw#T{^P#hk{?ui
z9{%;3MqQ~~xnj_X<g(S-D4sN{9S+dSW5=d=woy;t*g?Wm=bE$+_(lmC1f)N+Qr1oA
zxt%+=-c)<hYqdu8W^ElEFB2fCXS7?Xxw&}(*~bo25<Ny=VHQnC2q!#3MWkiehHd(9
zLvhqKK2SF=#e%`VaRG;T<CeouZI4!?g95Aux6{iM02;d_083}zigHt-+-M~lE4BJL
zEz&HvIZt^ZBlA|cCtCZOi`3`$ADwKL8sJ<m8GK1S_o+td^E2F+&_woU=MjP6xPU^X
z6}F^&X2PXhBjg<uBC)>e(>*hXJIG3cARt}^=)Rfl1C-ze2S83R*Htz{*WEh#N5~S~
zziBGe-+PTya)f#6F?p?OSgYw=I~VLNmtKM=qvvWu0xz9muB}R=I;nknUTX5j_RkwW
z?f3I$zm~2%3G6%{rszLEi_Xl^Os8q-VC#!USU8;U5Y|c7uulfy+S6;BGV7^P-&)*=
z7BVimHgT&<_WF9_N><p<td^NMkxO$T4T@m*U7Q_nQeiz9U8-f2C4hic1Hig#<}jTe
zeEx{)W>$6bMD6>tpx4MVv!JUI|L5n^Ftzs2S8ES9FX7E+OU=(ziKiK5LHPq3gXhQL
z42p6r(sq#c(^WBN_AI2$v<KQYI>+;!3inrD?_p_#fS!l#!0h+V?=CH^y^_0#Q+^4f
za;$Op)Z+z5N7&5IBXpmg8y@cLE?J=lOYW~bLS%MizhK<(vZ{KdewRU&j@jYaKdXVK
zTRJzYCoFd<R<NH@zNjl&M1zn8fet{xYGchui`6$NNa(^3OMK0>n^>&wmSz(S<J``<
z@iS*7=xNAoGM{&F`yKjyO#OOs(9h_e$oqASjvp`bdVOO8A<ZVC@&FaefR1%%`i`=Y
zgJJ^n{03^0dS}y|IhKk?>h}&n_1?9a6<5XOcfnF4Mg@zasja#4I=MN@31rc<A4HRD
zdj%Vyl%4J!)d0Ab$x%X&qlAC<MvEwOlTOmU6V#4Bsb^0s$XYLaEI;iAaQT04A2&=W
z_;~wTP*x4y(K);40o-yi<`_BFGc2*$%FnSAZm?0Hyzp>5jQKkW+m>UArKt@)b1Cnc
zkbfXk!6@y|L!^9^7b>+izFamKL?k>=#(_;4<vA1yI~<^|YKSqgy*sC$KDR02t9sLP
zsv>wM8y$aAKV8Qn!T<42CiByMqjQgr{BKsJLJ6`i1S^}^qWF*49;)RjB7hcXxY_*K
zO%|2P-&`d_77|1>8h^@d|2!T9MmTjF#^`h1sY#j#%Iy3{@>6d3XgXd!h<}tvBe4F;
z*48$!EFy%N%+dq27=7W1Vlfa8q#^p-`Dx~MS@>F=P{9+^ULsp}Yq7n@uMd6CFTd(|
z)lEi+UQoMqTz2#_0`C}51zb**qR2UgMN`y8rP?z!O$T3^w6@y&9#UGA(XKC8L8~}F
zv{(|u3?#TPq$cz9&m+ti4s+qC40hsgkw1S&dFkb_8k#XmRj~Wm>%|HX7CXasZ^YZv
z@CWV4{?7m0C1&dLBOL@L`!N`OvokLK$QktkZn|5qymTWZSct5(uO^OPcX=e6R+{T6
z{ejxuYx5+VneRs|eT7LgPdO)oip1jtg3L&axhH%s#+x#lSOyXETh1&ORYz7v<gGcB
z@b-QaukB#o^W^7A!?&v;!M?2ev&r^NE_dvcb}x4Ru}eixjrl5s!z~()`y=dM5J-|L
zUCf}Lbe+iarxranem67-_CGlrG0%kS#Z_Lx;(gMc+RvoVhHcCs2Y|-~;PL1&9kKyn
zj`~tM==bxi-K8B;m)U!9H-CiBPOx78{HG!!<F0Do=2~am>ziVSSxmi_6klGX2%K%;
zwymTYvhLf3uS?!hvUI;qtf0T;;FB;*M&GvG{_I!FdFxnj{a|&N&}lXMcHqRlAB}3u
zCnor+S<FP<eL$#~;Ft@`xHVVWkU7|p<i8U9yx~mbMeFnp+ZeMSpMwt^lX`wNW<>ds
z26^8K;yB!7)xCCvW4bWZk8bifEgA7@aZyQYjy#D!d|=0-(^tdO(SH1JL}d9CSYAUR
zL6c0H8R+-M;yH>*&gst$W8R%Oaj=D`a8oU2)a;QadGS&ZVmZ1h71RiTjO+@;<OcL=
zB+NGy-Wbj-Bm{Y69=}?-YV&xs_YQe+pj0W)i1zMn?8pNM5s^U)hV2oss6_LO3(27R
z`$t5={(rwG1M2^s@Zr4E1p_=ARYSfd-er$J`WM+ma1R>p=2(%@PcooenQ2!{2zP$f
zd2w=ld>p@AX7-O*f>0>ti--V*KV;LcOf@aX_Plv;x%?mDq%mVt@MF_rhUAssgEIcg
zNTDm73&^QQ)wQe0QYlqkTYr9?e(vTZX50y>X#C`A!5rlsgv3%+&)wC@6hcg6j&;P3
z^TY`{_!R#*&6LfnbQZUt<tmu@<UPRh6tYdpCRlI^;0&r*%A_cNsna45ip(h|>H&Ek
zC*Qh67SmLyB>0+?eT{6zp-TmUVN7f*Xeo~k=ZA+Es~B1##sn4WCbo=D-|5*n@d}>N
zBPFqji!F_E_5~t=Fd|*{7~#nsq(b^pYW&acYOUJKU{F!^*2aoW@he}nA)m?-kD8af
zm>{T+#tXV>_kzqtmQiMdpFCQ0qWpvx*xdJay4l}NF0)`$s-8~elX<es5&ej2V74Qq
zF*}&$A!#R#>vD?HKW^O*YRAmo%bx<2S6FPy0v!dGhzW#ZR*fL<84o(fpzfJP4&?+y
z`1(wPyapixUXA9lzxtQesp8Zq8^m~m#$90gx7f&sn@AI-7>cC96bqR$>Ob1P0mD1~
z?it-M$jZ|}K@DEibhb%&OI<xQFwOC@sSAY)04IAWIAH~`v%k4b8IgjpqXcv!?wR`k
zwH;D4oPjUu?-@*Y>wo+BGOJhJcV)6sMCI&jLWDqt5#^P|QgFf!nlg0+m3S~=3>qda
za{nF|2gg<YxBjp1pI#}rzk7W@QkGg!pLu2Xza<8NG?Y<fQqVUAJIe89r3{5I>fmcp
zBxUbt*gLI4o7msI14~dgfPEna`-@kP>$PcVZa({u6w+7@M*OpLfc9Nb%zt5o#nsuv
zs(SzDs`T&@Wz7Gz6*vhE1^IyA`sz<W7lC!a8IH|kwSRdNmy#RKgfDMu-K3Z%5|7dN
zHY&<V2#)RH;o4UyaHpli=reozSqM`w&JJJwuw-1N6o7ShU)SpY7jsl76>9KLo8IoJ
z)+i#I9RT5SI!=u=_s5lr$Sd#%OxaR+)k2kmICfBIhk&R^xtI3U6um2qdqzt+02S&L
z@W}!L`DeIeAR>wwYN!4_R07+4NZDo_vQ4iGV}G}a#&1&4id!=#og*}yetKPk-OZm2
zN7#WGD*(LL(&{wjATE;#=e$@|R0klC6(eAw0ZunRZWW09MBfHb?f0efFa<!SAjB4!
z|Nq(2pI_fk|N8b(<ueW0M2!7+SOWx1&W)9F$cv1Q;r1NP-NlDW+y4#aPZ2oiDLClV
zqaZ%=PB^m(zAtBi=O!|}$HMUeU2*w7ycFPoj{*i<>2UlK8F2&j84r?o(lCz3aCq%Z
z!{|EwNmI&AwlwG@?f46nNMPmNopG!wWWbHm-=<ZWiV{1PHn)t%Z+N{$LPPLN$NwdT
zfc1Kh@%cI#U5q=1Pe^X2GVm2Unw~0_i@0dW?+@+*BZ1S#fQy3Hog}BTOt&VgwageT
za-;a>O-okyGQ@vmrK}y^g#+%2JhLmBg40TR36-HzV}jZ4levjz)*Y5Lf29tVcm+Ht
z)hDYIK@-fepn`K#5xVs~zWpBw7a&$IO{^5fqKXly8<flYj7e)-l+nGiyX5NCzihAo
zN?_!2g$IQR7-%I_9^K>&mY~z0GI}_PCB6J7w`@R!+SCUzQJ`Mh)$rUicNae7`|6MY
ze&C8QeH){>e)&Hnjv+FY@&Y5H0?y!-mn{cgPEF<7J`kod$dil`(XCLXAc=rN1S_{S
z8v-7jB&pPPUBN!G=WFim%_|g{NjeT(Ufz5+7!f61&JagvbmLCRxHT~<`Ci$Y$R<bu
ziLfLnDZeijyvPQN#fF`8jnh;4ODBq_B1iMcxF{9@aaTJumW_O4#-5QuAnW8ij#j=R
zMmfDUyeUrMt~>cnU#a77>Q_W0!H#`{hpwxJG_)rF6CDS20I96D78KP($aLvFhPP;+
z%+Sj@9xdw6Pk-hgI*$W7&D{((DCiU-wYe$yQyDzhvAh3QFrrIA>gH&QSL`GMHe0an
z3|fE2Td{Ov!SXMJjsS!*Y;5hx5IRJ1>OXef)tn<G`$W23;IWhBH>KPLR$Y8=6@+}t
zZtE&?Q|Mj8^TtyhS@&l@oe$=<I%P))d2LcCB)g6f#Fy%XX%NLpcer-vYm&pDpsK;D
zjZ3Cbjj)vIHL>S-oLj@6Pdx+2uTW_Ee~JBKp@TlT#zANUwX3DSKY;Etp*Qj((>fCL
zLE{Y++K(V-B^*J0VLw$(rw{R+h96&k*KO$^QPm+IDx5F{N4)*g;UoEoOr$xvyu)8b
zx2zBL+<3@j?)mv`7ip*7)?p^GvA<>kGnfVVskI7YM5(yZ9ye};Se=CcV0-N7XeI^Q
z5CSr|voV&Bd`n|0%XMX;Z<bOvsx>GLDHA0F8|Rqk@K-Gr^PA4*lp0hU*SZzGBgY7U
z(Zx`@DMlqiV-&29V&rnn(jB@Nm}cVs!r?z*1d!<Brq^8?F#@HpdXg3X8g$_VF*w+9
z8yi*f!GcJj*Ag0qU)fV64(T0WV?h269T`54P{yT`oT_Q(_8cQiTwF>GAa$nMiX1V~
zH!YCD)qm5E$k+U9gOGAS_Ulmyi<WlOxUv4cv1Y!8#b4NaAtu3Jvj$?824(dY<d7-^
z9gl!1%bJx={$W=uHH`Ji{s>_lZy|{%%B$Zjf+?g@&=DS9+>viZVU1&S$E&rN=^IyP
zJ5&<n_5Vl2{<n=n7(va4_teN6!000p3RiZ5ul0<LEEWh(sCj>ViNRbVjj8?LSD1ln
zKSh?lBsBHsl%D2JT7^!rzpU{eo8d<(LE?q`5Yt~X4U1a~1Jo?gQW%7S41~l!1GL|@
z@7O_E{jb4+Bfv{278M2Sf3*QdKkXB0$3KbnZ>j$nc7K%uqN@RXo%A~>p#Jwm6v9dW
z^Xk7JQtkrEeoF25pI86=0LpUbe;%ZO<G(hWsP!7`ES>`k5JOTp5mV0L_<eQeX5}0R
z9bGF|>c1L20QBBhIyl?z<BpS1i+B0E%O|Ve@m~2|Q@rsyj%QKJbgr9iNB5m1I=JxJ
zy%onI$Nf(LfU}J-6Zu=_q}EK+IT-TIFk|u)Hj)xg$RZhXOt^E4`TN!$pCTfpYKj;f
zd(h16u{7U|Lt|}NS><Sj**i{Pq-PjA;yP`{c{RC8Uv_U>HA-(WUa226|D;zw;Y0C-
zEStl&w<NYywzo&^M{fXJ`~fM-{OTeHrAa^PGQzSy$>P?#4%R0>uTw@v#h9Kqd8_!U
zc2V&|WmDIhgZ+x&@@7c0!Pyzznk%2b`wl7izx?X)nczH8W3Lx~4z73@b3Fb7TReCH
zB=GAzeJa{2sg%nCYDzWUi)54jATOy?tGB+>C&n-lH`yLo8}6($V_|^7a*{w}qLYea
z(UxqYmm+U3HQl{VK^x}{Fa+Ov$O;J)9RI*>H}cvANlo-Uiuwti)Op_->}C|t=W%~N
zVO7(#^ZSJd^_M~>1qX@`%uF1Q*<bL}{>MGbcL$?q?`|pg{CW?L$9UMOKqO?;4-_Kk
zaj;it*s`E?R{wX)$rdl64zgVkBajm8$Xe*m*`r>+s{b<~;djRgf=tHVO5m&AVDHL&
zWj9kpsbbz$PJPHySZStl&Z0;@ReIKYYoecyoBz%UFa8j776iU9;++|pS71o*VejuP
zpHP0O#YSX}V12brjo0$hm=_>8o`L^c*cAI2ON<0SF!Xy<M0_y&8PKMBSzqlWvVdv3
z&VbCm>7`69G7KK?$m|nwikL`2$E%F7-5O*kujnrA!cTC7QKD)5Ey)c`3y{VbVX1md
z5f$?EgY^3GDld8&750DJ(ehsStP1QBWKfRjsR}w$DLhqb_JpFgA3%m+cl*VR<Q*hm
zFfjVYP8=~TU?I)59P!m?f8p1?BtT{x$B(Ely78PE4c|)7YQbWN&Ljv{phzyT+sE;T
z5IO_MnFHho1CEKI(0$<AaGh@7FHgD(SP)+ik|y&$XJXggUg8Ji4lbknY>2<K6DWOm
z-7(fU!YkNTXS6+Kr7ArTk$X=E@)K|jU`26%tja_;!~Xm*@ERaK_~U@w)G<k*su%z!
z=f?t8^tc!fXD!~6NAY)O(TG^>{T0kxf7Qox(RSl>2U`~s#6QRn&qi^ZTVUkaL}~Er
zC&ko`AEHR1|9~L8s}2tKz<Xbb;BW%L_f2yvDZH4&m4o8HdN7F|=yyjC*u0pD+CicP
zu^{}StU|lMgh2RD#O*MgXZ`loG-ljEPk3Y9OTESa5(Xg4x^&U;pO&S?zCyZUvh(6E
zSC_ajA1^PjVmoh$2$Sj&bc{PaGW_3m<SqeIJ6j`yOp5L?u!nUKFuOGFmm`e<m@9t<
z$y30bMv%tEF$qF2|2NBJ|2|6Wt!o(0q~*guFE|VoSWI*LFHQ3>zql8+-Mn8OmBt_+
z1qSCq$Q=(X<e5@qBtZ09&kl-{;nQV`JnzMY5VI$ouD&VzGxGvS_YtZhLy8h1fx*9P
z1RJjr-n9TfzUiaMhN}aa>>MOpOA+5%Y_a?0>lqDrL%yvXYS_~My$!LM%Ql{{Xn5Ca
zI7FleGc{Zlf(-&2Q=VA{XQ#?=>LOxV!CFg_RhQ6W@YBDJQg$ZcXNPKSJ`?`A`{^cr
zLV*L2%)QPU6A;P0J~^MeCsAk54iyt9l)V4{Veh@;v26eN;kr^Ob(<kFLiUJ^va`um
z_Q<Aep^WS(*)v<pO1bQr>|_hs*%6YNz4?94%RRoo=fCH_=a1+9>wf7v&+{0c<2XL!
z{eB<NHx=Exu&0Y32XEnVaAp*o(ZrsKauN_`N!`d|IEfO0%#iCDCl{0?q&Y6+NJnAQ
z;JDeG5INTS<ZvtWVi#~bG^G9YYiUCs7?Kv=cY((vL60g3LRU7C;hzln3t|NCY4};2
z%dN!!#zh=Cn}p!cf=Ill0kKc^Bgl%#K*IU5`V$s*>`a<7<imDu4O{zg2cJ(mVbUv@
z3Ogw{QEw2)DMDMdo@ZgRNeus0hTg!)i=3(u_D%s4df89cDv3P>|NR0kl72abLj>Jo
z=>LJi=E3A~)lM^TPn%j<u?6fRy~insUpb`AQsBtI&T(1UgoX9ELCi`Uhn%z){=ax(
zDY_#WEVlo*Eh#7bUSc&+ig)>TcSu$ioPnV*14*SCs;C=?>x&&2oD4ASzw0~y=9w<Q
z?C|)TDjn|v-#&>$jTh1@r4U<pEdTU@368^nfnYeY^l@gu6df)eZx?9qDz~i;nZOyp
zXAcq<v%#@nhgjryXMgm$3o5mAkDgnT`W^?jY%u8>j1JIMBA5_{v6E_IB4ACfHo1+6
z`Gmu6!w}-6aR-I73&Xc=Hs6YK23dm-ikL~T<D;c*cyxH6y`cMwCgXdy;v6)90k%u|
z?9b}4;4jh~TXYX>dx}MReSs&uH(-+b%@*(CoPim<EOmn!2m78`-Q+7sf}(8At&A4_
zk^$~?{Ns5HrT2LlqvNw9>pX(x8aze)=h?#J*a}^zqrGu4x85}BXuZSLSc8p1{hCx5
z4ry^NCUII9=*Ftv32{p>)86~K9<UXDtEvSNjyT{mYs+%(mo<rS?j?=yBrVxa1>S*H
z#Y{d%5=&0VK%Pehz+Z^0?D}8}ges`PZg*@UoBm4RaIWhY%CYNv&9b81D@nSKw7pl|
zDp~m3&I>}(xfd)kbL+&$J=hr|WnR4nL@Z=$HJm_($84Z#$+2jBJ7m21k$sR(y7G>9
z|0+PF-8ZA&3tw|<(;17T@>uUGVmXKJ{^Qiy`}Z5@t{eP#PP1?51jxaKn-hQ~X@MM%
z@1y6?p^K9hu=BkFlT}Q;tnMTJ1Uma9tKiK}*wJ>_ApU+W{sTm#D@<*oqGqq6=pLwc
zQS0;fen0?lMKp&KG#z7rk+)?7saM@_j{#?oftL!?r8q<9Dgou+NxhzYcJb-T(c$uu
zX>57#yCJ>aEMNeJJJik6!So`5PT7jO2pV>l_2C1+x;InrG|Mi1m<2@dTfJX`1M#A@
zC#;?VJH}MtQw!QDik+JG9w@%uj^1|6a~rizZ<8j|ixQb63~Tn>(DZUtnpTZ#$CG;d
zbWZhcc^a)uxOKHnDo%s4aK`rwY3L}Q6W%MhxVo24?VmV7dqQ3c0P>AJ8VU-@#L#P|
z;OEuNz>IiZ1$dD$<;JH!s)qV<OCJxKJZO~_+wB^3J+;y~P`z2QQ2MdT#>r1;<f;(J
zW@=O)l5qVw>-FbsQGSZM6uCW+Av~1SLw}1kbjdD5Ik!+}bWY<rYg(PP#t$eiJ8znK
zZJG(4U-9K~H)<MUNaV0^br$epsN&Z&fxQFC8qfc>#5n`!$7Ofo9JN}gt{=7Jy7px~
za;t6UBEQ}Ej?3fsyWAshe0SQtwwnDm_G;gKKd)`Sh0$ftE%qz%&h>&!TUE;*MSj9l
zR|h|O?S15O%wu`$K2R~n=k}9wyeutn+5MN0*+h4E=qd$K7_dl+%tY7XC0SDot+js0
z2$U;NkDuZWS$3=b<;JgDUTn)nD_?QY_vVCmADCqpWjV@Q8EjgnPxlVlCUORYx9?QR
z>Gay));W=yTMf}#bJ`55*$zS*6#lGKy(hxA7j3W;JyxR}x*@XqgQ@kv;Aq33==ar?
z&R?QipV&!dL&Wx$!Acg|3=SWY{_<-3=zeFjFVB;2uT`@;Qg5>`Hc6y+6y)JLAK5*p
zEY4l7By04jSsbp;S#9##a`uWH-EI_E+JBR!n|~!7;gt<00y{#T^LGg`KwYG{TR+Tg
z&(|Ez58f9FZHK1wn=N-hjt=%<o)9NBP<(`bg=EUOOmZD`b3Lp%7_O;GTPTpT3O)fK
zC%IQzWIy@xT+pw2$lg5J$TARdo4_-OJE*_6U*9?qh*wV+{`O$O%xql0sW&XPdi~wt
zd(n-|ySY?kE?q(Typ#vs(}eW0#yIkQ_JFp)B9UT0>g?Q~em2Gb@e_J8*SX4SdcM4t
z7blhLE9C~s$;CxLBw46Aq?a()G|}9&OXlIFvDhF%azRzd(aPj>3iSm}_6u1e<?QWV
zhrhf`5xZo2KfvqJ9MW}(FjOtas8B(hzF6CdS7D)X4QqCt2#(9_?RGtr>Cw{OHnP6&
z4yZt!VM|)$tS%eOxzlfZ&|jTZ+`9o#s0DuqP&Qz9yvTG9KyY;5lz+xWZUO20c~8zh
zt|cSw<=gZubSF{I5ihvhg(Osb3pSG_wBO@~@9M^5q4dtxYe_`{Rlm`AK@{`B`@Q!L
z)L<5?vP%^ZSW!`#=|bR|?a`(!qD>at;7SoB^Q69md99UfEA$V2brNuLkX6PnZAvVg
zUG#WfXT1EC4%cp}$fN7URE%`A6<;567Ue8)i?C&|P&on#@lZH}M>?EDHD>acW?9`O
zQ`P7zJ!IcM+8Q81U29+?a|wCQ%y+;fJb1FXr1%U58;7CCT(*Sv8&d3vvs7bhq@Wz%
z&>L63<;uXintWD-c@IGv`8p<i>`_>eUb()XZ6;zxnodF!_w)7BGS6u?gLsjNbNAvX
zAJsd&LVz$ucVuo77)dF<aslbfnga3k;MMA_Dz2ag(~6r*Pn(2H5o{V8*QK7Y!wrL@
zlAvJ+rx2|vfqpX<8vLiVrwW;uflReBD7NO4ALmMFW8pp4k?_EdERdt#gham2h%B*P
zKyyb#6qN)2wUa3#N%reRuye{$C~%*1K&tHukF^GNd=0jJ7jAXBbc{$G>~|jR*S+=H
z{zB6c3c{H~3Ju0d1?Im}>18PE`I8aJ=V=R?z3)wh&Xqr@27xCr))o~{KKi(7UXQDy
zJ35zFM{yN_94)-Xb*AeLU}0#mU?A*;(672>n$!8y&~6`{$Ilnv&+uKDp&H-v7#F~!
zd}&wjACAXaX2|cuR9CDXI({%Z-cAkQ$bEb+k1Ah$L50uOBNbG(D6X5>N)v~#%gooo
zQ2^C``ScX^SK?6SiPtyqZ&2`>_4710YB(G}+Vgti*U;1)rX@ft5$Pg(Au;T<;m_fp
zjIzN}+k<Zp1{baY!Yj1LB)#oDXqPg^4%*+1qIY@`p#0ae*U_Hye&tPxcYPNHU6o%*
z70`F2T1cP=e1CtCljU4{DNuPeD4eyGv?tA&S8hSHyRXZ%Dn55<QSboo>u{l~oVUTz
z`7c$^pFJqENqYd$2xP|yby874C7lDePvg^au=65WIRFZa$hiqC%Gm`ye>r<H0osjP
z(?vnCDuCsNdegn5^?RK%r+6>xsd#$0C+<HH+j{cE@29}$-2$4uf`z-z<vr{t0(t6j
z!}iNSg7Zz1%S`+*C-G-Rmy02cTfcW-OZO10x;1o&5^7}6`zzDkbM1ari1f5-WL2nU
zO{mB`4_D53J=7!tP!b{1!!-!*)AXIJ8r#Vhe~LTG=LpEt!S2J`Q<ZRbFEKBxT&4Xz
zP|=xnG-G>cx@lFjX;tzuPGIpW>0srOW1h~f?yLR8pFphBz->{Qsh*Rg08gPO=jl(K
zS+}vZH-l75c7EE-h30;-<SDBlTYA*I=vSE01lT+SzM(mXQOFTL=Ck;NJMFYQmNdE=
z^YQDWr(Y;d$Ja#0Z|HpBEBQ%Jvm>{eg70CzxksG#(pW(gM|z;VzoZbsa1E;GK8|jQ
z*l!Nomz(xagEPaR<;`aCdvtEKNmlr?2i5?hg=iVFmpsL!%G4+;@LRG^x?KF|zrkiq
zA-m(RXWlw5_cd|zk6X=_+h7`i!>9(HHr{x{ov6~&up;2KCZMgzlqp1gwB2MzxUI%P
zbbn(AK?4qF{JCJ{f23Kn+czi|yXsUwaimkTp)>6Bfl&yrFP>v<O|xoFU7NjhXr>J0
zS>GPah8>zaW$BDsk_N65zN}Zva}SB*f2jM*ijD+9?`l@O?7)Izn{%`(omgcRoA!p`
zSfT0*u(&U_|3yqV+A_41y5NfvWEE*3plg}DLvR*xUwOef=6&**)LR~osmAfwE7vdM
z=Ww$oKdnDy=PrIYwtO&V<3){E)+muP{c{&EgrR<w<g7G71^RO;v2zz#WqgrI9CoTQ
z2@~P3nBmKdABEGT4ME?2t#&b(VK%r?IuoL2&+k&t-&ipEc*Wn<W0Lr7Gcd5rY);Ax
z3)gIx<mu_y|J2_CaPIxl<sfkXtQTWB!*DOteGNM8)(JEMdZFvSn!%QW6EBEOd$Nq{
z>rx}6^kOnwO6cN(^+M~?e-o-Ayy@aWiO(itn8OZ#*M`zg<Ya8puj-055a-!tRD;g)
zga0>ZcdAUp8x5W^f6CaRh>3%2umUsnO8BZXoa@@ze1t*qq+!U(HC|9_DV*G>7O=X4
zET};MU@qRp+Xqmku-j)m{rts*g3T6Ip^T9wr;o#=1U9Z0Uue<tljgJ26m-Zbz7J@y
zKpd%^=CI`_Z}%SLd2VE}CH<_#eI~j$*xBJU6<|AK!rBKA-<|$NKoGiL3sRcNvGtfQ
zwW@DURK0uaVX2ZWjz&T0or2f)tH^2FLG1~4gKrw<^3{Zb>jLG0pp235oN`!#4PRx;
zPnx3@8nZNWQjTh&Jogo$ohGmSrV<@FubCTfTlBZ<$U2;E297t(p5j(#n)58l(z7Y0
zR}DVz|K~#%eMYVH(ZLjx*eyIk<HVUb3bL!#if>t|r#-KTxz*ECM~Hw^Igh!STjT?a
z`m`_?5>~`OdVcg7w$yJgu!}?&3*a992vwNU`P1l>!+k&HsnpGtd_NKE;kOO0CxH+<
zS1P8n{;fLjzT(e--@*;_PAZuRbiM1M1&xyS_Cw?Wt6*EzzuSaFxaNqg?4!b%L{|ce
zB2L*y(Xg8o-z$FDr~H@K<ZIA%HYvTK)I%>3f=icb4dmz5g_0GsuBD?t_($Ol{)tb&
zamEVC2u0C2ncz*S%aV93J$akxE36z=DvP7gB^xV}T1|-i2BI5n_9;rM7xfgZZR<5p
zN70WSzZ$aOb9@*y{v6k>RW{pbm9l1qQahWhcPdVRHM_-X?Md|?U#_n*F%5EWTx1mo
z{Pprzw?eec-U+)hN_^C+>??q<tz~9d9F9ryk1$dri|^NQj!S_TC#4H6+4hC<tXy<k
zP$O;vHbt39c`Fjlm=@5ZS3V3`T<g#DFS%4M+G-@DD^?l@Yd>#jSEGPb#y)EteVSn<
zpS<UG>wIYa?CZGICxdPJP8v<yjMZgYHpQ~VcXn`NF%d*7KZx-$=rG&{<Eguz!dss@
zlNMRkoB>7u5FF8(>cwL9VOE*DQra=`F+6#%xqVqkaX5}dG#ou&_$t4}EN1-N2d#gL
z%_2s6)i>Z?b~-s=1SdBL33PW_HQyZEl<}%WHljOv7dj@>$v+WVbw}-eODzp_lQAT@
zeJ_z7mF70SHx@#w<Owzx-#~HOUhPxUkPv#{FH2R8JPSBxEgnrP!j7W`8_hZ%>Kr8F
z8V6^Q@YJ-2wiREQ8};K9&30|XZ!&HvvFfW_)UY<%%zBsuWfji`7;QArXUvWuUN;G|
zg!k(8gE;;b#HkOVe*EhaV+IAud%<is&A3Wkv7J|9+B+BBi&gD8;#P+)imlxcE3zSb
zv+t^|W8c(i>^8<EJaH~$*c6Zs+PZ9Fbol|lIUHqiY<9lbc|Ka`SK*V$Sh~Nm#;8Nx
z<idUWnoNh(xWBZxNmr*!lO)aVH};H>=?T@H%I`@m)N}M#GKGu-m2qsid5pW1`AQf%
z!ot~h#a&*YgK-1K^R!|!gdei7f8h-UkZ)DdX}jT<vfQXC&^kAmb7q;NJZaKHKg2Lc
z+_Tj>BRMF7vCzx#)CW-hZez?mwlHQ62?>jtDRw~66NCpB-TImG2Z#<Up>?s6A(&_B
zmX;EIb(w^w^{99~YHpik;Z>uq^~j7ZPW+8=DRClI;>n3JcPXI;PN9ahE8+FeZ4_TI
z({K-^TDnbx&RE6cb){&{KIW(l{KA4m=@$YG^rWXk8&rm`pk-X8z7x)Ck#rsa*I6Vn
z=F(@SKEf0UMK(6B3l#G&>dk{E*R#qWH_??B^tV<11g2vWI*)~hn~q6lhFy>@JK!73
zT!bHOL3I<T1iY&HC9)iEW2WaQ<tF{%^2xivK@k%>{3#~trnmC6Cvl87>&YvV!JLn4
zRxg!<{aLrRG7d2EgRhH_6r!V~CRDp^AYL9?%-ak=jAq~uH!9uocF)tekRRz!>+KF!
zIr$5o%d>ye7RHd?)tk9m8DOlgMVuEzWn;|!O+6GK9ZNP(Ohkf^h;Llk*ZjTKvr_jc
z*~)r{5X=T62K8>=yF62~Pc0h;f_pms%F;UI4Vz!Y^_R>}h?25|Sny5rS=XhG9GMKs
zDc*t@N2Z$kZgvTgk@5O)Tw$~yqdKGV=FPm{OQHp9YV73{pYO51q)g*Bp409l{H&ws
z$mS=uk`ULZnrjgJSD^A$bbmm+Kq}K5@on{9A&CrJJ7b1>+Xgc{u`O>lvd!8_<2ddT
zx3y8vx7F9>hPFch#y@^W0dlTo5H$XLB=(N!8<Zux18q#W61HeXZ3AU442GS)86>O7
zw~hGXHu&5H9a1jmCHn4P%Tg1jE^eC_tAFk)%&fokI96x6yJ%E6P}REsQxF;s0KbM+
zW?x*MNuABr(qiuVs{KC{DdNKG29OFFoFNFx6a~Wd_@-ISPGZd`liQ=xPt}(i94$$!
zN+GE(G``b;%}r!&1`-lY6XM&aeDBzT0yLiNBNAJ3Rnu10;8d;yBxPVdB`=&lzMdg4
z%AItrMn=&p;5xTN*!Rq966spHymj^DCBz5Z5*0T<$h_I+=l&mmUxlBai*(<ts-<U$
z6UNkcNHy>|OqLL;RL<)yCoIycI<hBF^e{LC|46fZp|w6OcC;q;b}{nCvj4Z`_nvd_
zS-q}!-|egGLW{f}_!omm3uzzugvH$c02tH8o|7{o{!ORQ!o{SLFG0x6tF%!{c;Nv2
zd+mqUOp^Sbeh;;MZ=&U?c!!00O-8JkFevjZcSyRsT8>}$%<eSb)w(H<WjBMwI~)fI
z7y^3gM<zIu_EphdM_XQRJ@9C5i&rP2cT88pX@o8`JVytUe@IZ+faF5q;~}*AXT%j|
ztZ27+RP<Apa7HcUDa%{(G~CvT?AMUt)6TZ&p79sGWTP)!$120jE|ipIxh{$0&+U<E
ziB8&oQ^0l)Cr}om&=FQC%7)U?>Bo(CxeAh(?!aDZBj(m5G*<9)KSNY~zOvqGKKJr`
ze7$)Bh8~rS>p|LAFz3wUX`AXn7V;v1mdEFv3;mNN*tbo+R++rIi+4NbmBF;=bMFv?
zsw5;(157pKQNxJc%@T6Y(N-7a4Efy1Xmf9Jc5hnnzE*uPEdN2g#cBIRt(!a&FINdw
z9a@7hoNWBIHlgIcc0e?IM;i6od3iPWI^kixQ?3V1LA9LAebtd0OEhT`sZ^71aqnO<
z=)bZV1Pcd5h^Yrv^Xr+=$uQPSRIeoKiW&!vuSSJn3hUJZ$!%1G=SuBEtYkgtx0(H}
zt3$>>f_V5MmmBalY~#=;S*Y_|+O16KWm|h%SDzS5$-Q85lyy-az9}|Ge6mFkOn(qJ
zXp#``N9pKo#vx{(qRAD+K_28foaE}@FlCRV7$zc<MMvMur99~biX9zw;gektjApcm
znS2JM{gSNbT)(?|>i31w?+0;IQ6VX2B}kcN3OqjJ-GaEzUFn_R_!k-+P8%#$<|BDa
zv*EHvNt-@S+LaT#@4a^4hfrs<`36}vZg>GJFspC`P2HHFQC8__cX)kxeZ!_BC`o;Y
zQ=@#=(}<d$B-h~`TYFsqE=!Ed(~iy%X#*2h^}a3#rl8f$TKgRG2$G;(4XT0)l^1k7
zcR32|EUn)r(x{(onTI?n)os~VoBq`ske+=woJq*uK*6G3!&cC@goIK<cRCXrN$C}g
z)nxm4Q)cnF(WZm+KB23wRNRito@S&R=6gWwDt}BBGkIXbtAg`-r9#OuDvyS~#rR^J
zbW`O<SZf~3$Gt+%j%67zm5?-~D)2Y_V(kT3$`{-H?NY(&lBHFDqdx^Ra`qMdSgsuN
z!=18DkEI(r<___W62|YtkZhq)zj|{X6~6I`cz>Vr28VlavKDWHVYFLQIZMH-79M)X
zxh1k5%1Q5|P)Ha=sG3alugdv+Z(*o==Q2Fh7-aH5JeVNPnfIa`%p%-1l%x_Un;oDs
zHpqCiL$m*x=Brb3OBnemNf7FnthzYSytNwo)cvVTc&UxJfrVbd$$@q$tWiOP(%gk+
zU4%l1BlG%O3BK&p^cm73D*^XjqbFEw^OC~0Gp&BEc4j%{29!7W=eq#sG6d5RHol^D
z<DGl`vJ89aePLt0Sn2r>U-#UYcWqn_qO_i;TNV!&igp-4<XD*SRuga<SZD=cQ}$1`
z{uwrM^ub7LXmS|6pre5Vw3Du>G68WOlD;c8`8=>HZEXKlYarcHHfDc>d{C)`x`lP4
z9HQ(`Kd*B1@0Ycz=N7g;WHZ>yJKD`F+F*<s6fBj$IyteAfuONeHLL$+0I{)+_o`FN
z4&Ko!*AY*UjV&1!y3XgRhhlVqo-%k;egPKT>as5FQI8F>5_lXyFX%x*=$#&Vr}zN7
z9AQUh>Rx&~f2H@VO2G|xe=dK-P-z%9C6H4QO?S%HM70r5w1H;zjl@6=(O)69?j~6T
zMV29Pxb#_0nSrZni^;0LH2BJ|m2Pq5-B2kxaQ0lp@M%w)?MT!U21cKXz|Bionp4$0
zD`pF&9gANx>adua!E$y{0Br#SvRP-uFK_5>IyKq;$f;<s`DNUU9o*8_nbD}bH#9J|
zf_CepAmPT)I(Y?T;;w~0eHLs%vlElPf_7MVNjP5Cw2QxDS3c`+ySW#~N}{Sac{~3p
zub#g{JT(EmEZr50@;7q9n>UeA(uT<Bwy;uTD96eP8Y>r)ux(P&4i4&q7)izG4-}cJ
z;QDx!CR+&?%MMH15lpDDPu_Q~54|<rpqZslTu~P#d!wH2T|u?<j=!iQq`99C&jxC3
z3-q6F345v+%+5+vkZgC$pXDvF2ZzjuuXqEi8;^~X!sFhkIZkbM{Z+tht!N+MN%Z+k
zr6EJE>V@=HA;yMIlZk=J=6FH4o<*Hr&N%AM{t_8kP6$t9Oy^qvq5pUd?$Qz*I+|mG
zo&rH)L7t8~Z3dphpOg<j$gXT7w75Zu{AoS6$=8!G)SS`pxdz)aEVyQug-^<RmF6lw
z6Q@7{a&}0Y<D4~w{TGTP>jm-GsGW=*#DjbNZ3|7Nq!i!z`?alx?^K#XmMF<dIJhbI
zs`{{>=Wfs2Hyl4S$xhTip8>tn(WfNR!io(YqzxO)<xzR(<>s7DYHuDAioZbj)x}NF
zsOqP_Ss5oxtxq6XtG*p(N1tIu66TU3?J}hBX=Fv5D(zlO{1WkWusH3IT^AueTWip#
z4xJMM)afQ>(bA5AJRKEgy<!<tS3u*a=#JDZaT-6JaDa`K!AhX!*Bld<bxB1(m+n_@
zimiz?-x!nzuEys4)L(Oo(pK|Fu0AVGdbHze--&ybekAy}WsnTpC9=hwInAAeHdS*>
z|IGkCD>~+YoJ<o6O~o!2F4{zcyfP=n^;hN}CzoaRsilT*@iQTn<EX6Fn59Iv6d_o>
zhTmc2j6Q~S6V6J*wud*bKo*k8&drlN=P!NZ*MapI+N3IC`^=}O>_ZcDG8gORA~B=)
zhSb<)O6wEovzA^K&5o(#nf2#n<}2zMJILZFt6!B0{^0LqQOKs~*@;<v@+;)4zhCF~
zTj?V&(QP#JKGgd^&MuQ*EimVZm@LVXQLJzFL+Uz3J;o$5y}t%L7YCe3GA)Oc6fXvL
z5v+xha}yi&q!Lm!49Glvj8ue%r|a0A2wB6rEi5&JxqPxGOTQv#W;02JY{{)j?-I)w
zGOw%}W?!#vj;IgQ{2B=C65^I-4bJLcw0PWk^F3x+%|>&?)FVFQp`AVgZyRgA49ipt
zD$D?^N6`P5HPSj0eF)wKXA-X@S1u@*e0Z=BML5piDKwcPuv9X4Y6kKiBZIp{9vm?V
zu{D+#Beir(ExQIE9vBy+t*LYz>m9<(#P$XU-*Gx-zT?y>!IKZ34RBMZiq(DGV!ivd
z(oq(pPu$~{+FAeHj{T6LqM9UZbv@SQ(ucy3CngL%NExC-&)<0rm0|!9s;SOo4B4jc
z`Ri?k9}I$7S?=9w%_CNY0ANskuNJxfJTW&04oG>NzqlLTwG&PU+XRr=4C+)r8sW+)
zQb*qjv}3L-{cdkMMu_*T(V<|Kv;*BLW?L1F1ffM1w;NpR@3R(U1e)9cJ7_PfU^|KG
zfx4pNdyc*r;wIo%nqbE61zX%$d8W*#Fah`QeO+=LsNmFliD@AJNmmeUeIX)G2>h9+
z-}FpxcR#fYNfI{>39{0gzwh{QcIsEbqIEC>Gqv7gx-y^OwI$*nR=Q%XVygMd*7ge1
z(IuW0yZZz9lM_4Kko{Sh5HK*K>tBWxK6Sngi9#@uE7V|B=4noTGlAUt&{`l<J@4zd
z1a4c7@b8(^AePYr0GqO2$F*w8&uWS$7`%+S8!{iIbU1FDX5muPYV^k#Q#Nr*PCL3L
z9E!GQS!Z7*qSsG|;+A<Kov6E(F!0)nZ2oh7uG;i?!2anj0ppZWaKc?fmM-Vv1;up`
z)7Q;VO-uW(<wV3jdcZXch==ZTX+hu><bwk_(WtrMfNbg(dvOwk_+kz8{dE3*p~Is~
zPC4fVZ6i9h?$A4Bn7h`a;#NW&6?6R0HpU$ijFUH^-VA)|a5TjWletcW$JZ!1P#zXn
zs7$R#XeBTe)KbdPiPV3oK*&x=-xD<9&A88}FA@e7H|E}+PP1GYS-nM=-e8=tvim)R
znHobM?tgZH9D*^~KGNEPG3j9D02W=OGF6Bq3AMBDsXEth(9c?~{-GWStR^@qVf{tN
zxt<fveV##vHTq)*M>YShMs^EvPg`J6Y`<R3_8RjMByn!eSM01OX$jExuLkTkyp7(w
z(^vc+lVnD^c%r8`m<GxDrn%3c0cgTl*Q(|JsfDm+uOLHC@rd;(iKu3eyO24fOS(pz
zH;Lu&FTssoRkMSG^O|K6i#b77T~gtfDl~5K17dA3xS|4(K2(riau`xU<MKiYj5lNM
zpneq<wDN-pUtn5KDuzR@uQ(~wqli+~=aI*}4vkB5=&0x<<+^&<>RpMIWNKA1Qjegq
zsW;qo?^!DY2+~+b%pbRlb_R&g{PJA-Wob(|?^u7I!KTzXgg<!E&*WEB93zpeF4OK&
zY8O$7ujNyTKK_9<C*uGDdMAfKxTZ1_sj8KvX;YBW?{&vk^A1Na*L}ieFb>px;oz9|
za7Ir|BAomjX=y>5QgC*l60;bcT!8O06Xl;bnL^79_K831WE-fCd6wsLLdxfCVO#=9
z&w9cOR>y(0H78hMPLK3~SL&?`P7@aa0;1-5EfeOq630q}ueVZ5G|yBVU#*<0ta!se
ziN)BcVV~J{@75LH&1(S`43MJnGsW*|I9t1^`PDS-A>D-l+wVVz|9sE5>9jpN>7RV9
z-@G|&`=zK%#)TFC2#$hnvtUscngVf)UV53Q^)X*eM2yoVgUgty301z%-{JO|t6b|y
z44Ivs+;Yf%)GDG2^I$hD(*kKeKgds9Sxk@mn|n3b(lBY{Fcwo~V@ojJ+_WBtjVh~b
z`$%hS@{;Q2jnyP+l=x3){(}Xe7cyp$&+a1}{?u4n+qUqc=-d`i!r6jY7`fH2?J8Lh
zUs27@u!|z=2<ejGUTnB+m<=OSQ2P2YF%ava6-xIdZsBGtgGY&Z6w%B_+MWYJNj|@~
zR@l2TE7f4efCeQhoJPRfUAAgt0WOUsY*-iGU5k2NO93m=s@{nW=>Kd4suqDk(bbS}
zY+Z7$*}GU;d0r;@ZZa#awv+c#x!d-w{|nM`3$W69&k?+caZ0!QovD#Y0Jd;n;@Ak~
zUfso$lNoykJiyvUct52r>NIV>C$w|7eKg_(ki6@2xe=EA$4jiFMjU>OUsy}J3A7+3
z-zAB(8{xV0iRCf=4;6A&=?5Zv-~`Bv`Q}R;--fQ{-mw8FRUx;l002<wU^oJJ(=7?K
zAQCAIj(Gxwr!;Zv0e84PrE@fO2jocG-mIrHQDxrG79p|%j$5!cSkg?tsuPf;0T_8%
zf{po<Hy>mJ`Y$Xw;oxSYX(d9NL^lft0p-D4@vBXLv+y{ty(hLB5%#qMuB)N<Is|bD
zM;*h$Qd9+@_g~OcD&ulM6v;zsD$?31`YwP{BVF;8yih$PH{mwNLeRgrmz_3d8y6Z|
z2^~J%kNH^na1*XzJZgShb-0n&3H5^tG_g!hBo=EGT0Ydk7(b&48Ulg{uIm|ahZ-RZ
zcv&Zi1xWa4oHzSZOlJ37T2)!^9e|cZ+$g9r0vw>|?SZw^LP66bp=D4t#8zJ!KidE0
z{rNo94VAwG?FYZ~>v<ViR97Kdec4OVDq)C*0pz<~aq%R|>e5pv%EXDlO^k2dI<?|9
zK-Q<#7({70g`W(&AaI&Ddef|$+Ey;3!Qun(Sgy2Q7+3)mwXw9Y=0ZjBR7$81OM3&(
ziK*f@+P2)GFZXE!o2da*AK-#9G6HgBhRr!rg`up86x2#dRrOI+P>2x}cQ^-@$nvN7
zy$FX8?1Qna!$>7<0h=e~yUTc83ap4gM>-zX+$(sAV$Qp496=)HGYq7*^SFHv;m^U4
zb~;<=C$eCK1w+U@3~ac651f3NUxER(;V?;(kEkBfiyK~mwsRLp@(|5xA?PnGnp5*q
zxI^&#W2o~<{ID|EyHBdKHRCnV&em)5=t<%N9CZV<t=R;Cg@)Q6>+A+_X8xuIdPM@Z
zo?&o-6-_JeE$R2yLo5m=2(IyyJuq2}0Es<;3aD-+rH3foy9Yh4;#a3f;q(JzTRZP#
z1tB?PSRN3xCli3>$^QUE)xNcJB3zwpGfXuO&=WJ~vM*HqYUe**{SRYlh|oVQKIRP(
zen5cd<j@voI|Sqb<@YCra22MY5FGf3iVUap8OJwp0zaq=$+CnGfY(|`YASH`Iy;PI
z{*a^vtT`y;kiq*nNstvC1%t%$vAUcInkQfSTm3blKX4dfdB)rQ#DF@873?0_zJqAo
z3wNL^*_%ofkjn>TnlCtqc+vxqr2U7J+k&hF8`nz+vBCQstS-ipGK>re$bhw2895OH
zmIS5_7;SApaCxaN{_g5AjxYueHg@#PX?X2-u=uX4SU>Y$mMJ5oTjJe`vjAiFAK;UO
z+4C8X&jWvsbVZ~!;)X~SX4O<50H5A~sFU)d{lA!paR|b76`kz^xI*aK@5|q(&s~6&
z7f-#0GUIW7^bvxAwqi<%N#`38?R_s#1lo7tTZM3S&%x3mvb0ve77o2yKaG9hItP5Q
zcu3U-S{)2;zp=EWU}%DBz*kbnwF<Y+W*Nb<sBj&iOTUA=eSshd2#`p-pINT|%VYd+
zW@+=O;f=1ZM}3cpx(Gq{Ni;U&1WDkhFik&ggsw4(8{UMun<(~;B+v?i?{a*7bj)z;
zXI1j_>a?Xc@Trfzimf92BXv=mj<dNuE~ewmAQns;ueOG{fK-4P;DnGbo={Ib;e8ig
zp7gk!+<OB0_i|iDK@N*YjRDJNXPa?{A1@$gcL@-w-w_-SP3r0w+{k*tXIL_HEdxCM
z%332DP{-}iHooT-RNiQK6aOe`>6jf}4p2=8+qpon+rKGhH|@cBo}?1rK)~~ck?@#2
zf0hCvZN4cCLc-_K(Ao?vK{N*!2vcJY5ulKRG@3vQX04DeITm6w+BP1*S&Gwg!<&pR
z=9VLd@L%>4#}&+K{lda32ofo6Otj!eUM&ePJcj-{p&In&HWvn;Jh;8Ny{r{Jip2?U
zKqFk~%?RR<>EuCY(j8k?BAAmOteM(5`1ihxdfv6>IDs^8RwXEJ^CZluDj}@`1ut+}
zo*-Rug!a$9%oR7}MNLy#U#Or$_plt3sk8u?drhv=@R$WcDuO&{4#$zjvndjkDiT%v
z%cT03GOY!tN?~~l8v%KUxXW2edO4XMhx1rD5|C8+n`N5~+miO^w@bLe63Ld{8X&lk
zaQ7-Qtk1-AYAb>9nIEv_hAsN{!lcZFyuNKngp4<Dn!@pL{6lB^AJ`qPnui^%FB0_o
zaO@}~Ya~~8XdR;H<t%@{M%FhyI0Q-F;&do=n%CLZZ{>bwF(UJ$2$Tq(9Y&4Yu-`$~
zv`-c45Rm(byLk7*oy`w$Inpj159&Xo1a^k-iD*xHV>V=g56xSNB^rwJpw>{4aCkA9
zB&Wg?TIw(inB)QC|0pWRWEYE9YQ~1a4y>M_mIMTX72}ig-J=TA=LocbH&w_G*K;md
zIe9xr1<(&({yYt)!8w^<kp|?mc)MJcBD*bP;7Am{er+CXM-kZ$8Z_Ycf@|I`C*@yU
zTj^8^1I`zIiFqsHlZKqTu(P}vlH&feu`RgHKlUr0@XWzZ#{9s{Au5vJQeX0!I0i;l
zO-H}#6(RGzyZ~kTm9S%896Ql0oxk=5MVsQa%WhU0+U;vuyC@?#hJ;XWpL$rY&BIik
z@HT~~P49NlUBB;bEFS5l6et!nQR@@g#h>K^#{R7K8BXABnU$I>pI~*#G?zZkP}27C
zabSp_Nl#~nK9N_0)o$CQzrfKTV)im*_0(#89vS`U+s~l)y!D@C8Dqh7dVo5FJ8z#m
zYK=%w$n|xSqWL)9)>sYX(qH4t$@n7kgavqjP;f-F7ZOv&FMMjRkq<N~_3(HSotgt2
z`|IxB8AkP$()?|F){m<ge_b%h9VY5<Th82F<M*`D@3Fb)=g1e7kDnzH^%9GK{TuZf
zE$(g}9L^y6g8#Cm(6VHP;k!#UWQ1`q#1s8GGrT1ePtfW2WGKEe4-RE5P&8DooaUjY
z5M;G{A8RpWOS&;$Zq<8G*>j$wEH3<kUiXsZv)U*)QB*C_j#u-+w73T@ajNfhbE0B6
zb0f$e^$1NlXvgLbTb>^no;sy4Y-kwarn&KZPEd80J~8)8p(;n~bk3g8*KzK|M-y{<
zT)R}+yA>wUdQQO<RHigaz>IkbJlV@md(2ecCFj<u!n1Se9wA~pKH@#e$-iCfEhb)Y
z@~e3ZD`{tl;g@h?jpG<svh)V$469O!w4uJ*xG;Wk7fDbp1<sJLbqPQ^|6I*`G(4Z@
z-3E*p>~u%K@|78~3q%7`Ll1{t@2=i#s&H2E=ZxhVg~6sjLugUqE&JtiLOk?vdRQS}
z)yK8t2f?35NZ3`rQGatyk+)&D##z`ax`hhB9sB1m$N%&H|Nj3=<Fn3=<%?KZW&Nrs
z`&%pEf6|Ot+tA0MBMC5pb3!)P3%RptX&TuCvH!#kj&tKTh1TN}T9Du-sI)sl!ObQ`
zZlw?v`B%8!F{71$JO}oJV)XkT*e8?Y#e+!$>=+I+1%b0Y`8w`V5l}Kr$(;XD?D;RM
z!hR-IypJTIP>DKN&6i2$)$G0F&;KtqLENw#MHoyX`$El^Ze?x%Eo<<0ytEF}C|d1r
zUjGLOV`fRQ4;oIP9Mhy%fAg#?m;Yg4enCI~o1p7ah4V6hl7+~~-wb_A@FGg2?jGCh
z7vc{(c6<s~jG9wRWLKZOpqsZHH;)C52=yoX;^_Pb2a7*Kk2{7akkDWUr`8`QfxV0Z
zf0+CJgR#_Xm+Qvqaz_^pE=Z)JS5MphLi@75>GAqKf)#4|n_+`IXDEeoR5f44u;15Y
zz?TkI(X#!bdHO1$56P))4DB7I!ZD}E0?1KKOMozH&mMOu8RdwV%g^Ejx-(sL=;Ga5
zOc~u@Dce;q+#yXenV<F?YCL8ci^HjsDt0_|CxL>TVxiKzpQ6LuEoCM0OKLmfqs_1o
zg?hI}$2_(R;y|yFzrkeo4?`~DZc0I}oGm)PA8)|-noKl^FpdOct@#T03$uS8rjjT<
zH2kjlC52tX@Qe^)-y(1EEI<7EJFfm+?B;MDPU~ZZDkO36+dM*Ax-hqyg3=oO(UJ$l
z5NMnf$;anEjI>i6@x>St_1<9}vn14xoVW0I^Cz|`a+D?Z)ft@ZnOV`w*Bm80kKc@e
zS#v#ey#L4>_MO(`C>>J$5<FGt6B)r9WlRKbtbG96^(za7ONhVkeHGV%&$sa}UP<%z
zi72DKd<Gd>?JU!qi<u0_$X-_j_v!kubS8FY9-y)~%Ph~gH&eJvHnb1WT2wsD|1Cgi
zc@s1o;Z@487G$~<&&wlylqAt|JRhR;z&}Uu4Nnda2GmL4IBBRLGpF?ymV!tB5>~5+
znBe$#s3gN@>I5-ZY9lhDGT@kA)_cWr9qEfTsI#%~lIn~~1-*0WbZ<)E-m2vA{?!5+
ziKcIrt&o>)XMkU5du**waLUO!jExjWtyDofEgE;7zt4CH%i8)+-w|GW=32Pdskirh
zwM*@s@U%6)?#vTtU1JZRil*bWKQ@s}aL_Ww>gGUr5K$P?h?g+Yi;$_mOYuycI!a18
zCf>(nxBUdgJ-DJ75jz88S+0MC-R*(Q`Vi^<ha8M9`t(pJE4KnBGaq|q{6_^uCPM+0
zrI=E{|3zV4U)a!_-zY%&+NnoM(g@kqTLY$E!&)CZPA0@)*)~p_{;rJzGX;M=e_jh*
zp=+wPMjP0lN`plmVBH%=w){B~3ZZX`rB)xbPMtLRM@J5#4w!b!yV2NbCor%0fpL^x
zr$P0h+FoAE7@w8L&bJKAH@OZv9&2rW;=?Sw5`;+s;!nK!Z_881@YV#~z<RQ^rtq>v
zduL5#=x0F{kc1*A$LZNJQ4460+%e{OyM2Io--vy|x%%=uYb&-Bc&CRHQwRfR)qo79
z#mLTmkWu4ght;Cd`maL!tTq8>D3mk+nQ%Q-`cLRpR2$AU@9C`@JDjJGiFl51ATYN`
zULtl9h>{>Y@P>FS=G>iN*2}g+Y}ia06iDJ#tS}&Yio;0v&GdXog9cOgHijFni&U6w
zCR5Q@yM4Q5D!~BaK-%l9(CH5ecZ{HG*Y&*l)1+U}^(^9B@}p0tDU@3c2-~r8x>Ht@
zeqQYO9u09L<PakD6SIJzd*}QV{?f|T-mV;R_c$@QAJ&@6C&RBnBT?a27!YC;Ucw%+
zVBM5I40~$?4XFA2Xg>1O+^@k-R1EkH?6xzPAoOEUdOb&ozd{&U|5khbNy*)BPUl0y
zE%}@w4JYNDp=$RcZCJ+^2<1-_Rpof<-{eA&9ef-7<ChvLGJvP0Q}Xl=!+*VVc86KX
zUDAyC$y$bmkgp7R-^llPoJLMVHF>zaGK`L;Mn$fBq&ps_?6}SggTsoG5NrnujLu2?
zh-{|H^4tYb5#&90N9C=jsu=@={Dgos)j5J$@ZY=NWmpl?UO#toqIs@8j*D9Biha>6
zMk1oG9bzywP&xS=P$R2CWluUE@pW&He}ljRQ$sc<{Y+d8%-B;`FZzUc#Q9!d`W-<A
z`W*ed27TzHQu@G*ojvLm$p=bq!sxqv1X?7KFM^cAUhPLKf%hv2F{8tUImihGYZ@xw
zq$7_X&vi^uS&1gqzOuaR7fDQ>Ag-qsdF1h``$J@TO_(o(<a`6LH*3Fvc}iLlP6OwA
z5fvo*08a*!Xc7;`98uzIAprMdiN9jG$r8evi6>u~cWq-nr4U4vSs*|n`uZDUk2{8A
zXXR;VolBo2vhcEaJQ2znzff}Z`d#mG<!j1WBy~hrTW;><sSwlV|A#jH{;akVzso0L
z&XB=?D?K+;MW3OdqY;FRwep~iJR2-GB_cOA1)L%Kg_M6Siqq~x;Pc{{GW~{DTQF(C
zcgw2xHz0ZOT3k<w=vv|&bSS)@zcz6`7U8X2hkgBvHQ75l@#M27twPvkE(NZC`>hMI
zrk`=^-D*LGp|G6LUKQ_7>R7>-_DU1W4Jt)HRS<ppm+*WN)pq`SifOghs6g-zTL_7E
z=>V|My;IKfQmGS7d?x8v!QjcKCN0?A=pDo8LaIs?Lbkg$TaZPdN7Y3nEudwCzI&U}
z>-D3dPp`yZB{Kx__$rg4=Papm`5c8m!dT}+$3F9i_(0wOBR(o%n(0AcEm$c$qFSU~
zlIJ3F67`eJoQPKODto|h{+<42^_-dIqm`|Pe6V@`tKD(k5Hx4#Sv*c;7d0n#{l$A~
z>02OVt%h)7&M7nso_^2CQ1Enqs63GpZ1FlQ+*%4LdpJ|{k~s<Fu3oVa^ldDs%i0nE
zxFTpcqMt{8I3R!jqqj;^rYidmPkVI^a9O{?%A18uj-o>Hl1>4OA3gg0H(Y7PzG-!c
zllX8I`PeOpKv~@5$wj>;h!4m?x8%Wtiy37An#D`szsfa7O*1w>6XiG3-yg$?-Pkxz
z58+(1&muSC>7}3E>P#sOcno833z|2xFdBctI}}G{-)`W+`4B2|!8t=t;1_#GiuVw@
z?X<lFJ=~VP503Kt6c9lJwwu9MLqirtOlrQG@`&}z^F&YDw+bFiBR1+Gki|R4egngR
z5$VlOfAs6))mRUSQF(Q+86?5@<>HS`9Nc0U;7~;)?@x>y#i!7Fo>S4t1}%|nqZD}-
zUk?s=E8d*vw4Uk~d62pK0MnIH8jLhmDftn1e~PE%K(TgWE?Xg}B;D?`vAu8b3tS&V
z1sKPdl^v39ovbxYGCQsFy65CGdQJL3H2t_V1Y4+ghU7DospjvYjPrqDCe}F^U~Htb
z=UXcgX-J;CgQ@msK6|bIn*1l{EFo_TI21Alb1kJI(}b>zDDX(@V<0K`KF3f|1UaYh
z@LE<lZ4#_~vV<FMjEB6xnec$wyMxhv<kbo9-;n#u7|5`cLLMEMApgQ&ik65luP^Ww
zdA0DA6-f<*p4K>bhaavmeHG5&pOHVK*@Rw1wh*L#RVSnR0F`-8@#+<A)3XWY<J>qR
znH!QD2$((=W#bcBcEl)StyYCE>;LxN-d8=GJ-tnW0ayaWFjVLd6-?j;oNl}l%6m~-
zF~wqp>NWWNfIFL7zO=HuD)AwRHSz2<7xQib#XjBv42yt7WVKEmq||0%Ep@-owSo;J
zvR3O$1mpBy<T`7)y&{!JQt}ls`R<2E43MYWS|>sqbeIPC)cqmlf{J{Qa9{dd@SSA&
z=IiM@9q7{OcFBnCk1V`D=fV`gpYe7)Uwly^HQ3|c=S)@|(Y_4fX;Zo7o_T9AWYt!}
z3sy>#B8lU&#ryhw8e0FFPRz^*%1My8F_?=nM(jc_*+#3|SxLn+?+saG+~XcMGW%1o
z>q2tT!T+}k>;atD*X`0Rn2CrozGaWm=wf~K!+T&&M0tyB?3qIoj52bKZZ0bXk+Tgj
z!jRqOiHT9Uiy|g7U;S*WN|PURQ93nKF(v+~pE{CYvzAM4*_%#Hb)<?>^JTCPTP<fu
zbj2T6y77qvRD2=78C*O;k_ZU}45<g(fbA$vcFZ~H)a-fA@u(pKJ*=0NRLg24AA4KA
zuiU8MP_9~3Q@<ibF~a8kBH)+OA;%@365SU83coK&LU~7H1b*@Qn$}X?>4ImE?drNC
zA@XhY`A7%*ycY=?5Ey6`!TM?c@ksVrtpKR2W3&hSK9oNu8D>AqH#;Bl2CGCUNn&1}
zHB?!cw>l<dOZlTLmg>>V?>bQdC%i#)PU<bJ5_*(liqyEOM$Ey9L1!2KZd^o_0%rHp
z_udbYF2g4P=srA|^J>wCRmM;d=H_?ty9CH1)@Ia2@TbTnQ*iJk<i1OWX`xjjGN*`|
z!pqV)S$bJrrR1XB3x;V&dH0hkpjUF80XCKJ`;ywY47B-6cIoZLSrQRAf~iQJj`p^<
zw2}M^TKoE2YoYtEw<l?jmZgK$g+}TpbJQmSxaf%%!ZC`5&*0-H`2*<)<1n?UcIj=l
z;V?^e6>WR>N)CFDrJ}DY31ArN!Z10mK_#|=m~>W#fa?IJu~@%WLVKMLTF`r6%>!h8
zl9%n$ZJ3F4t4k=%-z&3Yy+L)_@hXl(J-)~3AP;{!8}H7H%_(;zBo6vGbf#=z#>pOq
zLjj=z1??@^K>BcY&&$TZL>&>tdWGxcXy{`lvOvP~Ti=Jsr(q!_GOen12}^eL4Dom0
zst}`0z&e8q`l8|?zHD6!UP~SD80O6jw2_D?i(%oVpW_Y04m%~WIZ@OUJy%=LgFxS0
z*S7|Y3>M`bEse=g2LLrxsD=z?vq~GTLBiB$Ri)${+Z?HL4#s|j=`U4t{&AWpWa@Al
znUp?$u(CHAC~wdU@|NwR%h`#&Ed5UFTS<#?{?AwH#*m#t3cHqV%w*n#cn+6B*(*3~
z!>8BY`O?uZi3eh1vgbszCTjAx7+Pds*T2P+Khw8=UXm6>HRM&=a|&^G$+UR8(^>>>
z+mPzC*EnJl)<(SkRxid)zxwTg^1wb`pw=_TvB|t0GK!Vu$>vx4cS7B9%yWlKrz|Kq
zQe$v%9Uw#Y`+n~RFzc^#UCG=tEK*=58j%TN82PP+RTJZ<Lk1I5*YxY<<1&>kZDV@G
zIBO9#uUA$om+0~@dq4aVQJUh<!kcJe;EY6bCsFFi>G||1SLowYkE;QPa0W9XPQQ}#
zJrptVhx4)+jXYb*KpahXB_3Z>V=7rkWO*8@pd3%i9S+j6R+*oDeMMT1Cvz!<2S)~x
zYLLFc!r*Om{;N)K$;}I;i=~w=%tS~0zNT@Dd1{E+izh22TPSruP?Pt(yCRw~JnIbQ
zA%x$Gs7Wj88g@k`kpCtODvBo4wo;gUeovJJIh|k|G{WN@WW8C9+d^h#!&As`MASgW
z{N<6`JNoPYhBMY$hGS6)+c9R*O7$cO7X`A1ktFU3?>9K*%J_$z|6l>Gs7I1@k+F*K
z#k3LqQy&rp%^p&liHe1iRa>vg@#>S7X}|ce?KO0VRE2GjRUF^Wh;qry;I>*67*`6|
z_RDUf*4u_ZP-xL~-W(rW_XG$C5QD;HCvAS8QLoKiRpL(KRsFSdW|R*VP@d*~H<*Bf
zdZ#lnnAJj1kgo&LK-KHoM~Z*{NQUB_E<Z1`Ad!}K+~*mTOTO|2kySAr5?9`uFV{P=
z@4qOqY55DTEZcmik0|B;t+r;ETZ@^HtEPr{N)S==2g!A4!|VEcKWpBTcxG3$hbZZN
zEZZZDdjp%Gs|`j5M}t(0kDP^!mPUc{sKalN|8i=@V`&QA<RJW1$;>;?=LCI`uJki}
z+i<f;G$tGO%F;J@v)esIt={vrHE;OAH+lw(?!XT42jP`c<6snt;_e@SalSmHj*)oz
z(8n+LmsTE1iPM>zJaC@$mp+$jlX~k^2{NYn`rc2u7U^CX*k-=B7f`jy=nA#G{qO1P
zeG>1&1ZCdIH3$7-_l*N%yY3ydE>`@qpEEy`>q@_VSE#KP^#)OzZs2@W{jVtL&#wp>
zs094m2>;c7@n%Hr;1Z3y^L5Q_UZGhi-*DiALAfAL3u?ss-)a;uCo0;v<b7<}T9Kw_
z2vT($3Qhm&T-#D(-aJ3F>DpH84fuvQKw3X2x5+ZaCimB(xyl4>s~`8|oEF$Ivbh${
zSN;9l4?g1xL}?$fYnTjMFbW5#RL-2A8h6NXBTLA*+R8JYy3zI3F=&`$mo%%)!^D1A
zhF)&hoMnIkyZ(?z|G~EZ`=aCjxenF3mU>F9jpMif`>+3Vht0pQK)#MJEf~jYG1xrD
z-%Uau`lkSe-R%B*mm+v|JS%53uq831`G0@le{P(?_Ui9H8&1N35+IO2hqcUq8*>u%
zPX+IPUV*%e7~F5gN^kZ5XTtoS`shEO`}@0bWiZ&27qeaer{T!w#s9-Y{_h8B>mgJk
zwl(b22}5MD!$#ofkt`A5cw^x9xQ!U8GEPN<DULZsPj=qdAH&{u@lOx0BU(r4391h@
zE;T<7!i1-hTV~V$&?@0<tE|2fSARq%dC9w#Cjq&U=AT9y21Dn$OA~lL5`8!?wkno*
z|3}yP51)eqZwC;(I~k|?->AY_4;|q}ze`5r(9Jf>dR@4>|AEp)d20<L*opV#CDH1X
zBkLJvo4G*{F@y-=-+}bbg?(2~zPw`yC-AD=2=Ad+(uBD1f@aP#cRE3`B{~XykC5nf
z_M&Ee$p3uJ9JX8w@0n}!gj!qX;ghG_^}o1Ng<kcg(VL5X9)%7rCw%QNn3@X?^FJRF
z*MUO`E>M`)MKBpfR+}Pp={HdfsPN2}G<v*B_M<V5eoWIpq=X#1Qgg8-+T$6i6^5c=
zbpfbg&W3yJZEmQXxh=<)Ez4OR@TcDVmFIA8A2w%%{Z2ZR`$6P#Kx<1>)-P%RXK-lf
zzDfHChCM&?M}2DkE@(@@OUd7e_X*ms$@s=}wv&zJVfs=xe)WIi*Ha&iq1w3b>?u0!
zra#|b<@vNqA^`US)OvYFN-sOMtP;|BD8|A$8%KR3mow5!i{<4*kA7PGj`dff(R#<4
zs{7;Xv&M1CTRU7{$>R>;+#98dWUti@b}9*Ews|)6Q#}Db5g^(XFMjxAcseS6a+ZKz
zQqT(XS=T%%4ZSL`T}k8eg+@_839F|}?p<3>)=6_x{;_nisP1F*n+Lw9K|j>ZIeJNu
zWF^PT+2+O!+a+0AX_4LSm-2=@Gt{XI$e4Zet5uQ+$0VKB%(U(M<u59#(;HnN%ppuu
zQee)i{HpVI;)gr>1CwT3+cylv`!-NwH05AAvmCs&tJ2KfeawVC56^tBJ;l*2m@!Kt
zNo@P))FN&mb|%?C4wuR_I<djcqVJ2_%9_E-ZMSH*oZTVJ<C)LWD`t9rgbkgP4I91X
zxRf)|d~+qCg4#5X`t*NqBtm__TEb&$Yox?VbDm<#)<2`Im~p+%M|?abfDm0yiOfu}
zda^J@`uy2F<NOG*sbBPTtA-5KYHZ^IqIh>Fo){MBdz@ieLq%0rH)m#P7VoioWD29-
z2ho~m*QB+I9OsfBetUPHk#^<Tt=Wv%4<>eXX>Trfu1s(q4!bhGT9Cj7aLVVbS#h=3
zpXu`r-Siq?nE&kgAw=V^E2~Ea*B&sxRBbroJfr6E^ed$__1hqg%vg&~kK!1sQivXk
zqwg#@+VoT0a^JQyR=@~Jd9O<3>$%gi4D?X$5$Naxa*6i3i1+@QRxkTl=dSBLwoqJc
z{%;>#{ok~Hh!Az{6+}g;zb|J$FN1NyWB1x@-J9p}6xHEKn;zkWT}99G?!tX=G++M!
zm=mc*f|ggDieo=FA5&%U^>~#$Wil9{?0be(X%c~WFEvD_N;7Su+`#RZ)u-6<?|c!u
ziBdO`Osq8;ixqO5kS{jt<ISA>^*W<OhwmmgPgQ2|ng#;yNfr1!F!Mxu*pq&O(&wC0
zpcJNetM~Q(^ZLT^`fGK031C2S8irIxazAo^P88nkt~IMWtZ?AAV}8kKBi8im8fPxC
z%9YICf%`r-HDx@l=|`e5lW?ytOQ~yShu#Z18<hv^I4{}1ZMK)oC#fX)jn+5bf>*|s
zi;GHp%wyqc@EdV=s4cJC`Jj6EsoQ+AeRE1QV~J!>**;}dbXPNxD>u+vp620@bx`?U
zIc!Yxk<{NjR%`4uMlk~dr%qX_N&nVM&a0Ku>UQh-gb4TYGb&x7DqQ%L>&_IH;cL)@
z8<FwQek}j=?uC1E84JbH7HhY@`s~`2^mrycV@TOFa7>q52^m6VpL4vTp2YTTW;jIp
zbsbSs2<#s}NzH`>8Sw8-dR!aBOL)P8im=~`PKl`|e-lM^`jP6)q(f4!^Pu+TlepS+
zG+xFQ1)P$@?$N#)&l$be+WoqwZAXaBR7Fs-IHGX>sA=q{uMe?TeQCUF|46yU$;zHN
z?Su%$^K*Wid7?q>Ifa+TUnR5SNSRlOVqnUjqsx-c7wyMydH=M*W3Jzbn~Lev(r{TF
zDzZ`Sr=b0qlP=m)`7h~&N*K`$=084Um9%c4!6KS9!~v3B3({P!%1*BmN7C)pD{$ce
z6qs>BwmVlcEvFZ9_V42+W6^$pQCSwFZ4H8qH~*SG({j#XC;oj<$&<k8OvvmNxN}GA
z#wO_)fgIzmM2{^k2E$GguOzD!eHW)1AdF^MJq^IU%{Fg5avKI}HOM__mLKwrKZ>%Q
zYInQl$^{~RwcRM2mwz-xM)>rO+Kc1gB{t)o;rym<S2#Ds*oMO>YJ0I#bwd}ZDxJ0S
zW$wdWbk)&Xvh!t4TBkt@;NAm7?&V|JHsR|lAitBOZZ(KMz7taA;@ppeWWu~{P;KrL
z#mxVQr?(7?>U+b5f8r>E<N(q!LrRHs_W(n83kXPeskDSJGz<+QH6Yz0jUduUcO#wB
zjqm3Fo^w9Jb-~PDYp-WLamTQYH+5FG#}DEsy11$lZ9XgC_saQH+A6D^6ueus0Gk<p
z=j#Ha@-G4_@%;7X6~@cB&L3m|0Oo>Yx1YpG+R)pjBkH9QBd_GEP#OJEBc6I-3>1#V
zd-qyLpd)YF)0al6G`Pr#KJcYXf+a>mK~_r$V|2RfDKi#AZZgDCE)spubo-#@&;+QF
zNmM86&Z}UH?%S&MtBReCf5T*0r{wn=BT05>?5+oV1uTfRSO);ZCZ@9?3yWRaJ`tS2
z<Bm#|Ng@`1rig#Pr~cUzrt{v-GqRJ*{oBeT#z2W{`_yk#w<85s*l!->rS_Ns-e!%O
z-;UPIdq4=r>JT6Y)c`^qbzzgPy)4h*OJ0CfK;6+tH0vvtmxT2_i@cv%&XqlZ9?kPW
z%Z@1Dl(ga-z6R9=n*^Pmm2V(T5}T%s-_;_U?3Bs8SZ_3)zE_PKOLG5BHe@2(26RRq
zWM^XiPxEHv#gF6n9_($+BgfbP_Qg!*fydXqP9n8|7{KK{Fw(E)!~dX=Q_UNE^7qS+
z@IOeNN9Z-1^RDX+4?dfJ5NUZ(aXWkgFI9Qa{&BAPbkntPa#iEGR4|QYO)!`bU?LsV
zvky+R$A0bn5k7i<klXhD&&hPIi)Ap5lggWF49BmV>_l(*ujbeLi;muNJ?fR5t2~Gn
z<veH2>3Puj+G`l{b0W#<Aq8X`0On^gW(S>ygXcA%vt+~1{YO!+aa8i`zCs^BJTKHd
zpj?f}EgA<OG=`*7j8^zLU3wW)27=lFu6?A)Goq&HPA7gD;dI}Gw3I2zi#S771E7gz
z^zi+BWBS6DhS(z2(TBQjei(P)S_5#~_IO$&K#X0^&zfzr`(JX?@Wn*$LXPIUF(Ywr
z<Q<^E#nuGK>V0#>9<wYteQ=xe0h|Lxrq9G4Xd_n6zJDmc>+C-8)oTC&mP6yZ;@@h@
zZx-?R+y4SCYQjwqL`wbmMu9!^_D_t#j&e_c=Khnw#^L(j%|fQ#1m1?<&nEtA0xcjU
z%RsF6c%D^O@1G@!yD^DO;|JN6mR3i(G+r#_9dVzl<_Vv%cixX5DSZ9B&#6lq#Y=rX
zg9Z%&_S68RcqYry&2C@lK~pKSW%&tbj}ss%HT9+GfXM&0J3#d2L37RVyBJ`9T1|Ty
zNNZ!9_#iejU~*QhX9@Vkuay3V5E=udEeF3AaTCE%?sJX9mlgjLCz$~ZUjOwewF25F
zpKoej_E+wF+gU8_mr}S5@Qt*)-zz>g0f6KMAd*iMDjhgEJq3op{j>;?8oO2eZ(I|d
zKuix}g907H9!*OxUN0oBGd&zdO52TmCjhw~Q8Jg1(F&Quy?b4t{ji5MVdAGb{mtl+
z22S4<Y@Y_Fyo@0smPr+_2cfV1uYitSQ;pHm^<>%08&pa&gU{b!+UEUl%$XjVBy;U1
zfJB9p=U}-hZz%UW<4y7M3pBGB3m`W$#>z=OR%h-psvpDTJh@&8jGB`tbyP7x(%F%3
z81U7L;Z3(F#YhOkc)7IZcY2w<z9`KRQvNMqxpu;@I_3e`Qe{WVrHuRC6DB$|H*Ass
zgO80mTZ)L&s(QUBUw{OkLB%D!8bYL+TE;`z^y4$2TT)|eMnXO0yTMcQAl{|&vMt$t
zm2>G*ss2MV05u*2d@+6Y>i<1>{mo=?JQMa`llMoDF88YWyPZEo4ErLj1(h!Ku0~;;
zl})bk6F}R~)>0$1{;2{{&4;{20MwbFWxe^~kCXB`9Jr1c)Jdb*uY-yXPEyt1%0%zf
zC6FS|@E@pG{zhUJj^8hR`D!iFe!N}WLNQI*RFjMjI~2U80f3u^-<<+0m}DgMSnT!?
z?hkj*IU(2G8Xyy<`mSki!Ku69AD|T`N*P3(r@lcgW0(co`KuRjrZ*Qsktc5+2nhO}
z*UUUT;oj}Z(9r#M*1heVA^U?|k~Ox+7r^$l#yKC*l;msbdXT5t1H>7PEhhwocq{d@
zxe}}Di13y}ibv$VssJ#5tffGGJML5UzIU=cHz(W`b@V^~=k1y894I@6A{Db4FH<30
zKff?uD%-p+!WR9%NPkD1?+t^F)%vuqTk>Puf{`7l+qBB!^48B3a`*tEbQ({_iyjc~
zZ2O<{UB7wYsH}V`^YIFPfM8R{^YY5S&14FMT1v(5E|H|IKw$1S|DK^jlXW#<Z6y-j
zxs{^xKxw9ypVTW#Z`Eu*U%KC4`u$nJD=H@{<J8vs5bzQ9KL@DSz%je+F05;MIb}8z
z&+u!?=a18D@MrmtlC&6+tqz97QR8JNlNTx4e)>4u>0HkQS|ua5CtW*#I#1Xvz>*9M
z04gKjtQY$u1&y-ao;bla75&wd$&G94c>~GO;D802`1*C>Fio)yuGn5xiM!KJe9^y9
z<{Z%l_?KA#(Vzwp45KZ<fvy@Jxp)n5b)RP63@XWy)BP8|AMxN1Inz;Oo#prL{L4kN
zwbO?hhd{uSQNZ#+i!Zi$FZO;bw)#PAC8_COCvOG-HQKhov7E2fSh%YQxZ54VCAPp`
z=FfYzbI|ZRi?lxgNN#5kMYqwd3PR1U0vc|3fOF%s0c@cSzc0lZ(lrM|=8^0#>zDu4
zyr)r5AfA}3zz<=(>jkr;KnzO6Ajofsy1t|?H@gt>t*)D4tITf&Hv*uvU`K@hGdM=Y
zv(pJRyzK@1Zch20X==HbG<g?TH~L&Jx-SMFc<AYg8T06z&HH5CRC^uwG7kdF|D{w6
z5X31m@LEqwsO>4nYFGO6H!pFUXQZKJ5tBryt+wnQ_6k7T*X@smzxf&`oyno(&HztF
zFMPczbF&H|WO%arWj&`dc86Ig=v@!+8q-p(fAKj;OJp}T4`+0vy<YUc*@wSXIIbOI
z+U9Sm#^!7ucBt-UYOxcyO#vsCRJ6`213`vk=n5co;3O%liXL?)SLuC%=G1s@S*V$s
zq(l!1@0NxpaxWpPz3;C!n;cqh=QyM~I<TdJezEWQ4|%S<{AX;3PaG&EpG{=WpyKjv
z&jxb3Je}cpwVv|SJaB1I9scr=WJtKEJx?yWFX}Ic{mV*xhU3w1{hV#{r7h>PmYxb(
zMj9YwN2XKb_N+H%k(_6SR8^{260JclT)}g(6?n?#NLE7nclt1u2!OL8uka}aNCEQL
zK!D^EOATN<9O1Z(Yw#xUNu!-<V;VT#GTESD_}$FMzhQNt=>fi}*H#5-R(-x*K8GzS
z@N#ADUIr@w>1GA2Lx1zvCDQSn*t;%<T`$!8V4um??t>w`B~3G@5(+<Fo%&djy0MJH
z!%>Yx;Qcl_bup0iNIfcnh0PG{pb7GjxLkgGGPiVJXR!&v?2*A28&}`A$PH&I9>=Et
zMJAtNf9kp9cTuJ-@r`Gl9;#DW)zZ-*;4oczEMnk-Z{AC2B=u_d+Fy7UJ2aPnkVR^A
zrKHa=YQd^My$4432r?>9<HaB}Olk)B{FcrDzSMK+=4NjL*=Gsj7XXDx5vSxMEk-05
zf+Xp*$2`Im!p;VC%;uxWxahDp8I2=GzfpO>F#GsD+z~m_N*lmlbHhCdTDX1@Yn|rX
z=Mbj@M6iB`ma|E}ZJ8N-SnV9MjDrs5HItW3(hHWGvh9=Sf$fr#bawg0+y;2X)#s-7
zMjzIb^iXHtjEs3_e}%h`$dWQ9JuNZE7ybLuz^*9Yu(9UBmpvF?ygu)x|H+`h2H?)j
zpJL)&<n{0FVz4Kke$Xt8S~snQ0>ZQBeAuu_!*jD&laUuA+{fOuS4|!L%~l|A!o+8v
z5$^^#8|CP{s-E|Wvi^VBzkB?huCnlK(^<ntIw)bG5ZQZ{T-S8xv(-wSkQfD4q_b=a
zi|psf*9QMyB{9}_;=_u#k&T3(xK0Lmm|^LF-@F`QJEYnPOz58&E4YgM?Ckf7vu!1m
zk}#-K0L<X=7RqRqW|<s<__OfWIy}Newoh_L<38F8M(+JeWUo?&j&NQH8^VBI_^6=8
zUK=;}O9XAKnIxm2wU`j21tB8_nUC65pPT6*NnDrYHFA&cMOrt~?FqZ(o-4cc>^JU~
zZ(RO~JuLJAY{Wm$IDTI+JXo;Ga2x>a-Dfm>WEbJuFGsp;mKx;Xo`#A(@qUHwMmaTD
z78Ic9@1loHk?gLSss=Y)l<OIrSKf|fj~Eej*7C_I9nJ-Nk1=vSCl;TT>46?yQ~KA7
zQ^3HX*K%tZiktI)IA5I`7B7En@E}NbGv4<L$-ZADZ9|aVu~jrztH@)x{i<l85qk3U
zfcrRGVAdutrt3+d^crqP2%yx+d}ZW&^lR?a(43N1!r%AU#N5-Kpik_$hu(dALvk&L
z7K6N-4re8pZ1+KPD%r^`lel%@bI5>wZa%dkvP&};9bO~dT?!@+lhhSfanO-uajdL2
z*>Dd3c?!5Ej}5Q})kc|M#xRchKpbViu3hg|;2$K^hw%{&ey@&3Ji9(fPedttY#9OE
zXm}`Lnc@T-WXqiTCMek_Tn@(iO8`uWL~-wLWH_tlI{DE9l&lVIEfpQB3)X)QEW$~u
z_~}m-7D*vW$-^(nt_vW|UJ$i56D$;t=m~msSbL@uN&TBK<+VBgyvxtJk+Wj>HV`CJ
zLL?gB0qU@+9N*`w*O$bvx9Edk_5nW-@+Dk#xD%Yq2pB}Mz%bjTcb>!(uy;{S?yV$y
zfRFu*OFC95_*_=#5RCjX=%-TRo+Y$0>Ce$}8eOY$6Itsirs8@D_~Msur0w#<#}#mo
zoz%q6w0Qt+hyro^5zY;!Da9c&F>epIAK#NOkPawKF9hQakho^qI|O^UCSL?qDZ;~F
z&Ia>hd$31H9)Z!2O3~=0FX~BU#j=t0TX9SnUar5M>!E`}bt!M2v&cS{4lGpE)RDGd
zTX?@@?Cp&i-@Di5E5o@yrY6A$*OR;q>H1i+@o-hR=>r*~u+;ptM0xF~;xDz%9+I)o
z9&oK1{PYj2;q*rKi<UX8fx+s(3XtE)l@e`sR3J8CbLv41A5~1u(R&^g0b~|kQMUbe
z44$7fTU%arh*kjMJh|Ws)r8H2siXg1n(HKejd2Cq-yw2$Iu<#oz%F5yL4_=NllVjA
zPZZBU+4seC2P!BQG>O5}xet@ptp_Hvu<>f>@~^U|g+pU1n=JXZP!#saESW`K)x2B(
zI+MJ^9Q;MT4o?usJTEEfV(_Wwe7!3cm-LK{t*Y3?yt@^ksvSc<DQQY%*vujH`RqzE
zI4@WWL>y=Xsz%0_D)Q<?k{n_FzQ&V^vnpLL{b!Kw0+)D5u!yI)XFmWOZzY9)AwRqT
zMe;#_`#tO`9Q|vvyy<`<LfXs}G(*6RO!l`$UHJh;D(ApXNd=9^X_ekj+d?LK0>XA?
z^Up|@Z-)d(n9^v^pW=UVx>v?*QgqFk&5`ePa^>c89IAeiYyd=GdvfZ?Z?qa3oohH=
z^RxlCt~JBb*tlZSia&c#e4AgvvpUlgiIfa1=rkL-G(+_NGZ-MJ6{_=3f4WaZUUhdq
zM~QfhME)W%DqXL7&?+?AVh4>z3oK3prx5GslQ{Z|MV|`6#A$Ii9!f~Fgv03cR`cHu
zY-%EsTKUsa;H+g~anUe#5gfmW?~)-~0mLk94w_F?tHkKu(0Fgbj%t96IM%A9tgde_
ztmWetK`y+9^ukjvo#vV)kc?2HmUYh{OTLY)MIE9JdWsiV>!*t(D{`wv{tMFoz5y_G
z$1g&)LYDfc3zd3&=q$s@j-q=si-Qq9@pMf1zY77v;#Bid7iDkjCth*{IeamaI|Z7f
zqUqp*r{Ovx(8ZchPCSuP`filY=L6|p(fC`CG==kLHNmfi-{|9H6VYb6zj*AZ7pn*v
zej8DM%scVqSr7Zo8t0IYZ8eFKI&$|ZFcsaL9j|}RIk!1lrJ9A7w-K=?e$X&iW|g8E
zl?}HYGWp~4NxG3qv_B#`2=i`bI94O)$pT*Qw9E9+G#FwA;m!)_o9k9@GWEZ*%%vI%
zje2eK#eG6YFPxI6Ejrc3wAn0#NQr{?jVyu-_Wj-cN75xVg=QUA>?s8no8vGT65T`3
zh*(QQJ?B83d3`y|G${eUzz7bML<B>58-<<z3M_bxF%(O4f9Ca#nM9Zb7ecG0=@jrJ
zZe41Co!fNeKJh+xgF}^u2AYLdF*Ml-4&X-0!lGKzsauBNM1dt=9tG2@2lY@zxCe?~
zl`90e*@ayvhXrzVMV^0+{#bIJ5&|atTu5MUw+;4?PPQp_zc<JS_5SNtrjy^x{d{iV
z-*e-RnU63~f8DPrzlF92JLcebU~Eu?8TAbtJXXr6BlgClO9k8sTt20iE(IL)6DhKK
zxe!P--5&RgknaqOF@JWER!UbzG><VC5y&pQ3C6|o>V-#yc<2LVlY&eiW;_l&GbF@e
zn}GGOo622z?t}Ra88!%xrJ>+YkrWW1yH4kG$68WK=ly9oAuv0xnCz3LDS@%)&xCM<
zfyan!uq~g>6hX^Rt1i;OKMRrw1<Lk<qLS|4xaIv3B)!+7Lp7&{vXYQOW(=>+gI!{d
z)wbxkR?-{N9-qkzC-${BFvFy?%B>fuTuoPUiCwlE(7B2rV_{TEKK&%Fu1-BECM<@D
zE+(T~g^;j)Q%t9bRqOeNiY~1%OmPR0QC}G@8KP8+Lp#ntGJdua+)(Z<#0Y+B7v0n)
zHOrYXx!>6#6U5vpa135}#+xJawIw!~i-l!OnP*U43;mM{uqkkK?6xaHfUtsWtCBRO
zrwJC8bvv+RNiRX+X9po5kNp!U;>Tl1HG6DywD;c|in;F83w#!gm1Mc`VZ01+c%F3n
zt8)W1fx<*M(rMLT^bUqkGEKC-%DQ|AM32xynkD3-6ey<B${tMzsd`+*4Ri^ZbYi8w
zhPdLnxIW<R@EMHI8D!zJz7&25tWunUHdcz%^p9AlgiWshhXoMg7TUYgeM@|3`!zEe
z{r9?yf-0f<XS%S%^edj3w+WW2YMQi&v;9u~CEt?}g7)~QQp<WJ+vq{zg<y+M5dm-l
zi`}yCCe{C>hN8f9pZRp5t)_p>IpulzZ72BYFuxKXhWS~KfHFj0(m6^0;n316FrB_w
z70M)F^c3tq$0!kX<WGk7&X!Jb*>(CIq3(mU-8hEc9#yP|dB)3<*op|ctNQ~?^HBe{
z2GD9zka$i<O?@oy^A1wKlf0*BS7<R-;4;O?<HUdaT?{G&{FMF%mRV4#(&)eKrG;_K
zUp~-<KRe=kEA5x8sce3e8v6Ko?7IcesW+*WEu^cD@QGm6nqIFj$Uoqx^1Q2AmY_w2
z3_fOf8uBD3wYdT)$P^fCZiR4HU|5}m9oSqlwo7p%-}6&U8aXKXOw_ix8Bz>|F<_%<
zPsmGdoJ(2*fJH{iLz(Q*zeniA3FG$QvX}2VA@U6IRkGwa^yh@&%%)@#mMDL)Ds!$+
z+0WoYBH*p4SSCmWM;X7HVKfLnF-(`!U##2l_A?azCc<=H3@B&%<OYNWNAcR<UoMsZ
zsidSLDEXS3MV#q4XDJ@HQ@vj|_wJ-{xg`g&oIAuUw(W~cF({L~yalMHQQ(vv8apsZ
z5zc07xqiap0aJbeKO4pIWh)-w4Eaz0Dw1bNMty8zu*VQ8&B9tM8^B_{i|zH$@30D^
zeuud0kmUU_-$X*`hnjq4NY$#({=psmUS%|Jg#wG`LVWaNbst9Klnnm>pqb1u6tiz(
z^LeY!p=OZ3NQ3^vBd8qI|1bQPKEj26!(^MHEByO!LxcoqFo7MH3c=!*EkhD9xm|l0
z>}zThUuS~7eIoUj3jbJSqKXB>3(`X;|MW=Y@ay*=@?CM3Oj)0eBJ}b1w!5<RZ_~ua
z7ql=fJh9W^fe6o2B8*>CUOqrILOA%Uhjt2C7xFsCfo;3Uzthv9&vDUfVt3b7m`gX8
zRoa`p%I{-hoMx!S;imAp;ofHW-)W$fA{u3gJ#qrN5guHtFUEh1-|Qs}q@^D5v>{`p
z|I)?>fE>bO%$(JPh0u|F3ZSeZ<UvyI$Vrq|#T{OtD7+VXk4MTyN+BPIlRk;CAq)}S
zzBy=eKF*tpj;57ZRG-viol+8HRNO3w*zM4m-&q?u#;9lgogH(Ys+)5Xao;KWT7W0h
z_c*z@vp$+Y_D4<ztq%edfw9D}qHT*qC^gD*!2arXdEwohqt(~UcXqaN5v}3-c}uj5
z922h@xU??}uRmV5YNG^s<tl%PppuG-BI9|dwn*2~?76TxS4)A~%XA~}+qg3moP+dj
z{NVlO&PY#Kg-ID$8}g`is#H&fU7}$%f{kh7?Q4sQjxYH(^p3x#Mx+555Dc>gOIjkz
zctzUqWrn`a=kb=#l5ZbjS{2tWt9#(L&SPq<>V7L4JroN2a2+axl_|2A@K$@XUEvqa
zGxMN3>tvpnyDb~f8CC6L#j7NYUD}`2(1@H0`82A+Ei7`aW<=uoiQR5TmVkCR&&bkG
z7u|T1HA-O)uk(bFVCTzERq*hL`Z~2lIS9k3)q*!~4gay{ZOm0<619v#gZ0|?V~ZEm
zZU*e(=)XxbM~G}@LT-s?KiXoD#1dcm3rvJ<1?%jh9C_H*#$sJ05lNqwr_kfn;gu{A
z&_{ZTT}R?D#O;0d;~F6Sct0-hLqzkGAHK#L)^{))O+>>j!OgsJ)86cUCb1`G6S{`G
z^l3|4$*Pa#tAkf9!@VN;POYeYZVvLpv;4by{4JfNk7Tzy-qE#^O_5b722a15S!dbo
z=YhQDA&1R(i}>75aBKp;WeSW;%~=%z!Aht2&>e~!>RhqcyS!qdNQH9~aV*rN^||f5
zMOk5?65dc8Wi9Eb;GcBYKV6I(*#23tc$dO@d(q#-0ZT{in6wP>-1jEX=;fe~x$|5}
z|E5h9yPD{pT!m6PZEYmsFZBC!YQuU1{`>vQNi*2gdz+DG&Y>p!t}9-S!Z&Kh9TTDo
z@2Kx;jcX}01Ts`cMc(waruV~SK@5&WFRPjs;=nB#t5=4h``!JV<Y7u6B@0}b@+b=+
zpe}esqHj(jcgpCS0J2!HM~q0iu)k#&Z{6`x-YP&Rb2vihH?L{Pi=0QhLgM68U8kzS
zgs=Wc2FzHdJn260!3LY2{$xv=dngoa>FZ+WJFW1{K_lgHN6PSkqf!q$O%KdYaz^~{
zb2EtA&kk|Y4%}E|BHO<6aH;@1P!7C`dS%8!5pi_gX`(^ds>)S*;%D1dW4q>=*<I?V
z2TK1aX~;SV-Hz;RR+3<pnx*WlP4L9EXZ+1N<x3qF_g%ci@kVHu8MCDHRNJS8K{fIu
zpVEEIsvQ*(4*z&r*GZ^^vmLggPS^Kq_Mrx}alYJmRzc+T*x$d8@S^g7-e#xGC|45+
zuZq{)vzXxtjT$F9ce#!4cw8hyAv7WcH@_T_%z0ORUUlAD%k!U)&G`mgQ49KLGn@7b
zE&_Bd>t^gq%4Qqg0I^Rd3eT##S-X<7-SVcFu&Uv7_h>KQ?fjH3{`@J=KuH{p+!2N)
zjCpWS=TfRssKfBda!swor3M`ZRHYsj%x~#h<exxd5LfV7)oZ%+uPSR(Y1{qv>Xb=w
zVnPzD1*J4>KK30J$25DKe@tcPxO<z2;Ke748Zcp}wE@AHG_8TUzJ93!I@VM(&EDb$
z@r{&DaCC3lB34CO*}_v!+VN`b$<!4KTO2w1t=RQeo&p&Q4K=?wazyHF<#w8R1WVAu
z8P>~k>ly}fN}>+&e7F8O-)ilb8MqX88vU`7i^Sbh(BhLk^!pbWXOd7Q3v~FB=|2K$
zpVYydu6!E>I=Xq{`P3FtOv}z)$or&GhQ9}YM8ikaq+~i`9iR(M8KZT3Q)$+d(=~+8
z|4@uj9`L;G1~Y&<BWeTkQy-C;2@=7Gr8n$%T^GJ)`Ogw&M#w~NdI~RcwqDXHn5wz!
z9=+~m*|-oXGh8LN{v3}szH--z;IHz5efQf4U&B$!_*plbrwviSLNzblA5_G`6w@zn
z_US~1L0CjTNTaLQdY}d42-CDuT0j3<!CR+>^<*oc7obc6N!{zF4G<M7RCH*(p*r~5
zhs-)(G2t=qL>3N9vlKs(GwORDT_#7rzC}mK7%z?bVf|u0%EiF6a)5IsMJ`UCUyMV5
zIG(+Cx4mP{>8&(PajCdf66FT9Vn&c^KssafGkoPJu>KxT{);DcalU>K)>((T87uQ*
zGD<m;r>w__ZR0%oSUfq+U*|QwQ~7r<m9qH!;+>9Pk5uUq1+tr?6=#X=<kSt{N1ANv
zw;_1W2UWuU;vW*?34)=#Pzj{d4;Z|!#u<)(*!J%}O<fW1lM@!<zu}j*u~#50>Q4@D
za8s_|Vwz(S!nea|6lwsevyo+Wx9Gd-tc|%MRdPH^WET6KfF_ba<}7RXJo0_7zlq|f
z6yDu>QZvJiVAC(N%oeG7N-rDGsVL=$5tgr?QRp{BuEBYc5_H1~V=KlXAgiWm-oDSD
zy#tYwBpRb@i5Y>6h+}V7Oi#i}L>xW%O1<{T0^xxr(P_&RPn@%Rq+8jr+6{CO4@s47
z-8duWxh#9$6x`L~+6HEylCiFcZ%YY6?=El$pLOsUpU`8Ef(eg9j@Ul|F$Y>Lbv#>X
zaHM6Y559XfT`I1)Qh6^3!*<wi01U81Sz2&%+HYY0(vIB|1?uv*(LEG%1@Mr|oXUmY
z8o6<|4S3<*1rZIhAUw3`i`?*e%SHM>?PQ#PE;gTFw-Rs6IsVJ=Xj8MAkx7E1wl<0s
zcx1W_b%3S<<XWuIr)c^;oh)Ef1D7cUTc~wE8h<X6Z)}>PLRj=ZYv}&5b!y4bFId_;
z2G<344$j$RI)qJ4#+H^u5L~e$8B9p$t-y7Bw)+|fihxp?^G+bJ+p#v%;nF$zFx#^H
zmYdY#!1*D6NG@y1UWNAb#~arH$NH7f@UfAJXU&^md~kwF%l)JFIY-!?bn@a3z(ukI
z2-|o$+qX&D(N)O9$T~=vB!$MPc83oXc>mXWvVC3%5fi?{DpF5uFUX^gXY#krTHgpg
zcBaw~U;ayaIo&EXAo;p8#1|#LT62%yRF_M6EP+iWfs?3u@~L6851+nXHX_QtmB0uI
zg-s=V!l3sZneu!@n@h6?Tj-X3=FBkK6`S0*lXvR1ClXhlw-m?iz;I~+`Qp-n74|Zz
zbC<zcUBC}H^CN6FjOvlRDfJf@ZHbChp<nMM%S1q<weqi}%eKSH57Z}{o(CQfAn%MP
z4vQ2#sD~@uqWMC`gPfvy-%KjA^!?yG%wWv+0?sKz{Lj`E_Ue(&18c#b)x13nm`{XF
z_r3}9neGO8rYrK4D}b<y1mVjCI)fr>aWW{ss>A=V&U7L`$J7$A*ZKVI>Bw>RcAaxt
zOe&X;<l?>`n4+AUkL1QZ;cn`Dp}ZTlErPb^g6*XWH-^7So4?lDy!mXVE&h!B=1rwK
z8)+bK?%Uy^KJnwn@^R=-HUbTX7&Q7Mk0p+whkEhzs(6<#{JPWC;<B-&4DpHts&ci@
z4Mb_{#Olg#A@Wpa2%V$opQ0Hb)or=UQ&^{`-!UAqEosroRxBH28&ldo!n-e^?|QtF
zfQyRL)uo*gg_=yV$xKJd4%rokXqFu`?!FAxD+EV!OzleS0<%n4)Kex$Nid{<J+moX
zvucuVaq_!k2-s1)1|#-7APoLBayPmT4KyTPVkpL}W#A)KW>E#_s(5*VV!{&z+0=NO
zamT1^F*J@3^%Mb_{P1gT#M4}TnLDAYwFIMDWgB{?Z`u)xgAsLWA5|%yk|`wlqqX0(
zsf0s&FD4BuLSMs-d#A!qJU8n5mm(DorFXE4*yF5hL3qDKd6$n9Pm?<Tiu_syP#r2@
zA$#<}<y+Fj=)_xIYnBU0{vW;fZZ{L-FFvU*sZ<<d-yH~qgJ@p_bu!D(bG}9tew#`t
zwcB_97!n>+M6!CGDf4oGOQ_h3)GG^G#n{rb6hajT=VqKIp#QWVHjm`<kF)3{{;r~M
zIevEw*c#ek^(Tf*GbA~OxG&sSZo>`}YY95#4dO<nr5$N+adFI2(2Hx^^pRpiibl2O
zuBuJHBZ^^yO<ffBwZ?rn40K?=Gdgx9s;PjdejRhk$zzdVt~Tb$(q(<f-2}ff;V;H*
z*}!NNgRl&7;5|KoQk2cF;K}#&I@`+;PKupc>mV)-1tPraU~NLv=GTO--^2!4OIWdP
zySG{uSoj4=q(>@a#m@Cnvkoe(VeH$(SGKgWgbe(ClpG}IgymA8OGdjpZ7Y`N-HwlO
zMT^_e`|U95X#s_s{902Cy4(@(lP9*eoQLB0(fkt64btZfwhsiY!#@fE(+{42PU-89
zX#zV>#H66aDx--@w}CK(wAZd}&_}T}4L;k%N_i5R9<0=3ps_SYkdBG1{Pqc^KRgLx
zy|O&2f<&vu)i+cVfZ`qlGPz-78$GLP3{R!2TZgUK$47HB-*IMy*;lmi^Z51XAX7j(
z8azPmnIA?{klW#~En=(n!2_yTsXnt&DqigssNZnCQ=%nCUZt-zRn<du7MwGTSJ%q5
zgz!O~nv8x?5$A81bSMSueapW_l%YB|=If9Y?LRj*rV7NckGL{)^A=;Atl*a>;XNC6
zDGauTsn*VMle*6zkq7b)!ERXevT_v`JEyNYW^rIp*{2@TD9MPpXY}7NHPk>%_G!*=
z>Py?7=M)Dr#Jq_G>^i3jwGy}1it8TXqqw%dmp(kBb>Y6QxSAcbw5sCYSCw(3r29N{
zJF7E<+48xSu01u;f5T|~r#E^mDk3+qgz0Rb!_w#pUL+`UE9@mwHmVY4Fg|;-No+xn
z!LvV82Rj#@bNp^>)EAY=<POXL8E@v}j>?fH7b@Pgd)70%B|6xF1RC61ZT}w2#9dO_
z^N)P+TZsO)U}K;9E}+d<Tq?*=5YuhxO{~GN1`V?SVGz?`39U#3si`lQ79zP0Y9aWF
z-j349;K?tyV*!{B6M!PAQgVO~uWo%RWWLIAa2w6FGrvZ3w}-3iNe9ljbW*I8H-87<
zGBDW8?UQK^ov5#o9!YVI-zIc>&3B$5uclA|Q4t~;&wEJzdA%d*<3k6-;xHe9v1GuU
zClS1yxhbP?%Y5-_gFo*-sX$I-%RYb@P@Q6_EJ8Y>8?(7|Ki;#~T4PW%%CByMgHou@
z8R;of9=}*!vh8a8#BXTG*v=d8ey(a8W`?(ec1DE$Y(HG;W2<l|JWa6zjC<_n9;J8@
zjehw%rX`p-7HVI5aqs_w*&Zk*Chvj%w4lL-%!Xc3JnJ|D^8?*SmWsXftS>5S{Fa`K
z5N#h?4U*3)qvzrVJnF@U+rJ^2#z;-37EJS>zwyswOG^Yf6+Yn&suYb{7={oSilV8m
zy2N9VC}7I<m9d&p<$}|G@7n7S64!Gg6~jah4q`~A(ipNhYy&K`eS_WtO{0AUoy^*i
ze5~XVT1~s@ddc5~o?-vd@FFRZu)nAMjS%5<#xB`Z)=0*cQaE3gf_cKxunaz<j@4+C
zN|0LAI{~_urX?`qxzo7&!r}MqA^T?-wi@(QPWh(-p1hx|F?VUh26%&l_@3cD1MPB+
z(!12TfMws)+h<}Y+2?bxV8C}c(mk4eriMX*I7fLL(lHztCRC>A)q1Awydxn@dsBCK
zM5UPoY`=fL&&mLV74ifxT9(8B?rXcp2PM2M#2wX{@1dsiA~5q*8EvSs{q;KRPwdua
zZNrZnx#c>epADPXmjybIT<15tdXhM2!90nkIbr5cw7Xn!2cW*_7bdxZP-~YLiozcR
zw}hif4oM_hBq$%Tumv|g<w{4NgA|kz=TM&4Gh+?6Y`#IDM?BNF{*)ScFRJorF)V&}
z#h%8NqHOODtckubW6eD0b7E2cURd`n^BxwLANUH1?e=eYf|&)Q`aHI2W`cPAx3$6+
zbmhAZHMoib0uCX16!#9<DI`Vg6b0e?Ar5vHPja~jG}D2kzOCDU?jK6-6Pz{9NxHF&
zmip(NW_p>L_9ev|TM*XBvp8POKAg*8lB_{MM&8jS{$NF}7O}dw<q_AnM*)!r#shxG
zYTya2vR8xk$+he~+`RYD03;66PU;<t&f7~)1~o$up6!m-f+b*%mRV9LZ73l_MrYN3
z!?2#F*#n<Igb?V<+|Il(XKAF2H;|xXwxZePyOV*Q&hgt=?;8>^+&`W5jy6853bnX6
zR3h?5sl`E0({(0GZ%yp+y`%q|g|Qb`F`k=W%a|Ou-W<$nnh0))w-ls<-^MePlUAkE
z51X)HOu(VqVtsO<=%nTH4P@~Y8{RG4qH{~U^Ylv#QP^SWxCIMGK2eV99J8i2a#^`V
zyW<;r<{FYmwIif2<2G$iki>lagYWbJNs#0K47e3?@k)<yMm@`bg!zj~G2xo@mJ9w&
zr#PH@)Wa*F9Afrol`}uH9s;uo$PU*Hnb9WWl|XvKCVD0?6X_7_Qrev-Cj`#-CtD)=
z5bC{I02z|7HPq1MKJIGJNx2K8E9)OIe|hN`1;Kmc7gcU^jdX+1HgL?ZVj&JkBm}70
zZ6=fo#+QT%JyS8rb6rYcZiPTe86Yutqql8}V?Ltj?|fvobT#hz`K94I<e(?shS~+;
zT;b3a0!3R>kmLy&!{gN~Tj|erEwK{!^=0XJS+0Tw<}}+xFsf0_&=-6@073=dbMC!w
z=%@?g3dRS{bB5d0-$a$7nh7}JtMwP^Ufs{5lUVPw*DXpQiy!_(hu6t-;=I0kNwh7!
z7LJzca1Wo=2j+Cowxtm^5@oI%oNvc~p8MRpG=`2{xn<%}TSs7Aa%n(yE)`rUP{Js+
z8g^l`h8WuLhvD9=ZxGZo{E}9IRD#uY!f^>!N@44MJ{)G{11zE<$nrDkTiqq)Zv+T>
z1>Oj>;i00BmhnRY;eBA5r%}Jqrse^40YM{Jb@H&p9v#zi9RWZAeje|ij*m6mN*vAq
zQA@dh2Ez|E#hAK2i2Hm~PctdPRs3V66L)gh&kEbqvzBW5RQ$30N`pz0RUqFJVzEvQ
zeX4fW&Yttm)Fq|$a{c`XKTP@;MG_1W+zquVWRNLVs)#y#0Rja9(_o2!ISXh5(h4<-
zJU&+#g(=tvBOx0y>6h$9M%r4nav||I|4z;|Dh@d@U?}u~y*vjN;f3X2jLWgw^m~8U
zv8W?34{(Cix%Ibh*veNpKE+t8%FT`ails(IN!H<xRbn{?VO5#N@kkfx<Ao_ak8(I@
z9**yZWv?NqR9l*#?c<IjkjavVkK>hD+p0DO0Z*s!O18y!oxR}^QO}1?Z`dIW0>Hpt
z9rSFQJMJ-Iyp%Qu9=H}dTHG4qB0aA&3c)hQ$C6nZWj%txI>eVyIGVLspg5JJKmsUt
z4pPxw+aDJ6#O=wS7LLjv7#u%cf7#|MvM1iH-*kH}zzOpQRWg!0d}IYD1*nv4{(e$?
z88+*`|7{AzRYPh))oe7H;cKS*&wqnU2j+^jidSzoEw4F&P*aV58s7<)v<c9)Hb%`t
z7&PsoQS=k{gjZ$(w=RS`MFl{n9U(~rl8&sDi(iw{sBomfK!pue>@e?_C4MN~RE`Iu
z=A4?!$}q6P0^qv}urxquJfSdw4)FwUXv+mnJ_J+zFFFc~`SYu1Hkz`6X!k8tRPg!D
zVOy3>NSvD{Ivy`-5pzP1K_+7jz<nvOuH(>(i(c9wxK5`K0(764yP%W9cEP9Q0E$Dp
zxWQJUyqxx54A}04W;ih$9ltaM?VHQxV0<QT*jCNuL?Sc{s+?+pmOBv&!IG}lX6y+6
zwv`*1u8}8~X7)4m7mo851>7f`DU@A-C~t70&}3c!G)m{Ut1nxp3kpv(%v(b&ZTNTN
z(-EUkrsY*V`Cd4}{;7+Fz#=@3mW*Qd&3-i8&VnIC5XX9V)ftAO7y7TPJ)clYntV=`
zPFk4!CFZ5)l(C*N8_d!sFPdL*)GvaGOL*Jf_%QG_0C;+jIvy6JUI2pc+5^bR(+1v{
z%_j2}3qk|lqqFH3RTv9CdE&Mkk#*iA0LDRCy)FY2rf7sng}$oNy?Fg~QF+3yc4sqo
zlub{xgra5K<Y8-!35K6hdU|Qi4pkE%IYvs>5C+bL?}w7WD7yZZuG7ztZ<F*)(%?!2
zMIC0JWtw!E!=);2><@r;3Y?8Gc0*-AYlJu_u_B$0K>Ce30yRs(0E3X3#67$OxhVIh
zHI!1F2ceGhF=B%F5ilm&0tJ!{ePG_-5sQ#qfT+8n51cET?v+4TVTNs;cJI|jno45+
z+ZKiI)ia`+wt0#;wYB<Kc@_qFP8ZuH-w+^Bh6D_@k~~Q}sdKz!rh)Yu*~>529Z}yA
z^<UEIvVnEZgf1sAG?I7(<u39dz=$>EY0Gd=yp;EDnFY-5KQs5KL@4?EMJ7Kt=wlH)
z#Km6slVlNf@sr~)`t({VK3OP9U}FM95;Fq*Ytc`r5f+nKyl%oQ`=2dKh1*iNxG{tb
z1iCG@D#fykkR*Ee{^8Coegj~~#&qJ>u2aVE$yvJ!`G_8c3^+Heleh!Xn7O0b$-q1k
z+|hs>c-eZn8lHXV*3Xer((J(#KcVpz7MFadadzUfj)f8qE-}qGB*@;leS}FhjDba0
z_9D1MJ>@s?!?F=SItzIp@p(}I0e&IGjc~5HsUnbC-BEk}e!<fU$YFO|%^c031linp
z=8f8SX87X$>@_dQxe1)>yn_)X2<`P}AxTV#(OmgLlRU<);tN^u*%mO+<8~k5lAr4a
zAkWqvnk6h45g+M>SupM~N_S<xB6j>dWuZ(9%DtV9tqZqzCIG^1;4!}FD~~^bIf>&t
zAgRB_-KGZqBpd7zC^>7;+DDNKe2DZ2edE=>c|r1rT|@ga%ksvZ;$)grBZHRCmEFAO
zk7Tpu!8SS>G>=LKCovuc+?Q-1I82PxL>hhz@20Yq*PCyVQ#v<SH2|na?`vhp*8s?Z
z_z_m1@J_sTG8BtTRWWA#qF??u{Z({%3ovssJTGgl(Do?@Yd?J?5rOXe65jW*fVxNw
zsP4z_pP#?|59lBA|8HDBN9%u?1N&y6^6(!6{pr;OQ0Qt<II&nR<)XjZ=4uw^(YK#?
zA`IxBs%{$lWE1c0RoYw*R^W{MA&4U}WgMpzz@^}OvlP!T_+F*Nc07@cVh?DnIlP;-
z{hHjK1xRvR^>ci8V1f$U0Xqf=S_;QABr!ZJhmKw|hu?-R=KoHB%?pg2PkIWdP*vnp
z9FdZs1Qb1PDku<^D-nbjZm-SkwUrZ=jvW#6@ZgH_P=;B<Z!Lz5ScELzXuBUt;y-I2
z|8K=#U~qs#NETVy@^{&^LBdi{wr_I1@v7?7pM6?ykCm#Vfv6^KldNwZ-cDW^0|)+@
z-6u)oTm-9p9AbnXSRA_pduw1ffN%QbQ8XRPlXreLS&$yZvZFT4^e^+9q}uV_4_&qm
zl+sJv?ZHK#Lq`Ci@i*dBy(tGcW-35l7V?C>cy+j#N@_K@0NaPCO;~tC82Tf28c@i#
zvjPCiJiu6fGN*sG`;k1f?DOXW8@$VY=i$Gjn)bJc9}jXUc)6N$7J+!T0r(Aj^|zG+
z!+{L|Dro=mg|OTIzk;<bhB`rXtg>;4kJ|-xr(UJh>L55sE}Fvf9xy3p&T_15f4VWQ
zr8W@(7uHjh|DjT|;?mO?q+>b4K@HuPEQ9>LP&lAb=#4jQoBiVP#UmXkU_TV@>*yIz
z5SRtp{9i1<>wvp6F|TC)V&K6dB-t@+i}-brbVbBctMaj1q5SXCw>={y10Vn<jCr5~
zn{)mc7|nM@k&i9KY>g2=!hiIB#1eO9D7~CT*mx&K((n|ih9~`{kdHDD0AQP1eR{6@
zs-pGUPIEOyD2IY!@C%W!w`?{mSlgiTn<gmy<$+FG^(?+DlsXWA27NDkogI3xQ(A<9
zAuc7I^%{kK_9dKk8SPg{i(xxL-x{ot=e&F2?=$g7P2z5R9vI|qFiTv8r`Z)3d7qIv
z>izHCeht<Zia>LN6lOcJu=hhvZh_rP3%LD5y8&eKd_XI_)$3;ed&0jvKn<N)!6&<!
zlCn^NDGi+i-mZ*<zxlrKjt2m0zE(<*xE_fe07mgfYZlj|B$yiliywa80vhVdt(<?l
zV+8;-hU&4{HIO_FM)CZ%Nd^EzoxA+)AfnTJwd=S80+-%*ixoHj_#)6E4rd3)1Yp_N
z7ebngA|uRf50M548!?hPc?eb+plula4)5m8;?>!dX$Ek)FU<qfs10EH_9|BN7<mQg
z_-6{~cd$V=S1Z33fIWhL0FE(A?!a*yM?u>&p2GTRP5AE39Ri<_Ai6|-FaM_YpQRH*
zT&9R9#d0+m7+VSaJf^@QpAm+?zuVh<dwA;~VgQPu(C(~Vk$^#5kOE^tSP2IIJFxVA
z$?vI-8zP2Gnt^vJoIlNbr%36Vb;|<t(W6|}TOs`kFk!A_VV(P7%LP-&>cgva#FkC>
zpJSm8<s*r~jUSKkmFc#$E5FSis`!w~NxIRo2<7s~J$xPz`VPP$xrn~C$E;Fuh3AOf
z@p}LkoA7v{-t`aaCs{mQ$UUiL96R9GGZbN`I%i9U@es07yKezB<Wlq5q)~pJRKx&q
zu2X#jx?-qBJYKvkwlMWOODdQpeqt3n>xv&!H&8@)B!*#>v5$>$Al@5MA)T&2HeS`*
z{L~Nmzxy#lvK_3=6@m5$>cq?s?=YvT2QY6r=+oJ6QzmnosIq0DRNk*31x|HG)eN*p
zjxGHR&~h<jO1H^v4U+1^f%iP9=C6r_*adK^V4+NMjb6{S;$d-=kSMrb+9}=1{7|W=
z3dt&9yuc6eL;asmJS0#Lg8!H#C1vCJT=2d^e4ab3@d0I8l1Og2`J-{#JTi?IaAH!p
z#t8u2Z7*;}ULe0W2BmX0{|!pBh!N@|D;IbF56GFOr_BI-rM=(S)+NAevR4&c0o4G|
zaF0){v!dY2m$k#vp)XPaIM1ZA(0Zam2HeS2uwk#dAN5a-Vg#7CkGTyW?5udZ4-vRb
zktj7Q6$yy#e&h#0x*;FcLWfILNdE)~j8th8)XqlH4U>U@xX%1=gJ)Q#7*7K10C9yY
zYrg*-T#mtlaWJ8&$AGt7@o}~H0zRJc0~Sa=*1&T@XApozI{k=}p?r%-7yEa6vQd!X
z#I3~H;^9W7tpyU6ebcmG#{&%U8b90szkLh9pzBmKetRzAiR`_3ATYkZaN!E+Nw+Vz
zeW5fUXEteIJ;)#mpgO<6ajpPA7?l;!Fc2rK%bSn;(B{8A##E*ftk<5}{^taahiSg;
z{SQYk?9~tOrEQN2qx|`tvxNJ!Sr}v63eUi+U;u9Ig%h;GR#uBjb8fvF0JwyrKWZey
zfD;w-U;H1N!UnnZ)`pGrk`uWzRoQGwkhBJarO<zhjEHoTbf>kRcmUH~9-eOUp>E31
zp5tK%7(#3ooLvA;4E&*~#9>MEVfoR~NMg!H+!8)^)!=ANk>t+{>m(gr1w;qCbQSF#
zGqo*nZ1{nRqHq77bUEUSXS)PazyWQMem%GU1f!L{*AI|fmxtB>FZBd;0O;nuO7}Tv
zNC>74khopJPiQMDNG~R>ke|~A*#&;|xNHV+sDn=_9n>T5?U^$IKh{=o+GnywL*q&A
zs`YcL3o^A?C-$)!db%i`Kcklk&f+&E^(RwL8-im$YPWaATmuY@J?yKGcnFf-b}KKS
zU#Ze6qoc%<hawJ%-o>gTv&;W%-YSD~ddyLxZ_jmU$95<6<2}R7SN?d!ARS_K5!{Id
z2`0EjlSrlm$&-rIn@oT(uJ_tiw%^47yE}?pO9Wh)-9r;0@!1WCy8d*`uK*ZuMi$V(
zu}U_uDq2Xb{kcS!4GQPtU?klK3`FmH07S5zu8EiBk&o*`9x#$8pP|5<sof?JA!!K|
z&P>qLLYFblU>&Q#i)jDbZM6VyfP_|oUBQ0>fWgz74qNAZ0qkzC^Yq`9pYHe<z*Vf+
zdXc!Ftc-=4k9+D2&L6W$S8*k?S89=k!sB=cb3lFY5Mss-Tqz8%R$!RI1Yx#|_yMJd
z`n|^!{VVxXytwcDx5yzw41;D!;emowWNGmifFD+YGr|P`{#25}3j6?*JG#t1x2K7O
zQgpJhiBBYFc!+b*^f7|D`IJDDsRcNTT73uU;-(NAng6<IAX>ftz*Us-!0B)<gpKwc
zJ`MR~4;X-%qEwtm<hGX|C{Y}PG+t}*RzqSJQU1i8&pHPmaOlmL=Ht(GVw_%cI>`Ls
zjPCPopO>*VSjo6<Bo1T;NpJJ1Fv$OS#!;xkKZAaS>RkN?5L5V%&?TEf_HsL$p73SB
zB9rUECevuDr2c$F4?)nnd;2>lb2_}Ns*7R+pi9{^f8P;fq$3!GvCw~%#zGM?W5l)s
z!&o8}(3E~5<Ism*<B4uIj16FyxOoP76OeFn?9qp8@gW&bzLyk-&g#{|B9tdsmI~1|
z{V$u|isMORYHXC0H?n@%2Z$Y(NSX{_3dl1zhFuK-Ltw=+sOU@+4RMSeWxD`O(}25g
zQq$K!fJca<-6*yr$|16;>z2ujFKN#1E|w&C{AzB~G<@+9T0hJ#fbKEbq}!<0#M8%t
zfJHlZ(dZ^nNbg%ckb^ZbuK<`c<!3blI*aZYT)D}2qrKdC6Pc%4hgW!kaa!3??+u<w
z(S3^21+ekmXrS18j3@Ym%41mS?nAS<45a7Ne>!&3-ZYkQ@v{V|0RRV>&DCN5z-4aT
z_V4a+4rH)@hXkWYP66^N`_#>F<QN1Av2(OL&HbIZL!WKvICBW5>rA%Xw1YEWiQNMZ
zGq{FNhQx15y)IY6h^&c6LT`=IHDG@qQcGK&CepLERRG3=i1<&R{)sOD<uCpB!vnN2
zYhR&4r|0+6%pCxUoIi(+RyqTrlcDhs@MuG~;_t>9fJa(5)jPZ+d881Ajw<Zc1fz;^
zTAw9o&>;#PT!GKh%klYi@d%a=<Sq198t4ej2`{TJUK@%V&otj9sgMZ2`WcGLMiuEq
z!HzA@pOL7Q$j*<-&2YWBPy}H`zRZ}<bp(K90-IR@{*^Ppt1PCl5Eq4e0esgN{&J=)
zR~(;ykcfpAd`lr=$rWZ$$-q6MtT<G*E2d}JAB3m`{a3<=usVGveGXjm%oC6Q{SA`<
z;aj6Mt6|Qg&c_dc^=hYN2*j|>*+$HpO3`TR>ms?2Emd19TG9jrF*62mz_@g`T54jR
znwihq{m(nm8=rwoNg=@o2O^ZGD*-B|8=rk21b4SHVlIKl?EzLy8-S%y@lvVkHWv=R
z#On_%(xFbBg&KJS00>9d4#_5s4oRm0@VN?lFjwf)ORkr5eO*VuRoZ0mHeMD^_66v@
zK)puU<{5(4j{7+UfWg+gl~^Ft{+9;-5F#2E;wb@4)Eqm?Q18O`sVJXr80yM?>A)O|
zdn%LUKnBp%aMM71GzDsLh4Qs`mW!_4)CxKj_V7Os*I=46wHiN#CIGNZ{lEcg2bu~q
zsY^i2YMp^P*+%=}vwX8VlYax;6_9VDP|Aari^WzPm%FnmfvunGd{hXu0MTGbs+;aL
z*Uc~{%?D_BXl}8JMwh`paNF_=sw(K2izcY>8)yvLi{N1p=IDjGXCHzTl0f>!ujm<r
zKx${e1v)o%5KSrYyK)5D82s4;&jFkaS>OW9uu}nwO6#9EE$PeSaYIvp_mgl$f4xmf
z3HSf+YpVIP2O!z=7>F<gVB>fiq5oUW(+qfaD5)4MyE<*#mVc#aRFyHRnnl^DMu$ih
z@eT7fBe3)(m1e-&rm)G@?+8}pOFTJ0S())Q0cv%}+=`Yh_pv#ghqFI=nx?UTbn_d{
zEge-7J^a1pB=6){S1XSd?}i9Zr?sjzj$#I6ahoXlE;_<qB_fx6{6hIzI=c5UOm0oR
z0(T*s-TdcyU<)K&nt*EW>!o;#ExZ7_`mkRO`INz1Juiu@FJ__{5e#?nOq(B-Fn<F~
zKprzz3*7j;6jOg8vm2P;B_|6kB(sAcOfg>iDZpJ2Qur11P${Iv-u&Y9H+ukU^0=+_
z*i<~rO|ERqB(s$R#-Q@YKNU@Zg&Tl3PtvGDYS$=Os7ewxUGdkzsh3af0SYD2_q!V)
zxiA}E|ETSKvG6{@MMB@6wPoPdPaLd2#-RS!#4)hf`k=p!8ePc^kFR(EZhM{afzR&V
z6S!1}sB30xWa<q=5QADV{z=3|_j_d3iGcs@2OfYpz_>))Ll6+0DvNq050S`oM&4h^
zZ82w{46HNxtdQO+Dc~1Uql_QDeOSPk8SrSDc+<!6MO&r~$)s$G;m1b*fNJqu?a!5{
z7M<t0@;+i%<eNZhw|Jb&Dj4c5b|B^a56B6KFUq5Qfc_gdsj-K~5bVuA(O!lGv^eJ1
zNnInyB=|N-!`DMAmp#mI8~L9--vQ7Nm%^A98%<zw;`P1{D4h2w32Iwro<MRgm!_WT
z->h_d)HXIgYp~oqlIhw-Ze+!EaZvda&yg}HwMe!0hlV_LQd-wXR?qLb2?MzPKY&2#
za8zmVR$&VJW&IC_#_a+}g1A_m_;m^lE95pIEY50KttrkV(=cA}`YpLFJ=Ln;yU(p1
zYqY+9+k0~@X96T}red&sZwc!&V+4l$2&GU=ZpvxqA=iJqj;Z5F#4oOLsEqkXebr8M
z6`{fr4ZuY<wnkoHGWP^SG0x-BmqkMgjB=i$ULSNR=?|DC&~|ic!0C{vu)si34ft7W
zR21i0MwcvQ<v~^~CQCdf{A0^{hkn{pxS<HCf6)}vGWn@_y##?U&IF%Fa-ZTBkz4{S
zNG9v7LH;@Ri!YHabZt%OB&7n7f`)TROJ=dQ^Ddhd;pqQI(|N~J`Tl=AI_cQwAY`6%
zke%%8b#M-mSw<x@GnAFR9V^Gm3gHkUBT7Pch(cC2NoHj4e%I;y`TfyfdfeyS*L|+*
zevjAl_2zWgn&;Fqoz>IgCJnk)5gyXI$OXw5?LS3agty5(_0CfBd)JvjW5)kFw84db
zBvZaIy(^_G&`s~<qnMipZZpUzHY*VBek{8prgU)z1hLu;H2Tu!sQB}Lu4FkU;Tk$v
z5}|DEa^%LsESgZ7SIE#oC5~{R$A(RM9Q)lI)3mXh5c(VVVWYWRP>qCeI0&ORVdSAj
zRQjCuAvSQ7!NqRlv{re}CkycNb8x5L^X=!C)6D+D+{o#qbyN(UkM?PHu!peCeJ!17
zE?XpTImy~%*#XWv03IdpPD!F}zx!^NRrZjo{a4fZ<fVBK*wMC+Wy(T51{T5V#r<Gu
zZ5=eh0zx8-uNwFw*zR&|)!RqGf4wBKNNNd|f5|NA%u}O>s=>i2Gfd+9O-GvN_(ttK
zTrae}rKUSJ2#4Tz?0JvOXTz1a<c_Xpk1){$5bhqp1;F#m`EC^}+-`1J&M%@tagl=(
zb8hIRjQ``@A*De@=(p|o$!3626$sW{SN<VWzx_pSV%+`nyC$GjAfI5jO8F)mGQYR{
zGB;;3_ft$~;ttKx&C(&T-7ZaCf+Bm`^<5&of3eKhX9EURN&cs*!{a;X4{xa%v6w<U
zd(g=reVLVpp#1p$UHw%N($~BILp-ypxVcJ2qIBs!_>BocN@7%EUOJTxXP=DF-Nndw
zkK*ztgZ`ersTYOi9z)0-dZ%u&Uxwayl*dU}2B4zUhIf5M7eL|TUAgM7i=6)bRZV$A
znjhER`BBqU11L<$)Bd?Q%ziJL&5zzs=F5e%4YbInqhpp9{<>=uSSS&o2t^;l5kEsj
zWPRzDl6ob+-N=}c&L`<lLP$ogTKLRG5F(F&l#h;~XrW)hYi*1%fx4b+$?ia7;>@nI
zk!*1<-TEoEM&nMFeSS0@s^cAalh>`<GY=)6{`$k#>U9&Ge+!AT%1^~uEAT2BDo3#B
z)}TL42hY+xud-+cPX2+y)l)t>Uq=qeHjO~T$w{kiO$H$38~uSnm^_jo4_j5T_Zk;2
zNR1$W$aVZ$8Ytf71jOr6!h6N4$B@!$nKggNX(Mn-?M)XIrseUJV1Q?i$nl2~u&U*X
zDqA0BP9GLKI|j{^Dwgv>_?nyVX@~_nO;YHoD~J#2DnaQCtW}p~*EVn`I>#=`zO&#w
zycE@}$ULPMgD&2s2`McL3DBu>SIb1IpCd60B^ZZsZ`5!JSc1$3%d9g<Nid3%VEnF_
z;D??26H$M_b}QAMObTUif%diuV|EJ2Rcc(Amt<yJ;`QGI{^?*!Za^Q!M}k<j!Rlac
zyL^JJ2Y<j1ftw(DGc}&#ty*XW&?&jHEytU*U+&y2)r^B0i?}@pxv4>c0<H_ml%YPX
zlaTLxHBr`JY|AxpJBz?(b$JBf_o3Zm%;zc2M7i^ozTmIMHa-=gObIs>ycnV<5MMNw
zvaPsNvJ5T82Q*zKwcL35T@k3a$Uf<CP?m2$$!2+yV_NHU8i37#-ZkKZ`99dt_^IZ=
z`90xJwJWML;}li$U0Y;o>a4z4@l8~r#Vu$~BagoB$(+2U>j)(!`RaC=uHk^}`b#u^
zC!dy(7=>z-`30-GN-pwJ-I<Gw+Ol}W55jU2KU@3jCVA5F<G;c5msPK_vzCB5f>caj
zdqHddC9(W-nD_V{^rYW)dBbU~(jC$~<=lQheBfHiI}WSrH%*U#R{&iE8eVeuQ8!Q2
z!w34@#$=6*gMoKK^#5L7dMX5LA*{%6nb?Jw!~95Hmp?H+TI}LVU=Tbalz2^eq?U;?
z#d8&oFy8aA)K5>NP=_HNrko#V#T&*&;Ft}<X=IlWRTJ5`dE0oU#cf@nA-#WDq+KXH
zRk)jVTYj0Pff{dvTun3Ze|gHea|}8rJJRwdKa!zzds+tSWi2dE9uy$wQyjftBfr?!
zzO^z_&poFj$={epA+1h<>D+nw&1nn-h1vont5Pg?%BIldvoCt8!N=fu1U_YhgkJby
z9RzqT%M7MpH})Q*3I`(G_MO(qKbNV7carHcufk~J6T6H{MXMgHd)=H@<XUJZC-3(@
zrWyR52Bm}6=66eY+)m$45>GqBI?RD`WtOQ!Vd{8&Lfkoi$tOFDzR{M3<9k7|Jz#L7
zaA-RZ#%J!y=W6O6?r3Zk63s~(#u*qmBzsyFbqybQ4TcC~kB?qE=S?#{`6CALX9Jf7
z-$kAxv{UbIHK@s7Zn7hzq(I{&X-b89eqV{gG4Tg1f+P}KnSz#^iV-q+&!6+jZkW!0
z@cLjr#ga7S#<OCFf$Nc~!dS!kyGfl4Vhx-G$A~qMv`B+A&@tz>Sy!zngggh$h-Oq;
zMEXjpUmie;$}~*j!_;?cOV&a7G9+#B)m?iaLJJ>0!{bo-;di~$=L=|ut5xlECG^1x
zyivDGos{vITTGMid8M6xHA6#%=q+N9ZD^=rO-`3DVX{3?SqJcz(Cca@l|dbt#G+8v
z<uNwd@7^$V{S+j_S7eU}6^zbQWaYiXdL>gwMl_emh#RV!lal;Z=UeNF<nY}?Dg&pq
zyI$QBcW#erL!!+gd9OMfv)q5fJJ?a!NgPbTR*Vyn=jk;c8ZK4EuxT1jID$sNuh8wk
z<@_pJQ=`F^K`DiYEm0RD0(9jXIs-%OzzHHnO&u5)UDx&5gdR&eH=hJwGIzs2>-jlf
z`w<{1)*r^brbf(Nrx<JJZ&hMnsj7|fT6&K3$IpJkx0So@Or7aPdYEHgTB@aL@F62(
zj{>@^2Csrw($>5Ci!6x@IQ_yzNq_0fZwUO>69TUmGSK+u9r2m;EW+CztsAj2l_O?O
z>U|DU(NY5ZD`btu*C478EGCOkdC^HNMp+l4ntHbKNbjr6r0Uf))ItB>Q0N2d=DhI5
zbHnGN3594mIN1qD6wOVKIEC}0HtNImVfS5I>YMNV0_lU4NaIe@A;_e%@Q}4Mj6P4k
zH7HI?5IwI(e>`xx5+kWI+7|$I;%bi+DRr&rMq!#0+Kxf@h>d5@^;@!=&D_G{Zb~+R
z+ZF+yNN(~0F|^?^jmUE@f_dtnr(;|NmLu4o*dy)st4~#;uNAs1GV6*gG9OE`nXLUP
zF<F)k;kFDI-!OXlG7BpRTpIGWe-FlqsuDJ*c&wH|U01tPr=E_Rb-)T8JO_Oo&*xlo
zUTx;_H9L#gFZr%(4)D3Jeo`xBoQ?bD5S9$90tohaCk=(EkADjd=;jU$kGLwVfLNy#
ztc^-LXU7;~INFbW4@yO{XJ^N@AM9}*!_S_pqn~j~bmQbh?zfET!8*HW3tXG5j={oA
z?}cSO`)EJ$L({+I0@Dq4K^wZP3)oj3(|p768)TnAQ~O=t2UR}Q-yAzS{T~(<w@NIc
z5teV3D7(Gn|By;ggHmTS-krmGJCux;)wA{t`2vFdTRFn(8Kul;rB9x8?RD7(8+WY4
zb0eSAHzOMR8EiFslaFtTMhtJTt^+<~ReyOP&mG>;0wxqZmDkuIBZPQtf`bg@aa}p|
zadCCjUw<6AK~=OwXJWWU4ub=op)t3e^V-x?*_GnA+NTugQ-94QLlHZn-;!(x108N$
z%*^NYU*$QbG+3~?y)~oMpTb{!t&)=0HsA`b3YG43h`5Qoj8wYL3U_ILk1W3)Do2L>
zx&Xo_T^p(`qm2b2_dzVLy2FJ%*0wA1AY<$?IWL%O_2Zd^b1S6MIt5BcLrfEI&t1cW
zlYQ;>>o2ets-zUI(bML1>7kUp6xihPPBkZm$}Z1s9jJOSIH(kh5E=5>n6VY_g^LNX
zu0G;Wbq{IkrcB8N=(dFLaQF^at9)Tn%b7TT?x$9cYLt68^Hu@c>XF{}W}66siL*~W
zq%k=|bSiXOuLM@Uqp_kJCl{EKx~?t->NW&O<)~plzASEXkXwfWrh5Nc{}5CoTVz<Z
z*%S3%b<ftB=2gUvI%!cz;B?i(ccwoREh_RA8dOdse&;ME6^bg#6wr*H=okaD@={Fr
z%}4-X=GN_RdxKcBoEY>`&CghSkJDZ6G<p2wt?TvVe!s96%lmSnchq+7if!6_j7G>G
z>(K08KUQT{vm4`c?a|=`#fuELDkh|3)PonozhYzS2SzjEFZqO^@pXOQHfoArm%WCW
zhI+klHB~+Oy^Fc<;w8PH41SpRR9*>WHzzZHv%&{V=JJakp5q2p0rAWW>RD)f`@+i6
zDZ<7B;FJ&Pv&R9=O()9Lz;~8VXa2J&+1-Vhg8lUG7DZ1+vwf9NC7q89Q*6goc-^$X
zE#4RX6ESA8G}SL>vE9yju~)i8@(E2dK|ey^B|~%pH;eXrixP$^cBTOyp{>DI>Sv1Y
z!#+Vr8gs9JeM%S6Zu4QC)v6}kS!-N!gtgNyuFr|DBD?E_UX9_q({INUa6FXGo8+q6
ziS|q~3W1#E&@=8*eE3w)DyLCX|1H2HFMRgb@<U=sFqyh>ZDRFod2KaASB@*-TpENV
zt}-P8W1<%KvW0Wg;bJ$k|6Tfu@0mgzi2~(wle}ZKBk#4;<_bf#bC<R{60W)Jd&~}R
zcyp`Ei)$=0`dw9uNl~1?Z#9D|T)@BXIHPFg9f65V14&jKx7D3n>d%BixxJjvJYj-0
z*+nI-4@{2@zwi^dFCiWJmrZsQlb1A(SQ2a@`Y9|bt>{VwAMu?K3X^&CD@KaH16A>k
zy~BY~6`RQFN~W4l7<GesluzIl9a`qj-8^$2N#Hm|EF=#xbwedn7)k>|!~N8Lo`(nU
zJU627wmq*R5Z@k0-x0#3PB$$~r-`9aO3UO#PJgsP|5BF62G}KDdB^yuXDa=gF2mpB
z_jQ0nuY)t*sZj~8`jaBXA6UxLb7*v1l~uD^EO0Hhv*7AFzluSc^zd5X(LM1%t}@-M
zB%itWA^X|TVU%!W`=Va#JJUh20ueNMWh9+@%;OlP)>HSZ?>na<D6SD}VTb7Ndk95h
zZ=CTZr7VTf<YVE5${ua}Y&%lGJJX>E8^KA<>>#`riLg|#V@s0C^t_ew=jjrY&%sk3
zHfQ@to5C@%TzAFyzQWhp)bvY`vq);>@3H;kvC#}f$cfcoa#6d_T!sQNPQOtjp8@X7
zj#`#!BNR3=9kLd<M8NI2Rp$NHO?I;Gt=x`5%)RJNdvAZ<sU)9LpVO2hJwfrPYqq4y
zj3aPnGU=f~N5;cX+%6Qjn)e$`$MvDhp!A`;<-?<9L?!iIl2L}?hYNFV#|3A#CrRO}
z238qHmzG(eqT?PBHc#vlujf<$5bS&-YTfeoeXe|R2m2$EuVFT?bty{^!~N?QS6V%G
ztx&Fj<|zp`w|QKWmgl01sc{u5^PY6qG^e1~$EGAci`zJF!<uDRJB>oXxgQFs8Fn8|
z=f5uhrLG*~I4Du+%pik!d|zN3w`DC%MV=ldaRmm|ysbj9n>NV4<Q-=(H=3y0jJzo}
zV%@ORR8{omlN)U}UYhXz;F)Jv&FGs4t?!ebJzRSmuO|zqYl2_sMUG15-^I$2$A7!0
zoe4cwox#P0@U`~%fblQYGuwUTPnpGUw*-4fUK(>g{C_O~$3?5{q$<q_c}v<T9Djc6
z^><~;Li^4&0~#*xyb5fYzU}u*W^3TxZV_U5No&?htbUcu&_X}s?H|6=|2;LEzyB76
zNUaxXD;XPw(jnY<->xF(-dtG>iRw9te6(DDhywG>LhEhVI+57x?KeSQg5^H9%;_Uf
z?B_}4H@k#!iaa`TUI->IweC)(n5!A3ZnmTPA~_?+(MDA6qTR5;3k7=N!HrRyrhzF{
z5><Svknc-y&->-}Y|)56-yciq3cbLY<Uzl5UQKz8?qE&%w(h5)Gs)|Dw7v}9E=TB}
z`KWl@>%+yYo6dt8x>!yF6pe_rJx!ugIY%sN#>mgy6oS1>>FTOkRz3;`iKnce6aJgg
z@iujSXuMRaHPKIPp}#LD@;on`RkWT3MuKMv^w8vU<axE#0UWM6pzRWKM&5UjyiBO)
zO&VE#xgu7aS{3m!K6E^+hU3!#icScpJf}5Y(5rZx7c@WHbFg-H-i6NX8MG8%6fXC=
zdKzwxN*b?y2#tnyt99EMd{x`)V6sS$V7gbQlb*Z!nQl0N2Qy}&=1XVUH(i4h`(VJG
zns8S?za{%LY56WANi>2#jFT@FVcD#+j|VTU)>UrC`iTlXKI!UjUZw|p4^|_owS18^
zxnB5QUqv$np~2>|QW;^|#w8FDj<`E01rfUDb0BvqKB<#c;D`Mu^-{9KYPQVSPSJ&q
z?$(sA-+Psr4@;DNq=>dTH4+0~)iqEUlN0`dJA{a7^&4jJxjvyUNq+*kW3c7kP|Q#N
ztft>PG%hahSs`l;HRv~AzBx-E<)H7pg$1or8+qaWWuMsI7K-J*Rd`NzU9ha`9fB}3
zV~v+pGtwv);-kq2Jx_t_h(V+jkn0YSh`i|)bJ2fut6h8@hK2AS*X+*(De?dL?7OI$
zvUR^ZdfvM@x$O+p{Kp3qrv60c2NR$c&pB~hFXt>4<wA=e0kQi}9ou*5$*6Ft9Hm>R
zEVVCkeiaOJt4-Q94tJvN>`9j&3xc5Ulm#y0FOB)$sy`9!=5DrO#a;$WecnCT^m3};
zg|H2>-vn^FW%E;Sd>M6i7T|Qe%NDbJw8N!eGw(R$25q)nj7Fqq&5SwtEz~DHnoFii
zdKdga4_K4vd|f$z`piRi^7y**mKszv1ZUcXOMf9ED)TXWL7Kir*`fOISFXjH>MhK*
zeM1c|P&{_suJ;JzI&8{gD3Lh%UQ?xiS<2aD!WP|uFa2Sr)M189@yT&&?B7IIYth-+
zs-%^o-k(|~he?YwjCVg*pS)cz4Lxw8xvB3GHGRLl+$$>hCLXhJ&-*5G!v{47-s{(`
zb}Mm_HcD#z<}=ru;Q>xur<kbSeI`O!Wg3?mj}7i|q`~d?QNQ{VU;`v2Z7qFDLdk<6
zotp5@X+1j6-H9UHIsPwI8tu^)!>U@i@^I#uXS=nlKBMxZH;wJSqCb}R2byj`<q7-G
zi}s&I<}t8$obBevUe$`okzmc&9Hde);aBJ1%l3Mdjm9(Rck`8t^nQ?s-$Y+UR&qX8
zT!Nf$<YguKd*k~?m&Zg~lP3P){fq~pTeTeLmozwUp3|Y=S6_RtB~&X#^#;FLiAy`i
zvRzFkANtY&HTn+U?BM*DM~B8w<VTB|RnU;?Pyd157aLWqyyLXBJiA$s96>|Iq{#j<
z$GI&{HM>T0^?7J_dnEVUBKd&xf;~);Tb1uS_l?FJHL5mhvsTmFj8fgpnUsnQAmsP7
znzWsr-^6V`l=4Haso65U86W=o2EN(2>ATe1d82~wB1tZSJhk}hOYdQSf$0a)MoZi}
z8ESHw28!f!8i}(EAi^$)Y@T%s1U`wp`qW6S@&}-I*#)^q;A)L0wnBaE1;uur*Gkw~
zU}~rmrK_%}X9j}icJ#Lh9LyrB-6c&yY=ObSpn<jf{jUw-&%T;FS*J-B>LTlnMV5TV
zFG5aCfcrw&J5ldw#w*nVeZF&;o=jRgl}E(d=@F{&S}72w{$^#7harh=>I0K6<SD;z
zttyckqy&iU?TB)%%BjqY9C%cBL&$&Z(^_O#_lb&3Hj;%HhTk?_QoVGqE+-7}{tcmS
z6R<gIe{w&ElKHK#Ja)Uj@JZmxCrJq!38hW9gL+PozP~~aw<@(4{B2PsB5Az)Ehz8I
zw~&I~gre}|jwrfTzQ|83SpqorAo0!wY5gnCCm^ZqgKmhf2Yd@EC9N!u%D+D;TJ6py
z)^%&s((+G-xwUAwAN#=NrIot&uvf=P?;87e1C3c4OwbiDQ#vPy{PLQ_Q9-8T$s&ui
zBERAG;*O%PQ344wyxS>=Xg~A8ru=V4H%Dkc4pObM>c3<j;6n46{J=+aYIeEZz&}Gp
zr)cdH;Btf4ztcw4{G9JU{w=`Y5h)O26uB*Y^8yQYlfx>i5#^xf5cOo5i<`lr$$){^
zPkis?>(s-i{VI8DRZeK$Is4B}NigAMgFK0=7~aK9hH|YC63z#E+$Vk?%W@&zs}&z}
z@XrEc<o%>QtPsZEp63#tavfBRSU|W#31tChGvstH_M}O4zP>pnYNU>){PVg6(IIku
z)-$ooJt!LSn+p_Cx{M+ybEv6}dylp=OdzH!NdafrePQpTx!UgY^Vj(?C(X#1MM^8r
z<G7mRCs(aQ%kolKA*6?0paj?rqx5m<cULK7ebjn2xUo5LKXy>=$tW(N8k=V$@?-X+
zQ+e&J*PEttUXVK(ZO&J=(MujN*^RV=QJAddqO7Y}Wc4!r)Z5jWud3SZ&<*)lWtL0d
z9r(>*F8&uEfUqzB!}Qpm|D1M%U*^4UW6OGnA;*45Y+dbE7++qP&rrbN$E^N<ppQR%
zCrk_e_?mMiJ@Yh^t*O4*JO@vHYc<jKrf=?sz(wux+MwA2*Is%ym8tZU*FDp;I%I?P
zJi>2H;?y8mzS>T9(A3*8AjCvGG*;Np5u1T`KrmUJ)a+u<eV+7HOs8K_rj=1Wq7F}V
z#rr{Xj?gTrhd7_#X;?@!+Rbx<^NSH$?J%6hPYC7ayvwuK3UkdXF45!+sL`1zo@iP=
zU4|MJhEg{QWg|i^Kk~%XEX>dP2Ovzxm=PZ1>+=fNix)Vf=hHdg2+V{?V+B58a_`=>
zP5q>o)^vFB#?h2b->dqDcSU=Kj~FGVLKwuEa4|a(x1v{|5&bV~$p_|NUaLb+2YQgY
z93J1wSBn1f^PC8r(%wLy9Us=<as9`&&%c_Au$WEOB#1!=)2sNFr%tMc4lacmQ5?Q}
zd`sj=<xA{4T^RSpy9cPWrErzzx=1@VeZFj#4M?L*bh^Mc``j#{HO>i*I<VRk{c*+k
zqQQ*hVC~6P&(h3`OT6l2+kqIrPbXsU2p8$A7-BHd%howg+@WZ?`Mz&OLBjC9>xZvF
zBk8T(BwhcWnt-{!tjST1Y9O7hbuCB<)23OYl#L-~%P#zo=%VlKIt-0kK;=h9O9B;9
zm79|HlIyvb!p9*G`JuV+TJr3!`r_|T`xr0_s}*{{;vzMV#WrqSI|l9R@tW&|^FGBX
zF5@SZg>7{X5`o=!COGnr3GJvQts1rBB#w)I2KLdx>bJwhiUP;4xl}Xa&(<g^4e~^U
z*Dno{P*)NQ%eNhjN+g5>+&*O&X4$z1pCB`bB*S~<u%hIRf`T@j0ycV!a!C+}fU56f
zGs+HZaoIJGw9M&AR8-F8qN~;mK8k`y2QtL6{qCt2mf%bDF|4|~vLYLeSYGhU93d|i
z*18mkfFu79miz~2`dt#^px01^N9rig3vPX`uanL&JWx8>yq|$iimz&qfw&XTN`hVI
zNaz5S|D+GXjVc<-TN-Y*ruh~}ISe5Z5bi$sF!InV0r$EM1b5HTh4Y7tn^Bt2Ye;FH
z5GLxM-4<d?5Lewy;?0mSF(*H-P^zoYgSOdx&R>*nRs4!TU0}-t7a&0J94bbU3S%8H
zodxP4&nTwdQC5QjYX~<HV*@T8k3Pv!wv|^51Bku{7!-iM?t<Qq{v|4zb~|H}g{cHq
z>8r*&Z+pGD$p=HvZ;S?Ph<*$=y+r$MT-iL_=$f=`nup_{^Q4UKiX8vu=`|2mia9cF
zrFdL(pvn6cs(Zg{P>bZ1y_1sNr*{L5fieCWezgWD44vKL-;-QqyXlDseSa=_@lAao
zG+jY7cfv)or`Uc_TUo!lMMa7XB?t{vSZNfNQwfkJE#f{%3p{>fa%GX&tW&8#*$e_A
zpC-J5N39^Z-&elJXAC<GC6CA_554%kmZ*fwcXMFTfL+JWCn+-V)1e{Z`MS)M1KhAH
z%=K`GD1{PE7$xC%KjQ)&>9bpI7N~28)q#E0zilkbl%TKRDENI<=_}J;fAHcphtih>
z5tQ^Q&kcUoxN$ft8VIds7Ciw$xV}7k;y~j_J8r4bZw5$U9umC-XB(h>()m7^<wOdB
z=b)Dz=@|QZSG&x(bW?4INwrzsghiCmkcO4|gWmXUks5;-=50i+KF=xKO*kHb)qdl!
za^*h$O8rxHuu<k=AdCh=DlIATz~5eXD1B4IgBRjwW$#*eG#Onc8Adt;$>FdZ?(J~)
zk5V)vCvUgRV-+zt+$JZCPMN2cBp;!1o79RlL|3V{C5X{M#>j*GGWU7kha<qfIu!j}
z-b={fgU<x=A*O2cn{od@aOGxv)tjJCT%2Mwr9@55=TAwxagEKxk-;f*(&ObfK6&D0
zVYWLmojtyqc%7t96SLinnm>9e_d%acR899al7gZ!4X>wI2Qytr(rXl|UG;j75&0-~
zBhgq6nbQB@PeF6~515^4WWVtIYoxwyQvsp9WS0zNP-jZ7lWwCsx_rf#YmEO%k252c
zafpGX#G<}%-}q4x?LJHW!7C-B#r3Pg=-^A4Yp)9rK1A0jA4sNTs$TyKdabETN*D(j
zdvzE)4s*qqOH`xueCW%jE99x`Fqd8Gz7*gVtcVl>Zkbj@18ah?kP&-l0NKD(wDy5@
z6fAUeKmGxJZ;oD;pp!`RljR306-OZpY)fnm9Kypqr&pfZHgI+2ft;}70uxF~7Z_qc
zy$9Eq1D%=&c9+=l!}=wyr|m8e7}`G^d!ms!3hWEAJ<UgR0R(hR9%A_`*uuDI^&)dJ
zjzjIy{hFH>cmm}nF~v2%!h6n7?4>6+Skzj<q3fePLotHFp%DA2_;UiY>BlqpO}(lL
zb#vfON)%)>gG0eJTqA*M+?=D~PcaSlSWf5L#ib6a7lUJ0KdgyyKg`&plG-$4MBcqf
zy94Bn$$S^HXp-Lr`LtjMW}B6bnFo3LN#=s0sH$$~n_@0J(uRFf%o31<MuWck2eNF6
zK_{y?CmH{I9kFFXa%So+vu-w^d{zm7^}MFjX(P!Pf}_HY+^=oy%b~I(hlHR|C|6I1
zmw9)h1am$Dwh}S>qS4eYg9m%ps$A`cL;@V!As^O7NuILkAhKuHNkSqG6fxWb+M>$_
zRW1{&IRSg$7e3(jbb04XiRQO-Ros_nsGC-w?AmNS&2t#N?f&tWfJ)LcyS4?sWEsct
zg`qzHH3>#t!$9jG4eV)PAqH7FdaJ@EG&9*OwS7qkC<v)LJ=bB0j;a|WdY&ow6h3r$
zwNU8L%TKZO9?rn%k%ILxm_+N6$L8ojWisV+?HaJ*Ck<gE`!%IHFu$d9C6}U<hX<XQ
zO-mryL6Vzg!wCWdd`a_HUi`De05}t@IFrSf{@SQ0V^lCYvKZfpHIU{2?yPEB=l=Sg
z9Q=u_3@{K=JlRwWI}rGr8?<jIv$^nNw`2S1=yOL7pP54u<o1;BL9E|#$Gh-73U2cJ
zT0(sT_)*!7Ra#<ga~cvacI@k#8$n8=o4-HEbpbv<S#Q7m$$k&2^sl}zv+R||EK8;$
zFn|q>d}I2d>>rE~0WZ#%&Tn;d7!`<AzLgnY6w}doO@eRGz;NDd;1y9)Ujo&8<_EON
z!fm`i<LSm6I}myuJDELjZVy}y3jkZ}1|0<d!3Gj=lAxC1^S1O4R02M&(Q$iOq&SZ~
z$C>Lez1lb)&H+e*asW(O{<01#zY@k$v>>8yPrTT$$I_sobMWYG`(HC<Ca*ro@GTnK
z|L=;x)ifrU;dMz=w@Sc7nB3KFeS(8j*m0Am&Hp}-Iz-?$YZ*iZ^+X%ol!FC5d&NIR
z{1n%V0j??QV&Rko;yuYG^aN>+pZ^a^_yR9>^rhx1g{UNXw3D;%YM-rWyx8u3>HY8L
z(a%8KCAZX^dL}r_+>aiA|6=XQK|SI(G62vO0B@lorECfY{GsRqOy<FEzvw^ELXVBK
zNt(3!u3+A!sEW5k7Le*e=b<M<Gy8cr|J!_gWF&wu(sDAh1#hso8U>h`4A1eHe_xhb
zt$AOg_?lDOmF9Ca(i&yP*k%&{KeF8azAhPah~5q+b<ZUg1)iHt@JDJ{iGP3a#RYG~
z>sgL)gqG{<Qv<+gnRVz0@!u^fB49BbC4)`nAdTE;XS(uZW->-<MR>WbcUbe^ZRl}9
z{2$#Njb>?-Uw0HBWl_(kClXBHao^X26|3z)O>RF$U}XzSq{6lo`rZF`>xK}C+M<`6
zjz2CbH}AeR-4a1RH+u15{KEe}6dW<=W{VGN>c-%sg;erOq>jMH|2uhbD$0gVR(gX@
zD{;y=szfc4(I?Nxz5n~JEgH?#p_N?aNqe|;Lmn;+-QpFl7_?gP#bZiB!N1i*P?G$e
zHIbx6-}<}z7yyJM(EwmbGTGUY{qGXh@M7&9Oskqd5R=ZNP^a(sJN%F%>Pi&orl@ZJ
z>2t{a_cfLjNK54g{o;1XhgA@0GDikr{hzq^-+d$Ef*BNAhm8p+0d4%bbXHlHHdjK>
zw{9X@N__5Y5F>Ab%!Zlf5j*MC%U1hnErt~;5PA?#)?!&w&*D2jtV|q0$p(^q>rE?#
zAa{GTq)u(g(QVb^m;W|e74(48<x(SU$5Rt<A~&fke?D8e@o&k&1?WjVOYRu5hK4`Y
znued<)&4uO{3<WNn0WaY%n+lLT4uJ|R9_TKt2R+!YUpwJxYK{PuA=(l;Q&{(cz!7h
z_wROmc$d+0SA3JKEaShwSHt@~LY?DwVp=qnQECL-1fUDuHe34P|9<K(phk=#_-js=
z1V`%-BcTXvI@`<A_zS~Ke)<C=J%LIBqoS%?t{+dmKuY~J9dr^$i6;8Qp$4z+DAlTg
zRoN5<6yOrLizXm;kF+;t%|`y)n0%zhDu@&_wZEl-b`{hrbOoBEvzf8?QY+LwkKF;u
zkqM&T&+;dmZED$fz)*!=An>s#wy#&b?)Vi!B^XF}G+B=3J;ziasx~-^@2iCWUuaU&
zyrE3lrKpie1o;zKz`#gp)S8iHt5hxR-@~s#y+o&3JXcb{-{To+*T&sjhI0e%f~2YF
zW+mu~t3QuLvR5etA~9S9>EBKK?~7(YhxF5j*^L0n8_kVGKUf>963#EL5Bc)^U3~p4
zaijrJt=UHuej~7hH@NV=a*|EpK;^D=XeH;B@V}`hIp8-z>C$)=aHF8hQFtnJK4i=&
zt29b1EhJmf@PE7Ng*WkIYDXaC+55D{Y>twU58lZQUxY!3{Kfg>>7MYiMBX2J2zFye
z!a+w{s0zLHf=DL(cq;dLpLUpchz_iX<KcMCw*U=9&Az@ulHx~b!1Mp;pHSs&L#H*t
zvb(@>ZNKq~loc9FcWP$_nSSz%{SD|tWD>0~SyF)NeQXUx6a%BH5r=S}*^rC9fGLsm
zDDaiqwjK+YOJQ?pH6IaWj8L}U;EJAL+P&eT`M+<k4BvzoI}Ni0vDI>8n9=s<%ekWi
zO`{D=9`{N1fhA(0?&R=WvwSMQh0!_0X_UR83<$LWQC+t;U_mRvy5v7rfp;YYZ&W1@
zCafaj(t;LzqyGEymte&oe&b>-UC6=uSH*rz;x<0WO!*e||I*9>+P}Gf4>=!$aVEd&
zdF$O9E2%-*>u|UtFx80Gz!0L`th7*)lk$a99l=4uZ-w4h`*X0I`rpIN=7J4gDzmQf
zDJM^o=Z=0@BbU4BS+@-os5j6iM7g+)7n8vPk?%VMKc9zQppzmF0FnpO1xyTK&0qwX
z9U~Bl=XLrp0qtjq#&3r6t}f;+tIaA;|C)W9SsQQCUH8F4+hg83a$~U~#VrW|0O;?{
z_MUdI0~HGHJ}<CWoUXCRdZ^#ugfT;=;QQr6ISBnEo6RPhCz|2YU4WWSGtwj{4hj-~
z^G#PSBaurp6M*nP1H$iFz(_&B6PFQmc4%(q2PFEvzytSIKYdL`JrWUNvqJLvp{xF}
z`6Y;zy7}tTz$=Q*@dO{pwEwow<9?Nk8Q^LzOIIL&zE=o?c>NFczeUkAg^;Ox+ss0f
z8o0YdE^?%&FdM&8A_emTQcSJmC3`{NSj7F>KPnOExt*Q?^WY;|uX7sBegI5*Z>I5x
zxOp!D0Oy>>OSP`p;F@9qnItCl#(QvkZ~`P@WO?eD2ETt3gqTbK)sVS+kjBmuQJPki
z=~#MC;obx|(Lxjv*D&>5=dt(&P_DVv$Qy7lvU30j&I@6eMXk`A{6>WD;Lz#iPQEet
zBd4n`=;Y6*MQ}h|1EbbH_7PdR&CFgPjV7)zf1#4G58pKRHMe!ZK-`qlckEMD?oPC8
zbY*u5g7^1GopzY#?y-{wfde6V$JW4cFz&P&kd(#!m*URVCD)(CTAZjFW>9qA%5nm4
zzHYdN42)mu_v)4c_Xul=<z(1yt%j2g$Qfy9Rsc&O3TkSB{9#Q1kxO<nfIi2UgVLkj
zP_aG{5Emzsby~|0F4LYaB;V`>|E9O>TU}|d&jyRt7km{0!(m;+9|9{6auGF7nSLmS
zyPHC=439&a8<yvW<d6oiPlD+&4`@Ir`xxA^rb60jGk^_PJj?GfL^Xpuun-?W)AKM@
z0Nk61iCsSW>n^-x9HT5xikwPRfvPfJ4-Fq>5uh*O#-0!=wo<_ujND#8zg_W4tkfyZ
zKLMsTl0qXynla;kD0wdtY-2+914M4f8Bo3ho5BE|$5Qksfb(w2ucscCC|Mb_GoVR5
zAi5NZuSYX{1^5H&l(5CmOuEmS?4CwDr@oc}aOlqxnpP{bnRodLoHj(qCb}f8B&w~f
zjg2udNfGr1w9qsD0>!*OEr)_{jm+=5ANj9`W9Q*O*8AHk=sk#DKS{s1EkK$Gr4o}N
zg@B($IEJo9bM1kdFmj7kuj;ev+#L`4#>9`O>#otA9RT9%M^V_#$VL6R_e98N50rPN
z;Z}e;uFG?!*S&7JcnVVjO+Uhu-QnBRgbU&4As2>y4mO0|^jZO8dau{ZN`zS{ut5e4
z8tDUE5o?1j*s;2}Zsvhnph&H@rB~;Bcw;wj>}<tz7_E%b+`RYdijm|@+Mc5uOfSjC
zZQ6ENV+#b=JOd36vORBkjGAl{<*4mX3U3Jkpg2_g4={5P+(V9={eUxjJhFK~Jl5uB
z%3clzy|34=INQ&Du7!a~DEJpRizITvAQd0ad{yZ%8sX#)!bY{GK#Q21^hVu21!RB<
z*sLk$$1sszkcxHK6_2zPg~CRJj{95#t-mP!^>6d}PzAghuXbBU#6?$?za1CArXIA=
z2ErO9c`<cF(Y>zA8Nfe8VUm;P?Z+{fWA7PKk%m1lFr!ihk!Qaj#0pEtBjg9B!Tuy)
zqXg0|;cdXyqN$@#y8y?SK~OqQB<NEwsb@f8lIuk-ZypeP1L7OsC>cUbAeqWfgFBiv
z@MBEEPlScB`KXX8izvrC@4$Qk-bAV<pocT7zogWseG5<b1}L<%fZ2+8bOXZE2Fyu{
z86i6a^h=*ArLYYUUk7<mGpN1>=5TL_kc`>*3b*WqP#e%gSg*SY))t+lO3y0h#Myqu
z9!%l@GLiLPxa#-ZqZ+I(DKNDp<b0$z#n$INYfy{2(O?WHOtnweU|emOGGC#_uEtT=
zM@4nG8ypADntO)R9@93+U>M(*qMB2Z;|vheNJk5GATMML+9Vzc-H1J!<jr-vI)4Ye
zSL)SY@GoGvpmbyzMGB~ZqI}z_z=)WSe?jj|8ax|s^ERPozM>CWOMw3pCV){R$HZZS
z++qu0Xq{)iHR|$QthsYl#KCu<S@U}{?Yqje@YyC=#|~i#aA4#%_+OIbekoE+ltPut
zNvhVu+IT7IbpYXs!EdbExZeRi8*$?$wXchflYsK+UhI-KcrjVs3*#UP*d{z->Z@gM
zp5ovei*r$&59$4d0~Z})vr)k6=4ID%K#cMR3@<T0wykVdU&M{wm{yrGzcT+AfUx8=
z=iT)Np9k%}vQU@HTzQI3J=&n>1`o|O|MNfYTlt&B0-0iHyDnMDqfnh!b1e57Dzx+S
zil+QiV;a*QH4?j7<^@CnmAT_*rN4(aL2R+jFhOG73m^cir(p8YTLxwiP2A>Y{Mm1S
z^tJ^0gEM52DCl0yaU#5I*y5}yp!k0+K#yRzQbZvBWEGhZ2qwL<jGuhAI2#<~fY46o
z$Ro?q3uT~bf7cR<CB3}xastMFzQi_OhXIayT}b|Cd=<u_7JQ@J3`*!TRq!xgz3bqb
z8LiCcdkdRn_a$mfzrdBaKgeDURgMt7BNo#>p4d7k``)F6ZGKA^shcGwx^(Z)V#jR^
zVfHLCt$r?4DptHz3>-FXjgmi$GA%qAZ6(zG*Apd2Ot_-kH>n#bP`tj#8|p`0#A*<C
zXpC|O_mS2Dc~lGYz%bD_uY~5jL-69o@XnH#Ihj5?{#)%&Q)}oA42dQYjI@0~+GBUi
z%%M=~2A{%nO=U0O0W1z+c<+_);1`Jc3wb78T>P|`*&c#;@R{OO(1u<Fg%wuEtUszA
zrq*jdxmIz1@)PrKDU!l^(XNM4LcT@(l0QFku7{vzQ7}3kFLw1i6@;QMtT1J^z&2`W
z2`~mE845j41nfcE4KS1|;6mIfN9ryBKjGUz1<G?8jtDNBOW4T)+x>k-fWWb3V9M2*
zT~X`KlqunWKpi-={8AZpm~%WA#wWvJnBjnpu0vAiJ^#KS*wG@~1MsFQy(sxhxj|>|
zLl~437J#SjSNXMHmys8Jr4pf*$UsRj*`ao7+6D6i2o)cW-1VM%>t(u$j9@f(7Onb>
zCg@RrlJ0@N3Bl>d{{`<$F~i$S6JTtT*bv8lS#^(ql@?9m3E%Pq7eFlDz|cF{cflM&
ze<C&O?!_o}Z(rRVw}N3}S@a(fnB!_MslTv?43@UO|2nwgHqbTtO5wTTbvad8C$^>x
zt|$3MvC6>#Z9qLCuD*ah{0=-ZoGyJQ9S{eT(zqq%Duio?41{R}*&LxK05g69w|_LN
z%tcOj1())u?c+ArN086e6i#kt_!?{l*2cW&m@MQY!6gF@lxqbTnZe*yfJCh!c-Kik
z9t->tFCCiuVbqMo$V05|tE9k+a*v0jj|v99wTs1=s(5o5<wgBE9Ju_D6DU%&)I76<
ztiOf4Fv}d!Jo>>xDf{I0*_EE#`kG%aZ1rTGvTb}LPW3GNT^pb~6i>LB`~lgBlqSua
z(PtU#x&MA{h?OD400t%Bc%QAGL`b=9c%<eYI6LH4pjg-lbLd33RgY!@e}!bs*gb;m
z!~><>Api$+CE(=sE3P}23NU;IPHP1Ty69Dxc8O!5nlV`2Pd}C)3$3O<HI88VIxESU
zDE!5cnz&f`^`YJIPCD+rh{6b(LiNnb!`rL0q;%vh$#OEx1vgdOLq8VR*K-nf5J?ny
zp*~CCBpZ~{+mh*CUHDqT@Xf{OeCuoIt0Rq|VgdSUFtCk3^_uF*E_effX3){5&;I#+
zsTAXT%|G&*Af&)mK8=qtjxOl6F2#U~5#Q0eYmgw#dhmG~%vKJO!Fs!N^&vtbnt`+S
z4D~{vix}e-L3ijvm>UJ-i`IX#7TZmLBD@$lj&mnuHS0wO%6iGqV#n%Tf$v3762*-a
zm&5@cT*D8lt#7ZbVc~8F)ihkXgxXi?cyWWMCmhFu`7(S^<y5uOCH+%3Fm{6eTbcEd
zRRxjw?aPbe$jl&X<Dekr(f-jf+M)f5A^l~puZuYF-e8o9=XRA>Y4xCeqT&*mDl*%n
zpp(UCdnTm!@L5Ec^o|I~FFsA)rF_ECPmd%FNN$-A?6?6aa*|HKAmXyt4A>u@?~iR=
zUu?y4Mz{mh8(WRr%0tRygv<IYNL1@I6Uu(=7qsiB_0?~2(}n<pXb(0VJWk&3EGX`<
zL@RKI+y&-Is_1yCf>=cqevChTLVLtO+!j`b4<$0++AqBNK20q>d0!)rd0<4Q3%*n2
zT)Of5!^Xp>3VUO-h`O9opOXQOFPZ5WXo<?;OC*KrvSJkmRrhi9o?q?Vr(tVgi|Tg8
zq%WmBWVez}h*5qbv5hOf(gVrY5DUshFI<DSBe@*Y8Ju?(`b9$SBPlTDPoNU|agsaF
zQW5TWY%FtP`$fs{1|=m9qz;vV_<`kDK?<_vmezScPGYJ`bhCL&P0^xPBYMX=<ch%j
zi!d57Be_SyPNG_B>%h0fRXeZcO(jTZi$dAy9_+eW6>Nbp;^oyq=lf&K5mb|#IpJ|<
z^D>X9D=ud72^sBvYRm~Vz((CM8ztb_l_Fj$y{XQ)iRIGV*{{ratg5MC0womoz2`DU
zc~nk+P-4MBaa8%d{#=>#AwEulM)p)<WfO#1Hw5wJkt|`F?-xbOnd2)rP<!2-yUjgJ
z?Q>B-OmwYi`)Ggmep|OKwQ4vnqsEW$E5~CH8~vl(yRu_ed0cw`m*i%FY+mll4Y{*p
zXyWy?a_<Qj>BnFY*gN^0;(G#=dt&MW8$P>7(_H;Vq9d?sExb=xxYR}Gv%?>cEq>mW
zrfB>-Uw!c0bHvnzHZ-md*lpyOcDRY>#jy>YGB7e;_K@UOgRFssz~Qh<Ll}S5GB+cs
zs)OxYZs$IwZM|JGMla@pP0tdYrWr?M|J-b&rG5LUGfGO$3&2BJ)KX)J1DTEP1cMNC
z^{|eWzyd5pjr*x9UOc?+NQy{=f^^uM^!+q_NU%mcI+&0KjMl=HN4#_LR;C{W->Z{m
z=)T;-16*G%RZ(?By=*OewVXOxQW%b6N0n&3@yyQc;U8C`J&*bJQfs2)>dUQ45mioG
z@RD=l@<Wj3fb^CiycdN2XrNO`A2!)i#SLf7+w<{~K5~GWJlFJ(8V0Y60lLKb+Y|&P
zG--jcD!SIDM#op57gd!#`*J@eR{7hm2KP=Xg1oo8f3cZk>%`#Mio*JOnq10{9z%UT
zC18(0CxY%G@^;}aN?<JBco)s@Jb~C(v6d5aisAQ9FMJ<K)nuRCRd&lYPvsF2pi?2e
zOu~=6oT!86rIS%j(YCL8P-c+ImP104_-sPk;}fux=>?`dB6|)XAAU#=kpt-jJv;n{
z7V;5mYk`Vn4qpH5SBoKHs)p{nS8f8Sros9u3oyF6=)jBVNJR&@>-S3rSp{E<Byn-2
zC{j5LQnFt|u@cuWPbFfo{M}68u?=my9YHTyR&A(aO62>}HxM9;59m@Bt}Blq?$FAd
z4-p3;;1&Ijz@19Y#H3v*f0SB1$D{L_2s5X_z^5Lp5Yv)UdlxX<MXS@XZzh?}v@OI0
zl{u-{ugg0iZ2Dv5>#HzeDK?rrGra5><eLw{QhYJJ#8PKmfISk|H|@Va{*C6hkgm`p
zA)Cr+LXuu&+N2gVzQ#{YW;pz^YV(1{>nnLL3~VSNtwhga_%&dcd4NwZnK&1#ly#0u
zqz4#dWD4EMYcS3?c@0RJa9DMTa9~tdOGJ&E!qpn~WY)MeNbjat-Ic=8O_^B95-k%J
z61592Ktlcb21uw>PB7A2T=%yg-jMZ`p{|dfk>oG$j{vznnGTfQu>g)6xH2#k){=oj
z+h`@iaFtQ*I!F#2N1v?6FZJkRDir%Qta8pN<FLh*T>q`&qG(i6!WwOKL-5-D8Kk-a
z%q14nz@QG-63kEf3Nodf?`gp8NkUxq{4Zv|+g-GSmV7k9;$81R$*hj2{Ck44*Ejrk
z`hpAef^L+$_X$!yP>J{9uDugRq1>c}H8rdc_px+mI?|Nz=Ea6^vB%-cokdCQQn_Ar
z&(JF6XVCR1`ExLCp~)xIp`;z&-N_-ptQIpKNP)=th5(t*6NIL-qO;k7U~V=rz_J6w
zUA5{d-Wux4*m}GJNAEh}IcKaw7Vo>7=M&Hz+HkKKz-|uF`T0n7VRgmF&4@YERd+Sp
z0=WvN7^wXaPL~SQe(|ubBbR>rq?UAPG|oVER2rfVJla}^ROsUWfKmxBL@vq?+r~|+
zwN_A!2zw1Zc3gbngwCS#950DLFPBV)y=bsW5z7Jda@zw;%IxKz%Aw>`I|FIO+ISw|
zLR&0(vt<;4w^E+dpA$6`(>LlD#w@BVVtF*{IFaVh9UQ~Ll$UP2`ZY4*T?WBEmEoj+
zxFYp3D}rrS<DM4oXN=(M^wo=s<;yi4k~P8Xo<sU}z?>YDNp1WzFNV^}GJ6bU*x3uS
zURL2&`41IAH<4!Qi`AEp$x(urk(u_@$|o2IS*%ay73?h~%>)h?U}IIcAU61X^B(5e
zOArGmhu*da_AlKlPXjrW5v!YRF7?4HJNl5!c;#M2Oc3vif@c4@ZO^Z=(QyI5E++|h
zFPi(wTm%zUlrQF-L2L;Fj{b%in<7!5l&>8+f^(W$t5rqAoRG%Os?T_0{(|J()-l$m
zkSF{9J?U0Sfh>r3pFJYwYNurc+;WxTc>DL*SI}0io}bQm(?gv8gcl$N`=7;)As?q_
zY(MThbYRjiI?fP*n~QIdS0;yCZbc{2DC@qDh`7`kt=my!pBVwk59>PL4Q&g@C8qm^
z*RBODOjY+EKKoU<F%z3+2Wo;}bhKtA@9U=-*g(J9wEajE^vb0+Z4X_$y)}0+wMYC(
z_<ffTOV6K!Mzh*n1>q|OSufNI>LWJV94_N!iBe3X7D0{aG{$RdtPLY6%$FOyOSudn
zzVXT<ikN+c#=!H9PQ+0eH}?=t9c(A3eh6hZ#={b60|AQB*^^B!wtio(LF3ZkPdQ`?
z@w#ygEIR{txD+yy>KsZsO;Zn4GrfL4<_-xumRZ)p6-OGxV@)+U+a*Wg5o$dNrFlC1
zkHSwukJ9BC!HnJ#W&HY|X&UZw?GfsQ8Q0-VHXZk&C~G~};{7?PihHS7oA(|@G$T-<
zliDe7t*IE<N%S%CL>_tfnI0ib#2`{HZFt0ub)j-&^>O>oz&-kLBaDmk$XK7+xcKIE
zd_Ke0Abz9A$3L|F+2t?ac#lGu!ZvT*{t+(LNq`+rCzNR^oYnEE;O=LBnEmE$5IHmt
ztIX?Znym?XclNMjuH-iGA3`IXI0+v)ce5_#Il7xBUdl_Hud}RJ90#nWV9_5vWYiAh
zsM{n=A4&eH`u2|*IZ-9LRC}|oM54lYj4gKp6JPVdZkO_Is-BkEDME|K2AIsITnlKU
zky*f_<#;|%7SsE!i+vXQ_EyvAmmy=)@Tok3_a($@PtGuuo3)@<kg$bukY0{N(huHY
zdJ*6b?0dM^q&c7_-jEMPU{v)&f}>#U)q49ai&wRYyXQ(f|H<mpra+k)YZEPr67L^z
zQ^R1fPk?-QqNmSR;oINm)sOM?fvIeM>=3HMH-y9Xr?L3kgWgcnUFrej8U_DQus*kz
zQt>8(?zpDJn)^72rTClSTjm!&|5kOL3#nQx68hN!{qq(8F&<O8IS*)BYqjF2TFr+<
z`ACOV5!ypJP)+K&*CTS;tJGf@25<Xs?<<>fq3A+~Xp}-B56CkeiJ-yT4|eJ+g%n<d
z@cuyF#WaZStKXW(*P9S9D=?S&;(P<8dOpUw%DOpY9|WvqT#hBLmu{8MDZWc_HS>rG
zAWP&T2pxKXX?}0XkJa1`$3?Pd%lCJfNa9(RX<T_Q^=%bDe+%7=@vEx#zCbz?Rzy$_
zg{d=oS`gw=IB8C9UC5T8V|<k#XHSkcCwH|~dAW8?pZWC{%=7~lzBw1=Jpa51Mll~3
zm}Z3XA96Z@IS_+g6;8Mx5{VCGR+W!eI<k$hd~>9xC=|4mgD;pY`5u3cl>(QBZw-98
zPCFPNn(Dw@L01>HGvW#6>juH~@(CY(_E#L(Uhq&;jhK~ozaUStBT$GljhP>mu&))m
zojZ)gC?w`7W`wpy+tcpxHRW^zV-aiEw~co2JdBd{Y?b!}jj(Y@O=k<|o$?{+_RVkO
zt%n_oMq+y|frZbB-Y=7&8QT5Rz*GGldwwzYd>WMkbI{GjB(O4N#FgREh-on=PqK1z
zcHq%y@11^u#?szXCecCN`Z;>e`>}js1iLHA&wlO-49VRUZF)MD`De_(wRs;YhQ~86
z`WZcv67Ns@gnuVB+?l(7o;r?37j%kzLxbsf6hdlU<L3{^9cl*12jinPes0Qx5R$bk
z9=ytg;dgnqAfY@)!WUHfffHz%i;#h%csSKD9h>35IkQ<)Yd66i(!95gIQ$zIvNNRz
zx$py_kg;SQy{W8pA-`=Z7XwEO#-^gA6~SQi^OxseJ}bz61Zus!eP!>aeHX!lq5nm(
z0ZLAs)VEA+lp>OQIVnlc97;3;veJ)UAV_=l1HEgj9YvGT#G7X@*Uu+J&F<AXEGvK4
z;d_wq!Q5mU2$(xBKj7vduRTJ-)Z(rQ#Iuyld>G~sB=uA#?tk6<l_vFk{<pwYua`#E
zs$Jujn=&s;7?fg$@0DLSPg>2%Hqj0(g|d@S9Q&}RpY8&2wrlO~d6nbKcIn7y3L1yE
z?#ah%dg^Si3ls+5Voy6UxzU#t3d=<HxpK5(XcIXNj8$%P0_(d67rPrxS^V*`fsXH1
z@HT<eS|1R?E@t~)*&i7JIP&T0(U&sn&<XrQ#(c~5<%mG`Cp(IKqC!vDe8&?NrLV%d
zQ`j{O(dCLzK2d{Q??MG*;y5$n(XKfB`~n&UOVkTTDuQtuX)#AOc@zR@Ab)}bE*1Ot
z1?kgQ{Amy3=tPsR%xcMshSM)VmeqlgiQ(rTh^K@{U7`U;pw^5>|Gpvh-DVLCe%3h?
z%t*N^d{@(>2XqyOxAdjaM!n2CnO|U-iB*1?!OKN^SBX1OWOS;E<(x!mR>Ylt#=xqN
zZggT=CG{%OJeZZ!$W-Q%{o`-bf}~+jm3v)Neg~CGQo`(b#!*sQ?HdZ2Tt?&Oz)3Vr
zw@6RoqUymdSDhFH()VW@t<oBECFq>7c-+`B!OW&eRp<Zf<q?O)zk_o_e{tD=05j1R
zmt#oGOV5p0_d=I%f_j*XiyQkmPnE2Dpk!8AkL-KdawhNiU^=e5YU$sD>9{jsyNjVh
zt}y|n4;g&Jf^NL}*F_R@4i~JxF401m90PYf<qRMj9p*scPt4}2ku4>fdCw|L<jSE`
zY@}K#L#wH(sxTejg+xuVTJ;k+GV`e}xvjgJKJ_FQdDzxQlU|9DY19rqG*p**E<i$|
ztSit=&ZtNR<iX?lf+v3c>LJj?I1GHSHlVzoI0FTK%$tqt2^ZDa0sd5=KKQx+Cy?=b
z8dSk8gGTMsJPJ*=3oKPz>#Nq6Jv$ID+A*TnJ8M8>Po<^Mpb?$K5&BB!|JZx$sHnC+
zU>L_iK@>p{MCnFAkPzt+q`PAzR0IL(Zbm^=kXE_`K^loc8UtwtNy$+J>1K$5fo~tY
z_qq3ZzV-h5t@W+-{&zT>z4xzk?>$~2L~<1+FAhLa*@ohsK_Lyy=Q$<yZdDgJGnTcj
zNcm6*_I{s9P}fFB)cOn$eNMv<#G7))FYQP6tLDv)CS<k27)rSAh4^Qz8tw6CNhv7V
zEms|E@6;pg?6l7qax0oVj+OJvXrs7I$u4@arQSL@_ICVF>p0ijw=Qhbs60$&zhK8X
zJT86oVDmb5$;95eSk%6x$tQiYSsdF{zq;=?JLcCN)Q}LuMAz=pDmm#QEW~>2Q^RV&
zY7g1aY+#h{K(^z<9|D9vfixudjk?uAFOsfNMiX{iA{$wBV2hD|3+I8Y#EMtbzQfQJ
z6^3euF=W%qm8SJ8Vq-HyN)uA6J{VpZLcc8GRLa6sS<<z*&_^jlO3ydtnee?+&$axg
zhRpVb7^IL)2@jYgq&ZO<o~QYu_`WP&QbGs{)z<}Ed2beTc<z<`kpC1S@hHo{r}qhC
z<=)e5sd9upjh%<OyydX!4DRx1lJ(_)TpBpZBigx*kv!ij%&kt}WTC6SNVU8_)hIsL
zYUedye{tn9J^;HDkg^=MxsegWBoHvKCK*QKV7KZ!Tz4Z&>rjQlwYqMicz5mm(c09F
z>Kcu)w=ZRbHdC;7{AaxaXRSlj6)o8>`81zo)UKb`Z~|RnpOH>;&(F#;>H@0YW6;)l
znGq800k!ld81{@`<sa;asOB#(>93}lJ!d>z3v>EZW`<<;%8zm*r=p0U|2@=d0E?BO
z)^d)z#&pSF`R0jwq~mYWpw*qg)uh{v(tFP{c1eZU$lU2-%?-clC&x&tt;0!{_<LvE
zv<SH}r<c2>YRDQlc$><1rvwhC+;AO3wV-pKfi)hExVN_l9=jjsx!t+h#KWv|DPH^G
z3#D9ho0JEw&M=3M%675c?DToyqQlf4bb_SUUb8lN3PvEDWvijA6whg~rC_0NWvYSO
z#aSHTuFzaWiuMJ+q>VWHte6`HR|GV!$;*%Z6#K1iCG(8bq6ubH{cb)ZU4!#*-6Sq^
zUxV-_H<&$JT8RJad&d(W+uZRM2%)sbhs7!h=g)nLHB{uPB(dWCfKO)IBiaPJ0g@&V
zu>B%|*sFBcg|u6_pAw-y!fXA1j~LS3ZMU3ocfVzF#AfdpuY|tXGnK6-oeYabI@UZ8
zCVYJ_gF<8ca#f=?)k&1Uy2#cJTw|gS;ZPDT-8dzS&zAN5tc^K;(0rb6a`&s)Wow@)
zu6r5ZSRcnwc|9GYViqJtUCZ$DG^jkQ<{GJkj+^Uqd)4Ni^hhtI>P9)qyuHQ#=#;$d
zmVBPkw{*KLNT^%PR$LR@yr`nLLL{@t+({MHv*}9ZiP?XR?C5UDFwy&G_mS;QImL#4
zoZJDD_SDn5^Vg$_**PQTrq9DAW@8bsdDrQk4|r5lr)UogBDi^+xYmENJ>#Xg)7a1O
zY+D&(r+`IdxKYD`L5uW?OBAc*JpZ2eP94^)e#ftAr-t))cs`z7TJ46h+t_F$i7>18
zWv#GatILF5HU3(EZ*H=Um#PBO;SKF?)e;<cqxbSN@c9ac(Oz3@Wf(^ho^OJkf#VaQ
zL?|D!C47<IskV`!`U7E4Bl`dskyV?!B~=)b*a&u>DB7momKHX~^=CCVlo}*SNE9WQ
zHNbuC?I9A`r3Fh=@j2?)cUv4ysr&wz9e=HLlMI}Rm<V@aD|W@R8mTFN+OfScTcN}9
zi(n^8Qq=`cqHdnKT%m~z&n&sCQlj$V-qoJzx}c$pT|;%UdvUTpLr<5cH*>h8ZHF9m
z_pT7vs%N(w$o$e+ou`aeFV*rb)DiEh>^%L%E^E-q(qDP5Skl@f+pGQ;LrdDK|7>^a
zkA?N^>ZXPd%MMPGg=qDx@~{Y+b(VlL(s|@yX|qgwZC%YK9t%@XjTV)Zmz8P07)*6w
z#<p#Sq5siN(2>bW#?8c5YV6@MRt9@2e@`5BEnC6=Ys$7f8*`H9(UxHy`aDJ5nGKe#
zYRAOEmyI}NQ<eAlFGerJ*{-N=DHA0zl)PNb_=p(kgnybsFcGR2F=mst5TERN+v2cV
z%~<zvZz-G&Gw~ia!Xqeo$}kGMmxdkT(_Ku=10TvPwssd<5VLPCl$zU(ZJ&6<&zIl$
zWiPW=EkS*o6BDrF7H$J>Pi{HMwA(x-j`7iS0V?KE?vW<fGJ5#<Q9(9wO>VE-0uZHG
z_^8aRNU^l8IaSy-)bsh9YQFcLI&^}&tq+b3ZH=78+GE<(eei=vmwA>cP-q^Jnmmkm
zSX*S!%V4VmnM+lt9GF-~uNs*(vCp1SAN_LRZwm{1$~u_oV#_%y4-m^IsSM~Dp?RHi
zXqLyFk}UOVGF?f!{7`v_C7+QAr^94W|68;)N$I5Foz83a?I!v`hv%OKTO9^`S0me|
zv#Tvcpi~4VesWULQoK;Ff1)2nbOI)dx^_#2?#^)2(#Vc*Z1b02)oMS?R(*|JQpU!A
zKh-lQEEv)50B`hDLp`%oIYwXn<FhIS&LN-EhDr!m=J8koNq3iF2hJfula2oJNVEr`
z^L@e3lN%E0=Ze`8mVvO=5DwdAnSK}7#-{VXcOp`2zy8I9JgFI;7UA^u2tC^6Lk4(l
z;7SEr`E(auIxBP1-Mn*VGXb(Bm4g`WxEn08OvubdiwK<WWkYdK3{M6CdxCiMajejI
z+mZE&<y_k@g95KQ)^dtZ$(*ht<v*G=**XNF+FH$1nJLOBZu5gFT7jl@Du<sk*-wi3
zg}!Mn<(tA{)0u}|QyP^L;vEN_j0%{Pho%bBex1>;4tRu2)#4U%wBSht3*}MFvGL`f
zVmGU%a>UbHNmuJ_t!=F&&5Ow5^kh8@--k&r-EC1b--w+zaL*E5C{I_ens0<7=brTT
z@||e+#pBuemp4#Ty^qhSNbC1_IKcLMxJoB6ZX>yivv3lM#A-wW=Ek57a~<rZykg>Q
zfUl3NyqVq6Q?dHH71k}KhYO{S=-mciT!E-^Z8L+n=H7@vLc(38vEmR`2f5CJnqaFW
zf`n=zIf0_Ls(dCa)YZOW;q<eT9p_02;z-il9ZmH7^;JDKN`<5p#?{S@cU2NFoMDG)
z>>?(U6}#x_hT`4#O#APrK+Da-B}{VsuFpC8_X=Tf*1<LQ{ssC)YKrKh3hZfHT>i<p
za>~bXw`$#JLiLY?x{k13wd&#$>@RV&zxZnJaF9|k3nn5G%5pA8KSEaLnE$*lBS1HJ
z(|CvX;S?<F6zk0DBM-gj&&3RIntaU6;@8W@GH7pZA>-uf1NvQ`jrx~}-#D9p7|KnN
zq}5c&a{>m>W$qQ_%Re-$z$kx_m#4D0tnqVRJB$SmkkXayw|P4E*5@?~bCOLHf9{a{
zDNc^$^XE7H{5x7!3WBnh$6hSVMF+^_tRN#d0b~eEGuZw$OAA7-Zw9eR6rpQ6JljJ$
zLCwyA$>Ig6!t1Tusc3#!Y(KtsKOHs`7GXbJlBN;$-7EP^x<a~S1CJC>6G8lUW61=+
zHX&TVhEYg5*YvUA7b>M-AR_lk<FnN|nlPgaE)Shu=D6d01IYQCip>Hwe^8+AD?Jqn
zVBBi?&LaJO(jLwpZvEWx@4Wzv%ljr(^^B=!>5}8M2c3Kh$Xd>5k21Z@CCXVm-D>7V
z8&WwY`2?&0|9bJem|%Wj3HR{l1&O{Kj6ib42Zo&*6W{rOmuK=NlO!s(xMsJOsiIRS
zjY+wv-;s+g{|epzUN9uv9DkHPUCZ-O99@6Cr1YsC-MpT3f$XC2mkS{g-}*b)5x}f8
z5^twP>hT{X#xUlQXm&8$sARa{S5~XArK^;mJhZ|WO9`Zs8vDfTc`hEql*9`HYcowd
zd!HtaHqpdaHFin^iAD?qeVWkdp(ihM_tUN}eK*)C2>TfwApKwkn5`2qJ(`=pJJavy
zo!X&GjsJ?HtlnFlu}LgpyPG?1vEXU`V}M0jn!V_BDE<io3xmZrDoi+Sz%-an>CU6I
z?{1t+@g^?hX*#b)waPch1_7)CBwGVbd~rWw>!yOUZqT>lA-9a$m!sEtuh!$w>ZeFg
z(?%&iE1E;4XzLNPQ!Sh|ptH!d#VbqSq|6c7i6gQ)m^C^_0*OTpEP%gblv;nO0&PlY
z+0zH-q!mYolm;&+^$}{E&d6vEhMCzvxawiVGkcDOnVC+`gf5)=-D!@Q%2!!=xm(Z2
zMEKCYLD;pe{AF{X&di7(H&HB#Eh|^_AEhj$z9wJi=$FB1r2W2BA`*~g6cC$>H_@kX
z9bc_045OroKKrYP8a4~}3_#44%P1T5pgq)n?`CD%kEI<87JP_LuqyM-Wkq!yapPfq
zWq_vL{$3--qJQ=?i`?6Q#hS%~h1#3qm}J+0h1y{XKslE9SGOR7AA@2k2fKfoo9d1B
zD{>g_NA_9Sf{#XFV>d&ar-|Y0ajUEsA?E-2Rxq*|&C`<q(qr`NeB6E2<?<(Qcm`WZ
z>O5D#q%}L4cq-<<zfDX;^bbTk^gkQRjj1H~-_yX>mZ<c{r`nB?r<GSYsN%AAQj+uV
z6qj@n2%#KwIYgAdmbs*nsrH@K_)@VSwYsJIi4*N}W|@CL{TmM8MZbd;-}@o<j#2p6
z`#JFbTOES<U-Ll|quv8#?(YO|KwAFK=R~M);C;3i>db$?2HHvn0@alhGyh=pXIwUT
zueb`k_+PU?sz*Z+9WimTgX`b3gZGNG;Qb`=)&F@-62T9|r0#UQ*1v#(h#YYZyl)`W
z|DV?smym!?p69tCul8@8FPVY&zO?WD=QXn+Iyl&BiBVhTzdvUdtOf74u4(_zYlP(^
zK<Mw%ng0JZI}rW<%YZ+m{r@uq=G1=7fS)b-ZAdF%zO?unSleb*IGmk#{$ck&%3WrK
z0-GXhlyB;uVmB(e!*k7|v3{D<{>llMG?01UQy!B45{;)+%7U?iIe{d~PZv*eP#V7g
zQpF>k&rnF!7KS3Dy>>~awl;m(^3Mq7?As+~bwigTZY$lKqGLh)pa8PQVpqZ<SdJj1
zTwsDJu28i6rD#t`riHcXf<-PG2XYn!8P*08@XR<pk`u5^pkQ5fM+1Qzu$k1+uI%o|
z_n*d=<opti`@^5_B#-u&^v<bmh}(|&7?qlfE(}%N4n`CP2j{$MBL%MU-H}2@aLyg)
zqO+O{qTqL<$7S>+{%ApO6ap!zt10FviMdxy*b1woFov{bUb-ltg0_u*OfBKsL@)WR
z_N6+*lYi_A>J%zqh?X+KJA{N{P7i_T%ggnnycvU#;eK?5zmJ2P2h)Be@wW-mqn@j6
zY?C?BK@EP`9b`I2Kw~Ewb%bGJBqXX*)PJ<Oxeq?7h{|c0!L#lijiI;rG05boZ4fM_
z0!VJ!b5PD{Ahi3gyuCvC$MUd|RuZ!f;uPR)Y#y=qNQ=}Fi1Kps;qeEgB&x3mGfa;i
zDtw5!mnK4=HrwekzwT2G8gsMH{cm2vpI!t$WwBTU#zjdHpBgf#@y95l(&4qB?><#~
zid)*C+W~;@%*o8WE=XY<9h_-(D=nYr7*xv>uo>3Py$D^|j&CfA5qz21qNrv$AWmfS
z%H7P`7b)Vs8ULcvbz%r7$Hy_({gg-GBqF3F%2Kp&8a`%j6X#jG%g5AD%!G(<CT5l4
z#`Rs_fE8hh0w$c;vw)NG;f3$pEYnjh;j@h&Z7;)+>2XIMf7|H-*xr)!@@Uo9WiL1L
z+&+D!@c5(Rl0*#MAci9z3NkAJ-5$EkP+l@YXbZWX=1xBr6TaJ)*ifTtVm4frGR>!K
zIygy}HeknRO}z)O01|h)p;4OLOFDf@S|3jTfi3~$e$Og%7xAovZnJMQ(Qpzz9YE?Y
z=7XFtKagj&wVzHkyY)P)%+FHt)$tNt#+l2)pF_!LZ|GUMR;D4%!mYlt7ib1Ie}ZVV
zsNK!t{I0o#*sQbWv&<5G!J@>mk_cL$0uB$cv*UX&-ie=psR0)8RoPU9H=|&!EB<<B
z&z767eu(%Z>X%F`GjIyJ1|Qqn7;?Q~a9<E3Y<&Vp8eVhtDrxI7*wsm~@UtF!oK8@{
z;w#mvdPanTeDsQ84||y^>i&}pNO<YS`)?1SwG)#NV<gccd9{wpV|zoFl15Hu)`9{Z
zfR+&sgG&H`lDIDZ{10xbmKmA%hvzOJaXkfN2Cs=w{4m&ss8c6ke&Dn1DOaICRLcpr
zw2Ga0qYL~Qh~nvLPEu=GmcSmz)@-NxLW<sOO1$B<Q)NH=!nIOqBCE027y0a7@%t(n
zl~D~4Y{6n@wu@g=Ub+ofM^YhoEKSWHVKnB_i9MlklD~!h4lM5i5Mb4zfMWq}z6*0;
zL%14|l`>i{UWLJ5^Dr};SU1^SM16z9IkLa;CLZJP2tN8a20xI9-dtAx^%2M$Kq0nj
ztJ3qrrCXL`r%PNHu^aMt<X++L`u(7kP>+FxUeF#D@FX$-lOY<}0pQPk0<6l(SUnPo
zo0^%Or_0cS9H{5V5jY72_+7wGY{g*VwsZ#IBYcPz0HLO3&xIl-nLcTv7%uYi5|2^O
zPL>0V;q95@xqVRbH0B!F$)06AM=ALtajTvn<ofXiKt*-3PWl8)?2Nlfo>hhe3Ei=c
zL{$N?8=(jTyWJkgR7o+6ssuN`2Or-jHq#0QR5E)UJ6?YvXe{uK-ow?I_>S^%C}GXo
zd*f3%&n4SqOMnYCwfU5p43@u{9~?ZLQ}3u5jG%?&f~?k4$CDi5VwX-P|D}Ab@R6MZ
z@#J^H;FTr^$jDw5?O$Lns|FUqNr)EG#%4L<?Q;^_(m`2|lA*FnHV_=8^_oN?lpe5e
znpq3!$5?ux0%*-O{NQYQs|GkWgrz0`SseGe_JVN|nCk&UNe%eSIKR2X(rS!c_;JiD
z8NOtV;ARyNpK;B2xWV9f3X1rxtU1Qx*+fz3Xf|AN$@SQ#^UE7Is?7D70K?e<t#zI3
z$WWnVB&sx%ACp0RjCw@^ETsln>C;?Wqhtp*bwj!QBrwOE0Rqv-o0me#EEta%3+sGj
z@xj5*vTpM1pjrp0_z|HB5Mx-g$CMZ{fcaOTjuDHc?l5>H^~3!#9&Ygq-)A5|oGZas
z2K$bKDNSP+wk07+&9ndqTAI>V6oZ?UVP?75M=8|~x7(G|a*Y8|`HY(*MaWj}XaEX8
zOl$EheoASa*L?k!$`_%8x<idg3682{NKVNq%h<XhCkZGl0!Z3Fce)BpZi6bhrP9+W
zi<8&xa|`>x0KTNT)_db6XE)aC>dWT9pXXox1}KRG4?A6ZI}~zHnQGdx8+>!4d9{AP
zd00wAWo5lKmp0Kw%0XQKQv?@KQH*&zoa8JwGdCRtQU~G$s=1Lx>5y?0O=I{W;?A9q
z^4ZU!0wFM#d)IVt0d`OWcI3}P>5OqkbK}Q07W>#DErW2#_MQW)i{NkZlTchT1}ZH0
zVG7cr`zU#@9F;g>p`5_@qwdRW!hziSX+$W6GbGA1BgK~$f)Pe1ZmAgM|HAMN`?QnY
zLK}Jla#}9H!XgeIUb*Y~^AmB<Db0pogHc4NK`_X040jobtyyMgm}z(o<Kg%?IJsj(
zYti64NcQPxd<lDR9pclyk28M8V}~+rNuD|LgzN6|5W@e7u~GS?l>vf1)j^MAmmAQ_
z3Wb-Wr|rltXfC%KWr&^`Q2JI(*ePGe1WEQ4tykYp0kx4h;8rvmii<&kpQHB%ffI}V
zoqC`ryHO#k$-!fBEqRnF4}=S8*{1uQ-_6hG#F3^Vn498ovY$&VngSok<_4&JY^&T?
zT80P`59Z>wodHis6grdrRvWtb$}mm*fCV}^w{HML8QZmOB;R6lt(Q8oXn99RP-rvE
z7zgOvTLH|EjdnGVp&D2#oI}UtbGXv54y#=3%whV&5h?f(edLm<w`i6p-6JDEp&+ya
z*{9)+*?p<>M^zx+Yjy#G_v_#e(5^GYwWm17a<P2U+_oF-WJ-#0`cZhXYK_^Vwe>zL
z7#FRv{Y}@oNRbCbx~rFvxby=Z14uScfu!cFW(fkQ9BVHJw+eE&Hq)NKQDj@IBh7B&
z>s=XdV<GvQx9qEPj>Bb*eso<}2+wA~)qzxKu<D5u&uviPN$CuUD*EJ&hQxtk4$4^i
zJ)+uFgc7PMl+6JMlVqPrC=bJ4nj&PqgegGmml>xY11jRcrX=s@&vU$$(Y>uK6lHUV
zf0nl_*Lj*62z8<mHt$w9<P0oRf`h+QfFu`11*|0kQyp!e?A!lf-H@f7l<{^oyEb*o
za@8Vo@0yDQax9}REaDPn#O_7lGn;FHt}QM&6>}ggEm7$6OZH@{qn@Rdv1bp0uR5;7
z)Xpn;7GI3*nV^AJ5k~kfnH-cpJ#_&nCWs2AK0@Y@2<Fi5R31E!Hgig*5o5^g;NpE_
zyFRo)E+!Wn_pNTd1OA#mlD_s?fu+@hCU~+MfKeJ4Ilsr^P$97u;CF~9bqL4e!_5Nj
z4&TQ&SAj6DY`{5(|A#VmNGWI<GXd&O4vKLi3~H&4l}~N%Tmcw6p8-0@qdx&f!~DS0
zUXhQDHh2<^C!54A4oe%sN!%%n@7?U9iFkyxV85gBQF#xm{&s#1Asz|_@LmAryLT0$
zuZU3dAg<)tTWg$lk{LsvXax67^+@pRy)4jk$sJCyyKU)@dK=32IA%SAM-)Of8Boi9
z!NCDgs~l!euIjn$tap@@wH6l;V(<tCQ#+$XehxHWlA52qgPdLBdm!>()AAO8CDelK
zlA(Y@_zyFd=L*P#om6|BBbXYA(voujk=xHm$h?+ifbV4mY=Gml1eNei_I{1m9~9Qm
z4#A=9PozB0B;_}ce+Jiub!~mrbAZgNHi*rdH3!*&WCHaZ@VC9yj*kBFrnu9$q$-yQ
zmFsMEtxg6ojwZ_?yfj(0AG7)E8nlozFl<xLA&k`F9k!x$X1cOMZ`hBfZN>uAX<w!a
zhuudi-vgV&x6&oZIRu9RGcJvwE4s3V&kZvDaTj|aN&1ZqGX+z0z-!f;Oysn6gXmO0
z#L?!}Q@;*VKpJhY*{>uar0J7BU(*VM)9+P0YM1o!$vGBRQ=pQzxUf;ke^fMI28JS<
zUEuNHPn=wAPgDdIEvsO)^K`W3bgzdBO<zm$arK4`gwu+Tjrf30;V&5bYWH!l*8~!L
zEmBwHE&E_&xfvSo`yR*eY`>6za)#z>K<{RuAcc(5*j2laI#z$!H0kr!<4`H<<m+#m
z=|9DovB`E+#q!llOv&dQM~2@)1hpOvsS^mTZmF1S{=S(PG+g~{UQW+}W}Df*CkYXl
zS7^VgzV0LIuG+<ttuA#Lq9)>pKEOw>y;cCgrbuIaPn(sUZZAvs;0H_ta}fWj86C_9
zcH5sl{LUQ>0D_Z)@-aQhtPtQ}mrCX`$L~*}WC}Geog2(%R<7%fuzr{y+p<e5RuanN
z(lhfSRTrnX-4|{#Y)FTAPLY@mge0;dw+B4m)XrRf%UPZFndW1i+=ay$`d#M_k91*r
z=afbd&r(LHFI?B9gT{Q<1A@Yp*bj(b21lXi#YpLL8RM7EsN{+e-zCL_-#E)F*aA`9
zYPgo266Yte<n~qs8u$>HzRD}&5FNuWCi(%yOqQ~1MxPFP6_9=zEHnzgp8H{DXEB;M
z?Yvo;M^#$MPcVpCkPc|4Rjr%?^qk(D`XtuOqnfeZ8xtS9L-n0z`z2*0{bgr*--$ro
z9?s*y{ZGqxC=CE8D2Wn3&7py<M@CtBM*X<TTknAH=>6H2b)uD}-^p;~+hq;NNX7xk
z)EmR~A&Pz`>k{R*J)=_OCU~%!{bdEFN1`+ei(GOY6luv@u5}<odk(vB0?GpqZ3tX?
zQFR{#881rw6o*`_iS)g~lBMs*D`X89*H(xgCr{R(vYDhHmd_5xeWiWq$D?{q*fe|o
zk}tKAp7K5E*!2t+QSYec#GlGztL4SM()kePWrhHAQ$O=M$VO#0mn0OrUJ6u9k6?%&
zkcas5{pYnsvrL|5HI_Vcx7&h!enzz{5L|pvLR&^YNDrRRatMaai{WU$Whs@P9i;6@
z3TsBE*=aINGF*sQGA$Fub<2XZrPVF(HMC+csEGAVbU-m4b1$<XE;HxQm79J2QWT4z
z`%Tw}%`S7AH|{51{ZlsB0%f>)<NyJJR2Ri_8esoQCZ5W68%;@{cS#(49<#V1IW8hy
zatGP=EathCcEPdkaX}sVh@qE|{Ai-`PjTdq6>a*^*A2=8_tpP!s+Zci(u|9mZl6^5
zxB_<;XcC<+xZoB<OEJyVPhwNR2I4#C6CEqwoOTV#CI6fMBoQzERAin2<vx{dNC9Xu
z#8bx0{k*Lt&8Z`lxzjE3;v>`3a>}kn1&b+gO@=>$lV<)$Z0v!jEXqo#iUC4EMS0Ub
z{D;hw*jxz`70!)dzMRWr==J4a;*ugiZf@0vx<9uDstVi>6pCxAYeE?#N`|;L;;OAF
zpODARK{acpo#p<vhTmLe4tdEk>0|P+$bs3>D*F^{QtoGA*(MuxhyaKmm0$wU6p@4&
zw?y#}na7@xewEu6{~Ak{-HOu&*H*HHouUt0v2uGjX7;CM{Tj-PRJH7&G~1f5mnPoo
zMK_vXIVveo&!SQ^L@t(2R+-I!5dDpzuD};!ng`o^p4R{?5QJ(1iLJS&!vda*j4bCe
zt;VVk<zjI%PxjIE80YEU;Ot^@+G}@iBG39@x5fWdOp8>2JmpRf8G=Dww=LVipw73^
zR-ETMpA9}D;{?U7@qAB*oe>(qx8bS-%~$RhptV3842%nLS`i3-DH012NNrtop>0}h
zK<*@o%5Qu~gwd+Zr|0sk0KQ+gJW0p!b;$$xq5^f*AoxJn0tqKmF+6(aaq2tzrITat
zgXwQTbF+p$922P^>H&WN+&^y+kq88-8&#Xy6q*?6RrR*>*#(yN@${!Z7yZ=#2(HKH
zMQ(_*00pQVyn3kT2d!afIlPr}|8cB?K(m?RsUoznVecAx)?s+lMY;LC{l}q@fCoRS
zVZIvwFUcZFmCNju;Lxy-&$%{cZkK{Sn(#Q}8J$V*r~Hl{17W2DMhE**xugB=`ry3O
z6+1b|&UAucfv-@u>M3|Gyf`P!J%W>a#NJ-Z_#W*%y)alP_9&|kct_EVY<_jhh@!=s
zep85$V51O1D6&rXSYU1+yA$2&;=8(LdAF5a-;gJNbMOkKu~ihM9}(rS9&U?5TYh;M
zLnkd*pb9No0gPK$Bbg}l97e_t|KPbiYF*GSqx3iguXE%Caa;23{6Rp6<n%aNl&wQz
zd6d;V;bjba*j9&s$^7L=62;&Bd3MKh!AC*R;vg5O3B;Y4tWtw%u5NxXGw0w)9?s(+
zP&YPO!+X13R4%qLBieSnblIo9EhfZfMp(PlBRyPd7-w0g#ymn}=!n$H)2h)sm7E{6
zf}QTSxiTIOV1tv;1Hl%*-7N~qOA?|+)KRZ0%=_nPn7z?6*QIde*t^((1(S4McJbQ|
zwZoO6Y%|ow^72*lX#d@0(PFHSKY#q6gng0?_+AkpAC|<hG0z7{_Wg@29gW%AcyiSc
zcDI1op5Z!vdJ{$7y#6yM+THPT8o-1hew+uWrGu&sNfF{l-&?hmP?ceKcTyevSukib
zu=Z)PBif!?zaO*ERBhJ7<9F{&)8>PKlm&+qqdrGH??ABu{+b&o=DkuoD)4+6z8zq4
zb=bJAIJsnECs1!W9?e6Q@0H-@njURc&@gUZngxs9PVV{@k(=+F?<1bP_<i1aQ(GI7
zd<x@apgFeE1=0}nZ)IM79sf~P8eyWE+4q%~-IgkO1v7j(_7Rz6NBRX{^_?UGVQA1C
zD311(=G8$F3nCl=(vma!zZr9ZuI<?iw&XuFQ@r;`GOzZAR+UoixLdmPyCE<LYNr7&
z;}Tc`89@{knLDYzZfJh!RUguB*7(Rhk^x%bYt`UDlI3uT_E<@~G@NSFhh`ui;@1#V
zB(iC;7X{BW0%iPy`6b1=tV>iSOKv4G?3=z_9Y5G^DfYwO)Opr?8{S;7ui`lN&l;fa
zg&_umKrA6|nUUrc!8&r%^GzNnb$bj^PGC+@btn&&L!{kb<yuL=7sm6@*UZkrz>3_v
z{qe**s#+@}K~R@P=cj3h<)1m8Z0;5yz{xkrW5eJ}4Cm-v+;<4CI-~e{c-ZtdB>wOM
zXz7!GzE-guzI@T7Y>+ig`c}r;4GkpW@;P@o=juP+fC{Lp`a5BLD8`4sX6Y3v_~OgH
zIX@P`+<rG8W^XGg8DuVy(Ff24aM%7Rc|N3gcZx%i#rrlySZKk1P<dCc3mS-oX8HRi
zc({ZB8ckHr`ma~Vv)O`3ICnzU?vJ7W=f7u}fe$1yq5St8U_f&vFn|3kOjS_N_TMX<
z0>12Qa1qIWEeiBu6nGokov6{jNc*4ZPLcqZi7<r!dk(O=paYPNzf$ghuYfuMus+Gm
z`}n^W1zHPr&PEB@p8uKn&jU#8V6i8nFaFax{R6H#WQvy$b^hHRfsnQ&0_zgBz4$}K
ze<sTXAqUv6^bRWB{6ic7XE0co$O`;m*1w?y<ll=)I`{Wpfd9*iW5xLYj1^^4C~!N%
zOxhKX^{JMgSqT?9*uE4~=fn4gsiw^tY^|hP^IjVuA`%mUa+XjTnUW|o5RLy!qY5cM
zfU8`>d5uaU`X==Pyk!Oh)N=gSN|`hSNcP`suYc-qmEJC{hPI!SY5c(B78MTfNZ>L$
zeUGIp8R$m(Ss*A&EY9<NZ@a^c{wV1jlKF)Hp$ftkw=me~v>FIWQZMj4>(3y`3r8r1
z4}?i`(W@v={gI|TYGuSd_I94oi4-92VxW2^brFntNI?<uD2VU(8ft=@sR`K9E~uTn
z!>AtwgZ~DOAnAT-KB)Y1(Oz;ZjzNE|BBpkbbuy*O;j364Mc3~6R5a_tQe=iAP5On)
z;{z@9GG0y$EeOZu-$)R!V60Pxy5Y^>Vlno@1;>{hh|EI7aIh6JvzJDMgr{~pWD@{g
z#)yJK;iC77(M5UKs(4U=0BW4uPYZ&L<`8(n2^9jee&O`@rJFzgU=l#GyZ`pQ!VdVW
z`6tlG&&zi}0sJwb)1R_xa5J2E!QO)O2iJF%g4@B(Q{X@0tM|@<k~#%y;q)NhGraMq
z<m~~-K{J=BfwX@QnratJwZ4xy2lPJ^=;7nEH>%(R<1$3RGB2Al&G<+%sOMUNx#E^D
zUj&6Wb)ejd_zqxaoAK~#X0`hIUhgTRi&p7+??KrSC>LW-+kt=HCPr(#z20K|h9TD!
zO!)5th$4tc6H5jtK>7~UdW(PIn2Ti^nRqYFp6idlRDf^`^%E@lIfWNgzLEf4tow8d
zRKVLLvH{wM7o)rC<qa~3P$Phy)?*%5L60dPe4a<+3D`$M@eGu}+<+}X*+LZkcHZt5
zM4N)mkEP}frdD-W`XS~dOPhQs&b8*##(w0~0C0X~C;-KoE+{D+E75ZSMSozf#H-5o
zq22=mzCc!a2I}GcJDy8~LgZ6vIWHiBK}PIE+B;H^!N~#qo;eNx`>n~M&9LD=)A4F*
zW2Q6Zp89+7P_!R#J(dkH9K$2lyDz$ZGfwk!AuOc;RlT=WIT4mqurXMy1DhJ>B>kVw
zJkJ4=oTJwZ6oQ)<Ac<OZJ;w<oDmXZ&Fdh;l#1Cy$F2kd8Zf}~?UdKkF46yWW!z;kI
z2rxi+fzJTA?q+pBA>rklgJ6vq@PUG7p&})7Zx{k6Z;u<30>7r(>v8O-pCS5bniQkG
zZ-BW4BLP<QUT-fj3!VfJo|RC!2-H>34O$S)wyHk}U`K=5Mm*DAP%%5(fsgV-Cyp}@
zpudozSBJviv_@?z7@QMyRg;Xv-a%a!8YtSb%(DYbr~o~NF7%>)DiMkWUVs!ZZwd_1
z%Mn$68f|n6d^yC@TiH^|b7OJN585*63s#%QJ--ZU{VZU$5D9q$NS+bk8765E2PCZ+
zz>GG_n*#asUt5wO{^UVu#RxvOO2a%c1Y4^c!B+<|+9v^6pi`IRJo^r6%UXj9d6LiR
z0+73ggG`sNL`@WQ5M==A0_<vOA$;_~bNXorF`z<1#fREg{s%F3XK?2vE!apfQpw$K
zcOfzU=m;bSWOn(vkcDCd;QMUFnF5N&0tnc>TOe^D25&FAmve%*ya17+TD1qH6qigh
zZPhHs4#+3SIzWifEJR!olwrlL;vw>!lD%`utr)cM{s{>L%fWQh<l0gYaREkuHJU3x
zg{iB2Oh1_jl?8-BUWGRd*uh1BTTv;d3iPV}30j)l02DEjh;@iuWeAV7LiPN1thji&
zWg*WEvK4QrbyE&LX$L=z4X95H0HyZ5#Q;$sG73Wb28Dq7r2ve^fV|&;`pnuBcbUUy
z%^LlEHqTkqT)f}-vr{a99MBF7DMg4f=U?kHJq)N=C8Qqj25JSVf8?ech)BtR9pS5g
zA`0!mWg}&OSgM(?^4)yp9vd@Vz8<w2&;-DR!XP-&zbx5t+fDhew%bjRD!Zm;K$|*0
z0Dyin6#0NvFM(WG=tBKH$mM`sb!r_lfaIV&*m~T<1loGE&+#Dfp4%DQ9gs63yg)<C
zi`NFU{p0|GXC%$yAnE@nT(Q&wuhT6xffASj!g%1ShahfpgSXm!P<8bGGy@`_bN3+N
zKH1)hA5d6@Vt|^-kc0CC7E9{5)A#SX_Wy}6|0#q1XV`ZZZS2!^Aa4o;`Q?^yTDm`R
zZUbPwpNlw@rau8IoVsfWHkP<SdRH|&06E5gLS2*xus1^^tEL*k2ql2O5_cXHZ#J95
zz~w>xBYVT87Ge@E0@7T-F8&Y!SoeRZ<%MdifMYpqq7WgefOM%B$?HRsrl125It6I|
z9q8h^_pyEA0np@WduRbcpa(5D@gB?|u$p&()m$SVk^*ld*tO3qFC59}RsmiUp$tG=
zZ8$-=tQCv^-HjZLHVj|`!qNhj+k}L(?_ZUG><s~YgctKr0@4q4L$z(}puB<;ga%2<
zO79>;Vh0f!3X=arE#}^8FrSmo4p1+IEWjm@9y)vw3MfD``u@17O9=#xZHgD_O#Ua8
zSpxC7w-gX!M5Ys<5cbUq)NTy~SP^d4g03kUAS_?yeNiYcW&?96elZgH_p<+x0&M35
zmpHh&01wgau-OE%d2ZlREBAK7X$5G&UY_MTWe7GR6gdEnTv?L^@*J>O+;+oji2VO5
zHb(-Q)6dS@0rY_hGx0uc&j^-O1dv7qmnZ|8x`Rh|gexG*A^@+bdbJ8_N`ZX@(SPkD
za0VN0rf<p`Q8AY}0|EXdX$*N6!N)*-;O+`Qj{a%>8$(Lo5Kszrf^WV8!}QBnAV&QH
zBxv2M0k#J8p_TvcLsP=b?`s?ew(G1I-=|HXl#q)7!`=a1m%8v@a`E@xR!HdyKuYm0
z?z8}%s{oyct#&jJI$&!lJ2<AoTXLtSJpUVzZ;axr2xA2WKn%-QfG`H0b&6EXy-<hS
zpaar)@N8*>h|Qb^{@2WD1`;lcIUVqCwp=77a~b4zH}Lx5LmX?h!wn_g;9ggYad1Y2
zmys~1M4PcGo}o96AF4K|eLST3uj1R^TSP$3?^qJLkePVa<k!<Fd#+bM*CTVc-9TK^
zRkJeyy1X2K60gVWMu9i}1|ZBPod|Az2x}sYHJOdCM|w#a9iC)d3!Zw7bfhcRJi`fg
zh&`W$BH(^txYa8x23`=Mo&#cV_E8qx`ay((PTnZMV(=~mJnrPm?+Y*j>2dd0taH;F
z_??WGkIGM}1of<dj2RUGWOx4sr~{%5NJWU*ap%#6uy4jN;S1hB$t;Yew%<pANa`JC
zNY?;Vs(no*NhuP6epxw6=QaZC4d_l6kGu=_y|^1IuopXJg7F=)I~j{TB47p5H&bV+
zu{blVLH+*3>b1sg`zEw^n>=G*;rkQ@PDpw69LonwkFDxcV9U=smD~o-6)0wsj!pi8
z4y<Xvt_fK`?AKtlF0;BRqcmZoN*lDn738`Ooo5o$pJ!P0R)Wr0SgRUHtC$7i^MYzW
zK<6wxS7q@^M-SJrhhsrVeWnb!V{pouzq4O*zz+IgKos~8C?c6?SqL<|^)U%-0!Ndk
z9KGts&S?+|O7R6#Dq6mMzng+qW&<5vO8q7Zy`d9y2j8Oz#_JY`ok1fZq$8YS0%LTc
zYbTi2PE`D;1xPmL2N8CwEcUCWO1v5t`9_`9IV?7lhpCwe-<SecXpXi4L{nMk1Lm$*
zdpMj2oMxn$fHG_@08=W_AV2U#Aefq#4%qAQYOL_?L{91+e04fdTu*7*OKCa6TO1X(
zU1Op1Sp&zY{1ia*ZT{>GrfGIJ`utw>pVJ56Hc>C}^9mULz11T^lT+hh)2#HWN&V4&
zQxL{W;09rRAWx>s`?qjLP0h{|rh^|E8S$-{gDwoU?7?T*@tW~FM{KeiY%Rs$1Xs^+
zgv*o@VaUk=cK{v1nv&V6!CV@=T-lYe`#ght1)OWm;1$+$Y+NI2a_;rzUQz^NnFpcs
zUt(Fr!@P$*+7EJSbO>6uw?O*ucm?hJ=GZ-d^i$S<h-z;yt97O`Av{ld*|w$F(W6k_
z>HYUbnV(cLcd4}fcf*3b*E@_F%Rls|tqj;BE0k?%c$YWZDqm?Ge9>CkeBFR8#MdF4
z_M(n<qp%e8TjnUzalvJXP2b%_eZW<wjig=lzkq%HiZg;XG@>Dsa^={hlzj<iztvi3
zL0Etkizo0WZD5u+VbQ|kU<q^h$VHu~Z-xaQb+j%^bKbZ2No{Ivf!pCHi(Rr5!K<ya
z0h6;A!J$G6EOH;a_Ee0T37^F@I!5TY60j*2v}t{{Cq8Nx+;iAMm9hKf>s9N4E~UU}
zCCwUq!vMct&O(!$mQRP?3md6lbPeNJ4RRp;^p3Nt!0@E%f|x|rYzGA3+o0X6EOe2R
zjS3w4G@=&p8|i$ZE^z7BZf)s)?bN31(Pk^RZ{GvMld;E#1BGUCOMd~r*_SScTE$=@
zlvUQ&xDLN$%oQec)nF0==Mx;y!G_9DwL=@r@Q&O;%cX>>sf?q|punC3D6>v~wd5P8
zbHBP_*+Hw1%Ms5Au@l;g@L%lg5@l{z+xGHr^`ZucP_bgB%(|<eoOYi&aWfw6wrb_r
z<mtV%L&Tasa2b%@P&!n0y4a<k7_{khs4=@w$d|?Cy-<=~&}mWf|DyEh)2kxUjj+RB
zrmHe@Ph{ThmYv@(>(&%=yD9W(2H7}+yj+L}XW{}bvLF;WXZ8x1=oDf-9tW<|2Du17
z=3{%gp!7JFOX??SN_0M6NH(l7L!N<{-EEG;17rX%fpUp%*WLhz-o(=Jgv!GHl8vli
zDeY9rB_Ed_vJL3a-4rx=%s2SnY$P~{_nSjPpa*xJP_qLR=cu|(Y#?KUC*z*qw^K`D
z;HO;_JZX5MA~yCuMD2elndtMKTs`PnrN(YMU=6JXZs(3oXTSwb?ucdITBxhvG7GdP
z)^h*E#<bTqr6Ay|^D8_^{k8wJ*EO=E#vjB2+>KMbjZ^P(OCCt{=_sGRbLXg{d>+$t
z(s2=3CEzU~8DP)s1l9oEoC=GrU$b89lOMX!UfuFwZ5{Zam8Y6jjvwdKH=7O;j$GGI
z%C4QXA@=Vc_SabVI$FU>o1tDwTGYDdw+Wj|3vnz`*Ug{p%T#3<>Da~+>VmxF-v_N=
z2sNxoMuDH+;2@sCL3q%*z>8yXj*#jg*OT3GlQlY+);hFu>JB&Pq9>4c&9A;XG+hQ8
zbSV<Ufca-6<bZZ!1iN6~`SMcV!NM1(rGA`9%h~v}Sq2VsJ?b2efb&GzNH3bSB>1-{
zq}FMn$%-A%jpb8td0f~n!r+Uh{2FKddV0Dg_)SV5JYyRZFdEKjcUzM^nmwX_GZ&Yz
z9vvo?)Ap$u$HtZvxOO^dZF(l7HW;3ju}+ua;<%-g@xc0vaV`(xNzmGP_4o5sXzm+X
zR-fAGP1t^e!1iVKs_(TSXdBUg1tiU>GHuq=tqm(4rC)nGNMf`I16us$Zy(7-11&#n
zt@gB|afXzC$xSxs*A=X>KeTWWQwSd-X~IwFEjzXnTq?d9bVDzsT+mK&(#j|f9CF@a
ztbb{eFasT|#dLIUC9Av9O|`kFEa0g~yTOsRgtIzZ?VY=cQG1DBW2i@5@g1*txde^|
zh^-dXKZVOgo|fJHm@?sb;jwkyd<dI~m$hB%*yJ{vRdUnn<=X>to2%gFiA~8=QP&l4
zx=gFM<0USsAIn=oWS{~>WeX(8^OXI?AQuCDbD6f^4}J<9wlsNvj(>h^5Q`L|X`fwF
z>LFYS`sI`=iQ{+jQ7f&>a?e-Fn6(N~%q^aNYd01{!LuCL7nfe?Gr!;qx$4OX{WiIG
zHAl<99cmE0YM!Bt^1i)T%bV1=)ny~IS@eM=FL;0!d*Bzex_Z%qFlaGRH6JIl%DD5P
z)w))N(574%!lfs9*m|_fl#*{)8nA31?$x-Nmf~^mg1!~IdOvkSgh<Dy)-d{@nbLz-
zQ>gR)mM-9??WWy#QTTVziS=%l>JcA}A%+Eem*H{VCN4Vgdv3}fHmeVZu%mp}BwK>>
ziuO|v&F2XpouM6T>L4o+S|`_)hPEOmj-4Ly|D2rg<Y<lQP(Eftt^BQxdC0c-aWoQi
z1iYeK0yuTMdb3?k-=r;Naw~VL4g?xv0?$nDY#0UMjKpL+$nK%zx;s}a@M=M}!g!AG
zPb+R#5(=IV=6%_@eOBBukp4Zh{yjak^%01@4S}N#jonIq?{<<`MU*;NyDHNJymD}3
zHGX1fMT!^1Evs%Tt&nb6_)pFTm>U%|^mGoS1#N(X`s<xd2c0P%u7^t*go>1)s2!e2
z{9Iyu%wd}^gWl+Ku72JN(W1o(y0umX$0uMlb`2Rx57Kw)XO}g0W{(as1Om5|g4|aZ
zJ<Xz>Zy?VE?N<lwTXz@+hV{43kJU{qJEnZIA@vn{ybO(*{HP+=2_l=)g*&rw4lKSH
zTf37HkVbZM<4X;p!PUym4C?ur00(J3Xm%Ck&%H4F1LhPSlA}2$Lblt$-l-jGCG+Hr
z?aI81*ewn0z}~hG>gbIC{>_xu(Y)-0ug2py5t-uC%I{+H1vHQ=zd4TLz{%+FTw;fm
zjt<Yxq^SY-=@tQj;F;?K63j_AUL_1KtbYb^M~aT}K4YnO3-61B#;vz4X)Eq|-L;oU
zeF~I3d)vue10+Y(m)ksWnbyd?CU8-JZvAeHz_r80AVO&i5{n<|zE@6X7E}Caku3+D
zYp)xpxFfy(Tx<ODRsZGTz-8+YxdW=U8Syc(JLALrWE<84nh9(y$WM)u<3Gt$`h(^Z
z0&Vr0d~YUrJQI?e#vCNc>RC|^7uD!WZ|tuDX=z&3waRnT%{!2};c#)>{YePzUK+-&
z?|~}vkgp>{C5%2v75Bhm`$38sI@i5uef1U3ULmw8))NXenEnJBU(c8OJnyO=oQZKl
z?N8E(_Uj&w$nM*uh*Fb}cyM{tffBfwfa%!N^;r&yu?tLY(+|W2rU!}bEcQB#Xhv8b
zV0~?Pv3HDQ7uY;U>#1&7eH*V7(DS)KzGSLrjohzzEkBg@j?t&RIE>RF5SPheZNl~@
z6&+*dBC~y8cKhjaNaz0HaucCCJcOz)a;Z0-k_x-Qk1fYlLh(n6?WLJ@V8sH(w!TFj
zv_+-zN18O~z|z4nmDD;fq^yVmd-g5peg;=z1zrL0tEyEG&S(s!`+)4=`&Yk}SyEX?
za3OKU(Q8|AQX(N%&bxX;rF(HeywmGAJ-C&uW=;nb3vY%u?S?BH7Ae2;Y>UZcZ$j1u
zDhcBkgk?8{&({z}MCq<?5<t$N0ISHS$2A#k6b;Ziq^X(>Lb-uGNLq`ETdNX?Pz~^*
zOVR$o|F8e%D3v1j(HRd-6{RU3Qu5e|Z;*KFx^W*Ig72P-TNrq<(TZDkD|}$(t2AW(
z0o(!KwD(%t{+ZSKkb_dLtd%c9E)sI;rwzpPLWE4;1X?jXFXGaXM!)>tF2R)~VIbWn
zySM6dYjSosa&<Sdo4RO$GCZKk0aN+T*eEJvFDYZB=wqy>)wH_Dm0!6M*2vMQ!?`HO
zqCmxYe~^XXdk%R7)XR2=E!K!rf{IVuxTqTU4>U#uKQ#4wz!X&tE~QV&XP<c%8S$oF
zKhD<&$~~&-e#Xd1PAen&JgN)EWq%Gp20Y{9tJy=*QlD-X1*ejB_qyXl^m-ZifhVgv
z<{uVEx~b9X@5gQg@Uz*;Ov}qyZPsPDZA#WIk8df!KKgpQyPqVlM+HwKZYF^8%s(wR
z%SM>}S>=ts)P&guHFXO%<#J;?bd`hT##Fr1=gsAvMT;YR+M@9Em8r9vrJAAb;kM=_
z3rZ^~4EK`Je464W@;~Rbg7D{tAB_Eiv*C134N~cuo8J*GpAHM03%i_>$fGdG|H@5Q
zkrGNDrM&|C!+p$EGY+P?#3sd-J09oY^7-phg<*NMDQb=1Fz<o`q9Wd@FzH7{<g}4J
z>i4~&v7?2E*&d9lZ*wBkOkSZ1gRV803^9j7Q%qX9y+eWn61FN_Rb(q+6!1lLXkj-;
z3!jsQ+_b)nRnO+}x>Hc?h7EP!9OwMb8+AX|xdk`5)9CeUsv76Z`K<lk2hzxUdJHYP
zvSLkMq>_b7dcg@9vyF5<x=f-M$KVchFSpD<+A1c+IFiPbZIkN^`UAaW3s8l{8id4L
zkWfm(3j|_(lDf3#7R37Z>k-|=5?IHh&Y(R3Ou^VJNvx90Mode%`PQ2>Y7=@~#$kVu
zucO(EBC!}|mO~9a<ONMYTiNZ_!|%T*lN$RgzdwW}7X_3pw#CP&6WVKLbTn5VUX_#?
zxhyWwSY10<d?Uf?g$g*FhJASSN#NkqIb_TY4K`(Qx`|jefke8yeK#1`028&XhtuR2
zZW#~V*)uv?uvjJ}T1&t7kF${8a0tu|8hDc7Ag1LVgHtH*z$l0BkI%}aDMsqArW~pX
zu77BiUb*8^2aC-mHWS|9uy1vjPNuog{ps@{4{O~}zjBaOvNjDzPmC;b!$vbeIul4v
z@nXy@0m?%$X@PSYNIJr+;CsX0M6R;WJlM(P>!{<G>N1rYY{^PlNOPsKLKn-%D?D^7
zg>%m6M8XyqJUcc)eQ&_S3Ha~-uJ3J}X7HaQGa#%#BvTgh{iW8lrB=Nn6kEkz-mjW_
zVle8_YLX?plH6;jeYBILmuh1YMhs&X6+&$?`wRTW!KU@OCw$*cWjFfL=EZTr%(J4Q
zA~OSV3jrIS5{WGel{&hfC8{&D#V6Nmdx&qFSc%#*0uu00X=K}sb`VA~2qWF@$qO-|
zx2lQn`h3JZL*duKwLkr{=!)XfP`&gF+BNmJD$K70*@qwGs3^`_eZw5^kyYP#Y1}p!
zTcn{Yeecdwox!8S(j#2cWc)XrGSiU7Vc`iCShmo?JFM`EhMu8N8b`le!7U`LE5yz$
zs_Ag}2!C!e>`=zWbUa|y4_mCz3Vs6-_I@m~C?jr-6=h9mTSa~<pw|dr8j_E{@#Mn;
zv$TVn7#&T$5xfziZ^-R<r_MSks{I<_O(H_kkrZ8L_Swp23iwQclec)cV3tbEvB~e+
zAG5~FqL{!|=WL}Bg^!{AU34<0MQKsM(&xH;vT47e`Quo;num`D;ccUDX}!12WrW9o
zN<Vc<Zay6yY6B{9q|k0fJe~7>gq+Qp$K&5USGQHhuKHO?XynR!i1%@g_`J0ZALVN{
zjl5}l`Bm(G-qA+tKpu}lXU}#C?i<?-MmUyrF$pO7opxb8+46I){i8QDmJg0^_(8Ye
zvd`9TV5?s;+Fqg5#h#MLPC*mMMF;uyMhmjDTzj&bFm&y&X3OW>DiWC2q3e&3OrhU%
znHCny;#q7I&kwG~S@I#TTw|spB4VgexqVAt#H%LAo>)=l7UJ*oZT~9Gf(oI@lY8@Z
z=R^X2rP?y$L>b6^oE!I2nre1af;OEdooyAb3@GVTPBhIceDH5~!f%UXza~f)R!-F6
zHyx0~!+BaF6T;bQ7k*DGNZQ}B>QE-Mxd-E6g{0;}53+5;q9aV(^&`fsZP^t)e0$KI
zzlp^qGxVc!zww);3Y~k)C?R5&;Q8AAl~utvC=pJT<QE|L>|xeDEOtC>!i!&XanQHp
zulwjsda?(1lyHxEUcy6Z$(7uz59-G8@h{a!I)YYRzX&1OhMyEg-0KiFWyH1UZeoR}
zSi?(JMt|SDD>pRAeX<%u7{@GEN;{f3_TOvS=aWH-Jsf4=wqMO5>__w}lel91CO5~o
zn`cDc<ubh6kc^#p`Wjff*S+DRpqQ@<DuZgvCd;gdOj?xaDfW@OzljCBKZ-Zy#+(l$
z5+i2!S?`{U42beiNnO@2RHC^aFjSE~_<_7NOCWTFgA4n}C<@=X+UQZ5o`J@KU*VDU
z3a{tBp+VXH`h%^7`R&ceN&G&Gi>cEy_QUtedAQlGEMyCA(_PmoUP9x{3uZ+I^JN~-
ztRAE_=}CdY)5M+E#o>!6`2?c8#z4&mi)8R4NuMflJS_WCabxH=-J05@H67VJ#>yo9
zm5FQDF?xhi^=}1H0}mG$`8p1uv~3cugqzLDXBb15Iod1rixe!0#0RsJ)f+}d=&B;n
zA@Ydw5c7dkU@s6<O}4|>fmU7vq)oQIw(T?@_N))TXY(+Rpr5E9O*D{`GSH=98|VD`
z1rOx&E)L|~%rUpv6t$>G;_PdU2{owrr8Y*HSMP2_E7TpynXiUZ4yah{Damg9{`)%F
z`iiNyh{wCr#19%4FeZvw@!`*hXp_6;^61`IEVg}mOUKgZIn7Zg!NGk(Pmz8w3S3M!
zVH3XMSZ6aN(!L_AG1mFpME#1}!Qtvr;I9Vq0M4;dV}eiry*yARY%&?2*YvWh*FeL2
z9wyNHTq|eO$BpL(PlM#Ib0}Y|ZXp?`zAzSGBbFR;pXq3wDQKlHAUA6Hm+;pbTrhjn
z(M+W7%)KA?2h@%?32RfsE`Sae)DQurXotNRNw4y?I{gAKRvRXDQ{#k60xdo_Vfbtf
zrt=Q2Ek6G2n1EMB{^3$@!Z&rNwz;PZ>mgiHcfpTTF)8)47UT9gx1O639-1CyciyYS
z*<KHGTu;IKf28&QXsxQ=A5&}fz8~+No|@9)`OBDbX6pr<%H&a%ZzX?El7~t%jefUs
z^nxa5jZLn%0qwX$vSa*a@(^$S&^%Pmu{JgKRI-)07dsKwk!|ahRU}%08R`m?ax&??
z$zp)}e$jg<Z~9At1b+z+v-q?ZZ?>k;a*?l9fo3lK1F^SDtoqWmQo~`p5iZ~&p|c(j
zpk@ZtNlJ;U0O@sp^W~GV5i)Hekqa(!6nOmoxX_!M<rdP13l@aJ7I3YamuKLoa&0U0
zTCtN^vQypWR08dAu7a4`1SnFs_RPd{IU=_<uQdL=@}gDtaNQ4E8hC3+qg^g-EHZ?a
zZv9)OW(&H7Z}VlKwAH)GCYcR(WVJO{qI_g74}FbRuC-NI%F@?<cah@d4g02T`xb^3
z=ddfDuT(Pig6>{hP5O2!DOOLmFSqDK+`Kl<p-}iiQLXt`sLnCv0czJZ$ehDHCyi`1
zAy@YPk_C(<?o<-_E9YXV<e`@KPbEGkAUj|6Cke>27y86aFq6K(H<ah`O1AVgGfv@d
z%4m#&o!9amz0!wr6|GXguZtC3+=Pm^d>b~7iexjG&;^$+;Zah%=D3BR(wv8E%BPly
z6rgw$6pFt5`qQ}t;bwWaBh86o#t|X0@M=ask>Z_bKSM9U5d%dUPL0G~k+)gnuW`82
zAcFO%JfW@Aw_lUZ!C|yY+CEii>UVUDMW<^D4SVrWzQgj%Fqy3^v50$Z)oXh<8t2<c
zUe;j=E3-7M>L21MSo^PllwQoildN4N;vf#kUH<=+cb#EPWm{V#Dxi*KMi3N4DFcKi
zy$IqcO+Y|E3<^lE5k-oCL86W|NKrtF0R$47AT3D8aS#GhBy<LBfDqb99r|4xr)2Ik
z_j|tI_xi)<JSXSuv-jF-?Y7?cMH5!eb}8HaYmYVd(kh~K=U<JwM7(A1c|ao9a?#oZ
z1vm50KEK`jywll-5HY#LW^1!DU{le>M+r(%=TOR9_F|Z2v56q1Eni=p>7B6sgZ^}0
zK{)GBS2T^w`^}x4Kkd(uJM&CO`<6pU_+*z$cQI8$@Th*Bay4k_;iH(j_?oAbYLgK`
z+gPqkjduAw4F&kyae1R?2i~1kf6C^Q=#eIugi)_Y=X1m!9d_dF6gwAMObzbequv2d
z$bui2<33}r8R1Ei8%qr^^V_3B)6d^xT%nM&A7?L^te;UDw=_Pnav_}0X!2C7iHPTw
zS9^KyH*ImJ*!KTcoUmB@I=aE&$KP%m{dV(4#=8P@H~-|Dhhkob{hH|7f8s3h>xs0+
zR7?ofk=Q8_TRH-}m1GUPnEUykh1KZ&KUUH!pOIl2Z=<Smrnxe8+e399xA|Ue-b`pN
zu6C&p@+fiE-Ues8q2jx0`(tWdZY(V9cJyzPOwL#KY;R5*_juUrZD+#YbwET=AeQTO
zgNnaOQ&a;_QbE%f_(NZ-9!g|<lcRs7L5*Hm`1#C{*`KVKEd~0g2;TbR58q3STy5p9
zF)YuleAuN&aozb|LNV^$%V|RKCSyzyshzH>#^Dz9esfS_ZrzuLMnjypvJPcNKBTcO
zVY7bcAS{#y=spNi@)}amq)*vqkZ3yi{B>~3@Q>=!lzHFEP(kTN|D8R){f=U$-{djs
zXk(U{9ZoI*)=~A*%+ZQ)A6+KIZO!V7-}0mv^KQh3g-i3Oyj64)u%%P4)=@pUir);~
zD0>6P5k-Bt!5jS3i497hX9P`8$Yvyu)q006v?j?W)+$zUY?0N<xbJutH}8M_Ue^9o
z{D!vLJ!#RC-8CAsxu0ph4Qcn0|MihXI=O-h+HzgxPhlzo-c>HN<w)h}y7+qpe(}Ug
zA2G6VOlt5lX`hTO_-qk0V9`i!Ady9Hlv&L+$UTygHoZ2vuN>ukqTS@l40$)Kt+ua`
zIJZrMZZunRTe@yBuKQ8=fth62`j1SNffDf^43$}&UXLWW8rV}OYo`mD3zBUb8J>3u
zGH>p1bN-Q?wuEIS|4vx4kM00ZY$@2J*KS{{^pe0ABWq%n@F8qbM3;VlA$kAu(B)`n
zH!W+45glJ;bX0If*CfteHbD60E1HQ!&xQE6oZGLy-=GAmo&!xq{cXm!J@ro7dS*7f
ze&%JaN^xk1T>jv=XnoW49oU&%a&Ds!Mc;HA2v1NjOmfM%#=sPEyx8^BQJ!9*V_hJp
zMGb=eiFJ480h9I}bgt?)!9kAiXL-aXQL^jB)p@V*G1q*_3+i|3rp{~|?e<-Eb-T<Y
zJBv1*ASVYx#_WmCDF;h^)2fJf32Xu}0kpB0MqVk7Uzs|Tj}C@m_Q#4O%3|Nq7gQB`
zC{G-jhV74H7C|&CaC^oP&?0~=u0*k4VF+-$H=^T-8H;N^;lomG$-mh4yELy}yI~Mm
ztMs##O-km#;KoDi^(zW<qzZ6k#<XmFf;m-fU-yG7oe1ucEcZ)8;oJ;^8oF6z77*C4
z+FPqLljXbX+${Zs-z_<-GIDi(%57Czb#T+F_>!|9HsB1U<SQ+-vWw14gN{$(p;sNg
zJmoGQ8lCpQ#sxLdTC8lUOlximw(UGHGdZF*=h3rUnD|3`m+lo<Vq~ir1$HwOiCX?b
z#%VckBf@21iA8Aa6;IM6f5;wl&(LXyeTQpwA(1ATQkqHNIJisKLroShV%u3kv>lHQ
zSM+~f^o*lbUQ)DUnYuci!27BxyQ$l(F>|#%Ld<iZplGSjw`(>~9B+c~ViJac+bG~~
z;|Dp0-D{pxT1(U73$1SSzE>vU6xc^`6&YBb!o+j8cqo<N=R(}9$jhV7+(to@51Y)T
z9ox7!su!&I$IwdzXCxiwify!+4$s2Mgg&2L@Zydv+gCoFD%~qVDrqtyoW5*@xv`*_
zQ92&ILxP6ulutgVF)4V&YYNljb33w3)LBT=_h{GW+D3(eM<m!wOCRpBQ|0G4<ZH;#
z6?nLFAI>z{H98?cU6v1fQl*;~-8gaddgiFWBU)+}sLN_Ssye`+#y)6OJFB_+e*4P#
z@VCceXkM9TdzrUPzx~;v#E=riciBO_oA%^--D1ULWX><GsUzWvA_Z=o0-1BQCBnG<
zqWA{hBc+tY)Fskv-MD;LdfHfReq6fkK)CZEnqd@vF1EXAqmb#y&KUp)RRu!GkzrB#
zHRQY?qyJG(<C9!tyI}2f2uWOJCS2?QPWl7hGqA_$Wvzrn>_#5Bx`WbVhVE7#nWo)g
z3cKs7jq-a8e0X*D<Mo#7w2Z!W8C^6CnAyAFaWvjl*#FWS(GVIpgR)Qwr{bLwcCpcQ
zB)3|nC0wZBhok=e!bcc#yX=I1x$Co{PHd^U`le?3rB}0hp5!G=QqQ}U+0WTV%d*=<
zw%m4lLfVI8S6UTHtmPFMQ02AS64aJyZSKHpWZk}aZfl|Y{LIv@iw@jzq%(T8&nJ-K
zx*5I>I>0qeiRuu!^Cmw`^2*#i0h?p=II;?RKY!3qd1RtLrhdY6#V<kdrR~Z-@naFu
z<)e<(HH2Sk#xCEdld%FZ9vx|WU|i>1fA=;*M5v)7!bkgXE^^Y!ZF~}gv(NkYjzOb_
zd8rz*KuB+OKA}ayAc@DWg^&R2<8rbeAXV=^(RSkEl=Z}nwey_eVm?iqQXc=btZ(o0
zk9(a)J|5=AX(}`sb45t%a4F>WH&G4nOt1d=H{KMAi4i&sEao8PIePaz{mH71ITfGf
z7jbbw%CA^Ip@?tB8WUkMzZ)H_KLKRXc0g32{v>|O(Z6@bfUoY(k4vj%>zvi#IMMi>
z|KM@+i#;)092|^)cnI3CsaGHg^@5{U77k=WM@wHDkSj?C>C2O=a;f4;y3u0Na?bH)
zGaH2-GQWvLt^%?KcQ>Q_J1`*Ne}HEImV$@f@({`?0v*9?U93JVWkaiCshwHWXn+#~
z7ZQ%CZ{?pxFxOdt{ET7;uVUCvgOml%+|M16XHNoBNJS+V1G%(|&jFioUX>f5Zkvd2
zzBEN;t7Kc3m(0KO>bpj>mt5`Pl_*#vU_C(jY|o)@(0Bht{!|@=mfs)J4naUM2e^JX
z%n6y@;HquNFo-O|0=c3F2d5Yr&H?F8vl247LF&CQzWxYx<!VogMJk@*8=33rkywjH
zH*W}uaCa&wy|8Bi*&uIxr6T~tX2}M>qfGbYt1JcG!x~T(fD=OI(&o@-1O!5qn2YTv
z03CS?1IJ!$!hp#p+h&@rk<?xT;E5!ha}J6u0Ax;gNIZap@4<E$JATy=p|&}o`rm8%
zASTZ=*m#(u1rLU$K=Col8Rqb<JqY(1Rptd4v}Q1%tCAOh%dlK&*TS}G6>qvLricxa
zkw=X7%>YgK=H_J{X#QV}jzKJ7U_9TYBZ(A>sPX=o$JeB{mv$QD*0kPTM^peaw72$j
zp9rq%1KX6-&#h4+{yg6JOG1eau(Sd(n%}2kpdc2++kZf<Lc(GL)ctWbw4;QO>;#!L
zL}DSHG~NgemC#M(4Rgqy@cJ<EJMhfPSYH8=XQtqU{!LC(D*#2;gON7aVei3REMQu)
z5+5o6IGY86{g=QVaRv}e?TijfXvY%Rb819R0Z<PJqZ{0L4~Y$M!d>i;12FSg;KMRE
zp?2(!jtJbR4$YsLE{p<1>=AfvX#p`V_ZUN+1!zxEhhyL_rz#GZg#Q8X8w!P~6P~34
zsM`o$tJE;HhmHVL)mM#ZAc}!UT|e25fJvxR@Ez&O>oLcm=C_PbuK|z0Vu;*9Z1aE6
z3}s6L($bT*iZy_;Q(BE%jrs)1cjs07of6SCSl6I5*zDSVKzw8B5#F)Lvyilo$PfLh
z4u824oRU2rrOCGbUtR|tkl5Jz9XK$+Kyl}y-GKE=fT&q$^_GSL5V&p=^tFr?^tJb*
zb^|yt0^eeuByz&9pw1QiyE>Nyfb;TRNVN&9$+Lw0F3$oH@?|S>rV?<b;n|Q_KrOF_
zqS#cA?nW*o0N2-5O--RFBsO-*D&r^wJ@Dd&O&R6z0654;9yFwRgX=TK_9_4X`rsN2
z7H24BU3NxDo_+~vC<j-^;UNeMD|pN*H6DRMS=b9;1*dk~41|y5*kZPNQm>g@C8UFy
zTPj_%<7<d8l5>s&+O>d1^>orjz<5pq#>;Ae3dL%z(8E{Dug*_ywM<;&8K4L*CS*FY
z4o`5L0j6UAH&X!(!r>ra5)vk0WhVf`A+IyLO$Lc4q)DY&lvA}V7kBxBj1YSkVD<CJ
z744^h1Oq)1W0R16Xgy{FjFX!-S?ogt4GXXwCW5cTRH7+7a8hz~1%2qBqYjSP2IHE1
zdD6J>_Zq7Bl9bV)gW@mvWGj$1xgh><9vyBl+Y`WC3lZL^HviQ9|Bp5SqeOUkZBqD$
z?%xSreq8kj1=RiPhyZ3P%*xww<pCP?=Ka<fMt_L{y*a)n9@w?H(=Af6MfKq~W`<BJ
z16>$Qk{XS7!*wfoJ3j|p6u_u)_#jkSpgB4?I7M1-4rV{iRdBl%!(JFAS*JG^L@@|(
z?FvQf232;mBh69NI6KDrMln>(MR4ek#DDX>AD}!ApIaHorC-A}j~_V$(7IQkO&?3o
zY~a{21fceo+fM`G0f>{I<xpO07~+Z2qvQto)0<FTB=(ACW)k?S-1uR0fUbXmqN@FJ
zLCmH3b`|Kd-;-#!RL?rvayttZ#DZ*>k%ljoE$HK6BxXU<Lrl?sQZ>5_y^n-_OnTE{
zRT6*V{a~D@6aaO-#%HwwW88zZ%ATKtXT26r*Fz*%!m}%cw2g=;@Fzd=%nMD5o6~W-
zwH|?aTVrm5qmK_+M1~B{9}W}->V_Z`LQ>T?M__ytxIV6?T?kM7x(aRLNAsEYj<ldO
zNh76d%7%D|Za8Q$NaPRep9p6@iW}+3VXP7U|Lh0iJoMfm!vc|{iRs8(-EvGM=ye2n
z;4XN-*(7m6(njjg^%TK)1}g77;wTHC3l;(f3mrw~7*uPknqeg1ykM2&VBY&I7m{Iu
zVNW(sexMyD38-i@3ZYeJkjx8#kX;Rv6k1l~teiTFV9qck+{r;oB&<OKd?cLroEyX;
zfwQ$}LE(Cekri4-RZT*>>nH3bVKPOl2LHK)Zm?1!6s_s8c3StRu%N(m#7%fwVrmb|
zw^U7a0To9awPX&4cq5+?dY@ki{=x_e-~k0sWHS>+KH+>iGO$<yoX?y)&`Mauh+Nqv
zV3&L=#S(#&#E!+<PY_%Ld_9WmM+QEf6%tVCYeuw?i=cQ!7xMw;9kCi5yj6-~!i%U~
z{2;z0j{oe#;oUv{);>R=JTf@N;-nG+;ZH#Q`}BIl05{FKNByhXATYC^V-LG7bkQ^i
zzdJw*cDQR4uL$66At*f}!1OFMC0&#79a=J;JY`+SGqtC5=}=F$2gZsm);fM}xxDN1
zKxxnfA=@x8qhsRoSOuPGS9J3-jpy|0Vz!A)m#l_PW77o}L~r1!2hpHt&6I7>H!RBv
z(t)t;v;qAa_@KpMVv@~INYvA)y6BgI)-jctWutqa2J^XGwph^A<7P$<znbvhYG*nF
zqvL|b@d0oEG-Z8dhHd@c_=y{ESQ@SLsJoS`x40)9HA|QonX;yi?e?=t*3=fU2Chtq
zVTTiRy#dh3Wba-uhVf<aU|4kKO`-YrrrVvx)3E`&hAa|8!v-F&8-6?)T1>Oi@^jxM
zO6_fiz+OkZ1X2F!1@@~ZBSZ+6c<faq;Z@?XI+gy|^pAU=D7TSKV0vwbsKXtTb{9l*
zHMnwcRZP_ZVx7j(CXKlqvirS|+h`NPg0EeB2yU%KBg;IxHurrk0=^Eu_-O%)*1x~s
zTDxGKx4%x7zkl-uboiy^NZ3XF?FH7=DXQBazn(_}$cvQ|uu}iy*4hQ@YWQ#UMAPND
zYpJj=_}dHLLTDQ#ertv>u&njmgEUIBD_ao3gyjX$Fa!vGGe-vNP4p%SW-gu%n_<>6
z;LR#FG}I2Ung6X6c;PQSJ=R)kp2vV0IJ~cm|NXQ71?I0>-H>Zirre;sJhu-1>1mxj
Kp8do5>;D4rHz)l7

literal 0
HcmV?d00001

diff --git a/docs/source/dynamo/custom-backends.rst b/docs/source/dynamo/custom-backends.rst
new file mode 100644
index 0000000000000..2c8b338045e62
--- /dev/null
+++ b/docs/source/dynamo/custom-backends.rst
@@ -0,0 +1,154 @@
+Custom Backends
+===============
+
+Debugging Backend
+-----------------
+
+Suppose you wanted to better understand what is going on during a
+compilation you can create a custom compiler which we’ll refer to as a
+backend that will print pretty print the fx ``GraphModule`` extracted
+from dynamo’s bytecode analysis and return a ``forward()`` callable.
+
+.. code-block:: python
+
+   from typing import List
+   import torch
+   import torch._dynamo as dynamo
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       print("my_compiler() called with FX graph:")
+       gm.graph.print_tabular()
+       return gm.forward  # return a python callable
+   @dynamo.optimize(my_compiler)
+   def fn(x, y):
+       a = torch.cos(x)
+       b = torch.sin(y)
+       return a + b
+   fn(torch.randn(10), torch.randn(10))
+
+Running the above example produces the following output:
+
+::
+
+   my_compiler() called with FX graph:
+   opcode         name    target                                                  args        kwargs
+   -------------  ------  ------------------------------------------------------  ----------  --------
+   placeholder    x       x                                                       ()          {}
+   placeholder    y       y                                                       ()          {}
+   call_function  cos     <built-in method cos of type object at 0x7f1a894649a8>  (x,)        {}
+   call_function  sin     <built-in method sin of type object at 0x7f1a894649a8>  (y,)        {}
+   call_function  add     <built-in function add>                                 (cos, sin)  {}
+   output         output  output                                                  ((add,),)   {}
+
+This works for ``torch.nn.Module`` as well as shown below
+
+.. code-block:: python
+
+   import torch
+   import torch._dynamo as dynamo
+   class MockModule(torch.nn.Module):
+       def __init__(self):
+           super().__init__()
+           self.relu = torch.nn.ReLU()
+       def forward(self, x):
+           return self.relu(torch.cos(x))
+   mod = MockModule()
+   optimized_mod = dynamo.optimize(my_compiler)(mod)
+   optimized_mod(torch.randn(10))
+
+Let’s take a look at one more example with control flow.
+
+.. code-block:: python
+
+   from typing import List
+   import torch
+   import torch._dynamo as dynamo
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       print("my_compiler() called with FX graph:")
+       gm.graph.print_tabular()
+       return gm.forward  # return a python callable
+   @dynamo.optimize(my_compiler)
+   def toy_example(a, b):
+       x = a / (torch.abs(a) + 1)
+       if b.sum() < 0:
+           b = b * -1
+       return x * b
+   for _ in range(100):
+       toy_example(torch.randn(10), torch.randn(10))
+
+Running this example produces the following output:
+
+::
+
+   my_compiler() called with FX graph:
+   opcode         name     target                                                  args              kwargs
+   -------------  -------  ------------------------------------------------------  ----------------  --------
+   placeholder    a        a                                                       ()                {}
+   placeholder    b        b                                                       ()                {}
+   call_function  abs_1    <built-in method abs of type object at 0x7f8d259298a0>  (a,)              {}
+   call_function  add      <built-in function add>                                 (abs_1, 1)        {}
+   call_function  truediv  <built-in function truediv>                             (a, add)          {}
+   call_method    sum_1    sum                                                     (b,)              {}
+   call_function  lt       <built-in function lt>                                  (sum_1, 0)        {}
+   output         output   output                                                  ((truediv, lt),)  {}
+
+   my_compiler() called with FX graph:
+   opcode         name    target                   args         kwargs
+   -------------  ------  -----------------------  -----------  --------
+   placeholder    b       b                        ()           {}
+   placeholder    x       x                        ()           {}
+   call_function  mul     <built-in function mul>  (b, -1)      {}
+   call_function  mul_1   <built-in function mul>  (x, mul)     {}
+   output         output  output                   ((mul_1,),)  {}
+
+   my_compiler() called with FX graph:
+   opcode         name    target                   args       kwargs
+   -------------  ------  -----------------------  ---------  --------
+   placeholder    b       b                        ()         {}
+   placeholder    x       x                        ()         {}
+   call_function  mul     <built-in function mul>  (x, b)     {}
+   output         output  output                   ((mul,),)  {}
+
+The order of the last two graphs is nondeterministic depending
+on which one is encountered first by the just-in-time compiler.
+
+Speedy Backend
+--------------
+
+Integrating a custom backend that offers superior performance is also
+easy and we’ll integrate a real one
+with `optimize_for_inference <https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html>`__:
+
+.. code-block :: python
+
+   def optimize_for_inference_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       scripted = torch.jit.trace(gm, example_inputs)
+       return torch.jit.optimize_for_inference(scripted)
+
+And then you should be able to optimize any existing code with
+
+.. code-block:: python
+
+   @dynamo.optimize(optimize_for_inference_compiler)
+   def code_to_accelerate():
+       ...
+
+Composable Backends
+-------------------
+
+TorchDynamo includes many backends, which can be found in
+`backends.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py>`__
+or ``torchdynamo.list_backends()``. You can combine these backends
+together with the following code:
+
+.. code-block:: python
+
+   from torch._dynamo.optimizations import BACKENDS
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       trt_compiled = BACKENDS["tensorrt"](gm, example_inputs)
+       if trt_compiled is not None:
+           return trt_compiled
+       # first backend failed, try something else...
+       cudagraphs_compiled = BACKENDS["cudagraphs"](gm, example_inputs)
+       if cudagraphs_compiled is not None:
+           return cudagraphs_compiled
+       return gm.forward
diff --git a/docs/source/dynamo/deep-dive.rst b/docs/source/dynamo/deep-dive.rst
new file mode 100644
index 0000000000000..c60047c2a3d8d
--- /dev/null
+++ b/docs/source/dynamo/deep-dive.rst
@@ -0,0 +1,145 @@
+TorchDynamo Deeper Dive
+=======================
+**Author**: `Jason Ansel <https://github.com/jansel>`_
+
+What is a guard?
+----------------
+
+TorchDynamo operates just-in-time and specializes graphs based on
+dynamic properties. For example, the first graph above has the following
+guards:
+
+::
+
+   GUARDS:
+    - local 'a' TENSOR_MATCH
+    - local 'b' TENSOR_MATCH
+    - global 'torch' FUNCTION_MATCH
+
+If any of those guards fail, the graph will be recaptured and
+recompiled. The interesting guard type there is ``TENSOR_MATCH``, which
+checks the following torch.Tensor properties:
+
+- Python class of the tensor (tensor subclassing, etc)
+- dtype
+- device
+- requires_grad
+- dispatch_key (with thread-local includes/excludes applied)
+- ndim
+- sizes\* (optional)
+- strides\* (optional)
+
+For sizes/strides you can disable this specialization by setting the
+following parameter:
+
+.. code-block:: python
+
+torch._dynamo.config.dynamic_shapes = True
+
+The full specialization mode allows the backend compiler to assume an
+entirely static graph. Unfortunately, most backends require this.
+Operators which return dynamic shapes will trigger a graph break when
+not in dynamic shape mode.
+
+What is dynamo doing?
+---------------------
+
+If you want to understand better what TorchDynamo is doing, you can set:
+
+.. code-block:: python
+
+   torchdynamo.config.debug = True
+
+which triggers useful (but spammy) printouts.
+
+For example, the printouts for the first graph in the ``toy_example``
+above are:
+
+::
+
+   __compiled_fn_0 <eval_with_key>.1
+   opcode         name     target                                                  args              kwargs
+   -------------  -------  ------------------------------------------------------  ----------------  --------
+   placeholder    a        a                                                       ()                {}
+   placeholder    b        b                                                       ()                {}
+   call_function  abs_1    <built-in method abs of type object at 0x7f9ca082f8a0>  (a,)              {}
+   call_function  add      <built-in function add>                                 (abs_1, 1)        {}
+   call_function  truediv  <built-in function truediv>                             (a, add)          {}
+   call_method    sum_1    sum                                                     (b,)              {}
+   call_function  lt       <built-in function lt>                                  (sum_1, 0)        {}
+   output         output   output                                                  ((truediv, lt),)  {}
+
+   ORIGINAL BYTECODE toy_example example.py 9
+    10           0 LOAD_FAST                0 (a)
+                 2 LOAD_GLOBAL              0 (torch)
+                 4 LOAD_METHOD              1 (abs)
+                 6 LOAD_FAST                0 (a)
+                 8 CALL_METHOD              1
+                10 LOAD_CONST               1 (1)
+                12 BINARY_ADD
+                14 BINARY_TRUE_DIVIDE
+                16 STORE_FAST               2 (x)
+
+    11          18 LOAD_FAST                1 (b)
+                20 LOAD_METHOD              2 (sum)
+                22 CALL_METHOD              0
+                24 LOAD_CONST               2 (0)
+                26 COMPARE_OP               0 (<)
+                28 POP_JUMP_IF_FALSE       38
+
+    12          30 LOAD_FAST                1 (b)
+                32 LOAD_CONST               3 (-1)
+                34 BINARY_MULTIPLY
+                36 STORE_FAST               1 (b)
+
+    13     >>   38 LOAD_FAST                2 (x)
+                40 LOAD_FAST                1 (b)
+                42 BINARY_MULTIPLY
+                44 RETURN_VALUE
+
+   MODIFIED BYTECODE
+     9           0 LOAD_GLOBAL              3 (__compiled_fn_0)
+                 2 LOAD_FAST                0 (a)
+                 4 LOAD_FAST                1 (b)
+                 6 CALL_FUNCTION            2
+                 8 UNPACK_SEQUENCE          2
+                10 STORE_FAST               2 (x)
+                12 POP_JUMP_IF_FALSE       24
+                14 LOAD_GLOBAL              4 (__resume_at_30_1)
+                16 LOAD_FAST                1 (b)
+                18 LOAD_FAST                2 (x)
+                20 CALL_FUNCTION            2
+                22 RETURN_VALUE
+           >>   24 LOAD_GLOBAL              5 (__resume_at_38_2)
+                26 LOAD_FAST                1 (b)
+                28 LOAD_FAST                2 (x)
+                30 CALL_FUNCTION            2
+                32 RETURN_VALUE
+
+   GUARDS:
+    - local 'a' TENSOR_MATCH
+    - local 'b' TENSOR_MATCH
+    - global 'torch' FUNCTION_MATCH
+
+At the top you can see the FX graph (which we already shared above).
+Next you see the original bytecode of the function, followed by the
+modified bytecode generated by TorchDynamo. Finally, you see the guards
+which we covered above.
+
+In the modified bytecode ``__compiled_fn_0`` is the return value of
+``my_compiler()`` (the compiled graph). ``__resume_at_30_1`` and
+``__resume_at_38_2`` are both generated continuation functions that pick
+up execution after a graph break (at bytecode offsets 30 and 38). Each
+of these functions take the form:
+
+::
+
+   __resume_at_<offset>:
+       ... restore stack state if needed ...
+       JUMP_ABSOLUTE <offset> into toy_example
+       ... original bytecode of toy_example ...
+
+By generating this `resume_at` function we force the remainder of the
+function to be executed in a new Python frame which recursively
+triggers TorchDynamo to restart its capture once execution reaches that
+point for the first time.
diff --git a/docs/source/dynamo/faq.rst b/docs/source/dynamo/faq.rst
new file mode 100644
index 0000000000000..2b66e81ebc694
--- /dev/null
+++ b/docs/source/dynamo/faq.rst
@@ -0,0 +1,376 @@
+Frequently Asked Questions
+==========================
+
+At a high level, the TorchDynamo stack consists of a graph capture from
+Python code using dynamo and a backend compiler. In this example the
+backend compiler consists of backward graph tracing using AOTAutograd
+and graph lowering using TorchInductor. There are of course many more
+compilers available `here <https://github.com/pytorch/torchdynamo/blob/0b8aaf340dad4777a080ef24bf09623f1aa6f3dd/README.md#existing-backend>`__
+but for this document we will focus on inductor as a motivating example.
+
+Torchdynamo supports training, using AotAutograd to capture backwards:
+
+   1. the ``.forward()`` graph and ``optimizer.step()`` is captured by torchdynamo’s python evalframe frontend
+   2. for each segment of ``.forward()`` that torchdynamo captures, it uses AotAutograd to generate a backward graph segment
+   3. each pair of forward, backward graph are (optionally) min-cut partitioned to save the minimal state between forward/backward
+   4. the forward, backward pairs are wrapped in autograd.function modules 5. usercode calling\ ``.backward()`` still triggers eager’s autograd engine, which runs each ‘compiled backward’ graph as if it were one op, also running any non-compiled eager ops’ .backward() functions
+
+Do you support Distributed code?
+--------------------------------
+
+DDP has been tested and works, support for other distributed training
+libraries is under discussion.
+
+The main reason why Distributed code is challenging with dynamo is
+because AOTAutograd unrolls both the forward and backward pass and
+provides 2 graphs for backends to optimize. This is a problem for
+distributed code because we’d like to ideally overlap communication
+operations with computations. Eager pytorch accomplishes this in
+different ways for DDP/FSDP- using autograd hooks, module hooks, and
+modifications/mutations of module states. In a naive application of
+dynamo, hooks that should run directly after an operation during
+backwards may be delayed until after the entire compiled region of
+backwards ops, due to how AOTAutograd compiled functions interact with
+dispatcher hooks.
+
+The basic strategy for optimizing DDP with Dynamo is outlined in
+`distributed.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/distributed.py>`__
+where the main idea will be to graph break on `DDP bucket
+boundaries <https://pytorch.org/docs/stable/notes/ddp.html#internal-design>`__.
+
+When each node in DDP needs to synchronize its weights with the other
+nodes it organizes its gradients and parameters into buckets which
+reduces communication times and allows a node to broadcast a fraction of
+its gradients to other waiting nodes.
+
+Graph breaks in distributed code means you can expect dynamo and its
+backends to optimize the compute overhead of a distributed program but
+not its communication overhead. Graph-breaks may interfere with
+compilation speedups, if the reduced graph-size robs the compiler of
+fusion opportunities. However, there are diminishing returns with
+increasing graph size since most of the current compute optimizations
+are local fusions. So in practice this approach may be sufficient.
+
+Do I still need to export whole graphs?
+---------------------------------------
+
+For the vast majority of models you probably don’t and you can use
+``torch._dynamo()`` optimize as is but there are a few situations where
+full graphs are necessary and you can can ensure a full graph by simply
+running ``torch.dynamo(..., nopython=True)`` \* Large scale training
+runs, think $250K+ that require pipeline parallelism and other advanced
+sharding strategies \* Inference optimizers like
+`TensorRT <https://github.com/pytorch/TensorRT>`__ or
+`AITemplate <https://github.com/facebookincubator/AITemplate>`__ that rely
+on fusing much more aggressively than training optimizers \* Mobile training or
+inference.
+
+Future work will include tracing communication operations into graphs,
+coordinating these operations with compute optimizations, and optimizing
+the communciation operations.
+
+Why is my code crashing?
+------------------------
+
+If your code ran just fine without dynamo and started to crash with it
+enabled then the most important first step is figuring out which part of
+the stack your failure occurred in so try running things in the below
+order and only try the next step if the previous step succeeded.
+
+1. ``dynamo.optimize("eager")`` which only runs torchdynamo forward graph
+   capture and then runs the captured graph with PyTorch. If this fails
+   then there’s an issue with TorchDynamo.
+
+2. ``dynamo.optimize("aot_eager")``
+   which runs torchdynamo to capture a forward graph, and then AOTAutograd
+   to trace the backward graph without any additional backend compiler
+   steps. PyTorch eager will then be used to run the forward and backward
+   graphs. If this fails then there’s an issue with AOTAutograd.
+
+3. ``dynamo.optimize("inductor")`` which runs torchdynamo to capture a
+   forward graph, and then AOTAutograd to trace the backward graph with the
+   TorchInductor compiler. If this fails then there’s an issue with TorchInductor
+
+TorchDynamo Errors
+~~~~~~~~~~~~~~~~~~
+
+If the error that is generated occurs with the ``"eager"`` backend, then
+torchdynamo is the most likely source of the error.
+
+To debug these issues we recommend setting
+``torch._dynamo.config.verbose=True`` to get a full stack trace to both
+the error in torchdynamo and the user code. In addition to this flag,
+you can also set the ``log_level`` of torchdynamo through
+``torch._dynamo.config.log_level``. The available levels are the
+following: - ``logging.DEBUG``: Print every instruction that is
+encountered in addition to all below log levels - ``logging.INFO``:
+Print each function that is compiled (original and modified bytecode)
+and the graph that is captured in addition to all below log levels -
+``logging.WARNING`` (default): Print graph breaks in addition to all
+below log levels - ``logging.ERROR``: Print errors only
+
+If a model is sufficiently large, the logs can become overwhelming. If
+an error occurs deep within a model’s python code, it can be useful to
+execute only the frame in which the error occurs to enable easier
+debugging. There are 2 tools available to enable this:
+
+* ``env TORCHDYNAMO_DEBUG_FUNCTION=<desired_function_name>`` will only run TorchDynamo on functions with that name.
+
+* ``env torch._dynamo.config.replay_record_enabled = True``) which dumps an execution record when an error is encountered. This record can then be replayed to run only the frame where an error occurred.
+
+TorchInductor Errors
+--------------------
+
+With TorchInductor as the chosen backend, AOTAutograd is used to
+generate the backward graph from the forward graph captured by
+torchdynamo. It’s important to note that errors can occur during this
+tracing and also while TorchInductor lowers the forward and backward
+graphs to GPU code or C++.
+
+A model can often consist of hundreds or thousands of FX nodes, so
+narrowing the exact nodes where this problem occurred can be very
+difficult which is why we highly recommend you use our minifier to
+create tiny reproducible examples of failures you’re seeing. We can
+minify errors that occur either at the AOTAutograd layer or Inductor
+layer which you should try in the following order.
+
+1. ``env TORCHDYNAMO_REPRO_AFTER="aot" python your_model.py``
+2.  ``env TORCHDYNAMO_REPRO_AFTER="dynamo" python your_model.py``
+
+Minifying your error is the quickest path to getting it fixed.
+
+The minifier will actually create a ``repro.py`` for you at the location
+set by ``env TORCHDYNAMO_REPRO_DIR`` so make you have right access to
+that directory. You can then run ``python repro.py`` and confirm that
+you are getting the same error.
+
+.. note::
+   For other compilers such as nvfuser, the process is similar but
+   instead you would leverage ``env TORCHDYNAMO_REPRO_AFTER="dynamo" python your_model.py``.
+
+Why is compilation slow?
+------------------------
+
+Dynamo Compilation
+~~~~~~~~~~~~~~~~~~
+
+TorchDynamo has a builtin stats function for collecting and displaying
+the time spent in each compilation phase. These stats can be accessed by
+calling ``torch._dynamo.utils.compile_times()`` after executing
+``torch._dynamo``. By default, this returns a string representation of
+the compile times spent in each TorchDynamo function by name.
+
+Inductor Compilation
+~~~~~~~~~~~~~~~~~~~~
+
+TorchInductor has a builtin stats and trace function for displaying time
+spent in each compilation phase, output code, output graph visualization
+and IR dump. ``env TORCHINDUCTOR_TRACE=1 python repro.py``. This is a
+debugging tool designed to make it easier to debug/understand the
+internals of TorchInductor with an output that will look something like
+`this <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
+
+Each file in that debug trace can be enabled/disabled via
+``torch._inductor.config.trace.*``. The profile and the diagram are both
+disabled by default since they are expensive to generate. See the
+`example debug directory
+output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
+for more examples.
+
+Excessive Recompilation
+~~~~~~~~~~~~~~~~~~~~~~~
+
+When TorchDynamo compiles a function (or part of one), it makes certain
+assumptions about locals and globals in order to allow compiler
+optimizations, and expresses these assumptions as guards that check
+particular values at runtime. If any of these guards fail, Dynamo will
+recompile that function (or part) up to
+``torch._dynamo.config.cache_size_limit`` times. If your program is
+hitting the cache limit, you will first need to determine which guard is
+failing and what part of your program is triggering it.
+
+The `recompilation profiler <#recompilation-profiler>`__ automates the
+process of setting TorchDynamo’s cache limit to 1 and running your
+program under an observation-only ‘compiler’ that records the causes of
+any guard failures. You should be sure to run your program for at least
+as long (as many iterations) as you were running when you ran into
+trouble, and the profiler will accumulate statistics over this duration.
+
+.. code-block:: python
+
+   prof = dynamo.utils.CompilationProfiler()
+   @dynamo.optimize(prof)
+   def my_model():
+       ...
+   my_model()
+   print(prof.report())
+
+Many of the reasons for graph breaks and excessive recompilation will be
+fixed with upcoming support for `tracing dynamic tensor
+shapes <https://docs.google.com/document/d/1QJB-GOnbv-9PygGlOMXwiO9K6vVNm8sNg_olixJ9koc/edit?usp=sharing>`__,
+more careful choices for guards and better tuned heuristics.
+
+Why are you recompiling in production?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In some cases, you may not want unexpected compiles after a program has
+warmed up. For example, if you are serving production traffic in a
+latency critical application. For this, TorchDynamo provides an
+alternate mode where prior compiled graphs are used, but no new ones are
+generated:
+
+.. code-block:: python
+
+   frozen_toy_example = dynamo.run(toy_example)
+   frozen_toy_example(torch.randn(10), torch.randn(10))
+
+How are you speeding up my code?
+--------------------------------
+
+There are 3 major ways to accelerat PyTorch code:
+
+1. Kernel fusion via vertical fusions which fuse sequential operations to avoid
+   excessive read/writes. For example, fuse 2 subsequent cosines means you
+   can can do 1 read 1 write instead 2 reads 2 writes 2. Horizontal fusion:
+   the simplest example being batching where a single matrix is multiplied
+   with a batch of examples but the more general scenario is a grouped GEMM
+   where a group of matrix multiplications are scheduled together
+
+2. Out of order execution: A general optimization for compilers, by looking ahead
+   at the exact data dependencies within a graph we can decide on the most
+   opportune time to execute a node and which buffers can be reused
+
+3. Automatic work placement: Similar of the out of order execution point,
+   but by matching nodes of a graph to resources like physical hardware or
+   memory we can design an appropriate schedule
+
+The above are general principles for accelerating PyTorch code but
+different backends will each make different tradeoffs on what to
+optimize. For example Inductor first takes care of fusing whatever it
+can and only then generates `Triton <https://openai.com/blog/triton/>`__
+kernels. It can also
+
+Triton in addition offers speedups because of automatic memory
+coalescing, memory management and scheduling within each Streaming
+Multiprocessor and has been designed to handle tiled computations.
+
+However, regardless of the backend you use it’s best to use a benchmark
+and see approach so try out the PyTorch profiler, visually inspect the
+generated kernels and try to see what’s going on for yourself.
+
+Why am I not seeing speedups?
+-----------------------------
+
+Graph Breaks
+~~~~~~~~~~~~
+
+The main reason you won’t see the speedups you’d like to by using dynamo
+is excessive graph breaks. So what’s a graph break?
+
+Given a program like:
+
+.. code-block:: python
+
+   @dynamo.optimize(...)
+   def some_fun(x):
+       ...
+   some_fun(x)
+   ...
+
+Torchdynamo will attempt to compile all of the torch/tensor operations
+within ``some_fun()`` into a single FX graph, but it may fail to capture
+everything into one graph.
+
+Some graph break reasons are insurmountable to TorchDynamo like calling
+into a C extension other than torch is invisible to torchdynamo, and
+could do arbitrary things without TorchDynamo being able to introduce
+necessary guards to ensure that the compiled program would be safe to reuse.
+
+   To maximize performance, it’s important to have as few graph breaks
+   as possible.
+
+Identifying the cause of a graph break
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To identify all graph breaks in a program and the associated reasons for
+the breaks, ``torch._dynamo.explain`` can be used. This tool runs
+TorchDynamo on the supplied function and aggregates the graph breaks
+that are encountered. Here is an example usage:
+
+.. code-block:: python
+
+   import torch
+   import torch._dynamo as dynamo
+   def toy_example(a, b):
+       x = a / (torch.abs(a) + 1)
+       print("woo")
+       if b.sum() < 0:
+           b = b * -1
+       return x * b
+   explanation, out_guards, graphs, ops_per_graph = dynamo.explain(toy_example, torch.randn(10), torch.randn(10))
+   print(explanation)
+   """
+   Dynamo produced 3 graphs, with 2 graph break and 6 ops.
+    Break reasons:
+   1. call_function BuiltinVariable(print) [ConstantVariable(str)] {}
+      File "t2.py", line 16, in toy_example
+       print("woo")
+
+   2. generic_jump
+      File "t2.py", line 17, in toy_example
+       if b.sum() < 0:
+    """
+
+To throw an error on the first graph break encountered you can use
+disable python fallback by using ``nopython=True``, this should be
+familiar if you’ve worked with export based compilers.
+
+.. code-block:: python
+
+   @dynamo.optimize(<compiler>, nopython=True)
+   def toy_example(a, b):
+      ...
+
+Why didn’t my code recompile when I changed it?
+-----------------------------------------------
+
+If you went ahead and enabled dynamic shapes via
+``env TORCHDYNAMO_DYNAMIC_SHAPES=1 python model.py`` then your code
+won’t recompile on shape changes. We’ve added support for dynamic shapes
+which avoids recompilations in the case when shapes vary by less than a
+factor of 2. This is especially useful in scenarios like varying image
+sizes in CV or variable sequence length in NLP. In inference scenarios
+it’s often not possible to know what a batch size will be beforehand
+because you take what you can get from different client apps.
+
+In general, TorchDynamo tries very hard not to recompile things
+unnecessarily so if for example torchdynamo finds 3 graphs and your
+change only modified one graph then only that graph will recompile. So
+another tip to avoid potentially slow compilation times is to warmup a
+model by compiling it once after which subsequent compilations will be
+much faster. Cold start compile times is still a metric we track
+visibly.
+
+Why am I getting incorrect results?
+-----------------------------------
+
+Accuracy issues can also be minified if you set the environment variable
+``TORCHDYNAMO_REPRO_LEVEL=4``, it operates with a similar git bisect
+model and a full repro might be something like
+``TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4`` the reason
+we need this is downstream compilers will codegen code whether it’s
+Triton code or the C++ backend, the numerics from those downstream
+compilers can be different in subtle ways yet have dramatic impact on
+your training stability. So the accuracy debugger is very useful for us
+to detect bugs in our codegen or with a backend compiler.
+
+Why am I getting OOMs?
+----------------------
+
+Dynamo is still an alpha product so there’s a few sources of OOMs and if
+you’re seeing an OOM try disabling the following configurations in this
+order and then open an issue on Github so we can solve the root problem
+1. If you’re using dynamic shapes try disabling them, we’ve disabled
+them by default: ``env TORCHDYNAMO_DYNAMIC_SHAPES=0 python model.py`` 2.
+CUDA graphs with Triton are enabled by default in inductor but removing
+them may alleviate some OOM issues: ``torch._inductor.config.triton.cudagraphs = False``.
\ No newline at end of file
diff --git a/docs/source/dynamo/get-started.rst b/docs/source/dynamo/get-started.rst
new file mode 100644
index 0000000000000..44434d49e525d
--- /dev/null
+++ b/docs/source/dynamo/get-started.rst
@@ -0,0 +1,181 @@
+Getting Started
+===============
+
+Let’s start with a simple example and make things more complicated step
+by step. Please note that you’re likely to see more significant speedups
+the newer your GPU is.
+
+.. code:: python
+
+   from torch._dynamo import optimize
+   import torch
+   def fn(x, y):
+       a = torch.cos(x).cuda()
+       b = torch.sin(y).cuda()
+       return a + b
+   new_fn = optimize("inductor")(fn)
+   input_tensor = torch.randn(10000).to(device="cuda:0")
+   a = new_fn()
+
+This example will not actually run faster. Its purpose is to demonstrate
+the ``torch.cos()`` and ``torch.sin()`` features which are
+examples of pointwise ops as in they operate element by element on a
+vector. A more famous pointwise op you might actually want to use would
+be something like ``torch.relu()``. Pointwise ops in eager mode are
+suboptimal because each one would need to need to read a tensor from
+memory, make some changes and then write back those changes. The single
+most important optimization that inductor does is fusion. So back to our
+example we can turn 2 reads and 2 writes into 1 read and 1 write which
+is crucial especially for newer GPUs where the bottleneck is memory
+bandwidth (how quickly you can send data to a GPU) instead of compute
+(how quickly your GPU can crunch floating point operations)
+
+Another major optimization that inductor makes available is automatic
+support for CUDA graphs.
+CUDA graphs help eliminate the overhead from launching individual
+kernels from a python program which is especially relevant for newer GPUs.
+
+dynamo supports many different backends but inductor specifically works
+by generating `Triton <https://github.com/openai/triton>`__ kernels and
+we can inspect them by running ``TORCHINDUCTOR_TRACE=1 python trig.py``
+with the actual generated kernel being
+
+.. code:: python
+
+   @pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
+   @triton.jit
+   def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+       xnumel = 10000
+       xoffset = tl.program_id(0) * XBLOCK
+       xindex = xoffset + tl.reshape(tl.arange(0, XBLOCK), [XBLOCK])
+       xmask = xindex < xnumel
+       x0 = xindex
+       tmp0 = tl.load(in_ptr0 + (x0), xmask)
+       tmp1 = tl.sin(tmp0)
+       tmp2 = tl.sin(tmp1)
+       tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
+
+And you can verify that fusing the two ``sins`` did actually occur
+because the two ``sin`` operations occur within a single Triton kernel
+and the temporary variables are held in registers with very fast access.
+
+You can read up a lot more on Triton’s performance
+`here <https://openai.com/blog/triton/>`__ but the key is it’s in python
+so you can easily understand it even if you haven’t written all that
+many CUDA kernels.
+
+As a next step let’s try a real model like resnet50 from the PyTorch
+hub.
+
+.. code:: python
+
+   import torch
+   import torch._dynamo as dynamo
+   model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
+   opt_model = dynamo.optimize("inductor")(model)
+   model(torch.randn(1,3,64,64))
+
+And that’s not the only available backend, you can run in a REPL
+``dynamo.list_backends()`` to see all the available ones. Try out the
+``aot_cudagraphs`` or ``nvfuser`` next as inspiration.
+
+Let’s do something a bit more interesting now, our community frequently
+uses pretrained models from
+`transformers <https://github.com/huggingface/transformers>`__ or
+`TIMM <https://github.com/rwightman/pytorch-image-models>`__ and one of
+our design goals is for dynamo and inductor to work out of the box with
+any model that people would like to author.
+
+So we’re going to directly download a pretrained model from the
+HuggingFace hub and optimize it:
+
+.. code:: python
+
+   import torch
+   from transformers import BertTokenizer, BertModel
+   import torch._dynamo as dynamo
+   # Copy pasted from here https://huggingface.co/bert-base-uncased
+   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+   model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0")
+   model = dynamo.optimize("inductor")(model) # This is the only line of code that we changed
+   text = "Replace me by any text you'd like."
+   encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0")
+   output = model(**encoded_input)
+
+If you remove the ``to(device="cuda:0")`` from the model and
+encoded_input then triton will generate C++ kernels that will be
+optimized for running on your CPU. You can inspect both Triton or C++
+kernels for BERT, they’re obviously more complex than the trigonometry
+example we had above but you can similarly skim it and understand if you
+understand PyTorch.
+
+Similarly let’s try out a TIMM example
+
+.. code:: python
+
+   import timm
+   import torch._dynamo as dynamo
+   import torch
+   model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
+   opt_model = dynamo.optimize("inductor")(model)
+   opt_model(torch.randn(64,3,7,7))
+
+Our goal with dynamo and inductor was to build the highest coverage ML compiler which should work with any model you throw at it.
+
+Existing Backends
+~~~~~~~~~~~~~~~~~
+
+TorchDynamo has a growing list of backends, which can be found in
+`backends.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py>`__
+or ``torchdynamo.list_backends()`` each of which with its optional dependencies.
+
+Some of the most commonly used backend include:
+
+* **Debugging backends**: \* ``dynamo.optimize("eager")`` - Uses PyTorch
+  to run the extracted GraphModule. This is quite useful in debugging
+  TorchDynamo issues. \* ``dynamo.optimize("aot_eager")`` - Uses
+  AotAutograd with no compiler, i.e, just using PyTorch eager for the
+  AotAutograd’s extracted forward and backward graphs. This is useful for
+  debugging, and unlikely to give speedups.
+
+* **Training & inference backends**: \* ``dynamo.optimize("inductor")`` -
+  Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging
+  codegened Triton kernels `Read
+  more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
+
+  * ``dynamo.optimize("nvfuser")`` - nvFuser with TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
+
+  * ``dynamo.optimize("aot_nvfuser")`` - nvFuser with AotAutograd. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
+
+  * ``dynamo.optimize("aot_cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
+
+* **Inference-only backend**\ s: \* ``dynamo.optimize("ofi")`` - Uses
+  Torchscript optimize_for_inference. `Read
+  more <https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html>`__
+
+  * ``dynamo.optimize("fx2trt")`` - Uses Nvidia TensorRT for inferenc optimizations. `Read more <https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst>`__
+
+  * ``dynamo.optimize("onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__ \* ``dynamo.optimize("ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
+
+Why do you need another way of optimizing PyTorch code?
+-------------------------------------------------------
+
+While a number of other code optimization tools exist in the PyTorch
+ecosystem, each of them has its own flow. Here is a few examples of
+existing methods and their limitations:
+
+-  ``torch.jit.trace()`` is silently wrong if it cannot trace e.g:
+   during control flow
+-  ``torch.jit.script()`` requires modifications to user or library code
+   by adding type annotations and removing non PyTorch code
+-  ``torch.fx.symbolic_trace()`` either traces correctly or gives a hard
+   error but it’s limited to traceable code so still can’t handle
+   control flow
+-  ``torch._dynamo`` works out of the box and produces partial graphs.
+   It still has the option of producing a single graph with
+   ``nopython=True`` which are needed for `some
+   situations <./documentation/FAQ.md#do-i-still-need-to-export-whole-graphs>`__
+   but allows a smoother transition where partial graphs can be
+   optimized without code modification
+
+.. |image0| image:: ../_static/img/dynamo/TorchDynamo.png
diff --git a/docs/source/dynamo/guards-overview.rst b/docs/source/dynamo/guards-overview.rst
new file mode 100644
index 0000000000000..99a004ec221c3
--- /dev/null
+++ b/docs/source/dynamo/guards-overview.rst
@@ -0,0 +1,513 @@
+Guards Overview
+===============
+
+From a UX perspective, TorchDynamo is very easy to use. The user invokes
+``torchdynamo.optimize`` as an annotation:
+
+.. code-block:: python
+
+   @torchdynamo.optimize(my_compiler)
+   def fn_foo(bar):
+
+Where a complete example looks like this:
+
+.. code-block:: python
+
+   from typing import List
+   import torch
+   import torchdynamo
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       print("my_compiler() called with FX graph:")
+       gm.graph.print_tabular()
+       return gm.forward  # return a python callable
+   @torchdynamo.optimize(my_compiler)
+   def toy_example(a, b):
+       x = a / (torch.abs(a) + 1)
+       if b.sum() < 0:
+           b = b * -1
+       return x * b
+   for _ in range(100):
+       toy_example(torch.randn(10), torch.randn(10))
+
+This allows TorchDynamo to capture the interpreted Python frames, grab
+any and all relevant information, and speed things up wherever it can.
+The speedup comes from a few places, and can be rather dependent on the
+backend (my_compiler above) provided, but the one speedup we care about
+most for today’s overview is **caching**. Caching itself is not a direct
+speedup, so much as a critical enablement to allow us to prevent
+recompilation. We dig a hole with dynamo, and caching allows us to get
+out. Its a speedup from that perspective, but relatively neutral when
+all things are considered - however, it enables us to hold perf
+neutrality while then enabling backends - the true source of our
+speedups.
+
+With even a pass-through no-op backend provided:
+
+.. code-block:: python
+
+   def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
+       return gm.forward
+
+We can see TorchDynamo speeding up Python execution quite a bit, even on
+regular Python, not just PyTorch.
+
+Caching and Guards Overview
+---------------------------
+
+TorchDynamo operates through caching transformed (by TorchDynamo) user
+bytecode. When we receive a frame for evaluation, we check if the
+**objects referenced in the frame have changed** in certain ways, and if
+not, we read the previously transformed user bytecode to evaluate it.
+The details of how we do this will be saved for a later writeup.
+Instead, we will focus on how we can identify whether or not the
+**objects referenced in the frame have changed**. This is a critical
+piece of functionality in TorchDynamo, because it drives the entire
+invalidation lifecycle. We refer to this functionality as **guards**.
+
+At a very high level, the vastly oversimplified TLDR flow is this:
+
+1) We receive a python frame
+2) We convert the given frame from (1), passing it through instruction
+   translation
+3) For the objects captured in (2), we create tracking objects that are
+   (a) tracked on an output graph, which is an internal specialization
+   of a torch.fx.Tracer (and the topic of a later writeup), and (b)
+   guards, the topic of this document.
+4) We process the guard objects created in (3), turning them into a
+   generated python function, check_fn, associated with a piece of code.
+5) The check_fn is evaluated whenever we encounter this code a
+   subsequent time - if a check_fn passes and evaluates to True, we know
+   the code in the cache and the code encountered here is the same, and
+   can be safely used. If it fails and evaluates to False, we know the
+   code in the cache is not valid, and can be thrown out in favor of a
+   new entry, through recompilation or a graph break.
+
+Python Frame Evaluation and PEP 523
+-----------------------------------
+
+The functionality of TorchDynamo is based on
+`PEP 523 <https://peps.python.org/pep-0523/>`__.
+
+TorchDynamo installs a frame evaluation function on Python, via
+`_PyInterpreterState_SetEvalFrameFunc`. The overview of function
+selection, thread management, and cleanup is out of scope for this
+writeup, but the important part is that TorchDynamo has a hook where
+Python can hand control back to us during evaluation.
+
+The function we have installed is ``convert_frame`` or
+``convert_frame_assert`` in the ``nopython=True`` case, but glossing
+over that nuance for now, let’s take a look at ``convert_frame_assert``,
+as ``convert_frame`` proxies to it anyway.
+
+We can find it on `line 20 of convert_frame.py
+<https://github.com/pytorch/torchdynamo/blob/main/torchdynamo/convert_frame.py#L200>`__,
+with a signature as follows:
+
+.. code-block:: python
+
+   def  convert_frame_assert(compiler_fn: Callable, one_graph=True):
+
+This function wraps the entry point of where Python invokes TorchDynamo
+with a frame, glossing over the nuances of ``wrap_convert_context`` for
+now:
+
+.. code-block:: python
+
+   def  _convert_frame_assert(frame: types.FrameType, cache_size: int):
+
+Here is what this function does:
+
+1) Checks if it has seen this ``code``\ (see: f_code `here
+   <https://docs.python.org/3/library/inspect.html>`__) before and exits
+   early if it did.
+2) Checks if the code is an unsupported case.
+3) Checks if the ``cache_size`` (second arg above) crosses the limit
+   defined in the config, ``cache_size_limit``. If it has, the function
+   drops the frame and logs warnings. This helps to avoid constant
+   recompilation of a frame as it generally means that the frame is hot
+   in an unexpected way and caching it produces needless overhead,
+   as it is likely to get evicted the next time it is encountered.
+4) Passes the frame, alongside a function that creates an
+   ``InstructionTranslator`` through bytecode
+   transformation, via ``transform_code_object``. A few crucial things
+   happen under the hood here:
+
+   1) New code is produced through ``transform_code_object``.
+
+   2) An FX tracer named ``output`` is produced through
+      ``InstructionTranslator``.
+
+      This can be a bit confusing,
+      as ``InstructionTranslator`` is not an `fx` tracer, but its stored
+      in a variable named tracer, and its output*\ **is**\ *an `fx`tracer.*
+
+   3) The function produces guards and stores them on ``output`` above.
+
+   4) The function produces ``output_instructions`` and stores them on
+      ``output`` above.
+
+   5) The function maps the newly produced transformed code to the initial code it
+      read off the frame. This mapping is worth remembering, we will
+      refer to it much later on below where we cover guard failures.
+
+5) Using the transformed code from 4.1 and the guards from 4.3
+   the function produces a `GuardedCode`.
+
+Now that we have learned about frame evoluation, let’s review
+``InstructionTranslator``, and see how it turns the frame we handed
+it over into TorchDynamo internal types.
+
+InstructionTranslator
+---------------------
+
+`InstructionTranslator` does a lot! We won’t cover the details of
+everything it does, but most importantly for this document, it produces
+a mapping of ``symbolic_locals`` which maintains a mapping from the
+frame’s f_locals to TorchDynamo internal Variable objects (more on these
+in a moment. ``symbolic_locals`` is filled via traversing the frame’s
+locals:
+
+.. code-block:: python
+
+   self.symbolic_locals = collections.OrderedDict(
+       (k, VariableBuilder(self, LocalSource(k))(f_locals[k]))
+       for k in vars
+       if k in f_locals
+   )
+
+We will get to how this works later, from a few other examples that lead
+us to understanding ``VariableTracker`` and ``VariableBuilder``. The
+important component here, for us, for now, is the invocation of a call
+into ``VariableBuilder``. ``VariableBuilder``\ ’s call implementation
+proxies into a function called ``_wrap``, which in turn both constructs
+instances of ``VariableTracker`` and calls ``make_guards`` on them. More
+on that later.
+
+This mapping, in turn, is critical as each Variable has associated
+guards, which are then passed to ``self.output``, the instance of
+``OutputGraph``, an fx tracer, mentioned in 4.2 of the section above. If
+you recall, this ``OutputGraph``, stored in a variable called ``output``
+is where our guards are stored before being passed on to become
+``GuardedCode``
+
+How does ``InstructionTranslator`` do this? At the heart of it, there is
+a loop that is pumped, which drives a function ``step``.
+
+``step`` is just that - a single processing step, taking exactly one
+instruction and doing *something* with it. Note: These are real
+instructions processed by TorchDynamo’s ``transform_code_object``, and
+it’s pretty cool.
+
+.. note:: This section purposly skips the details of
+   `dis.get_instructions <https://docs.python.org/3/library/dis.html>`__,
+   and how we set up the ``Instruction`` class.
+
+For the toy example above, here is a snippet of a what a few
+``Instruction``\'s may look like:
+
+.. code-block:: python
+
+   Instruction(opcode=124, opname='LOAD_FAST', arg=0, argval='b', offset=32, starts_line=8, is_jump_target=True, target=None)
+   Instruction(opcode=100, opname='LOAD_CONST', arg=3, argval=-1, offset=34, starts_line=None, is_jump_target=False, target=None)
+   Instruction(opcode=20, opname='BINARY_MULTIPLY', arg=None, argval=None, offset=36, starts_line=None, is_jump_target=False, target=None)
+
+This is the core functionality of this function. Take a look at the ``opname``,
+and then take a look at this little snippet from inside ``step``;
+
+.. code-block:: python
+
+   if not hasattr(self, inst.opname):
+       unimplemented(f"missing: {inst.opname}")
+   getattr(self, inst.opname)(inst)
+
+As we can see, we check if the current class, the
+``InstructionTranslator`` has a attribute set matching the operator name
+(ex: LOAD_CONST). If it does, we invoke it, passing the whole
+instruction object in. If it does not, we drop the frame as
+unimplemented.
+
+For the LOAD_CONST example, we can see that we do indeed support it,
+with a relatively straightforward definition:
+
+::
+
+   def  LOAD_CONST(self, inst):
+   self.push(ConstantVariable(value=inst.argval))
+
+Passing over, for now, on the other details of ``InstructionTranslator``
+we can see that this function creates a new instance of the class
+``ConstantVariable`` , with a value, in our example case, -1, and then
+pushes it onto the stack.
+
+There are dozens of such methods - see symbolic_convert.py for all of
+them. Generally, we implement as many matching methods to python
+bytecode instructions as possible.
+
+Across both the logic downstream of ``step`` and the logic from invoking
+``VariableBuilder`` - we now have a lot of ``VariableTracker``\ s and of
+course, we’ve spoken about creating guards quiet a bit. Let’s dig into
+what Variables are, and get a little closer to understanding guards.
+
+Variables
+---------
+
+A ``ConstantVariable`` is an instance of\ ``VariableTracker``.
+``VariableTracker`` represents a tracked python local or stack value.
+
+When it comes to representing an object inside TorchDynamo, a
+VariableTracker does exactly what it says - it tracks a given variable.
+Its an extremely flexible class, but there are a few points to keep in
+mind:
+
+-  It manages the ``guard`` relationship around the underlying object
+   through:
+
+   -  `make_guard`
+   -  `replace_guards`
+   -  `add_guard(s)`
+   -  `propagate` - ``propagate(*vars: List[List["VariableTracker"]])`` -
+      Perhaps the most important of all, in that it combines guards from
+      all the provided VariableTracker instances passed in. It visits
+      the guards and combines the guards from these onto itself.
+
+-  It acts as a proxy on behalf of the underlying object, implementing
+   methods for the rest of TorchDynamo to get information about the
+   tracked object:
+
+   -  `call_method`
+   -  `call_function`
+   -  `python_type`
+   -  `as_proxy`
+   -  `is/as_python_proxy`
+
+-  It stores the variable ``source`` of type ``Source``, from
+   torchdynamo/source.py. This source type is a relatively self
+   contained class to help us organize and bookeep where the original
+   source came from, and helps provide convenience methods for things
+   like getting the name, and importantly for us, producing guards.
+
+And this class (``VariableTracker``) is built around subclassing,
+somewhere between a full Abstract Base Class and fully fleshed out class
+- it leaves many methods raising NotImplementedError - with reliance on
+subclasses (see: torchdynamo/variables/ for all subclasses) to fulfill
+contracts and custom behaviors.
+
+Knowing what we know now, we can see an example of how an instruction
+from ``dis``, ``BUILD_TUPLE``
+
+   BUILD_TUPLE(count) Creates a tuple consuming count items from the
+   stack, and pushes the resulting tuple onto the stack.
+
+In our case, our signature will be a *little* different due to the way
+we create ``Instruction`` objects, but the gist of it will be the same.
+Instead of passing in ``count``, we pass in an object with a little
+extra bookkeeping, and of course, we deal with turning regular old
+python objects into TorchDynamo notions:
+
+::
+
+   def BUILD_TUPLE(self, inst):
+       items = self.popn(inst.argval)
+       options = VariableTracker.propagate(items)
+       self.push(TupleVariable(items, **options))
+
+What is happening here? 1) We read argval, which in this case, is
+analogous to ``counts`` in the pydoc for the equivalent instruction.
+
+2) We ``popn`` the items, in this case, the signature is
+   ``def  popn(self, n: int) -> List[TensorVariable]:`` this hints at an
+   underlying contract - we are returning ``TensorVariables``. If we
+   take a closer look at sybmolic_convert.py and
+   ``InstructionTranslatorBase``/``InstructionTranslator``\ we see that
+   the only thing pushed onto and popped from our stack are
+   ``VariableTracker``\ s.
+
+3) We call ``VariableTracker.propogate`` (remember it, from above?) This
+   takes the guards from every single item popped off the stack in 2,
+   and recursively traverses it and combines all the guards into
+   ``options``: ``py  return {      "guards": guards,  }``
+
+4) We then make a new instance of a ``VariableTracker``,
+   ``TupleVariable``\ out of the ``items`` and ``options``. This then
+   allows us to install all the appropriate guards from the ``items``
+   that make up the new ``TupleVariable``
+
+Note: You may wonder - where did the first guards come from? Propagation
+is good and all, but don’t we need something created before it can be
+propagated. Yes! Remember that ``VariableBuilder`` above? It calls
+``make_guards`` as it creates ``VariableTracker`` instances, from
+``f_locals``. This in turn calls into the ``source``, to have it create
+guards.
+
+After all this, bytecode translation is done and we are one step closer
+to producing ``GuardedCode``. We now understand how locals become
+``VariableTracker``\ s, how instructions are handled, and where guards
+are called on for creation. Before we can go into seeing how code and
+guards are combined into a GuardedCode object, we need to dig a little
+bit into those ``make_guard`` and ``source.make_guard`` calls above. We
+can then understand, really, what was going on when we made guards
+alongside, and on, ``VariableTracker`` instances.
+
+Making Guards
+-------------
+
+Guards are just python objects, of the class ``Guard``, however, theres
+a good amount of detail around this little class.
+
+Looking at the definition of the dataclass (and therefore, ctor
+signature), we see that it has a name, a source, and a create function.
+
+::
+
+   @dataclasses.dataclass
+   class Guard:
+       name: str
+       source: GuardSource
+       create_fn: Callable
+
+The name should be the name of the variable.
+
+The source here is an enum indicating what *kind* of source the guard
+belongs to [Note: not to be confused with ``Source`` and the other types
+in source.py, as stored on ``VariableTracker``, as discussed above]
+
+And create_fn is the heart of how we go from having this simple
+dataclass to actually producing valid python code to be invoked for
+knowing whether or not things have changed in between invocations, and
+whether we can safely read from the code cache or not (In case you
+forgot what all this was for!)
+
+The most common code paths for getting an instance of a guard are
+through ``make_guards`` on ``VariableTracker``.
+``make_guards``->``source.make_guard``->``return Guard(self.name(), self.guard_source(), fn)``
+
+Or, in a concrete example:
+
+.. code-block:: python
+
+   ...
+   elif istype(value, range):
+       guards = self.make_guards(GuardBuilder.EQUALS_MATCH)
+       return RangeVariable(value=value, guards=guards)
+
+Since ``source`` was set at the construction time of this
+``VariableTracker``, all that was needed here was to provide the fn,
+``GuardBuilder.EQUALS_MATCH`` to the ``create_fn`` field.
+
+This ``create_fn`` must be a method on ``GuardBuilder``. The reason for
+this becomes apparent in our next step. Once we have all the guards
+created for a frame, we move on to ``CheckFunctionManager`` and
+``compile_check_fn``.
+
+Remember that ``convert_frame`` function way above, in the first
+section? Before it can produce a ``GuardedCode``, it needs to run the
+``CheckFunctionManager``, with all the guards, to produce a ``check_fn``
+which will then, in turn get passed in alongside the code into
+``GuardedCode``. This is the same ``check_fn`` that we store in our
+cache entry, and the same one we run to know whether or not to retrieve
+the code stored alongside. For reference, here is that code:
+
+.. code-block:: cpp
+
+   static CacheEntry *create_cache_entry(CacheEntry *next,
+                                         PyObject *guarded_code) {
+     CacheEntry *e = (CacheEntry *)malloc(sizeof(CacheEntry));
+     DEBUG_NULL_CHECK(e);
+     e->check_fn = PyObject_GetAttrString(guarded_code, "check_fn");
+     NULL_CHECK(e->check_fn);
+     e->code = (PyCodeObject *)PyObject_GetAttrString(guarded_code, "code");
+     NULL_CHECK(e->code);
+     e->next = next;
+     return e;
+   }
+
+We now know how a ``check_fn`` function is used, and who makes it, and
+what it is composed of, but what we do not yet know is how. How does a
+list of ``Guard`` objects become a function we can run later on?
+
+First, we iterate these guards:
+
+.. code-block:: python
+
+   for guard in sorted(guards or [], key=Guard.sort_key):
+       if not config.guard_nn_modules and guard.is_nn_module():
+           continue
+       guard.create(local_builder, global_builder)
+
+Calling ``guard.create`` runs that ``create_fn`` we set on the ``Guard``
+class above (don’t confuse it with the ``check_fn`` we are working on
+producing, the names are similar, so it can get a little confusing). In
+our example above, our ``create_fn`` is ``GuardBuilder.EQUALS_MATCH``.
+So we are now invoking it, passing in the ``self``, the guard itself,
+in.
+
+The signature is: ``def EQUALS_MATCH(self, guard: Guard):``
+
+And internally to that function, we can use the ``name`` on the guard to
+get back our original object, querying it for data and type information,
+which in turn gets us to the most important bit: appending code.
+
+At its simplest, ``EQUALS_MATCH`` appends just one line of code:
+``self.code.append(f"{ref} == {val!r}")``. Where ``ref`` is the name of
+the variable, and val is the value. It might produce code like this:
+
+.. code-block::
+
+   y == 2
+
+Pretty simple, but if we append a few other kinds of ``GuardBuilder``
+functions on (For a more complex case), and then combine them all with
+``and`` in between each statement (as we do), we might get something
+like this:
+
+.. code-block::
+
+   ___guarded_code.valid and ___check_type_id(y, 94367738391392) and y == 2 and ___check_tensors(x)
+
+Now we’re talking! Let’s see what we have here: 1) A check for
+``.valid`` (we will come back to invalidation later on) 2) A type id
+check 3) A value check 4) A tensor check
+
+This becomes the heart of the code our ``check_fn``, which in turn, as
+you recall, is evaluated the **next** time we encounter this code. It
+will then check:
+
+1) Is this code still valid?
+2) If (1), Does ``y`` still have a type of ``94367738391392``?
+3) If (2), is ``y`` still 2?
+4) If (3), let’s check on if tensor ``x`` changed in some specific ways
+
+If all of these are still true, then we can use the code cached
+alongside this ``check_fn``! Joyous day! [Note: a deeper dive for how
+and where this happens if saved for a later writeup, but reading
+``static PyCodeObject *lookup(CacheEntry *e, PyObject *f_locals) {`` of
+``_eval_frame.c`` is a good place to start for the inquisitive reader
+who has made it thus far].
+
+If not, then, we can move on to recompiling the code anew, and storing
+that in the cache alongside this code, and a whole new ``check_fn``,
+again to be checked on yet another subsequent frame.
+
+There are lots of other such functions on ``GuardBuilder`` which get
+coalesced into, at times massive, strings which then get evaluated as
+python code and stored into ``check_fn``. Our example above is
+illustrative of a simple case, but I urge you to read the other
+functions on ``GuardBuilder``, or better yet, dump the ``code`` variable
+in ``compile_check_fn`` to really see what’s getting produced,
+especially on larger, real models!
+
+Summary
+-------
+
+In this, we have glossed over: - The role of ``.valid`` and invalidation
+around weak references (and potentially soon to be NN Module
+invalidations) - How the C++ side of guard functions
+(``___check_type_id``, ``___check_tensors``, etc) operate - What happens
+when guards fail? - What happens if we produce invalid guard code?
+
+Despite all that, I hope this has been a useful read. We covered how
+user provided code, wrapped in a TorchDynamo context goes on to get
+traced and tracked internally, organized into ``VariableTracker``\ s
+``Source``\ s and subsequently ``Guard``\ s, and how those ``Guards`` in
+turn guide cache entry selection and invalidation when handing Python
+code.
diff --git a/docs/source/dynamo/index.rst b/docs/source/dynamo/index.rst
new file mode 100644
index 0000000000000..d34f6a7d27552
--- /dev/null
+++ b/docs/source/dynamo/index.rst
@@ -0,0 +1,44 @@
+TorchDynamo Documentation
+=========================
+
+**TorchDynamo** is a Python-level JIT compiler designed to make unmodified
+PyTorch programs faster. TorchDynamo hooks into the frame evaluation API
+in CPython (`PEP 523 <https://peps.python.org/pep-0523/>`__) to
+dynamically modify Python bytecode right before it is executed. It
+rewrites Python bytecode in order to extract sequences of PyTorch
+operations into an `FX Graph <https://pytorch.org/docs/stable/fx.html>`__
+which is then just-in-time compiled with a customizable backend.
+It creates this FX Graph through bytecode analysis and is designed to
+mix Python execution with compiled backends to get the best of both
+worlds: usability and performance.
+
+TorchDynamo makes it easy to experiment with different compiler
+backends to make PyTorch code faster with a single line decorator
+``torch._dynamo.optimize()``
+
+.. image:: ../_static/img/dynamo/TorchDynamo.png
+
+For more information about `TorchInductor`, one of the backends
+supported by `TorchDynamo Graph <https://pytorch.org/docs/stable/fx.html>`__
+into `Triton <https://github.com/openai/triton>`__ for GPUs or
+`C++/OpenMP <https://www.openmp.org/>`__ for CPUs. We have a
+`training performance dashboard <https://github.com/pytorch/torchdynamo/issues/681#issuecomment-1233828468>`__
+that provides performance comparison for different training backends. You can read
+more in the `TorchInductor post on PyTorch
+dev-discuss <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__.
+
+.. seealso::
+
+   * `TorchDynamo deep-dive video <https://www.youtube.com/watch?v=egZB5Uxki0I>`__
+   * `dev-discuss topics <https://dev-discuss.pytorch.org/search?q=TorchDynamo%20order%3Alatest>`__
+
+.. toctree::
+   :hidden:
+
+   installation
+   get-started
+   guards-overview
+   custom-backends
+   deep-dive
+   troubleshooting
+   faq
diff --git a/docs/source/dynamo/installation.rst b/docs/source/dynamo/installation.rst
new file mode 100644
index 0000000000000..6d1b09f0415ac
--- /dev/null
+++ b/docs/source/dynamo/installation.rst
@@ -0,0 +1,83 @@
+Installing TorchDynamo
+======================
+
+This section describes how to install TorchDynamo.
+
+Requirements and Setup
+----------------------
+
+Python 3.8 is recommended. Python 3.7 through 3.10 are supported and
+tested. Make sure to have a development version of Python installed
+locally as well.
+
+TorchDynamo is included in the nightly binaries of PyTorch. You can
+find more information `here <https://pytorch.org/get-started/locally/>`__
+
+Install GPU/CUDA version requirements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To use GPU back ends (and in particular Triton), please make sure that
+the CUDA that you have installed locally matches the PyTorch version you
+are running.
+
+The following command installs GPU PyTorch+TorchDynamo along with GPU
+TorchDynamo dependencies (for CUDA 11.7):
+
+.. code-block:: python
+
+   pip3 install numpy --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
+
+CPU requirements
+~~~~~~~~~~~~~~~~
+
+There are no additional requirements for CPU TorchDynamo. CPU
+TorchDynamo is included in the nightly versions of PyTorch, which, for
+reference, can be installed with the following command:
+
+.. code-block:: shell
+
+   pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+
+
+Install from local source
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Build PyTorch from source:
+https://github.com/pytorch/pytorch#from-source, which has TorchDynamo
+included.
+
+To install GPU TorchDynamo dependencies, run ``make triton`` in the
+PyTorch repo root directory.
+
+Verify Installation
+~~~~~~~~~~~~~~~~~~~
+
+If you built PyTorch from source, then you can run the following
+commands (from the PyTorch repo root directory) that run minimal
+examples to check that TorchDynamo is installed correctly:
+
+.. code:: shell
+
+   cd tools/dynamo
+   python verify_dynamo.py
+
+If you do not have the PyTorch source locally, you can alternatively
+copy the script (``tools/dynamo/verify_dynamo.py``) from the PyTorch
+repo and run it locally.
+
+Docker installation
+-------------------
+
+We also provide all the required dependencies in the PyTorch nightly
+binaries which you can download with
+
+.. code-block::
+
+   docker pull ghcr.io/pytorch/pytorch-nightly
+
+And for ad hoc experiments just make sure that your container has access
+to all your GPUs
+
+.. code-block:: bash
+
+   docker run --gpus all -it ghcr.io/pytorch/pytorch-nightly:latest /bin/bash
diff --git a/docs/source/dynamo/troubleshooting.rst b/docs/source/dynamo/troubleshooting.rst
new file mode 100644
index 0000000000000..8542d02bfa9bc
--- /dev/null
+++ b/docs/source/dynamo/troubleshooting.rst
@@ -0,0 +1,665 @@
+TorchDynamo Troubleshooting
+===========================
+
+**Author**: `Michael Lazos <https://github.com/mlazos>`_
+
+TorchDynamo is still in active development, and many of the reasons for
+graph breaks and excessive recompilation will be fixed with upcoming
+support for `tracing dynamic tensor
+shapes <https://docs.google.com/document/d/1QJB-GOnbv-9PygGlOMXwiO9K6vVNm8sNg_olixJ9koc/edit?usp=sharing>`__,
+more careful choices for guards and better tuned heuristics.
+
+In the mean time, you may need to diagnose a particular issue and
+determine if it is easy to work around with a change to your model, or
+file an issue for support.
+
+Also, we are actively developing debug tools, profilers, and improving our
+errors/warnings. Please give us feedback if you have an issue with this
+infra, or an idea for an improvement. Below is a table of the available
+tools and their typical usage. For additional help see
+`Diagnosing Runtime Errors <#diagnosing-runtime-errors>`__.
+
+.. list-table:: Title
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - Tool
+     - Purpose
+     - Usage
+   * - Info logging
+     - View summarized steps of compilation
+     - ``torch._dynamo.config.log_level = logging.INFO``
+   * - Debug logging
+     - View detailed steps of compilation (print every instruction traced)
+     - ``torch._dynamo.config.log_level = logging.DEBUG`` and
+       ``torch._dynamo.config.verbose = True``
+   * - Minifier for any backend
+     - Find smallest subgraph which reproduces errors for any backend
+     - set environment variable ``TORCHDYNAMO_REPRO_AFTER="dynamo"``
+   * - Minifier for ``TorchInductor``
+     - If the error is known to occur after `AOTAutograd`` find
+       smallest subgraph wich reproduces errors during TorchInductor lowering
+     - set environment variable ``TORCHDYNAMO_REPRO_AFTER="aot"``
+   * - Accuracy minifier
+     - Finds the smallest subgraph which reproduces an accuracy issue
+       between an eager model model and optimized model
+     - ``TORCHDYNAMO_REPRO_AFTER=<"aot"/"dynamo"> TORCHDYNAMO_REPRO_LEVEL=4``
+   * - ``torch._dynamo.explain``
+     - Find graph breaks and display reasoning for them
+     - ``torch._dynamo.explain(fn, *inputs)``
+   * - Record/Replay
+     - Record and replay frames which to reproduce errors during graph capture
+     - ``torch._dynamo.config.replay_record_enabled = True``
+   * - TorchDynamo function name filtering
+     - Only compile functions with the given name to reduce noise when
+       debugging an issue
+     - set environment variable ``TORCHDYNAMO_DEBUG_FUNCTION=<name>``
+   * - TorchInductor Debug logging
+     - Print general TorchInductor debug info and generated Triton/C++ code
+     - ``torch._inductor.config.debug = True``
+   * - TorchInductor Tracing
+     - Show time taken in each TorchInductor stage + output code and graph
+       visualization
+     - set the environment variable TORCHINDUCTOR_TRACE=1 or
+       ``torch._inductor.config.trace.enabled = True``
+
+Diagnosing Runtime Errors
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Below is the TorchDynamo compiler stack.
+
+At a high level, the TorchDynamo stack consists of a graph capture from
+Python code (TorchDynamo) and a backend compiler. In this example the
+backend compiler consists of backward graph tracing (AOTAutograd) and
+graph lowering (TorchInductor)*. Errors can occur in any component of
+the stack and will provide full stack traces.
+
+You may use info logging
+(``torch._dynamo.config.log_level = logging.INFO``) and look for
+``Step #: ...`` outputs in order to determine in which component the
+error occurred in. Logs are made at the beginning and end of each step,
+so the step that an error should correspond to is the most recent logged
+step whose end has not yet been logged. The steps correspond to the
+following parts of the stack (according to the image above):
+
+==== ================
+Step Component
+==== ================
+1    TorchDynamo
+2    Compiler Backend
+3    TorchInductor
+==== ================
+
+The beginning and end of AOTAutograd is currently not logged, but we
+plan to add it soon.
+
+If info logging is insufficient, then there are also some backend
+options which can enable you to determine which component is causing the
+error if you’re unable to understand the error message that is
+generated. These are the following:
+
+-  ``"eager"``: only runs torchdynamo forward graph capture and then
+   runs the captured graph with PyTorch. This provides an indication as
+   to whether TorchDynamo is raising the error.
+
+-  ``"aot_eager"``: runs torchdynamo to capture a forward graph, and
+   then AOTAutograd to trace the backward graph without any additional
+   backend compiler steps. PyTorch eager will then be used to run the
+   forward and backward graphs. This is useful to narrow down the issue
+   to AOTAutograd.
+
+The general procedure to narrow down an issue is the following: 1. Run
+your program with the ``"eager"`` backend. If the error no longer
+occurs, the issue is in the backend compiler that is being used (if
+using TorchInductor, proceed to step 2, if not, see `this
+section <#minifying-backend-compiler-errors>`__). If the error still
+occurs with the ``"eager"`` backend, it is an `error while running
+torchdynamo <#torchdynamo-errors>`__.
+
+2. This step is only necessary if TorchInductor is used as the backend
+   compiler. Run the model with the ``"aot_eager"`` backend. If this
+   backend raises an error then the error is occurring during
+   AOTAutograd tracing. If the error no longer occurs with this backend,
+   then `the error is in
+   TorchInductor\* <#minifying-torchinductor-errors>`__.
+
+Each of these cases are analyzed in the following sections.
+
+\*Note on TorchInductor naming: The TorchInductor backend consists of
+both AOTAutograd tracing and the TorchInductor compiler itself. We will
+disambiguate by referring to TorchInductor as the backend, and
+TorchInductor lowering as the phase which lowers the graph traced by
+AOTAutograd.
+
+Torchdynamo Errors
+------------------
+
+If the error that is generated occurs with the ``"eager"`` backend, then
+torchdynamo is the most likely source of the error. Here is example code
+which will generate an error.
+
+.. code:: py
+
+   import torch
+
+   import torch._dynamo as dynamo
+
+
+   @dynamo.optimize("eager")
+   def test_assertion_error():
+       y = torch.ones(200, 200)
+       z = {y: 5}
+       return z
+
+
+   test_assertion_error()
+
+Which will generate the following error:
+
+::
+
+   torch._dynamo.convert_frame: [ERROR] WON'T CONVERT test_assertion_error /scratch/mlazos/torchdynamo/../test/errors.py line 26
+   due to:
+   Traceback (most recent call last):
+     File "/scratch/mlazos/torchdynamo/torchdynamo/symbolic_convert.py", line 837, in BUILD_MAP
+       assert isinstance(k, ConstantVariable) or (
+   AssertionError
+
+   from user code:
+      File "/scratch/mlazos/torchdynamo/../test/errors.py", line 34, in test_assertion_error
+       z = {y: 5}
+
+   Set torch._dynamo.config.verbose=True for more information
+   ==========
+
+As the message suggests you can set
+``torch._dynamo.config.verbose=True`` to get a full stack trace to both
+the error in torchdynamo and the user code. In addition to this flag,
+you can also set the ``log_level`` of torchdynamo through
+``torch._dynamo.config.log_level``. The available levels are the
+following: - ``logging.DEBUG``: Print every instruction that is
+encountered in addition to all below log levels - ``logging.INFO``:
+Print each function that is compiled (original and modified bytecode)
+and the graph that is captured in addition to all below log levels -
+``logging.WARNING`` (default): Print graph breaks in addition to all
+below log levels - ``logging.ERROR``: Print errors only
+
+If a model is sufficiently large, the logs can become overwhelming. If
+an error occurs deep within a model’s python code, it can be useful to
+execute only the frame in which the error occurs to enable easier
+debugging. There are two tools available to enable this: - Setting the
+environment variable TORCHDYNAMO_DEBUG_FUNCTION to the desired function
+name will only run torchdynamo on functions with that name. - There is a
+record/replay tool (set
+``torch._dynamo.config.replay_record_enabled = True``) which dumps an
+execution record when an error is encountered. This record can then be
+replayed to run only the frame where an error occurred.
+
+TorchInductor Errors
+--------------------
+
+If the error doesn’t occur with the ``"eager"`` backend, then the
+backend compiler is the source of the error (`example
+error <https://gist.github.com/mlazos/2f13681e3cc6c43b3911f336327032de%5D>`__).
+There are `different
+choices <https://github.com/pytorch/torchdynamo/blob/0b8aaf340dad4777a080ef24bf09623f1aa6f3dd/README.md#existing-backends>`__
+for backend compilers for torchdynamo, with TorchInductor or nvfuser
+fitting the needs of most users. This section focuses on TorchInductor
+as the motivating example, but some tools will be usable with other
+backend compilers.
+
+Below is the portion of the stack which we are focusing on:
+
+With TorchInductor as the chosen backend, AOTAutograd is used to
+generate the backward graph from the forward graph captured by
+torchdynamo. It’s important to note that errors can occur during this
+tracing and also while TorchInductor lowers the forward and backward
+graphs to GPU code or C++. A model can often consist of hundreds or
+thousands of FX nodes, so narrowing the exact nodes where this problem
+occurred can be very difficult. Fortunately, there are tools availabe to
+automatically minify these input graphs to the nodes which are causing
+the issue. The first step is to determine whether the error occurs
+during tracing of the backward graph with AOTAutograd or during
+TorchInductor lowering. As mentioned above in step 2, the
+``"aot_eager"`` backend can be used to run only AOTAutograd in isolation
+without lowering. If the error still occurs with this backend, this
+indicates that the error is occurring during AOTAutograd tracing.
+
+Here’s an example:
+
+.. code:: py
+
+   import torch
+
+   import torch._dynamo as dynamo
+
+   model = torch.nn.Sequential(*[torch.nn.Linear(200, 200) for _ in range(5)])
+   @dynamo.optimize("inductor")
+   def test_backend_error():
+
+       y = torch.ones(200, 200)
+       x = torch.ones(200, 200)
+       z = x + y
+       a = torch.ops.aten._foobar(z)  # dummy function which errors
+       return model(a)
+
+
+   test_backend_error()
+
+Running this should give you this error (with a longer stack trace below
+it)
+
+::
+
+   Traceback (most recent call last):
+     File "/scratch/mlazos/torchdynamo/torchinductor/graph.py", line 246, in call_function
+       return lowerings[target](*args, **kwargs)
+     File "/scratch/mlazos/torchdynamo/torchinductor/lowering.py", line 185, in wrapped
+       return decomp_fn(*args, **kwargs)
+     File "/scratch/mlazos/torchdynamo/torchinductor/lowering.py", line 810, in _foobar
+       assert False
+   AssertionError
+   ...
+
+`error with full stack
+trace <https://gist.github.com/mlazos/d6947854aa56d686800259a164c62100>`__
+
+If you then change ``@dynamo.optimize("inductor")`` to
+``@dynamo.optimize("aot_eager")``, it will run without error, because
+`the
+issue <https://github.com/pytorch/torchdynamo/blob/d09e50fbee388d466b5252a63045643166006f77/torchinductor/lowering.py#:~:text=%23%20This%20shouldn%27t%20be,assert%20False>`__
+is in the TorchInductor lowering process, not in AOTAutograd.
+
+Minifying TorchInductor Errors
+------------------------------
+
+From here, let’s run the minifier to get a minimal repro. Setting the
+environment variable TORCHDYNAMO_REPRO_AFTER=“aot” (or setting
+``torch._dynamo.config.repro_after="aot"`` directly) will generate a
+python program which reduces the graph produced by AOTAutograd to the
+smallest subgraph which reproduces the error. (See below for an example
+where we minify the graph produced by torchdynamo) Running the program
+with this environment variable should show nearly `identical
+output <https://gist.github.com/mlazos/0458ab828aa403c779fe73c012aa5982>`__,
+with an additional line indicating where ``minifier_launcher.py`` has
+been written to. The output directory is configurable by setting
+``torch._dynamo.config.base_dir`` to a valid directory name. The final
+step is to run the minifier and check that it runs successfully. A
+successful run looks like
+`this <https://gist.github.com/mlazos/e6ea41ccce68a7b1b8a7a09acb1b206a>`__.
+If the minifier runs successfully, it generates runnable python code
+which reproduces the exact error. For our example this is the following
+code:
+
+.. code:: py
+
+   import torch
+   from torch import tensor, device
+   import torch.fx as fx
+   from torch._dynamo.testing import rand_strided
+   from math import inf
+   from torch.fx.experimental.proxy_tensor import make_fx
+
+   # torch version: 1.13.0a0+gitfddfc44
+   # torch cuda version: 11.6
+   # torch git version: fddfc4488afb207971c54ad4bf58130fdc8a4dc5
+
+
+   # CUDA Info:
+   # nvcc: NVIDIA (R) Cuda compiler driver
+   # Copyright (c) 2005-2022 NVIDIA Corporation
+   # Built on Thu_Feb_10_18:23:41_PST_2022
+   # Cuda compilation tools, release 11.6, V11.6.112
+   # Build cuda_11.6.r11.6/compiler.30978841_0
+
+   # GPU Hardware Info:
+   # NVIDIA A100-SXM4-40GB : 8
+
+
+   from torch.nn import *
+   class Repro(torch.nn.Module):
+       def __init__(self):
+           super().__init__()
+
+
+
+       def forward(self, add):
+           _foobar = torch.ops.aten._foobar.default(add);  add = None
+           return (_foobar,)
+
+   args = [((200, 200), (200, 1), torch.float32, 'cpu')]
+   args = [rand_strided(shape, stride, dtype, device) for shape, stride, dtype, device in args]
+   mod = make_fx(Repro())(*args)
+   from torch._inductor.compile_fx import compile_fx_inner
+
+   compiled = compile_fx_inner(mod, args)
+   compiled(*args)
+
+The ``forward`` method of the ``Repro`` module contains the exact op
+which causes the issue. When filing an issue, please include any
+minified repros to aid in debugging.
+
+Minifying Backend Compiler Errors
+---------------------------------
+
+With backend compilers other than TorchInductor the process for finding
+the subgraph causing the error is nearly identical to the procedure in
+`errors in TorchInductor <#torchinductor-errors>`__ with one important
+caveat. Namely, that the minifier will now be run on the graph that is
+traced by TorchDynamo, not the output graph of AOTAutograd. Let’s walk
+through an example.
+
+.. code:: py
+
+   import torch
+
+   import torch._dynamo as dynamo
+
+   model = torch.nn.Sequential(*[torch.nn.Linear(200, 200) for _ in range(5)])
+   # toy compiler which fails if graph contains relu
+   def toy_compiler(gm: torch.fx.GraphModule, _):
+       for node in gm.graph.nodes:
+           if node.target == torch.relu:
+               assert False
+
+       return gm
+
+
+   @dynamo.optimize(toy_compiler)
+   def test_backend_error():
+       y = torch.ones(200, 200)
+       x = torch.ones(200, 200)
+       z = x + y
+       a = torch.relu(z)
+       return model(a)
+
+
+   test_backend_error()
+
+In order to run the code after TorchDynamo has traced the forward graph,
+the TORCHDYNAMO_REPRO_AFTER enviornment variable can be used. Running
+this program with TORCHDYNAMO_REPRO_AFTER=“dynamo” (or
+``torch._dynamo.config.repro_after="dynamo"``) should produce `this
+output <https://gist.github.com/mlazos/244e3d5b53667e44078e194762c0c92b>`__\ and
+the following code in ``{torch._dynamo.config.base_dir}/repro.py``.
+Note: the other option for TORCHDYNAMO_REPRO_AFTER are ``"aot"``, which
+will run the minifier after the backward graph has been generated.
+
+.. code:: py
+
+   import torch
+   import torch._dynamo as dynamo
+   from torch import tensor, device
+   import torch.fx as fx
+   from torch._dynamo.testing import rand_strided
+   from math import inf
+   from torch._dynamo.debug_utils import run_fwd_maybe_bwd
+
+
+   from torch.nn import *
+   class Repro(torch.nn.Module):
+       def __init__(self):
+           super().__init__()
+
+
+
+       def forward(self, add):
+           relu = torch.relu(add);  add = None
+           return (relu,)
+
+
+   mod = Repro().cuda()
+   opt_mod = dynamo.optimize("None")(mod)
+
+
+   args = [((200, 200), (200, 1), torch.float32, 'cpu', False)]
+   args = [rand_strided(sh, st, dt, dev).requires_grad_(rg) for (sh, st, dt, dev, rg) in args]
+
+
+   with torch.cuda.amp.autocast(enabled=False):
+       ref = run_fwd_maybe_bwd(mod, args)
+       res = run_fwd_maybe_bwd(opt_mod, args)
+
+The minifier successfully reduced the graph to the op that raises the
+error in ``toy_compiler``. The other difference from the procedure in
+`TorhInductor Errors <#torchinductor-errors>`__ is that the minifier is
+automatically run after encountering a backend compiler error. After a
+successful run, the minifier writes ``repro.py`` to
+``torch._dynamo.config.base_dir``.
+
+Performance Profiling
+~~~~~~~~~~~~~~~~~~~~~
+
+Accessing TorchDynamo Profiler
+------------------------------
+
+TorchDynamo has a builtin stats function for collecting and displaying
+the time spent in each compilation phase. These stats can be accessed by
+calling ``torch._dynamo.utils.compile_times()`` after executing
+Torch._Dynamo. By default, this returns a string representation of the
+compile times spent in each TorchDynamo function by name.
+
+TorchInductor Debug Tracing
+---------------------------
+
+TorchInductor has a builtin stats and trace function for displaying time
+spent in each compilation phase, output code, output graph visualization
+and IR dump. This is a debugging tool designed to make it easier to
+debug/understand the internals of TorchInductor.
+
+Setting the environment variable ``TORCHINDUCTOR_TRACE=1`` will cause a
+debug trace directory to be created and printed:
+
+::
+
+   $ env TORCHINDUCTOR_TRACE=1 python repro.py
+   torch._inductor.debug: [WARNING] model_forward_0 debug trace: /tmp/torchinductor_jansel/rh/crhwqgmbqtchqt3v3wdeeszjb352m4vbjbvdovaaeqpzi7tdjxqr.debug
+
+Here is an `example debug directory
+output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
+for the test program:
+
+::
+
+   torch.nn.Sequential(
+           torch.nn.Linear(10, 10),
+           torch.nn.LayerNorm(10),
+           torch.nn.ReLU(),
+       )
+
+Note each file in that debug trace can be enabled/disabled via
+``torch._inductor.config.trace.*``. The profile and the diagram are both
+disabled by default since they are expensive to generate.
+
+A single node in this new debug format looks like:
+
+::
+
+   buf1: SchedulerNode(ComputedBuffer)
+   buf1.writes =
+       {   MemoryDep(name='buf1', index=0, size=()),
+           MemoryDep(name='buf1', index=0, size=(s0,))}
+   buf1.unmet_dependencies = {MemoryDep(name='buf0', index=c0, size=(s0,))}
+   buf1.met_dependencies = {MemoryDep(name='primals_2', index=c0, size=(s0,))}
+   buf1.group.device = cuda:0
+   buf1.group.iteration = (1, s0)
+   buf1.sizes = ([], [s0])
+   class buf1_loop_body:
+       var_ranges = {z0: s0}
+       index0 = z0
+       index1 = 0
+       def body(self, ops):
+           get_index = self.get_index('index0')
+           load = ops.load('buf0', get_index, False)
+           get_index_1 = self.get_index('index0')
+           load_1 = ops.load('primals_2', get_index_1, False)
+           add = ops.add(load, load_1)
+           get_index_2 = self.get_index('index1')
+           reduction = ops.reduction('buf1', torch.float32, torch.float32, 'sum', get_index_2, add)
+           return reduction
+
+See the `example debug directory
+output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
+for more examples.
+
+Memory Profiling
+----------------
+
+TBD
+
+Graph Breaks
+------------
+
+Given a program like this:
+
+.. code-block:: python
+
+   @dynamo.optimize(...)
+   def some_fun(x):
+       ...
+   some_fun(x)
+   ...
+
+TorchDynamo will attempt to compile all of the torch/tensor operations
+within some_fun into a single FX graph, but it may fail to capture
+everything into one graph.
+
+Some graph break reasons are insurmountable to TorchDynamo, and can’t be
+easily fixed. - calling into a C extension other than torch is invisible
+to torchdynamo, and could do arbitrary things without TorchDynamo being
+able to introduce necessary `guards <./GuardsOverviewPt1.md>`__ to
+ensure that the compiled program would be safe to reuse. Graph breaks
+can hinder performance if the resulting fragments are small. To maximize
+performance, it’s important to have as few graph breaks as possible.
+
+Identifying the cause of a graph break
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To identify all graph breaks in a program and the associated reasons for
+the breaks, ``torch._dynamo.explain`` can be used. This tool runs
+TorchDynamo on the supplied function and aggregates the graph breaks
+that are encountered. Here is an example usage:
+
+.. code-block:: python
+
+   import torch
+   import torch._dynamo as dynamo
+   def toy_example(a, b):
+       x = a / (torch.abs(a) + 1)
+       print("woo")
+       if b.sum() < 0:
+           b = b * -1
+       return x * b
+   explanation, out_guards, graphs, ops_per_graph = dynamo.explain(toy_example, torch.randn(10), torch.randn(10))
+   print(explanation)
+   """
+   Dynamo produced 3 graphs, with 2 graph break and 6 ops.
+    Break reasons:
+   1. call_function BuiltinVariable(print) [ConstantVariable(str)] {}
+      File "t2.py", line 16, in toy_example
+       print("woo")
+
+   2. generic_jump
+      File "t2.py", line 17, in toy_example
+       if b.sum() < 0:
+    """
+
+Note on other outputs: - ``out_guards`` - a list of lists where each
+sublist contains the guards that must pass to ensure the traced graphs
+are valid - ``graphs`` - a list of graph modules which were successfully
+traced - ``ops_per_graph`` - a list of lists where each sublist contains
+the ops thatare run in the graph
+
+To throw an error on the first graph break encountered, ``nopython``
+mode can be used. This disables TorchDynamo’s python fallback, and only
+succeeds if the entire program is convertible to a single graph. Example
+usage:
+
+.. code-block:: python
+
+   @dynamo.optimize(<compiler>, nopython=True)
+   def toy_example(a, b):
+      ...
+
+Excessive Recompilation
+-----------------------
+
+When TorchDynamo compiles a function (or part of one), it makes certain
+assumptions about locals and globals in order to allow compiler
+optimizations, and expresses these assumptions as guards that check
+particular values at runtime. If any of these guards fail, Dynamo will
+recompile that function (or part) up to
+``torch._dynamo.config.cache_size_limit`` times. If your program is
+hitting the cache limit, you will first need to determine which guard is
+failing and what part of your program is triggering it.
+
+The `recompilation profiler <#recompilation-profiler>`__ automates the
+process of setting TorchDynamo’s cache limit to 1 and running your
+program under an observation-only ‘compiler’ that records the causes of
+any guard failures. You should be sure to run your program for at least
+as long (as many iterations) as you were running when you ran into
+trouble, and the profiler will accumulate statistics over this duration.
+
+If your program exhibits a bounded amount of dynamism, you may be able
+to tune the TorchDynamo cache limit to allow for each variation to be
+compiled and cached, but if the cache limit is too high you may find the
+cost of recompilation outweighs any optimization benefits.
+
+::
+
+   torch._dynamo.config.cache_size_limit = <your desired cache limit>
+
+Torchdynamo plans to support many common cases of dynamic tensor shapes,
+such as varying batch size or sequence length. It does not plan to
+support rank-dynamism. In the mean time, setting a specific cache limit
+can be used in coordination with bucketing techniques to achieve an
+acceptable number of recompilations for some dynamic models.
+
+.. code-block:: python
+
+   prof = dynamo.utils.CompilationProfiler()
+   @dynamo.optimize(prof)
+   def my_model():
+       ...
+   my_model()
+   print(prof.report())
+
+Accuracy Debugging
+~~~~~~~~~~~~~~~~~~
+
+Accuracy issues can also be minified if you set the environment variable
+``TORCHDYNAMO_REPRO_LEVEL=4``, it operates with a similar git bisect
+model and a full repro might be something like
+``TORCHDYNAMO_REPRO_AFTER="aot" TORCHDYNAMO_REPRO_LEVEL=4`` the reason
+we need this is downstream compilers will codegen code whether it’s
+Triton code or the C++ backend, the numerics from those downstream
+compilers can be different in subtle ways yet have dramatic impact on
+your training stability. So the accuracy debugger is very useful for us
+to detect bugs in our codegen or with a backend compiler.
+
+File an Issue
+~~~~~~~~~~~~~
+
+You should feel encouraged to `file a github
+issue <https://github.com/pytorch/torchdynamo/issues>`__ and expect a
+timely response.
+
+Before filing an issue, read over the `README <../README.md>`__,
+`TROUBLESHOOTING <./TROUBLESHOOTING.md>`__, and search for similar
+issues.
+
+When filing an issue, please include - your
+OS/python/pytorch/CUDA/triton info by running:
+
+.. code-block:: sh
+
+   python tools/verify_install.py
+
+-  A minimal repro script if possible, which can be generated by running
+   Minifier
+-  A description of the error
+-  the expected behavior
+-  A log (set ``torch._dynamo.config.log_file`` to a valid file name to
+   dump the logs to a file and
+   ``torch._dynamo.config.log_level = logging.DEBUG`` and
+   ``torch._dynamo.config.verbose = True``)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e4b6a124d6bdc..e43160f668fc7 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -42,6 +42,13 @@ Features described in this documentation are classified by release status:
 
    notes/*
 
+.. toctree::
+   :glob:
+   :maxdepth: 1
+   :caption: torch.compile
+
+   dynamo/*
+
 .. toctree::
    :maxdepth: 1
    :caption: Language Bindings

From 4b3f3f88f14de6cdc8efd275f570ba63c31068b2 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Tue, 29 Nov 2022 02:35:37 +0000
Subject: [PATCH 1360/1922] Enable DDPOptimizer by default in dynamo (#88523)

Performance benchmarks on 6 popular models from 1-64 GPUs compiled with
torchinductor show performance gains or parity with eager, and showed
regressions without DDPOptimizer.  *Note: resnet50 with small batch size shows a regression with optimizer, in part due to failing to compile one subgraph due to input mutation, which will be fixed.
(hf_Bert, hf_T5_large, hf_T5, hf_GPT2_large, timm_vision_transformer, resnet50)

Correctness checks are implemented in CI (test_dynamo_distributed.py),
via single-gpu benchmark scripts iterating over many models
(benchmarks/dynamo/torchbench.py/timm_models.py/huggingface.py),
and via (multi-gpu benchmark scripts in torchbench)[https://github.com/pytorch/benchmark/tree/main/userbenchmark/ddp_experiments].

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88523
Approved by: https://github.com/davidberard98
---
 benchmarks/dynamo/distributed.py | 6 +++---
 torch/_dynamo/config.py          | 7 +++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/benchmarks/dynamo/distributed.py b/benchmarks/dynamo/distributed.py
index 194b3906de03f..b490c48ade90e 100644
--- a/benchmarks/dynamo/distributed.py
+++ b/benchmarks/dynamo/distributed.py
@@ -81,8 +81,8 @@ def move_tensor(maybe_tensor):
         if args.verbose:
             dynamo.config.verbose = True
             dynamo.config.log_level = logging.DEBUG
-        if args.dynamo_optimize_ddp:
-            dynamo.config.optimize_ddp = True
+        if args.dynamo_no_optimize_ddp:
+            dynamo.config.optimize_ddp = False
         if args.dynamo == "inductor" and args.fsdp:
             torch._inductor.config.triton.cudagraphs = False
             log.warn("disabling inductor cudagraphs for compatibility with FSDP")
@@ -129,7 +129,7 @@ def print_compile(gm, ex):
     parser.add_argument("--trace_file", default="profile.json", help="Run the profiler")
     parser.add_argument("--repeat", default=10, help="Repeats for timing run")
     parser.add_argument(
-        "--dynamo_optimize_ddp",
+        "--dynamo_no_optimize_ddp",
         action="store_true",
         help="Enable dynamo's ddp optimizer",
     )
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 26efff205389a..258df9989f89c 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -138,8 +138,11 @@
 enforce_cond_guards_match = True
 
 # Automatically split model graph into pieces to match DDP bucket sizes
-# to allow DDP comm/compute overlap
-optimize_ddp = False
+# to allow DDP comm/compute overlap.  Disable to allow DDP models to
+# run without graph-breaks, but also without comm/compute overlap.
+# set torch._dynamo.config.log_level to INFO or DEBUG for more info
+# about optimize_ddp behavior.
+optimize_ddp = True
 
 # If True, raises exception if TorchDynamo is called with a context manager
 raise_on_ctx_manager_usage = True

From e9eb293b9fbec7086a7a68d0e0b4fefdac99b91a Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Tue, 29 Nov 2022 02:36:15 +0000
Subject: [PATCH 1361/1922] Dynamo asserts FSDP wrapped modules use_orig_param
 (#89523)

- This is a strict requirement given the way dynamo+FSDP is implemented,
  but isn't convenient to assert.
- By plumbing use_orig_param field on all wrapped modules, we can
  do this assertion inside dynamo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89523
Approved by: https://github.com/awgu
---
 test/distributed/test_dynamo_distributed.py           | 6 ++++++
 torch/_dynamo/variables/builder.py                    | 7 +++++++
 torch/distributed/fsdp/fully_sharded_data_parallel.py | 6 ++++++
 3 files changed, 19 insertions(+)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index d5b9f070b403e..94682526d6e8f 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -557,6 +557,12 @@ def opt_fn(inputs):
             for p_id in b.param_ids:
                 self.assertFalse(p_id in parameter_ids_to_ignore)
 
+    def test_fsdp_orig_params_assert(self):
+        # Test with basic FSDP wrapping (outer wrap around whole model)
+        m, inputs, correct_outputs = get_model(f"cuda:{self.rank}")
+        fsdp_m = FSDP(m, use_orig_params=False)
+        fsdp_m = torch._dynamo.optimize()(fsdp_m)
+        self.assertRaisesRegex(AssertionError, "Dynamo only supports FSDP with use_orig_params=True", fsdp_m, inputs)
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index ed88b515ff579..43c2c91c4a553 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -310,6 +310,13 @@ def index_source(key):
             elif getattr(value, "_is_fsdp_managed_module", False) or issubclass(
                 value.__class__, torch.nn.parallel.distributed.DistributedDataParallel
             ):
+                if getattr(value, "_is_fsdp_managed_module", False):
+                    # Note: we can't do this assert inside FSDP constructor,
+                    # since we don't know yet whether dynamo will be used
+                    assert getattr(
+                        value, "_fsdp_use_orig_params", False
+                    ), "Dynamo only supports FSDP with use_orig_params=True"
+
                 # See note [Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
                 # in fully_sharded_data_parallel.py for more information
                 return UnspecializedNNModuleVariable(
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index c9605ee882868..78b10dbd07498 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -358,6 +358,12 @@ def __init__(
                 """
                 submodule._is_fsdp_managed_module = True
 
+                # Dynamo only supports FSDP with use_orig_params=True.
+                # This is hacky, but I could not think of another way to add an assertion to dynamo
+                # for this, since Dynamo skips all the FSDP code frames and thus can't inspect the
+                # FSDP module directly
+                submodule._fsdp_use_orig_params = use_orig_params
+
         if auto_wrap_policy is not None:
             auto_wrap_kwargs = {
                 "module": module,

From c9b1ba0144ffb5d271e216bee78edcdb8288f6f5 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Tue, 29 Nov 2022 02:34:55 +0000
Subject: [PATCH 1362/1922] Test FSDP with submodule non-reentrant
 checkpointing (#89781)

With combining FSDP with reentrant checkpointing, the post backward
hook might run twice, and then hit [this
error](https://github.com/pytorch/pytorch/blob/e20ec44544c17d6d3d411f88b870e05043bda731/torch/distributed/fsdp/_runtime_utils.py#L487).
This is because reentrant backward uses nested autograd GraphTasks.
The inner GraphTask is not aware of the outer one and therefore
will flush pending `AccumulateGrad` invocations on exit, which in
turn triggers the post backward hooks registered by FSDP. Later,
the outer GraphTask will trigger that again, leading to the above
error.

PR #89791 relaxes the FSDP training state check, but we still run
into grad value check failures occasionally. Therefore, this PR only
lands the test for non-reentrant test, and we can enable the
reentrant test when the accuracy issues are addressed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89781
Approved by: https://github.com/rohan-varma
---
 test/distributed/fsdp/test_fsdp_checkpoint.py | 85 +++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py
index f0e8188641459..ae00cfe96af4f 100644
--- a/test/distributed/fsdp/test_fsdp_checkpoint.py
+++ b/test/distributed/fsdp/test_fsdp_checkpoint.py
@@ -16,6 +16,8 @@
     CPUOffload,
     FullyShardedDataParallel as FSDP,
 )
+from torch.distributed.fsdp import ShardingStrategy
+
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import _maybe_wrap_fsdp, FSDPTest
 from torch.testing._internal.common_utils import (
@@ -279,5 +281,88 @@ def test_basic_checkpoint_end_to_end(
 
 instantiate_parametrized_tests(TestFSDPCheckpoint)
 
+
+class CheckpointModule(nn.Module):
+    def __init__(self, checkpoint: bool = False, use_reentrant: bool = True):
+        super().__init__()
+        self.seq = nn.Sequential(*[nn.Linear(100, 100) for _ in range(4)])
+        self.checkpoint = checkpoint
+        self.use_reentrant = use_reentrant
+
+    def forward(self, x):
+        return (
+            checkpoint(self.seq, x, use_reentrant=self.use_reentrant)
+            if self.checkpoint
+            else self.seq(x)
+        )
+
+
+class ModelWithCheckpointSubmodule(nn.Module):
+    def __init__(self, checkpoint: bool = False, use_reentrant: bool = True):
+        super().__init__()
+        self.l1 = nn.Linear(100, 100)
+        self.s1 = CheckpointModule(checkpoint, use_reentrant)
+        self.s2 = CheckpointModule(checkpoint, use_reentrant)
+        self.relu = nn.ReLU()
+        self.l2 = nn.Linear(100, 100)
+
+    def forward(self, x):
+        return self.l2(self.relu(self.s2(self.s1(self.l1(x)))))
+
+
+class TestModel(nn.Module):
+    def __init__(self, checkpoint: bool = False, use_reentrant: bool = True):
+        super().__init__()
+        self.l1 = nn.Linear(100, 100)
+        self.relu = nn.ReLU()
+        self.checkpoint1 = ModelWithCheckpointSubmodule(checkpoint, use_reentrant)
+        self.checkpoint2 = ModelWithCheckpointSubmodule(checkpoint, use_reentrant)
+        self.l2 = nn.Linear(100, 100)
+
+    def forward(self, x):
+        return self.l2(self.relu(self.checkpoint2(self.checkpoint1(self.l1(x)))))
+
+
+class TestFSDPCheckpointSubmodule(FSDPTest):
+
+    # TODO: grad value checks occasionally fails when use_reentrant = True
+    @skip_if_lt_x_gpu(2)
+    @parametrize("use_reentrant", [False])
+    def test_checkpoint_submodule(self, use_reentrant: bool):
+        model = TestModel(use_reentrant=use_reentrant).cuda()
+        model_ac = deepcopy(model)
+
+        for _, m in model_ac.named_modules():
+            if isinstance(m, CheckpointModule):
+                m.checkpoint = True
+
+        self.assertTrue(model_ac.checkpoint1.s1.checkpoint)
+        self.assertTrue(model_ac.checkpoint2.s2.checkpoint)
+
+        fsdp_kwargs = {
+            "device_id": torch.cuda.current_device(),
+            "sharding_strategy": ShardingStrategy.NO_SHARD,
+        }
+
+        # Wrap no checkpointing model submodules with FSDP
+        model.m1 = FSDP(module=model.checkpoint1, **fsdp_kwargs)
+        model.m2 = FSDP(module=model.checkpoint2, **fsdp_kwargs)
+
+        # Wrap checkpointing model submodules with FSDP
+        model_ac.m1 = FSDP(module=model_ac.checkpoint1, **fsdp_kwargs)
+        model_ac.m2 = FSDP(module=model_ac.checkpoint2, **fsdp_kwargs)
+
+        x = torch.randn(2, 100, device="cuda")
+
+        model(x).sum().backward()
+        model_ac(x).sum().backward()
+
+        for p1, p2 in zip(model.parameters(), model_ac.parameters()):
+            self.assertTrue(p1.grad.allclose(p2.grad))
+
+
+instantiate_parametrized_tests(TestFSDPCheckpointSubmodule)
+
+
 if __name__ == "__main__":
     run_tests()

From 53f65565b5abf402e9d88f71d1e95553e12a5bd8 Mon Sep 17 00:00:00 2001
From: Yanli Zhao <yanlizhao@fb.com>
Date: Tue, 29 Nov 2022 06:42:57 +0000
Subject: [PATCH 1363/1922] add memory_tracker tool to help profiling memory
 usages (#88825)

Adding a memory_tracker API to show operator level memory traces for allocated_memory, active_memory and reserved memory stats, it gave the summary about top 20 operators that generate memories as well.

The implementation mainly uses torchDispatchMode and module hooks to get traces and add markers.

Will add following up PRs:
1. allow tracing more than 1 iteration
2. dump json data for visualization
3. add unit test for DDP training
4. add unit test for FSDP training
5. add unit test for activation checkpointing + DDP/FSDP training
6. add traces for activation memories and top operators that generate activation memories
7. print summaries for more breakdowns like model size, optimizer states, etc
8. add traces for temporary memories or memories consumed by cuda streams or nccl library if possible
9. connect the tool with OOM memory debugging
10. add dynamic programming (dp) algorithm to find best activation checkpointing locations based on the operator level activation memory traces
11. add same traces & dp algorithm for module level memory stats, as FSDP wrapping depends on module level memories, for some model users/not model authors, if they have to apply activation checkpointing on module level, they need module level memory traces as well

======================================================

Current test result for the memory_tracker_example.py on notebook:

Top 20 ops that generates memory are:
bn1.forward.cudnn_batch_norm.default_0: 98.0009765625MB
maxpool.forward.max_pool2d_with_indices.default_0: 74.5MB
layer1.0.conv1.backward.max_pool2d_with_indices_backward.default_0: 49.0MB
layer1.0.bn1.forward.cudnn_batch_norm.default_1: 24.5009765625MB
layer1.0.bn2.forward.cudnn_batch_norm.default_2: 24.5009765625MB
layer1.1.bn1.forward.cudnn_batch_norm.default_3: 24.5009765625MB
layer1.1.bn2.forward.cudnn_batch_norm.default_4: 24.5009765625MB
layer1.2.bn1.forward.cudnn_batch_norm.default_5: 24.5009765625MB
layer1.2.bn2.forward.cudnn_batch_norm.default_6: 24.5009765625MB
layer1.0.conv1.forward.convolution.default_1: 24.5MB
layer1.0.conv2.forward.convolution.default_2: 24.5MB
layer1.1.conv1.forward.convolution.default_3: 24.5MB
layer1.1.conv2.forward.convolution.default_4: 24.5MB
layer1.2.conv1.forward.convolution.default_5: 24.5MB
layer1.2.conv2.forward.convolution.default_6: 24.5MB
maxpool.backward.threshold_backward.default_32: 23.5MB
layer2.0.downsample.backward.convolution_backward.default_26: 12.2802734375MB
layer2.0.bn1.forward.cudnn_batch_norm.default_7: 12.2509765625MB
layer2.0.bn2.forward.cudnn_batch_norm.default_8: 12.2509765625MB
layer2.0.downsample.1.forward.cudnn_batch_norm.default_9: 12.2509765625MB

<img width="1079" alt="Screen Shot 2022-11-10 at 10 03 06 AM" src="https://user-images.githubusercontent.com/48731194/201172577-ddfb769c-fb0f-4962-80df-92456b77903e.png">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88825
Approved by: https://github.com/awgu
---
 .../distributed/_tools/test_memory_tracker.py |  67 +++++
 torch/distributed/_tools/__init__.py          |   1 +
 torch/distributed/_tools/memory_tracker.py    | 261 ++++++++++++++++++
 .../examples/memory_tracker_example.py        |  32 +++
 4 files changed, 361 insertions(+)
 create mode 100644 test/distributed/_tools/test_memory_tracker.py
 create mode 100644 torch/distributed/_tools/__init__.py
 create mode 100644 torch/distributed/_tools/memory_tracker.py
 create mode 100644 torch/distributed/examples/memory_tracker_example.py

diff --git a/test/distributed/_tools/test_memory_tracker.py b/test/distributed/_tools/test_memory_tracker.py
new file mode 100644
index 0000000000000..2e19ef6bf7294
--- /dev/null
+++ b/test/distributed/_tools/test_memory_tracker.py
@@ -0,0 +1,67 @@
+# Owner(s): ["oncall: distributed"]
+
+from torch.testing._internal.common_cuda import TEST_CUDA
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+)
+
+import torch
+import torch.nn as nn
+
+from torch.distributed._tools import MemoryTracker
+
+import unittest
+
+
+class TestMemoryTracker(TestCase):
+    @unittest.skipIf(not TEST_CUDA, "no cuda")
+    def test_local_model(self):
+        """
+        Minimal test case to check the memory tracker can collect the expected
+        memory stats at operator level, as well as can print the summary result
+        without crash.
+        """
+        # Create a model with a hierarchy of modules
+        torch.manual_seed(0)
+        model = nn.Sequential(
+            nn.Sequential(
+                nn.Conv2d(3, 64, kernel_size=(3, 3), padding=(1, 1), bias=False),
+                nn.BatchNorm2d(64),
+                nn.ReLU(inplace=False),
+                nn.AdaptiveAvgPool2d(output_size=(1, 1)),
+            ),
+            nn.Flatten(start_dim=1),
+            nn.Sequential(nn.Linear(64, 2), nn.ReLU(inplace=True)),
+        ).cuda()
+
+        # Run one iteration of forward and backward pass
+        tracker = MemoryTracker()
+        tracker.start_monitor(model)
+
+        x = torch.randn(size=(2, 3, 224, 224), device=torch.device("cuda"))
+        # torch.LongTensor expects cpu device type, not cuda device type in
+        # constructor, so calling .cuda() outside constructor here.
+        target = torch.LongTensor([0, 1]).cuda()
+        criterion = nn.CrossEntropyLoss()
+        criterion(model(x), target).backward()
+
+        self.assertTrue(len(tracker._hooks) > 0)
+
+        tracker.stop()
+
+        self.assertTrue(len(tracker._hooks) == 0)
+
+        tracker.summary()
+
+        self.assertTrue(tracker._op_index > 0)
+        self.assertTrue(len(tracker._operator_names) > 0)
+        self.assertEqual(len(tracker.memories_allocated), tracker._op_index)
+        self.assertEqual(len(tracker.memories_active), tracker._op_index)
+        self.assertEqual(len(tracker.memories_reserved), tracker._op_index)
+        self.assertTrue(len(tracker._markers) == 2)
+        self.assertTrue(tracker._cur_module_name != "")
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/_tools/__init__.py b/torch/distributed/_tools/__init__.py
new file mode 100644
index 0000000000000..eda274b5724f0
--- /dev/null
+++ b/torch/distributed/_tools/__init__.py
@@ -0,0 +1 @@
+from .memory_tracker import MemoryTracker
diff --git a/torch/distributed/_tools/memory_tracker.py b/torch/distributed/_tools/memory_tracker.py
new file mode 100644
index 0000000000000..f401304d67f6d
--- /dev/null
+++ b/torch/distributed/_tools/memory_tracker.py
@@ -0,0 +1,261 @@
+from collections import defaultdict
+
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    no_type_check,
+    Sequence,
+)
+
+import torch
+import torch.nn as nn
+from torch.utils.hooks import RemovableHandle
+from torch.utils._python_dispatch import TorchDispatchMode
+
+BYTES_PER_MB = 1024 * 1024.0
+
+
+class MemoryProfileDispatchMode(TorchDispatchMode):
+    """
+    Run in ``TorchDispatchMode`` to get memory stats at operator level.
+    """
+
+    def __init__(self, memory_tracker) -> None:
+        self.memory_tracker = memory_tracker
+
+    def __torch_dispatch__(self, func, types, args=..., kwargs=None):
+        rs = func(*args, **kwargs)
+        if func == torch.ops.aten.detach.default:
+            return rs
+        func_name: str = (
+            self.memory_tracker._cur_module_name
+            + "."
+            + func.__name__
+            + "_"
+            + str(self.memory_tracker._operator_names[func.__name__])
+        )
+        self.memory_tracker._operator_names[func.__name__] = (
+            self.memory_tracker._operator_names[func.__name__] + 1
+        )
+        self.memory_tracker._record_memory_stats(func_name)
+
+        return rs
+
+
+class MemoryTracker:
+    """
+    Collect and plot the memory stats including ``memories_allocated``, ``memories_active``
+    and ``memories_reserved`` at operator level.
+    It also prints a summary for the top 20 operators that generate the most memories.
+
+    Example usage:
+
+        >>> net.cuda()
+        >>> input = input.cuda()
+
+        >>> mem_tracker = MemoryTracker()
+        >>> mem_tracker.start_monitor(net)
+
+        >>> net.zero_grad(True)
+        >>> loss = net(input)
+        >>> if isinstance(loss, dict):
+        >>>    loss = loss['out']
+        >>> loss.sum().backward()
+        >>> net.zero_grad(set_to_none=True)
+
+        >>> mem_tracker.stop()
+        >>> mem_tracker.summary()
+        >>> mem_tracker.show_traces()
+    """
+
+    def __init__(self) -> None:
+        torch._C._log_api_usage_once("torch.distributed.memory_tracker")
+        self._hooks: List[RemovableHandle] = []
+        self._operator_names: Dict[str, int] = defaultdict(int)
+        self.memories_allocated: Dict[int, Dict[str, float]] = defaultdict(
+            lambda: defaultdict(float)
+        )
+        self.memories_active: Dict[int, Dict[str, float]] = defaultdict(
+            lambda: defaultdict(float)
+        )
+        self.memories_reserved: Dict[int, Dict[str, float]] = defaultdict(
+            lambda: defaultdict(float)
+        )
+        self._markers: Dict[str, int] = defaultdict(int)
+        self._cur_module_name: str = ""
+        self._op_index: int = 0
+
+    @no_type_check
+    def start_monitor(self, root_module: nn.Module) -> None:
+        """
+        Register module hooks and entering ``MemoryProfileDispatchMode``, so that
+        operator level memory stats can be tracked during module runtime.
+        """
+        self._clear_state()
+        root_module.__setattr__("_memory_tracker_is_root", True)
+        for name, m in root_module.named_modules():
+            if m is not root_module:
+                m.__setattr__("_memory_tracker_is_root", False)
+            # fused_proxy_group does not support hooks
+            if ".fused_proxy_grouped_embedding_bag" in name:
+                continue
+            # hook ordering with other hooks added by users is not managed, so
+            # the memory stats tracked here may not completely accurate.
+            h1 = m.register_forward_pre_hook(self._create_pre_forward_hook(name))
+            h2 = m.register_forward_hook(self._create_post_forward_hook(name))
+            h3 = m.register_backward_hook(self._create_backward_hook(name))
+            self._hooks.extend([h1, h2, h3])
+        torch.cuda.empty_cache()
+        assert getattr(self, "profile_mode", None) is None
+        self.profile_mode = MemoryProfileDispatchMode(self)
+        self.profile_mode.__enter__()
+
+    @no_type_check
+    def stop(self) -> None:
+        """
+        Remove module hooks and exit ``MemoryProfileDispatchMode`` to stop
+        tracking memory stats at operator level.
+        """
+        for h in self._hooks:
+            h.remove()
+        self._hooks.clear()
+        assert getattr(self, "profile_mode", None) is not None
+        self.profile_mode.__exit__(None, None, None)
+        self.profile_mode = None
+
+    @no_type_check
+    def summary(self, top: int = 20) -> None:
+        """
+        Print out the top operators that generate the most memories. The number
+        of the top operators can be configured.
+        """
+        op_diff: Dict[str, float] = defaultdict(float)
+        op_name, previous_allocated_memory = self.memories_allocated[0]
+        for i in range(1, self._op_index):
+            op_name, current_allocated_memory = self.memories_allocated[i]
+            op_diff[op_name] = current_allocated_memory - previous_allocated_memory
+            previous_allocated_memory = current_allocated_memory
+
+        print("------------------------------------------------")
+        print(f"Top {top} ops that generates memory are:")
+        for k, v in sorted(op_diff.items(), key=lambda item: item[1], reverse=True)[
+            :top
+        ]:
+            print(f"{k}: {v}MB")
+        print("------------------------------------------------")
+
+    @no_type_check
+    def show_traces(self) -> None:
+        """
+        Show the traces of ``memory_allocated``, ``memory_active`` and ``memory_reserved`` at
+        operator level and the marker 'fw_bw_boundary' at the boundary of forward pass
+        and backward pass.
+        """
+        import matplotlib.pyplot as plt
+
+        y_1 = [mb for (name, mb) in self.memories_allocated.values()]
+        y_2 = [mb for (name, mb) in self.memories_active.values()]
+        y_3 = [mb for (name, mb) in self.memories_reserved.values()]
+        min_val = min(y_1 + y_2 + y_3)
+        max_val = max(y_1 + y_2 + y_3)
+        x = list(i for i in range(len(y_1)))
+        fig = plt.figure(figsize=(16, 8))
+        plt.plot(x, list(y_1), label="memory_allocated")
+        plt.plot(x, list(y_2), label="memory_active")
+        plt.plot(x, list(y_3), label="memory_reserved")
+        plt.xlabel("# Operator Calls")
+        plt.ylabel("Memory (MB)")
+        for marker_name, marker in self._markers.items():
+            if marker_name == "fw_bw_boundary":
+                plt.plot(
+                    [marker, marker], [min_val, max_val], "r", lw=2, label=marker_name
+                )
+            else:
+                plt.plot(
+                    [marker, marker], [min_val, max_val], "k-", lw=2, label=marker_name
+                )
+        plt.legend()
+
+    def _create_pre_forward_hook(self, name: str) -> Callable:
+        """
+        The pre_foward_hook is to insert current module name with forward prefix for the operator
+        name, also it inserts the marker "fw_start" when the forward pass begins.
+        """
+
+        def _pre_forward_hook(module: nn.Module, inputs: Any) -> None:
+            self._cur_module_name = f"{name}.forward"
+            if (
+                hasattr(module, "_memory_tracker_is_root")
+                and module._memory_tracker_is_root
+            ):
+                self._add_marker("fw_start")
+
+        return _pre_forward_hook
+
+    def _create_post_forward_hook(self, name: str) -> Callable:
+        """
+        The post_forward_hook inserts the marker 'fw_bw_boundary' at the boundary
+        of forward pass and backward pass.
+        """
+
+        def _post_forward_hook(
+            module: nn.Module,
+            inputs: Sequence[torch.Tensor],
+            outputs: Sequence[torch.Tensor],
+        ) -> None:
+            if (
+                hasattr(module, "_memory_tracker_is_root")
+                and module._memory_tracker_is_root
+            ):
+                self._add_marker("fw_bw_boundary")
+
+        return _post_forward_hook
+
+    def _create_backward_hook(self, name: str) -> Callable:
+        """
+        The backward_hook inserts the current module name with backward prefix for the operator name.
+        """
+
+        def _backward_hook(
+            module: nn.Module, grad_input: torch.Tensor, grad_output: torch.Tensor
+        ) -> None:
+            self._cur_module_name = f"{name}.backward"
+
+        return _backward_hook
+
+    @no_type_check
+    def _record_memory_stats(self, fn_name: str) -> None:
+        """
+        Record current memory allocated, current memory active and current memory reserved.
+        The memory stats dict is indexed with ``self._op_index``.
+        """
+        memory_allocated: float = torch.cuda.memory_allocated() / BYTES_PER_MB
+        memory_reserved: float = torch.cuda.memory_reserved() / BYTES_PER_MB
+        memory_active: float = (
+            torch.cuda.memory_stats().get("active_bytes.all.current", 0) / BYTES_PER_MB
+        )
+        self.memories_allocated[self._op_index] = (fn_name, memory_allocated)
+        self.memories_reserved[self._op_index] = (fn_name, memory_reserved)
+        self.memories_active[self._op_index] = (fn_name, memory_active)
+        self._op_index += 1
+
+    def _add_marker(self, marker_name: str) -> None:
+        """
+        Set the marker's x-axis value.
+        """
+        marker_val = len(self.memories_allocated.values())
+        self._markers[marker_name] = marker_val
+
+    def _clear_state(self) -> None:
+        """
+        Clear states when start_monitor() is called.
+        """
+        self._operator_names.clear()
+        self.memories_allocated.clear()
+        self.memories_active.clear()
+        self.memories_reserved.clear()
+        self._markers.clear()
+        self._cur_module_name = ""
+        self._op_index = 0
diff --git a/torch/distributed/examples/memory_tracker_example.py b/torch/distributed/examples/memory_tracker_example.py
new file mode 100644
index 0000000000000..5440d10cd847a
--- /dev/null
+++ b/torch/distributed/examples/memory_tracker_example.py
@@ -0,0 +1,32 @@
+import torch
+import torchvision
+
+from torch.distributed._tools import MemoryTracker
+
+
+def run_one_model(net: torch.nn.Module, input: torch.Tensor):
+    net.cuda()
+    input = input.cuda()
+
+    # Create the memory Tracker
+    mem_tracker = MemoryTracker()
+    # start_monitor before the training iteration starts
+    mem_tracker.start_monitor(net)
+
+    # run one traing iteration
+    net.zero_grad(True)
+    loss = net(input)
+    if isinstance(loss, dict):
+        loss = loss["out"]
+    loss.sum().backward()
+    net.zero_grad(set_to_none=True)
+
+    # stop monitoring after the training iteration ends
+    mem_tracker.stop()
+    # print the memory stats summary
+    mem_tracker.summary()
+    # plot the memory traces at operator level
+    mem_tracker.show_traces()
+
+
+run_one_model(torchvision.models.resnet34(), torch.rand(32, 3, 224, 224, device="cuda"))

From 1b5138862b5e21f966e7f68b31198291a1aabf02 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Tue, 29 Nov 2022 07:19:02 +0000
Subject: [PATCH 1364/1922] [dynamo] Don't copy the graph during checkpointing
 (copy_graphstate) (#89232)

copy_graphstate is called a ton, this makes copy_graphstate a lot faster, helps with https://github.com/pytorch/torchdynamo/issues/1803

tag each graph node with a timestamp, when checkpointing store the timestamp, when restoring remove nodes older than the timestamp stored in the state. This essentially has the same behavior as the original impl, just doesn't copy the whole graph.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89232
Approved by: https://github.com/jansel
---
 torch/_dynamo/output_graph.py   | 36 ++++++++++++++++++++++-----------
 torch/_dynamo/variables/misc.py |  6 +++---
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index ff8b0998622e2..6a8db90bd8c8d 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -156,7 +156,6 @@ def __init__(
     ):
         super(OutputGraph, self).__init__()
 
-        # Mutable state checkpointed by copy_graphstate()
         self.graph = torch.fx.Graph()
         self.graphargs: List[GraphArg] = []
         self.guards: Set[Guard] = set()
@@ -164,6 +163,9 @@ def __init__(
         self.side_effects = SideEffects()
         self.code_options = dict(code_options)
         self.output_instructions: List[Instruction] = []
+        # used to track nodes that are added between calls of copy_graphstate
+        # and restore_graphstate
+        self.timestamp = 0
         # Node => computed real value (see utils.get_real_value)
         self.real_value_cache: Dict[fx.Node, torch.Tensor] = {}
 
@@ -199,34 +201,34 @@ def fake_mode(self):
     def copy_graphstate(self):
         """Create a checkpoint of the current state by copying everything"""
         assert self.nn_modules is not None
-        graph_nodes = set(self.graph.nodes)
-        return (
-            graph_nodes,
+        state = (
             list(self.graphargs),
             set(self.guards),
             dict(self.nn_modules),
             self.side_effects.clone(),
+            self.timestamp,
         )
+        self.timestamp += 1
+        return state
 
     def restore_graphstate(self, state):
         """Restore a checkpoint created by self.copy_graphstate()"""
         (
-            graph_nodes,
             self.graphargs,
             self.guards,
             self.nn_modules,
             self.side_effects,
+            self.timestamp,
         ) = state
         # FX deepcopy doesn't work for a partially created graph, so just remove new nodes
         for node in reversed(list(self.graph.nodes)):
-            if node not in graph_nodes:
+            if node.meta["creation_timestamp"] > self.timestamp:
                 # Erasing node alone does not remove the meta information
                 # So, remove the help tensor explicitly
                 if "example_value" in node.meta:
                     del node.meta["example_value"]
-                self.graph.erase_node(node)
+                self.remove_node(node)
                 self.real_value_cache.pop(node, None)
-                self.name_to_input.pop(node.name, None)
 
     def count_calls(self):
         return count_calls(self.graph)
@@ -559,9 +561,9 @@ def remove_unused_graphargs(self) -> None:
         for node in reversed(list(self.graph.nodes)):
             if len(list(node.users)) == 0:
                 if node.op == "get_attr":
-                    self.graph.erase_node(node)
+                    self.remove_node(node)
                 elif node.op == "call_function" and node.target is operator.getitem:
-                    self.graph.erase_node(node)
+                    self.remove_node(node)
 
         expanded_graphargs = []
         for arg in self.graphargs:
@@ -576,9 +578,8 @@ def remove_unused_graphargs(self) -> None:
             if arg.uses == 0:
                 if "example_value" in node.meta:
                     del node.meta["example_value"]
-                self.graph.erase_node(node)
+                self.remove_node(node)
                 self.real_value_cache.pop(node, None)
-                self.name_to_input.pop(node.name, None)
 
         self.graphargs = [arg for arg in self.graphargs if arg.uses > 0]
 
@@ -650,3 +651,14 @@ def create_proxy(
         rv.node.stack_trace = nn_module_stack_str + " | ".join(msgs)
 
         return rv
+
+    def create_node(self, *args, **kwargs):
+        node = super().create_node(*args, **kwargs)
+        node.meta["creation_timestamp"] = self.timestamp
+        return node
+
+    # Note: we did not override erase_node since
+    # we call self.graph.erase_node elsewhere
+    def remove_node(self, node):
+        self.graph.erase_node(node)
+        self.name_to_input.pop(node.name, None)
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index 7e1c91b68c41f..8cba35aaaa090 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -281,7 +281,7 @@ def enter(self, tx):
     def _call_func(self, tx, values):
         assert len(values) == 1
         value = values[0]
-        tx.output.graph.create_node(
+        tx.output.create_node(
             "call_function", torch._C._set_grad_enabled, (value,), {}
         ),
         torch._C._set_grad_enabled(value)
@@ -337,12 +337,12 @@ def __init__(self, target_values, initial_values=None, **kwargs):
         self.mode = None
 
     def exit(self, tx, *args):
-        tx.output.graph.create_node(
+        tx.output.create_node(
             "call_function", exit_functional_autocast, (self.mode,), {}
         )
 
     def enter(self, tx):
-        self.mode = tx.output.graph.create_node(
+        self.mode = tx.output.create_node(
             "call_function", enter_functional_autocast, (*self.target_values,), {}
         )
 

From ffc2321c18374d0b702b9dda6002a9de98bccf7e Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Tue, 29 Nov 2022 07:49:07 +0000
Subject: [PATCH 1365/1922] [minifier] Continue on assertion for accuracy
 minification (#89739)

During accuracy minification, minifier can create graphs which can cause assertion failures. This PR catches such assertions and let minifier move on, instead of getting stuck in minifying this issue.

It is possible that such graphs point to some real-although-unrelated issue. So, printing an assertion to flag and debug if needed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89739
Approved by: https://github.com/mlazos
---
 torch/_dynamo/debug_utils.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 36dd15e047351..6af1e6c4cfdd7 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -557,7 +557,19 @@ def same_two_models(gm, opt_gm, example_inputs, only_fwd=False):
         log.warning("Could not generate fp64 outputs")
         fp64_ref = None
 
-    res = run_fwd_maybe_bwd(opt_gm, example_inputs, only_fwd)
+    try:
+        res = run_fwd_maybe_bwd(opt_gm, example_inputs, only_fwd)
+    except Exception as e:
+        # This means that the the minified graph is bad/exposes a different problem.
+        # As we are checking accuracy here, lets log the exception and return True.
+        log.warning(
+            (
+                "While minifying the program in accuracy minification mode,"
+                "ran into a runtime exception which is likely an unrelated issue."
+                " Skipping this graph."
+            )
+        )
+        return True
 
     passing = same(ref, res, fp64_ref, tol=0.001, equal_nan=True)
     return passing
@@ -731,7 +743,20 @@ def dump_backend_state(gm, args, compiler_name, check_accuracy=False):
 
 
 def backend_accuracy_fails(gm, example_inputs, compiler_fn, only_fwd=False):
-    compiled_gm = compiler_fn(copy.deepcopy(gm), clone_inputs(example_inputs))
+    try:
+        compiled_gm = compiler_fn(copy.deepcopy(gm), clone_inputs(example_inputs))
+    except Exception as e:
+        # This means that the the minified graph is bad/exposes a different problem.
+        # As we are checking accuracy here, lets log the exception and return False.
+        log.warning(
+            (
+                "While minifying the program in accuracy minification mode,"
+                "ran into a runtime exception which is likely an unrelated issue."
+                " Skipping this graph"
+            )
+        )
+        return False
+
     return not same_two_models(gm, compiled_gm, example_inputs, only_fwd)
 
 
From a95b5eb12cfa24de62b40311b5ebff95b36b331f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleksandar=20Samard=C5=BEi=C4=87?=
 <asamardzic@quansight.com>
Date: Mon, 28 Nov 2022 15:02:57 +0000
Subject: [PATCH 1366/1922] Refactoring to share vectorization code for
 int8/uint8. (#89650)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89650
Approved by: https://github.com/jgong5, https://github.com/lezcano, https://github.com/peterbell10
---
 aten/src/ATen/cpu/vec/vec256/vec256_int.h | 320 ++++---------------
 aten/src/ATen/cpu/vec/vec512/vec512_int.h | 360 +++++-----------------
 2 files changed, 129 insertions(+), 551 deletions(-)

diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
index 391baeb8b6a32..81e9d687d10a7 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h
@@ -133,7 +133,6 @@ class Vectorized<int64_t> : public Vectorizedi {
   Vectorized<int64_t> conj() const {
     return *this;
   }
-  Vectorized<int64_t> frac() const;
   Vectorized<int64_t> neg() const;
   Vectorized<int64_t> operator==(const Vectorized<int64_t>& other) const {
     return _mm256_cmpeq_epi64(values, other.values);
@@ -253,7 +252,6 @@ class Vectorized<int32_t> : public Vectorizedi {
   Vectorized<int32_t> conj() const {
     return *this;
   }
-  Vectorized<int32_t> frac() const;
   Vectorized<int32_t> neg() const;
   Vectorized<int32_t> operator==(const Vectorized<int32_t>& other) const {
     return _mm256_cmpeq_epi32(values, other.values);
@@ -467,7 +465,6 @@ class Vectorized<int16_t> : public Vectorizedi {
   Vectorized<int16_t> conj() const {
     return *this;
   }
-  Vectorized<int16_t> frac() const;
   Vectorized<int16_t> neg() const;
   Vectorized<int16_t> operator==(const Vectorized<int16_t>& other) const {
     return _mm256_cmpeq_epi16(values, other.values);
@@ -496,34 +493,37 @@ class Vectorized<int16_t> : public Vectorizedi {
   Vectorized<int16_t> le(const Vectorized<int16_t>& other) const;
 };
 
-template <>
-class Vectorized<int8_t> : public Vectorizedi {
-private:
-  static const Vectorized<int8_t> ones;
+template <typename T>
+class Vectorized8 : public Vectorizedi {
+  static_assert(
+    std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
+    "Only int8_t/uint8_t are supported");
+protected:
+  static const Vectorized<T> ones;
 public:
-  using value_type = int8_t;
+  using value_type = T;
   static constexpr int size() {
     return 32;
   }
   using Vectorizedi::Vectorizedi;
-  Vectorized() {}
-  Vectorized(int8_t v) { values = _mm256_set1_epi8(v); }
-  Vectorized(int8_t val1, int8_t val2, int8_t val3, int8_t val4,
-         int8_t val5, int8_t val6, int8_t val7, int8_t val8,
-         int8_t val9, int8_t val10, int8_t val11, int8_t val12,
-         int8_t val13, int8_t val14, int8_t val15, int8_t val16,
-         int8_t val17, int8_t val18, int8_t val19, int8_t val20,
-         int8_t val21, int8_t val22, int8_t val23, int8_t val24,
-         int8_t val25, int8_t val26, int8_t val27, int8_t val28,
-         int8_t val29, int8_t val30, int8_t val31, int8_t val32) {
+  Vectorized8() {}
+  Vectorized8(T v) { values = _mm256_set1_epi8(v); }
+  Vectorized8(T val1, T val2, T val3, T val4,
+         T val5, T val6, T val7, T val8,
+         T val9, T val10, T val11, T val12,
+         T val13, T val14, T val15, T val16,
+         T val17, T val18, T val19, T val20,
+         T val21, T val22, T val23, T val24,
+         T val25, T val26, T val27, T val28,
+         T val29, T val30, T val31, T val32) {
     values = _mm256_setr_epi8(val1, val2, val3, val4, val5, val6, val7, val8,
                               val9, val10, val11, val12, val13, val14, val15, val16,
                               val17, val18, val19, val20, val21, val22, val23, val24,
                               val25, val26, val27, val28, val29, val30, val31, val32);
   }
   template <int64_t mask>
-  static Vectorized<int8_t> blend(Vectorized<int8_t> a, Vectorized<int8_t> b) {
-    __at_align__ int8_t tmp_values[size()];
+  static Vectorized<T> blend(Vectorized<T> a, Vectorized<T> b) {
+    __at_align__ T tmp_values[size()];
     a.store(tmp_values);
     if (mask & 0x01)
       tmp_values[0] = _mm256_extract_epi8(b.values, 0);
@@ -591,13 +591,13 @@ class Vectorized<int8_t> : public Vectorizedi {
       tmp_values[31] = _mm256_extract_epi8(b.values, 31);
     return loadu(tmp_values);
   }
-  static Vectorized<int8_t> blendv(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b,
-                               const Vectorized<int8_t>& mask) {
+  static Vectorized<T> blendv(const Vectorized<T>& a, const Vectorized<T>& b,
+                               const Vectorized<T>& mask) {
     return _mm256_blendv_epi8(a.values, b.values, mask.values);
   }
   template <typename step_t>
-  static Vectorized<int8_t> arange(int8_t base = 0, step_t step = static_cast<step_t>(1)) {
-    return Vectorized<int8_t>(
+  static Vectorized<T> arange(T base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(
       base,             base +      step, base +  2 * step, base +  3 * step,
       base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
       base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
@@ -607,8 +607,8 @@ class Vectorized<int8_t> : public Vectorizedi {
       base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
       base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step);
   }
-  static Vectorized<int8_t>
-  set(Vectorized<int8_t> a, Vectorized<int8_t> b, int8_t count = size()) {
+  static Vectorized<T>
+  set(Vectorized<T> a, Vectorized<T> b, T count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -677,18 +677,18 @@ class Vectorized<int8_t> : public Vectorizedi {
     }
     return b;
   }
-  static Vectorized<int8_t> loadu(const void* ptr) {
+  static Vectorized<T> loadu(const void* ptr) {
     return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
   }
-  static Vectorized<int8_t> loadu(const void* ptr, int8_t count) {
-    __at_align__ int8_t tmp_values[size()];
+  static Vectorized<T> loadu(const void* ptr, T count) {
+    __at_align__ T tmp_values[size()];
     // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
     // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
     // instructions while a loop would be compiled to one instruction.
     for (const auto i : c10::irange(size())) {
       tmp_values[i] = 0;
     }
-    std::memcpy(tmp_values, ptr, count * sizeof(int8_t));
+    std::memcpy(tmp_values, ptr, count * sizeof(T));
     return loadu(tmp_values);
   }
   void store(void* ptr, int count = size()) const {
@@ -697,27 +697,35 @@ class Vectorized<int8_t> : public Vectorizedi {
       // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
     } else if (count > 0) {
-      __at_align__ int8_t tmp_values[size()];
+      __at_align__ T tmp_values[size()];
       _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
-      std::memcpy(ptr, tmp_values, count * sizeof(int8_t));
+      std::memcpy(ptr, tmp_values, count * sizeof(T));
     }
   }
-  const int8_t& operator[](int idx) const  = delete;
-  int8_t& operator[](int idx)  = delete;
-  Vectorized<int8_t> abs() const {
-    return _mm256_abs_epi8(values);
-  }
-  Vectorized<int8_t> real() const {
+  const T& operator[](int idx) const  = delete;
+  T& operator[](int idx)  = delete;
+  Vectorized<T> real() const {
     return *this;
   }
-  Vectorized<int8_t> imag() const {
+  Vectorized<T> imag() const {
     return _mm256_set1_epi8(0);
   }
-  Vectorized<int8_t> conj() const {
+  Vectorized<T> conj() const {
     return *this;
   }
-  Vectorized<int8_t> frac() const;
+};
+
+template<>
+class Vectorized<int8_t>: public Vectorized8<int8_t> {
+public:
+  using Vectorized8::Vectorized8;
+
   Vectorized<int8_t> neg() const;
+
+  Vectorized<int8_t> abs() const {
+   return _mm256_abs_epi8(values);
+  }
+
   Vectorized<int8_t> operator==(const Vectorized<int8_t>& other) const {
     return _mm256_cmpeq_epi8(values, other.values);
   }
@@ -731,10 +739,10 @@ class Vectorized<int8_t> : public Vectorizedi {
     return invert(_mm256_cmpgt_epi8(values, other.values));
   }
   Vectorized<int8_t> operator>(const Vectorized<int8_t>& other) const {
-    return _mm256_cmpgt_epi8(values, other.values);
+    return other < *this;
   }
   Vectorized<int8_t> operator>=(const Vectorized<int8_t>& other) const {
-    return invert(_mm256_cmpgt_epi8(other.values, values));
+    return other <= *this;
   }
 
   Vectorized<int8_t> eq(const Vectorized<int8_t>& other) const;
@@ -745,228 +753,17 @@ class Vectorized<int8_t> : public Vectorizedi {
   Vectorized<int8_t> le(const Vectorized<int8_t>& other) const;
 };
 
-template <>
-class Vectorized<uint8_t> : public Vectorizedi {
-private:
-  static const Vectorized<uint8_t> ones;
+template<>
+class Vectorized<uint8_t>: public Vectorized8<uint8_t> {
 public:
-  using value_type = uint8_t;
-  static constexpr int size() {
-    return 32;
-  }
-  using Vectorizedi::Vectorizedi;
-  Vectorized() {}
-  Vectorized(uint8_t v) { values = _mm256_set1_epi8(v); }
-  Vectorized(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
-         uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
-         uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
-         uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16,
-         uint8_t val17, uint8_t val18, uint8_t val19, uint8_t val20,
-         uint8_t val21, uint8_t val22, uint8_t val23, uint8_t val24,
-         uint8_t val25, uint8_t val26, uint8_t val27, uint8_t val28,
-         uint8_t val29, uint8_t val30, uint8_t val31, uint8_t val32) {
-    values = _mm256_setr_epi8(val1, val2, val3, val4, val5, val6, val7, val8,
-                              val9, val10, val11, val12, val13, val14, val15, val16,
-                              val17, val18, val19, val20, val21, val22, val23, val24,
-                              val25, val26, val27, val28, val29, val30, val31, val32);
-  }
-  template <int64_t mask>
-  static Vectorized<uint8_t> blend(Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
-    __at_align__ uint8_t tmp_values[size()];
-    a.store(tmp_values);
-    if (mask & 0x01)
-      tmp_values[0] = _mm256_extract_epi8(b.values, 0);
-    if (mask & 0x02)
-      tmp_values[1] = _mm256_extract_epi8(b.values, 1);
-    if (mask & 0x04)
-      tmp_values[2] = _mm256_extract_epi8(b.values, 2);
-    if (mask & 0x08)
-      tmp_values[3] = _mm256_extract_epi8(b.values, 3);
-    if (mask & 0x10)
-      tmp_values[4] = _mm256_extract_epi8(b.values, 4);
-    if (mask & 0x20)
-      tmp_values[5] = _mm256_extract_epi8(b.values, 5);
-    if (mask & 0x40)
-      tmp_values[6] = _mm256_extract_epi8(b.values, 6);
-    if (mask & 0x80)
-      tmp_values[7] = _mm256_extract_epi8(b.values, 7);
-    if (mask & 0x100)
-      tmp_values[8] = _mm256_extract_epi8(b.values, 8);
-    if (mask & 0x200)
-      tmp_values[9] = _mm256_extract_epi8(b.values, 9);
-    if (mask & 0x400)
-      tmp_values[10] = _mm256_extract_epi8(b.values, 10);
-    if (mask & 0x800)
-      tmp_values[11] = _mm256_extract_epi8(b.values, 11);
-    if (mask & 0x1000)
-      tmp_values[12] = _mm256_extract_epi8(b.values, 12);
-    if (mask & 0x2000)
-      tmp_values[13] = _mm256_extract_epi8(b.values, 13);
-    if (mask & 0x4000)
-      tmp_values[14] = _mm256_extract_epi8(b.values, 14);
-    if (mask & 0x8000)
-      tmp_values[15] = _mm256_extract_epi8(b.values, 15);
-    if (mask & 0x010000)
-      tmp_values[16] = _mm256_extract_epi8(b.values, 16);
-    if (mask & 0x020000)
-      tmp_values[17] = _mm256_extract_epi8(b.values, 17);
-    if (mask & 0x040000)
-      tmp_values[18] = _mm256_extract_epi8(b.values, 18);
-    if (mask & 0x080000)
-      tmp_values[19] = _mm256_extract_epi8(b.values, 19);
-    if (mask & 0x100000)
-      tmp_values[20] = _mm256_extract_epi8(b.values, 20);
-    if (mask & 0x200000)
-      tmp_values[21] = _mm256_extract_epi8(b.values, 21);
-    if (mask & 0x400000)
-      tmp_values[22] = _mm256_extract_epi8(b.values, 22);
-    if (mask & 0x800000)
-      tmp_values[23] = _mm256_extract_epi8(b.values, 23);
-    if (mask & 0x1000000)
-      tmp_values[24] = _mm256_extract_epi8(b.values, 24);
-    if (mask & 0x2000000)
-      tmp_values[25] = _mm256_extract_epi8(b.values, 25);
-    if (mask & 0x4000000)
-      tmp_values[26] = _mm256_extract_epi8(b.values, 26);
-    if (mask & 0x8000000)
-      tmp_values[27] = _mm256_extract_epi8(b.values, 27);
-    if (mask & 0x10000000)
-      tmp_values[28] = _mm256_extract_epi8(b.values, 28);
-    if (mask & 0x20000000)
-      tmp_values[29] = _mm256_extract_epi8(b.values, 29);
-    if (mask & 0x40000000)
-      tmp_values[30] = _mm256_extract_epi8(b.values, 30);
-    if (mask & 0x80000000)
-      tmp_values[31] = _mm256_extract_epi8(b.values, 31);
-    return loadu(tmp_values);
-  }
-  static Vectorized<uint8_t> blendv(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b,
-                               const Vectorized<uint8_t>& mask) {
-    return _mm256_blendv_epi8(a.values, b.values, mask.values);
-  }
-  template <typename step_t>
-  static Vectorized<uint8_t> arange(uint8_t base = 0, step_t step = static_cast<step_t>(1)) {
-    return Vectorized<uint8_t>(
-      base,             base +      step, base +  2 * step, base +  3 * step,
-      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
-      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
-      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step,
-      base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step,
-      base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step,
-      base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
-      base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step);
-  }
-  static Vectorized<uint8_t>
-  set(Vectorized<uint8_t> a, Vectorized<uint8_t> b, uint8_t count = size()) {
-    switch (count) {
-      case 0:
-        return a;
-      case 1:
-        return blend<0x1>(a, b);
-      case 2:
-        return blend<0x3>(a, b);
-      case 3:
-        return blend<0x7>(a, b);
-      case 4:
-        return blend<0xF>(a, b);
-      case 5:
-        return blend<0x1F>(a, b);
-      case 6:
-        return blend<0x3F>(a, b);
-      case 7:
-        return blend<0x7F>(a, b);
-      case 8:
-        return blend<0xFF>(a, b);
-      case 9:
-        return blend<0x1FF>(a, b);
-      case 10:
-        return blend<0x3FF>(a, b);
-      case 11:
-        return blend<0x7FF>(a, b);
-      case 12:
-        return blend<0xFFF>(a, b);
-      case 13:
-        return blend<0x1FFF>(a, b);
-      case 14:
-        return blend<0x3FFF>(a, b);
-      case 15:
-        return blend<0x7FFF>(a, b);
-      case 16:
-        return blend<0xFFFF>(a, b);
-      case 17:
-        return blend<0x1FFFF>(a, b);
-      case 18:
-        return blend<0x3FFFF>(a, b);
-      case 19:
-        return blend<0x7FFFF>(a, b);
-      case 20:
-        return blend<0xFFFFF>(a, b);
-      case 21:
-        return blend<0x1FFFFF>(a, b);
-      case 22:
-        return blend<0x3FFFFF>(a, b);
-      case 23:
-        return blend<0x7FFFFF>(a, b);
-      case 24:
-        return blend<0xFFFFFF>(a, b);
-      case 25:
-        return blend<0x1FFFFFF>(a, b);
-      case 26:
-        return blend<0x3FFFFFF>(a, b);
-      case 27:
-        return blend<0x7FFFFFF>(a, b);
-      case 28:
-        return blend<0xFFFFFFF>(a, b);
-      case 29:
-        return blend<0x1FFFFFFF>(a, b);
-      case 30:
-        return blend<0x3FFFFFFF>(a, b);
-      case 31:
-        return blend<0x7FFFFFFF>(a, b);
-    }
-    return b;
-  }
-  static Vectorized<uint8_t> loadu(const void* ptr) {
-    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
-  }
-  static Vectorized<uint8_t> loadu(const void* ptr, uint8_t count) {
-    __at_align__ uint8_t tmp_values[size()];
-    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
-    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
-    // instructions while a loop would be compiled to one instruction.
-    for (const auto i : c10::irange(size())) {
-      tmp_values[i] = 0;
-    }
-    std::memcpy(tmp_values, ptr, count * sizeof(uint8_t));
-    return loadu(tmp_values);
-  }
-  void store(void* ptr, int count = size()) const {
-    if (count == size()) {
-      // ptr need not to be aligned here. See
-      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
-      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
-    } else if (count > 0) {
-      __at_align__ uint8_t tmp_values[size()];
-      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
-      std::memcpy(ptr, tmp_values, count * sizeof(uint8_t));
-    }
-  }
-  const uint8_t& operator[](int idx) const  = delete;
-  uint8_t& operator[](int idx)  = delete;
+  using Vectorized8::Vectorized8;
+
+  Vectorized<uint8_t> neg() const;
+
   Vectorized<uint8_t> abs() const {
-    return values;
-  }
-  Vectorized<uint8_t> real() const {
     return *this;
   }
-  Vectorized<uint8_t> imag() const {
-    return _mm256_set1_epi8(0);
-  }
-  Vectorized<uint8_t> conj() const {
-    return *this;
-  }
-  Vectorized<uint8_t> frac() const;
-  Vectorized<uint8_t> neg() const;
+
   Vectorized<uint8_t> operator==(const Vectorized<uint8_t>& other) const {
     return _mm256_cmpeq_epi8(values, other.values);
   }
@@ -1021,7 +818,6 @@ Vectorized<uint8_t> inline operator+(const Vectorized<uint8_t>& a, const Vectori
   return _mm256_add_epi8(a, b);
 }
 
-
 template <>
 Vectorized<int64_t> inline operator-(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
   return _mm256_sub_epi64(a, b);
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
index a2550fbfc1dfa..73aae89d51be3 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h
@@ -135,7 +135,6 @@ class Vectorized<int64_t> : public Vectorizedi {
   Vectorized<int64_t> conj() const {
     return *this;
   }
-  Vectorized<int64_t> frac() const;
   Vectorized<int64_t> neg() const;
   Vectorized<int64_t> operator==(const Vectorized<int64_t>& other) const {
     auto mask = _mm512_cmpeq_epi64_mask(values, other.values);
@@ -285,7 +284,6 @@ class Vectorized<int32_t> : public Vectorizedi {
   Vectorized<int32_t> conj() const {
     return *this;
   }
-  Vectorized<int32_t> frac() const;
   Vectorized<int32_t> neg() const;
   Vectorized<int32_t> operator==(const Vectorized<int32_t>& other) const {
     auto mask = _mm512_cmpeq_epi32_mask(values, other.values);
@@ -517,7 +515,6 @@ class Vectorized<int16_t> : public Vectorizedi {
   Vectorized<int16_t> conj() const {
     return *this;
   }
-  Vectorized<int16_t> frac() const;
   Vectorized<int16_t> neg() const;
   Vectorized<int16_t> operator==(const Vectorized<int16_t>& other) const {
     auto mask = _mm512_cmpeq_epi16_mask(values, other.values);
@@ -552,35 +549,38 @@ class Vectorized<int16_t> : public Vectorizedi {
   Vectorized<int16_t> le(const Vectorized<int16_t>& other) const;
 };
 
-template <>
-class Vectorized<int8_t> : public Vectorizedi {
-private:
+template <typename T>
+class Vectorized8 : public Vectorizedi {
+  static_assert(
+    std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
+    "Only int8_t/uint8_t are supported");
+protected:
   static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
-  static const Vectorized<int8_t> ones;
+  static const Vectorized<T> ones;
 public:
-  using value_type = int8_t;
+  using value_type = T;
   static constexpr int size() {
     return 64;
   }
   using Vectorizedi::Vectorizedi;
-  Vectorized() {}
-  Vectorized(int8_t v) { values = _mm512_set1_epi8(v); }
-  Vectorized(int8_t val1, int8_t val2, int8_t val3, int8_t val4,
-         int8_t val5, int8_t val6, int8_t val7, int8_t val8,
-         int8_t val9, int8_t val10, int8_t val11, int8_t val12,
-         int8_t val13, int8_t val14, int8_t val15, int8_t val16,
-         int8_t val17, int8_t val18, int8_t val19, int8_t val20,
-         int8_t val21, int8_t val22, int8_t val23, int8_t val24,
-         int8_t val25, int8_t val26, int8_t val27, int8_t val28,
-         int8_t val29, int8_t val30, int8_t val31, int8_t val32,
-         int8_t val33, int8_t val34, int8_t val35, int8_t val36,
-         int8_t val37, int8_t val38, int8_t val39, int8_t val40,
-         int8_t val41, int8_t val42, int8_t val43, int8_t val44,
-         int8_t val45, int8_t val46, int8_t val47, int8_t val48,
-         int8_t val49, int8_t val50, int8_t val51, int8_t val52,
-         int8_t val53, int8_t val54, int8_t val55, int8_t val56,
-         int8_t val57, int8_t val58, int8_t val59, int8_t val60,
-         int8_t val61, int8_t val62, int8_t val63, int8_t val64){
+  Vectorized8() {}
+  Vectorized8(T v) { values = _mm512_set1_epi8(v); }
+  Vectorized8(T val1, T val2, T val3, T val4,
+         T val5, T val6, T val7, T val8,
+         T val9, T val10, T val11, T val12,
+         T val13, T val14, T val15, T val16,
+         T val17, T val18, T val19, T val20,
+         T val21, T val22, T val23, T val24,
+         T val25, T val26, T val27, T val28,
+         T val29, T val30, T val31, T val32,
+         T val33, T val34, T val35, T val36,
+         T val37, T val38, T val39, T val40,
+         T val41, T val42, T val43, T val44,
+         T val45, T val46, T val47, T val48,
+         T val49, T val50, T val51, T val52,
+         T val53, T val54, T val55, T val56,
+         T val57, T val58, T val59, T val60,
+         T val61, T val62, T val63, T val64){
     values = _mm512_set_epi8(val64, val63, val62, val61, val60, val59, val58, val57,
                               val56, val55, val54, val53,val52, val51, val50, val49,
                               val48, val47, val46, val45, val44, val43, val42, val41,
@@ -591,18 +591,12 @@ class Vectorized<int8_t> : public Vectorizedi {
                               val8, val7, val6, val5, val4, val3, val2, val1);
   }
   template <int64_t mask>
-  static Vectorized<int8_t> blend(Vectorized<int8_t> a, Vectorized<int8_t> b) {
+  static Vectorized<T> blend(Vectorized<T> a, Vectorized<T> b) {
     return _mm512_mask_blend_epi8(mask, a.values, b.values);
   }
-  static Vectorized<int8_t> blendv(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b,
-                               const Vectorized<int8_t>& mask) {
-    auto msb_one = _mm512_set1_epi8(0xFF);
-    auto mask_ = _mm512_cmp_epi8_mask(mask, msb_one, _MM_CMPINT_EQ);
-    return _mm512_mask_blend_epi8(mask_, a.values, b.values);
-  }
   template <typename step_t>
-  static Vectorized<int8_t> arange(int8_t base = 0, step_t step = static_cast<step_t>(1)) {
-    return Vectorized<int8_t>(
+  static Vectorized<T> arange(T base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(
       base,             base +      step, base +  2 * step, base +  3 * step,
       base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
       base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
@@ -620,8 +614,8 @@ class Vectorized<int8_t> : public Vectorizedi {
       base + 56 * step, base + 57 * step, base + 58 * step, base + 59 * step,
       base + 60 * step, base + 61 * step, base + 62 * step, base + 63 * step);
   }
-  static Vectorized<int8_t>
-  set(Vectorized<int8_t> a, Vectorized<int8_t> b, int8_t count = size()) {
+  static Vectorized<T>
+  set(Vectorized<T> a, Vectorized<T> b, T count = size()) {
     switch (count) {
       case 0:
         return a;
@@ -754,18 +748,18 @@ class Vectorized<int8_t> : public Vectorizedi {
     }
     return b;
   }
-  static Vectorized<int8_t> loadu(const void* ptr) {
+  static Vectorized<T> loadu(const void* ptr) {
     return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
   }
-  static Vectorized<int8_t> loadu(const void* ptr, int8_t count) {
-    __at_align__ int8_t tmp_values[size()];
+  static Vectorized<T> loadu(const void* ptr, T count) {
+    __at_align__ T tmp_values[size()];
     // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
     // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
     // instructions while a loop would be compiled to one instruction.
     for (const auto i : c10::irange(size())) {
       tmp_values[i] = 0;
     }
-    std::memcpy(tmp_values, ptr, count * sizeof(int8_t));
+    std::memcpy(tmp_values, ptr, count * sizeof(T));
     return loadu(tmp_values);
   }
   void store(void* ptr, int count = size()) const {
@@ -774,27 +768,42 @@ class Vectorized<int8_t> : public Vectorizedi {
       // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
       _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
     } else if (count > 0) {
-      __at_align__ int8_t tmp_values[size()];
+      __at_align__ T tmp_values[size()];
       _mm512_storeu_si512(reinterpret_cast<__m512i*>(tmp_values), values);
-      std::memcpy(ptr, tmp_values, count * sizeof(int8_t));
+      std::memcpy(ptr, tmp_values, count * sizeof(T));
     }
   }
-  const int8_t& operator[](int idx) const  = delete;
-  int8_t& operator[](int idx)  = delete;
-  Vectorized<int8_t> abs() const {
-    return _mm512_abs_epi8(values);
-  }
-  Vectorized<int8_t> real() const {
+  const T& operator[](int idx) const  = delete;
+  T& operator[](int idx)  = delete;
+  Vectorized<T> real() const {
     return *this;
   }
-  Vectorized<int8_t> imag() const {
+  Vectorized<T> imag() const {
     return _mm512_set1_epi8(0);
   }
-  Vectorized<int8_t> conj() const {
+  Vectorized<T> conj() const {
     return *this;
   }
-  Vectorized<int8_t> frac() const;
+};
+
+template<>
+class Vectorized<int8_t>: public Vectorized8<int8_t> {
+public:
+  using Vectorized8::Vectorized8;
+
+  static Vectorized<int8_t> blendv(const Vectorized<int8_t>& a, const Vectorized<int8_t>& b,
+                               const Vectorized<int8_t>& mask) {
+    auto msb_one = _mm512_set1_epi8(0xFF);
+    auto mask_ = _mm512_cmp_epi8_mask(mask, msb_one, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi8(mask_, a.values, b.values);
+  }
+
   Vectorized<int8_t> neg() const;
+
+  Vectorized<int8_t> abs() const {
+    return _mm512_abs_epi8(values);
+  }
+
   Vectorized<int8_t> operator==(const Vectorized<int8_t>& other) const {
     auto mask = _mm512_cmpeq_epi8_mask(values, other.values);
     return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
@@ -812,12 +821,10 @@ class Vectorized<int8_t> : public Vectorizedi {
     return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
   }
   Vectorized<int8_t> operator>(const Vectorized<int8_t>& other) const {
-    auto mask = _mm512_cmpgt_epi8_mask(values, other.values);
-    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+    return other < *this;
   }
   Vectorized<int8_t> operator>=(const Vectorized<int8_t>& other) const {
-    auto mask = _mm512_cmpge_epi8_mask(values, other.values);
-    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+    return other <= *this;
   }
 
   Vectorized<int8_t> eq(const Vectorized<int8_t>& other) const;
@@ -828,249 +835,24 @@ class Vectorized<int8_t> : public Vectorizedi {
   Vectorized<int8_t> le(const Vectorized<int8_t>& other) const;
 };
 
-template <>
-class Vectorized<uint8_t> : public Vectorizedi {
-private:
-  static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
-  static const Vectorized<uint8_t> ones;
+template<>
+class Vectorized<uint8_t>: public Vectorized8<uint8_t> {
 public:
-  using value_type = uint8_t;
-  static constexpr int size() {
-    return 64;
-  }
-  using Vectorizedi::Vectorizedi;
-  Vectorized() {}
-  Vectorized(uint8_t v) { values = _mm512_set1_epi8(v); }
-  Vectorized(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
-         uint8_t val5, uint8_t val6, uint8_t val7, uint8_t val8,
-         uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
-         uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16,
-         uint8_t val17, uint8_t val18, uint8_t val19, uint8_t val20,
-         uint8_t val21, uint8_t val22, uint8_t val23, uint8_t val24,
-         uint8_t val25, uint8_t val26, uint8_t val27, uint8_t val28,
-         uint8_t val29, uint8_t val30, uint8_t val31, uint8_t val32,
-         uint8_t val33, uint8_t val34, uint8_t val35, uint8_t val36,
-         uint8_t val37, uint8_t val38, uint8_t val39, uint8_t val40,
-         uint8_t val41, uint8_t val42, uint8_t val43, uint8_t val44,
-         uint8_t val45, uint8_t val46, uint8_t val47, uint8_t val48,
-         uint8_t val49, uint8_t val50, uint8_t val51, uint8_t val52,
-         uint8_t val53, uint8_t val54, uint8_t val55, uint8_t val56,
-         uint8_t val57, uint8_t val58, uint8_t val59, uint8_t val60,
-         uint8_t val61, uint8_t val62, uint8_t val63, uint8_t val64){
-    values = _mm512_set_epi8(val64, val63, val62, val61, val60, val59, val58, val57,
-                              val56, val55, val54, val53,val52, val51, val50, val49,
-                              val48, val47, val46, val45, val44, val43, val42, val41,
-                              val40, val39, val38, val37, val36, val35, val34, val33,
-                              val32, val31, val30, val29, val28, val27, val26, val25,
-                              val24, val23, val22, val21, val20, val19, val18, val17,
-                              val16, val15, val14, val13, val12, val11, val10, val9,
-                              val8, val7, val6, val5, val4, val3, val2, val1);
-  }
-  template <int64_t mask>
-  static Vectorized<uint8_t> blend(Vectorized<uint8_t> a, Vectorized<uint8_t> b) {
-    return _mm512_mask_blend_epi8(mask, a.values, b.values);
-  }
+  using Vectorized8::Vectorized8;
+
   static Vectorized<uint8_t> blendv(const Vectorized<uint8_t>& a, const Vectorized<uint8_t>& b,
                                const Vectorized<uint8_t>& mask) {
     auto msb_one = _mm512_set1_epi8(0xFF);
     auto mask_ = _mm512_cmp_epu8_mask(mask, msb_one, _MM_CMPINT_EQ);
     return _mm512_mask_blend_epi8(mask_, a.values, b.values);
   }
-  template <typename step_t>
-  static Vectorized<uint8_t> arange(uint8_t base = 0, step_t step = static_cast<step_t>(1)) {
-    return Vectorized<uint8_t>(
-      base,             base +      step, base +  2 * step, base +  3 * step,
-      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
-      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
-      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step,
-      base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step,
-      base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step,
-      base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
-      base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step,
-      base + 32 * step, base + 33 * step, base + 34 * step, base + 35 * step,
-      base + 36 * step, base + 37 * step, base + 38 * step, base + 39 * step,
-      base + 40 * step, base + 41 * step, base + 42 * step, base + 43 * step,
-      base + 44 * step, base + 45 * step, base + 46 * step, base + 47 * step,
-      base + 48 * step, base + 49 * step, base + 50 * step, base + 51 * step,
-      base + 52 * step, base + 53 * step, base + 54 * step, base + 55 * step,
-      base + 56 * step, base + 57 * step, base + 58 * step, base + 59 * step,
-      base + 60 * step, base + 61 * step, base + 62 * step, base + 63 * step);
-  }
-  static Vectorized<uint8_t>
-  set(Vectorized<uint8_t> a, Vectorized<uint8_t> b, uint8_t count = size()) {
-    switch (count) {
-      case 0:
-        return a;
-      case 1:
-        return blend<0x1>(a, b);
-      case 2:
-        return blend<0x3>(a, b);
-      case 3:
-        return blend<0x7>(a, b);
-      case 4:
-        return blend<0xF>(a, b);
-      case 5:
-        return blend<0x1F>(a, b);
-      case 6:
-        return blend<0x3F>(a, b);
-      case 7:
-        return blend<0x7F>(a, b);
-      case 8:
-        return blend<0xFF>(a, b);
-      case 9:
-        return blend<0x1FF>(a, b);
-      case 10:
-        return blend<0x3FF>(a, b);
-      case 11:
-        return blend<0x7FF>(a, b);
-      case 12:
-        return blend<0xFFF>(a, b);
-      case 13:
-        return blend<0x1FFF>(a, b);
-      case 14:
-        return blend<0x3FFF>(a, b);
-      case 15:
-        return blend<0x7FFF>(a, b);
-      case 16:
-        return blend<0xFFFF>(a, b);
-      case 17:
-        return blend<0x1FFFF>(a, b);
-      case 18:
-        return blend<0x3FFFF>(a, b);
-      case 19:
-        return blend<0x7FFFF>(a, b);
-      case 20:
-        return blend<0xFFFFF>(a, b);
-      case 21:
-        return blend<0x1FFFFF>(a, b);
-      case 22:
-        return blend<0x3FFFFF>(a, b);
-      case 23:
-        return blend<0x7FFFFF>(a, b);
-      case 24:
-        return blend<0xFFFFFF>(a, b);
-      case 25:
-        return blend<0x1FFFFFF>(a, b);
-      case 26:
-        return blend<0x3FFFFFF>(a, b);
-      case 27:
-        return blend<0x7FFFFFF>(a, b);
-      case 28:
-        return blend<0xFFFFFFF>(a, b);
-      case 29:
-        return blend<0x1FFFFFFF>(a, b);
-      case 30:
-        return blend<0x3FFFFFFF>(a, b);
-      case 31:
-        return blend<0x7FFFFFFF>(a, b);
-      case 32:
-        return blend<0xFFFFFFFF>(a, b);
-      case 33:
-        return blend<0x1FFFFFFFF>(a, b);
-      case 34:
-        return blend<0x3FFFFFFFF>(a, b);
-      case 35:
-        return blend<0x7FFFFFFFF>(a, b);
-      case 36:
-        return blend<0xFFFFFFFFF>(a, b);
-      case 37:
-        return blend<0x1FFFFFFFFF>(a, b);
-      case 38:
-        return blend<0x3FFFFFFFFF>(a, b);
-      case 39:
-        return blend<0x7FFFFFFFFF>(a, b);
-      case 40:
-        return blend<0xFFFFFFFFFF>(a, b);
-      case 41:
-        return blend<0x1FFFFFFFFFF>(a, b);
-      case 42:
-        return blend<0x3FFFFFFFFFF>(a, b);
-      case 43:
-        return blend<0x7FFFFFFFFFF>(a, b);
-      case 44:
-        return blend<0xFFFFFFFFFFF>(a, b);
-      case 45:
-        return blend<0x1FFFFFFFFFFF>(a, b);
-      case 46:
-        return blend<0x3FFFFFFFFFFF>(a, b);
-      case 47:
-        return blend<0x7FFFFFFFFFFF>(a, b);
-      case 48:
-        return blend<0xFFFFFFFFFFFF>(a, b);
-      case 49:
-        return blend<0x1FFFFFFFFFFFF>(a, b);
-      case 50:
-        return blend<0x3FFFFFFFFFFFF>(a, b);
-      case 51:
-        return blend<0x7FFFFFFFFFFFF>(a, b);
-      case 52:
-        return blend<0xFFFFFFFFFFFFF>(a, b);
-      case 53:
-        return blend<0x1FFFFFFFFFFFFF>(a, b);
-      case 54:
-        return blend<0x3FFFFFFFFFFFFF>(a, b);
-      case 55:
-        return blend<0x7FFFFFFFFFFFFF>(a, b);
-      case 56:
-        return blend<0xFFFFFFFFFFFFFF>(a, b);
-      case 57:
-        return blend<0x1FFFFFFFFFFFFFF>(a, b);
-      case 58:
-        return blend<0x3FFFFFFFFFFFFFF>(a, b);
-      case 59:
-        return blend<0x7FFFFFFFFFFFFFF>(a, b);
-      case 60:
-        return blend<0xFFFFFFFFFFFFFFF>(a, b);
-      case 61:
-        return blend<0x1FFFFFFFFFFFFFFF>(a, b);
-      case 62:
-        return blend<0x3FFFFFFFFFFFFFFF>(a, b);
-      case 63:
-        return blend<0x7FFFFFFFFFFFFFFF>(a, b);
-    }
-    return b;
-  }
-  static Vectorized<uint8_t> loadu(const void* ptr) {
-    return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
-  }
-  static Vectorized<uint8_t> loadu(const void* ptr, uint8_t count) {
-    __at_align__ uint8_t tmp_values[size()];
-    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
-    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
-    // instructions while a loop would be compiled to one instruction.
-    for (const auto i : c10::irange(size())) {
-      tmp_values[i] = 0;
-    }
-    std::memcpy(tmp_values, ptr, count * sizeof(uint8_t));
-    return loadu(tmp_values);
-  }
-  void store(void* ptr, int count = size()) const {
-    if (count == size()) {
-      // ptr need not to be aligned here. See
-      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
-      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
-    } else if (count > 0) {
-      __at_align__ uint8_t tmp_values[size()];
-      _mm512_storeu_si512(reinterpret_cast<__m512i*>(tmp_values), values);
-      std::memcpy(ptr, tmp_values, count * sizeof(uint8_t));
-    }
-  }
-  const uint8_t& operator[](int idx) const  = delete;
-  uint8_t& operator[](int idx)  = delete;
+
+  Vectorized<uint8_t> neg() const;
+
   Vectorized<uint8_t> abs() const {
-    return values;
-  }
-  Vectorized<uint8_t> real() const {
     return *this;
   }
-  Vectorized<uint8_t> imag() const {
-    return _mm512_set1_epi8(0);
-  }
-  Vectorized<uint8_t> conj() const {
-    return *this;
-  }
-  Vectorized<uint8_t> frac() const;
-  Vectorized<uint8_t> neg() const;
+
   Vectorized<uint8_t> operator==(const Vectorized<uint8_t>& other) const {
     auto mask = _mm512_cmpeq_epu8_mask(values, other.values);
     return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);

From fccb9541eb3176fdd986d5747cf667b8fe2529e0 Mon Sep 17 00:00:00 2001
From: Chen Lai <chenlai@fb.com>
Date: Mon, 28 Nov 2022 21:19:16 -0800
Subject: [PATCH 1367/1922] [PyTorch Edge] Set training for module only
 (#89488)

Update previous recursive logic.

Continue setting training attribute only if the slot is an object and a module.

For the corresponding JIT module, they get the module list first and set module one by one. there is method to get all modules iteratively, instead of recursively.

This change patch one fix to set training attribute for `model_f269583363.ptl`. Another patch is needed, because current lite interpreter doesn't have the correct type when loading object with setstate.

Differential Revision: [D41466417](https://our.internmc.facebook.com/intern/diff/D41466417/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89488
Approved by: https://github.com/iseeyuan
---
 torch/csrc/jit/mobile/module.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/jit/mobile/module.cpp b/torch/csrc/jit/mobile/module.cpp
index 5da8cb4a55da6..8f61cc2402e1b 100644
--- a/torch/csrc/jit/mobile/module.cpp
+++ b/torch/csrc/jit/mobile/module.cpp
@@ -97,6 +97,10 @@ c10::optional<Method> Module::find_method(const std::string& basename) const {
 }
 
 namespace {
+// For JIT, there is a private function to get all modules by iteration in
+// struct slot_iterator_impl (jit/api/module.h). The following function use
+// recursion to mimic the logic without allocating extra memory to get module
+// list and set training attribute directly.
 void set_train_recurse(
     const c10::intrusive_ptr<c10::ivalue::Object>& obj,
     bool on) {
@@ -109,7 +113,9 @@ void set_train_recurse(
         "call .eval() before saving your model?");
   }
   for (const auto& slot : obj->slots()) {
-    if (slot.isObject()) {
+    // slots is a list of IValue. Continue setting training attribute only
+    // if the slot is an object and a module.
+    if (slot.isObject() && slot.toObjectRef().type()->is_module()) {
       set_train_recurse(slot.toObject(), on);
     }
   }

From 7dd6e757282a06eb3dd5b7e44f038ac80f70ab43 Mon Sep 17 00:00:00 2001
From: Jiong Gong <jiong.gong@intel.com>
Date: Tue, 29 Nov 2022 13:55:19 +0000
Subject: [PATCH 1368/1922] Guard the boundary of index computed in
 compute_source_index_and_lambda (#89252)

Improve the fix in https://github.com/pytorch/pytorch/pull/89210
See discussion in https://github.com/pytorch/pytorch/issues/89212#issuecomment-1318911969
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89252
Approved by: https://github.com/mingfeima, https://github.com/weiwangmeta
---
 aten/src/ATen/native/UpSample.h | 10 ++++--
 test/test_nn.py                 | 61 +++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h
index 144b5921eed33..d4e8112229c91 100644
--- a/aten/src/ATen/native/UpSample.h
+++ b/aten/src/ATen/native/UpSample.h
@@ -449,10 +449,16 @@ static inline void compute_source_index_and_lambda(
     const auto real_input_index =
         area_pixel_compute_source_index<opmath_t>(
             ratio, output_index, align_corners, /*cubic=*/false);
-    input_index0 = static_cast<int64_t>(real_input_index);
+    // when `real_input_index` becomes larger than the range the floating point
+    // type can accurately represent, the type casting to `int64_t` might exceed
+    // `input_size - 1`, causing overflow. So we guard it with `std::min` below.
+    input_index0 = std::min(static_cast<int64_t>(real_input_index), input_size - 1);
     int64_t offset = (input_index0 < input_size - 1) ? 1 : 0;
     input_index1 = input_index0 + offset;
-    lambda1 = real_input_index - input_index0;
+    lambda1 = std::min(
+      std::max(real_input_index - input_index0, static_cast<opmath_t>(0)),
+      static_cast<opmath_t>(1)
+    );
     lambda0 = static_cast<scalar_t>(1.) - lambda1;
   }
 }
diff --git a/test/test_nn.py b/test/test_nn.py
index 552f299e4b8f4..8d108c473994c 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -6903,6 +6903,67 @@ def test_interpolate_illegal_memory_access(self):
         self.assertEqual(out_ref, out)
         self.assertEqual(input_ref.grad, input.grad)
 
+    def test_interpolate_buffer_overflow(self):
+        # Test buffer overflow issue due to inaccurate floating point
+        # representation for integer values. See issue below for details.
+        # https://github.com/pytorch/pytorch/issues/88939
+
+        def helper(size, dtype, mode, device, is_channels_last):
+            input = torch.ones(size, dtype=dtype, device=device)
+            if is_channels_last:
+                if len(size) == 3:
+                    input = input.transpose(1, 2).contiguous().transpose(1, 2)
+                elif len(size) == 4:
+                    input = input.to(memory_format=torch.channels_last)
+                else:
+                    input = input.to(memory_format=torch.channels_last_3d)
+            output1 = F.interpolate(input, 2, mode=mode, align_corners=True)
+            # reset the corner value and expect the output is changed as well
+            # the output won't be changed on buffer overflow
+            input[(-1,) * len(size)] = 0.5
+            output2 = F.interpolate(input, 2, mode=mode, align_corners=True)
+            self.assertNotEqual(output1, output2)
+
+        size_dtype_list = []
+        # We set the size larger than the floating point exactly representable range
+        # float: exact representable range (-2**24,2**24)
+        size_dtype_list.append(([1, 10, 2**24 + 4], torch.float))
+        size_dtype_list.append(([1, 10, 2, 2**24 + 4], torch.float))
+        size_dtype_list.append(([1, 10, 2, 2, 2**24 + 4], torch.float))
+        # bfloat16: exact representable range (-2**8, 2**8)
+        size_dtype_list.append(([1, 10, 2**8 + 4], torch.bfloat16))
+        size_dtype_list.append(([1, 10, 2, 2**8 + 4], torch.bfloat16))
+        size_dtype_list.append(([1, 10, 2, 2, 2**8 + 4], torch.bfloat16))
+        # half: exact representable range (-2**11, 2**11)
+        size_dtype_list.append(([1, 10, 2**11 + 4], torch.half))
+        size_dtype_list.append(([1, 10, 2, 2**11 + 4], torch.half))
+        size_dtype_list.append(([1, 10, 2, 2, 2**11 + 4], torch.half))
+
+        # TODO: turn on cuda test after buffer overflow issue is fixed in cuda kernel
+        # devices = ['cpu'] + (['cuda'] if torch.cuda.is_available() else [])
+        devices = ['cpu']
+
+        for mode in ('linear', 'bilinear', 'bicubic', 'trilinear'):
+            for size_dtype in size_dtype_list:
+                size, dtype = size_dtype
+                if (
+                    mode == 'linear' and len(size) != 3
+                    or (mode == 'bilinear' and len(size) != 4)
+                    or (mode == 'bicubic' and len(size) != 4)
+                    or (mode == 'trilinear' and len(size) != 5)
+                ):
+                    continue
+                for device in devices:
+                    if (
+                        device == 'cpu' and dtype == torch.half
+                        or (device == 'cuda' and dtype == torch.bfloat16)
+                    ):
+                        # no half precision support on cpu or bfloat16 on cuda yet
+                        continue
+                    for is_channels_last in (True, False):
+                        helper(size, dtype, mode, device, is_channels_last)
+
+
     def test_interpolate(self):
         def _test_interpolate_helper(in_t, scale_factor, layer):
             out_size = int(math.floor(in_t.shape[-1] * scale_factor))

From abbf8117586912b7ab317d8bb13f27288494aa07 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Mon, 28 Nov 2022 17:34:28 -0800
Subject: [PATCH 1369/1922] Move functorch/_src to torch/_functorch (#88756)

This will be the last disruptive functorch internals change.

Why are we moving these files?
- As a part of rationalizing functorch we are moving the code in
functorch/_src to torch/_functorch
- This is so that we can offer the functorch APIs as native PyTorch APIs
(coming soon) and resolve some internal build issues.

Why are we moving all of these files at once?
- It's better to break developers all at once rather than many times

Test Plan:
- wait for tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88756
Approved by: https://github.com/ezyang
---
 .lintrunner.toml                                  | 10 ++++++++++
 benchmarks/dynamo/common.py                       |  2 +-
 functorch/__init__.py                             | 10 +++++-----
 functorch/_src/__init__.py                        |  5 -----
 functorch/_src/aot_autograd/__init__.py           |  7 +++++++
 functorch/_src/eager_transforms/__init__.py       |  6 ++++++
 functorch/_src/make_functional/__init__.py        |  2 ++
 functorch/_src/vmap/__init__.py                   | 15 +++++++++++++++
 functorch/benchmarks/chrome_trace_parser.py       |  2 +-
 functorch/benchmarks/cse.py                       |  2 +-
 functorch/compile/__init__.py                     | 12 ++++++------
 functorch/experimental/__init__.py                |  4 ++--
 test/dynamo/test_aot_cudagraphs.py                |  6 +++---
 test/dynamo/test_repros.py                        |  6 +++---
 test/functorch/discover_coverage.py               |  2 +-
 test/functorch/test_aotdispatch.py                | 12 ++++++------
 test/functorch/test_eager_transforms.py           |  4 ++--
 test/functorch/test_memory_efficient_fusion.py    |  2 +-
 test/functorch/test_minifier.py                   |  2 +-
 test/functorch/test_ops.py                        |  2 +-
 test/functorch/test_vmap.py                       |  2 +-
 test/inductor/test_torchinductor.py               |  2 +-
 test/test_functionalization.py                    |  2 +-
 torch/_dynamo/debug_utils.py                      |  2 +-
 torch/_dynamo/eval_frame.py                       |  2 +-
 torch/_dynamo/optimizations/training.py           |  4 ++--
 torch/_functorch/__init__.py                      |  5 +++++
 .../_src => torch/_functorch}/aot_autograd.py     |  0
 .../_src => torch/_functorch}/benchmark_utils.py  |  0
 .../_src => torch/_functorch}/compile_utils.py    |  0
 {functorch/_src => torch/_functorch}/compilers.py |  0
 {functorch/_src => torch/_functorch}/config.py    |  0
 .../_src => torch/_functorch}/eager_transforms.py |  0
 .../_src => torch/_functorch}/fx_minifier.py      |  0
 .../_src => torch/_functorch}/make_functional.py  |  0
 .../_functorch}/named_members_polyfill.py         |  0
 .../_src => torch/_functorch}/partitioners.py     |  0
 .../_src => torch/_functorch}/python_key.py       |  0
 .../_src => torch/_functorch}/pytree_hacks.py     |  0
 .../_functorch}/top_operators_github_usage.py     |  0
 {functorch/_src => torch/_functorch}/vmap.py      |  0
 torch/_inductor/compile_fx.py                     |  4 ++--
 42 files changed, 88 insertions(+), 48 deletions(-)
 create mode 100644 functorch/_src/aot_autograd/__init__.py
 create mode 100644 functorch/_src/eager_transforms/__init__.py
 create mode 100644 functorch/_src/make_functional/__init__.py
 create mode 100644 functorch/_src/vmap/__init__.py
 rename {functorch/_src => torch/_functorch}/aot_autograd.py (100%)
 rename {functorch/_src => torch/_functorch}/benchmark_utils.py (100%)
 rename {functorch/_src => torch/_functorch}/compile_utils.py (100%)
 rename {functorch/_src => torch/_functorch}/compilers.py (100%)
 rename {functorch/_src => torch/_functorch}/config.py (100%)
 rename {functorch/_src => torch/_functorch}/eager_transforms.py (100%)
 rename {functorch/_src => torch/_functorch}/fx_minifier.py (100%)
 rename {functorch/_src => torch/_functorch}/make_functional.py (100%)
 rename {functorch/_src => torch/_functorch}/named_members_polyfill.py (100%)
 rename {functorch/_src => torch/_functorch}/partitioners.py (100%)
 rename {functorch/_src => torch/_functorch}/python_key.py (100%)
 rename {functorch/_src => torch/_functorch}/pytree_hacks.py (100%)
 rename {functorch/_src => torch/_functorch}/top_operators_github_usage.py (100%)
 rename {functorch/_src => torch/_functorch}/vmap.py (100%)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index fa7e484fb3e18..a843471c3ee8d 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -101,6 +101,16 @@ exclude_patterns = [
     'torch/csrc/**',
     'torch/_dynamo/**/*.py',
     'torch/_inductor/**/*.py',
+    'torch/_functorch/aot_autograd.py',
+    'torch/_functorch/benchmark_utils.py',
+    'torch/_functorch/compile_utils.py',
+    'torch/_functorch/compilers.py',
+    'torch/_functorch/eager_transforms.py',
+    'torch/_functorch/fx_minifier.py',
+    'torch/_functorch/partitioners.py',
+    'torch/_functorch/make_functional.py',
+    'torch/_functorch/top_operators_github_usage.py',
+    'torch/_functorch/vmap.py',
     'torch/distributed/elastic/agent/server/api.py',
     'torch/testing/_internal/**',
     'torch/distributed/fsdp/fully_sharded_data_parallel.py',
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index cabbe3c411617..4b3e38bd81d68 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -23,13 +23,13 @@
 import torch._dynamo
 import torch._dynamo.utils
 import torch.distributed
-from functorch._src.aot_autograd import set_model_name
 from scipy.stats import gmean, ttest_ind
 from torch._dynamo.optimizations import backends
 from torch._dynamo.optimizations.log_args import conv_args_analysis
 from torch._dynamo.profiler import fx_insert_profiling, Profiler
 from torch._dynamo.testing import dummy_fx_compile, format_speedup, same
 from torch._dynamo.utils import clone_inputs
+from torch._functorch.aot_autograd import set_model_name
 from torch._inductor import config as inductor_config
 from torch._inductor.utils import fresh_inductor_cache
 from torch._subclasses.fake_tensor import FakeTensorMode
diff --git a/functorch/__init__.py b/functorch/__init__.py
index 971ce793d7203..c02ae3c443b6f 100644
--- a/functorch/__init__.py
+++ b/functorch/__init__.py
@@ -8,19 +8,19 @@
 
 # Top-level APIs. Please think carefully before adding something to the
 # top-level namespace:
-# - private helper functions should go into functorch._src
+# - private helper functions should go into torch._functorch
 # - very experimental things should go into functorch.experimental
 # - compilation related things should go into functorch.compile
 
 # functorch transforms
-from ._src.vmap import vmap
-from ._src.eager_transforms import (
+from torch._functorch.vmap import vmap
+from torch._functorch.eager_transforms import (
     grad, grad_and_value, vjp, jacrev, jvp, jacfwd, hessian, functionalize
 )
-from ._src.python_key import make_fx
+from torch._functorch.python_key import make_fx
 
 # utilities. Maybe these should go in their own namespace in the future?
-from ._src.make_functional import (
+from torch._functorch.make_functional import (
     make_functional_with_buffers,
     make_functional,
     combine_state_for_ensemble,
diff --git a/functorch/_src/__init__.py b/functorch/_src/__init__.py
index 10a55772ab58b..e69de29bb2d1d 100644
--- a/functorch/_src/__init__.py
+++ b/functorch/_src/__init__.py
@@ -1,5 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/functorch/_src/aot_autograd/__init__.py b/functorch/_src/aot_autograd/__init__.py
new file mode 100644
index 0000000000000..1bbf22fd2743a
--- /dev/null
+++ b/functorch/_src/aot_autograd/__init__.py
@@ -0,0 +1,7 @@
+# This file has moved. It is not public API. If you are not a PyTorch developer
+# and you are relying on the following imports, please file an issue.
+from torch._functorch.aot_autograd import (
+    aot_autograd_decompositions,
+    KNOWN_TYPES,
+    PytreeThunk,
+)
diff --git a/functorch/_src/eager_transforms/__init__.py b/functorch/_src/eager_transforms/__init__.py
new file mode 100644
index 0000000000000..b9d2ebb5ae5a9
--- /dev/null
+++ b/functorch/_src/eager_transforms/__init__.py
@@ -0,0 +1,6 @@
+# This file has moved. It is not public API. If you are not a PyTorch developer
+# and you are relying on the following imports, please file an issue.
+from torch._functorch.eager_transforms import (
+    _unwrap_functional_tensor,
+    _assert_wrapped_functional,
+)
diff --git a/functorch/_src/make_functional/__init__.py b/functorch/_src/make_functional/__init__.py
new file mode 100644
index 0000000000000..507038070931d
--- /dev/null
+++ b/functorch/_src/make_functional/__init__.py
@@ -0,0 +1,2 @@
+# This file has moved. Please update your imports
+from torch._functorch.make_functional import _swap_state
diff --git a/functorch/_src/vmap/__init__.py b/functorch/_src/vmap/__init__.py
new file mode 100644
index 0000000000000..44aebc90e1745
--- /dev/null
+++ b/functorch/_src/vmap/__init__.py
@@ -0,0 +1,15 @@
+# This file has moved. It is not public API. If you are not a PyTorch developer
+# and you are relying on the following imports, please file an issue.
+from torch._functorch.vmap import (
+    _add_batch_dim,
+    _broadcast_to_and_flatten,
+    _get_name,
+    _remove_batch_dim,
+    _validate_and_get_batch_size,
+    Tensor,
+    tree_flatten,
+    tree_unflatten,
+    _process_batched_inputs,
+    _create_batched_inputs,
+    _unwrap_batched,
+)
diff --git a/functorch/benchmarks/chrome_trace_parser.py b/functorch/benchmarks/chrome_trace_parser.py
index 54d2bf1447fb1..ccc8b89544bc3 100755
--- a/functorch/benchmarks/chrome_trace_parser.py
+++ b/functorch/benchmarks/chrome_trace_parser.py
@@ -5,7 +5,7 @@
 import logging
 import pandas as pd
 
-from functorch._src.benchmark_utils import compute_utilization
+from torch._functorch.benchmark_utils import compute_utilization
 
 # process the chrome traces output by the pytorch profiler
 # require the json input file's name to be in format {model_name}_chrome_trace_*.json
diff --git a/functorch/benchmarks/cse.py b/functorch/benchmarks/cse.py
index 028677d6ee259..14cde14eb3085 100644
--- a/functorch/benchmarks/cse.py
+++ b/functorch/benchmarks/cse.py
@@ -3,7 +3,7 @@
 from functorch import make_fx
 from torch.profiler import profile, ProfilerActivity
 
-from functorch._src.compile_utils import fx_graph_cse
+from torch._functorch.compile_utils import fx_graph_cse
 
 def profile_it(f, inp):
     for _ in range(5):
diff --git a/functorch/compile/__init__.py b/functorch/compile/__init__.py
index 12549dceda9fb..569c1b6819bdd 100644
--- a/functorch/compile/__init__.py
+++ b/functorch/compile/__init__.py
@@ -1,6 +1,6 @@
-from .._src.python_key import pythonkey_decompose
-from .._src.fx_minifier import minifier
-from .._src.aot_autograd import (
+from torch._functorch.python_key import pythonkey_decompose
+from torch._functorch.fx_minifier import minifier
+from torch._functorch.aot_autograd import (
     aot_function,
     aot_module,
     compiled_function,
@@ -12,7 +12,7 @@
     make_boxed_func,
     make_boxed_compiler
 )
-from .._src.compilers import (
+from torch._functorch.compilers import (
     ts_compile,
     draw_graph_compile,
     nop,
@@ -22,10 +22,10 @@
     print_compile,
     default_decompositions
 )
-from .._src.partitioners import (
+from torch._functorch.partitioners import (
     min_cut_rematerialization_partition,
     default_partition,
     draw_graph,
     draw_joint_graph,
 )
-from .._src import config
+from torch._functorch import config
diff --git a/functorch/experimental/__init__.py b/functorch/experimental/__init__.py
index 3a4c92ffbe7a5..dde503f93bb62 100644
--- a/functorch/experimental/__init__.py
+++ b/functorch/experimental/__init__.py
@@ -1,5 +1,5 @@
 # PyTorch forward-mode is not mature yet
-from .._src.eager_transforms import hessian, jacfwd, jvp
-from .._src.vmap import chunk_vmap
+from torch._functorch.eager_transforms import hessian, jacfwd, jvp
+from torch._functorch.vmap import chunk_vmap
 from .batch_norm_replacement import replace_all_batch_norm_modules_
 from functorch import functionalize
diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_aot_cudagraphs.py
index 5b2e6eb2f9eac..5299e92a060f7 100644
--- a/test/dynamo/test_aot_cudagraphs.py
+++ b/test/dynamo/test_aot_cudagraphs.py
@@ -104,7 +104,7 @@ def fn(x, y):
         y = torch.randn((), device="cpu")
         fn(x, y)
 
-    @patch("functorch._src.config.use_functionalize", True)
+    @patch("torch._functorch.config.use_functionalize", True)
     def test_mutate_input(self):
         def model(x, y):
             y.add_(3)
@@ -159,7 +159,7 @@ def fn(y):
         y = torch.randn(3, device="cuda:0", requires_grad=True)
         fn(y)
 
-    @patch("functorch._src.config.use_functionalize", True)
+    @patch("torch._functorch.config.use_functionalize", True)
     @patch_all()
     def test_mutated_metadata(self):
         # more tortured example at
@@ -180,7 +180,7 @@ def fn(x):
         x = torch.empty(0, device="cuda:0")
         fn(x)
 
-    @patch("functorch._src.config.use_functionalize", True)
+    @patch("torch._functorch.config.use_functionalize", True)
     @patch_all()
     def test_dead_fill(self):
         def model(x):
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index ec5ea4ac1fb55..af666451590ff 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -11,8 +11,6 @@
 from typing import List
 from unittest.mock import patch
 
-import functorch._src.config
-
 import numpy as np
 import torch
 
@@ -20,6 +18,8 @@
 import torch._dynamo.testing
 import torch._dynamo.utils
 
+import torch._functorch.config
+
 try:
     from test_minifier import requires_cuda
 except ImportError:
@@ -1681,7 +1681,7 @@ def fn(x):
         opt_fn(x)
         self.assertEqual(cnt.frame_count, 1)
 
-    @patch.object(functorch._src.config, "use_dynamic_shapes", True)
+    @patch.object(torch._functorch.config, "use_dynamic_shapes", True)
     def test_bigbird_unsqueeze_inplace(self):
         def fn(reshape_2):
             view_2 = reshape_2.clone()
diff --git a/test/functorch/discover_coverage.py b/test/functorch/discover_coverage.py
index e52f317087b4c..3f4f74b9224de 100644
--- a/test/functorch/discover_coverage.py
+++ b/test/functorch/discover_coverage.py
@@ -3,7 +3,7 @@
 from torch.testing._internal.common_methods_invocations import op_db
 from functorch_additional_op_db import additional_op_db
 from enum import Enum
-import functorch._src.top_operators_github_usage as top_ops
+import torch._functorch.top_operators_github_usage as top_ops
 import pprint
 import unittest
 import enum
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 3a604281ca956..edf9d375e5189 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -22,7 +22,7 @@
     grad, vjp, vmap, jacrev,
     make_fx
 )
-from functorch._src.aot_autograd import aot_module_simplified
+from torch._functorch.aot_autograd import aot_module_simplified
 from functorch.compile import (
     nnc_jit, compiled_function, compiled_module,
     min_cut_rematerialization_partition, aot_function, aot_module,
@@ -991,7 +991,7 @@ def f(a, b, c):
         inp = [torch.randn(5, requires_grad=True) for _ in range(3)]
         f(*inp).sum().backward()
 
-    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
     def test_compilation_context(self, counter):
         def f(x):
             return x.sin().sin()
@@ -1016,8 +1016,8 @@ def f(x, y):
         x = torch.randn(3, 3, requires_grad=True)
         self.verify_aot_autograd(f, [x, x])
 
-    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
-    @patch("functorch._src.config.debug_assert", True)
+    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch("torch._functorch.config.debug_assert", True)
     def test_invalid_dupe(self, counter):
         class F(torch.nn.Module):
             def forward(self, x, y):
@@ -1037,8 +1037,8 @@ def forward(self, x, y):
             """At compilation time, graph 1 was compiled under the assumption that input 1 would be a duplicate of input 0, but at runtime this was not the case.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
         )
 
-    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
-    @patch("functorch._src.config.debug_assert", True)
+    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch("torch._functorch.config.debug_assert", True)
     def test_invalid_requires_grad(self, counter):
         class F(torch.nn.Module):
             def forward(self, x, y):
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index e123da0d9d3c9..e9d0cbfb4f919 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -32,10 +32,10 @@
     jvp, make_functional, make_functional_with_buffers,
     combine_state_for_ensemble, make_fx
 )
-from functorch._src.make_functional import (
+from torch._functorch.make_functional import (
     functional_init, functional_init_with_buffers,
 )
-from functorch._src.eager_transforms import enable_fwd_grad, _slice_argnums
+from torch._functorch.eager_transforms import enable_fwd_grad, _slice_argnums
 from functorch.experimental import functionalize
 from torch._ops import PyOperator
 from torch._functorch.utils import enable_autograd_function
diff --git a/test/functorch/test_memory_efficient_fusion.py b/test/functorch/test_memory_efficient_fusion.py
index b0f18f06b8295..e12da51004504 100644
--- a/test/functorch/test_memory_efficient_fusion.py
+++ b/test/functorch/test_memory_efficient_fusion.py
@@ -6,7 +6,7 @@
 from functorch import make_fx
 from torch.nn import functional as F
 from functorch.compile import memory_efficient_fusion
-from functorch._src.compile_utils import fx_graph_cse
+from torch._functorch.compile_utils import fx_graph_cse
 from torch.testing._internal.common_utils import TestCase, run_tests
 import inspect
 import random
diff --git a/test/functorch/test_minifier.py b/test/functorch/test_minifier.py
index 49af42795592d..7ed13921d9077 100644
--- a/test/functorch/test_minifier.py
+++ b/test/functorch/test_minifier.py
@@ -2,7 +2,7 @@
 
 import torch
 from functorch.compile import minifier
-from functorch._src.compile_utils import get_placeholders, get_outputs
+from torch._functorch.compile_utils import get_placeholders, get_outputs
 from functorch import make_fx
 from torch.testing._internal.common_utils import TestCase, run_tests
 
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index c0ae683cdfbf7..93e87e7be54a4 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -41,7 +41,7 @@
 from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map
 from functorch import grad, vjp, vmap, jacrev, jacfwd
 import torch.autograd.forward_ad as fwAD
-from functorch._src.eager_transforms import _as_tuple, jvp
+from torch._functorch.eager_transforms import _as_tuple, jvp
 
 aten = torch.ops.aten
 
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 4b460560d8a90..dcad523217f3f 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -49,7 +49,7 @@
 from functorch import vmap, grad, grad_and_value, jvp, vjp, jacfwd
 from functorch.experimental import chunk_vmap
 from torch._C._functorch import reshape_dim_into, reshape_dim_outof
-from functorch._src.make_functional import functional_init_with_buffers
+from torch._functorch.make_functional import functional_init_with_buffers
 
 FALLBACK_REGEX = 'There is a performance drop'
 
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 209a5bd0a7a33..0c622e49d105e 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5527,7 +5527,7 @@ def noop_backend(
                 Instead, it transforms the fx graph so that its functions are
                 aten operations. It then saves this graph.
                 """
-                from functorch._src.aot_autograd import Interpreter
+                from torch._functorch.aot_autograd import Interpreter
                 from torch._inductor.decomposition import select_decomp_table
                 from torch._subclasses import FakeTensorMode
 
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index d699c03ed4173..d477c2cc595c3 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -150,7 +150,7 @@ def f(input):
 
         def g(x):
             loss = f(x).sum()
-            from functorch._src.aot_autograd import setup_stacktrace_preservation_hooks
+            from torch._functorch.aot_autograd import setup_stacktrace_preservation_hooks
             import torch.fx.traceback as fx_traceback
             setup_stacktrace_preservation_hooks([loss.grad_fn])
             with fx_traceback.override_stack_trace():
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 6af1e6c4cfdd7..b788ca46245f9 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -502,7 +502,7 @@ def run_fwd_maybe_bwd(gm, args, only_fwd=False):
     """
     Runs a forward and possibly backward iteration for a given mod and args.
     """
-    from functorch._src.aot_autograd import make_boxed_func
+    from torch._functorch.aot_autograd import make_boxed_func
 
     from .testing import collect_results, reduce_to_scalar_loss, requires_bwd_pass
 
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index a04bc72aa6cbb..cc0a1648bf2c0 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -120,7 +120,7 @@ def enable_dynamic(enable: bool = True):
         yield
         return
     with patch("torch._dynamo.config.dynamic_shapes", True), patch(
-        "functorch._src.config.use_dynamic_shapes", True
+        "torch._functorch.config.use_dynamic_shapes", True
     ):
         yield
 
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 76eeedd519ca7..8c12750d47e19 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -6,8 +6,6 @@
 from importlib import import_module
 from typing import Set
 
-from functorch._src.compilers import debug_nop
-
 from functorch.compile import (
     aot_module_simplified,
     min_cut_rematerialization_partition,
@@ -16,6 +14,8 @@
 )
 
 import torch
+
+from torch._functorch.compilers import debug_nop
 from torch.fx import GraphModule
 from torch.fx.passes.backends.cudagraphs import partition_cudagraphs
 from torch.multiprocessing.reductions import StorageWeakRef
diff --git a/torch/_functorch/__init__.py b/torch/_functorch/__init__.py
index e69de29bb2d1d..10a55772ab58b 100644
--- a/torch/_functorch/__init__.py
+++ b/torch/_functorch/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/functorch/_src/aot_autograd.py b/torch/_functorch/aot_autograd.py
similarity index 100%
rename from functorch/_src/aot_autograd.py
rename to torch/_functorch/aot_autograd.py
diff --git a/functorch/_src/benchmark_utils.py b/torch/_functorch/benchmark_utils.py
similarity index 100%
rename from functorch/_src/benchmark_utils.py
rename to torch/_functorch/benchmark_utils.py
diff --git a/functorch/_src/compile_utils.py b/torch/_functorch/compile_utils.py
similarity index 100%
rename from functorch/_src/compile_utils.py
rename to torch/_functorch/compile_utils.py
diff --git a/functorch/_src/compilers.py b/torch/_functorch/compilers.py
similarity index 100%
rename from functorch/_src/compilers.py
rename to torch/_functorch/compilers.py
diff --git a/functorch/_src/config.py b/torch/_functorch/config.py
similarity index 100%
rename from functorch/_src/config.py
rename to torch/_functorch/config.py
diff --git a/functorch/_src/eager_transforms.py b/torch/_functorch/eager_transforms.py
similarity index 100%
rename from functorch/_src/eager_transforms.py
rename to torch/_functorch/eager_transforms.py
diff --git a/functorch/_src/fx_minifier.py b/torch/_functorch/fx_minifier.py
similarity index 100%
rename from functorch/_src/fx_minifier.py
rename to torch/_functorch/fx_minifier.py
diff --git a/functorch/_src/make_functional.py b/torch/_functorch/make_functional.py
similarity index 100%
rename from functorch/_src/make_functional.py
rename to torch/_functorch/make_functional.py
diff --git a/functorch/_src/named_members_polyfill.py b/torch/_functorch/named_members_polyfill.py
similarity index 100%
rename from functorch/_src/named_members_polyfill.py
rename to torch/_functorch/named_members_polyfill.py
diff --git a/functorch/_src/partitioners.py b/torch/_functorch/partitioners.py
similarity index 100%
rename from functorch/_src/partitioners.py
rename to torch/_functorch/partitioners.py
diff --git a/functorch/_src/python_key.py b/torch/_functorch/python_key.py
similarity index 100%
rename from functorch/_src/python_key.py
rename to torch/_functorch/python_key.py
diff --git a/functorch/_src/pytree_hacks.py b/torch/_functorch/pytree_hacks.py
similarity index 100%
rename from functorch/_src/pytree_hacks.py
rename to torch/_functorch/pytree_hacks.py
diff --git a/functorch/_src/top_operators_github_usage.py b/torch/_functorch/top_operators_github_usage.py
similarity index 100%
rename from functorch/_src/top_operators_github_usage.py
rename to torch/_functorch/top_operators_github_usage.py
diff --git a/functorch/_src/vmap.py b/torch/_functorch/vmap.py
similarity index 100%
rename from functorch/_src/vmap.py
rename to torch/_functorch/vmap.py
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 3472f0e2efec1..34cbec8c60716 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -6,10 +6,10 @@
 from typing import List
 
 import functorch
-from functorch._src.aot_autograd import make_boxed_func
 from functorch.compile import min_cut_rematerialization_partition
 
 import torch.fx
+from torch._functorch.aot_autograd import make_boxed_func
 from torch._subclasses.fake_tensor import FakeTensor
 
 from . import config, metrics, overrides
@@ -391,7 +391,7 @@ def bw_compiler(model: torch.fx.GraphModule, example_inputs):
     with overrides.patch_functions():
 
         # TODO: can add logging before/after the call to create_aot_dispatcher_function
-        # in functorch/_src/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
+        # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
         # once torchdynamo is merged into pytorch
         return aot_autograd(
             fw_compiler=fw_compiler,

From b2bb4dbe03810613210e2c007ed693794539f389 Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Mon, 28 Nov 2022 13:14:38 -0800
Subject: [PATCH 1370/1922] [Quant][docs] Fix BackendConfig example in
 docstring/README (#89319)

Summary: The example in the BackendConfig docstring and the README
was not runnable. This fixes a typo (`bias_type` -> `bias_dtype`),
removes the call to an internal helper function, and adds an
additional BackendPatternConfig to make the example BackendConfig
more realistic and useful.

Reviewers: jerryzh168, vkuzo

Subscribers: jerryzh168, vkuzo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89319
Approved by: https://github.com/jerryzh168
---
 .../ao/quantization/backend_config/README.md  | 29 +++++++++++++---
 .../backend_config/backend_config.py          | 34 ++++++++++++++-----
 2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/torch/ao/quantization/backend_config/README.md b/torch/ao/quantization/backend_config/README.md
index 985765e6badc4..5d37fce9ec502 100644
--- a/torch/ao/quantization/backend_config/README.md
+++ b/torch/ao/quantization/backend_config/README.md
@@ -49,15 +49,24 @@ The BackendConfig is comprised of a list of BackendPatternConfigs, each of which
 
 ```
 import torch
-from torch.ao.quantization.backend_config import BackendConfig, BackendPatternConfig, DTypeConfig, ObservationType
-from torch.ao.quantization.fuser_method_mappings import reverse_sequential_wrapper2
+from torch.ao.quantization.backend_config import (
+    BackendConfig,
+    BackendPatternConfig,
+    DTypeConfig,
+    ObservationType,
+)
 
 weighted_int8_dtype_config = DTypeConfig(
     input_dtype=torch.quint8,
     output_dtype=torch.quint8,
     weight_dtype=torch.qint8,
-    bias_type=torch.float)
+    bias_dtype=torch.float)
+
+def fuse_conv2d_relu(is_qat, relu, conv):
+    """Return a fused ConvReLU2d from individual conv and relu modules."""
+    return torch.ao.nn.intrinsic.ConvReLU2d(conv, relu)
 
+# For quantizing Linear
 linear_config = BackendPatternConfig(torch.nn.Linear) \
     .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
     .add_dtype_config(weighted_int8_dtype_config) \
@@ -65,15 +74,25 @@ linear_config = BackendPatternConfig(torch.nn.Linear) \
     .set_qat_module(torch.ao.nn.qat.Linear) \
     .set_reference_quantized_module(torch.ao.nn.quantized.reference.Linear)
 
+# For fusing Conv2d + ReLU into ConvReLU2d
 conv_relu_config = BackendPatternConfig((torch.nn.ReLU, torch.nn.Conv2d)) \
     .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
     .add_dtype_config(weighted_int8_dtype_config) \
     .set_fused_module(torch.ao.nn.intrinsic.ConvReLU2d) \
-    .set_fuser_method(reverse_sequential_wrapper2(torch.ao.nn.intrinsic.ConvReLU2d))
+    .set_fuser_method(fuse_conv2d_relu)
+
+# For quantizing ConvReLU2d
+fused_conv_relu_config = BackendPatternConfig(torch.ao.nn.intrinsic.ConvReLU2d) \
+    .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+    .add_dtype_config(weighted_int8_dtype_config) \
+    .set_root_module(torch.nn.Conv2d) \
+    .set_qat_module(torch.ao.nn.intrinsic.qat.ConvReLU2d) \
+    .set_reference_quantized_module(torch.ao.nn.quantized.reference.Conv2d)
 
 backend_config = BackendConfig("my_backend") \
     .set_backend_pattern_config(linear_config) \
-    .set_backend_pattern_config(conv_relu_config)
+    .set_backend_pattern_config(conv_relu_config) \
+    .set_backend_pattern_config(fused_conv_relu_config)
 ```
 
 ### Observer Insertion
diff --git a/torch/ao/quantization/backend_config/backend_config.py b/torch/ao/quantization/backend_config/backend_config.py
index e8af42ff4b6a9..4b3d4d3aa8130 100644
--- a/torch/ao/quantization/backend_config/backend_config.py
+++ b/torch/ao/quantization/backend_config/backend_config.py
@@ -228,31 +228,49 @@ class BackendConfig:
     Example usage::
 
         import torch
-        from torch.ao.quantization.backend_config import BackendConfig, BackendPatternConfig, DTypeConfig, ObservationType
-        from torch.ao.quantization.fuser_method_mappings import _reverse_sequential_wrapper2
+        from torch.ao.quantization.backend_config import (
+            BackendConfig,
+            BackendPatternConfig,
+            DTypeConfig,
+            ObservationType,
+        )
 
         weighted_int8_dtype_config = DTypeConfig(
             input_dtype=torch.quint8,
             output_dtype=torch.quint8,
             weight_dtype=torch.qint8,
-            bias_type=torch.float)
+            bias_dtype=torch.float)
 
+        def fuse_conv2d_relu(is_qat, relu, conv):
+            return torch.ao.nn.intrinsic.ConvReLU2d(conv, relu)
+
+        # For quantizing Linear
         linear_config = BackendPatternConfig(torch.nn.Linear) \
             .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
             .add_dtype_config(weighted_int8_dtype_config) \
             .set_root_module(torch.nn.Linear) \
-            .set_qat_module(torch.nn.qat.Linear) \
-            .set_reference_quantized_module(torch.nn.quantized._reference.Linear)
+            .set_qat_module(torch.ao.nn.qat.Linear) \
+            .set_reference_quantized_module(torch.ao.nn.quantized.reference.Linear)
 
+        # For fusing Conv2d + ReLU into ConvReLU2d
         conv_relu_config = BackendPatternConfig((torch.nn.ReLU, torch.nn.Conv2d)) \
             .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
             .add_dtype_config(weighted_int8_dtype_config) \
-            .set_fused_module(torch.nn.intrinsic.ConvReLU2d) \
-            .set_fuser_method(_reverse_sequential_wrapper2(torch.nn.intrinsic.ConvReLU2d))
+            .set_fused_module(torch.ao.nn.intrinsic.ConvReLU2d) \
+            .set_fuser_method(fuse_conv2d_relu)
+
+        # For quantizing ConvReLU2d
+        fused_conv_relu_config = BackendPatternConfig(torch.ao.nn.intrinsic.ConvReLU2d) \
+            .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+            .add_dtype_config(weighted_int8_dtype_config) \
+            .set_root_module(torch.nn.Conv2d) \
+            .set_qat_module(torch.ao.nn.intrinsic.qat.ConvReLU2d) \
+            .set_reference_quantized_module(torch.ao.nn.quantized.reference.Conv2d)
 
         backend_config = BackendConfig("my_backend") \
             .set_backend_pattern_config(linear_config) \
-            .set_backend_pattern_config(conv_relu_config)
+            .set_backend_pattern_config(conv_relu_config) \
+            .set_backend_pattern_config(fused_conv_relu_config)
 
     """
     def __init__(self, name: str = ""):

From 565afdc85d6fc45144dbe68d883ba6f7c06d0ff8 Mon Sep 17 00:00:00 2001
From: Ajay Hotchandani <ajayh@meta.com>
Date: Tue, 29 Nov 2022 15:17:58 +0000
Subject: [PATCH 1371/1922] [aarch64] add SLEEF dependency for aten_cpu
 (#89475)

Reviewed By: kimishpatel, dmm-fb

Differential Revision: D41350031

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89475
Approved by: https://github.com/kimishpatel, https://github.com/ezyang
---
 buckbuild.bzl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/buckbuild.bzl b/buckbuild.bzl
index 75c16ba006550..d3d5dda57c41a 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -147,6 +147,7 @@ THIRD_PARTY_LIBS = {
     "rt": ["//xplat/third-party/linker_lib:rt", "//third_party:rt"],
     "ruy": ["//third-party/ruy:ruy_xplat_lib", "//third_party:ruy_lib"],
     "typing-extensions": ["//third-party/typing-extensions:typing-extensions", "//third_party:typing-extensions"],
+    "sleef": ["//third-party/sleef:sleef", "//third_party:sleef"],
 }
 
 def third_party(name):
@@ -1930,7 +1931,12 @@ def define_buck_targets(
                 third_party("glog"),
                 third_party("XNNPACK"),
                 third_party("pocketfft"),
-            ],
+            ] + select({
+                "DEFAULT": [],
+                "ovr_config//runtime:fbcode-arm64": [
+                  third_party("sleef"),
+                ],
+            }),
             compiler_flags = get_aten_compiler_flags(),
             exported_preprocessor_flags = get_aten_preprocessor_flags(),
             exported_deps = [

From c9fb13857c033bb1c1629682ca4779d8c51ce67e Mon Sep 17 00:00:00 2001
From: Brian Hirsh <hirsheybar@fb.com>
Date: Mon, 28 Nov 2022 14:32:47 -0800
Subject: [PATCH 1372/1922] don't run input mutation analysis in dynamo
 (#89760)

Right now we're running the analysis pass and then discarding the result. Instead, we should just stop running the analysis pass

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89760
Approved by: https://github.com/soumith, https://github.com/ezyang
---
 torch/_dynamo/optimizations/training.py | 39 +------------------------
 1 file changed, 1 insertion(+), 38 deletions(-)

diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 8c12750d47e19..a1b22a7f6c313 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -24,7 +24,6 @@
 
 from .. import config, eval_frame
 from ..utils import clone_inputs, count_calls, counters
-from .analysis import has_mutation
 from .backends import BACKENDS
 from .normalize import normalize_ir
 
@@ -107,48 +106,12 @@ def raise_or_warn(reason):
             log.warning(msg)
         return False
 
-    import functorch.compile
-
     # 1) LSTM module (tts_angular) - https://github.com/pytorch/functorch/issues/586
     for submod in gm.modules():
         if submod.__class__.__name__ == "LSTM":
             return raise_or_warn("LSTM")
 
-    # 2) Mutation in the graph
-    mutated = False
-    try:
-        if not torch.is_inference_mode_enabled():
-            if functorch.compile.config.use_functionalize:
-                # There are two problematic classes we still exclude for now with
-                # functionalization:
-                #   - data mutation of inputs (fixed when we stop recording the
-                #   copy_ directly into the graph)
-                #   - metadata mutation of inputs (fixed if we do an extra partition
-                #   to avoid AotAutograd on the mutated inputs, or if we some how
-                #   get custom autograd function to reflect metadata changes to the
-                #   original tensor)
-                mutated = has_mutation(gm, example_inputs, inputs_only=True)
-            else:
-                mutated = has_mutation(gm, example_inputs)
-        else:
-            log.info(
-                "inference_mode enabled. TorchDynamo could not check for mutation."
-            )
-    except NotImplementedError as e:
-        if "SparseTensorImpl" not in str(e):
-            # TODO - TorchDynamo mutation analysis cannot handle sparse tensors.
-            # So, there is a chance that we could call Aot Autograd when it is
-            # unsafe.
-            # The exception is fairly guarded with string check, so any other
-            # mutation analysis bugs will raise exceptions and will be caught.
-            raise e
-        pass
-
-    # TODO: delete the logic for this later.
-    # Now that aot autograd supports aliasing and mutation, we don't need it.
-    # if mutated:
-    # return raise_or_warn("mutation")
-
+    # 2) Mutation in the graphs are now always handled by AOT Autograd.
     return True
 
 
From 0650aa3666cff9c15f45293a5af87eb5555457d3 Mon Sep 17 00:00:00 2001
From: Kshiteej K <kshitijkalambarkar@gmail.com>
Date: Tue, 29 Nov 2022 16:46:20 +0000
Subject: [PATCH 1373/1922] [follow-up] Python Attr Serialization (#88913)

Ref: https://github.com/pytorch/pytorch/pull/81616#issuecomment-1307595402
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88913
Approved by: https://github.com/albanD
---
 test/test_serialization.py |  6 +-----
 torch/_utils.py            |  2 --
 torch/nn/parameter.py      | 14 +++++++++++---
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/test/test_serialization.py b/test/test_serialization.py
index b97c35c46762a..190fb7545fcf0 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -948,11 +948,7 @@ def _test_save_load_attr(t):
 
         t = torch.zeros(3, 3)
         _test_save_load_attr(t)
-        # This should start failing once Parameter
-        # supports saving Python Attribute.
-        err_msg = "'Parameter' object has no attribute"
-        with self.assertRaisesRegex(AttributeError, err_msg):
-            _test_save_load_attr(torch.nn.Parameter(t))
+        _test_save_load_attr(torch.nn.Parameter(t))
 
     def test_weights_only_assert(self):
         class HelloWorld:
diff --git a/torch/_utils.py b/torch/_utils.py
index 1bf3cf96ad1ce..89defb6bf78a5 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -352,8 +352,6 @@ def _rebuild_parameter(data, requires_grad, backward_hooks):
     return param
 
 
-# TODO(kshitij12345): Support serializing nn.Parameter with Python Attributes.
-# NOTE: We are just defining it here now for future use.
 def _rebuild_parameter_with_state(data, requires_grad, backward_hooks, state):
     param = torch.nn.Parameter(data, requires_grad)
     # NB: This line exists only for backwards compatibility; the
diff --git a/torch/nn/parameter.py b/torch/nn/parameter.py
index 68908001238ec..e2100d782c6af 100644
--- a/torch/nn/parameter.py
+++ b/torch/nn/parameter.py
@@ -60,11 +60,19 @@ def __repr__(self):
         return 'Parameter containing:\n' + super(Parameter, self).__repr__()
 
     def __reduce_ex__(self, proto):
-        # TODO(kshitij12345): Support saving Python Attribute
+        state = torch._utils._get_obj_state(self)
+
         # See Note [Don't serialize hooks]
+        hooks = OrderedDict()
+        if not state:
+            return (
+                torch._utils._rebuild_parameter,
+                (self.data, self.requires_grad, hooks)
+            )
+
         return (
-            torch._utils._rebuild_parameter,
-            (self.data, self.requires_grad, OrderedDict())
+            torch._utils._rebuild_parameter_with_state,
+            (self.data, self.requires_grad, hooks, state)
         )
 
     __torch_function__ = _disabled_torch_function_impl

From d54f8da96196d3f6b3553079f097d376444c070e Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 29 Nov 2022 17:17:11 +0000
Subject: [PATCH 1374/1922] Revert "Move functorch/_src to torch/_functorch
 (#88756)"

This reverts commit 52bc5c1cfe098fd4b4b13902b4fea83b455b9773.

Reverted https://github.com/pytorch/pytorch/pull/88756 on behalf of https://github.com/clee2000 due to broke imports in tests https://hud.pytorch.org/pytorch/pytorch/commit/52bc5c1cfe098fd4b4b13902b4fea83b455b9773 https://github.com/pytorch/pytorch/actions/runs/3574742513/jobs/6010814968 probably a landrace
---
 .lintrunner.toml                                  | 10 ----------
 benchmarks/dynamo/common.py                       |  2 +-
 functorch/__init__.py                             | 10 +++++-----
 functorch/_src/__init__.py                        |  5 +++++
 .../_functorch => functorch/_src}/aot_autograd.py |  0
 functorch/_src/aot_autograd/__init__.py           |  7 -------
 .../_src}/benchmark_utils.py                      |  0
 .../_src}/compile_utils.py                        |  0
 {torch/_functorch => functorch/_src}/compilers.py |  0
 {torch/_functorch => functorch/_src}/config.py    |  0
 .../_src}/eager_transforms.py                     |  0
 functorch/_src/eager_transforms/__init__.py       |  6 ------
 .../_functorch => functorch/_src}/fx_minifier.py  |  0
 .../_src}/make_functional.py                      |  0
 functorch/_src/make_functional/__init__.py        |  2 --
 .../_src}/named_members_polyfill.py               |  0
 .../_functorch => functorch/_src}/partitioners.py |  0
 .../_functorch => functorch/_src}/python_key.py   |  0
 .../_functorch => functorch/_src}/pytree_hacks.py |  0
 .../_src}/top_operators_github_usage.py           |  0
 {torch/_functorch => functorch/_src}/vmap.py      |  0
 functorch/_src/vmap/__init__.py                   | 15 ---------------
 functorch/benchmarks/chrome_trace_parser.py       |  2 +-
 functorch/benchmarks/cse.py                       |  2 +-
 functorch/compile/__init__.py                     | 12 ++++++------
 functorch/experimental/__init__.py                |  4 ++--
 test/dynamo/test_aot_cudagraphs.py                |  6 +++---
 test/dynamo/test_repros.py                        |  6 +++---
 test/functorch/discover_coverage.py               |  2 +-
 test/functorch/test_aotdispatch.py                | 12 ++++++------
 test/functorch/test_eager_transforms.py           |  4 ++--
 test/functorch/test_memory_efficient_fusion.py    |  2 +-
 test/functorch/test_minifier.py                   |  2 +-
 test/functorch/test_ops.py                        |  2 +-
 test/functorch/test_vmap.py                       |  2 +-
 test/inductor/test_torchinductor.py               |  2 +-
 test/test_functionalization.py                    |  2 +-
 torch/_dynamo/debug_utils.py                      |  2 +-
 torch/_dynamo/eval_frame.py                       |  2 +-
 torch/_dynamo/optimizations/training.py           |  4 ++--
 torch/_functorch/__init__.py                      |  5 -----
 torch/_inductor/compile_fx.py                     |  4 ++--
 42 files changed, 48 insertions(+), 88 deletions(-)
 rename {torch/_functorch => functorch/_src}/aot_autograd.py (100%)
 delete mode 100644 functorch/_src/aot_autograd/__init__.py
 rename {torch/_functorch => functorch/_src}/benchmark_utils.py (100%)
 rename {torch/_functorch => functorch/_src}/compile_utils.py (100%)
 rename {torch/_functorch => functorch/_src}/compilers.py (100%)
 rename {torch/_functorch => functorch/_src}/config.py (100%)
 rename {torch/_functorch => functorch/_src}/eager_transforms.py (100%)
 delete mode 100644 functorch/_src/eager_transforms/__init__.py
 rename {torch/_functorch => functorch/_src}/fx_minifier.py (100%)
 rename {torch/_functorch => functorch/_src}/make_functional.py (100%)
 delete mode 100644 functorch/_src/make_functional/__init__.py
 rename {torch/_functorch => functorch/_src}/named_members_polyfill.py (100%)
 rename {torch/_functorch => functorch/_src}/partitioners.py (100%)
 rename {torch/_functorch => functorch/_src}/python_key.py (100%)
 rename {torch/_functorch => functorch/_src}/pytree_hacks.py (100%)
 rename {torch/_functorch => functorch/_src}/top_operators_github_usage.py (100%)
 rename {torch/_functorch => functorch/_src}/vmap.py (100%)
 delete mode 100644 functorch/_src/vmap/__init__.py

diff --git a/.lintrunner.toml b/.lintrunner.toml
index a843471c3ee8d..fa7e484fb3e18 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -101,16 +101,6 @@ exclude_patterns = [
     'torch/csrc/**',
     'torch/_dynamo/**/*.py',
     'torch/_inductor/**/*.py',
-    'torch/_functorch/aot_autograd.py',
-    'torch/_functorch/benchmark_utils.py',
-    'torch/_functorch/compile_utils.py',
-    'torch/_functorch/compilers.py',
-    'torch/_functorch/eager_transforms.py',
-    'torch/_functorch/fx_minifier.py',
-    'torch/_functorch/partitioners.py',
-    'torch/_functorch/make_functional.py',
-    'torch/_functorch/top_operators_github_usage.py',
-    'torch/_functorch/vmap.py',
     'torch/distributed/elastic/agent/server/api.py',
     'torch/testing/_internal/**',
     'torch/distributed/fsdp/fully_sharded_data_parallel.py',
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 4b3e38bd81d68..cabbe3c411617 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -23,13 +23,13 @@
 import torch._dynamo
 import torch._dynamo.utils
 import torch.distributed
+from functorch._src.aot_autograd import set_model_name
 from scipy.stats import gmean, ttest_ind
 from torch._dynamo.optimizations import backends
 from torch._dynamo.optimizations.log_args import conv_args_analysis
 from torch._dynamo.profiler import fx_insert_profiling, Profiler
 from torch._dynamo.testing import dummy_fx_compile, format_speedup, same
 from torch._dynamo.utils import clone_inputs
-from torch._functorch.aot_autograd import set_model_name
 from torch._inductor import config as inductor_config
 from torch._inductor.utils import fresh_inductor_cache
 from torch._subclasses.fake_tensor import FakeTensorMode
diff --git a/functorch/__init__.py b/functorch/__init__.py
index c02ae3c443b6f..971ce793d7203 100644
--- a/functorch/__init__.py
+++ b/functorch/__init__.py
@@ -8,19 +8,19 @@
 
 # Top-level APIs. Please think carefully before adding something to the
 # top-level namespace:
-# - private helper functions should go into torch._functorch
+# - private helper functions should go into functorch._src
 # - very experimental things should go into functorch.experimental
 # - compilation related things should go into functorch.compile
 
 # functorch transforms
-from torch._functorch.vmap import vmap
-from torch._functorch.eager_transforms import (
+from ._src.vmap import vmap
+from ._src.eager_transforms import (
     grad, grad_and_value, vjp, jacrev, jvp, jacfwd, hessian, functionalize
 )
-from torch._functorch.python_key import make_fx
+from ._src.python_key import make_fx
 
 # utilities. Maybe these should go in their own namespace in the future?
-from torch._functorch.make_functional import (
+from ._src.make_functional import (
     make_functional_with_buffers,
     make_functional,
     combine_state_for_ensemble,
diff --git a/functorch/_src/__init__.py b/functorch/_src/__init__.py
index e69de29bb2d1d..10a55772ab58b 100644
--- a/functorch/_src/__init__.py
+++ b/functorch/_src/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/torch/_functorch/aot_autograd.py b/functorch/_src/aot_autograd.py
similarity index 100%
rename from torch/_functorch/aot_autograd.py
rename to functorch/_src/aot_autograd.py
diff --git a/functorch/_src/aot_autograd/__init__.py b/functorch/_src/aot_autograd/__init__.py
deleted file mode 100644
index 1bbf22fd2743a..0000000000000
--- a/functorch/_src/aot_autograd/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# This file has moved. It is not public API. If you are not a PyTorch developer
-# and you are relying on the following imports, please file an issue.
-from torch._functorch.aot_autograd import (
-    aot_autograd_decompositions,
-    KNOWN_TYPES,
-    PytreeThunk,
-)
diff --git a/torch/_functorch/benchmark_utils.py b/functorch/_src/benchmark_utils.py
similarity index 100%
rename from torch/_functorch/benchmark_utils.py
rename to functorch/_src/benchmark_utils.py
diff --git a/torch/_functorch/compile_utils.py b/functorch/_src/compile_utils.py
similarity index 100%
rename from torch/_functorch/compile_utils.py
rename to functorch/_src/compile_utils.py
diff --git a/torch/_functorch/compilers.py b/functorch/_src/compilers.py
similarity index 100%
rename from torch/_functorch/compilers.py
rename to functorch/_src/compilers.py
diff --git a/torch/_functorch/config.py b/functorch/_src/config.py
similarity index 100%
rename from torch/_functorch/config.py
rename to functorch/_src/config.py
diff --git a/torch/_functorch/eager_transforms.py b/functorch/_src/eager_transforms.py
similarity index 100%
rename from torch/_functorch/eager_transforms.py
rename to functorch/_src/eager_transforms.py
diff --git a/functorch/_src/eager_transforms/__init__.py b/functorch/_src/eager_transforms/__init__.py
deleted file mode 100644
index b9d2ebb5ae5a9..0000000000000
--- a/functorch/_src/eager_transforms/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# This file has moved. It is not public API. If you are not a PyTorch developer
-# and you are relying on the following imports, please file an issue.
-from torch._functorch.eager_transforms import (
-    _unwrap_functional_tensor,
-    _assert_wrapped_functional,
-)
diff --git a/torch/_functorch/fx_minifier.py b/functorch/_src/fx_minifier.py
similarity index 100%
rename from torch/_functorch/fx_minifier.py
rename to functorch/_src/fx_minifier.py
diff --git a/torch/_functorch/make_functional.py b/functorch/_src/make_functional.py
similarity index 100%
rename from torch/_functorch/make_functional.py
rename to functorch/_src/make_functional.py
diff --git a/functorch/_src/make_functional/__init__.py b/functorch/_src/make_functional/__init__.py
deleted file mode 100644
index 507038070931d..0000000000000
--- a/functorch/_src/make_functional/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# This file has moved. Please update your imports
-from torch._functorch.make_functional import _swap_state
diff --git a/torch/_functorch/named_members_polyfill.py b/functorch/_src/named_members_polyfill.py
similarity index 100%
rename from torch/_functorch/named_members_polyfill.py
rename to functorch/_src/named_members_polyfill.py
diff --git a/torch/_functorch/partitioners.py b/functorch/_src/partitioners.py
similarity index 100%
rename from torch/_functorch/partitioners.py
rename to functorch/_src/partitioners.py
diff --git a/torch/_functorch/python_key.py b/functorch/_src/python_key.py
similarity index 100%
rename from torch/_functorch/python_key.py
rename to functorch/_src/python_key.py
diff --git a/torch/_functorch/pytree_hacks.py b/functorch/_src/pytree_hacks.py
similarity index 100%
rename from torch/_functorch/pytree_hacks.py
rename to functorch/_src/pytree_hacks.py
diff --git a/torch/_functorch/top_operators_github_usage.py b/functorch/_src/top_operators_github_usage.py
similarity index 100%
rename from torch/_functorch/top_operators_github_usage.py
rename to functorch/_src/top_operators_github_usage.py
diff --git a/torch/_functorch/vmap.py b/functorch/_src/vmap.py
similarity index 100%
rename from torch/_functorch/vmap.py
rename to functorch/_src/vmap.py
diff --git a/functorch/_src/vmap/__init__.py b/functorch/_src/vmap/__init__.py
deleted file mode 100644
index 44aebc90e1745..0000000000000
--- a/functorch/_src/vmap/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# This file has moved. It is not public API. If you are not a PyTorch developer
-# and you are relying on the following imports, please file an issue.
-from torch._functorch.vmap import (
-    _add_batch_dim,
-    _broadcast_to_and_flatten,
-    _get_name,
-    _remove_batch_dim,
-    _validate_and_get_batch_size,
-    Tensor,
-    tree_flatten,
-    tree_unflatten,
-    _process_batched_inputs,
-    _create_batched_inputs,
-    _unwrap_batched,
-)
diff --git a/functorch/benchmarks/chrome_trace_parser.py b/functorch/benchmarks/chrome_trace_parser.py
index ccc8b89544bc3..54d2bf1447fb1 100755
--- a/functorch/benchmarks/chrome_trace_parser.py
+++ b/functorch/benchmarks/chrome_trace_parser.py
@@ -5,7 +5,7 @@
 import logging
 import pandas as pd
 
-from torch._functorch.benchmark_utils import compute_utilization
+from functorch._src.benchmark_utils import compute_utilization
 
 # process the chrome traces output by the pytorch profiler
 # require the json input file's name to be in format {model_name}_chrome_trace_*.json
diff --git a/functorch/benchmarks/cse.py b/functorch/benchmarks/cse.py
index 14cde14eb3085..028677d6ee259 100644
--- a/functorch/benchmarks/cse.py
+++ b/functorch/benchmarks/cse.py
@@ -3,7 +3,7 @@
 from functorch import make_fx
 from torch.profiler import profile, ProfilerActivity
 
-from torch._functorch.compile_utils import fx_graph_cse
+from functorch._src.compile_utils import fx_graph_cse
 
 def profile_it(f, inp):
     for _ in range(5):
diff --git a/functorch/compile/__init__.py b/functorch/compile/__init__.py
index 569c1b6819bdd..12549dceda9fb 100644
--- a/functorch/compile/__init__.py
+++ b/functorch/compile/__init__.py
@@ -1,6 +1,6 @@
-from torch._functorch.python_key import pythonkey_decompose
-from torch._functorch.fx_minifier import minifier
-from torch._functorch.aot_autograd import (
+from .._src.python_key import pythonkey_decompose
+from .._src.fx_minifier import minifier
+from .._src.aot_autograd import (
     aot_function,
     aot_module,
     compiled_function,
@@ -12,7 +12,7 @@
     make_boxed_func,
     make_boxed_compiler
 )
-from torch._functorch.compilers import (
+from .._src.compilers import (
     ts_compile,
     draw_graph_compile,
     nop,
@@ -22,10 +22,10 @@
     print_compile,
     default_decompositions
 )
-from torch._functorch.partitioners import (
+from .._src.partitioners import (
     min_cut_rematerialization_partition,
     default_partition,
     draw_graph,
     draw_joint_graph,
 )
-from torch._functorch import config
+from .._src import config
diff --git a/functorch/experimental/__init__.py b/functorch/experimental/__init__.py
index dde503f93bb62..3a4c92ffbe7a5 100644
--- a/functorch/experimental/__init__.py
+++ b/functorch/experimental/__init__.py
@@ -1,5 +1,5 @@
 # PyTorch forward-mode is not mature yet
-from torch._functorch.eager_transforms import hessian, jacfwd, jvp
-from torch._functorch.vmap import chunk_vmap
+from .._src.eager_transforms import hessian, jacfwd, jvp
+from .._src.vmap import chunk_vmap
 from .batch_norm_replacement import replace_all_batch_norm_modules_
 from functorch import functionalize
diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_aot_cudagraphs.py
index 5299e92a060f7..5b2e6eb2f9eac 100644
--- a/test/dynamo/test_aot_cudagraphs.py
+++ b/test/dynamo/test_aot_cudagraphs.py
@@ -104,7 +104,7 @@ def fn(x, y):
         y = torch.randn((), device="cpu")
         fn(x, y)
 
-    @patch("torch._functorch.config.use_functionalize", True)
+    @patch("functorch._src.config.use_functionalize", True)
     def test_mutate_input(self):
         def model(x, y):
             y.add_(3)
@@ -159,7 +159,7 @@ def fn(y):
         y = torch.randn(3, device="cuda:0", requires_grad=True)
         fn(y)
 
-    @patch("torch._functorch.config.use_functionalize", True)
+    @patch("functorch._src.config.use_functionalize", True)
     @patch_all()
     def test_mutated_metadata(self):
         # more tortured example at
@@ -180,7 +180,7 @@ def fn(x):
         x = torch.empty(0, device="cuda:0")
         fn(x)
 
-    @patch("torch._functorch.config.use_functionalize", True)
+    @patch("functorch._src.config.use_functionalize", True)
     @patch_all()
     def test_dead_fill(self):
         def model(x):
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index af666451590ff..ec5ea4ac1fb55 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -11,6 +11,8 @@
 from typing import List
 from unittest.mock import patch
 
+import functorch._src.config
+
 import numpy as np
 import torch
 
@@ -18,8 +20,6 @@
 import torch._dynamo.testing
 import torch._dynamo.utils
 
-import torch._functorch.config
-
 try:
     from test_minifier import requires_cuda
 except ImportError:
@@ -1681,7 +1681,7 @@ def fn(x):
         opt_fn(x)
         self.assertEqual(cnt.frame_count, 1)
 
-    @patch.object(torch._functorch.config, "use_dynamic_shapes", True)
+    @patch.object(functorch._src.config, "use_dynamic_shapes", True)
     def test_bigbird_unsqueeze_inplace(self):
         def fn(reshape_2):
             view_2 = reshape_2.clone()
diff --git a/test/functorch/discover_coverage.py b/test/functorch/discover_coverage.py
index 3f4f74b9224de..e52f317087b4c 100644
--- a/test/functorch/discover_coverage.py
+++ b/test/functorch/discover_coverage.py
@@ -3,7 +3,7 @@
 from torch.testing._internal.common_methods_invocations import op_db
 from functorch_additional_op_db import additional_op_db
 from enum import Enum
-import torch._functorch.top_operators_github_usage as top_ops
+import functorch._src.top_operators_github_usage as top_ops
 import pprint
 import unittest
 import enum
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index edf9d375e5189..3a604281ca956 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -22,7 +22,7 @@
     grad, vjp, vmap, jacrev,
     make_fx
 )
-from torch._functorch.aot_autograd import aot_module_simplified
+from functorch._src.aot_autograd import aot_module_simplified
 from functorch.compile import (
     nnc_jit, compiled_function, compiled_module,
     min_cut_rematerialization_partition, aot_function, aot_module,
@@ -991,7 +991,7 @@ def f(a, b, c):
         inp = [torch.randn(5, requires_grad=True) for _ in range(3)]
         f(*inp).sum().backward()
 
-    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
     def test_compilation_context(self, counter):
         def f(x):
             return x.sin().sin()
@@ -1016,8 +1016,8 @@ def f(x, y):
         x = torch.randn(3, 3, requires_grad=True)
         self.verify_aot_autograd(f, [x, x])
 
-    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
-    @patch("torch._functorch.config.debug_assert", True)
+    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch("functorch._src.config.debug_assert", True)
     def test_invalid_dupe(self, counter):
         class F(torch.nn.Module):
             def forward(self, x, y):
@@ -1037,8 +1037,8 @@ def forward(self, x, y):
             """At compilation time, graph 1 was compiled under the assumption that input 1 would be a duplicate of input 0, but at runtime this was not the case.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
         )
 
-    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
-    @patch("torch._functorch.config.debug_assert", True)
+    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch("functorch._src.config.debug_assert", True)
     def test_invalid_requires_grad(self, counter):
         class F(torch.nn.Module):
             def forward(self, x, y):
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index e9d0cbfb4f919..e123da0d9d3c9 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -32,10 +32,10 @@
     jvp, make_functional, make_functional_with_buffers,
     combine_state_for_ensemble, make_fx
 )
-from torch._functorch.make_functional import (
+from functorch._src.make_functional import (
     functional_init, functional_init_with_buffers,
 )
-from torch._functorch.eager_transforms import enable_fwd_grad, _slice_argnums
+from functorch._src.eager_transforms import enable_fwd_grad, _slice_argnums
 from functorch.experimental import functionalize
 from torch._ops import PyOperator
 from torch._functorch.utils import enable_autograd_function
diff --git a/test/functorch/test_memory_efficient_fusion.py b/test/functorch/test_memory_efficient_fusion.py
index e12da51004504..b0f18f06b8295 100644
--- a/test/functorch/test_memory_efficient_fusion.py
+++ b/test/functorch/test_memory_efficient_fusion.py
@@ -6,7 +6,7 @@
 from functorch import make_fx
 from torch.nn import functional as F
 from functorch.compile import memory_efficient_fusion
-from torch._functorch.compile_utils import fx_graph_cse
+from functorch._src.compile_utils import fx_graph_cse
 from torch.testing._internal.common_utils import TestCase, run_tests
 import inspect
 import random
diff --git a/test/functorch/test_minifier.py b/test/functorch/test_minifier.py
index 7ed13921d9077..49af42795592d 100644
--- a/test/functorch/test_minifier.py
+++ b/test/functorch/test_minifier.py
@@ -2,7 +2,7 @@
 
 import torch
 from functorch.compile import minifier
-from torch._functorch.compile_utils import get_placeholders, get_outputs
+from functorch._src.compile_utils import get_placeholders, get_outputs
 from functorch import make_fx
 from torch.testing._internal.common_utils import TestCase, run_tests
 
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 93e87e7be54a4..c0ae683cdfbf7 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -41,7 +41,7 @@
 from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map
 from functorch import grad, vjp, vmap, jacrev, jacfwd
 import torch.autograd.forward_ad as fwAD
-from torch._functorch.eager_transforms import _as_tuple, jvp
+from functorch._src.eager_transforms import _as_tuple, jvp
 
 aten = torch.ops.aten
 
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index dcad523217f3f..4b460560d8a90 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -49,7 +49,7 @@
 from functorch import vmap, grad, grad_and_value, jvp, vjp, jacfwd
 from functorch.experimental import chunk_vmap
 from torch._C._functorch import reshape_dim_into, reshape_dim_outof
-from torch._functorch.make_functional import functional_init_with_buffers
+from functorch._src.make_functional import functional_init_with_buffers
 
 FALLBACK_REGEX = 'There is a performance drop'
 
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 0c622e49d105e..209a5bd0a7a33 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5527,7 +5527,7 @@ def noop_backend(
                 Instead, it transforms the fx graph so that its functions are
                 aten operations. It then saves this graph.
                 """
-                from torch._functorch.aot_autograd import Interpreter
+                from functorch._src.aot_autograd import Interpreter
                 from torch._inductor.decomposition import select_decomp_table
                 from torch._subclasses import FakeTensorMode
 
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index d477c2cc595c3..d699c03ed4173 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -150,7 +150,7 @@ def f(input):
 
         def g(x):
             loss = f(x).sum()
-            from torch._functorch.aot_autograd import setup_stacktrace_preservation_hooks
+            from functorch._src.aot_autograd import setup_stacktrace_preservation_hooks
             import torch.fx.traceback as fx_traceback
             setup_stacktrace_preservation_hooks([loss.grad_fn])
             with fx_traceback.override_stack_trace():
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index b788ca46245f9..6af1e6c4cfdd7 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -502,7 +502,7 @@ def run_fwd_maybe_bwd(gm, args, only_fwd=False):
     """
     Runs a forward and possibly backward iteration for a given mod and args.
     """
-    from torch._functorch.aot_autograd import make_boxed_func
+    from functorch._src.aot_autograd import make_boxed_func
 
     from .testing import collect_results, reduce_to_scalar_loss, requires_bwd_pass
 
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index cc0a1648bf2c0..a04bc72aa6cbb 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -120,7 +120,7 @@ def enable_dynamic(enable: bool = True):
         yield
         return
     with patch("torch._dynamo.config.dynamic_shapes", True), patch(
-        "torch._functorch.config.use_dynamic_shapes", True
+        "functorch._src.config.use_dynamic_shapes", True
     ):
         yield
 
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index a1b22a7f6c313..7013fcdf3107f 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -6,6 +6,8 @@
 from importlib import import_module
 from typing import Set
 
+from functorch._src.compilers import debug_nop
+
 from functorch.compile import (
     aot_module_simplified,
     min_cut_rematerialization_partition,
@@ -14,8 +16,6 @@
 )
 
 import torch
-
-from torch._functorch.compilers import debug_nop
 from torch.fx import GraphModule
 from torch.fx.passes.backends.cudagraphs import partition_cudagraphs
 from torch.multiprocessing.reductions import StorageWeakRef
diff --git a/torch/_functorch/__init__.py b/torch/_functorch/__init__.py
index 10a55772ab58b..e69de29bb2d1d 100644
--- a/torch/_functorch/__init__.py
+++ b/torch/_functorch/__init__.py
@@ -1,5 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 34cbec8c60716..3472f0e2efec1 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -6,10 +6,10 @@
 from typing import List
 
 import functorch
+from functorch._src.aot_autograd import make_boxed_func
 from functorch.compile import min_cut_rematerialization_partition
 
 import torch.fx
-from torch._functorch.aot_autograd import make_boxed_func
 from torch._subclasses.fake_tensor import FakeTensor
 
 from . import config, metrics, overrides
@@ -391,7 +391,7 @@ def bw_compiler(model: torch.fx.GraphModule, example_inputs):
     with overrides.patch_functions():
 
         # TODO: can add logging before/after the call to create_aot_dispatcher_function
-        # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
+        # in functorch/_src/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
         # once torchdynamo is merged into pytorch
         return aot_autograd(
             fw_compiler=fw_compiler,

From 1171760844eecc7db5bb0a7d1d2ed2390806488c Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Mon, 28 Nov 2022 22:21:37 +0000
Subject: [PATCH 1375/1922] [FSDP] Relax post-backward assert (#89791)

This assert was accidentally made stricter when transitioning from per-FSDP-instance training state to per-handle training state. This PR relaxes it again, which should restore compatibility for some reentrant AC plus FSDP cases.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89791
Approved by: https://github.com/zhaojuanmao
---
 torch/distributed/fsdp/_runtime_utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 71265f68e9428..8cb0ce323179a 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -482,9 +482,14 @@ def _post_backward_hook(
         "FullyShardedDataParallel._post_backward_hook"
     ):
         _assert_in_training_states(state, [TrainingState.FORWARD_BACKWARD])
+        # For multiple applications of reentrant AC across submodules sharing
+        # the same `FlatParameter`, the post-backward hook may run multiple
+        # times in one backward, in which case we permit the state to already
+        # be in `BACKWARD_POST`.
         p_assert(
-            handle._training_state == HandleTrainingState.BACKWARD_PRE,
-            f"Expects `BACKWARD_PRE` state but got {handle._training_state}",
+            handle._training_state
+            in (HandleTrainingState.BACKWARD_PRE, HandleTrainingState.BACKWARD_POST),
+            f"Expects `BACKWARD_PRE` or `BACKWARD_POST` state but got {handle._training_state}",
         )
         handle._training_state = HandleTrainingState.BACKWARD_POST
 

From 7d65e2d3a695c12f227a65c7c18a038155ea2408 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 29 Nov 2022 17:55:43 +0000
Subject: [PATCH 1376/1922] Revert "[GHA] Decrease Windows test timeout to 120
 minutes (#89694)"

This reverts commit faa032c5e58502de6ea461e531109d2acc22e56a.

Reverted https://github.com/pytorch/pytorch/pull/89694 on behalf of https://github.com/clee2000 due to broke periodic b/c they take ~2.5 hrs, also broke mem leak check b/c its slow, should probably look into having this be a parameter
---
 .github/workflows/_win-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index ef77e0055e369..0cabb8ec469aa 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -56,7 +56,7 @@ jobs:
       matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
       fail-fast: false
     runs-on: ${{ matrix.runner }}
-    timeout-minutes: 120
+    timeout-minutes: 300
     steps:
       - name: Enable git symlinks on Windows
         shell: bash

From 4dc42d25db274a86c422e9717a83e466aa328954 Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Mon, 28 Nov 2022 17:02:31 -0800
Subject: [PATCH 1377/1922] [Quant][fx][bc-breaking] Remove
 backend_config_utils.py (#89810)

Summary: Previously under torch/ao/quantization we have
backend_config/utils.py and fx/backend_config_utils.py, which
was confusing. This commit deletes the latter and moves
everything there to more suitable util files.

BC-breaking note: The following public APIs under the
`torch.ao.quantization.fx.backend_config_utils` namespace
are removed in this commit.

```
get_quantize_handler_cls
get_fusion_pattern_to_fuse_handler_cls
get_native_quant_patterns
get_pattern_to_quantize_handlers
```

Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps

Reviewers: jerryzh168, vkuzo

Subscribers: jerryzh168, vkuzo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89810
Approved by: https://github.com/jerryzh168
---
 test/quantization/fx/test_numeric_suite_fx.py |   4 +-
 torch/ao/ns/_numeric_suite_fx.py              |   4 +-
 torch/ao/ns/fx/pattern_utils.py               |   5 +-
 .../quantization/fx/backend_config_utils.py   | 113 ------------------
 torch/ao/quantization/fx/fuse.py              |   8 +-
 torch/ao/quantization/fx/fusion_patterns.py   |  10 ++
 torch/ao/quantization/fx/prepare.py           |   7 +-
 .../quantization/fx/quantization_patterns.py  |  71 ++++++++++-
 8 files changed, 93 insertions(+), 129 deletions(-)
 delete mode 100644 torch/ao/quantization/fx/backend_config_utils.py

diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py
index 7f46cf0a442b3..e5900104aa0a1 100644
--- a/test/quantization/fx/test_numeric_suite_fx.py
+++ b/test/quantization/fx/test_numeric_suite_fx.py
@@ -85,7 +85,7 @@
 )
 from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping
 from torch.ao.quantization.backend_config import get_native_backend_config
-from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers
+from torch.ao.quantization.fx.quantization_patterns import _get_pattern_to_quantize_handlers
 
 
 # Note: these models are not for use outside of this file. While it's good
@@ -299,7 +299,7 @@ def get_all_quant_patterns():
     all_quant_patterns = get_default_quant_patterns()
     # some of the patterns are moved to (native) backend_config_dict so we need to
     # add them back here
-    for pattern, quantize_handler in get_pattern_to_quantize_handlers(get_native_backend_config()).items():
+    for pattern, quantize_handler in _get_pattern_to_quantize_handlers(get_native_backend_config()).items():
         all_quant_patterns[pattern] = quantize_handler
     return all_quant_patterns
 
diff --git a/torch/ao/ns/_numeric_suite_fx.py b/torch/ao/ns/_numeric_suite_fx.py
index 860430c40b9f9..2c563b8c82d66 100644
--- a/torch/ao/ns/_numeric_suite_fx.py
+++ b/torch/ao/ns/_numeric_suite_fx.py
@@ -121,9 +121,9 @@
 )
 from torch.ao.quantization.backend_config.utils import get_fusion_pattern_to_root_node_getter
 from torch.ao.quantization.backend_config import BackendConfig
-from torch.ao.quantization.fx.backend_config_utils import get_pattern_to_quantize_handlers
 from torch.ao.quantization.fx.match_utils import find_matches
 from torch.ao.quantization.fx.qconfig_mapping_utils import generate_node_name_to_qconfig
+from torch.ao.quantization.fx.quantization_patterns import _get_pattern_to_quantize_handlers
 from torch.ao.quantization.qconfig import QConfigAny
 from torch.ao.ns.fx.n_shadows_utils import (
     OutputProp,
@@ -803,7 +803,7 @@ def prepare_n_shadows_model(
     # Find the set of subgraphs in the original graph which we need to
     # consider.
     modules = dict(mt.named_modules(remove_duplicate=False))
-    patterns = get_pattern_to_quantize_handlers(backend_config)
+    patterns = _get_pattern_to_quantize_handlers(backend_config)
     root_node_getter_mapping = \
         get_fusion_pattern_to_root_node_getter(backend_config)
     standalone_module_names: List[str] = []
diff --git a/torch/ao/ns/fx/pattern_utils.py b/torch/ao/ns/fx/pattern_utils.py
index b8e6a0ee4dc11..5bcb65c415ba5 100644
--- a/torch/ao/ns/fx/pattern_utils.py
+++ b/torch/ao/ns/fx/pattern_utils.py
@@ -6,9 +6,10 @@
 from torch.fx import GraphModule
 from torch.fx.graph import Node
 
+from torch.ao.quantization.backend_config import get_native_backend_config
+from torch.ao.quantization.fx.quantization_patterns import _get_pattern_to_quantize_handlers
 from torch.ao.quantization.utils import getattr_from_fqn
 from .ns_types import NSNodeTargetType
-from torch.ao.quantization.fx.backend_config_utils import get_native_quant_patterns
 from torch.ao.quantization import (
     ObserverBase,
     FakeQuantizeBase,
@@ -66,7 +67,7 @@ def get_reversed_fusions() -> List[Tuple[NSFusionType, int]]:
     # * multiple ops: (torch.nn.ReLU, torch.nn.Conv2d)
     # For fusions, we only care about patterns composed of multiple ops.
     # TODO(future PR): allow customizations from default patterns.
-    all_quant_patterns = get_native_quant_patterns()
+    all_quant_patterns = _get_pattern_to_quantize_handlers(get_native_backend_config())
 
     default_base_op_idx = 0
     for quant_pattern, _quant_handler in all_quant_patterns.items():
diff --git a/torch/ao/quantization/fx/backend_config_utils.py b/torch/ao/quantization/fx/backend_config_utils.py
deleted file mode 100644
index 50c6b6a27ede0..0000000000000
--- a/torch/ao/quantization/fx/backend_config_utils.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import torch
-from torch.ao.quantization.fx.pattern_utils import get_default_quant_patterns, sorted_patterns_dict
-from torch.ao.quantization.backend_config import (
-    get_native_backend_config,
-    ObservationType,
-)
-from torch.ao.quantization.utils import (
-    get_combined_dict,
-    Pattern,
-    NodePattern,
-    QuantizerCls,
-)
-
-from ..backend_config import BackendConfig
-from .quantization_patterns import QuantizeHandler
-from .fusion_patterns import DefaultFuseHandler
-
-from typing import Callable, Dict
-
-def get_quantize_handler_cls(
-        observation_type,
-        dtype_configs,
-        num_tensor_args_to_observation_type,
-        input_output_observed):
-
-    class ConfigurableQuantizeHandler(QuantizeHandler):
-        def __init__(
-                self,
-                node_pattern: NodePattern,
-                modules: Dict[str, torch.nn.Module],
-                root_node_getter: Callable = None):
-            super().__init__(node_pattern, modules, root_node_getter)
-            if num_tensor_args_to_observation_type:
-                assert self.num_tensor_args in num_tensor_args_to_observation_type, \
-                    f"Must provide observation_type config for tensor number {self.num_tensor_args}" \
-                    f" in num_tensor_args_to_observation_type for {node_pattern}"
-                self.observation_type = num_tensor_args_to_observation_type[self.num_tensor_args]
-            else:
-                self.observation_type = observation_type
-            self.dtype_configs = dtype_configs
-            self.input_output_observed_ = input_output_observed
-
-        def is_general_tensor_value_op(self) -> bool:
-            return self.observation_type == ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
-
-        # This is temporary, and will be removed soon
-        def input_output_observed(self):
-            return self.input_output_observed_
-
-
-    return ConfigurableQuantizeHandler
-
-def get_pattern_to_quantize_handlers(backend_config: BackendConfig) -> Dict[Pattern, QuantizerCls]:
-    """
-    Note: Quantize handler is just a holder for some check methods like
-    (should_insert_observer_for_output), maybe this can be a enum as well,
-    we can refactor this after we convert the path for fbgemm/qnnpack fully to the
-    new path, this is not exposed to backend developers
-    """
-    pattern_to_quantize_handlers = {}
-    for pattern, config in backend_config.configs.items():
-        observation_type = config.observation_type
-        dtype_configs = config.dtype_configs
-        num_tensor_args_to_observation_type = config._num_tensor_args_to_observation_type
-        input_output_observed = config._input_output_observed
-        if input_output_observed is None:
-            input_output_observed = True
-        pattern_to_quantize_handlers[pattern] = \
-            get_quantize_handler_cls(
-                observation_type,
-                dtype_configs,
-                num_tensor_args_to_observation_type,
-                input_output_observed)
-
-    return pattern_to_quantize_handlers
-
-# TODO: move this to torch/ao/quantization/backend_config/utils.py
-def get_fusion_pattern_to_fuse_handler_cls(
-        backend_config: BackendConfig) -> Dict[Pattern, Callable]:
-    fusion_pattern_to_fuse_handlers: Dict[Pattern, Callable] = {}
-    for pattern, config in backend_config.configs.items():
-        if config.fuser_method is not None:
-            # TODO: is this logic right?
-            fusion_pattern_to_fuse_handlers[pattern] = DefaultFuseHandler
-
-    return fusion_pattern_to_fuse_handlers
-
-# TODO: remove when all uses are changed to backend_config
-def get_native_quant_patterns(additional_quant_patterns: Dict[Pattern, QuantizerCls] = None) -> Dict[Pattern, QuantizerCls]:
-    """
-    Return a map from pattern to quantize handlers based on the default patterns and the native backend_config.
-    The returned map is sorted such that longer patterns will be encountered first when iterating through it.
-    """
-    patterns = get_default_quant_patterns()
-    if additional_quant_patterns is not None:
-        patterns = get_combined_dict(patterns, additional_quant_patterns)
-    # TODO: currently we just extend the quantize handlers generated from
-    # `get_native_backend_config`
-    # in the future we can just assign backend_config when everything is defined
-    for pattern, quantize_handler in get_pattern_to_quantize_handlers(get_native_backend_config()).items():
-        patterns[pattern] = quantize_handler
-    return sorted_patterns_dict(patterns)
-
-get_fusion_pattern_to_fuse_handler_cls.__module__ = "torch.ao.quantization.fx.backend_config_utils"
-get_native_quant_patterns.__module__ = "torch.ao.quantization.fx.backend_config_utils"
-get_pattern_to_quantize_handlers.__module__ = "torch.ao.quantization.fx.backend_config_utils"
-
-__all__ = [
-    "get_quantize_handler_cls",
-    "get_fusion_pattern_to_fuse_handler_cls",
-    "get_native_quant_patterns",
-    "get_pattern_to_quantize_handlers",
-]
diff --git a/torch/ao/quantization/fx/fuse.py b/torch/ao/quantization/fx/fuse.py
index 8a4fcb6d11251..3053b0329c82f 100644
--- a/torch/ao/quantization/fx/fuse.py
+++ b/torch/ao/quantization/fx/fuse.py
@@ -24,11 +24,13 @@
     get_fusion_pattern_to_root_node_getter,
     get_fusion_pattern_to_extra_inputs_getter,
 )
-from .backend_config_utils import get_fusion_pattern_to_fuse_handler_cls
 
 from .custom_config import FuseCustomConfig
 
-from .fusion_patterns import *  # noqa: F401,F403
+from .fusion_patterns import (
+    _get_fusion_pattern_to_fuse_handler_cls,
+    FuseHandler,
+)
 
 from typing import Any, Callable, Dict, List, Tuple, Union
 import warnings
@@ -69,7 +71,7 @@ def fuse(
     if backend_config is None:
         backend_config = get_native_backend_config()
 
-    fusion_pattern_to_fuse_handler_cls = sorted_patterns_dict(get_fusion_pattern_to_fuse_handler_cls(backend_config))
+    fusion_pattern_to_fuse_handler_cls = sorted_patterns_dict(_get_fusion_pattern_to_fuse_handler_cls(backend_config))
     fuser_method_mapping = get_fuser_method_mapping(backend_config)
     fusion_pattern_to_root_node_getter = get_fusion_pattern_to_root_node_getter(backend_config)
     fusion_pattern_to_extra_inputs_getter = get_fusion_pattern_to_extra_inputs_getter(backend_config)
diff --git a/torch/ao/quantization/fx/fusion_patterns.py b/torch/ao/quantization/fx/fusion_patterns.py
index 075a0cfa03315..5ec6f8430feb7 100644
--- a/torch/ao/quantization/fx/fusion_patterns.py
+++ b/torch/ao/quantization/fx/fusion_patterns.py
@@ -1,4 +1,5 @@
 import torch
+from torch.ao.quantization.backend_config import BackendConfig
 from torch.fx.graph import Node, Graph
 from ..utils import _parent_name, NodePattern, Pattern
 from ..fuser_method_mappings import get_fuser_method_new
@@ -108,3 +109,12 @@ def get_matched_types(m):
         args.extend(extra_args)
         node.args = tuple(args)
         return node
+
+def _get_fusion_pattern_to_fuse_handler_cls(
+        backend_config: BackendConfig) -> Dict[Pattern, Callable]:
+    fusion_pattern_to_fuse_handlers: Dict[Pattern, Callable] = {}
+    for pattern, config in backend_config.configs.items():
+        if config.fuser_method is not None:
+            # TODO: is this logic right?
+            fusion_pattern_to_fuse_handlers[pattern] = DefaultFuseHandler
+    return fusion_pattern_to_fuse_handlers
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 932b40e03e0f7..185980816f2cd 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -34,6 +34,7 @@
 )
 
 from .quantization_patterns import (
+    _get_pattern_to_quantize_handlers,
     QuantizeHandler,
 )
 
@@ -98,10 +99,6 @@
     DTypeConfig,
     get_native_backend_config,
 )
-from .backend_config_utils import (
-    get_pattern_to_quantize_handlers,
-)
-
 from .custom_config import (
     PrepareCustomConfig,
     StandaloneModuleConfigEntry,
@@ -1520,7 +1517,7 @@ def prepare(
     pattern_to_quantize_handler: Dict[Pattern, QuantizeHandler] = {}
     if backend_config is None:
         backend_config = get_native_backend_config()
-    pattern_to_quantize_handler = get_pattern_to_quantize_handlers(backend_config)
+    pattern_to_quantize_handler = _get_pattern_to_quantize_handlers(backend_config)
     pattern_to_quantize_handler = sorted_patterns_dict(pattern_to_quantize_handler)
 
     root_node_getter_mapping = \
diff --git a/torch/ao/quantization/fx/quantization_patterns.py b/torch/ao/quantization/fx/quantization_patterns.py
index f8d72de9c96ae..cce588cb536ca 100644
--- a/torch/ao/quantization/fx/quantization_patterns.py
+++ b/torch/ao/quantization/fx/quantization_patterns.py
@@ -6,10 +6,19 @@
 from .utils import (
     all_node_args_have_no_tensors,
 )
-from torch.ao.quantization.utils import NodePattern
+from torch.ao.quantization.backend_config import (
+    BackendConfig,
+    DTypeConfig,
+    ObservationType,
+)
+from torch.ao.quantization.utils import (
+    NodePattern,
+    Pattern,
+    QuantizerCls,
+)
 
 from abc import ABC
-from typing import Callable, Dict
+from typing import Callable, Dict, List, Type
 
 __all__ = [
     "QuantizeHandler",
@@ -101,6 +110,64 @@ def is_custom_module(self):
     def is_standalone_module(self):
         return self.is_standalone_module_
 
+def _get_quantize_handler_cls(
+        observation_type: ObservationType,
+        dtype_configs: List[DTypeConfig],
+        num_tensor_args_to_observation_type: Dict[int, ObservationType],
+        input_output_observed: bool) -> Type[QuantizeHandler]:
+    """
+    Return a configurable QuantizeHandler that matches the given specifications from the backend.
+    """
+
+    class ConfigurableQuantizeHandler(QuantizeHandler):
+        def __init__(
+                self,
+                node_pattern: NodePattern,
+                modules: Dict[str, torch.nn.Module],
+                root_node_getter: Callable = None):
+            super().__init__(node_pattern, modules, root_node_getter)
+            if num_tensor_args_to_observation_type:
+                assert self.num_tensor_args in num_tensor_args_to_observation_type, \
+                    f"Must provide observation_type config for tensor number {self.num_tensor_args}" \
+                    f" in num_tensor_args_to_observation_type for {node_pattern}"
+                self.observation_type = num_tensor_args_to_observation_type[self.num_tensor_args]
+            else:
+                self.observation_type = observation_type
+            self.dtype_configs = dtype_configs
+            self.input_output_observed_ = input_output_observed
+
+        def is_general_tensor_value_op(self) -> bool:
+            return self.observation_type == ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
+
+        # This is temporary, and will be removed soon
+        def input_output_observed(self):
+            return self.input_output_observed_
+
+    return ConfigurableQuantizeHandler
+
+def _get_pattern_to_quantize_handlers(backend_config: BackendConfig) -> Dict[Pattern, QuantizerCls]:
+    """
+    Note: Quantize handler is just a holder for some check methods like
+    (should_insert_observer_for_output), maybe this can be a enum as well,
+    we can refactor this after we convert the path for fbgemm/qnnpack fully to the
+    new path, this is not exposed to backend developers
+    """
+    pattern_to_quantize_handlers = {}
+    for pattern, config in backend_config.configs.items():
+        observation_type = config.observation_type
+        dtype_configs = config.dtype_configs
+        num_tensor_args_to_observation_type = config._num_tensor_args_to_observation_type
+        input_output_observed = config._input_output_observed
+        if input_output_observed is None:
+            input_output_observed = True
+        pattern_to_quantize_handlers[pattern] = \
+            _get_quantize_handler_cls(
+                observation_type,
+                dtype_configs,
+                num_tensor_args_to_observation_type,
+                input_output_observed)
+    return pattern_to_quantize_handlers
+
 # TODO: remove this class, this is still exposed in torch.quantization
 # but we should be able to break bc
 class BinaryOpQuantizeHandler(QuantizeHandler):

From 8477edc683d9ca25acdb31565421a04bad9ddeaa Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Mon, 28 Nov 2022 13:18:14 -0800
Subject: [PATCH 1378/1922] [Quant][docs] Use get_default_qconfig_mapping
 (#87299)

Summary: The recommended way to use QConfigMapping is through
`get_default_qconfig_mapping`. However, the docs still references
usages that use `QConfigMapping().set_global(...)`. This doesn't
actually work well in practice when the model has fixed qparams
ops for example. This commit updates these usages.

Reviewers: vkuzo

Subscribers: vkuzo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87299
Approved by: https://github.com/jerryzh168
---
 docs/source/quantization.rst | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst
index 4b87e8b181555..c55a2a354f15f 100644
--- a/docs/source/quantization.rst
+++ b/docs/source/quantization.rst
@@ -427,7 +427,11 @@ There are multiple quantization types in post training quantization (weight only
 FXPTQ API Example::
 
   import torch
-  from torch.ao.quantization import QConfigMapping
+  from torch.ao.quantization import (
+    get_default_qconfig_mapping,
+    get_default_qat_qconfig_mapping,
+    QConfigMapping,
+  )
   import torch.quantization.quantize_fx as quantize_fx
   import copy
 
@@ -454,7 +458,7 @@ FXPTQ API Example::
   #
 
   model_to_quantize = copy.deepcopy(model_fp)
-  qconfig_mapping = QConfigMapping().set_global(torch.quantization.get_default_qconfig('qnnpack'))
+  qconfig_mapping = get_default_qconfig_mapping("qnnpack")
   model_to_quantize.eval()
   # prepare
   model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)
@@ -467,7 +471,7 @@ FXPTQ API Example::
   #
 
   model_to_quantize = copy.deepcopy(model_fp)
-  qconfig_mapping = QConfigMapping().set_global(torch.quantization.get_default_qat_qconfig('qnnpack'))
+  qconfig_mapping = get_default_qat_qconfig_mapping("qnnpack")
   model_to_quantize.train()
   # prepare
   model_prepared = quantize_fx.prepare_qat_fx(model_to_quantize, qconfig_mapping, example_inputs)

From b229bfbab7c6ebcd49597c627853990b2f9f7921 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Tue, 29 Nov 2022 18:15:15 +0000
Subject: [PATCH 1379/1922] [test_nn] split multihead_attention from test_nn
 (#89748)

Ref: https://github.com/pytorch/pytorch/issues/63085
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89748
Approved by: https://github.com/albanD
---
 test/nn/test_multihead_attention.py | 689 ++++++++++++++++++++++++++++
 test/test_nn.py                     | 648 --------------------------
 2 files changed, 689 insertions(+), 648 deletions(-)
 create mode 100644 test/nn/test_multihead_attention.py

diff --git a/test/nn/test_multihead_attention.py b/test/nn/test_multihead_attention.py
new file mode 100644
index 0000000000000..9c622ffe6e897
--- /dev/null
+++ b/test/nn/test_multihead_attention.py
@@ -0,0 +1,689 @@
+# Owner(s): ["module: nn"]
+import contextlib
+import random
+import unittest
+import unittest.mock as mock
+
+from torch.nn import MultiheadAttention
+from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
+    onlyCUDA
+from torch.testing._internal.common_nn import NNTestCase
+from torch.testing._internal.common_utils import run_tests, \
+    TEST_NUMPY, TEST_WITH_CROSSREF, \
+    parametrize as parametrize_test, instantiate_parametrized_tests
+import torch.nn as nn
+import torch
+
+if TEST_NUMPY:
+    import numpy as np
+
+
+# WARNING: If you add a new top-level test case to this file, you MUST
+# update test/run_test.py to list it, otherwise it will NOT be run in
+# CI.
+
+class TestMultiheadAttentionNN(NNTestCase):
+    _do_cuda_memory_leak_check = True
+    _do_cuda_non_default_stream = True
+
+    @unittest.skipIf(not TEST_NUMPY, "numpy not found")
+    @parametrize_test("average_attn_weights", [True, False])
+    def test_multihead_attention(self, average_attn_weights):
+        def _scaled_dot_attn_ref(Q, K, V, dims, unseen_mask=None, key_padding_mask=None,
+                                 average_attn_weights=average_attn_weights):
+            """ Numpy-based reference implementation of scaled dot attention
+            for testing"""
+
+            QKT = _batchmatmul(
+                Q,
+                np.transpose(K, axes=[0, 1, 3, 2])
+                / np.sqrt(dims[3], dtype=np.float32),  # divide by sqrt(d_head)
+            )
+            b1, b2, s1, s2 = QKT.shape
+            if unseen_mask is not None or key_padding_mask is not None:
+                # assert s1 == s2
+                for i in range(b1):
+                    for j in range(b2):
+                        for m in range(s1):
+                            for n in range(s2):
+                                if unseen_mask is not None and unseen_mask[m][n] == 0:
+                                    QKT[i, j, m, n] = -np.inf
+                                if key_padding_mask is not None and key_padding_mask[i][n]:
+                                    QKT[i, j, m, n] = -np.inf
+
+            reference = _softmax(QKT)
+            ref_attn_weight = reference
+            if average_attn_weights:
+                ref_attn_weight = np.sum(ref_attn_weight, axis=1) / b2
+            reference = _batchmatmul(reference, V)
+            return reference, ref_attn_weight
+
+        def _batchmatmul(a, b):  # batchmatmul over 4 dim matrix
+            """ Numpy-based batch matrix multiply over 4 dim matrix"""
+            assert a.shape[0] == b.shape[0]
+            assert a.shape[1] == b.shape[1]
+            retval = np.zeros(
+                (a.shape[0], a.shape[1], a.shape[2], b.shape[3]), dtype=np.float32
+            )
+            for i in range(a.shape[0]):
+                for j in range(a.shape[1]):
+                    retval[i, j, :, :] = np.matmul(a[i, j, :, :], b[i, j, :, :])
+            return retval
+
+        def _softmax(x):  # softmax over 4 dim matrix
+            """ Numpy-based reference softmax over 4 dim matrix"""
+            np.seterr(invalid='ignore')
+            output = np.zeros(x.shape, dtype=np.float64)
+            for i in range(x.shape[0]):
+                for j in range(x.shape[1]):
+                    for k in range(x.shape[2]):
+                        x_curr = x[i, j, k, :]
+                        e_x = np.exp(x_curr - np.amax(x_curr))
+                        output[i, j, k, :] = e_x / np.sum(e_x)
+            return output
+
+        def _split_heads_ref(X, dims, nheads, d_head):
+            X_split = np.reshape(X, dims[:2] + [nheads, d_head])
+            X_split_transposed = np.transpose(X_split, [0, 2, 1, 3])
+            reference = np.reshape(X_split_transposed, [dims[0], nheads, dims[1], d_head])
+            return reference
+
+        def _combine_heads_ref(X, dims, nheads, d_head):
+            X_transposed = np.transpose(X, [0, 2, 1, 3])
+            reference = np.reshape(X_transposed, dims[:2] + [nheads * d_head])
+            return reference
+
+        def _fc(X, X_weight, X_bias):
+            X_fc_b = X_bias.detach().numpy()
+            X_fc_w = X_weight.detach().numpy()
+            return np.matmul(X, np.transpose(X_fc_w)) + X_fc_b
+
+        def _create_src_lengths_mask(batch_size, src_lengths):
+            """
+            Generate boolean mask to prevent attention beyond the end of source
+            Inputs:
+              batch_size : int
+              src_lengths : [batch_size] of sentence lengths
+            Outputs:
+              [batch_size, max_src_len]
+            """
+            max_srclen = src_lengths.max()
+            src_indices = torch.arange(0, max_srclen).unsqueeze(0).to(src_lengths)
+            src_indices = src_indices.expand(batch_size, max_srclen)
+            src_lengths = src_lengths.unsqueeze(dim=1).expand(batch_size, max_srclen)
+            # returns [batch_size, max_seq_len]
+            return (src_indices < src_lengths).int().detach()
+
+        def _multihead_attn_test_helper(add_key_padding_mask=False, add_bias_kv=False, add_zero_attn=False,
+                                        saved_kv=False, same_embed_dim=False,
+                                        average_attn_weights=average_attn_weights):
+            for _ in range(100):
+                batch_sz, seq_len = [random.randint(2, 10) for r in range(2)]
+                d_head = random.randint(3, 10)
+                nheads = random.randint(2, 5) * 2
+                d_model = d_head * nheads
+                if same_embed_dim:
+                    kv_dim = d_model
+                else:
+                    kv_dim = random.randint(5, 20)
+                dims = [batch_sz, seq_len, kv_dim]
+
+                saved_k = None
+                saved_k_tensor = None
+                saved_v = None
+                saved_v_tensor = None
+                if saved_kv:
+                    saved_k = np.random.rand(batch_sz * nheads, seq_len, d_head)
+                    saved_k_tensor = torch.from_numpy(saved_k).to(torch.get_default_dtype())
+                    saved_v = np.random.rand(batch_sz * nheads, seq_len, d_head)
+                    saved_v_tensor = torch.from_numpy(saved_v).to(torch.get_default_dtype())
+
+                key_padding_mask = None
+                key_padding_mask_tensor = None
+                if add_key_padding_mask:
+                    seq_mask = np.random.randint(0, 2, (1, seq_len))
+                    key_padding_mask = (np.repeat(seq_mask, batch_sz, axis=0) == 1)
+                    key_padding_mask_tensor = torch.from_numpy(key_padding_mask)
+                decoder_state = np.random.rand(batch_sz, d_model)
+                K = np.random.rand(*dims)
+                V = K
+                Q = np.expand_dims(decoder_state, 1)
+                attn_mask = np.random.randint(0, 2, size=(1, seq_len))
+                attn_mask_tensor = torch.from_numpy(attn_mask).float()
+                attn_mask_tensor.masked_fill_(attn_mask_tensor == 0, float('-inf'))
+                attn_mask_tensor.masked_fill_(attn_mask_tensor > 0, float('0.0'))
+                attn_mask_tensor = attn_mask_tensor.double()
+
+                decoder_state_tensor = torch.from_numpy(decoder_state).to(torch.get_default_dtype())
+                source_hid_tensor = torch.from_numpy(K).to(torch.get_default_dtype()).transpose(0, 1)
+
+                multihead_attn_module = MultiheadAttention(d_model, nheads,
+                                                           add_bias_kv=add_bias_kv,
+                                                           add_zero_attn=add_zero_attn,
+                                                           kdim=kv_dim, vdim=kv_dim)
+
+                if add_bias_kv:
+                    bias_k = multihead_attn_module.bias_k.detach().numpy()
+                    bias_v = multihead_attn_module.bias_v.detach().numpy()
+                else:
+                    bias_k = None
+                    bias_v = None
+
+                _Q = decoder_state_tensor.unsqueeze(1).transpose(0, 1)
+                _V = source_hid_tensor
+                _K = source_hid_tensor
+
+                if multihead_attn_module._qkv_same_embed_dim:
+                    result, result_weight = torch.nn.functional.multi_head_attention_forward(
+                        _Q, _K, _V,
+                        d_model, nheads,
+                        multihead_attn_module.in_proj_weight, multihead_attn_module.in_proj_bias,
+                        multihead_attn_module.bias_k, multihead_attn_module.bias_v,
+                        multihead_attn_module.add_zero_attn, multihead_attn_module.dropout,
+                        multihead_attn_module.out_proj.weight, multihead_attn_module.out_proj.bias,
+                        multihead_attn_module.training, key_padding_mask_tensor, True, attn_mask_tensor,
+                        static_k=saved_k_tensor, static_v=saved_v_tensor,
+                        average_attn_weights=average_attn_weights)
+                else:
+                    result, result_weight = torch.nn.functional.multi_head_attention_forward(
+                        _Q, _K, _V,
+                        d_model, nheads,
+                        None, multihead_attn_module.in_proj_bias,
+                        multihead_attn_module.bias_k, multihead_attn_module.bias_v,
+                        multihead_attn_module.add_zero_attn, multihead_attn_module.dropout,
+                        multihead_attn_module.out_proj.weight, multihead_attn_module.out_proj.bias,
+                        multihead_attn_module.training, key_padding_mask_tensor, True, attn_mask_tensor,
+                        True, multihead_attn_module.q_proj_weight,
+                        multihead_attn_module.k_proj_weight, multihead_attn_module.v_proj_weight,
+                        static_k=saved_k_tensor, static_v=saved_v_tensor,
+                        average_attn_weights=average_attn_weights)
+
+                result = result.squeeze(0).detach().numpy()
+
+                if multihead_attn_module._qkv_same_embed_dim:
+                    q_proj_weight = multihead_attn_module.in_proj_weight[:d_model]
+                    k_proj_weight = multihead_attn_module.in_proj_weight[d_model:(d_model * 2)]
+                    v_proj_weight = multihead_attn_module.in_proj_weight[(d_model * 2):]
+                else:
+                    q_proj_weight = multihead_attn_module.q_proj_weight
+                    k_proj_weight = multihead_attn_module.k_proj_weight
+                    v_proj_weight = multihead_attn_module.v_proj_weight
+
+                Q_fc = _fc(Q, q_proj_weight, multihead_attn_module.in_proj_bias[:d_model])
+                K_fc = _fc(K, k_proj_weight, multihead_attn_module.in_proj_bias[d_model:(d_model * 2)])
+                V_fc = _fc(V, v_proj_weight, multihead_attn_module.in_proj_bias[(d_model * 2):])
+
+                if add_bias_kv:
+                    K_fc = np.concatenate((K_fc, np.repeat(bias_k, K_fc.shape[0], axis=0)), axis=1)
+                    V_fc = np.concatenate((V_fc, np.repeat(bias_v, V_fc.shape[0], axis=0)), axis=1)
+                    if attn_mask is not None:
+                        attn_mask = np.concatenate((attn_mask, np.ones([1, 1])), axis=1)
+                    if key_padding_mask is not None:
+                        key_padding_mask = np.concatenate(
+                            (key_padding_mask, np.full((batch_sz, 1), False, dtype=bool)), axis=1)
+                    dims[1] += 1
+                Q_split = _split_heads_ref(
+                    Q_fc, [batch_sz, 1, d_model], nheads, d_head
+                )
+
+                if saved_k is not None:
+                    K_split = np.reshape(saved_k, [dims[0], nheads, dims[1], d_head])
+                else:
+                    K_split = _split_heads_ref(K_fc, dims, nheads, d_head)
+
+                if saved_v is not None:
+                    V_split = np.reshape(saved_v, [dims[0], nheads, dims[1], d_head])
+                else:
+                    V_split = _split_heads_ref(V_fc, dims, nheads, d_head)
+
+                if add_zero_attn:
+                    dims[1] += 1
+                    K_split = np.concatenate(
+                        (K_split, np.zeros([K_split.shape[0], K_split.shape[1], 1, K_split.shape[3]])), axis=2)
+                    V_split = np.concatenate(
+                        (V_split, np.zeros([V_split.shape[0], V_split.shape[1], 1, V_split.shape[3]])), axis=2)
+
+                    if attn_mask is not None:
+                        attn_mask = np.concatenate((attn_mask, np.ones([1, 1])), axis=1)
+
+                    if key_padding_mask is not None:
+                        key_padding_mask = np.concatenate(
+                            (key_padding_mask, np.full((batch_sz, 1), False, dtype=bool)), axis=1)
+                attn_heads, ref_attn_weight = _scaled_dot_attn_ref(
+                    Q=Q_split,
+                    K=K_split,
+                    V=V_split,
+                    dims=Q_split.shape,
+                    unseen_mask=attn_mask,
+                    key_padding_mask=key_padding_mask
+                )
+                combined_attn_heads = _combine_heads_ref(
+                    X=attn_heads, dims=[batch_sz, 1], nheads=nheads, d_head=d_head
+                )
+
+                reference = _fc(combined_attn_heads, multihead_attn_module.out_proj.weight,
+                                multihead_attn_module.out_proj.bias)
+                reference = np.squeeze(reference, axis=1)
+
+                # result = reference
+                self.assertEqual(tuple(result.shape), (batch_sz, d_model))
+                np.testing.assert_allclose(result, reference, atol=1e-5)
+
+                # result_weight = ref_attn_weight
+                result_weight = result_weight.detach().numpy()
+                self.assertEqual(tuple(result_weight.shape), tuple(ref_attn_weight.shape))
+                np.testing.assert_allclose(result_weight, ref_attn_weight, atol=1e-5)
+
+        def test_multihead_attn_add_bias_kv():
+            _multihead_attn_test_helper(add_bias_kv=True)
+
+        def test_multihead_attn_add_zero_attn():
+            _multihead_attn_test_helper(add_zero_attn=True)
+
+        def test_multihead_attn_no_masking():
+            _multihead_attn_test_helper()
+
+        def test_multihead_attn_key_padding_mask():
+            _multihead_attn_test_helper(add_key_padding_mask=True)
+
+        def test_multihead_attn_saved_kv():
+            _multihead_attn_test_helper(saved_kv=True)
+
+        def test_multihead_attn_add_bias_kv_zero_attn():
+            _multihead_attn_test_helper(add_key_padding_mask=True, add_bias_kv=True,
+                                        add_zero_attn=True)
+
+        def test_multihead_attn_all_arguments1():
+            _multihead_attn_test_helper(add_key_padding_mask=True, add_zero_attn=True, saved_kv=True)
+
+        def test_multihead_attn_all_arguments2():
+            _multihead_attn_test_helper(add_key_padding_mask=True, add_bias_kv=True,
+                                        add_zero_attn=True, saved_kv=True)
+
+        def test_multihead_attn_all_arguments3():
+            _multihead_attn_test_helper(add_key_padding_mask=True, add_zero_attn=True,
+                                        saved_kv=True, same_embed_dim=True)
+
+        test_multihead_attn_add_zero_attn()  # Test MultiheadAttention with add_zero_attn
+        test_multihead_attn_add_bias_kv()  # Test MultiheadAttention with add_bias_kv
+        test_multihead_attn_no_masking()   # Test MultiheadAttention without masking
+        test_multihead_attn_key_padding_mask()  # Test MultiheadAttention with src lengths
+        test_multihead_attn_saved_kv()  # Test MultiheadAttention with static kv.
+        test_multihead_attn_add_bias_kv_zero_attn()  # Test MultiheadAttention with bias_kv and zero_attn.
+        test_multihead_attn_all_arguments1()  # Test MultiheadAttention with all the argument.
+        with self.assertRaisesRegex(AssertionError, "bias cannot be added to static key."):
+            test_multihead_attn_all_arguments2()  # Test MultiheadAttention with all the argument.
+        test_multihead_attn_all_arguments3()  # Test MultiheadAttention with all the argument.
+
+    def test_multihead_attn_3d_attn_mask(self):
+        embed_dim = 8
+        num_heads = 4
+        batch_size = 8
+        src_len = 3
+        tgt_len = 2
+
+        query = torch.rand(batch_size, tgt_len, embed_dim)  # [N, T, D]
+        key = torch.rand(batch_size, src_len, embed_dim)  # [N, S, D]
+        value = key  # [N, S, D]
+        attn_mask = torch.randint(0, 2, (batch_size, tgt_len, src_len)).float()  # [N, T, S]
+        attn_mask = attn_mask.masked_fill(attn_mask == 0, float('-inf')).masked_fill(attn_mask == 1, float(0.0))
+
+        mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads)
+
+        # Generate 3D results
+        attn_mask_3d = torch.repeat_interleave(attn_mask, num_heads, dim=0)  # [N * H, T, S]
+        output_3d = mta_model(query.transpose(0, 1), key.transpose(
+            0, 1), value.transpose(0, 1), attn_mask=attn_mask_3d)[0]
+        output_3d = output_3d.transpose(0, 1)  # [N, T, D]
+
+        for i in range(0, batch_size):
+            output_2d = mta_model(query[i].unsqueeze(0).transpose(0, 1),
+                                  key[i].unsqueeze(0).transpose(0, 1),
+                                  value[i].unsqueeze(0).transpose(0, 1),
+                                  attn_mask=attn_mask[i])[0]
+
+            # output_2d in shape of [T, 1, D]
+            self.assertEqual(output_3d[i].unsqueeze(0).transpose(0, 1), output_2d)
+
+    def test_multihead_attn_no_bias(self):
+        embed_dim = 8
+        num_heads = 4
+        mha = torch.nn.MultiheadAttention(embed_dim, num_heads, bias=False)
+
+        # Verify that bias=False applies to both in and out projection layers.
+        self.assertIsNone(mha.in_proj_bias)
+        self.assertIsNone(mha.out_proj.bias)
+
+    def _test_multihead_attn_invalid_shape_impl(self, mha):
+        # Batched (3D) query cases
+        query = torch.randn(4, 4, 4)
+        key = torch.randn(4, 4, 4)
+        value = torch.randn(4, 4, 4)
+
+        msg = "expected `key` and `value` to be 3-D but found 2-D and 3-D tensors respectively"
+        # 3D query, 2D key and 3D value
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, torch.randn(4, 4), value)
+
+        msg = "expected `key` and `value` to be 3-D but found 3-D and 2-D tensors respectively"
+        # 3D query, 3D key and 2D value
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, torch.randn(4, 4))
+
+        msg = "expected `key_padding_mask` to be `None` or 2-D but found 1-D tensor instead"
+        # 3D query, 3D key, 3D value and 1D key_padding_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, key_padding_mask=torch.tensor([False, False, True, True], dtype=torch.bool))
+
+        msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead"
+        # 3D query, 3D key, 3D value and 1D attn_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, attn_mask=torch.tensor([False, False, True, True], dtype=torch.bool))
+
+        # Unbatched (2D) query cases
+        query = torch.randn(4, 4)
+        key = torch.randn(4, 4)
+        value = torch.randn(4, 4)
+
+        msg = "expected `key` and `value` to be 2-D but found 3-D and 2-D tensors respectively"
+        # 2D query, 3D key and 2D value
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, torch.randn(4, 4, 4), value)
+
+        msg = "expected `key` and `value` to be 2-D but found 2-D and 3-D tensors respectively"
+        # 2D query, 3D key and 2D value
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, torch.randn(4, 4, 4))
+
+        msg = "expected `key_padding_mask` to be `None` or 1-D but found 2-D tensor instead"
+        # 2D query, 2D key, 2D value and 1D key_padding_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, key_padding_mask=torch.tensor([[False, False, True, True] * 2], dtype=torch.bool))
+
+        msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead"
+        # 2D query, 2D key, 2D value and 1D attn_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, attn_mask=torch.tensor([False, False, True, True], dtype=torch.bool))
+
+        msg = r"Expected `attn_mask` shape to be \(4, 4, 4\)"
+        # 2D query, 2D key, 2D value and 3D incorrect attn_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, attn_mask=torch.randn(5, 4, 4).bernoulli_().to(torch.bool))
+
+    def test_multihead_attn_invalid_shape(self):
+        mha = torch.nn.MultiheadAttention(4, 4)
+        self._test_multihead_attn_invalid_shape_impl(mha)
+        # Give the test a chance to hit the fast path. (Right now, it
+        # won't, but gating may be less restricted in the future.)
+        with torch.no_grad():
+            self._test_multihead_attn_invalid_shape_impl(mha.eval())
+
+    @torch.no_grad()
+    def test_multihead_attn_fast_path_invalid_shape(self):
+        mha = torch.nn.MultiheadAttention(4, 4, batch_first=True).eval()
+
+        # Batched (3D) query cases
+        query = torch.randn(4, 4, 4)
+        key = torch.randn(4, 4, 4)
+        value = torch.randn(4, 4, 4)
+
+        # Currently, this case will just go to the slow path and get
+        # the usual message because it fails the requirement to be
+        # batched.
+        msg = "expected `key` and `value` to be 3-D but found 2-D and 3-D tensors respectively"
+        # 3D query, 2D key and 3D value
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, torch.randn(3, 3), value, need_weights=False)
+
+        # Currently, this case will just go to the slow path and get
+        # the usual message because it fails the requirement to be
+        # batched.
+        msg = "expected `key` and `value` to be 3-D but found 3-D and 2-D tensors respectively"
+        # 3D query, 3D key and 2D value
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, torch.randn(3, 3), need_weights=False)
+
+        msg = "expected `key_padding_mask` to be `None` or 2-D but found 1-D tensor instead"
+        # 3D query, 3D key, 3D value and 1D key_padding_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, key_padding_mask=torch.tensor(
+                [False, True, True], dtype=torch.bool), need_weights=False)
+
+        msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead"
+        # 3D query, 3D key, 3D value and 1D attn_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, attn_mask=torch.tensor([False, True, True], dtype=torch.bool), need_weights=False)
+
+        # Unbatched (2D) query cases
+        # NOTE: error messages are the same as regular path because the fast path doesn't support 2D.
+        query = torch.randn(4, 4)
+        key = torch.randn(4, 4)
+        value = torch.randn(4, 4)
+
+        msg = "expected `key` and `value` to be 2-D but found 3-D and 2-D tensors respectively"
+        # 2D query, 3D key and 2D value
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, torch.randn(4, 4, 4), value)
+
+        msg = "expected `key` and `value` to be 2-D but found 2-D and 3-D tensors respectively"
+        # 2D query, 3D key and 2D value
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, torch.randn(4, 4, 4))
+
+        msg = "expected `key_padding_mask` to be `None` or 1-D but found 2-D tensor instead"
+        # 2D query, 2D key, 2D value and 1D key_padding_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, key_padding_mask=torch.tensor([[False, False, True, True] * 2], dtype=torch.bool))
+
+        msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead"
+        # 2D query, 2D key, 2D value and 1D attn_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, attn_mask=torch.tensor([False, False, True, True], dtype=torch.bool))
+
+        msg = r"Expected `attn_mask` shape to be \(4, 4, 4\)"
+        # 2D query, 2D key, 2D value and 3D incorrect attn_mask
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(query, key, value, attn_mask=torch.randn(5, 4, 4).bernoulli_().to(torch.bool))
+
+    def test_multihead_attn_nested_tensor_outside_fast_path(self):
+        mha = torch.nn.MultiheadAttention(4, 4, batch_first=True).eval()
+        nt = torch.nested.nested_tensor([torch.randn(4, 4)])
+        # One tested platform (linux-bionic-py3.7-clang) has a torch_function for one
+        # or more of these. Take advantage of that to test the torch_function bailout.
+        has_torch_func = torch.overrides.has_torch_function(
+            (nt, mha.in_proj_weight, mha.in_proj_bias, mha.out_proj.weight, mha.out_proj.bias))
+        if has_torch_func:
+            msg = "MultiheadAttention does not support NestedTensor.*argument has_torch_function"
+        else:
+            msg = ("MultiheadAttention does not support NestedTensor outside of its fast path.*grad is " +
+                   "enabled and.*or biases requires_grad")
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(nt, nt, nt)
+
+        if has_torch_func:
+            # Just give up, they're all going to fail with the same message.
+            return
+
+        with torch.no_grad():
+            mha(nt, nt, nt)
+        with torch.inference_mode():
+            mha(nt, nt, nt)
+        nt = torch.nested.nested_tensor([torch.randn(4, 4, requires_grad=False)])
+        nt.requires_grad = False
+        with self.assertRaisesRegex(AssertionError, msg):
+            mha(nt, nt, nt)
+        mha.in_proj_weight.requires_grad = False
+        mha.in_proj_bias.requires_grad = False
+        mha.out_proj.weight.requires_grad = False
+        mha.out_proj.bias.requires_grad = False
+        mha(nt, nt, nt)
+
+
+class TestMultiheadAttentionNNDeviceType(NNTestCase):
+    def test_multihead_self_attn_two_masks_fast_path(self, device):
+        """
+        Multihead self-attention should give the same result on the fast path (BetterTransformer) as on the slow path
+        when both attention mask (mask type 0) and key padding mask (mask type 1) are provided
+        """
+        with torch.no_grad():
+            embed_dim = 14
+            num_heads = 7
+            batch_size = 8
+            src_len = 5
+
+            query = value = key = torch.rand(batch_size, src_len, embed_dim).to(device)
+            # Create masks of two different types
+            attn_mask = torch.randint(0, 2, (src_len, src_len)).bool().to(device)
+            key_padding_mask = torch.randint(0, 2, (batch_size, src_len)).bool().to(device)
+
+            # We'll need expanded versions of the masks for masking out the outputs below
+            attn_mask_expanded = attn_mask.reshape(1, 1, src_len, src_len) \
+                                          .expand(batch_size, num_heads, src_len, src_len)
+            key_padding_mask_expanded = key_padding_mask.reshape(batch_size, 1, 1, src_len) \
+                                                        .expand(batch_size, num_heads, src_len, src_len)
+            merged_mask = attn_mask_expanded.logical_or(key_padding_mask_expanded)
+
+            # Compute attention on the fast path
+            mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads, batch_first=True, device=device)
+            mta_model.training = False
+            result_fast_path, _ = mta_model(query, key, value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
+
+            # Compute attention on the slow path
+            result_ref, _ = torch.nn.functional.multi_head_attention_forward(query.transpose(0, 1),
+                                                                             key.transpose(0, 1),
+                                                                             value.transpose(0, 1),
+                                                                             embed_dim, num_heads,
+                                                                             mta_model.in_proj_weight,
+                                                                             mta_model.in_proj_bias,
+                                                                             mta_model.bias_k, mta_model.bias_v,
+                                                                             mta_model.add_zero_attn,
+                                                                             mta_model.dropout,
+                                                                             mta_model.out_proj.weight,
+                                                                             mta_model.out_proj.bias,
+                                                                             training=mta_model.training,
+                                                                             key_padding_mask=key_padding_mask,
+                                                                             need_weights=False,
+                                                                             attn_mask=attn_mask,
+                                                                             use_separate_proj_weight=False,
+                                                                             q_proj_weight=mta_model.q_proj_weight,
+                                                                             k_proj_weight=mta_model.k_proj_weight,
+                                                                             v_proj_weight=mta_model.v_proj_weight,
+                                                                             average_attn_weights=False,
+                                                                             )
+            result_ref = result_ref.transpose(0, 1)  # Convert to batch-first
+
+            # Rows which are completely masked out are nan, we need to exclude them from comparison
+            mask_out = merged_mask[:, 0, :, :].all(-1, keepdim=True).expand(batch_size, src_len, embed_dim)
+            result_fast_path_masked = result_fast_path.masked_fill(mask_out, 0)
+            result_ref_masked = result_ref.masked_fill(mask_out, 0)
+
+            self.assertEqual(result_fast_path_masked, result_ref_masked)
+
+    @torch.no_grad()
+    @unittest.skipIf(TEST_WITH_CROSSREF, 'CrossRef turns on TorchFunctionMode, and so disables fastpath.')
+    def test_multihead_self_attn_two_masks_fast_path_mock(self, device):
+        """
+        Multihead self-attention should take fast path when both attention mask (mask type 0)
+        and key padding mask (mask type 1) are provided at the same time on CPU and CUDA
+        """
+        if device not in ['cpu', 'cuda']:
+            self.skipTest("Fastpath only runs on CPU and CUDA.")
+        with torch.autocast(device_type=device, enabled=False):
+            embed_dim = 14
+            num_heads = 7
+            batch_size = 8
+            src_len = 5
+
+            query = value = key = torch.rand(batch_size, src_len, embed_dim).to(device)
+            # Create masks of two different types
+            attn_mask = torch.randint(0, 2, (src_len, src_len)).bool().to(device)
+            key_padding_mask = torch.randint(0, 2, (batch_size, src_len)).bool().to(device)
+
+            with mock.patch('torch._native_multi_head_attention') as fastpath_mock:
+                # Compute attention on the fast path
+                mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads, batch_first=True, device=device).eval()
+                mta_model.training = False
+                mta_model(query, key, value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
+                # If mock was called, fastpath was taken
+                self.assertTrue(fastpath_mock.called)
+
+    @onlyCUDA
+    @dtypes(torch.half, torch.float, torch.double)
+    def test_multihead_attention_dtype(self, device, dtype):
+        embed_dim = 128
+        num_heads = 8
+        sl = 10
+        bs = 8
+        model = nn.MultiheadAttention(embed_dim, num_heads).cuda().to(dtype)
+        q = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype)
+        k = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype)
+        v = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype)
+        out = model(q, k, v)
+        self.assertEqual(q.size(), out[0].size())
+        self.assertEqual(dtype, out[0].dtype)
+
+    @onlyCUDA
+    @dtypes(torch.half, torch.float, torch.double)
+    def test_multihead_attention_dtype_batch_first(self, device, dtype):
+        embed_dim = 128
+        num_heads = 8
+        sl = 10
+        bs = 8
+        # With batch_first=True, we have the possibility of hitting
+        # the native fast path if we call .eval() and enable inference
+        # mode. Test both paths.
+        for training in (True, False):
+            model = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True).cuda().to(dtype)
+            if not training:
+                model = model.eval()
+                cm = torch.no_grad()
+            else:
+                cm = contextlib.nullcontext()
+            with cm:
+                q = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype)
+                k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype)
+                v = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype)
+                # fast path currently doesn't support weights
+                out = model(q, k, v, need_weights=False)
+                self.assertEqual(q.size(), out[0].size())
+                self.assertEqual(dtype, out[0].dtype)
+
+    @dtypes(torch.double)
+    @torch.no_grad()
+    def test_multihead_attn_fast_path_query_and_bias_have_different_dtypes(self, device, dtype):
+        mha = torch.nn.MultiheadAttention(4, 4, batch_first=True, dtype=dtype, device=device).eval()
+        mha.in_proj_bias = torch.nn.Parameter(mha.in_proj_bias.to(torch.half).to(device))
+        query = torch.randn(4, 4, 4, dtype=dtype, device=device)
+        mha(query, query, query)
+
+    @dtypes(torch.double)
+    @torch.no_grad()
+    def test_multihead_attn_fast_path_small_test(self, device, dtype):
+        mha = torch.nn.MultiheadAttention(4, 4, batch_first=True, dtype=dtype, device=device).eval()
+        query = torch.randn(4, 4, 4, dtype=dtype, device=device)
+        mha(query, query, query)
+
+    @dtypes(torch.double)
+    @torch.no_grad()
+    def test_multihead_attn_in_proj_bias_none(self, device, dtype):
+        mha = torch.nn.MultiheadAttention(2, 2, bias=False, dtype=dtype, device=device)
+        query = torch.rand(2, 2, 2, dtype=dtype, device=device)
+        mha(query, query, query)
+
+    @dtypes(torch.double)
+    @torch.no_grad()
+    def test_multihead_attn_in_proj_weight_none(self, device, dtype):
+        # Setting kdim == vdim == 2 means that vdim != embed_dim
+        # will cause the logic to use per-input project weights, thereby
+        # forcing self.in_proj_weight = None
+        mha = torch.nn.MultiheadAttention(4, 4, vdim=2, kdim=2, dtype=dtype, device=device)
+        query = torch.rand(4, 4, 4, dtype=dtype, device=device)
+        key = torch.rand(4, 4, 2, dtype=dtype, device=device)
+        mha(query, key, key)
+
+
+instantiate_device_type_tests(TestMultiheadAttentionNNDeviceType, globals())
+instantiate_parametrized_tests(TestMultiheadAttentionNN)
+
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_nn.py b/test/test_nn.py
index 8d108c473994c..a7a12054d579b 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -5,7 +5,6 @@
 import random
 import unittest
 import io
-import unittest.mock as mock
 import itertools
 import warnings
 import pickle
@@ -47,7 +46,6 @@
     dtypesIfCUDA, precisionOverride, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
     skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, \
     onlyNativeDeviceTypes, deviceCountAtLeast, largeTensorTest, expectedFailureMeta, skipMeta, get_all_device_types
-from torch.nn import MultiheadAttention
 
 from hypothesis import given
 import torch.testing._internal.hypothesis_utils as hu
@@ -2236,491 +2234,6 @@ def test_nested_tensor_from_mask_error(self):
         mask[0, 2] = False
         self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask))
 
-    @unittest.skipIf(not TEST_NUMPY, "numpy not found")
-    @parametrize_test("average_attn_weights", [True, False])
-    def test_multihead_attention(self, average_attn_weights):
-        def _scaled_dot_attn_ref(Q, K, V, dims, unseen_mask=None, key_padding_mask=None,
-                                 average_attn_weights=average_attn_weights):
-            """ Numpy-based reference implementation of scaled dot attention
-            for testing"""
-
-            QKT = _batchmatmul(
-                Q,
-                np.transpose(K, axes=[0, 1, 3, 2])
-                / np.sqrt(dims[3], dtype=np.float32),  # divide by sqrt(d_head)
-            )
-            b1, b2, s1, s2 = QKT.shape
-            if unseen_mask is not None or key_padding_mask is not None:
-                # assert s1 == s2
-                for i in range(b1):
-                    for j in range(b2):
-                        for m in range(s1):
-                            for n in range(s2):
-                                if unseen_mask is not None and unseen_mask[m][n] == 0:
-                                    QKT[i, j, m, n] = -np.inf
-                                if key_padding_mask is not None and key_padding_mask[i][n]:
-                                    QKT[i, j, m, n] = -np.inf
-
-            reference = _softmax(QKT)
-            ref_attn_weight = reference
-            if average_attn_weights:
-                ref_attn_weight = np.sum(ref_attn_weight, axis=1) / b2
-            reference = _batchmatmul(reference, V)
-            return reference, ref_attn_weight
-
-        def _batchmatmul(a, b):  # batchmatmul over 4 dim matrix
-            """ Numpy-based batch matrix multiply over 4 dim matrix"""
-            assert a.shape[0] == b.shape[0]
-            assert a.shape[1] == b.shape[1]
-            retval = np.zeros(
-                (a.shape[0], a.shape[1], a.shape[2], b.shape[3]), dtype=np.float32
-            )
-            for i in range(a.shape[0]):
-                for j in range(a.shape[1]):
-                    retval[i, j, :, :] = np.matmul(a[i, j, :, :], b[i, j, :, :])
-            return retval
-
-        def _softmax(x):  # softmax over 4 dim matrix
-            """ Numpy-based reference softmax over 4 dim matrix"""
-            np.seterr(invalid='ignore')
-            output = np.zeros(x.shape, dtype=np.float64)
-            for i in range(x.shape[0]):
-                for j in range(x.shape[1]):
-                    for k in range(x.shape[2]):
-                        x_curr = x[i, j, k, :]
-                        e_x = np.exp(x_curr - np.amax(x_curr))
-                        output[i, j, k, :] = e_x / np.sum(e_x)
-            return output
-
-        def _split_heads_ref(X, dims, nheads, d_head):
-            X_split = np.reshape(X, dims[:2] + [nheads, d_head])
-            X_split_transposed = np.transpose(X_split, [0, 2, 1, 3])
-            reference = np.reshape(X_split_transposed, [dims[0], nheads, dims[1], d_head])
-            return reference
-
-        def _combine_heads_ref(X, dims, nheads, d_head):
-            X_transposed = np.transpose(X, [0, 2, 1, 3])
-            reference = np.reshape(X_transposed, dims[:2] + [nheads * d_head])
-            return reference
-
-        def _fc(X, X_weight, X_bias):
-            X_fc_b = X_bias.detach().numpy()
-            X_fc_w = X_weight.detach().numpy()
-            return np.matmul(X, np.transpose(X_fc_w)) + X_fc_b
-
-        def _create_src_lengths_mask(batch_size, src_lengths):
-            """
-            Generate boolean mask to prevent attention beyond the end of source
-            Inputs:
-              batch_size : int
-              src_lengths : [batch_size] of sentence lengths
-            Outputs:
-              [batch_size, max_src_len]
-            """
-            max_srclen = src_lengths.max()
-            src_indices = torch.arange(0, max_srclen).unsqueeze(0).to(src_lengths)
-            src_indices = src_indices.expand(batch_size, max_srclen)
-            src_lengths = src_lengths.unsqueeze(dim=1).expand(batch_size, max_srclen)
-            # returns [batch_size, max_seq_len]
-            return (src_indices < src_lengths).int().detach()
-
-        def _multihead_attn_test_helper(add_key_padding_mask=False, add_bias_kv=False, add_zero_attn=False,
-                                        saved_kv=False, same_embed_dim=False,
-                                        average_attn_weights=average_attn_weights):
-            for _ in range(100):
-                batch_sz, seq_len = [random.randint(2, 10) for r in range(2)]
-                d_head = random.randint(3, 10)
-                nheads = random.randint(2, 5) * 2
-                d_model = d_head * nheads
-                if same_embed_dim:
-                    kv_dim = d_model
-                else:
-                    kv_dim = random.randint(5, 20)
-                dims = [batch_sz, seq_len, kv_dim]
-
-                saved_k = None
-                saved_k_tensor = None
-                saved_v = None
-                saved_v_tensor = None
-                if saved_kv:
-                    saved_k = np.random.rand(batch_sz * nheads, seq_len, d_head)
-                    saved_k_tensor = torch.from_numpy(saved_k).to(torch.get_default_dtype())
-                    saved_v = np.random.rand(batch_sz * nheads, seq_len, d_head)
-                    saved_v_tensor = torch.from_numpy(saved_v).to(torch.get_default_dtype())
-
-                key_padding_mask = None
-                key_padding_mask_tensor = None
-                if add_key_padding_mask:
-                    seq_mask = np.random.randint(0, 2, (1, seq_len))
-                    key_padding_mask = (np.repeat(seq_mask, batch_sz, axis=0) == 1)
-                    key_padding_mask_tensor = torch.from_numpy(key_padding_mask)
-                decoder_state = np.random.rand(batch_sz, d_model)
-                K = np.random.rand(*dims)
-                V = K
-                Q = np.expand_dims(decoder_state, 1)
-                attn_mask = np.random.randint(0 , 2, size=(1, seq_len))
-                attn_mask_tensor = torch.from_numpy(attn_mask).float()
-                attn_mask_tensor.masked_fill_(attn_mask_tensor == 0, float('-inf'))
-                attn_mask_tensor.masked_fill_(attn_mask_tensor > 0, float('0.0'))
-                attn_mask_tensor = attn_mask_tensor.double()
-
-                decoder_state_tensor = torch.from_numpy(decoder_state).to(torch.get_default_dtype())
-                source_hid_tensor = torch.from_numpy(K).to(torch.get_default_dtype()).transpose(0, 1)
-
-                multihead_attn_module = MultiheadAttention(d_model, nheads,
-                                                           add_bias_kv=add_bias_kv,
-                                                           add_zero_attn=add_zero_attn,
-                                                           kdim=kv_dim, vdim=kv_dim)
-
-                if add_bias_kv:
-                    bias_k = multihead_attn_module.bias_k.detach().numpy()
-                    bias_v = multihead_attn_module.bias_v.detach().numpy()
-                else:
-                    bias_k = None
-                    bias_v = None
-
-                _Q = decoder_state_tensor.unsqueeze(1).transpose(0, 1)
-                _V = source_hid_tensor
-                _K = source_hid_tensor
-
-                if multihead_attn_module._qkv_same_embed_dim:
-                    result, result_weight = torch.nn.functional.multi_head_attention_forward(
-                        _Q, _K, _V,
-                        d_model, nheads,
-                        multihead_attn_module.in_proj_weight, multihead_attn_module.in_proj_bias,
-                        multihead_attn_module.bias_k, multihead_attn_module.bias_v,
-                        multihead_attn_module.add_zero_attn, multihead_attn_module.dropout,
-                        multihead_attn_module.out_proj.weight, multihead_attn_module.out_proj.bias,
-                        multihead_attn_module.training, key_padding_mask_tensor, True, attn_mask_tensor,
-                        static_k=saved_k_tensor, static_v=saved_v_tensor,
-                        average_attn_weights=average_attn_weights)
-                else:
-                    result, result_weight = torch.nn.functional.multi_head_attention_forward(
-                        _Q, _K, _V,
-                        d_model, nheads,
-                        None, multihead_attn_module.in_proj_bias,
-                        multihead_attn_module.bias_k, multihead_attn_module.bias_v,
-                        multihead_attn_module.add_zero_attn, multihead_attn_module.dropout,
-                        multihead_attn_module.out_proj.weight, multihead_attn_module.out_proj.bias,
-                        multihead_attn_module.training, key_padding_mask_tensor, True, attn_mask_tensor,
-                        True, multihead_attn_module.q_proj_weight,
-                        multihead_attn_module.k_proj_weight, multihead_attn_module.v_proj_weight,
-                        static_k=saved_k_tensor, static_v=saved_v_tensor,
-                        average_attn_weights=average_attn_weights)
-
-                result = result.squeeze(0).detach().numpy()
-
-                if multihead_attn_module._qkv_same_embed_dim:
-                    q_proj_weight = multihead_attn_module.in_proj_weight[:d_model]
-                    k_proj_weight = multihead_attn_module.in_proj_weight[d_model:(d_model * 2)]
-                    v_proj_weight = multihead_attn_module.in_proj_weight[(d_model * 2):]
-                else:
-                    q_proj_weight = multihead_attn_module.q_proj_weight
-                    k_proj_weight = multihead_attn_module.k_proj_weight
-                    v_proj_weight = multihead_attn_module.v_proj_weight
-
-                Q_fc = _fc(Q, q_proj_weight, multihead_attn_module.in_proj_bias[:d_model])
-                K_fc = _fc(K, k_proj_weight, multihead_attn_module.in_proj_bias[d_model:(d_model * 2)])
-                V_fc = _fc(V, v_proj_weight, multihead_attn_module.in_proj_bias[(d_model * 2):])
-
-                if add_bias_kv:
-                    K_fc = np.concatenate((K_fc, np.repeat(bias_k, K_fc.shape[0], axis=0)), axis=1)
-                    V_fc = np.concatenate((V_fc, np.repeat(bias_v, V_fc.shape[0], axis=0)), axis=1)
-                    if attn_mask is not None:
-                        attn_mask = np.concatenate((attn_mask, np.ones([1, 1])), axis=1)
-                    if key_padding_mask is not None:
-                        key_padding_mask = np.concatenate((key_padding_mask, np.full((batch_sz, 1), False, dtype=bool)), axis=1)
-                    dims[1] += 1
-                Q_split = _split_heads_ref(
-                    Q_fc, [batch_sz, 1, d_model], nheads, d_head
-                )
-
-                if saved_k is not None:
-                    K_split = np.reshape(saved_k, [dims[0], nheads, dims[1], d_head])
-                else:
-                    K_split = _split_heads_ref(K_fc, dims, nheads, d_head)
-
-                if saved_v is not None:
-                    V_split = np.reshape(saved_v, [dims[0], nheads, dims[1], d_head])
-                else:
-                    V_split = _split_heads_ref(V_fc, dims, nheads, d_head)
-
-                if add_zero_attn:
-                    dims[1] += 1
-                    K_split = np.concatenate((K_split, np.zeros([K_split.shape[0], K_split.shape[1], 1, K_split.shape[3]])), axis=2)
-                    V_split = np.concatenate((V_split, np.zeros([V_split.shape[0], V_split.shape[1], 1, V_split.shape[3]])), axis=2)
-
-                    if attn_mask is not None:
-                        attn_mask = np.concatenate((attn_mask, np.ones([1, 1])), axis=1)
-
-                    if key_padding_mask is not None:
-                        key_padding_mask = np.concatenate((key_padding_mask, np.full((batch_sz, 1), False, dtype=bool)), axis=1)
-                attn_heads, ref_attn_weight = _scaled_dot_attn_ref(
-                    Q=Q_split,
-                    K=K_split,
-                    V=V_split,
-                    dims=Q_split.shape,
-                    unseen_mask=attn_mask,
-                    key_padding_mask=key_padding_mask
-                )
-                combined_attn_heads = _combine_heads_ref(
-                    X=attn_heads, dims=[batch_sz, 1], nheads=nheads, d_head=d_head
-                )
-
-                reference = _fc(combined_attn_heads, multihead_attn_module.out_proj.weight, multihead_attn_module.out_proj.bias)
-                reference = np.squeeze(reference, axis=1)
-
-                # result = reference
-                self.assertEqual(tuple(result.shape), (batch_sz, d_model))
-                np.testing.assert_allclose(result, reference, atol=1e-5)
-
-                # result_weight = ref_attn_weight
-                result_weight = result_weight.detach().numpy()
-                self.assertEqual(tuple(result_weight.shape), tuple(ref_attn_weight.shape))
-                np.testing.assert_allclose(result_weight, ref_attn_weight, atol=1e-5)
-
-        def test_multihead_attn_add_bias_kv():
-            _multihead_attn_test_helper(add_bias_kv=True)
-
-        def test_multihead_attn_add_zero_attn():
-            _multihead_attn_test_helper(add_zero_attn=True)
-
-        def test_multihead_attn_no_masking():
-            _multihead_attn_test_helper()
-
-        def test_multihead_attn_key_padding_mask():
-            _multihead_attn_test_helper(add_key_padding_mask=True)
-
-        def test_multihead_attn_saved_kv():
-            _multihead_attn_test_helper(saved_kv=True)
-
-        def test_multihead_attn_add_bias_kv_zero_attn():
-            _multihead_attn_test_helper(add_key_padding_mask=True, add_bias_kv=True,
-                                        add_zero_attn=True)
-
-        def test_multihead_attn_all_arguments1():
-            _multihead_attn_test_helper(add_key_padding_mask=True, add_zero_attn=True, saved_kv=True)
-
-        def test_multihead_attn_all_arguments2():
-            _multihead_attn_test_helper(add_key_padding_mask=True, add_bias_kv=True,
-                                        add_zero_attn=True, saved_kv=True)
-
-        def test_multihead_attn_all_arguments3():
-            _multihead_attn_test_helper(add_key_padding_mask=True, add_zero_attn=True,
-                                        saved_kv=True, same_embed_dim=True)
-
-        test_multihead_attn_add_zero_attn()  # Test MultiheadAttention with add_zero_attn
-        test_multihead_attn_add_bias_kv()  # Test MultiheadAttention with add_bias_kv
-        test_multihead_attn_no_masking()   # Test MultiheadAttention without masking
-        test_multihead_attn_key_padding_mask()  # Test MultiheadAttention with src lengths
-        test_multihead_attn_saved_kv()  # Test MultiheadAttention with static kv.
-        test_multihead_attn_add_bias_kv_zero_attn()  # Test MultiheadAttention with bias_kv and zero_attn.
-        test_multihead_attn_all_arguments1()  # Test MultiheadAttention with all the argument.
-        with self.assertRaisesRegex(AssertionError, "bias cannot be added to static key."):
-            test_multihead_attn_all_arguments2()  # Test MultiheadAttention with all the argument.
-        test_multihead_attn_all_arguments3()  # Test MultiheadAttention with all the argument.
-
-    def test_multihead_attn_3d_attn_mask(self):
-        embed_dim = 8
-        num_heads = 4
-        batch_size = 8
-        src_len = 3
-        tgt_len = 2
-
-        query = torch.rand(batch_size, tgt_len, embed_dim)  # [N, T, D]
-        key = torch.rand(batch_size, src_len, embed_dim)  # [N, S, D]
-        value = key  # [N, S, D]
-        attn_mask = torch.randint(0, 2, (batch_size, tgt_len, src_len)).float()  # [N, T, S]
-        attn_mask = attn_mask.masked_fill(attn_mask == 0, float('-inf')).masked_fill(attn_mask == 1, float(0.0))
-
-        mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads)
-
-        # Generate 3D results
-        attn_mask_3d = torch.repeat_interleave(attn_mask, num_heads, dim=0)  # [N * H, T, S]
-        output_3d = mta_model(query.transpose(0, 1), key.transpose(0, 1), value.transpose(0, 1), attn_mask=attn_mask_3d)[0]
-        output_3d = output_3d.transpose(0, 1)  # [N, T, D]
-
-        for i in range(0, batch_size):
-            output_2d = mta_model(query[i].unsqueeze(0).transpose(0, 1),
-                                  key[i].unsqueeze(0).transpose(0, 1),
-                                  value[i].unsqueeze(0).transpose(0, 1),
-                                  attn_mask=attn_mask[i])[0]
-
-            # output_2d in shape of [T, 1, D]
-            self.assertEqual(output_3d[i].unsqueeze(0).transpose(0, 1), output_2d)
-
-    def test_multihead_attn_no_bias(self):
-        embed_dim = 8
-        num_heads = 4
-        mha = torch.nn.MultiheadAttention(embed_dim, num_heads, bias=False)
-
-        # Verify that bias=False applies to both in and out projection layers.
-        self.assertIsNone(mha.in_proj_bias)
-        self.assertIsNone(mha.out_proj.bias)
-
-    def _test_multihead_attn_invalid_shape_impl(self, mha):
-        # Batched (3D) query cases
-        query = torch.randn(4, 4, 4)
-        key = torch.randn(4, 4, 4)
-        value = torch.randn(4, 4, 4)
-
-        msg = "expected `key` and `value` to be 3-D but found 2-D and 3-D tensors respectively"
-        # 3D query, 2D key and 3D value
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, torch.randn(4, 4), value)
-
-        msg = "expected `key` and `value` to be 3-D but found 3-D and 2-D tensors respectively"
-        # 3D query, 3D key and 2D value
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, torch.randn(4, 4))
-
-        msg = "expected `key_padding_mask` to be `None` or 2-D but found 1-D tensor instead"
-        # 3D query, 3D key, 3D value and 1D key_padding_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, key_padding_mask=torch.tensor([False, False, True, True], dtype=torch.bool))
-
-        msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead"
-        # 3D query, 3D key, 3D value and 1D attn_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, attn_mask=torch.tensor([False, False, True, True], dtype=torch.bool))
-
-        # Unbatched (2D) query cases
-        query = torch.randn(4, 4)
-        key = torch.randn(4, 4)
-        value = torch.randn(4, 4)
-
-        msg = "expected `key` and `value` to be 2-D but found 3-D and 2-D tensors respectively"
-        # 2D query, 3D key and 2D value
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, torch.randn(4, 4, 4), value)
-
-        msg = "expected `key` and `value` to be 2-D but found 2-D and 3-D tensors respectively"
-        # 2D query, 3D key and 2D value
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, torch.randn(4, 4, 4))
-
-        msg = "expected `key_padding_mask` to be `None` or 1-D but found 2-D tensor instead"
-        # 2D query, 2D key, 2D value and 1D key_padding_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, key_padding_mask=torch.tensor([[False, False, True, True] * 2], dtype=torch.bool))
-
-        msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead"
-        # 2D query, 2D key, 2D value and 1D attn_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, attn_mask=torch.tensor([False, False, True, True], dtype=torch.bool))
-
-        msg = r"Expected `attn_mask` shape to be \(4, 4, 4\)"
-        # 2D query, 2D key, 2D value and 3D incorrect attn_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, attn_mask=torch.randn(5, 4, 4).bernoulli_().to(torch.bool))
-
-    def test_multihead_attn_invalid_shape(self):
-        mha = torch.nn.MultiheadAttention(4, 4)
-        self._test_multihead_attn_invalid_shape_impl(mha)
-        # Give the test a chance to hit the fast path. (Right now, it
-        # won't, but gating may be less restricted in the future.)
-        with torch.no_grad():
-            self._test_multihead_attn_invalid_shape_impl(mha.eval())
-
-    @torch.no_grad()
-    def test_multihead_attn_fast_path_invalid_shape(self):
-        mha = torch.nn.MultiheadAttention(4, 4, batch_first=True).eval()
-
-        # Batched (3D) query cases
-        query = torch.randn(4, 4, 4)
-        key = torch.randn(4, 4, 4)
-        value = torch.randn(4, 4, 4)
-
-        # Currently, this case will just go to the slow path and get
-        # the usual message because it fails the requirement to be
-        # batched.
-        msg = "expected `key` and `value` to be 3-D but found 2-D and 3-D tensors respectively"
-        # 3D query, 2D key and 3D value
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, torch.randn(3, 3), value, need_weights=False)
-
-        # Currently, this case will just go to the slow path and get
-        # the usual message because it fails the requirement to be
-        # batched.
-        msg = "expected `key` and `value` to be 3-D but found 3-D and 2-D tensors respectively"
-        # 3D query, 3D key and 2D value
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, torch.randn(3, 3), need_weights=False)
-
-        msg = "expected `key_padding_mask` to be `None` or 2-D but found 1-D tensor instead"
-        # 3D query, 3D key, 3D value and 1D key_padding_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, key_padding_mask=torch.tensor([False, True, True], dtype=torch.bool), need_weights=False)
-
-        msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead"
-        # 3D query, 3D key, 3D value and 1D attn_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, attn_mask=torch.tensor([False, True, True], dtype=torch.bool), need_weights=False)
-
-        # Unbatched (2D) query cases
-        # NOTE: error messages are the same as regular path because the fast path doesn't support 2D.
-        query = torch.randn(4, 4)
-        key = torch.randn(4, 4)
-        value = torch.randn(4, 4)
-
-        msg = "expected `key` and `value` to be 2-D but found 3-D and 2-D tensors respectively"
-        # 2D query, 3D key and 2D value
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, torch.randn(4, 4, 4), value)
-
-        msg = "expected `key` and `value` to be 2-D but found 2-D and 3-D tensors respectively"
-        # 2D query, 3D key and 2D value
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, torch.randn(4, 4, 4))
-
-        msg = "expected `key_padding_mask` to be `None` or 1-D but found 2-D tensor instead"
-        # 2D query, 2D key, 2D value and 1D key_padding_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, key_padding_mask=torch.tensor([[False, False, True, True] * 2], dtype=torch.bool))
-
-        msg = "expected `attn_mask` to be `None`, 2-D or 3-D but found 1-D tensor instead"
-        # 2D query, 2D key, 2D value and 1D attn_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, attn_mask=torch.tensor([False, False, True, True], dtype=torch.bool))
-
-        msg = r"Expected `attn_mask` shape to be \(4, 4, 4\)"
-        # 2D query, 2D key, 2D value and 3D incorrect attn_mask
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(query, key, value, attn_mask=torch.randn(5, 4, 4).bernoulli_().to(torch.bool))
-
-    def test_multihead_attn_nested_tensor_outside_fast_path(self):
-        mha = torch.nn.MultiheadAttention(4, 4, batch_first=True).eval()
-        nt = torch.nested.nested_tensor([torch.randn(4, 4)])
-        # One tested platform (linux-bionic-py3.7-clang) has a torch_function for one
-        # or more of these. Take advantage of that to test the torch_function bailout.
-        has_torch_func = torch.overrides.has_torch_function(
-            (nt, mha.in_proj_weight, mha.in_proj_bias, mha.out_proj.weight, mha.out_proj.bias))
-        if has_torch_func:
-            msg = "MultiheadAttention does not support NestedTensor.*argument has_torch_function"
-        else:
-            msg = ("MultiheadAttention does not support NestedTensor outside of its fast path.*grad is " +
-                   "enabled and.*or biases requires_grad")
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(nt, nt, nt)
-
-        if has_torch_func:
-            # Just give up, they're all going to fail with the same message.
-            return
-
-        with torch.no_grad():
-            mha(nt, nt, nt)
-        with torch.inference_mode():
-            mha(nt, nt, nt)
-        nt = torch.nested.nested_tensor([torch.randn(4, 4, requires_grad=False)])
-        nt.requires_grad = False
-        with self.assertRaisesRegex(AssertionError, msg):
-            mha(nt, nt, nt)
-        mha.in_proj_weight.requires_grad = False
-        mha.in_proj_bias.requires_grad = False
-        mha.out_proj.weight.requires_grad = False
-        mha.out_proj.bias.requires_grad = False
-        mha(nt, nt, nt)
-
     def test_normalize(self):
         inputs = torch.randn(1, 3, 4, 4, requires_grad=True)
         self.assertTrue(gradcheck(lambda x: F.normalize(x, p=1, dim=-1), (inputs,)))
@@ -9989,93 +9502,6 @@ def softmax_on_device(mask, input, device):
                     cuda_res = softmax_on_device(mask, input, "cuda")
                     self.assertEqual(cpu_res, cuda_res, exact_dtype=True)
 
-    def test_multihead_self_attn_two_masks_fast_path(self, device):
-        """
-        Multihead self-attention should give the same result on the fast path (BetterTransformer) as on the slow path
-        when both attention mask (mask type 0) and key padding mask (mask type 1) are provided
-        """
-        with torch.no_grad():
-            embed_dim = 14
-            num_heads = 7
-            batch_size = 8
-            src_len = 5
-
-            query = value = key = torch.rand(batch_size, src_len, embed_dim).to(device)
-            # Create masks of two different types
-            attn_mask = torch.randint(0, 2, (src_len, src_len)).bool().to(device)
-            key_padding_mask = torch.randint(0, 2, (batch_size, src_len)).bool().to(device)
-
-            # We'll need expanded versions of the masks for masking out the outputs below
-            attn_mask_expanded = attn_mask.reshape(1, 1, src_len, src_len) \
-                                          .expand(batch_size, num_heads, src_len, src_len)
-            key_padding_mask_expanded = key_padding_mask.reshape(batch_size, 1, 1, src_len) \
-                                                        .expand(batch_size, num_heads, src_len, src_len)
-            merged_mask = attn_mask_expanded.logical_or(key_padding_mask_expanded)
-
-            # Compute attention on the fast path
-            mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads, batch_first=True, device=device)
-            mta_model.training = False
-            result_fast_path, _ = mta_model(query, key, value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
-
-            # Compute attention on the slow path
-            result_ref, _ = torch.nn.functional.multi_head_attention_forward(query.transpose(0, 1),
-                                                                             key.transpose(0, 1),
-                                                                             value.transpose(0, 1),
-                                                                             embed_dim, num_heads,
-                                                                             mta_model.in_proj_weight,
-                                                                             mta_model.in_proj_bias,
-                                                                             mta_model.bias_k, mta_model.bias_v,
-                                                                             mta_model.add_zero_attn,
-                                                                             mta_model.dropout,
-                                                                             mta_model.out_proj.weight,
-                                                                             mta_model.out_proj.bias,
-                                                                             training=mta_model.training,
-                                                                             key_padding_mask=key_padding_mask,
-                                                                             need_weights=False,
-                                                                             attn_mask=attn_mask,
-                                                                             use_separate_proj_weight=False,
-                                                                             q_proj_weight=mta_model.q_proj_weight,
-                                                                             k_proj_weight=mta_model.k_proj_weight,
-                                                                             v_proj_weight=mta_model.v_proj_weight,
-                                                                             average_attn_weights=False,
-                                                                             )
-            result_ref = result_ref.transpose(0, 1)  # Convert to batch-first
-
-            # Rows which are completely masked out are nan, we need to exclude them from comparison
-            mask_out = merged_mask[:, 0, :, :].all(-1, keepdim=True).expand(batch_size, src_len, embed_dim)
-            result_fast_path_masked = result_fast_path.masked_fill(mask_out, 0)
-            result_ref_masked = result_ref.masked_fill(mask_out, 0)
-
-            self.assertEqual(result_fast_path_masked, result_ref_masked)
-
-    @torch.no_grad()
-    @unittest.skipIf(TEST_WITH_CROSSREF, 'CrossRef turns on TorchFunctionMode, and so disables fastpath.')
-    def test_multihead_self_attn_two_masks_fast_path_mock(self, device):
-        """
-        Multihead self-attention should take fast path when both attention mask (mask type 0)
-        and key padding mask (mask type 1) are provided at the same time on CPU and CUDA
-        """
-        if device not in ['cpu', 'cuda']:
-            self.skipTest("Fastpath only runs on CPU and CUDA.")
-        with torch.autocast(device_type=device, enabled=False):
-            embed_dim = 14
-            num_heads = 7
-            batch_size = 8
-            src_len = 5
-
-            query = value = key = torch.rand(batch_size, src_len, embed_dim).to(device)
-            # Create masks of two different types
-            attn_mask = torch.randint(0, 2, (src_len, src_len)).bool().to(device)
-            key_padding_mask = torch.randint(0, 2, (batch_size, src_len)).bool().to(device)
-
-            with mock.patch('torch._native_multi_head_attention') as fastpath_mock:
-                # Compute attention on the fast path
-                mta_model = torch.nn.MultiheadAttention(embed_dim, num_heads, batch_first=True, device=device).eval()
-                mta_model.training = False
-                mta_model(query, key, value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
-                # If mock was called, fastpath was taken
-                self.assertTrue(fastpath_mock.called)
-
     def test_masked_softmax(self, device):
         sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]
         for (B, num_heads, L) in sizes:
@@ -10837,47 +10263,6 @@ def test_softmax(self, device, dtype):
         # should be bitwise equal
         self.assertEqual(input.grad, inputf.grad.to(dtype), atol=0, rtol=0)
 
-    @onlyCUDA
-    @dtypes(torch.half, torch.float, torch.double)
-    def test_multihead_attention_dtype(self, device, dtype):
-        embed_dim = 128
-        num_heads = 8
-        sl = 10
-        bs = 8
-        model = nn.MultiheadAttention(embed_dim, num_heads).cuda().to(dtype)
-        q = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype)
-        k = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype)
-        v = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype)
-        out = model(q, k, v)
-        self.assertEqual(q.size(), out[0].size())
-        self.assertEqual(dtype, out[0].dtype)
-
-    @onlyCUDA
-    @dtypes(torch.half, torch.float, torch.double)
-    def test_multihead_attention_dtype_batch_first(self, device, dtype):
-        embed_dim = 128
-        num_heads = 8
-        sl = 10
-        bs = 8
-        # With batch_first=True, we have the possibility of hitting
-        # the native fast path if we call .eval() and enable inference
-        # mode. Test both paths.
-        for training in (True, False):
-            model = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True).cuda().to(dtype)
-            if not training:
-                model = model.eval()
-                cm = torch.no_grad()
-            else:
-                cm = contextlib.nullcontext()
-            with cm:
-                q = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype)
-                k = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype)
-                v = torch.randn(bs, sl, embed_dim, device=device, dtype=dtype)
-                # fast path currently doesn't support weights
-                out = model(q, k, v, need_weights=False)
-                self.assertEqual(q.size(), out[0].size())
-                self.assertEqual(dtype, out[0].dtype)
-
     def _test_batchnorm_grad(self, device, dtype=torch.double):
         bs, n_feat, size_feat = 4, 5, 6
         input = torch.arange(bs * n_feat * size_feat, device=device,
@@ -12398,39 +11783,6 @@ def perm_fn(x):
                 with cm:
                     _test(batch_first=batch_first, training=training, atol=atol, rtol=rtol)
 
-    @dtypes(torch.double)
-    @torch.no_grad()
-    def test_multihead_attn_fast_path_query_and_bias_have_different_dtypes(self, device, dtype):
-        mha = torch.nn.MultiheadAttention(4, 4, batch_first=True, dtype=dtype, device=device).eval()
-        mha.in_proj_bias = torch.nn.Parameter(mha.in_proj_bias.to(torch.half).to(device))
-        query = torch.randn(4, 4, 4, dtype=dtype, device=device)
-        mha(query, query, query)
-
-    @dtypes(torch.double)
-    @torch.no_grad()
-    def test_multihead_attn_fast_path_small_test(self, device, dtype):
-        mha = torch.nn.MultiheadAttention(4, 4, batch_first=True, dtype=dtype, device=device).eval()
-        query = torch.randn(4, 4, 4, dtype=dtype, device=device)
-        mha(query, query, query)
-
-    @dtypes(torch.double)
-    @torch.no_grad()
-    def test_multihead_attn_in_proj_bias_none(self, device, dtype):
-        mha = torch.nn.MultiheadAttention(2, 2, bias=False, dtype=dtype, device=device)
-        query = torch.rand(2, 2, 2, dtype=dtype, device=device)
-        mha(query, query, query)
-
-    @dtypes(torch.double)
-    @torch.no_grad()
-    def test_multihead_attn_in_proj_weight_none(self, device, dtype):
-        # Setting kdim == vdim == 2 means that vdim != embed_dim
-        # will cause the logic to use per-input project weights, thereby
-        # forcing self.in_proj_weight = None
-        mha = torch.nn.MultiheadAttention(4, 4, vdim=2, kdim=2, dtype=dtype, device=device)
-        query = torch.rand(4, 4, 4, dtype=dtype, device=device)
-        key = torch.rand(4, 4, 2, dtype=dtype, device=device)
-        mha(query, key, key)
-
     @onlyCPU
     @dtypes(torch.double)
     def test_transformerencoderlayer_fast_path(self, device, dtype):

From 6bfbba694965732adafa616c6641bfa7208209a5 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Tue, 29 Nov 2022 18:44:25 +0000
Subject: [PATCH 1380/1922] Fix CopySlices logic to ensure wrapped node runs
 properly. (#89812)

This should remove the failures seen by https://github.com/pytorch/pytorch/pull/89720 in functionalization
Locally verified that running the following on top of this PR does pass: `python benchmarks/dynamo/huggingface.py --accuracy --backend aot_eager --training --only MobileBertForMaskedLM`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89812
Approved by: https://github.com/soumith, https://github.com/voznesenskym, https://github.com/ezyang
---
 test/test_autograd.py                    | 53 ++++++++++++++++++++++++
 torch/csrc/autograd/functions/tensor.cpp | 42 ++++++++++++++++---
 2 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 4b1e97cb3b2b5..5dd695e14b323 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -62,6 +62,59 @@ def graph_desc(fn):
 
 
 class TestAutograd(TestCase):
+    def test_copy_slices_graph_task_updates(self):
+        def f1(x, y):
+            out = x.clone().view(-1)
+            out += y
+            return out
+
+        def f2(x, y):
+            out = x.clone().view(-1)
+            b = out * 2
+            out += y
+            return out + b
+
+        x = torch.rand(2, requires_grad=True)
+        y = torch.rand(2, requires_grad=True)
+
+        y_safe = torch._C._functions.DelayedError("Boom!", 1)(y)
+
+        for f in [f1, f2]:
+            # Ensure that the error Node works
+            out = f(x, y_safe)
+            with self.assertRaisesRegex(RuntimeError, "Boom!"):
+                out.sum().backward()
+
+            out = f(x, y_safe)
+            with self.assertRaisesRegex(RuntimeError, "Boom!"):
+                torch.autograd.grad(out.sum(), y)
+
+            # Ensure that if we don't ask for y, it doesn't crash
+            out = f(x, y_safe)
+            torch.autograd.grad(out.sum(), x)
+
+            out = f(x, y_safe)
+            torch.autograd.grad(out.sum(), y_safe)
+
+            out = f(x, y_safe)
+            torch.autograd.grad(out.sum(), (x, y_safe))
+
+        # Ensure that we don't run extra view Node
+        def f3(x, y):
+            out = x.clone().view(-1)
+
+            def hook(*args):
+                # This should never be called!
+                self.assertTrue(False)
+            out.register_hook(hook)
+
+            b = out + y
+            out += y
+            return out + b, b
+
+        out, b = f3(x, y_safe)
+        torch.autograd.grad(out.sum(), (b, y_safe))
+
 
     def test_grad_mode_class_decoration(self):
         # Decorating class is deprecated and should not be used
diff --git a/torch/csrc/autograd/functions/tensor.cpp b/torch/csrc/autograd/functions/tensor.cpp
index 377c40ce388e2..0c60ab221a1c2 100644
--- a/torch/csrc/autograd/functions/tensor.cpp
+++ b/torch/csrc/autograd/functions/tensor.cpp
@@ -88,18 +88,48 @@ auto CopySlices::apply(variable_list&& inputs) -> variable_list {
         result.as_strided_symint(view.sym_sizes(), view.sym_strides(), offset);
   }
 
-  // Adding the missing nodes to the current graph's `exec_info`.
-  // This is a workaround because the current `GraphTask::init_to_execute`
-  // does not traverse into CopySlices node.
+  // Since the gradient edge for the 0th input is different between `this` and
+  // `fn`, make sure that the one from `fn` has the same metadata in the current
+  // GraphTask's exec_info as the one on `this`.
   const auto exec_info = get_current_graph_task_exec_info();
   if (exec_info && !exec_info->empty()) {
-    for (const auto& next : fn->next_edges()) {
-      if (next.is_valid()) {
-        add_node_to_current_graph_task_exec_info(next.function.get());
+    const auto& fn_edge = fn->next_edge(0);
+    const auto& this_edge = this->next_edge(0);
+    TORCH_INTERNAL_ASSERT(fn_edge.is_valid() == this_edge.is_valid());
+    if (fn_edge.is_valid()) {
+      const auto fn_next_node = fn_edge.function.get();
+      auto it = exec_info->find(fn_next_node);
+      if (it == exec_info->end()) {
+        // Node is not in the exec_info already
+        if (task_should_compute_output(0)) {
+          // And we need gradient for the corresponding output
+          add_node_to_current_graph_task_exec_info(fn_next_node);
+          // There is no need to remove this after execution because we are
+          // guaranteed that this->next_edge(0) must be in the history of
+          // fn->next_edge(0) (we cannot easily assert this as it might be far
+          // away if there were many chained views). This means that, since
+          // fn->next_edge(0) was not needed (no exec_info entry for it), we
+          // know that nothing downstream of fn->next_edge(0) is needed either
+          // (otherwise the whole path from that Node to this->next_edge(0)
+          // would be needed as well). This means that no other Node will ever
+          // look at fn->next_edge(0) metadata and thus there is no need to
+          // clean them up.
+        }
+      } else {
+        TORCH_INTERNAL_ASSERT(
+            it->second.should_execute() == task_should_compute_output(0));
       }
     }
   }
 
+  // Sanity check that the graph was never modified after the fact (it is
+  // read-only!)
+  TORCH_INTERNAL_ASSERT(num_outputs() == fn->num_outputs());
+  for (const auto i : c10::irange(1, this->num_outputs())) {
+    TORCH_INTERNAL_ASSERT(
+        fn->next_edge(i).function.get() == this->next_edge(i).function.get());
+  }
+
   // TODO: We clone grad_slice because we modify it below and "fn" might save
   // it for the backward of res. We might be able to avoid the clone() if
   // double-backprop is disabled.

From e8fce72f22d5982b27d9bb856fb720a1aa406d2a Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 29 Nov 2022 10:57:15 -0800
Subject: [PATCH 1381/1922] Don't unsafely clone autograd meta (#89720)

Addresses this CR comment https://github.com/pytorch/pytorch/pull/88817/files#r1024618045

This appears to fix Dynamo+DDP+hf_BERT test but I don't
know how to make a minimum reproducer.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89720
Approved by: https://github.com/soumith, https://github.com/bdhirsh, https://github.com/malfet
---
 torch/csrc/autograd/python_torch_functions_manual.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/autograd/python_torch_functions_manual.cpp b/torch/csrc/autograd/python_torch_functions_manual.cpp
index 0a9a71a01a6c6..6aaaaf0eff6e9 100644
--- a/torch/csrc/autograd/python_torch_functions_manual.cpp
+++ b/torch/csrc/autograd/python_torch_functions_manual.cpp
@@ -1,6 +1,9 @@
 #include <torch/csrc/Dtype.h>
 #include <torch/csrc/DynamicTypes.h>
 #include <torch/csrc/Exceptions.h>
+#include <torch/csrc/autograd/function.h>
+#include <torch/csrc/autograd/functions/basic_ops.h>
+#include <torch/csrc/autograd/functions/utils.h>
 #include <torch/csrc/autograd/generated/variable_factories.h>
 #include <torch/csrc/autograd/python_torch_functions.h>
 #include <torch/csrc/autograd/python_variable.h>
@@ -421,8 +424,11 @@ static PyObject* THPVariable__to_functional_tensor(
     if (inner_autograd_meta) {
       wrapped.set_requires_grad(self_.requires_grad());
       if (wrapped.requires_grad()) {
-        impl::get_autograd_meta(wrapped)->grad_fn_ =
-            inner_autograd_meta->grad_fn_;
+        auto new_grad_fn = std::shared_ptr<torch::autograd::Error>(
+            new torch::autograd::Error(
+                "Cannot backprop through mirrored meta, file a bug in PyTorch"),
+            torch::autograd::deleteNode);
+        torch::autograd::set_history(wrapped, new_grad_fn);
       }
     }
   }

From 6091803c5f4b4d5271d2ff7994720095667e5837 Mon Sep 17 00:00:00 2001
From: Erjia Guan <erjia@meta.com>
Date: Tue, 29 Nov 2022 19:21:53 +0000
Subject: [PATCH 1382/1922] Deprecating DataPipes (#89794)

Summary: per title

Test Plan:
`buck2 test buck2 test //caffe2/test:datapipe` https://www.internalfb.com/intern/testinfra/testconsole/testrun/6473924589747074/
`buck2 test mode/opt //pytorch/data/test:tests`

Differential Revision: D41563765

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89794
Approved by: https://github.com/wenleix, https://github.com/NivekT
---
 torch/utils/data/datapipes/iter/__init__.py   |  2 --
 torch/utils/data/datapipes/iter/fileopener.py | 19 +------------------
 2 files changed, 1 insertion(+), 20 deletions(-)

diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index 72384ec5ce947..b3007799e29b9 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -20,7 +20,6 @@
     FileListerIterDataPipe as FileLister,
 )
 from torch.utils.data.datapipes.iter.fileopener import (
-    FileLoaderIterDataPipe as FileLoader,
     FileOpenerIterDataPipe as FileOpener,
 )
 from torch.utils.data.datapipes.iter.grouping import (
@@ -44,7 +43,6 @@
            'Concater',
            'Demultiplexer',
            'FileLister',
-           'FileLoader',
            'FileOpener',
            'Filter',
            'Forker',
diff --git a/torch/utils/data/datapipes/iter/fileopener.py b/torch/utils/data/datapipes/iter/fileopener.py
index f3e00b397cd9f..03d5761a9f164 100644
--- a/torch/utils/data/datapipes/iter/fileopener.py
+++ b/torch/utils/data/datapipes/iter/fileopener.py
@@ -3,11 +3,10 @@
 
 from torch.utils.data.datapipes._decorator import functional_datapipe
 from torch.utils.data.datapipes.datapipe import IterDataPipe
-from torch.utils.data.datapipes.utils.common import get_file_binaries_from_pathnames, _deprecation_warning
+from torch.utils.data.datapipes.utils.common import get_file_binaries_from_pathnames
 
 __all__ = [
     "FileOpenerIterDataPipe",
-    "FileLoaderIterDataPipe",
 ]
 
 
@@ -71,19 +70,3 @@ def __len__(self):
         if self.length == -1:
             raise TypeError("{} instance doesn't have valid length".format(type(self).__name__))
         return self.length
-
-
-class FileLoaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]):
-
-    def __new__(
-            cls,
-            datapipe: Iterable[str],
-            mode: str = 'b',
-            length: int = -1):
-        _deprecation_warning(
-            cls.__name__,
-            deprecation_version="1.12",
-            removal_version="1.13",
-            new_class_name="FileOpener",
-        )
-        return FileOpenerIterDataPipe(datapipe=datapipe, mode=mode, length=length)

From 8b8f9c8a0188f0f644ab99f170de1ed0b7317afa Mon Sep 17 00:00:00 2001
From: David Berard <dberard@fb.com>
Date: Mon, 28 Nov 2022 12:19:25 -0800
Subject: [PATCH 1383/1922] Move gpu slow tests to sm86 (#87880)

NVFuser tests (which are slow tests) would be better to run on more
modern GPU hardware.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87880
Approved by: https://github.com/malfet
---
 .github/workflows/trunk.yml                   |  4 +--
 test/test_torch.py                            |  7 ++--
 .../_internal/opinfo/definitions/_masked.py   | 33 +++++++++++++++++++
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 6779a362209c2..85526deccbf90 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -60,8 +60,6 @@ jobs:
           { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
           { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
           { config: "functorch", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "slow", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "slow", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
           { config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
           { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
           { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
@@ -92,6 +90,8 @@ jobs:
           { config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
           { config: "functorch", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
         ]}
 
diff --git a/test/test_torch.py b/test/test_torch.py
index 31759213ecefc..19949af23630b 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -114,6 +114,8 @@ def test_cuda_vitals_gpu_only(self, device):
             self.assertIn('CUDA.used\t\t true', torch.read_vitals())
 
 
+is_cuda_sm86 = torch.cuda.is_available() and torch.cuda.get_device_capability(0) == (8, 6)
+
 class TestTorchDeviceType(TestCase):
     exact_dtype = True
 
@@ -3912,9 +3914,10 @@ def test_dim_function_empty(self, device):
 
     # FIXME: find a test suite for the pdist operator
     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "sandcastle OOM with current tpx gpu/re configuration")
+    @unittest.skipIf(is_cuda_sm86, "OOMs on sm86 configuration")
     @skipIfRocm
     @onlyCUDA
-    @largeTensorTest('10GB', device='cpu')
+    @largeTensorTest('12GB', device='cpu')
     @largeTensorTest('5GB', device='cuda')
     def test_pdist_norm_large(self, device):
         # use dim0>=46342 for forward, see:
@@ -3924,7 +3927,7 @@ def test_pdist_norm_large(self, device):
         # Will require 1249975000 float32s
         expected_cpu = torch.pdist(x, p=2)                  # ~1250M * 4 bytes = 5 GB on CPU
         actual_gpu = torch.pdist(x.to(device), p=2)         # 5 GB on GPU
-        self.assertEqual(expected_cpu, actual_gpu.cpu())    # Another 5 GB on CPU
+        self.assertEqual(expected_cpu, actual_gpu.cpu())    # Another 5 GB on CPU + 1.25GB for expected == actual
 
     # FIXME: move to elementwise ternary test suite
     @onlyNativeDeviceTypes
diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py
index 10ada2cba7436..20025b98e3c48 100644
--- a/torch/testing/_internal/opinfo/definitions/_masked.py
+++ b/torch/testing/_internal/opinfo/definitions/_masked.py
@@ -399,6 +399,17 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
             DecorateInfo(
                 unittest.expectedFailure, "TestJit", "test_variant_consistency_jit"
             ),
+            # Failing accuracy and extremal on sm86 (#89609)
+            DecorateInfo(
+                unittest.skip("Skipped!"),
+                "TestCudaFuserOpInfo",
+                "test_nvfuser_correctness",
+            ),
+            DecorateInfo(
+                unittest.skip("Skipped!"),
+                "TestCudaFuserOpInfo",
+                "test_nvfuser_extremal_values",
+            ),
         ),
         decorators=[
             DecorateInfo(
@@ -596,6 +607,17 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
                 "test_mask_layout",
                 dtypes=(torch.bool, *integral_types(), *complex_types()),
             ),
+            # Failing accuracy and extremal on sm86 (#89609)
+            DecorateInfo(
+                unittest.skip("Skipped!"),
+                "TestCudaFuserOpInfo",
+                "test_nvfuser_correctness",
+            ),
+            DecorateInfo(
+                unittest.skip("Skipped!"),
+                "TestCudaFuserOpInfo",
+                "test_nvfuser_extremal_values",
+            ),
         ),
         sample_inputs_func=sample_inputs_masked_reduction,
         sample_inputs_sparse_coo_func=sample_inputs_sparse_coo_masked_reduction,
@@ -635,6 +657,17 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
                 "test_mask_layout",
                 dtypes=(torch.bool, *integral_types(), *complex_types()),
             ),
+            # Failing accuracy and extremal on sm86 (#89609)
+            DecorateInfo(
+                unittest.skip("Skipped!"),
+                "TestCudaFuserOpInfo",
+                "test_nvfuser_correctness",
+            ),
+            DecorateInfo(
+                unittest.skip("Skipped!"),
+                "TestCudaFuserOpInfo",
+                "test_nvfuser_extremal_values",
+            ),
         ),
         sample_inputs_func=sample_inputs_masked_reduction,
         sample_inputs_sparse_coo_func=sample_inputs_sparse_coo_masked_reduction,

From 2f593c59ce58025514e0271505f0970f36d50b25 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Tue, 29 Nov 2022 21:43:23 +0000
Subject: [PATCH 1384/1922] Stream actually overrides __new__ so we need to
 patch it as well (#89592)

Avoids
```
$ python foo.py
Traceback (most recent call last):
  File "foo.py", line 3, in <module>
    a = torch.cuda.Stream()
  File "/home/albandes/local/pytorch/3.8_debug_source/torch/cuda/streams.py", line 34, in __new__
    return super(Stream, cls).__new__(cls, priority=priority, **kwargs)
TypeError: object.__new__() takes exactly one argument (the type to instantiate)
```
And now gets
```
$ python foo.py
Traceback (most recent call last):
  File "foo.py", line 3, in <module>
    a = torch.cuda.Stream()
  File "/home/albandes/local/pytorch/3.8_debug_source/torch/cuda/streams.py", line 34, in __new__
    return super(Stream, cls).__new__(cls, priority=priority, **kwargs)
  File "/home/albandes/local/pytorch/3.8_debug_source/torch/cuda/_utils.py", line 44, in err_fn
    raise RuntimeError(
RuntimeError: Tried to instantiate dummy base class Stream

```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89592
Approved by: https://github.com/soumith
---
 test/test_torch.py   | 13 +++++++++++++
 torch/cuda/_utils.py | 15 ++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/test/test_torch.py b/test/test_torch.py
index 19949af23630b..df907d826d805 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -8478,6 +8478,19 @@ def test_conj_neg_tolist(self):
         self.assertEqual(y1, y1_expect.tolist())
         self.assertEqual(y2, y1_expect.imag.tolist())
 
+    @unittest.skipIf(torch.backends.cuda.is_built(), "Skipped for cuda-enabled build")
+    def test_no_cuda_monkeypatch(self):
+        # Note that this is not in test_cuda.py as this whole file is skipped when cuda
+        # is not available.
+        with self.assertRaisesRegex(RuntimeError, "Tried to instantiate dummy base class Stream"):
+            torch.cuda.Stream()
+
+        with self.assertRaisesRegex(RuntimeError, "Tried to instantiate dummy base class Event"):
+            torch.cuda.Event()
+
+        with self.assertRaisesRegex(RuntimeError, "Tried to instantiate dummy base class CUDAGraph"):
+            torch.cuda.graphs.CUDAGraph()
+
 # The following block extends TestTorch with negative dim wrapping tests
 # FIXME: replace these with OpInfo sample inputs or systemic OpInfo tests
 # Functions to test negative dimension wrapping
diff --git a/torch/cuda/_utils.py b/torch/cuda/_utils.py
index ff0a9a09711af..d3df895385236 100644
--- a/torch/cuda/_utils.py
+++ b/torch/cuda/_utils.py
@@ -35,8 +35,13 @@ def _get_device_index(device: Any, optional: bool = False,
 
 
 def _dummy_type(name: str) -> type:
-    def init_err(self):
-        class_name = self.__class__.__name__
-        raise RuntimeError(
-            "Tried to instantiate dummy base class {}".format(class_name))
-    return type(name, (object,), {"__init__": init_err})
+    def get_err_fn(is_init: bool):
+        def err_fn(obj, *args, **kwargs):
+            if is_init:
+                class_name = obj.__class__.__name__
+            else:
+                class_name = obj.__name__
+            raise RuntimeError(
+                "Tried to instantiate dummy base class {}".format(class_name))
+        return err_fn
+    return type(name, (object,), {"__init__": get_err_fn(True), "__new__": get_err_fn(False)})

From be32e6410a4df1e0c7eb79dcb4ef13fb927d8537 Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Tue, 29 Nov 2022 22:18:39 +0000
Subject: [PATCH 1385/1922] replace double transpose with single permute in
 nn.f.mha (#89847)

# Summary

I forgot about permute which was exactly what I wanted. Quick perf bump
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89847
Approved by: https://github.com/cpuhrsch, https://github.com/albanD
---
 torch/nn/functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 5da45046332b8..7b8324c7aa849 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -5185,7 +5185,7 @@ def multi_head_attention_forward(
 
     attn_output, attn_output_weights = _scaled_dot_product_attention(
         q, k, v, attn_mask, dropout_p, need_weights, False)
-    attn_output = attn_output.transpose(1, 2).transpose(0, 1).contiguous().view(bsz * tgt_len, embed_dim)
+    attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
 
     attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
     attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))

From feb6375f97b7b328df889d111d6c1868c7517640 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 29 Nov 2022 17:23:22 +0000
Subject: [PATCH 1386/1922] [FSDP] Another fix for `DTensor`,
 `use_orig_params=True` (#89845)

The issue for `test_2d_parallel.py` is that `DTensor` does not support the idiom `param.data = view` where `view` is a `DTensor`. To work around this, we do not preserve the parameter variable `param` and instead create a new parameter variable altogether via `nn.Parameter(view)`. Preserving the parameter variable when unsharded was not a strict requirement -- it just made sense to do that if we are already doing that when _sharded_, where it _is_ a strict requirement to support the optimizer step. The sharded case is not an issue for 2D because sharded implies local tensor, not `DTensor`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89845
Approved by: https://github.com/zhaojuanmao
---
 torch/distributed/fsdp/flat_param.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 3ffb5bd123df3..515bf2f64d198 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -23,6 +23,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
+from torch.distributed._tensor import DTensor
 from torch.distributed.fsdp._common_utils import (
     _set_fsdp_flattened,
     HandleTrainingState,
@@ -1291,6 +1292,12 @@ def _use_unsharded_views(self, as_params: bool) -> None:
             if hasattr(module, param_name):
                 delattr(module, param_name)
             if self._use_orig_params and as_params:
+                if type(view) is DTensor:
+                    # A `DTensor` `view` is not compatible with assigning
+                    # `param.data = view`, so we cannot preserve the parameter
+                    # variable.
+                    setattr(module, param_name, nn.Parameter(view))
+                    continue
                 param = self.flat_param._params[i]  # type: ignore[index]
                 setattr(module, param_name, param)
                 param.data = view

From 73993fe01be925b538b16653ca95ea69ea2cc21a Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@meta.com>
Date: Tue, 29 Nov 2022 23:45:53 +0000
Subject: [PATCH 1387/1922] Enable instance norm running mean test (#89793)

Followup action to https://github.com/pytorch/pytorch/pull/88697
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89793
Approved by: https://github.com/bdhirsh
---
 test/test_functionalization.py | 60 ++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index d699c03ed4173..aa97b2a392389 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -1305,13 +1305,15 @@ def forward(self, a_1):
 
 
     def test_instance_norm_running_mean_is_x(self):
+        size = 100
+
         def f(x):
             with enable_python_dispatcher():
-                return torch.instance_norm(torch.randn(20, 100, 35, 45), None, None, running_mean=x, running_var=torch.ones(100),
-                                           use_input_stats=True, momentum=0.1, eps=1e-5, cudnn_enabled=False)
-        # TODO: uncomment following line after functionalization can handle input mutations
-        # self.assert_functionalization(f, torch.zeros(100))
-        logs = self.get_logs(f, torch.zeros(100))
+                return torch.instance_norm(
+                    torch.arange(20 * size * 35 * 45, dtype=torch.float32).reshape(20, size, 35, 45), None, None,
+                    x, torch.ones(size), use_input_stats=True, momentum=0.1, eps=1e-5, cudnn_enabled=False)
+        self.assert_functionalization(f, torch.zeros(size))
+        logs = self.get_logs(f, torch.zeros(size))
         # On Windows, for instance_norm, the alias_copy's are reordered to come right before they need to be used
         # whereas on other platforms, the alias_copy's are before the view_copy's.
         # e.g., the alias_copy after the getitem_4 assignment would be moved to be right before the copy assignment.
@@ -1321,66 +1323,68 @@ def f(x):
 
 
 def forward(self, a_1):
-    randn = torch.ops.aten.randn.default([20, 100, 35, 45], device = device(type='cpu'), pin_memory = False)
+    arange = torch.ops.aten.arange.default(3150000, dtype = torch.float32, device = device(type='cpu'), pin_memory = False)
+    view_copy = torch.ops.aten.view_copy.default(arange, [20, 100, 35, 45]);  arange = None
     ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
     repeat = torch.ops.aten.repeat.default(a_1, [20])
     repeat_1 = torch.ops.aten.repeat.default(ones, [20])
-    view_copy = torch.ops.aten.view_copy.default(randn, [1, 2000, 35, 45]);  randn = None
+    view_copy_1 = torch.ops.aten.view_copy.default(view_copy, [1, 2000, 35, 45]);  view_copy = None
     empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
-    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view_copy, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view_copy = repeat = repeat_1 = None
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view_copy_1, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view_copy_1 = repeat = repeat_1 = None
     getitem = _native_batch_norm_legit_functional[0]
     getitem_1 = _native_batch_norm_legit_functional[1]
     getitem_2 = _native_batch_norm_legit_functional[2]
     getitem_3 = _native_batch_norm_legit_functional[3]
     getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
     alias_copy = torch.ops.aten.alias_copy.default(a_1)
-    view_copy_1 = torch.ops.aten.view_copy.default(getitem_3, [20, 100])
-    view_copy_2 = torch.ops.aten.view_copy.default(getitem_3, [20, 100]);  getitem_3 = None
-    mean = torch.ops.aten.mean.dim(view_copy_2, [0]);  view_copy_2 = None
+    view_copy_2 = torch.ops.aten.view_copy.default(getitem_3, [20, 100])
+    view_copy_3 = torch.ops.aten.view_copy.default(getitem_3, [20, 100]);  getitem_3 = None
+    mean = torch.ops.aten.mean.dim(view_copy_3, [0]);  view_copy_3 = None
     copy = torch.ops.aten.copy.default(alias_copy, mean);  alias_copy = mean = None
     alias_copy_1 = torch.ops.aten.alias_copy.default(ones);  ones = None
-    view_copy_3 = torch.ops.aten.view_copy.default(getitem_4, [20, 100])
-    view_copy_4 = torch.ops.aten.view_copy.default(getitem_4, [20, 100]);  getitem_4 = None
-    mean_1 = torch.ops.aten.mean.dim(view_copy_4, [0]);  view_copy_4 = None
+    view_copy_4 = torch.ops.aten.view_copy.default(getitem_4, [20, 100])
+    view_copy_5 = torch.ops.aten.view_copy.default(getitem_4, [20, 100]);  getitem_4 = None
+    mean_1 = torch.ops.aten.mean.dim(view_copy_5, [0]);  view_copy_5 = None
     copy_1 = torch.ops.aten.copy.default(alias_copy_1, mean_1);  alias_copy_1 = mean_1 = None
-    view_copy_5 = torch.ops.aten.view_copy.default(getitem, [20, 100, 35, 45]);  getitem = None
+    view_copy_6 = torch.ops.aten.view_copy.default(getitem, [20, 100, 35, 45]);  getitem = None
     alias_copy_2 = torch.ops.aten.alias_copy.default(copy);  copy = None
     copy_ = torch.ops.aten.copy_.default(a_1, alias_copy_2);  a_1 = alias_copy_2 = None
-    return view_copy_5
+    return view_copy_6
     """)  # noqa: B950
 
-            reinplaced_logs = self.get_logs(f, torch.zeros(100), reapply_views=True, run_reinplace=True)
+            reinplaced_logs = self.get_logs(f, torch.zeros(size), reapply_views=True, run_reinplace=True)
             self.assertExpectedInline(reinplaced_logs, """\
 
 
 def forward(self, a_1):
-    randn = torch.ops.aten.randn.default([20, 100, 35, 45], device = device(type='cpu'), pin_memory = False)
+    arange = torch.ops.aten.arange.default(3150000, dtype = torch.float32, device = device(type='cpu'), pin_memory = False)
+    view = torch.ops.aten.view.default(arange, [20, 100, 35, 45]);  arange = None
     ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
     repeat = torch.ops.aten.repeat.default(a_1, [20])
     repeat_1 = torch.ops.aten.repeat.default(ones, [20])
-    view = torch.ops.aten.view.default(randn, [1, 2000, 35, 45]);  randn = None
+    view_1 = torch.ops.aten.view.default(view, [1, 2000, 35, 45]);  view = None
     empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
-    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view = repeat = repeat_1 = None
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view_1, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view_1 = repeat = repeat_1 = None
     getitem = _native_batch_norm_legit_functional[0]
     getitem_1 = _native_batch_norm_legit_functional[1]
     getitem_2 = _native_batch_norm_legit_functional[2]
     getitem_3 = _native_batch_norm_legit_functional[3]
     getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
     alias = torch.ops.aten.alias.default(a_1)
-    view_1 = torch.ops.aten.view.default(getitem_3, [20, 100])
-    view_2 = torch.ops.aten.view.default(getitem_3, [20, 100]);  getitem_3 = None
-    mean = torch.ops.aten.mean.dim(view_2, [0]);  view_2 = None
+    view_2 = torch.ops.aten.view.default(getitem_3, [20, 100])
+    view_3 = torch.ops.aten.view.default(getitem_3, [20, 100]);  getitem_3 = None
+    mean = torch.ops.aten.mean.dim(view_3, [0]);  view_3 = None
     copy = torch.ops.aten.copy.default(alias, mean);  alias = mean = None
     alias_1 = torch.ops.aten.alias.default(ones);  ones = None
-    view_3 = torch.ops.aten.view.default(getitem_4, [20, 100])
-    view_4 = torch.ops.aten.view.default(getitem_4, [20, 100]);  getitem_4 = None
-    mean_1 = torch.ops.aten.mean.dim(view_4, [0]);  view_4 = None
+    view_4 = torch.ops.aten.view.default(getitem_4, [20, 100])
+    view_5 = torch.ops.aten.view.default(getitem_4, [20, 100]);  getitem_4 = None
+    mean_1 = torch.ops.aten.mean.dim(view_5, [0]);  view_5 = None
     copy_1 = torch.ops.aten.copy_.default(alias_1, mean_1);  alias_1 = mean_1 = None
-    view_5 = torch.ops.aten.view.default(getitem, [20, 100, 35, 45]);  getitem = None
+    view_6 = torch.ops.aten.view.default(getitem, [20, 100, 35, 45]);  getitem = None
     alias_2 = torch.ops.aten.alias.default(copy);  copy = None
     copy_ = torch.ops.aten.copy_.default(a_1, alias_2);  a_1 = alias_2 = None
-    return view_5
+    return view_6
     """)  # noqa: B950
 
 
From 6da0dfe9c51bbb01bfa3321c2483dfedc48097bf Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 29 Nov 2022 11:10:41 -0800
Subject: [PATCH 1388/1922] [quant][docs] Move some of the descriptions out of
 codeblock (#89795)

Summary:
This is to make sure the description texts are wrapping around code, instead of being displayed as a single line

Test Plan:
visual inspections

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89795
Approved by: https://github.com/andrewor14
---
 torch/ao/quantization/fx/README.md | 159 ++++++++++++++++++-----------
 1 file changed, 102 insertions(+), 57 deletions(-)

diff --git a/torch/ao/quantization/fx/README.md b/torch/ao/quantization/fx/README.md
index 622acd30956cd..7816247dc3291 100644
--- a/torch/ao/quantization/fx/README.md
+++ b/torch/ao/quantization/fx/README.md
@@ -169,15 +169,20 @@ input - qat_linear_relu - output
   'pattern': nnqat.LinearReLU,
   'dtype_configs': [{input: torch.quint8, output: torch.quint8, weight: torch.qint8}],
 }
+```
+
+step 1: assign qconfig to each op (please see [TODO: link] for details)
 
-# step 1: assign qconfig to each op (please see [TODO: link] for details)
-# step 2: determine which qconfigs are valid according to the backend configuration (please see [TODO: link] for details)
+step 2: determine which qconfigs are valid according to the backend configuration (please see [TODO: link] for details)
 (we should add a warning here)
-# step 3: for subgraphs with validated qconfigs, insert qstub/dqstub/qdqstub needed
-# To talk about what happens in this step, let’s first define some terms. Let’s view the computation graph we showed about as a Graph consists of nodes and edges, each node here will be an FX Node that represents some computation, for example linear, and each edge will be a connection between two nodes, and each edge can both be viewed as the output of the previous Node or the input of the next Node.
 
-# The end goal for this step is to insert QDQStubs at edges so that we produce a graph of quantized reference model when each QDQStub represents a quantize operator followed by a dequantize operator.
+step 3: for subgraphs with validated qconfigs, insert qstub/dqstub/qdqstub needed
+
+To talk about what happens in this step, let’s first define some terms. Let’s view the computation graph we showed above as a Graph consists of nodes and edges, each node here will be an FX Node that represents some computation, for example linear, and each edge will be a connection between two nodes, and each edge can both be viewed as the output of the previous Node or the input of the next Node.
 
+The end goal for this step is to insert QDQStubs at edges so that we produce a graph of quantized reference model when each QDQStub represents a quantize operator followed by a dequantize operator.
+
+```
 # graph 2:
 input - QDQStub1 (FakeQuantize) - qat_linear_relu - QDQStub2 (FakeQuantize) - output
                                       |
@@ -185,11 +190,13 @@ input - QDQStub1 (FakeQuantize) - qat_linear_relu - QDQStub2 (FakeQuantize) - ou
                   (need to be updated with QDQStub + FakeQuantize)
                                       |
                                     weight
+```
 Note: weight + FakeQuantize is a part of qat_linear_relu
 
-# The overall logic to insert QDQStub1 and QDQStub2 inplace is the following:
-# 0. For each node in the original graph, we compute the target_dtype for input and output for it based on qconfig, for graph1, configured with qconfig_mapping, we have:
-# node_name_to_target_dtype =
+The overall logic to insert QDQStub1 and QDQStub2 inplace is the following:
+0. For each node in the original graph, we compute the target_dtype for input and output for it based on qconfig, for graph1, configured with qconfig_mapping, we have:
+```
+# node_name_to_target_dtype_info =
 # {
 #     # this is placeholder node in FX Graph
 #     “input” : {“input_activation”: torch.float32, “output_activation”: torch.float32},
@@ -197,35 +204,44 @@ Note: weight + FakeQuantize is a part of qat_linear_relu
 #     # this is the return node in FX Graph
 #     “output”: {“input_activation”: torch.float32, “output_activation”: torch.float32}
 # }
-# Note: this map is generated before we insert qdqstub to graph1, and will not change in the process.
-#
-# 1. Inserting QDQStub1 (for input of qat_linear_relu)
-#    We need to look at the edge between `input` Node and `qat_linear_relu` Node here, we need to decide if we need to insert a
-#    QDQStub at this edge, which could serve as an input argument for `qat_linear_relu` Node (and also output for `input` Node)
-#    The way we decide if we want to insert QDQStub here is to figure out
-#    (1). The target dtype for output of `input` Node, which is torch.float32
-#    (2). The target dtype for input of `qat_linear_relu` Node, which is torch.quint8
-#    There is a mismatch here and (2) is a quantized dtype, so we need to insert QDQStub at the edge.
-#    We also need to attach observer/fakequant module to the QDQStub we inserted here.
-# 2. Insert QDQStub2 (for output of qat_linear_relu)
-#    The logic for inserting QDQStub for output is much easier, since we assume all modules/functions in the graph produce fp32 output
-#    by default (we can have additional checks and extend this to work for other dtypes after we have type inference ready),
-#    we just need to look at the target output dtype for qat_linear_relu Node, and if it is a quantized dtype (quint8, qint8, float16),
-#    we would insert a QDQStub here.
-#
-# Questions: How to avoid inserting duplicate QDQStubs?
-# e.g. when we have a single input being used by multiple ops:
-# input — linear1 —-
-#      \--- linear2 —
-# how do we make sure we only insert one QDQStub for input of both linear1 and linear2?
-# input - QDQStub — linear1 -
-#              \ —- linear2 -
-#
-# The way we do it right now is before we insert QDQStub, we look at all users of `input` Node here and make sure there is no QDQStubs
-# with the same target_dtype, that is, if we already inserted a QDQStub with dtype quint8 for linear1, and linear2 is also connected to it, if we request another QDQStub with dtype quint8 when processing linear2 Node, we’ll detect that the desired QDQStub already exists and do nothing
-
-# Question: What is the logic for keeping output to be float32?
-# Let’s say the output of `qat_linear_relu` Node is configured as float32, both in qconfig_mapping and backend_config:
+```
+Note: this map is generated before we insert qdqstub to graph1, and will not change in the process.
+
+1. Inserting QDQStub1 (for input of qat_linear_relu)
+   We need to look at the edge between `input` Node and `qat_linear_relu` Node here, we need to decide if we need to insert a
+   QDQStub at this edge, which could serve as an input argument for `qat_linear_relu` Node (and also output for `input` Node)
+   The way we decide if we want to insert QDQStub here is to figure out
+
+   (1). The target dtype for output of `input` Node, which is torch.float32
+
+   (2). The target dtype for input of `qat_linear_relu` Node, which is torch.quint8
+   There is a mismatch here and (2) is a quantized dtype, so we need to insert QDQStub at the edge.
+
+   We also need to attach observer/fakequant module to the QDQStub we inserted here.
+2. Insert QDQStub2 (for output of qat_linear_relu)
+   The logic for inserting QDQStub for output is much easier, since we assume all modules/functions in the graph produce fp32 output
+   by default (we can have additional checks and extend this to work for other dtypes after we have type inference ready),
+   we just need to look at the target output dtype for qat_linear_relu Node, and if it is a quantized dtype (quint8, qint8, float16),
+   we would insert a QDQStub here.
+
+Questions: How to avoid inserting duplicate QDQStubs?
+e.g. when we have a single input being used by multiple ops:
+```
+input — linear1 —-
+     \--- linear2 —
+```
+how do we make sure we only insert one QDQStub for input of both linear1 and linear2?
+```
+input - QDQStub — linear1 -
+             \ —- linear2 -
+```
+
+The way we do it right now is before we insert QDQStub, we look at all users of `input` Node here and make sure there is no QDQStubs
+with the same target_dtype, that is, if we already inserted a QDQStub with dtype quint8 for linear1, and linear2 is also connected to it, if we request another QDQStub with dtype quint8 when processing linear2 Node, we’ll detect that the desired QDQStub already exists and do nothing
+
+Question: What is the logic for keeping output to be float32?
+Let’s say the output of `qat_linear_relu` Node is configured as float32, both in qconfig_mapping and backend_config:
+```
 # qconfig_mapping (simplified, shown as dict)
 {'qat_linear_relu': QConfig(
   weight=MinMaxObserver.with_args(dtype=torch.qint8),
@@ -238,13 +254,13 @@ Note: weight + FakeQuantize is a part of qat_linear_relu
   'pattern': nnqat.LinearReLU,
   'dtype_configs': [{input: torch.quint8, output: torch.float32, weight: torch.qint8}],
 }
-
-# What we’ll do here is when we are trying to insert output QDQStub for `qat_linear_relu`, we look at the target output dtype for this node (node_name_to_target_dtype[“qat_linear_relu”][“output_activation”], and find that it is float, which is not a quantized dtype, so
-# will do nothing here.
-# Note that this does not prevent other operators following `qat_linear_relu` to insert a QDQStub at the output of `qat_linear_relu`, since we are dealing with an `edge` of the graph here, and an `edge` is connected to two nodes, which means
-# the output of `qat_linear_relu` will also be the input of a node following `qat_linear_relu`.
 ```
 
+What we’ll do here is when we are trying to insert output QDQStub for `qat_linear_relu`, we look at the target output dtype for this node (node_name_to_target_dtype_info[“qat_linear_relu”][“output_activation”], and find that it is float, which is not a quantized dtype, so
+will do nothing here.
+Note that this does not prevent other operators following `qat_linear_relu` to insert a QDQStub at the output of `qat_linear_relu`, since we are dealing with an `edge` of the graph here, and an `edge` is connected to two nodes, which means
+the output of `qat_linear_relu` will also be the input of a node following `qat_linear_relu`.
+
 `backend_config` configurations used in this step:
 ```
 BackendConfig(nniqat.LinearReLU)
@@ -255,16 +271,27 @@ BackendConfig(nniqat.LinearReLU)
 ```
 
 Pattern in this case is the same as before, it defines the pattern for the subgraph we are dealing with
+
 `set_observation_type`: sets the observation type for the patter, currently only two types:
+
+`OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT` means the output observer instance will be different from the input, which is the most common type of observer placement.
+
+`OUTPUT_SHARE_OBSERVER_WITH_INPUT` means the output observer is shared with input, they will be the same instance. This is useful for operators like cat.
+
+`set_dtype_configs`: sets a list of supported (activation, weight, bias, etc.) dtype combinations for qconfigs for the pattern. Note that we represent different modes of quantization (static/dynamic/`weight_only`) purely through this combination, for example, fbgemm static quantization can be represented as:
 ```
-OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT means the output observer instance will be different from the input, which is the most common type of observer placement.
-OUTPUT_SHARE_OBSERVER_WITH_INPUT means the output observer is shared with input, they will be the same instance. This is useful for operators like cat.
+{
+  "input_activation": torch.quint8,
+  "weight": torch.qint8,
+  "output_activation": torch.quint8
+}
 ```
 
-`set_dtype_configs`: sets a list of supported (activation, weight, bias, etc.) dtype combinations for qconfigs for the pattern. Note that we represent different modes of quantization (static/dynamic/`weight_only`) purely through this combination, for example, fbgemm static quantization can be represented as: {"`input_activation`": torch.quint8, "weight": torch.qint8, "`output_activation`": torch.quint8}
 Note: the dtype config will be used to configure the support for dynamic quantization as well
+
 Note: we may extend this to support more fine grained configurations of args, kwargs, attributes and outputs in the future
-Note: we are referring to observer here, which is an implementation detail, we can change this to talk about quantization parameters instead, e.g. `QParamsType.OUTPUT_USE_DIFFERENT_QPARAMS_AS_INPUT and QParamsType.OUTPUT_USE_SAME_QPARAMS_AS_INPUT`
+
+Note: we are referring to observer here, which is an implementation detail, we can change this to talk about quantization parameters instead, e.g. `QParamsType.OUTPUT_USE_DIFFERENT_QPARAMS_AS_INPUT` and `QParamsType.OUTPUT_USE_SAME_QPARAMS_AS_INPUT`
 
 ### 2. Calibration/Training
 After we insert observers, we run the model to calibrate observers or to fine tune. This step is identical to eager mode quantization. After that the observer/fakequantize modules contain sufficient information to determine quantization parameters according to the observed data.
@@ -292,7 +319,9 @@ def forward(self, x):
 ```
 
 After we insert observers, we’ll need to convert the model to a reference quantized model. Reference quantized model is a model that uses reference patterns to represent quantized operators, this serves as the standard interface for quantized operators between PyTorch quantization and backend lowering passes. For more details, please take a look at this [RFC](https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md). This pass is pretty straightforward, what we do is:
+
 (1). for each QDQStub (attached with Observer for FakeQuantize modules) in the graph, we'll convert it to calls to quantize and dequantize functions based on the attributes of attached Observer and FakeQuantize modules (e.g. qscheme, dtype etc.)
+
 (2). for weighted modules like linear/conv, we convert them to corresponding reference quantized module.
 
 Example:
@@ -319,9 +348,13 @@ input - quantize - dequantize - reference_linear_relu - quantize - dequantize -
 Note: weight + quantize + dequantize is a part of reference_linear_relu module
 
 To decide which quantize node we want to use, we’ll look at:
+
 (1). dtype of attached Observer/FakeQuantize module
+
 (2). qscheme of attached Observer/FakeQuantize module
+
 (3). (optionally) other attributes of attached Observer/FakeQuantize module
+
 The quantize operator we can choose from right now are: (quantize_per_tensor, quantize_per_channel, to, quantize_per_tensor_dynamic)
 
 ```
@@ -338,8 +371,9 @@ Pattern in this case is the same as before, it defines the pattern for the subgr
 
 `set_reference_quantized_module_for_root`: Sets the corresponding reference quantized module class for root module class, e.g. when root_module is nn.Linear, this will be nn.quantized.reference.Linear, used to swap the root module to be a reference quantized module.
 
-Note: we are only swapping `root_module` here, for example, in the current example, the original module is nniqat.LinearReLU, when we are converting weight modules(step (2)), we first convert nniqat.LinearReLU to a float module, in this case, the fused LinearReLU module: nni.LinearReLU, and then swap the root_module (nn.Linear) with reference quantized module (nnqr.Linear), so we end up with a nni.LinearReLU module, which is a sequential module of a nnqr.Linear and nn.ReLU.
-Basically, the corresponding reference quantized module for both nniqat.LinearReLU and nni.LinearReLU would be a nni.LinearReLU sequential module (originally nn.Linear + nn.ReLU) with nn.Linear being replaced by nnqr.Linear: nni.LinearReLU(nnqr.Linear, nn.ReLU).
+Note: we are only swapping `root_module` here, for example, in the current example, the original module is `nniqat.LinearReLU`, when we are converting weight modules(step (2)), we first convert `nniqat.LinearReLU` to a float module, in this case, the fused LinearReLU module: `nni.LinearReLU`, and then swap the root_module (`nn.Linear`) with reference quantized module (`nnqr.Linear`), so we end up with a `nni.LinearReLU` module, which is a sequential module of a `nnqr.Linear` and `nn.ReLU`.
+
+Basically, the corresponding reference quantized module for both `nniqat.LinearReLU` and `nni.LinearReLU` would be a `nni.LinearReLU` Sequential module (originally `nn.Linear` + `nn.ReLU`) with `nn.Linear` being replaced by `nnqr.Linear`: `nni.LinearReLU(nnqr.Linear, nn.ReLU)`.
 
 `set_fused_module`: This is the corresponding fused module class for the pattern, used to identify fused modules that needs to be converted to reference quantized module
 
@@ -359,21 +393,32 @@ def forward(self, x):
 ```
 
 Currently, PyTorch has native quantized backends: fbgemm and qnnpack, so we need a lowering pass to lower the reference quantized model to a model that is using native quantized operators in PyTorch. What this pass did is
-* Recognize the reference patterns like: "dequantize - `float_op` - quantize" in the graph and replace them with the quantized modules (under torch.nn.quantized namespace) or operators (under torch.ops.quantized namespace, or torch namespace)
+
+1. Recognize the reference patterns like: "dequantize - `float_op` - quantize" in the graph and replace them with the quantized modules (under torch.nn.quantized namespace) or operators (under torch.ops.quantized namespace, or torch namespace)
 In general there are three types of patterns:
-** Static quantization: "dequantize - `float_op` - `quantize_per_tensor`"
-** Dynamic quantization: "`quantize_per_tensor_dynamic` - dequantize - `float_op`"
-** Weight only quantization:
+
+* Static quantization:
+```
+dequantize -> float_op -> quantize_per_tensor
 ```
-                                           Input - float_op - output
-   weight - quantize_per_tensor - dequantize /
+
+* Dynamic quantization:
 ```
-* Prepack and fold the weights for quantized linear and quantized conv operator
-* The lowering pass is also going to keep some patterns for quantized operators unfused, since user may explicitly request some operators to stay in float by configuring the qconfig to be None
+quantize_per_tensor_dynamic -> dequantize -> float_op
+```
+
+* Weight only quantization:
+```
+                                       input - float_op - output
+      weight - quantize_per_tensor - dequantize /
+```
+
+2. Prepack and fold the weights for quantized linear and quantized conv operator
+3. The lowering pass is also going to keep some patterns for quantized operators unfused, since user may explicitly request some operators to stay in float by configuring the qconfig to be None
 
 There are no configurations related to lowering in `backend_config` since it is backend developer’s responsibility to implement lowering pass and each of the backend developers may have their own configurations. So from end to end, `backend_config` and together with qconfig_mapping controls what Reference Quantized Model is produced by FX Graph Mode Quantization, not lowered model.
 
-However, for some operator based backends, like the current pytorch native backends including fbgemm and qnnpack. We could interpret `backend_config` in terms of configurations for operators as well. e.g. configuring `input_dtype`=quint8, `weight_dtype`=qint8, `output_dtype`=torch.quint8 for nn.Linear is saying that the quantized linear will take a quint8 activation and qint8 weight as input and outputs a quint8 activation. But there is no guarantee that this interpretation will always work in the future, especially when we add new flavors of quantized operators.
+However, for some operator based backends, like the current pytorch native backends including fbgemm and qnnpack. We could interpret `backend_config` in terms of configurations for operators as well. e.g. configuring `input_dtype=quint8`, `weight_dtype=qint8`, `output_dtype=torch.quint8` for nn.Linear is saying that the quantized linear will take a `quint8` activation and `qint8` weight as input and outputs a `quint8` activation. But there is no guarantee that this interpretation will always work in the future, especially when we add new flavors of quantized operators.
 
 ## Extensibility
 

From 2887defb4c3e1c770d66eee132f9432b6fa4d8b7 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Wed, 30 Nov 2022 00:53:02 +0000
Subject: [PATCH 1389/1922] Update code style for optimizer code (#89862)

Separating out whitespace-only changes
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89862
Approved by: https://github.com/albanD, https://github.com/soumith
---
 torch/optim/adadelta.py | 216 ++++++++++++++++------------
 torch/optim/adagrad.py  |  15 +-
 torch/optim/adam.py     |   1 +
 torch/optim/adamax.py   | 249 ++++++++++++++++++--------------
 torch/optim/adamw.py    | 312 +++++++++++++++++++++++-----------------
 torch/optim/asgd.py     | 249 ++++++++++++++++++--------------
 torch/optim/nadam.py    |   1 +
 torch/optim/radam.py    | 251 +++++++++++++++++++-------------
 torch/optim/rmsprop.py  | 254 ++++++++++++++++++--------------
 torch/optim/rprop.py    | 206 +++++++++++++++-----------
 10 files changed, 1021 insertions(+), 733 deletions(-)

diff --git a/torch/optim/adadelta.py b/torch/optim/adadelta.py
index e5b33bb6e255e..d46311f26626f 100644
--- a/torch/optim/adadelta.py
+++ b/torch/optim/adadelta.py
@@ -4,7 +4,8 @@
 from .optimizer import Optimizer, _use_grad_for_differentiable
 from typing import List, Optional
 
-__all__ = ['Adadelta', 'adadelta']
+__all__ = ["Adadelta", "adadelta"]
+
 
 class Adadelta(Optimizer):
     r"""Implements Adadelta algorithm.
@@ -53,9 +54,18 @@ class Adadelta(Optimizer):
         https://arxiv.org/abs/1212.5701
     """
 
-    def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0,
-                 foreach: Optional[bool] = None, *, maximize: bool = False,
-                 differentiable: bool = False):
+    def __init__(
+        self,
+        params,
+        lr=1.0,
+        rho=0.9,
+        eps=1e-6,
+        weight_decay=0,
+        foreach: Optional[bool] = None,
+        *,
+        maximize: bool = False,
+        differentiable: bool = False,
+    ):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 <= rho <= 1.0:
@@ -65,17 +75,23 @@ def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0,
         if not 0.0 <= weight_decay:
             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
 
-        defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay,
-                        maximize=maximize, foreach=foreach,
-                        differentiable=differentiable)
+        defaults = dict(
+            lr=lr,
+            rho=rho,
+            eps=eps,
+            weight_decay=weight_decay,
+            maximize=maximize,
+            foreach=foreach,
+            differentiable=differentiable,
+        )
         super(Adadelta, self).__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
         for group in self.param_groups:
-            group.setdefault('foreach', None)
-            group.setdefault('maximize', False)
-            group.setdefault('differentiable', False)
+            group.setdefault("foreach", None)
+            group.setdefault("maximize", False)
+            group.setdefault("differentiable", False)
 
     @_use_grad_for_differentiable
     def step(self, closure=None):
@@ -96,64 +112,73 @@ def step(self, closure=None):
             square_avgs = []
             acc_deltas = []
             lr, rho, eps, weight_decay, foreach, maximize, differentiable = (
-                group['lr'],
-                group['rho'],
-                group['eps'],
-                group['weight_decay'],
-                group['foreach'],
-                group['maximize'],
-                group['differentiable'])
-
-            for p in group['params']:
+                group["lr"],
+                group["rho"],
+                group["eps"],
+                group["weight_decay"],
+                group["foreach"],
+                group["maximize"],
+                group["differentiable"],
+            )
+
+            for p in group["params"]:
                 if p.grad is None:
                     continue
                 params_with_grad.append(p)
                 if p.grad.is_sparse:
-                    raise RuntimeError('Adadelta does not support sparse gradients')
+                    raise RuntimeError("Adadelta does not support sparse gradients")
                 grads.append(p.grad)
 
                 state = self.state[p]
 
                 # Lazy state initialization
                 if len(state) == 0:
-                    state['step'] = 0
-                    state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                    state['acc_delta'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-                square_avgs.append(state['square_avg'])
-                acc_deltas.append(state['acc_delta'])
-
-                state['step'] += 1
-
-            adadelta(params_with_grad,
-                     grads,
-                     square_avgs,
-                     acc_deltas,
-                     lr=lr,
-                     rho=rho,
-                     eps=eps,
-                     weight_decay=weight_decay,
-                     foreach=foreach,
-                     maximize=maximize,
-                     differentiable=differentiable)
+                    state["step"] = 0
+                    state["square_avg"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+                    state["acc_delta"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+
+                square_avgs.append(state["square_avg"])
+                acc_deltas.append(state["acc_delta"])
+
+                state["step"] += 1
+
+            adadelta(
+                params_with_grad,
+                grads,
+                square_avgs,
+                acc_deltas,
+                lr=lr,
+                rho=rho,
+                eps=eps,
+                weight_decay=weight_decay,
+                foreach=foreach,
+                maximize=maximize,
+                differentiable=differentiable,
+            )
 
         return loss
 
 
-def adadelta(params: List[Tensor],
-             grads: List[Tensor],
-             square_avgs: List[Tensor],
-             acc_deltas: List[Tensor],
-             # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
-             # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
-             foreach: bool = None,
-             differentiable: bool = False,
-             *,
-             lr: float,
-             rho: float,
-             eps: float,
-             weight_decay: float,
-             maximize: bool):
+def adadelta(
+    params: List[Tensor],
+    grads: List[Tensor],
+    square_avgs: List[Tensor],
+    acc_deltas: List[Tensor],
+    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
+    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
+    foreach: bool = None,
+    differentiable: bool = False,
+    *,
+    lr: float,
+    rho: float,
+    eps: float,
+    weight_decay: float,
+    maximize: bool,
+):
     r"""Functional API that performs Adadelta algorithm computation.
 
     See :class:`~torch.optim.Adadelta` for details.
@@ -164,38 +189,44 @@ def adadelta(params: List[Tensor],
         foreach = False
 
     if foreach and torch.jit.is_scripting():
-        raise RuntimeError('torch.jit.script not supported with foreach optimizers')
+        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
 
     if foreach and not torch.jit.is_scripting():
         func = _multi_tensor_adadelta
     else:
         func = _single_tensor_adadelta
 
-    func(params,
-         grads,
-         square_avgs,
-         acc_deltas,
-         lr=lr,
-         rho=rho,
-         eps=eps,
-         weight_decay=weight_decay,
-         maximize=maximize,
-         differentiable=differentiable)
-
-
-def _single_tensor_adadelta(params: List[Tensor],
-                            grads: List[Tensor],
-                            square_avgs: List[Tensor],
-                            acc_deltas: List[Tensor],
-                            *,
-                            lr: float,
-                            rho: float,
-                            eps: float,
-                            weight_decay: float,
-                            maximize: bool,
-                            differentiable: bool):
-
-    for (param, grad, square_avg, acc_delta) in zip(params, grads, square_avgs, acc_deltas):
+    func(
+        params,
+        grads,
+        square_avgs,
+        acc_deltas,
+        lr=lr,
+        rho=rho,
+        eps=eps,
+        weight_decay=weight_decay,
+        maximize=maximize,
+        differentiable=differentiable,
+    )
+
+
+def _single_tensor_adadelta(
+    params: List[Tensor],
+    grads: List[Tensor],
+    square_avgs: List[Tensor],
+    acc_deltas: List[Tensor],
+    *,
+    lr: float,
+    rho: float,
+    eps: float,
+    weight_decay: float,
+    maximize: bool,
+    differentiable: bool,
+):
+
+    for (param, grad, square_avg, acc_delta) in zip(
+        params, grads, square_avgs, acc_deltas
+    ):
         grad = grad if not maximize else -grad
 
         if weight_decay != 0:
@@ -206,7 +237,6 @@ def _single_tensor_adadelta(params: List[Tensor],
             acc_delta = torch.view_as_real(acc_delta)
             grad = torch.view_as_real(grad)
 
-
         square_avg.mul_(rho).addcmul_(grad, grad, value=1 - rho)
         std = square_avg.add(eps).sqrt_()
         delta = acc_delta.add(eps).sqrt_()
@@ -220,17 +250,19 @@ def _single_tensor_adadelta(params: List[Tensor],
         param.add_(delta, alpha=-lr)
 
 
-def _multi_tensor_adadelta(params: List[Tensor],
-                           grads: List[Tensor],
-                           square_avgs: List[Tensor],
-                           acc_deltas: List[Tensor],
-                           *,
-                           lr: float,
-                           weight_decay: float,
-                           rho: float,
-                           eps: float,
-                           maximize: bool,
-                           differentiable: bool):
+def _multi_tensor_adadelta(
+    params: List[Tensor],
+    grads: List[Tensor],
+    square_avgs: List[Tensor],
+    acc_deltas: List[Tensor],
+    *,
+    lr: float,
+    weight_decay: float,
+    rho: float,
+    eps: float,
+    maximize: bool,
+    differentiable: bool,
+):
 
     assert not differentiable, "_foreach ops don't support autograd"
 
diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py
index b7da23a84b11c..d95dd69bb90c6 100644
--- a/torch/optim/adagrad.py
+++ b/torch/optim/adagrad.py
@@ -4,7 +4,8 @@
 from .optimizer import Optimizer, _use_grad_for_differentiable
 from typing import List, Optional
 
-__all__ = ['Adagrad', 'adagrad']
+__all__ = ["Adagrad", "adagrad"]
+
 
 class Adagrad(Optimizer):
     r"""Implements Adagrad algorithm.
@@ -60,7 +61,7 @@ def __init__(
         foreach: Optional[bool] = None,
         *,
         maximize: bool = False,
-        differentiable: bool = False
+        differentiable: bool = False,
     ):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
@@ -85,7 +86,7 @@ def __init__(
             initial_accumulator_value=initial_accumulator_value,
             foreach=foreach,
             maximize=maximize,
-            differentiable=differentiable
+            differentiable=differentiable,
         )
         super(Adagrad, self).__init__(params, defaults)
 
@@ -166,7 +167,7 @@ def step(self, closure=None):
                 has_sparse_grad=has_sparse_grad,
                 foreach=group["foreach"],
                 maximize=group["maximize"],
-                differentiable=group["differentiable"]
+                differentiable=group["differentiable"],
             )
 
         return loss
@@ -222,7 +223,7 @@ def adagrad(
         eps=eps,
         has_sparse_grad=has_sparse_grad,
         maximize=maximize,
-        differentiable=differentiable
+        differentiable=differentiable,
     )
 
 
@@ -245,7 +246,7 @@ def _single_tensor_adagrad(
     eps: float,
     has_sparse_grad: bool,
     maximize: bool,
-    differentiable: bool
+    differentiable: bool,
 ):
 
     for (param, grad, state_sum, step_t) in zip(params, grads, state_sums, state_steps):
@@ -304,7 +305,7 @@ def _multi_tensor_adagrad(
     eps: float,
     has_sparse_grad: bool,
     maximize: bool,
-    differentiable: bool
+    differentiable: bool,
 ):
 
     assert not differentiable, "_foreach ops don't support autograd"
diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index 50aad78781c9d..0560e7506d415 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -412,6 +412,7 @@ def _single_tensor_adam(params: List[Tensor],
             param.addcdiv_(exp_avg, denom, value=-step_size)
 
 
+
 def _multi_tensor_adam(params: List[Tensor],
                        grads: List[Tensor],
                        exp_avgs: List[Tensor],
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py
index 7e63c3c33b3c6..a3d0cdec86390 100644
--- a/torch/optim/adamax.py
+++ b/torch/optim/adamax.py
@@ -4,7 +4,8 @@
 from .optimizer import Optimizer, _use_grad_for_differentiable
 from typing import List, Optional
 
-__all__ = ['Adamax', 'adamax']
+__all__ = ["Adamax", "adamax"]
+
 
 class Adamax(Optimizer):
     r"""Implements Adamax algorithm (a variant of Adam based on infinity norm).
@@ -50,9 +51,18 @@ class Adamax(Optimizer):
         https://arxiv.org/abs/1412.6980
     """
 
-    def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8,
-                 weight_decay=0, foreach: Optional[bool] = None, *, maximize: bool = False,
-                 differentiable: bool = False):
+    def __init__(
+        self,
+        params,
+        lr=2e-3,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0,
+        foreach: Optional[bool] = None,
+        *,
+        maximize: bool = False,
+        differentiable: bool = False,
+    ):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 <= eps:
@@ -64,21 +74,30 @@ def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8,
         if not 0.0 <= weight_decay:
             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
 
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
-                        foreach=foreach, maximize=maximize, differentiable=differentiable)
+        defaults = dict(
+            lr=lr,
+            betas=betas,
+            eps=eps,
+            weight_decay=weight_decay,
+            foreach=foreach,
+            maximize=maximize,
+            differentiable=differentiable,
+        )
         super(Adamax, self).__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
         for group in self.param_groups:
-            group.setdefault('foreach', None)
-            group.setdefault('maximize', False)
-            group.setdefault('differentiable', False)
+            group.setdefault("foreach", None)
+            group.setdefault("maximize", False)
+            group.setdefault("differentiable", False)
         state_values = list(self.state.values())
-        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step'])
+        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(
+            state_values[0]["step"]
+        )
         if not step_is_tensor:
             for s in state_values:
-                s['step'] = torch.tensor(float(s['step']))
+                s["step"] = torch.tensor(float(s["step"]))
 
     @_use_grad_for_differentiable
     def step(self, closure=None):
@@ -100,114 +119,128 @@ def step(self, closure=None):
             exp_infs = []
             state_steps = []
 
-            beta1, beta2 = group['betas']
-            eps = group['eps']
-            lr = group['lr']
-            weight_decay = group['weight_decay']
-            foreach = group['foreach']
-            maximize = group['maximize']
-            differentiable = group['differentiable']
+            beta1, beta2 = group["betas"]
+            eps = group["eps"]
+            lr = group["lr"]
+            weight_decay = group["weight_decay"]
+            foreach = group["foreach"]
+            maximize = group["maximize"]
+            differentiable = group["differentiable"]
 
-            for p in group['params']:
+            for p in group["params"]:
                 if p.grad is None:
                     continue
                 params_with_grad.append(p)
                 if p.grad.is_sparse:
-                    raise RuntimeError('Adamax does not support sparse gradients')
+                    raise RuntimeError("Adamax does not support sparse gradients")
                 grads.append(p.grad)
 
                 state = self.state[p]
 
                 # State initialization
                 if len(state) == 0:
-                    state['step'] = torch.tensor(0.)
-                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                    state['exp_inf'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-                exp_avgs.append(state['exp_avg'])
-                exp_infs.append(state['exp_inf'])
-                state_steps.append(state['step'])
-
-            adamax(params_with_grad,
-                   grads,
-                   exp_avgs,
-                   exp_infs,
-                   state_steps,
-                   eps=eps,
-                   beta1=beta1,
-                   beta2=beta2,
-                   lr=lr,
-                   weight_decay=weight_decay,
-                   foreach=foreach,
-                   maximize=maximize,
-                   differentiable=differentiable)
+                    state["step"] = torch.tensor(0.0)
+                    state["exp_avg"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+                    state["exp_inf"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+
+                exp_avgs.append(state["exp_avg"])
+                exp_infs.append(state["exp_inf"])
+                state_steps.append(state["step"])
+
+            adamax(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_infs,
+                state_steps,
+                eps=eps,
+                beta1=beta1,
+                beta2=beta2,
+                lr=lr,
+                weight_decay=weight_decay,
+                foreach=foreach,
+                maximize=maximize,
+                differentiable=differentiable,
+            )
 
         return loss
 
 
-def adamax(params: List[Tensor],
-           grads: List[Tensor],
-           exp_avgs: List[Tensor],
-           exp_infs: List[Tensor],
-           state_steps: List[Tensor],
-           # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
-           # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
-           foreach: bool = None,
-           maximize: bool = False,
-           differentiable: bool = False,
-           *,
-           eps: float,
-           beta1: float,
-           beta2: float,
-           lr: float,
-           weight_decay: float):
+def adamax(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_infs: List[Tensor],
+    state_steps: List[Tensor],
+    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
+    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
+    foreach: bool = None,
+    maximize: bool = False,
+    differentiable: bool = False,
+    *,
+    eps: float,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+):
     r"""Functional API that performs adamax algorithm computation.
 
     See :class:`~torch.optim.Adamax` for details.
     """
 
     if not all(isinstance(t, torch.Tensor) for t in state_steps):
-        raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors")
+        raise RuntimeError(
+            "API has changed, `state_steps` argument must contain a list of singleton tensors"
+        )
 
     if foreach is None:
         # Placeholder for more complex foreach logic to be added when value is not set
         foreach = False
 
     if foreach and torch.jit.is_scripting():
-        raise RuntimeError('torch.jit.script not supported with foreach optimizers')
+        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
 
     if foreach and not torch.jit.is_scripting():
         func = _multi_tensor_adamax
     else:
         func = _single_tensor_adamax
 
-    func(params,
-         grads,
-         exp_avgs,
-         exp_infs,
-         state_steps,
-         eps=eps,
-         beta1=beta1,
-         beta2=beta2,
-         lr=lr,
-         weight_decay=weight_decay,
-         maximize=maximize,
-         differentiable=differentiable)
-
-
-def _single_tensor_adamax(params: List[Tensor],
-                          grads: List[Tensor],
-                          exp_avgs: List[Tensor],
-                          exp_infs: List[Tensor],
-                          state_steps: List[Tensor],
-                          *,
-                          eps: float,
-                          beta1: float,
-                          beta2: float,
-                          lr: float,
-                          weight_decay: float,
-                          maximize: bool,
-                          differentiable: bool):
+    func(
+        params,
+        grads,
+        exp_avgs,
+        exp_infs,
+        state_steps,
+        eps=eps,
+        beta1=beta1,
+        beta2=beta2,
+        lr=lr,
+        weight_decay=weight_decay,
+        maximize=maximize,
+        differentiable=differentiable,
+    )
+
+
+def _single_tensor_adamax(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_infs: List[Tensor],
+    state_steps: List[Tensor],
+    *,
+    eps: float,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+    maximize: bool,
+    differentiable: bool,
+):
 
     for i, param in enumerate(params):
         grad = grads[i]
@@ -231,35 +264,36 @@ def _single_tensor_adamax(params: List[Tensor],
         # Update biased first moment estimate.
         exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
         # Update the exponentially weighted infinity norm.
-        norm_buf = torch.cat([
-            exp_inf.mul_(beta2).unsqueeze(0),
-            grad.abs().add_(eps).unsqueeze_(0)
-        ], 0)
+        norm_buf = torch.cat(
+            [exp_inf.mul_(beta2).unsqueeze(0), grad.abs().add_(eps).unsqueeze_(0)], 0
+        )
 
         if not differentiable:
             torch.amax(norm_buf, 0, keepdim=False, out=exp_inf)
         else:
             exp_inf.copy_(torch.amax(norm_buf, 0, keepdim=False))
 
-        bias_correction = 1 - beta1 ** step
+        bias_correction = 1 - beta1**step
         clr = lr / bias_correction
 
         param.addcdiv_(exp_avg, exp_inf, value=-clr)
 
 
-def _multi_tensor_adamax(params: List[Tensor],
-                         grads: List[Tensor],
-                         exp_avgs: List[Tensor],
-                         exp_infs: List[Tensor],
-                         state_steps: List[Tensor],
-                         *,
-                         beta1: float,
-                         beta2: float,
-                         lr: float,
-                         weight_decay: float,
-                         eps: float,
-                         maximize: bool,
-                         differentiable: bool):
+def _multi_tensor_adamax(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_infs: List[Tensor],
+    state_steps: List[Tensor],
+    *,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+    eps: float,
+    maximize: bool,
+    differentiable: bool,
+):
 
     assert not differentiable, "_foreach ops don't support autograd"
 
@@ -288,10 +322,9 @@ def _multi_tensor_adamax(params: List[Tensor],
     torch._foreach_mul_(exp_infs, beta2)
 
     for exp_inf, grad in zip(exp_infs, grads):
-        norm_buf = torch.cat([
-            exp_inf.unsqueeze(0),
-            grad.abs().add_(eps).unsqueeze_(0)
-        ], 0)
+        norm_buf = torch.cat(
+            [exp_inf.unsqueeze(0), grad.abs().add_(eps).unsqueeze_(0)], 0
+        )
         torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long()))
 
     bias_corrections = [1 - beta1 ** step.item() for step in state_steps]
diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py
index 9855f05be84e9..5f6fca66ab25c 100644
--- a/torch/optim/adamw.py
+++ b/torch/optim/adamw.py
@@ -4,7 +4,8 @@
 from .optimizer import Optimizer, _use_grad_for_differentiable
 from typing import List, Optional
 
-__all__ = ['AdamW', 'adamw']
+__all__ = ["AdamW", "adamw"]
+
 
 class AdamW(Optimizer):
     r"""Implements AdamW algorithm.
@@ -72,11 +73,20 @@ class AdamW(Optimizer):
         https://openreview.net/forum?id=ryQu7f-RZ
     """
 
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
-                 weight_decay=1e-2, amsgrad=False, *, maximize: bool = False,
-                 foreach: Optional[bool] = None,
-                 capturable: bool = False,
-                 differentiable: bool = False):
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=1e-2,
+        amsgrad=False,
+        *,
+        maximize: bool = False,
+        foreach: Optional[bool] = None,
+        capturable: bool = False,
+        differentiable: bool = False,
+    ):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 <= eps:
@@ -87,25 +97,34 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
         if not 0.0 <= weight_decay:
             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
-        defaults = dict(lr=lr, betas=betas, eps=eps,
-                        weight_decay=weight_decay, amsgrad=amsgrad,
-                        foreach=foreach, maximize=maximize, capturable=capturable,
-                        differentiable=differentiable)
+        defaults = dict(
+            lr=lr,
+            betas=betas,
+            eps=eps,
+            weight_decay=weight_decay,
+            amsgrad=amsgrad,
+            foreach=foreach,
+            maximize=maximize,
+            capturable=capturable,
+            differentiable=differentiable,
+        )
         super(AdamW, self).__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
         for group in self.param_groups:
-            group.setdefault('amsgrad', False)
-            group.setdefault('maximize', False)
-            group.setdefault('foreach', None)
-            group.setdefault('capturable', False)
-            group.setdefault('differentiable', False)
+            group.setdefault("amsgrad", False)
+            group.setdefault("maximize", False)
+            group.setdefault("foreach", None)
+            group.setdefault("capturable", False)
+            group.setdefault("differentiable", False)
         state_values = list(self.state.values())
-        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step'])
+        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(
+            state_values[0]["step"]
+        )
         if not step_is_tensor:
             for s in state_values:
-                s['step'] = torch.tensor(float(s['step']))
+                s["step"] = torch.tensor(float(s["step"]))
 
     @_use_grad_for_differentiable
     def step(self, closure=None):
@@ -129,132 +148,151 @@ def step(self, closure=None):
             exp_avg_sqs = []
             max_exp_avg_sqs = []
             state_steps = []
-            amsgrad = group['amsgrad']
-            beta1, beta2 = group['betas']
-            differentiable = group['differentiable']
+            amsgrad = group["amsgrad"]
+            beta1, beta2 = group["betas"]
+            differentiable = group["differentiable"]
 
-            for p in group['params']:
+            for p in group["params"]:
                 if p.grad is None:
                     continue
                 params_with_grad.append(p)
                 if p.grad.is_sparse:
-                    raise RuntimeError('AdamW does not support sparse gradients')
+                    raise RuntimeError("AdamW does not support sparse gradients")
                 grads.append(p.grad)
 
                 state = self.state[p]
 
                 # State initialization
                 if len(state) == 0:
-                    state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \
-                        if self.defaults['capturable'] else torch.tensor(0.)
+                    state["step"] = (
+                        torch.zeros((1,), dtype=torch.float, device=p.device)
+                        if self.defaults["capturable"]
+                        else torch.tensor(0.0)
+                    )
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    state["exp_avg"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    state["exp_avg_sq"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
                     if amsgrad:
                         # Maintains max of all exp. moving avg. of sq. grad. values
-                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                        state["max_exp_avg_sq"] = torch.zeros_like(
+                            p, memory_format=torch.preserve_format
+                        )
 
-                exp_avgs.append(state['exp_avg'])
-                exp_avg_sqs.append(state['exp_avg_sq'])
+                exp_avgs.append(state["exp_avg"])
+                exp_avg_sqs.append(state["exp_avg_sq"])
 
                 if amsgrad:
-                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])
-
-                state_steps.append(state['step'])
-
-            adamw(params_with_grad,
-                  grads,
-                  exp_avgs,
-                  exp_avg_sqs,
-                  max_exp_avg_sqs,
-                  state_steps,
-                  amsgrad=amsgrad,
-                  beta1=beta1,
-                  beta2=beta2,
-                  lr=group['lr'],
-                  weight_decay=group['weight_decay'],
-                  eps=group['eps'],
-                  maximize=group['maximize'],
-                  foreach=group['foreach'],
-                  capturable=group['capturable'],
-                  differentiable=group['differentiable'])
+                    max_exp_avg_sqs.append(state["max_exp_avg_sq"])
+
+                state_steps.append(state["step"])
+
+            adamw(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=amsgrad,
+                beta1=beta1,
+                beta2=beta2,
+                lr=group["lr"],
+                weight_decay=group["weight_decay"],
+                eps=group["eps"],
+                maximize=group["maximize"],
+                foreach=group["foreach"],
+                capturable=group["capturable"],
+                differentiable=group["differentiable"],
+            )
 
         return loss
 
 
-def adamw(params: List[Tensor],
-          grads: List[Tensor],
-          exp_avgs: List[Tensor],
-          exp_avg_sqs: List[Tensor],
-          max_exp_avg_sqs: List[Tensor],
-          state_steps: List[Tensor],
-          # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
-          # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
-          foreach: bool = None,
-          capturable: bool = False,
-          differentiable: bool = False,
-          *,
-          amsgrad: bool,
-          beta1: float,
-          beta2: float,
-          lr: float,
-          weight_decay: float,
-          eps: float,
-          maximize: bool):
+def adamw(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_avg_sqs: List[Tensor],
+    max_exp_avg_sqs: List[Tensor],
+    state_steps: List[Tensor],
+    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
+    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
+    foreach: bool = None,
+    capturable: bool = False,
+    differentiable: bool = False,
+    *,
+    amsgrad: bool,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+    eps: float,
+    maximize: bool,
+):
     r"""Functional API that performs AdamW algorithm computation.
 
     See :class:`~torch.optim.AdamW` for details.
     """
 
     if not all(isinstance(t, torch.Tensor) for t in state_steps):
-        raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors")
+        raise RuntimeError(
+            "API has changed, `state_steps` argument must contain a list of singleton tensors"
+        )
 
     if foreach is None:
         # Placeholder for more complex foreach logic to be added when value is not set
         foreach = False
 
     if foreach and torch.jit.is_scripting():
-        raise RuntimeError('torch.jit.script not supported with foreach optimizers')
+        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
 
     if foreach and not torch.jit.is_scripting():
         func = _multi_tensor_adamw
     else:
         func = _single_tensor_adamw
 
-    func(params,
-         grads,
-         exp_avgs,
-         exp_avg_sqs,
-         max_exp_avg_sqs,
-         state_steps,
-         amsgrad=amsgrad,
-         beta1=beta1,
-         beta2=beta2,
-         lr=lr,
-         weight_decay=weight_decay,
-         eps=eps,
-         maximize=maximize,
-         capturable=capturable,
-         differentiable=differentiable)
-
-
-def _single_tensor_adamw(params: List[Tensor],
-                         grads: List[Tensor],
-                         exp_avgs: List[Tensor],
-                         exp_avg_sqs: List[Tensor],
-                         max_exp_avg_sqs: List[Tensor],
-                         state_steps: List[Tensor],
-                         *,
-                         amsgrad: bool,
-                         beta1: float,
-                         beta2: float,
-                         lr: float,
-                         weight_decay: float,
-                         eps: float,
-                         maximize: bool,
-                         capturable: bool,
-                         differentiable: bool):
+    func(
+        params,
+        grads,
+        exp_avgs,
+        exp_avg_sqs,
+        max_exp_avg_sqs,
+        state_steps,
+        amsgrad=amsgrad,
+        beta1=beta1,
+        beta2=beta2,
+        lr=lr,
+        weight_decay=weight_decay,
+        eps=eps,
+        maximize=maximize,
+        capturable=capturable,
+        differentiable=differentiable,
+    )
+
+
+def _single_tensor_adamw(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_avg_sqs: List[Tensor],
+    max_exp_avg_sqs: List[Tensor],
+    state_steps: List[Tensor],
+    *,
+    amsgrad: bool,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+    eps: float,
+    maximize: bool,
+    capturable: bool,
+    differentiable: bool,
+):
 
     for i, param in enumerate(params):
         grad = grads[i] if not maximize else -grads[i]
@@ -263,7 +301,9 @@ def _single_tensor_adamw(params: List[Tensor],
         step_t = state_steps[i]
 
         if capturable:
-            assert param.is_cuda and step_t.is_cuda, "If capturable=True, params and state_steps must be CUDA tensors."
+            assert (
+                param.is_cuda and step_t.is_cuda
+            ), "If capturable=True, params and state_steps must be CUDA tensors."
 
         if torch.is_complex(param):
             grad = torch.view_as_real(grad)
@@ -304,16 +344,20 @@ def _single_tensor_adamw(params: List[Tensor],
                 # Uses the max. for normalizing running avg. of gradient
                 # Folds in (admittedly ugly) 1-elem step_size math here to avoid extra param-set-sized read+write
                 # (can't fold it into addcdiv_ below because addcdiv_ requires value is a Number, not a Tensor)
-                denom = (max_exp_avg_sqs[i].sqrt() / (bias_correction2_sqrt * step_size_neg)).add_(eps / step_size_neg)
+                denom = (
+                    max_exp_avg_sqs[i].sqrt() / (bias_correction2_sqrt * step_size_neg)
+                ).add_(eps / step_size_neg)
             else:
-                denom = (exp_avg_sq.sqrt() / (bias_correction2_sqrt * step_size_neg)).add_(eps / step_size_neg)
+                denom = (
+                    exp_avg_sq.sqrt() / (bias_correction2_sqrt * step_size_neg)
+                ).add_(eps / step_size_neg)
 
             param.addcdiv_(exp_avg, denom)
         else:
             step = step_t.item()
 
-            bias_correction1 = 1 - beta1 ** step
-            bias_correction2 = 1 - beta2 ** step
+            bias_correction1 = 1 - beta1**step
+            bias_correction2 = 1 - beta2**step
 
             step_size = lr / bias_correction1
 
@@ -330,28 +374,31 @@ def _single_tensor_adamw(params: List[Tensor],
             param.addcdiv_(exp_avg, denom, value=-step_size)
 
 
-def _multi_tensor_adamw(params: List[Tensor],
-                        grads: List[Tensor],
-                        exp_avgs: List[Tensor],
-                        exp_avg_sqs: List[Tensor],
-                        max_exp_avg_sqs: List[Tensor],
-                        state_steps: List[Tensor],
-                        *,
-                        amsgrad: bool,
-                        beta1: float,
-                        beta2: float,
-                        lr: float,
-                        weight_decay: float,
-                        eps: float,
-                        maximize: bool,
-                        capturable: bool,
-                        differentiable: bool):
+def _multi_tensor_adamw(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_avg_sqs: List[Tensor],
+    max_exp_avg_sqs: List[Tensor],
+    state_steps: List[Tensor],
+    *,
+    amsgrad: bool,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+    eps: float,
+    maximize: bool,
+    capturable: bool,
+    differentiable: bool,
+):
     if len(params) == 0:
         return
 
     if capturable:
-        assert all(p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)), \
-            "If capturable=True, params and state_steps must be CUDA tensors."
+        assert all(
+            p.is_cuda and step.is_cuda for p, step in zip(params, state_steps)
+        ), "If capturable=True, params and state_steps must be CUDA tensors."
 
     if maximize:
         grads = torch._foreach_neg(tuple(grads))  # type: ignore[assignment]
@@ -360,7 +407,9 @@ def _multi_tensor_adamw(params: List[Tensor],
 
     grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in grads]
     exp_avgs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avgs]
-    exp_avg_sqs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avg_sqs]
+    exp_avg_sqs = [
+        torch.view_as_real(x) if torch.is_complex(x) else x for x in exp_avg_sqs
+    ]
     params = [torch.view_as_real(x) if torch.is_complex(x) else x for x in params]
 
     # update steps
@@ -401,13 +450,18 @@ def _multi_tensor_adamw(params: List[Tensor],
             max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs)
             # Folds in (admittedly ugly) 1-elem step_size math here to avoid extra param-set-sized read+write
             # (can't fold it into addcdiv_ below because addcdiv_ requires value is a Number, not a Tensor)
-            torch._foreach_div_(max_exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size))
+            torch._foreach_div_(
+                max_exp_avg_sq_sqrt,
+                torch._foreach_mul(bias_correction2_sqrt, step_size),
+            )
             eps_over_step_size = torch._foreach_div(step_size, eps)
             torch._foreach_reciprocal_(eps_over_step_size)
             denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps_over_step_size)
         else:
             exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
-            torch._foreach_div_(exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size))
+            torch._foreach_div_(
+                exp_avg_sq_sqrt, torch._foreach_mul(bias_correction2_sqrt, step_size)
+            )
             eps_over_step_size = torch._foreach_div(step_size, eps)
             torch._foreach_reciprocal_(eps_over_step_size)
             denom = torch._foreach_add(exp_avg_sq_sqrt, eps_over_step_size)
diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py
index d0b215e9573b0..9fbac14e83566 100644
--- a/torch/optim/asgd.py
+++ b/torch/optim/asgd.py
@@ -5,7 +5,8 @@
 from .optimizer import Optimizer, _use_grad_for_differentiable
 from typing import List, Optional
 
-__all__ = ['ASGD', 'asgd']
+__all__ = ["ASGD", "asgd"]
+
 
 class ASGD(Optimizer):
     """Implements Averaged Stochastic Gradient Descent.
@@ -30,38 +31,60 @@ class ASGD(Optimizer):
         https://dl.acm.org/citation.cfm?id=131098
     """
 
-    def __init__(self, params, lr=1e-2, lambd=1e-4, alpha=0.75, t0=1e6, weight_decay=0,
-                 foreach: Optional[bool] = None, maximize: bool = False,
-                 differentiable: bool = False):
+    def __init__(
+        self,
+        params,
+        lr=1e-2,
+        lambd=1e-4,
+        alpha=0.75,
+        t0=1e6,
+        weight_decay=0,
+        foreach: Optional[bool] = None,
+        maximize: bool = False,
+        differentiable: bool = False,
+    ):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 <= weight_decay:
             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
 
-        defaults = dict(lr=lr, lambd=lambd, alpha=alpha, t0=t0,
-                        weight_decay=weight_decay, foreach=foreach, maximize=maximize,
-                        differentiable=differentiable)
+        defaults = dict(
+            lr=lr,
+            lambd=lambd,
+            alpha=alpha,
+            t0=t0,
+            weight_decay=weight_decay,
+            foreach=foreach,
+            maximize=maximize,
+            differentiable=differentiable,
+        )
         super(ASGD, self).__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
         for group in self.param_groups:
-            group.setdefault('foreach', None)
-            group.setdefault('maximize', False)
-            group.setdefault('differentiable', False)
+            group.setdefault("foreach", None)
+            group.setdefault("maximize", False)
+            group.setdefault("differentiable", False)
         state_values = list(self.state.values())
-        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step'])
+        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(
+            state_values[0]["step"]
+        )
         if not step_is_tensor:
             for s in state_values:
-                s['step'] = torch.tensor(float(s['step']))
-        eta_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['eta'])
+                s["step"] = torch.tensor(float(s["step"]))
+        eta_is_tensor = (len(state_values) != 0) and torch.is_tensor(
+            state_values[0]["eta"]
+        )
         if not eta_is_tensor:
             for s in state_values:
-                s['eta'] = torch.tensor(s['eta'])
-        mu_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['mu'])
+                s["eta"] = torch.tensor(s["eta"])
+        mu_is_tensor = (len(state_values) != 0) and torch.is_tensor(
+            state_values[0]["mu"]
+        )
         if not mu_is_tensor:
             for s in state_values:
-                s['mu'] = torch.tensor(float(s['mu']))
+                s["mu"] = torch.tensor(float(s["mu"]))
 
     @_use_grad_for_differentiable
     def step(self, closure=None):
@@ -84,61 +107,67 @@ def step(self, closure=None):
             etas = []
             state_steps = []
 
-            for p in group['params']:
+            for p in group["params"]:
                 if p.grad is not None:
                     params_with_grad.append(p)
                     if p.grad.is_sparse:
-                        raise RuntimeError('ASGD does not support sparse gradients')
+                        raise RuntimeError("ASGD does not support sparse gradients")
                     grads.append(p.grad)
 
                     state = self.state[p]
                     # State initialization
                     if len(state) == 0:
-                        state['step'] = torch.tensor(0.)
-                        state['eta'] = torch.tensor(group['lr'])
-                        state['mu'] = torch.tensor(1.)
-                        state['ax'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-                    mus.append(state['mu'])
-                    axs.append(state['ax'])
-                    etas.append(state['eta'])
-                    state_steps.append(state['step'])
-
-            asgd(params_with_grad,
-                 grads,
-                 axs,
-                 mus,
-                 etas,
-                 state_steps,
-                 lambd=group['lambd'],
-                 lr=group['lr'],
-                 t0=group['t0'],
-                 alpha=group['alpha'],
-                 weight_decay=group['weight_decay'],
-                 foreach=group['foreach'],
-                 maximize=group['maximize'],
-                 differentiable=group['differentiable'])
+                        state["step"] = torch.tensor(0.0)
+                        state["eta"] = torch.tensor(group["lr"])
+                        state["mu"] = torch.tensor(1.0)
+                        state["ax"] = torch.zeros_like(
+                            p, memory_format=torch.preserve_format
+                        )
+
+                    mus.append(state["mu"])
+                    axs.append(state["ax"])
+                    etas.append(state["eta"])
+                    state_steps.append(state["step"])
+
+            asgd(
+                params_with_grad,
+                grads,
+                axs,
+                mus,
+                etas,
+                state_steps,
+                lambd=group["lambd"],
+                lr=group["lr"],
+                t0=group["t0"],
+                alpha=group["alpha"],
+                weight_decay=group["weight_decay"],
+                foreach=group["foreach"],
+                maximize=group["maximize"],
+                differentiable=group["differentiable"],
+            )
 
         return loss
 
 
-def asgd(params: List[Tensor],
-         grads: List[Tensor],
-         axs: List[Tensor],
-         mus: List[Tensor],
-         etas: List[Tensor],
-         state_steps: List[Tensor],
-         # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
-         # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
-         foreach: bool = None,
-         maximize: bool = False,
-         differentiable: bool = False,
-         *,
-         lambd: float,
-         lr: float,
-         t0: float,
-         alpha: float,
-         weight_decay: float):
+def asgd(
+    params: List[Tensor],
+    grads: List[Tensor],
+    axs: List[Tensor],
+    mus: List[Tensor],
+    etas: List[Tensor],
+    state_steps: List[Tensor],
+    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
+    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
+    foreach: bool = None,
+    maximize: bool = False,
+    differentiable: bool = False,
+    *,
+    lambd: float,
+    lr: float,
+    t0: float,
+    alpha: float,
+    weight_decay: float,
+):
     r"""Functional API that performs asgd algorithm computation.
 
     See :class:`~torch.optim.ASGD` for details.
@@ -149,42 +178,46 @@ def asgd(params: List[Tensor],
         foreach = False
 
     if foreach and torch.jit.is_scripting():
-        raise RuntimeError('torch.jit.script not supported with foreach optimizers')
+        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
 
     if foreach and not torch.jit.is_scripting():
         func = _multi_tensor_asgd
     else:
         func = _single_tensor_asgd
 
-    func(params,
-         grads,
-         axs,
-         mus,
-         etas,
-         state_steps,
-         lambd=lambd,
-         lr=lr,
-         t0=t0,
-         alpha=alpha,
-         weight_decay=weight_decay,
-         maximize=maximize,
-         differentiable=differentiable)
-
-
-def _single_tensor_asgd(params: List[Tensor],
-                        grads: List[Tensor],
-                        axs: List[Tensor],
-                        mus: List[Tensor],
-                        etas: List[Tensor],
-                        state_steps: List[Tensor],
-                        *,
-                        lambd: float,
-                        lr: float,
-                        t0: float,
-                        alpha: float,
-                        weight_decay: float,
-                        maximize: bool,
-                        differentiable: bool):
+    func(
+        params,
+        grads,
+        axs,
+        mus,
+        etas,
+        state_steps,
+        lambd=lambd,
+        lr=lr,
+        t0=t0,
+        alpha=alpha,
+        weight_decay=weight_decay,
+        maximize=maximize,
+        differentiable=differentiable,
+    )
+
+
+def _single_tensor_asgd(
+    params: List[Tensor],
+    grads: List[Tensor],
+    axs: List[Tensor],
+    mus: List[Tensor],
+    etas: List[Tensor],
+    state_steps: List[Tensor],
+    *,
+    lambd: float,
+    lr: float,
+    t0: float,
+    alpha: float,
+    weight_decay: float,
+    maximize: bool,
+    differentiable: bool,
+):
 
     for i, param in enumerate(params):
         grad = grads[i]
@@ -224,20 +257,22 @@ def _single_tensor_asgd(params: List[Tensor],
         mu.copy_(new_mu)
 
 
-def _multi_tensor_asgd(params: List[Tensor],
-                       grads: List[Tensor],
-                       axs: List[Tensor],
-                       mus: List[Tensor],
-                       etas: List[Tensor],
-                       state_steps: List[Tensor],
-                       *,
-                       lambd: float,
-                       lr: float,
-                       t0: float,
-                       alpha: float,
-                       weight_decay: float,
-                       maximize: bool,
-                       differentiable: bool):
+def _multi_tensor_asgd(
+    params: List[Tensor],
+    grads: List[Tensor],
+    axs: List[Tensor],
+    mus: List[Tensor],
+    etas: List[Tensor],
+    state_steps: List[Tensor],
+    *,
+    lambd: float,
+    lr: float,
+    t0: float,
+    alpha: float,
+    weight_decay: float,
+    maximize: bool,
+    differentiable: bool,
+):
 
     if len(params) == 0:
         return
@@ -248,7 +283,9 @@ def _multi_tensor_asgd(params: List[Tensor],
         grads = torch._foreach_neg(grads)
 
     def _view_complex_as_real(tensor_list):
-        return [torch.view_as_real(t) if torch.is_complex(t) else t for t in tensor_list]
+        return [
+            torch.view_as_real(t) if torch.is_complex(t) else t for t in tensor_list
+        ]
 
     grads = _view_complex_as_real(grads)
     params = _view_complex_as_real(params)
@@ -276,7 +313,9 @@ def _view_complex_as_real(tensor_list):
 
     # update eta and mu
     for i in range(len(mus)):
-        new_eta = torch.tensor(lr / math.pow((1 + lambd * lr * state_steps[i].item()), alpha))
+        new_eta = torch.tensor(
+            lr / math.pow((1 + lambd * lr * state_steps[i].item()), alpha)
+        )
         etas[i].copy_(new_eta)
         new_mu = torch.tensor(1 / max(1, state_steps[i].item() - t0))
         mus[i].copy_(new_mu)
diff --git a/torch/optim/nadam.py b/torch/optim/nadam.py
index 54d0c37e0672c..59f3b790b3132 100644
--- a/torch/optim/nadam.py
+++ b/torch/optim/nadam.py
@@ -174,6 +174,7 @@ def nadam(params: List[Tensor],
     See :class:`~torch.optim.NAdam` for details.
     """
 
+
     if not all(isinstance(t, torch.Tensor) for t in state_steps):
         raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors")
 
diff --git a/torch/optim/radam.py b/torch/optim/radam.py
index c389e48ccf3fd..4a3d271e65d33 100644
--- a/torch/optim/radam.py
+++ b/torch/optim/radam.py
@@ -5,7 +5,8 @@
 from .optimizer import Optimizer, _use_grad_for_differentiable
 from typing import List, Optional
 
-__all__ = ['RAdam', 'radam']
+__all__ = ["RAdam", "radam"]
+
 
 class RAdam(Optimizer):
     r"""Implements RAdam algorithm.
@@ -66,9 +67,17 @@ class RAdam(Optimizer):
         https://github.com/LiyuanLucasLiu/RAdam
     """
 
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
-                 weight_decay=0, *, foreach: Optional[bool] = None,
-                 differentiable: bool = False):
+    def __init__(
+        self,
+        params,
+        lr=1e-3,
+        betas=(0.9, 0.999),
+        eps=1e-8,
+        weight_decay=0,
+        *,
+        foreach: Optional[bool] = None,
+        differentiable: bool = False,
+    ):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 <= eps:
@@ -79,20 +88,28 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
             raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
         if not 0.0 <= weight_decay:
             raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
-                        foreach=foreach, differentiable=differentiable)
+        defaults = dict(
+            lr=lr,
+            betas=betas,
+            eps=eps,
+            weight_decay=weight_decay,
+            foreach=foreach,
+            differentiable=differentiable,
+        )
         super(RAdam, self).__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
         for group in self.param_groups:
-            group.setdefault('foreach', None)
-            group.setdefault('differentiable', False)
+            group.setdefault("foreach", None)
+            group.setdefault("differentiable", False)
         state_values = list(self.state.values())
-        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step'])
+        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(
+            state_values[0]["step"]
+        )
         if not step_is_tensor:
             for s in state_values:
-                s['step'] = torch.tensor(float(s['step']))
+                s["step"] = torch.tensor(float(s["step"]))
 
     @_use_grad_for_differentiable
     def step(self, closure=None):
@@ -113,104 +130,118 @@ def step(self, closure=None):
             exp_avgs = []
             exp_avg_sqs = []
             state_steps = []
-            beta1, beta2 = group['betas']
+            beta1, beta2 = group["betas"]
 
-            for p in group['params']:
+            for p in group["params"]:
                 if p.grad is not None:
                     params_with_grad.append(p)
                     if p.grad.is_sparse:
-                        raise RuntimeError('RAdam does not support sparse gradients')
+                        raise RuntimeError("RAdam does not support sparse gradients")
                     grads.append(p.grad)
 
                     state = self.state[p]
                     # Lazy state initialization
                     if len(state) == 0:
-                        state['step'] = torch.tensor(0.)
+                        state["step"] = torch.tensor(0.0)
                         # Exponential moving average of gradient values
-                        state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                        state["exp_avg"] = torch.zeros_like(
+                            p, memory_format=torch.preserve_format
+                        )
                         # Exponential moving average of squared gradient values
-                        state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-                    exp_avgs.append(state['exp_avg'])
-                    exp_avg_sqs.append(state['exp_avg_sq'])
-                    state_steps.append(state['step'])
-
-            radam(params_with_grad,
-                  grads,
-                  exp_avgs,
-                  exp_avg_sqs,
-                  state_steps,
-                  beta1=beta1,
-                  beta2=beta2,
-                  lr=group['lr'],
-                  weight_decay=group['weight_decay'],
-                  eps=group['eps'],
-                  foreach=group['foreach'],
-                  differentiable=group['differentiable'])
+                        state["exp_avg_sq"] = torch.zeros_like(
+                            p, memory_format=torch.preserve_format
+                        )
+
+                    exp_avgs.append(state["exp_avg"])
+                    exp_avg_sqs.append(state["exp_avg_sq"])
+                    state_steps.append(state["step"])
+
+            radam(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                state_steps,
+                beta1=beta1,
+                beta2=beta2,
+                lr=group["lr"],
+                weight_decay=group["weight_decay"],
+                eps=group["eps"],
+                foreach=group["foreach"],
+                differentiable=group["differentiable"],
+            )
 
         return loss
 
 
-def radam(params: List[Tensor],
-          grads: List[Tensor],
-          exp_avgs: List[Tensor],
-          exp_avg_sqs: List[Tensor],
-          state_steps: List[Tensor],
-          # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
-          # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
-          foreach: bool = None,
-          differentiable: bool = False,
-          *,
-          beta1: float,
-          beta2: float,
-          lr: float,
-          weight_decay: float,
-          eps: float):
+def radam(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_avg_sqs: List[Tensor],
+    state_steps: List[Tensor],
+    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
+    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
+    foreach: bool = None,
+    differentiable: bool = False,
+    *,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+    eps: float,
+):
     r"""Functional API that performs RAdam algorithm computation.
 
     See :class:`~torch.optim.RAdam` for details.
     """
 
     if not all(isinstance(t, torch.Tensor) for t in state_steps):
-        raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors")
+        raise RuntimeError(
+            "API has changed, `state_steps` argument must contain a list of singleton tensors"
+        )
 
     if foreach is None:
         # Placeholder for more complex foreach logic to be added when value is not set
         foreach = False
 
     if foreach and torch.jit.is_scripting():
-        raise RuntimeError('torch.jit.script not supported with foreach optimizers')
+        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
 
     if foreach and not torch.jit.is_scripting():
         func = _multi_tensor_radam
     else:
         func = _single_tensor_radam
 
-    func(params,
-         grads,
-         exp_avgs,
-         exp_avg_sqs,
-         state_steps,
-         beta1=beta1,
-         beta2=beta2,
-         lr=lr,
-         weight_decay=weight_decay,
-         eps=eps,
-         differentiable=differentiable)
-
-
-def _single_tensor_radam(params: List[Tensor],
-                         grads: List[Tensor],
-                         exp_avgs: List[Tensor],
-                         exp_avg_sqs: List[Tensor],
-                         state_steps: List[Tensor],
-                         *,
-                         beta1: float,
-                         beta2: float,
-                         lr: float,
-                         weight_decay: float,
-                         eps: float,
-                         differentiable: bool):
+    func(
+        params,
+        grads,
+        exp_avgs,
+        exp_avg_sqs,
+        state_steps,
+        beta1=beta1,
+        beta2=beta2,
+        lr=lr,
+        weight_decay=weight_decay,
+        eps=eps,
+        differentiable=differentiable,
+    )
+
+
+def _single_tensor_radam(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_avg_sqs: List[Tensor],
+    state_steps: List[Tensor],
+    *,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+    eps: float,
+    differentiable: bool,
+):
 
     for i, param in enumerate(params):
         grad = grads[i]
@@ -221,8 +252,8 @@ def _single_tensor_radam(params: List[Tensor],
         step_t += 1
         step = step_t.item()
 
-        bias_correction1 = 1 - beta1 ** step
-        bias_correction2 = 1 - beta2 ** step
+        bias_correction1 = 1 - beta1**step
+        bias_correction2 = 1 - beta2**step
 
         if weight_decay != 0:
             grad = grad.add(param, alpha=weight_decay)
@@ -237,11 +268,16 @@ def _single_tensor_radam(params: List[Tensor],
         # maximum length of the approximated SMA
         rho_inf = 2 / (1 - beta2) - 1
         # compute the length of the approximated SMA
-        rho_t = rho_inf - 2 * step * (beta2 ** step) / bias_correction2
+        rho_t = rho_inf - 2 * step * (beta2**step) / bias_correction2
 
-        if rho_t > 5.:
+        if rho_t > 5.0:
             # Compute the variance rectification term and update parameters accordingly
-            rect = math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
+            rect = math.sqrt(
+                (rho_t - 4)
+                * (rho_t - 2)
+                * rho_inf
+                / ((rho_inf - 4) * (rho_inf - 2) * rho_t)
+            )
             exp_avg_sq_sqrt = exp_avg_sq.sqrt()
             if differentiable:
                 exp_avg_sq_sqrt = exp_avg_sq_sqrt.add(eps)
@@ -253,18 +289,20 @@ def _single_tensor_radam(params: List[Tensor],
             param.add_(bias_corrected_exp_avg * lr, alpha=-1.0)
 
 
-def _multi_tensor_radam(params: List[Tensor],
-                        grads: List[Tensor],
-                        exp_avgs: List[Tensor],
-                        exp_avg_sqs: List[Tensor],
-                        state_steps: List[Tensor],
-                        *,
-                        beta1: float,
-                        beta2: float,
-                        lr: float,
-                        weight_decay: float,
-                        eps: float,
-                        differentiable: bool):
+def _multi_tensor_radam(
+    params: List[Tensor],
+    grads: List[Tensor],
+    exp_avgs: List[Tensor],
+    exp_avg_sqs: List[Tensor],
+    state_steps: List[Tensor],
+    *,
+    beta1: float,
+    beta2: float,
+    lr: float,
+    weight_decay: float,
+    eps: float,
+    differentiable: bool,
+):
 
     if len(params) == 0:
         return
@@ -277,7 +315,10 @@ def _multi_tensor_radam(params: List[Tensor],
     # maximum length of the approximated SMA
     rho_inf = 2 / (1 - beta2) - 1
     # compute the length of the approximated SMA
-    rho_t_list = [rho_inf - 2 * step.item() * (beta2 ** step.item()) / (1 - beta2 ** step.item()) for step in state_steps]
+    rho_t_list = [
+        rho_inf - 2 * step.item() * (beta2 ** step.item()) / (1 - beta2 ** step.item())
+        for step in state_steps
+    ]
 
     bias_correction1 = [1 - beta1 ** step.item() for step in state_steps]
     bias_correction2 = [1 - beta2 ** step.item() for step in state_steps]
@@ -291,9 +332,18 @@ def _multi_tensor_radam(params: List[Tensor],
     torch._foreach_mul_(exp_avg_sqs, beta2)
     torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2)
 
-    rect = [math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
-            if rho_t > 5 else 0 for rho_t in rho_t_list]
-    unrectified = [0 if rect > 0 else 1. for rect in rect]
+    rect = [
+        math.sqrt(
+            (rho_t - 4)
+            * (rho_t - 2)
+            * rho_inf
+            / ((rho_inf - 4) * (rho_inf - 2) * rho_t)
+        )
+        if rho_t > 5
+        else 0
+        for rho_t in rho_t_list
+    ]
+    unrectified = [0 if rect > 0 else 1.0 for rect in rect]
 
     exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs)
     bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2]
@@ -301,6 +351,11 @@ def _multi_tensor_radam(params: List[Tensor],
     step_size = [(lr * rect / bc) * -1 for rect, bc in zip(rect, bias_correction1)]
     torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
 
-    denom = [torch.ones_like(exp_av, memory_format=torch.preserve_format) for exp_av in exp_avgs]
-    step_size = [(lr * rect / bc) * -1 for rect, bc in zip(unrectified, bias_correction1)]
+    denom = [
+        torch.ones_like(exp_av, memory_format=torch.preserve_format)
+        for exp_av in exp_avgs
+    ]
+    step_size = [
+        (lr * rect / bc) * -1 for rect, bc in zip(unrectified, bias_correction1)
+    ]
     torch._foreach_addcdiv_(params, exp_avgs, denom, step_size)
diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py
index 22a5bd4488a79..0bf97158e8fa9 100644
--- a/torch/optim/rmsprop.py
+++ b/torch/optim/rmsprop.py
@@ -3,7 +3,8 @@
 from .optimizer import Optimizer, _use_grad_for_differentiable
 from typing import List, Optional
 
-__all__ = ['RMSprop', 'rmsprop']
+__all__ = ["RMSprop", "rmsprop"]
+
 
 class RMSprop(Optimizer):
     r"""Implements RMSprop algorithm.
@@ -67,9 +68,19 @@ class RMSprop(Optimizer):
 
     """
 
-    def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0,
-                 centered=False, foreach: Optional[bool] = None, maximize: bool = False,
-                 differentiable: bool = False):
+    def __init__(
+        self,
+        params,
+        lr=1e-2,
+        alpha=0.99,
+        eps=1e-8,
+        weight_decay=0,
+        momentum=0,
+        centered=False,
+        foreach: Optional[bool] = None,
+        maximize: bool = False,
+        differentiable: bool = False,
+    ):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 <= eps:
@@ -81,19 +92,27 @@ def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, moment
         if not 0.0 <= alpha:
             raise ValueError("Invalid alpha value: {}".format(alpha))
 
-        defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered,
-                        weight_decay=weight_decay, foreach=foreach, maximize=maximize,
-                        differentiable=differentiable)
+        defaults = dict(
+            lr=lr,
+            momentum=momentum,
+            alpha=alpha,
+            eps=eps,
+            centered=centered,
+            weight_decay=weight_decay,
+            foreach=foreach,
+            maximize=maximize,
+            differentiable=differentiable,
+        )
         super(RMSprop, self).__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
         for group in self.param_groups:
-            group.setdefault('momentum', 0)
-            group.setdefault('centered', False)
-            group.setdefault('foreach', None)
-            group.setdefault('maximize', False)
-            group.setdefault('differentiable', False)
+            group.setdefault("momentum", 0)
+            group.setdefault("centered", False)
+            group.setdefault("foreach", None)
+            group.setdefault("maximize", False)
+            group.setdefault("differentiable", False)
 
     @_use_grad_for_differentiable
     def step(self, closure=None):
@@ -115,73 +134,82 @@ def step(self, closure=None):
             grad_avgs = []
             momentum_buffer_list = []
 
-            for p in group['params']:
+            for p in group["params"]:
                 if p.grad is None:
                     continue
                 params_with_grad.append(p)
 
                 if p.grad.is_sparse:
-                    raise RuntimeError('RMSprop does not support sparse gradients')
+                    raise RuntimeError("RMSprop does not support sparse gradients")
                 grads.append(p.grad)
 
                 state = self.state[p]
 
                 # State initialization
                 if len(state) == 0:
-                    state['step'] = 0
-                    state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                    if group['momentum'] > 0:
-                        state['momentum_buffer'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                    if group['centered']:
-                        state['grad_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                square_avgs.append(state['square_avg'])
-
-                if group['momentum'] > 0:
-                    momentum_buffer_list.append(state['momentum_buffer'])
-                if group['centered']:
-                    grad_avgs.append(state['grad_avg'])
-
-                if group['differentiable'] and isinstance(state['step'], Tensor):
-                    raise RuntimeError('`step` can\'t be a tensor')
-
-                state['step'] += 1
-
-
-            rmsprop(params_with_grad,
-                    grads,
-                    square_avgs,
-                    grad_avgs,
-                    momentum_buffer_list,
-                    lr=group['lr'],
-                    alpha=group['alpha'],
-                    eps=group['eps'],
-                    weight_decay=group['weight_decay'],
-                    momentum=group['momentum'],
-                    centered=group['centered'],
-                    foreach=group['foreach'],
-                    maximize=group["maximize"],
-                    differentiable=group["differentiable"])
+                    state["step"] = 0
+                    state["square_avg"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+                    if group["momentum"] > 0:
+                        state["momentum_buffer"] = torch.zeros_like(
+                            p, memory_format=torch.preserve_format
+                        )
+                    if group["centered"]:
+                        state["grad_avg"] = torch.zeros_like(
+                            p, memory_format=torch.preserve_format
+                        )
+                square_avgs.append(state["square_avg"])
+
+                if group["momentum"] > 0:
+                    momentum_buffer_list.append(state["momentum_buffer"])
+                if group["centered"]:
+                    grad_avgs.append(state["grad_avg"])
+
+                if group["differentiable"] and isinstance(state["step"], Tensor):
+                    raise RuntimeError("`step` can't be a tensor")
+
+                state["step"] += 1
+
+            rmsprop(
+                params_with_grad,
+                grads,
+                square_avgs,
+                grad_avgs,
+                momentum_buffer_list,
+                lr=group["lr"],
+                alpha=group["alpha"],
+                eps=group["eps"],
+                weight_decay=group["weight_decay"],
+                momentum=group["momentum"],
+                centered=group["centered"],
+                foreach=group["foreach"],
+                maximize=group["maximize"],
+                differentiable=group["differentiable"],
+            )
 
         return loss
 
 
-def rmsprop(params: List[Tensor],
-            grads: List[Tensor],
-            square_avgs: List[Tensor],
-            grad_avgs: List[Tensor],
-            momentum_buffer_list: List[Tensor],
-            # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
-            # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
-            foreach: bool = None,
-            maximize: bool = False,
-            differentiable: bool = False,
-            *,
-            lr: float,
-            alpha: float,
-            eps: float,
-            weight_decay: float,
-            momentum: float,
-            centered: bool):
+def rmsprop(
+    params: List[Tensor],
+    grads: List[Tensor],
+    square_avgs: List[Tensor],
+    grad_avgs: List[Tensor],
+    momentum_buffer_list: List[Tensor],
+    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
+    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
+    foreach: bool = None,
+    maximize: bool = False,
+    differentiable: bool = False,
+    *,
+    lr: float,
+    alpha: float,
+    eps: float,
+    weight_decay: float,
+    momentum: float,
+    centered: bool,
+):
     r"""Functional API that performs rmsprop algorithm computation.
     See :class:`~torch.optim.RMSProp` for details.
     """
@@ -191,42 +219,46 @@ def rmsprop(params: List[Tensor],
         foreach = False
 
     if foreach and torch.jit.is_scripting():
-        raise RuntimeError('torch.jit.script not supported with foreach optimizers')
+        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
 
     if foreach and not torch.jit.is_scripting():
         func = _multi_tensor_rmsprop
     else:
         func = _single_tensor_rmsprop
 
-    func(params,
-         grads,
-         square_avgs,
-         grad_avgs,
-         momentum_buffer_list,
-         lr=lr,
-         alpha=alpha,
-         eps=eps,
-         weight_decay=weight_decay,
-         momentum=momentum,
-         centered=centered,
-         maximize=maximize,
-         differentiable=differentiable)
-
-
-def _single_tensor_rmsprop(params: List[Tensor],
-                           grads: List[Tensor],
-                           square_avgs: List[Tensor],
-                           grad_avgs: List[Tensor],
-                           momentum_buffer_list: List[Tensor],
-                           *,
-                           lr: float,
-                           alpha: float,
-                           eps: float,
-                           weight_decay: float,
-                           momentum: float,
-                           centered: bool,
-                           maximize: bool,
-                           differentiable: bool):
+    func(
+        params,
+        grads,
+        square_avgs,
+        grad_avgs,
+        momentum_buffer_list,
+        lr=lr,
+        alpha=alpha,
+        eps=eps,
+        weight_decay=weight_decay,
+        momentum=momentum,
+        centered=centered,
+        maximize=maximize,
+        differentiable=differentiable,
+    )
+
+
+def _single_tensor_rmsprop(
+    params: List[Tensor],
+    grads: List[Tensor],
+    square_avgs: List[Tensor],
+    grad_avgs: List[Tensor],
+    momentum_buffer_list: List[Tensor],
+    *,
+    lr: float,
+    alpha: float,
+    eps: float,
+    weight_decay: float,
+    momentum: float,
+    centered: bool,
+    maximize: bool,
+    differentiable: bool,
+):
 
     for i, param in enumerate(params):
         grad = grads[i]
@@ -268,20 +300,22 @@ def _single_tensor_rmsprop(params: List[Tensor],
             param.addcdiv_(grad, avg, value=-lr)
 
 
-def _multi_tensor_rmsprop(params: List[Tensor],
-                          grads: List[Tensor],
-                          square_avgs: List[Tensor],
-                          grad_avgs: List[Tensor],
-                          momentum_buffer_list: List[Tensor],
-                          *,
-                          lr: float,
-                          alpha: float,
-                          eps: float,
-                          weight_decay: float,
-                          momentum: float,
-                          centered: bool,
-                          maximize: bool,
-                          differentiable: bool):
+def _multi_tensor_rmsprop(
+    params: List[Tensor],
+    grads: List[Tensor],
+    square_avgs: List[Tensor],
+    grad_avgs: List[Tensor],
+    momentum_buffer_list: List[Tensor],
+    *,
+    lr: float,
+    alpha: float,
+    eps: float,
+    weight_decay: float,
+    momentum: float,
+    centered: bool,
+    maximize: bool,
+    differentiable: bool,
+):
 
     if len(params) == 0:
         return
@@ -295,7 +329,9 @@ def _multi_tensor_rmsprop(params: List[Tensor],
         torch._foreach_add_(grads, params, alpha=weight_decay)
 
     def _view_complex_as_real(tensor_list):
-        return [torch.view_as_real(t) if torch.is_complex(t) else t for t in tensor_list]
+        return [
+            torch.view_as_real(t) if torch.is_complex(t) else t for t in tensor_list
+        ]
 
     grads = _view_complex_as_real(grads)
     params = _view_complex_as_real(params)
diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py
index 20e196a09df95..feab409291537 100644
--- a/torch/optim/rprop.py
+++ b/torch/optim/rprop.py
@@ -3,7 +3,8 @@
 from .optimizer import Optimizer, _use_grad_for_differentiable
 from typing import List, Optional
 
-__all__ = ['Rprop', 'rprop']
+__all__ = ["Rprop", "rprop"]
+
 
 class Rprop(Optimizer):
     r"""Implements the resilient backpropagation algorithm.
@@ -56,23 +57,38 @@ class Rprop(Optimizer):
             minimizing (default: False)
     """
 
-    def __init__(self, params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50),
-                 *, foreach: Optional[bool] = None, maximize: bool = False,
-                 differentiable: bool = False):
+    def __init__(
+        self,
+        params,
+        lr=1e-2,
+        etas=(0.5, 1.2),
+        step_sizes=(1e-6, 50),
+        *,
+        foreach: Optional[bool] = None,
+        maximize: bool = False,
+        differentiable: bool = False,
+    ):
         if not 0.0 <= lr:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if not 0.0 < etas[0] < 1.0 < etas[1]:
             raise ValueError("Invalid eta values: {}, {}".format(etas[0], etas[1]))
 
-        defaults = dict(lr=lr, etas=etas, step_sizes=step_sizes, foreach=foreach, maximize=maximize, differentiable=differentiable)
+        defaults = dict(
+            lr=lr,
+            etas=etas,
+            step_sizes=step_sizes,
+            foreach=foreach,
+            maximize=maximize,
+            differentiable=differentiable,
+        )
         super(Rprop, self).__init__(params, defaults)
 
     def __setstate__(self, state):
         super().__setstate__(state)
         for group in self.param_groups:
-            group.setdefault('foreach', None)
-            group.setdefault('maximize', False)
-            group.setdefault('differentiable', False)
+            group.setdefault("foreach", None)
+            group.setdefault("maximize", False)
+            group.setdefault("differentiable", False)
 
     @_use_grad_for_differentiable
     def step(self, closure=None):
@@ -92,67 +108,79 @@ def step(self, closure=None):
             grads = []
             prevs = []
             step_sizes = []
-            etaminus, etaplus = group['etas']
-            step_size_min, step_size_max = group['step_sizes']
-            foreach = group['foreach']
-            maximize = group['maximize']
+            etaminus, etaplus = group["etas"]
+            step_size_min, step_size_max = group["step_sizes"]
+            foreach = group["foreach"]
+            maximize = group["maximize"]
 
-            for p in group['params']:
+            for p in group["params"]:
                 if p.grad is None:
                     continue
                 params.append(p)
                 grad = p.grad
                 if grad.is_sparse:
-                    raise RuntimeError('Rprop does not support sparse gradients')
+                    raise RuntimeError("Rprop does not support sparse gradients")
 
                 grads.append(grad)
                 state = self.state[p]
 
                 # State initialization
                 if len(state) == 0:
-                    state['step'] = 0
-                    state['prev'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    state["step"] = 0
+                    state["prev"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
                     if p.dtype.is_complex:
                         # Complex Number should be as if they are two independent real numbers.
                         # Hence the step_size shouldn't be zero for imaginary part.
-                        state['step_size'] = grad.new().resize_as_(grad).fill_(complex(group['lr'], group['lr']))
+                        state["step_size"] = (
+                            grad.new()
+                            .resize_as_(grad)
+                            .fill_(complex(group["lr"], group["lr"]))
+                        )
                     else:
-                        state['step_size'] = grad.new().resize_as_(grad).fill_(group['lr'])
-
-                prevs.append(state['prev'])
-                step_sizes.append(state['step_size'])
-
-                state['step'] += 1
-
-            rprop(params,
-                  grads,
-                  prevs,
-                  step_sizes,
-                  step_size_min=step_size_min,
-                  step_size_max=step_size_max,
-                  etaminus=etaminus,
-                  etaplus=etaplus,
-                  foreach=foreach,
-                  maximize=maximize,
-                  differentiable=group['differentiable'])
+                        state["step_size"] = (
+                            grad.new().resize_as_(grad).fill_(group["lr"])
+                        )
+
+                prevs.append(state["prev"])
+                step_sizes.append(state["step_size"])
+
+                state["step"] += 1
+
+            rprop(
+                params,
+                grads,
+                prevs,
+                step_sizes,
+                step_size_min=step_size_min,
+                step_size_max=step_size_max,
+                etaminus=etaminus,
+                etaplus=etaplus,
+                foreach=foreach,
+                maximize=maximize,
+                differentiable=group["differentiable"],
+            )
 
         return loss
 
 
-def rprop(params: List[Tensor],
-          grads: List[Tensor],
-          prevs: List[Tensor],
-          step_sizes: List[Tensor],
-          # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
-          # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
-          foreach: bool = None,
-          maximize: bool = False,
-          differentiable: bool = False,
-          *,
-          step_size_min: float,
-          step_size_max: float,
-          etaminus: float,
-          etaplus: float):
+def rprop(
+    params: List[Tensor],
+    grads: List[Tensor],
+    prevs: List[Tensor],
+    step_sizes: List[Tensor],
+    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
+    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
+    foreach: bool = None,
+    maximize: bool = False,
+    differentiable: bool = False,
+    *,
+    step_size_min: float,
+    step_size_max: float,
+    etaminus: float,
+    etaplus: float,
+):
     r"""Functional API that performs rprop algorithm computation.
 
     See :class:`~torch.optim.Rprop` for details.
@@ -163,36 +191,40 @@ def rprop(params: List[Tensor],
         foreach = False
 
     if foreach and torch.jit.is_scripting():
-        raise RuntimeError('torch.jit.script not supported with foreach optimizers')
+        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
 
     if foreach and not torch.jit.is_scripting():
         func = _multi_tensor_rprop
     else:
         func = _single_tensor_rprop
 
-    func(params,
-         grads,
-         prevs,
-         step_sizes,
-         step_size_min=step_size_min,
-         step_size_max=step_size_max,
-         etaminus=etaminus,
-         etaplus=etaplus,
-         maximize=maximize,
-         differentiable=differentiable)
-
-
-def _single_tensor_rprop(params: List[Tensor],
-                         grads: List[Tensor],
-                         prevs: List[Tensor],
-                         step_sizes: List[Tensor],
-                         *,
-                         step_size_min: float,
-                         step_size_max: float,
-                         etaminus: float,
-                         etaplus: float,
-                         maximize: bool,
-                         differentiable: bool):
+    func(
+        params,
+        grads,
+        prevs,
+        step_sizes,
+        step_size_min=step_size_min,
+        step_size_max=step_size_max,
+        etaminus=etaminus,
+        etaplus=etaplus,
+        maximize=maximize,
+        differentiable=differentiable,
+    )
+
+
+def _single_tensor_rprop(
+    params: List[Tensor],
+    grads: List[Tensor],
+    prevs: List[Tensor],
+    step_sizes: List[Tensor],
+    *,
+    step_size_min: float,
+    step_size_max: float,
+    etaminus: float,
+    etaplus: float,
+    maximize: bool,
+    differentiable: bool,
+):
 
     for i, param in enumerate(params):
         grad = grads[i]
@@ -226,17 +258,19 @@ def _single_tensor_rprop(params: List[Tensor],
         prev.copy_(grad)
 
 
-def _multi_tensor_rprop(params: List[Tensor],
-                        grads: List[Tensor],
-                        prevs: List[Tensor],
-                        step_sizes: List[Tensor],
-                        *,
-                        step_size_min: float,
-                        step_size_max: float,
-                        etaminus: float,
-                        etaplus: float,
-                        maximize: bool,
-                        differentiable: bool):
+def _multi_tensor_rprop(
+    params: List[Tensor],
+    grads: List[Tensor],
+    prevs: List[Tensor],
+    step_sizes: List[Tensor],
+    *,
+    step_size_min: float,
+    step_size_max: float,
+    etaminus: float,
+    etaplus: float,
+    maximize: bool,
+    differentiable: bool,
+):
 
     if len(params) == 0:
         return
@@ -245,7 +279,9 @@ def _multi_tensor_rprop(params: List[Tensor],
 
     # Handle complex params
     def _view_complex_as_real(tensor_list):
-        return [torch.view_as_real(t) if torch.is_complex(t) else t for t in tensor_list]
+        return [
+            torch.view_as_real(t) if torch.is_complex(t) else t for t in tensor_list
+        ]
 
     grads = _view_complex_as_real(grads)
     prevs = _view_complex_as_real(prevs)

From a291eba88aad73ffd7587d5d28ebce705bd7db1c Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Tue, 29 Nov 2022 17:37:36 +0000
Subject: [PATCH 1390/1922] [inductor] Add aten._native_batch_norm_legit to
 decomposition (#89843)

Summary: Seeing a lot of fallback warnings when running dm_nfnet_f0

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89843
Approved by: https://github.com/eellison
---
 torch/_inductor/decomposition.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 4f094aabce91b..4df5c08998ae0 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -66,6 +66,7 @@
         aten.mv,
         aten.narrow,
         aten.native_batch_norm,
+        aten._native_batch_norm_legit,
         aten._native_batch_norm_legit_functional,
         aten.native_batch_norm_backward,
         aten.native_dropout_backward,

From d41d937a7c06662ed2a559426d4ec5325425e565 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Wed, 30 Nov 2022 01:01:24 +0000
Subject: [PATCH 1391/1922] Remove beauby and dzdang from CODEOWNERS (#89811)

GitHub linter complained because the users no longer on the project.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89811
Approved by: https://github.com/weiwangmeta
---
 CODEOWNERS | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 179e87198dba2..bd40cd14ad540 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -25,8 +25,8 @@
 /aten/src/ATen/native/ao_sparse @z-a-f @salilsdesai @kimishpatel @digantdesai @jianyuh
 /aten/src/ATen/native/quantized @jerryzh168 @z-a-f @salilsdesai @kimishpatel @digantdesai @jianyuh
 /aten/src/ATen/native/quantized/cpu @jerryzh168 @z-a-f @salilsdesai @kimishpatel @digantdesai @jianyuh
-/aten/src/ATen/native/quantized/cuda @jerryzh168 @dzdang
-/aten/src/ATen/native/quantized/cudnn @jerryzh168 @dzdang
+/aten/src/ATen/native/quantized/cuda @jerryzh168
+/aten/src/ATen/native/quantized/cudnn @jerryzh168
 /test/test_quantization.py @jerryzh168
 /test/ao/ @jerryzh168 @z-a-f @hdcharles
 /test/quantization/ @jerryzh168 @z-a-f
@@ -39,8 +39,8 @@ nn/quantizable/ @jerryzh168 @z-a-f
 nn/qat/ @jerryzh168
 
 # Tensorpipe RPC Agent.
-/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @jiayisuse @osalpekar @lw @beauby
-/torch/csrc/distributed/rpc/tensorpipe_agent.h @jiayisuse @osalpekar @lw @beauby
+/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @jiayisuse @osalpekar @lw
+/torch/csrc/distributed/rpc/tensorpipe_agent.h @jiayisuse @osalpekar @lw
 
 # Distributed package
 # This list is mostly if you'd like to be tagged as reviewer, feel free to add

From 490d485790bfc90ac9ec86072db27d4c659a147c Mon Sep 17 00:00:00 2001
From: Scott Ramsby <scramsby@meta.com>
Date: Wed, 30 Nov 2022 01:01:46 +0000
Subject: [PATCH 1392/1922] [Caffe2] Fix merge logic bug (#89551)

Summary: `ExprGroup::getMergeCandidates()` had a logic bug. The vector being initialized had its arguments mis-ordered. This didn't trigger a build warning because the warning about implicit cast from an integral type to `bool` wasn't enabled.

Test Plan: `buck test fbsource//arvr/mode/win/vs2019/cuda11/opt fbsource//arvr/mode/hybrid_execution //arvr/libraries/neural_net_inference/TorchScript/...`

Differential Revision: D41488939

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89551
Approved by: https://github.com/davidberard98, https://github.com/jjsjann123
---
 torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
index 1e2806b11fd42..5b659e3e94605 100644
--- a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp
@@ -413,9 +413,10 @@ std::vector<ExprGroup*> ExprGroup::getMergeCandidates(
         "Shouldn't still be traversing in fallback mode if a merge was found.");
   }
 
-  std::vector<bool> can_merge(true, neighbors.size());
+  std::vector<bool> can_merge(neighbors.size(), true);
 
-  // Find neighbors with a level that is only 1 differant than this groups level
+  // Find neighbors with a level that is only 1 different than this group's
+  // level
   for (const auto i : c10::irange(neighbors.size())) {
     if (std::abs(neighbors[i]->payload()->level - payload()->level) > 1) {
       can_merge[i] = false;

From 07d61fb1de5aeb8fd0484288d39dfe1ee3844bd6 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 30 Nov 2022 01:01:57 +0000
Subject: [PATCH 1393/1922] [CI] Add TorchTrition conda packages (#89841)

As we need them to make triton available on both platforms
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89841
Approved by: https://github.com/msaroufim
---
 .github/scripts/build_triton_wheel.py    | 28 ++++++++-
 .github/workflows/build-triton-wheel.yml | 80 ++++++++++++++++++++++--
 2 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
index d9d2a2e98bd35..b0c7e3f8b3bd9 100644
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@@ -2,6 +2,7 @@
 from subprocess import check_call
 from pathlib import Path
 from tempfile import TemporaryDirectory
+from typing import Optional
 import sys
 import shutil
 SCRIPT_DIR = Path(__file__).parent
@@ -29,12 +30,30 @@ def patch_setup_py(path: Path, *, version: str = "2.0.0", name: str = "triton")
         f.write(orig)
 
 
-def build_triton(commit_hash: str) -> Path:
+def build_triton(commit_hash: str, build_conda: bool = False, py_version : Optional[str] = None) -> Path:
     with TemporaryDirectory() as tmpdir:
         triton_basedir = Path(tmpdir) / "triton"
         triton_pythondir = triton_basedir / "python"
         check_call(["git", "clone", "https://github.com/openai/triton"], cwd=tmpdir)
         check_call(["git", "checkout", commit_hash], cwd=triton_basedir)
+        if build_conda:
+            with open(triton_basedir / "meta.yaml", "w") as meta:
+                print(f"package:\n  name: torchtriton\n  version: 2.0.0+{commit_hash[:10]}\n", file=meta)
+                print("source:\n  path: .\n", file=meta)
+                print("build:\n  string: py{{py}}\n  number: 1\n  script: cd python; "
+                      "python setup.py install --single-version-externally-managed --record=record.txt\n", file=meta)
+                print("requirements:\n  host:\n    - python\n    - setuptools\n  run:\n    - python\n"
+                      "    - filelock\n    - pytorch\n", file=meta)
+                print("about:\n  home: https://github.com/openai/triton\n  license: MIT\n  summary:"
+                      " 'A language and compiler for custom Deep Learning operation'", file=meta)
+
+            if py_version is None:
+                py_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+            check_call(["conda", "build", "--python", py_version, "--output-folder", tmpdir, "."], cwd=triton_basedir)
+            conda_path = list(Path(tmpdir).glob("linux-64/torchtriton*.bz2"))[0]
+            shutil.copy(conda_path, Path.cwd())
+            return Path.cwd() / conda_path.name
+
         patch_setup_py(triton_pythondir / "setup.py", name="torchtriton", version=f"2.0.0+{commit_hash[:10]}")
         check_call([sys.executable, "setup.py", "bdist_wheel"], cwd=triton_pythondir)
         whl_path = list((triton_pythondir / "dist").glob("*.whl"))[0]
@@ -43,8 +62,13 @@ def build_triton(commit_hash: str) -> Path:
 
 
 def main() -> None:
+    from argparse import ArgumentParser
+    parser = ArgumentParser("Build Triton binaries")
+    parser.add_argument("--build-conda", action="store_true")
+    parser.add_argument("--py-version", type=str)
+    args = parser.parse_args()
     pin = read_triton_pin()
-    build_triton(pin)
+    build_triton(pin, build_conda=args.build_conda, py_version=args.py_version)
 
 
 if __name__ == "__main__":
diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml
index fac2a1340b42c..171495c0322d2 100644
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@@ -91,7 +91,7 @@ jobs:
 
       - uses: actions/upload-artifact@v3
         with:
-          name: "pytorch-triton-${{ matrix.py_vers }}"
+          name: "pytorch-triton-wheel-${{ matrix.py_vers }}"
           if-no-files-found: error
           path:
             ${{ runner.temp }}/artifacts/*
@@ -110,27 +110,27 @@ jobs:
       - name: Download Build Artifacts (3.7)
         uses: actions/download-artifact@v3
         with:
-          name: "pytorch-triton-3.7"
+          name: "pytorch-triton-wheel-3.7"
           path: "${{ runner.temp }}/artifacts/"
       - name: Download Build Artifacts (3.8)
         uses: actions/download-artifact@v3
         with:
-          name: "pytorch-triton-3.8"
+          name: "pytorch-triton-wheel-3.8"
           path: "${{ runner.temp }}/artifacts/"
       - name: Download Build Artifacts (3.9)
         uses: actions/download-artifact@v3
         with:
-          name: "pytorch-triton-3.9"
+          name: "pytorch-triton-wheel-3.9"
           path: "${{ runner.temp }}/artifacts/"
       - name: Download Build Artifacts (3.10)
         uses: actions/download-artifact@v3
         with:
-          name: "pytorch-triton-3.10"
+          name: "pytorch-triton-wheel-3.10"
           path: "${{ runner.temp }}/artifacts/"
       - name: Download Build Artifacts (3.11)
         uses: actions/download-artifact@v3
         with:
-          name: "pytorch-triton-3.11"
+          name: "pytorch-triton-wheel-3.11"
           path: "${{ runner.temp }}/artifacts/"
       - name: Upload binaries
         if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/master' || github.event.ref == 'refs/heads/main') }}
@@ -147,3 +147,71 @@ jobs:
             for pkg in "${PKG_DIR}/"*.whl; do
               aws s3 cp --no-progress --acl public-read "${pkg}" "${s3_dir}"
              done
+  build-conda:
+    runs-on: [self-hosted, linux.2xlarge]
+    strategy:
+      fail-fast: false
+      matrix:
+        py_vers: [ "3.7", "3.8", "3.9", "3.10" ]
+    timeout-minutes: 40
+    env:
+      DOCKER_IMAGE: pytorch/conda-builder:cuda11.6
+      PY_VERS: ${{ matrix.py_vers }}
+      ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+    steps:
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        with:
+          submodules: false
+
+      - name: Setup Linux
+        uses: ./.github/actions/setup-linux
+
+      - name: Pull Docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ env.DOCKER_IMAGE }}
+
+      - name: Build Triton conda package
+        run: |
+          set -x
+          mkdir -p "${RUNNER_TEMP}/artifacts/"
+          container_name=$(docker run \
+            --tty \
+            --detach \
+            -v "${GITHUB_WORKSPACE}:/pytorch" \
+            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
+            -w /artifacts/ \
+            -e ANACONDA_API_TOKEN \
+            "${DOCKER_IMAGE}" \
+          )
+
+          docker exec -t "${container_name}" yum install -y llvm11 llvm11-devel llvm11-static llvm11-libs zlib-devel
+          docker exec -t "${container_name}" python /pytorch/.github/scripts/build_triton_wheel.py --build-conda --py-version="${PY_VERS}"
+
+      - name: Upload artifacts to Anaconda
+        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/master' || github.event.ref == 'refs/heads/main') }}
+        run: |
+          container_name=$(docker container ps --format '{{.ID}}')
+          docker exec -t "${container_name}" sh -c "anaconda upload /artifacts/torch*.tar.bz2 -u pytorch-nightly --label main --no-progress --force"
+
+      - name: Chown artifacts
+        run: |
+          container_name=$(docker container ps --format '{{.ID}}')
+          docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: "pytorch-triton-conda-${{ matrix.py_vers }}"
+          if-no-files-found: error
+          path:
+            ${{ runner.temp }}/artifacts/*
+
+      - name: Teardown Linux
+        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        if: always()

From 8d7bc85303a5d8dfbebaa4617c3dd4982803b426 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Wed, 30 Nov 2022 01:02:34 +0000
Subject: [PATCH 1394/1922] [dashboaard] Fix flag compilers (#89853)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89853
Approved by: https://github.com/williamwen42
---
 benchmarks/dynamo/runner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py
index 90702c50bea76..3f45b55fd77ee 100755
--- a/benchmarks/dynamo/runner.py
+++ b/benchmarks/dynamo/runner.py
@@ -1309,6 +1309,7 @@ def extract(key):
     args.compilers = compilers
     args.devices = devices
     args.dtypes = dtypes
+    flag_compilers = list(set(flag_compilers) & set(compilers))
     args.flag_compilers = flag_compilers
     args.suites = suites
 

From 936461cb80ce3f7fac8964df3899799dd26bec89 Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Wed, 30 Nov 2022 01:06:55 +0000
Subject: [PATCH 1395/1922] Fix typo in filesystem.py (#89849)

As title.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89849
Approved by: https://github.com/H-Huang
---
 torch/distributed/checkpoint/filesystem.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/distributed/checkpoint/filesystem.py b/torch/distributed/checkpoint/filesystem.py
index 0e679c3039219..cfac79c17fe38 100644
--- a/torch/distributed/checkpoint/filesystem.py
+++ b/torch/distributed/checkpoint/filesystem.py
@@ -57,7 +57,7 @@ class _StoragePrefix:
     prefix: str
 
 
-DEFAULT_SUFIX = ".distcp"
+DEFAULT_SUFFIX = ".distcp"
 
 
 def _trim(tensor: torch.Tensor) -> torch.Tensor:
@@ -190,7 +190,7 @@ def write_data(
 
         def gen_file():
             nonlocal file_count
-            file_name = f"{storage_plan.prefix}{file_count}{DEFAULT_SUFIX}"
+            file_name = f"{storage_plan.prefix}{file_count}{DEFAULT_SUFFIX}"
             file_count += 1
             return file_name
 

From a171017cc4d033d1f778092791d2e6f99d039f8a Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Wed, 30 Nov 2022 01:13:29 +0000
Subject: [PATCH 1396/1922] [checkpoint] Improve test (test_nested_dict.py)
 (#89854)

Improve the test_nested_dict.py test:
1. Add comments to show flatten_dict and mapping result.
2. Update test_mapping unit test to ensure the key value pair matching in mapping.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89854
Approved by: https://github.com/H-Huang
---
 .../checkpoint/test_nested_dict.py            | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/test/distributed/checkpoint/test_nested_dict.py b/test/distributed/checkpoint/test_nested_dict.py
index 676c7c64de3fa..33d618b3bdddd 100644
--- a/test/distributed/checkpoint/test_nested_dict.py
+++ b/test/distributed/checkpoint/test_nested_dict.py
@@ -19,6 +19,19 @@ def test_flattening_round_trip(self) -> None:
         }
 
         flatten_dict, mapping = flatten_state_dict(state_dict)
+        """
+        flatten_dict:
+            {
+                'key0': 1,
+                'key1': [1, 2],
+                'key2': {1: 2, 2: 3},
+                'key3': tensor([1]),
+                'key4.0.0': tensor(2),
+                'key4.0.1': 'x',
+                'key4.1': [1, 2, 3],
+                'key4.2': {'key6': [44]}
+            }
+        """
         restored = unflatten_state_dict(flatten_dict, mapping)
 
         self.assertEqual(state_dict, restored)
@@ -30,12 +43,19 @@ def test_mapping(self) -> None:
             "k3": ["x", 99, [{"k3": "y"}]],
         }
 
-        _, mapping = flatten_state_dict(state_dict)
-        self.assertIn(("k0",), mapping.values())
-        self.assertIn(("k2", 0), mapping.values())
-        self.assertIn(("k2", 1), mapping.values())
-        self.assertIn(("k2", 2, 0, "k3"), mapping.values())
-        self.assertIn(("k3",), mapping.values())
+        flatten_dict, mapping = flatten_state_dict(state_dict)
+        """
+        flatten_dict:
+        {'k0': [1], 'k2.0': tensor([1]), 'k2.1': 99, 'k2.2.0.k3': tensor(1), 'k3': ['x', 99, [{'k3': 'y'}]]}
+        mapping:
+        {'k0': ('k0',), 'k2.0': ('k2', 0), 'k2.1': ('k2', 1), 'k2.2.0.k3': ('k2', 2, 0, 'k3'), 'k3': ('k3',)}
+        """
+
+        self.assertEqual(("k0",), mapping["k0"])
+        self.assertEqual(("k2", 0), mapping["k2.0"])
+        self.assertEqual(("k2", 1), mapping["k2.1"])
+        self.assertEqual(("k2", 2, 0, "k3"), mapping["k2.2.0.k3"])
+        self.assertEqual(("k3",), mapping["k3"])
 
 
 if __name__ == "__main__":

From 6199e1169de0c54217054cb14731e6db9fbdede5 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Tue, 29 Nov 2022 18:25:55 +0000
Subject: [PATCH 1397/1922] Make fake tensors preserve dense strides in type
 conversion (#89803)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89803
Approved by: https://github.com/ngimel
---
 test/test_fake_tensor.py         | 20 ++++++++++++++++++++
 torch/_inductor/ir.py            |  1 -
 torch/_prims/__init__.py         |  6 +++++-
 torch/_subclasses/fake_tensor.py | 15 ---------------
 4 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
index 1a213bb767b48..86c1884d50b03 100644
--- a/test/test_fake_tensor.py
+++ b/test/test_fake_tensor.py
@@ -20,6 +20,7 @@
 import contextlib
 import weakref
 import copy
+from torch.utils._pytree import tree_flatten
 
 class FakeTensorTest(TestCase):
     def checkType(self, t, device_str, size):
@@ -137,6 +138,18 @@ def test_mode(self):
 
         self.assertTrue(isinstance(out, FakeTensor))
 
+    def check_function_with_fake(self, fn):
+        out = fn()
+        with torch._subclasses.FakeTensorMode():
+            out_fake = fn()
+
+        for a, b in zip(tree_flatten(out), tree_flatten(out_fake)):
+            if not isinstance(a, FakeTensor):
+                self.assertTrue(not isinstance(b, FakeTensor))
+                continue
+
+            prims.utils.compare_tensor_meta(a, b, check_strides=True)
+
     @unittest.skipIf(not RUN_CUDA, "requires cuda")
     def test_non_kwarg_device(self):
         with FakeTensorMode():
@@ -146,6 +159,13 @@ def test_non_kwarg_device(self):
             z = x.to(torch.device("cuda"))
             self.assertEqual(z.device.type, "cuda")
 
+    def test_non_overlapping_stride_zero(self):
+        def foo():
+            x = torch.empty_strided([1, 3, 427, 640], (0, 1, 1920, 3))
+            return x.half()
+
+        self.check_function_with_fake(foo)
+
     def test_fake_mode_error(self):
         x = torch.rand([4, 4])
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 4c7d94ce9875a..66d1f9658a406 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2985,7 +2985,6 @@ def create(cls, kernel, *args, **kwargs):
             aten._fft_c2c.out,
             aten._linalg_svd.default,
             aten._linalg_svd.U,
-            aten.upsample_bilinear2d.default,
         )
         context = (
             FakeTensorMode if kernel not in fake_incorrect_kernels else nullcontext
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index 67e16ca102ac1..e764229b8bb5c 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1942,7 +1942,11 @@ def _convert_element_type_meta(a: TensorLikeType, dtype: torch.dtype) -> TensorL
     assert isinstance(a, TensorLike)
     assert isinstance(dtype, torch.dtype)
 
-    strides = utils.compute_elementwise_output_strides(a)
+    # dtype conversion preserves dense strides
+    if torch._prims_common.is_non_overlapping_and_dense(a):
+        strides = a.stride()
+    else:
+        strides = utils.compute_elementwise_output_strides(a)
 
     return TensorMeta(a, strides=strides, dtype=dtype)
 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 9a0ac050e6b94..87b68bfc251a0 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -334,21 +334,6 @@ def _sparse_coo_tensor_with_dims_and_tensors(fake_mode, func, *args, **kwargs):
     return constructors(fake_mode, func, *args, **kwargs)
 
 
-# _to_copy fails when run with FakeTensors to cuda device
-# TODO: debug
-@register_op_impl(aten._to_copy.default)
-def to_copy(fake_mode, func, *args, **kwargs):
-    _, new_kwargs = normalize_function(
-        func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True
-    )
-
-    input_device = new_kwargs.pop("device", None)
-    out_device = input_device if input_device else new_kwargs["input"].device
-    with in_kernel_invocation_manager(fake_mode):
-        input = new_kwargs.pop("input").to("meta")
-        return FakeTensor(fake_mode, aten._to_copy(input, **new_kwargs), out_device)
-
-
 # index.Tensor data-dependent in only some conditions
 @register_op_impl(
     lambda func: torch.Tag.dynamic_output_shape in func.tags  # type: ignore[attr-defined]

From a16a9e9bba380bf2ae9047760a784fad0f9e24a4 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 29 Nov 2022 22:59:32 +0000
Subject: [PATCH 1398/1922] [FSDP][Easy] Remove outdated TODO (#89217)

**Overview**
This PR removes an outdated TODO:
```
# TODO (awgu): When exposing the original parameters, we need to also
# use this attribute to prevent re-synchronizing parameters.
```

**Justification**
We only pass `managed_params` to `_sync_module_params_and_buffers()`, where `managed_params` is defined as
```
managed_params = list(_get_orig_params(root_module, state._ignored_params))
```
This `_get_orig_params()` call excludes parameters already flattened by FSDP. Thus, `_sync_module_params_and_buffers()` will not re-sync already-synchronized parameters. Each parameter appears in `managed_params` for some FSDP instance exactly once and hence is only synchronized once.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89217
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_init_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 7e128251fcc49..75750e8717ef5 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -664,8 +664,6 @@ def _sync_module_params_and_buffers(
     """
     _check_params_for_sync_module_states(params)
     module_states: List[torch.Tensor] = []
-    # TODO (awgu): When exposing the original parameters, we need to also
-    # use this attribute to prevent re-synchronizing parameters.
     for buffer in module.buffers():
         # Avoid re-synchronizing buffers in case of nested wrapping
         if not getattr(buffer, FSDP_SYNCED, False):

From 92b077f2cf5bfd7822b84659a6eb6a4def0bc13b Mon Sep 17 00:00:00 2001
From: zhxchen17 <zhxchen17@fb.com>
Date: Mon, 28 Nov 2022 19:36:06 -0800
Subject: [PATCH 1399/1922] [functorch] Move `cond.py` to `_cond.py` and expose
 `cond()` under functorch.experimental.control_flow. (#89819)

Summary:
Similar to https://github.com/pytorch/pytorch/pull/88767 we want to reduce the chance that users
accidentally import private functions from `functorch.experimental.cond` as if they were public
interfaces. We also move `cond()` under `control_flow.py` to stay consistent with `map()` op.

Test Plan:
CI

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89819
Approved by: https://github.com/zou3519
---
 functorch/experimental/{cond.py => _cond.py} | 2 --
 functorch/experimental/control_flow.py       | 1 +
 test/dynamo/test_export.py                   | 2 +-
 test/dynamo/test_misc.py                     | 8 ++++----
 test/functorch/test_control_flow.py          | 2 +-
 5 files changed, 7 insertions(+), 8 deletions(-)
 rename functorch/experimental/{cond.py => _cond.py} (97%)

diff --git a/functorch/experimental/cond.py b/functorch/experimental/_cond.py
similarity index 97%
rename from functorch/experimental/cond.py
rename to functorch/experimental/_cond.py
index bc6f776d073f6..a3c1936560439 100644
--- a/functorch/experimental/cond.py
+++ b/functorch/experimental/_cond.py
@@ -1,5 +1,3 @@
-# TODO(zhxchen17) Expose API through functorhc.experimental.control_flow
-#                 and rename this file to _cond.py.
 import torch
 
 import torch.utils._pytree as pytree
diff --git a/functorch/experimental/control_flow.py b/functorch/experimental/control_flow.py
index c46c83fd005d9..fb235b10cc460 100644
--- a/functorch/experimental/control_flow.py
+++ b/functorch/experimental/control_flow.py
@@ -1 +1,2 @@
 from ._map import map  # noqa: F401
+from ._cond import cond  # noqa: F401
diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index fb630f06d29f5..e6b505dea51e5 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -1434,7 +1434,7 @@ def nop(x):
 
     @patch.object(torch._dynamo.config, "capture_scalar_outputs", True)
     def test_export_with_module_layer(self):
-        from functorch.experimental.cond import cond
+        from functorch.experimental.control_flow import cond
 
         def true_fn(layer, val):
             return layer(val) * torch.tensor(2)
diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index bd551fb36a51b..81a07a188a245 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2334,7 +2334,7 @@ def f_onnx(x):
         self.assertEqual(f_onnx(input_two_dims), 8)
 
     def test_cond(self):
-        from functorch.experimental.cond import cond
+        from functorch.experimental.control_flow import cond
 
         def true_fn(x):
             return x.sin()
@@ -2352,7 +2352,7 @@ def f(pred, x):
         self.assertTrue(same(torch.sin(torch.tensor([0.25, 0.25])), b))
 
     def test_cond_nested(self):
-        from functorch.experimental.cond import cond
+        from functorch.experimental.control_flow import cond
 
         def true_fn_nested(x):
             return x * 10
@@ -2397,7 +2397,7 @@ def f(pred, pred2, x):
         self.assertTrue(cc.frame_count, 2)
 
     def test_cond_export(self):
-        from functorch.experimental.cond import cond
+        from functorch.experimental.control_flow import cond
 
         def true_fn_nested(x):
             return x * 10
@@ -2442,7 +2442,7 @@ def f(pred, pred2, x):
         )  # * -1 then add x
 
     def test_cond_export_single_arg(self):
-        from functorch.experimental.cond import cond
+        from functorch.experimental.control_flow import cond
 
         def true_fn(x):
             return x
diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py
index 39e1967d1b278..afa10a3de5ee5 100644
--- a/test/functorch/test_control_flow.py
+++ b/test/functorch/test_control_flow.py
@@ -1,7 +1,7 @@
 # Owner(s): ["module: functorch"]
 import torch
-from functorch.experimental.cond import cond
 from functorch.experimental import control_flow
+from functorch.experimental.control_flow import cond
 from torch.fx.experimental.proxy_tensor import make_fx
 
 from torch.testing._internal.common_utils import run_tests, TestCase

From dad8d7d3c240c1bd4d4fec774c8f29d334f1b7c3 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Wed, 30 Nov 2022 01:59:41 +0000
Subject: [PATCH 1400/1922] add env/config flag to disable dynamo (#89828)

as title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89828
Approved by: https://github.com/anijain2305
---
 test/dynamo/test_misc.py    | 24 ++++++++++++------------
 torch/_dynamo/config.py     |  3 +++
 torch/_dynamo/eval_frame.py |  6 +++++-
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 81a07a188a245..e7af1420c6f66 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2957,19 +2957,19 @@ def fn(x, y):
         res = opt_fn(x, y)
         self.assertTrue(same(ref, res))
 
-    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
-    def test_get_device_index(self):
-        def fn(x):
-            x = x + 1
-            a = torch._utils._get_device_index(x.device)
-            b = torch._utils._get_device_index(1)
-            return a, b
+    def test_disable_flag(self):
 
-        x = torch.rand(4, device="cuda")
-        ref = fn(x)
-        opt_fn = torch._dynamo.optimize("eager", nopython=True)(fn)
-        res = opt_fn(x)
-        self.assertTrue(same(ref, res))
+        cnt = torch._dynamo.testing.CompileCounter()
+
+        with patch.dict(os.environ, {"TORCH_COMPILE_DISABLE": "1"}):
+
+            def fn(x, y):
+                x = x + 1
+                y = y + 1
+
+            opt_fn = torch._dynamo.optimize(cnt)
+
+        self.assertEqual(cnt.frame_count, 0)
 
 
 class CustomFunc1(torch.autograd.Function):
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 258df9989f89c..89e13bd484242 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -90,6 +90,9 @@
 # Show a warning on every graph break
 print_graph_breaks = False
 
+# Disable dynamo
+disable = os.environ.get("TORCH_COMPILE_DISABLE", False)
+
 # If a PyTorch module is in this allowlist, torchdynamo will be allowed
 # to inline objects from it or its children.
 skipfiles_inline_module_allowlist = {
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index a04bc72aa6cbb..0d4665134d9bf 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -300,7 +300,11 @@ def __init__(self):
 def catch_errors_wrapper(callback):
     @functools.wraps(callback)
     def catch_errors(frame, cache_size):
-        if frame.f_lasti >= 0 or skipfiles.check(frame.f_code.co_filename):
+        if (
+            frame.f_lasti >= 0
+            or skipfiles.check(frame.f_code.co_filename)
+            or config.disable
+        ):
             log.debug(f"skipping {frame.f_code.co_name} {frame.f_code.co_filename}")
             return None
         if frame.f_code.co_filename == "<string>" and frame.f_code.co_name == "__new__":

From 42a2bc50a30fe6cf51029da325f6e794092f02da Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Tue, 29 Nov 2022 20:52:09 +0000
Subject: [PATCH 1401/1922] Enable rsqrt (#89771)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89771
Approved by: https://github.com/anijain2305
---
 torch/_inductor/decomposition.py |  5 -----
 torch/_inductor/lowering.py      | 22 ++++++++++------------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 4df5c08998ae0..1bfd4b487361f 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -352,11 +352,6 @@ def convolution_backward(
     return (grad_inp, grad_weight, grad_bias)
 
 
-@register_decomposition([aten.rsqrt])
-def rsqrt(x):
-    return torch.reciprocal(torch.sqrt(x))
-
-
 @register_decomposition([aten.log2])
 def log2(x):
     return torch.log(x) * (1.0 / math.log(2.0))
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index f65c3eab3b3f9..00bfa4b72ee7c 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -3497,18 +3497,16 @@ def fn(a, b):
     return make_pointwise(fn)(a, b)
 
 
-# TODO - enable builtin and disable decomp to lower to ptx instruction
-# Causes compilation to not complete on timm_vision_transformers inference
-# @register_lowering(aten.rsqrt)
-# def rsqrt(x):
-#     dtype = x.get_dtype()
-#     if is_integer_dtype(dtype) or is_boolean_dtype(dtype):
-#         x = to_dtype(x, torch.get_default_dtype())
-#
-#     def _rsqrt(x):
-#         return ops.rsqrt(x)
-#
-#     return make_pointwise(_rsqrt)(x)
+@register_lowering(aten.rsqrt)
+def rsqrt(x):
+    dtype = x.get_dtype()
+    if is_integer_dtype(dtype) or is_boolean_dtype(dtype):
+        x = to_dtype(x, torch.get_default_dtype())
+
+    def _rsqrt(x):
+        return ops.rsqrt(x)
+
+    return make_pointwise(_rsqrt)(x)
 
 
 @register_lowering([aten.sum, prims.sum])

From 8c0e65eb71d177df8e05ff01b5f3b42ed989d83a Mon Sep 17 00:00:00 2001
From: Pearu Peterson <pearu.peterson@gmail.com>
Date: Wed, 30 Nov 2022 00:35:45 +0200
Subject: [PATCH 1402/1922] Generator of tensor inputs with variable layout and
 structure (batch/non-batch, hybrid/non-hybrid, block/non-block) (#88914)

This PR introduces `TestCase.generate_simple_inputs` method that is an improved and generalized version of the `TestSparseCompressed._generate_small_inputs` method.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88914
Approved by: https://github.com/cpuhrsch
---
 ...pressedCPU.test_print_SparseBSC_cpu.expect | 8090 +++++-----------
 ...pressedCPU.test_print_SparseBSR_cpu.expect | 7976 +++++-----------
 ...pressedCPU.test_print_SparseCSC_cpu.expect | 2554 +++---
 ...pressedCPU.test_print_SparseCSR_cpu.expect | 2220 +++--
 ...essedCUDA.test_print_SparseBSC_cuda.expect | 8094 +++++------------
 ...essedCUDA.test_print_SparseBSR_cuda.expect | 7990 +++++-----------
 ...essedCUDA.test_print_SparseCSC_cuda.expect | 2568 +++---
 ...essedCUDA.test_print_SparseCSR_cuda.expect | 2260 +++--
 test/test_sparse.py                           |   48 +
 test/test_sparse_csr.py                       |  280 +-
 torch/testing/_internal/common_utils.py       |  290 +
 11 files changed, 15037 insertions(+), 27333 deletions(-)

diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect
index 696fcbb08cf12..7c0cccd56cd1d 100644
--- a/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect
+++ b/test/expect/TestSparseCompressedCPU.test_print_SparseBSC_cpu.expect
@@ -1,6979 +1,3583 @@
-########## torch.float32/torch.int32/size=()+(3, 4)+() ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.]],
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                      [[ 2., 12.]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-                      [[ 3., 13.]],
-
-                      [[ 4., 14.]]]), size=(3, 4), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[ 1., 11.]],
-
-        [[ 2., 12.]],
-
-        [[ 3., 13.]],
-
-        [[ 4., 14.]]])
-
-########## torch.float32/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 1, 2)), size=(0, 0), nnz=0,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0], dtype=torch.int32)
-# _row_indices
-tensor([], dtype=torch.int32)
-# _values
-tensor([], size=(0, 1, 2))
-
-########## torch.float32/torch.int32/size=(2,)+(6, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[1.],
-                        [2.]],
-
-                       [[2.],
-                        [3.]],
-
-                       [[3.],
-                        [4.]],
-
-                       [[4.],
-                        [5.]]],
-
-
-                      [[[5.],
-                        [6.]],
-
-                       [[6.],
-                        [7.]],
-
-                       [[7.],
-                        [8.]],
-
-                       [[8.],
-                        [9.]]]]), size=(2, 6, 2), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], dtype=torch.int32)
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], dtype=torch.int32)
-# _values
-tensor([[[[1.],
-          [2.]],
-
-         [[2.],
-          [3.]],
-
-         [[3.],
-          [4.]],
-
-         [[4.],
-          [5.]]],
-
-
-        [[[5.],
-          [6.]],
-
-         [[6.],
-          [7.]],
-
-         [[7.],
-          [8.]],
-
-         [[8.],
-          [9.]]]])
-
-########## torch.float32/torch.int32/size=(2, 3)+(9, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11.],
-                         [ 2., 12.],
-                         [ 3., 13.]],
-
-                        [[ 2., 12.],
-                         [ 3., 13.],
-                         [ 4., 14.]],
-
-                        [[ 3., 13.],
-                         [ 4., 14.],
-                         [ 5., 15.]],
-
-                        [[ 4., 14.],
-                         [ 5., 15.],
-                         [ 6., 16.]]],
-
-
-                       [[[ 5., 15.],
-                         [ 6., 16.],
-                         [ 7., 17.]],
-
-                        [[ 6., 16.],
-                         [ 7., 17.],
-                         [ 8., 18.]],
-
-                        [[ 7., 17.],
-                         [ 8., 18.],
-                         [ 9., 19.]],
-
-                        [[ 8., 18.],
-                         [ 9., 19.],
-                         [10., 20.]]],
-
-
-                       [[[ 9., 19.],
-                         [10., 20.],
-                         [11., 21.]],
-
-                        [[10., 20.],
-                         [11., 21.],
-                         [12., 22.]],
-
-                        [[11., 21.],
-                         [12., 22.],
-                         [13., 23.]],
-
-                        [[12., 22.],
-                         [13., 23.],
-                         [14., 24.]]]],
-
-
-
-                      [[[[13., 23.],
-                         [14., 24.],
-                         [15., 25.]],
-
-                        [[14., 24.],
-                         [15., 25.],
-                         [16., 26.]],
-
-                        [[15., 25.],
-                         [16., 26.],
-                         [17., 27.]],
-
-                        [[16., 26.],
-                         [17., 27.],
-                         [18., 28.]]],
-
-
-                       [[[17., 27.],
-                         [18., 28.],
-                         [19., 29.]],
-
-                        [[18., 28.],
-                         [19., 29.],
-                         [20., 30.]],
-
-                        [[19., 29.],
-                         [20., 30.],
-                         [21., 31.]],
-
-                        [[20., 30.],
-                         [21., 31.],
-                         [22., 32.]]],
-
-
-                       [[[21., 31.],
-                         [22., 32.],
-                         [23., 33.]],
-
-                        [[22., 32.],
-                         [23., 33.],
-                         [24., 34.]],
-
-                        [[23., 33.],
-                         [24., 34.],
-                         [25., 35.]],
-
-                        [[24., 34.],
-                         [25., 35.],
-                         [26., 36.]]]]]), size=(2, 3, 9, 4), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
-# _values
-tensor([[[[[ 1., 11.],
-           [ 2., 12.],
-           [ 3., 13.]],
-
-          [[ 2., 12.],
-           [ 3., 13.],
-           [ 4., 14.]],
-
-          [[ 3., 13.],
-           [ 4., 14.],
-           [ 5., 15.]],
-
-          [[ 4., 14.],
-           [ 5., 15.],
-           [ 6., 16.]]],
-
-
-         [[[ 5., 15.],
-           [ 6., 16.],
-           [ 7., 17.]],
-
-          [[ 6., 16.],
-           [ 7., 17.],
-           [ 8., 18.]],
-
-          [[ 7., 17.],
-           [ 8., 18.],
-           [ 9., 19.]],
-
-          [[ 8., 18.],
-           [ 9., 19.],
-           [10., 20.]]],
-
-
-         [[[ 9., 19.],
-           [10., 20.],
-           [11., 21.]],
-
-          [[10., 20.],
-           [11., 21.],
-           [12., 22.]],
-
-          [[11., 21.],
-           [12., 22.],
-           [13., 23.]],
-
-          [[12., 22.],
-           [13., 23.],
-           [14., 24.]]]],
-
-
-
-        [[[[13., 23.],
-           [14., 24.],
-           [15., 25.]],
-
-          [[14., 24.],
-           [15., 25.],
-           [16., 26.]],
-
-          [[15., 25.],
-           [16., 26.],
-           [17., 27.]],
-
-          [[16., 26.],
-           [17., 27.],
-           [18., 28.]]],
-
-
-         [[[17., 27.],
-           [18., 28.],
-           [19., 29.]],
-
-          [[18., 28.],
-           [19., 29.],
-           [20., 30.]],
-
-          [[19., 29.],
-           [20., 30.],
-           [21., 31.]],
-
-          [[20., 30.],
-           [21., 31.],
-           [22., 32.]]],
-
-
-         [[[21., 31.],
-           [22., 32.],
-           [23., 33.]],
-
-          [[22., 32.],
-           [23., 33.],
-           [24., 34.]],
-
-          [[23., 33.],
-           [24., 34.],
-           [25., 35.]],
-
-          [[24., 34.],
-           [25., 35.],
-           [26., 36.]]]]])
-
-
-########## torch.float64/torch.int32/size=()+(3, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.]],
-
-                      [[ 2., 12.]],
-
-                      [[ 3., 13.]],
-
-                      [[ 4., 14.]]]), size=(3, 4), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[ 1., 11.]],
-
-        [[ 2., 12.]],
-
-        [[ 3., 13.]],
-
-        [[ 4., 14.]]], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 1, 2)), size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0], dtype=torch.int32)
-# _row_indices
-tensor([], dtype=torch.int32)
-# _values
-tensor([], size=(0, 1, 2), dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2,)+(6, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[1.],
-                        [2.]],
-
-                       [[2.],
-                        [3.]],
-
-                       [[3.],
-                        [4.]],
-
-                       [[4.],
-                        [5.]]],
-
-
-                      [[[5.],
-                        [6.]],
-
-                       [[6.],
-                        [7.]],
-
-                       [[7.],
-                        [8.]],
-
-                       [[8.],
-                        [9.]]]]), size=(2, 6, 2), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], dtype=torch.int32)
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], dtype=torch.int32)
-# _values
-tensor([[[[1.],
-          [2.]],
-
-         [[2.],
-          [3.]],
-
-         [[3.],
-          [4.]],
-
-         [[4.],
-          [5.]]],
-
-
-        [[[5.],
-          [6.]],
-
-         [[6.],
-          [7.]],
-
-         [[7.],
-          [8.]],
-
-         [[8.],
-          [9.]]]], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2, 3)+(9, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11.],
-                         [ 2., 12.],
-                         [ 3., 13.]],
-
-                        [[ 2., 12.],
-                         [ 3., 13.],
-                         [ 4., 14.]],
-
-                        [[ 3., 13.],
-                         [ 4., 14.],
-                         [ 5., 15.]],
-
-                        [[ 4., 14.],
-                         [ 5., 15.],
-                         [ 6., 16.]]],
-
-
-                       [[[ 5., 15.],
-                         [ 6., 16.],
-                         [ 7., 17.]],
-
-                        [[ 6., 16.],
-                         [ 7., 17.],
-                         [ 8., 18.]],
-
-                        [[ 7., 17.],
-                         [ 8., 18.],
-                         [ 9., 19.]],
-
-                        [[ 8., 18.],
-                         [ 9., 19.],
-                         [10., 20.]]],
-
-
-                       [[[ 9., 19.],
-                         [10., 20.],
-                         [11., 21.]],
-
-                        [[10., 20.],
-                         [11., 21.],
-                         [12., 22.]],
-
-                        [[11., 21.],
-                         [12., 22.],
-                         [13., 23.]],
-
-                        [[12., 22.],
-                         [13., 23.],
-                         [14., 24.]]]],
-
-
-
-                      [[[[13., 23.],
-                         [14., 24.],
-                         [15., 25.]],
-
-                        [[14., 24.],
-                         [15., 25.],
-                         [16., 26.]],
-
-                        [[15., 25.],
-                         [16., 26.],
-                         [17., 27.]],
-
-                        [[16., 26.],
-                         [17., 27.],
-                         [18., 28.]]],
-
-
-                       [[[17., 27.],
-                         [18., 28.],
-                         [19., 29.]],
-
-                        [[18., 28.],
-                         [19., 29.],
-                         [20., 30.]],
-
-                        [[19., 29.],
-                         [20., 30.],
-                         [21., 31.]],
-
-                        [[20., 30.],
-                         [21., 31.],
-                         [22., 32.]]],
-
-
-                       [[[21., 31.],
-                         [22., 32.],
-                         [23., 33.]],
-
-                        [[22., 32.],
-                         [23., 33.],
-                         [24., 34.]],
-
-                        [[23., 33.],
-                         [24., 34.],
-                         [25., 35.]],
-
-                        [[24., 34.],
-                         [25., 35.],
-                         [26., 36.]]]]]), size=(2, 3, 9, 4), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
-# _values
-tensor([[[[[ 1., 11.],
-           [ 2., 12.],
-           [ 3., 13.]],
-
-          [[ 2., 12.],
-           [ 3., 13.],
-           [ 4., 14.]],
-
-          [[ 3., 13.],
-           [ 4., 14.],
-           [ 5., 15.]],
-
-          [[ 4., 14.],
-           [ 5., 15.],
-           [ 6., 16.]]],
-
-
-         [[[ 5., 15.],
-           [ 6., 16.],
-           [ 7., 17.]],
-
-          [[ 6., 16.],
-           [ 7., 17.],
-           [ 8., 18.]],
-
-          [[ 7., 17.],
-           [ 8., 18.],
-           [ 9., 19.]],
-
-          [[ 8., 18.],
-           [ 9., 19.],
-           [10., 20.]]],
-
-
-         [[[ 9., 19.],
-           [10., 20.],
-           [11., 21.]],
-
-          [[10., 20.],
-           [11., 21.],
-           [12., 22.]],
-
-          [[11., 21.],
-           [12., 22.],
-           [13., 23.]],
-
-          [[12., 22.],
-           [13., 23.],
-           [14., 24.]]]],
-
-
-
-        [[[[13., 23.],
-           [14., 24.],
-           [15., 25.]],
-
-          [[14., 24.],
-           [15., 25.],
-           [16., 26.]],
-
-          [[15., 25.],
-           [16., 26.],
-           [17., 27.]],
-
-          [[16., 26.],
-           [17., 27.],
-           [18., 28.]]],
-
-
-         [[[17., 27.],
-           [18., 28.],
-           [19., 29.]],
-
-          [[18., 28.],
-           [19., 29.],
-           [20., 30.]],
-
-          [[19., 29.],
-           [20., 30.],
-           [21., 31.]],
-
-          [[20., 30.],
-           [21., 31.],
-           [22., 32.]]],
-
-
-         [[[21., 31.],
-           [22., 32.],
-           [23., 33.]],
-
-          [[22., 32.],
-           [23., 33.],
-           [24., 34.]],
-
-          [[23., 33.],
-           [24., 34.],
-           [25., 35.]],
-
-          [[24., 34.],
-           [25., 35.],
-           [26., 36.]]]]], dtype=torch.float64)
-
-
-########## torch.float32/torch.int64/size=()+(3, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.]],
-
-                      [[ 2., 12.]],
-
-                      [[ 3., 13.]],
-
-                      [[ 4., 14.]]]), size=(3, 4), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4])
-# _row_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[ 1., 11.]],
-
-        [[ 2., 12.]],
-
-        [[ 3., 13.]],
-
-        [[ 4., 14.]]])
-
-########## torch.float32/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 1, 2)), size=(0, 0), nnz=0,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0])
-# _row_indices
-tensor([], dtype=torch.int64)
-# _values
-tensor([], size=(0, 1, 2))
-
-########## torch.float32/torch.int64/size=(2,)+(6, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[1.],
-                        [2.]],
-
-                       [[2.],
-                        [3.]],
-
-                       [[3.],
-                        [4.]],
-
-                       [[4.],
-                        [5.]]],
-
-
-                      [[[5.],
-                        [6.]],
-
-                       [[6.],
-                        [7.]],
-
-                       [[7.],
-                        [8.]],
-
-                       [[8.],
-                        [9.]]]]), size=(2, 6, 2), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]])
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]])
-# _values
-tensor([[[[1.],
-          [2.]],
-
-         [[2.],
-          [3.]],
-
-         [[3.],
-          [4.]],
-
-         [[4.],
-          [5.]]],
-
-
-        [[[5.],
-          [6.]],
-
-         [[6.],
-          [7.]],
-
-         [[7.],
-          [8.]],
-
-         [[8.],
-          [9.]]]])
-
-########## torch.float32/torch.int64/size=(2, 3)+(9, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11.],
-                         [ 2., 12.],
-                         [ 3., 13.]],
-
-                        [[ 2., 12.],
-                         [ 3., 13.],
-                         [ 4., 14.]],
-
-                        [[ 3., 13.],
-                         [ 4., 14.],
-                         [ 5., 15.]],
-
-                        [[ 4., 14.],
-                         [ 5., 15.],
-                         [ 6., 16.]]],
-
-
-                       [[[ 5., 15.],
-                         [ 6., 16.],
-                         [ 7., 17.]],
-
-                        [[ 6., 16.],
-                         [ 7., 17.],
-                         [ 8., 18.]],
-
-                        [[ 7., 17.],
-                         [ 8., 18.],
-                         [ 9., 19.]],
-
-                        [[ 8., 18.],
-                         [ 9., 19.],
-                         [10., 20.]]],
-
-
-                       [[[ 9., 19.],
-                         [10., 20.],
-                         [11., 21.]],
-
-                        [[10., 20.],
-                         [11., 21.],
-                         [12., 22.]],
-
-                        [[11., 21.],
-                         [12., 22.],
-                         [13., 23.]],
-
-                        [[12., 22.],
-                         [13., 23.],
-                         [14., 24.]]]],
-
-
-
-                      [[[[13., 23.],
-                         [14., 24.],
-                         [15., 25.]],
-
-                        [[14., 24.],
-                         [15., 25.],
-                         [16., 26.]],
-
-                        [[15., 25.],
-                         [16., 26.],
-                         [17., 27.]],
-
-                        [[16., 26.],
-                         [17., 27.],
-                         [18., 28.]]],
-
-
-                       [[[17., 27.],
-                         [18., 28.],
-                         [19., 29.]],
-
-                        [[18., 28.],
-                         [19., 29.],
-                         [20., 30.]],
-
-                        [[19., 29.],
-                         [20., 30.],
-                         [21., 31.]],
-
-                        [[20., 30.],
-                         [21., 31.],
-                         [22., 32.]]],
-
-
-                       [[[21., 31.],
-                         [22., 32.],
-                         [23., 33.]],
-
-                        [[22., 32.],
-                         [23., 33.],
-                         [24., 34.]],
-
-                        [[23., 33.],
-                         [24., 34.],
-                         [25., 35.]],
-
-                        [[24., 34.],
-                         [25., 35.],
-                         [26., 36.]]]]]), size=(2, 3, 9, 4), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
-# _values
-tensor([[[[[ 1., 11.],
-           [ 2., 12.],
-           [ 3., 13.]],
-
-          [[ 2., 12.],
-           [ 3., 13.],
-           [ 4., 14.]],
-
-          [[ 3., 13.],
-           [ 4., 14.],
-           [ 5., 15.]],
-
-          [[ 4., 14.],
-           [ 5., 15.],
-           [ 6., 16.]]],
-
-
-         [[[ 5., 15.],
-           [ 6., 16.],
-           [ 7., 17.]],
-
-          [[ 6., 16.],
-           [ 7., 17.],
-           [ 8., 18.]],
-
-          [[ 7., 17.],
-           [ 8., 18.],
-           [ 9., 19.]],
-
-          [[ 8., 18.],
-           [ 9., 19.],
-           [10., 20.]]],
-
-
-         [[[ 9., 19.],
-           [10., 20.],
-           [11., 21.]],
-
-          [[10., 20.],
-           [11., 21.],
-           [12., 22.]],
-
-          [[11., 21.],
-           [12., 22.],
-           [13., 23.]],
-
-          [[12., 22.],
-           [13., 23.],
-           [14., 24.]]]],
-
-
-
-        [[[[13., 23.],
-           [14., 24.],
-           [15., 25.]],
-
-          [[14., 24.],
-           [15., 25.],
-           [16., 26.]],
-
-          [[15., 25.],
-           [16., 26.],
-           [17., 27.]],
-
-          [[16., 26.],
-           [17., 27.],
-           [18., 28.]]],
-
-
-         [[[17., 27.],
-           [18., 28.],
-           [19., 29.]],
-
-          [[18., 28.],
-           [19., 29.],
-           [20., 30.]],
-
-          [[19., 29.],
-           [20., 30.],
-           [21., 31.]],
-
-          [[20., 30.],
-           [21., 31.],
-           [22., 32.]]],
-
-
-         [[[21., 31.],
-           [22., 32.],
-           [23., 33.]],
-
-          [[22., 32.],
-           [23., 33.],
-           [24., 34.]],
-
-          [[23., 33.],
-           [24., 34.],
-           [25., 35.]],
-
-          [[24., 34.],
-           [25., 35.],
-           [26., 36.]]]]])
-
-
-########## torch.float64/torch.int64/size=()+(3, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.]],
-
-                      [[ 2., 12.]],
-
-                      [[ 3., 13.]],
-
-                      [[ 4., 14.]]]), size=(3, 4), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4])
-# _row_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[ 1., 11.]],
-
-        [[ 2., 12.]],
-
-        [[ 3., 13.]],
-
-        [[ 4., 14.]]], dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 1, 2)), size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0])
-# _row_indices
-tensor([], dtype=torch.int64)
-# _values
-tensor([], size=(0, 1, 2), dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2,)+(6, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[1.],
-                        [2.]],
-
-                       [[2.],
-                        [3.]],
-
-                       [[3.],
-                        [4.]],
-
-                       [[4.],
-                        [5.]]],
-
-
-                      [[[5.],
-                        [6.]],
-
-                       [[6.],
-                        [7.]],
-
-                       [[7.],
-                        [8.]],
-
-                       [[8.],
-                        [9.]]]]), size=(2, 6, 2), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]])
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]])
-# _values
-tensor([[[[1.],
-          [2.]],
-
-         [[2.],
-          [3.]],
-
-         [[3.],
-          [4.]],
-
-         [[4.],
-          [5.]]],
-
-
-        [[[5.],
-          [6.]],
-
-         [[6.],
-          [7.]],
-
-         [[7.],
-          [8.]],
-
-         [[8.],
-          [9.]]]], dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2, 3)+(9, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11.],
-                         [ 2., 12.],
-                         [ 3., 13.]],
-
-                        [[ 2., 12.],
-                         [ 3., 13.],
-                         [ 4., 14.]],
-
-                        [[ 3., 13.],
-                         [ 4., 14.],
-                         [ 5., 15.]],
-
-                        [[ 4., 14.],
-                         [ 5., 15.],
-                         [ 6., 16.]]],
-
-
-                       [[[ 5., 15.],
-                         [ 6., 16.],
-                         [ 7., 17.]],
-
-                        [[ 6., 16.],
-                         [ 7., 17.],
-                         [ 8., 18.]],
-
-                        [[ 7., 17.],
-                         [ 8., 18.],
-                         [ 9., 19.]],
-
-                        [[ 8., 18.],
-                         [ 9., 19.],
-                         [10., 20.]]],
-
-
-                       [[[ 9., 19.],
-                         [10., 20.],
-                         [11., 21.]],
-
-                        [[10., 20.],
-                         [11., 21.],
-                         [12., 22.]],
-
-                        [[11., 21.],
-                         [12., 22.],
-                         [13., 23.]],
-
-                        [[12., 22.],
-                         [13., 23.],
-                         [14., 24.]]]],
-
-
-
-                      [[[[13., 23.],
-                         [14., 24.],
-                         [15., 25.]],
-
-                        [[14., 24.],
-                         [15., 25.],
-                         [16., 26.]],
-
-                        [[15., 25.],
-                         [16., 26.],
-                         [17., 27.]],
-
-                        [[16., 26.],
-                         [17., 27.],
-                         [18., 28.]]],
-
-
-                       [[[17., 27.],
-                         [18., 28.],
-                         [19., 29.]],
-
-                        [[18., 28.],
-                         [19., 29.],
-                         [20., 30.]],
-
-                        [[19., 29.],
-                         [20., 30.],
-                         [21., 31.]],
-
-                        [[20., 30.],
-                         [21., 31.],
-                         [22., 32.]]],
-
-
-                       [[[21., 31.],
-                         [22., 32.],
-                         [23., 33.]],
-
-                        [[22., 32.],
-                         [23., 33.],
-                         [24., 34.]],
-
-                        [[23., 33.],
-                         [24., 34.],
-                         [25., 35.]],
-
-                        [[24., 34.],
-                         [25., 35.],
-                         [26., 36.]]]]]), size=(2, 3, 9, 4), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
-# _values
-tensor([[[[[ 1., 11.],
-           [ 2., 12.],
-           [ 3., 13.]],
-
-          [[ 2., 12.],
-           [ 3., 13.],
-           [ 4., 14.]],
-
-          [[ 3., 13.],
-           [ 4., 14.],
-           [ 5., 15.]],
-
-          [[ 4., 14.],
-           [ 5., 15.],
-           [ 6., 16.]]],
-
-
-         [[[ 5., 15.],
-           [ 6., 16.],
-           [ 7., 17.]],
-
-          [[ 6., 16.],
-           [ 7., 17.],
-           [ 8., 18.]],
-
-          [[ 7., 17.],
-           [ 8., 18.],
-           [ 9., 19.]],
-
-          [[ 8., 18.],
-           [ 9., 19.],
-           [10., 20.]]],
-
-
-         [[[ 9., 19.],
-           [10., 20.],
-           [11., 21.]],
-
-          [[10., 20.],
-           [11., 21.],
-           [12., 22.]],
-
-          [[11., 21.],
-           [12., 22.],
-           [13., 23.]],
-
-          [[12., 22.],
-           [13., 23.],
-           [14., 24.]]]],
-
-
-
-        [[[[13., 23.],
-           [14., 24.],
-           [15., 25.]],
-
-          [[14., 24.],
-           [15., 25.],
-           [16., 26.]],
-
-          [[15., 25.],
-           [16., 26.],
-           [17., 27.]],
-
-          [[16., 26.],
-           [17., 27.],
-           [18., 28.]]],
-
-
-         [[[17., 27.],
-           [18., 28.],
-           [19., 29.]],
-
-          [[18., 28.],
-           [19., 29.],
-           [20., 30.]],
-
-          [[19., 29.],
-           [20., 30.],
-           [21., 31.]],
-
-          [[20., 30.],
-           [21., 31.],
-           [22., 32.]]],
-
-
-         [[[21., 31.],
-           [22., 32.],
-           [23., 33.]],
-
-          [[22., 32.],
-           [23., 33.],
-           [24., 34.]],
-
-          [[23., 33.],
-           [24., 34.],
-           [25., 35.]],
-
-          [[24., 34.],
-           [25., 35.],
-           [26., 36.]]]]], dtype=torch.float64)
-
-
-########## torch.float32/torch.int32/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.],
-                        [ 21., 121.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]]],
-
-
-                      [[[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.],
-                        [ 25., 125.]]]]), size=(6, 6, 2), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.],
-          [ 21., 121.]],
-
-         [[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]],
-
-         [[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]]],
-
-
-        [[[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]],
-
-         [[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]]],
-
-
-        [[[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]],
-
-         [[  5., 105.],
-          [ 15., 115.],
-          [ 25., 125.]]]])
-
-########## torch.float32/torch.int32/size=()+(9, 4)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]]],
-
-
-
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]]],
-
-
-
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]]],
-
-
-
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]],
-
-
-                       [[[6.0000e+00, 1.0060e+03],
-                         [1.0600e+02, 1.1060e+03],
-                         [2.0600e+02, 1.2060e+03],
-                         [3.0600e+02, 1.3060e+03]],
-
-                        [[1.6000e+01, 1.0160e+03],
-                         [1.1600e+02, 1.1160e+03],
-                         [2.1600e+02, 1.2160e+03],
-                         [3.1600e+02, 1.3160e+03]]]]]), size=(9, 4, 4, 2),
-       nnz=4, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]]],
-
-
-
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]]],
-
-
-
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]],
-
-
-         [[[6.0000e+00, 1.0060e+03],
-           [1.0600e+02, 1.1060e+03],
-           [2.0600e+02, 1.2060e+03],
-           [3.0600e+02, 1.3060e+03]],
-
-          [[1.6000e+01, 1.0160e+03],
-           [1.1600e+02, 1.1160e+03],
-           [2.1600e+02, 1.2160e+03],
-           [3.1600e+02, 1.3160e+03]]]]])
-
-########## torch.float32/torch.int32/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
-
-                          [[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]]],
-
-
-
-                        [[[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]]],
-
-
-
-                        [[[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]]],
-
-
-
-                        [[[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]]]],
-
-
-
-
-                       [[[[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]]],
-
-
-
-                        [[[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]]],
-
-
-
-                        [[[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]]],
-
-
-
-                        [[[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]]]],
-
-
-
-
-                       [[[[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]]],
-
-
-
-                        [[[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]]],
-
-
-
-                        [[[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]]],
-
-
-
-                        [[[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]]]]],
-
-
-
-
-
-                      [[[[[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]]],
-
-
-
-                        [[[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]]],
-
-
-
-                        [[[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]]],
-
-
-
-                        [[[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]],
-
-                          [[ 37.],
-                           [137.]]]]],
-
-
-
-
-                       [[[[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]],
-
-                          [[ 37.],
-                           [137.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]],
-
-                          [[ 38.],
-                           [138.]]]],
-
-
-
-                        [[[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]],
-
-                          [[ 38.],
-                           [138.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]],
-
-                          [[ 39.],
-                           [139.]]]],
-
-
-
-                        [[[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]],
-
-                          [[ 39.],
-                           [139.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]],
-
-                          [[ 40.],
-                           [140.]]]],
-
-
-
-                        [[[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]],
-
-                          [[ 40.],
-                           [140.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]],
-
-                          [[ 41.],
-                           [141.]]]]],
-
-
-
-
-                       [[[[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]],
-
-                          [[ 41.],
-                           [141.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]],
-
-                          [[ 42.],
-                           [142.]]]],
-
-
-
-                        [[[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]],
-
-                          [[ 42.],
-                           [142.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]],
-
-                          [[ 43.],
-                           [143.]]]],
-
-
-
-                        [[[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]],
-
-                          [[ 43.],
-                           [143.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]],
-
-                          [[ 44.],
-                           [144.]]]],
-
-
-
-                        [[[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]],
-
-                          [[ 44.],
-                           [144.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]],
-
-                          [[ 45.],
-                           [145.]]]]]]]), size=(2, 3, 6, 6, 2, 1), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]],
-
-
-
-          [[[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]]]],
-
-
-
-
-         [[[[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]]],
-
-
-
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]]]],
-
-
-
-
-         [[[[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]]],
-
-
-
-          [[[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]]],
-
-
-
-          [[[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]]],
-
-
-
-          [[[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]]]]],
-
-
-
-
-
-        [[[[[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]]],
-
-
-
-          [[[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]]],
-
-
-
-          [[[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]],
-
-            [[ 36.],
-             [136.]]]],
-
-
-
-          [[[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]],
-
-            [[ 36.],
-             [136.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]],
-
-            [[ 37.],
-             [137.]]]]],
-
-
-
-
-         [[[[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]],
-
-            [[ 37.],
-             [137.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]],
-
-            [[ 38.],
-             [138.]]]],
-
-
-
-          [[[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]],
-
-            [[ 38.],
-             [138.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]],
-
-            [[ 39.],
-             [139.]]]],
-
-
-
-          [[[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]],
-
-            [[ 39.],
-             [139.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]],
-
-            [[ 40.],
-             [140.]]]],
-
-
-
-          [[[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]],
-
-            [[ 40.],
-             [140.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]],
-
-            [[ 41.],
-             [141.]]]]],
-
-
-
-
-         [[[[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]],
-
-            [[ 41.],
-             [141.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]],
-
-            [[ 42.],
-             [142.]]]],
-
-
-
-          [[[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]],
-
-            [[ 42.],
-             [142.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]],
-
-            [[ 43.],
-             [143.]]]],
-
-
-
-          [[[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]],
-
-            [[ 43.],
-             [143.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]],
-
-            [[ 44.],
-             [144.]]]],
-
-
-
-          [[[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]],
-
-            [[ 44.],
-             [144.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]],
-
-            [[ 45.],
-             [145.]]]]]]])
-
-
-########## torch.float64/torch.int32/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.],
-                        [ 21., 121.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]]],
-
-
-                      [[[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.],
-                        [ 25., 125.]]]]), size=(6, 6, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.],
-          [ 21., 121.]],
-
-         [[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]],
-
-         [[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]]],
-
-
-        [[[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]],
-
-         [[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]]],
-
-
-        [[[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]],
-
-         [[  5., 105.],
-          [ 15., 115.],
-          [ 25., 125.]]]], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(9, 4)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]]],
-
-
-
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]]],
-
-
-
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]]],
-
-
-
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]],
-
-
-                       [[[6.0000e+00, 1.0060e+03],
-                         [1.0600e+02, 1.1060e+03],
-                         [2.0600e+02, 1.2060e+03],
-                         [3.0600e+02, 1.3060e+03]],
-
-                        [[1.6000e+01, 1.0160e+03],
-                         [1.1600e+02, 1.1160e+03],
-                         [2.1600e+02, 1.2160e+03],
-                         [3.1600e+02, 1.3160e+03]]]]]), size=(9, 4, 4, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]]],
-
-
-
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]]],
-
-
-
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]],
-
-
-         [[[6.0000e+00, 1.0060e+03],
-           [1.0600e+02, 1.1060e+03],
-           [2.0600e+02, 1.2060e+03],
-           [3.0600e+02, 1.3060e+03]],
-
-          [[1.6000e+01, 1.0160e+03],
-           [1.1600e+02, 1.1160e+03],
-           [2.1600e+02, 1.2160e+03],
-           [3.1600e+02, 1.3160e+03]]]]], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
-
-                          [[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]]],
-
-
-
-                        [[[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]]],
-
-
-
-                        [[[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]]],
-
-
-
-                        [[[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]]]],
-
-
-
-
-                       [[[[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]]],
-
-
-
-                        [[[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]]],
-
-
-
-                        [[[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]]],
-
-
-
-                        [[[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]]]],
-
-
-
-
-                       [[[[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]]],
-
-
-
-                        [[[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]]],
-
-
-
-                        [[[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]]],
-
-
-
-                        [[[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]]]]],
-
-
-
-
-
-                      [[[[[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]]],
-
-
-
-                        [[[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]]],
-
-
-
-                        [[[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]]],
-
-
-
-                        [[[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]],
-
-                          [[ 37.],
-                           [137.]]]]],
-
-
-
-
-                       [[[[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]],
-
-                          [[ 37.],
-                           [137.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]],
-
-                          [[ 38.],
-                           [138.]]]],
-
-
-
-                        [[[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]],
-
-                          [[ 38.],
-                           [138.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]],
-
-                          [[ 39.],
-                           [139.]]]],
-
-
-
-                        [[[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]],
-
-                          [[ 39.],
-                           [139.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]],
-
-                          [[ 40.],
-                           [140.]]]],
-
-
-
-                        [[[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]],
-
-                          [[ 40.],
-                           [140.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]],
-
-                          [[ 41.],
-                           [141.]]]]],
-
-
-
-
-                       [[[[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]],
-
-                          [[ 41.],
-                           [141.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]],
-
-                          [[ 42.],
-                           [142.]]]],
-
-
-
-                        [[[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]],
-
-                          [[ 42.],
-                           [142.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]],
-
-                          [[ 43.],
-                           [143.]]]],
-
-
-
-                        [[[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]],
-
-                          [[ 43.],
-                           [143.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]],
-
-                          [[ 44.],
-                           [144.]]]],
-
-
-
-                        [[[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]],
-
-                          [[ 44.],
-                           [144.]]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
+                        [[2.],
+                         [0.]],
 
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]],
-
-                          [[ 45.],
-                           [145.]]]]]]]), size=(2, 3, 6, 6, 2, 1), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]],
+                        [[0.],
+                         [4.]]],
 
 
+                       [[[1.],
+                         [4.]],
 
-          [[[[  4.],
-             [104.]],
+                        [[2.],
+                         [0.]],
 
-            [[ 14.],
-             [114.]],
+                        [[3.],
+                         [0.]]],
 
-            [[ 24.],
-             [124.]]],
 
+                       [[[1.],
+                         [2.]],
 
-           [[[  5.],
-             [105.]],
+                        [[0.],
+                         [3.]],
 
-            [[ 15.],
-             [115.]],
+                        [[0.],
+                         [4.]]]],
 
-            [[ 25.],
-             [125.]]]]],
 
 
+                      [[[[0.],
+                         [2.]],
 
+                        [[1.],
+                         [3.]],
 
-         [[[[[  5.],
-             [105.]],
+                        [[0.],
+                         [4.]]],
 
-            [[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]]],
+                       [[[1.],
+                         [3.]],
 
+                        [[0.],
+                         [4.]],
 
-           [[[  6.],
-             [106.]],
+                        [[2.],
+                         [0.]]],
 
-            [[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]]]],
+                       [[[1.],
+                         [0.]],
 
+                        [[2.],
+                         [4.]],
 
+                        [[3.],
+                         [0.]]]]]), size=(2, 3, 2, 3), nnz=3,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], dtype=torch.int32)
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], dtype=torch.int32)
+# _values
+tensor([[[[[1.],
+           [3.]],
 
-           [[[  9.],
-             [109.]],
+          [[2.],
+           [0.]],
 
-            [[ 19.],
-             [119.]],
+          [[0.],
+           [4.]]],
 
-            [[ 29.],
-             [129.]]]]],
 
+         [[[1.],
+           [4.]],
 
+          [[2.],
+           [0.]],
 
+          [[3.],
+           [0.]]],
 
-         [[[[[  9.],
-             [109.]],
 
-            [[ 19.],
-             [119.]],
+         [[[1.],
+           [2.]],
 
-            [[ 29.],
-             [129.]]],
+          [[0.],
+           [3.]],
 
+          [[0.],
+           [4.]]]],
 
-           [[[ 10.],
-             [110.]],
 
-            [[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]]]],
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
-          [[[[ 10.],
-             [110.]],
 
-            [[ 20.],
-             [120.]],
+         [[[1.],
+           [3.]],
 
-            [[ 30.],
-             [130.]]],
+          [[0.],
+           [4.]],
 
+          [[2.],
+           [0.]]],
 
-           [[[ 11.],
-             [111.]],
 
-            [[ 21.],
-             [121.]],
+         [[[1.],
+           [0.]],
 
-            [[ 31.],
-             [131.]]]],
+          [[2.],
+           [4.]],
 
+          [[3.],
+           [0.]]]]])
 
+########## torch.float32/torch.int32/size=()+(8, 6)+() ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-          [[[[ 11.],
-             [111.]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-            [[ 21.],
-             [121.]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-            [[ 31.],
-             [131.]]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-           [[[ 12.],
-             [112.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-            [[ 22.],
-             [122.]],
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), size=(8, 6), nnz=7,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], dtype=torch.int32)
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], dtype=torch.int32)
+# _values
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-            [[ 32.],
-             [132.]]]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-          [[[[ 12.],
-             [112.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-            [[ 22.],
-             [122.]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-            [[ 32.],
-             [132.]]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]])
 
 
-           [[[ 13.],
-             [113.]],
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+() ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-            [[ 23.],
-             [123.]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-            [[ 33.],
-             [133.]]]]]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
+                        [[2.],
+                         [0.]],
 
+                        [[0.],
+                         [4.]]],
 
 
+                       [[[1.],
+                         [4.]],
 
-        [[[[[[ 13.],
-             [113.]],
+                        [[2.],
+                         [0.]],
 
-            [[ 23.],
-             [123.]],
+                        [[3.],
+                         [0.]]],
 
-            [[ 33.],
-             [133.]]],
 
+                       [[[1.],
+                         [2.]],
 
-           [[[ 14.],
-             [114.]],
+                        [[0.],
+                         [3.]],
 
-            [[ 24.],
-             [124.]],
+                        [[0.],
+                         [4.]]]],
 
-            [[ 34.],
-             [134.]]]],
 
 
+                      [[[[0.],
+                         [2.]],
 
-          [[[[ 14.],
-             [114.]],
+                        [[1.],
+                         [3.]],
 
-            [[ 24.],
-             [124.]],
+                        [[0.],
+                         [4.]]],
 
-            [[ 34.],
-             [134.]]],
 
+                       [[[1.],
+                         [3.]],
 
-           [[[ 15.],
-             [115.]],
+                        [[0.],
+                         [4.]],
 
-            [[ 25.],
-             [125.]],
+                        [[2.],
+                         [0.]]],
 
-            [[ 35.],
-             [135.]]]],
 
+                       [[[1.],
+                         [0.]],
 
+                        [[2.],
+                         [4.]],
 
-          [[[[ 15.],
-             [115.]],
+                        [[3.],
+                         [0.]]]]]), size=(2, 3, 2, 3), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-            [[ 25.],
-             [125.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], dtype=torch.int32)
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-            [[ 35.],
-             [135.]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], dtype=torch.int32)
+# _values
+tensor([[[[[1.],
+           [3.]],
 
+          [[2.],
+           [0.]],
 
-           [[[ 16.],
-             [116.]],
+          [[0.],
+           [4.]]],
 
-            [[ 26.],
-             [126.]],
 
-            [[ 36.],
-             [136.]]]],
+         [[[1.],
+           [4.]],
 
+          [[2.],
+           [0.]],
 
+          [[3.],
+           [0.]]],
 
-          [[[[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]],
+         [[[1.],
+           [2.]],
 
-            [[ 36.],
-             [136.]]],
+          [[0.],
+           [3.]],
 
+          [[0.],
+           [4.]]]],
 
-           [[[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]],
 
-            [[ 37.],
-             [137.]]]]],
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
 
-         [[[[[ 17.],
-             [117.]],
+         [[[1.],
+           [3.]],
 
-            [[ 27.],
-             [127.]],
+          [[0.],
+           [4.]],
 
-            [[ 37.],
-             [137.]]],
+          [[2.],
+           [0.]]],
 
 
-           [[[ 18.],
-             [118.]],
+         [[[1.],
+           [0.]],
 
-            [[ 28.],
-             [128.]],
+          [[2.],
+           [4.]],
 
-            [[ 38.],
-             [138.]]]],
+          [[3.],
+           [0.]]]]], dtype=torch.float64)
 
+########## torch.float64/torch.int32/size=()+(8, 6)+() ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-          [[[[ 18.],
-             [118.]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-            [[ 28.],
-             [128.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-            [[ 38.],
-             [138.]]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-           [[[ 19.],
-             [119.]],
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), size=(8, 6), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], dtype=torch.int32)
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], dtype=torch.int32)
+# _values
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-            [[ 29.],
-             [129.]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
-            [[ 39.],
-             [139.]]]],
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-          [[[[ 19.],
-             [119.]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-            [[ 29.],
-             [129.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], dtype=torch.float64)
 
-            [[ 39.],
-             [139.]]],
 
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+() ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-           [[[ 20.],
-             [120.]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-            [[ 30.],
-             [130.]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-            [[ 40.],
-             [140.]]]],
+                        [[2.],
+                         [0.]],
 
+                        [[0.],
+                         [4.]]],
 
 
-          [[[[ 20.],
-             [120.]],
+                       [[[1.],
+                         [4.]],
 
-            [[ 30.],
-             [130.]],
+                        [[2.],
+                         [0.]],
 
-            [[ 40.],
-             [140.]]],
+                        [[3.],
+                         [0.]]],
 
 
-           [[[ 21.],
-             [121.]],
+                       [[[1.],
+                         [2.]],
 
-            [[ 31.],
-             [131.]],
+                        [[0.],
+                         [3.]],
 
-            [[ 41.],
-             [141.]]]]],
+                        [[0.],
+                         [4.]]]],
 
 
+                      [[[[0.],
+                         [2.]],
 
-         [[[[[ 21.],
-             [121.]],
+                        [[1.],
+                         [3.]],
 
-            [[ 31.],
-             [131.]],
+                        [[0.],
+                         [4.]]],
 
-            [[ 41.],
-             [141.]]],
 
+                       [[[1.],
+                         [3.]],
 
-           [[[ 22.],
-             [122.]],
+                        [[0.],
+                         [4.]],
 
-            [[ 32.],
-             [132.]],
+                        [[2.],
+                         [0.]]],
 
-            [[ 42.],
-             [142.]]]],
 
+                       [[[1.],
+                         [0.]],
 
+                        [[2.],
+                         [4.]],
 
-          [[[[ 22.],
-             [122.]],
+                        [[3.],
+                         [0.]]]]]), size=(2, 3, 2, 3), nnz=3,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-            [[ 32.],
-             [132.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]])
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-            [[ 42.],
-             [142.]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]])
+# _values
+tensor([[[[[1.],
+           [3.]],
 
+          [[2.],
+           [0.]],
 
-           [[[ 23.],
-             [123.]],
+          [[0.],
+           [4.]]],
 
-            [[ 33.],
-             [133.]],
 
-            [[ 43.],
-             [143.]]]],
+         [[[1.],
+           [4.]],
 
+          [[2.],
+           [0.]],
 
+          [[3.],
+           [0.]]],
 
-          [[[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]],
+         [[[1.],
+           [2.]],
 
-            [[ 43.],
-             [143.]]],
+          [[0.],
+           [3.]],
 
+          [[0.],
+           [4.]]]],
 
-           [[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]],
 
-            [[ 44.],
-             [144.]]]],
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
-          [[[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]],
+         [[[1.],
+           [3.]],
 
-            [[ 44.],
-             [144.]]],
+          [[0.],
+           [4.]],
 
+          [[2.],
+           [0.]]],
 
-           [[[ 25.],
-             [125.]],
 
-            [[ 35.],
-             [135.]],
+         [[[1.],
+           [0.]],
 
-            [[ 45.],
-             [145.]]]]]]], dtype=torch.float64)
+          [[2.],
+           [4.]],
 
+          [[3.],
+           [0.]]]]])
 
-########## torch.float32/torch.int64/size=()+(6, 6)+(2,) ##########
+########## torch.float32/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.],
-                        [ 21., 121.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]]],
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-                      [[[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-                       [[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-                      [[[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                       [[  5., 105.],
-                        [ 15., 115.],
-                        [ 25., 125.]]]]), size=(6, 6, 2), nnz=4,
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), size=(8, 6), nnz=7,
        layout=torch.sparse_bsc)
 # _ccol_indices
-tensor([0, 2, 4])
+tensor([0, 4, 7])
 # _row_indices
-tensor([0, 1, 0, 2])
+tensor([0, 1, 2, 3, 0, 2, 3])
 # _values
-tensor([[[[  1., 101.],
-          [ 11., 111.],
-          [ 21., 121.]],
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-         [[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
-        [[[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-         [[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-        [[[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]])
 
-         [[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]]],
 
-
-        [[[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]],
-
-         [[  5., 105.],
-          [ 15., 115.],
-          [ 25., 125.]]]])
-
-########## torch.float32/torch.int64/size=()+(9, 4)+(4, 2) ##########
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
-
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
+                        [[2.],
+                         [0.]],
 
+                        [[0.],
+                         [4.]]],
 
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
+                       [[[1.],
+                         [4.]],
 
+                        [[2.],
+                         [0.]],
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+                        [[3.],
+                         [0.]]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
 
+                       [[[1.],
+                         [2.]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
+                        [[0.],
+                         [3.]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]]],
+                        [[0.],
+                         [4.]]]],
 
 
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+                      [[[[0.],
+                         [2.]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
+                        [[1.],
+                         [3.]],
 
+                        [[0.],
+                         [4.]]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
+                       [[[1.],
+                         [3.]],
 
+                        [[0.],
+                         [4.]],
 
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
+                        [[2.],
+                         [0.]]],
 
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]]],
 
+                       [[[1.],
+                         [0.]],
 
+                        [[2.],
+                         [4.]],
 
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]],
-
-
-                       [[[6.0000e+00, 1.0060e+03],
-                         [1.0600e+02, 1.1060e+03],
-                         [2.0600e+02, 1.2060e+03],
-                         [3.0600e+02, 1.3060e+03]],
-
-                        [[1.6000e+01, 1.0160e+03],
-                         [1.1600e+02, 1.1160e+03],
-                         [2.1600e+02, 1.2160e+03],
-                         [3.1600e+02, 1.3160e+03]]]]]), size=(9, 4, 4, 2),
-       nnz=4, layout=torch.sparse_bsc)
+                        [[3.],
+                         [0.]]]]]), size=(2, 3, 2, 3), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsc)
 # _ccol_indices
-tensor([0, 2, 4])
-# _row_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]])
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]])
+# _values
+tensor([[[[[1.],
+           [3.]],
 
+          [[2.],
+           [0.]],
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+          [[0.],
+           [4.]]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]]],
 
+         [[[1.],
+           [4.]],
 
+          [[2.],
+           [0.]],
 
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+          [[3.],
+           [0.]]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
 
+         [[[1.],
+           [2.]],
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+          [[0.],
+           [3.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
+          [[0.],
+           [4.]]]],
 
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]]],
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
+         [[[1.],
+           [3.]],
 
+          [[0.],
+           [4.]],
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
+          [[2.],
+           [0.]]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]],
 
+         [[[1.],
+           [0.]],
 
-         [[[6.0000e+00, 1.0060e+03],
-           [1.0600e+02, 1.1060e+03],
-           [2.0600e+02, 1.2060e+03],
-           [3.0600e+02, 1.3060e+03]],
+          [[2.],
+           [4.]],
 
-          [[1.6000e+01, 1.0160e+03],
-           [1.1600e+02, 1.1160e+03],
-           [2.1600e+02, 1.2160e+03],
-           [3.1600e+02, 1.3160e+03]]]]])
+          [[3.],
+           [0.]]]]], dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=(2, 3)+(6, 6)+(2, 1) ##########
+########## torch.float64/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-                          [[ 11.],
-                           [111.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-                          [[ 21.],
-                           [121.]]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                         [[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]]],
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), size=(8, 6), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7])
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3])
+# _values
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
-                        [[[[  2.],
-                           [102.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-                          [[ 12.],
-                           [112.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-                          [[ 22.],
-                           [122.]]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], dtype=torch.float64)
 
-                         [[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]],
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                          [[ 23.],
-                           [123.]]]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                        [[[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 23.],
-                           [123.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[  4.],
-                           [104.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 14.],
-                           [114.]],
+                         [[4., 5., 6., 7.]]]],
 
-                          [[ 24.],
-                           [124.]]]],
 
 
+                       [[[[1., 2., 3., 4.]],
 
-                        [[[[  4.],
-                           [104.]],
+                         [[4., 5., 6., 7.]]],
 
-                          [[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]],
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
-                         [[[  5.],
-                           [105.]],
 
-                          [[ 15.],
-                           [115.]],
+                        [[[3., 4., 5., 6.]],
 
-                          [[ 25.],
-                           [125.]]]]],
+                         [[0., 0., 0., 0.]]]],
 
 
+                       [[[[1., 2., 3., 4.]],
 
-                       [[[[[  5.],
-                           [105.]],
+                         [[2., 3., 4., 5.]]],
 
-                          [[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                         [[[  6.],
-                           [106.]],
 
-                          [[ 16.],
-                           [116.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 26.],
-                           [126.]]]],
+                         [[4., 5., 6., 7.]]]]],
 
 
-                        [[[[  6.],
-                           [106.]],
 
-                          [[ 16.],
-                           [116.]],
+                      [[[[[0., 0., 0., 0.]],
 
-                          [[ 26.],
-                           [126.]]],
+                         [[2., 3., 4., 5.]]],
 
 
-                         [[[  7.],
-                           [107.]],
+                        [[[1., 2., 3., 4.]],
 
-                          [[ 17.],
-                           [117.]],
+                         [[3., 4., 5., 6.]]],
 
-                          [[ 27.],
-                           [127.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
-                        [[[[  7.],
-                           [107.]],
 
-                          [[ 17.],
-                           [117.]],
 
-                          [[ 27.],
-                           [127.]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                         [[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 28.],
-                           [128.]]]],
+                         [[4., 5., 6., 7.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
-                        [[[[  8.],
-                           [108.]],
+                         [[0., 0., 0., 0.]]]],
 
-                          [[ 18.],
-                           [118.]],
 
-                          [[ 28.],
-                           [128.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-                         [[[  9.],
-                           [109.]],
+                         [[0., 0., 0., 0.]]],
 
-                          [[ 19.],
-                           [119.]],
 
-                          [[ 29.],
-                           [129.]]]]],
+                        [[[2., 3., 4., 5.]],
 
+                         [[4., 5., 6., 7.]]],
 
 
+                        [[[3., 4., 5., 6.]],
 
-                       [[[[[  9.],
-                           [109.]],
+                         [[0., 0., 0., 0.]]]]]]), size=(2, 3, 2, 3, 4), nnz=3,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-                          [[ 19.],
-                           [119.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], dtype=torch.int32)
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-                          [[ 29.],
-                           [129.]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], dtype=torch.int32)
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                         [[[ 10.],
-                           [110.]],
 
-                          [[ 20.],
-                           [120.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 30.],
-                           [130.]]]],
+           [[0., 0., 0., 0.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-                        [[[[ 10.],
-                           [110.]],
+           [[4., 5., 6., 7.]]]],
 
-                          [[ 20.],
-                           [120.]],
 
-                          [[ 30.],
-                           [130.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[ 11.],
-                           [111.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 21.],
-                           [121.]],
 
-                          [[ 31.],
-                           [131.]]]],
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
 
-                        [[[[ 11.],
-                           [111.]],
+          [[[3., 4., 5., 6.]],
 
-                          [[ 21.],
-                           [121.]],
+           [[0., 0., 0., 0.]]]],
 
-                          [[ 31.],
-                           [131.]]],
 
 
-                         [[[ 12.],
-                           [112.]],
+         [[[[1., 2., 3., 4.]],
 
-                          [[ 22.],
-                           [122.]],
+           [[2., 3., 4., 5.]]],
 
-                          [[ 32.],
-                           [132.]]]],
 
+          [[[0., 0., 0., 0.]],
 
+           [[3., 4., 5., 6.]]],
 
-                        [[[[ 12.],
-                           [112.]],
 
-                          [[ 22.],
-                           [122.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 32.],
-                           [132.]]],
+           [[4., 5., 6., 7.]]]]],
 
 
-                         [[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]]]]]],
+        [[[[[0., 0., 0., 0.]],
 
+           [[2., 3., 4., 5.]]],
 
 
+          [[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                      [[[[[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 33.],
-                           [133.]]],
+           [[4., 5., 6., 7.]]]],
 
 
-                         [[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]],
+         [[[[1., 2., 3., 4.]],
 
-                          [[ 34.],
-                           [134.]]]],
+           [[3., 4., 5., 6.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-                        [[[[ 14.],
-                           [114.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]]],
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]]],
 
-                         [[[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]],
 
-                          [[ 35.],
-                           [135.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[0., 0., 0., 0.]]],
 
 
-                        [[[[ 15.],
-                           [115.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 25.],
-                           [125.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 35.],
-                           [135.]]],
 
+          [[[3., 4., 5., 6.]],
 
-                         [[[ 16.],
-                           [116.]],
+           [[0., 0., 0., 0.]]]]]])
 
-                          [[ 26.],
-                           [126.]],
+########## torch.float32/torch.int32/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
+
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-                          [[ 36.],
-                           [136.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                        [[[[ 16.],
-                           [116.]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-                          [[ 26.],
-                           [126.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 36.],
-                           [136.]]],
 
 
-                         [[[ 17.],
-                           [117.]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
-                          [[ 27.],
-                           [127.]],
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
-                          [[ 37.],
-                           [137.]]]]],
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                       [[[[[ 17.],
-                           [117.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 27.],
-                           [127.]],
 
-                          [[ 37.],
-                           [137.]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 18.],
-                           [118.]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
-                          [[ 28.],
-                           [128.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                          [[ 38.],
-                           [138.]]]],
 
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                        [[[[ 18.],
-                           [118.]],
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-                          [[ 28.],
-                           [128.]],
 
-                          [[ 38.],
-                           [138.]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 19.],
-                           [119.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 29.],
-                           [129.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                          [[ 39.],
-                           [139.]]]],
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-                        [[[[ 19.],
-                           [119.]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
-                          [[ 29.],
-                           [129.]],
 
-                          [[ 39.],
-                           [139.]]],
 
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-                         [[[ 20.],
-                           [120.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 30.],
-                           [130.]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-                          [[ 40.],
-                           [140.]]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-                        [[[[ 20.],
-                           [120.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 30.],
-                           [130.]],
 
-                          [[ 40.],
-                           [140.]]],
 
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-                         [[[ 21.],
-                           [121.]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-                          [[ 31.],
-                           [131.]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
-                          [[ 41.],
-                           [141.]]]]],
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
-                       [[[[[ 21.],
-                           [121.]],
 
-                          [[ 31.],
-                           [131.]],
 
-                          [[ 41.],
-                           [141.]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
-                         [[[ 22.],
-                           [122.]],
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-                          [[ 32.],
-                           [132.]],
 
-                          [[ 42.],
-                           [142.]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), size=(8, 6, 4, 2), nnz=7,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], dtype=torch.int32)
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], dtype=torch.int32)
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 22.],
-                           [122.]],
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
-                          [[ 32.],
-                           [132.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 42.],
-                           [142.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 23.],
-                           [123.]],
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-                          [[ 33.],
-                           [133.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 43.],
-                           [143.]]]],
 
 
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-                        [[[[ 23.],
-                           [123.]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-                          [[ 33.],
-                           [133.]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-                          [[ 43.],
-                           [143.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 24.],
-                           [124.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 34.],
-                           [134.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 44.],
-                           [144.]]]],
 
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 24.],
-                           [124.]],
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-                          [[ 34.],
-                           [134.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 44.],
-                           [144.]]],
 
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
-                         [[[ 25.],
-                           [125.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 35.],
-                           [135.]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-                          [[ 45.],
-                           [145.]]]]]]]), size=(2, 3, 6, 6, 2, 1), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 11.],
-             [111.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 21.],
-             [121.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-           [[[  2.],
-             [102.]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
-            [[ 12.],
-             [112.]],
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-            [[ 22.],
-             [122.]]]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
 
-          [[[[  2.],
-             [102.]],
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-            [[ 12.],
-             [112.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 22.],
-             [122.]]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-           [[[  3.],
-             [103.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 13.],
-             [113.]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
-            [[ 23.],
-             [123.]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[  3.],
-             [103.]],
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-            [[ 13.],
-             [113.]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
-            [[ 23.],
-             [123.]]],
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
 
-           [[[  4.],
-             [104.]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-            [[ 14.],
-             [114.]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
-            [[ 24.],
-             [124.]]]],
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-          [[[[  4.],
-             [104.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 14.],
-             [114.]],
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
-            [[ 24.],
-             [124.]]],
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
 
-           [[[  5.],
-             [105.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 15.],
-             [115.]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-            [[ 25.],
-             [125.]]]]],
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]])
 
 
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-         [[[[[  5.],
-             [105.]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-            [[ 15.],
-             [115.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 25.],
-             [125.]]],
 
+                        [[[2., 3., 4., 5.]],
 
-           [[[  6.],
-             [106.]],
+                         [[0., 0., 0., 0.]]],
 
-            [[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
 
-          [[[[  6.],
-             [106.]],
 
-            [[ 16.],
-             [116.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 26.],
-             [126.]]],
+                         [[4., 5., 6., 7.]]],
 
 
-           [[[  7.],
-             [107.]],
+                        [[[2., 3., 4., 5.]],
 
-            [[ 17.],
-             [117.]],
+                         [[0., 0., 0., 0.]]],
 
-            [[ 27.],
-             [127.]]]],
 
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]],
 
-          [[[[  7.],
-             [107.]],
 
-            [[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[2., 3., 4., 5.]]],
 
-           [[[  8.],
-             [108.]],
 
-            [[ 18.],
-             [118.]],
+                        [[[0., 0., 0., 0.]],
 
-            [[ 28.],
-             [128.]]]],
+                         [[3., 4., 5., 6.]]],
 
 
+                        [[[0., 0., 0., 0.]],
 
-          [[[[  8.],
-             [108.]],
+                         [[4., 5., 6., 7.]]]]],
 
-            [[ 18.],
-             [118.]],
 
-            [[ 28.],
-             [128.]]],
 
 
-           [[[  9.],
-             [109.]],
+                      [[[[[0., 0., 0., 0.]],
 
-            [[ 19.],
-             [119.]],
+                         [[2., 3., 4., 5.]]],
 
-            [[ 29.],
-             [129.]]]]],
 
+                        [[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
 
-         [[[[[  9.],
-             [109.]],
+                        [[[0., 0., 0., 0.]],
 
-            [[ 19.],
-             [119.]],
+                         [[4., 5., 6., 7.]]]],
 
-            [[ 29.],
-             [129.]]],
 
 
-           [[[ 10.],
-             [110.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 20.],
-             [120.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 30.],
-             [130.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]],
 
-          [[[[ 10.],
-             [110.]],
 
-            [[ 20.],
-             [120.]],
+                        [[[2., 3., 4., 5.]],
 
-            [[ 30.],
-             [130.]]],
+                         [[0., 0., 0., 0.]]]],
 
 
-           [[[ 11.],
-             [111.]],
 
-            [[ 21.],
-             [121.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 31.],
-             [131.]]]],
+                         [[0., 0., 0., 0.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
-          [[[[ 11.],
-             [111.]],
+                         [[4., 5., 6., 7.]]],
 
-            [[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]]],
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]]]]), size=(2, 3, 2, 3, 4), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-           [[[ 12.],
-             [112.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], dtype=torch.int32)
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-            [[ 22.],
-             [122.]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], dtype=torch.int32)
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
-            [[ 32.],
-             [132.]]]],
+           [[3., 4., 5., 6.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
-          [[[[ 12.],
-             [112.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]],
 
-           [[[ 13.],
-             [113.]],
 
-            [[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[4., 5., 6., 7.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
-        [[[[[[ 13.],
-             [113.]],
 
-            [[ 23.],
-             [123.]],
+          [[[3., 4., 5., 6.]],
 
-            [[ 33.],
-             [133.]]],
+           [[0., 0., 0., 0.]]]],
 
 
-           [[[ 14.],
-             [114.]],
 
-            [[ 24.],
-             [124.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 34.],
-             [134.]]]],
+           [[2., 3., 4., 5.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-          [[[[ 14.],
-             [114.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]]],
 
-           [[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]],
 
-            [[ 35.],
-             [135.]]]],
 
+        [[[[[0., 0., 0., 0.]],
 
+           [[2., 3., 4., 5.]]],
 
-          [[[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]],
+          [[[1., 2., 3., 4.]],
 
-            [[ 35.],
-             [135.]]],
+           [[3., 4., 5., 6.]]],
 
 
-           [[[ 16.],
-             [116.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 26.],
-             [126.]],
+           [[4., 5., 6., 7.]]]],
 
-            [[ 36.],
-             [136.]]]],
 
 
+         [[[[1., 2., 3., 4.]],
 
-          [[[[ 16.],
-             [116.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 26.],
-             [126.]],
 
-            [[ 36.],
-             [136.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]],
 
-           [[[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]],
+          [[[2., 3., 4., 5.]],
 
-            [[ 37.],
-             [137.]]]]],
+           [[0., 0., 0., 0.]]]],
 
 
+         [[[[1., 2., 3., 4.]],
 
-         [[[[[ 17.],
-             [117.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 27.],
-             [127.]],
 
-            [[ 37.],
-             [137.]]],
+          [[[2., 3., 4., 5.]],
 
+           [[4., 5., 6., 7.]]],
 
-           [[[ 18.],
-             [118.]],
 
-            [[ 28.],
-             [128.]],
+          [[[3., 4., 5., 6.]],
 
-            [[ 38.],
-             [138.]]]],
+           [[0., 0., 0., 0.]]]]]], dtype=torch.float64)
 
+########## torch.float64/torch.int32/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
+
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-          [[[[ 18.],
-             [118.]],
 
-            [[ 28.],
-             [128.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 38.],
-             [138.]]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-           [[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]],
 
-            [[ 39.],
-             [139.]]]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
-          [[[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 39.],
-             [139.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-           [[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]],
 
-            [[ 40.],
-             [140.]]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-          [[[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-            [[ 40.],
-             [140.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-           [[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]],
 
-            [[ 41.],
-             [141.]]]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-         [[[[[ 21.],
-             [121.]],
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
-            [[ 31.],
-             [131.]],
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-            [[ 41.],
-             [141.]]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
 
-           [[[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-            [[ 42.],
-             [142.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
 
-          [[[[ 22.],
-             [122.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 32.],
-             [132.]],
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-            [[ 42.],
-             [142.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-           [[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-            [[ 43.],
-             [143.]]]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
 
-          [[[[ 23.],
-             [123.]],
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-            [[ 33.],
-             [133.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-            [[ 43.],
-             [143.]]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
 
-           [[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 44.],
-             [144.]]]],
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
 
-          [[[[ 24.],
-             [124.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]],
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-            [[ 44.],
-             [144.]]],
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), size=(8, 6, 4, 2), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], dtype=torch.int32)
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], dtype=torch.int32)
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
-           [[[ 25.],
-             [125.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-            [[ 35.],
-             [135.]],
 
-            [[ 45.],
-             [145.]]]]]]])
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-########## torch.float64/torch.int64/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.],
-                        [ 21., 121.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                       [[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]]],
 
 
-                      [[[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]],
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-                       [[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-                      [[[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]],
 
-                       [[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                      [[[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                       [[  5., 105.],
-                        [ 15., 115.],
-                        [ 25., 125.]]]]), size=(6, 6, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4])
-# _row_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.],
-          [ 21., 121.]],
 
-         [[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]]],
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-        [[[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]],
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-         [[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-        [[[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]],
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
-         [[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-        [[[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]],
 
-         [[  5., 105.],
-          [ 15., 115.],
-          [ 25., 125.]]]], dtype=torch.float64)
 
-########## torch.float64/torch.int64/size=()+(9, 4)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]]],
 
 
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]]],
 
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
 
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
 
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], dtype=torch.float64)
 
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]],
 
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                       [[[6.0000e+00, 1.0060e+03],
-                         [1.0600e+02, 1.1060e+03],
-                         [2.0600e+02, 1.2060e+03],
-                         [3.0600e+02, 1.3060e+03]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-                        [[1.6000e+01, 1.0160e+03],
-                         [1.1600e+02, 1.1160e+03],
-                         [2.1600e+02, 1.2160e+03],
-                         [3.1600e+02, 1.3160e+03]]]]]), size=(9, 4, 4, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4])
-# _row_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]]],
+                         [[3., 4., 5., 6.]]],
 
 
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+                        [[[2., 3., 4., 5.]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
+                         [[0., 0., 0., 0.]]],
 
 
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]]],
+                         [[4., 5., 6., 7.]]]],
 
 
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+                       [[[[1., 2., 3., 4.]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
+                         [[4., 5., 6., 7.]]],
 
 
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+                        [[[2., 3., 4., 5.]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
+                         [[0., 0., 0., 0.]]],
 
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                        [[[3., 4., 5., 6.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]]],
+                         [[0., 0., 0., 0.]]]],
 
 
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+                       [[[[1., 2., 3., 4.]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
+                         [[2., 3., 4., 5.]]],
 
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
+                         [[3., 4., 5., 6.]]],
 
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]]],
+                         [[4., 5., 6., 7.]]]]],
 
 
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
+                      [[[[[0., 0., 0., 0.]],
 
+                         [[2., 3., 4., 5.]]],
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]],
+                        [[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
-         [[[6.0000e+00, 1.0060e+03],
-           [1.0600e+02, 1.1060e+03],
-           [2.0600e+02, 1.2060e+03],
-           [3.0600e+02, 1.3060e+03]],
 
-          [[1.6000e+01, 1.0160e+03],
-           [1.1600e+02, 1.1160e+03],
-           [2.1600e+02, 1.2160e+03],
-           [3.1600e+02, 1.3160e+03]]]]], dtype=torch.float64)
+                        [[[0., 0., 0., 0.]],
 
-########## torch.float64/torch.int64/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+                         [[4., 5., 6., 7.]]]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
 
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
 
-                          [[ 11.],
-                           [111.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 21.],
-                           [121.]]],
+                         [[3., 4., 5., 6.]]],
 
 
-                         [[[  2.],
-                           [102.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 12.],
-                           [112.]],
+                         [[4., 5., 6., 7.]]],
 
-                          [[ 22.],
-                           [122.]]]],
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]]],
 
-                        [[[[  2.],
-                           [102.]],
 
-                          [[ 12.],
-                           [112.]],
 
-                          [[ 22.],
-                           [122.]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[0., 0., 0., 0.]]],
 
-                         [[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 23.],
-                           [123.]]]],
+                         [[4., 5., 6., 7.]]],
 
 
+                        [[[3., 4., 5., 6.]],
 
-                        [[[[  3.],
-                           [103.]],
+                         [[0., 0., 0., 0.]]]]]]), size=(2, 3, 2, 3, 4), nnz=3,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-                          [[ 13.],
-                           [113.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]])
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-                          [[ 23.],
-                           [123.]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]])
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                         [[[  4.],
-                           [104.]],
 
-                          [[ 14.],
-                           [114.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 24.],
-                           [124.]]]],
+           [[0., 0., 0., 0.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-                        [[[[  4.],
-                           [104.]],
+           [[4., 5., 6., 7.]]]],
 
-                          [[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  5.],
-                           [105.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]]]],
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
 
+          [[[3., 4., 5., 6.]],
 
-                       [[[[[  5.],
-                           [105.]],
+           [[0., 0., 0., 0.]]]],
 
-                          [[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  6.],
-                           [106.]],
+           [[2., 3., 4., 5.]]],
 
-                          [[ 16.],
-                           [116.]],
 
-                          [[ 26.],
-                           [126.]]]],
+          [[[0., 0., 0., 0.]],
 
+           [[3., 4., 5., 6.]]],
 
 
-                        [[[[  6.],
-                           [106.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 16.],
-                           [116.]],
+           [[4., 5., 6., 7.]]]]],
 
-                          [[ 26.],
-                           [126.]]],
 
 
-                         [[[  7.],
-                           [107.]],
 
-                          [[ 17.],
-                           [117.]],
+        [[[[[0., 0., 0., 0.]],
 
-                          [[ 27.],
-                           [127.]]]],
+           [[2., 3., 4., 5.]]],
 
 
+          [[[1., 2., 3., 4.]],
 
-                        [[[[  7.],
-                           [107.]],
+           [[3., 4., 5., 6.]]],
 
-                          [[ 17.],
-                           [117.]],
 
-                          [[ 27.],
-                           [127.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]],
 
-                         [[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]],
 
-                          [[ 28.],
-                           [128.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
 
-                        [[[[  8.],
-                           [108.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 18.],
-                           [118.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 28.],
-                           [128.]]],
 
+          [[[2., 3., 4., 5.]],
 
-                         [[[  9.],
-                           [109.]],
+           [[0., 0., 0., 0.]]]],
 
-                          [[ 19.],
-                           [119.]],
 
-                          [[ 29.],
-                           [129.]]]]],
 
+         [[[[1., 2., 3., 4.]],
 
+           [[0., 0., 0., 0.]]],
 
 
-                       [[[[[  9.],
-                           [109.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 19.],
-                           [119.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 29.],
-                           [129.]]],
 
+          [[[3., 4., 5., 6.]],
 
-                         [[[ 10.],
-                           [110.]],
+           [[0., 0., 0., 0.]]]]]])
 
-                          [[ 20.],
-                           [120.]],
+########## torch.float32/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
+
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-                          [[ 30.],
-                           [130.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                        [[[[ 10.],
-                           [110.]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-                          [[ 20.],
-                           [120.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 30.],
-                           [130.]]],
 
 
-                         [[[ 11.],
-                           [111.]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
-                          [[ 21.],
-                           [121.]],
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
-                          [[ 31.],
-                           [131.]]]],
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                        [[[[ 11.],
-                           [111.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 21.],
-                           [121.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 31.],
-                           [131.]]],
 
 
-                         [[[ 12.],
-                           [112.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 22.],
-                           [122.]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
-                          [[ 32.],
-                           [132.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-                        [[[[ 12.],
-                           [112.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 22.],
-                           [122.]],
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-                          [[ 32.],
-                           [132.]]],
 
 
-                         [[[ 13.],
-                           [113.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 23.],
-                           [123.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 33.],
-                           [133.]]]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
-                      [[[[[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 14.],
-                           [114.]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-                          [[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                        [[[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-                         [[[ 15.],
-                           [115.]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
-                          [[ 25.],
-                           [125.]],
 
-                          [[ 35.],
-                           [135.]]]],
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
-                        [[[[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]],
 
-                          [[ 35.],
-                           [135.]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
-                         [[[ 16.],
-                           [116.]],
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-                          [[ 26.],
-                           [126.]],
 
-                          [[ 36.],
-                           [136.]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), size=(8, 6, 4, 2), nnz=7,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7])
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3])
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 16.],
-                           [116.]],
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
-                          [[ 26.],
-                           [126.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 36.],
-                           [136.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 17.],
-                           [117.]],
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-                          [[ 27.],
-                           [127.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 37.],
-                           [137.]]]]],
 
 
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-                       [[[[[ 17.],
-                           [117.]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-                          [[ 27.],
-                           [127.]],
 
-                          [[ 37.],
-                           [137.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 18.],
-                           [118.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 28.],
-                           [128.]],
 
-                          [[ 38.],
-                           [138.]]]],
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-                        [[[[ 18.],
-                           [118.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 28.],
-                           [128.]],
 
-                          [[ 38.],
-                           [138.]]],
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 19.],
-                           [119.]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-                          [[ 29.],
-                           [129.]],
 
-                          [[ 39.],
-                           [139.]]]],
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 19.],
-                           [119.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 29.],
-                           [129.]],
 
-                          [[ 39.],
-                           [139.]]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-                         [[[ 20.],
-                           [120.]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
-                          [[ 30.],
-                           [130.]],
 
-                          [[ 40.],
-                           [140.]]]],
 
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 20.],
-                           [120.]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
-                          [[ 30.],
-                           [130.]],
 
-                          [[ 40.],
-                           [140.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
-                         [[[ 21.],
-                           [121.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 31.],
-                           [131.]],
 
-                          [[ 41.],
-                           [141.]]]]],
 
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-                       [[[[[ 21.],
-                           [121.]],
 
-                          [[ 31.],
-                           [131.]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-                          [[ 41.],
-                           [141.]]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
-                         [[[ 22.],
-                           [122.]],
 
-                          [[ 32.],
-                           [132.]],
 
-                          [[ 42.],
-                           [142.]]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-                        [[[[ 22.],
-                           [122.]],
 
-                          [[ 32.],
-                           [132.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 42.],
-                           [142.]]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]])
 
-                         [[[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]],
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                          [[ 43.],
-                           [143.]]]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                        [[[[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 43.],
-                           [143.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[ 24.],
-                           [124.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 34.],
-                           [134.]],
+                         [[4., 5., 6., 7.]]]],
 
-                          [[ 44.],
-                           [144.]]]],
 
 
+                       [[[[1., 2., 3., 4.]],
 
-                        [[[[ 24.],
-                           [124.]],
+                         [[4., 5., 6., 7.]]],
 
-                          [[ 34.],
-                           [134.]],
 
-                          [[ 44.],
-                           [144.]]],
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
-                         [[[ 25.],
-                           [125.]],
 
-                          [[ 35.],
-                           [135.]],
+                        [[[3., 4., 5., 6.]],
 
-                          [[ 45.],
-                           [145.]]]]]]]), size=(2, 3, 6, 6, 2, 1), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+                         [[0., 0., 0., 0.]]]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
 
-            [[ 11.],
-             [111.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 21.],
-             [121.]]],
+                         [[2., 3., 4., 5.]]],
 
 
-           [[[  2.],
-             [102.]],
+                        [[[0., 0., 0., 0.]],
 
-            [[ 12.],
-             [112.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 22.],
-             [122.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]]],
 
-          [[[[  2.],
-             [102.]],
 
-            [[ 12.],
-             [112.]],
 
-            [[ 22.],
-             [122.]]],
 
+                      [[[[[0., 0., 0., 0.]],
 
-           [[[  3.],
-             [103.]],
+                         [[2., 3., 4., 5.]]],
 
-            [[ 13.],
-             [113.]],
 
-            [[ 23.],
-             [123.]]]],
+                        [[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
 
-          [[[[  3.],
-             [103.]],
+                        [[[0., 0., 0., 0.]],
 
-            [[ 13.],
-             [113.]],
+                         [[4., 5., 6., 7.]]]],
 
-            [[ 23.],
-             [123.]]],
 
 
-           [[[  4.],
-             [104.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 14.],
-             [114.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 24.],
-             [124.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]],
 
-          [[[[  4.],
-             [104.]],
 
-            [[ 14.],
-             [114.]],
+                        [[[2., 3., 4., 5.]],
 
-            [[ 24.],
-             [124.]]],
+                         [[0., 0., 0., 0.]]]],
 
 
-           [[[  5.],
-             [105.]],
 
-            [[ 15.],
-             [115.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 25.],
-             [125.]]]]],
+                         [[0., 0., 0., 0.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[4., 5., 6., 7.]]],
 
-         [[[[[  5.],
-             [105.]],
 
-            [[ 15.],
-             [115.]],
+                        [[[3., 4., 5., 6.]],
 
-            [[ 25.],
-             [125.]]],
+                         [[0., 0., 0., 0.]]]]]]), size=(2, 3, 2, 3, 4), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]])
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-           [[[  6.],
-             [106.]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]])
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
-            [[ 16.],
-             [116.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 26.],
-             [126.]]]],
 
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
-          [[[[  6.],
-             [106.]],
 
-            [[ 16.],
-             [116.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 26.],
-             [126.]]],
+           [[4., 5., 6., 7.]]]],
 
 
-           [[[  7.],
-             [107.]],
 
-            [[ 17.],
-             [117.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 27.],
-             [127.]]]],
+           [[4., 5., 6., 7.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
-          [[[[  7.],
-             [107.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]]],
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]],
 
-           [[[  8.],
-             [108.]],
 
-            [[ 18.],
-             [118.]],
 
-            [[ 28.],
-             [128.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[2., 3., 4., 5.]]],
 
 
-          [[[[  8.],
-             [108.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 18.],
-             [118.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 28.],
-             [128.]]],
 
+          [[[0., 0., 0., 0.]],
 
-           [[[  9.],
-             [109.]],
+           [[4., 5., 6., 7.]]]]],
 
-            [[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]]]]],
 
 
+        [[[[[0., 0., 0., 0.]],
 
+           [[2., 3., 4., 5.]]],
 
-         [[[[[  9.],
-             [109.]],
 
-            [[ 19.],
-             [119.]],
+          [[[1., 2., 3., 4.]],
 
-            [[ 29.],
-             [129.]]],
+           [[3., 4., 5., 6.]]],
 
 
-           [[[ 10.],
-             [110.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 20.],
-             [120.]],
+           [[4., 5., 6., 7.]]]],
 
-            [[ 30.],
-             [130.]]]],
 
 
+         [[[[1., 2., 3., 4.]],
 
-          [[[[ 10.],
-             [110.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]],
 
-           [[[ 11.],
-             [111.]],
 
-            [[ 21.],
-             [121.]],
+          [[[2., 3., 4., 5.]],
 
-            [[ 31.],
-             [131.]]]],
+           [[0., 0., 0., 0.]]]],
 
 
-          [[[[ 11.],
-             [111.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 21.],
-             [121.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 31.],
-             [131.]]],
 
+          [[[2., 3., 4., 5.]],
 
-           [[[ 12.],
-             [112.]],
+           [[4., 5., 6., 7.]]],
 
-            [[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]]],
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]]]], dtype=torch.float64)
 
+########## torch.float64/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
+
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-          [[[[ 12.],
-             [112.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-            [[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-           [[[ 13.],
-             [113.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-            [[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]]]]],
 
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
 
-        [[[[[[ 13.],
-             [113.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 23.],
-             [123.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 33.],
-             [133.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-           [[[ 14.],
-             [114.]],
 
-            [[ 24.],
-             [124.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]]]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-          [[[[ 14.],
-             [114.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-            [[ 24.],
-             [124.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]]],
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
 
-           [[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 35.],
-             [135.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-          [[[[ 15.],
-             [115.]],
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
-            [[ 25.],
-             [125.]],
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-            [[ 35.],
-             [135.]]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
 
-           [[[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-            [[ 36.],
-             [136.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
 
-          [[[[ 16.],
-             [116.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 26.],
-             [126.]],
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-            [[ 36.],
-             [136.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-           [[[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-            [[ 37.],
-             [137.]]]]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-         [[[[[ 17.],
-             [117.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-            [[ 27.],
-             [127.]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
-            [[ 37.],
-             [137.]]],
 
 
-           [[[ 18.],
-             [118.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 28.],
-             [128.]],
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
-            [[ 38.],
-             [138.]]]],
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-          [[[[ 18.],
-             [118.]],
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-            [[ 28.],
-             [128.]],
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), size=(8, 6, 4, 2), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7])
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3])
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 38.],
-             [138.]]],
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-           [[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 39.],
-             [139.]]]],
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]],
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-            [[ 39.],
-             [139.]]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-           [[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 40.],
-             [140.]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 40.],
-             [140.]]],
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-           [[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]],
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
-            [[ 41.],
-             [141.]]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
 
-         [[[[[ 21.],
-             [121.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 31.],
-             [131.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 41.],
-             [141.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-           [[[ 22.],
-             [122.]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
-            [[ 32.],
-             [132.]],
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-            [[ 42.],
-             [142.]]]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
 
-          [[[[ 22.],
-             [122.]],
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-            [[ 32.],
-             [132.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 42.],
-             [142.]]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-           [[[ 23.],
-             [123.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 33.],
-             [133.]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
-            [[ 43.],
-             [143.]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[ 23.],
-             [123.]],
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-            [[ 33.],
-             [133.]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
-            [[ 43.],
-             [143.]]],
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
 
-           [[[ 24.],
-             [124.]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-            [[ 34.],
-             [134.]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
-            [[ 44.],
-             [144.]]]],
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-          [[[[ 24.],
-             [124.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]],
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
-            [[ 44.],
-             [144.]]],
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
 
-           [[[ 25.],
-             [125.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 35.],
-             [135.]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-            [[ 45.],
-             [145.]]]]]]], dtype=torch.float64)
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], dtype=torch.float64)
 
diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect
index 267056b76e678..8fe3223332bb5 100644
--- a/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect
+++ b/test/expect/TestSparseCompressedCPU.test_print_SparseBSR_cpu.expect
@@ -1,6945 +1,3583 @@
-########## torch.float32/torch.int32/size=()+(4, 3)+() ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[1.],
-                       [2.]],
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                      [[2.],
-                       [3.]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                      [[3.],
-                       [4.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-                      [[4.],
-                       [5.]]]), size=(4, 3), nnz=4, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[1.],
-         [2.]],
-
-        [[2.],
-         [3.]],
-
-        [[3.],
-         [4.]],
-
-        [[4.],
-         [5.]]])
-
-########## torch.float32/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 2, 1)), size=(0, 0), nnz=0,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0], dtype=torch.int32)
-# _col_indices
-tensor([], dtype=torch.int32)
-# _values
-tensor([], size=(0, 2, 1))
-
-########## torch.float32/torch.int32/size=(2,)+(2, 6)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[ 1., 11.]],
-
-                       [[ 2., 12.]],
-
-                       [[ 3., 13.]],
-
-                       [[ 4., 14.]]],
-
-
-                      [[[ 5., 15.]],
-
-                       [[ 6., 16.]],
-
-                       [[ 7., 17.]],
-
-                       [[ 8., 18.]]]]), size=(2, 2, 6), nnz=4,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], dtype=torch.int32)
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], dtype=torch.int32)
-# _values
-tensor([[[[ 1., 11.]],
-
-         [[ 2., 12.]],
-
-         [[ 3., 13.]],
-
-         [[ 4., 14.]]],
-
-
-        [[[ 5., 15.]],
-
-         [[ 6., 16.]],
+                        [[2.],
+                         [0.]],
 
-         [[ 7., 17.]],
+                        [[0.],
+                         [4.]]],
 
-         [[ 8., 18.]]]])
 
-########## torch.float32/torch.int32/size=(2, 3)+(4, 9)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11., 21.],
-                         [ 2., 12., 22.]],
-
-                        [[ 2., 12., 22.],
-                         [ 3., 13., 23.]],
-
-                        [[ 3., 13., 23.],
-                         [ 4., 14., 24.]],
-
-                        [[ 4., 14., 24.],
-                         [ 5., 15., 25.]]],
-
-
-                       [[[ 5., 15., 25.],
-                         [ 6., 16., 26.]],
+                       [[[1.],
+                         [4.]],
 
-                        [[ 6., 16., 26.],
-                         [ 7., 17., 27.]],
+                        [[2.],
+                         [0.]],
 
-                        [[ 7., 17., 27.],
-                         [ 8., 18., 28.]],
+                        [[3.],
+                         [0.]]],
 
-                        [[ 8., 18., 28.],
-                         [ 9., 19., 29.]]],
 
+                       [[[1.],
+                         [2.]],
 
-                       [[[ 9., 19., 29.],
-                         [10., 20., 30.]],
+                        [[0.],
+                         [3.]],
 
-                        [[10., 20., 30.],
-                         [11., 21., 31.]],
+                        [[0.],
+                         [4.]]]],
 
-                        [[11., 21., 31.],
-                         [12., 22., 32.]],
 
-                        [[12., 22., 32.],
-                         [13., 23., 33.]]]],
 
+                      [[[[0.],
+                         [2.]],
 
+                        [[1.],
+                         [3.]],
 
-                      [[[[13., 23., 33.],
-                         [14., 24., 34.]],
+                        [[0.],
+                         [4.]]],
 
-                        [[14., 24., 34.],
-                         [15., 25., 35.]],
 
-                        [[15., 25., 35.],
-                         [16., 26., 36.]],
+                       [[[1.],
+                         [3.]],
 
-                        [[16., 26., 36.],
-                         [17., 27., 37.]]],
+                        [[0.],
+                         [4.]],
 
+                        [[2.],
+                         [0.]]],
 
-                       [[[17., 27., 37.],
-                         [18., 28., 38.]],
 
-                        [[18., 28., 38.],
-                         [19., 29., 39.]],
+                       [[[1.],
+                         [0.]],
 
-                        [[19., 29., 39.],
-                         [20., 30., 40.]],
+                        [[2.],
+                         [4.]],
 
-                        [[20., 30., 40.],
-                         [21., 31., 41.]]],
-
-
-                       [[[21., 31., 41.],
-                         [22., 32., 42.]],
-
-                        [[22., 32., 42.],
-                         [23., 33., 43.]],
-
-                        [[23., 33., 43.],
-                         [24., 34., 44.]],
-
-                        [[24., 34., 44.],
-                         [25., 35., 45.]]]]]), size=(2, 3, 4, 9), nnz=4,
+                        [[3.],
+                         [0.]]]]]), size=(2, 3, 2, 3), nnz=3,
        layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], dtype=torch.int32)
 # _values
-tensor([[[[[ 1., 11., 21.],
-           [ 2., 12., 22.]],
-
-          [[ 2., 12., 22.],
-           [ 3., 13., 23.]],
-
-          [[ 3., 13., 23.],
-           [ 4., 14., 24.]],
-
-          [[ 4., 14., 24.],
-           [ 5., 15., 25.]]],
+tensor([[[[[1.],
+           [3.]],
 
+          [[2.],
+           [0.]],
 
-         [[[ 5., 15., 25.],
-           [ 6., 16., 26.]],
+          [[0.],
+           [4.]]],
 
-          [[ 6., 16., 26.],
-           [ 7., 17., 27.]],
 
-          [[ 7., 17., 27.],
-           [ 8., 18., 28.]],
+         [[[1.],
+           [4.]],
 
-          [[ 8., 18., 28.],
-           [ 9., 19., 29.]]],
+          [[2.],
+           [0.]],
 
+          [[3.],
+           [0.]]],
 
-         [[[ 9., 19., 29.],
-           [10., 20., 30.]],
 
-          [[10., 20., 30.],
-           [11., 21., 31.]],
+         [[[1.],
+           [2.]],
 
-          [[11., 21., 31.],
-           [12., 22., 32.]],
+          [[0.],
+           [3.]],
 
-          [[12., 22., 32.],
-           [13., 23., 33.]]]],
+          [[0.],
+           [4.]]]],
 
 
-        [[[[13., 23., 33.],
-           [14., 24., 34.]],
+        [[[[0.],
+           [2.]],
 
-          [[14., 24., 34.],
-           [15., 25., 35.]],
+          [[1.],
+           [3.]],
 
-          [[15., 25., 35.],
-           [16., 26., 36.]],
+          [[0.],
+           [4.]]],
 
-          [[16., 26., 36.],
-           [17., 27., 37.]]],
 
+         [[[1.],
+           [3.]],
 
-         [[[17., 27., 37.],
-           [18., 28., 38.]],
+          [[0.],
+           [4.]],
 
-          [[18., 28., 38.],
-           [19., 29., 39.]],
-
-          [[19., 29., 39.],
-           [20., 30., 40.]],
-
-          [[20., 30., 40.],
-           [21., 31., 41.]]],
-
-
-         [[[21., 31., 41.],
-           [22., 32., 42.]],
-
-          [[22., 32., 42.],
-           [23., 33., 43.]],
-
-          [[23., 33., 43.],
-           [24., 34., 44.]],
-
-          [[24., 34., 44.],
-           [25., 35., 45.]]]]])
-
-
-########## torch.float64/torch.int32/size=()+(4, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[1.],
-                       [2.]],
-
-                      [[2.],
-                       [3.]],
-
-                      [[3.],
-                       [4.]],
-
-                      [[4.],
-                       [5.]]]), size=(4, 3), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[1.],
-         [2.]],
+          [[2.],
+           [0.]]],
 
-        [[2.],
-         [3.]],
 
-        [[3.],
-         [4.]],
+         [[[1.],
+           [0.]],
 
-        [[4.],
-         [5.]]], dtype=torch.float64)
+          [[2.],
+           [4.]],
 
-########## torch.float64/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 2, 1)), size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0], dtype=torch.int32)
-# _col_indices
-tensor([], dtype=torch.int32)
-# _values
-tensor([], size=(0, 2, 1), dtype=torch.float64)
+          [[3.],
+           [0.]]]]])
 
-########## torch.float64/torch.int32/size=(2,)+(2, 6)+() ##########
+########## torch.float32/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[ 1., 11.]],
-
-                       [[ 2., 12.]],
-
-                       [[ 3., 13.]],
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                       [[ 4., 14.]]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-                      [[[ 5., 15.]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-                       [[ 6., 16.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                       [[ 7., 17.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-                       [[ 8., 18.]]]]), size=(2, 2, 6), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsr)
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), size=(8, 6), nnz=7,
+       layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], dtype=torch.int32)
+tensor([0, 2, 3, 5, 7], dtype=torch.int32)
 # _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], dtype=torch.int32)
+tensor([0, 1, 0, 0, 1, 0, 1], dtype=torch.int32)
 # _values
-tensor([[[[ 1., 11.]],
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-         [[ 2., 12.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-         [[ 3., 13.]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
-         [[ 4., 14.]]],
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-        [[[ 5., 15.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-         [[ 6., 16.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]])
 
-         [[ 7., 17.]],
 
-         [[ 8., 18.]]]], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2, 3)+(4, 9)+() ##########
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11., 21.],
-                         [ 2., 12., 22.]],
-
-                        [[ 2., 12., 22.],
-                         [ 3., 13., 23.]],
-
-                        [[ 3., 13., 23.],
-                         [ 4., 14., 24.]],
-
-                        [[ 4., 14., 24.],
-                         [ 5., 15., 25.]]],
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                       [[[ 5., 15., 25.],
-                         [ 6., 16., 26.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-                        [[ 6., 16., 26.],
-                         [ 7., 17., 27.]],
+                        [[2.],
+                         [0.]],
 
-                        [[ 7., 17., 27.],
-                         [ 8., 18., 28.]],
+                        [[0.],
+                         [4.]]],
 
-                        [[ 8., 18., 28.],
-                         [ 9., 19., 29.]]],
 
+                       [[[1.],
+                         [4.]],
 
-                       [[[ 9., 19., 29.],
-                         [10., 20., 30.]],
+                        [[2.],
+                         [0.]],
 
-                        [[10., 20., 30.],
-                         [11., 21., 31.]],
+                        [[3.],
+                         [0.]]],
 
-                        [[11., 21., 31.],
-                         [12., 22., 32.]],
 
-                        [[12., 22., 32.],
-                         [13., 23., 33.]]]],
+                       [[[1.],
+                         [2.]],
 
+                        [[0.],
+                         [3.]],
 
+                        [[0.],
+                         [4.]]]],
 
-                      [[[[13., 23., 33.],
-                         [14., 24., 34.]],
 
-                        [[14., 24., 34.],
-                         [15., 25., 35.]],
 
-                        [[15., 25., 35.],
-                         [16., 26., 36.]],
+                      [[[[0.],
+                         [2.]],
 
-                        [[16., 26., 36.],
-                         [17., 27., 37.]]],
+                        [[1.],
+                         [3.]],
 
+                        [[0.],
+                         [4.]]],
 
-                       [[[17., 27., 37.],
-                         [18., 28., 38.]],
 
-                        [[18., 28., 38.],
-                         [19., 29., 39.]],
+                       [[[1.],
+                         [3.]],
 
-                        [[19., 29., 39.],
-                         [20., 30., 40.]],
+                        [[0.],
+                         [4.]],
 
-                        [[20., 30., 40.],
-                         [21., 31., 41.]]],
+                        [[2.],
+                         [0.]]],
 
 
-                       [[[21., 31., 41.],
-                         [22., 32., 42.]],
+                       [[[1.],
+                         [0.]],
 
-                        [[22., 32., 42.],
-                         [23., 33., 43.]],
+                        [[2.],
+                         [4.]],
 
-                        [[23., 33., 43.],
-                         [24., 34., 44.]],
-
-                        [[24., 34., 44.],
-                         [25., 35., 45.]]]]]), size=(2, 3, 4, 9), nnz=4,
+                        [[3.],
+                         [0.]]]]]), size=(2, 3, 2, 3), nnz=3,
        dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], dtype=torch.int32)
 # _values
-tensor([[[[[ 1., 11., 21.],
-           [ 2., 12., 22.]],
-
-          [[ 2., 12., 22.],
-           [ 3., 13., 23.]],
-
-          [[ 3., 13., 23.],
-           [ 4., 14., 24.]],
-
-          [[ 4., 14., 24.],
-           [ 5., 15., 25.]]],
-
-
-         [[[ 5., 15., 25.],
-           [ 6., 16., 26.]],
-
-          [[ 6., 16., 26.],
-           [ 7., 17., 27.]],
-
-          [[ 7., 17., 27.],
-           [ 8., 18., 28.]],
-
-          [[ 8., 18., 28.],
-           [ 9., 19., 29.]]],
-
-
-         [[[ 9., 19., 29.],
-           [10., 20., 30.]],
-
-          [[10., 20., 30.],
-           [11., 21., 31.]],
-
-          [[11., 21., 31.],
-           [12., 22., 32.]],
-
-          [[12., 22., 32.],
-           [13., 23., 33.]]]],
-
-
+tensor([[[[[1.],
+           [3.]],
 
-        [[[[13., 23., 33.],
-           [14., 24., 34.]],
+          [[2.],
+           [0.]],
 
-          [[14., 24., 34.],
-           [15., 25., 35.]],
+          [[0.],
+           [4.]]],
 
-          [[15., 25., 35.],
-           [16., 26., 36.]],
 
-          [[16., 26., 36.],
-           [17., 27., 37.]]],
+         [[[1.],
+           [4.]],
 
+          [[2.],
+           [0.]],
 
-         [[[17., 27., 37.],
-           [18., 28., 38.]],
+          [[3.],
+           [0.]]],
 
-          [[18., 28., 38.],
-           [19., 29., 39.]],
 
-          [[19., 29., 39.],
-           [20., 30., 40.]],
+         [[[1.],
+           [2.]],
 
-          [[20., 30., 40.],
-           [21., 31., 41.]]],
+          [[0.],
+           [3.]],
 
+          [[0.],
+           [4.]]]],
 
-         [[[21., 31., 41.],
-           [22., 32., 42.]],
 
-          [[22., 32., 42.],
-           [23., 33., 43.]],
 
-          [[23., 33., 43.],
-           [24., 34., 44.]],
+        [[[[0.],
+           [2.]],
 
-          [[24., 34., 44.],
-           [25., 35., 45.]]]]], dtype=torch.float64)
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
-########## torch.float32/torch.int64/size=()+(4, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[1.],
-                       [2.]],
 
-                      [[2.],
-                       [3.]],
+         [[[1.],
+           [3.]],
 
-                      [[3.],
-                       [4.]],
+          [[0.],
+           [4.]],
 
-                      [[4.],
-                       [5.]]]), size=(4, 3), nnz=4, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4])
-# _col_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[1.],
-         [2.]],
+          [[2.],
+           [0.]]],
 
-        [[2.],
-         [3.]],
 
-        [[3.],
-         [4.]],
+         [[[1.],
+           [0.]],
 
-        [[4.],
-         [5.]]])
+          [[2.],
+           [4.]],
 
-########## torch.float32/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 2, 1)), size=(0, 0), nnz=0,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0])
-# _col_indices
-tensor([], dtype=torch.int64)
-# _values
-tensor([], size=(0, 2, 1))
+          [[3.],
+           [0.]]]]], dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=(2,)+(2, 6)+() ##########
+########## torch.float64/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[ 1., 11.]],
-
-                       [[ 2., 12.]],
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                       [[ 3., 13.]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-                       [[ 4., 14.]]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-                      [[[ 5., 15.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                       [[ 6., 16.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-                       [[ 7., 17.]],
-
-                       [[ 8., 18.]]]]), size=(2, 2, 6), nnz=4,
-       layout=torch.sparse_bsr)
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), size=(8, 6), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]])
+tensor([0, 2, 3, 5, 7], dtype=torch.int32)
 # _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]])
+tensor([0, 1, 0, 0, 1, 0, 1], dtype=torch.int32)
 # _values
-tensor([[[[ 1., 11.]],
-
-         [[ 2., 12.]],
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-         [[ 3., 13.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-         [[ 4., 14.]]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
-        [[[ 5., 15.]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-         [[ 6., 16.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-         [[ 7., 17.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], dtype=torch.float64)
 
-         [[ 8., 18.]]]])
 
-########## torch.float32/torch.int64/size=(2, 3)+(4, 9)+() ##########
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11., 21.],
-                         [ 2., 12., 22.]],
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                        [[ 2., 12., 22.],
-                         [ 3., 13., 23.]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                        [[ 3., 13., 23.],
-                         [ 4., 14., 24.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-                        [[ 4., 14., 24.],
-                         [ 5., 15., 25.]]],
+                        [[2.],
+                         [0.]],
 
+                        [[0.],
+                         [4.]]],
 
-                       [[[ 5., 15., 25.],
-                         [ 6., 16., 26.]],
 
-                        [[ 6., 16., 26.],
-                         [ 7., 17., 27.]],
+                       [[[1.],
+                         [4.]],
 
-                        [[ 7., 17., 27.],
-                         [ 8., 18., 28.]],
+                        [[2.],
+                         [0.]],
 
-                        [[ 8., 18., 28.],
-                         [ 9., 19., 29.]]],
+                        [[3.],
+                         [0.]]],
 
 
-                       [[[ 9., 19., 29.],
-                         [10., 20., 30.]],
+                       [[[1.],
+                         [2.]],
 
-                        [[10., 20., 30.],
-                         [11., 21., 31.]],
+                        [[0.],
+                         [3.]],
 
-                        [[11., 21., 31.],
-                         [12., 22., 32.]],
+                        [[0.],
+                         [4.]]]],
 
-                        [[12., 22., 32.],
-                         [13., 23., 33.]]]],
 
 
+                      [[[[0.],
+                         [2.]],
 
-                      [[[[13., 23., 33.],
-                         [14., 24., 34.]],
+                        [[1.],
+                         [3.]],
 
-                        [[14., 24., 34.],
-                         [15., 25., 35.]],
+                        [[0.],
+                         [4.]]],
 
-                        [[15., 25., 35.],
-                         [16., 26., 36.]],
 
-                        [[16., 26., 36.],
-                         [17., 27., 37.]]],
+                       [[[1.],
+                         [3.]],
 
+                        [[0.],
+                         [4.]],
 
-                       [[[17., 27., 37.],
-                         [18., 28., 38.]],
+                        [[2.],
+                         [0.]]],
 
-                        [[18., 28., 38.],
-                         [19., 29., 39.]],
 
-                        [[19., 29., 39.],
-                         [20., 30., 40.]],
+                       [[[1.],
+                         [0.]],
 
-                        [[20., 30., 40.],
-                         [21., 31., 41.]]],
+                        [[2.],
+                         [4.]],
 
-
-                       [[[21., 31., 41.],
-                         [22., 32., 42.]],
-
-                        [[22., 32., 42.],
-                         [23., 33., 43.]],
-
-                        [[23., 33., 43.],
-                         [24., 34., 44.]],
-
-                        [[24., 34., 44.],
-                         [25., 35., 45.]]]]]), size=(2, 3, 4, 9), nnz=4,
+                        [[3.],
+                         [0.]]]]]), size=(2, 3, 2, 3), nnz=3,
        layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
+        [[0, 3],
+         [0, 3],
+         [0, 3]]])
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]])
 # _values
-tensor([[[[[ 1., 11., 21.],
-           [ 2., 12., 22.]],
-
-          [[ 2., 12., 22.],
-           [ 3., 13., 23.]],
-
-          [[ 3., 13., 23.],
-           [ 4., 14., 24.]],
-
-          [[ 4., 14., 24.],
-           [ 5., 15., 25.]]],
-
-
-         [[[ 5., 15., 25.],
-           [ 6., 16., 26.]],
-
-          [[ 6., 16., 26.],
-           [ 7., 17., 27.]],
-
-          [[ 7., 17., 27.],
-           [ 8., 18., 28.]],
-
-          [[ 8., 18., 28.],
-           [ 9., 19., 29.]]],
-
-
-         [[[ 9., 19., 29.],
-           [10., 20., 30.]],
-
-          [[10., 20., 30.],
-           [11., 21., 31.]],
-
-          [[11., 21., 31.],
-           [12., 22., 32.]],
+tensor([[[[[1.],
+           [3.]],
 
-          [[12., 22., 32.],
-           [13., 23., 33.]]]],
+          [[2.],
+           [0.]],
 
+          [[0.],
+           [4.]]],
 
 
-        [[[[13., 23., 33.],
-           [14., 24., 34.]],
+         [[[1.],
+           [4.]],
 
-          [[14., 24., 34.],
-           [15., 25., 35.]],
+          [[2.],
+           [0.]],
 
-          [[15., 25., 35.],
-           [16., 26., 36.]],
+          [[3.],
+           [0.]]],
 
-          [[16., 26., 36.],
-           [17., 27., 37.]]],
 
+         [[[1.],
+           [2.]],
 
-         [[[17., 27., 37.],
-           [18., 28., 38.]],
+          [[0.],
+           [3.]],
 
-          [[18., 28., 38.],
-           [19., 29., 39.]],
+          [[0.],
+           [4.]]]],
 
-          [[19., 29., 39.],
-           [20., 30., 40.]],
 
-          [[20., 30., 40.],
-           [21., 31., 41.]]],
 
+        [[[[0.],
+           [2.]],
 
-         [[[21., 31., 41.],
-           [22., 32., 42.]],
+          [[1.],
+           [3.]],
 
-          [[22., 32., 42.],
-           [23., 33., 43.]],
+          [[0.],
+           [4.]]],
 
-          [[23., 33., 43.],
-           [24., 34., 44.]],
 
-          [[24., 34., 44.],
-           [25., 35., 45.]]]]])
+         [[[1.],
+           [3.]],
 
+          [[0.],
+           [4.]],
 
-########## torch.float64/torch.int64/size=()+(4, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[1.],
-                       [2.]],
-
-                      [[2.],
-                       [3.]],
-
-                      [[3.],
-                       [4.]],
-
-                      [[4.],
-                       [5.]]]), size=(4, 3), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4])
-# _col_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[1.],
-         [2.]],
+          [[2.],
+           [0.]]],
 
-        [[2.],
-         [3.]],
 
-        [[3.],
-         [4.]],
+         [[[1.],
+           [0.]],
 
-        [[4.],
-         [5.]]], dtype=torch.float64)
+          [[2.],
+           [4.]],
 
-########## torch.float64/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 2, 1)), size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0])
-# _col_indices
-tensor([], dtype=torch.int64)
-# _values
-tensor([], size=(0, 2, 1), dtype=torch.float64)
+          [[3.],
+           [0.]]]]])
 
-########## torch.float64/torch.int64/size=(2,)+(2, 6)+() ##########
+########## torch.float32/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[ 1., 11.]],
-
-                       [[ 2., 12.]],
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                       [[ 3., 13.]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-                       [[ 4., 14.]]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-                      [[[ 5., 15.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                       [[ 6., 16.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-                       [[ 7., 17.]],
-
-                       [[ 8., 18.]]]]), size=(2, 2, 6), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsr)
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), size=(8, 6), nnz=7,
+       layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]])
+tensor([0, 2, 3, 5, 7])
 # _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]])
+tensor([0, 1, 0, 0, 1, 0, 1])
 # _values
-tensor([[[[ 1., 11.]],
-
-         [[ 2., 12.]],
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-         [[ 3., 13.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-         [[ 4., 14.]]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
-        [[[ 5., 15.]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-         [[ 6., 16.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-         [[ 7., 17.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]])
 
-         [[ 8., 18.]]]], dtype=torch.float64)
 
-########## torch.float64/torch.int64/size=(2, 3)+(4, 9)+() ##########
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11., 21.],
-                         [ 2., 12., 22.]],
-
-                        [[ 2., 12., 22.],
-                         [ 3., 13., 23.]],
-
-                        [[ 3., 13., 23.],
-                         [ 4., 14., 24.]],
-
-                        [[ 4., 14., 24.],
-                         [ 5., 15., 25.]]],
-
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                       [[[ 5., 15., 25.],
-                         [ 6., 16., 26.]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                        [[ 6., 16., 26.],
-                         [ 7., 17., 27.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-                        [[ 7., 17., 27.],
-                         [ 8., 18., 28.]],
+                        [[2.],
+                         [0.]],
 
-                        [[ 8., 18., 28.],
-                         [ 9., 19., 29.]]],
+                        [[0.],
+                         [4.]]],
 
 
-                       [[[ 9., 19., 29.],
-                         [10., 20., 30.]],
+                       [[[1.],
+                         [4.]],
 
-                        [[10., 20., 30.],
-                         [11., 21., 31.]],
+                        [[2.],
+                         [0.]],
 
-                        [[11., 21., 31.],
-                         [12., 22., 32.]],
+                        [[3.],
+                         [0.]]],
 
-                        [[12., 22., 32.],
-                         [13., 23., 33.]]]],
 
+                       [[[1.],
+                         [2.]],
 
+                        [[0.],
+                         [3.]],
 
-                      [[[[13., 23., 33.],
-                         [14., 24., 34.]],
+                        [[0.],
+                         [4.]]]],
 
-                        [[14., 24., 34.],
-                         [15., 25., 35.]],
 
-                        [[15., 25., 35.],
-                         [16., 26., 36.]],
 
-                        [[16., 26., 36.],
-                         [17., 27., 37.]]],
+                      [[[[0.],
+                         [2.]],
 
+                        [[1.],
+                         [3.]],
 
-                       [[[17., 27., 37.],
-                         [18., 28., 38.]],
+                        [[0.],
+                         [4.]]],
 
-                        [[18., 28., 38.],
-                         [19., 29., 39.]],
 
-                        [[19., 29., 39.],
-                         [20., 30., 40.]],
+                       [[[1.],
+                         [3.]],
 
-                        [[20., 30., 40.],
-                         [21., 31., 41.]]],
+                        [[0.],
+                         [4.]],
 
+                        [[2.],
+                         [0.]]],
 
-                       [[[21., 31., 41.],
-                         [22., 32., 42.]],
 
-                        [[22., 32., 42.],
-                         [23., 33., 43.]],
+                       [[[1.],
+                         [0.]],
 
-                        [[23., 33., 43.],
-                         [24., 34., 44.]],
+                        [[2.],
+                         [4.]],
 
-                        [[24., 34., 44.],
-                         [25., 35., 45.]]]]]), size=(2, 3, 4, 9), nnz=4,
+                        [[3.],
+                         [0.]]]]]), size=(2, 3, 2, 3), nnz=3,
        dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
+        [[0, 3],
+         [0, 3],
+         [0, 3]]])
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]])
 # _values
-tensor([[[[[ 1., 11., 21.],
-           [ 2., 12., 22.]],
+tensor([[[[[1.],
+           [3.]],
 
-          [[ 2., 12., 22.],
-           [ 3., 13., 23.]],
+          [[2.],
+           [0.]],
 
-          [[ 3., 13., 23.],
-           [ 4., 14., 24.]],
+          [[0.],
+           [4.]]],
 
-          [[ 4., 14., 24.],
-           [ 5., 15., 25.]]],
 
+         [[[1.],
+           [4.]],
 
-         [[[ 5., 15., 25.],
-           [ 6., 16., 26.]],
+          [[2.],
+           [0.]],
 
-          [[ 6., 16., 26.],
-           [ 7., 17., 27.]],
+          [[3.],
+           [0.]]],
 
-          [[ 7., 17., 27.],
-           [ 8., 18., 28.]],
 
-          [[ 8., 18., 28.],
-           [ 9., 19., 29.]]],
+         [[[1.],
+           [2.]],
 
+          [[0.],
+           [3.]],
 
-         [[[ 9., 19., 29.],
-           [10., 20., 30.]],
+          [[0.],
+           [4.]]]],
 
-          [[10., 20., 30.],
-           [11., 21., 31.]],
 
-          [[11., 21., 31.],
-           [12., 22., 32.]],
 
-          [[12., 22., 32.],
-           [13., 23., 33.]]]],
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
-        [[[[13., 23., 33.],
-           [14., 24., 34.]],
 
-          [[14., 24., 34.],
-           [15., 25., 35.]],
+         [[[1.],
+           [3.]],
 
-          [[15., 25., 35.],
-           [16., 26., 36.]],
+          [[0.],
+           [4.]],
 
-          [[16., 26., 36.],
-           [17., 27., 37.]]],
+          [[2.],
+           [0.]]],
 
 
-         [[[17., 27., 37.],
-           [18., 28., 38.]],
+         [[[1.],
+           [0.]],
 
-          [[18., 28., 38.],
-           [19., 29., 39.]],
+          [[2.],
+           [4.]],
 
-          [[19., 29., 39.],
-           [20., 30., 40.]],
+          [[3.],
+           [0.]]]]], dtype=torch.float64)
 
-          [[20., 30., 40.],
-           [21., 31., 41.]]],
-
-
-         [[[21., 31., 41.],
-           [22., 32., 42.]],
-
-          [[22., 32., 42.],
-           [23., 33., 43.]],
-
-          [[23., 33., 43.],
-           [24., 34., 44.]],
-
-          [[24., 34., 44.],
-           [25., 35., 45.]]]]], dtype=torch.float64)
-
-
-########## torch.float32/torch.int32/size=()+(6, 6)+(2,) ##########
+########## torch.float64/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.]],
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                       [[  3., 103.],
-                        [ 13., 113.]]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-                      [[[  2., 102.],
-                        [ 12., 112.]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-                       [[  3., 103.],
-                        [ 13., 113.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                       [[  4., 104.],
-                        [ 14., 114.]]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-
-                      [[[  3., 103.],
-                        [ 13., 113.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]],
-
-                       [[  6., 106.],
-                        [ 16., 116.]]]]), size=(6, 6, 2), nnz=4,
-       layout=torch.sparse_bsr)
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), size=(8, 6), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
+tensor([0, 2, 3, 5, 7])
 # _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
+tensor([0, 1, 0, 0, 1, 0, 1])
 # _values
-tensor([[[[  1., 101.],
-          [ 11., 111.]],
-
-         [[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]]],
-
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-        [[[  3., 103.],
-          [ 13., 113.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-         [[  4., 104.],
-          [ 14., 114.]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
-         [[  5., 105.],
-          [ 15., 115.]]],
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-        [[[  4., 104.],
-          [ 14., 114.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-         [[  5., 105.],
-          [ 15., 115.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], dtype=torch.float64)
 
-         [[  6., 106.],
-          [ 16., 116.]]]])
 
-########## torch.float32/torch.int32/size=()+(4, 9)+(4, 2) ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                        [[2.1000e+01, 1.0210e+03],
-                         [1.2100e+02, 1.1210e+03],
-                         [2.2100e+02, 1.2210e+03],
-                         [3.2100e+02, 1.3210e+03]]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
+                        [[[2., 3., 4., 5.]],
 
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]]],
+                         [[0., 0., 0., 0.]]],
 
 
+                        [[[0., 0., 0., 0.]],
 
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
+                         [[4., 5., 6., 7.]]]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
 
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+                         [[4., 5., 6., 7.]]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
 
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]]],
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
 
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+                        [[[3., 4., 5., 6.]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
+                         [[0., 0., 0., 0.]]]],
 
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]],
 
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
+                       [[[[1., 2., 3., 4.]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
+                         [[2., 3., 4., 5.]]],
 
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
+                        [[[0., 0., 0., 0.]],
 
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]],
+                         [[4., 5., 6., 7.]]]]],
 
 
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
 
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]],
 
-                        [[2.5000e+01, 1.0250e+03],
-                         [1.2500e+02, 1.1250e+03],
-                         [2.2500e+02, 1.2250e+03],
-                         [3.2500e+02, 1.3250e+03]]]]]), size=(4, 9, 4, 2),
-       nnz=4, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]],
-
-          [[2.1000e+01, 1.0210e+03],
-           [1.2100e+02, 1.1210e+03],
-           [2.2100e+02, 1.2210e+03],
-           [3.2100e+02, 1.3210e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
+                      [[[[[0., 0., 0., 0.]],
 
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]]],
+                         [[2., 3., 4., 5.]]],
 
 
+                        [[[1., 2., 3., 4.]],
 
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+                         [[3., 4., 5., 6.]]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
 
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
 
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
 
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
+                         [[4., 5., 6., 7.]]],
 
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]],
 
+                        [[[2., 3., 4., 5.]],
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                         [[0., 0., 0., 0.]]]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
 
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]]],
 
+                       [[[[1., 2., 3., 4.]],
 
+                         [[0., 0., 0., 0.]]],
 
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
+                        [[[2., 3., 4., 5.]],
 
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]],
+                         [[4., 5., 6., 7.]]],
 
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
+                        [[[3., 4., 5., 6.]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]],
-
-          [[2.5000e+01, 1.0250e+03],
-           [1.2500e+02, 1.1250e+03],
-           [2.2500e+02, 1.2250e+03],
-           [3.2500e+02, 1.3250e+03]]]]])
-
-########## torch.float32/torch.int32/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
+                         [[0., 0., 0., 0.]]]]]]), size=(2, 3, 2, 3, 4), nnz=3,
+       layout=torch.sparse_bsr)
+# _crow_indices
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-                          [[ 11.],
-                           [111.]]],
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], dtype=torch.int32)
+# _col_indices
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], dtype=torch.int32)
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
-                         [[[  2.],
-                           [102.]],
+           [[3., 4., 5., 6.]]],
 
-                          [[ 12.],
-                           [112.]]],
 
+          [[[2., 3., 4., 5.]],
 
-                         [[[  3.],
-                           [103.]],
+           [[0., 0., 0., 0.]]],
 
-                          [[ 13.],
-                           [113.]]]],
 
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]],
 
-                        [[[[  2.],
-                           [102.]],
 
-                          [[ 12.],
-                           [112.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  3.],
-                           [103.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 13.],
-                           [113.]]],
 
+          [[[2., 3., 4., 5.]],
 
-                         [[[  4.],
-                           [104.]],
+           [[0., 0., 0., 0.]]],
 
-                          [[ 14.],
-                           [114.]]]],
 
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]],
 
-                        [[[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  4.],
-                           [104.]],
+           [[2., 3., 4., 5.]]],
 
-                          [[ 14.],
-                           [114.]]],
 
+          [[[0., 0., 0., 0.]],
 
-                         [[[  5.],
-                           [105.]],
+           [[3., 4., 5., 6.]]],
 
-                          [[ 15.],
-                           [115.]]]],
 
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]]],
 
-                        [[[[  4.],
-                           [104.]],
 
-                          [[ 14.],
-                           [114.]]],
 
 
-                         [[[  5.],
-                           [105.]],
+        [[[[[0., 0., 0., 0.]],
 
-                          [[ 15.],
-                           [115.]]],
+           [[2., 3., 4., 5.]]],
 
 
-                         [[[  6.],
-                           [106.]],
+          [[[1., 2., 3., 4.]],
 
-                          [[ 16.],
-                           [116.]]]]],
+           [[3., 4., 5., 6.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]],
 
-                       [[[[[  5.],
-                           [105.]],
 
-                          [[ 15.],
-                           [115.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  6.],
-                           [106.]],
+           [[3., 4., 5., 6.]]],
 
-                          [[ 16.],
-                           [116.]]],
 
+          [[[0., 0., 0., 0.]],
 
-                         [[[  7.],
-                           [107.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 17.],
-                           [117.]]]],
 
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]]],
 
-                        [[[[  6.],
-                           [106.]],
 
-                          [[ 16.],
-                           [116.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  7.],
-                           [107.]],
+           [[0., 0., 0., 0.]]],
 
-                          [[ 17.],
-                           [117.]]],
 
+          [[[2., 3., 4., 5.]],
 
-                         [[[  8.],
-                           [108.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 18.],
-                           [118.]]]],
 
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]]]])
 
-                        [[[[  7.],
-                           [107.]],
+########## torch.float32/torch.int32/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 17.],
-                           [117.]]],
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                         [[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-                         [[[  9.],
-                           [109.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 19.],
-                           [119.]]]],
 
 
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-                        [[[[  8.],
-                           [108.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 18.],
-                           [118.]]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
 
-                         [[[  9.],
-                           [109.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 19.],
-                           [119.]]],
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                         [[[ 10.],
-                           [110.]],
 
-                          [[ 20.],
-                           [120.]]]]],
 
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
-                       [[[[[  9.],
-                           [109.]],
 
-                          [[ 19.],
-                           [119.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 10.],
-                           [110.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 20.],
-                           [120.]]],
 
 
-                         [[[ 11.],
-                           [111.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 21.],
-                           [121.]]]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-                        [[[[ 10.],
-                           [110.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-                          [[ 20.],
-                           [120.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-                         [[[ 11.],
-                           [111.]],
 
-                          [[ 21.],
-                           [121.]]],
 
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-                         [[[ 12.],
-                           [112.]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-                          [[ 22.],
-                           [122.]]]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-                        [[[[ 11.],
-                           [111.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-                          [[ 21.],
-                           [121.]]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
 
-                         [[[ 12.],
-                           [112.]],
 
-                          [[ 22.],
-                           [122.]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 13.],
-                           [113.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                          [[ 23.],
-                           [123.]]]],
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-                        [[[[ 12.],
-                           [112.]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
-                          [[ 22.],
-                           [122.]]],
 
 
-                         [[[ 13.],
-                           [113.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 23.],
-                           [123.]]],
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-                         [[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), size=(8, 6, 4, 2), nnz=7,
+       layout=torch.sparse_bsr)
+# _crow_indices
+tensor([0, 2, 3, 5, 7], dtype=torch.int32)
+# _col_indices
+tensor([0, 1, 0, 0, 1, 0, 1], dtype=torch.int32)
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                      [[[[[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-                         [[[ 14.],
-                           [114.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 24.],
-                           [124.]]],
 
 
-                         [[[ 15.],
-                           [115.]],
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-                          [[ 25.],
-                           [125.]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-                        [[[[ 14.],
-                           [114.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 24.],
-                           [124.]]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                         [[[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]],
 
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-                         [[[ 16.],
-                           [116.]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-                          [[ 26.],
-                           [126.]]]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 15.],
-                           [115.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 25.],
-                           [125.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-                         [[[ 16.],
-                           [116.]],
 
-                          [[ 26.],
-                           [126.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-                         [[[ 17.],
-                           [117.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 27.],
-                           [127.]]]],
 
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 16.],
-                           [116.]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-                          [[ 26.],
-                           [126.]]],
 
 
-                         [[[ 17.],
-                           [117.]],
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-                          [[ 27.],
-                           [127.]]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-                         [[[ 18.],
-                           [118.]],
 
-                          [[ 28.],
-                           [128.]]]]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-                       [[[[[ 17.],
-                           [117.]],
 
-                          [[ 27.],
-                           [127.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 18.],
-                           [118.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 28.],
-                           [128.]]],
 
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
-                         [[[ 19.],
-                           [119.]],
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-                          [[ 29.],
-                           [129.]]]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
 
-                        [[[[ 18.],
-                           [118.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 28.],
-                           [128.]]],
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-                         [[[ 19.],
-                           [119.]],
 
-                          [[ 29.],
-                           [129.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-                         [[[ 20.],
-                           [120.]],
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]])
 
-                          [[ 30.],
-                           [130.]]]],
 
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                        [[[[ 19.],
-                           [119.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-                          [[ 29.],
-                           [129.]]],
+                         [[3., 4., 5., 6.]]],
 
 
-                         [[[ 20.],
-                           [120.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 30.],
-                           [130.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[ 21.],
-                           [121.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 31.],
-                           [131.]]]],
+                         [[4., 5., 6., 7.]]]],
 
 
-                        [[[[ 20.],
-                           [120.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 30.],
-                           [130.]]],
+                         [[4., 5., 6., 7.]]],
 
 
-                         [[[ 21.],
-                           [121.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 31.],
-                           [131.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[ 22.],
-                           [122.]],
+                        [[[3., 4., 5., 6.]],
 
-                          [[ 32.],
-                           [132.]]]]],
+                         [[0., 0., 0., 0.]]]],
 
 
+                       [[[[1., 2., 3., 4.]],
 
-                       [[[[[ 21.],
-                           [121.]],
+                         [[2., 3., 4., 5.]]],
 
-                          [[ 31.],
-                           [131.]]],
 
+                        [[[0., 0., 0., 0.]],
 
-                         [[[ 22.],
-                           [122.]],
+                         [[3., 4., 5., 6.]]],
 
-                          [[ 32.],
-                           [132.]]],
 
+                        [[[0., 0., 0., 0.]],
 
-                         [[[ 23.],
-                           [123.]],
+                         [[4., 5., 6., 7.]]]]],
 
-                          [[ 33.],
-                           [133.]]]],
 
 
-                        [[[[ 22.],
-                           [122.]],
+                      [[[[[0., 0., 0., 0.]],
 
-                          [[ 32.],
-                           [132.]]],
+                         [[2., 3., 4., 5.]]],
 
 
-                         [[[ 23.],
-                           [123.]],
+                        [[[1., 2., 3., 4.]],
 
-                          [[ 33.],
-                           [133.]]],
+                         [[3., 4., 5., 6.]]],
 
 
-                         [[[ 24.],
-                           [124.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 34.],
-                           [134.]]]],
+                         [[4., 5., 6., 7.]]]],
 
 
-                        [[[[ 23.],
-                           [123.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 33.],
-                           [133.]]],
+                         [[3., 4., 5., 6.]]],
 
 
-                         [[[ 24.],
-                           [124.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 34.],
-                           [134.]]],
+                         [[4., 5., 6., 7.]]],
 
 
-                         [[[ 25.],
-                           [125.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 35.],
-                           [135.]]]],
+                         [[0., 0., 0., 0.]]]],
 
 
-                        [[[[ 24.],
-                           [124.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 34.],
-                           [134.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[ 25.],
-                           [125.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 35.],
-                           [135.]]],
+                         [[4., 5., 6., 7.]]],
 
 
-                         [[[ 26.],
-                           [126.]],
+                        [[[3., 4., 5., 6.]],
 
-                          [[ 36.],
-                           [136.]]]]]]]), size=(2, 3, 6, 6, 2, 1), nnz=4,
-       layout=torch.sparse_bsr)
+                         [[0., 0., 0., 0.]]]]]]), size=(2, 3, 2, 3, 4), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], dtype=torch.int32)
 # _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]]],
+tensor([[[[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
 
-          [[[[  2.],
-             [102.]],
+          [[[2., 3., 4., 5.]],
 
-            [[ 12.],
-             [112.]]],
+           [[0., 0., 0., 0.]]],
 
 
-           [[[  3.],
-             [103.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 13.],
-             [113.]]],
+           [[4., 5., 6., 7.]]]],
 
 
-           [[[  4.],
-             [104.]],
 
-            [[ 14.],
-             [114.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[4., 5., 6., 7.]]],
 
 
-          [[[[  3.],
-             [103.]],
+          [[[2., 3., 4., 5.]],
 
-            [[ 13.],
-             [113.]]],
+           [[0., 0., 0., 0.]]],
 
 
-           [[[  4.],
-             [104.]],
+          [[[3., 4., 5., 6.]],
 
-            [[ 14.],
-             [114.]]],
+           [[0., 0., 0., 0.]]]],
 
 
-           [[[  5.],
-             [105.]],
 
-            [[ 15.],
-             [115.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[2., 3., 4., 5.]]],
 
 
-          [[[[  4.],
-             [104.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 14.],
-             [114.]]],
+           [[3., 4., 5., 6.]]],
 
 
-           [[[  5.],
-             [105.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 15.],
-             [115.]]],
+           [[4., 5., 6., 7.]]]]],
 
 
-           [[[  6.],
-             [106.]],
 
-            [[ 16.],
-             [116.]]]]],
 
+        [[[[[0., 0., 0., 0.]],
 
+           [[2., 3., 4., 5.]]],
 
 
-         [[[[[  5.],
-             [105.]],
+          [[[1., 2., 3., 4.]],
 
-            [[ 15.],
-             [115.]]],
+           [[3., 4., 5., 6.]]],
 
 
-           [[[  6.],
-             [106.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 16.],
-             [116.]]],
+           [[4., 5., 6., 7.]]]],
 
 
-           [[[  7.],
-             [107.]],
 
-            [[ 17.],
-             [117.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
 
-          [[[[  6.],
-             [106.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 16.],
-             [116.]]],
+           [[4., 5., 6., 7.]]],
 
 
-           [[[  7.],
-             [107.]],
+          [[[2., 3., 4., 5.]],
 
-            [[ 17.],
-             [117.]]],
+           [[0., 0., 0., 0.]]]],
 
 
-           [[[  8.],
-             [108.]],
 
-            [[ 18.],
-             [118.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[0., 0., 0., 0.]]],
 
 
-          [[[[  7.],
-             [107.]],
+          [[[2., 3., 4., 5.]],
 
-            [[ 17.],
-             [117.]]],
+           [[4., 5., 6., 7.]]],
 
 
-           [[[  8.],
-             [108.]],
+          [[[3., 4., 5., 6.]],
 
-            [[ 18.],
-             [118.]]],
+           [[0., 0., 0., 0.]]]]]], dtype=torch.float64)
 
+########## torch.float64/torch.int32/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]]]],
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-         [[[[[  9.],
-             [109.]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-            [[ 19.],
-             [119.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-           [[[ 10.],
-             [110.]],
 
-            [[ 20.],
-             [120.]]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-           [[[ 11.],
-             [111.]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-            [[ 21.],
-             [121.]]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-          [[[[ 10.],
-             [110.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-            [[ 20.],
-             [120.]]],
 
 
-           [[[ 11.],
-             [111.]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
-            [[ 21.],
-             [121.]]],
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
-           [[[ 12.],
-             [112.]],
 
-            [[ 22.],
-             [122.]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-          [[[[ 11.],
-             [111.]],
 
-            [[ 21.],
-             [121.]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-           [[[ 12.],
-             [112.]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
-            [[ 22.],
-             [122.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-           [[[ 13.],
-             [113.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-            [[ 23.],
-             [123.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
 
-          [[[[ 12.],
-             [112.]],
 
-            [[ 22.],
-             [122.]]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-           [[[ 13.],
-             [113.]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
-            [[ 23.],
-             [123.]]],
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-           [[[ 14.],
-             [114.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-            [[ 24.],
-             [124.]]]]]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-        [[[[[[ 13.],
-             [113.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-            [[ 23.],
-             [123.]]],
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
-           [[[ 14.],
-             [114.]],
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-            [[ 24.],
-             [124.]]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
 
-           [[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-          [[[[ 14.],
-             [114.]],
 
-            [[ 24.],
-             [124.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-           [[[ 15.],
-             [115.]],
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), size=(8, 6, 4, 2), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsr)
+# _crow_indices
+tensor([0, 2, 3, 5, 7], dtype=torch.int32)
+# _col_indices
+tensor([0, 1, 0, 0, 1, 0, 1], dtype=torch.int32)
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 25.],
-             [125.]]],
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-           [[[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-          [[[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]]],
 
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-           [[[ 16.],
-             [116.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 26.],
-             [126.]]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-           [[[ 17.],
-             [117.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 27.],
-             [127.]]]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]]],
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-           [[[ 17.],
-             [117.]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-            [[ 27.],
-             [127.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-           [[[ 18.],
-             [118.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 28.],
-             [128.]]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-         [[[[[ 17.],
-             [117.]],
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-            [[ 27.],
-             [127.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-           [[[ 18.],
-             [118.]],
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
-            [[ 28.],
-             [128.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-           [[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]]]],
 
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
-          [[[[ 18.],
-             [118.]],
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-            [[ 28.],
-             [128.]]],
 
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-           [[[ 19.],
-             [119.]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
-            [[ 29.],
-             [129.]]],
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-           [[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-          [[[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-           [[[ 20.],
-             [120.]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
-            [[ 30.],
-             [130.]]],
 
 
-           [[[ 21.],
-             [121.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 31.],
-             [131.]]]],
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
 
-          [[[[ 20.],
-             [120.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 30.],
-             [130.]]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], dtype=torch.float64)
 
-           [[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]]],
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-           [[[ 22.],
-             [122.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-            [[ 32.],
-             [132.]]]]],
+                         [[3., 4., 5., 6.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
-         [[[[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
-           [[[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-           [[[ 23.],
-             [123.]],
+                         [[4., 5., 6., 7.]]],
 
-            [[ 33.],
-             [133.]]]],
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
-          [[[[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]],
 
-           [[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-           [[[ 24.],
-             [124.]],
+                         [[2., 3., 4., 5.]]],
 
-            [[ 34.],
-             [134.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[3., 4., 5., 6.]]],
 
-          [[[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]]],
 
-           [[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]]],
 
 
-           [[[ 25.],
-             [125.]],
+                      [[[[[0., 0., 0., 0.]],
 
-            [[ 35.],
-             [135.]]]],
+                         [[2., 3., 4., 5.]]],
 
 
+                        [[[1., 2., 3., 4.]],
 
-          [[[[ 24.],
-             [124.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 34.],
-             [134.]]],
 
+                        [[[0., 0., 0., 0.]],
 
-           [[[ 25.],
-             [125.]],
+                         [[4., 5., 6., 7.]]]],
 
-            [[ 35.],
-             [135.]]],
 
 
-           [[[ 26.],
-             [126.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 36.],
-             [136.]]]]]]])
+                         [[3., 4., 5., 6.]]],
 
 
-########## torch.float64/torch.int32/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.]],
+                        [[[0., 0., 0., 0.]],
 
-                       [[  2., 102.],
-                        [ 12., 112.]],
+                         [[4., 5., 6., 7.]]],
 
-                       [[  3., 103.],
-                        [ 13., 113.]]],
 
+                        [[[2., 3., 4., 5.]],
 
-                      [[[  2., 102.],
-                        [ 12., 112.]],
+                         [[0., 0., 0., 0.]]]],
 
-                       [[  3., 103.],
-                        [ 13., 113.]],
 
-                       [[  4., 104.],
-                        [ 14., 114.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-                      [[[  3., 103.],
-                        [ 13., 113.]],
+                         [[0., 0., 0., 0.]]],
 
-                       [[  4., 104.],
-                        [ 14., 114.]],
 
-                       [[  5., 105.],
-                        [ 15., 115.]]],
+                        [[[2., 3., 4., 5.]],
 
+                         [[4., 5., 6., 7.]]],
 
-                      [[[  4., 104.],
-                        [ 14., 114.]],
 
-                       [[  5., 105.],
-                        [ 15., 115.]],
+                        [[[3., 4., 5., 6.]],
 
-                       [[  6., 106.],
-                        [ 16., 116.]]]]), size=(6, 6, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsr)
+                         [[0., 0., 0., 0.]]]]]]), size=(2, 3, 2, 3, 4), nnz=3,
+       layout=torch.sparse_bsr)
 # _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.]],
-
-         [[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
+        [[0, 3],
+         [0, 3],
+         [0, 3]]])
+# _col_indices
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]]],
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]])
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-        [[[  3., 103.],
-          [ 13., 113.]],
 
-         [[  4., 104.],
-          [ 14., 114.]],
+          [[[2., 3., 4., 5.]],
 
-         [[  5., 105.],
-          [ 15., 115.]]],
+           [[0., 0., 0., 0.]]],
 
 
-        [[[  4., 104.],
-          [ 14., 114.]],
+          [[[0., 0., 0., 0.]],
 
-         [[  5., 105.],
-          [ 15., 115.]],
+           [[4., 5., 6., 7.]]]],
 
-         [[  6., 106.],
-          [ 16., 116.]]]], dtype=torch.float64)
 
-########## torch.float64/torch.int32/size=()+(4, 9)+(4, 2) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
 
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]],
+         [[[[1., 2., 3., 4.]],
 
-                        [[2.1000e+01, 1.0210e+03],
-                         [1.2100e+02, 1.1210e+03],
-                         [2.2100e+02, 1.2210e+03],
-                         [3.2100e+02, 1.3210e+03]]],
+           [[4., 5., 6., 7.]]],
 
 
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
+          [[[2., 3., 4., 5.]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
+           [[0., 0., 0., 0.]]],
 
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]]],
 
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]],
 
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
 
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[2., 3., 4., 5.]]],
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
+          [[[0., 0., 0., 0.]],
 
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]]],
+           [[3., 4., 5., 6.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+           [[4., 5., 6., 7.]]]]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
 
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]],
 
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
+        [[[[[0., 0., 0., 0.]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
+           [[2., 3., 4., 5.]]],
 
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]]],
 
+          [[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
+          [[[0., 0., 0., 0.]],
 
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]],
+           [[4., 5., 6., 7.]]]],
 
 
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
 
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]],
+         [[[[1., 2., 3., 4.]],
 
-                        [[2.5000e+01, 1.0250e+03],
-                         [1.2500e+02, 1.1250e+03],
-                         [2.2500e+02, 1.2250e+03],
-                         [3.2500e+02, 1.3250e+03]]]]]), size=(4, 9, 4, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
+           [[3., 4., 5., 6.]]],
 
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]],
 
-          [[2.1000e+01, 1.0210e+03],
-           [1.2100e+02, 1.1210e+03],
-           [2.2100e+02, 1.2210e+03],
-           [3.2100e+02, 1.3210e+03]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]],
 
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
+          [[[2., 3., 4., 5.]],
 
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]]],
+           [[0., 0., 0., 0.]]]],
 
 
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+         [[[[1., 2., 3., 4.]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
+           [[0., 0., 0., 0.]]],
 
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]],
 
+          [[[2., 3., 4., 5.]],
 
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+           [[4., 5., 6., 7.]]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
 
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]]],
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]]]])
 
+########## torch.float32/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]]],
 
 
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-          [[2.5000e+01, 1.0250e+03],
-           [1.2500e+02, 1.1250e+03],
-           [2.2500e+02, 1.2250e+03],
-           [3.2500e+02, 1.3250e+03]]]]], dtype=torch.float64)
 
-########## torch.float64/torch.int32/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
-                          [[ 11.],
-                           [111.]]],
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
 
-                         [[[  2.],
-                           [102.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 12.],
-                           [112.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                         [[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
-                        [[[[  2.],
-                           [102.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                          [[ 12.],
-                           [112.]]],
 
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-                         [[[  3.],
-                           [103.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 13.],
-                           [113.]]],
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
 
-                         [[[  4.],
-                           [104.]],
 
-                          [[ 14.],
-                           [114.]]]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
-                        [[[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]]],
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-                         [[[  4.],
-                           [104.]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
-                          [[ 14.],
-                           [114.]]],
 
 
-                         [[[  5.],
-                           [105.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 15.],
-                           [115.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-                        [[[[  4.],
-                           [104.]],
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
-                          [[ 14.],
-                           [114.]]],
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
-                         [[[  5.],
-                           [105.]],
 
-                          [[ 15.],
-                           [115.]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[  6.],
-                           [106.]],
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
-                          [[ 16.],
-                           [116.]]]]],
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-                       [[[[[  5.],
-                           [105.]],
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), size=(8, 6, 4, 2), nnz=7,
+       layout=torch.sparse_bsr)
+# _crow_indices
+tensor([0, 2, 3, 5, 7])
+# _col_indices
+tensor([0, 1, 0, 0, 1, 0, 1])
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 15.],
-                           [115.]]],
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                         [[[  6.],
-                           [106.]],
 
-                          [[ 16.],
-                           [116.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-                         [[[  7.],
-                           [107.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 17.],
-                           [117.]]]],
 
 
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-                        [[[[  6.],
-                           [106.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 16.],
-                           [116.]]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-                         [[[  7.],
-                           [107.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 17.],
-                           [117.]]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                         [[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]]]],
 
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-                        [[[[  7.],
-                           [107.]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-                          [[ 17.],
-                           [117.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[  8.],
-                           [108.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 18.],
-                           [118.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-                         [[[  9.],
-                           [109.]],
 
-                          [[ 19.],
-                           [119.]]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                        [[[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]]],
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[  9.],
-                           [109.]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-                          [[ 19.],
-                           [119.]]],
 
 
-                         [[[ 10.],
-                           [110.]],
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-                          [[ 20.],
-                           [120.]]]]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
 
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-                       [[[[[  9.],
-                           [109.]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
-                          [[ 19.],
-                           [119.]]],
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-                         [[[ 10.],
-                           [110.]],
 
-                          [[ 20.],
-                           [120.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 11.],
-                           [111.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 21.],
-                           [121.]]]],
 
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-                        [[[[ 10.],
-                           [110.]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
-                          [[ 20.],
-                           [120.]]],
 
 
-                         [[[ 11.],
-                           [111.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 21.],
-                           [121.]]],
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-                         [[[ 12.],
-                           [112.]],
 
-                          [[ 22.],
-                           [122.]]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]])
 
-                        [[[[ 11.],
-                           [111.]],
 
-                          [[ 21.],
-                           [121.]]],
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                         [[[ 12.],
-                           [112.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-                          [[ 22.],
-                           [122.]]],
+                         [[3., 4., 5., 6.]]],
 
 
-                         [[[ 13.],
-                           [113.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 23.],
-                           [123.]]]],
+                         [[0., 0., 0., 0.]]],
 
 
+                        [[[0., 0., 0., 0.]],
 
-                        [[[[ 12.],
-                           [112.]],
+                         [[4., 5., 6., 7.]]]],
 
-                          [[ 22.],
-                           [122.]]],
 
 
-                         [[[ 13.],
-                           [113.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 23.],
-                           [123.]]],
+                         [[4., 5., 6., 7.]]],
 
 
-                         [[[ 14.],
-                           [114.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 24.],
-                           [124.]]]]]],
+                         [[0., 0., 0., 0.]]],
 
 
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]],
 
 
-                      [[[[[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[2., 3., 4., 5.]]],
 
-                         [[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                         [[[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]]],
 
 
-                        [[[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]],
 
+                      [[[[[0., 0., 0., 0.]],
 
-                         [[[ 15.],
-                           [115.]],
+                         [[2., 3., 4., 5.]]],
 
-                          [[ 25.],
-                           [125.]]],
 
+                        [[[1., 2., 3., 4.]],
 
-                         [[[ 16.],
-                           [116.]],
+                         [[3., 4., 5., 6.]]],
 
-                          [[ 26.],
-                           [126.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
-                        [[[[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-                         [[[ 16.],
-                           [116.]],
+                         [[3., 4., 5., 6.]]],
 
-                          [[ 26.],
-                           [126.]]],
 
+                        [[[0., 0., 0., 0.]],
 
-                         [[[ 17.],
-                           [117.]],
+                         [[4., 5., 6., 7.]]],
 
-                          [[ 27.],
-                           [127.]]]],
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]]],
 
-                        [[[[ 16.],
-                           [116.]],
 
-                          [[ 26.],
-                           [126.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-                         [[[ 17.],
-                           [117.]],
+                         [[0., 0., 0., 0.]]],
 
-                          [[ 27.],
-                           [127.]]],
 
+                        [[[2., 3., 4., 5.]],
 
-                         [[[ 18.],
-                           [118.]],
+                         [[4., 5., 6., 7.]]],
 
-                          [[ 28.],
-                           [128.]]]]],
 
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]]]]), size=(2, 3, 2, 3, 4), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsr)
+# _crow_indices
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
+        [[0, 3],
+         [0, 3],
+         [0, 3]]])
+# _col_indices
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-                       [[[[[ 17.],
-                           [117.]],
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]])
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
-                          [[ 27.],
-                           [127.]]],
+           [[3., 4., 5., 6.]]],
 
 
-                         [[[ 18.],
-                           [118.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 28.],
-                           [128.]]],
+           [[0., 0., 0., 0.]]],
 
 
-                         [[[ 19.],
-                           [119.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 29.],
-                           [129.]]]],
+           [[4., 5., 6., 7.]]]],
 
 
-                        [[[[ 18.],
-                           [118.]],
+         [[[[1., 2., 3., 4.]],
 
-                          [[ 28.],
-                           [128.]]],
+           [[4., 5., 6., 7.]]],
 
 
-                         [[[ 19.],
-                           [119.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 29.],
-                           [129.]]],
+           [[0., 0., 0., 0.]]],
 
 
-                         [[[ 20.],
-                           [120.]],
+          [[[3., 4., 5., 6.]],
 
-                          [[ 30.],
-                           [130.]]]],
+           [[0., 0., 0., 0.]]]],
 
 
-                        [[[[ 19.],
-                           [119.]],
+         [[[[1., 2., 3., 4.]],
 
-                          [[ 29.],
-                           [129.]]],
+           [[2., 3., 4., 5.]]],
 
 
-                         [[[ 20.],
-                           [120.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 30.],
-                           [130.]]],
+           [[3., 4., 5., 6.]]],
 
 
-                         [[[ 21.],
-                           [121.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 31.],
-                           [131.]]]],
+           [[4., 5., 6., 7.]]]]],
 
 
-                        [[[[ 20.],
-                           [120.]],
 
-                          [[ 30.],
-                           [130.]]],
+        [[[[[0., 0., 0., 0.]],
 
+           [[2., 3., 4., 5.]]],
 
-                         [[[ 21.],
-                           [121.]],
 
-                          [[ 31.],
-                           [131.]]],
+          [[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                         [[[ 22.],
-                           [122.]],
 
-                          [[ 32.],
-                           [132.]]]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]],
 
 
-                       [[[[[ 21.],
-                           [121.]],
+         [[[[1., 2., 3., 4.]],
 
-                          [[ 31.],
-                           [131.]]],
+           [[3., 4., 5., 6.]]],
 
 
-                         [[[ 22.],
-                           [122.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 32.],
-                           [132.]]],
+           [[4., 5., 6., 7.]]],
 
 
-                         [[[ 23.],
-                           [123.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 33.],
-                           [133.]]]],
+           [[0., 0., 0., 0.]]]],
 
 
-                        [[[[ 22.],
-                           [122.]],
+         [[[[1., 2., 3., 4.]],
 
-                          [[ 32.],
-                           [132.]]],
+           [[0., 0., 0., 0.]]],
 
 
-                         [[[ 23.],
-                           [123.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 33.],
-                           [133.]]],
+           [[4., 5., 6., 7.]]],
 
 
-                         [[[ 24.],
-                           [124.]],
+          [[[3., 4., 5., 6.]],
 
-                          [[ 34.],
-                           [134.]]]],
+           [[0., 0., 0., 0.]]]]]], dtype=torch.float64)
 
+########## torch.float64/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-                        [[[[ 23.],
-                           [123.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                          [[ 33.],
-                           [133.]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 24.],
-                           [124.]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-                          [[ 34.],
-                           [134.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-                         [[[ 25.],
-                           [125.]],
 
-                          [[ 35.],
-                           [135.]]]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-                        [[[[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-                         [[[ 25.],
-                           [125.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 35.],
-                           [135.]]],
 
 
-                         [[[ 26.],
-                           [126.]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
-                          [[ 36.],
-                           [136.]]]]]]]), size=(2, 3, 6, 6, 2, 1), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
-# _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
 
-            [[ 11.],
-             [111.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-           [[[  2.],
-             [102.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-            [[ 12.],
-             [112.]]],
 
 
-           [[[  3.],
-             [103.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 13.],
-             [113.]]]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-          [[[[  2.],
-             [102.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-            [[ 12.],
-             [112.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-           [[[  3.],
-             [103.]],
 
-            [[ 13.],
-             [113.]]],
 
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-           [[[  4.],
-             [104.]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-            [[ 14.],
-             [114.]]]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-          [[[[  3.],
-             [103.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-            [[ 13.],
-             [113.]]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
 
-           [[[  4.],
-             [104.]],
 
-            [[ 14.],
-             [114.]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-           [[[  5.],
-             [105.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-            [[ 15.],
-             [115.]]]],
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-          [[[[  4.],
-             [104.]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
-            [[ 14.],
-             [114.]]],
 
 
-           [[[  5.],
-             [105.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 15.],
-             [115.]]],
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-           [[[  6.],
-             [106.]],
 
-            [[ 16.],
-             [116.]]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-
-
-         [[[[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]]],
-
-
-
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]]]],
-
-
-
-
-         [[[[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]]],
-
-
-
-          [[[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]]]],
-
-
-
-
-
-        [[[[[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]]],
-
-
-
-          [[[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]]],
-
-
-
-          [[[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]]],
-
-
-
-
-         [[[[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]]],
-
-
-
-          [[[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]]],
-
-
-
-          [[[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]]],
-
-
-
-          [[[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]]]],
-
-
-
-
-         [[[[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]]],
-
-
-
-          [[[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]]],
-
-
-
-          [[[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]]],
-
-
-
-          [[[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]],
-
-
-           [[[ 26.],
-             [126.]],
-
-            [[ 36.],
-             [136.]]]]]]], dtype=torch.float64)
-
-
-########## torch.float32/torch.int64/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.]]],
-
-
-                      [[[  3., 103.],
-                        [ 13., 113.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]],
-
-                       [[  6., 106.],
-                        [ 16., 116.]]]]), size=(6, 6, 2), nnz=4,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4])
-# _col_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.]],
-
-         [[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]]],
-
-
-        [[[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]],
-
-         [[  5., 105.],
-          [ 15., 115.]]],
-
-
-        [[[  4., 104.],
-          [ 14., 114.]],
-
-         [[  5., 105.],
-          [ 15., 115.]],
-
-         [[  6., 106.],
-          [ 16., 116.]]]])
-
-########## torch.float32/torch.int64/size=()+(4, 9)+(4, 2) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]],
-
-                        [[2.1000e+01, 1.0210e+03],
-                         [1.2100e+02, 1.1210e+03],
-                         [2.2100e+02, 1.2210e+03],
-                         [3.2100e+02, 1.3210e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
-
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]]],
-
-
-
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
-
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
-
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]]],
-
-
-
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
-
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
-
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]]],
-
-
-
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
-
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]],
-
-                        [[2.5000e+01, 1.0250e+03],
-                         [1.2500e+02, 1.1250e+03],
-                         [2.2500e+02, 1.2250e+03],
-                         [3.2500e+02, 1.3250e+03]]]]]), size=(4, 9, 4, 2),
-       nnz=4, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4])
-# _col_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]],
-
-          [[2.1000e+01, 1.0210e+03],
-           [1.2100e+02, 1.1210e+03],
-           [2.2100e+02, 1.2210e+03],
-           [3.2100e+02, 1.3210e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
-
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
-
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
-
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]]],
-
-
-
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
-
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
-
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]]],
-
-
-
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
-
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]],
-
-          [[2.5000e+01, 1.0250e+03],
-           [1.2500e+02, 1.1250e+03],
-           [2.2500e+02, 1.2250e+03],
-           [3.2500e+02, 1.3250e+03]]]]])
-
-########## torch.float32/torch.int64/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
-
-                          [[ 11.],
-                           [111.]]],
-
-
-                         [[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]]],
-
-
-
-                        [[[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]]],
-
-
-
-                        [[[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]]],
-
-
-
-                        [[[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]]]],
-
-
-
-
-                       [[[[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]]],
-
-
-
-                        [[[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]]],
-
-
-
-                        [[[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]]],
-
-
-
-                        [[[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]]]],
-
-
-
-
-                       [[[[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]]],
-
-
-
-                        [[[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]]],
-
-
-
-                        [[[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]]],
-
-
-
-                        [[[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]]]]],
-
-
-
-
-
-                      [[[[[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]]],
-
-
-
-                        [[[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]]],
-
-
-
-                        [[[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]]],
-
-
-
-                        [[[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]]]],
-
-
-
-
-                       [[[[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]]],
-
-
-
-                        [[[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]]],
-
-
-
-                        [[[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]]],
-
-
-
-                        [[[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]]]],
-
-
-
-
-                       [[[[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]]],
-
-
-
-                        [[[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]]],
-
-
-
-                        [[[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]]],
-
-
-
-                        [[[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]],
-
-
-                         [[[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]]]]]]), size=(2, 3, 6, 6, 2, 1), nnz=4,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
-# _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]]],
-
-
-
-          [[[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]]],
-
-
-
-          [[[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]]],
-
-
-
-          [[[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]]]],
-
-
-
-
-         [[[[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]]],
-
-
-
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]]]],
-
-
-
-
-         [[[[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]]],
-
-
-
-          [[[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]]]],
-
-
-
-
-
-        [[[[[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]]],
-
-
-
-          [[[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]]],
-
-
-
-          [[[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]]],
-
-
-
-
-         [[[[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]]],
-
-
-
-          [[[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]]],
-
-
-
-          [[[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]]],
-
-
-
-          [[[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]]]],
-
-
-
-
-         [[[[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]]],
-
-
-
-          [[[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]]],
-
-
-
-          [[[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]]],
-
-
-
-          [[[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]],
-
-
-           [[[ 26.],
-             [126.]],
-
-            [[ 36.],
-             [136.]]]]]]])
-
-
-########## torch.float64/torch.int64/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.]]],
-
-
-                      [[[  3., 103.],
-                        [ 13., 113.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]],
-
-                       [[  6., 106.],
-                        [ 16., 116.]]]]), size=(6, 6, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4])
-# _col_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.]],
-
-         [[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]]],
-
-
-        [[[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]],
-
-         [[  5., 105.],
-          [ 15., 115.]]],
-
-
-        [[[  4., 104.],
-          [ 14., 114.]],
-
-         [[  5., 105.],
-          [ 15., 115.]],
-
-         [[  6., 106.],
-          [ 16., 116.]]]], dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(4, 9)+(4, 2) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]],
-
-                        [[2.1000e+01, 1.0210e+03],
-                         [1.2100e+02, 1.1210e+03],
-                         [2.2100e+02, 1.2210e+03],
-                         [3.2100e+02, 1.3210e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
-
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]]],
-
-
-
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
-
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
-
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]]],
-
-
-
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
-
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
-
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]]],
-
-
-
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
-
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]],
-
-                        [[2.5000e+01, 1.0250e+03],
-                         [1.2500e+02, 1.1250e+03],
-                         [2.2500e+02, 1.2250e+03],
-                         [3.2500e+02, 1.3250e+03]]]]]), size=(4, 9, 4, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4])
-# _col_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]],
-
-          [[2.1000e+01, 1.0210e+03],
-           [1.2100e+02, 1.1210e+03],
-           [2.2100e+02, 1.2210e+03],
-           [3.2100e+02, 1.3210e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
-
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
-
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
-
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]]],
-
-
-
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
-
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
-
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]]],
-
-
-
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
-
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]],
-
-          [[2.5000e+01, 1.0250e+03],
-           [1.2500e+02, 1.1250e+03],
-           [2.2500e+02, 1.2250e+03],
-           [3.2500e+02, 1.3250e+03]]]]], dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
-
-                          [[ 11.],
-                           [111.]]],
-
-
-                         [[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]]],
-
-
-
-                        [[[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]]],
-
-
-
-                        [[[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]]],
-
-
-
-                        [[[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]]]],
-
-
-
-
-                       [[[[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]]],
-
-
-
-                        [[[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]]],
-
-
-
-                        [[[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]]],
-
-
-
-                        [[[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]]]],
-
-
-
-
-                       [[[[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]]],
-
-
-
-                        [[[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]]],
-
-
-
-                        [[[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]]],
-
-
-
-                        [[[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]]]]],
-
-
-
-
-
-                      [[[[[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]]],
-
-
-
-                        [[[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]]],
-
-
-
-                        [[[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]]],
-
-
-
-                        [[[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]]]],
-
-
-
-
-                       [[[[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]]],
-
-
-
-                        [[[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]]],
-
-
-
-                        [[[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]]],
-
-
-
-                        [[[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]]]],
-
-
-
-
-                       [[[[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]]],
-
-
-
-                        [[[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]]],
-
-
-
-                        [[[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]]],
-
-
-
-                        [[[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]],
-
-
-                         [[[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]]]]]]), size=(2, 3, 6, 6, 2, 1), nnz=4,
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), size=(8, 6, 4, 2), nnz=7,
        dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
+tensor([0, 2, 3, 5, 7])
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
+tensor([0, 1, 0, 0, 1, 0, 1])
 # _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]]],
-
-
-
-          [[[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]]],
-
-
-
-          [[[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]]],
-
-
-
-          [[[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]]]],
-
-
-
-
-         [[[[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]]],
-
-
-
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]]]],
-
-
-
-
-         [[[[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]]],
-
-
-
-          [[[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]]]],
-
-
-
-
-
-        [[[[[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]]],
-
-
-
-          [[[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]]],
-
-
-
-          [[[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]]],
-
-
-
-
-         [[[[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]]],
-
-
-
-          [[[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]]],
-
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
-          [[[[ 19.],
-             [119.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-            [[ 29.],
-             [129.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-           [[[ 20.],
-             [120.]],
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-            [[ 30.],
-             [130.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-           [[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]]]],
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
-          [[[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
-           [[[ 21.],
-             [121.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-            [[ 31.],
-             [131.]]],
 
 
-           [[[ 22.],
-             [122.]],
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-            [[ 32.],
-             [132.]]]]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-         [[[[[ 21.],
-             [121.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 31.],
-             [131.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-           [[[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-           [[[ 23.],
-             [123.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-            [[ 33.],
-             [133.]]]],
 
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-          [[[[ 22.],
-             [122.]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-            [[ 32.],
-             [132.]]],
 
 
-           [[[ 23.],
-             [123.]],
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-            [[ 33.],
-             [133.]]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-           [[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]]]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
-          [[[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]],
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-           [[[ 24.],
-             [124.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-           [[[ 25.],
-             [125.]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
-            [[ 35.],
-             [135.]]]],
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
 
-          [[[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
-           [[[ 25.],
-             [125.]],
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-            [[ 35.],
-             [135.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-           [[[ 26.],
-             [126.]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-            [[ 36.],
-             [136.]]]]]]], dtype=torch.float64)
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], dtype=torch.float64)
 
diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect
index 15e9bb56a85c7..70c00eb95db6a 100644
--- a/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect
+++ b/test/expect/TestSparseCompressedCPU.test_print_SparseCSC_cpu.expect
@@ -1,1411 +1,1653 @@
-########## torch.float32/torch.int32/size=()+(3, 2)+() ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), size=(3, 2), nnz=4,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([1., 2., 3., 4.])
-
-########## torch.float32/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), size=(0, 0), nnz=0,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0], dtype=torch.int32)
-# _row_indices
-tensor([], dtype=torch.int32)
-# _values
-tensor([])
-
-########## torch.float32/torch.int32/size=(2,)+(3, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), size=(2, 3, 2), nnz=4,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], dtype=torch.int32)
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], dtype=torch.int32)
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]])
-
-########## torch.float32/torch.int32/size=(2, 3)+(3, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
        row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), size=(2, 3, 3, 2), nnz=4,
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[1., 3., 2., 4.],
+                       [1., 4., 2., 3.],
+                       [1., 2., 3., 4.]],
+
+                      [[2., 1., 3., 4.],
+                       [1., 3., 4., 2.],
+                       [1., 2., 4., 3.]]]), size=(2, 3, 2, 3), nnz=4,
        layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], dtype=torch.int32)
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], dtype=torch.int32)
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
+tensor([[[1., 3., 2., 4.],
+         [1., 4., 2., 3.],
+         [1., 2., 3., 4.]],
 
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]])
+        [[2., 1., 3., 4.],
+         [1., 3., 4., 2.],
+         [1., 2., 4., 3.]]])
 
-
-########## torch.float64/torch.int32/size=()+(3, 2)+() ##########
+########## torch.float32/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), size=(3, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,
+                       2., 10., 15.,  5., 11., 16., 18., 23.,  3., 12., 17.,
+                      19., 24.]), size=(8, 6), nnz=24, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
+tensor([ 0,  3,  8, 11, 14, 19, 24], dtype=torch.int32)
 # _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       dtype=torch.int32)
 # _values
-tensor([1., 2., 3., 4.], dtype=torch.float64)
+tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,  2., 10., 15.,
+         5., 11., 16., 18., 23.,  3., 12., 17., 19., 24.])
 
-########## torch.float64/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0], dtype=torch.int32)
-# _row_indices
-tensor([], dtype=torch.int32)
-# _values
-tensor([], dtype=torch.float64)
 
-########## torch.float64/torch.int32/size=(2,)+(3, 2)+() ##########
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), size=(2, 3, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], dtype=torch.int32)
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], dtype=torch.int32)
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2, 3)+(3, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
        row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), size=(2, 3, 3, 2), nnz=4,
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[1., 3., 2., 4.],
+                       [1., 4., 2., 3.],
+                       [1., 2., 3., 4.]],
+
+                      [[2., 1., 3., 4.],
+                       [1., 3., 4., 2.],
+                       [1., 2., 4., 3.]]]), size=(2, 3, 2, 3), nnz=4,
        dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], dtype=torch.int32)
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], dtype=torch.int32)
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], dtype=torch.float64)
+tensor([[[1., 3., 2., 4.],
+         [1., 4., 2., 3.],
+         [1., 2., 3., 4.]],
 
+        [[2., 1., 3., 4.],
+         [1., 3., 4., 2.],
+         [1., 2., 4., 3.]]], dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=()+(3, 2)+() ##########
+########## torch.float64/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), size=(3, 2), nnz=4,
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,
+                       2., 10., 15.,  5., 11., 16., 18., 23.,  3., 12., 17.,
+                      19., 24.]), size=(8, 6), nnz=24, dtype=torch.float64,
        layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4])
+tensor([ 0,  3,  8, 11, 14, 19, 24], dtype=torch.int32)
 # _row_indices
-tensor([0, 1, 0, 2])
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       dtype=torch.int32)
 # _values
-tensor([1., 2., 3., 4.])
+tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,  2., 10., 15.,
+         5., 11., 16., 18., 23.,  3., 12., 17., 19., 24.], dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=()+(0, 0)+() ##########
+
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), size=(0, 0), nnz=0,
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
+
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
+       row_indices=tensor([[[0, 1, 0, 1],
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[1., 3., 2., 4.],
+                       [1., 4., 2., 3.],
+                       [1., 2., 3., 4.]],
+
+                      [[2., 1., 3., 4.],
+                       [1., 3., 4., 2.],
+                       [1., 2., 4., 3.]]]), size=(2, 3, 2, 3), nnz=4,
        layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0])
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
+
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]])
 # _row_indices
-tensor([], dtype=torch.int64)
+tensor([[[0, 1, 0, 1],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
+
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]])
 # _values
-tensor([])
+tensor([[[1., 3., 2., 4.],
+         [1., 4., 2., 3.],
+         [1., 2., 3., 4.]],
+
+        [[2., 1., 3., 4.],
+         [1., 3., 4., 2.],
+         [1., 2., 4., 3.]]])
 
-########## torch.float32/torch.int64/size=(2,)+(3, 2)+() ##########
+########## torch.float32/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), size=(2, 3, 2), nnz=4,
-       layout=torch.sparse_csc)
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,
+                       2., 10., 15.,  5., 11., 16., 18., 23.,  3., 12., 17.,
+                      19., 24.]), size=(8, 6), nnz=24, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]])
+tensor([ 0,  3,  8, 11, 14, 19, 24])
 # _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]])
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7])
 # _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]])
+tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,  2., 10., 15.,
+         5., 11., 16., 18., 23.,  3., 12., 17., 19., 24.])
+
 
-########## torch.float32/torch.int64/size=(2, 3)+(3, 2)+() ##########
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
        row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), size=(2, 3, 3, 2), nnz=4,
-       layout=torch.sparse_csc)
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[1., 3., 2., 4.],
+                       [1., 4., 2., 3.],
+                       [1., 2., 3., 4.]],
+
+                      [[2., 1., 3., 4.],
+                       [1., 3., 4., 2.],
+                       [1., 2., 4., 3.]]]), size=(2, 3, 2, 3), nnz=4,
+       dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]])
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]])
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]])
+tensor([[[1., 3., 2., 4.],
+         [1., 4., 2., 3.],
+         [1., 2., 3., 4.]],
 
+        [[2., 1., 3., 4.],
+         [1., 3., 4., 2.],
+         [1., 2., 4., 3.]]], dtype=torch.float64)
 
-########## torch.float64/torch.int64/size=()+(3, 2)+() ##########
+########## torch.float64/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), size=(3, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4])
-# _row_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([1., 2., 3., 4.], dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64,
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,
+                       2., 10., 15.,  5., 11., 16., 18., 23.,  3., 12., 17.,
+                      19., 24.]), size=(8, 6), nnz=24, dtype=torch.float64,
        layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0])
+tensor([ 0,  3,  8, 11, 14, 19, 24])
 # _row_indices
-tensor([], dtype=torch.int64)
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7])
 # _values
-tensor([], dtype=torch.float64)
+tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,  2., 10., 15.,
+         5., 11., 16., 18., 23.,  3., 12., 17., 19., 24.], dtype=torch.float64)
 
-########## torch.float64/torch.int64/size=(2,)+(3, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), size=(2, 3, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]])
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]])
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], dtype=torch.float64)
 
-########## torch.float64/torch.int64/size=(2, 3)+(3, 2)+() ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
        row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), size=(2, 3, 3, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[2., 3., 4., 5.],
+                        [1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.],
+                        [3., 4., 5., 6.]]]]), size=(2, 3, 2, 3, 4), nnz=4,
+       layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], dtype=torch.int32)
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
-# _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], dtype=torch.float64)
-
-
-########## torch.float32/torch.int32/size=()+(3, 2)+(2,) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), size=(3, 2, 2), nnz=4,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], dtype=torch.int32)
 # _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]])
-
-########## torch.float32/torch.int32/size=()+(3, 2)+(4, 2) ##########
+tensor([[[[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[2., 3., 4., 5.],
+          [1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.],
+          [3., 4., 5., 6.]]]])
+
+########## torch.float32/torch.int32/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([[[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.]],
 
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
                       [[ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.]],
 
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
                       [[ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.]],
 
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), size=(3, 2, 4, 2), nnz=4,
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), size=(8, 6, 4, 2), nnz=24,
        layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
+tensor([ 0,  3,  8, 11, 14, 19, 24], dtype=torch.int32)
 # _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       dtype=torch.int32)
 # _values
-tensor([[[ 1., 11.],
+tensor([[[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[ 1., 11.],
          [ 2., 12.],
          [ 3., 13.],
          [ 4., 14.]],
 
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
-
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
-
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]])
-
-########## torch.float32/torch.int32/size=(2, 3)+(3, 2)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
+         [ 7., 17.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
 
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
 
-                      [[[[13.],
-                         [14.]],
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
 
-                        [[14.],
-                         [15.]],
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
 
-                        [[15.],
-                         [16.]],
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
 
-                        [[16.],
-                         [17.]]],
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
 
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
 
-                       [[[17.],
-                         [18.]],
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
 
-                        [[18.],
-                         [19.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-                        [[19.],
-                         [20.]],
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
 
-                        [[20.],
-                         [21.]]],
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
 
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
 
-                       [[[21.],
-                         [22.]],
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]])
 
-                        [[22.],
-                         [23.]],
 
-                        [[23.],
-                         [24.]],
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                        [[24.],
-                         [25.]]]]]), size=(2, 3, 3, 2, 2, 1), nnz=4,
-       layout=torch.sparse_csc)
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
+       row_indices=tensor([[[0, 1, 0, 1],
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[2., 3., 4., 5.],
+                        [1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.],
+                        [3., 4., 5., 6.]]]]), size=(2, 3, 2, 3, 4), nnz=4,
+       dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], dtype=torch.int32)
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], dtype=torch.int32)
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]])
-
-
-########## torch.float64/torch.int32/size=()+(3, 2)+(2,) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), size=(3, 2, 2), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(3, 2)+(4, 2) ##########
+tensor([[[[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[2., 3., 4., 5.],
+          [1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.],
+          [3., 4., 5., 6.]]]], dtype=torch.float64)
+
+########## torch.float64/torch.int32/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([[[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.]],
 
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
                       [[ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.]],
 
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
                       [[ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.]],
 
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), size=(3, 2, 4, 2), nnz=4,
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), size=(8, 6, 4, 2), nnz=24,
        dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], dtype=torch.int32)
+tensor([ 0,  3,  8, 11, 14, 19, 24], dtype=torch.int32)
 # _row_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       dtype=torch.int32)
 # _values
-tensor([[[ 1., 11.],
+tensor([[[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[ 1., 11.],
          [ 2., 12.],
          [ 3., 13.],
          [ 4., 14.]],
 
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
-
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
-
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2, 3)+(3, 2)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
+         [ 7., 17.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
 
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
 
-                      [[[[13.],
-                         [14.]],
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
 
-                        [[14.],
-                         [15.]],
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
 
-                        [[15.],
-                         [16.]],
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
 
-                        [[16.],
-                         [17.]]],
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
 
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
 
-                       [[[17.],
-                         [18.]],
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
 
-                        [[18.],
-                         [19.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-                        [[19.],
-                         [20.]],
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
 
-                        [[20.],
-                         [21.]]],
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
 
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
 
-                       [[[21.],
-                         [22.]],
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], dtype=torch.float64)
 
-                        [[22.],
-                         [23.]],
 
-                        [[23.],
-                         [24.]],
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                        [[24.],
-                         [25.]]]]]), size=(2, 3, 3, 2, 2, 1), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
+       row_indices=tensor([[[0, 1, 0, 1],
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[2., 3., 4., 5.],
+                        [1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.],
+                        [3., 4., 5., 6.]]]]), size=(2, 3, 2, 3, 4), nnz=4,
+       layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], dtype=torch.int32)
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]])
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], dtype=torch.int32)
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]])
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]], dtype=torch.float64)
-
-
-########## torch.float32/torch.int64/size=()+(3, 2)+(2,) ##########
+tensor([[[[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[2., 3., 4., 5.],
+          [1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.],
+          [3., 4., 5., 6.]]]])
+
+########## torch.float32/torch.int64/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), size=(3, 2, 2), nnz=4,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4])
-# _row_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]])
-
-########## torch.float32/torch.int64/size=()+(3, 2)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([[[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.]],
 
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
                       [[ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.]],
 
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
                       [[ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.]],
 
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), size=(3, 2, 4, 2), nnz=4,
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), size=(8, 6, 4, 2), nnz=24,
        layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4])
+tensor([ 0,  3,  8, 11, 14, 19, 24])
 # _row_indices
-tensor([0, 1, 0, 2])
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7])
 # _values
-tensor([[[ 1., 11.],
+tensor([[[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[ 1., 11.],
          [ 2., 12.],
          [ 3., 13.],
          [ 4., 14.]],
 
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
-
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
-
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]])
-
-########## torch.float32/torch.int64/size=(2, 3)+(3, 2)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
+         [ 7., 17.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
 
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
 
-                      [[[[13.],
-                         [14.]],
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
 
-                        [[14.],
-                         [15.]],
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
 
-                        [[15.],
-                         [16.]],
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
 
-                        [[16.],
-                         [17.]]],
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
 
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
 
-                       [[[17.],
-                         [18.]],
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
 
-                        [[18.],
-                         [19.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-                        [[19.],
-                         [20.]],
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
 
-                        [[20.],
-                         [21.]]],
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
 
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
 
-                       [[[21.],
-                         [22.]],
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]])
 
-                        [[22.],
-                         [23.]],
 
-                        [[23.],
-                         [24.]],
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                        [[24.],
-                         [25.]]]]]), size=(2, 3, 3, 2, 2, 1), nnz=4,
-       layout=torch.sparse_csc)
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
+       row_indices=tensor([[[0, 1, 0, 1],
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[2., 3., 4., 5.],
+                        [1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.],
+                        [3., 4., 5., 6.]]]]), size=(2, 3, 2, 3, 4), nnz=4,
+       dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]])
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]])
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]])
-
-
-########## torch.float64/torch.int64/size=()+(3, 2)+(2,) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), size=(3, 2, 2), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4])
-# _row_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(3, 2)+(4, 2) ##########
+tensor([[[[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[2., 3., 4., 5.],
+          [1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.],
+          [3., 4., 5., 6.]]]], dtype=torch.float64)
+
+########## torch.float64/torch.int64/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([[[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.]],
 
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
                       [[ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.]],
 
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
                       [[ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.]],
 
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), size=(3, 2, 4, 2), nnz=4,
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), size=(8, 6, 4, 2), nnz=24,
        dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4])
+tensor([ 0,  3,  8, 11, 14, 19, 24])
 # _row_indices
-tensor([0, 1, 0, 2])
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7])
 # _values
-tensor([[[ 1., 11.],
+tensor([[[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[ 1., 11.],
          [ 2., 12.],
          [ 3., 13.],
          [ 4., 14.]],
 
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
-
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
-
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2, 3)+(3, 2)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), size=(2, 3, 3, 2, 2, 1), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]])
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]])
-# _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
+         [ 7., 17.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
 
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
-         [[[17.],
-           [18.]],
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
 
-          [[18.],
-           [19.]],
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
 
-          [[19.],
-           [20.]],
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
 
-          [[20.],
-           [21.]]],
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
 
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
 
-         [[[21.],
-           [22.]],
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
 
-          [[22.],
-           [23.]],
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
 
-          [[23.],
-           [24.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-          [[24.],
-           [25.]]]]], dtype=torch.float64)
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], dtype=torch.float64)
 
diff --git a/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect b/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect
index 3ab2e1135aa55..f95a8a0819953 100644
--- a/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect
+++ b/test/expect/TestSparseCompressedCPU.test_print_SparseCSR_cpu.expect
@@ -1,48 +1,3 @@
-########## torch.float32/torch.int32/size=()+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), size=(2, 3), nnz=4,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([1., 2., 3., 4.])
-
-########## torch.float32/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), size=(0, 0), nnz=0,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0], dtype=torch.int32)
-# _col_indices
-tensor([], dtype=torch.int32)
-# _values
-tensor([])
-
-########## torch.float32/torch.int32/size=(2,)+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), size=(2, 2, 3), nnz=4,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], dtype=torch.int32)
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], dtype=torch.int32)
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]])
-
 ########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
@@ -52,20 +7,20 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
+       values=tensor([[[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]],
 
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), size=(2, 3, 2, 3), nnz=4,
+                      [[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]]]), size=(2, 3, 2, 3), nnz=4,
        layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
@@ -76,7 +31,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -84,59 +39,31 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], dtype=torch.int32)
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]])
+tensor([[[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]],
 
+        [[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]]])
 
-########## torch.float64/torch.int32/size=()+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), size=(2, 3), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([1., 2., 3., 4.], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(0, 0)+() ##########
+########## torch.float32/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64,
-       layout=torch.sparse_csr)
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
+                      12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.,
+                      23., 24.]), size=(8, 6), nnz=24, layout=torch.sparse_csr)
 # _crow_indices
-tensor([0], dtype=torch.int32)
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], dtype=torch.int32)
 # _col_indices
-tensor([], dtype=torch.int32)
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       dtype=torch.int32)
 # _values
-tensor([], dtype=torch.float64)
+tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
+        15., 16., 17., 18., 19., 20., 21., 22., 23., 24.])
 
-########## torch.float64/torch.int32/size=(2,)+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), size=(2, 2, 3), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], dtype=torch.int32)
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], dtype=torch.int32)
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], dtype=torch.float64)
 
 ########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
@@ -147,20 +74,20 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
+       values=tensor([[[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]],
 
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), size=(2, 3, 2, 3), nnz=4,
+                      [[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]]]), size=(2, 3, 2, 3), nnz=4,
        dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
@@ -171,7 +98,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -179,59 +106,32 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], dtype=torch.int32)
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], dtype=torch.float64)
+tensor([[[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]],
 
+        [[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]]], dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=()+(2, 3)+() ##########
+########## torch.float64/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), size=(2, 3), nnz=4,
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
+                      12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.,
+                      23., 24.]), size=(8, 6), nnz=24, dtype=torch.float64,
        layout=torch.sparse_csr)
 # _crow_indices
-tensor([0, 2, 4])
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], dtype=torch.int32)
 # _col_indices
-tensor([0, 1, 0, 2])
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       dtype=torch.int32)
 # _values
-tensor([1., 2., 3., 4.])
+tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
+        15., 16., 17., 18., 19., 20., 21., 22., 23., 24.], dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), size=(0, 0), nnz=0,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0])
-# _col_indices
-tensor([], dtype=torch.int64)
-# _values
-tensor([])
-
-########## torch.float32/torch.int64/size=(2,)+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), size=(2, 2, 3), nnz=4,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]])
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]])
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]])
 
 ########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
@@ -242,20 +142,20 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
+       values=tensor([[[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]],
 
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), size=(2, 3, 2, 3), nnz=4,
+                      [[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]]]), size=(2, 3, 2, 3), nnz=4,
        layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
@@ -266,7 +166,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]])
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -274,59 +174,30 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]])
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]])
+tensor([[[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]],
 
+        [[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]]])
 
-########## torch.float64/torch.int64/size=()+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), size=(2, 3), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4])
-# _col_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([1., 2., 3., 4.], dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(0, 0)+() ##########
+########## torch.float32/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), size=(0, 0), nnz=0, dtype=torch.float64,
-       layout=torch.sparse_csr)
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
+                      12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.,
+                      23., 24.]), size=(8, 6), nnz=24, layout=torch.sparse_csr)
 # _crow_indices
-tensor([0])
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24])
 # _col_indices
-tensor([], dtype=torch.int64)
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5])
 # _values
-tensor([], dtype=torch.float64)
+tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
+        15., 16., 17., 18., 19., 20., 21., 22., 23., 24.])
 
-########## torch.float64/torch.int64/size=(2,)+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), size=(2, 2, 3), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]])
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]])
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], dtype=torch.float64)
 
 ########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
@@ -337,20 +208,20 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
+       values=tensor([[[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]],
 
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), size=(2, 3, 2, 3), nnz=4,
+                      [[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]]]), size=(2, 3, 2, 3), nnz=4,
        dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
@@ -361,7 +232,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]])
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -369,84 +240,33 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]])
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], dtype=torch.float64)
+tensor([[[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]],
 
+        [[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]]], dtype=torch.float64)
 
-########## torch.float32/torch.int32/size=()+(2, 3)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), size=(2, 3, 2), nnz=4,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]])
-
-########## torch.float32/torch.int32/size=()+(2, 3)+(4, 2) ##########
+########## torch.float64/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
-                       [ 2., 12.],
-                       [ 3., 13.],
-                       [ 4., 14.]],
-
-                      [[ 2., 12.],
-                       [ 3., 13.],
-                       [ 4., 14.],
-                       [ 5., 15.]],
-
-                      [[ 3., 13.],
-                       [ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.]],
-
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), size=(2, 3, 4, 2), nnz=4,
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
+                      12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.,
+                      23., 24.]), size=(8, 6), nnz=24, dtype=torch.float64,
        layout=torch.sparse_csr)
 # _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24])
 # _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5])
 # _values
-tensor([[[ 1., 11.],
-         [ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.]],
+tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
+        15., 16., 17., 18., 19., 20., 21., 22., 23., 24.], dtype=torch.float64)
 
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
-
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
 
-        [[ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.],
-         [ 7., 17.]]])
-
-########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(2, 1) ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
                              [0, 3, 4],
@@ -455,90 +275,43 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), size=(2, 3, 2, 3, 2, 1), nnz=4,
+       values=tensor([[[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]]]), size=(2, 3, 2, 3, 4), nnz=4,
        layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
@@ -549,7 +322,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -557,108 +330,42 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], dtype=torch.int32)
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]])
-
-
-########## torch.float64/torch.int32/size=()+(2, 3)+(2,) ##########
+tensor([[[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]]])
+
+########## torch.float32/torch.int32/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), size=(2, 3, 2), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(2, 3)+(4, 2) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
        values=tensor([[[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
@@ -677,12 +384,113 @@ tensor(crow_indices=tensor([0, 2, 4]),
                       [[ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.],
-                       [ 7., 17.]]]), size=(2, 3, 4, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
+                       [ 7., 17.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), size=(8, 6, 4, 2), nnz=24,
+       layout=torch.sparse_csr)
 # _crow_indices
-tensor([0, 2, 4], dtype=torch.int32)
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], dtype=torch.int32)
 # _col_indices
-tensor([0, 1, 0, 2], dtype=torch.int32)
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       dtype=torch.int32)
 # _values
 tensor([[[ 1., 11.],
          [ 2., 12.],
@@ -702,9 +510,110 @@ tensor([[[ 1., 11.],
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], dtype=torch.float64)
+         [ 7., 17.]],
 
-########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(2, 1) ##########
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
+
+        [[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
+
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
+
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
+
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
+
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]])
+
+
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
                              [0, 3, 4],
@@ -713,90 +622,43 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), size=(2, 3, 2, 3, 2, 1), nnz=4,
+       values=tensor([[[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]]]), size=(2, 3, 2, 3, 4), nnz=4,
        dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
@@ -807,7 +669,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -815,108 +677,42 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], dtype=torch.int32)
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]], dtype=torch.float64)
-
-
-########## torch.float32/torch.int64/size=()+(2, 3)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), size=(2, 3, 2), nnz=4,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4])
-# _col_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]])
-
-########## torch.float32/torch.int64/size=()+(2, 3)+(4, 2) ##########
+tensor([[[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]]], dtype=torch.float64)
+
+########## torch.float64/torch.int32/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
        values=tensor([[[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
@@ -935,12 +731,113 @@ tensor(crow_indices=tensor([0, 2, 4]),
                       [[ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.],
-                       [ 7., 17.]]]), size=(2, 3, 4, 2), nnz=4,
-       layout=torch.sparse_csr)
+                       [ 7., 17.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), size=(8, 6, 4, 2), nnz=24,
+       dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
-tensor([0, 2, 4])
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], dtype=torch.int32)
 # _col_indices
-tensor([0, 1, 0, 2])
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       dtype=torch.int32)
 # _values
 tensor([[[ 1., 11.],
          [ 2., 12.],
@@ -960,9 +857,110 @@ tensor([[[ 1., 11.],
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]])
+         [ 7., 17.]],
 
-########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(2, 1) ##########
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
+
+        [[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
+
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
+
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
+
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
+
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], dtype=torch.float64)
+
+
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
                              [0, 3, 4],
@@ -971,90 +969,43 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), size=(2, 3, 2, 3, 2, 1), nnz=4,
+       values=tensor([[[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]]]), size=(2, 3, 2, 3, 4), nnz=4,
        layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
@@ -1065,7 +1016,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]])
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -1073,108 +1024,42 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]])
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]])
-
-
-########## torch.float64/torch.int64/size=()+(2, 3)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), size=(2, 3, 2), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4])
-# _col_indices
-tensor([0, 1, 0, 2])
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(2, 3)+(4, 2) ##########
+tensor([[[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]]])
+
+########## torch.float32/torch.int64/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
        values=tensor([[[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
@@ -1193,12 +1078,112 @@ tensor(crow_indices=tensor([0, 2, 4]),
                       [[ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.],
-                       [ 7., 17.]]]), size=(2, 3, 4, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
+                       [ 7., 17.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), size=(8, 6, 4, 2), nnz=24,
+       layout=torch.sparse_csr)
 # _crow_indices
-tensor([0, 2, 4])
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24])
 # _col_indices
-tensor([0, 1, 0, 2])
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5])
 # _values
 tensor([[[ 1., 11.],
          [ 2., 12.],
@@ -1218,9 +1203,110 @@ tensor([[[ 1., 11.],
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], dtype=torch.float64)
+         [ 7., 17.]],
 
-########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(2, 1) ##########
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
+
+        [[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
+
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
+
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
+
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
+
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]])
+
+
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
                              [0, 3, 4],
@@ -1229,90 +1315,43 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), size=(2, 3, 2, 3, 2, 1), nnz=4,
+       values=tensor([[[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]]]), size=(2, 3, 2, 3, 4), nnz=4,
        dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
@@ -1323,7 +1362,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]])
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -1331,81 +1370,284 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]])
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
+tensor([[[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]]], dtype=torch.float64)
+
+########## torch.float64/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([[[ 1., 11.],
+                       [ 2., 12.],
+                       [ 3., 13.],
+                       [ 4., 14.]],
 
-          [[18.],
-           [19.]],
+                      [[ 2., 12.],
+                       [ 3., 13.],
+                       [ 4., 14.],
+                       [ 5., 15.]],
 
-          [[19.],
-           [20.]],
+                      [[ 3., 13.],
+                       [ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.]],
 
-          [[20.],
-           [21.]]],
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
 
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), size=(8, 6, 4, 2), nnz=24,
+       dtype=torch.float64, layout=torch.sparse_csr)
+# _crow_indices
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24])
+# _col_indices
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5])
+# _values
+tensor([[[ 1., 11.],
+         [ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.]],
 
-         [[[21.],
-           [22.]],
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
-          [[22.],
-           [23.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-          [[23.],
-           [24.]],
+        [[ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.]],
 
-          [[24.],
-           [25.]]]]], dtype=torch.float64)
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
+
+        [[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
+
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
+
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
+
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
+
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], dtype=torch.float64)
 
diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect
index 46bdb44b2a983..9e563794f07bb 100644
--- a/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect
+++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSC_cuda.expect
@@ -1,6981 +1,3583 @@
-########## torch.float32/torch.int32/size=()+(3, 4)+() ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.]],
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                      [[ 2., 12.]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-                      [[ 3., 13.]],
-
-                      [[ 4., 14.]]]), device='cuda:0', size=(3, 4), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[ 1., 11.]],
-
-        [[ 2., 12.]],
-
-        [[ 3., 13.]],
-
-        [[ 4., 14.]]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 1, 2)), device='cuda:0', size=(0, 0), nnz=0,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([], device='cuda:0', size=(0, 1, 2))
-
-########## torch.float32/torch.int32/size=(2,)+(6, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[1.],
-                        [2.]],
-
-                       [[2.],
-                        [3.]],
-
-                       [[3.],
-                        [4.]],
-
-                       [[4.],
-                        [5.]]],
-
-
-                      [[[5.],
-                        [6.]],
-
-                       [[6.],
-                        [7.]],
-
-                       [[7.],
-                        [8.]],
-
-                       [[8.],
-                        [9.]]]]), device='cuda:0', size=(2, 6, 2), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[1.],
-          [2.]],
-
-         [[2.],
-          [3.]],
-
-         [[3.],
-          [4.]],
-
-         [[4.],
-          [5.]]],
-
-
-        [[[5.],
-          [6.]],
-
-         [[6.],
-          [7.]],
-
-         [[7.],
-          [8.]],
-
-         [[8.],
-          [9.]]]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=(2, 3)+(9, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11.],
-                         [ 2., 12.],
-                         [ 3., 13.]],
-
-                        [[ 2., 12.],
-                         [ 3., 13.],
-                         [ 4., 14.]],
-
-                        [[ 3., 13.],
-                         [ 4., 14.],
-                         [ 5., 15.]],
-
-                        [[ 4., 14.],
-                         [ 5., 15.],
-                         [ 6., 16.]]],
-
-
-                       [[[ 5., 15.],
-                         [ 6., 16.],
-                         [ 7., 17.]],
-
-                        [[ 6., 16.],
-                         [ 7., 17.],
-                         [ 8., 18.]],
-
-                        [[ 7., 17.],
-                         [ 8., 18.],
-                         [ 9., 19.]],
-
-                        [[ 8., 18.],
-                         [ 9., 19.],
-                         [10., 20.]]],
-
-
-                       [[[ 9., 19.],
-                         [10., 20.],
-                         [11., 21.]],
-
-                        [[10., 20.],
-                         [11., 21.],
-                         [12., 22.]],
-
-                        [[11., 21.],
-                         [12., 22.],
-                         [13., 23.]],
-
-                        [[12., 22.],
-                         [13., 23.],
-                         [14., 24.]]]],
-
-
-
-                      [[[[13., 23.],
-                         [14., 24.],
-                         [15., 25.]],
-
-                        [[14., 24.],
-                         [15., 25.],
-                         [16., 26.]],
-
-                        [[15., 25.],
-                         [16., 26.],
-                         [17., 27.]],
-
-                        [[16., 26.],
-                         [17., 27.],
-                         [18., 28.]]],
-
-
-                       [[[17., 27.],
-                         [18., 28.],
-                         [19., 29.]],
-
-                        [[18., 28.],
-                         [19., 29.],
-                         [20., 30.]],
-
-                        [[19., 29.],
-                         [20., 30.],
-                         [21., 31.]],
-
-                        [[20., 30.],
-                         [21., 31.],
-                         [22., 32.]]],
-
-
-                       [[[21., 31.],
-                         [22., 32.],
-                         [23., 33.]],
-
-                        [[22., 32.],
-                         [23., 33.],
-                         [24., 34.]],
-
-                        [[23., 33.],
-                         [24., 34.],
-                         [25., 35.]],
-
-                        [[24., 34.],
-                         [25., 35.],
-                         [26., 36.]]]]]), device='cuda:0', size=(2, 3, 9, 4),
-       nnz=4, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[[ 1., 11.],
-           [ 2., 12.],
-           [ 3., 13.]],
-
-          [[ 2., 12.],
-           [ 3., 13.],
-           [ 4., 14.]],
-
-          [[ 3., 13.],
-           [ 4., 14.],
-           [ 5., 15.]],
-
-          [[ 4., 14.],
-           [ 5., 15.],
-           [ 6., 16.]]],
-
-
-         [[[ 5., 15.],
-           [ 6., 16.],
-           [ 7., 17.]],
-
-          [[ 6., 16.],
-           [ 7., 17.],
-           [ 8., 18.]],
-
-          [[ 7., 17.],
-           [ 8., 18.],
-           [ 9., 19.]],
-
-          [[ 8., 18.],
-           [ 9., 19.],
-           [10., 20.]]],
-
-
-         [[[ 9., 19.],
-           [10., 20.],
-           [11., 21.]],
-
-          [[10., 20.],
-           [11., 21.],
-           [12., 22.]],
-
-          [[11., 21.],
-           [12., 22.],
-           [13., 23.]],
-
-          [[12., 22.],
-           [13., 23.],
-           [14., 24.]]]],
-
-
-
-        [[[[13., 23.],
-           [14., 24.],
-           [15., 25.]],
-
-          [[14., 24.],
-           [15., 25.],
-           [16., 26.]],
-
-          [[15., 25.],
-           [16., 26.],
-           [17., 27.]],
-
-          [[16., 26.],
-           [17., 27.],
-           [18., 28.]]],
-
-
-         [[[17., 27.],
-           [18., 28.],
-           [19., 29.]],
-
-          [[18., 28.],
-           [19., 29.],
-           [20., 30.]],
-
-          [[19., 29.],
-           [20., 30.],
-           [21., 31.]],
-
-          [[20., 30.],
-           [21., 31.],
-           [22., 32.]]],
-
-
-         [[[21., 31.],
-           [22., 32.],
-           [23., 33.]],
-
-          [[22., 32.],
-           [23., 33.],
-           [24., 34.]],
-
-          [[23., 33.],
-           [24., 34.],
-           [25., 35.]],
-
-          [[24., 34.],
-           [25., 35.],
-           [26., 36.]]]]], device='cuda:0')
-
-
-########## torch.float64/torch.int32/size=()+(3, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.]],
-
-                      [[ 2., 12.]],
-
-                      [[ 3., 13.]],
-
-                      [[ 4., 14.]]]), device='cuda:0', size=(3, 4), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[ 1., 11.]],
-
-        [[ 2., 12.]],
-
-        [[ 3., 13.]],
-
-        [[ 4., 14.]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 1, 2)), device='cuda:0', size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([], device='cuda:0', size=(0, 1, 2), dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2,)+(6, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[1.],
-                        [2.]],
-
-                       [[2.],
-                        [3.]],
-
-                       [[3.],
-                        [4.]],
-
-                       [[4.],
-                        [5.]]],
-
-
-                      [[[5.],
-                        [6.]],
-
-                       [[6.],
-                        [7.]],
-
-                       [[7.],
-                        [8.]],
-
-                       [[8.],
-                        [9.]]]]), device='cuda:0', size=(2, 6, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[1.],
-          [2.]],
-
-         [[2.],
-          [3.]],
-
-         [[3.],
-          [4.]],
-
-         [[4.],
-          [5.]]],
-
-
-        [[[5.],
-          [6.]],
-
-         [[6.],
-          [7.]],
-
-         [[7.],
-          [8.]],
-
-         [[8.],
-          [9.]]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2, 3)+(9, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11.],
-                         [ 2., 12.],
-                         [ 3., 13.]],
-
-                        [[ 2., 12.],
-                         [ 3., 13.],
-                         [ 4., 14.]],
-
-                        [[ 3., 13.],
-                         [ 4., 14.],
-                         [ 5., 15.]],
-
-                        [[ 4., 14.],
-                         [ 5., 15.],
-                         [ 6., 16.]]],
-
-
-                       [[[ 5., 15.],
-                         [ 6., 16.],
-                         [ 7., 17.]],
-
-                        [[ 6., 16.],
-                         [ 7., 17.],
-                         [ 8., 18.]],
-
-                        [[ 7., 17.],
-                         [ 8., 18.],
-                         [ 9., 19.]],
-
-                        [[ 8., 18.],
-                         [ 9., 19.],
-                         [10., 20.]]],
-
-
-                       [[[ 9., 19.],
-                         [10., 20.],
-                         [11., 21.]],
-
-                        [[10., 20.],
-                         [11., 21.],
-                         [12., 22.]],
-
-                        [[11., 21.],
-                         [12., 22.],
-                         [13., 23.]],
-
-                        [[12., 22.],
-                         [13., 23.],
-                         [14., 24.]]]],
-
-
-
-                      [[[[13., 23.],
-                         [14., 24.],
-                         [15., 25.]],
-
-                        [[14., 24.],
-                         [15., 25.],
-                         [16., 26.]],
-
-                        [[15., 25.],
-                         [16., 26.],
-                         [17., 27.]],
-
-                        [[16., 26.],
-                         [17., 27.],
-                         [18., 28.]]],
-
-
-                       [[[17., 27.],
-                         [18., 28.],
-                         [19., 29.]],
-
-                        [[18., 28.],
-                         [19., 29.],
-                         [20., 30.]],
-
-                        [[19., 29.],
-                         [20., 30.],
-                         [21., 31.]],
-
-                        [[20., 30.],
-                         [21., 31.],
-                         [22., 32.]]],
-
-
-                       [[[21., 31.],
-                         [22., 32.],
-                         [23., 33.]],
-
-                        [[22., 32.],
-                         [23., 33.],
-                         [24., 34.]],
-
-                        [[23., 33.],
-                         [24., 34.],
-                         [25., 35.]],
-
-                        [[24., 34.],
-                         [25., 35.],
-                         [26., 36.]]]]]), device='cuda:0', size=(2, 3, 9, 4),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[[ 1., 11.],
-           [ 2., 12.],
-           [ 3., 13.]],
-
-          [[ 2., 12.],
-           [ 3., 13.],
-           [ 4., 14.]],
-
-          [[ 3., 13.],
-           [ 4., 14.],
-           [ 5., 15.]],
-
-          [[ 4., 14.],
-           [ 5., 15.],
-           [ 6., 16.]]],
-
-
-         [[[ 5., 15.],
-           [ 6., 16.],
-           [ 7., 17.]],
-
-          [[ 6., 16.],
-           [ 7., 17.],
-           [ 8., 18.]],
-
-          [[ 7., 17.],
-           [ 8., 18.],
-           [ 9., 19.]],
-
-          [[ 8., 18.],
-           [ 9., 19.],
-           [10., 20.]]],
-
-
-         [[[ 9., 19.],
-           [10., 20.],
-           [11., 21.]],
-
-          [[10., 20.],
-           [11., 21.],
-           [12., 22.]],
-
-          [[11., 21.],
-           [12., 22.],
-           [13., 23.]],
-
-          [[12., 22.],
-           [13., 23.],
-           [14., 24.]]]],
-
-
-
-        [[[[13., 23.],
-           [14., 24.],
-           [15., 25.]],
-
-          [[14., 24.],
-           [15., 25.],
-           [16., 26.]],
-
-          [[15., 25.],
-           [16., 26.],
-           [17., 27.]],
-
-          [[16., 26.],
-           [17., 27.],
-           [18., 28.]]],
-
-
-         [[[17., 27.],
-           [18., 28.],
-           [19., 29.]],
-
-          [[18., 28.],
-           [19., 29.],
-           [20., 30.]],
-
-          [[19., 29.],
-           [20., 30.],
-           [21., 31.]],
-
-          [[20., 30.],
-           [21., 31.],
-           [22., 32.]]],
-
-
-         [[[21., 31.],
-           [22., 32.],
-           [23., 33.]],
-
-          [[22., 32.],
-           [23., 33.],
-           [24., 34.]],
-
-          [[23., 33.],
-           [24., 34.],
-           [25., 35.]],
-
-          [[24., 34.],
-           [25., 35.],
-           [26., 36.]]]]], device='cuda:0', dtype=torch.float64)
-
-
-########## torch.float32/torch.int64/size=()+(3, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.]],
-
-                      [[ 2., 12.]],
-
-                      [[ 3., 13.]],
-
-                      [[ 4., 14.]]]), device='cuda:0', size=(3, 4), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[ 1., 11.]],
-
-        [[ 2., 12.]],
-
-        [[ 3., 13.]],
-
-        [[ 4., 14.]]], device='cuda:0')
-
-########## torch.float32/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 1, 2)), device='cuda:0', size=(0, 0), nnz=0,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0], device='cuda:0')
-# _row_indices
-tensor([], device='cuda:0', dtype=torch.int64)
-# _values
-tensor([], device='cuda:0', size=(0, 1, 2))
-
-########## torch.float32/torch.int64/size=(2,)+(6, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[1.],
-                        [2.]],
-
-                       [[2.],
-                        [3.]],
-
-                       [[3.],
-                        [4.]],
-
-                       [[4.],
-                        [5.]]],
-
-
-                      [[[5.],
-                        [6.]],
-
-                       [[6.],
-                        [7.]],
-
-                       [[7.],
-                        [8.]],
-
-                       [[8.],
-                        [9.]]]]), device='cuda:0', size=(2, 6, 2), nnz=4,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0')
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0')
-# _values
-tensor([[[[1.],
-          [2.]],
-
-         [[2.],
-          [3.]],
-
-         [[3.],
-          [4.]],
-
-         [[4.],
-          [5.]]],
-
-
-        [[[5.],
-          [6.]],
-
-         [[6.],
-          [7.]],
-
-         [[7.],
-          [8.]],
-
-         [[8.],
-          [9.]]]], device='cuda:0')
-
-########## torch.float32/torch.int64/size=(2, 3)+(9, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11.],
-                         [ 2., 12.],
-                         [ 3., 13.]],
-
-                        [[ 2., 12.],
-                         [ 3., 13.],
-                         [ 4., 14.]],
-
-                        [[ 3., 13.],
-                         [ 4., 14.],
-                         [ 5., 15.]],
-
-                        [[ 4., 14.],
-                         [ 5., 15.],
-                         [ 6., 16.]]],
-
-
-                       [[[ 5., 15.],
-                         [ 6., 16.],
-                         [ 7., 17.]],
-
-                        [[ 6., 16.],
-                         [ 7., 17.],
-                         [ 8., 18.]],
-
-                        [[ 7., 17.],
-                         [ 8., 18.],
-                         [ 9., 19.]],
-
-                        [[ 8., 18.],
-                         [ 9., 19.],
-                         [10., 20.]]],
-
-
-                       [[[ 9., 19.],
-                         [10., 20.],
-                         [11., 21.]],
-
-                        [[10., 20.],
-                         [11., 21.],
-                         [12., 22.]],
-
-                        [[11., 21.],
-                         [12., 22.],
-                         [13., 23.]],
-
-                        [[12., 22.],
-                         [13., 23.],
-                         [14., 24.]]]],
-
-
-
-                      [[[[13., 23.],
-                         [14., 24.],
-                         [15., 25.]],
-
-                        [[14., 24.],
-                         [15., 25.],
-                         [16., 26.]],
-
-                        [[15., 25.],
-                         [16., 26.],
-                         [17., 27.]],
-
-                        [[16., 26.],
-                         [17., 27.],
-                         [18., 28.]]],
-
-
-                       [[[17., 27.],
-                         [18., 28.],
-                         [19., 29.]],
-
-                        [[18., 28.],
-                         [19., 29.],
-                         [20., 30.]],
-
-                        [[19., 29.],
-                         [20., 30.],
-                         [21., 31.]],
-
-                        [[20., 30.],
-                         [21., 31.],
-                         [22., 32.]]],
-
-
-                       [[[21., 31.],
-                         [22., 32.],
-                         [23., 33.]],
-
-                        [[22., 32.],
-                         [23., 33.],
-                         [24., 34.]],
-
-                        [[23., 33.],
-                         [24., 34.],
-                         [25., 35.]],
-
-                        [[24., 34.],
-                         [25., 35.],
-                         [26., 36.]]]]]), device='cuda:0', size=(2, 3, 9, 4),
-       nnz=4, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
-# _values
-tensor([[[[[ 1., 11.],
-           [ 2., 12.],
-           [ 3., 13.]],
-
-          [[ 2., 12.],
-           [ 3., 13.],
-           [ 4., 14.]],
-
-          [[ 3., 13.],
-           [ 4., 14.],
-           [ 5., 15.]],
-
-          [[ 4., 14.],
-           [ 5., 15.],
-           [ 6., 16.]]],
-
-
-         [[[ 5., 15.],
-           [ 6., 16.],
-           [ 7., 17.]],
-
-          [[ 6., 16.],
-           [ 7., 17.],
-           [ 8., 18.]],
-
-          [[ 7., 17.],
-           [ 8., 18.],
-           [ 9., 19.]],
-
-          [[ 8., 18.],
-           [ 9., 19.],
-           [10., 20.]]],
-
-
-         [[[ 9., 19.],
-           [10., 20.],
-           [11., 21.]],
-
-          [[10., 20.],
-           [11., 21.],
-           [12., 22.]],
-
-          [[11., 21.],
-           [12., 22.],
-           [13., 23.]],
-
-          [[12., 22.],
-           [13., 23.],
-           [14., 24.]]]],
-
-
-
-        [[[[13., 23.],
-           [14., 24.],
-           [15., 25.]],
-
-          [[14., 24.],
-           [15., 25.],
-           [16., 26.]],
-
-          [[15., 25.],
-           [16., 26.],
-           [17., 27.]],
-
-          [[16., 26.],
-           [17., 27.],
-           [18., 28.]]],
-
-
-         [[[17., 27.],
-           [18., 28.],
-           [19., 29.]],
-
-          [[18., 28.],
-           [19., 29.],
-           [20., 30.]],
-
-          [[19., 29.],
-           [20., 30.],
-           [21., 31.]],
-
-          [[20., 30.],
-           [21., 31.],
-           [22., 32.]]],
-
-
-         [[[21., 31.],
-           [22., 32.],
-           [23., 33.]],
-
-          [[22., 32.],
-           [23., 33.],
-           [24., 34.]],
-
-          [[23., 33.],
-           [24., 34.],
-           [25., 35.]],
-
-          [[24., 34.],
-           [25., 35.],
-           [26., 36.]]]]], device='cuda:0')
-
-
-########## torch.float64/torch.int64/size=()+(3, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.]],
-
-                      [[ 2., 12.]],
-
-                      [[ 3., 13.]],
-
-                      [[ 4., 14.]]]), device='cuda:0', size=(3, 4), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[ 1., 11.]],
-
-        [[ 2., 12.]],
-
-        [[ 3., 13.]],
-
-        [[ 4., 14.]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 1, 2)), device='cuda:0', size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0], device='cuda:0')
-# _row_indices
-tensor([], device='cuda:0', dtype=torch.int64)
-# _values
-tensor([], device='cuda:0', size=(0, 1, 2), dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2,)+(6, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[1.],
-                        [2.]],
-
-                       [[2.],
-                        [3.]],
-
-                       [[3.],
-                        [4.]],
-
-                       [[4.],
-                        [5.]]],
-
-
-                      [[[5.],
-                        [6.]],
-
-                       [[6.],
-                        [7.]],
-
-                       [[7.],
-                        [8.]],
-
-                       [[8.],
-                        [9.]]]]), device='cuda:0', size=(2, 6, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0')
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0')
-# _values
-tensor([[[[1.],
-          [2.]],
-
-         [[2.],
-          [3.]],
-
-         [[3.],
-          [4.]],
-
-         [[4.],
-          [5.]]],
-
-
-        [[[5.],
-          [6.]],
-
-         [[6.],
-          [7.]],
-
-         [[7.],
-          [8.]],
-
-         [[8.],
-          [9.]]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2, 3)+(9, 4)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11.],
-                         [ 2., 12.],
-                         [ 3., 13.]],
-
-                        [[ 2., 12.],
-                         [ 3., 13.],
-                         [ 4., 14.]],
-
-                        [[ 3., 13.],
-                         [ 4., 14.],
-                         [ 5., 15.]],
-
-                        [[ 4., 14.],
-                         [ 5., 15.],
-                         [ 6., 16.]]],
-
-
-                       [[[ 5., 15.],
-                         [ 6., 16.],
-                         [ 7., 17.]],
-
-                        [[ 6., 16.],
-                         [ 7., 17.],
-                         [ 8., 18.]],
-
-                        [[ 7., 17.],
-                         [ 8., 18.],
-                         [ 9., 19.]],
-
-                        [[ 8., 18.],
-                         [ 9., 19.],
-                         [10., 20.]]],
-
-
-                       [[[ 9., 19.],
-                         [10., 20.],
-                         [11., 21.]],
-
-                        [[10., 20.],
-                         [11., 21.],
-                         [12., 22.]],
-
-                        [[11., 21.],
-                         [12., 22.],
-                         [13., 23.]],
-
-                        [[12., 22.],
-                         [13., 23.],
-                         [14., 24.]]]],
-
-
-
-                      [[[[13., 23.],
-                         [14., 24.],
-                         [15., 25.]],
-
-                        [[14., 24.],
-                         [15., 25.],
-                         [16., 26.]],
-
-                        [[15., 25.],
-                         [16., 26.],
-                         [17., 27.]],
-
-                        [[16., 26.],
-                         [17., 27.],
-                         [18., 28.]]],
-
-
-                       [[[17., 27.],
-                         [18., 28.],
-                         [19., 29.]],
-
-                        [[18., 28.],
-                         [19., 29.],
-                         [20., 30.]],
-
-                        [[19., 29.],
-                         [20., 30.],
-                         [21., 31.]],
-
-                        [[20., 30.],
-                         [21., 31.],
-                         [22., 32.]]],
-
-
-                       [[[21., 31.],
-                         [22., 32.],
-                         [23., 33.]],
-
-                        [[22., 32.],
-                         [23., 33.],
-                         [24., 34.]],
-
-                        [[23., 33.],
-                         [24., 34.],
-                         [25., 35.]],
-
-                        [[24., 34.],
-                         [25., 35.],
-                         [26., 36.]]]]]), device='cuda:0', size=(2, 3, 9, 4),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
-# _values
-tensor([[[[[ 1., 11.],
-           [ 2., 12.],
-           [ 3., 13.]],
-
-          [[ 2., 12.],
-           [ 3., 13.],
-           [ 4., 14.]],
-
-          [[ 3., 13.],
-           [ 4., 14.],
-           [ 5., 15.]],
-
-          [[ 4., 14.],
-           [ 5., 15.],
-           [ 6., 16.]]],
-
-
-         [[[ 5., 15.],
-           [ 6., 16.],
-           [ 7., 17.]],
-
-          [[ 6., 16.],
-           [ 7., 17.],
-           [ 8., 18.]],
-
-          [[ 7., 17.],
-           [ 8., 18.],
-           [ 9., 19.]],
-
-          [[ 8., 18.],
-           [ 9., 19.],
-           [10., 20.]]],
-
-
-         [[[ 9., 19.],
-           [10., 20.],
-           [11., 21.]],
-
-          [[10., 20.],
-           [11., 21.],
-           [12., 22.]],
-
-          [[11., 21.],
-           [12., 22.],
-           [13., 23.]],
-
-          [[12., 22.],
-           [13., 23.],
-           [14., 24.]]]],
-
-
-
-        [[[[13., 23.],
-           [14., 24.],
-           [15., 25.]],
-
-          [[14., 24.],
-           [15., 25.],
-           [16., 26.]],
-
-          [[15., 25.],
-           [16., 26.],
-           [17., 27.]],
-
-          [[16., 26.],
-           [17., 27.],
-           [18., 28.]]],
-
-
-         [[[17., 27.],
-           [18., 28.],
-           [19., 29.]],
-
-          [[18., 28.],
-           [19., 29.],
-           [20., 30.]],
-
-          [[19., 29.],
-           [20., 30.],
-           [21., 31.]],
-
-          [[20., 30.],
-           [21., 31.],
-           [22., 32.]]],
-
-
-         [[[21., 31.],
-           [22., 32.],
-           [23., 33.]],
-
-          [[22., 32.],
-           [23., 33.],
-           [24., 34.]],
-
-          [[23., 33.],
-           [24., 34.],
-           [25., 35.]],
-
-          [[24., 34.],
-           [25., 35.],
-           [26., 36.]]]]], device='cuda:0', dtype=torch.float64)
-
-
-########## torch.float32/torch.int32/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.],
-                        [ 21., 121.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]]],
-
-
-                      [[[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.],
-                        [ 25., 125.]]]]), device='cuda:0', size=(6, 6, 2),
-       nnz=4, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.],
-          [ 21., 121.]],
-
-         [[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]],
-
-         [[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]]],
-
-
-        [[[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]],
-
-         [[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]]],
-
-
-        [[[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]],
-
-         [[  5., 105.],
-          [ 15., 115.],
-          [ 25., 125.]]]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=()+(9, 4)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]]],
-
-
-
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]]],
-
-
-
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]]],
-
-
-
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]],
-
-
-                       [[[6.0000e+00, 1.0060e+03],
-                         [1.0600e+02, 1.1060e+03],
-                         [2.0600e+02, 1.2060e+03],
-                         [3.0600e+02, 1.3060e+03]],
-
-                        [[1.6000e+01, 1.0160e+03],
-                         [1.1600e+02, 1.1160e+03],
-                         [2.1600e+02, 1.2160e+03],
-                         [3.1600e+02, 1.3160e+03]]]]]), device='cuda:0',
-       size=(9, 4, 4, 2), nnz=4, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]]],
-
-
-
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]]],
-
-
-
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]],
-
-
-         [[[6.0000e+00, 1.0060e+03],
-           [1.0600e+02, 1.1060e+03],
-           [2.0600e+02, 1.2060e+03],
-           [3.0600e+02, 1.3060e+03]],
-
-          [[1.6000e+01, 1.0160e+03],
-           [1.1600e+02, 1.1160e+03],
-           [2.1600e+02, 1.2160e+03],
-           [3.1600e+02, 1.3160e+03]]]]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
-
-                          [[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]]],
-
-
-
-                        [[[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]]],
-
-
-
-                        [[[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]]],
-
-
-
-                        [[[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]]]],
-
-
-
-
-                       [[[[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]]],
-
-
-
-                        [[[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]]],
-
-
-
-                        [[[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]]],
-
-
-
-                        [[[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]]]],
-
-
-
-
-                       [[[[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]]],
-
-
-
-                        [[[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]]],
-
-
-
-                        [[[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]]],
-
-
-
-                        [[[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]]]]],
-
-
-
-
-
-                      [[[[[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]]],
-
-
-
-                        [[[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]]],
-
-
-
-                        [[[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]]],
-
-
-
-                        [[[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]],
-
-                          [[ 37.],
-                           [137.]]]]],
-
-
-
-
-                       [[[[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]],
-
-                          [[ 37.],
-                           [137.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]],
-
-                          [[ 38.],
-                           [138.]]]],
-
-
-
-                        [[[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]],
-
-                          [[ 38.],
-                           [138.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]],
-
-                          [[ 39.],
-                           [139.]]]],
-
-
-
-                        [[[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]],
-
-                          [[ 39.],
-                           [139.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]],
-
-                          [[ 40.],
-                           [140.]]]],
-
-
-
-                        [[[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]],
-
-                          [[ 40.],
-                           [140.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]],
-
-                          [[ 41.],
-                           [141.]]]]],
-
-
-
-
-                       [[[[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]],
-
-                          [[ 41.],
-                           [141.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]],
-
-                          [[ 42.],
-                           [142.]]]],
-
-
-
-                        [[[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]],
-
-                          [[ 42.],
-                           [142.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]],
-
-                          [[ 43.],
-                           [143.]]]],
-
-
-
-                        [[[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]],
-
-                          [[ 43.],
-                           [143.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]],
-
-                          [[ 44.],
-                           [144.]]]],
-
-
-
-                        [[[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]],
-
-                          [[ 44.],
-                           [144.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]],
-
-                          [[ 45.],
-                           [145.]]]]]]]), device='cuda:0',
-       size=(2, 3, 6, 6, 2, 1), nnz=4, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]],
-
-
-
-          [[[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]]]],
-
-
-
-
-         [[[[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]]],
-
-
-
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]]]],
-
-
-
-
-         [[[[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]]],
-
-
-
-          [[[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]]],
-
-
-
-          [[[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]]],
-
-
-
-          [[[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]]]]],
-
-
-
-
-
-        [[[[[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]]],
-
-
-
-          [[[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]]],
-
-
-
-          [[[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]],
-
-            [[ 36.],
-             [136.]]]],
-
-
-
-          [[[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]],
-
-            [[ 36.],
-             [136.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]],
-
-            [[ 37.],
-             [137.]]]]],
-
-
-
-
-         [[[[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]],
-
-            [[ 37.],
-             [137.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]],
-
-            [[ 38.],
-             [138.]]]],
-
-
-
-          [[[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]],
-
-            [[ 38.],
-             [138.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]],
-
-            [[ 39.],
-             [139.]]]],
-
-
-
-          [[[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]],
-
-            [[ 39.],
-             [139.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]],
-
-            [[ 40.],
-             [140.]]]],
-
-
-
-          [[[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]],
-
-            [[ 40.],
-             [140.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]],
-
-            [[ 41.],
-             [141.]]]]],
-
-
-
-
-         [[[[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]],
-
-            [[ 41.],
-             [141.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]],
-
-            [[ 42.],
-             [142.]]]],
-
-
-
-          [[[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]],
-
-            [[ 42.],
-             [142.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]],
-
-            [[ 43.],
-             [143.]]]],
-
-
-
-          [[[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]],
-
-            [[ 43.],
-             [143.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]],
-
-            [[ 44.],
-             [144.]]]],
-
-
-
-          [[[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]],
-
-            [[ 44.],
-             [144.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]],
-
-            [[ 45.],
-             [145.]]]]]]], device='cuda:0')
-
-
-########## torch.float64/torch.int32/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.],
-                        [ 21., 121.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]]],
-
-
-                      [[[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.],
-                        [ 25., 125.]]]]), device='cuda:0', size=(6, 6, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.],
-          [ 21., 121.]],
-
-         [[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]],
-
-         [[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]]],
-
-
-        [[[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]],
-
-         [[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]]],
-
-
-        [[[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]],
-
-         [[  5., 105.],
-          [ 15., 115.],
-          [ 25., 125.]]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(9, 4)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]]],
-
-
-
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]]],
-
-
-
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]]],
-
-
-
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]],
-
-
-                       [[[6.0000e+00, 1.0060e+03],
-                         [1.0600e+02, 1.1060e+03],
-                         [2.0600e+02, 1.2060e+03],
-                         [3.0600e+02, 1.3060e+03]],
-
-                        [[1.6000e+01, 1.0160e+03],
-                         [1.1600e+02, 1.1160e+03],
-                         [2.1600e+02, 1.2160e+03],
-                         [3.1600e+02, 1.3160e+03]]]]]), device='cuda:0',
-       size=(9, 4, 4, 2), nnz=4, dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]]],
-
-
-
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]]],
-
-
-
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]],
-
-
-         [[[6.0000e+00, 1.0060e+03],
-           [1.0600e+02, 1.1060e+03],
-           [2.0600e+02, 1.2060e+03],
-           [3.0600e+02, 1.3060e+03]],
-
-          [[1.6000e+01, 1.0160e+03],
-           [1.1600e+02, 1.1160e+03],
-           [2.1600e+02, 1.2160e+03],
-           [3.1600e+02, 1.3160e+03]]]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
-
-                          [[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]]],
-
-
-
-                        [[[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]]],
-
-
-
-                        [[[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]]],
-
-
-
-                        [[[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]]]],
-
-
-
-
-                       [[[[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]]],
-
-
-
-                        [[[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]]],
-
-
-
-                        [[[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]]],
-
-
-
-                        [[[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]]]],
-
-
-
-
-                       [[[[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]]],
-
-
-
-                        [[[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]]],
-
-
-
-                        [[[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]]],
-
-
-
-                        [[[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]]]]],
-
-
-
-
-
-                      [[[[[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]]],
-
-
-
-                        [[[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]]],
-
-
-
-                        [[[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]]],
-
-
-
-                        [[[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]],
-
-                          [[ 37.],
-                           [137.]]]]],
-
-
-
-
-                       [[[[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]],
-
-                          [[ 37.],
-                           [137.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]],
-
-                          [[ 38.],
-                           [138.]]]],
-
-
-
-                        [[[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]],
-
-                          [[ 38.],
-                           [138.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]],
-
-                          [[ 39.],
-                           [139.]]]],
-
-
-
-                        [[[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]],
-
-                          [[ 39.],
-                           [139.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]],
-
-                          [[ 40.],
-                           [140.]]]],
-
-
-
-                        [[[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]],
-
-                          [[ 40.],
-                           [140.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]],
-
-                          [[ 41.],
-                           [141.]]]]],
-
-
-
-
-                       [[[[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]],
-
-                          [[ 41.],
-                           [141.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]],
-
-                          [[ 42.],
-                           [142.]]]],
-
-
-
-                        [[[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]],
-
-                          [[ 42.],
-                           [142.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]],
-
-                          [[ 43.],
-                           [143.]]]],
-
-
-
-                        [[[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]],
-
-                          [[ 43.],
-                           [143.]]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
+                        [[2.],
+                         [0.]],
 
-                         [[[ 24.],
-                           [124.]],
+                        [[0.],
+                         [4.]]],
 
-                          [[ 34.],
-                           [134.]],
 
-                          [[ 44.],
-                           [144.]]]],
+                       [[[1.],
+                         [4.]],
 
+                        [[2.],
+                         [0.]],
 
+                        [[3.],
+                         [0.]]],
 
-                        [[[[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]],
+                       [[[1.],
+                         [2.]],
 
-                          [[ 44.],
-                           [144.]]],
+                        [[0.],
+                         [3.]],
 
+                        [[0.],
+                         [4.]]]],
 
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]],
-
-                          [[ 45.],
-                           [145.]]]]]]]), device='cuda:0',
-       size=(2, 3, 6, 6, 2, 1), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]],
 
 
+                      [[[[0.],
+                         [2.]],
 
-          [[[[  4.],
-             [104.]],
+                        [[1.],
+                         [3.]],
 
-            [[ 14.],
-             [114.]],
+                        [[0.],
+                         [4.]]],
 
-            [[ 24.],
-             [124.]]],
 
+                       [[[1.],
+                         [3.]],
 
-           [[[  5.],
-             [105.]],
+                        [[0.],
+                         [4.]],
 
-            [[ 15.],
-             [115.]],
+                        [[2.],
+                         [0.]]],
 
-            [[ 25.],
-             [125.]]]]],
 
+                       [[[1.],
+                         [0.]],
 
+                        [[2.],
+                         [4.]],
 
+                        [[3.],
+                         [0.]]]]]), device='cuda:0', size=(2, 3, 2, 3), nnz=3,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-         [[[[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]]],
-
-
-
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], device='cuda:0', dtype=torch.int32)
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[[[1.],
+           [3.]],
 
-           [[[  9.],
-             [109.]],
+          [[2.],
+           [0.]],
 
-            [[ 19.],
-             [119.]],
+          [[0.],
+           [4.]]],
 
-            [[ 29.],
-             [129.]]]]],
 
+         [[[1.],
+           [4.]],
 
+          [[2.],
+           [0.]],
 
+          [[3.],
+           [0.]]],
 
-         [[[[[  9.],
-             [109.]],
 
-            [[ 19.],
-             [119.]],
+         [[[1.],
+           [2.]],
 
-            [[ 29.],
-             [129.]]],
+          [[0.],
+           [3.]],
 
+          [[0.],
+           [4.]]]],
 
-           [[[ 10.],
-             [110.]],
 
-            [[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]]]],
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
-          [[[[ 10.],
-             [110.]],
 
-            [[ 20.],
-             [120.]],
+         [[[1.],
+           [3.]],
 
-            [[ 30.],
-             [130.]]],
+          [[0.],
+           [4.]],
 
+          [[2.],
+           [0.]]],
 
-           [[[ 11.],
-             [111.]],
 
-            [[ 21.],
-             [121.]],
+         [[[1.],
+           [0.]],
 
-            [[ 31.],
-             [131.]]]],
+          [[2.],
+           [4.]],
 
+          [[3.],
+           [0.]]]]], device='cuda:0')
 
+########## torch.float32/torch.int32/size=()+(8, 6)+() ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-          [[[[ 11.],
-             [111.]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-            [[ 21.],
-             [121.]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-            [[ 31.],
-             [131.]]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-           [[[ 12.],
-             [112.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-            [[ 22.],
-             [122.]],
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), device='cuda:0', size=(8, 6), nnz=7,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], device='cuda:0', dtype=torch.int32)
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-            [[ 32.],
-             [132.]]]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-          [[[[ 12.],
-             [112.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-            [[ 22.],
-             [122.]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-            [[ 32.],
-             [132.]]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], device='cuda:0')
 
 
-           [[[ 13.],
-             [113.]],
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+() ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-            [[ 23.],
-             [123.]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-            [[ 33.],
-             [133.]]]]]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
+                        [[2.],
+                         [0.]],
 
+                        [[0.],
+                         [4.]]],
 
 
+                       [[[1.],
+                         [4.]],
 
-        [[[[[[ 13.],
-             [113.]],
+                        [[2.],
+                         [0.]],
 
-            [[ 23.],
-             [123.]],
+                        [[3.],
+                         [0.]]],
 
-            [[ 33.],
-             [133.]]],
 
+                       [[[1.],
+                         [2.]],
 
-           [[[ 14.],
-             [114.]],
+                        [[0.],
+                         [3.]],
 
-            [[ 24.],
-             [124.]],
+                        [[0.],
+                         [4.]]]],
 
-            [[ 34.],
-             [134.]]]],
 
 
+                      [[[[0.],
+                         [2.]],
 
-          [[[[ 14.],
-             [114.]],
+                        [[1.],
+                         [3.]],
 
-            [[ 24.],
-             [124.]],
+                        [[0.],
+                         [4.]]],
 
-            [[ 34.],
-             [134.]]],
 
+                       [[[1.],
+                         [3.]],
 
-           [[[ 15.],
-             [115.]],
+                        [[0.],
+                         [4.]],
 
-            [[ 25.],
-             [125.]],
+                        [[2.],
+                         [0.]]],
 
-            [[ 35.],
-             [135.]]]],
 
+                       [[[1.],
+                         [0.]],
 
+                        [[2.],
+                         [4.]],
 
-          [[[[ 15.],
-             [115.]],
+                        [[3.],
+                         [0.]]]]]), device='cuda:0', size=(2, 3, 2, 3), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-            [[ 25.],
-             [125.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], device='cuda:0', dtype=torch.int32)
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-            [[ 35.],
-             [135.]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[[[1.],
+           [3.]],
 
+          [[2.],
+           [0.]],
 
-           [[[ 16.],
-             [116.]],
+          [[0.],
+           [4.]]],
 
-            [[ 26.],
-             [126.]],
 
-            [[ 36.],
-             [136.]]]],
+         [[[1.],
+           [4.]],
 
+          [[2.],
+           [0.]],
 
+          [[3.],
+           [0.]]],
 
-          [[[[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]],
+         [[[1.],
+           [2.]],
 
-            [[ 36.],
-             [136.]]],
+          [[0.],
+           [3.]],
 
+          [[0.],
+           [4.]]]],
 
-           [[[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]],
 
-            [[ 37.],
-             [137.]]]]],
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
 
-         [[[[[ 17.],
-             [117.]],
+         [[[1.],
+           [3.]],
 
-            [[ 27.],
-             [127.]],
+          [[0.],
+           [4.]],
 
-            [[ 37.],
-             [137.]]],
+          [[2.],
+           [0.]]],
 
 
-           [[[ 18.],
-             [118.]],
+         [[[1.],
+           [0.]],
 
-            [[ 28.],
-             [128.]],
+          [[2.],
+           [4.]],
 
-            [[ 38.],
-             [138.]]]],
+          [[3.],
+           [0.]]]]], device='cuda:0', dtype=torch.float64)
 
+########## torch.float64/torch.int32/size=()+(8, 6)+() ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-          [[[[ 18.],
-             [118.]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-            [[ 28.],
-             [128.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-            [[ 38.],
-             [138.]]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-           [[[ 19.],
-             [119.]],
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), device='cuda:0', size=(8, 6), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], device='cuda:0', dtype=torch.int32)
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-            [[ 29.],
-             [129.]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
-            [[ 39.],
-             [139.]]]],
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-          [[[[ 19.],
-             [119.]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-            [[ 29.],
-             [129.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], device='cuda:0', dtype=torch.float64)
 
-            [[ 39.],
-             [139.]]],
 
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+() ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-           [[[ 20.],
-             [120.]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-            [[ 30.],
-             [130.]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-            [[ 40.],
-             [140.]]]],
+                        [[2.],
+                         [0.]],
 
+                        [[0.],
+                         [4.]]],
 
 
-          [[[[ 20.],
-             [120.]],
+                       [[[1.],
+                         [4.]],
 
-            [[ 30.],
-             [130.]],
+                        [[2.],
+                         [0.]],
 
-            [[ 40.],
-             [140.]]],
+                        [[3.],
+                         [0.]]],
 
 
-           [[[ 21.],
-             [121.]],
+                       [[[1.],
+                         [2.]],
 
-            [[ 31.],
-             [131.]],
+                        [[0.],
+                         [3.]],
 
-            [[ 41.],
-             [141.]]]]],
+                        [[0.],
+                         [4.]]]],
 
 
+                      [[[[0.],
+                         [2.]],
 
-         [[[[[ 21.],
-             [121.]],
+                        [[1.],
+                         [3.]],
 
-            [[ 31.],
-             [131.]],
+                        [[0.],
+                         [4.]]],
 
-            [[ 41.],
-             [141.]]],
 
+                       [[[1.],
+                         [3.]],
 
-           [[[ 22.],
-             [122.]],
+                        [[0.],
+                         [4.]],
 
-            [[ 32.],
-             [132.]],
+                        [[2.],
+                         [0.]]],
 
-            [[ 42.],
-             [142.]]]],
 
+                       [[[1.],
+                         [0.]],
 
+                        [[2.],
+                         [4.]],
 
-          [[[[ 22.],
-             [122.]],
+                        [[3.],
+                         [0.]]]]]), device='cuda:0', size=(2, 3, 2, 3), nnz=3,
+       layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-            [[ 32.],
-             [132.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], device='cuda:0')
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-            [[ 42.],
-             [142.]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], device='cuda:0')
+# _values
+tensor([[[[[1.],
+           [3.]],
 
+          [[2.],
+           [0.]],
 
-           [[[ 23.],
-             [123.]],
+          [[0.],
+           [4.]]],
 
-            [[ 33.],
-             [133.]],
 
-            [[ 43.],
-             [143.]]]],
+         [[[1.],
+           [4.]],
 
+          [[2.],
+           [0.]],
 
+          [[3.],
+           [0.]]],
 
-          [[[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]],
+         [[[1.],
+           [2.]],
 
-            [[ 43.],
-             [143.]]],
+          [[0.],
+           [3.]],
 
+          [[0.],
+           [4.]]]],
 
-           [[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]],
 
-            [[ 44.],
-             [144.]]]],
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
-          [[[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]],
+         [[[1.],
+           [3.]],
 
-            [[ 44.],
-             [144.]]],
+          [[0.],
+           [4.]],
 
+          [[2.],
+           [0.]]],
 
-           [[[ 25.],
-             [125.]],
 
-            [[ 35.],
-             [135.]],
+         [[[1.],
+           [0.]],
 
-            [[ 45.],
-             [145.]]]]]]], device='cuda:0', dtype=torch.float64)
+          [[2.],
+           [4.]],
 
+          [[3.],
+           [0.]]]]], device='cuda:0')
 
-########## torch.float32/torch.int64/size=()+(6, 6)+(2,) ##########
+########## torch.float32/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.],
-                        [ 21., 121.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]],
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                       [[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-                      [[[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-                       [[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                      [[[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.],
-                        [ 25., 125.]]]]), device='cuda:0', size=(6, 6, 2),
-       nnz=4, layout=torch.sparse_bsc)
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), device='cuda:0', size=(8, 6), nnz=7,
+       layout=torch.sparse_bsc)
 # _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
+tensor([0, 4, 7], device='cuda:0')
 # _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
+tensor([0, 1, 2, 3, 0, 2, 3], device='cuda:0')
 # _values
-tensor([[[[  1., 101.],
-          [ 11., 111.],
-          [ 21., 121.]],
-
-         [[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]]],
-
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-        [[[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
-         [[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]]],
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-        [[[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-         [[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], device='cuda:0')
 
-        [[[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]],
 
-         [[  5., 105.],
-          [ 15., 115.],
-          [ 25., 125.]]]], device='cuda:0')
-
-########## torch.float32/torch.int64/size=()+(9, 4)+(4, 2) ##########
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+                        [[2.],
+                         [0.]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]]],
+                        [[0.],
+                         [4.]]],
 
 
+                       [[[1.],
+                         [4.]],
 
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
+                        [[2.],
+                         [0.]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
+                        [[3.],
+                         [0.]]],
 
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+                       [[[1.],
+                         [2.]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
+                        [[0.],
+                         [3.]],
 
+                        [[0.],
+                         [4.]]]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]]],
 
+                      [[[[0.],
+                         [2.]],
 
+                        [[1.],
+                         [3.]],
 
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+                        [[0.],
+                         [4.]]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
 
+                       [[[1.],
+                         [3.]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
+                        [[0.],
+                         [4.]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
+                        [[2.],
+                         [0.]]],
 
 
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
+                       [[[1.],
+                         [0.]],
 
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]]],
+                        [[2.],
+                         [4.]],
 
-
-
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]],
-
-
-                       [[[6.0000e+00, 1.0060e+03],
-                         [1.0600e+02, 1.1060e+03],
-                         [2.0600e+02, 1.2060e+03],
-                         [3.0600e+02, 1.3060e+03]],
-
-                        [[1.6000e+01, 1.0160e+03],
-                         [1.1600e+02, 1.1160e+03],
-                         [2.1600e+02, 1.2160e+03],
-                         [3.1600e+02, 1.3160e+03]]]]]), device='cuda:0',
-       size=(9, 4, 4, 2), nnz=4, layout=torch.sparse_bsc)
+                        [[3.],
+                         [0.]]]]]), device='cuda:0', size=(2, 3, 2, 3), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsc)
 # _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], device='cuda:0')
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], device='cuda:0')
+# _values
+tensor([[[[[1.],
+           [3.]],
 
+          [[2.],
+           [0.]],
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+          [[0.],
+           [4.]]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]]],
 
+         [[[1.],
+           [4.]],
 
+          [[2.],
+           [0.]],
 
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+          [[3.],
+           [0.]]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
 
+         [[[1.],
+           [2.]],
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+          [[0.],
+           [3.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
+          [[0.],
+           [4.]]]],
 
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]]],
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
+          [[0.],
+           [4.]]],
 
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
+         [[[1.],
+           [3.]],
 
+          [[0.],
+           [4.]],
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
+          [[2.],
+           [0.]]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]],
 
+         [[[1.],
+           [0.]],
 
-         [[[6.0000e+00, 1.0060e+03],
-           [1.0600e+02, 1.1060e+03],
-           [2.0600e+02, 1.2060e+03],
-           [3.0600e+02, 1.3060e+03]],
+          [[2.],
+           [4.]],
 
-          [[1.6000e+01, 1.0160e+03],
-           [1.1600e+02, 1.1160e+03],
-           [2.1600e+02, 1.2160e+03],
-           [3.1600e+02, 1.3160e+03]]]]], device='cuda:0')
+          [[3.],
+           [0.]]]]], device='cuda:0', dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=(2, 3)+(6, 6)+(2, 1) ##########
+########## torch.float64/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                          [[ 11.],
-                           [111.]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-                          [[ 21.],
-                           [121.]]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-                         [[[  2.],
-                           [102.]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-                          [[ 12.],
-                           [112.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                          [[ 22.],
-                           [122.]]]],
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), device='cuda:0', size=(8, 6), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], device='cuda:0')
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], device='cuda:0')
+# _values
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
-                        [[[[  2.],
-                           [102.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-                          [[ 12.],
-                           [112.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-                          [[ 22.],
-                           [122.]]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], device='cuda:0', dtype=torch.float64)
 
-                         [[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]],
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                          [[ 23.],
-                           [123.]]]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                        [[[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 23.],
-                           [123.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[  4.],
-                           [104.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 14.],
-                           [114.]],
+                         [[4., 5., 6., 7.]]]],
 
-                          [[ 24.],
-                           [124.]]]],
 
 
+                       [[[[1., 2., 3., 4.]],
 
-                        [[[[  4.],
-                           [104.]],
+                         [[4., 5., 6., 7.]]],
 
-                          [[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]],
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
-                         [[[  5.],
-                           [105.]],
 
-                          [[ 15.],
-                           [115.]],
+                        [[[3., 4., 5., 6.]],
 
-                          [[ 25.],
-                           [125.]]]]],
+                         [[0., 0., 0., 0.]]]],
 
 
+                       [[[[1., 2., 3., 4.]],
 
-                       [[[[[  5.],
-                           [105.]],
+                         [[2., 3., 4., 5.]]],
 
-                          [[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                         [[[  6.],
-                           [106.]],
 
-                          [[ 16.],
-                           [116.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 26.],
-                           [126.]]]],
+                         [[4., 5., 6., 7.]]]]],
 
 
-                        [[[[  6.],
-                           [106.]],
 
-                          [[ 16.],
-                           [116.]],
+                      [[[[[0., 0., 0., 0.]],
 
-                          [[ 26.],
-                           [126.]]],
+                         [[2., 3., 4., 5.]]],
 
 
-                         [[[  7.],
-                           [107.]],
+                        [[[1., 2., 3., 4.]],
 
-                          [[ 17.],
-                           [117.]],
+                         [[3., 4., 5., 6.]]],
 
-                          [[ 27.],
-                           [127.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
-                        [[[[  7.],
-                           [107.]],
 
-                          [[ 17.],
-                           [117.]],
 
-                          [[ 27.],
-                           [127.]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                         [[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 28.],
-                           [128.]]]],
+                         [[4., 5., 6., 7.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
-                        [[[[  8.],
-                           [108.]],
+                         [[0., 0., 0., 0.]]]],
 
-                          [[ 18.],
-                           [118.]],
 
-                          [[ 28.],
-                           [128.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-                         [[[  9.],
-                           [109.]],
+                         [[0., 0., 0., 0.]]],
 
-                          [[ 19.],
-                           [119.]],
 
-                          [[ 29.],
-                           [129.]]]]],
+                        [[[2., 3., 4., 5.]],
 
+                         [[4., 5., 6., 7.]]],
 
 
+                        [[[3., 4., 5., 6.]],
 
-                       [[[[[  9.],
-                           [109.]],
+                         [[0., 0., 0., 0.]]]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=3, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-                          [[ 19.],
-                           [119.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], device='cuda:0', dtype=torch.int32)
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-                          [[ 29.],
-                           [129.]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                         [[[ 10.],
-                           [110.]],
 
-                          [[ 20.],
-                           [120.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 30.],
-                           [130.]]]],
+           [[0., 0., 0., 0.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-                        [[[[ 10.],
-                           [110.]],
+           [[4., 5., 6., 7.]]]],
 
-                          [[ 20.],
-                           [120.]],
 
-                          [[ 30.],
-                           [130.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[ 11.],
-                           [111.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 21.],
-                           [121.]],
 
-                          [[ 31.],
-                           [131.]]]],
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
 
-                        [[[[ 11.],
-                           [111.]],
+          [[[3., 4., 5., 6.]],
 
-                          [[ 21.],
-                           [121.]],
+           [[0., 0., 0., 0.]]]],
 
-                          [[ 31.],
-                           [131.]]],
 
 
-                         [[[ 12.],
-                           [112.]],
+         [[[[1., 2., 3., 4.]],
 
-                          [[ 22.],
-                           [122.]],
+           [[2., 3., 4., 5.]]],
 
-                          [[ 32.],
-                           [132.]]]],
 
+          [[[0., 0., 0., 0.]],
 
+           [[3., 4., 5., 6.]]],
 
-                        [[[[ 12.],
-                           [112.]],
 
-                          [[ 22.],
-                           [122.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 32.],
-                           [132.]]],
+           [[4., 5., 6., 7.]]]]],
 
 
-                         [[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]]]]]],
+        [[[[[0., 0., 0., 0.]],
 
+           [[2., 3., 4., 5.]]],
 
 
+          [[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                      [[[[[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 33.],
-                           [133.]]],
+           [[4., 5., 6., 7.]]]],
 
 
-                         [[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]],
+         [[[[1., 2., 3., 4.]],
 
-                          [[ 34.],
-                           [134.]]]],
+           [[3., 4., 5., 6.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-                        [[[[ 14.],
-                           [114.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]]],
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]]],
 
-                         [[[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]],
 
-                          [[ 35.],
-                           [135.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[0., 0., 0., 0.]]],
 
 
-                        [[[[ 15.],
-                           [115.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 25.],
-                           [125.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 35.],
-                           [135.]]],
 
+          [[[3., 4., 5., 6.]],
 
-                         [[[ 16.],
-                           [116.]],
+           [[0., 0., 0., 0.]]]]]], device='cuda:0')
 
-                          [[ 26.],
-                           [126.]],
+########## torch.float32/torch.int32/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
+
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-                          [[ 36.],
-                           [136.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                        [[[[ 16.],
-                           [116.]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-                          [[ 26.],
-                           [126.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 36.],
-                           [136.]]],
 
 
-                         [[[ 17.],
-                           [117.]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
-                          [[ 27.],
-                           [127.]],
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
-                          [[ 37.],
-                           [137.]]]]],
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                       [[[[[ 17.],
-                           [117.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 27.],
-                           [127.]],
 
-                          [[ 37.],
-                           [137.]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 18.],
-                           [118.]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
-                          [[ 28.],
-                           [128.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                          [[ 38.],
-                           [138.]]]],
 
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                        [[[[ 18.],
-                           [118.]],
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-                          [[ 28.],
-                           [128.]],
 
-                          [[ 38.],
-                           [138.]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 19.],
-                           [119.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 29.],
-                           [129.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                          [[ 39.],
-                           [139.]]]],
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-                        [[[[ 19.],
-                           [119.]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
-                          [[ 29.],
-                           [129.]],
 
-                          [[ 39.],
-                           [139.]]],
 
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-                         [[[ 20.],
-                           [120.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 30.],
-                           [130.]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-                          [[ 40.],
-                           [140.]]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-                        [[[[ 20.],
-                           [120.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 30.],
-                           [130.]],
 
-                          [[ 40.],
-                           [140.]]],
 
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-                         [[[ 21.],
-                           [121.]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-                          [[ 31.],
-                           [131.]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
-                          [[ 41.],
-                           [141.]]]]],
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
-                       [[[[[ 21.],
-                           [121.]],
 
-                          [[ 31.],
-                           [131.]],
 
-                          [[ 41.],
-                           [141.]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
-                         [[[ 22.],
-                           [122.]],
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-                          [[ 32.],
-                           [132.]],
 
-                          [[ 42.],
-                           [142.]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=7, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], device='cuda:0', dtype=torch.int32)
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 22.],
-                           [122.]],
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
-                          [[ 32.],
-                           [132.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 42.],
-                           [142.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 23.],
-                           [123.]],
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-                          [[ 33.],
-                           [133.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 43.],
-                           [143.]]]],
 
 
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-                        [[[[ 23.],
-                           [123.]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-                          [[ 33.],
-                           [133.]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-                          [[ 43.],
-                           [143.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 24.],
-                           [124.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 34.],
-                           [134.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 44.],
-                           [144.]]]],
 
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 24.],
-                           [124.]],
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-                          [[ 34.],
-                           [134.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 44.],
-                           [144.]]],
 
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
-                         [[[ 25.],
-                           [125.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 35.],
-                           [135.]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-                          [[ 45.],
-                           [145.]]]]]]]), device='cuda:0',
-       size=(2, 3, 6, 6, 2, 1), nnz=4, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 11.],
-             [111.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 21.],
-             [121.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-           [[[  2.],
-             [102.]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
-            [[ 12.],
-             [112.]],
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-            [[ 22.],
-             [122.]]]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
 
-          [[[[  2.],
-             [102.]],
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-            [[ 12.],
-             [112.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 22.],
-             [122.]]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-           [[[  3.],
-             [103.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 13.],
-             [113.]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
-            [[ 23.],
-             [123.]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[  3.],
-             [103.]],
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-            [[ 13.],
-             [113.]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
-            [[ 23.],
-             [123.]]],
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
 
-           [[[  4.],
-             [104.]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-            [[ 14.],
-             [114.]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
-            [[ 24.],
-             [124.]]]],
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-          [[[[  4.],
-             [104.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 14.],
-             [114.]],
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
-            [[ 24.],
-             [124.]]],
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
 
-           [[[  5.],
-             [105.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 15.],
-             [115.]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-            [[ 25.],
-             [125.]]]]],
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], device='cuda:0')
 
 
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-         [[[[[  5.],
-             [105.]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-            [[ 15.],
-             [115.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 25.],
-             [125.]]],
 
+                        [[[2., 3., 4., 5.]],
 
-           [[[  6.],
-             [106.]],
+                         [[0., 0., 0., 0.]]],
 
-            [[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
 
-          [[[[  6.],
-             [106.]],
 
-            [[ 16.],
-             [116.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 26.],
-             [126.]]],
+                         [[4., 5., 6., 7.]]],
 
 
-           [[[  7.],
-             [107.]],
+                        [[[2., 3., 4., 5.]],
 
-            [[ 17.],
-             [117.]],
+                         [[0., 0., 0., 0.]]],
 
-            [[ 27.],
-             [127.]]]],
 
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]],
 
-          [[[[  7.],
-             [107.]],
 
-            [[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[2., 3., 4., 5.]]],
 
-           [[[  8.],
-             [108.]],
 
-            [[ 18.],
-             [118.]],
+                        [[[0., 0., 0., 0.]],
 
-            [[ 28.],
-             [128.]]]],
+                         [[3., 4., 5., 6.]]],
 
 
+                        [[[0., 0., 0., 0.]],
 
-          [[[[  8.],
-             [108.]],
+                         [[4., 5., 6., 7.]]]]],
 
-            [[ 18.],
-             [118.]],
 
-            [[ 28.],
-             [128.]]],
 
 
-           [[[  9.],
-             [109.]],
+                      [[[[[0., 0., 0., 0.]],
 
-            [[ 19.],
-             [119.]],
+                         [[2., 3., 4., 5.]]],
 
-            [[ 29.],
-             [129.]]]]],
 
+                        [[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
 
-         [[[[[  9.],
-             [109.]],
+                        [[[0., 0., 0., 0.]],
 
-            [[ 19.],
-             [119.]],
+                         [[4., 5., 6., 7.]]]],
 
-            [[ 29.],
-             [129.]]],
 
 
-           [[[ 10.],
-             [110.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 20.],
-             [120.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 30.],
-             [130.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]],
 
-          [[[[ 10.],
-             [110.]],
 
-            [[ 20.],
-             [120.]],
+                        [[[2., 3., 4., 5.]],
 
-            [[ 30.],
-             [130.]]],
+                         [[0., 0., 0., 0.]]]],
 
 
-           [[[ 11.],
-             [111.]],
 
-            [[ 21.],
-             [121.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 31.],
-             [131.]]]],
+                         [[0., 0., 0., 0.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
-          [[[[ 11.],
-             [111.]],
+                         [[4., 5., 6., 7.]]],
 
-            [[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]]],
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=3, dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-           [[[ 12.],
-             [112.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], device='cuda:0', dtype=torch.int32)
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-            [[ 22.],
-             [122.]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
-            [[ 32.],
-             [132.]]]],
+           [[3., 4., 5., 6.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
-          [[[[ 12.],
-             [112.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]],
 
-           [[[ 13.],
-             [113.]],
 
-            [[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[4., 5., 6., 7.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
-        [[[[[[ 13.],
-             [113.]],
 
-            [[ 23.],
-             [123.]],
+          [[[3., 4., 5., 6.]],
 
-            [[ 33.],
-             [133.]]],
+           [[0., 0., 0., 0.]]]],
 
 
-           [[[ 14.],
-             [114.]],
 
-            [[ 24.],
-             [124.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 34.],
-             [134.]]]],
+           [[2., 3., 4., 5.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-          [[[[ 14.],
-             [114.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]]],
 
-           [[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]],
 
-            [[ 35.],
-             [135.]]]],
 
+        [[[[[0., 0., 0., 0.]],
 
+           [[2., 3., 4., 5.]]],
 
-          [[[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]],
+          [[[1., 2., 3., 4.]],
 
-            [[ 35.],
-             [135.]]],
+           [[3., 4., 5., 6.]]],
 
 
-           [[[ 16.],
-             [116.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 26.],
-             [126.]],
+           [[4., 5., 6., 7.]]]],
 
-            [[ 36.],
-             [136.]]]],
 
 
+         [[[[1., 2., 3., 4.]],
 
-          [[[[ 16.],
-             [116.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 26.],
-             [126.]],
 
-            [[ 36.],
-             [136.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]],
 
-           [[[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]],
+          [[[2., 3., 4., 5.]],
 
-            [[ 37.],
-             [137.]]]]],
+           [[0., 0., 0., 0.]]]],
 
 
+         [[[[1., 2., 3., 4.]],
 
-         [[[[[ 17.],
-             [117.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 27.],
-             [127.]],
 
-            [[ 37.],
-             [137.]]],
+          [[[2., 3., 4., 5.]],
 
+           [[4., 5., 6., 7.]]],
 
-           [[[ 18.],
-             [118.]],
 
-            [[ 28.],
-             [128.]],
+          [[[3., 4., 5., 6.]],
 
-            [[ 38.],
-             [138.]]]],
+           [[0., 0., 0., 0.]]]]]], device='cuda:0', dtype=torch.float64)
 
+########## torch.float64/torch.int32/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
+
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-          [[[[ 18.],
-             [118.]],
 
-            [[ 28.],
-             [128.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 38.],
-             [138.]]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-           [[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]],
 
-            [[ 39.],
-             [139.]]]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
-          [[[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 39.],
-             [139.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-           [[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]],
 
-            [[ 40.],
-             [140.]]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-          [[[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-            [[ 40.],
-             [140.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-           [[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]],
 
-            [[ 41.],
-             [141.]]]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-         [[[[[ 21.],
-             [121.]],
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
-            [[ 31.],
-             [131.]],
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-            [[ 41.],
-             [141.]]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
 
-           [[[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-            [[ 42.],
-             [142.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
 
-          [[[[ 22.],
-             [122.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 32.],
-             [132.]],
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-            [[ 42.],
-             [142.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-           [[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-            [[ 43.],
-             [143.]]]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
 
-          [[[[ 23.],
-             [123.]],
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-            [[ 33.],
-             [133.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-            [[ 43.],
-             [143.]]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
 
-           [[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 44.],
-             [144.]]]],
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
 
-          [[[[ 24.],
-             [124.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]],
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-            [[ 44.],
-             [144.]]],
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=7, dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], device='cuda:0', dtype=torch.int32)
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
-           [[[ 25.],
-             [125.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-            [[ 35.],
-             [135.]],
 
-            [[ 45.],
-             [145.]]]]]]], device='cuda:0')
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-########## torch.float64/torch.int64/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.],
-                        [ 21., 121.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                       [[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]]],
 
 
-                      [[[  2., 102.],
-                        [ 12., 112.],
-                        [ 22., 122.]],
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-                       [[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-                      [[[  3., 103.],
-                        [ 13., 113.],
-                        [ 23., 123.]],
 
-                       [[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                      [[[  4., 104.],
-                        [ 14., 114.],
-                        [ 24., 124.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                       [[  5., 105.],
-                        [ 15., 115.],
-                        [ 25., 125.]]]]), device='cuda:0', size=(6, 6, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.],
-          [ 21., 121.]],
 
-         [[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]]],
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-        [[[  2., 102.],
-          [ 12., 112.],
-          [ 22., 122.]],
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-         [[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-        [[[  3., 103.],
-          [ 13., 113.],
-          [ 23., 123.]],
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
-         [[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-        [[[  4., 104.],
-          [ 14., 114.],
-          [ 24., 124.]],
 
-         [[  5., 105.],
-          [ 15., 115.],
-          [ 25., 125.]]]], device='cuda:0', dtype=torch.float64)
 
-########## torch.float64/torch.int64/size=()+(9, 4)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]]],
 
 
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]]],
 
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]]],
 
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
 
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], device='cuda:0', dtype=torch.float64)
 
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]]],
 
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                       [[[6.0000e+00, 1.0060e+03],
-                         [1.0600e+02, 1.1060e+03],
-                         [2.0600e+02, 1.2060e+03],
-                         [3.0600e+02, 1.3060e+03]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
-                        [[1.6000e+01, 1.0160e+03],
-                         [1.1600e+02, 1.1160e+03],
-                         [2.1600e+02, 1.2160e+03],
-                         [3.1600e+02, 1.3160e+03]]]]]), device='cuda:0',
-       size=(9, 4, 4, 2), nnz=4, dtype=torch.float64, layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]]],
+                         [[3., 4., 5., 6.]]],
 
 
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+                        [[[2., 3., 4., 5.]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
+                         [[0., 0., 0., 0.]]],
 
 
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]]],
+                         [[4., 5., 6., 7.]]]],
 
 
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+                       [[[[1., 2., 3., 4.]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]]],
+                         [[4., 5., 6., 7.]]],
 
 
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+                        [[[2., 3., 4., 5.]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
+                         [[0., 0., 0., 0.]]],
 
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                        [[[3., 4., 5., 6.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]]],
+                         [[0., 0., 0., 0.]]]],
 
 
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+                       [[[[1., 2., 3., 4.]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]]],
+                         [[2., 3., 4., 5.]]],
 
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
+                         [[3., 4., 5., 6.]]],
 
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]]],
+                         [[4., 5., 6., 7.]]]]],
 
 
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]]],
+                      [[[[[0., 0., 0., 0.]],
 
+                         [[2., 3., 4., 5.]]],
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]]],
+                        [[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
-         [[[6.0000e+00, 1.0060e+03],
-           [1.0600e+02, 1.1060e+03],
-           [2.0600e+02, 1.2060e+03],
-           [3.0600e+02, 1.3060e+03]],
 
-          [[1.6000e+01, 1.0160e+03],
-           [1.1600e+02, 1.1160e+03],
-           [2.1600e+02, 1.2160e+03],
-           [3.1600e+02, 1.3160e+03]]]]], device='cuda:0', dtype=torch.float64)
+                        [[[0., 0., 0., 0.]],
 
-########## torch.float64/torch.int64/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+                         [[4., 5., 6., 7.]]]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
 
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
 
-                          [[ 11.],
-                           [111.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 21.],
-                           [121.]]],
+                         [[3., 4., 5., 6.]]],
 
 
-                         [[[  2.],
-                           [102.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 12.],
-                           [112.]],
+                         [[4., 5., 6., 7.]]],
 
-                          [[ 22.],
-                           [122.]]]],
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]]],
 
-                        [[[[  2.],
-                           [102.]],
 
-                          [[ 12.],
-                           [112.]],
 
-                          [[ 22.],
-                           [122.]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[0., 0., 0., 0.]]],
 
-                         [[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 23.],
-                           [123.]]]],
+                         [[4., 5., 6., 7.]]],
 
 
+                        [[[3., 4., 5., 6.]],
 
-                        [[[[  3.],
-                           [103.]],
+                         [[0., 0., 0., 0.]]]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=3, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
-                          [[ 13.],
-                           [113.]],
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], device='cuda:0')
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-                          [[ 23.],
-                           [123.]]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], device='cuda:0')
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                         [[[  4.],
-                           [104.]],
 
-                          [[ 14.],
-                           [114.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 24.],
-                           [124.]]]],
+           [[0., 0., 0., 0.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-                        [[[[  4.],
-                           [104.]],
+           [[4., 5., 6., 7.]]]],
 
-                          [[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  5.],
-                           [105.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]]]],
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
 
+          [[[3., 4., 5., 6.]],
 
-                       [[[[[  5.],
-                           [105.]],
+           [[0., 0., 0., 0.]]]],
 
-                          [[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  6.],
-                           [106.]],
+           [[2., 3., 4., 5.]]],
 
-                          [[ 16.],
-                           [116.]],
 
-                          [[ 26.],
-                           [126.]]]],
+          [[[0., 0., 0., 0.]],
 
+           [[3., 4., 5., 6.]]],
 
 
-                        [[[[  6.],
-                           [106.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 16.],
-                           [116.]],
+           [[4., 5., 6., 7.]]]]],
 
-                          [[ 26.],
-                           [126.]]],
 
 
-                         [[[  7.],
-                           [107.]],
 
-                          [[ 17.],
-                           [117.]],
+        [[[[[0., 0., 0., 0.]],
 
-                          [[ 27.],
-                           [127.]]]],
+           [[2., 3., 4., 5.]]],
 
 
+          [[[1., 2., 3., 4.]],
 
-                        [[[[  7.],
-                           [107.]],
+           [[3., 4., 5., 6.]]],
 
-                          [[ 17.],
-                           [117.]],
 
-                          [[ 27.],
-                           [127.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]],
 
-                         [[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]],
 
-                          [[ 28.],
-                           [128.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
 
-                        [[[[  8.],
-                           [108.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 18.],
-                           [118.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 28.],
-                           [128.]]],
 
+          [[[2., 3., 4., 5.]],
 
-                         [[[  9.],
-                           [109.]],
+           [[0., 0., 0., 0.]]]],
 
-                          [[ 19.],
-                           [119.]],
 
-                          [[ 29.],
-                           [129.]]]]],
 
+         [[[[1., 2., 3., 4.]],
 
+           [[0., 0., 0., 0.]]],
 
 
-                       [[[[[  9.],
-                           [109.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 19.],
-                           [119.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 29.],
-                           [129.]]],
 
+          [[[3., 4., 5., 6.]],
 
-                         [[[ 10.],
-                           [110.]],
+           [[0., 0., 0., 0.]]]]]], device='cuda:0')
 
-                          [[ 20.],
-                           [120.]],
+########## torch.float32/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
+
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-                          [[ 30.],
-                           [130.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                        [[[[ 10.],
-                           [110.]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-                          [[ 20.],
-                           [120.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 30.],
-                           [130.]]],
 
 
-                         [[[ 11.],
-                           [111.]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
-                          [[ 21.],
-                           [121.]],
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
-                          [[ 31.],
-                           [131.]]]],
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                        [[[[ 11.],
-                           [111.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 21.],
-                           [121.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 31.],
-                           [131.]]],
 
 
-                         [[[ 12.],
-                           [112.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 22.],
-                           [122.]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
-                          [[ 32.],
-                           [132.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-                        [[[[ 12.],
-                           [112.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 22.],
-                           [122.]],
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-                          [[ 32.],
-                           [132.]]],
 
 
-                         [[[ 13.],
-                           [113.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 23.],
-                           [123.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 33.],
-                           [133.]]]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
-                      [[[[[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 14.],
-                           [114.]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-                          [[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                        [[[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-                         [[[ 15.],
-                           [115.]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
-                          [[ 25.],
-                           [125.]],
 
-                          [[ 35.],
-                           [135.]]]],
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
-                        [[[[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]],
 
-                          [[ 35.],
-                           [135.]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
-                         [[[ 16.],
-                           [116.]],
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-                          [[ 26.],
-                           [126.]],
 
-                          [[ 36.],
-                           [136.]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=7, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], device='cuda:0')
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], device='cuda:0')
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 16.],
-                           [116.]],
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
-                          [[ 26.],
-                           [126.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 36.],
-                           [136.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 17.],
-                           [117.]],
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-                          [[ 27.],
-                           [127.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 37.],
-                           [137.]]]]],
 
 
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-                       [[[[[ 17.],
-                           [117.]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-                          [[ 27.],
-                           [127.]],
 
-                          [[ 37.],
-                           [137.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 18.],
-                           [118.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 28.],
-                           [128.]],
 
-                          [[ 38.],
-                           [138.]]]],
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-                        [[[[ 18.],
-                           [118.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 28.],
-                           [128.]],
 
-                          [[ 38.],
-                           [138.]]],
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 19.],
-                           [119.]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-                          [[ 29.],
-                           [129.]],
 
-                          [[ 39.],
-                           [139.]]]],
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 19.],
-                           [119.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 29.],
-                           [129.]],
 
-                          [[ 39.],
-                           [139.]]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-                         [[[ 20.],
-                           [120.]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
-                          [[ 30.],
-                           [130.]],
 
-                          [[ 40.],
-                           [140.]]]],
 
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 20.],
-                           [120.]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
-                          [[ 30.],
-                           [130.]],
 
-                          [[ 40.],
-                           [140.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
-                         [[[ 21.],
-                           [121.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 31.],
-                           [131.]],
 
-                          [[ 41.],
-                           [141.]]]]],
 
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-                       [[[[[ 21.],
-                           [121.]],
 
-                          [[ 31.],
-                           [131.]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-                          [[ 41.],
-                           [141.]]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
-                         [[[ 22.],
-                           [122.]],
 
-                          [[ 32.],
-                           [132.]],
 
-                          [[ 42.],
-                           [142.]]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-                        [[[[ 22.],
-                           [122.]],
 
-                          [[ 32.],
-                           [132.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 42.],
-                           [142.]]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], device='cuda:0')
 
-                         [[[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]],
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]],
 
-                          [[ 43.],
-                           [143.]]]],
+                            [[0, 1, 2, 3],
+                             [0, 1, 2, 3],
+                             [0, 1, 2, 3]]]),
+       row_indices=tensor([[[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]],
 
+                           [[0, 0, 0],
+                            [0, 0, 0],
+                            [0, 0, 0]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                        [[[[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 43.],
-                           [143.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[ 24.],
-                           [124.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 34.],
-                           [134.]],
+                         [[4., 5., 6., 7.]]]],
 
-                          [[ 44.],
-                           [144.]]]],
 
 
+                       [[[[1., 2., 3., 4.]],
 
-                        [[[[ 24.],
-                           [124.]],
+                         [[4., 5., 6., 7.]]],
 
-                          [[ 34.],
-                           [134.]],
 
-                          [[ 44.],
-                           [144.]]],
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
-                         [[[ 25.],
-                           [125.]],
 
-                          [[ 35.],
-                           [135.]],
+                        [[[3., 4., 5., 6.]],
 
-                          [[ 45.],
-                           [145.]]]]]]]), device='cuda:0',
-       size=(2, 3, 6, 6, 2, 1), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+                         [[0., 0., 0., 0.]]]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
 
-            [[ 11.],
-             [111.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 21.],
-             [121.]]],
+                         [[2., 3., 4., 5.]]],
 
 
-           [[[  2.],
-             [102.]],
+                        [[[0., 0., 0., 0.]],
 
-            [[ 12.],
-             [112.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 22.],
-             [122.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]]],
 
-          [[[[  2.],
-             [102.]],
 
-            [[ 12.],
-             [112.]],
 
-            [[ 22.],
-             [122.]]],
 
+                      [[[[[0., 0., 0., 0.]],
 
-           [[[  3.],
-             [103.]],
+                         [[2., 3., 4., 5.]]],
 
-            [[ 13.],
-             [113.]],
 
-            [[ 23.],
-             [123.]]]],
+                        [[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
 
-          [[[[  3.],
-             [103.]],
+                        [[[0., 0., 0., 0.]],
 
-            [[ 13.],
-             [113.]],
+                         [[4., 5., 6., 7.]]]],
 
-            [[ 23.],
-             [123.]]],
 
 
-           [[[  4.],
-             [104.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 14.],
-             [114.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 24.],
-             [124.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]],
 
-          [[[[  4.],
-             [104.]],
 
-            [[ 14.],
-             [114.]],
+                        [[[2., 3., 4., 5.]],
 
-            [[ 24.],
-             [124.]]],
+                         [[0., 0., 0., 0.]]]],
 
 
-           [[[  5.],
-             [105.]],
 
-            [[ 15.],
-             [115.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 25.],
-             [125.]]]]],
+                         [[0., 0., 0., 0.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[4., 5., 6., 7.]]],
 
-         [[[[[  5.],
-             [105.]],
 
-            [[ 15.],
-             [115.]],
+                        [[[3., 4., 5., 6.]],
 
-            [[ 25.],
-             [125.]]],
+                         [[0., 0., 0., 0.]]]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=3, dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([[[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]],
 
+        [[0, 1, 2, 3],
+         [0, 1, 2, 3],
+         [0, 1, 2, 3]]], device='cuda:0')
+# _row_indices
+tensor([[[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]],
 
-           [[[  6.],
-             [106.]],
+        [[0, 0, 0],
+         [0, 0, 0],
+         [0, 0, 0]]], device='cuda:0')
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
-            [[ 16.],
-             [116.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 26.],
-             [126.]]]],
 
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
-          [[[[  6.],
-             [106.]],
 
-            [[ 16.],
-             [116.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 26.],
-             [126.]]],
+           [[4., 5., 6., 7.]]]],
 
 
-           [[[  7.],
-             [107.]],
 
-            [[ 17.],
-             [117.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 27.],
-             [127.]]]],
+           [[4., 5., 6., 7.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
-          [[[[  7.],
-             [107.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]]],
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]],
 
-           [[[  8.],
-             [108.]],
 
-            [[ 18.],
-             [118.]],
 
-            [[ 28.],
-             [128.]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[2., 3., 4., 5.]]],
 
 
-          [[[[  8.],
-             [108.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 18.],
-             [118.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 28.],
-             [128.]]],
 
+          [[[0., 0., 0., 0.]],
 
-           [[[  9.],
-             [109.]],
+           [[4., 5., 6., 7.]]]]],
 
-            [[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]]]]],
 
 
+        [[[[[0., 0., 0., 0.]],
 
+           [[2., 3., 4., 5.]]],
 
-         [[[[[  9.],
-             [109.]],
 
-            [[ 19.],
-             [119.]],
+          [[[1., 2., 3., 4.]],
 
-            [[ 29.],
-             [129.]]],
+           [[3., 4., 5., 6.]]],
 
 
-           [[[ 10.],
-             [110.]],
+          [[[0., 0., 0., 0.]],
 
-            [[ 20.],
-             [120.]],
+           [[4., 5., 6., 7.]]]],
 
-            [[ 30.],
-             [130.]]]],
 
 
+         [[[[1., 2., 3., 4.]],
 
-          [[[[ 10.],
-             [110.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]],
 
-           [[[ 11.],
-             [111.]],
 
-            [[ 21.],
-             [121.]],
+          [[[2., 3., 4., 5.]],
 
-            [[ 31.],
-             [131.]]]],
+           [[0., 0., 0., 0.]]]],
 
 
-          [[[[ 11.],
-             [111.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 21.],
-             [121.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 31.],
-             [131.]]],
 
+          [[[2., 3., 4., 5.]],
 
-           [[[ 12.],
-             [112.]],
+           [[4., 5., 6., 7.]]],
 
-            [[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]]],
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]]]], device='cuda:0', dtype=torch.float64)
 
+########## torch.float64/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([0, 4, 7]),
+       row_indices=tensor([0, 1, 2, 3, 0, 2, 3]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
+
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-          [[[[ 12.],
-             [112.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-            [[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-           [[[ 13.],
-             [113.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-            [[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]]]]],
 
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
 
-        [[[[[[ 13.],
-             [113.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 23.],
-             [123.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 33.],
-             [133.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-           [[[ 14.],
-             [114.]],
 
-            [[ 24.],
-             [124.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]]]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-          [[[[ 14.],
-             [114.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-            [[ 24.],
-             [124.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]]],
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
 
-           [[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 35.],
-             [135.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-          [[[[ 15.],
-             [115.]],
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
-            [[ 25.],
-             [125.]],
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-            [[ 35.],
-             [135.]]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
 
-           [[[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-            [[ 36.],
-             [136.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
 
-          [[[[ 16.],
-             [116.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 26.],
-             [126.]],
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-            [[ 36.],
-             [136.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-           [[[ 17.],
-             [117.]],
 
-            [[ 27.],
-             [127.]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-            [[ 37.],
-             [137.]]]]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-         [[[[[ 17.],
-             [117.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-            [[ 27.],
-             [127.]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
-            [[ 37.],
-             [137.]]],
 
 
-           [[[ 18.],
-             [118.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 28.],
-             [128.]],
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
-            [[ 38.],
-             [138.]]]],
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-          [[[[ 18.],
-             [118.]],
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-            [[ 28.],
-             [128.]],
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=7, dtype=torch.float64, layout=torch.sparse_bsc)
+# _ccol_indices
+tensor([0, 4, 7], device='cuda:0')
+# _row_indices
+tensor([0, 1, 2, 3, 0, 2, 3], device='cuda:0')
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 38.],
-             [138.]]],
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-           [[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 39.],
-             [139.]]]],
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]],
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-            [[ 39.],
-             [139.]]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-           [[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 40.],
-             [140.]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 40.],
-             [140.]]],
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-           [[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]],
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
-            [[ 41.],
-             [141.]]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
 
-         [[[[[ 21.],
-             [121.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 31.],
-             [131.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 41.],
-             [141.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-           [[[ 22.],
-             [122.]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
-            [[ 32.],
-             [132.]],
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-            [[ 42.],
-             [142.]]]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
 
-          [[[[ 22.],
-             [122.]],
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-            [[ 32.],
-             [132.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 42.],
-             [142.]]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-           [[[ 23.],
-             [123.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 33.],
-             [133.]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
-            [[ 43.],
-             [143.]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[ 23.],
-             [123.]],
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-            [[ 33.],
-             [133.]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
-            [[ 43.],
-             [143.]]],
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
 
-           [[[ 24.],
-             [124.]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-            [[ 34.],
-             [134.]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
-            [[ 44.],
-             [144.]]]],
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-          [[[[ 24.],
-             [124.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]],
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
-            [[ 44.],
-             [144.]]],
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
 
-           [[[ 25.],
-             [125.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 35.],
-             [135.]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-            [[ 45.],
-             [145.]]]]]]], device='cuda:0', dtype=torch.float64)
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], device='cuda:0', dtype=torch.float64)
 
diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect
index 0dd1aff7d4dc2..66bc7fa9885e4 100644
--- a/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect
+++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseBSR_cuda.expect
@@ -1,6949 +1,3583 @@
-########## torch.float32/torch.int32/size=()+(4, 3)+() ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[1.],
-                       [2.]],
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                      [[2.],
-                       [3.]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                      [[3.],
-                       [4.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-                      [[4.],
-                       [5.]]]), device='cuda:0', size=(4, 3), nnz=4,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[1.],
-         [2.]],
-
-        [[2.],
-         [3.]],
-
-        [[3.],
-         [4.]],
-
-        [[4.],
-         [5.]]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 2, 1)), device='cuda:0', size=(0, 0), nnz=0,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([], device='cuda:0', size=(0, 2, 1))
-
-########## torch.float32/torch.int32/size=(2,)+(2, 6)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[ 1., 11.]],
-
-                       [[ 2., 12.]],
-
-                       [[ 3., 13.]],
-
-                       [[ 4., 14.]]],
-
-
-                      [[[ 5., 15.]],
-
-                       [[ 6., 16.]],
-
-                       [[ 7., 17.]],
-
-                       [[ 8., 18.]]]]), device='cuda:0', size=(2, 2, 6), nnz=4,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[ 1., 11.]],
-
-         [[ 2., 12.]],
-
-         [[ 3., 13.]],
-
-         [[ 4., 14.]]],
-
-
-        [[[ 5., 15.]],
+                        [[2.],
+                         [0.]],
 
-         [[ 6., 16.]],
+                        [[0.],
+                         [4.]]],
 
-         [[ 7., 17.]],
 
-         [[ 8., 18.]]]], device='cuda:0')
+                       [[[1.],
+                         [4.]],
 
-########## torch.float32/torch.int32/size=(2, 3)+(4, 9)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11., 21.],
-                         [ 2., 12., 22.]],
-
-                        [[ 2., 12., 22.],
-                         [ 3., 13., 23.]],
-
-                        [[ 3., 13., 23.],
-                         [ 4., 14., 24.]],
-
-                        [[ 4., 14., 24.],
-                         [ 5., 15., 25.]]],
-
-
-                       [[[ 5., 15., 25.],
-                         [ 6., 16., 26.]],
+                        [[2.],
+                         [0.]],
 
-                        [[ 6., 16., 26.],
-                         [ 7., 17., 27.]],
+                        [[3.],
+                         [0.]]],
 
-                        [[ 7., 17., 27.],
-                         [ 8., 18., 28.]],
 
-                        [[ 8., 18., 28.],
-                         [ 9., 19., 29.]]],
+                       [[[1.],
+                         [2.]],
 
+                        [[0.],
+                         [3.]],
 
-                       [[[ 9., 19., 29.],
-                         [10., 20., 30.]],
+                        [[0.],
+                         [4.]]]],
 
-                        [[10., 20., 30.],
-                         [11., 21., 31.]],
 
-                        [[11., 21., 31.],
-                         [12., 22., 32.]],
 
-                        [[12., 22., 32.],
-                         [13., 23., 33.]]]],
+                      [[[[0.],
+                         [2.]],
 
+                        [[1.],
+                         [3.]],
 
+                        [[0.],
+                         [4.]]],
 
-                      [[[[13., 23., 33.],
-                         [14., 24., 34.]],
 
-                        [[14., 24., 34.],
-                         [15., 25., 35.]],
+                       [[[1.],
+                         [3.]],
 
-                        [[15., 25., 35.],
-                         [16., 26., 36.]],
+                        [[0.],
+                         [4.]],
 
-                        [[16., 26., 36.],
-                         [17., 27., 37.]]],
+                        [[2.],
+                         [0.]]],
 
 
-                       [[[17., 27., 37.],
-                         [18., 28., 38.]],
+                       [[[1.],
+                         [0.]],
 
-                        [[18., 28., 38.],
-                         [19., 29., 39.]],
+                        [[2.],
+                         [4.]],
 
-                        [[19., 29., 39.],
-                         [20., 30., 40.]],
-
-                        [[20., 30., 40.],
-                         [21., 31., 41.]]],
-
-
-                       [[[21., 31., 41.],
-                         [22., 32., 42.]],
-
-                        [[22., 32., 42.],
-                         [23., 33., 43.]],
-
-                        [[23., 33., 43.],
-                         [24., 34., 44.]],
-
-                        [[24., 34., 44.],
-                         [25., 35., 45.]]]]]), device='cuda:0',
-       size=(2, 3, 4, 9), nnz=4, layout=torch.sparse_bsr)
+                        [[3.],
+                         [0.]]]]]), device='cuda:0', size=(2, 3, 2, 3), nnz=3,
+       layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], device='cuda:0', dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[[[ 1., 11., 21.],
-           [ 2., 12., 22.]],
-
-          [[ 2., 12., 22.],
-           [ 3., 13., 23.]],
+tensor([[[[[1.],
+           [3.]],
 
-          [[ 3., 13., 23.],
-           [ 4., 14., 24.]],
+          [[2.],
+           [0.]],
 
-          [[ 4., 14., 24.],
-           [ 5., 15., 25.]]],
+          [[0.],
+           [4.]]],
 
 
-         [[[ 5., 15., 25.],
-           [ 6., 16., 26.]],
+         [[[1.],
+           [4.]],
 
-          [[ 6., 16., 26.],
-           [ 7., 17., 27.]],
+          [[2.],
+           [0.]],
 
-          [[ 7., 17., 27.],
-           [ 8., 18., 28.]],
+          [[3.],
+           [0.]]],
 
-          [[ 8., 18., 28.],
-           [ 9., 19., 29.]]],
 
+         [[[1.],
+           [2.]],
 
-         [[[ 9., 19., 29.],
-           [10., 20., 30.]],
+          [[0.],
+           [3.]],
 
-          [[10., 20., 30.],
-           [11., 21., 31.]],
+          [[0.],
+           [4.]]]],
 
-          [[11., 21., 31.],
-           [12., 22., 32.]],
 
-          [[12., 22., 32.],
-           [13., 23., 33.]]]],
 
+        [[[[0.],
+           [2.]],
 
+          [[1.],
+           [3.]],
 
-        [[[[13., 23., 33.],
-           [14., 24., 34.]],
+          [[0.],
+           [4.]]],
 
-          [[14., 24., 34.],
-           [15., 25., 35.]],
 
-          [[15., 25., 35.],
-           [16., 26., 36.]],
+         [[[1.],
+           [3.]],
 
-          [[16., 26., 36.],
-           [17., 27., 37.]]],
+          [[0.],
+           [4.]],
 
+          [[2.],
+           [0.]]],
 
-         [[[17., 27., 37.],
-           [18., 28., 38.]],
-
-          [[18., 28., 38.],
-           [19., 29., 39.]],
-
-          [[19., 29., 39.],
-           [20., 30., 40.]],
-
-          [[20., 30., 40.],
-           [21., 31., 41.]]],
-
-
-         [[[21., 31., 41.],
-           [22., 32., 42.]],
-
-          [[22., 32., 42.],
-           [23., 33., 43.]],
-
-          [[23., 33., 43.],
-           [24., 34., 44.]],
-
-          [[24., 34., 44.],
-           [25., 35., 45.]]]]], device='cuda:0')
-
-
-########## torch.float64/torch.int32/size=()+(4, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[1.],
-                       [2.]],
-
-                      [[2.],
-                       [3.]],
-
-                      [[3.],
-                       [4.]],
-
-                      [[4.],
-                       [5.]]]), device='cuda:0', size=(4, 3), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[1.],
-         [2.]],
-
-        [[2.],
-         [3.]],
 
-        [[3.],
-         [4.]],
+         [[[1.],
+           [0.]],
 
-        [[4.],
-         [5.]]], device='cuda:0', dtype=torch.float64)
+          [[2.],
+           [4.]],
 
-########## torch.float64/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 2, 1)), device='cuda:0', size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([], device='cuda:0', size=(0, 2, 1), dtype=torch.float64)
+          [[3.],
+           [0.]]]]], device='cuda:0')
 
-########## torch.float64/torch.int32/size=(2,)+(2, 6)+() ##########
+########## torch.float32/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[ 1., 11.]],
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                       [[ 2., 12.]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-                       [[ 3., 13.]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-                       [[ 4., 14.]]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                      [[[ 5., 15.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-                       [[ 6., 16.]],
-
-                       [[ 7., 17.]],
-
-                       [[ 8., 18.]]]]), device='cuda:0', size=(2, 2, 6), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsr)
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), device='cuda:0', size=(8, 6), nnz=7,
+       layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0', dtype=torch.int32)
+tensor([0, 2, 3, 5, 7], device='cuda:0', dtype=torch.int32)
 # _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0', dtype=torch.int32)
+tensor([0, 1, 0, 0, 1, 0, 1], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[[ 1., 11.]],
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-         [[ 2., 12.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-         [[ 3., 13.]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
-         [[ 4., 14.]]],
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-        [[[ 5., 15.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-         [[ 6., 16.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], device='cuda:0')
 
-         [[ 7., 17.]],
 
-         [[ 8., 18.]]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2, 3)+(4, 9)+() ##########
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11., 21.],
-                         [ 2., 12., 22.]],
-
-                        [[ 2., 12., 22.],
-                         [ 3., 13., 23.]],
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                        [[ 3., 13., 23.],
-                         [ 4., 14., 24.]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                        [[ 4., 14., 24.],
-                         [ 5., 15., 25.]]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
+                        [[2.],
+                         [0.]],
 
-                       [[[ 5., 15., 25.],
-                         [ 6., 16., 26.]],
+                        [[0.],
+                         [4.]]],
 
-                        [[ 6., 16., 26.],
-                         [ 7., 17., 27.]],
 
-                        [[ 7., 17., 27.],
-                         [ 8., 18., 28.]],
+                       [[[1.],
+                         [4.]],
 
-                        [[ 8., 18., 28.],
-                         [ 9., 19., 29.]]],
+                        [[2.],
+                         [0.]],
 
+                        [[3.],
+                         [0.]]],
 
-                       [[[ 9., 19., 29.],
-                         [10., 20., 30.]],
 
-                        [[10., 20., 30.],
-                         [11., 21., 31.]],
+                       [[[1.],
+                         [2.]],
 
-                        [[11., 21., 31.],
-                         [12., 22., 32.]],
+                        [[0.],
+                         [3.]],
 
-                        [[12., 22., 32.],
-                         [13., 23., 33.]]]],
+                        [[0.],
+                         [4.]]]],
 
 
-                      [[[[13., 23., 33.],
-                         [14., 24., 34.]],
+                      [[[[0.],
+                         [2.]],
 
-                        [[14., 24., 34.],
-                         [15., 25., 35.]],
+                        [[1.],
+                         [3.]],
 
-                        [[15., 25., 35.],
-                         [16., 26., 36.]],
+                        [[0.],
+                         [4.]]],
 
-                        [[16., 26., 36.],
-                         [17., 27., 37.]]],
 
+                       [[[1.],
+                         [3.]],
 
-                       [[[17., 27., 37.],
-                         [18., 28., 38.]],
+                        [[0.],
+                         [4.]],
 
-                        [[18., 28., 38.],
-                         [19., 29., 39.]],
+                        [[2.],
+                         [0.]]],
 
-                        [[19., 29., 39.],
-                         [20., 30., 40.]],
 
-                        [[20., 30., 40.],
-                         [21., 31., 41.]]],
+                       [[[1.],
+                         [0.]],
 
+                        [[2.],
+                         [4.]],
 
-                       [[[21., 31., 41.],
-                         [22., 32., 42.]],
-
-                        [[22., 32., 42.],
-                         [23., 33., 43.]],
-
-                        [[23., 33., 43.],
-                         [24., 34., 44.]],
-
-                        [[24., 34., 44.],
-                         [25., 35., 45.]]]]]), device='cuda:0',
-       size=(2, 3, 4, 9), nnz=4, dtype=torch.float64, layout=torch.sparse_bsr)
+                        [[3.],
+                         [0.]]]]]), device='cuda:0', size=(2, 3, 2, 3), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], device='cuda:0', dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[[[ 1., 11., 21.],
-           [ 2., 12., 22.]],
-
-          [[ 2., 12., 22.],
-           [ 3., 13., 23.]],
-
-          [[ 3., 13., 23.],
-           [ 4., 14., 24.]],
-
-          [[ 4., 14., 24.],
-           [ 5., 15., 25.]]],
-
-
-         [[[ 5., 15., 25.],
-           [ 6., 16., 26.]],
-
-          [[ 6., 16., 26.],
-           [ 7., 17., 27.]],
-
-          [[ 7., 17., 27.],
-           [ 8., 18., 28.]],
-
-          [[ 8., 18., 28.],
-           [ 9., 19., 29.]]],
-
-
-         [[[ 9., 19., 29.],
-           [10., 20., 30.]],
-
-          [[10., 20., 30.],
-           [11., 21., 31.]],
-
-          [[11., 21., 31.],
-           [12., 22., 32.]],
+tensor([[[[[1.],
+           [3.]],
 
-          [[12., 22., 32.],
-           [13., 23., 33.]]]],
+          [[2.],
+           [0.]],
 
+          [[0.],
+           [4.]]],
 
 
-        [[[[13., 23., 33.],
-           [14., 24., 34.]],
+         [[[1.],
+           [4.]],
 
-          [[14., 24., 34.],
-           [15., 25., 35.]],
+          [[2.],
+           [0.]],
 
-          [[15., 25., 35.],
-           [16., 26., 36.]],
+          [[3.],
+           [0.]]],
 
-          [[16., 26., 36.],
-           [17., 27., 37.]]],
 
+         [[[1.],
+           [2.]],
 
-         [[[17., 27., 37.],
-           [18., 28., 38.]],
+          [[0.],
+           [3.]],
 
-          [[18., 28., 38.],
-           [19., 29., 39.]],
+          [[0.],
+           [4.]]]],
 
-          [[19., 29., 39.],
-           [20., 30., 40.]],
 
-          [[20., 30., 40.],
-           [21., 31., 41.]]],
 
+        [[[[0.],
+           [2.]],
 
-         [[[21., 31., 41.],
-           [22., 32., 42.]],
+          [[1.],
+           [3.]],
 
-          [[22., 32., 42.],
-           [23., 33., 43.]],
+          [[0.],
+           [4.]]],
 
-          [[23., 33., 43.],
-           [24., 34., 44.]],
 
-          [[24., 34., 44.],
-           [25., 35., 45.]]]]], device='cuda:0', dtype=torch.float64)
+         [[[1.],
+           [3.]],
 
+          [[0.],
+           [4.]],
 
-########## torch.float32/torch.int64/size=()+(4, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[1.],
-                       [2.]],
-
-                      [[2.],
-                       [3.]],
-
-                      [[3.],
-                       [4.]],
-
-                      [[4.],
-                       [5.]]]), device='cuda:0', size=(4, 3), nnz=4,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[1.],
-         [2.]],
+          [[2.],
+           [0.]]],
 
-        [[2.],
-         [3.]],
 
-        [[3.],
-         [4.]],
+         [[[1.],
+           [0.]],
 
-        [[4.],
-         [5.]]], device='cuda:0')
+          [[2.],
+           [4.]],
 
-########## torch.float32/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 2, 1)), device='cuda:0', size=(0, 0), nnz=0,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0], device='cuda:0')
-# _col_indices
-tensor([], device='cuda:0', dtype=torch.int64)
-# _values
-tensor([], device='cuda:0', size=(0, 2, 1))
+          [[3.],
+           [0.]]]]], device='cuda:0', dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=(2,)+(2, 6)+() ##########
+########## torch.float64/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[ 1., 11.]],
-
-                       [[ 2., 12.]],
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                       [[ 3., 13.]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-                       [[ 4., 14.]]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-                      [[[ 5., 15.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                       [[ 6., 16.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-                       [[ 7., 17.]],
-
-                       [[ 8., 18.]]]]), device='cuda:0', size=(2, 2, 6), nnz=4,
-       layout=torch.sparse_bsr)
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), device='cuda:0', size=(8, 6), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0')
+tensor([0, 2, 3, 5, 7], device='cuda:0', dtype=torch.int32)
 # _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0')
+tensor([0, 1, 0, 0, 1, 0, 1], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[[ 1., 11.]],
-
-         [[ 2., 12.]],
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-         [[ 3., 13.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-         [[ 4., 14.]]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
-        [[[ 5., 15.]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-         [[ 6., 16.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-         [[ 7., 17.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], device='cuda:0', dtype=torch.float64)
 
-         [[ 8., 18.]]]], device='cuda:0')
 
-########## torch.float32/torch.int64/size=(2, 3)+(4, 9)+() ##########
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11., 21.],
-                         [ 2., 12., 22.]],
-
-                        [[ 2., 12., 22.],
-                         [ 3., 13., 23.]],
-
-                        [[ 3., 13., 23.],
-                         [ 4., 14., 24.]],
-
-                        [[ 4., 14., 24.],
-                         [ 5., 15., 25.]]],
-
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                       [[[ 5., 15., 25.],
-                         [ 6., 16., 26.]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                        [[ 6., 16., 26.],
-                         [ 7., 17., 27.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-                        [[ 7., 17., 27.],
-                         [ 8., 18., 28.]],
+                        [[2.],
+                         [0.]],
 
-                        [[ 8., 18., 28.],
-                         [ 9., 19., 29.]]],
+                        [[0.],
+                         [4.]]],
 
 
-                       [[[ 9., 19., 29.],
-                         [10., 20., 30.]],
+                       [[[1.],
+                         [4.]],
 
-                        [[10., 20., 30.],
-                         [11., 21., 31.]],
+                        [[2.],
+                         [0.]],
 
-                        [[11., 21., 31.],
-                         [12., 22., 32.]],
+                        [[3.],
+                         [0.]]],
 
-                        [[12., 22., 32.],
-                         [13., 23., 33.]]]],
 
+                       [[[1.],
+                         [2.]],
 
+                        [[0.],
+                         [3.]],
 
-                      [[[[13., 23., 33.],
-                         [14., 24., 34.]],
+                        [[0.],
+                         [4.]]]],
 
-                        [[14., 24., 34.],
-                         [15., 25., 35.]],
 
-                        [[15., 25., 35.],
-                         [16., 26., 36.]],
 
-                        [[16., 26., 36.],
-                         [17., 27., 37.]]],
+                      [[[[0.],
+                         [2.]],
 
+                        [[1.],
+                         [3.]],
 
-                       [[[17., 27., 37.],
-                         [18., 28., 38.]],
+                        [[0.],
+                         [4.]]],
 
-                        [[18., 28., 38.],
-                         [19., 29., 39.]],
 
-                        [[19., 29., 39.],
-                         [20., 30., 40.]],
+                       [[[1.],
+                         [3.]],
 
-                        [[20., 30., 40.],
-                         [21., 31., 41.]]],
+                        [[0.],
+                         [4.]],
 
+                        [[2.],
+                         [0.]]],
 
-                       [[[21., 31., 41.],
-                         [22., 32., 42.]],
 
-                        [[22., 32., 42.],
-                         [23., 33., 43.]],
+                       [[[1.],
+                         [0.]],
 
-                        [[23., 33., 43.],
-                         [24., 34., 44.]],
+                        [[2.],
+                         [4.]],
 
-                        [[24., 34., 44.],
-                         [25., 35., 45.]]]]]), device='cuda:0',
-       size=(2, 3, 4, 9), nnz=4, layout=torch.sparse_bsr)
+                        [[3.],
+                         [0.]]]]]), device='cuda:0', size=(2, 3, 2, 3), nnz=3,
+       layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], device='cuda:0')
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], device='cuda:0')
 # _values
-tensor([[[[[ 1., 11., 21.],
-           [ 2., 12., 22.]],
-
-          [[ 2., 12., 22.],
-           [ 3., 13., 23.]],
-
-          [[ 3., 13., 23.],
-           [ 4., 14., 24.]],
-
-          [[ 4., 14., 24.],
-           [ 5., 15., 25.]]],
-
-
-         [[[ 5., 15., 25.],
-           [ 6., 16., 26.]],
-
-          [[ 6., 16., 26.],
-           [ 7., 17., 27.]],
-
-          [[ 7., 17., 27.],
-           [ 8., 18., 28.]],
+tensor([[[[[1.],
+           [3.]],
 
-          [[ 8., 18., 28.],
-           [ 9., 19., 29.]]],
+          [[2.],
+           [0.]],
 
+          [[0.],
+           [4.]]],
 
-         [[[ 9., 19., 29.],
-           [10., 20., 30.]],
 
-          [[10., 20., 30.],
-           [11., 21., 31.]],
+         [[[1.],
+           [4.]],
 
-          [[11., 21., 31.],
-           [12., 22., 32.]],
+          [[2.],
+           [0.]],
 
-          [[12., 22., 32.],
-           [13., 23., 33.]]]],
+          [[3.],
+           [0.]]],
 
 
+         [[[1.],
+           [2.]],
 
-        [[[[13., 23., 33.],
-           [14., 24., 34.]],
+          [[0.],
+           [3.]],
 
-          [[14., 24., 34.],
-           [15., 25., 35.]],
+          [[0.],
+           [4.]]]],
 
-          [[15., 25., 35.],
-           [16., 26., 36.]],
 
-          [[16., 26., 36.],
-           [17., 27., 37.]]],
 
+        [[[[0.],
+           [2.]],
 
-         [[[17., 27., 37.],
-           [18., 28., 38.]],
+          [[1.],
+           [3.]],
 
-          [[18., 28., 38.],
-           [19., 29., 39.]],
+          [[0.],
+           [4.]]],
 
-          [[19., 29., 39.],
-           [20., 30., 40.]],
 
-          [[20., 30., 40.],
-           [21., 31., 41.]]],
+         [[[1.],
+           [3.]],
 
+          [[0.],
+           [4.]],
 
-         [[[21., 31., 41.],
-           [22., 32., 42.]],
+          [[2.],
+           [0.]]],
 
-          [[22., 32., 42.],
-           [23., 33., 43.]],
 
-          [[23., 33., 43.],
-           [24., 34., 44.]],
+         [[[1.],
+           [0.]],
 
-          [[24., 34., 44.],
-           [25., 35., 45.]]]]], device='cuda:0')
+          [[2.],
+           [4.]],
 
+          [[3.],
+           [0.]]]]], device='cuda:0')
 
-########## torch.float64/torch.int64/size=()+(4, 3)+() ##########
+########## torch.float32/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[1.],
-                       [2.]],
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                      [[2.],
-                       [3.]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-                      [[3.],
-                       [4.]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-                      [[4.],
-                       [5.]]]), device='cuda:0', size=(4, 3), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[1.],
-         [2.]],
-
-        [[2.],
-         [3.]],
-
-        [[3.],
-         [4.]],
-
-        [[4.],
-         [5.]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0, 2, 1)), device='cuda:0', size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0], device='cuda:0')
-# _col_indices
-tensor([], device='cuda:0', dtype=torch.int64)
-# _values
-tensor([], device='cuda:0', size=(0, 2, 1), dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2,)+(2, 6)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[[[ 1., 11.]],
-
-                       [[ 2., 12.]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
-                       [[ 3., 13.]],
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                       [[ 4., 14.]]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-
-                      [[[ 5., 15.]],
-
-                       [[ 6., 16.]],
-
-                       [[ 7., 17.]],
-
-                       [[ 8., 18.]]]]), device='cuda:0', size=(2, 2, 6), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_bsr)
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), device='cuda:0', size=(8, 6), nnz=7,
+       layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0')
+tensor([0, 2, 3, 5, 7], device='cuda:0')
 # _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0')
+tensor([0, 1, 0, 0, 1, 0, 1], device='cuda:0')
 # _values
-tensor([[[[ 1., 11.]],
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
-         [[ 2., 12.]],
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-         [[ 3., 13.]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
-         [[ 4., 14.]]],
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
-        [[[ 5., 15.]],
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-         [[ 6., 16.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], device='cuda:0')
 
-         [[ 7., 17.]],
 
-         [[ 8., 18.]]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2, 3)+(4, 9)+() ##########
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1., 11., 21.],
-                         [ 2., 12., 22.]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                        [[ 2., 12., 22.],
-                         [ 3., 13., 23.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[1.],
+                         [3.]],
 
-                        [[ 3., 13., 23.],
-                         [ 4., 14., 24.]],
+                        [[2.],
+                         [0.]],
 
-                        [[ 4., 14., 24.],
-                         [ 5., 15., 25.]]],
+                        [[0.],
+                         [4.]]],
 
 
-                       [[[ 5., 15., 25.],
-                         [ 6., 16., 26.]],
+                       [[[1.],
+                         [4.]],
 
-                        [[ 6., 16., 26.],
-                         [ 7., 17., 27.]],
+                        [[2.],
+                         [0.]],
 
-                        [[ 7., 17., 27.],
-                         [ 8., 18., 28.]],
+                        [[3.],
+                         [0.]]],
 
-                        [[ 8., 18., 28.],
-                         [ 9., 19., 29.]]],
 
+                       [[[1.],
+                         [2.]],
 
-                       [[[ 9., 19., 29.],
-                         [10., 20., 30.]],
+                        [[0.],
+                         [3.]],
 
-                        [[10., 20., 30.],
-                         [11., 21., 31.]],
+                        [[0.],
+                         [4.]]]],
 
-                        [[11., 21., 31.],
-                         [12., 22., 32.]],
 
-                        [[12., 22., 32.],
-                         [13., 23., 33.]]]],
 
+                      [[[[0.],
+                         [2.]],
 
+                        [[1.],
+                         [3.]],
 
-                      [[[[13., 23., 33.],
-                         [14., 24., 34.]],
+                        [[0.],
+                         [4.]]],
 
-                        [[14., 24., 34.],
-                         [15., 25., 35.]],
 
-                        [[15., 25., 35.],
-                         [16., 26., 36.]],
+                       [[[1.],
+                         [3.]],
 
-                        [[16., 26., 36.],
-                         [17., 27., 37.]]],
+                        [[0.],
+                         [4.]],
 
+                        [[2.],
+                         [0.]]],
 
-                       [[[17., 27., 37.],
-                         [18., 28., 38.]],
 
-                        [[18., 28., 38.],
-                         [19., 29., 39.]],
+                       [[[1.],
+                         [0.]],
 
-                        [[19., 29., 39.],
-                         [20., 30., 40.]],
+                        [[2.],
+                         [4.]],
 
-                        [[20., 30., 40.],
-                         [21., 31., 41.]]],
-
-
-                       [[[21., 31., 41.],
-                         [22., 32., 42.]],
-
-                        [[22., 32., 42.],
-                         [23., 33., 43.]],
-
-                        [[23., 33., 43.],
-                         [24., 34., 44.]],
-
-                        [[24., 34., 44.],
-                         [25., 35., 45.]]]]]), device='cuda:0',
-       size=(2, 3, 4, 9), nnz=4, dtype=torch.float64, layout=torch.sparse_bsr)
+                        [[3.],
+                         [0.]]]]]), device='cuda:0', size=(2, 3, 2, 3), nnz=3,
+       dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], device='cuda:0')
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], device='cuda:0')
 # _values
-tensor([[[[[ 1., 11., 21.],
-           [ 2., 12., 22.]],
-
-          [[ 2., 12., 22.],
-           [ 3., 13., 23.]],
-
-          [[ 3., 13., 23.],
-           [ 4., 14., 24.]],
+tensor([[[[[1.],
+           [3.]],
 
-          [[ 4., 14., 24.],
-           [ 5., 15., 25.]]],
+          [[2.],
+           [0.]],
 
+          [[0.],
+           [4.]]],
 
-         [[[ 5., 15., 25.],
-           [ 6., 16., 26.]],
 
-          [[ 6., 16., 26.],
-           [ 7., 17., 27.]],
+         [[[1.],
+           [4.]],
 
-          [[ 7., 17., 27.],
-           [ 8., 18., 28.]],
+          [[2.],
+           [0.]],
 
-          [[ 8., 18., 28.],
-           [ 9., 19., 29.]]],
+          [[3.],
+           [0.]]],
 
 
-         [[[ 9., 19., 29.],
-           [10., 20., 30.]],
+         [[[1.],
+           [2.]],
 
-          [[10., 20., 30.],
-           [11., 21., 31.]],
+          [[0.],
+           [3.]],
 
-          [[11., 21., 31.],
-           [12., 22., 32.]],
+          [[0.],
+           [4.]]]],
 
-          [[12., 22., 32.],
-           [13., 23., 33.]]]],
 
 
+        [[[[0.],
+           [2.]],
 
-        [[[[13., 23., 33.],
-           [14., 24., 34.]],
+          [[1.],
+           [3.]],
 
-          [[14., 24., 34.],
-           [15., 25., 35.]],
+          [[0.],
+           [4.]]],
 
-          [[15., 25., 35.],
-           [16., 26., 36.]],
 
-          [[16., 26., 36.],
-           [17., 27., 37.]]],
+         [[[1.],
+           [3.]],
 
+          [[0.],
+           [4.]],
 
-         [[[17., 27., 37.],
-           [18., 28., 38.]],
+          [[2.],
+           [0.]]],
 
-          [[18., 28., 38.],
-           [19., 29., 39.]],
 
-          [[19., 29., 39.],
-           [20., 30., 40.]],
+         [[[1.],
+           [0.]],
 
-          [[20., 30., 40.],
-           [21., 31., 41.]]],
+          [[2.],
+           [4.]],
 
+          [[3.],
+           [0.]]]]], device='cuda:0', dtype=torch.float64)
 
-         [[[21., 31., 41.],
-           [22., 32., 42.]],
-
-          [[22., 32., 42.],
-           [23., 33., 43.]],
-
-          [[23., 33., 43.],
-           [24., 34., 44.]],
-
-          [[24., 34., 44.],
-           [25., 35., 45.]]]]], device='cuda:0', dtype=torch.float64)
-
-
-########## torch.float32/torch.int32/size=()+(6, 6)+(2,) ##########
+########## torch.float64/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.]]],
-
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[ 0.,  1.,  0.],
+                       [ 0.,  4.,  0.]],
 
-                      [[[  2., 102.],
-                        [ 12., 112.]],
+                      [[ 2.,  0.,  3.],
+                       [ 0.,  5.,  0.]],
 
-                       [[  3., 103.],
-                        [ 13., 113.]],
+                      [[ 6.,  7.,  8.],
+                       [ 0.,  0.,  0.]],
 
-                       [[  4., 104.],
-                        [ 14., 114.]]],
+                      [[ 0.,  9.,  0.],
+                       [13.,  0., 14.]],
 
+                      [[10., 11., 12.],
+                       [15., 16., 17.]],
 
-                      [[[  3., 103.],
-                        [ 13., 113.]],
+                      [[ 0.,  0.,  0.],
+                       [20., 21., 22.]],
 
-                       [[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]],
-
-                       [[  6., 106.],
-                        [ 16., 116.]]]]), device='cuda:0', size=(6, 6, 2),
-       nnz=4, layout=torch.sparse_bsr)
+                      [[ 0., 18., 19.],
+                       [ 0., 23., 24.]]]), device='cuda:0', size=(8, 6), nnz=7,
+       dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
+tensor([0, 2, 3, 5, 7], device='cuda:0')
 # _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
+tensor([0, 1, 0, 0, 1, 0, 1], device='cuda:0')
 # _values
-tensor([[[[  1., 101.],
-          [ 11., 111.]],
-
-         [[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]]],
+tensor([[[ 0.,  1.,  0.],
+         [ 0.,  4.,  0.]],
 
+        [[ 2.,  0.,  3.],
+         [ 0.,  5.,  0.]],
 
-        [[[  2., 102.],
-          [ 12., 112.]],
+        [[ 6.,  7.,  8.],
+         [ 0.,  0.,  0.]],
 
-         [[  3., 103.],
-          [ 13., 113.]],
+        [[ 0.,  9.,  0.],
+         [13.,  0., 14.]],
 
-         [[  4., 104.],
-          [ 14., 114.]]],
+        [[10., 11., 12.],
+         [15., 16., 17.]],
 
+        [[ 0.,  0.,  0.],
+         [20., 21., 22.]],
 
-        [[[  3., 103.],
-          [ 13., 113.]],
+        [[ 0., 18., 19.],
+         [ 0., 23., 24.]]], device='cuda:0', dtype=torch.float64)
 
-         [[  4., 104.],
-          [ 14., 114.]],
 
-         [[  5., 105.],
-          [ 15., 115.]]],
-
-
-        [[[  4., 104.],
-          [ 14., 114.]],
-
-         [[  5., 105.],
-          [ 15., 115.]],
-
-         [[  6., 106.],
-          [ 16., 116.]]]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=()+(4, 9)+(4, 2) ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]],
-
-                        [[2.1000e+01, 1.0210e+03],
-                         [1.2100e+02, 1.1210e+03],
-                         [2.2100e+02, 1.2210e+03],
-                         [3.2100e+02, 1.3210e+03]]],
-
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]]],
+                         [[3., 4., 5., 6.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
+                         [[0., 0., 0., 0.]]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
 
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
 
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[4., 5., 6., 7.]]],
 
 
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+                        [[[2., 3., 4., 5.]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
+                         [[0., 0., 0., 0.]]],
 
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]],
 
+                        [[[3., 4., 5., 6.]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
+                         [[0., 0., 0., 0.]]]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
 
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]]],
 
+                       [[[[1., 2., 3., 4.]],
 
+                         [[2., 3., 4., 5.]]],
 
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
-
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]],
-
-                        [[2.5000e+01, 1.0250e+03],
-                         [1.2500e+02, 1.1250e+03],
-                         [2.2500e+02, 1.2250e+03],
-                         [3.2500e+02, 1.3250e+03]]]]]), device='cuda:0',
-       size=(4, 9, 4, 2), nnz=4, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
 
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[2.1000e+01, 1.0210e+03],
-           [1.2100e+02, 1.1210e+03],
-           [2.2100e+02, 1.2210e+03],
-           [3.2100e+02, 1.3210e+03]]],
+                         [[3., 4., 5., 6.]]],
 
 
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
+                         [[4., 5., 6., 7.]]]]],
 
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]]],
 
 
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+                      [[[[[0., 0., 0., 0.]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
+                         [[2., 3., 4., 5.]]],
 
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]],
 
+                        [[[1., 2., 3., 4.]],
 
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+                         [[3., 4., 5., 6.]]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
 
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
 
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
+                       [[[[1., 2., 3., 4.]],
 
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]],
+                         [[3., 4., 5., 6.]]],
 
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                        [[[0., 0., 0., 0.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
+                         [[4., 5., 6., 7.]]],
 
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]]],
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]]],
 
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
 
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[0., 0., 0., 0.]]],
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]],
+                        [[[2., 3., 4., 5.]],
 
-          [[2.5000e+01, 1.0250e+03],
-           [1.2500e+02, 1.1250e+03],
-           [2.2500e+02, 1.2250e+03],
-           [3.2500e+02, 1.3250e+03]]]]], device='cuda:0')
+                         [[4., 5., 6., 7.]]],
 
-########## torch.float32/torch.int32/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
+                        [[[3., 4., 5., 6.]],
 
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
+                         [[0., 0., 0., 0.]]]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=3, layout=torch.sparse_bsr)
+# _crow_indices
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-                          [[ 11.],
-                           [111.]]],
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], device='cuda:0', dtype=torch.int32)
+# _col_indices
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
-                         [[[  2.],
-                           [102.]],
+           [[3., 4., 5., 6.]]],
 
-                          [[ 12.],
-                           [112.]]],
 
+          [[[2., 3., 4., 5.]],
 
-                         [[[  3.],
-                           [103.]],
+           [[0., 0., 0., 0.]]],
 
-                          [[ 13.],
-                           [113.]]]],
 
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]],
 
-                        [[[[  2.],
-                           [102.]],
 
-                          [[ 12.],
-                           [112.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  3.],
-                           [103.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 13.],
-                           [113.]]],
 
+          [[[2., 3., 4., 5.]],
 
-                         [[[  4.],
-                           [104.]],
+           [[0., 0., 0., 0.]]],
 
-                          [[ 14.],
-                           [114.]]]],
 
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]],
 
-                        [[[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  4.],
-                           [104.]],
+           [[2., 3., 4., 5.]]],
 
-                          [[ 14.],
-                           [114.]]],
 
+          [[[0., 0., 0., 0.]],
 
-                         [[[  5.],
-                           [105.]],
+           [[3., 4., 5., 6.]]],
 
-                          [[ 15.],
-                           [115.]]]],
 
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]]],
 
-                        [[[[  4.],
-                           [104.]],
 
-                          [[ 14.],
-                           [114.]]],
 
 
-                         [[[  5.],
-                           [105.]],
+        [[[[[0., 0., 0., 0.]],
 
-                          [[ 15.],
-                           [115.]]],
+           [[2., 3., 4., 5.]]],
 
 
-                         [[[  6.],
-                           [106.]],
+          [[[1., 2., 3., 4.]],
 
-                          [[ 16.],
-                           [116.]]]]],
+           [[3., 4., 5., 6.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]]],
 
-                       [[[[[  5.],
-                           [105.]],
 
-                          [[ 15.],
-                           [115.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  6.],
-                           [106.]],
+           [[3., 4., 5., 6.]]],
 
-                          [[ 16.],
-                           [116.]]],
 
+          [[[0., 0., 0., 0.]],
 
-                         [[[  7.],
-                           [107.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 17.],
-                           [117.]]]],
 
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]]],
 
-                        [[[[  6.],
-                           [106.]],
 
-                          [[ 16.],
-                           [116.]]],
 
+         [[[[1., 2., 3., 4.]],
 
-                         [[[  7.],
-                           [107.]],
+           [[0., 0., 0., 0.]]],
 
-                          [[ 17.],
-                           [117.]]],
 
+          [[[2., 3., 4., 5.]],
 
-                         [[[  8.],
-                           [108.]],
+           [[4., 5., 6., 7.]]],
 
-                          [[ 18.],
-                           [118.]]]],
 
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]]]], device='cuda:0')
 
-                        [[[[  7.],
-                           [107.]],
+########## torch.float32/torch.int32/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 17.],
-                           [117.]]],
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                         [[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-                         [[[  9.],
-                           [109.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 19.],
-                           [119.]]]],
 
 
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-                        [[[[  8.],
-                           [108.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 18.],
-                           [118.]]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
 
-                         [[[  9.],
-                           [109.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 19.],
-                           [119.]]],
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                         [[[ 10.],
-                           [110.]],
 
-                          [[ 20.],
-                           [120.]]]]],
 
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
-                       [[[[[  9.],
-                           [109.]],
 
-                          [[ 19.],
-                           [119.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 10.],
-                           [110.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 20.],
-                           [120.]]],
 
 
-                         [[[ 11.],
-                           [111.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 21.],
-                           [121.]]]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-                        [[[[ 10.],
-                           [110.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-                          [[ 20.],
-                           [120.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-                         [[[ 11.],
-                           [111.]],
 
-                          [[ 21.],
-                           [121.]]],
 
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-                         [[[ 12.],
-                           [112.]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-                          [[ 22.],
-                           [122.]]]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-                        [[[[ 11.],
-                           [111.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-                          [[ 21.],
-                           [121.]]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
 
-                         [[[ 12.],
-                           [112.]],
 
-                          [[ 22.],
-                           [122.]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 13.],
-                           [113.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                          [[ 23.],
-                           [123.]]]],
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-                        [[[[ 12.],
-                           [112.]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
-                          [[ 22.],
-                           [122.]]],
 
 
-                         [[[ 13.],
-                           [113.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 23.],
-                           [123.]]],
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-                         [[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=7, layout=torch.sparse_bsr)
+# _crow_indices
+tensor([0, 2, 3, 5, 7], device='cuda:0', dtype=torch.int32)
+# _col_indices
+tensor([0, 1, 0, 0, 1, 0, 1], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                      [[[[[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-                         [[[ 14.],
-                           [114.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 24.],
-                           [124.]]],
 
 
-                         [[[ 15.],
-                           [115.]],
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-                          [[ 25.],
-                           [125.]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-                        [[[[ 14.],
-                           [114.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 24.],
-                           [124.]]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                         [[[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]],
 
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-                         [[[ 16.],
-                           [116.]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-                          [[ 26.],
-                           [126.]]]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 15.],
-                           [115.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 25.],
-                           [125.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-                         [[[ 16.],
-                           [116.]],
 
-                          [[ 26.],
-                           [126.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-                         [[[ 17.],
-                           [117.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 27.],
-                           [127.]]]],
 
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 16.],
-                           [116.]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-                          [[ 26.],
-                           [126.]]],
 
 
-                         [[[ 17.],
-                           [117.]],
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-                          [[ 27.],
-                           [127.]]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-                         [[[ 18.],
-                           [118.]],
 
-                          [[ 28.],
-                           [128.]]]]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-                       [[[[[ 17.],
-                           [117.]],
 
-                          [[ 27.],
-                           [127.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                         [[[ 18.],
-                           [118.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 28.],
-                           [128.]]],
 
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
-                         [[[ 19.],
-                           [119.]],
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-                          [[ 29.],
-                           [129.]]]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
 
-                        [[[[ 18.],
-                           [118.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 28.],
-                           [128.]]],
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-                         [[[ 19.],
-                           [119.]],
 
-                          [[ 29.],
-                           [129.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-                         [[[ 20.],
-                           [120.]],
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], device='cuda:0')
 
-                          [[ 30.],
-                           [130.]]]],
 
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                        [[[[ 19.],
-                           [119.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-                          [[ 29.],
-                           [129.]]],
+                         [[3., 4., 5., 6.]]],
 
 
-                         [[[ 20.],
-                           [120.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 30.],
-                           [130.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[ 21.],
-                           [121.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 31.],
-                           [131.]]]],
+                         [[4., 5., 6., 7.]]]],
 
 
-                        [[[[ 20.],
-                           [120.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 30.],
-                           [130.]]],
+                         [[4., 5., 6., 7.]]],
 
 
-                         [[[ 21.],
-                           [121.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 31.],
-                           [131.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[ 22.],
-                           [122.]],
+                        [[[3., 4., 5., 6.]],
 
-                          [[ 32.],
-                           [132.]]]]],
+                         [[0., 0., 0., 0.]]]],
 
 
+                       [[[[1., 2., 3., 4.]],
 
-                       [[[[[ 21.],
-                           [121.]],
+                         [[2., 3., 4., 5.]]],
 
-                          [[ 31.],
-                           [131.]]],
 
+                        [[[0., 0., 0., 0.]],
 
-                         [[[ 22.],
-                           [122.]],
+                         [[3., 4., 5., 6.]]],
 
-                          [[ 32.],
-                           [132.]]],
 
+                        [[[0., 0., 0., 0.]],
 
-                         [[[ 23.],
-                           [123.]],
+                         [[4., 5., 6., 7.]]]]],
 
-                          [[ 33.],
-                           [133.]]]],
 
 
-                        [[[[ 22.],
-                           [122.]],
+                      [[[[[0., 0., 0., 0.]],
 
-                          [[ 32.],
-                           [132.]]],
+                         [[2., 3., 4., 5.]]],
 
 
-                         [[[ 23.],
-                           [123.]],
+                        [[[1., 2., 3., 4.]],
 
-                          [[ 33.],
-                           [133.]]],
+                         [[3., 4., 5., 6.]]],
 
 
-                         [[[ 24.],
-                           [124.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 34.],
-                           [134.]]]],
+                         [[4., 5., 6., 7.]]]],
 
 
-                        [[[[ 23.],
-                           [123.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 33.],
-                           [133.]]],
+                         [[3., 4., 5., 6.]]],
 
 
-                         [[[ 24.],
-                           [124.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 34.],
-                           [134.]]],
+                         [[4., 5., 6., 7.]]],
 
 
-                         [[[ 25.],
-                           [125.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 35.],
-                           [135.]]]],
+                         [[0., 0., 0., 0.]]]],
 
 
-                        [[[[ 24.],
-                           [124.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 34.],
-                           [134.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[ 25.],
-                           [125.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 35.],
-                           [135.]]],
+                         [[4., 5., 6., 7.]]],
 
 
-                         [[[ 26.],
-                           [126.]],
+                        [[[3., 4., 5., 6.]],
 
-                          [[ 36.],
-                           [136.]]]]]]]), device='cuda:0',
-       size=(2, 3, 6, 6, 2, 1), nnz=4, layout=torch.sparse_bsr)
+                         [[0., 0., 0., 0.]]]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=3, dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], device='cuda:0', dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
+tensor([[[[[[1., 2., 3., 4.]],
 
-            [[ 13.],
-             [113.]]]],
+           [[3., 4., 5., 6.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
-          [[[[  2.],
-             [102.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 12.],
-             [112.]]],
 
+          [[[0., 0., 0., 0.]],
 
-           [[[  3.],
-             [103.]],
+           [[4., 5., 6., 7.]]]],
 
-            [[ 13.],
-             [113.]]],
 
 
-           [[[  4.],
-             [104.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 14.],
-             [114.]]]],
+           [[4., 5., 6., 7.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
-          [[[[  3.],
-             [103.]],
+           [[0., 0., 0., 0.]]],
 
-            [[ 13.],
-             [113.]]],
 
+          [[[3., 4., 5., 6.]],
 
-           [[[  4.],
-             [104.]],
+           [[0., 0., 0., 0.]]]],
 
-            [[ 14.],
-             [114.]]],
 
 
-           [[[  5.],
-             [105.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 15.],
-             [115.]]]],
+           [[2., 3., 4., 5.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-          [[[[  4.],
-             [104.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 14.],
-             [114.]]],
 
+          [[[0., 0., 0., 0.]],
 
-           [[[  5.],
-             [105.]],
+           [[4., 5., 6., 7.]]]]],
 
-            [[ 15.],
-             [115.]]],
 
 
-           [[[  6.],
-             [106.]],
 
-            [[ 16.],
-             [116.]]]]],
+        [[[[[0., 0., 0., 0.]],
 
+           [[2., 3., 4., 5.]]],
 
 
+          [[[1., 2., 3., 4.]],
 
-         [[[[[  5.],
-             [105.]],
+           [[3., 4., 5., 6.]]],
 
-            [[ 15.],
-             [115.]]],
 
+          [[[0., 0., 0., 0.]],
 
-           [[[  6.],
-             [106.]],
+           [[4., 5., 6., 7.]]]],
 
-            [[ 16.],
-             [116.]]],
 
 
-           [[[  7.],
-             [107.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 17.],
-             [117.]]]],
+           [[3., 4., 5., 6.]]],
 
 
+          [[[0., 0., 0., 0.]],
 
-          [[[[  6.],
-             [106.]],
+           [[4., 5., 6., 7.]]],
 
-            [[ 16.],
-             [116.]]],
 
+          [[[2., 3., 4., 5.]],
 
-           [[[  7.],
-             [107.]],
+           [[0., 0., 0., 0.]]]],
 
-            [[ 17.],
-             [117.]]],
 
 
-           [[[  8.],
-             [108.]],
+         [[[[1., 2., 3., 4.]],
 
-            [[ 18.],
-             [118.]]]],
+           [[0., 0., 0., 0.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
-          [[[[  7.],
-             [107.]],
+           [[4., 5., 6., 7.]]],
 
-            [[ 17.],
-             [117.]]],
 
+          [[[3., 4., 5., 6.]],
 
-           [[[  8.],
-             [108.]],
+           [[0., 0., 0., 0.]]]]]], device='cuda:0', dtype=torch.float64)
 
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
+########## torch.float64/torch.int32/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 20.],
-             [120.]]]]],
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-         [[[[[  9.],
-             [109.]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-            [[ 19.],
-             [119.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-           [[[ 10.],
-             [110.]],
 
-            [[ 20.],
-             [120.]]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-           [[[ 11.],
-             [111.]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-            [[ 21.],
-             [121.]]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-          [[[[ 10.],
-             [110.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-            [[ 20.],
-             [120.]]],
 
 
-           [[[ 11.],
-             [111.]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
-            [[ 21.],
-             [121.]]],
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
-           [[[ 12.],
-             [112.]],
 
-            [[ 22.],
-             [122.]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-          [[[[ 11.],
-             [111.]],
 
-            [[ 21.],
-             [121.]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-           [[[ 12.],
-             [112.]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
-            [[ 22.],
-             [122.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-           [[[ 13.],
-             [113.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-            [[ 23.],
-             [123.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
 
-          [[[[ 12.],
-             [112.]],
 
-            [[ 22.],
-             [122.]]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-           [[[ 13.],
-             [113.]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
-            [[ 23.],
-             [123.]]],
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-           [[[ 14.],
-             [114.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-            [[ 24.],
-             [124.]]]]]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-        [[[[[[ 13.],
-             [113.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-            [[ 23.],
-             [123.]]],
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
-           [[[ 14.],
-             [114.]],
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-            [[ 24.],
-             [124.]]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
 
-           [[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-          [[[[ 14.],
-             [114.]],
 
-            [[ 24.],
-             [124.]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-           [[[ 15.],
-             [115.]],
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=7, dtype=torch.float64, layout=torch.sparse_bsr)
+# _crow_indices
+tensor([0, 2, 3, 5, 7], device='cuda:0', dtype=torch.int32)
+# _col_indices
+tensor([0, 1, 0, 0, 1, 0, 1], device='cuda:0', dtype=torch.int32)
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 25.],
-             [125.]]],
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-           [[[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-          [[[[ 15.],
-             [115.]],
 
-            [[ 25.],
-             [125.]]],
 
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-           [[[ 16.],
-             [116.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 26.],
-             [126.]]],
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
 
-           [[[ 17.],
-             [117.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 27.],
-             [127.]]]],
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-          [[[[ 16.],
-             [116.]],
 
-            [[ 26.],
-             [126.]]],
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-           [[[ 17.],
-             [117.]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
-            [[ 27.],
-             [127.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-           [[[ 18.],
-             [118.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 28.],
-             [128.]]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-         [[[[[ 17.],
-             [117.]],
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-            [[ 27.],
-             [127.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-           [[[ 18.],
-             [118.]],
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
-            [[ 28.],
-             [128.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-           [[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]]]],
 
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
-          [[[[ 18.],
-             [118.]],
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-            [[ 28.],
-             [128.]]],
 
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-           [[[ 19.],
-             [119.]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
-            [[ 29.],
-             [129.]]],
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
 
-           [[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-          [[[[ 19.],
-             [119.]],
 
-            [[ 29.],
-             [129.]]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-           [[[ 20.],
-             [120.]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
-            [[ 30.],
-             [130.]]],
 
 
-           [[[ 21.],
-             [121.]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 31.],
-             [131.]]]],
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
 
-          [[[[ 20.],
-             [120.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 30.],
-             [130.]]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], device='cuda:0', dtype=torch.float64)
 
-           [[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]]],
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-           [[[ 22.],
-             [122.]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
-            [[ 32.],
-             [132.]]]]],
+                         [[3., 4., 5., 6.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
-         [[[[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]],
 
-           [[[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-           [[[ 23.],
-             [123.]],
+                         [[4., 5., 6., 7.]]],
 
-            [[ 33.],
-             [133.]]]],
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
-          [[[[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]],
 
-           [[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-           [[[ 24.],
-             [124.]],
+                         [[2., 3., 4., 5.]]],
 
-            [[ 34.],
-             [134.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[3., 4., 5., 6.]]],
 
-          [[[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]]],
 
-           [[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]]],
 
 
-           [[[ 25.],
-             [125.]],
+                      [[[[[0., 0., 0., 0.]],
 
-            [[ 35.],
-             [135.]]]],
+                         [[2., 3., 4., 5.]]],
 
 
+                        [[[1., 2., 3., 4.]],
 
-          [[[[ 24.],
-             [124.]],
+                         [[3., 4., 5., 6.]]],
 
-            [[ 34.],
-             [134.]]],
 
+                        [[[0., 0., 0., 0.]],
 
-           [[[ 25.],
-             [125.]],
+                         [[4., 5., 6., 7.]]]],
 
-            [[ 35.],
-             [135.]]],
 
 
-           [[[ 26.],
-             [126.]],
+                       [[[[1., 2., 3., 4.]],
 
-            [[ 36.],
-             [136.]]]]]]], device='cuda:0')
+                         [[3., 4., 5., 6.]]],
 
 
-########## torch.float64/torch.int32/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.]],
+                        [[[0., 0., 0., 0.]],
 
-                       [[  2., 102.],
-                        [ 12., 112.]],
+                         [[4., 5., 6., 7.]]],
 
-                       [[  3., 103.],
-                        [ 13., 113.]]],
 
+                        [[[2., 3., 4., 5.]],
 
-                      [[[  2., 102.],
-                        [ 12., 112.]],
+                         [[0., 0., 0., 0.]]]],
 
-                       [[  3., 103.],
-                        [ 13., 113.]],
 
-                       [[  4., 104.],
-                        [ 14., 114.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-                      [[[  3., 103.],
-                        [ 13., 113.]],
+                         [[0., 0., 0., 0.]]],
 
-                       [[  4., 104.],
-                        [ 14., 114.]],
 
-                       [[  5., 105.],
-                        [ 15., 115.]]],
+                        [[[2., 3., 4., 5.]],
 
+                         [[4., 5., 6., 7.]]],
 
-                      [[[  4., 104.],
-                        [ 14., 114.]],
 
-                       [[  5., 105.],
-                        [ 15., 115.]],
+                        [[[3., 4., 5., 6.]],
 
-                       [[  6., 106.],
-                        [ 16., 116.]]]]), device='cuda:0', size=(6, 6, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsr)
+                         [[0., 0., 0., 0.]]]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=3, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.]],
-
-         [[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.]],
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-         [[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]]],
-
-
-        [[[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]],
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], device='cuda:0')
+# _col_indices
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-         [[  5., 105.],
-          [ 15., 115.]]],
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], device='cuda:0')
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-        [[[  4., 104.],
-          [ 14., 114.]],
 
-         [[  5., 105.],
-          [ 15., 115.]],
+          [[[2., 3., 4., 5.]],
 
-         [[  6., 106.],
-          [ 16., 116.]]]], device='cuda:0', dtype=torch.float64)
+           [[0., 0., 0., 0.]]],
 
-########## torch.float64/torch.int32/size=()+(4, 9)+(4, 2) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
 
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]],
+          [[[0., 0., 0., 0.]],
 
-                        [[2.1000e+01, 1.0210e+03],
-                         [1.2100e+02, 1.1210e+03],
-                         [2.2100e+02, 1.2210e+03],
-                         [3.2100e+02, 1.3210e+03]]],
+           [[4., 5., 6., 7.]]]],
 
 
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
+         [[[[1., 2., 3., 4.]],
 
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]]],
+           [[4., 5., 6., 7.]]],
 
 
+          [[[2., 3., 4., 5.]],
 
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
+           [[0., 0., 0., 0.]]],
 
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
 
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]],
+          [[[3., 4., 5., 6.]],
 
+           [[0., 0., 0., 0.]]]],
 
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
 
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[2., 3., 4., 5.]]],
 
 
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
+          [[[0., 0., 0., 0.]],
 
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
+           [[3., 4., 5., 6.]]],
 
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]],
 
+          [[[0., 0., 0., 0.]],
 
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
+           [[4., 5., 6., 7.]]]]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
 
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]]],
 
 
+        [[[[[0., 0., 0., 0.]],
 
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
+           [[2., 3., 4., 5.]]],
 
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
 
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]],
+          [[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
 
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]],
+          [[[0., 0., 0., 0.]],
 
-                        [[2.5000e+01, 1.0250e+03],
-                         [1.2500e+02, 1.1250e+03],
-                         [2.2500e+02, 1.2250e+03],
-                         [3.2500e+02, 1.3250e+03]]]]]), device='cuda:0',
-       size=(4, 9, 4, 2), nnz=4, dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
+           [[4., 5., 6., 7.]]]],
 
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]],
 
-          [[2.1000e+01, 1.0210e+03],
-           [1.2100e+02, 1.1210e+03],
-           [2.2100e+02, 1.2210e+03],
-           [3.2100e+02, 1.3210e+03]]],
 
+         [[[[1., 2., 3., 4.]],
 
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+           [[3., 4., 5., 6.]]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
 
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]],
 
 
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
+          [[[2., 3., 4., 5.]],
 
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
+           [[0., 0., 0., 0.]]]],
 
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]],
 
 
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
+         [[[[1., 2., 3., 4.]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
+           [[0., 0., 0., 0.]]],
 
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]]],
 
+          [[[2., 3., 4., 5.]],
 
+           [[4., 5., 6., 7.]]],
 
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
 
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
+          [[[3., 4., 5., 6.]],
 
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]],
+           [[0., 0., 0., 0.]]]]]], device='cuda:0')
 
+########## torch.float32/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
 
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]],
 
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-          [[2.5000e+01, 1.0250e+03],
-           [1.2500e+02, 1.1250e+03],
-           [2.2500e+02, 1.2250e+03],
-           [3.2500e+02, 1.3250e+03]]]]], device='cuda:0', dtype=torch.float64)
 
-########## torch.float64/torch.int32/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                          [[ 11.],
-                           [111.]]],
 
 
-                         [[[  2.],
-                           [102.]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
-                          [[ 12.],
-                           [112.]]],
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
-                         [[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]]]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-                        [[[[  2.],
-                           [102.]],
 
-                          [[ 12.],
-                           [112.]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[  3.],
-                           [103.]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
-                          [[ 13.],
-                           [113.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-                         [[[  4.],
-                           [104.]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
-                          [[ 14.],
-                           [114.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
 
-                        [[[[  3.],
-                           [103.]],
 
-                          [[ 13.],
-                           [113.]]],
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-                         [[[  4.],
-                           [104.]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
-                          [[ 14.],
-                           [114.]]],
 
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-                         [[[  5.],
-                           [105.]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
-                          [[ 15.],
-                           [115.]]]],
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
 
-                        [[[[  4.],
-                           [104.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 14.],
-                           [114.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-                         [[[  5.],
-                           [105.]],
 
-                          [[ 15.],
-                           [115.]]],
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-                         [[[  6.],
-                           [106.]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
-                          [[ 16.],
-                           [116.]]]]],
 
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
-                       [[[[[  5.],
-                           [105.]],
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
-                          [[ 15.],
-                           [115.]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[  6.],
-                           [106.]],
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-                          [[ 16.],
-                           [116.]]],
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=7, layout=torch.sparse_bsr)
+# _crow_indices
+tensor([0, 2, 3, 5, 7], device='cuda:0')
+# _col_indices
+tensor([0, 1, 0, 0, 1, 0, 1], device='cuda:0')
+# _values
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
-                         [[[  7.],
-                           [107.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 17.],
-                           [117.]]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-                        [[[[  6.],
-                           [106.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                          [[ 16.],
-                           [116.]]],
 
 
-                         [[[  7.],
-                           [107.]],
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
-                          [[ 17.],
-                           [117.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
-                         [[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-                        [[[[  7.],
-                           [107.]],
 
-                          [[ 17.],
-                           [117.]]],
 
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-                         [[[  8.],
-                           [108.]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
-                          [[ 18.],
-                           [118.]]],
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
 
-                         [[[  9.],
-                           [109.]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 19.],
-                           [119.]]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-                        [[[[  8.],
-                           [108.]],
 
-                          [[ 18.],
-                           [118.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-                         [[[  9.],
-                           [109.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 19.],
-                           [119.]]],
 
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
-                         [[[ 10.],
-                           [110.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                          [[ 20.],
-                           [120.]]]]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
 
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-                       [[[[[  9.],
-                           [109.]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
-                          [[ 19.],
-                           [119.]]],
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
 
-                         [[[ 10.],
-                           [110.]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
-                          [[ 20.],
-                           [120.]]],
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
-                         [[[ 11.],
-                           [111.]],
 
-                          [[ 21.],
-                           [121.]]]],
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-                        [[[[ 10.],
-                           [110.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-                          [[ 20.],
-                           [120.]]],
 
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
-                         [[[ 11.],
-                           [111.]],
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
-                          [[ 21.],
-                           [121.]]],
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
 
-                         [[[ 12.],
-                           [112.]],
 
-                          [[ 22.],
-                           [122.]]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-                        [[[[ 11.],
-                           [111.]],
 
-                          [[ 21.],
-                           [121.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-                         [[[ 12.],
-                           [112.]],
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], device='cuda:0')
 
-                          [[ 22.],
-                           [122.]]],
 
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(crow_indices=tensor([[[0, 3],
+                             [0, 3],
+                             [0, 3]],
 
-                         [[[ 13.],
-                           [113.]],
+                            [[0, 3],
+                             [0, 3],
+                             [0, 3]]]),
+       col_indices=tensor([[[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]],
 
-                          [[ 23.],
-                           [123.]]]],
+                           [[0, 1, 2],
+                            [0, 1, 2],
+                            [0, 1, 2]]]),
+       values=tensor([[[[[[1., 2., 3., 4.]],
 
+                         [[3., 4., 5., 6.]]],
 
 
-                        [[[[ 12.],
-                           [112.]],
+                        [[[2., 3., 4., 5.]],
 
-                          [[ 22.],
-                           [122.]]],
+                         [[0., 0., 0., 0.]]],
 
 
-                         [[[ 13.],
-                           [113.]],
+                        [[[0., 0., 0., 0.]],
 
-                          [[ 23.],
-                           [123.]]],
+                         [[4., 5., 6., 7.]]]],
 
 
-                         [[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]]]]],
+                       [[[[1., 2., 3., 4.]],
 
+                         [[4., 5., 6., 7.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[0., 0., 0., 0.]]],
 
-                      [[[[[[ 13.],
-                           [113.]],
 
-                          [[ 23.],
-                           [123.]]],
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]],
 
-                         [[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]],
 
+                       [[[[1., 2., 3., 4.]],
 
-                         [[[ 15.],
-                           [115.]],
+                         [[2., 3., 4., 5.]]],
 
-                          [[ 25.],
-                           [125.]]]],
 
+                        [[[0., 0., 0., 0.]],
 
+                         [[3., 4., 5., 6.]]],
 
-                        [[[[ 14.],
-                           [114.]],
 
-                          [[ 24.],
-                           [124.]]],
+                        [[[0., 0., 0., 0.]],
 
+                         [[4., 5., 6., 7.]]]]],
 
-                         [[[ 15.],
-                           [115.]],
 
-                          [[ 25.],
-                           [125.]]],
 
 
-                         [[[ 16.],
-                           [116.]],
+                      [[[[[0., 0., 0., 0.]],
 
-                          [[ 26.],
-                           [126.]]]],
+                         [[2., 3., 4., 5.]]],
 
 
+                        [[[1., 2., 3., 4.]],
 
-                        [[[[ 15.],
-                           [115.]],
+                         [[3., 4., 5., 6.]]],
 
-                          [[ 25.],
-                           [125.]]],
 
+                        [[[0., 0., 0., 0.]],
 
-                         [[[ 16.],
-                           [116.]],
+                         [[4., 5., 6., 7.]]]],
 
-                          [[ 26.],
-                           [126.]]],
 
 
-                         [[[ 17.],
-                           [117.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 27.],
-                           [127.]]]],
+                         [[3., 4., 5., 6.]]],
 
 
+                        [[[0., 0., 0., 0.]],
 
-                        [[[[ 16.],
-                           [116.]],
+                         [[4., 5., 6., 7.]]],
 
-                          [[ 26.],
-                           [126.]]],
 
+                        [[[2., 3., 4., 5.]],
 
-                         [[[ 17.],
-                           [117.]],
+                         [[0., 0., 0., 0.]]]],
 
-                          [[ 27.],
-                           [127.]]],
 
 
-                         [[[ 18.],
-                           [118.]],
+                       [[[[1., 2., 3., 4.]],
 
-                          [[ 28.],
-                           [128.]]]]],
+                         [[0., 0., 0., 0.]]],
 
 
+                        [[[2., 3., 4., 5.]],
 
+                         [[4., 5., 6., 7.]]],
 
-                       [[[[[ 17.],
-                           [117.]],
 
-                          [[ 27.],
-                           [127.]]],
+                        [[[3., 4., 5., 6.]],
 
+                         [[0., 0., 0., 0.]]]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=3, dtype=torch.float64, layout=torch.sparse_bsr)
+# _crow_indices
+tensor([[[0, 3],
+         [0, 3],
+         [0, 3]],
 
-                         [[[ 18.],
-                           [118.]],
+        [[0, 3],
+         [0, 3],
+         [0, 3]]], device='cuda:0')
+# _col_indices
+tensor([[[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]],
 
-                          [[ 28.],
-                           [128.]]],
+        [[0, 1, 2],
+         [0, 1, 2],
+         [0, 1, 2]]], device='cuda:0')
+# _values
+tensor([[[[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                         [[[ 19.],
-                           [119.]],
 
-                          [[ 29.],
-                           [129.]]]],
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
 
-                        [[[[ 18.],
-                           [118.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 28.],
-                           [128.]]],
+           [[4., 5., 6., 7.]]]],
 
 
-                         [[[ 19.],
-                           [119.]],
 
-                          [[ 29.],
-                           [129.]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[4., 5., 6., 7.]]],
 
-                         [[[ 20.],
-                           [120.]],
 
-                          [[ 30.],
-                           [130.]]]],
+          [[[2., 3., 4., 5.]],
 
+           [[0., 0., 0., 0.]]],
 
 
-                        [[[[ 19.],
-                           [119.]],
+          [[[3., 4., 5., 6.]],
 
-                          [[ 29.],
-                           [129.]]],
+           [[0., 0., 0., 0.]]]],
 
 
-                         [[[ 20.],
-                           [120.]],
 
-                          [[ 30.],
-                           [130.]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[2., 3., 4., 5.]]],
 
-                         [[[ 21.],
-                           [121.]],
 
-                          [[ 31.],
-                           [131.]]]],
+          [[[0., 0., 0., 0.]],
 
+           [[3., 4., 5., 6.]]],
 
 
-                        [[[[ 20.],
-                           [120.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 30.],
-                           [130.]]],
+           [[4., 5., 6., 7.]]]]],
 
 
-                         [[[ 21.],
-                           [121.]],
 
-                          [[ 31.],
-                           [131.]]],
 
+        [[[[[0., 0., 0., 0.]],
 
-                         [[[ 22.],
-                           [122.]],
+           [[2., 3., 4., 5.]]],
 
-                          [[ 32.],
-                           [132.]]]]],
 
+          [[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
 
-                       [[[[[ 21.],
-                           [121.]],
+          [[[0., 0., 0., 0.]],
 
-                          [[ 31.],
-                           [131.]]],
+           [[4., 5., 6., 7.]]]],
 
 
-                         [[[ 22.],
-                           [122.]],
 
-                          [[ 32.],
-                           [132.]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[3., 4., 5., 6.]]],
 
-                         [[[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]]]],
+          [[[0., 0., 0., 0.]],
 
+           [[4., 5., 6., 7.]]],
 
 
-                        [[[[ 22.],
-                           [122.]],
+          [[[2., 3., 4., 5.]],
 
-                          [[ 32.],
-                           [132.]]],
+           [[0., 0., 0., 0.]]]],
 
 
-                         [[[ 23.],
-                           [123.]],
 
-                          [[ 33.],
-                           [133.]]],
+         [[[[1., 2., 3., 4.]],
 
+           [[0., 0., 0., 0.]]],
 
-                         [[[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]]]],
+          [[[2., 3., 4., 5.]],
 
+           [[4., 5., 6., 7.]]],
 
 
-                        [[[[ 23.],
-                           [123.]],
+          [[[3., 4., 5., 6.]],
 
-                          [[ 33.],
-                           [133.]]],
+           [[0., 0., 0., 0.]]]]]], device='cuda:0', dtype=torch.float64)
 
+########## torch.float64/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([0, 2, 3, 5, 7]),
+       col_indices=tensor([0, 1, 0, 0, 1, 0, 1]),
+       values=tensor([[[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 24.],
-                           [124.]],
+                        [[ 1., 11.],
+                         [ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.]],
 
-                          [[ 34.],
-                           [134.]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
 
-                         [[[ 25.],
-                           [125.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                          [[ 35.],
-                           [135.]]]],
+                        [[ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-                        [[[[ 24.],
-                           [124.]],
 
-                          [[ 34.],
-                           [134.]]],
+                      [[[[ 2., 12.],
+                         [ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 25.],
-                           [125.]],
+                        [[ 3., 13.],
+                         [ 4., 14.],
+                         [ 5., 15.],
+                         [ 6., 16.]]],
 
-                          [[ 35.],
-                           [135.]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-                         [[[ 26.],
-                           [126.]],
+                        [[ 5., 15.],
+                         [ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.]],
 
-                          [[ 36.],
-                           [136.]]]]]]]), device='cuda:0',
-       size=(2, 3, 6, 6, 2, 1), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsr)
-# _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
 
-            [[ 11.],
-             [111.]]],
+                      [[[[ 6., 16.],
+                         [ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.]],
 
+                        [[ 7., 17.],
+                         [ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.]],
 
-           [[[  2.],
-             [102.]],
+                        [[ 8., 18.],
+                         [ 9., 19.],
+                         [10., 20.],
+                         [11., 21.]]],
 
-            [[ 12.],
-             [112.]]],
 
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-           [[[  3.],
-             [103.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 13.],
-             [113.]]]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]]],
 
 
-          [[[[  2.],
-             [102.]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 12.],
-             [112.]]],
+                        [[ 9., 19.],
+                         [10., 20.],
+                         [11., 21.],
+                         [12., 22.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-           [[[  3.],
-             [103.]],
 
-            [[ 13.],
-             [113.]]],
+                       [[[13., 23.],
+                         [14., 24.],
+                         [15., 25.],
+                         [16., 26.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-           [[[  4.],
-             [104.]],
+                        [[14., 24.],
+                         [15., 25.],
+                         [16., 26.],
+                         [17., 27.]]]],
 
-            [[ 14.],
-             [114.]]]],
 
 
+                      [[[[10., 20.],
+                         [11., 21.],
+                         [12., 22.],
+                         [13., 23.]],
 
-          [[[[  3.],
-             [103.]],
+                        [[11., 21.],
+                         [12., 22.],
+                         [13., 23.],
+                         [14., 24.]],
 
-            [[ 13.],
-             [113.]]],
+                        [[12., 22.],
+                         [13., 23.],
+                         [14., 24.],
+                         [15., 25.]]],
 
 
-           [[[  4.],
-             [104.]],
+                       [[[15., 25.],
+                         [16., 26.],
+                         [17., 27.],
+                         [18., 28.]],
 
-            [[ 14.],
-             [114.]]],
+                        [[16., 26.],
+                         [17., 27.],
+                         [18., 28.],
+                         [19., 29.]],
 
+                        [[17., 27.],
+                         [18., 28.],
+                         [19., 29.],
+                         [20., 30.]]]],
 
-           [[[  5.],
-             [105.]],
 
-            [[ 15.],
-             [115.]]]],
 
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-          [[[[  4.],
-             [104.]],
+                        [[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]]],
 
-            [[ 14.],
-             [114.]]],
 
+                       [[[20., 30.],
+                         [21., 31.],
+                         [22., 32.],
+                         [23., 33.]],
 
-           [[[  5.],
-             [105.]],
+                        [[21., 31.],
+                         [22., 32.],
+                         [23., 33.],
+                         [24., 34.]],
 
-            [[ 15.],
-             [115.]]],
+                        [[22., 32.],
+                         [23., 33.],
+                         [24., 34.],
+                         [25., 35.]]]],
 
 
-           [[[  6.],
-             [106.]],
 
-            [[ 16.],
-             [116.]]]]],
+                      [[[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
+                        [[18., 28.],
+                         [19., 29.],
+                         [20., 30.],
+                         [21., 31.]],
 
+                        [[19., 29.],
+                         [20., 30.],
+                         [21., 31.],
+                         [22., 32.]]],
 
 
-         [[[[[  5.],
-             [105.]],
+                       [[[ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.],
+                         [ 0.,  0.]],
 
-            [[ 15.],
-             [115.]]],
+                        [[23., 33.],
+                         [24., 34.],
+                         [25., 35.],
+                         [26., 36.]],
 
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]]],
-
-
-
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]]]],
-
-
-
-
-         [[[[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]]],
-
-
-
-          [[[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]]]],
-
-
-
-
-
-        [[[[[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]]],
-
-
-
-          [[[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]]],
-
-
-
-          [[[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]]],
-
-
-
-
-         [[[[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]]],
-
-
-
-          [[[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]]],
-
-
-
-          [[[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]]],
-
-
-
-          [[[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]]]],
-
-
-
-
-         [[[[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]]],
-
-
-
-          [[[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]]],
-
-
-
-          [[[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]]],
-
-
-
-          [[[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]],
-
-
-           [[[ 26.],
-             [126.]],
-
-            [[ 36.],
-             [136.]]]]]]], device='cuda:0', dtype=torch.float64)
-
-
-########## torch.float32/torch.int64/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.]]],
-
-
-                      [[[  3., 103.],
-                        [ 13., 113.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]],
-
-                       [[  6., 106.],
-                        [ 16., 116.]]]]), device='cuda:0', size=(6, 6, 2),
-       nnz=4, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.]],
-
-         [[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]]],
-
-
-        [[[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]],
-
-         [[  5., 105.],
-          [ 15., 115.]]],
-
-
-        [[[  4., 104.],
-          [ 14., 114.]],
-
-         [[  5., 105.],
-          [ 15., 115.]],
-
-         [[  6., 106.],
-          [ 16., 116.]]]], device='cuda:0')
-
-########## torch.float32/torch.int64/size=()+(4, 9)+(4, 2) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]],
-
-                        [[2.1000e+01, 1.0210e+03],
-                         [1.2100e+02, 1.1210e+03],
-                         [2.2100e+02, 1.2210e+03],
-                         [3.2100e+02, 1.3210e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
-
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]]],
-
-
-
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
-
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
-
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]]],
-
-
-
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
-
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
-
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]]],
-
-
-
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
-
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]],
-
-                        [[2.5000e+01, 1.0250e+03],
-                         [1.2500e+02, 1.1250e+03],
-                         [2.2500e+02, 1.2250e+03],
-                         [3.2500e+02, 1.3250e+03]]]]]), device='cuda:0',
-       size=(4, 9, 4, 2), nnz=4, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]],
-
-          [[2.1000e+01, 1.0210e+03],
-           [1.2100e+02, 1.1210e+03],
-           [2.2100e+02, 1.2210e+03],
-           [3.2100e+02, 1.3210e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
-
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
-
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
-
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]]],
-
-
-
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
-
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
-
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]]],
-
-
-
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
-
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]],
-
-          [[2.5000e+01, 1.0250e+03],
-           [1.2500e+02, 1.1250e+03],
-           [2.2500e+02, 1.2250e+03],
-           [3.2500e+02, 1.3250e+03]]]]], device='cuda:0')
-
-########## torch.float32/torch.int64/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
-
-                          [[ 11.],
-                           [111.]]],
-
-
-                         [[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]]],
-
-
-
-                        [[[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]]],
-
-
-
-                        [[[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]]],
-
-
-
-                        [[[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]]]],
-
-
-
-
-                       [[[[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]]],
-
-
-
-                        [[[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]]],
-
-
-
-                        [[[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]]],
-
-
-
-                        [[[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]]]],
-
-
-
-
-                       [[[[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]]],
-
-
-
-                        [[[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]]],
-
-
-
-                        [[[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]]],
-
-
-
-                        [[[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]]]]],
-
-
-
-
-
-                      [[[[[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]]],
-
-
-
-                        [[[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]]],
-
-
-
-                        [[[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]]],
-
-
-
-                        [[[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]]]],
-
-
-
-
-                       [[[[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]]],
-
-
-
-                        [[[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]]],
-
-
-
-                        [[[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]]],
-
-
-
-                        [[[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]]]],
-
-
-
-
-                       [[[[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]]],
-
-
-
-                        [[[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]]],
-
-
-
-                        [[[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]]],
-
-
-
-                        [[[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]],
-
-
-                         [[[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]]]]]]), device='cuda:0',
-       size=(2, 3, 6, 6, 2, 1), nnz=4, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
-# _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
-# _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]]],
-
-
-
-          [[[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]]],
-
-
-
-          [[[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]]],
-
-
-
-          [[[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]]]],
-
-
-
-
-         [[[[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]]],
-
-
-
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]]]],
-
-
-
-
-         [[[[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]]],
-
-
-
-          [[[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]]]],
-
-
-
-
-
-        [[[[[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]]],
-
-
-
-          [[[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]]],
-
-
-
-          [[[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]]],
-
-
-
-
-         [[[[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]]],
-
-
-
-          [[[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]]],
-
-
-
-          [[[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]]],
-
-
-
-          [[[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]],
-
-
-           [[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]]]],
-
-
-
-
-         [[[[[ 21.],
-             [121.]],
-
-            [[ 31.],
-             [131.]]],
-
-
-           [[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]]],
-
-
-
-          [[[[ 22.],
-             [122.]],
-
-            [[ 32.],
-             [132.]]],
-
-
-           [[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]]],
-
-
-
-          [[[[ 23.],
-             [123.]],
-
-            [[ 33.],
-             [133.]]],
-
-
-           [[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]]],
-
-
-
-          [[[[ 24.],
-             [124.]],
-
-            [[ 34.],
-             [134.]]],
-
-
-           [[[ 25.],
-             [125.]],
-
-            [[ 35.],
-             [135.]]],
-
-
-           [[[ 26.],
-             [126.]],
-
-            [[ 36.],
-             [136.]]]]]]], device='cuda:0')
-
-
-########## torch.float64/torch.int64/size=()+(6, 6)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[  1., 101.],
-                        [ 11., 111.]],
-
-                       [[  2., 102.],
-                        [ 12., 112.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.]]],
-
-
-                      [[[  2., 102.],
-                        [ 12., 112.]],
-
-                       [[  3., 103.],
-                        [ 13., 113.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.]]],
-
-
-                      [[[  3., 103.],
-                        [ 13., 113.]],
-
-                       [[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]]],
-
-
-                      [[[  4., 104.],
-                        [ 14., 114.]],
-
-                       [[  5., 105.],
-                        [ 15., 115.]],
-
-                       [[  6., 106.],
-                        [ 16., 116.]]]]), device='cuda:0', size=(6, 6, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[[  1., 101.],
-          [ 11., 111.]],
-
-         [[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]]],
-
-
-        [[[  2., 102.],
-          [ 12., 112.]],
-
-         [[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]]],
-
-
-        [[[  3., 103.],
-          [ 13., 113.]],
-
-         [[  4., 104.],
-          [ 14., 114.]],
-
-         [[  5., 105.],
-          [ 15., 115.]]],
-
-
-        [[[  4., 104.],
-          [ 14., 114.]],
-
-         [[  5., 105.],
-          [ 15., 115.]],
-
-         [[  6., 106.],
-          [ 16., 116.]]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(4, 9)+(4, 2) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[[[1.0000e+00, 1.0010e+03],
-                         [1.0100e+02, 1.1010e+03],
-                         [2.0100e+02, 1.2010e+03],
-                         [3.0100e+02, 1.3010e+03]],
-
-                        [[1.1000e+01, 1.0110e+03],
-                         [1.1100e+02, 1.1110e+03],
-                         [2.1100e+02, 1.2110e+03],
-                         [3.1100e+02, 1.3110e+03]],
-
-                        [[2.1000e+01, 1.0210e+03],
-                         [1.2100e+02, 1.1210e+03],
-                         [2.2100e+02, 1.2210e+03],
-                         [3.2100e+02, 1.3210e+03]]],
-
-
-                       [[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
-
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]]],
-
-
-
-                      [[[[2.0000e+00, 1.0020e+03],
-                         [1.0200e+02, 1.1020e+03],
-                         [2.0200e+02, 1.2020e+03],
-                         [3.0200e+02, 1.3020e+03]],
-
-                        [[1.2000e+01, 1.0120e+03],
-                         [1.1200e+02, 1.1120e+03],
-                         [2.1200e+02, 1.2120e+03],
-                         [3.1200e+02, 1.3120e+03]],
-
-                        [[2.2000e+01, 1.0220e+03],
-                         [1.2200e+02, 1.1220e+03],
-                         [2.2200e+02, 1.2220e+03],
-                         [3.2200e+02, 1.3220e+03]]],
-
-
-                       [[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
-
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]]],
-
-
-
-                      [[[[3.0000e+00, 1.0030e+03],
-                         [1.0300e+02, 1.1030e+03],
-                         [2.0300e+02, 1.2030e+03],
-                         [3.0300e+02, 1.3030e+03]],
-
-                        [[1.3000e+01, 1.0130e+03],
-                         [1.1300e+02, 1.1130e+03],
-                         [2.1300e+02, 1.2130e+03],
-                         [3.1300e+02, 1.3130e+03]],
-
-                        [[2.3000e+01, 1.0230e+03],
-                         [1.2300e+02, 1.1230e+03],
-                         [2.2300e+02, 1.2230e+03],
-                         [3.2300e+02, 1.3230e+03]]],
-
-
-                       [[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
-
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]]],
-
-
-
-                      [[[[4.0000e+00, 1.0040e+03],
-                         [1.0400e+02, 1.1040e+03],
-                         [2.0400e+02, 1.2040e+03],
-                         [3.0400e+02, 1.3040e+03]],
-
-                        [[1.4000e+01, 1.0140e+03],
-                         [1.1400e+02, 1.1140e+03],
-                         [2.1400e+02, 1.2140e+03],
-                         [3.1400e+02, 1.3140e+03]],
-
-                        [[2.4000e+01, 1.0240e+03],
-                         [1.2400e+02, 1.1240e+03],
-                         [2.2400e+02, 1.2240e+03],
-                         [3.2400e+02, 1.3240e+03]]],
-
-
-                       [[[5.0000e+00, 1.0050e+03],
-                         [1.0500e+02, 1.1050e+03],
-                         [2.0500e+02, 1.2050e+03],
-                         [3.0500e+02, 1.3050e+03]],
-
-                        [[1.5000e+01, 1.0150e+03],
-                         [1.1500e+02, 1.1150e+03],
-                         [2.1500e+02, 1.2150e+03],
-                         [3.1500e+02, 1.3150e+03]],
-
-                        [[2.5000e+01, 1.0250e+03],
-                         [1.2500e+02, 1.1250e+03],
-                         [2.2500e+02, 1.2250e+03],
-                         [3.2500e+02, 1.3250e+03]]]]]), device='cuda:0',
-       size=(4, 9, 4, 2), nnz=4, dtype=torch.float64, layout=torch.sparse_bsr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[[[[1.0000e+00, 1.0010e+03],
-           [1.0100e+02, 1.1010e+03],
-           [2.0100e+02, 1.2010e+03],
-           [3.0100e+02, 1.3010e+03]],
-
-          [[1.1000e+01, 1.0110e+03],
-           [1.1100e+02, 1.1110e+03],
-           [2.1100e+02, 1.2110e+03],
-           [3.1100e+02, 1.3110e+03]],
-
-          [[2.1000e+01, 1.0210e+03],
-           [1.2100e+02, 1.1210e+03],
-           [2.2100e+02, 1.2210e+03],
-           [3.2100e+02, 1.3210e+03]]],
-
-
-         [[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
-
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]]],
-
-
-
-        [[[[2.0000e+00, 1.0020e+03],
-           [1.0200e+02, 1.1020e+03],
-           [2.0200e+02, 1.2020e+03],
-           [3.0200e+02, 1.3020e+03]],
-
-          [[1.2000e+01, 1.0120e+03],
-           [1.1200e+02, 1.1120e+03],
-           [2.1200e+02, 1.2120e+03],
-           [3.1200e+02, 1.3120e+03]],
-
-          [[2.2000e+01, 1.0220e+03],
-           [1.2200e+02, 1.1220e+03],
-           [2.2200e+02, 1.2220e+03],
-           [3.2200e+02, 1.3220e+03]]],
-
-
-         [[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
-
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]]],
-
-
-
-        [[[[3.0000e+00, 1.0030e+03],
-           [1.0300e+02, 1.1030e+03],
-           [2.0300e+02, 1.2030e+03],
-           [3.0300e+02, 1.3030e+03]],
-
-          [[1.3000e+01, 1.0130e+03],
-           [1.1300e+02, 1.1130e+03],
-           [2.1300e+02, 1.2130e+03],
-           [3.1300e+02, 1.3130e+03]],
-
-          [[2.3000e+01, 1.0230e+03],
-           [1.2300e+02, 1.1230e+03],
-           [2.2300e+02, 1.2230e+03],
-           [3.2300e+02, 1.3230e+03]]],
-
-
-         [[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
-
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]]],
-
-
-
-        [[[[4.0000e+00, 1.0040e+03],
-           [1.0400e+02, 1.1040e+03],
-           [2.0400e+02, 1.2040e+03],
-           [3.0400e+02, 1.3040e+03]],
-
-          [[1.4000e+01, 1.0140e+03],
-           [1.1400e+02, 1.1140e+03],
-           [2.1400e+02, 1.2140e+03],
-           [3.1400e+02, 1.3140e+03]],
-
-          [[2.4000e+01, 1.0240e+03],
-           [1.2400e+02, 1.1240e+03],
-           [2.2400e+02, 1.2240e+03],
-           [3.2400e+02, 1.3240e+03]]],
-
-
-         [[[5.0000e+00, 1.0050e+03],
-           [1.0500e+02, 1.1050e+03],
-           [2.0500e+02, 1.2050e+03],
-           [3.0500e+02, 1.3050e+03]],
-
-          [[1.5000e+01, 1.0150e+03],
-           [1.1500e+02, 1.1150e+03],
-           [2.1500e+02, 1.2150e+03],
-           [3.1500e+02, 1.3150e+03]],
-
-          [[2.5000e+01, 1.0250e+03],
-           [1.2500e+02, 1.1250e+03],
-           [2.2500e+02, 1.2250e+03],
-           [3.2500e+02, 1.3250e+03]]]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2, 3)+(6, 6)+(2, 1) ##########
-# sparse tensor
-tensor(crow_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[[[  1.],
-                           [101.]],
-
-                          [[ 11.],
-                           [111.]]],
-
-
-                         [[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]]],
-
-
-
-                        [[[[  2.],
-                           [102.]],
-
-                          [[ 12.],
-                           [112.]]],
-
-
-                         [[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]]],
-
-
-
-                        [[[[  3.],
-                           [103.]],
-
-                          [[ 13.],
-                           [113.]]],
-
-
-                         [[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]]],
-
-
-
-                        [[[[  4.],
-                           [104.]],
-
-                          [[ 14.],
-                           [114.]]],
-
-
-                         [[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]]]],
-
-
-
-
-                       [[[[[  5.],
-                           [105.]],
-
-                          [[ 15.],
-                           [115.]]],
-
-
-                         [[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]]],
-
-
-
-                        [[[[  6.],
-                           [106.]],
-
-                          [[ 16.],
-                           [116.]]],
-
-
-                         [[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]]],
-
-
-
-                        [[[[  7.],
-                           [107.]],
-
-                          [[ 17.],
-                           [117.]]],
-
-
-                         [[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]]],
-
-
-
-                        [[[[  8.],
-                           [108.]],
-
-                          [[ 18.],
-                           [118.]]],
-
-
-                         [[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]]]],
-
-
-
-
-                       [[[[[  9.],
-                           [109.]],
-
-                          [[ 19.],
-                           [119.]]],
-
-
-                         [[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]]],
-
-
-
-                        [[[[ 10.],
-                           [110.]],
-
-                          [[ 20.],
-                           [120.]]],
-
-
-                         [[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]]],
-
-
-
-                        [[[[ 11.],
-                           [111.]],
-
-                          [[ 21.],
-                           [121.]]],
-
-
-                         [[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]]],
-
-
-
-                        [[[[ 12.],
-                           [112.]],
-
-                          [[ 22.],
-                           [122.]]],
-
-
-                         [[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]]]]],
-
-
-
-
-
-                      [[[[[[ 13.],
-                           [113.]],
-
-                          [[ 23.],
-                           [123.]]],
-
-
-                         [[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]]],
-
-
-
-                        [[[[ 14.],
-                           [114.]],
-
-                          [[ 24.],
-                           [124.]]],
-
-
-                         [[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]]],
-
-
-
-                        [[[[ 15.],
-                           [115.]],
-
-                          [[ 25.],
-                           [125.]]],
-
-
-                         [[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]]],
-
-
-
-                        [[[[ 16.],
-                           [116.]],
-
-                          [[ 26.],
-                           [126.]]],
-
-
-                         [[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]]]],
-
-
-
-
-                       [[[[[ 17.],
-                           [117.]],
-
-                          [[ 27.],
-                           [127.]]],
-
-
-                         [[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]]],
-
-
-
-                        [[[[ 18.],
-                           [118.]],
-
-                          [[ 28.],
-                           [128.]]],
-
-
-                         [[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]]],
-
-
-
-                        [[[[ 19.],
-                           [119.]],
-
-                          [[ 29.],
-                           [129.]]],
-
-
-                         [[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]]],
-
-
-
-                        [[[[ 20.],
-                           [120.]],
-
-                          [[ 30.],
-                           [130.]]],
-
-
-                         [[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]]]],
-
-
-
-
-                       [[[[[ 21.],
-                           [121.]],
-
-                          [[ 31.],
-                           [131.]]],
-
-
-                         [[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]]],
-
-
-
-                        [[[[ 22.],
-                           [122.]],
-
-                          [[ 32.],
-                           [132.]]],
-
-
-                         [[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]]],
-
-
-
-                        [[[[ 23.],
-                           [123.]],
-
-                          [[ 33.],
-                           [133.]]],
-
-
-                         [[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]]],
-
-
-
-                        [[[[ 24.],
-                           [124.]],
-
-                          [[ 34.],
-                           [134.]]],
-
-
-                         [[[ 25.],
-                           [125.]],
-
-                          [[ 35.],
-                           [135.]]],
-
-
-                         [[[ 26.],
-                           [126.]],
-
-                          [[ 36.],
-                           [136.]]]]]]]), device='cuda:0',
-       size=(2, 3, 6, 6, 2, 1), nnz=4, dtype=torch.float64,
-       layout=torch.sparse_bsr)
+                        [[24., 34.],
+                         [25., 35.],
+                         [26., 36.],
+                         [27., 37.]]]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=7, dtype=torch.float64, layout=torch.sparse_bsr)
 # _crow_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
+tensor([0, 2, 3, 5, 7], device='cuda:0')
 # _col_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
+tensor([0, 1, 0, 0, 1, 0, 1], device='cuda:0')
 # _values
-tensor([[[[[[[  1.],
-             [101.]],
-
-            [[ 11.],
-             [111.]]],
-
-
-           [[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]]],
-
-
-
-          [[[[  2.],
-             [102.]],
-
-            [[ 12.],
-             [112.]]],
-
-
-           [[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]]],
-
-
-
-          [[[[  3.],
-             [103.]],
-
-            [[ 13.],
-             [113.]]],
-
-
-           [[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]]],
-
-
-
-          [[[[  4.],
-             [104.]],
-
-            [[ 14.],
-             [114.]]],
-
-
-           [[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]]]],
-
-
-
-
-         [[[[[  5.],
-             [105.]],
-
-            [[ 15.],
-             [115.]]],
-
-
-           [[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]]],
-
-
-
-          [[[[  6.],
-             [106.]],
-
-            [[ 16.],
-             [116.]]],
-
-
-           [[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]]],
-
-
-
-          [[[[  7.],
-             [107.]],
-
-            [[ 17.],
-             [117.]]],
-
-
-           [[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]]],
-
-
-
-          [[[[  8.],
-             [108.]],
-
-            [[ 18.],
-             [118.]]],
-
-
-           [[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]]]],
-
-
-
-
-         [[[[[  9.],
-             [109.]],
-
-            [[ 19.],
-             [119.]]],
-
-
-           [[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]]],
-
-
-
-          [[[[ 10.],
-             [110.]],
-
-            [[ 20.],
-             [120.]]],
-
-
-           [[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]]],
-
-
-
-          [[[[ 11.],
-             [111.]],
-
-            [[ 21.],
-             [121.]]],
-
-
-           [[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]]],
-
-
-
-          [[[[ 12.],
-             [112.]],
-
-            [[ 22.],
-             [122.]]],
-
-
-           [[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]]]]],
-
-
-
-
-
-        [[[[[[ 13.],
-             [113.]],
-
-            [[ 23.],
-             [123.]]],
-
-
-           [[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]]],
-
-
-
-          [[[[ 14.],
-             [114.]],
-
-            [[ 24.],
-             [124.]]],
-
-
-           [[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]]],
-
-
-
-          [[[[ 15.],
-             [115.]],
-
-            [[ 25.],
-             [125.]]],
-
-
-           [[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]]],
-
-
-
-          [[[[ 16.],
-             [116.]],
-
-            [[ 26.],
-             [126.]]],
-
-
-           [[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]]]],
-
-
-
-
-         [[[[[ 17.],
-             [117.]],
-
-            [[ 27.],
-             [127.]]],
-
-
-           [[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]]],
-
-
-
-          [[[[ 18.],
-             [118.]],
-
-            [[ 28.],
-             [128.]]],
-
-
-           [[[ 19.],
-             [119.]],
-
-            [[ 29.],
-             [129.]]],
-
-
-           [[[ 20.],
-             [120.]],
-
-            [[ 30.],
-             [130.]]]],
-
+tensor([[[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 1., 11.],
+           [ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.]],
 
-          [[[[ 19.],
-             [119.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-            [[ 29.],
-             [129.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-           [[[ 20.],
-             [120.]],
+          [[ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.]],
 
-            [[ 30.],
-             [130.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-           [[[ 21.],
-             [121.]],
 
-            [[ 31.],
-             [131.]]]],
+        [[[[ 2., 12.],
+           [ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 3., 13.],
+           [ 4., 14.],
+           [ 5., 15.],
+           [ 6., 16.]]],
 
-          [[[[ 20.],
-             [120.]],
 
-            [[ 30.],
-             [130.]]],
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 5., 15.],
+           [ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.]],
 
-           [[[ 21.],
-             [121.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
-            [[ 31.],
-             [131.]]],
 
 
-           [[[ 22.],
-             [122.]],
+        [[[[ 6., 16.],
+           [ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.]],
 
-            [[ 32.],
-             [132.]]]]],
+          [[ 7., 17.],
+           [ 8., 18.],
+           [ 9., 19.],
+           [10., 20.]],
 
+          [[ 8., 18.],
+           [ 9., 19.],
+           [10., 20.],
+           [11., 21.]]],
 
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-         [[[[[ 21.],
-             [121.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 31.],
-             [131.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]]],
 
 
-           [[[ 22.],
-             [122.]],
 
-            [[ 32.],
-             [132.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[ 9., 19.],
+           [10., 20.],
+           [11., 21.],
+           [12., 22.]],
 
-           [[[ 23.],
-             [123.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
-            [[ 33.],
-             [133.]]]],
 
+         [[[13., 23.],
+           [14., 24.],
+           [15., 25.],
+           [16., 26.]],
 
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-          [[[[ 22.],
-             [122.]],
+          [[14., 24.],
+           [15., 25.],
+           [16., 26.],
+           [17., 27.]]]],
 
-            [[ 32.],
-             [132.]]],
 
 
-           [[[ 23.],
-             [123.]],
+        [[[[10., 20.],
+           [11., 21.],
+           [12., 22.],
+           [13., 23.]],
 
-            [[ 33.],
-             [133.]]],
+          [[11., 21.],
+           [12., 22.],
+           [13., 23.],
+           [14., 24.]],
 
+          [[12., 22.],
+           [13., 23.],
+           [14., 24.],
+           [15., 25.]]],
 
-           [[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]]]],
+         [[[15., 25.],
+           [16., 26.],
+           [17., 27.],
+           [18., 28.]],
 
+          [[16., 26.],
+           [17., 27.],
+           [18., 28.],
+           [19., 29.]],
 
+          [[17., 27.],
+           [18., 28.],
+           [19., 29.],
+           [20., 30.]]]],
 
-          [[[[ 23.],
-             [123.]],
 
-            [[ 33.],
-             [133.]]],
 
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-           [[[ 24.],
-             [124.]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-            [[ 34.],
-             [134.]]],
+          [[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]]],
 
 
-           [[[ 25.],
-             [125.]],
+         [[[20., 30.],
+           [21., 31.],
+           [22., 32.],
+           [23., 33.]],
 
-            [[ 35.],
-             [135.]]]],
+          [[21., 31.],
+           [22., 32.],
+           [23., 33.],
+           [24., 34.]],
 
+          [[22., 32.],
+           [23., 33.],
+           [24., 34.],
+           [25., 35.]]]],
 
 
-          [[[[ 24.],
-             [124.]],
 
-            [[ 34.],
-             [134.]]],
+        [[[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
+          [[18., 28.],
+           [19., 29.],
+           [20., 30.],
+           [21., 31.]],
 
-           [[[ 25.],
-             [125.]],
+          [[19., 29.],
+           [20., 30.],
+           [21., 31.],
+           [22., 32.]]],
 
-            [[ 35.],
-             [135.]]],
 
+         [[[ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.],
+           [ 0.,  0.]],
 
-           [[[ 26.],
-             [126.]],
+          [[23., 33.],
+           [24., 34.],
+           [25., 35.],
+           [26., 36.]],
 
-            [[ 36.],
-             [136.]]]]]]], device='cuda:0', dtype=torch.float64)
+          [[24., 34.],
+           [25., 35.],
+           [26., 36.],
+           [27., 37.]]]]], device='cuda:0', dtype=torch.float64)
 
diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect
index 64435343b7cb6..65efcec63319b 100644
--- a/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect
+++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSC_cuda.expect
@@ -1,1411 +1,1661 @@
-########## torch.float32/torch.int32/size=()+(3, 2)+() ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(3, 2), nnz=4,
-       layout=torch.sparse_csc)
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
+
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
+       row_indices=tensor([[[0, 1, 0, 1],
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[1., 3., 2., 4.],
+                       [1., 4., 2., 3.],
+                       [1., 2., 3., 4.]],
+
+                      [[2., 1., 3., 4.],
+                       [1., 3., 4., 2.],
+                       [1., 2., 4., 3.]]]), device='cuda:0', size=(2, 3, 2, 3),
+       nnz=4, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
+
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], device='cuda:0', dtype=torch.int32)
 # _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
+tensor([[[0, 1, 0, 1],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
+
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([1., 2., 3., 4.], device='cuda:0')
+tensor([[[1., 3., 2., 4.],
+         [1., 4., 2., 3.],
+         [1., 2., 3., 4.]],
 
-########## torch.float32/torch.int32/size=()+(0, 0)+() ##########
+        [[2., 1., 3., 4.],
+         [1., 3., 4., 2.],
+         [1., 2., 4., 3.]]], device='cuda:0')
+
+########## torch.float32/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0,
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,
+                       2., 10., 15.,  5., 11., 16., 18., 23.,  3., 12., 17.,
+                      19., 24.]), device='cuda:0', size=(8, 6), nnz=24,
        layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0], device='cuda:0', dtype=torch.int32)
+tensor([ 0,  3,  8, 11, 14, 19, 24], device='cuda:0', dtype=torch.int32)
 # _row_indices
-tensor([], device='cuda:0', dtype=torch.int32)
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       device='cuda:0', dtype=torch.int32)
 # _values
-tensor([], device='cuda:0')
+tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,  2., 10., 15.,
+         5., 11., 16., 18., 23.,  3., 12., 17., 19., 24.], device='cuda:0')
 
-########## torch.float32/torch.int32/size=(2,)+(3, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), device='cuda:0', size=(2, 3, 2),
-       nnz=4, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], device='cuda:0')
 
-########## torch.float32/torch.int32/size=(2, 3)+(3, 2)+() ##########
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
        row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), device='cuda:0',
-       size=(2, 3, 3, 2), nnz=4, layout=torch.sparse_csc)
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[1., 3., 2., 4.],
+                       [1., 4., 2., 3.],
+                       [1., 2., 3., 4.]],
+
+                      [[2., 1., 3., 4.],
+                       [1., 3., 4., 2.],
+                       [1., 2., 4., 3.]]]), device='cuda:0', size=(2, 3, 2, 3),
+       nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], device='cuda:0', dtype=torch.int32)
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
+tensor([[[1., 3., 2., 4.],
+         [1., 4., 2., 3.],
+         [1., 2., 3., 4.]],
 
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], device='cuda:0')
+        [[2., 1., 3., 4.],
+         [1., 3., 4., 2.],
+         [1., 2., 4., 3.]]], device='cuda:0', dtype=torch.float64)
 
-
-########## torch.float64/torch.int32/size=()+(3, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(3, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(0, 0)+() ##########
+########## torch.float64/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0,
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,
+                       2., 10., 15.,  5., 11., 16., 18., 23.,  3., 12., 17.,
+                      19., 24.]), device='cuda:0', size=(8, 6), nnz=24,
        dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0], device='cuda:0', dtype=torch.int32)
+tensor([ 0,  3,  8, 11, 14, 19, 24], device='cuda:0', dtype=torch.int32)
 # _row_indices
-tensor([], device='cuda:0', dtype=torch.int32)
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       device='cuda:0', dtype=torch.int32)
 # _values
-tensor([], device='cuda:0', dtype=torch.float64)
+tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,  2., 10., 15.,
+         5., 11., 16., 18., 23.,  3., 12., 17., 19., 24.], device='cuda:0',
+       dtype=torch.float64)
 
-########## torch.float64/torch.int32/size=(2,)+(3, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), device='cuda:0', size=(2, 3, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], device='cuda:0', dtype=torch.float64)
 
-########## torch.float64/torch.int32/size=(2, 3)+(3, 2)+() ##########
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
        row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), device='cuda:0',
-       size=(2, 3, 3, 2), nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[1., 3., 2., 4.],
+                       [1., 4., 2., 3.],
+                       [1., 2., 3., 4.]],
+
+                      [[2., 1., 3., 4.],
+                       [1., 3., 4., 2.],
+                       [1., 2., 4., 3.]]]), device='cuda:0', size=(2, 3, 2, 3),
+       nnz=4, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], device='cuda:0')
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], device='cuda:0')
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], device='cuda:0', dtype=torch.float64)
+tensor([[[1., 3., 2., 4.],
+         [1., 4., 2., 3.],
+         [1., 2., 3., 4.]],
 
+        [[2., 1., 3., 4.],
+         [1., 3., 4., 2.],
+         [1., 2., 4., 3.]]], device='cuda:0')
 
-########## torch.float32/torch.int64/size=()+(3, 2)+() ##########
+########## torch.float32/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(3, 2), nnz=4,
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,
+                       2., 10., 15.,  5., 11., 16., 18., 23.,  3., 12., 17.,
+                      19., 24.]), device='cuda:0', size=(8, 6), nnz=24,
        layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
+tensor([ 0,  3,  8, 11, 14, 19, 24], device='cuda:0')
 # _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       device='cuda:0')
 # _values
-tensor([1., 2., 3., 4.], device='cuda:0')
+tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,  2., 10., 15.,
+         5., 11., 16., 18., 23.,  3., 12., 17., 19., 24.], device='cuda:0')
 
-########## torch.float32/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0], device='cuda:0')
-# _row_indices
-tensor([], device='cuda:0', dtype=torch.int64)
-# _values
-tensor([], device='cuda:0')
-
-########## torch.float32/torch.int64/size=(2,)+(3, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), device='cuda:0', size=(2, 3, 2),
-       nnz=4, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0')
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0')
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], device='cuda:0')
 
-########## torch.float32/torch.int64/size=(2, 3)+(3, 2)+() ##########
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
        row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), device='cuda:0',
-       size=(2, 3, 3, 2), nnz=4, layout=torch.sparse_csc)
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[1., 3., 2., 4.],
+                       [1., 4., 2., 3.],
+                       [1., 2., 3., 4.]],
+
+                      [[2., 1., 3., 4.],
+                       [1., 3., 4., 2.],
+                       [1., 2., 4., 3.]]]), device='cuda:0', size=(2, 3, 2, 3),
+       nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], device='cuda:0')
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], device='cuda:0')
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], device='cuda:0')
+tensor([[[1., 3., 2., 4.],
+         [1., 4., 2., 3.],
+         [1., 2., 3., 4.]],
 
+        [[2., 1., 3., 4.],
+         [1., 3., 4., 2.],
+         [1., 2., 4., 3.]]], device='cuda:0', dtype=torch.float64)
 
-########## torch.float64/torch.int64/size=()+(3, 2)+() ##########
+########## torch.float64/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(3, 2), nnz=4,
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,
+                       2., 10., 15.,  5., 11., 16., 18., 23.,  3., 12., 17.,
+                      19., 24.]), device='cuda:0', size=(8, 6), nnz=24,
        dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
+tensor([ 0,  3,  8, 11, 14, 19, 24], device='cuda:0')
 # _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       device='cuda:0')
 # _values
-tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64)
+tensor([ 6., 13., 20.,  1.,  4.,  7.,  9., 21.,  8., 14., 22.,  2., 10., 15.,
+         5., 11., 16., 18., 23.,  3., 12., 17., 19., 24.], device='cuda:0',
+       dtype=torch.float64)
 
-########## torch.float64/torch.int64/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0]),
-       row_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0], device='cuda:0')
-# _row_indices
-tensor([], device='cuda:0', dtype=torch.int64)
-# _values
-tensor([], device='cuda:0', dtype=torch.float64)
 
-########## torch.float64/torch.int64/size=(2,)+(3, 2)+() ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       row_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), device='cuda:0', size=(2, 3, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0')
-# _row_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0')
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2, 3)+(3, 2)+() ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
        row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), device='cuda:0',
-       size=(2, 3, 3, 2), nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[2., 3., 4., 5.],
+                        [1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.],
+                        [3., 4., 5., 6.]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=4, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], device='cuda:0', dtype=torch.int32)
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], device='cuda:0', dtype=torch.float64)
-
-
-########## torch.float32/torch.int32/size=()+(3, 2)+(2,) ##########
+tensor([[[[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[2., 3., 4., 5.],
+          [1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.],
+          [3., 4., 5., 6.]]]], device='cuda:0')
+
+########## torch.float32/torch.int32/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), device='cuda:0', size=(3, 2, 2), nnz=4,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=()+(3, 2)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([[[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.]],
 
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
                       [[ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.]],
 
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
                       [[ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.]],
 
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), device='cuda:0', size=(3, 2, 4, 2),
-       nnz=4, layout=torch.sparse_csc)
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=24, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
+tensor([ 0,  3,  8, 11, 14, 19, 24], device='cuda:0', dtype=torch.int32)
 # _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[ 1., 11.],
+tensor([[[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[ 1., 11.],
          [ 2., 12.],
          [ 3., 13.],
          [ 4., 14.]],
 
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
-
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
-
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=(2, 3)+(3, 2)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
+         [ 7., 17.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
 
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
 
-                      [[[[13.],
-                         [14.]],
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
 
-                        [[14.],
-                         [15.]],
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
 
-                        [[15.],
-                         [16.]],
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
 
-                        [[16.],
-                         [17.]]],
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
 
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
 
-                       [[[17.],
-                         [18.]],
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
 
-                        [[18.],
-                         [19.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-                        [[19.],
-                         [20.]],
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
 
-                        [[20.],
-                         [21.]]],
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
 
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
 
-                       [[[21.],
-                         [22.]],
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], device='cuda:0')
 
-                        [[22.],
-                         [23.]],
 
-                        [[23.],
-                         [24.]],
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                        [[24.],
-                         [25.]]]]]), device='cuda:0', size=(2, 3, 3, 2, 2, 1),
-       nnz=4, layout=torch.sparse_csc)
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
+       row_indices=tensor([[[0, 1, 0, 1],
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[2., 3., 4., 5.],
+                        [1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.],
+                        [3., 4., 5., 6.]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], device='cuda:0', dtype=torch.int32)
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]], device='cuda:0')
-
-
-########## torch.float64/torch.int32/size=()+(3, 2)+(2,) ##########
+tensor([[[[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[2., 3., 4., 5.],
+          [1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.],
+          [3., 4., 5., 6.]]]], device='cuda:0', dtype=torch.float64)
+
+########## torch.float64/torch.int32/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), device='cuda:0', size=(3, 2, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(3, 2)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([[[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.]],
 
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
                       [[ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.]],
 
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
                       [[ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.]],
 
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), device='cuda:0', size=(3, 2, 4, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=24, dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
+tensor([ 0,  3,  8, 11, 14, 19, 24], device='cuda:0', dtype=torch.int32)
 # _row_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[ 1., 11.],
+tensor([[[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[ 1., 11.],
          [ 2., 12.],
          [ 3., 13.],
          [ 4., 14.]],
 
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
-
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
-
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=(2, 3)+(3, 2)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
+         [ 7., 17.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
 
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
 
-                      [[[[13.],
-                         [14.]],
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
 
-                        [[14.],
-                         [15.]],
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
 
-                        [[15.],
-                         [16.]],
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
 
-                        [[16.],
-                         [17.]]],
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
 
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
 
-                       [[[17.],
-                         [18.]],
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
 
-                        [[18.],
-                         [19.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-                        [[19.],
-                         [20.]],
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
 
-                        [[20.],
-                         [21.]]],
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
 
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
 
-                       [[[21.],
-                         [22.]],
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], device='cuda:0', dtype=torch.float64)
 
-                        [[22.],
-                         [23.]],
 
-                        [[23.],
-                         [24.]],
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                        [[24.],
-                         [25.]]]]]), device='cuda:0', size=(2, 3, 3, 2, 2, 1),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
+       row_indices=tensor([[[0, 1, 0, 1],
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[2., 3., 4., 5.],
+                        [1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.],
+                        [3., 4., 5., 6.]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=4, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], device='cuda:0')
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], device='cuda:0')
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]], device='cuda:0', dtype=torch.float64)
-
-
-########## torch.float32/torch.int64/size=()+(3, 2)+(2,) ##########
+tensor([[[[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[2., 3., 4., 5.],
+          [1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.],
+          [3., 4., 5., 6.]]]], device='cuda:0')
+
+########## torch.float32/torch.int64/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), device='cuda:0', size=(3, 2, 2), nnz=4,
-       layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], device='cuda:0')
-
-########## torch.float32/torch.int64/size=()+(3, 2)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([[[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.]],
 
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
                       [[ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.]],
 
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
                       [[ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.]],
 
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), device='cuda:0', size=(3, 2, 4, 2),
-       nnz=4, layout=torch.sparse_csc)
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=24, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
+tensor([ 0,  3,  8, 11, 14, 19, 24], device='cuda:0')
 # _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       device='cuda:0')
 # _values
-tensor([[[ 1., 11.],
+tensor([[[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[ 1., 11.],
          [ 2., 12.],
          [ 3., 13.],
          [ 4., 14.]],
 
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
-
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
-
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], device='cuda:0')
-
-########## torch.float32/torch.int64/size=(2, 3)+(3, 2)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
+         [ 7., 17.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
 
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
 
-                      [[[[13.],
-                         [14.]],
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
 
-                        [[14.],
-                         [15.]],
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
 
-                        [[15.],
-                         [16.]],
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
 
-                        [[16.],
-                         [17.]]],
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
 
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
 
-                       [[[17.],
-                         [18.]],
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
 
-                        [[18.],
-                         [19.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-                        [[19.],
-                         [20.]],
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
 
-                        [[20.],
-                         [21.]]],
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
 
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
 
-                       [[[21.],
-                         [22.]],
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], device='cuda:0')
 
-                        [[22.],
-                         [23.]],
 
-                        [[23.],
-                         [24.]],
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
+# sparse tensor
+tensor(ccol_indices=tensor([[[0, 2, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 2, 3, 4]],
 
-                        [[24.],
-                         [25.]]]]]), device='cuda:0', size=(2, 3, 3, 2, 2, 1),
-       nnz=4, layout=torch.sparse_csc)
+                            [[0, 1, 3, 4],
+                             [0, 2, 3, 4],
+                             [0, 1, 3, 4]]]),
+       row_indices=tensor([[[0, 1, 0, 1],
+                            [0, 1, 0, 0],
+                            [0, 1, 1, 1]],
+
+                           [[1, 0, 1, 1],
+                            [0, 1, 1, 0],
+                            [0, 0, 1, 0]]]),
+       values=tensor([[[[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[2., 3., 4., 5.],
+                        [1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.],
+                        [2., 3., 4., 5.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [4., 5., 6., 7.],
+                        [3., 4., 5., 6.]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
+tensor([[[0, 2, 3, 4],
+         [0, 2, 3, 4],
+         [0, 2, 3, 4]],
 
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
+        [[0, 1, 3, 4],
+         [0, 2, 3, 4],
+         [0, 1, 3, 4]]], device='cuda:0')
 # _row_indices
 tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
+         [0, 1, 0, 0],
+         [0, 1, 1, 1]],
 
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
+        [[1, 0, 1, 1],
+         [0, 1, 1, 0],
+         [0, 0, 1, 0]]], device='cuda:0')
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]], device='cuda:0')
-
-
-########## torch.float64/torch.int64/size=()+(3, 2)+(2,) ##########
+tensor([[[[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[2., 3., 4., 5.],
+          [1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.],
+          [2., 3., 4., 5.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [4., 5., 6., 7.],
+          [3., 4., 5., 6.]]]], device='cuda:0', dtype=torch.float64)
+
+########## torch.float64/torch.int64/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), device='cuda:0', size=(3, 2, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
-# _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(3, 2)+(4, 2) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([0, 2, 4]),
-       row_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
+tensor(ccol_indices=tensor([ 0,  3,  8, 11, 14, 19, 24]),
+       row_indices=tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6,
+                           7, 0, 4, 5, 6, 7]),
+       values=tensor([[[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.]],
 
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
                       [[ 2., 12.],
                        [ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.]],
 
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
                       [[ 3., 13.],
                        [ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.]],
 
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), device='cuda:0', size=(3, 2, 4, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=24, dtype=torch.float64, layout=torch.sparse_csc)
 # _ccol_indices
-tensor([0, 2, 4], device='cuda:0')
+tensor([ 0,  3,  8, 11, 14, 19, 24], device='cuda:0')
 # _row_indices
-tensor([0, 1, 0, 2], device='cuda:0')
+tensor([2, 5, 7, 0, 1, 2, 4, 7, 2, 5, 7, 0, 4, 5, 1, 4, 5, 6, 7, 0, 4, 5, 6, 7],
+       device='cuda:0')
 # _values
-tensor([[[ 1., 11.],
+tensor([[[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[ 1., 11.],
          [ 2., 12.],
          [ 3., 13.],
          [ 4., 14.]],
 
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
-
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
-
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=(2, 3)+(3, 2)+(2, 1) ##########
-# sparse tensor
-tensor(ccol_indices=tensor([[[0, 2, 4],
-                             [0, 3, 4],
-                             [0, 1, 4]],
-
-                            [[0, 1, 4],
-                             [0, 2, 4],
-                             [0, 3, 4]]]),
-       row_indices=tensor([[[0, 1, 0, 1],
-                            [0, 1, 2, 0],
-                            [0, 0, 1, 2]],
-
-                           [[1, 0, 1, 2],
-                            [0, 2, 0, 1],
-                            [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), device='cuda:0', size=(2, 3, 3, 2, 2, 1),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csc)
-# _ccol_indices
-tensor([[[0, 2, 4],
-         [0, 3, 4],
-         [0, 1, 4]],
-
-        [[0, 1, 4],
-         [0, 2, 4],
-         [0, 3, 4]]], device='cuda:0')
-# _row_indices
-tensor([[[0, 1, 0, 1],
-         [0, 1, 2, 0],
-         [0, 0, 1, 2]],
-
-        [[1, 0, 1, 2],
-         [0, 2, 0, 1],
-         [0, 1, 2, 1]]], device='cuda:0')
-# _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
+         [ 7., 17.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
 
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
-         [[[17.],
-           [18.]],
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
 
-          [[18.],
-           [19.]],
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
 
-          [[19.],
-           [20.]],
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
 
-          [[20.],
-           [21.]]],
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
 
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
 
-         [[[21.],
-           [22.]],
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
 
-          [[22.],
-           [23.]],
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
 
-          [[23.],
-           [24.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-          [[24.],
-           [25.]]]]], device='cuda:0', dtype=torch.float64)
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], device='cuda:0', dtype=torch.float64)
 
diff --git a/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect
index ddb5272c79cab..a02ee510ff8a5 100644
--- a/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect
+++ b/test/expect/TestSparseCompressedCUDA.test_print_SparseCSR_cuda.expect
@@ -1,48 +1,3 @@
-########## torch.float32/torch.int32/size=()+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 3), nnz=4,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([1., 2., 3., 4.], device='cuda:0')
-
-########## torch.float32/torch.int32/size=()+(0, 0)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([], device='cuda:0')
-
-########## torch.float32/torch.int32/size=(2,)+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), device='cuda:0', size=(2, 2, 3),
-       nnz=4, layout=torch.sparse_csr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], device='cuda:0')
-
 ########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
@@ -52,21 +7,21 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), device='cuda:0',
-       size=(2, 3, 2, 3), nnz=4, layout=torch.sparse_csr)
+       values=tensor([[[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]],
+
+                      [[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 3),
+       nnz=4, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
          [0, 3, 4],
@@ -76,7 +31,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -84,59 +39,33 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
+tensor([[[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]],
 
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], device='cuda:0')
-
-
-########## torch.float64/torch.int32/size=()+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 3), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64)
+        [[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]]], device='cuda:0')
 
-########## torch.float64/torch.int32/size=()+(0, 0)+() ##########
+########## torch.float32/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_csr)
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
+                      12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.,
+                      23., 24.]), device='cuda:0', size=(8, 6), nnz=24,
+       layout=torch.sparse_csr)
 # _crow_indices
-tensor([0], device='cuda:0', dtype=torch.int32)
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], device='cuda:0',
+       dtype=torch.int32)
 # _col_indices
-tensor([], device='cuda:0', dtype=torch.int32)
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       device='cuda:0', dtype=torch.int32)
 # _values
-tensor([], device='cuda:0', dtype=torch.float64)
+tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
+        15., 16., 17., 18., 19., 20., 21., 22., 23., 24.], device='cuda:0')
 
-########## torch.float64/torch.int32/size=(2,)+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), device='cuda:0', size=(2, 2, 3),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], device='cuda:0', dtype=torch.float64)
 
 ########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
@@ -147,21 +76,21 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), device='cuda:0',
-       size=(2, 3, 2, 3), nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
+       values=tensor([[[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]],
+
+                      [[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 3),
+       nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
          [0, 3, 4],
@@ -171,7 +100,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -179,59 +108,34 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
+tensor([[[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]],
 
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], device='cuda:0', dtype=torch.float64)
-
-
-########## torch.float32/torch.int64/size=()+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 3), nnz=4,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([1., 2., 3., 4.], device='cuda:0')
+        [[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=()+(0, 0)+() ##########
+########## torch.float64/torch.int32/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0,
-       layout=torch.sparse_csr)
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
+                      12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.,
+                      23., 24.]), device='cuda:0', size=(8, 6), nnz=24,
+       dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
-tensor([0], device='cuda:0')
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], device='cuda:0',
+       dtype=torch.int32)
 # _col_indices
-tensor([], device='cuda:0', dtype=torch.int64)
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       device='cuda:0', dtype=torch.int32)
 # _values
-tensor([], device='cuda:0')
+tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
+        15., 16., 17., 18., 19., 20., 21., 22., 23., 24.], device='cuda:0',
+       dtype=torch.float64)
 
-########## torch.float32/torch.int64/size=(2,)+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), device='cuda:0', size=(2, 2, 3),
-       nnz=4, layout=torch.sparse_csr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0')
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0')
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], device='cuda:0')
 
 ########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
@@ -242,21 +146,21 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), device='cuda:0',
-       size=(2, 3, 2, 3), nnz=4, layout=torch.sparse_csr)
+       values=tensor([[[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]],
+
+                      [[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 3),
+       nnz=4, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
          [0, 3, 4],
@@ -266,7 +170,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], device='cuda:0')
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -274,59 +178,32 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], device='cuda:0')
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
+tensor([[[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]],
 
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], device='cuda:0')
+        [[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]]], device='cuda:0')
 
-
-########## torch.float64/torch.int64/size=()+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([1., 2., 3., 4.]), device='cuda:0', size=(2, 3), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([1., 2., 3., 4.], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(0, 0)+() ##########
+########## torch.float32/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0]),
-       col_indices=tensor([], size=(0,)),
-       values=tensor([], size=(0,)), device='cuda:0', size=(0, 0), nnz=0,
-       dtype=torch.float64, layout=torch.sparse_csr)
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
+                      12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.,
+                      23., 24.]), device='cuda:0', size=(8, 6), nnz=24,
+       layout=torch.sparse_csr)
 # _crow_indices
-tensor([0], device='cuda:0')
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], device='cuda:0')
 # _col_indices
-tensor([], device='cuda:0', dtype=torch.int64)
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       device='cuda:0')
 # _values
-tensor([], device='cuda:0', dtype=torch.float64)
+tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
+        15., 16., 17., 18., 19., 20., 21., 22., 23., 24.], device='cuda:0')
 
-########## torch.float64/torch.int64/size=(2,)+(2, 3)+() ##########
-# sparse tensor
-tensor(crow_indices=tensor([[0, 2, 4],
-                            [0, 3, 4]]),
-       col_indices=tensor([[0, 1, 0, 1],
-                           [0, 1, 2, 0]]),
-       values=tensor([[1., 2., 3., 4.],
-                      [5., 6., 7., 8.]]), device='cuda:0', size=(2, 2, 3),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([[0, 2, 4],
-        [0, 3, 4]], device='cuda:0')
-# _col_indices
-tensor([[0, 1, 0, 1],
-        [0, 1, 2, 0]], device='cuda:0')
-# _values
-tensor([[1., 2., 3., 4.],
-        [5., 6., 7., 8.]], device='cuda:0', dtype=torch.float64)
 
 ########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+() ##########
 # sparse tensor
@@ -337,21 +214,21 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[ 1.,  2.,  3.,  4.],
-                       [ 5.,  6.,  7.,  8.],
-                       [ 9., 10., 11., 12.]],
-
-                      [[13., 14., 15., 16.],
-                       [17., 18., 19., 20.],
-                       [21., 22., 23., 24.]]]), device='cuda:0',
-       size=(2, 3, 2, 3), nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
+       values=tensor([[[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]],
+
+                      [[1., 2., 3., 4.],
+                       [1., 2., 3., 4.],
+                       [1., 2., 3., 4.]]]), device='cuda:0', size=(2, 3, 2, 3),
+       nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
          [0, 3, 4],
@@ -361,7 +238,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], device='cuda:0')
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -369,84 +246,35 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], device='cuda:0')
 # _values
-tensor([[[ 1.,  2.,  3.,  4.],
-         [ 5.,  6.,  7.,  8.],
-         [ 9., 10., 11., 12.]],
-
-        [[13., 14., 15., 16.],
-         [17., 18., 19., 20.],
-         [21., 22., 23., 24.]]], device='cuda:0', dtype=torch.float64)
+tensor([[[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]],
 
+        [[1., 2., 3., 4.],
+         [1., 2., 3., 4.],
+         [1., 2., 3., 4.]]], device='cuda:0', dtype=torch.float64)
 
-########## torch.float32/torch.int32/size=()+(2, 3)+(2,) ##########
+########## torch.float64/torch.int64/size=()+(8, 6)+() ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), device='cuda:0', size=(2, 3, 2), nnz=4,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=()+(2, 3)+(4, 2) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[[ 1., 11.],
-                       [ 2., 12.],
-                       [ 3., 13.],
-                       [ 4., 14.]],
-
-                      [[ 2., 12.],
-                       [ 3., 13.],
-                       [ 4., 14.],
-                       [ 5., 15.]],
-
-                      [[ 3., 13.],
-                       [ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.]],
-
-                      [[ 4., 14.],
-                       [ 5., 15.],
-                       [ 6., 16.],
-                       [ 7., 17.]]]), device='cuda:0', size=(2, 3, 4, 2),
-       nnz=4, layout=torch.sparse_csr)
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
+                      12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22.,
+                      23., 24.]), device='cuda:0', size=(8, 6), nnz=24,
+       dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], device='cuda:0')
 # _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       device='cuda:0')
 # _values
-tensor([[[ 1., 11.],
-         [ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.]],
-
-        [[ 2., 12.],
-         [ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.]],
+tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
+        15., 16., 17., 18., 19., 20., 21., 22., 23., 24.], device='cuda:0',
+       dtype=torch.float64)
 
-        [[ 3., 13.],
-         [ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.]],
 
-        [[ 4., 14.],
-         [ 5., 15.],
-         [ 6., 16.],
-         [ 7., 17.]]], device='cuda:0')
-
-########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(2, 1) ##########
+########## torch.float32/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
                              [0, 3, 4],
@@ -455,91 +283,44 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), device='cuda:0', size=(2, 3, 2, 3, 2, 1),
-       nnz=4, layout=torch.sparse_csr)
+       values=tensor([[[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=4, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
          [0, 3, 4],
@@ -549,7 +330,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -557,108 +338,42 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]], device='cuda:0')
-
-
-########## torch.float64/torch.int32/size=()+(2, 3)+(2,) ##########
+tensor([[[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]]], device='cuda:0')
+
+########## torch.float32/torch.int32/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), device='cuda:0', size=(2, 3, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int32/size=()+(2, 3)+(4, 2) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
        values=tensor([[[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
@@ -677,12 +392,114 @@ tensor(crow_indices=tensor([0, 2, 4]),
                       [[ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.],
-                       [ 7., 17.]]]), device='cuda:0', size=(2, 3, 4, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
+                       [ 7., 17.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=24, layout=torch.sparse_csr)
 # _crow_indices
-tensor([0, 2, 4], device='cuda:0', dtype=torch.int32)
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], device='cuda:0',
+       dtype=torch.int32)
 # _col_indices
-tensor([0, 1, 0, 2], device='cuda:0', dtype=torch.int32)
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       device='cuda:0', dtype=torch.int32)
 # _values
 tensor([[[ 1., 11.],
          [ 2., 12.],
@@ -702,9 +519,110 @@ tensor([[[ 1., 11.],
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], device='cuda:0', dtype=torch.float64)
+         [ 7., 17.]],
 
-########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(2, 1) ##########
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
+
+        [[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
+
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
+
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
+
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
+
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], device='cuda:0')
+
+
+########## torch.float64/torch.int32/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
                              [0, 3, 4],
@@ -713,91 +631,44 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), device='cuda:0', size=(2, 3, 2, 3, 2, 1),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
+       values=tensor([[[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
          [0, 3, 4],
@@ -807,7 +678,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], device='cuda:0', dtype=torch.int32)
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -815,108 +686,42 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], device='cuda:0', dtype=torch.int32)
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]], device='cuda:0', dtype=torch.float64)
-
-
-########## torch.float32/torch.int64/size=()+(2, 3)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), device='cuda:0', size=(2, 3, 2), nnz=4,
-       layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], device='cuda:0')
-
-########## torch.float32/torch.int64/size=()+(2, 3)+(4, 2) ##########
+tensor([[[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]]], device='cuda:0', dtype=torch.float64)
+
+########## torch.float64/torch.int32/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
        values=tensor([[[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
@@ -935,12 +740,114 @@ tensor(crow_indices=tensor([0, 2, 4]),
                       [[ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.],
-                       [ 7., 17.]]]), device='cuda:0', size=(2, 3, 4, 2),
-       nnz=4, layout=torch.sparse_csr)
+                       [ 7., 17.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=24, dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
-tensor([0, 2, 4], device='cuda:0')
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], device='cuda:0',
+       dtype=torch.int32)
 # _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       device='cuda:0', dtype=torch.int32)
 # _values
 tensor([[[ 1., 11.],
          [ 2., 12.],
@@ -960,9 +867,110 @@ tensor([[[ 1., 11.],
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], device='cuda:0')
+         [ 7., 17.]],
 
-########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(2, 1) ##########
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
+
+        [[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
+
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
+
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
+
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
+
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], device='cuda:0', dtype=torch.float64)
+
+
+########## torch.float32/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
                              [0, 3, 4],
@@ -971,91 +979,44 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), device='cuda:0', size=(2, 3, 2, 3, 2, 1),
-       nnz=4, layout=torch.sparse_csr)
+       values=tensor([[[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=4, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
          [0, 3, 4],
@@ -1065,7 +1026,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], device='cuda:0')
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -1073,108 +1034,42 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], device='cuda:0')
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
-
-          [[18.],
-           [19.]],
-
-          [[19.],
-           [20.]],
-
-          [[20.],
-           [21.]]],
-
-
-         [[[21.],
-           [22.]],
-
-          [[22.],
-           [23.]],
-
-          [[23.],
-           [24.]],
-
-          [[24.],
-           [25.]]]]], device='cuda:0')
-
-
-########## torch.float64/torch.int64/size=()+(2, 3)+(2,) ##########
-# sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
-       values=tensor([[1., 2.],
-                      [2., 3.],
-                      [3., 4.],
-                      [4., 5.]]), device='cuda:0', size=(2, 3, 2), nnz=4,
-       dtype=torch.float64, layout=torch.sparse_csr)
-# _crow_indices
-tensor([0, 2, 4], device='cuda:0')
-# _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
-# _values
-tensor([[1., 2.],
-        [2., 3.],
-        [3., 4.],
-        [4., 5.]], device='cuda:0', dtype=torch.float64)
-
-########## torch.float64/torch.int64/size=()+(2, 3)+(4, 2) ##########
+tensor([[[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]]], device='cuda:0')
+
+########## torch.float32/torch.int64/size=()+(8, 6)+(4, 2) ##########
 # sparse tensor
-tensor(crow_indices=tensor([0, 2, 4]),
-       col_indices=tensor([0, 1, 0, 2]),
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
        values=tensor([[[ 1., 11.],
                        [ 2., 12.],
                        [ 3., 13.],
@@ -1193,12 +1088,113 @@ tensor(crow_indices=tensor([0, 2, 4]),
                       [[ 4., 14.],
                        [ 5., 15.],
                        [ 6., 16.],
-                       [ 7., 17.]]]), device='cuda:0', size=(2, 3, 4, 2),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
+                       [ 7., 17.]],
+
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=24, layout=torch.sparse_csr)
 # _crow_indices
-tensor([0, 2, 4], device='cuda:0')
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], device='cuda:0')
 # _col_indices
-tensor([0, 1, 0, 2], device='cuda:0')
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       device='cuda:0')
 # _values
 tensor([[[ 1., 11.],
          [ 2., 12.],
@@ -1218,9 +1214,110 @@ tensor([[[ 1., 11.],
         [[ 4., 14.],
          [ 5., 15.],
          [ 6., 16.],
-         [ 7., 17.]]], device='cuda:0', dtype=torch.float64)
+         [ 7., 17.]],
 
-########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(2, 1) ##########
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
+
+        [[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
+
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
+
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
+
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
+
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], device='cuda:0')
+
+
+########## torch.float64/torch.int64/size=(2, 3)+(2, 3)+(4,) ##########
 # sparse tensor
 tensor(crow_indices=tensor([[[0, 2, 4],
                              [0, 3, 4],
@@ -1229,91 +1326,44 @@ tensor(crow_indices=tensor([[[0, 2, 4],
                             [[0, 1, 4],
                              [0, 2, 4],
                              [0, 3, 4]]]),
-       col_indices=tensor([[[0, 1, 0, 1],
+       col_indices=tensor([[[0, 1, 0, 2],
                             [0, 1, 2, 0],
                             [0, 0, 1, 2]],
 
                            [[1, 0, 1, 2],
                             [0, 2, 0, 1],
                             [0, 1, 2, 1]]]),
-       values=tensor([[[[[ 1.],
-                         [ 2.]],
-
-                        [[ 2.],
-                         [ 3.]],
-
-                        [[ 3.],
-                         [ 4.]],
-
-                        [[ 4.],
-                         [ 5.]]],
-
-
-                       [[[ 5.],
-                         [ 6.]],
-
-                        [[ 6.],
-                         [ 7.]],
-
-                        [[ 7.],
-                         [ 8.]],
-
-                        [[ 8.],
-                         [ 9.]]],
-
-
-                       [[[ 9.],
-                         [10.]],
-
-                        [[10.],
-                         [11.]],
-
-                        [[11.],
-                         [12.]],
-
-                        [[12.],
-                         [13.]]]],
-
-
-
-                      [[[[13.],
-                         [14.]],
-
-                        [[14.],
-                         [15.]],
-
-                        [[15.],
-                         [16.]],
-
-                        [[16.],
-                         [17.]]],
-
-
-                       [[[17.],
-                         [18.]],
-
-                        [[18.],
-                         [19.]],
-
-                        [[19.],
-                         [20.]],
-
-                        [[20.],
-                         [21.]]],
-
-
-                       [[[21.],
-                         [22.]],
-
-                        [[22.],
-                         [23.]],
-
-                        [[23.],
-                         [24.]],
-
-                        [[24.],
-                         [25.]]]]]), device='cuda:0', size=(2, 3, 2, 3, 2, 1),
-       nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
+       values=tensor([[[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]],
+
+
+                      [[[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]],
+
+                       [[1., 2., 3., 4.],
+                        [2., 3., 4., 5.],
+                        [3., 4., 5., 6.],
+                        [4., 5., 6., 7.]]]]), device='cuda:0',
+       size=(2, 3, 2, 3, 4), nnz=4, dtype=torch.float64, layout=torch.sparse_csr)
 # _crow_indices
 tensor([[[0, 2, 4],
          [0, 3, 4],
@@ -1323,7 +1373,7 @@ tensor([[[0, 2, 4],
          [0, 2, 4],
          [0, 3, 4]]], device='cuda:0')
 # _col_indices
-tensor([[[0, 1, 0, 1],
+tensor([[[0, 1, 0, 2],
          [0, 1, 2, 0],
          [0, 0, 1, 2]],
 
@@ -1331,81 +1381,285 @@ tensor([[[0, 1, 0, 1],
          [0, 2, 0, 1],
          [0, 1, 2, 1]]], device='cuda:0')
 # _values
-tensor([[[[[ 1.],
-           [ 2.]],
-
-          [[ 2.],
-           [ 3.]],
-
-          [[ 3.],
-           [ 4.]],
-
-          [[ 4.],
-           [ 5.]]],
-
-
-         [[[ 5.],
-           [ 6.]],
-
-          [[ 6.],
-           [ 7.]],
-
-          [[ 7.],
-           [ 8.]],
-
-          [[ 8.],
-           [ 9.]]],
-
-
-         [[[ 9.],
-           [10.]],
-
-          [[10.],
-           [11.]],
-
-          [[11.],
-           [12.]],
-
-          [[12.],
-           [13.]]]],
-
-
-
-        [[[[13.],
-           [14.]],
-
-          [[14.],
-           [15.]],
-
-          [[15.],
-           [16.]],
-
-          [[16.],
-           [17.]]],
-
-
-         [[[17.],
-           [18.]],
+tensor([[[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]],
+
+
+        [[[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]],
+
+         [[1., 2., 3., 4.],
+          [2., 3., 4., 5.],
+          [3., 4., 5., 6.],
+          [4., 5., 6., 7.]]]], device='cuda:0', dtype=torch.float64)
+
+########## torch.float64/torch.int64/size=()+(8, 6)+(4, 2) ##########
+# sparse tensor
+tensor(crow_indices=tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24]),
+       col_indices=tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4,
+                           5, 0, 1, 2, 4, 5]),
+       values=tensor([[[ 1., 11.],
+                       [ 2., 12.],
+                       [ 3., 13.],
+                       [ 4., 14.]],
 
-          [[18.],
-           [19.]],
+                      [[ 2., 12.],
+                       [ 3., 13.],
+                       [ 4., 14.],
+                       [ 5., 15.]],
 
-          [[19.],
-           [20.]],
+                      [[ 3., 13.],
+                       [ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.]],
 
-          [[20.],
-           [21.]]],
+                      [[ 4., 14.],
+                       [ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.]],
 
+                      [[ 5., 15.],
+                       [ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.]],
+
+                      [[ 6., 16.],
+                       [ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.]],
+
+                      [[ 7., 17.],
+                       [ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.]],
+
+                      [[ 8., 18.],
+                       [ 9., 19.],
+                       [10., 20.],
+                       [11., 21.]],
+
+                      [[ 9., 19.],
+                       [10., 20.],
+                       [11., 21.],
+                       [12., 22.]],
+
+                      [[10., 20.],
+                       [11., 21.],
+                       [12., 22.],
+                       [13., 23.]],
+
+                      [[11., 21.],
+                       [12., 22.],
+                       [13., 23.],
+                       [14., 24.]],
+
+                      [[12., 22.],
+                       [13., 23.],
+                       [14., 24.],
+                       [15., 25.]],
+
+                      [[13., 23.],
+                       [14., 24.],
+                       [15., 25.],
+                       [16., 26.]],
+
+                      [[14., 24.],
+                       [15., 25.],
+                       [16., 26.],
+                       [17., 27.]],
+
+                      [[15., 25.],
+                       [16., 26.],
+                       [17., 27.],
+                       [18., 28.]],
+
+                      [[16., 26.],
+                       [17., 27.],
+                       [18., 28.],
+                       [19., 29.]],
+
+                      [[17., 27.],
+                       [18., 28.],
+                       [19., 29.],
+                       [20., 30.]],
+
+                      [[18., 28.],
+                       [19., 29.],
+                       [20., 30.],
+                       [21., 31.]],
+
+                      [[19., 29.],
+                       [20., 30.],
+                       [21., 31.],
+                       [22., 32.]],
+
+                      [[20., 30.],
+                       [21., 31.],
+                       [22., 32.],
+                       [23., 33.]],
+
+                      [[21., 31.],
+                       [22., 32.],
+                       [23., 33.],
+                       [24., 34.]],
+
+                      [[22., 32.],
+                       [23., 33.],
+                       [24., 34.],
+                       [25., 35.]],
+
+                      [[23., 33.],
+                       [24., 34.],
+                       [25., 35.],
+                       [26., 36.]],
+
+                      [[24., 34.],
+                       [25., 35.],
+                       [26., 36.],
+                       [27., 37.]]]), device='cuda:0', size=(8, 6, 4, 2),
+       nnz=24, dtype=torch.float64, layout=torch.sparse_csr)
+# _crow_indices
+tensor([ 0,  3,  5,  8,  8, 12, 17, 19, 24], device='cuda:0')
+# _col_indices
+tensor([1, 3, 5, 1, 4, 0, 1, 2, 1, 3, 4, 5, 0, 2, 3, 4, 5, 4, 5, 0, 1, 2, 4, 5],
+       device='cuda:0')
+# _values
+tensor([[[ 1., 11.],
+         [ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.]],
 
-         [[[21.],
-           [22.]],
+        [[ 2., 12.],
+         [ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.]],
 
-          [[22.],
-           [23.]],
+        [[ 3., 13.],
+         [ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.]],
 
-          [[23.],
-           [24.]],
+        [[ 4., 14.],
+         [ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.]],
 
-          [[24.],
-           [25.]]]]], device='cuda:0', dtype=torch.float64)
+        [[ 5., 15.],
+         [ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.]],
+
+        [[ 6., 16.],
+         [ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.]],
+
+        [[ 7., 17.],
+         [ 8., 18.],
+         [ 9., 19.],
+         [10., 20.]],
+
+        [[ 8., 18.],
+         [ 9., 19.],
+         [10., 20.],
+         [11., 21.]],
+
+        [[ 9., 19.],
+         [10., 20.],
+         [11., 21.],
+         [12., 22.]],
+
+        [[10., 20.],
+         [11., 21.],
+         [12., 22.],
+         [13., 23.]],
+
+        [[11., 21.],
+         [12., 22.],
+         [13., 23.],
+         [14., 24.]],
+
+        [[12., 22.],
+         [13., 23.],
+         [14., 24.],
+         [15., 25.]],
+
+        [[13., 23.],
+         [14., 24.],
+         [15., 25.],
+         [16., 26.]],
+
+        [[14., 24.],
+         [15., 25.],
+         [16., 26.],
+         [17., 27.]],
+
+        [[15., 25.],
+         [16., 26.],
+         [17., 27.],
+         [18., 28.]],
+
+        [[16., 26.],
+         [17., 27.],
+         [18., 28.],
+         [19., 29.]],
+
+        [[17., 27.],
+         [18., 28.],
+         [19., 29.],
+         [20., 30.]],
+
+        [[18., 28.],
+         [19., 29.],
+         [20., 30.],
+         [21., 31.]],
+
+        [[19., 29.],
+         [20., 30.],
+         [21., 31.],
+         [22., 32.]],
+
+        [[20., 30.],
+         [21., 31.],
+         [22., 32.],
+         [23., 33.]],
+
+        [[21., 31.],
+         [22., 32.],
+         [23., 33.],
+         [24., 34.]],
+
+        [[22., 32.],
+         [23., 33.],
+         [24., 34.],
+         [25., 35.]],
+
+        [[23., 33.],
+         [24., 34.],
+         [25., 35.],
+         [26., 36.]],
+
+        [[24., 34.],
+         [25., 35.],
+         [26., 36.],
+         [27., 37.]]], device='cuda:0', dtype=torch.float64)
 
diff --git a/test/test_sparse.py b/test/test_sparse.py
index 4bfccaff0e2c9..5b399271bab62 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -4049,6 +4049,52 @@ def test_basic(self):
         self.assertEqual(r.values(), torch.empty(0, 4, device='meta'))
 
 
+class TestSparseAny(TestCase):
+
+    def test_generate_simple_inputs(self):
+        # Temporarily disable BSC and BSC layouts as these don't support select yet, see the next PR in the stack.
+        layouts = [torch.strided, torch.sparse_coo, torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc][:-2]
+
+        tested_combinations = set()
+        for tensors in zip(*map(self.generate_simple_inputs, layouts)):
+            for i, t in enumerate(tensors):
+                self.assertEqual(t.layout, layouts[i])
+
+                # all layouts must produce semantically the same tensors
+                self.assertEqual(t, tensors[0])
+
+                if t.layout is torch.strided:
+                    is_hybrid = None
+                else:
+                    is_hybrid = t.dense_dim() > 0
+                if t.layout in {torch.sparse_csr, torch.sparse_bsr}:
+                    is_batch = t.crow_indices().ndim > 1
+                elif t.layout in {torch.sparse_csc, torch.sparse_bsc}:
+                    is_batch = t.ccol_indices().ndim > 1
+                else:
+                    is_batch = None
+                if t.layout in {torch.sparse_bsr, torch.sparse_bsc}:
+                    blocksize = t.values().shape[1:3]
+                    nontrivial_blocksize = 1 not in blocksize
+                else:
+                    nontrivial_blocksize = None
+                tested_combinations.add((t.layout, is_hybrid, is_batch, nontrivial_blocksize))
+
+        # Ensure that the inputs generation covers all layout,
+        # non-hybrid/hybrid, and non-batch/batch combinations:
+        for layout in layouts:
+            for is_hybrid in [False, True]:
+                if layout is torch.strided:
+                    is_hybrid = None
+                for is_batch in [False, True]:
+                    if layout in {torch.sparse_coo, torch.strided}:
+                        is_batch = None
+                    for nontrivial_blocksize in [False, True]:
+                        if layout not in {torch.sparse_bsr, torch.sparse_bsc}:
+                            nontrivial_blocksize = None
+                        key = (layout, is_hybrid, is_batch, nontrivial_blocksize)
+                        assert key in tested_combinations, key
+
 
 # e.g., TestSparseUnaryUfuncsCPU and TestSparseUnaryUfuncsCUDA
 instantiate_device_type_tests(TestSparseUnaryUfuncs, globals(), except_for='meta')
@@ -4058,5 +4104,7 @@ def test_basic(self):
 # e.g., TestSparseCPU and TestSparseCUDA
 instantiate_device_type_tests(TestSparse, globals(), except_for='meta')
 
+instantiate_device_type_tests(TestSparseAny, globals(), except_for='meta')
+
 if __name__ == '__main__':
     run_tests()
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index 6eee8a9d5b8bd..7275ecfa9f55b 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -1,6 +1,5 @@
 # Owner(s): ["module: sparse"]
 
-import copy
 import torch
 import random
 import itertools
@@ -198,147 +197,6 @@ def genTensor(self, size, nnz, *, layout, device=None, dtype=torch.float, index_
             device = self.device_type
         return self.genSparseCompressedTensor(size, nnz, device=device, dtype=dtype, index_dtype=index_dtype, layout=layout)
 
-    def _generate_small_inputs_utils(self, layout, device=None, dtype=None):
-
-        def shape(shape, basedim=0, blocksize=(1, 1), dense_shape=()):
-            # Below, we define compressed and plain indices that
-            # correspond to row compressed tensors. In order to reuse
-            # the indices tensors for column compressed tensors, we
-            # swap the row and columns in shape dims (basedim and
-            # basedim + 1, respectively) to obtain the correct shape
-            # for column compressed tensors. Batch and dense
-            # dimensions remain as they are.
-            #
-            # Similarly, we reuse indices of non-block tensors for
-            # block tensors, that means, we'll need to multiply the
-            # base shape of the non-block tensor with blocksize to get
-            # the base shape of a block tensor.
-            if layout is torch.sparse_csc:
-                shape = shape[:basedim] + (shape[basedim + 1], shape[basedim]) + shape[basedim + 2:]
-            elif layout is torch.sparse_bsc:
-                shape = shape[:basedim] + (shape[basedim + 1] * blocksize[1], shape[basedim] * blocksize[0]) + shape[basedim + 2:]
-            elif layout is torch.sparse_bsr:
-                shape = shape[:basedim] + (shape[basedim] * blocksize[0], shape[basedim + 1] * blocksize[1]) + shape[basedim + 2:]
-            return shape
-
-        def values(lst, basedim=0, blocksize=(1, 1), densesize=(), device=device, dtype=dtype):
-            # Below, we define values for non-blocked and non-hybrid
-            # tensors. To reuse these for blocked tensors, we replace
-            # all values in lst with a double-list that "shape"
-            # corresponds to blocksize.
-            # To support hybrid tensors, the values in lst are further
-            # replaced with a N-list where N==len(densesize) and the
-            # shape corresponds to densesize.
-
-            max_val = torch.iinfo(dtype).max if dtype in [torch.int16, torch.int8, torch.uint8] else None
-
-            def list_add(lst, value):
-                # recursively add a value to lst items
-                if isinstance(lst, list):
-                    return [list_add(item, value) for item in lst]
-                rc = lst + value
-                return rc if max_val is None else (rc % max_val)
-
-            def stretch_values(value, bdim, values_item_shape):
-                # replace a value with a new value that extends the
-                # dimensionality of the value by
-                # len(values_item_shape) from right. The left
-                # dimensions up to bdim are considered as batch
-                # dimensions.
-                if not values_item_shape:
-                    return value
-                if isinstance(value, list) and bdim >= 0:
-                    return [stretch_values(item, bdim - 1, values_item_shape) for item in value]
-                new_value = functools.reduce(lambda x, dims: [copy.deepcopy(x) for _ in range(dims)],
-                                             reversed(values_item_shape), None)
-                for p in itertools.product(*map(list, map(range, values_item_shape))):
-                    row = functools.reduce(lambda x, i: x.__getitem__(i), p[:-1], new_value)
-                    row[p[-1]] = list_add(value, sum([i * 10 ** d for d, i in enumerate(p)]))
-                return new_value
-
-            if layout is torch.sparse_bsr:
-                values_item_shape = blocksize + densesize
-            elif layout is torch.sparse_bsc:
-                values_item_shape = tuple(reversed(blocksize)) + densesize
-            else:
-                values_item_shape = densesize
-
-            if not lst:
-                return torch.tensor(lst, device=device, dtype=dtype).reshape(0, *values_item_shape)
-
-            lst = stretch_values(lst, basedim, values_item_shape)
-
-            return torch.tensor(lst, device=device, dtype=dtype)
-
-        return shape, values
-
-    def _generate_small_inputs(self, layout, device=None, dtype=None, index_dtype=None,
-                               enable_batched=True, enable_hybrid=True):
-        """Generator of inputs to sparse compressed tensor factory functions.
-
-        The input is defined as a 4-tuple:
-          compressed_indices, plain_indices, values, expected_size_from_shape_inference
-        """
-        if index_dtype is None:
-            index_dtype = torch.int64
-
-        shape, values = self._generate_small_inputs_utils(layout, device, dtype)
-
-        # a regular tensor
-        yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype),
-               torch.tensor([0, 1, 0, 2], device=device, dtype=index_dtype),
-               values([1, 2, 3, 4], 0, (2, 1)),
-               shape((2, 3), 0, (2, 1)))
-
-        # a tensor with zero dimensions
-        yield (torch.tensor([0, ], device=device, dtype=index_dtype),
-               torch.tensor([], device=device, dtype=index_dtype),
-               values([], 0, (2, 1)),
-               shape((0, 0), 0, (2, 1)))
-
-        if enable_batched:
-            # a batched tensor with one batch dimension
-            yield (torch.tensor([[0, 2, 4], [0, 3, 4]], device=device, dtype=index_dtype),
-                   torch.tensor([[0, 1, 0, 1], [0, 1, 2, 0]], device=device, dtype=index_dtype),
-                   values([[1, 2, 3, 4], [5, 6, 7, 8]], 1, (1, 2)),
-                   shape((2, 2, 3), 1, (1, 2)))
-
-            # a batched tensor with two batch dimensions
-            yield (torch.tensor([[[0, 2, 4], [0, 3, 4], [0, 1, 4]],
-                                 [[0, 1, 4], [0, 2, 4], [0, 3, 4]]],
-                                device=device, dtype=index_dtype),
-                   torch.tensor([[[0, 1, 0, 1], [0, 1, 2, 0], [0, 0, 1, 2]],
-                                 [[1, 0, 1, 2], [0, 2, 0, 1], [0, 1, 2, 1]]],
-                                device=device, dtype=index_dtype),
-                   values([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
-                           [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]], 2, (2, 3)),
-                   shape((2, 3, 2, 3), 2, (2, 3)))
-
-        if enable_hybrid:
-            # a tensor with one dense dimension
-            yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype),
-                   torch.tensor([0, 1, 0, 2], device=device, dtype=index_dtype),
-                   values([1, 2, 3, 4], 0, (3, 2), (2,)),
-                   shape((2, 3, 2), 0, (3, 2)))
-
-            # a tensor with two dense dimensions
-            yield (torch.tensor([0, 2, 4], device=device, dtype=index_dtype),
-                   torch.tensor([0, 1, 0, 2], device=device, dtype=index_dtype),
-                   values([1, 2, 3, 4], 0, (2, 3), (4, 2)),
-                   shape((2, 3, 4, 2), 0, (2, 3)))
-
-        if enable_batched and enable_hybrid:
-            # a batched tensor with two batch dimensions and two dense dimensions
-            yield (torch.tensor([[[0, 2, 4], [0, 3, 4], [0, 1, 4]],
-                                 [[0, 1, 4], [0, 2, 4], [0, 3, 4]]],
-                                device=device, dtype=index_dtype),
-                   torch.tensor([[[0, 1, 0, 1], [0, 1, 2, 0], [0, 0, 1, 2]],
-                                 [[1, 0, 1, 2], [0, 2, 0, 1], [0, 1, 2, 1]]],
-                                device=device, dtype=index_dtype),
-                   values([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
-                           [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]], 2, (3, 2), (2, 1)),
-                   shape((2, 3, 2, 3, 2, 1), 2, (3, 2)))
-
     @all_sparse_compressed_layouts()
     @onlyCPU
     def test_layout(self, layout):
@@ -352,11 +210,14 @@ def test_layout(self, layout):
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
     def test_sparse_compressed_constructor(self, layout, device, dtype,
                                            use_factory_function, shape_and_device_inference, input_kind):
-        if input_kind == 'list' and shape_and_device_inference and torch.device(device).type == 'cuda':
-            # list inputs to factory/constructor function without
-            # specifying device will result a sparse compressed tensor
-            # on CPU. So, skip testing against cuda device as unused.
-            self.skipTest("nothing to test")
+        if input_kind == 'list' and shape_and_device_inference:
+            if torch.device(device).type == 'cuda':
+                # list inputs to factory/constructor function without
+                # specifying device will result a sparse compressed tensor
+                # on CPU. So, skip testing against cuda device as unused.
+                self.skipTest("nothing to test")
+            if dtype not in {torch.float32, torch.complex64, torch.int64, torch.bool}:
+                self.skipTest("dtype not supported with list values")
 
         expected_devices = [torch.device(device)]
         if TEST_CUDA and torch.device(device).type == 'cuda' and torch.cuda.device_count() >= 2 and not shape_and_device_inference:
@@ -369,29 +230,34 @@ def test_sparse_compressed_constructor(self, layout, device, dtype,
             torch.sparse_bsc: torch.sparse_bsc_tensor,
         }[layout]
         compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[layout]
-        for index_dtype in [torch.int32, torch.int64]:
+        if input_kind == 'list':
+            index_dtypes = [torch.int64]
+        else:
+            index_dtypes = [torch.int32, torch.int64]
+        for index_dtype in index_dtypes:
             for expected_device in expected_devices:
-                for compressed_indices, plain_indices, values, size in self._generate_small_inputs(
-                        layout, expected_device, dtype, index_dtype):
+                for (compressed_indices, plain_indices, values), kwargs in self.generate_simple_inputs(
+                        layout, device=expected_device, dtype=dtype, index_dtype=index_dtype,
+                        # skip zero-sized tensors for list inputs:
+                        enable_zero_sized=input_kind != 'list',
+                        output_tensor=False):
+                    size = kwargs['size']
+                    if shape_and_device_inference and 0 in size:
+                        # skip shape inference for zero-sized tensor
+                        # inputs because (i) the shape determined from
+                        # an empty list is ambiguous, and (ii) the
+                        # size of the plain dimension defined as
+                        # max(plain_indices) is undefined if
+                        # plain_indices has no values
+                        continue
+                    compressed_indices_expect = compressed_indices
+                    plain_indices_expect = plain_indices
+                    values_expect = values
+
                     if input_kind == 'list':
-                        if size == (0, 0):
-                            # for this degenerate case, plain_indices must
-                            # remain a tensor because
-                            # tensor(plain_indices) results a float dtype
-                            # when plain_indices is an empty list
-                            if index_dtype == torch.int32:
-                                # skip testing int32 case because
-                                # tensor(compressed_indices) results a
-                                # int64 dtype when compressed_indices is
-                                # [0] (a list of single int zero).
-                                continue
-                        else:
-                            plain_indices = plain_indices.tolist()
                         compressed_indices = compressed_indices.tolist()
+                        plain_indices = plain_indices.tolist()
                         values = values.tolist()
-                        if size == (0, 0) and layout in {torch.sparse_bsr, torch.sparse_bsc}:
-                            # in the block sparse case, values of type list needs to represent a 3-D tensor
-                            values = [[[]]]
 
                     if use_factory_function:
                         if shape_and_device_inference:
@@ -407,9 +273,9 @@ def test_sparse_compressed_constructor(self, layout, device, dtype,
                                                                     dtype=dtype, layout=layout, device=expected_device)
                     self.assertEqual(layout, sparse.layout)
                     self.assertEqual(size, sparse.shape)
-                    self.assertEqual(compressed_indices, compressed_indices_mth(sparse))
-                    self.assertEqual(plain_indices, plain_indices_mth(sparse))
-                    self.assertEqual(values, sparse.values())
+                    self.assertEqual(compressed_indices_expect, compressed_indices_mth(sparse))
+                    self.assertEqual(plain_indices_expect, plain_indices_mth(sparse))
+                    self.assertEqual(values_expect, sparse.values())
                     self.assertEqual(sparse.device, sparse.values().device)
                     self.assertEqual(sparse.device, expected_device)
 
@@ -455,10 +321,10 @@ def test_empty_errors(self, layout, device, dtype):
     @all_sparse_compressed_layouts()
     @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16))
     def test_clone(self, layout, device, dtype):
-        for compressed_indices, plain_indices, values, size in self._generate_small_inputs(
-                layout, device, dtype, index_dtype=torch.int32):
-            sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size,
-                                                    dtype=dtype, layout=layout, device=device)
+        for sparse in self.generate_simple_inputs(
+                layout, device=device, dtype=dtype, index_dtype=torch.int32,
+                # Temporarily disable testing batch block tensors:
+                enable_batch=layout in {torch.sparse_csr, torch.sparse_csc}):
             cloned_sparse = sparse.clone()
             self.assertEqual(sparse, cloned_sparse)
 
@@ -467,10 +333,37 @@ def test_print(self, layout, device):
         compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[layout]
         printed = []
         for enable_hybrid in [False, True]:
+            # using local patterns for test_print stability
+            patterns = [
+                # 2 x 3 batch of 3 x 2 tensors, trivial blocksize, non-hybrid/hybrid:
+                ([[[[1, 2, 0],
+                    [1, 0, 3]],
+                   [[1, 2, 3],
+                    [1, 0, 0]],
+                   [[1, 0, 0],
+                    [1, 2, 3]]],
+                  [[[0, 2, 0],
+                    [1, 2, 3]],
+                   [[1, 0, 3],
+                    [1, 2, 0]],
+                   [[1, 2, 3],
+                    [0, 2, 0]]]], [(2, 1)], [(), (4,)] if enable_hybrid else [()]),
+                # tensor with non-trivial blocksize, non-hybrid/hybrid:
+                ([[0, 1, 0, 2, 0, 2],
+                  [0, 1, 0, 0, 2, 0],
+                  [3, 3, 3, 0, 0, 0],
+                  [0, 0, 0, 0, 0, 0],
+                  [0, 5, 0, 6, 6, 6],
+                  [5, 0, 5, 6, 6, 6],
+                  [0, 0, 0, 0, 8, 8],
+                  [7, 7, 7, 0, 8, 8]], [(2, 3)], [(), (4, 2)] if enable_hybrid else [()]),
+            ]
             for index_dtype in [torch.int32, torch.int64]:
                 for dtype in [torch.float32, torch.float64]:
-                    for compressed_indices, plain_indices, values, size in self._generate_small_inputs(
-                            layout, device, dtype, index_dtype, enable_hybrid=enable_hybrid):
+                    for (compressed_indices, plain_indices, values), kwargs in self.generate_simple_inputs(
+                            layout, device=device, dtype=dtype, index_dtype=index_dtype, enable_hybrid=enable_hybrid,
+                            enable_zero_sized=False, output_tensor=False, patterns=patterns):
+                        size = tuple(kwargs['size'])
                         block_ndim = 2 if layout in {torch.sparse_bsr, torch.sparse_bsc} else 0
                         base_ndim = 2
                         batch_ndim = compressed_indices.dim() - 1
@@ -647,9 +540,7 @@ def test_consistency(self, layout, device, dtype, op):
     @all_sparse_compressed_layouts('layout2')
     @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16))
     def test_empty_like(self, layout, layout2, device, dtype):
-        for compressed_indices, plain_indices, values, size in self._generate_small_inputs(layout):
-            sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size,
-                                                    dtype=dtype, layout=layout, device=device)
+        for sparse in self.generate_simple_inputs(layout):
             if layout == layout2:
                 result = torch.empty_like(sparse, layout=layout2)
                 compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[result.layout]
@@ -671,14 +562,28 @@ def test_empty_like(self, layout, layout2, device, dtype):
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
     def test_validate(self, layout, device, dtype):
         for index_dtype in [torch.int32, torch.int64]:
-            for compressed_indices, plain_indices, values, size in self._generate_small_inputs(
-                    layout, device, dtype, index_dtype, enable_batched=True, enable_hybrid=True):
+            for (compressed_indices, plain_indices, values), kwargs in self.generate_simple_inputs(
+                    layout, device=device, dtype=dtype, index_dtype=index_dtype, output_tensor=False):
+                size = kwargs['size']
                 torch._validate_sparse_compressed_tensor_args(compressed_indices, plain_indices, values, size, layout)
 
     def _generate_invalid_input(self, layout, device):
         from functools import partial
 
-        shape, values = self._generate_small_inputs_utils(layout, device=device)
+        def shape(shape, basedim=0):
+            blocksize = (1, 1)
+            if layout is torch.sparse_csc:
+                shape = shape[:basedim] + (shape[basedim + 1], shape[basedim]) + shape[basedim + 2:]
+            elif layout is torch.sparse_bsc:
+                shape = shape[:basedim] + (shape[basedim + 1] * blocksize[1], shape[basedim] * blocksize[0]) + shape[basedim + 2:]
+            elif layout is torch.sparse_bsr:
+                shape = shape[:basedim] + (shape[basedim] * blocksize[0], shape[basedim + 1] * blocksize[1]) + shape[basedim + 2:]
+            return shape
+
+        def values(lst, device=device):
+            if layout in {torch.sparse_bsr, torch.sparse_bsc}:
+                lst = [[[item]] for item in lst]
+            return torch.tensor(lst, device=device)
 
         tensor = partial(torch.tensor, device=device)
         values = partial(values, device=device)
@@ -925,7 +830,8 @@ def test_invalid_input(self, layout, device, target):
     @onlyCPU
     @all_sparse_compressed_layouts()
     def test_dim(self, layout):
-        for compressed_indices, plain_indices, values, size in self._generate_small_inputs(layout):
+        for (compressed_indices, plain_indices, values), kwargs in self.generate_simple_inputs(layout, output_tensor=False):
+            size = kwargs['size']
             batch_dim = compressed_indices.dim() - 1
             sparse_dim = 2
             block_dim = 2 if layout in {torch.sparse_bsr, torch.sparse_bsc} else 0
@@ -940,10 +846,7 @@ def test_dim(self, layout):
     @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16))
     def test_to_dtype(self, layout, device, dtype):
         # to_dense does not support hybrid inputs
-        input_gen = self._generate_small_inputs(layout, device=device, enable_hybrid=False)
-        for compressed_indices, plain_indices, values, size in input_gen:
-            sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size,
-                                                    dtype=dtype, layout=layout, device=device)
+        for sparse in self.generate_simple_inputs(layout, dtype=dtype, device=device, enable_hybrid=False):
             for to_dtype in all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16):
                 sparse_to_dtype = sparse.to(to_dtype)
                 dense_to_dtype = sparse.to_dense().to(to_dtype)
@@ -955,10 +858,7 @@ def test_to_dtype(self, layout, device, dtype):
     def test_pickle(self, layout, dtype, device):
         import pickle
 
-        input_gen = self._generate_small_inputs(layout)
-        for compressed_indices, plain_indices, values, size in input_gen:
-            sparse = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, size,
-                                                    dtype=dtype, device=device, layout=layout)
+        for sparse in self.generate_simple_inputs(layout, device=device, dtype=dtype):
             serialized = pickle.dumps(sparse)
             sparse_loaded = pickle.loads(serialized)
 
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 3b3e81439c482..dc8b04d063718 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -2446,6 +2446,296 @@ def genSparseTensor(self, size, sparse_dim, nnz, is_uncoalesced, device, dtype):
             x = x.detach().clone()._coalesced_(False)
         return x, x._indices().clone(), x._values().clone()
 
+    def generate_simple_inputs(self, layout,
+                               device=None,
+                               dtype=None,
+                               index_dtype=None,
+                               enable_batch=True,
+                               enable_hybrid=True,
+                               enable_zero_sized=True,
+                               enable_batch_variable_nse=False,
+                               output_tensor=True,
+                               patterns=None):
+        """Generator of simple inputs for tensor constructors of the given layout.
+
+        The generated tensor inputs have the following properties:
+
+        - tensor shapes are minimal but not trivial
+        - tensor values are sorted sequences for COO and CSR formats, e.g. [1, 2, 3, 4]
+        - the generated tensors represent the same mathematical tensor for all layouts
+        - the generated tensors include regular, zero-sized, and optionally, batched or/and hybrid tensors.
+
+        If output_tensor is True, yield tensors with the given
+        layout. Otherwise, yield inputs to the corresponding tensor
+        constructors:
+
+          - sparse compressed input is defined as
+            (compressed_indices, plain_indices, values), dict(size=expected_size_from_shape_inference, device=device, dtype=dtype)
+
+          - sparse COO input is defined as
+            (indices, values), dict(size=expected_size_from_shape_inference, device=device, dtype=dtype)
+
+          - strided input is defined as
+            (values,), dict(device=device, dtype=dtype)
+        """
+        if index_dtype is None:
+            index_dtype = torch.int64
+
+        is_compressed_sparse_layout = layout in {torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc}
+
+        if output_tensor:
+            for args, kwargs in self.generate_simple_inputs(layout, device=device, dtype=dtype, index_dtype=index_dtype,
+                                                            enable_batch=enable_batch, enable_hybrid=enable_hybrid,
+                                                            enable_zero_sized=enable_zero_sized,
+                                                            enable_batch_variable_nse=enable_batch_variable_nse,
+                                                            output_tensor=False):
+                if layout is torch.strided:
+                    assert len(args) == 1
+                    size = kwargs.pop('size', None)  # to ensure that a zero-sized tensor has the desired shape
+                    assert size is not None
+                    yield args[0].reshape(size)
+                elif layout is torch.sparse_coo:
+                    yield torch.sparse_coo_tensor(*args, **kwargs)
+                elif is_compressed_sparse_layout:
+                    kwargs.update(layout=layout)
+                    yield torch.sparse_compressed_tensor(*args, **kwargs)
+                else:
+                    assert 0  # unreachable
+            return
+
+        def get_blockpattern(pattern, blocksize):
+            basesize = pattern.shape
+            assert basesize[0] % blocksize[0] == 0, (basesize, blocksize)
+            assert basesize[1] % blocksize[1] == 0, (basesize, blocksize)
+            blockpattern = pattern.reshape(-1,
+                                           blocksize[0],
+                                           basesize[1] // blocksize[1],
+                                           blocksize[1]).transpose(-3, -2).any(-1).any(-1)
+            block_ids = torch.arange(1, blockpattern.numel() + 1).reshape(blockpattern.shape)
+            return (blockpattern != 0) * block_ids
+
+        def get_sparse_data(pattern):
+            basesize = pattern.shape
+            assert len(basesize) == 2, basesize  # pattern is expected to be a matrix
+
+            # We cannot use `torch.sparse_xyz_tensor(pattern)` to
+            # compute the sparse layout indices and values because
+            # generate_simple_inputs is used to generate the inputs to
+            # test `torch.sparse_xyz_tensor` factory functions, so
+            # we'll compute the indices and values independently of
+            # the factory functions.
+
+            indices = torch.where(pattern != 0)
+            coo_indices = torch.stack(indices)
+            crow_indices = torch.zeros(basesize[0] + 1, dtype=torch.int64)
+            crow_indices[1:] = torch.cumsum(coo_indices[0].bincount(minlength=basesize[0]), 0)
+            col_indices = coo_indices[1]
+            strided_values = torch.zeros(basesize, dtype=torch.int64)
+
+            # the property of `values == range(1, 1+nnz)` is used in
+            # get_sparse_data_with_block to relate BSR and BSC values,
+            # so, don't change the following line:
+            values = torch.arange(1, 1 + len(indices[0]), dtype=torch.int64)
+            strided_values[indices] = values
+
+            indices_T = torch.where(pattern.transpose(0, 1) != 0)
+            coo_indices_T = torch.stack(indices_T)
+            ccol_indices = torch.zeros(basesize[1] + 1, dtype=torch.int64)
+            ccol_indices[1:] = torch.cumsum(coo_indices_T[0].bincount(minlength=basesize[1]), 0)
+            row_indices = coo_indices_T[1]
+            csc_values = strided_values.transpose(0, 1)[indices_T]
+
+            return {torch.sparse_coo: (coo_indices, values),
+                    torch.sparse_csr: (crow_indices, col_indices, values),
+                    torch.sparse_csc: (ccol_indices, row_indices, csc_values),
+                    torch.strided: (strided_values,)}
+
+        def get_sparse_data_with_block(pattern, blocksize):
+            nonblock_data = get_sparse_data(pattern)
+            blockpattern = get_blockpattern(pattern, blocksize)
+            block_data = get_sparse_data(blockpattern)
+
+            strided_values = nonblock_data[torch.strided][0]
+            block_indices = block_data[torch.sparse_coo][0]
+            bsr_values = torch.stack([strided_values[bi * blocksize[0]:(bi + 1) * blocksize[0],
+                                                     bj * blocksize[1]:(bj + 1) * blocksize[1]]
+                                      for bi, bj in block_indices.transpose(0, 1)])
+
+            # here we use the property `values == range(1, 1+nnz)` and
+            # `values` relation to `csc_values` (see get_sparse_data)
+            # to get BSC blocks via reordering the BSR blocks:
+            bsc_values = bsr_values[block_data[torch.sparse_csc][2] - 1]
+
+            return {torch.sparse_bsr: (*block_data[torch.sparse_csr][:2], bsr_values),
+                    torch.sparse_bsc: (*block_data[torch.sparse_csc][:2], bsc_values),
+                    **nonblock_data}
+
+        def get_batch_sparse_data(pattern, blocksize):
+            size = pattern.shape
+            if len(size) <= 2:  # non-batch
+                return get_sparse_data_with_block(pattern, blocksize)
+
+            # batch data is created recursively:
+            batch_data = {}
+            for i, item in enumerate(pattern):
+                for layout, d in get_batch_sparse_data(item, blocksize).items():
+                    target = batch_data.get(layout)
+                    if layout is torch.sparse_coo:
+                        # a "batch COO" means a COO with the leading
+                        # sparse dimensions interpreted as batch
+                        # dimensions
+                        ext_coo_indices1 = torch.cat((torch.full((1, len(d[1])), i, dtype=torch.int64), d[0]))
+                        if target is None:
+                            target = batch_data[layout] = (ext_coo_indices1, d[1])
+                        else:
+                            target[0].set_(torch.cat((target[0], ext_coo_indices1), 1))
+                            target[1].set_(torch.cat((target[1], d[1])))
+                    else:
+                        if target is None:
+                            target = batch_data[layout] = tuple(d[j].unsqueeze(0) for j in range(len(d)))
+                        else:
+                            for j in range(len(d)):
+                                target[j].set_(torch.cat((target[j], d[j].unsqueeze(0))))
+            return batch_data
+
+        def generate_values(base, densesize):
+            """Generates a tensor of shape densesize with values equal to
+
+              base + i_1 * 10^0 + ... + i_d * 10^{d - 1}
+
+            at indices i_1, ..., i_d (with 0 <= i_j < densesize[j] for any 1 <= j <=
+            len(densesize))
+
+            This mapping produces unique values as long as
+            densesize[i] < 10 for all i in range(len(densesize)).
+            """
+
+            if not densesize:
+                return base
+            if not isinstance(base, int) and base.ndim > 0:
+                return torch.stack([generate_values(b, densesize) for b in base])
+            if base == 0:
+                return torch.zeros(densesize, dtype=torch.int64)
+            r = torch.arange(densesize[0], dtype=torch.int64)
+            for i, d in enumerate(densesize[1:]):
+                y = torch.arange(d, dtype=torch.int64) * (10 ** (i + 1))
+                r = r[..., None] + y[None, ...]
+            r.add_(base)
+            return r
+
+        if patterns is None:
+            # A pattern is a 3-tuple with the following items:
+            #
+            # - a list of integers with the depth of two or more. The
+            #   integers define the sparsity patterns of the generated
+            #   inputs: zero values correspond to unspecified
+            #   elements/blocks, and non-zero values to the specified
+            #   elements.
+            #
+            #   For debugging convenience, the elements with the same
+            #   value typically belong to the same block. However, it
+            #   is not a hard requirement: as long as the shape of a
+            #   pattern divides with block sizes, the pattern will be
+            #   a valid one.
+            #
+            #   If the depth of the list is larger than two, inputs
+            #   with batch dimensions will be generated.
+            #
+            # - a list of 2-tuples of block sizes, used to generate
+            #   BSR/BSC tensors with various block size parameters
+            #
+            # - a list of tuples of dense dimensions, used to generate
+            #   hybrid tensors with various dense dimensions
+            #
+            patterns = [
+                # a simple 3 x 2 tensor: non-hybrid, hybrid with 1 and 2 dense dimensions
+                ([[1, 2, 0],
+                  [1, 0, 3]], [(2, 1), (1, 3)], [(), (2,), (4, 5)]),
+                # 2 x 3 batch of 3 x 2 tensors: non-hybrid and hybrid with 2 dense dimensions
+                ([[[[1, 2, 0],
+                    [1, 0, 3]],
+                   [[1, 2, 3],
+                    [1, 0, 0]],
+                   [[1, 0, 0],
+                    [1, 2, 3]]],
+                  [[[0, 2, 0],
+                    [1, 2, 3]],
+                   [[1, 0, 3],
+                    [1, 2, 0]],
+                   [[1, 2, 3],
+                    [0, 2, 0]]]], [(2, 1), (2, 3)], [(), (2,)]),
+                # tensor with non-trivial blocksize
+                ([[0, 1, 0, 2, 0, 2],
+                  [0, 1, 0, 0, 2, 0],
+                  [3, 3, 3, 0, 0, 0],
+                  [0, 0, 0, 0, 0, 0],
+                  [0, 5, 0, 6, 6, 6],
+                  [5, 0, 5, 6, 6, 6],
+                  [0, 0, 0, 0, 8, 8],
+                  [7, 7, 7, 0, 8, 8]], [(2, 3)], [(), (4, 5)]),
+                # batch tensor with variable NSE
+                # Requires https://github.com/pytorch/pytorch/pull/84843 or similar.
+                ([[[1, 2],
+                   [3, 4]],
+                  [[1, 0],
+                   [0, 0]]], [(1, 1)], ([()] if enable_batch_variable_nse else []))]
+
+        # the main loop of the method:
+        for pattern, blocksizes, densesizes in patterns:
+            if not enable_hybrid:
+                densesizes = [s for s in densesizes if not s]
+            if not (densesizes and blocksizes):
+                continue
+            pattern = torch.tensor(pattern, dtype=torch.int64)
+            if not enable_batch and pattern.ndim > 2:
+                continue
+            for blocksize in blocksizes:
+                data = get_batch_sparse_data(pattern, blocksize)[layout]
+                for densesize in densesizes:
+                    indices = [a.to(device=device, dtype=index_dtype) for a in data[:-1]]
+                    values = generate_values(data[-1], densesize).to(device=device, dtype=dtype)
+                    yield (*indices, values), dict(device=device, dtype=dtype,
+                                                   size=pattern.shape + densesize)
+
+        # zero-sized tensor inputs, non-batch, non-hybrid/hybrid
+        if enable_zero_sized:
+            for basesize, blocksizes, densesizes in [
+                    ((2, 0), [(1, 2)], [(), (2,), (2, 3)] if enable_hybrid else [()]),
+                    ((0, 2), [(1, 2), (2, 1), (3, 2)], [()]),
+                    ((0, 0), [(1, 2)], [()]),
+            ]:
+                for blocksize in blocksizes:
+                    for densesize in densesizes:
+                        if layout == torch.strided:
+                            indices = ()
+                            values = torch.empty((basesize + densesize), device=device, dtype=dtype)
+                        elif layout == torch.sparse_coo:
+                            indices = (torch.empty(len(basesize), 0, device=device, dtype=index_dtype),)
+                            values = torch.empty((0, *densesize), device=device, dtype=dtype)
+                        elif layout == torch.sparse_csr:
+                            crow_indices = torch.tensor([0] * (basesize[0] + 1), device=device, dtype=index_dtype)
+                            col_indices = torch.empty(0, device=device, dtype=index_dtype)
+                            indices = (crow_indices, col_indices)
+                            values = torch.empty((0, *densesize), device=device, dtype=dtype)
+                        elif layout == torch.sparse_csc:
+                            ccol_indices = torch.tensor([0] * (basesize[1] + 1), device=device, dtype=index_dtype)
+                            row_indices = torch.empty(0, device=device, dtype=index_dtype)
+                            indices = (ccol_indices, row_indices)
+                            values = torch.empty((0, *densesize), device=device, dtype=dtype)
+                        elif layout == torch.sparse_bsr:
+                            crow_indices = torch.tensor([0] * (basesize[0] // blocksize[0] + 1), device=device, dtype=index_dtype)
+                            col_indices = torch.empty(0, device=device, dtype=index_dtype)
+                            indices = (crow_indices, col_indices)
+                            values = torch.empty((0, *blocksize, *densesize), device=device, dtype=dtype)
+                        elif layout == torch.sparse_bsc:
+                            ccol_indices = torch.tensor([0] * (basesize[1] // blocksize[1] + 1), device=device, dtype=index_dtype)
+                            row_indices = torch.empty(0, device=device, dtype=index_dtype)
+                            indices = (ccol_indices, row_indices)
+                            values = torch.empty((0, *blocksize, *densesize), device=device, dtype=dtype)
+                        else:
+                            assert 0  # unreachable
+                        yield (*indices, values), dict(device=device, dtype=dtype, size=basesize + densesize)
+
     def safeToDense(self, t):
         # coalesce is only implemented for COO
         if t.layout == torch.sparse_coo:

From 27cbfc4460cc04966fa9c18bc8c95c550ca93edf Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 29 Nov 2022 22:57:55 +0000
Subject: [PATCH 1403/1922] [FSDP] Remove unneeded stream sync from
 `clip_grad_norm_()` (#89308)

We do not need to have the pre-unshard and unshard streams wait for the computation stream because we are not using the pre-unshard or unshard streams in `clip_grad_norm_()`.

The other change is simply avoiding a loop to get `grads`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89308
Approved by: https://github.com/mrshenli
---
 .../fsdp/fully_sharded_data_parallel.py        | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 78b10dbd07498..044ad2e8f59b9 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -56,7 +56,6 @@
     _reshard,
     _root_pre_forward,
     _should_free_in_backward,
-    _wait_for_computation_stream,
 )
 from torch.distributed.fsdp._wrap_utils import _auto_wrap
 from torch.distributed.fsdp.api import (
@@ -1188,11 +1187,6 @@ def clip_grad_norm_(
                 "`clip_grad_norm_()` should only be called on the root FSDP instance"
             )
         self._assert_state(TrainingState.IDLE)
-        _wait_for_computation_stream(
-            torch.cuda.current_stream(),
-            self._streams["unshard"],
-            self._streams["pre_unshard"],
-        )
         # If every FSDP instance uses `NO_SHARD`, then we can directly use
         # the normal `nn.utils` one targeting local gradients
         all_no_shard = all(
@@ -1209,6 +1203,7 @@ def clip_grad_norm_(
         norm_type = float(norm_type)
         sharded_params = set()
         nonsharded_params = set()  # `NO_SHARD` or not FSDP-managed
+        grads: List[torch.Tensor] = []
         for handle in FullyShardedDataParallel._fsdp_handles(self):
             target_set = (
                 sharded_params if handle.uses_sharded_strategy else nonsharded_params
@@ -1216,14 +1211,20 @@ def clip_grad_norm_(
             if handle._use_orig_params:
                 for param in handle.flat_param._params:
                     target_set.add(param)
+                    if param.grad is not None:
+                        grads.append(param.grad)
             else:
                 target_set.add(handle.flat_param)
+                if handle.flat_param.grad is not None:
+                    grads.append(handle.flat_param.grad)
         for param in self.parameters():
             not_fsdp_managed = (
                 param not in sharded_params and param not in nonsharded_params
             )
             if not_fsdp_managed:
                 nonsharded_params.add(param)
+                if param.grad is not None:
+                    grads.append(param.grad)
         local_sharded_norm = _get_grad_norm(sharded_params, norm_type).to(
             self.compute_device
         )
@@ -1246,14 +1247,11 @@ def clip_grad_norm_(
         if self.cpu_offload.offload_params:
             total_norm = total_norm.cpu()
 
-        clip_coef = torch.tensor(
-            max_norm, dtype=total_norm.dtype, device=total_norm.device
-        ) / (total_norm + 1e-6)
+        clip_coef = max_norm / (total_norm + 1e-6)
         # Multiplying by the clamped coefficient is meaningless when it is
         # equal to 1, but it avoids the host-device sync that would result from
         # `if clip_coef < 1`
         clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
-        grads = [param.grad for param in self.parameters() if param.grad is not None]
         for grad in grads:
             grad.detach().mul_(clip_coef_clamped.to(grad.device))
         return total_norm

From 5848e03735405e4e8e57d4248676e9a4cda16aa2 Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Wed, 30 Nov 2022 02:23:27 +0000
Subject: [PATCH 1404/1922] [Checkpoint][2D][5/N] Add checkpoint_utils for
 distributed checkpoint to testing/_internal/distributed/ (#89873)

Moving checkpoint_utils from Tau: https://github.com/wz337/PiPPy/blob/6acf4054cfd10c8377d65fa1e4f18230d6711edd/spmd/testing/checkpoint_utils.py

Checkpoint_utils: add a wrapper to initialize a temp directory for checkpoint testing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89873
Approved by: https://github.com/XilunWu, https://github.com/awgu, https://github.com/fduwjj
---
 .../_internal/distributed/checkpoint_utils.py | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 torch/testing/_internal/distributed/checkpoint_utils.py

diff --git a/torch/testing/_internal/distributed/checkpoint_utils.py b/torch/testing/_internal/distributed/checkpoint_utils.py
new file mode 100644
index 0000000000000..d72810580335d
--- /dev/null
+++ b/torch/testing/_internal/distributed/checkpoint_utils.py
@@ -0,0 +1,37 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+import shutil
+import tempfile
+from functools import wraps
+from typing import Any, Callable, Dict, Optional, Tuple
+
+import torch.distributed as dist
+
+
+def with_temp_dir(
+    func: Optional[Callable] = None,
+) -> Optional[Callable]:
+    """
+    Wrapper to initialize temp directory for distributed checkpoint.
+    """
+    assert func is not None
+
+    @wraps(func)
+    def wrapper(self, *args: Tuple[object], **kwargs: Dict[str, Any]) -> None:
+        # Only create temp_dir when rank is 0
+        if dist.get_rank() == 0:
+            temp_dir = tempfile.mkdtemp()
+        else:
+            temp_dir = ""
+        object_list = [temp_dir]
+        # Broadcast temp_dir to all the other ranks
+        dist.broadcast_object_list(object_list)
+        self.temp_dir = object_list[0]
+        print(f"Using temp directory: {self.temp_dir }")
+        try:
+            func(self)
+        finally:
+            if dist.get_rank() == 0:
+                shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    return wrapper

From 835d920d4386df6f1679564b20a6e21e43da0afb Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 29 Nov 2022 22:56:37 +0000
Subject: [PATCH 1405/1922] [FSDP] Limit all gather after pre-unshard (#89057)

To reuse memory when allocating the unsharded `FlatParameter` in the unshard stream, we only need to block the CPU thread on the preceding free event (i.e. `event.synchronize()`) before allocating the unsharded memory, which happens in `handle.unshard()`. Notably, this can be done after the pre-unshard logic, which at most performs _sharded_ allocations (low precision shard or H2D sharded `FlatParameter` copy) in its own pre-unshard stream. This enables the pre-unshard to overlap with any pending ops.

With this change, I believe that we should use `limit_all_gathers=True` all the time to stay true to FSDP's proposed memory semantics.

If a user wants to set `limit_all_gathers=False`, that would mean that he/she wants to overlap ops that are issued after the unshard logic's all-gather with ops that are pending at the time when FSDP _would_ block the CPU thread via `event.synchronize()`.
- If the user is willing to not reuse memory for that all-gather, then the user may as well have applied `NO_SHARD` and optionally ZeRO-1 (if this niche is important, then maybe we should consider hardening ZeRO-1). This is because now the unsharded memory for the all-gather additionally contributes to peak memory since it cannot reuse memory.
- If the user wanted to reuse memory for that all-gather, then we needed to block the CPU thread. There is no way around that given the caching allocator semantics.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89057
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/_exec_order_utils.py | 8 ++++++--
 torch/distributed/fsdp/_runtime_utils.py    | 8 ++++----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/torch/distributed/fsdp/_exec_order_utils.py b/torch/distributed/fsdp/_exec_order_utils.py
index a7082113a3859..222afa775138c 100644
--- a/torch/distributed/fsdp/_exec_order_utils.py
+++ b/torch/distributed/fsdp/_exec_order_utils.py
@@ -48,7 +48,7 @@ def __init__(
         self.handles_post_forward_order: List[_HandlesKey] = []
         # Maps each handles key to its index in `handles_post_forward_order`
         self.handles_to_post_forward_order_index: Dict[_HandlesKey, int] = {}
-        self.is_first_iter = True
+        self._iter = 0
 
         # Gives the max number of backward/forward prefetched all-gathers by a
         # single module
@@ -99,6 +99,10 @@ def init(
         # to check that all ranks have the same handles in the same order.
         # https://github.com/pytorch/pytorch/issues/79620
 
+    @property
+    def is_first_iter(self) -> bool:
+        return self._iter == 0
+
     def get_handles_to_backward_prefetch(
         self,
         current_handles_key: _HandlesKey,
@@ -375,7 +379,7 @@ def next_iter(self):
         called in the post-backward callback since that marks the true end of
         an iteration.
         """
-        self.is_first_iter = False
+        self._iter += 1
         self.handles_to_post_forward_order_index.clear()
         self.handles_post_forward_order.clear()
         if self._checking_order:
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 8cb0ce323179a..b2cd316e2bf81 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -138,10 +138,6 @@ def _unshard(
     """
     if not handles:
         return
-    if state.limit_all_gathers:
-        event = state._free_event_queue.dequeue_if_needed()
-        if event:
-            event.synchronize()
     any_ran_pre_unshard = False
     with torch.cuda.stream(pre_unshard_stream):
         for handle in handles:
@@ -149,6 +145,10 @@ def _unshard(
             any_ran_pre_unshard = any_ran_pre_unshard or ran_pre_unshard
     if any_ran_pre_unshard:
         unshard_stream.wait_stream(pre_unshard_stream)
+    if state.limit_all_gathers:
+        event = state._free_event_queue.dequeue_if_needed()
+        if event:
+            event.synchronize()
     with torch.cuda.stream(unshard_stream):
         for handle in handles:
             handle.unshard()

From 6796dca74d6854a5a3fcaaadc63775b627da2c61 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 29 Nov 2022 22:56:37 +0000
Subject: [PATCH 1406/1922] [FSDP] Include module classes in
 `ModuleWrapPolicy.__repr__` (#89058)

Before:
```
<torch.distributed.fsdp.wrap.ModuleWrapPolicy object at 0x7fd4280f0fd0>
```
After:
```
<torch.distributed.fsdp.wrap.ModuleWrapPolicy object at 0x7fd4280f0fd0>({<class 'transformers.models.t5.modeling_t5.T5Block'>})
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89058
Approved by: https://github.com/mrshenli
---
 torch/distributed/fsdp/wrap.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/distributed/fsdp/wrap.py b/torch/distributed/fsdp/wrap.py
index e20c07f18d132..0fbaee2e61243 100644
--- a/torch/distributed/fsdp/wrap.py
+++ b/torch/distributed/fsdp/wrap.py
@@ -88,11 +88,15 @@ def __init__(self, module_classes: Set[Type[nn.Module]]):
             _module_wrap_policy,
             module_classes=module_classes,
         )
+        self._module_classes_str = str(module_classes)
 
     @property
     def policy(self):
         return self._policy
 
+    def __repr__(self) -> str:
+        return super().__repr__() + f"({self._module_classes_str})"
+
 
 def lambda_auto_wrap_policy(
     module: nn.Module, recurse: bool, nonwrapped_numel: int, lambda_fn: Callable

From 8ccd4b8cc29aba23990bfdc4bea8918daaeed88f Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Wed, 30 Nov 2022 04:25:31 +0000
Subject: [PATCH 1407/1922] Add arguments to collect_results (#89611)

Fixes https://github.com/pytorch/torchdynamo/issues/1901. Test script:
```python
import copy

import torch
import torch._dynamo as dynamo
import torch._dynamo.config

dynamo.config.repro_after = "dynamo"
dynamo.config.repro_level = 4

def custom_backend(gm: torch.fx.GraphModule, example_inputs):
    gm = copy.deepcopy(gm)
    for node in gm.graph.nodes:
        if len(node.args) > 1:
            node.target = torch.add
            node.args = (node.args[0], 0)
    gm.recompile()
    return gm

inp = torch.ones(5)
inp.requires_grad_(True)

@dynamo.optimize(custom_backend)
def foo(x):
    x = x * x
    return x.sum()

y = foo(inp)
print(y)
y.backward()
print(inp.grad)
```
Before, the script will finish but output an incorrect gradient. After the change, the accuracy minifier is triggered.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89611
Approved by: https://github.com/ezyang
---
 torch/_dynamo/debug_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 6af1e6c4cfdd7..a1591d27b16f4 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -529,7 +529,7 @@ def run_fwd_maybe_bwd(gm, args, only_fwd=False):
     if requires_bwd_pass(out):
         loss = reduce_to_scalar_loss(out)
         loss.backward()
-    return collect_results(gm, out, None, [])
+    return collect_results(gm, out, None, args)
 
 
 def same_two_models(gm, opt_gm, example_inputs, only_fwd=False):

From 6706cc6dee22af6ae0979caf75b41ed21836454d Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 29 Nov 2022 01:27:19 +0000
Subject: [PATCH 1408/1922] update subscriber list (#89799)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89799
Approved by: https://github.com/mrshenli
---
 CODEOWNERS | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index bd40cd14ad540..546d76430266a 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -45,16 +45,16 @@ nn/qat/ @jerryzh168
 # Distributed package
 # This list is mostly if you'd like to be tagged as reviewer, feel free to add
 # or remove yourself from it.
-/torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501
-/torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501
+/torch/csrc/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
+/torch/distributed/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
 /torch/distributed/_composable @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501 @yhcharles
-/torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501
+/torch/nn/parallel/ @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
 
 # Distributed tests
 # This list is mostly if you'd like to be tagged as reviewer, feel free to add
 # or remove yourself from it.
-/test/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501
-/torch/testing/_internal/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501
+/test/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
+/torch/testing/_internal/distributed @mrshenli @pritamdamania87 @zhaojuanmao @rohan-varma @H-Huang @awgu @kwen2501 @wanchaol
 
 # ONNX Export
 /torch/csrc/jit/passes/onnx.h @bowenbao @abock

From 31709033d00d91619fb7a74a9ad2eaef02d7983e Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Wed, 30 Nov 2022 01:32:50 +0000
Subject: [PATCH 1409/1922] [dtensor] update README (#89800)

This PR updates README to include the RFC details
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89800
Approved by: https://github.com/mrshenli
---
 torch/distributed/_tensor/README.md | 168 +++++++++++++++++++++++++++-
 1 file changed, 166 insertions(+), 2 deletions(-)

diff --git a/torch/distributed/_tensor/README.md b/torch/distributed/_tensor/README.md
index 9bbd71b764e5f..4f10464e69264 100644
--- a/torch/distributed/_tensor/README.md
+++ b/torch/distributed/_tensor/README.md
@@ -1,3 +1,167 @@
-# Distributed Tensor
+# PyTorch DistributedTensor (DTensor)
 
-This is a prototype distributed tensor implementation that implements most of the basic parts in the RFC https://docs.google.com/document/d/15R3fmoPbzedlKSjtpQ97HFPidp9QTXLEap6gyIvRrMY/edit#
+This folder contains the DistributedTensor (a.k.a DTensor) implementation in PyTorch.
+
+## Introduction
+We propose distributed tensor primitives to allow easier distributed computation authoring in SPMD(Single Program Multiple Devices) paradigm. The primitives are simple but powerful when used to express tensor distributions with both sharding and replication parallelism strategies. This could empower native Tensor parallelism among other advanced parallelism explorations. For example, to shard a big tensor across devices with 3 lines of code:
+
+```python
+import torch
+from torch.distributed import DeviceMesh, Shard, distribute_tensor
+
+# Create a mesh topology with the available devices.
+mesh = DeviceMesh("cuda", list(range(world_size)))
+big_tensor = torch.randn(100000, 88)
+# Shard this tensor over the mesh by sharding `big_tensor`'s 0th dimension over the 0th dimension of `mesh`.
+my_dtensor = distribute_tensor(big_tensor, mesh, [Shard(dim=0)])
+```
+
+## Motivation
+
+Today there are mainly three ways to scale up distributed training: Data Parallel, Tensor Parallel and Pipeline Parallel. Each of them works on a separate dimension where solutions have been built independently (i.e. PyTorch DDP, FSDP, ShardedTensor, PiPPy, etc.). When training really large models, users would like to use these technologies together (i.e. 3-D Parallelism), while the interoperability of the existing solutions are not great and often hard to use (i.e. users might want arbitrary combinations of the data parallel, tensor parallel and pipeline parallel). This is becoming an issue for users and one of the biggest reasons is that there is no common abstraction that build the bridge between different parallelism strategies.
+
+An ideal scenario is that users could build their distributed program just like authoring in a single node/device, without worrying about how to do distributed training in a cluster, and our solutions could help them run distributed training in an efficient manner. For example, researchers just need to build the big transformer model, and PyTorch Distributed automatically figures out how to split the model and run pipeline parallel across different nodes, how to run data parallel and tensor parallel within each node. In order to achieve this, we need some common abstractions to distribute tensor values and distributed computations accordingly.
+
+There're many recent works that working on tensor level parallelism to provide common abstractions, see the `Related Works` in the last section for more details. Inspired by [GSPMD](https://arxiv.org/pdf/2105.04663.pdf), [Oneflow](https://arxiv.org/pdf/2110.15032.pdf) and [TF’s DTensor](https://www.tensorflow.org/guide/dtensor_overview), we introduce DistributedTensor as the next generation of ShardedTensor to provide basic abstractions for distributing storage and computation. It serves as one of the basic building blocks for distributed program translations and describes the layout of a distributed training program. With the DistributedTensor abstraction, we can seamlessly build parallelism strategies such as tensor parallelism, DDP and FSDP.
+
+## Value Propsition
+
+DistributedTensor primarily:
+-   Offers a uniform way to save/load `state_dict` during checkpointing, even when there’re complex tensor storage distribution strategies such as combining tensor parallelism with parameter sharding in FSDP.
+-   Enables Tensor Parallelism in eager mode. Compared to ShardedTensor, DistributedTensor allows additional flexibility to mix sharding and replication.
+-   Serves as the entry point of an SPMD programming model and the foundational building block for compiler-based distributed training.
+
+## PyTorch DistributedTensor
+
+### DistributedTensor API
+
+We offer both a lower level DistributedTensor API and a module level API to create a `nn.Module` with “distributed” parameters.
+
+#### Basic DistributedTensor API Examples
+
+Here are some basic DistributedTensor API examples that showcase:
+1. How to construct a DistributedTensor directly, to represent different types of sharding, replication, sharding + replication strategies.
+2. How to create DistributedTensor from a local `torch.Tensor`.
+3. How to “reshard” an existing DistributedTensor to a different DistributedTensor with modified placement strategy or world size.
+
+```python
+import torch
+import torch.distributed as distributed
+from torch.distributed import DTensor, DeviceMesh, Shard, Replicate, distribute_module
+
+# construct a device mesh with available devices (multi-host or single host)
+device_mesh = DeviceMesh(device_type="cuda", [0, 1, 2, 3])
+# if we want to do row-wise sharding
+rowwise_placement=[Shard(0)]
+# if we want to do col-wise sharding
+colwise_placement=[Shard(1)]
+# distributed tensor returned will be sharded across the dimension specified in placements
+distributed.empty((8, 12), device_mesh=device_mesh, placements=rowwise_placement)
+
+# if we want to do replication across a certain device list
+replica_placement = [Replicate()]
+# distributed tensor will be replicated to all four GPUs.
+distributed.empty((8, 12), device_mesh=device_mesh, placements=replica_placement)
+
+# if we want to distributed a tensor with both replication and sharding
+device_mesh = DeviceMesh(device_type="cuda", [[0, 1], [2, 3]])
+# replicate across the first dimension of device mesh, then sharding on the second dimension of device mesh
+spec=[Replicate(), Shard(0)]
+distributed.empty((8, 8), device_mesh=device_mesh, placements=spec)
+
+# create a DistributedTensor that shards on dim 0, from a local torch.Tensor
+local_tensor = torch.randn((8, 8), requires_grad=True)
+rowwise_tensor = DTensor.from_local(local_tensor, device_mesh, rowwise_placement)
+
+# reshard the current rowise tensor to a colwise tensor or replicate tensor
+colwise_tensor = rowwise_tensor.redistribute(device_mesh, colwise_placement)
+replica_tensor = colwise_tensor.redistribute(device_mesh, replica_placement)
+
+```
+
+#### High level User Facing APIs
+
+Users can use DistributedTensor tensor constructors directly to create a distributed tensor (i.e. `distributed.ones/empty`), but for existing modules like `nn.Linear` that are already having `torch.Tensor` as parameters, how to make them distributed parameters? We offer a way to directly distribute a `torch.Tensor` and a module level APIs to directly distribute the module parameters. Below is the high level API we introduce:
+
+```python
+def distribute_tensor(tensor: torch.Tensor, device_mesh: DeviceMesh=None, placements: List[Placement]=None):
+    '''
+    distribute the tensor according to device_mesh and placements, `tensor` could be a "meta" tensor.
+    '''
+
+def distribute_module(
+    module: nn.Module,
+    device_mesh: DeviceMesh=None,
+    partition_fn: Callable[[str, nn.Module, DeviceMesh], ...]=None,
+    input_fn: Callable[...., None]=None,
+    output_fn: Callable[...., None]=None,
+):
+    '''
+    This function converts all module parameters to distributed tensor parameters according to the `partition_fn` specified.
+    It could also control the input/output of the module by specifying the `input_fn` and `output_fn`.
+    '''
+```
+
+#### High level API examples:
+
+```python
+def MyModule(nn.Module):
+    def __init__(self):
+        super.__init__()
+        self.fc1 = nn.Linear(8, 8)
+        self.fc2 = nn.Linear(8, 8)
+        self.relu = nn.ReLU()
+
+    def forward(self, input):
+        return self.relu(self.fc1(input) + self.fc2(input))
+
+mesh = DeviceMesh(device_type="cuda", [[0, 1], [2, 3]])
+
+def shard_params(mod_name, mod, mesh):
+    rowwise_placement = [Shard(0)]
+    def to_dist_tensor(t): return distribute_tensor(t, mesh, rowwise_placement)
+    mod._apply(to_dist_tensor)
+
+sharded_module = distribute_module(model, device_mesh, partition_fn=shard_params)
+
+def shard_fc(mod_name, mod, mesh):
+    rowwise_placement = [Shard(0)]
+    if mod_name == "fc1":
+        mod.weight = torch.nn.Parameter(distribute_tensor(mod.weight, mesh, rowwise_placement))
+
+sharded_module = distribute_module(model, device_mesh, partition_fn=shard_fc)
+```
+
+## Compiler and DistributedTensor
+
+DistributedTensor provides efficient solutions for cases like Tensor Parallelism. But when using the DTensor's replication in a data parallel fashion, it might become observably slower compared to our existing solutions like DDP/FSDP. This is mainly because mainly because DDP/FSDP have a global view of the entire model architecture, thus could optimize for data parallel specifically, i.e. collective fusion and computation overlap, etc. In contract, DistributedTensor as a Tensor-like object can only optimize within individual tensor operations.
+
+To improve efficiency of DistributedTensor-based data parallel training, we are exploring a compiler-based solution on top of DistributedTensor, which can extract graph information from user programs to expose more performance optimization opportunities.
+
+## Related Works
+
+This work is mainly inspired by [GSPMD](https://arxiv.org/pdf/2105.04663.pdf), [Oneflow](https://arxiv.org/pdf/2110.15032.pdf) and [TF’s DTensor](https://www.tensorflow.org/guide/dtensor_overview). All of these three works use a single “distributed tensor” concept for both replication and sharding, and the solutions could enable users to build up their distributed training program in a uniform SPMD programming model. Specifically:
+
+GSPMD:
+-   GSPMD is now the fundamental component of JAX/TensorFlow distributed training and enables various optimizations with the XLA compiler to allow users to train their models efficiently in a large scale setting.
+-   Fundamentally, GSPMD have three types of sharding strategies within a tensor: “tiled”, “replicated”, “partially tiled” to represent sharding and replication.
+-   At the core of GSPMD Partitioner, it utilizes the XLA compiler to do advanced optimizations, i.e. sharding propagation and compiler based fusion.
+-   XLA mark_sharding API: PyTorch XLA’s [mark_sharding](https://github.com/pytorch/xla/pull/3476) API uses [XLAShardedTensor](https://github.com/pytorch/xla/issues/3871) abstraction (i.e. sharding specs) in PyTorch/XLA. Under the hood XLAShardedTensor is utilizing the GPSMD partitioner to enable SPMD style training on TPU.
+
+OneFlow GlobalTensor:
+
+-  OneFlow is building up their own solution of the “GlobalTensor” concept, which is a variant form of GSPMD sharding, allowing users to explore different parallel strategies with GlobalTensor.
+-  OneFlow also has three types of tensor, but they are slightly different from GSPMD: “split”, “broadcast”, and “partial sum”. They don’t use partially tiled and instead have a concept of partial sum to partition the values.
+
+TensorFlow DTensor:
+-   [DTensor Concepts](https://www.tensorflow.org/guide/dtensor_overview) is an extension of TensorFlow synchronous distributed training. its sharding API, supported features and its compilation passes with MLIR.
+-   DTensor also allows sharding and replication on an n-d mesh like device network.
+-   DTensor implements MLIR passes to do propagation and operator implementations.
+
+There are also several cutting edge research fields that embeds tensor sharding as part of the system, i.e. [Megatron-LM](https://arxiv.org/pdf/1909.08053.pdf) for tensor parallelism on Transformer based models. [DeepSpeed](https://github.com/microsoft/DeepSpeed) for training large scale models with different optimization techniques on top of tensor sharding.
+
+### Additional context
+
+RFC: https://github.com/pytorch/pytorch/issues/88838
+
+We are gathering early feedbacks about this proposal. We have also posted this [RFC](https://dev-discuss.pytorch.org/t/rfc-pytorch-distributedtensor/740) to the dev-discuss forum, please feel free to comment directly in the above issue or in the forum post. To see a complete design doc with additional details about DTesnor, please refer to this [doc](https://docs.google.com/document/d/1nFeJ8NSFNhNlCkNgWK31ZGRqm1L9rd0i_XN_RprphaI/edit#heading=h.6sovjqv9jiqn)

From 26864b0c4eee6ffd1cccb962cfe26b54c92255cb Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Tue, 29 Nov 2022 23:08:29 +0000
Subject: [PATCH 1410/1922] Update DDP docs for Dynamo/DDPOptimizer (#89096)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89096
Approved by: https://github.com/msaroufim
---
 docs/source/notes/ddp.rst | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/docs/source/notes/ddp.rst b/docs/source/notes/ddp.rst
index 3a52697069b7f..66dbc6d010545 100644
--- a/docs/source/notes/ddp.rst
+++ b/docs/source/notes/ddp.rst
@@ -65,7 +65,18 @@ updated, and all models on different processes should be exactly the same.
         os.environ["MASTER_PORT"] = "29500"
         main()
 
+DDP works with TorchDynamo.  When used with TorchDynamo, apply the DDP model wrapper
+before compiling the model, such that torchdynamo can apply ``DDPOptimizer``
+(graph-break optimizations) based on DDP bucket sizes.  (See `TorchDynamo DDPOptimizer <./ddp.html#torchdynamo-ddpoptimizer>`_ for more information.)
 
+TorchDynamo support for DDP currently requires setting `static_graph=True` and `find_unused_parameters=True`, due to
+interactions between the graph tracing process and DDP's mechanism for observing operations happening on its module,
+but this should be fixed ultimately.
+
+.. code::
+
+        ddp_model = DDP(model, device_ids=[rank])
+        ddp_model = torch.compile(ddp_model)
 
 Internal Design
 ^^^^^^^^^^^^^^^
@@ -193,3 +204,24 @@ DistributedDataParallel
 .. image:: https://user-images.githubusercontent.com/16999635/72313120-4e7c1c80-3658-11ea-9c6d-44336b2daeac.png
     :alt: ddp_code.png
     :width: 400 px
+
+
+TorchDynamo DDPOptimizer
+------------------------
+
+DDP's performance advantage comes from overlapping allreduce collectives with computations during backwards.
+AotAutograd prevents this overlap when used with TorchDynamo for compiling a whole forward and whole backward graph,
+becuase allreduce ops are launched by autograd hooks _after_ the whole optimized backwards computation finishes.
+
+TorchDynamo's DDPOptimizer helps by breaking the forward graph at the logical boundaries of DDP's allreduce buckets
+during backwards.  Note: the goal is to break the graph during backwards, and the simplest implementation is to
+break the forward graphs and then call AotAutograd and compilation on each section.  This allows DDP's allreduce hooks
+to fire in-between sections of backwards, and schedule communications to overlap with compute.
+
+See `this blog post <https://dev-discuss.pytorch.org/t/torchdynamo-update-9-making-ddp-work-with-torchdynamo/860/1>`_ for
+a more in-depth explanation and experimental results, or read the docs and code at
+`torch/_dynamo/optimizations/distributed.py <https://github.com/pytorch/pytorch/blob/4908a12542798a3e8641faae6b74f068fdfc6778/torch/_dynamo/optimizations/distributed.py#L56>`_
+
+To Debug DDPOptimizer, set `torch._dynamo.config.log_level` to DEBUG (for full graph dumps) or INFO
+(for basic info about bucket boundaries).  To disable DDPOptimizer, set `torch._dynamo.config.optimize_ddp=False`.
+DDP and TorchDynamo should still work correctly without DDPOptimizer, but with performance degradation.
\ No newline at end of file

From 125a3dae6897a6f65ea791164f492e65196630a3 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 30 Nov 2022 06:05:44 +0000
Subject: [PATCH 1411/1922] [Dynamo] Fix source/reconstruction bugs in NNModule
 named_* calls (#89729)

Fixes https://github.com/pytorch/torchdynamo/issues/1931

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89729
Approved by: https://github.com/ezyang
---
 test/dynamo/test_modules.py          | 59 ++++++++++++++++++++++++++++
 torch/_dynamo/variables/nn_module.py | 21 ++++++----
 2 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index 6dde69effff99..f510fb87522c5 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -595,6 +595,57 @@ def forward(self, x):
         return self.activation(self.linear(self.initializer + x)) * self.scale
 
 
+class ModuleForwardHasGraphBreak(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.layer1 = BasicModule()
+        self.layer2 = BasicModule()
+        self.layer3 = torch.nn.Sequential(BasicModule(), BasicModule())
+        self.layer4 = torch.nn.ModuleList(
+            [
+                torch.nn.Linear(10, 10),
+                torch.nn.ReLU(),
+                torch.nn.Linear(10, 10),
+                torch.nn.ReLU(),
+            ]
+        )
+        self.layer5 = torch.nn.ModuleDict(
+            {
+                "0": torch.nn.Linear(10, 10),
+            }
+        )
+        self.scale = torch.randn(1, 10)
+
+    def forward(self, x):
+        """
+        This is used to test if the results of functions like `named_parameters`
+        can be reconstructed correctly after graph break.
+
+        https://github.com/pytorch/torchdynamo/issues/1931
+        """
+        x = self.layer1(x)
+        params1 = dict(self.named_parameters())
+        params2 = list(self.parameters())
+        buffers1 = dict(self.named_buffers())
+        buffers2 = list(self.buffers())
+        modules1 = dict(self.named_modules())
+        modules2 = list(self.modules())
+        torch._dynamo.graph_break()
+        y = modules2
+        y = modules1
+        y = buffers2
+        y = buffers1
+        y = params2
+        y = params1
+        x = (
+            self.layer2(x)
+            + y["layer3.1.linear1.weight"]
+            + y["layer4.2.weight"]
+            + y["layer5.0.weight"]
+        )
+        return x * self.scale
+
+
 def make_test(fn, expected_ops=None):
     def test_fn(self):
         return torch._dynamo.testing.standard_test(
@@ -646,6 +697,14 @@ class NNModuleTests(torch._dynamo.test_case.TestCase):
     test_module_name_string = make_test(ModuleNameString())
     test_module_attribute_precedence = make_test(ModuleAttributePrecedence())
 
+    def test_module_forward_has_graph_break(self):
+        m = ModuleForwardHasGraphBreak()
+        x = torch.rand([10, 10])
+        ref = m(x)
+        opt_m = torch._dynamo.optimize("eager")(m)
+        res = opt_m(x)
+        self.assertTrue(torch.allclose(ref, res))
+
     def test_unsupportedmethod(self):
         m = UnsupportedMethodCall()
         i = torch.randn(10)
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 7dbd0ba331f9d..454daae1d1f63 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -1,7 +1,6 @@
 import functools
 import inspect
 import itertools
-import re
 import types
 from contextlib import contextmanager
 from typing import Dict, List
@@ -283,18 +282,15 @@ def get_kwargs(*names):
             bound_args = bound_args.arguments
             return {k: bound_args[k] for k in names}
 
-        def wrap_values(items, getsource=AttrSource):
+        def wrap_values(items):
             result = []
             for name, submod in items:
-                # layer.0.foo => layer[0].foo
-                name = re.sub(r"[.]([0-9]+)([.]|$)", r"[\1]\2", name)
-                src = NNModuleSource(getsource(self.source, name))
                 result.append(
                     tx.output.register_attr_or_module(
                         submod,
                         key,
                         name,
-                        source=src,
+                        source=NNModuleSource(gen_source(self.source, name)),
                         **options,
                     )
                 )
@@ -308,12 +304,21 @@ def named_embed(name, obj):
                         obj,
                         key,
                         name,
-                        source=NNModuleSource(AttrSource(self.source, name)),
+                        source=NNModuleSource(gen_source(self.source, name)),
                         **options,
                     ),
                 ]
             )
 
+        def gen_source(source, name):
+            name_split = name.split(".")
+            if name_split[0] == "":
+                return source
+            while len(name_split) > 0:
+                x = name_split.pop(0)
+                source = AttrSource(source, x)
+            return source
+
         if name == "children":
             assert not (args or kwargs)
             return wrap_values(module.named_children())
@@ -344,7 +349,7 @@ def named_embed(name, obj):
             return wrap_values(module.named_parameters(**get_kwargs("recurse")))
         elif name == "values":
             assert not (args or kwargs)
-            return wrap_values(module.items(), GetItemSource)
+            return wrap_values(module.items())
         elif name == "items":
             assert not (args or kwargs)
             result = []

From e110c5791b3b7564035fb3734809452fe9084bc9 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Wed, 30 Nov 2022 06:07:11 +0000
Subject: [PATCH 1412/1922] Dynamo, FX, Inductor Progress Bars (#88384)

There are 3 progress bars each gated behind their own config, all off by default for now
1. Dynamo: Macro level config for dynamo, AOT, inductor
2. FX: Progress bar for each pass, with their names
3. Inductor

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88384
Approved by: https://github.com/wconstab, https://github.com/mlazos
---
 torch/_dynamo/logging.py                | 21 +++++++++++++++++++++
 torch/_dynamo/optimizations/analysis.py |  1 +
 torch/_inductor/codecache.py            | 22 ++++++++++++++++++++--
 torch/_inductor/codegen/common.py       |  2 ++
 torch/_inductor/config.py               |  6 ++++++
 torch/_inductor/graph.py                |  1 +
 torch/_inductor/ir.py                   |  3 +++
 torch/_inductor/sizevars.py             |  1 +
 torch/_inductor/virtualized.py          |  3 +++
 torch/fx/config.py                      |  6 ++++++
 torch/fx/interpreter.py                 |  8 ++++++--
 11 files changed, 70 insertions(+), 4 deletions(-)
 create mode 100644 torch/fx/config.py

diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py
index 95ee727f1ddf1..b2fa67fbdf6ae 100644
--- a/torch/_dynamo/logging.py
+++ b/torch/_dynamo/logging.py
@@ -2,10 +2,15 @@
 import logging
 import os
 
+from torch.hub import tqdm
+
 # logging level for dynamo generated graphs/bytecode/guards
 logging.CODE = 15
 logging.addLevelName(logging.CODE, "CODE")
 
+# Disable progress bar by default, not in dynamo config because otherwise get a circular import
+disable_progress = True
+
 
 # Return all loggers that torchdynamo/torchinductor is responsible for
 def get_loggers():
@@ -78,8 +83,24 @@ def init_logging(log_level, log_file_name=None):
 
 _step_counter = itertools.count(1)
 
+# Update num_steps if more phases are added: Dynamo, AOT, Backend
+# This is very inductor centric
+# _inductor.utils.has_triton() gives a circular import error here
+
+if not disable_progress:
+    try:
+        import triton  # noqa: F401
+
+        num_steps = 3
+    except ImportError:
+        num_steps = 2
+    pbar = tqdm(total=num_steps, desc="torch.compile()", delay=15)
+
 
 def get_step_logger(logger):
+    if not disable_progress:
+        pbar.set_postfix_str(f"{logger.name}")
+        pbar.update(1)
     step = next(_step_counter)
 
     def log(level, msg):
diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index d83e57fdca6e2..f732fb322438f 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -21,6 +21,7 @@ def __init__(self, *args, **kwargs):
         self.input_alias_groups = set()
         self.storage_to_alias_group = dict()
         self.make_alias_group = itertools.count(1)
+        self.name = "ShapeAliasingAndMutation"
 
     def tensor_alias_group(self, value: torch.Tensor):
         """Assign a unique identifier to the storage of a given tensor"""
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index bca8c59830be8..7bca753034fa3 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -21,6 +21,8 @@
 from typing import Any, Callable, Dict, List
 
 import torch
+
+from torch.hub import tqdm
 from torch.utils import cpp_extension
 from . import config, cuda_properties, exc
 
@@ -560,7 +562,7 @@ def warm_pool(cls):
         if hasattr(pool, "_start_queue_management_thread"):
             pool._start_queue_management_thread()
         else:
-            for i in range(config.compile_threads):
+            for _ in range(config.compile_threads):
                 pool._adjust_process_count()
             pool._start_executor_manager_thread()
         _compile_end()
@@ -601,10 +603,26 @@ def task():
         return self.submit(task)
 
     def wait(self, scope: Dict[str, Any]):
+        num_kernels = len(
+            [
+                value
+                for key, value in scope.items()
+                if isinstance(value, (Future, TritonFuture))
+            ]
+        )
+        pbar = tqdm(
+            total=num_kernels,
+            desc="Inductor Compilation",
+            disable=config.disable_progress,
+            delay=15,
+        )
         if config.compile_threads > 1:
-            for key, result in list(scope.items()):
+            for key, result in scope.items():
+                if config.verbose_progress:
+                    pbar.set_postfix_str(key)
                 if isinstance(result, (Future, TritonFuture)):
                     scope[key] = result.result()
+                    pbar.update(1)
 
         _compile_end()
 
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index da64f3e63584e..329dbaaa18fc6 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -566,6 +566,8 @@ def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
 
     def __enter__(self):
         class CSEProxy:
+            self.name = "CSEProxy"
+
             @staticmethod
             def __getattr__(name):
                 def inner(*args, **kwargs):
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 1639fc6aa860a..545f545c5026a 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -4,6 +4,12 @@
 # add some debug printouts
 debug = False
 
+# Whether to disable a progress bar for autotuning
+disable_progress = True
+
+# Whether to enable printing the source code for each future
+verbose_progress = False
+
 # dead code elimination
 dce = False
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 44f136b356a71..9b6ff5f424e49 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -88,6 +88,7 @@ def __init__(
         self.randomness_seeds = []
         self.name_to_buffer = {}
         self.creation_time = time.time()
+        self.name = "GraphLowering"
 
     def get_dtype(self, buffer_name):
         if buffer_name in self.constants:
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 66d1f9658a406..b8e872cdeee5c 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3951,6 +3951,8 @@ def add_index(expr, category, buf_name=None):
             )
 
         class CaptureIndexing(V.WrapperHandler):
+            self.name = "CaptureIndexing"
+
             def load(self, name: str, index: sympy.Expr):
                 index = add_index(index, "reads", name)
                 return self._inner.load(name, index)
@@ -4033,6 +4035,7 @@ def __init__(self):
                 self.garbage_collect_values = False
                 self.env = {}
                 self.fetch_attr = submodules.__getitem__
+                self.name = V.get_ops_handler().name
 
         return InterpreterShim().run(V.get_ops_handler())
 
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index 67902bb23b2d3..baea5a109bba6 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -567,6 +567,7 @@ class SimplifyIndexing(V.WrapperHandler):  # type: ignore[name-defined]
 
     def __init__(self, inner, var_ranges: VarRanges):
         super().__init__(inner)
+        self.name = "SimplifyIndexing"
         self._simplify: Callable[
             [Expr], Expr
         ] = lambda index: V.graph.sizevars.simplify_with_ranges(index, var_ranges)
diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py
index 27e60b1daf1df..cff6770997371 100644
--- a/torch/_inductor/virtualized.py
+++ b/torch/_inductor/virtualized.py
@@ -57,6 +57,9 @@ def _arg_str(a):
 
 class MockHandler:
     def __getattr__(self, name):
+        if name == "name":
+            return "MockHandler"
+
         def inner(*args, **kwargs):
             fargs = [_arg_str(a) for a in args]
             fargs.extend(f"{k}={v}" for k, v in kwargs.items())
diff --git a/torch/fx/config.py b/torch/fx/config.py
new file mode 100644
index 0000000000000..da5120d6edf18
--- /dev/null
+++ b/torch/fx/config.py
@@ -0,0 +1,6 @@
+# Whether to disable showing progress on compilation passes
+# Need to add a new config otherwise wil get a circular import if dynamo config is imported here
+disable_progress = True
+
+# If True this also shows the node names in each pass, for small models this is great but larger models it's quite noisy
+verbose_progress = False
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index 6428d4c5c3bb5..683a6bd90b501 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -4,10 +4,12 @@
 from .proxy import Proxy
 from ._symbolic_trace import Tracer
 from ._compatibility import compatibility
+from . import config
 import torch.fx.traceback as fx_traceback
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 import inspect
 from contextlib import contextmanager
+from torch.hub import tqdm
 
 __all__ = ['Interpreter', 'Transformer']
 
@@ -72,7 +74,7 @@ def __init__(self, module : GraphModule, garbage_collect_values : bool = True):
         self.module = module
         self.submodules = dict(self.module.named_modules())
         self.env : Dict[Node, Any] = {}
-
+        self.name = "Interpreter"
         self.garbage_collect_values = garbage_collect_values
 
         if self.garbage_collect_values:
@@ -118,7 +120,9 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None, enable_io_p
             args = self.module.graph.process_inputs(*args)
         self.args_iter : Iterator[Any] = iter(args)
 
-        for node in self.module.graph.nodes:
+        for node in tqdm(self.module.graph.nodes,
+                         desc=f"{self.name}: {str(list(self.module.graph.nodes)) if config.verbose_progress else ''}",
+                         initial=1, position=0, leave=True, disable=config.disable_progress, delay=15):
             if node in self.env:
                 # Short circuit if we have this value. This could
                 # be used, for example, for partial evaluation

From 1eefdde4e07fe8b3acd6726f0817ac0cd370e330 Mon Sep 17 00:00:00 2001
From: fduwjj <fduwjj@fb.com>
Date: Wed, 30 Nov 2022 03:21:09 +0000
Subject: [PATCH 1413/1922] [PT-D][Tensor Parallel] Add more test cases when we
 use use_orig_params for FSDP wrapping (#89779)

Differential Revision: [D41600656](https://our.internmc.facebook.com/intern/diff/D41600656)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89779
Approved by: https://github.com/wanchaol
---
 .../_tensor/parallel/test_2d_parallel.py      | 149 ++++++++++--------
 torch/distributed/_tensor/dispatch.py         |   4 +-
 2 files changed, 84 insertions(+), 69 deletions(-)

diff --git a/test/distributed/_tensor/parallel/test_2d_parallel.py b/test/distributed/_tensor/parallel/test_2d_parallel.py
index ea41d5388660e..da6d1f5cfabd2 100644
--- a/test/distributed/_tensor/parallel/test_2d_parallel.py
+++ b/test/distributed/_tensor/parallel/test_2d_parallel.py
@@ -10,10 +10,8 @@
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
 from torch.distributed._tensor import (
-    distribute_tensor,
     DeviceMesh,
     DTensor as DT,
-    Shard,
     Replicate,
 )
 from torch.distributed._tensor.parallel import (
@@ -52,67 +50,26 @@ def forward(self, x):
         return x
 
 
-def _aggregate_local_tensor(module: torch.nn.Module) -> torch.nn.Module:
-    def hook_func(_module, _input, output):
-        if isinstance(output, DT):
-            replica_placement = [Replicate()]
-            return output.redistribute(
-                output.device_mesh, replica_placement
-            ).to_local()
-
-    module.register_forward_hook(hook_func)
-    return module
-
-
-def _replicate_input_tensor(
-    module: torch.nn.Module, device_mesh, replica_placement
-) -> torch.nn.Module:
-    def hook_func(_, input):
-        if not isinstance(input[0], DT):
-            return DT.from_local(
-                input[0], device_mesh, replica_placement, run_check=False
-            )
-
-    module.register_forward_pre_hook(hook_func)
-    return module
-
-
-def shard_module(m, pg):
-    start_idx = distributed_c10d.get_global_rank(pg, 0)
-    device_mesh = DeviceMesh(
-        "cuda", list(range(start_idx, start_idx + pg.size())), dim_groups=[pg]
-    )
-    col_wise_sharding = [Shard(0)]
-    row_wise_sharding = [Shard(1)]
-    replicate = [Replicate()]
-    m.net1.weight = torch.nn.Parameter(
-        distribute_tensor(m.net1.weight, device_mesh, col_wise_sharding),
-    )
-    m.net2.weight = torch.nn.Parameter(
-        distribute_tensor(m.net2.weight, device_mesh, row_wise_sharding)
-    )
-    m.net1.bias = torch.nn.Parameter(
-        distribute_tensor(m.net1.bias, device_mesh, col_wise_sharding)
-    )
-    m.net2.bias = torch.nn.Parameter(
-        distribute_tensor(m.net2.bias, device_mesh, replicate)
-    )
-    m = _replicate_input_tensor(m, device_mesh, replicate)
-    m.net2 = _aggregate_local_tensor(m.net2)
-
-
-def _shard_wrap_module(module, module_shard, fsdp_wrap, mesh_2d, fsdp_pg):
+def _distribute_and_fsdp_wrap_module(
+    module, module_shard, mesh_2d, fsdp_pg, use_orig_params, fsdp_nested
+):
     if module_shard:
-        parallelize_module(module, mesh_2d, PairwiseParallel(), tp_mesh_dim=1)
+        module = parallelize_module(module, mesh_2d, PairwiseParallel(), tp_mesh_dim=1)
+    pg = fsdp_pg if module_shard else distributed_c10d._get_default_group()
 
-    if fsdp_wrap and module_shard:
-        return FSDP(module, process_group=fsdp_pg)
-    if fsdp_wrap:
-        return FSDP(module, process_group=distributed_c10d._get_default_group())
-    return module
+    if fsdp_nested:
+        module.net1 = FSDP(
+            module.net1, process_group=pg, use_orig_params=use_orig_params
+        )
+        module.net2 = FSDP(
+            module.net2, process_group=pg, use_orig_params=use_orig_params
+        )
+    return FSDP(
+        module, process_group=pg, use_orig_params=use_orig_params
+    )
 
 
-def init_model(model_parallel_size=TP_DEGREE):
+def init_model(model_parallel_size=TP_DEGREE, use_orig_params=False, fsdp_nested=False):
     rank = dist.get_rank()
     torch.cuda.set_device(rank)
     world_size = dist.get_world_size()
@@ -128,7 +85,9 @@ def init_model(model_parallel_size=TP_DEGREE):
     fsdp_pg = twod_mesh.get_dim_groups()[0]
 
     # Create Input
-    model = _shard_wrap_module(model, True, True, twod_mesh, fsdp_pg)
+    model = _distribute_and_fsdp_wrap_module(
+        model, True, twod_mesh, fsdp_pg, use_orig_params, fsdp_nested
+    )
     return model, fsdp_pg
 
 
@@ -182,19 +141,50 @@ def test_2d_fsdp_integration_functionality(self) -> None:
             is_nested_tensor(optim_state["state"]["net3.bias"]["exp_avg"])
         )
 
-    @with_comms
-    @skip_if_lt_x_gpu(4)
-    def test_2d_fsdp_integration_correctness(self) -> None:
+    def _compare_params(self, m1, m2):
+        with FSDP.summon_full_params(m1):
+            with FSDP.summon_full_params(m2):
+                for n_p1, n_p2 in zip(m1.named_parameters(), m2.named_parameters()):
+                    p1 = n_p1[1]
+                    p2 = n_p2[1]
+                    self.assertEqual(n_p1[0], n_p2[0])
+                    name = n_p1[0]
+                    if name == "net2.bias" and self.rank != 0:
+                        continue
+                    if type(p2) is DT:
+                        p2 = p2.redistribute(
+                            p2.device_mesh, [Replicate()]
+                        ).to_local()
+                    self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
+
+    def _test_2d_e2e_flow(self, use_orig_params=False, fsdp_nested=False, multi_param_group=False) -> None:
         if not is_available():
             self.skipTest("FSDP 2d parallel integration not available")
         torch.manual_seed(0)
         model = SimpleModel().cuda(self.rank)
-        model = FSDP(model)
+        model = FSDP(model, use_orig_params=use_orig_params)
         torch.manual_seed(0)
-        model_2d, dp_pg = init_model()
-
-        optim = torch.optim.Adam(model.parameters(), lr=0.0001)
-        optim_2d = torch.optim.Adam(model_2d.parameters(), lr=0.0001)
+        model_2d, dp_pg = init_model(use_orig_params=use_orig_params, fsdp_nested=fsdp_nested)
+        # Check named parameters are returning the same name at least.
+        param_names_2d = [name for name, _ in model_2d.named_parameters()]
+        for name, _ in model.named_parameters():
+            self.assertTrue(name in param_names_2d)
+        self._compare_params(model, model_2d)
+
+        if multi_param_group and use_orig_params:
+            param_group = [
+                {"params": model.net1.parameters(), "lr": 0.02},
+                {"params": model.net2.parameters(), "lr": 0.15},
+            ]
+            optim = torch.optim.Adam(param_group, lr=0.01)
+            param_group = [
+                {"params": model_2d.net1.parameters(), "lr": 0.02},
+                {"params": model_2d.net2.parameters(), "lr": 0.15},
+            ]
+            optim_2d = torch.optim.Adam(param_group, lr=0.01)
+        else:
+            optim = torch.optim.Adam(model.parameters(), lr=0.01)
+            optim_2d = torch.optim.Adam(model_2d.parameters(), lr=0.01)
 
         for i in range(5):
             # Ensure all input across TP ranks are same.
@@ -209,6 +199,29 @@ def test_2d_fsdp_integration_correctness(self) -> None:
             optim_2d.step()
             self.assertEqual(model(input), model_2d(input))
 
+        # Ensure all params are still the same after optimizer update.
+        self._compare_params(model, model_2d)
+
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    def test_2d_fsdp_integration_correctness(self) -> None:
+        self._test_2d_e2e_flow()
+
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    def test_2d_fsdp_integration_use_orig_params(self) -> None:
+        self._test_2d_e2e_flow(use_orig_params=True)
+
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    def test_2d_fsdp_integration_fsdp_nested(self) -> None:
+        self._test_2d_e2e_flow(fsdp_nested=True)
+
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    def test_2d_fsdp_integration_fsdp_nested_param_groups(self) -> None:
+        self._test_2d_e2e_flow(fsdp_nested=True, use_orig_params=True, multi_param_group=True)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/_tensor/dispatch.py b/torch/distributed/_tensor/dispatch.py
index 8c9e5a22efb83..38ea056aa91a9 100644
--- a/torch/distributed/_tensor/dispatch.py
+++ b/torch/distributed/_tensor/dispatch.py
@@ -151,7 +151,9 @@ def _reshape_alias(
 
 _CURRENT_DECOMPOSITION_TABLE: Dict[
     Callable[..., object], Callable[..., object]
-] = {torch.ops.aten._reshape_alias.default: _reshape_alias}
+] = {
+    torch.ops.aten._reshape_alias.default: _reshape_alias,
+}
 
 
 def propagate_input_sharding(

From 177738928a36624aae5f38f580b20318f8dee797 Mon Sep 17 00:00:00 2001
From: "Wang, Eikan" <eikan.wang@intel.com>
Date: Tue, 29 Nov 2022 12:38:59 +0000
Subject: [PATCH 1414/1922] Vectorize erf (#89837)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89837
Approved by: https://github.com/jgong5, https://github.com/desertfire, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 34 +++++++++++++++++++++++++++++
 torch/_inductor/codegen/cpp.py      |  4 ++++
 2 files changed, 38 insertions(+)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 209a5bd0a7a33..f7c116149cad7 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -42,6 +42,7 @@
     from functorch.compile import config as functorch_config
     from torch._decomp import get_decompositions
     from torch._inductor import codecache, config, metrics
+    from torch._inductor.codegen.cpp import CppOverrides, CppVecOverrides
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
     from torch._inductor.overrides import (
@@ -4946,6 +4947,39 @@ def fn(value, mask):
                 assert same(real_out, compiled_out, equal_nan=True)
                 assert metrics.generated_cpp_vec_kernel_count >= 1
 
+        def test_cpu_vec_cosim(self):
+            cpp_vec_op_list = []
+            cpp_op_list = []
+
+            for k, v in CppVecOverrides.__dict__.items():
+                if isinstance(v, staticmethod):
+                    cpp_vec_op_list.append(k)
+            for k, v in CppOverrides.__dict__.items():
+                if isinstance(v, staticmethod):
+                    cpp_op_list.append(k)
+
+            self.assertEqual(cpp_op_list.sort(), cpp_vec_op_list.sort())
+
+        @unittest.skipIf(
+            not codecache.valid_vec_isa_list(), "Does not support vectorization"
+        )
+        @patch("torch.cuda.is_available", lambda: False)
+        def test_erf_cpu_only(self):
+            def fn(x):
+                return (torch.erf(x),)
+
+            x = torch.randn((2, 9))
+            x[0, 0] = torch.nan
+            x[1, -1] = torch.nan
+
+            with patch.object(config.cpp, "simdlen", None):
+                torch._dynamo.reset()
+                metrics.reset()
+                traced = make_fx(fn)(x)
+                compiled = compile_fx_inner(traced, [x])
+                assert same(fn(x)[0], compiled([x])[0], equal_nan=True)
+                assert metrics.generated_cpp_vec_kernel_count == 1
+
         @unittest.skipIf(
             not codecache.valid_vec_isa_list(), "Does not support vectorization"
         )
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 309b8cf9c8e7a..9fd4f769bcb07 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -208,6 +208,10 @@ def cos(x):
     def exp(x):
         return f"{x}.exp()"
 
+    @staticmethod
+    def erf(x):
+        return f"{x}.erf()"
+
     @staticmethod
     def sqrt(x):
         return f"{x}.sqrt()"

From 13853ae513aede61b31f554b55602d48241c9929 Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Tue, 29 Nov 2022 14:01:36 +0800
Subject: [PATCH 1415/1922] fix RowwiseMoments vectorization issue on CPU
 (#84404)

Originally `cpu/moments_utils.h` uses namespace of at::native::utils,
this file contains `Vectorized<>`, in order to make it properly vectorized
on different archs, need to use anonymous namespace or inline namespace.
Otherwise it would be linked to scalar version of the code.

This PR is to fix vectorization issue from `RowwiseMoments` which is used to calculate `mean` and `rstd` in norm layers.
Attach benchmark data, generally fp32 will get 2-3x speedup and bf16 has larger speedup.

This patch will improves layer_norm (input size 32x128x1024) float32 inference:
* avx512 single socket: 2.1x
```bash
before: LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 0.439 ms; bf16: 2.479 ms
after:  LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 0.210 ms; bf16: 0.770 ms
```
* avx512 single core: 3.2x
```bash
before: LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 6.308 ms; bf16: 39.765 ms
after:  LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 2.661 ms; bf16: 12.267 ms
```
* avx2 single socket: 2.3x
```bash
before: LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 1.248 ms; bf16: 8.487 ms
after:  LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 0.540 ms; bf16: 2.030 ms
```
* avx2 single core: 2.5x
```bash
before: LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 10.792 ms; bf16: 66.366 ms
after:  LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 4.349 ms; bf16: 19.252 ms
```

Attached some original VTune profiling results here to further indicate the issue:

1. original bottlenecks
![master_bottleneck](https://user-images.githubusercontent.com/20233731/180125611-deed41b7-dd2e-4437-a7d9-6ad0096e5850.png)

we can see `RowwiseMomentsImpl<>` takes majority of the runtime here.

2. Instruction level breakdown of `RowwiseMomentsImpl<>`
![rowwise_momentum_impl](https://user-images.githubusercontent.com/20233731/180125759-a3b48bc4-8e54-4219-92b4-defde5e86046.png)

we can see it's all **scalar** instructions here.

3. after the fix, the bottlenecks
![fixed_bottleneck](https://user-images.githubusercontent.com/20233731/180125880-8d08eb1b-af09-4f80-ae58-80215365d407.png)

getting better.

4. after the fix, Instruction level breakdown of `RowwiseMomentsImpl<>`
![fixed_rowwsie_momentum_impl](https://user-images.githubusercontent.com/20233731/180125989-b45db4ad-e6ed-460a-8d51-74fbeecf8b02.png)

now it is all **vectorized** instructions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84404
Approved by: https://github.com/jgong5
---
 aten/src/ATen/native/cpu/group_norm_kernel.cpp | 2 +-
 aten/src/ATen/native/cpu/layer_norm_kernel.cpp | 2 +-
 aten/src/ATen/native/cpu/moments_utils.h       | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp
index ff84f9b60784e..a8dc08aa6dab8 100644
--- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp
@@ -57,7 +57,7 @@ void GroupNormKernelImplInternal(
       const T* X_ptr = X_data + i * inner_size;
       T mean_val;
       T rstd_val;
-      std::tie(mean_val, rstd_val) = utils::RowwiseMoments(X_ptr, inner_size);
+      std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, inner_size);
       rstd_val = T(1) / std::sqrt(std::max(rstd_val, T(0)) + eps);
       if (gamma_null && beta_null) {
         T* Y_ptr = Y_data + i * inner_size;
diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
index f7104875b8247..22cd0b69559e7 100644
--- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
@@ -55,7 +55,7 @@ void LayerNormKernelImplInternal(
       T* Y_ptr = Y_data + i * N;
       T mean_val;
       T rstd_val;
-      std::tie(mean_val, rstd_val) = utils::RowwiseMoments(X_ptr, N);
+      std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, N);
       rstd_val = T(1) / std::sqrt(rstd_val + eps);
       const T_ACC scale = rstd_val;
       const T_ACC bias = -rstd_val * mean_val;
diff --git a/aten/src/ATen/native/cpu/moments_utils.h b/aten/src/ATen/native/cpu/moments_utils.h
index 18e6899619046..58e0e48682924 100644
--- a/aten/src/ATen/native/cpu/moments_utils.h
+++ b/aten/src/ATen/native/cpu/moments_utils.h
@@ -14,7 +14,7 @@
 
 namespace at {
 namespace native {
-namespace utils {
+inline namespace CPU_CAPABILITY {
 
 constexpr int64_t kChunkSize = 16;
 
@@ -63,7 +63,7 @@ std::pair<T, T> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
   constexpr int64_t kVecSize = Vec::size();
   const int64_t n = N / kVecSize;
   const int64_t m = divup(n, kChunkSize);
-  const int64_t depth = CeilLog2(m);
+  const int64_t depth = utils::CeilLog2(m);
 
   const Vec kZeroVec(T(0));
   c10::SmallVector<int64_t, kMaxDepth> m0_stk(depth, 0);
@@ -136,7 +136,7 @@ std::pair<T, T> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
   constexpr int64_t kVecSize = Vec::size();
   const int64_t n = N / kVecSize;
   const int64_t m = divup(n, kChunkSize);
-  const int64_t depth = CeilLog2(m);
+  const int64_t depth = utils::CeilLog2(m);
   if (depth <= 4) {
     return RowwiseMomentsImpl<T, 4>(X, N, ddof);
   } else if (depth <= 8) {
@@ -150,6 +150,6 @@ std::pair<T, T> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
   }
 }
 
-} // namespace utils
+} // namespace CPU_CAPABILITY
 } // namespace native
 } // namespace at

From e1c86cc25bbde9d6203a03996d2ea695c57bcefd Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Wed, 30 Nov 2022 08:19:38 +0000
Subject: [PATCH 1416/1922] [PT-D][Checkpoint]Add MultiThreaded
 FileSystemWriter for distributed checkpointing and Update tests  (#87987)

This PR includes:

Changes from @kumpera (https://github.com/pytorch/pytorch/pull/86327): adding MultiThreaded FileSystemWriter for distributed checkpointing, which adds two knobs to FileSystemWriter: thread_count and per_thread_copy_ahead. This increases up to 50% performance improvement on 32 GPUS workloads on AWS.
Add parametrize tests to /test/distributed/_shard/checkpoint/test_file_system_checkpoint.py and /test/distributed/_shard/checkpoint/test_file_system_checkpoint_cpu.py
Modify @with_comms in ShardedTensorTestBase to take in *args and **kwargs.
Tests:

```
python3 test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
```

test/distributed/checkpoint/test_file_system_checkpoint.py(GPU tests) runs fine locally but would timeout on CI. We will use thread-based PG and update this test in following PR.

[T134844615]

## Add docstring and update comments in the following PRs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87987
Approved by: https://github.com/fduwjj
---
 .../checkpoint/test_file_system_checkpoint.py | 119 +++++---
 .../test_file_system_checkpoint_cpu.py        | 143 ++++++---
 torch/distributed/checkpoint/filesystem.py    | 289 +++++++++++++++---
 torch/testing/_internal/common_distributed.py |   3 +-
 .../_shard/sharded_tensor/__init__.py         |   2 +-
 5 files changed, 430 insertions(+), 126 deletions(-)

diff --git a/test/distributed/checkpoint/test_file_system_checkpoint.py b/test/distributed/checkpoint/test_file_system_checkpoint.py
index 7ef4e72e4fe0e..016467144e8ff 100644
--- a/test/distributed/checkpoint/test_file_system_checkpoint.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint.py
@@ -8,21 +8,27 @@
 import torch
 import torch.distributed as dist
 from torch.distributed._shard import sharded_tensor
-from torch.distributed._shard.sharded_tensor import ShardedTensor, state_dict_hook
+from torch.distributed._shard.sharded_tensor import (
+    ShardedTensor,
+    state_dict_hook,
+)
 from torch.distributed._shard.sharding_spec import (
     ChunkShardingSpec,
     EnumerableShardingSpec,
     ShardingSpec,
     ShardMetadata,
 )
-from torch.testing._internal.common_distributed import requires_nccl, skip_if_lt_x_gpu
+from torch.testing._internal.common_distributed import (
+    requires_nccl,
+    skip_if_lt_x_gpu,
+)
 from torch.testing._internal.common_utils import TestCase
 from torch.testing._internal.distributed._shard.sharded_tensor import (
     ShardedTensorTestBase,
     with_comms,
 )
 from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import (
-    MyShardedModel1
+    MyShardedModel1,
 )
 
 
@@ -73,7 +79,8 @@ def assert_state_dict_equal(
                 )
         elif isinstance(value_1, torch.Tensor):
             self.assertTrue(
-                torch.equal(value_1, value_2), f"Key {key}'s tensor does not match"
+                torch.equal(value_1, value_2),
+                f"Key {key}'s tensor does not match",
             )
 
     return True
@@ -105,35 +112,59 @@ def test_read_write_only_tensor(self) -> None:
             state_dict_to_save = MyTestModule().state_dict()
 
             fs_writer = FileSystemWriter(path=path)
-            save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer, no_dist=True)
+            save_state_dict(
+                state_dict=state_dict_to_save,
+                storage_writer=fs_writer,
+                no_dist=True,
+            )
 
             state_dict_to_load_to = MyTestModule().state_dict()
 
             with self.assertRaises(AssertionError):
-                assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
+                assert_state_dict_equal(
+                    self, state_dict_to_load_to, state_dict_to_save
+                )
 
             # Load from file without any resharding
             fs_reader = FileSystemReader(path=path)
-            load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader, no_dist=True)
+            load_state_dict(
+                state_dict=state_dict_to_load_to,
+                storage_reader=fs_reader,
+                no_dist=True,
+            )
 
-            assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
+            assert_state_dict_equal(
+                self, state_dict_to_load_to, state_dict_to_save
+            )
 
         with tempfile.TemporaryDirectory() as path:
             state_dict_to_save = MyTestModule().state_dict()
 
             fs_writer = FileSystemWriter(path=path, single_file_per_rank=True)
-            save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer, no_dist=True)
+            save_state_dict(
+                state_dict=state_dict_to_save,
+                storage_writer=fs_writer,
+                no_dist=True,
+            )
 
             state_dict_to_load_to = MyTestModule().state_dict()
 
             with self.assertRaises(AssertionError):
-                assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
+                assert_state_dict_equal(
+                    self, state_dict_to_load_to, state_dict_to_save
+                )
 
             # Load from file without any resharding
             fs_reader = FileSystemReader(path=path)
-            load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader, no_dist=True)
+            load_state_dict(
+                state_dict=state_dict_to_load_to,
+                storage_reader=fs_reader,
+                no_dist=True,
+            )
 
-            assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
+            assert_state_dict_equal(
+                self, state_dict_to_load_to, state_dict_to_save
+            )
 
 
 class TestDistributedStateDictSaveLoadWithSharedTensor(ShardedTensorTestBase):
@@ -180,11 +211,15 @@ def test_read_write_shard_tensor(self) -> None:
         dist.barrier()
 
         with self.assertRaises(AssertionError):
-            assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
+            assert_state_dict_equal(
+                self, state_dict_to_load_to, state_dict_to_save
+            )
 
         # Test load.
         fs_reader = FileSystemReader(path=path)
-        load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader)
+        load_state_dict(
+            state_dict=state_dict_to_load_to, storage_reader=fs_reader
+        )
 
         assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
         dist.barrier()
@@ -201,7 +236,11 @@ def get_file_path(self) -> str:
         return paths[0]
 
     def load_tensor(self, tensor: ShardedTensor) -> torch.Tensor:
-        res = torch.zeros(tensor.shape, device="cuda:0") if dist.get_rank() == 0 else None
+        res = (
+            torch.zeros(tensor.shape, device="cuda:0")
+            if dist.get_rank() == 0
+            else None
+        )
         tensor.gather(out=res)
         return res
 
@@ -295,7 +334,9 @@ def test_load_with_different_shard_plan(self) -> None:
                 state_dict_to_save = model_to_save.state_dict()
 
                 fs_writer = FileSystemWriter(path=path)
-                save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer)
+                save_state_dict(
+                    state_dict=state_dict_to_save, storage_writer=fs_writer
+                )
 
                 dist.barrier()
 
@@ -316,7 +357,8 @@ def test_load_with_different_shard_plan(self) -> None:
 
                 if dist.get_rank() == 0:
                     self.assertTrue(
-                        torch.allclose(store_tensor, load_tensor), msg=f"{s0} vs {s1}"
+                        torch.allclose(store_tensor, load_tensor),
+                        msg=f"{s0} vs {s1}",
                     )
 
     @with_comms(init_rpc=False)
@@ -361,7 +403,9 @@ def test_load_rowwise_to_colwise(self) -> None:
 
         fs_reader = FileSystemReader(path=path)
 
-        load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader)
+        load_state_dict(
+            state_dict=state_dict_to_load_to, storage_reader=fs_reader
+        )
 
         # We can't use torch.allclose since each ST has a different sharding spec
         store_tensor = self.load_tensor(model_to_save.sharded_tensor)
@@ -370,32 +414,24 @@ def test_load_rowwise_to_colwise(self) -> None:
         if dist.get_rank() == 0:
             self.assertTrue(torch.allclose(store_tensor, load_tensor))
 
-
     @with_comms(init_rpc=False)
     @skip_if_lt_x_gpu(2)
     @requires_nccl()
     def test_save_load_bytes(self) -> None:
         path = self.get_file_path()
 
-        state_dict_to_save = {
-            'bytes0': [1],
-            'bytes1': 'string'
-        }
+        state_dict_to_save = {"bytes0": [1], "bytes1": "string"}
 
         fs_writer = FileSystemWriter(path=path)
         save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer)
 
-        state_dict_to_load = {
-            'bytes0': [2],
-            'bytes1': 'other'
-        }
+        state_dict_to_load = {"bytes0": [2], "bytes1": "other"}
 
         fs_reader = FileSystemReader(path=path)
         load_state_dict(state_dict=state_dict_to_load, storage_reader=fs_reader)
 
-        self.assertEqual([1], state_dict_to_load['bytes0'])
-        self.assertEqual('string', state_dict_to_load['bytes1'])
-
+        self.assertEqual([1], state_dict_to_load["bytes0"])
+        self.assertEqual("string", state_dict_to_load["bytes1"])
 
     @with_comms(init_rpc=False)
     @skip_if_lt_x_gpu(2)
@@ -454,8 +490,8 @@ def test_switch_between_sharded_tensor_to_tensor(self) -> None:
         for save_spec in specs:
             for load_spec in specs:
                 save_dict = {
-                    'sharded': sharded_tensor.rand(save_spec, tensor_size),
-                    'replicated': torch.rand(tensor_size, device=self.rank)
+                    "sharded": sharded_tensor.rand(save_spec, tensor_size),
+                    "replicated": torch.rand(tensor_size, device=self.rank),
                 }
 
                 fs_writer = FileSystemWriter(path=path)
@@ -463,25 +499,28 @@ def test_switch_between_sharded_tensor_to_tensor(self) -> None:
 
                 # Freaky Friday the tensors
                 load_dict = {
-                    'sharded': torch.zeros(tensor_size, device=self.rank),
-                    'replicated': sharded_tensor.zeros(load_spec, tensor_size)
+                    "sharded": torch.zeros(tensor_size, device=self.rank),
+                    "replicated": sharded_tensor.zeros(load_spec, tensor_size),
                 }
 
                 fs_reader = FileSystemReader(path=path)
                 load_state_dict(state_dict=load_dict, storage_reader=fs_reader)
 
-                save_dict_sharded = self.load_tensor(save_dict['sharded'])
-                load_dict_replicated = self.load_tensor(load_dict['replicated'])
+                save_dict_sharded = self.load_tensor(save_dict["sharded"])
+                load_dict_replicated = self.load_tensor(load_dict["replicated"])
 
                 if dist.get_rank() == 0:
                     self.assertTrue(
-                        torch.allclose(save_dict_sharded, load_dict['sharded']),
-                        f"save-spec {save_spec} load-spec {load_spec}"
+                        torch.allclose(save_dict_sharded, load_dict["sharded"]),
+                        f"save-spec {save_spec} load-spec {load_spec}",
                     )
                     self.assertTrue(
-                        torch.allclose(save_dict['replicated'], load_dict_replicated),
-                        f"save-spec {save_spec} load-spec {load_spec}"
+                        torch.allclose(
+                            save_dict["replicated"], load_dict_replicated
+                        ),
+                        f"save-spec {save_spec} load-spec {load_spec}",
                     )
 
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
index 2ff2d9d127919..52e414545c049 100644
--- a/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
@@ -9,7 +9,10 @@
 import torch
 import torch.distributed as dist
 from torch.distributed._shard import sharded_tensor
-from torch.distributed._shard.sharded_tensor import ShardedTensor, state_dict_hook
+from torch.distributed._shard.sharded_tensor import (
+    ShardedTensor,
+    state_dict_hook,
+)
 from torch.distributed._shard.sharding_spec import (
     ChunkShardingSpec,
     EnumerableShardingSpec,
@@ -22,11 +25,13 @@
     with_comms,
 )
 from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import (
-    MyShardedModel1
+    MyShardedModel1,
 )
 
 
 from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    parametrize,
     TEST_WITH_DEV_DBG_ASAN,
     run_tests,
 )
@@ -47,6 +52,9 @@
     sys.exit(0)
 
 
+_THREAD_COUNTS = {1, 2}
+
+
 def assert_state_dict_equal(
     self: TestCase,
     state_dict_1: Dict[str, torch.Tensor],
@@ -73,7 +81,8 @@ def assert_state_dict_equal(
                 )
         elif isinstance(value_1, torch.Tensor):
             self.assertTrue(
-                torch.equal(value_1, value_2), f"Key {key}'s tensor does not match"
+                torch.equal(value_1, value_2),
+                f"Key {key}'s tensor does not match",
             )
 
     return True
@@ -100,23 +109,36 @@ def __init__(
 
 
 class TestDistributedStateDictSaveLoad(TestCase):
-    def test_read_write_only_tensor(self) -> None:
+    @parametrize("thread_count", _THREAD_COUNTS)
+    def test_read_write_only_tensor(self, thread_count) -> None:
         with tempfile.TemporaryDirectory() as path:
             state_dict_to_save = MyTestModule().state_dict()
 
-            fs_writer = FileSystemWriter(path=path)
-            save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer, no_dist=True)
+            fs_writer = FileSystemWriter(path=path, thread_count=thread_count)
+            save_state_dict(
+                state_dict=state_dict_to_save,
+                storage_writer=fs_writer,
+                no_dist=True,
+            )
 
             state_dict_to_load_to = MyTestModule().state_dict()
 
             with self.assertRaises(AssertionError):
-                assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
+                assert_state_dict_equal(
+                    self, state_dict_to_load_to, state_dict_to_save
+                )
 
             # Load from file without any resharding
             fs_reader = FileSystemReader(path=path)
-            load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader, no_dist=True)
+            load_state_dict(
+                state_dict=state_dict_to_load_to,
+                storage_reader=fs_reader,
+                no_dist=True,
+            )
 
-            assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
+            assert_state_dict_equal(
+                self, state_dict_to_load_to, state_dict_to_save
+            )
 
 
 class TestDistributedStateDictSaveLoadWithSharedTensor(ShardedTensorTestBase):
@@ -125,7 +147,8 @@ def world_size(self) -> int:
         return 2
 
     @with_comms(init_rpc=False, backend="gloo")
-    def test_read_write_shard_tensor(self) -> None:
+    @parametrize("thread_count", _THREAD_COUNTS)
+    def test_read_write_shard_tensor(self, thread_count) -> None:
         paths = [tempfile.mkdtemp()]
         dist.broadcast_object_list(paths)
 
@@ -146,7 +169,7 @@ def test_read_write_shard_tensor(self) -> None:
         model_to_save._register_state_dict_hook(state_dict_hook)
         state_dict_to_save = model_to_save.state_dict()
 
-        fs_writer = FileSystemWriter(path=path)
+        fs_writer = FileSystemWriter(path=path, thread_count=thread_count)
         save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer)
 
         dist.barrier()
@@ -161,11 +184,15 @@ def test_read_write_shard_tensor(self) -> None:
         dist.barrier()
 
         with self.assertRaises(AssertionError):
-            assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
+            assert_state_dict_equal(
+                self, state_dict_to_load_to, state_dict_to_save
+            )
 
         # Test load.
         fs_reader = FileSystemReader(path=path)
-        load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader)
+        load_state_dict(
+            state_dict=state_dict_to_load_to, storage_reader=fs_reader
+        )
 
         assert_state_dict_equal(self, state_dict_to_load_to, state_dict_to_save)
         dist.barrier()
@@ -182,12 +209,17 @@ def get_file_path(self) -> str:
         return paths[0]
 
     def load_tensor(self, tensor: ShardedTensor) -> torch.Tensor:
-        res = torch.zeros(tensor.shape, device="cpu") if dist.get_rank() == 0 else None
+        res = (
+            torch.zeros(tensor.shape, device="cpu")
+            if dist.get_rank() == 0
+            else None
+        )
         tensor.gather(out=res)
         return res
 
     @with_comms(init_rpc=False, backend="gloo")
-    def test_load_with_different_shard_plan(self) -> None:
+    @parametrize("thread_count", _THREAD_COUNTS)
+    def test_load_with_different_shard_plan(self, thread_count) -> None:
         path = self.get_file_path()
 
         # We hardcode the assumption of how many shards are around
@@ -273,8 +305,12 @@ def test_load_with_different_shard_plan(self) -> None:
                 model_to_save._register_state_dict_hook(state_dict_hook)
                 state_dict_to_save = model_to_save.state_dict()
 
-                fs_writer = FileSystemWriter(path=path)
-                save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer)
+                fs_writer = FileSystemWriter(
+                    path=path, thread_count=thread_count
+                )
+                save_state_dict(
+                    state_dict=state_dict_to_save, storage_writer=fs_writer
+                )
 
                 dist.barrier()
 
@@ -295,11 +331,13 @@ def test_load_with_different_shard_plan(self) -> None:
 
                 if dist.get_rank() == 0:
                     self.assertTrue(
-                        torch.allclose(store_tensor, load_tensor), msg=f"{s0} vs {s1}"
+                        torch.allclose(store_tensor, load_tensor),
+                        msg=f"{s0} vs {s1}",
                     )
 
     @with_comms(init_rpc=False, backend="gloo")
-    def test_load_rowwise_to_colwise(self) -> None:
+    @parametrize("thread_count", _THREAD_COUNTS)
+    def test_load_rowwise_to_colwise(self, thread_count) -> None:
         path = self.get_file_path()
         self.assertEqual(self.world_size, dist.get_world_size())
 
@@ -329,7 +367,7 @@ def test_load_rowwise_to_colwise(self) -> None:
         model_to_save._register_state_dict_hook(state_dict_hook)
         state_dict_to_save = model_to_save.state_dict()
 
-        fs_writer = FileSystemWriter(path=path)
+        fs_writer = FileSystemWriter(path=path, thread_count=thread_count)
         save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer)
 
         model_to_load = MyShardedModel3(dst_spec).cuda(dist.get_rank())
@@ -338,7 +376,9 @@ def test_load_rowwise_to_colwise(self) -> None:
 
         fs_reader = FileSystemReader(path=path)
 
-        load_state_dict(state_dict=state_dict_to_load_to, storage_reader=fs_reader)
+        load_state_dict(
+            state_dict=state_dict_to_load_to, storage_reader=fs_reader
+        )
 
         # We can't use torch.allclose since each ST has a different sharding spec
         store_tensor = self.load_tensor(model_to_save.sharded_tensor)
@@ -347,33 +387,29 @@ def test_load_rowwise_to_colwise(self) -> None:
         if dist.get_rank() == 0:
             self.assertTrue(torch.allclose(store_tensor, load_tensor))
 
-
     @with_comms(init_rpc=False, backend="gloo")
-    def test_save_load_bytes(self) -> None:
+    @parametrize("thread_count", _THREAD_COUNTS)
+    def test_save_load_bytes(self, thread_count) -> None:
         path = self.get_file_path()
 
-        state_dict_to_save = {
-            'bytes0': [1],
-            'bytes1': 'string'
-        }
+        state_dict_to_save = {"bytes0": [1], "bytes1": "string"}
 
-        fs_writer = FileSystemWriter(path=path)
+        fs_writer = FileSystemWriter(path=path, thread_count=thread_count)
         save_state_dict(state_dict=state_dict_to_save, storage_writer=fs_writer)
 
-        state_dict_to_load = {
-            'bytes0': [2],
-            'bytes1': 'other'
-        }
+        state_dict_to_load = {"bytes0": [2], "bytes1": "other"}
 
         fs_reader = FileSystemReader(path=path)
         load_state_dict(state_dict=state_dict_to_load, storage_reader=fs_reader)
 
-        self.assertEqual([1], state_dict_to_load['bytes0'])
-        self.assertEqual('string', state_dict_to_load['bytes1'])
-
+        self.assertEqual([1], state_dict_to_load["bytes0"])
+        self.assertEqual("string", state_dict_to_load["bytes1"])
 
     @with_comms(init_rpc=False, backend="gloo")
-    def test_switch_between_sharded_tensor_to_tensor(self) -> None:
+    @parametrize("thread_count", _THREAD_COUNTS)
+    def test_switch_between_sharded_tensor_to_tensor(
+        self, thread_count
+    ) -> None:
         path = self.get_file_path()
         tensor_size = 32
 
@@ -427,34 +463,47 @@ def test_switch_between_sharded_tensor_to_tensor(self) -> None:
         for save_spec in specs:
             for load_spec in specs:
                 save_dict = {
-                    'sharded': sharded_tensor.rand(save_spec, tensor_size),
-                    'replicated': torch.rand(tensor_size, device=f"cpu:{self.rank}")
+                    "sharded": sharded_tensor.rand(save_spec, tensor_size),
+                    "replicated": torch.rand(
+                        tensor_size, device=f"cpu:{self.rank}"
+                    ),
                 }
 
-                fs_writer = FileSystemWriter(path=path)
+                fs_writer = FileSystemWriter(
+                    path=path, thread_count=thread_count
+                )
                 save_state_dict(state_dict=save_dict, storage_writer=fs_writer)
 
                 # Freaky Friday the tensors
                 load_dict = {
-                    'sharded': torch.zeros(tensor_size, device=f"cpu:{self.rank}"),
-                    'replicated': sharded_tensor.zeros(load_spec, tensor_size)
+                    "sharded": torch.zeros(
+                        tensor_size, device=f"cpu:{self.rank}"
+                    ),
+                    "replicated": sharded_tensor.zeros(load_spec, tensor_size),
                 }
 
                 fs_reader = FileSystemReader(path=path)
                 load_state_dict(state_dict=load_dict, storage_reader=fs_reader)
 
-                save_dict_sharded = self.load_tensor(save_dict['sharded'])
-                load_dict_replicated = self.load_tensor(load_dict['replicated'])
+                save_dict_sharded = self.load_tensor(save_dict["sharded"])
+                load_dict_replicated = self.load_tensor(load_dict["replicated"])
 
                 if dist.get_rank() == 0:
                     self.assertTrue(
-                        torch.allclose(save_dict_sharded, load_dict['sharded']),
-                        f"save-spec {save_spec} load-spec {load_spec}"
+                        torch.allclose(save_dict_sharded, load_dict["sharded"]),
+                        f"save-spec {save_spec} load-spec {load_spec}",
                     )
                     self.assertTrue(
-                        torch.allclose(save_dict['replicated'], load_dict_replicated),
-                        f"save-spec {save_spec} load-spec {load_spec}"
+                        torch.allclose(
+                            save_dict["replicated"], load_dict_replicated
+                        ),
+                        f"save-spec {save_spec} load-spec {load_spec}",
                     )
 
+
+instantiate_parametrized_tests(TestDistributedStateDictSaveLoad)
+instantiate_parametrized_tests(TestDistributedStateDictSaveLoadWithSharedTensor)
+instantiate_parametrized_tests(TestDistributedReshardOnLoad)
+
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/checkpoint/filesystem.py b/torch/distributed/checkpoint/filesystem.py
index cfac79c17fe38..253d33ff869b4 100644
--- a/torch/distributed/checkpoint/filesystem.py
+++ b/torch/distributed/checkpoint/filesystem.py
@@ -1,3 +1,8 @@
+from abc import ABC, abstractmethod
+import queue
+import threading
+import collections
+
 from dataclasses import dataclass
 import os
 import dataclasses
@@ -33,7 +38,6 @@
 
 from torch.distributed._shard._utils import narrow_tensor_by_index
 
-
 __all__ = [
     "FileSystemWriter",
     "SlicedBufferedReader",
@@ -75,6 +79,160 @@ def _result_from_write_item(
     )
 
 
+class _TensorLoader(ABC):
+    @abstractmethod
+    def add(self, size, obj):
+        pass
+
+    def start_loading(self):
+        pass
+
+    @abstractmethod
+    def values(self):
+        pass
+
+
+class _SerialCpuLoader(_TensorLoader):
+    def __init__(self, resolve_fun):
+        self.resolve_fun = resolve_fun
+        self.items = []
+
+    def add(self, size, obj):
+        self.items.append((size, obj))
+
+    def start_loading(self):
+        pass
+
+    def values(self):
+        for _, obj in self.items:
+            tensor = self.resolve_fun(obj).detach()
+            tensor = tensor.cpu()
+            if tensor.storage().size() != tensor.numel():
+                tensor = tensor.clone()
+            yield (
+                tensor,
+                obj,
+            )
+
+
+class _OverlappingCpuLoader(_TensorLoader):
+    def __init__(self, resolve_fun, stream=None, inflight_threshhold=1_000_000):
+        self.resolve_fun = resolve_fun
+        self.items = []
+        self.inflight_threshhold = inflight_threshhold
+        self.in_flight_data = 0
+        self.current_items: collections.deque = collections.deque()
+        self.idx = 0
+        self.started = False
+        self.stream = stream or torch.cuda.current_stream()
+        if self.stream != torch.cuda.current_stream():
+            self.stream.wait_stream(torch.cuda.current_stream())
+
+    @property
+    def _done(self):
+        return self.idx >= len(self.items)
+
+    def _drain(self):
+        drained = []
+        if self.in_flight_data >= self.inflight_threshhold:
+            self.stream.synchronize()
+        while self.in_flight_data >= self.inflight_threshhold:
+            val = self.current_items.popleft()
+            self.in_flight_data -= val[0].numel() * val[0].element_size()
+            drained.append(val)
+        return drained
+
+    def _refill(self):
+        with torch.cuda.stream(self.stream):
+            while (
+                not self._done
+                and self.in_flight_data < self.inflight_threshhold
+            ):
+                _, obj = self.items[self.idx]
+                self.idx += 1
+                tensor = self.resolve_fun(obj).detach()
+                if tensor.is_cuda:
+                    tensor = tensor.to(device="cpu", non_blocking=True)
+                elif tensor.device == torch.device("cpu"):
+                    if tensor.storage().size() != tensor.numel():
+                        # this forces the tensor to be both contiguous and with minimal storage
+                        tensor = tensor.clone()
+
+                self.current_items.append(
+                    (
+                        tensor,
+                        obj,
+                    )
+                )
+                self.in_flight_data += tensor.numel() * tensor.element_size()
+
+    def _finish(self):
+        assert self._done
+        if len(self.current_items) > 0:
+            self.stream.synchronize()
+        return self.current_items
+
+    def add(self, size, obj):
+        if self.started:
+            raise RuntimeError("cannot add items after loading started")
+        self.items.append((size, obj))
+
+    def start_loading(self):
+        if self.started:
+            return
+        self.started = True
+        self.items.sort(key=lambda x: x[0])
+        self._refill()
+
+    def values(self):
+        self.start_loading()
+        while not self._done:
+            drained = self._drain()
+            self._refill()
+            for obj in drained:
+                yield obj
+
+        for val in self._finish():
+            yield val
+
+
+def _item_size(item: WriteItem) -> int:
+    size = 1
+    assert item.tensor_data is not None
+    # can't use math.prod as PT needs to support older python
+    for s in item.tensor_data.size:
+        size *= s
+
+    dtype = item.tensor_data.properties.dtype
+    return size * torch._utils._element_size(dtype)
+
+
+def _split_by_size_and_type(
+    bins, items: List[WriteItem]
+) -> List[List[WriteItem]]:
+    if bins == 1:
+        return [items]
+
+    bytes_w = [wi for wi in items if wi.type == WriteItemType.BYTE_IO]
+    tensor_w = [wi for wi in items if wi.type != WriteItemType.BYTE_IO]
+
+    buckets: List[List[WriteItem]] = [[] for _ in range(bins)]
+    bucket_sizes = [0 for _ in range(bins)]
+
+    tensor_w.sort(key=_item_size, reverse=True)
+
+    for i, wi in enumerate(bytes_w):
+        buckets[i % bins].append(wi)
+
+    for wi in tensor_w:
+        # TODO replace with headq
+        idx = min(enumerate(bucket_sizes), key=lambda x: x[1])[0]
+        buckets[idx].append(wi)
+        bucket_sizes[idx] += _item_size(wi)
+
+    return buckets
+
+
 def _write_item(stream, data, write_item, storage_key):
     offset = stream.tell()
 
@@ -93,38 +251,57 @@ def _write_item(stream, data, write_item, storage_key):
 
 
 def _write_files_from_queue(
-    file_queue: List,
+    file_queue: queue.Queue,
+    result_queue: queue.Queue,
     planner: SavePlanner,
+    inflight_threshhold: int,
     use_fsync: bool,
 ):
-    write_results = []
-
-    for file_path, file_name, write_items in file_queue:
-        tensor_w = [
-            wi for wi in write_items if wi.type != WriteItemType.BYTE_IO
-        ]
-        bytes_w = [wi for wi in write_items if wi.type == WriteItemType.BYTE_IO]
-
-        with open(file_path, "wb") as stream:
-            for write_item in bytes_w:
-                data = planner.resolve_data(write_item)
-                write_results.append(
-                    _write_item(stream, data, write_item, file_name)
+    try:
+        while True:
+            file_name, storage_key, write_items = file_queue.get_nowait()
+            loader: _TensorLoader
+
+            if torch.cuda.is_available() and inflight_threshhold > 0:
+                loader = _OverlappingCpuLoader(
+                    lambda x: planner.resolve_data(x),
+                    inflight_threshhold=inflight_threshhold,
                 )
-
-            for write_item in tensor_w:
-                tensor = _trim(
-                    cast(torch.Tensor, planner.resolve_data(write_item))
-                )
-                assert not tensor.is_cuda
-                write_results.append(
-                    _write_item(stream, tensor, write_item, file_name)
+            else:
+                loader = _SerialCpuLoader(
+                    lambda x: planner.resolve_data(x),
                 )
 
-            if use_fsync:
-                os.fsync(stream.fileno())
-
-    return write_results
+            tensor_w = [
+                wi for wi in write_items if wi.type != WriteItemType.BYTE_IO
+            ]
+            for write_item in tensor_w:
+                loader.add(_item_size(write_item), write_item)
+            loader.start_loading()
+
+            bytes_w = [
+                wi for wi in write_items if wi.type == WriteItemType.BYTE_IO
+            ]
+            write_results = []
+
+            with open(file_name, "wb") as stream:
+                for write_item in bytes_w:
+                    data = planner.resolve_data(write_item)
+                    write_results.append(
+                        _write_item(stream, data, write_item, storage_key)
+                    )
+
+                for tensor, write_item in loader.values():
+                    assert not tensor.is_cuda
+                    write_results.append(
+                        _write_item(stream, tensor, write_item, storage_key)
+                    )
+
+                if use_fsync:
+                    os.fsync(stream.fileno())
+            result_queue.put(write_results)
+    except queue.Empty:
+        pass
 
 
 class FileSystemWriter(StorageWriter):
@@ -146,6 +323,8 @@ def __init__(
         path: Union[str, os.PathLike],
         single_file_per_rank: bool = False,
         sync_files: bool = True,
+        thread_count: int = 1,
+        per_thread_copy_ahead: int = 10_000_000,
     ) -> None:
         """
         Initialize the writer pointing to `path`
@@ -153,7 +332,9 @@ def __init__(
         Args:
             path: diretory where the checkpoint will be writen to.
             single_file_per_rank: Produce one file per rank instead of one file per tensor/blob. Default to True.
-            sync_files: force files to be synced to permanent storage. Default to True.
+            sync_files : force files to be synced to permanent storage. Default to True.
+            thread_count: Number of IO threads to use to write. Default to 1.
+            per_thread_copy_ahead: How many bytes to copy from the GPU ahead of saving then. Default 10Mb.
 
         N. B. If sync_files is disabled, there's no guarantee that the checkpoint will be consistent in the case of a failure.
         """
@@ -161,6 +342,8 @@ def __init__(
         self.path = Path(path)
         self.single_file_per_rank = single_file_per_rank
         self.sync_files = sync_files
+        self.thread_count = thread_count
+        self.per_thread_copy_ahead = per_thread_copy_ahead
 
     def init(self, is_coordinator: bool) -> None:
         pass
@@ -194,24 +377,56 @@ def gen_file():
             file_count += 1
             return file_name
 
-        file_queue = []
+        file_queue: queue.Queue = queue.Queue()
         if self.single_file_per_rank:
-            file_name = gen_file()
-            file_queue.append((self.path / file_name, file_name, plan.items))
+            for bucket in _split_by_size_and_type(
+                self.thread_count, plan.items
+            ):
+                file_name = gen_file()
+                file_queue.put((self.path / file_name, file_name, bucket))
         else:
             for item in plan.items:
                 file_name = gen_file()
-                file_queue.append((self.path / file_name, file_name, [item]))
-
-        results = _write_files_from_queue(
+                file_queue.put((self.path / file_name, file_name, [item]))
+
+        result_queue: queue.Queue = queue.Queue()
+
+        threads = []
+        for _ in range(1, self.thread_count):
+            t = threading.Thread(
+                target=_write_files_from_queue,
+                args=(
+                    file_queue,
+                    result_queue,
+                    planner,
+                    self.per_thread_copy_ahead,
+                    self.sync_files,
+                ),
+            )
+            t.start()
+            threads.append(t)
+
+        _write_files_from_queue(
             file_queue=file_queue,
+            result_queue=result_queue,
             planner=planner,
+            inflight_threshhold=self.per_thread_copy_ahead,
             use_fsync=self.sync_files,
         )
 
-        fut: Future[List[WriteResult]] = Future()
-        fut.set_result(results)
-        return fut
+        for t in threads:
+            t.join()
+
+        res = []
+        try:
+            while True:
+                res += result_queue.get_nowait()
+        except queue.Empty:
+            pass
+
+            fut: Future[List[WriteResult]] = Future()
+            fut.set_result(res)
+            return fut
 
     def finish(
         self, metadata: Metadata, results: List[List[WriteResult]]
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 272dd7479ce5e..c92ba02653f7d 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -365,9 +365,10 @@ def create_tcp_store(
     # TSAN runs much slower.
     TIMEOUT_DEFAULT = 500
 else:
-    TIMEOUT_DEFAULT = int(os.getenv("DISTRIBUTED_TESTS_DEFAULT_TIMEOUT", "300"))
+    TIMEOUT_DEFAULT = int(os.getenv('DISTRIBUTED_TESTS_DEFAULT_TIMEOUT', '300'))
 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400}
 
+
 # https://github.com/pytorch/pytorch/issues/75665
 if TEST_WITH_ROCM:
     TIMEOUT_OVERRIDE["test_join_kwargs"] = 200
diff --git a/torch/testing/_internal/distributed/_shard/sharded_tensor/__init__.py b/torch/testing/_internal/distributed/_shard/sharded_tensor/__init__.py
index c8f22d0903756..0ca11a7fbd724 100644
--- a/torch/testing/_internal/distributed/_shard/sharded_tensor/__init__.py
+++ b/torch/testing/_internal/distributed/_shard/sharded_tensor/__init__.py
@@ -91,6 +91,6 @@ def wrapper(self, *args, **kwargs):
         if backend == "nccl" and torch.cuda.device_count() < self.world_size:
             sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
         self.init_comms(init_rpc=init_rpc, backend=backend)
-        func(self)
+        func(self, *args, **kwargs)
         self.destroy_comms(destroy_rpc=init_rpc)
     return wrapper

From 1a15ad4b86ee13b3be54b3306f6df5b93b90b082 Mon Sep 17 00:00:00 2001
From: Pearu Peterson <pearu.peterson@gmail.com>
Date: Wed, 30 Nov 2022 00:35:45 +0200
Subject: [PATCH 1417/1922] Row and column select support for block compressed
 sparse tensors (#88733)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As in the title:

- Support `select` and `select_copy` on block sparse compressed tensors
- Fixes incorrect results when selecting dense dimensions

The PR also improves the performance of indexing sparse compressed tensors considerably:

<details>

Before:

```python
In [3]: a=torch.rand((1000, 1000)).to_sparse_csr()

In [4]: %timeit a.select(0, 0)
606 µs ± 4.27 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [5]: %timeit a.select(1, 0)
527 µs ± 57.7 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [6]: %timeit a[0, 0]
617 µs ± 3.74 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [7]: a = a.cuda()

In [8]: %timeit a.select(0, 0); torch.cuda.synchronize();
1.19 ms ± 137 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [9]: %timeit a.select(1, 0); torch.cuda.synchronize();
1.2 ms ± 119 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [10]: %timeit a[0, 0]; torch.cuda.synchronize();
1.23 ms ± 482 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
```

This PR:

```python
In [3]: a=torch.rand((1000, 1000)).to_sparse_csr()

In [4]: %timeit a.select(0, 0)
4.75 µs ± 8.94 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

In [5]: %timeit a.select(1, 0)
565 µs ± 156 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [6]: %timeit a[0, 0]
13.1 µs ± 435 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

In [7]: a = a.cuda()

In [8]: %timeit a.select(0, 0); torch.cuda.synchronize();
21.6 µs ± 23.9 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

In [9]: %timeit a.select(1, 0); torch.cuda.synchronize();
1.15 ms ± 3.13 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [10]: %timeit a[0, 0]; torch.cuda.synchronize();
63.7 µs ± 2.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
```

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88733
Approved by: https://github.com/nikitaved, https://github.com/amjames, https://github.com/cpuhrsch
---
 aten/src/ATen/native/native_functions.yaml    |   1 +
 .../ATen/native/sparse/SparseCsrTensor.cpp    | 299 +++++++++++++++++-
 test/test_sparse.py                           |   3 +-
 test/test_sparse_csr.py                       | 111 ++++---
 torch/testing/_internal/common_utils.py       |   3 +-
 5 files changed, 351 insertions(+), 66 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 8cab3667e142b..ec86ce36b4247 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -12921,6 +12921,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutogradNonFunctional: select_copy_symint
+    SparseCsrCPU, SparseCsrCUDA: select_copy_sparse_csr
   tags: view_copy
 
 - func: detach_copy(Tensor self) -> Tensor
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
index ef205c5673ae8..59db274a978b2 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
@@ -23,6 +23,7 @@
 #include <ATen/ops/_sparse_bsr_tensor_unsafe_native.h>
 #include <ATen/ops/_sparse_bsc_tensor_unsafe_native.h>
 #include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
+#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
 #include <ATen/ops/_validate_sparse_compressed_tensor_args_native.h>
 #include <ATen/ops/_validate_sparse_csr_tensor_args_native.h>
 #include <ATen/ops/_validate_sparse_csc_tensor_args_native.h>
@@ -42,6 +43,8 @@
 #include <ATen/ops/resize_native.h>
 #include <ATen/ops/row_indices_native.h>
 #include <ATen/ops/select_native.h>
+#include <ATen/ops/select_copy.h>
+#include <ATen/ops/select_copy_native.h>
 #include <ATen/ops/sparse_compressed_tensor_native.h>
 #include <ATen/ops/sparse_csr_tensor_native.h>
 #include <ATen/ops/sparse_csc_tensor_native.h>
@@ -50,6 +53,7 @@
 #include <ATen/ops/sparse_dim_native.h>
 #include <ATen/ops/values_native.h>
 #include <ATen/ops/_validate_compressed_sparse_indices.h>
+#include <ATen/ops/where.h>
 #endif
 
 namespace at {
@@ -59,6 +63,50 @@ using namespace at::sparse_csr;
 
 namespace {
 
+bool solve_arange(const Tensor& input, int64_t& start, int64_t& end, int64_t& step) {
+  /*
+    This function solves the equation
+
+      input == arange(start, end, step)
+
+    for integers start, end, and step, if possible. If the solution
+    exists, returns true.
+  */
+  int64_t n = input.numel();
+  if (n == 0) {
+    // a trivial solution
+    start = end = 0;
+    step = 1;
+  } else if (n == 1) {
+    // a simple solution
+    start = input[0].item<int64_t>();
+    end = start + 1;
+    step = 1;
+  } else {
+    Tensor first_last = input.slice(0, 0, n, n - 1).cpu();
+    int64_t start_candidate = first_last[0].item<int64_t>();
+    int64_t end_candidate = first_last[1].item<int64_t>() + 1;
+    if (end_candidate - start_candidate == n) {
+      // a special solution
+      start = start_candidate;
+      end = end_candidate;
+      step = 1;
+    } else {
+      // detect if general solution exists
+      Tensor possible_steps = input.slice(0, 1).sub(input.slice(0, 0, n - 1));
+      Tensor possible_step = possible_steps[0];
+      if ((possible_steps.eq(possible_step)).all().item<bool>()) {
+        start = start_candidate;
+        end = end_candidate;
+        step = possible_step.item<int64_t>();
+      } else {
+        // no solution
+        return false;
+      }
+    }
+  }
+  return true;
+}
 
 } // end anonymous namespace
 
@@ -744,17 +792,19 @@ Tensor empty_like_sparse_csr(
   }
 }
 
-Tensor select_sparse_csr(const Tensor& self, int64_t dim, int64_t index) {
+template <bool require_view, bool require_copy>
+Tensor select_sparse_csr_worker(const Tensor& self, int64_t dim, int64_t index) {
+  constexpr const char* select_name = (require_view ? "select()" : "select_copy()");
   AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(
-      self.layout(), "select()", []() { return; });
+      self.layout(), "select", []() { return; });
   TORCH_CHECK_INDEX(
-      self.dim() != 0, "select() cannot be applied to a 0-dim tensor.");
+      self.dim() != 0, select_name, " cannot be applied to a 0-dim tensor.");
   dim = maybe_wrap_dim(dim, self.dim());
   auto size = self.size(dim);
   if (index < -size || index >= size) {
     TORCH_CHECK_INDEX(
         false,
-        "select(): index ",
+        select_name, ": index ",
         index,
         " out of range for tensor of size ",
         self.sizes(),
@@ -765,6 +815,14 @@ Tensor select_sparse_csr(const Tensor& self, int64_t dim, int64_t index) {
     index += size;
   }
 
+  auto select_strided = [](const Tensor& self, int64_t dim, int64_t index) {
+    if (require_copy) {
+      return at::select_copy(self, dim, index);
+    } else {
+      return self.select(dim, index);
+    }
+  };
+
   TORCH_INTERNAL_ASSERT(dim >= 0 && dim < self.dim());
 
   auto new_sizes = DimVector(self.sizes());
@@ -790,7 +848,7 @@ Tensor select_sparse_csr(const Tensor& self, int64_t dim, int64_t index) {
     return at::native::_sparse_compressed_tensor_unsafe(
         compressed_indices.select(dim, index),
         plain_indices.select(dim, index),
-        self.values().select(dim, index),
+        select_strided(self.values(), dim, index),
         new_sizes,
         optTypeMetaToScalarType(options.dtype_opt()),
         options.layout_opt(),
@@ -798,28 +856,237 @@ Tensor select_sparse_csr(const Tensor& self, int64_t dim, int64_t index) {
         options.pinned_memory_opt());
   } else if (dim < n_batch + 2) {
     // Selecting sparse dimension
-    TORCH_CHECK(
-        self.layout() == kSparseCsr || self.layout() == kSparseCsc,
-        "select(): selecting non-batch dimensions is currently only supported for non-blocked sparse compressed layouts tensors.");
     TORCH_CHECK(
         n_batch == 0,
-        "select(): selecting rows or columns is not implemented for batched sparse compressed tensors.")
-    // Converting to COO and calling select is slightly slower than operating
-    // on the CSR indices directly for constructing a COO vector, however
-    // current version is more readable and easier to understand.
-    return self.to_sparse().select(dim, index);
+        select_name, ": selecting sparse dimensions is not implemented for batched sparse compressed tensors.")
+    TORCH_INTERNAL_ASSERT(dim == 0 || dim == 1);
+
+    DimVector blocksize{1, 1};
+    AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "select", [&] {}, [&] {
+      blocksize[0] = std::max<int64_t>(1, self.values().size(n_batch + 1));
+      blocksize[1] = std::max<int64_t>(1, self.values().size(n_batch + 2));
+    });
+
+    auto indices_options = compressed_indices.options();
+    int64_t fast_dim = AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "select", [&]() { return 0; }, [&]() { return 1; });
+    int64_t other_dim = (dim == 0 ? 1 : 0);
+    Tensor indices;
+    Tensor values;
+    bool is_view = dim == fast_dim;
+    if (is_view) {
+      // select is always a view operation
+      Tensor start_end = compressed_indices.narrow(0, index / blocksize[dim], 2).cpu();
+      int64_t start = start_end[0].item<int64_t>();
+      int64_t end = start_end[1].item<int64_t>();
+      indices = plain_indices.slice(0, start, end);
+      values = self.values().slice(0, start, end);
+    } else {
+      Tensor decompressed_indices = at::_convert_indices_from_csr_to_coo(compressed_indices, plain_indices)
+        .select(0, 0);
+
+      Tensor dim_indices = at::where(plain_indices.eq(index / blocksize[dim]))[0];
+      // Notice that dim_indices is a sorted sequence of non-negative
+      // distinct integers. Below we'll try to solve `dim_indices ==
+      // arange(start, stop, step)`. If the solution exists then the
+      // select will be a view operation also for the `dim !=
+      // fast_dim` case.
+      int64_t start{}, end{}, step{};
+      if (solve_arange(dim_indices, start, end, step)) {
+        indices = decompressed_indices.slice(0, start, end, step);
+        values = self.values().slice(0, start, end, step);
+        is_view = true;
+      } else {
+        // select will be a copy operation due to index_select!
+        indices = decompressed_indices.index_select(0, dim_indices);
+        values = self.values().index_select(0, dim_indices);
+      }
+    }
+
+    AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(self.layout(), "select", [&]() {},
+        [&]() {
+          /*
+            The formula for select indices and values below are best
+            explained by an example. Consider a BSR tensor with a
+            block size (2, 3) having four blocks (the other two blocks
+            contain all zeros and hence will not be specified):
+
+              [ 1  2  3] | [ 7  8  9]
+              [ 4  5  6] | [10 11 12]
+              ---------------------
+              [13 14 15] | [ 0  0  0]
+              [16 17 18] | [ 0  0  0]
+              -----------------------
+              [ 0  0  0] | [19 20 21]
+              [ 0  0  0] | [22 23 24]
+
+            that represents a 6 x 6 tensor:
+
+              [  1  2  3  7  8  9 ]
+              [  4  5  6 10 11 12 ]
+              [ 13 14 15  0  0  0 ]
+              [ 16 17 18  0  0  0 ]
+              [  0  0  0 19 20 21 ]
+              [  0  0  0 22 23 24 ]
+
+            The corresponding data for the BSR representation is:
+
+              crow_indices = [0 2 3 4]
+              col_indices =  [0 1 0 1]
+              values = [ [[1 2 3], [4 5 6]], [[7 8 9], [10 11 12]], [[13 14 15], [16 17 18]], [[19 20 21], [22 23 24]] ]
+              shape = (6, 6)
+
+            From crow_indices, we can find that
+
+              row_indices = [0 0 1 2]
+
+            In the following, we'll illustrate the details of
+            computing the result of torch.select_copy(input, dim,
+            index) where dim is 0 or 1, and index is in
+            range(shape[dim]).
+
+            Select a row of a BSR tensor
+            ----------------------------
+
+            We will consider first the dim=0 case that corresponds to
+            selecting a index-th row of the tensor. For instance, for
+            dim=0 and index=1, the expected result would represent a
+            1D tensor:
+
+              [  4  5  6 10 11 12 ]
+
+            that is a concatenated tensor of certain slices from the
+            first and the second block that is computed as follows:
+
+              values[dim_indices].select(1 + dim, index % blocksize[dim]).flatten(0, 1)
+              -> values[[0, 1]][:, 1 % 2].flatten(0, 1)
+              -> [ [[1 2 3], [4 5 6]], [[7 8 9], [10 11 12]] ][:, 1].flatten(0, 1)
+              -> [ [4 5 6], [10 11 12]].flatten(0, 1)
+              -> [ 4 5 6 10 11 12]
+
+            where dim_indices is found as
+
+              where(row_indices == index//blocksize[dim])
+              -> where([0 0 1 2] == 1//2)
+              -> [0 1]
+
+            The corresponding column indices are computed as
+
+              (col_indices[dim_indices].mul(blocksize[other_dim]).unsqueeze(1) + arange(blocksize[other_dim]).unsqueeze(0)).flatten(0, 1)
+
+            where other_dim is 1 if dim is 0, and 0 if dim is 1. Let's
+            expand the above expression with the data in the example:
+
+              -> (col_indices[[0, 1]].mul(3).unsqueeze(1) + arange(3).unsqueeze(0)).flatten(0, 1)
+              -> ([[0 1].mul(3).unsqueeze(1) + [[0 1 2]]).flatten(0, 1)
+              -> ([[[0], [3]] + [[0 1 2]]).flatten(0, 1)     <- here addition will use broadcasting rules!
+              -> ([[[0 1 2], [3 4 5]]).flatten(0, 1)
+              -> [0 1 2 3 4 5]
+
+            Finally, the select(dim=0, index=1) op on the given sparse
+            compressed tensors will return a COO tensor:
+
+              sparse_coo_tensor([0 1 2 3 4 5].unsqueeze(0), [4 5 6 10 11 12], (6,))
+
+            that represents the expected result: [ 4 5 6 10 11 12 ]
+
+            Select a column of a BSR tensor
+            -------------------------------
+
+            Next, we'll consider the dim=1 case that corresponds to
+            selecting the index-th column of the tensor. For instance,
+            for dim=1 and index=4, the expected result would represent
+            a 1D tensor:
+
+              [  8 11 0  0 20 23]
+
+            that is a concatenated tensor of certain slices from the
+            second and the last block:
+
+              values[dim_indices].select(1 + dim, index % blocksize[dim]).flatten(0, 1)
+              -> values[[1, 3]][:, :, 4 % 3 ].flatten(0, 1)
+              -> [ [[7 8 9], [10 11 12]], [[19 20 21], [22 23 24]] ][:, 1, 1].flatten(0, 1)
+              -> [ [8 11], [20 23]].flatten(0, 1)
+              -> [ 8 11 20 23 ]
+
+            The corresponding row indices are computed as
+
+              (row_indices[dim_indices].mul(blocksize[other_dim]).unsqueeze(1) + arange(blocksize[other_dim]).unsqueeze(0)).flatten(0, 1)
+
+            where dim_indices is
+
+              where(col_indices == index//blocksize[dim])
+              -> where([0 1 0 1] == 4//3)
+              -> [1 3]
+
+            and we have
+
+              (row_indices[dim_indices].mul(blocksize[other_dim]).unsqueeze(1) + arange(blocksize[other_dim]).unsqueeze(0)).flatten(0, 1)
+              -> (row_indices[[1 3]].mul(2).unsqueeze(1) + arange(2).unsqueeze(0)).flatten(0, 1)
+              -> ([0 4].unsqueeze(1) + [0 1].unsqueeze(0)).flatten(0, 1)
+              -> ([[0], [4]] + [[0 1]]).flatten(0, 1)     <- here addition will use broadcasting rules!
+              -> ([[0 1], [4 5]]).flatten(0, 1)
+              -> [ 0 1 4 5 ]
+
+            Finally, the select(dim=1, index=4) op on the given sparse
+            compressed tensors will return a COO tensor:
+
+              sparse_coo_tensor([0 1 4 5].unsqueeze(0), [8 11 20 23], (6,))
+
+            that represents the expected result: [ 8 11 0 0 20 23 ]
+
+           */
+          Tensor subblock_indices = at::arange(0, blocksize[other_dim], indices_options);
+          indices = indices.mul(blocksize[other_dim]).unsqueeze(1).add(subblock_indices.unsqueeze(0)).flatten(0, 1);
+          values = values.select(dim + 1, index % blocksize[dim]).flatten(0, 1);
+          // flatten(0, 1) can be a view or a copy operation. If view
+          // is required, it will be checked below via is_alias_of,
+          // otherwise, we'll check if copy is made here to avoid
+          // unnecessary clone below:
+          if (require_copy) {
+            is_view = values.is_alias_of(self.values());
+          }
+        });
+
+    if (require_view) {
+      TORCH_CHECK(values.is_alias_of(self.values()), select_name,
+                  ": no view exists for the given input, consider using torch.select_copy.");
+    }
+
+    indices = indices.unsqueeze(0).to(kLong);
+    if (require_copy && is_view) {
+      values = values.clone();
+    }
+    return at::_sparse_coo_tensor_unsafe(indices, values, new_sizes)._coalesced_(true);
   } else {
     // Selecting dense dimension
-    return AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(
+    Tensor new_values = AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(
         self.layout(),
         "select",
         // Non blocked layout (2 sparse dims become 1 nnz dim in values, so dim
         // is found one position to the left)
-        [&]() { return self.values().select(dim - 1, index); },
+        [&]() { return select_strided(self.values(), dim - 1, index); },
         // Block layout (2 sparse dims become 1 nnz dim + 2 block-shape dims in
         // values, so dim is found 1 position to the right)
-        [&]() { return self.values().select(dim + 1, index); });
+        [&]() { return select_strided(self.values(), dim + 1, index); });
+    return at::native::_sparse_compressed_tensor_unsafe(
+        compressed_indices,
+        plain_indices,
+        new_values,
+        new_sizes,
+        optTypeMetaToScalarType(options.dtype_opt()),
+        options.layout_opt(),
+        options.device_opt(),
+        options.pinned_memory_opt());
   }
 }
+
+Tensor select_sparse_csr(const Tensor& self, int64_t dim, int64_t index) {
+  return select_sparse_csr_worker<true, false>(self, dim, index);
+}
+
+Tensor select_copy_sparse_csr(const Tensor& self, int64_t dim, int64_t index) {
+  return select_sparse_csr_worker<false, true>(self, dim, index);
+}
+
 } // namespace native
 } // namespace at
diff --git a/test/test_sparse.py b/test/test_sparse.py
index 5b399271bab62..e03e2f1682893 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -4052,8 +4052,7 @@ def test_basic(self):
 class TestSparseAny(TestCase):
 
     def test_generate_simple_inputs(self):
-        # Temporarily disable BSC and BSC layouts as these don't support select yet, see the next PR in the stack.
-        layouts = [torch.strided, torch.sparse_coo, torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc][:-2]
+        layouts = [torch.strided, torch.sparse_coo, torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc]
 
         tested_combinations = set()
         for tensors in zip(*map(self.generate_simple_inputs, layouts)):
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index 7275ecfa9f55b..225cde31483eb 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -322,9 +322,7 @@ def test_empty_errors(self, layout, device, dtype):
     @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16))
     def test_clone(self, layout, device, dtype):
         for sparse in self.generate_simple_inputs(
-                layout, device=device, dtype=dtype, index_dtype=torch.int32,
-                # Temporarily disable testing batch block tensors:
-                enable_batch=layout in {torch.sparse_csr, torch.sparse_csc}):
+                layout, device=device, dtype=dtype, index_dtype=torch.int32):
             cloned_sparse = sparse.clone()
             self.assertEqual(sparse, cloned_sparse)
 
@@ -864,6 +862,51 @@ def test_pickle(self, layout, dtype, device):
 
             self.assertEqual(sparse, sparse_loaded)
 
+    @all_sparse_compressed_layouts()
+    @parametrize("index_dtype", [torch.int32, torch.int64])
+    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))
+    def test_select_copy(self, device, dtype, index_dtype, layout):
+
+        def is_view_of(base, other):
+            # a shameless copy of TestViewOps.is_view_of
+            if ((not other._is_view() or
+                 other is base or
+                 other._base is not base or
+                 base.device != other.device)):
+                return False
+            if base.device.type == 'cpu' or base.device.type == 'cuda':
+                if base._storage().data_ptr() != other._storage().data_ptr():
+                    return False
+            return True
+
+        kwargs = dict(device=device, dtype=dtype, index_dtype=index_dtype)
+        for sparse, dense in zip(self.generate_simple_inputs(layout, **kwargs),
+                                 self.generate_simple_inputs(torch.strided, **kwargs)):
+            if layout in {torch.sparse_csr, torch.sparse_bsr}:
+                n_batchdim = sparse.crow_indices().ndim - 1
+            elif layout in {torch.sparse_csc, torch.sparse_bsc}:
+                n_batchdim = sparse.ccol_indices().ndim - 1
+            else:
+                assert 0  # unreachable
+            self.assertEqual(sparse, dense)
+            for dim in range(sparse.ndim):
+                if sparse.shape[dim] == 0:
+                    with self.assertRaisesRegex(IndexError, "index 0 out of range for tensor of size"):
+                        torch.select_copy(sparse, dim, 0)
+                    with self.assertRaisesRegex(IndexError, "index 0 out of range for tensor of size"):
+                        torch.select_copy(dense, dim, 0)
+                elif n_batchdim and dim >= n_batchdim and dim < n_batchdim + 2:
+                    with self.assertRaisesRegex(
+                            RuntimeError,
+                            "selecting sparse dimensions is not implemented for batched sparse compressed tensors"):
+                        torch.select_copy(sparse, dim, 0)
+                else:
+                    for index in {0, sparse.shape[dim] // 2, sparse.shape[dim] - 1}:
+                        dense_select = torch.select_copy(dense, dim, index)
+                        sparse_select = torch.select_copy(sparse, dim, index)
+                        self.assertEqual(sparse_select, dense_select)
+                        self.assertFalse(is_view_of(sparse_select.values(), sparse.values()))
+
 
 def _npref_block_addmm_addmv(c, a, b, alpha, beta):
     return alpha * (a @ b) + beta * c
@@ -939,52 +982,26 @@ def test_select(self, device, dtype, index_dtype, layout):
                                                        device=device)
         self.assertEqual(expected_sparse_selected12, sparse_selected12)
 
-        # Select from dense dimensions
-        sparse_hybrid = self.genSparseCompressedTensor(shape + (4, 2),
-                                                       nnz,
-                                                       device=device,
-                                                       layout=layout,
-                                                       dtype=dtype,
-                                                       index_dtype=index_dtype,
-                                                       blocksize=blocksize,
-                                                       dense_dims=2)
-        sparse_hybrid_dense_selected = sparse_hybrid.select(4, 1)
-        expected_sparse_hybrid_dense_selected = sparse_hybrid.values().select(-2, 1)
-        self.assertEqual(expected_sparse_hybrid_dense_selected, sparse_hybrid_dense_selected)
-
-
-
         # selecting rows/col with batch dims not allowed
         sparse_non_batched = sparse[0, 0]
-        # select from sparse dimensions if layout supports is
-        if layout in {torch.sparse_csr, torch.sparse_csc}:
-
-            for select_args in [(0, 0), (1, 1)]:
-                sparse_selected = sparse_non_batched.select(*select_args)
-                dense_selected = sparse_non_batched.to_dense().select(*select_args)
-                self.assertEqual(dense_selected, sparse_selected)
-
-            self.assertEqual(sparse[0, 0, 0, 0], sparse.to_dense()[0, 0, 0, 0])
-            # assigning to sparse through indexing is disabled, not tested generally because only layouts supporting
-            # sparse dim select will get far enough to test
-            with self.assertRaisesRegex(TypeError, "Cannot assign to a sparse tensor"):
-                sparse[0, 0, 0, 0] = 99.0
-
-            # select from sparse dimensions without removing batch dims, not tested generally because only layouts
-            # supporting sparse dim select will get far enough
-            msg = "selecting rows or columns is not implemented for batched sparse compressed tensors."
-            with self.assertRaisesRegex(RuntimeError, msg):
-                sparse.select(-2, 0)
-
-            with self.assertRaisesRegex(RuntimeError, msg):
-                sparse.select(-1, 0)
-        # ensure raises if layout does not support
-        else:
-            msg = (
-                "selecting non-batch dimensions is currently only supported for non-blocked sparse "
-                "compressed layouts tensors.")
-            with self.assertRaisesRegex(RuntimeError, msg):
-                sparse_non_batched.select(0, 0)
+        # select from sparse dimensions
+        for select_args in [(0, 0), (1, 1)]:
+            sparse_selected = sparse_non_batched.select(*select_args)
+            dense_selected = sparse_non_batched.to_dense().select(*select_args)
+            self.assertEqual(dense_selected, sparse_selected)
+
+        self.assertEqual(sparse[0, 0, 0, 0], sparse.to_dense()[0, 0, 0, 0])
+        # assigning to sparse through indexing is disabled
+        with self.assertRaisesRegex(TypeError, "Cannot assign to a sparse tensor"):
+            sparse[0, 0, 0, 0] = 99.0
+
+        # select from sparse dimensions without removing batch dims
+        msg = "selecting sparse dimensions is not implemented for batched sparse compressed tensors."
+        with self.assertRaisesRegex(RuntimeError, msg):
+            sparse.select(-2, 0)
+
+        with self.assertRaisesRegex(RuntimeError, msg):
+            sparse.select(-1, 0)
 
     @skipMeta
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index dc8b04d063718..9eecc5d5b2dc4 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1863,7 +1863,8 @@ def to_dense(tensor):
             def partial_to_dense(tensor):
                 if tensor.layout not in compressed_sparse_layouts or tensor.values().ndim == 1:
                     return tensor.to_dense()
-                return torch.stack([partial_to_dense(sub_tensor) for sub_tensor in tensor])
+                lst = [partial_to_dense(torch.select_copy(tensor, 0, i)) for i in range(len(tensor))]
+                return torch.stack(lst) if lst else tensor.to_dense()
 
             return partial_to_dense(tensor)
 

From 44df4ba4bc0c65fdd48994b9c2432abd4d885195 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 30 Nov 2022 11:37:56 +0000
Subject: [PATCH 1418/1922] Revert "Add bits tensor types (#88594)"

This reverts commit f3b1315eee92ac108f9ceacafaf4ad560c78769d.

Reverted https://github.com/pytorch/pytorch/pull/88594 on behalf of https://github.com/jeanschmidt due to breaking internal builds
---
 aten/src/ATen/DLConvertor.cpp      |  7 ----
 c10/core/ScalarType.h              | 64 +++++++++---------------------
 c10/test/util/bits16_test.py       | 43 --------------------
 c10/test/util/bits_test.py         | 56 --------------------------
 c10/util/bits.h                    | 61 ----------------------------
 torch/csrc/utils/tensor_dtypes.cpp | 10 -----
 6 files changed, 18 insertions(+), 223 deletions(-)
 delete mode 100644 c10/test/util/bits16_test.py
 delete mode 100644 c10/test/util/bits_test.py
 delete mode 100644 c10/util/bits.h

diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp
index 542adb9698176..614dc46158e8f 100644
--- a/aten/src/ATen/DLConvertor.cpp
+++ b/aten/src/ATen/DLConvertor.cpp
@@ -60,13 +60,6 @@ DLDataType getDLDataType(const Tensor& t) {
     case ScalarType::QUInt2x4:
       TORCH_CHECK(false, "QUInt/QInt types are not supported by dlpack");
       break;
-    case ScalarType::Bits1x8:
-    case ScalarType::Bits2x4:
-    case ScalarType::Bits4x2:
-    case ScalarType::Bits8:
-    case ScalarType::Bits16:
-      TORCH_CHECK(false, "Bit types are not supported by dlpack");
-      break;
     case ScalarType::Undefined:
       TORCH_CHECK(false, "Undefined is not a valid ScalarType");
     case ScalarType::NumOptions:
diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h
index 2fa3c9ceb4ea4..51de905def9c1 100644
--- a/c10/core/ScalarType.h
+++ b/c10/core/ScalarType.h
@@ -3,7 +3,6 @@
 #include <c10/util/BFloat16.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Half.h>
-#include <c10/util/bits.h>
 #include <c10/util/complex.h>
 #include <c10/util/qint32.h>
 #include <c10/util/qint8.h>
@@ -44,12 +43,7 @@ namespace c10 {
   _(c10::qint32, QInt32) /* 14 */                        \
   _(at::BFloat16, BFloat16) /* 15 */                     \
   _(c10::quint4x2, QUInt4x2) /* 16 */                    \
-  _(c10::quint2x4, QUInt2x4) /* 17 */                    \
-  _(c10::bits1x8, Bits1x8) /* 18 */                      \
-  _(c10::bits2x4, Bits2x4) /* 19 */                      \
-  _(c10::bits4x2, Bits4x2) /* 20 */                      \
-  _(c10::bits8, Bits8) /* 21 */                          \
-  _(c10::bits16, Bits16) /* 22 */
+  _(c10::quint2x4, QUInt2x4) /* 17 */
 
 // If you want to support ComplexHalf for real, add ComplexHalf
 // into this macro (and change the name).  But beware: convert()
@@ -278,12 +272,6 @@ static inline bool isQIntType(ScalarType t) {
       t == ScalarType::QUInt2x4;
 }
 
-static inline bool isBitsType(ScalarType t) {
-  return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 ||
-      t == ScalarType::Bits4x2 || t == ScalarType::Bits8 ||
-      t == ScalarType::Bits16;
-}
-
 static inline ScalarType toQIntType(ScalarType t) {
   switch (t) {
     case ScalarType::Byte:
@@ -321,12 +309,6 @@ static inline bool isSignedType(ScalarType t) {
     return std::numeric_limits<ctype>::is_signed;
 
   switch (t) {
-    case ScalarType::Bits1x8:
-    case ScalarType::Bits2x4:
-    case ScalarType::Bits4x2:
-    case ScalarType::Bits8:
-    case ScalarType::Bits16:
-      TORCH_CHECK(false, "Bits types are undefined");
     case ScalarType::ComplexHalf:
     case ScalarType::ComplexFloat:
     case ScalarType::ComplexDouble:
@@ -441,38 +423,28 @@ static inline ScalarType promoteTypes(ScalarType a, ScalarType b) {
         toString(b));
   }
 
-  if (isBitsType(a) && a == b) {
-    return a;
-  } else if (isBitsType(a) || isBitsType(b)) {
-    return ScalarType::Undefined;
-  }
-
   // this matrix has to be consistent with
   // AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS undefined is used where we
   // are not sure about the correct value for type promotion.
   static constexpr ScalarType _promoteTypesLookup[static_cast<int>(
       ScalarType::NumOptions)][static_cast<int>(ScalarType::NumOptions)] = {
-      // clang-format off
-      /*        u1  i1  i2  i4  i8  f2  f4  f8  c2  c4  c8  b1  q1  q2  q3  bf  q4  q5*/
-      /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, ud, ud, ud, bf, ud, ud},
-      /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, ud, ud, ud, bf, ud, ud},
-      /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, ud, ud, ud, bf, ud, ud},
-      /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, ud, ud, ud, bf, ud, ud},
-      /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, ud, ud, ud, bf, ud, ud},
-      /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, ud, ud, ud, f4, ud, ud},
-      /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, ud, ud, ud, f4, ud, ud},
-      /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, ud, ud, ud, f8, ud, ud},
-      /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, ud, ud, ud, c4, ud, ud},
-      /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, ud, ud, ud, c4, ud, ud},
-      /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, ud, ud, ud, c8, ud, ud},
-      /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, ud, ud, ud, bf, ud, ud},
-      /* q1 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
-      /* q2 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
-      /* q3 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
-      /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, ud, ud, ud, bf, ud, ud},
-      /* q4 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
-      /* q5 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
-      // clang-format on
+      /*        u1  i1  i2  i4  i8  f2  f4  f8  c2  c4  c8  b1  q1  q2  q3  bf*/
+      /* u1 */ {u1, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, u1, ud, ud, ud, bf},
+      /* i1 */ {i2, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, i1, ud, ud, ud, bf},
+      /* i2 */ {i2, i2, i2, i4, i8, f2, f4, f8, c2, c4, c8, i2, ud, ud, ud, bf},
+      /* i4 */ {i4, i4, i4, i4, i8, f2, f4, f8, c2, c4, c8, i4, ud, ud, ud, bf},
+      /* i8 */ {i8, i8, i8, i8, i8, f2, f4, f8, c2, c4, c8, i8, ud, ud, ud, bf},
+      /* f2 */ {f2, f2, f2, f2, f2, f2, f4, f8, c2, c4, c8, f2, ud, ud, ud, f4},
+      /* f4 */ {f4, f4, f4, f4, f4, f4, f4, f8, c4, c4, c8, f4, ud, ud, ud, f4},
+      /* f8 */ {f8, f8, f8, f8, f8, f8, f8, f8, c8, c8, c8, f8, ud, ud, ud, f8},
+      /* c2 */ {c2, c2, c2, c2, c2, c2, c4, c8, c2, c4, c8, c2, ud, ud, ud, c4},
+      /* c4 */ {c4, c4, c4, c4, c4, c4, c4, c8, c4, c4, c8, c4, ud, ud, ud, c4},
+      /* c8 */ {c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, c8, ud, ud, ud, c8},
+      /* b1 */ {u1, i1, i2, i4, i8, f2, f4, f8, c2, c4, c8, b1, ud, ud, ud, bf},
+      /* q1 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
+      /* q2 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
+      /* q3 */ {ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud, ud},
+      /* bf */ {bf, bf, bf, bf, bf, f4, f4, f8, c4, c4, c8, bf, ud, ud, ud, bf},
   };
   return _promoteTypesLookup[static_cast<int>(a)][static_cast<int>(b)];
 }
diff --git a/c10/test/util/bits16_test.py b/c10/test/util/bits16_test.py
deleted file mode 100644
index 97a8220f16fc8..0000000000000
--- a/c10/test/util/bits16_test.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import torch
-from torch.testing._internal.common_utils import run_tests, TestCase
-from torch.utils._mode_utils import no_dispatch
-from torch.utils._pytree import tree_map
-
-class TensorSubclassDemo(torch.Tensor):
-    def __new__(cls, elem):
-        assert elem.dtype == torch.bits16
-        return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
-
-    def __init__(self, elem):
-        super().__init__()
-
-    @classmethod
-    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
-        def unwrap(t):
-            if isinstance(t, torch.Tensor):
-                with no_dispatch():
-                    return t.view(torch.int16)
-            return t
-
-        args = tree_map(unwrap, args)
-        kwargs = tree_map(unwrap, kwargs)
-        with no_dispatch():
-            out = func(*args, **kwargs)
-        return out.view(torch.bits16)
-
-    def __repr__(self) -> str:
-        with no_dispatch():
-            return f"TensorSubclassDemo{self.view(torch.int16)}"
-
-
-class TestBits16(TestCase):
-    def test(self):
-        t = torch.zeros(20, dtype=torch.int16).view(torch.bits16)
-        _ = torch.empty(20, dtype=torch.bits16)
-
-        s = TensorSubclassDemo(t)
-        s = s + 1
-
-
-if __name__ == '__main__':
-    run_tests()
diff --git a/c10/test/util/bits_test.py b/c10/test/util/bits_test.py
deleted file mode 100644
index c87c8428b29a1..0000000000000
--- a/c10/test/util/bits_test.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import torch
-from torch.testing._internal.common_utils import run_tests, TestCase
-from torch.utils._mode_utils import no_dispatch
-from torch.utils._pytree import tree_map
-
-class Int16Tensor(torch.Tensor):
-    def __new__(cls, elem):
-        assert elem.dtype == torch.bits16
-        return torch.Tensor._make_subclass(cls, elem, elem.requires_grad)
-
-    def __init__(self, elem):
-        super().__init__()
-
-    @classmethod
-    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
-        def unwrap(t):
-            if isinstance(t, torch.Tensor):
-                with no_dispatch():
-                    return t.view(torch.int16)
-            return t
-        args = tree_map(unwrap, args)
-        kwargs = tree_map(unwrap, kwargs)
-
-        with no_dispatch():
-            out = func(*args, **kwargs)
-
-        def wrap(t):
-            if isinstance(t, torch.Tensor):
-                with no_dispatch():
-                    return t.view(torch.bits16)
-            return t
-        out = tree_map(wrap, out)
-        return out
-
-    def __repr__(self) -> str:
-        with no_dispatch():
-            t16 = self.view(torch.int16)
-            return f"TensorSubclassDemo{self.view(torch.int16)}"
-
-
-class TestBits(TestCase):
-    def test_types(self):
-        bits_types = [torch.bits1x8, torch.bits2x4, torch.bits4x2, torch.bits8, torch.bits16]
-        for bits_type in bits_types:
-            _ = torch.zeros(20, dtype=torch.int32).view(bits_type)
-            _ = torch.empty(20, dtype=bits_type)
-
-    def test_subclass(self):
-        t = torch.zeros(20, dtype=torch.int16).view(torch.bits16)
-        s = Int16Tensor(t)
-        s = s + 1 - 1
-        self.assertTrue(torch.allclose(s, torch.zeros(20, dtype=torch.bits16)))
-
-
-if __name__ == '__main__':
-    run_tests()
diff --git a/c10/util/bits.h b/c10/util/bits.h
deleted file mode 100644
index 89abf454791ef..0000000000000
--- a/c10/util/bits.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#pragma once
-#include <cstdint>
-
-#include <c10/macros/Macros.h>
-
-namespace c10 {
-
-/**
- * bits1x8 is an uninterpreted dtype of a tensor with 1 bit (packed to byte
- * boundary), without any semantics defined.
- */
-struct alignas(1) bits1x8 {
-  using underlying = uint8_t;
-  uint8_t val_;
-  bits1x8() = default;
-  C10_HOST_DEVICE explicit bits1x8(uint8_t val) : val_(val) {}
-};
-
-/**
- * bits2x4 is an uninterpreted dtype of a tensor with 2 bits (packed to byte
- * boundary), without any semantics defined.
- */
-struct alignas(1) bits2x4 {
-  using underlying = uint8_t;
-  uint8_t val_;
-  bits2x4() = default;
-  C10_HOST_DEVICE explicit bits2x4(uint8_t val) : val_(val) {}
-};
-
-/**
- * bits4x2 is an uninterpreted dtype of a tensor with 4 bits (packed to byte
- * boundary), without any semantics defined.
- */
-struct alignas(1) bits4x2 {
-  using underlying = uint8_t;
-  uint8_t val_;
-  bits4x2() = default;
-  C10_HOST_DEVICE explicit bits4x2(uint8_t val) : val_(val) {}
-};
-
-/**
- * bits8 is an uninterpreted dtype of a tensor with 8 bits, without any
- * semantics defined.
- */
-struct alignas(1) bits8 {
-  uint8_t val_;
-  bits8() = default;
-  C10_HOST_DEVICE explicit bits8(uint8_t val) : val_(val) {}
-};
-
-/**
- * bits16 is an uninterpreted dtype of a tensor with 16 bits, without any
- * semantics defined.
- */
-struct alignas(2) bits16 {
-  uint16_t val_;
-  bits16() = default;
-  C10_HOST_DEVICE explicit bits16(uint16_t val) : val_(val) {}
-};
-
-} // namespace c10
diff --git a/torch/csrc/utils/tensor_dtypes.cpp b/torch/csrc/utils/tensor_dtypes.cpp
index 07ed3297d557d..3e0e3acf38c29 100644
--- a/torch/csrc/utils/tensor_dtypes.cpp
+++ b/torch/csrc/utils/tensor_dtypes.cpp
@@ -52,16 +52,6 @@ std::pair<std::string, std::string> getDtypeNames(at::ScalarType scalarType) {
       return std::make_pair("quint4x2", "");
     case at::ScalarType::QUInt2x4:
       return std::make_pair("quint2x4", "");
-    case at::ScalarType::Bits1x8:
-      return std::make_pair("bits1x8", "");
-    case at::ScalarType::Bits2x4:
-      return std::make_pair("bits2x4", "");
-    case at::ScalarType::Bits4x2:
-      return std::make_pair("bits4x2", "");
-    case at::ScalarType::Bits8:
-      return std::make_pair("bits8", "");
-    case at::ScalarType::Bits16:
-      return std::make_pair("bits16", "");
     default:
       throw std::runtime_error("Unimplemented scalar type");
   }

From ab959fa67f707c67e1ed24e3aee586bddb379531 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 30 Nov 2022 12:06:18 +0000
Subject: [PATCH 1419/1922] Revert "[aarch64] add SLEEF dependency for aten_cpu
 (#89475)"

This reverts commit 3cef87f9fd59adb681d910b8edbc1f33e0be5ad2.

Reverted https://github.com/pytorch/pytorch/pull/89475 on behalf of https://github.com/jeanschmidt due to breaking internal builds
---
 buckbuild.bzl | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/buckbuild.bzl b/buckbuild.bzl
index d3d5dda57c41a..75c16ba006550 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -147,7 +147,6 @@ THIRD_PARTY_LIBS = {
     "rt": ["//xplat/third-party/linker_lib:rt", "//third_party:rt"],
     "ruy": ["//third-party/ruy:ruy_xplat_lib", "//third_party:ruy_lib"],
     "typing-extensions": ["//third-party/typing-extensions:typing-extensions", "//third_party:typing-extensions"],
-    "sleef": ["//third-party/sleef:sleef", "//third_party:sleef"],
 }
 
 def third_party(name):
@@ -1931,12 +1930,7 @@ def define_buck_targets(
                 third_party("glog"),
                 third_party("XNNPACK"),
                 third_party("pocketfft"),
-            ] + select({
-                "DEFAULT": [],
-                "ovr_config//runtime:fbcode-arm64": [
-                  third_party("sleef"),
-                ],
-            }),
+            ],
             compiler_flags = get_aten_compiler_flags(),
             exported_preprocessor_flags = get_aten_preprocessor_flags(),
             exported_deps = [

From 3657a6db57b44fa3add69f55cce8e2603b83efb8 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Wed, 30 Nov 2022 12:57:32 +0000
Subject: [PATCH 1420/1922] [benchmarks][dynamo] Trying CI - Set train() for
 TIMM models accuracy tests (#89780)

Moving to train mode for TIMM models and also raising batch size for accuracy testing.

Raising batch size seems to remove a lot of noise/instability coming from batch_norm decomposition.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89780
Approved by: https://github.com/ngimel
---
 benchmarks/dynamo/common.py      | 40 ++++++++++++++++++++++++--------
 benchmarks/dynamo/timm_models.py |  9 +++++++
 benchmarks/dynamo/torchbench.py  | 16 ++++++++++---
 torch/_inductor/ir.py            |  1 +
 4 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index cabbe3c411617..87aeb4d7c3561 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -111,6 +111,7 @@
     *CI_SKIP_INDCUTOR_INFERENCE,
     # TorchBench
     "Background_Matting",  # fp64_OOM
+    "dlrm",  # Fails on CI - unable to repro locally
     "mobilenet_v3_large",  # accuracy
     "resnet50_quantized_qat",  # Eager model failed to run
     # Huggingface
@@ -121,13 +122,17 @@
     # TIMM
     "convit_base",  # fp64_OOM
     "dm_nfnet_f0",  # accuracy
+    "convmixer_768_32",  # accuracy - Unable to repro on A100
+    "hrnet_w18",  # accuracy - Unable to repro on A100
+    "sebotnet33ts_256",  # accuracy - Unable to repro on A100
+    "hrnet_w18",  # accuracy - Unable to repro on A100
+    "eca_botnext26ts_256",  # accuracy - Fails on A100
     "eca_halonext26ts",  # accuracy
     "fbnetv3_b",  # accuracy
     "levit_128",  # fp64_OOM
     "res2net101_26w_4s",  # accuracy
-    "resnest101e",  # accuracy
-    "rexnet_100",  # accuracy
     "spnasnet_100",  # accuracy
+    "resnest101e",  # accuracy
     "swin_base_patch4_window7_224",  # accuracy
     "xcit_large_24_p8_224",  # fp64_OOM
 ]
@@ -193,6 +198,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         pass
 
 
+def nothing(f):
+    return f
+
+
 @functools.lru_cache(None)
 def patch_torch_manual_seed():
     """Make torch manual seed deterministic. Helps with accuracy testing."""
@@ -1062,10 +1071,6 @@ def record_status(accuracy_status):
             )
             return "PASS" if accuracy_status in ("pass", "pass_due_to_skip") else "FAIL"
 
-        tolerance, cos_similarity = self.get_tolerance_and_cosine_flag(
-            self.args.training, current_device, name
-        )
-
         if name in self.skip_accuracy_checks_large_models_dashboard:
             return record_status("pass_due_to_skip")
 
@@ -1089,11 +1094,18 @@ def deepcopy_and_maybe_ddp(model):
                 )
             )
         except Exception:
-            log.warning(f"fp64 golden ref were not generated for {name}")
+            log.warning(
+                f"fp64 golden ref were not generated for {name}. Setting accuracy check to cosine"
+            )
+            self.args.cosine = True
             fp64_outputs = None
             if self.args.ci and self.args.training:
                 return record_status("fp64_OOM")
 
+        tolerance, cos_similarity = self.get_tolerance_and_cosine_flag(
+            self.args.training, current_device, name
+        )
+
         # Cast the model to float16/float32 as necessary
         model, example_inputs = self.maybe_cast(model, example_inputs)
         accuracy_status = "pass"
@@ -1693,11 +1705,18 @@ def run(runner, args, original_dir=None):
         if args.batch_size is None:
             if runner.suite_name == "huggingface":
                 args.batch_size = 1
+            elif runner.suite_name == "torchbench":
+                args.batch_size = 4
             else:
-                args.batch_size = 2
+                # Larger batch size of TIMM models to have stable batch_norm
+                assert runner.suite_name == "timm_models"
+                args.batch_size = 8
 
         # Remove sources of randomness
-        args.use_eval_mode = True
+        if runner.suite_name != "timm_models":
+            # TODO - Using train mode for timm_models. Move to train mode for HF and Torchbench as well.
+            args.use_eval_mode = True
+        inductor_config.fallback_random = True
 
         # Remove randomeness when torch manual seed is called
         patch_torch_manual_seed()
@@ -1880,7 +1899,8 @@ def run(runner, args, original_dir=None):
             nopython=args.nopython,
         )
     elif args.nothing:
-        pass
+        optimize_ctx = nothing
+        output_filename = "nothing.csv"
     elif args.backend:
         optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
         experiment = speedup_experiment
diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py
index 70d06ab318189..133be5f85dbad 100755
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@@ -75,6 +75,11 @@ def pip_install(package):
 }
 
 
+MAX_BATCH_SIZE_FOR_ACCURACY_CHECK = {
+    "cait_m36_384": 4,
+}
+
+
 def refresh_model_names():
     import glob
 
@@ -223,6 +228,10 @@ def load_model(
             )
         batch_size = batch_size or recorded_batch_size
 
+        # Control the memory footprint for few models
+        if self.args.accuracy and model_name in MAX_BATCH_SIZE_FOR_ACCURACY_CHECK:
+            batch_size = min(batch_size, MAX_BATCH_SIZE_FOR_ACCURACY_CHECK[model_name])
+
         # example_inputs = torch.randn(
         #     (batch_size,) + input_size, device=device, dtype=data_dtype
         # )
diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py
index 24a049f14ba2a..1ace31840e7a9 100755
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@@ -119,8 +119,7 @@ def setup_torchbench_cwd():
 }
 
 REQUIRE_COSINE_TOLERACE = {
-    # https://github.com/pytorch/torchdynamo/issues/556
-    "resnet50_quantized_qat",
+    # Just keeping it here even though its empty, if we need this in future.
 }
 
 # non-deterministic output / cant check correctness
@@ -183,6 +182,12 @@ def setup_torchbench_cwd():
 }
 
 
+MAX_BATCH_SIZE_FOR_ACCURACY_CHECK = {
+    "hf_GPT2": 2,
+    "pytorch_unet": 2,
+}
+
+
 class TorchBenchmarkRunner(BenchmarkRunner):
     def __init__(self):
         super(TorchBenchmarkRunner, self).__init__()
@@ -250,6 +255,10 @@ def load_model(
         if batch_size is None and is_training and model_name in USE_SMALL_BATCH_SIZE:
             batch_size = USE_SMALL_BATCH_SIZE[model_name]
 
+        # Control the memory footprint for few models
+        if self.args.accuracy and model_name in MAX_BATCH_SIZE_FOR_ACCURACY_CHECK:
+            batch_size = min(batch_size, MAX_BATCH_SIZE_FOR_ACCURACY_CHECK[model_name])
+
         # workaround "RuntimeError: not allowed to set torch.backends.cudnn flags"
         torch.backends.__allow_nonbracketed_mutation_flag = True
         extra_args = []
@@ -327,9 +336,10 @@ def get_tolerance_and_cosine_flag(self, is_training, current_device, name):
         tolerance = 1e-4
         cosine = self.args.cosine
         # Increase the tolerance for torch allclose
-        if self.args.float16:
+        if self.args.float16 or self.args.amp:
             return 1e-3, cosine
         if is_training and current_device == "cuda":
+            tolerance = 1e-3
             if name in REQUIRE_COSINE_TOLERACE:
                 cosine = True
             elif name in REQUIRE_HIGHER_TOLERANCE:
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index b8e872cdeee5c..7a182752bb16a 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2985,6 +2985,7 @@ def create(cls, kernel, *args, **kwargs):
             aten._fft_c2c.out,
             aten._linalg_svd.default,
             aten._linalg_svd.U,
+            aten._fused_moving_avg_obs_fq_helper_functional,
         )
         context = (
             FakeTensorMode if kernel not in fake_incorrect_kernels else nullcontext

From 0d8ff601bade93ce5e8df3cf78d48d2a3dbf256a Mon Sep 17 00:00:00 2001
From: "haozhe.zhu" <haozhe.zhu@intel.com>
Date: Tue, 29 Nov 2022 23:54:54 +0000
Subject: [PATCH 1421/1922] add bf16 in fp32 out fast path for embedingbag in
 caffe2 perfkernel (#89198)

Add BF16 in FP32 out kernel into Caffe2 emb perfkernels. And also update the python code-gen files to generate the kernel.
The ut will be covered in the next PR(#89199) in this stack ( Tested by nn.EmbeddingBag with BF16 data type)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89198
Approved by: https://github.com/jgong5, https://github.com/kit1980
---
 caffe2/perfkernels/embedding_lookup_idx.cc    |    5 +
 .../perfkernels/embedding_lookup_idx_avx2.cc  | 1658 +++++++++++++++--
 caffe2/perfkernels/hp_emblookup_codegen.py    |   52 +-
 3 files changed, 1533 insertions(+), 182 deletions(-)

diff --git a/caffe2/perfkernels/embedding_lookup_idx.cc b/caffe2/perfkernels/embedding_lookup_idx.cc
index 2c9900b73e06b..48c869ee70381 100644
--- a/caffe2/perfkernels/embedding_lookup_idx.cc
+++ b/caffe2/perfkernels/embedding_lookup_idx.cc
@@ -1,5 +1,6 @@
 #include "caffe2/perfkernels/embedding_lookup_idx.h"
 
+#include <c10/util/BFloat16.h>
 #include <c10/util/Half.h>
 #include <c10/util/irange.h>
 #include "caffe2/core/common.h"
@@ -214,6 +215,8 @@ EMBEDDING_IDX_SPECIALIZATION(int32_t, float, float, float, false);
 EMBEDDING_IDX_SPECIALIZATION(int64_t, float, float, float, false);
 EMBEDDING_IDX_SPECIALIZATION(int32_t, half, at::Half, float, false);
 EMBEDDING_IDX_SPECIALIZATION(int64_t, half, at::Half, float, false);
+EMBEDDING_IDX_SPECIALIZATION(int32_t, bfloat16, at::BFloat16, float, false);
+EMBEDDING_IDX_SPECIALIZATION(int64_t, bfloat16, at::BFloat16, float, false);
 EMBEDDING_IDX_SPECIALIZATION(int32_t, uint8_t, uint8_t, float, false);
 EMBEDDING_IDX_SPECIALIZATION(int64_t, uint8_t, uint8_t, float, false);
 
@@ -221,6 +224,8 @@ EMBEDDING_IDX_SPECIALIZATION(int32_t, float, float, float, true);
 EMBEDDING_IDX_SPECIALIZATION(int64_t, float, float, float, true);
 EMBEDDING_IDX_SPECIALIZATION(int32_t, half, at::Half, float, true);
 EMBEDDING_IDX_SPECIALIZATION(int64_t, half, at::Half, float, true);
+EMBEDDING_IDX_SPECIALIZATION(int32_t, bfloat16, at::BFloat16, float, true);
+EMBEDDING_IDX_SPECIALIZATION(int64_t, bfloat16, at::BFloat16, float, true);
 EMBEDDING_IDX_SPECIALIZATION(int32_t, uint8_t, uint8_t, float, true);
 EMBEDDING_IDX_SPECIALIZATION(int64_t, uint8_t, uint8_t, float, true);
 
diff --git a/caffe2/perfkernels/embedding_lookup_idx_avx2.cc b/caffe2/perfkernels/embedding_lookup_idx_avx2.cc
index 674af836ba10b..3ed48a1c52322 100644
--- a/caffe2/perfkernels/embedding_lookup_idx_avx2.cc
+++ b/caffe2/perfkernels/embedding_lookup_idx_avx2.cc
@@ -6,6 +6,7 @@
 //// --------------------------
 
 #include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
 #include <immintrin.h>
 namespace caffe2 {
 
@@ -341,6 +342,7 @@ static bool EmbeddingLookupIdx_int32_t_float_float__avx2_fma(
     }
   } else {
     // generic code
+    // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays)
     for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
       float* op = &out[rangeIndex * block_size];
       int64_t j = 0;
@@ -471,6 +473,7 @@ static bool EmbeddingLookupIdx_int64_t_float_float__avx2_fma(
     bool normalize_by_lengths,
     float* out) {
   const int64_t prefdist_T0 = 16;
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   const int64_t fused_block_size = block_size + 0;
   int64_t dataInd = 0;
   if (block_size == 128) {
@@ -511,7 +514,9 @@ static bool EmbeddingLookupIdx_int64_t_float_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const float* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -626,7 +631,9 @@ static bool EmbeddingLookupIdx_int64_t_float_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const float* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -701,7 +708,9 @@ static bool EmbeddingLookupIdx_int64_t_float_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const float* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -756,7 +765,9 @@ static bool EmbeddingLookupIdx_int64_t_float_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const float* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -780,6 +791,7 @@ static bool EmbeddingLookupIdx_int64_t_float_float__avx2_fma(
     }
   } else {
     // generic code
+    // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays)
     for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
       float* op = &out[rangeIndex * block_size];
       int64_t j = 0;
@@ -807,7 +819,9 @@ static bool EmbeddingLookupIdx_int64_t_float_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const float* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -1477,6 +1491,7 @@ static bool EmbeddingLookupIdx_int64_t_half_float__avx2_fma(
     bool normalize_by_lengths,
     float* out) {
   const int64_t prefdist_T0 = 16;
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   const int64_t fused_block_size = block_size + 0;
   int64_t dataInd = 0;
   if (block_size == 128) {
@@ -1517,7 +1532,9 @@ static bool EmbeddingLookupIdx_int64_t_half_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const at::Half* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -1692,7 +1709,9 @@ static bool EmbeddingLookupIdx_int64_t_half_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const at::Half* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -1797,7 +1816,9 @@ static bool EmbeddingLookupIdx_int64_t_half_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const at::Half* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -1867,7 +1888,9 @@ static bool EmbeddingLookupIdx_int64_t_half_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const at::Half* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -1928,7 +1951,9 @@ static bool EmbeddingLookupIdx_int64_t_half_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const at::Half* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -2022,12 +2047,12 @@ bool EmbeddingLookupIdx_int64_t_half_float_true__avx2_fma(
 }
 
 template <bool IS_WEIGHT_POSITIONAL>
-static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
+static bool EmbeddingLookupIdx_int32_t_bfloat16_float__avx2_fma(
     const int64_t block_size,
     const int64_t output_size,
     const int64_t index_size,
     const int64_t data_size,
-    const uint8_t* input,
+    const at::BFloat16* input,
     const int* indices,
     const int* offsets,
     const float* weights,
@@ -2070,16 +2095,11 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
           return false;
         }
         float wgt = 1.f;
-        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-        float bio;
         if (weights) {
           wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
         }
-        bio = wgt * scale_bias[2 * idx + 1];
-        wgt = wgt * scale_bias[2 * idx];
-        __m256 vbio = _mm256_set1_ps(bio);
         __m256 vwgt = _mm256_set1_ps(wgt);
-        const uint8_t* ip = &input[idx * fused_block_size];
+        const at::BFloat16* ip = &input[idx * fused_block_size];
         const int next_T0 = (dataInd < index_size - prefdist_T0)
             // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
@@ -2089,104 +2109,138 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
           return false;
         }
-        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
         vop0 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
-            _mm256_add_ps(vop0, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (0)))),
+                16)),
+            vop0);
         _mm_prefetch(
             reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
         vop8 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
-            _mm256_add_ps(vop8, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (8)))),
+                16)),
+            vop8);
         // skip unnecessary prefetch of (&ip_next_T0[8])
         vop16 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (16))))),
-            _mm256_add_ps(vop16, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (16)))),
+                16)),
+            vop16);
         // skip unnecessary prefetch of (&ip_next_T0[16])
         vop24 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (24))))),
-            _mm256_add_ps(vop24, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (24)))),
+                16)),
+            vop24);
         // skip unnecessary prefetch of (&ip_next_T0[24])
         vop32 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (32))))),
-            _mm256_add_ps(vop32, vbio));
-        // skip unnecessary prefetch of (&ip_next_T0[32])
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (32)))),
+                16)),
+            vop32);
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[32]), _MM_HINT_T0);
         vop40 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (40))))),
-            _mm256_add_ps(vop40, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (40)))),
+                16)),
+            vop40);
         // skip unnecessary prefetch of (&ip_next_T0[40])
         vop48 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (48))))),
-            _mm256_add_ps(vop48, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (48)))),
+                16)),
+            vop48);
         // skip unnecessary prefetch of (&ip_next_T0[48])
         vop56 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (56))))),
-            _mm256_add_ps(vop56, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (56)))),
+                16)),
+            vop56);
         // skip unnecessary prefetch of (&ip_next_T0[56])
         vop64 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (64))))),
-            _mm256_add_ps(vop64, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (64)))),
+                16)),
+            vop64);
         _mm_prefetch(
             reinterpret_cast<const char*>(&ip_next_T0[64]), _MM_HINT_T0);
         vop72 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (72))))),
-            _mm256_add_ps(vop72, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (72)))),
+                16)),
+            vop72);
         // skip unnecessary prefetch of (&ip_next_T0[72])
         vop80 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (80))))),
-            _mm256_add_ps(vop80, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (80)))),
+                16)),
+            vop80);
         // skip unnecessary prefetch of (&ip_next_T0[80])
         vop88 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (88))))),
-            _mm256_add_ps(vop88, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (88)))),
+                16)),
+            vop88);
         // skip unnecessary prefetch of (&ip_next_T0[88])
         vop96 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (96))))),
-            _mm256_add_ps(vop96, vbio));
-        // skip unnecessary prefetch of (&ip_next_T0[96])
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (96)))),
+                16)),
+            vop96);
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[96]), _MM_HINT_T0);
         vop104 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (104))))),
-            _mm256_add_ps(vop104, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (104)))),
+                16)),
+            vop104);
         // skip unnecessary prefetch of (&ip_next_T0[104])
         vop112 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (112))))),
-            _mm256_add_ps(vop112, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (112)))),
+                16)),
+            vop112);
         // skip unnecessary prefetch of (&ip_next_T0[112])
         vop120 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (120))))),
-            _mm256_add_ps(vop120, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (120)))),
+                16)),
+            vop120);
         // skip unnecessary prefetch of (&ip_next_T0[120])
       }
       if (!normalize_by_lengths || length == 0) {
@@ -2250,16 +2304,11 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
           return false;
         }
         float wgt = 1.f;
-        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-        float bio;
         if (weights) {
           wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
         }
-        bio = wgt * scale_bias[2 * idx + 1];
-        wgt = wgt * scale_bias[2 * idx];
-        __m256 vbio = _mm256_set1_ps(bio);
         __m256 vwgt = _mm256_set1_ps(wgt);
-        const uint8_t* ip = &input[idx * fused_block_size];
+        const at::BFloat16* ip = &input[idx * fused_block_size];
         const int next_T0 = (dataInd < index_size - prefdist_T0)
             // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
@@ -2269,55 +2318,72 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
           return false;
         }
-        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
         vop0 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
-            _mm256_add_ps(vop0, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (0)))),
+                16)),
+            vop0);
         _mm_prefetch(
             reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
         vop8 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
-            _mm256_add_ps(vop8, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (8)))),
+                16)),
+            vop8);
         // skip unnecessary prefetch of (&ip_next_T0[8])
         vop16 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (16))))),
-            _mm256_add_ps(vop16, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (16)))),
+                16)),
+            vop16);
         // skip unnecessary prefetch of (&ip_next_T0[16])
         vop24 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (24))))),
-            _mm256_add_ps(vop24, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (24)))),
+                16)),
+            vop24);
         // skip unnecessary prefetch of (&ip_next_T0[24])
         vop32 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (32))))),
-            _mm256_add_ps(vop32, vbio));
-        // skip unnecessary prefetch of (&ip_next_T0[32])
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (32)))),
+                16)),
+            vop32);
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[32]), _MM_HINT_T0);
         vop40 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (40))))),
-            _mm256_add_ps(vop40, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (40)))),
+                16)),
+            vop40);
         // skip unnecessary prefetch of (&ip_next_T0[40])
         vop48 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (48))))),
-            _mm256_add_ps(vop48, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (48)))),
+                16)),
+            vop48);
         // skip unnecessary prefetch of (&ip_next_T0[48])
         vop56 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (56))))),
-            _mm256_add_ps(vop56, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (56)))),
+                16)),
+            vop56);
         // skip unnecessary prefetch of (&ip_next_T0[56])
       }
       if (!normalize_by_lengths || length == 0) {
@@ -2361,16 +2427,11 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
           return false;
         }
         float wgt = 1.f;
-        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-        float bio;
         if (weights) {
           wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
         }
-        bio = wgt * scale_bias[2 * idx + 1];
-        wgt = wgt * scale_bias[2 * idx];
-        __m256 vbio = _mm256_set1_ps(bio);
         __m256 vwgt = _mm256_set1_ps(wgt);
-        const uint8_t* ip = &input[idx * fused_block_size];
+        const at::BFloat16* ip = &input[idx * fused_block_size];
         const int next_T0 = (dataInd < index_size - prefdist_T0)
             // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
@@ -2380,31 +2441,39 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
           return false;
         }
-        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
         vop0 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
-            _mm256_add_ps(vop0, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (0)))),
+                16)),
+            vop0);
         _mm_prefetch(
             reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
         vop8 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
-            _mm256_add_ps(vop8, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (8)))),
+                16)),
+            vop8);
         // skip unnecessary prefetch of (&ip_next_T0[8])
         vop16 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (16))))),
-            _mm256_add_ps(vop16, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (16)))),
+                16)),
+            vop16);
         // skip unnecessary prefetch of (&ip_next_T0[16])
         vop24 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (24))))),
-            _mm256_add_ps(vop24, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (24)))),
+                16)),
+            vop24);
         // skip unnecessary prefetch of (&ip_next_T0[24])
       }
       if (!normalize_by_lengths || length == 0) {
@@ -2438,16 +2507,11 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
           return false;
         }
         float wgt = 1.f;
-        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-        float bio;
         if (weights) {
           wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
         }
-        bio = wgt * scale_bias[2 * idx + 1];
-        wgt = wgt * scale_bias[2 * idx];
-        __m256 vbio = _mm256_set1_ps(bio);
         __m256 vwgt = _mm256_set1_ps(wgt);
-        const uint8_t* ip = &input[idx * fused_block_size];
+        const at::BFloat16* ip = &input[idx * fused_block_size];
         const int next_T0 = (dataInd < index_size - prefdist_T0)
             // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
@@ -2457,19 +2521,23 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
           return false;
         }
-        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
         vop0 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
-            _mm256_add_ps(vop0, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (0)))),
+                16)),
+            vop0);
         _mm_prefetch(
             reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
         vop8 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
-            _mm256_add_ps(vop8, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (8)))),
+                16)),
+            vop8);
         // skip unnecessary prefetch of (&ip_next_T0[8])
       }
       if (!normalize_by_lengths || length == 0) {
@@ -2483,6 +2551,8 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
     }
   } else {
     // generic code
+    // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays)
+    alignas(64) at::BFloat16 vtmp1[8] = {0};
     for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
       float* op = &out[rangeIndex * block_size];
       int64_t j = 0;
@@ -2504,16 +2574,11 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
           return false;
         }
         float wgt = 1.f;
-        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-        float bio;
         if (weights) {
           wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
         }
-        bio = wgt * scale_bias[2 * idx + 1];
-        wgt = wgt * scale_bias[2 * idx];
-        __m256 vbio = _mm256_set1_ps(bio);
         __m256 vwgt = _mm256_set1_ps(wgt);
-        const uint8_t* ip = &input[idx * fused_block_size];
+        const at::BFloat16* ip = &input[idx * fused_block_size];
         const int next_T0 = (dataInd < index_size - prefdist_T0)
             // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
@@ -2523,21 +2588,27 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
           return false;
         }
-        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
         j = 0;
         for (; j + 8 <= block_size; j += 8) {
           _mm256_storeu_ps(
               &op[j],
               _mm256_fmadd_ps(
                   vwgt,
-                  _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(
-                      reinterpret_cast<const __m128i*>(&ip[j])))),
-                  _mm256_add_ps(_mm256_loadu_ps(&op[j]), vbio)));
+                  _mm256_castsi256_ps(_mm256_slli_epi32(
+                      _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                          reinterpret_cast<const __m128i*>(&ip[j]))),
+                      16)),
+                  _mm256_loadu_ps(&op[j])));
           _mm_prefetch(
               reinterpret_cast<const char*>(&ip_next_T0[j]), _MM_HINT_T0);
         }
         for (; j < block_size; j++) {
-          op[j] = std::fma(wgt, (float)ip[j], bio + op[j]);
+          vtmp1[0] = ip[j];
+          __m256 vtmp2 = _mm256_castsi256_ps(_mm256_slli_epi32(
+              _mm256_cvtepu16_epi32(*(reinterpret_cast<const __m128i*>(vtmp1))),
+              16));
+          op[j] = std::fma(wgt, ((float*)(&vtmp2))[0], op[j]);
         }
       }
       if (normalize_by_lengths && length) {
@@ -2556,19 +2627,19 @@ static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
   }
   return dataInd == index_size;
 }
-bool EmbeddingLookupIdx_int32_t_uint8_t_float_false__avx2_fma(
+bool EmbeddingLookupIdx_int32_t_bfloat16_float_false__avx2_fma(
     const int64_t block_size,
     const int64_t output_size,
     const int64_t index_size,
     const int64_t data_size,
-    const uint8_t* input,
+    const at::BFloat16* input,
     const int* indices,
     const int* offsets,
     const float* weights,
     const float* scale_bias,
     bool normalize_by_lengths,
     float* out) {
-  return EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma<false>(
+  return EmbeddingLookupIdx_int32_t_bfloat16_float__avx2_fma<false>(
       block_size,
       output_size,
       index_size,
@@ -2581,19 +2652,19 @@ bool EmbeddingLookupIdx_int32_t_uint8_t_float_false__avx2_fma(
       normalize_by_lengths,
       out);
 }
-bool EmbeddingLookupIdx_int32_t_uint8_t_float_true__avx2_fma(
+bool EmbeddingLookupIdx_int32_t_bfloat16_float_true__avx2_fma(
     const int64_t block_size,
     const int64_t output_size,
     const int64_t index_size,
     const int64_t data_size,
-    const uint8_t* input,
+    const at::BFloat16* input,
     const int* indices,
     const int* offsets,
     const float* weights,
     const float* scale_bias,
     bool normalize_by_lengths,
     float* out) {
-  return EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma<true>(
+  return EmbeddingLookupIdx_int32_t_bfloat16_float__avx2_fma<true>(
       block_size,
       output_size,
       index_size,
@@ -2608,12 +2679,12 @@ bool EmbeddingLookupIdx_int32_t_uint8_t_float_true__avx2_fma(
 }
 
 template <bool IS_WEIGHT_POSITIONAL>
-static bool EmbeddingLookupIdx_int64_t_uint8_t_float__avx2_fma(
+static bool EmbeddingLookupIdx_int64_t_bfloat16_float__avx2_fma(
     const int64_t block_size,
     const int64_t output_size,
     const int64_t index_size,
     const int64_t data_size,
-    const uint8_t* input,
+    const at::BFloat16* input,
     const int64_t* indices,
     const int64_t* offsets,
     const float* weights,
@@ -2621,6 +2692,7 @@ static bool EmbeddingLookupIdx_int64_t_uint8_t_float__avx2_fma(
     bool normalize_by_lengths,
     float* out) {
   const int64_t prefdist_T0 = 16;
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
   const int64_t fused_block_size = block_size + 0;
   int64_t dataInd = 0;
   if (block_size == 128) {
@@ -2655,83 +2727,1304 @@ static bool EmbeddingLookupIdx_int64_t_uint8_t_float__avx2_fma(
           return false;
         }
         float wgt = 1.f;
-        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-        float bio;
         if (weights) {
           wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
         }
-        bio = wgt * scale_bias[2 * idx + 1];
-        wgt = wgt * scale_bias[2 * idx];
-        __m256 vbio = _mm256_set1_ps(bio);
         __m256 vwgt = _mm256_set1_ps(wgt);
-        const uint8_t* ip = &input[idx * fused_block_size];
+        const at::BFloat16* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
           return false;
         }
-        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
         vop0 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
-            _mm256_add_ps(vop0, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (0)))),
+                16)),
+            vop0);
         _mm_prefetch(
             reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
         vop8 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
-            _mm256_add_ps(vop8, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (8)))),
+                16)),
+            vop8);
         // skip unnecessary prefetch of (&ip_next_T0[8])
         vop16 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (16))))),
-            _mm256_add_ps(vop16, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (16)))),
+                16)),
+            vop16);
         // skip unnecessary prefetch of (&ip_next_T0[16])
         vop24 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (24))))),
-            _mm256_add_ps(vop24, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (24)))),
+                16)),
+            vop24);
         // skip unnecessary prefetch of (&ip_next_T0[24])
         vop32 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (32))))),
-            _mm256_add_ps(vop32, vbio));
-        // skip unnecessary prefetch of (&ip_next_T0[32])
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (32)))),
+                16)),
+            vop32);
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[32]), _MM_HINT_T0);
         vop40 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (40))))),
-            _mm256_add_ps(vop40, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (40)))),
+                16)),
+            vop40);
         // skip unnecessary prefetch of (&ip_next_T0[40])
         vop48 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (48))))),
-            _mm256_add_ps(vop48, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (48)))),
+                16)),
+            vop48);
         // skip unnecessary prefetch of (&ip_next_T0[48])
         vop56 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (56))))),
-            _mm256_add_ps(vop56, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (56)))),
+                16)),
+            vop56);
         // skip unnecessary prefetch of (&ip_next_T0[56])
         vop64 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
-                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (64))))),
-            _mm256_add_ps(vop64, vbio));
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (64)))),
+                16)),
+            vop64);
         _mm_prefetch(
             reinterpret_cast<const char*>(&ip_next_T0[64]), _MM_HINT_T0);
         vop72 = _mm256_fmadd_ps(
             vwgt,
-            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (72)))),
+                16)),
+            vop72);
+        // skip unnecessary prefetch of (&ip_next_T0[72])
+        vop80 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (80)))),
+                16)),
+            vop80);
+        // skip unnecessary prefetch of (&ip_next_T0[80])
+        vop88 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (88)))),
+                16)),
+            vop88);
+        // skip unnecessary prefetch of (&ip_next_T0[88])
+        vop96 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (96)))),
+                16)),
+            vop96);
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[96]), _MM_HINT_T0);
+        vop104 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (104)))),
+                16)),
+            vop104);
+        // skip unnecessary prefetch of (&ip_next_T0[104])
+        vop112 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (112)))),
+                16)),
+            vop112);
+        // skip unnecessary prefetch of (&ip_next_T0[112])
+        vop120 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (120)))),
+                16)),
+            vop120);
+        // skip unnecessary prefetch of (&ip_next_T0[120])
+      }
+      if (!normalize_by_lengths || length == 0) {
+        _mm256_storeu_ps(&op[0], vop0);
+        _mm256_storeu_ps(&op[8], vop8);
+        _mm256_storeu_ps(&op[16], vop16);
+        _mm256_storeu_ps(&op[24], vop24);
+        _mm256_storeu_ps(&op[32], vop32);
+        _mm256_storeu_ps(&op[40], vop40);
+        _mm256_storeu_ps(&op[48], vop48);
+        _mm256_storeu_ps(&op[56], vop56);
+        _mm256_storeu_ps(&op[64], vop64);
+        _mm256_storeu_ps(&op[72], vop72);
+        _mm256_storeu_ps(&op[80], vop80);
+        _mm256_storeu_ps(&op[88], vop88);
+        _mm256_storeu_ps(&op[96], vop96);
+        _mm256_storeu_ps(&op[104], vop104);
+        _mm256_storeu_ps(&op[112], vop112);
+        _mm256_storeu_ps(&op[120], vop120);
+      } else {
+        __m256 vlen_inv = _mm256_set1_ps(1.0f / length);
+        _mm256_storeu_ps(&op[0], _mm256_mul_ps(vop0, vlen_inv));
+        _mm256_storeu_ps(&op[8], _mm256_mul_ps(vop8, vlen_inv));
+        _mm256_storeu_ps(&op[16], _mm256_mul_ps(vop16, vlen_inv));
+        _mm256_storeu_ps(&op[24], _mm256_mul_ps(vop24, vlen_inv));
+        _mm256_storeu_ps(&op[32], _mm256_mul_ps(vop32, vlen_inv));
+        _mm256_storeu_ps(&op[40], _mm256_mul_ps(vop40, vlen_inv));
+        _mm256_storeu_ps(&op[48], _mm256_mul_ps(vop48, vlen_inv));
+        _mm256_storeu_ps(&op[56], _mm256_mul_ps(vop56, vlen_inv));
+        _mm256_storeu_ps(&op[64], _mm256_mul_ps(vop64, vlen_inv));
+        _mm256_storeu_ps(&op[72], _mm256_mul_ps(vop72, vlen_inv));
+        _mm256_storeu_ps(&op[80], _mm256_mul_ps(vop80, vlen_inv));
+        _mm256_storeu_ps(&op[88], _mm256_mul_ps(vop88, vlen_inv));
+        _mm256_storeu_ps(&op[96], _mm256_mul_ps(vop96, vlen_inv));
+        _mm256_storeu_ps(&op[104], _mm256_mul_ps(vop104, vlen_inv));
+        _mm256_storeu_ps(&op[112], _mm256_mul_ps(vop112, vlen_inv));
+        _mm256_storeu_ps(&op[120], _mm256_mul_ps(vop120, vlen_inv));
+      }
+    }
+  } else if (block_size == 64) {
+    // unrolling 8 times
+    for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      __m256 vop0 = _mm256_setzero_ps();
+      __m256 vop8 = _mm256_setzero_ps();
+      __m256 vop16 = _mm256_setzero_ps();
+      __m256 vop24 = _mm256_setzero_ps();
+      __m256 vop32 = _mm256_setzero_ps();
+      __m256 vop40 = _mm256_setzero_ps();
+      __m256 vop48 = _mm256_setzero_ps();
+      __m256 vop56 = _mm256_setzero_ps();
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int64_t idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const at::BFloat16* ip = &input[idx * fused_block_size];
+        const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int64_t idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        vop0 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (0)))),
+                16)),
+            vop0);
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
+        vop8 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (8)))),
+                16)),
+            vop8);
+        // skip unnecessary prefetch of (&ip_next_T0[8])
+        vop16 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (16)))),
+                16)),
+            vop16);
+        // skip unnecessary prefetch of (&ip_next_T0[16])
+        vop24 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (24)))),
+                16)),
+            vop24);
+        // skip unnecessary prefetch of (&ip_next_T0[24])
+        vop32 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (32)))),
+                16)),
+            vop32);
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[32]), _MM_HINT_T0);
+        vop40 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (40)))),
+                16)),
+            vop40);
+        // skip unnecessary prefetch of (&ip_next_T0[40])
+        vop48 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (48)))),
+                16)),
+            vop48);
+        // skip unnecessary prefetch of (&ip_next_T0[48])
+        vop56 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (56)))),
+                16)),
+            vop56);
+        // skip unnecessary prefetch of (&ip_next_T0[56])
+      }
+      if (!normalize_by_lengths || length == 0) {
+        _mm256_storeu_ps(&op[0], vop0);
+        _mm256_storeu_ps(&op[8], vop8);
+        _mm256_storeu_ps(&op[16], vop16);
+        _mm256_storeu_ps(&op[24], vop24);
+        _mm256_storeu_ps(&op[32], vop32);
+        _mm256_storeu_ps(&op[40], vop40);
+        _mm256_storeu_ps(&op[48], vop48);
+        _mm256_storeu_ps(&op[56], vop56);
+      } else {
+        __m256 vlen_inv = _mm256_set1_ps(1.0f / length);
+        _mm256_storeu_ps(&op[0], _mm256_mul_ps(vop0, vlen_inv));
+        _mm256_storeu_ps(&op[8], _mm256_mul_ps(vop8, vlen_inv));
+        _mm256_storeu_ps(&op[16], _mm256_mul_ps(vop16, vlen_inv));
+        _mm256_storeu_ps(&op[24], _mm256_mul_ps(vop24, vlen_inv));
+        _mm256_storeu_ps(&op[32], _mm256_mul_ps(vop32, vlen_inv));
+        _mm256_storeu_ps(&op[40], _mm256_mul_ps(vop40, vlen_inv));
+        _mm256_storeu_ps(&op[48], _mm256_mul_ps(vop48, vlen_inv));
+        _mm256_storeu_ps(&op[56], _mm256_mul_ps(vop56, vlen_inv));
+      }
+    }
+  } else if (block_size == 32) {
+    // unrolling 4 times
+    for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      __m256 vop0 = _mm256_setzero_ps();
+      __m256 vop8 = _mm256_setzero_ps();
+      __m256 vop16 = _mm256_setzero_ps();
+      __m256 vop24 = _mm256_setzero_ps();
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int64_t idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const at::BFloat16* ip = &input[idx * fused_block_size];
+        const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int64_t idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        vop0 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (0)))),
+                16)),
+            vop0);
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
+        vop8 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (8)))),
+                16)),
+            vop8);
+        // skip unnecessary prefetch of (&ip_next_T0[8])
+        vop16 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (16)))),
+                16)),
+            vop16);
+        // skip unnecessary prefetch of (&ip_next_T0[16])
+        vop24 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (24)))),
+                16)),
+            vop24);
+        // skip unnecessary prefetch of (&ip_next_T0[24])
+      }
+      if (!normalize_by_lengths || length == 0) {
+        _mm256_storeu_ps(&op[0], vop0);
+        _mm256_storeu_ps(&op[8], vop8);
+        _mm256_storeu_ps(&op[16], vop16);
+        _mm256_storeu_ps(&op[24], vop24);
+      } else {
+        __m256 vlen_inv = _mm256_set1_ps(1.0f / length);
+        _mm256_storeu_ps(&op[0], _mm256_mul_ps(vop0, vlen_inv));
+        _mm256_storeu_ps(&op[8], _mm256_mul_ps(vop8, vlen_inv));
+        _mm256_storeu_ps(&op[16], _mm256_mul_ps(vop16, vlen_inv));
+        _mm256_storeu_ps(&op[24], _mm256_mul_ps(vop24, vlen_inv));
+      }
+    }
+  } else if (block_size == 16) {
+    // unrolling 2 times
+    for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      __m256 vop0 = _mm256_setzero_ps();
+      __m256 vop8 = _mm256_setzero_ps();
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int64_t idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const at::BFloat16* ip = &input[idx * fused_block_size];
+        const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int64_t idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        vop0 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (0)))),
+                16)),
+            vop0);
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
+        vop8 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_castsi256_ps(_mm256_slli_epi32(
+                _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                    reinterpret_cast<const __m128i*>(ip + (8)))),
+                16)),
+            vop8);
+        // skip unnecessary prefetch of (&ip_next_T0[8])
+      }
+      if (!normalize_by_lengths || length == 0) {
+        _mm256_storeu_ps(&op[0], vop0);
+        _mm256_storeu_ps(&op[8], vop8);
+      } else {
+        __m256 vlen_inv = _mm256_set1_ps(1.0f / length);
+        _mm256_storeu_ps(&op[0], _mm256_mul_ps(vop0, vlen_inv));
+        _mm256_storeu_ps(&op[8], _mm256_mul_ps(vop8, vlen_inv));
+      }
+    }
+  } else {
+    // generic code
+    // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays)
+    alignas(64) at::BFloat16 vtmp1[8] = {0};
+    for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      int64_t j = 0;
+      for (; j + 8 <= block_size; j += 8) {
+        _mm256_storeu_ps(op + j, _mm256_setzero_ps());
+      }
+      for (; j < block_size; j++) {
+        op[j] = 0.0f;
+      }
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int64_t idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const at::BFloat16* ip = &input[idx * fused_block_size];
+        const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int64_t idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const at::BFloat16* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        j = 0;
+        for (; j + 8 <= block_size; j += 8) {
+          _mm256_storeu_ps(
+              &op[j],
+              _mm256_fmadd_ps(
+                  vwgt,
+                  _mm256_castsi256_ps(_mm256_slli_epi32(
+                      _mm256_cvtepu16_epi32(_mm_loadu_si128(
+                          reinterpret_cast<const __m128i*>(&ip[j]))),
+                      16)),
+                  _mm256_loadu_ps(&op[j])));
+          _mm_prefetch(
+              reinterpret_cast<const char*>(&ip_next_T0[j]), _MM_HINT_T0);
+        }
+        for (; j < block_size; j++) {
+          vtmp1[0] = ip[j];
+          __m256 vtmp2 = _mm256_castsi256_ps(_mm256_slli_epi32(
+              _mm256_cvtepu16_epi32(*(reinterpret_cast<const __m128i*>(vtmp1))),
+              16));
+          op[j] = std::fma(wgt, ((float*)(&vtmp2))[0], op[j]);
+        }
+      }
+      if (normalize_by_lengths && length) {
+        float len_inv = 1.0f / length;
+        __m256 vlen_inv = _mm256_set1_ps(len_inv);
+        j = 0;
+        for (; j + 8 <= block_size; j += 8) {
+          _mm256_storeu_ps(
+              &op[j], _mm256_mul_ps(_mm256_loadu_ps(&op[j]), vlen_inv));
+        }
+        for (; j < block_size; j++) {
+          op[j] = len_inv * op[j];
+        }
+      }
+    }
+  }
+  return dataInd == index_size;
+}
+bool EmbeddingLookupIdx_int64_t_bfloat16_float_false__avx2_fma(
+    const int64_t block_size,
+    const int64_t output_size,
+    const int64_t index_size,
+    const int64_t data_size,
+    const at::BFloat16* input,
+    const int64_t* indices,
+    const int64_t* offsets,
+    const float* weights,
+    const float* scale_bias,
+    bool normalize_by_lengths,
+    float* out) {
+  return EmbeddingLookupIdx_int64_t_bfloat16_float__avx2_fma<false>(
+      block_size,
+      output_size,
+      index_size,
+      data_size,
+      input,
+      indices,
+      offsets,
+      weights,
+      scale_bias,
+      normalize_by_lengths,
+      out);
+}
+bool EmbeddingLookupIdx_int64_t_bfloat16_float_true__avx2_fma(
+    const int64_t block_size,
+    const int64_t output_size,
+    const int64_t index_size,
+    const int64_t data_size,
+    const at::BFloat16* input,
+    const int64_t* indices,
+    const int64_t* offsets,
+    const float* weights,
+    const float* scale_bias,
+    bool normalize_by_lengths,
+    float* out) {
+  return EmbeddingLookupIdx_int64_t_bfloat16_float__avx2_fma<true>(
+      block_size,
+      output_size,
+      index_size,
+      data_size,
+      input,
+      indices,
+      offsets,
+      weights,
+      scale_bias,
+      normalize_by_lengths,
+      out);
+}
+
+template <bool IS_WEIGHT_POSITIONAL>
+static bool EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma(
+    const int64_t block_size,
+    const int64_t output_size,
+    const int64_t index_size,
+    const int64_t data_size,
+    const uint8_t* input,
+    const int* indices,
+    const int* offsets,
+    const float* weights,
+    const float* scale_bias,
+    bool normalize_by_lengths,
+    float* out) {
+  const int prefdist_T0 = 16;
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+  const int fused_block_size = block_size + 0;
+  int64_t dataInd = 0;
+  if (block_size == 128) {
+    // unrolling 16 times
+    for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      __m256 vop0 = _mm256_setzero_ps();
+      __m256 vop8 = _mm256_setzero_ps();
+      __m256 vop16 = _mm256_setzero_ps();
+      __m256 vop24 = _mm256_setzero_ps();
+      __m256 vop32 = _mm256_setzero_ps();
+      __m256 vop40 = _mm256_setzero_ps();
+      __m256 vop48 = _mm256_setzero_ps();
+      __m256 vop56 = _mm256_setzero_ps();
+      __m256 vop64 = _mm256_setzero_ps();
+      __m256 vop72 = _mm256_setzero_ps();
+      __m256 vop80 = _mm256_setzero_ps();
+      __m256 vop88 = _mm256_setzero_ps();
+      __m256 vop96 = _mm256_setzero_ps();
+      __m256 vop104 = _mm256_setzero_ps();
+      __m256 vop112 = _mm256_setzero_ps();
+      __m256 vop120 = _mm256_setzero_ps();
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        float bio;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        bio = wgt * scale_bias[2 * idx + 1];
+        wgt = wgt * scale_bias[2 * idx];
+        __m256 vbio = _mm256_set1_ps(bio);
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const uint8_t* ip = &input[idx * fused_block_size];
+        const int next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        vop0 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
+            _mm256_add_ps(vop0, vbio));
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
+        vop8 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
+            _mm256_add_ps(vop8, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[8])
+        vop16 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (16))))),
+            _mm256_add_ps(vop16, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[16])
+        vop24 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (24))))),
+            _mm256_add_ps(vop24, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[24])
+        vop32 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (32))))),
+            _mm256_add_ps(vop32, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[32])
+        vop40 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (40))))),
+            _mm256_add_ps(vop40, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[40])
+        vop48 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (48))))),
+            _mm256_add_ps(vop48, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[48])
+        vop56 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (56))))),
+            _mm256_add_ps(vop56, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[56])
+        vop64 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (64))))),
+            _mm256_add_ps(vop64, vbio));
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[64]), _MM_HINT_T0);
+        vop72 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (72))))),
+            _mm256_add_ps(vop72, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[72])
+        vop80 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (80))))),
+            _mm256_add_ps(vop80, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[80])
+        vop88 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (88))))),
+            _mm256_add_ps(vop88, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[88])
+        vop96 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (96))))),
+            _mm256_add_ps(vop96, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[96])
+        vop104 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (104))))),
+            _mm256_add_ps(vop104, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[104])
+        vop112 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (112))))),
+            _mm256_add_ps(vop112, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[112])
+        vop120 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (120))))),
+            _mm256_add_ps(vop120, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[120])
+      }
+      if (!normalize_by_lengths || length == 0) {
+        _mm256_storeu_ps(&op[0], vop0);
+        _mm256_storeu_ps(&op[8], vop8);
+        _mm256_storeu_ps(&op[16], vop16);
+        _mm256_storeu_ps(&op[24], vop24);
+        _mm256_storeu_ps(&op[32], vop32);
+        _mm256_storeu_ps(&op[40], vop40);
+        _mm256_storeu_ps(&op[48], vop48);
+        _mm256_storeu_ps(&op[56], vop56);
+        _mm256_storeu_ps(&op[64], vop64);
+        _mm256_storeu_ps(&op[72], vop72);
+        _mm256_storeu_ps(&op[80], vop80);
+        _mm256_storeu_ps(&op[88], vop88);
+        _mm256_storeu_ps(&op[96], vop96);
+        _mm256_storeu_ps(&op[104], vop104);
+        _mm256_storeu_ps(&op[112], vop112);
+        _mm256_storeu_ps(&op[120], vop120);
+      } else {
+        __m256 vlen_inv = _mm256_set1_ps(1.0f / length);
+        _mm256_storeu_ps(&op[0], _mm256_mul_ps(vop0, vlen_inv));
+        _mm256_storeu_ps(&op[8], _mm256_mul_ps(vop8, vlen_inv));
+        _mm256_storeu_ps(&op[16], _mm256_mul_ps(vop16, vlen_inv));
+        _mm256_storeu_ps(&op[24], _mm256_mul_ps(vop24, vlen_inv));
+        _mm256_storeu_ps(&op[32], _mm256_mul_ps(vop32, vlen_inv));
+        _mm256_storeu_ps(&op[40], _mm256_mul_ps(vop40, vlen_inv));
+        _mm256_storeu_ps(&op[48], _mm256_mul_ps(vop48, vlen_inv));
+        _mm256_storeu_ps(&op[56], _mm256_mul_ps(vop56, vlen_inv));
+        _mm256_storeu_ps(&op[64], _mm256_mul_ps(vop64, vlen_inv));
+        _mm256_storeu_ps(&op[72], _mm256_mul_ps(vop72, vlen_inv));
+        _mm256_storeu_ps(&op[80], _mm256_mul_ps(vop80, vlen_inv));
+        _mm256_storeu_ps(&op[88], _mm256_mul_ps(vop88, vlen_inv));
+        _mm256_storeu_ps(&op[96], _mm256_mul_ps(vop96, vlen_inv));
+        _mm256_storeu_ps(&op[104], _mm256_mul_ps(vop104, vlen_inv));
+        _mm256_storeu_ps(&op[112], _mm256_mul_ps(vop112, vlen_inv));
+        _mm256_storeu_ps(&op[120], _mm256_mul_ps(vop120, vlen_inv));
+      }
+    }
+  } else if (block_size == 64) {
+    // unrolling 8 times
+    for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      __m256 vop0 = _mm256_setzero_ps();
+      __m256 vop8 = _mm256_setzero_ps();
+      __m256 vop16 = _mm256_setzero_ps();
+      __m256 vop24 = _mm256_setzero_ps();
+      __m256 vop32 = _mm256_setzero_ps();
+      __m256 vop40 = _mm256_setzero_ps();
+      __m256 vop48 = _mm256_setzero_ps();
+      __m256 vop56 = _mm256_setzero_ps();
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        float bio;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        bio = wgt * scale_bias[2 * idx + 1];
+        wgt = wgt * scale_bias[2 * idx];
+        __m256 vbio = _mm256_set1_ps(bio);
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const uint8_t* ip = &input[idx * fused_block_size];
+        const int next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        vop0 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
+            _mm256_add_ps(vop0, vbio));
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
+        vop8 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
+            _mm256_add_ps(vop8, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[8])
+        vop16 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (16))))),
+            _mm256_add_ps(vop16, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[16])
+        vop24 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (24))))),
+            _mm256_add_ps(vop24, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[24])
+        vop32 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (32))))),
+            _mm256_add_ps(vop32, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[32])
+        vop40 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (40))))),
+            _mm256_add_ps(vop40, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[40])
+        vop48 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (48))))),
+            _mm256_add_ps(vop48, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[48])
+        vop56 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (56))))),
+            _mm256_add_ps(vop56, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[56])
+      }
+      if (!normalize_by_lengths || length == 0) {
+        _mm256_storeu_ps(&op[0], vop0);
+        _mm256_storeu_ps(&op[8], vop8);
+        _mm256_storeu_ps(&op[16], vop16);
+        _mm256_storeu_ps(&op[24], vop24);
+        _mm256_storeu_ps(&op[32], vop32);
+        _mm256_storeu_ps(&op[40], vop40);
+        _mm256_storeu_ps(&op[48], vop48);
+        _mm256_storeu_ps(&op[56], vop56);
+      } else {
+        __m256 vlen_inv = _mm256_set1_ps(1.0f / length);
+        _mm256_storeu_ps(&op[0], _mm256_mul_ps(vop0, vlen_inv));
+        _mm256_storeu_ps(&op[8], _mm256_mul_ps(vop8, vlen_inv));
+        _mm256_storeu_ps(&op[16], _mm256_mul_ps(vop16, vlen_inv));
+        _mm256_storeu_ps(&op[24], _mm256_mul_ps(vop24, vlen_inv));
+        _mm256_storeu_ps(&op[32], _mm256_mul_ps(vop32, vlen_inv));
+        _mm256_storeu_ps(&op[40], _mm256_mul_ps(vop40, vlen_inv));
+        _mm256_storeu_ps(&op[48], _mm256_mul_ps(vop48, vlen_inv));
+        _mm256_storeu_ps(&op[56], _mm256_mul_ps(vop56, vlen_inv));
+      }
+    }
+  } else if (block_size == 32) {
+    // unrolling 4 times
+    for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      __m256 vop0 = _mm256_setzero_ps();
+      __m256 vop8 = _mm256_setzero_ps();
+      __m256 vop16 = _mm256_setzero_ps();
+      __m256 vop24 = _mm256_setzero_ps();
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        float bio;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        bio = wgt * scale_bias[2 * idx + 1];
+        wgt = wgt * scale_bias[2 * idx];
+        __m256 vbio = _mm256_set1_ps(bio);
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const uint8_t* ip = &input[idx * fused_block_size];
+        const int next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        vop0 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
+            _mm256_add_ps(vop0, vbio));
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
+        vop8 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
+            _mm256_add_ps(vop8, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[8])
+        vop16 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (16))))),
+            _mm256_add_ps(vop16, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[16])
+        vop24 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (24))))),
+            _mm256_add_ps(vop24, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[24])
+      }
+      if (!normalize_by_lengths || length == 0) {
+        _mm256_storeu_ps(&op[0], vop0);
+        _mm256_storeu_ps(&op[8], vop8);
+        _mm256_storeu_ps(&op[16], vop16);
+        _mm256_storeu_ps(&op[24], vop24);
+      } else {
+        __m256 vlen_inv = _mm256_set1_ps(1.0f / length);
+        _mm256_storeu_ps(&op[0], _mm256_mul_ps(vop0, vlen_inv));
+        _mm256_storeu_ps(&op[8], _mm256_mul_ps(vop8, vlen_inv));
+        _mm256_storeu_ps(&op[16], _mm256_mul_ps(vop16, vlen_inv));
+        _mm256_storeu_ps(&op[24], _mm256_mul_ps(vop24, vlen_inv));
+      }
+    }
+  } else if (block_size == 16) {
+    // unrolling 2 times
+    for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      __m256 vop0 = _mm256_setzero_ps();
+      __m256 vop8 = _mm256_setzero_ps();
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        float bio;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        bio = wgt * scale_bias[2 * idx + 1];
+        wgt = wgt * scale_bias[2 * idx];
+        __m256 vbio = _mm256_set1_ps(bio);
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const uint8_t* ip = &input[idx * fused_block_size];
+        const int next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        vop0 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
+            _mm256_add_ps(vop0, vbio));
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
+        vop8 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
+            _mm256_add_ps(vop8, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[8])
+      }
+      if (!normalize_by_lengths || length == 0) {
+        _mm256_storeu_ps(&op[0], vop0);
+        _mm256_storeu_ps(&op[8], vop8);
+      } else {
+        __m256 vlen_inv = _mm256_set1_ps(1.0f / length);
+        _mm256_storeu_ps(&op[0], _mm256_mul_ps(vop0, vlen_inv));
+        _mm256_storeu_ps(&op[8], _mm256_mul_ps(vop8, vlen_inv));
+      }
+    }
+  } else {
+    // generic code
+    // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays)
+    for (int rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      int64_t j = 0;
+      for (; j + 8 <= block_size; j += 8) {
+        _mm256_storeu_ps(op + j, _mm256_setzero_ps());
+      }
+      for (; j < block_size; j++) {
+        op[j] = 0.0f;
+      }
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        float bio;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        bio = wgt * scale_bias[2 * idx + 1];
+        wgt = wgt * scale_bias[2 * idx];
+        __m256 vbio = _mm256_set1_ps(bio);
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const uint8_t* ip = &input[idx * fused_block_size];
+        const int next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        j = 0;
+        for (; j + 8 <= block_size; j += 8) {
+          _mm256_storeu_ps(
+              &op[j],
+              _mm256_fmadd_ps(
+                  vwgt,
+                  _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_loadl_epi64(
+                      reinterpret_cast<const __m128i*>(&ip[j])))),
+                  _mm256_add_ps(_mm256_loadu_ps(&op[j]), vbio)));
+          _mm_prefetch(
+              reinterpret_cast<const char*>(&ip_next_T0[j]), _MM_HINT_T0);
+        }
+        for (; j < block_size; j++) {
+          op[j] = std::fma(wgt, (float)ip[j], bio + op[j]);
+        }
+      }
+      if (normalize_by_lengths && length) {
+        float len_inv = 1.0f / length;
+        __m256 vlen_inv = _mm256_set1_ps(len_inv);
+        j = 0;
+        for (; j + 8 <= block_size; j += 8) {
+          _mm256_storeu_ps(
+              &op[j], _mm256_mul_ps(_mm256_loadu_ps(&op[j]), vlen_inv));
+        }
+        for (; j < block_size; j++) {
+          op[j] = len_inv * op[j];
+        }
+      }
+    }
+  }
+  return dataInd == index_size;
+}
+bool EmbeddingLookupIdx_int32_t_uint8_t_float_false__avx2_fma(
+    const int64_t block_size,
+    const int64_t output_size,
+    const int64_t index_size,
+    const int64_t data_size,
+    const uint8_t* input,
+    const int* indices,
+    const int* offsets,
+    const float* weights,
+    const float* scale_bias,
+    bool normalize_by_lengths,
+    float* out) {
+  return EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma<false>(
+      block_size,
+      output_size,
+      index_size,
+      data_size,
+      input,
+      indices,
+      offsets,
+      weights,
+      scale_bias,
+      normalize_by_lengths,
+      out);
+}
+bool EmbeddingLookupIdx_int32_t_uint8_t_float_true__avx2_fma(
+    const int64_t block_size,
+    const int64_t output_size,
+    const int64_t index_size,
+    const int64_t data_size,
+    const uint8_t* input,
+    const int* indices,
+    const int* offsets,
+    const float* weights,
+    const float* scale_bias,
+    bool normalize_by_lengths,
+    float* out) {
+  return EmbeddingLookupIdx_int32_t_uint8_t_float__avx2_fma<true>(
+      block_size,
+      output_size,
+      index_size,
+      data_size,
+      input,
+      indices,
+      offsets,
+      weights,
+      scale_bias,
+      normalize_by_lengths,
+      out);
+}
+
+template <bool IS_WEIGHT_POSITIONAL>
+static bool EmbeddingLookupIdx_int64_t_uint8_t_float__avx2_fma(
+    const int64_t block_size,
+    const int64_t output_size,
+    const int64_t index_size,
+    const int64_t data_size,
+    const uint8_t* input,
+    const int64_t* indices,
+    const int64_t* offsets,
+    const float* weights,
+    const float* scale_bias,
+    bool normalize_by_lengths,
+    float* out) {
+  const int64_t prefdist_T0 = 16;
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+  const int64_t fused_block_size = block_size + 0;
+  int64_t dataInd = 0;
+  if (block_size == 128) {
+    // unrolling 16 times
+    for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
+      float* op = &out[rangeIndex * block_size];
+      __m256 vop0 = _mm256_setzero_ps();
+      __m256 vop8 = _mm256_setzero_ps();
+      __m256 vop16 = _mm256_setzero_ps();
+      __m256 vop24 = _mm256_setzero_ps();
+      __m256 vop32 = _mm256_setzero_ps();
+      __m256 vop40 = _mm256_setzero_ps();
+      __m256 vop48 = _mm256_setzero_ps();
+      __m256 vop56 = _mm256_setzero_ps();
+      __m256 vop64 = _mm256_setzero_ps();
+      __m256 vop72 = _mm256_setzero_ps();
+      __m256 vop80 = _mm256_setzero_ps();
+      __m256 vop88 = _mm256_setzero_ps();
+      __m256 vop96 = _mm256_setzero_ps();
+      __m256 vop104 = _mm256_setzero_ps();
+      __m256 vop112 = _mm256_setzero_ps();
+      __m256 vop120 = _mm256_setzero_ps();
+      if (dataInd != offsets[rangeIndex] - offsets[0]) {
+        return false;
+      }
+      int64_t end_offset = offsets[rangeIndex + 1];
+      int64_t length = end_offset - offsets[rangeIndex];
+      for (int64_t start = dataInd; dataInd < end_offset - offsets[0];
+           ++dataInd) {
+        const int64_t idx = indices[dataInd];
+        if (idx < 0 || idx >= data_size) {
+          return false;
+        }
+        float wgt = 1.f;
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        float bio;
+        if (weights) {
+          wgt = weights[IS_WEIGHT_POSITIONAL ? (dataInd - start) : dataInd];
+        }
+        bio = wgt * scale_bias[2 * idx + 1];
+        wgt = wgt * scale_bias[2 * idx];
+        __m256 vbio = _mm256_set1_ps(bio);
+        __m256 vwgt = _mm256_set1_ps(wgt);
+        const uint8_t* ip = &input[idx * fused_block_size];
+        const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+            : dataInd;
+        const int64_t idx_pref_T0 = indices[next_T0];
+        if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
+          return false;
+        }
+        const uint8_t* ip_next_T0 = &input[idx_pref_T0 * fused_block_size];
+        vop0 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (0))))),
+            _mm256_add_ps(vop0, vbio));
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[0]), _MM_HINT_T0);
+        vop8 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (8))))),
+            _mm256_add_ps(vop8, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[8])
+        vop16 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (16))))),
+            _mm256_add_ps(vop16, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[16])
+        vop24 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (24))))),
+            _mm256_add_ps(vop24, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[24])
+        vop32 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (32))))),
+            _mm256_add_ps(vop32, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[32])
+        vop40 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (40))))),
+            _mm256_add_ps(vop40, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[40])
+        vop48 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (48))))),
+            _mm256_add_ps(vop48, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[48])
+        vop56 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (56))))),
+            _mm256_add_ps(vop56, vbio));
+        // skip unnecessary prefetch of (&ip_next_T0[56])
+        vop64 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (64))))),
+            _mm256_add_ps(vop64, vbio));
+        _mm_prefetch(
+            reinterpret_cast<const char*>(&ip_next_T0[64]), _MM_HINT_T0);
+        vop72 = _mm256_fmadd_ps(
+            vwgt,
+            _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(
                 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ip + (72))))),
             _mm256_add_ps(vop72, vbio));
         // skip unnecessary prefetch of (&ip_next_T0[72])
@@ -2844,7 +4137,9 @@ static bool EmbeddingLookupIdx_int64_t_uint8_t_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const uint8_t* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -2953,7 +4248,9 @@ static bool EmbeddingLookupIdx_int64_t_uint8_t_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const uint8_t* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -3028,7 +4325,9 @@ static bool EmbeddingLookupIdx_int64_t_uint8_t_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const uint8_t* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
@@ -3060,6 +4359,7 @@ static bool EmbeddingLookupIdx_int64_t_uint8_t_float__avx2_fma(
     }
   } else {
     // generic code
+    // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays)
     for (int64_t rangeIndex = 0; rangeIndex < output_size; ++rangeIndex) {
       float* op = &out[rangeIndex * block_size];
       int64_t j = 0;
@@ -3092,7 +4392,9 @@ static bool EmbeddingLookupIdx_int64_t_uint8_t_float__avx2_fma(
         __m256 vwgt = _mm256_set1_ps(wgt);
         const uint8_t* ip = &input[idx * fused_block_size];
         const int64_t next_T0 = (dataInd < index_size - prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             ? (dataInd + prefdist_T0)
+            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
             : dataInd;
         const int64_t idx_pref_T0 = indices[next_T0];
         if (idx_pref_T0 < 0 || idx_pref_T0 >= data_size) {
diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py
index 402f3bb92a415..7e4208caf6556 100644
--- a/caffe2/perfkernels/hp_emblookup_codegen.py
+++ b/caffe2/perfkernels/hp_emblookup_codegen.py
@@ -4,7 +4,7 @@
 import sys
 
 
-sizeof = {"float": 4, "at::Half": 2, "uint8_t": 1}
+sizeof = {"float": 4, "at::Half": 2, "at::BFloat16": 2, "uint8_t": 1}
 
 
 def unroll(uf, IndexType, InType, OutType, use_weights, isa, fused, use_offsets):
@@ -24,6 +24,16 @@ def compute(regid, InType, use_weights, isa, prefetch):
                 "                _mm_loadu_si128(reinterpret_cast<const __m128i*>(ip + (%d)))),\n"  # noqa
                 "            vop%d);" % (regid, regid, regid)
             )
+        elif InType == "at::BFloat16":
+            code.append(
+                "        vop%d = _mm256_fmadd_ps(\n"
+                "            vwgt,\n"
+                "            _mm256_castsi256_ps(_mm256_slli_epi32(\n"
+                "                _mm256_cvtepu16_epi32(_mm_loadu_si128(\n"
+                "                    reinterpret_cast<const __m128i*>(ip + (%d)))),\n"
+                "                16)),\n"  # noqa
+                "            vop%d);" % (regid, regid, regid)
+            )
         elif InType == "uint8_t":
             code.append(
                 "        vop%d = _mm256_fmadd_ps(\n"
@@ -104,6 +114,7 @@ def compute(regid, InType, use_weights, isa, prefetch):
 
     if InType == "uint8_t":
         code.append("        " + OutType + " wgt = 1.f;")
+        code.append("        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)")
         code.append("        " + OutType + " bio;")
         code.append("        if (weights) {")
         code.append(
@@ -133,7 +144,10 @@ def compute(regid, InType, use_weights, isa, prefetch):
     code.append("        const {}* ip = &input[idx * fused_block_size];".format(InType))
     code.append(
         "        const {} next_T0 = (dataInd < index_size - prefdist_T0)\n"
-        "            ? (dataInd + prefdist_T0)\n            : dataInd;".format(
+        "            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)\n"
+        "            ? (dataInd + prefdist_T0)\n"
+        "            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)\n"
+        "            : dataInd;".format(
             IndexType
         )
     )
@@ -206,6 +220,18 @@ def compute(InType, use_weights, isa):
                 "                      reinterpret_cast<const __m128i*>(&ip[j]))),\n"
                 "                  _mm256_loadu_ps(&op[j])));"
             )
+        elif InType == "at::BFloat16":
+            code.append(
+                "          _mm256_storeu_ps(\n"
+                "              &op[j],\n"
+                "              _mm256_fmadd_ps(\n"
+                "                  vwgt,\n"
+                "                  _mm256_castsi256_ps(_mm256_slli_epi32(\n"
+                "                      _mm256_cvtepu16_epi32(_mm_loadu_si128(\n"
+                "                          reinterpret_cast<const __m128i*>(&ip[j]))),\n"
+                "                      16)),\n"
+                "                  _mm256_loadu_ps(&op[j])));"
+            )
         elif InType == "uint8_t":
             code.append(
                 "          _mm256_storeu_ps(\n"
@@ -229,7 +255,8 @@ def compute(InType, use_weights, isa):
     code = []
     if InType == "at::Half":
         code.append("    alignas(64) at::Half vtmp1[8] = {0};")
-
+    if InType == "at::BFloat16":
+        code.append("    alignas(64) at::BFloat16 vtmp1[8] = {0};")
 
 
     if use_offsets:
@@ -291,6 +318,7 @@ def compute(InType, use_weights, isa):
 
     if InType == "uint8_t":
         code.append("        " + OutType + " wgt = 1.f;")
+        code.append("        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)")
         code.append("        " + OutType + " bio;")
         code.append("        if (weights) {")
         code.append(
@@ -320,7 +348,10 @@ def compute(InType, use_weights, isa):
     code.append("        const {}* ip = &input[idx * fused_block_size];".format(InType))
     code.append(
         "        const {} next_T0 = (dataInd < index_size - prefdist_T0)\n"
-        "            ? (dataInd + prefdist_T0)\n            : dataInd;".format(
+        "            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)\n"
+        "            ? (dataInd + prefdist_T0)\n"
+        "            // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)\n"
+        "            : dataInd;".format(
             IndexType
         )
     )
@@ -351,6 +382,14 @@ def compute(InType, use_weights, isa):
             "              _mm256_cvtph_ps(*(reinterpret_cast<const __m128i*>(vtmp1)));"
         )
         code.append("          op[j] = std::fma(wgt, ((float*)(&vtmp2))[0], op[j]);")
+    elif InType == "at::BFloat16":
+        code.append("          vtmp1[0] = ip[j];")
+        code.append(
+            "          __m256 vtmp2 = _mm256_castsi256_ps(_mm256_slli_epi32(\n"
+            "              _mm256_cvtepu16_epi32(*(reinterpret_cast<const __m128i*>(vtmp1))),\n"
+            "              16));"
+        )
+        code.append("          op[j] = std::fma(wgt, ((float*)(&vtmp2))[0], op[j]);")
     elif InType == "uint8_t":
         code.append("          op[j] = std::fma(wgt, (float)ip[j], bio + op[j]);")
     else:
@@ -408,6 +447,8 @@ def compute(InType, use_weights, isa):
     ["int64_t", "int64_t", "float", "float", "float", "float"],
     ["int32_t", "int", "half", "at::Half", "float", "float"],
     ["int64_t", "int64_t", "half", "at::Half", "float", "float"],
+    ["int32_t", "int", "bfloat16", "at::BFloat16", "float", "float"],
+    ["int64_t", "int64_t", "bfloat16", "at::BFloat16", "float", "float"],
     ["int32_t", "int", "uint8_t", "uint8_t", "float", "float"],
     ["int64_t", "int64_t", "uint8_t", "uint8_t", "float", "float"],
 ]
@@ -422,6 +463,7 @@ def compute(InType, use_weights, isa):
 code.append("//// --------------------------\n")
 
 code.append("#include <c10/util/Half.h>")
+code.append("#include <c10/util/BFloat16.h>")
 code.append("#include <immintrin.h>")
 
 code.append("namespace caffe2 {\n")
@@ -461,6 +503,7 @@ def compute(InType, use_weights, isa):
     code += args
 
     code.append("  const " + IndexType + " prefdist_T0 = 16;")
+    code.append("  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)")
     # block_size is the number of elements and fused_block_size is the size of
     # an entire row, including scale and bias.
     offset = (8 // sizeof[InType]) if opts.fused else 0
@@ -484,6 +527,7 @@ def compute(InType, use_weights, isa):
     code += unroll(2, IndexType, InType, OutType, True, "AVX2", opts.fused, opts.use_offsets)
     code.append("  } else {")
     code.append("    // generic code")
+    code.append("    // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays)")
     code += generic(IndexType, InType, OutType, True, "AVX2", opts.fused, opts.use_offsets)
     code.append("  }")
     code.append("  return dataInd == index_size;")

From 9f70ad01e37daf736d92293c4742b6d27d3e1de6 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 29 Nov 2022 22:59:32 +0000
Subject: [PATCH 1422/1922] [FSDP][Easy] Remove internal default arg (#89227)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89227
Approved by: https://github.com/mrshenli
---
 test/distributed/fsdp/test_fsdp_flatten_params.py | 2 +-
 torch/distributed/fsdp/flat_param.py              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_flatten_params.py b/test/distributed/fsdp/test_fsdp_flatten_params.py
index 4f7178df4a109..b1ae0938cc9f3 100644
--- a/test/distributed/fsdp/test_fsdp_flatten_params.py
+++ b/test/distributed/fsdp/test_fsdp_flatten_params.py
@@ -39,7 +39,7 @@ def world_size(self) -> int:
         return 1
 
     def _get_default_config(self):
-        return HandleConfig(HandleShardingStrategy.FULL_SHARD, False, None, None)
+        return HandleConfig(HandleShardingStrategy.FULL_SHARD, False, None, None, False)
 
     def _get_transformer(self, seed=0):
         torch.manual_seed(seed)  # keep everything deterministic
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 515bf2f64d198..0aeb59527a12a 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -113,7 +113,7 @@ class HandleConfig:
     offload_params: bool
     low_prec_param_dtype: Optional[torch.dtype]
     low_prec_reduce_dtype: Optional[torch.dtype]
-    keep_low_precision_grads: bool = False
+    keep_low_precision_grads: bool
 
 
 class FlatParameter(nn.Parameter):

From f7022589b330bbe5bbe6e3561c495e9e0306150b Mon Sep 17 00:00:00 2001
From: "Wu, Chunyuan" <chunyuan.wu@intel.com>
Date: Wed, 30 Nov 2022 18:35:05 +0800
Subject: [PATCH 1423/1922] Add a cpp wrapper for Inductor (#88167)

## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```

### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
    std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
    at::Tensor arg0_1, arg1_1;
    std::tie(arg0_1, arg1_1) = args;
    auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
    auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
    auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
    assert(kernel0_lib != nullptr);
    void (*kernel0)(const float*,const float*,float*,float*);
    *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
    kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
    arg0_1.reset();
    arg1_1.reset();
    return std::make_tuple(buf0, buf1); }''' )

module = load_inline(
    name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
    cpp_sources=[wrapper],
    functions=['call_0'],
    extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
    extra_ldflags=['-shared  -lgomp'],
    extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])

def _wrap_func(f):
    def g(args):
        return f(args)
    return g
call = _wrap_func(module.call_0)
```

### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
   - [x] ATen GEMM-related OPs: #88667
   - [ ] ATen Conv
   - [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
---
 test/inductor/test_torchinductor.py |  18 ++
 torch/_inductor/codecache.py        |  60 +++--
 torch/_inductor/codegen/common.py   |  40 +++-
 torch/_inductor/codegen/cpp.py      |  42 +++-
 torch/_inductor/codegen/wrapper.py  | 355 ++++++++++++++++++++++++----
 torch/_inductor/config.py           |   3 +
 torch/_inductor/graph.py            |  70 +++++-
 torch/_inductor/sizevars.py         |  37 ++-
 8 files changed, 547 insertions(+), 78 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index f7c116149cad7..41fce3998dd0d 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -4712,6 +4712,24 @@ def fn(a, b):
             e.name for e in prof.profiler.function_events
         )
 
+    @patch.object(config, "cpp_wrapper", True)
+    @unittest.skipIf(HAS_CUDA, "cpp_wrapper only supports cpu")
+    def test_cpp_wrapper(self):
+        device = "cpu"
+        for name in [
+            "test_as_strided",  # buffer reuse
+            "test_cat",  # alias
+            "test_profiler_mark_wrapper_call",  # TODO: fallback to default wrapper for now
+            "test_relu",  # multiple inputs
+            "test_silu",  # single input, single output
+            "test_transpose",  # multiple outputs, buffer clear
+        ]:
+            test_name = f"{name}_{device}"
+            assert hasattr(self, test_name), "undefined function"
+            func = getattr(self, test_name)
+            assert callable(func), "not a callable"
+            func()
+
 
 if HAS_CPU:
 
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 7bca753034fa3..8a27d8e1d0820 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -75,12 +75,17 @@ def code_hash(code):
     )
 
 
-def write(source_code, ext, extra=""):
+def get_code_path(source_code, ext, extra):
     basename = code_hash(source_code + extra)
     subdir = os.path.join(cache_dir(), basename[1:3])
+    path = os.path.join(subdir, f"{basename}.{ext}")
+    return basename, subdir, path
+
+
+def write(source_code, ext, extra=""):
+    basename, subdir, path = get_code_path(source_code, ext, extra)
     if not os.path.exists(subdir):
         os.makedirs(subdir, exist_ok=True)
-    path = os.path.join(subdir, f"{basename}.{ext}")
     if not os.path.exists(path):
         # use a temp file for thread safety
         fd, tmp_path = tempfile.mkstemp(dir=subdir)
@@ -314,13 +319,28 @@ def pick_vec_isa():
     return invalid_vec_isa
 
 
-def cpp_compile_command(
-    input,
-    output,
-    warning_all=True,
-    shared=True,
-    include_pytorch=False,
-    vec_isa: VecISA = invalid_vec_isa,
+def get_shared(shared=True):
+    return "-shared -fPIC" if shared else ""
+
+
+def get_warning_all_flag(warning_all=True):
+    return "-Wall" if warning_all else ""
+
+
+def cpp_flags():
+    return "-std=c++14 -Wno-unused-variable"
+
+
+def optimization_flags():
+    return "-march=native -O3 -ffast-math -fno-finite-math-only -fopenmp"
+
+
+def use_custom_generated_macros():
+    return "-D C10_USING_CUSTOM_GENERATED_MACROS"
+
+
+def get_include_and_linking_paths(
+    include_pytorch=False, vec_isa: VecISA = invalid_vec_isa
 ):
     if sys.platform == "linux" and (
         include_pytorch
@@ -348,17 +368,29 @@ def cpp_compile_command(
     ipaths = " ".join(["-I" + p for p in ipaths])
     lpaths = " ".join(["-L" + p for p in lpaths])
     libs = " ".join(["-l" + p for p in libs])
+    return ipaths, lpaths, libs, macros
+
+
+def cpp_compile_command(
+    input,
+    output,
+    warning_all=True,
+    shared=True,
+    include_pytorch=False,
+    vec_isa: VecISA = invalid_vec_isa,
+):
+    ipaths, lpaths, libs, macros = get_include_and_linking_paths(
+        include_pytorch, vec_isa
+    )
 
-    shared_lib = "-shared -fPIC" if shared else ""
-    warning_all_flag = "-Wall" if warning_all else ""
     return re.sub(
         r"[ \n]+",
         " ",
         f"""
-            {cpp_compiler()} {input} {shared_lib} {warning_all_flag} -std=c++14 -Wno-unused-variable
+            {cpp_compiler()} {input} {get_shared(shared)} {get_warning_all_flag(warning_all)} {cpp_flags()}
             {ipaths} {lpaths} {libs} {macros}
-            -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp
-            -D C10_USING_CUSTOM_GENERATED_MACROS
+            {optimization_flags()}
+            {use_custom_generated_macros()}
             -o{output}
         """,
     ).strip()
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index 329dbaaa18fc6..c549de21e46ee 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -324,6 +324,12 @@ def call_names(self):
             self.input_buffers.keys(), self.output_buffers.keys(), self.sizevars.keys()
         )
 
+    def wrap_ptr_arg(self, buf, dtype):
+        return f"c_void_p({buf}.data_ptr())"
+
+    def wrap_size_arg(self, size):
+        return f"c_long({size})"
+
     def cpp_argdefs(self):
         from .cpp import DTYPE_TO_CPP, INDEX_TYPE
 
@@ -338,28 +344,36 @@ def cpp_argdefs(self):
 
         call_args = []
         arg_defs = []
+        arg_types = []
         for inplaced in unique(self.inplace_buffers.values()):
             outer = inplaced.other_names[-1]
             inner = inplaced.inner_name
             dtype = buffer_types[outer]
-            arg_defs.append(f"{DTYPE_TO_CPP[dtype]}* __restrict__ {inner}")
-            call_args.append(f"c_void_p({outer}.data_ptr())")
+            cpp_dtype = DTYPE_TO_CPP[dtype]
+            arg_defs.append(f"{cpp_dtype}* __restrict__ {inner}")
+            call_args.append(self.wrap_ptr_arg(outer, dtype))
+            arg_types.append(f"{cpp_dtype}*")
         for outer, inner in self.input_buffers.items():
             if outer in self.inplace_buffers:
                 continue
             dtype = buffer_types[outer]
-            arg_defs.append(f"const {DTYPE_TO_CPP[dtype]}* __restrict__ {inner}")
-            call_args.append(f"c_void_p({outer}.data_ptr())")
+            cpp_dtype = DTYPE_TO_CPP[dtype]
+            arg_defs.append(f"const {cpp_dtype}* __restrict__ {inner}")
+            call_args.append(self.wrap_ptr_arg(outer, dtype))
+            arg_types.append(f"const {cpp_dtype}*")
         for outer, inner in self.output_buffers.items():
             if outer in self.inplace_buffers or inner == "REMOVED":
                 continue
             dtype = buffer_types[outer]
-            arg_defs.append(f"{DTYPE_TO_CPP[dtype]}* __restrict__ {inner}")
-            call_args.append(f"c_void_p({outer}.data_ptr())")
+            cpp_dtype = DTYPE_TO_CPP[dtype]
+            arg_defs.append(f"{cpp_dtype}* __restrict__ {inner}")
+            call_args.append(self.wrap_ptr_arg(outer, dtype))
+            arg_types.append(f"{cpp_dtype}*")
         for outer, inner in self.sizevars.items():
             arg_defs.append(f"const {INDEX_TYPE} {inner}")
-            call_args.append(f"c_long({outer})")
-        return arg_defs, call_args
+            call_args.append(self.wrap_size_arg(outer))
+            arg_types.append(f"const {INDEX_TYPE}")
+        return arg_defs, call_args, arg_types
 
     def python_argdefs(self):
         arg_defs = []
@@ -429,6 +443,16 @@ def update_on_args(self, args, kwargs):
         pass
 
 
+class CppWrapperKernelArgs(KernelArgs):
+    def wrap_ptr_arg(self, buf, dtype):
+        from .cpp import DTYPE_TO_CPP
+
+        return f"({DTYPE_TO_CPP[dtype]}*)({buf}.data_ptr())"
+
+    def wrap_size_arg(self, size):
+        return f"{size}"
+
+
 class CSE:
     """Common subexpression elimination"""
 
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 9fd4f769bcb07..feb4ea1b9ea90 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -18,6 +18,7 @@
 from ..virtualized import ops, V
 from .common import (
     BracesBuffer,
+    CppWrapperKernelArgs,
     DeferredIndentedBuffer,
     ExprPrinter,
     IndentedBuffer,
@@ -38,6 +39,20 @@
     torch.bool: "bool",
     torch.bfloat16: "bfloat16",
 }
+
+DTYPE_TO_ATEN = {
+    torch.float32: "at::ScalarType::Float",
+    torch.float64: "at::ScalarType::Double",
+    torch.float16: "at::ScalarType::Half",
+    torch.int64: "at::ScalarType::Long",
+    torch.int32: "at::ScalarType::Int",
+    torch.int16: "at::ScalarType::Short",
+    torch.int8: "at::ScalarType::Char",
+    torch.uint8: "at::ScalarType::Byte",
+    torch.bool: "at::ScalarType::Bool",
+    torch.bfloat16: "at::ScalarType::BFloat16",
+}
+
 INDEX_TYPE = "long"
 
 RTYPE_TO_CPP = {
@@ -1212,11 +1227,19 @@ def gen_vectorized_loop(loop, kernel, write_reduction_suffix=False):
 class CppScheduling:
     def __init__(self, scheduler):
         self.scheduler = scheduler
-        self.kernel_group = KernelGroup()
+        self.get_kernel_group()
 
     def group_fn(self, sizes):
         return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes)
 
+    def get_kernel_group(self):
+        from .wrapper import CppWrapperCodeGen
+
+        if isinstance(V.graph.wrapper_code, CppWrapperCodeGen):
+            self.kernel_group = CppWrapperKernelGroup()
+        else:
+            self.kernel_group = KernelGroup()
+
     @staticmethod
     def can_fuse_horizontal(node1, node2):
         _, (vars1, reduce1) = node1.group
@@ -1341,7 +1364,7 @@ def codegen_nodes(self, nodes):
 
     def flush(self):
         self.kernel_group.codegen_define_and_call(V.graph.wrapper_code)
-        self.kernel_group = KernelGroup()
+        self.get_kernel_group()
 
 
 class KernelGroup:
@@ -1372,8 +1395,9 @@ def codegen_define_and_call(self, wrapper):
             return
 
         kernel_name = "kernel_cpp_" + wrapper.next_kernel_suffix()
-        arg_defs, call_args = self.args.cpp_argdefs()
+        arg_defs, call_args, arg_types = self.args.cpp_argdefs()
         arg_defs = ",\n".ljust(25).join(arg_defs)
+        arg_types = ",".join(arg_types)
         code = BracesBuffer()
         # TODO: support kernel profile on other platforms
         enable_kernel_profile = (
@@ -1403,11 +1427,15 @@ def codegen_define_and_call(self, wrapper):
         # not use BracesBuffer, so we have no good indicator of a C++ buffer atm.
         codecache_str = codecache_str.replace("#pragma CMT", "//")
         wrapper.define_kernel(kernel_name, codecache_str)
-
+        wrapper.load_kernel(kernel_name, code, arg_types)
         # generate the code to call this
-        wrapper.writeline(
-            "{}({})".format(kernel_name, ", ".join(call_args)),
-        )
+        wrapper.generate_kernel_call(kernel_name, call_args)
+
+
+class CppWrapperKernelGroup(KernelGroup):
+    def __init__(self):
+        super().__init__()
+        self.args = CppWrapperKernelArgs()
 
 
 class WorkSharing:
diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py
index 251c21e1364e9..63fcb745a189a 100644
--- a/torch/_inductor/codegen/wrapper.py
+++ b/torch/_inductor/codegen/wrapper.py
@@ -7,6 +7,7 @@
 from typing import Any, Dict, List
 
 from .. import codecache, config, ir
+from ..codecache import cpp_compile_command, get_code_path
 from ..utils import dynamo_utils, has_triton, sympy_dot, sympy_product
 from ..virtualized import V
 from .common import CodeGen, DeferredLine, IndentedBuffer, Kernel
@@ -28,18 +29,18 @@ def buffer_reuse_key(node: ir.Buffer):
     )
 
 
-def make_buffer_reuse(old, new):
+def make_buffer_reuse(old, new, del_func, declare, ending, as_strided):
     assert old.get_dtype() == new.get_dtype()
     del_line = ""
     if old.get_name() not in V.graph.get_output_names():
-        del_line = f"; del {old.get_name()}"
+        del_line = del_func(old.get_name())
     if old.get_size() == new.get_size() and old.get_stride() == new.get_stride():
-        return f"{new.get_name()} = {old.get_name()}{del_line}"
+        return f"{declare}{new.get_name()} = {old.get_name()}{del_line}{ending}"
 
     return (
-        f"{new.get_name()} = as_strided({old.get_name()}, "
+        f"{declare}{new.get_name()} = {as_strided}({old.get_name()}, "
         f"{V.graph.sizevars.codegen_shape_tuple(new.get_size())}, "
-        f"{V.graph.sizevars.codegen_shape_tuple(new.get_stride())}){del_line}"
+        f"{V.graph.sizevars.codegen_shape_tuple(new.get_stride())}){del_line}{ending}"
     )
 
 
@@ -56,6 +57,21 @@ def make_buffer_allocation(buffer):
     )
 
 
+def make_cpp_buffer_allocation(buffer):
+    from .cpp import DTYPE_TO_ATEN
+
+    # TODO: map layout and device here
+    dtype = buffer.get_dtype()
+    shape = tuple(buffer.get_size())
+    stride = tuple(buffer.get_stride())
+    return (
+        f"auto {buffer.get_name()} = at::empty_strided("
+        f"{V.graph.sizevars.codegen_shape_tuple(shape)}, "
+        f"{V.graph.sizevars.codegen_shape_tuple(stride)}, "
+        f"{DTYPE_TO_ATEN[dtype]}); "
+    )
+
+
 class MemoryPlanningState:
     def __init__(self):
         super().__init__()
@@ -108,6 +124,27 @@ def codegen(self, code: IndentedBuffer):
         code.writeline(make_buffer_allocation(self.node))
 
 
+@dataclasses.dataclass
+class CppAllocateLine(AllocateLine):
+    def plan(self, state: MemoryPlanningState):
+        if self.node.get_name() in V.graph.removed_buffers:
+            return NullLine()
+
+        # try to reuse a recently freed buffer
+        key = buffer_reuse_key(self.node)
+
+        if key in state:
+            free_line = state.pop(key)
+            free_line.is_reused = True
+            return CppReuseLine(free_line.node, self.node)
+
+        return self
+
+    def codegen(self, code: IndentedBuffer):
+        assert self.node.get_name() not in V.graph.removed_buffers
+        code.writeline(make_cpp_buffer_allocation(self.node))
+
+
 @dataclasses.dataclass
 class FreeIfNotReusedLine(MemoryPlanningLine):
     node: ir.Buffer
@@ -126,6 +163,17 @@ def codegen(self, code: IndentedBuffer):
             code.writeline(f"del {self.node.get_name()}")
 
 
+@dataclasses.dataclass
+class CppFreeIfNotReusedLine(FreeIfNotReusedLine):
+    node: ir.Buffer
+    is_reused: bool = False
+
+    def codegen(self, code: IndentedBuffer):
+        assert (self.node.get_name()) not in V.graph.removed_buffers
+        if not self.is_reused:
+            code.writeline(f"{self.node.get_name()}.reset();")
+
+
 @dataclasses.dataclass
 class ReuseLine(MemoryPlanningLine):
     node: ir.Buffer
@@ -139,7 +187,38 @@ def plan(self, state: MemoryPlanningState):
     def codegen(self, code: IndentedBuffer):
         assert self.node.get_name() not in V.graph.removed_buffers
         assert self.reused_as.get_name() not in V.graph.removed_buffers
-        code.writeline(make_buffer_reuse(self.node, self.reused_as) + "  # reuse")
+        code.writeline(
+            make_buffer_reuse(
+                self.node,
+                self.reused_as,
+                del_func=lambda name: f"; del {name}",
+                declare="",
+                ending="",
+                as_strided="as_strided",
+            )
+            + "  # reuse"
+        )
+
+
+@dataclasses.dataclass
+class CppReuseLine(ReuseLine):
+    node: ir.Buffer
+    reused_as: ir.Buffer
+
+    def codegen(self, code: IndentedBuffer):
+        assert self.node.get_name() not in V.graph.removed_buffers
+        assert self.reused_as.get_name() not in V.graph.removed_buffers
+        code.writeline(
+            make_buffer_reuse(
+                self.node,
+                self.reused_as,
+                del_func=lambda name: f"; {name}.reset()",
+                declare="auto ",
+                ending=";",
+                as_strided="at::as_strided",
+            )
+            + "  // reuse"
+        )
 
 
 @dataclasses.dataclass
@@ -170,6 +249,7 @@ def __init__(self):
         self._names_iter = count()
         self.header = IndentedBuffer()
         self.prefix = IndentedBuffer()
+        self.wrapper_call = IndentedBuffer()
         self.kernels = {}
         self.lines = []
         self.header.splice(
@@ -219,6 +299,20 @@ def __init__(self):
                     f"from {config.inductor_import}.triton_ops.batched_matmul import bmm_out as triton_bmm_out"
                 )
 
+        self.write_prefix()
+
+        for name, value in V.graph.constants.items():
+            # include a hash so our code cache gives different constants different files
+            hashed = hashlib.sha256(repr(value).encode("utf-8")).hexdigest()
+            self.header.writeline(f"{name} = None  # {hashed}")
+
+        self.allocated = set()
+        self.freed = set()
+        self.write_get_cuda_stream = functools.lru_cache(None)(
+            self.write_get_cuda_stream
+        )
+
+    def write_prefix(self):
         self.prefix.splice(
             """
 
@@ -228,28 +322,17 @@ def __init__(self):
             def call(args):
             """
         )
-        with self.prefix.indent():
+        with self.wrapper_call.indent():
             inp_len = len(V.graph.graph_inputs.keys())
             if inp_len != 0:
                 lhs = f"{', '.join(V.graph.graph_inputs.keys())}{'' if inp_len != 1 else ','}"
-                self.prefix.writeline(f"{lhs} = args")
-                self.prefix.writeline("args.clear()")
+                self.wrapper_call.writeline(f"{lhs} = args")
+                self.wrapper_call.writeline("args.clear()")
             for name in V.graph.randomness_seeds:
-                self.prefix.writeline(
+                self.wrapper_call.writeline(
                     f"torch.randint(2**31, size=(), dtype=torch.int64, out={name})"
                 )
-            V.graph.sizevars.codegen(self.prefix, V.graph.graph_inputs)
-
-        for name, value in V.graph.constants.items():
-            # include a hash so our code cache gives different constants different files
-            hashed = hashlib.sha256(repr(value).encode("utf-8")).hexdigest()
-            self.header.writeline(f"{name} = None  # {hashed}")
-
-        self.allocated = set()
-        self.freed = set()
-        self.write_get_cuda_stream = functools.lru_cache(None)(
-            self.write_get_cuda_stream
-        )
+            V.graph.sizevars.codegen(self.wrapper_call, V.graph.graph_inputs)
 
     def write_get_cuda_stream(self, index):
         name = f"stream{index}"
@@ -259,6 +342,14 @@ def write_get_cuda_stream(self, index):
     def next_kernel_suffix(self):
         return f"{next(self._names_iter)}"
 
+    def write_allocate_line(self, buffer):
+        self.writeline(AllocateLine(buffer))
+
+    def get_deferred_line(self, name, layout):
+        return DeferredLine(
+            name, f"{name} = {layout.view.codegen_reference()}  # alias"
+        )
+
     def codegen_allocation(self, buffer):
         name = buffer.get_name()
         if name in V.graph.removed_buffers or name in self.allocated:
@@ -279,20 +370,24 @@ def codegen_allocation(self, buffer):
             if not layout.maybe_guard_aligned():
                 V.graph.unaligned_buffers.add(name)
             self.codegen_allocation(layout.view.data)
-            allocation = DeferredLine(
-                name, f"{name} = {layout.view.codegen_reference()}  # alias"
-            )
+            allocation = self.get_deferred_line(name, layout)
             self.writeline(allocation)
             return
 
-        self.writeline(AllocateLine(buffer))
+        self.write_allocate_line(buffer)
+
+    def write_del_line(self, name):
+        self.writeline(f"del {name}")
+
+    def write_free_if_not_reused_line(self, buffer):
+        self.writeline(FreeIfNotReusedLine(buffer))
 
     def codegen_free(self, buffer):
         name = buffer.get_name()
 
         # can be freed but not reused
         if isinstance(buffer, ir.InputBuffer):
-            self.writeline(f"del {name}")
+            self.write_del_line(name)
             return
 
         if not self.can_reuse(buffer):
@@ -301,10 +396,10 @@ def codegen_free(self, buffer):
 
         layout = buffer.get_layout()
         if isinstance(layout, (ir.AliasedLayout, ir.MultiOutputLayout)):
-            self.writeline(f"del {name}")
+            self.write_del_line(name)
             return
 
-        self.writeline(FreeIfNotReusedLine(buffer))
+        self.write_free_if_not_reused_line(buffer)
 
     def can_reuse(self, buffer):
         name = buffer.get_name()
@@ -317,12 +412,24 @@ def can_reuse(self, buffer):
             return False
         return True
 
+    def write_reuse_line(self, input_buffer, output_buffer):
+        self.writeline(ReuseLine(input_buffer, output_buffer))
+
     def codegen_inplace_reuse(self, input_buffer, output_buffer):
         assert buffer_reuse_key(input_buffer) == buffer_reuse_key(output_buffer)
         self.codegen_allocation(input_buffer)
         self.freed.add(input_buffer.get_name())
         self.allocated.add(output_buffer.get_name())
-        self.writeline(ReuseLine(input_buffer, output_buffer))
+        self.write_reuse_line(input_buffer, output_buffer)
+
+    def generate_return(self, output_refs):
+        if output_refs:
+            self.wrapper_call.writeline("return (" + ", ".join(output_refs) + ", )")
+        else:
+            self.wrapper_call.writeline("return ()")
+
+    def generate_end(self, result):
+        return
 
     @dynamo_utils.dynamo_timed
     def generate(self):
@@ -332,11 +439,15 @@ def generate(self):
 
         out_names = V.graph.get_output_names()
         with contextlib.ExitStack() as stack:
-            stack.enter_context(result.indent())
+            stack.enter_context(self.wrapper_call.indent())
             if config.profiler_mark_wrapper_call:
-                result.writeline("from torch.profiler import record_function")
-                result.writeline("with record_function('inductor_wrapper_call'):")
-                stack.enter_context(result.indent())
+                self.wrapper_call.writeline(
+                    "from torch.profiler import record_function"
+                )
+                self.wrapper_call.writeline(
+                    "with record_function('inductor_wrapper_call'):"
+                )
+                stack.enter_context(self.wrapper_call.indent())
             while (
                 self.lines
                 and isinstance(self.lines[-1], MemoryPlanningLine)
@@ -353,15 +464,17 @@ def generate(self):
 
             for line in self.lines:
                 if isinstance(line, MemoryPlanningLine):
-                    line.codegen(result)
+                    line.codegen(self.wrapper_call)
                 else:
-                    result.writeline(line)
+                    self.wrapper_call.writeline(line)
 
             output_refs = [x.codegen_reference() for x in V.graph.graph_outputs]
-            if output_refs:
-                result.writeline("return (" + ", ".join(output_refs) + ", )")
-            else:
-                result.writeline("return ()")
+            self.generate_return(output_refs)
+
+        with result.indent():
+            result.splice(self.wrapper_call)
+
+        self.generate_end(result)
 
         self.add_benchmark_harness(result)
 
@@ -377,8 +490,8 @@ def add_benchmark_harness(self, output):
         def add_fake_input(name, shape, stride, device, dtype):
             output.writeline(
                 f"{name} = rand_strided("
-                f"{V.graph.sizevars.codegen_shape_tuple(shape)}, "
-                f"{V.graph.sizevars.codegen_shape_tuple(stride)}, "
+                f"{V.graph.sizevars.codegen_benchmark_shape_tuple(shape)}, "
+                f"{V.graph.sizevars.codegen_benchmark_shape_tuple(stride)}, "
                 f"device='{device.type}', dtype={dtype})"
             )
 
@@ -411,6 +524,17 @@ def add_fake_input(name, shape, stride, device, dtype):
     def define_kernel(self, name: str, kernel: str):
         self.header.splice(f"\n\n{name} = {kernel}")
 
+    def load_kernel(self, name: str = None, kernel: str = None, arg_types: List = None):
+        return
+
+    def wrap_kernel_call(self, name, call_args):
+        return "{}({})".format(name, ", ".join(call_args))
+
+    def generate_kernel_call(self, name, call_args):
+        self.writeline(
+            self.wrap_kernel_call(name, call_args),
+        )
+
     def call_kernel(self, name: str, kernel: Kernel):
         tmp = IndentedBuffer()
         kernel.call_kernel(self, tmp, name)
@@ -421,3 +545,150 @@ def call_kernel(self, name: str, kernel: Kernel):
 
     def writeline(self, line):
         self.lines.append(line)
+
+
+class CppWrapperCodeGen(WrapperCodeGen):
+    """
+    The outer wrapper that calls the kernels.
+    """
+
+    call_func_id = count()
+
+    def __init__(self):
+        self._call_func_id = next(CppWrapperCodeGen.call_func_id)
+        super().__init__()
+
+    def write_prefix(self):
+        self.prefix.splice(
+            """
+            async_compile.wait(globals())
+            del async_compile
+            from torch.utils.cpp_extension import load_inline
+            wrapper = (
+            '''
+            #include <dlfcn.h>
+            #include <assert.h>
+            """
+        )
+        with self.wrapper_call.indent():
+            inputs_len = len(V.graph.graph_inputs.keys())
+            output_refs = [x.codegen_reference() for x in V.graph.graph_outputs]
+            if output_refs:
+                if len(output_refs) == 1:
+                    output_types = "at::Tensor"
+                else:
+                    output_types = "std::vector<at::Tensor>"
+            else:
+                output_types = "void"
+
+            if inputs_len != 0:
+                inputs_args = ["at::Tensor&"] * len(V.graph.graph_inputs.keys())
+                inputs_args = ", ".join(inputs_args)
+                inputs_args = f"std::tuple<{inputs_args}>"
+
+                self.wrapper_call.writeline(
+                    f"{output_types} call_{self._call_func_id}({inputs_args} args) {{"
+                )
+                inputs_keys_str = ", ".join(V.graph.graph_inputs.keys())
+                self.wrapper_call.writeline(f"at::Tensor {inputs_keys_str};")
+                self.wrapper_call.writeline(f"std::tie({inputs_keys_str}) = args;")
+            else:
+                self.wrapper_call.writeline(
+                    f"{output_types} call_{self._call_func_id}(std::tuple<> args) {{"
+                )
+            for name in V.graph.randomness_seeds:
+                self.wrapper_call.writeline(f"at::Tensor {name};")
+                self.wrapper_call.writeline(
+                    f"{name} = at::randint(std::pow(2, 31), {{}}, at::ScalarType::Long);"
+                )
+            V.graph.sizevars.codegen(self.wrapper_call, V.graph.graph_inputs)
+
+    def write_allocate_line(self, buffer):
+        self.writeline(CppAllocateLine(buffer))
+
+    def write_del_line(self, name):
+        self.writeline(f"{name}.reset();")
+        return
+
+    def write_free_if_not_reused_line(self, buffer):
+        self.writeline(CppFreeIfNotReusedLine(buffer))
+        return
+
+    def write_reuse_line(self, input_buffer, output_buffer):
+        self.writeline(CppReuseLine(input_buffer, output_buffer))
+
+    def get_deferred_line(self, name, layout):
+        return DeferredLine(
+            name, f"auto {name} = {layout.view.codegen_reference()};  // alias"
+        )
+
+    def get_kernel_path(self, code):
+        from ..codecache import pick_vec_isa
+
+        picked_vec_isa = pick_vec_isa()
+        ext = "so"
+        extra = cpp_compile_command("i", "o", vec_isa=picked_vec_isa)
+        # \n is required to match with the CodeCache behavior
+        source_code = "\n" + code.getvalue()
+        _, _, kernel_path = get_code_path(source_code, ext, extra)
+        return kernel_path
+
+    def load_kernel(self, name: str = None, kernel: str = None, arg_types: List = None):
+        kernel_path = self.get_kernel_path(kernel)
+
+        self.writeline(f'auto {name}_lib = dlopen("{kernel_path}", RTLD_NOW);')
+        self.writeline(f"assert({name}_lib != nullptr);")
+        self.writeline(f"void (*{name})({arg_types});")
+        self.writeline(f'*(void **) (&{name}) = dlsym({name}_lib, "kernel");')
+
+    def wrap_kernel_call(self, name, call_args):
+        return "{}({});".format(name, ", ".join(call_args))
+
+    def generate_return(self, output_refs):
+        if output_refs:
+            if len(output_refs) == 1:
+                self.wrapper_call.writeline("return " + output_refs[0] + "; }''' )")
+            else:
+                self.wrapper_call.writeline(
+                    "return std::vector<at::Tensor>({"
+                    + ", ".join(output_refs)
+                    + "}); }''' )"
+                )
+        else:
+            self.wrapper_call.writeline("return; }''' )")
+
+    def generate_end(self, result):
+        shared = codecache.get_shared()
+        warning_all_flag = codecache.get_warning_all_flag()
+        cpp_flags = codecache.cpp_flags()
+        ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths()
+        optimization_flags = codecache.optimization_flags()
+        use_custom_generated_macros = codecache.use_custom_generated_macros()
+
+        extra_cflags = f"{cpp_flags} {optimization_flags} {warning_all_flag} {macros} {use_custom_generated_macros}"
+        extra_ldflags = f"{shared} {lpaths} {libs}"
+        extra_include_paths = f"{ipaths}"
+
+        # get the hash of the wrapper code to name the extension
+        wrapper_call_hash = codecache.code_hash(self.wrapper_call.getvalue())
+        result.splice(
+            f"""
+            module = load_inline(
+                name='inline_extension_{wrapper_call_hash}',
+                cpp_sources=[wrapper],
+                functions=['call_{self._call_func_id}'],
+                extra_cflags=['{extra_cflags}'],
+                extra_ldflags=['{extra_ldflags}'],
+                extra_include_paths=['{extra_include_paths}'])
+            """
+        )
+        # Wrap the func to support setting result._boxed_call = True
+        result.splice(
+            f"""
+            def _wrap_func(f):
+                def g(args):
+                    return f(args)
+                return g
+            call = _wrap_func(module.call_{self._call_func_id})
+            """
+        )
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 545f545c5026a..d4e3aa60aac22 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -10,6 +10,9 @@
 # Whether to enable printing the source code for each future
 verbose_progress = False
 
+# use cpp wrapper instead of python wrapper
+cpp_wrapper = False
+
 # dead code elimination
 dce = False
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 9b6ff5f424e49..061d2eb082af9 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -2,6 +2,7 @@
 import operator
 import os
 import re
+import sys
 import time
 
 import sympy
@@ -13,7 +14,7 @@
 from torch.utils._mode_utils import no_dispatch
 
 from . import config, ir
-from .codegen.wrapper import WrapperCodeGen
+from .codegen.wrapper import CppWrapperCodeGen, WrapperCodeGen
 from .exc import (
     LoweringException,
     MissingOperatorWithDecomp,
@@ -26,7 +27,7 @@
     make_fallback,
     needs_realized_inputs,
 )
-from .sizevars import SizeVarAllocator
+from .sizevars import CppSizeVarAllocator, SizeVarAllocator
 from .utils import dynamo_utils, gather_origins, get_dtype_size, sympy_product
 from .virtualized import V
 
@@ -89,6 +90,7 @@ def __init__(
         self.name_to_buffer = {}
         self.creation_time = time.time()
         self.name = "GraphLowering"
+        self._can_use_cpp_wrapper = config.cpp_wrapper
 
     def get_dtype(self, buffer_name):
         if buffer_name in self.constants:
@@ -138,7 +140,21 @@ def increment_randomness_offset(self, numel):
     def run(self, *args):
         return super().run(*args)
 
+    def disable_cpp_wrapper(self, cond):
+        self._can_use_cpp_wrapper = False
+        log.debug("Set _can_use_cpp_wrapper to False due to %s", cond)
+
+    def check_buffer_for_cpp_wrapper(self, buffer: ir.ComputedBuffer):
+        if isinstance(buffer, ir.ExternKernel):
+            self.disable_cpp_wrapper("ExternKernel")
+        if isinstance(buffer, ir.ComputedBuffer):
+            if buffer.data.get_reduction_type():
+                self.disable_cpp_wrapper("Reduction")
+
     def register_buffer(self, buffer: ir.ComputedBuffer):
+        if config.cpp_wrapper:
+            self.check_buffer_for_cpp_wrapper(buffer)
+
         name = f"buf{len(self.buffers)}"
         self.buffers.append(buffer)
         self.name_to_buffer[name] = buffer
@@ -362,10 +378,58 @@ def run_node(self, n: torch.fx.Node):
                 result.realize_hint()
         return result
 
+    def check_platform(self):
+        if sys.platform != "linux":
+            self.disable_cpp_wrapper("platform not linux")
+
+    def check_profiler_mark_wrapper_call(self):
+        if config.profiler_mark_wrapper_call:
+            self.disable_cpp_wrapper("profiler not supported")
+
+    def check_device_for_cpp_buffer(self):
+        if len(self.device_types) == 1:
+            device = self.device_types.pop()
+            if device == "cpu":
+                return
+        self.disable_cpp_wrapper("device not CPU")
+
+    def check_input_for_cpp_buffer(self):
+        for _, value in self.graph_inputs.items():
+            if value.get_dtype() != torch.float32:
+                self.disable_cpp_wrapper("inputs not FP32")
+
+    def check_output_for_cpp_buffer(self):
+        for item in self.graph_outputs:
+            if isinstance(item, ir.NoneAsConstantBuffer):
+                self.disable_cpp_wrapper("NoneAsConstantBuffer")
+
+    def check_constant_for_cpp_buffer(self):
+        if self.constants:
+            self.disable_cpp_wrapper("Constants")
+
+    def check_cpp_wrapper(self):
+        self.check_platform()
+        self.check_profiler_mark_wrapper_call()
+        self.check_device_for_cpp_buffer()
+        self.check_input_for_cpp_buffer()
+        self.check_output_for_cpp_buffer()
+        self.check_constant_for_cpp_buffer()
+
+    def init_wrapper_code(self):
+        if config.cpp_wrapper:
+            self.check_cpp_wrapper()
+            if self._can_use_cpp_wrapper:
+                self.sizevars = CppSizeVarAllocator(self._shape_env)
+                self.wrapper_code = CppWrapperCodeGen()
+                return
+        self.wrapper_code = WrapperCodeGen()
+        return
+
     def codegen(self):
         from .scheduler import Scheduler
 
-        self.wrapper_code = WrapperCodeGen()
+        self.init_wrapper_code()
+
         self.scheduler = Scheduler(self.buffers)
         self.scheduler.codegen()
         return self.wrapper_code.generate()
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index baea5a109bba6..7997d5fde09fa 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -50,6 +50,8 @@ def __init__(self, shape_env=None):
         self.stride_vars = self.make_stride_vars_cache()
         self.simplify_with_ranges = self.make_simplify_with_ranges_cache()
         self._simplify_loops = self.make_simplify_loops_cache()
+        self.declare = ""
+        self.ending = ""
 
     def seed(self):
         """
@@ -446,12 +448,14 @@ def codegen(self, code: IndentedBuffer, graph_inputs: Dict[str, ir.Buffer]):
 
         @functools.lru_cache(None)
         def sizeof(name):
-            code.writeline(f"{name}_size = {name}.size()")
+            code.writeline(f"{self.declare}{name}_size = {name}.size(){self.ending}")
             return f"{name}_size"
 
         @functools.lru_cache(None)
         def strideof(name):
-            code.writeline(f"{name}_stride = {name}.stride()")
+            code.writeline(
+                f"{self.declare}{name}_stride = {name}.stride(){self.ending}"
+            )
             return f"{name}_stride"
 
         # Assign all symbolic shapes needed to local variables
@@ -465,7 +469,9 @@ def strideof(name):
                 if shape in needed:
                     needed.remove(shape)
                     added.add(shape)
-                    code.writeline(f"{shape} = {sizeof(name)}[{dim}]")
+                    code.writeline(
+                        f"{self.declare}{shape} = {sizeof(name)}[{dim}]{self.ending}"
+                    )
                 elif isinstance(shape, sympy.Symbol):
                     assert shape in added, f"{shape} is needed but not added"
 
@@ -475,7 +481,9 @@ def strideof(name):
                 shape = self.simplify(shape)
                 if shape in needed:
                     needed.remove(shape)
-                    code.writeline(f"{shape} = {strideof(name)}[{dim}]")
+                    code.writeline(
+                        f"{self.declare}{shape} = {strideof(name)}[{dim}]{self.ending}"
+                    )
                 elif isinstance(shape, sympy.Symbol):
                     assert shape in added, f"{shape} is needed but not added"
         assert not needed
@@ -493,6 +501,9 @@ def codegen_shape_tuple(self, shape: Tuple[Expr, ...]) -> str:
             return f"({parts[0]}, )"
         return f"({', '.join(parts)})"
 
+    def codegen_benchmark_shape_tuple(self, shape: Tuple[Expr, ...]) -> str:
+        return self.codegen_shape_tuple(shape)
+
 
 def join_dimensions(expr: Expr) -> Expr:
     from .ir import ModularIndexing
@@ -559,6 +570,24 @@ def _join_dimensions_cached(expr: Expr) -> Expr:
     return expr
 
 
+class CppSizeVarAllocator(SizeVarAllocator):
+    def __init__(self, shape_env=None):
+        super().__init__(shape_env)
+        self.declare = "auto "
+        self.ending = ";"
+
+    def codegen_shape_tuple(self, shape: Tuple[Expr, ...]) -> str:
+        parts = list(map(self.codegen_sizevar, shape))
+        if len(parts) == 0:
+            return "{}"
+        if len(parts) == 1:
+            return f"{{{parts[0]}, }}"
+        return f"{{{', '.join(parts)}}}"
+
+    def codegen_benchmark_shape_tuple(self, shape: Tuple[Expr, ...]) -> str:
+        return super().codegen_shape_tuple(shape)
+
+
 class SimplifyIndexing(V.WrapperHandler):  # type: ignore[name-defined]
     """
     A wrapper around .virtualize.ops that uses var range information to

From 5f4b3cf8eb74b44b9107be1fb572bd4f45d5612a Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 30 Nov 2022 16:03:48 +0000
Subject: [PATCH 1424/1922] Revert "replace double transpose with single
 permute in nn.f.mha (#89847)"

This reverts commit b9afa928271dfd6b80ddb2367fa1c4f4aba25fe4.

Reverted https://github.com/pytorch/pytorch/pull/89847 on behalf of https://github.com/jeanschmidt due to Need to revert this commit as it is causing conflict when reverting #89470
---
 torch/nn/functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 7b8324c7aa849..5da45046332b8 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -5185,7 +5185,7 @@ def multi_head_attention_forward(
 
     attn_output, attn_output_weights = _scaled_dot_product_attention(
         q, k, v, attn_mask, dropout_p, need_weights, False)
-    attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
+    attn_output = attn_output.transpose(1, 2).transpose(0, 1).contiguous().view(bsz * tgt_len, embed_dim)
 
     attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
     attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))

From 6e30278f084ee3abede50e87385344deb1a8c280 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 30 Nov 2022 16:16:24 +0000
Subject: [PATCH 1425/1922] Revert "Call _sdp_attention  in nn.functional.mha
 (#89470)"

This reverts commit 4d7ec302202caaf35bb8c997d035c54f0c24e192.

Reverted https://github.com/pytorch/pytorch/pull/89470 on behalf of https://github.com/jeanschmidt due to breaking internal builds
---
 .../ATen/native/transformers/attention.cpp    |  9 ++++---
 .../ATen/native/transformers/cuda/sdp_utils.h | 26 ++-----------------
 c10/core/SymFloat.cpp                         | 10 -------
 c10/core/SymFloat.h                           |  3 ---
 test/onnx/test_models_onnxruntime.py          |  1 -
 torch/nn/functional.py                        | 21 +++++++--------
 6 files changed, 17 insertions(+), 53 deletions(-)

diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 06ea49bb516c4..9c5be12ef24db 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -9,6 +9,7 @@
 #include <ATen/native/transformers/attention.h>
 #include <ATen/native/transformers/sdp_utils_cpp.h>
 
+
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/NativeFunctions.h>
 #else
@@ -740,10 +741,10 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
   }
     auto attn_mask = attn_mask_;
     // Naive, composite implementation defined here.
+    const auto embed_size = query_.size(-1);
 
     // Scale q,k before matmul for stability see https://tinyurl.com/sudb9s96 for math
-    const auto embed_size = SymFloat(query_.sym_size(-1));
-    const auto scaling_factor = embed_size.sqrt().sqrt();
+    const double scaling_factor = ::sqrt(::sqrt(static_cast<double>(embed_size)));
     const auto query = query_ / scaling_factor;
     if (is_causal) {
         TORCH_CHECK(!attn_mask.has_value(),
@@ -752,8 +753,8 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
                 "_scaled_dot_product_attention: Nested tensors for query / key are not supported when is_causal=True");
 
         // Replace attn_mask with causal mask; lower triangular elements take part in attention.
-        const auto L = query.sym_size(-2), S = key.sym_size(-2);
-        attn_mask = at::ones_symint({L, S}, query.options().dtype(at::kBool)).tril();
+        const auto L = query.size(-2), S = key.size(-2);
+        attn_mask = at::ones({L, S}, query.options().dtype(at::kBool)).tril();
     }
     if (attn_mask.has_value()) {
         TORCH_CHECK(!query.is_nested() && !key.is_nested(),
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 2b57ef6dd6f6c..55e9aeb184a22 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -40,9 +40,7 @@ inline bool check_tensor_dtype(
          allowed_dtypes.end()))) {
     TORCH_CHECK(
         !debug,
-        "Expected query, key and value to all be of dtype: {",
-        c10::Join(", ", allowed_dtypes), "}. Got ",
-        "Query dtype: ",
+        "Expected query, key and value to be of dtype float16 or bfloat16 but got Query dtype: ",
         params.query.dtype(),
         ", Key dtype: ",
         params.key.dtype(),
@@ -164,25 +162,6 @@ inline bool check_head_dim_size(sdp_params params, bool debug) {
   return true;
 }
 
-inline bool check_head_dim_size_mem_efficient(sdp_params params, bool debug) {
-  const int64_t query_size_last = params.query.size(-1);
-  if (!(query_size_last == params.key.size(-1) &&
-        query_size_last == params.value.size(-1) && query_size_last >= 8)) {
-    TORCH_CHECK(
-        !debug,
-        "Mem efficient attention requires last dimension of inputs to be >= 8.",
-        "Got Query.size(-1): ",
-        query_size_last,
-        ", Key.size(-1): ",
-        params.key.size(-1),
-        ", Value.size(-1): ",
-        params.value.size(-1),
-        " instead.");
-    return false;
-  }
-  return true;
-}
-
 inline bool check_runtime_disabled_flash(sdp_params params, bool debug) {
   // We check the global context to see if user has explicitly turned of flash
   // sdp kernels
@@ -280,14 +259,13 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       at::kHalf, at::kFloat, at::kBFloat16};
 
   //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 9> constraints{{
+  constexpr std::array<bool(*)(sdp_params, bool), 8> constraints{{
       check_gpu_sm50_or_greater,
       check_runtime_disabled_mem_efficient,
       check_requires_grad_and_nested,
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask,
-      check_head_dim_size_mem_efficient,
       check_for_seq_len_1_nested_tensor,
       check_for_non_zero_dropout}};
   for (auto& constraint : constraints) {
diff --git a/c10/core/SymFloat.cpp b/c10/core/SymFloat.cpp
index 161313c777dda..511c50e3398ee 100644
--- a/c10/core/SymFloat.cpp
+++ b/c10/core/SymFloat.cpp
@@ -1,7 +1,6 @@
 #include <c10/core/SymFloat.h>
 #include <c10/core/SymNodeImpl.h>
 #include <array>
-#include <cmath>
 #include <utility>
 
 namespace c10 {
@@ -71,15 +70,6 @@ std::ostream& operator<<(std::ostream& os, const SymFloat& s) {
   return os;
 }
 
-SymFloat SymFloat::sqrt() const {
-  if (!is_symbolic()) {
-    return SymFloat(std::sqrt(data_));
-  }
-  auto other = SymFloat(-0.5);
-  auto res = normalize_symfloats(*this, other);
-  return SymFloat(res[0]->pow(res[1]));
-}
-
 double SymFloat::guard_float(const char* file, int64_t line) const {
   if (!is_symbolic()) {
     return data_;
diff --git a/c10/core/SymFloat.h b/c10/core/SymFloat.h
index 50512dc6fb206..ff9e101e31afb 100644
--- a/c10/core/SymFloat.h
+++ b/c10/core/SymFloat.h
@@ -40,9 +40,6 @@ class C10_API SymFloat {
   SymFloat operator*(const SymFloat&) const;
   SymFloat operator/(const SymFloat&) const;
 
-  // Need guidance on where to put this code
-  SymFloat sqrt() const;
-
   // Insert a guard for the float to be its concrete value, and then return
   // that value.  This operation always works, even if the float is symbolic,
   // so long as we know what the underlying value is. Don't blindly put this
diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py
index 4b7bdb58ae514..de1003ce449e0 100644
--- a/test/onnx/test_models_onnxruntime.py
+++ b/test/onnx/test_models_onnxruntime.py
@@ -394,7 +394,6 @@ def forward(self, images, features: Mapping[str, torch.Tensor]):
         )
 
     @skipScriptTest()  # TODO: #75625
-    @skipIfUnsupportedMinOpsetVersion(20)
     def test_transformer_encoder(self):
         class MyModule(torch.nn.Module):
             def __init__(self, ninp, nhead, nhid, dropout, nlayers):
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index 5da45046332b8..a1a102d786f16 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -5173,20 +5173,19 @@ def multi_head_attention_forward(
     # (deep breath) calculate attention and out projection
     #
 
+    B, Nt, E = q.shape
+    q_scaled = q / math.sqrt(E)
     if attn_mask is not None:
-        if attn_mask.size(0) == 1:
-            attn_mask = attn_mask.unsqueeze(0)
-        else:
-            attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
-
-    q = q.view(bsz, num_heads, tgt_len, head_dim)
-    k = k.view(bsz, num_heads, src_len, head_dim)
-    v = v.view(bsz, num_heads, src_len, head_dim)
+        attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
+    else:
+        attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
+    attn_output_weights = softmax(attn_output_weights, dim=-1)
+    if dropout_p > 0.0:
+        attn_output_weights = dropout(attn_output_weights, p=dropout_p)
 
-    attn_output, attn_output_weights = _scaled_dot_product_attention(
-        q, k, v, attn_mask, dropout_p, need_weights, False)
-    attn_output = attn_output.transpose(1, 2).transpose(0, 1).contiguous().view(bsz * tgt_len, embed_dim)
+    attn_output = torch.bmm(attn_output_weights, v)
 
+    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
     attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
     attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
 

From b755cf5f93fbd1eefe075a2a7f20f78c99ddce4e Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Wed, 30 Nov 2022 17:00:36 +0000
Subject: [PATCH 1426/1922] Fix TODOs related to #38095 in test_mps.py (#89815)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89815
Approved by: https://github.com/weiwangmeta, https://github.com/kulinseth
---
 test/test_mps.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index fc923daa57940..5b45bd4e6e692 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1882,8 +1882,7 @@ def _nll_loss_helper(self, input_size, reduction, expected):
 
         output_cpu = F.nll_loss(input, target, reduction=reduction)
         output_mps = F.nll_loss(input_mps, target_mps, reduction=reduction)
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(output_cpu, output_mps.to('cpu'))
+        self.assertEqual(output_cpu, output_mps.to('cpu'))
 
         output_cpu.sum().backward()
         output_mps.sum().backward()
@@ -1902,8 +1901,7 @@ def _nll_loss_1d_helper(self, input_size, reduction):
 
         output_cpu = F.nll_loss(input, target, reduction=reduction)
         output_mps = F.nll_loss(input_mps, target_mps, reduction=reduction)
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(output_cpu, output_mps.to('cpu'))
+        self.assertEqual(output_cpu, output_mps.to('cpu'))
 
         output_cpu.sum().backward()
         output_mps.sum().backward()

From dcd596713024e6a3c02363548db93c30d166000d Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Wed, 30 Nov 2022 17:56:19 +0000
Subject: [PATCH 1427/1922] Update Reviewers for PyTorch Distributed team
 (#89889)

- Reflect PyTorch Distributed team member change on the merge rule
- Added new team members since 2021
- Removed one member no longer on PyTorch Distributed team
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89889
Approved by: https://github.com/soumith
---
 .github/merge_rules.yaml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index 1837cce32b2f6..7baf3e0e8df92 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -228,9 +228,14 @@
   - wanchaol
   - fduwjj
   - H-Huang
-  - d4l3k
   - aazzolini
   - kwen2501
+  - XilunWu
+  - wz337
+  - awgu
+  - fegin
+  - kumpera
+  - yhcharles
   mandatory_checks_name:
   - EasyCLA
   - Lint

From 677fdeea7050d4837717b706d2871ccc69ac551a Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Wed, 30 Nov 2022 18:25:47 +0000
Subject: [PATCH 1428/1922] Fix binary ios builds (#89929)

curl on CircleCI MacOS runners does not support `--retry-all-errors`

Should fix https://app.circleci.com/pipelines/github/pytorch/pytorch/616842/workflows/5d1162c8-eeae-4627-a1b2-17b493b15b59/jobs/17230369?invite=true#step-105-62

Cleanup after https://github.com/pytorch/pytorch/pull/89157 that were missed by https://github.com/pytorch/pytorch/pull/89298

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89929
Approved by: https://github.com/seemethere, https://github.com/atalman
---
 .circleci/scripts/binary_ios_build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/scripts/binary_ios_build.sh b/.circleci/scripts/binary_ios_build.sh
index 4bb5ea28af733..6c7674ed510ee 100644
--- a/.circleci/scripts/binary_ios_build.sh
+++ b/.circleci/scripts/binary_ios_build.sh
@@ -8,7 +8,7 @@ PROJ_ROOT=/Users/distiller/project
 export TCLLIBPATH="/usr/local/lib"
 
 # Install conda
-curl --retry 3 --retry-all-errors -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 chmod +x ~/conda.sh
 /bin/bash ~/conda.sh -b -p ~/anaconda
 export PATH="~/anaconda/bin:${PATH}"

From 85db11150a27130f4d42afc07b1483bc45b3997f Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 30 Nov 2022 15:53:42 +0000
Subject: [PATCH 1429/1922] [FSDP][Easy] ufmt `test_fsdp_checkpoint.py`
 (#89916)

I am in the habit now to run `ufmt format test/distributed/fsdp` before committing, and this changed `test_fsdp_checkpoint.py`. I separated this into its own PR. This change should be safe to force merge to save CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89916
Approved by: https://github.com/mrshenli
---
 test/distributed/fsdp/test_fsdp_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py
index ae00cfe96af4f..994f591ec5e7d 100644
--- a/test/distributed/fsdp/test_fsdp_checkpoint.py
+++ b/test/distributed/fsdp/test_fsdp_checkpoint.py
@@ -12,11 +12,11 @@
     checkpoint_wrapper,
     offload_wrapper,
 )
+from torch.distributed.fsdp import ShardingStrategy
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     CPUOffload,
     FullyShardedDataParallel as FSDP,
 )
-from torch.distributed.fsdp import ShardingStrategy
 
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import _maybe_wrap_fsdp, FSDPTest

From cc5d07b505f64d5447c5289e0f3c2b2ccf0af916 Mon Sep 17 00:00:00 2001
From: albanD <desmaison.alban@gmail.com>
Date: Wed, 30 Nov 2022 18:35:50 +0000
Subject: [PATCH 1430/1922] beef up inplace/view note on copy slices (#89856)

Follow up doc update from https://github.com/pytorch/pytorch/pull/89812
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89856
Approved by: https://github.com/ezyang, https://github.com/soulitzer
---
 torch/csrc/autograd/functions/tensor.cpp |  7 +-
 torch/csrc/autograd/functions/tensor.h   | 84 ++++++++++++++++++++++--
 2 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/torch/csrc/autograd/functions/tensor.cpp b/torch/csrc/autograd/functions/tensor.cpp
index 0c60ab221a1c2..51afb8203186e 100644
--- a/torch/csrc/autograd/functions/tensor.cpp
+++ b/torch/csrc/autograd/functions/tensor.cpp
@@ -88,9 +88,10 @@ auto CopySlices::apply(variable_list&& inputs) -> variable_list {
         result.as_strided_symint(view.sym_sizes(), view.sym_strides(), offset);
   }
 
-  // Since the gradient edge for the 0th input is different between `this` and
-  // `fn`, make sure that the one from `fn` has the same metadata in the current
-  // GraphTask's exec_info as the one on `this`.
+  // See Note [View + Inplace update for view tensor] For more details on this
+  // block Since the gradient edge for the 0th input is different between `this`
+  // and `fn`, make sure that the one from `fn` has the same metadata in the
+  // current GraphTask's exec_info as the one on `this`.
   const auto exec_info = get_current_graph_task_exec_info();
   if (exec_info && !exec_info->empty()) {
     const auto& fn_edge = fn->next_edge(0);
diff --git a/torch/csrc/autograd/functions/tensor.h b/torch/csrc/autograd/functions/tensor.h
index cd77c8ceb7244..06f155a754b07 100644
--- a/torch/csrc/autograd/functions/tensor.h
+++ b/torch/csrc/autograd/functions/tensor.h
@@ -21,7 +21,60 @@ struct TORCH_API CopyBackwards : public Node {
 };
 
 // Note [View + Inplace update for base tensor]
-// Performs grad_view = fn(grad_view), but out-of-place.
+//
+// This note covers a few important topics related to view + inplace handling.
+//   - It explains what is the CopySlices Node and why we need it.
+//   - It explains the considerations on what is saved for backward in
+//   CopySlices.
+//   - It explains why we need to sometimes change the exec_info of the current
+//   backward
+//
+// What is CopySlices?
+// ~~~~~~~~~~~~~~~~~~~
+//
+// We support autograd with inplace mutation; e.g., if you write x.mul_(2)
+// the autograd will work as if you now had multiple Tensors under the hood and
+// you did
+//   x = t.clone()
+//   x0 = x
+//   x1 = x0 * 2
+//   x = x1
+// As you can see here, after this operation, x.grad_fn now points to x1.grad_fn
+// (the MulBackward node) and this node points to x's original grad_fn (which is
+// also x0.grad_fn). It is important to keep in mind that after the inplace,
+// there is no Tensor object that represents the x0 state anymore. But the graph
+// for it is still around in autograd (in case x was used before being modified
+// inplace). See Example 1 in
+// https://docs.google.com/drawings/d/1-T5DyYfChMX1ONQkY-zU-hj_ayQ2zmA5CBOKDWqvEhE
+// We call this rebasing the history of the Tensor.
+//
+// Now, a difficult situation is what happens if x is a differentiable view
+// of a base b.
+//   b = t.clone()
+//   x = b.select(0, 0)
+//   x *= 2
+// With the same approach as above, this will become
+//   b = t.clone()
+//   x = b.select(0, 0)
+//   b0 = b
+//   x0 = x
+//   x1 = x0 * 2
+//   b1 = b0.select_scatter(x1, 0, 0)
+//   x2 = b1.select(0, 0)
+//   x = x2
+//   b = b1
+// As you can see here, not only we need to modify x's grad_fn, we also need to
+// modify the one from b. We also need to ensure that the new grad_fn on x is
+// linked to b's new grad_fn. The chain the select_scatter, multiplication and
+// select is what CopySlices does, all wrapped into a single Node.
+//
+// See Example 1 in
+// https://docs.google.com/drawings/d/1-T5DyYfChMX1ONQkY-zU-hj_ayQ2zmA5CBOKDWqvEhE
+//
+// What do we need to save in CopySlices to run backward?
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+//
+// We need to perform grad_view = fn(grad_view), but out-of-place.
 // view_fn_ is an optional lambda function saved in DifferentiableViewMeta
 // from forward pass, so that we can recover we when as_strided is not
 // supported. It preserves the invariants:
@@ -57,8 +110,6 @@ struct TORCH_API CopyBackwards : public Node {
 // efficient than the as_strided one so we should be careful to only use it when
 // necessary.
 //
-// What do we use in CopySlices backward?
-// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 //   - For CPU/CUDA we save TensorGeometry of both base and view tensors,
 //     That's all we need to pass into as_strided.
 //     E.g. int[] sizes, int[] strides, and int storage_offset.
@@ -66,7 +117,7 @@ struct TORCH_API CopyBackwards : public Node {
 //     by **value**.
 //     E.g for at::narrow, int dim, int start, in length are saved.
 //
-// Theorectically we could also save Tensor `view` in CopySlices Node, but
+// Theoretically we could also save Tensor `view` in CopySlices Node, but
 // it's far more expensive than what we currently save.
 //   1. We cannot afford keeping large tensors alive to recover views only.
 //   2. There are inplace checks when Tensors are loaded back to make sure
@@ -76,9 +127,28 @@ struct TORCH_API CopyBackwards : public Node {
 // allows the user to modify the original Tensor without preventing the
 // backward pass from running.
 //
-// When an in-place operation is done on a differentiable view, the base's
-// grad_fn is updated to become a `CopySlice` wrapping the backward of the
-// in-place operation.
+// Why do we manually change exec_info in the apply?
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+//
+// Using the same example as before,
+//   b = t.clone()
+//   x = b.select(0, 0)
+//   x *= y
+//
+// You can see the visualization at
+// https://docs.google.com/drawings/d/1Bx-Hcz-zlIv7PabQqnPhUIVIs9F8WWi48svqMsAUMFs
+// which contains the wrapped MulBackward Node and show what it links to.
+// Since a backward can happen between any subset of the inputs (t and y) and
+// outputs (o, x, b). It is possible to get into a state where CopySlices's 0th
+// next function (CloneBackward) needs gradient but MulBackward's 0th next
+// function (SelectBackward) is not. This happens if you do autograd.grad
+// between x and t for example.
+// In such a case, we do need to mark SelectBackward as requiring gradient such
+// that, during the execution of MulBackward, we will actually compute gradient
+// for the 0th input.
+//
+// All the other next functions are always shared (this is asserted in the apply
+// code) and so nothing needs to be done for them.
 
 // See Note [View + Inplace update for view tensor] for what we do to view
 // tensor when an in-place operation happens.

From c86be2d6cd51c613c476bbe99a6541c432cdd902 Mon Sep 17 00:00:00 2001
From: Khushi Agrawal <khushiagrawal411@gmail.com>
Date: Wed, 30 Nov 2022 19:07:27 +0000
Subject: [PATCH 1431/1922] [jiterator, complex32] lerp : cuda (#75584)

Follows #74748 and #74537

Pull Request resolved: https://github.com/pytorch/pytorch/pull/75584
Approved by: https://github.com/anjali411
---
 aten/src/ATen/native/cuda/Lerp.cu             | 90 ++++++++++++++++++-
 .../_internal/common_methods_invocations.py   |  2 +-
 2 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/cuda/Lerp.cu b/aten/src/ATen/native/cuda/Lerp.cu
index c1adb5b6fc030..697b61aa7866c 100644
--- a/aten/src/ATen/native/cuda/Lerp.cu
+++ b/aten/src/ATen/native/cuda/Lerp.cu
@@ -3,16 +3,54 @@
 #include <ATen/Dispatch.h>
 #include <ATen/TensorIterator.h>
 #include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/JitLoops.cuh>
 #include <ATen/OpMathType.h>
 
 namespace at {
 namespace native {
 namespace {
 
+const char lerp_tensor_name[] = "lerp_tensor";
 void lerp_tensor_kernel(at::TensorIteratorBase& iter) {
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+  auto dtype = iter.common_dtype();
+  if(at::isComplexType(dtype)) {
+#if AT_USE_JITERATOR()
+  static const auto lerp_tensor_string = jiterator_stringify(
+      template <typename T>
+      T lerp_tensor(T self_val, T end_val, T weight_val) {
+        return (std::abs(weight_val) < 0.5)
+            ? self_val + weight_val * (end_val - self_val)
+            : end_val -
+                (end_val - self_val) * (static_cast<T>(1) - weight_val);
+      }
+  ); // lerp_tensor_string
+  AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_cuda", [&] {
+        jitted_gpu_kernel<
+          /*name=*/ lerp_tensor_name,
+          /*return_dtype=*/ scalar_t,
+          /*common_dtype=*/ scalar_t,
+          /*arity=*/ 3>(iter, lerp_tensor_string);
+      });
+#else
+  AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_cuda", [&] {
+      using opmath_t = at::opmath_type<scalar_t>;
+      at::native::gpu_kernel(
+        iter,
+        [] GPU_LAMBDA(
+            scalar_t self_val,
+            scalar_t end_val,
+            scalar_t weight_val) -> scalar_t {
+           opmath_t self_val_f = self_val;
+           opmath_t end_val_f = end_val;
+           opmath_t weight_val_f = weight_val;
+          return lerp(self_val, end_val, weight_val);
+        });
+      });
+#endif
+  } else {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
       at::ScalarType::Half, at::ScalarType::BFloat16,
-      iter.common_dtype(), "lerp_cuda",
+      dtype, "lerp_cuda",
       [&] {
         at::native::gpu_kernel(
             iter,
@@ -23,12 +61,54 @@ void lerp_tensor_kernel(at::TensorIteratorBase& iter) {
               return lerp(self_val, end_val, weight_val);
             });
       });
+  }
 }
 
+const char lerp_scalar_name[] = "lerp_scalar";
 void lerp_scalar_kernel(at::TensorIteratorBase& iter, const c10::Scalar& weight) {
-  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+  auto dtype = iter.common_dtype();
+  if (at::isComplexType(dtype)) {
+#if AT_USE_JITERATOR()
+  static const auto lerp_scalar_string = jiterator_stringify(
+      template <typename T>
+      T lerp_scalar(T self_val, T end_val, T weight_val) {
+        return (std::abs(weight_val) < 0.5)
+            ? self_val + weight_val * (end_val - self_val)
+            : end_val -
+                (end_val - self_val) * (static_cast<T>(1) - weight_val);
+      }
+  ); // lerp_scalar_string
+  AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_cuda", [&] {
+      using opmath_t = at::opmath_type<scalar_t>;
+      auto weight_val = weight.to<opmath_t>();
+      jitted_gpu_kernel<
+        /*name=*/ lerp_scalar_name,
+        /*return_dtype=*/ scalar_t,
+        /*common_dtype=*/ scalar_t,
+        /*arity=*/ 2>(
+        iter,
+        lerp_scalar_string,
+        /*scalar_pos=*/ at::cuda::jit::BinaryFuncVariant::NoScalar,
+        /*scalar_val=*/ 0,
+        /*extra_args=*/ std::make_tuple(weight_val));
+  });
+#else
+  AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "lerp_cuda", [&] {
+    using opmath_t = at::opmath_type<scalar_t>;
+    auto weight_val = weight.to<opmath_t>();
+    at::native::gpu_kernel(
+        iter,
+        [=] GPU_LAMBDA(scalar_t self_val, scalar_t end_val) {
+          opmath_t self_val_f = self_val;
+          opmath_t end_val_f = end_val;
+          return lerp(self_val, end_val, weight_val);
+        });
+  });
+#endif
+  } else {
+  AT_DISPATCH_FLOATING_TYPES_AND2(
       at::ScalarType::Half, at::ScalarType::BFloat16,
-      iter.common_dtype(), "lerp_cuda",
+      dtype, "lerp_cuda",
       [&]{
         using opmath_t = at::opmath_type<scalar_t>;
         auto weight_val = weight.to<opmath_t>();
@@ -38,6 +118,8 @@ void lerp_scalar_kernel(at::TensorIteratorBase& iter, const c10::Scalar& weight)
             });
       });
     }
+}
+
 } // anonymous namespace
 
 REGISTER_DISPATCH(lerp_kernel_tensor_weight, &lerp_tensor_kernel);
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index bf6f15f825f34..dd2cd7763b77f 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -13675,7 +13675,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    ),),
     OpInfo('lerp',
            dtypes=floating_and_complex_types_and(torch.bfloat16),
-           dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
+           dtypesIfCUDA=floating_and_complex_types_and(torch.chalf, torch.half, torch.bfloat16),
            dtypesIfROCM=floating_and_complex_types_and(torch.half, torch.bfloat16),
            sample_inputs_func=sample_inputs_lerp,
            supports_forward_ad=True,

From 67da4d06d8e7d70ea5c790cd8872d8f0bec6b809 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Wed, 30 Nov 2022 20:00:32 +0000
Subject: [PATCH 1432/1922] [MPS] Enable fp16 for linear backward (#89774)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89774
Approved by: https://github.com/albanD, https://github.com/malfet
---
 aten/src/ATen/native/mps/operations/Linear.mm | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm
index ddaa6ce979638..b49e70cdf4915 100644
--- a/aten/src/ATen/native/mps/operations/Linear.mm
+++ b/aten/src/ATen/native/mps/operations/Linear.mm
@@ -27,9 +27,8 @@ Tensor _mps_linear(
 
   auto weight = (weight_arg.dim() == 1) ? weight_arg.view({1, weight_arg.size(0)}) : weight_arg;
 
-  TORCH_CHECK(input.scalar_type() == ScalarType::Double
-              || input.scalar_type() == ScalarType::Float
-              || input.scalar_type() == ScalarType::Half, "MPS device does not support linear for non-float inputs");
+  TORCH_CHECK(input.scalar_type() == ScalarType::Float ||
+              input.scalar_type() == ScalarType::Half, "MPS device does not support linear for non-float inputs");
 
   // See [Note: hacky wrapper removal for optional tensor]
   auto bias = bias_opt.has_value()
@@ -170,8 +169,9 @@ Tensor _mps_linear_backward_input(
 {
   TORCH_CHECK(grad_output.is_mps(),
       "mps_linear_backward: grad_output needs to be mps layout");
-  TORCH_CHECK(weight.device().is_mps() && weight.scalar_type() == kFloat,
-      "mps_linear_backward: weight needs to be a dense tensor");
+  TORCH_CHECK(weight.device().is_mps() &&
+             (weight.scalar_type() == kFloat || (weight.scalar_type() == kHalf)),
+             "mps_linear_backward: unsupported weights data type: ", weight.scalar_type());
 
   TORCH_CHECK(grad_output.scalar_type() == ScalarType::Double
               || grad_output.scalar_type() == ScalarType::Float
@@ -253,9 +253,8 @@ Tensor _mps_linear_backward_input(
   TORCH_CHECK(grad_output.is_mps() && input.is_mps(),
       "_mps_linear_backward: grad_output and input needs to be mps layout");
 
-  TORCH_CHECK(grad_output.scalar_type() == ScalarType::Double
-              || grad_output.scalar_type() == ScalarType::Float
-              || grad_output.scalar_type() == ScalarType::Half, "MPS device does not support linear backward for non-float inputs");
+  TORCH_CHECK(grad_output.scalar_type() == ScalarType::Float ||
+              grad_output.scalar_type() == ScalarType::Half, "MPS device does not support linear backward for non-float inputs");
 
    struct CachedGraph : public mps::MPSCachedGraph
   {

From 1bb8a8d0844943a65759323c82cfde0ffa59aa53 Mon Sep 17 00:00:00 2001
From: Pearu Peterson <pearu.peterson@gmail.com>
Date: Wed, 30 Nov 2022 18:25:58 +0200
Subject: [PATCH 1433/1922] Add layout and blocksize arguments to
 Tensor.to_sparse method (#89502)

This PR extends the `Tensor.to_sparse()` method to `Tensor.to_sparse(layout=None, blocksize=None)` in a BC manner (`layout=None` means `layout=torch.sparse_coo`).

In addition, the PR adds support for the following conversions:
- non-hybrid/hybrid COO tensor to CSR or CSC or a COO tensor
- short, bool, byte, char, bfloat16, int, long, half CSR tensor to a BSR tensor

and fixes the following conversions:
- hybrid COO to COO tensor
- non-batch/batch hybrid BSR to BSR or BSC tensor

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89502
Approved by: https://github.com/amjames, https://github.com/cpuhrsch
---
 aten/src/ATen/SparseCsrTensorUtils.h          |  15 ++
 aten/src/ATen/native/TensorConversions.cpp    | 139 +++++++++++++---
 aten/src/ATen/native/native_functions.yaml    |   3 +-
 .../native/sparse/SparseCsrTensorMath.cpp     |   9 +-
 aten/src/ATen/native/sparse/SparseTensor.cpp  |  35 +++-
 test/test_sparse.py                           | 156 +++++++++++++++++-
 test/test_sparse_csr.py                       |  82 ++++-----
 tools/autograd/derivatives.yaml               |   2 +-
 torch/testing/_internal/common_utils.py       |   2 +-
 9 files changed, 361 insertions(+), 82 deletions(-)

diff --git a/aten/src/ATen/SparseCsrTensorUtils.h b/aten/src/ATen/SparseCsrTensorUtils.h
index e76d2707c6f49..13ed74c7e8a55 100644
--- a/aten/src/ATen/SparseCsrTensorUtils.h
+++ b/aten/src/ATen/SparseCsrTensorUtils.h
@@ -122,6 +122,13 @@
     }                                                                     \
   }()
 
+#define AT_DISPATCH_SPARSE_VALUE_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                   \
+      TYPE,                                             \
+      NAME,                                             \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4(      \
+          kComplexHalf, kHalf, kBool, kBFloat16, __VA_ARGS__))
+
 namespace at {
 namespace sparse_csr {
 
@@ -287,5 +294,13 @@ inline Layout flip_compressed_layout(Layout layout) {
   }
 }
 
+inline DimVector getBlockSize(Tensor const& self) {
+  int64_t n_batch = numBatchDimensions(self);
+  Tensor values = self.values();
+  return {
+      std::max<int64_t>(1, values.size(n_batch + 1)),
+      std::max<int64_t>(1, values.size(n_batch + 2))};
+}
+
 } // namespace sparse_csr
 } // namespace at
diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp
index 96275bde82994..e6c7bd3875d2a 100644
--- a/aten/src/ATen/native/TensorConversions.cpp
+++ b/aten/src/ATen/native/TensorConversions.cpp
@@ -942,7 +942,9 @@ Tensor dense_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize) {
       self.size(-1),
       " needs to be divisible by blocksize[1] ",
       blocksize[1]);
-
+  // TODO: specify the number of dense dimensions, or equivalently,
+  // the number of batch dimensions. Until then, below we'll assume
+  // that the number of dense dimensions is 0.
   auto n_batch_dim = self.dim() - 2;
 
   auto values = _batch_tile_tensor(self, blocksize);
@@ -1054,8 +1056,7 @@ void _check_blocksize_matches(
     const std::string& name) {
   if (blocksize_opt.has_value()) {
     const auto blocksize = *blocksize_opt;
-    const auto self_values = self.values();
-    const auto self_blocksize = at::DimVector({self_values.size(-2), self_values.size(-1)});
+    const auto self_blocksize = at::sparse_csr::getBlockSize(self);
     TORCH_CHECK(self_blocksize == blocksize,
         name, "(): the provided blocksize does not match the blocksize of the to be converted tensor, ",
         "got (", blocksize[0], ", ", blocksize[1], ") ",
@@ -1144,7 +1145,7 @@ Tensor sparse_compressed_to_flipped(
   const auto sparse_dims = [&]() -> at::DimVector {
     auto sparse_dims = at::DimVector(self.sizes().slice(n_batches, 2));
     if (layout == at::kSparseBsr || layout == at::kSparseBsc) {
-      std::array<int64_t, 2> blocksize = {values.size(-2), values.size(-1)};
+      auto blocksize = at::sparse_csr::getBlockSize(self);
       sparse_dims[0] /= blocksize[0];
       sparse_dims[1] /= blocksize[1];
     }
@@ -1306,9 +1307,9 @@ Tensor sparse_compressed_to_sparse_csr(const Tensor& self) {
 
 Tensor coo_to_sparse_csr(const Tensor& self) {
   TORCH_CHECK(
-      self.dim() == 2,
-      "Only 2D tensors can be converted to the SparseCsr layout but got shape: ",
-      self.sizes());
+      self.sparse_dim() == 2,
+      "Only tensors with two sparse dimensions can be converted to the SparseCsr layout, got self with ",
+      self.sparse_dim(), " sparse dimensions.");
   auto coalesced_self = self.coalesce();
   auto row_indices = coalesced_self.indices()[0];
   bool out_int32 = (row_indices.scalar_type() == at::kInt);
@@ -1326,9 +1327,9 @@ Tensor coo_to_sparse_csr(const Tensor& self) {
 
 Tensor coo_to_sparse_csc(const Tensor& self) {
   TORCH_CHECK(
-      self.dim() == 2,
-      "Only 2D tensors can be converted to the SparseCsc layout but got shape: ",
-      self.sizes());
+      self.sparse_dim() == 2,
+      "Only tensors with two sparse dimensions can be converted to the SparseCsc layout, got self with ",
+      self.sparse_dim(), " sparse dimensions.");
   auto coalesced_self = self.transpose(0, 1).coalesce().to_sparse_csr();
   return at::native::_sparse_csc_tensor_unsafe(
       coalesced_self.crow_indices(),
@@ -1341,15 +1342,11 @@ Tensor coo_to_sparse_csc(const Tensor& self) {
 }
 
 Tensor coo_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize) {
-  AT_ERROR(
-      "Conversion from ", self.layout(), " to SparseBsr is currently not supported.");
-  return self;
+  return self.to_sparse_csr().to_sparse_bsr(blocksize);
 }
 
 Tensor coo_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize) {
-  AT_ERROR(
-      "Conversion from ", self.layout(), " to SparseBsc is currently not supported.");
-  return self;
+  return self.to_sparse_bsr(blocksize).to_sparse_bsc(blocksize);
 }
 
 namespace {
@@ -1598,9 +1595,20 @@ Tensor _csr_to_block_csr_cpu(const Tensor& self, IntArrayRef blocksize) {
             input_crow_indices.data_ptr<index_t>(),
             input_col_indices.data_ptr<index_t>());
       });
+  DimVector values_size{num_blocks, blocksize[0], blocksize[1]};
+
+  // While we don't support conversion of hybrid csr-to-bsr yet, we'll
+  // compute hybrid compatible values sizes to meet the invariants of
+  // the BSR tensor when the support will be implemented.
+  int64_t numel_dense = 1;
+  for (int i=0; i<self.dense_dim(); i++) {
+    values_size.push_back(self.size(2 + i));
+    numel_dense *= self.size(2 + i);
+  }
+  TORCH_CHECK(numel_dense == 1, "conversion from hybrid csr to block csr is not supported yet.");
 
   Tensor result_values =
-      input_values.new_zeros({num_blocks, blocksize[0], blocksize[1]});
+      input_values.new_zeros(values_size);
   Tensor result_crow_indices =
       input_crow_indices.new_empty({(n_row / blocksize[0]) + 1});
   Tensor result_col_indices = input_col_indices.new_empty({num_blocks});
@@ -1608,7 +1616,7 @@ Tensor _csr_to_block_csr_cpu(const Tensor& self, IntArrayRef blocksize) {
   // Next we copy over non-zero elements into the allocated blocks.
   AT_DISPATCH_INDEX_TYPES(
       input_crow_indices.scalar_type(), "_csr_to_block_csr_cpu", [&] {
-        AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
+        AT_DISPATCH_SPARSE_VALUE_TYPES(
             input_values.scalar_type(), "_csr_to_block_csr_cpu", [&] {
               _csr_to_block_csr_cpu_kernel<index_t, scalar_t>(
                   n_row,
@@ -1635,9 +1643,17 @@ Tensor _csr_to_block_csr_cpu(const Tensor& self, IntArrayRef blocksize) {
 
 Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize) {
   if (self.layout() == kSparseBsc) {
+    DimVector self_blocksize = at::sparse_csr::getBlockSize(self);
+    TORCH_CHECK(self_blocksize == blocksize, "to_sparse_bsr:",
+                "conversion from ", self.layout(), "[blocksize=", self_blocksize, "] to ", kSparseBsr,
+                "[blocksize=", DimVector(blocksize),"] is not implemented.");
     return sparse_compressed_to_flipped(self, blocksize, "to_sparse_bsr");
   }
   if (self.layout() == kSparseBsr) {
+    DimVector self_blocksize = at::sparse_csr::getBlockSize(self);
+    TORCH_CHECK(self_blocksize == blocksize, "to_sparse_bsr:",
+                "conversion from ", self.layout(), "[blocksize=", self_blocksize, "] to ", kSparseBsr,
+                "[blocksize=", blocksize,"] is not implemented.");
     return sparse_compressed_clone(self, blocksize, "to_sparse_bsr");
   }
   if (self.layout() == kSparseCsr) {
@@ -1679,10 +1695,18 @@ Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize
 
 Tensor sparse_compressed_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize) {
   if (self.layout() == kSparseBsr) {
-    return sparse_compressed_to_flipped(self, blocksize, "to_sparse_bsr");
+    DimVector self_blocksize = at::sparse_csr::getBlockSize(self);
+    TORCH_CHECK(self_blocksize == blocksize, "to_sparse_bsc:",
+                "conversion from ", self.layout(), "[blocksize=", self_blocksize, "] to ", kSparseBsc,
+                "[blocksize=", blocksize,"] is not implemented.");
+    return sparse_compressed_to_flipped(self, blocksize, "to_sparse_bsc");
   }
   if (self.layout() == kSparseBsc) {
-    return sparse_compressed_clone(self, blocksize, "to_sparse_bsr");
+    DimVector self_blocksize = at::sparse_csr::getBlockSize(self);
+    TORCH_CHECK(self_blocksize == blocksize, "to_sparse_bsc:",
+                "conversion from ", self.layout(), "[blocksize=", self_blocksize, "] to ", kSparseBsc,
+                "[blocksize=", blocksize,"] is not implemented.");
+    return sparse_compressed_clone(self, blocksize, "to_sparse_bsc");
   }
   AT_ERROR(
       "sparse_compressed_to_sparse_bsc expected SparseBsr or SparseBsc layout but got ",
@@ -1730,10 +1754,81 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, int64_t sparse_dim) {
       self.layout());
 }
 
-Tensor sparse_compressed_to_sparse(const Tensor& self) {
-  return sparse_compressed_to_sparse(self, 2);
+Tensor sparse_compressed_to_sparse(const Tensor& self, c10::optional<c10::Layout> layout, OptionalIntArrayRef blocksize) {
+  Layout layout_ = layout.value_or(kSparse);
+  TORCH_CHECK(!blocksize.has_value() || layout_ == kSparseBsr || layout_ == kSparseBsc,
+              "to_sparse: ", self.layout(), " to ", layout_,
+              " conversion does not use the specified blocksize ", blocksize.value(), ".");
+  if (self.layout() == layout_ && (!blocksize.has_value() || at::sparse_csr::getBlockSize(self) == *blocksize)) {
+    return self;
+  }
+  switch (layout_) {
+  case kStrided:
+    return sparse_compressed_to_dense(self);
+  case kSparse:
+    return sparse_compressed_to_sparse(self, 2);
+  case kSparseCsr:
+    return sparse_compressed_to_sparse_csr(self);
+  case kSparseCsc:
+    return sparse_compressed_to_sparse_csc(self);
+  case kSparseBsr:
+    if (blocksize.has_value()) {
+      return sparse_compressed_to_sparse_bsr(self, *blocksize);
+    } else {
+      DimVector blocksize_ = at::sparse_csr::getBlockSize(self);
+      TORCH_CHECK(blocksize_.size() == 2, "to_sparse: ", self.layout(), " to ", layout_,
+                  " conversion requires blocksize specified.");
+      return sparse_compressed_to_sparse_bsr(self, blocksize_);
+    }
+  case kSparseBsc:
+    if (blocksize.has_value()) {
+      return sparse_compressed_to_sparse_bsc(self, *blocksize);
+    } else {
+      DimVector blocksize_ = at::sparse_csr::getBlockSize(self);
+      TORCH_CHECK(blocksize_.size() == 2, "to_sparse: ", self.layout(), " to ", layout_,
+                  " conversion requires blocksize specified.");
+      return sparse_compressed_to_sparse_bsc(self, blocksize_);
+    }
+  default:
+    break;
+  }
+  AT_ERROR("to_sparse: ", self.layout(), " to ", layout_, " conversion not implemented.");
+  return Tensor();
+}
+
+Tensor sparse_coo_to_sparse(const Tensor& self, c10::optional<c10::Layout> layout, OptionalIntArrayRef blocksize) {
+  Layout layout_ = layout.value_or(kSparse);
+  TORCH_CHECK(!blocksize.has_value() || layout_ == kSparseBsr || layout_ == kSparseBsc,
+              "to_sparse: ", self.layout(), " to ", layout_,
+              " conversion does not use the specified blocksize ", blocksize.value(), ".");
+  if (self.layout() == layout_) {
+    return self;
+  }
+  switch (layout_) {
+  case kStrided:
+    return self.to_dense();
+  case kSparse:
+    return self;
+  case kSparseCsr:
+    return self.to_sparse_csr();
+  case kSparseCsc:
+    return self.to_sparse_csc();
+  case kSparseBsr:
+    TORCH_CHECK(blocksize.has_value(), "to_sparse: ", self.layout(), " to ", layout_,
+                " conversion requires blocksize specified.");
+    return self.to_sparse_bsr(*blocksize);
+  case kSparseBsc:
+    TORCH_CHECK(blocksize.has_value(), "to_sparse: ", self.layout(), " to ", layout_,
+                " conversion requires blocksize specified.");
+    return self.to_sparse_bsc(*blocksize);
+    default:
+      break;
+  }
+  AT_ERROR("to_sparse not implemented for ", self.layout(), " to ", *layout, " conversion");
+  return Tensor();
 }
 
+
 // Sparse layout conversions End
 
 Tensor to_meta(const Tensor& tensor) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index ec86ce36b4247..51834e7bfd5f7 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -6574,10 +6574,11 @@
     SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
   autogen: to_sparse.sparse_dim_out
 
-- func: to_sparse(Tensor self) -> Tensor
+- func: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None) -> Tensor
   variants: method
   dispatch:
     CPU, CUDA: dense_to_sparse
+    SparseCPU, SparseCUDA: sparse_coo_to_sparse
     SparseCsrCPU, SparseCsrCUDA: sparse_compressed_to_sparse
   autogen: to_sparse.out
 
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
index 20e4aff1784cb..3818b558cc1d1 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensorMath.cpp
@@ -121,7 +121,8 @@ namespace meta {
 
 TORCH_META_FUNC(_convert_indices_from_coo_to_csr)
 (const Tensor& self, const int64_t size, const bool out_int32) {
-  TORCH_CHECK(self.dim() <= 1, "Input is supposed to be a vector");
+  TORCH_CHECK(self.dim() <= 1, "Input is supposed to be a vector, but got ",
+              self.dim(), " dimensional tensor.");
   ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long;
   c10::TensorOptions options =
       TensorOptions().device(self.options().device()).dtype(scalar_type);
@@ -134,8 +135,10 @@ TORCH_META_FUNC(_convert_indices_from_csr_to_coo)
  const bool out_int32,
  const bool transpose) {
   TORCH_CHECK(
-      crow_indices.dim() == 1, "crow_indices is supposed to be a vector");
-  TORCH_CHECK(col_indices.dim() == 1, "col_indices is supposed to be a vector");
+    crow_indices.dim() == 1, "crow_indices is supposed to be a vector, but got ",
+    crow_indices.dim(), " dimensional tensor.");
+  TORCH_CHECK(col_indices.dim() == 1, "col_indices is supposed to be a vector, but got ",
+              col_indices.dim(), " dimensional tensor.");
   ScalarType scalar_type = out_int32 ? ScalarType::Int : ScalarType::Long;
   c10::TensorOptions options = crow_indices.options().dtype(scalar_type);
   set_output_raw_strided(0, {2, col_indices.numel()}, {}, options, {});
diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp
index 38f3e11f8fd4e..859218b2f7042 100644
--- a/aten/src/ATen/native/sparse/SparseTensor.cpp
+++ b/aten/src/ATen/native/sparse/SparseTensor.cpp
@@ -518,7 +518,40 @@ const SparseTensor& resize_as_sparse_(const SparseTensor& self, const SparseTens
   return self;
 }
 
-SparseTensor dense_to_sparse(const Tensor& self) {
+SparseTensor dense_to_sparse(const Tensor& self, c10::optional<c10::Layout> layout, OptionalIntArrayRef blocksize) {
+  if (layout.has_value()) {
+    if (blocksize.has_value() && !(*layout == kSparseBsr || *layout == kSparseBsc)) {
+      AT_ERROR("to_sparse for ", self.layout(), " to ", *layout, " conversion does not use specified blocksize");
+    }
+    if (self.layout() == *layout) {
+      return self;
+    }
+    switch (*layout) {
+    case kStrided:
+      return self;
+    case kSparse:
+      return dense_to_sparse(self, self.dim());
+    case kSparseCsr:
+      return self.to_sparse_csr();
+    case kSparseCsc:
+      return self.to_sparse_csc();
+    case kSparseBsr:
+      if (blocksize.has_value()) {
+        return self.to_sparse_bsr(*blocksize);
+      }
+      AT_ERROR("to_sparse for ", self.layout(), " to ", *layout, " conversion requires blocksize");
+      break;
+    case kSparseBsc:
+      if (blocksize.has_value()) {
+        return self.to_sparse_bsc(*blocksize);
+      }
+      break;
+      AT_ERROR("to_sparse for ", self.layout(), " to ", *layout, " conversion requires blocksize");
+    default:
+      break;
+    }
+    AT_ERROR("to_sparse not implemented for ", self.layout(), " to ", *layout, " conversion");
+  }
   return dense_to_sparse(self, self.dim());
 }
 
diff --git a/test/test_sparse.py b/test/test_sparse.py
index e03e2f1682893..93a2241d06804 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -9,7 +9,8 @@
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \
     do_test_empty_full, load_tests, TEST_NUMPY, TEST_SCIPY, IS_WINDOWS, gradcheck, coalescedonoff, \
-    DeterministicGuard, first_sample, TEST_WITH_CROSSREF, TEST_WITH_ROCM, skipIfTorchDynamo
+    DeterministicGuard, first_sample, TEST_WITH_CROSSREF, TEST_WITH_ROCM, skipIfTorchDynamo, \
+    parametrize, subtest
 from torch.testing._internal.common_cuda import TEST_CUDA, _get_torch_cuda_version
 from numbers import Number
 from typing import Dict, Any
@@ -40,6 +41,17 @@
     IS_WINDOWS and torch.version.cuda and LooseVersion(torch.version.cuda) > "11.2"
 ) or (not IS_WINDOWS and CUDA11OrLater)
 
+def all_sparse_layouts(test_name='layout', include_strided=False):
+    return parametrize(test_name, [
+        subtest(torch.strided, name='Strided'),
+        subtest(torch.sparse_coo, name='SparseCOO'),
+        subtest(torch.sparse_csr, name='SparseCSR'),
+        subtest(torch.sparse_csc, name='SparseCSC'),
+        subtest(torch.sparse_bsr, name='SparseBSR'),
+        subtest(torch.sparse_bsc, name='SparseBSC'),
+    ][(0 if include_strided else 1):])
+
+
 class CrossRefSparseFakeMode(torch._subclasses.CrossRefFakeMode):
     def __init__(self):
         super(CrossRefSparseFakeMode, self).__init__(
@@ -413,9 +425,6 @@ def test_to_sparse(self, device, dtype, coalesced):
                     self.assertEqual(expected.size(), result.size())
                     self.assertEqual(dim, result.sparse_dim())
 
-            sp, _, _ = self._gen_sparse(2, 10, [3, 3, 3], dtype=value_type, device=device, coalesced=coalesced)
-            self.assertRaises(RuntimeError, lambda: sp.to_sparse())
-
     @dtypes(torch.double, torch.cdouble)
     def test_sparse_bool(self, device, dtype):
         a = torch.tensor([True, False], dtype=dtype, device=device).to(torch.bool)
@@ -4094,6 +4103,145 @@ def test_generate_simple_inputs(self):
                         key = (layout, is_hybrid, is_batch, nontrivial_blocksize)
                         assert key in tested_combinations, key
 
+    @all_sparse_layouts('from_layout', include_strided=True)
+    @all_sparse_layouts('to_layout', include_strided=False)
+    @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    def test_to_sparse(self, from_layout, to_layout, device, dtype):
+        """
+        This test tests conversion from any layout to any sparse layout.
+        """
+
+        for t in self.generate_simple_inputs(
+                from_layout, device=device, dtype=dtype,
+                enable_hybrid=(
+                    # TODO: to support conversion strided->hybrid
+                    # CSR/CSC/BSR/BSC, to_sparse() requires extra keyword
+                    # argument, either nof_batch_dims or
+                    # nof_dense_dims
+                    not (from_layout is torch.strided and to_layout in
+                         {torch.sparse_bsr, torch.sparse_bsc, torch.sparse_csr, torch.sparse_csc}))):
+
+            if to_layout in {torch.sparse_bsr, torch.sparse_bsc}:
+                if from_layout == torch.sparse_bsr:
+                    batch_ndim = t.crow_indices().dim() - 1
+                    blocksize = t.values().shape[batch_ndim + 1:batch_ndim + 3]
+                elif from_layout == torch.sparse_bsc:
+                    batch_ndim = t.ccol_indices().dim() - 1
+                    blocksize = t.values().shape[batch_ndim + 1:batch_ndim + 3]
+                else:
+                    blocksize = (1, 1)
+            else:
+                blocksize = None
+
+            if from_layout is torch.strided:
+                is_batch = None
+                is_hybrid = None
+            else:
+                is_batch = t.dim() > (t.sparse_dim() + t.dense_dim())
+                is_hybrid = t.dense_dim() > 0
+
+            def explicit_to_sparse(x):
+                # Used to check that the explicit conversion methods
+                # are consistent with the `to_sparse(*, layout,
+                # blocksize)` method.
+                if to_layout is torch.sparse_coo:
+                    return x.to_sparse_coo()
+                elif to_layout is torch.sparse_csr:
+                    return x.to_sparse_csr()
+                elif to_layout is torch.sparse_csc:
+                    return x.to_sparse_csc()
+                elif to_layout is torch.sparse_bsr:
+                    return x.to_sparse_bsr(blocksize)
+                elif to_layout is torch.sparse_bsc:
+                    return x.to_sparse_bsc(blocksize)
+                else:
+                    assert 0  # unreachable
+
+            # TODO: The following exception cases all correspond to
+            # not implemented conversions
+            if from_layout is torch.sparse_coo and to_layout in {
+                    torch.sparse_bsr, torch.sparse_bsc} and t.sparse_dim() == 2 and is_hybrid:
+                with self.assertRaisesRegex(RuntimeError, "conversion from Csr to Bsr is only possible for 2d inputs"):
+                    t.to_sparse(layout=to_layout, blocksize=blocksize)
+                with self.assertRaisesRegex(RuntimeError, "conversion from Csr to Bsr is only possible for 2d inputs"):
+                    explicit_to_sparse(t)
+                continue
+            elif from_layout is torch.sparse_csr and to_layout in {torch.sparse_bsr} and (is_batch or is_hybrid):
+                with self.assertRaisesRegex(RuntimeError, "conversion from Csr to Bsr is only possible for 2d inputs"):
+                    t.to_sparse(layout=to_layout, blocksize=blocksize)
+                with self.assertRaisesRegex(RuntimeError, "conversion from Csr to Bsr is only possible for 2d inputs"):
+                    explicit_to_sparse(t)
+                continue
+            elif from_layout is torch.sparse_coo and to_layout in {
+                    torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc} and t.sparse_dim() != 2:
+                with self.assertRaisesRegex(
+                        RuntimeError, "Only tensors with two sparse dimensions can be converted to the Sparse(Csr|Csc) layout"):
+                    t.to_sparse(layout=to_layout, blocksize=blocksize)
+                with self.assertRaisesRegex(
+                        RuntimeError, "Only tensors with two sparse dimensions can be converted to the Sparse(Csr|Csc) layout"):
+                    explicit_to_sparse(t)
+                continue
+            elif from_layout in {torch.sparse_csr, torch.sparse_csc} and to_layout is torch.sparse_coo and is_batch:
+                with self.assertRaisesRegex(RuntimeError,
+                                            "crow_indices is supposed to be a vector, but got \\d+ dimensional tensor"):
+                    t.to_sparse(layout=to_layout, blocksize=blocksize)
+                with self.assertRaisesRegex(RuntimeError,
+                                            "crow_indices is supposed to be a vector, but got \\d+ dimensional tensor"):
+                    explicit_to_sparse(t)
+                continue
+            elif from_layout in {torch.sparse_bsr, torch.sparse_bsc} and to_layout is torch.sparse_coo:
+                with self.assertRaisesRegex(
+                        RuntimeError,
+                        "sparse_compressed_to_sparse expected SparseCsr or SparseCsc layout but got Sparse(Bsr|Bsc)"):
+                    t.to_sparse(layout=to_layout, blocksize=blocksize)
+                with self.assertRaisesRegex(
+                        RuntimeError,
+                        "sparse_compressed_to_sparse expected SparseCsr or SparseCsc layout but got Sparse(Bsr|Bsc)"):
+                    explicit_to_sparse(t)
+                self.skipTest('NOT IMPL')
+            elif (from_layout, to_layout) in {(torch.sparse_bsc, torch.sparse_csr), (torch.sparse_bsc, torch.sparse_csc),
+                                              (torch.sparse_bsr, torch.sparse_csr), (torch.sparse_bsr, torch.sparse_csc),
+                                              (torch.sparse_csc, torch.sparse_bsr), (torch.sparse_csc, torch.sparse_bsc),
+                                              (torch.sparse_csr, torch.sparse_bsc)}:
+                with self.assertRaisesRegex(
+                        RuntimeError,
+                        r"sparse_compressed_to_sparse_(csr|csc|bsr|bsc) expected\s*(SparseCsr[,]|)\s*Sparse(Csr|Bsr)"
+                        " or Sparse(Csc|Bsc) layout but got Sparse(Csr|Csc|Bsr|Bsc)"):
+                    t.to_sparse(layout=to_layout, blocksize=blocksize)
+                with self.assertRaisesRegex(
+                        RuntimeError,
+                        r"sparse_compressed_to_sparse_(csr|csc|bsr|bsc) expected\s*(SparseCsr[,]|)\s*Sparse(Csr|Bsr)"
+                        " or Sparse(Csc|Bsc) layout but got Sparse(Csr|Csc|Bsr|Bsc)"):
+                    explicit_to_sparse(t)
+                self.skipTest('NOT IMPL')
+            else:
+                r = t.to_sparse(layout=to_layout, blocksize=blocksize)
+
+                self.assertEqual(r.layout, to_layout)
+
+                # to_sparse method uses unsafe construction of sparse
+                # tensors. Here we explicitly validate the results to
+                # make sure that the sparse tensors are consistent
+                # with the corresponding sparse tensor invariants.
+                if r.layout in {torch.sparse_csr, torch.sparse_bsr, torch.sparse_csc, torch.sparse_bsc}:
+                    if r.layout in {torch.sparse_csr, torch.sparse_bsr}:
+                        compressed_indices, plain_indices = r.crow_indices(), r.col_indices()
+                    else:
+                        compressed_indices, plain_indices = r.ccol_indices(), r.row_indices()
+                    torch._validate_sparse_compressed_tensor_args(compressed_indices, plain_indices, r.values(),
+                                                                  r.shape, r.layout)
+                elif r.layout is torch.sparse_coo:
+                    torch._validate_sparse_coo_tensor_args(r._indices(), r._values(), r.shape)
+                else:
+                    assert 0  # unreachable
+
+                # Finally, we'll test tensor equality:
+                self.assertEqual(r, t)
+
+                # Also, check consistency with explicit conversion methods:
+                r2 = explicit_to_sparse(t)
+                self.assertEqual(r2, r)
+
 
 # e.g., TestSparseUnaryUfuncsCPU and TestSparseUnaryUfuncsCUDA
 instantiate_device_type_tests(TestSparseUnaryUfuncs, globals(), except_for='meta')
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index 225cde31483eb..e8eb8564b860d 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -2761,33 +2761,6 @@ def test_exercise_detach(self, device, dtype):
             detached_inp = inp.detach()
             self.assertEqual(inp, detached_inp)
 
-    def _convert_to_layout(self, a, target_layout, blocksize=(2, 2)):
-        """
-        Helper function to call the correct layout conversion
-        with reasonable defaults for the block size. Clearly there
-        is a need for a to.layout overload.
-        """
-        if target_layout is torch.sparse_csr:
-            result = a.to_sparse_csr()
-        elif target_layout is torch.sparse_csc:
-            result = a.to_sparse_csc()
-        elif target_layout is torch.sparse_bsr:
-            result = a.to_sparse_bsr(blocksize)
-        elif target_layout is torch.sparse_bsc:
-            result = a.to_sparse_bsc(blocksize)
-        else:
-            raise NotImplementedError(repr(a))
-        assert result.layout is target_layout
-        # to_sparse_xyz methods use unsafe construction of sparse
-        # compressed tensors. Here we explicitly validate the results
-        # to make sure that the sparse tensors are consistent with the
-        # corresponding sparse tensor invariants.
-        compressed_indices_mth, plain_indices_mth = sparse_compressed_indices_methods[result.layout]
-        compressed_indices, plain_indices = compressed_indices_mth(result), plain_indices_mth(result)
-        torch._validate_sparse_compressed_tensor_args(compressed_indices, plain_indices, result.values(),
-                                                      result.shape, result.layout)
-        return result
-
     def _construct_sp_matrix(self, tensor, layout, blocksize=(2, 2)):
         if tensor.layout in [torch.sparse_coo, torch.sparse_csr, torch.sparse_csc, torch.strided]:
             tensor = tensor.to_dense()
@@ -2806,9 +2779,13 @@ def _construct_sp_matrix(self, tensor, layout, blocksize=(2, 2)):
     @all_sparse_compressed_layouts('to_layout')
     @all_sparse_compressed_layouts('from_layout')
     def test_compressed_layout_conversions_coverage(self, device, from_layout, to_layout):
-        """
-        This test performs a smoke test for covered conversion and verifies
+        """This test performs a smoke test for covered conversion and verifies
         that an exception is thrown for unsupported conversions.
+
+        TODO: This test covers a subset of
+        TestSparseAny.test_to_sparse tests and can be
+        eliminated. Keeping the test until the new
+        `Tensor.to_sparse(*, layout, blocksize)` has landed.
         """
 
         allowed_pairwise_layouts_sets = {
@@ -2835,19 +2812,21 @@ def _to_from_layout(layout_a, layout_b, a):
                 if a.dim() > 2:
                     expect_error = True
 
-            b = self._convert_to_layout(a, layout_a)
+            blocksize_a = (1, 1) if layout_a in {torch.sparse_bsr, torch.sparse_bsc} else None
+            blocksize_b = (1, 1) if layout_b in {torch.sparse_bsr, torch.sparse_bsc} else None
+            b = a.to_sparse(layout=layout_a, blocksize=blocksize_a)
             if expect_error:
                 with self.assertRaises(RuntimeError):
-                    self._convert_to_layout(b, layout_b)
+                    b.to_sparse(layout=layout_b, blocksize=blocksize_b)
             else:
-                c = self._convert_to_layout(b, layout_b)
+                c = b.to_sparse(layout=layout_b, blocksize=blocksize_b)
                 self.assertEqual(a.to_dense(), c.to_dense())
 
                 # change of blocksize upon conversion is not yet supported.
                 if b.layout in block_layouts:
                     for block_layout in block_layouts:
-                        with self.assertRaisesRegex(RuntimeError, "blocksize does not match the blocksize"):
-                            self._convert_to_layout(b, block_layout, blocksize=3)
+                        with self.assertRaisesRegex(RuntimeError, "conversion from.*to.*is not implemented"):
+                            b.to_sparse(layout=block_layout, blocksize=(3, 3))
 
         batch_dims = [(), (2,), (2, 2), (2, 2, 2)]
         sparse_dims = (6, 12)
@@ -2861,11 +2840,13 @@ def _to_from_layout(layout_a, layout_b, a):
     @hybrid_nonhybrid()
     @unittest.skipIf(not TEST_SCIPY, "SciPy not found")
     def test_dense_to_from_sparse_compressed(self, device, hybrid, batched, layout):
-        """
-        This test tests conversion from dense to/from CSR and CSC
+        """This test tests conversion from dense to/from CSR and CSC
         by comparing to SciPy's implementation.
 
-        TODO: Eventually this is meant to be merged into test_compressed_layout_conversions_coverage
+        Here we test only those conversion combinations that SciPy
+        supports to ensure that PyTorch conversions are in the same
+        page with SciPy.  Independent from SciPy, all conversion
+        combinations are tested in TestSparseAny.test_to_sparse.
         """
 
         # adjust this block as support is added
@@ -2912,7 +2893,7 @@ def _check_batched(pt_tensor, dense, check_batch=None, batch_shape=(), blocksize
             for batch_index in np.ndindex(batch_shape):
                 pt_matrix = pt_tensor[batch_index]
                 dense_matrix = dense[batch_index]
-                dense_matrix_pt = self._convert_to_layout(dense_matrix, layout, blocksize)
+                dense_matrix_pt = dense_matrix.to_sparse(layout=layout, blocksize=blocksize or None)
                 # sanity check, selecting batch of to_<layout> and dense[batch].to_<layout> should give the same result
                 self.assertEqual(pt_matrix, dense_matrix_pt)
                 check_batch(pt_matrix, dense_matrix, blocksize, **kwargs)
@@ -2956,12 +2937,12 @@ def _generate_subject(sparse_shape, batch_shape, hybrid_shape):
         batch_sizes = [(3,), (1, 3), (2, 1, 3)] if batched else [()]
         hybrid_sizes = [(4, ), (2, 2)] if hybrid else [()]
         if not hybrid:
-            # general cases, always run, hybrid excluded untill dense->sparse api exists
+            # general cases, always run, hybrid excluded until dense->sparse api exists
             for sparse_shape, blocksize, batch_shape, hybrid_shape in itertools.product(
                     sparse_sizes, blocksizes, batch_sizes, hybrid_sizes):
                 dense = _generate_subject(sparse_shape, batch_shape, hybrid_shape)
                 if expect_to_layout_support:
-                    sparse = self._convert_to_layout(dense, layout, blocksize)
+                    sparse = dense.to_sparse(layout=layout, blocksize=blocksize or None)
                     check_content(sparse, dense, blocksize=blocksize, batch_shape=batch_shape, hybrid_shape=hybrid_shape)
                     if expect_from_layout_support:
                         dense_back = sparse.to_dense()
@@ -2971,7 +2952,7 @@ def _generate_subject(sparse_shape, batch_shape, hybrid_shape):
                             sparse.to_dense()
                 else:
                     with self.assertRaises(RuntimeError):
-                        self._convert_to_layout(dense, layout, blocksize)
+                        dense.to_sparse(layout=layout, blocksize=blocksize or None)
 
         # special cases for batched tensors
         if batched and expect_to_layout_support:
@@ -3005,7 +2986,7 @@ def _generate_subject(sparse_shape, batch_shape, hybrid_shape):
                 mask = mask.transpose(-3, -2)
                 mask = mask.reshape_as(dense)
             dense = dense * mask
-            sparse = self._convert_to_layout(dense, layout, blocksize)
+            sparse = dense.to_sparse(layout=layout, blocksize=blocksize or None)
             check_content(sparse, dense, blocksize=blocksize, batch_shape=batch_shape, hybrid_shape=hybrid_shape)
 
             if expect_from_layout_support:
@@ -3023,14 +3004,14 @@ def _generate_subject(sparse_shape, batch_shape, hybrid_shape):
             dense = dense * mask
             msg = "Expect the same number of specified elements per batch."
             with self.assertRaisesRegex(RuntimeError, msg):
-                self._convert_to_layout(dense, layout, blocksize)
+                dense.to_sparse(layout=layout, blocksize=blocksize or None)
 
             # Should throw if there is a zero in the batch size
             dense = make_tensor((0,) + shape, dtype=torch.float, device=device)
             layout_code = str(layout).split("_")[-1]
             msg = f"to_sparse_{layout_code}: Expected product of batch dimensions to be non-zero."
             with self.assertRaisesRegex(RuntimeError, msg):
-                self._convert_to_layout(dense, layout, blocksize=blocksize)
+                dense.to_sparse(layout=layout, blocksize=blocksize or None)
 
         if hybrid:
             # conversion from sparse -> dense should be blocked with dense dims
@@ -3063,21 +3044,24 @@ def test_sparse_to_sparse_compressed(self, device, dtype, coalesced, layout):
         This test tests conversion from COO to CSR and CSC and CSC to CSR and CSC
         by comparing to SciPy's implementation.
 
-        TODO: Eventually this is meant to be merged into test_compressed_layout_conversions_coverage
+        Here we test only those conversion combinations that SciPy
+        supports to ensure that PyTorch conversions are in the same
+        page with SciPy.  Independent from SciPy, all conversion
+        combinations are tested in TestSparseAny.test_to_sparse.
         """
         if layout is torch.sparse_bsc:
             # TODO: Remove this once support has been enabled
-            return
+            self.skipTest('NOT IMPL')
         if layout is torch.sparse_bsr:
             # TODO: Remove this once support has been enabled
-            return
+            self.skipTest('NOT IMPL')
 
         for shape in [(0, 10), (6, 0), (6, 10), (0, 0)]:
             sparse_dim = 2
             nnz = shape[0] * shape[1] // 2
             sparse, _, _ = self.genSparseTensor(shape, sparse_dim, nnz, coalesced, device, dtype)
             sp_matrix = self._construct_sp_matrix(sparse, layout)
-            pt_matrix = self._convert_to_layout(sparse, layout)
+            pt_matrix = sparse.to_sparse(layout=layout)
 
             compressed_indices_mth = {
                 torch.sparse_csr: torch.Tensor.crow_indices,
@@ -3097,7 +3081,7 @@ def test_sparse_to_sparse_compressed(self, device, dtype, coalesced, layout):
 
             sparse_csc = sparse.to_sparse_csc()
             sp_matrix = self._construct_sp_matrix(sparse_csc, layout)
-            pt_matrix = self._convert_to_layout(sparse_csc, layout)
+            pt_matrix = sparse_csc.to_sparse(layout=layout)
 
             self.assertEqual(layout, pt_matrix.layout)
             self.assertEqual(sp_matrix.shape, pt_matrix.shape)
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index b2627fedafc92..bebd32e66701a 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1662,7 +1662,7 @@
 - name: _to_dense(Tensor self, ScalarType? dtype=None) -> Tensor
   self: to_dense_backward(grad, self)
 
-- name: to_sparse(Tensor self) -> Tensor
+- name: to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None) -> Tensor
   self: grad.to_dense()
 
 - name: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 9eecc5d5b2dc4..434d2855c3929 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1853,7 +1853,7 @@ def __init__(self, actual, expected, *, rtol_override=0.0, atol_override=0.0, **
 
     def _handle_hybrid_sparse_csr(self, actual, expected):
         compressed_sparse_layouts = {torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc}
-        if not ((actual.layout in compressed_sparse_layouts) ^ (expected.layout in compressed_sparse_layouts)):
+        if not ((actual.layout in compressed_sparse_layouts) or (expected.layout in compressed_sparse_layouts)):
             return actual, expected
 
         def to_dense(tensor):

From f27ef3601bc1abbc5ca8fc3c4350d112d041eb13 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 30 Nov 2022 17:20:38 +0000
Subject: [PATCH 1434/1922] [FSDP][BE] Move dynamo annotation to separate file
 (#89890)

This PR makes two minor changes: It (1) moves the recently-added module annotation logic for dynamo support to a separate file `torch/distributed/fsdp/_dynamo_utils.py` and ~~(2) saves the annotated attribute names to global variables `FSDP_MANAGED_MODULE` and `FSDP_USE_ORIG_PARAMS`~~.
Update: Since the distributed package may not be included in some builds, it is not safe to import from `torch.distributed...` to a file in `_dynamo/`. I will not include change (2) in this PR. The alternative is to define those globals (privately) in the dynamo file and import from there in the FSDP file.
- The first change is mainly a personal choice, where I wanted to avoid the dynamo explanation from dominating the FSDP constructor space-wise. I added the `(see function for details)` to the inline comment to forward interested readers.
- The second change follows the custom we have taken in the past for such attributes (e.g. `FSDP_FLATTENED`). My understanding (in the past as well as currently) is that this is a good practice.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89890
Approved by: https://github.com/wconstab
---
 torch/distributed/fsdp/_dynamo_utils.py       | 45 +++++++++++++++++++
 .../fsdp/fully_sharded_data_parallel.py       | 34 ++------------
 2 files changed, 48 insertions(+), 31 deletions(-)
 create mode 100644 torch/distributed/fsdp/_dynamo_utils.py

diff --git a/torch/distributed/fsdp/_dynamo_utils.py b/torch/distributed/fsdp/_dynamo_utils.py
new file mode 100644
index 0000000000000..3a6c63dc5af8b
--- /dev/null
+++ b/torch/distributed/fsdp/_dynamo_utils.py
@@ -0,0 +1,45 @@
+from typing import Set
+
+import torch.nn as nn
+
+
+def _annotate_modules_for_dynamo(
+    module: nn.Module,
+    ignored_modules: Set[nn.Module],
+    use_orig_params: bool,
+):
+    """
+    Annotates the submodules in ``module`` 's tree, except those in
+    ``ignored_modules``, indicating that the submodules are FSDP-managed and
+    saving the ``use_orig_params`` setting passed to the FSDP constructor.
+    """
+    for submodule in module.modules():
+        if submodule not in ignored_modules:
+            """[note: Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
+
+            Dynamo doesn't get to see this instance (FullyShardedDataParallel) during tracing, since
+            it skips tracing all the torch.distributed.fsdp code.
+                - Why? Running the FSDP code eagerly avoids lots of issues trying to trace complex hooks, and also
+                gets us graph-breaks on FSDP module boundaries which we want anyway for comm ops.
+                - However, we _also_ want dynamo to treat the wrapped module inside FSDP 'unspecially' (*),
+                and we need a way to indicate to dynamo which modules are wrapped by FSDP.
+
+            (*) UnspecializedNNModules in dynamo are traced-through without any assumptions, and with thorough
+            guards.  NNModules otherwise are 'specialized', meaning there is less overhead due to assuming
+            their code is well-behaved.
+
+            One particular issue with specialized NNModules for FSDP is that the
+            views created for orig_params are captured into the compiled graph on the first iteration, and while
+            they are always going to point to the correct flatparameter and give correct results, their order
+            of creation influences the order of backward execution, preventing overlap of comm and computation
+            during backward.  We need to _use_ the new parameter views created on each forward iteration, in
+            order for backward to interleave hooks with compute per layer.  UnspecializedNNModule lets us achieve
+            this by capturing the module code more 'functionally' and passing parameters in as inputs each time.
+            """
+            submodule._is_fsdp_managed_module = True  # type: ignore[assignment]
+
+            # Dynamo only supports FSDP with use_orig_params=True.
+            # This is hacky, but I could not think of another way to add an assertion to dynamo
+            # for this, since Dynamo skips all the FSDP code frames and thus can't inspect the
+            # FSDP module directly
+            submodule._fsdp_use_orig_params = use_orig_params  # type: ignore[assignment]
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 044ad2e8f59b9..779499f449532 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -35,6 +35,7 @@
     HandleTrainingState,
     TrainingState,
 )
+from torch.distributed.fsdp._dynamo_utils import _annotate_modules_for_dynamo
 from torch.distributed.fsdp._init_utils import (
     _check_orig_params_flattened,
     _get_default_comm_hook,
@@ -331,37 +332,8 @@ def __init__(
         super().__init__()
         _init_ignored_module_states(self, module, ignored_modules)
 
-        # Add module annotations for Dynamo support
-        for submodule in module.modules():
-            if submodule not in self._ignored_modules:
-                """[note: Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
-
-                Dynamo doesn't get to see this instance (FullyShardedDataParallel) during tracing, since
-                it skips tracing all the torch.distributed.fsdp code.
-                 - Why? Running the FSDP code eagerly avoids lots of issues trying to trace complex hooks, and also
-                   gets us graph-breaks on FSDP module boundaries which we want anyway for comm ops.
-                 - However, we _also_ want dynamo to treat the wrapped module inside FSDP 'unspecially' (*),
-                   and we need a way to indicate to dynamo which modules are wrapped by FSDP.
-
-                (*) UnspecializedNNModules in dynamo are traced-through without any assumptions, and with thorough
-                guards.  NNModules otherwise are 'specialized', meaning there is less overhead due to assuming
-                their code is well-behaved.
-
-                One particular issue with specialized NNModules for FSDP is that the
-                views created for orig_params are captured into the compiled graph on the first iteration, and while
-                they are always going to point to the correct flatparameter and give correct results, their order
-                of creation influences the order of backward execution, preventing overlap of comm and computation
-                during backward.  We need to _use_ the new parameter views created on each forward iteration, in
-                order for backward to interleave hooks with compute per layer.  UnspecializedNNModule lets us achieve
-                this by capturing the module code more 'functionally' and passing parameters in as inputs each time.
-                """
-                submodule._is_fsdp_managed_module = True
-
-                # Dynamo only supports FSDP with use_orig_params=True.
-                # This is hacky, but I could not think of another way to add an assertion to dynamo
-                # for this, since Dynamo skips all the FSDP code frames and thus can't inspect the
-                # FSDP module directly
-                submodule._fsdp_use_orig_params = use_orig_params
+        # Add module annotations for Dynamo support (see function for details)
+        _annotate_modules_for_dynamo(module, self._ignored_modules, use_orig_params)
 
         if auto_wrap_policy is not None:
             auto_wrap_kwargs = {

From 5432f224b2d73debf140600baccdde0405d640e8 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 30 Nov 2022 16:24:26 +0000
Subject: [PATCH 1435/1922] [FSDP] Slightly refactor fx symbolic tracer
 (#89917)

I made a pass over Linjian's `_symbolic_trace.py` and tidied it up a bit. Aside from simple stylistic changes, this PR makes the following changes:
- Save `visited_params: Set[nn.Parameter]` to avoid linear overhead to check a parameter already being visited when appending to the parameter execution order list (`param_forward_order`)
- Move the tracer patching logic to a class `_ExecOrderTracer` to have a reference to `self.exec_info` without having a fragmented 2-step initialization (like the old `_init_execution_info(root_module)` plus `_patch_tracer(tracer, root_module, execution_info)`)
- Define `_ParamUsageInfo` to formalize the `Tuple[nn.Module, List[str, nn.Parameter]]` elements being mapped to in the execution info `dict`, and clarify the documentation regarding what this represents
- Change the unit test to use `TestCase`, not `FSDPTest`, to avoid initializing a process group

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89917
Approved by: https://github.com/zhaojuanmao, https://github.com/fegin
---
 test/distributed/fsdp/test_fsdp_fx.py     |  61 +++---
 torch/distributed/fsdp/_symbolic_trace.py | 250 ----------------------
 torch/distributed/fsdp/_trace_utils.py    | 237 ++++++++++++++++++++
 3 files changed, 268 insertions(+), 280 deletions(-)
 delete mode 100644 torch/distributed/fsdp/_symbolic_trace.py
 create mode 100644 torch/distributed/fsdp/_trace_utils.py

diff --git a/test/distributed/fsdp/test_fsdp_fx.py b/test/distributed/fsdp/test_fsdp_fx.py
index 7b0e0a3ddf2f2..f8f5c1800ed6d 100644
--- a/test/distributed/fsdp/test_fsdp_fx.py
+++ b/test/distributed/fsdp/test_fsdp_fx.py
@@ -1,12 +1,10 @@
 # Owner(s): ["oncall: distributed"]
 
-from typing import Any
-
 import torch
-from torch.distributed.fsdp._symbolic_trace import _init_execution_info, _patch_tracer
-from torch.testing._internal.common_fsdp import FSDPTest
+from torch.distributed.fsdp._trace_utils import _ExecOrderTracer
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
+    TestCase,
     run_tests,
 )
 
@@ -26,38 +24,37 @@ def __init__(self) -> None:
         )
         self.relu = torch.nn.ReLU()
 
-    def forward(self, x: Any, run_all_layers: bool):
+    def forward(self, x: torch.Tensor, run_all_layers: bool) -> torch.Tensor:
         z = self.relu(self.layer0(x))
         z = self.relu(self.layer2(z))
         z = z @ self.weight1
         if run_all_layers:
             z = self.relu(self.layer1(z))
             z = z @ self.weight2
-            # used to test the case where a module is called more than once
+            # Use `layer0` twice to check the handling of multiplicity in the
+            # saved data structures
             z = self.relu(self.layer0(x))
         return z
 
 
-class TestSymbolicTracing(FSDPTest):
+class TestSymbolicTracing(TestCase):
     def test_symbolic_tracing_outputs(self):
         """
-        test ``execution_info.module_forward_order`` and ``execution_info.module_to_execution_infos``
-        after running ``tracer.trace()`` inside ``_patch_tracer``.
+        Tests running ``tracer.trace()`` inside ``patch_tracer()`` by checking
+        the saved data structures.
         """
         model = Model()
         tracer = torch.fx.Tracer()
-        execution_info = _init_execution_info(model)
-        original_call_module = tracer.call_module
-        original_create_proxy = tracer.create_proxy
-        with _patch_tracer(
-            tracer=tracer, root_module=model, execution_info=execution_info
-        ):
+        orig_call_module = tracer.call_module
+        orig_create_proxy = tracer.create_proxy
+        exec_order_tracer = _ExecOrderTracer()
+        with exec_order_tracer.patch_tracer(tracer=tracer, root_module=model):
             concrete_args = {"run_all_layers": True}
             tracer.trace(model, concrete_args)
-        # the member functions of tracer should not be changed
-        self.assertEqual(original_call_module, tracer.call_module)
-        self.assertEqual(original_create_proxy, tracer.create_proxy)
-        # test tracer.module_forward_order
+        # Check that the tracer methods are unchanged after exiting the context
+        self.assertEqual(orig_call_module, tracer.call_module)
+        self.assertEqual(orig_create_proxy, tracer.create_proxy)
+        # Check `module_forward_order`
         correct_module_forward_order = [
             model,
             model.layer0,
@@ -72,12 +69,11 @@ def test_symbolic_tracing_outputs(self):
             model.layer0,
             model.relu,
         ]
+        exec_info = exec_order_tracer.exec_info
+        self.assertEqual(exec_info.module_forward_order, correct_module_forward_order)
+        # Check `module_to_param_usage_infos`
         self.assertEqual(
-            execution_info.module_forward_order, correct_module_forward_order
-        )
-        # test execution_info.module_to_execution_infos
-        self.assertEqual(
-            execution_info.module_to_execution_infos[model],
+            exec_info.module_to_param_usage_infos[model],
             [
                 (model.layer0, list(model.layer0.named_parameters())),
                 (model.layer2, list(model.layer2.named_parameters())),
@@ -88,22 +84,22 @@ def test_symbolic_tracing_outputs(self):
             ],
         )
         self.assertEqual(
-            execution_info.module_to_execution_infos[model.layer0],
+            exec_info.module_to_param_usage_infos[model.layer0],
             [(model.layer0, list(model.layer0.named_parameters()))],
         )
         self.assertEqual(
-            execution_info.module_to_execution_infos[model.layer1],
+            exec_info.module_to_param_usage_infos[model.layer1],
             [(model.layer1, list(model.layer1.named_parameters()))],
         )
         self.assertEqual(
-            execution_info.module_to_execution_infos[model.layer2],
+            exec_info.module_to_param_usage_infos[model.layer2],
             [
                 (model.layer2[0], list(model.layer2[0].named_parameters())),
                 (model.layer2[2], list(model.layer2[2].named_parameters())),
             ],
         )
-        self.assertEqual(execution_info.module_to_execution_infos[model.relu], [])
-        # test tracer.param_exec_order
+        self.assertEqual(exec_info.module_to_param_usage_infos[model.relu], [])
+        # Check `param_forward_order`
         correct_param_order = [
             model.layer0.weight,
             model.layer0.bias,
@@ -113,7 +109,12 @@ def test_symbolic_tracing_outputs(self):
             model.layer1.weight,
             model.weight2,
         ]
-        self.assertEqual(execution_info.param_exec_order, correct_param_order)
+        self.assertEqual(exec_info.param_forward_order, correct_param_order)
+        # Check `visited_params`
+        self.assertEqual(
+            len(exec_info.visited_params), len(exec_info.param_forward_order)
+        )
+        self.assertEqual(exec_info.visited_params, set(exec_info.param_forward_order))
 
 
 instantiate_parametrized_tests(TestSymbolicTracing)
diff --git a/torch/distributed/fsdp/_symbolic_trace.py b/torch/distributed/fsdp/_symbolic_trace.py
deleted file mode 100644
index f6fe5e432252e..0000000000000
--- a/torch/distributed/fsdp/_symbolic_trace.py
+++ /dev/null
@@ -1,250 +0,0 @@
-import contextlib
-import functools
-from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, Generator, List, Optional, Tuple
-
-import torch
-
-__all__ = ["TracingConfig"]
-
-
-@dataclass
-class TracingConfig:
-    """
-    Configurations used in ``ParamExecOrderWrapPolicy`` for symbolic tracing of
-    a model.
-
-    Args:
-        tracer (torch.fx.Tracer): An instance of ``torch.fx.Tracer`` that will
-            be used to perform symbolic tracing. ``tracer`` is default to be
-            ``torch.fx.Tracer()``, but can also be instance of some child class
-            of ``torch.fx.Tracer``. For example, one may want to use
-            ``HFTracer`` for models in Transformers: .. _Transformers:
-            https://huggingface.co/docs/transformers/index
-        concrete_args (Optional[Dict[str, Any]]): Concrete arguments that should
-            not be treated as ``torch.fx.Proxy`` when tracing the forward
-            function. ``concrete_args`` allows one to partially specialize the
-            forward function, including removing control flow or data
-            structures. ``concrete_args`` is also the argument used in
-            :meth:`~torch.fx.Tracer.trace`.
-    """
-
-    tracer: torch.fx.Tracer = torch.fx.Tracer()
-    concrete_args: Optional[Dict[str, Any]] = None
-
-
-@dataclass
-class _ExecutionInfo:
-    """
-    Contains the execution order information in the model forward pass.
-
-    Attributes:
-        current_module: record the module that is currently being traced.
-
-        module_forward_order: a list of modules, where the ordering is based on
-            when their forward function is called. ``module_forward_order``
-            includes the info of how many times a module is called + used to
-            check the forward order in different iterations.
-
-        param_exec_order: a list of parameters ordered based on their execution
-        order.
-
-        module_to_execution_infos: a dict that maps each module to a list of
-            tuples each containing a module and a list of named parameters.
-            ``module_execution_info_dict`` is used as the parameter execution
-            order info. For a given module, each tuple: 1. either contains this
-            module and part of its ``named_parameters`` that will be executed
-            together, 2. or contains one of its child modules and all of the
-            child module's ``named_parameters``. The list of tuples is ordered
-            based on the parameter execution order.
-    """
-
-    current_module: torch.nn.Module
-    module_forward_order: List[torch.nn.Module]
-    module_to_execution_infos: Dict[
-        torch.nn.Module,
-        List[Tuple[torch.nn.Module, List[Tuple[str, torch.nn.Parameter]]]],
-    ]
-    param_exec_order: List[torch.nn.Parameter] = field(default_factory=list)
-
-
-def _init_execution_info(root_module: torch.nn.Module) -> _ExecutionInfo:
-    """
-    Create an instance of _ExecutionInfo with initialization based on
-    ``root_module``.
-
-    Args:
-        root_module (torch.nn.Module): the module to get the execution
-        information via ``tracer.trace()`` inside ``_patch_tracer``.
-    """
-    return _ExecutionInfo(
-        current_module=root_module,
-        module_forward_order=[root_module],
-        module_to_execution_infos={root_module: []},
-    )
-
-
-def _patched_create_proxy(
-    create_proxy: Callable,
-    execution_info: _ExecutionInfo,
-    prefixed_param_name_to_param: Dict[str, torch.nn.Parameter],
-    kind: str,
-    target: torch.fx.node.Target,
-    args: Tuple[Any, ...],
-    kwargs: Dict[str, Any],
-    name: Optional[str] = None,
-    type_expr: Optional[Any] = None,
-    proxy_factory_fn: Callable[[torch.fx.Node], torch.fx.Proxy] = None,
-) -> torch.fx.Proxy:
-    """
-    Override of :meth:`~torch.fx.Tracer.create_proxy`. ``Tracer.create_proxy``
-    is called in symbolic tracing for each leaf function/method/module. This
-    override intercepts the recording of each of these operations to update
-    ``execution_info.module_to_execution_infos``.
-
-    Args:
-        create_proxy (Callable):
-            The ``create_proxy`` function to be patched.
-        execution_info (_ExecutionInfo):
-            Used to record the execution information.
-        prefixed_param_name_to_param (Dict[str, torch.nn.Parameter]):
-            A dict that maps each prefixed parameter name to the parameter.
-        kind (str):
-            The type of the target method. One of 'call_function',
-            'call_method', 'get_attr', 'call_module', 'placeholder', or
-            'output'. The semantics of these opcodes are described in the
-            ``torch.fx.Graph`` docstring. This is the input to ``create_proxy``.
-        target (torch.fx.node.Target):
-            Contains the string name of the method. This is the input to
-            ``create_proxy``.
-        args (Tuple[Any, ...]):
-            Arguments of the method. This is the input to ``create_proxy``.
-        kwargs (Dict[str, Any]):
-            Keyword arguments of the method. This is the input to
-            ``create_proxy``.
-        name (Optional[str]):
-            An optional string name for the ``Node`` created in
-            ``create_proxy``. This is the input to ``create_proxy``.
-        type_expr (Optional[Any]):
-            An optional type annotation representing the Python type the output
-            of a node will have. This is the input to ``create_proxy``.
-        proxy_factory_fn (Callable[[torch.fx.Node], torch.fx.Proxy]):
-            An alternative proxy constructor used in ``create_proxy``. This is
-            the input to ``create_proxy``.
-    """
-    proxy = create_proxy(kind, target, args, kwargs, name, type_expr, proxy_factory_fn)
-
-    module = execution_info.current_module
-    if kind in ["call_function", "call_method"]:
-        if args is not None:
-            named_params: List[Tuple[str, torch.nn.Parameter]] = []
-            for arg in args:
-                if (
-                    isinstance(arg, torch.fx.Proxy)
-                    and arg.node.target in prefixed_param_name_to_param
-                ):
-                    param = prefixed_param_name_to_param[arg.node.target]
-                    named_params.append((arg.node.target, param))
-                    if param not in set(execution_info.param_exec_order):
-                        execution_info.param_exec_order.append(param)
-            if named_params:
-                execution_info.module_to_execution_infos[module].append(
-                    (module, named_params)
-                )
-    elif kind == "call_module":
-        named_params = list(module.named_parameters())
-        if named_params:
-            execution_info.module_to_execution_infos[module].append(
-                (module, named_params)
-            )
-        for (_, p) in named_params:
-            if p not in set(execution_info.param_exec_order):
-                execution_info.param_exec_order.append(p)
-    return proxy
-
-
-def _patched_call_module(
-    call_module: Callable,
-    execution_info: _ExecutionInfo,
-    module: torch.nn.Module,
-    forward: Callable[..., Any],
-    args: Tuple[Any, ...],
-    kwargs: Dict[str, Any],
-) -> Any:
-    """
-    Override of :meth:`~torch.fx.Tracer.call_module`. ``Tracer.call_module`` is
-    called in symbolic tracing for each non-root module. This override
-    intercepts the recording of each operation to update
-    ``execution_info.module_forward_order`` and
-    ``execution_info.module_to_execution_infos``.
-
-    Args:
-        call_module (Callable):
-            The ``call_module`` function to be patched.
-        execution_info (_ExecutionInfo):
-            Used to repord the execution information.
-        module (torch.nn.Module):
-            The module for which a call is being emitted.
-        forward (Callable[..., Any]):
-            The ``forward()`` method of the ``torch.nn.Module`` to be invoked.
-        args (Tuple[Any, ...]):
-            ``args`` of the module callsite.
-        kwargs (Dict[str, Any]):
-            ``kwargs`` of the module callsite.
-    """
-    execution_info.module_forward_order.append(module)
-    named_params = list(module.named_parameters())
-    if named_params:
-        execution_info.module_to_execution_infos[execution_info.current_module].append(
-            (module, list(module.named_parameters()))
-        )
-    # Stores away current_module for restoration later
-    prev_current_module = execution_info.current_module
-    execution_info.current_module = module
-    # Note that if the forward of module is called multiple times, this will record
-    # the execution info of the last forward pass.
-    execution_info.module_to_execution_infos[module] = []
-    output = call_module(module, forward, args, kwargs)
-    execution_info.current_module = prev_current_module
-    return output
-
-
-@contextlib.contextmanager
-def _patch_tracer(
-    tracer: torch.fx.Tracer,
-    root_module: torch.nn.Module,
-    execution_info: _ExecutionInfo,
-) -> Generator:
-    """
-    Within the context manager, patches the input tracer so that during
-    ``tracer.trace()``, the forward order of all modules and the parameter
-    execution information are recorded. The patches of the input tracer will be
-    removed after the context manager exits.
-
-    Args:
-        tracer (torch.fx.Tracer): the input ``tracer`` whose member functions
-            will be patched within the context manager.
-        root_module (torch.nn.Module): the top-level module to be traced
-            and should not contain any FSDP modules.
-        execution_info (_ExecutionInfo): used to record the execution order
-            information when performing ``tracer.trace()`` within the context
-            manager.
-    """
-    original_call_module = tracer.call_module
-    original_create_proxy = tracer.create_proxy
-
-    tracer.call_module = functools.partial(
-        _patched_call_module, original_call_module, execution_info
-    )
-    prefixed_param_name_to_param = dict(root_module.named_parameters())
-    tracer.create_proxy = functools.partial(
-        _patched_create_proxy,
-        original_create_proxy,
-        execution_info,
-        prefixed_param_name_to_param,
-    )
-    try:
-        yield
-    finally:
-        tracer.call_module = original_call_module
-        tracer.create_proxy = original_create_proxy
diff --git a/torch/distributed/fsdp/_trace_utils.py b/torch/distributed/fsdp/_trace_utils.py
new file mode 100644
index 0000000000000..cb2ca8ad44a30
--- /dev/null
+++ b/torch/distributed/fsdp/_trace_utils.py
@@ -0,0 +1,237 @@
+import functools
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple
+
+import torch
+import torch.nn as nn
+
+
+@dataclass
+class TracingConfig:
+    """
+    This represents a symbolic tracing configuration.
+
+    Args:
+        tracer (torch.fx.Tracer): An instance of :class:`torch.fx.Tracer` to
+            use for symbolic tracing. The default value is the native
+            :class:`torch.fx.Tracer` constructed with default arguments.
+            However, the user may want to pass a different value such as the
+            ``HFTracer`` for models in the HuggingFace Transformers_ library.
+            .. _Transformers: https://huggingface.co/docs/transformers/index
+        concrete_args (Optional[Dict[str, Any]]): Concrete arguments that
+            should not be treated as ``torch.fx.Proxy`` when tracing the
+            module ``forward()``. Passing ``concrete_args`` allows partially
+            specializing the forward, e.g. to remove control flow or data
+            structures. This ``concrete_args`` here is the same argument used
+            in :meth:`~torch.fx.Tracer.trace`.
+    """
+
+    tracer: torch.fx.Tracer = torch.fx.Tracer()
+    concrete_args: Optional[Dict[str, Any]] = None
+
+
+class _ParamUsageInfo(NamedTuple):
+    """
+    This is used for ``_ExecutionInfo.module_to_param_usage_infos`` to record
+    execution information. The ``dict`` maps modules to a list of these
+    ``_ParamUsageInfo`` instances, where each instance represents a group of
+    parameters used together.
+
+    Specifically, for each module key in the ``dict``, each instance of this
+    class represents either:
+    (1) the module and some sublist of its ``named_parameters()`` used
+    together in execution (see ``_patched_create_proxy()``), or
+    (2) a submodule and all of ``submodule.named_parameters()`` (see
+    ``_patched_call_module()``).
+
+    Type (1) corresponds to directly using parameters in ops without calling
+    ``forward()``, and type (2) corresponds to calling ``forward()``. The
+    mapped-to lists in the ``dict`` follow the execution order.
+    """
+
+    module: nn.Module
+    named_params: List[Tuple[str, nn.Parameter]]
+
+
+class _ExecutionInfo:
+    """
+    This represents the execution order information from the forward pass.
+
+    Attributes:
+        curr_module (nn.Module): Current module being traced.
+        module_forward_order (List[nn.Module]): The modules in (pre-)forward
+            order, i.e. the order in which their ``forward()`` methods are
+            called. Each call to a module's ``forward()`` corresponds to one
+            element in the list.
+        module_to_param_usage_infos (Dict[nn.Module, List[_ParamUsageInfo]]):
+            Maps a module to a list of module execution infos. See
+            :class:`_ParamUsageInfo` for details.
+        param_forward_order (List[nn.Parameter]): The parameters in forward
+            execution order, where only a parameter's first participation is
+            included.
+        visited_params (Set[nn.Parameter]): The parameters visited so far
+            during the trace. This is only used during tracing for fast
+            membership check. Invariant: The parameters in
+            ``param_forward_order`` are exactly those in ``visited_params``.
+    """
+
+    def __init__(self, root_module: nn.Module) -> None:
+        self.curr_module: nn.Module = root_module
+        self.module_forward_order: List[nn.Module] = [root_module]
+        self.module_to_param_usage_infos: Dict[nn.Module, List[_ParamUsageInfo]] = {
+            root_module: []
+        }
+        self.param_forward_order: List[nn.Parameter] = []
+        self.visited_params: Set[nn.Parameter] = set()
+
+
+class _ExecOrderTracer:
+    def __init__(self) -> None:
+        self.exec_info: Optional[_ExecutionInfo] = None
+
+    @contextmanager
+    def patch_tracer(self, tracer: torch.fx.Tracer, root_module: nn.Module):
+        self.exec_info = _ExecutionInfo(root_module)
+        orig_call_module = tracer.call_module
+        orig_create_proxy = tracer.create_proxy
+        tracer.call_module = functools.partial(
+            self._patched_call_module, orig_call_module, self.exec_info
+        )
+        fqn_to_param = dict(root_module.named_parameters())
+        tracer.create_proxy = functools.partial(
+            self._patched_create_proxy,
+            orig_create_proxy,
+            self.exec_info,
+            fqn_to_param,
+        )
+        try:
+            yield
+        finally:
+            tracer.call_module = orig_call_module
+            tracer.create_proxy = orig_create_proxy
+
+    def _patched_call_module(
+        self,
+        call_module: Callable,
+        exec_info: _ExecutionInfo,
+        # Below are the expected arguments to `call_module()`
+        module: nn.Module,
+        forward: Callable,
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> Any:
+        """
+        Overrides ``call_module`` to save execution information to
+        ``exec_info``. Note that ``call_module`` is called during symbolic
+        tracing for each non-root module.
+
+        Args:
+            call_module (Callable): Original ``call_module`` to override.
+            exec_info (_ExecutionInfo): Used to record execution information.
+            module (nn.Module): Module corresponding to this ``call_module``.
+            forward (Callable): ``forward()`` method of ``module`` to be called
+                for this ``call_module``.
+            args (Tuple[Any, ...]): Positional arguments for ``forward``.
+            kwargs (Dict[str, Any]): Keyword arguments for ``forward``.
+
+        Returns:
+            Same return value as ``call_module``.
+        """
+        exec_info.module_forward_order.append(module)
+        named_params = list(module.named_parameters())
+        curr_module = exec_info.curr_module
+        if named_params:
+            assert (
+                curr_module in exec_info.module_to_param_usage_infos
+            ), "The current module should have already been processed by a patched `call_module`"
+            exec_info.module_to_param_usage_infos[exec_info.curr_module].append(
+                _ParamUsageInfo(module, named_params)
+            )
+        prev_curr_module = curr_module
+        exec_info.curr_module = module
+        exec_info.module_to_param_usage_infos[module] = []
+        output = call_module(module, forward, args, kwargs)
+        exec_info.curr_module = prev_curr_module
+        return output
+
+    def _patched_create_proxy(
+        self,
+        create_proxy: Callable,
+        exec_info: _ExecutionInfo,
+        fqn_to_param: Dict[str, nn.Parameter],
+        # Below are the expected arguments to `create_proxy()`
+        kind: str,
+        target: torch.fx.node.Target,
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+        name: Optional[str] = None,
+        type_expr: Optional[Any] = None,
+        proxy_factory_fn: Callable[[torch.fx.Node], torch.fx.Proxy] = None,
+    ) -> torch.fx.Proxy:
+        """
+        Overrides ``create_proxy`` to save execution information to
+        ``exec_info``. Note that ``create_proxy`` is called during symbolic
+        tracing for each leaf function/method/module.
+
+        Args:
+            create_proxy (Callable): Original ``create_proxy`` to override.
+            exec_info (_ExecutionInfo): Used to record execution information.
+            fqn_to_param (Dict[str, nn.Parameter]): ``dict`` version of the
+                root module's ``named_parameters()`` with FQN as key and
+                parameter as value.
+            kind (str): Kind of the target method ('call_function',
+                'call_method', 'get_attr', 'call_module', 'placeholder', or
+                'output'). See :class:`torch.fx.Graph` for details. This is
+                passed to ``create_proxy``.
+            target (torch.fx.node.Target): Contains the string name of the
+                function/method/module. This is passed to ``create_proxy``.
+            args (Tuple[Any, ...]): Positional arguments for the function/
+                method/module. This is passed to ``create_proxy``.
+            kwargs (Dict[str, Any]): Keyword arguments for the function/method/
+                module. This is passed to ``create_proxy``
+            name (Optional[str]): An optional string name for the ``Node``
+                created in ``create_proxy``. This is passed to
+                ``create_proxy``.
+            type_expr (Optional[Any]): An optional type annotation representing
+                the Python type that the output of the node has. This is passed
+                to ``create_proxy``.
+            proxy_factory_fn (Callable[[torch.fx.Node], torch.fx.Proxy]):
+                An alternative proxy constructor used in ``create_proxy``. This
+                is passed to ``create_proxy``.
+
+        Returns:
+            torch.fx.Proxy: Created ``Node`` wrapped in a ``Proxy`` object.
+        """
+        proxy = create_proxy(
+            kind, target, args, kwargs, name, type_expr, proxy_factory_fn
+        )
+        curr_module = exec_info.curr_module
+        if kind in ("call_function", "call_method"):
+            if args is not None:
+                named_params: List[Tuple[str, nn.Parameter]] = []
+                for arg in args:
+                    if (
+                        isinstance(arg, torch.fx.Proxy)
+                        and arg.node.target in fqn_to_param
+                    ):
+                        param = fqn_to_param[arg.node.target]
+                        named_params.append((arg.node.target, param))
+                        if param not in exec_info.visited_params:
+                            exec_info.visited_params.add(param)
+                            exec_info.param_forward_order.append(param)
+                if named_params:
+                    exec_info.module_to_param_usage_infos[curr_module].append(
+                        _ParamUsageInfo(curr_module, named_params)
+                    )
+        elif kind == "call_module":
+            named_params = list(curr_module.named_parameters())
+            if named_params:
+                exec_info.module_to_param_usage_infos[curr_module].append(
+                    _ParamUsageInfo(curr_module, named_params)
+                )
+            for _, param in named_params:
+                if param not in exec_info.visited_params:
+                    exec_info.visited_params.add(param)
+                    exec_info.param_forward_order.append(param)
+        return proxy

From 22287f907138df05a13854ec5406af4c0e97e499 Mon Sep 17 00:00:00 2001
From: "Yu, Guangye" <guangye.yu@intel.com>
Date: Wed, 30 Nov 2022 20:38:02 +0000
Subject: [PATCH 1436/1922] add XPU backend to support torch.save and
 torch.load (#89679)

# Motivate
We need to add XPU backend to support torch.save and torch.load when parameter _use_new_zipfile_serialization=False.

# Solution
We give a design via wrap data as a tensor:
>1. and use an in-place copy for H2D
>2. directly call a tensor.to() for D2H.

This can help us:
>1. unify the generic code for all backends.
>2. support all the non-CPU device backends.

# Additional Context
No need more UT.
test/test_serialization.py will cover this code change.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89679
Approved by: https://github.com/ezyang
---
 test/test_torch.py           |  2 +-
 torch/csrc/serialization.cpp | 60 +++++++++++++++---------------------
 2 files changed, 25 insertions(+), 37 deletions(-)

diff --git a/test/test_torch.py b/test/test_torch.py
index df907d826d805..102370aea2be2 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -358,7 +358,7 @@ def test_storage_meta_errors(self, device, dtype):
             s0.tolist()
 
         with tempfile.NamedTemporaryFile() as f:
-            with self.assertRaisesRegex(RuntimeError, r'Device not recognized'):
+            with self.assertRaisesRegex(NotImplementedError, r'Cannot copy out'):
                 s0._write_file(f, True, True, s0.element_size())
 
         for device in ['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']:
diff --git a/torch/csrc/serialization.cpp b/torch/csrc/serialization.cpp
index 385a074b1ccb4..5cc88ad0f1887 100644
--- a/torch/csrc/serialization.cpp
+++ b/torch/csrc/serialization.cpp
@@ -1,6 +1,7 @@
 #include <torch/csrc/python_headers.h>
 #include <system_error>
 
+#include <ATen/ops/from_blob.h>
 #include <c10/core/CPUAllocator.h>
 #include <torch/csrc/THP.h>
 #include <torch/csrc/serialization.h>
@@ -228,32 +229,22 @@ void THPStorage_writeFileRaw(
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   uint8_t* data;
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  std::unique_ptr<char[]> cpu_data;
+  at::Tensor cpu_tensor;
   int64_t size_bytes = self->nbytes();
   int64_t numel = size_bytes / element_size;
   if (self->device_type() == at::kCPU) {
     data = self->data<uint8_t>();
-#if defined(USE_CUDA) && defined(TORCH_HIP_VERSION) && \
-    (TORCH_HIP_VERSION >= 301)
-  } else if (self->device_type() == at::kCUDA) {
-    cpu_data = std::unique_ptr<char[]>(new char[size_bytes]);
-    data = (uint8_t*)cpu_data.get();
-    C10_CUDA_CHECK(hipMemcpyWithStream(
-        data,
-        self->data<uint8_t>(),
-        size_bytes,
-        cudaMemcpyDeviceToHost,
-        c10::hip::getCurrentHIPStreamMasqueradingAsCUDA()));
-#elif defined(USE_CUDA)
-  } else if (self->device_type() == at::kCUDA) {
-    cpu_data = std::unique_ptr<char[]>(new char[size_bytes]);
-    data = (uint8_t*)cpu_data.get();
-    C10_CUDA_CHECK(cudaMemcpy(
-        data, self->data<uint8_t>(), size_bytes, cudaMemcpyDeviceToHost));
-#endif
   } else {
-    TORCH_CHECK(
-        false, "writeFileRaw: Device not recognized: ", self->device_type());
+    // Here we use a tensor.to() to impl D2H for all non-CPU device.
+    auto device_tensor = at::from_blob(
+        self->data<void>(),
+        {size_bytes},
+        {1},
+        NULL,
+        at::device(self->device()).dtype(c10::kByte),
+        {self->device()});
+    cpu_tensor = device_tensor.to(at::kCPU);
+    data = (uint8_t*)cpu_tensor.data_ptr();
   }
   if (save_size) {
     if (torch::utils::THP_nativeByteOrder() ==
@@ -409,22 +400,19 @@ c10::intrusive_ptr<c10::StorageImpl> THPStorage_readFileRaw(
     }
   }
 
-#if defined(USE_CUDA) && defined(TORCH_HIP_VERSION) && \
-    (TORCH_HIP_VERSION >= 301)
-  if (storage->device_type() == at::kCUDA) {
-    C10_CUDA_CHECK(hipMemcpyWithStream(
-        storage->data<uint8_t>(),
-        data,
-        nbytes,
-        cudaMemcpyHostToDevice,
-        c10::hip::getCurrentHIPStreamMasqueradingAsCUDA()));
-  }
-#elif defined(USE_CUDA)
-  if (storage->device_type() == at::kCUDA) {
-    C10_CUDA_CHECK(cudaMemcpy(
-        storage->data<uint8_t>(), data, nbytes, cudaMemcpyHostToDevice));
+  if (storage->device_type() != at::kCPU) {
+    // Here we use a tensor.copy_() to impl H2D for all non-CPU device.
+    auto cpu_tensor = at::from_blob(
+        (void*)data, {nbytes}, at::device(at::kCPU).dtype(c10::kByte));
+    auto device_tensor = at::from_blob(
+        storage->data<void>(),
+        {nbytes},
+        {1},
+        NULL,
+        at::device(storage->device()).dtype(c10::kByte),
+        {storage->device()});
+    device_tensor.copy_(cpu_tensor);
   }
-#endif
   return storage;
 }
 

From 911ffc5e282da92c917bbc639148d1a493029a0f Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@meta.com>
Date: Wed, 30 Nov 2022 20:46:13 +0000
Subject: [PATCH 1437/1922] [BE] Beef up test_functionalization to test
 functionalizing multi-parameter functions (#89798)

Previously, `assert_functionalization` only took in uni-Tensor-parameter functions. This PR beefs up the check to allow for functions that take multiple parameters.

This PR also changes the test_instance_norm test to check that the multiparam change works.

## Test plan
Locally tested, CI should also pass.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89798
Approved by: https://github.com/samdow
---
 test/test_functionalization.py | 472 ++++++++++++++-------------------
 1 file changed, 203 insertions(+), 269 deletions(-)

diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index aa97b2a392389..cf1d6b0145358 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -7,7 +7,7 @@
     xfail_inherited_tests
 )
 from torch.testing._internal.logging_tensor import LoggingTensor, capture_logs
-from torch.utils._pytree import tree_map
+from torch.utils._pytree import tree_map, tree_map_only, tree_flatten
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.passes.reinplace import reinplace
 from torch._dispatch.python import enable_crossref_functionalize, enable_python_dispatcher
@@ -25,27 +25,34 @@ def are_aliased(x, y):
 
 # We can unify testing and use functionalize() here instead
 # if/when functorch moves into core.
-# This is basically a crappy version of `functionalize()` for single-tensor-arg inputs.
+# This is basically a crappy version of `functionalize()`.
 def _functionalize(f, *, reapply_views: bool, crossref: bool):
-    def wrapped(a):
+    def to_fun(t: torch.Tensor):
+        func_t = torch._to_functional_tensor(t)
+        func_t.requires_grad = t.requires_grad
+        return func_t
+
+    def wrapped(*inputs):
         ctx = nullcontext()
         if crossref:
             ctx = enable_crossref_functionalize()
         with ctx:
-            input_functional = torch._to_functional_tensor(a)
-            input_functional.requires_grad = a.requires_grad
+            inputs_functional = tree_map_only(torch.Tensor, to_fun, inputs)
             torch._enable_functionalization(reapply_views=reapply_views)
             try:
-                out = f(input_functional)
+                out = f(*inputs_functional)
             finally:
                 torch._disable_functionalization()
-            torch._sync(input_functional)
-            inpt_new = torch._from_functional_tensor(input_functional)
-            if inpt_new is not a:
-                # Existing deficiency in functionalize():
-                # we don't correctly mutate input metadata (yet?)
-                if inpt_new.shape == a.shape:
-                    a.copy_(inpt_new)
+            flat_inputs, _ = tree_flatten(inputs)
+            flat_inputs_functional, _ = tree_flatten(inputs_functional)
+            for inpt, input_functional in zip(flat_inputs, flat_inputs_functional):
+                torch._sync(input_functional)
+                inpt_new = torch._from_functional_tensor(input_functional)
+                if inpt_new is not inpt:
+                    # Existing deficiency in functionalize():
+                    # we don't correctly mutate input metadata (yet?)
+                    if inpt_new.shape == inpt.shape:
+                        inpt.copy_(inpt_new)
             tree_map(torch._sync, out)
             out_unwrapped = tree_map(torch._from_functional_tensor, out)
             return out_unwrapped
@@ -57,40 +64,40 @@ class TestFunctionalization(TestCase):
 
     crossref = False
 
-    def get_logs(self, func, inpt, *, reapply_views=False, run_reinplace=False):
-        inpt_clone = inpt.clone()
-        traced_f = make_fx(_functionalize(func, reapply_views=reapply_views, crossref=self.crossref))(inpt)
+    def get_logs(self, func, *inpts, reapply_views=False, run_reinplace=False):
+        inpts_clone = tree_map_only(torch.Tensor, torch.clone, inpts)
+        traced_f = make_fx(_functionalize(func, reapply_views=reapply_views, crossref=self.crossref))(*inpts)
         if run_reinplace:
-            traced_f = reinplace(traced_f, inpt_clone)
+            traced_f = reinplace(traced_f, *inpts_clone)
         return traced_f.code
 
-    def assert_functionalization(self, func, inpt, *, reapply_views=False, mutated_input_metadata=False):
-        input_clone = inpt.clone()
-        input_clone2 = inpt.clone()
-        input_clone3 = inpt.clone()
+    def assert_functionalization(self, func, *inpts, reapply_views=False, mutated_input_metadata=False):
+        clones1 = tree_map_only(torch.Tensor, torch.clone, inpts)
+        clones2 = tree_map_only(torch.Tensor, torch.clone, inpts)
+        clones3 = tree_map_only(torch.Tensor, torch.clone, inpts)
 
         # Compare outputs (and mutated inputs), with and without functionalization.
-        out_ref = func(inpt)
-        out_functional = _functionalize(func, reapply_views=reapply_views, crossref=self.crossref)(input_clone)
+        out_ref = func(*inpts)
+        out_functional = _functionalize(func, reapply_views=reapply_views, crossref=self.crossref)(*clones1)
+
         # The reinplacing pass is only valid to run with reapply_views=True.
-        functional_func = make_fx(_functionalize(func, reapply_views=True, crossref=self.crossref))(input_clone2)
-        reinplace_func = reinplace(
-            make_fx(
-                _functionalize(func, reapply_views=True, crossref=self.crossref)
-            )(input_clone2),
-            input_clone2
-        )
+        functional_func = make_fx(_functionalize(func, reapply_views=True, crossref=self.crossref))(*clones2)
+        reinplace_func = reinplace(functional_func, *clones2)
 
         # NOTE: for now, need to pass in fresh inputs here, because make_fx
         # will directly mutate the inputs that you trace with.
         # Once this is fixed we can clean this up.
-        out_reinplace = reinplace_func(input_clone3)
+        out_reinplace = reinplace_func(*clones3)
 
         # functionalize() deficiency: input metadata mutations aren't propagated properly,
         # so we just need to skip checks here for the tests that exercise that.
         if not mutated_input_metadata:
-            self.assertEqual(inpt, input_clone)  # input mutations should still occur
-            self.assertEqual(inpt, input_clone3)
+            flat_inpts, _ = tree_flatten(inpts)
+            flat_clones1, _ = tree_flatten(clones1)
+            flat_clones3, _ = tree_flatten(clones3)
+            for inpt, input_clone, input_clone3 in zip(flat_inpts, flat_clones1, flat_clones3):
+                self.assertEqual(inpt, input_clone)  # input mutations should still occur
+                self.assertEqual(inpt, input_clone3)
 
         # Handle tests with multi-tensor outputs
         if isinstance(out_ref, tuple):
@@ -163,8 +170,8 @@ def g(x):
 
 
-def forward(self, a_1):
-    view_copy = torch.ops.aten.view_copy.default(a_1, [1, 1024, 128, 128]);  a_1 = None
+def forward(self, arg0_1):
+    view_copy = torch.ops.aten.view_copy.default(arg0_1, [1, 1024, 128, 128]);  arg0_1 = None
     clone = torch.ops.aten.clone.default(view_copy);  view_copy = None
     view_copy_1 = torch.ops.aten.view_copy.default(clone, [16, 64, 128, 128])
     relu = torch.ops.aten.relu.default(view_copy_1);  view_copy_1 = None
@@ -202,13 +209,13 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([4, 2], device = device(type='cpu'), pin_memory = False)
-    view_copy = torch.ops.aten.view_copy.default(a_1, [4, 2])
+    view_copy = torch.ops.aten.view_copy.default(arg0_1, [4, 2])
     add = torch.ops.aten.add.Tensor(view_copy, ones);  view_copy = ones = None
     view_copy_1 = torch.ops.aten.view_copy.default(add, [4, 2])
     mul = torch.ops.aten.mul.Tensor(view_copy_1, view_copy_1)
-    copy_ = torch.ops.aten.copy_.default(a_1, view_copy_1);  a_1 = view_copy_1 = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, view_copy_1);  arg0_1 = view_copy_1 = None
     return add
     """)
 
@@ -217,13 +224,13 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([4, 2], device = device(type='cpu'), pin_memory = False)
-    view = torch.ops.aten.view.default(a_1, [4, 2])
+    view = torch.ops.aten.view.default(arg0_1, [4, 2])
     add = torch.ops.aten.add.Tensor(view, ones);  view = ones = None
     view_1 = torch.ops.aten.view.default(add, [4, 2])
     mul = torch.ops.aten.mul.Tensor(view_1, view_1)
-    copy_ = torch.ops.aten.copy_.default(a_1, view_1);  a_1 = view_1 = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, view_1);  arg0_1 = view_1 = None
     return add
     """)
 
@@ -242,9 +249,9 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([4, 2], device = device(type='cpu'), pin_memory = False)
-    view_copy = torch.ops.aten.view_copy.default(a_1, [4, 2]);  a_1 = None
+    view_copy = torch.ops.aten.view_copy.default(arg0_1, [4, 2]);  arg0_1 = None
     empty = torch.ops.aten.empty.memory_format([], device = device(type='cpu'), pin_memory = False)
     add = torch.ops.aten.add.Tensor(view_copy, ones);  view_copy = ones = None
     mul = torch.ops.aten.mul.Tensor(add, add);  add = None
@@ -256,9 +263,9 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([4, 2], device = device(type='cpu'), pin_memory = False)
-    view = torch.ops.aten.view.default(a_1, [4, 2]);  a_1 = None
+    view = torch.ops.aten.view.default(arg0_1, [4, 2]);  arg0_1 = None
     empty = torch.ops.aten.empty.memory_format([], device = device(type='cpu'), pin_memory = False)
     add = torch.ops.aten.add.Tensor(view, ones);  view = ones = None
     mul = torch.ops.aten.mul.Tensor(add, add);  add = None
@@ -279,10 +286,10 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     empty = torch.ops.aten.empty.memory_format([4], device = device(type='cpu'), pin_memory = False)
     empty_1 = torch.ops.aten.empty.memory_format([4], device = device(type='cpu'), pin_memory = False)
-    aminmax = torch.ops.aten.aminmax.default(a_1, dim = 0);  a_1 = None
+    aminmax = torch.ops.aten.aminmax.default(arg0_1, dim = 0);  arg0_1 = None
     getitem = aminmax[0]
     getitem_1 = aminmax[1];  aminmax = None
     return getitem
@@ -293,10 +300,10 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     empty = torch.ops.aten.empty.memory_format([4], device = device(type='cpu'), pin_memory = False)
     empty_1 = torch.ops.aten.empty.memory_format([4], device = device(type='cpu'), pin_memory = False)
-    aminmax = torch.ops.aten.aminmax.default(a_1, dim = 0);  a_1 = None
+    aminmax = torch.ops.aten.aminmax.default(arg0_1, dim = 0);  arg0_1 = None
     getitem = aminmax[0]
     getitem_1 = aminmax[1];  aminmax = None
     return getitem
@@ -317,7 +324,7 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     _tensor_constant0 = self._tensor_constant0
     lift_fresh_copy = torch.ops.aten.lift_fresh_copy.default(_tensor_constant0);  _tensor_constant0 = None
     view_copy = torch.ops.aten.view_copy.default(lift_fresh_copy, [-1]);  lift_fresh_copy = None
@@ -331,7 +338,7 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     _tensor_constant0 = self._tensor_constant0
     lift_fresh_copy = torch.ops.aten.lift_fresh_copy.default(_tensor_constant0);  _tensor_constant0 = None
     view = torch.ops.aten.view.default(lift_fresh_copy, [-1]);  lift_fresh_copy = None
@@ -367,11 +374,11 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([4, 2], device = device(type='cpu'), pin_memory = False)
-    view_copy = torch.ops.aten.view_copy.default(a_1, [4, 2])
-    add = torch.ops.aten.add.Tensor(a_1, ones);  ones = None
-    copy_ = torch.ops.aten.copy_.default(a_1, add);  a_1 = None
+    view_copy = torch.ops.aten.view_copy.default(arg0_1, [4, 2])
+    add = torch.ops.aten.add.Tensor(arg0_1, ones);  ones = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, add);  arg0_1 = None
     view_copy_1 = torch.ops.aten.view_copy.default(add, [4, 2]);  add = None
     return view_copy_1
     """)
@@ -381,11 +388,11 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([4, 2], device = device(type='cpu'), pin_memory = False)
-    view = torch.ops.aten.view.default(a_1, [4, 2])
-    add = torch.ops.aten.add.Tensor(a_1, ones);  ones = None
-    copy_ = torch.ops.aten.copy_.default(a_1, add);  a_1 = None
+    view = torch.ops.aten.view.default(arg0_1, [4, 2])
+    add = torch.ops.aten.add.Tensor(arg0_1, ones);  ones = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, add);  arg0_1 = None
     view_1 = torch.ops.aten.view.default(add, [4, 2]);  add = None
     return view_1
     """)
@@ -401,15 +408,15 @@ def f(x):
 
 
-def forward(self, a_1):
-    _fused_moving_avg_obs_fq_helper_functional = torch.ops.aten._fused_moving_avg_obs_fq_helper_functional.default(a_1, a_1, a_1, a_1, a_1, a_1, a_1, 1.0, 0, 1, 0)
+def forward(self, arg0_1):
+    _fused_moving_avg_obs_fq_helper_functional = torch.ops.aten._fused_moving_avg_obs_fq_helper_functional.default(arg0_1, arg0_1, arg0_1, arg0_1, arg0_1, arg0_1, arg0_1, 1.0, 0, 1, 0)
     getitem = _fused_moving_avg_obs_fq_helper_functional[0]
     getitem_1 = _fused_moving_avg_obs_fq_helper_functional[1]
     getitem_2 = _fused_moving_avg_obs_fq_helper_functional[2]
     getitem_3 = _fused_moving_avg_obs_fq_helper_functional[3]
     getitem_4 = _fused_moving_avg_obs_fq_helper_functional[4]
     getitem_5 = _fused_moving_avg_obs_fq_helper_functional[5];  _fused_moving_avg_obs_fq_helper_functional = None
-    copy_ = torch.ops.aten.copy_.default(a_1, getitem_5);  a_1 = getitem_5 = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, getitem_5);  arg0_1 = getitem_5 = None
     return (getitem, getitem_1)
     """)  # noqa: B950
 
@@ -424,11 +431,11 @@ def f(x):
 
 
-def forward(self, a_1):
-    as_strided_copy = torch.ops.aten.as_strided_copy.default(a_1, [2], [2], 1)
+def forward(self, arg0_1):
+    as_strided_copy = torch.ops.aten.as_strided_copy.default(arg0_1, [2], [2], 1)
     add = torch.ops.aten.add.Tensor(as_strided_copy, 1);  as_strided_copy = None
-    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(a_1, add, [2], [2], 1);  add = None
-    copy_ = torch.ops.aten.copy_.default(a_1, as_strided_scatter);  a_1 = None
+    as_strided_scatter = torch.ops.aten.as_strided_scatter.default(arg0_1, add, [2], [2], 1);  add = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, as_strided_scatter);  arg0_1 = None
     return as_strided_scatter
     """)
 
@@ -443,8 +450,8 @@ def f(x):
 
 
-def forward(self, a_1):
-    block_diag = torch.ops.aten.block_diag.default([a_1, a_1]);  a_1 = None
+def forward(self, arg0_1):
+    block_diag = torch.ops.aten.block_diag.default([arg0_1, arg0_1]);  arg0_1 = None
     return block_diag
     """)
 
@@ -459,9 +466,9 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     empty = torch.ops.aten.empty.memory_format([0], device = device(type='cpu'), pin_memory = False)
-    cat = torch.ops.aten.cat.default([a_1]);  a_1 = None
+    cat = torch.ops.aten.cat.default([arg0_1]);  arg0_1 = None
     return cat
     """)
 
@@ -470,9 +477,9 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     empty = torch.ops.aten.empty.memory_format([0], device = device(type='cpu'), pin_memory = False)
-    cat = torch.ops.aten.cat.default([a_1]);  a_1 = None
+    cat = torch.ops.aten.cat.default([arg0_1]);  arg0_1 = None
     return cat
     """)
 
@@ -491,12 +498,12 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([2], device = device(type='cpu'), pin_memory = False)
-    clone = torch.ops.aten.clone.default(a_1)
+    clone = torch.ops.aten.clone.default(arg0_1)
     diagonal_copy = torch.ops.aten.diagonal_copy.default(clone);  clone = None
     add = torch.ops.aten.add.Tensor(diagonal_copy, ones);  diagonal_copy = ones = None
-    mul = torch.ops.aten.mul.Tensor(a_1, a_1);  a_1 = None
+    mul = torch.ops.aten.mul.Tensor(arg0_1, arg0_1);  arg0_1 = None
     return mul
     """)
 
@@ -505,12 +512,12 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([2], device = device(type='cpu'), pin_memory = False)
-    clone = torch.ops.aten.clone.default(a_1)
+    clone = torch.ops.aten.clone.default(arg0_1)
     diagonal = torch.ops.aten.diagonal.default(clone);  clone = None
     add = torch.ops.aten.add_.Tensor(diagonal, ones);  diagonal = ones = None
-    mul = torch.ops.aten.mul.Tensor(a_1, a_1);  a_1 = None
+    mul = torch.ops.aten.mul.Tensor(arg0_1, arg0_1);  arg0_1 = None
     return mul
     """)
 
@@ -528,12 +535,12 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([2], device = device(type='cpu'), pin_memory = False)
-    diagonal_copy = torch.ops.aten.diagonal_copy.default(a_1)
+    diagonal_copy = torch.ops.aten.diagonal_copy.default(arg0_1)
     add = torch.ops.aten.add.Tensor(diagonal_copy, ones);  diagonal_copy = ones = None
-    diagonal_scatter = torch.ops.aten.diagonal_scatter.default(a_1, add);  add = None
-    copy_ = torch.ops.aten.copy_.default(a_1, diagonal_scatter);  a_1 = None
+    diagonal_scatter = torch.ops.aten.diagonal_scatter.default(arg0_1, add);  add = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, diagonal_scatter);  arg0_1 = None
     return diagonal_scatter
     """)
 
@@ -552,20 +559,20 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([2], device = device(type='cpu'), pin_memory = False)
-    split_copy = torch.ops.aten.split_copy.Tensor(a_1, 2)
+    split_copy = torch.ops.aten.split_copy.Tensor(arg0_1, 2)
     getitem = split_copy[0]
     getitem_1 = split_copy[1];  split_copy = None
     diagonal_copy = torch.ops.aten.diagonal_copy.default(getitem_1);  getitem_1 = None
     add = torch.ops.aten.add.Tensor(diagonal_copy, ones);  diagonal_copy = ones = None
-    split_copy_1 = torch.ops.aten.split_copy.Tensor(a_1, 2)
+    split_copy_1 = torch.ops.aten.split_copy.Tensor(arg0_1, 2)
     getitem_2 = split_copy_1[0]
     getitem_3 = split_copy_1[1];  split_copy_1 = None
     diagonal_scatter = torch.ops.aten.diagonal_scatter.default(getitem_3, add);  getitem_3 = None
-    slice_scatter = torch.ops.aten.slice_scatter.default(a_1, diagonal_scatter, 0, 2, 4);  diagonal_scatter = None
+    slice_scatter = torch.ops.aten.slice_scatter.default(arg0_1, diagonal_scatter, 0, 2, 4);  diagonal_scatter = None
     mul = torch.ops.aten.mul.Tensor(slice_scatter, slice_scatter)
-    copy_ = torch.ops.aten.copy_.default(a_1, slice_scatter);  a_1 = slice_scatter = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, slice_scatter);  arg0_1 = slice_scatter = None
     return add
     """)  # noqa: B950
 
@@ -583,12 +590,12 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([4], device = device(type='cpu'), pin_memory = False)
-    transpose_copy = torch.ops.aten.transpose_copy.int(a_1, 1, 0)
+    transpose_copy = torch.ops.aten.transpose_copy.int(arg0_1, 1, 0)
     select_copy = torch.ops.aten.select_copy.int(transpose_copy, 0, 0);  transpose_copy = None
     add = torch.ops.aten.add.Tensor(select_copy, ones);  select_copy = ones = None
-    transpose_copy_1 = torch.ops.aten.transpose_copy.int(a_1, 1, 0);  a_1 = None
+    transpose_copy_1 = torch.ops.aten.transpose_copy.int(arg0_1, 1, 0);  arg0_1 = None
     select_scatter = torch.ops.aten.select_scatter.default(transpose_copy_1, add, 0, 0);  transpose_copy_1 = add = None
     transpose_copy_2 = torch.ops.aten.transpose_copy.int(select_scatter, 1, 0);  select_scatter = None
     transpose_copy_3 = torch.ops.aten.transpose_copy.int(transpose_copy_2, 1, 0);  transpose_copy_2 = None
@@ -610,13 +617,13 @@ def f(x):
 
 
-def forward(self, a_1):
-    view_copy = torch.ops.aten.view_copy.default(a_1, [8])
+def forward(self, arg0_1):
+    view_copy = torch.ops.aten.view_copy.default(arg0_1, [8])
     arange = torch.ops.aten.arange.default(4, device = device(type='cpu'), pin_memory = False)
     arange_1 = torch.ops.aten.arange.default(4, dtype = torch.float32, device = device(type='cpu'), pin_memory = False)
     index_put = torch.ops.aten.index_put.default(view_copy, [arange], arange_1);  view_copy = arange = arange_1 = None
     view_copy_1 = torch.ops.aten.view_copy.default(index_put, [4, 2])
-    copy_ = torch.ops.aten.copy_.default(a_1, view_copy_1);  a_1 = view_copy_1 = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, view_copy_1);  arg0_1 = view_copy_1 = None
     return index_put
     """)  # noqa: B950
 
@@ -635,14 +642,14 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([4, 2], device = device(type='cpu'), pin_memory = False)
-    view_copy = torch.ops.aten.view_copy.default(a_1, [4, 2])
+    view_copy = torch.ops.aten.view_copy.default(arg0_1, [4, 2])
     add = torch.ops.aten.add.Tensor(view_copy, 1);  view_copy = None
     mul = torch.ops.aten.mul.Tensor(add, 2)
     div = torch.ops.aten.div.Tensor(mul, 1);  mul = None
     view_copy_1 = torch.ops.aten.view_copy.default(add, [4, 2]);  add = None
-    copy_ = torch.ops.aten.copy_.default(a_1, view_copy_1);  a_1 = view_copy_1 = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, view_copy_1);  arg0_1 = view_copy_1 = None
     return div
     """)
 
@@ -660,8 +667,8 @@ def f(x):
 
 
-def forward(self, a_1):
-    clone = torch.ops.aten.clone.default(a_1);  a_1 = None
+def forward(self, arg0_1):
+    clone = torch.ops.aten.clone.default(arg0_1);  arg0_1 = None
     ge = torch.ops.aten.ge.Scalar(clone, 0);  clone = None
     _to_copy = torch.ops.aten._to_copy.default(ge, dtype = torch.float32, layout = torch.strided);  ge = None
     return _to_copy
@@ -672,8 +679,8 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
-    clone = torch.ops.aten.clone.default(a_1);  a_1 = None
+def forward(self, arg0_1):
+    clone = torch.ops.aten.clone.default(arg0_1);  arg0_1 = None
     ge = torch.ops.aten.ge.Scalar(clone, 0);  clone = None
     _to_copy = torch.ops.aten._to_copy.default(ge, dtype = torch.float32, layout = torch.strided);  ge = None
     return _to_copy
@@ -708,8 +715,8 @@ def f(x):
 
 
-def forward(self, a_1):
-    view_copy = torch.ops.aten.view_copy.default(a_1, [4, 2]);  a_1 = None
+def forward(self, arg0_1):
+    view_copy = torch.ops.aten.view_copy.default(arg0_1, [4, 2]);  arg0_1 = None
     return view_copy
     """)
 
@@ -733,9 +740,9 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([2, 2], device = device(type='cpu'), pin_memory = False)
-    add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
+    add = torch.ops.aten.add.Tensor(arg0_1, arg0_1);  arg0_1 = None
     view_copy = torch.ops.aten.view_copy.default(add, [8])
     view_copy_1 = torch.ops.aten.view_copy.default(view_copy, [2, 4]);  view_copy = None
     transpose_copy = torch.ops.aten.transpose_copy.int(view_copy_1, 1, 0)
@@ -779,9 +786,9 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([2, 2], device = device(type='cpu'), pin_memory = False)
-    add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
+    add = torch.ops.aten.add.Tensor(arg0_1, arg0_1);  arg0_1 = None
     view = torch.ops.aten.view.default(add, [8])
     view_1 = torch.ops.aten.view.default(view, [2, 4]);  view = None
     transpose = torch.ops.aten.transpose.int(view_1, 1, 0)
@@ -824,13 +831,13 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     ones = torch.ops.aten.ones.default([4, 2], device = device(type='cpu'), pin_memory = False)
-    view = torch.ops.aten.view.default(a_1, [4, 2])
+    view = torch.ops.aten.view.default(arg0_1, [4, 2])
     add = torch.ops.aten.add.Tensor(view, ones);  view = ones = None
     view_1 = torch.ops.aten.view.default(add, [4, 2])
     mul = torch.ops.aten.mul.Tensor(view_1, view_1)
-    copy_ = torch.ops.aten.copy_.default(a_1, view_1);  a_1 = view_1 = None
+    copy_ = torch.ops.aten.copy_.default(arg0_1, view_1);  arg0_1 = view_1 = None
     return add
     """)
 
@@ -874,11 +881,11 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal_copy = torch.ops.aten.diagonal_copy.default(zeros);  zeros = None
-    copy = torch.ops.aten.copy.default(diagonal_copy, a_1);  diagonal_copy = None
-    add = torch.ops.aten.add.Tensor(copy, a_1);  copy = a_1 = None
+    copy = torch.ops.aten.copy.default(diagonal_copy, arg0_1);  diagonal_copy = None
+    add = torch.ops.aten.add.Tensor(copy, arg0_1);  copy = arg0_1 = None
     return add
     """)
 
@@ -887,11 +894,11 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal = torch.ops.aten.diagonal.default(zeros);  zeros = None
-    copy = torch.ops.aten.copy_.default(diagonal, a_1)
-    add = torch.ops.aten.add_.Tensor(diagonal, a_1);  a_1 = None
+    copy = torch.ops.aten.copy_.default(diagonal, arg0_1)
+    add = torch.ops.aten.add_.Tensor(diagonal, arg0_1);  arg0_1 = None
     return diagonal
     """)
 
@@ -902,11 +909,11 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal_copy = torch.ops.aten.diagonal_copy.default(zeros);  zeros = None
-    copy = torch.ops.aten.copy.default(diagonal_copy, a_1);  diagonal_copy = None
-    add = torch.ops.aten.add.Tensor(copy, a_1);  copy = a_1 = None
+    copy = torch.ops.aten.copy.default(diagonal_copy, arg0_1);  diagonal_copy = None
+    add = torch.ops.aten.add.Tensor(copy, arg0_1);  copy = arg0_1 = None
     return add
     """)
 
@@ -915,11 +922,11 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal = torch.ops.aten.diagonal.default(zeros);  zeros = None
-    copy = torch.ops.aten.copy_.default(diagonal, a_1)
-    add = torch.ops.aten.add_.Tensor(diagonal, a_1);  a_1 = None
+    copy = torch.ops.aten.copy_.default(diagonal, arg0_1)
+    add = torch.ops.aten.add_.Tensor(diagonal, arg0_1);  arg0_1 = None
     return diagonal
     """)
 
@@ -930,11 +937,11 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal_copy = torch.ops.aten.diagonal_copy.default(zeros);  zeros = None
-    copy = torch.ops.aten.copy.default(diagonal_copy, a_1);  diagonal_copy = None
-    add = torch.ops.aten.add.Tensor(copy, a_1);  copy = a_1 = None
+    copy = torch.ops.aten.copy.default(diagonal_copy, arg0_1);  diagonal_copy = None
+    add = torch.ops.aten.add.Tensor(copy, arg0_1);  copy = arg0_1 = None
     return add
     """)  # noqa: B950
 
@@ -943,11 +950,11 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal = torch.ops.aten.diagonal.default(zeros);  zeros = None
-    copy = torch.ops.aten.copy_.default(diagonal, a_1)
-    add = torch.ops.aten.add_.Tensor(diagonal, a_1);  a_1 = None
+    copy = torch.ops.aten.copy_.default(diagonal, arg0_1)
+    add = torch.ops.aten.add_.Tensor(diagonal, arg0_1);  arg0_1 = None
     return diagonal
     """)  # noqa: B950
 
@@ -958,11 +965,11 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal_copy = torch.ops.aten.diagonal_copy.default(zeros);  zeros = None
-    copy = torch.ops.aten.copy.default(diagonal_copy, a_1);  diagonal_copy = None
-    add = torch.ops.aten.add.Tensor(copy, a_1);  copy = a_1 = None
+    copy = torch.ops.aten.copy.default(diagonal_copy, arg0_1);  diagonal_copy = None
+    add = torch.ops.aten.add.Tensor(copy, arg0_1);  copy = arg0_1 = None
     return add
     """)  # noqa: B950
 
@@ -971,11 +978,11 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False)
     diagonal = torch.ops.aten.diagonal.default(zeros);  zeros = None
-    copy = torch.ops.aten.copy_.default(diagonal, a_1)
-    add = torch.ops.aten.add_.Tensor(diagonal, a_1);  a_1 = None
+    copy = torch.ops.aten.copy_.default(diagonal, arg0_1)
+    add = torch.ops.aten.add_.Tensor(diagonal, arg0_1);  arg0_1 = None
     return diagonal
     """)  # noqa: B950
 
@@ -991,8 +998,8 @@ def f(x):
 
 
-def forward(self, a_1):
-    expand_copy = torch.ops.aten.expand_copy.default(a_1, [2, 2]);  a_1 = None
+def forward(self, arg0_1):
+    expand_copy = torch.ops.aten.expand_copy.default(arg0_1, [2, 2]);  arg0_1 = None
     return expand_copy
     """)
 
@@ -1009,8 +1016,8 @@ def f(x):
 
 
-def forward(self, a_1):
-    add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
+def forward(self, arg0_1):
+    add = torch.ops.aten.add.Tensor(arg0_1, arg0_1);  arg0_1 = None
     diagonal_copy = torch.ops.aten.diagonal_copy.default(add)
     fill = torch.ops.aten.fill.Scalar(diagonal_copy, 0);  diagonal_copy = None
     diagonal_scatter = torch.ops.aten.diagonal_scatter.default(add, fill);  add = fill = None
@@ -1022,8 +1029,8 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
-    add = torch.ops.aten.add.Tensor(a_1, a_1);  a_1 = None
+def forward(self, arg0_1):
+    add = torch.ops.aten.add.Tensor(arg0_1, arg0_1);  arg0_1 = None
     diagonal = torch.ops.aten.diagonal.default(add)
     fill = torch.ops.aten.fill_.Scalar(diagonal, 0);  diagonal = None
     return add
@@ -1046,8 +1053,8 @@ def f(w):
 
 
-def forward(self, a_1):
-    add = torch.ops.aten.add.Tensor(a_1, 1);  a_1 = None
+def forward(self, arg0_1):
+    add = torch.ops.aten.add.Tensor(arg0_1, 1);  arg0_1 = None
     view_copy = torch.ops.aten.view_copy.default(add, [4, 4])
     resize = torch.ops.aten.resize.default(view_copy, [3, 3])
     as_strided_copy = torch.ops.aten.as_strided_copy.default(view_copy, [3, 3], [3, 1]);  view_copy = None
@@ -1069,8 +1076,8 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
-    add = torch.ops.aten.add.Tensor(a_1, 1);  a_1 = None
+def forward(self, arg0_1):
+    add = torch.ops.aten.add.Tensor(arg0_1, 1);  arg0_1 = None
     view = torch.ops.aten.view.default(add, [4, 4])
     resize = torch.ops.aten.resize.default(view, [3, 3])
     as_strided = torch.ops.aten.as_strided.default(view, [3, 3], [3, 1]);  view = None
@@ -1109,8 +1116,8 @@ def f(x):
 
 
-def forward(self, a_1):
-    add = torch.ops.aten.add.Tensor(a_1, 1);  a_1 = None
+def forward(self, arg0_1):
+    add = torch.ops.aten.add.Tensor(arg0_1, 1);  arg0_1 = None
     resize = torch.ops.aten.resize.default(add, [5, 5]);  add = None
     view_copy = torch.ops.aten.view_copy.default(resize, [25]);  resize = None
     fill = torch.ops.aten.fill.Scalar(view_copy, 1);  view_copy = None
@@ -1124,8 +1131,8 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
-    add = torch.ops.aten.add.Tensor(a_1, 1);  a_1 = None
+def forward(self, arg0_1):
+    add = torch.ops.aten.add.Tensor(arg0_1, 1);  arg0_1 = None
     resize = torch.ops.aten.resize_.default(add, [5, 5])
     view = torch.ops.aten.view.default(add, [25]);  add = None
     fill = torch.ops.aten.fill_.Scalar(view, 1)
@@ -1208,7 +1215,7 @@ def f(x):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([10], device = device(type='cpu'), pin_memory = False)
     select_copy = torch.ops.aten.select_copy.int(zeros, 0, 5)
     fill = torch.ops.aten.fill.Scalar(select_copy, 1);  select_copy = None
@@ -1221,7 +1228,7 @@ def forward(self, a_1):
 
 
-def forward(self, a_1):
+def forward(self, arg0_1):
     zeros = torch.ops.aten.zeros.default([10], device = device(type='cpu'), pin_memory = False)
     select = torch.ops.aten.select.int(zeros, 0, 5)
     fill = torch.ops.aten.fill_.Scalar(select, 1);  select = None
@@ -1230,26 +1237,26 @@ def forward(self, a_1):
 
 
     def test_instance_norm(self):
-        def f(x):
+        size = 100
+
+        def f(x, running_mean, running_var):
             with enable_python_dispatcher():
-                return torch.instance_norm(x, None, None, running_mean=torch.zeros(100), running_var=torch.ones(100),
+                return torch.instance_norm(x, None, None, running_mean, running_var,
                                            use_input_stats=True, momentum=0.1, eps=1e-5, cudnn_enabled=False)
-        self.assert_functionalization(f, torch.randn(20, 100, 35, 45))
+        self.assert_functionalization(f, torch.randn(20, size, 35, 45), torch.zeros(size), torch.ones(size))
         # On Windows, for instance_norm, the alias_copy's are reordered to come right before they need to be used
         # whereas on other platforms, the alias_copy's are before the view_copy's.
         # e.g., the alias_copy after the getitem_4 assignment would be moved to be right before the copy assignment.
         if not IS_WINDOWS:
-            logs = self.get_logs(f, torch.randn(20, 100, 35, 45))
+            logs = self.get_logs(f, torch.randn(20, size, 35, 45), torch.zeros(size), torch.ones(size))
             self.assertExpectedInline(logs, """\
 
 
-def forward(self, a_1):
-    zeros = torch.ops.aten.zeros.default([100], device = device(type='cpu'), pin_memory = False)
-    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
-    repeat = torch.ops.aten.repeat.default(zeros, [20])
-    repeat_1 = torch.ops.aten.repeat.default(ones, [20])
-    view_copy = torch.ops.aten.view_copy.default(a_1, [1, 2000, 35, 45]);  a_1 = None
+def forward(self, arg0_1, arg1_1, arg2_1):
+    repeat = torch.ops.aten.repeat.default(arg1_1, [20])
+    repeat_1 = torch.ops.aten.repeat.default(arg2_1, [20])
+    view_copy = torch.ops.aten.view_copy.default(arg0_1, [1, 2000, 35, 45]);  arg0_1 = None
     empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
     _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view_copy, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view_copy = repeat = repeat_1 = None
     getitem = _native_batch_norm_legit_functional[0]
@@ -1257,31 +1264,36 @@ def forward(self, a_1):
     getitem_2 = _native_batch_norm_legit_functional[2]
     getitem_3 = _native_batch_norm_legit_functional[3]
     getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
-    alias_copy = torch.ops.aten.alias_copy.default(zeros);  zeros = None
+    alias_copy = torch.ops.aten.alias_copy.default(arg1_1)
     view_copy_1 = torch.ops.aten.view_copy.default(getitem_3, [20, 100])
     view_copy_2 = torch.ops.aten.view_copy.default(getitem_3, [20, 100]);  getitem_3 = None
     mean = torch.ops.aten.mean.dim(view_copy_2, [0]);  view_copy_2 = None
     copy = torch.ops.aten.copy.default(alias_copy, mean);  alias_copy = mean = None
-    alias_copy_1 = torch.ops.aten.alias_copy.default(ones);  ones = None
+    alias_copy_1 = torch.ops.aten.alias_copy.default(arg2_1)
     view_copy_3 = torch.ops.aten.view_copy.default(getitem_4, [20, 100])
     view_copy_4 = torch.ops.aten.view_copy.default(getitem_4, [20, 100]);  getitem_4 = None
     mean_1 = torch.ops.aten.mean.dim(view_copy_4, [0]);  view_copy_4 = None
     copy_1 = torch.ops.aten.copy.default(alias_copy_1, mean_1);  alias_copy_1 = mean_1 = None
     view_copy_5 = torch.ops.aten.view_copy.default(getitem, [20, 100, 35, 45]);  getitem = None
+    alias_copy_2 = torch.ops.aten.alias_copy.default(copy);  copy = None
+    copy_ = torch.ops.aten.copy_.default(arg1_1, alias_copy_2);  arg1_1 = alias_copy_2 = None
+    alias_copy_3 = torch.ops.aten.alias_copy.default(copy_1);  copy_1 = None
+    copy__1 = torch.ops.aten.copy_.default(arg2_1, alias_copy_3);  arg2_1 = alias_copy_3 = None
     return view_copy_5
     """)  # noqa: B950
 
-            reinplaced_logs = self.get_logs(f, torch.randn(20, 100, 35, 45), reapply_views=True, run_reinplace=True)
+            reinplaced_logs = self.get_logs(
+                f, torch.randn(20, size, 35, 45), torch.zeros(size), torch.ones(size),
+                reapply_views=True, run_reinplace=True
+            )
             self.assertExpectedInline(reinplaced_logs, """\
 
 
-def forward(self, a_1):
-    zeros = torch.ops.aten.zeros.default([100], device = device(type='cpu'), pin_memory = False)
-    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
-    repeat = torch.ops.aten.repeat.default(zeros, [20])
-    repeat_1 = torch.ops.aten.repeat.default(ones, [20])
-    view = torch.ops.aten.view.default(a_1, [1, 2000, 35, 45]);  a_1 = None
+def forward(self, arg0_1, arg1_1, arg2_1):
+    repeat = torch.ops.aten.repeat.default(arg1_1, [20])
+    repeat_1 = torch.ops.aten.repeat.default(arg2_1, [20])
+    view = torch.ops.aten.view.default(arg0_1, [1, 2000, 35, 45]);  arg0_1 = None
     empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
     _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view = repeat = repeat_1 = None
     getitem = _native_batch_norm_legit_functional[0]
@@ -1289,144 +1301,66 @@ def forward(self, a_1):
     getitem_2 = _native_batch_norm_legit_functional[2]
     getitem_3 = _native_batch_norm_legit_functional[3]
     getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
-    alias = torch.ops.aten.alias.default(zeros);  zeros = None
+    alias = torch.ops.aten.alias.default(arg1_1)
     view_1 = torch.ops.aten.view.default(getitem_3, [20, 100])
     view_2 = torch.ops.aten.view.default(getitem_3, [20, 100]);  getitem_3 = None
     mean = torch.ops.aten.mean.dim(view_2, [0]);  view_2 = None
-    copy = torch.ops.aten.copy_.default(alias, mean);  alias = mean = None
-    alias_1 = torch.ops.aten.alias.default(ones);  ones = None
+    copy = torch.ops.aten.copy.default(alias, mean);  alias = mean = None
+    alias_1 = torch.ops.aten.alias.default(arg2_1)
     view_3 = torch.ops.aten.view.default(getitem_4, [20, 100])
     view_4 = torch.ops.aten.view.default(getitem_4, [20, 100]);  getitem_4 = None
     mean_1 = torch.ops.aten.mean.dim(view_4, [0]);  view_4 = None
-    copy_1 = torch.ops.aten.copy_.default(alias_1, mean_1);  alias_1 = mean_1 = None
+    copy_1 = torch.ops.aten.copy.default(alias_1, mean_1);  alias_1 = mean_1 = None
     view_5 = torch.ops.aten.view.default(getitem, [20, 100, 35, 45]);  getitem = None
-    return view_5
-    """)  # noqa: B950
-
-
-    def test_instance_norm_running_mean_is_x(self):
-        size = 100
-
-        def f(x):
-            with enable_python_dispatcher():
-                return torch.instance_norm(
-                    torch.arange(20 * size * 35 * 45, dtype=torch.float32).reshape(20, size, 35, 45), None, None,
-                    x, torch.ones(size), use_input_stats=True, momentum=0.1, eps=1e-5, cudnn_enabled=False)
-        self.assert_functionalization(f, torch.zeros(size))
-        logs = self.get_logs(f, torch.zeros(size))
-        # On Windows, for instance_norm, the alias_copy's are reordered to come right before they need to be used
-        # whereas on other platforms, the alias_copy's are before the view_copy's.
-        # e.g., the alias_copy after the getitem_4 assignment would be moved to be right before the copy assignment.
-        if not IS_WINDOWS:
-            self.assertExpectedInline(logs, """\
-
-
-
-def forward(self, a_1):
-    arange = torch.ops.aten.arange.default(3150000, dtype = torch.float32, device = device(type='cpu'), pin_memory = False)
-    view_copy = torch.ops.aten.view_copy.default(arange, [20, 100, 35, 45]);  arange = None
-    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
-    repeat = torch.ops.aten.repeat.default(a_1, [20])
-    repeat_1 = torch.ops.aten.repeat.default(ones, [20])
-    view_copy_1 = torch.ops.aten.view_copy.default(view_copy, [1, 2000, 35, 45]);  view_copy = None
-    empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
-    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view_copy_1, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view_copy_1 = repeat = repeat_1 = None
-    getitem = _native_batch_norm_legit_functional[0]
-    getitem_1 = _native_batch_norm_legit_functional[1]
-    getitem_2 = _native_batch_norm_legit_functional[2]
-    getitem_3 = _native_batch_norm_legit_functional[3]
-    getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
-    alias_copy = torch.ops.aten.alias_copy.default(a_1)
-    view_copy_2 = torch.ops.aten.view_copy.default(getitem_3, [20, 100])
-    view_copy_3 = torch.ops.aten.view_copy.default(getitem_3, [20, 100]);  getitem_3 = None
-    mean = torch.ops.aten.mean.dim(view_copy_3, [0]);  view_copy_3 = None
-    copy = torch.ops.aten.copy.default(alias_copy, mean);  alias_copy = mean = None
-    alias_copy_1 = torch.ops.aten.alias_copy.default(ones);  ones = None
-    view_copy_4 = torch.ops.aten.view_copy.default(getitem_4, [20, 100])
-    view_copy_5 = torch.ops.aten.view_copy.default(getitem_4, [20, 100]);  getitem_4 = None
-    mean_1 = torch.ops.aten.mean.dim(view_copy_5, [0]);  view_copy_5 = None
-    copy_1 = torch.ops.aten.copy.default(alias_copy_1, mean_1);  alias_copy_1 = mean_1 = None
-    view_copy_6 = torch.ops.aten.view_copy.default(getitem, [20, 100, 35, 45]);  getitem = None
-    alias_copy_2 = torch.ops.aten.alias_copy.default(copy);  copy = None
-    copy_ = torch.ops.aten.copy_.default(a_1, alias_copy_2);  a_1 = alias_copy_2 = None
-    return view_copy_6
-    """)  # noqa: B950
-
-            reinplaced_logs = self.get_logs(f, torch.zeros(size), reapply_views=True, run_reinplace=True)
-            self.assertExpectedInline(reinplaced_logs, """\
-
-
-
-def forward(self, a_1):
-    arange = torch.ops.aten.arange.default(3150000, dtype = torch.float32, device = device(type='cpu'), pin_memory = False)
-    view = torch.ops.aten.view.default(arange, [20, 100, 35, 45]);  arange = None
-    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
-    repeat = torch.ops.aten.repeat.default(a_1, [20])
-    repeat_1 = torch.ops.aten.repeat.default(ones, [20])
-    view_1 = torch.ops.aten.view.default(view, [1, 2000, 35, 45]);  view = None
-    empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
-    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(view_1, None, None, repeat, repeat_1, True, 0.1, 1e-05);  view_1 = repeat = repeat_1 = None
-    getitem = _native_batch_norm_legit_functional[0]
-    getitem_1 = _native_batch_norm_legit_functional[1]
-    getitem_2 = _native_batch_norm_legit_functional[2]
-    getitem_3 = _native_batch_norm_legit_functional[3]
-    getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
-    alias = torch.ops.aten.alias.default(a_1)
-    view_2 = torch.ops.aten.view.default(getitem_3, [20, 100])
-    view_3 = torch.ops.aten.view.default(getitem_3, [20, 100]);  getitem_3 = None
-    mean = torch.ops.aten.mean.dim(view_3, [0]);  view_3 = None
-    copy = torch.ops.aten.copy.default(alias, mean);  alias = mean = None
-    alias_1 = torch.ops.aten.alias.default(ones);  ones = None
-    view_4 = torch.ops.aten.view.default(getitem_4, [20, 100])
-    view_5 = torch.ops.aten.view.default(getitem_4, [20, 100]);  getitem_4 = None
-    mean_1 = torch.ops.aten.mean.dim(view_5, [0]);  view_5 = None
-    copy_1 = torch.ops.aten.copy_.default(alias_1, mean_1);  alias_1 = mean_1 = None
-    view_6 = torch.ops.aten.view.default(getitem, [20, 100, 35, 45]);  getitem = None
     alias_2 = torch.ops.aten.alias.default(copy);  copy = None
-    copy_ = torch.ops.aten.copy_.default(a_1, alias_2);  a_1 = alias_2 = None
-    return view_6
+    copy_ = torch.ops.aten.copy_.default(arg1_1, alias_2);  arg1_1 = alias_2 = None
+    alias_3 = torch.ops.aten.alias.default(copy_1);  copy_1 = None
+    copy__1 = torch.ops.aten.copy_.default(arg2_1, alias_3);  arg2_1 = alias_3 = None
+    return view_5
     """)  # noqa: B950
 
 
     def test_batch_norm(self):
-        def f(x):
+        def f(x, running_mean, running_var):
             with enable_python_dispatcher():
-                return torch.batch_norm(x, None, None, torch.zeros(100), torch.ones(100), False, 0.1, 1e-5, False)
+                return torch.batch_norm(x, None, None, running_mean, running_var, False, 0.1, 1e-5, False)
 
-        self.assert_functionalization(f, torch.randn(20, 100, 35, 45))
-        logs = self.get_logs(f, torch.randn(20, 100, 35, 45))
+        self.assert_functionalization(f, torch.randn(20, 100, 35, 45), torch.zeros(100), torch.ones(100))
+        logs = self.get_logs(f, torch.randn(20, 100, 35, 45), torch.zeros(100), torch.ones(100))
         self.assertExpectedInline(logs, """\
 
 
-def forward(self, a_1):
-    zeros = torch.ops.aten.zeros.default([100], device = device(type='cpu'), pin_memory = False)
-    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
+def forward(self, arg0_1, arg1_1, arg2_1):
     empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
-    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(a_1, None, None, zeros, ones, False, 0.1, 1e-05);  a_1 = zeros = ones = None
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(arg0_1, None, None, arg1_1, arg2_1, False, 0.1, 1e-05);  arg0_1 = None
     getitem = _native_batch_norm_legit_functional[0]
     getitem_1 = _native_batch_norm_legit_functional[1]
     getitem_2 = _native_batch_norm_legit_functional[2]
     getitem_3 = _native_batch_norm_legit_functional[3]
     getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
+    copy_ = torch.ops.aten.copy_.default(arg1_1, getitem_3);  arg1_1 = getitem_3 = None
+    copy__1 = torch.ops.aten.copy_.default(arg2_1, getitem_4);  arg2_1 = getitem_4 = None
     return getitem
     """)  # noqa: B950
 
-        reinplaced_logs = self.get_logs(f, torch.randn(20, 100, 35, 45), reapply_views=True, run_reinplace=True)
+        reinplaced_logs = self.get_logs(
+            f, torch.randn(20, 100, 35, 45), torch.zeros(100), torch.ones(100), reapply_views=True, run_reinplace=True
+        )
         self.assertExpectedInline(reinplaced_logs, """\
 
 
-def forward(self, a_1):
-    zeros = torch.ops.aten.zeros.default([100], device = device(type='cpu'), pin_memory = False)
-    ones = torch.ops.aten.ones.default([100], device = device(type='cpu'), pin_memory = False)
+def forward(self, arg0_1, arg1_1, arg2_1):
     empty = torch.ops.aten.empty.memory_format([0], dtype = torch.uint8, layout = torch.strided, device = device(type='cpu'))
-    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(a_1, None, None, zeros, ones, False, 0.1, 1e-05);  a_1 = zeros = ones = None
+    _native_batch_norm_legit_functional = torch.ops.aten._native_batch_norm_legit_functional.default(arg0_1, None, None, arg1_1, arg2_1, False, 0.1, 1e-05);  arg0_1 = None
     getitem = _native_batch_norm_legit_functional[0]
     getitem_1 = _native_batch_norm_legit_functional[1]
     getitem_2 = _native_batch_norm_legit_functional[2]
     getitem_3 = _native_batch_norm_legit_functional[3]
     getitem_4 = _native_batch_norm_legit_functional[4];  _native_batch_norm_legit_functional = None
+    copy_ = torch.ops.aten.copy_.default(arg1_1, getitem_3);  arg1_1 = getitem_3 = None
+    copy__1 = torch.ops.aten.copy_.default(arg2_1, getitem_4);  arg2_1 = getitem_4 = None
     return getitem
     """)  # noqa: B950
 

From 7ded04b938ad872d5cb741b4f209df56599f8698 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 29 Nov 2022 17:03:43 -0800
Subject: [PATCH 1438/1922] [quant][be] Simplify `insert_observers_for_model`
 in fx/prepare.py (#89887)

Summary:
att

Test Plan:
python test/test_quantization.py TestQuantizeFx

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89887
Approved by: https://github.com/andrewor14
---
 torch/ao/quantization/fx/prepare.py | 32 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 185980816f2cd..1d623fdeda205 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -1102,14 +1102,10 @@ def swap_custom_module_to_observed(
 
 def insert_observers_for_model(
     model: GraphModule,
-    modules: Dict[str, torch.nn.Module],
     matches: Dict[str, _MatchResultWithQConfig],
     node_name_to_qconfig: Dict[str, QConfigAny],
-    graph: Graph,
     prepare_custom_config: PrepareCustomConfig,
     equalization_config_map: Dict[str, Any],
-    input_quantized_idxs: List[int],
-    output_quantized_idxs: List[int],
     backend_config: BackendConfig,
     observed_node_names: Set[str],
     is_qat: bool,
@@ -1181,6 +1177,8 @@ def insert_observers_for_model(
     for node in model.graph.nodes:
         root_node, _, pattern, qhandler, qconfig = matches.get(
             node.name, (None, None, None, None, None))
+        input_quantized_idxs: List[int] = prepare_custom_config.input_quantized_indexes
+        output_quantized_idxs: List[int] = prepare_custom_config.output_quantized_indexes
         node_name_to_target_dtype_info[node.name] = get_target_activation_dtype_for_node(
             node, qconfig, inputs_seen_counter, outputs_seen_counter,
             input_quantized_idxs, output_quantized_idxs, qhandler,
@@ -1281,7 +1279,7 @@ def insert_observers_for_model(
                     if is_input_node_of_the_pattern:
                         # this modifies node inplace
                         maybe_insert_input_observers_for_node(
-                            node, qconfig, model, modules, graph,
+                            node, qconfig, model, modules, model.graph,
                             node_name_to_target_dtype_info,
                             qhandler,
                             prepare_custom_config,
@@ -1289,7 +1287,7 @@ def insert_observers_for_model(
 
                         # Insert equalization input observers if needed
                         maybe_insert_input_equalization_observers_for_node(
-                            node, equalization_qconfig, model, modules, graph,
+                            node, equalization_qconfig, model, modules, model.graph,
                             node_name_to_target_dtype_info, is_quantized_branch, backend_config)
 
                     is_last_node_of_pattern = node is last_node
@@ -1310,12 +1308,12 @@ def insert_observers_for_model(
                             # these output observers are the same as DeQuantStubs. In the future, we
                             # should resolve this inconsistency by inserting DeQuantStubs for all custom
                             # modules, not just for LSTM.
-                            _insert_dequant_stubs_for_custom_module_lstm_output(node, model, modules, graph)
+                            _insert_dequant_stubs_for_custom_module_lstm_output(node, model, modules, model.graph)
                             swap_custom_module_to_observed(node, qconfig, modules, prepare_custom_config)
                         else:
                             # this returns the new observer node if it was needed
                             maybe_output_obs_node = maybe_insert_output_observer_for_node(
-                                node, model, modules, graph, matches,
+                                node, model, modules, model.graph, matches,
                                 node_name_to_target_dtype_info, pattern, qhandler, is_qat)
 
                             if maybe_output_obs_node is not None:
@@ -1358,7 +1356,7 @@ def insert_observers_for_model(
                     maybe_insert_observers_before_graph_output(
                         node, output_quantized_idxs,
                         node_name_to_target_dtype_info, node_name_to_qconfig,
-                        model, modules, graph)
+                        model, modules, model.graph)
 
         #
         # After this point, the current node has input and output observers
@@ -1563,9 +1561,6 @@ def prepare(
         match_with_qconfig = (*match_without_qconfig, node_name_to_qconfig[node_name])
         matches[node_name] = match_with_qconfig
 
-    input_quantized_idxs: List[int] = prepare_custom_config.input_quantized_indexes
-    output_quantized_idxs: List[int] = prepare_custom_config.output_quantized_indexes
-
     run_prepare_fx_on_standalone_modules(
         model, is_qat, modules, matches, prepare_custom_config, backend_config)
 
@@ -1575,14 +1570,15 @@ def prepare(
     observed_node_names: Set[str] = set()
 
     result_node = insert_observers_for_model(
-        model, modules, matches, node_name_to_qconfig,
-        model.graph, prepare_custom_config,
+        model,
+        matches,
+        node_name_to_qconfig,
+        prepare_custom_config,
         equalization_node_name_to_qconfig,
-        input_quantized_idxs,
-        output_quantized_idxs,
         backend_config,
         observed_node_names,
-        is_qat)
+        is_qat
+    )
 
     save_state(model, node_name_to_qconfig, node_name_to_scope,
                prepare_custom_config, equalization_node_name_to_qconfig, qconfig_mapping, is_qat, observed_node_names)
@@ -1597,6 +1593,8 @@ def prepare(
         # these inputs are observed in parent
         # converting List[int] to Tensor since module attribute is
         # Union[Tensor, Module]
+        input_quantized_idxs: List[int] = prepare_custom_config.input_quantized_indexes
+        output_quantized_idxs: List[int] = prepare_custom_config.output_quantized_indexes
         model._standalone_module_input_quantized_idxs = \
             torch.tensor(input_quantized_idxs)
         model._standalone_module_output_quantized_idxs = torch.tensor(output_quantized_idxs)

From 2fffabaca4d53e0a390d987a186b0d0efec56435 Mon Sep 17 00:00:00 2001
From: Sijia Chen <sijiac@meta.com>
Date: Wed, 30 Nov 2022 21:15:21 +0000
Subject: [PATCH 1439/1922] [FIX][QAT] Switch to use `kwargs` when `args` is
 empty (#89778)

Summary:
When `ref_node.args` is empty, the QAT will throw index out of range. Here is an example, line 574 is using `tensors = ....` in torch.cat func, which will be treated as `kwargs`
{F800357376}

f388506954

To fix the issue, we will use the value of the first kwarg if args is empty

Test Plan: f388545532

Reviewed By: bigning, lyoka

Differential Revision: D41396771

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89778
Approved by: https://github.com/lyoka, https://github.com/houseroad
---
 torch/ao/quantization/fx/_lower_to_native_backend.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py
index 93c3d07e18805..c60385537fe41 100644
--- a/torch/ao/quantization/fx/_lower_to_native_backend.py
+++ b/torch/ao/quantization/fx/_lower_to_native_backend.py
@@ -829,7 +829,8 @@ def special_pattern_replacement(model: QuantizedGraphModule):
         is_call_function, is_call_method, is_call_module = is_special_pattern_node(ref_node, modules)
         if not (is_call_module or is_call_function or is_call_method):
             continue
-        dq_node_or_nodes = ref_node.args[0]
+        assert len(ref_node.args) > 0 or len(ref_node.kwargs) > 0
+        dq_node_or_nodes = ref_node.args[0] if len(ref_node.args) > 0 else list(ref_node.kwargs.values())[0]
         assert isinstance(dq_node_or_nodes, Node) or isinstance(dq_node_or_nodes, (tuple, list))
         is_dequantize = False
         if isinstance(dq_node_or_nodes, Node):

From 177f031485b8966492cdba328b51a5dcda0b513a Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 30 Nov 2022 02:49:48 +0000
Subject: [PATCH 1440/1922] [quant][decomposed] Add support for int32 for
 decomposed q/dq ops (#89881)

Summary:
att

Test Plan:
python test/test_quantization.py -k test_decomposed_quantize_per_tensor
python test/test_qunatization.py -k test_decomposed_dequantize_per_tensor

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89881
Approved by: https://github.com/cccclai
---
 .../core/test_quantized_tensor.py             | 53 ++++++++++---------
 torch/ao/quantization/fx/_decomposed.py       | 19 ++++---
 2 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 4e7ba6409b5b9..241aab5da3237 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -1467,36 +1467,41 @@ def test_decomposed_quantize_per_tensor(self):
         # register the ops
         import torch.ao.quantization.fx._decomposed
         X = torch.randn(5, 10)
-        qdtype = torch.quint8
-        dtype = torch.uint8
-        scale, zero_point = _calculate_dynamic_qparams(X, qdtype)
-        quant_min, quant_max = 0, 255
-
-        quantized_X = torch.quantize_per_tensor(X, scale, zero_point, qdtype)
-        quantized_decomposed_X = \
-            torch.ops.quantized_decomposed.quantize_per_tensor(
-                X, scale, zero_point, quant_min, quant_max, dtype)
-        self.assertEqual(quantized_decomposed_X.dtype, dtype)
-        self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
+        test_cases = [
+            (torch.quint8, torch.uint8, 0, 255),
+            (torch.qint8, torch.int8, -128, 127),
+            (torch.qint32, torch.int32, -2**31, 2**31 - 1),
+        ]
+        for qdtype, dtype, quant_min, quant_max in test_cases:
+            scale, zero_point = _calculate_dynamic_qparams(X, qdtype)
+            quantized_X = torch.quantize_per_tensor(X, scale, zero_point, qdtype)
+            quantized_decomposed_X = \
+                torch.ops.quantized_decomposed.quantize_per_tensor(
+                    X, scale, zero_point, quant_min, quant_max, dtype)
+            self.assertEqual(quantized_decomposed_X.dtype, dtype)
+            self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
 
     def test_decomposed_dequantize_per_tensor(self):
         import torch.ao.quantization.fx._decomposed
         X = torch.randn(5, 10)
-        dtype = torch.uint8
-        qdtype = torch.quint8
-        scale, zero_point = _calculate_dynamic_qparams(X, qdtype)
-        quant_min, quant_max = 0, 255
+        test_cases = [
+            (torch.quint8, torch.uint8, 0, 255),
+            (torch.qint8, torch.int8, -128, 127),
+            (torch.qint32, torch.int32, -2**31, 2**31 - 1),
+        ]
 
-        quantized_X = torch.quantize_per_tensor(X, scale, zero_point, qdtype)
-        dequantized_X = torch.dequantize(quantized_X)
+        for qdtype, dtype, quant_min, quant_max in test_cases:
+            scale, zero_point = _calculate_dynamic_qparams(X, qdtype)
+            quantized_X = torch.quantize_per_tensor(X, scale, zero_point, qdtype)
+            dequantized_X = torch.dequantize(quantized_X)
 
-        quantized_decomposed_X = torch.ops.quantized_decomposed.quantize_per_tensor(
-            X, scale, zero_point, quant_min, quant_max, dtype)
-        dequantized_decomposed_X = torch.ops.quantized_decomposed.dequantize_per_tensor(
-            quantized_decomposed_X, scale, zero_point, quant_min, quant_max, dtype
-        )
-        self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
-        self.assertEqual(dequantized_X, dequantized_decomposed_X)
+            quantized_decomposed_X = torch.ops.quantized_decomposed.quantize_per_tensor(
+                X, scale, zero_point, quant_min, quant_max, dtype)
+            dequantized_decomposed_X = torch.ops.quantized_decomposed.dequantize_per_tensor(
+                quantized_decomposed_X, scale, zero_point, quant_min, quant_max, dtype
+            )
+            self.assertEqual(quantized_X.int_repr(), quantized_decomposed_X)
+            self.assertEqual(dequantized_X, dequantized_decomposed_X)
 
     def test_decomposed_dynamic_quant_pattern(self):
         import torch.ao.quantization.fx._decomposed
diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py
index ec814d6a17bb3..0e020a15a826d 100644
--- a/torch/ao/quantization/fx/_decomposed.py
+++ b/torch/ao/quantization/fx/_decomposed.py
@@ -7,18 +7,17 @@
 # name is not too long
 quantized_decomposed_lib = Library("quantized_decomposed", "DEF")
 
+_DTYPE_TO_QVALUE_BOUNDS = {
+    torch.uint8: (0, 255),
+    torch.int8: (-128, 127),
+    torch.int32: (-(2**31), 2**31 - 1)
+}
+
 # Helper to check the passed in quant min and max are valid for the dtype
 def _quant_min_max_bounds_check(quant_min, quant_max, dtype):
-    quant_min_lower_bound = 0
-    quant_max_upper_bound = 0
-    if dtype == torch.uint8:
-        quant_min_lower_bound = 0
-        quant_max_upper_bound = 255
-    elif dtype == torch.int8:
-        quant_min_lower_bound = -128
-        quant_max_upper_bound = 127
-    else:
+    if dtype not in _DTYPE_TO_QVALUE_BOUNDS:
         raise ValueError(f"Unsupported dtype: {dtype}")
+    quant_min_lower_bound, quant_max_upper_bound = _DTYPE_TO_QVALUE_BOUNDS[dtype]
 
     assert quant_min >= quant_min_lower_bound, \
         "quant_min out of bound for dtype, " \
@@ -126,7 +125,7 @@ def dequantize_per_tensor(
        dequantized float32 Tensor
     """
     assert input.dtype == dtype, f"Expecting input to have dtype: {dtype}"
-    if dtype in [torch.uint8, torch.int8]:
+    if dtype in [torch.uint8, torch.int8, torch.int32]:
         # TODO: investigate why
         # (input - zero_point).to(torch.float32) * scale
         # failed the test

From 3c267868ffee5531cc4cecc8cb1a39d58b48823a Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Wed, 30 Nov 2022 21:24:42 +0000
Subject: [PATCH 1441/1922] Add dynamo smoke tests to CI (#89302)

Add dynamo smoke tests to CI, which checks for python/torch/cuda versions and runs simple dynamo examples on a few backends, including inductor. Smoke tests will run on dynamo and inductor shards.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89302
Approved by: https://github.com/malfet
---
 .jenkins/pytorch/test.sh      |  2 ++
 tools/dynamo/verify_dynamo.py | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index ca50a31beb60b..29f9657beaf33 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -215,6 +215,7 @@ test_dynamo_shard() {
     echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
     exit 1
   fi
+  python tools/dynamo/verify_dynamo.py
   # Temporarily disable test_fx for dynamo pending the investigation on TTS
   # regression in https://github.com/pytorch/torchdynamo/issues/784
   time python test/run_test.py \
@@ -249,6 +250,7 @@ test_inductor_distributed() {
 }
 
 test_inductor() {
+  python tools/dynamo/verify_dynamo.py
   python test/run_test.py --include test_modules test_ops --verbose
   PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include inductor/test_torchinductor --include inductor/test_torchinductor_opinfo --verbose
   # TODO: investigate "RuntimeError: CUDA driver API confirmed a leak"
diff --git a/tools/dynamo/verify_dynamo.py b/tools/dynamo/verify_dynamo.py
index cbc582a561573..df03e6331728b 100644
--- a/tools/dynamo/verify_dynamo.py
+++ b/tools/dynamo/verify_dynamo.py
@@ -94,6 +94,17 @@ def check_dynamo(backend, device, err_msg):
     try:
         import torch._dynamo as dynamo
 
+        if device == "cuda":
+            import torch._inductor.utils as utils
+
+            if not utils.has_triton():
+                print(
+                    f"WARNING: CUDA available but triton cannot be used. "
+                    f"Your GPU may not be supported. "
+                    f"Skipping CUDA check on {backend} backend\n"
+                )
+                return
+
         dynamo.reset()
 
         @dynamo.optimize(backend, nopython=True)

From b69f3e702dfa3bcfee94952fedf0c279ed4ec3f3 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Wed, 30 Nov 2022 18:08:01 +0000
Subject: [PATCH 1442/1922] [Easy][FSDP] Fix pyre error (#89930)

This PR attemps to fix the following pyre error:

```
Incompatible parameter type [6]: In call
`dist.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.__init__`,
for 7th parameter `auto_wrap_policy` expected
`Optional[typing.Callable[..., typing.Any]]` but got
`Optional[_FSDPPolicy]`.
```

Besides, this also removes the type inconsistency in code and docstring.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89930
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/fully_sharded_data_parallel.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 779499f449532..08cd0362c9fa3 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -96,6 +96,7 @@
 )
 from ._utils import p_assert
 from .flat_param import FlatParameter, FlatParamHandle
+from .wrap import _FSDPPolicy
 
 
 __all__ = [
@@ -317,7 +318,7 @@ def __init__(
         process_group: Optional[ProcessGroup] = None,
         sharding_strategy: Optional[ShardingStrategy] = None,
         cpu_offload: Optional[CPUOffload] = None,
-        auto_wrap_policy: Optional[Callable] = None,
+        auto_wrap_policy: Optional[Union[Callable, _FSDPPolicy]] = None,
         backward_prefetch: Optional[BackwardPrefetch] = BackwardPrefetch.BACKWARD_PRE,
         mixed_precision: Optional[MixedPrecision] = None,
         ignored_modules: Optional[Iterable[torch.nn.Module]] = None,

From bc47d441f43e58eb5d12a7c89340b0c9ec16d38e Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Mon, 28 Nov 2022 20:01:36 -0800
Subject: [PATCH 1443/1922] Small fix for `torch._C.Graph` type hint (#89821)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89821
Approved by: https://github.com/kit1980
---
 torch/_C/__init__.pyi.in | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index cc1f1ed66714a..bbe9a71e8718a 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -615,7 +615,7 @@ class Graph:
     def nodes(self) -> Iterator[Node]: ...
     def param_node(self) -> Node: ...
     def return_node(self) -> Node: ...
-    def addInput(self, name: str) -> Value: ...
+    def addInput(self, name: str = "") -> Value: ...
     def eraseInput(self, i: _int) -> None: ...
     def registerOutput(self, n: Value) -> _int: ...
     def eraseOutput(self, i: _int) -> None: ...
@@ -631,6 +631,7 @@ class Graph:
     def insertPoint(self) -> Node: ...
     def insertGraph(self, callee: Graph, inputs: List[Value]) -> List[Value]: ...
     def makeMultiOutputIntoTuple(self) -> None: ...
+    def copy(self) -> Graph: ...
     ...
 
 
From bc96c6ac0ea5cbd316895f360ba8c213b76c1148 Mon Sep 17 00:00:00 2001
From: Jane Xu <janeyx@meta.com>
Date: Wed, 30 Nov 2022 22:07:29 +0000
Subject: [PATCH 1444/1922] Subscribing janeyx99 to optimizer PRs (#89943)

Adding myself to keep updated with what's up in the world of optimizers
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89943
Approved by: https://github.com/albanD
---
 CODEOWNERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 546d76430266a..c5699b137a5ed 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -15,7 +15,7 @@
 /torch/autograd/ @albanD @soulitzer
 /tools/autograd/ @albanD @soulitzer
 /torch/nn/ @albanD @jbschlosser
-/torch/optim/ @albanD
+/torch/optim/ @albanD @janeyx99
 /test/test_public_bindings.py @albanD
 /test/allowlist_for_publicAPI.json @albanD @anjali411
 /docs/source/conf.py @albanD

From 501747eb61675a453a7a3067b2995ee06d73e061 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Wed, 30 Nov 2022 19:24:01 +0000
Subject: [PATCH 1445/1922] Move tensor_parallel out to distributed.tensor
 folder (#89878)

This PR moves tensor parallel from torch.distributed._tensor.parallel
to torch.distributed.tensor.parallel, to prepare for beta release
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89878
Approved by: https://github.com/fduwjj
---
 docs/source/distributed.rst                       |  1 +
 docs/source/distributed.tensor.parallel.rst       |  7 +++++++
 docs/source/index.rst                             |  1 +
 .../{_tensor => tensor}/parallel/__init__.py      |  0
 .../parallel/test_2d_parallel.py                  |  4 ++--
 .../parallel/test_parallelize_api.py              |  8 ++++----
 .../parallel/test_tp_examples.py                  |  2 +-
 .../{_tensor => tensor}/parallel/test_tp_style.py |  2 +-
 .../parallel/test_view_sharding_dim_change.py     |  2 +-
 torch/distributed/tensor/__init__.py              |  0
 .../{_tensor => tensor}/parallel/__init__.py      |  6 +++---
 .../utils.py => tensor/parallel/_utils.py}        |  0
 .../parallel/_view_with_dim_change.py             |  0
 .../{_tensor => tensor}/parallel/api.py           |  6 +++---
 .../{_tensor => tensor}/parallel/fsdp.py          |  0
 .../parallel/multihead_attention_tp.py            |  4 +++-
 .../{_tensor => tensor}/parallel/style.py         | 15 ++++++++++++++-
 17 files changed, 41 insertions(+), 17 deletions(-)
 create mode 100644 docs/source/distributed.tensor.parallel.rst
 rename test/distributed/{_tensor => tensor}/parallel/__init__.py (100%)
 rename test/distributed/{_tensor => tensor}/parallel/test_2d_parallel.py (98%)
 rename test/distributed/{_tensor => tensor}/parallel/test_parallelize_api.py (97%)
 rename test/distributed/{_tensor => tensor}/parallel/test_tp_examples.py (99%)
 rename test/distributed/{_tensor => tensor}/parallel/test_tp_style.py (99%)
 rename test/distributed/{_tensor => tensor}/parallel/test_view_sharding_dim_change.py (93%)
 create mode 100644 torch/distributed/tensor/__init__.py
 rename torch/distributed/{_tensor => tensor}/parallel/__init__.py (80%)
 rename torch/distributed/{_tensor/parallel/utils.py => tensor/parallel/_utils.py} (100%)
 rename torch/distributed/{_tensor => tensor}/parallel/_view_with_dim_change.py (100%)
 rename torch/distributed/{_tensor => tensor}/parallel/api.py (98%)
 rename torch/distributed/{_tensor => tensor}/parallel/fsdp.py (100%)
 rename torch/distributed/{_tensor => tensor}/parallel/multihead_attention_tp.py (98%)
 rename torch/distributed/{_tensor => tensor}/parallel/style.py (96%)

diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst
index 62e16ebb8a7b7..c5cea6f47d23a 100644
--- a/docs/source/distributed.rst
+++ b/docs/source/distributed.rst
@@ -856,3 +856,4 @@ the `NCCL` backend is used and the user attempts to use a GPU that is not availa
 .. py:module:: torch.distributed.pipeline
 .. py:module:: torch.distributed.pipeline.sync
 .. py:module:: torch.distributed.pipeline.sync.skip
+.. py:module:: torch.distributed.tensor
diff --git a/docs/source/distributed.tensor.parallel.rst b/docs/source/distributed.tensor.parallel.rst
new file mode 100644
index 0000000000000..64544539edd43
--- /dev/null
+++ b/docs/source/distributed.tensor.parallel.rst
@@ -0,0 +1,7 @@
+.. role:: hidden
+    :class: hidden-section
+
+Tensor Parallelism
+========================
+.. py:module:: torch.distributed.tensor.parallel
+.. currentmodule:: torch.distributed.tensor.parallel
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e43160f668fc7..eaf2664159afe 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -77,6 +77,7 @@ Features described in this documentation are classified by release status:
    torch.distributed.elastic <distributed.elastic>
    torch.distributed.fsdp <fsdp>
    torch.distributed.optim <distributed.optim>
+   torch.distributed.tensor.parallel <distributed.tensor.parallel>
    torch.distributed.checkpoint <distributed.checkpoint>
    torch.distributions <distributions>
    torch._dynamo <_dynamo>
diff --git a/test/distributed/_tensor/parallel/__init__.py b/test/distributed/tensor/parallel/__init__.py
similarity index 100%
rename from test/distributed/_tensor/parallel/__init__.py
rename to test/distributed/tensor/parallel/__init__.py
diff --git a/test/distributed/_tensor/parallel/test_2d_parallel.py b/test/distributed/tensor/parallel/test_2d_parallel.py
similarity index 98%
rename from test/distributed/_tensor/parallel/test_2d_parallel.py
rename to test/distributed/tensor/parallel/test_2d_parallel.py
index da6d1f5cfabd2..f203846cb711a 100644
--- a/test/distributed/_tensor/parallel/test_2d_parallel.py
+++ b/test/distributed/tensor/parallel/test_2d_parallel.py
@@ -14,7 +14,7 @@
     DTensor as DT,
     Replicate,
 )
-from torch.distributed._tensor.parallel import (
+from torch.distributed.tensor.parallel import (
     PairwiseParallel,
     parallelize_module,
 )
@@ -23,7 +23,7 @@
 
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.distributed._tensor.parallel.fsdp import is_available
+from torch.distributed.tensor.parallel.fsdp import is_available
 
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
diff --git a/test/distributed/_tensor/parallel/test_parallelize_api.py b/test/distributed/tensor/parallel/test_parallelize_api.py
similarity index 97%
rename from test/distributed/_tensor/parallel/test_parallelize_api.py
rename to test/distributed/tensor/parallel/test_parallelize_api.py
index 82ba0b0032c6a..bb2055575f69c 100644
--- a/test/distributed/_tensor/parallel/test_parallelize_api.py
+++ b/test/distributed/tensor/parallel/test_parallelize_api.py
@@ -4,18 +4,18 @@
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase, with_comms
 from torch.distributed._tensor import DeviceMesh, Replicate, DTensor
-from torch.distributed._tensor.parallel.style import (
+from torch.distributed.tensor.parallel.style import (
     ColwiseParallel,
     PairwiseParallel,
     ParallelStyle,
     RowwiseParallel,
 )
-from torch.distributed._tensor.parallel.api import (
+from torch.distributed.tensor.parallel.api import (
     _parallelize_linear,
     _parallelize_mlp,
 )
-from torch.distributed._tensor.parallel.utils import _create_1d_device_mesh
-from torch.distributed._tensor.parallel.style import (
+from torch.distributed.tensor.parallel._utils import _create_1d_device_mesh
+from torch.distributed.tensor.parallel.style import (
     make_input_replicate_1d,
     make_output_replicate_1d,
 )
diff --git a/test/distributed/_tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py
similarity index 99%
rename from test/distributed/_tensor/parallel/test_tp_examples.py
rename to test/distributed/tensor/parallel/test_tp_examples.py
index 73cf9e05b223e..963ea797fd839 100644
--- a/test/distributed/_tensor/parallel/test_tp_examples.py
+++ b/test/distributed/tensor/parallel/test_tp_examples.py
@@ -14,7 +14,7 @@
     DeviceMesh,
     Replicate,
 )
-from torch.distributed._tensor.parallel import (
+from torch.distributed.tensor.parallel import (
     PairwiseParallel,
     TensorParallelMultiheadAttention,
     parallelize_module,
diff --git a/test/distributed/_tensor/parallel/test_tp_style.py b/test/distributed/tensor/parallel/test_tp_style.py
similarity index 99%
rename from test/distributed/_tensor/parallel/test_tp_style.py
rename to test/distributed/tensor/parallel/test_tp_style.py
index 0562c6713da46..d7ad841281cbd 100644
--- a/test/distributed/_tensor/parallel/test_tp_style.py
+++ b/test/distributed/tensor/parallel/test_tp_style.py
@@ -5,7 +5,7 @@
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase, with_comms
 from torch.distributed._tensor import distribute_tensor, DeviceMesh, Shard, Replicate
-from torch.distributed._tensor.parallel.style import (
+from torch.distributed.tensor.parallel.style import (
     RowwiseParallel,
     ColwiseParallel,
     make_input_shard_1d,
diff --git a/test/distributed/_tensor/parallel/test_view_sharding_dim_change.py b/test/distributed/tensor/parallel/test_view_sharding_dim_change.py
similarity index 93%
rename from test/distributed/_tensor/parallel/test_view_sharding_dim_change.py
rename to test/distributed/tensor/parallel/test_view_sharding_dim_change.py
index 4648d930b9eb6..b02382e20bccd 100644
--- a/test/distributed/_tensor/parallel/test_view_sharding_dim_change.py
+++ b/test/distributed/tensor/parallel/test_view_sharding_dim_change.py
@@ -8,7 +8,7 @@
     with_comms,
 )
 from torch.distributed._tensor import DeviceMesh, DTensor, Shard
-from torch.distributed._tensor.parallel._view_with_dim_change import (
+from torch.distributed.tensor.parallel._view_with_dim_change import (
     _view_with_sharding_dim_change,
 )
 
diff --git a/torch/distributed/tensor/__init__.py b/torch/distributed/tensor/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/torch/distributed/_tensor/parallel/__init__.py b/torch/distributed/tensor/parallel/__init__.py
similarity index 80%
rename from torch/distributed/_tensor/parallel/__init__.py
rename to torch/distributed/tensor/parallel/__init__.py
index bf050d57d1697..fc0c760d6ccdc 100644
--- a/torch/distributed/_tensor/parallel/__init__.py
+++ b/torch/distributed/tensor/parallel/__init__.py
@@ -1,9 +1,9 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
-from torch.distributed._tensor.parallel.multihead_attention_tp import (
+from torch.distributed.tensor.parallel.multihead_attention_tp import (
     TensorParallelMultiheadAttention,
 )
 
-from torch.distributed._tensor.parallel.style import (
+from torch.distributed.tensor.parallel.style import (
     ColwiseParallel,
     PairwiseParallel,
     ParallelStyle,
@@ -16,7 +16,7 @@
     make_output_tensor,
 )
 
-from torch.distributed._tensor.parallel.api import (
+from torch.distributed.tensor.parallel.api import (
     parallelize_module,
 )
 
diff --git a/torch/distributed/_tensor/parallel/utils.py b/torch/distributed/tensor/parallel/_utils.py
similarity index 100%
rename from torch/distributed/_tensor/parallel/utils.py
rename to torch/distributed/tensor/parallel/_utils.py
diff --git a/torch/distributed/_tensor/parallel/_view_with_dim_change.py b/torch/distributed/tensor/parallel/_view_with_dim_change.py
similarity index 100%
rename from torch/distributed/_tensor/parallel/_view_with_dim_change.py
rename to torch/distributed/tensor/parallel/_view_with_dim_change.py
diff --git a/torch/distributed/_tensor/parallel/api.py b/torch/distributed/tensor/parallel/api.py
similarity index 98%
rename from torch/distributed/_tensor/parallel/api.py
rename to torch/distributed/tensor/parallel/api.py
index a1c513078b95a..e1f2328de5964 100644
--- a/torch/distributed/_tensor/parallel/api.py
+++ b/torch/distributed/tensor/parallel/api.py
@@ -9,14 +9,14 @@
     Replicate,
     DeviceMesh,
 )
-from torch.distributed._tensor.parallel import TensorParallelMultiheadAttention
-from torch.distributed._tensor.parallel.style import (
+from torch.distributed.tensor.parallel import TensorParallelMultiheadAttention
+from torch.distributed.tensor.parallel.style import (
     ColwiseParallel,
     PairwiseParallel,
     ParallelStyle,
     RowwiseParallel,
 )
-from torch.distributed._tensor.parallel.utils import _create_1d_device_mesh
+from torch.distributed.tensor.parallel._utils import _create_1d_device_mesh
 
 
 __all__ = [
diff --git a/torch/distributed/_tensor/parallel/fsdp.py b/torch/distributed/tensor/parallel/fsdp.py
similarity index 100%
rename from torch/distributed/_tensor/parallel/fsdp.py
rename to torch/distributed/tensor/parallel/fsdp.py
diff --git a/torch/distributed/_tensor/parallel/multihead_attention_tp.py b/torch/distributed/tensor/parallel/multihead_attention_tp.py
similarity index 98%
rename from torch/distributed/_tensor/parallel/multihead_attention_tp.py
rename to torch/distributed/tensor/parallel/multihead_attention_tp.py
index 3071f42632fd5..03dc25161bdf8 100644
--- a/torch/distributed/_tensor/parallel/multihead_attention_tp.py
+++ b/torch/distributed/tensor/parallel/multihead_attention_tp.py
@@ -6,12 +6,14 @@
 import torch
 from torch.distributed._tensor import DTensor as DT
 from torch.distributed._tensor.placement_types import Shard
-from torch.distributed._tensor.parallel._view_with_dim_change import (
+from torch.distributed.tensor.parallel._view_with_dim_change import (
     _view_with_sharding_dim_change,
 )
 
 from typing import Optional, Union
 
+__all__ = ["TensorParallelMultiheadAttention"]
+
 
 # TODO: Add a test to test equivalence between our Multihead Attention
 # with other mainstream ones (Megatron-LM or PyTorch).
diff --git a/torch/distributed/_tensor/parallel/style.py b/torch/distributed/tensor/parallel/style.py
similarity index 96%
rename from torch/distributed/_tensor/parallel/style.py
rename to torch/distributed/tensor/parallel/style.py
index e414cb0dc09d5..05ac1db708c1c 100644
--- a/torch/distributed/_tensor/parallel/style.py
+++ b/torch/distributed/tensor/parallel/style.py
@@ -4,13 +4,26 @@
 from abc import ABC
 from typing import Union, Optional
 from torch.distributed._tensor import DTensor, Shard, Replicate, DeviceMesh
-from torch.distributed._tensor.parallel.utils import (
+from torch.distributed.tensor.parallel._utils import (
     _PrepareInputType,
     _PrepareOutputType,
     _prepare_input_validate,
     _prepare_output_validate,
 )
 
+__all__ = [
+    "ParallelStyle",
+    "RowwiseParallel",
+    "ColwiseParallel",
+    "PairwiseParallel",
+    "make_input_replicate_1d",
+    "make_input_shard_1d",
+    "make_input_shard_1d_dim_last",
+    "make_output_replicate_1d",
+    "make_output_tensor",
+    "make_output_shard_1d"
+]
+
 
 class ParallelStyle(ABC):
     """

From 9363dde1b7ff529dea796912ed819575bf4bf3c7 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 30 Nov 2022 14:42:27 -0500
Subject: [PATCH 1446/1922] [EASY] Replace direct use of Guard ctor with
 make_guard (#89945)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89945
Approved by: https://github.com/albanD
---
 torch/_dynamo/variables/user_defined.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/torch/_dynamo/variables/user_defined.py b/torch/_dynamo/variables/user_defined.py
index 8cc9528ed67c4..d86969d83774d 100644
--- a/torch/_dynamo/variables/user_defined.py
+++ b/torch/_dynamo/variables/user_defined.py
@@ -12,7 +12,7 @@
 
 from .. import variables
 from ..exc import unimplemented
-from ..guards import Guard, GuardBuilder
+from ..guards import GuardBuilder
 from ..source import AttrSource, ODictGetItemSource, RandomValueSource
 from ..utils import is_namedtuple_cls, namedtuple_fields
 from .base import MutableLocal, VariableTracker
@@ -178,13 +178,7 @@ def call_method(
                 assert all(map(ConstantVariable.is_literal, keys))
                 return TupleVariable(
                     [ConstantVariable(k, **options) for k in keys], **options
-                ).add_guard(
-                    Guard(
-                        self.source.name(),
-                        self.source.guard_source(),
-                        GuardBuilder.ODICT_KEYS,
-                    )
-                )
+                ).add_guard(self.source.make_guard(GuardBuilder.ODICT_KEYS))
 
             if (
                 method is collections.OrderedDict.items

From 1f47a4e293143e22f1c1db74f2667a8527e8bbd5 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 29 Nov 2022 18:53:32 -0800
Subject: [PATCH 1447/1922] Correct the label for quantization PRs (#89888)

Summary:
att

Test Plan:
NA

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89888
Approved by: https://github.com/andrewor14
---
 .github/labeler.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 6d3902e31da67..14f1765462569 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -47,8 +47,9 @@
 "NNC":
 - torch/csrc/jit/tensorexpr/**
 
-"oncall: quantization":
+"release notes: quantization":
 - torch/ao/quantization/**
 - torch/quantization/**
 - aten/src/ATen/quantized/**
 - aten/src/ATen/native/quantized/cpu/**
+- test/quantization/**

From ea4e084b9a8bdab92cd85ea850673f61759acece Mon Sep 17 00:00:00 2001
From: Dmitry Tomshin <dmitry.tomshin@gmail.com>
Date: Wed, 30 Nov 2022 23:42:53 +0000
Subject: [PATCH 1448/1922] Issue 68576 prefetch factor docstring changes
 (#89874)

Fixes #68576

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89874
Approved by: https://github.com/kit1980
---
 torch/utils/data/dataloader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py
index c836c9fa975f6..c86ac8813f9df 100644
--- a/torch/utils/data/dataloader.py
+++ b/torch/utils/data/dataloader.py
@@ -175,7 +175,9 @@ class DataLoader(Generic[T_co]):
             `base_seed` for workers. (default: ``None``)
         prefetch_factor (int, optional, keyword-only arg): Number of batches loaded
             in advance by each worker. ``2`` means there will be a total of
-            2 * num_workers batches prefetched across all workers. (default: ``2``)
+            2 * num_workers batches prefetched across all workers. (default value depends
+            on the set value for num_workers. If value of num_workers=0 default is ``None``.
+            Otherwise if value of num_workers>0 default is ``2``).
         persistent_workers (bool, optional): If ``True``, the data loader will not shutdown
             the worker processes after a dataset has been consumed once. This allows to
             maintain the workers `Dataset` instances alive. (default: ``False``)

From 37d9476ebc04421f2253ea02aaac5878c6fac442 Mon Sep 17 00:00:00 2001
From: Pearu Peterson <pearu.peterson@gmail.com>
Date: Wed, 30 Nov 2022 19:53:56 +0200
Subject: [PATCH 1449/1922] Add device note to the docs of sparse tensor
 factory functions (#89910)

Fixes #89402

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89910
Approved by: https://github.com/amjames, https://github.com/cpuhrsch
---
 torch/_torch_docs.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 34195f938b40a..af848690de50f 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -108,6 +108,16 @@ def merge_dicts(*dicts):
         returned Tensor. Default: ``torch.contiguous_format``.
 """
     ),
+    {
+        "sparse_factory_device_note": """\
+.. note::
+
+   If the ``device`` argument is not specified the device of the given
+   :attr:`values` and indices tensor(s) must match. If, however, the
+   argument is specified the input Tensors will be converted to the
+   given device and in turn determine the device of the constructed
+   sparse tensor."""
+    },
 )
 
 factory_like_common_args = parse_kwargs(
@@ -10156,6 +10166,8 @@ def merge_dicts(*dicts):
 have a look at :ref:`the note on the data type of the indices
 <sparse-compressed-docs>`.
 
+{sparse_factory_device_note}
+
 Args:
     compressed_indices (array_like): (B+1)-dimensional array of size
         ``(*batchsize, compressed_dim_size + 1)``.  The last element of
@@ -10168,10 +10180,12 @@ def merge_dicts(*dicts):
     plain_indices (array_like): Plain dimension (column or row)
         co-ordinates of each element or block in values. (B+1)-dimensional
         tensor with the same length as values.
+
     values (array_list): Initial values for the tensor. Can be a list,
         tuple, NumPy ``ndarray``, scalar, and other types.  that
-        represents a (1+K)-dimensional or (1+2+K)-dimensional tensor
-        where ``K`` is the number of dense dimensions.
+        represents a (1+K)-dimensional (for CSR and CSC layouts) or
+        (1+2+K)-dimensional tensor (for BSR and BSC layouts) where
+        ``K`` is the number of dense dimensions.
     size (list, tuple, :class:`torch.Size`, optional): Size of the
         sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols *
         blocksize[1], *densesize)`` where ``blocksize[0] ==
@@ -10221,6 +10235,8 @@ def merge_dicts(*dicts):
 in CSR format are typically faster than that for sparse tensors in COO format. Make you have a look
 at :ref:`the note on the data type of the indices <sparse-csr-docs>`.
 
+{sparse_factory_device_note}
+
 Args:
     crow_indices (array_like): (B+1)-dimensional array of size
         ``(*batchsize, nrows + 1)``.  The last element of each batch
@@ -10281,6 +10297,8 @@ def merge_dicts(*dicts):
 for sparse tensors in COO format. Make you have a look at :ref:`the
 note on the data type of the indices <sparse-csc-docs>`.
 
+{sparse_factory_device_note}
+
 Args:
     ccol_indices (array_like): (B+1)-dimensional array of size
         ``(*batchsize, ncols + 1)``.  The last element of each batch
@@ -10341,6 +10359,8 @@ def merge_dicts(*dicts):
 for sparse tensors in COO format. Make you have a look at :ref:`the
 note on the data type of the indices <sparse-bsr-docs>`.
 
+{sparse_factory_device_note}
+
 Args:
     crow_indices (array_like): (B+1)-dimensional array of size
         ``(*batchsize, nrowblocks + 1)``.  The last element of each
@@ -10406,6 +10426,8 @@ def merge_dicts(*dicts):
 for sparse tensors in COO format. Make you have a look at :ref:`the
 note on the data type of the indices <sparse-bsc-docs>`.
 
+{sparse_factory_device_note}
+
 Args:
     ccol_indices (array_like): (B+1)-dimensional array of size
         ``(*batchsize, ncolblocks + 1)``. The last element of each
@@ -10471,6 +10493,8 @@ def merge_dicts(*dicts):
 
    This function returns an :ref:`uncoalesced tensor <sparse-uncoalesced-coo-docs>`.
 
+{sparse_factory_device_note}
+
 Args:
     indices (array_like): Initial data for the tensor. Can be a list, tuple,
         NumPy ``ndarray``, scalar, and other types. Will be cast to a :class:`torch.LongTensor`

From 676d521a9164b61b376fc5d722d8b2e3202e764b Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 30 Nov 2022 15:38:32 -0500
Subject: [PATCH 1450/1922] Implement gamma cdf (#89955)

Authored by tillahoffmann originally at https://github.com/pytorch/pytorch/pull/72518

Implements the cumulative distribution function for the gamma distribution. The tests needed a small adjustment to pass because gradients cannot be evaluated with respect to the first argument of the incomplete gamma function (and they're not needed for the test).

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89955
Approved by: https://github.com/wconstab, https://github.com/malfet
---
 test/distributions/test_distributions.py | 3 +++
 torch/distributions/gamma.py             | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py
index 219eacf4790b0..84236b5b51e02 100644
--- a/test/distributions/test_distributions.py
+++ b/test/distributions/test_distributions.py
@@ -2975,6 +2975,9 @@ def test_cdf_log_prob(self):
         # Tests if the differentiation of the CDF gives the PDF at a given value
         for Dist, params in EXAMPLES:
             for i, param in enumerate(params):
+                # We do not need grads wrt params here, e.g. shape of gamma distribution.
+                param = {key: value.detach() if isinstance(value, torch.Tensor) else value
+                         for key, value in param.items()}
                 dist = Dist(**param)
                 samples = dist.sample()
                 if not dist.support.is_discrete:
diff --git a/torch/distributions/gamma.py b/torch/distributions/gamma.py
index fbe497d95daf7..d6522b202d231 100644
--- a/torch/distributions/gamma.py
+++ b/torch/distributions/gamma.py
@@ -86,3 +86,8 @@ def _natural_params(self):
 
     def _log_normalizer(self, x, y):
         return torch.lgamma(x + 1) + (x + 1) * torch.log(-y.reciprocal())
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return torch.special.gammainc(self.concentration, self.rate * value)

From 27b28b6485f3f0806e05f8a0ddb320790c279402 Mon Sep 17 00:00:00 2001
From: Iris <wz337@cornell.edu>
Date: Thu, 1 Dec 2022 01:55:51 +0000
Subject: [PATCH 1451/1922] [Checkpoint] Minor update to checkpoint utils
 (#89964)

Change to only print temp directory once on rank0.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89964
Approved by: https://github.com/XilunWu
---
 torch/testing/_internal/distributed/checkpoint_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch/testing/_internal/distributed/checkpoint_utils.py b/torch/testing/_internal/distributed/checkpoint_utils.py
index d72810580335d..bcdcb7b2818f3 100644
--- a/torch/testing/_internal/distributed/checkpoint_utils.py
+++ b/torch/testing/_internal/distributed/checkpoint_utils.py
@@ -21,13 +21,15 @@ def wrapper(self, *args: Tuple[object], **kwargs: Dict[str, Any]) -> None:
         # Only create temp_dir when rank is 0
         if dist.get_rank() == 0:
             temp_dir = tempfile.mkdtemp()
+            print(f"Using temp directory: {self.temp_dir }")
         else:
             temp_dir = ""
         object_list = [temp_dir]
+
         # Broadcast temp_dir to all the other ranks
         dist.broadcast_object_list(object_list)
         self.temp_dir = object_list[0]
-        print(f"Using temp directory: {self.temp_dir }")
+
         try:
             func(self)
         finally:

From dac4322199ffd85063b507dd1c26b87f101a32f5 Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Tue, 29 Nov 2022 14:01:38 +0800
Subject: [PATCH 1452/1922] RowwiseMoments: use float as acc type for bfloat16
 inputs (#84405)

To fix https://github.com/pytorch/pytorch/issues/77507

Originally `utils::RowwiseMoments<BFloat16>` will still accululate on BFloat16,
which is not only slow but also introducing additional rounding errors.

This patch will do accumulation on float for the bfloat16 inputs:
each of bfloat16 vec (size 16) will be converted to two float vec (size 8),
and accumulated on m1(mean) and m2(rstd) vecs which are all float vecs.

No effect on float performance, will improve bfloat16 performance:
* avx512 single socket:
```
before: LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 0.210 ms; bf16: 0.770 ms
after:  LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 0.215 ms; bf16: 0.178 ms
```
* avx512 single core:
```
before: LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 2.661 ms; bf16: 12.267 ms
after:  LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 2.618 ms; bf16: 2.309 ms
```
* avx2 single socket:
```
before: LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 0.540 ms; bf16: 2.030 ms
after:  LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 0.527 ms; bf16: 0.458 ms
```
* avx2 single core:
```
before: LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 4.349 ms; bf16: 19.252 ms
after:  LayerNorm((1024,), eps=1e-05, elementwise_affine=True) : 32x128x1024: fp32: 4.416 ms; bf16: 3.524 ms
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84405
Approved by: https://github.com/jgong5
---
 .../src/ATen/native/cpu/group_norm_kernel.cpp | 12 ++-
 .../src/ATen/native/cpu/layer_norm_kernel.cpp |  4 +-
 aten/src/ATen/native/cpu/moments_utils.h      | 99 ++++++++++++++-----
 3 files changed, 83 insertions(+), 32 deletions(-)

diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp
index a8dc08aa6dab8..6f40e13f3256f 100644
--- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp
@@ -52,13 +52,15 @@ void GroupNormKernelImplInternal(
   const bool beta_null = beta_data == nullptr;
   const int64_t inner_size = D * HxW;
 
+  using T_ACC = vec::vec_scalar_t<T>;
+
   at::parallel_for(0, N * G, 1, [&](int64_t start, int64_t end) {
     for (const auto i : c10::irange(start, end)) {
       const T* X_ptr = X_data + i * inner_size;
-      T mean_val;
-      T rstd_val;
+      T_ACC mean_val;
+      T_ACC rstd_val;
       std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, inner_size);
-      rstd_val = T(1) / std::sqrt(std::max(rstd_val, T(0)) + eps);
+      rstd_val = T_ACC(1) / std::sqrt(std::max(rstd_val, T_ACC(0)) + eps);
       if (gamma_null && beta_null) {
         T* Y_ptr = Y_data + i * inner_size;
         for (const auto j : c10::irange(inner_size)) {
@@ -68,8 +70,8 @@ void GroupNormKernelImplInternal(
         const int64_t g = i % G;
         for (const auto j : c10::irange(D)) {
           const int64_t c = g * D + j;
-          const T scale = rstd_val * (gamma_null ? T(1) : gamma_data[c]);
-          const T bias = -scale * mean_val + (beta_null ? T(0) : beta_data[c]);
+          const T_ACC scale = rstd_val * (gamma_null ? T(1) : gamma_data[c]);
+          const T_ACC bias = -scale * mean_val + (beta_null ? T(0) : beta_data[c]);
           X_ptr = X_data + (i * D + j) * HxW;
           T* Y_ptr = Y_data + (i * D + j) * HxW;
           for (const auto k : c10::irange(HxW)) {
diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
index 22cd0b69559e7..d0cdf3ad60199 100644
--- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
@@ -53,8 +53,8 @@ void LayerNormKernelImplInternal(
     for (const auto i : c10::irange(start, end)) {
       const T* X_ptr = X_data + i * N;
       T* Y_ptr = Y_data + i * N;
-      T mean_val;
-      T rstd_val;
+      T_ACC mean_val;
+      T_ACC rstd_val;
       std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, N);
       rstd_val = T(1) / std::sqrt(rstd_val + eps);
       const T_ACC scale = rstd_val;
diff --git a/aten/src/ATen/native/cpu/moments_utils.h b/aten/src/ATen/native/cpu/moments_utils.h
index 58e0e48682924..8afd3612abb64 100644
--- a/aten/src/ATen/native/cpu/moments_utils.h
+++ b/aten/src/ATen/native/cpu/moments_utils.h
@@ -16,6 +16,8 @@ namespace at {
 namespace native {
 inline namespace CPU_CAPABILITY {
 
+template<typename T> using acc_t = vec::vec_scalar_t<T>;
+
 constexpr int64_t kChunkSize = 16;
 
 template <typename T>
@@ -52,20 +54,71 @@ C10_ALWAYS_INLINE void AddMomentsVec(
   m0 = n;
 }
 
+template <typename T>
+inline void UpdateMomentsVec(
+    int64_t m0,
+    const T* X_ptr,
+    const std::array<vec::Vectorized<acc_t<T>>, kChunkSize>& c_vecs,
+    int64_t& m0_stk0,
+    vec::Vectorized<acc_t<T>>& m1_stk0,
+    vec::Vectorized<acc_t<T>>& m2_stk0) {
+  using Vec = vec::Vectorized<acc_t<T>>;
+  Vec m1_vec(0);
+  Vec m2_vec(0);
+  for (const auto j : c10::irange(m0)) {
+    const Vec x_vec = Vec::loadu(X_ptr + j * Vec::size());
+    const Vec delta_vec = x_vec - m1_vec;
+    m1_vec += delta_vec * c_vecs[j];
+    m2_vec += delta_vec * (x_vec - m1_vec);
+  }
+  AddMomentsVec(m0, m1_vec, m2_vec, m0_stk0, m1_stk0, m2_stk0);
+}
+
+// each bfloat16 vector will be converted to two float vectors,
+// and accumulated successively on m1_stk0/m2_stk0.
+template <>
+inline void UpdateMomentsVec<BFloat16>(
+    int64_t m0,
+    const BFloat16* X_ptr,
+    const std::array<vec::Vectorized<float>, kChunkSize>& c_vecs,
+    int64_t& m0_stk0,
+    vec::Vectorized<float>& m1_stk0,
+    vec::Vectorized<float>& m2_stk0) {
+  using bVec = vec::Vectorized<BFloat16>;
+  using fVec = vec::Vectorized<float>;
+  fVec m1_fvec0(0), m1_fvec1(0);
+  fVec m2_fvec0(0), m2_fvec1(0);
+  for (const auto j : c10::irange(m0)) {
+    const bVec x_bvec = bVec::loadu(X_ptr + j * bVec::size());
+    fVec x_fvec0, x_fvec1;
+    std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec);
+    const fVec delta_fvec0 = x_fvec0 - m1_fvec0;
+    const fVec delta_fvec1 = x_fvec1 - m1_fvec1;
+    m1_fvec0 += delta_fvec0 * c_vecs[j];
+    m1_fvec1 += delta_fvec1 * c_vecs[j];
+    m2_fvec0 += delta_fvec0 * (x_fvec0 - m1_fvec0);
+    m2_fvec1 += delta_fvec1 * (x_fvec1 - m1_fvec1);
+  }
+  AddMomentsVec(m0, m1_fvec0, m2_fvec0, m0_stk0, m1_stk0, m2_stk0);
+  AddMomentsVec(m0, m1_fvec1, m2_fvec1, m0_stk0, m1_stk0, m2_stk0);
+}
+
 // Compute rowwise moments by Welford algorithm and cascade sum to improve
 // numerical stability.
 // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 // https://en.wikipedia.org/wiki/Pairwise_summation
 template <typename T, int64_t kMaxDepth>
-std::pair<T, T> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
-  using Vec = vec::Vectorized<T>;
+std::pair<acc_t<T>, acc_t<T>> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
+  using T_ACC = acc_t<T>;
 
-  constexpr int64_t kVecSize = Vec::size();
+  constexpr int64_t kVecSize = vec::Vectorized<T>::size();
+  constexpr int64_t kAccVecSize = vec::Vectorized<T_ACC>::size();
   const int64_t n = N / kVecSize;
   const int64_t m = divup(n, kChunkSize);
   const int64_t depth = utils::CeilLog2(m);
 
-  const Vec kZeroVec(T(0));
+  using Vec = vec::Vectorized<T_ACC>;
+  const Vec kZeroVec(T_ACC(0));
   c10::SmallVector<int64_t, kMaxDepth> m0_stk(depth, 0);
   c10::SmallVector<Vec, kMaxDepth> m1_stk(depth, kZeroVec);
   c10::SmallVector<Vec, kMaxDepth> m2_stk(depth, kZeroVec);
@@ -76,19 +129,12 @@ std::pair<T, T> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
     static std::array<Vec, kChunkSize> c_vecs = ([]() {
       std::array<Vec, kChunkSize> result;
       for (const auto i : c10::irange(kChunkSize)) {
-        result[i] = Vec(T(1) / static_cast<T>(i + 1));
+        result[i] = Vec(T_ACC(1) / static_cast<T_ACC>(i + 1));
       }
       return result;
     })();
-    Vec m1_vec(0);
-    Vec m2_vec(0);
-    for (const auto j : c10::irange(m0)) {
-      const Vec x_vec = Vec::loadu(X_ptr + j * kVecSize);
-      const Vec delta_vec = x_vec - m1_vec;
-      m1_vec += delta_vec * c_vecs[j];
-      m2_vec += delta_vec * (x_vec - m1_vec);
-    }
-    AddMomentsVec(m0, m1_vec, m2_vec, m0_stk[0], m1_stk[0], m2_stk[0]);
+    UpdateMomentsVec(m0, X_ptr, c_vecs, m0_stk[0], m1_stk[0], m2_stk[0]);
+
     int64_t mask = i + 1;
     for (int64_t j = 1; j < depth && (mask & 1) == 0; ++j) {
       AddMomentsVec(
@@ -109,29 +155,32 @@ std::pair<T, T> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
         m0_stk[i], m1_stk[i], m2_stk[i], m0_stk[0], m1_stk[0], m2_stk[0]);
   }
 
-  std::array<T, kVecSize> m1_arr{};
-  std::array<T, kVecSize> m2_arr{};
+  std::array<T_ACC, kAccVecSize> m1_arr{};
+  std::array<T_ACC, kAccVecSize> m2_arr{};
   m1_stk[0].store(m1_arr.data());
   m2_stk[0].store(m2_arr.data());
 
   int64_t m0 = 0;
-  T m1 = 0;
-  T m2 = 0;
+  T_ACC m1 = 0;
+  T_ACC m2 = 0;
   for (int64_t i = n * kVecSize; i < N; ++i) {
-    const T delta = X[i] - m1;
+    T_ACC x = static_cast<T_ACC>(X[i]);
+    const T_ACC delta = x - m1;
     ++m0;
-    m1 += delta / static_cast<T>(m0);
-    m2 += delta * (X[i] - m1);
+    m1 += delta / static_cast<T_ACC>(m0);
+    m2 += delta * (x - m1);
   }
-  for (const auto i : c10::irange(kVecSize)) {
-    AddMoments(n, m1_arr[i], m2_arr[i], m0, m1, m2);
+  // for BFloat16, each vector in m1_arr/m2_arr holds 2*n accumulated result
+  int64_t m0_add = n * kVecSize / kAccVecSize;
+  for (const auto i : c10::irange(kAccVecSize)) {
+    AddMoments(m0_add, m1_arr[i], m2_arr[i], m0, m1, m2);
   }
 
-  return std::make_pair(m1, m2 / static_cast<T>(N - ddof));
+  return std::make_pair(m1, m2 / static_cast<T_ACC>(N - ddof));
 }
 
 template <typename T>
-std::pair<T, T> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
+std::pair<acc_t<T>, acc_t<T>> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
   using Vec = vec::Vectorized<T>;
   constexpr int64_t kVecSize = Vec::size();
   const int64_t n = N / kVecSize;

From 3a401330364981b1ce295b320d95c608186e48c6 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Wed, 30 Nov 2022 01:05:06 +0000
Subject: [PATCH 1453/1922] fix mkldnn quantization issue for weight reorder
 error (#86876)

Differential Revision: [D40351062](https://our.internmc.facebook.com/intern/diff/D40351062)

For mkldnn quantization path, we will do weight prepack using dummy data to query the expected weight format, the packed weight's format may differ from the real input case(the weight format depends on the input's shape), and there will have a block weight to block weight reorder if the packed weight format differs with the expected weight format.  The mkldnn may meet the following issue when doing such reorder(test on ICX machine):

```
test_conv_reorder_issue_onednn
    torch.ops.quantized.conv2d(qx, w_packed, output_scale=1.0, output_zero_point=0)
  File "/home/weiwen/.conda/envs/int8-dev/lib/python3.9/site-packages/torch/_ops.py", line 472, in __call__
    return self._op(*args, **kwargs or {})
RuntimeError: could not create a primitive descriptor for a reorder primitive
```

This PR will fix it: if the block weight to block weight reorder is failed, we will reorder the block weight to plain weight first, and then reorder the plain weight to the target block weight.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86876
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
---
 aten/src/ATen/native/quantized/cpu/qconv.cpp | 13 ++++++---
 test/quantization/core/test_quantized_op.py  | 29 ++++++++++++++++++++
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp
index b6fa57b9e3ede..31945234f2a9a 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp
@@ -1329,14 +1329,19 @@ at::Tensor PackedConvWeightsOnednn<kSpatialDim>::apply_impl(
       ideep::convolution_forward::compute(
           pd, primitive, src, weights, expected_bias, dst, src_zp_tensor, groups());
     } else {
-      ideep::convolution_forward::compute_v2(
-          src, weights, b, dst_dims, dst,
+      src.set_zero_point(src_zero_points);
+      dst.set_zero_point(dst_zero_points);
+      ConvParams params;
+      ideep::convolution_forward::prepare(
+          params, src, weights, b, dst_dims, dst,
           strides, dilates, padding_l, padding_r, groups(),
           src_scales, weights_scales, ideep::scale_t(scale_size, inv_output_scale),
-          src_zero_points, dst_zero_points, op_attr,
-          dnnl::algorithm::convolution_direct,
+          op_attr, dnnl::algorithm::convolution_direct,
           dnnl::prop_kind::forward_inference,
           ideep::u8s8, ideep::engine::cpu_engine());
+      onednn_utils::try_reorder(
+            weights, (ideep::tensor::desc)params.pd.weights_desc(), weights_scales);
+      ideep::convolution_forward::compute(params, src, weights, b, dst);
     }
   }
   return output;
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index a63acc99383b3..116a76a2d1f5e 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -5454,6 +5454,35 @@ def test_qconv3d_unpack(
             (stride_d, stride_h, stride_w), (pad_d, pad_h, pad_w), (o_pad, o_pad, o_pad),
             channelwise)
 
+    def test_conv_reorder_issue_onednn(self):
+        """ Ensure reorder failure issue in conv is fixed for onednn backend.
+            Onednn backend used to encounter reorder failure
+            when running conv with dynamic input shapes.
+            Solved by https://github.com/pytorch/pytorch/pull/86876
+        """
+        if 'onednn' not in supported_qengines:
+            return
+        with override_quantized_engine('onednn'):
+            bs = 1
+            ic, oc = 128, 512
+            kh, kw = 1, 1
+            ih, iw = 28, 28
+            bias = None
+            strides, paddings, dilates, groups = (1, 1), (0, 0), (1, 1), 1
+            w = torch.randn((oc, ic, kh, kw))
+            qw = torch.quantize_per_tensor(w, scale=1.0, zero_point=0, dtype=torch.qint8)
+            x = torch.randn((bs, ic, ih, iw))
+            qx = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)
+            w_packed = torch.ops.quantized.conv2d_prepack(
+                qw, bias, strides, paddings, dilates, groups
+            )
+            torch.ops.quantized.conv2d(qx, w_packed, output_scale=1.0, output_zero_point=0)
+            ih, iw = 5, 4
+            x = torch.randn((bs, ic, ih, iw))
+            qx = torch.quantize_per_tensor(x, scale=1.0, zero_point=0, dtype=torch.quint8)
+            # The following should pass when input shape is changed
+            torch.ops.quantized.conv2d(qx, w_packed, output_scale=1.0, output_zero_point=0)
+
 class TestPadding(TestCase):
     @given(batch_size=st.integers(1, 64),
            channels=st.integers(1, 64),

From 7dbcf67a31f0bf7770d6127a6533845bc7095b30 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Mon, 28 Nov 2022 20:15:12 -0500
Subject: [PATCH 1454/1922] quantization: make x86 as default backend (#88799)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88799
Approved by: https://github.com/kit1980
---
 aten/src/ATen/Context.cpp                |  2 +-
 torch/ao/quantization/qconfig.py         | 12 ++++++------
 torch/ao/quantization/qconfig_mapping.py |  8 ++++----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
index 7086a05ab6c7a..b391cd4aab904 100644
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@@ -332,8 +332,8 @@ const std::vector<at::QEngine>& Context::supportedQEngines() {
 
 #ifdef USE_FBGEMM
     if (fbgemm::fbgemmSupportedCPU()) {
-      // The X86 qengine is available if and only if FBGEMM is available
       engines.push_back(at::kX86);
+      // The X86 qengine is available if and only if FBGEMM is available
       engines.push_back(at::kFBGEMM);
     }
 #endif
diff --git a/torch/ao/quantization/qconfig.py b/torch/ao/quantization/qconfig.py
index 09fa02ff3ddb2..2dec48498aa58 100644
--- a/torch/ao/quantization/qconfig.py
+++ b/torch/ao/quantization/qconfig.py
@@ -218,13 +218,13 @@ def __new__(cls, activation=torch.nn.Identity, weight=torch.nn.Identity):
 Default qconfig for operators that reuse the observers from input Tensor, e.g. reshape
 """
 
-def get_default_qconfig(backend='fbgemm', version=0):
+def get_default_qconfig(backend='x86', version=0):
     """
     Returns the default PTQ qconfig for the specified backend.
 
     Args:
       * `backend` (str): a string representing the target backend. Currently supports
-        `x86`, `fbgemm` (default), `qnnpack` and `onednn`.
+        `x86` (default), `fbgemm`, `qnnpack` and `onednn`.
 
     Return:
         qconfig
@@ -301,13 +301,13 @@ def get_default_qconfig(backend='fbgemm', version=0):
 default_embedding_qat_qconfig_4bit = QConfig(activation=NoopObserver.with_args(dtype=torch.float32),
                                              weight=default_embedding_fake_quant_4bit)
 
-def get_default_qat_qconfig(backend='fbgemm', version=1):
+def get_default_qat_qconfig(backend='x86', version=1):
     """
     Returns the default QAT qconfig for the specified backend.
 
     Args:
       * `backend` (str): a string representing the target backend. Currently supports
-        `x86`, `fbgemm` (default), `qnnpack` and `onednn`.
+        `x86` (default), `fbgemm`, `qnnpack` and `onednn`.
       * `version`: version, for backwards compatibility. Can be `None` or `1`.
 
     Return:
@@ -402,13 +402,13 @@ def get_default_qat_qconfig(backend='fbgemm', version=1):
                                                        eps=2 ** -12),
     weight=fused_per_channel_wt_fake_quant_range_neg_127_to_127)
 
-def get_default_qconfig_dict(backend='fbgemm', version=0):
+def get_default_qconfig_dict(backend='x86', version=0):
     warnings.warn(
         "torch.ao.quantization.get_default_qconfig_dict is deprecated and will be removed in "
         "a future version. Please use torch.ao.quantization.get_default_qconfig_mapping instead.")
     return torch.ao.quantization.get_default_qconfig_mapping(backend, version).to_dict()
 
-def get_default_qat_qconfig_dict(backend='fbgemm', version=1):
+def get_default_qat_qconfig_dict(backend='x86', version=1):
     warnings.warn(
         "torch.ao.quantization.get_default_qat_qconfig_dict is deprecated and will be removed in "
         "a future version. Please use torch.ao.quantization.get_default_qat_qconfig_mapping instead.")
diff --git a/torch/ao/quantization/qconfig_mapping.py b/torch/ao/quantization/qconfig_mapping.py
index 65c85d033c5f9..1957996526d91 100644
--- a/torch/ao/quantization/qconfig_mapping.py
+++ b/torch/ao/quantization/qconfig_mapping.py
@@ -122,25 +122,25 @@ def _get_default_qconfig_mapping(is_qat: bool, backend: str, version: int) -> QC
 
     return qconfig_mapping
 
-def get_default_qconfig_mapping(backend="fbgemm", version=0) -> QConfigMapping:
+def get_default_qconfig_mapping(backend="x86", version=0) -> QConfigMapping:
     """
     Return the default QConfigMapping for post training quantization.
 
     Args:
       * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be
-         one of ["x86", "fbgemm" (default), "qnnpack", "onednn"]
+         one of ["x86" (default), "fbgemm", "qnnpack", "onednn"]
       * ``version`` (int) : the version for the default qconfig mapping
     """
     # TODO: add assert for backend choices
     return _get_default_qconfig_mapping(False, backend, version)
 
-def get_default_qat_qconfig_mapping(backend="fbgemm", version=1) -> QConfigMapping:
+def get_default_qat_qconfig_mapping(backend="x86", version=1) -> QConfigMapping:
     """
     Return the default QConfigMapping for quantization aware training.
 
     Args:
       * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be
-         one of ["x86", "fbgemm" (default), "qnnpack", "onednn"]
+         one of ["x86" (default), "fbgemm", "qnnpack", "onednn"]
       * ``version`` (int) : the version for the default qconfig mapping
     """
     return _get_default_qconfig_mapping(True, backend, version)

From dcb35ebc70b3867e1753097d2f78cb129fd78306 Mon Sep 17 00:00:00 2001
From: JackCaoG <jackcao@google.com>
Date: Thu, 1 Dec 2022 02:10:33 +0000
Subject: [PATCH 1455/1922] Minor fix for dynamo xla integration test (#89891)

Fix the test before I added them to the xla CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89891
Approved by: https://github.com/kit1980, https://github.com/shunting314
---
 test/dynamo/test_torchxla_integration.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/dynamo/test_torchxla_integration.py b/test/dynamo/test_torchxla_integration.py
index 70be4d8e87dcc..ecefed93e9370 100644
--- a/test/dynamo/test_torchxla_integration.py
+++ b/test/dynamo/test_torchxla_integration.py
@@ -104,6 +104,8 @@ def test_wrapper(self):
             xla_inputs_copy = copy.deepcopy(xla_inputs)
 
             expected = xla_module(*xla_inputs)
+            # make sure above lazy computation is executed.
+            xm.mark_step()
 
             actual = optimized_mod(*xla_inputs_copy)
 
@@ -117,7 +119,7 @@ def test_wrapper(self):
             # to handle inplace updates.
             if not allclose(xla_inputs, xla_inputs_copy):
                 print(
-                    f"Incorrect updated arguments at iter {i}. expected\n{rand_args}, actual\n{rand_args_copy}"
+                    f"Incorrect updated arguments at iter {i}. expected\n{xla_inputs}, actual\n{xla_inputs_copy}"
                 )
                 self.assertTrue(False)
 

From dc72c4b50e01149000a2f877265e0961a1dd6f55 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Thu, 1 Dec 2022 02:10:49 +0000
Subject: [PATCH 1456/1922] [Vulkan][TCC] Helper functions for vulkan quantized
 tests (#89922)

Summary: Helper functions for producing random inputs/scale/zero points and also computing suitable scale and zero points of a tensor, used in the testing of quantized ops.

Test Plan:
On Mac
```
cd ~/fbsource
buck1 run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck1 build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_quantized_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_quantized_api_test
adb shell "/data/local/tmp/vulkan_quantized_api_test"
```

Reviewed By: kimishpatel

Differential Revision: D41595034

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89922
Approved by: https://github.com/digantdesai
---
 .../ATen/test/vulkan_quantized_api_test.cpp   | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
index c30fac431d7bd..5419122692ee1 100644
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@@ -9,6 +9,8 @@
 #include <ATen/native/vulkan/ops/Copy.h>
 #include <ATen/native/vulkan/ops/Factory.h>
 #include <ATen/native/vulkan/ops/QuantizedFunctions.h>
+#include <ATen/native/quantized/cpu/QuantUtils.h>
+#include <string.h>
 
 #include <c10/util/irange.h>
 
@@ -109,6 +111,79 @@ inline std::vector<c10::IValue> callOpByName(
 
 namespace {
 
+double rand01() {
+  return (double)rand() / (double)RAND_MAX;
+}
+
+int64_t rand_pos_int(const int max_val) {
+  return 1 + int64_t(rand01() * (max_val - 1));
+}
+
+at::Tensor produce_random_tensor(
+    const at::IntArrayRef tensor_shape,
+    const float a,
+    const float b,
+    const float c) {
+  return (a + b * at::rand({1}, at::device(at::kCPU).dtype(at::kFloat))) *
+         (at::rand(tensor_shape, at::device(at::kCPU).dtype(at::kFloat)) - c);
+}
+
+double produce_random_scale(const double scale_min, const double scale_max) {
+  return rand01() * (scale_max - scale_min) + scale_min;
+}
+
+int64_t produce_random_zero_point(const c10::ScalarType dtype) {
+  int64_t zero_point;
+  switch (dtype) {
+    case c10::ScalarType::QUInt8:
+      zero_point = int64_t(rand01() * 255);
+      break;
+    case c10::ScalarType::QInt8:
+      zero_point = int64_t(rand01() * 255) - 127;
+      break;
+    case c10::ScalarType::QInt32:
+      zero_point = int64_t(rand01() * 100000) - 200000;
+      break;
+    default:
+      TORCH_CHECK(
+        false, "Vulkan quantization currently not supported for dtype ", dtype
+      );
+  }
+  return zero_point;
+}
+
+std::tuple<double, int64_t> compute_quant_params(
+    const at::Tensor tensor,
+    const c10::ScalarType dtype = c10::ScalarType::QUInt8) {
+  int zero_point_min;
+  int zero_point_max;
+  if (dtype == c10::ScalarType::QUInt8) {
+    zero_point_min = 0;
+    zero_point_max = 255;
+  } else if (dtype == c10::ScalarType::QInt8) {
+    zero_point_min = -128;
+    zero_point_max = 127;
+  } else {
+    TORCH_CHECK(false, "Computation of quant params only available for dtypes",
+                       "QUInt8 and QInt8");
+  }
+  const auto tensor_max = tensor.max().item<double>();
+  const auto tensor_min = tensor.min().item<double>();
+  auto q_params = quant_utils::ChooseQuantizationParams(
+      /*min=*/tensor_min,
+      /*max=*/tensor_max,
+      /*qmin=*/zero_point_min,
+      /*qmax=*/zero_point_max,
+      /*preserve_sparsity=*/false,
+      /*force_scale_power_of_two=*/false,
+      /*reduce_range=*/false);
+  return std::tuple<double, int64_t>(q_params.scale, q_params.zero_point);
+}
+
+} // namespace
+
+namespace {
+
 class VulkanAPITest : public ::testing::Test {
  public:
   void SetUp() {

From 825926e020f309e5184d582fb71b26b3cb490612 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@fb.com>
Date: Wed, 30 Nov 2022 10:37:16 -0800
Subject: [PATCH 1457/1922] [inductor] Disable parallel compilation inside
 fbcode (#89926)

Forking python processes using `multiprocessing` doesn't play nicely
with certain aspects of FB infra, so let's disable it until we find a better
solution.

Differential Revision: [D41618774](https://our.internmc.facebook.com/intern/diff/D41618774/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89926
Approved by: https://github.com/desertfire
---
 torch/_inductor/config.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index d4e3aa60aac22..ad4e10e394d3a 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -1,5 +1,6 @@
 import os
 import sys
+from functools import lru_cache
 
 # add some debug printouts
 debug = False
@@ -68,15 +69,25 @@
 
 comment_origin = False
 
+
+@lru_cache(1)
+def is_fbcode():
+    try:
+        import torch.fb  # noqa: F401
+    except ImportError:
+        return False
+    return True
+
+
 compile_threads = (
-    min(
+    1
+    if sys.platform == "win32" or is_fbcode()
+    else min(
         32,
         len(os.sched_getaffinity(0))
         if hasattr(os, "sched_getaffinity")
         else os.cpu_count(),
     )
-    if sys.platform != "win32"
-    else 1
 )
 
 # If kernel is fused, the name is generated from the origin node op names

From 5a2c2e71451be62d952e16a31fe16bc8c6742075 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Thu, 1 Dec 2022 03:30:27 +0000
Subject: [PATCH 1458/1922] [codemod][llvm15] LLVM-15 fixes for
 caffe2/test/cpp/jit/test_graph_executor.cpp (#89936)

Summary: This fixes issues which block `caffe2/test/cpp/jit/test_graph_executor.cpp` from compiling with LLVM-15.

Test Plan: Sandcastle

Reviewed By: meyering

Differential Revision: D41603459

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89936
Approved by: https://github.com/soumith
---
 test/cpp/jit/test_graph_executor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cpp/jit/test_graph_executor.cpp b/test/cpp/jit/test_graph_executor.cpp
index 6913e5f3ac2a8..acda804453f56 100644
--- a/test/cpp/jit/test_graph_executor.cpp
+++ b/test/cpp/jit/test_graph_executor.cpp
@@ -59,7 +59,7 @@ TEST(GraphExecutorTest, runAsync_executor) {
     mtx.lock();
     ++asyncCounter;
     mtx.unlock();
-    at::launch(move(f));
+    at::launch(std::move(f));
   };
   std::vector<IValue> stack;
   // NOLINTNEXTLINE(modernize-use-emplace)

From 4a40c043cfeb3ee402be2ad93fe7480ff7a906f1 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Thu, 1 Dec 2022 03:39:28 +0000
Subject: [PATCH 1459/1922] [codemod][llvm15] LLVM-15 fixes for
 caffe2/caffe2/video/video_decoder.h (#89940)

Summary: This fixes issues which block `caffe2/caffe2/video/video_decoder.h` from compiling with LLVM-15.

Test Plan: Sandcastle

Reviewed By: meyering

Differential Revision: D41603451

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89940
Approved by: https://github.com/soumith
---
 caffe2/video/video_decoder.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/caffe2/video/video_decoder.h b/caffe2/video/video_decoder.h
index a091142389d63..ba607fd8da3f0 100644
--- a/caffe2/video/video_decoder.h
+++ b/caffe2/video/video_decoder.h
@@ -508,11 +508,11 @@ class CallbackImpl : public Callback {
   }
 
   void frameDecoded(std::unique_ptr<DecodedFrame> frame) override {
-    frames.push_back(move(frame));
+    frames.push_back(std::move(frame));
   }
 
   void audioDecoded(std::unique_ptr<DecodedAudio> audio_sample) override {
-    audio_samples.push_back(move(audio_sample));
+    audio_samples.push_back(std::move(audio_sample));
   }
 
   void videoDecodingStarted(const VideoMeta& /*videoMeta*/) override {

From 61c2154c8d75ad3d8f4fa67439a188292d99f7fe Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Thu, 1 Dec 2022 03:46:19 +0000
Subject: [PATCH 1460/1922] [codemod][llvm15] LLVM-15 fixes for
 caffe2/caffe2/video/video_decoder.cc (#89937)

Summary: This fixes issues which block `caffe2/caffe2/video/video_decoder.cc` from compiling with LLVM-15.

Test Plan: Sandcastle

Reviewed By: meyering

Differential Revision: D41603386

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89937
Approved by: https://github.com/soumith
---
 caffe2/video/video_decoder.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/caffe2/video/video_decoder.cc b/caffe2/video/video_decoder.cc
index 8993241d39dc5..86bfbfa5ad2a0 100644
--- a/caffe2/video/video_decoder.cc
+++ b/caffe2/video/video_decoder.cc
@@ -606,7 +606,7 @@ void VideoDecoder::decodeLoop(
               unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
               frame->width_ = outWidth;
               frame->height_ = outHeight;
-              frame->data_ = move(buffer);
+              frame->data_ = std::move(buffer);
               frame->size_ = size;
               frame->index_ = frameIndex;
               frame->outputFrameIndex_ = outputFrameIndex;
@@ -735,10 +735,10 @@ bool DecodeMultipleClipsFromVideo(
   }
 
   for (auto& frame : callback.frames) {
-    sampledFrames.push_back(move(frame));
+    sampledFrames.push_back(std::move(frame));
   }
   for (auto& audio_sample : callback.audio_samples) {
-    sampledAudio.push_back(move(audio_sample));
+    sampledAudio.push_back(std::move(audio_sample));
   }
 
   for (int i = 0; i < buffer_rgb.size(); i++) {

From 5af9c02a55dd2b8458f434abecd50eeff6527dc7 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 1 Dec 2022 04:01:29 +0000
Subject: [PATCH 1461/1922] [vision hash update] update the pinned vision hash
 (#89749)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89749
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 6874c288beca3..120f58b23e09f 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-72686211e2a8b78e5a5dc8c28be34eb9cfcdad4c
+a718345a8d60c73a441f6254d6eae456c8a6d787

From f6c828d1550bdbe81f35c21bc8477ea778835eca Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Tue, 29 Nov 2022 14:01:39 +0800
Subject: [PATCH 1462/1922] add mixed data type support for LayerNorm (#81851)

1. If user uses amp to run bfloat16 models, `torch.autocast` will
keep module paramters in acc dtype which will leave `gamma` and`beta`
in float while input/output will be in bfloat16.

2. If user explicitly cast the model to bfloat16 such as:
```
  x = torch.randn(n, t, c).bfloat16()
  ln = nn.LayerNorm(c).bfloat16()
  y = ln(x)
```
The input/output and gamma/beta will all be in bfloat16.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/81851
Approved by: https://github.com/ezyang
---
 .../src/ATen/native/cpu/batch_norm_kernel.cpp |   9 --
 .../src/ATen/native/cpu/layer_norm_kernel.cpp | 115 ++++++++++++++++--
 aten/src/ATen/native/cpu/utils.h              |  10 ++
 aten/src/ATen/native/layer_norm.cpp           |  10 +-
 test/test_nn.py                               |  14 +++
 5 files changed, 134 insertions(+), 24 deletions(-)

diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp
index c00b764f08055..7c8b22210e238 100644
--- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp
@@ -789,15 +789,6 @@ void batch_norm_cpu_collect_stats_contiguous_impl<BFloat16>(
   }
 }
 
-static inline std::tuple<Vectorized<float>, Vectorized<float>> load2f(const BFloat16* ptr) {
-  return convert_bfloat16_float(Vectorized<BFloat16>::loadu(ptr));
-}
-
-static inline std::tuple<Vectorized<float>, Vectorized<float>> load2f(const float* ptr) {
-  using Vec = Vectorized<float>;
-  return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size()));
-}
-
 template <typename param_t>
 inline void batch_norm_cpu_collect_stats_channels_last_internal(
     Tensor& mean, Tensor& var_sum, const Tensor& input) {
diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
index d0cdf3ad60199..5fbbf2597529c 100644
--- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
+++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp
@@ -9,6 +9,7 @@
 #include <ATen/cpu/vec/functional.h>
 #include <ATen/cpu/vec/vec.h>
 #include <ATen/native/cpu/moments_utils.h>
+#include <ATen/native/cpu/mixed_data_type.h>
 #include <c10/util/irange.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -22,22 +23,18 @@ namespace native {
 
 namespace {
 
-template <typename T>
+template <typename T, typename T_ACC>
 void LayerNormKernelImplInternal(
     const Tensor& X,
     const Tensor& gamma,
     const Tensor& beta,
     int64_t M,
     int64_t N,
-    T eps,
+    T_ACC eps,
     Tensor* Y,
     Tensor* mean,
     Tensor* rstd) {
-  using T_ACC = vec::vec_scalar_t<T>;
-  using Vec = vec::Vectorized<T_ACC>;
-  TORCH_DCHECK_EQ(X.numel(), M * N);
-  DCHECK(!gamma.defined() || gamma.numel() == N);
-  DCHECK(!beta.defined() || beta.numel() == N);
+  using Vec = vec::Vectorized<T>;
   const T* X_data = X.data_ptr<T>();
   const T* gamma_data = gamma.defined() ? gamma.data_ptr<T>() : nullptr;
   const T* beta_data = beta.defined() ? beta.data_ptr<T>() : nullptr;
@@ -53,12 +50,12 @@ void LayerNormKernelImplInternal(
     for (const auto i : c10::irange(start, end)) {
       const T* X_ptr = X_data + i * N;
       T* Y_ptr = Y_data + i * N;
-      T_ACC mean_val;
-      T_ACC rstd_val;
+      T mean_val;
+      T rstd_val;
       std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, N);
       rstd_val = T(1) / std::sqrt(rstd_val + eps);
-      const T_ACC scale = rstd_val;
-      const T_ACC bias = -rstd_val * mean_val;
+      const T scale = rstd_val;
+      const T bias = -rstd_val * mean_val;
       if (gamma_null || beta_null) {
         for (const auto j : c10::irange(N)) {
           const T gamma_v = gamma_null ? T(1) : gamma_data[j];
@@ -86,6 +83,94 @@ void LayerNormKernelImplInternal(
   });
 }
 
+template <typename param_t>
+void layer_norm_kernel_mixed_type(
+    const Tensor& X,
+    const Tensor& gamma,
+    const Tensor& beta,
+    int64_t M,
+    int64_t N,
+    float eps,
+    Tensor* Y,
+    Tensor* mean,
+    Tensor* rstd) {
+  using bVec = Vectorized<BFloat16>;
+  using fVec = Vectorized<float>;
+  const BFloat16* X_data = X.data_ptr<BFloat16>();
+  const param_t* gamma_data = gamma.defined() ? gamma.data_ptr<param_t>() : nullptr;
+  const param_t* beta_data = beta.defined() ? beta.data_ptr<param_t>() : nullptr;
+  BFloat16* Y_data = Y->data_ptr<BFloat16>();
+  param_t* mean_data = mean ? mean->data_ptr<param_t>() : nullptr;
+  param_t* rstd_data = rstd ? rstd->data_ptr<param_t>() : nullptr;
+
+  const bool gamma_null = gamma_data == nullptr;
+  const bool beta_null = beta_data == nullptr;
+  const bool mean_null = mean_data == nullptr;
+  const bool rstd_null = rstd_data == nullptr;
+  at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) {
+    for (const auto i : c10::irange(start, end)) {
+      const BFloat16* X_ptr = X_data + i * N;
+      BFloat16* Y_ptr = Y_data + i * N;
+      float mean_val;
+      float rstd_val;
+      std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, N);
+      rstd_val = float(1) / std::sqrt(rstd_val + eps);
+      const float scale = rstd_val;
+      const float bias = -rstd_val * mean_val;
+      if (gamma_null || beta_null) {
+        for (const auto j : c10::irange(N)) {
+          const param_t gamma_v = gamma_null ? param_t(1) : gamma_data[j];
+          const param_t beta_v = beta_null ? param_t(0) : beta_data[j];
+          Y_ptr[j] = (X_ptr[j] * scale + bias) * gamma_v + beta_v;
+        }
+      } else {
+        int64_t d = 0;
+        for (; d < N - (N % bVec::size()); d += bVec::size()) {
+          bVec x_bvec = bVec::loadu(X_ptr + d);
+          fVec x_fvec0, x_fvec1;
+          std::tie(x_fvec0, x_fvec1) = convert_bfloat16_float(x_bvec);
+          fVec gamma_fvec0, gamma_fvec1;
+          std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data + d);
+          fVec beta_fvec0, beta_fvec1;
+          std::tie(beta_fvec0, beta_fvec1) = load2f(beta_data + d);
+          fVec y_fvec0 = (x_fvec0 * fVec(scale) + fVec(bias)) * gamma_fvec0 + beta_fvec0;
+          fVec y_fvec1 = (x_fvec1 * fVec(scale) + fVec(bias)) * gamma_fvec1 + beta_fvec1;
+          bVec y_bvec = convert_float_bfloat16(y_fvec0, y_fvec1);
+          y_bvec.store(Y_ptr + d);
+        }
+        for (; d < N; d++) {
+          Y_ptr[d] = (X_ptr[d] * scale + bias) * gamma_data[d] + beta_data[d];
+        }
+      }
+      if (!mean_null) {
+        mean_data[i] = mean_val;
+      }
+      if (!rstd_null) {
+        rstd_data[i] = rstd_val;
+      }
+    }
+  });
+}
+
+template <>
+void LayerNormKernelImplInternal<BFloat16, float>(
+    const Tensor& X,
+    const Tensor& gamma,
+    const Tensor& beta,
+    int64_t M,
+    int64_t N,
+    float eps,
+    Tensor* Y,
+    Tensor* mean,
+    Tensor* rstd) {
+  const bool mixed_type = is_mixed_type(X, gamma, beta);
+  if (mixed_type) {
+    layer_norm_kernel_mixed_type<float>(X, gamma, beta, M, N, eps, Y, mean, rstd);
+  } else {
+    layer_norm_kernel_mixed_type<BFloat16>(X, gamma, beta, M, N, eps, Y, mean, rstd);
+  }
+}
+
 void LayerNormKernelImpl(
     const Tensor& X,
     const Tensor& gamma,
@@ -96,10 +181,14 @@ void LayerNormKernelImpl(
     Tensor* Y,
     Tensor* mean,
     Tensor* rstd) {
+  TORCH_DCHECK_EQ(X.numel(), M * N);
+  DCHECK(!gamma.defined() || gamma.numel() == N);
+  DCHECK(!beta.defined() || beta.numel() == N);
   AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, X.scalar_type(),
       "LayerNormKernelImpl", [&]() {
-    LayerNormKernelImplInternal<scalar_t>(
-        X, gamma, beta, M, N, static_cast<scalar_t>(eps), Y, mean, rstd);
+    using acc_t = vec::vec_scalar_t<scalar_t>;
+    LayerNormKernelImplInternal<scalar_t, acc_t>(
+        X, gamma, beta, M, N, static_cast<acc_t>(eps), Y, mean, rstd);
   });
 }
 
diff --git a/aten/src/ATen/native/cpu/utils.h b/aten/src/ATen/native/cpu/utils.h
index 5c607f06b3a5a..1fd30475e9ff4 100644
--- a/aten/src/ATen/native/cpu/utils.h
+++ b/aten/src/ATen/native/cpu/utils.h
@@ -61,6 +61,16 @@ template <typename scalar_t> struct VectorizedType { using type = Vectorized<sca
 template <> struct VectorizedType<BFloat16> { using type = Vec2; };
 template <typename scalar_t> using VecType = typename VectorizedType<scalar_t>::type;
 
+// Helper for mixed data type parameter Vec::load
+inline std::tuple<Vectorized<float>, Vectorized<float>> load2f(const BFloat16* ptr) {
+  return convert_bfloat16_float(Vectorized<BFloat16>::loadu(ptr));
+}
+
+inline std::tuple<Vectorized<float>, Vectorized<float>> load2f(const float* ptr) {
+  using Vec = Vectorized<float>;
+  return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size()));
+}
+
 } // namespace
 
 namespace utils {
diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp
index 8269a4d3af9e1..37a3f1a750ab2 100644
--- a/aten/src/ATen/native/layer_norm.cpp
+++ b/aten/src/ATen/native/layer_norm.cpp
@@ -3,6 +3,7 @@
 
 #include <ATen/core/Tensor.h>
 #include <ATen/Parallel.h>
+#include <ATen/native/cpu/mixed_data_type.h>
 #include <c10/util/irange.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -78,6 +79,10 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_cpu(
   c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
   const Tensor& bias = *bias_maybe_owned;
 
+  bool mixed_type = is_mixed_type(input, weight, bias);
+  if (mixed_type) {
+    check_mixed_data_type(input, weight, bias);
+  }
 
   auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias);
   auto M = M_N.first;
@@ -93,8 +98,9 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_cpu(
       c10::nullopt /* device */,
       c10::nullopt /* pin_memory */,
       at::MemoryFormat::Contiguous);
-  Tensor mean = at::empty({M}, X->options());
-  Tensor rstd = at::empty({M}, X->options());
+  const auto dtype = param_scalar_type(input, mixed_type);
+  Tensor mean = at::empty({M}, X->options().dtype(dtype));
+  Tensor rstd = at::empty({M}, X->options().dtype(dtype));
 
   layer_norm_with_mean_rstd_out(Y, mean, rstd, *X, normalized_shape, *gamma, *beta, eps, M, N);
   return std::make_tuple(std::move(Y), std::move(mean), std::move(rstd));
diff --git a/test/test_nn.py b/test/test_nn.py
index a7a12054d579b..60fb0e6c0cff3 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -7672,6 +7672,17 @@ def _test_LayerNorm_cuda_half(self, device):
         output.sum().backward()
         self.assertEqualTypeString(output, input)
 
+    def _test_LayerNorm_cpu_mixed_dtype(self, device):
+        for elementwise_affine in [True, False]:
+            # layer norm input shape is normalized to m x n, cpu vectorized on n,
+            # so make sure n exceeds vector length
+            input = torch.empty(2, 3, 11, 3, device=device, dtype=torch.bfloat16).random_(1, 10)
+            m = nn.LayerNorm([11, 3], elementwise_affine=elementwise_affine).to(device, torch.bfloat16)
+            m2 = deepcopy(m).to(device, torch.float)
+            out = m(input)
+            out2 = m2(input)
+            self.assertEqual(out, out2)
+
     def _test_GroupNorm_general(self, device, dtype=torch.float):
         good_shape_g = {
             (1, 2, 3, 4): 2,
@@ -8086,6 +8097,9 @@ def test_LayerNorm_general(self, device):
         if self.device_type == 'cuda':
             self._test_LayerNorm_cuda_half(device)
 
+        if self.device_type == 'cpu':
+            self._test_LayerNorm_cpu_mixed_dtype(device)
+
     @onlyNativeDeviceTypes
     def test_LayerNorm_numeric(self, device):
         def layer_norm_ref(X, gamma, beta, normalized_shape, eps):

From 6baf4e9e615c5b8ea18e8e64d77823265163e782 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 30 Nov 2022 22:06:08 -0500
Subject: [PATCH 1463/1922] Add manual meta implementations to
 quantize_per_tensor.tensor and co (#89958)

When you are writing a meta function, you cannot call item() on the tensor because there is no real data on the tensor and it will fail. The error message was not very good in this case, see also https://github.com/pytorch/pytorch/issues/89959

This PR takes a brute force approach to resolving the problem: just manually define meta implementations for the naughty functions that are calling item(). However, this results in a lot of code duplication. The easiest way to avoid this situation is to rewrite the decomps so they don't call item. It should not be that difficult to use direct tensors on your operations, as scalar tensors can broadcast too.

I could only test this with `buck test @mode/opt -c python.package_style=inplace //executorch/backends/test:test_backends` in internal with D41555454. Test coverage needs to be improved, otherwise don't blame us when we break you.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89958
Approved by: https://github.com/jerryzh168
---
 torch/ao/quantization/fx/_decomposed.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/torch/ao/quantization/fx/_decomposed.py b/torch/ao/quantization/fx/_decomposed.py
index 0e020a15a826d..a6f5ad7a3d0b9 100644
--- a/torch/ao/quantization/fx/_decomposed.py
+++ b/torch/ao/quantization/fx/_decomposed.py
@@ -83,6 +83,14 @@ def quantize_per_tensor_tensor(
     assert scale.numel() == 1, f"Exepecting scale tensor to be one element, but received : {scale.numel()}"
     return quantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
 
+@impl(quantized_decomposed_lib, "quantize_per_tensor.tensor", "Meta")
+def quantize_per_tensor_tensor_meta(input, scale, zero_point, quant_min, quant_max, dtype):
+    assert zero_point.numel() == 1, f"Exepecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Exepecting scale tensor to be one element, but received : {scale.numel()}"
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    return torch.empty_like(input, dtype=dtype)
+
 # Note: quant_min/quant_max/dtype are not used in the operator, but for now it's kept in
 # the signature as metadata for the input Tensor, this might be useful for pattern
 # matching in the future
@@ -156,6 +164,16 @@ def dequantize_per_tensor_tensor(
     assert scale.numel() == 1, f"Exepecting scale tensor to be one element, but received : {scale.numel()}"
     return dequantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
 
+@impl(quantized_decomposed_lib, "dequantize_per_tensor.tensor", "Meta")
+def dequantize_per_tensor_tensor_meta(input, scale, zero_point, quant_min, quant_max, dtype):
+    assert zero_point.numel() == 1, f"Exepecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Exepecting scale tensor to be one element, but received : {scale.numel()}"
+    assert input.dtype == dtype, f"Expecting input to have dtype: {dtype}"
+    if dtype in [torch.uint8, torch.int8, torch.int32]:
+        return torch.empty_like(input, dtype=torch.float32)
+    else:
+        raise ValueError(f"Unsupported dtype in dequantize_per_tensor: {dtype}")
+
 
 quantized_decomposed_lib.define(
     "choose_qparams.tensor(Tensor input, int quant_min, int quant_max, "

From a51a3002b0c536379012c23daeefe7c06805f510 Mon Sep 17 00:00:00 2001
From: Yanli Zhao <yanlizhao@fb.com>
Date: Thu, 1 Dec 2022 06:34:10 +0000
Subject: [PATCH 1464/1922] ad more error info for cublasLtMatmul (#89983)

hit an error at 'cublasLtMatmul' when running bfloat16 for a complicate model, this error info will help debugging and also is  good for future error reporting
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89983
Approved by: https://github.com/ngimel
---
 aten/src/ATen/cuda/CUDABlas.cpp | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
index 866f53ee7f87f..648b55774f194 100644
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -741,7 +741,7 @@ void gemm_and_bias(
     TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED);
   }
 
-  TORCH_CUDABLAS_CHECK(cublasLtMatmul(
+  cublasStatus_t cublasStatus = cublasLtMatmul(
       ltHandle,
       computeDesc.descriptor(),
       &alpha_val,
@@ -757,7 +757,33 @@ void gemm_and_bias(
       &heuristicResult.algo,
       workspace.data_ptr(),
       workspaceSize,
-      at::cuda::getCurrentCUDAStream()));
+      at::cuda::getCurrentCUDAStream());
+  TORCH_CHECK(
+      cublasStatus == CUBLAS_STATUS_SUCCESS,
+      "CUDA error: ",
+      at::cuda::blas::_cublasGetErrorEnum(cublasStatus),
+      " when calling cublasLtMatmul with transpose_mat1 ",
+      transpose_mat1,
+      " transpose_mat2 ",
+      transpose_mat2,
+      " m ",
+      m,
+      " n ",
+      n,
+      " k ",
+      k,
+      " mat1_ld ",
+      mat1_ld,
+      " mat2_ld ",
+      mat2_ld,
+      " result_ld ",
+      result_ld,
+      " abcType ",
+      abcType,
+      " computeType ",
+      computeType,
+      " scaleType ",
+      scaleType);
 }
 
 template void gemm_and_bias(

From 80b8dbc4fc1212706c333011a78ab59d549370be Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Thu, 1 Dec 2022 09:28:02 +0100
Subject: [PATCH 1465/1922] enable ufmt for torch/testing/*.py (#89525)

I've tried to soft-enforce this manually already, albeit with a line length of 120. This just adds it to the CI. Note that this only applies to `torch/testing/*.py` and thus everything under `torch/testing/_internal/**/*` is *not* affected.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89525
Approved by: https://github.com/kit1980
---
 .lintrunner.toml             |   1 +
 torch/testing/__init__.py    |   2 +-
 torch/testing/_comparison.py | 268 +++++++++++++++++++++++++++--------
 torch/testing/_creation.py   |  50 +++++--
 torch/testing/_deprecated.py |  14 +-
 5 files changed, 256 insertions(+), 79 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index fa7e484fb3e18..e5f1956212913 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -779,6 +779,7 @@ include_patterns = [
     'torchgen/**/*.py',
     'functorch/functorch/_src/aot_autograd.py',
     'functorch/functorch/_src/compilers.py',
+    'torch/testing/*.py',
 ]
 command = [
     'python3',
diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py
index ad69ef1d24901..d437ed9e9727b 100644
--- a/torch/testing/__init__.py
+++ b/torch/testing/__init__.py
@@ -1,4 +1,4 @@
-from ._comparison import assert_close as assert_close
 from torch._C import FileCheck as FileCheck
+from ._comparison import assert_close as assert_close
 from ._creation import make_tensor as make_tensor
 from ._deprecated import *  # noqa: F403
diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py
index 6999986f52945..3cc729457cbdc 100644
--- a/torch/testing/_comparison.py
+++ b/torch/testing/_comparison.py
@@ -2,7 +2,18 @@
 import cmath
 import collections.abc
 import contextlib
-from typing import NoReturn, Callable, Sequence, List, Union, Optional, Type, Tuple, Any, Collection
+from typing import (
+    Any,
+    Callable,
+    Collection,
+    List,
+    NoReturn,
+    Optional,
+    Sequence,
+    Tuple,
+    Type,
+    Union,
+)
 
 import torch
 
@@ -17,7 +28,9 @@
 class ErrorMeta(Exception):
     """Internal testing exception that makes that carries error meta data."""
 
-    def __init__(self, type: Type[Exception], msg: str, *, id: Tuple[Any, ...] = ()) -> None:
+    def __init__(
+        self, type: Type[Exception], msg: str, *, id: Tuple[Any, ...] = ()
+    ) -> None:
         super().__init__(
             "If you are a user and see this message during normal operation "
             "please file an issue at https://github.com/pytorch/pytorch/issues. "
@@ -28,7 +41,9 @@ def __init__(self, type: Type[Exception], msg: str, *, id: Tuple[Any, ...] = ())
         self.msg = msg
         self.id = id
 
-    def to_error(self, msg: Optional[Union[str, Callable[[str], str]]] = None) -> Exception:
+    def to_error(
+        self, msg: Optional[Union[str, Callable[[str], str]]] = None
+    ) -> Exception:
         if not isinstance(msg, str):
             generated_msg = self.msg
             if self.id:
@@ -56,12 +71,20 @@ def to_error(self, msg: Optional[Union[str, Callable[[str], str]]] = None) -> Ex
 _DTYPE_PRECISIONS.update(
     {
         dtype: _DTYPE_PRECISIONS[torch.float32]
-        for dtype in (torch.quint8, torch.quint2x4, torch.quint4x2, torch.qint8, torch.qint32)
+        for dtype in (
+            torch.quint8,
+            torch.quint2x4,
+            torch.quint4x2,
+            torch.qint8,
+            torch.qint32,
+        )
     }
 )
 
 
-def default_tolerances(*inputs: Union[torch.Tensor, torch.dtype]) -> Tuple[float, float]:
+def default_tolerances(
+    *inputs: Union[torch.Tensor, torch.dtype]
+) -> Tuple[float, float]:
     """Returns the default absolute and relative testing tolerances for a set of inputs based on the dtype.
 
     See :func:`assert_close` for a table of the default tolerance for each dtype.
@@ -76,13 +99,18 @@ def default_tolerances(*inputs: Union[torch.Tensor, torch.dtype]) -> Tuple[float
         elif isinstance(input, torch.dtype):
             dtypes.append(input)
         else:
-            raise TypeError(f"Expected a torch.Tensor or a torch.dtype, but got {type(input)} instead.")
+            raise TypeError(
+                f"Expected a torch.Tensor or a torch.dtype, but got {type(input)} instead."
+            )
     rtols, atols = zip(*[_DTYPE_PRECISIONS.get(dtype, (0.0, 0.0)) for dtype in dtypes])
     return max(rtols), max(atols)
 
 
 def get_tolerances(
-    *inputs: Union[torch.Tensor, torch.dtype], rtol: Optional[float], atol: Optional[float], id: Tuple[Any, ...] = ()
+    *inputs: Union[torch.Tensor, torch.dtype],
+    rtol: Optional[float],
+    atol: Optional[float],
+    id: Tuple[Any, ...] = (),
 ) -> Tuple[float, float]:
     """Gets absolute and relative to be used for numeric comparisons.
 
@@ -141,7 +169,13 @@ def _make_mismatch_msg(
     """
     equality = rtol == 0 and atol == 0
 
-    def make_diff_msg(*, type: str, diff: float, idx: Optional[Union[int, Tuple[int, ...]]], tol: float) -> str:
+    def make_diff_msg(
+        *,
+        type: str,
+        diff: float,
+        idx: Optional[Union[int, Tuple[int, ...]]],
+        tol: float,
+    ) -> str:
         if idx is None:
             msg = f"{type.title()} difference: {diff}"
         else:
@@ -219,6 +253,7 @@ def make_tensor_mismatch_msg(
             as callable in which case it will be called by the default value to create the description at runtime.
             Defaults to "Tensor-likes".
     """
+
     def unravel_flat_index(flat_index: int) -> Tuple[int, ...]:
         if not mismatches.shape:
             return ()
@@ -332,7 +367,10 @@ def __repr__(self) -> str:
                 ("id", self.id),
                 ("actual", self.actual),
                 ("expected", self.expected),
-                *[(extra, getattr(self, extra)) if isinstance(extra, str) else extra for extra in self.extra_repr()],
+                *[
+                    (extra, getattr(self, extra)) if isinstance(extra, str) else extra
+                    for extra in self.extra_repr()
+                ],
             ]
         ]
         return "\n".join((head, *body, *tail))
@@ -357,7 +395,9 @@ def compare(self) -> None:
             ) from error
 
         if not equal:
-            raise self._make_error_meta(AssertionError, f"{self.actual} != {self.expected}")
+            raise self._make_error_meta(
+                AssertionError, f"{self.actual} != {self.expected}"
+            )
 
 
 class NonePair(Pair):
@@ -371,7 +411,9 @@ def __init__(self, actual: Any, expected: Any, **other_parameters: Any) -> None:
 
     def compare(self) -> None:
         if not (self.actual is None and self.expected is None):
-            raise self._make_error_meta(AssertionError, f"None mismatch: {self.actual} is not {self.expected}")
+            raise self._make_error_meta(
+                AssertionError, f"None mismatch: {self.actual} is not {self.expected}"
+            )
 
 
 class BooleanPair(Pair):
@@ -383,7 +425,14 @@ class BooleanPair(Pair):
 
     """
 
-    def __init__(self, actual: Any, expected: Any, *, id: Tuple[Any, ...], **other_parameters: Any) -> None:
+    def __init__(
+        self,
+        actual: Any,
+        expected: Any,
+        *,
+        id: Tuple[Any, ...],
+        **other_parameters: Any,
+    ) -> None:
         actual, expected = self._process_inputs(actual, expected, id=id)
         super().__init__(actual, expected, **other_parameters)
 
@@ -394,9 +443,13 @@ def _supported_types(self) -> Tuple[Type, ...]:
             cls.append(np.bool_)
         return tuple(cls)
 
-    def _process_inputs(self, actual: Any, expected: Any, *, id: Tuple[Any, ...]) -> Tuple[bool, bool]:
+    def _process_inputs(
+        self, actual: Any, expected: Any, *, id: Tuple[Any, ...]
+    ) -> Tuple[bool, bool]:
         self._check_inputs_isinstance(actual, expected, cls=self._supported_types)
-        actual, expected = [self._to_bool(bool_like, id=id) for bool_like in (actual, expected)]
+        actual, expected = [
+            self._to_bool(bool_like, id=id) for bool_like in (actual, expected)
+        ]
         return actual, expected
 
     def _to_bool(self, bool_like: Any, *, id: Tuple[Any, ...]) -> bool:
@@ -405,11 +458,16 @@ def _to_bool(self, bool_like: Any, *, id: Tuple[Any, ...]) -> bool:
         elif isinstance(bool_like, np.bool_):
             return bool_like.item()
         else:
-            raise ErrorMeta(TypeError, f"Unknown boolean type {type(bool_like)}.", id=id)
+            raise ErrorMeta(
+                TypeError, f"Unknown boolean type {type(bool_like)}.", id=id
+            )
 
     def compare(self) -> None:
         if self.actual is not self.expected:
-            raise self._make_error_meta(AssertionError, f"Booleans mismatch: {self.actual} is not {self.expected}")
+            raise self._make_error_meta(
+                AssertionError,
+                f"Booleans mismatch: {self.actual} is not {self.expected}",
+            )
 
 
 class NumberPair(Pair):
@@ -464,7 +522,10 @@ def __init__(
         super().__init__(actual, expected, id=id, **other_parameters)
 
         self.rtol, self.atol = get_tolerances(
-            *[self._TYPE_TO_DTYPE[type(input)] for input in (actual, expected)], rtol=rtol, atol=atol, id=id
+            *[self._TYPE_TO_DTYPE[type(input)] for input in (actual, expected)],
+            rtol=rtol,
+            atol=atol,
+            id=id,
         )
         self.equal_nan = equal_nan
         self.check_dtype = check_dtype
@@ -480,16 +541,22 @@ def _process_inputs(
         self, actual: Any, expected: Any, *, id: Tuple[Any, ...]
     ) -> Tuple[Union[int, float, complex], Union[int, float, complex]]:
         self._check_inputs_isinstance(actual, expected, cls=self._supported_types)
-        actual, expected = [self._to_number(number_like, id=id) for number_like in (actual, expected)]
+        actual, expected = [
+            self._to_number(number_like, id=id) for number_like in (actual, expected)
+        ]
         return actual, expected
 
-    def _to_number(self, number_like: Any, *, id: Tuple[Any, ...]) -> Union[int, float, complex]:
+    def _to_number(
+        self, number_like: Any, *, id: Tuple[Any, ...]
+    ) -> Union[int, float, complex]:
         if NUMPY_AVAILABLE and isinstance(number_like, np.number):
             return number_like.item()
         elif isinstance(number_like, self._NUMBER_TYPES):
             return number_like
         else:
-            raise ErrorMeta(TypeError, f"Unknown number type {type(number_like)}.", id=id)
+            raise ErrorMeta(
+                TypeError, f"Unknown number type {type(number_like)}.", id=id
+            )
 
     def compare(self) -> None:
         if self.check_dtype and type(self.actual) is not type(self.expected):
@@ -511,7 +578,10 @@ def compare(self) -> None:
             return
 
         raise self._make_error_meta(
-            AssertionError, make_scalar_mismatch_msg(self.actual, self.expected, rtol=self.rtol, atol=self.atol)
+            AssertionError,
+            make_scalar_mismatch_msg(
+                self.actual, self.expected, rtol=self.rtol, atol=self.atol
+            ),
         )
 
     def extra_repr(self) -> Sequence[str]:
@@ -565,10 +635,14 @@ def __init__(
         check_is_coalesced: bool = True,
         **other_parameters: Any,
     ):
-        actual, expected = self._process_inputs(actual, expected, id=id, allow_subclasses=allow_subclasses)
+        actual, expected = self._process_inputs(
+            actual, expected, id=id, allow_subclasses=allow_subclasses
+        )
         super().__init__(actual, expected, id=id, **other_parameters)
 
-        self.rtol, self.atol = get_tolerances(actual, expected, rtol=rtol, atol=atol, id=self.id)
+        self.rtol, self.atol = get_tolerances(
+            actual, expected, rtol=rtol, atol=atol, id=self.id
+        )
         self.equal_nan = equal_nan
         self.check_device = check_device
         self.check_dtype = check_dtype
@@ -579,7 +653,9 @@ def __init__(
     def _process_inputs(
         self, actual: Any, expected: Any, *, id: Tuple[Any, ...], allow_subclasses: bool
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        directly_related = isinstance(actual, type(expected)) or isinstance(expected, type(actual))
+        directly_related = isinstance(actual, type(expected)) or isinstance(
+            expected, type(actual)
+        )
         if not directly_related:
             raise UnsupportedInputs()
 
@@ -601,13 +677,17 @@ def _to_tensor(self, tensor_like: Any) -> torch.Tensor:
             raise UnsupportedInputs()
 
     def _check_supported(self, tensor: torch.Tensor, *, id: Tuple[Any, ...]) -> None:
-        if tensor.layout not in {torch.strided,
-                                 torch.sparse_coo,
-                                 torch.sparse_csr,
-                                 torch.sparse_csc,
-                                 torch.sparse_bsr,
-                                 torch.sparse_bsc}:
-            raise ErrorMeta(ValueError, f"Unsupported tensor layout {tensor.layout}", id=id)
+        if tensor.layout not in {
+            torch.strided,
+            torch.sparse_coo,
+            torch.sparse_csr,
+            torch.sparse_csc,
+            torch.sparse_bsr,
+            torch.sparse_bsc,
+        }:
+            raise ErrorMeta(
+                ValueError, f"Unsupported tensor layout {tensor.layout}", id=id
+            )
 
     def compare(self) -> None:
         actual, expected = self.actual, self.expected
@@ -642,7 +722,9 @@ def _compare_attributes(
         are optional and can be disabled through the corresponding ``check_*`` flag during construction of the pair.
         """
 
-        def raise_mismatch_error(attribute_name: str, actual_value: Any, expected_value: Any) -> NoReturn:
+        def raise_mismatch_error(
+            attribute_name: str, actual_value: Any, expected_value: Any
+        ) -> NoReturn:
             raise self._make_error_meta(
                 AssertionError,
                 f"The values for attribute '{attribute_name}' do not match: {actual_value} != {expected_value}.",
@@ -652,14 +734,20 @@ def raise_mismatch_error(attribute_name: str, actual_value: Any, expected_value:
             raise_mismatch_error("shape", actual.shape, expected.shape)
 
         if actual.is_quantized != expected.is_quantized:
-            raise_mismatch_error("is_quantized", actual.is_quantized, expected.is_quantized)
+            raise_mismatch_error(
+                "is_quantized", actual.is_quantized, expected.is_quantized
+            )
         elif actual.is_quantized and actual.qscheme() != expected.qscheme():
             raise_mismatch_error("qscheme()", actual.qscheme(), expected.qscheme())
 
         if actual.layout != expected.layout:
             if self.check_layout:
                 raise_mismatch_error("layout", actual.layout, expected.layout)
-        elif actual.layout == torch.strided and self.check_stride and actual.stride() != expected.stride():
+        elif (
+            actual.layout == torch.strided
+            and self.check_stride
+            and actual.stride() != expected.stride()
+        ):
             raise_mismatch_error("stride()", actual.stride(), expected.stride())
 
         if self.check_device and actual.device != expected.device:
@@ -668,7 +756,9 @@ def raise_mismatch_error(attribute_name: str, actual_value: Any, expected_value:
         if self.check_dtype and actual.dtype != expected.dtype:
             raise_mismatch_error("dtype", actual.dtype, expected.dtype)
 
-    def _equalize_attributes(self, actual: torch.Tensor, expected: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    def _equalize_attributes(
+        self, actual: torch.Tensor, expected: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Equalizes some attributes of two tensors for value comparison.
 
         If ``actual`` and ``expected`` are ...
@@ -704,7 +794,9 @@ def _equalize_attributes(self, actual: torch.Tensor, expected: torch.Tensor) ->
         if actual.layout != expected.layout:
             # These checks are needed, since Tensor.to_dense() fails on tensors that are already strided
             actual = actual.to_dense() if actual.layout != torch.strided else actual
-            expected = expected.to_dense() if expected.layout != torch.strided else expected
+            expected = (
+                expected.to_dense() if expected.layout != torch.strided else expected
+            )
 
         return actual, expected
 
@@ -713,15 +805,28 @@ def _compare_values(self, actual: torch.Tensor, expected: torch.Tensor) -> None:
             compare_fn = self._compare_quantized_values
         elif actual.is_sparse:
             compare_fn = self._compare_sparse_coo_values
-        elif actual.layout in {torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc}:
+        elif actual.layout in {
+            torch.sparse_csr,
+            torch.sparse_csc,
+            torch.sparse_bsr,
+            torch.sparse_bsc,
+        }:
             compare_fn = self._compare_sparse_compressed_values
         else:
             compare_fn = self._compare_regular_values_close
 
-        compare_fn(actual, expected, rtol=self.rtol, atol=self.atol, equal_nan=self.equal_nan)
+        compare_fn(
+            actual, expected, rtol=self.rtol, atol=self.atol, equal_nan=self.equal_nan
+        )
 
     def _compare_quantized_values(
-        self, actual: torch.Tensor, expected: torch.Tensor, *, rtol: float, atol: float, equal_nan: bool
+        self,
+        actual: torch.Tensor,
+        expected: torch.Tensor,
+        *,
+        rtol: float,
+        atol: float,
+        equal_nan: bool,
     ) -> None:
         """Compares quantized tensors by comparing the :meth:`~torch.Tensor.dequantize`'d variants for closeness.
 
@@ -741,7 +846,13 @@ def _compare_quantized_values(
         )
 
     def _compare_sparse_coo_values(
-        self, actual: torch.Tensor, expected: torch.Tensor, *, rtol: float, atol: float, equal_nan: bool
+        self,
+        actual: torch.Tensor,
+        expected: torch.Tensor,
+        *,
+        rtol: float,
+        atol: float,
+        equal_nan: bool,
     ) -> None:
         """Compares sparse COO tensors by comparing
 
@@ -783,7 +894,13 @@ def _compare_sparse_coo_values(
         )
 
     def _compare_sparse_compressed_values(
-        self, actual: torch.Tensor, expected: torch.Tensor, *, rtol: float, atol: float, equal_nan: bool
+        self,
+        actual: torch.Tensor,
+        expected: torch.Tensor,
+        *,
+        rtol: float,
+        atol: float,
+        equal_nan: bool,
     ) -> None:
         """Compares sparse compressed tensors by comparing
 
@@ -793,10 +910,26 @@ def _compare_sparse_compressed_values(
         - the values for closeness.
         """
         format_name, compressed_indices_method, plain_indices_method = {
-            torch.sparse_csr: ('CSR', torch.Tensor.crow_indices, torch.Tensor.col_indices),
-            torch.sparse_csc: ('CSC', torch.Tensor.ccol_indices, torch.Tensor.row_indices),
-            torch.sparse_bsr: ('BSR', torch.Tensor.crow_indices, torch.Tensor.col_indices),
-            torch.sparse_bsc: ('BSC', torch.Tensor.ccol_indices, torch.Tensor.row_indices),
+            torch.sparse_csr: (
+                "CSR",
+                torch.Tensor.crow_indices,
+                torch.Tensor.col_indices,
+            ),
+            torch.sparse_csc: (
+                "CSC",
+                torch.Tensor.ccol_indices,
+                torch.Tensor.row_indices,
+            ),
+            torch.sparse_bsr: (
+                "BSR",
+                torch.Tensor.crow_indices,
+                torch.Tensor.col_indices,
+            ),
+            torch.sparse_bsc: (
+                "BSC",
+                torch.Tensor.ccol_indices,
+                torch.Tensor.row_indices,
+            ),
         }[actual.layout]
 
         if actual._nnz() != expected._nnz():
@@ -828,15 +961,17 @@ def _compare_sparse_compressed_values(
         )
 
     def _compare_regular_values_equal(
-            self,
-            actual: torch.Tensor,
-            expected: torch.Tensor,
-            *,
-            equal_nan: bool = False,
-            identifier: Optional[Union[str, Callable[[str], str]]] = None,
+        self,
+        actual: torch.Tensor,
+        expected: torch.Tensor,
+        *,
+        equal_nan: bool = False,
+        identifier: Optional[Union[str, Callable[[str], str]]] = None,
     ) -> None:
         """Checks if the values of two tensors are equal."""
-        self._compare_regular_values_close(actual, expected, rtol=0, atol=0, equal_nan=equal_nan, identifier=identifier)
+        self._compare_regular_values_close(
+            actual, expected, rtol=0, atol=0, equal_nan=equal_nan, identifier=identifier
+        )
 
     def _compare_regular_values_close(
         self,
@@ -850,14 +985,24 @@ def _compare_regular_values_close(
     ) -> None:
         """Checks if the values of two tensors are close up to a desired tolerance."""
         actual, expected = self._promote_for_comparison(actual, expected)
-        matches = torch.isclose(actual, expected, rtol=rtol, atol=atol, equal_nan=equal_nan)
+        matches = torch.isclose(
+            actual, expected, rtol=rtol, atol=atol, equal_nan=equal_nan
+        )
         if torch.all(matches):
             return
 
         if actual.shape == torch.Size([]):
-            msg = make_scalar_mismatch_msg(actual.item(), expected.item(), rtol=rtol, atol=atol, identifier=identifier)
+            msg = make_scalar_mismatch_msg(
+                actual.item(),
+                expected.item(),
+                rtol=rtol,
+                atol=atol,
+                identifier=identifier,
+            )
         else:
-            msg = make_tensor_mismatch_msg(actual, expected, ~matches, rtol=rtol, atol=atol, identifier=identifier)
+            msg = make_tensor_mismatch_msg(
+                actual, expected, ~matches, rtol=rtol, atol=atol, identifier=identifier
+            )
         raise self._make_error_meta(AssertionError, msg)
 
     def _promote_for_comparison(
@@ -927,15 +1072,16 @@ def originate_pairs(
     Returns:
         (List[Pair]): Originated pairs.
     """
-    if (
-        isinstance(actual, torch.TypedStorage)
-        and isinstance(expected, torch.TypedStorage)
+    if isinstance(actual, torch.TypedStorage) and isinstance(
+        expected, torch.TypedStorage
     ):
         actual_len = actual._size()
         expected_len = expected._size()
         if actual_len != expected_len:
             raise ErrorMeta(
-                AssertionError, f"The length of the sequences mismatch: {actual_len} != {expected_len}", id=id
+                AssertionError,
+                f"The length of the sequences mismatch: {actual_len} != {expected_len}",
+                id=id,
             )
 
         pairs = []
@@ -964,7 +1110,9 @@ def originate_pairs(
         expected_len = len(expected)
         if actual_len != expected_len:
             raise ErrorMeta(
-                AssertionError, f"The length of the sequences mismatch: {actual_len} != {expected_len}", id=id
+                AssertionError,
+                f"The length of the sequences mismatch: {actual_len} != {expected_len}",
+                id=id,
             )
 
         pairs = []
diff --git a/torch/testing/_creation.py b/torch/testing/_creation.py
index 33b9739a7f360..efdd44d3d9d23 100644
--- a/torch/testing/_creation.py
+++ b/torch/testing/_creation.py
@@ -2,16 +2,21 @@
 This module contains tensor creation utilities.
 """
 
-import torch
-from typing import Optional, List, Tuple, Union, cast
-import math
 import collections.abc
+import math
+from typing import cast, List, Optional, Tuple, Union
+
+import torch
 
 # Used by make_tensor for generating complex tensor.
-complex_to_corresponding_float_type_map = {torch.complex32: torch.float16,
-                                           torch.complex64: torch.float32,
-                                           torch.complex128: torch.float64}
-float_to_corresponding_complex_type_map = {v: k for k, v in complex_to_corresponding_float_type_map.items()}
+complex_to_corresponding_float_type_map = {
+    torch.complex32: torch.float16,
+    torch.complex64: torch.float32,
+    torch.complex128: torch.float64,
+}
+float_to_corresponding_complex_type_map = {
+    v: k for k, v in complex_to_corresponding_float_type_map.items()
+}
 
 
 def _uniform_random(t: torch.Tensor, low: float, high: float):
@@ -95,10 +100,12 @@ def make_tensor(
         tensor([[False, False],
                 [False, True]], device='cuda:0')
     """
+
     def _modify_low_high(low, high, lowest, highest, default_low, default_high, dtype):
         """
         Modifies (and raises ValueError when appropriate) low and high values given by the user (input_low, input_high) if required.
         """
+
         def clamp(a, l, h):
             return min(max(a, l), h)
 
@@ -133,27 +140,36 @@ def clamp(a, l, h):
         result = torch.randint(0, 2, shape, device=device, dtype=dtype)  # type: ignore[call-overload]
     elif dtype is torch.uint8:
         ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max)
-        low, high = cast(Tuple[int, int], _modify_low_high(low, high, ranges[0], ranges[1], 0, 10, dtype))
-        result = torch.randint(low, high, shape, device=device, dtype=dtype)   # type: ignore[call-overload]
+        low, high = cast(
+            Tuple[int, int],
+            _modify_low_high(low, high, ranges[0], ranges[1], 0, 10, dtype),
+        )
+        result = torch.randint(low, high, shape, device=device, dtype=dtype)  # type: ignore[call-overload]
     elif dtype in _integral_types:
         ranges = (torch.iinfo(dtype).min, torch.iinfo(dtype).max)
         low, high = _modify_low_high(low, high, ranges[0], ranges[1], -9, 10, dtype)
         result = torch.randint(low, high, shape, device=device, dtype=dtype)  # type: ignore[call-overload]
     elif dtype in _floating_types:
         ranges_floats = (torch.finfo(dtype).min, torch.finfo(dtype).max)
-        m_low, m_high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype)
+        m_low, m_high = _modify_low_high(
+            low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype
+        )
         result = torch.empty(shape, device=device, dtype=dtype)
         _uniform_random(result, m_low, m_high)
     elif dtype in _complex_types:
         float_dtype = complex_to_corresponding_float_type_map[dtype]
         ranges_floats = (torch.finfo(float_dtype).min, torch.finfo(float_dtype).max)
-        m_low, m_high = _modify_low_high(low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype)
+        m_low, m_high = _modify_low_high(
+            low, high, ranges_floats[0], ranges_floats[1], -9, 9, dtype
+        )
         result = torch.empty(shape, device=device, dtype=dtype)
         result_real = torch.view_as_real(result)
         _uniform_random(result_real, m_low, m_high)
     else:
-        raise TypeError(f"The requested dtype '{dtype}' is not supported by torch.testing.make_tensor()."
-                        " To request support, file an issue at: https://github.com/pytorch/pytorch/issues")
+        raise TypeError(
+            f"The requested dtype '{dtype}' is not supported by torch.testing.make_tensor()."
+            " To request support, file an issue at: https://github.com/pytorch/pytorch/issues"
+        )
 
     assert not (noncontiguous and memory_format is not None)
     if noncontiguous and result.numel() > 1:
@@ -166,10 +182,14 @@ def clamp(a, l, h):
         if dtype in _integral_types or dtype is torch.bool:
             replace_with = torch.tensor(1, device=device, dtype=dtype)
         elif dtype in _floating_types:
-            replace_with = torch.tensor(torch.finfo(dtype).tiny, device=device, dtype=dtype)
+            replace_with = torch.tensor(
+                torch.finfo(dtype).tiny, device=device, dtype=dtype
+            )
         else:  # dtype in _complex_types:
             float_dtype = complex_to_corresponding_float_type_map[dtype]
-            float_eps = torch.tensor(torch.finfo(float_dtype).tiny, device=device, dtype=float_dtype)
+            float_eps = torch.tensor(
+                torch.finfo(float_dtype).tiny, device=device, dtype=float_dtype
+            )
             replace_with = torch.complex(float_eps, float_eps)
         result[result == 0] = replace_with
 
diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
index 731158ddb41ee..a9ef0c58cb9dc 100644
--- a/torch/testing/_deprecated.py
+++ b/torch/testing/_deprecated.py
@@ -13,7 +13,9 @@
 __all__ = ["assert_allclose"]
 
 
-def warn_deprecated(instructions: Union[str, Callable[[str, Tuple[Any, ...], Dict[str, Any], Any], str]]) -> Callable:
+def warn_deprecated(
+    instructions: Union[str, Callable[[str, Tuple[Any, ...], Dict[str, Any], Any], str]]
+) -> Callable:
     def outer_wrapper(fn: Callable) -> Callable:
         name = fn.__name__
         head = f"torch.testing.{name}() is deprecated since 1.12 and will be removed in 1.14. "
@@ -21,7 +23,11 @@ def outer_wrapper(fn: Callable) -> Callable:
         @functools.wraps(fn)
         def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
             return_value = fn(*args, **kwargs)
-            tail = instructions(name, args, kwargs, return_value) if callable(instructions) else instructions
+            tail = (
+                instructions(name, args, kwargs, return_value)
+                if callable(instructions)
+                else instructions
+            )
             msg = (head + tail).strip()
             warnings.warn(msg, FutureWarning)
             return return_value
@@ -38,7 +44,9 @@ def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
 }
 
 
-def _get_default_rtol_and_atol(actual: torch.Tensor, expected: torch.Tensor) -> Tuple[float, float]:
+def _get_default_rtol_and_atol(
+    actual: torch.Tensor, expected: torch.Tensor
+) -> Tuple[float, float]:
     actual_rtol, actual_atol = _DTYPE_PRECISIONS.get(actual.dtype, (0.0, 0.0))
     expected_rtol, expected_atol = _DTYPE_PRECISIONS.get(expected.dtype, (0.0, 0.0))
     return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol)

From afcaa26df11162692319d3ae0f60db94f8c25de9 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Thu, 1 Dec 2022 09:28:02 +0100
Subject: [PATCH 1466/1922] document torch.testing.assert_allclose (#89526)

After our failed attempt to remove `assert_allclose` in #87974, we decided to add it to the documentation after all. Although we drop the expected removal date, the function continues to be deprecated in favor of `assert_close`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89526
Approved by: https://github.com/mruberry
---
 docs/source/testing.rst      |  1 +
 torch/testing/__init__.py    |  3 +-
 torch/testing/_comparison.py | 60 ++++++++++++++++++++++++-
 torch/testing/_deprecated.py | 85 ------------------------------------
 4 files changed, 60 insertions(+), 89 deletions(-)
 delete mode 100644 torch/testing/_deprecated.py

diff --git a/docs/source/testing.rst b/docs/source/testing.rst
index 122aa651b9579..8837c4a0ec1a7 100644
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -6,3 +6,4 @@ torch.testing
 
 .. autofunction:: assert_close
 .. autofunction:: make_tensor
+.. autofunction:: assert_allclose
diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py
index d437ed9e9727b..58b8f828e3546 100644
--- a/torch/testing/__init__.py
+++ b/torch/testing/__init__.py
@@ -1,4 +1,3 @@
 from torch._C import FileCheck as FileCheck
-from ._comparison import assert_close as assert_close
+from ._comparison import assert_allclose, assert_close as assert_close
 from ._creation import make_tensor as make_tensor
-from ._deprecated import *  # noqa: F403
diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py
index 3cc729457cbdc..71824c9815f84 100644
--- a/torch/testing/_comparison.py
+++ b/torch/testing/_comparison.py
@@ -2,10 +2,12 @@
 import cmath
 import collections.abc
 import contextlib
+import warnings
 from typing import (
     Any,
     Callable,
     Collection,
+    Dict,
     List,
     NoReturn,
     Optional,
@@ -83,7 +85,8 @@ def to_error(
 
 
 def default_tolerances(
-    *inputs: Union[torch.Tensor, torch.dtype]
+    *inputs: Union[torch.Tensor, torch.dtype],
+    dtype_precisions: Optional[Dict[torch.dtype, Tuple[float, float]]] = None,
 ) -> Tuple[float, float]:
     """Returns the default absolute and relative testing tolerances for a set of inputs based on the dtype.
 
@@ -102,7 +105,8 @@ def default_tolerances(
             raise TypeError(
                 f"Expected a torch.Tensor or a torch.dtype, but got {type(input)} instead."
             )
-    rtols, atols = zip(*[_DTYPE_PRECISIONS.get(dtype, (0.0, 0.0)) for dtype in dtypes])
+    dtype_precisions = dtype_precisions or _DTYPE_PRECISIONS
+    rtols, atols = zip(*[dtype_precisions.get(dtype, (0.0, 0.0)) for dtype in dtypes])
     return max(rtols), max(atols)
 
 
@@ -1531,3 +1535,55 @@ def assert_close(
         check_stride=check_stride,
         msg=msg,
     )
+
+
+def assert_allclose(
+    actual: Any,
+    expected: Any,
+    rtol: Optional[float] = None,
+    atol: Optional[float] = None,
+    equal_nan: bool = True,
+    msg: str = "",
+) -> None:
+    """
+    .. warning::
+
+       :func:`torch.testing.assert_allclose` is deprecated since ``1.12`` and will be removed in a future release.
+       Please use :func:`torch.testing.assert_close` instead. You can find detailed upgrade instructions
+       `here <https://github.com/pytorch/pytorch/issues/61844>`_.
+    """
+    warnings.warn(
+        "`torch.testing.assert_allclose()` is deprecated since 1.12 and will be removed in a future release. "
+        "Please use `torch.testing.assert_close()` instead. "
+        "You can find detailed upgrade instructions in https://github.com/pytorch/pytorch/issues/61844.",
+        FutureWarning,
+        stacklevel=2,
+    )
+
+    if not isinstance(actual, torch.Tensor):
+        actual = torch.tensor(actual)
+    if not isinstance(expected, torch.Tensor):
+        expected = torch.tensor(expected, dtype=actual.dtype)
+
+    if rtol is None and atol is None:
+        rtol, atol = default_tolerances(
+            actual,
+            expected,
+            dtype_precisions={
+                torch.float16: (1e-3, 1e-3),
+                torch.float32: (1e-4, 1e-5),
+                torch.float64: (1e-5, 1e-8),
+            },
+        )
+
+    torch.testing.assert_close(
+        actual,
+        expected,
+        rtol=rtol,
+        atol=atol,
+        equal_nan=equal_nan,
+        check_device=True,
+        check_dtype=False,
+        check_stride=False,
+        msg=msg or None,
+    )
diff --git a/torch/testing/_deprecated.py b/torch/testing/_deprecated.py
deleted file mode 100644
index a9ef0c58cb9dc..0000000000000
--- a/torch/testing/_deprecated.py
+++ /dev/null
@@ -1,85 +0,0 @@
-"""This module exists since the `torch.testing` exposed a lot of stuff that shouldn't have been public. Although this
-was never documented anywhere, some other internal FB projects as well as downstream OSS projects might use this. Thus,
-we don't internalize without warning, but still go through a deprecation cycle.
-"""
-
-import functools
-import warnings
-from typing import Any, Callable, Dict, Optional, Tuple, Union
-
-import torch
-
-
-__all__ = ["assert_allclose"]
-
-
-def warn_deprecated(
-    instructions: Union[str, Callable[[str, Tuple[Any, ...], Dict[str, Any], Any], str]]
-) -> Callable:
-    def outer_wrapper(fn: Callable) -> Callable:
-        name = fn.__name__
-        head = f"torch.testing.{name}() is deprecated since 1.12 and will be removed in 1.14. "
-
-        @functools.wraps(fn)
-        def inner_wrapper(*args: Any, **kwargs: Any) -> Any:
-            return_value = fn(*args, **kwargs)
-            tail = (
-                instructions(name, args, kwargs, return_value)
-                if callable(instructions)
-                else instructions
-            )
-            msg = (head + tail).strip()
-            warnings.warn(msg, FutureWarning)
-            return return_value
-
-        return inner_wrapper
-
-    return outer_wrapper
-
-
-_DTYPE_PRECISIONS = {
-    torch.float16: (1e-3, 1e-3),
-    torch.float32: (1e-4, 1e-5),
-    torch.float64: (1e-5, 1e-8),
-}
-
-
-def _get_default_rtol_and_atol(
-    actual: torch.Tensor, expected: torch.Tensor
-) -> Tuple[float, float]:
-    actual_rtol, actual_atol = _DTYPE_PRECISIONS.get(actual.dtype, (0.0, 0.0))
-    expected_rtol, expected_atol = _DTYPE_PRECISIONS.get(expected.dtype, (0.0, 0.0))
-    return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol)
-
-
-@warn_deprecated(
-    "Use torch.testing.assert_close() instead. "
-    "For detailed upgrade instructions see https://github.com/pytorch/pytorch/issues/61844."
-)
-def assert_allclose(
-    actual: Any,
-    expected: Any,
-    rtol: Optional[float] = None,
-    atol: Optional[float] = None,
-    equal_nan: bool = True,
-    msg: str = "",
-) -> None:
-    if not isinstance(actual, torch.Tensor):
-        actual = torch.tensor(actual)
-    if not isinstance(expected, torch.Tensor):
-        expected = torch.tensor(expected, dtype=actual.dtype)
-
-    if rtol is None and atol is None:
-        rtol, atol = _get_default_rtol_and_atol(actual, expected)
-
-    torch.testing.assert_close(
-        actual,
-        expected,
-        rtol=rtol,
-        atol=atol,
-        equal_nan=equal_nan,
-        check_device=True,
-        check_dtype=False,
-        check_stride=False,
-        msg=msg or None,
-    )

From 3a68091eeb6030e0e359f24f872f17cdb403b2f5 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Thu, 1 Dec 2022 09:28:03 +0100
Subject: [PATCH 1467/1922] fix assert_close docstring (#89620)

Two improvements here:

1. To render a bullet list correctly, a blank line before and after is needed. Compare

    ![Screenshot from 2022-11-24 09-34-10](https://user-images.githubusercontent.com/6849766/203732792-18071831-c7d9-4138-9002-e67e29f342fa.png)

    vs.

    ![Screenshot from 2022-11-24 09-34-52](https://user-images.githubusercontent.com/6849766/203732806-1ded7a4b-ca30-46c8-89a2-5c83ea33dbe7.png)

2. #72508 added proper support for meta tensors. Thus, we no longer throw an error if we encounter them.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89620
Approved by: https://github.com/kit1980
---
 torch/testing/_comparison.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py
index 71824c9815f84..664aadd7e75e0 100644
--- a/torch/testing/_comparison.py
+++ b/torch/testing/_comparison.py
@@ -1296,10 +1296,12 @@ def assert_close(
     only considered equal to each other if ``equal_nan`` is ``True``.
 
     In addition, they are only considered close if they have the same
+
     - :attr:`~torch.Tensor.device` (if ``check_device`` is ``True``),
     - ``dtype`` (if ``check_dtype`` is ``True``),
     - ``layout`` (if ``check_layout`` is ``True``), and
     - stride (if ``check_stride`` is ``True``).
+
     If either ``actual`` or ``expected`` is a meta tensor, only the attribute checks will be performed.
 
     If ``actual`` and ``expected`` are sparse (either having COO, CSR, CSC, BSR, or BSC layout), their strided members are
@@ -1350,8 +1352,6 @@ def assert_close(
     Raises:
         ValueError: If no :class:`torch.Tensor` can be constructed from an input.
         ValueError: If only ``rtol`` or ``atol`` is specified.
-        NotImplementedError: If a tensor is a meta tensor. This is a temporary restriction and will be relaxed in the
-            future.
         AssertionError: If corresponding inputs are not Python scalars and are not directly related.
         AssertionError: If ``allow_subclasses`` is ``False``, but corresponding inputs are not Python scalars and have
             different types.

From cbf262c130975484f5e47a25ed674dbf26d065e8 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Thu, 1 Dec 2022 09:28:03 +0100
Subject: [PATCH 1468/1922] remove torch.equal usages (#89527)

Preparation for the next PR in this stack: #89559.

I replaced

- `self.assertTrue(torch.equal(...))` with `self.assertEqual(..., rtol=0, atol=0, exact_device=True)`,
- the same for `self.assertFalse(...)` with `self.assertNotEqual(...)`, and
- `assert torch.equal(...)` with `torch.testing.assert_close(..., rtol=0, atol=0)` (note that we don't need to set `check_device=True` here since that is the default).

There were a few instances where the result of `torch.equal` is used directly. In that cases I've replaced with `(... == ...).all().item()` while sometimes also dropping the `.item()` depending on the context.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89527
Approved by: https://github.com/mruberry
---
 docs/source/nested.rst                        |  2 +-
 .../sharded_tensor/ops/test_binary_cmp.py     |  2 +-
 .../sharded_tensor/ops/test_tensor_ops.py     |  4 +-
 .../sharded_tensor/test_sharded_tensor.py     |  8 ++--
 .../checkpoint/test_file_system_checkpoint.py | 20 +++++---
 .../test_file_system_checkpoint_cpu.py        | 20 +++++---
 .../fsdp/test_fsdp_clip_grad_norm.py          |  6 +--
 test/distributed/fsdp/test_fsdp_misc.py       |  2 +-
 .../fsdp/test_fsdp_summon_full_params.py      |  4 +-
 test/distributed/pipeline/sync/test_pipe.py   |  2 +-
 test/fx/test_dce_pass.py                      |  4 +-
 test/fx/test_fx_const_fold.py                 | 48 +++++++++----------
 test/jit/test_save_load.py                    |  4 +-
 test/nn/test_lazy_modules.py                  |  4 +-
 .../core/experimental/test_fake_quantize.py   |  8 ++--
 .../core/experimental/test_linear.py          |  4 +-
 .../experimental/test_quantized_tensor.py     |  2 +-
 .../core/experimental/test_quantizer.py       |  8 ++--
 test/quantization/core/test_quantized_op.py   |  4 +-
 .../core/test_quantized_tensor.py             |  6 +--
 .../quantization/core/test_workflow_module.py |  8 ++--
 test/quantization/fx/test_quantize_fx.py      |  8 ++--
 test/test_autocast.py                         |  2 +-
 test/test_autograd.py                         |  4 +-
 test/test_cuda.py                             | 10 ++--
 test/test_jit.py                              | 12 ++---
 test/test_mps.py                              |  4 +-
 test/test_namedtensor.py                      | 46 +++++++++---------
 test/test_nn.py                               | 10 ++--
 test/test_serialization.py                    | 20 ++++----
 test/test_tensorexpr.py                       |  2 +-
 .../_shard/sharded_tensor/_ops/binary_cmp.py  |  2 +-
 .../tensor/parallel/multihead_attention_tp.py |  7 +--
 torch/onnx/symbolic_opset9.py                 |  2 +-
 .../_internal/distributed/distributed_test.py | 13 ++---
 .../distributed/nn/api/remote_module_test.py  |  4 +-
 .../distributed/rpc/dist_autograd_test.py     |  2 +-
 .../_internal/distributed/rpc/rpc_test.py     |  5 +-
 38 files changed, 169 insertions(+), 154 deletions(-)

diff --git a/docs/source/nested.rst b/docs/source/nested.rst
index ac07f8acb5a23..1a4f825da1f1e 100644
--- a/docs/source/nested.rst
+++ b/docs/source/nested.rst
@@ -116,7 +116,7 @@ If all dimensions are regular, the NestedTensor is intended to be semantically i
 torch.Size([2, 20, 128])
 >>> torch.stack([a, a]).size()
 torch.Size([2, 20, 128])
->>> torch.equal(torch.stack(nt.unbind()), torch.stack([a, a]))
+>>> (torch.stack(nt.unbind()) == torch.stack([a, a])).all().item()
 True
 
 In the future we might make it easier to detect this condition and convert seamlessly.
diff --git a/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py b/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py
index 33fc49f81c0f7..1c496e5603904 100644
--- a/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py
+++ b/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py
@@ -132,7 +132,7 @@ def test_torch_equal(self):
 
         spec, alt_spec = self.get_gpu_specs()
         st1, st2 = self.get_random_tensors(spec, spec, 10, 10)
-        self.assertTrue(torch.equal(st1, st2))
+        self.assertEqual(st1, st2, rtol=0, atol=0, exact_device=True)
 
     @with_comms
     @skip_if_lt_x_gpu(4)
diff --git a/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py
index 977fa701b44e0..322f3a3572b82 100644
--- a/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py
+++ b/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py
@@ -58,9 +58,9 @@ def test_inplace_copy(self):
         )
         st = sharded_tensor.rand(spec, (12, 5))
         ones_st = sharded_tensor.ones(spec, (12, 5))
-        self.assertFalse(torch.equal(ones_st, st))
+        self.assertNotEqual(ones_st, st, rtol=0, atol=0, exact_device=True)
         st.copy_(ones_st)
-        self.assertTrue(torch.equal(st, ones_st))
+        self.assertEqual(st, ones_st, rtol=0, atol=0, exact_device=True)
 
         # no grad inplace_copy should work between two with different requires_grad
         st_with_grad = sharded_tensor.rand(spec, (12, 5), requires_grad=True)
diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
index 5c548db8324dc..351a27b5e6b10 100644
--- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
+++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
@@ -1125,8 +1125,8 @@ def test_state_dict(self):
         self.assertTrue("sharded_tensor1" in loaded_dict_keys)
         self.assertTrue("submodule.sharded_tensor2" in loaded_dict_keys)
         # Verify after load.
-        self.assertTrue(torch.equal(m.sharded_tensor1, module_load.sharded_tensor1))
-        self.assertTrue(torch.equal(m.submodule.sharded_tensor2, module_load.submodule.sharded_tensor2))
+        self.assertEqual(m.sharded_tensor1, module_load.sharded_tensor1, rtol=0, atol=0, exact_device=True)
+        self.assertEqual(m.submodule.sharded_tensor2, module_load.submodule.sharded_tensor2, rtol=0, atol=0, exact_device=True)
 
     @with_comms
     @skip_if_lt_x_gpu(4)
@@ -1161,8 +1161,8 @@ def test_state_dict_new_group(self):
             module_load.load_state_dict(state_dict_deser, strict=False)
 
         # Verify after load.
-        self.assertTrue(torch.equal(m.sharded_tensor1, module_load.sharded_tensor1))
-        self.assertTrue(torch.equal(m.submodule.sharded_tensor2, module_load.submodule.sharded_tensor2))
+        self.assertEqual(m.sharded_tensor1, module_load.sharded_tensor1, rtol=0, atol=0, exact_device=True)
+        self.assertEqual(m.submodule.sharded_tensor2, module_load.submodule.sharded_tensor2, rtol=0, atol=0, exact_device=True)
 
     @with_comms
     @skip_if_lt_x_gpu(4)
diff --git a/test/distributed/checkpoint/test_file_system_checkpoint.py b/test/distributed/checkpoint/test_file_system_checkpoint.py
index 016467144e8ff..91d9609540340 100644
--- a/test/distributed/checkpoint/test_file_system_checkpoint.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint.py
@@ -73,14 +73,22 @@ def assert_state_dict_equal(
             for local_shard_1, local_shard_2 in zip(
                 value_1.local_shards(), value_2.local_shards()
             ):
-                self.assertTrue(
-                    torch.equal(local_shard_1.tensor, local_shard_1.tensor),
-                    f"Key {key}'s shard does not match",
+                self.assertEqual(
+                    local_shard_1.tensor,
+                    local_shard_1.tensor,
+                    rtol=0,
+                    atol=0,
+                    exact_device=True,
+                    msg=f"Key {key}'s shard does not match"
                 )
         elif isinstance(value_1, torch.Tensor):
-            self.assertTrue(
-                torch.equal(value_1, value_2),
-                f"Key {key}'s tensor does not match",
+            self.assertEqual(
+                value_1,
+                value_2,
+                rtol=0,
+                atol=0,
+                exact_device=True,
+                msg=f"Key {key}'s tensor does not match"
             )
 
     return True
diff --git a/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
index 52e414545c049..16cb348cef7e5 100644
--- a/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
@@ -75,14 +75,22 @@ def assert_state_dict_equal(
             for local_shard_1, local_shard_2 in zip(
                 value_1.local_shards(), value_2.local_shards()
             ):
-                self.assertTrue(
-                    torch.equal(local_shard_1.tensor, local_shard_1.tensor),
-                    f"Key {key}'s shard does not match",
+                self.assertEqual(
+                    local_shard_1.tensor,
+                    local_shard_1.tensor,
+                    rtol=0,
+                    atol=0,
+                    exact_device=True,
+                    msg=f"Key {key}'s shard does not match",
                 )
         elif isinstance(value_1, torch.Tensor):
-            self.assertTrue(
-                torch.equal(value_1, value_2),
-                f"Key {key}'s tensor does not match",
+            self.assertEqual(
+                value_1,
+                value_2,
+                rtol=0,
+                atol=0,
+                exact_device=True,
+                msg=f"Key {key}'s tensor does not match",
             )
 
     return True
diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
index 97b37ff2f185f..772ced8d18363 100644
--- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
+++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -191,12 +191,12 @@ def _test_ddp_parity(
 
         # Check that the gradients were modified by `clip_grad_norm_()`
         for param, orig_grad in zip(ddp_model.parameters(), orig_ddp_grads):
-            assert not torch.equal(param.grad, orig_grad)
+            self.assertNotEqual(param.grad, orig_grad, rtol=0, atol=0, exact_device=True)
         for param, orig_grad in zip(fsdp_model.parameters(), orig_fsdp_grads):
             if param.grad is None:
-                self.assertEqual(param.grad, orig_grad)  # `None`
+                self.assertIsNone(orig_grad)
             else:
-                assert not torch.equal(param.grad, orig_grad)
+                self.assertNotEqual(param.grad, orig_grad, rtol=0, atol=0, exact_device=True)
 
         # Run an optimizer step to ensure gradients matched after clipping
         ddp_optim.step()
diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index 8c972f8515634..3dbf5c82be5d6 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -159,7 +159,7 @@ def _check_equal(local, fsdp):
                     # above check would be vacuously true.
                     self.assertTrue(
                         any(
-                            not torch.equal(p1, p2)
+                            (p1 != p2).all()
                             for p1, p2 in zip(prev_params, m_local.parameters())
                         )
                     )
diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py
index 18055dbebffbf..acd7de93ed783 100644
--- a/test/distributed/fsdp/test_fsdp_summon_full_params.py
+++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py
@@ -155,9 +155,7 @@ def test_summon_full_param_shard_value(self, mixed_precision):
 
             # shards are padded but the full_param tensor is not
             a, b = my_shard[0 : my_slice.numel()], my_slice
-            self.assertTrue(
-                torch.equal(my_shard[0 : my_slice.numel()].cpu(), my_slice.cpu())
-            )
+            self.assertEqual(my_shard[0 : my_slice.numel()].cpu(), my_slice.cpu(), rtol=0, atol=0, exact_device=True)
 
     @skip_if_lt_x_gpu(2)
     @parametrize("recurse", [True, False])
diff --git a/test/distributed/pipeline/sync/test_pipe.py b/test/distributed/pipeline/sync/test_pipe.py
index abfa738603a1f..8eedc7a3c4908 100644
--- a/test/distributed/pipeline/sync/test_pipe.py
+++ b/test/distributed/pipeline/sync/test_pipe.py
@@ -777,7 +777,7 @@ def forward(self, a, b):
     model = Pipe(nn.Sequential(Module1().cuda(0), Module2().cuda(0)), chunks=2, checkpoint=checkpoint)
     t = torch.rand(10)
     res = model(t, t, t).local_value()
-    assert torch.equal(res, (t + t + t) + (t * t * t))
+    torch.testing.assert_close(res, (t + t + t) + (t * t * t), rtol=0, atol=0)
 
 @skip_if_no_cuda
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need atleast two GPUs")
diff --git a/test/fx/test_dce_pass.py b/test/fx/test_dce_pass.py
index 4f46b9982ba94..3de9223cef9d9 100644
--- a/test/fx/test_dce_pass.py
+++ b/test/fx/test_dce_pass.py
@@ -60,7 +60,7 @@ def is_leaf_module(self, m, qualname):
         traced.recompile()
         # Make sure we run and get the same results before/after DCE.
         inputs = [torch.tensor([1.5])] * new_num_phs
-        self.assertTrue(torch.equal(m(*inputs), traced(*inputs)))
+        self.assertEqual(m(*inputs), traced(*inputs), rtol=0, atol=0, exact_device=True)
 
     def test_simple(self):
         """
@@ -176,7 +176,7 @@ def __init__(self):
                 super().__init__()
 
             def forward(self, a: torch.Tensor) -> torch.Tensor:
-                torch._assert(torch.equal(a, a), "a must equal a")
+                torch._assert((a == a).all(), "a must equal a")
                 return a * 2
 
         # Note: Don't need to specify torch._assert as having side effects
diff --git a/test/fx/test_fx_const_fold.py b/test/fx/test_fx_const_fold.py
index d7f3b16f2466c..6d1a76593631f 100644
--- a/test/fx/test_fx_const_fold.py
+++ b/test/fx/test_fx_const_fold.py
@@ -79,7 +79,7 @@ def forward(self, x, y):
         in_x, in_y = torch.tensor([[-0.45]]), torch.tensor([0.9])
         base_result = mod(in_x, in_y)
         fold_result = mod_folded(in_x, in_y)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_basic_one_attr_name_collision(self):
         r"""
@@ -125,7 +125,7 @@ def forward(self, x, y):
         in_x, in_y = torch.tensor([[5.0]]), torch.tensor([4.0])
         base_result = mod(in_x, in_y)
         fold_result = mod_folded(in_x, in_y)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_basic_placeholder_reordered(self):
         """
@@ -157,7 +157,7 @@ def forward(self, x, y):
         in_y = torch.tensor([[0.45]])
         base_result = mod(in_x, in_y)
         fold_result = mod_folded(in_x, in_y)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_noop(self):
         r"""
@@ -188,7 +188,7 @@ def forward(self, x):
         in_x = torch.tensor([[-0.45]])
         base_result = mod(in_x)
         fold_result = mod_folded(in_x)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_basic_two_attr_three_input(self):
         r"""
@@ -237,7 +237,7 @@ def forward(self, x, y, z):
         )
         base_result = mod(in_x, in_y, in_z)
         fold_result = mod_folded(in_x, in_y, in_z)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_basic_two_attr(self):
         r"""
@@ -274,7 +274,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = mod_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_multi_const_folded_attrs(self):
         r"""
@@ -325,7 +325,7 @@ def forward(self, x, y):
         in_x, in_y = torch.randn(4, 4), torch.randn(4)
         fold_result = mod_folded(in_x, in_y)
         base_result = mod(in_x, in_y)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_submod_hierarchy(self):
         r"""
@@ -359,7 +359,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = mod_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_retain_node_meta(self):
         r"""
@@ -412,7 +412,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_has_inlined_call_module_node(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -433,7 +433,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_module_attr(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -455,7 +455,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_unused_placeholder(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -474,7 +474,7 @@ def forward(self, x, y, z):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x, in_x, in_x)
         base_result = mod(in_x, in_x, in_x)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_dict_output(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -493,7 +493,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result["result"], base_result["result"]))
+        self.assertEqual(fold_result["result"], base_result["result"], rtol=0, atol=0, exact_device=True)
 
     def test_two_outputs(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -512,8 +512,8 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result[0], base_result[0]))
-        self.assertTrue(torch.equal(fold_result[1], base_result[1]))
+        self.assertEqual(fold_result[0], base_result[0], rtol=0, atol=0, exact_device=True)
+        self.assertEqual(fold_result[1], base_result[1], rtol=0, atol=0, exact_device=True)
 
     def test_three_outputs(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -532,9 +532,9 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result[0], base_result[0]))
-        self.assertTrue(torch.equal(fold_result[1], base_result[1]))
-        self.assertTrue(torch.equal(fold_result[2], base_result[2]))
+        self.assertEqual(fold_result[0], base_result[0], rtol=0, atol=0, exact_device=True)
+        self.assertEqual(fold_result[1], base_result[1], rtol=0, atol=0, exact_device=True)
+        self.assertEqual(fold_result[2], base_result[2], rtol=0, atol=0, exact_device=True)
 
     def test_check_inline_non_const(self):
         r"""
@@ -566,7 +566,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_check_inline_non_const_mult_return(self):
         r"""
@@ -598,8 +598,8 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result[0], base_result[0]))
-        self.assertTrue(torch.equal(fold_result[1], base_result[1]))
+        self.assertEqual(fold_result[0], base_result[0], rtol=0, atol=0, exact_device=True)
+        self.assertEqual(fold_result[1], base_result[1], rtol=0, atol=0, exact_device=True)
 
     def test_check_skip_folding_quant_dequant_pattern(self):
         r"""
@@ -645,7 +645,7 @@ def skip_folding_quant_dequant(node: torch.fx.Node):
         # Now run both folded and non-folded to check results equal.
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
 
     def test_fold_module(self):
         r"""
@@ -667,7 +667,7 @@ def forward(self, x):
 
         # Now run both folded and non-folded to check results equal.
         inp = torch.randn(4, 4)
-        self.assertTrue(torch.equal(mod_folded(inp), mod(inp)))
+        self.assertEqual(mod_folded(inp), mod(inp), rtol=0, atol=0, exact_device=True)
 
     def test_const_fold_tensor_meta(self):
         self._test_const_fold_tensor_meta(True)
@@ -708,4 +708,4 @@ def forward(self, x, y):
         # Now run both folded and non-folded to check results equal.
         base_result = mod(in_x, in_y)
         fold_result = mod_folded(in_x, in_y)
-        self.assertTrue(torch.equal(fold_result, base_result))
+        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
diff --git a/test/jit/test_save_load.py b/test/jit/test_save_load.py
index 16babb7c7a254..daf6915af0084 100644
--- a/test/jit/test_save_load.py
+++ b/test/jit/test_save_load.py
@@ -404,7 +404,7 @@ def forward(self, a):
             m2 = torch.jit.load(path)
 
         x = torch.tensor([1.0, 2.0, 3.0, 4.0])
-        self.assertTrue(torch.equal(m(x), m2(x)))
+        self.assertEqual(m(x), m2(x), rtol=0, atol=0, exact_device=True)
 
     def test_save_nonexit_file(self):
         class Foo(torch.nn.Module):
@@ -880,7 +880,7 @@ def forward(self, a):
             m2 = torch.jit.load(path)
 
         x = torch.tensor([1.0, 2.0, 3.0, 4.0])
-        self.assertTrue(torch.equal(m(x), m2(x)))
+        self.assertEqual(m(x), m2(x), rtol=0, atol=0, exact_device=True)
 
     def test_save_namedtuple_input_only(self):
         """
diff --git a/test/nn/test_lazy_modules.py b/test/nn/test_lazy_modules.py
index c3a9dff200224..0264f959f614d 100644
--- a/test/nn/test_lazy_modules.py
+++ b/test/nn/test_lazy_modules.py
@@ -118,7 +118,7 @@ def test_linear(self):
         self.assertTrue(module.weight.shape == (10, 5))
         self.assertTrue(module.bias.shape == (10,))
         y = module(input)
-        self.assertTrue(torch.equal(torch.nn.functional.linear(input, module.weight, module.bias), y))
+        self.assertEqual(torch.nn.functional.linear(input, module.weight, module.bias), y, rtol=0, atol=0, exact_device=True)
 
     @suppress_warnings
     def test_lazy_linear_pickle(self):
@@ -170,7 +170,7 @@ def _check_lazy_conv(self, cls, lazy_cls, func, init_args, input_shape,
         if module.bias is not None:
             self.assertEqual(module.bias.shape, expected_bias_shape)
         y = module(input)
-        self.assertTrue(torch.equal(func(input, module.weight, module.bias), y))
+        self.assertEqual(func(input, module.weight, module.bias), y, rtol=0, atol=0, exact_device=True)
 
     def _check_lazy_conv_pickle(self, cls, lazy_cls, init_args, input_shape,
                                 expected_weight_shape, expected_bias_shape):
diff --git a/test/quantization/core/experimental/test_fake_quantize.py b/test/quantization/core/experimental/test_fake_quantize.py
index 4e9464aca800a..609ec0366a85f 100644
--- a/test/quantization/core/experimental/test_fake_quantize.py
+++ b/test/quantization/core/experimental/test_fake_quantize.py
@@ -29,9 +29,9 @@ def test_fake_calc_qparams(self):
         qparams_expected = observer.calculate_qparams(signed=False)
 
         self.assertEqual(alpha, qparams_expected[0])
-        self.assertTrue(torch.equal(gamma, qparams_expected[1]))
-        self.assertTrue(torch.equal(quantization_levels, qparams_expected[2]))
-        self.assertTrue(torch.equal(level_indices, qparams_expected[3]))
+        self.assertEqual(gamma, qparams_expected[1], rtol=0, atol=0, exact_device=True)
+        self.assertEqual(quantization_levels, qparams_expected[2], rtol=0, atol=0, exact_device=True)
+        self.assertEqual(level_indices, qparams_expected[3], rtol=0, atol=0, exact_device=True)
 
     r""" Tests fake quantize forward() method
          by comparing result with expected
@@ -58,7 +58,7 @@ def test_forward(self):
         X_to_apot = quantize_APoT(X, alpha, gamma, quantization_levels, level_indices)
         X_expected = dequantize_APoT(X_to_apot)
 
-        self.assertTrue(torch.equal(X_reduced_precision_fp, X_expected))
+        self.assertEqual(X_reduced_precision_fp, X_expected, rtol=0, atol=0, exact_device=True)
 
     r""" Tests fake quantize forward() method
          throws error when qparams are None
diff --git a/test/quantization/core/experimental/test_linear.py b/test/quantization/core/experimental/test_linear.py
index 6a46b4fc3ccbf..d36d7fbdff667 100644
--- a/test/quantization/core/experimental/test_linear.py
+++ b/test/quantization/core/experimental/test_linear.py
@@ -31,7 +31,7 @@ def test_linear_APoT_k1(self):
 
         fp_linear_result = fp_linear(activation).data
 
-        self.assertTrue(torch.equal(apot_linear_result, fp_linear_result))
+        self.assertEqual(apot_linear_result, fp_linear_result, rtol=0, atol=0, exact_device=True)
 
     """
         Test linear_APoT_fn by comparing to uniform linear
@@ -59,7 +59,7 @@ def test_linear_APoT_k2(self):
 
         fp_linear_result = fp_linear(activation).data
 
-        self.assertTrue(torch.equal(apot_linear_result, fp_linear_result))
+        self.assertEqual(apot_linear_result, fp_linear_result, rtol=0, atol=0, exact_device=True)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/quantization/core/experimental/test_quantized_tensor.py b/test/quantization/core/experimental/test_quantized_tensor.py
index 02286b94f8db3..0bb9a4e536fbc 100644
--- a/test/quantization/core/experimental/test_quantized_tensor.py
+++ b/test/quantization/core/experimental/test_quantized_tensor.py
@@ -35,7 +35,7 @@ def test_int_repr(self):
         # 0.0215 in tensor2quantize nearest 0.0208 in quantization_levels -> 3 in level_indices
         expected_qtensor_data = torch.tensor([0, 3, 8, 13, 5, 12], dtype=torch.int32)
 
-        self.assertTrue(torch.equal(qtensor_data, expected_qtensor_data))
+        self.assertEqual(qtensor_data, expected_qtensor_data, rtol=0, atol=0, exact_device=True)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/quantization/core/experimental/test_quantizer.py b/test/quantization/core/experimental/test_quantizer.py
index d689ee8e99e15..5fa342aa82f1d 100644
--- a/test/quantization/core/experimental/test_quantizer.py
+++ b/test/quantization/core/experimental/test_quantizer.py
@@ -48,7 +48,7 @@ def test_quantize_APoT_rand_k1(self):
         qtensor_data = qtensor.data.int()
         uniform_quantized_tensor = uniform_quantized.data.int()
 
-        self.assertTrue(torch.equal(qtensor_data, uniform_quantized_tensor))
+        self.assertEqual(qtensor_data, uniform_quantized_tensor, rtol=0, atol=0, exact_device=True)
 
     r""" Tests quantize_APoT for k != 1.
         Tests quantize_APoT result on random 1-dim tensor and hardcoded values for
@@ -92,7 +92,7 @@ def test_quantize_APoT_k2(self):
         # 0.0215 in tensor2quantize nearest 0.0208 in quantization_levels -> 3 in level_indices
         expected_qtensor = torch.tensor([0, 3, 8, 13, 5, 12], dtype=torch.int32)
 
-        self.assertTrue(torch.equal(qtensor_data, expected_qtensor))
+        self.assertEqual(qtensor_data, expected_qtensor, rtol=0, atol=0, exact_device=True)
 
     r""" Tests dequantize_apot result on random 1-dim tensor
         and hardcoded values for b, k.
@@ -137,7 +137,7 @@ def test_dequantize_quantize_rand_b4(self):
 
         result = final_apot.data.int()
 
-        self.assertTrue(torch.equal(original_input, result))
+        self.assertEqual(original_input, result, rtol=0, atol=0, exact_device=True)
 
     r""" Tests dequantize_apot result on random 1-dim tensor
         and hardcoded values for b, k.
@@ -182,7 +182,7 @@ def test_dequantize_quantize_rand_b6(self):
 
         result = final_apot.data.int()
 
-        self.assertTrue(torch.equal(original_input, result))
+        self.assertEqual(original_input, result, rtol=0, atol=0, exact_device=True)
 
     r""" Tests for correct dimensions in dequantize_apot result
          on random 3-dim tensor with random dimension sizes
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 116a76a2d1f5e..3333e19e2535f 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -2244,7 +2244,7 @@ def test_quantized_mean_qnnpack(self, keep):
             XQ = torch.quantize_per_tensor(X, scale=0.2, zero_point=0, dtype=torch.quint8)
             YQ = torch.quantize_per_tensor(Y, scale=0.2, zero_point=0, dtype=torch.quint8)
             MQ = XQ.mean((2, 3), keepdim=keep)
-            self.assertTrue(torch.equal(MQ, YQ))
+            self.assertEqual(MQ, YQ, rtol=0, atol=0, exact_device=True)
 
     @override_qengines
     def test_std(self):
@@ -3171,7 +3171,7 @@ def test_linear_prepack_fp16_numerics(self, input_channels, output_channels, exp
         w_packed_fp16 = torch.ops.quantized.linear_prepack_fp16(w, bias)
         w_unpacked_fp16 = torch.ops.quantized.linear_unpack_fp16(w_packed_fp16)
         w_fp16 = w.to(torch.float16).to(torch.float32)
-        self.assertTrue(torch.equal(w_fp16, w_unpacked_fp16[0]))
+        self.assertEqual(w_fp16, w_unpacked_fp16[0], rtol=0, atol=0, exact_device=True)
 
     @skipIfNoFBGEMM
     def test_qlinear_dynamic_fp16(self):
diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 241aab5da3237..aacfa607909fd 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -1201,7 +1201,7 @@ def test_qtensor_view(self):
             self.assertNotEqual(b.int_repr(), c.int_repr())
             # torch.equal is not supported for the cuda backend
             if device == 'cpu':
-                self.assertFalse(torch.equal(b, c))
+                self.assertNotEqual(b, c, rtol=0, atol=0, exact_device=True)
 
             # a case can't view non-contiguos Tensor
             a_int = torch.randint(0, 100, [1, 2, 3, 4], device=device, dtype=dtype)
@@ -1248,7 +1248,7 @@ def test_qtensor_resize(self):
             self.assertNotEqual(b.int_repr(), c.int_repr())
             # torch.equal is not supported for the cuda backend
             if device == 'cpu':
-                self.assertFalse(torch.equal(b, c))
+                self.assertNotEqual(b, c, rtol=0, atol=0, exact_device=True)
 
             # Throws an error if numel is wrong
             q1_int = torch.randint(0, 100, sizes1, dtype=dtype, device=device)
@@ -1282,7 +1282,7 @@ def test_qtensor_reshape(self):
             self.assertNotEqual(b.int_repr(), c.int_repr())
             # torch.equal is not supported for the cuda backend
             if device == 'cpu':
-                self.assertFalse(torch.equal(b, c))
+                self.assertNotEqual(b, c, rtol=0, atol=0, exact_device=True)
 
             # we can use reshape for non-contiguous Tensor
             a_int = torch.randint(0, 100, [1, 2, 3, 4], dtype=dtype, device=device)
diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py
index 6ac8bed90ca3f..ab9dc277d2552 100644
--- a/test/quantization/core/test_workflow_module.py
+++ b/test/quantization/core/test_workflow_module.py
@@ -583,12 +583,12 @@ def test_observer_scriptable(self, qdtype):
         x = torch.rand(3, 4)
         obs(x)
         scripted(x)
-        self.assertTrue(torch.equal(obs.get_tensor_value()[0], scripted.get_tensor_value()[0]))
+        self.assertEqual(obs.get_tensor_value()[0], scripted.get_tensor_value()[0], rtol=0, atol=0, exact_device=True)
         buf = io.BytesIO()
         torch.jit.save(scripted, buf)
         buf.seek(0)
         loaded = torch.jit.load(buf)
-        self.assertTrue(torch.equal(obs.get_tensor_value()[0], loaded.get_tensor_value()[0]))
+        self.assertEqual(obs.get_tensor_value()[0], loaded.get_tensor_value()[0], rtol=0, atol=0, exact_device=True)
 
 class TestHistogramObserver(QuantizationTestCase):
     @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)),
@@ -606,12 +606,12 @@ def test_observer_scriptable(self, qdtype, qscheme):
             x = torch.rand(3, 4)
             obs(x)
             scripted(x)
-            self.assertTrue(torch.equal(obs.histogram, scripted.histogram))
+            self.assertEqual(obs.histogram, scripted.histogram, rtol=0, atol=0, exact_device=True)
             buf = io.BytesIO()
             torch.jit.save(scripted, buf)
             buf.seek(0)
             loaded = torch.jit.load(buf)
-            self.assertTrue(torch.equal(obs.histogram, scripted.histogram))
+            self.assertEqual(obs.histogram, scripted.histogram, rtol=0, atol=0, exact_device=True)
 
     @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)),
            qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric)),
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 107e6eb589f2e..f33fe306d8203 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -1012,7 +1012,7 @@ def checkSerDeser(model, is_dynamic):
                         module.weight_scale = None
                         model.load_state_dict(state_dict)
                         module = getattr(model, module_name)
-                        self.assertTrue(torch.equal(prev_scale, module.weight_scale))
+                        self.assertEqual(prev_scale, module.weight_scale, rtol=0, atol=0, exact_device=True)
 
 
             checkWeightQParams(qr)
@@ -4551,7 +4551,7 @@ def forward(self, x):
             m_ref = convert_to_reference_fx(m_copy)
             result = m(*example_inputs)
             result_ref = m_ref(*example_inputs)
-            self.assertTrue(torch.equal(result, result_ref))
+            self.assertEqual(result, result_ref, rtol=0, atol=0, exact_device=True)
 
     def test_ref_conv_module(self):
         """ Make sure the numerics for models with ref conv module
@@ -4589,7 +4589,7 @@ def forward(self, x):
             m_ref = convert_to_reference_fx(m_copy)
             result = m(data)
             result_ref = m_ref(data)
-            self.assertTrue(torch.equal(result, result_ref))
+            self.assertEqual(result, result_ref, rtol=0, atol=0, exact_device=True)
 
     def test_sub_scalar(self):
         class M(torch.nn.Module):
@@ -4690,7 +4690,7 @@ def forward(self, x):
             }
             self.checkGraphModuleNodes(m, expected_node_occurrence=expected_node_occurrence)
             # checking result match
-            self.assertTrue(torch.equal(out_ref, out))
+            self.assertEqual(out_ref, out, rtol=0, atol=0, exact_device=True)
 
     def test_convert_qconfig_mapping(self):
         class Linear(torch.nn.Module):
diff --git a/test/test_autocast.py b/test/test_autocast.py
index 1a8263a79f93d..4bae83db3788e 100644
--- a/test/test_autocast.py
+++ b/test/test_autocast.py
@@ -59,7 +59,7 @@ def cast(val, to_type):
             # For example, lstm_cell returns a tuple and equal returns bool.
             def compare(first, second):
                 if isinstance(first, torch.Tensor):
-                    return torch.equal(first, second)
+                    return (first == second).all().item()
                 elif isinstance(first, collections.abc.Iterable):
                     return all(compare(f, s) for f, s in zip(first, second))
                 else:
diff --git a/test/test_autograd.py b/test/test_autograd.py
index 5dd695e14b323..44a5c7ccc4509 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -3764,8 +3764,8 @@ def test_calculate_shape_util(self):
         grad = torch.nested.as_nested_tensor([torch.randn(5, 10, requires_grad=True), torch.randn(5, 10, requires_grad=True)])
         out_shape, grad_shape = _calculate_shape(out, grad, False)
 
-        assert torch.equal(out_shape, torch.tensor([[10, 5], [10, 5], [10, 5]]))
-        assert torch.equal(grad_shape, torch.tensor([[5, 10], [5, 10]]))
+        torch.testing.assert_close(out_shape, torch.tensor([[10, 5], [10, 5], [10, 5]]), rtol=0, atol=0)
+        torch.testing.assert_close(grad_shape, torch.tensor([[5, 10], [5, 10]]), rtol=0, atol=0)
 
     def test_nested_anomaly_detect_nan(self):
         size = 10
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 40eaaa97a3b7e..b4c2281b1de52 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -2877,7 +2877,7 @@ def cast(val, to_type):
             # For example, lstm_cell returns a tuple and equal returns bool.
             def compare(first, second):
                 if isinstance(first, torch.Tensor):
-                    return torch.equal(first, second)
+                    return (first == second).all().item()
                 elif isinstance(first, collections.abc.Iterable):
                     return all(compare(f, s) for f, s in zip(first, second))
                 else:
@@ -4693,13 +4693,13 @@ def test_gather_namedtuple(self):
         for i, x in enumerate(out):
             self.assertTrue(isinstance(x, type(out2[-1])))  # x must be a tensor
             cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
-            self.assertTrue(torch.equal(x, cat))
+            self.assertEqual(x, cat, rtol=0, atol=0, exact_device=True)
 
         out = scatter_gather.gather(outputs, 0)  # test on GPU
         for i, x in enumerate(out):
             self.assertTrue(isinstance(x, type(out2[-1])))
             cat = torch.cat((outputs[0][i].to(0), outputs[1][i].to(0)))
-            self.assertTrue(torch.equal(x, cat))
+            self.assertEqual(x, cat, rtol=0, atol=0, exact_device=True)
 
         class TestNamedTupleInput_1(NamedTuple):
             a: torch.tensor
@@ -4719,13 +4719,13 @@ class TestNamedTupleInput_1(NamedTuple):
         for i, x in enumerate(out):
             self.assertTrue(isinstance(x, type(out2[-1])))
             cat = torch.cat((outputs[0][i].to(0), outputs[1][i].to(0)))
-            self.assertTrue(torch.equal(x, cat))
+            self.assertEqual(x, cat, rtol=0, atol=0, exact_device=True)
 
         out = scatter_gather.gather(outputs, 'cpu')  # test on CPU
         for i, x in enumerate(out):
             self.assertTrue(isinstance(x, type(out2[-1])))
             cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
-            self.assertTrue(torch.equal(x, cat))
+            self.assertEqual(x, cat, rtol=0, atol=0, exact_device=True)
 
     @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync")
     def test_memory_snapshot(self):
diff --git a/test/test_jit.py b/test/test_jit.py
index 6cbc091d506b5..c1aeee2e66c18 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -744,7 +744,7 @@ def check(x, y):
     def test_matrix_transpose(self):
         @torch.jit.script
         def check(x):
-            return torch.equal(x.mT, x.transpose(-2, -1))
+            return bool((x.mT == x.transpose(-2, -1)).all())
 
         x = torch.rand(3, 4)
         self.assertTrue(check(x))
@@ -752,7 +752,7 @@ def check(x):
     def test_transpose(self):
         @torch.jit.script
         def check(x):
-            return torch.equal(x.T, x.t())
+            return bool((x.T == x.t()).all())
 
         x = torch.rand(3, 4)
         self.assertTrue(check(x))
@@ -760,7 +760,7 @@ def check(x):
     def test_matrix_conj_transpose(self):
         @torch.jit.script
         def check(x):
-            return torch.equal(x.mH, x.transpose(-2, -1).conj())
+            return bool((x.mH == x.transpose(-2, -1).conj()).all())
 
         x = torch.rand(3, 4)
         self.assertTrue(check(x))
@@ -771,7 +771,7 @@ def check(x):
     def test_conj_transpose(self):
         @torch.jit.script
         def check(x):
-            return torch.equal(x.H, x.t().conj())
+            return bool((x.H == x.t().conj()).all())
 
         x = torch.rand(3, 4)
         self.assertTrue(check(x))
@@ -1139,7 +1139,7 @@ def test_script_backward_twice_with_saved_values(input1, input2):
             # type: (Tensor, Tensor) -> Tensor
             tmp1 = torch.mul(input1, input2)
             tmp2 = torch.abs(tmp1)
-            if torch.equal(input1, input2):
+            if (input1 == input2).all():
                 tmp2 = torch.acos(tmp2)
             else:
                 tmp2 = torch.atan(tmp2)
@@ -1694,7 +1694,7 @@ def doit(x, y):
         t_node = g2.create("prim::TensorTest").t_("a", torch.ones([2, 2]))
         self.assertEqual(t_node.attributeNames(), ["a"])
         g2.appendNode(t_node)
-        self.assertTrue(torch.equal(torch.ones(2, 2), t_node.t("a")))
+        self.assertEqual(torch.ones(2, 2), t_node.t("a"), rtol=0, atol=0, exact_device=True)
         for node in g.nodes():
             self.assertTrue(g2.findNode(node.kind()) is not None)
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 5b45bd4e6e692..458d0f8f1baa4 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1650,7 +1650,7 @@ def test_div_bugs(self):
     def test_bool_expand(self):
         x = torch.tensor([[1], [0]], dtype=torch.bool, device='mps')
         y = torch.tensor([0, 1], dtype=torch.bool, device='mps')
-        self.assertFalse(torch.equal(x.expand(2, 2), y.expand(2, 2)))
+        self.assertNotEqual(x.expand(2, 2), y.expand(2, 2), rtol=0, atol=0, exact_device=True)
 
     # Empty unary op should return tensor of the same size
     def test_empty_neg(self):
@@ -4960,7 +4960,7 @@ def helper(alpha):
         # see https://github.com/pytorch/pytorch/issues/79835#issuecomment-1164984534
         x = torch.ones(4, dtype=torch.int32, device='mps')
         self.assertEqual(x + 1, torch.full((4,), 2, dtype=torch.int32, device='mps'))
-        self.assertTrue(torch.equal(x + 1.5, torch.full((4,), 2.5, device='mps')))
+        self.assertEqual(x + 1.5, torch.full((4,), 2.5, device='mps'), rtol=0, atol=0, exact_device=True)
 
     def test_types_binary_op(self):
         # Float * Bool
diff --git a/test/test_namedtensor.py b/test/test_namedtensor.py
index 751a56f168e78..cf7de3fe52b53 100644
--- a/test/test_namedtensor.py
+++ b/test/test_namedtensor.py
@@ -1082,31 +1082,31 @@ def test_flatten_nodims(self):
 
     def test_unflatten(self):
         # test args: tensor, int, namedshape
-        self.assertTrue(torch.equal(
-            torch.ones(4, names=('A',)).unflatten('A', (('A', 2), ('B', 2))),
-            torch.ones(2, 2, names=('A', 'B'))))
-        self.assertTrue(torch.equal(
-            torch.ones(4, names=('A',)).unflatten('A', [('A', 2), ('B', 2)]),
-            torch.ones(2, 2, names=('A', 'B'))))
-        self.assertTrue(torch.equal(
-            torch.ones(4, names=('A',)).unflatten('A', (['A', 2], ['B', 2])),
-            torch.ones(2, 2, names=('A', 'B'))))
-        self.assertTrue(torch.equal(
-            torch.ones(2, 10, names=('A', 'B')).unflatten('B', (['B1', -1],)),
-            torch.ones(2, 10, names=('A', 'B1'))))
-        self.assertTrue(torch.equal(
-            torch.ones(2, 3 * 4 * 5 * 6, names=('A', 'B'))
-                 .unflatten('B', (['B1', 3], ['B2', 4], ['B3', -1], ['B4', 6])),
-            torch.ones(2, 3, 4, 5, 6, names=('A', 'B1', 'B2', 'B3', 'B4'))))
-        self.assertTrue(torch.equal(
-            torch.ones(2, 0, names=('A', 'B'))
-                 .unflatten('B', (['B1', 3], ['B2', -1], ['B3', 4])),
-            torch.ones(2, 3, 0, 4, names=('A', 'B1', 'B2', 'B3'))))
+        self.assertTrue(
+            (torch.ones(4, names=('A',)).unflatten('A', (('A', 2), ('B', 2))) ==
+             torch.ones(2, 2, names=('A', 'B'))).all())
+        self.assertTrue(
+            (torch.ones(4, names=('A',)).unflatten('A', [('A', 2), ('B', 2)]) ==
+             torch.ones(2, 2, names=('A', 'B'))).all())
+        self.assertTrue(
+            (torch.ones(4, names=('A',)).unflatten('A', (['A', 2], ['B', 2])) ==
+             torch.ones(2, 2, names=('A', 'B'))).all())
+        self.assertTrue(
+            (torch.ones(2, 10, names=('A', 'B')).unflatten('B', (['B1', -1],)) ==
+             torch.ones(2, 10, names=('A', 'B1'))).all())
+        self.assertTrue(
+            (torch.ones(2, 3 * 4 * 5 * 6, names=('A', 'B'))
+                  .unflatten('B', (['B1', 3], ['B2', 4], ['B3', -1], ['B4', 6])) ==
+             torch.ones(2, 3, 4, 5, 6, names=('A', 'B1', 'B2', 'B3', 'B4'))).all())
+        self.assertTrue(
+            (torch.ones(2, 0, names=('A', 'B'))
+                  .unflatten('B', (['B1', 3], ['B2', -1], ['B3', 4])) ==
+             torch.ones(2, 3, 0, 4, names=('A', 'B1', 'B2', 'B3'))).all())
 
         # test args: namedtensor, str, namedshape
-        self.assertTrue(torch.equal(
-            torch.ones(2, 4, names=('A', 'B')).unflatten('B', (('B1', 2), ('B2', 2))),
-            torch.ones(2, 2, 2, names=('A', 'B1', 'B2'))))
+        self.assertTrue(
+            (torch.ones(2, 4, names=('A', 'B')).unflatten('B', (('B1', 2), ('B2', 2))) ==
+             torch.ones(2, 2, 2, names=('A', 'B1', 'B2'))).all())
 
         # test invalid args: namedtensor, str, sizes
         with self.assertRaisesRegex(TypeError, r"unflatten\(\): argument 'dim' \(position 1\) must be int, not str"):
diff --git a/test/test_nn.py b/test/test_nn.py
index 60fb0e6c0cff3..dd056cf103db2 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -1749,7 +1749,7 @@ def test_vector_to_parameters(self):
         vector_to_parameters(vec, model.parameters())
 
         sample = next(model.parameters())[0, 0, 0]
-        self.assertTrue(torch.equal(sample.data, vec.data[:5]))
+        self.assertEqual(sample.data, vec.data[:5], rtol=0, atol=0, exact_device=True)
 
     def test_rnn_weight_norm(self):
         def check_weight_norm(l, name, num_params):
@@ -5119,9 +5119,9 @@ def test_batchnorm_buffer_update_when_stats_are_not_tracked(self):
         # Forward random tensor
         _ = bn(torch.rand(input_size))
         # Ensure none of the buffers has been updated
-        self.assertTrue(torch.equal(num_batches, bn.num_batches_tracked))
-        self.assertTrue(torch.equal(running_mean, bn.running_mean))
-        self.assertTrue(torch.equal(running_var, bn.running_var))
+        self.assertEqual(num_batches, bn.num_batches_tracked, rtol=0, atol=0, exact_device=True)
+        self.assertEqual(running_mean, bn.running_mean, rtol=0, atol=0, exact_device=True)
+        self.assertEqual(running_var, bn.running_var, rtol=0, atol=0, exact_device=True)
 
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_batchnorm_nhwc_cuda(self):
@@ -5133,7 +5133,7 @@ def test_batchnorm_nhwc_cuda(self):
             inp2 = inp1.contiguous(memory_format=torch.channels_last)
             out1 = model(inp1)
             out2 = model(inp2)
-            self.assertTrue(torch.equal(out1, out2))
+            self.assertEqual(out1, out2, rtol=0, atol=0, exact_device=True)
 
     def test_pairwise_distance(self):
         input1 = torch.randn(4, 4, requires_grad=True)
diff --git a/test/test_serialization.py b/test/test_serialization.py
index 190fb7545fcf0..b727ca2ebbbdf 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -282,7 +282,7 @@ def test_serialization_offset_gzip(self):
         with gzip.open(f2.name, 'rb') as f:
             j = pickle.load(f)
             b = torch.load(f)
-        self.assertTrue(torch.equal(a, b))
+        self.assertEqual(a, b, rtol=0, atol=0, exact_device=True)
         self.assertEqual(i, j)
 
     def _test_serialization_sparse(self, weights_only):
@@ -554,7 +554,7 @@ def _test_serialization_filelike(self, tensor, mock, desc):
         msg = 'filelike serialization with {}'
 
         b = torch.load(data)
-        self.assertTrue(torch.equal(tensor, b), msg.format(desc))
+        self.assertEqual(tensor, b), msg.format(desc, rtol=0, atol=0, exact_device=True)
 
     @unittest.skipIf((3, 8, 0) <= sys.version_info < (3, 8, 2), "See https://bugs.python.org/issue39681")
     def test_serialization_filelike_missing_attrs(self):
@@ -827,7 +827,7 @@ def test_serialization_container_filelike(self):
 
     def test_serialization_offset(self):
         a = torch.randn(5, 5)
-        b = torch.randn(1024, 1024, 512, dtype=torch.float32)
+        b = torch.randn(1024, 512, dtype=torch.float32)
         m = torch.nn.Conv2d(1, 1, (1, 3))
         i, j = 41, 43
         with tempfile.NamedTemporaryFile() as f:
@@ -836,15 +836,15 @@ def test_serialization_offset(self):
             pickle.dump(j, f)
             torch.save(b, f)
             torch.save(m, f)
-            self.assertTrue(f.tell() > 2 * 1024 * 1024 * 1024)
+            self.assertTrue(f.tell() > 2 * 1024 * 1024)
             f.seek(0)
             i_loaded = pickle.load(f)
             a_loaded = torch.load(f)
             j_loaded = pickle.load(f)
             b_loaded = torch.load(f)
             m_loaded = torch.load(f)
-        self.assertTrue(torch.equal(a, a_loaded))
-        self.assertTrue(torch.equal(b, b_loaded))
+        self.assertEqual(a, a_loaded, rtol=0, atol=0, exact_device=True)
+        self.assertEqual(b, b_loaded, rtol=0, atol=0, exact_device=True)
         self.assertTrue(m.kernel_size == m_loaded.kernel_size)
         self.assertEqual(i, i_loaded)
         self.assertEqual(j, j_loaded)
@@ -852,21 +852,21 @@ def test_serialization_offset(self):
     @parametrize('weights_only', (True, False))
     def test_serialization_offset_filelike(self, weights_only):
         a = torch.randn(5, 5)
-        b = torch.randn(1024, 1024, 512, dtype=torch.float32)
+        b = torch.randn(1024, 512, dtype=torch.float32)
         i, j = 41, 43
         with BytesIOContext() as f:
             pickle.dump(i, f)
             torch.save(a, f)
             pickle.dump(j, f)
             torch.save(b, f)
-            self.assertTrue(f.tell() > 2 * 1024 * 1024 * 1024)
+            self.assertTrue(f.tell() > 2 * 1024 * 1024)
             f.seek(0)
             i_loaded = pickle.load(f)
             a_loaded = torch.load(f, weights_only=weights_only)
             j_loaded = pickle.load(f)
             b_loaded = torch.load(f, weights_only=weights_only)
-        self.assertTrue(torch.equal(a, a_loaded))
-        self.assertTrue(torch.equal(b, b_loaded))
+        self.assertEqual(a, a_loaded, rtol=0, atol=0, exact_device=True)
+        self.assertEqual(b, b_loaded, rtol=0, atol=0, exact_device=True)
         self.assertEqual(i, i_loaded)
         self.assertEqual(j, j_loaded)
 
diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py
index cf894f3749eb9..30cec4d1dffc0 100644
--- a/test/test_tensorexpr.py
+++ b/test/test_tensorexpr.py
@@ -1475,7 +1475,7 @@ def test(x):
                 scripted = torch.jit.script(test)
                 out = warmup_and_run_forward(scripted, x)
                 self.assertLastGraphAllFused()
-                assert torch.equal(out, test(x))
+                torch.testing.assert_close(out, test(x), rtol=0, atol=0)
 
     def test_simple_add(self):
         val = torch._C._jit_get_te_generate_block_code()
diff --git a/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py b/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py
index fa1eded53b5ce..70f2cb6ea7c0d 100644
--- a/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py
+++ b/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py
@@ -17,7 +17,7 @@ def _communicate_result(result, pg):
 
     expected_result = torch.ones(1, device=torch.device(torch.cuda.current_device())) * dist.get_world_size(pg)
 
-    return torch.equal(result_tensor, expected_result)
+    return (result_tensor == expected_result).all().item()
 
 def binary_cmp(cmp_fun, types, args, kwargs=None, process_group=None):
     if len(args) != 2:
diff --git a/torch/distributed/tensor/parallel/multihead_attention_tp.py b/torch/distributed/tensor/parallel/multihead_attention_tp.py
index 03dc25161bdf8..ff5c547b396ba 100644
--- a/torch/distributed/tensor/parallel/multihead_attention_tp.py
+++ b/torch/distributed/tensor/parallel/multihead_attention_tp.py
@@ -151,9 +151,10 @@ def forward(
                 self.value(value), 1, (sk, b * nh, hn)
             )
         else:
-            assert torch.equal(query, key) and torch.equal(
-                query, value
-            ), "inputs are different for self-attention."
+            torch.testing.assert_close(
+                (query, query), (key, value), rtol=0, atol=0, msg="inputs are different for self-attention."
+            )
+
             # =====================
             # Query
             # =====================
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 9984f602425cd..37ba0e56e6ba0 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -894,7 +894,7 @@ def expand_as(g: jit_utils.GraphContext, self, other):
         self_t = self_t.to(torch.double)
         dims = []
         for d in range(self_t.dim()):
-            if torch.equal(self_t.mean(d).unsqueeze(d).expand_as(self_t), self_t):
+            if (self_t.mean(d).unsqueeze(d).expand_as(self_t) == self_t).all():
                 dims.append(d)
                 self = g.op("Constant", value_t=self_t.mean(dims).to(orig_type))
 
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 814dd3d5ad5f8..fa0c4a389ef66 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -114,9 +114,10 @@ def __init__(self, x):
 
     def __eq__(self, other):
         def eq(value, other):
-            if isinstance(value, torch.Tensor):
-                return torch.equal(value, other)
-            return value == other
+            result = value == other
+            if isinstance(result, torch.Tensor):
+                result = result.all().item()
+            return result
 
         for attr, value in self.__dict__.items():
             other_value = other.__dict__[attr]
@@ -352,7 +353,7 @@ def __init__(self):
 
     def forward(self, x):
         # Second layer is used dependent on input x.
-        use_second_layer = torch.equal(x, torch.ones(20, 10, device=x.device))
+        use_second_layer = (x == torch.ones(20, 10, device=x.device)).all()
         if use_second_layer:
             return self.lin2(F.relu(self.lin1(x)))
         else:
@@ -3312,7 +3313,7 @@ def _run_all_gather_coalesced_and_verify(
 
             for l1, l2 in zip(output_tensor_lists, expected_tensors):
                 for t1, t2 in zip(l1, l2):
-                    if not torch.equal(t1, t2):
+                    if not (t1 == t2).all():
                         return False
             return True
 
@@ -7452,7 +7453,7 @@ def forward(self, x):
                     # Control-flow that is rank and input dependent for the
                     # model.
                     use_second_layer = (
-                        torch.equal(x, torch.ones(batch, dim, device=x.device))
+                        (x == torch.ones(batch, dim, device=x.device)).all()
                         and self.rank == 1
                     )
 
diff --git a/torch/testing/_internal/distributed/nn/api/remote_module_test.py b/torch/testing/_internal/distributed/nn/api/remote_module_test.py
index 997006353bfbd..8a4d5c09512ec 100644
--- a/torch/testing/_internal/distributed/nn/api/remote_module_test.py
+++ b/torch/testing/_internal/distributed/nn/api/remote_module_test.py
@@ -242,7 +242,7 @@ def test_remote_parameters(self):
         ):
             param_rrefs = remote_module.remote_parameters()
             self.assertEqual(len(param_rrefs), 1)
-            self.assertTrue(torch.equal(param_rrefs[0].to_here(), _PARAM_VAL))
+            self.assertEqual(param_rrefs[0].to_here(), _PARAM_VAL, rtol=0, atol=0, exact_device=True)
 
     @dist_utils.dist_init
     def test_get_module_rref(self):
@@ -257,7 +257,7 @@ def test_get_module_rref(self):
             rref = remote_module.get_module_rref()
             self.assertEqual(rref, remote_module.module_rref)
             for param in rref.to_here().parameters():
-                self.assertTrue(torch.equal(param, _PARAM_VAL))
+                self.assertEqual(param, _PARAM_VAL, rtol=0, atol=0, exact_device=True)
 
     @dist_utils.dist_init
     def test_train_eval(self):
diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
index 5d7831659fc1c..d64369767ff70 100644
--- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
+++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
@@ -71,7 +71,7 @@ def _compare_owner_value(context_id, rref, grad):
         grad = grad.to_dense()
     else:
         assert not grad.is_sparse
-    return torch.equal(x, grad)
+    return (x == grad).all()
 
 
 def create_tensor():
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index 4c0239ac653ee..a48c8760d1373 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -237,8 +237,7 @@ def non_cont_test(t_view, t_cont):
         raise Exception('t_view is contiguous!')
     if not t_cont.is_contiguous():
         raise Exception('t_cont is not contiguous!')
-    if not torch.equal(t_view, t_cont):
-        raise Exception('t_view is not equal to t_cont!')
+    torch.testing.assert_close(t_view, t_cont, rtol=0, atol=0, msg='t_view is not equal to t_cont!')
     return t_view
 
 def my_function(a, b, c):
@@ -1068,7 +1067,7 @@ def _trainer_func(self, rref, sparse):
             ps_gradient = rref.rpc_sync().get_gradient(rref)
             if ps_gradient.is_sparse:
                 ps_gradient = ps_gradient.to_dense().double()
-            self.assertTrue(torch.equal(gradient, ps_gradient))
+            self.assertEqual(gradient, ps_gradient, rtol=0, atol=0, exact_device=True)
 
     def _my_parameter_server(self, sparse):
         ps_rref = RRef(MyParameterServer(self.world_size - 1))

From 4e06f0f21c57fa889efb228a0045d52ba53006bd Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 1 Dec 2022 03:10:07 +0000
Subject: [PATCH 1469/1922] [FSDP][Dynamo] Define annotation attributes as
 globals (#89913)

This was separated out from the previous PR to decouple. Since not all builds include `torch.distributed`, we should define the globals in the dynamo file and import to distributed instead of vice versa. Unlike the version from the previous PR, this PR prefixes the globals with `_` to future proof against `_dynamo/` eventually becoming public.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89913
Approved by: https://github.com/wconstab
---
 torch/_dynamo/variables/builder.py      | 11 ++++++++---
 torch/distributed/fsdp/_dynamo_utils.py |  5 +++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 43c2c91c4a553..f630d3644c2dc 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -99,6 +99,11 @@
 from .user_defined import UserDefinedClassVariable, UserDefinedObjectVariable
 
 
+# Names of attributes used to annotate modules for FSDP + Dynamo
+_FSDP_MANAGED_MODULE = "_is_fsdp_managed_module"
+_FSDP_USE_ORIG_PARAMS = "_fsdp_use_orig_params"
+
+
 class _missing:
     pass
 
@@ -307,14 +312,14 @@ def index_source(key):
                 return self.tx.output.side_effects.track_object_existing(
                     self.source, value, result
                 )
-            elif getattr(value, "_is_fsdp_managed_module", False) or issubclass(
+            elif getattr(value, _FSDP_MANAGED_MODULE, False) or issubclass(
                 value.__class__, torch.nn.parallel.distributed.DistributedDataParallel
             ):
-                if getattr(value, "_is_fsdp_managed_module", False):
+                if getattr(value, _FSDP_MANAGED_MODULE, False):
                     # Note: we can't do this assert inside FSDP constructor,
                     # since we don't know yet whether dynamo will be used
                     assert getattr(
-                        value, "_fsdp_use_orig_params", False
+                        value, _FSDP_USE_ORIG_PARAMS, False
                     ), "Dynamo only supports FSDP with use_orig_params=True"
 
                 # See note [Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
diff --git a/torch/distributed/fsdp/_dynamo_utils.py b/torch/distributed/fsdp/_dynamo_utils.py
index 3a6c63dc5af8b..dfaa803798a3f 100644
--- a/torch/distributed/fsdp/_dynamo_utils.py
+++ b/torch/distributed/fsdp/_dynamo_utils.py
@@ -1,6 +1,7 @@
 from typing import Set
 
 import torch.nn as nn
+from torch._dynamo.variables.builder import _FSDP_MANAGED_MODULE, _FSDP_USE_ORIG_PARAMS
 
 
 def _annotate_modules_for_dynamo(
@@ -36,10 +37,10 @@ def _annotate_modules_for_dynamo(
             order for backward to interleave hooks with compute per layer.  UnspecializedNNModule lets us achieve
             this by capturing the module code more 'functionally' and passing parameters in as inputs each time.
             """
-            submodule._is_fsdp_managed_module = True  # type: ignore[assignment]
+            setattr(submodule, _FSDP_MANAGED_MODULE, True)
 
             # Dynamo only supports FSDP with use_orig_params=True.
             # This is hacky, but I could not think of another way to add an assertion to dynamo
             # for this, since Dynamo skips all the FSDP code frames and thus can't inspect the
             # FSDP module directly
-            submodule._fsdp_use_orig_params = use_orig_params  # type: ignore[assignment]
+            setattr(submodule, _FSDP_USE_ORIG_PARAMS, use_orig_params)

From 1ac57d0c9e78d5e0fb5be257ed4b9440dd4f6a31 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 1 Dec 2022 13:39:25 +0000
Subject: [PATCH 1470/1922] When dealing with dupe arguments, prefer leafifying
 if possible (#89896)

See code comment for details. I also had to do some extra fixes:

* `run_functionalized_fw_and_collect_metadata` now is able to handle duplicated arguments
* `aot_wrapper_dedupe` now always returns boxed compiled functions
* `aot_wrapper_dedupe` is now applied to inference compiler along with autograd compiler (preexisting)

Fixes https://github.com/pytorch/torchdynamo/issues/1939
Fixes DebertaV2ForQuestionAnswering DebertaForMaskedLM DebertaForQuestionAnswering DebertaV2ForMaskedLM

Repro command:

```
python benchmarks/dynamo/huggingface.py --performance --float32 -dcuda --training --inductor --no-skip --dashboard --only DebertaForQuestionAnswering --cold_start_latency
```

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89896
Approved by: https://github.com/bdhirsh
---
 functorch/_src/aot_autograd.py     | 264 ++++++++++++++++++++++-------
 test/dynamo/test_repros.py         |  12 ++
 test/functorch/test_aotdispatch.py |  37 +++-
 3 files changed, 248 insertions(+), 65 deletions(-)

diff --git a/functorch/_src/aot_autograd.py b/functorch/_src/aot_autograd.py
index bfd5602fb0ad1..3eea7ac42f96b 100644
--- a/functorch/_src/aot_autograd.py
+++ b/functorch/_src/aot_autograd.py
@@ -5,9 +5,10 @@
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from enum import Enum
-from functools import wraps
+from functools import wraps, partial
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from torch.fx.experimental.proxy_tensor import is_sym_node
+import logging
 
 import torch
 import torch.fx.traceback as fx_traceback
@@ -236,9 +237,15 @@ def gen_alias_from_base(aliased_base_tensor, size, stride, storage_offset, targe
 # TODO: Provide a faster version of this that assumes flat arguments
 # (so no pytree necessary)
 def run_functionalized_fw_and_collect_metadata(f):
+    memo = {}
+
     def to_fun(t):
         if isinstance(t, Tensor):
-            return torch._to_functional_tensor(t, mirror_autograd_meta=True)
+            if t in memo:
+                return memo[t]
+            r = torch._to_functional_tensor(t, mirror_autograd_meta=True)
+            memo[t] = r
+            return r
         else:
             return t
 
@@ -873,6 +880,7 @@ def aot_dispatch_base(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
     def new_fn(args):
         fw_outs = call_func_with_args(compiled_fw, args, disable_amp=disable_amp)
         return fw_outs
+    new_fn._boxed_call = True
 
     return new_fn
 
@@ -1066,25 +1074,145 @@ def format_guard_bug_msg(aot_config, expected):
     )
 
 
-# Wraps aot_dispatch_deduplicated_autograd, ensuring that duplicate arguments
-# are dropped from the inner compilation function.
+# MOTIVATION:
 #
-# In Haskell types, suppose you have:
+# When tracing functions for future execution, one must be careful not to pass
+# in the same input tensor multiple times (e.g., f(x, x), as this can result
+# in graphs that are ONLY valid if you later pass a new tensor in exactly the
+# same way (e.g., f(y, y)).  (NB: we really mean duplicate; two distinct
+# tensors that alias each other is a different situation that is covered by
+# aot_dispatch_deduplicated_autograd). Here are two examples:
 #
-#   add_dupe_args :: DedupedArgs -> Args
-#   remove_dupe_args :: Args -> DedupedArgs
+# (1) Suppose you have a function:
 #
-#   aot_dispatch_deduplicated_autograd
-#       :: (DedupedArgs -> R) -> DedupedArgs -> AOTConfig -> (DedupedArgs -> R)
-#   aot_dispatch_autograd
-#       :: (Args -> R) -> Args -> AOTConfig -> (Args -> R)
+#   def f(x, y):
+#       return x + y
 #
-# Then the code below can be written in point-free style as:
+# If you make_fx(f)(x, x), you will trace out:
 #
-#   aot_dispatch_deduplicate_autograd f a c =
-#       aot_dispatch_autograd (f . add_dupe_args) (remove_dupe_args a) c . remove_dupe_args
+#   def f(x, y):
+#       return y + y
 #
-def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
+# Oops!
+#
+# (2) For most tensors x and y, you can compute f's gradient with respect to
+# these to inputs by saying torch.autograd.grad(f(x, y), (x, y)).  However,
+# if x is y, you will trace out a program that gets incorrect gradients:
+#
+#   >>> x = torch.randn(1, requires_grad=True)
+#   >>> torch.autograd.grad(x + x, (x, x))
+#   (tensor([2.]), tensor([2.]))
+#
+# In other words, the gradient is double-counted.  Deduplicating the arguments
+# gives you an appropriate gradient:
+#
+#   >>> y = torch.randn(1, requires_grad=True)
+#   >>> torch.autograd.grad(x + y, (x, y))
+#   (tensor([1.]), tensor([1.]))
+#
+# HOW TO DEDUPLICATE:
+#
+# There are a few strategies, in order of preference:
+#
+# 1. For every duplicate argument to the function, detach it into
+#    a separate leaf tensor, so that it is no longer duplicated.
+#
+#       PRO: The resulting compiled graph works for any configuration
+#       of duplicated arguments.
+#
+#       CON: It does not (naively) work if you mutate the metadata of inputs:
+#
+#           def f(x, y):
+#               x.transpose_(0, 1)
+#               y.transpose_(0, 2)
+#
+#           x = torch.randn(2, 3, 4)
+#           f(x, x)
+#
+#       The ordering of the transposes inside f dictates whether or not
+#       you get [4, 2, 3] or [3, 4, 2].  This means that you cannot precompute
+#       what metadata mutations should get applied to each input; you need to
+#       assume they aren't duplicates (what we do today) or preserve
+#       the original metadata mutations exactly in order, so that they work
+#       for any duplicate configuration.
+#
+#       CON: It does not (naively) work if you mutate the data of inputs.
+#       In particular, leaf tensors that require grad cannot be mutated,
+#       this makes it impossible to differentiate with respect to the original
+#       base.
+#
+# 2. For every duplicate argument to the function, remove it, so it is
+#    no longer part of the "true" signature:
+#
+#       PRO: Implemented naively, it still works for metadata/data mutation.
+#
+#       CON: The resulting compiled graph is duplicate-specialized: it only
+#       works if future calls duplicate arguments in exactly the same way.
+#       Horribly, Dynamo doesn't guard on this at the moment.  But even if
+#       it did, you could still end up recompiling a bunch of each duplicate.
+#
+# Our strategy is to do (1) if we can, and do (2) otherwise, erroring if
+# Dynamo's guards are not enough.  In practice, this seems to cover
+# everything.
+#
+def aot_wrapper_dedupe(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig, *, compiler_fn):
+    # Get information about whether or not flat_fn mutates its arguments
+    # or not
+    try:
+        with enable_python_dispatcher():
+            fw_metadata, _out, _num_aliasing_metadata_outs = run_functionalized_fw_and_collect_metadata(
+                flat_fn
+            )(*flat_args)
+    except RuntimeError as e:
+        logging.warning(
+            "Failed to collect metadata on function, produced code may be suboptimal.  "
+            "Known situations this can occur are inference mode only compilation involving "
+            "resize_ or prims (!schema.hasAnyAliasInfo() INTERNAL ASSERT FAILED); "
+            "if your situation looks different please file a bug to PyTorch.",
+            exc_info=True
+        )
+        # Analysis failed, fall back to duplicate specialize
+        # TODO: Known analysis problems:
+        #   - resize_: TestInductorOpInfoCPU.test_comprehensive_resize__cpu_bool
+        #   - prims: test_tmp_not_defined_issue1_cpu
+        pass
+    else:
+        # Strategy 1: For any input that is not mutated, we can leafify it if we
+        # need to remove a duplicate.
+        leaf_flat_args = []
+        args_set = set()
+        ok = True
+
+        for i, a in enumerate(flat_args):
+            if a not in args_set:
+                args_set.add(a)
+                leaf_flat_args.append(a)
+            elif fw_metadata.mutated_input_info[i] == MutationType.none:
+                leaf_flat_args.append(a.detach().requires_grad_(a.requires_grad))
+            else:
+                ok = False
+                break
+
+        if ok:
+            return compiler_fn(flat_fn, leaf_flat_args, aot_config)
+
+    # Strategy 2: Duplicate specialize.
+    #
+    # In Haskell types, suppose you have:
+    #
+    #   add_dupe_args :: DedupedArgs -> Args
+    #   remove_dupe_args :: Args -> DedupedArgs
+    #
+    #   compiler_fn
+    #       :: (DedupedArgs -> R) -> DedupedArgs -> AOTConfig -> (DedupedArgs -> R)
+    #   deped_compiler_fn
+    #       :: (Args -> R) -> Args -> AOTConfig -> (Args -> R)
+    #
+    # Then the code below can be written in point-free style as:
+    #
+    #   deduped_compiler_fn f a c =
+    #       compiler_fn (f . add_dupe_args) (remove_dupe_args a) c . remove_dupe_args
+    #
     # Suppose you have:
     #
     #   [a, b, a, c]
@@ -1096,7 +1224,7 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfi
     #
     # This is done via (respectively):
     #
-    #   seen_args = {2}  # what to drop
+    #   seen_args = {a: 0, b: 1, c: 2}
     #   add_dupe_map = {  # how to get args from the deduped list
     #       0: 0,
     #       1: 1,
@@ -1107,7 +1235,6 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfi
 
     seen_args = {}
     keep_arg_mask = []
-    dropped_args = False
     add_dupe_map = {}
     duped_arg_len = len(flat_args)
 
@@ -1115,7 +1242,6 @@ def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfi
     for i, t in enumerate(flat_args):
         if t in seen_args:
             keep_arg_mask.append(False)
-            dropped_args = True
             add_dupe_map[i] = seen_args[t]
             continue
         keep_arg_mask.append(True)
@@ -1133,54 +1259,58 @@ def remove_dupe_args(args):
     def add_dupe_args(args):
         return [args[add_dupe_map[i]] for i in range(duped_arg_len)]
 
-    def maybe_wrap_debug(f):
-        if not config.debug_assert:
-            return f
-
-        @wraps(f)
-        def debug_wrapper(*args):
-            # Test that the computed remove/add arg functions are an inverse
-            new_args = add_dupe_args(remove_dupe_args(args))
-            seen = {}
-            for i, (x, y) in enumerate(zip(new_args, args)):
-                seen[y] = None
-                assert x is y, format_guard_bug_msg(
-                    aot_config,
-                    f"{describe_input(i, aot_config)} would be a duplicate of "
-                    f"{describe_input(add_dupe_map[i], aot_config)}"
-                )
-            # This is only an error if there is metadata mutation on both of
-            # the duped arguments; in this case, we need to know what order
-            # the metadata mutation applies in.  You'll get the correct result
-            # otherwise, because a graph that assumes distinct inputs works if
-            # you dupe the inputs (the gradient contributions from each input
-            # will get summed up appropriately.)
-            """
-            assert len(seen) == unique_args, format_guard_bug_msg(aot_config,
-                f"there would be {unique_args} distinct arguments"
-            )
-            """
-            return f(*args)
-
-        return debug_wrapper
-
-    # Fastpath
-    if not dropped_args:
-        return maybe_wrap_debug(aot_dispatch_deduplicated_autograd(flat_fn, flat_args, aot_config))
-
     deduped_flat_args = remove_dupe_args(flat_args)
 
     @wraps(flat_fn)
     def wrapped_flat_fn(*args):
         return flat_fn(*add_dupe_args(args))
 
-    compiled_fn = aot_dispatch_deduplicated_autograd(wrapped_flat_fn, deduped_flat_args, aot_config)
+    compiled_fn = compiler_fn(wrapped_flat_fn, deduped_flat_args, aot_config)
+
+    if not hasattr(compiled_fn, "_boxed_call"):
+        compiled_fn = make_boxed_func(compiled_fn)
 
     @wraps(compiled_fn)
-    def wrapped_compiled_fn(*args):
-        return compiled_fn(*remove_dupe_args(args))
+    def wrapped_compiled_fn(args):
+        deduped_args = remove_dupe_args(args)
+        args.clear()
+        return compiled_fn(deduped_args)
+    wrapped_compiled_fn._boxed_call = True
+
+    # This can be uncommented when we properly guard for duplicates,
+    # but right now we must not do it.
+    # if not config.debug_assert:
+    #     return wrapped_compiled_fn
+
+    @wraps(wrapped_compiled_fn)
+    def debugged_compiled_fn(args):
+        # Test that the computed remove/add arg functions are an inverse
+        new_args = add_dupe_args(remove_dupe_args(args))
+        seen = {}
+        for i, (x, y) in enumerate(zip(new_args, args)):
+            seen[y] = None
+            assert x is y, format_guard_bug_msg(
+                aot_config,
+                f"{describe_input(i, aot_config)} would be a duplicate of "
+                f"{describe_input(add_dupe_map[i], aot_config)}"
+            )
+        # This is only an error if there is metadata mutation on both of
+        # the duped arguments; in this case, we need to know what order
+        # the metadata mutation applies in.  You'll get the correct result
+        # otherwise, because a graph that assumes distinct inputs works if
+        # you dupe the inputs (the gradient contributions from each input
+        # will get summed up appropriately.)
+        #
+        # TODO: work out how to setup this assert correctly
+        """
+        assert len(seen) == unique_args, format_guard_bug_msg(aot_config,
+            f"there would be {unique_args} distinct arguments"
+        )
+        """
+        return wrapped_compiled_fn(args)
+    debugged_compiled_fn._boxed_call = True
 
-    return maybe_wrap_debug(wrapped_compiled_fn)
+    return debugged_compiled_fn
 
 
 def describe_input(i, aot_config):
@@ -1190,11 +1320,11 @@ def describe_input(i, aot_config):
         return f"input {i - aot_config.num_params_buffers}"
 
 
-# Like aot_dispatch_autograd, but with the precondition that there
+# Has the precondition that there
 # are no duplicate arguments in flat_args (e.g., the same Tensor
 # object never shows up twice.  However, two tensor inputs MAY alias
 # the same storage, so long as they have separate TensorImpls.)
-def aot_dispatch_deduplicated_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
+def aot_dispatch_autograd(flat_fn, flat_args: List[Tensor], aot_config: AOTConfig):
 
     with enable_python_dispatcher():
         _fw_metadata, out, _num_aliasing_metadata_outs = run_functionalized_fw_and_collect_metadata(
@@ -1652,11 +1782,19 @@ def convert(idx, x):
         # crappy version of dispatcher
         # TODO: Do this properly
         if needs_autograd:
-            return make_boxed_func(
-                aot_dispatch_autograd(flat_fn, fake_flat_tensor_args, aot_config)
-            )
+            compiler_fn = aot_dispatch_autograd
         else:
-            return aot_dispatch_base(flat_fn, fake_flat_tensor_args, aot_config)
+            compiler_fn = aot_dispatch_base
+
+        compiler_fn = partial(aot_wrapper_dedupe, compiler_fn=compiler_fn)
+        # You can put more passes here
+
+        compiled_fn = compiler_fn(flat_fn, fake_flat_tensor_args, aot_config)
+
+        if not hasattr(compiled_fn, '_boxed_call'):
+            compiled_fn = make_boxed_func(compiled_fn)
+
+        return compiled_fn
 
 
 # Inspired by autodidax (thanks!)
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index ec5ea4ac1fb55..2c0a7acb5ccc6 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1947,6 +1947,18 @@ def fn(x):
         self.assertEqual(cnt.frame_count, 1)
         self.assertEqual(cnt.op_count, 100)
 
+    def test_avoid_dupe_specialization(self):
+        def f(x, y):
+            return (x + y) * 1
+
+        opt_f = torch._dynamo.optimize("aot_eager")(f)
+
+        for b in [True, False]:
+            x = torch.randn(4, requires_grad=b)
+            y = torch.randn(4, requires_grad=b)
+            self.assertEqual(f(x, x), opt_f(x, x))
+            self.assertEqual(f(x, y), opt_f(x, y))
+
     def test_while_loop_graph_break(self):
         # Repro of tacotron2 cache_size_recompilation
         def inner(x):
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 3a604281ca956..09dc2e74c8955 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1016,15 +1016,48 @@ def f(x, y):
         x = torch.randn(3, 3, requires_grad=True)
         self.verify_aot_autograd(f, [x, x])
 
+    def test_dupe_arg_torture(self):
+        def f(x, y):
+            x.t_()
+            y.t_()
+            return x + y
+
+        x = torch.randn(3, 3, requires_grad=True).clone()
+        self.verify_aot_autograd(f, [x, x])
+
     @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
     @patch("functorch._src.config.debug_assert", True)
-    def test_invalid_dupe(self, counter):
+    def test_invalid_dupe_left_bias(self, counter):
+        # This test checks that, just because only the first
+        # argument did a metadata mutation, we still correctly
+        # switch to strategy 2 (deduplicate)
+        # See: https://github.com/pytorch/pytorch/pull/89896#discussion_r1036224447
         class F(torch.nn.Module):
             def forward(self, x, y):
+                x.t_()
                 return (x + y,)
 
-        x = torch.randn(3, 3, requires_grad=True)
+        x = torch.randn(3, 3, requires_grad=True).clone()
         y = torch.randn(3, 3, requires_grad=True)
+        self.verify_aot_autograd(F(), [x, x])
+
+        fxx = aot_module_simplified(F(), (x, x), nop)
+        self.assertExpectedRaisesInline(
+            AssertionError, lambda: fxx(x, y),
+            """At compilation time, graph 1 was compiled under the assumption that input 1 would be a duplicate of input 0, but at runtime this was not the case.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
+        )
+
+    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch("functorch._src.config.debug_assert", True)
+    def test_invalid_dupe(self, counter):
+        class F(torch.nn.Module):
+            def forward(self, x, y):
+                x.t_()
+                y.t_()
+                return (x + y,)
+
+        x = torch.randn(3, 3, requires_grad=True).clone()
+        y = torch.randn(3, 3, requires_grad=True).clone()
 
         fxy = aot_module_simplified(F(), (x, y), nop)
         fxy(x, y)

From 19d140d4528783d3d2917f90fda3f683f9a50c39 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 29 Nov 2022 14:03:44 -0800
Subject: [PATCH 1471/1922] Add definitely_not_01 set to ShapeEnv. (#89871)

This set tracks symbols which we know are definitely not 0/1, and thus
can be further simplified when we try to work out their static value
without guards.  Right now, all allocated symbols are in this set,
but we will later add symbols which don't uphold this.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89871
Approved by: https://github.com/albanD
---
 torch/fx/experimental/symbolic_shapes.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 41121808e24e9..b236d8f5463e4 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -442,6 +442,8 @@ def __init__(self):
         # they get assigned the same symbolic variable
         self.val_to_var: Dict[int, "sympy.Expr"] = {0: sympy.Integer(0), 1: sympy.Integer(1)}
         self.tls = threading.local()
+        # Set holds symbols which definitely are not 0 or 1.
+        self.definitely_not_01: Set["sympy.Symbol"] = set()
 
     def _suppress_guards_tls(self):
         return getattr(self.tls, "suppress_guards", False)
@@ -519,6 +521,7 @@ def create_symbol(self, val: int) -> "sympy.Expr":
         sympy_expr = sympy.Symbol(f"s{len(self.var_to_val)}", positive=True, integer=True)
         self.var_to_val[sympy_expr] = sympy.Integer(val)
         self.val_to_var[val] = sympy_expr
+        self.definitely_not_01.add(sympy_expr)
         return sympy_expr
 
     def evaluate_guards_for_args(self, *args):
@@ -565,6 +568,7 @@ def _maybe_evaluate_static(self, expr: "sympy.Expr") -> "Optional[sympy.Expr]":
         new_shape_env = {
             k: sympy.Symbol(f"shape_{idx}", positive=True, integer=True) + 1
             for idx, k in enumerate(symbols)
+            if k in self.definitely_not_01
         }
         new_expr = expr.xreplace(new_shape_env)
         floor_div_replace = {}

From d0b3be93dc87ac6e0f61155c2a285c58afe39154 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 30 Nov 2022 06:48:46 -0800
Subject: [PATCH 1472/1922] Guarantee symbol allocation for all
 sizes/strides/storage offset (#89879)

We may need to express guards on the size/stride/storage offset of
a tensor, but we cannot do this if it's already been duck sized.
This PR guarantees that we allocate a symbol (or negation of the
symbol) whenever we ask to create a SymInt, and propagates this
symbol to SymNode so that Dynamo can look at it (not in this PR).

This PR doesn't actually add guards, nor does Dynamo do anything
with these symbols.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89879
Approved by: https://github.com/albanD
---
 test/functorch/test_aotdispatch.py       |  4 +-
 test/test_dynamic_shapes.py              | 32 +++++-----
 test/test_proxy_tensor.py                |  2 +-
 torch/fx/experimental/symbolic_shapes.py | 78 +++++++++++++++++-------
 4 files changed, 76 insertions(+), 40 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 09dc2e74c8955..a7246a2f09c1a 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1647,8 +1647,8 @@ def forward(self, x, y):
         res[0].sum().backward()
 
         self.assertExpectedInline(shape_env.format_guards(), """\
- - Eq(s1, 20)
- - Eq(s2, 30)""")
+ - Eq(s3, 20)
+ - Eq(s9, 30)""")
 
         assert torch.allclose(ref[0], res[0])
         assert torch.allclose(inputs[0].grad, cloned_inputs[0].grad)
diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index 953b6d9a53f64..6fc27d00a54e4 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -344,7 +344,7 @@ def test_guard_int(self):
         shape_env = ShapeEnv()
         a0 = create_symint(shape_env, 2)
         self.assertEqual(guard_int(a0), 2)
-        self.assertEqual(str(shape_env.guards[0][0]), "Eq(s0, 2)")
+        self.assertExpectedInline(str(shape_env.guards[0][0]), """Eq(s1, 2)""")
 
     @skipIfNoSympy
     def test_sym_int(self):
@@ -353,19 +353,19 @@ def test_sym_int(self):
         r = sym_int(a0)
         self.assertEqual(r, 5)
         self.assertIsInstance(r, torch.SymInt, msg=type(r))
-        self.assertEqual(str(shape_env.guards[0][0]), "Eq(s0, 5)")
+        self.assertExpectedInline(str(shape_env.guards[0][0]), """Eq(s1, 5)""")
 
         a1 = create_symint(shape_env, 7)
         r = sym_int(a1 / 2)
         self.assertEqual(guard_int(r), 3)
         self.assertIsInstance(r, torch.SymInt, msg=type(r))
-        self.assertEqual(str(shape_env.guards[1][0]), "Eq(floor(s1/2), 3)")
+        self.assertExpectedInline(str(shape_env.guards[1][0]), """Eq(floor(s3/2), 3)""")
 
         a2 = create_symint(shape_env, -3)
         r = sym_int(a2 / 2)
         self.assertEqual(guard_int(r), -1)
         self.assertIsInstance(r, torch.SymInt, msg=type(r))
-        self.assertEqual(str(shape_env.guards[2][0]), "Eq(ceiling(-s2/2), -1)")
+        self.assertExpectedInline(str(shape_env.guards[2][0]), """Eq(ceiling(-s5/2), -1)""")
 
     @skipIfNoSympy
     def test_sym_sqrt(self):
@@ -374,7 +374,7 @@ def test_sym_sqrt(self):
         r = sym_sqrt(a0)
         self.assertEqual(r, 2)
         self.assertIsInstance(r, torch.SymFloat, msg=type(r))
-        self.assertEqual(str(shape_env.guards[0][0]), "Eq(sqrt(s0), 2)")
+        self.assertExpectedInline(str(shape_env.guards[0][0]), """Eq(sqrt(s1), 2)""")
 
     @skipIfNoSympy
     def test_sym_floor(self):
@@ -383,7 +383,7 @@ def test_sym_floor(self):
         r = math.floor(a0 / 2)
         self.assertEqual(r, 2)
         self.assertIsInstance(r, torch.SymInt, msg=type(r))
-        self.assertEqual(str(shape_env.guards[0][0]), "Eq(floor(s0/2), 2)")
+        self.assertExpectedInline(str(shape_env.guards[0][0]), """Eq(floor(s1/2), 2)""")
 
     @skipIfNoSympy
     def test_int_conversion(self):
@@ -430,18 +430,18 @@ def f(a, b):
 
         self.assertExpectedInline(mock_stdout.getvalue().strip(), """\
 class f(torch.nn.Module):
-    def forward(self, a_1: f32[s0, s1], b_1: f32[s2, s1]):
+    def forward(self, a_1: f32[s1, s3], b_1: f32[s8, s3]):
         # No stacktrace found for following nodes
-        sym_size: Sym(s0) = torch.ops.aten.sym_size(a_1, 0)
-        sym_size_1: Sym(s2) = torch.ops.aten.sym_size(b_1, 0)
-        add: Sym(s0 + s2) = sym_size + sym_size_1;  sym_size = sym_size_1 = None
-        sym_size_2: Sym(s1) = torch.ops.aten.sym_size(a_1, 1)
-        sym_size_3: Sym(s1) = torch.ops.aten.sym_size(b_1, 1);  b_1 = None
-        add_1: Sym(2*s1) = sym_size_2 + sym_size_3;  sym_size_2 = sym_size_3 = None
-        new_empty: f32[s0 + s2, 2*s1] = torch.ops.aten.new_empty.default(a_1, [add, add_1], dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False);  a_1 = add = add_1 = None
+        sym_size: Sym(s1) = torch.ops.aten.sym_size(a_1, 0)
+        sym_size_1: Sym(s8) = torch.ops.aten.sym_size(b_1, 0)
+        add: Sym(s1 + s8) = sym_size + sym_size_1;  sym_size = sym_size_1 = None
+        sym_size_2: Sym(s3) = torch.ops.aten.sym_size(a_1, 1)
+        sym_size_3: Sym(s3) = torch.ops.aten.sym_size(b_1, 1);  b_1 = None
+        add_1: Sym(2*s3) = sym_size_2 + sym_size_3;  sym_size_2 = sym_size_3 = None
+        new_empty: f32[s1 + s8, 2*s3] = torch.ops.aten.new_empty.default(a_1, [add, add_1], dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False);  a_1 = add = add_1 = None
         native_dropout = torch.ops.aten.native_dropout.default(new_empty, 0.5, True);  new_empty = None
-        getitem: f32[s0 + s2, 2*s1] = native_dropout[0]
-        getitem_1: b8[s0 + s2, 2*s1] = native_dropout[1];  native_dropout = None
+        getitem: f32[s1 + s8, 2*s3] = native_dropout[0]
+        getitem_1: b8[s1 + s8, 2*s3] = native_dropout[1];  native_dropout = None
         return (getitem, getitem_1)""")  # noqa: B950
 
 # This environment variable controls whether or not we print expected failure
diff --git a/test/test_proxy_tensor.py b/test/test_proxy_tensor.py
index f82848dfb1076..38911c7981be7 100644
--- a/test/test_proxy_tensor.py
+++ b/test/test_proxy_tensor.py
@@ -987,7 +987,7 @@ def f(a, b):
             assert b.shape[0] == 8
             return a.cos()
         fx_g = make_fx(f, tracing_mode="symbolic")(torch.randn(16), torch.randn(8))
-        self.assertExpectedInline(str(fx_g.shape_env.get_guard_expr()), "Eq(s1, 8) & Eq(s0, 2*s1)")
+        self.assertExpectedInline(str(fx_g.shape_env.get_guard_expr()), """Eq(s5, 8) & Eq(s1, 2*s5)""")
 
     def test_sym_storage_offset(self):
         def f(x, y):
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index b236d8f5463e4..35578401bfdb0 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -1,6 +1,6 @@
 import torch
 import torch.utils._pytree as pytree
-from typing import Set, Dict, List, Type, Optional, cast, Union
+from typing import Set, Dict, List, Type, Optional, cast
 import sys
 import operator
 import builtins
@@ -148,11 +148,18 @@ class SymNode:
     This is a type erased SymInt/SymFloat which we use to do actual operations.
     End users don't touch this.  Magic methods are NOT defined on this object.
     """
-    def __init__(self, expr, shape_env, pytype, constant=None):
+    def __init__(self, expr, shape_env, pytype, constant=None, symbol=None):
         self._expr = expr
         self.shape_env = shape_env
         self.pytype = pytype
         self.constant = constant
+        # Unlike expr, sympy.Symbol is guaranteed to either be a
+        # symbol or its negation a symbol, and it never gets simplified into a
+        # constant or another symbol.  This only exists for freshly
+        # create_symint; intermediate values are None.  The usage of this
+        # property is fairly short-lived: it lives long enough so that Dynamo
+        # can get its hands on symbols and setup Source associations
+        self.symbol: Optional[sympy.Expr] = symbol
 
     @property
     def expr(self):
@@ -499,30 +506,59 @@ def create_symbolic_sizes_strides(self, ex: torch.Tensor):
                 )
                 stride[i] = self.create_symbol(val)
         assert all(x is not None for x in stride)
-        return [self.create_symintnode(i) for i in size], [self.create_symintnode(i) for i in stride]  # type: ignore[arg-type]
-
-    def create_symintnode(self, expr: Union["sympy.Expr", int]):
-        return SymInt(SymNode(expr, self, int))
-
-    def create_symbol(self, val: int) -> "sympy.Expr":
+        sym_size = [self.create_symintnode(i) for i in size]
+        sym_stride = []
+        for stride_expr in stride:
+            # NB: Don't duck size the stride; instead use the expression
+            # we computed
+            # TODO: We actually allocated an unnecessary extra symbol
+            # here in the smallest unbound stride case, but it's not
+            # a big deal because the non-0/1 symbol immediately
+            # evaporates from its duck-sizing simplification
+            s = self.create_symbol(val, simplify=False)
+            assert stride_expr is not None
+            assert isinstance(s, sympy.Symbol)
+            self.replacements[s] = stride_expr
+            sym_stride.append(self.create_symintnode(s))
+        return sym_size, sym_stride
+
+    def create_symintnode(self, sym: "sympy.Expr"):
+        assert isinstance(sym, sympy.Symbol) or isinstance(-sym, sympy.Symbol)
+        return SymInt(SymNode(sym.xreplace(self.replacements), self, int, symbol=sym))
+
+    # This is guaranteed to return a symbol or its negation is a sympy.Symbol,
+    # but there may be a replacement that allows it to be immediately
+    # simplified
+    def create_symbol(self, val: int, *, simplify: bool = True) -> "sympy.Expr":
         if not HAS_SYMPY:
             raise RuntimeError("Need sympy installed to create symbolic shapes")
+
         if val < 0:
-            # all sympy base variables must be positive and > 1
-            return -self.create_symbol(-val)
+            return -self.create_symbol(-val, simplify=simplify)
+
+        symbol = sympy.Symbol(f"s{len(self.var_to_val)}", positive=True, integer=True)
+        self.var_to_val[symbol] = sympy.Integer(val)
+
+        if not simplify:
+            return symbol
+
+        # Now attempt to simplify this symbol
+        # TODO: Create a guard whenever this happens
+        # TODO: Do this duck sizing lazily later
+
         # This implements duck-shaping: input sizes that match are assigned
         # the same symint
-        # TODO: Create a guard whenever this happens
-        # TODO: But how do I represent the guard in this case?
-        # Note: val_to_var is also initialized with 0/1 mapping to constants, so
-        # this also ensures that all symbols are > 1
-        if val in self.val_to_var:
-            return self.val_to_var[val]
-        sympy_expr = sympy.Symbol(f"s{len(self.var_to_val)}", positive=True, integer=True)
-        self.var_to_val[sympy_expr] = sympy.Integer(val)
-        self.val_to_var[val] = sympy_expr
-        self.definitely_not_01.add(sympy_expr)
-        return sympy_expr
+        if val not in self.val_to_var:
+            sympy_expr = sympy.Symbol(f"s{len(self.var_to_val)}", positive=True, integer=True)
+            self.var_to_val[sympy_expr] = sympy.Integer(val)
+            self.val_to_var[val] = sympy_expr
+            self.definitely_not_01.add(sympy_expr)
+
+        self.replacements[symbol] = self.val_to_var[val]
+
+        # Return the *symbol*; you're expected to apply the replacement to get
+        # the simplified variable
+        return symbol
 
     def evaluate_guards_for_args(self, *args):
         new_env = ShapeEnv()

From 05171b218be02224f4c641d4c9e3d10537696b67 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 30 Nov 2022 11:32:29 -0800
Subject: [PATCH 1473/1922] Type torch._dynamo.guards (#89919)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89919
Approved by: https://github.com/albanD
---
 .lintrunner.toml               |   1 +
 torch/_dynamo/convert_frame.py |  38 +++++-----
 torch/_dynamo/eval_frame.py    |   2 +-
 torch/_dynamo/guards.py        | 129 ++++++++++++++++++++++-----------
 torch/_dynamo/output_graph.py  |   2 +-
 torch/_dynamo/types.py         |  24 +++++-
 6 files changed, 130 insertions(+), 66 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index e5f1956212913..10787ce672263 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -148,6 +148,7 @@ include_patterns = [
     'torch/_dynamo/convert_frame.py',
     'torch/_dynamo/types.py',
     'torch/_dynamo/output_graph.py',
+    'torch/_dynamo/guards.py',
     'torch/_dynamo/optimizations/__init__.py',
     'torch/_dynamo/optimizations/backends.py',
     'torch/_dynamo/optimizations/training.py',
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index a60ba3c100096..c135a3d6c59c8 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -4,10 +4,9 @@
 import os
 import traceback
 import types
-import typing
 import weakref
 from traceback import FrameSummary
-from typing import Callable, cast, Dict, List, Optional
+from typing import Callable, cast, Dict, List, Optional, Set
 
 import torch
 from torch.fx.graph_module import _forward_from_src as original_forward_from_src
@@ -24,8 +23,8 @@
     unimplemented,
     Unsupported,
 )
-from .guards import CheckFunctionManager, GuardedCode
-from .output_graph import OutputGraph
+from .guards import CheckFunctionManager, Guard, GuardedCode
+from .output_graph import CompilerFn, OutputGraph
 from .replay_record import ExecutionRecord
 from .symbolic_convert import InstructionTranslator
 from .utils import (
@@ -266,7 +265,10 @@ def exception_handler(e, code, frame=None):
 
 
 def convert_frame_assert(
-    compiler_fn: Callable, guard_export_fn=None, one_graph=True, export=False
+    compiler_fn: CompilerFn,
+    guard_export_fn=None,
+    one_graph: bool = True,
+    export: bool = False,
 ):
     """Fully convert a frame into an FX graph"""
     init_logging()
@@ -352,14 +354,14 @@ def format_guard_failures(code):
 
 def _compile(
     code: types.CodeType,
-    globals,
-    locals,
-    builtins,
-    compiler_fn,
-    one_graph,
-    export,
-    guard_export_fn=None,
-    frame=None,
+    globals: Dict[str, object],
+    locals: Dict[str, object],
+    builtins: Dict[str, object],
+    compiler_fn: CompilerFn,
+    one_graph: bool,
+    export: bool,
+    guard_export_fn: Optional[Callable[[Set[Guard]], None]] = None,
+    frame: Optional[types.FrameType] = None,
 ) -> Optional[GuardedCode]:
     output: Optional[OutputGraph] = None
 
@@ -456,7 +458,7 @@ def transform(instructions, code_options):
         raise InternalTorchDynamoError() from e
 
 
-def convert_frame(compiler_fn: typing.Callable, guard_export_fn=None):
+def convert_frame(compiler_fn: CompilerFn, guard_export_fn=None):
     """Try to convert a frame into an FX graph, if error leave frame unmodified"""
     inner_convert = convert_frame_assert(compiler_fn, guard_export_fn, one_graph=False)
 
@@ -497,10 +499,10 @@ def replay(filename):
             record.locals,
             record.builtins,
             eager,
-            False,  # one_graph
-            None,  # export_fn
-            None,  # frame
-            False,  # Export
+            one_graph=False,
+            export=False,
+            guard_export_fn=None,
+            frame=None,
         )
     except Exception:
         pass
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 0d4665134d9bf..75e65bc270454 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -371,7 +371,7 @@ def lookup_backend(compiler_fn):
     return compiler_fn
 
 
-class _NullDecorator(contextlib.nullcontext):
+class _NullDecorator(contextlib.nullcontext):  # type: ignore[type-arg]
     def __call__(self, fn):
         assert callable(fn)
         return fn
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index a5aa42856834e..3cef0ed185b72 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -8,7 +8,8 @@
 import types
 import weakref
 from inspect import currentframe, getframeinfo
-from typing import Any, Callable, Dict, List, Optional, Set
+from typing import Any, Callable, Dict, List, Optional, Set, Type, Union
+from weakref import ReferenceType
 
 import numpy as np
 
@@ -20,6 +21,7 @@
 from . import config, convert_frame, mutation_guard
 from .eval_frame import set_guard_error_hook, set_guard_fail_hook
 from .exc import unimplemented
+from .types import GuardedCode, GuardFn  # noqa: F401
 from .utils import (
     dict_const_keys,
     dict_param_key_ids,
@@ -60,6 +62,7 @@ class GuardSource(enum.Enum):
     GLOBAL_NN_MODULE = 3
     CONSTANT = 4
     RANDOM_VALUE = 5
+    SHAPE_ENV = 6
 
     def select(self, locals_, globals_):
         if self in (GuardSource.LOCAL, GuardSource.LOCAL_NN_MODULE):
@@ -68,7 +71,7 @@ def select(self, locals_, globals_):
             return globals_
         raise NotImplementedError()
 
-    def is_nn_module(self):
+    def is_nn_module(self) -> bool:
         return self in (GuardSource.GLOBAL_NN_MODULE, GuardSource.LOCAL_NN_MODULE)
 
     def is_local(self):
@@ -77,15 +80,30 @@ def is_local(self):
 
 @dataclasses.dataclass
 class Guard:
+    # The name of a Guard specifies what exactly it is the guard is guarding
+    # on.  The meaning of the name is dependent on the create_fn; you must
+    # look at the use-site inside create_fn to know what name means.
+    #
+    # That being said, although you might think this is just a "name", name is
+    # usually an arbitrary Python expression that will be evaluated with all
+    # globals (and locals, if you create a LOCAL guard) to extract the Python
+    # object that we want to perform guard tests on.  This evaluation
+    # typically happens in GuardBuilder.eval.  In these cases, name is
+    # typically produced by Source.name() (not to be confused with
+    # GuardSource)--morally, we could have stored a Source here.
+    #
+    # Occasionally, name is not a valid Python expression; sometimes
+    # it is meaningless.  Example create_fns that are like this include
+    # GRAD_MODE and SYMBOL_MATCH.
     name: str
     source: GuardSource
-    create_fn: Callable
+    create_fn: Callable[["GuardBuilder", "Guard"], None]
     is_volatile: bool = False
 
     # Export only. These values are written to at time of guard check_fn creation.
     guard_types: Optional[List[str]] = None
     code_list: Optional[List[str]] = None
-    obj_weakref: Optional[Any] = None
+    obj_weakref: Optional[object] = None
     guarded_class_weakref: Optional[type] = None
 
     def __hash__(self):
@@ -192,7 +210,11 @@ def strip_getattr_getitem(name):
 
 class GuardBuilder:
     def __init__(
-        self, id_ref: Callable, scope: Dict[str, Any], guarded_code, renames=True
+        self,
+        id_ref: Callable[[Type[object]], str],
+        scope: Optional[Dict[str, object]],
+        guarded_code: "CheckFunctionManager",
+        renames=True,
     ):
         self.id_ref = id_ref
         if scope:
@@ -200,19 +222,34 @@ def __init__(
                 scope = {rename_implicit(k): v for k, v in scope.items()}
         else:
             scope = dict()
-        self.scope = scope
+        self.scope: Dict[str, object] = scope
         self.argnames: List[str] = []
         # Code is python expression strings generated for each guard
         self.code: List[str] = []
-        self.tensor_check_names = []
-        self.tensor_check_ids = {}
-        self.tensor_check_examples = []
-        self.guarded_code = guarded_code
 
-    def get(self, name: str):
+        # Most of the time, we generate Python code in a guard to directly
+        # check various properties.  However, tensors are a bit special;
+        # it is too slow to check their properties one-by-one in Python.
+        # Instead, there is a C++ function TensorGuards.check which takes
+        # all of the tensor arguments and checks them all against compile-time
+        # examples entirely in C++.  Thus, every time we process a
+        # TENSOR_MATCH guard, we just add another entry to
+        # tensor_check_names/tensor_check_examples, saying "for this local,
+        # check it against this example", and it all ends up getting
+        # swept up into a single call to ___check_tensors.  Invariant:
+        # len(tensor_check_names) == len(tensor_check_examples).
+        self.tensor_check_names: List[str] = []
+        self.tensor_check_examples: List[torch.Tensor] = []
+
+        self.tensor_check_ids: Dict[str, int] = {}
+        # TODO: tf is this naming
+        self.guarded_code: CheckFunctionManager = guarded_code
+
+    def get(self, name: str) -> Any:
         return eval(name, self.scope, CLOSURE_VARS)
 
-    def arg_ref(self, guard: Guard):
+    def arg_ref(self, guard: Union[str, Guard]) -> str:
+        name: str
         if isinstance(guard, str):
             name = guard
         else:
@@ -237,7 +274,9 @@ def ID_MATCH(self, guard: Guard):
         m = re.match(r"^type\((.+)\)$", guard.name)
         if m:
             # optional optimization to produce cleaner/faster guard code
-            return self.TYPE_MATCH(Guard(m.group(1), guard.source, None))
+            return self.TYPE_MATCH(
+                Guard(m.group(1), guard.source, GuardBuilder.TYPE_MATCH)
+            )
 
         code = f"___check_obj_id({self.arg_ref(guard)}, {self.id_ref(self.get(guard.name))})"
         self._produce_guard_code(guard, [code])
@@ -295,8 +334,8 @@ def EQUALS_MATCH(self, guard: Guard):
         ), t.__name__
         if istype(val, (torch.device, torch.dtype)):
             # TODO(jansel): is this slow? perhaps optimize it
-            code = f"str({ref}) == {str(val)!r}"
-            self._produce_guard_code(guard, [code])
+            code = [f"str({ref}) == {str(val)!r}"]
+            self._produce_guard_code(guard, code)
             return
 
         # Special case for nan because float("nan") == float("nan") evaluates to False
@@ -442,15 +481,15 @@ def GRAD_MODE(self, guard: Guard):
     # This is a bit of a crutch for export case for symbolic shape guards.
     # SYMBOL_MATCH is only ever, and must only ever, be used for setting this value on
     # the create_fn field for tracking guards in export.
-    @staticmethod
-    def SYMBOL_MATCH():
-        pass
+    def SYMBOL_MATCH(self, guard: Guard):
+        raise AssertionError("this should not actually be called")
 
     def TENSOR_MATCH(self, guard: Guard):
         if guard.is_nn_module():
             self.ID_MATCH(guard)
         else:
             value = self.get(guard.name)
+            assert isinstance(value, torch.Tensor)
             tensor_name = self.arg_ref(guard)
             self.tensor_check_names.append(tensor_name)
             self.tensor_check_examples.append(value)
@@ -472,8 +511,16 @@ def TENSOR_MATCH(self, guard: Guard):
 
     # A util that appends guarded code, or, in the case of export, adds data onto guards
     def _produce_guard_code(self, guard, code_list, provided_guarded_object=None):
-        caller = currentframe().f_back
+        # WARNING: It is important that cur_frame/caller do NOT stay in
+        # the current frame, because they will keep things live longer
+        # than they should.  See TestMisc.test_release_module_memory
+        cur_frame = currentframe()
+        assert cur_frame is not None
+        caller = cur_frame.f_back
+        del cur_frame
+        assert caller is not None
         func_name = getframeinfo(caller)[2]
+        del caller
         # We use func_name for export, so might as well get a nice defensive check out of it
         assert func_name in dir(
             self.__class__
@@ -504,12 +551,6 @@ def _produce_guard_code(self, guard, code_list, provided_guarded_object=None):
         )
 
 
-@dataclasses.dataclass
-class GuardedCode:
-    code: types.CodeType
-    check_fn: Callable
-
-
 from sympy.printing.str import StrPrinter
 
 
@@ -546,7 +587,11 @@ def tensor_ref_as_str(tensor_ref, id_to_name_map):
         return f"{id_to_name_map[tensor_ref.ref_id]}.{tensor_ref.kind}()"
 
     def __init__(
-        self, expr_to_tensor_ref, id_to_name_map, shape_env, intermediary_symbols
+        self,
+        expr_to_tensor_ref: Dict[sympy.Symbol, Dict[TensorReference, None]],
+        id_to_name_map,
+        shape_env,
+        intermediary_symbols,
     ):
         super().__init__()
         self.expr_to_tensor_ref = expr_to_tensor_ref
@@ -585,12 +630,12 @@ def __init__(
         self,
         output_graph=None,
         guards: Optional[Set[Guard]] = None,
-        f_locals: Optional[Dict] = None,
-        f_globals: Optional[Dict] = None,
+        f_locals: Optional[Dict[str, object]] = None,
+        f_globals: Optional[Dict[str, object]] = None,
     ):
         self.valid = True
-        self._weakrefs = []
-        self._seen_ids = set()
+        self._weakrefs: List["ReferenceType[object]"] = []
+        self._seen_ids: Set[int] = set()
         self.output_graph = output_graph
 
         # Note: right overrides left
@@ -633,17 +678,17 @@ def combine_scopes(left, right):
 
     def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids):
         # Pre join output
-        finished_expressions = []
+        finished_expressions: List[str] = []
 
         # A mapping of tensor_ids to tensor names
-        id_to_name_map = {}
+        id_to_name_map: Dict[int, str] = {}
 
         # We should not have a shape env, or guards if we are not in config.dynamic shapes
         # But check it anyway.
         if not config.dynamic_shapes:
             return None
 
-        expr_to_tensor_ref = {}
+        expr_to_tensor_ref: Dict[sympy.Symbol, Dict[TensorReference, None]] = {}
         guard_printer = DynamoGuardPrinter(
             expr_to_tensor_ref,
             id_to_name_map,
@@ -665,7 +710,7 @@ def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids
                     obj_expr = tensor_ref.expr
                     if obj_expr not in expr_to_tensor_ref:
                         expr_to_tensor_ref[obj_expr] = {}
-                    expr_to_tensor_ref[obj_expr][tensor_ref] = ""
+                    expr_to_tensor_ref[obj_expr][tensor_ref] = None
 
         guard_expression = self.output_graph.shape_env.get_guard_expr()
         expr_as_str = guard_printer.doprint(guard_expression)
@@ -696,10 +741,10 @@ def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids
     def compile_check_fn(self, local_builder, global_builder, guards_out):
         assert not (set(local_builder.argnames) & set(global_builder.argnames))
         # see parallel handling of ".0" / "___implicit0" in _eval_frame.c
-        args = [a for a in local_builder.scope.keys() if a == "___implicit0"]
-        args += [a for a in local_builder.argnames if a != "___implicit0"]
-        args += ["**___kwargs_ignored"]
-        args = ",".join(args)
+        largs = [a for a in local_builder.scope.keys() if a == "___implicit0"]
+        largs += [a for a in local_builder.argnames if a != "___implicit0"]
+        largs += ["**___kwargs_ignored"]
+        args = ",".join(largs)
 
         code_parts = (
             ["___guarded_code.valid"] + local_builder.code + global_builder.code
@@ -742,7 +787,7 @@ def compile_check_fn(self, local_builder, global_builder, guards_out):
                 guards_out.add(
                     Guard(
                         name="symbolic_shape_expression",
-                        source=None,
+                        source=GuardSource.SHAPE_ENV,
                         create_fn=GuardBuilder.SYMBOL_MATCH,
                         code_list=symbolic_shape_expression,
                     )
@@ -777,7 +822,7 @@ def ___make_guard_fn({','.join(closure_vars.keys())}):
         if os.environ.get("TORCHDYNAMO_PRINT_GUARDS", None) == "1":
             print("GUARDS", code)
         set_guard_fail_hook(guard_fail_hook)
-        out = dict()
+        out: Dict[str, Any] = dict()
         # print("RUNNING PY CODE", py_code)
         exec(py_code, global_builder.scope, out)
         guard_fn = out["___make_guard_fn"](*closure_vars.values())
@@ -804,7 +849,7 @@ def id_ref(self, obj):
 
 
 def guard_fail_hook(
-    guard_fn: Callable, code: types.CodeType, f_locals: Dict[str, Any], last: bool
+    guard_fn: GuardFn, code: types.CodeType, f_locals: Dict[str, object], last: bool
 ) -> None:
     """
     called whenever a guard fails.
@@ -828,7 +873,7 @@ def guard_fail_hook(
 
 
 def guard_error_hook(
-    guard_fn: Callable, code: types.CodeType, f_locals: Dict[str, Any], last: bool
+    guard_fn: GuardFn, code: types.CodeType, f_locals: Dict[str, object], last: bool
 ):
     print(
         f"ERROR RUNNING GUARDS {code.co_name} {code.co_filename}:{code.co_firstlineno}"
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 6a8db90bd8c8d..3a0209c1511b6 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -79,7 +79,7 @@ def _gen_rand_values():
 class FakeRootModule(torch.nn.Module):
     """Trick the constructor of fx.GraphModule"""
 
-    def __init__(self, nn_modules: dict):
+    def __init__(self, nn_modules: Dict[str, torch.nn.Module]):
         super(FakeRootModule, self).__init__()
         for k, v in nn_modules.items():
             setattr(self, k, v)
diff --git a/torch/_dynamo/types.py b/torch/_dynamo/types.py
index fccc8dfe9f28e..7dfcfd7b51111 100644
--- a/torch/_dynamo/types.py
+++ b/torch/_dynamo/types.py
@@ -1,9 +1,25 @@
+import dataclasses
 import types
-from typing import Any, Callable, Dict, Optional, Union
+from typing import Dict, List, Optional, OrderedDict, Union
 
 from typing_extensions import Protocol
 
-from torch._dynamo.guards import GuardedCode
+
+class GuardFn(Protocol):
+    closure_vars: OrderedDict[str, object]
+    code_parts: List[str]
+    verbose_code_parts: List[str]
+    global_scope: Dict[str, object]
+
+    # maps locals of user function to bool
+    def __call__(self, *maybe_dotzero: object, **f_locals: object) -> bool:
+        ...
+
+
+@dataclasses.dataclass
+class GuardedCode:
+    code: types.CodeType
+    check_fn: GuardFn
 
 
 class DynamoCallbackFn(Protocol):
@@ -19,9 +35,9 @@ def __call__(
 class DynamoGuardHook(Protocol):
     def __call__(
         self,
-        guard_fn: Callable,
+        guard_fn: GuardFn,
         code: types.CodeType,
-        f_locals: Dict[str, Any],
+        f_locals: Dict[str, object],
         last: bool,
     ) -> None:
         ...

From f93c3a1a808eed87d5f3770a4edfc71c4dab81c8 Mon Sep 17 00:00:00 2001
From: Kshiteej K <kshitijkalambarkar@gmail.com>
Date: Thu, 1 Dec 2022 14:43:30 +0000
Subject: [PATCH 1474/1922] add vjp test with non-contig inputs (#89375)

Ref: https://github.com/pytorch/functorch/issues/1029

We update `test_vjp` to do contiguous and non-contiguous sample testing.

Prev Time: ~32s
New Time : ~50s
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89375
Approved by: https://github.com/zou3519
---
 test/functorch/test_ops.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index c0ae683cdfbf7..36fe608fe496d 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -10,7 +10,7 @@
 import unittest
 
 from torch.testing._internal.common_utils import TestCase, run_tests, is_iterable_of_tensors, IS_MACOS, \
-    IS_ARM64, IS_X86, parametrize, TEST_WITH_ASAN, noncontiguous_like
+    IS_ARM64, IS_X86, parametrize, TEST_WITH_ASAN, noncontiguous_like, IS_WINDOWS
 import torch
 from torch import Tensor
 import functools
@@ -512,12 +512,36 @@ def maybe_clone_inputs():
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vjp', vjp_fail.union({
         xfail('sparse.sampled_addmm', ''),
+
+        # ---- Non-Contiguous Failures ----
+        # This is expected to fail as the operator
+        # expects last dim to have stride=1
+        xfail('view_as_complex'),
+        # RuntimeError: query: last dimension must be contiguous
+        # NOTE: This passes on Windows!
+        decorate('nn.functional._scaled_dot_product_attention',
+                 decorator=unittest.skipIf(not IS_WINDOWS, "expects contiguous inputs")),
+        # BUG
+        # AssertionError: Tensor-likes are not close!
+        xfail('as_strided'),
+        xfail('as_strided_scatter'),
+        xfail('_softmax_backward_data', device_type='cpu'),
     }))
     @opsToleranceOverride('TestOperators', 'test_vjp', (
         tol1('nn.functional.conv_transpose3d',
              {torch.float32: tol(atol=5e-05, rtol=9e-05)}, device_type='cuda'),
         tol1('nn.functional.binary_cross_entropy_with_logits',
              {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
+        tol1('__rmatmul__',
+             {torch.float32: tol(atol=1e-05, rtol=1e-05)}),
+        tol1('matmul',
+             {torch.float32: tol(atol=1e-05, rtol=1e-05)}),
+        tol2('linalg.pinv', 'hermitian',
+             {torch.float32: tol(atol=1e-05, rtol=1e-05)}),
+        tol1('linalg.tensorsolve',
+             {torch.float32: tol(atol=1e-05, rtol=1e-05)}),
+        tol1('svd_lowrank',
+             {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
     ))
     def test_vjp(self, device, dtype, op):
         if not op.supports_autograd:
@@ -534,14 +558,22 @@ def _test(_op, inplace=False):
                 result = fn(*primals)
                 cotangents = tree_map(lambda x: torch.randn_like(x), result)
 
+                noncontig_fn, noncontig_primals = normalize_op_input_output(_op, sample.noncontiguous())
+                noncontig_cotangents = tree_map(lambda x: noncontiguous_like(x), cotangents)
+
                 out, vjp_fn = vjp(fn, *primals)
                 self.assertEqual(out, result)
                 result_vjps = vjp_fn(cotangents)
 
+                out_noncontig, vjp_fn = vjp(noncontig_fn, *noncontig_primals)
+                self.assertEqual(out_noncontig, result)
+                noncontig_result_vjps = vjp_fn(noncontig_cotangents)
+
                 _, vjp_fn = ref_vjp(fn, *primals)
                 expected_vjps = vjp_fn(cotangents)
 
                 self.assertEqual(result_vjps, expected_vjps)
+                self.assertEqual(noncontig_result_vjps, expected_vjps)
 
         _test(op)
         for a_op in op.aliases:

From e2df6f57189a2e56a04678a6f657614d95e34ca1 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Thu, 1 Dec 2022 01:47:56 +0000
Subject: [PATCH 1475/1922] Add error repro test for FSDP ignored modules with
 mixed precision (#89971)

The ignored modules are still using the original precision, which
leads to the following error.

```
RuntimeError: mat1 and mat2 must have the same dtype
```

This is not blocking me at the moment, but the fix seems not too
hard. We can add a pre-forward hook to each ignored module to
convert activations to original precision, and a post-forward hook
to convert it back to the specified precision.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89971
Approved by: https://github.com/awgu
---
 .../fsdp/test_fsdp_mixed_precision.py         | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py
index d03ed1179e0f4..4eef830ba13b4 100644
--- a/test/distributed/fsdp/test_fsdp_mixed_precision.py
+++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py
@@ -756,5 +756,47 @@ def test_mixed_precision_e2e_full_shard(self):
 
 instantiate_parametrized_tests(TestFSDPMixedPrecisionSharded)
 
+
+class IgnoredModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l = nn.Linear(100, 100)
+
+    def forward(self, x):
+        return self.l(x)
+
+
+class Model(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l1 = nn.Linear(100, 100)
+        self.ignored = IgnoredModule()
+        self.l2 = nn.Linear(100, 100)
+
+    def forward(self, x):
+        return self.l2(self.ignored(self.l1(x)))
+
+
+class TestFSDPMixedPrecisionIgnoredModules(FSDPTest):
+    @property
+    def world_size(self):
+        return 1
+
+    @skip_if_lt_x_gpu(1)
+    def test_mixed_precision_with_ignored_module(self):
+        model = Model().cuda()
+        float16 = MixedPrecision(param_dtype=torch.float16)
+        model = FSDP(
+            model,
+            ignored_modules=[model.ignored],
+            mixed_precision=float16,
+        )
+
+        x = torch.ones(2, 100, device=torch.cuda.current_device())
+
+        with self.assertRaisesRegex(RuntimeError, "must have the same dtype"):
+            model(x).sum().backward()
+
+
 if __name__ == "__main__":
     run_tests()

From ce9b1b983d1434df000868a42937241d73dd2463 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Wed, 30 Nov 2022 16:17:49 -0800
Subject: [PATCH 1476/1922] Remove DDP import (#89982)

This import is only used for typing, removing it to avoid circular ref
in next diffs

Differential Revision: [D41636897](https://our.internmc.facebook.com/intern/diff/D41636897/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89982
Approved by: https://github.com/zhaojuanmao
---
 torch/distributed/algorithms/ddp_comm_hooks/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
index 037b63b6ba929..7fdd6f6a07eaa 100644
--- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
+++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
@@ -2,7 +2,6 @@
 from functools import partial
 
 import torch.distributed as dist
-from torch.nn.parallel import DistributedDataParallel
 
 from . import (
     debugging_hooks as debugging,
@@ -87,7 +86,7 @@ class DDPCommHookType(Enum):
 
 
 def register_ddp_comm_hook(
-    comm_hook_type: DDPCommHookType, model: DistributedDataParallel, state=None
+    comm_hook_type: DDPCommHookType, model, state=None
 ):
     """
     Registers the hooks of ``torch.distributed.algorithms.ddp_comm_hooks``

From b881b3a48239b5b74b9239547ad4c5b4070358b7 Mon Sep 17 00:00:00 2001
From: Philip Meier <github.pmeier@posteo.de>
Date: Thu, 1 Dec 2022 15:21:54 +0000
Subject: [PATCH 1477/1922] fix citation file in MANIFEST (#89994)

#86200 changed the `CITATION` file to `CITATION.cff`, but this change was not reflected in the `MANIFEST.in`. Meaning, `CITATION.cff` will not be included in wheels.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89994
Approved by: https://github.com/malfet
---
 MANIFEST.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 403b90b702df2..f6ffb4e02a8af 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,6 @@
 include MANIFEST.in
 include CMakeLists.txt
-include CITATION
+include CITATION.cff
 include LICENSE
 include NOTICE
 include .gitmodules

From 66bf6ffb94f8b685653133d437dfc6a50556c5f7 Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Thu, 1 Dec 2022 07:53:23 -0800
Subject: [PATCH 1478/1922] [Quant][fx][bc-breaking] Rename fx/*patterns.py
 (#89872)

Summary: This commit renames fx/quantization_patterns.py
to fx/quantize_handler.py, and fx/fusion_patterns.py to
fx/fuse_handler.py. This is because these files contain
only QuantizeHandler and FuseHandler respectively, so the
new names are more descriptive. A future commit will
further break BC by removing all the empty *QuantizeHandler
classes.

BC-breaking notes:

The following classes under the
`torch.ao.quantization.fx.quantization_patterns` namespace
are migrated to the `torch.ao.quantization.fx.quantize_handler`
namespace:
```
QuantizeHandler
BinaryOpQuantizeHandler
CatQuantizeHandler
ConvReluQuantizeHandler
LinearReLUQuantizeHandler
BatchNormQuantizeHandler
EmbeddingQuantizeHandler
RNNDynamicQuantizeHandler
DefaultNodeQuantizeHandler
FixedQParamsOpQuantizeHandler
CopyNodeQuantizeHandler
GeneralTensorShapeOpQuantizeHandler
CustomModuleQuantizeHandler
StandaloneModuleQuantizeHandler
```

The following classes under the
`torch.ao.quantization.fx.fusion_patterns` namespace are
migrated to the `torch.ao.quantization.fx.fuse_handler`
namespace:
```
DefaultFuseHandler
FuseHandler
```

Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps

Reviewers: jerryzh168, vkuzo

Subscribers: jerryzh168, vkuzo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89872
Approved by: https://github.com/jerryzh168
---
 .github/scripts/gql_mocks.json                |  4 +-
 test/quantization/ao_migration/common.py      | 13 +++--
 .../ao_migration/test_quantization_fx.py      | 27 ++++++++--
 test/quantization/core/test_backend_config.py |  2 +-
 test/quantization/fx/test_numeric_suite_fx.py | 50 +++++++++----------
 test/quantization/fx/test_quantize_fx.py      |  2 +-
 torch/ao/ns/_numeric_suite_fx.py              |  2 +-
 torch/ao/ns/fx/pattern_utils.py               |  2 +-
 torch/ao/quantization/fx/fuse.py              |  5 +-
 .../{fusion_patterns.py => fuse_handler.py}   |  1 -
 torch/ao/quantization/fx/match_utils.py       |  2 +-
 torch/ao/quantization/fx/pattern_utils.py     |  1 -
 torch/ao/quantization/fx/prepare.py           | 13 ++---
 ...zation_patterns.py => quantize_handler.py} |  1 -
 torch/ao/quantization/fx/utils.py             |  2 +-
 torch/quantization/fx/fusion_patterns.py      |  2 +-
 .../quantization/fx/quantization_patterns.py  |  2 +-
 17 files changed, 73 insertions(+), 58 deletions(-)
 rename torch/ao/quantization/fx/{fusion_patterns.py => fuse_handler.py} (99%)
 rename torch/ao/quantization/fx/{quantization_patterns.py => quantize_handler.py} (99%)

diff --git a/.github/scripts/gql_mocks.json b/.github/scripts/gql_mocks.json
index 7f6dbc05d3415..073658b0d6bc8 100644
--- a/.github/scripts/gql_mocks.json
+++ b/.github/scripts/gql_mocks.json
@@ -18634,7 +18634,7 @@
                 "path": "torch/ao/quantization/fx/fuse.py"
               },
               {
-                "path": "torch/ao/quantization/fx/fusion_patterns.py"
+                "path": "torch/ao/quantization/fx/fuse_handler.py"
               },
               {
                 "path": "torch/ao/quantization/fx/match_utils.py"
@@ -18646,7 +18646,7 @@
                 "path": "torch/ao/quantization/fx/prepare.py"
               },
               {
-                "path": "torch/ao/quantization/fx/quantization_patterns.py"
+                "path": "torch/ao/quantization/fx/quantize_handler.py"
               },
               {
                 "path": "torch/ao/quantization/qconfig.py"
diff --git a/test/quantization/ao_migration/common.py b/test/quantization/ao_migration/common.py
index bade3b7ff4d26..50045a39e7ab5 100644
--- a/test/quantization/ao_migration/common.py
+++ b/test/quantization/ao_migration/common.py
@@ -6,7 +6,8 @@
 class AOMigrationTestCase(TestCase):
     def _test_package_import(self, package_name: str,
                              base: Optional[str] = None,
-                             skip: List[str] = None):
+                             skip: List[str] = None,
+                             new_package_name: Optional[str] = None):
         r"""Tests the module import by making sure that all the internals match
         (except the dunder methods).
 
@@ -19,8 +20,10 @@ def _test_package_import(self, package_name: str,
         base = base or 'quantization'
         old_base = 'torch.' + base
         new_base = 'torch.ao.' + base
+        if new_package_name is None:
+            new_package_name = package_name
         old_module = importlib.import_module(f'{old_base}.{package_name}')
-        new_module = importlib.import_module(f'{new_base}.{package_name}')
+        new_module = importlib.import_module(f'{new_base}.{new_package_name}')
         old_module_dir = set(dir(old_module))
         new_module_dir = set(dir(new_module))
         # Remove magic modules from checking in subsets
@@ -36,15 +39,17 @@ def _test_package_import(self, package_name: str,
             f"{old_module_dir - new_module_dir}"
 
     def _test_function_import(self, package_name: str, function_list: List[str],
-                              base: Optional[str] = None):
+                              base: Optional[str] = None, new_package_name: Optional[str] = None):
         r"""Tests individual function list import by comparing the functions
         and their hashes."""
         if base is None:
             base = 'quantization'
         old_base = 'torch.' + base
         new_base = 'torch.ao.' + base
+        if new_package_name is None:
+            new_package_name = package_name
         old_location = importlib.import_module(f'{old_base}.{package_name}')
-        new_location = importlib.import_module(f'{new_base}.{package_name}')
+        new_location = importlib.import_module(f'{new_base}.{new_package_name}')
         for fn_name in function_list:
             old_function = getattr(old_location, fn_name)
             new_function = getattr(new_location, fn_name)
diff --git a/test/quantization/ao_migration/test_quantization_fx.py b/test/quantization/ao_migration/test_quantization_fx.py
index 03d1da6f2cfb4..c75727717a736 100644
--- a/test/quantization/ao_migration/test_quantization_fx.py
+++ b/test/quantization/ao_migration/test_quantization_fx.py
@@ -26,7 +26,10 @@ def test_function_import_quantize_fx(self):
         self._test_function_import('quantize_fx', function_list)
 
     def test_package_import_fx(self):
-        self._test_package_import('fx')
+        self._test_package_import('fx', skip=[
+            'fusion_patterns',
+            'quantization_patterns',
+        ])
 
     def test_function_import_fx(self):
         function_list = [
@@ -99,7 +102,10 @@ def test_function_import_fx_equalize(self):
         self._test_function_import('fx._equalize', function_list)
 
     def test_package_import_fx_quantization_patterns(self):
-        self._test_package_import('fx.quantization_patterns')
+        self._test_package_import(
+            'fx.quantization_patterns',
+            new_package_name='fx.quantize_handler',
+        )
 
     def test_function_import_fx_quantization_patterns(self):
         function_list = [
@@ -118,7 +124,11 @@ def test_function_import_fx_quantization_patterns(self):
             'GeneralTensorShapeOpQuantizeHandler',
             'StandaloneModuleQuantizeHandler'
         ]
-        self._test_function_import('fx.quantization_patterns', function_list)
+        self._test_function_import(
+            'fx.quantization_patterns',
+            function_list,
+            new_package_name='fx.quantize_handler',
+        )
 
     def test_package_import_fx_match_utils(self):
         self._test_package_import('fx.match_utils')
@@ -158,14 +168,21 @@ def test_function_import_fx_fuse(self):
         self._test_function_import('fx.fuse', function_list)
 
     def test_package_import_fx_fusion_patterns(self):
-        self._test_package_import('fx.fusion_patterns')
+        self._test_package_import(
+            'fx.fusion_patterns',
+            new_package_name='fx.fuse_handler',
+        )
 
     def test_function_import_fx_fusion_patterns(self):
         function_list = [
             'FuseHandler',
             'DefaultFuseHandler'
         ]
-        self._test_function_import('fx.fusion_patterns', function_list)
+        self._test_function_import(
+            'fx.fusion_patterns',
+            function_list,
+            new_package_name='fx.fuse_handler',
+        )
 
     # we removed matching test for torch.quantization.fx.quantization_types
     # old: torch.quantization.fx.quantization_types
diff --git a/test/quantization/core/test_backend_config.py b/test/quantization/core/test_backend_config.py
index e641e58bb2aac..6cf8f3d5e5c61 100644
--- a/test/quantization/core/test_backend_config.py
+++ b/test/quantization/core/test_backend_config.py
@@ -14,7 +14,7 @@
     ObservationType,
 )
 from torch.ao.quantization.fuser_method_mappings import _reverse_sequential_wrapper2
-from torch.ao.quantization.fx.quantization_patterns import _default_root_node_getter
+from torch.ao.quantization.fx.quantize_handler import _default_root_node_getter
 
 
 class TestBackendConfig(QuantizationTestCase):
diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py
index e5900104aa0a1..74098b597753c 100644
--- a/test/quantization/fx/test_numeric_suite_fx.py
+++ b/test/quantization/fx/test_numeric_suite_fx.py
@@ -41,7 +41,7 @@
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_quantization import NodeSpec as ns
 from torch.ao.quantization.fx.pattern_utils import get_default_quant_patterns
-import torch.ao.quantization.fx.quantization_patterns as qp
+import torch.ao.quantization.fx.quantize_handler as qh
 from torch.ao.ns.fx.pattern_utils import (
     get_type_a_related_to_b,
 )
@@ -85,7 +85,7 @@
 )
 from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping
 from torch.ao.quantization.backend_config import get_native_backend_config
-from torch.ao.quantization.fx.quantization_patterns import _get_pattern_to_quantize_handlers
+from torch.ao.quantization.fx.quantize_handler import _get_pattern_to_quantize_handlers
 
 
 # Note: these models are not for use outside of this file. While it's good
@@ -669,21 +669,21 @@ def _op_is_unmatchable(op):
                 base_op = pattern
 
             qhandler_cls_all_ops_quantizeable = [
-                qp.CatQuantizeHandler,
-                qp.ConvReluQuantizeHandler,
-                qp.LinearReLUQuantizeHandler,
-                qp.BatchNormQuantizeHandler,
-                qp.EmbeddingQuantizeHandler,
-                qp.RNNDynamicQuantizeHandler,
+                qh.CatQuantizeHandler,
+                qh.ConvReluQuantizeHandler,
+                qh.LinearReLUQuantizeHandler,
+                qh.BatchNormQuantizeHandler,
+                qh.EmbeddingQuantizeHandler,
+                qh.RNNDynamicQuantizeHandler,
             ]
 
             qhandler_cls_quant_op_same_signature = [
-                qp.FixedQParamsOpQuantizeHandler,
-                qp.CopyNodeQuantizeHandler,
-                qp.GeneralTensorShapeOpQuantizeHandler,
+                qh.FixedQParamsOpQuantizeHandler,
+                qh.CopyNodeQuantizeHandler,
+                qh.GeneralTensorShapeOpQuantizeHandler,
             ]
 
-            if qhandler_cls == qp.BinaryOpQuantizeHandler:
+            if qhandler_cls == qh.BinaryOpQuantizeHandler:
                 # these ops do not have quantized equivalents
                 ops_to_skip = [
                     torch.bmm,
@@ -697,11 +697,11 @@ def _op_is_unmatchable(op):
                 self.assertTrue(
                     _op_in_base_sets_of_related_ops(base_op),
                     f"{base_op} not in sets of related ops")
-            elif qhandler_cls == qp.RNNDynamicQuantizeHandler:
+            elif qhandler_cls == qh.RNNDynamicQuantizeHandler:
                 # TODO(future PR): add support for all classes in
                 # RNNDynamicQuantizeHandler
                 pass
-            elif qhandler_cls == qp.DefaultNodeQuantizeHandler:
+            elif qhandler_cls == qh.DefaultNodeQuantizeHandler:
                 self.assertTrue(
                     _op_in_base_sets_of_related_ops(base_op),
                     f"{base_op} not in sets of related ops")
@@ -1606,23 +1606,23 @@ def test_op_io_dtype_coverage(self):
 
             if (
                 qhandler_cls in (
-                    qp.BinaryOpQuantizeHandler,
-                    qp.RNNDynamicQuantizeHandler,
+                    qh.BinaryOpQuantizeHandler,
+                    qh.RNNDynamicQuantizeHandler,
                 )
             ):
                 # TODO(future PR): implement shadowing for binary ops
                 # TODO(future PR): implement shadowing for RNN ops
                 continue
-            elif qhandler_cls == qp.CatQuantizeHandler:
+            elif qhandler_cls == qh.CatQuantizeHandler:
                 self.assertTrue(
                     base_op in FUNS_IO_TYPE_FP32_OR_INT8,
                     f"missing IO type handling for {base_op}")
             elif (
                 qhandler_cls in (
-                    qp.ConvReluQuantizeHandler,
-                    qp.LinearReLUQuantizeHandler,
-                    qp.BatchNormQuantizeHandler,
-                    qp.DefaultNodeQuantizeHandler,
+                    qh.ConvReluQuantizeHandler,
+                    qh.LinearReLUQuantizeHandler,
+                    qh.BatchNormQuantizeHandler,
+                    qh.DefaultNodeQuantizeHandler,
                 )
             ):
                 self.assertTrue(
@@ -1630,9 +1630,9 @@ def test_op_io_dtype_coverage(self):
                     f"missing IO type handling for {base_op}")
             elif (
                 qhandler_cls in (
-                    qp.FixedQParamsOpQuantizeHandler,
-                    qp.CopyNodeQuantizeHandler,
-                    qp.GeneralTensorShapeOpQuantizeHandler,
+                    qh.FixedQParamsOpQuantizeHandler,
+                    qh.CopyNodeQuantizeHandler,
+                    qh.GeneralTensorShapeOpQuantizeHandler,
                 )
             ):
                 if (
@@ -1650,7 +1650,7 @@ def test_op_io_dtype_coverage(self):
                     # version, so it does not fit into the cases above.
                     (base_op is torch.nn.Softmax),
                     f"missing IO type handling for {base_op}")
-            elif qhandler_cls == qp.EmbeddingQuantizeHandler:
+            elif qhandler_cls == qh.EmbeddingQuantizeHandler:
                 # embedding shadowing is not implemented, for now
                 continue
             else:
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index f33fe306d8203..cbd256a1fbbe9 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -24,7 +24,7 @@
 )
 
 
-from torch.ao.quantization.fx.quantization_patterns import DefaultNodeQuantizeHandler
+from torch.ao.quantization.fx.quantize_handler import DefaultNodeQuantizeHandler
 
 from torch.ao.quantization.fx.match_utils import (
     is_match,
diff --git a/torch/ao/ns/_numeric_suite_fx.py b/torch/ao/ns/_numeric_suite_fx.py
index 2c563b8c82d66..db4bd87fb9734 100644
--- a/torch/ao/ns/_numeric_suite_fx.py
+++ b/torch/ao/ns/_numeric_suite_fx.py
@@ -123,7 +123,7 @@
 from torch.ao.quantization.backend_config import BackendConfig
 from torch.ao.quantization.fx.match_utils import find_matches
 from torch.ao.quantization.fx.qconfig_mapping_utils import generate_node_name_to_qconfig
-from torch.ao.quantization.fx.quantization_patterns import _get_pattern_to_quantize_handlers
+from torch.ao.quantization.fx.quantize_handler import _get_pattern_to_quantize_handlers
 from torch.ao.quantization.qconfig import QConfigAny
 from torch.ao.ns.fx.n_shadows_utils import (
     OutputProp,
diff --git a/torch/ao/ns/fx/pattern_utils.py b/torch/ao/ns/fx/pattern_utils.py
index 5bcb65c415ba5..b91024bc76c09 100644
--- a/torch/ao/ns/fx/pattern_utils.py
+++ b/torch/ao/ns/fx/pattern_utils.py
@@ -7,7 +7,7 @@
 from torch.fx.graph import Node
 
 from torch.ao.quantization.backend_config import get_native_backend_config
-from torch.ao.quantization.fx.quantization_patterns import _get_pattern_to_quantize_handlers
+from torch.ao.quantization.fx.quantize_handler import _get_pattern_to_quantize_handlers
 from torch.ao.quantization.utils import getattr_from_fqn
 from .ns_types import NSNodeTargetType
 from torch.ao.quantization import (
diff --git a/torch/ao/quantization/fx/fuse.py b/torch/ao/quantization/fx/fuse.py
index 3053b0329c82f..930be1bdcc4fc 100644
--- a/torch/ao/quantization/fx/fuse.py
+++ b/torch/ao/quantization/fx/fuse.py
@@ -27,7 +27,7 @@
 
 from .custom_config import FuseCustomConfig
 
-from .fusion_patterns import (
+from .fuse_handler import (
     _get_fusion_pattern_to_fuse_handler_cls,
     FuseHandler,
 )
@@ -40,6 +40,9 @@
 
 __all__ = [
     "fuse",
+    # TODO: We should make this private in the future
+    # This is currently needed for test_public_bindings for some reason
+    "FuseHandler",
 ]
 
 
diff --git a/torch/ao/quantization/fx/fusion_patterns.py b/torch/ao/quantization/fx/fuse_handler.py
similarity index 99%
rename from torch/ao/quantization/fx/fusion_patterns.py
rename to torch/ao/quantization/fx/fuse_handler.py
index 5ec6f8430feb7..2106dc4e33143 100644
--- a/torch/ao/quantization/fx/fusion_patterns.py
+++ b/torch/ao/quantization/fx/fuse_handler.py
@@ -39,7 +39,6 @@ def fuse(self,
              is_qat: bool) -> Node:
         pass
 
-# TODO: move this to backend_config_utils
 class DefaultFuseHandler(FuseHandler):
     def __init__(
             self,
diff --git a/torch/ao/quantization/fx/match_utils.py b/torch/ao/quantization/fx/match_utils.py
index e50b89f9ce408..56d66d00b2b6f 100644
--- a/torch/ao/quantization/fx/match_utils.py
+++ b/torch/ao/quantization/fx/match_utils.py
@@ -5,7 +5,7 @@
     Node,
 )
 from torch.ao.quantization.utils import Pattern
-from .quantization_patterns import (
+from .quantize_handler import (
     QuantizeHandler,
 )
 from ..qconfig import (
diff --git a/torch/ao/quantization/fx/pattern_utils.py b/torch/ao/quantization/fx/pattern_utils.py
index c4971b542627a..10b67d075b216 100644
--- a/torch/ao/quantization/fx/pattern_utils.py
+++ b/torch/ao/quantization/fx/pattern_utils.py
@@ -2,7 +2,6 @@
 from typing import Dict, Any
 from torch.ao.quantization.utils import Pattern
 from ..fake_quantize import FixedQParamsFakeQuantize
-# from .quantization_patterns import BinaryOpQuantizeHandler
 from ..observer import ObserverBase
 import copy
 
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 1d623fdeda205..2ed88d16126bc 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -33,7 +33,8 @@
     update_qconfig_for_fusion,
 )
 
-from .quantization_patterns import (
+from .quantize_handler import (
+    _default_root_node_getter,
     _get_pattern_to_quantize_handlers,
     QuantizeHandler,
 )
@@ -297,12 +298,6 @@ def add_matched_node_name_to_set(matched_node_pattern: NodePattern, s: Set[str])
         for maybe_node in matched_node_pattern:
             add_matched_node_name_to_set(maybe_node, s)
 
-# this is temporary, will be removed soon
-def _default_root_node_getter(node_pattern):
-    while not isinstance(node_pattern, Node):
-        node_pattern = node_pattern[-1]
-    return node_pattern
-
 def insert_observer(
     node: Node,
     observer: ObserverBase,
@@ -929,9 +924,7 @@ def maybe_propagate_dtype_for_node(
 ) -> None:
     """
     Assigns `target_dtype` to `node`, setting `is_dynamic` to False. If `node`
-    is a general tensor shape op
-    (see GeneralTensorShapeOpQuantizeHandler in quantization_patterns.py for more details)
-    also call this function recursively on
+    is a general tensor shape op, also call this function recursively on
     the first argument, to propagate the dtype to the caller.
     """
     node_name_to_target_dtype_info[node.name]["input_activation_dtype"] = (target_dtype, False)
diff --git a/torch/ao/quantization/fx/quantization_patterns.py b/torch/ao/quantization/fx/quantize_handler.py
similarity index 99%
rename from torch/ao/quantization/fx/quantization_patterns.py
rename to torch/ao/quantization/fx/quantize_handler.py
index cce588cb536ca..8670eee3ed776 100644
--- a/torch/ao/quantization/fx/quantization_patterns.py
+++ b/torch/ao/quantization/fx/quantize_handler.py
@@ -44,7 +44,6 @@ def _default_root_node_getter(node_pattern):
         node_pattern = node_pattern[-1]
     return node_pattern
 
-# TODO: move to backend_config_utils.py
 # Base Pattern Handler
 class QuantizeHandler(ABC):
     """ Base handler class for the quantizer patterns
diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index edf440de28e12..4ff6d03983ea6 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -578,7 +578,7 @@ def _is_custom_module_lstm(
     """
     mod = _get_module(node, named_modules)
     if qconfig is not None and qhandler is not None:
-        assert isinstance(qhandler, torch.ao.quantization.fx.quantization_patterns.QuantizeHandler)  # type: ignore[attr-defined]
+        assert isinstance(qhandler, torch.ao.quantization.fx.quantize_handler.QuantizeHandler)  # type: ignore[attr-defined]
         return isinstance(mod, torch.nn.LSTM) and \
             activation_is_statically_quantized(qconfig) and \
             qhandler.is_custom_module()
diff --git a/torch/quantization/fx/fusion_patterns.py b/torch/quantization/fx/fusion_patterns.py
index 967c1be07aff4..36c74bd277226 100644
--- a/torch/quantization/fx/fusion_patterns.py
+++ b/torch/quantization/fx/fusion_patterns.py
@@ -6,7 +6,7 @@
 appropriate files under `torch/ao/quantization/fx/`, while adding an import statement
 here.
 """
-from torch.ao.quantization.fx.fusion_patterns import (
+from torch.ao.quantization.fx.fuse_handler import (
     FuseHandler,
     DefaultFuseHandler,
 )
diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py
index 09602287115c1..6177e9bd04b81 100644
--- a/torch/quantization/fx/quantization_patterns.py
+++ b/torch/quantization/fx/quantization_patterns.py
@@ -6,7 +6,7 @@
 appropriate files under `torch/ao/quantization/fx/`, while adding an import statement
 here.
 """
-from torch.ao.quantization.fx.quantization_patterns import (
+from torch.ao.quantization.fx.quantize_handler import (
     QuantizeHandler,
     BinaryOpQuantizeHandler,
     CatQuantizeHandler,

From 15a9cbf2de962f9b92181ab6dd698cdc96096a74 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Thu, 1 Dec 2022 04:15:03 +0000
Subject: [PATCH 1479/1922] [ONNX] Supports scatter_add with different static
 shape of src and index (#89787)

Prior to this change, the converter doesn't support `scatter_add` with different shape of `src` and `index`, while [it's claimed to be supported by PyTorch](https://pytorch.org/docs/stable/generated/torch.Tensor.scatter_add_.html#torch.Tensor.scatter_add_) in a way that scatter shape would be accommodated to index shape. This PR adds `onnx::Slice` to adjust the shape of `src` when a static and mismatched shape is found. However, if both of the shape (src and index) is set to dynamic, they are expected to be the same shape from ONNX due to the spec. More ScatterElements details on https://github.com/onnx/onnx/issues/4672
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89787
Approved by: https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 41 ++++++++++++++++++++++
 torch/onnx/symbolic_opset16.py             | 36 ++++++++++++++++++-
 2 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 184cc5f4ae672..41e9a9973953a 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -3928,6 +3928,47 @@ def forward(self, src, index):
         index = torch.tensor([[0, 0], [1, 1], [0, 1]], dtype=torch.int64)
         self.run_test(ScatterModel(), (src, index))
 
+    @skipIfUnsupportedMinOpsetVersion(16)
+    def test_scatter_add_different_size_index_src(self):
+        class ScatterModel(torch.nn.Module):
+            def forward(self, input, indices, src):
+                return input.scatter_add(0, indices, src)
+
+        src = torch.ones((2, 5))
+        input = torch.zeros(3, 5, dtype=src.dtype)
+        indices = torch.tensor([[0, 1, 2, 0, 0]])
+        self.run_test(ScatterModel(), input_args=(input, indices, src))
+
+    @common_utils.parametrize(
+        "src, indices",
+        [
+            common_utils.subtest(
+                [torch.ones((1, 5)), torch.tensor([[0, 1, 2, 0, 0]])],
+                name="src_indices_dynamic_combination1",
+            ),
+            common_utils.subtest(
+                [torch.ones((2, 5)), torch.tensor([[0, 1, 2, 0, 0], [1, 0, 2, 1, 2]])],
+                name="src_indices_dynamic_combination2",
+            ),
+        ],
+    )
+    @skipIfUnsupportedMinOpsetVersion(16)
+    def test_scatter_add_dynamic_index(self, src, indices):
+        class ScatterModel(torch.nn.Module):
+            def forward(self, input, indices, src):
+                return input.scatter_add(0, indices, src)
+
+        input = torch.zeros(3, 5, dtype=src.dtype)
+        # NOTE: Although index and src are set with different dynamic axes and name,
+        # they are required to be the same shape on all axes. In static shape, converter
+        # can apply Slice op to accommodate.
+        self.run_test(
+            ScatterModel(),
+            input_args=(input, indices, src),
+            input_names=["input", "indices", "src"],
+            dynamic_axes={"indices": {0: "a", 1: "b"}, "src": {0: "c", 1: "d"}},
+        )
+
     @skipIfUnsupportedMinOpsetVersion(9)
     def test_bucketize(self):
         class BucketModel(torch.nn.Module):
diff --git a/torch/onnx/symbolic_opset16.py b/torch/onnx/symbolic_opset16.py
index 75cb96890a12f..0287fae5664aa 100644
--- a/torch/onnx/symbolic_opset16.py
+++ b/torch/onnx/symbolic_opset16.py
@@ -75,12 +75,46 @@ def scatter_add(g: jit_utils.GraphContext, self, dim, index, src):
     src_sizes = symbolic_helper._get_tensor_sizes(src)
     index_sizes = symbolic_helper._get_tensor_sizes(index)
 
-    if src_sizes != index_sizes:
+    if len(src_sizes) != len(index_sizes):
         return symbolic_helper._unimplemented(
             "scatter_add",
             f"`index` ({index_sizes}) should have the same dimensionality as `src` ({src_sizes})",
         )
 
+    if src_sizes != index_sizes:
+        # In ONNX, src and index are required to be the same rank and shape
+        # However, in PyTorch, src is only required to have the same rank as index,
+        # and shape would be accomodated. In static shape, converter can apply Slice op
+        # to accommodate. We use Slice to adjust to shape of src if it's not the same
+        # as index.
+        # More detail on: https://github.com/onnx/onnx/issues/4672
+        axes = list()
+        ends = list()
+        # Align the dynamic sizes of src and index
+        # NOTE: Even if users set src and index with different dynamic axes, they are
+        # still expected to have the same shape in runtime in terms of ONNX spec.
+        # So the usage of different shape of src and index on dynamic size is not
+        # supported.
+        # More detail on: https://github.com/onnx/onnx/issues/4672
+        for idx, d in enumerate(index_sizes):
+            if d is None or src_sizes[idx] == d:
+                # 1. the axe with dynamic shape is ignored, and will be aligned by
+                # setType later
+                # 2. if the shape are the same, we don't need to slice
+                continue
+            if src_sizes[idx] < d:
+                return symbolic_helper._unimplemented(
+                    "scatter_add",
+                    f"`index` ({index_sizes}) should have smaller or equal (<=) size at any dimension than `src` ({src_sizes})",
+                )
+            axes.append(idx)
+            ends.append(d)
+        starts = [0] * len(ends)
+        if axes and starts and ends:
+            src = symbolic_helper._slice_helper(
+                g, src, axes=axes, starts=starts, ends=ends
+            )
+
     src = symbolic_helper._maybe_get_scalar(src)
     if symbolic_helper._is_value(src):
         return g.op("ScatterElements", self, index, src, axis_i=dim, reduction_s="add")

From d4015a9bf56593ec642d3a515b178b0158acf883 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@fb.com>
Date: Thu, 1 Dec 2022 18:53:16 +0000
Subject: [PATCH 1480/1922] Editorial pass on Dyamo docs (#89921)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89921
Approved by: https://github.com/msaroufim
---
 docs/source/dynamo/custom-backends.rst |  19 +-
 docs/source/dynamo/deep-dive.rst       |  18 +-
 docs/source/dynamo/get-started.rst     |  83 ++++----
 docs/source/dynamo/guards-overview.rst | 282 ++++++++++++-------------
 docs/source/dynamo/index.rst           |   9 +-
 docs/source/dynamo/installation.rst    |  50 ++---
 docs/source/dynamo/troubleshooting.rst | 144 ++++++-------
 docs/source/index.rst                  |  13 +-
 8 files changed, 315 insertions(+), 303 deletions(-)

diff --git a/docs/source/dynamo/custom-backends.rst b/docs/source/dynamo/custom-backends.rst
index 2c8b338045e62..7322fceb51815 100644
--- a/docs/source/dynamo/custom-backends.rst
+++ b/docs/source/dynamo/custom-backends.rst
@@ -4,10 +4,13 @@ Custom Backends
 Debugging Backend
 -----------------
 
-Suppose you wanted to better understand what is going on during a
-compilation you can create a custom compiler which we’ll refer to as a
-backend that will print pretty print the fx ``GraphModule`` extracted
-from dynamo’s bytecode analysis and return a ``forward()`` callable.
+If you want to better understand what is going on during a
+compilation, you can create a custom compiler, which is referred to as
+backend in this section, that will print pretty print the fx
+``GraphModule`` extracted from Dynamo’s bytecode analysis
+and return a ``forward()`` callable.
+
+For example:
 
 .. code-block:: python
 
@@ -39,7 +42,7 @@ Running the above example produces the following output:
    call_function  add     <built-in function add>                                 (cos, sin)  {}
    output         output  output                                                  ((add,),)   {}
 
-This works for ``torch.nn.Module`` as well as shown below
+This works for ``torch.nn.Module`` as well as shown below:
 
 .. code-block:: python
 
@@ -55,7 +58,7 @@ This works for ``torch.nn.Module`` as well as shown below
    optimized_mod = dynamo.optimize(my_compiler)(mod)
    optimized_mod(torch.randn(10))
 
-Let’s take a look at one more example with control flow.
+Let’s take a look at one more example with control flow:
 
 .. code-block:: python
 
@@ -118,13 +121,13 @@ Integrating a custom backend that offers superior performance is also
 easy and we’ll integrate a real one
 with `optimize_for_inference <https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html>`__:
 
-.. code-block :: python
+.. code-block:: python
 
    def optimize_for_inference_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
        scripted = torch.jit.trace(gm, example_inputs)
        return torch.jit.optimize_for_inference(scripted)
 
-And then you should be able to optimize any existing code with
+And then you should be able to optimize any existing code with:
 
 .. code-block:: python
 
diff --git a/docs/source/dynamo/deep-dive.rst b/docs/source/dynamo/deep-dive.rst
index c60047c2a3d8d..468fdc6ff9467 100644
--- a/docs/source/dynamo/deep-dive.rst
+++ b/docs/source/dynamo/deep-dive.rst
@@ -18,7 +18,7 @@ guards:
 
 If any of those guards fail, the graph will be recaptured and
 recompiled. The interesting guard type there is ``TENSOR_MATCH``, which
-checks the following torch.Tensor properties:
+checks the following ``torch.Tensor`` properties:
 
 - Python class of the tensor (tensor subclassing, etc)
 - dtype
@@ -34,14 +34,14 @@ following parameter:
 
 .. code-block:: python
 
-torch._dynamo.config.dynamic_shapes = True
+   torch._dynamo.config.dynamic_shapes = True
 
 The full specialization mode allows the backend compiler to assume an
 entirely static graph. Unfortunately, most backends require this.
 Operators which return dynamic shapes will trigger a graph break when
 not in dynamic shape mode.
 
-What is dynamo doing?
+What is Dynamo doing?
 ---------------------
 
 If you want to understand better what TorchDynamo is doing, you can set:
@@ -50,10 +50,10 @@ If you want to understand better what TorchDynamo is doing, you can set:
 
    torchdynamo.config.debug = True
 
-which triggers useful (but spammy) printouts.
+This code triggers useful (but spammy) printouts.
 
 For example, the printouts for the first graph in the ``toy_example``
-above are:
+are:
 
 ::
 
@@ -121,12 +121,12 @@ above are:
     - local 'b' TENSOR_MATCH
     - global 'torch' FUNCTION_MATCH
 
-At the top you can see the FX graph (which we already shared above).
-Next you see the original bytecode of the function, followed by the
+At the top you can see the FX graph.
+Next, you see the original bytecode of the function, followed by the
 modified bytecode generated by TorchDynamo. Finally, you see the guards
 which we covered above.
 
-In the modified bytecode ``__compiled_fn_0`` is the return value of
+In the modified bytecode, ``__compiled_fn_0`` is the return value of
 ``my_compiler()`` (the compiled graph). ``__resume_at_30_1`` and
 ``__resume_at_38_2`` are both generated continuation functions that pick
 up execution after a graph break (at bytecode offsets 30 and 38). Each
@@ -139,7 +139,7 @@ of these functions take the form:
        JUMP_ABSOLUTE <offset> into toy_example
        ... original bytecode of toy_example ...
 
-By generating this `resume_at` function we force the remainder of the
+By generating this `resume_at` function, we force the remainder of the
 function to be executed in a new Python frame which recursively
 triggers TorchDynamo to restart its capture once execution reaches that
 point for the first time.
diff --git a/docs/source/dynamo/get-started.rst b/docs/source/dynamo/get-started.rst
index 44434d49e525d..fa1be5d43764d 100644
--- a/docs/source/dynamo/get-started.rst
+++ b/docs/source/dynamo/get-started.rst
@@ -1,9 +1,8 @@
 Getting Started
 ===============
 
-Let’s start with a simple example and make things more complicated step
-by step. Please note that you’re likely to see more significant speedups
-the newer your GPU is.
+Let’s start with a simple example. Note that you are likely to see more
+significant speedups the newer your GPU is.
 
 .. code:: python
 
@@ -15,32 +14,32 @@ the newer your GPU is.
        return a + b
    new_fn = optimize("inductor")(fn)
    input_tensor = torch.randn(10000).to(device="cuda:0")
-   a = new_fn()
+   a = new_fn(input_tensor, input_tensor)
 
 This example will not actually run faster. Its purpose is to demonstrate
 the ``torch.cos()`` and ``torch.sin()`` features which are
 examples of pointwise ops as in they operate element by element on a
-vector. A more famous pointwise op you might actually want to use would
+vector. A more famous pointwise op you might want to use would
 be something like ``torch.relu()``. Pointwise ops in eager mode are
-suboptimal because each one would need to need to read a tensor from
-memory, make some changes and then write back those changes. The single
+suboptimal because each one would need to read a tensor from
+memory, make some changes, and then write back those changes. The single
 most important optimization that inductor does is fusion. So back to our
 example we can turn 2 reads and 2 writes into 1 read and 1 write which
 is crucial especially for newer GPUs where the bottleneck is memory
-bandwidth (how quickly you can send data to a GPU) instead of compute
-(how quickly your GPU can crunch floating point operations)
+bandwidth (how quickly you can send data to a GPU) rather than compute
+(how quickly your GPU can crunch floating point operations).
 
 Another major optimization that inductor makes available is automatic
 support for CUDA graphs.
 CUDA graphs help eliminate the overhead from launching individual
-kernels from a python program which is especially relevant for newer GPUs.
+kernels from a Python program which is especially relevant for newer GPUs.
 
-dynamo supports many different backends but inductor specifically works
+TorchDynamo supports many different backends but inductor specifically works
 by generating `Triton <https://github.com/openai/triton>`__ kernels and
 we can inspect them by running ``TORCHINDUCTOR_TRACE=1 python trig.py``
 with the actual generated kernel being
 
-.. code:: python
+.. code-block:: python
 
    @pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
    @triton.jit
@@ -60,14 +59,14 @@ because the two ``sin`` operations occur within a single Triton kernel
 and the temporary variables are held in registers with very fast access.
 
 You can read up a lot more on Triton’s performance
-`here <https://openai.com/blog/triton/>`__ but the key is it’s in python
-so you can easily understand it even if you haven’t written all that
+`here <https://openai.com/blog/triton/>`__ but the key is it’s in Python
+so you can easily understand it even if you have not written all that
 many CUDA kernels.
 
-As a next step let’s try a real model like resnet50 from the PyTorch
+Next, let’s try a real model like resnet50 from the PyTorch
 hub.
 
-.. code:: python
+.. code-block:: python
 
    import torch
    import torch._dynamo as dynamo
@@ -75,21 +74,21 @@ hub.
    opt_model = dynamo.optimize("inductor")(model)
    model(torch.randn(1,3,64,64))
 
-And that’s not the only available backend, you can run in a REPL
-``dynamo.list_backends()`` to see all the available ones. Try out the
+And that is not the only available backend, you can run in a REPL
+``dynamo.list_backends()`` to see all the available backends. Try out the
 ``aot_cudagraphs`` or ``nvfuser`` next as inspiration.
 
 Let’s do something a bit more interesting now, our community frequently
 uses pretrained models from
 `transformers <https://github.com/huggingface/transformers>`__ or
 `TIMM <https://github.com/rwightman/pytorch-image-models>`__ and one of
-our design goals is for dynamo and inductor to work out of the box with
+our design goals is for Dynamo and inductor to work out of the box with
 any model that people would like to author.
 
-So we’re going to directly download a pretrained model from the
+So we will directly download a pretrained model from the
 HuggingFace hub and optimize it:
 
-.. code:: python
+.. code-block:: python
 
    import torch
    from transformers import BertTokenizer, BertModel
@@ -103,7 +102,7 @@ HuggingFace hub and optimize it:
    output = model(**encoded_input)
 
 If you remove the ``to(device="cuda:0")`` from the model and
-encoded_input then triton will generate C++ kernels that will be
+``encoded_input``, then Triton will generate C++ kernels that will be
 optimized for running on your CPU. You can inspect both Triton or C++
 kernels for BERT, they’re obviously more complex than the trigonometry
 example we had above but you can similarly skim it and understand if you
@@ -111,7 +110,7 @@ understand PyTorch.
 
 Similarly let’s try out a TIMM example
 
-.. code:: python
+.. code-block:: python
 
    import timm
    import torch._dynamo as dynamo
@@ -120,7 +119,8 @@ Similarly let’s try out a TIMM example
    opt_model = dynamo.optimize("inductor")(model)
    opt_model(torch.randn(64,3,7,7))
 
-Our goal with dynamo and inductor was to build the highest coverage ML compiler which should work with any model you throw at it.
+Our goal with Dynamo and inductor is to build the highest coverage ML compiler
+which should work with any model you throw at it.
 
 Existing Backends
 ~~~~~~~~~~~~~~~~~
@@ -129,42 +129,41 @@ TorchDynamo has a growing list of backends, which can be found in
 `backends.py <https://github.com/pytorch/pytorch/blob/master/torch/_dynamo/optimizations/backends.py>`__
 or ``torchdynamo.list_backends()`` each of which with its optional dependencies.
 
-Some of the most commonly used backend include:
+Some of the most commonly used backends include:
 
-* **Debugging backends**: \* ``dynamo.optimize("eager")`` - Uses PyTorch
+* **Debugging backends**:
+  * ``dynamo.optimize("eager")`` - Uses PyTorch
   to run the extracted GraphModule. This is quite useful in debugging
-  TorchDynamo issues. \* ``dynamo.optimize("aot_eager")`` - Uses
-  AotAutograd with no compiler, i.e, just using PyTorch eager for the
+  TorchDynamo issues.
+  * ``dynamo.optimize("aot_eager")`` - Uses
+  AotAutograd with no compiler, for example, just using PyTorch eager for the
   AotAutograd’s extracted forward and backward graphs. This is useful for
   debugging, and unlikely to give speedups.
 
-* **Training & inference backends**: \* ``dynamo.optimize("inductor")`` -
-  Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging
+* **Training & inference backends**:
+  * ``dynamo.optimize("inductor")`` - Uses ``TorchInductor`` backend
+  with AotAutograd and cudagraphs by leveraging
   codegened Triton kernels `Read
   more <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-compiler-with-define-by-run-ir-and-symbolic-shapes/747>`__
-
   * ``dynamo.optimize("nvfuser")`` - nvFuser with TorchScript. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
-
   * ``dynamo.optimize("aot_nvfuser")`` - nvFuser with AotAutograd. `Read more <https://dev-discuss.pytorch.org/t/tracing-with-primitives-update-1-nvfuser-and-its-primitives/593>`__
-
   * ``dynamo.optimize("aot_cudagraphs")`` - cudagraphs with AotAutograd. `Read more <https://github.com/pytorch/torchdynamo/pull/757>`__
 
-* **Inference-only backend**\ s: \* ``dynamo.optimize("ofi")`` - Uses
-  Torchscript optimize_for_inference. `Read
+* **Inference-only backends**:
+  * ``dynamo.optimize("ofi")`` - Uses
+  Torchscript ``optimize_for_inference``. `Read
   more <https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html>`__
-
-  * ``dynamo.optimize("fx2trt")`` - Uses Nvidia TensorRT for inferenc optimizations. `Read more <https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst>`__
-
+  * ``dynamo.optimize("fx2trt")`` - Uses Nvidia TensorRT for inference optimizations. `Read more <https://github.com/pytorch/TensorRT/blob/master/docsrc/tutorials/getting_started_with_fx_path.rst>`__
   * ``dynamo.optimize("onnxrt")`` - Uses ONNXRT for inference on CPU/GPU. `Read more <https://onnxruntime.ai/>`__ \* ``dynamo.optimize("ipex")`` - Uses IPEX for inference on CPU. `Read more <https://github.com/intel/intel-extension-for-pytorch>`__
 
 Why do you need another way of optimizing PyTorch code?
 -------------------------------------------------------
 
 While a number of other code optimization tools exist in the PyTorch
-ecosystem, each of them has its own flow. Here is a few examples of
-existing methods and their limitations:
+ecosystem, each of them has its own flow.
+Here is a few examples of existing methods and their limitations:
 
--  ``torch.jit.trace()`` is silently wrong if it cannot trace e.g:
+-  ``torch.jit.trace()`` is silently wrong if it cannot trace, for example:
    during control flow
 -  ``torch.jit.script()`` requires modifications to user or library code
    by adding type annotations and removing non PyTorch code
@@ -177,5 +176,3 @@ existing methods and their limitations:
    situations <./documentation/FAQ.md#do-i-still-need-to-export-whole-graphs>`__
    but allows a smoother transition where partial graphs can be
    optimized without code modification
-
-.. |image0| image:: ../_static/img/dynamo/TorchDynamo.png
diff --git a/docs/source/dynamo/guards-overview.rst b/docs/source/dynamo/guards-overview.rst
index 99a004ec221c3..a86cd202564b7 100644
--- a/docs/source/dynamo/guards-overview.rst
+++ b/docs/source/dynamo/guards-overview.rst
@@ -32,12 +32,11 @@ Where a complete example looks like this:
 This allows TorchDynamo to capture the interpreted Python frames, grab
 any and all relevant information, and speed things up wherever it can.
 The speedup comes from a few places, and can be rather dependent on the
-backend (my_compiler above) provided, but the one speedup we care about
-most for today’s overview is **caching**. Caching itself is not a direct
-speedup, so much as a critical enablement to allow us to prevent
+backend (`my_compiler` in the example above) provided, but the one speedup
+that is important in this section is **caching**. Caching itself is not
+a direct speedup but a critical enablement that prevents
 recompilation. We dig a hole with dynamo, and caching allows us to get
-out. Its a speedup from that perspective, but relatively neutral when
-all things are considered - however, it enables us to hold perf
+out. It enables us to hold perf
 neutrality while then enabling backends - the true source of our
 speedups.
 
@@ -48,39 +47,39 @@ With even a pass-through no-op backend provided:
    def my_compiler(gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
        return gm.forward
 
-We can see TorchDynamo speeding up Python execution quite a bit, even on
+We can see TorchDynamo speeding up Python execution even on
 regular Python, not just PyTorch.
 
 Caching and Guards Overview
 ---------------------------
 
 TorchDynamo operates through caching transformed (by TorchDynamo) user
-bytecode. When we receive a frame for evaluation, we check if the
+bytecode. When TorchDynamo receives a frame for evaluation, it checks if the
 **objects referenced in the frame have changed** in certain ways, and if
-not, we read the previously transformed user bytecode to evaluate it.
-The details of how we do this will be saved for a later writeup.
-Instead, we will focus on how we can identify whether or not the
+not, TorchDynamo reads the previously transformed user bytecode to evaluate it.
+In this section, we will focus on how we can identify whether or not the
 **objects referenced in the frame have changed**. This is a critical
 piece of functionality in TorchDynamo, because it drives the entire
-invalidation lifecycle. We refer to this functionality as **guards**.
-
-At a very high level, the vastly oversimplified TLDR flow is this:
-
-1) We receive a python frame
-2) We convert the given frame from (1), passing it through instruction
-   translation
-3) For the objects captured in (2), we create tracking objects that are
-   (a) tracked on an output graph, which is an internal specialization
-   of a torch.fx.Tracer (and the topic of a later writeup), and (b)
-   guards, the topic of this document.
-4) We process the guard objects created in (3), turning them into a
-   generated python function, check_fn, associated with a piece of code.
-5) The check_fn is evaluated whenever we encounter this code a
-   subsequent time - if a check_fn passes and evaluates to True, we know
-   the code in the cache and the code encountered here is the same, and
-   can be safely used. If it fails and evaluates to False, we know the
-   code in the cache is not valid, and can be thrown out in favor of a
-   new entry, through recompilation or a graph break.
+invalidation lifecycle. This functionality is called **guards**.
+
+At a very high level, the flow can be summarized like this:
+
+1. TorchDynamo receives a Python frame.
+2. It converts the frame (1) passing it through instruction
+   translation.
+3. For the objects captured in (2), TorchDynamo creates tracking objects that
+   are:
+   * tracked on an output graph, which is an internal specialization
+   of a `torch.fx.Tracer`
+   * guards
+4. TorchDynamo processes the guard objects created in (3), turning them into a
+   generated Python function, `check_fn`, associated with a piece of code.
+5. The `check_fn` is evaluated whenever we encounter this code a
+   subsequent time - if a `check_fn` passes and evaluates to `True`, TorchDynamo
+   identifies the code in the cache and the code encountered here as same, and
+   can be safely used. If it fails and evaluates to `False`, TorchDynamo
+   identifies the code in the cache as not valid, and can be thrown out in
+   favor of a new entry, through recompilation or a graph break.
 
 Python Frame Evaluation and PEP 523
 -----------------------------------
@@ -88,16 +87,14 @@ Python Frame Evaluation and PEP 523
 The functionality of TorchDynamo is based on
 `PEP 523 <https://peps.python.org/pep-0523/>`__.
 
-TorchDynamo installs a frame evaluation function on Python, via
-`_PyInterpreterState_SetEvalFrameFunc`. The overview of function
-selection, thread management, and cleanup is out of scope for this
-writeup, but the important part is that TorchDynamo has a hook where
+TorchDynamo installs a frame evaluation function on Python by using
+`_PyInterpreterState_SetEvalFrameFunc`. TorchDynamo has a hook where
 Python can hand control back to us during evaluation.
 
 The function we have installed is ``convert_frame`` or
 ``convert_frame_assert`` in the ``nopython=True`` case, but glossing
 over that nuance for now, let’s take a look at ``convert_frame_assert``,
-as ``convert_frame`` proxies to it anyway.
+as ``convert_frame`` proxies to it.
 
 We can find it on `line 20 of convert_frame.py
 <https://github.com/pytorch/torchdynamo/blob/main/torchdynamo/convert_frame.py#L200>`__,
@@ -108,8 +105,7 @@ with a signature as follows:
    def  convert_frame_assert(compiler_fn: Callable, one_graph=True):
 
 This function wraps the entry point of where Python invokes TorchDynamo
-with a frame, glossing over the nuances of ``wrap_convert_context`` for
-now:
+with a frame:
 
 .. code-block:: python
 
@@ -117,43 +113,43 @@ now:
 
 Here is what this function does:
 
-1) Checks if it has seen this ``code``\ (see: f_code `here
+1. Checks if it has seen this ``code``\ (see: f_code `here
    <https://docs.python.org/3/library/inspect.html>`__) before and exits
    early if it did.
-2) Checks if the code is an unsupported case.
-3) Checks if the ``cache_size`` (second arg above) crosses the limit
+2. Checks if the code is an unsupported case.
+3. Checks if the ``cache_size`` (second arg above) crosses the limit
    defined in the config, ``cache_size_limit``. If it has, the function
    drops the frame and logs warnings. This helps to avoid constant
    recompilation of a frame as it generally means that the frame is hot
    in an unexpected way and caching it produces needless overhead,
    as it is likely to get evicted the next time it is encountered.
-4) Passes the frame, alongside a function that creates an
+4. Passes the frame, alongside a function that creates an
    ``InstructionTranslator`` through bytecode
    transformation, via ``transform_code_object``. A few crucial things
    happen under the hood here:
 
-   1) New code is produced through ``transform_code_object``.
+   1. New code is produced through ``transform_code_object``.
 
-   2) An FX tracer named ``output`` is produced through
+   2. An FX tracer named ``output`` is produced through
       ``InstructionTranslator``.
 
       This can be a bit confusing,
       as ``InstructionTranslator`` is not an `fx` tracer, but its stored
       in a variable named tracer, and its output*\ **is**\ *an `fx`tracer.*
 
-   3) The function produces guards and stores them on ``output`` above.
+   3. The function produces guards and stores them on ``output`` above.
 
-   4) The function produces ``output_instructions`` and stores them on
+   4. The function produces ``output_instructions`` and stores them on
       ``output`` above.
 
-   5) The function maps the newly produced transformed code to the initial code it
+   5. The function maps the newly produced transformed code to the initial code it
       read off the frame. This mapping is worth remembering, we will
       refer to it much later on below where we cover guard failures.
 
-5) Using the transformed code from 4.1 and the guards from 4.3
+5. Using the transformed code from 4.1 and the guards from 4.3,
    the function produces a `GuardedCode`.
 
-Now that we have learned about frame evoluation, let’s review
+Now that we have learned about frame evaluation, let’s review
 ``InstructionTranslator``, and see how it turns the frame we handed
 it over into TorchDynamo internal types.
 
@@ -163,7 +159,7 @@ InstructionTranslator
 `InstructionTranslator` does a lot! We won’t cover the details of
 everything it does, but most importantly for this document, it produces
 a mapping of ``symbolic_locals`` which maintains a mapping from the
-frame’s f_locals to TorchDynamo internal Variable objects (more on these
+frame’s ``f_locals`` to TorchDynamo internal Variable objects (more on these
 in a moment. ``symbolic_locals`` is filled via traversing the frame’s
 locals:
 
@@ -175,9 +171,7 @@ locals:
        if k in f_locals
    )
 
-We will get to how this works later, from a few other examples that lead
-us to understanding ``VariableTracker`` and ``VariableBuilder``. The
-important component here, for us, for now, is the invocation of a call
+The important component here  is the invocation of a call
 into ``VariableBuilder``. ``VariableBuilder``\ ’s call implementation
 proxies into a function called ``_wrap``, which in turn both constructs
 instances of ``VariableTracker`` and calls ``make_guards`` on them. More
@@ -194,15 +188,15 @@ How does ``InstructionTranslator`` do this? At the heart of it, there is
 a loop that is pumped, which drives a function ``step``.
 
 ``step`` is just that - a single processing step, taking exactly one
-instruction and doing *something* with it. Note: These are real
-instructions processed by TorchDynamo’s ``transform_code_object``, and
-it’s pretty cool.
+instruction and doing *something* with it.
+
+.. note:: These are real instructions processed by TorchDynamo’s
+   ``transform_code_object``, and it is pretty cool.
 
 .. note:: This section purposly skips the details of
-   `dis.get_instructions <https://docs.python.org/3/library/dis.html>`__,
-   and how we set up the ``Instruction`` class.
+   `dis.get_instructions <https://docs.python.org/3/library/dis.html>`__.
 
-For the toy example above, here is a snippet of a what a few
+For the example above, here is a snippet of a what a few
 ``Instruction``\'s may look like:
 
 .. code-block:: python
@@ -220,13 +214,13 @@ and then take a look at this little snippet from inside ``step``;
        unimplemented(f"missing: {inst.opname}")
    getattr(self, inst.opname)(inst)
 
-As we can see, we check if the current class, the
-``InstructionTranslator`` has a attribute set matching the operator name
-(ex: LOAD_CONST). If it does, we invoke it, passing the whole
-instruction object in. If it does not, we drop the frame as
+As we can see, the function checks if the current class, the
+``InstructionTranslator`` has an attribute set matching the operator name
+(for example, ``LOAD_CONST``). If it does, the function invokes it, passing the
+whole instruction object in. If it does not, the function drops the frame as
 unimplemented.
 
-For the LOAD_CONST example, we can see that we do indeed support it,
+For the ``LOAD_CONST`` example, we can see that we do indeed support it,
 with a relatively straightforward definition:
 
 ::
@@ -234,13 +228,12 @@ with a relatively straightforward definition:
    def  LOAD_CONST(self, inst):
    self.push(ConstantVariable(value=inst.argval))
 
-Passing over, for now, on the other details of ``InstructionTranslator``
-we can see that this function creates a new instance of the class
+We can see that this function creates a new instance of the class
 ``ConstantVariable`` , with a value, in our example case, -1, and then
 pushes it onto the stack.
 
-There are dozens of such methods - see symbolic_convert.py for all of
-them. Generally, we implement as many matching methods to python
+There are dozens of such methods - see ``symbolic_convert.py`` for all of
+them. Generally, we implement as many matching methods to Python
 bytecode instructions as possible.
 
 Across both the logic downstream of ``step`` and the logic from invoking
@@ -252,50 +245,50 @@ Variables
 ---------
 
 A ``ConstantVariable`` is an instance of\ ``VariableTracker``.
-``VariableTracker`` represents a tracked python local or stack value.
+``VariableTracker`` represents a tracked Python local or stack value.
 
 When it comes to representing an object inside TorchDynamo, a
-VariableTracker does exactly what it says - it tracks a given variable.
-Its an extremely flexible class, but there are a few points to keep in
+``VariableTracker`` does exactly what it says - it tracks a given variable.
+It is an extremely flexible class, but there are a few points to keep in
 mind:
 
 -  It manages the ``guard`` relationship around the underlying object
    through:
 
-   -  `make_guard`
-   -  `replace_guards`
-   -  `add_guard(s)`
-   -  `propagate` - ``propagate(*vars: List[List["VariableTracker"]])`` -
+   -  ``make_guard``
+   -  ``replace_guards``
+   -  ``add_guard(s)``
+   -  ``propagate`` - ``propagate(*vars: List[List["VariableTracker"]])`` -
       Perhaps the most important of all, in that it combines guards from
-      all the provided VariableTracker instances passed in. It visits
+      all the provided ``VariableTracker`` instances passed in. It visits
       the guards and combines the guards from these onto itself.
 
 -  It acts as a proxy on behalf of the underlying object, implementing
    methods for the rest of TorchDynamo to get information about the
    tracked object:
 
-   -  `call_method`
-   -  `call_function`
-   -  `python_type`
-   -  `as_proxy`
-   -  `is/as_python_proxy`
+   -  ``call_method``
+   -  ``call_function``
+   -  ``python_type``
+   -  ``as_proxy``
+   -  ``is/as_python_proxy``
 
 -  It stores the variable ``source`` of type ``Source``, from
-   torchdynamo/source.py. This source type is a relatively self
-   contained class to help us organize and bookeep where the original
+   ``torchdynamo/source.py``. This source type is a relatively self
+   contained class that helps us organize and bookeep where the original
    source came from, and helps provide convenience methods for things
    like getting the name, and importantly for us, producing guards.
 
 And this class (``VariableTracker``) is built around subclassing,
 somewhere between a full Abstract Base Class and fully fleshed out class
-- it leaves many methods raising NotImplementedError - with reliance on
-subclasses (see: torchdynamo/variables/ for all subclasses) to fulfill
+- it leaves many methods raising ``NotImplementedError`` - with reliance on
+subclasses. See ``torchdynamo/variables/`` for all subclasses to fulfill
 contracts and custom behaviors.
 
 Knowing what we know now, we can see an example of how an instruction
-from ``dis``, ``BUILD_TUPLE``
+from ``dis``, ``BUILD_TUPLE``:
 
-   BUILD_TUPLE(count) Creates a tuple consuming count items from the
+   ``BUILD_TUPLE(count)`` Creates a tuple consuming count items from the
    stack, and pushes the resulting tuple onto the stack.
 
 In our case, our signature will be a *little* different due to the way
@@ -311,33 +304,35 @@ python objects into TorchDynamo notions:
        options = VariableTracker.propagate(items)
        self.push(TupleVariable(items, **options))
 
-What is happening here? 1) We read argval, which in this case, is
-analogous to ``counts`` in the pydoc for the equivalent instruction.
+Here is what this code does:
+
+1. The function reads ``argval``, which in this case, is
+   analogous to ``counts`` in the pydoc for the equivalent instruction.
 
-2) We ``popn`` the items, in this case, the signature is
+2. The function ``popn`` the items, in this case, the signature is
    ``def  popn(self, n: int) -> List[TensorVariable]:`` this hints at an
    underlying contract - we are returning ``TensorVariables``. If we
-   take a closer look at sybmolic_convert.py and
+   take a closer look at ``sybmolic_convert.py`` and
    ``InstructionTranslatorBase``/``InstructionTranslator``\ we see that
    the only thing pushed onto and popped from our stack are
    ``VariableTracker``\ s.
 
-3) We call ``VariableTracker.propogate`` (remember it, from above?) This
+3) The function calls ``VariableTracker.propogate``. This
    takes the guards from every single item popped off the stack in 2,
    and recursively traverses it and combines all the guards into
    ``options``: ``py  return {      "guards": guards,  }``
 
-4) We then make a new instance of a ``VariableTracker``,
+4) The function then makes a new instance of a ``VariableTracker``,
    ``TupleVariable``\ out of the ``items`` and ``options``. This then
    allows us to install all the appropriate guards from the ``items``
    that make up the new ``TupleVariable``
 
-Note: You may wonder - where did the first guards come from? Propagation
-is good and all, but don’t we need something created before it can be
-propagated. Yes! Remember that ``VariableBuilder`` above? It calls
-``make_guards`` as it creates ``VariableTracker`` instances, from
-``f_locals``. This in turn calls into the ``source``, to have it create
-guards.
+.. note:: Where did the first guards come from? Propagation
+   is a good technique, but we need something created before it can be
+   propagated. ``VariableBuilder`` calls
+   ``make_guards`` as it creates ``VariableTracker`` instances, from
+   ``f_locals``. This in turn calls into the ``source``, to have it create
+   guards.
 
 After all this, bytecode translation is done and we are one step closer
 to producing ``GuardedCode``. We now understand how locals become
@@ -345,14 +340,14 @@ to producing ``GuardedCode``. We now understand how locals become
 are called on for creation. Before we can go into seeing how code and
 guards are combined into a GuardedCode object, we need to dig a little
 bit into those ``make_guard`` and ``source.make_guard`` calls above. We
-can then understand, really, what was going on when we made guards
+can then understand, what was going on when we made guards
 alongside, and on, ``VariableTracker`` instances.
 
 Making Guards
 -------------
 
-Guards are just python objects, of the class ``Guard``, however, theres
-a good amount of detail around this little class.
+Guards are just Python objects, of the class ``Guard``. Let's look at them
+in more detail.
 
 Looking at the definition of the dataclass (and therefore, ctor
 signature), we see that it has a name, a source, and a create function.
@@ -368,14 +363,15 @@ signature), we see that it has a name, a source, and a create function.
 The name should be the name of the variable.
 
 The source here is an enum indicating what *kind* of source the guard
-belongs to [Note: not to be confused with ``Source`` and the other types
-in source.py, as stored on ``VariableTracker``, as discussed above]
+belongs to.
+
+.. note:: Not to be confused with ``Source`` and the other types
+   in ``source.py``, as stored on ``VariableTracker``.
 
-And create_fn is the heart of how we go from having this simple
-dataclass to actually producing valid python code to be invoked for
+``create_fn`` provides the main functionality to transition from a simple
+dataclass to actually producing valid Python code to be invoked for
 knowing whether or not things have changed in between invocations, and
-whether we can safely read from the code cache or not (In case you
-forgot what all this was for!)
+whether we can safely read from the code cache or not.
 
 The most common code paths for getting an instance of a guard are
 through ``make_guards`` on ``VariableTracker``.
@@ -391,7 +387,7 @@ Or, in a concrete example:
        return RangeVariable(value=value, guards=guards)
 
 Since ``source`` was set at the construction time of this
-``VariableTracker``, all that was needed here was to provide the fn,
+``VariableTracker``, all that was needed here was to provide the ``fn``,
 ``GuardBuilder.EQUALS_MATCH`` to the ``create_fn`` field.
 
 This ``create_fn`` must be a method on ``GuardBuilder``. The reason for
@@ -399,11 +395,10 @@ this becomes apparent in our next step. Once we have all the guards
 created for a frame, we move on to ``CheckFunctionManager`` and
 ``compile_check_fn``.
 
-Remember that ``convert_frame`` function way above, in the first
-section? Before it can produce a ``GuardedCode``, it needs to run the
-``CheckFunctionManager``, with all the guards, to produce a ``check_fn``
-which will then, in turn get passed in alongside the code into
-``GuardedCode``. This is the same ``check_fn`` that we store in our
+Before the ``convert_frame`` function can produce a ``GuardedCode``,
+it needs to run the ``CheckFunctionManager``, with all the guards, to
+produce a ``check_fn`` which will then, in turn get passed in alongside
+the code into ``GuardedCode``. This is the same ``check_fn`` that we store in our
 cache entry, and the same one we run to know whether or not to retrieve
 the code stored alongside. For reference, here is that code:
 
@@ -449,14 +444,14 @@ which in turn gets us to the most important bit: appending code.
 
 At its simplest, ``EQUALS_MATCH`` appends just one line of code:
 ``self.code.append(f"{ref} == {val!r}")``. Where ``ref`` is the name of
-the variable, and val is the value. It might produce code like this:
+the variable, and ``val`` is the value. It might produce code like this:
 
 .. code-block::
 
    y == 2
 
-Pretty simple, but if we append a few other kinds of ``GuardBuilder``
-functions on (For a more complex case), and then combine them all with
+This is a basic example. But if we append a few other kinds of ``GuardBuilder``
+functions and then combine them all with
 ``and`` in between each statement (as we do), we might get something
 like this:
 
@@ -464,25 +459,28 @@ like this:
 
    ___guarded_code.valid and ___check_type_id(y, 94367738391392) and y == 2 and ___check_tensors(x)
 
-Now we’re talking! Let’s see what we have here: 1) A check for
-``.valid`` (we will come back to invalidation later on) 2) A type id
-check 3) A value check 4) A tensor check
+Here is what this code performs:
 
-This becomes the heart of the code our ``check_fn``, which in turn, as
-you recall, is evaluated the **next** time we encounter this code. It
+1. A check for ``.valid``
+2. A type ID check
+3. A value check
+4. A tensor check
+
+This becomes the heart of the code our ``check_fn``, which in turn
+is evaluated the **next** time we encounter this code. It
 will then check:
 
-1) Is this code still valid?
-2) If (1), Does ``y`` still have a type of ``94367738391392``?
-3) If (2), is ``y`` still 2?
-4) If (3), let’s check on if tensor ``x`` changed in some specific ways
+1. Is this code still valid?
+2. If (1), Does ``y`` still have a type of ``94367738391392``?
+3. If (2), is ``y`` still 2?
+4. If (3), let’s check on if tensor ``x`` changed in some specific ways.
 
 If all of these are still true, then we can use the code cached
-alongside this ``check_fn``! Joyous day! [Note: a deeper dive for how
-and where this happens if saved for a later writeup, but reading
-``static PyCodeObject *lookup(CacheEntry *e, PyObject *f_locals) {`` of
-``_eval_frame.c`` is a good place to start for the inquisitive reader
-who has made it thus far].
+alongside this ``check_fn``.
+
+.. note:: For a deeper dive for how and where this happens
+   you can read ``static PyCodeObject *lookup(CacheEntry *e, PyObject *f_locals) {`` of
+   ``_eval_frame.c``.
 
 If not, then, we can move on to recompiling the code anew, and storing
 that in the cache alongside this code, and a whole new ``check_fn``,
@@ -490,24 +488,24 @@ again to be checked on yet another subsequent frame.
 
 There are lots of other such functions on ``GuardBuilder`` which get
 coalesced into, at times massive, strings which then get evaluated as
-python code and stored into ``check_fn``. Our example above is
-illustrative of a simple case, but I urge you to read the other
-functions on ``GuardBuilder``, or better yet, dump the ``code`` variable
-in ``compile_check_fn`` to really see what’s getting produced,
-especially on larger, real models!
+Python code and stored into ``check_fn``. The example above
+illustrates of a simple case. To understand this functionality better, read
+the other functions on ``GuardBuilder``, or better yet, dump the ``code`` variable
+in ``compile_check_fn`` to see what is getting produced,
+especially on larger, real models.
 
 Summary
 -------
 
-In this, we have glossed over: - The role of ``.valid`` and invalidation
-around weak references (and potentially soon to be NN Module
-invalidations) - How the C++ side of guard functions
-(``___check_type_id``, ``___check_tensors``, etc) operate - What happens
-when guards fail? - What happens if we produce invalid guard code?
+In this section, we have reviewed:
+
+- The role of ``.valid`` and invalidation around weak references (and potentially soon to be NN Moduleinvalidations).
+- How the C++ side of guard functions (``___check_type_id``, ``___check_tensors``, etc) operate
+- What happens when guards fail.
+- What happens if we produce invalid guard code.
 
-Despite all that, I hope this has been a useful read. We covered how
-user provided code, wrapped in a TorchDynamo context goes on to get
-traced and tracked internally, organized into ``VariableTracker``\ s
+We covered how user provided code wrapped in a TorchDynamo context
+goes on to get traced and tracked internally, organized into ``VariableTracker``\ s
 ``Source``\ s and subsequently ``Guard``\ s, and how those ``Guards`` in
 turn guide cache entry selection and invalidation when handing Python
 code.
diff --git a/docs/source/dynamo/index.rst b/docs/source/dynamo/index.rst
index d34f6a7d27552..506880981b018 100644
--- a/docs/source/dynamo/index.rst
+++ b/docs/source/dynamo/index.rst
@@ -1,5 +1,5 @@
-TorchDynamo Documentation
-=========================
+TorchDynamo Overview
+====================
 
 **TorchDynamo** is a Python-level JIT compiler designed to make unmodified
 PyTorch programs faster. TorchDynamo hooks into the frame evaluation API
@@ -10,7 +10,7 @@ operations into an `FX Graph <https://pytorch.org/docs/stable/fx.html>`__
 which is then just-in-time compiled with a customizable backend.
 It creates this FX Graph through bytecode analysis and is designed to
 mix Python execution with compiled backends to get the best of both
-worlds: usability and performance.
+worlds — usability and performance.
 
 TorchDynamo makes it easy to experiment with different compiler
 backends to make PyTorch code faster with a single line decorator
@@ -18,7 +18,7 @@ backends to make PyTorch code faster with a single line decorator
 
 .. image:: ../_static/img/dynamo/TorchDynamo.png
 
-For more information about `TorchInductor`, one of the backends
+`TorchInductor` is one of the backends
 supported by `TorchDynamo Graph <https://pytorch.org/docs/stable/fx.html>`__
 into `Triton <https://github.com/openai/triton>`__ for GPUs or
 `C++/OpenMP <https://www.openmp.org/>`__ for CPUs. We have a
@@ -33,6 +33,7 @@ dev-discuss <https://dev-discuss.pytorch.org/t/torchinductor-a-pytorch-native-co
    * `dev-discuss topics <https://dev-discuss.pytorch.org/search?q=TorchDynamo%20order%3Alatest>`__
 
 .. toctree::
+   :maxdepth: 1
    :hidden:
 
    installation
diff --git a/docs/source/dynamo/installation.rst b/docs/source/dynamo/installation.rst
index 6d1b09f0415ac..687e9b072bafe 100644
--- a/docs/source/dynamo/installation.rst
+++ b/docs/source/dynamo/installation.rst
@@ -2,28 +2,30 @@ Installing TorchDynamo
 ======================
 
 This section describes how to install TorchDynamo.
+TorchDynamo is included in the nightly binaries of PyTorch. For
+more information, see `Getting Started <https://pytorch.org/get-started/locally/>`__.
 
-Requirements and Setup
-----------------------
+Requirements
+------------
 
-Python 3.8 is recommended. Python 3.7 through 3.10 are supported and
-tested. Make sure to have a development version of Python installed
-locally as well.
+You must have the following prerequisites to use TorchDynamo:
 
-TorchDynamo is included in the nightly binaries of PyTorch. You can
-find more information `here <https://pytorch.org/get-started/locally/>`__
+* A Linux or macOS environment
+* Python 3.8 (recommended). Python 3.7 through 3.10 are supported and
+  tested. Make sure to have a development version of Python installed
+  locally as well.
 
-Install GPU/CUDA version requirements
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+GPU/CUDA Requirements
+~~~~~~~~~~~~~~~~~~~~~
 
-To use GPU back ends (and in particular Triton), please make sure that
+To use GPU back ends, and in particular Triton, make sure that
 the CUDA that you have installed locally matches the PyTorch version you
 are running.
 
-The following command installs GPU PyTorch+TorchDynamo along with GPU
+The following command installs GPU PyTorch + TorchDynamo along with GPU
 TorchDynamo dependencies (for CUDA 11.7):
 
-.. code-block:: python
+.. code-block:: shell
 
    pip3 install numpy --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
 
@@ -31,19 +33,19 @@ CPU requirements
 ~~~~~~~~~~~~~~~~
 
 There are no additional requirements for CPU TorchDynamo. CPU
-TorchDynamo is included in the nightly versions of PyTorch, which, for
-reference, can be installed with the following command:
+TorchDynamo is included in the nightly versions of PyTorch.
+To install, run the following command:
 
 .. code-block:: shell
 
    pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 
 
-Install from local source
+Install from Local Source
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Build PyTorch from source:
-https://github.com/pytorch/pytorch#from-source, which has TorchDynamo
+Alternatively, you can build PyTorch from `source
+<https://github.com/pytorch/pytorch#from-source>`__, which has TorchDynamo
 included.
 
 To install GPU TorchDynamo dependencies, run ``make triton`` in the
@@ -53,30 +55,30 @@ Verify Installation
 ~~~~~~~~~~~~~~~~~~~
 
 If you built PyTorch from source, then you can run the following
-commands (from the PyTorch repo root directory) that run minimal
-examples to check that TorchDynamo is installed correctly:
+commands (from the PyTorch repo root directory)
+to check that TorchDynamo is installed correctly:
 
-.. code:: shell
+.. code-block:: shell
 
    cd tools/dynamo
    python verify_dynamo.py
 
 If you do not have the PyTorch source locally, you can alternatively
 copy the script (``tools/dynamo/verify_dynamo.py``) from the PyTorch
-repo and run it locally.
+repository and run it locally.
 
-Docker installation
+Docker Installation
 -------------------
 
 We also provide all the required dependencies in the PyTorch nightly
-binaries which you can download with
+binaries which you can download with the following command:
 
 .. code-block::
 
    docker pull ghcr.io/pytorch/pytorch-nightly
 
 And for ad hoc experiments just make sure that your container has access
-to all your GPUs
+to all your GPUs:
 
 .. code-block:: bash
 
diff --git a/docs/source/dynamo/troubleshooting.rst b/docs/source/dynamo/troubleshooting.rst
index 8542d02bfa9bc..3fb33d91ddef8 100644
--- a/docs/source/dynamo/troubleshooting.rst
+++ b/docs/source/dynamo/troubleshooting.rst
@@ -9,7 +9,7 @@ support for `tracing dynamic tensor
 shapes <https://docs.google.com/document/d/1QJB-GOnbv-9PygGlOMXwiO9K6vVNm8sNg_olixJ9koc/edit?usp=sharing>`__,
 more careful choices for guards and better tuned heuristics.
 
-In the mean time, you may need to diagnose a particular issue and
+In the meantime, you may need to diagnose a particular issue and
 determine if it is easy to work around with a change to your model, or
 file an issue for support.
 
@@ -69,7 +69,7 @@ Diagnosing Runtime Errors
 Below is the TorchDynamo compiler stack.
 
 At a high level, the TorchDynamo stack consists of a graph capture from
-Python code (TorchDynamo) and a backend compiler. In this example the
+Python code (TorchDynamo) and a backend compiler. In this example, the
 backend compiler consists of backward graph tracing (AOTAutograd) and
 graph lowering (TorchInductor)*. Errors can occur in any component of
 the stack and will provide full stack traces.
@@ -77,7 +77,7 @@ the stack and will provide full stack traces.
 You may use info logging
 (``torch._dynamo.config.log_level = logging.INFO``) and look for
 ``Step #: ...`` outputs in order to determine in which component the
-error occurred in. Logs are made at the beginning and end of each step,
+error has occurred. Logs are made at the beginning and end of each step,
 so the step that an error should correspond to is the most recent logged
 step whose end has not yet been logged. The steps correspond to the
 following parts of the stack (according to the image above):
@@ -108,15 +108,16 @@ generated. These are the following:
    forward and backward graphs. This is useful to narrow down the issue
    to AOTAutograd.
 
-The general procedure to narrow down an issue is the following: 1. Run
-your program with the ``"eager"`` backend. If the error no longer
-occurs, the issue is in the backend compiler that is being used (if
-using TorchInductor, proceed to step 2, if not, see `this
-section <#minifying-backend-compiler-errors>`__). If the error still
-occurs with the ``"eager"`` backend, it is an `error while running
-torchdynamo <#torchdynamo-errors>`__.
+The general procedure to narrow down an issue is the following:
 
-2. This step is only necessary if TorchInductor is used as the backend
+1. Run your program with the ``"eager"`` backend. If the error no longer
+   occurs, the issue is in the backend compiler that is being used (if
+   using TorchInductor, proceed to step 2. If not, see `this
+   section <#minifying-backend-compiler-errors>`__). If the error still
+   occurs with the ``"eager"`` backend, it is an `error while running
+   torchdynamo <#torchdynamo-errors>`__.
+
+2. This step is only necessary if ``TorchInductor`` is used as the backend
    compiler. Run the model with the ``"aot_eager"`` backend. If this
    backend raises an error then the error is occurring during
    AOTAutograd tracing. If the error no longer occurs with this backend,
@@ -125,20 +126,20 @@ torchdynamo <#torchdynamo-errors>`__.
 
 Each of these cases are analyzed in the following sections.
 
-\*Note on TorchInductor naming: The TorchInductor backend consists of
-both AOTAutograd tracing and the TorchInductor compiler itself. We will
-disambiguate by referring to TorchInductor as the backend, and
-TorchInductor lowering as the phase which lowers the graph traced by
-AOTAutograd.
+.. note:: The TorchInductor backend consists of
+   both AOTAutograd tracing and the TorchInductor compiler itself. We will
+   disambiguate by referring to ``TorchInductor`` as the backend, and
+   TorchInductor lowering as the phase which lowers the graph traced by
+   AOTAutograd.
 
 Torchdynamo Errors
 ------------------
 
 If the error that is generated occurs with the ``"eager"`` backend, then
-torchdynamo is the most likely source of the error. Here is example code
+TorchDynamo is the most likely source of the error. Here is a sample code
 which will generate an error.
 
-.. code:: py
+.. code-block:: py
 
    import torch
 
@@ -174,36 +175,36 @@ Which will generate the following error:
 
 As the message suggests you can set
 ``torch._dynamo.config.verbose=True`` to get a full stack trace to both
-the error in torchdynamo and the user code. In addition to this flag,
+the error in TorchDynamo and the user code. In addition to this flag,
 you can also set the ``log_level`` of torchdynamo through
 ``torch._dynamo.config.log_level``. The available levels are the
-following: - ``logging.DEBUG``: Print every instruction that is
-encountered in addition to all below log levels - ``logging.INFO``:
+following:
+- ``logging.DEBUG``: Print every instruction that is
+encountered in addition to all below log levels.
+- ``logging.INFO``:
 Print each function that is compiled (original and modified bytecode)
-and the graph that is captured in addition to all below log levels -
-``logging.WARNING`` (default): Print graph breaks in addition to all
-below log levels - ``logging.ERROR``: Print errors only
+and the graph that is captured in addition to all below log levels.
+- ``logging.WARNING`` (default): Print graph breaks in addition to all
+below log levels.
+- ``logging.ERROR``: Print errors only.
 
 If a model is sufficiently large, the logs can become overwhelming. If
-an error occurs deep within a model’s python code, it can be useful to
+an error occurs deep within a model’s Python code, it can be useful to
 execute only the frame in which the error occurs to enable easier
-debugging. There are two tools available to enable this: - Setting the
-environment variable TORCHDYNAMO_DEBUG_FUNCTION to the desired function
-name will only run torchdynamo on functions with that name. - There is a
-record/replay tool (set
-``torch._dynamo.config.replay_record_enabled = True``) which dumps an
-execution record when an error is encountered. This record can then be
-replayed to run only the frame where an error occurred.
+debugging. There are two tools available to enable this:
+
+- Setting the environment variable ``TORCHDYNAMO_DEBUG_FUNCTION`` to the desired function name will only run torchdynamo on functions with that name.
+- Enabling the record/replay tool (set ``torch._dynamo.config.replay_record_enabled = True``) which dumps anexecution record when an error is encountered. This record can then be replayed to run only the frame where an error occurred.
 
 TorchInductor Errors
 --------------------
 
-If the error doesn’t occur with the ``"eager"`` backend, then the
+If the error does not occur with the ``"eager"`` backend, then the
 backend compiler is the source of the error (`example
 error <https://gist.github.com/mlazos/2f13681e3cc6c43b3911f336327032de%5D>`__).
 There are `different
 choices <https://github.com/pytorch/torchdynamo/blob/0b8aaf340dad4777a080ef24bf09623f1aa6f3dd/README.md#existing-backends>`__
-for backend compilers for torchdynamo, with TorchInductor or nvfuser
+for backend compilers for TorchDynamo, with TorchInductor or nvfuser
 fitting the needs of most users. This section focuses on TorchInductor
 as the motivating example, but some tools will be usable with other
 backend compilers.
@@ -212,7 +213,7 @@ Below is the portion of the stack which we are focusing on:
 
 With TorchInductor as the chosen backend, AOTAutograd is used to
 generate the backward graph from the forward graph captured by
-torchdynamo. It’s important to note that errors can occur during this
+torchdynamo. It is important to note that errors can occur during this
 tracing and also while TorchInductor lowers the forward and backward
 graphs to GPU code or C++. A model can often consist of hundreds or
 thousands of FX nodes, so narrowing the exact nodes where this problem
@@ -225,9 +226,9 @@ TorchInductor lowering. As mentioned above in step 2, the
 without lowering. If the error still occurs with this backend, this
 indicates that the error is occurring during AOTAutograd tracing.
 
-Here’s an example:
+Here is an example:
 
-.. code:: py
+.. code-block:: py
 
    import torch
 
@@ -246,8 +247,8 @@ Here’s an example:
 
    test_backend_error()
 
-Running this should give you this error (with a longer stack trace below
-it)
+Running this should give you this error with a longer stack trace below
+it:
 
 ::
 
@@ -274,9 +275,9 @@ Minifying TorchInductor Errors
 ------------------------------
 
 From here, let’s run the minifier to get a minimal repro. Setting the
-environment variable TORCHDYNAMO_REPRO_AFTER=“aot” (or setting
+environment variable ``TORCHDYNAMO_REPRO_AFTER=“aot”`` (or setting
 ``torch._dynamo.config.repro_after="aot"`` directly) will generate a
-python program which reduces the graph produced by AOTAutograd to the
+Python program which reduces the graph produced by AOTAutograd to the
 smallest subgraph which reproduces the error. (See below for an example
 where we minify the graph produced by torchdynamo) Running the program
 with this environment variable should show nearly `identical
@@ -291,7 +292,7 @@ If the minifier runs successfully, it generates runnable python code
 which reproduces the exact error. For our example this is the following
 code:
 
-.. code:: py
+.. code-block:: python
 
    import torch
    from torch import tensor, device
@@ -349,7 +350,7 @@ caveat. Namely, that the minifier will now be run on the graph that is
 traced by TorchDynamo, not the output graph of AOTAutograd. Let’s walk
 through an example.
 
-.. code:: py
+.. code-block:: py
 
    import torch
 
@@ -377,15 +378,16 @@ through an example.
    test_backend_error()
 
 In order to run the code after TorchDynamo has traced the forward graph,
-the TORCHDYNAMO_REPRO_AFTER enviornment variable can be used. Running
-this program with TORCHDYNAMO_REPRO_AFTER=“dynamo” (or
+you can use the ``TORCHDYNAMO_REPRO_AFTER`` enviornment variable. Running
+this program with ``TORCHDYNAMO_REPRO_AFTER=“dynamo”`` (or
 ``torch._dynamo.config.repro_after="dynamo"``) should produce `this
 output <https://gist.github.com/mlazos/244e3d5b53667e44078e194762c0c92b>`__\ and
 the following code in ``{torch._dynamo.config.base_dir}/repro.py``.
-Note: the other option for TORCHDYNAMO_REPRO_AFTER are ``"aot"``, which
-will run the minifier after the backward graph has been generated.
 
-.. code:: py
+.. note:: The other option for TORCHDYNAMO_REPRO_AFTER are ``"aot"``, which
+   will run the minifier after the backward graph has been generated.
+
+.. code-block:: python
 
    import torch
    import torch._dynamo as dynamo
@@ -445,7 +447,7 @@ TorchInductor Debug Tracing
 TorchInductor has a builtin stats and trace function for displaying time
 spent in each compilation phase, output code, output graph visualization
 and IR dump. This is a debugging tool designed to make it easier to
-debug/understand the internals of TorchInductor.
+understand and troubleshoot the internals of TorchInductor.
 
 Setting the environment variable ``TORCHINDUCTOR_TRACE=1`` will cause a
 debug trace directory to be created and printed:
@@ -467,7 +469,7 @@ for the test program:
            torch.nn.ReLU(),
        )
 
-Note each file in that debug trace can be enabled/disabled via
+Each file in that debug trace can be enabled and disabled through
 ``torch._inductor.config.trace.*``. The profile and the diagram are both
 disabled by default since they are expensive to generate.
 
@@ -502,10 +504,11 @@ See the `example debug directory
 output <https://gist.github.com/jansel/f4af078791ad681a0d4094adeb844396>`__
 for more examples.
 
-Memory Profiling
-----------------
+..
+  _Memory Profiling
+  ----------------
 
-TBD
+  TBD
 
 Graph Breaks
 ------------
@@ -532,7 +535,7 @@ ensure that the compiled program would be safe to reuse. Graph breaks
 can hinder performance if the resulting fragments are small. To maximize
 performance, it’s important to have as few graph breaks as possible.
 
-Identifying the cause of a graph break
+Identifying the Cause of a Graph Break
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 To identify all graph breaks in a program and the associated reasons for
@@ -564,15 +567,15 @@ that are encountered. Here is an example usage:
        if b.sum() < 0:
     """
 
-Note on other outputs: - ``out_guards`` - a list of lists where each
-sublist contains the guards that must pass to ensure the traced graphs
-are valid - ``graphs`` - a list of graph modules which were successfully
-traced - ``ops_per_graph`` - a list of lists where each sublist contains
-the ops thatare run in the graph
+Outputs include:
+
+- ``out_guards`` - a list of lists where each sublist contains the guards that must pass to ensure the traced graphs are valid.
+- ``graphs`` - a list of graph modules which were successfully traced.
+- ``ops_per_graph`` - a list of lists where each sublist contains the ops that are run in the graph.
 
-To throw an error on the first graph break encountered, ``nopython``
-mode can be used. This disables TorchDynamo’s python fallback, and only
-succeeds if the entire program is convertible to a single graph. Example
+To throw an error on the first graph break encountered, use the ``nopython``
+mode. This mode disables TorchDynamo’s Python fallback, and only
+succeeds if the entire program is convertible into a single graph. Example
 usage:
 
 .. code-block:: python
@@ -595,7 +598,7 @@ failing and what part of your program is triggering it.
 
 The `recompilation profiler <#recompilation-profiler>`__ automates the
 process of setting TorchDynamo’s cache limit to 1 and running your
-program under an observation-only ‘compiler’ that records the causes of
+program under an observation-only 'compiler' that records the causes of
 any guard failures. You should be sure to run your program for at least
 as long (as many iterations) as you were running when you ran into
 trouble, and the profiler will accumulate statistics over this duration.
@@ -611,7 +614,7 @@ cost of recompilation outweighs any optimization benefits.
 
 Torchdynamo plans to support many common cases of dynamic tensor shapes,
 such as varying batch size or sequence length. It does not plan to
-support rank-dynamism. In the mean time, setting a specific cache limit
+support rank-dynamism. In the meantime, setting a specific cache limit
 can be used in coordination with bucketing techniques to achieve an
 acceptable number of recompilations for some dynamic models.
 
@@ -640,25 +643,24 @@ to detect bugs in our codegen or with a backend compiler.
 File an Issue
 ~~~~~~~~~~~~~
 
-You should feel encouraged to `file a github
-issue <https://github.com/pytorch/torchdynamo/issues>`__ and expect a
-timely response.
+If you experience problems with TorchDynamo, `file a github
+issue <https://github.com/pytorch/torchdynamo/issues>`__.
 
 Before filing an issue, read over the `README <../README.md>`__,
 `TROUBLESHOOTING <./TROUBLESHOOTING.md>`__, and search for similar
 issues.
 
-When filing an issue, please include - your
-OS/python/pytorch/CUDA/triton info by running:
+When filing an issue, include the information about your
+OS, Python< PyTorch, CUDA, and Triton versions info by running:
 
-.. code-block:: sh
+.. code-block:: shell
 
    python tools/verify_install.py
 
 -  A minimal repro script if possible, which can be generated by running
    Minifier
 -  A description of the error
--  the expected behavior
+-  The expected behavior
 -  A log (set ``torch._dynamo.config.log_file`` to a valid file name to
    dump the logs to a file and
    ``torch._dynamo.config.log_level = logging.DEBUG`` and
diff --git a/docs/source/index.rst b/docs/source/index.rst
index eaf2664159afe..93fccd2cb66f4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -46,8 +46,16 @@ Features described in this documentation are classified by release status:
    :glob:
    :maxdepth: 1
    :caption: torch.compile
+   :hidden:
 
-   dynamo/*
+   dynamo/index
+   dynamo/installation
+   dynamo/get-started
+   dynamo/guards-overview
+   dynamo/custom-backends
+   dynamo/deep-dive
+   dynamo/troubleshooting
+   dynamo/faq
 
 .. toctree::
    :maxdepth: 1
@@ -58,7 +66,8 @@ Features described in this documentation are classified by release status:
    torch::deploy <deploy>
 
 .. toctree::
-   :maxdepth: 1
+   :glob:
+   :maxdepth: 2
    :caption: Python API
 
    torch

From d28b71e660c158fe3cfd02300fb7636ff039f540 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@meta.com>
Date: Thu, 1 Dec 2022 20:17:07 +0000
Subject: [PATCH 1481/1922] Revert "Dynamo, FX, Inductor Progress Bars
 (#88384)" (#90018)

This breaks in environments that use the fake tqdm https://github.com/pytorch/pytorch/blob/015b05af18b78ca9c77c997bc277eec66b5b1542/torch/hub.py#L26 which doesn't support the 'desc' kwarg and is not iterable

Original try using pytorchbot did not go through because of a merge
conflict: https://github.com/pytorch/pytorch/pull/88384#issuecomment-1334272489

This reverts commit 011452a2a1c745d4b12f83f89eca039f482d134b.

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90018
Approved by: https://github.com/drisspg, https://github.com/dbort
---
 torch/_dynamo/logging.py                | 21 ---------------------
 torch/_dynamo/optimizations/analysis.py |  1 -
 torch/_inductor/codecache.py            | 22 ++--------------------
 torch/_inductor/codegen/common.py       |  2 --
 torch/_inductor/config.py               |  6 ------
 torch/_inductor/graph.py                |  1 -
 torch/_inductor/ir.py                   |  3 ---
 torch/_inductor/sizevars.py             |  1 -
 torch/_inductor/virtualized.py          |  3 ---
 torch/fx/config.py                      |  6 ------
 torch/fx/interpreter.py                 |  8 ++------
 11 files changed, 4 insertions(+), 70 deletions(-)
 delete mode 100644 torch/fx/config.py

diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py
index b2fa67fbdf6ae..95ee727f1ddf1 100644
--- a/torch/_dynamo/logging.py
+++ b/torch/_dynamo/logging.py
@@ -2,15 +2,10 @@
 import logging
 import os
 
-from torch.hub import tqdm
-
 # logging level for dynamo generated graphs/bytecode/guards
 logging.CODE = 15
 logging.addLevelName(logging.CODE, "CODE")
 
-# Disable progress bar by default, not in dynamo config because otherwise get a circular import
-disable_progress = True
-
 
 # Return all loggers that torchdynamo/torchinductor is responsible for
 def get_loggers():
@@ -83,24 +78,8 @@ def init_logging(log_level, log_file_name=None):
 
 _step_counter = itertools.count(1)
 
-# Update num_steps if more phases are added: Dynamo, AOT, Backend
-# This is very inductor centric
-# _inductor.utils.has_triton() gives a circular import error here
-
-if not disable_progress:
-    try:
-        import triton  # noqa: F401
-
-        num_steps = 3
-    except ImportError:
-        num_steps = 2
-    pbar = tqdm(total=num_steps, desc="torch.compile()", delay=15)
-
 
 def get_step_logger(logger):
-    if not disable_progress:
-        pbar.set_postfix_str(f"{logger.name}")
-        pbar.update(1)
     step = next(_step_counter)
 
     def log(level, msg):
diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index f732fb322438f..d83e57fdca6e2 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -21,7 +21,6 @@ def __init__(self, *args, **kwargs):
         self.input_alias_groups = set()
         self.storage_to_alias_group = dict()
         self.make_alias_group = itertools.count(1)
-        self.name = "ShapeAliasingAndMutation"
 
     def tensor_alias_group(self, value: torch.Tensor):
         """Assign a unique identifier to the storage of a given tensor"""
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 8a27d8e1d0820..dca8af6cf1b9e 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -21,8 +21,6 @@
 from typing import Any, Callable, Dict, List
 
 import torch
-
-from torch.hub import tqdm
 from torch.utils import cpp_extension
 from . import config, cuda_properties, exc
 
@@ -594,7 +592,7 @@ def warm_pool(cls):
         if hasattr(pool, "_start_queue_management_thread"):
             pool._start_queue_management_thread()
         else:
-            for _ in range(config.compile_threads):
+            for i in range(config.compile_threads):
                 pool._adjust_process_count()
             pool._start_executor_manager_thread()
         _compile_end()
@@ -635,26 +633,10 @@ def task():
         return self.submit(task)
 
     def wait(self, scope: Dict[str, Any]):
-        num_kernels = len(
-            [
-                value
-                for key, value in scope.items()
-                if isinstance(value, (Future, TritonFuture))
-            ]
-        )
-        pbar = tqdm(
-            total=num_kernels,
-            desc="Inductor Compilation",
-            disable=config.disable_progress,
-            delay=15,
-        )
         if config.compile_threads > 1:
-            for key, result in scope.items():
-                if config.verbose_progress:
-                    pbar.set_postfix_str(key)
+            for key, result in list(scope.items()):
                 if isinstance(result, (Future, TritonFuture)):
                     scope[key] = result.result()
-                    pbar.update(1)
 
         _compile_end()
 
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index c549de21e46ee..b1e710c0ef91e 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -590,8 +590,6 @@ def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
 
     def __enter__(self):
         class CSEProxy:
-            self.name = "CSEProxy"
-
             @staticmethod
             def __getattr__(name):
                 def inner(*args, **kwargs):
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index ad4e10e394d3a..50795b7143d8a 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -5,12 +5,6 @@
 # add some debug printouts
 debug = False
 
-# Whether to disable a progress bar for autotuning
-disable_progress = True
-
-# Whether to enable printing the source code for each future
-verbose_progress = False
-
 # use cpp wrapper instead of python wrapper
 cpp_wrapper = False
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 061d2eb082af9..1c1bf776ba403 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -89,7 +89,6 @@ def __init__(
         self.randomness_seeds = []
         self.name_to_buffer = {}
         self.creation_time = time.time()
-        self.name = "GraphLowering"
         self._can_use_cpp_wrapper = config.cpp_wrapper
 
     def get_dtype(self, buffer_name):
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 7a182752bb16a..56d5e8ef1d4c7 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3952,8 +3952,6 @@ def add_index(expr, category, buf_name=None):
             )
 
         class CaptureIndexing(V.WrapperHandler):
-            self.name = "CaptureIndexing"
-
             def load(self, name: str, index: sympy.Expr):
                 index = add_index(index, "reads", name)
                 return self._inner.load(name, index)
@@ -4036,7 +4034,6 @@ def __init__(self):
                 self.garbage_collect_values = False
                 self.env = {}
                 self.fetch_attr = submodules.__getitem__
-                self.name = V.get_ops_handler().name
 
         return InterpreterShim().run(V.get_ops_handler())
 
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index 7997d5fde09fa..fda61b0933574 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -596,7 +596,6 @@ class SimplifyIndexing(V.WrapperHandler):  # type: ignore[name-defined]
 
     def __init__(self, inner, var_ranges: VarRanges):
         super().__init__(inner)
-        self.name = "SimplifyIndexing"
         self._simplify: Callable[
             [Expr], Expr
         ] = lambda index: V.graph.sizevars.simplify_with_ranges(index, var_ranges)
diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py
index cff6770997371..27e60b1daf1df 100644
--- a/torch/_inductor/virtualized.py
+++ b/torch/_inductor/virtualized.py
@@ -57,9 +57,6 @@ def _arg_str(a):
 
 class MockHandler:
     def __getattr__(self, name):
-        if name == "name":
-            return "MockHandler"
-
         def inner(*args, **kwargs):
             fargs = [_arg_str(a) for a in args]
             fargs.extend(f"{k}={v}" for k, v in kwargs.items())
diff --git a/torch/fx/config.py b/torch/fx/config.py
deleted file mode 100644
index da5120d6edf18..0000000000000
--- a/torch/fx/config.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# Whether to disable showing progress on compilation passes
-# Need to add a new config otherwise wil get a circular import if dynamo config is imported here
-disable_progress = True
-
-# If True this also shows the node names in each pass, for small models this is great but larger models it's quite noisy
-verbose_progress = False
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index 683a6bd90b501..6428d4c5c3bb5 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -4,12 +4,10 @@
 from .proxy import Proxy
 from ._symbolic_trace import Tracer
 from ._compatibility import compatibility
-from . import config
 import torch.fx.traceback as fx_traceback
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 import inspect
 from contextlib import contextmanager
-from torch.hub import tqdm
 
 __all__ = ['Interpreter', 'Transformer']
 
@@ -74,7 +72,7 @@ def __init__(self, module : GraphModule, garbage_collect_values : bool = True):
         self.module = module
         self.submodules = dict(self.module.named_modules())
         self.env : Dict[Node, Any] = {}
-        self.name = "Interpreter"
+
         self.garbage_collect_values = garbage_collect_values
 
         if self.garbage_collect_values:
@@ -120,9 +118,7 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None, enable_io_p
             args = self.module.graph.process_inputs(*args)
         self.args_iter : Iterator[Any] = iter(args)
 
-        for node in tqdm(self.module.graph.nodes,
-                         desc=f"{self.name}: {str(list(self.module.graph.nodes)) if config.verbose_progress else ''}",
-                         initial=1, position=0, leave=True, disable=config.disable_progress, delay=15):
+        for node in self.module.graph.nodes:
             if node in self.env:
                 # Short circuit if we have this value. This could
                 # be used, for example, for partial evaluation

From aa335097cbf804c78f2fda3a7559cc51dce0465e Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Thu, 1 Dec 2022 20:17:50 +0000
Subject: [PATCH 1482/1922] Add `torch.compile` implementation (#89607)

`torch.compile` can be used either as decorator or to optimize model directly, for example:
```
@torch.compile
def foo(x):
  return torch.sin(x) + x.max()
```
or
```
mod = torch.nn.ReLU()
optimized_mod = torch.compile(mod, mode="max-autotune")
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89607
Approved by: https://github.com/soumith
---
 docs/source/torch.rst       |  8 ++++
 test/inductor/test_smoke.py | 46 ++++++++++++++++---
 torch/__init__.py           | 71 ++++++++++++++++++++++++++++-
 torch/_inductor/config.py   | 91 ++++++++++++++++++++++++++++++++++++-
 torch/overrides.py          |  1 +
 5 files changed, 208 insertions(+), 9 deletions(-)

diff --git a/docs/source/torch.rst b/docs/source/torch.rst
index f5e06d5ea438d..23d63bcd750c0 100644
--- a/docs/source/torch.rst
+++ b/docs/source/torch.rst
@@ -613,6 +613,14 @@ Utilities
     vmap
     _assert
 
+Optimizations
+-------------
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    compile
+
 Operator Tags
 ------------------------------------
 .. autoclass:: Tag
diff --git a/test/inductor/test_smoke.py b/test/inductor/test_smoke.py
index 64afbcf0254e3..89079723bc224 100644
--- a/test/inductor/test_smoke.py
+++ b/test/inductor/test_smoke.py
@@ -1,14 +1,10 @@
 # Owner(s): ["module: inductor"]
 import logging
-import unittest
 
 import torch
 import torch._dynamo as torchdynamo
 import torch._inductor.config as torchinductor_config
-
-torchdynamo.config.log_level = logging.INFO
-torchdynamo.config.verbose = True
-torchinductor_config.debug = True
+from torch.testing._internal.common_utils import IS_LINUX, TestCase
 
 
 class MLP(torch.nn.Module):
@@ -23,8 +19,44 @@ def forward(self, x=None):
         return x
 
 
-class SmokeTest(unittest.TestCase):
+def _test_f(x):
+    return x * x
+
+
+class SmokeTest(TestCase):
     def test_mlp(self):
-        mlp = torchdynamo.optimize("inductor")(MLP().cuda())
+        torchdynamo.config.log_level = logging.INFO
+        torchdynamo.config.verbose = True
+        torchinductor_config.debug = True
+
+        mlp = torch.compile(MLP().cuda())
         for _ in range(3):
             mlp(torch.randn(1, device="cuda"))
+
+        torchdynamo.config.verbose = False
+        torchinductor_config.debug = False
+
+    def test_compile_decorator(self):
+        @torch.compile
+        def foo(x):
+            return torch.sin(x) + x.min()
+
+        @torch.compile(mode="reduce-overhead")
+        def bar(x):
+            return x * x
+
+        for _ in range(3):
+            foo(torch.full((3, 4), 0.7, device="cuda"))
+            bar(torch.rand((2, 2), device="cuda"))
+
+    def test_compile_invalid_options(self):
+        with self.assertRaises(RuntimeError):
+            opt_f = torch.compile(_test_f, mode="ha")
+
+
+if __name__ == "__main__":
+    from torch._dynamo.test_case import run_tests
+
+    if IS_LINUX and torch.cuda.is_available():
+        if torch.cuda.get_device_properties(0).major > 5:
+            run_tests()
diff --git a/torch/__init__.py b/torch/__init__.py
index 02765c4aeee81..c45e2d8f0de33 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -29,7 +29,7 @@
 
 from ._six import string_classes as _string_classes
 
-from typing import Set, Type, TYPE_CHECKING, Union, Callable, Any
+from typing import Any, Callable, Dict, Optional, Set, Type, TYPE_CHECKING, Union
 import builtins
 
 __all__ = [
@@ -48,6 +48,7 @@
     'set_deterministic_debug_mode', 'get_deterministic_debug_mode',
     'set_float32_matmul_precision', 'get_float32_matmul_precision',
     'set_warn_always', 'is_warn_always_enabled', 'SymInt', 'SymFloat',
+    'compile',
 ]
 
 ################################################################################
@@ -1112,6 +1113,74 @@ def compiled_with_cxx11_abi():
     lstsq,
 )
 
+def compile(model: Optional[Callable] = None, *,
+            fullgraph: builtins.bool = False,
+            dynamic: builtins.bool = False,
+            backend: Union[str, Callable] = "inductor",
+            mode: Union[str, None] = None,
+            passes: Optional[Dict[str, Union[str, builtins.int, builtins.bool]]] = None,
+            **kwargs) -> Callable:
+    """
+    Optimizes given model/function using Dynamo and specified backend
+
+    Args:
+       model (Callable): Module/function to optimize
+       fullgraph (bool): Whether it is ok to break model into several subgraphs
+       dynamic (bool): Use dynamic shape tracing
+       backend (str or Callable): backend to be used
+       mode (str): Can be either "default", "reduce-overhead" or "max-autotune"
+       passes (dict): A dictionary of passes to the backend. Passes currently recognized by inductor backend:
+                       - static-memory
+                       - matmul-tune
+                       - matmul-padding
+                       - triton-autotune
+                       - triton-bmm
+                       - triton-mm
+                       - triton-convolution
+                       - rematerialize-threshold
+                       - rematerialize-acc-threshold
+
+    Example::
+
+        @torch.compile(passes={"matmul-padding": True}, fullgraph=True)
+        def foo(x):
+            return torch.sin(x) + torch.cos(x)
+
+    """
+    # Decorator mode
+    if model is None:
+        def fn(model: Callable):
+            if model is None:
+                raise RuntimeError("Model can't be None")
+            return compile(model,
+                           fullgraph=fullgraph,
+                           dynamic=dynamic,
+                           backend=backend,
+                           mode=mode,
+                           passes=passes,
+                           **kwargs)
+        return fn
+
+    import torch._dynamo
+    from torch._dynamo.eval_frame import lookup_backend
+    from torch._inductor.config import InductorConfigContext
+    if mode is not None and passes is not None:
+        raise RuntimeError("Either mode or passes can be specified, but both can't be specified at the same time.")
+    if mode is None and passes is None:
+        mode = "default"
+    if backend == "inductor":
+        compile_fn = lookup_backend(backend)
+        cm = InductorConfigContext(mode if mode is not None else passes)
+
+        def _compile_fn(model_, inputs_):
+            with cm:
+                return compile_fn(model_, inputs_)
+
+        _compile_fn._torchdynamo_orig_callable = compile_fn  # type: ignore[attr-defined]
+        backend = _compile_fn
+    return torch._dynamo.optimize(backend=backend, nopython=fullgraph, dynamic=dynamic, **kwargs)(model)
+
+
 def _register_device_module(device_type, module):
     r"""Register an external runtime module of the specific :attr:`device_type`
     supported by torch.
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 50795b7143d8a..92666886a002d 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -33,7 +33,7 @@
 
 # control store vs recompute heuristic
 # For fanouts, rematearialization can lead to exponential blowup. So, have
-# smaller threashold
+# smaller threshold
 realize_reads_threshold = 4
 realize_bytes_threshold = 2000
 
@@ -198,3 +198,92 @@ class trace:
     # Upload the .tar.gz file
     # Needs to be overriden based on specific environment needs
     upload_tar = None
+
+
+class InductorConfigContext:
+    static_memory: bool
+    matmul_tune: str
+    matmul_padding: bool
+    triton_autotune: bool
+    triton_bmm: bool
+    triton_mm: str
+    triton_convolution: str
+    rematerialize_threshold: int
+    rematerialize_acc_threshold: int
+
+    def _save(self):
+        self.static_memory = triton.cudagraphs
+        self.matmul_tune = triton.mm
+        self.matmul_padding = shape_padding
+        self.triton_autotune = triton.autotune
+        self.triton_bmm = triton.use_bmm
+        self.triton_mm = triton.mm
+        self.triton_convolution = triton.convolution
+        self.rematerialize_threshold = realize_reads_threshold
+        self.rematerialize_acc_threshold = realize_acc_reads_threshold
+
+    def _apply(self):
+        triton.cudagraphs = self.static_memory
+        triton.mm = self.matmul_tune
+        shape_padding = self.matmul_padding
+        triton.autotune = self.triton_autotune
+        triton.use_bmm = self.triton_bmm
+        triton.mm = self.triton_mm
+        triton.convolution = self.triton_convolution
+        realize_reads_threshold = self.rematerialize_threshold
+        realize_acc_reads_threshold = self.rematerialize_acc_threshold
+
+    def __init__(self, arg=None):
+        self._save()
+        if arg is None:
+            return
+        # Handle mode
+        if type(arg) is str:
+
+            def default():
+                self.static_memory = False
+
+            def reduce_overhead():
+                self.static_memory = True
+
+            def max_autotune():
+                self.static_memory = False
+                self.matmul_padding = True
+                self.triton_convolution = "autotune"
+                self.triton_mm = "autotune"
+                self.matmul_padding = True
+
+            modes = {
+                x.__name__.replace("_", "-"): x
+                for x in [default, reduce_overhead, max_autotune]
+            }
+            if arg not in modes:
+                raise RuntimeError(
+                    f"Unrecognized mode {arg}, should be one of {', '.join(modes.keys())}"
+                )
+            modes[arg]()
+            return
+        # Handle passes
+        for (name, val) in arg.items():
+            attr_name = name.replace("-", "_")
+            if not hasattr(self, attr_name):
+                known_passes = ", ".join(
+                    [x.replace("_", "-") for x in dir(self) if not x.startswith("_")]
+                )
+                raise RuntimeError(
+                    f"Unexpected optimization pass {name}, known passes are {known_passes}"
+                )
+            if type(val) != type(getattr(self, attr_name)):
+                val_type_str = type(val).__name__
+                expected_type_str = type(getattr(self, attr_name)).__name__
+                raise RuntimeError(
+                    f"Unexpected type of attr {name}, got {val_type_str} should be {expected_type_str}"
+                )
+            setattr(self, attr_name, val)
+
+    def __enter__(self):
+        self._prev = InductorConfigContext()
+        self._apply()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self._prev._apply()
diff --git a/torch/overrides.py b/torch/overrides.py
index ae2b23e17d30b..5c89cb1c44291 100644
--- a/torch/overrides.py
+++ b/torch/overrides.py
@@ -133,6 +133,7 @@ def get_ignored_functions() -> Set[Callable]:
         torch.blackman_window,
         torch.broadcast_shapes,
         torch.can_cast,
+        torch.compile,
         torch.cudnn_affine_grid_generator,
         torch.cudnn_batch_norm,
         torch.cudnn_convolution,

From ceb711063c66a0092bd811ca6e9880d0aa027cda Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Thu, 1 Dec 2022 01:42:17 +0000
Subject: [PATCH 1483/1922] [dtensor] ufmt distributed._tensor (#89967)

cmd: `ufmt format torch/distributed/_tensor`

copy from Andrew:

Notes
For VSCode users,

Install ufmt: https://pypi.org/project/ufmt/
Install VSCode ufmt extension: https://marketplace.visualstudio.com/items?itemName=omnilib.ufmt
Include in settings.json:
```
{
    "[python]": {
        "editor.defaultFormatter": "omnilib.ufmt",
        "editor.formatOnSave": true,
    },
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89967
Approved by: https://github.com/fduwjj
---
 torch/distributed/_tensor/__init__.py         | 23 +++----
 torch/distributed/_tensor/api.py              | 44 +++++--------
 torch/distributed/_tensor/device_mesh.py      | 31 ++++-----
 torch/distributed/_tensor/dispatch.py         | 22 +++----
 torch/distributed/_tensor/ops/common_rules.py | 35 ++++------
 torch/distributed/_tensor/ops/math_ops.py     | 18 ++---
 torch/distributed/_tensor/ops/matrix_ops.py   |  4 +-
 .../distributed/_tensor/ops/pointwise_ops.py  |  7 +-
 torch/distributed/_tensor/ops/tensor_ops.py   | 23 +++----
 .../_tensor/ops/tp_sharding_ops.py            |  5 +-
 torch/distributed/_tensor/ops/utils.py        |  6 +-
 torch/distributed/_tensor/ops/view_ops.py     | 65 +++++--------------
 torch/distributed/_tensor/placement_types.py  | 24 ++-----
 torch/distributed/_tensor/redistribute.py     | 20 +++---
 torch/distributed/_tensor/utils.py            |  3 +-
 15 files changed, 120 insertions(+), 210 deletions(-)

diff --git a/torch/distributed/_tensor/__init__.py b/torch/distributed/_tensor/__init__.py
index 32a57146bc939..476357364a026 100644
--- a/torch/distributed/_tensor/__init__.py
+++ b/torch/distributed/_tensor/__init__.py
@@ -1,15 +1,14 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
-from typing import Optional, Sequence, Callable, cast
+from typing import Callable, cast, Optional, Sequence
 
 import torch
-import torch.nn as nn
-from torch.distributed._tensor.api import DTensor
-from torch.distributed._tensor.device_mesh import DeviceMesh, get_global_device_mesh
-from torch.distributed._tensor.placement_types import Placement, Shard, Replicate
-
 
 # Import all builtin dist tensor ops
 import torch.distributed._tensor.ops
+import torch.nn as nn
+from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.device_mesh import DeviceMesh, get_global_device_mesh
+from torch.distributed._tensor.placement_types import Placement, Replicate, Shard
 
 
 def distribute_tensor(
@@ -39,9 +38,7 @@ def distribute_tensor(
         A :class:`DTensor` object
     """
     # get default device mesh if there's nothing specified
-    device_mesh = (
-        get_global_device_mesh() if device_mesh is None else device_mesh
-    )
+    device_mesh = get_global_device_mesh() if device_mesh is None else device_mesh
     # convert tensor to the correponding device type if it's not in that device type
     tensor = tensor.to(device_mesh.device_type)
     # set default placements to replicated if not specified
@@ -145,15 +142,11 @@ def replicate_module_params_buffers(m: nn.Module, mesh: DeviceMesh) -> None:
             if param is not None and not isinstance(param, DTensor):
                 m.register_parameter(
                     key,
-                    nn.Parameter(
-                        distribute_tensor(param.data, mesh, full_replicate)
-                    ),
+                    nn.Parameter(distribute_tensor(param.data, mesh, full_replicate)),
                 )
         for key, buffer in m._buffers.items():
             if buffer is not None and not isinstance(buffer, DTensor):
-                m._buffers[key] = distribute_tensor(
-                    buffer, mesh, full_replicate
-                )
+                m._buffers[key] = distribute_tensor(buffer, mesh, full_replicate)
 
     if partition_fn is None:
         # if partition_fn not specified, we by default replicate
diff --git a/torch/distributed/_tensor/api.py b/torch/distributed/_tensor/api.py
index bf5514cc7d4e4..b3dd7377a05f3 100644
--- a/torch/distributed/_tensor/api.py
+++ b/torch/distributed/_tensor/api.py
@@ -1,20 +1,21 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 import copy
 import warnings
+from typing import Callable, cast, Dict, Optional, Sequence
+
 import torch
-from torch.utils._pytree import tree_flatten
-from typing import Dict, Callable, Optional, Sequence, cast
-from torch.distributed._tensor.device_mesh import get_global_device_mesh, DeviceMesh
+
+import torch.distributed._tensor.dispatch as op_dispatch
+from torch.distributed._tensor.device_mesh import DeviceMesh, get_global_device_mesh
 from torch.distributed._tensor.placement_types import (
-    Placement,
-    Shard,
-    Replicate,
     _Partial,
     DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
 )
 from torch.distributed._tensor.redistribute import Redistribute
-
-from torch.distributed._tensor.dispatch import operator_dispatch, OpSchema, OutputSharding
+from torch.utils._pytree import tree_flatten
 
 # NOTE [Autograd interaction between torch.Tensor]
 #
@@ -132,7 +133,9 @@ class DTensor(torch.Tensor):  # pyre-ignore[13]: pyre is bad at __new__
 
     # class attribute that handles operator placements propagation
     # rules, keyed by aten op name, value is propagation func
-    _op_to_rules: Dict[str, Callable[[OpSchema], OutputSharding]] = {}
+    _op_to_rules: Dict[
+        str, Callable[["op_dispatch.OpSchema"], "op_dispatch.OutputSharding"]
+    ] = {}
 
     # class attribute that handles custom registered ops, all handled
     # custom ops should appear in this table, and overriding the default
@@ -172,18 +175,13 @@ def __new__(
                 # recover tensor stride by modifying the stride that larger than
                 # the current stride on the shard_dim
                 for i in range(len(tensor_stride)):
-                    if (
-                        i != shard_dim
-                        and tensor_stride[i] >= tensor_stride[shard_dim]
-                    ):
+                    if i != shard_dim and tensor_stride[i] >= tensor_stride[shard_dim]:
                         # rescale the stride by the shard size
                         tensor_stride[i] = (
                             tensor_stride[i] // local_size[shard_dim]
                         ) * size[shard_dim]
             elif not isinstance(placement, (Replicate, _Partial)):
-                raise RuntimeError(
-                    f"placement type {type(placement)} not supported!"
-                )
+                raise RuntimeError(f"placement type {type(placement)} not supported!")
 
         if requires_grad != local_tensor.requires_grad:
             warnings.warn(
@@ -203,9 +201,7 @@ def __new__(
             requires_grad=requires_grad,
         )
         # deepcopy and set spec
-        r._spec = DTensorSpec(
-            device_mesh, copy.deepcopy(placements), shape=r.size()
-        )
+        r._spec = DTensorSpec(device_mesh, copy.deepcopy(placements), shape=r.size())
         # detach local tensor from autograd graph as we initialize the
         # distributed tensor and autograd will be working on top of
         # the wrapper tensor directly instead of local torch.Tensor
@@ -250,7 +246,7 @@ def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
         if kwargs is None:
             kwargs = {}
 
-        return operator_dispatch(
+        return op_dispatch.operator_dispatch(
             func,
             args,
             kwargs,
@@ -296,9 +292,7 @@ def from_local(
         # There should be no data communication unless there's replication
         # strategy, where we broadcast the replication from the first rank
         # in the mesh dimension
-        device_mesh = (
-            get_global_device_mesh() if device_mesh is None else device_mesh
-        )
+        device_mesh = get_global_device_mesh() if device_mesh is None else device_mesh
         # convert the local tensor to desired device base on device mesh's device_type
         local_tensor = local_tensor.to(device_mesh.device_type)
 
@@ -357,9 +351,7 @@ def redistribute(
         # Note that redistribute currently only supports out
         # of place redistribution, i.e. it always create a new
         # DTensor object and leave the original one unchanged.
-        device_mesh = (
-            get_global_device_mesh() if device_mesh is None else device_mesh
-        )
+        device_mesh = get_global_device_mesh() if device_mesh is None else device_mesh
         # raise error if new placements not specified
         if placements is None:
             raise RuntimeError("placements is needed for redistribute!")
diff --git a/torch/distributed/_tensor/device_mesh.py b/torch/distributed/_tensor/device_mesh.py
index 5ca3f8c6159b1..52edb3b140514 100644
--- a/torch/distributed/_tensor/device_mesh.py
+++ b/torch/distributed/_tensor/device_mesh.py
@@ -1,22 +1,23 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 import warnings
 from typing import List, Optional, Sequence, TypeVar, Union
+
 import torch
 from torch.distributed.distributed_c10d import (
+    _get_default_group,
     all_gather,
     all_reduce,
+    all_to_all,
     broadcast,
+    get_global_rank,
     get_rank,
     get_world_size,
-    get_global_rank,
-    ReduceOp,
     GroupMember,
-    scatter,
-    _get_default_group,
-    reduce_scatter,
     new_group,
     ProcessGroup,
-    all_to_all,
+    reduce_scatter,
+    ReduceOp,
+    scatter,
     Work,
 )
 
@@ -25,9 +26,7 @@
 
 def get_global_device_mesh() -> "DeviceMesh":
     global _global_device_mesh
-    assert (
-        _global_device_mesh is not None
-    ), "Could not get a default device mesh!"
+    assert _global_device_mesh is not None, "Could not get a default device mesh!"
     return _global_device_mesh
 
 
@@ -337,9 +336,7 @@ def broadcast(
         if dim_group is not GroupMember.WORLD:
             src_for_dim = get_global_rank(dim_group, 0)
 
-        return broadcast(
-            tensor, src=src_for_dim, group=dim_group, async_op=async_op
-        )
+        return broadcast(tensor, src=src_for_dim, group=dim_group, async_op=async_op)
 
     def all_gather(
         self,
@@ -363,9 +360,7 @@ def all_gather(
             A :class:`Work` object
         """
         dim_group = self._dim_groups[mesh_dim]
-        return all_gather(
-            tensor_list, tensor, group=dim_group, async_op=async_op
-        )
+        return all_gather(tensor_list, tensor, group=dim_group, async_op=async_op)
 
     def all_reduce(
         self,
@@ -453,9 +448,9 @@ def reduce_scatter(
             # scatter the tensor
             output_offset = offset_list[my_coordinate]
             output.copy_(
-                flat_tensor[
-                    output_offset : output_offset + output.numel()
-                ].view(output.shape)
+                flat_tensor[output_offset : output_offset + output.numel()].view(
+                    output.shape
+                )
             )
         else:
             raise RuntimeError(
diff --git a/torch/distributed/_tensor/dispatch.py b/torch/distributed/_tensor/dispatch.py
index 38ea056aa91a9..24df6f879d316 100644
--- a/torch/distributed/_tensor/dispatch.py
+++ b/torch/distributed/_tensor/dispatch.py
@@ -1,11 +1,11 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 from dataclasses import dataclass
-from typing import List, Callable, Dict, Tuple, Optional, cast
+from typing import Callable, cast, Dict, List, Optional, Tuple
 
-import torch
-from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
 from torchgen.model import FunctionSchema, SchemaKind
 
+import torch
+
 import torch.distributed._tensor.api as dtensor
 from torch.distributed._tensor.placement_types import DTensorSpec
 from torch.distributed._tensor.redistribute import redistribute_dtensor
@@ -16,6 +16,7 @@
     unwrap_schema,
     wrap,
 )
+from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten
 
 
 """
@@ -76,8 +77,7 @@ class OpSchema(object):
     def __post_init__(self) -> None:
         schema_kind = self.func_schema.kind()
         self.is_inplace = (
-            schema_kind
-            == SchemaKind.inplace  # pyre-ignore [16] pyre bad at enum
+            schema_kind == SchemaKind.inplace  # pyre-ignore [16] pyre bad at enum
         )
         self.is_out_variant = (
             schema_kind == SchemaKind.out  # pyre-ignore [16] pyre bad at enum
@@ -92,9 +92,7 @@ def args_spec(self) -> Tuple[DTensorSpec, ...]:
         """
         # filter out non-relavant values from args schema to get a clean spec list
         # this would mainly be used by sharding propagation rules
-        return tuple(
-            item for item in self.args_schema if isinstance(item, DTensorSpec)
-        )
+        return tuple(item for item in self.args_schema if isinstance(item, DTensorSpec))
 
     def __repr__(self) -> str:
         return (
@@ -149,9 +147,7 @@ def _reshape_alias(
     return torch.ops.aten.view(x, shape)
 
 
-_CURRENT_DECOMPOSITION_TABLE: Dict[
-    Callable[..., object], Callable[..., object]
-] = {
+_CURRENT_DECOMPOSITION_TABLE: Dict[Callable[..., object], Callable[..., object]] = {
     torch.ops.aten._reshape_alias.default: _reshape_alias,
 }
 
@@ -173,9 +169,7 @@ def propagate_input_sharding(
     if _DEBUG_VERBOSE and torch.distributed.get_rank() == 0:
         print(f"{op_call}({op_schema})")
         local_shapes = tree_map(
-            lambda t: t.to_local().shape
-            if isinstance(t, dtensor.DTensor)
-            else None,
+            lambda t: t.to_local().shape if isinstance(t, dtensor.DTensor) else None,
             args,
         )
         print(f"    local shapes: {local_shapes}")
diff --git a/torch/distributed/_tensor/ops/common_rules.py b/torch/distributed/_tensor/ops/common_rules.py
index 29925c8a52c73..81c76ab84204d 100644
--- a/torch/distributed/_tensor/ops/common_rules.py
+++ b/torch/distributed/_tensor/ops/common_rules.py
@@ -1,9 +1,10 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import cast, Dict, List, Optional, Sequence, Tuple
+
 import torch
-from typing import List, Sequence, Dict, Tuple, Optional, cast
 from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
-from torch.distributed._tensor.placement_types import DTensorSpec
 from torch.distributed._tensor.ops.utils import prod
+from torch.distributed._tensor.placement_types import DTensorSpec
 
 
 def _replace_char_in_str(string: str, new_char: str, idx: int) -> str:
@@ -44,9 +45,7 @@ def _gen_reshard_suggestions(
                 shape=input_spec.shape,
             )
         )
-    suggested_schema = OpSchema(
-        op_schema.func_schema, tuple(suggested_arg_specs), {}
-    )
+    suggested_schema = OpSchema(op_schema.func_schema, tuple(suggested_arg_specs), {})
     _inplace_rewrap_schema_suggestion(suggested_schema, op_schema)
     return OutputSharding(
         None,
@@ -121,13 +120,9 @@ def merge_sharding(dim: str, a: int, b: int) -> int:
                 seen_shardings[sum_dim] = "+"
             # update pending sum counter for pending sum mesh
             # dimension with the occurance from each input
-            pending_sums_counter[sum_dim] = (
-                pending_sums_counter.get(sum_dim, 0) + 1
-            )
+            pending_sums_counter[sum_dim] = pending_sums_counter.get(sum_dim, 0) + 1
 
-        for idx, (dim, mesh_dim) in enumerate(
-            zip(input_dim, input_spec.dim_map)
-        ):
+        for idx, (dim, mesh_dim) in enumerate(zip(input_dim, input_spec.dim_map)):
             if enforce_sharding and dim in enforce_sharding:
                 if enforce_sharding[dim] != mesh_dim:
                     needs_reshard = True
@@ -185,9 +180,9 @@ def merge_sharding(dim: str, a: int, b: int) -> int:
                         d in input_dim
                         and input_spec.dim_map[input_dim.index(d)] == mesh_dim
                     ):
-                        cost += prod(
-                            input_spec.local_shape
-                        ) * input_spec.mesh.size(mesh_dim)
+                        cost += prod(input_spec.local_shape) * input_spec.mesh.size(
+                            mesh_dim
+                        )
                 costs.append(cost)
             d_to_keep_sharding = dims[costs.index(max(costs))]
             for d in dims:
@@ -230,9 +225,7 @@ def merge_sharding(dim: str, a: int, b: int) -> int:
     )
 
 
-def pointwise_rule(
-    op_schema: OpSchema, linearity: bool = False
-) -> OutputSharding:
+def pointwise_rule(op_schema: OpSchema, linearity: bool = False) -> OutputSharding:
     """
     Propagate the sharding for pointwise operations. Examples:
         ij,ij->ij - addition/mul
@@ -270,9 +263,7 @@ def pointwise_rule(
     for output_dim_idx in range(len(out_dimchars)):
         out_dimchar = out_dimchars[output_dim_idx]
         if singleton_counter[output_dim_idx] == len(input_specs):
-            out_dimchars = _replace_char_in_str(
-                out_dimchars, "1", output_dim_idx
-            )
+            out_dimchars = _replace_char_in_str(out_dimchars, "1", output_dim_idx)
 
     fmt = f"{','.join(p for p in dimchars)}->{out_dimchars}"
 
@@ -340,9 +331,7 @@ def reduction_rule(
             no_partial_spec = DTensorSpec.from_dim_map(
                 input_spec.mesh, reshard_dim_map, [], input_spec.shape
             )
-            schema_suggestion = OpSchema(
-                op_schema.func_schema, (no_partial_spec,), {}
-            )
+            schema_suggestion = OpSchema(op_schema.func_schema, (no_partial_spec,), {})
             _inplace_rewrap_schema_suggestion(schema_suggestion, op_schema)
             return OutputSharding(
                 output_spec=None, schema_suggestions=[schema_suggestion]
diff --git a/torch/distributed/_tensor/ops/math_ops.py b/torch/distributed/_tensor/ops/math_ops.py
index eb4cd86ed5c66..2480e7ced5739 100644
--- a/torch/distributed/_tensor/ops/math_ops.py
+++ b/torch/distributed/_tensor/ops/math_ops.py
@@ -2,15 +2,17 @@
 from typing import cast, Optional, Sequence
 
 from torch.distributed._tensor.api import DTensor
-from torch.distributed._tensor.placement_types import DTensorSpec
 from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
-from torch.distributed._tensor.ops.common_rules import reduction_rule, pointwise_rule
-from torch.distributed._tensor.ops.utils import register_prop_rule, as_list, normalize_dims
+from torch.distributed._tensor.ops.common_rules import pointwise_rule, reduction_rule
+from torch.distributed._tensor.ops.utils import (
+    as_list,
+    normalize_dims,
+    register_prop_rule,
+)
+from torch.distributed._tensor.placement_types import DTensorSpec
 
 
-def _infer_reduction_dims(
-    dims_arg: object, ndim: int
-) -> Optional[Sequence[int]]:
+def _infer_reduction_dims(dims_arg: object, ndim: int) -> Optional[Sequence[int]]:
     if dims_arg is None:
         return None
     dims = cast(Sequence[int], as_list(dims_arg))
@@ -69,9 +71,7 @@ def softmax_bwd_rule(op_schema: OpSchema) -> OutputSharding:
     if softmax_dim < len(grad_out_dim_map) and (
         grad_out_dim_map[softmax_dim] >= 0 or out_dim_map[softmax_dim] >= 0
     ):
-        raise RuntimeError(
-            "Cannot run _softmax_backward_data on sharding dimension!"
-        )
+        raise RuntimeError("Cannot run _softmax_backward_data on sharding dimension!")
     return pointwise_rule(op_schema)
 
 
diff --git a/torch/distributed/_tensor/ops/matrix_ops.py b/torch/distributed/_tensor/ops/matrix_ops.py
index 47988799282e9..6d884843ea816 100644
--- a/torch/distributed/_tensor/ops/matrix_ops.py
+++ b/torch/distributed/_tensor/ops/matrix_ops.py
@@ -66,9 +66,7 @@ def addmm_rules(op_schema: OpSchema) -> OutputSharding:
 
     # run point wise rule on input + (mm_out) with linearity
     output_sharding = pointwise_rule(
-        OpSchema(
-            op_schema.func_schema, (input_spec, mm_out_sharding.output_spec), {}
-        ),
+        OpSchema(op_schema.func_schema, (input_spec, mm_out_sharding.output_spec), {}),
         linearity=True,
     )
     # if propagation failed, edit the schema suggestion from pointwise rules
diff --git a/torch/distributed/_tensor/ops/pointwise_ops.py b/torch/distributed/_tensor/ops/pointwise_ops.py
index 6c92eacd1b8b9..08f6c3a645e77 100644
--- a/torch/distributed/_tensor/ops/pointwise_ops.py
+++ b/torch/distributed/_tensor/ops/pointwise_ops.py
@@ -2,10 +2,13 @@
 from typing import cast
 
 from torch.distributed._tensor.api import DTensor
-from torch.distributed._tensor.ops.common_rules import linear_pointwise_rule, pointwise_rule
-from torch.distributed._tensor.placement_types import DTensorSpec, Replicate, _Partial
 from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.common_rules import (
+    linear_pointwise_rule,
+    pointwise_rule,
+)
 from torch.distributed._tensor.ops.utils import register_prop_rule
+from torch.distributed._tensor.placement_types import _Partial, DTensorSpec, Replicate
 
 # leave the remaining pointwise_ops list here for convenience,
 # Below ops are some pointwise ops that are yet to be supported,
diff --git a/torch/distributed/_tensor/ops/tensor_ops.py b/torch/distributed/_tensor/ops/tensor_ops.py
index f386e1fdb9fd1..4ab57bbe2699d 100644
--- a/torch/distributed/_tensor/ops/tensor_ops.py
+++ b/torch/distributed/_tensor/ops/tensor_ops.py
@@ -1,17 +1,18 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import cast, List, Optional, Sequence, Tuple
+
 import torch
 from torch.distributed._tensor.api import (
+    _Partial,
     DTensor,
     DTensorSpec,
     Placement,
     Replicate,
     Shard,
-    _Partial,
 )
 from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
 from torch.distributed._tensor.ops.common_rules import pointwise_rule
 from torch.distributed._tensor.ops.utils import register_prop_rule
-from typing import List, Optional, Sequence, Tuple, cast
 
 
 # NOTE: the default propagation rule should apply for
@@ -34,8 +35,7 @@ def prop_create_like(op_schema: OpSchema) -> OutputSharding:
     output_spec = DTensorSpec(
         mesh=input_spec.mesh,
         placements=tuple(
-            Replicate() if isinstance(p, _Partial) else p
-            for p in input_spec.placements
+            Replicate() if isinstance(p, _Partial) else p for p in input_spec.placements
         ),
         ndim=input_spec.ndim,
         shape=input_spec.shape,
@@ -139,8 +139,7 @@ def prop_bucketize(op_schema: OpSchema) -> OutputSharding:
                         input_schema,
                         DTensorSpec(
                             mesh=boundaries.mesh,
-                            placements=[Replicate()]
-                            * len(boundaries.placements),
+                            placements=[Replicate()] * len(boundaries.placements),
                             ndim=boundaries.ndim,
                             shape=boundaries.shape,
                         ),
@@ -193,8 +192,7 @@ def _prop_all_but_dim(
             schema_suggestions=[
                 OpSchema(
                     func_schema=op_schema.func_schema,
-                    args_schema=(suggested_input_spec,)
-                    + op_schema.args_schema[1:],
+                    args_schema=(suggested_input_spec,) + op_schema.args_schema[1:],
                     kwargs_schema=op_schema.kwargs_schema,
                 ),
             ],
@@ -390,19 +388,14 @@ def prop_index(op_schema: OpSchema) -> OutputSharding:
             multi_indices_spec[valid_indices_spec[i][0]] = v
         # we'll need to call pointwise_rule again to see what's our ideal indices_spec and then
         # use that to compute our ideal values_spec
-        indices_output_spec = pointwise_rule(
-            valid_indices_suggestion
-        ).output_spec
+        indices_output_spec = pointwise_rule(valid_indices_suggestion).output_spec
         assert isinstance(indices_output_spec, DTensorSpec)
         indices_spec = indices_output_spec
 
     lookup_dims = set(v[0] for v in valid_indices_spec)
 
     need_reshard_on_values = tuple(
-        (
-            isinstance(vp, Shard)
-            and (vp.dim in lookup_dims or isinstance(ip, Shard))
-        )
+        (isinstance(vp, Shard) and (vp.dim in lookup_dims or isinstance(ip, Shard)))
         for vp, ip in zip(values_spec.placements, indices_spec.placements)
     )
 
diff --git a/torch/distributed/_tensor/ops/tp_sharding_ops.py b/torch/distributed/_tensor/ops/tp_sharding_ops.py
index 01db8920e6747..59964751ed2c5 100644
--- a/torch/distributed/_tensor/ops/tp_sharding_ops.py
+++ b/torch/distributed/_tensor/ops/tp_sharding_ops.py
@@ -1,11 +1,12 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 # implement matrix related ops for distributed tensor
+from typing import List
+
 import torch
 import torch.utils._pytree as pytree
-from typing import List
 from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.ops.utils import register_impl, unwrap_single_placement
 from torch.distributed._tensor.utils import unwrap_local_tensor
-from torch.distributed._tensor.ops.utils import unwrap_single_placement, register_impl
 
 """
 The ops below were quickly hacked and needed to be polished down the road.
diff --git a/torch/distributed/_tensor/ops/utils.py b/torch/distributed/_tensor/ops/utils.py
index 42db7142638a5..107fdc912d6d4 100644
--- a/torch/distributed/_tensor/ops/utils.py
+++ b/torch/distributed/_tensor/ops/utils.py
@@ -1,9 +1,9 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 import functools
 import operator
+from typing import Iterable, List, Sequence, Union
 
 import torch
-from typing import List, Union, Sequence, Iterable
 from torch.distributed._tensor.api import DTensor
 
 
@@ -51,9 +51,7 @@ def as_list(
     # During tracing, `aten.sum.dim_IntList` uses `immutable_list` for its args,
     # which is an object but treated as a list by the tracer. Therefore, keep
     # `immutable_list` intact here as well.
-    if type(x) is list or isinstance(
-        x, torch.fx.immutable_collections.immutable_list
-    ):
+    if type(x) is list or isinstance(x, torch.fx.immutable_collections.immutable_list):
         return x
     else:
         return [x]
diff --git a/torch/distributed/_tensor/ops/view_ops.py b/torch/distributed/_tensor/ops/view_ops.py
index a8849b2ed14bf..5ec84b6e8b821 100644
--- a/torch/distributed/_tensor/ops/view_ops.py
+++ b/torch/distributed/_tensor/ops/view_ops.py
@@ -1,21 +1,9 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 from dataclasses import dataclass
-from typing import (
-    Callable,
-    Dict,
-    Iterable,
-    Optional,
-    Tuple,
-    Set,
-    Union,
-    Sequence,
-    cast,
-)
+from typing import Callable, cast, Dict, Iterable, Optional, Sequence, Set, Tuple, Union
 
 import torch
 from torch import Tensor
-
-from torch.distributed._tensor.placement_types import DTensorSpec, Placement, Replicate
 from torch.distributed._tensor.api import Shard
 from torch.distributed._tensor.dispatch import OpSchema, OutputSharding
 from torch.distributed._tensor.ops.utils import (
@@ -25,6 +13,8 @@
     register_prop_rule,
 )
 
+from torch.distributed._tensor.placement_types import DTensorSpec, Placement, Replicate
+
 
 Shape = Tuple[int, ...]
 
@@ -138,9 +128,7 @@ class Split(DimSpec):
     split_id: int
 
     @classmethod
-    def new(
-        cls, dim: DimSpec, group_shape: Tuple[int, ...], idx: int
-    ) -> DimSpec:
+    def new(cls, dim: DimSpec, group_shape: Tuple[int, ...], idx: int) -> DimSpec:
         assert len(group_shape) > 0
         if len(group_shape) == 1:
             # not really a group, just return the input dim back
@@ -192,9 +180,7 @@ def expand(input_shape: Shape, shape: Shape) -> DimMap:
             actual_s = 1
             assert desired_s >= 0
         else:
-            assert isinstance(
-                p, InputDim
-            ), f"DimSpec not supported in expand: {p}"
+            assert isinstance(p, InputDim), f"DimSpec not supported in expand: {p}"
             actual_s = input_shape[p.input_dim]
             assert actual_s == 1 or desired_s == -1 or desired_s == actual_s
         mapping.append(
@@ -234,9 +220,7 @@ def dim_movedim(
     assert len(input) == len(destination)
     input_set = set(input)
     assert len(input_set) == len(input), "Found repeated input dims"
-    assert len(set(destination)) == len(
-        destination
-    ), "Found repeated output dims"
+    assert len(set(destination)) == len(destination), "Found repeated output dims"
     assert max(input) < ndim
     assert max(destination) < ndim
 
@@ -363,9 +347,7 @@ def view_groups(from_size: Shape, to_size: Shape) -> DimMap:
 
         if len(to_group_shape) > 0:
             flattened = Flatten.new(
-                tuple(
-                    InputDim(fi) for fi in from_group_dim if from_size[fi] > 1
-                )
+                tuple(InputDim(fi) for fi in from_group_dim if from_size[fi] > 1)
             )
             result_pp += [
                 Split.new(flattened, tuple(to_group_shape), i)
@@ -460,23 +442,17 @@ class Op:
         )
     ),
     torch.ravel: Op(dim_map=lambda tensor: dim_flatten(tensor.ndim)),
-    Tensor.repeat: Op(
-        dim_map=lambda self, *sizes: dim_repeat(self.ndim, sizes)
-    ),
+    Tensor.repeat: Op(dim_map=lambda self, *sizes: dim_repeat(self.ndim, sizes)),
     torch.reshape: Op(
         dim_map=lambda input, shape: view_groups(input.shape, shape),
         shape_argnum=1,
     ),
-    torch.squeeze: Op(
-        dim_map=lambda input, dim=None: dim_squeeze(input.shape, dim)
-    ),
+    torch.squeeze: Op(dim_map=lambda input, dim=None: dim_squeeze(input.shape, dim)),
     torch.tile: Op(dim_map=lambda input, dims: dim_tile(input.ndim, dims)),
     torch.transpose: Op(
         dim_map=lambda input, dim0, dim1: dim_transpose(input.ndim, dim0, dim1)
     ),
-    torch.unsqueeze: Op(
-        dim_map=lambda input, dim: dim_unsqueeze(input.ndim, dim)
-    ),
+    torch.unsqueeze: Op(dim_map=lambda input, dim: dim_unsqueeze(input.ndim, dim)),
     Tensor.view: Op(
         dim_map=lambda input, *shape: view_groups(input.shape, shape),
         shape_argnum=1,
@@ -502,9 +478,7 @@ def propagate_shape_and_sharding(
       if the leftmost split size is divisible by the mesh dimension
     """
     assert len(in_shard) == len(mesh_sizes)
-    sharded_in_dims: Set[int] = set(
-        s.dim for s in in_shard if isinstance(s, Shard)
-    )
+    sharded_in_dims: Set[int] = set(s.dim for s in in_shard if isinstance(s, Shard))
     # for each input dim, for each mesh dim, provides a list of possible shardable dimensions
     shardable_dims: torch.Tensor = torch.ones(
         (len(local_in_shape), len(mesh_sizes)), dtype=torch.bool
@@ -541,8 +515,7 @@ def get_dim_size(cmd: DimSpec) -> Tuple[int, Optional[InputDim]]:
             return (
                 prod(get_dim_size(a)[0] for a in cmd.input_dims),
                 dim0
-                if isinstance(dim0, InputDim)
-                and dim0.input_dim in sharded_in_dims
+                if isinstance(dim0, InputDim) and dim0.input_dim in sharded_in_dims
                 else None,
             )
         elif isinstance(cmd, Split):
@@ -598,18 +571,14 @@ def get_dim_size(cmd: DimSpec) -> Tuple[int, Optional[InputDim]]:
             dim_map[in_dim.input_dim] = dim
 
     needs_reshard = any(
-        isinstance(placement, Shard)
-        and not shardable_dims[placement.dim][mesh_dim]
+        isinstance(placement, Shard) and not shardable_dims[placement.dim][mesh_dim]
         for mesh_dim, placement in enumerate(in_shard)
     )
 
     output_placements = (
         None
         if needs_reshard
-        else [
-            Shard(dim_map[s.dim]) if isinstance(s, Shard) else s
-            for s in in_shard
-        ]
+        else [Shard(dim_map[s.dim]) if isinstance(s, Shard) else s for s in in_shard]
     )
 
     return (tuple(out_shape), output_placements, shardable_dims)
@@ -631,11 +600,7 @@ def reshape_prop(op_schema: OpSchema) -> OutputSharding:
         global_in_shape = input_dtensor_spec.shape
         assert global_in_shape is not None, "Shape required."
 
-        (
-            global_out_shape,
-            shard_out,
-            shardable_dims,
-        ) = propagate_shape_and_sharding(
+        (global_out_shape, shard_out, shardable_dims,) = propagate_shape_and_sharding(
             input_dtensor_spec.placements,
             tuple(global_in_shape),
             rules,
diff --git a/torch/distributed/_tensor/placement_types.py b/torch/distributed/_tensor/placement_types.py
index f2df183b046db..9a2f7cfe86fea 100644
--- a/torch/distributed/_tensor/placement_types.py
+++ b/torch/distributed/_tensor/placement_types.py
@@ -1,7 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 
 from dataclasses import dataclass
-from typing import Optional, List, Sequence, Tuple, cast
+from typing import cast, List, Optional, Sequence, Tuple
 
 import torch
 import torch.distributed.distributed_c10d as c10d
@@ -60,11 +60,7 @@ def _split_tensor(
         if with_padding or contiguous:
             shard_list = []
             for i, shard in enumerate(tensor_list):
-                if (
-                    with_padding
-                    and idx_start_to_pad != 0
-                    and i >= idx_start_to_pad
-                ):
+                if with_padding and idx_start_to_pad != 0 and i >= idx_start_to_pad:
                     shard = self._pad_tensor(shard)
                 # input tensors are expected to be congtiguous by the collective backend
                 shard = shard.contiguous() if contiguous else shard
@@ -81,9 +77,7 @@ def _pad_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
 
     def _unpad_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
         # unpad tensor by 1 on the shard dim
-        return tensor.narrow(
-            self.dim, start=0, length=tensor.size(self.dim) - 1
-        )
+        return tensor.narrow(self.dim, start=0, length=tensor.size(self.dim) - 1)
 
     def _local_shard_size_on_dim(
         self,
@@ -237,9 +231,7 @@ def _to_replicate(
     ) -> torch.Tensor:
         # out-of-place all_reduce to replicate, since the current partial DTensor
         # might get used by other ops as well, so we can't inplace modify it
-        cloned_local = CommTensor(
-            tensor.clone(memory_format=torch.contiguous_format)
-        )
+        cloned_local = CommTensor(tensor.clone(memory_format=torch.contiguous_format))
         mesh.all_reduce(
             cloned_local, c10d.ReduceOp(self.reduce_op), mesh_dim=mesh_dim  # type: ignore[call-arg]
         )
@@ -334,9 +326,7 @@ def local_shape(self) -> Tuple[int, ...]:
         Compute the shape of a local shard of the given DTensor on its current
         coordinate of the mesh.
         """
-        assert (
-            self.shape is not None
-        ), "DTensorSpec does not contain global shape."
+        assert self.shape is not None, "DTensorSpec does not contain global shape."
         local_shape = list(self.shape)  # start with global shape
         for idx, placement in enumerate(self.placements):
             mesh_dim_size = self.mesh.size(idx)
@@ -361,9 +351,7 @@ def local_offsets(self) -> Tuple[int, ...]:
         global rank. This is mostly used by distributed checkpointing to know the
         exact offsets of the local shard.
         """
-        assert (
-            self.shape is not None
-        ), "DTensorSpec does not contain global shape."
+        assert self.shape is not None, "DTensorSpec does not contain global shape."
         local_offsets = [0] * self.ndim
         local_shape = list(self.shape)
 
diff --git a/torch/distributed/_tensor/redistribute.py b/torch/distributed/_tensor/redistribute.py
index ab36cd4089036..b3ffa1b9ab742 100644
--- a/torch/distributed/_tensor/redistribute.py
+++ b/torch/distributed/_tensor/redistribute.py
@@ -1,10 +1,15 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
-from typing import Dict, List, Sequence, Tuple, cast
+from typing import cast, Dict, List, Sequence, Tuple
 
 import torch
 import torch.distributed._tensor.api as dtensor
-from torch.distributed._tensor.placement_types import Placement, _Partial, Shard, Replicate
 from torch.distributed._tensor.device_mesh import DeviceMesh
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    Placement,
+    Replicate,
+    Shard,
+)
 
 
 _PlacementItem = Tuple[int, Tuple[Placement, Placement]]
@@ -54,8 +59,7 @@ def _decompose_reshard(val: List[_PlacementItem]) -> List[_PlacementItem]:
             and isinstance(target, Shard)
             and (
                 current.dim != target.dim
-                or repeat_dim_current[current.dim]
-                != repeat_dim_target[target.dim]
+                or repeat_dim_current[current.dim] != repeat_dim_target[target.dim]
             )
         ):
             # decompose Shard(i) -> Shard(j) into Shard(i) -> Replicate() -> Shard(j)
@@ -77,9 +81,7 @@ def _redistribute_with_local_tensor(
 ) -> torch.Tensor:
     new_local_tensor = None
 
-    sorted_placements = list(
-        enumerate(zip(current_placements, target_placements))
-    )
+    sorted_placements = list(enumerate(zip(current_placements, target_placements)))
     sorted_placements = _decompose_reshard(sorted_placements)
     sorted_placements.sort(key=_replicate_then_shard)
 
@@ -228,9 +230,7 @@ def backward(ctx, grad_output: "dtensor.DTensor"):  # type: ignore[override]
                 target_placements.append(target)
 
         return (
-            redistribute_dtensor(
-                grad_output, previous_device_mesh, target_placements
-            ),
+            redistribute_dtensor(grad_output, previous_device_mesh, target_placements),
             None,
             None,
         )
diff --git a/torch/distributed/_tensor/utils.py b/torch/distributed/_tensor/utils.py
index bb56f488d81f9..a8f561af700d3 100644
--- a/torch/distributed/_tensor/utils.py
+++ b/torch/distributed/_tensor/utils.py
@@ -1,7 +1,8 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 
+from typing import Dict, Optional, Sequence, Tuple, Union
+
 import torch
-from typing import Union, Dict, Tuple, Optional, Sequence
 
 import torch.distributed._tensor.api as dtensor
 from torch.distributed._tensor.placement_types import DTensorSpec

From 079b61915f06e73b379364d0d70749e140d43622 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Thu, 1 Dec 2022 01:42:17 +0000
Subject: [PATCH 1484/1922] [dtensor] ufmt test/distributed/_tensor (#89968)

cmd: `ufmt format test/distributed/_tensor`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89968
Approved by: https://github.com/fduwjj
---
 test/distributed/_tensor/test_api.py          |  54 +++----
 test/distributed/_tensor/test_common_rules.py | 136 +++++-------------
 test/distributed/_tensor/test_device_mesh.py  |  55 +++----
 test/distributed/_tensor/test_dtensor.py      |  68 +++------
 test/distributed/_tensor/test_dtensor_ops.py  |  37 ++---
 test/distributed/_tensor/test_math_ops.py     |  29 ++--
 test/distributed/_tensor/test_matrix_ops.py   |  58 +++-----
 .../distributed/_tensor/test_pointwise_ops.py |  36 ++---
 test/distributed/_tensor/test_redistribute.py |  97 ++++---------
 test/distributed/_tensor/test_tensor_ops.py   |  16 +--
 .../_tensor/test_tp_sharding_ops.py           |  12 +-
 test/distributed/_tensor/test_view_ops.py     |  64 ++++-----
 12 files changed, 217 insertions(+), 445 deletions(-)

diff --git a/test/distributed/_tensor/test_api.py b/test/distributed/_tensor/test_api.py
index a966f30d1cb9f..a4b5e84bce862 100644
--- a/test/distributed/_tensor/test_api.py
+++ b/test/distributed/_tensor/test_api.py
@@ -3,15 +3,18 @@
 
 import torch
 import torch.nn as nn
-from torch.testing._internal.common_utils import run_tests
-from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase, with_comms
 from torch.distributed._tensor import (
-    distribute_tensor,
-    distribute_module,
     DeviceMesh,
+    distribute_module,
+    distribute_tensor,
     DTensor,
-    Shard,
     Replicate,
+    Shard,
+)
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
 )
 
 
@@ -19,10 +22,7 @@ class MyModel(nn.Module):
     def __init__(self, n_features, n_layers, device):
         super().__init__()
         self.seq = nn.Sequential(
-            *[
-                nn.Linear(n_features, n_features, device=device)
-                for _ in range(n_layers)
-            ]
+            *[nn.Linear(n_features, n_features, device=device) for _ in range(n_layers)]
         )
 
     def forward(self, x):
@@ -50,12 +50,8 @@ def test_distribute_tensor(self):
             tensor_to_shard = torch.randn(
                 3 * self.world_size, 3, requires_grad=requires_grad
             )
-            dist_tensor = distribute_tensor(
-                tensor_to_shard, device_mesh, shard_spec
-            )
-            self.assertEqual(
-                dist_tensor.size(), torch.Size([3 * self.world_size, 3])
-            )
+            dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_spec)
+            self.assertEqual(dist_tensor.size(), torch.Size([3 * self.world_size, 3]))
             local_tensor = dist_tensor.to_local()
             self.assertEqual(local_tensor.size(), torch.Size([3, 3]))
             if requires_grad:
@@ -78,9 +74,7 @@ def test_distribute_tensor_errors(self):
         dtensor = distribute_tensor(tensor_to_distribute, device_mesh, spec)
 
         with self.assertRaisesRegex(ValueError, "to a different device mesh"):
-            new_mesh = DeviceMesh(
-                self.device_type, torch.arange(self.world_size)
-            )
+            new_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
             distribute_tensor(dtensor, new_mesh, [Shard(0)])
 
         with self.assertRaisesRegex(ValueError, "to a different placements"):
@@ -104,9 +98,7 @@ def test_distribute_tensor_uneven_sharding(self):
             splitted_tensor_list = tensor_to_shard.tensor_split(
                 self.world_size, dim=shard_dim
             )
-            dist_tensor = distribute_tensor(
-                tensor_to_shard, device_mesh, shard_spec
-            )
+            dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_spec)
             self.assertEqual(dist_tensor.size(), torch.Size(input_size))
             local_tensor = dist_tensor.to_local()
             self.assertEqual(local_tensor, splitted_tensor_list[self.rank])
@@ -115,9 +107,7 @@ def test_distribute_tensor_uneven_sharding(self):
     def test_distribute_module(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
         # fully shard all linear modules on dim 0
-        module_to_shard = MyModel(
-            5 * self.world_size, 20, device=self.device_type
-        )
+        module_to_shard = MyModel(5 * self.world_size, 20, device=self.device_type)
         shard_spec = [Shard(0)]
 
         def shard_fn(name, module, device_mesh):
@@ -128,9 +118,7 @@ def shard_fn(name, module, device_mesh):
                     )
                     module.register_parameter(name, dist_param)
 
-        sharded_module = distribute_module(
-            module_to_shard, device_mesh, shard_fn
-        )
+        sharded_module = distribute_module(module_to_shard, device_mesh, shard_fn)
         for param in sharded_module.parameters():
             self.assertIsInstance(param, DTensor)
             self.assertEqual(param.placements, shard_spec)
@@ -162,21 +150,15 @@ def replicate_fn(name, module, device_mesh):
 
         # only shard part of module, and rest of module should be replicate
         def shard_fn(name, module, device_mesh):
-            if isinstance(module, nn.Linear) and (
-                name == "seq.0" or name == "seq.8"
-            ):
+            if isinstance(module, nn.Linear) and (name == "seq.0" or name == "seq.8"):
                 for name, param in module.named_parameters():
                     dist_param = torch.nn.Parameter(
                         distribute_tensor(param, device_mesh, shard_spec)
                     )
                     module.register_parameter(name, dist_param)
 
-        module_to_distribute = MyModel(
-            5 * self.world_size, 20, device=self.device_type
-        )
-        dist_module = distribute_module(
-            module_to_distribute, device_mesh, shard_fn
-        )
+        module_to_distribute = MyModel(5 * self.world_size, 20, device=self.device_type)
+        dist_module = distribute_module(module_to_distribute, device_mesh, shard_fn)
         for name, param in dist_module.named_parameters():
             self.assertIsInstance(param, DTensor)
             if name.startswith("seq.0") or name.startswith("seq.8"):
diff --git a/test/distributed/_tensor/test_common_rules.py b/test/distributed/_tensor/test_common_rules.py
index ab9743c1d5e9b..7e7a5c7654acb 100644
--- a/test/distributed/_tensor/test_common_rules.py
+++ b/test/distributed/_tensor/test_common_rules.py
@@ -2,21 +2,21 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
-from torch.testing._internal.common_utils import run_tests
-from torchgen.model import FunctionSchema
+from torch.distributed._tensor import DeviceMesh
 from torch.distributed._tensor.dispatch import OpSchema
 
 from torch.distributed._tensor.ops.common_rules import (
     einop_rule,
-    reduction_rule,
     pointwise_rule,
+    reduction_rule,
 )
 from torch.distributed._tensor.placement_types import DTensorSpec
+from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     with_comms,
 )
-from torch.distributed._tensor import DeviceMesh
+from torchgen.model import FunctionSchema
 
 
 class CommonRulesTest(DTensorTestBase):
@@ -34,17 +34,11 @@ def test_einop_basic_propagation(self):
         # plain einsum, mm
         mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
 
-        func_schema = self.parse_schema(
-            "aten::mm(Tensor self, Tensor mat2) -> Tensor"
-        )
+        func_schema = self.parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
         # propagate col-wise sharding
         mat1, mat2 = [-1, -1], [-1, 0]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([8, 4])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([4, 8])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
@@ -55,12 +49,8 @@ def test_einop_basic_propagation(self):
 
         # propagate row-wise sharding
         mat1, mat2 = [0, -1], [-1, -1]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([8, 4])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([4, 8])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
@@ -71,12 +61,8 @@ def test_einop_basic_propagation(self):
 
         # generate partial
         mat1, mat2 = [-1, 0], [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([8, 4])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([4, 8])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
@@ -94,9 +80,7 @@ def test_einop_pointwise_propagation(self):
         )
         # addition
         mat1 = [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([8, 8])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 8]))
         output_sharding = einop_rule(
             "ij,ij->ij", OpSchema(func_schema, (mat1_spec, mat1_spec), {})
         )
@@ -110,9 +94,7 @@ def test_einop_pointwise_propagation(self):
         mat1_spec = DTensorSpec.from_dim_map(
             mesh, mat1, [], shape=torch.Size([8, 4, 2])
         )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, [-1], [], shape=torch.Size([2])
-        )
+        mat2_spec = DTensorSpec.from_dim_map(mesh, [-1], [], shape=torch.Size([2]))
         output_sharding = einop_rule(
             "ijk,k->ijk", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
@@ -144,17 +126,11 @@ def test_einop_merge_sharding(self):
         )
         mesh = DeviceMesh(self.device_type, mesh_shape)
 
-        func_schema = self.parse_schema(
-            "aten::mm(Tensor self, Tensor mat2) -> Tensor"
-        )
+        func_schema = self.parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
 
         mat1, mat2 = [0, -1], [-1, 1]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([8, 4])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([4, 8])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
@@ -175,12 +151,8 @@ def test_einop_linearity(self):
         )
 
         mat1, mat2 = [0, -1], [-1, -1]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [1], shape=torch.Size([8, 4])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([4, 8])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [1], shape=torch.Size([8, 4]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([4, 8]))
         # if not turn on linearity, partial sum is not eligible to propagate, we return
         # suggestion to reshard inputs with no partial sum (i.e. all_reduce one input)
         output_sharding = einop_rule(
@@ -212,12 +184,8 @@ def test_einop_linearity(self):
             "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
         )
         mat1, mat2 = [0, -1], [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [1], shape=torch.Size([8, 6])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([8, 6])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [1], shape=torch.Size([8, 6]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([8, 6]))
 
         output_sharding = einop_rule(
             "ij,ij->ij",
@@ -237,16 +205,10 @@ def test_einop_multi_sharding_on_mesh_dim(self):
         mesh_shape = torch.arange(self.world_size)
         mesh = DeviceMesh(self.device_type, mesh_shape)
 
-        func_schema = self.parse_schema(
-            "aten::mm(Tensor self, Tensor mat2) -> Tensor"
-        )
+        func_schema = self.parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
         mat1, mat2 = [0, -1], [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([8, 12])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([12, 4])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 12]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([12, 4]))
         output_sharding = einop_rule(
             "mk,kn->mn", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
@@ -271,19 +233,11 @@ def test_einop_errors(self):
             "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
         )
         mat1, mat2 = [0, -1], [1, -1]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([8, 4])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([8, 4])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([8, 4]))
 
-        with self.assertRaisesRegex(
-            RuntimeError, "sharded two different ways:"
-        ):
-            einop_rule(
-                "ij,ij->ij", OpSchema(func_schema, (mat1_spec, mat2_spec), {})
-            )
+        with self.assertRaisesRegex(RuntimeError, "sharded two different ways:"):
+            einop_rule("ij,ij->ij", OpSchema(func_schema, (mat1_spec, mat2_spec), {}))
 
     @with_comms
     def test_pointwise_rules_broadcasting(self):
@@ -293,12 +247,8 @@ def test_pointwise_rules_broadcasting(self):
             "where.self(Tensor condition, Tensor self, Tensor other) -> Tensor"
         )
         inp1, inp2, inp3 = [0], [], [-1, -1]
-        condition = DTensorSpec.from_dim_map(
-            mesh, inp1, [], shape=torch.Size([8])
-        )
-        self_tensor = DTensorSpec.from_dim_map(
-            mesh, inp2, [], shape=torch.Size([])
-        )
+        condition = DTensorSpec.from_dim_map(mesh, inp1, [], shape=torch.Size([8]))
+        self_tensor = DTensorSpec.from_dim_map(mesh, inp2, [], shape=torch.Size([]))
         other_tensor = DTensorSpec.from_dim_map(
             mesh, inp3, [], shape=torch.Size([1, 1])
         )
@@ -320,12 +270,8 @@ def test_pointwise_rules_suggestion(self):
         )
         # propagate point-wise sharding
         inp1, inp2 = [-1, -1], [-1, 0]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, inp1, [], shape=torch.Size([8, 4])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, inp2, [], shape=torch.Size([8, 4])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, inp1, [], shape=torch.Size([8, 4]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, inp2, [], shape=torch.Size([8, 4]))
         # adding a positional argument -1 to arg schema
         output_sharding = pointwise_rule(
             OpSchema(func_schema, (mat1_spec, mat2_spec, -1), {})
@@ -353,12 +299,8 @@ def test_pointwise_multi_sharding_on_mesh_dim(self):
 
         # basic case to test implicit broadcasting shape alignment
         mat1, mat2 = [-1, 0], [0]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([20, 6])
-        )
-        mat2_spec = DTensorSpec.from_dim_map(
-            mesh, mat2, [], shape=torch.Size([6])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([20, 6]))
+        mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([6]))
         output_sharding = pointwise_rule(
             OpSchema(func_schema, (mat1_spec, mat2_spec), {})
         )
@@ -384,9 +326,7 @@ def test_pointwise_multi_sharding_on_mesh_dim(self):
         # ensure that the suggestion is to reshard the first
         # arg by all_gather first tensor dim sharding
         schema_suggestion = output_sharding.schema_suggestions[0]
-        self.assertEqual(
-            schema_suggestion.args_schema[0].dim_map, [-1, -1, -1, 1]
-        )
+        self.assertEqual(schema_suggestion.args_schema[0].dim_map, [-1, -1, -1, 1])
         self.assertEqual(schema_suggestion.args_schema[1].dim_map, mat2)
 
     @with_comms
@@ -431,9 +371,7 @@ def test_reduction_rule(self):
         )
         # reduction on a 2d mat
         mat1 = [0, -1]
-        mat1_spec = DTensorSpec.from_dim_map(
-            mesh, mat1, [], shape=torch.Size([8, 4])
-        )
+        mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
         # reduction on dim 0
         output_sharding_0 = reduction_rule(
             OpSchema(func_schema, (mat1_spec, 0), {}),
@@ -467,9 +405,7 @@ def test_reduction_rule(self):
         self.assertEqual(output_sharding_all_dim.output_spec.dim_map, [])
         # pending sum on mesh
         self.assertEqual(output_sharding_all_dim.output_spec.sums, [0])
-        self.assertEqual(
-            output_sharding_all_dim.output_spec.shape, torch.Size([])
-        )
+        self.assertEqual(output_sharding_all_dim.output_spec.shape, torch.Size([]))
 
 
 if __name__ == "__main__":
diff --git a/test/distributed/_tensor/test_device_mesh.py b/test/distributed/_tensor/test_device_mesh.py
index 7088f33f42dbe..49013a8640a6e 100644
--- a/test/distributed/_tensor/test_device_mesh.py
+++ b/test/distributed/_tensor/test_device_mesh.py
@@ -2,20 +2,20 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
+from torch.distributed._tensor.device_mesh import DeviceMesh
+from torch.distributed._tensor.placement_types import Shard
 
 from torch.distributed.distributed_c10d import (
-    ProcessGroup,
-    new_group,
     get_global_rank,
     get_world_size,
+    new_group,
+    ProcessGroup,
 )
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     with_comms,
 )
-from torch.distributed._tensor.device_mesh import DeviceMesh
-from torch.distributed._tensor.placement_types import Shard
 
 
 class DeviceMeshTest(DTensorTestBase):
@@ -60,9 +60,7 @@ def test_device_mesh_2d_from_dim_groups(self):
                     dim_groups.append(subgroup)
 
         # construct a device mesh from the subgroups
-        mesh = DeviceMesh(
-            self.device_type, [[0, 1], [2, 3]], dim_groups=dim_groups
-        )
+        mesh = DeviceMesh(self.device_type, [[0, 1], [2, 3]], dim_groups=dim_groups)
 
         # check all dim groups
         dim_to_subgroups = mesh.get_dim_groups()
@@ -175,9 +173,7 @@ def test_scatter_1d(self):
             scatter_tensor_shape[scatter_dim] *= self.world_size
             # make the random seed same across rank
             torch.manual_seed(0)
-            global_tensor = torch.randn(
-                scatter_tensor_shape, device=self.device_type
-            )
+            global_tensor = torch.randn(scatter_tensor_shape, device=self.device_type)
             splitted_list, _ = shard_placement._split_tensor(
                 global_tensor, mesh.size(), with_padding=True, contiguous=True
             )
@@ -190,9 +186,7 @@ def test_scatter_1d(self):
     def test_scatter_uneven(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
         my_rank = device_mesh.get_rank()
-        tensor_to_split = torch.randn(
-            device_mesh.size() + 3, device_mesh.size() + 1
-        )
+        tensor_to_split = torch.randn(device_mesh.size() + 3, device_mesh.size() + 1)
 
         for shard_dim in range(tensor_to_split.ndim):
             shard_placement = Shard(shard_dim)
@@ -208,14 +202,10 @@ def test_scatter_uneven(self):
             )
 
             scattered_tensor = torch.empty_like(padded_tensor_list[my_rank])
-            device_mesh.scatter(
-                scattered_tensor, padded_tensor_list, mesh_dim=0
-            )
+            device_mesh.scatter(scattered_tensor, padded_tensor_list, mesh_dim=0)
             # unpad scattered_tensor
             if pad_idx != 0 and my_rank >= pad_idx:
-                scattered_tensor = shard_placement._unpad_tensor(
-                    scattered_tensor
-                )
+                scattered_tensor = shard_placement._unpad_tensor(scattered_tensor)
 
             self.assertEqual(
                 scattered_tensor.size(), tensor_splitted_list[my_rank].size()
@@ -321,14 +311,10 @@ def test_reduce_scatter_uneven(self):
 
             res_num = ((0 + self.world_size - 1) * self.world_size) / 2
             scattered_tensor = torch.empty_like(padded_tensor_list[my_rank])
-            device_mesh.reduce_scatter(
-                scattered_tensor, padded_tensor_list, mesh_dim=0
-            )
+            device_mesh.reduce_scatter(scattered_tensor, padded_tensor_list, mesh_dim=0)
             # unpad scattered_tensor
             if pad_idx != 0 and my_rank >= pad_idx:
-                scattered_tensor = shard_placement._unpad_tensor(
-                    scattered_tensor
-                )
+                scattered_tensor = shard_placement._unpad_tensor(scattered_tensor)
 
             self.assertEqual(
                 scattered_tensor.size(), tensor_splitted_list[my_rank].size()
@@ -359,9 +345,7 @@ def test_all_gather_nd(self):
             gathered_tensor = torch.cat(gathered_tensor_list)
             exp_tensor = torch.ones(3 * dim_group_size, 3)
             for i in range(len(global_ranks)):
-                exp_tensor[i * 3 : (i + 1) * 3] = (
-                    torch.ones(3, 3) * global_ranks[i]
-                )
+                exp_tensor[i * 3 : (i + 1) * 3] = torch.ones(3, 3) * global_ranks[i]
             self.assertEqual(gathered_tensor, exp_tensor)
 
     @with_comms
@@ -373,8 +357,7 @@ def test_reduce_scatter_nd(self):
         for dim, dim_group in enumerate(dim_to_subgroups):
             dim_group_size = get_world_size(dim_group)
             local_rs_list = (
-                torch.ones(dim_group_size * 3, 3, device=self.device_type)
-                * self.rank
+                torch.ones(dim_group_size * 3, 3, device=self.device_type) * self.rank
             ).tensor_split(dim_group_size, dim=0)
             scattered_tensor = torch.empty_like(
                 local_rs_list[mesh.get_coordinate_on_dim(dim)],
@@ -492,9 +475,7 @@ def test_all_to_all_nd(self):
             ]
             expected_tensor_list = [
                 torch.ones(*tensor_shape, device=self.device_type)
-                * (
-                    my_coordinate + global_rank * dim_group_size
-                )  # i.e. transpose
+                * (my_coordinate + global_rank * dim_group_size)  # i.e. transpose
                 for global_rank in global_ranks
             ]
             for scatter_dim in range(len(tensor_shape)):
@@ -504,13 +485,9 @@ def test_all_to_all_nd(self):
                     for idx in range(len(input_tensor_list))
                 ]
                 # scatter on dim > 0 would generate non-contiguous tensor, verify that works
-                mesh.all_to_all(
-                    output_tensor_list, input_tensor_list, mesh_dim=dim
-                )
+                mesh.all_to_all(output_tensor_list, input_tensor_list, mesh_dim=dim)
                 output_tensor = torch.cat(output_tensor_list, dim=scatter_dim)
-                expected_tensor = torch.cat(
-                    expected_tensor_list, dim=scatter_dim
-                )
+                expected_tensor = torch.cat(expected_tensor_list, dim=scatter_dim)
                 self.assertEqual(output_tensor, expected_tensor)
 
 
diff --git a/test/distributed/_tensor/test_dtensor.py b/test/distributed/_tensor/test_dtensor.py
index 51ce1bd4ec583..8d29f4d3fea67 100644
--- a/test/distributed/_tensor/test_dtensor.py
+++ b/test/distributed/_tensor/test_dtensor.py
@@ -2,14 +2,14 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
+from torch.distributed._tensor import DeviceMesh, distribute_tensor, DTensor
+from torch.distributed._tensor.placement_types import _Partial, Replicate, Shard
 
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     with_comms,
 )
-from torch.distributed._tensor import DeviceMesh, DTensor, distribute_tensor
-from torch.distributed._tensor.placement_types import _Partial, Replicate, Shard
 
 
 class DTensorTest(DTensorTestBase):
@@ -40,14 +40,10 @@ def test_dtensor_constructor(self):
             size=dist_tensor_shape,
             requires_grad=True,
         )
-        self.assertEqual(
-            dist_tensor.size(), torch.Size((self.world_size * 3, 3))
-        )
+        self.assertEqual(dist_tensor.size(), torch.Size((self.world_size * 3, 3)))
 
         with self.assertWarnsRegex(UserWarning, "To construct"):
-            DTensor(
-                local_tensor, device_mesh, shard_spec, size=dist_tensor_shape
-            )
+            DTensor(local_tensor, device_mesh, shard_spec, size=dist_tensor_shape)
 
         local_tensor = torch.randn(3, 3, requires_grad=False)
         with self.assertWarnsRegex(UserWarning, "To construct"):
@@ -65,18 +61,14 @@ def test_dtensor_stride(self):
         shard0_spec = [Shard(0)]
         local_tensor = torch.randn(4, 8)
         global_shape = torch.Size([self.world_size * 4, 8])
-        dist_tensor = DTensor(
-            local_tensor, device_mesh, shard0_spec, size=global_shape
-        )
+        dist_tensor = DTensor(local_tensor, device_mesh, shard0_spec, size=global_shape)
         # won't affect stride
         self.assertEqual(dist_tensor.stride(), (8, 1))
 
         shard1_spec = [Shard(1)]
         local_tensor = torch.randn(8, 4)
         global_shape = torch.Size([8, self.world_size * 4])
-        dist_tensor = DTensor(
-            local_tensor, device_mesh, shard1_spec, size=global_shape
-        )
+        dist_tensor = DTensor(local_tensor, device_mesh, shard1_spec, size=global_shape)
         # will affect stride after DT initialized
         self.assertEqual(dist_tensor.stride(), (4 * self.world_size, 1))
 
@@ -96,21 +88,15 @@ def test_from_local(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
         shard_spec = [Shard(0)]
         local_tensor = torch.randn(3, 3)
-        sharded_tensor = DTensor.from_local(
-            local_tensor, device_mesh, shard_spec
-        )
-        self.assertEqual(
-            sharded_tensor.size(), torch.Size([self.world_size * 3, 3])
-        )
+        sharded_tensor = DTensor.from_local(local_tensor, device_mesh, shard_spec)
+        self.assertEqual(sharded_tensor.size(), torch.Size([self.world_size * 3, 3]))
 
         replica_spec = [Replicate()]
         ddp_tensor = DTensor.from_local(local_tensor, device_mesh, replica_spec)
         self.assertEqual(ddp_tensor.size(), local_tensor.size())
 
         partial_spec = [_Partial()]
-        partial_tensor = DTensor.from_local(
-            local_tensor, device_mesh, partial_spec
-        )
+        partial_tensor = DTensor.from_local(local_tensor, device_mesh, partial_spec)
         self.assertEqual(partial_tensor.size(), local_tensor.size())
 
         # test dist tensor works with torch.Tensor during backwards
@@ -119,9 +105,7 @@ def test_from_local(self):
         local_tensor_temp = local_tensor_with_grad * 3
         # create the dist tensor with non leaf local tensor, dist tensor created
         # should also be non leaf node
-        dist_tensor = DTensor.from_local(
-            local_tensor_temp, device_mesh, shard_spec
-        )
+        dist_tensor = DTensor.from_local(local_tensor_temp, device_mesh, shard_spec)
         self.assertFalse(dist_tensor.is_leaf)
         # do some random operations on dist tensor
         output = dist_tensor * 3
@@ -185,9 +169,7 @@ def test_from_local_then_to_local(self):
         local_tensor_temp = local_tensor_with_grad + 8
         # step 2. create the dist tensor with non leaf local tensor, dist tensor
         # created should also be non leaf node
-        dist_tensor = DTensor.from_local(
-            local_tensor_temp, device_mesh, shard_spec
-        )
+        dist_tensor = DTensor.from_local(local_tensor_temp, device_mesh, shard_spec)
         self.assertFalse(dist_tensor.is_leaf)
         # do some random operations on dist tensor
         output = dist_tensor * 6
@@ -211,9 +193,7 @@ def test_dtensor_spec_read_only_after_set(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
         shard_spec = [Shard(0)]
         local_tensor = torch.randn(3, 3)
-        sharded_tensor = DTensor.from_local(
-            local_tensor, device_mesh, shard_spec
-        )
+        sharded_tensor = DTensor.from_local(local_tensor, device_mesh, shard_spec)
 
         # modify shard_spec, and dist_tensor's spec should not be changed
         shard_spec[0] = Replicate()
@@ -225,9 +205,7 @@ def test_dtensor_properties(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
         shard_spec = [Shard(0)]
         local_tensor = torch.randn(3, 3)
-        sharded_tensor = DTensor.from_local(
-            local_tensor, device_mesh, shard_spec
-        )
+        sharded_tensor = DTensor.from_local(local_tensor, device_mesh, shard_spec)
         self.assertEqual(sharded_tensor.device.type, self.device_type)
 
 
@@ -261,13 +239,9 @@ def test_dtensor_api_device_mesh_context_manager(self):
         with DeviceMesh(self.device_type, list(range(self.world_size))):
             shard_spec = [Shard(0)]
             local_tensor = torch.randn(3, 3)
-            sharded_tensor = DTensor.from_local(
-                local_tensor, placements=shard_spec
-            )
+            sharded_tensor = DTensor.from_local(local_tensor, placements=shard_spec)
             replica_spec = [Replicate()]
-            replica_tensor = sharded_tensor.redistribute(
-                placements=replica_spec
-            )
+            replica_tensor = sharded_tensor.redistribute(placements=replica_spec)
             self.assertEqual(
                 replica_tensor.size(), torch.Size([3 * self.world_size, 3])
             )
@@ -292,12 +266,8 @@ def test_dtensor_2d_mesh(self):
         # we should correctly construct the global tensor size
         shard_same_dim_spec = [Shard(0), Shard(0)]
         local_tensor = torch.randn(3, 3)
-        dist_tensor = DTensor.from_local(
-            local_tensor, mesh, shard_same_dim_spec
-        )
-        self.assertEqual(
-            dist_tensor.size(), torch.Size([3 * self.world_size, 3])
-        )
+        dist_tensor = DTensor.from_local(local_tensor, mesh, shard_same_dim_spec)
+        self.assertEqual(dist_tensor.size(), torch.Size([3 * self.world_size, 3]))
 
     @with_comms
     def test_device_mesh_nd(self):
@@ -350,9 +320,7 @@ def test_dtensor_spec_local_shard_offset(self):
         logical_tensor = torch.randn(tensor_shape)
         for shard_spec, expected_shard_offsets in shard_spec_and_offsets:
             dtensor = distribute_tensor(logical_tensor, device_mesh, shard_spec)
-            self.assertEqual(
-                expected_shard_offsets, dtensor._spec.local_offsets
-            )
+            self.assertEqual(expected_shard_offsets, dtensor._spec.local_offsets)
 
 
 if __name__ == "__main__":
diff --git a/test/distributed/_tensor/test_dtensor_ops.py b/test/distributed/_tensor/test_dtensor_ops.py
index 22ae5807d5f34..198fec9a5d192 100644
--- a/test/distributed/_tensor/test_dtensor_ops.py
+++ b/test/distributed/_tensor/test_dtensor_ops.py
@@ -1,34 +1,37 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 # Owner(s): ["oncall: distributed"]
 
-import torch
 import sys
 import unittest
 import warnings
 
-from torch.overrides import resolve_name
-from torch.utils._pytree import tree_flatten, tree_map
-from torch.testing._internal.common_utils import (
-    suppress_warnings,
-    TEST_WITH_ASAN,
-    run_tests,
-)
+import torch
 import torch.distributed as dist
+import torch.testing._internal.common_methods_invocations as common_ops
+
+from torch.distributed._tensor import DeviceMesh, DTensor, Replicate
+
+from torch.overrides import resolve_name
 from torch.testing._internal.common_device_type import (
-    ops,
     instantiate_device_type_tests,
+    ops,
 )
-import torch.testing._internal.common_methods_invocations as common_ops
 from torch.testing._internal.common_methods_invocations import DecorateInfo
-
-from torch.distributed._tensor import DTensor, DeviceMesh, Replicate
-from torch.testing._internal.distributed._tensor.dtensor_lagging_op_db import dtensor_lagging_op_db
+from torch.testing._internal.common_utils import (
+    run_tests,
+    suppress_warnings,
+    TEST_WITH_ASAN,
+)
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DEVICE_TYPE,
+    DTensorConverter,
     DTensorTestBase,
     TEST_SKIPS,
-    DTensorConverter,
-    DEVICE_TYPE,
 )
+from torch.testing._internal.distributed._tensor.dtensor_lagging_op_db import (
+    dtensor_lagging_op_db,
+)
+from torch.utils._pytree import tree_flatten, tree_map
 
 # rewrite common size variables to sth can be sharded evenly
 # we can enable uneven shards later, but need to adjust more on
@@ -695,9 +698,7 @@ def test():
 
 
 # only instantiate tests for DEVICE_TYPE alone (i.e. either CPU or GPU)
-instantiate_device_type_tests(
-    TestDTensorOps, globals(), only_for=(DEVICE_TYPE,)
-)
+instantiate_device_type_tests(TestDTensorOps, globals(), only_for=(DEVICE_TYPE,))
 
 
 if __name__ == "__main__":
diff --git a/test/distributed/_tensor/test_math_ops.py b/test/distributed/_tensor/test_math_ops.py
index 403f22d8325ed..72bfd9c9d6d05 100644
--- a/test/distributed/_tensor/test_math_ops.py
+++ b/test/distributed/_tensor/test_math_ops.py
@@ -1,17 +1,18 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 # Owner(s): ["oncall: distributed"]
 
+import itertools
+
 import torch
-from torch.testing._internal.common_utils import run_tests
 
 from torch.distributed._tensor import distribute_tensor
-from torch.distributed._tensor.placement_types import Shard, Replicate
+from torch.distributed._tensor.placement_types import Replicate, Shard
+from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
-    with_comms,
     skip_unless_torch_gpu,
+    with_comms,
 )
-import itertools
 
 
 class DistMathOpsTest(DTensorTestBase):
@@ -33,14 +34,10 @@ def test_sum(self):
                 dt_dim_sumed_tensor = mat1.sum(*sum_args).redistribute(
                     device_mesh, [Replicate()] * device_mesh.ndim
                 )
-                self.assertEqual(
-                    dt_dim_sumed_tensor.to_local(), dim_sumed_tensor
-                )
+                self.assertEqual(dt_dim_sumed_tensor.to_local(), dim_sumed_tensor)
 
         full_sumed_tensor = tensor_to_sum.sum()
-        dt_sum = mat1.sum().redistribute(
-            device_mesh, [Replicate()] * device_mesh.ndim
-        )
+        dt_sum = mat1.sum().redistribute(device_mesh, [Replicate()] * device_mesh.ndim)
         self.assertEqual(dt_sum.to_local(), full_sumed_tensor)
 
     # TODO: forward test can be removed once test_softmax_with_bwd passes on CPU
@@ -89,9 +86,7 @@ def test_softmax_with_bwd(self):
 
         for params in test_list:
             softmax_dim, shard_dim = params
-            x = torch.rand(
-                8, 12, 16, device=self.device_type, requires_grad=True
-            )
+            x = torch.rand(8, 12, 16, device=self.device_type, requires_grad=True)
             self.assertTrue(x.requires_grad)
             local_y = torch.nn.functional.softmax(
                 x, dim=softmax_dim, dtype=torch.float32
@@ -107,18 +102,14 @@ def test_softmax_with_bwd(self):
                     dist_softmax = dist_x.softmax(dim=softmax_dim)
             else:
                 dist_softmax = dist_x.softmax(dim=softmax_dim)
-                self.assertTrue(
-                    dist_softmax.placements[0].is_shard(dim=shard_dim)
-                )
+                self.assertTrue(dist_softmax.placements[0].is_shard(dim=shard_dim))
                 dist_y = dist_softmax.sum()
                 dist_y = dist_y.redistribute(device_mesh, [Replicate()])
                 self.assertEqual(dist_y.to_local(), local_y)
                 self.assertIsNone(dist_x.grad)
                 dist_y.backward()
                 self.assertIsNotNone(dist_x.grad)
-                dist_x_grad = dist_x.grad.redistribute(
-                    device_mesh, [Replicate()]
-                )
+                dist_x_grad = dist_x.grad.redistribute(device_mesh, [Replicate()])
                 self.assertEqual(dist_x_grad.to_local(), x.grad)
 
 
diff --git a/test/distributed/_tensor/test_matrix_ops.py b/test/distributed/_tensor/test_matrix_ops.py
index ed2af130ac884..af9e16dc2c241 100644
--- a/test/distributed/_tensor/test_matrix_ops.py
+++ b/test/distributed/_tensor/test_matrix_ops.py
@@ -1,18 +1,24 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 # Owner(s): ["oncall: distributed"]
 
+import itertools
+from typing import cast, List, Optional
+
 import torch
-from torch.testing._internal.common_utils import run_tests
+from torch.distributed._tensor import DeviceMesh, distribute_tensor
 from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    Placement,
+    Replicate,
+    Shard,
+)
+from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
-    with_comms,
     skip_unless_torch_gpu,
+    with_comms,
 )
-from torch.distributed._tensor import distribute_tensor, DeviceMesh
-from torch.distributed._tensor.placement_types import Placement, Shard, Replicate, _Partial
-from typing import List, Optional, cast
-import itertools
 
 
 class DistMatrixOpsTest(DTensorTestBase):
@@ -30,9 +36,7 @@ def test_addmm(self):
         input = distribute_tensor(input_tensor, device_mesh, replica_spec)
 
         dist_res = torch.addmm(input, mat1, mat2)
-        local_res = torch.addmm(
-            input_tensor, tensor_to_shard, tensor_to_replicate
-        )
+        local_res = torch.addmm(input_tensor, tensor_to_shard, tensor_to_replicate)
         self.assertEqual(
             dist_res.redistribute(device_mesh, replica_spec).to_local(),
             local_res,
@@ -52,9 +56,7 @@ def test_addmm_auto_redistribute(self):
         input_tensor = torch.randn(4, requires_grad=True)
         input = distribute_tensor(input_tensor, device_mesh, replica_spec)
 
-        local_res = torch.addmm(
-            input_tensor, tensor_to_shard1, tensor_to_shard0
-        )
+        local_res = torch.addmm(input_tensor, tensor_to_shard1, tensor_to_shard0)
         dist_res = torch.addmm(input, mat1, mat2)
 
         # test if addmm output is a partial
@@ -99,9 +101,7 @@ def test_placement_comb(
             self.assertIsNotNone(dt1.grad)
 
         placement_specs = [shard0_spec, shard1_spec, replica_spec]
-        shard_specs_comb = list(
-            itertools.product(placement_specs, placement_specs)
-        )
+        shard_specs_comb = list(itertools.product(placement_specs, placement_specs))
         for spec in shard_specs_comb:
             test_placement_comb([spec[0]], [spec[1]])
 
@@ -147,15 +147,9 @@ def test_t_partial(self):
     @skip_unless_torch_gpu
     def test_baddbmm(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        tensor = torch.rand(
-            4, 4, 8, device=self.device_type, requires_grad=True
-        )
-        batch_1 = torch.rand(
-            4, 4, 8, device=self.device_type, requires_grad=True
-        )
-        batch_2 = torch.rand(
-            4, 8, 8, device=self.device_type, requires_grad=True
-        )
+        tensor = torch.rand(4, 4, 8, device=self.device_type, requires_grad=True)
+        batch_1 = torch.rand(4, 4, 8, device=self.device_type, requires_grad=True)
+        batch_2 = torch.rand(4, 8, 8, device=self.device_type, requires_grad=True)
 
         def test_placement_comb(
             tensor_placements: List[Placement],
@@ -165,15 +159,9 @@ def test_placement_comb(
             alpha: int,
             batch_1_grad: Optional[torch.Tensor],
         ) -> None:
-            tensor_dt = distribute_tensor(
-                tensor, device_mesh, tensor_placements
-            )
-            batch_1_dt = distribute_tensor(
-                batch_1, device_mesh, batch_1_placements
-            )
-            batch_2_dt = distribute_tensor(
-                batch_2, device_mesh, batch_2_placements
-            )
+            tensor_dt = distribute_tensor(tensor, device_mesh, tensor_placements)
+            batch_1_dt = distribute_tensor(batch_1, device_mesh, batch_1_placements)
+            batch_2_dt = distribute_tensor(batch_2, device_mesh, batch_2_placements)
             dist_res = cast(
                 DTensor,
                 torch.baddbmm(
@@ -289,9 +277,7 @@ def test_placement_comb(
         shard2_spec = Shard(2)
         replica_spec = Replicate()
         placement_specs = [shard0_spec, shard1_spec, shard2_spec, replica_spec]
-        shard_specs_comb = list(
-            itertools.product(placement_specs, placement_specs)
-        )
+        shard_specs_comb = list(itertools.product(placement_specs, placement_specs))
 
         # tests that currently pass
         for spec in shard_specs_comb:
diff --git a/test/distributed/_tensor/test_pointwise_ops.py b/test/distributed/_tensor/test_pointwise_ops.py
index 5069166dee279..5b5eccfcb2ec8 100644
--- a/test/distributed/_tensor/test_pointwise_ops.py
+++ b/test/distributed/_tensor/test_pointwise_ops.py
@@ -1,28 +1,28 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 # Owner(s): ["oncall: distributed"]
 
-from typing import Sequence, Any, Dict, Callable, Optional
+from typing import Any, Callable, Dict, Optional, Sequence
 from unittest import skip
 
 import torch
+
+import torch.utils._pytree as pytree
 from torch import Tensor
-from torch.testing._internal.common_utils import run_tests
-from torch.testing._internal.distributed._tensor.common_dtensor import (
-    DTensorTestBase,
-    with_comms,
-    skip_unless_torch_gpu,
-)
 
-from torch.distributed._tensor import DeviceMesh, DTensor, distribute_tensor
+from torch.distributed._tensor import DeviceMesh, distribute_tensor, DTensor
 from torch.distributed._tensor.placement_types import (
-    Shard,
-    Replicate,
     _Partial,
     Placement,
+    Replicate,
+    Shard,
 )
 from torch.distributed.distributed_c10d import ReduceOp
-
-import torch.utils._pytree as pytree
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    skip_unless_torch_gpu,
+    with_comms,
+)
 
 
 def no_op():
@@ -185,9 +185,7 @@ def test_activations(self):
         )
 
     @with_comms
-    @skip(
-        "testing RNG based ops is broken: https://github.com/pytorch/tau/issues/494"
-    )
+    @skip("testing RNG based ops is broken: https://github.com/pytorch/tau/issues/494")
     def test_dropout(self):
         device_mesh = self.build_device_mesh()
 
@@ -267,14 +265,10 @@ def test_mul_out(self):
         dtensor = DTensor.from_local(input_tensor, device_mesh, shard_spec)
 
         other_tensor = torch.randn(*input_size, device=self.device_type)
-        other_dtensor = DTensor.from_local(
-            other_tensor, device_mesh, shard_spec
-        )
+        other_dtensor = DTensor.from_local(other_tensor, device_mesh, shard_spec)
 
         output_tensor = torch.randn(*input_size, device=self.device_type)
-        output_dtensor = DTensor.from_local(
-            output_tensor, device_mesh, shard_spec
-        )
+        output_dtensor = DTensor.from_local(output_tensor, device_mesh, shard_spec)
         dt = torch.mul(dtensor, other_dtensor, out=output_dtensor)
         expected = torch.mul(input_tensor, other_tensor, out=output_tensor)
         self.assertEqual(input_tensor, dtensor.to_local())
diff --git a/test/distributed/_tensor/test_redistribute.py b/test/distributed/_tensor/test_redistribute.py
index 78fc991d615f9..70489e26791f4 100644
--- a/test/distributed/_tensor/test_redistribute.py
+++ b/test/distributed/_tensor/test_redistribute.py
@@ -2,7 +2,10 @@
 # Owner(s): ["oncall: distributed"]
 
 import itertools
+
 import torch
+from torch.distributed._tensor import DeviceMesh, distribute_tensor, DTensor
+from torch.distributed._tensor.placement_types import _Partial, Replicate, Shard
 
 from torch.testing._internal.common_utils import run_tests
 
@@ -10,8 +13,6 @@
     DTensorTestBase,
     with_comms,
 )
-from torch.distributed._tensor import distribute_tensor, DeviceMesh, DTensor
-from torch.distributed._tensor.placement_types import _Partial, Replicate, Shard
 
 
 class RedistributeTest(DTensorTestBase):
@@ -56,16 +57,10 @@ def test_shard_to_replicate_forward_backward(self):
     def test_replicate_to_replicate_forward_backward(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
         replica_spec = [Replicate()]
-        local_tensor = torch.randn(
-            12, 3, device=self.device_type, requires_grad=True
-        )
+        local_tensor = torch.randn(12, 3, device=self.device_type, requires_grad=True)
         # 1) test replicate -> replicate forward
-        replica_tensor = distribute_tensor(
-            local_tensor, device_mesh, replica_spec
-        )
-        reshard_replica_tensor = replica_tensor.redistribute(
-            device_mesh, replica_spec
-        )
+        replica_tensor = distribute_tensor(local_tensor, device_mesh, replica_spec)
+        reshard_replica_tensor = replica_tensor.redistribute(device_mesh, replica_spec)
         self.assertEqual(replica_tensor.size(), local_tensor.size())
         self.assertEqual(replica_tensor, reshard_replica_tensor)
 
@@ -96,17 +91,11 @@ def test_replicate_to_shard_forward_backward(self):
             local_replica = torch.randn(
                 input_size, device=self.device_type, requires_grad=True
             )
-            splitted_list = local_replica.tensor_split(
-                self.world_size, shard_dim
-            )
+            splitted_list = local_replica.tensor_split(self.world_size, shard_dim)
             # make local tensor as the element of the corresponding chunked list
             local_tensor = splitted_list[self.rank]
-            replica_tensor = distribute_tensor(
-                local_replica, device_mesh, replica_spec
-            )
-            reshard_tensor = replica_tensor.redistribute(
-                device_mesh, shard_spec
-            )
+            replica_tensor = distribute_tensor(local_replica, device_mesh, replica_spec)
+            reshard_tensor = replica_tensor.redistribute(device_mesh, shard_spec)
             self.assertEqual(reshard_tensor.size(), replica_tensor.size())
             self.assertEqual(reshard_tensor.placements, shard_spec)
             self.assertEqual(reshard_tensor.to_local(), local_tensor)
@@ -126,18 +115,12 @@ def test_partial_to_replicate_forward_backward(self):
         # replicate to partial internally, and also partial to replicate
         # backward should work as expected
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        partial_local = torch.randn(
-            12, 3, device=self.device_type, requires_grad=True
-        )
+        partial_local = torch.randn(12, 3, device=self.device_type, requires_grad=True)
         partial_spec = [_Partial()]
         replica_spec = [Replicate()]
         # test partial -> replicate, which trigger all_reduce
-        partial_tensor = DTensor.from_local(
-            partial_local, device_mesh, partial_spec
-        )
-        global_partial_tensor = partial_tensor.redistribute(
-            device_mesh, replica_spec
-        )
+        partial_tensor = DTensor.from_local(partial_local, device_mesh, partial_spec)
+        global_partial_tensor = partial_tensor.redistribute(device_mesh, replica_spec)
 
         self.assertEqual(partial_tensor.size(), partial_local.size())
         self.assertEqual(
@@ -153,37 +136,23 @@ def test_partial_to_replicate_forward_backward(self):
     @with_comms
     def test_replicate_to_partial(self):
         device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
-        local_tensor = torch.randn(
-            12, 3, device=self.device_type, requires_grad=True
-        )
+        local_tensor = torch.randn(12, 3, device=self.device_type, requires_grad=True)
         partial_spec = _Partial()
         replica_spec = Replicate()
         # 1) test replicate -> partial forward
-        replica_tensor = distribute_tensor(
-            local_tensor, device_mesh, [replica_spec]
-        )
-        with self.assertRaisesRegex(
-            RuntimeError, "Can not redistribute to _Partial"
-        ):
-            partial_tensor = replica_tensor.redistribute(
-                device_mesh, [partial_spec]
-            )
+        replica_tensor = distribute_tensor(local_tensor, device_mesh, [replica_spec])
+        with self.assertRaisesRegex(RuntimeError, "Can not redistribute to _Partial"):
+            partial_tensor = replica_tensor.redistribute(device_mesh, [partial_spec])
 
         from torch.distributed._tensor.redistribute import Redistribute
 
-        partial_tensor = Redistribute.apply(
-            replica_tensor, device_mesh, [partial_spec]
-        )
+        partial_tensor = Redistribute.apply(replica_tensor, device_mesh, [partial_spec])
         self.assertEqual(partial_tensor.size(), local_tensor.size())
         # test it successfully zero out the contents on other ranks
         if self.rank == 0:
-            self.assertEqual(
-                replica_tensor.to_local(), partial_tensor.to_local()
-            )
+            self.assertEqual(replica_tensor.to_local(), partial_tensor.to_local())
         else:
-            self.assertEqual(
-                partial_tensor.to_local(), torch.zeros_like(local_tensor)
-            )
+            self.assertEqual(partial_tensor.to_local(), torch.zeros_like(local_tensor))
 
         # replicate to partial on sub groups
         local_tensor = torch.randn(12, 3, device=self.device_type)
@@ -203,13 +172,9 @@ def test_replicate_to_partial(self):
         if self.rank != 3:
             # replicate to partial should only zero out rank 3, and leave
             # rank 0/2 (rank0 on mesh dim 1) and 0, 1 (rank0 on mesh dim 1) un-touched
-            self.assertEqual(
-                replica_tensor.to_local(), partial_tensor.to_local()
-            )
+            self.assertEqual(replica_tensor.to_local(), partial_tensor.to_local())
         else:
-            self.assertEqual(
-                replica_tensor.to_local(), torch.zeros_like(local_tensor)
-            )
+            self.assertEqual(replica_tensor.to_local(), torch.zeros_like(local_tensor))
 
     @with_comms
     def test_partial_to_shard(self):
@@ -237,9 +202,7 @@ def test_partial_to_shard(self):
             local_shape = list(input_size)
             local_shape[shard_dim] = quot + (1 if self.rank < rem else 0)
             # test partial to shard, trigger reduce_scatter
-            scatter_shard_tensor = partial_tensor.redistribute(
-                device_mesh, shard_spec
-            )
+            scatter_shard_tensor = partial_tensor.redistribute(device_mesh, shard_spec)
             self.assertEqual(scatter_shard_tensor.size(), partial_tensor.size())
             self.assertEqual(scatter_shard_tensor.placements, shard_spec)
             self.assertEqual(
@@ -268,23 +231,15 @@ def test_multi_dim_mesh(self):
                 # because distribute_tensor is expected to override shards in ranks != 0
                 full_tensor = torch.ones(*tensor_shape)
 
-            possibilities = [Replicate()] + [
-                Shard(i) for i in range(full_tensor.ndim)
-            ]
-            all_outputs = list(
-                itertools.product(*(mesh_shape.ndim * [possibilities]))
-            )
+            possibilities = [Replicate()] + [Shard(i) for i in range(full_tensor.ndim)]
+            all_outputs = list(itertools.product(*(mesh_shape.ndim * [possibilities])))
             all_inputs = list(
-                itertools.product(
-                    *(mesh_shape.ndim * [possibilities + [_Partial()]])
-                )
+                itertools.product(*(mesh_shape.ndim * [possibilities + [_Partial()]]))
             )
 
             for inputs in all_inputs:
                 # if partial, temporarily make it Replicated, then replace replicated with partial afterwards
-                repl_inputs = [
-                    Replicate() if s.is_partial() else s for s in inputs
-                ]
+                repl_inputs = [Replicate() if s.is_partial() else s for s in inputs]
                 dt = distribute_tensor(full_tensor, device_mesh, repl_inputs)
 
                 if repl_inputs != inputs:
diff --git a/test/distributed/_tensor/test_tensor_ops.py b/test/distributed/_tensor/test_tensor_ops.py
index 1ba3f6d5f95b6..254b365e34dc2 100644
--- a/test/distributed/_tensor/test_tensor_ops.py
+++ b/test/distributed/_tensor/test_tensor_ops.py
@@ -2,14 +2,14 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
+from torch.distributed._tensor import DeviceMesh, distribute_tensor, DTensor
+from torch.distributed._tensor.placement_types import _Partial, Replicate, Shard
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorConverter,
     DTensorTestBase,
     with_comms,
 )
-from torch.distributed._tensor import distribute_tensor, DeviceMesh, DTensor
-from torch.distributed._tensor.placement_types import Shard, Replicate, _Partial
 
 
 class DistTensorOpsTest(DTensorTestBase):
@@ -92,9 +92,7 @@ def test_inplace_op(self):
         shard_spec = [Shard(0)]
         partial_spec = [_Partial()]
         dt_to_inplace_add = distribute_tensor(input_tensor, mesh, shard_spec)
-        partial_grad = DTensor.from_local(
-            torch.randn(12, 3), mesh, partial_spec
-        )
+        partial_grad = DTensor.from_local(torch.randn(12, 3), mesh, partial_spec)
         res = dt_to_inplace_add.add_(partial_grad)
         self.assertTrue(res is dt_to_inplace_add)
         self.assertTrue(res.placements == shard_spec)
@@ -191,9 +189,7 @@ def test_fill_inplace_partial_sum(self):
         assert dist_tensor.shape == (4, 8)
 
         torch.fill_(dist_tensor, 42)
-        fill_expected = torch.full(
-            dist_tensor.shape, 42, dtype=input_tensor.dtype
-        )
+        fill_expected = torch.full(dist_tensor.shape, 42, dtype=input_tensor.dtype)
         self.assertEqual(
             fill_expected,
             dist_tensor.redistribute(device_mesh, [Replicate()]).to_local(),
@@ -252,9 +248,7 @@ def _test_op(self, mesh, op_call, *args, **kwargs):
     @with_comms
     def test_index(self):
         meshes = [
-            DeviceMesh(
-                self.device_type, list(range(self.world_size))
-            ),  # 1D mesh
+            DeviceMesh(self.device_type, list(range(self.world_size))),  # 1D mesh
             # TODO(@azzolini): un-comment when DTensorConverter supports N-D mesh
             # DeviceMesh(self.device_type, torch.arange(self.world_size).reshape(2, -1)), # 2D mesh
         ]
diff --git a/test/distributed/_tensor/test_tp_sharding_ops.py b/test/distributed/_tensor/test_tp_sharding_ops.py
index acd28fe3a3065..ef4d635f6ef76 100644
--- a/test/distributed/_tensor/test_tp_sharding_ops.py
+++ b/test/distributed/_tensor/test_tp_sharding_ops.py
@@ -2,12 +2,18 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
+from torch.distributed._tensor import (
+    DeviceMesh,
+    distribute_tensor,
+    DTensor,
+    Replicate,
+    Shard,
+)
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     with_comms,
 )
-from torch.distributed._tensor import DeviceMesh, DTensor, Shard, Replicate, distribute_tensor
 
 
 class TPShardingOpsTest(DTensorTestBase):
@@ -22,9 +28,7 @@ def test_sharded_view(self):
         tensor = torch.rand(16, 35, 26)
         sharding = [Shard(0)]
         st = distribute_tensor(tensor, device_mesh, sharding).view(8, 4, 35, 13)
-        st_new = distribute_tensor(
-            tensor.view(8, 4, 35, 13), device_mesh, sharding
-        )
+        st_new = distribute_tensor(tensor.view(8, 4, 35, 13), device_mesh, sharding)
         self.assertEqual(st.to_local(), st_new.to_local())
         self.assertEqual(st.placements[0], st_new.placements[0])
 
diff --git a/test/distributed/_tensor/test_view_ops.py b/test/distributed/_tensor/test_view_ops.py
index c1c5a03b91132..fa502d2b56031 100644
--- a/test/distributed/_tensor/test_view_ops.py
+++ b/test/distributed/_tensor/test_view_ops.py
@@ -1,32 +1,32 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
 # Owner(s): ["oncall: distributed"]
 
-from typing import List, cast
-from torch.distributed._tensor.placement_types import Placement
-from torch.testing._internal.distributed._tensor.common_dtensor import (
-    DTensorTestBase,
-    redistribute_profiler,
-    with_comms,
-)
-from torch.distributed._tensor import DeviceMesh, Shard, Replicate, distribute_tensor
+import itertools
+from typing import cast, List
+
+import torch
+import torch.distributed as dist
+from torch import rand, randn, Tensor
+from torch.distributed._tensor import DeviceMesh, distribute_tensor, Replicate, Shard
 from torch.distributed._tensor.ops.view_ops import (
-    ops,
-    Singleton,
     Broadcast,
     Flatten,
+    InputDim,
+    ops,
     Repeat,
+    Singleton,
     Split,
-    InputDim,
     view_groups,
 )
-from torch import Tensor, rand, randn
+from torch.distributed._tensor.placement_types import Placement
 from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    redistribute_profiler,
+    with_comms,
+)
 from torch.utils._pytree import tree_flatten
 
-import itertools
-import torch
-import torch.distributed as dist
-
 
 class TestViewOps(DTensorTestBase):
     def test_view_groups(self):
@@ -75,12 +75,8 @@ def test_view_groups(self):
         self.assertEquals(
             view_groups([2, 3, 4, 5, 7], [3, 8, 7, 5]),
             (
-                Split(
-                    Flatten((InputDim(0), InputDim(1), InputDim(2))), (3, 8), 0
-                ),
-                Split(
-                    Flatten((InputDim(0), InputDim(1), InputDim(2))), (3, 8), 1
-                ),
+                Split(Flatten((InputDim(0), InputDim(1), InputDim(2))), (3, 8), 0),
+                Split(Flatten((InputDim(0), InputDim(1), InputDim(2))), (3, 8), 1),
                 Split(Flatten((InputDim(3), InputDim(4))), (7, 5), 0),
                 Split(Flatten((InputDim(3), InputDim(4))), (7, 5), 1),
             ),
@@ -159,9 +155,7 @@ def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh):
             no_shard_dims.add(kwargs.get("dim", 0))
 
         sharding_choices = cast(List[Placement], [Replicate()]) + [
-            Shard(i)
-            for i, s in enumerate(in_shape)
-            if s > 1 and i not in no_shard_dims
+            Shard(i) for i, s in enumerate(in_shape) if s > 1 and i not in no_shard_dims
         ]
 
         all_sharding_choices = itertools.product(
@@ -175,9 +169,7 @@ def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh):
             with redistribute_profiler() as profiler:
                 out_dt = op(in_dt, *args[1:], **kwargs)
 
-            self.assertEqual(
-                profiler.num_calls, 0, "Expected no redistribution."
-            )
+            self.assertEqual(profiler.num_calls, 0, "Expected no redistribution.")
 
             full_out = out_dt.redistribute(
                 device_mesh, device_mesh.ndim * [Replicate()]
@@ -198,19 +190,11 @@ def test_view_ops(self):
         )
         self.dimmap_test(torch.atleast_1d, (randn(()),), (Singleton(),))
         self.dimmap_test(torch.atleast_1d, (randn(24),), (InputDim(0),))
-        self.dimmap_test(
-            torch.atleast_1d, (randn(24, 36),), (InputDim(0), InputDim(1))
-        )
+        self.dimmap_test(torch.atleast_1d, (randn(24, 36),), (InputDim(0), InputDim(1)))
 
-        self.dimmap_test(
-            torch.atleast_2d, (randn(()),), (Singleton(), Singleton())
-        )
-        self.dimmap_test(
-            torch.atleast_2d, (randn(24),), (Singleton(), InputDim(0))
-        )
-        self.dimmap_test(
-            torch.atleast_2d, (randn(24, 36),), (InputDim(0), InputDim(1))
-        )
+        self.dimmap_test(torch.atleast_2d, (randn(()),), (Singleton(), Singleton()))
+        self.dimmap_test(torch.atleast_2d, (randn(24),), (Singleton(), InputDim(0)))
+        self.dimmap_test(torch.atleast_2d, (randn(24, 36),), (InputDim(0), InputDim(1)))
         self.dimmap_test(
             torch.atleast_2d,
             (randn(24, 36, 48),),

From 6cebb4703887f8305a418778cfb7b665a7c6ccad Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Thu, 1 Dec 2022 02:15:06 +0000
Subject: [PATCH 1485/1922] [tp] umft distributed.tensor.parallel (#89969)

cmd: `ufmt format torch/distributed/tensor`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89969
Approved by: https://github.com/fduwjj
---
 torch/distributed/tensor/parallel/__init__.py | 11 ++---
 torch/distributed/tensor/parallel/_utils.py   | 14 ++----
 .../tensor/parallel/_view_with_dim_change.py  |  2 +-
 torch/distributed/tensor/parallel/api.py      | 19 ++++----
 torch/distributed/tensor/parallel/fsdp.py     | 46 +++++++------------
 .../tensor/parallel/multihead_attention_tp.py |  8 ++--
 torch/distributed/tensor/parallel/style.py    | 22 ++++-----
 7 files changed, 47 insertions(+), 75 deletions(-)

diff --git a/torch/distributed/tensor/parallel/__init__.py b/torch/distributed/tensor/parallel/__init__.py
index fc0c760d6ccdc..760d4b24cb7c2 100644
--- a/torch/distributed/tensor/parallel/__init__.py
+++ b/torch/distributed/tensor/parallel/__init__.py
@@ -1,23 +1,20 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
+from torch.distributed.tensor.parallel.api import parallelize_module
 from torch.distributed.tensor.parallel.multihead_attention_tp import (
     TensorParallelMultiheadAttention,
 )
 
 from torch.distributed.tensor.parallel.style import (
     ColwiseParallel,
-    PairwiseParallel,
-    ParallelStyle,
-    RowwiseParallel,
     make_input_replicate_1d,
     make_input_shard_1d,
     make_input_shard_1d_dim_last,
     make_output_replicate_1d,
     make_output_shard_1d,
     make_output_tensor,
-)
-
-from torch.distributed.tensor.parallel.api import (
-    parallelize_module,
+    PairwiseParallel,
+    ParallelStyle,
+    RowwiseParallel,
 )
 
 __all__ = [
diff --git a/torch/distributed/tensor/parallel/_utils.py b/torch/distributed/tensor/parallel/_utils.py
index c63fe638b351b..e1a74960f1644 100644
--- a/torch/distributed/tensor/parallel/_utils.py
+++ b/torch/distributed/tensor/parallel/_utils.py
@@ -1,8 +1,8 @@
 import functools
+from typing import Callable, Optional, Union
 
 import torch
 from torch.distributed._tensor import DeviceMesh, DTensor
-from typing import Callable, Optional, Union
 
 _PrepareInputType = Callable[
     [Union[torch.Tensor, DTensor], Optional[DeviceMesh], Optional[int]], DTensor
@@ -53,9 +53,7 @@ def wrapper(*args, **kwargs):  # pyre-ignore[2, 3]
                 device_mesh = input.device_mesh
                 args = (*args[:1], device_mesh, *args[2:])  # pyre-ignore[60]
             else:
-                raise RuntimeError(
-                    "device_mesh is not passed nor can be inferred"
-                )
+                raise RuntimeError("device_mesh is not passed nor can be inferred")
         if device_mesh.ndim != 1:
             raise RuntimeError(
                 f"device_mesh has dims {device_mesh.ndim} but expcted to be 1"
@@ -111,9 +109,7 @@ def wrapper(*args, **kwargs):  # pyre-ignore[2, 3]
     return wrapper
 
 
-def _create_1d_device_mesh(
-    device_mesh: DeviceMesh, tp_mesh_dim: int = 0
-) -> DeviceMesh:
+def _create_1d_device_mesh(device_mesh: DeviceMesh, tp_mesh_dim: int = 0) -> DeviceMesh:
     """
     This function converts a N-D ``device_mesh`` into a 1D ``device_mesh``
     for 1D Tensor Parallelism.
@@ -130,9 +126,7 @@ def _create_1d_device_mesh(
         device_mesh (DeviceMesh): 1-D :class:``DeviceMesh`` object that
             Tensor Parallelism operates on.
     """
-    assert (
-        tp_mesh_dim < device_mesh.ndim and tp_mesh_dim >= -device_mesh.ndim
-    ), (
+    assert tp_mesh_dim < device_mesh.ndim and tp_mesh_dim >= -device_mesh.ndim, (
         f"Expect tp_mesh_dim within range [{-device_mesh.ndim},"
         f" {device_mesh.ndim}), but found {tp_mesh_dim}."
     )
diff --git a/torch/distributed/tensor/parallel/_view_with_dim_change.py b/torch/distributed/tensor/parallel/_view_with_dim_change.py
index 7988129318b78..e2e1cc547178a 100644
--- a/torch/distributed/tensor/parallel/_view_with_dim_change.py
+++ b/torch/distributed/tensor/parallel/_view_with_dim_change.py
@@ -3,8 +3,8 @@
 
 import torch
 from torch.distributed._tensor import DTensor as DT
-from torch.distributed._tensor.placement_types import Shard
 from torch.distributed._tensor.ops.utils import prod
+from torch.distributed._tensor.placement_types import Shard
 
 
 def _view_with_sharding_dim_change(
diff --git a/torch/distributed/tensor/parallel/api.py b/torch/distributed/tensor/parallel/api.py
index e1f2328de5964..2e27d83859c82 100644
--- a/torch/distributed/tensor/parallel/api.py
+++ b/torch/distributed/tensor/parallel/api.py
@@ -1,22 +1,25 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import Dict, Union
+
 import torch
 import torch.nn as nn
-from typing import Union, Dict
 from torch.distributed._tensor import (
+    DeviceMesh,
     distribute_module,
     distribute_tensor,
-    Shard,
     Replicate,
-    DeviceMesh,
+    Shard,
+)
+from torch.distributed.tensor.parallel._utils import _create_1d_device_mesh
+from torch.distributed.tensor.parallel.multihead_attention_tp import (
+    TensorParallelMultiheadAttention,
 )
-from torch.distributed.tensor.parallel import TensorParallelMultiheadAttention
 from torch.distributed.tensor.parallel.style import (
     ColwiseParallel,
     PairwiseParallel,
     ParallelStyle,
     RowwiseParallel,
 )
-from torch.distributed.tensor.parallel._utils import _create_1d_device_mesh
 
 
 __all__ = [
@@ -245,8 +248,7 @@ def _parallelize_linear(
 
     if not isinstance(parallel_style, ParallelStyle):
         raise RuntimeError(
-            "Expect a ParallelStyle object but received"
-            f" {type(parallel_style)}!"
+            "Expect a ParallelStyle object but received" f" {type(parallel_style)}!"
         )
 
     if device_mesh.ndim > 1:
@@ -306,8 +308,7 @@ def _parallelize_multihead_attn(
 
     if not isinstance(parallel_style, PairwiseParallel):
         raise NotImplementedError(
-            "Only support PairwiseParallel for Multihead Attention"
-            " parallelization."
+            "Only support PairwiseParallel for Multihead Attention" " parallelization."
         )
 
     if device_mesh.ndim > 1:
diff --git a/torch/distributed/tensor/parallel/fsdp.py b/torch/distributed/tensor/parallel/fsdp.py
index 55b34f10d7437..3ab3a32dc04b9 100644
--- a/torch/distributed/tensor/parallel/fsdp.py
+++ b/torch/distributed/tensor/parallel/fsdp.py
@@ -1,17 +1,12 @@
-import warnings
 import copy
-from typing import List, NamedTuple, Optional, Tuple, cast
+import warnings
+from typing import cast, List, NamedTuple, Optional, Tuple
 
 import torch
 import torch.distributed as dist
-import torch.distributed.distributed_c10d as c10d
-
-from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor
 
 import torch.distributed._shard.sharding_spec as shard_spec
-from torch.distributed._shard.sharding_spec.chunk_sharding_spec import (
-    ChunkShardingSpec,
-)
+import torch.distributed.distributed_c10d as c10d
 
 from torch.distributed._shard.sharded_tensor import (
     Shard,
@@ -20,19 +15,20 @@
     TensorProperties,
 )
 
-from torch.distributed._shard.sharding_spec import (
-    ShardMetadata,
-)
-
-from torch.distributed.remote_device import _remote_device
+from torch.distributed._shard.sharding_spec import ShardMetadata
+from torch.distributed._shard.sharding_spec.chunk_sharding_spec import ChunkShardingSpec
 
 from torch.distributed._tensor import (
-    DTensor as DistributedTensor,
     DeviceMesh,
+    DTensor as DistributedTensor,
     Shard as DShard,
 )
 from torch.distributed._tensor.placement_types import Placement
 
+from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor
+
+from torch.distributed.remote_device import _remote_device
+
 __all__ = ["is_available"]
 
 
@@ -62,9 +58,7 @@ def _get_box(tensor: DistributedTensor) -> Tuple[torch.Size, torch.Size]:
     return (torch.Size(offsets), tensor._local_tensor.size())
 
 
-def _get_box_for(
-    tensor: DistributedTensor, idx: int
-) -> Tuple[torch.Size, torch.Size]:
+def _get_box_for(tensor: DistributedTensor, idx: int) -> Tuple[torch.Size, torch.Size]:
     offsets, size = _get_box(tensor)
     return (torch.Size([val * idx for val in offsets]), size)
 
@@ -76,9 +70,7 @@ def _get_local_box(tensor: DistributedTensor) -> Tuple[torch.Size, torch.Size]:
     return _get_box_for(tensor, dim_0_coord)
 
 
-def _create_shard_md_from_dt(
-    dt: DistributedTensor, current_rank: int
-) -> ShardMetadata:
+def _create_shard_md_from_dt(dt: DistributedTensor, current_rank: int) -> ShardMetadata:
     mesh = dt.device_mesh
     assert mesh.ndim == 1, "Only 1D DeviceMeshes currently handled"
 
@@ -159,9 +151,7 @@ def _rewrite_spec_if_needed(
         for i, placement in enumerate(spec.placements):
             placement = cast(_remote_device, placement)
             if placement.rank() == rank and placement.device() != tensor.device:
-                spec.placements[i] = _remote_device(
-                    f"rank:{rank}/{tensor.device}"
-                )
+                spec.placements[i] = _remote_device(f"rank:{rank}/{tensor.device}")
 
     return spec
 
@@ -268,9 +258,7 @@ def _chunk_tensor(
         dt_pg = _get_dt_pg(tensor)
         # We do this differently here, we create a ST with no local shards then patch it
         shards = [
-            Shard(
-                inner_st, _create_shard_md_from_dt(tensor, dist.get_rank(dt_pg))
-            )
+            Shard(inner_st, _create_shard_md_from_dt(tensor, dist.get_rank(dt_pg)))
         ]
 
         st_meta = _create_sharded_tensor_md_from_dt(tensor, dt_pg)
@@ -307,11 +295,11 @@ def _pre_load_state_dict(
 
 
 try:
+    from torch.distributed.fsdp._common_utils import _set_fsdp_flattened
     from torch.distributed.fsdp._fsdp_extensions import (
         _set_fsdp_extensions,
         FSDPExtensions,
     )
-    from torch.distributed.fsdp._common_utils import _set_fsdp_flattened
 
     class DTensorExtensions(FSDPExtensions):
         def pre_flatten_transform(
@@ -333,9 +321,7 @@ def chunk_tensor(
             num_devices_per_node: int,
             pg: dist.ProcessGroup,
         ) -> torch.Tensor:
-            return _chunk_tensor(
-                tensor, rank, world_size, num_devices_per_node, pg
-            )
+            return _chunk_tensor(tensor, rank, world_size, num_devices_per_node, pg)
 
         def pre_load_state_dict_transform(
             self,
diff --git a/torch/distributed/tensor/parallel/multihead_attention_tp.py b/torch/distributed/tensor/parallel/multihead_attention_tp.py
index ff5c547b396ba..d5b7f0c25bd3e 100644
--- a/torch/distributed/tensor/parallel/multihead_attention_tp.py
+++ b/torch/distributed/tensor/parallel/multihead_attention_tp.py
@@ -3,6 +3,8 @@
 
 import math
 
+from typing import Optional, Union
+
 import torch
 from torch.distributed._tensor import DTensor as DT
 from torch.distributed._tensor.placement_types import Shard
@@ -10,8 +12,6 @@
     _view_with_sharding_dim_change,
 )
 
-from typing import Optional, Union
-
 __all__ = ["TensorParallelMultiheadAttention"]
 
 
@@ -183,9 +183,7 @@ def forward(
             query_layer = _view_with_sharding_dim_change(
                 query_layer, 1, (sq, b * nh, -1)
             )
-            key_layer = _view_with_sharding_dim_change(
-                key_layer, 1, (sq, b * nh, -1)
-            )
+            key_layer = _view_with_sharding_dim_change(key_layer, 1, (sq, b * nh, -1))
             value_layer = _view_with_sharding_dim_change(
                 value_layer, 1, (sq, b * nh, -1)
             )
diff --git a/torch/distributed/tensor/parallel/style.py b/torch/distributed/tensor/parallel/style.py
index 05ac1db708c1c..34a160ab14ada 100644
--- a/torch/distributed/tensor/parallel/style.py
+++ b/torch/distributed/tensor/parallel/style.py
@@ -1,14 +1,14 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates
-from abc import abstractmethod
+from abc import ABC, abstractmethod
+from typing import Optional, Union
+
 import torch
-from abc import ABC
-from typing import Union, Optional
-from torch.distributed._tensor import DTensor, Shard, Replicate, DeviceMesh
+from torch.distributed._tensor import DeviceMesh, DTensor, Replicate, Shard
 from torch.distributed.tensor.parallel._utils import (
-    _PrepareInputType,
-    _PrepareOutputType,
     _prepare_input_validate,
     _prepare_output_validate,
+    _PrepareInputType,
+    _PrepareOutputType,
 )
 
 __all__ = [
@@ -21,7 +21,7 @@
     "make_input_shard_1d_dim_last",
     "make_output_replicate_1d",
     "make_output_tensor",
-    "make_output_shard_1d"
+    "make_output_shard_1d",
 ]
 
 
@@ -104,9 +104,7 @@ def make_input_shard_1d(
     if isinstance(input, DTensor):
         return input.redistribute(device_mesh, shard_spec)
     elif isinstance(input, torch.Tensor):
-        return DTensor.from_local(
-            input, device_mesh, shard_spec, run_check=False
-        )
+        return DTensor.from_local(input, device_mesh, shard_spec, run_check=False)
     else:
         raise RuntimeError(
             "Tensor parallel module expects torch.Tensor or DTensor input but"
@@ -163,9 +161,7 @@ def make_input_replicate_1d(
     if isinstance(input, DTensor):
         return input.redistribute(device_mesh, replicate)
     elif isinstance(input, torch.Tensor):
-        return DTensor.from_local(
-            input, device_mesh, replicate, run_check=False
-        )
+        return DTensor.from_local(input, device_mesh, replicate, run_check=False)
     else:
         raise RuntimeError(
             "Tensor parallel module expects torch.Tensor or DTensor input but"

From 337bf9fc672d0ad00de21c215f25cbefd1025906 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Thu, 1 Dec 2022 02:15:06 +0000
Subject: [PATCH 1486/1922] [tp] ufmt test/distributed/tensor (#89970)

formatting stack to make dtensor and tp align with pytorch format standard.

cmd: `ufmt format test/distributed/tensor`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89970
Approved by: https://github.com/fduwjj
---
 .../tensor/parallel/test_2d_parallel.py       | 50 +++++++-----------
 .../tensor/parallel/test_parallelize_api.py   | 52 ++++++-------------
 .../tensor/parallel/test_tp_examples.py       | 29 ++++-------
 .../tensor/parallel/test_tp_style.py          | 43 ++++++---------
 .../parallel/test_view_sharding_dim_change.py |  8 +--
 5 files changed, 67 insertions(+), 115 deletions(-)

diff --git a/test/distributed/tensor/parallel/test_2d_parallel.py b/test/distributed/tensor/parallel/test_2d_parallel.py
index f203846cb711a..e71be70ae9ab8 100644
--- a/test/distributed/tensor/parallel/test_2d_parallel.py
+++ b/test/distributed/tensor/parallel/test_2d_parallel.py
@@ -2,28 +2,20 @@
 
 from typing import Any
 
-
 import torch
-import torch.nn.functional as F
 import torch.distributed as dist
+
+import torch.distributed.distributed_c10d as distributed_c10d
+import torch.nn.functional as F
 from torch.distributed._shard.sharded_tensor.api import ShardedTensor
+from torch.distributed._tensor import DeviceMesh, DTensor as DT, Replicate
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
-from torch.distributed._tensor import (
-    DeviceMesh,
-    DTensor as DT,
-    Replicate,
-)
-from torch.distributed.tensor.parallel import (
-    PairwiseParallel,
-    parallelize_module,
-)
-
-import torch.distributed.distributed_c10d as distributed_c10d
+from torch.distributed.tensor.parallel import PairwiseParallel, parallelize_module
+from torch.distributed.tensor.parallel.fsdp import is_available
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 
 from torch.testing._internal.common_utils import run_tests
-from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.distributed.tensor.parallel.fsdp import is_available
 
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
@@ -64,9 +56,7 @@ def _distribute_and_fsdp_wrap_module(
         module.net2 = FSDP(
             module.net2, process_group=pg, use_orig_params=use_orig_params
         )
-    return FSDP(
-        module, process_group=pg, use_orig_params=use_orig_params
-    )
+    return FSDP(module, process_group=pg, use_orig_params=use_orig_params)
 
 
 def init_model(model_parallel_size=TP_DEGREE, use_orig_params=False, fsdp_nested=False):
@@ -100,9 +90,7 @@ def is_nested_tensor(val: Any) -> bool:
         if isinstance(val.local_shards()[0].tensor, DT):
             raise ValueError("Cannot handle DT nested insided ST")
     # Safety valve for when this eventually happen
-    elif isinstance(val, DT) and isinstance(
-        val._local_tensor, (DT, ShardedTensor)
-    ):
+    elif isinstance(val, DT) and isinstance(val._local_tensor, (DT, ShardedTensor)):
         raise ValueError("Cannot handle nested DT")
     return False
 
@@ -137,9 +125,7 @@ def test_2d_fsdp_integration_functionality(self) -> None:
         self.assertTrue(
             is_nested_tensor(optim_state["state"]["net1.weight"]["exp_avg"])
         )
-        self.assertFalse(
-            is_nested_tensor(optim_state["state"]["net3.bias"]["exp_avg"])
-        )
+        self.assertFalse(is_nested_tensor(optim_state["state"]["net3.bias"]["exp_avg"]))
 
     def _compare_params(self, m1, m2):
         with FSDP.summon_full_params(m1):
@@ -152,19 +138,21 @@ def _compare_params(self, m1, m2):
                     if name == "net2.bias" and self.rank != 0:
                         continue
                     if type(p2) is DT:
-                        p2 = p2.redistribute(
-                            p2.device_mesh, [Replicate()]
-                        ).to_local()
+                        p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
                     self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
 
-    def _test_2d_e2e_flow(self, use_orig_params=False, fsdp_nested=False, multi_param_group=False) -> None:
+    def _test_2d_e2e_flow(
+        self, use_orig_params=False, fsdp_nested=False, multi_param_group=False
+    ) -> None:
         if not is_available():
             self.skipTest("FSDP 2d parallel integration not available")
         torch.manual_seed(0)
         model = SimpleModel().cuda(self.rank)
         model = FSDP(model, use_orig_params=use_orig_params)
         torch.manual_seed(0)
-        model_2d, dp_pg = init_model(use_orig_params=use_orig_params, fsdp_nested=fsdp_nested)
+        model_2d, dp_pg = init_model(
+            use_orig_params=use_orig_params, fsdp_nested=fsdp_nested
+        )
         # Check named parameters are returning the same name at least.
         param_names_2d = [name for name, _ in model_2d.named_parameters()]
         for name, _ in model.named_parameters():
@@ -220,7 +208,9 @@ def test_2d_fsdp_integration_fsdp_nested(self) -> None:
     @with_comms
     @skip_if_lt_x_gpu(4)
     def test_2d_fsdp_integration_fsdp_nested_param_groups(self) -> None:
-        self._test_2d_e2e_flow(fsdp_nested=True, use_orig_params=True, multi_param_group=True)
+        self._test_2d_e2e_flow(
+            fsdp_nested=True, use_orig_params=True, multi_param_group=True
+        )
 
 
 if __name__ == "__main__":
diff --git a/test/distributed/tensor/parallel/test_parallelize_api.py b/test/distributed/tensor/parallel/test_parallelize_api.py
index bb2055575f69c..7375de3ef1814 100644
--- a/test/distributed/tensor/parallel/test_parallelize_api.py
+++ b/test/distributed/tensor/parallel/test_parallelize_api.py
@@ -1,23 +1,21 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
-from torch.testing._internal.common_utils import run_tests
-from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase, with_comms
-from torch.distributed._tensor import DeviceMesh, Replicate, DTensor
+from torch.distributed._tensor import DeviceMesh, DTensor, Replicate
+from torch.distributed.tensor.parallel._utils import _create_1d_device_mesh
+from torch.distributed.tensor.parallel.api import _parallelize_linear, _parallelize_mlp
 from torch.distributed.tensor.parallel.style import (
     ColwiseParallel,
+    make_input_replicate_1d,
+    make_output_replicate_1d,
     PairwiseParallel,
     ParallelStyle,
     RowwiseParallel,
 )
-from torch.distributed.tensor.parallel.api import (
-    _parallelize_linear,
-    _parallelize_mlp,
-)
-from torch.distributed.tensor.parallel._utils import _create_1d_device_mesh
-from torch.distributed.tensor.parallel.style import (
-    make_input_replicate_1d,
-    make_output_replicate_1d,
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
 )
 
 
@@ -55,18 +53,14 @@ def test_creat_1d_device_mesh(self):
         one_dimention_mesh_shape = mesh_shape[self.rank // dim_one_size, :]
         pg = mesh.get_dim_groups()[1]
         new_mesh = _create_1d_device_mesh(mesh, 1)
-        expected_mesh = DeviceMesh(
-            self.device_type, one_dimention_mesh_shape, [pg]
-        )
+        expected_mesh = DeviceMesh(self.device_type, one_dimention_mesh_shape, [pg])
         self.assertEqual(new_mesh.mesh, expected_mesh.mesh)
         self.assertEqual(new_mesh.device_type, expected_mesh.device_type)
         # When 1D dim is 0.
         one_dimention_mesh_shape = mesh_shape[:, self.rank % dim_one_size]
         pg = mesh.get_dim_groups()[0]
         new_mesh = _create_1d_device_mesh(mesh, 0)
-        expected_mesh = DeviceMesh(
-            self.device_type, one_dimention_mesh_shape, [pg]
-        )
+        expected_mesh = DeviceMesh(self.device_type, one_dimention_mesh_shape, [pg])
         self.assertEqual(new_mesh.mesh, expected_mesh.mesh)
         self.assertEqual(new_mesh.device_type, expected_mesh.device_type)
 
@@ -103,9 +97,7 @@ def _compare_params(
                     ).to_local(),
                 )
 
-    def _compare_module(
-        self, local_module, dist_module, inp_size, rowwise=False
-    ):
+    def _compare_module(self, local_module, dist_module, inp_size, rowwise=False):
         LR = 0.25  # the learning rate we use for testing
         local_optim = torch.optim.SGD(local_module.parameters(), lr=LR)
         dist_optim = torch.optim.SGD(dist_module.parameters(), lr=LR)
@@ -118,9 +110,7 @@ def _compare_module(
         inp = inp.chunk(self.world_size, dim=-1)[self.rank] if rowwise else inp
         dist_output = dist_module(inp)
         dist_output = (
-            dist_output.to_local()
-            if isinstance(dist_output, DTensor)
-            else dist_output
+            dist_output.to_local() if isinstance(dist_output, DTensor) else dist_output
         )
         self.assertEqual(local_output, dist_output)
 
@@ -147,9 +137,7 @@ def test_parallelize_mlp(self):
         self.assertEqual(model.net2.bias, model_tp.net2.bias)
 
         # Parallelize module.
-        device_mesh = DeviceMesh(
-            self.device_type, torch.arange(self.world_size)
-        )
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         model_tp = _parallelize_mlp(model_tp, device_mesh, PairwiseParallel())
         self._compare_module(model, model_tp, inp_size)
 
@@ -157,14 +145,10 @@ def test_parallelize_mlp(self):
     def test_parallelize_mlp_error(self):
         class DummyParallel(ParallelStyle):
             def __init__(self) -> None:
-                super().__init__(
-                    make_input_replicate_1d, make_output_replicate_1d
-                )
+                super().__init__(make_input_replicate_1d, make_output_replicate_1d)
 
         model_tp = MLPModule(self.device_type)
-        device_mesh = DeviceMesh(
-            self.device_type, torch.arange(self.world_size)
-        )
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         with self.assertRaisesRegex(
             NotImplementedError,
             "Only support PairwiseParallel for MLP parallelization.",
@@ -174,9 +158,7 @@ def __init__(self) -> None:
         with self.assertRaisesRegex(
             RuntimeError, "More than one nn.Linear needed for a MLP."
         ):
-            _parallelize_mlp(
-                torch.nn.Linear(10, 5), device_mesh, PairwiseParallel()
-            )
+            _parallelize_mlp(torch.nn.Linear(10, 5), device_mesh, PairwiseParallel())
 
     @with_comms
     def test_linear_row_wise_parallel(self):
diff --git a/test/distributed/tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py
index 963ea797fd839..12ee9b0b651c2 100644
--- a/test/distributed/tensor/parallel/test_tp_examples.py
+++ b/test/distributed/tensor/parallel/test_tp_examples.py
@@ -3,21 +3,18 @@
 
 import torch
 import torch.nn as nn
+from torch.distributed._tensor import DeviceMesh, Replicate
+from torch.distributed.tensor.parallel import (
+    PairwiseParallel,
+    parallelize_module,
+    TensorParallelMultiheadAttention,
+)
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
-    with_comms,
     NUM_DEVICES,
     skip_unless_torch_gpu,
-)
-from torch.distributed._tensor import (
-    DeviceMesh,
-    Replicate,
-)
-from torch.distributed.tensor.parallel import (
-    PairwiseParallel,
-    TensorParallelMultiheadAttention,
-    parallelize_module,
+    with_comms,
 )
 
 
@@ -300,24 +297,18 @@ def test_self_attn_replacement_megatron_e2e(self):
             add_bias_kv=True,
             device=self.device_type,
         )
-        model_tp = MultiheadAttnWrap(
-            16, 8, add_bias_kv=True, device=self.device_type
-        )
+        model_tp = MultiheadAttnWrap(16, 8, add_bias_kv=True, device=self.device_type)
 
         # TODO: somehow using torch.nn.MultiheadAttention's initial params does not work
         # Use TensorParallelMultiheadAttention parameters instead
         x = model.qkv.weight.clone().detach().requires_grad_()
-        model_tp.attn.register_parameter(
-            "in_proj_weight", torch.nn.Parameter(x)
-        )
+        model_tp.attn.register_parameter("in_proj_weight", torch.nn.Parameter(x))
 
         x = model.qkv.bias.clone().detach().requires_grad_()
         model_tp.attn.register_parameter("in_proj_bias", torch.nn.Parameter(x))
 
         x = model.proj.weight.clone().detach().requires_grad_()
-        model_tp.attn.out_proj.register_parameter(
-            "weight", torch.nn.Parameter(x)
-        )
+        model_tp.attn.out_proj.register_parameter("weight", torch.nn.Parameter(x))
 
         x = model.proj.bias.clone().detach().requires_grad_()
         model_tp.attn.out_proj.register_parameter("bias", torch.nn.Parameter(x))
diff --git a/test/distributed/tensor/parallel/test_tp_style.py b/test/distributed/tensor/parallel/test_tp_style.py
index d7ad841281cbd..7aeb086f03a4c 100644
--- a/test/distributed/tensor/parallel/test_tp_style.py
+++ b/test/distributed/tensor/parallel/test_tp_style.py
@@ -2,17 +2,20 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
-from torch.testing._internal.common_utils import run_tests
-from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase, with_comms
-from torch.distributed._tensor import distribute_tensor, DeviceMesh, Shard, Replicate
+from torch.distributed._tensor import DeviceMesh, distribute_tensor, Replicate, Shard
 from torch.distributed.tensor.parallel.style import (
-    RowwiseParallel,
     ColwiseParallel,
-    make_input_shard_1d,
     make_input_replicate_1d,
-    make_output_shard_1d,
+    make_input_shard_1d,
     make_output_replicate_1d,
+    make_output_shard_1d,
     make_output_tensor,
+    RowwiseParallel,
+)
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
 )
 
 
@@ -61,12 +64,8 @@ def test_make_input_shard_1d(self):
         self._1d_input_func_check(tensor, tensor, make_input_shard_1d)
 
     # Common logic for testing prepare output funcs
-    def _test_prepare_output(
-        self, func, spec, dim=None, device_mesh_input_none=False
-    ):
-        device_mesh = DeviceMesh(
-            self.device_type, torch.arange(self.world_size)
-        )
+    def _test_prepare_output(self, func, spec, dim=None, device_mesh_input_none=False):
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         tensor = torch.rand(8, 16, device=self.device_type)
         dtensor = distribute_tensor(tensor, device_mesh, spec)
         device_mesh_input = None if device_mesh_input_none else device_mesh
@@ -99,16 +98,12 @@ def test_make_output_replicate_1d(self):
         output, dtensor, device_mesh = self._test_prepare_output(
             make_output_replicate_1d, [Shard(0)]
         )
-        self.assertEqual(
-            output, dtensor.redistribute(device_mesh, [Replicate()])
-        )
+        self.assertEqual(output, dtensor.redistribute(device_mesh, [Replicate()]))
         # test when input device_mesh is None.
         output, dtensor, device_mesh = self._test_prepare_output(
             make_output_replicate_1d, [Shard(0)], None, True
         )
-        self.assertEqual(
-            output, dtensor.redistribute(device_mesh, [Replicate()])
-        )
+        self.assertEqual(output, dtensor.redistribute(device_mesh, [Replicate()]))
 
     @with_comms
     def test_make_output_tensor(self):
@@ -137,9 +132,7 @@ def test_make_output_tensor(self):
     # Common logic for testing prepare output funcs errors.
     def _test_prepare_output_error(self, func):
         tensor = torch.rand(8, 16, device=self.device_type)
-        device_mesh = DeviceMesh(
-            self.device_type, torch.arange(self.world_size)
-        )
+        device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         dtensor = distribute_tensor(tensor, device_mesh, [Shard(0)])
         output = [dtensor]
         with self.assertRaisesRegex(
@@ -173,16 +166,12 @@ def test_rowwise_parallel_style(self):
         output, dtensor, device_mesh = self._test_prepare_output(
             rs._prepare_input, [Shard(0)]
         )
-        self.assertEqual(
-            output, dtensor.redistribute(device_mesh, [Replicate()])
-        )
+        self.assertEqual(output, dtensor.redistribute(device_mesh, [Replicate()]))
         # test when input device_mesh is None.
         output, dtensor, device_mesh = self._test_prepare_output(
             rs._prepare_input, [Shard(0)], None, True
         )
-        self.assertEqual(
-            output, dtensor.redistribute(device_mesh, [Replicate()])
-        )
+        self.assertEqual(output, dtensor.redistribute(device_mesh, [Replicate()]))
         self._test_prepare_output_error(rs._prepare_output)
 
     @with_comms
diff --git a/test/distributed/tensor/parallel/test_view_sharding_dim_change.py b/test/distributed/tensor/parallel/test_view_sharding_dim_change.py
index b02382e20bccd..4c1475ef5dba5 100644
--- a/test/distributed/tensor/parallel/test_view_sharding_dim_change.py
+++ b/test/distributed/tensor/parallel/test_view_sharding_dim_change.py
@@ -2,15 +2,15 @@
 # Owner(s): ["oncall: distributed"]
 
 import torch
+from torch.distributed._tensor import DeviceMesh, DTensor, Shard
+from torch.distributed.tensor.parallel._view_with_dim_change import (
+    _view_with_sharding_dim_change,
+)
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     with_comms,
 )
-from torch.distributed._tensor import DeviceMesh, DTensor, Shard
-from torch.distributed.tensor.parallel._view_with_dim_change import (
-    _view_with_sharding_dim_change,
-)
 
 
 class TPViewShardingDimChangeTest(DTensorTestBase):

From 26bf7e4f5f388bf3444107f3c7499812cd759362 Mon Sep 17 00:00:00 2001
From: kshitij12345 <kshitijkalambarkar@gmail.com>
Date: Thu, 1 Dec 2022 21:03:11 +0000
Subject: [PATCH 1487/1922] [chalf] relax tolerance : conv_transpose2d (#89993)

Fixes https://github.com/pytorch/pytorch/issues/87332

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89993
Approved by: https://github.com/lezcano
---
 torch/testing/_internal/common_methods_invocations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index dd2cd7763b77f..d11c275cc220c 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -11089,7 +11089,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    toleranceOverride({torch.complex32: tol(atol=5e-2, rtol=5e-2)}),
                    "TestCudaFuserOpInfo", "test_nvfuser_correctness"),
                DecorateInfo(
-                   toleranceOverride({torch.chalf: tol(atol=5e-2, rtol=5e-2), }),
+                   toleranceOverride({torch.chalf: tol(atol=8e-2, rtol=8e-2), }),
                    'TestCommon', 'test_complex_half_reference_testing')],
            skips=(
                # RuntimeError: !lhs.isAliasOf(rhs)INTERNAL ASSERT FAILED at

From c866fab8c9681fec64a39c95bb15c2404a6f2859 Mon Sep 17 00:00:00 2001
From: Ajay Hotchandani <ajayh@meta.com>
Date: Thu, 1 Dec 2022 21:10:50 +0000
Subject: [PATCH 1488/1922] [aarch64] add sleef_arm dependency (#89988)

Reviewed By: kimishpatel, psaab

Differential Revision: D41601965

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89988
Approved by: https://github.com/soumith
---
 buckbuild.bzl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/buckbuild.bzl b/buckbuild.bzl
index 75c16ba006550..1fbae66c62910 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -147,6 +147,7 @@ THIRD_PARTY_LIBS = {
     "rt": ["//xplat/third-party/linker_lib:rt", "//third_party:rt"],
     "ruy": ["//third-party/ruy:ruy_xplat_lib", "//third_party:ruy_lib"],
     "typing-extensions": ["//third-party/typing-extensions:typing-extensions", "//third_party:typing-extensions"],
+    "sleef_arm": ["//third-party/sleef:sleef_arm", "//third_party:sleef_arm"],
 }
 
 def third_party(name):
@@ -1930,7 +1931,12 @@ def define_buck_targets(
                 third_party("glog"),
                 third_party("XNNPACK"),
                 third_party("pocketfft"),
-            ],
+            ] + select({
+                "DEFAULT": [],
+                "ovr_config//runtime:fbcode-arm64": [
+                  third_party("sleef_arm"),
+                ],
+            }),
             compiler_flags = get_aten_compiler_flags(),
             exported_preprocessor_flags = get_aten_preprocessor_flags(),
             exported_deps = [

From 4a67b077c22e8ad109468c9b426550a451361742 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 30 Nov 2022 18:32:21 -0800
Subject: [PATCH 1489/1922] [quant][be] Merge qconfig_mapping_utils.py in
 quantization and fx folders (#89979)

Summary:
att, no functionality changes

Test Plan:
python test/test_quantization.py TestQuantizeFx

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89979
Approved by: https://github.com/vkuzo
---
 test/quantization/fx/test_quantize_fx.py      |   7 +-
 torch/ao/quantization/fx/convert.py           |   4 +-
 torch/ao/quantization/fx/prepare.py           |   6 +-
 .../quantization/fx/qconfig_mapping_utils.py  |  96 +++++++++++++++-
 .../ao/quantization/qconfig_mapping_utils.py  | 103 ------------------
 5 files changed, 96 insertions(+), 120 deletions(-)
 delete mode 100644 torch/ao/quantization/qconfig_mapping_utils.py

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index cbd256a1fbbe9..7c80a58412cdc 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -99,10 +99,11 @@
     QConfigMapping,
 )
 
-from torch.ao.quantization.qconfig_mapping_utils import (
+from torch.ao.quantization.fx.qconfig_mapping_utils import (
     _get_object_type_qconfig,
     _get_module_name_qconfig,
     _get_module_name_regex_qconfig,
+    maybe_adjust_qconfig_for_module_name_object_type_order,
 )
 
 from torch.ao.quantization.fx.pattern_utils import (
@@ -131,10 +132,6 @@
     StandaloneModuleConfigEntry,
 )
 
-from torch.ao.quantization.fx.qconfig_mapping_utils import (
-    maybe_adjust_qconfig_for_module_name_object_type_order,
-)
-
 from torch.ao.quantization.fx.utils import (
     _reroute_tuple_getitem_pattern,
     NodeInfo,
diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index f677e0eedc666..e795b3bca8584 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -23,14 +23,12 @@
     qconfig_equals
 )
 from ..qconfig_mapping import QConfigMapping
-from ..qconfig_mapping_utils import (
-    _update_qconfig_for_qat,
-)
 from .qconfig_mapping_utils import (
     generate_node_name_to_qconfig,
     compare_prepare_convert_qconfig_mappings,
     update_qconfig_for_fusion,
     is_qconfig_supported_by_dtype_configs,
+    _update_qconfig_for_qat,
 )
 from torch.ao.quantization.backend_config.utils import (
     get_root_module_to_quantized_reference_module,
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 2ed88d16126bc..03ff6a764f77d 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -24,13 +24,11 @@
 from ..qconfig_mapping import (
     QConfigMapping,
 )
-from ..qconfig_mapping_utils import (
-    _get_flattened_qconfig_dict,
-    _update_qconfig_for_qat,
-)
 from .qconfig_mapping_utils import (
     generate_node_name_to_qconfig,
     update_qconfig_for_fusion,
+    _get_flattened_qconfig_dict,
+    _update_qconfig_for_qat,
 )
 
 from .quantize_handler import (
diff --git a/torch/ao/quantization/fx/qconfig_mapping_utils.py b/torch/ao/quantization/fx/qconfig_mapping_utils.py
index 6ccc8d07f64e0..9248890dbc158 100644
--- a/torch/ao/quantization/fx/qconfig_mapping_utils.py
+++ b/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -1,6 +1,7 @@
 import torch
+import re
 from collections import defaultdict, OrderedDict
-from typing import Callable, Any, Dict, Tuple, Set, List
+from typing import Callable, Any, Dict, Tuple, Set, List, Union
 from torch.ao.quantization import QConfig
 from torch.ao.quantization.qconfig import _add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals
 from torch.ao.quantization.quantize import (
@@ -21,6 +22,7 @@
 from ..utils import (
     _parent_name,
     get_qconfig_dtypes,
+    get_combined_dict
 )
 from ..qconfig_mapping import (
     _OBJECT_TYPE_DICT_KEY,
@@ -28,12 +30,10 @@
     _MODULE_NAME_REGEX_DICT_KEY,
     QConfigMapping,
 )
-from ..qconfig_mapping_utils import (
-    _get_object_type_qconfig,
-    _maybe_adjust_qconfig_for_module_type_or_name,
+from ..quantization_mappings import (
+    get_default_qat_module_mappings,
 )
 
-
 # TODO: revisit this list. Many helper methods shouldn't be public
 __all__ = [
     "check_is_valid_config_dict",
@@ -264,3 +264,89 @@ def is_qconfig_supported_by_dtype_configs(qconfig: QConfig, dtype_configs: List[
         if is_match:
             return True
     return False
+
+def _get_object_type_qconfig(
+        qconfig_mapping: QConfigMapping,
+        object_type: Union[Callable, str],
+        fallback_qconfig: QConfigAny) -> QConfigAny:
+    return qconfig_mapping.object_type_qconfigs.get(object_type, fallback_qconfig)
+
+
+def _get_module_name_regex_qconfig(qconfig_mapping, module_name, fallback_qconfig):
+    for regex_pattern, qconfig in qconfig_mapping.module_name_regex_qconfigs.items():
+        if re.match(regex_pattern, module_name):
+            # first match wins
+            return qconfig
+    return fallback_qconfig
+
+
+def _get_module_name_qconfig(qconfig_mapping, module_name, fallback_qconfig):
+    if module_name == '':
+        # module name qconfig not found
+        return fallback_qconfig
+    if module_name in qconfig_mapping.module_name_qconfigs:
+        return qconfig_mapping.module_name_qconfigs[module_name]
+    else:
+        parent, _ = _parent_name(module_name)
+        return _get_module_name_qconfig(qconfig_mapping, parent, fallback_qconfig)
+
+
+def _maybe_adjust_qconfig_for_module_type_or_name(qconfig_mapping, module_type, module_name, global_qconfig):
+    # get qconfig for module_name,
+    # fallback to module_name_regex_qconfig, module_type_qconfig,
+    # global_qconfig if necessary
+    module_type_qconfig = _get_object_type_qconfig(
+        qconfig_mapping, module_type, global_qconfig)
+    module_name_regex_qconfig = _get_module_name_regex_qconfig(
+        qconfig_mapping, module_name, module_type_qconfig)
+    module_name_qconfig = _get_module_name_qconfig(
+        qconfig_mapping, module_name, module_name_regex_qconfig)
+    return module_name_qconfig
+
+
+def _get_flattened_qconfig_dict(qconfig_mapping: QConfigMapping) -> Dict[Union[Callable, str], QConfigAny]:
+    """ flatten the global, object_type and module_name qconfig
+    to the same qconfig_dict so that it can be used by
+    propagate_qconfig_ function.
+    "module_name_regex" is ignored for now since it's not supported
+    in propagate_qconfig_, but it can be fixed later.
+
+    For example:
+    Input: {
+      "": qconfig,
+      "object_type": [
+        (torch.add, qconfig)
+      ],
+      "module_name": [
+        ("conv", qconfig)
+      ]
+    }
+
+    Output: {
+      "": qconfig,
+      torch.add: qconfig,
+      "conv": qconfig
+    }
+    """
+    flattened: Dict[Union[Callable, str], QConfigAny] = {"": qconfig_mapping.global_qconfig}
+    for obj, qconfig in qconfig_mapping.object_type_qconfigs.items():
+        flattened[obj] = qconfig
+    for obj, qconfig in qconfig_mapping.module_name_qconfigs.items():
+        flattened[obj] = qconfig
+    return flattened
+
+
+def _update_qconfig_for_qat(
+        qconfig_mapping: QConfigMapping,
+        additional_qat_module_mapping: Dict[Callable, Callable]):
+    """
+    Update the qconfig_dict to account for module swaps during QAT.
+    During QAT we perform a module swap on the nn.Module types to the corresponding nn.qat.modules types.
+    """
+    all_qat_mappings = get_combined_dict(
+        get_default_qat_module_mappings(), additional_qat_module_mapping)
+    object_type_dict = qconfig_mapping.object_type_qconfigs
+    new_object_type_dict = object_type_dict.copy()
+    for k, v in new_object_type_dict.items():
+        if k in all_qat_mappings:
+            object_type_dict[all_qat_mappings[k]] = v
diff --git a/torch/ao/quantization/qconfig_mapping_utils.py b/torch/ao/quantization/qconfig_mapping_utils.py
deleted file mode 100644
index 0109729e580c8..0000000000000
--- a/torch/ao/quantization/qconfig_mapping_utils.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import re
-from typing import Dict, Callable, Union, List
-
-from .utils import (
-    get_combined_dict,
-    _parent_name,
-)
-from .quantization_mappings import (
-    get_default_qat_module_mappings,
-)
-from .qconfig import QConfigAny
-from .qconfig_mapping import QConfigMapping
-
-
-__all__: List[str] = [
-]
-
-
-def _get_object_type_qconfig(
-        qconfig_mapping: QConfigMapping,
-        object_type: Union[Callable, str],
-        fallback_qconfig: QConfigAny) -> QConfigAny:
-    return qconfig_mapping.object_type_qconfigs.get(object_type, fallback_qconfig)
-
-
-def _get_module_name_regex_qconfig(qconfig_mapping, module_name, fallback_qconfig):
-    for regex_pattern, qconfig in qconfig_mapping.module_name_regex_qconfigs.items():
-        if re.match(regex_pattern, module_name):
-            # first match wins
-            return qconfig
-    return fallback_qconfig
-
-
-def _get_module_name_qconfig(qconfig_mapping, module_name, fallback_qconfig):
-    if module_name == '':
-        # module name qconfig not found
-        return fallback_qconfig
-    if module_name in qconfig_mapping.module_name_qconfigs:
-        return qconfig_mapping.module_name_qconfigs[module_name]
-    else:
-        parent, _ = _parent_name(module_name)
-        return _get_module_name_qconfig(qconfig_mapping, parent, fallback_qconfig)
-
-
-def _maybe_adjust_qconfig_for_module_type_or_name(qconfig_mapping, module_type, module_name, global_qconfig):
-    # get qconfig for module_name,
-    # fallback to module_name_regex_qconfig, module_type_qconfig,
-    # global_qconfig if necessary
-    module_type_qconfig = _get_object_type_qconfig(
-        qconfig_mapping, module_type, global_qconfig)
-    module_name_regex_qconfig = _get_module_name_regex_qconfig(
-        qconfig_mapping, module_name, module_type_qconfig)
-    module_name_qconfig = _get_module_name_qconfig(
-        qconfig_mapping, module_name, module_name_regex_qconfig)
-    return module_name_qconfig
-
-
-def _get_flattened_qconfig_dict(qconfig_mapping: QConfigMapping) -> Dict[Union[Callable, str], QConfigAny]:
-    """ flatten the global, object_type and module_name qconfig
-    to the same qconfig_dict so that it can be used by
-    propagate_qconfig_ function.
-    "module_name_regex" is ignored for now since it's not supported
-    in propagate_qconfig_, but it can be fixed later.
-
-    For example:
-    Input: {
-      "": qconfig,
-      "object_type": [
-        (torch.add, qconfig)
-      ],
-      "module_name": [
-        ("conv", qconfig)
-      ]
-    }
-
-    Output: {
-      "": qconfig,
-      torch.add: qconfig,
-      "conv": qconfig
-    }
-    """
-    flattened: Dict[Union[Callable, str], QConfigAny] = {"": qconfig_mapping.global_qconfig}
-    for obj, qconfig in qconfig_mapping.object_type_qconfigs.items():
-        flattened[obj] = qconfig
-    for obj, qconfig in qconfig_mapping.module_name_qconfigs.items():
-        flattened[obj] = qconfig
-    return flattened
-
-
-def _update_qconfig_for_qat(
-        qconfig_mapping: QConfigMapping,
-        additional_qat_module_mapping: Dict[Callable, Callable]):
-    """
-    Update the qconfig_dict to account for module swaps during QAT.
-    During QAT we perform a module swap on the nn.Module types to the corresponding nn.qat.modules types.
-    """
-    all_qat_mappings = get_combined_dict(
-        get_default_qat_module_mappings(), additional_qat_module_mapping)
-    object_type_dict = qconfig_mapping.object_type_qconfigs
-    new_object_type_dict = object_type_dict.copy()
-    for k, v in new_object_type_dict.items():
-        if k in all_qat_mappings:
-            object_type_dict[all_qat_mappings[k]] = v

From e9b478c988ed89fd7810fb950f99eec2cd94ad26 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Thu, 1 Dec 2022 21:38:27 +0000
Subject: [PATCH 1490/1922] [Vulkan][TCC] Add tests for quantized add, sub, mul
 and div (#89578)

Summary: Added randomized test for quantized add, sub, mul and div

Test Plan:
On Mac
```
cd ~/fbsource
buck1 run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck1 build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_quantized_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_quantized_api_test
adb shell "/data/local/tmp/vulkan_quantized_api_test"
```

Differential Revision: D41047094

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89578
Approved by: https://github.com/digantdesai
---
 .../ATen/test/vulkan_quantized_api_test.cpp   | 297 +++++++++++++++++-
 1 file changed, 291 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
index 5419122692ee1..205366d5770eb 100644
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@@ -121,14 +121,27 @@ int64_t rand_pos_int(const int max_val) {
 
 at::Tensor produce_random_tensor(
     const at::IntArrayRef tensor_shape,
-    const float a,
-    const float b,
-    const float c) {
-  return (a + b * at::rand({1}, at::device(at::kCPU).dtype(at::kFloat))) *
-         (at::rand(tensor_shape, at::device(at::kCPU).dtype(at::kFloat)) - c);
+    const float s_min = 1.0,
+    const float s_max = 100.0,
+    const float shift = 0.45) {
+  // tensor is randomly generated with values in the range
+  // [-shift * s, (1-shift) * s), where s is randomly generated in the range
+  // [s_min, s_max]
+  // with these default values, s is randomly generated in the range [1, 100]
+  // this means that the range of the tensor values could be as narrow as
+  // [-0.45, 0.55) or as wide as [-45.0, 55.0)
+  TORCH_CHECK(s_min > 0, "scalar lower bound must be positive");
+  TORCH_CHECK(s_min <= s_max, "scalar lower bound must be <= upper bound");
+  const auto scalar = s_min + (s_max - s_min) * (float)rand()/(float)RAND_MAX;
+  return scalar *
+    (at::rand(tensor_shape, at::device(at::kCPU).dtype(at::kFloat)) - shift);
 }
 
-double produce_random_scale(const double scale_min, const double scale_max) {
+double produce_random_scale(
+    const double scale_min = 0.001,
+    const double scale_max = 2.0) {
+  TORCH_CHECK(scale_min <= scale_max, "scale min must be <= scale max");
+  // scale is randomly generated in the range [scale_min, scale_max)
   return rand01() * (scale_max - scale_min) + scale_min;
 }
 
@@ -1280,6 +1293,278 @@ TEST_F(VulkanAPITest, quantized_upsample_nearest2d) {
   ASSERT_TRUE(check);
 }
 
+std::tuple<double, double, int64_t, int64_t> produce_inputs_for_binary_op(
+    const bool compute_quantization_params,
+    const bool random_quantization_params,
+    const char* op_name,
+    const at::IntArrayRef input1_shape,
+    const at::IntArrayRef input2_shape,
+    double in1_scale, double in2_scale,
+    int in1_zero_point, int in2_zero_point,
+    at::Tensor& input1_cpu, at::Tensor& input1_cpu_q,
+    at::Tensor& input1_cpu_deq,
+    at::Tensor& input1_vk, at::Tensor& input1_vk_q,
+    at::Tensor& input1_vk_deq, at::Tensor& input1_vk_deq_cpu,
+    at::Tensor& input2_cpu, at::Tensor& input2_cpu_q,
+    at::Tensor& input2_cpu_deq,
+    at::Tensor& input2_vk, at::Tensor& input2_vk_q,
+    at::Tensor& input2_vk_deq, at::Tensor& input2_vk_deq_cpu) {
+
+  int num_attempts = 5;
+    // in order to make sure we start with input tensors that are numerically
+    // the same (cpu vs vulkan), we allow multiple attempts when randomly
+    // generating the inputs. If the cpu quantized tensor and the vk quantized
+    // tensors are not the same (maybe off by 1 due to differences in rounding
+    // and precision), we try again.
+  for (int i = 0; i < num_attempts; i += 1) {
+    // produce random inputs
+    input1_cpu = produce_random_tensor(input1_shape);
+    input2_cpu = produce_random_tensor(input1_shape);
+
+    if (compute_quantization_params) {
+      // compute appropiate scale and zero point for inputs
+      const auto in1_quant_params = compute_quant_params(input1_cpu);
+      in1_scale = std::get<0>(in1_quant_params);
+      in1_zero_point = std::get<1>(in1_quant_params);
+
+      const auto in2_quant_params = compute_quant_params(input2_cpu);
+      in2_scale = std::get<0>(in2_quant_params);
+      in2_zero_point = std::get<1>(in2_quant_params);
+    } else if (random_quantization_params) {
+      // produce random scale and zero point for inputs
+      in1_scale = produce_random_scale();
+      in1_zero_point = produce_random_zero_point(c10::ScalarType::QUInt8);
+
+      in2_scale = produce_random_scale();
+      in2_zero_point = produce_random_zero_point(c10::ScalarType::QUInt8);
+    }
+
+    // we do this, to avoid dividing by zero
+    if (strcmp(op_name, "quantized::div") == 0) {
+      const auto non_zero_sign = input2_cpu.sign() - input2_cpu.sign().abs() + 1;
+        // non_zero_sign = 1 if the value is non negative, and -1 if it is negative
+      input2_cpu = input2_cpu + in2_scale * non_zero_sign;
+        // this will force abs(input2_cpu) >= in2_scale, which means that none of
+        // the quantized values of the second input will be equal to the zero point.
+
+      // we might end up dividing by 0, if we allow random scale and zero point
+      // of the divisor.
+      if (random_quantization_params) {
+        const auto in2_quant_params = compute_quant_params(input2_cpu);
+        in2_scale = std::get<0>(in2_quant_params);
+        in2_zero_point = std::get<1>(in2_quant_params);
+      }
+    }
+
+    // quantize cpu inputs
+    input1_cpu_q = at::quantize_per_tensor(
+        input1_cpu, in1_scale, in1_zero_point, c10::ScalarType::QUInt8);
+    input2_cpu_q = at::quantize_per_tensor(
+        input2_cpu, in2_scale, in2_zero_point, c10::ScalarType::QUInt8);
+
+    // dequantize quantized cpu inputs
+    input1_cpu_deq = at::dequantize(input1_cpu_q);
+    input2_cpu_deq = at::dequantize(input2_cpu_q);
+
+    // vulkan quantized inputs
+    input1_vk = input1_cpu.vulkan();
+    input1_vk_q = at::quantize_per_tensor(
+        input1_vk, in1_scale, in1_zero_point, c10::ScalarType::QUInt8);
+    input2_vk = input2_cpu.vulkan();
+    input2_vk_q = at::quantize_per_tensor(
+        input2_vk, in2_scale, in2_zero_point, c10::ScalarType::QUInt8);
+
+    // dequantize quantized vulkan inputs
+    input1_vk_deq = at::dequantize(input1_vk_q);
+    input2_vk_deq = at::dequantize(input2_vk_q);
+
+    input1_vk_deq_cpu = input1_vk_deq.cpu();
+    input2_vk_deq_cpu = input2_vk_deq.cpu();
+
+    const float input1_dif = at::abs(input1_cpu_deq - input1_vk_deq_cpu).max().item<float>();
+    const float input2_dif = at::abs(input2_cpu_deq - input2_vk_deq_cpu).max().item<float>();
+    if (input1_dif < 1e-5 && input2_dif < 1e-5 && input1_dif < in1_scale/2 && input2_dif < in2_scale/2) {
+      break;
+    }
+  }
+
+  return {in1_scale, in2_scale, in1_zero_point, in2_zero_point};
+}
+
+at::Tensor apply_cpu_quantized_binary_op(
+    const char* op_name,
+    at::Tensor input1_cpu_deq,
+    at::Tensor input2_cpu_deq) {
+  if (strcmp(op_name, "quantized::add") == 0) {
+    return at::add(input1_cpu_deq, input2_cpu_deq);
+  } else if (strcmp(op_name, "quantized::sub") == 0) {
+    return at::sub(input1_cpu_deq, input2_cpu_deq);
+  } else if (strcmp(op_name, "quantized::mul") == 0) {
+    return at::mul(input1_cpu_deq, input2_cpu_deq);
+  } else if (strcmp(op_name, "quantized::div") == 0) {
+    return at::div(input1_cpu_deq, input2_cpu_deq);
+  } else {
+    TORCH_CHECK(false, "Invalid op");
+  }
+}
+
+at::Tensor apply_vulkan_quantized_binary_op(
+    const char* op_name,
+    at::Tensor input1_vk_q,
+    at::Tensor input2_vk_q,
+    double out_scale,
+    int64_t out_zero_point) {
+  if (strcmp(op_name, "quantized::add") == 0) {
+    return at::native::vulkan::ops::quantized_add(
+      input1_vk_q, input2_vk_q, out_scale, out_zero_point);
+  } else if (strcmp(op_name, "quantized::sub") == 0) {
+    return at::native::vulkan::ops::quantized_sub(
+      input1_vk_q, input2_vk_q, out_scale, out_zero_point);
+  } else if (strcmp(op_name, "quantized::mul") == 0) {
+    return at::native::vulkan::ops::quantized_mul(
+      input1_vk_q, input2_vk_q, out_scale, out_zero_point);
+  } else if (strcmp(op_name, "quantized::div") == 0) {
+    return at::native::vulkan::ops::quantized_div(
+      input1_vk_q, input2_vk_q, out_scale, out_zero_point);
+  } else {
+    TORCH_CHECK(false, "Invalid op");
+  }
+}
+
+void test_quantized_binary_op(
+    const bool compute_quantization_params,
+    const bool random_quantization_params,
+    const char* op_name,
+    const at::IntArrayRef input1_shape,
+    const at::IntArrayRef input2_shape,
+    double in1_scale_default = 0.103,
+    double in2_scale_default = 0.171,
+    double out_scale_default = 0.139,
+    int64_t in1_zero_point_default = 11,
+    int64_t in2_zero_point_default = 9,
+    int64_t out_zero_point_default = 17) {
+
+  // produce inputs
+  at::Tensor input1_cpu, input1_cpu_q, input1_cpu_deq;
+  at::Tensor input1_vk, input1_vk_q, input1_vk_deq, input1_vk_deq_cpu;
+  at::Tensor input2_cpu, input2_cpu_q, input2_cpu_deq;
+  at::Tensor input2_vk, input2_vk_q, input2_vk_deq, input2_vk_deq_cpu;
+
+  auto input_params = produce_inputs_for_binary_op(
+    compute_quantization_params, random_quantization_params, op_name,
+    input1_shape, input2_shape,
+    in1_scale_default, in2_scale_default,
+    in1_zero_point_default, in2_zero_point_default,
+    input1_cpu, input1_cpu_q, input1_cpu_deq,
+    input1_vk, input1_vk_q, input1_vk_deq, input1_vk_deq_cpu,
+    input2_cpu, input2_cpu_q, input2_cpu_deq,
+    input2_vk, input2_vk_q, input2_vk_deq, input2_vk_deq_cpu);
+
+  double in1_scale = std::get<0>(input_params);
+  double in2_scale = std::get<1>(input_params);
+  int64_t in1_zero_point = std::get<2>(input_params);
+  int64_t in2_zero_point = std::get<3>(input_params);
+
+  double out_scale = out_scale_default;
+  int64_t out_zero_point = out_zero_point_default;
+
+  // apply op on dequantized cpu tensors
+  at::Tensor output_cpu = apply_cpu_quantized_binary_op(
+    op_name, input1_cpu_deq, input2_cpu_deq);
+
+  if (compute_quantization_params || random_quantization_params) {
+    // compute appropiate scale and zero point for output
+    const auto out_quant_params = compute_quant_params(output_cpu);
+    out_scale = std::get<0>(out_quant_params);
+    out_zero_point = std::get<1>(out_quant_params);
+  }
+
+  // quantize and dequantize cpu output
+  const auto output_cpu_q = at::quantize_per_tensor(
+      output_cpu, out_scale, out_zero_point, c10::ScalarType::QUInt8);
+  const auto output_cpu_deq = at::dequantize(output_cpu_q);
+
+  // vulkan quantized output
+  at::Tensor output_vk_q = apply_vulkan_quantized_binary_op(
+    op_name, input1_vk_q, input2_vk_q, out_scale, out_zero_point);
+
+  const auto output_vk_deq = at::dequantize(output_vk_q);
+  const auto output_vk_deq_cpu = output_vk_deq.cpu();
+
+  // check
+  const float tolerance =
+    (compute_quantization_params || random_quantization_params) ? out_scale : 0;
+  const auto check = almostEqual(output_cpu_deq, output_vk_deq_cpu, tolerance);
+
+  if (!check) {
+    const auto vk_q_error = at::abs(output_vk_deq_cpu - output_cpu_deq).max().item<float>();
+    std::cout << "Binary op " << op_name << " failed with inputs: " << std::endl;
+    std::cout << "input1: shape " << input1_shape << " scale " << in1_scale
+              << " and zero point " << in1_zero_point << std::endl;
+    std::cout << "input2: shape " << input2_shape << " scale " << in2_scale
+              << " and zero point " << in2_zero_point << std::endl;
+    std::cout << "output scale " << out_scale
+              << " and zero point " << out_zero_point << std::endl;
+    std::cout << "error: " << vk_q_error << std::endl;
+  }
+  ASSERT_TRUE(check);
+}
+
+void quantized_binary_op_test_set(
+    const char* op_name) {
+  // fixed params
+  test_quantized_binary_op(false, false, op_name, {1, 1, 1, 1}, {1, 1, 1, 1});
+  test_quantized_binary_op(false, false, op_name, {1, 1, 8, 8}, {1, 1, 8, 8});
+  test_quantized_binary_op(false, false, op_name, {1, 1, 12, 17}, {1, 1, 12, 17});
+  test_quantized_binary_op(false, false, op_name, {2, 13, 32, 27}, {2, 13, 32, 27});
+  test_quantized_binary_op(false, false, op_name, {7, 15, 6, 17}, {7, 15, 1, 17}); // broadcasting
+  test_quantized_binary_op(false, false, op_name, {7, 1, 6, 17}, {7, 5, 6, 17}); // broadcasting
+
+  // compute params
+  test_quantized_binary_op(true, false, op_name, {1, 1, 1, 1}, {1, 1, 1, 1});
+  test_quantized_binary_op(true, false, op_name, {1, 1, 8, 8}, {1, 1, 8, 8});
+  test_quantized_binary_op(true, false, op_name, {1, 1, 12, 17}, {1, 1, 12, 17});
+  test_quantized_binary_op(true, false, op_name, {2, 13, 32, 27}, {2, 13, 32, 27});
+  test_quantized_binary_op(true, false, op_name, {7, 15, 6, 17}, {7, 15, 1, 17}); // broadcasting
+  test_quantized_binary_op(true, false, op_name, {7, 1, 6, 17}, {7, 5, 6, 17}); // broadcasting
+
+  // random params
+  test_quantized_binary_op(false, true, op_name, {1, 1, 1, 1}, {1, 1, 1, 1});
+  test_quantized_binary_op(false, true, op_name, {1, 1, 8, 8}, {1, 1, 8, 8});
+  test_quantized_binary_op(false, true, op_name, {1, 1, 12, 17}, {1, 1, 12, 17});
+  test_quantized_binary_op(false, true, op_name, {2, 13, 32, 27}, {2, 13, 32, 27});
+  test_quantized_binary_op(false, true, op_name, {7, 15, 6, 17}, {7, 15, 1, 17}); // broadcasting
+  test_quantized_binary_op(false, true, op_name, {7, 1, 6, 17}, {7, 5, 6, 17}); // broadcasting
+
+  // random shape and params
+  for (int i = 0; i < 10; i += 1) {
+    const at::IntArrayRef tensor_shape =
+      {
+        rand_pos_int(30),
+        rand_pos_int(30),
+        rand_pos_int(100),
+        rand_pos_int(100)
+      };
+    test_quantized_binary_op(false, true, op_name, tensor_shape, tensor_shape);
+  }
+}
+
+TEST_F(VulkanAPITest, quantized_add_tests) {
+  quantized_binary_op_test_set("quantized::add");
+}
+
+TEST_F(VulkanAPITest, quantized_sub_tests) {
+  quantized_binary_op_test_set("quantized::sub");
+}
+
+TEST_F(VulkanAPITest, quantized_mul_tests) {
+  quantized_binary_op_test_set("quantized::mul");
+}
+
+TEST_F(VulkanAPITest, quantized_div_tests) {
+  quantized_binary_op_test_set("quantized::div");
+}
+
 } // namespace
 
 #endif /* USE_VULKAN_API */

From 12af3d0a810c2040edcfb84223c5c72e3fe9efce Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 30 Nov 2022 09:52:05 -0800
Subject: [PATCH 1491/1922] [quant] Explictly set default quantized engine
 instead of relying on the order of supported_qengines (#89804)

Summary:
Fixes: https://github.com/pytorch/pytorch/issues/86404

Test Plan:
ossci + sandcastle
Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D41635738](https://our.internmc.facebook.com/intern/diff/D41635738)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89804
Approved by: https://github.com/andrewor14
---
 aten/src/ATen/Context.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
index b391cd4aab904..49da7a7597d9b 100644
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@@ -295,8 +295,24 @@ bool Context::hasLAPACK() {
 }
 
 at::QEngine Context::qEngine() const {
-  // If wasn't explicitly set - take the last one available
-  return quantized_engine.value_or(supportedQEngines().back());
+  static auto _quantized_engine = []() {
+    at::QEngine qengine = at::kNoQEngine;
+#ifdef USE_PYTORCH_QNNPACK
+    qengine = at::kQNNPACK;
+#endif // USE_PYTORCH_QNNPACK
+
+#if AT_MKLDNN_ENABLED()
+    qengine = at::kONEDNN;
+#endif
+
+#ifdef USE_FBGEMM
+    if (fbgemm::fbgemmSupportedCPU()) {
+      qengine = at::kFBGEMM;
+    }
+#endif
+    return qengine;
+  }();
+  return quantized_engine.value_or(_quantized_engine);
 }
 
 void Context::setQEngine(at::QEngine e) {

From 5b368697ebc07cbf237f819d3cdf8b46c6aadaa5 Mon Sep 17 00:00:00 2001
From: Soumith Chintala <soumith@gmail.com>
Date: Thu, 1 Dec 2022 22:01:41 +0000
Subject: [PATCH 1492/1922] triton supports devices < 7.0, not 6.0 (#90020)

triton is still buggy with Pascal devices, so make the error checker reflect that.

Also, this < 6.0 never worked, as the `has_triton` definition in utils.py was checking >= 7.0.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90020
Approved by: https://github.com/yanboliang, https://github.com/anijain2305
---
 torch/_inductor/scheduler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index 8609617897bf5..0bc74ca6410bd 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -1080,9 +1080,9 @@ def create_backend(self, device: torch.device):
         else:
             if not has_triton():
                 device_props = torch.cuda.get_device_properties(device)
-                if device_props.major < 6:
+                if device_props.major < 7:
                     raise RuntimeError(
-                        f"Found {device_props.name} which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 6.0, but your device is of CUDA capability {device_props.major}.{device_props.minor}"  # noqa: B950
+                        f"Found {device_props.name} which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability {device_props.major}.{device_props.minor}"  # noqa: B950
                     )
                 else:
                     raise RuntimeError(

From 58f004da08338770702c69ee170a3741c5df0399 Mon Sep 17 00:00:00 2001
From: jiaruifang <fangjiarui123@gmail.com>
Date: Thu, 1 Dec 2022 22:16:35 +0000
Subject: [PATCH 1493/1922] [doc] update dtensor readme (#89991)

I fixed some import erros in readme of dtensor.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89991
Approved by: https://github.com/wanchaol
---
 torch/distributed/_tensor/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/distributed/_tensor/README.md b/torch/distributed/_tensor/README.md
index 4f10464e69264..ba7ef77cbb5c8 100644
--- a/torch/distributed/_tensor/README.md
+++ b/torch/distributed/_tensor/README.md
@@ -7,7 +7,7 @@ We propose distributed tensor primitives to allow easier distributed computation
 
 ```python
 import torch
-from torch.distributed import DeviceMesh, Shard, distribute_tensor
+from torch.distributed._tensor import DeviceMesh, Shard, distribute_tensor
 
 # Create a mesh topology with the available devices.
 mesh = DeviceMesh("cuda", list(range(world_size)))
@@ -47,7 +47,7 @@ Here are some basic DistributedTensor API examples that showcase:
 ```python
 import torch
 import torch.distributed as distributed
-from torch.distributed import DTensor, DeviceMesh, Shard, Replicate, distribute_module
+from torch.distributed._tensor import DTensor, DeviceMesh, Shard, Replicate, distribute_module
 
 # construct a device mesh with available devices (multi-host or single host)
 device_mesh = DeviceMesh(device_type="cuda", [0, 1, 2, 3])

From e5dd864abfd4e349e4ab77a79036d293bc84e009 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 1 Dec 2022 22:39:46 +0000
Subject: [PATCH 1494/1922] Revert "[quant] Explictly set default quantized
 engine instead of relying on the order of supported_qengines (#89804)"

This reverts commit 607ff6f4c10914a2a46bab90577cd083a6b3d46d.

Reverted https://github.com/pytorch/pytorch/pull/89804 on behalf of https://github.com/clee2000 due to breaking tests https://hud.pytorch.org/pytorch/pytorch/commit/607ff6f4c10914a2a46bab90577cd083a6b3d46d https://github.com/pytorch/pytorch/actions/runs/3596841274/jobs/6058297637 trunk label didnt kick off workflows fast enough
---
 aten/src/ATen/Context.cpp | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
index 49da7a7597d9b..b391cd4aab904 100644
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@@ -295,24 +295,8 @@ bool Context::hasLAPACK() {
 }
 
 at::QEngine Context::qEngine() const {
-  static auto _quantized_engine = []() {
-    at::QEngine qengine = at::kNoQEngine;
-#ifdef USE_PYTORCH_QNNPACK
-    qengine = at::kQNNPACK;
-#endif // USE_PYTORCH_QNNPACK
-
-#if AT_MKLDNN_ENABLED()
-    qengine = at::kONEDNN;
-#endif
-
-#ifdef USE_FBGEMM
-    if (fbgemm::fbgemmSupportedCPU()) {
-      qengine = at::kFBGEMM;
-    }
-#endif
-    return qengine;
-  }();
-  return quantized_engine.value_or(_quantized_engine);
+  // If wasn't explicitly set - take the last one available
+  return quantized_engine.value_or(supportedQEngines().back());
 }
 
 void Context::setQEngine(at::QEngine e) {

From 5c34416d9b9f84ff4344f44e83909027b780ff4a Mon Sep 17 00:00:00 2001
From: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>
Date: Thu, 1 Dec 2022 23:01:45 +0000
Subject: [PATCH 1495/1922] [UCC] Properly finalize unsuccessful collective
 posts (#89306)

This PR add a `ucc_collective_finalize` call if `ucc_collective_post` and `ucc_collective_triggered_post` were not successful.
According to the [UCC documentation](https://openucx.github.io/ucc/api/v1.1/html/group___u_c_c___c_o_l_l_e_c_t_i_v_e_s.html):
```
On error, request handle becomes invalid, user is responsible to call ucc_collective_finalize to free allocated resources.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89306
Approved by: https://github.com/kwen2501
---
 .../csrc/distributed/c10d/ProcessGroupUCC.cpp |  6 +-
 torch/csrc/distributed/c10d/UCCUtils.hpp      | 61 +++++++++++++------
 2 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
index ad135062a7024..b03ca490cea9d 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupUCC.cpp
@@ -466,7 +466,8 @@ void Comm::enqueue_collective(
   ucc_coll_req_h request;
   TORCH_UCC_CHECK(
       ucc_collective_init(&coll, &request, team), "failed to init collective");
-  TORCH_UCC_CHECK(ucc_collective_post(request), "failed to post collective");
+  TORCH_UCC_CHECK_REQUEST(
+      request, ucc_collective_post(request), "failed to post collective");
 
   auto entry =
       std::make_shared<ProcessGroupUCC::ProgressEntry>(&ucc_comm, request);
@@ -495,7 +496,8 @@ void Comm::enqueue_cuda_collective(
   comp_ev.ev_context = nullptr;
   comp_ev.ev_context_size = 0;
   comp_ev.req = request;
-  TORCH_UCC_CHECK(
+  TORCH_UCC_CHECK_REQUEST(
+      request,
       ucc_collective_triggered_post(ee, &comp_ev),
       "failed to post triggered collective");
   ucc_status_t st = ucc_ee_get_event(ee, &post_ev);
diff --git a/torch/csrc/distributed/c10d/UCCUtils.hpp b/torch/csrc/distributed/c10d/UCCUtils.hpp
index 50510a6ea9a03..3482a1d34ee52 100644
--- a/torch/csrc/distributed/c10d/UCCUtils.hpp
+++ b/torch/csrc/distributed/c10d/UCCUtils.hpp
@@ -8,27 +8,48 @@
 
 namespace c10d {
 
+// Macro to generate the error message on a non-successful UCC return value.
+#define TORCH_UCC_GET_ERROR_MSG(_err, _error_msg, _result) \
+  do {                                                     \
+      _err = c10::str(                                     \
+          "[",                                             \
+          std::string(__FILE__),                           \
+          ":",                                             \
+          std::to_string(__LINE__),                        \
+          "] ",                                            \
+          logger->getLogPrefix(),                          \
+          _error_msg,                                      \
+          ", error code ",                                 \
+          _result,                                         \
+          ": ",                                            \
+          ucc_status_string(_result),                      \
+          ", system error code ",                          \
+          errno);                                          \
+  } while (0)
+
 // Macro to throw on a non-successful UCC return value.
-#define TORCH_UCC_CHECK(_cmd, _error_msg) \
-  do {                                    \
-    ucc_status_t result = _cmd;           \
-    if (result != UCC_OK) {               \
-      std::string err = c10::str(         \
-          "[",                            \
-          std::string(__FILE__),          \
-          ":",                            \
-          std::to_string(__LINE__),       \
-          "] ",                           \
-          logger->getLogPrefix(),         \
-          _error_msg,                     \
-          ", error code ",                \
-          result,                         \
-          ": ",                           \
-          ucc_status_string(result),      \
-          ", system error code ",         \
-          errno);                         \
-      TORCH_CHECK(false, err);            \
-    }                                     \
+#define TORCH_UCC_CHECK(_cmd, _error_msg)               \
+  do {                                                  \
+    ucc_status_t result = _cmd;                         \
+    if (result != UCC_OK) {                             \
+      std::string err;                                  \
+      TORCH_UCC_GET_ERROR_MSG(err, _error_msg, result); \
+      TORCH_CHECK(false, err);                          \
+    }                                                   \
+  } while (0)
+
+// Macro and throw on a non-successful UCC return value and free its request.
+#define TORCH_UCC_CHECK_REQUEST(_request, _cmd, _error_msg) \
+  do {                                                      \
+    ucc_status_t result = _cmd;                             \
+    if (result != UCC_OK) {                                 \
+      std::string err;                                      \
+      TORCH_UCC_GET_ERROR_MSG(err, _error_msg, result);     \
+      if (_request != nullptr) {                            \
+        ucc_collective_finalize(_request);                  \
+      }                                                     \
+      TORCH_CHECK(false, err);                              \
+    }                                                       \
   } while (0)
 
 // Macros to print logs with unified format

From 37fc66d87227b2d618568d97ae66b054df1d7e2e Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Thu, 1 Dec 2022 19:17:43 +0000
Subject: [PATCH 1496/1922] Add non-reentrant checkpoint to composable APIs
 (#90015)

Differential Revision: [D41661027](https://our.internmc.facebook.com/intern/diff/D41661027)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90015
Approved by: https://github.com/zhaojuanmao
---
 .../_composable/test_checkpoint.py            |  62 ++++++++--
 .../_composable/checkpoint_activation.py      | 106 +++++++++++++++++-
 2 files changed, 154 insertions(+), 14 deletions(-)

diff --git a/test/distributed/_composable/test_checkpoint.py b/test/distributed/_composable/test_checkpoint.py
index fbffc90f19c51..e2907bcb9fcb7 100644
--- a/test/distributed/_composable/test_checkpoint.py
+++ b/test/distributed/_composable/test_checkpoint.py
@@ -3,6 +3,8 @@
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_utils import (
     TestCase,
+    instantiate_parametrized_tests,
+    parametrize,
     run_tests,
 )
 
@@ -12,9 +14,35 @@
 
 import unittest
 from collections import deque
+from contextlib import ContextDecorator
 from copy import deepcopy
 
 
+class MemoryDelta(ContextDecorator):
+    def __init__(self, device: torch.device):
+        self.device: torch.device = device
+        self.active_memory_enter: int = 0
+        self.active_memory_exit: int = 0
+
+    def __enter__(self):
+        self.active_memory_enter = (
+            torch.cuda.memory_stats()["active_bytes.all.current"]
+            if self.device.type == "cuda"
+            else 0
+        )
+        return self
+
+    def __exit__(self, *exc):
+        self.active_memory_exit = (
+            torch.cuda.memory_stats()["active_bytes.all.current"]
+            if self.device.type == "cuda"
+            else 0
+        )
+
+    def delta(self) -> int:
+        return self.active_memory_exit - self.active_memory_enter
+
+
 class ToyModel(nn.Module):
     def __init__(self):
         super().__init__()
@@ -42,7 +70,12 @@ def _get_graph_size(self, out: torch.Tensor) -> int:
 
         return num_functions
 
-    def _test_tensor_only(self, net: nn.Module, x: torch.Tensor) -> None:
+    def _test_tensor_only(
+        self,
+        net: nn.Module,
+        x: torch.Tensor,
+        use_reentrant: bool,
+    ) -> None:
         x1 = x.clone()
         x2 = x.clone()
         x1.requires_grad = True
@@ -52,31 +85,42 @@ def _test_tensor_only(self, net: nn.Module, x: torch.Tensor) -> None:
         net2 = deepcopy(net)
 
         # no checkpoint
-        loss1 = net1(x1).sum()
+        with MemoryDelta(x.device) as mem1:
+            loss1 = net1(x1).sum()
         graph_size1 = self._get_graph_size(loss1)
         loss1.backward()
 
         # with checkpoint
-        checkpoint(net2.seq)
-        loss2 = net2(x2).sum()
+        checkpoint(net2.seq, use_reentrant=use_reentrant)
+        with MemoryDelta(x.device) as mem2:
+            loss2 = net2(x2).sum()
         graph_size2 = self._get_graph_size(loss2)
         loss2.backward()
 
-        self.assertTrue(graph_size2 < graph_size1)
+        if use_reentrant:
+            self.assertTrue(graph_size2 < graph_size1)
+
+        if x.is_cuda:
+            self.assertTrue(mem2.delta() < mem1.delta())
 
         for p1, p2 in zip(net1.parameters(), net2.parameters()):
             self.assertEqual(p1.grad, p2.grad)
 
-    def test_tensor_only_cpu(self):
+    @parametrize("use_reentrant", [True, False])
+    def test_tensor_only_cpu(self, use_reentrant: bool):
         x = torch.randn(20, 100)
         net = ToyModel()
-        self._test_tensor_only(net, x)
+        self._test_tensor_only(net, x, use_reentrant)
 
     @unittest.skipIf(not TEST_CUDA, "no cuda")
-    def test_tensor_only_gpu(self):
+    @parametrize("use_reentrant", [True, False])
+    def test_tensor_only_gpu(self, use_reentrant: bool):
         x = torch.randn(20, 100, device="cuda:0")
         net = ToyModel().to("cuda:0")
-        self._test_tensor_only(net, x)
+        self._test_tensor_only(net, x, use_reentrant)
+
+
+instantiate_parametrized_tests(TestCheckpoint)
 
 
 if __name__ == "__main__":
diff --git a/torch/distributed/_composable/checkpoint_activation.py b/torch/distributed/_composable/checkpoint_activation.py
index 4d9a2ea7fddb3..dd73cde9f73ff 100644
--- a/torch/distributed/_composable/checkpoint_activation.py
+++ b/torch/distributed/_composable/checkpoint_activation.py
@@ -3,7 +3,9 @@
 from torch.utils.checkpoint import detach_variable
 
 from contextlib import contextmanager
+from functools import partial
 from typing import Any, List, Optional, Tuple
+from weakref import ReferenceType, WeakKeyDictionary, ref
 
 from .contract import contract
 
@@ -100,8 +102,71 @@ def backward(ctx, output_grads: Tuple[Optional[torch.Tensor]]) -> Any:  # type:
         return (None, None) + grads
 
 
+class _Holder:
+    pass
+
+
+def _pack(
+    x: torch.Tensor,
+    *,
+    weak_holder_list: List[ReferenceType],
+) -> _Holder:
+    res = _Holder()
+    weak_holder_list.append(ref(res))
+    return res
+
+
+def _unpack(
+    holder: _Holder,
+    *,
+    storage: WeakKeyDictionary,
+    weak_holder_list: List[ReferenceType],
+    module: nn.Module,
+    inputs: Tuple[Any],
+) -> torch.Tensor:
+    holder_index = 0
+    if len(storage) == 0:
+
+        def inner_pack(inner: torch.Tensor):
+            nonlocal holder_index
+            if weak_holder_list[holder_index]() is None:
+                # If the holder went out of scope, the SavedVariable is dead
+                # and so the value will never be read from the storage. Skip
+                # filling it.
+                pass
+            else:
+                # Use detach here to ensure we don't keep the temporary
+                # autograd graph created during the second forward
+                storage[weak_holder_list[holder_index]()] = inner.detach()
+            holder_index += 1
+            return
+
+        def inner_unpack(holder: _Holder):
+            raise RuntimeError(
+                "You are calling backwards on a tensor that is never exposed. "
+                "Please open an issue."
+            )
+
+        with _no_hook(
+            module
+        ), torch.enable_grad(), torch.autograd.graph.saved_tensors_hooks(
+            inner_pack, inner_unpack
+        ):
+            _unused = module(*inputs)
+
+    if holder not in storage:
+        raise RuntimeError(
+            "Attempt to retrieve a tensor saved by autograd multiple times "
+            "without checkpoint recomputation being triggered in between, this "
+            "is not currently supported. Please open an issue with details on "
+            "your use case so that we can prioritize adding this."
+        )
+
+    return storage[holder]
+
+
 @contract
-def checkpoint(module: nn.Module) -> nn.Module:
+def checkpoint(module: nn.Module, *, use_reentrant: bool = True) -> nn.Module:
     r"""
     This is a composable activation checkpointing API. Unlike functional
     activation checkpointing APIs, this one does not require changing model
@@ -114,6 +179,8 @@ def checkpoint(module: nn.Module) -> nn.Module:
     Args:
         module (nn.Module): the target model or sub-module to apply activation
             checkpointing.
+        use_reentrant (bool): Apply activation checkpointing using reentrant
+            autograd.
 
     Example::
         >>> import torch.nn as nn
@@ -136,20 +203,49 @@ def checkpoint(module: nn.Module) -> nn.Module:
     def forward_pre_hook(module: nn.Module, inputs: Tuple[Any]) -> None:
         if checkpoint.state(module).enable_hook:
             checkpoint.state(module).orig_grad_enabled = torch.is_grad_enabled()
-            torch.set_grad_enabled(False)
+            if checkpoint.state(module).use_reentrant:
+                torch.set_grad_enabled(False)
+            else:
+                # The Holder object for each of the saved object is saved
+                # directly on the SavedVariable and is cleared when reset_data()
+                # is called on it. We MUST make sure that this is the only
+                # object having an owning reference to ensure that the Tensor
+                # stored in storage is deleted as soon as the corresponding
+                # SavedVariable data is cleared.
+                storage: WeakKeyDictionary = WeakKeyDictionary()
+                weak_holder_list: List[ReferenceType] = []
+                saved_tensor_hooks = torch.autograd.graph.saved_tensors_hooks(
+                    partial(_pack, weak_holder_list=weak_holder_list),
+                    partial(
+                        _unpack,
+                        storage=storage,
+                        weak_holder_list=weak_holder_list,
+                        module=module,
+                        inputs=inputs,
+                    ),
+                )
+                saved_tensor_hooks.__enter__()
+                checkpoint.state(module).saved_tensor_hooks = saved_tensor_hooks
 
     def forward_hook(module: nn.Module, inputs: Tuple[Any], output: Any) -> Any:
         if checkpoint.state(module).enable_hook:
             torch.set_grad_enabled(checkpoint.state(module).orig_grad_enabled)
-            return _ModuleHookCheckpointFunction.apply(module, output, *inputs)
-        else:
-            return output
+            if checkpoint.state(module).use_reentrant:
+                return _ModuleHookCheckpointFunction.apply(
+                    module, output, *inputs
+                )
+            else:
+                checkpoint.state(module).saved_tensor_hooks.__exit__()
+                checkpoint.state(module).saved_tensor_hooks = None
+
+        return output
 
     # This hook does the following things:
     # 1. detach outputs from the autograd graph to discard activations
     # 2. insert an autograd.Function after the forward pass to recompute
     #    activations during the backward pass.
     checkpoint.state(module).enable_hook = True
+    checkpoint.state(module).use_reentrant = use_reentrant
     module.register_forward_pre_hook(forward_pre_hook)
     # Use prepend to make sure we restore the original grad enabled state right
     # after the module forward invocation.

From 15f84d7d31bb5b825078520cfbb36afba03b8f2f Mon Sep 17 00:00:00 2001
From: Salil Desai <salilsdesai@meta.com>
Date: Thu, 1 Dec 2022 23:34:36 +0000
Subject: [PATCH 1497/1922] [QNNPACK] Fix Memory Leak in QNNPACK QSoftmax Op
 (#89544)

Summary:
The deleter of the operator's unique_ptr doesn't get called unless the unique_ptr is created after the op has been created

This fixes the problem reported in
https://fb.workplace.com/groups/pytorch.edge.users/posts/1210708329799458/

Test Plan:
# Testing memory leak fix

**With test code added in D41487340:**
```
cd ~/fbsource/xplat
buck run caffe2/aten/src/ATen/native/quantized/cpu/qsoftmax_test:qsoftmax_test
```

Before this diff:

```
==2060866==ERROR: LeakSanitizer: detected memory leaks

Direct leak of 608 byte(s) in 1 object(s) allocated from:
    #0 0x41bcd27 in calloc (/data/users/salilsdesai/fbsource/buck-out/gen/aab7ed39/xplat/caffe2/aten/src/ATen/native/quantized/cpu/qsoftmax_test/qsoftmax_test+0x41bcd27)
    #1 0x405b692 in pytorch_qnnp_create_softargmax_nc_q8 xplat/caffe2/aten/src/ATen/native/quantized/cpu/qnnpack/src/softargmax.c:77

Indirect leak of 1024 byte(s) in 1 object(s) allocated from:
    #0 0x41bcb7f in malloc (/data/users/salilsdesai/fbsource/buck-out/gen/aab7ed39/xplat/caffe2/aten/src/ATen/native/quantized/cpu/qsoftmax_test/qsoftmax_test+0x41bcb7f)
    #1 0x405b6a8 in pytorch_qnnp_create_softargmax_nc_q8 xplat/caffe2/aten/src/ATen/native/quantized/cpu/qnnpack/src/softargmax.c:85

SUMMARY- AddressSanitizer: 1632 byte(s) leaked in 2 allocation(s).
```

After this diff:
- No errors
___

# Testing op correctness

```
cd ~/fbsource/fbcode
buck test caffe2/test/quantization:quantization -- test_qsoftmax
```
Passes
- https://www.internalfb.com/intern/testinfra/testconsole/testrun/2814749908834332/

Differential Revision: D41487341

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89544
Approved by: https://github.com/mcr229
---
 aten/src/ATen/native/quantized/cpu/qsoftmax.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp
index f29f548fc758c..921e1cffeb5b2 100644
--- a/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qsoftmax.cpp
@@ -81,8 +81,6 @@ Tensor qsoftmax_qnnpack(const Tensor& qx, const int64_t dim) {
 
   initQNNPACK();
   pytorch_qnnp_operator_t softargmax = nullptr;
-  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter> softmax_op(
-      softargmax);
 
   pytorch_qnnp_status status = pytorch_qnnp_create_softargmax_nc_q8(
       channels,
@@ -96,6 +94,9 @@ Tensor qsoftmax_qnnpack(const Tensor& qx, const int64_t dim) {
       "failed to create QNNPACK Softmax operator");
   TORCH_CHECK_NOTNULL(softargmax);
 
+  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter> softmax_op(
+    softargmax);
+
   status = pytorch_qnnp_setup_softargmax_nc_q8(
       softargmax, batch_size, input, input_stride, output, output_stride);
   TORCH_CHECK(

From 7fad65a5510c383ef40b98b5b59933c272dfb167 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Fri, 2 Dec 2022 01:05:01 +0000
Subject: [PATCH 1498/1922] Fix binary testing if torchtrition is mandatory
 (#90017)

Prep-change for a builder, where torchtrition is installed from custom nightly downloads repo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90017
Approved by: https://github.com/seemethere
---
 .circleci/scripts/binary_linux_test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh
index 6e34b3e1e5f41..854ad883143b8 100755
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@@ -98,7 +98,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
     conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
   )
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
-  pip install "\$pkg"
+  pip install "\$pkg" --extra-index-url "https://download.pytorch.org/whl/nightly/${DESIRED_CUDA}"
   retry pip install -q future numpy protobuf typing-extensions six
 fi
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then

From 24cea6a3c27fa70acc4ed857d75af19f931d12e3 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Fri, 2 Dec 2022 01:15:11 +0000
Subject: [PATCH 1499/1922] Disable dynamo on optimizer lazy initialization
 (#89902)

Helps with https://github.com/pytorch/torchdynamo/issues/1803

Separate out the group initialization and disable dynamo on it

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89902
Approved by: https://github.com/soumith, https://github.com/albanD
---
 test/dynamo/test_optimizers.py |  47 +++++++++------
 torch/_dynamo/eval_frame.py    |   4 ++
 torch/optim/adadelta.py        |  51 +++++++++--------
 torch/optim/adagrad.py         |  25 ++++----
 torch/optim/adam.py            | 102 ++++++++++++++++++++-------------
 torch/optim/adamax.py          |  49 ++++++++--------
 torch/optim/adamw.py           |  80 +++++++++++++-------------
 torch/optim/asgd.py            |  45 ++++++++-------
 torch/optim/nadam.py           |  45 ++++++++-------
 torch/optim/radam.py           |  49 ++++++++--------
 torch/optim/rmsprop.py         |  75 ++++++++++++------------
 torch/optim/rprop.py           |  71 ++++++++++++-----------
 12 files changed, 355 insertions(+), 288 deletions(-)

diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py
index a4607a8d3db7e..0036b39c622d2 100644
--- a/test/dynamo/test_optimizers.py
+++ b/test/dynamo/test_optimizers.py
@@ -29,14 +29,11 @@
 optim_filenames |= {torch.optim._functional.__file__}
 
 
-def make_test(optim_cls, exp_frame_cnt=1, closure=None, **kwargs):
+def make_test(optim_cls, exp_graph_count=1, closure=None, **kwargs):
     opt = optim_cls(model.parameters(), **kwargs)
 
     def test_fn(self):
         nonlocal opt
-
-        counter = torch._dynamo.testing.CompileCounter()
-
         if closure is not None:
 
             def fn():
@@ -45,10 +42,9 @@ def fn():
         else:
             fn = opt.step
 
-        opt_fn = torch._dynamo.optimize(counter)(fn)
-        opt_fn()
+        _, _, graphs, _, _, _ = torch._dynamo.explain(fn)
 
-        self.assertEqual(counter.frame_count, exp_frame_cnt)
+        self.assertEqual(exp_graph_count, len(graphs))
 
     return test_fn
 
@@ -87,28 +83,43 @@ def setUpClass(cls):
     #    torch.optim.LBFGS, exp_frame_cnt=3, closure=lambda: model(input).sum()
     # )
 
+    # These optimizers are disabled until we remove item() calls
+    test_adam = make_test(torch.optim.Adam, exp_graph_count=0)
+    test_adamax = make_test(torch.optim.Adamax, exp_graph_count=0)
+    test_adamw = make_test(torch.optim.AdamW, exp_graph_count=0)
+
     # RAdam and Adagrad have data-dependent control which breaks the graph;
     # furthermore, the break is inside a for loop, so we bail on the frame
     # entirely.  This is basically an xfail; if the frame count goes up
     # you done good
-    test_radam = make_test(torch.optim.RAdam, exp_frame_cnt=0)
-    test_adagrad = make_test(torch.optim.Adagrad, exp_frame_cnt=0)
+    test_nadam = make_test(torch.optim.NAdam, exp_graph_count=0)
+    test_radam = make_test(torch.optim.RAdam, exp_graph_count=0)
+    test_adagrad = make_test(torch.optim.Adagrad, exp_graph_count=0)
 
     # ASGD has a small optimization that avoids averaging
     # This will fully capture the graph once that optimization is removed
-    # NB: in python versions < 3.8, we don't capture graphs when breaks
-    # occur in a loop
-
-    # Fails without fake tensor:
-    # TypeError: clamp() received an invalid combination of arguments - got (float, min=int)
-    # test_asgd = make_test(
-    #     torch.optim.ASGD, exp_frame_cnt=(0 if sys.version_info < (3, 8) else 6)
-    # )
+    # test_asgd = make_test(torch.optim.ASGD, exp_graph_count=0)
 
 
 # exclude SparseAdam because other areas of the stack don't support it yet
 # the others are handled specially above
-exclude = set(["SGD", "Optimizer", "SparseAdam", "LBFGS", "RAdam", "Adagrad", "ASGD"])
+exclude = set(
+    [
+        "SGD",  # Handled above
+        "ASGD",  # Disabled pending item call removal + optimization removal
+        "Optimizer",
+        "SparseAdam",  # Unsupported
+        "LBFGS",  # Unsupported
+        "Adam",  # Disabled pending item call removal
+        "Adamax",  # Disabled pending item call removal
+        "AdamW",  # Disabled pending item call removal
+        "RAdam",  # Disabled pending item call removal
+        "NAdam",  # Disabled pending item call removal
+        "Adagrad",  # Disabled pending item call removal
+        "ASGD",
+    ]
+)
+
 optimizers = [
     opt
     for opt in torch.optim.__dict__.values()
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 75e65bc270454..556a3b4912ebf 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -754,6 +754,10 @@ def patch():
                 opt._cuda_graph_capture_health_check
             )
             opt.zero_grad = disable(opt.zero_grad)
+
+            if hasattr(opt, "_init_group"):
+                opt._init_group = disable(opt._init_group)
+
             # disable any currently set hooks
             # Note: we only want to disable the profiling hook
             # which is the *last* hook applied, we want to keep the no_grad hook
diff --git a/torch/optim/adadelta.py b/torch/optim/adadelta.py
index d46311f26626f..20567c2bac0fc 100644
--- a/torch/optim/adadelta.py
+++ b/torch/optim/adadelta.py
@@ -93,6 +93,32 @@ def __setstate__(self, state):
             group.setdefault("maximize", False)
             group.setdefault("differentiable", False)
 
+    def _init_group(self, group, params_with_grad, grads, square_avgs, acc_deltas):
+        for p in group["params"]:
+            if p.grad is None:
+                continue
+            params_with_grad.append(p)
+            if p.grad.is_sparse:
+                raise RuntimeError("Adadelta does not support sparse gradients")
+            grads.append(p.grad)
+
+            state = self.state[p]
+
+            # Lazy state initialization
+            if len(state) == 0:
+                state["step"] = 0
+                state["square_avg"] = torch.zeros_like(
+                    p, memory_format=torch.preserve_format
+                )
+                state["acc_delta"] = torch.zeros_like(
+                    p, memory_format=torch.preserve_format
+                )
+
+            square_avgs.append(state["square_avg"])
+            acc_deltas.append(state["acc_delta"])
+
+            state["step"] += 1
+
     @_use_grad_for_differentiable
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -121,30 +147,7 @@ def step(self, closure=None):
                 group["differentiable"],
             )
 
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                params_with_grad.append(p)
-                if p.grad.is_sparse:
-                    raise RuntimeError("Adadelta does not support sparse gradients")
-                grads.append(p.grad)
-
-                state = self.state[p]
-
-                # Lazy state initialization
-                if len(state) == 0:
-                    state["step"] = 0
-                    state["square_avg"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-                    state["acc_delta"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-
-                square_avgs.append(state["square_avg"])
-                acc_deltas.append(state["acc_delta"])
-
-                state["step"] += 1
+            self._init_group(group, params_with_grad, grads, square_avgs, acc_deltas)
 
             adadelta(
                 params_with_grad,
diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py
index d95dd69bb90c6..9be2c23c0a52f 100644
--- a/torch/optim/adagrad.py
+++ b/torch/optim/adagrad.py
@@ -124,6 +124,20 @@ def share_memory(self):
                 state = self.state[p]
                 state["sum"].share_memory_()
 
+    def _init_group(self, group, params_with_grad, grads, state_sums, state_steps):
+        has_sparse_grad = False
+        for p in group["params"]:
+            if p.grad is not None:
+                if p.grad.is_sparse:
+                    has_sparse_grad = True
+                params_with_grad.append(p)
+                grads.append(p.grad)
+                state = self.state[p]
+                state_sums.append(state["sum"])
+                state_steps.append(state["step"])
+
+        return has_sparse_grad
+
     @_use_grad_for_differentiable
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -144,16 +158,7 @@ def step(self, closure=None):
             state_sums = []
             state_steps = []
 
-            has_sparse_grad = False
-            for p in group["params"]:
-                if p.grad is not None:
-                    if p.grad.is_sparse:
-                        has_sparse_grad = True
-                    params_with_grad.append(p)
-                    grads.append(p.grad)
-                    state = self.state[p]
-                    state_sums.append(state["sum"])
-                    state_steps.append(state["step"])
+            has_sparse_grad = self._init_group(group, params_with_grad, grads, state_sums, state_steps)
 
             adagrad(
                 params_with_grad,
diff --git a/torch/optim/adam.py b/torch/optim/adam.py
index 0560e7506d415..da672127f766b 100644
--- a/torch/optim/adam.py
+++ b/torch/optim/adam.py
@@ -165,6 +165,60 @@ def __setstate__(self, state):
             for s in state_values:
                 s['step'] = torch.tensor(float(s['step']))
 
+    def _init_group(
+        self,
+        group,
+        grad_scaler,
+        params_with_grad,
+        grads,
+        exp_avgs,
+        exp_avg_sqs,
+        max_exp_avg_sqs,
+        state_steps
+    ):
+
+        grad_scale = None
+        found_inf = None
+        if group['fused'] and grad_scaler is not None:
+            grad_scale = grad_scaler._get_scale_async()
+            device = grad_scale.device
+            grad_scale = _MultiDeviceReplicator(grad_scale)
+            found_inf = _get_fp16AMP_params(optimizer=self, grad_scaler=grad_scaler, device=device)
+
+        for p in group['params']:
+            if p.grad is not None:
+                params_with_grad.append(p)
+                if p.grad.is_sparse:
+                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+                grads.append(p.grad)
+
+                state = self.state[p]
+                # Lazy state initialization
+                if len(state) == 0:
+                    state['step'] = (
+                        torch.zeros((1,), dtype=torch.float, device=p.device)
+                        if self.defaults['capturable'] or self.defaults['fused']
+                        else torch.tensor(0.)
+                    )
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    if group['amsgrad']:
+                        # Maintains max of all exp. moving avg. of sq. grad. values
+                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+
+                exp_avgs.append(state['exp_avg'])
+                exp_avg_sqs.append(state['exp_avg_sq'])
+
+                if group['amsgrad']:
+                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+                if group['differentiable'] and state['step'].requires_grad:
+                    raise RuntimeError('`requires_grad` is not supported for `step` in differentiable mode')
+                state_steps.append(state['step'])
+
+        return grad_scale, found_inf
+
     @_use_grad_for_differentiable
     def step(self, closure=None, *, grad_scaler=None):
         """Performs a single optimization step.
@@ -191,45 +245,15 @@ def step(self, closure=None, *, grad_scaler=None):
             state_steps = []
             beta1, beta2 = group['betas']
 
-            grad_scale = None
-            found_inf = None
-            if group['fused'] and grad_scaler is not None:
-                grad_scale = grad_scaler._get_scale_async()
-                device = grad_scale.device
-                grad_scale = _MultiDeviceReplicator(grad_scale)
-                found_inf = _get_fp16AMP_params(optimizer=self, grad_scaler=grad_scaler, device=device)
-
-            for p in group['params']:
-                if p.grad is not None:
-                    params_with_grad.append(p)
-                    if p.grad.is_sparse:
-                        raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
-                    grads.append(p.grad)
-
-                    state = self.state[p]
-                    # Lazy state initialization
-                    if len(state) == 0:
-                        state['step'] = (
-                            torch.zeros((1,), dtype=torch.float, device=p.device)
-                            if self.defaults['capturable'] or self.defaults['fused']
-                            else torch.tensor(0.)
-                        )
-                        # Exponential moving average of gradient values
-                        state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                        # Exponential moving average of squared gradient values
-                        state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                        if group['amsgrad']:
-                            # Maintains max of all exp. moving avg. of sq. grad. values
-                            state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-                    exp_avgs.append(state['exp_avg'])
-                    exp_avg_sqs.append(state['exp_avg_sq'])
-
-                    if group['amsgrad']:
-                        max_exp_avg_sqs.append(state['max_exp_avg_sq'])
-                    if group['differentiable'] and state['step'].requires_grad:
-                        raise RuntimeError('`requires_grad` is not supported for `step` in differentiable mode')
-                    state_steps.append(state['step'])
+            grad_scale, found_inf = self._init_group(
+                group,
+                grad_scaler,
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps)
 
             adam(params_with_grad,
                  grads,
diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py
index a3d0cdec86390..11d131fe2df4c 100644
--- a/torch/optim/adamax.py
+++ b/torch/optim/adamax.py
@@ -99,6 +99,31 @@ def __setstate__(self, state):
             for s in state_values:
                 s["step"] = torch.tensor(float(s["step"]))
 
+    def _init_group(self, group, params_with_grad, grads, exp_avgs, exp_infs, state_steps):
+        for p in group["params"]:
+            if p.grad is None:
+                continue
+            params_with_grad.append(p)
+            if p.grad.is_sparse:
+                raise RuntimeError("Adamax does not support sparse gradients")
+            grads.append(p.grad)
+
+            state = self.state[p]
+
+            # State initialization
+            if len(state) == 0:
+                state["step"] = torch.tensor(0.0)
+                state["exp_avg"] = torch.zeros_like(
+                    p, memory_format=torch.preserve_format
+                )
+                state["exp_inf"] = torch.zeros_like(
+                    p, memory_format=torch.preserve_format
+                )
+
+            exp_avgs.append(state["exp_avg"])
+            exp_infs.append(state["exp_inf"])
+            state_steps.append(state["step"])
+
     @_use_grad_for_differentiable
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -127,29 +152,7 @@ def step(self, closure=None):
             maximize = group["maximize"]
             differentiable = group["differentiable"]
 
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                params_with_grad.append(p)
-                if p.grad.is_sparse:
-                    raise RuntimeError("Adamax does not support sparse gradients")
-                grads.append(p.grad)
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state["step"] = torch.tensor(0.0)
-                    state["exp_avg"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-                    state["exp_inf"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-
-                exp_avgs.append(state["exp_avg"])
-                exp_infs.append(state["exp_inf"])
-                state_steps.append(state["step"])
+            self._init_group(group, params_with_grad, grads, exp_avgs, exp_infs, state_steps)
 
             adamax(
                 params_with_grad,
diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py
index 5f6fca66ab25c..34b277d942a88 100644
--- a/torch/optim/adamw.py
+++ b/torch/optim/adamw.py
@@ -126,6 +126,46 @@ def __setstate__(self, state):
             for s in state_values:
                 s["step"] = torch.tensor(float(s["step"]))
 
+    def _init_group(self, group, params_with_grad, grads, amsgrad, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps):
+        for p in group["params"]:
+            if p.grad is None:
+                continue
+            params_with_grad.append(p)
+            if p.grad.is_sparse:
+                raise RuntimeError("AdamW does not support sparse gradients")
+            grads.append(p.grad)
+
+            state = self.state[p]
+
+            # State initialization
+            if len(state) == 0:
+                state["step"] = (
+                    torch.zeros((1,), dtype=torch.float, device=p.device)
+                    if self.defaults["capturable"]
+                    else torch.tensor(0.0)
+                )
+                # Exponential moving average of gradient values
+                state["exp_avg"] = torch.zeros_like(
+                    p, memory_format=torch.preserve_format
+                )
+                # Exponential moving average of squared gradient values
+                state["exp_avg_sq"] = torch.zeros_like(
+                    p, memory_format=torch.preserve_format
+                )
+                if amsgrad:
+                    # Maintains max of all exp. moving avg. of sq. grad. values
+                    state["max_exp_avg_sq"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+
+            exp_avgs.append(state["exp_avg"])
+            exp_avg_sqs.append(state["exp_avg_sq"])
+
+            if amsgrad:
+                max_exp_avg_sqs.append(state["max_exp_avg_sq"])
+
+            state_steps.append(state["step"])
+
     @_use_grad_for_differentiable
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -150,46 +190,8 @@ def step(self, closure=None):
             state_steps = []
             amsgrad = group["amsgrad"]
             beta1, beta2 = group["betas"]
-            differentiable = group["differentiable"]
-
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                params_with_grad.append(p)
-                if p.grad.is_sparse:
-                    raise RuntimeError("AdamW does not support sparse gradients")
-                grads.append(p.grad)
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state["step"] = (
-                        torch.zeros((1,), dtype=torch.float, device=p.device)
-                        if self.defaults["capturable"]
-                        else torch.tensor(0.0)
-                    )
-                    # Exponential moving average of gradient values
-                    state["exp_avg"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-                    # Exponential moving average of squared gradient values
-                    state["exp_avg_sq"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-                    if amsgrad:
-                        # Maintains max of all exp. moving avg. of sq. grad. values
-                        state["max_exp_avg_sq"] = torch.zeros_like(
-                            p, memory_format=torch.preserve_format
-                        )
-
-                exp_avgs.append(state["exp_avg"])
-                exp_avg_sqs.append(state["exp_avg_sq"])
-
-                if amsgrad:
-                    max_exp_avg_sqs.append(state["max_exp_avg_sq"])
 
-                state_steps.append(state["step"])
+            self._init_group(group, params_with_grad, grads, amsgrad, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps)
 
             adamw(
                 params_with_grad,
diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py
index 9fbac14e83566..d590430b93c20 100644
--- a/torch/optim/asgd.py
+++ b/torch/optim/asgd.py
@@ -86,6 +86,29 @@ def __setstate__(self, state):
             for s in state_values:
                 s["mu"] = torch.tensor(float(s["mu"]))
 
+    def _init_group(self, group, params_with_grad, grads, mus, axs, etas, state_steps):
+        for p in group["params"]:
+            if p.grad is not None:
+                params_with_grad.append(p)
+                if p.grad.is_sparse:
+                    raise RuntimeError("ASGD does not support sparse gradients")
+                grads.append(p.grad)
+
+                state = self.state[p]
+                # State initialization
+                if len(state) == 0:
+                    state["step"] = torch.tensor(0.0)
+                    state["eta"] = torch.tensor(group["lr"])
+                    state["mu"] = torch.tensor(1.0)
+                    state["ax"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+
+                mus.append(state["mu"])
+                axs.append(state["ax"])
+                etas.append(state["eta"])
+                state_steps.append(state["step"])
+
     @_use_grad_for_differentiable
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -107,27 +130,7 @@ def step(self, closure=None):
             etas = []
             state_steps = []
 
-            for p in group["params"]:
-                if p.grad is not None:
-                    params_with_grad.append(p)
-                    if p.grad.is_sparse:
-                        raise RuntimeError("ASGD does not support sparse gradients")
-                    grads.append(p.grad)
-
-                    state = self.state[p]
-                    # State initialization
-                    if len(state) == 0:
-                        state["step"] = torch.tensor(0.0)
-                        state["eta"] = torch.tensor(group["lr"])
-                        state["mu"] = torch.tensor(1.0)
-                        state["ax"] = torch.zeros_like(
-                            p, memory_format=torch.preserve_format
-                        )
-
-                    mus.append(state["mu"])
-                    axs.append(state["ax"])
-                    etas.append(state["eta"])
-                    state_steps.append(state["step"])
+            self._init_group(group, params_with_grad, grads, mus, axs, etas, state_steps)
 
             asgd(
                 params_with_grad,
diff --git a/torch/optim/nadam.py b/torch/optim/nadam.py
index 59f3b790b3132..cdc6cbb15f6e6 100644
--- a/torch/optim/nadam.py
+++ b/torch/optim/nadam.py
@@ -90,6 +90,29 @@ def __setstate__(self, state):
             for s in state_values:
                 s['mu_product'] = torch.tensor(s['mu_product'])
 
+    def _init_group(self, group, params_with_grad, grads, exp_avgs, exp_avg_sqs, mu_products, state_steps):
+        for p in group['params']:
+            if p.grad is not None:
+                params_with_grad.append(p)
+                if p.grad.is_sparse:
+                    raise RuntimeError('NAdam does not support sparse gradients')
+                grads.append(p.grad)
+
+                state = self.state[p]
+                # Lazy state initialization
+                if len(state) == 0:
+                    state['step'] = torch.tensor(0.)
+                    state['mu_product'] = torch.tensor(1.)
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+
+                exp_avgs.append(state['exp_avg'])
+                exp_avg_sqs.append(state['exp_avg_sq'])
+                mu_products.append(state['mu_product'])
+                state_steps.append(state['step'])
+
     @_use_grad_for_differentiable
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -112,27 +135,7 @@ def step(self, closure=None):
             state_steps = []
             beta1, beta2 = group['betas']
 
-            for p in group['params']:
-                if p.grad is not None:
-                    params_with_grad.append(p)
-                    if p.grad.is_sparse:
-                        raise RuntimeError('NAdam does not support sparse gradients')
-                    grads.append(p.grad)
-
-                    state = self.state[p]
-                    # Lazy state initialization
-                    if len(state) == 0:
-                        state['step'] = torch.tensor(0.)
-                        state['mu_product'] = torch.tensor(1.)
-                        # Exponential moving average of gradient values
-                        state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                        # Exponential moving average of squared gradient values
-                        state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-                    exp_avgs.append(state['exp_avg'])
-                    exp_avg_sqs.append(state['exp_avg_sq'])
-                    mu_products.append(state['mu_product'])
-                    state_steps.append(state['step'])
+            self._init_group(group, params_with_grad, grads, exp_avgs, exp_avg_sqs, mu_products, state_steps)
 
             nadam(params_with_grad,
                   grads,
diff --git a/torch/optim/radam.py b/torch/optim/radam.py
index 4a3d271e65d33..45bd57db1eb59 100644
--- a/torch/optim/radam.py
+++ b/torch/optim/radam.py
@@ -111,6 +111,31 @@ def __setstate__(self, state):
             for s in state_values:
                 s["step"] = torch.tensor(float(s["step"]))
 
+    def _init_group(self, group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps):
+        for p in group["params"]:
+            if p.grad is not None:
+                params_with_grad.append(p)
+                if p.grad.is_sparse:
+                    raise RuntimeError("RAdam does not support sparse gradients")
+                grads.append(p.grad)
+
+                state = self.state[p]
+                # Lazy state initialization
+                if len(state) == 0:
+                    state["step"] = torch.tensor(0.0)
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+                    # Exponential moving average of squared gradient values
+                    state["exp_avg_sq"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+
+                exp_avgs.append(state["exp_avg"])
+                exp_avg_sqs.append(state["exp_avg_sq"])
+                state_steps.append(state["step"])
+
     @_use_grad_for_differentiable
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -132,29 +157,7 @@ def step(self, closure=None):
             state_steps = []
             beta1, beta2 = group["betas"]
 
-            for p in group["params"]:
-                if p.grad is not None:
-                    params_with_grad.append(p)
-                    if p.grad.is_sparse:
-                        raise RuntimeError("RAdam does not support sparse gradients")
-                    grads.append(p.grad)
-
-                    state = self.state[p]
-                    # Lazy state initialization
-                    if len(state) == 0:
-                        state["step"] = torch.tensor(0.0)
-                        # Exponential moving average of gradient values
-                        state["exp_avg"] = torch.zeros_like(
-                            p, memory_format=torch.preserve_format
-                        )
-                        # Exponential moving average of squared gradient values
-                        state["exp_avg_sq"] = torch.zeros_like(
-                            p, memory_format=torch.preserve_format
-                        )
-
-                    exp_avgs.append(state["exp_avg"])
-                    exp_avg_sqs.append(state["exp_avg_sq"])
-                    state_steps.append(state["step"])
+            self._init_group(group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps)
 
             radam(
                 params_with_grad,
diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py
index 0bf97158e8fa9..9725954a48185 100644
--- a/torch/optim/rmsprop.py
+++ b/torch/optim/rmsprop.py
@@ -114,6 +114,44 @@ def __setstate__(self, state):
             group.setdefault("maximize", False)
             group.setdefault("differentiable", False)
 
+    def _init_group(self, group, params_with_grad, grads, square_avgs, momentum_buffer_list, grad_avgs):
+        for p in group["params"]:
+            if p.grad is None:
+                continue
+            params_with_grad.append(p)
+
+            if p.grad.is_sparse:
+                raise RuntimeError("RMSprop does not support sparse gradients")
+            grads.append(p.grad)
+
+            state = self.state[p]
+
+            # State initialization
+            if len(state) == 0:
+                state["step"] = 0
+                state["square_avg"] = torch.zeros_like(
+                    p, memory_format=torch.preserve_format
+                )
+                if group["momentum"] > 0:
+                    state["momentum_buffer"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+                if group["centered"]:
+                    state["grad_avg"] = torch.zeros_like(
+                        p, memory_format=torch.preserve_format
+                    )
+            square_avgs.append(state["square_avg"])
+
+            if group["momentum"] > 0:
+                momentum_buffer_list.append(state["momentum_buffer"])
+            if group["centered"]:
+                grad_avgs.append(state["grad_avg"])
+
+            if group["differentiable"] and isinstance(state["step"], Tensor):
+                raise RuntimeError("`step` can't be a tensor")
+
+            state["step"] += 1
+
     @_use_grad_for_differentiable
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -134,42 +172,7 @@ def step(self, closure=None):
             grad_avgs = []
             momentum_buffer_list = []
 
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                params_with_grad.append(p)
-
-                if p.grad.is_sparse:
-                    raise RuntimeError("RMSprop does not support sparse gradients")
-                grads.append(p.grad)
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state["step"] = 0
-                    state["square_avg"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-                    if group["momentum"] > 0:
-                        state["momentum_buffer"] = torch.zeros_like(
-                            p, memory_format=torch.preserve_format
-                        )
-                    if group["centered"]:
-                        state["grad_avg"] = torch.zeros_like(
-                            p, memory_format=torch.preserve_format
-                        )
-                square_avgs.append(state["square_avg"])
-
-                if group["momentum"] > 0:
-                    momentum_buffer_list.append(state["momentum_buffer"])
-                if group["centered"]:
-                    grad_avgs.append(state["grad_avg"])
-
-                if group["differentiable"] and isinstance(state["step"], Tensor):
-                    raise RuntimeError("`step` can't be a tensor")
-
-                state["step"] += 1
+            self._init_group(group, params_with_grad, grads, square_avgs, momentum_buffer_list, grad_avgs)
 
             rmsprop(
                 params_with_grad,
diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py
index feab409291537..592f3028668d3 100644
--- a/torch/optim/rprop.py
+++ b/torch/optim/rprop.py
@@ -90,6 +90,42 @@ def __setstate__(self, state):
             group.setdefault("maximize", False)
             group.setdefault("differentiable", False)
 
+    def _init_group(self, group, params, grads, prevs, step_sizes):
+        for p in group["params"]:
+            if p.grad is None:
+                continue
+            params.append(p)
+            grad = p.grad
+            if grad.is_sparse:
+                raise RuntimeError("Rprop does not support sparse gradients")
+
+            grads.append(grad)
+            state = self.state[p]
+
+            # State initialization
+            if len(state) == 0:
+                state["step"] = 0
+                state["prev"] = torch.zeros_like(
+                    p, memory_format=torch.preserve_format
+                )
+                if p.dtype.is_complex:
+                    # Complex Number should be as if they are two independent real numbers.
+                    # Hence the step_size shouldn't be zero for imaginary part.
+                    state["step_size"] = (
+                        grad.new()
+                        .resize_as_(grad)
+                        .fill_(complex(group["lr"], group["lr"]))
+                    )
+                else:
+                    state["step_size"] = (
+                        grad.new().resize_as_(grad).fill_(group["lr"])
+                    )
+
+            prevs.append(state["prev"])
+            step_sizes.append(state["step_size"])
+
+            state["step"] += 1
+
     @_use_grad_for_differentiable
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -113,40 +149,7 @@ def step(self, closure=None):
             foreach = group["foreach"]
             maximize = group["maximize"]
 
-            for p in group["params"]:
-                if p.grad is None:
-                    continue
-                params.append(p)
-                grad = p.grad
-                if grad.is_sparse:
-                    raise RuntimeError("Rprop does not support sparse gradients")
-
-                grads.append(grad)
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state["step"] = 0
-                    state["prev"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-                    if p.dtype.is_complex:
-                        # Complex Number should be as if they are two independent real numbers.
-                        # Hence the step_size shouldn't be zero for imaginary part.
-                        state["step_size"] = (
-                            grad.new()
-                            .resize_as_(grad)
-                            .fill_(complex(group["lr"], group["lr"]))
-                        )
-                    else:
-                        state["step_size"] = (
-                            grad.new().resize_as_(grad).fill_(group["lr"])
-                        )
-
-                prevs.append(state["prev"])
-                step_sizes.append(state["step_size"])
-
-                state["step"] += 1
+            self._init_group(group, params, grads, prevs, step_sizes)
 
             rprop(
                 params,

From 1c0644346a87cd36a955dfc1aeadaa5640f3a69e Mon Sep 17 00:00:00 2001
From: Zheng Yan <zyan@fb.com>
Date: Fri, 2 Dec 2022 01:31:50 +0000
Subject: [PATCH 1500/1922] Revert D41609017: Multisect successfully blamed
 D41609017 for test or build failures (#90034)

Summary:
This diff is reverting D41609017
D41609017 has been identified to be causing the following test or build failures:
Tests affected:
- https://www.internalfb.com/intern/test/281475052567659/
- https://www.internalfb.com/intern/test/562950029295825/

Here's the Multisect link:
https://www.internalfb.com/intern/testinfra/multisect/1440332
Here are the tasks that are relevant to this breakage:
T93368156: 5 tests started failing for oncall admarket_predictor_pushmaster in the last 2 weeks
We're generating a revert to back out the changes in this diff, please note the backout may land if someone accepts it.

Test Plan: NA

Reviewed By: zyan0

Differential Revision: D41656946

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90034
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/flat_param.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 0aeb59527a12a..4f6a7a3695123 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -23,7 +23,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
-from torch.distributed._tensor import DTensor
 from torch.distributed.fsdp._common_utils import (
     _set_fsdp_flattened,
     HandleTrainingState,
@@ -1292,12 +1291,6 @@ def _use_unsharded_views(self, as_params: bool) -> None:
             if hasattr(module, param_name):
                 delattr(module, param_name)
             if self._use_orig_params and as_params:
-                if type(view) is DTensor:
-                    # A `DTensor` `view` is not compatible with assigning
-                    # `param.data = view`, so we cannot preserve the parameter
-                    # variable.
-                    setattr(module, param_name, nn.Parameter(view))
-                    continue
                 param = self.flat_param._params[i]  # type: ignore[index]
                 setattr(module, param_name, param)
                 param.data = view

From c504291cf94e2da52744a52e9359eebae31d7f36 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Fri, 2 Dec 2022 01:45:02 +0000
Subject: [PATCH 1501/1922] Cache guards once per variable tracker, rather than
 re-propagating them repeatedly (#89827)

This improves tracing performance of optimizer tracing significantly (2x). In essence this just removes the recursion from propagate because it is not necessary. ListVariables and ConstDictVariables already contain the guards from the items contained in them.

Adds two other optimizations for special cases of `recursively_contains`

helps with https://github.com/pytorch/torchdynamo/issues/1803

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89827
Approved by: https://github.com/anijain2305, https://github.com/jansel
---
 test/dynamo/test_misc.py          |  8 +++++---
 torch/_dynamo/symbolic_convert.py | 10 ++++++++++
 torch/_dynamo/variables/base.py   |  9 ++-------
 torch/_dynamo/variables/dicts.py  |  5 ++++-
 torch/_dynamo/variables/lists.py  | 29 ++++++++++++++++++++++-------
 5 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index e7af1420c6f66..12f8a2357043d 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -1527,10 +1527,12 @@ def f2(input):
         self.assertEqual(res2, 9)
 
     def test_const_dict_variable_python_type(self):
-        from torch._dynamo.variables import ConstDictVariable
+        from torch._dynamo.variables import ConstantVariable, ConstDictVariable
 
-        d1 = {"a": 10, "b": 20}
-        d2 = collections.OrderedDict([("x", 12), ("y", 22)])
+        d1 = {"a": ConstantVariable(10), "b": ConstantVariable(20)}
+        d2 = collections.OrderedDict(
+            [("x", ConstantVariable(12)), ("y", ConstantVariable(22))]
+        )
         self.assertEqual(ConstDictVariable(d1, dict).python_type(), dict)
         self.assertEqual(
             ConstDictVariable(d2, collections.OrderedDict).python_type(),
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 9b2a39ef3384e..df7966c50168b 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -1115,10 +1115,20 @@ def LIST_APPEND(self, inst):
         obj = self.stack[-inst.arg]
         assert isinstance(obj, ListVariable)
         assert obj.mutable_local
+        # only copy if the new obj contains other mutables
+        new_rec_contains = obj.recursively_contains
+        if v.recursively_contains or v.mutable_local:
+            new_rec_contains = obj.recursively_contains.union(v.recursively_contains)
+
+            if v.mutable_local:
+                new_rec_contains.add(v.mutable_local)
+
         self.replace_all(
             obj,
             ListVariable(
                 obj.items + [v],
+                recursively_contains=new_rec_contains,
+                regen_guards=False,
                 **VariableTracker.propagate([obj, v]),
             ),
         )
diff --git a/torch/_dynamo/variables/base.py b/torch/_dynamo/variables/base.py
index 4c5aee344061e..52161a8dbdcb6 100644
--- a/torch/_dynamo/variables/base.py
+++ b/torch/_dynamo/variables/base.py
@@ -49,13 +49,6 @@ def visit(var):
             if type(var) in (list, tuple, dict_values, odict_values):
                 for i in var:
                     visit(i)
-            elif isinstance(var, variables.BaseListVariable):
-                guards.update(var.guards)
-                for i in var.items:
-                    visit(i)
-            elif isinstance(var, variables.ConstDictVariable):
-                guards.update(var.guards)
-                visit(var.items.values())
             else:
                 assert isinstance(var, VariableTracker), typestr(var)
                 guards.update(var.guards)
@@ -289,6 +282,8 @@ def aggregate_mutables(var):
                 aggregate_mutables, self, skip_fn=lambda var: var is not self
             )
 
+        assert None not in self.recursively_contains
+
 
 def typestr(*objs):
     if len(objs) == 1:
diff --git a/torch/_dynamo/variables/dicts.py b/torch/_dynamo/variables/dicts.py
index f28efc713db4a..e05eecffc7e61 100644
--- a/torch/_dynamo/variables/dicts.py
+++ b/torch/_dynamo/variables/dicts.py
@@ -20,6 +20,8 @@ def __init__(self, items, user_cls, recursively_contains=None, **kwargs):
         super(ConstDictVariable, self).__init__(
             recursively_contains=recursively_contains, **kwargs
         )
+
+        self.guards.update(VariableTracker.propagate(items.values())["guards"])
         self.items = items
         self.user_cls = user_cls
 
@@ -259,7 +261,8 @@ def call_method(
                     new_rec_contains = self.recursively_contains.union(
                         default_var.recursively_contains
                     )
-                    new_rec_contains.add(default_var.mutable_local)
+                    if default_var.mutable_local is not None:
+                        new_rec_contains.add(default_var.mutable_local)
                     tx.replace_all(
                         self, self.modifed(new_val, new_rec_contains, **options)
                     )
diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py
index 8214edcc4c9de..82dc0424820b8 100644
--- a/torch/_dynamo/variables/lists.py
+++ b/torch/_dynamo/variables/lists.py
@@ -24,13 +24,22 @@ def cls_for(obj):
         }[obj]
 
     def __init__(
-        self, items: List[VariableTracker], recursively_contains=None, **kwargs
+        self,
+        items: List[VariableTracker],
+        recursively_contains=None,
+        regen_guards=True,
+        **kwargs,
     ):
         super(BaseListVariable, self).__init__(
             recursively_contains=recursively_contains, **kwargs
         )
         assert isinstance(items, list)
         assert all(isinstance(x, VariableTracker) for x in items)
+
+        # Sometimes, we know that we have passed in the guards from the items in the list
+        if regen_guards:
+            self.guards.update(VariableTracker.propagate(items)["guards"])
+
         self.items: List[VariableTracker] = items
 
     def _as_proxy(self):
@@ -159,11 +168,15 @@ def call_method(
             assert not kwargs
             (arg,) = args
             new_rec_contains = self.recursively_contains.union(arg.recursively_contains)
-            new_rec_contains.add(arg.mutable_local)
+            if arg.mutable_local is not None:
+                new_rec_contains.add(arg.mutable_local)
             tx.replace_all(
                 self,
                 ListVariable(
-                    self.items + [arg], recursively_contains=new_rec_contains, **options
+                    self.items + [arg],
+                    recursively_contains=new_rec_contains,
+                    regen_guards=False,
+                    **options,
                 ),
             )
             return ConstantVariable(None)
@@ -179,6 +192,7 @@ def call_method(
                 self,
                 ListVariable(
                     list(self.items) + list(arg.unpack_var_sequence(tx)),
+                    regen_guards=False,
                     **options,
                 ),
             )
@@ -189,7 +203,7 @@ def call_method(
             items.insert(idx.as_python_constant(), value)
             return tx.replace_all(
                 self,
-                ListVariable(items, **options),
+                ListVariable(items, regen_guards=False, **options),
             )
         elif name == "pop" and self.mutable_local:
             assert not kwargs
@@ -197,14 +211,14 @@ def call_method(
             result = items.pop(*[a.as_python_constant() for a in args])
             tx.replace_all(
                 self,
-                ListVariable(items, **options),
+                ListVariable(items, regen_guards=False, **options),
             )
             return result
         elif name == "clear" and self.mutable_local:
             assert not kwargs and not args
             return tx.replace_all(
                 self,
-                ListVariable([], **options),
+                ListVariable([], regen_guards=False, **options),
             )
         elif (
             name == "__setitem__"
@@ -219,7 +233,7 @@ def call_method(
                 items[key.as_python_constant()] = list(value.items)
             else:
                 items[key.as_python_constant()] = value
-            result = ListVariable(items, **options)
+            result = ListVariable(items, regen_guards=False, **options)
             return tx.replace_all(self, result)
         else:
             return super().call_method(tx, name, args, kwargs)
@@ -491,6 +505,7 @@ def next_variables(self):
             self.items,
             self.index + 1,
             mutable_local=MutableLocal(),
+            recursively_contains=self.recursively_contains,
             **VariableTracker.propagate([self]),
         )
 

From 38d0da11b6a88991e504bc2b1e10368050a36e1f Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Fri, 2 Dec 2022 01:51:30 +0000
Subject: [PATCH 1502/1922] [LTC] Remove noop_execution_mode_ (#89989)

Summary:
noop_execution_mode_ doesn't seem to be useful anymore. Let's remove it.

Test Plan:
CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89989
Approved by: https://github.com/desertfire, https://github.com/JackCaoG
---
 torch/csrc/lazy/core/lazy_graph_executor.cpp | 7 +------
 torch/csrc/lazy/core/lazy_graph_executor.h   | 8 --------
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp
index 1201971f3bc2d..787a39ca02f89 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.cpp
+++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp
@@ -896,12 +896,7 @@ std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::
       std::move(tensors_data),
       std::move(cached_computation));
 
-  auto syncfn = [this, async, hash = coll->hash]() {
-    // For profiling lazy trace overhead
-    if (noop_execution_mode_) {
-      return;
-    }
-
+  auto syncfn = [async, hash = coll->hash]() {
     try {
       VLOG(3) << "Executing IR graph hash " << HashToString(hash)
               << " on device " << async->device << " ...";
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.h b/torch/csrc/lazy/core/lazy_graph_executor.h
index 9894295f3b32a..16d7e15b7ab25 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.h
+++ b/torch/csrc/lazy/core/lazy_graph_executor.h
@@ -118,12 +118,6 @@ class TORCH_API LazyGraphExecutor {
       const Shape& shape,
       const BackendDevice& device);
 
-  // Configure the executor treat compile/execute API calls as no-ops
-  // for use when profiling lazy trace overheads
-  void SetNoOpExecutionMode(bool enable_noop) {
-    noop_execution_mode_ = enable_noop;
-  }
-
   struct CachedComputation {
     explicit CachedComputation(ComputationPtr computation)
         : computation(std::move(computation)) {}
@@ -256,8 +250,6 @@ class TORCH_API LazyGraphExecutor {
       const std::vector<LazyTensorPtr>& tensors,
       c10::ArrayRef<size_t> indices,
       c10::ArrayRef<BackendDataPtr> tensors_data);
-
-  bool noop_execution_mode_ = false;
 };
 
 } // namespace lazy

From 543abeefbedf7a0c859faf969f44c7c3f2e93a51 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Wed, 30 Nov 2022 21:07:19 -0500
Subject: [PATCH 1503/1922] update quantization doc: add x86 backend as default
 backend of server inference (#86794)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86794
Approved by: https://github.com/jgong5, https://github.com/kit1980
---
 .../quantization-backend-configuration.rst    |  2 +-
 docs/source/quantization.rst                  | 51 +++++++++++--------
 2 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/docs/source/quantization-backend-configuration.rst b/docs/source/quantization-backend-configuration.rst
index 07fd875fa9b34..bfe93ce701e62 100644
--- a/docs/source/quantization-backend-configuration.rst
+++ b/docs/source/quantization-backend-configuration.rst
@@ -13,7 +13,7 @@ Default values for native configurations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Below is the output of the configuration for quantization of ops
-in fbgemm and qnnpack (PyTorch's default quantized backends).
+in x86 and qnnpack (PyTorch's default quantized backends).
 
 Results:
 
diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst
index c55a2a354f15f..3d95f72bf2b5c 100644
--- a/docs/source/quantization.rst
+++ b/docs/source/quantization.rst
@@ -256,11 +256,14 @@ PTSQ API Example::
   model_fp32.eval()
 
   # attach a global qconfig, which contains information about what kind
-  # of observers to attach. Use 'fbgemm' for server inference and
-  # 'qnnpack' for mobile inference. Other quantization configurations such
-  # as selecting symmetric or asymmetric quantization and MinMax or L2Norm
-  # calibration techniques can be specified here.
-  model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
+  # of observers to attach. Use 'x86' for server inference and 'qnnpack'
+  # for mobile inference. Other quantization configurations such as selecting
+  # symmetric or assymetric quantization and MinMax or L2Norm calibration techniques
+  # can be specified here.
+  # Note: the old 'fbgemm' is still available but 'x86' is the recommended default
+  # for server inference.
+  # model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
+  model_fp32.qconfig = torch.quantization.get_default_qconfig('x86')
 
   # Fuse the activations to preceding layers, where applicable.
   # This needs to be done manually depending on the model architecture.
@@ -352,11 +355,14 @@ QAT API Example::
   model_fp32.eval()
 
   # attach a global qconfig, which contains information about what kind
-  # of observers to attach. Use 'fbgemm' for server inference and
-  # 'qnnpack' for mobile inference. Other quantization configurations such
-  # as selecting symmetric or asymmetric quantization and MinMax or L2Norm
-  # calibration techniques can be specified here.
-  model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
+  # of observers to attach. Use 'x86' for server inference and 'qnnpack'
+  # for mobile inference. Other quantization configurations such as selecting
+  # symmetric or assymetric quantization and MinMax or L2Norm calibration techniques
+  # can be specified here.
+  # Note: the old 'fbgemm' is still available but 'x86' is the recommended default
+  # for server inference.
+  # model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
+  model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('x86')
 
   # fuse the activations to preceding layers, where applicable
   # this needs to be done manually depending on the model architecture
@@ -732,7 +738,7 @@ Backend/Hardware Support
 |                 |               |Quantization|Mode        |Mode Support|
 |                 |               |            |Quantization|            |
 +-----------------+---------------+------------+------------+------------+
-|server CPU       |fbgemm         |Supported                |All         |
+|server CPU       |fbgemm/onednn  |Supported                |All         |
 |                 |               |                         |Supported   |
 +-----------------+---------------+                         |            +
 |mobile CPU       |qnnpack/xnnpack|                         |            |
@@ -746,30 +752,31 @@ Backend/Hardware Support
 
 Today, PyTorch supports the following backends for running quantized operators efficiently:
 
-* x86 CPUs with AVX2 support or higher (without AVX2 some operations have inefficient implementations), via `fbgemm <https://github.com/pytorch/FBGEMM>`_
+* x86 CPUs with AVX2 support or higher (without AVX2 some operations have inefficient implementations), via `x86` optimized by `fbgemm <https://github.com/pytorch/FBGEMM>`_ and `onednn <https://github.com/oneapi-src/oneDNN>`_ (see the details at `RFC <https://github.com/pytorch/pytorch/issues/83888>`_)
 * ARM CPUs (typically found in mobile/embedded devices), via `qnnpack <https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native/quantized/cpu/qnnpack>`_
 * (early prototype) support for NVidia GPU via `TensorRT <https://developer.nvidia.com/tensorrt>`_ through `fx2trt` (to be open sourced)
 
 
 Note for native CPU backends
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-We expose both `fbgemm` and `qnnpack` with the same native pytorch quantized operators, so we need additional flag to distinguish between them. The corresponding implementation of `fbgemm` and `qnnpack` is chosen automatically based on the PyTorch build mode, though users have the option to override this by setting `torch.backends.quantization.engine` to `fbgemm` or `qnnpack`.
+We expose both `x86` and `qnnpack` with the same native pytorch quantized operators, so we need additional flag to distinguish between them. The corresponding implementation of  `x86` and `qnnpack` is chosen automatically based on the PyTorch build mode, though users have the option to override this by setting `torch.backends.quantization.engine` to `x86` or `qnnpack`.
 
 When preparing a quantized model, it is necessary to ensure that qconfig
 and the engine used for quantized computations match the backend on which
 the model will be executed. The qconfig controls the type of observers used
-during the quantization passes. The qengine controls whether `fbgemm` or
-`qnnpack` specific packing function is used when packing weights for linear
-and convolution functions and modules. For example:
+during the quantization passes. The qengine controls whether `x86` or `qnnpack`
+specific packing function is used when packing weights for
+linear and convolution functions and modules. For example:
 
-Default settings for fbgemm::
+Default settings for x86::
 
     # set the qconfig for PTQ
-    qconfig = torch.quantization.get_default_qconfig('fbgemm')
+    # Note: the old 'fbgemm' is still available but 'x86' is the recommended default on x86 CPUs
+    qconfig = torch.quantization.get_default_qconfig('x86')
     # or, set the qconfig for QAT
-    qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
+    qconfig = torch.quantization.get_default_qat_qconfig('x86')
     # set the qengine to control weight packing
-    torch.backends.quantized.engine = 'fbgemm'
+    torch.backends.quantized.engine = 'x86'
 
 Default settings for qnnpack::
 
@@ -996,11 +1003,11 @@ Custom API Example::
 Best Practices
 --------------
 
-1. If you are using the ``fbgemm`` backend, we need to use 7 bits instead of 8 bits. Make sure you reduce the range for the ``quant\_min``, ``quant\_max``, e.g.
+1. If you are using the ``x86`` backend, we need to use 7 bits instead of 8 bits. Make sure you reduce the range for the ``quant\_min``, ``quant\_max``, e.g.
 if ``dtype`` is ``torch.quint8``, make sure to set a custom ``quant_min`` to be ``0`` and ``quant_max`` to be ``127`` (``255`` / ``2``)
 if ``dtype`` is ``torch.qint8``, make sure to set a custom ``quant_min`` to be ``-64`` (``-128`` / ``2``) and ``quant_max`` to be ``63`` (``127`` / ``2``), we already set this correctly if
 you call the `torch.ao.quantization.get_default_qconfig(backend)` or `torch.ao.quantization.get_default_qat_qconfig(backend)` function to get the default ``qconfig`` for
-``fbgemm`` or ``qnnpack`` backend
+``x86`` or ``qnnpack`` backend
 
 Frequently Asked Questions
 --------------------------

From 7e1c77046dd0dcb14fe4fc427caf38c0d68749b6 Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Fri, 2 Dec 2022 02:37:36 +0000
Subject: [PATCH 1504/1922] [inductor] Deterministic kernel names (#89713)

`node.origins` is a set and does not have an order. Therefore, inductor w and w/o cudagraphs experiments generate different kernel names, making it hard to debug.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89713
Approved by: https://github.com/soumith, https://github.com/mlazos, https://github.com/ngimel
---
 torch/_inductor/codegen/triton.py |  2 +-
 torch/_inductor/utils.py          | 22 ++++++++++++++--------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index de84d4ddbeff5..416310ca7e89e 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -1306,7 +1306,7 @@ def codegen_node_schedule(self, node_schedule, numel, reduction_numel):
                 if config.triton.descriptive_kernel_names
                 else ""
             )
-            kernel_name = "triton_" + fused_name + wrapper.next_kernel_suffix()
+            kernel_name = "_".join(["triton", fused_name, wrapper.next_kernel_suffix()])
             wrapper.kernels[src_code] = kernel_name
             subs_name = kernel_name if config.triton.ordered_kernel_names else "triton_"
             src_code = src_code.replace("KERNEL_NAME", subs_name)
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index 36a645c99a97b..a2dc47a5628ec 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -258,14 +258,20 @@ def wrapper(self):
 def get_fused_kernel_name(node_schedule):
     return "_".join(
         ["fused"]
-        + [
-            str(origin.name)
-            for origin in functools.reduce(
-                operator.or_,
-                [node.node.origins for node in node_schedule if hasattr(node, "node")],
-            )
-            if origin.op == "call_function"
-        ][0 : config.kernel_name_max_ops]
+        + sorted(
+            [
+                str(origin.name)
+                for origin in functools.reduce(
+                    operator.or_,
+                    [
+                        node.node.origins
+                        for node in node_schedule
+                        if hasattr(node, "node")
+                    ],
+                )
+                if origin.op == "call_function"
+            ]
+        )[0 : config.kernel_name_max_ops]
     )
 
 
From e2043cd61c6b55bf204aa84db5c1477acc996dc1 Mon Sep 17 00:00:00 2001
From: Catherine Lee <csl@fb.com>
Date: Fri, 2 Dec 2022 02:40:24 +0000
Subject: [PATCH 1505/1922] Change periodic concurrency group (#89850)

it hasnt been running the mem leak check b/c it keeps getting cancelled due to a higher priority job
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89850
Approved by: https://github.com/malfet, https://github.com/seemethere
---
 .github/workflows/periodic.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 9a188345899dc..4c47cdfe57a0a 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -10,7 +10,7 @@ on:
   workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
   cancel-in-progress: true
 
 jobs:

From d3531cb59172e9f7fd18fe54a537cb91b2539dbc Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 2 Dec 2022 03:17:53 +0000
Subject: [PATCH 1506/1922] [vision hash update] update the pinned vision hash
 (#90035)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90035
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 120f58b23e09f..f18c091c93b2e 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-a718345a8d60c73a441f6254d6eae456c8a6d787
+790f1cdcea0359619adfc9ec37b91883748d1854

From ba7d182356b2d5092a306a1a0613401ff335803c Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Thu, 1 Dec 2022 14:14:34 -0800
Subject: [PATCH 1507/1922] [quant][fx] Add support for matching multiple
 arguments in patterns (#89986)

Summary:
This PR adds support for matching patterns that has multiple arguments, it's needed for quantization in PyTorch 2.0 early prototype

Before this PR, we only support patterns like:
```
x -> conv -> bn -> relu
(relu, (bn, conv))
```
where each operator has a single node, the code breaks when we want to match a pattern that has an op that has multiple arguments, such as:
```
                           shape \
        transpose -> reshape -> output ->
```
where `reshape` has two arguments

Test Plan:
python test/test_quantization.py TestQuantizeFx.test_match_pattern_with_multiple_args

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89986
Approved by: https://github.com/vkuzo
---
 test/quantization/fx/test_quantize_fx.py | 50 ++++++++++++++++++++++++
 torch/ao/quantization/fx/match_utils.py  | 17 +++++++-
 torch/ao/quantization/fx/prepare.py      | 13 +++---
 3 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 7c80a58412cdc..de6cf18a0b0dd 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -5476,6 +5476,56 @@ def forward(self, x):
             self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence)
             self.checkGraphModuleNodes(m_ref, expected_node_occurrence=node_occurrence_ref)
 
+    def test_match_pattern_with_multiple_args(self):
+        """ Test that we can match a pattern that has multiple arguments
+        Pattern:
+                           shape \
+        transpose (observed) -> reshape -> output (observed) ->
+
+        where `reshape` has two arguments
+        """
+
+        def _get_pattern_configs():
+            backend_pattern_configs = []
+            observation_type = ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
+            weighted_op_quint8_dtype_config = DTypeConfig(
+                input_dtype=torch.quint8,
+                output_dtype=torch.quint8,
+                weight_dtype=torch.qint8,
+                bias_dtype=torch.float,
+            )
+            dtype_configs = [weighted_op_quint8_dtype_config]
+
+            def root_node_getter(node_pattern):
+                reshape, transpose, shape = node_pattern
+                return transpose
+
+            backend_pattern_configs.append(
+                BackendPatternConfig((torch.reshape, torch.transpose, MatchAllNode))
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                ._set_root_node_getter(root_node_getter))
+            return backend_pattern_configs
+
+        backend_config = BackendConfig().set_backend_pattern_configs(_get_pattern_configs())
+
+        class M(torch.nn.Module):
+            def forward(self, x):
+                x = torch.transpose(x, 0, 1)
+                x = torch.reshape(x, (-1,))
+                return x
+
+        m = M().eval()
+        qconfig_mapping = QConfigMapping().set_global(default_qconfig)
+        example_inputs = (torch.randn(1, 3, 3, 3),)
+        m = prepare_fx(m, qconfig_mapping, example_inputs, backend_config=backend_config)
+        node_occurrence = {
+            # one for input of the pattern and one for output of the pattern
+            ns.call_module(MinMaxObserver): 2
+        }
+        self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence)
+
+
 @skipIfNoFBGEMM
 class TestQuantizeFxOps(QuantizationTestCase):
     def setUp(self):
diff --git a/torch/ao/quantization/fx/match_utils.py b/torch/ao/quantization/fx/match_utils.py
index 56d66d00b2b6f..631916da7e3f3 100644
--- a/torch/ao/quantization/fx/match_utils.py
+++ b/torch/ao/quantization/fx/match_utils.py
@@ -18,7 +18,7 @@
     is_observed_standalone_module,
 )
 from torch.nn.utils.parametrize import type_before_parametrizations
-from typing import Any, Dict, List, Callable, Optional, Tuple, Type, Set
+from typing import Any, Dict, List, Callable, Optional, Tuple, Type, Set, Iterable
 
 
 # TODO: revisit this list. Many helper methods shouldn't be public
@@ -133,6 +133,8 @@ def _recursive_record_node_in_match_map(
         if isinstance(node_pattern, Node):
             match_map[node_pattern.name] = (
                 last_node, matched_node_pattern, pattern, match_value)
+        elif not isinstance(node_pattern, Iterable):
+            return
         else:
             for n in node_pattern:
                 _recursive_record_node_in_match_map(last_node, match_map, n, matched_node_pattern, pattern, match_value)
@@ -146,6 +148,7 @@ def record_match(
             match_map):
         if isinstance(pattern, tuple):
             s, *args = pattern
+            is_single_arg = len(args) == 1
             current_node_pattern: List[Node] = []
             record_match(
                 s,
@@ -162,7 +165,17 @@ def record_match(
                         current_node_pattern,
                         match_map)
             if len(current_node_pattern) > 1:
-                matched_node_pattern.append(tuple(current_node_pattern))
+                # current_node_pattern is  the node pattern we get from matching
+                # the subpattern with arguments of the node
+                # we use is_single_arg to recover the original structure of the pattern
+                # if the original pattern has a single argument, we will have
+                # (original_op, (original_arg, ...))
+                # otherwise, we'll have a list of arguments
+                # (original_op, arg0, arg1, arg2, ...)
+                if is_single_arg:
+                    matched_node_pattern.append(tuple(current_node_pattern))
+                else:
+                    matched_node_pattern.extend(list(current_node_pattern))
             else:
                 matched_node_pattern.append(current_node_pattern[0])
         else:
diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 03ff6a764f77d..73c0c2fde69a0 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -240,10 +240,10 @@ def _is_pattern_dtype_config_and_qconfig_supported_by_backend(
     assert matched_node_pattern is not None and len(matched_node_pattern) >= 1
     pattern_to_dtype_configs = get_pattern_to_dtype_configs(backend_config)
     dtype_configs: List[DTypeConfig] = pattern_to_dtype_configs.get(pattern, [])
+    pattern_to_root_node_getter = get_fusion_pattern_to_root_node_getter(backend_config)
 
-    # TODO: this only works for one input and one output patterns, need to generalize to multiple
-    # inputs/output
-    root_node = _default_root_node_getter(matched_node_pattern)
+    root_node_getter = pattern_to_root_node_getter.get(pattern, _default_root_node_getter)
+    root_node = root_node_getter(matched_node_pattern)
     input_node = root_node
     output_node = matched_node_pattern[0]
     for dtype_config in dtype_configs:
@@ -1262,10 +1262,9 @@ def insert_observers_for_model(
                             if user != node and is_user_quantized:
                                 is_quantized_branch = True
 
-                    # TODO: this only works for sequential fusion right now, extend it
-                    # it to automatically detect all input nodes based on the pattern
-                    # need to change find_matches function to return this information
-                    root_node = _default_root_node_getter(matched_node_pattern)
+                    pattern_to_root_node_getter = get_fusion_pattern_to_root_node_getter(backend_config)
+                    root_node_getter = pattern_to_root_node_getter.get(pattern, _default_root_node_getter)
+                    root_node = root_node_getter(matched_node_pattern)
                     is_input_node_of_the_pattern = node is root_node
                     if is_input_node_of_the_pattern:
                         # this modifies node inplace

From ffe392f7a70b4096e73a1d71fab5f3ecc91ebc97 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Thu, 1 Dec 2022 08:35:18 -0800
Subject: [PATCH 1508/1922] [Profiler] Memory profiler part 12: Emit timeline
 of memory events. (#89355)

Add a simple interface to get a flat representation of the memory profile.

Differential Revision: [D40868663](https://our.internmc.facebook.com/intern/diff/D40868663/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89355
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 213 +++++++++++++++++++++++++-
 torch/profiler/_memory_profiler.py    |  47 ++++++
 2 files changed, 259 insertions(+), 1 deletion(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 01f2263807d34..0cb39600bc3a4 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -87,7 +87,9 @@ def __torch_dispatch__(self, func, types, args=..., kwargs=None):
         flat_inputs = self.flat_ids(args) + self.flat_ids(kwargs)
         out = func(*args, **kwargs)
         flat_outputs = self.flat_ids(out)
-        if (flat_inputs or flat_outputs) and "_record_function_enter" not in func.name():
+        if (
+            flat_inputs or flat_outputs
+        ) and "_record_function_enter" not in func.name():
             self.results.append((func.name(), flat_inputs, flat_outputs))
         return out
 
@@ -1413,6 +1415,215 @@ def step_fn(mark_region):
             aten::detach                             29 (GRADIENT)                                 -> ???""",
         )
 
+    def test_memory_timeline(self) -> None:
+        model = torch.nn.Sequential(
+            torch.nn.Linear(2, 4, bias=True),
+            torch.nn.ReLU(),
+            torch.nn.Linear(4, 4, bias=False),
+            torch.nn.Softmax(dim=1),
+        )
+        optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
+
+        with profile() as prof:
+            x = torch.ones((2, 2))
+            targets = torch.ones((2, 4))
+            y = model(x)
+            loss = torch.sum((y - targets) ** 2).mean()
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+
+        memory_profile = prof._memory_profile()
+        timeline = memory_profile.timeline
+        times = tuple(t for t, _, _ in timeline)
+        self.assertTrue(all(t1 >= t0 for t0, t1 in zip(times, times[1:])), times)
+        self.assertTrue(
+            all(
+                (t == -1) if action == _memory_profiler.Action.PREEXISTING else (t > 0)
+                for t, action, _ in timeline
+            )
+        )
+
+        def category_name(category):
+            return category.name if category else "???"
+
+        def format_action(action, key, version):
+            category = memory_profile._categories.get(key, version)
+            if action == _memory_profiler.Action.INCREMENT_VERSION:
+                new_category = memory_profile._categories.get(key, version + 1)
+                if category != new_category:
+                    return f"{category_name(category)} -> {category_name(new_category)}"
+            return category_name(category)
+
+        lines = [
+            f"{action.name.lower():<25}  {format_action(action, key, version):<25}  "
+            f"{key.storage.allocation_id:>2}  v{version}"
+            for _, action, (key, version) in prof._memory_profile().timeline
+        ]
+
+        self.assertExpectedInline(
+            textwrap.indent("\n".join(lines), " " * 12),
+            """\
+            preexisting                PARAMETER                   3  v0
+            preexisting                PARAMETER                   4  v0
+            preexisting                PARAMETER                   7  v0
+            create                     INPUT                       1  v0
+            create                     INPUT                       2  v0
+            create                     ACTIVATION                  5  v0
+            create                     ACTIVATION                  6  v0
+            destroy                    ACTIVATION                  5  v0
+            create                     ACTIVATION                  8  v0
+            create                     ACTIVATION                  9  v0
+            destroy                    ACTIVATION                  8  v0
+            create                     ACTIVATION                 10  v0
+            create                     ACTIVATION                 11  v0
+            create                     ACTIVATION                 12  v0
+            destroy                    ACTIVATION                 11  v0
+            create                     ACTIVATION                 13  v0
+            create                     TEMPORARY                  14  v0
+            create                     TEMPORARY                  15  v0
+            destroy                    TEMPORARY                  15  v0
+            destroy                    TEMPORARY                  14  v0
+            create                     ACTIVATION                 16  v0
+            create                     TEMPORARY                  17  v0
+            create                     TEMPORARY                  18  v0
+            create                     AUTOGRAD_DETAIL            19  v0
+            destroy                    TEMPORARY                  18  v0
+            destroy                    TEMPORARY                  17  v0
+            destroy                    ACTIVATION                 12  v0
+            create                     TEMPORARY                  20  v0
+            create                     TEMPORARY                  21  v0
+            create                     TEMPORARY                  22  v0
+            create                     TEMPORARY                  23  v0
+            destroy                    TEMPORARY                  22  v0
+            destroy                    TEMPORARY                  21  v0
+            create                     AUTOGRAD_DETAIL            24  v0
+            destroy                    TEMPORARY                  23  v0
+            destroy                    TEMPORARY                  20  v0
+            destroy                    AUTOGRAD_DETAIL            19  v0
+            destroy                    ACTIVATION                 10  v0
+            increment_version          AUTOGRAD_DETAIL            24  v0
+            create                     AUTOGRAD_DETAIL            25  v0
+            destroy                    AUTOGRAD_DETAIL            24  v1
+            create                     GRADIENT                   26  v0
+            create                     AUTOGRAD_DETAIL            27  v0
+            destroy                    AUTOGRAD_DETAIL            25  v0
+            create                     AUTOGRAD_DETAIL            28  v0
+            destroy                    AUTOGRAD_DETAIL            27  v0
+            destroy                    ACTIVATION                  6  v0
+            create                     GRADIENT                   29  v0
+            create                     GRADIENT                   30  v0
+            destroy                    AUTOGRAD_DETAIL            28  v0
+            destroy                    ACTIVATION                 16  v0
+            create                     OPTIMIZER_STATE            31  v0
+            increment_version          OPTIMIZER_STATE            31  v0
+            create                     OPTIMIZER_STATE            32  v0
+            create                     OPTIMIZER_STATE            33  v0
+            create                     OPTIMIZER_STATE            34  v0
+            increment_version          OPTIMIZER_STATE            34  v0
+            create                     OPTIMIZER_STATE            35  v0
+            create                     OPTIMIZER_STATE            36  v0
+            create                     OPTIMIZER_STATE            37  v0
+            increment_version          OPTIMIZER_STATE            37  v0
+            create                     OPTIMIZER_STATE            38  v0
+            create                     OPTIMIZER_STATE            39  v0
+            create                     ???                        40  v0
+            increment_version          OPTIMIZER_STATE            31  v1
+            create                     TEMPORARY                  41  v0
+            destroy                    TEMPORARY                  41  v0
+            destroy                    ???                        40  v0
+            create                     INPUT                      42  v0
+            increment_version          OPTIMIZER_STATE            32  v0
+            create                     TEMPORARY                  43  v0
+            destroy                    TEMPORARY                  43  v0
+            destroy                    INPUT                      42  v0
+            increment_version          OPTIMIZER_STATE            32  v1
+            create                     INPUT                      44  v0
+            increment_version          OPTIMIZER_STATE            33  v0
+            create                     TEMPORARY                  45  v0
+            destroy                    TEMPORARY                  45  v0
+            destroy                    INPUT                      44  v0
+            increment_version          OPTIMIZER_STATE            33  v1
+            create                     ???                        46  v0
+            create                     INPUT                      47  v0
+            create                     TEMPORARY                  48  v0
+            create                     ???                        49  v0
+            destroy                    TEMPORARY                  48  v0
+            destroy                    INPUT                      47  v0
+            destroy                    ???                        46  v0
+            create                     INPUT                      50  v0
+            increment_version          ???                        49  v0
+            create                     TEMPORARY                  51  v0
+            destroy                    TEMPORARY                  51  v0
+            destroy                    INPUT                      50  v0
+            increment_version          PARAMETER                   3  v0
+            create                     ???                        52  v0
+            increment_version          OPTIMIZER_STATE            34  v1
+            create                     TEMPORARY                  53  v0
+            destroy                    TEMPORARY                  53  v0
+            destroy                    ???                        52  v0
+            create                     INPUT                      54  v0
+            increment_version          OPTIMIZER_STATE            35  v0
+            create                     TEMPORARY                  55  v0
+            destroy                    TEMPORARY                  55  v0
+            destroy                    INPUT                      54  v0
+            increment_version          OPTIMIZER_STATE            35  v1
+            create                     INPUT                      56  v0
+            increment_version          OPTIMIZER_STATE            36  v0
+            create                     TEMPORARY                  57  v0
+            destroy                    TEMPORARY                  57  v0
+            destroy                    INPUT                      56  v0
+            increment_version          OPTIMIZER_STATE            36  v1
+            create                     ???                        58  v0
+            create                     INPUT                      59  v0
+            create                     TEMPORARY                  60  v0
+            create                     ???                        61  v0
+            destroy                    TEMPORARY                  60  v0
+            destroy                    INPUT                      59  v0
+            destroy                    ???                        58  v0
+            create                     INPUT                      62  v0
+            increment_version          ???                        61  v0
+            create                     TEMPORARY                  63  v0
+            destroy                    TEMPORARY                  63  v0
+            destroy                    INPUT                      62  v0
+            destroy                    ???                        49  v1
+            increment_version          PARAMETER                   4  v0
+            create                     ???                        64  v0
+            increment_version          OPTIMIZER_STATE            37  v1
+            create                     TEMPORARY                  65  v0
+            destroy                    TEMPORARY                  65  v0
+            destroy                    ???                        64  v0
+            create                     INPUT                      66  v0
+            increment_version          OPTIMIZER_STATE            38  v0
+            create                     TEMPORARY                  67  v0
+            destroy                    TEMPORARY                  67  v0
+            destroy                    INPUT                      66  v0
+            increment_version          OPTIMIZER_STATE            38  v1
+            create                     INPUT                      68  v0
+            increment_version          OPTIMIZER_STATE            39  v0
+            create                     TEMPORARY                  69  v0
+            destroy                    TEMPORARY                  69  v0
+            destroy                    INPUT                      68  v0
+            increment_version          OPTIMIZER_STATE            39  v1
+            create                     ???                        70  v0
+            create                     INPUT                      71  v0
+            create                     TEMPORARY                  72  v0
+            create                     ???                        73  v0
+            destroy                    TEMPORARY                  72  v0
+            destroy                    INPUT                      71  v0
+            destroy                    ???                        70  v0
+            create                     INPUT                      74  v0
+            increment_version          ???                        73  v0
+            create                     TEMPORARY                  75  v0
+            destroy                    TEMPORARY                  75  v0
+            destroy                    INPUT                      74  v0
+            destroy                    ???                        61  v1
+            increment_version          PARAMETER                   7  v0
+            destroy                    ???                        73  v1
+            increment_version          GRADIENT                   29  v0
+            increment_version          GRADIENT                   30  v0
+            increment_version          GRADIENT                   26  v0""")
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index 2c5684b64dbfc..800d838366026 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -41,6 +41,13 @@ class Category(enum.Enum):
     OPTIMIZER_STATE = enum.auto()
 
 
+class Action(enum.Enum):
+    PREEXISTING = enum.auto()
+    CREATE = enum.auto()
+    INCREMENT_VERSION = enum.auto()
+    DESTROY = enum.auto()
+
+
 @dataclasses.dataclass
 class _Storage:
     """Bundle storage pointer and id.
@@ -567,6 +574,46 @@ def __init__(self, result: _ProfilerResult) -> None:
         self._set_optimizer_state()
         self._set_autograd_detail()
 
+    @property
+    def timeline(self) -> Tuple[Tuple[int, Action, TensorAndID], ...]:
+        t0 = min(event.start_time_ns for event in self._op_tree.dfs())
+        allocation_times: Dict[Tuple[TensorKey, bool], int] = {}
+        for event in self._op_tree.dfs():
+            if event.typed[0] == _EventType.Allocation:
+                alloc_fields = event.typed[1]
+                key = TensorKey.from_allocation(alloc_fields)
+                if key is not None:
+                    is_allocation = alloc_fields.alloc_size > 0
+                    allocation_times[(key, is_allocation)] = event.start_time_ns - t0
+
+        snapshot = self._category_snapshot()
+        last_version = {key: version for key, version in sorted(snapshot.keys())}
+
+        events: List[Tuple[int, Action, TensorAndID]] = [
+            (-1, Action.PREEXISTING, (key, version))
+            for key, version in snapshot.keys()
+            if (key, True) not in allocation_times and version == 0
+        ]
+
+        for node in self._data_flow_graph.flow_nodes:
+            for key, edge in node._edges.items():
+                if edge.is_allocation:
+                    t = allocation_times[(key, True)]
+                    events.append((t, Action.CREATE, (key, 0)))
+
+                elif edge.mutated:
+                    t = node._event.start_time_ns - t0
+                    version = edge.input_version
+                    assert version is not None
+                    events.append((t, Action.INCREMENT_VERSION, (key, version)))
+
+                if edge.is_deletion:
+                    t = allocation_times[(key, False)]
+                    events.append((t, Action.DESTROY, (key, last_version[key])))
+
+        events.sort(key=lambda x: (x[0], x[1].value))
+        return tuple(events)
+
     def _is_gradient(self, *args, **kwargs) -> bool:
         return self._categories.get(*args, **kwargs) == Category.GRADIENT
 

From a36e315d3de68a610ad7d3ae265066635d4d598a Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Thu, 1 Dec 2022 08:35:20 -0800
Subject: [PATCH 1509/1922] [Profiler] Memory profiler part 13: Add sizes to
 timeline. (#89356)

If we see an allocation the size is unambiguous. Otherwise we have to use sizes and strides to bound the underlying storage.

Differential Revision: [D40868660](https://our.internmc.facebook.com/intern/diff/D40868660/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89356
Approved by: https://github.com/chaekit
---
 test/profiler/test_memory_profiler.py | 266 ++++++++++----------------
 torch/profiler/_memory_profiler.py    |  78 +++++++-
 2 files changed, 174 insertions(+), 170 deletions(-)

diff --git a/test/profiler/test_memory_profiler.py b/test/profiler/test_memory_profiler.py
index 0cb39600bc3a4..84442724205ab 100644
--- a/test/profiler/test_memory_profiler.py
+++ b/test/profiler/test_memory_profiler.py
@@ -861,6 +861,9 @@ def assert_category(t: torch.Tensor, category: _memory_profiler.Category):
             assert_category(p, _memory_profiler.Category.PARAMETER)
             assert_category(p.grad, _memory_profiler.Category.GRADIENT)
 
+        # Rely on internal asserts
+        _ = memory_profile.timeline
+
     def _run_and_format_categories(self, fn, indent=12):
         """Generate summary of assigned categories for expecttest."""
 
@@ -1417,30 +1420,30 @@ def step_fn(mark_region):
 
     def test_memory_timeline(self) -> None:
         model = torch.nn.Sequential(
-            torch.nn.Linear(2, 4, bias=True),
+            torch.nn.Linear(64, 512, bias=True),
             torch.nn.ReLU(),
-            torch.nn.Linear(4, 4, bias=False),
+            torch.nn.Linear(512, 512, bias=False),
             torch.nn.Softmax(dim=1),
         )
         optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
 
         with profile() as prof:
-            x = torch.ones((2, 2))
-            targets = torch.ones((2, 4))
+            x = torch.ones((1024, 64))
+            targets = torch.ones((1024, 512))
             y = model(x)
-            loss = torch.sum((y - targets) ** 2).mean()
+            loss = torch.nn.functional.mse_loss(y, targets)
             loss.backward()
             optimizer.step()
             optimizer.zero_grad()
 
         memory_profile = prof._memory_profile()
         timeline = memory_profile.timeline
-        times = tuple(t for t, _, _ in timeline)
+        times = tuple(t for t, _, _, _ in timeline)
         self.assertTrue(all(t1 >= t0 for t0, t1 in zip(times, times[1:])), times)
         self.assertTrue(
             all(
                 (t == -1) if action == _memory_profiler.Action.PREEXISTING else (t > 0)
-                for t, action, _ in timeline
+                for t, action, _, _ in timeline
             )
         )
 
@@ -1455,174 +1458,101 @@ def format_action(action, key, version):
                     return f"{category_name(category)} -> {category_name(new_category)}"
             return category_name(category)
 
+        def format_size(size: int):
+            if size < 1024:
+                return f"{size / 1024:3.1f} kB"
+            return f"{size // 1024} kB"
+
+
+        # We generate sequential IDs for Tensors; however platforms vary
+        # slightly in the exact computation executed. If this results in
+        # tensor creation the IDs will be shifted and the unit test will fail.
+        # (Even though the behavior we're testing is unchanged.) To correct for
+        # this we assign sequential numbers to the tensors which are actually
+        # tested, effectively suppressing the extraneous implementation details.
+        id_map = {}
+
+        def id_for_testing(key):
+            return id_map.setdefault(key.storage.allocation_id, len(id_map))
+
         lines = [
             f"{action.name.lower():<25}  {format_action(action, key, version):<25}  "
-            f"{key.storage.allocation_id:>2}  v{version}"
-            for _, action, (key, version) in prof._memory_profile().timeline
+            f"{id_for_testing(key):>3}(v{version}) {format_size(size):>15}"
+            for _, action, (key, version), size in prof._memory_profile().timeline
+
+            # We generally don't care about tiny allocations during memory
+            # profiling and they add a lot of noise to the unit test.
+            if size >= 256
         ]
 
         self.assertExpectedInline(
             textwrap.indent("\n".join(lines), " " * 12),
             """\
-            preexisting                PARAMETER                   3  v0
-            preexisting                PARAMETER                   4  v0
-            preexisting                PARAMETER                   7  v0
-            create                     INPUT                       1  v0
-            create                     INPUT                       2  v0
-            create                     ACTIVATION                  5  v0
-            create                     ACTIVATION                  6  v0
-            destroy                    ACTIVATION                  5  v0
-            create                     ACTIVATION                  8  v0
-            create                     ACTIVATION                  9  v0
-            destroy                    ACTIVATION                  8  v0
-            create                     ACTIVATION                 10  v0
-            create                     ACTIVATION                 11  v0
-            create                     ACTIVATION                 12  v0
-            destroy                    ACTIVATION                 11  v0
-            create                     ACTIVATION                 13  v0
-            create                     TEMPORARY                  14  v0
-            create                     TEMPORARY                  15  v0
-            destroy                    TEMPORARY                  15  v0
-            destroy                    TEMPORARY                  14  v0
-            create                     ACTIVATION                 16  v0
-            create                     TEMPORARY                  17  v0
-            create                     TEMPORARY                  18  v0
-            create                     AUTOGRAD_DETAIL            19  v0
-            destroy                    TEMPORARY                  18  v0
-            destroy                    TEMPORARY                  17  v0
-            destroy                    ACTIVATION                 12  v0
-            create                     TEMPORARY                  20  v0
-            create                     TEMPORARY                  21  v0
-            create                     TEMPORARY                  22  v0
-            create                     TEMPORARY                  23  v0
-            destroy                    TEMPORARY                  22  v0
-            destroy                    TEMPORARY                  21  v0
-            create                     AUTOGRAD_DETAIL            24  v0
-            destroy                    TEMPORARY                  23  v0
-            destroy                    TEMPORARY                  20  v0
-            destroy                    AUTOGRAD_DETAIL            19  v0
-            destroy                    ACTIVATION                 10  v0
-            increment_version          AUTOGRAD_DETAIL            24  v0
-            create                     AUTOGRAD_DETAIL            25  v0
-            destroy                    AUTOGRAD_DETAIL            24  v1
-            create                     GRADIENT                   26  v0
-            create                     AUTOGRAD_DETAIL            27  v0
-            destroy                    AUTOGRAD_DETAIL            25  v0
-            create                     AUTOGRAD_DETAIL            28  v0
-            destroy                    AUTOGRAD_DETAIL            27  v0
-            destroy                    ACTIVATION                  6  v0
-            create                     GRADIENT                   29  v0
-            create                     GRADIENT                   30  v0
-            destroy                    AUTOGRAD_DETAIL            28  v0
-            destroy                    ACTIVATION                 16  v0
-            create                     OPTIMIZER_STATE            31  v0
-            increment_version          OPTIMIZER_STATE            31  v0
-            create                     OPTIMIZER_STATE            32  v0
-            create                     OPTIMIZER_STATE            33  v0
-            create                     OPTIMIZER_STATE            34  v0
-            increment_version          OPTIMIZER_STATE            34  v0
-            create                     OPTIMIZER_STATE            35  v0
-            create                     OPTIMIZER_STATE            36  v0
-            create                     OPTIMIZER_STATE            37  v0
-            increment_version          OPTIMIZER_STATE            37  v0
-            create                     OPTIMIZER_STATE            38  v0
-            create                     OPTIMIZER_STATE            39  v0
-            create                     ???                        40  v0
-            increment_version          OPTIMIZER_STATE            31  v1
-            create                     TEMPORARY                  41  v0
-            destroy                    TEMPORARY                  41  v0
-            destroy                    ???                        40  v0
-            create                     INPUT                      42  v0
-            increment_version          OPTIMIZER_STATE            32  v0
-            create                     TEMPORARY                  43  v0
-            destroy                    TEMPORARY                  43  v0
-            destroy                    INPUT                      42  v0
-            increment_version          OPTIMIZER_STATE            32  v1
-            create                     INPUT                      44  v0
-            increment_version          OPTIMIZER_STATE            33  v0
-            create                     TEMPORARY                  45  v0
-            destroy                    TEMPORARY                  45  v0
-            destroy                    INPUT                      44  v0
-            increment_version          OPTIMIZER_STATE            33  v1
-            create                     ???                        46  v0
-            create                     INPUT                      47  v0
-            create                     TEMPORARY                  48  v0
-            create                     ???                        49  v0
-            destroy                    TEMPORARY                  48  v0
-            destroy                    INPUT                      47  v0
-            destroy                    ???                        46  v0
-            create                     INPUT                      50  v0
-            increment_version          ???                        49  v0
-            create                     TEMPORARY                  51  v0
-            destroy                    TEMPORARY                  51  v0
-            destroy                    INPUT                      50  v0
-            increment_version          PARAMETER                   3  v0
-            create                     ???                        52  v0
-            increment_version          OPTIMIZER_STATE            34  v1
-            create                     TEMPORARY                  53  v0
-            destroy                    TEMPORARY                  53  v0
-            destroy                    ???                        52  v0
-            create                     INPUT                      54  v0
-            increment_version          OPTIMIZER_STATE            35  v0
-            create                     TEMPORARY                  55  v0
-            destroy                    TEMPORARY                  55  v0
-            destroy                    INPUT                      54  v0
-            increment_version          OPTIMIZER_STATE            35  v1
-            create                     INPUT                      56  v0
-            increment_version          OPTIMIZER_STATE            36  v0
-            create                     TEMPORARY                  57  v0
-            destroy                    TEMPORARY                  57  v0
-            destroy                    INPUT                      56  v0
-            increment_version          OPTIMIZER_STATE            36  v1
-            create                     ???                        58  v0
-            create                     INPUT                      59  v0
-            create                     TEMPORARY                  60  v0
-            create                     ???                        61  v0
-            destroy                    TEMPORARY                  60  v0
-            destroy                    INPUT                      59  v0
-            destroy                    ???                        58  v0
-            create                     INPUT                      62  v0
-            increment_version          ???                        61  v0
-            create                     TEMPORARY                  63  v0
-            destroy                    TEMPORARY                  63  v0
-            destroy                    INPUT                      62  v0
-            destroy                    ???                        49  v1
-            increment_version          PARAMETER                   4  v0
-            create                     ???                        64  v0
-            increment_version          OPTIMIZER_STATE            37  v1
-            create                     TEMPORARY                  65  v0
-            destroy                    TEMPORARY                  65  v0
-            destroy                    ???                        64  v0
-            create                     INPUT                      66  v0
-            increment_version          OPTIMIZER_STATE            38  v0
-            create                     TEMPORARY                  67  v0
-            destroy                    TEMPORARY                  67  v0
-            destroy                    INPUT                      66  v0
-            increment_version          OPTIMIZER_STATE            38  v1
-            create                     INPUT                      68  v0
-            increment_version          OPTIMIZER_STATE            39  v0
-            create                     TEMPORARY                  69  v0
-            destroy                    TEMPORARY                  69  v0
-            destroy                    INPUT                      68  v0
-            increment_version          OPTIMIZER_STATE            39  v1
-            create                     ???                        70  v0
-            create                     INPUT                      71  v0
-            create                     TEMPORARY                  72  v0
-            create                     ???                        73  v0
-            destroy                    TEMPORARY                  72  v0
-            destroy                    INPUT                      71  v0
-            destroy                    ???                        70  v0
-            create                     INPUT                      74  v0
-            increment_version          ???                        73  v0
-            create                     TEMPORARY                  75  v0
-            destroy                    TEMPORARY                  75  v0
-            destroy                    INPUT                      74  v0
-            destroy                    ???                        61  v1
-            increment_version          PARAMETER                   7  v0
-            destroy                    ???                        73  v1
-            increment_version          GRADIENT                   29  v0
-            increment_version          GRADIENT                   30  v0
-            increment_version          GRADIENT                   26  v0""")
+            preexisting                PARAMETER                    0(v0)          128 kB
+            preexisting                PARAMETER                    1(v0)            2 kB
+            preexisting                PARAMETER                    2(v0)         1024 kB
+            create                     INPUT                        3(v0)          256 kB
+            create                     INPUT                        4(v0)         2048 kB
+            create                     ACTIVATION                   5(v0)         2048 kB
+            create                     ACTIVATION                   6(v0)         2048 kB
+            destroy                    ACTIVATION                   5(v0)         2048 kB
+            create                     ACTIVATION                   7(v0)         2048 kB
+            create                     ACTIVATION                   8(v0)         2048 kB
+            destroy                    ACTIVATION                   7(v0)         2048 kB
+            create                     ACTIVATION                   9(v0)         2048 kB
+            create                     TEMPORARY                   10(v0)         2048 kB
+            destroy                    TEMPORARY                   10(v0)         2048 kB
+            create                     AUTOGRAD_DETAIL             11(v0)         2048 kB
+            create                     AUTOGRAD_DETAIL             12(v0)         2048 kB
+            destroy                    AUTOGRAD_DETAIL             11(v0)         2048 kB
+            create                     GRADIENT                    13(v0)         1024 kB
+            create                     AUTOGRAD_DETAIL             14(v0)         2048 kB
+            destroy                    AUTOGRAD_DETAIL             12(v0)         2048 kB
+            create                     AUTOGRAD_DETAIL             15(v0)         2048 kB
+            destroy                    AUTOGRAD_DETAIL             14(v0)         2048 kB
+            destroy                    ACTIVATION                   6(v0)         2048 kB
+            create                     GRADIENT                    16(v0)          128 kB
+            create                     GRADIENT                    17(v0)            2 kB
+            destroy                    AUTOGRAD_DETAIL             15(v0)         2048 kB
+            create                     OPTIMIZER_STATE             18(v0)          128 kB
+            create                     OPTIMIZER_STATE             19(v0)          128 kB
+            create                     OPTIMIZER_STATE             20(v0)            2 kB
+            create                     OPTIMIZER_STATE             21(v0)            2 kB
+            create                     OPTIMIZER_STATE             22(v0)         1024 kB
+            create                     OPTIMIZER_STATE             23(v0)         1024 kB
+            increment_version          OPTIMIZER_STATE             18(v0)          128 kB
+            increment_version          OPTIMIZER_STATE             18(v1)          128 kB
+            increment_version          OPTIMIZER_STATE             19(v0)          128 kB
+            increment_version          OPTIMIZER_STATE             19(v1)          128 kB
+            create                     ???                         24(v0)          128 kB
+            create                     ???                         25(v0)          128 kB
+            destroy                    ???                         24(v0)          128 kB
+            increment_version          ???                         25(v0)          128 kB
+            increment_version          PARAMETER                    0(v0)          128 kB
+            increment_version          OPTIMIZER_STATE             20(v0)            2 kB
+            increment_version          OPTIMIZER_STATE             20(v1)            2 kB
+            increment_version          OPTIMIZER_STATE             21(v0)            2 kB
+            increment_version          OPTIMIZER_STATE             21(v1)            2 kB
+            create                     ???                         26(v0)            2 kB
+            create                     ???                         27(v0)            2 kB
+            destroy                    ???                         26(v0)            2 kB
+            increment_version          ???                         27(v0)            2 kB
+            destroy                    ???                         25(v1)          128 kB
+            increment_version          PARAMETER                    1(v0)            2 kB
+            increment_version          OPTIMIZER_STATE             22(v0)         1024 kB
+            increment_version          OPTIMIZER_STATE             22(v1)         1024 kB
+            increment_version          OPTIMIZER_STATE             23(v0)         1024 kB
+            increment_version          OPTIMIZER_STATE             23(v1)         1024 kB
+            create                     ???                         28(v0)         1024 kB
+            create                     ???                         29(v0)         1024 kB
+            destroy                    ???                         28(v0)         1024 kB
+            increment_version          ???                         29(v0)         1024 kB
+            destroy                    ???                         27(v1)            2 kB
+            increment_version          PARAMETER                    2(v0)         1024 kB
+            destroy                    ???                         29(v1)         1024 kB
+            increment_version          GRADIENT                    16(v0)          128 kB
+            increment_version          GRADIENT                    17(v0)            2 kB
+            increment_version          GRADIENT                    13(v0)         1024 kB""")
 
 
 if __name__ == "__main__":
diff --git a/torch/profiler/_memory_profiler.py b/torch/profiler/_memory_profiler.py
index 800d838366026..e9f70a0924f08 100644
--- a/torch/profiler/_memory_profiler.py
+++ b/torch/profiler/_memory_profiler.py
@@ -2,6 +2,7 @@
 import dataclasses
 import enum
 import itertools as it
+import logging
 from typing import (
     Any,
     cast,
@@ -26,6 +27,7 @@
     _TensorMetadata,
     RecordScope,
 )
+from torch._utils import _element_size
 from torch.profiler import _utils
 
 TensorAndID = Tuple["TensorKey", int]
@@ -305,6 +307,74 @@ def sorted_nodes(self) -> Tuple[_ProfilerEvent, ...]:
         return self._sorted_nodes
 
 
+class SizeMap:
+    def __init__(self, op_tree: OpTree) -> None:
+        self._values: Dict[TensorKey, int] = {}
+
+        for node in op_tree.sorted_nodes:
+            if node.typed[0] == _EventType.TorchOp:
+                for t in self._flat_tensor_inputs(node.typed[1]):
+                    self._update_values(t)
+
+            elif node.typed[0] == _EventType.PyCall:
+                typed_fields = node.typed[1]
+                assert typed_fields.module is None or typed_fields.optimizer is None
+                if typed_fields.module is not None:
+                    for _, p, p_grad in typed_fields.module.parameters:
+                        self._update_values(p)
+                        self._update_values(p_grad)
+
+                if typed_fields.optimizer is not None:
+                    for p, p_grad, state in typed_fields.optimizer.parameters:
+                        self._update_values(p)
+                        self._update_values(p_grad)
+                        for _, t in state:
+                            self._update_values(t)
+
+        allocations: Dict[TensorKey, int] = {}
+        for node in op_tree.sorted_nodes:
+            if node.typed[0] == _EventType.Allocation:
+                alloc_fields = node.typed[1]
+                key = TensorKey.from_allocation(alloc_fields)
+                if key:
+                    new_size = abs(alloc_fields.alloc_size)
+                    prior_size = allocations.setdefault(key, new_size)
+
+                    # It is possible to resize Storage in PyTorch, however we
+                    # key on data pointer so most resizes will be treated as a
+                    # change in storage. The one corner case that cannot be
+                    # handled is `realloc` which successfully resizes the
+                    # storage. At time of writing this is not done anywhere in
+                    # the core PyTorch codebase.
+                    if prior_size != new_size:
+                        delta = f"{prior_size} vs. {new_size}"
+                        logging.warn(f"Mismatch between allocation and free: {delta}")
+
+        self._values.update(allocations)
+
+    def _update_values(self, t: Optional[_TensorMetadata]) -> None:
+        key = TensorKey.from_tensor(t)
+        if key is not None and t is not None and t.layout == torch.strided:
+            # Scalars are represented as zero dim Tensors
+            n = max(i[0] * i[1] for i in zip(t.sizes or [1], t.strides or [1]))
+
+            num_bytes = n * _element_size(t.dtype)
+            assert num_bytes >= 0, f"{num_bytes}"
+            self._values[key] = max(self._values.get(key, 0), num_bytes)
+
+    @staticmethod
+    def _flat_tensor_inputs(op: _ExtraFields_TorchOp) -> Iterator[_TensorMetadata]:
+        for i in op.inputs:
+            if isinstance(i, _TensorMetadata):
+                yield i
+            elif isinstance(i, list):
+                for t in i:
+                    yield t
+
+    def __getitem__(self, key: TensorKey):
+        return self._values[key]
+
+
 @dataclasses.dataclass()
 class DataFlowEdge:
     input_version: Optional[int] = None
@@ -564,6 +634,7 @@ class MemoryProfile:
     def __init__(self, result: _ProfilerResult) -> None:
         self._op_tree = OpTree(result)
         self._data_flow_graph = DataFlowGraph(self._op_tree)
+        self._size_map = SizeMap(self._op_tree)
         self._categories = CategoryDict()
 
         self._set_gradients_and_temporaries()
@@ -575,7 +646,7 @@ def __init__(self, result: _ProfilerResult) -> None:
         self._set_autograd_detail()
 
     @property
-    def timeline(self) -> Tuple[Tuple[int, Action, TensorAndID], ...]:
+    def timeline(self) -> Tuple[Tuple[int, Action, TensorAndID, int], ...]:
         t0 = min(event.start_time_ns for event in self._op_tree.dfs())
         allocation_times: Dict[Tuple[TensorKey, bool], int] = {}
         for event in self._op_tree.dfs():
@@ -612,7 +683,10 @@ def timeline(self) -> Tuple[Tuple[int, Action, TensorAndID], ...]:
                     events.append((t, Action.DESTROY, (key, last_version[key])))
 
         events.sort(key=lambda x: (x[0], x[1].value))
-        return tuple(events)
+        return tuple(
+            (time, action, (key, version), self._size_map[key])
+            for time, action, (key, version) in events
+        )
 
     def _is_gradient(self, *args, **kwargs) -> bool:
         return self._categories.get(*args, **kwargs) == Category.GRADIENT

From ba57e7a93c8e77e6d2b9257cb581e5e26e62314e Mon Sep 17 00:00:00 2001
From: Animesh Jain <anijain@umich.edu>
Date: Fri, 2 Dec 2022 04:05:57 +0000
Subject: [PATCH 1510/1922] [dynamo][benchmarks] Call zero grad (#90026)

Hoping that it might reduce some flakiness

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90026
Approved by: https://github.com/williamwen42
---
 benchmarks/dynamo/common.py      | 4 +++-
 benchmarks/dynamo/huggingface.py | 2 +-
 benchmarks/dynamo/timm_models.py | 2 +-
 benchmarks/dynamo/torchbench.py  | 2 +-
 torch/_dynamo/utils.py           | 9 +++++----
 5 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 87aeb4d7c3561..e34010f686d65 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1032,9 +1032,11 @@ def run_n_iterations(self, mod, inputs, n=2):
             self.model_iter_fn(mod, inputs, collect_outputs=False)
         return self.model_iter_fn(mod, inputs, collect_outputs=True)
 
-    def optimizer_zero_grad(self):
+    def optimizer_zero_grad(self, mod):
         if self.optimizer is not None:
             self.optimizer.zero_grad(True)
+        else:
+            mod.zero_grad(True)
 
     def optimizer_step(self):
         if self.optimizer is not None:
diff --git a/benchmarks/dynamo/huggingface.py b/benchmarks/dynamo/huggingface.py
index bf127deaa43ab..84caea0d910ec 100755
--- a/benchmarks/dynamo/huggingface.py
+++ b/benchmarks/dynamo/huggingface.py
@@ -480,7 +480,7 @@ def forward_pass(self, mod, inputs, collect_outputs=True):
 
     def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
         cloned_inputs = clone_inputs(inputs)
-        self.optimizer_zero_grad()
+        self.optimizer_zero_grad(mod)
         with self.autocast():
             pred = mod(**cloned_inputs)
             loss = self.compute_loss(pred)
diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py
index 133be5f85dbad..de9dc746e666a 100755
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@@ -312,7 +312,7 @@ def forward_pass(self, mod, inputs, collect_outputs=True):
 
     def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
         cloned_inputs = clone_inputs(inputs)
-        self.optimizer_zero_grad()
+        self.optimizer_zero_grad(mod)
         with self.autocast():
             pred = mod(*cloned_inputs)
             if isinstance(pred, tuple):
diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py
index 1ace31840e7a9..d138e3e692462 100755
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@@ -356,7 +356,7 @@ def forward_pass(self, mod, inputs, collect_outputs=True):
 
     def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
         cloned_inputs = clone_inputs(inputs)
-        self.optimizer_zero_grad()
+        self.optimizer_zero_grad(mod)
         with self.autocast():
             pred = mod(*cloned_inputs)
             loss = self.compute_loss(pred)
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 62d967402af98..a6a139ef5760b 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -842,10 +842,11 @@ def same(
                 # early exit that handles zero/nan better
                 # cosine_similarity(zeros(10), zeros(10), dim=0) is 0
                 return True
-            res = torch.nn.functional.cosine_similarity(ref, res, dim=0, eps=1e-6)
-            if res < 0.99:
-                log.warning(f"Similarity score={res.cpu().detach().item()}")
-            return res >= 0.99
+            score = torch.nn.functional.cosine_similarity(ref, res, dim=0, eps=1e-6)
+            if score < 0.99:
+                breakpoint()
+                log.warning(f"Similarity score={score.cpu().detach().item()}")
+            return score >= 0.99
         else:
             if not exact_dtype:
                 ref = ref.to(res.dtype)

From 305da37916c3ae322d7a98eb5824a6c0de208b4e Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Tue, 29 Nov 2022 23:29:57 -0500
Subject: [PATCH 1511/1922] TorchDynamo: enable convolution bn folding for
 functional bn (#89746)

Motivation: for Timm model, there is always use customer-defined BN which using F.batch_norm: https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/layers/norm_act.py#L26, and the fx graph will be like:
```
-------------  ----------------------  ---------------------------------------  ---------------------------------------------------------------------------------------------------------  --------
placeholder    x                       x                                        ()                                                                                                         {}
call_module    self_conv               self_conv                                (x,)                                                                                                       {}
get_attr       self_bn_running_mean_1  self_bn_running_mean                     ()                                                                                                         {}
get_attr       self_bn_running_var     self_bn_running_var                      ()                                                                                                         {}
get_attr       self_bn_weight          self_bn_weight                           ()                                                                                                         {}
get_attr       self_bn_bias            self_bn_bias                             ()                                                                                                         {}
call_function  batch_norm              <function batch_norm at 0x7f07196cdf70>  (self_conv, self_bn_running_mean_1, self_bn_running_var, self_bn_weight, self_bn_bias, False, 0.1, 1e-05)  {}
call_module    self_bn_drop            self_bn_drop                             (batch_norm,)
```

the original conv+bn folding path doesn't work for **F.batch_norm**, but for **F.batch_norm** case, if its' parameters are const(attr of the module and will not be updated), we can also do the const folding's optimization. This PR will enable it and will improve the Timm models' performance.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89746
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 test/inductor/test_torchinductor.py |  84 +++++++++++++++++++++-
 torch/_inductor/overrides.py        | 108 ++++++++++++++++++++++++++--
 2 files changed, 187 insertions(+), 5 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 41fce3998dd0d..2d532c8384cd6 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -1424,7 +1424,7 @@ def fn(a, b):
         )
 
     # For gpu path, there has a accurcy issue,
-    @unittest.skipIf(HAS_CUDA, "only support cpu conv  bn test")
+    @unittest.skipIf(HAS_CUDA, "only support cpu conv bn test")
     def test_conv_bn_fuse(self):
         input_shapes = {1: (112,), 2: (112, 112), 3: (55, 55, 55)}
         conv_modules = {1: torch.nn.Conv1d, 2: torch.nn.Conv2d, 3: torch.nn.Conv3d}
@@ -1479,6 +1479,88 @@ def test_conv_bn_fuse(self):
                         (v,),
                     )
 
+    # For gpu path, there has a accurcy issue,
+    @unittest.skipIf(HAS_CUDA, "only support cpu conv bn test")
+    def test_conv_functional_bn_fuse(self):
+        # Define a BatchNorm using functional BN.
+        class BatchNorm(torch.nn.BatchNorm2d):
+            def __init__(
+                self,
+                num_features,
+                eps=1e-5,
+                momentum=0.1,
+                affine=True,
+                track_running_stats=True,
+                device=None,
+                dtype=None,
+            ):
+                factory_kwargs = {"device": device, "dtype": dtype}
+                super(BatchNorm, self).__init__(
+                    num_features,
+                    eps=eps,
+                    momentum=momentum,
+                    affine=affine,
+                    track_running_stats=track_running_stats,
+                    **factory_kwargs,
+                )
+
+            def forward(self, x):
+                if self.momentum is None:
+                    exponential_average_factor = 0.0
+                else:
+                    exponential_average_factor = self.momentum
+
+                if self.training and self.track_running_stats:
+                    # TODO: if statement only here to tell the jit to skip emitting this when it is None
+                    if self.num_batches_tracked is not None:  # type: ignore[has-type]
+                        self.num_batches_tracked = self.num_batches_tracked + 1  # type: ignore[has-type]
+                        if self.momentum is None:  # use cumulative moving average
+                            exponential_average_factor = 1.0 / float(
+                                self.num_batches_tracked
+                            )
+                        else:  # use exponential moving average
+                            exponential_average_factor = self.momentum
+                if self.training:
+                    bn_training = True
+                else:
+                    bn_training = (self.running_mean is None) and (
+                        self.running_var is None
+                    )
+                x = F.batch_norm(
+                    x,
+                    # If buffers are not to be tracked, ensure that they won't be updated
+                    self.running_mean
+                    if not self.training or self.track_running_stats
+                    else None,
+                    self.running_var
+                    if not self.training or self.track_running_stats
+                    else None,
+                    self.weight,
+                    self.bias,
+                    bn_training,
+                    exponential_average_factor,
+                    self.eps,
+                )
+                return x
+
+        v = torch.randn(1, 3, 556, 56, dtype=torch.float32)
+        mod = torch.nn.Sequential(
+            torch.nn.Conv2d(
+                3,
+                64,
+                kernel_size=3,
+                dilation=1,
+                groups=1,
+                bias=True,
+            ),
+            BatchNorm(64),
+        ).eval()
+        with torch.no_grad():
+            self.common(
+                mod,
+                (v,),
+            )
+
     @unittest.skipIf(HAS_CUDA, "only support cpu conv2d unary test")
     def test_conv2d_packed(self):
         x_shape = (1, 3, 56, 56)
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index e1dfdea9c40ab..8d95971864f0f 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -19,7 +19,7 @@
 from torch.fx.passes.shape_prop import ShapeProp
 from torch.nn import functional as F
 from torch.nn.modules.utils import _pair
-from torch.nn.utils.fusion import fuse_conv_bn_eval
+from torch.nn.utils.fusion import fuse_conv_bn_eval, fuse_conv_bn_weights
 from torch.overrides import TorchFunctionMode
 
 from . import config
@@ -545,6 +545,7 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
         return gm
     if not is_cpu:
         return gm
+    gm = remove_identity(gm)
     gm = fuse_conv_bn(gm)
     # For binary fusion, we need to check inputs info to make sure
     # the binary inputs have same tensor info(device, dtype, and layout).
@@ -559,18 +560,78 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     return gm
 
 
+# check the pattern: (nn.module, F.function) matched.
+def matches_module_function_pattern(pattern, node, modules):
+    if len(node.args) == 0:
+        return False
+    if not isinstance(node.args[0], torch.fx.Node) or not isinstance(
+        node, torch.fx.Node
+    ):
+        return False
+    # the first node is call_module
+    if node.args[0].op != "call_module":
+        return False
+    if not isinstance(node.args[0].target, str):
+        return False
+    if node.args[0].target not in modules:
+        return False
+    if type(modules[node.args[0].target]) is not pattern[0]:
+        return False
+    # the second node is call_function
+    if node.op != "call_function":
+        return False
+    if node.target != pattern[1]:
+        return False
+    # make sure node.args[0] output is only used by current node.
+    if len(node.args[0].users) > 1:
+        return False
+    return True
+
+
+def fetch_attr(target: str, mod):
+    target_atoms = target.split(".")
+    attr_itr = mod
+    for i, atom in enumerate(target_atoms):
+        if not hasattr(attr_itr, atom):
+            raise RuntimeError(
+                f"Node referenced nonexistant target {'.'.join(target_atoms[:i])}"
+            )
+        attr_itr = getattr(attr_itr, atom)
+    return attr_itr
+
+
+def remove_identity(gm: torch.fx.GraphModule):
+    """
+    Removes all identity layers from the module.
+    """
+
+    class IdentityRemover(torch.fx.Transformer):
+        def call_module(self, target, args, kwargs):
+            if isinstance(self.submodules[target], nn.Identity):
+                assert len(args) == 1
+                return args[0]
+            else:
+                return super().call_module(target, args, kwargs)
+
+    return IdentityRemover(gm).transform()
+
+
 def fuse_conv_bn(gm: torch.fx.GraphModule, inplace=False):
     """
     Fuses Convolution/BN layers for inference purposes.
     """
-    patterns = [
+    modules_patterns = [
         (torch.nn.Conv1d, torch.nn.BatchNorm1d),
         (torch.nn.Conv2d, torch.nn.BatchNorm2d),
         (torch.nn.Conv3d, torch.nn.BatchNorm3d),
     ]
+    module_function_patterns = [
+        (torch.nn.Conv1d, F.batch_norm),
+        (torch.nn.Conv2d, F.batch_norm),
+        (torch.nn.Conv3d, F.batch_norm),
+    ]
     modules = dict(gm.named_modules())
-
-    for pattern in patterns:
+    for pattern in modules_patterns:
         for node in gm.graph.nodes:
             if matches_module_pattern(pattern, node, modules):
                 if len(node.args[0].users) > 1:  # Output of conv is used by other nodes
@@ -587,7 +648,46 @@ def fuse_conv_bn(gm: torch.fx.GraphModule, inplace=False):
                 node.replace_all_uses_with(node.args[0])
                 gm.graph.erase_node(node)
                 gm.graph.lint()
+    for pattern in module_function_patterns:
+        for node in gm.graph.nodes:
+            if matches_module_function_pattern(pattern, node, modules):
+                # TODO: support kwargs.
+                if len(node.args) != 8:
+                    continue
+                conv = modules[node.args[0].target]
+                bn_training = node.args[5]
+                bn_eps = node.args[7]
+                if conv.training or bn_training:
+                    continue
+                if type(bn_eps) is not float:
+                    continue
+                bn_args_is_constant = all(
+                    n.op == "get_attr" and len(n.users) == 1 for n in node.args[1:5]
+                )
+                if not bn_args_is_constant:
+                    continue
+                bn_running_mean = fetch_attr(node.args[1].target, gm)
+                bn_running_var = fetch_attr(node.args[2].target, gm)
+                bn_weight = fetch_attr(node.args[3].target, gm)
+                bn_bias = fetch_attr(node.args[4].target, gm)
+                if bn_running_mean is None or bn_running_var is None:
+                    continue
+                fused_conv = copy.deepcopy(conv)
+                fused_conv.weight, fused_conv.bias = fuse_conv_bn_weights(
+                    fused_conv.weight,
+                    fused_conv.bias,
+                    bn_running_mean,
+                    bn_running_var,
+                    bn_eps,
+                    bn_weight,
+                    bn_bias,
+                )
+                replace_node_module(node.args[0], modules, fused_conv)
+                node.replace_all_uses_with(node.args[0])
+                gm.graph.erase_node(node)
+                gm.graph.lint()
     gm.recompile()
+
     return gm
 
 
From 7179e89e34cc58a79750977a89671bd69fc14f6b Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Fri, 2 Dec 2022 04:13:56 +0000
Subject: [PATCH 1512/1922] [Inductor] Do not install g++12 by default (#90038)

Unless `TORCH_INDUCTOR_INSTALL_GXX` environment variable is define
(which is the case for CI)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90038
Approved by: https://github.com/albanD
---
 .jenkins/pytorch/test.sh     | 1 +
 torch/_inductor/codecache.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 29f9657beaf33..414c85abff28c 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -16,6 +16,7 @@ BUILD_RENAMED_DIR="build_renamed"
 BUILD_BIN_DIR="$BUILD_DIR"/bin
 
 export VALGRIND=ON
+export TORCH_INDUCTOR_INSTALL_GXX=ON
 if [[ "$BUILD_ENVIRONMENT" == *clang9* ]]; then
   # clang9 appears to miscompile code involving c10::optional<c10::SymInt>,
   # such that valgrind complains along these lines:
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index dca8af6cf1b9e..f4297732b38cd 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -110,6 +110,9 @@ def cpp_compiler_search(search):
                 # according to https://anaconda.org/conda-forge/gxx/
                 if sys.platform != "linux":
                     continue
+                # Do not install GXX by default
+                if not os.getenv("TORCH_INDUCTOR_INSTALL_GXX"):
+                    continue
                 from filelock import FileLock
 
                 lock_dir = get_lock_dir()

From f706702179ee56bee79d615fff5bf853643fed6e Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Tue, 29 Nov 2022 23:33:02 -0500
Subject: [PATCH 1513/1922] TorchDynamo: don't compute index for max_pooling
 when return_index is false (#89838)

For max_pooling, if return_index  is **False**, we don't need compute the index.

Before:

```
extern "C" void kernel(const float* __restrict__ in_ptr0,
                       float* __restrict__ out_ptr0)
{
    #pragma GCC ivdep
    for(long i0=0; i0<128; i0+=1)
    {
        #pragma GCC ivdep
        for(long i1=0; i1<3; i1+=1)
        {
            #pragma GCC ivdep
            for(long i2=0; i2<3; i2+=1)
            {
                #pragma GCC ivdep
                for(long i3=0; i3<3; i3+=1)
                {
                    {
                        {
                            auto tmp0 = in_ptr0[i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp2 = in_ptr0[3 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp7 = in_ptr0[6 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp12 = in_ptr0[21 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp17 = in_ptr0[24 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp22 = in_ptr0[27 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp27 = in_ptr0[42 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp32 = in_ptr0[45 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp37 = in_ptr0[48 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp1 = static_cast<long>((2*i2) + (14*i1));
                            auto tmp3 = static_cast<long>(1 + (2*i2) + (14*i1));
                            auto tmp4 = tmp2 > tmp0;
                            auto tmp5 = tmp4 ? tmp3 : tmp1;
                            auto tmp6 = (tmp0 != tmp0) ? tmp0 : std::max(tmp2, tmp0);
                            auto tmp8 = static_cast<long>(2 + (2*i2) + (14*i1));
                            auto tmp9 = tmp7 > tmp6;
                            auto tmp10 = tmp9 ? tmp8 : tmp5;
                            auto tmp11 = (tmp6 != tmp6) ? tmp6 : std::max(tmp7, tmp6);
                            auto tmp13 = static_cast<long>(7 + (2*i2) + (14*i1));
                            auto tmp14 = tmp12 > tmp11;
                            auto tmp15 = tmp14 ? tmp13 : tmp10;
                            auto tmp16 = (tmp11 != tmp11) ? tmp11 : std::max(tmp12, tmp11);
                            auto tmp18 = static_cast<long>(8 + (2*i2) + (14*i1));
                            auto tmp19 = tmp17 > tmp16;
                            auto tmp20 = tmp19 ? tmp18 : tmp15;
                            auto tmp21 = (tmp16 != tmp16) ? tmp16 : std::max(tmp17, tmp16);
                            auto tmp23 = static_cast<long>(9 + (2*i2) + (14*i1));
                            auto tmp24 = tmp22 > tmp21;
                            auto tmp25 = tmp24 ? tmp23 : tmp20;
                            auto tmp26 = (tmp21 != tmp21) ? tmp21 : std::max(tmp22, tmp21);
                            auto tmp28 = static_cast<long>(14 + (2*i2) + (14*i1));
                            auto tmp29 = tmp27 > tmp26;
                            auto tmp30 = tmp29 ? tmp28 : tmp25;
                            auto tmp31 = (tmp26 != tmp26) ? tmp26 : std::max(tmp27, tmp26);
                            auto tmp33 = static_cast<long>(15 + (2*i2) + (14*i1));
                            auto tmp34 = tmp32 > tmp31;
                            auto tmp35 = tmp34 ? tmp33 : tmp30;
                            auto tmp36 = (tmp31 != tmp31) ? tmp31 : std::max(tmp32, tmp31);
                            auto tmp38 = static_cast<long>(16 + (2*i2) + (14*i1));
                            auto tmp39 = tmp37 > tmp36;
                            auto tmp40 = tmp39 ? tmp38 : tmp35;
                            auto tmp41 = (tmp36 != tmp36) ? tmp36 : std::max(tmp37, tmp36);
                            out_ptr0[i3 + (3*i2) + (9*i1) + (27*i0)] = tmp41;
                        }
                    }
                }
            }
        }
    }
}
''')
```
After:

```
extern "C" void kernel(const float* __restrict__ in_ptr0,
                       float* __restrict__ out_ptr0)
{
    #pragma GCC ivdep
    for(long i0=0; i0<128; i0+=1)
    {
        #pragma GCC ivdep
        for(long i1=0; i1<3; i1+=1)
        {
            #pragma GCC ivdep
            for(long i2=0; i2<3; i2+=1)
            {
                #pragma GCC ivdep
                for(long i3=0; i3<3; i3+=1)
                {
                    {
                        {
                            auto tmp0 = in_ptr0[i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp1 = in_ptr0[3 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp3 = in_ptr0[6 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp5 = in_ptr0[21 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp7 = in_ptr0[24 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp9 = in_ptr0[27 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp11 = in_ptr0[42 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp13 = in_ptr0[45 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp15 = in_ptr0[48 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp2 = (tmp0 != tmp0) ? tmp0 : std::max(tmp1, tmp0);
                            auto tmp4 = (tmp2 != tmp2) ? tmp2 : std::max(tmp3, tmp2);
                            auto tmp6 = (tmp4 != tmp4) ? tmp4 : std::max(tmp5, tmp4);
                            auto tmp8 = (tmp6 != tmp6) ? tmp6 : std::max(tmp7, tmp6);
                            auto tmp10 = (tmp8 != tmp8) ? tmp8 : std::max(tmp9, tmp8);
                            auto tmp12 = (tmp10 != tmp10) ? tmp10 : std::max(tmp11, tmp10);
                            auto tmp14 = (tmp12 != tmp12) ? tmp12 : std::max(tmp13, tmp12);
                            auto tmp16 = (tmp14 != tmp14) ? tmp14 : std::max(tmp15, tmp14);
                            out_ptr0[i3 + (3*i2) + (9*i1) + (27*i0)] = tmp16;
                        }
                    }
                }
            }
        }
    }
}
''')

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89838
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 torch/_inductor/lowering.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 00bfa4b72ee7c..58d118cc96e97 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -2605,12 +2605,15 @@ def fn(idx, return_index):
             ih = bh * stride[0] + ih - padding[0]
             iw = bw * stride[1] + iw - padding[1]
             val = x_loader([*prefix, ih, iw])
-            index = ops.index_expr(ih * w + iw, torch.int64)
+            if return_index:
+                index = ops.index_expr(ih * w + iw, torch.int64)
+                if maxindex is None:
+                    maxindex = index
+                else:
+                    maxindex = ops.where(ops.gt(val, maxval), index, maxindex)
             if maxval is None:
-                maxindex = index
                 maxval = val
             else:
-                maxindex = ops.where(ops.gt(val, maxval), index, maxindex)
                 maxval = ops.maximum(val, maxval)
         if return_index:
             return maxindex

From dda054196827b28bc1ad8b2fabf8690e01419361 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Thu, 1 Dec 2022 20:33:59 +0000
Subject: [PATCH 1514/1922] [Inductor] add expm1 lowering (#89961)

Improves perf of inductor no-cudagraphs on nvidia-deeprecommender from 0.88 -> .96. I am looking into disabling implicit fallbacks for benchmark models in another pr.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89961
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 28 ++++++++++++++++++++++++++++
 torch/_inductor/codegen/cpp.py      |  8 ++++++++
 torch/_inductor/codegen/triton.py   |  8 ++++++++
 torch/_inductor/decomposition.py    |  5 -----
 torch/_inductor/lowering.py         | 10 ++++++++++
 5 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 2d532c8384cd6..97cc6af476e46 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -2776,6 +2776,34 @@ def fn(x):
             (torch.randn([64]),),
         )
 
+    def test_expm1(self):
+        def fn(x):
+            return torch.expm1(x), torch.expm1(x) * 2
+
+        for dtype in (torch.float16, torch.float, torch.double, torch.int, torch.int64):
+            self.common(
+                fn,
+                (torch.randn([64]).to(dtype=dtype),),
+            )
+            self.common(
+                fn,
+                (torch.arange(-1e-5, 1e-5, 1e-7).to(dtype=dtype),),
+            )
+
+    def test_log1p(self):
+        def fn(x):
+            return torch.log1p(x), torch.log1p(x) * 2
+
+        for dtype in (torch.float16, torch.float, torch.double, torch.int, torch.int64):
+            self.common(
+                fn,
+                (torch.randn([64]).to(dtype=dtype),),
+            )
+            self.common(
+                fn,
+                (torch.arange(-1e-5, 1e-5, 1e-7).to(dtype=dtype),),
+            )
+
     def test_flip(self):
         def fn(x):
             return torch.flip(x, (-1,)), torch.flip(x, (0, 2)) - 2
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index feb4ea1b9ea90..88f4536ef3605 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -400,6 +400,14 @@ def sqrt(x):
     def rsqrt(x):
         return f"1 / std::sqrt({x})"
 
+    @staticmethod
+    def log1p(x):
+        return f"std::log1p({x})"
+
+    @staticmethod
+    def expm1(x):
+        return f"std::expm1({x})"
+
     @staticmethod
     def signbit(x):
         return f"std::signbit({x})"
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 416310ca7e89e..6c7b852ef8288 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -230,6 +230,14 @@ def randn(seed, offset, _):  # _ here to keep the contract identical to CPU rand
     def rsqrt(x):
         return f"tl.libdevice.rsqrt({x})"
 
+    @staticmethod
+    def log1p(x):
+        return f"tl.libdevice.log1p({x})"
+
+    @staticmethod
+    def expm1(x):
+        return f"tl.libdevice.expm1({x})"
+
     @staticmethod
     def sigmoid(x):
         return f"tl.sigmoid({x})"
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 1bfd4b487361f..c9897ec678e83 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -449,11 +449,6 @@ def masked_fill_(x, mask, value):
     return x.copy_(aten.masked_fill(x, mask, value))
 
 
-@register_decomposition([aten.log1p])
-def log1p(x):
-    return torch.log(x + 1)
-
-
 @register_decomposition([aten.baddbmm])
 def baddbmm(self, batch1, batch2, beta=1, alpha=1):
     result = torch.bmm(batch1, batch2)
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 58d118cc96e97..7597b33062322 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -3582,6 +3582,16 @@ def sum_(x, axis=None, keepdims=False, *, dtype=None):
     aten.special_erf, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT
 )(erf)
 
+register_pointwise(
+    aten.log1p,
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+)
+
+register_pointwise(
+    aten.expm1,
+    type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,
+)
+
 register_pointwise(
     aten.log,
     type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT,

From 99eeacf869e34cf825471fe0e2e6be4434da6056 Mon Sep 17 00:00:00 2001
From: Sean Ross-Ross <srossross@gmail.com>
Date: Fri, 2 Dec 2022 04:38:25 +0000
Subject: [PATCH 1515/1922] Adding dispatch alias
 'FuncTorchBatchedDecomposition' (#88771)

part of https://github.com/pytorch/functorch/issues/1009

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88771
Approved by: https://github.com/zou3519
---
 aten/src/ATen/core/dispatch/OperatorEntry.cpp        | 10 ++++++++++
 aten/src/ATen/functorch/BatchRulesDecompositions.cpp |  2 +-
 c10/core/DispatchKey.cpp                             |  4 ++++
 c10/core/DispatchKey.h                               |  9 +++++++++
 c10/core/DispatchKeySet.cpp                          |  2 ++
 c10/core/DispatchKeySet.h                            |  3 +++
 6 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
index 5bd5d8abf54dc..cbc7ff8bf309b 100644
--- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
@@ -279,6 +279,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
   //          cause confusion for AutogradOther. It's pretty straightforward to use Autograd (if available)
   //          in this case.
   //    (2.4) Use kernel from DispatchKey::Autograd if available
+  //    (2.5) Use kernel from DispatchKey::FuncTorchBatchedDecomposition if available
   //    The implementation of (2.2) relies on the invariant that for a given backend,
   //    `computeDispatchTableEntryWithDebug()` will be called for that backend's autograd key after the
   //    backend key. See Note [Refresh Runtime Autograd entries in dispatchTable_]
@@ -331,6 +332,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
   // We have no intention to change the behavior of Undefined,
   // so this nested-tensor branch requires `dispatch_key != DispatchKey::Undefined`
   // to let the original CompositeImplicitAutograd handle Undefined
+  // See Note: [Disjoint AliasKeyset] The order for this alias key doesn't matter
   if (dispatch_key != DispatchKey::Undefined && isIncludedInAlias(dispatch_key, DispatchKey::CompositeImplicitAutogradNestedTensor)) {
     if (auto nested_registration = getKernelForDispatchKey(DispatchKey::CompositeImplicitAutogradNestedTensor)) {
       return {*nested_registration, "nested kernel"};
@@ -355,6 +357,14 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
     }
   }
 
+  // 2.5. For batched backend keys, use kernel from DispatchKey::FuncTorchBatchedDecomposition if available
+  // See Note: [Disjoint AliasKeyset] The order for this alias key doesn't matter
+  if (isIncludedInAlias(dispatch_key, DispatchKey::FuncTorchBatchedDecomposition)) {
+    if (auto batched_registration = getKernelForDispatchKey(DispatchKey::FuncTorchBatchedDecomposition)) {
+      return {*batched_registration, "batched kernel"};
+    }
+  }
+
   // 3. Backend fallback
   auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key);
   if (dispatch_ix < 0) {
diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index 13dedcfb879ac..3696287179223 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -25,7 +25,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) {
   OP_DECOMPOSE(feature_dropout_);
 }
 
-TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
+TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE2(__and__, Scalar);
   OP_DECOMPOSE2(__and__, Tensor);
   OP_DECOMPOSE2(__iand__, Tensor);
diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp
index e07d2ce6b051d..0bbea6a4f078a 100644
--- a/c10/core/DispatchKey.cpp
+++ b/c10/core/DispatchKey.cpp
@@ -189,6 +189,8 @@ const char* toString(DispatchKey t) {
       return "CompositeExplicitAutograd";
     case DispatchKey::CompositeExplicitAutogradNonFunctional:
       return "CompositeExplicitAutogradNonFunctional";
+    case DispatchKey::FuncTorchBatchedDecomposition:
+      return "FuncTorchBatchedDecomposition";
 
       // Per-backend dispatch keys
 
@@ -341,6 +343,8 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
        c10::DispatchKey::CompositeExplicitAutograd},
       {"CompositeExplicitAutogradNonFunctional",
        c10::DispatchKey::CompositeExplicitAutogradNonFunctional},
+      {"FuncTorchBatchedDecomposition",
+       c10::DispatchKey::FuncTorchBatchedDecomposition},
   };
   auto it = key_map.find(k);
   TORCH_CHECK(it != key_map.end(), "could not parse dispatch key: ", k);
diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h
index 33f762e9da7d2..b28f770290e31 100644
--- a/c10/core/DispatchKey.h
+++ b/c10/core/DispatchKey.h
@@ -444,6 +444,15 @@ enum class DispatchKey : uint16_t {
   Autograd,
   CompositeImplicitAutograd, // registered at
   // build/aten/src/ATen/RegisterCompositeImplicitAutograd.cpp
+
+  // Note: The alias keyset for FuncTorchBatchedDecomposition is disjoint from
+  // all
+  // other alias keysets
+  // and so precedence order doesn't matter
+  FuncTorchBatchedDecomposition, // registered at
+  // build/aten/src/ATen/RegisterFuncTorchBatchedDecomposition.cpp
+  // Note: The alias keyset for CompositeImplicitAutogradNestedTensor is
+  // disjoint from all other alias keysets
   CompositeImplicitAutogradNestedTensor, // registered at
   // build/aten/src/ATen/RegisterCompositeImplicitAutogradNestedTensor.cpp
   CompositeExplicitAutograd, // registered at
diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp
index a8f60451be379..f180008a102c5 100644
--- a/c10/core/DispatchKeySet.cpp
+++ b/c10/core/DispatchKeySet.cpp
@@ -101,6 +101,8 @@ bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k) {
       // See Note [NestedTensor Not Included in Backend Keys]
       return k != DispatchKey::NestedTensor &&
           non_functional_backend_dispatch_keyset.has(k);
+    case DispatchKey::FuncTorchBatchedDecomposition:
+      return functorch_batched_ks.has(k);
     default:
       return t == k;
   }
diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h
index 6a0be21f8fe07..a2f7b31fa9c5a 100644
--- a/c10/core/DispatchKeySet.h
+++ b/c10/core/DispatchKeySet.h
@@ -757,6 +757,9 @@ constexpr auto functorch_transforms_ks = DispatchKeySet(
      DispatchKey::VmapMode,
      DispatchKey::FuncTorchGradWrapper});
 
+constexpr auto functorch_batched_ks =
+    DispatchKeySet({DispatchKey::FuncTorchBatched});
+
 // This keyset has:
 // (1) the functionality bits corresponding to backends (dense, sparse,
 // quantized) (2) all of the backend bits set

From 4982e44c64fbef5c7179bd867b662b1643c028aa Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Thu, 1 Dec 2022 08:56:49 -0800
Subject: [PATCH 1516/1922] [ao] making QConfigMapping print in a user friendly
 way (#89932)

Summary: added __repr__ to QConfigMapping and QConfigMultiMapping
loosely based on __repr__ for BaseSparsifier

example output:

```
>>> import torch
>>> print(torch.ao.quantization.qconfig_mapping.get_default_qconfig_mapping())
QConfigMapping (
 global_qconfig
  QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
 object_type_qconfigs
  reshape: QConfig(activation=<class 'torch.ao.quantization.observer.ReuseInputObserver'>, weight=<class 'torch.ao.quantization.observer.NoopObserver'>)
  <class 'torch.nn.modules.conv.Conv1d'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <class 'torch.nn.modules.conv.Conv2d'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <class 'torch.nn.modules.conv.Conv3d'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <class 'torch.nn.modules.conv.ConvTranspose1d'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <class 'torch.nn.modules.conv.ConvTranspose2d'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <class 'torch.nn.modules.conv.ConvTranspose3d'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <class 'torch.nn.modules.linear.Linear'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <built-in method conv1d of type object at 0x7f08b99497e0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <built-in method conv2d of type object at 0x7f08b99497e0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <built-in method conv3d of type object at 0x7f08b99497e0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <built-in method conv_transpose1d of type object at 0x7f08b99497e0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <built-in method conv_transpose2d of type object at 0x7f08b99497e0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <built-in method conv_transpose3d of type object at 0x7f08b99497e0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <built-in function linear>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <class 'torch.nn.modules.activation.ReLU'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <function relu at 0x7f08ad57bc10>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <built-in method relu of type object at 0x7f08b99497e0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <class 'torch.nn.modules.batchnorm.BatchNorm1d'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <class 'torch.nn.modules.batchnorm.BatchNorm2d'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <class 'torch.nn.modules.batchnorm.BatchNorm3d'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=functools.partial(<class 'torch.ao.quantization.observer.PerChannelMinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_channel_symmetric){})
  <function layer_norm at 0x7f08ad57fca0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=<class 'torch.ao.quantization.observer.PlaceholderObserver'>)
  <class 'torch.nn.modules.normalization.LayerNorm'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.HistogramObserver'>, reduce_range=True){}, weight=<class 'torch.ao.quantization.observer.PlaceholderObserver'>)
  <class 'torch.nn.modules.activation.Hardsigmoid'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.00390625, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <function hardsigmoid at 0x7f08ad57f670>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.00390625, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  hardsigmoid: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.00390625, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  hardsigmoid_: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.00390625, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <class 'torch.nn.modules.activation.Sigmoid'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.00390625, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <built-in method sigmoid of type object at 0x7f08b99497e0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.00390625, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  sigmoid: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.00390625, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  sigmoid_: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.00390625, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <class 'torch.nn.modules.activation.Softmax'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.00390625, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <class 'torch.nn.modules.activation.Tanh'>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.0078125, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  <built-in method tanh of type object at 0x7f08b99497e0>: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.0078125, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  tanh: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.0078125, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
  tanh_: QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.FixedQParamsObserver'>, scale=0.0078125, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
 module_name_regex_qconfigs
  OrderedDict()
 module_name_qconfigs
  OrderedDict()
 module_name_object_type_order_qconfigs
  OrderedDict()
)
```

Test Plan: python test/test_quantization.py
TestFXNumericSuiteNShadows.test_qconfig_multi_mapping_repr

python test/test_quantization.py
TestQuantizeFx.test_qconfig_mapping_repr
Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89932
Approved by: https://github.com/vkuzo
---
 test/quantization/fx/test_numeric_suite_fx.py | 20 +++++++++++++++++
 test/quantization/fx/test_quantize_fx.py      |  3 +++
 torch/ao/ns/fx/qconfig_multi_mapping.py       | 17 +++++++-------
 torch/ao/quantization/qconfig_mapping.py      | 22 ++++++++++++++++++-
 4 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py
index 74098b597753c..86de5ab9cb371 100644
--- a/test/quantization/fx/test_numeric_suite_fx.py
+++ b/test/quantization/fx/test_numeric_suite_fx.py
@@ -2465,6 +2465,26 @@ def test_qconfig_multi_mapping_ordering(self):
         self.checkDynamicQuantizedLinear(msq.shadow_wrapper_1_1.mod_0, torch.qint8)
         self.checkQuantizedLinear(msq.shadow_wrapper_1_2.mod_0)
 
+    def test_qconfig_multi_mapping_repr(self):
+        qconfig_multi_mapping = (
+            QConfigMultiMapping()
+            .set_global(
+                [
+                    torch.ao.quantization.default_qconfig,
+                    torch.ao.quantization.default_dynamic_qconfig,
+                ]
+            )
+            .set_module_name(
+                "fc2",
+                [
+                    None,
+                    torch.ao.quantization.default_dynamic_qconfig,
+                    torch.ao.quantization.default_qat_qconfig_v2,
+                ],
+            )
+        )
+        self.assertTrue(isinstance(qconfig_multi_mapping.__repr__(), str))
+
 class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase):
     """
     Tests numeric suite core APIs on non-toy models.
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index de6cf18a0b0dd..1d46198f084bb 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -2038,6 +2038,9 @@ def test_qconfig_mapping_to_dict(self):
         qconfig_dict = self._get_qconfig_dict_for_qconfig_mapping_test(global_qconfig, qconfig1, qconfig2)
         self.assertEqual(qconfig_mapping.to_dict(), qconfig_dict)
 
+    def test_qconfig_mapping_repr(self):
+        self.assertTrue(isinstance(get_default_qconfig_mapping().__repr__(), str))
+
     # Dummy classes for PrepareCustomConfig testing
 
     class _DummyStandaloneModule:
diff --git a/torch/ao/ns/fx/qconfig_multi_mapping.py b/torch/ao/ns/fx/qconfig_multi_mapping.py
index bff2640e1feb3..20a005d0c8bf9 100644
--- a/torch/ao/ns/fx/qconfig_multi_mapping.py
+++ b/torch/ao/ns/fx/qconfig_multi_mapping.py
@@ -5,18 +5,11 @@
 
 import torch
 from torch.ao.quantization import QConfigMapping
+from torch.ao.quantization.qconfig_mapping import _QCONFIG_STYLE_ORDER
 from torch.ao.quantization.qconfig import QConfigAny
 
 __all__ = ["QConfigMultiMapping"]
 
-_QCONFIG_STYLE_ORDER: List[str] = [
-    "global_qconfig",
-    "object_type_qconfigs",
-    "module_name_regex_qconfigs",
-    "module_name_qconfigs",
-    "module_name_object_type_order_qconfigs",
-]
-
 _QCONFIG_STYLE_TO_METHOD: Dict[str, str] = {
     "global_qconfig": "set_global",
     "object_type_qconfigs": "set_object_type",
@@ -199,6 +192,14 @@ def set_module_name_object_type_order(
         )
         return self
 
+    def __repr__(self):
+        return (
+            self.__class__.__name__ +
+            " [" +
+            "".join(f"\n{qconfig_mapping.__repr__()}," for qconfig_mapping in self.qconfig_mappings_list) +
+            "\n]"
+        )
+
     @classmethod
     def from_list_qconfig_mapping(
         cls, qconfig_mapping_list: List[QConfigMapping]
diff --git a/torch/ao/quantization/qconfig_mapping.py b/torch/ao/quantization/qconfig_mapping.py
index 1957996526d91..08476e01997f2 100644
--- a/torch/ao/quantization/qconfig_mapping.py
+++ b/torch/ao/quantization/qconfig_mapping.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 from collections import OrderedDict
-from typing import Any, Callable, Dict, Tuple, Union
+from typing import Any, Callable, Dict, Tuple, Union, List
 
 import torch
 
@@ -157,6 +157,14 @@ def _get_symmetric_qnnpack_qconfig_mapping():
             qconfig_mapping.set_object_type(pattern, default_symmetric_qnnpack_qconfig)
     return qconfig_mapping
 
+_QCONFIG_STYLE_ORDER: List[str] = [
+    "global_qconfig",
+    "object_type_qconfigs",
+    "module_name_regex_qconfigs",
+    "module_name_qconfigs",
+    "module_name_object_type_order_qconfigs",
+]
+
 class QConfigMapping:
     """
     Mapping from model ops to :class:`torch.ao.quantization.QConfig` s.
@@ -257,6 +265,18 @@ def set_module_name_object_type_order(
         self.module_name_object_type_order_qconfigs[(module_name, object_type, index)] = qconfig
         return self
 
+    def __repr__(self) -> str:
+        output = self.__class__.__name__ + " ("
+        for style_name in _QCONFIG_STYLE_ORDER:
+            output += f"\n {style_name}"
+            qconfigs = getattr(self, style_name)
+            if isinstance(qconfigs, OrderedDict) and len(qconfigs) > 0:
+                for key, qconfig in qconfigs.items():
+                    output += f"\n  {key}: {qconfig}"
+            else:
+                output += f"\n  {qconfigs}"
+        return output + "\n)"
+
     # TODO: remove this
     def to_dict(self) -> Dict[str, Any]:
         """

From a9cb7fd6bce51a6205187664d411833939367885 Mon Sep 17 00:00:00 2001
From: chengscott <60510scott@gmail.com>
Date: Fri, 2 Dec 2022 05:32:18 +0000
Subject: [PATCH 1517/1922] Intel compiler support in c10/util/TypeIndex.h
 (#89610)

Build passed with icc (ICC) 2021.7.1 20221019.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89610
Approved by: https://github.com/kit1980
---
 c10/util/TypeIndex.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/c10/util/TypeIndex.h b/c10/util/TypeIndex.h
index 3e8114735a227..b78690c123bbe 100644
--- a/c10/util/TypeIndex.h
+++ b/c10/util/TypeIndex.h
@@ -12,8 +12,13 @@ namespace util {
 
 // TODO Make it work for more compilers
 
+// Intel compiler works
+#if defined(__INTEL_COMPILER)
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 0
+#define C10_TYPENAME_CONSTEXPR
+
 // Clang works
-#if defined(__clang__)
+#elif defined(__clang__)
 
 // except for NVCC
 #if defined(__CUDACC__)

From fa146d36f6e399c434580f74630d60cb381c307e Mon Sep 17 00:00:00 2001
From: alexmsettle <37422826+alexmsettle@users.noreply.github.com>
Date: Fri, 2 Dec 2022 05:58:04 +0000
Subject: [PATCH 1518/1922] Add hierarchical module names to torchFX graph.node
 #87659 (#87742)

Fixes #87659

Pass down the module hierarchy from module.named_modules() to the name field of graph.node.
This makes it so the name of each node contains descriptive information about the network architecture.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87742
Approved by: https://github.com/jerryzh168
---
 .../ao_migration/test_quantization_fx.py      |  2 -
 test/test_fx.py                               | 30 ++++++
 torch/ao/quantization/fx/tracer.py            | 88 +-----------------
 torch/ao/quantization/quantize_fx.py          | 55 -----------
 torch/fx/_symbolic_trace.py                   | 29 ++++--
 torch/fx/proxy.py                             | 93 ++++++++++++++++++-
 torch/quantization/quantize_fx.py             |  2 -
 7 files changed, 145 insertions(+), 154 deletions(-)

diff --git a/test/quantization/ao_migration/test_quantization_fx.py b/test/quantization/ao_migration/test_quantization_fx.py
index c75727717a736..c71053f4f29c9 100644
--- a/test/quantization/ao_migration/test_quantization_fx.py
+++ b/test/quantization/ao_migration/test_quantization_fx.py
@@ -11,8 +11,6 @@ def test_function_import_quantize_fx(self):
             '_check_is_graph_module',
             '_swap_ff_with_fxff',
             '_fuse_fx',
-            'Scope',
-            'ScopeContextManager',
             'QuantizationTracer',
             '_prepare_fx',
             '_prepare_standalone_module_fx',
diff --git a/test/test_fx.py b/test/test_fx.py
index 0aff631b8e814..0e6a22a3c9360 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -1679,6 +1679,36 @@ def forward(self, x):
             if node.op in {'placeholder'}:
                 self.assertEqual(node.meta['tensor_meta'].memory_format, torch.channels_last_3d)
 
+    def test_nn_module_stack(self):
+        class SubModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv_mod = torch.nn.Conv2d(64, 64, (3, 3), padding=1, bias=False)
+
+            def forward(self, x):
+                return self.conv_mod(x)
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.sub_mod = SubModule()
+
+            def forward(self, x):
+                return self.sub_mod(x)
+
+        m = MyModule()
+        gm = torch.fx.symbolic_trace(m)
+
+        mod_stack = {}
+        expected_stack = [('sub_mod', str(type(m.sub_mod))),
+                          ('sub_mod.conv_mod', str(type(m.sub_mod.conv_mod)))]
+        for node in gm.graph.nodes:
+            mod_stack = node.meta.get('nn_module_stack', {})
+            if mod_stack:
+                break
+        stack_list = list(mod_stack.items())
+        self.assertEqual(stack_list, expected_stack)
+
     def test_interpreter(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
diff --git a/torch/ao/quantization/fx/tracer.py b/torch/ao/quantization/fx/tracer.py
index 3a959447cfd6b..1ac98a13c548e 100644
--- a/torch/ao/quantization/fx/tracer.py
+++ b/torch/ao/quantization/fx/tracer.py
@@ -1,67 +1,13 @@
 import torch
 from torch.fx._symbolic_trace import Tracer
-from torch.fx.node import Target, Node, Argument
+from torch.fx.proxy import Scope
 from torch.nn.intrinsic import _FusedModule
-from typing import List, Callable, Tuple, Any, Dict, Optional
+from typing import List, Callable
 
 __all__ = [
     "QuantizationTracer",
 ]
 
-class Scope(object):
-    """ Scope object that records the module path and the module type
-    of a module. Scope is used to track the information of the module
-    that contains a Node in a Graph of GraphModule. For example::
-
-        class Sub(torch.nn.Module):
-            def forward(self, x):
-                # This will be a call_method Node in GraphModule,
-                # scope for this would be (module_path="sub", module_type=Sub)
-                return x.transpose(1, 2)
-
-        class M(torch.nn.Module):
-            def __init__(self):
-                self.sub = Sub()
-
-            def forward(self, x):
-                # This will be a call_method Node as well,
-                # scope for this would be (module_path="", None)
-                x = x.transpose(1, 2)
-                x = self.sub(x)
-                return x
-
-    """
-
-    def __init__(self, module_path: str, module_type: Any):
-        super().__init__()
-        self.module_path = module_path
-        self.module_type = module_type
-
-
-class ScopeContextManager(object):
-    """ A context manager to track the Scope of Node during symbolic tracing.
-    When entering a forward function of a Module, we'll update the scope information of
-    the current module, and when we exit, we'll restore the previous scope information.
-    """
-
-    def __init__(
-        self, scope: Scope, current_module: torch.nn.Module, current_module_path: str
-    ):
-        super().__init__()
-        self.prev_module_type = scope.module_type
-        self.prev_module_path = scope.module_path
-        self.scope = scope
-        self.scope.module_path = current_module_path
-        self.scope.module_type = type(current_module)
-
-    def __enter__(self):
-        return
-
-    def __exit__(self, *args):
-        self.scope.module_path = self.prev_module_path
-        self.scope.module_type = self.prev_module_type
-        return
-
 class QuantizationTracer(Tracer):
     def __init__(
         self, skipped_module_names: List[str], skipped_module_classes: List[Callable]
@@ -75,7 +21,6 @@ def __init__(
         # We can change this if there is a use case that configures
         # qconfig using top level module type
         self.scope = Scope("", None)
-        self.node_name_to_scope: Dict[str, Tuple[str, type]] = {}
         self.record_stack_traces = True
 
     def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
@@ -88,32 +33,3 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool
             or type(m) in self.skipped_module_classes
             or isinstance(m, _FusedModule)
         )
-
-    def call_module(
-        self,
-        m: torch.nn.Module,
-        forward: Callable[..., Any],
-        args: Tuple[Any, ...],
-        kwargs: Dict[str, Any],
-    ) -> Any:
-        module_qualified_name = self.path_of_module(m)
-        # Creating scope with information of current module
-        # scope will be restored automatically upon exit
-        with ScopeContextManager(self.scope, m, module_qualified_name):
-            return super().call_module(m, forward, args, kwargs)
-
-    def create_node(
-        self,
-        kind: str,
-        target: Target,
-        args: Tuple[Argument, ...],
-        kwargs: Dict[str, Argument],
-        name: Optional[str] = None,
-        type_expr: Optional[Any] = None,
-    ) -> Node:
-        node = super().create_node(kind, target, args, kwargs, name, type_expr)
-        self.node_name_to_scope[node.name] = (
-            self.scope.module_path,
-            self.scope.module_type,
-        )
-        return node
diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py
index 8f26934576580..c476b754f2aa2 100644
--- a/torch/ao/quantization/quantize_fx.py
+++ b/torch/ao/quantization/quantize_fx.py
@@ -64,61 +64,6 @@ def _fuse_fx(
         graph_module, is_qat, fuse_custom_config, backend_config)  # type: ignore[operator]
 
 
-class Scope(object):
-    """ Scope object that records the module path and the module type
-    of a module. Scope is used to track the information of the module
-    that contains a Node in a Graph of GraphModule. For example::
-
-        class Sub(torch.nn.Module):
-            def forward(self, x):
-                # This will be a call_method Node in GraphModule,
-                # scope for this would be (module_path="sub", module_type=Sub)
-                return x.transpose(1, 2)
-
-        class M(torch.nn.Module):
-            def __init__(self):
-                self.sub = Sub()
-
-            def forward(self, x):
-                # This will be a call_method Node as well,
-                # scope for this would be (module_path="", None)
-                x = x.transpose(1, 2)
-                x = self.sub(x)
-                return x
-
-    """
-
-    def __init__(self, module_path: str, module_type: Any):
-        super().__init__()
-        self.module_path = module_path
-        self.module_type = module_type
-
-
-class ScopeContextManager(object):
-    """ A context manager to track the Scope of Node during symbolic tracing.
-    When entering a forward function of a Module, we'll update the scope information of
-    the current module, and when we exit, we'll restore the previous scope information.
-    """
-
-    def __init__(
-        self, scope: Scope, current_module: torch.nn.Module, current_module_path: str
-    ):
-        super().__init__()
-        self.prev_module_type = scope.module_type
-        self.prev_module_path = scope.module_path
-        self.scope = scope
-        self.scope.module_path = current_module_path
-        self.scope.module_type = type(current_module)
-
-    def __enter__(self):
-        return
-
-    def __exit__(self, *args):
-        self.scope.module_path = self.prev_module_path
-        self.scope.module_type = self.prev_module_type
-        return
-
-
 def _prepare_fx(
     model: torch.nn.Module,
     qconfig_mapping: Union[QConfigMapping, Dict[str, Any]],
diff --git a/torch/fx/_symbolic_trace.py b/torch/fx/_symbolic_trace.py
index ff9df1161a700..dfa6f5096042d 100644
--- a/torch/fx/_symbolic_trace.py
+++ b/torch/fx/_symbolic_trace.py
@@ -5,6 +5,7 @@
 import math
 import os
 import warnings
+import collections
 from itertools import chain
 from types import CodeType, FunctionType, ModuleType
 from typing import (
@@ -28,7 +29,7 @@
 from .graph import _PyTreeCodeGen, _PyTreeInfo, Graph
 from .graph_module import GraphModule
 from .node import Argument, base_types, map_aggregate
-from .proxy import ParameterProxy, Proxy, TracerBase
+from .proxy import ParameterProxy, Proxy, TracerBase, Scope, ScopeContextManager
 
 HAS_VARSTUFF = inspect.CO_VARARGS | inspect.CO_VARKEYWORDS
 
@@ -44,7 +45,6 @@
 def is_fx_tracing():
     return _is_fx_tracing_flag
 
-
 @compatibility(is_backward_compatible=True)
 class ProxyableClassMeta(type):
     """
@@ -250,6 +250,13 @@ def __init__(
         self.param_shapes_constant = param_shapes_constant
 
         self.submodule_paths: Optional[Dict[torch.nn.Module, str]] = None
+        self.root_module_name: str = ""
+        # Maps the containing module's name to the operator name
+        self.scope = Scope("", None)
+        # Records the module call stack
+        self.module_stack = collections.OrderedDict()
+        # Mapping of node name to module scope
+        self.node_name_to_scope: Dict[str, Tuple[str, type]] = {}
 
     @compatibility(is_backward_compatible=True)
     def create_arg(self, a: Any) -> "Argument":
@@ -430,9 +437,18 @@ def call_module(
             value was returned from the ``Module`` invocation.
         """
         module_qualified_name = self.path_of_module(m)
-        if not self.is_leaf_module(m, module_qualified_name):
-            return forward(*args, **kwargs)
-        return self.create_proxy("call_module", module_qualified_name, args, kwargs)
+        with ScopeContextManager(self.scope, Scope(module_qualified_name, type(m))) as _scope:
+            # module_stack is an ordered dict so writing then deleting the
+            # entry is equivalent to push/pop on a list
+            self.module_stack[_scope.module_path] = str(_scope.module_type)
+            if not self.is_leaf_module(m, module_qualified_name):
+                ret_val = forward(*args, **kwargs)
+            else:
+                ret_val = self.create_proxy("call_module", module_qualified_name, args, kwargs)
+            key, _ = self.module_stack.popitem(last=True)
+            assert key == _scope.module_path, f" Unexpected key {key}"
+
+        return ret_val
 
     @compatibility(is_backward_compatible=False)
     def getattr(self, attr: str, attr_val: Any, parameter_proxy_cache: Dict[str, Any]):
@@ -580,7 +596,7 @@ def replace_ph(x):
                 name,
                 default,
                 {},
-                type_expr=fn_for_analysis.__annotations__.get(name, None),
+                type_expr=fn_for_analysis.__annotations__.get(name, None)
             )
 
         arg_names = [next(names_iter) for idx in range(skip_arg_idx, total_args)]
@@ -663,6 +679,7 @@ def trace(
                 ), f"traced_func_name={self.traced_func_name} doesn't exist in {type(root).__name__}"
 
                 fn = getattr(type(root), self.traced_func_name)
+                self.root_module_name = root._get_name()
                 self.submodule_paths = {mod: name for name, mod in root.named_modules()}
             else:
                 self.root = torch.nn.Module()
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index 6f9535b117370..c77967574b59a 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -1,17 +1,83 @@
 import dis
+import copy
 import torch
 import inspect
 import operator
 import traceback
+import collections
 
 from .graph import magic_methods, reflectable_magic_methods, Graph
-from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable
+from typing import Tuple, Dict, OrderedDict, Optional, Iterable, Any, Iterator, Callable
 from .node import Target, Node, Argument, base_types, map_aggregate
 from ._compatibility import compatibility
 from .operator_schemas import check_for_mutable_operation
 import torch.fx.traceback as fx_traceback
 
-__all__ = ['TracerBase', 'GraphAppendingTracer', 'TraceError', 'Proxy', 'Attribute', 'ParameterProxy']
+__all__ = ['TracerBase', 'GraphAppendingTracer', 'TraceError',
+           'Proxy', 'Attribute', 'ParameterProxy', 'Scope',
+           'ScopeContextManager']
+
+
+@compatibility(is_backward_compatible=False)
+class Scope(object):
+    """ Scope object that records the module path and the module type
+    of a module. Scope is used to track the information of the module
+    that contains a Node in a Graph of GraphModule. For example::
+
+        class Sub(torch.nn.Module):
+            def forward(self, x):
+                # This will be a call_method Node in GraphModule,
+                # scope for this would be (module_path="sub", module_type=Sub)
+                return x.transpose(1, 2)
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                self.sub = Sub()
+
+            def forward(self, x):
+                # This will be a call_method Node as well,
+                # scope for this would be (module_path="", None)
+                x = x.transpose(1, 2)
+                x = self.sub(x)
+                return x
+
+    """
+
+    def __init__(self, module_path: str, module_type: Any):
+        super().__init__()
+        self.module_path = module_path
+        self.module_type = module_type
+
+
+@compatibility(is_backward_compatible=False)
+class ScopeContextManager(object):
+    """ A context manager to track the Scope of Node during symbolic tracing.
+    When entering a forward function of a Module, we'll update the scope information of
+    the current module, and when we exit, we'll restore the previous scope information.
+    """
+
+    def __init__(
+        self,
+        scope: Scope,
+        current_scope: Scope,
+    ):
+        super().__init__()
+        # Keep a copy of prev scope to restore on exit
+        self._prev_scope = copy.copy(scope)
+        # Update scope to current scope
+        scope.module_path = current_scope.module_path
+        scope.module_type = current_scope.module_type
+        # Save a reference so we can restore it
+        self._scope = scope
+
+    def __enter__(self):
+        return self._scope
+
+    def __exit__(self, *args):
+        self._scope.module_path = self._prev_scope.module_path
+        self._scope.module_type = self._prev_scope.module_type
+        return
+
 
 @compatibility(is_backward_compatible=True)
 class TracerBase:
@@ -29,6 +95,15 @@ class TracerBase:
     # ``root`` is an instance of ``nn.Module``
     traced_func_name: str = "forward"
 
+    # Maps the containing module's name to the operator name
+    scope : Scope
+
+    # Records the module call stack
+    module_stack: OrderedDict[str, str]
+
+    # Mapping of node name to module scope
+    node_name_to_scope: Dict[str, Tuple[str, type]]
+
     @compatibility(is_backward_compatible=True)
     def create_node(self, kind : str, target : Target,
                     args : Tuple[Argument, ...], kwargs : Dict[str, Argument], name : Optional[str] = None,
@@ -43,7 +118,16 @@ def create_node(self, kind : str, target : Target,
         if kind == 'call_function' and self.check_mutable_operations:
             check_for_mutable_operation(target, args, kwargs)
 
-        return self.graph.create_node(kind, target, args, kwargs, name, type_expr)
+        node = self.graph.create_node(kind, target, args, kwargs, name, type_expr)
+        # TODO node_name_to_scope will be depricated in favor of
+        # node.meta['nn_module_stack']
+        self.node_name_to_scope[node.name] = (
+            self.scope.module_path,
+            self.scope.module_type,
+        )
+        if self.module_stack:
+            node.meta['nn_module_stack'] = copy.copy(self.module_stack)
+        return node
 
     @compatibility(is_backward_compatible=True)
     def proxy(self, node: Node) -> 'Proxy':
@@ -207,6 +291,9 @@ class GraphAppendingTracer(TracerBase):
     def __init__(self, graph: Graph):
         super().__init__()
         self.graph = graph
+        self.scope = Scope("", None)
+        self.module_stack = collections.OrderedDict()
+        self.node_name_to_scope = {}
 
 @compatibility(is_backward_compatible=False)
 def assert_fn(x):
diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py
index aad3bc7253e4b..1f519e991ca64 100644
--- a/torch/quantization/quantize_fx.py
+++ b/torch/quantization/quantize_fx.py
@@ -11,8 +11,6 @@
     _check_is_graph_module,
     _swap_ff_with_fxff,
     _fuse_fx,
-    Scope,
-    ScopeContextManager,
     QuantizationTracer,
     _prepare_fx,
     _prepare_standalone_module_fx,

From ad3b4d03ddbb33427c63281fbd125f787a4eae3c Mon Sep 17 00:00:00 2001
From: Angel Avila <angel.j.avila@gmail.com>
Date: Fri, 2 Dec 2022 07:02:09 +0000
Subject: [PATCH 1519/1922] Add tests for custom pybind type_casters (#89897)

This is a followup to #89115 which Fixes #88958

This adds tests to verify at runtime that the types returned by custom pybind type_casters are correctly specified in the second argument to `PYBIND11_TYPE_CASTER`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89897
Approved by: https://github.com/ezyang
---
 test/cpp_extensions/extension.cpp       |  16 ++++
 test/test_cpp_extensions_aot.py         | 107 ++++++++++++++++++++++++
 torch/csrc/DynamicTypes.h               |   3 +-
 torch/csrc/utils/pybind.h               |  18 ++--
 torch/csrc/utils/tensor_memoryformats.h |   3 +-
 5 files changed, 136 insertions(+), 11 deletions(-)

diff --git a/test/cpp_extensions/extension.cpp b/test/cpp_extensions/extension.cpp
index c8772dc1b0ffe..37ed516ca99c2 100644
--- a/test/cpp_extensions/extension.cpp
+++ b/test/cpp_extensions/extension.cpp
@@ -27,6 +27,10 @@ bool function_taking_optional(c10::optional<torch::Tensor> tensor) {
   return tensor.has_value();
 }
 
+torch::Tensor random_tensor() {
+  return torch::randn({1});
+}
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("sigmoid_add", &sigmoid_add, "sigmoid(x) + sigmoid(y)");
   m.def(
@@ -37,4 +41,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
       .def(py::init<int, int>())
       .def("forward", &MatrixMultiplier::forward)
       .def("get", &MatrixMultiplier::get);
+
+  m.def("get_complex", []() { return c10::complex<double>(1.0, 2.0); });
+  m.def("get_device", []() { return at::device_of(random_tensor()).value(); });
+  m.def("get_generator", []() { return at::detail::getDefaultCPUGenerator(); });
+  m.def("get_intarrayref", []() { return at::IntArrayRef({1, 2, 3}); });
+  m.def("get_memory_format", []() { return c10::get_contiguous_memory_format(); });
+  m.def("get_storage", []() { return random_tensor().storage(); });
+  m.def("get_symfloat", []() { return c10::SymFloat(1.0); });
+  m.def("get_symint", []() { return c10::SymInt(1); });
+  m.def("get_symint_symbolic", []() { return c10::SymInt(c10::SymInt::UNCHECKED, INT64_MIN); });
+  m.def("get_symintarrayref", []() { return at::SymIntArrayRef({1, 2, 3}); });
+  m.def("get_tensor", []() { return random_tensor(); });
 }
diff --git a/test/test_cpp_extensions_aot.py b/test/test_cpp_extensions_aot.py
index 2f505553859fe..77ed19a36381a 100644
--- a/test/test_cpp_extensions_aot.py
+++ b/test/test_cpp_extensions_aot.py
@@ -1,6 +1,10 @@
 # Owner(s): ["module: cpp-extensions"]
 
+from itertools import repeat
 import os
+import re
+import sys
+from typing import Union
 import unittest
 
 import torch.testing._internal.common_utils as common
@@ -10,6 +14,16 @@
 import torch.backends.cudnn
 import torch.utils.cpp_extension
 
+if sys.version_info >= (3, 8):
+    from typing import get_args, get_origin
+else:
+    def get_args(tp):
+        return tp.__args__
+
+    def get_origin(tp):
+        if hasattr(tp, "__origin__"):
+            return tp.__origin__
+
 try:
     import pytest
     HAS_PYTEST = True
@@ -133,6 +147,99 @@ def test_cuda_dlink_libs(self):
         test = cuda_dlink.add(a, b)
         self.assertEqual(test, ref)
 
+
+class TestPybindTypeCasters(common.TestCase):
+    """Pybind tests for ahead-of-time cpp extensions
+
+    These tests verify the types returned from cpp code using custom type
+    casters. By exercising pybind, we also verify that the type casters work
+    properly.
+
+    For each type caster in `torch/csrc/utils/pybind.h` we create a pybind
+    function that takes no arguments and returns the type_caster type. The
+    second argument to `PYBIND11_TYPE_CASTER` should be the type we expect to
+    receive in python, in these tests we verify this at run-time.
+    """
+    @staticmethod
+    def expected_return_type(func):
+        """
+        Our Pybind functions have a signature of the form `() -> return_type`.
+        """
+        # Imports needed for the `eval` below.
+        from typing import List, Tuple  # noqa: F401
+
+        return eval(re.search("-> (.*)\n", func.__doc__).group(1))
+
+    def check(self, func):
+        val = func()
+        expected = self.expected_return_type(func)
+        origin = get_origin(expected)
+        if origin is list:
+            self.check_list(val, expected)
+        elif origin is tuple:
+            self.check_tuple(val, expected)
+        else:
+            self.assertIsInstance(val, expected)
+
+    def check_list(self, vals, expected):
+        self.assertIsInstance(vals, list)
+        list_type = get_args(expected)[0]
+        for val in vals:
+            self.assertIsInstance(val, list_type)
+
+    def check_tuple(self, vals, expected):
+        self.assertIsInstance(vals, tuple)
+        tuple_types = get_args(expected)
+        if tuple_types[1] is ...:
+            tuple_types = repeat(tuple_types[0])
+        for val, tuple_type in zip(vals, tuple_types):
+            self.assertIsInstance(val, tuple_type)
+
+    def check_union(self, funcs):
+        """Special handling for Union type casters.
+
+        A single cpp type can sometimes be cast to different types in python.
+        In these cases we expect to get exactly one function per python type.
+        """
+        # Verify that all functions have the same return type.
+        union_type = set(self.expected_return_type(f) for f in funcs)
+        assert len(union_type) == 1
+        union_type = union_type.pop()
+        self.assertIs(Union, get_origin(union_type))
+        expected_types = set(get_args(union_type))
+        for func in funcs:
+            val = func()
+            for tp in expected_types:
+                if isinstance(val, tp):
+                    expected_types.remove(tp)
+                    break
+            else:
+                raise AssertionError(f"{val} is not an instance of {expected_types}")
+        self.assertFalse(expected_types, f"Missing functions for types {expected_types}")
+
+    def test_pybind_return_types(self):
+        functions = [
+            cpp_extension.get_complex,
+            cpp_extension.get_device,
+            cpp_extension.get_generator,
+            cpp_extension.get_intarrayref,
+            cpp_extension.get_memory_format,
+            cpp_extension.get_storage,
+            cpp_extension.get_symfloat,
+            cpp_extension.get_symintarrayref,
+            cpp_extension.get_tensor,
+        ]
+        union_functions = [
+            [cpp_extension.get_symint, cpp_extension.get_symint_symbolic],
+        ]
+        for func in functions:
+            with self.subTest(msg=f"check {func.__name__}"):
+                self.check(func)
+        for funcs in union_functions:
+            with self.subTest(msg=f"check {[f.__name__ for f in funcs]}"):
+                self.check_union(funcs)
+
+
 class TestORTTensor(common.TestCase):
     def test_unregistered(self):
         a = torch.arange(0, 10, device='cpu')
diff --git a/torch/csrc/DynamicTypes.h b/torch/csrc/DynamicTypes.h
index 6765916634c53..7ca18942564df 100644
--- a/torch/csrc/DynamicTypes.h
+++ b/torch/csrc/DynamicTypes.h
@@ -9,6 +9,7 @@
 #include <c10/core/Layout.h>
 #include <c10/core/ScalarType.h>
 #include <c10/core/ScalarTypeToTypeMeta.h>
+#include <torch/csrc/Export.h>
 
 #include <memory>
 #include <string>
@@ -24,7 +25,7 @@ namespace torch {
 void registerDtypeObject(THPDtype* dtype, at::ScalarType scalarType);
 void registerLayoutObject(THPLayout* thp_layout, at::Layout layout);
 
-PyObject* createPyObject(const at::Storage& storage);
+TORCH_PYTHON_API PyObject* createPyObject(const at::Storage& storage);
 at::Storage createStorage(PyObject* obj);
 at::Storage createStorageGetType(
     PyObject* obj,
diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h
index bf1553814cef9..2160b791c15ef 100644
--- a/torch/csrc/utils/pybind.h
+++ b/torch/csrc/utils/pybind.h
@@ -51,7 +51,7 @@ template <>
 struct type_caster<at::Storage> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(at::Storage, _("torch.storage._StorageBase"));
+  PYBIND11_TYPE_CASTER(at::Storage, _("torch.StorageBase"));
 
   bool load(handle src, bool) {
     PyObject* obj = src.ptr();
@@ -97,7 +97,7 @@ template <>
 struct TORCH_PYTHON_API type_caster<at::IntArrayRef> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(at::IntArrayRef, _("typing.Tuple[int, ...]"));
+  PYBIND11_TYPE_CASTER(at::IntArrayRef, _("Tuple[int, ...]"));
 
   bool load(handle src, bool);
   static handle cast(
@@ -113,7 +113,7 @@ template <>
 struct TORCH_PYTHON_API type_caster<at::SymIntArrayRef> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(at::SymIntArrayRef, _("at::SymIntArrayRef"));
+  PYBIND11_TYPE_CASTER(at::SymIntArrayRef, _("List[int]"));
 
   bool load(handle src, bool);
   static handle cast(
@@ -126,7 +126,7 @@ struct TORCH_PYTHON_API type_caster<at::SymIntArrayRef> {
 };
 
 template <>
-struct TORCH_PYTHON_API type_caster<at::MemoryFormat> {
+struct type_caster<at::MemoryFormat> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   PYBIND11_TYPE_CASTER(at::MemoryFormat, _("torch.memory_format"));
@@ -204,9 +204,9 @@ struct type_caster<c10::DispatchKey>
 };
 
 template <>
-struct type_caster<c10::SymInt> {
+struct TORCH_PYTHON_API type_caster<c10::SymInt> {
  public:
-  PYBIND11_TYPE_CASTER(c10::SymInt, _("torch._prims_common.IntLike"));
+  PYBIND11_TYPE_CASTER(c10::SymInt, _("Union[int, torch.SymInt]"));
   bool load(py::handle src, bool);
 
   static py::handle cast(
@@ -216,9 +216,9 @@ struct type_caster<c10::SymInt> {
 };
 
 template <>
-struct type_caster<c10::SymFloat> {
+struct TORCH_PYTHON_API type_caster<c10::SymFloat> {
  public:
-  PYBIND11_TYPE_CASTER(c10::SymFloat, _("torch._prims_common.FloatLike"));
+  PYBIND11_TYPE_CASTER(c10::SymFloat, _("float"));
   bool load(py::handle src, bool);
 
   static py::handle cast(
@@ -231,7 +231,7 @@ template <typename T>
 struct type_caster<c10::complex<T>> {
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
-  PYBIND11_TYPE_CASTER(c10::complex<T>, _("torch._complex.complex"));
+  PYBIND11_TYPE_CASTER(c10::complex<T>, _("complex"));
 
   bool load(handle src, bool) {
     PyObject* obj = src.ptr();
diff --git a/torch/csrc/utils/tensor_memoryformats.h b/torch/csrc/utils/tensor_memoryformats.h
index 19b1c8deaf5ab..3f86f0c445287 100644
--- a/torch/csrc/utils/tensor_memoryformats.h
+++ b/torch/csrc/utils/tensor_memoryformats.h
@@ -1,13 +1,14 @@
 #pragma once
 
 #include <c10/core/MemoryFormat.h>
+#include <torch/csrc/Export.h>
 #include <torch/csrc/utils/python_stub.h>
 
 namespace torch {
 namespace utils {
 
 void initializeMemoryFormats();
-PyObject* getTHPMemoryFormat(c10::MemoryFormat);
+TORCH_PYTHON_API PyObject* getTHPMemoryFormat(c10::MemoryFormat);
 
 } // namespace utils
 } // namespace torch

From 48270dfaed50759be984dcd250fd1c419fdce9ac Mon Sep 17 00:00:00 2001
From: Anupam Bhatnagar <anupamb@meta.com>
Date: Fri, 2 Dec 2022 07:03:42 +0000
Subject: [PATCH 1520/1922] Implement post and pre hooks for optimizer (#89176)

Fixes #88446

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89176
Approved by: https://github.com/albanD
---
 test/test_optim.py        | 100 +++++++++++++++++++++++++-
 torch/optim/optimizer.py  | 145 +++++++++++++++++++++++++++++++++-----
 torch/optim/optimizer.pyi |   3 +
 3 files changed, 228 insertions(+), 20 deletions(-)

diff --git a/test/test_optim.py b/test/test_optim.py
index b5d2d43c86ce2..36de7b18eab34 100644
--- a/test/test_optim.py
+++ b/test/test_optim.py
@@ -11,7 +11,7 @@
 import torch.optim as optim
 import torch.nn.functional as F
 from torch.nn import Parameter
-from torch.optim import SGD
+from torch.optim import Adam, SGD, Optimizer
 from torch import sparse
 from torch.optim.lr_scheduler import (
     LambdaLR,
@@ -44,6 +44,8 @@
     skipIfRocm,
     skipIfTorchDynamo
 )
+from typing import Dict, Any, Tuple
+from torch.optim.optimizer import register_optimizer_step_pre_hook, register_optimizer_step_post_hook
 
 # load_tests from common_utils is used to automatically filter tests for
 # sharding on sandcastle. This line silences flake warnings
@@ -1557,6 +1559,102 @@ def test_empty_grad(self):
                 # assert that the parameters have not changed
                 self.assertEqual(original_param, param)
 
+    @skipIfTorchDynamo()
+    def test_post_hook(self):
+        def post_hook(opt: Optimizer, args: Tuple[Any], kwargs: Dict[Any, Any]):
+            nonlocal data
+            data += 2
+
+        params = [torch.Tensor([1, 1])]
+        opt = SGD(params, lr=0.001)
+        data = 2
+        hook_handle = opt.register_step_post_hook(post_hook)
+
+        opt.step()
+        opt.step()
+        # check if pre hooks were registered
+        self.assertEqual(data, 6)
+
+        # remove handles, take step and verify that hook is no longer registered
+        hook_handle.remove()
+
+        opt.step()
+        self.assertEqual(data, 6)
+
+    @skipIfTorchDynamo()
+    def test_pre_hook(self):
+        def pre_hook(opt: Optimizer, args: Tuple[Any], kwargs: Dict[Any, Any]):
+            nonlocal data
+            data += 2
+
+        params = [torch.Tensor([1, 1])]
+        opt = SGD(params, lr=0.001)
+        data = 5
+        hook_handle = opt.register_step_pre_hook(pre_hook)
+
+        opt.step()
+        opt.step()
+        # check if pre hooks were registered
+        self.assertEqual(data, 9)
+
+        # remove handles, take step and verify that hook is no longer registered
+        hook_handle.remove()
+
+        opt.step()
+        self.assertEqual(data, 9)
+
+    @skipIfTorchDynamo()
+    def test_pre_and_post_hook(self):
+        def global_pre_hook(opt: Optimizer, args: Tuple[Any], kwargs: Dict[Any, Any]):
+            nonlocal data
+            data.append(0)
+
+        def global_post_hook(opt: Optimizer, args: Tuple[Any], kwargs: Dict[Any, Any]):
+            nonlocal data
+            data.append(5)
+
+        def local_pre_hook(opt: Optimizer, args: Tuple[Any], kwargs: Dict[Any, Any]):
+            nonlocal data
+            data.append(1)
+
+        def local_post_hook(opt: Optimizer, args: Tuple[Any], kwargs: Dict[Any, Any]):
+            nonlocal data
+            data.append(2)
+
+        params = [torch.Tensor([1, 1])]
+        opt1 = SGD(params, lr=0.001)
+        opt2 = Adam(params, lr=0.01)
+        data = []
+
+        # register global hooks to both optimizers
+        global_pre_handle = register_optimizer_step_pre_hook(global_pre_hook)
+        global_post_handle = register_optimizer_step_post_hook(global_post_hook)
+
+        # register local hooks
+        first_pre_handle = opt1.register_step_pre_hook(local_pre_hook)
+        first_post_handle = opt1.register_step_post_hook(local_post_hook)
+        second_pre_handle = opt2.register_step_pre_hook(local_pre_hook)
+        second_post_handle = opt2.register_step_post_hook(local_post_hook)
+
+        opt1.step()
+        self.assertListEqual(data, [0, 1, 2, 5])
+        opt2.step()
+        self.assertListEqual(data, [0, 1, 2, 5, 0, 1, 2, 5])
+        opt1.step()
+        self.assertListEqual(data, [0, 1, 2, 5, 0, 1, 2, 5, 0, 1, 2, 5])
+
+        # remove all hooks
+        global_pre_handle.remove()
+        global_post_handle.remove()
+        first_pre_handle.remove()
+        first_post_handle.remove()
+        second_pre_handle.remove()
+        second_post_handle.remove()
+
+        opt1.step()
+        opt2.step()
+        self.assertListEqual(data, [0, 1, 2, 5, 0, 1, 2, 5, 0, 1, 2, 5])
+
 
 class SchedulerTestNet(torch.nn.Module):
     def __init__(self):
diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py
index 9422b3b0f94d1..f94cc1dc98dc3 100644
--- a/torch/optim/optimizer.py
+++ b/torch/optim/optimizer.py
@@ -1,11 +1,18 @@
-from collections import defaultdict, abc as container_abcs
+from collections import OrderedDict, defaultdict, abc as container_abcs
 import torch
 from copy import deepcopy
 from itertools import chain
 import warnings
 import functools
 
-__all__ = ['Optimizer']
+from typing import Callable, Dict
+
+import torch.utils.hooks as hooks
+from torch.utils.hooks import RemovableHandle
+
+__all__ = ['Optimizer', 'register_optimizer_step_pre_hook', 'register_optimizer_step_post_hook']
+_global_optimizer_pre_hooks: Dict[int, Callable] = OrderedDict()
+_global_optimizer_post_hooks: Dict[int, Callable] = OrderedDict()
 
 class _RequiredParameter(object):
     """Singleton class representing a required parameter for an Optimizer."""
@@ -14,7 +21,6 @@ def __repr__(self):
 
 required = _RequiredParameter()
 
-
 def _use_grad_for_differentiable(func):
     def _use_grad(self, *args, **kwargs):
         prev_grad = torch.is_grad_enabled()
@@ -27,6 +33,44 @@ def _use_grad(self, *args, **kwargs):
     return _use_grad
 
 
+def register_optimizer_step_pre_hook(hook: Callable[..., None]) -> RemovableHandle:
+    r"""Register a pre hook common to all optimizers. The hook should have the following
+    signature::
+
+        hook(optimizer, args, kwargs) -> None or modified args and kwargs
+
+    Args:
+        hook (Callable): A user defined hook which is registered on all optimizers.
+
+    Returns:
+        :class:`torch.utils.hooks.RemoveableHandle`:
+            a handle that can be used to remove the added hook by calling
+            ``handle.remove()``
+    """
+    handle = hooks.RemovableHandle(_global_optimizer_pre_hooks)
+    _global_optimizer_pre_hooks[handle.id] = hook
+    return handle
+
+
+def register_optimizer_step_post_hook(hook: Callable[..., None]) -> RemovableHandle:
+    r"""Register a post hook common to all optimizers. The hook should have the following
+    signature::
+
+        hook(optimizer, args, kwargs) -> None
+
+    Args:
+        hook (Callable): A user defined hook which is registered on all optimizers.
+
+    Returns:
+        :class:`torch.utils.hooks.RemoveableHandle`:
+            a handle that can be used to remove the added hook by calling
+            ``handle.remove()``
+    """
+    handle = hooks.RemovableHandle(_global_optimizer_post_hooks)
+    _global_optimizer_post_hooks[handle.id] = hook
+    return handle
+
+
 class Optimizer(object):
     r"""Base class for all optimizers.
 
@@ -45,8 +89,10 @@ class Optimizer(object):
     def __init__(self, params, defaults):
         torch._C._log_api_usage_once("python.optimizer")
         self.defaults = defaults
+        self._optimizer_step_pre_hooks: Dict[int, Callable] = OrderedDict()
+        self._optimizer_step_post_hooks: Dict[int, Callable] = OrderedDict()
 
-        self._hook_for_profile()
+        self._patch_step_function()
 
         if isinstance(params, torch.Tensor):
             raise TypeError("params argument given to the optimizer should be "
@@ -80,7 +126,11 @@ def __getstate__(self):
 
     def __setstate__(self, state):
         self.__dict__.update(state)
-        self._hook_for_profile()  # To support multiprocessing pickle/unpickle.
+        if '_optimizer_step_pre_hooks' not in self.__dict__:
+            self._optimizer_step_pre_hooks = OrderedDict()
+        if '_optimizer_step_post_hooks' not in self.__dict__:
+            self._optimizer_step_post_hooks = OrderedDict()
+        self._patch_step_function()  # To support multiprocessing pickle/unpickle
         self.defaults.setdefault('differentiable', False)
 
     def __repr__(self):
@@ -127,27 +177,84 @@ def _optimizer_step_code(self):
         """
         pass
 
-    def _hook_for_profile(self):
-        self._zero_grad_profile_name = "Optimizer.zero_grad#{}.zero_grad".format(self.__class__.__name__)
+    @staticmethod
+    def profile_hook_step(func):
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            self, *_ = args
+            profile_name = "Optimizer.step#{}.step".format(self.__class__.__name__)
+            with torch.autograd.profiler.record_function(profile_name):
+                # call optimizer step pre hooks
+                for pre_hook in chain(_global_optimizer_pre_hooks.values(), self._optimizer_step_pre_hooks.values()):
+                    result = pre_hook(self, args, kwargs)
+                    if result is not None:
+                        if isinstance(result, tuple) and len(result) == 2:
+                            args, kwargs = result
+                        else:
+                            raise RuntimeError(f"{func} must return None or a tuple of (new_args, new_kwargs),"
+                                               f"but got {result}.")
+
+                out = func(*args, **kwargs)
+                self._optimizer_step_code()
 
-        def profile_hook_step(func):
+                # call optimizer step post hooks
+                for post_hook in chain(self._optimizer_step_post_hooks.values(), _global_optimizer_post_hooks.values()):
+                    post_hook(self, args, kwargs)
 
-            @functools.wraps(func)
-            def wrapper(*args, **kwargs):
-                obj, *_ = args
-                profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
-                with torch.autograd.profiler.record_function(profile_name):
-                    out = func(*args, **kwargs)
-                    obj._optimizer_step_code()
-                    return out
+                return out
 
-            return wrapper
+        return wrapper
 
+    def _patch_step_function(self):
+        self._zero_grad_profile_name = "Optimizer.zero_grad#{}.zero_grad".format(self.__class__.__name__)
         hooked = getattr(self.__class__.step, "hooked", None)
         if not hooked:
-            self.__class__.step = profile_hook_step(self.__class__.step)
+            self.__class__.step = self.profile_hook_step(self.__class__.step)
             self.__class__.step.hooked = True
 
+    def register_step_pre_hook(self, hook: Callable[..., None]) -> RemovableHandle:
+        r"""Register an optimizer step pre hook which will be called before
+        optimizer step. It should have the following signature::
+
+            hook(optimizer, args, kwargs) -> None or modified args and kwargs
+
+        The ``optimizer`` argument is the optimizer instance being used. If
+        args and kwargs are modified by the pre-hook, then the transformed
+        values are returned as a tuple containing the new_args and new_kwargs.
+
+        Args:
+            hook (Callable): The user defined hook to be registered.
+
+        Returns:
+            :class:`torch.utils.hooks.RemoveableHandle`:
+                a handle that can be used to remove the added hook by calling
+                ``handle.remove()``
+        """
+        handle = hooks.RemovableHandle(self._optimizer_step_pre_hooks)
+        self._optimizer_step_pre_hooks[handle.id] = hook
+        return handle
+
+    def register_step_post_hook(self, hook: Callable[..., None]) -> RemovableHandle:
+        r"""Register an optimizer step post hook which will be called after optimizer step.
+        It should have the following signature::
+
+            hook(optimizer, args, kwargs) -> None
+
+        The ``optimizer`` argument is the optimizer instance being used.
+
+        Args:
+            hook (Callable): The user defined hook to be registered.
+
+        Returns:
+            :class:`torch.utils.hooks.RemoveableHandle`:
+                a handle that can be used to remove the added hook by calling
+                ``handle.remove()``
+        """
+        handle = hooks.RemovableHandle(self._optimizer_step_post_hooks)
+        self._optimizer_step_post_hooks[handle.id] = hook
+        return handle
+
     def state_dict(self):
         r"""Returns the state of the optimizer as a :class:`dict`.
 
@@ -261,7 +368,7 @@ def zero_grad(self, set_to_none: bool = False):
         foreach = self.defaults.get('foreach', False)
 
         if not hasattr(self, "_zero_grad_profile_name"):
-            self._hook_for_profile()
+            self._patch_step_function()
         if foreach:
             per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
         with torch.autograd.profiler.record_function(self._zero_grad_profile_name):
diff --git a/torch/optim/optimizer.pyi b/torch/optim/optimizer.pyi
index 3838b4bf599ba..7055c6b788a5f 100644
--- a/torch/optim/optimizer.pyi
+++ b/torch/optim/optimizer.pyi
@@ -1,5 +1,6 @@
 from typing import Iterable, Union, Callable, Optional, List
 from .. import Tensor
+from torch.utils.hooks import RemovableHandle
 
 _params_t = Union[Iterable[Tensor], Iterable[dict]]
 
@@ -11,6 +12,8 @@ class Optimizer:
 
     def __init__(self, params: _params_t, default: dict) -> None: ...
     def __setstate__(self, state: dict) -> None: ...
+    def register_step_pre_hook(self, hook: Callable[..., None]) -> RemovableHandle: ...
+    def register_step_post_hook(self, hook: Callable[..., None]) -> RemovableHandle: ...
     def state_dict(self) -> dict: ...
     def load_state_dict(self, state_dict: dict) -> None: ...
     def zero_grad(self, set_to_none: bool=...) -> None: ...

From 47af3540945c48e2cc6b3fe38c24227fffe3f5db Mon Sep 17 00:00:00 2001
From: "Wang, Eikan" <eikan.wang@intel.com>
Date: Thu, 1 Dec 2022 15:23:24 +0000
Subject: [PATCH 1521/1922] Add more debug information for Inductor (#90008)

- Add graph index to the profile information of the Inductor kernel for better debugability.

  The generated code for different graphs could produce kernels with the same name. The side effect is that it is hard to identify the portion of E2E performance for these kernels because the profiler will aggregate the performance with the same kernel name regardless of different graphs. Hence, this PR added the graph index to the profile information to address this limitation.

- Label arbitrary code ranges for `eager` and `opt` modes for better debugability

  The profile information of dynamo benchmarks mixes the eager mode and opt mode. It is hard to separate the range for different modes. This PR added eager and opt marks to the profile information to address this limitation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90008
Approved by: https://github.com/jgong5, https://github.com/jansel
---
 benchmarks/dynamo/common.py         | 25 +++++++++++++++++++------
 test/inductor/test_torchinductor.py |  7 ++++++-
 torch/_inductor/codegen/cpp.py      |  4 +++-
 torch/_inductor/compile_fx.py       |  4 +++-
 torch/_inductor/graph.py            |  7 ++++++-
 5 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index e34010f686d65..3e469e509680a 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -422,6 +422,16 @@ def maybe_profile(*args, **kwargs):
         else:
             yield
 
+    @contextlib.contextmanager
+    def maybe_mark_profile(*args, **kwargs):
+        prof: torch.profiler.profile = kwargs.pop("p", None)
+        mark = kwargs.pop("mark", None)
+        if prof:
+            with torch.profiler.record_function(mark):
+                yield
+        else:
+            yield
+
     with maybe_profile(enabled=args.export_profiler_trace) as p:
         frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
         for rep in range(args.repeat):
@@ -436,16 +446,19 @@ def maybe_profile(*args, **kwargs):
             maybe_mark_step(args)
 
             # interleave the runs to handle frequency scaling and load changes
-            timings[rep, 0], expected_output = timed(
-                model, model_iter_fn, inputs, return_result=True
-            )
+            with maybe_mark_profile(p=p, mark="expected"):
+                timings[rep, 0], expected_output = timed(
+                    model, model_iter_fn, inputs, return_result=True
+                )
 
             # call mark_step between the 2 calls to make the comparison fair.
             maybe_mark_step(args)
 
-            timings[rep, 1], actual_output = timed(
-                model, frozen_model_iter_fn, inputs, return_result=True
-            )
+            with maybe_mark_profile(p=p, mark="actual"):
+                timings[rep, 1], actual_output = timed(
+                    model, frozen_model_iter_fn, inputs, return_result=True
+                )
+
             if should_check_result:
                 is_correct = is_correct and same(expected_output, actual_output)
 
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 97cc6af476e46..36b42532a884f 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5222,7 +5222,12 @@ def fn(a, b):
             b = torch.rand((100,))
             with profile() as prof:
                 fn(a, b)
-            assert "kernel_cpp_0" in (e.name for e in prof.profiler.function_events)
+
+            kernel_profile_events = []
+            for e in prof.profiler.function_events:
+                if "kernel_cpp_0" in e.name:
+                    kernel_profile_events.append(e.name)
+            assert len(kernel_profile_events) > 0
 
 
 if HAS_CUDA:
diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py
index 88f4536ef3605..18c1e3f14fadf 100644
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@@ -1416,9 +1416,11 @@ def codegen_define_and_call(self, wrapper):
         code.writelines([cpp_prefix(), "" f'extern "C" void kernel({arg_defs})'])
         with code.indent():
             if enable_kernel_profile:
+                graph_id = V.graph.graph_id
+                prefix = "graph_" + str(graph_id) + "_" if graph_id is not None else ""
                 code.writelines(
                     [
-                        f'RECORD_FUNCTION("{kernel_name}", c10::ArrayRef<c10::IValue>({{}}));'
+                        f'RECORD_FUNCTION("{prefix + kernel_name}", c10::ArrayRef<c10::IValue>({{}}));'
                     ]
                 )
             for old, new in self.args.aliases():
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 3472f0e2efec1..0a2dba8bdde5d 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -132,7 +132,9 @@ def compile_fx_inner(
         if isinstance(inp, FakeTensor) and inp.fake_mode.shape_env is not None:
             shape_env = inp.fake_mode.shape_env
 
-    graph = GraphLowering(gm, shape_env=shape_env, num_static_inputs=num_fixed)
+    graph = GraphLowering(
+        gm, shape_env=shape_env, num_static_inputs=num_fixed, graph_id=graph_id
+    )
     with V.set_graph_handler(graph):
         graph.run(*example_inputs)
         compiled_fn = graph.compile_to_fn()
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 1c1bf776ba403..3301a8455698b 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -62,7 +62,11 @@ def static_sizes_strides(self, ex: torch.Tensor):
         return size, stride
 
     def __init__(
-        self, gm: torch.fx.GraphModule, shape_env=None, num_static_inputs=None
+        self,
+        gm: torch.fx.GraphModule,
+        shape_env=None,
+        num_static_inputs=None,
+        graph_id=None,
     ):
         super().__init__(gm)
         if shape_env is None:
@@ -90,6 +94,7 @@ def __init__(
         self.name_to_buffer = {}
         self.creation_time = time.time()
         self._can_use_cpp_wrapper = config.cpp_wrapper
+        self.graph_id = graph_id
 
     def get_dtype(self, buffer_name):
         if buffer_name in self.constants:

From 11660ec82d5d74ea106c179e85b713e4e38a6136 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 2 Dec 2022 09:57:31 +0000
Subject: [PATCH 1522/1922] Revert "Revert "Dynamo, FX, Inductor Progress Bars
 (#88384)" (#90018)"

This reverts commit bcf4292f04eda6c21cab18aa70cad6b2887c8b78.

Reverted https://github.com/pytorch/pytorch/pull/90018 on behalf of https://github.com/jeanschmidt due to landed internal commit does not match with this one, causing merge conflict and preventing import and land new commits
---
 torch/_dynamo/logging.py                | 21 +++++++++++++++++++++
 torch/_dynamo/optimizations/analysis.py |  1 +
 torch/_inductor/codecache.py            | 22 ++++++++++++++++++++--
 torch/_inductor/codegen/common.py       |  2 ++
 torch/_inductor/config.py               |  6 ++++++
 torch/_inductor/graph.py                |  1 +
 torch/_inductor/ir.py                   |  3 +++
 torch/_inductor/sizevars.py             |  1 +
 torch/_inductor/virtualized.py          |  3 +++
 torch/fx/config.py                      |  6 ++++++
 torch/fx/interpreter.py                 |  8 ++++++--
 11 files changed, 70 insertions(+), 4 deletions(-)
 create mode 100644 torch/fx/config.py

diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py
index 95ee727f1ddf1..b2fa67fbdf6ae 100644
--- a/torch/_dynamo/logging.py
+++ b/torch/_dynamo/logging.py
@@ -2,10 +2,15 @@
 import logging
 import os
 
+from torch.hub import tqdm
+
 # logging level for dynamo generated graphs/bytecode/guards
 logging.CODE = 15
 logging.addLevelName(logging.CODE, "CODE")
 
+# Disable progress bar by default, not in dynamo config because otherwise get a circular import
+disable_progress = True
+
 
 # Return all loggers that torchdynamo/torchinductor is responsible for
 def get_loggers():
@@ -78,8 +83,24 @@ def init_logging(log_level, log_file_name=None):
 
 _step_counter = itertools.count(1)
 
+# Update num_steps if more phases are added: Dynamo, AOT, Backend
+# This is very inductor centric
+# _inductor.utils.has_triton() gives a circular import error here
+
+if not disable_progress:
+    try:
+        import triton  # noqa: F401
+
+        num_steps = 3
+    except ImportError:
+        num_steps = 2
+    pbar = tqdm(total=num_steps, desc="torch.compile()", delay=15)
+
 
 def get_step_logger(logger):
+    if not disable_progress:
+        pbar.set_postfix_str(f"{logger.name}")
+        pbar.update(1)
     step = next(_step_counter)
 
     def log(level, msg):
diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index d83e57fdca6e2..f732fb322438f 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -21,6 +21,7 @@ def __init__(self, *args, **kwargs):
         self.input_alias_groups = set()
         self.storage_to_alias_group = dict()
         self.make_alias_group = itertools.count(1)
+        self.name = "ShapeAliasingAndMutation"
 
     def tensor_alias_group(self, value: torch.Tensor):
         """Assign a unique identifier to the storage of a given tensor"""
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index f4297732b38cd..1569f953d6120 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -21,6 +21,8 @@
 from typing import Any, Callable, Dict, List
 
 import torch
+
+from torch.hub import tqdm
 from torch.utils import cpp_extension
 from . import config, cuda_properties, exc
 
@@ -595,7 +597,7 @@ def warm_pool(cls):
         if hasattr(pool, "_start_queue_management_thread"):
             pool._start_queue_management_thread()
         else:
-            for i in range(config.compile_threads):
+            for _ in range(config.compile_threads):
                 pool._adjust_process_count()
             pool._start_executor_manager_thread()
         _compile_end()
@@ -636,10 +638,26 @@ def task():
         return self.submit(task)
 
     def wait(self, scope: Dict[str, Any]):
+        num_kernels = len(
+            [
+                value
+                for key, value in scope.items()
+                if isinstance(value, (Future, TritonFuture))
+            ]
+        )
+        pbar = tqdm(
+            total=num_kernels,
+            desc="Inductor Compilation",
+            disable=config.disable_progress,
+            delay=15,
+        )
         if config.compile_threads > 1:
-            for key, result in list(scope.items()):
+            for key, result in scope.items():
+                if config.verbose_progress:
+                    pbar.set_postfix_str(key)
                 if isinstance(result, (Future, TritonFuture)):
                     scope[key] = result.result()
+                    pbar.update(1)
 
         _compile_end()
 
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index b1e710c0ef91e..c549de21e46ee 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -590,6 +590,8 @@ def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
 
     def __enter__(self):
         class CSEProxy:
+            self.name = "CSEProxy"
+
             @staticmethod
             def __getattr__(name):
                 def inner(*args, **kwargs):
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 92666886a002d..b338bbf1241ad 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -5,6 +5,12 @@
 # add some debug printouts
 debug = False
 
+# Whether to disable a progress bar for autotuning
+disable_progress = True
+
+# Whether to enable printing the source code for each future
+verbose_progress = False
+
 # use cpp wrapper instead of python wrapper
 cpp_wrapper = False
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 3301a8455698b..c7e40464cd406 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -93,6 +93,7 @@ def __init__(
         self.randomness_seeds = []
         self.name_to_buffer = {}
         self.creation_time = time.time()
+        self.name = "GraphLowering"
         self._can_use_cpp_wrapper = config.cpp_wrapper
         self.graph_id = graph_id
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 56d5e8ef1d4c7..7a182752bb16a 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3952,6 +3952,8 @@ def add_index(expr, category, buf_name=None):
             )
 
         class CaptureIndexing(V.WrapperHandler):
+            self.name = "CaptureIndexing"
+
             def load(self, name: str, index: sympy.Expr):
                 index = add_index(index, "reads", name)
                 return self._inner.load(name, index)
@@ -4034,6 +4036,7 @@ def __init__(self):
                 self.garbage_collect_values = False
                 self.env = {}
                 self.fetch_attr = submodules.__getitem__
+                self.name = V.get_ops_handler().name
 
         return InterpreterShim().run(V.get_ops_handler())
 
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index fda61b0933574..7997d5fde09fa 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -596,6 +596,7 @@ class SimplifyIndexing(V.WrapperHandler):  # type: ignore[name-defined]
 
     def __init__(self, inner, var_ranges: VarRanges):
         super().__init__(inner)
+        self.name = "SimplifyIndexing"
         self._simplify: Callable[
             [Expr], Expr
         ] = lambda index: V.graph.sizevars.simplify_with_ranges(index, var_ranges)
diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py
index 27e60b1daf1df..cff6770997371 100644
--- a/torch/_inductor/virtualized.py
+++ b/torch/_inductor/virtualized.py
@@ -57,6 +57,9 @@ def _arg_str(a):
 
 class MockHandler:
     def __getattr__(self, name):
+        if name == "name":
+            return "MockHandler"
+
         def inner(*args, **kwargs):
             fargs = [_arg_str(a) for a in args]
             fargs.extend(f"{k}={v}" for k, v in kwargs.items())
diff --git a/torch/fx/config.py b/torch/fx/config.py
new file mode 100644
index 0000000000000..da5120d6edf18
--- /dev/null
+++ b/torch/fx/config.py
@@ -0,0 +1,6 @@
+# Whether to disable showing progress on compilation passes
+# Need to add a new config otherwise wil get a circular import if dynamo config is imported here
+disable_progress = True
+
+# If True this also shows the node names in each pass, for small models this is great but larger models it's quite noisy
+verbose_progress = False
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index 6428d4c5c3bb5..683a6bd90b501 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -4,10 +4,12 @@
 from .proxy import Proxy
 from ._symbolic_trace import Tracer
 from ._compatibility import compatibility
+from . import config
 import torch.fx.traceback as fx_traceback
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 import inspect
 from contextlib import contextmanager
+from torch.hub import tqdm
 
 __all__ = ['Interpreter', 'Transformer']
 
@@ -72,7 +74,7 @@ def __init__(self, module : GraphModule, garbage_collect_values : bool = True):
         self.module = module
         self.submodules = dict(self.module.named_modules())
         self.env : Dict[Node, Any] = {}
-
+        self.name = "Interpreter"
         self.garbage_collect_values = garbage_collect_values
 
         if self.garbage_collect_values:
@@ -118,7 +120,9 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None, enable_io_p
             args = self.module.graph.process_inputs(*args)
         self.args_iter : Iterator[Any] = iter(args)
 
-        for node in self.module.graph.nodes:
+        for node in tqdm(self.module.graph.nodes,
+                         desc=f"{self.name}: {str(list(self.module.graph.nodes)) if config.verbose_progress else ''}",
+                         initial=1, position=0, leave=True, disable=config.disable_progress, delay=15):
             if node in self.env:
                 # Short circuit if we have this value. This could
                 # be used, for example, for partial evaluation

From 2794904d08998220264e10d5d4f41251d74af15f Mon Sep 17 00:00:00 2001
From: Pearu Peterson <pearu.peterson@gmail.com>
Date: Fri, 2 Dec 2022 11:17:13 +0200
Subject: [PATCH 1523/1922] Update to_sparse docs regarding the layout and
 blocksize kw arguments. (#89912)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89912
Approved by: https://github.com/cpuhrsch
---
 torch/_tensor_docs.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 torch/_torch_docs.py  |  5 +++++
 2 files changed, 47 insertions(+)

diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index 726ae5137e6a4..7b6a8870d8497 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -5378,6 +5378,48 @@ def callable(a, b) -> number
     tensor(indices=tensor([[1]]),
            values=tensor([[ 9,  0, 10]]),
            size=(3, 3), nnz=1, layout=torch.sparse_coo)
+
+.. method:: to_sparse(*, layout=None, blocksize=None) -> Tensor
+   :noindex:
+
+Returns a sparse tensor with the specified layout and blocksize.
+
+.. note:: If the :attr:`self` layout and blocksize parameters match
+          with the specified layout and blocksize, return
+          :attr:`self`. Otherwise, return a sparse tensor copy of
+          :attr:`self`.
+
+Args:
+
+    layout (:class:`torch.layout`, optional): The desired sparse
+      layout. One of ``torch.sparse_coo``, ``torch.sparse_csr``,
+      ``torch.sparse_csc``, ``torch.sparse_bsr``, or
+      ``torch.sparse_bsc``. Default: if ``None``,
+      ``torch.sparse_coo``.
+
+    blocksize (list, tuple, :class:`torch.Size`, optional): Block size
+      of the resulting BSR or BSC tensor. For other layouts,
+      specifying the block size that is not ``None`` will result in a
+      RuntimeError exception.  A block size must be a tuple of length
+      two such that its items evenly divide the two sparse dimensions.
+
+Example::
+
+    >>> x = torch.tensor([[1, 0], [0, 0], [2, 3]])
+    >>> x.to_sparse(layout=torch.sparse_coo)
+    tensor(indices=tensor([[0, 2, 2],
+                           [0, 0, 1]]),
+           values=tensor([1, 2, 3]),
+           size=(3, 2), nnz=3, layout=torch.sparse_coo)
+    >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(1, 2))
+    tensor(crow_indices=tensor([0, 1, 1, 2]),
+           col_indices=tensor([0, 0]),
+           values=tensor([[[1, 0]],
+                          [[2, 3]]]), size=(3, 2), nnz=2, layout=torch.sparse_bsr)
+    >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(2, 1))
+    RuntimeError: Tensor size(-2) 3 needs to be divisible by blocksize[0] 2
+    >>> x.to_sparse(layout=torch.sparse_csr, blocksize=(3, 1))
+    RuntimeError: to_sparse for Strided to SparseCsr conversion does not use specified blocksize
 """,
 )
 
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index af848690de50f..75806a8cabf3f 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -9677,6 +9677,11 @@ def merge_dicts(*dicts):
 Slices the :attr:`input` tensor along the selected dimension at the given index.
 This function returns a view of the original tensor with the given dimension removed.
 
+.. note:: If :attr:`input` is a sparse tensor and returning a view of
+          the tensor is not possible, a RuntimeError exception is
+          raised. In this is the case, consider using
+          :func:`torch.select_copy` function.
+
 Args:
     {input}
     dim (int): the dimension to slice

From caf1aa4c1ef260c982df1d02260efe9770b304cd Mon Sep 17 00:00:00 2001
From: Pearu Peterson <pearu.peterson@gmail.com>
Date: Wed, 30 Nov 2022 00:49:36 +0200
Subject: [PATCH 1524/1922] Fix gradcheck for CSR and CSC inputs. (#89786)

Partially fix-es https://github.com/pytorch/pytorch/issues/87085

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89786
Approved by: https://github.com/albanD
---
 test/test_autograd.py       | 18 +++++++++--
 test/test_overrides.py      |  1 -
 torch/autograd/gradcheck.py | 60 ++++++++++++++++++++++++++++---------
 3 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 44a5c7ccc4509..b5bbec6e8e462 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -4148,20 +4148,32 @@ def fn(sparse):
         check(fast_mode=True)
         check(fast_mode=False)
 
-    @unittest.expectedFailure
     def test_gradcheck_sparse_csr_input(self):
         def check(fast_mode):
             def fn(sparse_csr):
                 return torch.clone(sparse_csr).to_dense()
 
-            # Fails because gradcheck can't work with sparse csr inputs yet
             gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csr().requires_grad_(True), check_sparse_nnz=True,
                       check_batched_grad=False, fast_mode=fast_mode)
 
             with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
                 gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csr().requires_grad_(True), check_sparse_nnz=False,
                           check_batched_grad=False, fast_mode=fast_mode)
-        # check(fast_mode=True) # Segmentation fault
+        # check(fast_mode=True) # RuntimeError: sparse_mask_sparse_csr expects self to be 2D
+        check(fast_mode=False)
+
+    def test_gradcheck_sparse_csc_input(self):
+        def check(fast_mode):
+            def fn(sparse_csc):
+                return torch.clone(sparse_csc).to_dense()
+
+            gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csc().requires_grad_(True), check_sparse_nnz=True,
+                      check_batched_grad=False, fast_mode=fast_mode)
+
+            with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
+                gradcheck(fn, torch.rand(2, 2, dtype=torch.double).to_sparse_csc().requires_grad_(True), check_sparse_nnz=False,
+                          check_batched_grad=False, fast_mode=fast_mode)
+        # check(fast_mode=True) # RuntimeError: Expected result Tensor to be of format CSR
         check(fast_mode=False)
 
     def test_gradcheck_nondeterministic(self):
diff --git a/test/test_overrides.py b/test/test_overrides.py
index 7082f75a2141f..629cd2a106806 100644
--- a/test/test_overrides.py
+++ b/test/test_overrides.py
@@ -901,7 +901,6 @@ def run_test(fast_mode):
                 'dtype',
                 'is_floating_point',
                 'is_sparse',
-                'is_sparse_csr',
                 'layout',
                 'new_zeros',
                 'numel',
diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py
index 46d4f370a99ae..bc9274eaefaf4 100644
--- a/torch/autograd/gradcheck.py
+++ b/torch/autograd/gradcheck.py
@@ -20,6 +20,15 @@ class GradcheckError(RuntimeError):
     pass
 
 
+
+def _is_sparse_compressed_tensor(obj: torch.Tensor):
+    return obj.layout in {torch.sparse_csr, torch.sparse_csc, torch.sparse_bsr, torch.sparse_bsc}
+
+
+def _is_sparse_any_tensor(obj: torch.Tensor):
+    return _is_sparse_compressed_tensor(obj) or obj.layout is torch.sparse_coo
+
+
 def _is_float_or_complex_tensor(obj):
     return is_tensor_like(obj) and (obj.is_floating_point() or obj.is_complex())
 
@@ -80,7 +89,7 @@ def _iter_tensor(x_tensor):
     #
     # where x is the t.data of the original tensor. Perturbing the entry of x
     # at index (1, 1) yields the 3rd column of the overall Jacobian matrix.
-    if x_tensor.is_sparse:
+    if _is_sparse_any_tensor(x_tensor):
         def get_stride(size):
             dim = len(size)
             tmp = 1
@@ -91,8 +100,17 @@ def get_stride(size):
             return stride
         x_nnz = x_tensor._nnz()
         x_size = list(x_tensor.size())
-        x_indices = x_tensor._indices().t()
-        x_values = x_tensor._values()
+        if x_tensor.layout is torch.sparse_coo:
+            x_indices = x_tensor._indices().t()
+            x_values = x_tensor._values()
+        elif x_tensor.layout is torch.sparse_csr:
+            x_indices = torch._convert_indices_from_csr_to_coo(x_tensor.crow_indices(), x_tensor.col_indices()).t()
+            x_values = x_tensor.values()
+        elif x_tensor.layout is torch.sparse_csc:
+            x_indices = torch._convert_indices_from_csr_to_coo(x_tensor.ccol_indices(), x_tensor.row_indices(), transpose=True).t()
+            x_values = x_tensor.values()
+        else:
+            raise NotImplementedError(f'_iter_tensor for {x_tensor.layout} input')
         x_stride = get_stride(x_size)
         # Use .data here to get around the version check
         x_values = x_values.data
@@ -249,7 +267,7 @@ def _prepare_input(input: torch.Tensor, maybe_perturbed_input: Optional[torch.Te
             return maybe_perturbed_input.to_mkldnn()
         else:
             return input
-    elif input.layout == torch.sparse_coo:
+    elif _is_sparse_any_tensor(input):
         if fast_mode and maybe_perturbed_input is not None:
             # entry is already a "cloned" version of the original tensor
             # thus changes to entry are not reflected in the input
@@ -386,7 +404,7 @@ def _get_input_to_perturb(input):
     if input.layout == torch._mkldnn:  # type: ignore[attr-defined] # no attr _mkldnn
         # Convert to dense so we can perform operations that require strided tensors
         input_to_perturb = input.to_dense()
-    elif input.layout == torch.sparse_coo:
+    elif _is_sparse_any_tensor(input):
         # Clone because input may require grad, and copy_ calls resize_,
         # which is not allowed for .data
         input_to_perturb = input.clone()
@@ -414,10 +432,10 @@ def jvp_fn(delta):
 def _reshape_tensor_or_tuple(u, shape):
     # We don't need to reshape when input corresponding to u is sparse
     if isinstance(u, tuple):
-        if u[0].layout != torch.sparse_coo:
+        if not _is_sparse_any_tensor(u[0]):
             return (u[0].reshape(shape), u[1].reshape(shape))
     else:
-        if u.layout != torch.sparse_coo:
+        if not _is_sparse_any_tensor(u):
             return u.reshape(shape)
     return u
 
@@ -642,7 +660,7 @@ def _get_analytical_vjps_wrt_specific_output(vjp_fn, sample_output, v) -> List[L
 
 
 def _check_inputs(tupled_inputs, check_sparse_nnz) -> bool:
-    if not check_sparse_nnz and any(t.is_sparse or t.is_sparse_csr for t in tupled_inputs if isinstance(t, torch.Tensor)):
+    if not check_sparse_nnz and any(_is_sparse_any_tensor(t) for t in tupled_inputs if isinstance(t, torch.Tensor)):
         raise GradcheckError('gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False.')
     # Make sure that gradients are saved for at least one input
     any_input_requiring_grad = False
@@ -656,7 +674,7 @@ def _check_inputs(tupled_inputs, check_sparse_nnz) -> bool:
                     'not of double precision floating point or complex. ')
             if inp.is_sparse:
                 content = inp._values()
-            elif inp.is_sparse_csr:
+            elif _is_sparse_compressed_tensor(inp):
                 content = inp.values()
             else:
                 content = inp
@@ -679,7 +697,7 @@ def _check_inputs(tupled_inputs, check_sparse_nnz) -> bool:
 
 
 def _check_outputs(outputs) -> None:
-    if any(t.layout == torch.sparse_coo for t in outputs if isinstance(t, torch.Tensor)):
+    if any(_is_sparse_any_tensor(t) for t in outputs if isinstance(t, torch.Tensor)):
         # it is easier to call to_dense() on the sparse output than
         # to modify analytical jacobian
         raise ValueError('Sparse output is not supported at gradcheck yet. '
@@ -866,11 +884,12 @@ def _test_backward_mul_by_grad_output(outputs, inputs, check_sparse_nnz) -> bool
         if isinstance(gi, torch.Tensor) and gi.layout != torch.strided:
             if gi.layout != di.layout:
                 raise GradcheckError('grad is incorrect layout (' + str(gi.layout) + ' is not ' + str(di.layout) + ')')
-            if gi.layout == torch.sparse_coo:
+            if _is_sparse_any_tensor(gi):
+                sparse_kind = str(gi.layout).replace('torch.', '').replace('_coo', '')
                 if gi.sparse_dim() != di.sparse_dim():
-                    raise GradcheckError('grad is sparse tensor, but has incorrect sparse_dim')
+                    raise GradcheckError(f'grad is {sparse_kind} tensor, but has incorrect sparse_dim')
                 if gi.dense_dim() != di.dense_dim():
-                    raise GradcheckError('grad is sparse tensor, but has incorrect dense_dim')
+                    raise GradcheckError(f'grad is {sparse_kind} tensor, but has incorrect dense_dim')
             gi = gi.to_dense()
             di = di.to_dense()
 
@@ -1167,6 +1186,18 @@ def _vec_from_tensor(x, generator, downcast_complex=False):
             .view(x_values.shape)
         values /= values.norm()
         vec = torch.sparse_coo_tensor(x._indices(), values, x.size())
+    elif _is_sparse_compressed_tensor(x):
+        if x.layout in {torch.sparse_csr, torch.sparse_bsr}:
+            compressed_indices, plain_indices = x.crow_indices(), x.col_indices()
+        else:
+            compressed_indices, plain_indices = x.ccol_indices(), x.row_indices()
+        x_values = x.values()
+        dtype = _to_real_dtype(x.dtype) if downcast_complex else x.dtype
+        values = torch.rand(x_values.numel(), generator=generator) \
+            .to(dtype=dtype, device=x.device) \
+            .view(x_values.shape)
+        values /= values.norm()
+        vec = torch.sparse_compressed_tensor(compressed_indices, plain_indices, values, x.size(), layout=x.layout)
     else:
         dtype = _to_real_dtype(x.dtype) if downcast_complex else x.dtype
         vec = torch.rand(x.numel(), generator=generator).to(dtype=dtype, device=x.device)
@@ -1189,6 +1220,7 @@ def _adjusted_atol(atol, u, v):
     # matrix): v^T M u = \sum_{i} \sum_{j} u_i * v_j = (\sum_{i} u_i)(\sum_{i} v_i)
     # TODO: properly handle case when u is tuple instead of only taking first element
     u = u[0] if isinstance(u, tuple) else u
+    # TODO: replace torch.sparse.sum(u) with u.sum()
     sum_u = torch.sparse.sum(u) if u.layout == torch.sparse_coo else u.sum()
     sum_v = 1. if v is None else torch.sparse.sum(v) if v.layout == torch.sparse_coo else v.sum()
     return atol * float(sum_u) * float(sum_v)
@@ -1241,7 +1273,7 @@ def new_fn(inp):
 
 
 def _to_flat_dense_if_sparse(tensor):
-    if tensor.layout == torch.sparse_coo:
+    if _is_sparse_any_tensor(tensor):
         return tensor.to_dense().reshape(-1)
     else:
         return tensor

From cbd64e66eb9d92caadc604c86b5fa47432f04bfc Mon Sep 17 00:00:00 2001
From: Jean Schmidt <contato@jschmidt.me>
Date: Fri, 2 Dec 2022 13:27:57 +0000
Subject: [PATCH 1525/1922] =?UTF-8?q?Reland=20"Dynamo,=20FX,=20Inductor=20?=
 =?UTF-8?q?Progress=20Bars=20(#88384)"=20=E2=80=A6=20(#90055)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit had inconsistent internal land and pr merged. This caused merge conflicts that required revert in both places, normalize the internal commit stack, and then re-land properly.

Original commit: #88384 (011452a2a1c745d4b12f83f89eca039f482d134b)
Inconsistent revert: #90018 (8566aa7c0b4bdca50bf85ca14705b4304de030b3)
Revert of the inconsistent revert to restore healthy state (or re-land of the original commit): cf3c3f22804be6909e54fc09e07f891ab0886774
Landing the correct, internally congruent revert of the original commit: (This PR) #90055 (TBD)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90055
Approved by: https://github.com/DanilBaibak, https://github.com/malfet
---
 torch/_dynamo/logging.py                | 21 ---------------------
 torch/_dynamo/optimizations/analysis.py |  1 -
 torch/_inductor/codecache.py            | 22 ++--------------------
 torch/_inductor/codegen/common.py       |  2 --
 torch/_inductor/config.py               |  6 ------
 torch/_inductor/graph.py                |  1 -
 torch/_inductor/ir.py                   |  3 ---
 torch/_inductor/sizevars.py             |  1 -
 torch/_inductor/virtualized.py          |  3 ---
 torch/fx/config.py                      |  6 ------
 torch/fx/interpreter.py                 |  8 ++------
 11 files changed, 4 insertions(+), 70 deletions(-)
 delete mode 100644 torch/fx/config.py

diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py
index b2fa67fbdf6ae..95ee727f1ddf1 100644
--- a/torch/_dynamo/logging.py
+++ b/torch/_dynamo/logging.py
@@ -2,15 +2,10 @@
 import logging
 import os
 
-from torch.hub import tqdm
-
 # logging level for dynamo generated graphs/bytecode/guards
 logging.CODE = 15
 logging.addLevelName(logging.CODE, "CODE")
 
-# Disable progress bar by default, not in dynamo config because otherwise get a circular import
-disable_progress = True
-
 
 # Return all loggers that torchdynamo/torchinductor is responsible for
 def get_loggers():
@@ -83,24 +78,8 @@ def init_logging(log_level, log_file_name=None):
 
 _step_counter = itertools.count(1)
 
-# Update num_steps if more phases are added: Dynamo, AOT, Backend
-# This is very inductor centric
-# _inductor.utils.has_triton() gives a circular import error here
-
-if not disable_progress:
-    try:
-        import triton  # noqa: F401
-
-        num_steps = 3
-    except ImportError:
-        num_steps = 2
-    pbar = tqdm(total=num_steps, desc="torch.compile()", delay=15)
-
 
 def get_step_logger(logger):
-    if not disable_progress:
-        pbar.set_postfix_str(f"{logger.name}")
-        pbar.update(1)
     step = next(_step_counter)
 
     def log(level, msg):
diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index f732fb322438f..d83e57fdca6e2 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -21,7 +21,6 @@ def __init__(self, *args, **kwargs):
         self.input_alias_groups = set()
         self.storage_to_alias_group = dict()
         self.make_alias_group = itertools.count(1)
-        self.name = "ShapeAliasingAndMutation"
 
     def tensor_alias_group(self, value: torch.Tensor):
         """Assign a unique identifier to the storage of a given tensor"""
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 1569f953d6120..f4297732b38cd 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -21,8 +21,6 @@
 from typing import Any, Callable, Dict, List
 
 import torch
-
-from torch.hub import tqdm
 from torch.utils import cpp_extension
 from . import config, cuda_properties, exc
 
@@ -597,7 +595,7 @@ def warm_pool(cls):
         if hasattr(pool, "_start_queue_management_thread"):
             pool._start_queue_management_thread()
         else:
-            for _ in range(config.compile_threads):
+            for i in range(config.compile_threads):
                 pool._adjust_process_count()
             pool._start_executor_manager_thread()
         _compile_end()
@@ -638,26 +636,10 @@ def task():
         return self.submit(task)
 
     def wait(self, scope: Dict[str, Any]):
-        num_kernels = len(
-            [
-                value
-                for key, value in scope.items()
-                if isinstance(value, (Future, TritonFuture))
-            ]
-        )
-        pbar = tqdm(
-            total=num_kernels,
-            desc="Inductor Compilation",
-            disable=config.disable_progress,
-            delay=15,
-        )
         if config.compile_threads > 1:
-            for key, result in scope.items():
-                if config.verbose_progress:
-                    pbar.set_postfix_str(key)
+            for key, result in list(scope.items()):
                 if isinstance(result, (Future, TritonFuture)):
                     scope[key] = result.result()
-                    pbar.update(1)
 
         _compile_end()
 
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index c549de21e46ee..b1e710c0ef91e 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -590,8 +590,6 @@ def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
 
     def __enter__(self):
         class CSEProxy:
-            self.name = "CSEProxy"
-
             @staticmethod
             def __getattr__(name):
                 def inner(*args, **kwargs):
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index b338bbf1241ad..92666886a002d 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -5,12 +5,6 @@
 # add some debug printouts
 debug = False
 
-# Whether to disable a progress bar for autotuning
-disable_progress = True
-
-# Whether to enable printing the source code for each future
-verbose_progress = False
-
 # use cpp wrapper instead of python wrapper
 cpp_wrapper = False
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index c7e40464cd406..3301a8455698b 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -93,7 +93,6 @@ def __init__(
         self.randomness_seeds = []
         self.name_to_buffer = {}
         self.creation_time = time.time()
-        self.name = "GraphLowering"
         self._can_use_cpp_wrapper = config.cpp_wrapper
         self.graph_id = graph_id
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 7a182752bb16a..56d5e8ef1d4c7 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3952,8 +3952,6 @@ def add_index(expr, category, buf_name=None):
             )
 
         class CaptureIndexing(V.WrapperHandler):
-            self.name = "CaptureIndexing"
-
             def load(self, name: str, index: sympy.Expr):
                 index = add_index(index, "reads", name)
                 return self._inner.load(name, index)
@@ -4036,7 +4034,6 @@ def __init__(self):
                 self.garbage_collect_values = False
                 self.env = {}
                 self.fetch_attr = submodules.__getitem__
-                self.name = V.get_ops_handler().name
 
         return InterpreterShim().run(V.get_ops_handler())
 
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index 7997d5fde09fa..fda61b0933574 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -596,7 +596,6 @@ class SimplifyIndexing(V.WrapperHandler):  # type: ignore[name-defined]
 
     def __init__(self, inner, var_ranges: VarRanges):
         super().__init__(inner)
-        self.name = "SimplifyIndexing"
         self._simplify: Callable[
             [Expr], Expr
         ] = lambda index: V.graph.sizevars.simplify_with_ranges(index, var_ranges)
diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py
index cff6770997371..27e60b1daf1df 100644
--- a/torch/_inductor/virtualized.py
+++ b/torch/_inductor/virtualized.py
@@ -57,9 +57,6 @@ def _arg_str(a):
 
 class MockHandler:
     def __getattr__(self, name):
-        if name == "name":
-            return "MockHandler"
-
         def inner(*args, **kwargs):
             fargs = [_arg_str(a) for a in args]
             fargs.extend(f"{k}={v}" for k, v in kwargs.items())
diff --git a/torch/fx/config.py b/torch/fx/config.py
deleted file mode 100644
index da5120d6edf18..0000000000000
--- a/torch/fx/config.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# Whether to disable showing progress on compilation passes
-# Need to add a new config otherwise wil get a circular import if dynamo config is imported here
-disable_progress = True
-
-# If True this also shows the node names in each pass, for small models this is great but larger models it's quite noisy
-verbose_progress = False
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index 683a6bd90b501..6428d4c5c3bb5 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -4,12 +4,10 @@
 from .proxy import Proxy
 from ._symbolic_trace import Tracer
 from ._compatibility import compatibility
-from . import config
 import torch.fx.traceback as fx_traceback
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 import inspect
 from contextlib import contextmanager
-from torch.hub import tqdm
 
 __all__ = ['Interpreter', 'Transformer']
 
@@ -74,7 +72,7 @@ def __init__(self, module : GraphModule, garbage_collect_values : bool = True):
         self.module = module
         self.submodules = dict(self.module.named_modules())
         self.env : Dict[Node, Any] = {}
-        self.name = "Interpreter"
+
         self.garbage_collect_values = garbage_collect_values
 
         if self.garbage_collect_values:
@@ -120,9 +118,7 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None, enable_io_p
             args = self.module.graph.process_inputs(*args)
         self.args_iter : Iterator[Any] = iter(args)
 
-        for node in tqdm(self.module.graph.nodes,
-                         desc=f"{self.name}: {str(list(self.module.graph.nodes)) if config.verbose_progress else ''}",
-                         initial=1, position=0, leave=True, disable=config.disable_progress, delay=15):
+        for node in self.module.graph.nodes:
             if node in self.env:
                 # Short circuit if we have this value. This could
                 # be used, for example, for partial evaluation

From 70fcbb59971bbcc2946312f56907b6aba9b50662 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Fri, 2 Dec 2022 14:28:19 +0000
Subject: [PATCH 1526/1922] Fix binary iOS uploads (#90058)

curl on CircleCI MacOS runners does not support `--retry-all-errors`
Should fix https://app.circleci.com/pipelines/github/pytorch/pytorch/618606/workflows/6f104c19-3a3a-479d-a686-4961ddd87657/jobs/17233205
Yet another fallback of https://github.com/pytorch/pytorch/pull/89157

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90058
Approved by: https://github.com/jeanschmidt
---
 .circleci/scripts/binary_ios_upload.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/scripts/binary_ios_upload.sh b/.circleci/scripts/binary_ios_upload.sh
index 7949dc9170b0e..da38065847eff 100644
--- a/.circleci/scripts/binary_ios_upload.sh
+++ b/.circleci/scripts/binary_ios_upload.sh
@@ -47,7 +47,7 @@ echo "${IOS_NIGHTLY_BUILD_VERSION}" > version.txt
 zip -r ${ZIPFILE} install src version.txt LICENSE
 # upload to aws
 # Install conda then 'conda install' awscli
-curl --retry 3 --retry-all-errors -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
 chmod +x ~/conda.sh
 /bin/bash ~/conda.sh -b -p ~/anaconda
 export PATH="~/anaconda/bin:${PATH}"

From cc3dc67145d56158091057d49cb6c7dd80bdaf8e Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Fri, 2 Dec 2022 00:55:58 +0000
Subject: [PATCH 1527/1922] [Reland] dont clone args (#89766)

Reland of https://github.com/pytorch/pytorch/pull/89519.

Improves first memory compression on pytorch struct from .55 -> .73. However, it doesn't totally eliminate the overhead from autotuning because of the 250mb cache clearing in triton benchmarking.

Reland bc previously we weren't accounting for inplace buffer reuse correctly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89766
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py    | 36 ++++++++++++++++++++++++++
 torch/_inductor/codegen/triton.py      | 30 +++++++++++++++++++--
 torch/_inductor/scheduler.py           |  6 +++++
 torch/_inductor/triton_ops/autotune.py | 28 ++++++++++++++------
 4 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 36b42532a884f..def9b95d77c2d 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5275,6 +5275,41 @@ def forward(self, input: torch.Tensor):
 
             self.assertTrue(torch.allclose(module(input), traced(input)))
 
+        @patch.object(config.triton, "autotune", True)
+        def test_inplace_add_alpha_autotune(self):
+            def fn(x, y):
+                aten.add_.Tensor(x, y, alpha=0.55)
+                return (x,)
+
+            x1 = torch.zeros(2, 3, 4, 10, device="cuda")
+            x2 = torch.zeros(2, 3, 4, 10, device="cuda")
+            x3 = torch.zeros(2, 3, 4, 10, device="cuda")
+            y = torch.randn(2, 3, 4, 10, device="cuda").to(
+                memory_format=torch.channels_last
+            )
+            fn_fx = make_fx(fn)(x1, y)
+            fn_compiled = compile_fx_inner(fn_fx, [x1, y])
+            fn(x2, y)
+            fn_compiled([x3, y])
+            assert same(x2, x3)
+
+        @patch.object(config.triton, "autotune", True)
+        def test_inplace_buffer_autotune(self):
+            def foo(x, y, z):
+                a = x @ y
+                return a.unsqueeze(0).unsqueeze(0) + z
+
+            x = torch.zeros(5, 5, device="cuda")
+            y = torch.zeros(5, 5, device="cuda")
+            z = torch.zeros(1, 1, 5, 5, device="cuda").to(
+                memory_format=torch.channels_last
+            )
+            self.common(
+                foo,
+                (x, y, z),
+                check_lowp=False,
+            )
+
         def test_permute_linear_fusion(self):
             class TestModule(torch.nn.Module):
                 def __init__(self, k: int, n: int):
@@ -5584,6 +5619,7 @@ def decorator(fn):
                         meta=meta,
                         configs=configs,
                         save_cache_hook=False,
+                        mutated_arg_names=["in_out_ptr0"],
                     )
 
                 return decorator
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 6c7b852ef8288..02d96c3160b86 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -511,11 +511,18 @@ class TritonKernel(Kernel):
     overrides = TritonOverrides
     sexpr = texpr
 
-    def __init__(self, *groups, pid_cache=None, reduction_hint=ReductionHint.DEFAULT):
+    def __init__(
+        self,
+        *groups,
+        mutations=None,
+        pid_cache=None,
+        reduction_hint=ReductionHint.DEFAULT,
+    ):
         if pid_cache is None:
             pid_cache = {}
         super(TritonKernel, self).__init__()
         self.numels = [V.graph.sizevars.simplify(s) for s in groups]
+        self.mutations = mutations
         self.range_trees = []
         self.range_tree_nodes = {}
         self.iter_vars_count = itertools.count()
@@ -1013,10 +1020,21 @@ def codegen_kernel(self, name=None):
             )
 
         argdefs, _, signature = self.args.python_argdefs()
+
+        mutated_args = set()
+        for mutation in self.mutations:
+            if mutation in self.args.input_buffers:
+                mutated_args.add(self.args.input_buffers[mutation])
+            if mutation in self.args.inplace_buffers:
+                mutated_args.add(self.args.inplace_buffers[mutation].inner_name)
+            if mutation in self.args.output_buffers:
+                mutated_args.add(self.args.output_buffers[mutation])
+
         triton_meta = {
             "signature": dict(enumerate(map(signature_of, signature))),
             "device": V.graph.scheduler.current_device.index,
             "constants": {},
+            "mutated_arg_names": mutated_args,
         }
 
         for tree in self.range_trees:
@@ -1291,7 +1309,15 @@ def codegen_node_schedule(self, node_schedule, numel, reduction_numel):
                 reduction_hint_val = ReductionHint.DEFAULT
         else:
             reduction_hint_val = ReductionHint.DEFAULT
-        with TritonKernel(*tiled_groups, reduction_hint=reduction_hint_val) as kernel:
+
+        mutations = set()
+        for node in node_schedule:
+            if hasattr(node, "get_mutations"):
+                mutations.update(node.get_mutations())
+
+        with TritonKernel(
+            *tiled_groups, reduction_hint=reduction_hint_val, mutations=mutations
+        ) as kernel:
             stack = contextlib.ExitStack()
             for node in node_schedule:
                 if node not in (EnableReduction, DisableReduction):
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index 0bc74ca6410bd..b12a15ce2f6fd 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -347,6 +347,12 @@ def allocate(self):
                         V.kernel.args.make_inplace(
                             input_node.get_name(), self.get_name()
                         )
+                        # mutations not tracked in cpp kernels
+                        if isinstance(
+                            V.kernel, torch._inductor.codegen.triton.TritonKernel
+                        ):
+                            V.kernel.mutations.add(input_node.get_name())
+                            V.kernel.mutations.add(self.get_name())
                         return
         super().allocate()
 
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index 808241cd02a2f..285995c6254fa 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -42,11 +42,12 @@ class CachingAutotuner(KernelInterface):
     configs, and does not rely on the Triton JIT.
     """
 
-    def __init__(self, fn, meta, configs, save_cache_hook):
+    def __init__(self, fn, meta, configs, save_cache_hook, mutated_arg_names):
         super().__init__()
         self.fn = fn
         self.meta = meta
         self.save_cache_hook = save_cache_hook
+        self.mutated_arg_names = mutated_arg_names
         self.configs = configs
         self.launchers = []
         self.lock = threading.Lock()
@@ -141,12 +142,17 @@ def autotune_to_one_config(self, *args, **kwargs):
         """Do the actual autotuning"""
         from ..compile_fx import clone_preserve_strides
 
-        # clone the input args to avoid autotune contaminating them if
-        # the kernel does in-place stores
-        cloned_args = [
-            clone_preserve_strides(arg) if isinstance(arg, torch.Tensor) else arg
-            for arg in args
-        ]
+        # clone inplace buffers to avoid autotune contaminating them if
+        # the kernel does in-place stores. avoid cloning other buffers because
+        # it leads to increase memory use
+        cloned_args = []
+        for i, arg in enumerate(args):
+            if self.fn.arg_names[i] in self.mutated_arg_names:
+                assert isinstance(arg, torch.Tensor)
+                cloned_args.append(clone_preserve_strides(arg))
+            else:
+                cloned_args.append(arg)
+
         timings = {
             launcher: self.bench(launcher, *cloned_args, **kwargs)
             for launcher in self.launchers
@@ -251,9 +257,15 @@ def save_cache_hook(cfg):
     else:
         save_cache_hook = None
 
+    mutated_arg_names = meta.pop("mutated_arg_names", ())
+
     def decorator(fn):
         return CachingAutotuner(
-            fn, meta=meta, configs=configs, save_cache_hook=save_cache_hook
+            fn,
+            meta=meta,
+            configs=configs,
+            save_cache_hook=save_cache_hook,
+            mutated_arg_names=mutated_arg_names,
         )
 
     return decorator

From 629fbf0b33cd99ea4fd21683718ddbbaea5f2495 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 2 Dec 2022 18:19:40 +0000
Subject: [PATCH 1528/1922] Fix access to unitialized memory in VSX vector
 functions (#89833)

This results in e.g. failures in TestNNDeviceTypeCPU.test_groupnorm_nhwc_cpu_float32

So simply initialize the stack array with zeroes as expected and done in other implementations

Fixes #32502

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89833
Approved by: https://github.com/ezyang
---
 aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h | 2 +-
 aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h  | 2 +-
 aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h         | 2 +-
 aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h          | 2 +-
 aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h          | 2 +-
 aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h          | 2 +-
 aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h          | 2 +-
 aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h         | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
index cb8bb78597854..f5084f2101087 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
@@ -142,7 +142,7 @@ class Vectorized<ComplexDbl> {
           vec_vsx_ld(offset16, reinterpret_cast<const double*>(ptr))};
     }
 
-    __at_align__ value_type tmp_values[size()];
+    __at_align__ value_type tmp_values[size()] = {};
     std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
 
     return {
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
index 8445a31fb3d60..b4e35acfb480c 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
@@ -196,7 +196,7 @@ class Vectorized<ComplexFlt> {
           vec_vsx_ld(offset16, reinterpret_cast<const float*>(ptr))};
     }
 
-    __at_align__ value_type tmp_values[size()];
+    __at_align__ value_type tmp_values[size()] = {};
     std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
 
     return {
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h
index c53b7c792e471..810e79ebfe83d 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h
@@ -171,7 +171,7 @@ class Vectorized<double> {
           vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
     }
 
-    __at_align__ value_type tmp_values[size()];
+    __at_align__ value_type tmp_values[size()] = {};
     std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
 
     return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
index 8fe6cc25f0ee9..ac09531c4d2fa 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
@@ -180,7 +180,7 @@ class Vectorized<float> {
           vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
     }
 
-    __at_align__ value_type tmp_values[size()];
+    __at_align__ value_type tmp_values[size()] = {};
     std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
 
     return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
index 464a13c9f5f77..7c300c8087cff 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
@@ -269,7 +269,7 @@ class Vectorized<int16_t> {
           vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
     }
 
-    __at_align__ value_type tmp_values[size()];
+    __at_align__ value_type tmp_values[size()] = {};
     std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
 
     return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
index 6ef6147447d54..c98ab6215e620 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
@@ -199,7 +199,7 @@ class Vectorized<int32_t> {
           vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
     }
 
-    __at_align__ value_type tmp_values[size()];
+    __at_align__ value_type tmp_values[size()] = {};
     std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
 
     return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
index c0f1146d9d357..a4171026a2b99 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
@@ -148,7 +148,7 @@ class Vectorized<int64_t> {
               (vint64)vec_vsx_ld(offset16, dptr)};
     }
 
-    __at_align__ double tmp_values[size()];
+    __at_align__ double tmp_values[size()] = {};
     std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
 
     return {
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
index c3cec14a5b13e..a85730c9a6df8 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
@@ -81,7 +81,7 @@ struct Vectorized<c10::qint32> {
           vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
     }
 
-    __at_align__ value_type tmp_values[size()];
+    __at_align__ value_type tmp_values[size()] = {};
     std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
 
     return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};

From 5ea8f9bad6c220085e9b51180a6244626b5d2813 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 2 Dec 2022 18:57:43 +0000
Subject: [PATCH 1529/1922] [Dynamo] tensor.type() should return tensor types
 with CPU and GPU variants (#90021)

Fix errors from [7k github models](https://github.com/pytorch/torchdynamo/issues/1884)
```
Traceback (most recent call last):
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/utils.py", line 1062, in get_fake_value
    return wrap_fake_exception(
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/utils.py", line 739, in wrap_fake_exception
    return fn()
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/utils.py", line 1063, in <lambda>
    lambda: run_node(tx.output, node, args, kwargs, nnmodule)
  File "/scratch/ybliang/work/repos/pytorch/torch/_dynamo/utils.py", line 1112, in run_node
    raise RuntimeError(
RuntimeError: Failed running call_function <function einsum at 0x7fd8f246a4c0>(*('i,j->ij', FakeTensor(FakeTensor(..., device='meta', size=(4,)), cpu), FakeTensor(FakeTensor(..., device='meta', size=(2,)), cuda:0)), **{}):
Unhandled FakeTensor Device Propagation for aten.mul.Tensor, found two different devices cpu, cuda:0
(scroll up for backtrace)
```

The root cause is: ```tensor.type()``` should return ```torch.cuda.FloatTensor``` rather than ```torch.FloatTensor``` if it's on GPU.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90021
Approved by: https://github.com/jansel
---
 test/dynamo/test_functions.py     |  7 +++++++
 torch/_dynamo/variables/tensor.py | 18 ++++++++++++++----
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py
index 327fa64f1209f..dd15a1562dbd2 100644
--- a/test/dynamo/test_functions.py
+++ b/test/dynamo/test_functions.py
@@ -5,6 +5,7 @@
 import inspect
 import itertools
 import operator
+import unittest
 from typing import Any
 from unittest.mock import patch
 
@@ -340,6 +341,12 @@ def test_tensor_type(a, b):
         m = a.to(torch.float16)
         return b.type(m.type())
 
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    @make_test
+    def test_tensor_type2(a, b):
+        m = a.to("cuda")
+        return m + b.type(m.type())
+
     @make_test
     def test_ndim(x):
         if x.ndim == 2 and x.ndimension() == 2 and x.dim() == 2:
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index 9626ab8ae082d..edf7d16745734 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -224,13 +224,23 @@ def call_method(
             constant_result = ConstantVariable(
                 memory_format in self.is_contiguous, **options
             )
-        elif name == "type" and self.dtype is not None and len(args) == 0:
+        elif (
+            name == "type"
+            and self.dtype is not None
+            and len(args) == 0
+            and isinstance(self.device, torch.device)
+        ):
             tensortype = [k for k, v in tensortype_to_dtype.items() if self.dtype in v][
                 0
             ]
-            constant_result = ConstantVariable(
-                f"torch.{tensortype.__name__}", **options
-            )
+            if self.device.type == "cuda":
+                constant_result = ConstantVariable(
+                    f"torch.cuda.{tensortype.__name__}", **options
+                )
+            else:
+                constant_result = ConstantVariable(
+                    f"torch.{tensortype.__name__}", **options
+                )
         elif name == "get_device" and isinstance(self.device, torch.device):
             index = self.device.index if self.device.type != "cpu" else -1
             constant_result = ConstantVariable(index, **options)

From e8582a3d497833479e8401ad2da50e35384ae440 Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Fri, 2 Dec 2022 19:46:22 +0000
Subject: [PATCH 1530/1922] Call _sdp_attention  in nn.functional.mha (#89470)

# Summary
Replaces the the inline block of code in nn.funcitonal.mha with `_scaled_dot_product_attention`. This function allows the fused kernels to be called if all the required input conditions are met.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89470
Approved by: https://github.com/cpuhrsch, https://github.com/mikekgfb
---
 .../ATen/native/transformers/attention.cpp    |  9 +++----
 .../ATen/native/transformers/cuda/sdp_utils.h | 26 +++++++++++++++++--
 c10/core/SymFloat.cpp                         | 10 +++++++
 c10/core/SymFloat.h                           |  3 +++
 test/onnx/test_models_onnxruntime.py          |  1 +
 torch/nn/functional.py                        | 21 ++++++++-------
 6 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/aten/src/ATen/native/transformers/attention.cpp b/aten/src/ATen/native/transformers/attention.cpp
index 9c5be12ef24db..06ea49bb516c4 100644
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@@ -9,7 +9,6 @@
 #include <ATen/native/transformers/attention.h>
 #include <ATen/native/transformers/sdp_utils_cpp.h>
 
-
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/NativeFunctions.h>
 #else
@@ -741,10 +740,10 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
   }
     auto attn_mask = attn_mask_;
     // Naive, composite implementation defined here.
-    const auto embed_size = query_.size(-1);
 
     // Scale q,k before matmul for stability see https://tinyurl.com/sudb9s96 for math
-    const double scaling_factor = ::sqrt(::sqrt(static_cast<double>(embed_size)));
+    const auto embed_size = SymFloat(query_.sym_size(-1));
+    const auto scaling_factor = embed_size.sqrt().sqrt();
     const auto query = query_ / scaling_factor;
     if (is_causal) {
         TORCH_CHECK(!attn_mask.has_value(),
@@ -753,8 +752,8 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math(
                 "_scaled_dot_product_attention: Nested tensors for query / key are not supported when is_causal=True");
 
         // Replace attn_mask with causal mask; lower triangular elements take part in attention.
-        const auto L = query.size(-2), S = key.size(-2);
-        attn_mask = at::ones({L, S}, query.options().dtype(at::kBool)).tril();
+        const auto L = query.sym_size(-2), S = key.sym_size(-2);
+        attn_mask = at::ones_symint({L, S}, query.options().dtype(at::kBool)).tril();
     }
     if (attn_mask.has_value()) {
         TORCH_CHECK(!query.is_nested() && !key.is_nested(),
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.h b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
index 55e9aeb184a22..2b57ef6dd6f6c 100644
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.h
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.h
@@ -40,7 +40,9 @@ inline bool check_tensor_dtype(
          allowed_dtypes.end()))) {
     TORCH_CHECK(
         !debug,
-        "Expected query, key and value to be of dtype float16 or bfloat16 but got Query dtype: ",
+        "Expected query, key and value to all be of dtype: {",
+        c10::Join(", ", allowed_dtypes), "}. Got ",
+        "Query dtype: ",
         params.query.dtype(),
         ", Key dtype: ",
         params.key.dtype(),
@@ -162,6 +164,25 @@ inline bool check_head_dim_size(sdp_params params, bool debug) {
   return true;
 }
 
+inline bool check_head_dim_size_mem_efficient(sdp_params params, bool debug) {
+  const int64_t query_size_last = params.query.size(-1);
+  if (!(query_size_last == params.key.size(-1) &&
+        query_size_last == params.value.size(-1) && query_size_last >= 8)) {
+    TORCH_CHECK(
+        !debug,
+        "Mem efficient attention requires last dimension of inputs to be >= 8.",
+        "Got Query.size(-1): ",
+        query_size_last,
+        ", Key.size(-1): ",
+        params.key.size(-1),
+        ", Value.size(-1): ",
+        params.value.size(-1),
+        " instead.");
+    return false;
+  }
+  return true;
+}
+
 inline bool check_runtime_disabled_flash(sdp_params params, bool debug) {
   // We check the global context to see if user has explicitly turned of flash
   // sdp kernels
@@ -259,13 +280,14 @@ inline bool use_mem_efficient_attention(sdp_params params, bool debug) {
       at::kHalf, at::kFloat, at::kBFloat16};
 
   //  Define gate functions that determine if a flash kernel can be ran
-  constexpr std::array<bool(*)(sdp_params, bool), 8> constraints{{
+  constexpr std::array<bool(*)(sdp_params, bool), 9> constraints{{
       check_gpu_sm50_or_greater,
       check_runtime_disabled_mem_efficient,
       check_requires_grad_and_nested,
       check_for_attn_weights,
       check_tensor_shapes,
       check_for_attn_mask,
+      check_head_dim_size_mem_efficient,
       check_for_seq_len_1_nested_tensor,
       check_for_non_zero_dropout}};
   for (auto& constraint : constraints) {
diff --git a/c10/core/SymFloat.cpp b/c10/core/SymFloat.cpp
index 511c50e3398ee..161313c777dda 100644
--- a/c10/core/SymFloat.cpp
+++ b/c10/core/SymFloat.cpp
@@ -1,6 +1,7 @@
 #include <c10/core/SymFloat.h>
 #include <c10/core/SymNodeImpl.h>
 #include <array>
+#include <cmath>
 #include <utility>
 
 namespace c10 {
@@ -70,6 +71,15 @@ std::ostream& operator<<(std::ostream& os, const SymFloat& s) {
   return os;
 }
 
+SymFloat SymFloat::sqrt() const {
+  if (!is_symbolic()) {
+    return SymFloat(std::sqrt(data_));
+  }
+  auto other = SymFloat(-0.5);
+  auto res = normalize_symfloats(*this, other);
+  return SymFloat(res[0]->pow(res[1]));
+}
+
 double SymFloat::guard_float(const char* file, int64_t line) const {
   if (!is_symbolic()) {
     return data_;
diff --git a/c10/core/SymFloat.h b/c10/core/SymFloat.h
index ff9e101e31afb..50512dc6fb206 100644
--- a/c10/core/SymFloat.h
+++ b/c10/core/SymFloat.h
@@ -40,6 +40,9 @@ class C10_API SymFloat {
   SymFloat operator*(const SymFloat&) const;
   SymFloat operator/(const SymFloat&) const;
 
+  // Need guidance on where to put this code
+  SymFloat sqrt() const;
+
   // Insert a guard for the float to be its concrete value, and then return
   // that value.  This operation always works, even if the float is symbolic,
   // so long as we know what the underlying value is. Don't blindly put this
diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py
index de1003ce449e0..4b7bdb58ae514 100644
--- a/test/onnx/test_models_onnxruntime.py
+++ b/test/onnx/test_models_onnxruntime.py
@@ -394,6 +394,7 @@ def forward(self, images, features: Mapping[str, torch.Tensor]):
         )
 
     @skipScriptTest()  # TODO: #75625
+    @skipIfUnsupportedMinOpsetVersion(20)
     def test_transformer_encoder(self):
         class MyModule(torch.nn.Module):
             def __init__(self, ninp, nhead, nhid, dropout, nlayers):
diff --git a/torch/nn/functional.py b/torch/nn/functional.py
index a1a102d786f16..7b8324c7aa849 100644
--- a/torch/nn/functional.py
+++ b/torch/nn/functional.py
@@ -5173,19 +5173,20 @@ def multi_head_attention_forward(
     # (deep breath) calculate attention and out projection
     #
 
-    B, Nt, E = q.shape
-    q_scaled = q / math.sqrt(E)
     if attn_mask is not None:
-        attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
-    else:
-        attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
-    attn_output_weights = softmax(attn_output_weights, dim=-1)
-    if dropout_p > 0.0:
-        attn_output_weights = dropout(attn_output_weights, p=dropout_p)
+        if attn_mask.size(0) == 1:
+            attn_mask = attn_mask.unsqueeze(0)
+        else:
+            attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
+
+    q = q.view(bsz, num_heads, tgt_len, head_dim)
+    k = k.view(bsz, num_heads, src_len, head_dim)
+    v = v.view(bsz, num_heads, src_len, head_dim)
 
-    attn_output = torch.bmm(attn_output_weights, v)
+    attn_output, attn_output_weights = _scaled_dot_product_attention(
+        q, k, v, attn_mask, dropout_p, need_weights, False)
+    attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
 
-    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
     attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
     attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
 

From d618a27cc0cf77f17ca875f9a04903ee847da77b Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 2 Dec 2022 20:14:11 +0000
Subject: [PATCH 1531/1922] Revert "[follow-up] Python Attr Serialization
 (#88913)"

This reverts commit 086b251f9aeceaad95059de860ae81fd06526533.

Reverted https://github.com/pytorch/pytorch/pull/88913 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally
---
 test/test_serialization.py |  6 +++++-
 torch/_utils.py            |  2 ++
 torch/nn/parameter.py      | 14 +++-----------
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/test/test_serialization.py b/test/test_serialization.py
index b727ca2ebbbdf..cac5aee7407ad 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -948,7 +948,11 @@ def _test_save_load_attr(t):
 
         t = torch.zeros(3, 3)
         _test_save_load_attr(t)
-        _test_save_load_attr(torch.nn.Parameter(t))
+        # This should start failing once Parameter
+        # supports saving Python Attribute.
+        err_msg = "'Parameter' object has no attribute"
+        with self.assertRaisesRegex(AttributeError, err_msg):
+            _test_save_load_attr(torch.nn.Parameter(t))
 
     def test_weights_only_assert(self):
         class HelloWorld:
diff --git a/torch/_utils.py b/torch/_utils.py
index 89defb6bf78a5..1bf3cf96ad1ce 100644
--- a/torch/_utils.py
+++ b/torch/_utils.py
@@ -352,6 +352,8 @@ def _rebuild_parameter(data, requires_grad, backward_hooks):
     return param
 
 
+# TODO(kshitij12345): Support serializing nn.Parameter with Python Attributes.
+# NOTE: We are just defining it here now for future use.
 def _rebuild_parameter_with_state(data, requires_grad, backward_hooks, state):
     param = torch.nn.Parameter(data, requires_grad)
     # NB: This line exists only for backwards compatibility; the
diff --git a/torch/nn/parameter.py b/torch/nn/parameter.py
index e2100d782c6af..68908001238ec 100644
--- a/torch/nn/parameter.py
+++ b/torch/nn/parameter.py
@@ -60,19 +60,11 @@ def __repr__(self):
         return 'Parameter containing:\n' + super(Parameter, self).__repr__()
 
     def __reduce_ex__(self, proto):
-        state = torch._utils._get_obj_state(self)
-
+        # TODO(kshitij12345): Support saving Python Attribute
         # See Note [Don't serialize hooks]
-        hooks = OrderedDict()
-        if not state:
-            return (
-                torch._utils._rebuild_parameter,
-                (self.data, self.requires_grad, hooks)
-            )
-
         return (
-            torch._utils._rebuild_parameter_with_state,
-            (self.data, self.requires_grad, hooks, state)
+            torch._utils._rebuild_parameter,
+            (self.data, self.requires_grad, OrderedDict())
         )
 
     __torch_function__ = _disabled_torch_function_impl

From 307b87a37a76c571ec35b5a30a55fd8a0b97e4c9 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 2 Dec 2022 17:13:34 +0000
Subject: [PATCH 1532/1922] [FSDP] Fix `keep_low_precision_grads=True` for
 `use_orig_params=True` (#90027)

For any `flat_param.data = flat_param.to(...)` or `flat_param.grad.data = flat_param.grad.to(...)`, we must also refresh sharded parameter/gradient views, respectively, if the storage changes.

For `keep_low_precision_grads=True` and a sharded strategy, we cast the gradient back to the low precision using `.data` to bypass the PyTorch check that a parameter and its gradient have the same dtype. For `use_orig_params=True` before this PR, the gradient would incorrectly still be in full precision, not low precision, since we did not refresh views (this can actually be considered a memory leak since we have two copies of the gradient now, one in low precision and one in full precision). This PR refreshes the views.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90027
Approved by: https://github.com/mrshenli
---
 .../fsdp/test_fsdp_mixed_precision.py         | 15 +++++++---
 torch/distributed/fsdp/flat_param.py          | 28 +++++++++++++------
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_mixed_precision.py b/test/distributed/fsdp/test_fsdp_mixed_precision.py
index 4eef830ba13b4..9522f3a013420 100644
--- a/test/distributed/fsdp/test_fsdp_mixed_precision.py
+++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py
@@ -311,7 +311,9 @@ def _reduce_scatter_validate_mp(
 
         return orig_reduce_scatter(*args, **kwargs)
 
-    def _test_grads_reduced_precision(self, offload_params: bool):
+    def _test_grads_reduced_precision(
+        self, offload_params: bool, use_orig_params: bool
+    ):
         class MyModel(nn.Module):
             def __init__(self):
                 super().__init__()
@@ -331,6 +333,7 @@ def forward(self, x):
         fsdp_kwargs = {
             "mixed_precision": mp,
             "cpu_offload": CPUOffload(offload_params=offload_params),
+            "use_orig_params": use_orig_params,
         }
         m.lin1 = FSDP(m.lin1, **fsdp_kwargs)
         m = FSDP(m, **fsdp_kwargs)
@@ -338,7 +341,8 @@ def forward(self, x):
             inp = torch.ones(1, 10)
             m(inp).sum().backward()
             for param in m.parameters():
-                self.assertEqual(torch.float16, param.grad.dtype)
+                if param.grad is not None:
+                    self.assertEqual(torch.float16, param.grad.dtype)
 
         dist.barrier()
 
@@ -648,7 +652,10 @@ def test_mixed_precision_resnet(self):
     @skip_if_lt_x_gpu(2)
     def test_grads_reduced_precision(self):
         self.run_subtests(
-            {"offload_params": [False, True]},
+            {
+                "offload_params": [False, True],
+                "use_orig_params": [False, True],
+            },
             self._test_grads_reduced_precision,
         )
 
@@ -721,7 +728,7 @@ def world_size(self):
     @skip_if_lt_x_gpu(1)
     def test_grads_reduced_precision(self):
         self.run_subtests(
-            {"offload_params": [False, True]},
+            {"offload_params": [False, True], "use_orig_params": [False, True]},
             self._test_grads_reduced_precision,
         )
 
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 4f6a7a3695123..08350b25223c1 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -1092,11 +1092,12 @@ def prepare_gradient_for_optim(self):
         def cast_grad_to_param_dtype_if_needed(flat_param):
             if self._config.keep_low_precision_grads:
                 assert flat_param.grad is not None  # mypy
-                # This cast is meaningful when `param_dtype` is a low precision
-                # dtype.
-                flat_param.grad.data = flat_param.grad.to(
-                    self._config.low_prec_param_dtype
-                )
+                if flat_param.grad.dtype != self._config.low_prec_param_dtype:
+                    flat_param.grad.data = flat_param.grad.to(
+                        self._config.low_prec_param_dtype
+                    )
+                    if self._use_orig_params:
+                        self._use_sharded_grad_views()
 
         flat_param = self.flat_param
         # TODO (awgu): We should replace these conditional checks to encode
@@ -1511,9 +1512,20 @@ def _use_sharded_grad_views(self) -> None:
                 numel_in_shard = param_end - param_start + 1
                 assert flat_param._is_grad_none is not None  # mypy
                 if param.requires_grad and not flat_param._is_grad_none[i]:
-                    param.grad = grad[offset : offset + numel_in_shard].reshape(
-                        param.shape
-                    )
+                    if self._keep_low_precision_grads:
+                        # NOTE: This is a hack using `.data` to side step the
+                        # check that parameter/gradient dtypes match. Here,
+                        # `param` has full precision; `grad` has low precision.
+                        if param.grad is None:
+                            # `.grad` must have the same shape as `param`
+                            param.grad = torch.empty_like(param)
+                        param.grad.data = grad[
+                            offset : offset + numel_in_shard
+                        ].reshape(param.shape)
+                    else:
+                        param.grad = grad[offset : offset + numel_in_shard].reshape(
+                            param.shape
+                        )
                 else:
                     param.grad = None
                 offset += numel_in_shard

From 9b9a108fec855724b602134c046daa1c5bb0d405 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Fri, 2 Dec 2022 17:13:34 +0000
Subject: [PATCH 1533/1922] [FSDP] Fix `clip_grad_norm_()` for low prec grads
 (#90028)

For PyTorch FSDP, the only way that gradients are in low precision is if `keep_low_precision_grads=True` or if the user turns on AMP. This PR adds tests for the former and improves the documentation for `clip_grad_norm_()`, especially around these non-full-precision cases.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90028
Approved by: https://github.com/rohan-varma
---
 .../fsdp/test_fsdp_clip_grad_norm.py          | 64 ++++++++++++++++++-
 .../fsdp/fully_sharded_data_parallel.py       | 37 ++++++++---
 2 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
index 772ced8d18363..9a45c5bb3316d 100644
--- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
+++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -11,6 +11,7 @@
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
     CPUOffload,
     FullyShardedDataParallel as FSDP,
+    MixedPrecision,
 )
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
@@ -20,6 +21,7 @@
     CUDAInitMode,
     FSDPInitMode,
     FSDPTest,
+    NestedWrappedModule,
     TransformerWithSharedParams,
 )
 from torch.testing._internal.common_utils import (
@@ -71,7 +73,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
     def test_ddp_parity(self):
         """
         Tests FSDP with ``FullyShardedDataParallel.clip_grad_norm_()` against
-        DDP with ``torch.nn.utils.clip_grad_norm_()`.
+        DDP with ``torch.nn.utils.clip_grad_norm_()` when using full precision.
         """
         self.run_subtests(
             {
@@ -238,6 +240,66 @@ def _test_ddp_parity(
             ddp_optim.step()
             fsdp_optim.step()
 
+    @skip_if_lt_x_gpu(2)
+    def test_low_precision_grads(self):
+        """Tests ``clip_grad_norm_()`` when using low precision gradients."""
+        self.run_subtests(
+            {
+                "max_norm": [1, 2.5],
+                "norm_type": [1, 2, float("inf")],
+                "sharding_strategy": [
+                    ShardingStrategy.FULL_SHARD,
+                    ShardingStrategy.NO_SHARD,
+                ],
+                "use_orig_params": [False, True],
+            },
+            self._test_low_precision_grads,
+        )
+
+    def _test_low_precision_grads(
+        self,
+        max_norm: Union[float, int],
+        norm_type: Union[float, int],
+        sharding_strategy: ShardingStrategy,
+        use_orig_params: bool,
+    ):
+        fsdp_kwargs = {
+            "sharding_strategy": sharding_strategy,
+            "use_orig_params": use_orig_params,
+            "mixed_precision": MixedPrecision(
+                param_dtype=torch.float16,
+                reduce_dtype=torch.float16,
+                keep_low_precision_grads=True,
+            ),
+        }
+        fsdp_model = FSDP(
+            NestedWrappedModule.init(
+                self.process_group,
+                FSDPInitMode.RECURSIVE,
+                CUDAInitMode.CUDA_BEFORE,
+                deterministic=True,
+                fsdp_kwargs=fsdp_kwargs,
+            ),
+            **fsdp_kwargs,
+        )
+        inp = fsdp_model.module.get_input(torch.device("cuda"))
+        out = fsdp_model(*inp)
+        out.sum().backward()
+        for param in fsdp_model.parameters():
+            if param.grad is not None:
+                self.assertEqual(param.grad.dtype, torch.float16)
+        total_norm = fsdp_model.clip_grad_norm_(max_norm=max_norm, norm_type=norm_type)
+        # Check that the total norm is in FP16 to match the gradient dtype
+        self.assertEqual(total_norm.dtype, torch.float16)
+        # As a best effort, check that each gradient has norm at most the max
+        # norm (since DDP does not support mixed precision natively, we cannot
+        # directly compare for parity)
+        for param in fsdp_model.parameters():
+            if param.grad is not None:
+                self.assertTrue(
+                    torch.linalg.vector_norm(param.grad, norm_type).item() <= max_norm,
+                )
+
 
 instantiate_parametrized_tests(TestClipGradNorm)
 
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index 08cd0362c9fa3..e602727013e03 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -1144,12 +1144,21 @@ def clip_grad_norm_(
         Returns:
             Total norm of the parameters (viewed as a single vector).
 
-        .. note:: This is analogous to ``torch.nn.utils.clip_grad_norm_`` but
-            handles the partitioning and multiple devices per rank under the
-            hood. The default torch util is not applicable here, because each
-            rank only has a partial view of all the grads in the model, so
-            calling it for FSDP models would lead to different scaling being
-            applied per subset of model parameters.
+        .. note:: If every FSDP instance uses ``NO_SHARD``, meaning that no
+            gradients are sharded across ranks, then you may directly use
+            :func:`torch.nn.utils.clip_grad_norm_`.
+
+        .. note:: If at least some FSDP instance uses a sharded strategy (i.e.
+            one other than ``NO_SHARD``), then you should use this method
+            instead of :func:`torch.nn.utils.clip_grad_norm_` since this method
+            handles the fact that gradients are sharded across ranks.
+
+        .. note:: The total norm returned will have the "largest" dtype across
+            all parameters/gradients as defined by PyTorch's type promotion
+            semantics. For example, if *all* parameters/gradients use a low
+            precision dtype, then the returned norm's dtype will be that low
+            precision dtype, but if there exists at least one parameter/
+            gradient using FP32, then the returned norm's dtype will be FP32.
 
         .. warning:: This needs to be called on all ranks since it uses
             collective communications.
@@ -1198,6 +1207,7 @@ def clip_grad_norm_(
                 nonsharded_params.add(param)
                 if param.grad is not None:
                     grads.append(param.grad)
+        # Compute local norms (forced to be in FP32)
         local_sharded_norm = _get_grad_norm(sharded_params, norm_type).to(
             self.compute_device
         )
@@ -1226,8 +1236,14 @@ def clip_grad_norm_(
         # `if clip_coef < 1`
         clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
         for grad in grads:
-            grad.detach().mul_(clip_coef_clamped.to(grad.device))
-        return total_norm
+            grad.detach().mul_(clip_coef_clamped.to(grad.device, grad.dtype))
+        # Use the "largest" dtype by type promotion semantics to use the same
+        # dtype as if we did not force local norm computation to be in FP32
+        total_norm_dtype = functools.reduce(
+            lambda dtype1, dtype2: torch.promote_types(dtype1, dtype2),
+            [grad.dtype for grad in grads],
+        )
+        return total_norm.to(total_norm_dtype)
 
     @staticmethod
     def _warn_optim_input(optim_input):
@@ -1829,7 +1845,9 @@ def _get_grad_norm(
 ) -> torch.Tensor:
     """
     Returns the gradient norm of parameters ``param`` s, where the gradients
-    are viewed as a single vector.
+    are viewed as a single vector. The returned norm is in FP32 even if
+    parameters/gradients are in a low precision. This is because the downstream
+    use of this return value is a reduction across ranks.
     """
     params_with_grad = [param for param in params if param.grad is not None]
     if len(params_with_grad) == 0:
@@ -1852,7 +1870,6 @@ def _get_grad_norm(
         norm_type,
         dtype=torch.float32,
     )
-    grad_norm = grad_norm.to(grads[0].dtype)
     return grad_norm
 
 
From eece39f88d0b6ad258fd646675f7812c9be95f34 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Fri, 2 Dec 2022 21:13:52 +0000
Subject: [PATCH 1534/1922] Fix meta registration for aten._cdist_forward
 (#90042)

Error from [7k github model](https://github.com/pytorch/torchdynamo/issues/1884).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90042
Approved by: https://github.com/ezyang, https://github.com/eellison
---
 test/test_meta.py            | 11 +++++++++++
 torch/_meta_registrations.py |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index af81d14e37d5d..99c2049473c6d 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -1295,6 +1295,17 @@ def test_meta__fused_moving_avg_obs_fq_helper(self, device):
             self.assertEqual(ref_out[1].size(), meta_out[1].size())
             self.assertEqual(ref_out[1].stride(), meta_out[1].stride())
 
+    def test_cdist_forward(self, device):
+        to_meta = MetaConverter()
+        x1 = torch.rand([3, 2], device=device)
+        x2 = torch.rand([2, 2], device=device)
+        p = 2.0
+        for compute_mode in (None, 1, 2):
+            ref = aten._cdist_forward.default(x1, x2, p, compute_mode)
+            res = aten._cdist_forward.default(to_meta(x1), to_meta(x2), p, compute_mode)
+            self.assertEqual(res.device.type, 'meta')
+            self.assertEqual(ref.shape, res.shape)
+
     # opinfo test is using aten.fill_, it's not testing aten.fill
     @onlyCUDA
     def test_fill_stride(self):
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index a0fe373ea6e54..eef1ab859e938 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1012,8 +1012,8 @@ def meta_cdist_forward(x1, x2, p, compute_mode):
     )
     check(p >= 0, lambda: "cdist only supports non-negative p values")
     check(
-        compute_mode >= 0 and compute_mode <= 2,
-        lambda: f"possible modes: 0, 1, 2, but was: {compute_mode}",
+        compute_mode in (None, 1, 2),
+        lambda: f"possible modes: None, 1, 2, but was: {compute_mode}",
     )
     r1 = x1.size(-2)
     r2 = x2.size(-2)

From 0f9ca02978aa7ba86814ba91d45804f27f31ba96 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 2 Dec 2022 21:36:13 +0000
Subject: [PATCH 1535/1922] Revert "remove torch.equal usages (#89527)"

This reverts commit 4095ef8b809f922f2e0e09011afd00037d20a771.

Reverted https://github.com/pytorch/pytorch/pull/89527 on behalf of https://github.com/clee2000 due to broke periodic multigpu tests https://hud.pytorch.org/pytorch/pytorch/commit/4095ef8b809f922f2e0e09011afd00037d20a771 https://github.com/pytorch/pytorch/actions/runs/3592806602/jobs/6049368502
---
 docs/source/nested.rst                        |  2 +-
 .../sharded_tensor/ops/test_binary_cmp.py     |  2 +-
 .../sharded_tensor/ops/test_tensor_ops.py     |  4 +-
 .../sharded_tensor/test_sharded_tensor.py     |  8 ++--
 .../checkpoint/test_file_system_checkpoint.py | 20 +++-----
 .../test_file_system_checkpoint_cpu.py        | 20 +++-----
 .../fsdp/test_fsdp_clip_grad_norm.py          |  6 +--
 test/distributed/fsdp/test_fsdp_misc.py       |  2 +-
 .../fsdp/test_fsdp_summon_full_params.py      |  4 +-
 test/distributed/pipeline/sync/test_pipe.py   |  2 +-
 test/fx/test_dce_pass.py                      |  4 +-
 test/fx/test_fx_const_fold.py                 | 48 +++++++++----------
 test/jit/test_save_load.py                    |  4 +-
 test/nn/test_lazy_modules.py                  |  4 +-
 .../core/experimental/test_fake_quantize.py   |  8 ++--
 .../core/experimental/test_linear.py          |  4 +-
 .../experimental/test_quantized_tensor.py     |  2 +-
 .../core/experimental/test_quantizer.py       |  8 ++--
 test/quantization/core/test_quantized_op.py   |  4 +-
 .../core/test_quantized_tensor.py             |  6 +--
 .../quantization/core/test_workflow_module.py |  8 ++--
 test/quantization/fx/test_quantize_fx.py      |  8 ++--
 test/test_autocast.py                         |  2 +-
 test/test_autograd.py                         |  4 +-
 test/test_cuda.py                             | 10 ++--
 test/test_jit.py                              | 12 ++---
 test/test_mps.py                              |  4 +-
 test/test_namedtensor.py                      | 46 +++++++++---------
 test/test_nn.py                               | 10 ++--
 test/test_serialization.py                    | 20 ++++----
 test/test_tensorexpr.py                       |  2 +-
 .../_shard/sharded_tensor/_ops/binary_cmp.py  |  2 +-
 .../tensor/parallel/multihead_attention_tp.py |  7 ++-
 torch/onnx/symbolic_opset9.py                 |  2 +-
 .../_internal/distributed/distributed_test.py | 13 +++--
 .../distributed/nn/api/remote_module_test.py  |  4 +-
 .../distributed/rpc/dist_autograd_test.py     |  2 +-
 .../_internal/distributed/rpc/rpc_test.py     |  5 +-
 38 files changed, 154 insertions(+), 169 deletions(-)

diff --git a/docs/source/nested.rst b/docs/source/nested.rst
index 1a4f825da1f1e..ac07f8acb5a23 100644
--- a/docs/source/nested.rst
+++ b/docs/source/nested.rst
@@ -116,7 +116,7 @@ If all dimensions are regular, the NestedTensor is intended to be semantically i
 torch.Size([2, 20, 128])
 >>> torch.stack([a, a]).size()
 torch.Size([2, 20, 128])
->>> (torch.stack(nt.unbind()) == torch.stack([a, a])).all().item()
+>>> torch.equal(torch.stack(nt.unbind()), torch.stack([a, a]))
 True
 
 In the future we might make it easier to detect this condition and convert seamlessly.
diff --git a/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py b/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py
index 1c496e5603904..33fc49f81c0f7 100644
--- a/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py
+++ b/test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py
@@ -132,7 +132,7 @@ def test_torch_equal(self):
 
         spec, alt_spec = self.get_gpu_specs()
         st1, st2 = self.get_random_tensors(spec, spec, 10, 10)
-        self.assertEqual(st1, st2, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(st1, st2))
 
     @with_comms
     @skip_if_lt_x_gpu(4)
diff --git a/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py b/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py
index 322f3a3572b82..977fa701b44e0 100644
--- a/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py
+++ b/test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py
@@ -58,9 +58,9 @@ def test_inplace_copy(self):
         )
         st = sharded_tensor.rand(spec, (12, 5))
         ones_st = sharded_tensor.ones(spec, (12, 5))
-        self.assertNotEqual(ones_st, st, rtol=0, atol=0, exact_device=True)
+        self.assertFalse(torch.equal(ones_st, st))
         st.copy_(ones_st)
-        self.assertEqual(st, ones_st, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(st, ones_st))
 
         # no grad inplace_copy should work between two with different requires_grad
         st_with_grad = sharded_tensor.rand(spec, (12, 5), requires_grad=True)
diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
index 351a27b5e6b10..5c548db8324dc 100644
--- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
+++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py
@@ -1125,8 +1125,8 @@ def test_state_dict(self):
         self.assertTrue("sharded_tensor1" in loaded_dict_keys)
         self.assertTrue("submodule.sharded_tensor2" in loaded_dict_keys)
         # Verify after load.
-        self.assertEqual(m.sharded_tensor1, module_load.sharded_tensor1, rtol=0, atol=0, exact_device=True)
-        self.assertEqual(m.submodule.sharded_tensor2, module_load.submodule.sharded_tensor2, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(m.sharded_tensor1, module_load.sharded_tensor1))
+        self.assertTrue(torch.equal(m.submodule.sharded_tensor2, module_load.submodule.sharded_tensor2))
 
     @with_comms
     @skip_if_lt_x_gpu(4)
@@ -1161,8 +1161,8 @@ def test_state_dict_new_group(self):
             module_load.load_state_dict(state_dict_deser, strict=False)
 
         # Verify after load.
-        self.assertEqual(m.sharded_tensor1, module_load.sharded_tensor1, rtol=0, atol=0, exact_device=True)
-        self.assertEqual(m.submodule.sharded_tensor2, module_load.submodule.sharded_tensor2, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(m.sharded_tensor1, module_load.sharded_tensor1))
+        self.assertTrue(torch.equal(m.submodule.sharded_tensor2, module_load.submodule.sharded_tensor2))
 
     @with_comms
     @skip_if_lt_x_gpu(4)
diff --git a/test/distributed/checkpoint/test_file_system_checkpoint.py b/test/distributed/checkpoint/test_file_system_checkpoint.py
index 91d9609540340..016467144e8ff 100644
--- a/test/distributed/checkpoint/test_file_system_checkpoint.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint.py
@@ -73,22 +73,14 @@ def assert_state_dict_equal(
             for local_shard_1, local_shard_2 in zip(
                 value_1.local_shards(), value_2.local_shards()
             ):
-                self.assertEqual(
-                    local_shard_1.tensor,
-                    local_shard_1.tensor,
-                    rtol=0,
-                    atol=0,
-                    exact_device=True,
-                    msg=f"Key {key}'s shard does not match"
+                self.assertTrue(
+                    torch.equal(local_shard_1.tensor, local_shard_1.tensor),
+                    f"Key {key}'s shard does not match",
                 )
         elif isinstance(value_1, torch.Tensor):
-            self.assertEqual(
-                value_1,
-                value_2,
-                rtol=0,
-                atol=0,
-                exact_device=True,
-                msg=f"Key {key}'s tensor does not match"
+            self.assertTrue(
+                torch.equal(value_1, value_2),
+                f"Key {key}'s tensor does not match",
             )
 
     return True
diff --git a/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
index 16cb348cef7e5..52e414545c049 100644
--- a/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
+++ b/test/distributed/checkpoint/test_file_system_checkpoint_cpu.py
@@ -75,22 +75,14 @@ def assert_state_dict_equal(
             for local_shard_1, local_shard_2 in zip(
                 value_1.local_shards(), value_2.local_shards()
             ):
-                self.assertEqual(
-                    local_shard_1.tensor,
-                    local_shard_1.tensor,
-                    rtol=0,
-                    atol=0,
-                    exact_device=True,
-                    msg=f"Key {key}'s shard does not match",
+                self.assertTrue(
+                    torch.equal(local_shard_1.tensor, local_shard_1.tensor),
+                    f"Key {key}'s shard does not match",
                 )
         elif isinstance(value_1, torch.Tensor):
-            self.assertEqual(
-                value_1,
-                value_2,
-                rtol=0,
-                atol=0,
-                exact_device=True,
-                msg=f"Key {key}'s tensor does not match",
+            self.assertTrue(
+                torch.equal(value_1, value_2),
+                f"Key {key}'s tensor does not match",
             )
 
     return True
diff --git a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
index 9a45c5bb3316d..81b9f4c37f06e 100644
--- a/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
+++ b/test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -193,12 +193,12 @@ def _test_ddp_parity(
 
         # Check that the gradients were modified by `clip_grad_norm_()`
         for param, orig_grad in zip(ddp_model.parameters(), orig_ddp_grads):
-            self.assertNotEqual(param.grad, orig_grad, rtol=0, atol=0, exact_device=True)
+            assert not torch.equal(param.grad, orig_grad)
         for param, orig_grad in zip(fsdp_model.parameters(), orig_fsdp_grads):
             if param.grad is None:
-                self.assertIsNone(orig_grad)
+                self.assertEqual(param.grad, orig_grad)  # `None`
             else:
-                self.assertNotEqual(param.grad, orig_grad, rtol=0, atol=0, exact_device=True)
+                assert not torch.equal(param.grad, orig_grad)
 
         # Run an optimizer step to ensure gradients matched after clipping
         ddp_optim.step()
diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index 3dbf5c82be5d6..8c972f8515634 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -159,7 +159,7 @@ def _check_equal(local, fsdp):
                     # above check would be vacuously true.
                     self.assertTrue(
                         any(
-                            (p1 != p2).all()
+                            not torch.equal(p1, p2)
                             for p1, p2 in zip(prev_params, m_local.parameters())
                         )
                     )
diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py
index acd7de93ed783..18055dbebffbf 100644
--- a/test/distributed/fsdp/test_fsdp_summon_full_params.py
+++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py
@@ -155,7 +155,9 @@ def test_summon_full_param_shard_value(self, mixed_precision):
 
             # shards are padded but the full_param tensor is not
             a, b = my_shard[0 : my_slice.numel()], my_slice
-            self.assertEqual(my_shard[0 : my_slice.numel()].cpu(), my_slice.cpu(), rtol=0, atol=0, exact_device=True)
+            self.assertTrue(
+                torch.equal(my_shard[0 : my_slice.numel()].cpu(), my_slice.cpu())
+            )
 
     @skip_if_lt_x_gpu(2)
     @parametrize("recurse", [True, False])
diff --git a/test/distributed/pipeline/sync/test_pipe.py b/test/distributed/pipeline/sync/test_pipe.py
index 8eedc7a3c4908..abfa738603a1f 100644
--- a/test/distributed/pipeline/sync/test_pipe.py
+++ b/test/distributed/pipeline/sync/test_pipe.py
@@ -777,7 +777,7 @@ def forward(self, a, b):
     model = Pipe(nn.Sequential(Module1().cuda(0), Module2().cuda(0)), chunks=2, checkpoint=checkpoint)
     t = torch.rand(10)
     res = model(t, t, t).local_value()
-    torch.testing.assert_close(res, (t + t + t) + (t * t * t), rtol=0, atol=0)
+    assert torch.equal(res, (t + t + t) + (t * t * t))
 
 @skip_if_no_cuda
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need atleast two GPUs")
diff --git a/test/fx/test_dce_pass.py b/test/fx/test_dce_pass.py
index 3de9223cef9d9..4f46b9982ba94 100644
--- a/test/fx/test_dce_pass.py
+++ b/test/fx/test_dce_pass.py
@@ -60,7 +60,7 @@ def is_leaf_module(self, m, qualname):
         traced.recompile()
         # Make sure we run and get the same results before/after DCE.
         inputs = [torch.tensor([1.5])] * new_num_phs
-        self.assertEqual(m(*inputs), traced(*inputs), rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(m(*inputs), traced(*inputs)))
 
     def test_simple(self):
         """
@@ -176,7 +176,7 @@ def __init__(self):
                 super().__init__()
 
             def forward(self, a: torch.Tensor) -> torch.Tensor:
-                torch._assert((a == a).all(), "a must equal a")
+                torch._assert(torch.equal(a, a), "a must equal a")
                 return a * 2
 
         # Note: Don't need to specify torch._assert as having side effects
diff --git a/test/fx/test_fx_const_fold.py b/test/fx/test_fx_const_fold.py
index 6d1a76593631f..d7f3b16f2466c 100644
--- a/test/fx/test_fx_const_fold.py
+++ b/test/fx/test_fx_const_fold.py
@@ -79,7 +79,7 @@ def forward(self, x, y):
         in_x, in_y = torch.tensor([[-0.45]]), torch.tensor([0.9])
         base_result = mod(in_x, in_y)
         fold_result = mod_folded(in_x, in_y)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_basic_one_attr_name_collision(self):
         r"""
@@ -125,7 +125,7 @@ def forward(self, x, y):
         in_x, in_y = torch.tensor([[5.0]]), torch.tensor([4.0])
         base_result = mod(in_x, in_y)
         fold_result = mod_folded(in_x, in_y)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_basic_placeholder_reordered(self):
         """
@@ -157,7 +157,7 @@ def forward(self, x, y):
         in_y = torch.tensor([[0.45]])
         base_result = mod(in_x, in_y)
         fold_result = mod_folded(in_x, in_y)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_noop(self):
         r"""
@@ -188,7 +188,7 @@ def forward(self, x):
         in_x = torch.tensor([[-0.45]])
         base_result = mod(in_x)
         fold_result = mod_folded(in_x)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_basic_two_attr_three_input(self):
         r"""
@@ -237,7 +237,7 @@ def forward(self, x, y, z):
         )
         base_result = mod(in_x, in_y, in_z)
         fold_result = mod_folded(in_x, in_y, in_z)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_basic_two_attr(self):
         r"""
@@ -274,7 +274,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = mod_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_multi_const_folded_attrs(self):
         r"""
@@ -325,7 +325,7 @@ def forward(self, x, y):
         in_x, in_y = torch.randn(4, 4), torch.randn(4)
         fold_result = mod_folded(in_x, in_y)
         base_result = mod(in_x, in_y)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_submod_hierarchy(self):
         r"""
@@ -359,7 +359,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = mod_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_retain_node_meta(self):
         r"""
@@ -412,7 +412,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_has_inlined_call_module_node(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -433,7 +433,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_module_attr(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -455,7 +455,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_const_fold_unused_placeholder(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -474,7 +474,7 @@ def forward(self, x, y, z):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x, in_x, in_x)
         base_result = mod(in_x, in_x, in_x)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_dict_output(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -493,7 +493,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result["result"], base_result["result"], rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result["result"], base_result["result"]))
 
     def test_two_outputs(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -512,8 +512,8 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result[0], base_result[0], rtol=0, atol=0, exact_device=True)
-        self.assertEqual(fold_result[1], base_result[1], rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result[0], base_result[0]))
+        self.assertTrue(torch.equal(fold_result[1], base_result[1]))
 
     def test_three_outputs(self):
         class ConstFoldTestModule(torch.nn.Module):
@@ -532,9 +532,9 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result[0], base_result[0], rtol=0, atol=0, exact_device=True)
-        self.assertEqual(fold_result[1], base_result[1], rtol=0, atol=0, exact_device=True)
-        self.assertEqual(fold_result[2], base_result[2], rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result[0], base_result[0]))
+        self.assertTrue(torch.equal(fold_result[1], base_result[1]))
+        self.assertTrue(torch.equal(fold_result[2], base_result[2]))
 
     def test_check_inline_non_const(self):
         r"""
@@ -566,7 +566,7 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_check_inline_non_const_mult_return(self):
         r"""
@@ -598,8 +598,8 @@ def forward(self, x):
         in_x = torch.randn(2, 3)
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result[0], base_result[0], rtol=0, atol=0, exact_device=True)
-        self.assertEqual(fold_result[1], base_result[1], rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result[0], base_result[0]))
+        self.assertTrue(torch.equal(fold_result[1], base_result[1]))
 
     def test_check_skip_folding_quant_dequant_pattern(self):
         r"""
@@ -645,7 +645,7 @@ def skip_folding_quant_dequant(node: torch.fx.Node):
         # Now run both folded and non-folded to check results equal.
         fold_result = gm_folded(in_x)
         base_result = mod(in_x)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
 
     def test_fold_module(self):
         r"""
@@ -667,7 +667,7 @@ def forward(self, x):
 
         # Now run both folded and non-folded to check results equal.
         inp = torch.randn(4, 4)
-        self.assertEqual(mod_folded(inp), mod(inp), rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(mod_folded(inp), mod(inp)))
 
     def test_const_fold_tensor_meta(self):
         self._test_const_fold_tensor_meta(True)
@@ -708,4 +708,4 @@ def forward(self, x, y):
         # Now run both folded and non-folded to check results equal.
         base_result = mod(in_x, in_y)
         fold_result = mod_folded(in_x, in_y)
-        self.assertEqual(fold_result, base_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(fold_result, base_result))
diff --git a/test/jit/test_save_load.py b/test/jit/test_save_load.py
index daf6915af0084..16babb7c7a254 100644
--- a/test/jit/test_save_load.py
+++ b/test/jit/test_save_load.py
@@ -404,7 +404,7 @@ def forward(self, a):
             m2 = torch.jit.load(path)
 
         x = torch.tensor([1.0, 2.0, 3.0, 4.0])
-        self.assertEqual(m(x), m2(x), rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(m(x), m2(x)))
 
     def test_save_nonexit_file(self):
         class Foo(torch.nn.Module):
@@ -880,7 +880,7 @@ def forward(self, a):
             m2 = torch.jit.load(path)
 
         x = torch.tensor([1.0, 2.0, 3.0, 4.0])
-        self.assertEqual(m(x), m2(x), rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(m(x), m2(x)))
 
     def test_save_namedtuple_input_only(self):
         """
diff --git a/test/nn/test_lazy_modules.py b/test/nn/test_lazy_modules.py
index 0264f959f614d..c3a9dff200224 100644
--- a/test/nn/test_lazy_modules.py
+++ b/test/nn/test_lazy_modules.py
@@ -118,7 +118,7 @@ def test_linear(self):
         self.assertTrue(module.weight.shape == (10, 5))
         self.assertTrue(module.bias.shape == (10,))
         y = module(input)
-        self.assertEqual(torch.nn.functional.linear(input, module.weight, module.bias), y, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(torch.nn.functional.linear(input, module.weight, module.bias), y))
 
     @suppress_warnings
     def test_lazy_linear_pickle(self):
@@ -170,7 +170,7 @@ def _check_lazy_conv(self, cls, lazy_cls, func, init_args, input_shape,
         if module.bias is not None:
             self.assertEqual(module.bias.shape, expected_bias_shape)
         y = module(input)
-        self.assertEqual(func(input, module.weight, module.bias), y, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(func(input, module.weight, module.bias), y))
 
     def _check_lazy_conv_pickle(self, cls, lazy_cls, init_args, input_shape,
                                 expected_weight_shape, expected_bias_shape):
diff --git a/test/quantization/core/experimental/test_fake_quantize.py b/test/quantization/core/experimental/test_fake_quantize.py
index 609ec0366a85f..4e9464aca800a 100644
--- a/test/quantization/core/experimental/test_fake_quantize.py
+++ b/test/quantization/core/experimental/test_fake_quantize.py
@@ -29,9 +29,9 @@ def test_fake_calc_qparams(self):
         qparams_expected = observer.calculate_qparams(signed=False)
 
         self.assertEqual(alpha, qparams_expected[0])
-        self.assertEqual(gamma, qparams_expected[1], rtol=0, atol=0, exact_device=True)
-        self.assertEqual(quantization_levels, qparams_expected[2], rtol=0, atol=0, exact_device=True)
-        self.assertEqual(level_indices, qparams_expected[3], rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(gamma, qparams_expected[1]))
+        self.assertTrue(torch.equal(quantization_levels, qparams_expected[2]))
+        self.assertTrue(torch.equal(level_indices, qparams_expected[3]))
 
     r""" Tests fake quantize forward() method
          by comparing result with expected
@@ -58,7 +58,7 @@ def test_forward(self):
         X_to_apot = quantize_APoT(X, alpha, gamma, quantization_levels, level_indices)
         X_expected = dequantize_APoT(X_to_apot)
 
-        self.assertEqual(X_reduced_precision_fp, X_expected, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(X_reduced_precision_fp, X_expected))
 
     r""" Tests fake quantize forward() method
          throws error when qparams are None
diff --git a/test/quantization/core/experimental/test_linear.py b/test/quantization/core/experimental/test_linear.py
index d36d7fbdff667..6a46b4fc3ccbf 100644
--- a/test/quantization/core/experimental/test_linear.py
+++ b/test/quantization/core/experimental/test_linear.py
@@ -31,7 +31,7 @@ def test_linear_APoT_k1(self):
 
         fp_linear_result = fp_linear(activation).data
 
-        self.assertEqual(apot_linear_result, fp_linear_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(apot_linear_result, fp_linear_result))
 
     """
         Test linear_APoT_fn by comparing to uniform linear
@@ -59,7 +59,7 @@ def test_linear_APoT_k2(self):
 
         fp_linear_result = fp_linear(activation).data
 
-        self.assertEqual(apot_linear_result, fp_linear_result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(apot_linear_result, fp_linear_result))
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/quantization/core/experimental/test_quantized_tensor.py b/test/quantization/core/experimental/test_quantized_tensor.py
index 0bb9a4e536fbc..02286b94f8db3 100644
--- a/test/quantization/core/experimental/test_quantized_tensor.py
+++ b/test/quantization/core/experimental/test_quantized_tensor.py
@@ -35,7 +35,7 @@ def test_int_repr(self):
         # 0.0215 in tensor2quantize nearest 0.0208 in quantization_levels -> 3 in level_indices
         expected_qtensor_data = torch.tensor([0, 3, 8, 13, 5, 12], dtype=torch.int32)
 
-        self.assertEqual(qtensor_data, expected_qtensor_data, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(qtensor_data, expected_qtensor_data))
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/quantization/core/experimental/test_quantizer.py b/test/quantization/core/experimental/test_quantizer.py
index 5fa342aa82f1d..d689ee8e99e15 100644
--- a/test/quantization/core/experimental/test_quantizer.py
+++ b/test/quantization/core/experimental/test_quantizer.py
@@ -48,7 +48,7 @@ def test_quantize_APoT_rand_k1(self):
         qtensor_data = qtensor.data.int()
         uniform_quantized_tensor = uniform_quantized.data.int()
 
-        self.assertEqual(qtensor_data, uniform_quantized_tensor, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(qtensor_data, uniform_quantized_tensor))
 
     r""" Tests quantize_APoT for k != 1.
         Tests quantize_APoT result on random 1-dim tensor and hardcoded values for
@@ -92,7 +92,7 @@ def test_quantize_APoT_k2(self):
         # 0.0215 in tensor2quantize nearest 0.0208 in quantization_levels -> 3 in level_indices
         expected_qtensor = torch.tensor([0, 3, 8, 13, 5, 12], dtype=torch.int32)
 
-        self.assertEqual(qtensor_data, expected_qtensor, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(qtensor_data, expected_qtensor))
 
     r""" Tests dequantize_apot result on random 1-dim tensor
         and hardcoded values for b, k.
@@ -137,7 +137,7 @@ def test_dequantize_quantize_rand_b4(self):
 
         result = final_apot.data.int()
 
-        self.assertEqual(original_input, result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(original_input, result))
 
     r""" Tests dequantize_apot result on random 1-dim tensor
         and hardcoded values for b, k.
@@ -182,7 +182,7 @@ def test_dequantize_quantize_rand_b6(self):
 
         result = final_apot.data.int()
 
-        self.assertEqual(original_input, result, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(original_input, result))
 
     r""" Tests for correct dimensions in dequantize_apot result
          on random 3-dim tensor with random dimension sizes
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 3333e19e2535f..116a76a2d1f5e 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -2244,7 +2244,7 @@ def test_quantized_mean_qnnpack(self, keep):
             XQ = torch.quantize_per_tensor(X, scale=0.2, zero_point=0, dtype=torch.quint8)
             YQ = torch.quantize_per_tensor(Y, scale=0.2, zero_point=0, dtype=torch.quint8)
             MQ = XQ.mean((2, 3), keepdim=keep)
-            self.assertEqual(MQ, YQ, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(MQ, YQ))
 
     @override_qengines
     def test_std(self):
@@ -3171,7 +3171,7 @@ def test_linear_prepack_fp16_numerics(self, input_channels, output_channels, exp
         w_packed_fp16 = torch.ops.quantized.linear_prepack_fp16(w, bias)
         w_unpacked_fp16 = torch.ops.quantized.linear_unpack_fp16(w_packed_fp16)
         w_fp16 = w.to(torch.float16).to(torch.float32)
-        self.assertEqual(w_fp16, w_unpacked_fp16[0], rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(w_fp16, w_unpacked_fp16[0]))
 
     @skipIfNoFBGEMM
     def test_qlinear_dynamic_fp16(self):
diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index aacfa607909fd..241aab5da3237 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -1201,7 +1201,7 @@ def test_qtensor_view(self):
             self.assertNotEqual(b.int_repr(), c.int_repr())
             # torch.equal is not supported for the cuda backend
             if device == 'cpu':
-                self.assertNotEqual(b, c, rtol=0, atol=0, exact_device=True)
+                self.assertFalse(torch.equal(b, c))
 
             # a case can't view non-contiguos Tensor
             a_int = torch.randint(0, 100, [1, 2, 3, 4], device=device, dtype=dtype)
@@ -1248,7 +1248,7 @@ def test_qtensor_resize(self):
             self.assertNotEqual(b.int_repr(), c.int_repr())
             # torch.equal is not supported for the cuda backend
             if device == 'cpu':
-                self.assertNotEqual(b, c, rtol=0, atol=0, exact_device=True)
+                self.assertFalse(torch.equal(b, c))
 
             # Throws an error if numel is wrong
             q1_int = torch.randint(0, 100, sizes1, dtype=dtype, device=device)
@@ -1282,7 +1282,7 @@ def test_qtensor_reshape(self):
             self.assertNotEqual(b.int_repr(), c.int_repr())
             # torch.equal is not supported for the cuda backend
             if device == 'cpu':
-                self.assertNotEqual(b, c, rtol=0, atol=0, exact_device=True)
+                self.assertFalse(torch.equal(b, c))
 
             # we can use reshape for non-contiguous Tensor
             a_int = torch.randint(0, 100, [1, 2, 3, 4], dtype=dtype, device=device)
diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py
index ab9dc277d2552..6ac8bed90ca3f 100644
--- a/test/quantization/core/test_workflow_module.py
+++ b/test/quantization/core/test_workflow_module.py
@@ -583,12 +583,12 @@ def test_observer_scriptable(self, qdtype):
         x = torch.rand(3, 4)
         obs(x)
         scripted(x)
-        self.assertEqual(obs.get_tensor_value()[0], scripted.get_tensor_value()[0], rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(obs.get_tensor_value()[0], scripted.get_tensor_value()[0]))
         buf = io.BytesIO()
         torch.jit.save(scripted, buf)
         buf.seek(0)
         loaded = torch.jit.load(buf)
-        self.assertEqual(obs.get_tensor_value()[0], loaded.get_tensor_value()[0], rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(obs.get_tensor_value()[0], loaded.get_tensor_value()[0]))
 
 class TestHistogramObserver(QuantizationTestCase):
     @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)),
@@ -606,12 +606,12 @@ def test_observer_scriptable(self, qdtype, qscheme):
             x = torch.rand(3, 4)
             obs(x)
             scripted(x)
-            self.assertEqual(obs.histogram, scripted.histogram, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(obs.histogram, scripted.histogram))
             buf = io.BytesIO()
             torch.jit.save(scripted, buf)
             buf.seek(0)
             loaded = torch.jit.load(buf)
-            self.assertEqual(obs.histogram, scripted.histogram, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(obs.histogram, scripted.histogram))
 
     @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)),
            qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric)),
diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 1d46198f084bb..11f7162583139 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -1009,7 +1009,7 @@ def checkSerDeser(model, is_dynamic):
                         module.weight_scale = None
                         model.load_state_dict(state_dict)
                         module = getattr(model, module_name)
-                        self.assertEqual(prev_scale, module.weight_scale, rtol=0, atol=0, exact_device=True)
+                        self.assertTrue(torch.equal(prev_scale, module.weight_scale))
 
 
             checkWeightQParams(qr)
@@ -4551,7 +4551,7 @@ def forward(self, x):
             m_ref = convert_to_reference_fx(m_copy)
             result = m(*example_inputs)
             result_ref = m_ref(*example_inputs)
-            self.assertEqual(result, result_ref, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(result, result_ref))
 
     def test_ref_conv_module(self):
         """ Make sure the numerics for models with ref conv module
@@ -4589,7 +4589,7 @@ def forward(self, x):
             m_ref = convert_to_reference_fx(m_copy)
             result = m(data)
             result_ref = m_ref(data)
-            self.assertEqual(result, result_ref, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(result, result_ref))
 
     def test_sub_scalar(self):
         class M(torch.nn.Module):
@@ -4690,7 +4690,7 @@ def forward(self, x):
             }
             self.checkGraphModuleNodes(m, expected_node_occurrence=expected_node_occurrence)
             # checking result match
-            self.assertEqual(out_ref, out, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(out_ref, out))
 
     def test_convert_qconfig_mapping(self):
         class Linear(torch.nn.Module):
diff --git a/test/test_autocast.py b/test/test_autocast.py
index 4bae83db3788e..1a8263a79f93d 100644
--- a/test/test_autocast.py
+++ b/test/test_autocast.py
@@ -59,7 +59,7 @@ def cast(val, to_type):
             # For example, lstm_cell returns a tuple and equal returns bool.
             def compare(first, second):
                 if isinstance(first, torch.Tensor):
-                    return (first == second).all().item()
+                    return torch.equal(first, second)
                 elif isinstance(first, collections.abc.Iterable):
                     return all(compare(f, s) for f, s in zip(first, second))
                 else:
diff --git a/test/test_autograd.py b/test/test_autograd.py
index b5bbec6e8e462..9c3d717114659 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -3764,8 +3764,8 @@ def test_calculate_shape_util(self):
         grad = torch.nested.as_nested_tensor([torch.randn(5, 10, requires_grad=True), torch.randn(5, 10, requires_grad=True)])
         out_shape, grad_shape = _calculate_shape(out, grad, False)
 
-        torch.testing.assert_close(out_shape, torch.tensor([[10, 5], [10, 5], [10, 5]]), rtol=0, atol=0)
-        torch.testing.assert_close(grad_shape, torch.tensor([[5, 10], [5, 10]]), rtol=0, atol=0)
+        assert torch.equal(out_shape, torch.tensor([[10, 5], [10, 5], [10, 5]]))
+        assert torch.equal(grad_shape, torch.tensor([[5, 10], [5, 10]]))
 
     def test_nested_anomaly_detect_nan(self):
         size = 10
diff --git a/test/test_cuda.py b/test/test_cuda.py
index b4c2281b1de52..40eaaa97a3b7e 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -2877,7 +2877,7 @@ def cast(val, to_type):
             # For example, lstm_cell returns a tuple and equal returns bool.
             def compare(first, second):
                 if isinstance(first, torch.Tensor):
-                    return (first == second).all().item()
+                    return torch.equal(first, second)
                 elif isinstance(first, collections.abc.Iterable):
                     return all(compare(f, s) for f, s in zip(first, second))
                 else:
@@ -4693,13 +4693,13 @@ def test_gather_namedtuple(self):
         for i, x in enumerate(out):
             self.assertTrue(isinstance(x, type(out2[-1])))  # x must be a tensor
             cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
-            self.assertEqual(x, cat, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(x, cat))
 
         out = scatter_gather.gather(outputs, 0)  # test on GPU
         for i, x in enumerate(out):
             self.assertTrue(isinstance(x, type(out2[-1])))
             cat = torch.cat((outputs[0][i].to(0), outputs[1][i].to(0)))
-            self.assertEqual(x, cat, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(x, cat))
 
         class TestNamedTupleInput_1(NamedTuple):
             a: torch.tensor
@@ -4719,13 +4719,13 @@ class TestNamedTupleInput_1(NamedTuple):
         for i, x in enumerate(out):
             self.assertTrue(isinstance(x, type(out2[-1])))
             cat = torch.cat((outputs[0][i].to(0), outputs[1][i].to(0)))
-            self.assertEqual(x, cat, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(x, cat))
 
         out = scatter_gather.gather(outputs, 'cpu')  # test on CPU
         for i, x in enumerate(out):
             self.assertTrue(isinstance(x, type(out2[-1])))
             cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
-            self.assertEqual(x, cat, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(x, cat))
 
     @unittest.skipIf(TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync")
     def test_memory_snapshot(self):
diff --git a/test/test_jit.py b/test/test_jit.py
index c1aeee2e66c18..6cbc091d506b5 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -744,7 +744,7 @@ def check(x, y):
     def test_matrix_transpose(self):
         @torch.jit.script
         def check(x):
-            return bool((x.mT == x.transpose(-2, -1)).all())
+            return torch.equal(x.mT, x.transpose(-2, -1))
 
         x = torch.rand(3, 4)
         self.assertTrue(check(x))
@@ -752,7 +752,7 @@ def check(x):
     def test_transpose(self):
         @torch.jit.script
         def check(x):
-            return bool((x.T == x.t()).all())
+            return torch.equal(x.T, x.t())
 
         x = torch.rand(3, 4)
         self.assertTrue(check(x))
@@ -760,7 +760,7 @@ def check(x):
     def test_matrix_conj_transpose(self):
         @torch.jit.script
         def check(x):
-            return bool((x.mH == x.transpose(-2, -1).conj()).all())
+            return torch.equal(x.mH, x.transpose(-2, -1).conj())
 
         x = torch.rand(3, 4)
         self.assertTrue(check(x))
@@ -771,7 +771,7 @@ def check(x):
     def test_conj_transpose(self):
         @torch.jit.script
         def check(x):
-            return bool((x.H == x.t().conj()).all())
+            return torch.equal(x.H, x.t().conj())
 
         x = torch.rand(3, 4)
         self.assertTrue(check(x))
@@ -1139,7 +1139,7 @@ def test_script_backward_twice_with_saved_values(input1, input2):
             # type: (Tensor, Tensor) -> Tensor
             tmp1 = torch.mul(input1, input2)
             tmp2 = torch.abs(tmp1)
-            if (input1 == input2).all():
+            if torch.equal(input1, input2):
                 tmp2 = torch.acos(tmp2)
             else:
                 tmp2 = torch.atan(tmp2)
@@ -1694,7 +1694,7 @@ def doit(x, y):
         t_node = g2.create("prim::TensorTest").t_("a", torch.ones([2, 2]))
         self.assertEqual(t_node.attributeNames(), ["a"])
         g2.appendNode(t_node)
-        self.assertEqual(torch.ones(2, 2), t_node.t("a"), rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(torch.ones(2, 2), t_node.t("a")))
         for node in g.nodes():
             self.assertTrue(g2.findNode(node.kind()) is not None)
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 458d0f8f1baa4..5b45bd4e6e692 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1650,7 +1650,7 @@ def test_div_bugs(self):
     def test_bool_expand(self):
         x = torch.tensor([[1], [0]], dtype=torch.bool, device='mps')
         y = torch.tensor([0, 1], dtype=torch.bool, device='mps')
-        self.assertNotEqual(x.expand(2, 2), y.expand(2, 2), rtol=0, atol=0, exact_device=True)
+        self.assertFalse(torch.equal(x.expand(2, 2), y.expand(2, 2)))
 
     # Empty unary op should return tensor of the same size
     def test_empty_neg(self):
@@ -4960,7 +4960,7 @@ def helper(alpha):
         # see https://github.com/pytorch/pytorch/issues/79835#issuecomment-1164984534
         x = torch.ones(4, dtype=torch.int32, device='mps')
         self.assertEqual(x + 1, torch.full((4,), 2, dtype=torch.int32, device='mps'))
-        self.assertEqual(x + 1.5, torch.full((4,), 2.5, device='mps'), rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(x + 1.5, torch.full((4,), 2.5, device='mps')))
 
     def test_types_binary_op(self):
         # Float * Bool
diff --git a/test/test_namedtensor.py b/test/test_namedtensor.py
index cf7de3fe52b53..751a56f168e78 100644
--- a/test/test_namedtensor.py
+++ b/test/test_namedtensor.py
@@ -1082,31 +1082,31 @@ def test_flatten_nodims(self):
 
     def test_unflatten(self):
         # test args: tensor, int, namedshape
-        self.assertTrue(
-            (torch.ones(4, names=('A',)).unflatten('A', (('A', 2), ('B', 2))) ==
-             torch.ones(2, 2, names=('A', 'B'))).all())
-        self.assertTrue(
-            (torch.ones(4, names=('A',)).unflatten('A', [('A', 2), ('B', 2)]) ==
-             torch.ones(2, 2, names=('A', 'B'))).all())
-        self.assertTrue(
-            (torch.ones(4, names=('A',)).unflatten('A', (['A', 2], ['B', 2])) ==
-             torch.ones(2, 2, names=('A', 'B'))).all())
-        self.assertTrue(
-            (torch.ones(2, 10, names=('A', 'B')).unflatten('B', (['B1', -1],)) ==
-             torch.ones(2, 10, names=('A', 'B1'))).all())
-        self.assertTrue(
-            (torch.ones(2, 3 * 4 * 5 * 6, names=('A', 'B'))
-                  .unflatten('B', (['B1', 3], ['B2', 4], ['B3', -1], ['B4', 6])) ==
-             torch.ones(2, 3, 4, 5, 6, names=('A', 'B1', 'B2', 'B3', 'B4'))).all())
-        self.assertTrue(
-            (torch.ones(2, 0, names=('A', 'B'))
-                  .unflatten('B', (['B1', 3], ['B2', -1], ['B3', 4])) ==
-             torch.ones(2, 3, 0, 4, names=('A', 'B1', 'B2', 'B3'))).all())
+        self.assertTrue(torch.equal(
+            torch.ones(4, names=('A',)).unflatten('A', (('A', 2), ('B', 2))),
+            torch.ones(2, 2, names=('A', 'B'))))
+        self.assertTrue(torch.equal(
+            torch.ones(4, names=('A',)).unflatten('A', [('A', 2), ('B', 2)]),
+            torch.ones(2, 2, names=('A', 'B'))))
+        self.assertTrue(torch.equal(
+            torch.ones(4, names=('A',)).unflatten('A', (['A', 2], ['B', 2])),
+            torch.ones(2, 2, names=('A', 'B'))))
+        self.assertTrue(torch.equal(
+            torch.ones(2, 10, names=('A', 'B')).unflatten('B', (['B1', -1],)),
+            torch.ones(2, 10, names=('A', 'B1'))))
+        self.assertTrue(torch.equal(
+            torch.ones(2, 3 * 4 * 5 * 6, names=('A', 'B'))
+                 .unflatten('B', (['B1', 3], ['B2', 4], ['B3', -1], ['B4', 6])),
+            torch.ones(2, 3, 4, 5, 6, names=('A', 'B1', 'B2', 'B3', 'B4'))))
+        self.assertTrue(torch.equal(
+            torch.ones(2, 0, names=('A', 'B'))
+                 .unflatten('B', (['B1', 3], ['B2', -1], ['B3', 4])),
+            torch.ones(2, 3, 0, 4, names=('A', 'B1', 'B2', 'B3'))))
 
         # test args: namedtensor, str, namedshape
-        self.assertTrue(
-            (torch.ones(2, 4, names=('A', 'B')).unflatten('B', (('B1', 2), ('B2', 2))) ==
-             torch.ones(2, 2, 2, names=('A', 'B1', 'B2'))).all())
+        self.assertTrue(torch.equal(
+            torch.ones(2, 4, names=('A', 'B')).unflatten('B', (('B1', 2), ('B2', 2))),
+            torch.ones(2, 2, 2, names=('A', 'B1', 'B2'))))
 
         # test invalid args: namedtensor, str, sizes
         with self.assertRaisesRegex(TypeError, r"unflatten\(\): argument 'dim' \(position 1\) must be int, not str"):
diff --git a/test/test_nn.py b/test/test_nn.py
index dd056cf103db2..60fb0e6c0cff3 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -1749,7 +1749,7 @@ def test_vector_to_parameters(self):
         vector_to_parameters(vec, model.parameters())
 
         sample = next(model.parameters())[0, 0, 0]
-        self.assertEqual(sample.data, vec.data[:5], rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(sample.data, vec.data[:5]))
 
     def test_rnn_weight_norm(self):
         def check_weight_norm(l, name, num_params):
@@ -5119,9 +5119,9 @@ def test_batchnorm_buffer_update_when_stats_are_not_tracked(self):
         # Forward random tensor
         _ = bn(torch.rand(input_size))
         # Ensure none of the buffers has been updated
-        self.assertEqual(num_batches, bn.num_batches_tracked, rtol=0, atol=0, exact_device=True)
-        self.assertEqual(running_mean, bn.running_mean, rtol=0, atol=0, exact_device=True)
-        self.assertEqual(running_var, bn.running_var, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(num_batches, bn.num_batches_tracked))
+        self.assertTrue(torch.equal(running_mean, bn.running_mean))
+        self.assertTrue(torch.equal(running_var, bn.running_var))
 
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_batchnorm_nhwc_cuda(self):
@@ -5133,7 +5133,7 @@ def test_batchnorm_nhwc_cuda(self):
             inp2 = inp1.contiguous(memory_format=torch.channels_last)
             out1 = model(inp1)
             out2 = model(inp2)
-            self.assertEqual(out1, out2, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(out1, out2))
 
     def test_pairwise_distance(self):
         input1 = torch.randn(4, 4, requires_grad=True)
diff --git a/test/test_serialization.py b/test/test_serialization.py
index cac5aee7407ad..b97c35c46762a 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -282,7 +282,7 @@ def test_serialization_offset_gzip(self):
         with gzip.open(f2.name, 'rb') as f:
             j = pickle.load(f)
             b = torch.load(f)
-        self.assertEqual(a, b, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(a, b))
         self.assertEqual(i, j)
 
     def _test_serialization_sparse(self, weights_only):
@@ -554,7 +554,7 @@ def _test_serialization_filelike(self, tensor, mock, desc):
         msg = 'filelike serialization with {}'
 
         b = torch.load(data)
-        self.assertEqual(tensor, b), msg.format(desc, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(tensor, b), msg.format(desc))
 
     @unittest.skipIf((3, 8, 0) <= sys.version_info < (3, 8, 2), "See https://bugs.python.org/issue39681")
     def test_serialization_filelike_missing_attrs(self):
@@ -827,7 +827,7 @@ def test_serialization_container_filelike(self):
 
     def test_serialization_offset(self):
         a = torch.randn(5, 5)
-        b = torch.randn(1024, 512, dtype=torch.float32)
+        b = torch.randn(1024, 1024, 512, dtype=torch.float32)
         m = torch.nn.Conv2d(1, 1, (1, 3))
         i, j = 41, 43
         with tempfile.NamedTemporaryFile() as f:
@@ -836,15 +836,15 @@ def test_serialization_offset(self):
             pickle.dump(j, f)
             torch.save(b, f)
             torch.save(m, f)
-            self.assertTrue(f.tell() > 2 * 1024 * 1024)
+            self.assertTrue(f.tell() > 2 * 1024 * 1024 * 1024)
             f.seek(0)
             i_loaded = pickle.load(f)
             a_loaded = torch.load(f)
             j_loaded = pickle.load(f)
             b_loaded = torch.load(f)
             m_loaded = torch.load(f)
-        self.assertEqual(a, a_loaded, rtol=0, atol=0, exact_device=True)
-        self.assertEqual(b, b_loaded, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(a, a_loaded))
+        self.assertTrue(torch.equal(b, b_loaded))
         self.assertTrue(m.kernel_size == m_loaded.kernel_size)
         self.assertEqual(i, i_loaded)
         self.assertEqual(j, j_loaded)
@@ -852,21 +852,21 @@ def test_serialization_offset(self):
     @parametrize('weights_only', (True, False))
     def test_serialization_offset_filelike(self, weights_only):
         a = torch.randn(5, 5)
-        b = torch.randn(1024, 512, dtype=torch.float32)
+        b = torch.randn(1024, 1024, 512, dtype=torch.float32)
         i, j = 41, 43
         with BytesIOContext() as f:
             pickle.dump(i, f)
             torch.save(a, f)
             pickle.dump(j, f)
             torch.save(b, f)
-            self.assertTrue(f.tell() > 2 * 1024 * 1024)
+            self.assertTrue(f.tell() > 2 * 1024 * 1024 * 1024)
             f.seek(0)
             i_loaded = pickle.load(f)
             a_loaded = torch.load(f, weights_only=weights_only)
             j_loaded = pickle.load(f)
             b_loaded = torch.load(f, weights_only=weights_only)
-        self.assertEqual(a, a_loaded, rtol=0, atol=0, exact_device=True)
-        self.assertEqual(b, b_loaded, rtol=0, atol=0, exact_device=True)
+        self.assertTrue(torch.equal(a, a_loaded))
+        self.assertTrue(torch.equal(b, b_loaded))
         self.assertEqual(i, i_loaded)
         self.assertEqual(j, j_loaded)
 
diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py
index 30cec4d1dffc0..cf894f3749eb9 100644
--- a/test/test_tensorexpr.py
+++ b/test/test_tensorexpr.py
@@ -1475,7 +1475,7 @@ def test(x):
                 scripted = torch.jit.script(test)
                 out = warmup_and_run_forward(scripted, x)
                 self.assertLastGraphAllFused()
-                torch.testing.assert_close(out, test(x), rtol=0, atol=0)
+                assert torch.equal(out, test(x))
 
     def test_simple_add(self):
         val = torch._C._jit_get_te_generate_block_code()
diff --git a/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py b/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py
index 70f2cb6ea7c0d..fa1eded53b5ce 100644
--- a/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py
+++ b/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py
@@ -17,7 +17,7 @@ def _communicate_result(result, pg):
 
     expected_result = torch.ones(1, device=torch.device(torch.cuda.current_device())) * dist.get_world_size(pg)
 
-    return (result_tensor == expected_result).all().item()
+    return torch.equal(result_tensor, expected_result)
 
 def binary_cmp(cmp_fun, types, args, kwargs=None, process_group=None):
     if len(args) != 2:
diff --git a/torch/distributed/tensor/parallel/multihead_attention_tp.py b/torch/distributed/tensor/parallel/multihead_attention_tp.py
index d5b7f0c25bd3e..3c408e75e9d1a 100644
--- a/torch/distributed/tensor/parallel/multihead_attention_tp.py
+++ b/torch/distributed/tensor/parallel/multihead_attention_tp.py
@@ -151,10 +151,9 @@ def forward(
                 self.value(value), 1, (sk, b * nh, hn)
             )
         else:
-            torch.testing.assert_close(
-                (query, query), (key, value), rtol=0, atol=0, msg="inputs are different for self-attention."
-            )
-
+            assert torch.equal(query, key) and torch.equal(
+                query, value
+            ), "inputs are different for self-attention."
             # =====================
             # Query
             # =====================
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 37ba0e56e6ba0..9984f602425cd 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -894,7 +894,7 @@ def expand_as(g: jit_utils.GraphContext, self, other):
         self_t = self_t.to(torch.double)
         dims = []
         for d in range(self_t.dim()):
-            if (self_t.mean(d).unsqueeze(d).expand_as(self_t) == self_t).all():
+            if torch.equal(self_t.mean(d).unsqueeze(d).expand_as(self_t), self_t):
                 dims.append(d)
                 self = g.op("Constant", value_t=self_t.mean(dims).to(orig_type))
 
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index fa0c4a389ef66..814dd3d5ad5f8 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -114,10 +114,9 @@ def __init__(self, x):
 
     def __eq__(self, other):
         def eq(value, other):
-            result = value == other
-            if isinstance(result, torch.Tensor):
-                result = result.all().item()
-            return result
+            if isinstance(value, torch.Tensor):
+                return torch.equal(value, other)
+            return value == other
 
         for attr, value in self.__dict__.items():
             other_value = other.__dict__[attr]
@@ -353,7 +352,7 @@ def __init__(self):
 
     def forward(self, x):
         # Second layer is used dependent on input x.
-        use_second_layer = (x == torch.ones(20, 10, device=x.device)).all()
+        use_second_layer = torch.equal(x, torch.ones(20, 10, device=x.device))
         if use_second_layer:
             return self.lin2(F.relu(self.lin1(x)))
         else:
@@ -3313,7 +3312,7 @@ def _run_all_gather_coalesced_and_verify(
 
             for l1, l2 in zip(output_tensor_lists, expected_tensors):
                 for t1, t2 in zip(l1, l2):
-                    if not (t1 == t2).all():
+                    if not torch.equal(t1, t2):
                         return False
             return True
 
@@ -7453,7 +7452,7 @@ def forward(self, x):
                     # Control-flow that is rank and input dependent for the
                     # model.
                     use_second_layer = (
-                        (x == torch.ones(batch, dim, device=x.device)).all()
+                        torch.equal(x, torch.ones(batch, dim, device=x.device))
                         and self.rank == 1
                     )
 
diff --git a/torch/testing/_internal/distributed/nn/api/remote_module_test.py b/torch/testing/_internal/distributed/nn/api/remote_module_test.py
index 8a4d5c09512ec..997006353bfbd 100644
--- a/torch/testing/_internal/distributed/nn/api/remote_module_test.py
+++ b/torch/testing/_internal/distributed/nn/api/remote_module_test.py
@@ -242,7 +242,7 @@ def test_remote_parameters(self):
         ):
             param_rrefs = remote_module.remote_parameters()
             self.assertEqual(len(param_rrefs), 1)
-            self.assertEqual(param_rrefs[0].to_here(), _PARAM_VAL, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(param_rrefs[0].to_here(), _PARAM_VAL))
 
     @dist_utils.dist_init
     def test_get_module_rref(self):
@@ -257,7 +257,7 @@ def test_get_module_rref(self):
             rref = remote_module.get_module_rref()
             self.assertEqual(rref, remote_module.module_rref)
             for param in rref.to_here().parameters():
-                self.assertEqual(param, _PARAM_VAL, rtol=0, atol=0, exact_device=True)
+                self.assertTrue(torch.equal(param, _PARAM_VAL))
 
     @dist_utils.dist_init
     def test_train_eval(self):
diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
index d64369767ff70..5d7831659fc1c 100644
--- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
+++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
@@ -71,7 +71,7 @@ def _compare_owner_value(context_id, rref, grad):
         grad = grad.to_dense()
     else:
         assert not grad.is_sparse
-    return (x == grad).all()
+    return torch.equal(x, grad)
 
 
 def create_tensor():
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index a48c8760d1373..4c0239ac653ee 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -237,7 +237,8 @@ def non_cont_test(t_view, t_cont):
         raise Exception('t_view is contiguous!')
     if not t_cont.is_contiguous():
         raise Exception('t_cont is not contiguous!')
-    torch.testing.assert_close(t_view, t_cont, rtol=0, atol=0, msg='t_view is not equal to t_cont!')
+    if not torch.equal(t_view, t_cont):
+        raise Exception('t_view is not equal to t_cont!')
     return t_view
 
 def my_function(a, b, c):
@@ -1067,7 +1068,7 @@ def _trainer_func(self, rref, sparse):
             ps_gradient = rref.rpc_sync().get_gradient(rref)
             if ps_gradient.is_sparse:
                 ps_gradient = ps_gradient.to_dense().double()
-            self.assertEqual(gradient, ps_gradient, rtol=0, atol=0, exact_device=True)
+            self.assertTrue(torch.equal(gradient, ps_gradient))
 
     def _my_parameter_server(self, sparse):
         ps_rref = RRef(MyParameterServer(self.world_size - 1))

From d6e3a8ae9cfefc12f65231b1db4327b3cff1504c Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Fri, 2 Dec 2022 15:25:57 +0000
Subject: [PATCH 1536/1922] Add integration test for composable fully_shard and
 checkpoint (#90041)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90041
Approved by: https://github.com/awgu, https://github.com/rohan-varma
---
 test/distributed/_composable/test_compose.py | 101 +++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 test/distributed/_composable/test_compose.py

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
new file mode 100644
index 0000000000000..dd6f90c6d03ea
--- /dev/null
+++ b/test/distributed/_composable/test_compose.py
@@ -0,0 +1,101 @@
+# Owner(s): ["oncall: distributed"]
+
+import copy
+import sys
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed._composable import checkpoint, fully_shard
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_fsdp import FSDPTest
+from torch.testing._internal.common_utils import (
+    TEST_WITH_DEV_DBG_ASAN,
+    run_tests,
+)
+
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+
+if TEST_WITH_DEV_DBG_ASAN:
+    print(
+        "Skip dev-asan as torch + multiprocessing spawn have known issues",
+        file=sys.stderr,
+    )
+    sys.exit(0)
+
+
+class ToyModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l1 = nn.Linear(100, 100)
+        self.seq = nn.Sequential(
+            nn.ReLU(),
+            nn.Linear(100, 100),
+            nn.ReLU(),
+        )
+        self.l2 = nn.Linear(100, 100)
+
+    def forward(self, x):
+        return self.l2(self.seq(self.l1(x)))
+
+
+class TestFSDPCheckpoint(FSDPTest):
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    def _test_wrap_same_submodule(self, use_reentrant, grad_to_none):
+        LR = 0.01
+        device = torch.device("cuda")
+
+        model = ToyModel().to(device)
+
+        local_model = copy.deepcopy(model)
+        local_optim = torch.optim.Adam(local_model.parameters(), lr=LR)
+
+        combo_model = copy.deepcopy(model)
+        combo_optim = torch.optim.Adam(combo_model.parameters(), lr=LR)
+
+        # compose checkpoint and fully_shard
+        combo_model.seq = checkpoint(
+            combo_model.seq, use_reentrant=use_reentrant
+        )
+        combo_model.seq = fully_shard(
+            combo_model.seq,
+            policy=ModuleWrapPolicy({nn.Linear}),
+        )
+
+        x = torch.randn(2, 100, device=device)
+
+        for _ in range(5):
+            combo_loss = combo_model(x).sum()
+            local_loss = local_model(x).sum()
+
+            self.assertEqual(combo_loss, local_loss)
+
+            combo_loss.backward()
+            combo_optim.step()
+            combo_optim.zero_grad(set_to_none=grad_to_none)
+
+            local_loss.backward()
+            local_optim.step()
+            local_optim.zero_grad(set_to_none=grad_to_none)
+
+    @skip_if_lt_x_gpu(2)
+    def test_wrap_same_submodule(self):
+        self.run_subtests(
+            {
+                "use_reentrant": [True, False],
+                "grad_to_none": [True, False],
+            },
+            self._test_wrap_same_submodule,
+        )
+
+
+if __name__ == "__main__":
+    run_tests()

From bc3c92599d672fc8feb259521b3889059bd9e6c0 Mon Sep 17 00:00:00 2001
From: David Berard <dberard@meta.com>
Date: Fri, 2 Dec 2022 21:58:22 +0000
Subject: [PATCH 1537/1922] [NVFuser] undo v100 OOM skips (#90070)

Summary: I think these were just caused by parallel tests. After adjusting test settings to 1 thread, these stopped OOMing.

Test Plan:
```
$ buck2 test -j 1 mode/dev-nosan //caffe2/torch/csrc/jit/codegen/cuda:nvfuser
```
https://www.internalfb.com/intern/testinfra/testrun/6473924590389963

Differential Revision: D41643827

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90070
Approved by: https://github.com/jjsjann123
---
 torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp              | 6 ------
 torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp              | 6 ------
 torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp              | 3 ---
 .../csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp | 3 ---
 torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp         | 3 ---
 torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp    | 3 ---
 torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp     | 6 ------
 7 files changed, 30 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
index 42dfc93780bea..2a14695b53ff2 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
@@ -7177,9 +7177,6 @@ TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -9794,9 +9791,6 @@ TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
index 8f9afb40c859a..d154b454281e1 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu2.cpp
@@ -2704,9 +2704,6 @@ TEST_F(NVFuserTest, FusionWelfordOp_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -6339,9 +6336,6 @@ TEST_F(NVFuserTest, FusionWelfordOuterPersistence_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
index 0467680b83f58..8d24cc3803747 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
@@ -5945,9 +5945,6 @@ TEST_F(NVFuserTest, AsyncCompilation_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction1_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
   auto fusion = fusion_ptr.get();
   FusionGuard fg(fusion);
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
index 3981b2c2b4497..e827de56e56bd 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp
@@ -1561,9 +1561,6 @@ TEST_F(NVFuserTest, FusionGroupedReductionReEntrant1_CUDA) {
 // Channels-last batch norm with vectorization. Relies on re-entrant
 // GroupedGridReduction
 TEST_F(NVFuserTest, FusionGroupedReductionChannelsLastBatchNormLike_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
index 3ee3bdd293835..d1f185011826e 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp
@@ -2621,9 +2621,6 @@ TEST_F(NVFuserTest, FusionGather4_CUDA) {
 }
 
 TEST_F(NVFuserTest, FusionGather5_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
index 1ea40a136f8c7..c00d02c8a40dd 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
@@ -2815,9 +2815,6 @@ TEST_F(NVFuserTest, FusionAmpereMatmulLargeLoad_CUDA) {
 
 // Matmul test for Turing MMA: across supported layouts
 TEST_F(NVFuserTest, FusionTuringMatmulLargeLoad_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   // Keep multiples of 8 to keep vectorizable.
   int M = 504, N = 136, K = 248;
 
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
index 229369977343a..b10360f00315e 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
@@ -335,9 +335,6 @@ TEST_F(NVFuserTest, FusionScheduleTransposeMultipleOutput_CUDA) {
  * t1
  */
 TEST_F(NVFuserTest, FusionScheduleTransposeMultipleInputOutput_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -997,9 +994,6 @@ TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize3_CUDA) {
 
 // x->sin->transpose->cos->y
 TEST_F(NVFuserTest, FusionScheduleTranspose2DSmallInnerSize_CUDA) {
-#ifdef FBCODE_CAFFE2
-  GTEST_SKIP() << "OOM on V100 32gb";
-#endif
   std::array<std::vector<int64_t>, 2> shapes{
       std::vector<int64_t>{1024 * 1024 * 128, 2},
       std::vector<int64_t>{2, 1024 * 1024 * 128}};

From b24bab987dff31334da4a00440990d6db3e3f4e2 Mon Sep 17 00:00:00 2001
From: Christian Puhrsch <cpuhrsch@fb.com>
Date: Fri, 2 Dec 2022 23:06:57 +0000
Subject: [PATCH 1538/1922] Update Persons of Interest (#90069)

Creates sections for contributors to MaskedTensor and NestedTensor and updates torchaudio.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90069
Approved by: https://github.com/drisspg, https://github.com/mikaylagawarecki, https://github.com/nateanl
---
 docs/source/community/persons_of_interest.rst | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst
index 02224696c61b0..c6fc75b865f0c 100644
--- a/docs/source/community/persons_of_interest.rst
+++ b/docs/source/community/persons_of_interest.rst
@@ -116,6 +116,22 @@ Sparse (torch.sparse)
 -  Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
 -  Andrew James (`amjames <https://github.com/amjames>`__)
 
+NestedTensor (torch.nested)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+-  Alban Desmaison (`albanD <https://github.com/albanD>`__)
+-  Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
+-  Driss Guessous (`drisspg <https://github.com/drisspg>`__)
+-  Joel Schlosser (`jbschlosser <https://github.com/jbschlosser>`__)
+-  Mikayla Gawarecki (`mikaylagawarecki <https://github.com/mikaylagawarecki>`__)
+-  Natalia Gimelshein (`ngimel <https://github.com/ngimel>`__)
+
+MaskedTensor (torch.masked)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+-  Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
+-  (emeritus) George Qi (`george-qi <https://github.com/george-qi>`__)
+
 Fast Fourier Transform (torch.fft)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -307,6 +323,11 @@ TorchAudio
 ~~~~~~~~~~
 
 -  Moto Hira (`mthrok <https://github.com/mthrok>`__)
+-  Jeff Hwang (`hwangjeff <https://github.com/hwangjeff>`__)
+-  Caroline Chen (`carolineechen <https://github.com/carolineechen>`__)
+-  Xiaohui Zhang (`xiaohui-zhang <https://github.com/xiaohui-zhang>`__)
+-  Zhaoheng Ni (`nateanl <https://github.com/nateanl>`__)
+-  (emeritus) Christian Puhrsch (`cpuhrsch <https://github.com/cpuhrsch>`__)
 -  (emeritus) Vincent QB (`vincentqb <https://github.com/vincentqb>`__)
 
 TorchRec

From 6d9d34b194656b5fd0fe3eb218d87843aa834ae8 Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Fri, 2 Dec 2022 07:45:15 -0800
Subject: [PATCH 1539/1922] [Quant] Remove explicitly default QConfigMapping
 settings (#90066)

Summary: Previously we explicitly set a qconfig for ops
like conv and linear in the default QConfigMapping. However,
this makes it difficult for user to override the global and
have the new global take effect for basic ops. This commit
removes these explicit settings so the user can simply run
the following to quantize these ops.
```
qconfig_mapping = get_default_qconfig_mapping()
qconfig_mapping.set_global(my_qconfig)
```
There is no change in behavior for the default use case
of not setting anything on the default QConfigMapping.

Test Plan:
python test/test_quantization.py TestQuantizeFx.test_default_qconfig_mapping_override_global

Reviewers: vkuzo, jerryzh168

Subscribers: vkuzo, jerryzh168
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90066
Approved by: https://github.com/vkuzo, https://github.com/jerryzh168
---
 test/quantization/fx/test_quantize_fx.py | 25 ++++++++++++++++++++++++
 torch/ao/quantization/qconfig_mapping.py | 14 -------------
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 11f7162583139..3d6fe0efa347d 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -2041,6 +2041,31 @@ def test_qconfig_mapping_to_dict(self):
     def test_qconfig_mapping_repr(self):
         self.assertTrue(isinstance(get_default_qconfig_mapping().__repr__(), str))
 
+    def test_default_qconfig_mapping_override_global(self):
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv = torch.nn.Conv2d(1, 1, 1)
+
+            def forward(self, x):
+                return self.conv(x)
+
+        m = M().eval()
+        my_qconfig = QConfig(activation=MinMaxObserver, weight=default_weight_observer)
+        qconfig_mapping = get_default_qconfig_mapping()
+        # Override global qconfig
+        old_global_qconfig = qconfig_mapping.global_qconfig
+        qconfig_mapping.set_global(my_qconfig)
+        # Verify the correct qconfig was used
+        example_inputs = (torch.randn(1, 1, 1, 1),)
+        m = prepare_fx(m, qconfig_mapping, example_inputs)
+        self.assertTrue(isinstance(old_global_qconfig.activation(), HistogramObserver))
+        self.assertTrue(isinstance(my_qconfig.activation(), MinMaxObserver))
+        self.assertTrue(hasattr(m, "activation_post_process_0"))
+        self.assertTrue(hasattr(m, "activation_post_process_1"))
+        self.assertTrue(isinstance(m.activation_post_process_0, MinMaxObserver))
+        self.assertTrue(isinstance(m.activation_post_process_1, MinMaxObserver))
+
     # Dummy classes for PrepareCustomConfig testing
 
     class _DummyStandaloneModule:
diff --git a/torch/ao/quantization/qconfig_mapping.py b/torch/ao/quantization/qconfig_mapping.py
index 08476e01997f2..69b86b0186181 100644
--- a/torch/ao/quantization/qconfig_mapping.py
+++ b/torch/ao/quantization/qconfig_mapping.py
@@ -83,26 +83,12 @@ def _get_default_qconfig_mapping(is_qat: bool, backend: str, version: int) -> QC
     qconfig_mapping = QConfigMapping() \
         .set_global(qconfig) \
         .set_object_type("reshape", default_reuse_input_qconfig) \
-        .set_object_type(torch.nn.Conv1d, qconfig) \
-        .set_object_type(torch.nn.Conv2d, qconfig) \
-        .set_object_type(torch.nn.Conv3d, qconfig) \
         .set_object_type(torch.nn.ConvTranspose1d, qconfig_transpose) \
         .set_object_type(torch.nn.ConvTranspose2d, qconfig_transpose) \
         .set_object_type(torch.nn.ConvTranspose3d, qconfig_transpose) \
-        .set_object_type(torch.nn.Linear, qconfig) \
-        .set_object_type(torch.nn.functional.conv1d, qconfig) \
-        .set_object_type(torch.nn.functional.conv2d, qconfig) \
-        .set_object_type(torch.nn.functional.conv3d, qconfig) \
         .set_object_type(torch.nn.functional.conv_transpose1d, qconfig_transpose) \
         .set_object_type(torch.nn.functional.conv_transpose2d, qconfig_transpose) \
         .set_object_type(torch.nn.functional.conv_transpose3d, qconfig_transpose) \
-        .set_object_type(torch.nn.functional.linear, qconfig) \
-        .set_object_type(torch.nn.ReLU, qconfig) \
-        .set_object_type(torch.nn.functional.relu, qconfig) \
-        .set_object_type(torch.relu, qconfig) \
-        .set_object_type(torch.nn.BatchNorm1d, qconfig) \
-        .set_object_type(torch.nn.BatchNorm2d, qconfig) \
-        .set_object_type(torch.nn.BatchNorm3d, qconfig) \
         .set_object_type(torch.nn.functional.layer_norm, qconfig_layernorm) \
         .set_object_type(torch.nn.LayerNorm, qconfig_layernorm) \
 

From 38a46e05c0e55550428d0abef471afed7c5b4c5f Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Sat, 3 Dec 2022 00:05:23 +0000
Subject: [PATCH 1540/1922] Fix issue 38095 TODOs in test_quantized_op.py
 (#89883)

Fix TODOs related to https://github.com/pytorch/pytorch/issues/38095

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89883
Approved by: https://github.com/clee2000
---
 test/quantization/core/test_quantized_op.py | 32 ++++++++++-----------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 116a76a2d1f5e..3e4cd2ac8b352 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -1810,9 +1810,9 @@ def test_adaptive_avg_pool2d_nhwc(self):
             for name, op in ops_under_test.items():
                 X_hat = op(qX, output_size=output_size)
                 self.assertTrue(X_hat.stride() != sorted(X_hat.stride()))
-                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                self.assertEqualIgnoreType(X_ref, X_hat.int_repr(), atol=1.0, rtol=0,
-                                           msg=error_message.format(name, X_ref, X_hat.int_repr()))
+                self.assertEqual(X_ref, X_hat.int_repr(), atol=1.0, rtol=0,
+                                 msg=error_message.format(name, X_ref, X_hat.int_repr()),
+                                 exact_dtype=False)
                 self.assertEqual(scale, X_hat.q_scale(),
                                  msg=error_message.format(name + '.scale', scale, X_hat.q_scale()))
                 self.assertEqual(zero_point, X_hat.q_zero_point(),
@@ -1886,10 +1886,9 @@ def test_adaptive_avg_pool(self):
                     devices = ["cpu", "cuda"] if (dim == 2 and torch.cuda.is_available()) else ["cpu"]
                     for device in devices:
                         qX_hat = op(qX.to(device=device), output_size=output_size)
-                        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                        self.assertEqualIgnoreType(
+                        self.assertEqual(
                             X_ref, qX_hat.int_repr(), atol=1.0,
-                            rtol=0, msg=error_message.format(name, X_ref, qX_hat))
+                            rtol=0, msg=error_message.format(name, X_ref, qX_hat), exact_dtype=False)
                         self.assertEqual(
                             scale, qX_hat.q_scale(),
                             msg=error_message.format(name + '.scale', scale,
@@ -1961,9 +1960,9 @@ def test_adaptive_avg_pool3d_ndhwc(self):
             for name, op in ops_under_test.items():
                 X_hat = op(qX, output_size=output_size)
                 self.assertTrue(X_hat.stride() != sorted(X_hat.stride()))
-                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                self.assertEqualIgnoreType(X_ref, X_hat.int_repr(), atol=1.0, rtol=0,
-                                           msg=error_message.format(name, X_ref, X_hat.int_repr()))
+                self.assertEqual(X_ref, X_hat.int_repr(), atol=1.0, rtol=0,
+                                 msg=error_message.format(name, X_ref, X_hat.int_repr()),
+                                 exact_dtype=False)
                 self.assertEqual(scale, X_hat.q_scale(),
                                  msg=error_message.format(name + '.scale', scale, X_hat.q_scale()))
                 self.assertEqual(zero_point, X_hat.q_zero_point(),
@@ -2108,10 +2107,10 @@ def test_interpolate(self, X, size, mode, scale_factor, align_corners, nhwc_layo
         for name, op in ops_under_test.items():
             qX_hat = op(qX, size=size, scale_factor=scale_factor,
                         mode=mode, align_corners=align_corners)
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(X_ref, qX_hat.int_repr(), atol=1.0, rtol=0,
-                                       msg="{} results are off: qX_hat={} X_ref={}"
-                                           .format(name, qX_hat.int_repr(), X_ref))
+            self.assertEqual(X_ref, qX_hat.int_repr(), atol=1.0, rtol=0,
+                             msg="{} results are off: qX_hat={} X_ref={}"
+                             .format(name, qX_hat.int_repr(), X_ref),
+                             exact_dtype=False)
             self.assertEqual(scale, qX_hat.q_scale(),
                              msg=error_message.format(name + '.scale', scale, qX_hat.q_scale()))
             self.assertEqual(zero_point, qX_hat.q_zero_point(),
@@ -2163,10 +2162,9 @@ def test_interpolate3d(self, X, size, mode, scale_factor, align_corners, nhwc_la
         for name, op in ops_under_test.items():
             qX_hat = op(qX, size=size, scale_factor=scale_factor,
                         mode=mode, align_corners=align_corners)
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(X_ref, qX_hat.int_repr(), atol=1.0, rtol=0,
-                                       msg="{} results are off: qX_hat={}, X_ref={}"
-                                           .format(name, qX_hat.int_repr(), X_ref))
+            self.assertEqual(X_ref, qX_hat.int_repr(), atol=1.0, rtol=0,
+                             msg="{} results are off: qX_hat={}, X_ref={}"
+                             .format(name, qX_hat.int_repr(), X_ref), exact_dtype=False)
             self.assertEqual(scale, qX_hat.q_scale(),
                              msg=error_message.format(name + '.scale', scale, qX_hat.q_scale()))
             self.assertEqual(zero_point, qX_hat.q_zero_point(),

From 9bb2ebcef1243d94eeca25fca03e9b719e7f1069 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Thu, 1 Dec 2022 18:19:21 -0800
Subject: [PATCH 1541/1922] [reland][quant] Explictly set default quantized
 engine instead of relying on the order of supported_qengines (#89804)
 (#90036)

Summary:
Fixes: https://github.com/pytorch/pytorch/issues/86404

Test Plan:
ossci + sandcastle
Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90036
Approved by: https://github.com/andrewor14
---
 aten/src/ATen/Context.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
index b391cd4aab904..936e9b6252863 100644
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@@ -295,8 +295,24 @@ bool Context::hasLAPACK() {
 }
 
 at::QEngine Context::qEngine() const {
-  // If wasn't explicitly set - take the last one available
-  return quantized_engine.value_or(supportedQEngines().back());
+  static auto _quantized_engine = []() {
+    at::QEngine qengine = at::kNoQEngine;
+#if defined(C10_MOBILE) && defined(USE_PYTORCH_QNNPACK)
+    qengine = at::kQNNPACK;
+#endif
+
+#if AT_MKLDNN_ENABLED()
+    qengine = at::kONEDNN;
+#endif
+
+#ifdef USE_FBGEMM
+    if (fbgemm::fbgemmSupportedCPU()) {
+      qengine = at::kFBGEMM;
+    }
+#endif
+    return qengine;
+  }();
+  return quantized_engine.value_or(_quantized_engine);
 }
 
 void Context::setQEngine(at::QEngine e) {

From 4171b652440b8804b24fb8b4a0865517455e1fa4 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Fri, 2 Dec 2022 13:44:13 -0800
Subject: [PATCH 1542/1922] [pruning][core][feature] Align BaseStructuredPruner
 with existing pruning flow (#88436)

Summary:

This PR aligns the "eager" mode of the structured pruning flow with the existing unstructured pruning flow.

The base pruner has been moved from and has been renamed from BasePruner to BaseStructuredPruner
`torch/ao/pruning/_experimental/pruner/base_pruner.py -> /torch/ao/pruning/_experimental/pruner/base_structured_pruner.py`

Support for pruning batchnorm modules in the config have been removed, so now the structured pruning code can use more of the BaseSparsifier logic and we don't need to override as many functions.

Since we aim to only support a single flow, we have only updated ZeroesParametrizations (FakeStructuredSparsity) and BiasHook.
The parameterizations have also been rewritten to use a bool mask tensor for keeping track of pruned rows, instead of using sets before.
This better aligns structured and unstructured sparsity.

The BaseStructuredSparsifier tests have also been updated to reflect the above changes. I also removed `squash_mask` tests because they were breaking CI and `squash_mask` is no longer used.

We will migrate the structured pruning code out of this folder in a later PR.

Test Plan:
```
python test/test_ao_sparsity -- TestBaseStructuredPruner
```

Reviewers:
z-a-f vkuzo

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88436
Approved by: https://github.com/vkuzo
---
 ...runer.py => test_structured_sparsifier.py} | 109 +++-----
 test/test_ao_sparsity.py                      |   4 +-
 .../pruning/_experimental/pruner/__init__.py  |  12 +-
 .../_experimental/pruner/base_pruner.py       | 247 ------------------
 .../pruner/base_structured_sparsifier.py      |  74 ++++++
 .../_experimental/pruner/parametrization.py   |  67 ++---
 6 files changed, 133 insertions(+), 380 deletions(-)
 rename test/ao/sparsity/{test_pruner.py => test_structured_sparsifier.py} (74%)
 delete mode 100644 torch/ao/pruning/_experimental/pruner/base_pruner.py
 create mode 100644 torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py

diff --git a/test/ao/sparsity/test_pruner.py b/test/ao/sparsity/test_structured_sparsifier.py
similarity index 74%
rename from test/ao/sparsity/test_pruner.py
rename to test/ao/sparsity/test_structured_sparsifier.py
index 295939cb3e39f..1b504c9731d2b 100644
--- a/test/ao/sparsity/test_pruner.py
+++ b/test/ao/sparsity/test_structured_sparsifier.py
@@ -7,7 +7,7 @@
 
 import torch
 from torch import nn
-from torch.ao.pruning._experimental.pruner import BasePruner, PruningParametrization, ZeroesParametrization
+from torch.ao.pruning._experimental.pruner import BaseStructuredSparsifier, FakeStructuredSparsity
 from torch.nn.utils import parametrize
 
 from torch.testing._internal.common_utils import TestCase, skipIfTorchDynamo
@@ -19,10 +19,6 @@
     torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 }
 
-NEEDS_ZEROS = {  # these layers should have pruned indices zero-ed, not removed
-    nn.BatchNorm2d
-}
-
 
 class Linear(nn.Module):
     r"""Model with Linear layers, in Sequential and outside, without biases"""
@@ -159,56 +155,30 @@ def forward(self, x):
         return x
 
 
-class Conv2dBN(nn.Module):
-    r"""Model with Conv2d layers and BatchNorms"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=True),
-            nn.BatchNorm2d(32)
-        )
-        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=True)
-        self.bn = nn.BatchNorm2d(64)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d(x)
-        x = self.bn(x)
-        return x
 
-
-class SimplePruner(BasePruner):
+class SimplePruner(BaseStructuredSparsifier):
     def update_mask(self, module, tensor_name, **kwargs):
-        getattr(module.parametrizations, tensor_name)[0].pruned_outputs.add(1)
+        getattr(module.parametrizations, tensor_name)[0].mask[1] = False
 
 
-class MultiplePruner(BasePruner):
+class MultiplePruner(BaseStructuredSparsifier):
     def update_mask(self, module, tensor_name, **kwargs):
-        getattr(module.parametrizations, tensor_name)[0].pruned_outputs.update([1, 2])
+        getattr(module.parametrizations, tensor_name)[0].mask[1] = False
+        getattr(module.parametrizations, tensor_name)[0].mask[2] = False
 
 
-class TestBasePruner(TestCase):
+class TestBaseStructuredSparsifier(TestCase):
     def _check_pruner_prepared(self, model, pruner, device):
         for config in pruner.groups:
-            modules = []
-            if type(config['module']) is tuple:
-                for module in config['module']:
-                    modules.append(module)
-            else:
-                module = config['module']
-                modules.append(module)
-            for module in modules:
-                assert module.weight.device.type == device.type
-                # Check mask exists
-                assert hasattr(module, 'mask')
-                # Check parametrization exists and is correct
-                assert parametrize.is_parametrized(module)
-                assert hasattr(module, "parametrizations")
-                # Assume that this is the 1st/only parametrization
-                if isinstance(module, tuple(NEEDS_ZEROS)):
-                    assert type(module.parametrizations.weight[0]) == ZeroesParametrization
-                else:
-                    assert type(module.parametrizations.weight[0]) == PruningParametrization
+            module = config["module"]
+            assert module.weight.device.type == device.type
+            # Check mask exists
+            assert config["tensor_fqn"] in pruner.state
+            # Check parametrization exists and is correct
+            assert parametrize.is_parametrized(module)
+            assert hasattr(module, "parametrizations")
+            # Assume that this is the 1st/only parametrization
+            assert type(module.parametrizations.weight[0]) == FakeStructuredSparsity
 
     def _check_pruner_mask_squashed(self, model, pruner, device):
         for config in pruner.groups:
@@ -222,7 +192,6 @@ def _check_pruner_mask_squashed(self, model, pruner, device):
             for module in modules:
                 assert module.weight.device.type == device.type
                 assert not hasattr(module, "parametrizations")
-                assert not hasattr(module, 'mask')
 
     def _check_pruner_valid_before_step(self, model, pruner, device):
         for config in pruner.groups:
@@ -235,9 +204,9 @@ def _check_pruner_valid_before_step(self, model, pruner, device):
                 modules.append(module)
             for module in modules:
                 assert module.weight.device.type == device.type
-                assert module.parametrizations.weight[0].pruned_outputs == set()
+                assert module.parametrizations.weight[0].mask.dtype == torch.bool
 
-    def _check_pruner_valid_after_step(self, model, pruner, pruned_set, device):
+    def _check_pruner_valid_after_step(self, model, pruner, mask, device):
         for config in pruner.groups:
             modules = []
             if type(config['module']) is tuple:
@@ -248,11 +217,12 @@ def _check_pruner_valid_after_step(self, model, pruner, pruned_set, device):
                 modules.append(module)
             for module in modules:
                 assert module.weight.device.type == device.type
-                assert module.parametrizations.weight[0].pruned_outputs == pruned_set
+                total = module.parametrizations.weight[0].mask.numel()
+                assert module.parametrizations.weight[0].mask.count_nonzero() == total - mask
 
     def _test_constructor_on_device(self, model, device):
-        self.assertRaisesRegex(TypeError, 'BasePruner .* update_mask',
-                               BasePruner)
+        self.assertRaisesRegex(TypeError, 'BaseStructuredSparsifier.* update_mask',
+                               BaseStructuredSparsifier)
         model1 = copy.deepcopy(model).to(device)
         pruner = SimplePruner(None)
         pruner.prepare(model1, None)
@@ -264,7 +234,7 @@ def _test_constructor_on_device(self, model, device):
         # Can instantiate the model with configs
         model2 = copy.deepcopy(model).to(device)
         pruner = SimplePruner({'test': 3})
-        pruner.prepare(model2, [model2.linear])
+        pruner.prepare(model2, [{"tensor_fqn": "linear.weight"}])
         assert len(pruner.groups) == 1
         assert pruner.groups[0]['module_fqn'] == 'linear'
         assert 'test' in pruner.groups[0]
@@ -297,11 +267,9 @@ def _test_prepare_conv2d_on_device(self, model, config, device):
         assert model(x).shape == (1, 64, 24, 24)
 
     def test_prepare_conv2d(self):
-        bn_model = Conv2dBN()
-        bn_config = [(bn_model.seq[0], bn_model.seq[1]), (bn_model.conv2d, bn_model.bn)]
 
-        models = [Conv2dA(), Conv2dB(), Conv2dC(), bn_model]
-        configs = [None, None, None, bn_config]
+        models = [Conv2dA(), Conv2dB(), Conv2dC()]
+        configs = [None, None, None]
         for device in DEVICES:
             for model, config in zip(models, configs):
                 model = model.to(device)
@@ -332,11 +300,9 @@ def _test_squash_mask_conv2d_on_device(self, model, config, device):
         assert model(x).shape == (1, 64, 24, 24)
 
     def test_squash_mask_conv2d(self):
-        bn_model = Conv2dBN()
-        bn_config = [(bn_model.seq[0], bn_model.seq[1]), (bn_model.conv2d, bn_model.bn)]
 
-        models = [Conv2dA(), Conv2dB(), Conv2dC(), bn_model]
-        configs = [None, None, None, bn_config]
+        models = [Conv2dA(), Conv2dB(), Conv2dC()]
+        configs = [None, None, None]
         for device in DEVICES:
             for model, config in zip(models, configs):
                 model = model.to(device)
@@ -345,19 +311,19 @@ def test_squash_mask_conv2d(self):
     def _test_step_linear_on_device(self, model, is_basic, device):
         model = model.to(device)
         if is_basic:
-            x = torch.ones(16, 16)
+            x = torch.ones(16, 16, device=device)
             pruner = SimplePruner(None)
             pruner.prepare(model, None)
             self._check_pruner_valid_before_step(model, pruner, device)
             pruner.step()
-            self._check_pruner_valid_after_step(model, pruner, {1}, device)
+            self._check_pruner_valid_after_step(model, pruner, 1, device)
         else:
-            x = torch.ones(7, 7)
+            x = torch.ones(7, 7, device=device)
             pruner = MultiplePruner(None)
             pruner.prepare(model, None)
             self._check_pruner_valid_before_step(model, pruner, device)
             pruner.step()
-            self._check_pruner_valid_after_step(model, pruner, {1, 2}, device)
+            self._check_pruner_valid_after_step(model, pruner, 2, device)
 
     def test_step_linear(self):
         basic_models = [Linear(), LinearB()]
@@ -375,20 +341,13 @@ def _test_step_conv2d_on_device(self, model, config, device):
         pruner.prepare(model, config)
         self._check_pruner_valid_before_step(model, pruner, device)
         pruner.step()
-        if type(model) is Conv2dBN:
-            assert pruner.get_module_pruned_outputs(model.seq[1]) == pruner.get_module_pruned_outputs(model.seq[0])
-            assert pruner.get_module_pruned_outputs(model.bn) == pruner.get_module_pruned_outputs(model.conv2d)
-        self._check_pruner_valid_after_step(model, pruner, {1}, device)
+        self._check_pruner_valid_after_step(model, pruner, 1, device)
         assert model(x).shape == (1, 64, 24, 24)
 
     @skipIfTorchDynamo("TorchDynamo fails with unknown reason")
     def test_step_conv2d(self):
-        bn_model = Conv2dBN()
-        bn_config = [(bn_model.seq[0], bn_model.seq[1]),
-                     (bn_model.conv2d, bn_model.bn)]
-
-        models = [Conv2dA(), Conv2dB(), Conv2dC(), bn_model]
-        configs = [None, None, None, None, bn_config]
+        models = [Conv2dA(), Conv2dB(), Conv2dC()]
+        configs = [None, None, None, None]
         for device in DEVICES:
             for model, config in zip(models, configs):
                 self._test_step_conv2d_on_device(model, config, torch.device(device))
diff --git a/test/test_ao_sparsity.py b/test/test_ao_sparsity.py
index ebe89689d6861..3024b3b100d45 100644
--- a/test/test_ao_sparsity.py
+++ b/test/test_ao_sparsity.py
@@ -14,9 +14,7 @@
 from ao.sparsity.test_sparsifier import TestBaseSparsifier  # noqa: F401
 from ao.sparsity.test_sparsifier import TestWeightNormSparsifier  # noqa: F401
 from ao.sparsity.test_sparsifier import TestNearlyDiagonalSparsifier  # noqa: F401
-
-# Pruner
-from ao.sparsity.test_pruner import TestBasePruner  # noqa: F401
+from ao.sparsity.test_structured_sparsifier import TestBaseStructuredSparsifier  # noqa: F401
 
 # Scheduler
 from ao.sparsity.test_scheduler import TestScheduler  # noqa: F401
diff --git a/torch/ao/pruning/_experimental/pruner/__init__.py b/torch/ao/pruning/_experimental/pruner/__init__.py
index c496e555930a2..e9b17f6c7aad7 100644
--- a/torch/ao/pruning/_experimental/pruner/__init__.py
+++ b/torch/ao/pruning/_experimental/pruner/__init__.py
@@ -1,15 +1,11 @@
-from .base_pruner import BasePruner
+from .base_structured_sparsifier import BaseStructuredSparsifier
 from .parametrization import (
-    ActivationReconstruction,
+    FakeStructuredSparsity,
     BiasHook,
-    PruningParametrization,
-    ZeroesParametrization,
 )
 
 __all__ = [
-    "ActivationReconstruction",
-    "BasePruner",
+    "FakeStructuredSparsity",
+    "BaseStructuredSparsifier",
     "BiasHook",
-    "PruningParametrization",
-    "ZeroesParametrization",
 ]
diff --git a/torch/ao/pruning/_experimental/pruner/base_pruner.py b/torch/ao/pruning/_experimental/pruner/base_pruner.py
deleted file mode 100644
index fbeed5084abb5..0000000000000
--- a/torch/ao/pruning/_experimental/pruner/base_pruner.py
+++ /dev/null
@@ -1,247 +0,0 @@
-
-import copy
-import warnings
-import abc
-
-import torch
-from torch import nn
-from torch.nn.utils import parametrize
-
-from torch.nn.modules.container import ModuleDict, ModuleList
-
-from .parametrization import PruningParametrization, ZeroesParametrization, ActivationReconstruction, BiasHook
-
-from torch.ao.pruning import BaseSparsifier, module_to_fqn, fqn_to_module
-from torch.ao.pruning.sparsifier.utils import get_arg_info_from_tensor_fqn
-
-__all__ = ["BasePruner"]
-
-SUPPORTED_MODULES = {  # added to config if None given
-    nn.Linear,
-    nn.Conv2d,
-    nn.BatchNorm2d,  # will need manual update to match conv2d
-}
-
-NEEDS_ZEROS = {  # these layers should have pruned indices zero-ed, not removed
-    nn.BatchNorm2d
-}
-
-class BasePruner(BaseSparsifier):
-    r"""Base class for all pruners.
-
-    Abstract methods that need to be implemented:
-
-    - update_mask: Function to compute a new mask for all keys in the
-        `groups` attribute.
-
-    Args:
-        - defaults [dict]: default configurations will be attached to the
-            configuration. Only the keys that don't exist in the `config` will
-            be updated.
-        - also_prune_bias [bool]: whether to prune bias in addition to weights (to prune full output channel)
-            or not; default=True.
-
-    """
-    def __init__(self, defaults, also_prune_bias=True):
-        super().__init__(defaults)
-        self.prune_bias = also_prune_bias
-
-    def _get_modules_and_tensor_names(self, config, use_path):
-        modules = []
-        tensor_names = []
-        if use_path:
-            if type(config['module']) is tuple:  # (Conv2d, BN)
-                for module_fqn, tensor_name in zip(config['module_fqn'], config['tensor_name']):
-                    module = fqn_to_module(self.model, module_fqn)
-                    modules.append(module)
-                    tensor_names.append(tensor_name)
-            else:
-                module = fqn_to_module(self.model, config['module_fqn'])
-                modules.append(module)
-                tensor_name = config['tensor_name']
-                tensor_names.append(tensor_name)
-
-        else:
-            if type(config['module']) is tuple:
-                for module, tensor_name in zip(config['module'], config['tensor_name']):
-                    modules.append(module)
-                    tensor_names.append(tensor_name)
-            else:
-                module = config['module']
-                modules.append(module)
-                tensor_name = config['tensor_name']
-                tensor_names.append(tensor_name)
-        return modules, tensor_names
-
-    def _prepare(self, use_path=False, *args, **kwargs):
-        r"""Adds mask parametrization to the layer weight
-        """
-        self.activation_handles = []  # store removable hook handles
-        self.bias_handles = []
-
-        for config in self.groups:
-            modules, tensor_names = self._get_modules_and_tensor_names(config, use_path)
-
-            for module, tensor_name in zip(modules, tensor_names):
-                if not isinstance(module, tuple(NEEDS_ZEROS)):
-                    # add pruning parametrization and forward hooks
-                    if getattr(module, 'mask', None) is None:
-                        module.register_buffer('mask', torch.tensor(getattr(module, tensor_name).shape[0]))
-                    param = config.get('parametrization', PruningParametrization)
-                    parametrize.register_parametrization(module, tensor_name, param(module.mask), unsafe=True)
-
-                    assert isinstance(module.parametrizations, ModuleDict)  # make mypy happy
-                    assert isinstance(module.parametrizations.weight, ModuleList)
-                    if isinstance(module, tuple(SUPPORTED_MODULES)):
-                        self.activation_handles.append(module.register_forward_hook(
-                            ActivationReconstruction(getattr(module.parametrizations, tensor_name)[0])
-                        ))
-                    else:
-                        raise NotImplementedError("This module type is not supported yet.")
-
-                else:  # needs zeros
-                    if getattr(module, 'mask', None) is None:
-                        module.register_buffer('mask', torch.tensor(getattr(module, tensor_name).shape[0]))
-                    param = config.get('parametrization', ZeroesParametrization)
-                    parametrize.register_parametrization(module, tensor_name, param(module.mask), unsafe=True)
-
-                if module.bias is not None:
-                    module.register_parameter('_bias', nn.Parameter(module.bias.detach()))
-                    module.bias = None
-                self.bias_handles.append(module.register_forward_hook(BiasHook(module.parametrizations.weight[0], self.prune_bias)))
-
-            if len(modules) == 2:  # (Conv2d, BN)
-                # should have the same set of pruned outputs
-                modules[1].parametrizations.weight[0].pruned_outputs = modules[0].parametrizations.weight[0].pruned_outputs
-
-    def make_config_from_model(self, model, SUPPORTED_MODULES=SUPPORTED_MODULES, NEEDS_ZEROS=NEEDS_ZEROS):
-        self.config = []
-        stack = [model]
-        while stack:
-            module = stack.pop()
-            for name, child in module.named_children():
-                if type(child) in SUPPORTED_MODULES:
-                    child_fqn = module_to_fqn(model, child)
-                    assert isinstance(child_fqn, str)  # for mypy
-                    self.config.append({'tensor_fqn': child_fqn + '.weight'})
-                else:
-                    if NEEDS_ZEROS is not None and type(child) in NEEDS_ZEROS and hasattr(self, "prune_bias") and self.prune_bias:
-                        # only useful for Pruner
-                        warnings.warn(f"Models with {type(child)} layers have config provided by user.")
-                    stack.append(child)
-
-    def prepare(self, model, config):
-        r"""Prepares a model, by adding the parametrizations and forward post-hooks.
-        Note::
-            The model is modified inplace. If you need to preserve the original
-            model, use copy.deepcopy.
-
-        Args:
-        - model [nn.Module]: model to configure. The model itself is not saved
-            but used for the state_dict saving / loading.
-        - config [list]: configuration elements could either be instances of
-            tuples of dict maps or dict maps. The dicts must have a key 'tensor_fqn' with the
-            value being the fqn of the tensor to be pruned.
-        """
-        self.model = model  # TODO: Need to figure out how to load without this.
-        self.config = config
-
-        # If no config -- try getting all the supported layers
-        if self.config is None:
-            # Add all models to the config
-            self.make_config_from_model(self.model)
-
-        for module_config in self.config:
-            if type(module_config) is tuple:
-                first_layer, next_layer = module_config
-                assert isinstance(first_layer, nn.Conv2d) and isinstance(next_layer, nn.BatchNorm2d)
-                assert isinstance(module_config, tuple)  # for mypy
-                module_config = {'module': module_config}
-                local_args = copy.deepcopy(self.defaults)
-                local_args.update(module_config)
-                module_fqn_list = []
-                tensor_fqn_list = []
-                tensor_name_list = []
-                for module in local_args['module']:
-                    module_fqn = module_to_fqn(model, module)
-                    if module_fqn is None:
-                        module_fqn = ''
-                    if module_fqn and module_fqn[0] == '.':
-                        module_fqn = module_fqn[1:]
-                    module_fqn_list.append(module_fqn)
-                    tensor_fqn_list.append(module_fqn + '.weight')
-                    tensor_name_list.append('weight')
-
-                local_args['module_fqn'] = module_fqn_list
-                local_args['tensor_fqn'] = tensor_fqn_list
-                local_args['tensor_name'] = tensor_name_list
-            else:
-                if isinstance(module_config, nn.Module):
-                    module_config = {'module': module_config}  # type: ignore[dict-item]
-
-                local_args = copy.deepcopy(self.defaults)
-                local_args.update(module_config)
-
-                # now that we're working with a dict, does it have the new format?
-                if local_args.get('tensor_fqn', None) is not None:
-                    tensor_fqn = local_args.get('tensor_fqn')
-                    assert isinstance(tensor_fqn, str)  # for mypy
-                    info_from_tensor_fqn = get_arg_info_from_tensor_fqn(model, tensor_fqn)
-
-                    for key in info_from_tensor_fqn.keys():
-                        if key in local_args:
-                            # info_from_tensor_fqn will chop leading '.' from tensor_fqn so ignore that
-                            assert key == 'tensor_fqn' or info_from_tensor_fqn[key] == local_args[key], (
-                                "Given both `{}` and `tensor_fqn`, it is expected them to "
-                                "agree!".format(key)
-                            )
-                    local_args.update(info_from_tensor_fqn)
-                else:
-                    module = local_args['module']
-                    module_fqn = module_to_fqn(model, module)
-                    if module_fqn and module_fqn[0] == '.':
-                        module_fqn = module_fqn[1:]
-                    local_args['module_fqn'] = module_fqn
-                    local_args['tensor_name'] = "weight"
-                    assert isinstance(module_fqn, str)  # for mypy
-                    local_args['tensor_fqn'] = module_fqn + ".weight"
-            self.groups.append(local_args)
-
-        self._prepare()
-
-    def squash_mask(self, use_path=False, *args, **kwargs):
-        for config in self.groups:
-            modules, tensor_names = self._get_modules_and_tensor_names(config, use_path)
-
-            for module, tensor_name in zip(modules, tensor_names):
-                parametrize.remove_parametrizations(module, tensor_name,
-                                                    leave_parametrized=True)
-                if getattr(module._parameters, 'mask', None):
-                    del module._parameters['mask']
-                elif getattr(module._buffers, 'mask', None):
-                    del module._buffers['mask']
-                delattr(module, 'mask')
-
-    def get_module_pruned_outputs(self, module, tensor_name='weight'):
-        r"""Returns the set of pruned indices of module"""
-        assert parametrize.is_parametrized(module)  # can only get pruned indices of pruned module
-        return getattr(module.parametrizations, tensor_name)[0].pruned_outputs  # assume only one parametrization attached
-
-    def step(self, use_path=False):
-        if not self.enable_mask_update:
-            return
-        with torch.no_grad():
-            for config in self.groups:
-                modules, tensor_names = self._get_modules_and_tensor_names(config, use_path)
-
-                untupled_args: dict = {}
-                untupled_args.update()
-                # only need to update the first module in modules if len(modules) > 1
-                # since they should share the same set of pruned outputs
-                untupled_args['module'] = modules[0]
-                untupled_args['tensor_name'] = tensor_names[0]
-                self.update_mask(**config)
-
-    @abc.abstractmethod
-    def update_mask(self, module, tensor_name, **kwargs):
-        pass
diff --git a/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
new file mode 100644
index 0000000000000..e753d2a6d88da
--- /dev/null
+++ b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
@@ -0,0 +1,74 @@
+from typing import Set, Type
+import torch
+from torch import nn
+from torch.nn.utils import parametrize
+
+from torch.ao.pruning import BaseSparsifier
+from .parametrization import FakeStructuredSparsity, BiasHook
+
+__all__ = ["BaseStructuredSparsifier"]
+
+SUPPORTED_STRUCTURED_PRUNING_MODULES = {  # added to config if None given
+    nn.Linear,
+    nn.Conv2d,
+}
+
+
+class BaseStructuredSparsifier(BaseSparsifier):
+    r"""Base class for structured pruning.
+
+    Abstract methods that need to be implemented:
+        - update_mask: Function to compute a new mask for all keys in the
+            `groups` attribute.
+
+    Args:
+        - defaults [dict]: default configurations will be attached to the
+            configuration. Only the keys that don't exist in the `config` will
+            be updated.
+    """
+
+    def __init__(self, defaults):
+        super().__init__(defaults)
+
+    def make_config_from_model(
+        self,
+        model: nn.Module,
+        SUPPORTED_MODULES: Set[Type] = SUPPORTED_STRUCTURED_PRUNING_MODULES,
+    ) -> None:
+        super().make_config_from_model(
+            model, SUPPORTED_MODULES=SUPPORTED_STRUCTURED_PRUNING_MODULES
+        )
+
+    def _prepare(self, *args, **kwargs) -> None:
+        r"""This function will attach the FakeStructuredSparsity parameterizations
+        and BiasHooks at the appropriate points in the model.
+        """
+        self.bias_handles = []
+
+        for config in self.groups:
+            module = config["module"]
+            tensor_name = config["tensor_name"]
+            parametrization = config.get("parametrization", FakeStructuredSparsity)
+            tensor = getattr(module, tensor_name)
+
+            mask = config.get(
+                "mask",
+                torch.ones(tensor.shape[0], dtype=torch.bool, device=tensor.device),
+            )
+            self.state[config["tensor_fqn"]]["mask"] = mask
+            parametrize.register_parametrization(
+                module, tensor_name, parametrization(mask), unsafe=True
+            )
+
+            prune_bias = config.get("prune_bias", True)
+            if prune_bias and module.bias is not None:
+                module.register_parameter("_bias", nn.Parameter(module.bias.detach()))
+                module.bias = None
+            self.bias_handles.append(
+                module.register_forward_hook(
+                    BiasHook(module.parametrizations.weight[0], prune_bias)
+                )
+            )
+
+    def convert(self):
+        pass
diff --git a/torch/ao/pruning/_experimental/pruner/parametrization.py b/torch/ao/pruning/_experimental/pruner/parametrization.py
index 77c86a22e175a..2ea59d48ee809 100644
--- a/torch/ao/pruning/_experimental/pruner/parametrization.py
+++ b/torch/ao/pruning/_experimental/pruner/parametrization.py
@@ -1,72 +1,45 @@
 import torch
 from torch import nn
-from typing import Any, List
 
-__all__ = ['PruningParametrization', 'ZeroesParametrization', 'ActivationReconstruction', 'BiasHook']
+__all__ = ['FakeStructuredSparsity', 'BiasHook']
 
-class PruningParametrization(nn.Module):
-    def __init__(self, original_outputs):
-        super().__init__()
-        self.original_outputs = set(range(original_outputs.item()))
-        self.pruned_outputs = set()  # Will contain indicies of outputs to prune
 
-    def forward(self, x):
-        valid_outputs = self.original_outputs - self.pruned_outputs
-        return x[list(valid_outputs)]
+# Structured Pruning Parameterizations
+class FakeStructuredSparsity(nn.Module):
+    r"""
+    Parametrization for Structured Pruning. Like FakeSparsity, this should be attached to
+    the  'weight' or any other parameter that requires a mask.
 
+    Instead of an element-wise bool mask, this parameterization uses a row-wise bool mask.
+    """
 
-class ZeroesParametrization(nn.Module):
-    r"""Zero out pruned channels instead of removing.
-    E.g. used for Batch Norm pruning, which should match previous Conv2d layer."""
-    def __init__(self, original_outputs):
+    def __init__(self, mask):
         super().__init__()
-        self.original_outputs = set(range(original_outputs.item()))
-        self.pruned_outputs = set()  # Will contain indicies of outputs to prune
+        self.register_buffer("mask", mask)
 
     def forward(self, x):
-        x.data[list(self.pruned_outputs)] = 0
-        return x
-
-
-class ActivationReconstruction:
-    def __init__(self, parametrization):
-        self.param = parametrization
-
-    def __call__(self, module, input, output):
-        max_outputs = self.param.original_outputs
-        pruned_outputs = self.param.pruned_outputs
-        valid_columns = list(max_outputs - pruned_outputs)
-
-        # get size of reconstructed output
-        sizes = list(output.shape)
-        sizes[1] = len(max_outputs)
-
-        # get valid indices of reconstructed output
-        indices: List[Any] = []
-        for size in output.shape:
-            indices.append(slice(0, size, 1))
-        indices[1] = valid_columns
-
-        reconstructed_tensor = torch.zeros(sizes,
-                                           dtype=output.dtype,
-                                           device=output.device,
-                                           layout=output.layout)
-        reconstructed_tensor[indices] = output
-        return reconstructed_tensor
+        assert isinstance(self.mask, torch.Tensor)
+        assert self.mask.shape[0] == x.shape[0]
+        shape = [1] * len(x.shape)
+        shape[0] = -1
+        return self.mask.reshape(shape) * x
 
+    def state_dict(self, *args, **kwargs):
+        # avoid double saving masks
+        return {}
 
 class BiasHook:
+
     def __init__(self, parametrization, prune_bias):
         self.param = parametrization
         self.prune_bias = prune_bias
 
     def __call__(self, module, input, output):
-        pruned_outputs = self.param.pruned_outputs
 
         if getattr(module, '_bias', None) is not None:
             bias = module._bias.data
             if self.prune_bias:
-                bias[list(pruned_outputs)] = 0
+                bias[~self.param.mask] = 0
 
             # reshape bias to broadcast over output dimensions
             idx = [1] * len(output.shape)

From 9f38e055f09c01eb671904c5dd33eebea217fbfa Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Fri, 2 Dec 2022 18:27:07 +0000
Subject: [PATCH 1543/1922] Use dynamo fake tensor mode in aot_autograd, move
 aot_autograd compilation to lowering time [Merger of 89672 and 89773]
 (#90039)

After all of the preparatory commits, this is a subset of the
changes in https://github.com/pytorch/pytorch/pull/89392 that actually
change us to propagating fake tensors to backends.

Signed-off-by: Edward Z. Yang <ezyangfb.com>

This is the merger of Ed's PR #89672, which is a rewrite of an older PR of mine (#89392), with CI Fixes on top of it (#89773)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90039
Approved by: https://github.com/ezyang
---
 test/dynamo/test_export.py                 | 27 ++++-----
 test/dynamo/test_minifier.py               |  4 +-
 test/dynamo/test_modules.py                |  4 ++
 test/dynamo/test_optimizations.py          | 32 ++++++++++
 test/dynamo/test_verify_correctness.py     |  7 ++-
 torch/_dynamo/debug_utils.py               |  4 ++
 torch/_dynamo/optimizations/backends.py    | 14 ++++-
 torch/_dynamo/optimizations/distributed.py | 68 +++++++++++++++++-----
 torch/_dynamo/optimizations/log_args.py    | 11 ++--
 torch/_dynamo/output_graph.py              | 66 ++++++++++++++++-----
 torch/_dynamo/test_minifier_common.py      |  2 -
 torch/_dynamo/utils.py                     | 33 ++++++++++-
 torch/_dynamo/variables/builder.py         | 54 ++++++++++++-----
 torch/_inductor/overrides.py               |  8 ++-
 torch/fx/passes/shape_prop.py              | 29 ++++++++-
 15 files changed, 287 insertions(+), 76 deletions(-)

diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index e6b505dea51e5..7779c479b6d41 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -938,23 +938,24 @@ def func(x):
         torch._dynamo.reset()
 
         def compiler(gm, sample_inputs):
-            aten_gm = make_fx(gm)(*sample_inputs)
+            def fw(*args):
+                aten_gm = make_fx(gm)(*args)
+                return aten_gm(*args)
 
-            self.assertEqual(len(aten_gm.graph.nodes), len(out_graph.graph.nodes))
-            for node1, node2 in zip(aten_gm.graph.nodes, out_graph.graph.nodes):
-                self.assertEqual(node1.op, node2.op)
-                if node1.op == "call_function":
-                    self.assertEqual(node1.target, node2.target)
-                    self.assertEqual(len(node1.args), len(node2.args))
-                    for arg1, arg2 in zip(node1.args, node2.args):
-                        self.assertEqual(type(arg1), type(arg2))
-
-            return aten_gm.forward
+            return fw
 
         opt_func = torch._dynamo.optimize(compiler, nopython=True)(func)
-        make_fx_result = opt_func(inp)
+        make_fx_result_through_backend = opt_func(inp)
+
+        fx_g = make_fx(func)(inp)
+        make_fx_result_through_direct = fx_g(inp)
 
-        self.assertTrue(torch._dynamo.utils.same(make_fx_result, export_result))
+        self.assertTrue(
+            torch._dynamo.utils.same(make_fx_result_through_backend, export_result)
+        )
+        self.assertTrue(
+            torch._dynamo.utils.same(make_fx_result_through_direct, export_result)
+        )
 
     def test_export_with_constant_method_on_module(self):
         class MyModule(torch.nn.Module):
diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index c1a56f070be5d..d2f82f92510c5 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -191,8 +191,10 @@ def inner(x):
         """
         )
 
+        repro_after = "dynamo"
+        repro_level = 2
         test_code = self._gen_test_code(
-            run_code, "dynamo", 2, RELU_CUSTOM_ERROR_BACKEND
+            run_code, repro_after, repro_level, RELU_CUSTOM_ERROR_BACKEND
         )
         _, repro_dir = self._run_test_code(test_code)
         launch_proc, _ = self._run_minifier_launcher("", repro_dir)
diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index f510fb87522c5..da3f1d3d59881 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -1,5 +1,6 @@
 # Owner(s): ["module: dynamo"]
 
+import unittest
 from copy import deepcopy
 from unittest.mock import patch
 
@@ -762,6 +763,9 @@ def test_generation_tag(self):
         m3 = deepcopy(m1)
         self.assertEqual(GenerationTracker.get_generation_value(m3), cur_generation)
 
+    # torch._subclasses.fake_tensor.UnsupportedFakeTensorException: meta converter nyi
+    # due to custom subclass (TensorProxy)
+    @unittest.expectedFailure
     def test_simple_torch_function(self):
         def foo(x):
             # function call, twice to test wrapping
diff --git a/test/dynamo/test_optimizations.py b/test/dynamo/test_optimizations.py
index 1f69a8fd79062..5bff327786fa6 100644
--- a/test/dynamo/test_optimizations.py
+++ b/test/dynamo/test_optimizations.py
@@ -120,6 +120,38 @@ def compiler_fn(graph, example_inputs):
         opt_fn = torch._dynamo.optimize_assert(compiler_fn)(fn)
         r3 = opt_fn(a, (b, c), d)
 
+        self.assertIsNotNone(r1)
+        self.assertEqual(r1.size(), r2.size())
+        self.assertEqual(r1.stride(), r2.stride())
+        self.assertEqual(r1.dtype, r2.dtype)
+
+        self.assertEqual(r1.size(), r3.size())
+        self.assertEqual(r1.stride(), r3.stride())
+        self.assertEqual(r1.dtype, r3.dtype)
+
+    def test_example_inputs_runtime_use(self):
+        def fn(a, bc, d):
+            b, c = bc
+            return a / d - b / c
+
+        def compiler_fn(graph, example_inputs):
+            def fwd(*args):
+                nonlocal r1
+                r = graph.forward(*args)
+                r1 = r[0]
+                return r
+
+            return fwd
+
+        a = torch.empty(2).fill_(1)
+        b = torch.empty(2).fill_(2)
+        c = torch.empty(2).fill_(3)
+        d = 4
+        r1 = None
+        r2 = fn(a, (b, c), d)
+        opt_fn = torch._dynamo.optimize_assert(compiler_fn)(fn)
+        r3 = opt_fn(a, (b, c), d)
+
         self.assertIsNotNone(r1)
         self.assertTrue(same(r1, r2))
         self.assertTrue(same(r1, r3))
diff --git a/test/dynamo/test_verify_correctness.py b/test/dynamo/test_verify_correctness.py
index 8e3624bfd9e7d..7a6f8e3d42639 100644
--- a/test/dynamo/test_verify_correctness.py
+++ b/test/dynamo/test_verify_correctness.py
@@ -100,8 +100,11 @@ def compiler_fn(graph, example_inputs):
         r3 = opt_fn(a, (b, c), d)
 
         self.assertIsNotNone(r1)
-        self.assertTrue(same(r1, r2))
-        self.assertTrue(same(r1, r3))
+
+        self.assertEqual(r1.shape, r2.shape)
+        self.assertEqual(r1.shape, r3.shape)
+        self.assertEqual(r1.device, r2.device)
+        self.assertEqual(r1.device, r3.device)
 
     @patch.object(config, "verify_correctness", True)
     def test_nnc(self):
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index a1591d27b16f4..739c7b56f916e 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -762,6 +762,9 @@ def backend_accuracy_fails(gm, example_inputs, compiler_fn, only_fwd=False):
 
 backend_aot_accuracy_fails = functools.partial(backend_accuracy_fails, only_fwd=True)
 
+# Please see NOTE: [Real Tensors in Accuracy Evaluation]
+MINIFIER_SPAWNED = False
+
 
 def backend_fails(gm, example_inputs, compiler_fn, orig_failure):
     """
@@ -832,6 +835,7 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
 mod = Repro()
 
 # Setup debug minifier compiler
+torch._dynamo.debug_utils.MINIFIER_SPAWNED = True
 compiler_fn = BACKENDS["{minifier_backend}"]
 {custom_compiler_error}
 dynamo_minifier_backend = functools.partial(
diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 0df57eb4273d1..007b02018f2a8 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -747,9 +747,17 @@ def torchxla_trivial(subgraph):
 def torchxla_trace_once(subgraph):
     import torch._dynamo.optimizations.torchxla_integration as integration
 
-    model = subgraph.model
-    example_inputs = subgraph.example_inputs
-    return integration.extract_compiled_graph(model, example_inputs)
+    compiled_graph = None
+
+    def fwd(*args):
+        nonlocal compiled_graph
+        model = subgraph.model
+        if compiled_graph is None:
+            compiled_graph = integration.extract_compiled_graph(model, args)
+            del subgraph
+        return compiled_graph(*args)
+
+    return fwd
 
 
 def ipex_fp32(gm: torch.fx.GraphModule, example_inputs):
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index b71d85c4e34f8..934a9abc674ed 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -6,6 +6,7 @@
 import torch.fx.traceback as fx_traceback
 from torch import fx
 from torch.fx.node import Node
+from ..utils import deepcopy_to_fake_tensor, fake_mode_from_tensors
 
 log = logging.getLogger(__name__)
 
@@ -138,6 +139,8 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
         to compile each subgraph. Finally, stiches compiled graphs into one graphmodule
         and returns its callable.
         """
+        fake_mode = fake_mode_from_tensors(example_inputs)
+        assert fake_mode is not None
 
         # 1: compute the partition map according to DDP bucket logic
         buckets = [Bucket()]  # (size, param_names)
@@ -211,7 +214,7 @@ def __init__(self, module, compiler):
                 super().__init__(module)
                 self.compiler = compiler
 
-            def compile_submod(self, submod, args, kwargs):
+            def compile_submod(self, input_mod, args, kwargs):
                 """
                 Compile the submodule,
                 using a wrapper to make sure its output is always a tuple,
@@ -220,13 +223,13 @@ def compile_submod(self, submod, args, kwargs):
                 assert len(kwargs) == 0, "We assume only args for these modules"
 
                 class WrapperModule(torch.nn.Module):
-                    def __init__(self, compiled_submod, unwrap_singleton_tuple):
+                    def __init__(self, submod, unwrap_singleton_tuple):
                         super().__init__()
-                        self.compiled_submod = compiled_submod
+                        self.submod = submod
                         self.unwrap_singleton_tuple = unwrap_singleton_tuple
 
                     def forward(self, *args):
-                        x = self.compiled_submod(*args)
+                        x = self.submod(*args)
                         # TODO(whc)
                         # for some reason the isinstance check is necessary if I split one node per submod
                         # - even though I supposedly wrapped the output in a tuple in those cases, the real
@@ -236,22 +239,52 @@ def forward(self, *args):
                         return x
 
                 unwrap_singleton_tuple = False
-                for sn in submod.graph.nodes:
+                for sn in input_mod.graph.nodes:
                     if sn.op == "output":
                         if not isinstance(sn.args[0], tuple):
                             unwrap_singleton_tuple = True
                             sn.args = (sn.args,)
-                submod.recompile()
 
+                input_mod.recompile()
                 wrapper = WrapperModule(
-                    self.compiler(submod, args),
+                    self.compiler(input_mod, args),
                     unwrap_singleton_tuple,
                 )
                 return wrapper
 
+            # Note:
+            #
+            # The way distributed works today around fake tensors can be somehwat confusing.
+            # Some of these codepaths are shared in both runtime, and compile time. The presence
+            # of a fake_mode, read off of fake tensor inputs, dictates how we will operate.
+            #
+            # A few things to keep in mind:
+            #
+            # 1) We invoke `compile_submod` with a real module. The output of that gets stored
+            # on the graph via `self.module.add_submodule(n.target, compiled_submod_real)`.
+            #
+            # 2) When running a call_module targeted node, if we have a fake_mode, we fakify the
+            # module we got from self.fetch_attr(n.target). Regardless of fake_mode, we then execute it.
+            #
+            # 3) Fake tensors should always be around during compile time.
+            #
+            # 4) Fake tensors should never be around at runtime.
+            #
+            # 5) We end up with a compilation mode that takes a real submodule and fake tensors,
+            # to match what aot_autograd exepcts. See Note: [Fake Modules and AOTAutograd]
             def run_node(self, n: Node) -> Any:
                 with fx_traceback.append_stack_trace(n.stack_trace):
                     args, kwargs = self.fetch_args_kwargs_from_env(n)
+                    new_args = []
+                    assert fake_mode
+                    for arg in args:
+                        if isinstance(arg, torch.Tensor) and not isinstance(
+                            arg, torch._subclasses.FakeTensor
+                        ):
+                            new_args.append(fake_mode.from_tensor(arg))
+                        else:
+                            new_args.append(arg)
+
                     log.debug(f"run_node {n.op}, {n.target} got args {args_str(args)}")
                     assert isinstance(args, tuple)
                     assert isinstance(kwargs, dict)
@@ -259,19 +292,28 @@ def run_node(self, n: Node) -> Any:
                     # modify the currently running FX graph
                     # maybe this isn't sound in general, but only changing the target of a node might be ok?
                     if n.op == "call_module":
-                        submod = self.fetch_attr(n.target)
-                        log.debug(f"\n---{n.target} graph---\n" + str(submod.graph))
-                        compiled_submod = self.compile_submod(submod, args, kwargs)
+                        real_mod = self.fetch_attr(n.target)
+                        if fake_mode:
+                            curr_submod = deepcopy_to_fake_tensor(real_mod, fake_mode)
+                        else:
+                            curr_submod = real_mod
+
+                        log.debug(
+                            f"\n---{n.target} graph---\n" + str(curr_submod.graph)
+                        )
+                        compiled_submod_real = self.compile_submod(
+                            real_mod, new_args, kwargs
+                        )
                         self.module.delete_submodule(n.target)
                         n.target = "compiled_" + n.target
-                        self.module.add_submodule(n.target, compiled_submod)
+                        self.module.add_submodule(n.target, compiled_submod_real)
+                        return curr_submod(*new_args, **kwargs)
                     # then we execute the modified node using the usual logic
-                    return getattr(self, n.op)(n.target, args, kwargs)
+                    return getattr(self, n.op)(n.target, new_args, kwargs)
 
         submod_compiler = SubmodCompiler(split_gm, self.backend_compile_fn)
         submod_compiler.run(*example_inputs)
         split_gm.recompile()
 
         log.debug("\n---final graph---\n" + str(split_gm.graph) + "\n---------------\n")
-
         return split_gm
diff --git a/torch/_dynamo/optimizations/log_args.py b/torch/_dynamo/optimizations/log_args.py
index caa0a9a83ce66..111da69d4a8fe 100644
--- a/torch/_dynamo/optimizations/log_args.py
+++ b/torch/_dynamo/optimizations/log_args.py
@@ -34,7 +34,6 @@ def run(self, *args):
 
     def run_node(self, n: torch.fx.Node):
         result = super().run_node(n)
-
         if n.op == "call_function":
             if n.target == aten.convolution.default:
                 args, kwargs = self.fetch_args_kwargs_from_env(n)
@@ -67,8 +66,8 @@ def run_node(self, n: torch.fx.Node):
 
 
 def conv_args_analysis(gm: torch.fx.GraphModule, example_inputs):
-    # lowering graph
-    gm = make_fx(gm)(*example_inputs)
-    # use Interpreter to logs the args of conv
-    ConvArgsAnalysis(gm).run(*example_inputs)
-    return gm
+    def conv_arg_inner(*args):
+        fx_g = make_fx(gm)(*args)
+        return ConvArgsAnalysis(fx_g).run(*args)
+
+    return conv_arg_inner
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 3a0209c1511b6..8929c1afd2fa1 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -88,21 +88,10 @@ def __repr__(self):
         return "FakeRootModule(...)"
 
 
-def wrap_compiler_fn(compiler_fn: CompilerFn) -> CompilerFn:
-    """WrapperBackend if config.verify_correctness is True"""
-    if config.verify_correctness:
-        # wrap backend if verify_correctness is True
-        wrapper_backend_compiler_fn = WrapperBackend(compiler_fn)
-
-        wrapper_backend_compiler_fn._torchdynamo_orig_callable = compiler_fn  # type: ignore[attr-defined]
-        return wrapper_backend_compiler_fn
-
-    return compiler_fn
-
-
 class WrapperBackend:
-    def __init__(self, backend: CompilerFn):
+    def __init__(self, backend: CompilerFn, original_example_inputs):
         self.backend: CompilerFn = backend
+        self.original_example_inputs = original_example_inputs
 
     @property
     def example_inputs(self):
@@ -111,7 +100,6 @@ def example_inputs(self):
     def __call__(self, gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
 
         self.restore = checkpoint_params(gm)
-        self.original_example_inputs = clone_inputs(example_inputs)
         self.gm = gm
         copy_gm = copy.deepcopy(self.gm)
         self.candidate = self.backend(copy_gm, self.original_example_inputs)
@@ -541,9 +529,43 @@ def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             )
             _step_logger()(logging.INFO, f"calling compiler function {name}")
             compiler_fn = self.compiler_fn
+            # WrapperBackend needs real inputs, for now, to verify correctness
             if config.verify_correctness:
-                compiler_fn = wrap_compiler_fn(compiler_fn)
-            compiled_fn = compiler_fn(gm, self.example_inputs())
+                compiler_fn = WrapperBackend(compiler_fn, self.example_inputs())
+
+            # NOTE: [Real Tensors in Accuracy Evaluation]
+            #
+            # Today, tensors are passed to backends as fake at compile time. See the .fake_example_inputs()
+            # call to compiler_fn below. At runtime, backends use real tensors.
+            #
+            # This should be a strong invariant we hold across all backends,
+            # and generally, it is. However, for accuracy evaluation, we need real tensors at compile time,
+            # for now, due to the unfortunate setup described below.
+            #
+            # Due to the nature of how we invoke comparison as a backend in two different ways:
+            #
+            # (1) Less bad, but still worth rewriting, WrapperBackend above, which takes
+            # real inputs for its ctor. see the config.verify_correctnes above.
+            #
+            # (2) More bad, and very worth rewriting, the minifier installs accuracy comparison as
+            # a true backend, and therefore needs to be compiled with real inputs. This is made trickier
+            # by the fact that the minifier will spawn new processes during minification. As such, we have
+            # created a global flag, MINIFIER_SPAWNED, that should be set IF AND ONLY IF this run was spawned
+            # as part of accuracy minification. This flag is not a contract, and ideally will not be here long.
+            #
+            # The longer term PoR is to:
+            # (A) Rewrite the minifier accuracy evaluation and verify_correctness code to share the same
+            # correctness and accuracy logic, so as not to have two different ways of doing the same thing.
+            #
+            # (B) Refactor minifier accuracy backend to do its comparison fully at runtime, so as not to need to
+            # pass real tensors to it at compile time.
+            is_top_level_minifying = (
+                config.repro_after is not None and config.repro_level == 4
+            )
+            if torch._dynamo.debug_utils.MINIFIER_SPAWNED or is_top_level_minifying:
+                compiled_fn = compiler_fn(gm, self.example_inputs())
+            else:
+                compiled_fn = compiler_fn(gm, self.fake_example_inputs())
             _step_logger()(logging.INFO, f"done compiler function {name}")
             assert callable(compiled_fn), "compiler_fn did not return callable"
         except Exception as e:
@@ -551,6 +573,18 @@ def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             raise BackendCompilerFailed(self.compiler_fn, e) from e
         return compiled_fn
 
+    def fake_example_inputs(self) -> List[torch.Tensor]:
+        result = []
+        for arg in self.graphargs:
+            example = arg.get_fake_examples()
+            if example is not None:
+                result.extend(example)
+            else:
+                # Fallback, in case fake_tensor was not set
+                # Particularly for graph args that are not tensors
+                result.extend(arg.get_examples())
+        return result
+
     def example_inputs(self) -> List[torch.Tensor]:
         result = []
         for arg in self.graphargs:
diff --git a/torch/_dynamo/test_minifier_common.py b/torch/_dynamo/test_minifier_common.py
index 8fb0688f2c3ed..947a45f2fcdf1 100644
--- a/torch/_dynamo/test_minifier_common.py
+++ b/torch/_dynamo/test_minifier_common.py
@@ -51,7 +51,6 @@ def _run_test_code(self, code):
         proc = subprocess.run(
             ["python3", "-c", code], capture_output=True, cwd=self.DEBUG_DIR
         )
-
         repro_dir_match = re.search(
             r"(\S+)minifier_launcher.py", proc.stderr.decode("utf-8")
         )
@@ -100,7 +99,6 @@ def _run_repro(self, patch_code, repro_dir):
         repro_proc = subprocess.run(
             ["python3", repro_file], capture_output=True, cwd=repro_dir
         )
-
         return repro_proc, repro_code
 
     # Template for testing code.
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index a6a139ef5760b..4f5ebc072fb5c 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -23,7 +23,7 @@
 import weakref
 from contextlib import contextmanager
 from functools import lru_cache
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 import numpy as np
 import sympy
@@ -32,7 +32,7 @@
 from torch import fx
 from torch._dispatch.python import enable_python_dispatcher
 from torch.nn.modules.lazy import LazyModuleMixin
-from torch.utils._pytree import tree_map
+from torch.utils._pytree import tree_flatten, tree_map
 
 from . import config, logging as torchdynamo_logging
 
@@ -398,6 +398,10 @@ def clone_tensor(x):
 
 def clone_input(x):
     """copy while preserving strides"""
+    # TODO: this is questionable
+    if isinstance(x, torch._subclasses.FakeTensor):
+        # this func fails on fake tensors in __torch_dispatch__
+        return x
 
     def torch_clone(x):
         y = torch.clone(x)
@@ -754,7 +758,10 @@ def wrap_to_fake_tensor(e, fake_mode):
 
 
 def wrap_to_fake_tensor_and_record(e, tx):
-    if type(e) in (torch.Tensor, torch.nn.Parameter):
+    # The not fake tensor check here is annoying - ideally, fake tensors never call this during wrapping.
+    # However, get_fake_value takes args and passes them through this, which may include fake tensors.
+    # see tree_map(fake_wrapper, args) in get_fake_value.
+    if isinstance(e, torch.Tensor) and not isinstance(e, torch._subclasses.FakeTensor):
         static_shapes = config.dynamic_shapes is False
         if type(e) is torch.nn.Parameter:
             # Always static for params
@@ -817,6 +824,9 @@ def same(
                 return False
         return True
     elif isinstance(ref, torch.Tensor):
+        assert not isinstance(ref, torch._subclasses.FakeTensor)
+        assert not isinstance(res, torch._subclasses.FakeTensor)
+
         if ref.is_sparse:
             assert res.is_sparse
             ref = ref.to_dense()
@@ -1156,3 +1166,20 @@ def assert_no_fake_params_or_buffers(gm):
         assert not isinstance(
             param, torch._subclasses.FakeTensor
         ), f"Unexpected fake param {name}"
+
+
+def fake_mode_from_tensors(inputs: List[Any]):
+    """
+    Takes a list of anything, unflattened is fine, returns a fake_mode
+    if any are fake. All fake modes on all fake tensors must be identical.
+    Returns None if no fake_mode is fine
+    """
+    flat_inputs, _ = tree_flatten(inputs)
+    fake_mode = None
+    for flat_input in flat_inputs:
+        if isinstance(flat_input, torch._subclasses.FakeTensor):
+            if fake_mode is None:
+                fake_mode = flat_input.fake_mode
+            else:
+                assert fake_mode is flat_input.fake_mode
+    return fake_mode
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index f630d3644c2dc..0503b20bb68e1 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -9,7 +9,7 @@
 import re
 import types
 from abc import ABCMeta
-from typing import Any, Union
+from typing import Any, Optional, Union
 
 import numpy as np
 from functorch.experimental.ops import PyOperator
@@ -113,8 +113,13 @@ class GraphArg:
     source: Source
     example: Any
     is_unspecialized: bool
+    fake_tensor: Optional[torch._subclasses.fake_tensor.FakeTensor]
 
     def __post_init__(self):
+        if isinstance(self.example, torch.Tensor):
+            assert isinstance(
+                self.fake_tensor, torch._subclasses.fake_tensor.FakeTensor
+            )
         if isinstance(self.example, torch._subclasses.fake_tensor.FakeTensor):
             raise AssertionError("Fake Tensor observed in TorchDynamo Fx graph inputs")
 
@@ -124,6 +129,13 @@ def load(self, tx):
     def get_examples(self):
         return [self.example]
 
+    def get_fake_examples(self):
+        if self.fake_tensor is not None:
+            assert isinstance(
+                self.fake_tensor, torch._subclasses.fake_tensor.FakeTensor
+            )
+            return [self.fake_tensor]
+
     def __len__(self):
         return 1
 
@@ -528,7 +540,9 @@ def tensor_should_specialize(self):
 
     def wrap_sym(self, value: Union[torch.SymInt, torch.SymFloat]):
         if not is_constant_source(self.get_source()):
-            self.tx.output.graphargs.append(GraphArg(self.get_source(), value, False))
+            self.tx.output.graphargs.append(
+                GraphArg(self.get_source(), value, False, None)
+            )
         elif is_constant_source(self.get_source()):
             return self.tx.output.register_attr_or_module(
                 value,
@@ -556,10 +570,6 @@ def wrap_tensor(self, value: torch.Tensor):
                 # guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
             )
         else:
-            if not is_constant_source(self.get_source()):
-                self.tx.output.graphargs.append(
-                    GraphArg(self.get_source(), value, False)
-                )
             # Disable __torch_function__ to prevent cloning of `value` to hit
             # us
             with torch._C.DisableTorchFunction():
@@ -579,6 +589,15 @@ def wrap_tensor(self, value: torch.Tensor):
                     guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
                     should_specialize=self.tensor_should_specialize(),
                 )
+
+            fake_tensor_value = None
+            example_value = tensor_variable.proxy.node.meta["example_value"]
+            if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
+                fake_tensor_value = example_value
+
+            graph_arg = GraphArg(self.get_source(), value, False, fake_tensor_value)
+            self.tx.output.graphargs.append(graph_arg)
+
             if torch.overrides.has_torch_function_unary(value):
                 subclass_torch_function__func = value.__torch_function__.__func__
                 subclass_type = type(value)
@@ -603,10 +622,6 @@ def wrap_unspecialized_primitive(self, value):
             else:
                 # TODO: Eliminate this case entirely
                 wrapped_value = torch.tensor(value)
-            if not is_constant_source(self.get_source()):
-                self.tx.output.graphargs.append(
-                    GraphArg(self.get_source(), wrapped_value, True)
-                )
             if not isinstance(self.get_source(), RandomValueSource):
                 guards = {self.get_source().make_guard(GuardBuilder.TYPE_MATCH, True)}
                 options = {"guards": guards}
@@ -637,6 +652,14 @@ def wrap_unspecialized_primitive(self, value):
                     **options,
                 )
             self.tx.output.unspec_variable_map[self.name] = unspec_var
+            if not is_constant_source(self.get_source()):
+                fake_tensor_value = None
+                example_value = unspec_var.proxy.node.meta["example_value"]
+                if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
+                    fake_tensor_value = example_value
+                self.tx.output.graphargs.append(
+                    GraphArg(self.get_source(), wrapped_value, True, fake_tensor_value)
+                )
             return unspec_var
 
 
@@ -694,11 +717,14 @@ def _clone_input(value):
     with preserve_rng_state():
         if example_value is None:
             example_value = get_fake_value(proxy.node, tx)
-
         else:
-            proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
-            fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
-            example_value = fake_wrapper(example_value)
+            # Note: Unfortunately, this can happen during tracing, and is valid enough for now to allow.
+            # TODO(voz): Find all the callsites and burn this down.
+            # Flipping it to an assert fails dozens of tests.
+            if not isinstance(example_value, torch._subclasses.FakeTensor):
+                proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
+                fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
+                example_value = fake_wrapper(example_value)
 
     if isinstance(example_value, torch.Tensor):
         is_parameter = isinstance(example_value, torch.nn.Parameter)
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 8d95971864f0f..bf66e68fed624 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -11,6 +11,7 @@
 import torch
 import torch.nn as nn
 from torch import _prims
+from torch._dynamo.utils import fake_mode_from_tensors
 from torch.fx.experimental.optimization import (
     matches_module_pattern,
     replace_node_module,
@@ -530,10 +531,12 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
         example_input.device == torch.device("cpu") for example_input in example_inputs
     )
 
+    fake_mode = fake_mode_from_tensors(example_inputs)
+
     if config.permute_fusion and not is_cpu:
         # For linear permute fusion, we need to check input info to identify
         # and perform proper permutation/transpose
-        ShapeProp(gm).propagate(*example_inputs)
+        ShapeProp(gm, fake_mode=fake_mode).propagate(*example_inputs)
         gm = linear_permute_fusion(gm)
         gm = permute_linear_fusion(gm)
         gm = permute_matmul_fusion(gm)
@@ -549,7 +552,8 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     gm = fuse_conv_bn(gm)
     # For binary fusion, we need to check inputs info to make sure
     # the binary inputs have same tensor info(device, dtype, and layout).
-    ShapeProp(gm).propagate(*example_inputs)
+
+    ShapeProp(gm, fake_mode=fake_mode).propagate(*example_inputs)
     gm = fuse_unary(gm)
     gm = fuse_binary_inplace(gm)
     gm = fuse_binary(gm)
diff --git a/torch/fx/passes/shape_prop.py b/torch/fx/passes/shape_prop.py
index 2be996f714ce8..4fd8ce8af9347 100644
--- a/torch/fx/passes/shape_prop.py
+++ b/torch/fx/passes/shape_prop.py
@@ -109,11 +109,38 @@ def forward(self, x):
 
     Args:
          module (GraphModule): The module to be executed
+         fake_mode (FakeTensorMode): A fake mode for copying the gm
 
     """
+    def __init__(self, gm, fake_mode=None):
+        super().__init__(gm)
+        if fake_mode:
+            from torch._dynamo.utils import deepcopy_to_fake_tensor
+            # Note:
+            # We need fake execution cause the inputs are fake, however, we cannot fakify the module
+            # - because we need to write to the tensor_meta of the real module. So we fakify to
+            # produce a result (L130 below), to extract tensor meta, and then keep going.
+            #
+            # If we were to fakify, we would write to the wrong node, and then downstream fusion
+            # would be missing the tensor_meta.
+            #
+            # See torch/_inductor/overrides.py for where this is called upstream of fusion.
+            self.fake_module = deepcopy_to_fake_tensor(self.module, fake_mode)
+        else:
+            self.fake_module = None
+
+        self.real_module = self.module
+
     def run_node(self, n : Node) -> Any:
         try:
-            result = super().run_node(n)
+            if self.fake_module is not None:
+                # Hacky swap. Alternatively, we could do this with overriding
+                # call_module and get_attr.
+                self.module = self.fake_module
+            try:
+                result = super().run_node(n)
+            finally:
+                self.module = self.real_module
         except Exception:
             traceback.print_exc()
             raise RuntimeError(

From db5776c2a7a7575e05ce3264e4df2f90dd3f82b4 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Sat, 3 Dec 2022 03:01:49 +0000
Subject: [PATCH 1544/1922] Fix issue 38095 TODO in test_dataloader.py (#90084)

Fix TODO related to https://github.com/pytorch/pytorch/issues/38095

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90084
Approved by: https://github.com/clee2000, https://github.com/NivekT
---
 test/test_dataloader.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 347f9be73e8b9..9f1b73cf9ed41 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -2119,9 +2119,7 @@ def test_default_collate_dtype(self):
 
         arr = [1.1, 2.3, -0.9]
         collated = _utils.collate.default_collate(arr)
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(collated, torch.tensor(arr))
-        self.assertEqual(collated.dtype, torch.float64)
+        self.assertEqual(collated, torch.tensor(arr, dtype=torch.float64))
 
         arr = [True, False]
         collated = _utils.collate.default_collate(arr)

From 77203b88c8fd07a1231dca6547d0c7b0220e539f Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Fri, 2 Dec 2022 20:34:56 +0000
Subject: [PATCH 1545/1922] Test composable checkpoint wrapping FSDP submodules
 (#90078)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90078
Approved by: https://github.com/awgu
---
 test/distributed/_composable/test_compose.py | 118 ++++++++++++++-----
 1 file changed, 91 insertions(+), 27 deletions(-)

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
index dd6f90c6d03ea..0b7282f6a3fa0 100644
--- a/test/distributed/_composable/test_compose.py
+++ b/test/distributed/_composable/test_compose.py
@@ -12,6 +12,8 @@
 from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
     TEST_WITH_DEV_DBG_ASAN,
+    instantiate_parametrized_tests,
+    parametrize,
     run_tests,
 )
 
@@ -29,7 +31,7 @@
     sys.exit(0)
 
 
-class ToyModel(nn.Module):
+class UnitModule(nn.Module):
     def __init__(self):
         super().__init__()
         self.l1 = nn.Linear(100, 100)
@@ -44,58 +46,120 @@ def forward(self, x):
         return self.l2(self.seq(self.l1(x)))
 
 
+class CompositeModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l1 = nn.Linear(100, 100)
+        self.u1 = UnitModule()
+        self.u2 = UnitModule()
+        self.l2 = nn.Linear(100, 100)
+
+    def forward(self, x):
+        return self.l2(self.u2(self.u1(self.l1(x))))
+
+
 class TestFSDPCheckpoint(FSDPTest):
     @property
     def world_size(self) -> int:
         return 2
 
-    def _test_wrap_same_submodule(self, use_reentrant, grad_to_none):
+    def _test_wrap_same_submodule(
+        self,
+        base_model: nn.Module,
+        test_model: nn.Module,
+        x: torch.Tensor,
+        grad_to_none: bool,
+    ):
         LR = 0.01
-        device = torch.device("cuda")
+        base_optim = torch.optim.Adam(base_model.parameters(), lr=LR)
+        test_optim = torch.optim.Adam(test_model.parameters(), lr=LR)
 
-        model = ToyModel().to(device)
+        for _ in range(5):
+            test_loss = test_model(x).sum()
+            base_loss = base_model(x).sum()
+
+            self.assertEqual(test_loss, base_loss)
+
+            test_loss.backward()
+            test_optim.step()
+            test_optim.zero_grad(set_to_none=grad_to_none)
+
+            base_loss.backward()
+            base_optim.step()
+            base_optim.zero_grad(set_to_none=grad_to_none)
 
-        local_model = copy.deepcopy(model)
-        local_optim = torch.optim.Adam(local_model.parameters(), lr=LR)
+    @skip_if_lt_x_gpu(2)
+    @parametrize("use_reentrant", [True, False])
+    def test_wrap_same_submodule(self, use_reentrant: bool):
+        model = UnitModule().to("cuda")
 
-        combo_model = copy.deepcopy(model)
-        combo_optim = torch.optim.Adam(combo_model.parameters(), lr=LR)
+        base_model = copy.deepcopy(model)
 
+        test_model = copy.deepcopy(model)
         # compose checkpoint and fully_shard
-        combo_model.seq = checkpoint(
-            combo_model.seq, use_reentrant=use_reentrant
-        )
-        combo_model.seq = fully_shard(
-            combo_model.seq,
+        test_model.seq = checkpoint(test_model.seq, use_reentrant=use_reentrant)
+        test_model.seq = fully_shard(
+            test_model.seq,
             policy=ModuleWrapPolicy({nn.Linear}),
         )
 
-        x = torch.randn(2, 100, device=device)
+        self.run_subtests(
+            {
+                "base_model": [base_model],
+                "test_model": [test_model],
+                "x": [torch.randn(2, 100, device="cuda")],
+                "grad_to_none": [True, False],
+            },
+            self._test_wrap_same_submodule,
+        )
 
-        for _ in range(5):
-            combo_loss = combo_model(x).sum()
-            local_loss = local_model(x).sum()
+    def _test_checkpoint_fsdp_submodules(self, use_reentrant):
+        model = CompositeModel().to(torch.device("cuda"))
 
-            self.assertEqual(combo_loss, local_loss)
+        base_model = copy.deepcopy(model)
 
-            combo_loss.backward()
-            combo_optim.step()
-            combo_optim.zero_grad(set_to_none=grad_to_none)
+        test_model = copy.deepcopy(model)
+        test_model.u1 = fully_shard(
+            test_model.u1,
+            policy=ModuleWrapPolicy({UnitModule}),
+        )
+        test_model.u2 = fully_shard(
+            test_model.u2,
+            policy=ModuleWrapPolicy({UnitModule}),
+        )
 
-            local_loss.backward()
-            local_optim.step()
-            local_optim.zero_grad(set_to_none=grad_to_none)
+        test_model.u1.seq = checkpoint(
+            test_model.u1.seq, use_reentrant=use_reentrant
+        )
+        test_model.u2.seq = checkpoint(
+            test_model.u2.seq, use_reentrant=use_reentrant
+        )
 
-    @skip_if_lt_x_gpu(2)
-    def test_wrap_same_submodule(self):
         self.run_subtests(
             {
-                "use_reentrant": [True, False],
+                "base_model": [base_model],
+                "test_model": [test_model],
+                "x": [torch.randn(2, 100, device="cuda")],
                 "grad_to_none": [True, False],
             },
             self._test_wrap_same_submodule,
         )
 
+    @skip_if_lt_x_gpu(2)
+    def test_checkpoint_fsdp_submodules_use_reentrant(self):
+        with self.assertRaisesRegex(
+            AssertionError,
+            "Expects `Tensor` to have been saved in forward",
+        ):
+            self._test_checkpoint_fsdp_submodules(True)
+
+    @skip_if_lt_x_gpu(2)
+    def test_checkpoint_fsdp_submodules_non_reentrant(self):
+        self._test_checkpoint_fsdp_submodules(False)
+
+
+instantiate_parametrized_tests(TestFSDPCheckpoint)
+
 
 if __name__ == "__main__":
     run_tests()

From 12de3c4c4564ff113a242b6b1d189af8ae679848 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Fri, 2 Dec 2022 20:35:01 +0000
Subject: [PATCH 1546/1922] [Easy] Remove unused parametrization (#90079)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90079
Approved by: https://github.com/awgu
---
 test/distributed/_composable/test_fully_shard.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/test/distributed/_composable/test_fully_shard.py b/test/distributed/_composable/test_fully_shard.py
index 32883d4c265d7..8f1ba437bf964 100644
--- a/test/distributed/_composable/test_fully_shard.py
+++ b/test/distributed/_composable/test_fully_shard.py
@@ -14,7 +14,6 @@
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
     run_tests,
     TEST_WITH_DEV_DBG_ASAN,
 )
@@ -251,7 +250,5 @@ def test_training(self):
             self.assertEqual(losses[0], losses[1])
 
 
-instantiate_parametrized_tests(TestFSDPInitialization)
-
 if __name__ == "__main__":
     run_tests()

From 789c75050520807aa4a8847adb2d52ffd506c772 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sat, 3 Dec 2022 03:10:06 +0000
Subject: [PATCH 1547/1922] [vision hash update] update the pinned vision hash
 (#90095)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90095
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index f18c091c93b2e..3b388adc51501 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-790f1cdcea0359619adfc9ec37b91883748d1854
+01c11a0564b8417561ae4c414fe659fc97476987

From 420f4a7fe1f7a1265024811e0300167ec1f90287 Mon Sep 17 00:00:00 2001
From: eqy <eddiey@nvidia.com>
Date: Sat, 3 Dec 2022 05:28:21 +0000
Subject: [PATCH 1548/1922] Reduce memory usage requirement of
 `test_pdist_norm_large` in `test_torch.py` (#90075)

Basically the same fix as #85373, `/usr/bin/time` indicates that the memory requirement on the host-side was actually ~64GiB before the workaround and ~30GiB after.

CC @ptrblck @davidberard98

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90075
Approved by: https://github.com/davidberard98
---
 test/test_torch.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_torch.py b/test/test_torch.py
index 102370aea2be2..6224c09977749 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -3914,10 +3914,9 @@ def test_dim_function_empty(self, device):
 
     # FIXME: find a test suite for the pdist operator
     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "sandcastle OOM with current tpx gpu/re configuration")
-    @unittest.skipIf(is_cuda_sm86, "OOMs on sm86 configuration")
     @skipIfRocm
     @onlyCUDA
-    @largeTensorTest('12GB', device='cpu')
+    @largeTensorTest('32GB', device='cpu')
     @largeTensorTest('5GB', device='cuda')
     def test_pdist_norm_large(self, device):
         # use dim0>=46342 for forward, see:
@@ -3927,7 +3926,8 @@ def test_pdist_norm_large(self, device):
         # Will require 1249975000 float32s
         expected_cpu = torch.pdist(x, p=2)                  # ~1250M * 4 bytes = 5 GB on CPU
         actual_gpu = torch.pdist(x.to(device), p=2)         # 5 GB on GPU
-        self.assertEqual(expected_cpu, actual_gpu.cpu())    # Another 5 GB on CPU + 1.25GB for expected == actual
+        # Workaround for large memory overhead of self.assertTrue (see #84944)
+        self.assertTrue(torch.allclose(expected_cpu, actual_gpu.cpu()))
 
     # FIXME: move to elementwise ternary test suite
     @onlyNativeDeviceTypes

From 75f88fa0157b86d0c0167c510aa945ad951b3697 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Fri, 2 Dec 2022 19:30:10 -0800
Subject: [PATCH 1549/1922] [Reland] Move functorch/_src to torch/_functorch
 (#88756) (#90091)

This will be the last disruptive functorch internals change.

Why are we moving these files?
- As a part of rationalizing functorch we are moving the code in
functorch/_src to torch/_functorch
- This is so that we can offer the functorch APIs as native PyTorch APIs
(coming soon) and resolve some internal build issues.

Why are we moving all of these files at once?
- It's better to break developers all at once rather than many times

Test Plan:
- wait for tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90091
Approved by: https://github.com/anijain2305, https://github.com/ezyang
---
 .lintrunner.toml                                 | 10 ++++++++++
 benchmarks/dynamo/common.py                      |  2 +-
 functorch/__init__.py                            | 10 +++++-----
 functorch/_src/__init__.py                       |  5 -----
 functorch/_src/aot_autograd/__init__.py          |  8 ++++++++
 functorch/_src/eager_transforms/__init__.py      |  7 +++++++
 functorch/_src/make_functional/__init__.py       |  4 ++++
 functorch/_src/vmap/__init__.py                  | 16 ++++++++++++++++
 functorch/benchmarks/chrome_trace_parser.py      |  2 +-
 functorch/benchmarks/cse.py                      |  2 +-
 functorch/compile/__init__.py                    | 12 ++++++------
 functorch/experimental/__init__.py               |  4 ++--
 test/dynamo/test_aot_cudagraphs.py               |  6 +++---
 test/dynamo/test_repros.py                       |  8 ++++----
 test/functorch/discover_coverage.py              |  2 +-
 test/functorch/test_aotdispatch.py               | 16 ++++++++--------
 test/functorch/test_eager_transforms.py          |  4 ++--
 test/functorch/test_memory_efficient_fusion.py   |  2 +-
 test/functorch/test_minifier.py                  |  2 +-
 test/functorch/test_ops.py                       |  2 +-
 test/functorch/test_vmap.py                      |  2 +-
 test/inductor/test_torchinductor.py              |  2 +-
 test/test_functionalization.py                   |  2 +-
 torch/_dynamo/debug_utils.py                     |  2 +-
 torch/_dynamo/eval_frame.py                      |  2 +-
 torch/_dynamo/optimizations/training.py          |  4 ++--
 torch/_functorch/__init__.py                     |  5 +++++
 .../_src => torch/_functorch}/aot_autograd.py    |  0
 .../_src => torch/_functorch}/benchmark_utils.py |  0
 .../_src => torch/_functorch}/compile_utils.py   |  0
 .../_src => torch/_functorch}/compilers.py       |  0
 {functorch/_src => torch/_functorch}/config.py   |  0
 .../_functorch}/eager_transforms.py              |  0
 .../_src => torch/_functorch}/fx_minifier.py     |  0
 .../_src => torch/_functorch}/make_functional.py |  0
 .../_functorch}/named_members_polyfill.py        |  0
 .../_src => torch/_functorch}/partitioners.py    |  0
 .../_src => torch/_functorch}/python_key.py      |  0
 .../_src => torch/_functorch}/pytree_hacks.py    |  0
 .../_functorch}/top_operators_github_usage.py    |  0
 {functorch/_src => torch/_functorch}/vmap.py     |  0
 torch/_inductor/compile_fx.py                    |  4 ++--
 42 files changed, 96 insertions(+), 51 deletions(-)
 create mode 100644 functorch/_src/aot_autograd/__init__.py
 create mode 100644 functorch/_src/eager_transforms/__init__.py
 create mode 100644 functorch/_src/make_functional/__init__.py
 create mode 100644 functorch/_src/vmap/__init__.py
 rename {functorch/_src => torch/_functorch}/aot_autograd.py (100%)
 rename {functorch/_src => torch/_functorch}/benchmark_utils.py (100%)
 rename {functorch/_src => torch/_functorch}/compile_utils.py (100%)
 rename {functorch/_src => torch/_functorch}/compilers.py (100%)
 rename {functorch/_src => torch/_functorch}/config.py (100%)
 rename {functorch/_src => torch/_functorch}/eager_transforms.py (100%)
 rename {functorch/_src => torch/_functorch}/fx_minifier.py (100%)
 rename {functorch/_src => torch/_functorch}/make_functional.py (100%)
 rename {functorch/_src => torch/_functorch}/named_members_polyfill.py (100%)
 rename {functorch/_src => torch/_functorch}/partitioners.py (100%)
 rename {functorch/_src => torch/_functorch}/python_key.py (100%)
 rename {functorch/_src => torch/_functorch}/pytree_hacks.py (100%)
 rename {functorch/_src => torch/_functorch}/top_operators_github_usage.py (100%)
 rename {functorch/_src => torch/_functorch}/vmap.py (100%)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 10787ce672263..714234b1ad7ed 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -101,6 +101,16 @@ exclude_patterns = [
     'torch/csrc/**',
     'torch/_dynamo/**/*.py',
     'torch/_inductor/**/*.py',
+    'torch/_functorch/aot_autograd.py',
+    'torch/_functorch/benchmark_utils.py',
+    'torch/_functorch/compile_utils.py',
+    'torch/_functorch/compilers.py',
+    'torch/_functorch/eager_transforms.py',
+    'torch/_functorch/fx_minifier.py',
+    'torch/_functorch/partitioners.py',
+    'torch/_functorch/make_functional.py',
+    'torch/_functorch/top_operators_github_usage.py',
+    'torch/_functorch/vmap.py',
     'torch/distributed/elastic/agent/server/api.py',
     'torch/testing/_internal/**',
     'torch/distributed/fsdp/fully_sharded_data_parallel.py',
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 3e469e509680a..b2937e4c45994 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -23,13 +23,13 @@
 import torch._dynamo
 import torch._dynamo.utils
 import torch.distributed
-from functorch._src.aot_autograd import set_model_name
 from scipy.stats import gmean, ttest_ind
 from torch._dynamo.optimizations import backends
 from torch._dynamo.optimizations.log_args import conv_args_analysis
 from torch._dynamo.profiler import fx_insert_profiling, Profiler
 from torch._dynamo.testing import dummy_fx_compile, format_speedup, same
 from torch._dynamo.utils import clone_inputs
+from torch._functorch.aot_autograd import set_model_name
 from torch._inductor import config as inductor_config
 from torch._inductor.utils import fresh_inductor_cache
 from torch._subclasses.fake_tensor import FakeTensorMode
diff --git a/functorch/__init__.py b/functorch/__init__.py
index 971ce793d7203..c02ae3c443b6f 100644
--- a/functorch/__init__.py
+++ b/functorch/__init__.py
@@ -8,19 +8,19 @@
 
 # Top-level APIs. Please think carefully before adding something to the
 # top-level namespace:
-# - private helper functions should go into functorch._src
+# - private helper functions should go into torch._functorch
 # - very experimental things should go into functorch.experimental
 # - compilation related things should go into functorch.compile
 
 # functorch transforms
-from ._src.vmap import vmap
-from ._src.eager_transforms import (
+from torch._functorch.vmap import vmap
+from torch._functorch.eager_transforms import (
     grad, grad_and_value, vjp, jacrev, jvp, jacfwd, hessian, functionalize
 )
-from ._src.python_key import make_fx
+from torch._functorch.python_key import make_fx
 
 # utilities. Maybe these should go in their own namespace in the future?
-from ._src.make_functional import (
+from torch._functorch.make_functional import (
     make_functional_with_buffers,
     make_functional,
     combine_state_for_ensemble,
diff --git a/functorch/_src/__init__.py b/functorch/_src/__init__.py
index 10a55772ab58b..e69de29bb2d1d 100644
--- a/functorch/_src/__init__.py
+++ b/functorch/_src/__init__.py
@@ -1,5 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/functorch/_src/aot_autograd/__init__.py b/functorch/_src/aot_autograd/__init__.py
new file mode 100644
index 0000000000000..94f258df84ba8
--- /dev/null
+++ b/functorch/_src/aot_autograd/__init__.py
@@ -0,0 +1,8 @@
+# This file has moved to under torch/_functorch. It is not public API.
+# If you are not a PyTorch developer and you are relying on the following
+# imports, please file an issue.
+from torch._functorch.aot_autograd import (
+    aot_autograd_decompositions,
+    KNOWN_TYPES,
+    PytreeThunk,
+)
diff --git a/functorch/_src/eager_transforms/__init__.py b/functorch/_src/eager_transforms/__init__.py
new file mode 100644
index 0000000000000..e3e587c0978fa
--- /dev/null
+++ b/functorch/_src/eager_transforms/__init__.py
@@ -0,0 +1,7 @@
+# This file has moved to under torch/_functorch. It is not public API.
+# If you are not a PyTorch developer and you are relying on the following
+# imports, please file an issue.
+from torch._functorch.eager_transforms import (
+    _unwrap_functional_tensor,
+    _assert_wrapped_functional,
+)
diff --git a/functorch/_src/make_functional/__init__.py b/functorch/_src/make_functional/__init__.py
new file mode 100644
index 0000000000000..3de7787df0c33
--- /dev/null
+++ b/functorch/_src/make_functional/__init__.py
@@ -0,0 +1,4 @@
+# This file has moved to under torch/_functorch. It is not public API.
+# If you are not a PyTorch developer and you are relying on the following
+# imports, please file an issue.
+from torch._functorch.make_functional import _swap_state
diff --git a/functorch/_src/vmap/__init__.py b/functorch/_src/vmap/__init__.py
new file mode 100644
index 0000000000000..792a2fde38bb3
--- /dev/null
+++ b/functorch/_src/vmap/__init__.py
@@ -0,0 +1,16 @@
+# This file has moved to under torch/_functorch. It is not public API.
+# If you are not a PyTorch developer and you are relying on the following
+# imports, please file an issue.
+from torch._functorch.vmap import (
+    _add_batch_dim,
+    _broadcast_to_and_flatten,
+    _get_name,
+    _remove_batch_dim,
+    _validate_and_get_batch_size,
+    Tensor,
+    tree_flatten,
+    tree_unflatten,
+    _process_batched_inputs,
+    _create_batched_inputs,
+    _unwrap_batched,
+)
diff --git a/functorch/benchmarks/chrome_trace_parser.py b/functorch/benchmarks/chrome_trace_parser.py
index 54d2bf1447fb1..ccc8b89544bc3 100755
--- a/functorch/benchmarks/chrome_trace_parser.py
+++ b/functorch/benchmarks/chrome_trace_parser.py
@@ -5,7 +5,7 @@
 import logging
 import pandas as pd
 
-from functorch._src.benchmark_utils import compute_utilization
+from torch._functorch.benchmark_utils import compute_utilization
 
 # process the chrome traces output by the pytorch profiler
 # require the json input file's name to be in format {model_name}_chrome_trace_*.json
diff --git a/functorch/benchmarks/cse.py b/functorch/benchmarks/cse.py
index 028677d6ee259..14cde14eb3085 100644
--- a/functorch/benchmarks/cse.py
+++ b/functorch/benchmarks/cse.py
@@ -3,7 +3,7 @@
 from functorch import make_fx
 from torch.profiler import profile, ProfilerActivity
 
-from functorch._src.compile_utils import fx_graph_cse
+from torch._functorch.compile_utils import fx_graph_cse
 
 def profile_it(f, inp):
     for _ in range(5):
diff --git a/functorch/compile/__init__.py b/functorch/compile/__init__.py
index 12549dceda9fb..569c1b6819bdd 100644
--- a/functorch/compile/__init__.py
+++ b/functorch/compile/__init__.py
@@ -1,6 +1,6 @@
-from .._src.python_key import pythonkey_decompose
-from .._src.fx_minifier import minifier
-from .._src.aot_autograd import (
+from torch._functorch.python_key import pythonkey_decompose
+from torch._functorch.fx_minifier import minifier
+from torch._functorch.aot_autograd import (
     aot_function,
     aot_module,
     compiled_function,
@@ -12,7 +12,7 @@
     make_boxed_func,
     make_boxed_compiler
 )
-from .._src.compilers import (
+from torch._functorch.compilers import (
     ts_compile,
     draw_graph_compile,
     nop,
@@ -22,10 +22,10 @@
     print_compile,
     default_decompositions
 )
-from .._src.partitioners import (
+from torch._functorch.partitioners import (
     min_cut_rematerialization_partition,
     default_partition,
     draw_graph,
     draw_joint_graph,
 )
-from .._src import config
+from torch._functorch import config
diff --git a/functorch/experimental/__init__.py b/functorch/experimental/__init__.py
index 3a4c92ffbe7a5..dde503f93bb62 100644
--- a/functorch/experimental/__init__.py
+++ b/functorch/experimental/__init__.py
@@ -1,5 +1,5 @@
 # PyTorch forward-mode is not mature yet
-from .._src.eager_transforms import hessian, jacfwd, jvp
-from .._src.vmap import chunk_vmap
+from torch._functorch.eager_transforms import hessian, jacfwd, jvp
+from torch._functorch.vmap import chunk_vmap
 from .batch_norm_replacement import replace_all_batch_norm_modules_
 from functorch import functionalize
diff --git a/test/dynamo/test_aot_cudagraphs.py b/test/dynamo/test_aot_cudagraphs.py
index 5b2e6eb2f9eac..5299e92a060f7 100644
--- a/test/dynamo/test_aot_cudagraphs.py
+++ b/test/dynamo/test_aot_cudagraphs.py
@@ -104,7 +104,7 @@ def fn(x, y):
         y = torch.randn((), device="cpu")
         fn(x, y)
 
-    @patch("functorch._src.config.use_functionalize", True)
+    @patch("torch._functorch.config.use_functionalize", True)
     def test_mutate_input(self):
         def model(x, y):
             y.add_(3)
@@ -159,7 +159,7 @@ def fn(y):
         y = torch.randn(3, device="cuda:0", requires_grad=True)
         fn(y)
 
-    @patch("functorch._src.config.use_functionalize", True)
+    @patch("torch._functorch.config.use_functionalize", True)
     @patch_all()
     def test_mutated_metadata(self):
         # more tortured example at
@@ -180,7 +180,7 @@ def fn(x):
         x = torch.empty(0, device="cuda:0")
         fn(x)
 
-    @patch("functorch._src.config.use_functionalize", True)
+    @patch("torch._functorch.config.use_functionalize", True)
     @patch_all()
     def test_dead_fill(self):
         def model(x):
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 2c0a7acb5ccc6..acb285a6cf26b 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -11,8 +11,6 @@
 from typing import List
 from unittest.mock import patch
 
-import functorch._src.config
-
 import numpy as np
 import torch
 
@@ -20,6 +18,8 @@
 import torch._dynamo.testing
 import torch._dynamo.utils
 
+import torch._functorch.config
+
 try:
     from test_minifier import requires_cuda
 except ImportError:
@@ -1681,7 +1681,7 @@ def fn(x):
         opt_fn(x)
         self.assertEqual(cnt.frame_count, 1)
 
-    @patch.object(functorch._src.config, "use_dynamic_shapes", True)
+    @patch.object(torch._functorch.config, "use_dynamic_shapes", True)
     def test_bigbird_unsqueeze_inplace(self):
         def fn(reshape_2):
             view_2 = reshape_2.clone()
@@ -2071,7 +2071,7 @@ def f(x):
         with self.assertRaisesRegex(torch._dynamo.exc.Unsupported, "generic_jump"):
             torch._dynamo.export(f, torch.Tensor([3, 4, 5]))
 
-    @patch.object(functorch._src.config, "use_dynamic_shapes", True)
+    @patch.object(torch._functorch.config, "use_dynamic_shapes", True)
     def test_batchnorm_e2e(self):
         class Repro(torch.nn.Module):
             def __init__(self):
diff --git a/test/functorch/discover_coverage.py b/test/functorch/discover_coverage.py
index e52f317087b4c..3f4f74b9224de 100644
--- a/test/functorch/discover_coverage.py
+++ b/test/functorch/discover_coverage.py
@@ -3,7 +3,7 @@
 from torch.testing._internal.common_methods_invocations import op_db
 from functorch_additional_op_db import additional_op_db
 from enum import Enum
-import functorch._src.top_operators_github_usage as top_ops
+import torch._functorch.top_operators_github_usage as top_ops
 import pprint
 import unittest
 import enum
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index a7246a2f09c1a..2434e35ab4871 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -22,7 +22,7 @@
     grad, vjp, vmap, jacrev,
     make_fx
 )
-from functorch._src.aot_autograd import aot_module_simplified
+from torch._functorch.aot_autograd import aot_module_simplified
 from functorch.compile import (
     nnc_jit, compiled_function, compiled_module,
     min_cut_rematerialization_partition, aot_function, aot_module,
@@ -991,7 +991,7 @@ def f(a, b, c):
         inp = [torch.randn(5, requires_grad=True) for _ in range(3)]
         f(*inp).sum().backward()
 
-    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
     def test_compilation_context(self, counter):
         def f(x):
             return x.sin().sin()
@@ -1025,8 +1025,8 @@ def f(x, y):
         x = torch.randn(3, 3, requires_grad=True).clone()
         self.verify_aot_autograd(f, [x, x])
 
-    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
-    @patch("functorch._src.config.debug_assert", True)
+    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch("torch._functorch.config.debug_assert", True)
     def test_invalid_dupe_left_bias(self, counter):
         # This test checks that, just because only the first
         # argument did a metadata mutation, we still correctly
@@ -1047,8 +1047,8 @@ def forward(self, x, y):
             """At compilation time, graph 1 was compiled under the assumption that input 1 would be a duplicate of input 0, but at runtime this was not the case.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
         )
 
-    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
-    @patch("functorch._src.config.debug_assert", True)
+    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch("torch._functorch.config.debug_assert", True)
     def test_invalid_dupe(self, counter):
         class F(torch.nn.Module):
             def forward(self, x, y):
@@ -1070,8 +1070,8 @@ def forward(self, x, y):
             """At compilation time, graph 1 was compiled under the assumption that input 1 would be a duplicate of input 0, but at runtime this was not the case.  This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch."""  # noqa: B950
         )
 
-    @patch('functorch._src.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
-    @patch("functorch._src.config.debug_assert", True)
+    @patch('torch._functorch.aot_autograd.AOT_COUNTER', new_callable=itertools.count)
+    @patch("torch._functorch.config.debug_assert", True)
     def test_invalid_requires_grad(self, counter):
         class F(torch.nn.Module):
             def forward(self, x, y):
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index e123da0d9d3c9..e9d0cbfb4f919 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -32,10 +32,10 @@
     jvp, make_functional, make_functional_with_buffers,
     combine_state_for_ensemble, make_fx
 )
-from functorch._src.make_functional import (
+from torch._functorch.make_functional import (
     functional_init, functional_init_with_buffers,
 )
-from functorch._src.eager_transforms import enable_fwd_grad, _slice_argnums
+from torch._functorch.eager_transforms import enable_fwd_grad, _slice_argnums
 from functorch.experimental import functionalize
 from torch._ops import PyOperator
 from torch._functorch.utils import enable_autograd_function
diff --git a/test/functorch/test_memory_efficient_fusion.py b/test/functorch/test_memory_efficient_fusion.py
index b0f18f06b8295..e12da51004504 100644
--- a/test/functorch/test_memory_efficient_fusion.py
+++ b/test/functorch/test_memory_efficient_fusion.py
@@ -6,7 +6,7 @@
 from functorch import make_fx
 from torch.nn import functional as F
 from functorch.compile import memory_efficient_fusion
-from functorch._src.compile_utils import fx_graph_cse
+from torch._functorch.compile_utils import fx_graph_cse
 from torch.testing._internal.common_utils import TestCase, run_tests
 import inspect
 import random
diff --git a/test/functorch/test_minifier.py b/test/functorch/test_minifier.py
index 49af42795592d..7ed13921d9077 100644
--- a/test/functorch/test_minifier.py
+++ b/test/functorch/test_minifier.py
@@ -2,7 +2,7 @@
 
 import torch
 from functorch.compile import minifier
-from functorch._src.compile_utils import get_placeholders, get_outputs
+from torch._functorch.compile_utils import get_placeholders, get_outputs
 from functorch import make_fx
 from torch.testing._internal.common_utils import TestCase, run_tests
 
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 36fe608fe496d..fbee1872ddf3e 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -41,7 +41,7 @@
 from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map
 from functorch import grad, vjp, vmap, jacrev, jacfwd
 import torch.autograd.forward_ad as fwAD
-from functorch._src.eager_transforms import _as_tuple, jvp
+from torch._functorch.eager_transforms import _as_tuple, jvp
 
 aten = torch.ops.aten
 
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 4b460560d8a90..dcad523217f3f 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -49,7 +49,7 @@
 from functorch import vmap, grad, grad_and_value, jvp, vjp, jacfwd
 from functorch.experimental import chunk_vmap
 from torch._C._functorch import reshape_dim_into, reshape_dim_outof
-from functorch._src.make_functional import functional_init_with_buffers
+from torch._functorch.make_functional import functional_init_with_buffers
 
 FALLBACK_REGEX = 'There is a performance drop'
 
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index def9b95d77c2d..12f3bf2120e94 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -5730,7 +5730,7 @@ def noop_backend(
                 Instead, it transforms the fx graph so that its functions are
                 aten operations. It then saves this graph.
                 """
-                from functorch._src.aot_autograd import Interpreter
+                from torch._functorch.aot_autograd import Interpreter
                 from torch._inductor.decomposition import select_decomp_table
                 from torch._subclasses import FakeTensorMode
 
diff --git a/test/test_functionalization.py b/test/test_functionalization.py
index cf1d6b0145358..ec1a0caa804c4 100644
--- a/test/test_functionalization.py
+++ b/test/test_functionalization.py
@@ -157,7 +157,7 @@ def f(input):
 
         def g(x):
             loss = f(x).sum()
-            from functorch._src.aot_autograd import setup_stacktrace_preservation_hooks
+            from torch._functorch.aot_autograd import setup_stacktrace_preservation_hooks
             import torch.fx.traceback as fx_traceback
             setup_stacktrace_preservation_hooks([loss.grad_fn])
             with fx_traceback.override_stack_trace():
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 739c7b56f916e..c39318b99d2c7 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -502,7 +502,7 @@ def run_fwd_maybe_bwd(gm, args, only_fwd=False):
     """
     Runs a forward and possibly backward iteration for a given mod and args.
     """
-    from functorch._src.aot_autograd import make_boxed_func
+    from torch._functorch.aot_autograd import make_boxed_func
 
     from .testing import collect_results, reduce_to_scalar_loss, requires_bwd_pass
 
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 556a3b4912ebf..9edcb61aaaaad 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -120,7 +120,7 @@ def enable_dynamic(enable: bool = True):
         yield
         return
     with patch("torch._dynamo.config.dynamic_shapes", True), patch(
-        "functorch._src.config.use_dynamic_shapes", True
+        "torch._functorch.config.use_dynamic_shapes", True
     ):
         yield
 
diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index 7013fcdf3107f..a1b22a7f6c313 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -6,8 +6,6 @@
 from importlib import import_module
 from typing import Set
 
-from functorch._src.compilers import debug_nop
-
 from functorch.compile import (
     aot_module_simplified,
     min_cut_rematerialization_partition,
@@ -16,6 +14,8 @@
 )
 
 import torch
+
+from torch._functorch.compilers import debug_nop
 from torch.fx import GraphModule
 from torch.fx.passes.backends.cudagraphs import partition_cudagraphs
 from torch.multiprocessing.reductions import StorageWeakRef
diff --git a/torch/_functorch/__init__.py b/torch/_functorch/__init__.py
index e69de29bb2d1d..10a55772ab58b 100644
--- a/torch/_functorch/__init__.py
+++ b/torch/_functorch/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/functorch/_src/aot_autograd.py b/torch/_functorch/aot_autograd.py
similarity index 100%
rename from functorch/_src/aot_autograd.py
rename to torch/_functorch/aot_autograd.py
diff --git a/functorch/_src/benchmark_utils.py b/torch/_functorch/benchmark_utils.py
similarity index 100%
rename from functorch/_src/benchmark_utils.py
rename to torch/_functorch/benchmark_utils.py
diff --git a/functorch/_src/compile_utils.py b/torch/_functorch/compile_utils.py
similarity index 100%
rename from functorch/_src/compile_utils.py
rename to torch/_functorch/compile_utils.py
diff --git a/functorch/_src/compilers.py b/torch/_functorch/compilers.py
similarity index 100%
rename from functorch/_src/compilers.py
rename to torch/_functorch/compilers.py
diff --git a/functorch/_src/config.py b/torch/_functorch/config.py
similarity index 100%
rename from functorch/_src/config.py
rename to torch/_functorch/config.py
diff --git a/functorch/_src/eager_transforms.py b/torch/_functorch/eager_transforms.py
similarity index 100%
rename from functorch/_src/eager_transforms.py
rename to torch/_functorch/eager_transforms.py
diff --git a/functorch/_src/fx_minifier.py b/torch/_functorch/fx_minifier.py
similarity index 100%
rename from functorch/_src/fx_minifier.py
rename to torch/_functorch/fx_minifier.py
diff --git a/functorch/_src/make_functional.py b/torch/_functorch/make_functional.py
similarity index 100%
rename from functorch/_src/make_functional.py
rename to torch/_functorch/make_functional.py
diff --git a/functorch/_src/named_members_polyfill.py b/torch/_functorch/named_members_polyfill.py
similarity index 100%
rename from functorch/_src/named_members_polyfill.py
rename to torch/_functorch/named_members_polyfill.py
diff --git a/functorch/_src/partitioners.py b/torch/_functorch/partitioners.py
similarity index 100%
rename from functorch/_src/partitioners.py
rename to torch/_functorch/partitioners.py
diff --git a/functorch/_src/python_key.py b/torch/_functorch/python_key.py
similarity index 100%
rename from functorch/_src/python_key.py
rename to torch/_functorch/python_key.py
diff --git a/functorch/_src/pytree_hacks.py b/torch/_functorch/pytree_hacks.py
similarity index 100%
rename from functorch/_src/pytree_hacks.py
rename to torch/_functorch/pytree_hacks.py
diff --git a/functorch/_src/top_operators_github_usage.py b/torch/_functorch/top_operators_github_usage.py
similarity index 100%
rename from functorch/_src/top_operators_github_usage.py
rename to torch/_functorch/top_operators_github_usage.py
diff --git a/functorch/_src/vmap.py b/torch/_functorch/vmap.py
similarity index 100%
rename from functorch/_src/vmap.py
rename to torch/_functorch/vmap.py
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 0a2dba8bdde5d..9e94f37ef89a8 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -6,10 +6,10 @@
 from typing import List
 
 import functorch
-from functorch._src.aot_autograd import make_boxed_func
 from functorch.compile import min_cut_rematerialization_partition
 
 import torch.fx
+from torch._functorch.aot_autograd import make_boxed_func
 from torch._subclasses.fake_tensor import FakeTensor
 
 from . import config, metrics, overrides
@@ -393,7 +393,7 @@ def bw_compiler(model: torch.fx.GraphModule, example_inputs):
     with overrides.patch_functions():
 
         # TODO: can add logging before/after the call to create_aot_dispatcher_function
-        # in functorch/_src/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
+        # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func
         # once torchdynamo is merged into pytorch
         return aot_autograd(
             fw_compiler=fw_compiler,

From bb2c0c23096702cfe1b0613c52033f7fc1cf4e8c Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Sat, 3 Dec 2022 03:29:32 +0000
Subject: [PATCH 1550/1922] [FSDP] Issue warning when clamping to `NO_SHARD`
 (#90060)

Fixes https://github.com/pytorch/pytorch/issues/90050. I hope that this was not meant as an onboarding task :/
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90060
Approved by: https://github.com/zhaojuanmao
---
 test/distributed/fsdp/test_fsdp_misc.py | 44 +++++++++++++++++++++++++
 torch/distributed/fsdp/_init_utils.py   |  6 ++++
 2 files changed, 50 insertions(+)

diff --git a/test/distributed/fsdp/test_fsdp_misc.py b/test/distributed/fsdp/test_fsdp_misc.py
index 8c972f8515634..d1b2445dc78b8 100644
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@@ -2,6 +2,7 @@
 
 import functools
 import sys
+import warnings
 from collections import namedtuple
 from contextlib import suppress
 from copy import deepcopy
@@ -503,6 +504,49 @@ def __init__(self, rank):
             )
 
 
+class TestFSDPMiscWorldSize1(FSDPTest):
+    @property
+    def world_size(self) -> int:
+        return 1
+
+    @skip_if_lt_x_gpu(1)
+    def test_world_size_1_sharding_strategy_warning(self):
+        """
+        Tests that FSDP issues a warning when it switches to using ``NO_SHARD``
+        when the world size is 1.
+        """
+        warning_prefix = "FSDP is switching to use `NO_SHARD` instead of"
+        # If the user already passes `NO_SHARD`, then there should not be a
+        # warning
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")  # trigger all warnings
+            FSDP(nn.Linear(3, 3).cuda(), sharding_strategy=ShardingStrategy.NO_SHARD)
+            for warning in w:
+                self.assertTrue(
+                    warning.category != UserWarning
+                    or not str(warning.message).startswith(warning_prefix)
+                )
+
+        # Check that a warning is issued
+        warning_suffix = " since the world size is 1."
+        # - Pass `FULL_SHARD` or `None`
+        expected_regex_full_shard = (
+            warning_prefix + " " + str(ShardingStrategy.FULL_SHARD) + warning_suffix
+        )
+        with self.assertWarnsRegex(UserWarning, expected_regex_full_shard):
+            FSDP(nn.Linear(3, 3).cuda(), sharding_strategy=ShardingStrategy.FULL_SHARD)
+        with self.assertWarnsRegex(UserWarning, expected_regex_full_shard):
+            FSDP(nn.Linear(3, 3).cuda())
+        # - Pass `SHARD_GRAD_OP`
+        expected_regex_shard_grad_op = (
+            warning_prefix + " " + str(ShardingStrategy.SHARD_GRAD_OP) + warning_suffix
+        )
+        with self.assertWarnsRegex(UserWarning, expected_regex_shard_grad_op):
+            FSDP(
+                nn.Linear(3, 3).cuda(), sharding_strategy=ShardingStrategy.SHARD_GRAD_OP
+            )
+
+
 instantiate_parametrized_tests(TestFSDPMisc)
 
 if __name__ == "__main__":
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 75750e8717ef5..c2f759eb4a956 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -135,6 +135,12 @@ def _init_core_state(
     # currently functionally equivalent. This may change if/when we integrate
     # FSDP with MoE.
     if state.world_size == 1:
+        if sharding_strategy != ShardingStrategy.NO_SHARD:
+            warnings.warn(
+                "FSDP is switching to use `NO_SHARD` instead of "
+                f"{sharding_strategy or ShardingStrategy.FULL_SHARD} since "
+                "the world size is 1."
+            )
         sharding_strategy = ShardingStrategy.NO_SHARD
     state.sharding_strategy = sharding_strategy or ShardingStrategy.FULL_SHARD
     state.mixed_precision = mixed_precision or MixedPrecision()

From bbfeb8a0f63ec389b8304d28ec6d4e3b733f58f1 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Sat, 3 Dec 2022 14:48:39 +0000
Subject: [PATCH 1551/1922] [FSDP][Easy] Remove unused methods (#89229)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89229
Approved by: https://github.com/mrshenli
---
 .../fsdp/fully_sharded_data_parallel.py       | 132 ------------------
 torch/testing/_internal/common_fsdp.py        |   7 +-
 2 files changed, 1 insertion(+), 138 deletions(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index e602727013e03..d70146ba0665c 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -32,7 +32,6 @@
     _get_param_to_fqns,
     FSDP_PREFIX,
     FSDP_WRAPPED_MODULE,
-    HandleTrainingState,
     TrainingState,
 )
 from torch.distributed.fsdp._dynamo_utils import _annotate_modules_for_dynamo
@@ -54,9 +53,7 @@
     _post_forward_reshard,
     _pre_forward,
     _pre_forward_unshard,
-    _reshard,
     _root_pre_forward,
-    _should_free_in_backward,
 )
 from torch.distributed.fsdp._wrap_utils import _auto_wrap
 from torch.distributed.fsdp.api import (
@@ -939,135 +936,6 @@ def named_parameters(
                 param_name = param_name.replace(FSDP_PREFIX, "")
             yield (param_name, param)
 
-    @torch.no_grad()
-    def _wait_for_post_backward(self) -> None:
-        """Wait for post-backward to finish. Only called on root instance."""
-        assert self._is_root, "_wait_for_post_backward can only be called on root."
-        # Root's training state might be backward_pre or backward_post depending on
-        # if root parameter's post backward hook was called. The post-backward hook
-        # may not have been called if gradient was not computed for this param/FSDP
-        # module.
-
-        if self._sync_gradients:
-            torch.cuda.current_stream().wait_stream(self._streams["post_backward"])
-            if self.cpu_offload.offload_params:
-                # We need to wait for the non-blocking GPU ->
-                # CPU grad transfers to finish. We need to do this for GPU -> CPU
-                # copies because when grad is on CPU, it won't wait for any CUDA
-                # stream to finish GPU -> CPU copies unless we explicitly block the
-                # host-side with synchronize().
-                torch.cuda.current_stream().synchronize()
-        self._exec_order_data.next_iter()
-
-        # A backward pass is done, clean up below.
-        def _catch_all_reshard(fsdp_module: FullyShardedDataParallel) -> None:
-            """
-            Reshards full parameters that may have not been resharded in
-            post_backward_hook. This can happen when an FSDP module's output
-            is used in forward so its pre-backward fires unsharding the param,
-            but post-backward does not fire since the output was not ultimately
-            used in loss computation so FSDP parameter did not get a gradient.
-            """
-            # Note that we wrap resharding logic in a try-catch as a defensive
-            # approach, as if an error is thrown, we are in the backwards pass,
-            # and autograd would not print out much useful info about the actual
-            # error hit.
-            try:
-                free_unsharded_flat_params: List[bool] = []
-                handles_to_reshard: List[FlatParamHandle] = []
-                for handle in fsdp_module._handles:
-                    # TODO: This already-resharded check is brittle:
-                    # https://github.com/pytorch/pytorch/issues/83956
-                    already_resharded = (
-                        handle.flat_param.data_ptr()
-                        == handle.flat_param._local_shard.data_ptr()
-                    )
-                    if already_resharded:
-                        continue
-                    free_unsharded_flat_params.append(
-                        _should_free_in_backward(fsdp_module, handle)
-                    )
-                    handles_to_reshard.append(handle)
-                _reshard(self, handles_to_reshard, free_unsharded_flat_params)
-            except Exception as e:
-                p_assert(
-                    False,
-                    f"Got exception while resharding module {fsdp_module}: {str(e)}",
-                    raise_assertion_error=False,
-                )
-                raise e
-
-        def _finalize_params(fsdp_module: FullyShardedDataParallel) -> None:
-            """Helper used below on all fsdp modules."""
-            for handle in fsdp_module._handles:
-                p = handle.flat_param
-                if p.requires_grad:
-                    if hasattr(p, "_post_backward_hook_state"):
-                        p_assert(
-                            len(p._post_backward_hook_state) == 2,  # type: ignore[attr-defined]
-                            "p._post_backward_hook_state fields are not valid.",
-                        )
-                        p._post_backward_hook_state[1].remove()  # type: ignore[attr-defined]
-                        delattr(p, "_post_backward_hook_state")
-                    if not self._sync_gradients:
-                        # Preserve the gradient accumulation state if not
-                        # synchronizing gradients: `p.grad` remains the
-                        # unsharded gradient accumulated from prior `no_sync()`
-                        # iterations, and `p._saved_grad_shard` remains the
-                        # sharded gradient from the last synchronized iteration
-                        continue
-                    handle.prepare_gradient_for_optim()
-                    p_assert(
-                        hasattr(p, "_post_backward_called"),
-                        "Expected flag _post_backward_called to be set on param.",
-                    )
-                    # Reset _post_backward_called in preparation for the next iteration.
-                    p._post_backward_called = False
-
-        # Update root and nested FSDP's hooks and flags.
-        for m in self.fsdp_modules(self):  # includes self
-            _catch_all_reshard(m)
-            _finalize_params(m)
-            m._ran_pre_backward_hook.clear()
-            m.training_state = TrainingState.IDLE
-            for handle in m._handles:
-                handle._training_state = HandleTrainingState.IDLE
-            m._handles_prefetched.clear()
-            if m._is_root:
-                # reset this flag for cases like "one forward pass + multiple backward passes"
-                self._post_backward_callback_queued = False
-
-        if self._use_param_exec_order_policy() and self._param_exec_order_prep_stage:
-            self._param_exec_order_policy_second_iter_init()
-
-    def _param_exec_order_policy_second_iter_init(self) -> None:
-        self._param_exec_order_prep_stage = False
-        # Let the parameters in self._fsdp_params_exec_order ordered based on
-        # the execution order in the forward pass.
-        self._fsdp_params_exec_order.reverse()
-        for m in self.modules():
-            if m is not self and isinstance(m, FullyShardedDataParallel):
-                assert hasattr(
-                    m, "_param_exec_order_policy"
-                ), "Non-root FSDP modules should also have _param_exec_order_policy attribute"
-                assert hasattr(
-                    m, "_param_exec_order_prep_stage"
-                ), "Non-root FSDP modules should also have _param_exec_order_prep_stage attribute"
-                m._param_exec_order_prep_stage = False
-        # TODO (linjianma): Construct a fsdp_wrap_map whose keys are all children modules with a FSDP wrap,
-        # and values are its FSDP wraps. These children FSDP wraps will be detached from the root FSDP module
-        # and will be used to schedule the parameters (rebuild_full_params and reshard).
-        # TODO (linjianma): Remove all internal FSDP wraps from the root FSDP module.
-        # TODO (linjianma): Based on self._fsdp_params_exec_order, get the information
-        # needed to patch the forward() function of each key in the fsdp_wrap_map. The rules are as follows:
-        # 1: Before each forward(), rebuild_full_params of all parameters that are currently sharded and
-        # will be used in the forward, and reshard all parameters that are currently full and will not be
-        # used in the next forward()
-        # 2: After each forward(), reshard all parameters just used in the forward, and rebuild_full_params of
-        # all parameters that will be used next.
-        # TODO (linjianma): Patch the forward of each model in the keys
-        # of fsdp_wrap_map based on the information above.
-
     def _assert_state(self, state: Union[TrainingState, List[TrainingState]]) -> None:
         """Assert we are in the given state."""
         # Since assert can be turned off and this error checking
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index b4650adff569b..0c4063016dc30 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -617,13 +617,8 @@ def _delayed_reshard(*args, **kwargs):
                     )
                     return orig_reshard(*args, **kwargs)
 
-                # The first patch covers any `from torch... import _reshard`
-                # uses in `fully_sharded_data_parallel.py`, and the second
-                # patch covers any `import torch..._reshard` uses in general.
+                # This patch covers any `import torch..._reshard` uses.
                 with mock.patch(
-                    "torch.distributed.fsdp.fully_sharded_data_parallel._reshard",
-                    _delayed_reshard,
-                ), mock.patch(
                     "torch.distributed.fsdp._runtime_utils._reshard", _delayed_reshard
                 ):
                     return self.module(x)

From 5253071d1bc83241229d925ba8680e67eeddad5b Mon Sep 17 00:00:00 2001
From: Jongsoo Park <jongsoo@fb.com>
Date: Sat, 3 Dec 2022 19:54:04 +0000
Subject: [PATCH 1552/1922] Revert D41683102: Multisect successfully blamed
 D41683102 for test or build failures (#90117)

Summary:
This diff is reverting D41683102
D41683102 has been identified to be causing the following test or build failures:
Tests affected:
- https://www.internalfb.com/intern/test/281475051072735/

Here's the Multisect link:
https://www.internalfb.com/intern/testinfra/multisect/1444960
Here are the tasks that are relevant to this breakage:
T124964606: 41 tests started failing for oncall ads_trainer_release in the last 2 weeks
We're generating a revert to back out the changes in this diff, please note the backout may land if someone accepts it.

Test Plan: NA

Reviewed By: jspark1105

Differential Revision: D41710842

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90117
Approved by: https://github.com/soumith
---
 .../ao_migration/test_quantization_fx.py      |  2 +
 test/test_fx.py                               | 30 ------
 torch/ao/quantization/fx/tracer.py            | 88 +++++++++++++++++-
 torch/ao/quantization/quantize_fx.py          | 55 +++++++++++
 torch/fx/_symbolic_trace.py                   | 29 ++----
 torch/fx/proxy.py                             | 93 +------------------
 torch/quantization/quantize_fx.py             |  2 +
 7 files changed, 154 insertions(+), 145 deletions(-)

diff --git a/test/quantization/ao_migration/test_quantization_fx.py b/test/quantization/ao_migration/test_quantization_fx.py
index c71053f4f29c9..c75727717a736 100644
--- a/test/quantization/ao_migration/test_quantization_fx.py
+++ b/test/quantization/ao_migration/test_quantization_fx.py
@@ -11,6 +11,8 @@ def test_function_import_quantize_fx(self):
             '_check_is_graph_module',
             '_swap_ff_with_fxff',
             '_fuse_fx',
+            'Scope',
+            'ScopeContextManager',
             'QuantizationTracer',
             '_prepare_fx',
             '_prepare_standalone_module_fx',
diff --git a/test/test_fx.py b/test/test_fx.py
index 0e6a22a3c9360..0aff631b8e814 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -1679,36 +1679,6 @@ def forward(self, x):
             if node.op in {'placeholder'}:
                 self.assertEqual(node.meta['tensor_meta'].memory_format, torch.channels_last_3d)
 
-    def test_nn_module_stack(self):
-        class SubModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.conv_mod = torch.nn.Conv2d(64, 64, (3, 3), padding=1, bias=False)
-
-            def forward(self, x):
-                return self.conv_mod(x)
-
-        class MyModule(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.sub_mod = SubModule()
-
-            def forward(self, x):
-                return self.sub_mod(x)
-
-        m = MyModule()
-        gm = torch.fx.symbolic_trace(m)
-
-        mod_stack = {}
-        expected_stack = [('sub_mod', str(type(m.sub_mod))),
-                          ('sub_mod.conv_mod', str(type(m.sub_mod.conv_mod)))]
-        for node in gm.graph.nodes:
-            mod_stack = node.meta.get('nn_module_stack', {})
-            if mod_stack:
-                break
-        stack_list = list(mod_stack.items())
-        self.assertEqual(stack_list, expected_stack)
-
     def test_interpreter(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
diff --git a/torch/ao/quantization/fx/tracer.py b/torch/ao/quantization/fx/tracer.py
index 1ac98a13c548e..3a959447cfd6b 100644
--- a/torch/ao/quantization/fx/tracer.py
+++ b/torch/ao/quantization/fx/tracer.py
@@ -1,13 +1,67 @@
 import torch
 from torch.fx._symbolic_trace import Tracer
-from torch.fx.proxy import Scope
+from torch.fx.node import Target, Node, Argument
 from torch.nn.intrinsic import _FusedModule
-from typing import List, Callable
+from typing import List, Callable, Tuple, Any, Dict, Optional
 
 __all__ = [
     "QuantizationTracer",
 ]
 
+class Scope(object):
+    """ Scope object that records the module path and the module type
+    of a module. Scope is used to track the information of the module
+    that contains a Node in a Graph of GraphModule. For example::
+
+        class Sub(torch.nn.Module):
+            def forward(self, x):
+                # This will be a call_method Node in GraphModule,
+                # scope for this would be (module_path="sub", module_type=Sub)
+                return x.transpose(1, 2)
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                self.sub = Sub()
+
+            def forward(self, x):
+                # This will be a call_method Node as well,
+                # scope for this would be (module_path="", None)
+                x = x.transpose(1, 2)
+                x = self.sub(x)
+                return x
+
+    """
+
+    def __init__(self, module_path: str, module_type: Any):
+        super().__init__()
+        self.module_path = module_path
+        self.module_type = module_type
+
+
+class ScopeContextManager(object):
+    """ A context manager to track the Scope of Node during symbolic tracing.
+    When entering a forward function of a Module, we'll update the scope information of
+    the current module, and when we exit, we'll restore the previous scope information.
+    """
+
+    def __init__(
+        self, scope: Scope, current_module: torch.nn.Module, current_module_path: str
+    ):
+        super().__init__()
+        self.prev_module_type = scope.module_type
+        self.prev_module_path = scope.module_path
+        self.scope = scope
+        self.scope.module_path = current_module_path
+        self.scope.module_type = type(current_module)
+
+    def __enter__(self):
+        return
+
+    def __exit__(self, *args):
+        self.scope.module_path = self.prev_module_path
+        self.scope.module_type = self.prev_module_type
+        return
+
 class QuantizationTracer(Tracer):
     def __init__(
         self, skipped_module_names: List[str], skipped_module_classes: List[Callable]
@@ -21,6 +75,7 @@ def __init__(
         # We can change this if there is a use case that configures
         # qconfig using top level module type
         self.scope = Scope("", None)
+        self.node_name_to_scope: Dict[str, Tuple[str, type]] = {}
         self.record_stack_traces = True
 
     def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
@@ -33,3 +88,32 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool
             or type(m) in self.skipped_module_classes
             or isinstance(m, _FusedModule)
         )
+
+    def call_module(
+        self,
+        m: torch.nn.Module,
+        forward: Callable[..., Any],
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> Any:
+        module_qualified_name = self.path_of_module(m)
+        # Creating scope with information of current module
+        # scope will be restored automatically upon exit
+        with ScopeContextManager(self.scope, m, module_qualified_name):
+            return super().call_module(m, forward, args, kwargs)
+
+    def create_node(
+        self,
+        kind: str,
+        target: Target,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Argument],
+        name: Optional[str] = None,
+        type_expr: Optional[Any] = None,
+    ) -> Node:
+        node = super().create_node(kind, target, args, kwargs, name, type_expr)
+        self.node_name_to_scope[node.name] = (
+            self.scope.module_path,
+            self.scope.module_type,
+        )
+        return node
diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py
index c476b754f2aa2..8f26934576580 100644
--- a/torch/ao/quantization/quantize_fx.py
+++ b/torch/ao/quantization/quantize_fx.py
@@ -64,6 +64,61 @@ def _fuse_fx(
         graph_module, is_qat, fuse_custom_config, backend_config)  # type: ignore[operator]
 
 
+class Scope(object):
+    """ Scope object that records the module path and the module type
+    of a module. Scope is used to track the information of the module
+    that contains a Node in a Graph of GraphModule. For example::
+
+        class Sub(torch.nn.Module):
+            def forward(self, x):
+                # This will be a call_method Node in GraphModule,
+                # scope for this would be (module_path="sub", module_type=Sub)
+                return x.transpose(1, 2)
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                self.sub = Sub()
+
+            def forward(self, x):
+                # This will be a call_method Node as well,
+                # scope for this would be (module_path="", None)
+                x = x.transpose(1, 2)
+                x = self.sub(x)
+                return x
+
+    """
+
+    def __init__(self, module_path: str, module_type: Any):
+        super().__init__()
+        self.module_path = module_path
+        self.module_type = module_type
+
+
+class ScopeContextManager(object):
+    """ A context manager to track the Scope of Node during symbolic tracing.
+    When entering a forward function of a Module, we'll update the scope information of
+    the current module, and when we exit, we'll restore the previous scope information.
+    """
+
+    def __init__(
+        self, scope: Scope, current_module: torch.nn.Module, current_module_path: str
+    ):
+        super().__init__()
+        self.prev_module_type = scope.module_type
+        self.prev_module_path = scope.module_path
+        self.scope = scope
+        self.scope.module_path = current_module_path
+        self.scope.module_type = type(current_module)
+
+    def __enter__(self):
+        return
+
+    def __exit__(self, *args):
+        self.scope.module_path = self.prev_module_path
+        self.scope.module_type = self.prev_module_type
+        return
+
+
 def _prepare_fx(
     model: torch.nn.Module,
     qconfig_mapping: Union[QConfigMapping, Dict[str, Any]],
diff --git a/torch/fx/_symbolic_trace.py b/torch/fx/_symbolic_trace.py
index dfa6f5096042d..ff9df1161a700 100644
--- a/torch/fx/_symbolic_trace.py
+++ b/torch/fx/_symbolic_trace.py
@@ -5,7 +5,6 @@
 import math
 import os
 import warnings
-import collections
 from itertools import chain
 from types import CodeType, FunctionType, ModuleType
 from typing import (
@@ -29,7 +28,7 @@
 from .graph import _PyTreeCodeGen, _PyTreeInfo, Graph
 from .graph_module import GraphModule
 from .node import Argument, base_types, map_aggregate
-from .proxy import ParameterProxy, Proxy, TracerBase, Scope, ScopeContextManager
+from .proxy import ParameterProxy, Proxy, TracerBase
 
 HAS_VARSTUFF = inspect.CO_VARARGS | inspect.CO_VARKEYWORDS
 
@@ -45,6 +44,7 @@
 def is_fx_tracing():
     return _is_fx_tracing_flag
 
+
 @compatibility(is_backward_compatible=True)
 class ProxyableClassMeta(type):
     """
@@ -250,13 +250,6 @@ def __init__(
         self.param_shapes_constant = param_shapes_constant
 
         self.submodule_paths: Optional[Dict[torch.nn.Module, str]] = None
-        self.root_module_name: str = ""
-        # Maps the containing module's name to the operator name
-        self.scope = Scope("", None)
-        # Records the module call stack
-        self.module_stack = collections.OrderedDict()
-        # Mapping of node name to module scope
-        self.node_name_to_scope: Dict[str, Tuple[str, type]] = {}
 
     @compatibility(is_backward_compatible=True)
     def create_arg(self, a: Any) -> "Argument":
@@ -437,18 +430,9 @@ def call_module(
             value was returned from the ``Module`` invocation.
         """
         module_qualified_name = self.path_of_module(m)
-        with ScopeContextManager(self.scope, Scope(module_qualified_name, type(m))) as _scope:
-            # module_stack is an ordered dict so writing then deleting the
-            # entry is equivalent to push/pop on a list
-            self.module_stack[_scope.module_path] = str(_scope.module_type)
-            if not self.is_leaf_module(m, module_qualified_name):
-                ret_val = forward(*args, **kwargs)
-            else:
-                ret_val = self.create_proxy("call_module", module_qualified_name, args, kwargs)
-            key, _ = self.module_stack.popitem(last=True)
-            assert key == _scope.module_path, f" Unexpected key {key}"
-
-        return ret_val
+        if not self.is_leaf_module(m, module_qualified_name):
+            return forward(*args, **kwargs)
+        return self.create_proxy("call_module", module_qualified_name, args, kwargs)
 
     @compatibility(is_backward_compatible=False)
     def getattr(self, attr: str, attr_val: Any, parameter_proxy_cache: Dict[str, Any]):
@@ -596,7 +580,7 @@ def replace_ph(x):
                 name,
                 default,
                 {},
-                type_expr=fn_for_analysis.__annotations__.get(name, None)
+                type_expr=fn_for_analysis.__annotations__.get(name, None),
             )
 
         arg_names = [next(names_iter) for idx in range(skip_arg_idx, total_args)]
@@ -679,7 +663,6 @@ def trace(
                 ), f"traced_func_name={self.traced_func_name} doesn't exist in {type(root).__name__}"
 
                 fn = getattr(type(root), self.traced_func_name)
-                self.root_module_name = root._get_name()
                 self.submodule_paths = {mod: name for name, mod in root.named_modules()}
             else:
                 self.root = torch.nn.Module()
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index c77967574b59a..6f9535b117370 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -1,83 +1,17 @@
 import dis
-import copy
 import torch
 import inspect
 import operator
 import traceback
-import collections
 
 from .graph import magic_methods, reflectable_magic_methods, Graph
-from typing import Tuple, Dict, OrderedDict, Optional, Iterable, Any, Iterator, Callable
+from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable
 from .node import Target, Node, Argument, base_types, map_aggregate
 from ._compatibility import compatibility
 from .operator_schemas import check_for_mutable_operation
 import torch.fx.traceback as fx_traceback
 
-__all__ = ['TracerBase', 'GraphAppendingTracer', 'TraceError',
-           'Proxy', 'Attribute', 'ParameterProxy', 'Scope',
-           'ScopeContextManager']
-
-
-@compatibility(is_backward_compatible=False)
-class Scope(object):
-    """ Scope object that records the module path and the module type
-    of a module. Scope is used to track the information of the module
-    that contains a Node in a Graph of GraphModule. For example::
-
-        class Sub(torch.nn.Module):
-            def forward(self, x):
-                # This will be a call_method Node in GraphModule,
-                # scope for this would be (module_path="sub", module_type=Sub)
-                return x.transpose(1, 2)
-
-        class M(torch.nn.Module):
-            def __init__(self):
-                self.sub = Sub()
-
-            def forward(self, x):
-                # This will be a call_method Node as well,
-                # scope for this would be (module_path="", None)
-                x = x.transpose(1, 2)
-                x = self.sub(x)
-                return x
-
-    """
-
-    def __init__(self, module_path: str, module_type: Any):
-        super().__init__()
-        self.module_path = module_path
-        self.module_type = module_type
-
-
-@compatibility(is_backward_compatible=False)
-class ScopeContextManager(object):
-    """ A context manager to track the Scope of Node during symbolic tracing.
-    When entering a forward function of a Module, we'll update the scope information of
-    the current module, and when we exit, we'll restore the previous scope information.
-    """
-
-    def __init__(
-        self,
-        scope: Scope,
-        current_scope: Scope,
-    ):
-        super().__init__()
-        # Keep a copy of prev scope to restore on exit
-        self._prev_scope = copy.copy(scope)
-        # Update scope to current scope
-        scope.module_path = current_scope.module_path
-        scope.module_type = current_scope.module_type
-        # Save a reference so we can restore it
-        self._scope = scope
-
-    def __enter__(self):
-        return self._scope
-
-    def __exit__(self, *args):
-        self._scope.module_path = self._prev_scope.module_path
-        self._scope.module_type = self._prev_scope.module_type
-        return
-
+__all__ = ['TracerBase', 'GraphAppendingTracer', 'TraceError', 'Proxy', 'Attribute', 'ParameterProxy']
 
 @compatibility(is_backward_compatible=True)
 class TracerBase:
@@ -95,15 +29,6 @@ class TracerBase:
     # ``root`` is an instance of ``nn.Module``
     traced_func_name: str = "forward"
 
-    # Maps the containing module's name to the operator name
-    scope : Scope
-
-    # Records the module call stack
-    module_stack: OrderedDict[str, str]
-
-    # Mapping of node name to module scope
-    node_name_to_scope: Dict[str, Tuple[str, type]]
-
     @compatibility(is_backward_compatible=True)
     def create_node(self, kind : str, target : Target,
                     args : Tuple[Argument, ...], kwargs : Dict[str, Argument], name : Optional[str] = None,
@@ -118,16 +43,7 @@ def create_node(self, kind : str, target : Target,
         if kind == 'call_function' and self.check_mutable_operations:
             check_for_mutable_operation(target, args, kwargs)
 
-        node = self.graph.create_node(kind, target, args, kwargs, name, type_expr)
-        # TODO node_name_to_scope will be depricated in favor of
-        # node.meta['nn_module_stack']
-        self.node_name_to_scope[node.name] = (
-            self.scope.module_path,
-            self.scope.module_type,
-        )
-        if self.module_stack:
-            node.meta['nn_module_stack'] = copy.copy(self.module_stack)
-        return node
+        return self.graph.create_node(kind, target, args, kwargs, name, type_expr)
 
     @compatibility(is_backward_compatible=True)
     def proxy(self, node: Node) -> 'Proxy':
@@ -291,9 +207,6 @@ class GraphAppendingTracer(TracerBase):
     def __init__(self, graph: Graph):
         super().__init__()
         self.graph = graph
-        self.scope = Scope("", None)
-        self.module_stack = collections.OrderedDict()
-        self.node_name_to_scope = {}
 
 @compatibility(is_backward_compatible=False)
 def assert_fn(x):
diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py
index 1f519e991ca64..aad3bc7253e4b 100644
--- a/torch/quantization/quantize_fx.py
+++ b/torch/quantization/quantize_fx.py
@@ -11,6 +11,8 @@
     _check_is_graph_module,
     _swap_ff_with_fxff,
     _fuse_fx,
+    Scope,
+    ScopeContextManager,
     QuantizationTracer,
     _prepare_fx,
     _prepare_standalone_module_fx,

From 5a59fe100bf5a5c039e90c86d2037ccd8c582892 Mon Sep 17 00:00:00 2001
From: erjia <erjia@fb.com>
Date: Fri, 2 Dec 2022 22:20:41 +0000
Subject: [PATCH 1553/1922] [1/4][DataPipe] Properly cleanup unclosed files
 within generator function (#89973)

There is a case that `file.close` is never called because when generator function has never reached to the end. A simple example would be `zip` two datepipes with different length. The longer DataPipe would never reach the end of generator and then it will be cleaned up by `gc`. So, the line of `file.close` is not executed. (This is the reason that Vitaly has to create this [hack](https://github.com/pytorch/pytorch/blob/4451eb24e6287dff62ff8a7ec0eda6a6998807b0/torch/utils/data/datapipes/iter/combining.py#L573-L583) to retrieve all remaining data to make sure generator function is fully executed)

However, this hack introduces another problem where an infinite datapipe would make `zip` never end as it would try to deplete the infinite iterator. See: https://github.com/pytorch/data/issues/865

So, in this PR, I am adding a `try-finally` clause to make sure the `file.close` is always executed during the destruction of `generator` object. Then, we don't need the hack within `zip` any more.

Differential Revision: [D41699470](https://our.internmc.facebook.com/intern/diff/D41699470)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89973
Approved by: https://github.com/NivekT
---
 torch/utils/data/datapipes/iter/combining.py | 138 ++++++++++++-------
 torch/utils/data/datapipes/utils/common.py   |   4 +-
 2 files changed, 90 insertions(+), 52 deletions(-)

diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index c874cedbde29c..4ce4c3b8c9fd5 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -143,57 +143,67 @@ def __init__(self, datapipe: IterDataPipe, num_instances: int, buffer_size: int
         self.slowest_ptr = 0  # The index to read by the slowest child
         self.leading_ptr = 0  # The index to read by the fastest child
         self.end_ptr: Optional[int] = None  # The index to stop child
+        self._child_stop: List[bool] = [True for _ in range(num_instances)]
 
     def __len__(self):
         return len(self.main_datapipe)
 
     def get_next_element_by_instance(self, instance_id: int):
-        if self._datapipe_iterator is None:
+        if self._datapipe_iterator is None and self._child_stop[instance_id]:
             self._datapipe_iterator = iter(self.main_datapipe)
             self._snapshot_state = _SnapshotState.Iterating
-        while self.end_ptr is None or self.child_pointers[instance_id] + 1 < self.end_ptr:
-            self.child_pointers[instance_id] += 1
-            # Use buffer
-            if self.buffer and self.child_pointers[instance_id] <= self.leading_ptr:
-                idx = self.child_pointers[instance_id] - self.slowest_ptr - 1
-                return_val = self.buffer[idx]
-            else:  # Retreive one element from main datapipe
-                self.leading_ptr = self.child_pointers[instance_id]
-                try:
-                    return_val = next(self._datapipe_iterator)
-                    self.buffer.append(return_val)
-                except StopIteration:
-                    self.end_ptr = self.leading_ptr
-                    continue
-            if self.child_pointers[instance_id] == self.slowest_ptr + 1:
-                new_min = min(self.child_pointers)  # Can optimize by avoiding the call to min()
-                if self.slowest_ptr < new_min:
-                    self.slowest_ptr = new_min
-                    self.buffer.popleft()
-            if self.buffer_size >= 0 and self.leading_ptr > self.buffer_size + self.slowest_ptr:
-                raise BufferError("ForkerIterDataPipe buffer overflow," +
-                                  f"buffer size {self.buffer_size} is insufficient.")
-            yield return_val
-
-        if all(p + 1 == self.end_ptr for p in self.child_pointers):
-            self._datapipe_iterator = None
+            for i in range(self.num_instances):
+                self._child_stop[i] = False
+        try:
+            while not self._child_stop[instance_id]:
+                self.child_pointers[instance_id] += 1
+                if self.end_ptr is not None and self.child_pointers[instance_id] == self.end_ptr:
+                    self._child_stop[instance_id] = True
+                    break
+                # Use buffer
+                if self.buffer and self.child_pointers[instance_id] <= self.leading_ptr:
+                    idx = self.child_pointers[instance_id] - self.slowest_ptr - 1
+                    return_val = self.buffer[idx]
+                else:  # Retreive one element from main datapipe
+                    self.leading_ptr = self.child_pointers[instance_id]
+                    try:
+                        return_val = next(self._datapipe_iterator)  # type: ignore[arg-type]
+                        self.buffer.append(return_val)
+                    except StopIteration:
+                        self._child_stop[instance_id] = True
+                        self._datapipe_iterator = None
+                        self.end_ptr = self.leading_ptr
+                        continue
+                if self.child_pointers[instance_id] == self.slowest_ptr + 1:
+                    new_min = min(self.child_pointers)  # Can optimize by avoiding the call to min()
+                    if self.slowest_ptr < new_min:
+                        self.slowest_ptr = new_min
+                        self.buffer.popleft()
+                if self.buffer_size >= 0 and self.leading_ptr > self.buffer_size + self.slowest_ptr:
+                    raise BufferError("ForkerIterDataPipe buffer overflow," +
+                                      f"buffer size {self.buffer_size} is insufficient.")
+                yield return_val
+        finally:
+            self._child_stop[instance_id] = True
+            # Cleanup _datapipe_iterator for the case that fork exits earlier
+            if all(self._child_stop):
+                self._datapipe_iterator = None
+                self._cleanup()
 
     def is_every_instance_exhausted(self) -> bool:
-        # Due to the implementation of `get_next_element_by_instance`, `self.end_ptr` will end up
-        # equaling to `len(main_datapipe) + 1`, hence the check for `self.end_ptr - 1 == ptr` below.
-        return self.end_ptr is not None and\
-            all(self.end_ptr == ptr or self.end_ptr - 1 == ptr for ptr in self.child_pointers)
+        return self.end_ptr is not None and all(self._child_stop)
 
     def get_length_by_instance(self, instance_id: int) -> int:
         return len(self.main_datapipe)
 
     def reset(self) -> None:
-        self._datapipe_iterator = iter(self.main_datapipe)
+        self._datapipe_iterator = None
         self.buffer = deque()
         self.child_pointers = [0] * self.num_instances
         self.slowest_ptr = 0
         self.leading_ptr = 0
         self.end_ptr = None
+        self._child_stop = [True for _ in range(self.num_instances)]
 
     def __getstate__(self):
         state = (
@@ -221,10 +231,15 @@ def __setstate__(self, state):
         self.slowest_ptr = 0
         self.leading_ptr = 0
         self.end_ptr = None
+        self._child_stop = [True for _ in range(self.num_instances)]
+
+    def _cleanup(self):
+        while self.buffer:
+            d = self.buffer.popleft()
+            StreamWrapper.close_streams(d)
 
     def __del__(self):
-        if self.buffer:
-            self.buffer.clear()
+        self._cleanup()
 
 
 class _ChildDataPipe(IterDataPipe):
@@ -375,10 +390,11 @@ def __init__(self, datapipe: IterDataPipe[T_co], num_instances: int,
         self.classifier_fn = classifier_fn
         self.drop_none = drop_none
         self.main_datapipe_exhausted = False
+        self._child_stop: List[bool] = [True for _ in range(num_instances)]
 
     def _find_next(self, instance_id: int) -> T_co:
         while True:
-            if self.main_datapipe_exhausted:
+            if self.main_datapipe_exhausted or self._child_stop[instance_id]:
                 raise StopIteration
             if self._datapipe_iterator is None:
                 raise ValueError(
@@ -401,24 +417,35 @@ def _find_next(self, instance_id: int) -> T_co:
                     f"DemultiplexerIterDataPipe buffer overflow, buffer size {self.buffer_size} is insufficient.")
 
     def get_next_element_by_instance(self, instance_id: int):
-        if self._datapipe_iterator is None and not self.main_datapipe_exhausted:
+        if self._datapipe_iterator is None and self._child_stop[instance_id]:
             self._datapipe_iterator = iter(self.main_datapipe)
             self._snapshot_state = _SnapshotState.Iterating  # This is necessary for the DataPipe to reset properly.
-        stop = False
-        while not stop:
+            self.main_datapipe_exhausted = False
+            for i in range(self.num_instances):
+                self._child_stop[i] = False
+
+        try:
+            while not self._child_stop[instance_id]:
+                if self.child_buffers[instance_id]:
+                    self.current_buffer_usage -= 1
+                    yield self.child_buffers[instance_id].popleft()
+                else:
+                    try:
+                        yield self._find_next(instance_id)
+                    except StopIteration:
+                        self._child_stop[instance_id] = True
+                        self.main_datapipe_exhausted = True
+                        self._datapipe_iterator = None
+        finally:
+            self._child_stop[instance_id] = True
+            # Cleanup _datapipe_iterator for the case that demux exits earlier
+            if all(self._child_stop):
+                self._datapipe_iterator = None
             if self.child_buffers[instance_id]:
-                self.current_buffer_usage -= 1
-                yield self.child_buffers[instance_id].popleft()
-            else:
-                try:
-                    yield self._find_next(instance_id)
-                except StopIteration:
-                    stop = True
-                    self.main_datapipe_exhausted = True
-                    self._datapipe_iterator = None
+                self._cleanup(instance_id)
 
     def is_every_instance_exhausted(self) -> bool:
-        return self.main_datapipe_exhausted and all(not child_buffer for child_buffer in self.child_buffers)
+        return self.main_datapipe_exhausted and all(self._child_stop)
 
     def get_length_by_instance(self, instance_id: int) -> int:
         raise TypeError
@@ -427,6 +454,7 @@ def reset(self) -> None:
         self._datapipe_iterator = None
         self.current_buffer_usage = 0
         self.child_buffers = [deque() for _ in range(self.num_instances)]
+        self._child_stop = [True for _ in range(self.num_instances)]
         self.main_datapipe_exhausted = False
 
     def __getstate__(self):
@@ -456,12 +484,20 @@ def __setstate__(self, state):
         self._datapipe_iterator = None
         self.current_buffer_usage = 0
         self.child_buffers = [deque() for _ in range(self.num_instances)]
+        self._child_stop = [True for _ in range(self.num_instances)]
         self.main_datapipe_exhausted = False
 
+    def _cleanup(self, instance_id: Optional[int] = None):
+        ids = range(self.num_instances) if instance_id is None else [instance_id, ]
+        for i in ids:
+            q = self.child_buffers[i]
+            while q:
+                d = q.popleft()
+                StreamWrapper.close_streams(d)
+
+
     def __del__(self):
-        if self.child_buffers:
-            for dq in self.child_buffers:
-                dq.clear()
+        self._cleanup()
 
 
 @functional_datapipe('mux')
diff --git a/torch/utils/data/datapipes/utils/common.py b/torch/utils/data/datapipes/utils/common.py
index 75d9a5cf173c4..d7038298b2fb2 100644
--- a/torch/utils/data/datapipes/utils/common.py
+++ b/torch/utils/data/datapipes/utils/common.py
@@ -328,6 +328,8 @@ def __getattr__(self, name):
         return getattr(file_obj, name)
 
     def close(self, *args, **kwargs):
+        if self.closed:
+            return
         if StreamWrapper.debug_unclosed_streams:
             del StreamWrapper.session_streams[self]
         if hasattr(self, "parent_stream") and self.parent_stream is not None:
@@ -345,9 +347,9 @@ def autoclose(self):
         Close steam if there is no children, or make it to be automatically closed as soon as
         all child streams are closed.
         """
+        self.close_on_last_child = True
         if self.child_counter == 0:
             self.close()
-        self.close_on_last_child = True
 
     def __dir__(self):
         attrs = list(self.__dict__.keys()) + list(StreamWrapper.__dict__.keys())

From 175f095ee5885469e4bf7d8aa869be158f30efdb Mon Sep 17 00:00:00 2001
From: Zheng Yan <zyan@fb.com>
Date: Sun, 4 Dec 2022 05:35:17 +0000
Subject: [PATCH 1554/1922] Revert D41682843: Multisect successfully blamed
 D41682843 for test or build failures (#90132)

Summary:
This diff is reverting D41682843
D41682843 has been identified to be causing the following test or build failures:
Tests affected:
- https://www.internalfb.com/intern/test/281475048939643/

Here's the Multisect link:
https://www.internalfb.com/intern/testinfra/multisect/1444954
Here are the tasks that are relevant to this breakage:
T93770103: 5 tests started failing for oncall assistant_multimodal in the last 2 weeks
We're generating a revert to back out the changes in this diff, please note the backout may land if someone accepts it.

Test Plan: NA

Reviewed By: zyan0, atuljangra, YazhiGao

Differential Revision: D41710749

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90132
Approved by: https://github.com/awgu
---
 torch/_dynamo/variables/builder.py      | 11 +++--------
 torch/distributed/fsdp/_dynamo_utils.py |  5 ++---
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 0503b20bb68e1..843e50687a61e 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -99,11 +99,6 @@
 from .user_defined import UserDefinedClassVariable, UserDefinedObjectVariable
 
 
-# Names of attributes used to annotate modules for FSDP + Dynamo
-_FSDP_MANAGED_MODULE = "_is_fsdp_managed_module"
-_FSDP_USE_ORIG_PARAMS = "_fsdp_use_orig_params"
-
-
 class _missing:
     pass
 
@@ -324,14 +319,14 @@ def index_source(key):
                 return self.tx.output.side_effects.track_object_existing(
                     self.source, value, result
                 )
-            elif getattr(value, _FSDP_MANAGED_MODULE, False) or issubclass(
+            elif getattr(value, "_is_fsdp_managed_module", False) or issubclass(
                 value.__class__, torch.nn.parallel.distributed.DistributedDataParallel
             ):
-                if getattr(value, _FSDP_MANAGED_MODULE, False):
+                if getattr(value, "_is_fsdp_managed_module", False):
                     # Note: we can't do this assert inside FSDP constructor,
                     # since we don't know yet whether dynamo will be used
                     assert getattr(
-                        value, _FSDP_USE_ORIG_PARAMS, False
+                        value, "_fsdp_use_orig_params", False
                     ), "Dynamo only supports FSDP with use_orig_params=True"
 
                 # See note [Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
diff --git a/torch/distributed/fsdp/_dynamo_utils.py b/torch/distributed/fsdp/_dynamo_utils.py
index dfaa803798a3f..3a6c63dc5af8b 100644
--- a/torch/distributed/fsdp/_dynamo_utils.py
+++ b/torch/distributed/fsdp/_dynamo_utils.py
@@ -1,7 +1,6 @@
 from typing import Set
 
 import torch.nn as nn
-from torch._dynamo.variables.builder import _FSDP_MANAGED_MODULE, _FSDP_USE_ORIG_PARAMS
 
 
 def _annotate_modules_for_dynamo(
@@ -37,10 +36,10 @@ def _annotate_modules_for_dynamo(
             order for backward to interleave hooks with compute per layer.  UnspecializedNNModule lets us achieve
             this by capturing the module code more 'functionally' and passing parameters in as inputs each time.
             """
-            setattr(submodule, _FSDP_MANAGED_MODULE, True)
+            submodule._is_fsdp_managed_module = True  # type: ignore[assignment]
 
             # Dynamo only supports FSDP with use_orig_params=True.
             # This is hacky, but I could not think of another way to add an assertion to dynamo
             # for this, since Dynamo skips all the FSDP code frames and thus can't inspect the
             # FSDP module directly
-            setattr(submodule, _FSDP_USE_ORIG_PARAMS, use_orig_params)
+            submodule._fsdp_use_orig_params = use_orig_params  # type: ignore[assignment]

From 1f851de36fd2eb1193e7dc9ab486f3b0bc1ff35a Mon Sep 17 00:00:00 2001
From: Ram Rachum <ram@rachum.com>
Date: Sun, 4 Dec 2022 06:51:25 +0000
Subject: [PATCH 1555/1922] Fix exception cause in storage.py (#90118)

This change causes the correct message to be shown between the two tracebacks when an error is shown.

More context here: https://blog.ram.rachum.com/post/621791438475296768/improving-python-exception-chaining-with
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90118
Approved by: https://github.com/kit1980
---
 torch/storage.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch/storage.py b/torch/storage.py
index ef523bd7b97e0..9cf61d626536a 100644
--- a/torch/storage.py
+++ b/torch/storage.py
@@ -738,8 +738,8 @@ def pickle_storage_type(self):
     def _pickle_storage_type(self):
         try:
             return _dtype_to_storage_type_map()[self.dtype]
-        except KeyError:
-            raise KeyError(f'dtype {self.dtype} is not recognized')
+        except KeyError as e:
+            raise KeyError(f'dtype {self.dtype} is not recognized') from e
 
     def __reduce__(self):
         b = io.BytesIO()
@@ -996,6 +996,6 @@ def _new_shared_filename(cls, manager, obj, size):
 def _get_dtype_from_pickle_storage_type(pickle_storage_type: str):
     try:
         return _storage_type_to_dtype_map()[pickle_storage_type]
-    except KeyError:
+    except KeyError as e:
         raise KeyError(
-            f'pickle storage type "{pickle_storage_type}" is not recognized')
+            f'pickle storage type "{pickle_storage_type}" is not recognized') from e

From e12085bf872c5e57fa25c635907fe01ad6a0858f Mon Sep 17 00:00:00 2001
From: Chung-chieh Shan <ccshan@indiana.edu>
Date: Sun, 4 Dec 2022 07:13:53 +0000
Subject: [PATCH 1556/1922] Fix indentation (#90110)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90110
Approved by: https://github.com/kit1980
---
 torch/_torch_docs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 75806a8cabf3f..98c1dd1cbb49e 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -946,7 +946,7 @@ def merge_dicts(*dicts):
     size (tuple or ints): the shape of the output tensor
     stride (tuple or ints): the stride of the output tensor
     storage_offset (int, optional): the offset in the underlying storage of the output tensor.
-    If ``None``, the storage_offset of the output tensor will match the input tensor.
+        If ``None``, the storage_offset of the output tensor will match the input tensor.
 
 Example::
 

From fb7c4cc09a0c3c58c1ee0512ec2d4c05041e116b Mon Sep 17 00:00:00 2001
From: David Boetius <cherrywoods@posteo.org>
Date: Sun, 4 Dec 2022 07:19:24 +0000
Subject: [PATCH 1557/1922] Add generator argument to torch.rand docstring
 (#90071)

The documentation of `torch.rand` was missing the `generator` keyword argument in the function signature. However, the argument is explained in the documentation and `torch.rand` accepts that argument.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90071
Approved by: https://github.com/janeyx99
---
 torch/_torch_docs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 98c1dd1cbb49e..8125d3e560151 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -8996,8 +8996,8 @@ def merge_dicts(*dicts):
 add_docstr(
     torch.rand,
     """
-rand(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, \
-pin_memory=False) -> Tensor
+rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, \
+requires_grad=False, pin_memory=False) -> Tensor
 """
     + r"""
 Returns a tensor filled with random numbers from a uniform distribution

From 3a4fbe02e92fd3be3e549d3aecd04cc647f2294c Mon Sep 17 00:00:00 2001
From: xiny <xiny@nvidia.com>
Date: Sun, 4 Dec 2022 07:22:54 +0000
Subject: [PATCH 1558/1922] [Doc][Distributed] Add missing functions to
 distributed.rst (#89905)

Add missing documents for `torch.distributed.all_to_all_single` and other functions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89905
Approved by: https://github.com/kit1980
---
 docs/source/distributed.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst
index c5cea6f47d23a..777e8f5a2085f 100644
--- a/docs/source/distributed.rst
+++ b/docs/source/distributed.rst
@@ -190,6 +190,8 @@ joined.
 
 .. autofunction:: is_nccl_available
 
+.. autofunction:: is_gloo_available
+
 .. autofunction:: is_torchelastic_launched
 
 --------------------------------------------------------------------------------
@@ -331,6 +333,12 @@ an opaque group handle that can be given as a ``group`` argument to all collecti
 
 .. autofunction:: new_group
 
+.. autofunction:: get_group_rank
+
+.. autofunction:: get_global_rank
+
+.. autofunction:: get_process_group_ranks
+
 Point-to-point communication
 ----------------------------
 
@@ -437,6 +445,8 @@ Collective functions
 
 .. autofunction:: reduce_scatter_tensor
 
+.. autofunction:: all_to_all_single
+
 .. autofunction:: all_to_all
 
 .. autofunction:: barrier

From 6dc63041a27f7902dbbab108130cd1add2bddd04 Mon Sep 17 00:00:00 2001
From: "Gao, Xiang" <qasdfgtyuiop@gmail.com>
Date: Sun, 4 Dec 2022 08:47:20 +0000
Subject: [PATCH 1559/1922] Fix warning: use of bitwise '&' with boolean
 operands (#90131)

```
[130/1102] Building CXX object caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cudnn/LossCTC.cpp.o
/home/gaoxiang/nvfuser5/aten/src/ATen/native/cudnn/LossCTC.cpp:97:11: warning: use of bitwise '&' with boolean operands [-Wbitwise-instead-of-logical]
          (target_lengths[b] < 256) & (target_lengths[b] <= input_lengths[b]);
          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                                    &&
/home/gaoxiang/nvfuser5/aten/src/ATen/native/cudnn/LossCTC.cpp:97:11: note: cast one or both operands to int to silence this warning
1 warning generated.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90131
Approved by: https://github.com/kit1980
---
 aten/src/ATen/native/cudnn/LossCTC.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp
index a741816424a7f..7737e91d44177 100644
--- a/aten/src/ATen/native/cudnn/LossCTC.cpp
+++ b/aten/src/ATen/native/cudnn/LossCTC.cpp
@@ -88,13 +88,13 @@ bool _use_cudnn_ctc_loss(
     // (they should, but we didn't check yet)
     int64_t max_input_length = log_probs.size(0);
     for (const auto input_length : input_lengths) {
-      use_cudnn &= ((input_length == max_input_length) ? 1 : 0);
+      use_cudnn = use_cudnn && ((input_length == max_input_length) ? 1 : 0);
     }
     for (const auto b : c10::irange(target_lengths.size())) {
       // target length < 256 is documented, but we see illegal memory accesses
       // when target lengths > input lengths for CuDNN
-      use_cudnn &=
-          (target_lengths[b] < 256) & (target_lengths[b] <= input_lengths[b]);
+      use_cudnn =
+          use_cudnn && (target_lengths[b] < 256) && (target_lengths[b] <= input_lengths[b]);
     }
   }
   return use_cudnn;

From c29ae10b4baddaf2368ab979b4273d1cb7a77a36 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Sun, 4 Dec 2022 12:50:14 +0000
Subject: [PATCH 1560/1922] [codemod][llvm15] LLVM-15 fixes for
 caffe2/test/cpp/jit/test_module_api.cpp (#89938)

Summary: This fixes issues which block `caffe2/test/cpp/jit/test_module_api.cpp` from compiling with LLVM-15.

Test Plan: Sandcastle

Reviewed By: meyering

Differential Revision: D41603454

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89938
Approved by: https://github.com/soumith
---
 test/cpp/jit/test_module_api.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cpp/jit/test_module_api.cpp b/test/cpp/jit/test_module_api.cpp
index adaad24203c95..f5535eb64c8ed 100644
--- a/test/cpp/jit/test_module_api.cpp
+++ b/test/cpp/jit/test_module_api.cpp
@@ -66,7 +66,7 @@ TEST(ModuleAPITest, MethodRunAsync) {
     mtx.lock();
     ++counter;
     mtx.unlock();
-    at::launch(move(f));
+    at::launch(std::move(f));
   };
 
   auto method = m.get_method("forward");

From 8eb968511ce0a00f6ba051cd82550e6280f2e401 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Sun, 4 Dec 2022 21:57:30 +0000
Subject: [PATCH 1561/1922] Revert "Use dynamo fake tensor mode in
 aot_autograd, move aot_autograd compilation to lowering time [Merger of 89672
 and 89773] (#90039)"

This reverts commit ef0c7ec958439caf44a98fb7b70d920c6c2264b9.

Reverted https://github.com/pytorch/pytorch/pull/90039 on behalf of https://github.com/clee2000 due to broke xla tests https://hud.pytorch.org/pytorch/pytorch/commit/ef0c7ec958439caf44a98fb7b70d920c6c2264b9 https://github.com/pytorch/pytorch/actions/runs/3606308473/jobs/6077646142
---
 test/dynamo/test_export.py                 | 27 +++++----
 test/dynamo/test_minifier.py               |  4 +-
 test/dynamo/test_modules.py                |  4 --
 test/dynamo/test_optimizations.py          | 32 ----------
 test/dynamo/test_verify_correctness.py     |  7 +--
 torch/_dynamo/debug_utils.py               |  4 --
 torch/_dynamo/optimizations/backends.py    | 14 +----
 torch/_dynamo/optimizations/distributed.py | 68 +++++-----------------
 torch/_dynamo/optimizations/log_args.py    | 11 ++--
 torch/_dynamo/output_graph.py              | 66 +++++----------------
 torch/_dynamo/test_minifier_common.py      |  2 +
 torch/_dynamo/utils.py                     | 33 +----------
 torch/_dynamo/variables/builder.py         | 54 +++++------------
 torch/_inductor/overrides.py               |  8 +--
 torch/fx/passes/shape_prop.py              | 29 +--------
 15 files changed, 76 insertions(+), 287 deletions(-)

diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index 7779c479b6d41..e6b505dea51e5 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -938,24 +938,23 @@ def func(x):
         torch._dynamo.reset()
 
         def compiler(gm, sample_inputs):
-            def fw(*args):
-                aten_gm = make_fx(gm)(*args)
-                return aten_gm(*args)
+            aten_gm = make_fx(gm)(*sample_inputs)
 
-            return fw
+            self.assertEqual(len(aten_gm.graph.nodes), len(out_graph.graph.nodes))
+            for node1, node2 in zip(aten_gm.graph.nodes, out_graph.graph.nodes):
+                self.assertEqual(node1.op, node2.op)
+                if node1.op == "call_function":
+                    self.assertEqual(node1.target, node2.target)
+                    self.assertEqual(len(node1.args), len(node2.args))
+                    for arg1, arg2 in zip(node1.args, node2.args):
+                        self.assertEqual(type(arg1), type(arg2))
 
-        opt_func = torch._dynamo.optimize(compiler, nopython=True)(func)
-        make_fx_result_through_backend = opt_func(inp)
+            return aten_gm.forward
 
-        fx_g = make_fx(func)(inp)
-        make_fx_result_through_direct = fx_g(inp)
+        opt_func = torch._dynamo.optimize(compiler, nopython=True)(func)
+        make_fx_result = opt_func(inp)
 
-        self.assertTrue(
-            torch._dynamo.utils.same(make_fx_result_through_backend, export_result)
-        )
-        self.assertTrue(
-            torch._dynamo.utils.same(make_fx_result_through_direct, export_result)
-        )
+        self.assertTrue(torch._dynamo.utils.same(make_fx_result, export_result))
 
     def test_export_with_constant_method_on_module(self):
         class MyModule(torch.nn.Module):
diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index d2f82f92510c5..c1a56f070be5d 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -191,10 +191,8 @@ def inner(x):
         """
         )
 
-        repro_after = "dynamo"
-        repro_level = 2
         test_code = self._gen_test_code(
-            run_code, repro_after, repro_level, RELU_CUSTOM_ERROR_BACKEND
+            run_code, "dynamo", 2, RELU_CUSTOM_ERROR_BACKEND
         )
         _, repro_dir = self._run_test_code(test_code)
         launch_proc, _ = self._run_minifier_launcher("", repro_dir)
diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index da3f1d3d59881..f510fb87522c5 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -1,6 +1,5 @@
 # Owner(s): ["module: dynamo"]
 
-import unittest
 from copy import deepcopy
 from unittest.mock import patch
 
@@ -763,9 +762,6 @@ def test_generation_tag(self):
         m3 = deepcopy(m1)
         self.assertEqual(GenerationTracker.get_generation_value(m3), cur_generation)
 
-    # torch._subclasses.fake_tensor.UnsupportedFakeTensorException: meta converter nyi
-    # due to custom subclass (TensorProxy)
-    @unittest.expectedFailure
     def test_simple_torch_function(self):
         def foo(x):
             # function call, twice to test wrapping
diff --git a/test/dynamo/test_optimizations.py b/test/dynamo/test_optimizations.py
index 5bff327786fa6..1f69a8fd79062 100644
--- a/test/dynamo/test_optimizations.py
+++ b/test/dynamo/test_optimizations.py
@@ -120,38 +120,6 @@ def compiler_fn(graph, example_inputs):
         opt_fn = torch._dynamo.optimize_assert(compiler_fn)(fn)
         r3 = opt_fn(a, (b, c), d)
 
-        self.assertIsNotNone(r1)
-        self.assertEqual(r1.size(), r2.size())
-        self.assertEqual(r1.stride(), r2.stride())
-        self.assertEqual(r1.dtype, r2.dtype)
-
-        self.assertEqual(r1.size(), r3.size())
-        self.assertEqual(r1.stride(), r3.stride())
-        self.assertEqual(r1.dtype, r3.dtype)
-
-    def test_example_inputs_runtime_use(self):
-        def fn(a, bc, d):
-            b, c = bc
-            return a / d - b / c
-
-        def compiler_fn(graph, example_inputs):
-            def fwd(*args):
-                nonlocal r1
-                r = graph.forward(*args)
-                r1 = r[0]
-                return r
-
-            return fwd
-
-        a = torch.empty(2).fill_(1)
-        b = torch.empty(2).fill_(2)
-        c = torch.empty(2).fill_(3)
-        d = 4
-        r1 = None
-        r2 = fn(a, (b, c), d)
-        opt_fn = torch._dynamo.optimize_assert(compiler_fn)(fn)
-        r3 = opt_fn(a, (b, c), d)
-
         self.assertIsNotNone(r1)
         self.assertTrue(same(r1, r2))
         self.assertTrue(same(r1, r3))
diff --git a/test/dynamo/test_verify_correctness.py b/test/dynamo/test_verify_correctness.py
index 7a6f8e3d42639..8e3624bfd9e7d 100644
--- a/test/dynamo/test_verify_correctness.py
+++ b/test/dynamo/test_verify_correctness.py
@@ -100,11 +100,8 @@ def compiler_fn(graph, example_inputs):
         r3 = opt_fn(a, (b, c), d)
 
         self.assertIsNotNone(r1)
-
-        self.assertEqual(r1.shape, r2.shape)
-        self.assertEqual(r1.shape, r3.shape)
-        self.assertEqual(r1.device, r2.device)
-        self.assertEqual(r1.device, r3.device)
+        self.assertTrue(same(r1, r2))
+        self.assertTrue(same(r1, r3))
 
     @patch.object(config, "verify_correctness", True)
     def test_nnc(self):
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index c39318b99d2c7..be729a25c3746 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -762,9 +762,6 @@ def backend_accuracy_fails(gm, example_inputs, compiler_fn, only_fwd=False):
 
 backend_aot_accuracy_fails = functools.partial(backend_accuracy_fails, only_fwd=True)
 
-# Please see NOTE: [Real Tensors in Accuracy Evaluation]
-MINIFIER_SPAWNED = False
-
 
 def backend_fails(gm, example_inputs, compiler_fn, orig_failure):
     """
@@ -835,7 +832,6 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
 mod = Repro()
 
 # Setup debug minifier compiler
-torch._dynamo.debug_utils.MINIFIER_SPAWNED = True
 compiler_fn = BACKENDS["{minifier_backend}"]
 {custom_compiler_error}
 dynamo_minifier_backend = functools.partial(
diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 007b02018f2a8..0df57eb4273d1 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -747,17 +747,9 @@ def torchxla_trivial(subgraph):
 def torchxla_trace_once(subgraph):
     import torch._dynamo.optimizations.torchxla_integration as integration
 
-    compiled_graph = None
-
-    def fwd(*args):
-        nonlocal compiled_graph
-        model = subgraph.model
-        if compiled_graph is None:
-            compiled_graph = integration.extract_compiled_graph(model, args)
-            del subgraph
-        return compiled_graph(*args)
-
-    return fwd
+    model = subgraph.model
+    example_inputs = subgraph.example_inputs
+    return integration.extract_compiled_graph(model, example_inputs)
 
 
 def ipex_fp32(gm: torch.fx.GraphModule, example_inputs):
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index 934a9abc674ed..b71d85c4e34f8 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -6,7 +6,6 @@
 import torch.fx.traceback as fx_traceback
 from torch import fx
 from torch.fx.node import Node
-from ..utils import deepcopy_to_fake_tensor, fake_mode_from_tensors
 
 log = logging.getLogger(__name__)
 
@@ -139,8 +138,6 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
         to compile each subgraph. Finally, stiches compiled graphs into one graphmodule
         and returns its callable.
         """
-        fake_mode = fake_mode_from_tensors(example_inputs)
-        assert fake_mode is not None
 
         # 1: compute the partition map according to DDP bucket logic
         buckets = [Bucket()]  # (size, param_names)
@@ -214,7 +211,7 @@ def __init__(self, module, compiler):
                 super().__init__(module)
                 self.compiler = compiler
 
-            def compile_submod(self, input_mod, args, kwargs):
+            def compile_submod(self, submod, args, kwargs):
                 """
                 Compile the submodule,
                 using a wrapper to make sure its output is always a tuple,
@@ -223,13 +220,13 @@ def compile_submod(self, input_mod, args, kwargs):
                 assert len(kwargs) == 0, "We assume only args for these modules"
 
                 class WrapperModule(torch.nn.Module):
-                    def __init__(self, submod, unwrap_singleton_tuple):
+                    def __init__(self, compiled_submod, unwrap_singleton_tuple):
                         super().__init__()
-                        self.submod = submod
+                        self.compiled_submod = compiled_submod
                         self.unwrap_singleton_tuple = unwrap_singleton_tuple
 
                     def forward(self, *args):
-                        x = self.submod(*args)
+                        x = self.compiled_submod(*args)
                         # TODO(whc)
                         # for some reason the isinstance check is necessary if I split one node per submod
                         # - even though I supposedly wrapped the output in a tuple in those cases, the real
@@ -239,52 +236,22 @@ def forward(self, *args):
                         return x
 
                 unwrap_singleton_tuple = False
-                for sn in input_mod.graph.nodes:
+                for sn in submod.graph.nodes:
                     if sn.op == "output":
                         if not isinstance(sn.args[0], tuple):
                             unwrap_singleton_tuple = True
                             sn.args = (sn.args,)
+                submod.recompile()
 
-                input_mod.recompile()
                 wrapper = WrapperModule(
-                    self.compiler(input_mod, args),
+                    self.compiler(submod, args),
                     unwrap_singleton_tuple,
                 )
                 return wrapper
 
-            # Note:
-            #
-            # The way distributed works today around fake tensors can be somehwat confusing.
-            # Some of these codepaths are shared in both runtime, and compile time. The presence
-            # of a fake_mode, read off of fake tensor inputs, dictates how we will operate.
-            #
-            # A few things to keep in mind:
-            #
-            # 1) We invoke `compile_submod` with a real module. The output of that gets stored
-            # on the graph via `self.module.add_submodule(n.target, compiled_submod_real)`.
-            #
-            # 2) When running a call_module targeted node, if we have a fake_mode, we fakify the
-            # module we got from self.fetch_attr(n.target). Regardless of fake_mode, we then execute it.
-            #
-            # 3) Fake tensors should always be around during compile time.
-            #
-            # 4) Fake tensors should never be around at runtime.
-            #
-            # 5) We end up with a compilation mode that takes a real submodule and fake tensors,
-            # to match what aot_autograd exepcts. See Note: [Fake Modules and AOTAutograd]
             def run_node(self, n: Node) -> Any:
                 with fx_traceback.append_stack_trace(n.stack_trace):
                     args, kwargs = self.fetch_args_kwargs_from_env(n)
-                    new_args = []
-                    assert fake_mode
-                    for arg in args:
-                        if isinstance(arg, torch.Tensor) and not isinstance(
-                            arg, torch._subclasses.FakeTensor
-                        ):
-                            new_args.append(fake_mode.from_tensor(arg))
-                        else:
-                            new_args.append(arg)
-
                     log.debug(f"run_node {n.op}, {n.target} got args {args_str(args)}")
                     assert isinstance(args, tuple)
                     assert isinstance(kwargs, dict)
@@ -292,28 +259,19 @@ def run_node(self, n: Node) -> Any:
                     # modify the currently running FX graph
                     # maybe this isn't sound in general, but only changing the target of a node might be ok?
                     if n.op == "call_module":
-                        real_mod = self.fetch_attr(n.target)
-                        if fake_mode:
-                            curr_submod = deepcopy_to_fake_tensor(real_mod, fake_mode)
-                        else:
-                            curr_submod = real_mod
-
-                        log.debug(
-                            f"\n---{n.target} graph---\n" + str(curr_submod.graph)
-                        )
-                        compiled_submod_real = self.compile_submod(
-                            real_mod, new_args, kwargs
-                        )
+                        submod = self.fetch_attr(n.target)
+                        log.debug(f"\n---{n.target} graph---\n" + str(submod.graph))
+                        compiled_submod = self.compile_submod(submod, args, kwargs)
                         self.module.delete_submodule(n.target)
                         n.target = "compiled_" + n.target
-                        self.module.add_submodule(n.target, compiled_submod_real)
-                        return curr_submod(*new_args, **kwargs)
+                        self.module.add_submodule(n.target, compiled_submod)
                     # then we execute the modified node using the usual logic
-                    return getattr(self, n.op)(n.target, new_args, kwargs)
+                    return getattr(self, n.op)(n.target, args, kwargs)
 
         submod_compiler = SubmodCompiler(split_gm, self.backend_compile_fn)
         submod_compiler.run(*example_inputs)
         split_gm.recompile()
 
         log.debug("\n---final graph---\n" + str(split_gm.graph) + "\n---------------\n")
+
         return split_gm
diff --git a/torch/_dynamo/optimizations/log_args.py b/torch/_dynamo/optimizations/log_args.py
index 111da69d4a8fe..caa0a9a83ce66 100644
--- a/torch/_dynamo/optimizations/log_args.py
+++ b/torch/_dynamo/optimizations/log_args.py
@@ -34,6 +34,7 @@ def run(self, *args):
 
     def run_node(self, n: torch.fx.Node):
         result = super().run_node(n)
+
         if n.op == "call_function":
             if n.target == aten.convolution.default:
                 args, kwargs = self.fetch_args_kwargs_from_env(n)
@@ -66,8 +67,8 @@ def run_node(self, n: torch.fx.Node):
 
 
 def conv_args_analysis(gm: torch.fx.GraphModule, example_inputs):
-    def conv_arg_inner(*args):
-        fx_g = make_fx(gm)(*args)
-        return ConvArgsAnalysis(fx_g).run(*args)
-
-    return conv_arg_inner
+    # lowering graph
+    gm = make_fx(gm)(*example_inputs)
+    # use Interpreter to logs the args of conv
+    ConvArgsAnalysis(gm).run(*example_inputs)
+    return gm
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 8929c1afd2fa1..3a0209c1511b6 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -88,10 +88,21 @@ def __repr__(self):
         return "FakeRootModule(...)"
 
 
+def wrap_compiler_fn(compiler_fn: CompilerFn) -> CompilerFn:
+    """WrapperBackend if config.verify_correctness is True"""
+    if config.verify_correctness:
+        # wrap backend if verify_correctness is True
+        wrapper_backend_compiler_fn = WrapperBackend(compiler_fn)
+
+        wrapper_backend_compiler_fn._torchdynamo_orig_callable = compiler_fn  # type: ignore[attr-defined]
+        return wrapper_backend_compiler_fn
+
+    return compiler_fn
+
+
 class WrapperBackend:
-    def __init__(self, backend: CompilerFn, original_example_inputs):
+    def __init__(self, backend: CompilerFn):
         self.backend: CompilerFn = backend
-        self.original_example_inputs = original_example_inputs
 
     @property
     def example_inputs(self):
@@ -100,6 +111,7 @@ def example_inputs(self):
     def __call__(self, gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
 
         self.restore = checkpoint_params(gm)
+        self.original_example_inputs = clone_inputs(example_inputs)
         self.gm = gm
         copy_gm = copy.deepcopy(self.gm)
         self.candidate = self.backend(copy_gm, self.original_example_inputs)
@@ -529,43 +541,9 @@ def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             )
             _step_logger()(logging.INFO, f"calling compiler function {name}")
             compiler_fn = self.compiler_fn
-            # WrapperBackend needs real inputs, for now, to verify correctness
             if config.verify_correctness:
-                compiler_fn = WrapperBackend(compiler_fn, self.example_inputs())
-
-            # NOTE: [Real Tensors in Accuracy Evaluation]
-            #
-            # Today, tensors are passed to backends as fake at compile time. See the .fake_example_inputs()
-            # call to compiler_fn below. At runtime, backends use real tensors.
-            #
-            # This should be a strong invariant we hold across all backends,
-            # and generally, it is. However, for accuracy evaluation, we need real tensors at compile time,
-            # for now, due to the unfortunate setup described below.
-            #
-            # Due to the nature of how we invoke comparison as a backend in two different ways:
-            #
-            # (1) Less bad, but still worth rewriting, WrapperBackend above, which takes
-            # real inputs for its ctor. see the config.verify_correctnes above.
-            #
-            # (2) More bad, and very worth rewriting, the minifier installs accuracy comparison as
-            # a true backend, and therefore needs to be compiled with real inputs. This is made trickier
-            # by the fact that the minifier will spawn new processes during minification. As such, we have
-            # created a global flag, MINIFIER_SPAWNED, that should be set IF AND ONLY IF this run was spawned
-            # as part of accuracy minification. This flag is not a contract, and ideally will not be here long.
-            #
-            # The longer term PoR is to:
-            # (A) Rewrite the minifier accuracy evaluation and verify_correctness code to share the same
-            # correctness and accuracy logic, so as not to have two different ways of doing the same thing.
-            #
-            # (B) Refactor minifier accuracy backend to do its comparison fully at runtime, so as not to need to
-            # pass real tensors to it at compile time.
-            is_top_level_minifying = (
-                config.repro_after is not None and config.repro_level == 4
-            )
-            if torch._dynamo.debug_utils.MINIFIER_SPAWNED or is_top_level_minifying:
-                compiled_fn = compiler_fn(gm, self.example_inputs())
-            else:
-                compiled_fn = compiler_fn(gm, self.fake_example_inputs())
+                compiler_fn = wrap_compiler_fn(compiler_fn)
+            compiled_fn = compiler_fn(gm, self.example_inputs())
             _step_logger()(logging.INFO, f"done compiler function {name}")
             assert callable(compiled_fn), "compiler_fn did not return callable"
         except Exception as e:
@@ -573,18 +551,6 @@ def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             raise BackendCompilerFailed(self.compiler_fn, e) from e
         return compiled_fn
 
-    def fake_example_inputs(self) -> List[torch.Tensor]:
-        result = []
-        for arg in self.graphargs:
-            example = arg.get_fake_examples()
-            if example is not None:
-                result.extend(example)
-            else:
-                # Fallback, in case fake_tensor was not set
-                # Particularly for graph args that are not tensors
-                result.extend(arg.get_examples())
-        return result
-
     def example_inputs(self) -> List[torch.Tensor]:
         result = []
         for arg in self.graphargs:
diff --git a/torch/_dynamo/test_minifier_common.py b/torch/_dynamo/test_minifier_common.py
index 947a45f2fcdf1..8fb0688f2c3ed 100644
--- a/torch/_dynamo/test_minifier_common.py
+++ b/torch/_dynamo/test_minifier_common.py
@@ -51,6 +51,7 @@ def _run_test_code(self, code):
         proc = subprocess.run(
             ["python3", "-c", code], capture_output=True, cwd=self.DEBUG_DIR
         )
+
         repro_dir_match = re.search(
             r"(\S+)minifier_launcher.py", proc.stderr.decode("utf-8")
         )
@@ -99,6 +100,7 @@ def _run_repro(self, patch_code, repro_dir):
         repro_proc = subprocess.run(
             ["python3", repro_file], capture_output=True, cwd=repro_dir
         )
+
         return repro_proc, repro_code
 
     # Template for testing code.
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 4f5ebc072fb5c..a6a139ef5760b 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -23,7 +23,7 @@
 import weakref
 from contextlib import contextmanager
 from functools import lru_cache
-from typing import Any, Dict, List
+from typing import Any, Dict
 
 import numpy as np
 import sympy
@@ -32,7 +32,7 @@
 from torch import fx
 from torch._dispatch.python import enable_python_dispatcher
 from torch.nn.modules.lazy import LazyModuleMixin
-from torch.utils._pytree import tree_flatten, tree_map
+from torch.utils._pytree import tree_map
 
 from . import config, logging as torchdynamo_logging
 
@@ -398,10 +398,6 @@ def clone_tensor(x):
 
 def clone_input(x):
     """copy while preserving strides"""
-    # TODO: this is questionable
-    if isinstance(x, torch._subclasses.FakeTensor):
-        # this func fails on fake tensors in __torch_dispatch__
-        return x
 
     def torch_clone(x):
         y = torch.clone(x)
@@ -758,10 +754,7 @@ def wrap_to_fake_tensor(e, fake_mode):
 
 
 def wrap_to_fake_tensor_and_record(e, tx):
-    # The not fake tensor check here is annoying - ideally, fake tensors never call this during wrapping.
-    # However, get_fake_value takes args and passes them through this, which may include fake tensors.
-    # see tree_map(fake_wrapper, args) in get_fake_value.
-    if isinstance(e, torch.Tensor) and not isinstance(e, torch._subclasses.FakeTensor):
+    if type(e) in (torch.Tensor, torch.nn.Parameter):
         static_shapes = config.dynamic_shapes is False
         if type(e) is torch.nn.Parameter:
             # Always static for params
@@ -824,9 +817,6 @@ def same(
                 return False
         return True
     elif isinstance(ref, torch.Tensor):
-        assert not isinstance(ref, torch._subclasses.FakeTensor)
-        assert not isinstance(res, torch._subclasses.FakeTensor)
-
         if ref.is_sparse:
             assert res.is_sparse
             ref = ref.to_dense()
@@ -1166,20 +1156,3 @@ def assert_no_fake_params_or_buffers(gm):
         assert not isinstance(
             param, torch._subclasses.FakeTensor
         ), f"Unexpected fake param {name}"
-
-
-def fake_mode_from_tensors(inputs: List[Any]):
-    """
-    Takes a list of anything, unflattened is fine, returns a fake_mode
-    if any are fake. All fake modes on all fake tensors must be identical.
-    Returns None if no fake_mode is fine
-    """
-    flat_inputs, _ = tree_flatten(inputs)
-    fake_mode = None
-    for flat_input in flat_inputs:
-        if isinstance(flat_input, torch._subclasses.FakeTensor):
-            if fake_mode is None:
-                fake_mode = flat_input.fake_mode
-            else:
-                assert fake_mode is flat_input.fake_mode
-    return fake_mode
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 843e50687a61e..43c2c91c4a553 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -9,7 +9,7 @@
 import re
 import types
 from abc import ABCMeta
-from typing import Any, Optional, Union
+from typing import Any, Union
 
 import numpy as np
 from functorch.experimental.ops import PyOperator
@@ -108,13 +108,8 @@ class GraphArg:
     source: Source
     example: Any
     is_unspecialized: bool
-    fake_tensor: Optional[torch._subclasses.fake_tensor.FakeTensor]
 
     def __post_init__(self):
-        if isinstance(self.example, torch.Tensor):
-            assert isinstance(
-                self.fake_tensor, torch._subclasses.fake_tensor.FakeTensor
-            )
         if isinstance(self.example, torch._subclasses.fake_tensor.FakeTensor):
             raise AssertionError("Fake Tensor observed in TorchDynamo Fx graph inputs")
 
@@ -124,13 +119,6 @@ def load(self, tx):
     def get_examples(self):
         return [self.example]
 
-    def get_fake_examples(self):
-        if self.fake_tensor is not None:
-            assert isinstance(
-                self.fake_tensor, torch._subclasses.fake_tensor.FakeTensor
-            )
-            return [self.fake_tensor]
-
     def __len__(self):
         return 1
 
@@ -535,9 +523,7 @@ def tensor_should_specialize(self):
 
     def wrap_sym(self, value: Union[torch.SymInt, torch.SymFloat]):
         if not is_constant_source(self.get_source()):
-            self.tx.output.graphargs.append(
-                GraphArg(self.get_source(), value, False, None)
-            )
+            self.tx.output.graphargs.append(GraphArg(self.get_source(), value, False))
         elif is_constant_source(self.get_source()):
             return self.tx.output.register_attr_or_module(
                 value,
@@ -565,6 +551,10 @@ def wrap_tensor(self, value: torch.Tensor):
                 # guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
             )
         else:
+            if not is_constant_source(self.get_source()):
+                self.tx.output.graphargs.append(
+                    GraphArg(self.get_source(), value, False)
+                )
             # Disable __torch_function__ to prevent cloning of `value` to hit
             # us
             with torch._C.DisableTorchFunction():
@@ -584,15 +574,6 @@ def wrap_tensor(self, value: torch.Tensor):
                     guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
                     should_specialize=self.tensor_should_specialize(),
                 )
-
-            fake_tensor_value = None
-            example_value = tensor_variable.proxy.node.meta["example_value"]
-            if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
-                fake_tensor_value = example_value
-
-            graph_arg = GraphArg(self.get_source(), value, False, fake_tensor_value)
-            self.tx.output.graphargs.append(graph_arg)
-
             if torch.overrides.has_torch_function_unary(value):
                 subclass_torch_function__func = value.__torch_function__.__func__
                 subclass_type = type(value)
@@ -617,6 +598,10 @@ def wrap_unspecialized_primitive(self, value):
             else:
                 # TODO: Eliminate this case entirely
                 wrapped_value = torch.tensor(value)
+            if not is_constant_source(self.get_source()):
+                self.tx.output.graphargs.append(
+                    GraphArg(self.get_source(), wrapped_value, True)
+                )
             if not isinstance(self.get_source(), RandomValueSource):
                 guards = {self.get_source().make_guard(GuardBuilder.TYPE_MATCH, True)}
                 options = {"guards": guards}
@@ -647,14 +632,6 @@ def wrap_unspecialized_primitive(self, value):
                     **options,
                 )
             self.tx.output.unspec_variable_map[self.name] = unspec_var
-            if not is_constant_source(self.get_source()):
-                fake_tensor_value = None
-                example_value = unspec_var.proxy.node.meta["example_value"]
-                if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
-                    fake_tensor_value = example_value
-                self.tx.output.graphargs.append(
-                    GraphArg(self.get_source(), wrapped_value, True, fake_tensor_value)
-                )
             return unspec_var
 
 
@@ -712,14 +689,11 @@ def _clone_input(value):
     with preserve_rng_state():
         if example_value is None:
             example_value = get_fake_value(proxy.node, tx)
+
         else:
-            # Note: Unfortunately, this can happen during tracing, and is valid enough for now to allow.
-            # TODO(voz): Find all the callsites and burn this down.
-            # Flipping it to an assert fails dozens of tests.
-            if not isinstance(example_value, torch._subclasses.FakeTensor):
-                proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
-                fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
-                example_value = fake_wrapper(example_value)
+            proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
+            fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
+            example_value = fake_wrapper(example_value)
 
     if isinstance(example_value, torch.Tensor):
         is_parameter = isinstance(example_value, torch.nn.Parameter)
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index bf66e68fed624..8d95971864f0f 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -11,7 +11,6 @@
 import torch
 import torch.nn as nn
 from torch import _prims
-from torch._dynamo.utils import fake_mode_from_tensors
 from torch.fx.experimental.optimization import (
     matches_module_pattern,
     replace_node_module,
@@ -531,12 +530,10 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
         example_input.device == torch.device("cpu") for example_input in example_inputs
     )
 
-    fake_mode = fake_mode_from_tensors(example_inputs)
-
     if config.permute_fusion and not is_cpu:
         # For linear permute fusion, we need to check input info to identify
         # and perform proper permutation/transpose
-        ShapeProp(gm, fake_mode=fake_mode).propagate(*example_inputs)
+        ShapeProp(gm).propagate(*example_inputs)
         gm = linear_permute_fusion(gm)
         gm = permute_linear_fusion(gm)
         gm = permute_matmul_fusion(gm)
@@ -552,8 +549,7 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     gm = fuse_conv_bn(gm)
     # For binary fusion, we need to check inputs info to make sure
     # the binary inputs have same tensor info(device, dtype, and layout).
-
-    ShapeProp(gm, fake_mode=fake_mode).propagate(*example_inputs)
+    ShapeProp(gm).propagate(*example_inputs)
     gm = fuse_unary(gm)
     gm = fuse_binary_inplace(gm)
     gm = fuse_binary(gm)
diff --git a/torch/fx/passes/shape_prop.py b/torch/fx/passes/shape_prop.py
index 4fd8ce8af9347..2be996f714ce8 100644
--- a/torch/fx/passes/shape_prop.py
+++ b/torch/fx/passes/shape_prop.py
@@ -109,38 +109,11 @@ def forward(self, x):
 
     Args:
          module (GraphModule): The module to be executed
-         fake_mode (FakeTensorMode): A fake mode for copying the gm
 
     """
-    def __init__(self, gm, fake_mode=None):
-        super().__init__(gm)
-        if fake_mode:
-            from torch._dynamo.utils import deepcopy_to_fake_tensor
-            # Note:
-            # We need fake execution cause the inputs are fake, however, we cannot fakify the module
-            # - because we need to write to the tensor_meta of the real module. So we fakify to
-            # produce a result (L130 below), to extract tensor meta, and then keep going.
-            #
-            # If we were to fakify, we would write to the wrong node, and then downstream fusion
-            # would be missing the tensor_meta.
-            #
-            # See torch/_inductor/overrides.py for where this is called upstream of fusion.
-            self.fake_module = deepcopy_to_fake_tensor(self.module, fake_mode)
-        else:
-            self.fake_module = None
-
-        self.real_module = self.module
-
     def run_node(self, n : Node) -> Any:
         try:
-            if self.fake_module is not None:
-                # Hacky swap. Alternatively, we could do this with overriding
-                # call_module and get_attr.
-                self.module = self.fake_module
-            try:
-                result = super().run_node(n)
-            finally:
-                self.module = self.real_module
+            result = super().run_node(n)
         except Exception:
             traceback.print_exc()
             raise RuntimeError(

From bbe16dc8508ff129febd84922fae869b341f2c23 Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Fri, 2 Dec 2022 18:27:07 +0000
Subject: [PATCH 1562/1922] Use dynamo fake tensor mode in aot_autograd, move
 aot_autograd compilation to lowering time [Merger of 89672 and 89773]
 (#90039)

After all of the preparatory commits, this is a subset of the
changes in https://github.com/pytorch/pytorch/pull/89392 that actually
change us to propagating fake tensors to backends.

Signed-off-by: Edward Z. Yang <ezyangfb.com>

This is the merger of Ed's PR #89672, which is a rewrite of an older PR of mine (#89392), with CI Fixes on top of it (#89773)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90039
Approved by: https://github.com/ezyang
---
 test/dynamo/test_export.py                 | 27 ++++-----
 test/dynamo/test_minifier.py               |  4 +-
 test/dynamo/test_modules.py                |  4 ++
 test/dynamo/test_optimizations.py          | 32 ++++++++++
 test/dynamo/test_verify_correctness.py     |  7 ++-
 torch/_dynamo/debug_utils.py               |  4 ++
 torch/_dynamo/optimizations/backends.py    | 14 ++++-
 torch/_dynamo/optimizations/distributed.py | 68 +++++++++++++++++-----
 torch/_dynamo/optimizations/log_args.py    | 11 ++--
 torch/_dynamo/output_graph.py              | 66 ++++++++++++++++-----
 torch/_dynamo/test_minifier_common.py      |  2 -
 torch/_dynamo/utils.py                     | 33 ++++++++++-
 torch/_dynamo/variables/builder.py         | 54 ++++++++++++-----
 torch/_inductor/overrides.py               |  8 ++-
 torch/fx/passes/shape_prop.py              | 29 ++++++++-
 15 files changed, 287 insertions(+), 76 deletions(-)

diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index e6b505dea51e5..7779c479b6d41 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -938,23 +938,24 @@ def func(x):
         torch._dynamo.reset()
 
         def compiler(gm, sample_inputs):
-            aten_gm = make_fx(gm)(*sample_inputs)
+            def fw(*args):
+                aten_gm = make_fx(gm)(*args)
+                return aten_gm(*args)
 
-            self.assertEqual(len(aten_gm.graph.nodes), len(out_graph.graph.nodes))
-            for node1, node2 in zip(aten_gm.graph.nodes, out_graph.graph.nodes):
-                self.assertEqual(node1.op, node2.op)
-                if node1.op == "call_function":
-                    self.assertEqual(node1.target, node2.target)
-                    self.assertEqual(len(node1.args), len(node2.args))
-                    for arg1, arg2 in zip(node1.args, node2.args):
-                        self.assertEqual(type(arg1), type(arg2))
-
-            return aten_gm.forward
+            return fw
 
         opt_func = torch._dynamo.optimize(compiler, nopython=True)(func)
-        make_fx_result = opt_func(inp)
+        make_fx_result_through_backend = opt_func(inp)
+
+        fx_g = make_fx(func)(inp)
+        make_fx_result_through_direct = fx_g(inp)
 
-        self.assertTrue(torch._dynamo.utils.same(make_fx_result, export_result))
+        self.assertTrue(
+            torch._dynamo.utils.same(make_fx_result_through_backend, export_result)
+        )
+        self.assertTrue(
+            torch._dynamo.utils.same(make_fx_result_through_direct, export_result)
+        )
 
     def test_export_with_constant_method_on_module(self):
         class MyModule(torch.nn.Module):
diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index c1a56f070be5d..d2f82f92510c5 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -191,8 +191,10 @@ def inner(x):
         """
         )
 
+        repro_after = "dynamo"
+        repro_level = 2
         test_code = self._gen_test_code(
-            run_code, "dynamo", 2, RELU_CUSTOM_ERROR_BACKEND
+            run_code, repro_after, repro_level, RELU_CUSTOM_ERROR_BACKEND
         )
         _, repro_dir = self._run_test_code(test_code)
         launch_proc, _ = self._run_minifier_launcher("", repro_dir)
diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index f510fb87522c5..da3f1d3d59881 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -1,5 +1,6 @@
 # Owner(s): ["module: dynamo"]
 
+import unittest
 from copy import deepcopy
 from unittest.mock import patch
 
@@ -762,6 +763,9 @@ def test_generation_tag(self):
         m3 = deepcopy(m1)
         self.assertEqual(GenerationTracker.get_generation_value(m3), cur_generation)
 
+    # torch._subclasses.fake_tensor.UnsupportedFakeTensorException: meta converter nyi
+    # due to custom subclass (TensorProxy)
+    @unittest.expectedFailure
     def test_simple_torch_function(self):
         def foo(x):
             # function call, twice to test wrapping
diff --git a/test/dynamo/test_optimizations.py b/test/dynamo/test_optimizations.py
index 1f69a8fd79062..5bff327786fa6 100644
--- a/test/dynamo/test_optimizations.py
+++ b/test/dynamo/test_optimizations.py
@@ -120,6 +120,38 @@ def compiler_fn(graph, example_inputs):
         opt_fn = torch._dynamo.optimize_assert(compiler_fn)(fn)
         r3 = opt_fn(a, (b, c), d)
 
+        self.assertIsNotNone(r1)
+        self.assertEqual(r1.size(), r2.size())
+        self.assertEqual(r1.stride(), r2.stride())
+        self.assertEqual(r1.dtype, r2.dtype)
+
+        self.assertEqual(r1.size(), r3.size())
+        self.assertEqual(r1.stride(), r3.stride())
+        self.assertEqual(r1.dtype, r3.dtype)
+
+    def test_example_inputs_runtime_use(self):
+        def fn(a, bc, d):
+            b, c = bc
+            return a / d - b / c
+
+        def compiler_fn(graph, example_inputs):
+            def fwd(*args):
+                nonlocal r1
+                r = graph.forward(*args)
+                r1 = r[0]
+                return r
+
+            return fwd
+
+        a = torch.empty(2).fill_(1)
+        b = torch.empty(2).fill_(2)
+        c = torch.empty(2).fill_(3)
+        d = 4
+        r1 = None
+        r2 = fn(a, (b, c), d)
+        opt_fn = torch._dynamo.optimize_assert(compiler_fn)(fn)
+        r3 = opt_fn(a, (b, c), d)
+
         self.assertIsNotNone(r1)
         self.assertTrue(same(r1, r2))
         self.assertTrue(same(r1, r3))
diff --git a/test/dynamo/test_verify_correctness.py b/test/dynamo/test_verify_correctness.py
index 8e3624bfd9e7d..7a6f8e3d42639 100644
--- a/test/dynamo/test_verify_correctness.py
+++ b/test/dynamo/test_verify_correctness.py
@@ -100,8 +100,11 @@ def compiler_fn(graph, example_inputs):
         r3 = opt_fn(a, (b, c), d)
 
         self.assertIsNotNone(r1)
-        self.assertTrue(same(r1, r2))
-        self.assertTrue(same(r1, r3))
+
+        self.assertEqual(r1.shape, r2.shape)
+        self.assertEqual(r1.shape, r3.shape)
+        self.assertEqual(r1.device, r2.device)
+        self.assertEqual(r1.device, r3.device)
 
     @patch.object(config, "verify_correctness", True)
     def test_nnc(self):
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index be729a25c3746..c39318b99d2c7 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -762,6 +762,9 @@ def backend_accuracy_fails(gm, example_inputs, compiler_fn, only_fwd=False):
 
 backend_aot_accuracy_fails = functools.partial(backend_accuracy_fails, only_fwd=True)
 
+# Please see NOTE: [Real Tensors in Accuracy Evaluation]
+MINIFIER_SPAWNED = False
+
 
 def backend_fails(gm, example_inputs, compiler_fn, orig_failure):
     """
@@ -832,6 +835,7 @@ def dump_to_minify_after_dynamo(gm, args, compiler_name):
 mod = Repro()
 
 # Setup debug minifier compiler
+torch._dynamo.debug_utils.MINIFIER_SPAWNED = True
 compiler_fn = BACKENDS["{minifier_backend}"]
 {custom_compiler_error}
 dynamo_minifier_backend = functools.partial(
diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 0df57eb4273d1..007b02018f2a8 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -747,9 +747,17 @@ def torchxla_trivial(subgraph):
 def torchxla_trace_once(subgraph):
     import torch._dynamo.optimizations.torchxla_integration as integration
 
-    model = subgraph.model
-    example_inputs = subgraph.example_inputs
-    return integration.extract_compiled_graph(model, example_inputs)
+    compiled_graph = None
+
+    def fwd(*args):
+        nonlocal compiled_graph
+        model = subgraph.model
+        if compiled_graph is None:
+            compiled_graph = integration.extract_compiled_graph(model, args)
+            del subgraph
+        return compiled_graph(*args)
+
+    return fwd
 
 
 def ipex_fp32(gm: torch.fx.GraphModule, example_inputs):
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index b71d85c4e34f8..934a9abc674ed 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -6,6 +6,7 @@
 import torch.fx.traceback as fx_traceback
 from torch import fx
 from torch.fx.node import Node
+from ..utils import deepcopy_to_fake_tensor, fake_mode_from_tensors
 
 log = logging.getLogger(__name__)
 
@@ -138,6 +139,8 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
         to compile each subgraph. Finally, stiches compiled graphs into one graphmodule
         and returns its callable.
         """
+        fake_mode = fake_mode_from_tensors(example_inputs)
+        assert fake_mode is not None
 
         # 1: compute the partition map according to DDP bucket logic
         buckets = [Bucket()]  # (size, param_names)
@@ -211,7 +214,7 @@ def __init__(self, module, compiler):
                 super().__init__(module)
                 self.compiler = compiler
 
-            def compile_submod(self, submod, args, kwargs):
+            def compile_submod(self, input_mod, args, kwargs):
                 """
                 Compile the submodule,
                 using a wrapper to make sure its output is always a tuple,
@@ -220,13 +223,13 @@ def compile_submod(self, submod, args, kwargs):
                 assert len(kwargs) == 0, "We assume only args for these modules"
 
                 class WrapperModule(torch.nn.Module):
-                    def __init__(self, compiled_submod, unwrap_singleton_tuple):
+                    def __init__(self, submod, unwrap_singleton_tuple):
                         super().__init__()
-                        self.compiled_submod = compiled_submod
+                        self.submod = submod
                         self.unwrap_singleton_tuple = unwrap_singleton_tuple
 
                     def forward(self, *args):
-                        x = self.compiled_submod(*args)
+                        x = self.submod(*args)
                         # TODO(whc)
                         # for some reason the isinstance check is necessary if I split one node per submod
                         # - even though I supposedly wrapped the output in a tuple in those cases, the real
@@ -236,22 +239,52 @@ def forward(self, *args):
                         return x
 
                 unwrap_singleton_tuple = False
-                for sn in submod.graph.nodes:
+                for sn in input_mod.graph.nodes:
                     if sn.op == "output":
                         if not isinstance(sn.args[0], tuple):
                             unwrap_singleton_tuple = True
                             sn.args = (sn.args,)
-                submod.recompile()
 
+                input_mod.recompile()
                 wrapper = WrapperModule(
-                    self.compiler(submod, args),
+                    self.compiler(input_mod, args),
                     unwrap_singleton_tuple,
                 )
                 return wrapper
 
+            # Note:
+            #
+            # The way distributed works today around fake tensors can be somehwat confusing.
+            # Some of these codepaths are shared in both runtime, and compile time. The presence
+            # of a fake_mode, read off of fake tensor inputs, dictates how we will operate.
+            #
+            # A few things to keep in mind:
+            #
+            # 1) We invoke `compile_submod` with a real module. The output of that gets stored
+            # on the graph via `self.module.add_submodule(n.target, compiled_submod_real)`.
+            #
+            # 2) When running a call_module targeted node, if we have a fake_mode, we fakify the
+            # module we got from self.fetch_attr(n.target). Regardless of fake_mode, we then execute it.
+            #
+            # 3) Fake tensors should always be around during compile time.
+            #
+            # 4) Fake tensors should never be around at runtime.
+            #
+            # 5) We end up with a compilation mode that takes a real submodule and fake tensors,
+            # to match what aot_autograd exepcts. See Note: [Fake Modules and AOTAutograd]
             def run_node(self, n: Node) -> Any:
                 with fx_traceback.append_stack_trace(n.stack_trace):
                     args, kwargs = self.fetch_args_kwargs_from_env(n)
+                    new_args = []
+                    assert fake_mode
+                    for arg in args:
+                        if isinstance(arg, torch.Tensor) and not isinstance(
+                            arg, torch._subclasses.FakeTensor
+                        ):
+                            new_args.append(fake_mode.from_tensor(arg))
+                        else:
+                            new_args.append(arg)
+
                     log.debug(f"run_node {n.op}, {n.target} got args {args_str(args)}")
                     assert isinstance(args, tuple)
                     assert isinstance(kwargs, dict)
@@ -259,19 +292,28 @@ def run_node(self, n: Node) -> Any:
                     # modify the currently running FX graph
                     # maybe this isn't sound in general, but only changing the target of a node might be ok?
                     if n.op == "call_module":
-                        submod = self.fetch_attr(n.target)
-                        log.debug(f"\n---{n.target} graph---\n" + str(submod.graph))
-                        compiled_submod = self.compile_submod(submod, args, kwargs)
+                        real_mod = self.fetch_attr(n.target)
+                        if fake_mode:
+                            curr_submod = deepcopy_to_fake_tensor(real_mod, fake_mode)
+                        else:
+                            curr_submod = real_mod
+
+                        log.debug(
+                            f"\n---{n.target} graph---\n" + str(curr_submod.graph)
+                        )
+                        compiled_submod_real = self.compile_submod(
+                            real_mod, new_args, kwargs
+                        )
                         self.module.delete_submodule(n.target)
                         n.target = "compiled_" + n.target
-                        self.module.add_submodule(n.target, compiled_submod)
+                        self.module.add_submodule(n.target, compiled_submod_real)
+                        return curr_submod(*new_args, **kwargs)
                     # then we execute the modified node using the usual logic
-                    return getattr(self, n.op)(n.target, args, kwargs)
+                    return getattr(self, n.op)(n.target, new_args, kwargs)
 
         submod_compiler = SubmodCompiler(split_gm, self.backend_compile_fn)
         submod_compiler.run(*example_inputs)
         split_gm.recompile()
 
         log.debug("\n---final graph---\n" + str(split_gm.graph) + "\n---------------\n")
-
         return split_gm
diff --git a/torch/_dynamo/optimizations/log_args.py b/torch/_dynamo/optimizations/log_args.py
index caa0a9a83ce66..111da69d4a8fe 100644
--- a/torch/_dynamo/optimizations/log_args.py
+++ b/torch/_dynamo/optimizations/log_args.py
@@ -34,7 +34,6 @@ def run(self, *args):
 
     def run_node(self, n: torch.fx.Node):
         result = super().run_node(n)
-
         if n.op == "call_function":
             if n.target == aten.convolution.default:
                 args, kwargs = self.fetch_args_kwargs_from_env(n)
@@ -67,8 +66,8 @@ def run_node(self, n: torch.fx.Node):
 
 
 def conv_args_analysis(gm: torch.fx.GraphModule, example_inputs):
-    # lowering graph
-    gm = make_fx(gm)(*example_inputs)
-    # use Interpreter to logs the args of conv
-    ConvArgsAnalysis(gm).run(*example_inputs)
-    return gm
+    def conv_arg_inner(*args):
+        fx_g = make_fx(gm)(*args)
+        return ConvArgsAnalysis(fx_g).run(*args)
+
+    return conv_arg_inner
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 3a0209c1511b6..8929c1afd2fa1 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -88,21 +88,10 @@ def __repr__(self):
         return "FakeRootModule(...)"
 
 
-def wrap_compiler_fn(compiler_fn: CompilerFn) -> CompilerFn:
-    """WrapperBackend if config.verify_correctness is True"""
-    if config.verify_correctness:
-        # wrap backend if verify_correctness is True
-        wrapper_backend_compiler_fn = WrapperBackend(compiler_fn)
-
-        wrapper_backend_compiler_fn._torchdynamo_orig_callable = compiler_fn  # type: ignore[attr-defined]
-        return wrapper_backend_compiler_fn
-
-    return compiler_fn
-
-
 class WrapperBackend:
-    def __init__(self, backend: CompilerFn):
+    def __init__(self, backend: CompilerFn, original_example_inputs):
         self.backend: CompilerFn = backend
+        self.original_example_inputs = original_example_inputs
 
     @property
     def example_inputs(self):
@@ -111,7 +100,6 @@ def example_inputs(self):
     def __call__(self, gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]):
 
         self.restore = checkpoint_params(gm)
-        self.original_example_inputs = clone_inputs(example_inputs)
         self.gm = gm
         copy_gm = copy.deepcopy(self.gm)
         self.candidate = self.backend(copy_gm, self.original_example_inputs)
@@ -541,9 +529,43 @@ def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             )
             _step_logger()(logging.INFO, f"calling compiler function {name}")
             compiler_fn = self.compiler_fn
+            # WrapperBackend needs real inputs, for now, to verify correctness
             if config.verify_correctness:
-                compiler_fn = wrap_compiler_fn(compiler_fn)
-            compiled_fn = compiler_fn(gm, self.example_inputs())
+                compiler_fn = WrapperBackend(compiler_fn, self.example_inputs())
+
+            # NOTE: [Real Tensors in Accuracy Evaluation]
+            #
+            # Today, tensors are passed to backends as fake at compile time. See the .fake_example_inputs()
+            # call to compiler_fn below. At runtime, backends use real tensors.
+            #
+            # This should be a strong invariant we hold across all backends,
+            # and generally, it is. However, for accuracy evaluation, we need real tensors at compile time,
+            # for now, due to the unfortunate setup described below.
+            #
+            # Due to the nature of how we invoke comparison as a backend in two different ways:
+            #
+            # (1) Less bad, but still worth rewriting, WrapperBackend above, which takes
+            # real inputs for its ctor. see the config.verify_correctnes above.
+            #
+            # (2) More bad, and very worth rewriting, the minifier installs accuracy comparison as
+            # a true backend, and therefore needs to be compiled with real inputs. This is made trickier
+            # by the fact that the minifier will spawn new processes during minification. As such, we have
+            # created a global flag, MINIFIER_SPAWNED, that should be set IF AND ONLY IF this run was spawned
+            # as part of accuracy minification. This flag is not a contract, and ideally will not be here long.
+            #
+            # The longer term PoR is to:
+            # (A) Rewrite the minifier accuracy evaluation and verify_correctness code to share the same
+            # correctness and accuracy logic, so as not to have two different ways of doing the same thing.
+            #
+            # (B) Refactor minifier accuracy backend to do its comparison fully at runtime, so as not to need to
+            # pass real tensors to it at compile time.
+            is_top_level_minifying = (
+                config.repro_after is not None and config.repro_level == 4
+            )
+            if torch._dynamo.debug_utils.MINIFIER_SPAWNED or is_top_level_minifying:
+                compiled_fn = compiler_fn(gm, self.example_inputs())
+            else:
+                compiled_fn = compiler_fn(gm, self.fake_example_inputs())
             _step_logger()(logging.INFO, f"done compiler function {name}")
             assert callable(compiled_fn), "compiler_fn did not return callable"
         except Exception as e:
@@ -551,6 +573,18 @@ def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             raise BackendCompilerFailed(self.compiler_fn, e) from e
         return compiled_fn
 
+    def fake_example_inputs(self) -> List[torch.Tensor]:
+        result = []
+        for arg in self.graphargs:
+            example = arg.get_fake_examples()
+            if example is not None:
+                result.extend(example)
+            else:
+                # Fallback, in case fake_tensor was not set
+                # Particularly for graph args that are not tensors
+                result.extend(arg.get_examples())
+        return result
+
     def example_inputs(self) -> List[torch.Tensor]:
         result = []
         for arg in self.graphargs:
diff --git a/torch/_dynamo/test_minifier_common.py b/torch/_dynamo/test_minifier_common.py
index 8fb0688f2c3ed..947a45f2fcdf1 100644
--- a/torch/_dynamo/test_minifier_common.py
+++ b/torch/_dynamo/test_minifier_common.py
@@ -51,7 +51,6 @@ def _run_test_code(self, code):
         proc = subprocess.run(
             ["python3", "-c", code], capture_output=True, cwd=self.DEBUG_DIR
         )
-
         repro_dir_match = re.search(
             r"(\S+)minifier_launcher.py", proc.stderr.decode("utf-8")
         )
@@ -100,7 +99,6 @@ def _run_repro(self, patch_code, repro_dir):
         repro_proc = subprocess.run(
             ["python3", repro_file], capture_output=True, cwd=repro_dir
         )
-
         return repro_proc, repro_code
 
     # Template for testing code.
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index a6a139ef5760b..4f5ebc072fb5c 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -23,7 +23,7 @@
 import weakref
 from contextlib import contextmanager
 from functools import lru_cache
-from typing import Any, Dict
+from typing import Any, Dict, List
 
 import numpy as np
 import sympy
@@ -32,7 +32,7 @@
 from torch import fx
 from torch._dispatch.python import enable_python_dispatcher
 from torch.nn.modules.lazy import LazyModuleMixin
-from torch.utils._pytree import tree_map
+from torch.utils._pytree import tree_flatten, tree_map
 
 from . import config, logging as torchdynamo_logging
 
@@ -398,6 +398,10 @@ def clone_tensor(x):
 
 def clone_input(x):
     """copy while preserving strides"""
+    # TODO: this is questionable
+    if isinstance(x, torch._subclasses.FakeTensor):
+        # this func fails on fake tensors in __torch_dispatch__
+        return x
 
     def torch_clone(x):
         y = torch.clone(x)
@@ -754,7 +758,10 @@ def wrap_to_fake_tensor(e, fake_mode):
 
 
 def wrap_to_fake_tensor_and_record(e, tx):
-    if type(e) in (torch.Tensor, torch.nn.Parameter):
+    # The not fake tensor check here is annoying - ideally, fake tensors never call this during wrapping.
+    # However, get_fake_value takes args and passes them through this, which may include fake tensors.
+    # see tree_map(fake_wrapper, args) in get_fake_value.
+    if isinstance(e, torch.Tensor) and not isinstance(e, torch._subclasses.FakeTensor):
         static_shapes = config.dynamic_shapes is False
         if type(e) is torch.nn.Parameter:
             # Always static for params
@@ -817,6 +824,9 @@ def same(
                 return False
         return True
     elif isinstance(ref, torch.Tensor):
+        assert not isinstance(ref, torch._subclasses.FakeTensor)
+        assert not isinstance(res, torch._subclasses.FakeTensor)
+
         if ref.is_sparse:
             assert res.is_sparse
             ref = ref.to_dense()
@@ -1156,3 +1166,20 @@ def assert_no_fake_params_or_buffers(gm):
         assert not isinstance(
             param, torch._subclasses.FakeTensor
         ), f"Unexpected fake param {name}"
+
+
+def fake_mode_from_tensors(inputs: List[Any]):
+    """
+    Takes a list of anything, unflattened is fine, returns a fake_mode
+    if any are fake. All fake modes on all fake tensors must be identical.
+    Returns None if no fake_mode is fine
+    """
+    flat_inputs, _ = tree_flatten(inputs)
+    fake_mode = None
+    for flat_input in flat_inputs:
+        if isinstance(flat_input, torch._subclasses.FakeTensor):
+            if fake_mode is None:
+                fake_mode = flat_input.fake_mode
+            else:
+                assert fake_mode is flat_input.fake_mode
+    return fake_mode
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 43c2c91c4a553..843e50687a61e 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -9,7 +9,7 @@
 import re
 import types
 from abc import ABCMeta
-from typing import Any, Union
+from typing import Any, Optional, Union
 
 import numpy as np
 from functorch.experimental.ops import PyOperator
@@ -108,8 +108,13 @@ class GraphArg:
     source: Source
     example: Any
     is_unspecialized: bool
+    fake_tensor: Optional[torch._subclasses.fake_tensor.FakeTensor]
 
     def __post_init__(self):
+        if isinstance(self.example, torch.Tensor):
+            assert isinstance(
+                self.fake_tensor, torch._subclasses.fake_tensor.FakeTensor
+            )
         if isinstance(self.example, torch._subclasses.fake_tensor.FakeTensor):
             raise AssertionError("Fake Tensor observed in TorchDynamo Fx graph inputs")
 
@@ -119,6 +124,13 @@ def load(self, tx):
     def get_examples(self):
         return [self.example]
 
+    def get_fake_examples(self):
+        if self.fake_tensor is not None:
+            assert isinstance(
+                self.fake_tensor, torch._subclasses.fake_tensor.FakeTensor
+            )
+            return [self.fake_tensor]
+
     def __len__(self):
         return 1
 
@@ -523,7 +535,9 @@ def tensor_should_specialize(self):
 
     def wrap_sym(self, value: Union[torch.SymInt, torch.SymFloat]):
         if not is_constant_source(self.get_source()):
-            self.tx.output.graphargs.append(GraphArg(self.get_source(), value, False))
+            self.tx.output.graphargs.append(
+                GraphArg(self.get_source(), value, False, None)
+            )
         elif is_constant_source(self.get_source()):
             return self.tx.output.register_attr_or_module(
                 value,
@@ -551,10 +565,6 @@ def wrap_tensor(self, value: torch.Tensor):
                 # guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
             )
         else:
-            if not is_constant_source(self.get_source()):
-                self.tx.output.graphargs.append(
-                    GraphArg(self.get_source(), value, False)
-                )
             # Disable __torch_function__ to prevent cloning of `value` to hit
             # us
             with torch._C.DisableTorchFunction():
@@ -574,6 +584,15 @@ def wrap_tensor(self, value: torch.Tensor):
                     guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
                     should_specialize=self.tensor_should_specialize(),
                 )
+
+            fake_tensor_value = None
+            example_value = tensor_variable.proxy.node.meta["example_value"]
+            if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
+                fake_tensor_value = example_value
+
+            graph_arg = GraphArg(self.get_source(), value, False, fake_tensor_value)
+            self.tx.output.graphargs.append(graph_arg)
+
             if torch.overrides.has_torch_function_unary(value):
                 subclass_torch_function__func = value.__torch_function__.__func__
                 subclass_type = type(value)
@@ -598,10 +617,6 @@ def wrap_unspecialized_primitive(self, value):
             else:
                 # TODO: Eliminate this case entirely
                 wrapped_value = torch.tensor(value)
-            if not is_constant_source(self.get_source()):
-                self.tx.output.graphargs.append(
-                    GraphArg(self.get_source(), wrapped_value, True)
-                )
             if not isinstance(self.get_source(), RandomValueSource):
                 guards = {self.get_source().make_guard(GuardBuilder.TYPE_MATCH, True)}
                 options = {"guards": guards}
@@ -632,6 +647,14 @@ def wrap_unspecialized_primitive(self, value):
                     **options,
                 )
             self.tx.output.unspec_variable_map[self.name] = unspec_var
+            if not is_constant_source(self.get_source()):
+                fake_tensor_value = None
+                example_value = unspec_var.proxy.node.meta["example_value"]
+                if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
+                    fake_tensor_value = example_value
+                self.tx.output.graphargs.append(
+                    GraphArg(self.get_source(), wrapped_value, True, fake_tensor_value)
+                )
             return unspec_var
 
 
@@ -689,11 +712,14 @@ def _clone_input(value):
     with preserve_rng_state():
         if example_value is None:
             example_value = get_fake_value(proxy.node, tx)
-
         else:
-            proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
-            fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
-            example_value = fake_wrapper(example_value)
+            # Note: Unfortunately, this can happen during tracing, and is valid enough for now to allow.
+            # TODO(voz): Find all the callsites and burn this down.
+            # Flipping it to an assert fails dozens of tests.
+            if not isinstance(example_value, torch._subclasses.FakeTensor):
+                proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
+                fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
+                example_value = fake_wrapper(example_value)
 
     if isinstance(example_value, torch.Tensor):
         is_parameter = isinstance(example_value, torch.nn.Parameter)
diff --git a/torch/_inductor/overrides.py b/torch/_inductor/overrides.py
index 8d95971864f0f..bf66e68fed624 100644
--- a/torch/_inductor/overrides.py
+++ b/torch/_inductor/overrides.py
@@ -11,6 +11,7 @@
 import torch
 import torch.nn as nn
 from torch import _prims
+from torch._dynamo.utils import fake_mode_from_tensors
 from torch.fx.experimental.optimization import (
     matches_module_pattern,
     replace_node_module,
@@ -530,10 +531,12 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
         example_input.device == torch.device("cpu") for example_input in example_inputs
     )
 
+    fake_mode = fake_mode_from_tensors(example_inputs)
+
     if config.permute_fusion and not is_cpu:
         # For linear permute fusion, we need to check input info to identify
         # and perform proper permutation/transpose
-        ShapeProp(gm).propagate(*example_inputs)
+        ShapeProp(gm, fake_mode=fake_mode).propagate(*example_inputs)
         gm = linear_permute_fusion(gm)
         gm = permute_linear_fusion(gm)
         gm = permute_matmul_fusion(gm)
@@ -549,7 +552,8 @@ def fuse_fx(gm: torch.fx.GraphModule, example_inputs):
     gm = fuse_conv_bn(gm)
     # For binary fusion, we need to check inputs info to make sure
     # the binary inputs have same tensor info(device, dtype, and layout).
-    ShapeProp(gm).propagate(*example_inputs)
+
+    ShapeProp(gm, fake_mode=fake_mode).propagate(*example_inputs)
     gm = fuse_unary(gm)
     gm = fuse_binary_inplace(gm)
     gm = fuse_binary(gm)
diff --git a/torch/fx/passes/shape_prop.py b/torch/fx/passes/shape_prop.py
index 2be996f714ce8..4fd8ce8af9347 100644
--- a/torch/fx/passes/shape_prop.py
+++ b/torch/fx/passes/shape_prop.py
@@ -109,11 +109,38 @@ def forward(self, x):
 
     Args:
          module (GraphModule): The module to be executed
+         fake_mode (FakeTensorMode): A fake mode for copying the gm
 
     """
+    def __init__(self, gm, fake_mode=None):
+        super().__init__(gm)
+        if fake_mode:
+            from torch._dynamo.utils import deepcopy_to_fake_tensor
+            # Note:
+            # We need fake execution cause the inputs are fake, however, we cannot fakify the module
+            # - because we need to write to the tensor_meta of the real module. So we fakify to
+            # produce a result (L130 below), to extract tensor meta, and then keep going.
+            #
+            # If we were to fakify, we would write to the wrong node, and then downstream fusion
+            # would be missing the tensor_meta.
+            #
+            # See torch/_inductor/overrides.py for where this is called upstream of fusion.
+            self.fake_module = deepcopy_to_fake_tensor(self.module, fake_mode)
+        else:
+            self.fake_module = None
+
+        self.real_module = self.module
+
     def run_node(self, n : Node) -> Any:
         try:
-            result = super().run_node(n)
+            if self.fake_module is not None:
+                # Hacky swap. Alternatively, we could do this with overriding
+                # call_module and get_attr.
+                self.module = self.fake_module
+            try:
+                result = super().run_node(n)
+            finally:
+                self.module = self.real_module
         except Exception:
             traceback.print_exc()
             raise RuntimeError(

From 4d76132bbd25b9c256c98dc8d8292e65944a0f5c Mon Sep 17 00:00:00 2001
From: Nikita Karetnikov <nikita@karetnikov.org>
Date: Sun, 4 Dec 2022 23:33:13 +0100
Subject: [PATCH 1563/1922] [Inductor] handle non-positive exponents in `Pow`
 (#90146)

Fixes #90125.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90146
Approved by: https://github.com/ezyang, https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 28 +++++++++++++++++++++++++++-
 torch/_inductor/codegen/common.py   |  7 ++++++-
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 12f3bf2120e94..4e1c60f40051e 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -42,7 +42,8 @@
     from functorch.compile import config as functorch_config
     from torch._decomp import get_decompositions
     from torch._inductor import codecache, config, metrics
-    from torch._inductor.codegen.cpp import CppOverrides, CppVecOverrides
+    from torch._inductor.codegen.cpp import cexpr, CppOverrides, CppVecOverrides
+    from torch._inductor.codegen.triton import texpr
     from torch._inductor.compile_fx import compile_fx, complex_memory_overlap
     from torch._inductor.ir import IndexingDiv, ModularIndexing
     from torch._inductor.overrides import (
@@ -5796,6 +5797,31 @@ def fn(a: torch.Tensor) -> torch.Tensor:
             torch._dynamo.reset()
 
 
+class ExprPrinterTests(TestCase):
+    def test_print_pow(self):
+        s1 = sympy.Symbol("foo", integer=True)
+        s2 = sympy.Symbol("bar", integer=True)
+        s3 = sympy.Symbol("baz", integer=True)
+
+        cases = (
+            # expr, result
+            # Test exprs.
+            (
+                s1 / (2 * s1 - 1) - 1 / (2 * s1 - 1),
+                "((-1)*(1/(((-1) + (2*foo))))) + (foo*(1/(((-1) + (2*foo)))))",
+            ),
+            (s1 / (s2 - s3), "foo*(1/((bar + ((-1)*baz))))"),
+            # Test Pow directly.
+            (sympy.Pow(s1 + s2, 0), "1"),  # note: simplified before _print_Pow
+            (sympy.Pow(s1 + s2, -3), "1/((bar + foo)*(bar + foo)*(bar + foo))"),
+            (sympy.Pow(s1 + s2, 2), "(bar + foo)*(bar + foo)"),
+        )
+
+        for expr, result in cases:
+            self.assertEqual(cexpr(expr), result)
+            self.assertEqual(texpr(expr), result)
+
+
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
 
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index b1e710c0ef91e..2b997ff745028 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -48,7 +48,12 @@ def _print_Pow(self, expr):
         base = self._print(base)
         assert exp.is_integer
         exp = int(exp)
-        return "*".join([self.paren(base)] * exp)
+        if exp > 0:
+            return "*".join([self.paren(base)] * exp)
+        elif exp < 0:
+            return "1/" + self.paren("*".join([self.paren(base)] * abs(exp)))
+        else:  # exp == 0
+            return "1"
 
     def _print_Mul(self, expr):
         return "*".join(map(self.paren, map(self._print, expr.args)))

From 1c9ea05ab5ed83c6fa932487d418a74badb84e4b Mon Sep 17 00:00:00 2001
From: vfdev-5 <vfdev.5@gmail.com>
Date: Mon, 5 Dec 2022 12:23:25 +0000
Subject: [PATCH 1564/1922] Added vectorized flip for uint8 (#90013)

Following https://github.com/pytorch/pytorch/pull/89414#discussion_r1036224613 just refactoring and adding `flip` method for `Vectorized<uint8>`. This should speed up torch.flip horizontal implementation similarly to what is reported in https://github.com/pytorch/pytorch/pull/89414

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90013
Approved by: https://github.com/peterbell10, https://github.com/lezcano
---
 aten/src/ATen/cpu/vec/vec256/vec256.h | 12 ++++++++++--
 aten/src/ATen/cpu/vec/vec512/vec512.h | 13 +++++++++++--
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h
index d0a8cb03604a9..f9c8794560be7 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
@@ -256,8 +256,7 @@ inline Vectorized<int16_t> flip(const Vectorized<int16_t> & v) {
   return _mm256_permute2x128_si256(reversed, reversed, 1);
 }
 
-template<>
-inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+inline __m256i flip8(const __m256i & v) {
   const __m256i mask_int8 = _mm256_set_epi8(
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -266,6 +265,15 @@ inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
   return _mm256_permute2x128_si256(reversed, reversed, 1);
 }
 
+template<>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  return flip8(v);
+}
+
+template<>
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+  return flip8(v);
+}
 
 #endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
 
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h
index dd1235e82eced..8656756aaed56 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
@@ -227,8 +227,7 @@ inline Vectorized<int16_t> flip(const Vectorized<int16_t> & v) {
   return _mm512_permutexvar_epi16(mask, v);
 }
 
-template<>
-inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+inline __m512i flip8(const __m512i & v) {
   const __m512i mask1 = _mm512_set_epi8(
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
@@ -240,6 +239,16 @@ inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
   return _mm512_permutexvar_epi64(mask2, reversed_vec);
 }
 
+template<>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  return flip8(v);
+}
+
+template<>
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+  return flip8(v);
+}
+
 #endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
 
 }}}

From 5619a163ad394a2c615f073fa0ce43cf1db07e06 Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Mon, 5 Dec 2022 03:26:46 +0000
Subject: [PATCH 1565/1922] Fix fully_shard error when policy is not provided
 (#90151)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90151
Approved by: https://github.com/awgu
---
 .../distributed/_composable/test_fully_shard.py | 14 ++++++++++----
 torch/distributed/fsdp/_wrap_utils.py           | 17 +++++++++--------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/test/distributed/_composable/test_fully_shard.py b/test/distributed/_composable/test_fully_shard.py
index 8f1ba437bf964..1c4c5901ac979 100644
--- a/test/distributed/_composable/test_fully_shard.py
+++ b/test/distributed/_composable/test_fully_shard.py
@@ -73,20 +73,19 @@ class TestFSDPInitialization(FSDPTest):
     def world_size(self) -> int:
         return 2
 
-    @skip_if_lt_x_gpu(2)
-    def test_auto_wrap_policy(self):
+    def _test_auto_wrap_policy(self, auto_wrap_policy):
         """Tests passing an ``auto_wrap_policy``."""
 
         local_model = Model(device=torch.device("cuda"))
         fsdp_wrapped_model = FSDP(
             copy.deepcopy(local_model),
-            auto_wrap_policy=Model.policy(),
+            auto_wrap_policy=auto_wrap_policy,
             use_orig_params=True,
         )
         composable_module = copy.deepcopy(local_model)
         fully_shard(
             composable_module,
-            policy=Model.policy(),
+            policy=auto_wrap_policy,
         )
 
         # Check that the composable module has the same names as the local
@@ -124,6 +123,13 @@ def test_auto_wrap_policy(self):
             composable_module_classes.add(type(submodule))
         self.assertEqual(local_module_classes, composable_module_classes)
 
+    @skip_if_lt_x_gpu(2)
+    def test_auto_wrap_policy(self):
+        self.run_subtests(
+            {"auto_wrap_policy": [None, Model.policy()]},
+            self._test_auto_wrap_policy,
+        )
+
     @skip_if_lt_x_gpu(2)
     def test_device_id(self):
         """Tests passing a ``device_id``."""
diff --git a/torch/distributed/fsdp/_wrap_utils.py b/torch/distributed/fsdp/_wrap_utils.py
index cdda065df1993..ba64c1b4416f4 100644
--- a/torch/distributed/fsdp/_wrap_utils.py
+++ b/torch/distributed/fsdp/_wrap_utils.py
@@ -101,14 +101,15 @@ def _get_submodule_to_states(
     # Record the modules to wrap without actually wrapping
     wrapped_modules: List[nn.Module] = []  # these are only logically wrapped
     wrapper_cls = functools.partial(_record_module_wrapper_cls, wrapped_modules)
-    _recursive_wrap(
-        root_module,
-        auto_wrap_policy=auto_wrap_policy.policy,
-        wrapper_cls=wrapper_cls,
-        ignored_modules=ignored_modules,
-        ignored_params=ignored_params,
-        only_wrap_children=False,
-    )
+    if auto_wrap_policy is not None:
+        _recursive_wrap(
+            root_module,
+            auto_wrap_policy=auto_wrap_policy.policy,
+            wrapper_cls=wrapper_cls,
+            ignored_modules=ignored_modules,
+            ignored_params=ignored_params,
+            only_wrap_children=False,
+        )
     # Always include the root module even if not wrapped by the given policy
     if root_module not in wrapped_modules:
         wrapped_modules.append(root_module)

From 9cbda193110c863786f1fb752dd2edea96fcfb4b Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Mon, 5 Dec 2022 04:01:30 +0000
Subject: [PATCH 1566/1922] Remove deprecated flatten_params_wrapper.py from
 lintrunner config (#90154)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90154
Approved by: https://github.com/awgu
---
 .lintrunner.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 714234b1ad7ed..6f2c60ac6c8ed 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -117,7 +117,6 @@ exclude_patterns = [
     'torch/distributed/distributed_c10d.py',
     # TODO(suo): these exclusions were added just to get lint clean on master.
     # Follow up to do more target suppressions and remove them.
-    'torch/distributed/fsdp/flatten_params_wrapper.py',
     'torch/ao/quantization/fx/convert.py',
     'torch/ao/quantization/_dbr/function_fusion.py',
     'test/test_datapipe.py',

From abaea5eb0c624bb53b593b22a9ab3868691baa35 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Sun, 4 Dec 2022 19:33:46 +0000
Subject: [PATCH 1567/1922] as_strided: Fix default storage_offset for
 reference implementation (#89513)

This fixes the default storage_offset to take it from the input. This was
previously untested, so I've also added a new OpInfo which includes samples with
non-zero storage_offsets on the input tensor.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89513
Approved by: https://github.com/ezyang, https://github.com/ngimel
---
 test/functorch/test_aotdispatch.py            |  1 +
 test/functorch/test_ops.py                    | 13 +++-
 test/functorch/test_vmap.py                   |  2 +
 torch/_refs/__init__.py                       | 10 ++-
 .../_internal/common_methods_invocations.py   | 72 ++++++++++++++++---
 5 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 2434e35ab4871..165041edfb306 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1776,6 +1776,7 @@ def forward(self, x):
     xfail('scatter_reduce', 'prod'),
 
     skip('as_strided_scatter'),
+    xfail('as_strided', 'partial_views'),
 
     # Too annoying to generate random inputs
     xfail('cholesky'),
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index fbee1872ddf3e..ece56ef145ae2 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -414,6 +414,7 @@ def wrapped_fn(*args, **kwargs):
         # BUG
         # AssertionError: Tensor-likes are not close!
         xfail('as_strided'),
+        xfail('as_strided', 'partial_views'),
         decorate('linalg.det', 'singular',
                  decorator=unittest.skipIf(IS_MACOS and IS_X86, "Fails on x86 MacOS CI")),
     }))
@@ -655,6 +656,7 @@ def fn(inp, *args, **kwargs):
         skip("atleast_3d"),  # Takes too long
         skip("ormqr"),  # Takes too long
         xfail("as_strided"),  # incorrect output
+        xfail("as_strided", "partial_views"),  # incorrect output
         xfail("as_strided_scatter"),  # incorrect output
         skip("bernoulli"),  # calls random op
         xfail("bfloat16"),  # rank 4 tensor for channels_last
@@ -735,6 +737,9 @@ def fn(inp, *args, **kwargs):
         tol1('svd',
              {torch.float32: tol(atol=1e-03, rtol=5e-04)}),
     ))
+    @skipOps('TestOperators', 'test_vmapvjpvjp', {
+        xfail('as_strided', 'partial_views'),
+    })
     def test_vmapvjpvjp(self, device, dtype, op):
         # Since, we test `vjpvjp` independently,
         # for this test, we just verify that vmap
@@ -802,6 +807,7 @@ def vjp_of_vjp(*args_and_cotangents):
         xfail('svd_lowrank', ''),  # randomness
         xfail('to_sparse', ''),  # non-dense output
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
+        xfail('as_strided', 'partial_views'),
         # ----------------------------------------------------------------------
 
         # ---------------------------- BUGS ------------------------------------
@@ -851,7 +857,9 @@ def vjp_of_vjp(*args_and_cotangents):
         tol1('linalg.householder_product',
              {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
     ))
-    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail)
+    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail.union({
+        xfail('as_strided', 'partial_views'),
+    }))
     def test_vmapvjp(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
@@ -899,6 +907,7 @@ def test_vmapvjp(self, device, dtype, op):
         decorate('linalg.det', 'singular', decorator=unittest.skipIf(IS_MACOS, "Fails on x86 MacOS CI")),
         skip('nn.functional.max_pool1d'),  # fails on cpu, runs on cuda
         xfail('masked.mean'),  # silent incorrectness (nan difference)
+        xfail('as_strided', 'partial_views'),  # Tensor-likes are not close!
 
         xfail('nn.functional.soft_margin_loss', ''),  # soft_margin_loss_backward does not support forward-ad
         xfail('tensor_split'),  # data_ptr composite compliance
@@ -1201,6 +1210,7 @@ def test():
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
         xfail("_native_batch_norm_legit"),
+        xfail('as_strided', 'partial_views'),
     }))
     def test_vjpvmap(self, device, dtype, op):
         # NB: there is no vjpvmap_has_batch_rule test because that is almost
@@ -1383,6 +1393,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
 
         # Potential bugs/errors
         xfail('as_strided'),  # AssertionError: Tensor-likes are not close!
+        xfail('as_strided', 'partial_views'),  # AssertionError: Tensor-likes are not close!
         xfail('as_strided_scatter'),  # AssertionError: Tensor-likes are not close!
         xfail('bernoulli'),  # calls random op
         xfail('bfloat16'),  # required rank 4 tensor to use channels_last format
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index dcad523217f3f..441cf46a98c22 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3302,6 +3302,7 @@ def test():
         xfail('triu'),  # Exception not raised on error input
         # The error inputs are vectors, that pass when batched as they are treated as a matrix
         xfail('trace'),
+        xfail('as_strided', 'partial_views'),
     }))
     def test_vmap_exhaustive(self, device, dtype, op):
         # needs to be fixed
@@ -3317,6 +3318,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
     ))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps('TestVmapOperatorsOpInfo', 'test_op_has_batch_rule', vmap_fail.union({
+        xfail('as_strided', 'partial_views'),
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         xfail('complex'),
         xfail('copysign'),
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 04bf9e12927fa..3539784e8e32c 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2513,9 +2513,15 @@ def atleast_3d(
 
 
 def as_strided(
-    a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int = 0
+    a: TensorLikeType,
+    size: ShapeType,
+    stride: StrideType,
+    storage_offset: Optional[int] = None,
 ) -> TensorLikeType:
-    return prims.as_strided(a, size, stride, storage_offset)
+    storage_offset_int = (
+        storage_offset if storage_offset is not None else a.storage_offset()
+    )
+    return prims.as_strided(a, size, stride, storage_offset_int)
 
 
 def broadcast_shapes(*shapes) -> ShapeType:
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index d11c275cc220c..91e1e8a1d636a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -263,9 +263,15 @@ def sample_inputs_as_strided(op_info, device, dtype, requires_grad, **kwargs):
         kwargs = dict(storage_offset=storage_offset)
         yield SampleInput(input_t, args=(output_shape, stride), kwargs=kwargs)
 
+def sample_inputs_as_strided_partial_views(op_info, device, dtype, requires_grad, **kwargs):
+    def make_arg():
+        base = make_tensor((20,), device=device, dtype=dtype)
+        return base[5:15].requires_grad_(requires_grad)
+
     # as_strided on offset, partial views
-    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)))
-    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)), kwargs={'storage_offset': 0})
+    yield SampleInput(make_arg(), (2, 2), (1, 2))
+    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=0)
+    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=10)
 
 def sample_inputs_as_strided_scatter(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -10721,8 +10727,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type='cpu'),
            )),
     OpInfo('as_strided',
-           op=lambda x, size, stride, storage_offset=0:
-               torch.as_strided(x, size, stride, storage_offset=storage_offset),
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
            supports_out=False,
            supports_forward_ad=True,
@@ -10743,7 +10747,47 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.skip("Numerous errors"), 'TestFwdGradients'),
-               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'))),
+               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'),
+           )),
+    OpInfo('as_strided',
+           variant_test_name='partial_views',
+           dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
+           supports_out=False,
+           supports_forward_ad=True,
+           supports_fwgrad_bwgrad=True,
+           # vmap does not support inplace views
+           check_inplace_batched_forward_grad=False,
+           sample_inputs_func=sample_inputs_as_strided_partial_views,
+           skips=(
+               # Note: This xfail is fine -- it's inherent to how as_strided works
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples'),
+               # RuntimeError: This operator is not Composite Compliant: the
+               # storage_offset of the tensor was modified directly without
+               # going through the PyTorch dispatcher.
+               DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance'),
+
+
+               # These fail because the test changes the input's in-memory layout
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_complex_half_reference_testing'),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_fn_fwgrad_bwgrad',
+                            dtypes=(torch.complex64, torch.complex128)),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_inplace_forward_mode_AD'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_grad'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_gradgrad'),
+               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo',
+                            'test_make_fx_symbolic_exhaustive_inplace'),
+               DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'),
+               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_correctness'),
+               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'),
+               # Fail but are also flaky
+               DecorateInfo(unittest.skip("Test changes in memory layout"), 'TestMathBits'),
+               DecorateInfo(unittest.skip("Modifies input strides and storage_offset"), 'TestCommon',
+                            'test_non_standard_bool_values'),
+           )),
     OpInfo('as_strided_scatter',
            op=lambda x, src, size, stride, storage_offset=0:
                torch.as_strided_scatter(x, src, size, stride, storage_offset=storage_offset),
@@ -18282,15 +18326,27 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
         supports_nvfuser=False,
         skips=(
-            # TODO: fix and/or update to xfails
-            DecorateInfo(unittest.skip("Errors when storage_offset is included"),
-                         'TestCommon', 'test_python_ref_meta'),
             # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
         ),
     ),
+    PythonRefInfo(
+        "_refs.as_strided",
+        torch_opinfo_name="as_strided",
+        torch_opinfo_variant_name="partial_views",
+        # FIXME: doesn't support chalf
+        dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        supports_nvfuser=False,
+        skips=(
+            # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
+        ),
+    ),
     PythonRefInfo(
         "_refs.broadcast_shapes",
         torch_opinfo_name="broadcast_shapes",

From e262e278f998a2b0c18536dd22c2318e6dc57f24 Mon Sep 17 00:00:00 2001
From: erjia <erjia@fb.com>
Date: Fri, 2 Dec 2022 22:20:42 +0000
Subject: [PATCH 1568/1922] [4/4][DataPipe] Remove iterator depletion in Zipper
 (#89974)

Fixes: https://github.com/pytorch/data/issues/865

I will add another PR in torchdata to validate this change would solve the infinite datapipe problem (I have tested locally). This is one of the most annoying stack of PRs cause by separation between TorchData and PyTorch.

There is a case that `file.close` is never called because when generator function has never reached to the end. A simple example would be `zip` two datepipes with different length. The longer DataPipe would never reach the end of generator and then it will be cleaned up by `gc`. So, the line of `file.close` is not executed. (This is the reason that Vitaly has to create this [hack](https://github.com/pytorch/pytorch/blob/4451eb24e6287dff62ff8a7ec0eda6a6998807b0/torch/utils/data/datapipes/iter/combining.py#L573-L583) to retrieve all remaining data to make sure generator function is fully executed)

However, this hack introduces another problem where an infinite datapipe would make `zip` never end as it would try to deplete the infinite iterator. See: https://github.com/pytorch/data/issues/865

So, in this PR, I am adding a `try-finally` clause to make sure the `file.close` is always executed during the destruction of `generator` object. Then, we don't need the hack within `zip` any more.

Differential Revision: [D41699469](https://our.internmc.facebook.com/intern/diff/D41699469)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89974
Approved by: https://github.com/NivekT, https://github.com/wenleix
---
 torch/utils/data/datapipes/iter/combining.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 4ce4c3b8c9fd5..088fcbffe38aa 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -603,20 +603,8 @@ def __init__(self, *datapipes: IterDataPipe):
 
     def __iter__(self) -> Iterator[Tuple[T_co]]:
         iterators = [iter(datapipe) for datapipe in self.datapipes]
-        try:
-            for data in zip(*iterators):
-                yield data
-        finally:
-            unused = []
-            for iterator in iterators:
-                try:
-                    unused += list(iterator)
-                except RuntimeError:  # Some iterators may have been invalidated by single iterator constraints
-                    pass
-
-            # TODO(VitalyFedyunin): This should be Exception or warning when torchdata.debug is enabled
-            for item in unused:
-                StreamWrapper.close_streams(item)
+        for data in zip(*iterators):
+            yield data
 
     def __len__(self) -> int:
         if self.length is not None:

From 3ec332b071718dbb4a86ddfaba3a20148dae84ca Mon Sep 17 00:00:00 2001
From: Lukas N Wirz <lnwirz@chem.helsinki.fi>
Date: Mon, 5 Dec 2022 16:50:00 +0000
Subject: [PATCH 1569/1922] Remove deprecated usage of is_pod/is_pod_v (#88918)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

… as equivalent replacements for std::is_pod and std::is_pod_v because they are deprecated in C++20.

When consuming libtorch header files in a project that uses C++20, there are warnings about std::is_pod being deprecated.  This patch fixes that issue.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88918
Approved by: https://github.com/ezyang
---
 aten/src/ATen/CPUGeneratorImpl.cpp          |  6 +++---
 aten/src/ATen/native/miopen/Conv_miopen.cpp |  2 +-
 aten/src/ATen/native/utils/ParamsHash.h     |  8 ++++----
 c10/core/impl/LocalDispatchKeySet.h         |  2 +-
 c10/util/C++17.h                            | 13 +++++++++++++
 c10/util/SmallBuffer.h                      |  4 +++-
 torch/csrc/jit/runtime/argument_spec.h      |  2 +-
 torch/csrc/profiler/collection.h            |  6 ++----
 torch/csrc/profiler/kineto_shim.cpp         |  2 +-
 9 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp
index d7dce2561d4f9..5fd06c442750d 100644
--- a/aten/src/ATen/CPUGeneratorImpl.cpp
+++ b/aten/src/ATen/CPUGeneratorImpl.cpp
@@ -127,8 +127,8 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
   using detail::CPUGeneratorImplState;
   using detail::CPUGeneratorImplStateLegacy;
 
-  static_assert(std::is_pod<CPUGeneratorImplStateLegacy>::value, "CPUGeneratorImplStateLegacy is not a PODType");
-  static_assert(std::is_pod<CPUGeneratorImplState>::value, "CPUGeneratorImplState is not a PODType");
+  static_assert(std::is_standard_layout<CPUGeneratorImplStateLegacy>::value, "CPUGeneratorImplStateLegacy is not a PODType");
+  static_assert(std::is_standard_layout<CPUGeneratorImplState>::value, "CPUGeneratorImplState is not a PODType");
 
   static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy);
   static const size_t size_current = sizeof(CPUGeneratorImplState);
@@ -207,7 +207,7 @@ c10::intrusive_ptr<c10::TensorImpl> CPUGeneratorImpl::get_state() const {
   using detail::CPUGeneratorImplState;
 
   static const size_t size = sizeof(CPUGeneratorImplState);
-  static_assert(std::is_pod<CPUGeneratorImplState>::value, "CPUGeneratorImplState is not a PODType");
+  static_assert(std::is_standard_layout<CPUGeneratorImplState>::value, "CPUGeneratorImplState is not a PODType");
 
   auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
   auto rng_state = state_tensor.data_ptr();
diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp
index 677a711ce7a6b..060a97d6fc1c1 100644
--- a/aten/src/ATen/native/miopen/Conv_miopen.cpp
+++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp
@@ -187,7 +187,7 @@ struct ConvolutionParams
 };
 // ConvolutionParams must be a POD because we read out its memory
 // contenst as char* when hashing
-static_assert(std::is_pod<ConvolutionParams>::value, "ConvolutionParams not POD");
+static_assert(std::is_standard_layout<ConvolutionParams>::value, "ConvolutionParams not POD");
 
 void setConvolutionParams(
     ConvolutionParams* params, miopenHandle_t handle,
diff --git a/aten/src/ATen/native/utils/ParamsHash.h b/aten/src/ATen/native/utils/ParamsHash.h
index 76bb4de53d633..c4056ab1b3f1e 100644
--- a/aten/src/ATen/native/utils/ParamsHash.h
+++ b/aten/src/ATen/native/utils/ParamsHash.h
@@ -11,8 +11,8 @@ namespace at { namespace native {
 template <typename Params>
 struct ParamsHash {
   // Params must be a POD because we read out its memory
-  // contenst as char* when hashing
-  static_assert(std::is_pod<Params>::value, "Params is not POD");
+  // contents as char* when hashing
+  static_assert(std::is_standard_layout<Params>::value, "Params is not POD");
 
   size_t operator()(const Params& params) const {
     auto ptr = reinterpret_cast<const uint8_t*>(&params);
@@ -28,8 +28,8 @@ struct ParamsHash {
 template <typename Params>
 struct ParamsEqual {
   // Params must be a POD because we read out its memory
-  // contenst as char* when comparing
-  static_assert(std::is_pod<Params>::value, "Params is not POD");
+  // contents as char* when comparing
+  static_assert(std::is_standard_layout<Params>::value, "Params is not POD");
 
   bool operator()(const Params& a, const Params& b) const {
     auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h
index 70af58b957165..391b8cff4939b 100644
--- a/c10/core/impl/LocalDispatchKeySet.h
+++ b/c10/core/impl/LocalDispatchKeySet.h
@@ -52,7 +52,7 @@ struct C10_API PODLocalDispatchKeySet {
   }
 };
 static_assert(
-    std::is_pod<PODLocalDispatchKeySet>::value,
+    std::is_trivial<PODLocalDispatchKeySet>::value,
     "PODLocalDispatchKeySet must be a POD type.");
 
 struct C10_API LocalDispatchKeySet {
diff --git a/c10/util/C++17.h b/c10/util/C++17.h
index c51275721e584..5227578481dea 100644
--- a/c10/util/C++17.h
+++ b/c10/util/C++17.h
@@ -49,6 +49,19 @@ using invoke_result = typename std::result_of<F && (args && ...)>;
 template <typename F, typename... args>
 using invoke_result_t = typename invoke_result<F, args...>::type;
 
+// std::is_pod is deprecated in C++20, std::is_standard_layout and
+// std::is_trivial are introduced in C++11, std::conjunction has been introduced
+// in C++17.
+template <typename T>
+#if defined(__cpp_lib_logical_traits) && __cpp_lib_logical_traits >= 201510L
+using is_pod = std::conjunction<std::is_standard_layout<T>, std::is_trivial<T>>;
+#else
+using is_pod = std::is_pod<T>;
+#endif
+
+template <typename T>
+constexpr bool is_pod_v = is_pod<T>::value;
+
 namespace guts {
 
 template <typename Base, typename Child, typename... Args>
diff --git a/c10/util/SmallBuffer.h b/c10/util/SmallBuffer.h
index 4dfa04c87190a..b519d30ec3963 100644
--- a/c10/util/SmallBuffer.h
+++ b/c10/util/SmallBuffer.h
@@ -15,7 +15,9 @@ namespace c10 {
 
 template <typename T, size_t N>
 class SmallBuffer {
-  static_assert(std::is_pod<T>::value, "SmallBuffer is intended for POD types");
+  static_assert(
+      std::is_trivial<T>::value,
+      "SmallBuffer is intended for POD types");
 
   T storage_[N];
   size_t size_;
diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h
index 66e53da24d1df..d09918522a812 100644
--- a/torch/csrc/jit/runtime/argument_spec.h
+++ b/torch/csrc/jit/runtime/argument_spec.h
@@ -66,7 +66,7 @@ struct ArgumentInfo {
 };
 
 static_assert(
-    std::is_pod<ArgumentInfo>::value,
+    std::is_standard_layout<ArgumentInfo>::value,
     "ArgumentInfo is to be a POD struct");
 static_assert(
     sizeof(ArgumentInfo) == sizeof(ArgumentInfo::plain_data_type),
diff --git a/torch/csrc/profiler/collection.h b/torch/csrc/profiler/collection.h
index cef614bd98612..7a957149d3612 100644
--- a/torch/csrc/profiler/collection.h
+++ b/torch/csrc/profiler/collection.h
@@ -176,9 +176,7 @@ struct RawAllocation {
 };
 
 // For performance.
-static_assert(
-    std::is_pod<RawAllocation>::value,
-    "Non-POD member of RawAllocation.");
+static_assert(c10::is_pod_v<RawAllocation>, "Non-POD member of RawAllocation.");
 
 template <>
 struct ExtraFields<EventType::Allocation> : RawAllocation {
@@ -204,7 +202,7 @@ struct ExtraFields<EventType::OutOfMemory> {
 
 // For performance.
 static_assert(
-    std::is_pod<ExtraFields<EventType::OutOfMemory>>::value,
+    c10::is_pod_v<ExtraFields<EventType::OutOfMemory>>,
     "Non-POD member of ExtraFields<EventType::OutOfMemory>.");
 
 struct PyFrameState {
diff --git a/torch/csrc/profiler/kineto_shim.cpp b/torch/csrc/profiler/kineto_shim.cpp
index bf84d327924b6..ba3582f0d6d95 100644
--- a/torch/csrc/profiler/kineto_shim.cpp
+++ b/torch/csrc/profiler/kineto_shim.cpp
@@ -37,7 +37,7 @@ const std::set<libkineto::ActivityType> cudaTypes = {
 #endif // USE_KINETO
 
 static_assert(
-    std::is_pod<DeviceAndResource>::value,
+    c10::is_pod_v<DeviceAndResource>,
     "Kineto specific details should be in `kineto_ids`.");
 
 const DeviceAndResource kineto_ids() {

From 75c8964ac0aeb71f768dd830489daa1aa419aac8 Mon Sep 17 00:00:00 2001
From: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Date: Mon, 5 Dec 2022 17:36:31 +0000
Subject: [PATCH 1570/1922] [ONNX] Fix concat with empty tensors (#87620)

Fixes #54410

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87620
Approved by: https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_no_runtime.py | 22 +++++++++++++++++++++
 torch/onnx/symbolic_opset9.py             | 24 +++++++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index 89526c71ca387..aca26e0cb2b42 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -1068,6 +1068,28 @@ def forward(self, x):
         ]
         self.assertEqual(len(all_aten_nodes), 0)
 
+    def test_cat_with_empty_tensor(self):
+        class NoopConcat(torch.nn.Module):
+            def forward(self, x):
+                return torch.cat((torch.Tensor([]), x))
+
+        x = torch.randn(4, 5, 6)
+        # TODO: Parametrize this test for opset_version
+        for opset_version in {9, 11}:
+            f = io.BytesIO()
+            torch.onnx.export(NoopConcat(), (x,), f, opset_version=opset_version)
+            loaded_model = onnx.load_from_string(f.getvalue())
+            self.assertEqual(
+                len(loaded_model.graph.output[0].type.tensor_type.shape.dim), 3
+            )
+            for idx, dim in enumerate(x.shape):
+                self.assertEqual(
+                    loaded_model.graph.output[0]
+                    .type.tensor_type.shape.dim[idx]
+                    .dim_value,
+                    dim,
+                )
+
 
 class TestQuantizeEagerONNXExport(common_utils.TestCase):
     def _test_lower_graph_impl(self, model, data):
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 9984f602425cd..e8fd99e5fc3dc 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -526,6 +526,30 @@ def reciprocal(g: jit_utils.GraphContext, self):
 @symbolic_helper.parse_args("v", "i")
 @_beartype.beartype
 def cat(g: jit_utils.GraphContext, tensor_list, dim):
+    tensors = symbolic_helper._unpack_list(tensor_list)
+    # torch.cat ignores empty tensors such as `torch.Tensor([])`
+    # These needs to be removed as input from ONNX's concat too, otherwise shape inference
+    # will likely fail due to inputs with different ranks (0 for empty tensor, > 0 for anything else)
+    nonempty_tensors = []
+    for t in tensors:
+        if symbolic_helper._is_constant(t) and not symbolic_helper._get_tensor_dim_size(
+            t, 0
+        ):
+            continue
+        nonempty_tensors.append(t)
+    assert len(nonempty_tensors) > 0
+    assert all(
+        [
+            symbolic_helper._get_tensor_rank(nonempty_tensors[0]) is None
+            or symbolic_helper._get_tensor_rank(t)
+            == symbolic_helper._get_tensor_rank(nonempty_tensors[0])
+            for t in nonempty_tensors
+        ]
+    )
+    tensor_list.node().removeAllInputs()
+    for t in nonempty_tensors:
+        tensor_list.node().addInput(t)
+
     tensors = symbolic_helper._unpack_list(tensor_list)
     return g.op("Concat", *tensors, axis_i=dim)
 

From 028d6faec0b234e926dab051c8a5baaf65d7469d Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Mon, 5 Dec 2022 17:53:23 +0000
Subject: [PATCH 1571/1922] Revert "as_strided: Fix default storage_offset for
 reference implementation (#89513)"

This reverts commit eded97ac7224ad5f80334acf57a3b0c24f83d89f.

Reverted https://github.com/pytorch/pytorch/pull/89513 on behalf of https://github.com/peterbell10 due to broke master
---
 test/functorch/test_aotdispatch.py            |  1 -
 test/functorch/test_ops.py                    | 13 +---
 test/functorch/test_vmap.py                   |  2 -
 torch/_refs/__init__.py                       | 10 +--
 .../_internal/common_methods_invocations.py   | 72 +++----------------
 5 files changed, 11 insertions(+), 87 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 165041edfb306..2434e35ab4871 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1776,7 +1776,6 @@ def forward(self, x):
     xfail('scatter_reduce', 'prod'),
 
     skip('as_strided_scatter'),
-    xfail('as_strided', 'partial_views'),
 
     # Too annoying to generate random inputs
     xfail('cholesky'),
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index ece56ef145ae2..fbee1872ddf3e 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -414,7 +414,6 @@ def wrapped_fn(*args, **kwargs):
         # BUG
         # AssertionError: Tensor-likes are not close!
         xfail('as_strided'),
-        xfail('as_strided', 'partial_views'),
         decorate('linalg.det', 'singular',
                  decorator=unittest.skipIf(IS_MACOS and IS_X86, "Fails on x86 MacOS CI")),
     }))
@@ -656,7 +655,6 @@ def fn(inp, *args, **kwargs):
         skip("atleast_3d"),  # Takes too long
         skip("ormqr"),  # Takes too long
         xfail("as_strided"),  # incorrect output
-        xfail("as_strided", "partial_views"),  # incorrect output
         xfail("as_strided_scatter"),  # incorrect output
         skip("bernoulli"),  # calls random op
         xfail("bfloat16"),  # rank 4 tensor for channels_last
@@ -737,9 +735,6 @@ def fn(inp, *args, **kwargs):
         tol1('svd',
              {torch.float32: tol(atol=1e-03, rtol=5e-04)}),
     ))
-    @skipOps('TestOperators', 'test_vmapvjpvjp', {
-        xfail('as_strided', 'partial_views'),
-    })
     def test_vmapvjpvjp(self, device, dtype, op):
         # Since, we test `vjpvjp` independently,
         # for this test, we just verify that vmap
@@ -807,7 +802,6 @@ def vjp_of_vjp(*args_and_cotangents):
         xfail('svd_lowrank', ''),  # randomness
         xfail('to_sparse', ''),  # non-dense output
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
-        xfail('as_strided', 'partial_views'),
         # ----------------------------------------------------------------------
 
         # ---------------------------- BUGS ------------------------------------
@@ -857,9 +851,7 @@ def vjp_of_vjp(*args_and_cotangents):
         tol1('linalg.householder_product',
              {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
     ))
-    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail.union({
-        xfail('as_strided', 'partial_views'),
-    }))
+    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail)
     def test_vmapvjp(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
@@ -907,7 +899,6 @@ def test_vmapvjp(self, device, dtype, op):
         decorate('linalg.det', 'singular', decorator=unittest.skipIf(IS_MACOS, "Fails on x86 MacOS CI")),
         skip('nn.functional.max_pool1d'),  # fails on cpu, runs on cuda
         xfail('masked.mean'),  # silent incorrectness (nan difference)
-        xfail('as_strided', 'partial_views'),  # Tensor-likes are not close!
 
         xfail('nn.functional.soft_margin_loss', ''),  # soft_margin_loss_backward does not support forward-ad
         xfail('tensor_split'),  # data_ptr composite compliance
@@ -1210,7 +1201,6 @@ def test():
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
         xfail("_native_batch_norm_legit"),
-        xfail('as_strided', 'partial_views'),
     }))
     def test_vjpvmap(self, device, dtype, op):
         # NB: there is no vjpvmap_has_batch_rule test because that is almost
@@ -1393,7 +1383,6 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
 
         # Potential bugs/errors
         xfail('as_strided'),  # AssertionError: Tensor-likes are not close!
-        xfail('as_strided', 'partial_views'),  # AssertionError: Tensor-likes are not close!
         xfail('as_strided_scatter'),  # AssertionError: Tensor-likes are not close!
         xfail('bernoulli'),  # calls random op
         xfail('bfloat16'),  # required rank 4 tensor to use channels_last format
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 441cf46a98c22..dcad523217f3f 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3302,7 +3302,6 @@ def test():
         xfail('triu'),  # Exception not raised on error input
         # The error inputs are vectors, that pass when batched as they are treated as a matrix
         xfail('trace'),
-        xfail('as_strided', 'partial_views'),
     }))
     def test_vmap_exhaustive(self, device, dtype, op):
         # needs to be fixed
@@ -3318,7 +3317,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
     ))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps('TestVmapOperatorsOpInfo', 'test_op_has_batch_rule', vmap_fail.union({
-        xfail('as_strided', 'partial_views'),
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         xfail('complex'),
         xfail('copysign'),
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 3539784e8e32c..04bf9e12927fa 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2513,15 +2513,9 @@ def atleast_3d(
 
 
 def as_strided(
-    a: TensorLikeType,
-    size: ShapeType,
-    stride: StrideType,
-    storage_offset: Optional[int] = None,
+    a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int = 0
 ) -> TensorLikeType:
-    storage_offset_int = (
-        storage_offset if storage_offset is not None else a.storage_offset()
-    )
-    return prims.as_strided(a, size, stride, storage_offset_int)
+    return prims.as_strided(a, size, stride, storage_offset)
 
 
 def broadcast_shapes(*shapes) -> ShapeType:
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 91e1e8a1d636a..d11c275cc220c 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -263,15 +263,9 @@ def sample_inputs_as_strided(op_info, device, dtype, requires_grad, **kwargs):
         kwargs = dict(storage_offset=storage_offset)
         yield SampleInput(input_t, args=(output_shape, stride), kwargs=kwargs)
 
-def sample_inputs_as_strided_partial_views(op_info, device, dtype, requires_grad, **kwargs):
-    def make_arg():
-        base = make_tensor((20,), device=device, dtype=dtype)
-        return base[5:15].requires_grad_(requires_grad)
-
     # as_strided on offset, partial views
-    yield SampleInput(make_arg(), (2, 2), (1, 2))
-    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=0)
-    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=10)
+    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)))
+    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)), kwargs={'storage_offset': 0})
 
 def sample_inputs_as_strided_scatter(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -10727,6 +10721,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type='cpu'),
            )),
     OpInfo('as_strided',
+           op=lambda x, size, stride, storage_offset=0:
+               torch.as_strided(x, size, stride, storage_offset=storage_offset),
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
            supports_out=False,
            supports_forward_ad=True,
@@ -10747,47 +10743,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.skip("Numerous errors"), 'TestFwdGradients'),
-               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'),
-           )),
-    OpInfo('as_strided',
-           variant_test_name='partial_views',
-           dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
-           supports_out=False,
-           supports_forward_ad=True,
-           supports_fwgrad_bwgrad=True,
-           # vmap does not support inplace views
-           check_inplace_batched_forward_grad=False,
-           sample_inputs_func=sample_inputs_as_strided_partial_views,
-           skips=(
-               # Note: This xfail is fine -- it's inherent to how as_strided works
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples'),
-               # RuntimeError: This operator is not Composite Compliant: the
-               # storage_offset of the tensor was modified directly without
-               # going through the PyTorch dispatcher.
-               DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance'),
-
-
-               # These fail because the test changes the input's in-memory layout
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_complex_half_reference_testing'),
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
-               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_fn_fwgrad_bwgrad',
-                            dtypes=(torch.complex64, torch.complex128)),
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_inplace_forward_mode_AD'),
-               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_grad'),
-               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_gradgrad'),
-               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo',
-                            'test_make_fx_symbolic_exhaustive_inplace'),
-               DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'),
-               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_correctness'),
-               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'),
-               # Fail but are also flaky
-               DecorateInfo(unittest.skip("Test changes in memory layout"), 'TestMathBits'),
-               DecorateInfo(unittest.skip("Modifies input strides and storage_offset"), 'TestCommon',
-                            'test_non_standard_bool_values'),
-           )),
+               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'))),
     OpInfo('as_strided_scatter',
            op=lambda x, src, size, stride, storage_offset=0:
                torch.as_strided_scatter(x, src, size, stride, storage_offset=storage_offset),
@@ -18326,27 +18282,15 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
         supports_nvfuser=False,
         skips=(
+            # TODO: fix and/or update to xfails
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"),
+                         'TestCommon', 'test_python_ref_meta'),
             # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
         ),
     ),
-    PythonRefInfo(
-        "_refs.as_strided",
-        torch_opinfo_name="as_strided",
-        torch_opinfo_variant_name="partial_views",
-        # FIXME: doesn't support chalf
-        dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
-        supports_nvfuser=False,
-        skips=(
-            # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
-            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
-            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
-            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
-        ),
-    ),
     PythonRefInfo(
         "_refs.broadcast_shapes",
         torch_opinfo_name="broadcast_shapes",

From 86135530a5f406c238d7a9914604db400a354060 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@fb.com>
Date: Sat, 3 Dec 2022 04:00:17 +0000
Subject: [PATCH 1572/1922] [Composable API] `replicate`: change to per module
 call, remove `mark_root_module()` (#89222)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89222
Approved by: https://github.com/zhaojuanmao
---
 .../distributed/_composable/test_replicate.py |  6 +--
 third_party/ideep                             |  2 +-
 third_party/kineto                            |  2 +-
 torch/distributed/_composable/replicate.py    | 40 ++++---------------
 4 files changed, 12 insertions(+), 38 deletions(-)

diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
index 3e8bf44a1fdea..db1459589b342 100644
--- a/test/distributed/_composable/test_replicate.py
+++ b/test/distributed/_composable/test_replicate.py
@@ -7,7 +7,7 @@
 import torch.distributed as dist
 import torch.nn.functional as F
 from torch import nn
-from torch.distributed._composable.replicate import mark_root_module, replicate
+from torch.distributed._composable.replicate import replicate
 from torch.testing._internal.common_distributed import MultiProcessTestCase
 from torch.testing._internal.common_utils import run_tests
 
@@ -91,12 +91,12 @@ def step_model(model, input, target):
 
     def test_replicate_single_module(self):
         model = Net()
-        replicate_model = mark_root_module(replicate(deepcopy(model)))
+        replicate_model = replicate(deepcopy(model))
         self._compare_module(model, replicate_model)
 
     def test_replicate_multi_module(self):
         model = Net()
-        replicate_model = mark_root_module(deepcopy(model))
+        replicate_model = deepcopy(model)
         replicate(replicate_model.fc1)
         replicate(replicate_model.fc2)
         replicate(replicate_model.fc3)
diff --git a/third_party/ideep b/third_party/ideep
index 5ddc65efe0428..ececd0a4f53c3 160000
--- a/third_party/ideep
+++ b/third_party/ideep
@@ -1 +1 @@
-Subproject commit 5ddc65efe0428bbce2942b3ce5e3ce15239abe2f
+Subproject commit ececd0a4f53c39f2d91caaddee0de1cd214f5b99
diff --git a/third_party/kineto b/third_party/kineto
index 6c1629809068e..0703c78999061 160000
--- a/third_party/kineto
+++ b/third_party/kineto
@@ -1 +1 @@
-Subproject commit 6c1629809068efd78a8d56b4aa479c7ec49ae562
+Subproject commit 0703c78999061b8329dfab7ec5046fc5764a5573
diff --git a/torch/distributed/_composable/replicate.py b/torch/distributed/_composable/replicate.py
index 0e94427afee88..c27a88d79b4d9 100644
--- a/torch/distributed/_composable/replicate.py
+++ b/torch/distributed/_composable/replicate.py
@@ -7,11 +7,7 @@
 from .contract import contract
 
 
-class DistributedState:
-    ...
-
-
-class ReplicateState(DistributedState):
+class _ReplicateState:
     def __init__(self) -> None:
         self.modules: List[nn.Module] = []
         self.has_initialized: bool = False
@@ -22,6 +18,9 @@ def mark_modules(self, *modules: nn.Module) -> None:
             self.modules.append(module)
             replicate.state(module)._distributed_state = self
             replicate.state(module)._params_collected = False
+            module.register_forward_pre_hook(self.forward_pre_hook)
+            # TODO(@yhcharles): fix type error
+            module.register_forward_hook(self.forward_post_hook)  # type: ignore[arg-type]
 
     def _recursive_collect_params(self, module: nn.Module) -> None:
         # TODO: skip if managed by other APIs
@@ -50,13 +49,13 @@ def init_helper(self):
 
         self._ddp = _ddp.DistributedDataParallel(self._param_list)
 
-    def root_module_forward_pre_hook(
+    def forward_pre_hook(
         self, module: nn.Module, input: Tuple[torch.Tensor]
     ) -> None:
         self.init_helper()
         self._ddp.pre_forward()
 
-    def root_module_forward_post_hook(
+    def forward_post_hook(
         self,
         module: nn.Module,
         input: Tuple[torch.Tensor],
@@ -65,14 +64,9 @@ def root_module_forward_post_hook(
         return self._ddp.post_forward(output)
 
 
-# TODO(@yhcharles): use a per-model instance instead of a global one
-_default_state = ReplicateState()
-
-
 @contract
 def replicate(
     module: nn.Module,  # NOTE: contract now supports single module only
-    dist_state: ReplicateState = _default_state,
 ) -> nn.Module:
     r"""Replicates module(s)
 
@@ -83,25 +77,5 @@ def replicate(
         >>> module = nn.Linear(3, 3)
         >>> replicate(module)
     """
-    dist_state.mark_modules(module)
-    return module
-
-
-def mark_root_module(
-    module: nn.Module, dist_state: ReplicateState = _default_state
-) -> nn.Module:
-    r"""Mark the root module. Its sub-modules can be replicated.
-
-    Args:
-        modules (torch.nn.Module): root module
-
-    Example::
-        >>> module = nn.Linear(3, 3)
-        >>> replicate(module)
-    """
-    module.register_forward_pre_hook(dist_state.root_module_forward_pre_hook)
-    # TODO(@yhcharles): fix type error
-    module.register_forward_hook(
-        dist_state.root_module_forward_post_hook  # type: ignore[arg-type]
-    )
+    _ReplicateState().mark_modules(module)
     return module

From d4215ad65bed72601995a473c067fa8b8493ac9f Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Mon, 5 Dec 2022 18:26:37 +0000
Subject: [PATCH 1573/1922] [LTC] Restore default ctor for LazyTensor (#90086)

Summary:
This pull request introduced a temporarily change that make XLA's LTC migration easier. One step among is to make XLATensor naively inherits LazyTensor and that requires LazyTensor to have a default constructor.

Test Plan:
CI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90086
Approved by: https://github.com/JackCaoG, https://github.com/kit1980
---
 torch/csrc/lazy/core/tensor.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h
index 8dfa5a077c973..a5f94ea2fbec6 100644
--- a/torch/csrc/lazy/core/tensor.h
+++ b/torch/csrc/lazy/core/tensor.h
@@ -61,7 +61,9 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
   // have to check both lazy_tensor_ptr && *lazy_tensor_ptr, so everywhere that
   // used to rely on a LazyTensor obj with a null Data can now rely on a null
   // LazyTensorPtr instead.
-  LazyTensor() = delete;
+  // TODO(alanwaketan): This is a temporarily change to make XLA LTC migration
+  // easier. Restore it back to delete.
+  LazyTensor() = default;
 
   size_t generation() const {
     return data()->generation;

From c91756c3137b3f697e9558a51000de0f764348f9 Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Mon, 5 Dec 2022 18:30:02 +0000
Subject: [PATCH 1574/1922] Fix missing line in XLA backend after mergebot +
 ghstack gap (#90197)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90197
Approved by: https://github.com/clee2000
---
 torch/_dynamo/optimizations/backends.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 007b02018f2a8..7d7289ac67435 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -750,6 +750,7 @@ def torchxla_trace_once(subgraph):
     compiled_graph = None
 
     def fwd(*args):
+        nonlocal subgraph
         nonlocal compiled_graph
         model = subgraph.model
         if compiled_graph is None:

From b6998cd2b930f29a73bd8dc9046abc609b382b5f Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Sun, 4 Dec 2022 19:20:23 +0000
Subject: [PATCH 1575/1922] Light refactor to how we get shape_env for graph
 lowering (#90139)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90139
Approved by: https://github.com/ezyang
---
 torch/_inductor/compile_fx.py | 28 ++++++++++++++++++++--------
 torch/_inductor/config.py     | 10 +++++-----
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 9e94f37ef89a8..15b5d5910895f 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -9,6 +9,7 @@
 from functorch.compile import min_cut_rematerialization_partition
 
 import torch.fx
+from torch._dynamo.utils import fake_mode_from_tensors
 from torch._functorch.aot_autograd import make_boxed_func
 from torch._subclasses.fake_tensor import FakeTensor
 
@@ -85,10 +86,7 @@ def _step_logger():
 
 @DebugContext.wrap
 def count_bytes_inner(gm, example_inputs, num_fixed=0, **kwargs):
-    shape_env = None
-    for inp in example_inputs:
-        if isinstance(inp, FakeTensor) and inp.fake_mode.shape_env is not None:
-            shape_env = inp.fake_mode.shape_env
+    shape_env = _shape_env_from_inputs(example_inputs)
 
     graph = GraphLowering(gm, shape_env=shape_env, num_static_inputs=num_fixed)
     with V.set_graph_handler(graph):
@@ -127,10 +125,8 @@ def compile_fx_inner(
 
     if cudagraphs is None:
         cudagraphs = config.triton.cudagraphs
-    shape_env = None
-    for inp in example_inputs:
-        if isinstance(inp, FakeTensor) and inp.fake_mode.shape_env is not None:
-            shape_env = inp.fake_mode.shape_env
+
+    shape_env = _shape_env_from_inputs(example_inputs)
 
     graph = GraphLowering(
         gm, shape_env=shape_env, num_static_inputs=num_fixed, graph_id=graph_id
@@ -410,3 +406,19 @@ def bw_compiler(model: torch.fx.GraphModule, example_inputs):
             # fails without forcing a compile lol.
             force_compile_tiny_graphs=True,
         )(model_, example_inputs_)
+
+
+def _shape_env_from_inputs(inputs):
+    shape_env = None
+    fake_mode = fake_mode_from_tensors(inputs)
+
+    # TODO(voz): It would be nice to enable this assert, but there are lots of tests that
+    # pass in real inputs for now.
+    # if len(inputs) > 0:
+    # assert fake_mode is not None, breakpoint()
+
+    if fake_mode is not None:
+        return fake_mode.shape_env
+
+    # TODO(voz): Should we always have one anyway?
+    return None
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 92666886a002d..578745c216784 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -119,12 +119,12 @@ class cpp:
     min_chunk_size = 4096
     cxx = (
         None,  # download gcc12 from conda-forge if conda is installed
-        "g++-12",
-        "g++-11",
-        "g++-10",
-        "clang++",
+        # "g++-12",
+        # "g++-11",
+        # "g++-10",
+        # "clang++",
         "g++",
-        "g++.par",
+        # "g++.par",
     )
     # Allow kernel performance profiling via PyTorch profiler
     enable_kernel_profile = False

From df4f75adbc18b65e5ee21e4e5067ac939d61ad9c Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Mon, 5 Dec 2022 18:39:47 +0000
Subject: [PATCH 1576/1922] fix c10::detail::integer_iterator for C++17
 (#90174)

Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/pytorch/pytorch/pull/90174).
* __->__ #90174

fix c10::detail::integer_iterator for C++17

Summary: std::iterator is deprecated.

Test Plan: Rely on CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90174
Approved by: https://github.com/clee2000, https://github.com/malfet
---
 c10/util/irange.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/c10/util/irange.h b/c10/util/irange.h
index 16fa682eb0d47..78cf94f25c2d8 100644
--- a/c10/util/irange.h
+++ b/c10/util/irange.h
@@ -17,7 +17,13 @@ template <
     typename I,
     bool one_sided = false,
     typename std::enable_if<std::is_integral<I>::value, int>::type = 0>
-struct integer_iterator : std::iterator<std::input_iterator_tag, I> {
+struct integer_iterator {
+  using iterator_category = std::input_iterator_tag;
+  using value_type = I;
+  using difference_type = std::ptrdiff_t;
+  using pointer = I*;
+  using reference = I&;
+
   explicit integer_iterator(I value) : value(value) {}
 
   I operator*() const {

From 2e1b9e2b83b851aa2da87154d14e9e02e665942c Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Mon, 5 Dec 2022 19:12:29 +0000
Subject: [PATCH 1577/1922] workaround for indexing formulas with negative
 terms (#89933)

Fixes https://github.com/pytorch/torchdynamo/issues/1928
For  `ModularIndexing` we generate indexing code with `//` and `%` operators. When `ModularIndexing` base is negative (that can happen after valid simplifications), `//` in triton produces wrong results https://github.com/openai/triton/issues/619/. For `//` op coming from pytorch, we have codegen workarounds, but I'm reluctant to apply these workarounds to very common indexing computation patterns, both for code readability and perf considerations.
Similarly, we replace `ModularIndexing` with `IndexingDiv` when we can prove that base is small, but those assumptions break when `ModularIndexing` base is negative (`ModularIndexing` is always positive, `IndexingDiv` isn't).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89933
Approved by: https://github.com/jansel
---
 test/inductor/test_torchinductor.py | 15 ++++++++++++++-
 torch/_inductor/ir.py               | 18 ++++++++++++++++--
 torch/_inductor/sizevars.py         |  5 +++++
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 4e1c60f40051e..755a84d06a75d 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -521,7 +521,12 @@ def test_indexing_simplification(self):
         self.assertEqual(
             sizevars.simplify_with_ranges(expr, var_ranges), i1 + 128 * i2 + 64 * r3
         )
-
+        # if there are negative terms in ModularIndexing base, we cannot replace it with IndexingDiv
+        expr = ModularIndexing(i1 - 15, 1, 64)
+        self.assertEqual(
+            sizevars.simplify_with_ranges(expr, var_ranges),
+            ModularIndexing(i1 - 15, 1, 64),
+        )
         # small terms should be kept if the rest is not guaranteed to be divisible
         self.assertEqual(
             sizevars.simplify_with_ranges(IndexingDiv(r3 + i2 + i1, 32), var_ranges),
@@ -551,6 +556,14 @@ def test_indexing_simplification(self):
             ModularIndexing(i0 + i1 * i2 * r3, i2, r3), ModularIndexing(i0, i2, r3)
         )
 
+        # if there are negative terms, we cannot optimize away zero terms due to https://github.com/openai/triton/issues/619
+        self.assertEqual(
+            ModularIndexing(-i0 + i1 * 20, 2, 10), ModularIndexing(-i0 + i1 * 20, 2, 10)
+        )
+        self.assertEqual(
+            ModularIndexing(-15 + i1 * 20, 2, 10), ModularIndexing(-15 + i1 * 20, 2, 10)
+        )
+
         # Constant fold from divisor into base
         self.assertEqual(ModularIndexing(i0 * 4, 2, 10), ModularIndexing(i0 * 2, 1, 10))
         self.assertEqual(IndexingDiv(i0 * 4, 2), i0 * 2)
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 56d5e8ef1d4c7..2ce3fa37388bc 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -190,10 +190,24 @@ def eval(cls, base, divisor, modulus):
 
         if isinstance(base, sympy.Add):
             new_terms = []
+            all_positive = True
             for term in base.args:
                 if sympy.gcd(term, modulus * divisor) != modulus * divisor:
-                    new_terms.append(term)
-            if len(new_terms) != len(base.args):
+                    if (isinstance(term, sympy.Integer) and term < 0) or (
+                        isinstance(term, sympy.Mul)
+                        and isinstance(term.args[0], sympy.Integer)
+                        and term.args[0] < 0
+                    ):
+                        # workaround for https://github.com/openai/triton/issues/619,
+                        # if there are negative terms, // produces wrong result
+                        # TODO if https://github.com/openai/triton/issues/619 is fixed
+                        # this optimization would become valid
+                        all_positive = False
+                        break
+                    else:
+                        new_terms.append(term)
+
+            if len(new_terms) != len(base.args) and all_positive:
                 return ModularIndexing(sum(new_terms), divisor, modulus)
 
         if isinstance(base, IndexingDiv):
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index fda61b0933574..7895f0dccdcba 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -146,6 +146,11 @@ def visit_modular_indexing(base, divisor, modulus):
                 base_s = base.args[2] - 1
             elif not base.has(ModularIndexing):
                 # actual iteration range is to size-1
+                iter_ranges_zero = {k: 0 for k, v in var_ranges.items()}
+                base_lowest = sympy_subs(base, iter_ranges_zero)
+                if self.maybe_guard_lt(base_lowest, 0):
+                    # can't replace with indexing div if base can be negative
+                    return ModularIndexing(base, divisor, modulus)
                 iter_ranges = {k: v - 1 for k, v in var_ranges.items()}
                 base_s = sympy_subs(base, iter_ranges)
             else:

From cc448382f8c986219089dcad9ba27aecee780188 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 5 Dec 2022 10:20:30 -0500
Subject: [PATCH 1578/1922] Assume that co_firstlineno is always defined
 (#90180)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90180
Approved by: https://github.com/albanD
---
 torch/_dynamo/symbolic_convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index df7966c50168b..c594533e8b9a5 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -1497,7 +1497,7 @@ def __init__(
         self.current_instruction: Instruction = create_instruction("NOP")
         self.next_instruction: typing.Optional[Instruction] = None
         self.block_stack: List[BlockStackEntry] = []
-        self.lineno: int = code_options.get("co_firstlineno")
+        self.lineno: int = code_options["co_firstlineno"]
 
         # Properties of the input/output code
         self.instructions: List[Instruction] = instructions

From 8f7e20b9b520bbf1ea2dea6e27bc9414ae69c47f Mon Sep 17 00:00:00 2001
From: David Berard <dberard@fb.com>
Date: Thu, 1 Dec 2022 13:46:46 -0800
Subject: [PATCH 1579/1922] [NNC] Use New PassManager for LLVM >= 15 (#89978)

This is needed because TargetMachine::adjustPassManager was removed in https://reviews.llvm.org/D137796. However, we need to keep around the old pass manager implementation for LLVM < 12.

Based on this: https://llvm.org/docs/NewPassManager.html

Tests: `./build/bin/test_tensorexpr` passes.

RUN_TORCHBENCH: nvfuser

Differential Revision: [D41636445](https://our.internmc.facebook.com/intern/diff/D41636445)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89978
Approved by: https://github.com/bertmaher
---
 torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 65 ++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
index f6801973dd6b1..a889420f944ad 100644
--- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp
@@ -8,6 +8,7 @@
 #include <torch/csrc/jit/tensorexpr/analysis.h>
 #include <torch/csrc/jit/tensorexpr/llvm_jit.h>
 
+// Note [llvm::SCEVPredicate non-virtual destructor]
 // llvm::SCEVPredicate has virtual function but non-virtual destructor
 // https://github.com/llvm/llvm-project/blob/c1a0a213378a458fbea1a5c77b315c7dce08fd05/llvm/include/llvm/Analysis/ScalarEvolution.h#L198
 #pragma GCC diagnostic push
@@ -15,15 +16,30 @@
 #include <llvm/Analysis/TargetTransformInfo.h>
 #pragma GCC diagnostic pop
 
+#include <llvm/Analysis/CGSCCPassManager.h>
+#include <llvm/Analysis/LoopAnalysisManager.h>
 #include <llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h>
 #include <llvm/ExecutionEngine/Orc/ThreadSafeModule.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/LegacyPassManager.h>
 #include <llvm/IR/MDBuilder.h>
+#include <llvm/IR/PassManager.h>
 #include <llvm/IR/Verifier.h>
 #include <llvm/MC/MCSubtargetInfo.h>
+#include <llvm/Pass.h>
+
+// see Note [llvm::SCEVPredicate non-virtual destructor]
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#include <llvm/Passes/PassBuilder.h>
+#pragma GCC diagnostic pop
+
 #include <llvm/Support/Host.h>
 #include <llvm/Support/TargetSelect.h>
+#include <llvm/Transforms/IPO/AlwaysInliner.h>
+#include <llvm/Transforms/Scalar/DCE.h>
+#include <llvm/Transforms/Vectorize/LoopVectorize.h>
+#include <llvm/Transforms/Vectorize/SLPVectorizer.h>
 
 #if LLVM_VERSION_MAJOR >= 10
 #include <llvm/Support/CodeGen.h>
@@ -2446,6 +2462,54 @@ void LLVMCodeGenImpl::visit(CondPtr v) {
   irb_.SetInsertPoint(end_block);
 }
 
+// "New" PassManager needed to replace TM.adjustPassManager
+#if LLVM_VERSION_MAJOR >= 15
+void LLVMCodeGenImpl::optimize(llvm::Module& M) {
+  // Add internal analysis passes from the target machine.
+  auto& TM = jit_->getTargetMachine();
+
+  // Create the analysis managers.
+  llvm::LoopAnalysisManager LAM;
+  llvm::FunctionAnalysisManager FAM;
+  llvm::CGSCCAnalysisManager CGAM;
+  llvm::ModuleAnalysisManager MAM;
+
+  // Create the new pass manager builder.
+  // Take a look at the PassBuilder constructor parameters for more
+  // customization, e.g. specifying a TargetMachine or various debugging
+  // options.
+  llvm::PassBuilder PB(&TM);
+
+  TM.registerPassBuilderCallbacks(PB);
+
+  // Register all the basic analyses with the managers.
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  llvm::ModulePassManager MPM =
+      PB.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
+  llvm::FunctionPassManager FPM = PB.buildFunctionSimplificationPipeline(
+      llvm::OptimizationLevel::O3, llvm::ThinOrFullLTOPhase::None);
+
+  FAM.registerPass([&] { return TM.getTargetIRAnalysis(); });
+
+  FPM.addPass(llvm::LoopVectorizePass());
+  FPM.addPass(llvm::SLPVectorizerPass());
+
+  FPM.addPass(llvm::DCEPass());
+  MPM.addPass(llvm::AlwaysInlinerPass());
+
+  MPM.run(M, MAM);
+  for (auto& FF : M) {
+    if (!FF.empty()) {
+      FPM.run(FF, FAM);
+    }
+  }
+}
+#else // "Old" PassManager
 void LLVMCodeGenImpl::optimize(llvm::Module& M) {
   llvm::legacy::FunctionPassManager FPM(&M);
   llvm::legacy::PassManager PM;
@@ -2472,6 +2536,7 @@ void LLVMCodeGenImpl::optimize(llvm::Module& M) {
   }
   FPM.doFinalization();
 }
+#endif
 
 RegisterCodeGen<LLVMCodeGen> llvm_codegen_reg("llvm_codegen");
 

From 5ac9112979ced5939fac6e3ac757ffea0be6d611 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Mon, 5 Dec 2022 20:33:23 +0000
Subject: [PATCH 1580/1922] Fix issue 38095 TODOs in NCCL tests (#90033)

Fix TODOs related to https://github.com/pytorch/pytorch/issues/38095

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90033
Approved by: https://github.com/awgu
---
 test/distributed/test_c10d_nccl.py | 70 +++++++++++++-----------------
 1 file changed, 30 insertions(+), 40 deletions(-)

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 85ebb6b75bc5f..bcd9935432c79 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -290,8 +290,7 @@ def broadcast(xs, rootRank, rootTensor):
             # Run with 1 input tensor
             x = torch.tensor([self.rank]).cuda(self.rank_to_GPU[self.rank][0])
             output = broadcast([x], i, 0)
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(torch.tensor([i]), output[0])
+            self.assertEqual(torch.tensor([i]), output[0])
 
             expected_tensor = torch.empty([i + 1, i + 1]).fill_(i + 1)
             xs = [torch.empty([i + 1, i + 1]).fill_(-1).cuda(device=device_idx) for device_idx in self.rank_to_GPU[self.rank]]
@@ -326,10 +325,9 @@ def allreduce(tensors, op):
 
         allreduce(tensors, c10d.ReduceOp.SUM)
 
-        ndev = float(self.world_size)
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(
-            torch.tensor([ndev * (ndev + 1) / 2]),
+        ndev = self.world_size
+        self.assertEqual(
+            torch.tensor([ndev * (ndev + 1) // 2]),
             tensors[0],
         )
 
@@ -338,9 +336,8 @@ def allreduce(tensors, op):
             tensors = [torch.tensor([self.rank + 1.]).cuda(local_device_id)]
 
             allreduce(tensors, c10d.ReduceOp.AVG)
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            ndev = float(self.world_size)
-            self.assertEqualIgnoreType(
+            ndev = self.world_size
+            self.assertEqual(
                 torch.tensor([ndev * (ndev + 1.) / (2. * ndev)]),
                 tensors[0],
             )
@@ -353,9 +350,9 @@ def allreduce(tensors, op):
 
                     allreduce(tensors, c10d._make_nccl_premul_sum(factor))
 
-                    # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                    self.assertEqualIgnoreType(
-                        factor * torch.tensor([float(self.world_size * (self.world_size + 1) / 2)], device=local_device_id),
+                    self.assertEqual(
+                        factor * torch.tensor([self.world_size * (self.world_size + 1) / 2],
+                                              dtype=dtype, device=local_device_id),
                         tensors[0],
                     )
 
@@ -363,17 +360,15 @@ def allreduce(tensors, op):
         tensors = [torch.tensor([self.rank + 1]).cuda(local_device_id)]
 
         allreduce(tensors, c10d.ReduceOp.PRODUCT)
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(
-            torch.tensor([float(math.factorial(self.world_size))]), tensors[0]
+        self.assertEqual(
+            torch.tensor([math.factorial(self.world_size)]), tensors[0]
         )
 
         # Min
         tensors = [torch.tensor([self.rank + 1]).cuda(local_device_id)]
 
         allreduce(tensors, c10d.ReduceOp.MIN)
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(torch.tensor([1.0]), tensors[0])
+        self.assertEqual(torch.tensor([1]), tensors[0])
 
         # Max
         tensors = [torch.tensor([self.rank + 1]).cuda(local_device_id)]
@@ -410,14 +405,13 @@ def reduce(xs, rootRank, rootTensor, op=None):
 
             reduce(tensors, rt, 0)
 
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
             if self.rank == rt:
-                self.assertEqualIgnoreType(
-                    torch.tensor([float(self.world_size * (self.world_size + 1) / 2)]),
+                self.assertEqual(
+                    torch.tensor([self.world_size * (self.world_size + 1) // 2]),
                     tensors[0],
                 )
             else:
-                self.assertEqualIgnoreType(
+                self.assertEqual(
                     torch.tensor([self.rank + 1]),
                     tensors[0],
                 )
@@ -511,7 +505,7 @@ def allgather_base(output_t, input_t):
             work = pg._allgather_base(output_t, input_t)
             work.wait()
 
-        # anticpate an error
+        # anticipate an error
         with self.assertRaisesRegex(
             RuntimeError,
             "output tensor size must be equal to world_size times input tensor size",
@@ -523,7 +517,7 @@ def allgather_base(output_t, input_t):
             # fails the check because output_t is not correctly sized
             allgather_base(output_t, tensor)
 
-        # anticpate an error
+        # anticipate an error
         with self.assertRaisesRegex(
             RuntimeError, "output tensor must have the same type as input tensor"
         ):
@@ -799,7 +793,7 @@ def reduce_scatter_base(output_t, input_t):
             work = pg._reduce_scatter_base(output_t, input_t)
             work.wait()
 
-        # anticpate an error
+        # anticipate an error
         with self.assertRaisesRegex(
             RuntimeError,
             "input tensor must be the same size as output size times world size",
@@ -811,7 +805,7 @@ def reduce_scatter_base(output_t, input_t):
             # fails the check because output_t is not correctly sized
             reduce_scatter_base(output_t, input_t)
 
-        # anticpate an error
+        # anticipate an error
         with self.assertRaisesRegex(
             RuntimeError, "input tensor must be the same type as the output tensor."
         ):
@@ -859,13 +853,12 @@ def reduce_scatter(outputs, input_lists, op):
         for i in range(num_gpus):
             expected = torch.tensor(
                 [
-                    float((1 + self.world_size) * self.world_size / 2)
+                    (1 + self.world_size) * self.world_size // 2
                     + self.world_size * self.rank
                 ])
 
 
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(expected, output[i])
+            self.assertEqual(expected, output[i])
 
         # Min
         reduce_scatter(output, tensor_lists, c10d.ReduceOp.MIN)
@@ -886,7 +879,7 @@ def reduce_scatter(outputs, input_lists, op):
         # Product
         reduce_scatter(output, tensor_lists, c10d.ReduceOp.PRODUCT)
 
-        # math pakcage don't have math.perm until python 3.8, so
+        # math package don't have math.perm until python 3.8, so
         # we implement a naive version here.
         def perm(n, k):
             prod_val = n
@@ -898,8 +891,7 @@ def perm(n, k):
             prod_val = perm(self.rank + self.world_size, self.world_size)
 
             expected = torch.tensor([prod_val])
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(expected, output[i])
+            self.assertEqual(expected, output[i])
 
         # Test the input params overridden scenarios, aka, when the input is
         # a list and output is just one tensor.
@@ -908,19 +900,19 @@ def perm(n, k):
         input_list = [tensor[0].cuda(self.rank) for tensor in input_per_gpu]
         pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.SUM).wait()
         expected = torch.tensor(
-            float((1 + self.world_size) * self.world_size / 2) + self.world_size * self.rank
+            (1 + self.world_size) * self.world_size // 2 + self.world_size * self.rank
         )
-        self.assertEqualIgnoreType(expected, output_tensor)
+        self.assertEqual(expected, output_tensor)
 
         # Min
         pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.MIN).wait()
         expected = torch.tensor(self.rank + 1)
-        self.assertEqualIgnoreType(expected, output_tensor)
+        self.assertEqual(expected, output_tensor)
 
         # Max
         pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.MAX).wait()
         expected = torch.tensor(self.rank + self.world_size)
-        self.assertEqualIgnoreType(expected, output_tensor)
+        self.assertEqual(expected, output_tensor)
 
         # Product
         pg.reduce_scatter(output_tensor, input_list, c10d.ReduceOp.PRODUCT).wait()
@@ -928,7 +920,7 @@ def perm(n, k):
         for k in range(1, self.world_size):
             prod_val = prod_val * (self.rank + 1 + k)
         expected = torch.tensor(prod_val)
-        self.assertEqualIgnoreType(expected, output_tensor)
+        self.assertEqual(expected, output_tensor)
 
         if torch.cuda.nccl.version() >= (2, 11, 1):
             for factor in (3.0, torch.tensor([5.0], device=self.rank)):
@@ -995,8 +987,7 @@ def allreduce(tensors):
 
         for i in range(1, len(local_device_ids) + 1):
             for j in range(i):
-                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                self.assertEqualIgnoreType(
+                self.assertEqual(
                     torch.tensor([(j + 1) * self.world_size]), tensors_list[i - 1][j]
                 )
 
@@ -1662,8 +1653,7 @@ def step_model(model, input, target):
                     target[self.rank : (self.rank + 1)],
                 )
                 for i, j in zip(model.parameters(), ddp_model.parameters()):
-                    # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                    self.assertEqualIgnoreType(i.grad, j.grad, rtol=1.3e-06, atol=5e-5)
+                    self.assertEqual(i.grad, j.grad, rtol=1.3e-06, atol=5e-5)
 
             # Shuffle the input so that DDP input is different
             torch.manual_seed(1337 + iteration)

From d05b6cdb57e19212105240e68e2f1472ba4628d0 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Mon, 5 Dec 2022 20:48:05 +0000
Subject: [PATCH 1581/1922] Simplify by using yield from (#90160)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90160
Approved by: https://github.com/albanD, https://github.com/soulitzer
---
 torch/autograd/function.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/torch/autograd/function.py b/torch/autograd/function.py
index f4810712cab3a..2fc95f72d7aa0 100644
--- a/torch/autograd/function.py
+++ b/torch/autograd/function.py
@@ -516,13 +516,11 @@ def _iter(obj):
             return
         elif isinstance(obj, (list, tuple)):
             for o in obj:
-                for var in _iter(o):
-                    yield var
+                yield from _iter(o)
         elif isinstance(obj, dict):
             # We only accept primitive key types, so we needn't inspect them
             for o in obj.values():
-                for var in _iter(o):
-                    yield var
+                yield from _iter(o)
         elif allow_unknown:
             yield obj
         else:

From f00742904f546db6e66566ff93bb85f9a01d310d Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Mon, 5 Dec 2022 18:22:44 +0000
Subject: [PATCH 1582/1922] [FSDP][optim_state_dict][1/N] Restructure
 _optim_state_dict to prepare the support of use_orig_param (#89898)

**Motivation:**
Restructure some APIs in _optim_state_dict.py to allow better future extension, mostly for supporting use_orig_params. NO logic change in this PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89898
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/_optim_utils.py | 257 ++++++++++++-------------
 1 file changed, 123 insertions(+), 134 deletions(-)

diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index 70fb4156d5378..d8cceb95a3b91 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -1,4 +1,3 @@
-import collections
 import copy
 import functools
 from typing import (
@@ -1071,51 +1070,104 @@ def _get_param_to_param_id_from_optim_input(
     return {param: param_id for param_id, param in enumerate(param_id_to_param)}
 
 
-def _get_unflat_to_flat_param_ids(
-    flat_to_unflat_param_ids: Dict[int, List[int]],
-) -> List[int]:
-    """
-    Inverts the mapping ``flat_to_unflat_param_ids`` to be from unflattened
-    parameter ID to flattened parameter ID, where the unflattened parameter ID
-    is the index in the returned :class:`list`. There may be multiple
-    unflattened parameter IDs mapping to the same flattened parameter ID.
+def _is_zero_dim_tensor(x: Any) -> bool:
+    return torch.is_tensor(x) and x.dim() == 0
 
-    Args:
-        flat_to_unflat_param_ids (Dict[int, List[int]]): A mapping from
-            flattened parameter ID to a :class:`list` of corresponding
-            unflattened parameter IDs.
 
-    Returns:
-        List[int]: A mapping from unflattened parameter ID to flattened
-        parameter ID, where the unflattened parameter ID is the index in the
-        :class:`list`.
+def _map_param_id_to_optim_keys(
+    optim_state_dict: Dict[str, Any],
+    group: Optional[dist.ProcessGroup],
+    param_id_to_param: List[nn.Parameter],
+    param_to_fqns: Dict[nn.Parameter, List[str]],
+) -> Tuple[Dict[int, _OptimStateKey], Dict[_OptimStateKey, int]]:
     """
-    # Construct as a dict and then convert to list
-    unflat_to_flat_param_ids = {}
-    for flat_param_id, unflat_param_ids in flat_to_unflat_param_ids.items():
-        for unflat_param_id in unflat_param_ids:
-            assert unflat_param_id not in unflat_to_flat_param_ids, (
-                "`flat_to_unflat_param_ids` has the unflattened parameter "
-                f"ID {unflat_param_id} mapped to multiple flattened "
-                "parameter IDs"
-            )
-            unflat_to_flat_param_ids[unflat_param_id] = flat_param_id
-    num_unflat_param_ids = len(unflat_to_flat_param_ids)
-    unflat_param_ids_set = set(unflat_to_flat_param_ids.keys())
-    assert unflat_param_ids_set == set(range(num_unflat_param_ids)), (
-        "The set of unflattened parameter IDs should be {0, ..., "
-        + str(num_unflat_param_ids - 1)
-        + "} but got "
-        + f"{unflat_param_ids_set}"
+    Construct the local mapping between the `_OptimStateKey` and parameter IDs
+    and broadcast rank 0's mapping. The return value will be only rank 0's
+    mapping.
+    """
+    rank = dist.get_rank(group)
+    optim_state_key_to_param_id: Dict[_OptimStateKey, int] = {}  # local
+    r0_param_id_to_optim_state_key: Dict[
+        int, _OptimStateKey
+    ] = {}  # rank 0
+
+    for param_id, param in enumerate(param_id_to_param):
+        # Do not include parameters without state to avoid empty mappings
+        # just like in normal `torch.optim.Optimizer.state_dict()`
+        if param_id not in optim_state_dict["state"]:
+            continue
+        optim_state_key = _OptimStateKey(
+            unflat_param_names=tuple(param_to_fqns[param]),
+            is_flat_param=isinstance(param, FlatParameter),
+        )
+        if rank == 0:
+            r0_param_id_to_optim_state_key[param_id] = optim_state_key
+        optim_state_key_to_param_id[optim_state_key] = param_id
+    key_obj_list: List[Optional[Dict[int, _OptimStateKey]]] = (
+        [r0_param_id_to_optim_state_key] if rank == 0 else [None]
     )
-    return [
-        unflat_to_flat_param_ids[unflat_param_id]
-        for unflat_param_id in range(num_unflat_param_ids)
-    ]
+    dist.broadcast_object_list(key_obj_list, src=0, group=group)
+    assert key_obj_list[0] is not None
+    r0_param_id_to_optim_state_key = key_obj_list[0]
 
+    # Ensure that all ranks have at least the optimizer states needed by
+    # rank 0's optimizer
+    missing_keys: List[_OptimStateKey] = []
+    for r0_optim_state_key in r0_param_id_to_optim_state_key.values():
+        if r0_optim_state_key not in optim_state_key_to_param_id:
+            # A parameter from rank 0's optimizer does not exist for this
+            # rank's optimizer
+            missing_keys.append(r0_optim_state_key)
+            continue
+        param_id = optim_state_key_to_param_id[r0_optim_state_key]
+        assert param_id >= 0 and param_id < len(
+            param_id_to_param
+        ), "Check the `param_id_to_param` construction"
+    device = torch.device("cuda", torch.cuda.current_device())
+    num_missing = torch.tensor([len(missing_keys)], dtype=torch.int32, device=device)
+    dist.all_reduce(num_missing, group=group)
+    if num_missing.item() > 0:
+        obj_list = [None for _ in range(dist.get_world_size(group))]
+        dist.all_gather_object(obj_list, missing_keys, group=group)
+        error_msg = (
+            "FSDP currently requires each rank to have at least the "
+            "optimizer states needed by rank 0's optimizer but some ranks "
+            "are missing some of those states"
+        )
+        for rank, keys in enumerate(obj_list):
+            keys = cast(List[_OptimStateKey], keys)
+            if len(keys) > 0:
+                error_msg += (
+                    f"\nRank {rank} is missing states for the parameters: "
+                    f"{[key.unflat_param_names for key in keys]}"
+                )
+        raise RuntimeError(error_msg)
 
-def _is_zero_dim_tensor(x: Any) -> bool:
-    return torch.is_tensor(x) and x.dim() == 0
+    return r0_param_id_to_optim_state_key, optim_state_key_to_param_id
+
+
+def _unflatten_process_groups(
+    state_dict: Dict[str, Any],
+    param_id_to_param: List[nn.Parameter],
+    param_to_fqns: Dict[nn.Parameter, List[str]],
+) -> List[Dict[str, Any]]:
+    param_groups: List[Dict[str, Any]] = []
+    for flat_param_group in state_dict["param_groups"]:
+        unflat_param_group = copy.deepcopy(flat_param_group)
+        param_group_params = [
+            param_id_to_param[flat_param_id]
+            for flat_param_id in flat_param_group["params"]
+        ]
+        nested_unflat_param_names = [
+            param_to_fqns[param] for param in param_group_params
+        ]
+        unflat_param_group["params"] = [
+            unflat_param_name
+            for unflat_param_names in nested_unflat_param_names
+            for unflat_param_name in unflat_param_names
+        ]  # flatten the list of lists
+        param_groups.append(unflat_param_group)
+    return param_groups
 
 
 def _optim_state_dict(
@@ -1158,125 +1210,62 @@ def _optim_state_dict(
         :meth:`torch.optim.Optimizer.state_dict`. If ``rank0_only=False``,
         then nonzero ranks return an empty :class:`dict`.
     """
-    osd = optim.state_dict()
-    osd_state, osd_param_groups = osd["state"], osd["param_groups"]
-    rank = dist.get_rank(group)
-    to_save = not rank0_only or (rank == 0 or shard_state)
+    optim_state_dict = optim.state_dict()
+    to_save = not rank0_only or (dist.get_rank(group) == 0 or shard_state)
     fsdp_osd: Dict = {"state": {}, "param_groups": []} if to_save else {}
     fsdp_osd_state = fsdp_osd["state"] if to_save else None
-
-    # Construct the local mapping between unflattened parameter names
-    # (`_OptimStateKey`s) and parameter IDs and broadcast rank 0's mapping
     param_to_fqns: Dict[torch.nn.Parameter, List[str]] = _get_param_to_fqns(model)
-    flat_param_id_to_param: List[torch.nn.Parameter] = (
+    param_id_to_param: List[torch.nn.Parameter] = (
         _get_param_id_to_param_from_optim_input(model, optim_input)
         if using_optim_input
         else _get_param_id_to_param(optim)
     )
-    optim_state_key_to_flat_param_id: Dict[_OptimStateKey, int] = {}  # local
-    r0_flat_param_id_to_optim_state_key: Dict[
-        int, _OptimStateKey
-    ] = collections.OrderedDict()  # rank 0
-    for flat_param_id, param in enumerate(flat_param_id_to_param):
-        # Do not include parameters without state to avoid empty mappings
-        # just like in normal `torch.optim.Optimizer.state_dict()`
-        if flat_param_id not in osd_state:
-            continue
-        optim_state_key = _OptimStateKey(
-            unflat_param_names=tuple(param_to_fqns[param]),
-            is_flat_param=isinstance(param, FlatParameter),
-        )
-        if rank == 0:
-            r0_flat_param_id_to_optim_state_key[flat_param_id] = optim_state_key
-        optim_state_key_to_flat_param_id[optim_state_key] = flat_param_id
-    key_obj_list: List[Optional[Dict[int, _OptimStateKey]]] = (
-        [r0_flat_param_id_to_optim_state_key] if rank == 0 else [None]
-    )
-    dist.broadcast_object_list(key_obj_list, src=0, group=group)
-    assert key_obj_list[0] is not None
-    r0_flat_param_id_to_optim_state_key = key_obj_list[0]
 
-    # Ensure that all ranks have at least the optimizer states needed by
-    # rank 0's optimizer
-    missing_keys: List[_OptimStateKey] = []
-    for r0_optim_state_key in r0_flat_param_id_to_optim_state_key.values():
-        if r0_optim_state_key not in optim_state_key_to_flat_param_id:
-            # A parameter from rank 0's optimizer does not exist for this
-            # rank's optimizer
-            missing_keys.append(r0_optim_state_key)
-            continue
-        flat_param_id = optim_state_key_to_flat_param_id[r0_optim_state_key]
-        assert flat_param_id >= 0 and flat_param_id < len(
-            flat_param_id_to_param
-        ), "Check the `flat_param_id_to_param` construction"
-    device = torch.device("cuda", torch.cuda.current_device())
-    num_missing = torch.tensor([len(missing_keys)], dtype=torch.int32, device=device)
-    dist.all_reduce(num_missing, group=group)
-    if num_missing.item() > 0:
-        obj_list = [None for _ in range(dist.get_world_size(group))]
-        dist.all_gather_object(obj_list, missing_keys, group=group)
-        error_msg = (
-            "FSDP currently requires each rank to have at least the "
-            "optimizer states needed by rank 0's optimizer but some ranks "
-            "are missing some of those states"
-        )
-        for rank, keys in enumerate(obj_list):
-            keys = cast(List[_OptimStateKey], keys)
-            if len(keys) > 0:
-                error_msg += (
-                    f"\nRank {rank} is missing states for the parameters: "
-                    f"{[key.unflat_param_names for key in keys]}"
-                )
-        raise RuntimeError(error_msg)
+    (
+        param_id_to_optim_state_key,
+        optim_state_key_to_param_id,
+    ) = _map_param_id_to_optim_keys(
+        optim_state_dict,
+        group,
+        param_id_to_param,
+        param_to_fqns,
+    )
+    flat_param_to_fsdp_state = _get_flat_param_to_fsdp_module(model)
 
     # Iterate in rank 0's flattened parameter ID order to ensure aligned
     # all-gathers across ranks
-    flat_param_to_fsdp_module = _get_flat_param_to_fsdp_module(model)
-    for r0_optim_state_key in r0_flat_param_id_to_optim_state_key.values():
-        flat_param_id = optim_state_key_to_flat_param_id[r0_optim_state_key]
-        param = flat_param_id_to_param[flat_param_id]
-        if r0_optim_state_key.is_flat_param:
-            fsdp_module = flat_param_to_fsdp_module[param]
+    for optim_state_key in param_id_to_optim_state_key.values():
+        param_id = optim_state_key_to_param_id[optim_state_key]
+        if optim_state_key.is_flat_param:
+            param = param_id_to_param[param_id]
+            fsdp_state = flat_param_to_fsdp_state[param]
             unflat_state = _unflatten_optim_state(
                 cast(FlatParameter, param),
-                osd_state[flat_param_id],
-                fsdp_module,
+                optim_state_dict["state"][param_id],
+                fsdp_state,
                 to_save,
                 shard_state,
             )
             if to_save:
-                assert len(unflat_state) == len(r0_optim_state_key.unflat_param_names)
+                assert len(unflat_state) == len(optim_state_key.unflat_param_names)
                 for unflat_param_name, unflat_param_state in zip(
-                    r0_optim_state_key.unflat_param_names,
+                    optim_state_key.unflat_param_names,
                     unflat_state,
                 ):
                     fsdp_osd_state[unflat_param_name] = unflat_param_state
         elif to_save:
-            assert len(r0_optim_state_key.unflat_param_names) == 1
-            unflat_param_name = r0_optim_state_key.unflat_param_names[0]
-            fsdp_osd_state[unflat_param_name] = copy.copy(osd_state[flat_param_id])
+            assert len(optim_state_key.unflat_param_names) == 1
+            unflat_param_name = optim_state_key.unflat_param_names[0]
+            fsdp_osd_state[unflat_param_name] = copy.copy(
+                optim_state_dict["state"][param_id]
+            )
             for state_name, value in sorted_items(fsdp_osd_state[unflat_param_name]):
                 if torch.is_tensor(value):
                     fsdp_osd_state[unflat_param_name][state_name] = value.cpu()
 
-    if not to_save:
-        return {}
+    if to_save:
+        fsdp_osd["param_groups"] = _unflatten_process_groups(
+            optim_state_dict, param_id_to_param, param_to_fqns
+        )
 
-    # Handle the "param_groups" part of the optimizer state dict
-    fsdp_osd_param_groups = fsdp_osd["param_groups"]  # alias
-    for flat_param_group in osd_param_groups:
-        unflat_param_group = copy.deepcopy(flat_param_group)
-        param_group_params = [
-            flat_param_id_to_param[flat_param_id]
-            for flat_param_id in flat_param_group["params"]
-        ]
-        nested_unflat_param_names = [
-            param_to_fqns[param] for param in param_group_params
-        ]
-        unflat_param_group["params"] = [
-            unflat_param_name
-            for unflat_param_names in nested_unflat_param_names
-            for unflat_param_name in unflat_param_names
-        ]  # flatten the list of lists
-        fsdp_osd_param_groups.append(unflat_param_group)
     return fsdp_osd

From 48a8a3c2977b8b60fa902fb36c268b3b0c738409 Mon Sep 17 00:00:00 2001
From: Charlie Yan <charlieyan@fb.com>
Date: Sat, 3 Dec 2022 04:00:17 +0000
Subject: [PATCH 1583/1922] [Composable API] `replicate`: add support for DDP
 args (#89243)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89243
Approved by: https://github.com/zhaojuanmao
---
 test/distributed/_composable/test_replicate.py |  7 +++++++
 torch/distributed/_composable/replicate.py     | 13 +++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
index db1459589b342..de9fbfdbbc376 100644
--- a/test/distributed/_composable/test_replicate.py
+++ b/test/distributed/_composable/test_replicate.py
@@ -102,6 +102,13 @@ def test_replicate_multi_module(self):
         replicate(replicate_model.fc3)
         self._compare_module(model, replicate_model)
 
+    def test_replicate_with_kwargs(self):
+        model = Net()
+        replicate_model = replicate(
+            deepcopy(model), bucket_cap_mb=1, gradient_as_bucket_view=True
+        )
+        self._compare_module(model, replicate_model)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/_composable/replicate.py b/torch/distributed/_composable/replicate.py
index c27a88d79b4d9..95aa8ee4c7d25 100644
--- a/torch/distributed/_composable/replicate.py
+++ b/torch/distributed/_composable/replicate.py
@@ -12,8 +12,9 @@ def __init__(self) -> None:
         self.modules: List[nn.Module] = []
         self.has_initialized: bool = False
         self._param_list: nn.ParameterList = nn.ParameterList()
+        self.kwargs: dict = {}
 
-    def mark_modules(self, *modules: nn.Module) -> None:
+    def mark_modules(self, *modules: nn.Module, **kwargs) -> None:
         for module in modules:
             self.modules.append(module)
             replicate.state(module)._distributed_state = self
@@ -21,6 +22,7 @@ def mark_modules(self, *modules: nn.Module) -> None:
             module.register_forward_pre_hook(self.forward_pre_hook)
             # TODO(@yhcharles): fix type error
             module.register_forward_hook(self.forward_post_hook)  # type: ignore[arg-type]
+        self.kwargs = kwargs
 
     def _recursive_collect_params(self, module: nn.Module) -> None:
         # TODO: skip if managed by other APIs
@@ -39,7 +41,7 @@ def _recursive_collect_params(self, module: nn.Module) -> None:
         for child in module.children():
             self._recursive_collect_params(child)
 
-    def init_helper(self):
+    def init_helper(self) -> None:
         if self.has_initialized:
             return
 
@@ -47,7 +49,9 @@ def init_helper(self):
         for module in self.modules:
             self._recursive_collect_params(module)
 
-        self._ddp = _ddp.DistributedDataParallel(self._param_list)
+        self._ddp = _ddp.DistributedDataParallel(
+            self._param_list, **self.kwargs
+        )
 
     def forward_pre_hook(
         self, module: nn.Module, input: Tuple[torch.Tensor]
@@ -67,6 +71,7 @@ def forward_post_hook(
 @contract
 def replicate(
     module: nn.Module,  # NOTE: contract now supports single module only
+    **kwargs,
 ) -> nn.Module:
     r"""Replicates module(s)
 
@@ -77,5 +82,5 @@ def replicate(
         >>> module = nn.Linear(3, 3)
         >>> replicate(module)
     """
-    _ReplicateState().mark_modules(module)
+    _ReplicateState().mark_modules(module, **kwargs)
     return module

From edd22741a91021f357f0b3d64da44cbc70938e2d Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Mon, 5 Dec 2022 16:41:46 +0000
Subject: [PATCH 1584/1922] Add a repro for fully_shard _unshard error (#90190)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90190
Approved by: https://github.com/awgu
---
 test/distributed/_composable/test_compose.py | 57 ++++++++++++++++++--
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
index 0b7282f6a3fa0..5343772673f62 100644
--- a/test/distributed/_composable/test_compose.py
+++ b/test/distributed/_composable/test_compose.py
@@ -58,12 +58,41 @@ def forward(self, x):
         return self.l2(self.u2(self.u1(self.l1(x))))
 
 
+class UnitParamModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l = nn.Linear(100, 100)
+        self.seq = nn.Sequential(
+            nn.ReLU(),
+            nn.Linear(100, 100),
+            nn.ReLU(),
+        )
+        self.p = nn.Parameter(torch.randn(100, 100))
+
+    def forward(self, x):
+        return torch.mm(self.seq(self.l(x)), self.p)
+
+
+class CompositeParamModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l = nn.Linear(100, 100)
+        self.u1 = UnitModule()
+        self.u2 = UnitModule()
+        self.p = nn.Parameter(torch.randn(100, 100))
+
+    def forward(self, x):
+        a = self.u2(self.u1(self.l(x)))
+        b = self.p
+        return torch.mm(a, b)
+
+
 class TestFSDPCheckpoint(FSDPTest):
     @property
     def world_size(self) -> int:
         return 2
 
-    def _test_wrap_same_submodule(
+    def _test_parity(
         self,
         base_model: nn.Module,
         test_model: nn.Module,
@@ -110,7 +139,7 @@ def test_wrap_same_submodule(self, use_reentrant: bool):
                 "x": [torch.randn(2, 100, device="cuda")],
                 "grad_to_none": [True, False],
             },
-            self._test_wrap_same_submodule,
+            self._test_parity,
         )
 
     def _test_checkpoint_fsdp_submodules(self, use_reentrant):
@@ -142,7 +171,7 @@ def _test_checkpoint_fsdp_submodules(self, use_reentrant):
                 "x": [torch.randn(2, 100, device="cuda")],
                 "grad_to_none": [True, False],
             },
-            self._test_wrap_same_submodule,
+            self._test_parity,
         )
 
     @skip_if_lt_x_gpu(2)
@@ -157,6 +186,28 @@ def test_checkpoint_fsdp_submodules_use_reentrant(self):
     def test_checkpoint_fsdp_submodules_non_reentrant(self):
         self._test_checkpoint_fsdp_submodules(False)
 
+    @skip_if_lt_x_gpu(2)
+    def test_checkpoint_fsdp_submodules_with_param(self):
+        model = CompositeParamModel().to(torch.device("cuda"))
+
+        base_model = copy.deepcopy(model)
+
+        test_model = copy.deepcopy(model)
+        test_model.u1.seq = checkpoint(test_model.u1.seq, use_reentrant=False)
+        test_model.u2.seq = checkpoint(test_model.u2.seq, use_reentrant=False)
+        test_model = fully_shard(test_model)
+
+        with self.assertRaisesRegex(RuntimeError, "mat2 must be a matrix"):
+            self.run_subtests(
+                {
+                    "base_model": [base_model],
+                    "test_model": [test_model],
+                    "x": [torch.randn(2, 100, device="cuda")],
+                    "grad_to_none": [True, False],
+                },
+                self._test_parity,
+            )
+
 
 instantiate_parametrized_tests(TestFSDPCheckpoint)
 

From ff79884d1aae2effb327e93570823706e22ededf Mon Sep 17 00:00:00 2001
From: Shen Li <cs.shenli@gmail.com>
Date: Mon, 5 Dec 2022 17:02:22 +0000
Subject: [PATCH 1585/1922] Add sharding strategy to fully_shard (#90192)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90192
Approved by: https://github.com/awgu, https://github.com/rohan-varma
---
 test/distributed/_composable/test_compose.py | 25 ++++++++++++++++++++
 torch/distributed/_composable/fully_shard.py |  3 ++-
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
index 5343772673f62..ae613ad1c8957 100644
--- a/test/distributed/_composable/test_compose.py
+++ b/test/distributed/_composable/test_compose.py
@@ -8,6 +8,7 @@
 import torch.nn as nn
 from torch.distributed._composable import checkpoint, fully_shard
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+from torch.distributed.fsdp.api import ShardingStrategy
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
@@ -208,6 +209,30 @@ def test_checkpoint_fsdp_submodules_with_param(self):
                 self._test_parity,
             )
 
+    @skip_if_lt_x_gpu(2)
+    def test_checkpoint_fsdp_submodules_with_param_no_shard(self):
+        model = CompositeParamModel().to(torch.device("cuda"))
+
+        base_model = copy.deepcopy(model)
+
+        test_model = copy.deepcopy(model)
+        test_model.u1.seq = checkpoint(test_model.u1.seq, use_reentrant=False)
+        test_model.u2.seq = checkpoint(test_model.u2.seq, use_reentrant=False)
+        test_model = fully_shard(test_model, strategy=ShardingStrategy.NO_SHARD)
+
+        with self.assertRaisesRegex(
+            RuntimeError, "Cannot writeback when the parameter shape changes"
+        ):
+            self.run_subtests(
+                {
+                    "base_model": [base_model],
+                    "test_model": [test_model],
+                    "x": [torch.randn(2, 100, device="cuda")],
+                    "grad_to_none": [True, False],
+                },
+                self._test_parity,
+            )
+
 
 instantiate_parametrized_tests(TestFSDPCheckpoint)
 
diff --git a/torch/distributed/_composable/fully_shard.py b/torch/distributed/_composable/fully_shard.py
index e3d36c3c87bbc..f145de1e20a36 100644
--- a/torch/distributed/_composable/fully_shard.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -34,6 +34,7 @@ def fully_shard(
     *,
     process_group: Optional[dist.ProcessGroup] = None,
     policy: Optional[_FSDPPolicy] = None,
+    strategy: Optional[ShardingStrategy] = None,
     mixed_precision: Optional[MixedPrecision] = None,
     cpu_offload: Optional[CPUOffload] = None,
     ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
@@ -56,7 +57,7 @@ def fully_shard(
     forward_prefetch_limit = 1
     state = _init_core_state(
         state,
-        ShardingStrategy.FULL_SHARD,
+        strategy or ShardingStrategy.FULL_SHARD,
         mixed_precision,
         cpu_offload,
         limit_all_gathers,

From e6792584430d83682d4999446650422bdd52f632 Mon Sep 17 00:00:00 2001
From: JackCaoG <jackcao@google.com>
Date: Mon, 5 Dec 2022 22:38:11 +0000
Subject: [PATCH 1586/1922] Fix XLA dynamo CI (#90229)

Fixes https://github.com/pytorch/xla/issues/4274

We should not access `subgraph` once it is deleted.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90229
Approved by: https://github.com/voznesenskym
---
 torch/_dynamo/optimizations/backends.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 7d7289ac67435..64056b59b2191 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -748,11 +748,11 @@ def torchxla_trace_once(subgraph):
     import torch._dynamo.optimizations.torchxla_integration as integration
 
     compiled_graph = None
+    model = subgraph.model
 
     def fwd(*args):
         nonlocal subgraph
         nonlocal compiled_graph
-        model = subgraph.model
         if compiled_graph is None:
             compiled_graph = integration.extract_compiled_graph(model, args)
             del subgraph

From 0ef985dbdc999716442f2d385c5900dffc065600 Mon Sep 17 00:00:00 2001
From: Atul Jangra <atuljangra@meta.com>
Date: Mon, 5 Dec 2022 23:40:08 +0000
Subject: [PATCH 1587/1922] [Caffe2] Fix the assert message (#89816)

Summary:
As title.
dev1/2 is invalid. It should be dev_1/2 instead

Test Plan: Sandcastle

Differential Revision: D41569982

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89816
Approved by: https://github.com/PaliC
---
 caffe2/python/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index 4ae75272d3820..dc9e74f87ad22 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -640,7 +640,7 @@ def AppendSparseGenerators(self, sparse_generators):
                     assert(g1 == g2)
                     assert dev_1 == dev_2, (
                         "Unequal devices for sparse generators: "
-                        "{} and {}".format(dev1, dev2)
+                        "{} and {}".format(dev_1, dev_2)
                     )
                     assert(op1_i is None or op2_i is None)
                     assert(op1_v is None or op2_v is None)

From e189105ed16eea4f148bd01c73dc262862b36409 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Fri, 2 Dec 2022 12:40:04 -0800
Subject: [PATCH 1588/1922] [pytorch][vulkan] realistic benchmark size for
 depthwise (#89948)

Update benchmark size to be bigger tensors to get mor realistic numbers

Differential Revision: [D41006643](https://our.internmc.facebook.com/intern/diff/D41006643/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89948
Approved by: https://github.com/digantdesai, https://github.com/salilsdesai
---
 aten/src/ATen/test/vulkan_perf_test.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/test/vulkan_perf_test.cpp b/aten/src/ATen/test/vulkan_perf_test.cpp
index 0c1c6b9cfe378..8f69be09209db 100644
--- a/aten/src/ATen/test/vulkan_perf_test.cpp
+++ b/aten/src/ATen/test/vulkan_perf_test.cpp
@@ -536,10 +536,10 @@ static void conv2ddw_op_benchmark(benchmark::State& state) {
   const auto batches_in = safe_downcast<uint32_t>(state.range(0));
   const auto height_in = safe_downcast<uint32_t>(state.range(2));
   const auto width_in = safe_downcast<uint32_t>(state.range(3));
-  constexpr int64_t groups = 7;
-  constexpr std::array<int64_t, 2u> stride{2, 3};
-  constexpr std::array<int64_t, 2u> padding{0, 4};
-  constexpr std::array<int64_t, 2u> dilation{3, 1};
+  constexpr int64_t groups = 32;
+  constexpr std::array<int64_t, 2u> stride{1, 1};
+  constexpr std::array<int64_t, 2u> padding{0, 0};
+  constexpr std::array<int64_t, 2u> dilation{1, 1};
 
   struct {
     uint32_t batches;
@@ -571,7 +571,7 @@ static void conv2ddw_op_benchmark(benchmark::State& state) {
           height,
       };
     }
-  } weights{groups, 1, 17, 7};
+  } weights{groups, 1, 3, 3};
 
   const auto input_cpu =
       at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
@@ -1053,7 +1053,7 @@ BENCHMARK(conv2ddw_op_benchmark)
     ->UseManualTime()
     ->Threads(1)
     ->Iterations(10)
-    ->Args({1, 7, 137, 199});
+    ->Args({1, 32, 256, 256});
 BENCHMARK(conv2ddw_op_q_benchmark)
     ->Apply(CommonBenchmarkSettings)
     ->UseManualTime()

From 9c935315df2c21356c947240ad38b25d8fb8a130 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Fri, 2 Dec 2022 12:40:08 -0800
Subject: [PATCH 1589/1922] [Vulkan] output benchmark numbers for aibench
 parsing (#89949)

Add this util so as to easily benchmark shaders and summarize the output.
Eventually the shader benchmarking should obsolete the need for this.

Differential Revision: [D41244028](https://our.internmc.facebook.com/intern/diff/D41244028/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D41244028/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89949
Approved by: https://github.com/digantdesai, https://github.com/salilsdesai
---
 aten/src/ATen/test/vulkan_perf_test.cpp | 40 +++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/aten/src/ATen/test/vulkan_perf_test.cpp b/aten/src/ATen/test/vulkan_perf_test.cpp
index 8f69be09209db..51cce68d4c4db 100644
--- a/aten/src/ATen/test/vulkan_perf_test.cpp
+++ b/aten/src/ATen/test/vulkan_perf_test.cpp
@@ -1,3 +1,4 @@
+#include <unordered_map>
 #ifdef USE_VULKAN_API
 
 #include <benchmark/benchmark.h>
@@ -10,8 +11,46 @@
 #include <ATen/native/vulkan/ops/QuantizedFunctions.h>
 #include <ATen/native/vulkan/ops/Utils.h>
 
+#include <iostream>
+
 namespace {
 
+namespace vulkan_api = at::native::vulkan::api;
+void report_pep(const std::string& name, const uint64_t duration) {
+  std::stringstream buffer;
+  buffer << "PyTorchObserver {\"type\": \"";
+  buffer << name << "\",";
+  buffer << "\"unit\": \""
+         << "ns"
+         << "\","
+         << "\"metric\": \""
+         << "latency"
+         << "\",";
+  buffer << "\"value\": \"" << duration << "\"";
+  buffer << "}\n";
+  std::cout << buffer.str();
+}
+
+void report_aibench_res(vulkan_api::QueryPool& qpool) {
+  std::unordered_map<std::string, uint64_t> shader_runtimes;
+  uint64_t num_additions = 0;
+  auto result_aggregator =
+      [&shader_runtimes, &num_additions](const vulkan_api::ShaderDuration& s) {
+        if (shader_runtimes.count(s.kernel_name) == 0) {
+          shader_runtimes[s.kernel_name] = 0;
+        }
+        shader_runtimes[s.kernel_name] += s.execution_duration_ns;
+        num_additions += 1;
+      };
+  qpool.shader_log_for_each(result_aggregator);
+  uint64_t num_iters = num_additions / shader_runtimes.size();
+  for (const auto& i : shader_runtimes) {
+    const auto& name = i.first;
+    const auto& duration = i.second / num_iters;
+    report_pep(name, duration);
+  }
+}
+
 at::Tensor vulkan_to_cpu(at::Tensor vulkan, at::Tensor in_cpu) {
   auto q_options = in_cpu.options();
   if (q_options.dtype().toScalarType() == c10::ScalarType::QUInt8) {
@@ -606,6 +645,7 @@ static void conv2ddw_op_benchmark(benchmark::State& state) {
 #if defined(USE_VULKAN_GPU_DIAGNOSTICS) && defined(__ANDROID__)
   at::native::vulkan::api::context()->querypool().extract_results();
   at::native::vulkan::api::context()->querypool().print_results();
+  report_aibench_res(vulkan_api::context()->querypool());
   state.SetIterationTime(at::native::vulkan::api::context()->querypool().get_total_op_ns("conv2d_dw") / 1000000.0);
 #endif
 }

From 902a771c577cad26809bb3d2b11e0937bc44d0b5 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Fri, 2 Dec 2022 12:40:09 -0800
Subject: [PATCH 1590/1922] [Pytorch][vulkan] Simplify depthwise conv to remove
 bounds compute (#89950)

Right now we are doing bounds check and reduce compute according to bounds
check. However this can lead to thread divergence.
Furthermore since textures provide handling of border region, it should be safe
to use negative indexing.

Differential Revision: [D41006645](https://our.internmc.facebook.com/intern/diff/D41006645/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89950
Approved by: https://github.com/salilsdesai
---
 aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl
index 89be0f3b69b21..ab2ce6459c67d 100644
--- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl
@@ -67,24 +67,22 @@ void main() {
 
   // Compute the start and end of the input indices to load. Padding is assumed
   // to be constant 0 padding, so any reads from the padding region is skipped.
-  const ivec2 start = max(ivec2(0), ipos);
-  const ivec2 end = min(ipos + uBlock.overlay_region.xy, uBlock.in_extents.xy);
-  // Compute the start of the kernel based on how far we are skipping ahead when
-  // reading the input
-  const ivec2 kstart = (start - ipos) / uBlock.dilate;
+  const ivec2 start = ipos;
+  const ivec2 end = ipos + uBlock.overlay_region.xy;
 
   vec4 sum = texelFetch(uBias, ivec2(pos.z, 0), 0);
   const int dil_y = uBlock.dilate.y;
   const int dil_x = uBlock.dilate.x;
-  for (int y = start.y, ky = kstart.y; y < end.y; y += dil_y, ky++) {
-    for (int x = start.x, kx = kstart.x; x < end.x; x += dil_x, kx++) {
+  int k_ind = 0;
+  for (int y = start.y; y < end.y; y += dil_y) {
+    for (int x = start.x; x < end.x; x += dil_x) {
       // The weight kernel was rearranged so that every NxN filter was flattened
       // so that it fits on one row. Each filter was then stacked on top of each
       // other vertically.
-      const int k_ind = kx + ky * uBlock.kernel_size.x;
       const vec4 k_tex = texelFetch(uKernel, ivec2(k_ind, pos.z), 0);
       const vec4 i_tex = texelFetch(uInput, ivec3(x, y, pos.z), 0);
       sum = fma(i_tex, k_tex, sum);
+      k_ind++;
     }
   }
 

From 921e60b411526d8860581919bdcf788245a98ed3 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Fri, 2 Dec 2022 12:40:11 -0800
Subject: [PATCH 1591/1922] [Pytorch][Vulkan] shader codegen use ordered
 dictionary (#89951)

When not using ordered dictionary, it can result in parameter values have
different order for each specialization. This can result shader names which are
not consistent in their naming and meaning of the template parameter values
that appear in the meaning of their names.
For example if you have:
conv2d_pw:
  default_values:
   - X: 1
   - Y: 2
  parameter_values:
   - Y: 3

Default parameter value can generate shader with 'my_shader_1x2' where 1x2 is
for X, Y parameters respectively. Then,
for non default values, of which there is only 1, we have Y=3 and with existing
implementation you can end up genreating shader with 'my_shader_3x1'. Here 3 is
for Y and 1 is for X. This leads to confusing shader names.

THis diff fixes this by
1. using ordered dict.
2. non default values are updated by first copying default values and then
updating them.

Differential Revision: [D41006639](https://our.internmc.facebook.com/intern/diff/D41006639/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89951
Approved by: https://github.com/salilsdesai
---
 tools/gen_vulkan_glsl.py          | 12 ++++++------
 tools/test/test_vulkan_codegen.py |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/gen_vulkan_glsl.py b/tools/gen_vulkan_glsl.py
index bf6f16dff25fb..7e101545e097c 100644
--- a/tools/gen_vulkan_glsl.py
+++ b/tools/gen_vulkan_glsl.py
@@ -1,8 +1,9 @@
 import copy
 import os
 
-import yaml
+from collections import OrderedDict
 
+import yaml
 from torchgen.code_template import CodeTemplate
 from yaml.constructor import ConstructorError
 from yaml.nodes import MappingNode
@@ -59,7 +60,7 @@ def __init__(self):  # type: ignore[no-untyped-def]
         self.ops_template_params = {}
 
     def add_params_yaml(self, parameters_yaml_file):  # type: ignore[no-untyped-def]
-        all_template_params = {}
+        all_template_params = OrderedDict()
         with open(parameters_yaml_file, "r") as f:
             contents = yaml.load(f, Loader=UniqueKeyLoader)
             for key in contents:
@@ -79,13 +80,12 @@ def validate_and_construct_op_params(self, all_template_params):  # type: ignore
             op_template_params_values = all_template_params[op]["parameter_values"]
             for param_vals in op_template_params_values:
                 param_vals_set = set(param_vals.keys())
-                missing_keys = template_params_set - param_vals_set
                 invalid_keys = param_vals_set - template_params_set
                 if (len(invalid_keys)) > 0:
                     raise KeyError(f"Invalid keys {invalid_keys} are found")
-                param_vals_copy = copy.deepcopy(param_vals)
-                for key in missing_keys:
-                    param_vals_copy[key] = op_params_default_vals[key]
+                param_vals_copy = copy.deepcopy(op_params_default_vals)
+                for key in param_vals:
+                    param_vals_copy[key] = param_vals[key]
                 self.ops_template_params[op].append(param_vals_copy)
 
     def generate(self, glsl_template_in, out_dir):  # type: ignore[no-untyped-def]
diff --git a/tools/test/test_vulkan_codegen.py b/tools/test/test_vulkan_codegen.py
index 26ccc66425790..8b0b4b3a13cde 100644
--- a/tools/test/test_vulkan_codegen.py
+++ b/tools/test/test_vulkan_codegen.py
@@ -71,7 +71,7 @@ def test_missing_key_default_val(self) -> None:
       TILE_SIZE_X: 1
       TILE_SIZE_Y: 1
   parameter_values:
-    - TILE_SIZE_X: 2
+    - TILE_SIZE_Y: 2
 """
         file_content = """
 x = $TILE_SIZE_X + $TILE_SIZE_Y
@@ -89,7 +89,7 @@ def test_missing_key_default_val(self) -> None:
                     template_file.flush()
                     generator.generate(template_file.name, tmp_dir)  # type: ignore[no-untyped-call]
                     file_name_1 = os.path.join(tmp_dir, "conv2d_pw_1x1.glsl")
-                    file_name_2 = os.path.join(tmp_dir, "conv2d_pw_2x1.glsl")
+                    file_name_2 = os.path.join(tmp_dir, "conv2d_pw_1x2.glsl")
                     self.assertTrue(os.path.exists(file_name_1))
                     self.assertTrue(os.path.exists(file_name_2))
                     with open(file_name_1, "r") as f:
@@ -97,4 +97,4 @@ def test_missing_key_default_val(self) -> None:
                         self.assertTrue("1 + 1" in contents)
                     with open(file_name_2, "r") as f:
                         contents = f.read()
-                        self.assertTrue("2 + 1" in contents)
+                        self.assertTrue("1 + 2" in contents)

From 764dfb5ac821c01f4ecd67fc4f238610aae1d303 Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Fri, 2 Dec 2022 12:40:13 -0800
Subject: [PATCH 1592/1922] [Pytorch][Vulkan] Templatize depth wise convolution
 and specialize for 3x3 and 5x5 (#89952)

5x5

This diff does not yet integrate with the runtime.

Differential Revision: [D41006640](https://our.internmc.facebook.com/intern/diff/D41006640/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89952
Approved by: https://github.com/salilsdesai
---
 .../vulkan/glsl/templates/conv2d_dw.glslt     | 88 +++++++++++++++++++
 .../glsl/templates/conv2d_dw_params.yaml      |  7 ++
 2 files changed, 95 insertions(+)
 create mode 100644 aten/src/ATen/native/vulkan/glsl/templates/conv2d_dw.glslt
 create mode 100644 aten/src/ATen/native/vulkan/glsl/templates/conv2d_dw_params.yaml

diff --git a/aten/src/ATen/native/vulkan/glsl/templates/conv2d_dw.glslt b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_dw.glslt
new file mode 100644
index 0000000000000..3afbefa2be492
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_dw.glslt
@@ -0,0 +1,88 @@
+/*
+ * KERNEL_SIZE = ($KERNEL_SIZE_X, $KERNEL_SIZE_Y)
+ * TILE_SIZE = (1, 1, 1)
+ * WEIGHT_STORAGE = TEXTURE_2D
+ * BIAS_STORAGE = TEXTURE_2D
+ * Note that for DW kernel IC = 1 so the weight layout is really OC4, H, W, 4oc
+ */
+
+layout(std430) buffer;
+
+/*
+ * Output Image
+ */
+layout(set = 0, binding = 0, FORMAT) uniform PRECISION restrict writeonly image3D uOut;
+
+/*
+ * Input Textures
+ */
+layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
+layout(set = 0, binding = 2) uniform PRECISION sampler2D uKernel;
+layout(set = 0, binding = 3) uniform PRECISION sampler2D uBias;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 4) uniform PRECISION restrict Block {
+  // extents of the output texture
+  ivec4 out_extents;
+  // extents of the input texture
+  ivec4 in_extents;
+  // size of the overlay region of the kernel
+  ivec4 overlay_region;
+  // width and height of the kernel
+  ivec2 kernel_size;
+  // convolution parameters
+  ivec2 stride;
+  ivec2 padding;
+  ivec2 dilate;
+  vec2 clamp_thresh;
+}
+uBlock;
+
+/*
+ * Local Work Group
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Computes depthwise convolution. Each shader invocation calculates the output
+ * of a single output location.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // Return if this global position is outside output texture bounds
+  if (any(greaterThanEqual(pos, uBlock.out_extents.xyz))) {
+    return;
+  }
+
+  // Compute the index of the top-left element of the overlay region. Note that
+  // negative indices can be produced indicating that the top-left element is in
+  // a region added by padding.
+  const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding;
+
+  // Compute the start and end of the input indices to load. Padding is assumed
+  // to be constant 0 padding, so any reads from the padding region is skipped.
+  const ivec2 start = ipos;
+  const ivec2 end = ipos + uBlock.overlay_region.xy;
+
+  vec4 sum = texelFetch(uBias, ivec2(pos.z, 0), 0);
+  const int dil_y = uBlock.dilate.y;
+  const int dil_x = uBlock.dilate.x;
+  int k_ind = 0;
+  for (int y = start.y, i = 0; i < $KERNEL_SIZE_Y; y += dil_y, i++) {
+    for (int x = start.x, j = 0; j < $KERNEL_SIZE_X; x += dil_x, j++) {
+      // The weight kernel was rearranged so that every NxN filter was flattened
+      // so that it fits on one row. Each filter was then stacked on top of each
+      // other vertically.
+      const vec4 kernel_vals = texelFetch(uKernel, ivec2(k_ind, pos.z), 0);
+      const vec4 i_tex = texelFetch(uInput, ivec3(x, y, pos.z), 0);
+      sum = fma(i_tex, kernel_vals, sum);
+      k_ind++;
+    }
+  }
+
+  imageStore(
+      uOut, pos, clamp(sum, uBlock.clamp_thresh.x, uBlock.clamp_thresh.y));
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/templates/conv2d_dw_params.yaml b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_dw_params.yaml
new file mode 100644
index 0000000000000..baddf153cb7bc
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/templates/conv2d_dw_params.yaml
@@ -0,0 +1,7 @@
+conv2d_dw:
+  parameter_names_with_default_values:
+      KERNEL_SIZE_X: 3
+      KERNEL_SIZE_Y: 3
+  parameter_values:
+    - KERNEL_SIZE_X: 5
+      KERNEL_SIZE_Y: 5

From 57cb3b192878f2be70fad66088f51219f03b9f2a Mon Sep 17 00:00:00 2001
From: Kimish Patel <kimishpatel@fb.com>
Date: Fri, 2 Dec 2022 12:40:14 -0800
Subject: [PATCH 1593/1922] [Pytorch][Vulkan] Use specalized shader for 3x3
 depthwise conv (#89953)

This diff uses specialized implementation for 3x3 and 5x5 dw conv.

Differential Revision: [D41006638](https://our.internmc.facebook.com/intern/diff/D41006638/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89953
Approved by: https://github.com/salilsdesai, https://github.com/kirklandsign
---
 .../ATen/native/vulkan/ops/Convolution.cpp    |  10 ++
 aten/src/ATen/test/vulkan_api_test.cpp        | 130 ++++++++++++++++++
 2 files changed, 140 insertions(+)

diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
index 8a8f63424df05..05557a62dda8c 100644
--- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
@@ -320,6 +320,16 @@ static api::ShaderSource get_shader(
       break;
     case Conv2dDepthwise:
       shader = VK_SHADER(conv2d_dw);
+      if (kernel_size.size() == 4 && kernel_size[2] == 3 &&
+          kernel_size[3] == 3) {
+        // 1x1 refers to the output tile size
+        shader = VK_SHADER(conv2d_dw_3x3);
+      }
+      if (kernel_size.size() == 4 && kernel_size[2] == 5 &&
+          kernel_size[3] == 5) {
+        // 1x1 refers to the output tile size
+        shader = VK_SHADER(conv2d_dw_5x5);
+      }
       break;
     case Conv2dPointwise:
       shader = VK_SHADER(conv2d_pw_2x2);
diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index 6870ec4e049f0..39edf4ae3a8c2 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -1063,6 +1063,136 @@ TEST_F(VulkanAPITest, conv2d_prepack_bc) {
     1);           // groups
 }
 
+TEST_F(VulkanAPITest, conv2d_dw_3x3) {
+  constexpr int64_t groups = 7;
+  constexpr std::array<int64_t, 2u> stride{2, 3};
+  constexpr std::array<int64_t, 2u> padding{0, 4};
+  constexpr std::array<int64_t, 2u> dilation{3, 1};
+
+  constexpr struct {
+    uint32_t batches;
+    uint32_t channels;
+    uint32_t width;
+    uint32_t height;
+
+    std::array<int64_t, 4u> size() const {
+      return {
+          batches,
+          channels,
+          width,
+          height,
+      };
+    }
+  } input{1, groups, 137, 199};
+
+  constexpr struct {
+    uint32_t output_channels;
+    uint32_t input_channels;
+    uint32_t width;
+    uint32_t height;
+
+    std::array<int64_t, 4u> size() const {
+      return {
+          output_channels,
+          input_channels,
+          width,
+          height,
+      };
+    }
+  } weights{groups, 1, 3, 3};
+
+  const auto input_cpu =
+      at::rand(input.size(), at::device(at::kCPU).dtype(at::kFloat));
+  const auto weights_cpu =
+      at::rand(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
+  const auto bias_cpu = at::rand(
+      {weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
+
+  const auto output_cpu = at::conv2d(
+      input_cpu, weights_cpu, bias_cpu, stride, padding, dilation, groups);
+
+  const auto output_vulkan = at::conv2d(
+      input_cpu.vulkan(),
+      weights_cpu,
+      bias_cpu,
+      stride,
+      padding,
+      dilation,
+      groups);
+
+  const bool check = almostEqual(output_cpu, output_vulkan.cpu());
+  if (!check) {
+    showRtol(output_cpu, output_vulkan.cpu());
+  }
+
+  ASSERT_TRUE(check);
+}
+
+TEST_F(VulkanAPITest, conv2d_dw_5x5) {
+  constexpr int64_t groups = 7;
+  constexpr std::array<int64_t, 2u> stride{2, 3};
+  constexpr std::array<int64_t, 2u> padding{0, 4};
+  constexpr std::array<int64_t, 2u> dilation{3, 1};
+
+  constexpr struct {
+    uint32_t batches;
+    uint32_t channels;
+    uint32_t width;
+    uint32_t height;
+
+    std::array<int64_t, 4u> size() const {
+      return {
+          batches,
+          channels,
+          width,
+          height,
+      };
+    }
+  } input{1, groups, 137, 199};
+
+  constexpr struct {
+    uint32_t output_channels;
+    uint32_t input_channels;
+    uint32_t width;
+    uint32_t height;
+
+    std::array<int64_t, 4u> size() const {
+      return {
+          output_channels,
+          input_channels,
+          width,
+          height,
+      };
+    }
+  } weights{groups, 1, 5, 5};
+
+  const auto input_cpu =
+      at::rand(input.size(), at::device(at::kCPU).dtype(at::kFloat));
+  const auto weights_cpu =
+      at::rand(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
+  const auto bias_cpu = at::rand(
+      {weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));
+
+  const auto output_cpu = at::conv2d(
+      input_cpu, weights_cpu, bias_cpu, stride, padding, dilation, groups);
+
+  const auto output_vulkan = at::conv2d(
+      input_cpu.vulkan(),
+      weights_cpu,
+      bias_cpu,
+      stride,
+      padding,
+      dilation,
+      groups);
+
+  const bool check = almostEqual(output_cpu, output_vulkan.cpu());
+  if (!check) {
+    showRtol(output_cpu, output_vulkan.cpu());
+  }
+
+  ASSERT_TRUE(check);
+}
+
 TEST_F(VulkanAPITest, conv2d_dw) {
   constexpr int64_t groups = 7;
   constexpr std::array<int64_t, 2u> stride{2, 3};

From 74d151970207fa5a3054bd66b0a51e53c2fb49dd Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 6 Dec 2022 01:35:19 +0000
Subject: [PATCH 1594/1922] Fix yet another C++17 Windows build issue (#90228)

Not sure why, but top-level `using namespace` directive causes VC++ fail with (if C++17 standard is used, but everything is fine with C++14):
```
C:\actions-runner\_work\pytorch\pytorch\third_party\pybind11\include\pybind11\detail\../pytypes.h(1520): error C2872: 'attr': ambiguous symbol
C:\actions-runner\_work\pytorch\pytorch\aten\src\ATen/core/interned_strings.h(349): note: could be 'c10::attr'
C:\actions-runner\_work\pytorch\pytorch\torch/csrc/jit/ir/ir.h(75): note: or       'torch::jit::attr'
C:\actions-runner\_work\pytorch\pytorch\cmake\..\third_party\pybind11\include\pybind11/pybind11.h(1094): note: see reference to function template instantiation 'pybind11::str pybind11::str::format<_Ty1&>(_Ty1 &) const' being compiled
        with
        [
            _Ty1=pybind11::handle
        ]
```

Solve this by replacing global `using namespace torch::jit;` with
specific usages of objects/methods from namespaces

Another prep change for https://github.com/pytorch/pytorch/70188

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90228
Approved by: https://github.com/kit1980, https://github.com/albanD
---
 torch/csrc/autograd/python_function.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp
index 39374f7f82978..c86bfbda2a7b0 100644
--- a/torch/csrc/autograd/python_function.cpp
+++ b/torch/csrc/autograd/python_function.cpp
@@ -40,7 +40,6 @@
 
 using namespace torch;
 using namespace torch::autograd;
-using namespace torch::jit;
 using at::Tensor;
 
 PyObject* THPFunctionClass = nullptr;
@@ -600,6 +599,7 @@ static void _append_subgraph(
     torch::jit::Graph* graph,
     std::vector<torch::jit::Value*> trace_outputs,
     bool unpack_output) {
+  using Value = torch::jit::Value;
   node->g_(
       torch::jit::attr::Subgraph,
       std::make_shared<torch::jit::Graph>(graph->current_scope()));
@@ -692,8 +692,8 @@ static void _trace_post_record(
   node->addOutput();
   auto old_node = node;
   if (!unpack_output) {
-    std::vector<TypePtr> tuple_values(num_outputs, TensorType::get());
-    TypePtr tuple_type = TupleType::create(std::move(tuple_values));
+    std::vector<at::TypePtr> tuple_values(num_outputs, at::TensorType::get());
+    auto tuple_type = at::TupleType::create(std::move(tuple_values));
     // Original type is tuple of tensors "without" element type and shape.
     // The missed parts will be added below.
     node->output()->setType(tuple_type);
@@ -705,7 +705,7 @@ static void _trace_post_record(
   for (const auto i : c10::irange(num_outputs)) {
     PyObject* obj = PyTuple_GET_ITEM(output_objects, i);
     if (THPVariable_Check(obj)) {
-      Value* value = node->outputs()[i];
+      auto value = node->outputs()[i];
       const auto& tensor = THPVariable_Unpack(obj);
       if (tensor.defined()) {
         value->inferTypeFrom(tensor);
@@ -723,12 +723,12 @@ static void _trace_post_record(
   // If TupleUnpack operator is created, we copy its output type back
   // to the original tuple type.
   if (!unpack_output) {
-    std::vector<TypePtr> new_tuple_values;
+    std::vector<at::TypePtr> new_tuple_values;
     for (const auto i : c10::irange(num_outputs)) {
-      TypePtr ptr = node->outputs()[i]->type();
+      auto ptr = node->outputs()[i]->type();
       new_tuple_values.push_back(ptr);
     }
-    TypePtr tuple_type = TupleType::create(std::move(new_tuple_values));
+    auto tuple_type = at::TupleType::create(std::move(new_tuple_values));
     // The i-th tuple element receives a new tensor type with element type and
     // shape.
     old_node->output()->setType(tuple_type);

From adfba5d0a771c692b9b2dd78eb8dd8cceb4ee567 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Tue, 6 Dec 2022 02:22:16 +0000
Subject: [PATCH 1595/1922] Dynamo FX graph stack traceback fix (#87136)

Migration from https://github.com/pytorch/torchdynamo/pull/1655.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87136
Approved by: https://github.com/voznesenskym
---
 torch/_dynamo/config.py              |  4 ++++
 torch/_dynamo/output_graph.py        | 23 +++++++++++++++----
 torch/_dynamo/symbolic_convert.py    | 25 ++++++++------------
 torch/_dynamo/variables/builtin.py   |  6 ++---
 torch/_dynamo/variables/lists.py     |  1 -
 torch/_dynamo/variables/misc.py      |  2 --
 torch/_dynamo/variables/nn_module.py |  2 --
 torch/_dynamo/variables/tensor.py    | 34 ++++++++++++++--------------
 torch/_dynamo/variables/torch.py     |  7 ------
 9 files changed, 53 insertions(+), 51 deletions(-)

diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 89e13bd484242..bd3f29cb3f378 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -33,6 +33,10 @@
 # Verbose will print full stack traces on warnings and errors
 verbose = False
 
+# If true, traced graph outputs will be outputted as Python GraphModule code.
+# If false, traced graph outputs will be outputted in tabular form.
+output_graph_code = False
+
 # verify the correctness of optimized backend
 verify_correctness = False
 
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 8929c1afd2fa1..2dabef444cb89 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -161,6 +161,7 @@ def __init__(
         self.compiler_fn: CompilerFn = compiler_fn
         self.root_globals = f_globals
         self.root_tx = root_tx
+        self._current_tx = []
         self.cleanups: List[CleanupHook] = []
         self.should_exit = False
         self.random_values_var = None
@@ -186,6 +187,16 @@ def output(self):
     def fake_mode(self):
         return self.root_tx.fake_mode
 
+    def push_tx(self, tx):
+        self._current_tx.append(tx)
+
+    def pop_tx(self):
+        return self._current_tx.pop()
+
+    @property
+    def current_tx(self):
+        return self.root_tx if not self._current_tx else self._current_tx[-1]
+
     def copy_graphstate(self):
         """Create a checkpoint of the current state by copying everything"""
         assert self.nn_modules is not None
@@ -504,10 +515,15 @@ def compile_and_call_fx_graph(self, tx, rv, root):
 
         try:
             # the call to tabulate can cause a lot of memory to be allocated
-            if config.log_level <= logging.INFO:
+            if config.log_level <= logging.CODE:
+                graph_str = (
+                    gm.print_readable()
+                    if config.output_graph_code
+                    else format_graph_tabular(gm.graph)
+                )
                 log.log(
                     logging.CODE,  # type: ignore[attr-defined]
-                    f"TRACED GRAPH\n {name} {gm.forward.__code__.co_filename} {format_graph_tabular(gm.graph)}\n",
+                    f"TRACED GRAPH\n {name} {gm.forward.__code__.co_filename} {graph_str}\n",
                 )
         except ImportError:
             log.warning(
@@ -659,14 +675,13 @@ def create_proxy(
         name=None,
         type_expr=None,
         proxy_factory_fn=None,
-        current_tx=None,
     ):
         rv = super().create_proxy(
             kind, target, args, kwargs, name, type_expr, proxy_factory_fn
         )
 
         # append stack trace to fx node
-        tx = current_tx if current_tx else self.root_tx
+        tx = self.current_tx
 
         nn_module_stack = tx.nn_module_stack
         if nn_module_stack:
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index c594533e8b9a5..16935d0de2661 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -208,7 +208,6 @@ def inner(self: "InstructionTranslatorBase", inst: Instruction):
                 "call_function",
                 torch._assert,
                 *proxy_args_kwargs((value, error_msg), {}),
-                current_tx=self,
             )
             self.jump(inst)
             return
@@ -478,6 +477,7 @@ def step(self):
 
     def run(self):
         try:
+            self.output.push_tx(self)
             while (
                 self.instruction_pointer is not None
                 and not self.output.should_exit
@@ -491,6 +491,7 @@ def run(self):
                 e.exec_record = self.exec_recorder.get_record()
             raise
         finally:
+            self.output.pop_tx()
             # Cleanup the outputGraph to delete the held tensors. We perform the
             # cleanup only for InstructionTranslator and not
             # InliningInstructionTranslator. The InliningInstructionTranslator
@@ -1168,30 +1169,24 @@ def MAKE_FUNCTION(self, inst):
         )
 
     def UNPACK_SEQUENCE(self, inst):
-        # TODO(jansel): rewrite this using unpack_var_sequence
         seq = self.pop()
-        options = VariableTracker.propagate([seq])
         if isinstance(seq, BaseListVariable):
-            assert len(seq.items) == inst.argval
             self.output.guards.update(seq.guards)
-            for i in reversed(seq.items):
-                self.push(i)
+            val = seq.unpack_var_sequence(self)
         elif seq.is_python_constant() and isinstance(seq, ConstantVariable):
-            val = seq.as_python_constant()
-            assert len(val) == inst.argval
-            for i in reversed(val):
-                self.push(ConstantVariable(i, **options))
+            val = seq.unpack_var_sequence(self)
         elif isinstance(seq, TensorVariable):
-            proxy = seq.as_proxy()
-            for i in reversed(range(inst.argval)):
-                self.push(wrap_fx_proxy(self, proxy[i], **options))
+            val = seq.unpack_var_sequence(self, idxes=range(inst.argval))
         elif isinstance(seq, GetAttrVariable) and isinstance(seq.obj, TensorVariable):
             # x, y = a.shape
             proxy = getattr(seq.obj.as_proxy(), seq.name)
-            for i in reversed(range(inst.argval)):
-                self.push(wrap_fx_proxy(self, proxy[i], **options))
+            options = VariableTracker.propagate(self)
+            val = [wrap_fx_proxy(self, proxy[i], **options) for i in range(inst.argval)]
         else:
             unimplemented(f"UNPACK_SEQUENCE {seq}")
+        assert len(val) == inst.argval
+        for i in reversed(val):
+            self.push(i)
 
     def UNPACK_EX(self, inst):
         assert 0 <= inst.argval <= 0xFFFF
diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py
index 369b9364a4163..127c0359d56f3 100644
--- a/torch/_dynamo/variables/builtin.py
+++ b/torch/_dynamo/variables/builtin.py
@@ -273,7 +273,9 @@ def call_function(
                     fn, args = operator.add, [args[1], args[0]]
 
                 proxy = tx.output.create_proxy(
-                    "call_function", fn, *proxy_args_kwargs(args, kwargs), current_tx=tx
+                    "call_function",
+                    fn,
+                    *proxy_args_kwargs(args, kwargs),
                 )
                 if any([isinstance(arg, FakeItemVariable) for arg in args]):
                     return wrap_fx_proxy_cls(
@@ -333,7 +335,6 @@ def call_function(
                     fn_,
                     (args[0].as_proxy(),),
                     {},
-                    current_tx=tx,
                 ),
                 **options,
             )
@@ -393,7 +394,6 @@ def _call_min_max(self, tx, a, b):
                         "call_function",
                         self.fn,
                         *proxy_args_kwargs([a, b], {}),
-                        current_tx=tx,
                     ),
                     **VariableTracker.propagate(self, [a, b]),
                 )
diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py
index 82dc0424820b8..82a7d79a1c367 100644
--- a/torch/_dynamo/variables/lists.py
+++ b/torch/_dynamo/variables/lists.py
@@ -373,7 +373,6 @@ def _dynamo_get_item_lambda(target, index):
                 "call_function",
                 _dynamo_get_item_lambda,
                 *proxy_args_kwargs([self, arg], {}),
-                current_tx=tx,
             )
             items = self.items[index]
 
diff --git a/torch/_dynamo/variables/misc.py b/torch/_dynamo/variables/misc.py
index 8cba35aaaa090..58db779178f53 100644
--- a/torch/_dynamo/variables/misc.py
+++ b/torch/_dynamo/variables/misc.py
@@ -571,7 +571,6 @@ def call_function(
                             "call_function",
                             original_torch_or_getattr_variable.value,
                             *proxy_args_kwargs(new_args, new_kwargs),
-                            current_tx=tx,
                         ),
                         **options,
                     )
@@ -582,7 +581,6 @@ def call_function(
                             "call_method",
                             original_torch_or_getattr_variable.name,
                             *proxy_args_kwargs(new_args, new_kwargs),
-                            current_tx=tx,
                         ),
                         **options,
                     )
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 454daae1d1f63..27dd9d6c067d8 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -204,7 +204,6 @@ def record_nn_module_stack():
                         "call_module",
                         self.module_key,
                         *proxy_args_kwargs(args, kwargs),
-                        current_tx=tx,
                     ),
                     **options,
                 )
@@ -469,7 +468,6 @@ def make_attr(name):
                     name,
                     args=(proxy_for_mod, *proxy_args),
                     kwargs=proxy_kwargs,
-                    current_tx=tx,
                 ),
                 **options,
             )
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index edf7d16745734..d9973ef3b87fa 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -170,17 +170,16 @@ def var_getattr(self, tx, name):
 
         return result
 
-    def unpack_var_sequence(self, tx):
-        options = VariableTracker.propagate(self)
-        if self.size:
-            return [
-                variables.BuiltinVariable(operator.getitem, **options).call_function(
-                    tx, [self, variables.ConstantVariable(i)], {}
-                )
-                for i in range(self.size[0])
-            ]
+    def unpack_var_sequence(self, tx, idxes=None):
+        from .builder import wrap_fx_proxy
 
-        return super(TensorVariable, self).unpack_var_sequence(tx)
+        if idxes is None:
+            if self.size:
+                idxes = range(self.size[0])
+            else:
+                return super(TensorVariable, self).unpack_var_sequence(tx)
+        options = VariableTracker.propagate(self)
+        return [wrap_fx_proxy(tx, self.as_proxy()[i], **options) for i in idxes]
 
     def call_method(
         self,
@@ -206,7 +205,6 @@ def call_method(
                     "call_method",
                     name,
                     *proxy_args_kwargs([self] + list(args), kwargs),
-                    current_tx=tx,
                 ),
                 **options,
             )
@@ -274,7 +272,10 @@ def call_method(
                 return wrap_fx_proxy(
                     tx,
                     tx.output.create_proxy(
-                        "call_method", "item", (self.as_proxy(),), {}, current_tx=tx
+                        "call_method",
+                        "item",
+                        (self.as_proxy(),),
+                        {},
                     ),
                     example_value=example_value,
                     **options,
@@ -289,7 +290,10 @@ def call_method(
                 return wrap_fx_proxy(
                     tx,
                     tx.output.create_proxy(
-                        "call_function", len, (self.as_proxy(),), {}, current_tx=tx
+                        "call_function",
+                        len,
+                        (self.as_proxy(),),
+                        {},
                     ),
                     **options,
                 )
@@ -299,7 +303,6 @@ def call_method(
                 "call_function",
                 operator.setitem,
                 *proxy_args_kwargs([self] + list(args), kwargs),
-                current_tx=tx,
             )
             return ConstantVariable(None, **options)
         elif name in ("resize_", "resize_as_"):
@@ -331,7 +334,6 @@ def call_method(
                     "call_method",
                     name,
                     *proxy_args_kwargs([self] + list(args), kwargs),
-                    current_tx=tx,
                 ),
                 **options,
             )
@@ -351,7 +353,6 @@ def call_method(
                     "call_method",
                     name,
                     *proxy_args_kwargs([self] + list(args), kwargs),
-                    current_tx=tx,
                 ),
                 **options,
             )
@@ -407,7 +408,6 @@ def call_method(
                 "call_method",
                 name,
                 *proxy_args_kwargs([self] + list(args), kwargs),
-                current_tx=tx,
             ),
             **options,
         )
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index d737e460304ff..2fad73062cf71 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -324,7 +324,6 @@ def get_state_from_generator():
                     "call_function",
                     get_state_from_generator,
                     *proxy_args_kwargs(args, kwargs),
-                    current_tx=tx,
                 ),
                 example_value=self.value(),
                 **options,
@@ -357,7 +356,6 @@ def get_state_from_generator():
                     "call_function",
                     self.value,
                     *proxy_args_kwargs(args, kwargs),
-                    current_tx=tx,
                 ),
                 example_value=example_value,
                 **options,
@@ -376,7 +374,6 @@ def get_state_from_generator():
                     "call_method",
                     "numel",
                     *proxy_args_kwargs(args, kwargs),
-                    current_tx=tx,
                 ),
                 **options,
             )
@@ -434,7 +431,6 @@ def get_state_from_generator():
                     "call_function",
                     fn_,
                     *proxy_args_kwargs(args, kwargs),
-                    current_tx=tx,
                 ),
                 **options,
             )
@@ -506,7 +502,6 @@ def fake_softmax(input):
                     "call_function",
                     torch.nn.functional.softmax,
                     *proxy_args_kwargs([input, dim], {}),
-                    current_tx=tx,
                 ),
                 **VariableTracker.propagate([self, dim, input]),
             )
@@ -572,7 +567,6 @@ def fake_cross_entropy_loss(input, target):
                         ],
                         {},
                     ),
-                    current_tx=tx,
                 ),
                 **VariableTracker.propagate(
                     [
@@ -745,7 +739,6 @@ def register_as_subgraph(fn, name, args):
                 self.value,
                 args=tuple(p_args),
                 kwargs={},
-                current_tx=tx,
             ),
             example_value=self.value(*u_args),
         )

From e45ad1982cc1932978123e98654d4995b621584c Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Mon, 5 Dec 2022 21:08:34 +0000
Subject: [PATCH 1596/1922] [inductor] Add test_ops_gradients running with
 inductor (#89792)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89792
Approved by: https://github.com/janeyx99, https://github.com/clee2000, https://github.com/huydhn
---
 .jenkins/pytorch/test.sh | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 414c85abff28c..fb076e13c4172 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -252,11 +252,8 @@ test_inductor_distributed() {
 
 test_inductor() {
   python tools/dynamo/verify_dynamo.py
-  python test/run_test.py --include test_modules test_ops --verbose
+  python test/run_test.py --include test_modules test_ops test_ops_gradients --verbose
   PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include inductor/test_torchinductor --include inductor/test_torchinductor_opinfo --verbose
-  # TODO: investigate "RuntimeError: CUDA driver API confirmed a leak"
-  # seen intest_ops_gradients.py
-  # pytest test/test_ops_gradients.py --verbose -k "not _complex and not test_inplace_grad_acos_cuda_float64"
 }
 
 test_inductor_huggingface() {

From 1e26a2dcfe0948b0f67e2bbec12f1bf79c847141 Mon Sep 17 00:00:00 2001
From: XiaobingSuper <xiaobing.zhang@intel.com>
Date: Mon, 5 Dec 2022 07:39:06 +0000
Subject: [PATCH 1597/1922] TorchDynamo: always convert flexiblelayout to be
 FixedLayout when given a stride_order (#89904)

For convolution, we always call **require_stride_order** to convert the input to the target stride order,  if the original input's layout is flexiblelayout, there always have a memory copy because the **is_stride_order_storage_and_layout** only checks the init stride order,  I think for flexiblelayout, means it's layout can be changed, if the user gives a stride order, I think we always need to convert the flexiblelayout to be FixedLayout using given strider order.

Given a CV user case, the max_pooling's output is used by two convolutions, there has two memory copies:

```
kernel_cpp_0 = async_compile.cpp('''
#include "/tmp/torchinductor_xiaobing/77/c7773nj5pwikpmm2pwa62rcudlf7p3if7eyqb5k4sjsvewwje4le.h"
extern "C" void kernel(const float* __restrict__ in_ptr0,
                       float* __restrict__ out_ptr0,
                       float* __restrict__ out_ptr1,
                       float* __restrict__ out_ptr2)
{
    #pragma GCC ivdep
    for(long i0=0; i0<128; i0+=1)
    {
        #pragma GCC ivdep
        for(long i1=0; i1<3; i1+=1)
        {
            #pragma GCC ivdep
            for(long i2=0; i2<3; i2+=1)
            {
                #pragma GCC ivdep
                for(long i3=0; i3<3; i3+=1)
                {
                    {
                        {
                            auto tmp0 = in_ptr0[i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp1 = in_ptr0[3 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp3 = in_ptr0[6 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp5 = in_ptr0[21 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp7 = in_ptr0[24 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp9 = in_ptr0[27 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp11 = in_ptr0[42 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp13 = in_ptr0[45 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp15 = in_ptr0[48 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp2 = (tmp0 != tmp0) ? tmp0 : std::max(tmp1, tmp0);
                            auto tmp4 = (tmp2 != tmp2) ? tmp2 : std::max(tmp3, tmp2);
                            auto tmp6 = (tmp4 != tmp4) ? tmp4 : std::max(tmp5, tmp4);
                            auto tmp8 = (tmp6 != tmp6) ? tmp6 : std::max(tmp7, tmp6);
                            auto tmp10 = (tmp8 != tmp8) ? tmp8 : std::max(tmp9, tmp8);
                            auto tmp12 = (tmp10 != tmp10) ? tmp10 : std::max(tmp11, tmp10);
                            auto tmp14 = (tmp12 != tmp12) ? tmp12 : std::max(tmp13, tmp12);
                            auto tmp16 = (tmp14 != tmp14) ? tmp14 : std::max(tmp15, tmp14);
                            out_ptr0[i3 + (3*i2) + (9*i1) + (27*i0)] = tmp16;
                        }
                    }
                }
            }
        }
    }
    #pragma GCC ivdep
    for(long i0=0; i0<128; i0+=1)
    {
        #pragma GCC ivdep
        for(long i1=0; i1<3; i1+=1)
        {
            #pragma GCC ivdep
            for(long i2=0; i2<9; i2+=1)
            {
                {
                    {
                        auto tmp0 = out_ptr0[i1 + (3*i2) + (27*i0)];
                        out_ptr1[i1 + (3*i2) + (27*i0)] = tmp0;
                        out_ptr2[i1 + (3*i2) + (27*i0)] = tmp0;
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1, arg4_1 = args
    args.clear()
    buf0 = empty_strided((128, 3, 3, 3), (27, 1, 9, 3), device='cpu', dtype=torch.float32)
    buf2 = empty_strided((128, 3, 3, 3), (27, 1, 9, 3), device='cpu', dtype=torch.float32)
    buf4 = empty_strided((128, 3, 3, 3), (27, 1, 9, 3), device='cpu', dtype=torch.float32)
    kernel_cpp_0(c_void_p(arg4_1.data_ptr()), c_void_p(buf0.data_ptr()), c_void_p(buf2.data_ptr()), c_void_p(buf4.data_ptr()))
    del arg4_1
    del buf0
    buf3 = torch.ops.mkldnn._convolution_pointwise(buf2, arg0_1, arg1_1, (0, 0), (1, 1), (1, 1), 1, 'none', [], '')
    assert_size_stride(buf3, (128, 3, 3, 3), (27, 1, 9, 3))
    del arg0_1
    del arg1_1
    del buf2
    buf5 = torch.ops.mkldnn._convolution_pointwise(buf4, arg2_1, arg3_1, (0, 0), (1, 1), (1, 1), 1, 'none', [], '')
    assert_size_stride(buf5, (128, 3, 3, 3), (27, 1, 9, 3))
    del arg2_1
    del arg3_1
    return (buf3, buf5, )
```

After this PR, the generated  code will remove the redundant memory copy:

```
kernel_cpp_0 = async_compile.cpp('''
#include "/tmp/torchinductor_xiaobing/77/c7773nj5pwikpmm2pwa62rcudlf7p3if7eyqb5k4sjsvewwje4le.h"
extern "C" void kernel(const float* __restrict__ in_ptr0,
                       float* __restrict__ out_ptr0)
{
    #pragma GCC ivdep
    for(long i0=0; i0<128; i0+=1)
    {
        #pragma GCC ivdep
        for(long i1=0; i1<3; i1+=1)
        {
            #pragma GCC ivdep
            for(long i2=0; i2<3; i2+=1)
            {
                #pragma GCC ivdep
                for(long i3=0; i3<3; i3+=1)
                {
                    {
                        {
                            auto tmp0 = in_ptr0[i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp1 = in_ptr0[3 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp3 = in_ptr0[6 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp5 = in_ptr0[21 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp7 = in_ptr0[24 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp9 = in_ptr0[27 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp11 = in_ptr0[42 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp13 = in_ptr0[45 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp15 = in_ptr0[48 + i3 + (6*i2) + (42*i1) + (147*i0)];
                            auto tmp2 = (tmp0 != tmp0) ? tmp0 : std::max(tmp1, tmp0);
                            auto tmp4 = (tmp2 != tmp2) ? tmp2 : std::max(tmp3, tmp2);
                            auto tmp6 = (tmp4 != tmp4) ? tmp4 : std::max(tmp5, tmp4);
                            auto tmp8 = (tmp6 != tmp6) ? tmp6 : std::max(tmp7, tmp6);
                            auto tmp10 = (tmp8 != tmp8) ? tmp8 : std::max(tmp9, tmp8);
                            auto tmp12 = (tmp10 != tmp10) ? tmp10 : std::max(tmp11, tmp10);
                            auto tmp14 = (tmp12 != tmp12) ? tmp12 : std::max(tmp13, tmp12);
                            auto tmp16 = (tmp14 != tmp14) ? tmp14 : std::max(tmp15, tmp14);
                            out_ptr0[i3 + (3*i2) + (9*i1) + (27*i0)] = tmp16;
                        }
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

def call(args):
    arg0_1, arg1_1, arg2_1, arg3_1, arg4_1 = args
    args.clear()
    buf0 = empty_strided((128, 3, 3, 3), (27, 1, 9, 3), device='cpu', dtype=torch.float32)
    kernel_cpp_0(c_void_p(arg4_1.data_ptr()), c_void_p(buf0.data_ptr()))
    del arg4_1
    buf2 = torch.ops.mkldnn._convolution_pointwise(buf0, arg0_1, arg1_1, (0, 0), (1, 1), (1, 1), 1, 'none', [], '')
    assert_size_stride(buf2, (128, 3, 3, 3), (27, 1, 9, 3))
    del arg0_1
    del arg1_1
    buf3 = torch.ops.mkldnn._convolution_pointwise(buf0, arg2_1, arg3_1, (0, 0), (1, 1), (1, 1), 1, 'none', [], '')
    assert_size_stride(buf3, (128, 3, 3, 3), (27, 1, 9, 3))
    del arg2_1
    del arg3_1
    return (buf2, buf3, )

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89904
Approved by: https://github.com/jansel
---
 torch/_inductor/ir.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 2ce3fa37388bc..253f217320e2e 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2497,9 +2497,7 @@ def require_stride_order(cls, x, order):
 
         # require x to have the layout as strided_ordered as order
         if is_storage_and_layout(x):
-            if isinstance(
-                x.get_layout(), FlexibleLayout
-            ) and is_stride_order_storage_and_layout(x, order):
+            if isinstance(x.get_layout(), FlexibleLayout):
                 # fix flexiblelayout to be FixedLayout with stride_order
                 as_storage_and_layout(
                     x, freeze=True, want_contiguous=False, stride_order=order

From 6737eacf2fe329cb4ec6463bd9abfb59c96fe21b Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 6 Dec 2022 03:22:18 +0000
Subject: [PATCH 1598/1922] Revert "[Composable API] `replicate`: add support
 for DDP args (#89243)"

This reverts commit 0f274ed385d676cb28c792ca104114ca63210055.

Reverted https://github.com/pytorch/pytorch/pull/89243 on behalf of https://github.com/malfet due to Depends on https://github.com/pytorch/pytorch/pull/89222 that introduced spurious module updates
---
 test/distributed/_composable/test_replicate.py |  7 -------
 torch/distributed/_composable/replicate.py     | 13 ++++---------
 2 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
index de9fbfdbbc376..db1459589b342 100644
--- a/test/distributed/_composable/test_replicate.py
+++ b/test/distributed/_composable/test_replicate.py
@@ -102,13 +102,6 @@ def test_replicate_multi_module(self):
         replicate(replicate_model.fc3)
         self._compare_module(model, replicate_model)
 
-    def test_replicate_with_kwargs(self):
-        model = Net()
-        replicate_model = replicate(
-            deepcopy(model), bucket_cap_mb=1, gradient_as_bucket_view=True
-        )
-        self._compare_module(model, replicate_model)
-
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/_composable/replicate.py b/torch/distributed/_composable/replicate.py
index 95aa8ee4c7d25..c27a88d79b4d9 100644
--- a/torch/distributed/_composable/replicate.py
+++ b/torch/distributed/_composable/replicate.py
@@ -12,9 +12,8 @@ def __init__(self) -> None:
         self.modules: List[nn.Module] = []
         self.has_initialized: bool = False
         self._param_list: nn.ParameterList = nn.ParameterList()
-        self.kwargs: dict = {}
 
-    def mark_modules(self, *modules: nn.Module, **kwargs) -> None:
+    def mark_modules(self, *modules: nn.Module) -> None:
         for module in modules:
             self.modules.append(module)
             replicate.state(module)._distributed_state = self
@@ -22,7 +21,6 @@ def mark_modules(self, *modules: nn.Module, **kwargs) -> None:
             module.register_forward_pre_hook(self.forward_pre_hook)
             # TODO(@yhcharles): fix type error
             module.register_forward_hook(self.forward_post_hook)  # type: ignore[arg-type]
-        self.kwargs = kwargs
 
     def _recursive_collect_params(self, module: nn.Module) -> None:
         # TODO: skip if managed by other APIs
@@ -41,7 +39,7 @@ def _recursive_collect_params(self, module: nn.Module) -> None:
         for child in module.children():
             self._recursive_collect_params(child)
 
-    def init_helper(self) -> None:
+    def init_helper(self):
         if self.has_initialized:
             return
 
@@ -49,9 +47,7 @@ def init_helper(self) -> None:
         for module in self.modules:
             self._recursive_collect_params(module)
 
-        self._ddp = _ddp.DistributedDataParallel(
-            self._param_list, **self.kwargs
-        )
+        self._ddp = _ddp.DistributedDataParallel(self._param_list)
 
     def forward_pre_hook(
         self, module: nn.Module, input: Tuple[torch.Tensor]
@@ -71,7 +67,6 @@ def forward_post_hook(
 @contract
 def replicate(
     module: nn.Module,  # NOTE: contract now supports single module only
-    **kwargs,
 ) -> nn.Module:
     r"""Replicates module(s)
 
@@ -82,5 +77,5 @@ def replicate(
         >>> module = nn.Linear(3, 3)
         >>> replicate(module)
     """
-    _ReplicateState().mark_modules(module, **kwargs)
+    _ReplicateState().mark_modules(module)
     return module

From 18ca66973a1ba0cc41845ceb564b7792c2ad5b88 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 6 Dec 2022 03:25:13 +0000
Subject: [PATCH 1599/1922] [vision hash update] update the pinned vision hash
 (#90239)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90239
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 3b388adc51501..0b378f4f7a325 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-01c11a0564b8417561ae4c414fe659fc97476987
+842e178a488722720b6eb1e9cb508439e8e1ecd9

From ef7fc7cc6e69befc53846e3efe52b9935c761424 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 6 Dec 2022 03:26:26 +0000
Subject: [PATCH 1600/1922] Revert "[Composable API] `replicate`: change to per
 module call, remove `mark_root_module()` (#89222)"

This reverts commit 65a0dcffd8d387bb8c90216e63fdabb6e33e4e4d.

Reverted https://github.com/pytorch/pytorch/pull/89222 on behalf of https://github.com/malfet due to Included unintended submodule updates
---
 .../distributed/_composable/test_replicate.py |  6 +--
 third_party/ideep                             |  2 +-
 third_party/kineto                            |  2 +-
 torch/distributed/_composable/replicate.py    | 40 +++++++++++++++----
 4 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
index db1459589b342..3e8bf44a1fdea 100644
--- a/test/distributed/_composable/test_replicate.py
+++ b/test/distributed/_composable/test_replicate.py
@@ -7,7 +7,7 @@
 import torch.distributed as dist
 import torch.nn.functional as F
 from torch import nn
-from torch.distributed._composable.replicate import replicate
+from torch.distributed._composable.replicate import mark_root_module, replicate
 from torch.testing._internal.common_distributed import MultiProcessTestCase
 from torch.testing._internal.common_utils import run_tests
 
@@ -91,12 +91,12 @@ def step_model(model, input, target):
 
     def test_replicate_single_module(self):
         model = Net()
-        replicate_model = replicate(deepcopy(model))
+        replicate_model = mark_root_module(replicate(deepcopy(model)))
         self._compare_module(model, replicate_model)
 
     def test_replicate_multi_module(self):
         model = Net()
-        replicate_model = deepcopy(model)
+        replicate_model = mark_root_module(deepcopy(model))
         replicate(replicate_model.fc1)
         replicate(replicate_model.fc2)
         replicate(replicate_model.fc3)
diff --git a/third_party/ideep b/third_party/ideep
index ececd0a4f53c3..5ddc65efe0428 160000
--- a/third_party/ideep
+++ b/third_party/ideep
@@ -1 +1 @@
-Subproject commit ececd0a4f53c39f2d91caaddee0de1cd214f5b99
+Subproject commit 5ddc65efe0428bbce2942b3ce5e3ce15239abe2f
diff --git a/third_party/kineto b/third_party/kineto
index 0703c78999061..6c1629809068e 160000
--- a/third_party/kineto
+++ b/third_party/kineto
@@ -1 +1 @@
-Subproject commit 0703c78999061b8329dfab7ec5046fc5764a5573
+Subproject commit 6c1629809068efd78a8d56b4aa479c7ec49ae562
diff --git a/torch/distributed/_composable/replicate.py b/torch/distributed/_composable/replicate.py
index c27a88d79b4d9..0e94427afee88 100644
--- a/torch/distributed/_composable/replicate.py
+++ b/torch/distributed/_composable/replicate.py
@@ -7,7 +7,11 @@
 from .contract import contract
 
 
-class _ReplicateState:
+class DistributedState:
+    ...
+
+
+class ReplicateState(DistributedState):
     def __init__(self) -> None:
         self.modules: List[nn.Module] = []
         self.has_initialized: bool = False
@@ -18,9 +22,6 @@ def mark_modules(self, *modules: nn.Module) -> None:
             self.modules.append(module)
             replicate.state(module)._distributed_state = self
             replicate.state(module)._params_collected = False
-            module.register_forward_pre_hook(self.forward_pre_hook)
-            # TODO(@yhcharles): fix type error
-            module.register_forward_hook(self.forward_post_hook)  # type: ignore[arg-type]
 
     def _recursive_collect_params(self, module: nn.Module) -> None:
         # TODO: skip if managed by other APIs
@@ -49,13 +50,13 @@ def init_helper(self):
 
         self._ddp = _ddp.DistributedDataParallel(self._param_list)
 
-    def forward_pre_hook(
+    def root_module_forward_pre_hook(
         self, module: nn.Module, input: Tuple[torch.Tensor]
     ) -> None:
         self.init_helper()
         self._ddp.pre_forward()
 
-    def forward_post_hook(
+    def root_module_forward_post_hook(
         self,
         module: nn.Module,
         input: Tuple[torch.Tensor],
@@ -64,9 +65,14 @@ def forward_post_hook(
         return self._ddp.post_forward(output)
 
 
+# TODO(@yhcharles): use a per-model instance instead of a global one
+_default_state = ReplicateState()
+
+
 @contract
 def replicate(
     module: nn.Module,  # NOTE: contract now supports single module only
+    dist_state: ReplicateState = _default_state,
 ) -> nn.Module:
     r"""Replicates module(s)
 
@@ -77,5 +83,25 @@ def replicate(
         >>> module = nn.Linear(3, 3)
         >>> replicate(module)
     """
-    _ReplicateState().mark_modules(module)
+    dist_state.mark_modules(module)
+    return module
+
+
+def mark_root_module(
+    module: nn.Module, dist_state: ReplicateState = _default_state
+) -> nn.Module:
+    r"""Mark the root module. Its sub-modules can be replicated.
+
+    Args:
+        modules (torch.nn.Module): root module
+
+    Example::
+        >>> module = nn.Linear(3, 3)
+        >>> replicate(module)
+    """
+    module.register_forward_pre_hook(dist_state.root_module_forward_pre_hook)
+    # TODO(@yhcharles): fix type error
+    module.register_forward_hook(
+        dist_state.root_module_forward_post_hook  # type: ignore[arg-type]
+    )
     return module

From b501a3f54bf5b17706ce9bc756957d2814719ee7 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 5 Dec 2022 15:18:34 -0500
Subject: [PATCH 1601/1922] Assert there are no outstanding side effects before
 calling cond (#90208)

The current cond implementation is silently incorrect when
there are outstanding side effects, since the locally tracked
side effects are lost when the recursive export call is made.
At least we raise an assert now.

I'm working on a refactor of cond which should be able to sidestep
this problem. Maybe.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Differential Revision: [D41746973](https://our.internmc.facebook.com/intern/diff/D41746973)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90208
Approved by: https://github.com/voznesenskym
---
 test/dynamo/test_misc.py         | 22 ++++++++++++++++++++++
 torch/_dynamo/variables/torch.py | 17 +++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 12f8a2357043d..80890d5f3bbed 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2353,6 +2353,28 @@ def f(pred, x):
         b = opt_fn(torch.tensor(True), torch.tensor([0.25, 0.25]))
         self.assertTrue(same(torch.sin(torch.tensor([0.25, 0.25])), b))
 
+    @unittest.expectedFailure
+    def test_cond_side_effects(self):
+        from functorch.experimental.cond import cond
+
+        c = 0
+
+        def true_fn(x):
+            return x - c
+
+        def false_fn(x):
+            return x + c
+
+        def f(pred, x):
+            nonlocal c
+            c = 1
+            return cond(pred, true_fn, false_fn, [x])
+
+        opt_fn = torch._dynamo.optimize("eager")(f)
+        c = 0
+        a = opt_fn(torch.tensor(False), torch.tensor([0.25, 0.25]))
+        self.assertTrue(same(torch.tensor([1.25, 1.25]), a))
+
     def test_cond_nested(self):
         from functorch.experimental.control_flow import cond
 
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 2fad73062cf71..2edb462d0c0bb 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -706,6 +706,23 @@ def register_as_subgraph(fn, name, args):
             # ops - see torch/dispatch/_dispatcher.py
             from .. import config
 
+            # The current recursive export() implementation will
+            # not "see" any side effect updates from the enclosing
+            # context, which can result in possibly incorrect
+            # export.  This assert ensures that there were no
+            # outstanding side effects at the time cond() was called.
+            #
+            # TODO: This assert may be too aggressive; I'm landing it
+            # to see if it is or not.
+            assert tx.output.side_effects.is_empty(), (
+                "Handling a cond operator when there are outstanding "
+                "side effects in the trace is not currently supported.  "
+                "Please file a bug to PyTorch requesting this functionality.  "
+                "You may be able to unblock by removing side effects (e.g., "
+                "mutating Python variables/data structures/etc) from your "
+                "model."
+            )
+
             assert len(p_args) == 4
             assert type(args[0]) is TensorVariable  # predicate
             assert type(p_args[1]) is UserFunctionVariable  # true_fn

From 01441a6470195e3df2bc136a6602b0256f3f60ff Mon Sep 17 00:00:00 2001
From: Danni Li <dannili@meta.com>
Date: Tue, 6 Dec 2022 04:03:01 +0000
Subject: [PATCH 1602/1922] Functionalization: skip meta block computation if
 compute_reference_meta is false (#90219)

Skip computing meta block when `compute_reference_meta` is `False`.

Issue: #89914

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90219
Approved by: https://github.com/ezyang
---
 torchgen/gen_functionalization_type.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchgen/gen_functionalization_type.py b/torchgen/gen_functionalization_type.py
index 33b4e4d86bb90..39b496243712a 100644
--- a/torchgen/gen_functionalization_type.py
+++ b/torchgen/gen_functionalization_type.py
@@ -386,7 +386,7 @@ def emit_view_functionalization_body(
         {view_tensor_name}.key_set().has_backend(c10::BackendComponent::XLABit) ||
         {view_tensor_name}.key_set().has_backend(c10::BackendComponent::LazyBit);
       {return_type} reference_tensor_output;
-      {{
+      if (compute_reference_meta) {{
         at::AutoDispatchSkipFunctionalize func_guard;
         c10::impl::ExcludeDispatchKeyGuard guard(exclude_keys_for_meta_dispatch);
         {meta_conversion_str}

From 809aed6ef56dc25b2b34b6d156ef91bbd2a55002 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 5 Dec 2022 17:59:02 +0000
Subject: [PATCH 1603/1922] as_strided: Fix default storage_offset for
 reference implementation (#89513)

This fixes the default storage_offset to take it from the input. This was
previously untested, so I've also added a new OpInfo which includes samples with
non-zero storage_offsets on the input tensor.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89513
Approved by: https://github.com/ezyang, https://github.com/ngimel
---
 test/functorch/test_aotdispatch.py            |  1 +
 test/functorch/test_ops.py                    | 14 +++-
 test/functorch/test_vmap.py                   |  2 +
 torch/_refs/__init__.py                       | 10 ++-
 .../_internal/common_methods_invocations.py   | 72 ++++++++++++++++---
 5 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 2434e35ab4871..165041edfb306 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1776,6 +1776,7 @@ def forward(self, x):
     xfail('scatter_reduce', 'prod'),
 
     skip('as_strided_scatter'),
+    xfail('as_strided', 'partial_views'),
 
     # Too annoying to generate random inputs
     xfail('cholesky'),
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index fbee1872ddf3e..74b9c26e6529c 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -414,6 +414,7 @@ def wrapped_fn(*args, **kwargs):
         # BUG
         # AssertionError: Tensor-likes are not close!
         xfail('as_strided'),
+        xfail('as_strided', 'partial_views'),
         decorate('linalg.det', 'singular',
                  decorator=unittest.skipIf(IS_MACOS and IS_X86, "Fails on x86 MacOS CI")),
     }))
@@ -526,6 +527,7 @@ def maybe_clone_inputs():
         xfail('as_strided'),
         xfail('as_strided_scatter'),
         xfail('_softmax_backward_data', device_type='cpu'),
+        xfail('as_strided', 'partial_views'),
     }))
     @opsToleranceOverride('TestOperators', 'test_vjp', (
         tol1('nn.functional.conv_transpose3d',
@@ -655,6 +657,7 @@ def fn(inp, *args, **kwargs):
         skip("atleast_3d"),  # Takes too long
         skip("ormqr"),  # Takes too long
         xfail("as_strided"),  # incorrect output
+        xfail("as_strided", "partial_views"),  # incorrect output
         xfail("as_strided_scatter"),  # incorrect output
         skip("bernoulli"),  # calls random op
         xfail("bfloat16"),  # rank 4 tensor for channels_last
@@ -735,6 +738,9 @@ def fn(inp, *args, **kwargs):
         tol1('svd',
              {torch.float32: tol(atol=1e-03, rtol=5e-04)}),
     ))
+    @skipOps('TestOperators', 'test_vmapvjpvjp', {
+        xfail('as_strided', 'partial_views'),
+    })
     def test_vmapvjpvjp(self, device, dtype, op):
         # Since, we test `vjpvjp` independently,
         # for this test, we just verify that vmap
@@ -802,6 +808,7 @@ def vjp_of_vjp(*args_and_cotangents):
         xfail('svd_lowrank', ''),  # randomness
         xfail('to_sparse', ''),  # non-dense output
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
+        xfail('as_strided', 'partial_views'),
         # ----------------------------------------------------------------------
 
         # ---------------------------- BUGS ------------------------------------
@@ -851,7 +858,9 @@ def vjp_of_vjp(*args_and_cotangents):
         tol1('linalg.householder_product',
              {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
     ))
-    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail)
+    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail.union({
+        xfail('as_strided', 'partial_views'),
+    }))
     def test_vmapvjp(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
@@ -899,6 +908,7 @@ def test_vmapvjp(self, device, dtype, op):
         decorate('linalg.det', 'singular', decorator=unittest.skipIf(IS_MACOS, "Fails on x86 MacOS CI")),
         skip('nn.functional.max_pool1d'),  # fails on cpu, runs on cuda
         xfail('masked.mean'),  # silent incorrectness (nan difference)
+        xfail('as_strided', 'partial_views'),  # Tensor-likes are not close!
 
         xfail('nn.functional.soft_margin_loss', ''),  # soft_margin_loss_backward does not support forward-ad
         xfail('tensor_split'),  # data_ptr composite compliance
@@ -1201,6 +1211,7 @@ def test():
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
         xfail("_native_batch_norm_legit"),
+        xfail('as_strided', 'partial_views'),
     }))
     def test_vjpvmap(self, device, dtype, op):
         # NB: there is no vjpvmap_has_batch_rule test because that is almost
@@ -1383,6 +1394,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
 
         # Potential bugs/errors
         xfail('as_strided'),  # AssertionError: Tensor-likes are not close!
+        xfail('as_strided', 'partial_views'),  # AssertionError: Tensor-likes are not close!
         xfail('as_strided_scatter'),  # AssertionError: Tensor-likes are not close!
         xfail('bernoulli'),  # calls random op
         xfail('bfloat16'),  # required rank 4 tensor to use channels_last format
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index dcad523217f3f..441cf46a98c22 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3302,6 +3302,7 @@ def test():
         xfail('triu'),  # Exception not raised on error input
         # The error inputs are vectors, that pass when batched as they are treated as a matrix
         xfail('trace'),
+        xfail('as_strided', 'partial_views'),
     }))
     def test_vmap_exhaustive(self, device, dtype, op):
         # needs to be fixed
@@ -3317,6 +3318,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
     ))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps('TestVmapOperatorsOpInfo', 'test_op_has_batch_rule', vmap_fail.union({
+        xfail('as_strided', 'partial_views'),
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         xfail('complex'),
         xfail('copysign'),
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 04bf9e12927fa..3539784e8e32c 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2513,9 +2513,15 @@ def atleast_3d(
 
 
 def as_strided(
-    a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int = 0
+    a: TensorLikeType,
+    size: ShapeType,
+    stride: StrideType,
+    storage_offset: Optional[int] = None,
 ) -> TensorLikeType:
-    return prims.as_strided(a, size, stride, storage_offset)
+    storage_offset_int = (
+        storage_offset if storage_offset is not None else a.storage_offset()
+    )
+    return prims.as_strided(a, size, stride, storage_offset_int)
 
 
 def broadcast_shapes(*shapes) -> ShapeType:
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index d11c275cc220c..91e1e8a1d636a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -263,9 +263,15 @@ def sample_inputs_as_strided(op_info, device, dtype, requires_grad, **kwargs):
         kwargs = dict(storage_offset=storage_offset)
         yield SampleInput(input_t, args=(output_shape, stride), kwargs=kwargs)
 
+def sample_inputs_as_strided_partial_views(op_info, device, dtype, requires_grad, **kwargs):
+    def make_arg():
+        base = make_tensor((20,), device=device, dtype=dtype)
+        return base[5:15].requires_grad_(requires_grad)
+
     # as_strided on offset, partial views
-    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)))
-    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)), kwargs={'storage_offset': 0})
+    yield SampleInput(make_arg(), (2, 2), (1, 2))
+    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=0)
+    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=10)
 
 def sample_inputs_as_strided_scatter(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -10721,8 +10727,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type='cpu'),
            )),
     OpInfo('as_strided',
-           op=lambda x, size, stride, storage_offset=0:
-               torch.as_strided(x, size, stride, storage_offset=storage_offset),
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
            supports_out=False,
            supports_forward_ad=True,
@@ -10743,7 +10747,47 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.skip("Numerous errors"), 'TestFwdGradients'),
-               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'))),
+               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'),
+           )),
+    OpInfo('as_strided',
+           variant_test_name='partial_views',
+           dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
+           supports_out=False,
+           supports_forward_ad=True,
+           supports_fwgrad_bwgrad=True,
+           # vmap does not support inplace views
+           check_inplace_batched_forward_grad=False,
+           sample_inputs_func=sample_inputs_as_strided_partial_views,
+           skips=(
+               # Note: This xfail is fine -- it's inherent to how as_strided works
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples'),
+               # RuntimeError: This operator is not Composite Compliant: the
+               # storage_offset of the tensor was modified directly without
+               # going through the PyTorch dispatcher.
+               DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance'),
+
+
+               # These fail because the test changes the input's in-memory layout
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_complex_half_reference_testing'),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_fn_fwgrad_bwgrad',
+                            dtypes=(torch.complex64, torch.complex128)),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_inplace_forward_mode_AD'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_grad'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_gradgrad'),
+               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo',
+                            'test_make_fx_symbolic_exhaustive_inplace'),
+               DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'),
+               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_correctness'),
+               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'),
+               # Fail but are also flaky
+               DecorateInfo(unittest.skip("Test changes in memory layout"), 'TestMathBits'),
+               DecorateInfo(unittest.skip("Modifies input strides and storage_offset"), 'TestCommon',
+                            'test_non_standard_bool_values'),
+           )),
     OpInfo('as_strided_scatter',
            op=lambda x, src, size, stride, storage_offset=0:
                torch.as_strided_scatter(x, src, size, stride, storage_offset=storage_offset),
@@ -18282,15 +18326,27 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
         supports_nvfuser=False,
         skips=(
-            # TODO: fix and/or update to xfails
-            DecorateInfo(unittest.skip("Errors when storage_offset is included"),
-                         'TestCommon', 'test_python_ref_meta'),
             # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
         ),
     ),
+    PythonRefInfo(
+        "_refs.as_strided",
+        torch_opinfo_name="as_strided",
+        torch_opinfo_variant_name="partial_views",
+        # FIXME: doesn't support chalf
+        dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        supports_nvfuser=False,
+        skips=(
+            # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
+        ),
+    ),
     PythonRefInfo(
         "_refs.broadcast_shapes",
         torch_opinfo_name="broadcast_shapes",

From d7699ab577810afab6d9a767f2df40d5929503ea Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Tue, 6 Dec 2022 04:13:24 +0000
Subject: [PATCH 1604/1922] pad low precision matmuls when requested (#90235)

Matmul padding is beneficial not only for fp32, fp16/bf16 with amp can benefit as well.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90235
Approved by: https://github.com/jiawenliu64
---
 torch/_inductor/decomposition.py | 15 ++++---
 torch/_inductor/utils.py         | 77 --------------------------------
 2 files changed, 9 insertions(+), 83 deletions(-)

diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index c9897ec678e83..f8fedcc786015 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -154,8 +154,8 @@ def check_device_dtype(a: Tensor, b: Tensor):
     return (
         a.is_cuda
         and b.is_cuda
-        and a.dtype == torch.float32
-        and b.dtype == torch.float32
+        and a.dtype in (torch.float32, torch.float16, torch.bfloat16)
+        and b.dtype in (torch.float32, torch.float16, torch.bfloat16)
     )
 
 
@@ -210,6 +210,9 @@ def addmm(input, mat1, mat2, *, beta=1, alpha=1):
 
 
 def should_pad_bench(mat1, mat2, op, input=None):
+    assert utils.has_triton()
+    from triton.testing import do_bench
+
     with no_dispatch():
         if op is torch.ops.aten.mm or op is torch.ops.aten.addmm:
             m_padded_length = get_padded_length(mat1.shape[0])
@@ -230,13 +233,13 @@ def should_pad_bench(mat1, mat2, op, input=None):
         warmup = 5
         rep = 100
         if op is torch.ops.aten.bmm or op is torch.ops.aten.mm:
-            ori_time = utils.do_bench(
+            ori_time = do_bench(
                 lambda: op(mat1, mat2), warmup=warmup, rep=rep, fast_flush=True
             )[0]
         else:
             if input is not None:
                 input = torch.randn_like(input)
-            ori_time = utils.do_bench(
+            ori_time = do_bench(
                 lambda: op(input, mat1, mat2), warmup=warmup, rep=rep, fast_flush=True
             )[0]
 
@@ -248,14 +251,14 @@ def should_pad_bench(mat1, mat2, op, input=None):
                 input_pad = input.new_empty(
                     [get_padded_length(i) + i for i in input.shape]
                 )
-            pad_time = utils.do_bench(
+            pad_time = do_bench(
                 lambda: op(input_pad, mat1_pad, mat2_pad),
                 warmup=warmup,
                 rep=rep,
                 fast_flush=True,
             )[0]
         else:
-            pad_time = utils.do_bench(
+            pad_time = do_bench(
                 lambda: op(mat1_pad, mat2_pad), warmup=warmup, rep=rep, fast_flush=True
             )[0]
 
diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py
index a2dc47a5628ec..ff2fae775220d 100644
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@@ -57,83 +57,6 @@ def conditional_product(*args):
     return functools.reduce(operator.mul, [x for x in args if x])
 
 
-def do_bench(
-    fn,
-    warmup=25,
-    rep=100,
-    grad_to_none=None,
-    percentiles=(0.5, 0.2, 0.8),
-    record_clocks=False,
-    fast_flush=False,
-):
-    """
-    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
-    the 20-th and 80-th performance percentile.
-
-    :param fn: Function to benchmark
-    :type fn: Callable
-    :param warmup: Warmup time (in ms)
-    :type warmup: int
-    :param rep: Repetition time (in ms)
-    :type rep: int
-    :param grad_to_none: Reset the gradient of the provided tensor to None
-    :type grad_to_none: torch.tensor, optional
-    :param percentiles: Performance percentile to return in addition to the median.
-    :type percentiles: list[float]
-    :param fast_flush: Use faster kernel to flush L2 between measurements
-    :type fast_flush: bool
-    """
-
-    # Estimate the runtime of the function
-    fn()
-    torch.cuda.synchronize()
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-    start_event.record()
-    for _ in range(5):
-        fn()
-    end_event.record()
-    torch.cuda.synchronize()
-    estimate_ms = start_event.elapsed_time(end_event) / 5
-    # compute number of warmup and repeat
-    n_warmup = max(1, int(warmup / estimate_ms))
-    n_repeat = max(1, int(rep / estimate_ms))
-    # We maintain a buffer of 256 MB that we clear
-    # before each kernel call to make sure that the L2
-    # doesn't contain any input data before the run
-    start_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)]
-    end_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)]
-    if fast_flush:
-        cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda")
-    else:
-        cache = torch.empty(int(256e6), dtype=torch.int8, device="cuda")
-    # Warm-up
-    for _ in range(n_warmup):
-        fn()
-    # Benchmark
-    for i in range(n_repeat):
-        # we don't want `fn` to accumulate gradient values
-        # if it contains a backward pass. So we clear the
-        # provided gradients
-        if grad_to_none is not None:
-            for x in grad_to_none:
-                x.grad = None
-        # we clear the L2 cache before each run
-        cache.zero_()
-        # record time of `fn`
-        start_event[i].record()
-        fn()
-        end_event[i].record()
-    # Record clocks
-    torch.cuda.synchronize()
-    times = torch.tensor([s.elapsed_time(e) for s, e in zip(start_event, end_event)])
-    if percentiles:
-        percentiles = torch.quantile(times, torch.tensor(percentiles)).tolist()
-        return tuple(percentiles)
-    else:
-        return torch.mean(times).item()
-
-
 def sympy_product(it):
     return functools.reduce(operator.mul, it, sympy.Integer(1))
 

From 0526cbad0cd80fddfca701f29b9350abe126611d Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Mon, 5 Dec 2022 22:05:35 +0000
Subject: [PATCH 1605/1922] Disallow registering meta function for
 CompositeImplicitAutograd ops (#90222)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90222
Approved by: https://github.com/ezyang
---
 torch/_meta_registrations.py | 59 ++++++++++++++----------------------
 1 file changed, 22 insertions(+), 37 deletions(-)

diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index eef1ab859e938..cb175914ddccd 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -250,15 +250,6 @@ def linalg_cholesky_ex(A: Tensor, upper: bool = False, check_errors: bool = Fals
     return L, infos
 
 
-# From aten/src/ATen/native/BatchLinearAlgebra.cpp
-@register_meta(aten.linalg_cholesky.default)
-def meta_linalg_cholesky(A: Tensor, upper=False):
-    # All the checks done on info in the corresponding C++ function
-    # are data dependent, so we skip info computation
-    L, infos = linalg_cholesky_ex(A, upper, False)
-    return L, infos
-
-
 # From aten/src/ATen/native/ReflectionPad.cpp
 @register_meta(
     [aten.reflection_pad2d_backward.default, aten.replication_pad2d_backward.default]
@@ -1833,7 +1824,14 @@ def meta_scatter_add_(self, dim, index, src):
     return self
 
 
-@register_meta(aten.scatter)
+@register_meta(
+    [
+        aten.scatter.src,
+        aten.scatter.value,
+        aten.scatter.reduce,
+        aten.scatter.value_reduce,
+    ]
+)
 @out_wrapper()
 def meta_scatter(self, dim, index, src_or_value, reduce=None):
     src = src_or_value if isinstance(src_or_value, torch.Tensor) else None
@@ -1841,7 +1839,14 @@ def meta_scatter(self, dim, index, src_or_value, reduce=None):
     return self.new_empty(self.shape)
 
 
-@register_meta(aten.scatter_)
+@register_meta(
+    [
+        aten.scatter_.src,
+        aten.scatter_.value,
+        aten.scatter_.reduce,
+        aten.scatter_.value_reduce,
+    ]
+)
 def meta_scatter_(self, dim, index, src_or_value, reduce=None):
     src = src_or_value if isinstance(src_or_value, torch.Tensor) else None
     scatter_meta_impl(self, dim, index, src, reduce)
@@ -1861,32 +1866,6 @@ def meta_scatter_reduce__two(self, dim, index, src, reduce, include_self=True):
     return self
 
 
-@register_meta(aten.upsample_nearest2d.vec)
-def upsample_nearest2d_vec(input, output_size, scale_factors):
-    mem_format = utils.suggest_memory_format(input)
-    spatial_dimensions = input.dim() - 2
-
-    input_shape = input.shape
-    if output_size is not None:
-        assert scale_factors is None
-        out_size = output_size
-    elif scale_factors is not None:
-        assert output_size is None
-        out_size = []
-        for i in range(spatial_dimensions):
-            sym_float = (input_shape[i + 2] / 1) * scale_factors[i]
-            assert sym_float >= 0
-            out_size.append(math.floor(sym_float))
-
-    output_height = out_size[0]
-    output_width = out_size[1]
-    nbatch = input_shape[0]
-    channels = input_shape[1]
-    return input.new_empty((nbatch, channels, output_height, output_width)).to(
-        memory_format=mem_format
-    )
-
-
 @register_meta([aten.sort.default, aten.sort.stable])
 def meta_sort(self, stable=None, dim=-1, descending=False):
     return torch.empty_like(self), torch.empty_like(self, dtype=torch.int64)
@@ -1983,6 +1962,12 @@ def activate_meta():
             # have CompositeImplicitAutograd kernels.
             # Instead, we should be letting those decompositions run, and writing meta kernels
             # only for the base operators.
+            if op_overload in global_decomposition_table["meta"]:
+                raise RuntimeError(
+                    f"{op_overload} is a CompositeImplicitAutograd op, we shouldn't "
+                    "register meta function for it. Instead, we should let the decomposition run and write "
+                    "meta kernels for the base operators."
+                )
             pass
         elif op_overload.is_view:
             # Attempting to register a python meta kernel for a view operator.

From 86bb136caa386004a54ebd4bc5cd2891d3adb26d Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Sun, 4 Dec 2022 19:43:49 -0800
Subject: [PATCH 1606/1922] [ao][ns] PNP demo for exposing arbitrary model
 transforms (#90153)

adding way to use arbitrary prepare and convert functions with PNP.

note this is a recreation of https://github.com/pytorch/pytorch/pull/89892 which was reverted due to landing not syncing between github and fbcode

python test/test_quantization.py
TestFxNumericSuiteNShadows.test_custom_functions_and_tracer

Differential Revision: [D41723892](https://our.internmc.facebook.com/intern/diff/D41723892/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90153
Approved by: https://github.com/vkuzo
---
 test/quantization/fx/test_numeric_suite_fx.py | 61 +++++++++++++++++++
 torch/ao/ns/_numeric_suite_fx.py              | 27 ++++++--
 torch/ao/ns/fx/n_shadows_utils.py             | 27 ++++++--
 3 files changed, 106 insertions(+), 9 deletions(-)

diff --git a/test/quantization/fx/test_numeric_suite_fx.py b/test/quantization/fx/test_numeric_suite_fx.py
index 86de5ab9cb371..eb7dcdfac3556 100644
--- a/test/quantization/fx/test_numeric_suite_fx.py
+++ b/test/quantization/fx/test_numeric_suite_fx.py
@@ -2485,6 +2485,67 @@ def test_qconfig_multi_mapping_repr(self):
         )
         self.assertTrue(isinstance(qconfig_multi_mapping.__repr__(), str))
 
+    def test_custom_functions_and_tracer(self):
+        class M(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc1 = nn.Linear(2, 2)
+                self.fc2 = nn.Linear(2, 2)
+
+            def forward(self, x):
+                x = self.fc1(x)
+                x = self.fc2(x)
+                return x
+
+        m = M().eval()
+        example_inputs = (torch.randn(2, 2),)
+
+        qconfig_mappings = QConfigMultiMapping().set_global(
+            [torch.quantization.default_qat_qconfig]
+        )
+
+        custom_tracer = torch.ao.quantization.quantize_fx.QuantizationTracer(
+            ["fc2"], []
+        )
+
+        custom_prepare_fn = torch.ao.quantization.quantize_fx.prepare_qat_fx
+
+        def custom_convert_fn(module, to_print):
+            print(to_print)
+            mod = torch.ao.quantization.quantize_fx.convert_fx(module)
+            return mod
+
+        backend_config = get_native_backend_config()
+
+        # test that input is valid
+        _ = m(*example_inputs)
+
+        kwargs = {"to_print": "working"}
+
+        msp = prepare_n_shadows_model(
+            m,
+            example_inputs,
+            qconfig_mappings,
+            backend_config,
+            custom_prepare_fn=custom_prepare_fn,
+            custom_prepare_kwargs=None,
+            custom_tracer=custom_tracer,
+        )
+
+        for _ in range(2):
+            msp(*example_inputs)
+
+        msq = convert_n_shadows_model(
+            msp, custom_convert_fn=custom_convert_fn, custom_convert_kwargs=kwargs
+        )
+        print(msq)
+        loggers_set_enabled(msq, True)
+        msq(*example_inputs)
+
+        results = extract_results_n_shadows_model(msq)
+        print_comparisons_n_shadows_model(results)
+
+
 class TestFXNumericSuiteCoreAPIsModels(FXNumericSuiteQuantizationTestCase):
     """
     Tests numeric suite core APIs on non-toy models.
diff --git a/torch/ao/ns/_numeric_suite_fx.py b/torch/ao/ns/_numeric_suite_fx.py
index db4bd87fb9734..92298c3d29b6a 100644
--- a/torch/ao/ns/_numeric_suite_fx.py
+++ b/torch/ao/ns/_numeric_suite_fx.py
@@ -752,6 +752,9 @@ def prepare_n_shadows_model(
     example_inputs: Any,
     qconfig_multi_mapping: QConfigMultiMapping,
     backend_config: BackendConfig,
+    custom_prepare_fn: Optional[Callable] = None,
+    custom_prepare_kwargs: Optional[Dict[str, Any]] = None,
+    custom_tracer: Any = None,
 ) -> torch.nn.Module:
     """
     Given a model with a graph with M ops such as
@@ -790,7 +793,10 @@ def prepare_n_shadows_model(
     4. add examples to docblocks
     """
 
-    tracer = quantize_fx.QuantizationTracer([], [])
+    if custom_tracer is None:
+        tracer = quantize_fx.QuantizationTracer([], [])
+    else:
+        tracer = custom_tracer
     mt = torch.fx.GraphModule(model, tracer.trace(model))
     # this is necessary to ensure logger FQNs get populated
     mt._node_name_to_scope = tracer.node_name_to_scope
@@ -834,7 +840,9 @@ def prepare_n_shadows_model(
             enumerate(subgraphs_dedup.items()):
         handle_subgraph(
             mt, subgraph_idx, match_name, nodes_in_this_subgraph,
-            qconfig_multi_mapping.qconfig_mappings_list, list_of_node_name_to_qconfig)
+            qconfig_multi_mapping.qconfig_mappings_list, list_of_node_name_to_qconfig,
+            custom_prepare_fn, custom_prepare_kwargs
+        )
 
     mt.recompile()
     return mt
@@ -862,7 +870,11 @@ def loggers_set_save_activations(
         if isinstance(child, OutputLogger):
             child.save_activations = save_activations
 
-def convert_n_shadows_model(model: GraphModule) -> GraphModule:
+def convert_n_shadows_model(
+    model: GraphModule,
+    custom_convert_fn: Optional[Callable] = None,
+    custom_convert_kwargs: Optional[Dict[str, Any]] = None
+) -> GraphModule:
     """
     Given a model from `prepare_n_shadows_model`, runs `convert_fx`
     on each shadow submodule.
@@ -872,8 +884,13 @@ def convert_n_shadows_model(model: GraphModule) -> GraphModule:
         # node name string match
         if node.name.startswith(SHADOW_WRAPPER_NODE_NAME_PREFIX):
             orig_mod = getattr(model, node.name)
-            converted_mod = torch.ao.quantization.quantize_fx.convert_fx(
-                orig_mod)
+            if custom_convert_fn is None:
+                converted_mod = torch.ao.quantization.quantize_fx.convert_fx(
+                    orig_mod)
+            else:
+                if custom_convert_kwargs is None:
+                    custom_convert_kwargs = {}
+                converted_mod = custom_convert_fn(orig_mod, **custom_convert_kwargs)
             setattr(model, node.name, converted_mod)
 
     return model
diff --git a/torch/ao/ns/fx/n_shadows_utils.py b/torch/ao/ns/fx/n_shadows_utils.py
index 85e9be6135f1c..c504a59c995d6 100644
--- a/torch/ao/ns/fx/n_shadows_utils.py
+++ b/torch/ao/ns/fx/n_shadows_utils.py
@@ -500,6 +500,8 @@ def handle_subgraph_candidate(
     fqn: Optional[str],
     list_of_node_name_to_qconfig: List[Dict[str, QConfigAny]],
     example_inputs: Any,
+    custom_prepare_fn: Optional[Callable] = None,
+    custom_prepare_kwargs: Dict[str, Any] = None,
 ) -> None:
     """
     Given a subgraph in `mt` and a subgraph candidate idx, inserts the
@@ -566,9 +568,24 @@ def handle_subgraph_candidate(
             .set_non_traceable_module_classes([OutputLogger, OutputComparisonLogger])
 
         # add a call to prepare_fx on the wrapper module
-        orig_mod_copy_wrapped = torch.ao.quantization.quantize_fx.prepare_fx(
-            orig_mod_copy_wrapped, qconfig_mapping, example_inputs=example_inputs,
-            prepare_custom_config=prepare_custom_config)
+        if custom_prepare_fn is None:
+            orig_mod_copy_wrapped = torch.ao.quantization.quantize_fx.prepare_fx(
+                orig_mod_copy_wrapped, qconfig_mapping, example_inputs=example_inputs,
+                prepare_custom_config=prepare_custom_config)
+        else:
+            if custom_prepare_kwargs is None:
+                custom_prepare_kwargs = {}
+            for kwarg_name in ["example_inputs", "prepare_custom_config", "qconfig_mapping"]:
+                assert kwarg_name not in custom_prepare_kwargs, f"cannot specify {kwarg_name} in custom_prepare_kwargs"
+            prepare_kwargs: Dict[str, Any] = {
+                "example_inputs": example_inputs,
+                "prepare_custom_config": prepare_custom_config,
+                "qconfig_mapping": qconfig_mapping
+            }
+            prepare_kwargs.update(custom_prepare_kwargs)
+            orig_mod_copy_wrapped = custom_prepare_fn(
+                orig_mod_copy_wrapped,
+                **prepare_kwargs)
 
         # attach the wrapper to the model
         attr_name = _get_attr_wrapper_name(subgraph_idx, subgraph_candidate_idx)
@@ -615,6 +632,8 @@ def handle_subgraph(
     nodes_in_this_subgraph: List[Any],
     qconfig_mappings: List[QConfigMapping],
     list_of_node_name_to_qconfig: List[Dict[str, QConfigAny]],
+    custom_prepare_fn: Optional[Callable] = None,
+    custom_prepare_kwargs: Dict[str, Any] = None,
 ) -> None:
     """
     Given a model `mt` and a subgraph_idx, creates the needed copies
@@ -690,7 +709,7 @@ def handle_subgraph(
         handle_subgraph_candidate(
             mt, subgraph_idx, subgraph_candidate_idx, first_node,
             last_node, fqn, list_of_node_name_to_qconfig,
-            example_inputs)
+            example_inputs, custom_prepare_fn, custom_prepare_kwargs)
 
 # TODO(future PR): redesign this to make it easier to consume outputs
 def group_results_by_subgraph(results: NSResultsType) -> Any:

From 3b06987aeeae2f35f0e709d70dd7ea4d4069e97f Mon Sep 17 00:00:00 2001
From: fduwjj <fduwjj@fb.com>
Date: Mon, 5 Dec 2022 23:43:07 +0000
Subject: [PATCH 1607/1922] [PT-D][Composability][1/N] Upstream NamedOptimizer
 from TorchRec (KeyedOptimizer in TR) (#89480)

In pytorch, the optim state_dict will always use number to index optimizer state_dict for parameters.

Now composability workstream need a FQN based way to index optimizer state_dict for parameters..

For example, SGD optimizer might have something in its `state_dict` like:

```
{'state':
  {0:
    {'momentum_buffer': tensor(...)},
  {1:
    {'momentum_buffer': tensor(...)},
  ...
}
'param_groups':
    [{'lr': 0.001, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'differentiable': False, 'params': [0, 1, 2, 3, 4, 5, 6, 7]}]
}
```

And in NamedOptimizer we want the `state_dict` can be:

```
{'state':
  {'net1.0.weight':
    {'momentum_buffer': tensor(...)},
  {'net1.0.bias':
    {'momentum_buffer': tensor(...)},
  ...
}
'param_groups':
    [{'lr': 0.001, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'differentiable': False, 'params': ['net1.0.weight', 'net1.0.bias', 'net2.0.weight', 'net2.0.bias', 'net3.weight', 'net3.bias', 'net4.1.weight', 'net4.1.bias']}]
}
```

We also want to support load_state_dict to enable optim `state_dict` override for NameOptimizer.

For the next couple PR/diffs, we also need to:
1. To make `NamedOptimizer` working with FSDP (like registering a hook for model wrapped with FSDP) and other PTD/PT components.
2. Make `NamedOptimizer` works well with apply_optim_in_backward
3. Upstream also `CombinedOptimizer`.

Differential Revision: [D41432088](https://our.internmc.facebook.com/intern/diff/D41432088/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D41432088/)!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89480
Approved by: https://github.com/rohan-varma
---
 .../distributed/optim/test_named_optimizer.py | 245 +++++++++++++++++
 torch/distributed/optim/__init__.py           |   1 +
 torch/distributed/optim/named_optimizer.py    | 258 ++++++++++++++++++
 3 files changed, 504 insertions(+)
 create mode 100644 test/distributed/optim/test_named_optimizer.py
 create mode 100644 torch/distributed/optim/named_optimizer.py

diff --git a/test/distributed/optim/test_named_optimizer.py b/test/distributed/optim/test_named_optimizer.py
new file mode 100644
index 0000000000000..880dbb382aa6a
--- /dev/null
+++ b/test/distributed/optim/test_named_optimizer.py
@@ -0,0 +1,245 @@
+# Owner(s): ["oncall: distributed"]
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+import torch.nn as nn
+
+from torch.distributed.optim import _NamedOptimizer
+
+
+class TestDummyModel(torch.nn.Module):
+    def __init__(self):
+        super(TestDummyModel, self).__init__()
+        torch.manual_seed(0)
+        self.net1 = nn.Sequential(nn.Linear(8, 16), nn.ReLU())
+        self.net2 = nn.Sequential(nn.Linear(16, 32), nn.ReLU())
+        self.net3 = nn.Linear(32, 64)
+        self.net4 = nn.Sequential(nn.ReLU(), nn.Linear(64, 8))
+
+    def forward(self, x):
+        return self.net4(self.net3(self.net2(self.net1(x))))
+
+
+class NamedOptimizerTest(unittest.TestCase):
+    def _compare_state_dict_group(self, group, named_group, assert_equal=True):
+        for key, val in group.items():
+            if key != "params":
+                self.assertTrue(
+                    key in named_group, f"{key} not in named optimizer state dict"
+                )
+                err_msg = (
+                    f"{key} state not equal" if assert_equal else f"{key} state equal"
+                )
+                if isinstance(val, torch.Tensor):
+                    fn = self.assertTrue if assert_equal else self.assertFalse
+                    fn(torch.allclose(val, named_group[key]), err_msg)
+                else:
+                    fn = self.assertEqual if assert_equal else self.assertNotEqual
+                    fn(val, named_group[key], err_msg)
+
+    def test_state_dict(self):
+        """Check that NamedOptimizer exposes the expected state dict
+        interface."""
+        m = TestDummyModel()
+        m_dup = TestDummyModel()
+        optim_1 = torch.optim.SGD(
+            [
+                {"params": m.net1.parameters()},
+                {"params": m.net3.parameters(), "lr": 1e-3},
+            ],
+            lr=1e-2,
+            momentum=0.9,
+        )
+
+        optim_2 = torch.optim.Adam(
+            [
+                {"params": m.net2.parameters()},
+                {"params": m.net4.parameters(), "lr": 1e-5},
+            ]
+        )
+
+        named_optim_1 = _NamedOptimizer(
+            m_dup.named_parameters(),
+            torch.optim.SGD,
+            [
+                {"params": m_dup.net1.parameters()},
+                {"params": m_dup.net3.parameters(), "lr": 1e-3},
+            ],
+            lr=1e-2,
+            momentum=0.9,
+        )
+
+        named_optim_2 = _NamedOptimizer(
+            m_dup.named_parameters(),
+            torch.optim.Adam,
+            [
+                {"params": m_dup.net2.parameters()},
+                {"params": m_dup.net4.parameters(), "lr": 1e-5},
+            ],
+        )
+        for i in range(2):
+            x = torch.rand(5, 8)
+            y = m(x)
+            y.sum().backward()
+            optim_1.step()
+            optim_2.step()
+
+            y = m_dup(x)
+            y.sum().backward()
+            named_optim_1.step()
+            named_optim_2.step()
+
+        sd_1 = optim_1.state_dict()
+        sd_2 = optim_2.state_dict()
+        named_sd_1 = named_optim_1.state_dict()
+        named_sd_2 = named_optim_2.state_dict()
+
+        # Compare "state" in optim state dict
+        self._compare_state_dict_group(
+            sd_1["state"][0],
+            named_sd_1["state"]["net1.0.weight"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            sd_2["state"][1],
+            named_sd_2["state"]["net2.0.bias"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            sd_1["state"][2],
+            named_sd_1["state"]["net3.weight"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            sd_2["state"][3],
+            named_sd_2["state"]["net4.1.bias"],
+            assert_equal=True,
+        )
+
+        # Compare "param_groups" in optim state dict
+        self._compare_state_dict_group(
+            sd_1["param_groups"][0],
+            named_sd_1["param_groups"][0],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            sd_2["param_groups"][1], named_sd_2["param_groups"][1], assert_equal=True
+        )
+
+    def test_load_state_dict(self):
+        """Check that NamedOptimizer exposes the expected state dict
+        interface."""
+        m = TestDummyModel()
+        named_optim_1 = _NamedOptimizer(
+            m.named_parameters(),
+            torch.optim.SGD,
+            lr=1e-2,
+            momentum=0.9,
+        )
+
+        for _ in range(2):
+            x = torch.rand(5, 8)
+            y = m(x)
+            y.sum().backward()
+            named_optim_1.step()
+
+        state_dict_to_load = named_optim_1.state_dict()
+
+        named_optim_2 = _NamedOptimizer(
+            m.named_parameters(),
+            torch.optim.SGD,
+            lr=1e-2,
+            momentum=0.6,
+        )
+
+        for _ in range(2):
+            x = torch.rand(5, 8)
+            y = m(x)
+            y.sum().backward()
+            named_optim_2.step()
+
+        state_dict_before_load = named_optim_2.state_dict()
+
+        # Compare "state" in optim state dict
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net1.0.weight"],
+            state_dict_before_load["state"]["net1.0.weight"],
+            assert_equal=False,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net2.0.bias"],
+            state_dict_before_load["state"]["net2.0.bias"],
+            assert_equal=False,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net3.weight"],
+            state_dict_before_load["state"]["net3.weight"],
+            assert_equal=False,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net4.1.bias"],
+            state_dict_before_load["state"]["net4.1.bias"],
+            assert_equal=False,
+        )
+
+        named_optim_2.load_state_dict(state_dict_to_load)
+        state_dict_after_load = named_optim_2.state_dict()
+
+        # Compare "state" in optim state dict
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net1.0.weight"],
+            state_dict_after_load["state"]["net1.0.weight"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net2.0.bias"],
+            state_dict_after_load["state"]["net2.0.bias"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net3.weight"],
+            state_dict_after_load["state"]["net3.weight"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net4.1.bias"],
+            state_dict_after_load["state"]["net4.1.bias"],
+            assert_equal=True,
+        )
+
+    def test_load_state_dict_error(self):
+        m = TestDummyModel()
+        named_optim_1 = _NamedOptimizer(
+            m.named_parameters(),
+            torch.optim.SGD,
+            lr=1e-2,
+            momentum=0.9,
+        )
+
+        for _ in range(2):
+            x = torch.rand(5, 8)
+            y = m(x)
+            y.sum().backward()
+            named_optim_1.step()
+
+        state_dict_to_load = named_optim_1.state_dict()
+
+        named_optim_2 = _NamedOptimizer(
+            m.named_parameters(),
+            torch.optim.SGD,
+            lr=1e-2,
+            momentum=0.6,
+        )
+
+        err_msg = (
+            "Expects the optim to be initialized before load but found not initialized"
+        )
+        with self.assertRaisesRegex(ValueError, err_msg):
+            named_optim_2.load_state_dict(state_dict_to_load)
diff --git a/torch/distributed/optim/__init__.py b/torch/distributed/optim/__init__.py
index 950222b8d5fa8..9a83c7aabf772 100644
--- a/torch/distributed/optim/__init__.py
+++ b/torch/distributed/optim/__init__.py
@@ -16,6 +16,7 @@
 from .functional_rmsprop import _FunctionalRMSprop
 from .functional_rprop import _FunctionalRprop
 from .functional_adamax import _FunctionalAdamax
+from .named_optimizer import _NamedOptimizer
 from .utils import as_functional_optim
 from .apply_optimizer_in_backward import _apply_optimizer_in_backward
 
diff --git a/torch/distributed/optim/named_optimizer.py b/torch/distributed/optim/named_optimizer.py
new file mode 100644
index 0000000000000..e2eea5099c0f9
--- /dev/null
+++ b/torch/distributed/optim/named_optimizer.py
@@ -0,0 +1,258 @@
+import logging
+import warnings
+
+from copy import deepcopy
+from typing import Any, Collection, Dict, List, Mapping, Union
+
+import torch
+from torch import optim
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+
+
+__all__ = ["_NamedOptimizer"]
+
+logger = logging.getLogger(__name__)
+
+
+class _NamedOptimizer(optim.Optimizer):
+    """
+    ``_NamedOptimizer`` takes a dict of parameters and exposes ``state_dict`` by
+    parameter key. We replace the original key (number) in an optim to the
+    fully qualifed name (FQN) string. User can initialize the optim as they
+    initialize a PyTorch optim, the only difference is that they also need to
+    pass in the FQN of each parameters.
+
+    Args:
+        named_parameters (Mapping[str, Union[torch.Tensor, ShardedTensor]]):
+            Mapping from FQN to parameter.
+        optimizer_class (optim.Optimizer):
+            The class of optimizer to instantiate.
+        param_groups (Collection[Mapping[str, Any]]):
+            `param_groups` to pass to optimizer if specified.
+            The key of the inner map needs to be FQNs.
+            Default: None
+        args: arguments to pass to the optimizer constructor.
+        kwargs: arguments to pass to the optimizer constructor.
+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> from torch import optim
+        >>> from torch.distributed.optim import _NamedOptimizer
+        >>>
+        >>> # Define the named optimizer.
+        >>> m = Model(...)
+        >>> named_optim = _NamedOptimizer(m.named_parameters(), optim.SGD)
+        >>> # Forward pass + backward pass.
+        >>> named_optim.step()
+        >>> ...
+        >>> # Call state_dict for the named optimizer returns a FQN state_dict.
+        >>> named_optim.state_dict()
+
+    Warning: This API is still in development and subject to change.
+
+    TODO: Add tutorial for _NamedOptimizer.
+    TODO: Add documentation in the docstring for the public attributes
+          like self.param_groups and self.named_parameters.
+    """
+
+    def __init__(
+        self,
+        named_parameters: Mapping[str, Union[torch.Tensor, ShardedTensor]],
+        optimizer_class: optim.Optimizer,
+        param_groups: Collection[Mapping[str, Any]] = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        torch._C._log_api_usage_once("torch.distributed.optim._NamedOptimizer")
+        self.param_groups: Collection[Mapping[str, Any]] = param_groups  # type: ignore[assignment]
+        self.named_parameters = dict(named_parameters)
+        params_for_optimizer = (
+            self.named_parameters.values() if param_groups is None else param_groups
+        )
+        self._optimizer = optimizer_class(  # type: ignore[operator]
+            params_for_optimizer,
+            *args,
+            **kwargs,
+        )
+        # TODO: Add param_groups validations and unit tests.
+        if param_groups is None:
+            self.ordered_param_keys = list(self.named_parameters.keys())
+        else:
+            warnings.warn(
+                "Since we pass in param_groups, we will use param_groups to "
+                "initialize the optimizer, not all parameters of the module."
+            )
+            param_to_key = {param: key for key, param in self.named_parameters.items()}  # type: ignore[misc, has-type]
+            ordered_param_keys = []
+            for group in param_groups:
+                for param in group["params"]:
+                    if param not in param_to_key:
+                        raise ValueError(
+                            f"Expect param name {param} found in param group but is missing."
+                        )
+                    ordered_param_keys.append(param_to_key[param])
+            self.ordered_param_keys = ordered_param_keys
+
+    def state_dict(self) -> Dict[str, Any]:
+        """
+        Return the ``state_dict`` of the optimzer. Instead of using number to index
+        parameters, we will use module fully qualifed name (FQN) as the key.
+        """
+        state_dict = self._optimizer.state_dict()
+        param_groups = state_dict["param_groups"]
+
+        ret_state = {
+            self.ordered_param_keys[st_key]: state_val
+            for st_key, state_val in state_dict["state"].items()
+        }
+
+        ret_groups = []
+        for group in param_groups:
+            param_keys = []
+            for param in group["params"]:
+                param_keys.append(self.ordered_param_keys[param])
+            ret_group = {"params": sorted(param_keys)}
+            for k, v in group.items():
+                if k != "params":
+                    ret_group[k] = deepcopy(v)
+            ret_groups.append(ret_group)
+
+        return {"state": ret_state, "param_groups": ret_groups}
+
+    def step(self):
+        """
+        Performs a single optimization step.
+
+        This will call :meth:`torch.optim.Optimizer.step` on the wrapped
+        optimizer.
+        """
+        self._optimizer.step()
+
+    def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
+        """
+        This public function defines the default behavior to load a state_dict
+        for ``_NamedOptimizer``.
+
+        Sample Code
+        ```
+            my_model = MyModule()
+            optimizer = _NamedOptimizer(my_model.named_parameters(), Adagrad)
+            ...
+
+            optim_state_dict = optimizer.state_dict()
+            ...
+            ...
+
+            optimizer.load_state_dict(optim_state_dict)
+            ...
+        ```
+        Args:
+            state_dict (Dict[str, Any]) : A ``state_dict`` to load into the optimizer.
+                Note that this state dict update is performed in place.
+
+        .. note:: PyTorch is using lazy init to initialize the optim states.
+            So it is possible that there is no optim state when user call
+            ``load_state_dict`` and for ``_NamedOptimizer`` we make it stricter
+            that users can only call ``load_state_dict`` after the state is initialized.
+            By doing this, we can validate the optim ``state_dict`` to be loaded.
+        """
+        new_state_dict = self._optimizer.state_dict()
+        state = state_dict["state"]
+        new_state = new_state_dict["state"]
+        if len(new_state) == 0:
+            raise ValueError(
+                "Expects the optim to be initialized before load but found not initialized."
+            )
+
+        # Load state of state_dict
+        if len(new_state) != len(state):
+            raise ValueError(
+                f"Expects equal length as {len(new_state)} in `state_dict` state length but found {len(state)}."
+            )
+        for idx, param_key in enumerate(self.ordered_param_keys):
+            if param_key not in state.keys():
+                raise ValueError(
+                    f"Expect {param_key} as a parameter in `state_dict` state but not found."
+                )
+            if len(state[param_key]) != len(new_state[idx]):
+                raise ValueError(
+                    f"Expects equal length as {len(new_state[idx])} for parameter {param_key} but found: {len(state[param_key])}"
+                )
+            # Iterate through all optimizer states.
+            for state_key, state_val in new_state[idx].items():
+                if state_key not in state[param_key]:
+                    raise ValueError(
+                        f"Expects state {state_key} for parameter {param_key} but not found."
+                    )
+
+                src_state_val = state[param_key][state_key]
+                if isinstance(state_val, ShardedTensor):
+                    assert isinstance(src_state_val, ShardedTensor)
+                    num_shards = len(state_val.local_shards())
+                    num_new_shards = len(src_state_val.local_shards())
+                    if num_shards != num_new_shards:
+                        raise ValueError(
+                            f"Expects equal number of shards as {num_new_shards} but found {num_shards} for {param_key}/{state_key}"
+                        )
+                    for shard, src_shard in zip(
+                        state_val.local_shards(), src_state_val.local_shards()
+                    ):
+                        shard.tensor.detach().copy_(src_shard.tensor)
+                elif isinstance(state_val, torch.Tensor):
+                    assert isinstance(src_state_val, torch.Tensor)
+                    state_val.detach().copy_(src_state_val)
+                else:
+                    new_state[idx][state_key] = deepcopy(src_state_val)
+
+        # Load param_groups of state_dict
+        src_param_groups = state_dict["param_groups"]
+        new_param_groups = new_state_dict["param_groups"]
+
+        if len(new_param_groups) != len(src_param_groups):
+            raise ValueError(
+                f"Expects equal param_groups count as {len(new_param_groups)} in `state_dict` but found {len(src_param_groups)}."
+            )
+        src_group_map = {}
+        for group in src_param_groups:
+            param_keys = []
+            for param_key in group["params"]:
+                param_keys.append(param_key)
+            src_group_map[_gen_param_group_key(param_keys)] = group
+        new_group_map = {}
+        for new_group in new_param_groups:
+            param_keys = []
+            for param_key in new_group["params"]:
+                param_keys.append(self.ordered_param_keys[param_key])  # type: ignore[call-overload]
+            new_group_map[_gen_param_group_key(param_keys)] = new_group
+        for group_key, new_group in new_group_map.items():
+            if group_key not in src_group_map:
+                raise ValueError(
+                    f"Expects group {group_key} to be in `state_dict` but is missing"
+                )
+            src_group = src_group_map[group_key]
+            if len(src_group) != len(new_group):
+                raise ValueError(
+                    f"Expects equal param_group size as {len(new_group)} for group {group_key} but found {len(src_group)}."
+                )
+            for k in src_group:
+                if k not in new_group:
+                    raise ValueError(
+                        f"Expects group key {k} to be in group {group_key} in `state_dict` but is missing."
+                    )
+                if k != "params":
+                    new_group[k] = deepcopy(src_group[k])
+
+        self._optimizer.load_state_dict(new_state_dict)
+
+    # pyre-ignore [2]
+    def add_param_group(self, param_group: Any) -> None:
+        raise NotImplementedError(
+            "add_param_group not supported yet and might be implemented soon."
+        )
+
+
+def _gen_param_group_key(param_keys: List[str]) -> str:
+    """
+    Concatenate all param keys as a unique indentifier for one param group.
+    """
+    return "/".join(sorted(param_keys))

From 8fb2b198c6e3963614d14b4514b5cfe8e806a225 Mon Sep 17 00:00:00 2001
From: Ram Rachum <ram@rachum.com>
Date: Tue, 6 Dec 2022 04:34:56 +0000
Subject: [PATCH 1608/1922] Fix exception causes in fx, nn and onnx packages
 (#90134)

This is a continuation of #90118

@kit1980
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90134
Approved by: https://github.com/kit1980
---
 .../unification/multipledispatch/dispatcher.py            | 8 ++++----
 torch/fx/interpreter.py                                   | 2 +-
 torch/fx/operator_schemas.py                              | 4 ++--
 torch/fx/passes/shape_prop.py                             | 4 ++--
 torch/nn/parallel/distributed.py                          | 4 ++--
 torch/nn/utils/parametrize.py                             | 2 +-
 torch/onnx/_internal/onnx_proto_utils.py                  | 4 ++--
 torch/onnx/utils.py                                       | 2 +-
 torch/onnx/verification.py                                | 4 ++--
 9 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/torch/fx/experimental/unification/multipledispatch/dispatcher.py b/torch/fx/experimental/unification/multipledispatch/dispatcher.py
index 126f964a91475..14136b359dc61 100644
--- a/torch/fx/experimental/unification/multipledispatch/dispatcher.py
+++ b/torch/fx/experimental/unification/multipledispatch/dispatcher.py
@@ -251,17 +251,17 @@ def __call__(self, *args, **kwargs):
         types = tuple([type(arg) for arg in args])
         try:
             func = self._cache[types]
-        except KeyError:
+        except KeyError as e:
             func = self.dispatch(*types)
             if not func:
                 raise NotImplementedError(
                     'Could not find signature for %s: <%s>' %
-                    (self.name, str_signature(types)))
+                    (self.name, str_signature(types))) from e
             self._cache[types] = func
         try:
             return func(*args, **kwargs)
 
-        except MDNotImplementedError:
+        except MDNotImplementedError as e:
             funcs = self.dispatch_iter(*types)
             next(funcs)  # burn first
             for func in funcs:
@@ -273,7 +273,7 @@ def __call__(self, *args, **kwargs):
             raise NotImplementedError(
                 "Matching functions for "
                 "%s: <%s> found, but none completed successfully" % (
-                    self.name, str_signature(types),),)
+                    self.name, str_signature(types),),) from e
 
     def __str__(self):
         return "<dispatched %s>" % self.name
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index 6428d4c5c3bb5..c4d1cf26a8592 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -201,7 +201,7 @@ def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : D
                 if len(args) > 0:
                     return args[0]
                 else:
-                    raise RuntimeError(f'Expected positional argument for parameter {target}, but one was not passed in!')
+                    raise RuntimeError(f'Expected positional argument for parameter {target}, but one was not passed in!') from si
 
     @compatibility(is_backward_compatible=True)
     def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
diff --git a/torch/fx/operator_schemas.py b/torch/fx/operator_schemas.py
index 92f5246c313e1..34708b9d93ae8 100644
--- a/torch/fx/operator_schemas.py
+++ b/torch/fx/operator_schemas.py
@@ -355,9 +355,9 @@ def normalize_module(
     """
     try:
         submod = root.get_submodule(target)
-    except AttributeError:
+    except AttributeError as e:
         raise RuntimeError(f"Tried to normalize node with target {target} but root did not "
-                           f"have that target!")
+                           f"have that target!") from e
     if hasattr(submod.__class__, '__name__'):
         classname = submod.__class__.__name__
         if getattr(torch.nn, classname, None) == submod.__class__:
diff --git a/torch/fx/passes/shape_prop.py b/torch/fx/passes/shape_prop.py
index 4fd8ce8af9347..a0807fae74ca6 100644
--- a/torch/fx/passes/shape_prop.py
+++ b/torch/fx/passes/shape_prop.py
@@ -141,12 +141,12 @@ def run_node(self, n : Node) -> Any:
                 result = super().run_node(n)
             finally:
                 self.module = self.real_module
-        except Exception:
+        except Exception as e:
             traceback.print_exc()
             raise RuntimeError(
                 f"ShapeProp error for: node={n.format_node()} with "
                 f"meta={n.meta}"
-            )
+            ) from e
 
         found_tensor = False
 
diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py
index b6673874eecca..40e2cba9a7458 100644
--- a/torch/nn/parallel/distributed.py
+++ b/torch/nn/parallel/distributed.py
@@ -1625,10 +1625,10 @@ def _register_fused_optim(
         )
         try:
             overlapped_optim.register_ddp(self)
-        except NotImplementedError:
+        except NotImplementedError as e:
             raise RuntimeError(
                 f"{optim} does not support overlapped DDP. Please file an issue to PyTorch or the respective owner of {optim}."
-            )
+            ) from e
 
     def _distributed_broadcast_coalesced(
         self, tensors, buffer_size, authoritative_rank=0
diff --git a/torch/nn/utils/parametrize.py b/torch/nn/utils/parametrize.py
index 801a1e80c1aac..729d66e4c409d 100644
--- a/torch/nn/utils/parametrize.py
+++ b/torch/nn/utils/parametrize.py
@@ -657,7 +657,7 @@ def remove_parametrizations(
                                            "for a parameter that is an instance of a tensor subclass requires "
                                            "set_() to be implemented correctly for the tensor subclass. Either "
                                            "set leave_parametrized=False or provide a working implementation for "
-                                           "set_() in the tensor subclass.")
+                                           "set_() in the tensor subclass.") from e
     else:
         if leave_parametrized:
             # We cannot use no_grad because we need to know whether one or more
diff --git a/torch/onnx/_internal/onnx_proto_utils.py b/torch/onnx/_internal/onnx_proto_utils.py
index f557089707b88..6c8b1e420ec33 100644
--- a/torch/onnx/_internal/onnx_proto_utils.py
+++ b/torch/onnx/_internal/onnx_proto_utils.py
@@ -71,8 +71,8 @@ def _add_onnxscript_fn(
     # TODO(titaiwang): remove this when onnx becomes dependency
     try:
         import onnx
-    except ImportError:
-        raise errors.OnnxExporterError("Module onnx is not installed!")
+    except ImportError as e:
+        raise errors.OnnxExporterError("Module onnx is not installed!") from e
 
     # For > 2GB model, onnx.load_fromstring would fail. However, because
     # in _export_onnx, the tensors should be saved separately if the proto
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index 36d7fdb75762c..fd0edef773a6b 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -1620,7 +1620,7 @@ def _export(
                 try:
                     _C._check_onnx_proto(proto, full_check=True)
                 except RuntimeError as e:
-                    raise errors.CheckerError(e)
+                    raise errors.CheckerError(e) from e
     finally:
         assert GLOBALS.in_onnx_export
         GLOBALS.in_onnx_export = False
diff --git a/torch/onnx/verification.py b/torch/onnx/verification.py
index 8c3d63a268bae..276a1d209ca9b 100644
--- a/torch/onnx/verification.py
+++ b/torch/onnx/verification.py
@@ -107,8 +107,8 @@ def _ort_session(
 ):
     try:
         import onnxruntime  # type: ignore[import]
-    except ImportError:
-        raise ImportError("onnxruntime is required for export verification.")
+    except ImportError as e:
+        raise ImportError("onnxruntime is required for export verification.") from e
 
     if ort_providers is None:
         ort_providers = _ORT_PROVIDERS

From f65249b75c631adaec66e5ed12208a7f1c6a468b Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Tue, 6 Dec 2022 05:08:44 +0000
Subject: [PATCH 1609/1922] [dynamo] Rewrite addcdiv in dynamo to its
 constituent ops (#90227)

This avoids a graph break when `value` is used. This fixes a graph break in the variants of Adam and Adagrad optimizers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90227
Approved by: https://github.com/jansel
---
 test/dynamo/test_functions.py    |  6 ++++++
 torch/_dynamo/variables/torch.py | 15 +++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py
index dd15a1562dbd2..652b68921fc07 100644
--- a/test/dynamo/test_functions.py
+++ b/test/dynamo/test_functions.py
@@ -68,6 +68,12 @@ def test_inline_jit_annotations(x):
     def test_add(a, b):
         return a + b
 
+    @make_test
+    def test_addcdiv(a, b, c):
+        # dynamo decomposes this to avoid a graph break when
+        # the value kwarg is populated
+        return torch.addcdiv(a, b, c, value=5.0)
+
     @make_test
     def test_is_not_null(a, b):
         if a is not None and b is not None:
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 2edb462d0c0bb..35ca0d7a48340 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -377,6 +377,21 @@ def get_state_from_generator():
                 ),
                 **options,
             )
+        elif (
+            self.value == torch.addcdiv
+            and len(args) == 3
+            and "value" in kwargs
+            and len(kwargs) == 1
+        ):
+            # decompose addcdiv into constituent ops, prevents a graph break due to converting
+            # value to a scalar
+            result = TorchVariable(torch.div, **options).call_function(tx, args[1:], {})
+            result = TorchVariable(torch.mul, **options).call_function(
+                tx, [result, kwargs["value"]], {}
+            )
+            return TorchVariable(torch.add, **options).call_function(
+                tx, [args[0], result], {}
+            )
         else:
             any_symints_or_symfloats = any(
                 [isinstance(x, DynamicShapeVariable) for x in args]

From 9a784c99e84d73cd800cac0ce3181b9039393a50 Mon Sep 17 00:00:00 2001
From: Masaki Kozuki <mkozuki@nvidia.com>
Date: Tue, 6 Dec 2022 05:13:47 +0000
Subject: [PATCH 1610/1922] [ReduceOp] ameliorate custom `__eq__` (#90088)

Improve the completeness of `ReduceOp.__eq__`.

Should support the equal operator with the first argument of `RedOpType` and the second of `ReduceOp` in a follow-up.

Fixes #90072

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90088
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_common.py  | 22 ++++++++++++++++++++++
 torch/csrc/distributed/c10d/Types.hpp |  1 +
 torch/csrc/distributed/c10d/init.cpp  | 11 ++++++++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index c03a68228990a..0dd8b42f1d9a1 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1696,6 +1696,28 @@ def test_reduceop_pickle(self):
             reduce_op = dist._make_nccl_premul_sum(scale)
             self.assertEqual(pickle.loads(pickle.dumps(reduce_op)), reduce_op)
 
+    # Ref: https://github.com/pytorch/pytorch/issues/90072
+    def test_reduceop_equal(self):
+        not_reduceop = "abc"
+        for reduce_op in (
+            c10d.ReduceOp.SUM, c10d.ReduceOp.AVG, c10d.ReduceOp.PRODUCT, c10d.ReduceOp.MIN, c10d.ReduceOp.MAX,
+            c10d.ReduceOp.BAND, c10d.ReduceOp.BOR, c10d.ReduceOp.BXOR,
+        ):
+            reduce_op_obj = c10d.ReduceOp(reduce_op)
+            # this calls `ReduceOp.__eq__(self, other)`
+            self.assertEqual(reduce_op_obj, reduce_op_obj)
+            self.assertEqual(reduce_op_obj, reduce_op)
+            self.assertNotEqual(reduce_op_obj, not_reduceop)
+            self.assertNotEqual(reduce_op, not_reduceop)
+            # TODO(crcrpar): This needs to be `assertEqual` for the associativity even though
+            # the comparison of `RedOpType` and `ReduceOp` sounds less likely to happen compared
+            # to that of `ReduceOp` and `RedOptype`.
+            # this calls `RedOpType.__eq__(self, other)`
+            self.assertNotEqual(reduce_op, reduce_op_obj)
+
+            self.assertFalse(None in (reduce_op, reduce_op_obj))
+            self.assertFalse(not_reduceop in (reduce_op, reduce_op_obj))
+
 
 if __name__ == "__main__":
     assert (
diff --git a/torch/csrc/distributed/c10d/Types.hpp b/torch/csrc/distributed/c10d/Types.hpp
index be20fcadba645..9c163af5cb8e7 100644
--- a/torch/csrc/distributed/c10d/Types.hpp
+++ b/torch/csrc/distributed/c10d/Types.hpp
@@ -85,6 +85,7 @@ struct TORCH_API ReduceOp : torch::CustomClassHolder {
     return *this == static_cast<std::uint8_t>(other);
   }
 
+  // todo(crcrpar): Handle `RedOpType::PREMUL_SUM` with its scaling factor.
   bool operator==(const ReduceOp& other) {
     return *this == other.op_;
   }
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 9a9699c5e12f1..4f03233beff05 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -608,17 +608,26 @@ This class does not support ``__members__`` property.)");
   // take hash of `::c10d::ReduceOp`. To avoid losing these functionality, here
   // I define some member methods.
   reduce_op
+      // todo(crcrpar): Support `RedOpType == ReduceOp`.
       .def(
+          // This calls `operator==(const ReduceOp::RedOpType)`
           "__eq__",
           [](const ::c10d::ReduceOp& self,
              const ::c10d::ReduceOp::RedOpType& other) {
             return self == other;
           })
       .def(
+          // This calls `operator==(const ReduceOp)` for the future support of
+          // `PREMUL_SUM` comparison
           "__eq__",
           [](const ::c10d::ReduceOp& self, const ::c10d::ReduceOp& other) {
-            return self == other.op_;
+            return self == other;
           })
+      .def(
+          // With the above custom `__eq__`'s, I have to manually support the
+          // other types.
+          "__eq__",
+          [](const ::c10d::ReduceOp& self, py::object) { return false; })
       .def(
           "__hash__",
           [](const ::c10d::ReduceOp& self) {

From b3b0b035107f34c32742c0a7bce40eb9b6f72b73 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Tue, 6 Dec 2022 05:29:46 +0000
Subject: [PATCH 1611/1922] Fix issue 38095 TODO in onnx/test_utility_funs.py
 (#90085)

Fix TODO related to https://github.com/pytorch/pytorch/issues/38095

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90085
Approved by: https://github.com/BowenBao
---
 test/onnx/test_utility_funs.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py
index 7e23b06e55413..25ee698fd6d0e 100644
--- a/test/onnx/test_utility_funs.py
+++ b/test/onnx/test_utility_funs.py
@@ -603,8 +603,7 @@ def forward(self, x):
         params = list(params_dict.values())
         self.assertEqual(len(params), 1)
         weight = params[0]
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(weight, torch.tensor([2, 3, 4, 5, 6]))
+        self.assertEqual(weight, torch.tensor([2.0, 3.0, 4.0, 5.0, 6.0]))
 
     def test_constant_fold_sub(self):
         class Module(torch.nn.Module):
@@ -635,8 +634,7 @@ def forward(self, x):
         params = list(params_dict.values())
         self.assertEqual(len(params), 1)
         weight = params[0]
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(weight, torch.tensor([0, -1, -2, -3, -4]))
+        self.assertEqual(weight, torch.tensor([0.0, -1.0, -2.0, -3.0, -4.0]))
 
     def test_constant_fold_sqrt(self):
         class Module(torch.nn.Module):

From 829f0c731c3d64031bdf95175b0e40d07c906c4d Mon Sep 17 00:00:00 2001
From: Yanli Zhao <yanlizhao@fb.com>
Date: Tue, 6 Dec 2022 05:39:55 +0000
Subject: [PATCH 1612/1922] remove backward hook in memory_tracker (#90143)

remove backward hook in memory_tracker, as it does not work well with jagged tensor in some cases, it is OK to remove this hook for now as it does not really track any stats

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90143
Approved by: https://github.com/rohan-varma
---
 torch/distributed/_tools/memory_tracker.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torch/distributed/_tools/memory_tracker.py b/torch/distributed/_tools/memory_tracker.py
index f401304d67f6d..477c59021bd48 100644
--- a/torch/distributed/_tools/memory_tracker.py
+++ b/torch/distributed/_tools/memory_tracker.py
@@ -105,8 +105,10 @@ def start_monitor(self, root_module: nn.Module) -> None:
             # the memory stats tracked here may not completely accurate.
             h1 = m.register_forward_pre_hook(self._create_pre_forward_hook(name))
             h2 = m.register_forward_hook(self._create_post_forward_hook(name))
-            h3 = m.register_backward_hook(self._create_backward_hook(name))
-            self._hooks.extend([h1, h2, h3])
+            # it does not work well with jagged tensor somehow, the root cause is not
+            # clear and remove it for now as it does not really capture important info.
+            # h3 = m.register_backward_hook(self._create_backward_hook(name))
+            self._hooks.extend([h1, h2])
         torch.cuda.empty_cache()
         assert getattr(self, "profile_mode", None) is None
         self.profile_mode = MemoryProfileDispatchMode(self)

From 8b53eb3780cd4ac8d295f0afb24401bb7ca2ec9f Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 6 Dec 2022 01:40:00 +0000
Subject: [PATCH 1613/1922] [dtensor] remove torchgen function schema and parse
 manually (#90106)

This PR get rids of torchgen FunctionSchema parsing and parse
it manually, it should resolve torchgen package issue and also
provide some perf wins when running DTensor eagerly
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90106
Approved by: https://github.com/awgu
---
 test/distributed/_tensor/test_common_rules.py | 29 +++++++--------
 torch/distributed/_tensor/dispatch.py         | 35 ++++++++++---------
 2 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/test/distributed/_tensor/test_common_rules.py b/test/distributed/_tensor/test_common_rules.py
index 7e7a5c7654acb..fe89b6c4c40d7 100644
--- a/test/distributed/_tensor/test_common_rules.py
+++ b/test/distributed/_tensor/test_common_rules.py
@@ -16,13 +16,10 @@
     DTensorTestBase,
     with_comms,
 )
-from torchgen.model import FunctionSchema
+from torch._C import parse_schema
 
 
 class CommonRulesTest(DTensorTestBase):
-    def parse_schema(self, schema_str):
-        return FunctionSchema.parse(schema_str)
-
     @property
     def world_size(self) -> int:
         # hard code world size to 4 as we need to test
@@ -34,7 +31,7 @@ def test_einop_basic_propagation(self):
         # plain einsum, mm
         mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
 
-        func_schema = self.parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
+        func_schema = parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
         # propagate col-wise sharding
         mat1, mat2 = [-1, -1], [-1, 0]
         mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
@@ -75,7 +72,7 @@ def test_einop_basic_propagation(self):
     def test_einop_pointwise_propagation(self):
         mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
 
-        func_schema = self.parse_schema(
+        func_schema = parse_schema(
             "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
         )
         # addition
@@ -126,7 +123,7 @@ def test_einop_merge_sharding(self):
         )
         mesh = DeviceMesh(self.device_type, mesh_shape)
 
-        func_schema = self.parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
+        func_schema = parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
 
         mat1, mat2 = [0, -1], [-1, 1]
         mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 4]))
@@ -146,7 +143,7 @@ def test_einop_linearity(self):
         )
         mesh = DeviceMesh(self.device_type, mesh_shape)
 
-        mm_func_schema = self.parse_schema(
+        mm_func_schema = parse_schema(
             "aten::mm(Tensor self, Tensor mat2) -> Tensor"
         )
 
@@ -180,7 +177,7 @@ def test_einop_linearity(self):
 
         # einop prop with linearity on point-wise, should give back suggestion
         # on converting placements to partial
-        add_func_schema = self.parse_schema(
+        add_func_schema = parse_schema(
             "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
         )
         mat1, mat2 = [0, -1], [0, -1]
@@ -205,7 +202,7 @@ def test_einop_multi_sharding_on_mesh_dim(self):
         mesh_shape = torch.arange(self.world_size)
         mesh = DeviceMesh(self.device_type, mesh_shape)
 
-        func_schema = self.parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
+        func_schema = parse_schema("aten::mm(Tensor self, Tensor mat2) -> Tensor")
         mat1, mat2 = [0, -1], [0, -1]
         mat1_spec = DTensorSpec.from_dim_map(mesh, mat1, [], shape=torch.Size([8, 12]))
         mat2_spec = DTensorSpec.from_dim_map(mesh, mat2, [], shape=torch.Size([12, 4]))
@@ -229,7 +226,7 @@ def test_einop_errors(self):
         )
         mesh = DeviceMesh(self.device_type, mesh_shape)
 
-        func_schema = self.parse_schema(
+        func_schema = parse_schema(
             "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
         )
         mat1, mat2 = [0, -1], [1, -1]
@@ -243,7 +240,7 @@ def test_einop_errors(self):
     def test_pointwise_rules_broadcasting(self):
         mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
 
-        func_schema = self.parse_schema(
+        func_schema = parse_schema(
             "where.self(Tensor condition, Tensor self, Tensor other) -> Tensor"
         )
         inp1, inp2, inp3 = [0], [], [-1, -1]
@@ -265,7 +262,7 @@ def test_pointwise_rules_broadcasting(self):
     def test_pointwise_rules_suggestion(self):
         mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
 
-        func_schema = self.parse_schema(
+        func_schema = parse_schema(
             "aten::lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor"
         )
         # propagate point-wise sharding
@@ -293,7 +290,7 @@ def test_pointwise_multi_sharding_on_mesh_dim(self):
         )
         mesh = DeviceMesh(self.device_type, mesh_shape)
 
-        func_schema = self.parse_schema(
+        func_schema = parse_schema(
             "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
         )
 
@@ -337,7 +334,7 @@ def test_pointwise_enforce_sharding_multi_sharding_on_mesh_dim(self):
         )
         mesh = DeviceMesh(self.device_type, mesh_shape)
 
-        func_schema = self.parse_schema(
+        func_schema = parse_schema(
             "aten::add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)"
         )
 
@@ -366,7 +363,7 @@ def test_pointwise_enforce_sharding_multi_sharding_on_mesh_dim(self):
     def test_reduction_rule(self):
         mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
 
-        func_schema = self.parse_schema(
+        func_schema = parse_schema(
             "aten::sum(Tensor self, *, ScalarType? dtype=None) -> Tensor"
         )
         # reduction on a 2d mat
diff --git a/torch/distributed/_tensor/dispatch.py b/torch/distributed/_tensor/dispatch.py
index 24df6f879d316..54105a607027e 100644
--- a/torch/distributed/_tensor/dispatch.py
+++ b/torch/distributed/_tensor/dispatch.py
@@ -2,8 +2,6 @@
 from dataclasses import dataclass
 from typing import Callable, cast, Dict, List, Optional, Tuple
 
-from torchgen.model import FunctionSchema, SchemaKind
-
 import torch
 
 import torch.distributed._tensor.api as dtensor
@@ -68,20 +66,20 @@ class OpSchema(object):
           placements will get implicitly changed and it's error-prone.
     """
 
-    func_schema: FunctionSchema
+    func_schema: torch._C.FunctionSchema
     args_schema: Tuple[object, ...]
     kwargs_schema: Dict[str, object]
+
     is_inplace: bool = False
     is_out_variant: bool = False
 
+
     def __post_init__(self) -> None:
-        schema_kind = self.func_schema.kind()
-        self.is_inplace = (
-            schema_kind == SchemaKind.inplace  # pyre-ignore [16] pyre bad at enum
-        )
-        self.is_out_variant = (
-            schema_kind == SchemaKind.out  # pyre-ignore [16] pyre bad at enum
-        )
+        # simple analysis of function schema to determine
+        # if this is an inplace/out variant, it might not
+        # be entirely correct, but it's good enough for now.
+        self.is_inplace = self.func_schema.name[-1] == "_"
+        self.is_out_variant = "out" in self.func_schema.overload_name
 
     @property
     def args_spec(self) -> Tuple[DTensorSpec, ...]:
@@ -158,13 +156,11 @@ def propagate_input_sharding(
     kwargs: Dict[str, object],
     op_to_rules: Dict[str, Callable[[OpSchema], OutputSharding]],
 ) -> Tuple[OpSchema, bool, Optional[OutputSharding]]:
-    # parse the operator schema
-    func_schema = FunctionSchema.parse(str(op_call._schema))
     # unwrap the args/kwargs schema
     args_schema = tree_map(unwrap_schema, args)
     kwargs_schema = tree_map(unwrap_schema, kwargs)
 
-    op_schema = OpSchema(func_schema, args_schema, kwargs_schema)
+    op_schema = OpSchema(op_call._schema, args_schema, kwargs_schema)
 
     if _DEBUG_VERBOSE and torch.distributed.get_rank() == 0:
         print(f"{op_call}({op_schema})")
@@ -288,10 +284,15 @@ def operator_dispatch(
             else output_sharding.output_spec
         )
         out_dts = []
-        for i, out in enumerate(target_schema.func_schema.arguments.out):
-            out_dt = cast(dtensor.DTensor, kwargs[out.name])
-            out_dt._spec = cast(DTensorSpec, output_specs[i])
-            out_dts.append(out_dt)
+        spec_idx = 0
+        for arg in target_schema.func_schema.arguments:
+            if arg.is_out:
+                out_dt = cast(dtensor.DTensor, kwargs[arg.name])
+                out_dt._spec = cast(DTensorSpec, output_specs[spec_idx])
+                out_dts.append(out_dt)
+                spec_idx += 1
+
+        assert len(out_dts) >= 1, "out variant should have at least one out arg"
         return tuple(out_dts) if len(out_dts) > 1 else out_dts[0]
     else:
         return wrap(local_results, output_sharding.output_spec)

From ffe23e8fd401693867f95743fe366ca2cd08e376 Mon Sep 17 00:00:00 2001
From: Sean Ross-Ross <srossross@gmail.com>
Date: Tue, 6 Dec 2022 05:52:09 +0000
Subject: [PATCH 1614/1922] fix: update error when tensor escapes vmap (#89077)

Fixes https://github.com/pytorch/functorch/issues/1054

@zou3519, I played around with it, but I am unsure of how to repro the cases for gen_vmap_inplace_plumbing and below in gen_vmap_plumbing_no_returns

I've also seen that there are 24 other instances of the `TORCH_INTERNAL_ASSERT(maybe_layer.has_value());` assert, should I change all of these and add tests?

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89077
Approved by: https://github.com/zou3519
---
 aten/src/ATen/functorch/BatchRulesHelper.h | 12 +++++++----
 aten/src/ATen/functorch/PlumbingHelper.cpp | 11 ++++++++++
 aten/src/ATen/functorch/PlumbingHelper.h   |  2 ++
 test/functorch/test_vmap.py                | 25 ++++++++++++++++++++++
 torchgen/gen_vmap_plumbing.py              |  6 +++---
 5 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h
index 219c01c89c56e..2efc12d4c993e 100644
--- a/aten/src/ATen/functorch/BatchRulesHelper.h
+++ b/aten/src/ATen/functorch/BatchRulesHelper.h
@@ -3,6 +3,9 @@
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
+#pragma once
+
+#include <c10/util/TypeList.h>
 
 #include <ATen/ATen.h>
 #include <ATen/Operators.h>
@@ -65,7 +68,7 @@ template <typename A, A a, typename C>
 struct BasicUnaryBatchRuleHelper;
 
 template <typename F, F Func, typename A, typename... T>
-struct BasicUnaryBatchRuleHelper<F, Func, typelist<A, T...>> {
+struct BasicUnaryBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T...>> {
   static std::tuple<Tensor,optional<int64_t>> apply(
       const Tensor& tensor,
       optional<int64_t> batch_dim,
@@ -90,7 +93,7 @@ template <typename A, A a, typename C>
 struct VariadicBdimsBatchRuleHelper;
 
 template <typename F, F Func, typename A, typename... T>
-struct VariadicBdimsBatchRuleHelper<F, Func, typelist<A, T...>> {
+struct VariadicBdimsBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T...>> {
   static std::tuple<Tensor,optional<int64_t>> apply(
       const Tensor& tensor,
       optional<int64_t> batch_dim,
@@ -123,7 +126,8 @@ void boxed_tensor_inputs_batch_rule(const c10::OperatorHandle& op, torch::jit::S
 
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "boxed_tensor_inputs_batch_rule");
+
   int64_t cur_level = maybe_layer->layerId();
 
   auto orig_arguments = torch::jit::last(*stack, num_arguments);
@@ -379,7 +383,7 @@ template <typename A, A a, typename C>
 struct ExistingBdimBatchRuleHelper;
 
 template <typename F, F Func, typename A, typename... T>
-struct ExistingBdimBatchRuleHelper<F, Func, typelist<A, T...>> {
+struct ExistingBdimBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T...>> {
   static std::tuple<Tensor,optional<int64_t>> apply(
       const Tensor& self,
       optional<int64_t> self_bdim,
diff --git a/aten/src/ATen/functorch/PlumbingHelper.cpp b/aten/src/ATen/functorch/PlumbingHelper.cpp
index 5dd01d0abbcbe..5877d2380d247 100644
--- a/aten/src/ATen/functorch/PlumbingHelper.cpp
+++ b/aten/src/ATen/functorch/PlumbingHelper.cpp
@@ -10,6 +10,17 @@
 
 namespace at { namespace functorch {
 
+void vmap_check_escaped(const optional<DynamicLayer> &layer, const char* what) {
+  TORCH_CHECK(
+    layer.has_value(),
+    "Either your tensor may have escaped from inside a function being vmapped and this is a user error ",
+    "(see https://pytorch.org/functorch/stable/ux_limitations.html), "
+    "or there is an internal functorch error in `",
+    what,
+    "` Please file an issue if it looks like the latter"
+  )
+}
+
 Tensor makeBatched(const Tensor& tensor, optional<int64_t> bdim, int64_t level) {
   if (bdim.has_value()) {
     TORCH_INTERNAL_ASSERT(*bdim >= 0);
diff --git a/aten/src/ATen/functorch/PlumbingHelper.h b/aten/src/ATen/functorch/PlumbingHelper.h
index 9eb486a6eefa0..dfb7da5227d5b 100644
--- a/aten/src/ATen/functorch/PlumbingHelper.h
+++ b/aten/src/ATen/functorch/PlumbingHelper.h
@@ -26,6 +26,8 @@
 
 namespace at { namespace functorch {
 
+void vmap_check_escaped(const optional<DynamicLayer> &layer, const char* what);
+
 // Create a BatchedTensor given a tensor, bdim, and level
 TORCH_API Tensor makeBatched(const Tensor& tensor, optional<int64_t> bdim, int64_t level);
 
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 441cf46a98c22..7b6b8ff2e8994 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3920,6 +3920,31 @@ def test_vmap_multi_dot_failure_1D_input(self):
         with self.assertRaisesRegex(RuntimeError, "tensor 1 must be 2D but got 1D"):
             return vmap(torch.linalg.multi_dot)(inputs)
 
+    def test_vmap_escaped_error(self):
+        escaped = None
+
+        def f(x):
+            nonlocal escaped
+            escaped = x
+            return x ** 2
+
+        x = torch.randn(3)
+        vmap(f)(x)
+
+        common_message = r"your tensor may have escaped from inside a function being vmapped.*{0}.*"
+
+        with self.assertRaisesRegex(RuntimeError, common_message.format("gen_vmap_plumbing")):
+            escaped.sin()
+
+        with self.assertRaisesRegex(RuntimeError, common_message.format("boxed_tensor_inputs_batch_rule")):
+            escaped.sin_()
+
+        with self.assertRaisesRegex(RuntimeError, common_message.format("gen_vmap_inplace_plumbing")):
+            escaped.mul_(1)
+
+        vmap(f)(torch.tensor([[0, 0], [0, 0]], dtype=torch.int))
+        with self.assertRaisesRegex(RuntimeError, common_message.format("gen_vmap_plumbing_no_returns")):
+            torch.ops.aten._linalg_check_errors(escaped, 'linalg.inv', is_matrix=False)
 
 class TestRandomness(TestCase):
     def _reset_random(self, generator, orig_state, use_generator, seed):
diff --git a/torchgen/gen_vmap_plumbing.py b/torchgen/gen_vmap_plumbing.py
index 87db309e0b782..0876f3e343453 100644
--- a/torchgen/gen_vmap_plumbing.py
+++ b/torchgen/gen_vmap_plumbing.py
@@ -167,7 +167,7 @@ def gen_vmap_inplace_plumbing(native_function: NativeFunction) -> Optional[str]:
 {sig.decl(name=schema.name.unambiguous_name() + '_generated_plumbing')} {{
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "gen_vmap_inplace_plumbing");
   int64_t {cur_level_var} = maybe_layer->layerId();
 {textwrap.indent(bdims_all_none_case, "  ")}
 {textwrap.indent(unwraps, "  ")}
@@ -189,7 +189,7 @@ def gen_vmap_plumbing_no_returns(native_function: NativeFunction) -> str:
 {sig.decl(name=schema.name.unambiguous_name() + '_generated_plumbing')} {{
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "gen_vmap_plumbing_no_returns");
   int64_t {cur_level_var} = maybe_layer->layerId();
 {textwrap.indent(bdims_all_none_case, "  ")}
 {textwrap.indent(unwraps, "  ")}
@@ -232,7 +232,7 @@ def gen_vmap_plumbing(native_function: NativeFunction) -> Optional[str]:
 {sig.decl(name=schema.name.unambiguous_name() + '_generated_plumbing')} {{
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "gen_vmap_plumbing");
   int64_t {cur_level_var} = maybe_layer->layerId();
 {textwrap.indent(bdims_all_none_case, "  ")}
 {textwrap.indent(unwraps, "  ")}

From 02526082e5e5668e77f74cb3c1bd97a45e2d298b Mon Sep 17 00:00:00 2001
From: Sean Ross-Ross <srossross@gmail.com>
Date: Tue, 6 Dec 2022 05:59:47 +0000
Subject: [PATCH 1615/1922] fix: Moving operators to
 FuncTorchBatchedDecomposition (#89762)

Some of the easy to move operators I've moved over and removed an xfail.

I found this from the test that I implemented in https://github.com/pytorch/pytorch/pull/89465

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89762
Approved by: https://github.com/zou3519
---
 .../ATen/functorch/BatchRulesBinaryOps.cpp    |  1 -
 .../functorch/BatchRulesDecompositions.cpp    | 22 +++++++++++++++++++
 aten/src/ATen/functorch/BatchRulesModules.cpp |  1 -
 .../ATen/functorch/BatchRulesReduceOps.cpp    |  1 -
 .../src/ATen/functorch/BatchRulesUnaryOps.cpp | 15 -------------
 aten/src/ATen/functorch/BatchRulesViews.cpp   |  2 --
 test/functorch/test_ops.py                    |  1 -
 test/functorch/test_vmap.py                   |  1 -
 8 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp
index 4e228afdfc614..db601d3b0b8f1 100644
--- a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp
@@ -385,7 +385,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   BINARY_SCALAR_2(div, Tensor_mode, Scalar_mode);
 
   BINARY_POINTWISE(floor_divide);
-  UNARY_POINTWISE2(floor_divide, Scalar);
 
   BINARY_POINTWISE(fmax);
   BINARY_POINTWISE(fmin);
diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
index 3696287179223..eebb0ab6349dd 100644
--- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp
@@ -41,6 +41,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE(_batch_norm_impl_index);
   OP_DECOMPOSE(absolute);
   OP_DECOMPOSE(arctan2);
+  OP_DECOMPOSE(argsort);
   OP_DECOMPOSE(avg_pool1d);
   OP_DECOMPOSE(adaptive_max_pool1d);
   OP_DECOMPOSE(adaptive_avg_pool1d);
@@ -66,15 +67,18 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   m.impl("broadcast_to", native::broadcast_to_symint);
   OP_DECOMPOSE(cartesian_prod);
   OP_DECOMPOSE(cdist);
+  OP_DECOMPOSE(chunk);
   OP_DECOMPOSE(clip);
   OP_DECOMPOSE2(clip, Tensor );
   OP_DECOMPOSE(concat);
   OP_DECOMPOSE(conj_physical);
+  OP_DECOMPOSE(contiguous);
   OP_DECOMPOSE(combinations);
   OP_DECOMPOSE(corrcoef);
   OP_DECOMPOSE(cosine_embedding_loss);
   OP_DECOMPOSE(cosine_similarity);
   OP_DECOMPOSE(cov);
+  OP_DECOMPOSE(cross);
   m.impl("cross_entropy_loss", native::cross_entropy_loss_symint);
   OP_DECOMPOSE2(cumulative_trapezoid, x);
   OP_DECOMPOSE2(cumulative_trapezoid, dx);
@@ -82,6 +86,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE2(dsplit, array);
   OP_DECOMPOSE(det);
   OP_DECOMPOSE(diff);
+  OP_DECOMPOSE(diag);
   OP_DECOMPOSE(dstack);
   OP_DECOMPOSE(einsum);
   m.impl("embedding_backward", native::embedding_backward_symint);
@@ -109,6 +114,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE(flipud);
   OP_DECOMPOSE2(float_power, Tensor_Tensor);
   OP_DECOMPOSE2(float_power, Tensor_Scalar);
+  OP_DECOMPOSE2(floor_divide, Scalar);
   OP_DECOMPOSE(ger);
   OP_DECOMPOSE2(gradient, scalarint);
   OP_DECOMPOSE2(gradient, scalararray);
@@ -200,6 +206,22 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) {
   OP_DECOMPOSE(special_multigammaln);
   OP_DECOMPOSE(special_polygamma);
   OP_DECOMPOSE(special_softmax);
+  OP_DECOMPOSE(special_digamma);
+  OP_DECOMPOSE(special_erf);
+  OP_DECOMPOSE(special_erfc);
+  OP_DECOMPOSE(special_erfinv);
+  OP_DECOMPOSE(special_exp2);
+  OP_DECOMPOSE(special_expm1);
+  OP_DECOMPOSE(special_expit);
+  OP_DECOMPOSE(special_gammaln);
+  OP_DECOMPOSE(special_i0);
+  OP_DECOMPOSE(special_log1p);
+  OP_DECOMPOSE(special_ndtr);
+  OP_DECOMPOSE(special_psi);
+  OP_DECOMPOSE(special_round);
+  OP_DECOMPOSE(special_sinc);
+
+
   m.impl("split.sizes", native::split_symint);
   OP_DECOMPOSE(square);
   OP_DECOMPOSE(numpy_T);
diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp
index f51d63feaa8e0..506ed3ae44052 100644
--- a/aten/src/ATen/functorch/BatchRulesModules.cpp
+++ b/aten/src/ATen/functorch/BatchRulesModules.cpp
@@ -401,7 +401,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   VMAP_SUPPORT(cudnn_grid_sampler_backward, CUDNN_GRID_SAMPLE_BW_BATCH_RULE(cudnn_grid_sampler_backward));
 
   VMAP_SUPPORT(cudnn_grid_sampler, GRID_SAMPLE_BATCH_RULE(cudnn_grid_sampler));
-  VMAP_SUPPORT(cross, cross_batch_rule);
 
   EXISTING_BDIM(pixel_shuffle);
   EXISTING_BDIM(pixel_unshuffle);
diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
index 9126507e73be0..d792a8da3f9c5 100644
--- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
@@ -412,7 +412,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   REDUCTION_BOXED(_softmax);
   REDUCTION_BOXED(sort);
   REDUCTION_BOXED_ARGS(sort.stable, 2);
-  REDUCTION_BOXED(argsort);
   REDUCTION_BOXED(std_mean.correction);
   m.impl("sum", sum_decomp);
   REDUCTION_BOXED(sum.dim_IntList);
diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp
index ee6391c6e2844..8cd4385fea863 100644
--- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp
@@ -93,7 +93,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
   UNARY_POINTWISE(view_as_real);
   VMAP_SUPPORT(view_as_complex, view_as_complex_batch_rule);
   VMAP_SUPPORT(clone, clone_batch_rule);
-  VMAP_SUPPORT(contiguous, contiguous_batch_rule);
   VMAP_SUPPORT2(to, device, BASIC_UNARY_BATCH_RULE(ATEN_FN2(to, device)));
   VMAP_SUPPORT2(to, dtype, BASIC_UNARY_BATCH_RULE(ATEN_FN2(to, dtype)));
   VMAP_SUPPORT2(to, dtype_layout, BASIC_UNARY_BATCH_RULE(ATEN_FN2(to, dtype_layout)));
@@ -163,25 +162,11 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
 
   // torch.special.* functions
   UNARY_POINTWISE(special_entr);
-  UNARY_POINTWISE(special_erf);
-  UNARY_POINTWISE(special_erfc);
   UNARY_POINTWISE(special_erfcx);
-  UNARY_POINTWISE(special_erfinv);
-  UNARY_POINTWISE(special_expit);
-  UNARY_POINTWISE(special_expm1);
-  UNARY_POINTWISE(special_digamma);
-  UNARY_POINTWISE(special_psi);
-  UNARY_POINTWISE(special_exp2);
-  UNARY_POINTWISE(special_gammaln);
-  UNARY_POINTWISE(special_i0);
   UNARY_POINTWISE(special_i0e);
   UNARY_POINTWISE(special_i1);
   UNARY_POINTWISE(special_i1e);
-  UNARY_POINTWISE(special_log1p);
-  UNARY_POINTWISE(special_ndtr);
   UNARY_POINTWISE(special_ndtri);
-  UNARY_POINTWISE(special_round);
-  UNARY_POINTWISE(special_sinc);
 
   // Activation functions (from https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity)
   UNARY_POINTWISE_ALL(elu);
diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp
index 98eaf0f387a6e..d5e5161fb3a31 100644
--- a/aten/src/ATen/functorch/BatchRulesViews.cpp
+++ b/aten/src/ATen/functorch/BatchRulesViews.cpp
@@ -535,8 +535,6 @@ Tensor trace_decomp(const Tensor& tensor) {
 }
 
 TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
-  VMAP_SUPPORT(diag, diag_batch_rule);
-  VMAP_SUPPORT(chunk, chunk_batching_rule);
   m.impl("flatten.using_ints", static_cast<decltype(&ATEN_FN2(flatten, using_ints))>(native::flatten));
   VMAP_SUPPORT(flip, flip_batch_rule);
   m.impl("trace", trace_decomp);
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 74b9c26e6529c..7a2f5b8dc61a1 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -1198,7 +1198,6 @@ def test():
         xfail('svd_lowrank', ''),
         xfail('pca_lowrank', ''),
         xfail('clamp'),
-        xfail('cross'),  # The defaults of this op are *very* weird. No wonder it doesn't work
         # something weird happening with channels_last
         xfail('bfloat16'),
         xfail('double'),
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 7b6b8ff2e8994..b85da534be8fa 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3238,7 +3238,6 @@ def test():
         xfail('eye', ''),  # non-tensor input
         xfail('broadcast_shapes', ''),  # test runner can't handle non-Tensor ops
         xfail('sparse.sampled_addmm'),  # sparse
-        xfail('cross'),  # The default value of dim in op is *very* weird. No wonder it doesn't work
         skip('_softmax_backward_data'),
         skip('linalg.eigh', ''),  # not unique, see test_linalg_eigh for manual test
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format

From bf7034608afb69d0fb53904389a14885ace4090b Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 6 Dec 2022 02:54:51 +0000
Subject: [PATCH 1616/1922] [FSDP()] Fix `fully_shard` fwd hook registration
 (#90201)

I need to rebase later after Shen's PRs land.

The idea is to only register the pre/post-forward hook on the _root modules_ among the modules that consume a `FlatParameter`. (Yes, the term _root module_ is heavily overloaded. We may want to clarify that at some point. Here, _root_ is being used in the graph sense, meaning parent-less, and the scope is only among the modules consuming a `FlatParameter`.)

This avoids unnecessary pre/post-forward hooks running, which would lead to errors because the unshard is not truly idempotent.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90201
Approved by: https://github.com/mrshenli, https://github.com/rohan-varma
---
 test/distributed/_composable/test_compose.py | 40 +++++++++-----------
 torch/distributed/fsdp/_common_utils.py      | 26 ++++++++++++-
 torch/distributed/fsdp/_init_utils.py        | 15 ++++++--
 torch/distributed/fsdp/_runtime_utils.py     |  4 +-
 torch/distributed/fsdp/flat_param.py         |  7 ++++
 5 files changed, 63 insertions(+), 29 deletions(-)

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
index ae613ad1c8957..8bedec41766ca 100644
--- a/test/distributed/_composable/test_compose.py
+++ b/test/distributed/_composable/test_compose.py
@@ -198,16 +198,15 @@ def test_checkpoint_fsdp_submodules_with_param(self):
         test_model.u2.seq = checkpoint(test_model.u2.seq, use_reentrant=False)
         test_model = fully_shard(test_model)
 
-        with self.assertRaisesRegex(RuntimeError, "mat2 must be a matrix"):
-            self.run_subtests(
-                {
-                    "base_model": [base_model],
-                    "test_model": [test_model],
-                    "x": [torch.randn(2, 100, device="cuda")],
-                    "grad_to_none": [True, False],
-                },
-                self._test_parity,
-            )
+        self.run_subtests(
+            {
+                "base_model": [base_model],
+                "test_model": [test_model],
+                "x": [torch.randn(2, 100, device="cuda")],
+                "grad_to_none": [True, False],
+            },
+            self._test_parity,
+        )
 
     @skip_if_lt_x_gpu(2)
     def test_checkpoint_fsdp_submodules_with_param_no_shard(self):
@@ -220,18 +219,15 @@ def test_checkpoint_fsdp_submodules_with_param_no_shard(self):
         test_model.u2.seq = checkpoint(test_model.u2.seq, use_reentrant=False)
         test_model = fully_shard(test_model, strategy=ShardingStrategy.NO_SHARD)
 
-        with self.assertRaisesRegex(
-            RuntimeError, "Cannot writeback when the parameter shape changes"
-        ):
-            self.run_subtests(
-                {
-                    "base_model": [base_model],
-                    "test_model": [test_model],
-                    "x": [torch.randn(2, 100, device="cuda")],
-                    "grad_to_none": [True, False],
-                },
-                self._test_parity,
-            )
+        self.run_subtests(
+            {
+                "base_model": [base_model],
+                "test_model": [test_model],
+                "x": [torch.randn(2, 100, device="cuda")],
+                "grad_to_none": [True, False],
+            },
+            self._test_parity,
+        )
 
 
 instantiate_parametrized_tests(TestFSDPCheckpoint)
diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py
index f6ccc3e9243f8..935dba233efed 100644
--- a/torch/distributed/fsdp/_common_utils.py
+++ b/torch/distributed/fsdp/_common_utils.py
@@ -4,7 +4,7 @@
 
 import traceback
 from enum import auto, Enum
-from typing import Any, Callable, Dict, List, no_type_check, Union
+from typing import Any, Callable, Dict, List, no_type_check, Set, Union
 
 import torch
 import torch.distributed.fsdp.flat_param as flat_param_file
@@ -200,3 +200,27 @@ def _assert_in_training_states(
             print(f"ERROR: {msg}")
             traceback.print_stack()
         raise ValueError(msg)
+
+
+def _get_root_modules(modules: Set[nn.Module]) -> Set[nn.Module]:
+    """
+    Returns:
+        Set[nn.Module]: The subset of ``modules`` that are root modules (i.e.
+        parent-less) with respect to the modules in the set itself. In other
+        words, these are the modules in ``modules`` that are not the child of
+        any other module in ``modules``.
+    """
+    root_modules: Set[nn.Module] = set()
+    module_to_submodules = {module: set(module.modules()) for module in modules}
+    for candidate_module in modules:
+        is_root_module = True
+        for module, submodules in module_to_submodules.items():
+            is_child_module = (
+                candidate_module is not module and candidate_module in submodules
+            )
+            if is_child_module:
+                is_root_module = False
+                break
+        if is_root_module:
+            root_modules.add(candidate_module)
+    return root_modules
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index c2f759eb4a956..2fe9b87f2a79d 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -158,10 +158,19 @@ def _init_core_state(
         backward_prefetch_limit,
         forward_prefetch_limit,
     )
+    # Mapping from module to every `FlatParamHandle` that the module consumes,
+    # where there is an entry for every (sub)module
     _module_to_handles: Dict[
         nn.Module, List[FlatParamHandle]
     ] = collections.defaultdict(list)
     state._module_to_handles = _module_to_handles
+    # Same as `_module_to_handle` but filtered to only include keys that are
+    # root modules with respect to the `FlatParamHandle` (see `_root_modules`
+    # in `FlatParameter`)
+    _root_module_to_handles: Dict[
+        nn.Module, List[FlatParamHandle]
+    ] = collections.defaultdict(list)
+    state._root_module_to_handles = _root_module_to_handles
     # Invariant: `state.params` contains exactly the `FlatParameter`s of the
     # handles in `state._handles`
     _handles: List[FlatParamHandle] = []
@@ -181,10 +190,6 @@ def _init_runtime_state(
     state._pre_forward_handles = _pre_forward_handles
     _post_forward_handles: List[RemovableHandle] = []
     state._post_forward_handles = _post_forward_handles
-    _module_to_handles: Dict[
-        nn.Module, List[FlatParamHandle]
-    ] = collections.defaultdict(list)
-    state._module_to_handles = _module_to_handles
     state._sync_gradients = True
     state._communication_hook = _get_default_comm_hook(state.sharding_strategy)
     state._communication_hook_state = _get_default_comm_hook_state(state.process_group)
@@ -354,6 +359,8 @@ def _init_param_handle_from_params(
     state._handles.append(handle)
     for module in handle.flat_param._modules:
         state._module_to_handles[module].append(handle)
+    for module in handle.flat_param._root_modules:
+        state._root_module_to_handles[module].append(handle)
     cpu_device = torch.device("cpu")
     if state.cpu_offload.offload_params and handle.flat_param.device != cpu_device:
         handle.flat_param_to(cpu_device)
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index b2cd316e2bf81..abbcf2b00d5e4 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -888,7 +888,7 @@ def _register_pre_forward_hooks(
         forward_handle.remove()
     state._pre_forward_handles.clear()
     for module in modules:
-        module_param_handles = state._module_to_handles[module]
+        module_param_handles = state._root_module_to_handles[module]
         if module_param_handles:
             unshard_fn = functools.partial(
                 _pre_forward_unshard,
@@ -918,7 +918,7 @@ def _register_post_forward_hooks(
         forward_handle.remove()
     state._post_forward_handles.clear()
     for module in modules:
-        module_param_handles = state._module_to_handles[module]
+        module_param_handles = state._root_module_to_handles[module]
         if module_param_handles:
             reshard_fn = functools.partial(
                 _post_forward_reshard,
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 08350b25223c1..847440c88f67d 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -24,6 +24,7 @@
 import torch.nn.functional as F
 from torch import Tensor
 from torch.distributed.fsdp._common_utils import (
+    _get_root_modules,
     _set_fsdp_flattened,
     HandleTrainingState,
 )
@@ -166,6 +167,11 @@ class FlatParameter(nn.Parameter):
             depend on its existence in the future.
         _modules (Set[nn.Module]): Modules that contain some original parameter
             that is flattened into the ``FlatParameter``.
+        _root_modules (Set[nn.Module]): Modules in ``self._modules`` that are
+            root modules (i.e. parent-less) with respect to ``self._modules``.
+            These are the modules for which we register pre/post-forward hooks
+            in the composable code path. There will be one unshard/reshard pair
+            for each root module in this set.
 
         _shard_param_offsets (List[Tuple[int, int])): [start, end] offsets (in
             units of numel) giving this rank's part of each flattened original
@@ -265,6 +271,7 @@ def _init_metadata(
         self._modules = set(pi.module for pi in self._param_infos).union(
             set(spi.module for spi in self._shared_param_infos)
         )
+        self._root_modules = _get_root_modules(self._modules)
         assert (params is None) == (shared_params is None)
         if params is not None:
             assert shared_params is not None and len(shared_params) == len(

From 8531121684cd16ade13273e9c50e91181eb30cc0 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Tue, 6 Dec 2022 06:27:40 +0000
Subject: [PATCH 1617/1922] [Vulkan] Enable QInt8 and QInt32 quantization
 (#89788)

Summary: Enabled Vulkan quantization for dtypes QInt8 and QInt32

Test Plan:
On Mac
```
cd ~/fbsource
buck1 run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck1 build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_quantized_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_quantized_api_test
adb shell "/data/local/tmp/vulkan_quantized_api_test"
```

Differential Revision: D41561661

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89788
Approved by: https://github.com/digantdesai
---
 aten/src/ATen/native/vulkan/api/Resource.cpp  |   4 +
 .../glsl/quantize_per_tensor_qint32.glsl      |  31 +++++
 .../glsl/quantize_per_tensor_qint8.glsl       |  31 +++++
 ...r.glsl => quantize_per_tensor_quint8.glsl} |   0
 aten/src/ATen/native/vulkan/ops/Copy.cpp      |  14 +-
 .../native/vulkan/ops/QuantizedTensor.cpp     |  27 +++-
 .../ATen/test/vulkan_quantized_api_test.cpp   | 129 +++++++++++++-----
 7 files changed, 191 insertions(+), 45 deletions(-)
 create mode 100644 aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_qint32.glsl
 create mode 100644 aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_qint8.glsl
 rename aten/src/ATen/native/vulkan/glsl/{quantize_per_tensor.glsl => quantize_per_tensor_quint8.glsl} (100%)

diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp
index e47f85b9f556f..517bd0a56232f 100644
--- a/aten/src/ATen/native/vulkan/api/Resource.cpp
+++ b/aten/src/ATen/native/vulkan/api/Resource.cpp
@@ -36,6 +36,10 @@ VkFormat vk_format(const at::ScalarType dtype) {
 #endif /* USE_VULKAN_FP16_INFERENCE */
     case c10::kQUInt8:
       return VK_FORMAT_R8G8B8A8_UINT;
+    case c10::kQInt8:
+      return VK_FORMAT_R8G8B8A8_SINT;
+    case c10::kQInt32:
+      return VK_FORMAT_R32G32B32A32_SINT;
 
     default:
       TORCH_CHECK(
diff --git a/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_qint32.glsl b/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_qint32.glsl
new file mode 100644
index 0000000000000..75fca31ee23b2
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_qint32.glsl
@@ -0,0 +1,31 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT    $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, rgba32i) uniform PRECISION restrict writeonly iimage3D   uOutput;
+layout(set = 0, binding = 1)          uniform PRECISION                    sampler3D  uInput; //input
+layout(set = 0, binding = 2)          uniform PRECISION restrict           Block {
+  ivec4 size;
+  vec2 scale;
+  ivec2 zero_point;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  if (all(lessThan(pos, uBlock.size.xyz))) {
+    vec4 q_res = roundEven(texelFetch(uInput, pos, 0) / uBlock.scale.x) + uBlock.zero_point.x;
+
+    ivec4 ret = ivec4(q_res);
+
+    imageStore(
+        uOutput,
+        pos,
+        ret);
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_qint8.glsl b/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_qint8.glsl
new file mode 100644
index 0000000000000..2ba863d321312
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_qint8.glsl
@@ -0,0 +1,31 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT    $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+layout(set = 0, binding = 0, rgba8i) uniform PRECISION restrict writeonly iimage3D   uOutput;
+layout(set = 0, binding = 1)         uniform PRECISION                    sampler3D  uInput; //input
+layout(set = 0, binding = 2)         uniform PRECISION restrict           Block {
+  ivec4 size;
+  vec2 scale;
+  ivec2 zero_point;
+} uBlock;
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  if (all(lessThan(pos, uBlock.size.xyz))) {
+    vec4 q_res = roundEven(texelFetch(uInput, pos, 0) / uBlock.scale.x) + uBlock.zero_point.x;
+
+    ivec4 ret = ivec4(q_res);
+
+    imageStore(
+        uOutput,
+        pos,
+        ret);
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor.glsl b/aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_quint8.glsl
similarity index 100%
rename from aten/src/ATen/native/vulkan/glsl/quantize_per_tensor.glsl
rename to aten/src/ATen/native/vulkan/glsl/quantize_per_tensor_quint8.glsl
diff --git a/aten/src/ATen/native/vulkan/ops/Copy.cpp b/aten/src/ATen/native/vulkan/ops/Copy.cpp
index 06f9225fe47df..5f63f3eb949a5 100644
--- a/aten/src/ATen/native/vulkan/ops/Copy.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Copy.cpp
@@ -18,10 +18,15 @@ void memcpy_to_mapping(const Tensor& src, api::MemoryMap& dst_mapping) {
     memcpy_to_mapping_impl<c10::Half>(src, dst_mapping);
   } else if (src.dtype() == c10::kQUInt8) {
     memcpy_to_mapping_impl<c10::quint8>(src, dst_mapping);
+  } else if (src.dtype() == c10::kQInt8) {
+    memcpy_to_mapping_impl<c10::qint8>(src, dst_mapping);
+  } else if (src.dtype() == c10::kQInt32) {
+    memcpy_to_mapping_impl<c10::qint32>(src, dst_mapping);
   } else {
     TORCH_CHECK(
         false,
-        "Invalid Data Type: expected c10::QUint8, at::kHalf or at::Float but got ",
+        "Invalid Data Type: expected c10::kQInt32, c10::kQInt8, c10::kQUInt8,",
+        " at::kHalf or at::Float but got ",
         src.dtype());
   }
 }
@@ -33,10 +38,15 @@ void memcpy_from_mapping(api::MemoryMap& src_mapping, Tensor& dst) {
     memcpy_from_mapping_impl<c10::Half>(src_mapping, dst);
   } else if (dst.dtype() == c10::kQUInt8) {
     memcpy_from_mapping_impl<c10::quint8>(src_mapping, dst);
+  } else if (dst.dtype() == c10::kQInt8) {
+    memcpy_from_mapping_impl<c10::qint8>(src_mapping, dst);
+  } else if (dst.dtype() == c10::kQInt32) {
+    memcpy_from_mapping_impl<c10::qint32>(src_mapping, dst);
   } else {
     TORCH_CHECK(
         false,
-        "Invalid Data Type: expected c10::QUint8, at::kHalf or Float but got ",
+        "Invalid Data Type: expected c10::kQInt32, c10::kQInt8, c10::kQUInt8,",
+        " at::kHalf or at::Float but got ",
         dst.dtype());
   }
 }
diff --git a/aten/src/ATen/native/vulkan/ops/QuantizedTensor.cpp b/aten/src/ATen/native/vulkan/ops/QuantizedTensor.cpp
index c4ba030b5bb4e..4bb0880383575 100644
--- a/aten/src/ATen/native/vulkan/ops/QuantizedTensor.cpp
+++ b/aten/src/ATen/native/vulkan/ops/QuantizedTensor.cpp
@@ -10,12 +10,29 @@ namespace ops {
 
 using namespace api::utils;
 
+static api::ShaderSource get_quantize_per_tensor_shader(
+    const c10::ScalarType dtype) {
+  switch (dtype) {
+    case c10::ScalarType::QUInt8:
+      return VK_KERNEL(quantize_per_tensor_quint8);
+    case c10::ScalarType::QInt8:
+      return VK_KERNEL(quantize_per_tensor_qint8);
+    case c10::ScalarType::QInt32:
+      return VK_KERNEL(quantize_per_tensor_qint32);
+    default:
+      TORCH_CHECK(
+          false,
+          "Vulkan quantization currently not supported for dtype ",
+          dtype);
+  }
+}
+
 Tensor quantize_per_tensor(
     const at::Tensor& input_arg,
     const double scale,
     const int64_t zero_point,
     const c10::ScalarType dtype) {
-  TORCH_CHECK(dtype == c10::ScalarType::QUInt8, "Expected type c10::kQUint8");
+  api::ShaderSource compute_shader = get_quantize_per_tensor_shader(dtype);
 
   api::Context* const context = api::context();
 
@@ -23,11 +40,7 @@ Tensor quantize_per_tensor(
   const vTensor& v_input = convert(input);
 
   vTensor v_output{
-      context,
-      input.sizes(),
-      input.options().dtype(c10::kQUInt8),
-      scale,
-      zero_point};
+      context, input.sizes(), input.options().dtype(dtype), scale, zero_point};
 
   const struct Block final {
     uvec3 extents;
@@ -50,7 +63,7 @@ Tensor quantize_per_tensor(
 
   context->submit_compute_job(
       // shader descriptor
-      VK_KERNEL(quantize_per_tensor),
+      compute_shader,
       // barrier
       pipeline_barrier,
       // global work group size
diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
index 205366d5770eb..fffd4691ba772 100644
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@@ -449,66 +449,123 @@ void test_quantize_per_tensor_and_dequantize(
     const at::IntArrayRef input_shape,
     const double input_scale,
     const int input_zero_point,
-    const float tolerance = 0) {
-  at::Tensor input = at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
+    const c10::ScalarType dtype = c10::ScalarType::QUInt8) {
+  at::Tensor input = produce_random_tensor(input_shape);
 
   // quantize tensors
   at::Tensor out_q_cpu = at::quantize_per_tensor(
-    input, input_scale, input_zero_point, c10::ScalarType::QUInt8);
+    input, input_scale, input_zero_point, dtype);
   at::Tensor out_q_vk = at::quantize_per_tensor(
-    input.vulkan(), input_scale, input_zero_point, c10::ScalarType::QUInt8);
+    input.vulkan(), input_scale, input_zero_point, dtype);
 
   // dequantize tensors
   const auto out_cpu_deq = at::dequantize(out_q_cpu);
   const auto out_vk_deq = at::dequantize(out_q_vk);
+  const auto out_vk_deq_cpu = out_vk_deq.cpu();
 
   // check dequantized tensor are equal
-  const auto check = almostEqual(out_cpu_deq, out_vk_deq.cpu(), tolerance);
+  const float tolerance = input_scale;
+  // tolerated error = scale, to allow for precision differences after dividing
+  // by random scale, which could result on a difference of 1 unit in the
+  // quantized result.
+  const auto check = almostEqual(out_cpu_deq, out_vk_deq_cpu, tolerance);
 
   if (!check) {
+    const auto error = at::abs(out_vk_deq_cpu - out_cpu_deq).max().item<float>();
     std::cout
       << "Quantize and Dequantize failed with input shape: " << input_shape
       << " scale: " << input_scale << " and zero point: " << input_zero_point
     << std::endl;
+    std::cout << "Error: " << error << std::endl;
   }
   ASSERT_TRUE(check);
 }
 
-void test_quantize_per_tensor_and_dequantize_random() {
-  const double scale = 0.0001 + (double)rand() / (double)RAND_MAX;
-  const int zero_point = int((double)rand() / (double)RAND_MAX * 255);
-  const int n = 1 + int((double)rand() / (double)RAND_MAX * 30);
-  const int c = 1 + int((double)rand() / (double)RAND_MAX * 30);
-  const int h = 1 + int((double)rand() / (double)RAND_MAX * 100);
-  const int w = 1 + int((double)rand() / (double)RAND_MAX * 100);
-  // tolerated error = scale, to allow for precision differences after dividing
-  // by random scale, which could result on a difference of 1 unit in the
-  // quantized result.
-  test_quantize_per_tensor_and_dequantize({n, c, h, w}, scale, zero_point, scale);
+void test_quantize_per_tensor_and_dequantize_random(
+    const c10::ScalarType dtype) {
+  const double scale = produce_random_scale();
+  const int64_t zero_point = produce_random_zero_point(dtype);
+  const at::IntArrayRef tensor_shape =
+    {rand_pos_int(30), rand_pos_int(30), rand_pos_int(100), rand_pos_int(100)};
+  test_quantize_per_tensor_and_dequantize(
+    tensor_shape, scale, zero_point, dtype);
+}
+
+TEST_F(VulkanAPITest, quantize_per_tensor_and_dequantize_quint8) {
+  const c10::ScalarType dtype = c10::ScalarType::QUInt8;
+  test_quantize_per_tensor_and_dequantize({1, 1, 1, 1}, 0.13, 21, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 1, 4}, 0.3, 87, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 4, 1}, 0.2, 120, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 7, 7}, 0.3, 87, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 8, 8}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 8, 8}, 0.04, 97, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 11, 17}, 0.07, 15, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 12, 17}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 12, 17}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 17, 12}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_dequantize({2, 4, 17, 12}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 10, 14}, 0.001, 101, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 10, 14}, 0.009, 43, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 10, 15}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_dequantize({4, 4, 9, 17}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 25, 29}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_dequantize({4, 4, 25, 29}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_dequantize({11, 17, 25, 29}, 0.027, 89, dtype);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_quantize_per_tensor_and_dequantize_random(dtype);
+  }
+}
+
+TEST_F(VulkanAPITest, quantize_per_tensor_and_dequantize_qint8) {
+  const c10::ScalarType dtype = c10::ScalarType::QInt8;
+  test_quantize_per_tensor_and_dequantize({1, 1, 1, 1}, 0.13, -21, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 1, 4}, 0.3, 87, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 4, 1}, 0.2, -120, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 7, 7}, 0.3, 87, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 8, 8}, 0.1, -10, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 8, 8}, 0.04, 97, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 11, 17}, 0.07, -15, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 12, 17}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 12, 17}, 0.1, -10, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 17, 12}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_dequantize({2, 4, 17, 12}, 0.1, -10, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 10, 14}, 0.001, 101, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 10, 14}, 0.009, -43, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 10, 15}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_dequantize({4, 4, 9, 17}, 0.1, -19, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 25, 29}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_dequantize({4, 4, 25, 29}, 0.1, -19, dtype);
+  test_quantize_per_tensor_and_dequantize({11, 17, 25, 29}, 0.027, 89, dtype);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_quantize_per_tensor_and_dequantize_random(dtype);
+  }
 }
 
-TEST_F(VulkanAPITest, quantize_per_tensor_and_dequantize) {
-  test_quantize_per_tensor_and_dequantize({1, 1, 1, 1}, 0.13, 21);
-  test_quantize_per_tensor_and_dequantize({1, 1, 1, 4}, 0.3, 87);
-  test_quantize_per_tensor_and_dequantize({1, 1, 4, 1}, 0.2, 120);
-  test_quantize_per_tensor_and_dequantize({1, 1, 7, 7}, 0.3, 87);
-  test_quantize_per_tensor_and_dequantize({1, 1, 8, 8}, 0.1, 10);
-  test_quantize_per_tensor_and_dequantize({3, 5, 8, 8}, 0.04, 97);
-  test_quantize_per_tensor_and_dequantize({1, 1, 11, 17}, 0.07, 15);
-  test_quantize_per_tensor_and_dequantize({1, 1, 12, 17}, 0.1, 10);
-  test_quantize_per_tensor_and_dequantize({3, 5, 12, 17}, 0.1, 10);
-  test_quantize_per_tensor_and_dequantize({1, 1, 17, 12}, 0.1, 10);
-  test_quantize_per_tensor_and_dequantize({2, 4, 17, 12}, 0.1, 10);
-  test_quantize_per_tensor_and_dequantize({1, 1, 10, 14}, 0.0001, 101);
-  test_quantize_per_tensor_and_dequantize({3, 5, 10, 14}, 0.009, 43);
-  test_quantize_per_tensor_and_dequantize({3, 5, 10, 15}, 0.1, 19);
-  test_quantize_per_tensor_and_dequantize({4, 4, 9, 17}, 0.1, 19);
-  test_quantize_per_tensor_and_dequantize({3, 5, 25, 29}, 0.1, 19);
-  test_quantize_per_tensor_and_dequantize({4, 4, 25, 29}, 0.1, 19);
-  test_quantize_per_tensor_and_dequantize({11, 17, 25, 29}, 0.027, 89);
+TEST_F(VulkanAPITest, quantize_per_tensor_and_dequantize_qint32) {
+  const c10::ScalarType dtype = c10::ScalarType::QInt32;
+  test_quantize_per_tensor_and_dequantize({1, 1, 1, 1}, 0.13, -21123, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 1, 4}, 0.339, 8734, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 4, 1}, 0.228, -12023, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 7, 7}, 0.338, 8723, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 8, 8}, 0.193, -1023, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 8, 8}, 0.0449, 972, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 11, 17}, 0.073, -15, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 12, 17}, 0.1572, 102, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 12, 17}, 0.147, -156, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 17, 12}, 0.129, 10448, dtype);
+  test_quantize_per_tensor_and_dequantize({2, 4, 17, 12}, 0.137, -10, dtype);
+  test_quantize_per_tensor_and_dequantize({1, 1, 10, 14}, 0.001, 101, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 10, 14}, 0.009, -43267, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 10, 15}, 0.1243, 19, dtype);
+  test_quantize_per_tensor_and_dequantize({4, 4, 9, 17}, 0.1889, -19784, dtype);
+  test_quantize_per_tensor_and_dequantize({3, 5, 25, 29}, 0.1345, 196, dtype);
+  test_quantize_per_tensor_and_dequantize({4, 4, 25, 29}, 0.129, -19489, dtype);
+  test_quantize_per_tensor_and_dequantize({11, 17, 25, 29}, 0.027, 89, dtype);
 
   for (int i = 0; i < 20; i += 1) {
-    test_quantize_per_tensor_and_dequantize_random();
+    test_quantize_per_tensor_and_dequantize_random(dtype);
   }
 }
 

From 076d87f0f735c88e4a25bb49ca0aec483ea20dd8 Mon Sep 17 00:00:00 2001
From: Arek Sredzki <arek@sredzki.com>
Date: Tue, 6 Dec 2022 06:45:04 +0000
Subject: [PATCH 1618/1922] Improve Autograd Documentation Clarity (#89401)

This makes minor adjustments to the autograd docs, improving clarity and resolving grammatical errors

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89401
Approved by: https://github.com/kit1980
---
 docs/source/notes/autograd.rst | 52 +++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/docs/source/notes/autograd.rst b/docs/source/notes/autograd.rst
index 6eec13a7de557..08ae3957b00a0 100644
--- a/docs/source/notes/autograd.rst
+++ b/docs/source/notes/autograd.rst
@@ -13,7 +13,7 @@ programs, and can aid you in debugging.
 How autograd encodes the history
 --------------------------------
 
-Autograd is reverse automatic differentiation system.  Conceptually,
+Autograd is a reverse automatic differentiation system.  Conceptually,
 autograd records a graph recording all of the operations that created
 the data as you execute operations, giving you a directed acyclic graph
 whose leaves are the input tensors and roots are the output tensors.
@@ -23,11 +23,11 @@ compute the gradients using the chain rule.
 Internally, autograd represents this graph as a graph of
 :class:`Function` objects (really expressions), which can be
 :meth:`~torch.autograd.Function.apply` ed to compute the result of
-evaluating the graph.  When computing the forwards pass, autograd
+evaluating the graph.  When computing the forward pass, autograd
 simultaneously performs the requested computations and builds up a graph
 representing the function that computes the gradient (the ``.grad_fn``
 attribute of each :class:`torch.Tensor` is an entry point into this graph).
-When the forwards pass is completed, we evaluate this graph in the
+When the forward pass is completed, we evaluate this graph in the
 backwards pass to compute the gradients.
 
 An important thing to note is that the graph is recreated from scratch at every
@@ -119,7 +119,7 @@ For more fine-grained exclusion of subgraphs from gradient computation,
 there is setting the ``requires_grad`` field of a tensor.
 
 Below, in addition to discussing the mechanisms above, we also describe
-evaluation mode (:meth:`nn.Module.eval()`), a method that is not actually used
+evaluation mode (:meth:`nn.Module.eval()`), a method that is not used
 to disable gradient computation but, because of its name, is often mixed up with the three.
 
 Setting ``requires_grad``
@@ -164,8 +164,8 @@ of the module's parameters (which have ``requires_grad=True`` by default).
 Grad Modes
 ^^^^^^^^^^
 
-Apart from setting ``requires_grad`` there are also three possible modes
-enableable from Python that can affect how computations in PyTorch are
+Apart from setting ``requires_grad`` there are also three grad modes that can
+be selected from Python that can affect how computations in PyTorch are
 processed by autograd internally: default mode (grad mode), no-grad mode,
 and inference mode, all of which can be togglable via context managers and
 decorators.
@@ -173,7 +173,7 @@ decorators.
 Default Mode (Grad Mode)
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
-The "default mode" is actually the mode we are implicitly in when no other modes like
+The "default mode" is the mode we are implicitly in when no other modes like
 no-grad and inference mode are enabled. To be contrasted with
 "no-grad mode" the default mode is also sometimes called "grad mode".
 
@@ -237,7 +237,7 @@ For implementation details of inference mode see
 Evaluation Mode (``nn.Module.eval()``)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Evaluation mode is not actually a mechanism to locally disable gradient computation.
+Evaluation mode is not a mechanism to locally disable gradient computation.
 It is included here anyway because it is sometimes confused to be such a mechanism.
 
 Functionally, ``module.eval()`` (or equivalently ``module.train(False)``) are completely
@@ -263,7 +263,7 @@ In-place operations with autograd
 Supporting in-place operations in autograd is a hard matter, and we discourage
 their use in most cases. Autograd's aggressive buffer freeing and reuse makes
 it very efficient and there are very few occasions when in-place operations
-actually lower memory usage by any significant amount. Unless you're operating
+lower memory usage by any significant amount. Unless you're operating
 under heavy memory pressure, you might never need to use them.
 
 There are two main reasons that limit the applicability of in-place operations:
@@ -271,13 +271,13 @@ There are two main reasons that limit the applicability of in-place operations:
 1. In-place operations can potentially overwrite values required to compute
    gradients.
 
-2. Every in-place operation actually requires the implementation to rewrite the
+2. Every in-place operation requires the implementation to rewrite the
    computational graph. Out-of-place versions simply allocate new objects and
    keep references to the old graph, while in-place operations, require
    changing the creator of all inputs to the :class:`Function` representing
    this operation. This can be tricky, especially if there are many Tensors
    that reference the same storage (e.g. created by indexing or transposing),
-   and in-place functions will actually raise an error if the storage of
+   and in-place functions will raise an error if the storage of
    modified inputs is referenced by any other :class:`Tensor`.
 
 In-place correctness checks
@@ -338,18 +338,18 @@ serializing all the backward calls in a specific order during execution
 Non-determinism
 ^^^^^^^^^^^^^^^
 
-If you are calling ``backward()`` on multiple thread concurrently but with
-shared inputs (i.e. Hogwild CPU training). Since parameters are automatically
-shared across threads, gradient accumulation might become non-deterministic on
-backward calls across threads, because two backward calls might access and try
-to accumulate the same ``.grad`` attribute. This is technically not safe, and
-it might result in racing condition and the result might be invalid to use.
+If you are calling ``backward()`` from multiple threads concurrently and have
+shared inputs (i.e. Hogwild CPU training), then non-determinsim should be expected.
+This can occur because parameters are automatically shared across threads,
+as such, multiple threads may access and try to accumulate the same ``.grad``
+attribute during gradient accumulation. This is technically not safe, and
+it might result in race condition and the result might be invalid to use.
 
-But this is expected pattern if you are using the multithreading approach to
-drive the whole training process but using shared parameters, user who use
-multithreading should have the threading model in mind and should expect this
-to happen. User could use the functional API :func:`torch.autograd.grad` to
-calculate the gradients instead of ``backward()`` to avoid non-determinism.
+Users developing multithreaded models featuring shared parameters should have the
+threading model in mind and should understand the issues described above.
+
+The functional API :func:`torch.autograd.grad` may be used to calculate the
+gradients instead of ``backward()`` to avoid non-determinism.
 
 Graph retaining
 ^^^^^^^^^^^^^^^
@@ -368,9 +368,9 @@ Thread Safety on Autograd Node
 
 Since Autograd allows the caller thread to drive its backward execution for
 potential parallelism, it's important that we ensure thread safety on CPU with
-parallel backwards that share part/whole of the GraphTask.
+parallel ``backward()`` calls that share part/whole of the GraphTask.
 
-Custom Python ``autograd.Function`` is automatically thread safe because of GIL.
+Custom Python ``autograd.Function``\s are automatically thread safe because of GIL.
 For built-in C++ Autograd Nodes (e.g. AccumulateGrad, CopySlices) and custom
 ``autograd::Function``\s, the Autograd Engine uses thread mutex locking to ensure
 thread safety on autograd Nodes that might have state write/read.
@@ -440,8 +440,8 @@ It also turns out that no interesting real-valued objective fulfill the
 Cauchy-Riemann equations. So the theory with homomorphic function cannot be
 used for optimization and most people therefore use the Wirtinger calculus.
 
-Wirtinger Calculus comes in picture ...
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Wirtinger Calculus comes into the picture ...
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 So, we have this great theory of complex differentiability and
 holomorphic functions, and we can’t use any of it at all, because many

From 94dd49655ac1fe2e5661257bbae94e9e4737354c Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 6 Dec 2022 07:14:16 +0000
Subject: [PATCH 1619/1922] Revert "as_strided: Fix default storage_offset for
 reference implementation (#89513)"

This reverts commit ba70a8be03f2fca222deee030bf7d9d15260b549.

Reverted https://github.com/pytorch/pytorch/pull/89513 on behalf of https://github.com/kit1980 due to Broke multiple workflows, 2 unexpected successes for autograd tests
---
 test/functorch/test_aotdispatch.py            |  1 -
 test/functorch/test_ops.py                    | 14 +---
 test/functorch/test_vmap.py                   |  2 -
 torch/_refs/__init__.py                       | 10 +--
 .../_internal/common_methods_invocations.py   | 72 +++----------------
 5 files changed, 11 insertions(+), 88 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 165041edfb306..2434e35ab4871 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1776,7 +1776,6 @@ def forward(self, x):
     xfail('scatter_reduce', 'prod'),
 
     skip('as_strided_scatter'),
-    xfail('as_strided', 'partial_views'),
 
     # Too annoying to generate random inputs
     xfail('cholesky'),
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 7a2f5b8dc61a1..b643a44cab6b9 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -414,7 +414,6 @@ def wrapped_fn(*args, **kwargs):
         # BUG
         # AssertionError: Tensor-likes are not close!
         xfail('as_strided'),
-        xfail('as_strided', 'partial_views'),
         decorate('linalg.det', 'singular',
                  decorator=unittest.skipIf(IS_MACOS and IS_X86, "Fails on x86 MacOS CI")),
     }))
@@ -527,7 +526,6 @@ def maybe_clone_inputs():
         xfail('as_strided'),
         xfail('as_strided_scatter'),
         xfail('_softmax_backward_data', device_type='cpu'),
-        xfail('as_strided', 'partial_views'),
     }))
     @opsToleranceOverride('TestOperators', 'test_vjp', (
         tol1('nn.functional.conv_transpose3d',
@@ -657,7 +655,6 @@ def fn(inp, *args, **kwargs):
         skip("atleast_3d"),  # Takes too long
         skip("ormqr"),  # Takes too long
         xfail("as_strided"),  # incorrect output
-        xfail("as_strided", "partial_views"),  # incorrect output
         xfail("as_strided_scatter"),  # incorrect output
         skip("bernoulli"),  # calls random op
         xfail("bfloat16"),  # rank 4 tensor for channels_last
@@ -738,9 +735,6 @@ def fn(inp, *args, **kwargs):
         tol1('svd',
              {torch.float32: tol(atol=1e-03, rtol=5e-04)}),
     ))
-    @skipOps('TestOperators', 'test_vmapvjpvjp', {
-        xfail('as_strided', 'partial_views'),
-    })
     def test_vmapvjpvjp(self, device, dtype, op):
         # Since, we test `vjpvjp` independently,
         # for this test, we just verify that vmap
@@ -808,7 +802,6 @@ def vjp_of_vjp(*args_and_cotangents):
         xfail('svd_lowrank', ''),  # randomness
         xfail('to_sparse', ''),  # non-dense output
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
-        xfail('as_strided', 'partial_views'),
         # ----------------------------------------------------------------------
 
         # ---------------------------- BUGS ------------------------------------
@@ -858,9 +851,7 @@ def vjp_of_vjp(*args_and_cotangents):
         tol1('linalg.householder_product',
              {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
     ))
-    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail.union({
-        xfail('as_strided', 'partial_views'),
-    }))
+    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail)
     def test_vmapvjp(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
@@ -908,7 +899,6 @@ def test_vmapvjp(self, device, dtype, op):
         decorate('linalg.det', 'singular', decorator=unittest.skipIf(IS_MACOS, "Fails on x86 MacOS CI")),
         skip('nn.functional.max_pool1d'),  # fails on cpu, runs on cuda
         xfail('masked.mean'),  # silent incorrectness (nan difference)
-        xfail('as_strided', 'partial_views'),  # Tensor-likes are not close!
 
         xfail('nn.functional.soft_margin_loss', ''),  # soft_margin_loss_backward does not support forward-ad
         xfail('tensor_split'),  # data_ptr composite compliance
@@ -1210,7 +1200,6 @@ def test():
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
         xfail("_native_batch_norm_legit"),
-        xfail('as_strided', 'partial_views'),
     }))
     def test_vjpvmap(self, device, dtype, op):
         # NB: there is no vjpvmap_has_batch_rule test because that is almost
@@ -1393,7 +1382,6 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
 
         # Potential bugs/errors
         xfail('as_strided'),  # AssertionError: Tensor-likes are not close!
-        xfail('as_strided', 'partial_views'),  # AssertionError: Tensor-likes are not close!
         xfail('as_strided_scatter'),  # AssertionError: Tensor-likes are not close!
         xfail('bernoulli'),  # calls random op
         xfail('bfloat16'),  # required rank 4 tensor to use channels_last format
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index b85da534be8fa..64776804e3d31 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3301,7 +3301,6 @@ def test():
         xfail('triu'),  # Exception not raised on error input
         # The error inputs are vectors, that pass when batched as they are treated as a matrix
         xfail('trace'),
-        xfail('as_strided', 'partial_views'),
     }))
     def test_vmap_exhaustive(self, device, dtype, op):
         # needs to be fixed
@@ -3317,7 +3316,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
     ))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps('TestVmapOperatorsOpInfo', 'test_op_has_batch_rule', vmap_fail.union({
-        xfail('as_strided', 'partial_views'),
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         xfail('complex'),
         xfail('copysign'),
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 3539784e8e32c..04bf9e12927fa 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2513,15 +2513,9 @@ def atleast_3d(
 
 
 def as_strided(
-    a: TensorLikeType,
-    size: ShapeType,
-    stride: StrideType,
-    storage_offset: Optional[int] = None,
+    a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int = 0
 ) -> TensorLikeType:
-    storage_offset_int = (
-        storage_offset if storage_offset is not None else a.storage_offset()
-    )
-    return prims.as_strided(a, size, stride, storage_offset_int)
+    return prims.as_strided(a, size, stride, storage_offset)
 
 
 def broadcast_shapes(*shapes) -> ShapeType:
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 91e1e8a1d636a..d11c275cc220c 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -263,15 +263,9 @@ def sample_inputs_as_strided(op_info, device, dtype, requires_grad, **kwargs):
         kwargs = dict(storage_offset=storage_offset)
         yield SampleInput(input_t, args=(output_shape, stride), kwargs=kwargs)
 
-def sample_inputs_as_strided_partial_views(op_info, device, dtype, requires_grad, **kwargs):
-    def make_arg():
-        base = make_tensor((20,), device=device, dtype=dtype)
-        return base[5:15].requires_grad_(requires_grad)
-
     # as_strided on offset, partial views
-    yield SampleInput(make_arg(), (2, 2), (1, 2))
-    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=0)
-    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=10)
+    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)))
+    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)), kwargs={'storage_offset': 0})
 
 def sample_inputs_as_strided_scatter(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -10727,6 +10721,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type='cpu'),
            )),
     OpInfo('as_strided',
+           op=lambda x, size, stride, storage_offset=0:
+               torch.as_strided(x, size, stride, storage_offset=storage_offset),
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
            supports_out=False,
            supports_forward_ad=True,
@@ -10747,47 +10743,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.skip("Numerous errors"), 'TestFwdGradients'),
-               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'),
-           )),
-    OpInfo('as_strided',
-           variant_test_name='partial_views',
-           dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
-           supports_out=False,
-           supports_forward_ad=True,
-           supports_fwgrad_bwgrad=True,
-           # vmap does not support inplace views
-           check_inplace_batched_forward_grad=False,
-           sample_inputs_func=sample_inputs_as_strided_partial_views,
-           skips=(
-               # Note: This xfail is fine -- it's inherent to how as_strided works
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples'),
-               # RuntimeError: This operator is not Composite Compliant: the
-               # storage_offset of the tensor was modified directly without
-               # going through the PyTorch dispatcher.
-               DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance'),
-
-
-               # These fail because the test changes the input's in-memory layout
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_complex_half_reference_testing'),
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
-               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
-               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_fn_fwgrad_bwgrad',
-                            dtypes=(torch.complex64, torch.complex128)),
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_inplace_forward_mode_AD'),
-               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_grad'),
-               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_gradgrad'),
-               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo',
-                            'test_make_fx_symbolic_exhaustive_inplace'),
-               DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'),
-               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_correctness'),
-               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'),
-               # Fail but are also flaky
-               DecorateInfo(unittest.skip("Test changes in memory layout"), 'TestMathBits'),
-               DecorateInfo(unittest.skip("Modifies input strides and storage_offset"), 'TestCommon',
-                            'test_non_standard_bool_values'),
-           )),
+               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'))),
     OpInfo('as_strided_scatter',
            op=lambda x, src, size, stride, storage_offset=0:
                torch.as_strided_scatter(x, src, size, stride, storage_offset=storage_offset),
@@ -18326,27 +18282,15 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
         supports_nvfuser=False,
         skips=(
+            # TODO: fix and/or update to xfails
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"),
+                         'TestCommon', 'test_python_ref_meta'),
             # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
         ),
     ),
-    PythonRefInfo(
-        "_refs.as_strided",
-        torch_opinfo_name="as_strided",
-        torch_opinfo_variant_name="partial_views",
-        # FIXME: doesn't support chalf
-        dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
-        supports_nvfuser=False,
-        skips=(
-            # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
-            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
-            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
-            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
-        ),
-    ),
     PythonRefInfo(
         "_refs.broadcast_shapes",
         torch_opinfo_name="broadcast_shapes",

From c9387511183b2dc673955cfea5151c9f6acf8c35 Mon Sep 17 00:00:00 2001
From: Ryan Spring <rdspring1@gmail.com>
Date: Tue, 6 Dec 2022 07:16:19 +0000
Subject: [PATCH 1620/1922] Add factory functions to python frontend (#89230)

- Add `full` nvprim to support factory functions because the full reference uses `empty` and `fill` while we have a full factory function.
- Change `full_like` reference to call `full` to avoid defining another nvprim.
- Enable support for new_zeros to enable `cudnn_batch_norm` decomposition.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89230
Approved by: https://github.com/kevinstephano, https://github.com/mruberry
---
 test/test_prims.py                            | 48 +++++++++-
 torch/_prims/context.py                       | 13 +++
 torch/_prims/nvfuser_prims.py                 | 88 ++++++++++++++++++
 torch/_prims_common/__init__.py               |  1 +
 .../cuda/python_frontend/fusion_record.h      | 90 +++++++++++++++++++
 .../cuda/python_frontend/python_bindings.cpp  | 21 ++++-
 6 files changed, 258 insertions(+), 3 deletions(-)

diff --git a/test/test_prims.py b/test/test_prims.py
index 23e7c47b023a4..0b86c433b89ae 100644
--- a/test/test_prims.py
+++ b/test/test_prims.py
@@ -234,6 +234,46 @@ def func(x):
         partitions = partitioner.propose_partitions()
         self.assertEqual(len(partitions), 1)
 
+    @onlyCUDA
+    @skipCUDAIfRocm
+    @dtypes(torch.float32)
+    def test_full(self, device, dtype):
+        from torch.fx.experimental.proxy_tensor import make_fx
+        from torch._prims.context import TorchRefsNvfuserCapabilityMode
+        from torch._prims.executor import execute
+
+        def func1(size, value, b):
+            return (torch.full(size, value, dtype=dtype, device=device),)
+
+        def func2(size, value, b):
+            a = torch.full(size, value, dtype=dtype, device=device)
+            b_sin = b.sin()
+            return (torch.add(a, b_sin),)
+
+        def func3(size, value, b):
+            return (torch.full(size, value, dtype=dtype, device=device), b)
+
+        def func4(size, value, b):
+            b_sin = b.sin()
+            return (torch.full(size, value, dtype=dtype, device=device), b_sin)
+
+        def func5(size, value, b):
+            b_sin = b.sin()
+            a = torch.full(size, value, dtype=dtype, device=device)
+            a_sin = a.sin()
+            return (a, b_sin, a_sin)
+
+        for func in (func1, func3, func2, func3, func4, func5):
+            size = (3, 3)
+            value = 10
+            b = torch.randn(*size, dtype=dtype, device=device)
+
+            with TorchRefsNvfuserCapabilityMode():
+                gm = make_fx(func)(size, value, b)
+
+            out = execute(gm, size, value, b, executor="strictly_nvfuser")
+            self.assertEqual(out, func(size, value, b))
+
     @onlyCUDA
     @skipCUDAIfRocm
     def test_nvfuser_empty_fusion(self, device):
@@ -687,7 +727,13 @@ def func(
 
             # Check that the graph can be executed with nvFuser
             out = execute(gm, sample.input, *sample.args, executor="nvfuser")
-            self.assertEqual(out, gm(sample.input, *sample.args))
+            ref_out = gm(sample.input, *sample.args)
+            for idx, (left, right) in enumerate(zip(out, ref_out)):
+                # Nvfuser does not support torch.uint8 dtype so check reserve output against 0 scalar
+                if idx == 3:
+                    self.assertTrue(torch.all(torch.eq(left, 0)))
+                else:
+                    self.assertEqual(left, right)
 
     # decomposition of native_batch_norm_backward uses a casting, which prevents nvprim lowering on CPU build
     @onlyCUDA
diff --git a/torch/_prims/context.py b/torch/_prims/context.py
index b9f6e634bb49b..22452e4daefcf 100644
--- a/torch/_prims/context.py
+++ b/torch/_prims/context.py
@@ -364,6 +364,16 @@ def _is_rand_like(self, func):
         )
         return result
 
+    def _is_full(self, func):
+        result = "torch.full" == torch.overrides.resolve_name(func) or (
+            func
+            in [
+                torch.ops.aten.full,
+                torch.ops.aten.full.names,
+            ]
+        )
+        return result
+
     def __torch_function__(
         self,
         orig_func: Callable,
@@ -416,5 +426,8 @@ def __torch_function__(
                 warn("rand_like has ignored kwargs!")
             return torch.ops.nvprims.rand_like(*args)
 
+        if self._is_full(orig_func):
+            return torch.ops.nvprims.full(*args, **kwargs)
+
         # Then we use TorchRefsMode to interpret the rest
         return super().__torch_function__(orig_func, types, args, kwargs)
diff --git a/torch/_prims/nvfuser_prims.py b/torch/_prims/nvfuser_prims.py
index 3da16ab3aa275..fc70bdbc0a124 100644
--- a/torch/_prims/nvfuser_prims.py
+++ b/torch/_prims/nvfuser_prims.py
@@ -8,6 +8,7 @@
 from typing import Any, Dict, Optional, Tuple
 
 import torch
+import torch._prims_common as utils
 
 from torch._prims_common import (
     DimsSequenceType,
@@ -15,6 +16,7 @@
     ELEMENTWISE_TYPE_PROMOTION_KIND,
     getnvFuserDtype,
     make_contiguous_strides_for,
+    NumberType,
     ShapeType,
     TensorLikeType,
 )
@@ -341,6 +343,26 @@ def _clone_nvfuser(fd: Any, input: TensorLikeType, *, memory_format=None):
     return fd.ops.set(input)
 
 
+def _full_nvfuser(
+    fd: Any,
+    shape: ShapeType,
+    fill_value: NumberType,
+    *,
+    dtype: Optional[torch.dtype] = None,
+    layout: Optional[torch.layout] = None,
+    device: Optional[torch.device] = None,
+    pin_memory: bool = False,
+    requires_grad: bool = False,
+):
+    assert device != torch.device("cpu")
+    assert layout is None or layout is torch.strided
+    assert pin_memory is False
+    assert requires_grad is False
+    dtype = dtype if dtype is not None else utils.type_to_dtype(type(fill_value))
+    nvfuser_dtype = getnvFuserDtype(dtype)
+    return fd.ops.full(shape, fill_value, nvfuser_dtype)
+
+
 _nvfuser_impls["native_batch_norm"] = _native_batch_norm_nvfuser
 _nvfuser_impls["broadcast_in_dim"] = _broadcast_in_dim_nvfuser
 _nvfuser_impls["convert_element_type"] = _convert_element_type_nvfuser
@@ -355,6 +377,70 @@ def _clone_nvfuser(fd: Any, input: TensorLikeType, *, memory_format=None):
 _nvfuser_impls["var_mean"] = _var_mean_nvfuser
 _nvfuser_impls["amax"] = _amax_nvfuser
 _nvfuser_impls["amin"] = _amin_nvfuser
+_nvfuser_impls["full"] = _full_nvfuser
+
+
+def register_full():
+    name = "full"
+
+    nvprim.define(
+        "full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, "
+        + "bool? pin_memory=None, bool? requires_grad=None) -> Tensor"
+    )
+
+    def _meta_impl(
+        size,
+        fill_value,
+        *,
+        out=None,
+        dtype=None,
+        layout=None,
+        device=None,
+        requires_grad=False,
+    ):
+        strides = make_contiguous_strides_for(size)
+        return torch._prims.TensorMeta(
+            None,
+            shape=size,
+            strides=strides,
+            dtype=dtype,
+            device=device,
+        )
+
+    def _prim_impl(
+        size,
+        fill_value,
+        *,
+        out=None,
+        dtype=None,
+        layout=None,
+        device=None,
+        pin_memory=False,
+        requires_grad=False,
+    ):
+        return torch.full(
+            size,
+            fill_value,
+            out=out,
+            dtype=dtype,
+            layout=layout,
+            device=device,
+            pin_memory=pin_memory,
+            requires_grad=requires_grad,
+        )
+
+    nvprim_impl.impl(name, _prim_impl)
+    nvprim_meta_impl.impl(name, _meta_impl)
+
+    prim_packet = getattr(torch.ops.nvprims, name)
+    prim = prim_packet.default
+    nvprim_autograd_impl.impl(name, backwards_not_supported(prim))
+    for p in (prim_packet, prim):
+        p.__doc__ = "Create a tensor with given size and filled with value"
+        p.impl_nvfuser = _nvfuser_impls["full"]
+        p.is_recomputable = _nvfuser_is_recomputable["full"]
+        p.return_type = torch._prims_common.RETURN_TYPE.NEW  # type: ignore[attr-defined]
+
 
 # functorch.compile.min_cut_rematerialization_partition accepts a list of
 # operators that can be recomputed in the backward pass. This list is used to
@@ -397,6 +483,7 @@ def _clone_nvfuser(fd: Any, input: TensorLikeType, *, memory_format=None):
     "expm1": True,
     "floor": True,
     "fmod": True,
+    "full": True,
     "ge": True,
     "gt": True,
     "imag": True,
@@ -715,6 +802,7 @@ def register_nvprims():
     register_view()
     register_native_batch_norm()
     register_rand_like()
+    register_full()
 
     for name in nvprim_names:
         main_prim = getattr(torch.ops.prims, name)
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index 647b0e66729e2..f5cef4f11b11b 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -20,6 +20,7 @@
         torch.bfloat16: DataType.BFloat16,
         torch.long: DataType.Int,
         torch.int: DataType.Int32,
+        torch.uint8: DataType.Int32,
         torch.bool: DataType.Bool,
         # Python scalars
         complex: DataType.ComplexDouble,
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
index b8105f1e4fb87..1974fc66f6fa9 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
@@ -33,6 +33,7 @@ enum class RecordType {
   VarianceMeanOp,
   ViewOp,
   PermuteOp,
+  FullOp
 };
 
 //! RecordFunctor is the base class record for operations recorded by
@@ -1581,6 +1582,95 @@ struct BatchNormOpRecord : RecordFunctor {
   bool channels_last_;
 };
 
+struct FullOpRecord : RecordFunctor {
+  FullOpRecord(
+      std::vector<State> _args,
+      std::vector<State> _outputs,
+      std::vector<int64_t>& shape,
+      Nvf::DataType dtype)
+      : RecordFunctor(
+            std::move(_args),
+            std::move(_outputs),
+            "ops.full",
+            RecordType::FullOp),
+        shape_(std::move(shape)),
+        dtype_(dtype) {}
+  virtual ~FullOpRecord() = default;
+  virtual RecordFunctor* clone() final {
+    return new FullOpRecord(*this);
+  }
+
+  //! Child specific hash function in lower 32 bits.
+  //! | 31 --- 24 | 23 --------------------------  0 |
+  //! | Dtype     | Shape hash code                  |
+  virtual size_t hash() const final {
+    auto result = RecordFunctor::hash();
+    size_t shape_hash = 0;
+    for (auto p : shape_) {
+      shape_hash ^= static_cast<size_t>(p);
+    }
+    result |= ((static_cast<size_t>(dtype_) & 0xff) << 24);
+    result |= (shape_hash & 0xffff);
+    return result;
+  }
+
+  virtual bool operator==(const RecordFunctor& other) const final {
+    auto result = false;
+    if (auto child_ptr = dynamic_cast<const FullOpRecord*>(&other)) {
+      result = RecordFunctor::operator==(other);
+      if (result) {
+        result = (shape_.size() == child_ptr->shape_.size());
+        if (result) {
+          for (size_t i = 0; i < shape_.size(); ++i) {
+            if (shape_[i] != child_ptr->shape_[i]) {
+              result = false;
+              break;
+            }
+          }
+        }
+      }
+    }
+    return result;
+  }
+
+  void operator()(FusionDefinition& fd) final {
+    auto arg = fd.getFusionState(args_.at(0).index)->template as<Nvf::Val>();
+
+    std::vector<torch::jit::fuser::cuda::Val*> nvf_shape(
+        shape_.size(), nullptr);
+    for (const auto idx : c10::irange(shape_.size())) {
+      nvf_shape[idx] = Nvf::IrBuilder::create<Nvf::Int>(shape_.at(idx));
+    }
+    auto output = torch::jit::fuser::cuda::full(nvf_shape, arg, dtype_);
+    fd.setFusionState(outputs_.at(0).index, output);
+  }
+
+  virtual void print(std::ostream& os, bool close_function = true) const {
+    RecordFunctor::print(os, false);
+    os << ", shape=[";
+    bool first_arg = true;
+    for (auto p : shape_) {
+      if (first_arg) {
+        first_arg = false;
+      } else {
+        os << ", ";
+      }
+      os << p;
+    }
+    os << "]";
+    os << ", dtype=" << dtypeToPyString(dtype_);
+    if (close_function) {
+      os << ")";
+    }
+  }
+
+ private:
+  //! Represents shape of new tensor
+  std::vector<int64_t> shape_;
+  //! Type of output
+  Nvf::DataType dtype_;
+};
+
 } // namespace nvfuser
 
 //! Creating the template specialized hash and equal_to functions for a
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
index 68fe709deb78f..fc9d105100b9c 100644
--- a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
+++ b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
@@ -1210,7 +1210,6 @@ void initNvFuserPythonBindings(PyObject* module) {
       py::arg("arg"),
       py::arg("dims"),
       py::return_value_policy::reference);
-
   nvf_ops.def(
       "squeeze",
       [](nvfuser::FusionDefinition::Operators& self,
@@ -1250,7 +1249,25 @@ void initNvFuserPythonBindings(PyObject* module) {
       py::arg("original_shape"),
       py::arg("new_shape"),
       py::return_value_policy::reference);
-
+  nvf_ops.def(
+      "full",
+      [](nvfuser::FusionDefinition::Operators& self,
+         std::vector<int64_t>& size,
+         nvfuser::Scalar arg,
+         Nvf::DataType dtype) -> nvfuser::Tensor {
+        nvfuser::FusionDefinition* fd = self.fusion_definition;
+        nvfuser::Tensor output = fd->defineTensor();
+        fd->defineRecord(new nvfuser::FullOpRecord(
+            {fd->recordingState(arg())},
+            {fd->recordingState(output())},
+            size,
+            dtype));
+        return output;
+      },
+      py::arg("size"),
+      py::arg("arg"),
+      py::arg("dtype"),
+      py::return_value_policy::reference);
   nvf_ops.def(
       "var",
       [](nvfuser::FusionDefinition::Operators& self,

From 907fddcfae889fb82d295853169bb6b583cb3e85 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Tue, 6 Dec 2022 07:22:37 +0000
Subject: [PATCH 1621/1922] Revert "[PT-D][Composability][1/N] Upstream
 NamedOptimizer from TorchRec (KeyedOptimizer in TR) (#89480)"

This reverts commit 31ec1a1ef7032508fc36f0b70692832acbeed72d.

Reverted https://github.com/pytorch/pytorch/pull/89480 on behalf of https://github.com/kit1980 due to Broke test_correct_module_names
---
 .../distributed/optim/test_named_optimizer.py | 245 -----------------
 torch/distributed/optim/__init__.py           |   1 -
 torch/distributed/optim/named_optimizer.py    | 258 ------------------
 3 files changed, 504 deletions(-)
 delete mode 100644 test/distributed/optim/test_named_optimizer.py
 delete mode 100644 torch/distributed/optim/named_optimizer.py

diff --git a/test/distributed/optim/test_named_optimizer.py b/test/distributed/optim/test_named_optimizer.py
deleted file mode 100644
index 880dbb382aa6a..0000000000000
--- a/test/distributed/optim/test_named_optimizer.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Owner(s): ["oncall: distributed"]
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import unittest
-
-import torch
-import torch.nn as nn
-
-from torch.distributed.optim import _NamedOptimizer
-
-
-class TestDummyModel(torch.nn.Module):
-    def __init__(self):
-        super(TestDummyModel, self).__init__()
-        torch.manual_seed(0)
-        self.net1 = nn.Sequential(nn.Linear(8, 16), nn.ReLU())
-        self.net2 = nn.Sequential(nn.Linear(16, 32), nn.ReLU())
-        self.net3 = nn.Linear(32, 64)
-        self.net4 = nn.Sequential(nn.ReLU(), nn.Linear(64, 8))
-
-    def forward(self, x):
-        return self.net4(self.net3(self.net2(self.net1(x))))
-
-
-class NamedOptimizerTest(unittest.TestCase):
-    def _compare_state_dict_group(self, group, named_group, assert_equal=True):
-        for key, val in group.items():
-            if key != "params":
-                self.assertTrue(
-                    key in named_group, f"{key} not in named optimizer state dict"
-                )
-                err_msg = (
-                    f"{key} state not equal" if assert_equal else f"{key} state equal"
-                )
-                if isinstance(val, torch.Tensor):
-                    fn = self.assertTrue if assert_equal else self.assertFalse
-                    fn(torch.allclose(val, named_group[key]), err_msg)
-                else:
-                    fn = self.assertEqual if assert_equal else self.assertNotEqual
-                    fn(val, named_group[key], err_msg)
-
-    def test_state_dict(self):
-        """Check that NamedOptimizer exposes the expected state dict
-        interface."""
-        m = TestDummyModel()
-        m_dup = TestDummyModel()
-        optim_1 = torch.optim.SGD(
-            [
-                {"params": m.net1.parameters()},
-                {"params": m.net3.parameters(), "lr": 1e-3},
-            ],
-            lr=1e-2,
-            momentum=0.9,
-        )
-
-        optim_2 = torch.optim.Adam(
-            [
-                {"params": m.net2.parameters()},
-                {"params": m.net4.parameters(), "lr": 1e-5},
-            ]
-        )
-
-        named_optim_1 = _NamedOptimizer(
-            m_dup.named_parameters(),
-            torch.optim.SGD,
-            [
-                {"params": m_dup.net1.parameters()},
-                {"params": m_dup.net3.parameters(), "lr": 1e-3},
-            ],
-            lr=1e-2,
-            momentum=0.9,
-        )
-
-        named_optim_2 = _NamedOptimizer(
-            m_dup.named_parameters(),
-            torch.optim.Adam,
-            [
-                {"params": m_dup.net2.parameters()},
-                {"params": m_dup.net4.parameters(), "lr": 1e-5},
-            ],
-        )
-        for i in range(2):
-            x = torch.rand(5, 8)
-            y = m(x)
-            y.sum().backward()
-            optim_1.step()
-            optim_2.step()
-
-            y = m_dup(x)
-            y.sum().backward()
-            named_optim_1.step()
-            named_optim_2.step()
-
-        sd_1 = optim_1.state_dict()
-        sd_2 = optim_2.state_dict()
-        named_sd_1 = named_optim_1.state_dict()
-        named_sd_2 = named_optim_2.state_dict()
-
-        # Compare "state" in optim state dict
-        self._compare_state_dict_group(
-            sd_1["state"][0],
-            named_sd_1["state"]["net1.0.weight"],
-            assert_equal=True,
-        )
-        self._compare_state_dict_group(
-            sd_2["state"][1],
-            named_sd_2["state"]["net2.0.bias"],
-            assert_equal=True,
-        )
-        self._compare_state_dict_group(
-            sd_1["state"][2],
-            named_sd_1["state"]["net3.weight"],
-            assert_equal=True,
-        )
-        self._compare_state_dict_group(
-            sd_2["state"][3],
-            named_sd_2["state"]["net4.1.bias"],
-            assert_equal=True,
-        )
-
-        # Compare "param_groups" in optim state dict
-        self._compare_state_dict_group(
-            sd_1["param_groups"][0],
-            named_sd_1["param_groups"][0],
-            assert_equal=True,
-        )
-        self._compare_state_dict_group(
-            sd_2["param_groups"][1], named_sd_2["param_groups"][1], assert_equal=True
-        )
-
-    def test_load_state_dict(self):
-        """Check that NamedOptimizer exposes the expected state dict
-        interface."""
-        m = TestDummyModel()
-        named_optim_1 = _NamedOptimizer(
-            m.named_parameters(),
-            torch.optim.SGD,
-            lr=1e-2,
-            momentum=0.9,
-        )
-
-        for _ in range(2):
-            x = torch.rand(5, 8)
-            y = m(x)
-            y.sum().backward()
-            named_optim_1.step()
-
-        state_dict_to_load = named_optim_1.state_dict()
-
-        named_optim_2 = _NamedOptimizer(
-            m.named_parameters(),
-            torch.optim.SGD,
-            lr=1e-2,
-            momentum=0.6,
-        )
-
-        for _ in range(2):
-            x = torch.rand(5, 8)
-            y = m(x)
-            y.sum().backward()
-            named_optim_2.step()
-
-        state_dict_before_load = named_optim_2.state_dict()
-
-        # Compare "state" in optim state dict
-        self._compare_state_dict_group(
-            state_dict_to_load["state"]["net1.0.weight"],
-            state_dict_before_load["state"]["net1.0.weight"],
-            assert_equal=False,
-        )
-        self._compare_state_dict_group(
-            state_dict_to_load["state"]["net2.0.bias"],
-            state_dict_before_load["state"]["net2.0.bias"],
-            assert_equal=False,
-        )
-        self._compare_state_dict_group(
-            state_dict_to_load["state"]["net3.weight"],
-            state_dict_before_load["state"]["net3.weight"],
-            assert_equal=False,
-        )
-        self._compare_state_dict_group(
-            state_dict_to_load["state"]["net4.1.bias"],
-            state_dict_before_load["state"]["net4.1.bias"],
-            assert_equal=False,
-        )
-
-        named_optim_2.load_state_dict(state_dict_to_load)
-        state_dict_after_load = named_optim_2.state_dict()
-
-        # Compare "state" in optim state dict
-        self._compare_state_dict_group(
-            state_dict_to_load["state"]["net1.0.weight"],
-            state_dict_after_load["state"]["net1.0.weight"],
-            assert_equal=True,
-        )
-        self._compare_state_dict_group(
-            state_dict_to_load["state"]["net2.0.bias"],
-            state_dict_after_load["state"]["net2.0.bias"],
-            assert_equal=True,
-        )
-        self._compare_state_dict_group(
-            state_dict_to_load["state"]["net3.weight"],
-            state_dict_after_load["state"]["net3.weight"],
-            assert_equal=True,
-        )
-        self._compare_state_dict_group(
-            state_dict_to_load["state"]["net4.1.bias"],
-            state_dict_after_load["state"]["net4.1.bias"],
-            assert_equal=True,
-        )
-
-    def test_load_state_dict_error(self):
-        m = TestDummyModel()
-        named_optim_1 = _NamedOptimizer(
-            m.named_parameters(),
-            torch.optim.SGD,
-            lr=1e-2,
-            momentum=0.9,
-        )
-
-        for _ in range(2):
-            x = torch.rand(5, 8)
-            y = m(x)
-            y.sum().backward()
-            named_optim_1.step()
-
-        state_dict_to_load = named_optim_1.state_dict()
-
-        named_optim_2 = _NamedOptimizer(
-            m.named_parameters(),
-            torch.optim.SGD,
-            lr=1e-2,
-            momentum=0.6,
-        )
-
-        err_msg = (
-            "Expects the optim to be initialized before load but found not initialized"
-        )
-        with self.assertRaisesRegex(ValueError, err_msg):
-            named_optim_2.load_state_dict(state_dict_to_load)
diff --git a/torch/distributed/optim/__init__.py b/torch/distributed/optim/__init__.py
index 9a83c7aabf772..950222b8d5fa8 100644
--- a/torch/distributed/optim/__init__.py
+++ b/torch/distributed/optim/__init__.py
@@ -16,7 +16,6 @@
 from .functional_rmsprop import _FunctionalRMSprop
 from .functional_rprop import _FunctionalRprop
 from .functional_adamax import _FunctionalAdamax
-from .named_optimizer import _NamedOptimizer
 from .utils import as_functional_optim
 from .apply_optimizer_in_backward import _apply_optimizer_in_backward
 
diff --git a/torch/distributed/optim/named_optimizer.py b/torch/distributed/optim/named_optimizer.py
deleted file mode 100644
index e2eea5099c0f9..0000000000000
--- a/torch/distributed/optim/named_optimizer.py
+++ /dev/null
@@ -1,258 +0,0 @@
-import logging
-import warnings
-
-from copy import deepcopy
-from typing import Any, Collection, Dict, List, Mapping, Union
-
-import torch
-from torch import optim
-from torch.distributed._shard.sharded_tensor import ShardedTensor
-
-
-__all__ = ["_NamedOptimizer"]
-
-logger = logging.getLogger(__name__)
-
-
-class _NamedOptimizer(optim.Optimizer):
-    """
-    ``_NamedOptimizer`` takes a dict of parameters and exposes ``state_dict`` by
-    parameter key. We replace the original key (number) in an optim to the
-    fully qualifed name (FQN) string. User can initialize the optim as they
-    initialize a PyTorch optim, the only difference is that they also need to
-    pass in the FQN of each parameters.
-
-    Args:
-        named_parameters (Mapping[str, Union[torch.Tensor, ShardedTensor]]):
-            Mapping from FQN to parameter.
-        optimizer_class (optim.Optimizer):
-            The class of optimizer to instantiate.
-        param_groups (Collection[Mapping[str, Any]]):
-            `param_groups` to pass to optimizer if specified.
-            The key of the inner map needs to be FQNs.
-            Default: None
-        args: arguments to pass to the optimizer constructor.
-        kwargs: arguments to pass to the optimizer constructor.
-
-    Example::
-        >>> # xdoctest: +SKIP("distributed")
-        >>> from torch import optim
-        >>> from torch.distributed.optim import _NamedOptimizer
-        >>>
-        >>> # Define the named optimizer.
-        >>> m = Model(...)
-        >>> named_optim = _NamedOptimizer(m.named_parameters(), optim.SGD)
-        >>> # Forward pass + backward pass.
-        >>> named_optim.step()
-        >>> ...
-        >>> # Call state_dict for the named optimizer returns a FQN state_dict.
-        >>> named_optim.state_dict()
-
-    Warning: This API is still in development and subject to change.
-
-    TODO: Add tutorial for _NamedOptimizer.
-    TODO: Add documentation in the docstring for the public attributes
-          like self.param_groups and self.named_parameters.
-    """
-
-    def __init__(
-        self,
-        named_parameters: Mapping[str, Union[torch.Tensor, ShardedTensor]],
-        optimizer_class: optim.Optimizer,
-        param_groups: Collection[Mapping[str, Any]] = None,
-        *args,
-        **kwargs,
-    ) -> None:
-        torch._C._log_api_usage_once("torch.distributed.optim._NamedOptimizer")
-        self.param_groups: Collection[Mapping[str, Any]] = param_groups  # type: ignore[assignment]
-        self.named_parameters = dict(named_parameters)
-        params_for_optimizer = (
-            self.named_parameters.values() if param_groups is None else param_groups
-        )
-        self._optimizer = optimizer_class(  # type: ignore[operator]
-            params_for_optimizer,
-            *args,
-            **kwargs,
-        )
-        # TODO: Add param_groups validations and unit tests.
-        if param_groups is None:
-            self.ordered_param_keys = list(self.named_parameters.keys())
-        else:
-            warnings.warn(
-                "Since we pass in param_groups, we will use param_groups to "
-                "initialize the optimizer, not all parameters of the module."
-            )
-            param_to_key = {param: key for key, param in self.named_parameters.items()}  # type: ignore[misc, has-type]
-            ordered_param_keys = []
-            for group in param_groups:
-                for param in group["params"]:
-                    if param not in param_to_key:
-                        raise ValueError(
-                            f"Expect param name {param} found in param group but is missing."
-                        )
-                    ordered_param_keys.append(param_to_key[param])
-            self.ordered_param_keys = ordered_param_keys
-
-    def state_dict(self) -> Dict[str, Any]:
-        """
-        Return the ``state_dict`` of the optimzer. Instead of using number to index
-        parameters, we will use module fully qualifed name (FQN) as the key.
-        """
-        state_dict = self._optimizer.state_dict()
-        param_groups = state_dict["param_groups"]
-
-        ret_state = {
-            self.ordered_param_keys[st_key]: state_val
-            for st_key, state_val in state_dict["state"].items()
-        }
-
-        ret_groups = []
-        for group in param_groups:
-            param_keys = []
-            for param in group["params"]:
-                param_keys.append(self.ordered_param_keys[param])
-            ret_group = {"params": sorted(param_keys)}
-            for k, v in group.items():
-                if k != "params":
-                    ret_group[k] = deepcopy(v)
-            ret_groups.append(ret_group)
-
-        return {"state": ret_state, "param_groups": ret_groups}
-
-    def step(self):
-        """
-        Performs a single optimization step.
-
-        This will call :meth:`torch.optim.Optimizer.step` on the wrapped
-        optimizer.
-        """
-        self._optimizer.step()
-
-    def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
-        """
-        This public function defines the default behavior to load a state_dict
-        for ``_NamedOptimizer``.
-
-        Sample Code
-        ```
-            my_model = MyModule()
-            optimizer = _NamedOptimizer(my_model.named_parameters(), Adagrad)
-            ...
-
-            optim_state_dict = optimizer.state_dict()
-            ...
-            ...
-
-            optimizer.load_state_dict(optim_state_dict)
-            ...
-        ```
-        Args:
-            state_dict (Dict[str, Any]) : A ``state_dict`` to load into the optimizer.
-                Note that this state dict update is performed in place.
-
-        .. note:: PyTorch is using lazy init to initialize the optim states.
-            So it is possible that there is no optim state when user call
-            ``load_state_dict`` and for ``_NamedOptimizer`` we make it stricter
-            that users can only call ``load_state_dict`` after the state is initialized.
-            By doing this, we can validate the optim ``state_dict`` to be loaded.
-        """
-        new_state_dict = self._optimizer.state_dict()
-        state = state_dict["state"]
-        new_state = new_state_dict["state"]
-        if len(new_state) == 0:
-            raise ValueError(
-                "Expects the optim to be initialized before load but found not initialized."
-            )
-
-        # Load state of state_dict
-        if len(new_state) != len(state):
-            raise ValueError(
-                f"Expects equal length as {len(new_state)} in `state_dict` state length but found {len(state)}."
-            )
-        for idx, param_key in enumerate(self.ordered_param_keys):
-            if param_key not in state.keys():
-                raise ValueError(
-                    f"Expect {param_key} as a parameter in `state_dict` state but not found."
-                )
-            if len(state[param_key]) != len(new_state[idx]):
-                raise ValueError(
-                    f"Expects equal length as {len(new_state[idx])} for parameter {param_key} but found: {len(state[param_key])}"
-                )
-            # Iterate through all optimizer states.
-            for state_key, state_val in new_state[idx].items():
-                if state_key not in state[param_key]:
-                    raise ValueError(
-                        f"Expects state {state_key} for parameter {param_key} but not found."
-                    )
-
-                src_state_val = state[param_key][state_key]
-                if isinstance(state_val, ShardedTensor):
-                    assert isinstance(src_state_val, ShardedTensor)
-                    num_shards = len(state_val.local_shards())
-                    num_new_shards = len(src_state_val.local_shards())
-                    if num_shards != num_new_shards:
-                        raise ValueError(
-                            f"Expects equal number of shards as {num_new_shards} but found {num_shards} for {param_key}/{state_key}"
-                        )
-                    for shard, src_shard in zip(
-                        state_val.local_shards(), src_state_val.local_shards()
-                    ):
-                        shard.tensor.detach().copy_(src_shard.tensor)
-                elif isinstance(state_val, torch.Tensor):
-                    assert isinstance(src_state_val, torch.Tensor)
-                    state_val.detach().copy_(src_state_val)
-                else:
-                    new_state[idx][state_key] = deepcopy(src_state_val)
-
-        # Load param_groups of state_dict
-        src_param_groups = state_dict["param_groups"]
-        new_param_groups = new_state_dict["param_groups"]
-
-        if len(new_param_groups) != len(src_param_groups):
-            raise ValueError(
-                f"Expects equal param_groups count as {len(new_param_groups)} in `state_dict` but found {len(src_param_groups)}."
-            )
-        src_group_map = {}
-        for group in src_param_groups:
-            param_keys = []
-            for param_key in group["params"]:
-                param_keys.append(param_key)
-            src_group_map[_gen_param_group_key(param_keys)] = group
-        new_group_map = {}
-        for new_group in new_param_groups:
-            param_keys = []
-            for param_key in new_group["params"]:
-                param_keys.append(self.ordered_param_keys[param_key])  # type: ignore[call-overload]
-            new_group_map[_gen_param_group_key(param_keys)] = new_group
-        for group_key, new_group in new_group_map.items():
-            if group_key not in src_group_map:
-                raise ValueError(
-                    f"Expects group {group_key} to be in `state_dict` but is missing"
-                )
-            src_group = src_group_map[group_key]
-            if len(src_group) != len(new_group):
-                raise ValueError(
-                    f"Expects equal param_group size as {len(new_group)} for group {group_key} but found {len(src_group)}."
-                )
-            for k in src_group:
-                if k not in new_group:
-                    raise ValueError(
-                        f"Expects group key {k} to be in group {group_key} in `state_dict` but is missing."
-                    )
-                if k != "params":
-                    new_group[k] = deepcopy(src_group[k])
-
-        self._optimizer.load_state_dict(new_state_dict)
-
-    # pyre-ignore [2]
-    def add_param_group(self, param_group: Any) -> None:
-        raise NotImplementedError(
-            "add_param_group not supported yet and might be implemented soon."
-        )
-
-
-def _gen_param_group_key(param_keys: List[str]) -> str:
-    """
-    Concatenate all param keys as a unique indentifier for one param group.
-    """
-    return "/".join(sorted(param_keys))

From 0bf4fdfdfcd57b5e3d74cb3396e0e1eee9c3e4d2 Mon Sep 17 00:00:00 2001
From: AllenTiTaiWang <titaiwang@microsoft.com>
Date: Tue, 6 Dec 2022 03:36:20 +0000
Subject: [PATCH 1622/1922] [ONNX] Add src/index dynamic axes support for
 aten::scatter_add (#90090)

Extend from #89787 , and answer from https://github.com/onnx/onnx/issues/4672, dynamically catching shape of index can let converter further support on this op.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90090
Approved by: https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 11 ++++--
 torch/onnx/symbolic_opset16.py             | 42 +++++-----------------
 2 files changed, 17 insertions(+), 36 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 41e9a9973953a..b30056acb09d7 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -3950,6 +3950,14 @@ def forward(self, input, indices, src):
                 [torch.ones((2, 5)), torch.tensor([[0, 1, 2, 0, 0], [1, 0, 2, 1, 2]])],
                 name="src_indices_dynamic_combination2",
             ),
+            common_utils.subtest(
+                [torch.ones((3, 5)), torch.tensor([[0, 1, 2, 0, 0], [1, 0, 2, 1, 2]])],
+                name="src_indices_dynamic_combination3",
+            ),
+            common_utils.subtest(
+                [torch.ones((3, 5)), torch.tensor([[0, 1, 2, 0], [1, 0, 2, 1]])],
+                name="src_indices_dynamic_combination4",
+            ),
         ],
     )
     @skipIfUnsupportedMinOpsetVersion(16)
@@ -3959,9 +3967,6 @@ def forward(self, input, indices, src):
                 return input.scatter_add(0, indices, src)
 
         input = torch.zeros(3, 5, dtype=src.dtype)
-        # NOTE: Although index and src are set with different dynamic axes and name,
-        # they are required to be the same shape on all axes. In static shape, converter
-        # can apply Slice op to accommodate.
         self.run_test(
             ScatterModel(),
             input_args=(input, indices, src),
diff --git a/torch/onnx/symbolic_opset16.py b/torch/onnx/symbolic_opset16.py
index 0287fae5664aa..309309771df8a 100644
--- a/torch/onnx/symbolic_opset16.py
+++ b/torch/onnx/symbolic_opset16.py
@@ -27,6 +27,7 @@
 
 import functools
 
+import torch
 from torch.nn.functional import (
     GRID_SAMPLE_INTERPOLATION_MODES,
     GRID_SAMPLE_PADDING_MODES,
@@ -81,39 +82,14 @@ def scatter_add(g: jit_utils.GraphContext, self, dim, index, src):
             f"`index` ({index_sizes}) should have the same dimensionality as `src` ({src_sizes})",
         )
 
-    if src_sizes != index_sizes:
-        # In ONNX, src and index are required to be the same rank and shape
-        # However, in PyTorch, src is only required to have the same rank as index,
-        # and shape would be accomodated. In static shape, converter can apply Slice op
-        # to accommodate. We use Slice to adjust to shape of src if it's not the same
-        # as index.
-        # More detail on: https://github.com/onnx/onnx/issues/4672
-        axes = list()
-        ends = list()
-        # Align the dynamic sizes of src and index
-        # NOTE: Even if users set src and index with different dynamic axes, they are
-        # still expected to have the same shape in runtime in terms of ONNX spec.
-        # So the usage of different shape of src and index on dynamic size is not
-        # supported.
-        # More detail on: https://github.com/onnx/onnx/issues/4672
-        for idx, d in enumerate(index_sizes):
-            if d is None or src_sizes[idx] == d:
-                # 1. the axe with dynamic shape is ignored, and will be aligned by
-                # setType later
-                # 2. if the shape are the same, we don't need to slice
-                continue
-            if src_sizes[idx] < d:
-                return symbolic_helper._unimplemented(
-                    "scatter_add",
-                    f"`index` ({index_sizes}) should have smaller or equal (<=) size at any dimension than `src` ({src_sizes})",
-                )
-            axes.append(idx)
-            ends.append(d)
-        starts = [0] * len(ends)
-        if axes and starts and ends:
-            src = symbolic_helper._slice_helper(
-                g, src, axes=axes, starts=starts, ends=ends
-            )
+    # PyTorch only allows index shape <= src shape, so we can only consider
+    # taking index as subset size to src, like PyTorch does. When sizes for src
+    # and index are not matched or there are dynamic axes, we take index shape to
+    # slice src to accommodate.
+    if src_sizes != index_sizes or None in index_sizes:
+        adjusted_shape = g.op("Shape", index)
+        starts = g.op("Constant", value_t=torch.tensor([0] * len(index_sizes)))
+        src = g.op("Slice", src, starts, adjusted_shape)
 
     src = symbolic_helper._maybe_get_scalar(src)
     if symbolic_helper._is_value(src):

From ad58856e6ea33926508c7c8f7d5dee2875cb0c9d Mon Sep 17 00:00:00 2001
From: "Xia, Weiwen" <weiwen.xia@intel.com>
Date: Mon, 5 Dec 2022 09:54:56 +0800
Subject: [PATCH 1623/1922] [Quant] Add fused linear-leaky_relu op for onednn
 backend (#88478)

**Summary**
Post op fusion can reduce data movement overhead and improve inference performance. This PR adds fused `linear-leaky_relu` op for `onednn` backend, which will be used for int8 inference with `onednn` backend. Cannot call this op with other quantization backends otherwise an error is thrown.

**Test Plan**
python test_quantization.py TestQuantizedLinear

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88478
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
---
 .../ATen/native/quantized/cpu/OnednnUtils.h   | 17 +++-
 .../src/ATen/native/quantized/cpu/qlinear.cpp | 52 ++++++++++--
 aten/src/ATen/native/quantized/library.cpp    |  1 +
 test/quantization/core/test_quantized_op.py   | 85 +++++++++++++++----
 .../testing/_internal/common_quantization.py  | 16 ++++
 5 files changed, 146 insertions(+), 25 deletions(-)

diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
index 533d83361f05d..85eaf93ac4bc2 100644
--- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
+++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h
@@ -167,6 +167,12 @@ struct DeconvPrimitiveCache : PrimitiveCache {
   }
 };
 
+enum PostOps {
+  NoPostOp,
+  Relu,
+  LeakyRelu,
+};
+
 struct PackedLinearWeightsOnednn : public LinearPackedParamsBase {
   PackedLinearWeightsOnednn(
       std::unique_ptr<ideep::tensor> weight,
@@ -196,6 +202,12 @@ struct PackedLinearWeightsOnednn : public LinearPackedParamsBase {
   at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override;
   at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override;
 
+  at::Tensor apply_leaky_relu(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point,
+      double negative_slope);
+
   std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() override;
 
   c10::optional<at::Tensor> bias() override {
@@ -210,11 +222,12 @@ struct PackedLinearWeightsOnednn : public LinearPackedParamsBase {
   LinearPrimitiveCache prim_cache;
   std::unique_ptr<c10::once_flag> cache_initialized_flag;
 
-  template <bool ReluFused>
+  template <PostOps post_op>
   at::Tensor apply_impl(
       at::Tensor input,
       double output_scale,
-      int64_t output_zero_point);
+      int64_t output_zero_point,
+      torch::List<at::Scalar> post_op_args = torch::List<at::Scalar>());
 
   template <bool ReluFused>
   at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range=false);
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
index 111b5eb5f1394..2bf92ffa5f0f7 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
@@ -621,11 +621,12 @@ at::Tensor PackedLinearWeightsQnnp::apply_relu(
 #endif // USE_PYTORCH_QNNPACK
 
 #if AT_MKLDNN_ENABLED()
-template <bool ReluFused>
+template <PostOps post_op>
 at::Tensor PackedLinearWeightsOnednn::apply_impl(
     at::Tensor input,
     double output_scale,
-    int64_t output_zero_point) {
+    int64_t output_zero_point,
+    torch::List<at::Scalar> post_op_args) {
   const int64_t dim = input.dim();
   TORCH_CHECK(
       dim != 0,
@@ -639,7 +640,12 @@ at::Tensor PackedLinearWeightsOnednn::apply_impl(
   auto input_dims = {M, K};
   auto input_data_type = dnnl::memory::data_type::u8;
   auto input_desc = ideep::tensor::desc(input_dims, input_data_type);
-  ideep::attr_t op_attr = ReluFused ? ideep::attr_t::fuse_relu() : ideep::attr_t();
+  ideep::attr_t op_attr = ideep::attr_t();
+  if (post_op == Relu) {
+    op_attr = ideep::attr_t::fuse_relu();
+  } else if (post_op == LeakyRelu) {
+    op_attr = ideep::attr_t::fuse_relu(/*scale=*/1.0f, /*alpha=*/post_op_args.get(0).to<double>());
+  }
   ideep::tensor x(input_desc, input_contig->data_ptr<c10::quint8>());
   auto dst_dims = {M, N};
   double input_scale = input.q_scale();
@@ -705,14 +711,27 @@ at::Tensor PackedLinearWeightsOnednn::apply(
     at::Tensor input,
     double output_scale,
     int64_t output_zero_point) {
-  return apply_impl<false>(std::move(input), output_scale, output_zero_point);
+  return apply_impl<NoPostOp>(
+      std::move(input), output_scale, output_zero_point);
 }
 
 at::Tensor PackedLinearWeightsOnednn::apply_relu(
     at::Tensor input,
     double output_scale,
     int64_t output_zero_point) {
-  return apply_impl<true>(std::move(input), output_scale, output_zero_point);
+  return apply_impl<Relu>(
+      std::move(input), output_scale, output_zero_point);
+}
+
+at::Tensor PackedLinearWeightsOnednn:: apply_leaky_relu(
+    at::Tensor input,
+    double output_scale,
+    int64_t output_zero_point,
+    double negative_slope) {
+  torch::List<at::Scalar> post_op_args =
+      {at::Scalar(negative_slope)};
+  return apply_impl<LeakyRelu>(
+      std::move(input), output_scale, output_zero_point, post_op_args);
 }
 
 #endif // #if AT_MKLDNN_ENABLED()
@@ -739,9 +758,32 @@ class QLinearInt8 final {
   }
 };
 
+class QLinearLeakyReluInt8 final {
+ public:
+  static at::Tensor run(
+      at::Tensor input,
+      const c10::intrusive_ptr<LinearPackedParamsBase>& packed_weight,
+      double output_scale,
+      int64_t output_zero_point,
+      double negative_slope) {
+    auto& ctx = at::globalContext();
+#if AT_MKLDNN_ENABLED()
+    if (ctx.qEngine() == at::QEngine::ONEDNN) {
+      return dynamic_cast<PackedLinearWeightsOnednn*>(packed_weight.get())->apply_leaky_relu(
+          std::move(input), output_scale, output_zero_point, negative_slope);
+    }
+#endif
+    TORCH_CHECK(
+        false,
+        "Didn't find engine for operation quantized::linear_leaky_relu ",
+        toString(ctx.qEngine()));
+  }
+};
+
 TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear"), TORCH_FN(QLinearInt8<false>::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu"), TORCH_FN(QLinearInt8<true>::run));
+  m.impl(TORCH_SELECTIVE_NAME("quantized::linear_leaky_relu"), TORCH_FN(QLinearLeakyReluInt8::run));
 }
 
 TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) {
diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp
index a6ac4b330b0f1..160cda71c86d6 100644
--- a/aten/src/ATen/native/quantized/library.cpp
+++ b/aten/src/ATen/native/quantized/library.cpp
@@ -152,6 +152,7 @@ TORCH_LIBRARY(quantized, m) {
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"));
+  m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_leaky_relu(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i, float negative_slope) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"));
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 3e4cd2ac8b352..bfc0ee0fa93be 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -23,7 +23,7 @@
 
 from torch.testing._internal.common_utils import TestCase
 from torch.testing._internal.common_utils import IS_PPC, TEST_WITH_UBSAN, IS_MACOS, BUILD_WITH_CAFFE2
-from torch.testing._internal.common_quantization import skipIfNoFBGEMM, skipIfNoQNNPACK
+from torch.testing._internal.common_quantization import skipIfNoFBGEMM, skipIfNoQNNPACK, skipIfNoONEDNN
 from torch.testing._internal.common_quantized import _quantize, _dequantize, _calculate_dynamic_qparams, \
     override_quantized_engine, supported_qengines, override_qengines, _snr
 from torch.testing._internal.common_quantized import (
@@ -3526,17 +3526,8 @@ def test_dynamic_convtranspose3d(self):
 
 
 class TestQuantizedLinear(TestCase):
-    """Tests the correctness of the quantized linear and linear_relu op."""
-    @given(batch_size=st.integers(1, 4),
-           input_channels=st.integers(16, 32),
-           output_channels=st.integers(4, 8),
-           use_bias=st.booleans(),
-           use_relu=st.booleans(),
-           use_multi_dim_input=st.booleans(),
-           use_channelwise=st.booleans())
-    @override_qengines
-    def test_qlinear(self, batch_size, input_channels, output_channels, use_bias,
-                     use_relu, use_multi_dim_input, use_channelwise):
+    def _test_qlinear_impl(self, batch_size, input_channels, output_channels, use_bias,
+                           post_op, use_multi_dim_input, use_channelwise, **post_op_kwargs):
         decimal_val = 4
         dtypes = [torch.quint8]
         if torch.backends.quantized.engine == 'qnnpack':
@@ -3558,8 +3549,10 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias,
 
             nptype = np_dtype[dtype]
             qlinear_prepack = torch.ops.quantized.linear_prepack
-            if use_relu:
+            if post_op == 'relu':
                 qlinear = torch.ops.quantized.linear_relu
+            elif post_op == 'leaky_relu':
+                qlinear = torch.ops.quantized.linear_leaky_relu
             else:
                 qlinear = torch.ops.quantized.linear
             if use_multi_dim_input:
@@ -3634,7 +3627,7 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias,
                     b, scale=X_scale * (W_scales[0].item()), zero_point=0, dtype=torch.qint32) if use_bias else None
             # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with
             # Y_scale * 255 (max for uint8).
-            Y_scale = 125.1234
+            Y_scale = 12.34
             Y_zp = 5
             # Weight prepacking operator for quantized Linear
             float_bias = b if use_bias else None
@@ -3642,13 +3635,13 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias,
             if use_multi_dim_input:
                 X_q = X_q.view(3, int(batch_size / 3), input_channels)
             # Quantized Linear operator with prepacked weight
-            Y_q = qlinear(X_q, W_prepack, Y_scale, Y_zp)
-            if not use_channelwise:
+            Y_q = qlinear(X_q, W_prepack, Y_scale, Y_zp, **post_op_kwargs)
+            if not use_channelwise and post_op in ('none', 'relu'):
                 # Test the per-tensor quantization only
                 # Reference quantized Linear operator
                 Y_q_ref = qlinear_ref(X_q0, X_scale, X_zp, W_q0,
                                       W_scales[0], W_zps[0], b_q0, Y_scale, Y_zp, dtype=nptype)
-                if use_relu:
+                if post_op == 'relu':
                     Y_q_ref[Y_q_ref < Y_zp] = Y_zp
                 if use_multi_dim_input:
                     Y_q_ref = np.reshape(
@@ -3661,14 +3654,50 @@ def test_qlinear(self, batch_size, input_channels, output_channels, use_bias,
             X_fp32 = X_q.dequantize().to(dtype=torch.float)
             b_fp32 = b_q.dequantize().to(dtype=torch.float) if use_bias else None
             Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32)
-            if use_relu:
+            if post_op == 'relu':
                 Y_fp32_ref[Y_fp32_ref < 0.0] = 0.0
+            elif post_op == 'leaky_relu':
+                Y_fp32_ref = F.leaky_relu(Y_fp32_ref, **post_op_kwargs)
             Y_q_ref2 = torch.quantize_per_tensor(
                 Y_fp32_ref, Y_scale, Y_zp, dtype)
             # Assert equal
             np.testing.assert_array_almost_equal(
                 Y_q_ref2.int_repr().numpy(), Y_q.int_repr().numpy(), decimal=decimal_val)
 
+    """Tests the correctness of the quantized linear op."""
+    @override_qengines
+    def test_qlinear(self):
+        batch_size_list = [1, 4]
+        input_channels_list = [16, 32]
+        output_channels_list = [4, 8]
+        use_bias_list = [True, False]
+        use_multi_dim_input_list = [True, False]
+        use_channelwise_list = [True, False]
+        post_op = 'none'
+        cases = itertools.product(batch_size_list, input_channels_list, output_channels_list,
+                                  use_bias_list, use_multi_dim_input_list, use_channelwise_list)
+        for batch_size, input_channels, output_channels, use_bias,\
+                use_multi_dim_input, use_channelwise in cases:
+            self._test_qlinear_impl(batch_size, input_channels, output_channels,
+                                    use_bias, post_op, use_multi_dim_input, use_channelwise)
+
+    """Tests the correctness of the quantized linear_relu op."""
+    @override_qengines
+    def test_qlinear_relu(self):
+        batch_size_list = [1, 4]
+        input_channels_list = [16, 32]
+        output_channels_list = [4, 8]
+        use_bias_list = [True, False]
+        use_multi_dim_input_list = [True, False]
+        use_channelwise_list = [True, False]
+        post_op = 'relu'
+        cases = itertools.product(batch_size_list, input_channels_list, output_channels_list,
+                                  use_bias_list, use_multi_dim_input_list, use_channelwise_list)
+        for batch_size, input_channels, output_channels, use_bias,\
+                use_multi_dim_input, use_channelwise in cases:
+            self._test_qlinear_impl(batch_size, input_channels, output_channels,
+                                    use_bias, post_op, use_multi_dim_input, use_channelwise)
+
     @given(batch_size=st.integers(1, 4),
            # in cudnn v. 8.4.0, there is a limitation that input channels
            # should be a multiple of 4 for int8 tensors. in cudnn v.8.3.3
@@ -3811,6 +3840,26 @@ def test_qlinear_unpack(self, W, use_channelwise):
             np.testing.assert_equal(
                 W_q.q_zero_point(), W_q_origin.q_zero_point())
 
+    @skipIfNoONEDNN
+    def test_qlinear_leaky_relu(self):
+        with override_quantized_engine('onednn'):
+            batch_size_list = [1, 4]
+            input_channels_list = [16, 32]
+            output_channels_list = [4, 8]
+            use_bias_list = [True, False]
+            use_multi_dim_input_list = [True, False]
+            use_channelwise_list = [True, False]
+            negative_slopes_list = [0.01, 0.05]
+            post_op = 'leaky_relu'
+            cases = itertools.product(batch_size_list, input_channels_list, output_channels_list,
+                                      use_bias_list, use_multi_dim_input_list,
+                                      use_channelwise_list, negative_slopes_list)
+            for batch_size, input_channels, output_channels, use_bias,\
+                    use_multi_dim_input, use_channelwise, neg_slope in cases:
+                self._test_qlinear_impl(batch_size, input_channels, output_channels,
+                                        use_bias, post_op, use_multi_dim_input,
+                                        use_channelwise, negative_slope=neg_slope)
+
 @unittest.skipIf(IS_MACOS, "Known test failure on Mac.")
 class TestQuantizedEmbeddingOps(TestCase):
 
diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py
index aa435842513eb..3c163f16d8123 100644
--- a/torch/testing/_internal/common_quantization.py
+++ b/torch/testing/_internal/common_quantization.py
@@ -307,6 +307,22 @@ def wrapper(*args, **kwargs):
             fn(*args, **kwargs)
     return wrapper
 
+def skipIfNoONEDNN(fn):
+    reason = 'Quantized operations require ONEDNN.'
+    if isinstance(fn, type):
+        if 'onednn' not in torch.backends.quantized.supported_engines:
+            fn.__unittest_skip__ = True
+            fn.__unittest_skip_why__ = reason
+        return fn
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        if 'onednn' not in torch.backends.quantized.supported_engines:
+            raise unittest.SkipTest(reason)
+        else:
+            fn(*args, **kwargs)
+    return wrapper
+
 try:
     import torchvision  # noqa: F401
     HAS_TORCHVISION = True

From 786da1557369742b0e41fbf04e24f5a62049ee59 Mon Sep 17 00:00:00 2001
From: mikey dagitses <mikeyd@meta.com>
Date: Tue, 6 Dec 2022 13:48:05 +0000
Subject: [PATCH 1624/1922] replace memset with value-initialization (#90048)

Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/pytorch/pytorch/pull/90048).
* #89865
* #89852
* #89851
* __->__ #90048

replace memset with value-initialization

Summary:
This is equivalent to zero initialization for any members that are
scalar or have implicit default constructors.

Note that aside from the reset at the beginning, blockmask and
philox_args are not touched by this function.

Test Plan: Rely on CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90048
Approved by: https://github.com/drisspg, https://github.com/malfet
---
 aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
index 7cc0c250664e1..7d9807260db2f 100644
--- a/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/fmha_api.cpp
@@ -64,7 +64,7 @@ void set_params_fprop(FMHA_fprop_params &params,
                       bool is_causal) {
 
     // Reset the parameters
-    memset(&params, 0, sizeof(params));
+    params = {};
 
     params.is_bf16 = q.dtype() == at::kBFloat16;
 

From 27e71a5ef38f5f9fdb8a9f0eb0e772722b600dfb Mon Sep 17 00:00:00 2001
From: mingfeima <mingfei.ma@intel.com>
Date: Tue, 6 Dec 2022 08:01:22 +0800
Subject: [PATCH 1625/1922] add shape check for random_samples in
 fractional_max_pool{2d|3d} (#89992)

This PR add shape checks for `random_samples` in fractional_max_pool2d and fractional_max_pool3d.,
to provide more meaningful warnings instead of SegFault when the input is illegal.

For more details, please check https://github.com/pytorch/pytorch/issues/89648
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89992
Approved by: https://github.com/jgong5, https://github.com/ezyang
---
 aten/src/ATen/native/FractionalMaxPool2d.cpp  | 38 +++------
 aten/src/ATen/native/FractionalMaxPool3d.cpp  | 35 +++-----
 aten/src/ATen/native/FractionalMaxPooling.h   | 80 +++++++++++++++++++
 .../ATen/native/cuda/FractionalMaxPool2d.cu   |  3 +
 .../ATen/native/cuda/FractionalMaxPool3d.cu   |  2 +
 test/nn/test_pooling.py                       | 24 ++++++
 6 files changed, 132 insertions(+), 50 deletions(-)
 create mode 100644 aten/src/ATen/native/FractionalMaxPooling.h

diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp
index 82512c83f4337..1e9bf9c3902fd 100644
--- a/aten/src/ATen/native/FractionalMaxPool2d.cpp
+++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp
@@ -3,6 +3,7 @@
 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
 #include <ATen/TensorMeta.h>
+#include <ATen/native/FractionalMaxPooling.h>
 #include <c10/util/irange.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -128,28 +129,6 @@ TORCH_META_FUNC(fractional_max_pool2d_backward)(
 namespace native {
 namespace {
 
-template <typename scalar_t>
-static std::vector<int> fractional_max_pool2d_generate_intervals(
-  scalar_t sample,
-  int inputSize,
-  int outputSize,
-  int poolSize) {
-  std::vector<int> sequence(outputSize);
-  if (outputSize > 1) {
-    scalar_t alpha = static_cast<scalar_t>(inputSize - poolSize) /
-      static_cast<scalar_t>(outputSize - 1);
-
-    for (int i = 0; i < outputSize - 1; ++i) {
-      sequence[i] =
-        static_cast<int>((i + sample) * alpha) - static_cast<int>(sample * alpha);
-    }
-  }
-  if (outputSize > 0) {
-    sequence[outputSize - 1] = inputSize - poolSize;
-  }
-  return sequence;
-}
-
 template <typename scalar_t>
 static void fractional_max_pool2d_out_single_batch_frame(
   scalar_t* input,
@@ -166,9 +145,9 @@ static void fractional_max_pool2d_out_single_batch_frame(
       scalar_t* randomSamplesForPlane = randomSamples + plane * 2;
 
       /* Generate interval sequence */
-      auto sequenceW = fractional_max_pool2d_generate_intervals<scalar_t>(
+      auto sequenceW = generate_intervals<scalar_t>(
           randomSamplesForPlane[0], inputW, outputW, poolSizeW);
-      auto sequenceH = fractional_max_pool2d_generate_intervals<scalar_t>(
+      auto sequenceH = generate_intervals<scalar_t>(
           randomSamplesForPlane[1], inputH, outputH, poolSizeH);
 
       /* loop over output */
@@ -305,10 +284,16 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_out_cpu) (
   const at::Tensor& input_,
   IntArrayRef pool_size,
   IntArrayRef output_size,
-  const at::Tensor& randomSamples,
+  const at::Tensor& randomSamples_,
   const at::Tensor& output,
   const at::Tensor& indices) {
 
+  fractional_max_pool_check_shape</*ndim*/ 2>(input_, randomSamples_);
+
+  if (output.numel() == 0) {
+    return;
+  }
+
   int64_t numBatch = 1;
   int64_t planeDim = 0;
   int64_t heightDim = 1;
@@ -318,8 +303,9 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_out_cpu) (
   int64_t poolSizeH = pool_size[0];
   int64_t poolSizeW = pool_size[1];
 
-  /* get contiguous input */
+  /* get contiguous input and samples */
   auto input = input_.contiguous();
+  auto randomSamples = randomSamples_.contiguous();
 
   int64_t ndims = input.ndimension();
 
diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp
index 5890026872a85..c524f0545473c 100644
--- a/aten/src/ATen/native/FractionalMaxPool3d.cpp
+++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp
@@ -3,6 +3,7 @@
 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
 #include <ATen/TensorMeta.h>
+#include <ATen/native/FractionalMaxPooling.h>
 
 #include <c10/util/irange.h>
 
@@ -100,28 +101,6 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)(
 namespace native {
 namespace {
 
-template<typename scalar_t>
-static std::vector<int> generate_intervals(
-  scalar_t sample,
-  int64_t inputSize,
-  int64_t outputSize,
-  int64_t poolSize) {
-  std::vector<int> sequence(outputSize);
-  if (outputSize > 1) {
-    scalar_t alpha = static_cast<scalar_t>(inputSize - poolSize) /
-      static_cast<scalar_t>(outputSize - 1);
-
-    for (const auto i : c10::irange(outputSize - 1)) {
-      sequence[i] =
-        static_cast<int>((i + sample) * alpha) - static_cast<int>(sample * alpha);
-    }
-  }
-  if (outputSize > 0) {
-    sequence[outputSize - 1] = inputSize - poolSize;
-  }
-  return sequence;
-}
-
 template<typename scalar_t>
 static void fractional_max_pool3d_out_single_batch_frame(
   scalar_t* input,
@@ -241,7 +220,7 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)(
   int64_t outputT,
   int64_t outputH,
   int64_t outputW,
-  const at::Tensor& randomSamples,
+  const at::Tensor& randomSamples_,
   int64_t numBatch,
   int64_t numPlanes,
   int64_t inputT,
@@ -249,8 +228,16 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)(
   int64_t inputW,
   const at::Tensor& output,
   const at::Tensor& indices) {
-  /* get contiguous input */
+
+  fractional_max_pool_check_shape</*ndim*/ 3>(input_, randomSamples_);
+
+  if (output.numel() == 0) {
+    return;
+  }
+
+  /* get contiguous input and samples */
   auto input = input_.contiguous();
+  auto randomSamples = randomSamples_.contiguous();
 
   AT_DISPATCH_FLOATING_TYPES(
     input.scalar_type(),
diff --git a/aten/src/ATen/native/FractionalMaxPooling.h b/aten/src/ATen/native/FractionalMaxPooling.h
new file mode 100644
index 0000000000000..6631450faaa88
--- /dev/null
+++ b/aten/src/ATen/native/FractionalMaxPooling.h
@@ -0,0 +1,80 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorUtils.h>
+#include <c10/util/irange.h>
+
+namespace at { namespace native {
+
+template<typename scalar_t>
+static inline std::vector<int> generate_intervals(
+    scalar_t sample,
+    int64_t inputSize,
+    int64_t outputSize,
+    int64_t poolSize) {
+  std::vector<int> sequence(outputSize);
+  if (outputSize > 1) {
+    scalar_t alpha = static_cast<scalar_t>(inputSize - poolSize) /
+      static_cast<scalar_t>(outputSize - 1);
+
+    for (const auto i : c10::irange(outputSize - 1)) {
+      sequence[i] =
+        static_cast<int>((i + sample) * alpha) - static_cast<int>(sample * alpha);
+    }
+  }
+  if (outputSize > 0) {
+    sequence[outputSize - 1] = inputSize - poolSize;
+  }
+  return sequence;
+}
+
+template <int64_t ndim>
+static inline void fractional_max_pool_check_shape(
+    const Tensor& input,
+    const Tensor& randomSamples) {
+
+  TORCH_CHECK(
+      input.scalar_type() == randomSamples.scalar_type(),
+      "Expect _random_samples to have the same dtype as input");
+
+  int64_t ndimension = randomSamples.ndimension();
+  TORCH_CHECK(
+      ndimension == 3,
+      "Expect _random_samples to have 3 dimensions, got ", ndimension);
+
+  int64_t N = randomSamples.size(0);
+  int64_t C = randomSamples.size(1);
+  int64_t D = randomSamples.size(2);
+
+  int64_t input_batch, input_channel;
+  if (ndim == 2) {
+    // fractional_max_pool2d
+    if (input.ndimension() == 3) {
+      input_batch = 1;
+      input_channel = input.size(0);
+    } else {
+      input_batch = input.size(0);
+      input_channel = input.size(1);
+    }
+  } else {
+    // factional_max_pool3d
+    if (input.ndimension() == 4) {
+      input_batch = 1;
+      input_channel = input.size(0);
+    } else {
+      input_batch = input.size(0);
+      input_channel = input.size(1);
+    }
+  }
+
+  TORCH_CHECK(
+      N >= input_batch,
+      "Expect _random_samples.size(0) no less then input batch size.");
+  TORCH_CHECK(
+      C == input_channel,
+      "Expect _random_samples.size(1) equals to input channel size.");
+  TORCH_CHECK(
+      D == ndim,
+      "Expect _random_samples.size(2) equals to ", ndim, "; got ", D, ".");
+}
+
+}} // at::native
diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu
index 24db8776cd49a..0acddbf80a71e 100644
--- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu
+++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu
@@ -10,6 +10,7 @@
 #include <ATen/NumericUtils.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/Utils.h>
+#include <ATen/native/FractionalMaxPooling.h>
 #include <c10/util/Exception.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -139,6 +140,8 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_out_cuda) (
   const Tensor& output,
   const Tensor& indices
 ) {
+  fractional_max_pool_check_shape</*ndim*/ 2>(input, randomSamples);
+
   int planeDim = 0;
   int dimh = 1;
   int dimw = 2;
diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu
index 92a77dc00af53..971905d291065 100644
--- a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu
+++ b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu
@@ -11,6 +11,7 @@
 #include <ATen/NumericUtils.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/Utils.h>
+#include <ATen/native/FractionalMaxPooling.h>
 #include <c10/util/Exception.h>
 
 #ifndef AT_PER_OPERATOR_HEADERS
@@ -258,6 +259,7 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cuda) (
   int64_t inputW,
   const Tensor& output,
   const Tensor& indices) {
+  fractional_max_pool_check_shape</*ndim*/ 3>(input, randomSamples);
 
   auto output_ = output;
   auto indices_ = indices;
diff --git a/test/nn/test_pooling.py b/test/nn/test_pooling.py
index 35579e643a464..3826b1dff70c3 100644
--- a/test/nn/test_pooling.py
+++ b/test/nn/test_pooling.py
@@ -413,6 +413,30 @@ def test_FractionalMaxPool3d_zero_out_size(self, device):
         out = mod(inp)
         self.assertEqual(out, torch.empty((16, 0, 1, 1), device=device))
 
+    @onlyNativeDeviceTypes
+    def test_FractionalMaxPool2d_zero_samples(self, device):
+        samples = torch.rand([0, 16, 2], device=device)
+        mod = nn.FractionalMaxPool2d([2, 2], output_size=[1, 1], _random_samples=samples)
+        inp = torch.randn([0, 16, 32, 32], device=device)
+        out = mod(inp)
+        self.assertEqual(out, torch.empty((0, 16, 1, 1), device=device))
+
+        inp1 = torch.randn([1, 16, 32, 32], device=device)
+        with self.assertRaisesRegex(RuntimeError, "Expect _random_samples"):
+            out1 = mod(inp1)
+
+    @onlyNativeDeviceTypes
+    def test_FractionalMaxPool3d_zero_samples(self, device):
+        samples = torch.rand([0, 16, 3], device=device)
+        mod = nn.FractionalMaxPool3d([3, 2, 2], output_size=[1, 1, 1], _random_samples=samples)
+        inp = torch.randn([0, 16, 50, 32, 32], device=device)
+        out = mod(inp)
+        self.assertEqual(out, torch.empty((0, 16, 1, 1, 1), device=device))
+
+        inp1 = torch.randn([1, 16, 50, 32, 32], device=device)
+        with self.assertRaisesRegex(RuntimeError, "Expect _random_samples"):
+            out1 = mod(inp1)
+
     @onlyNativeDeviceTypes
     def test_MaxPool_zero_batch_dim(self, device):
         inp = torch.randn(0, 16, 50, device=device)

From 7fbbbc7e6d1ae60767184de9387974def56235e0 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Mon, 5 Dec 2022 14:18:05 -0500
Subject: [PATCH 1626/1922] Use Sized not Iterable to test for len (#90182)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90182
Approved by: https://github.com/albanD
---
 torch/_dynamo/symbolic_convert.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 16935d0de2661..3585a659dacc6 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -12,7 +12,8 @@
 import types
 import typing
 import weakref
-from typing import Any, Dict, Iterable, List
+from collections.abc import Sized
+from typing import Any, Dict, List
 from unittest.mock import patch
 
 import torch
@@ -1429,7 +1430,7 @@ def empty_checkpoint(self):
         graphstate = self.checkpoint[1][1:]
         state = (*output_graphstate, *graphstate)
         for obj in state:
-            if isinstance(obj, Iterable):
+            if isinstance(obj, Sized):
                 if len(obj) != 0:
                     return False
         return True

From 00ebc2ed483938b6eb4cd5fa416fb037982abd26 Mon Sep 17 00:00:00 2001
From: Wanchao Liang <wanchaol@users.noreply.github.com>
Date: Tue, 6 Dec 2022 02:11:30 +0000
Subject: [PATCH 1627/1922] [dtensor] handle the case where output of op is
 Optional[Tensor] (#90241)

Observed by @aazzolini, some op might have Optional[Tensor] returns
where it return None (i.e. native_layer_norm_backward), it's a mismatch
between C++ aten op signature and python None, but we need to handle it
in the python side
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90241
Approved by: https://github.com/aazzolini
---
 torch/distributed/_tensor/utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/torch/distributed/_tensor/utils.py b/torch/distributed/_tensor/utils.py
index a8f561af700d3..1858558a6372d 100644
--- a/torch/distributed/_tensor/utils.py
+++ b/torch/distributed/_tensor/utils.py
@@ -10,7 +10,7 @@
 ArgKwargsType = Union[Tuple[object, ...], Dict[str, object]]
 # ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so output type sould
 # be the same set of possiblities.
-OutputSpecType = Optional[Union[DTensorSpec, Sequence[DTensorSpec]]]
+OutputSpecType = Optional[Union[DTensorSpec, Sequence[Optional[DTensorSpec]]]]
 
 
 def unwrap_local_tensor(e: "dtensor.DTensor") -> torch.Tensor:
@@ -45,8 +45,13 @@ def wrap(res: object, spec: OutputSpecType) -> object:
         assert spec is not None and isinstance(
             spec, tuple
         ), f"output spec does not match with output! Expected tuple, got {spec}"
+
+        # NOTE: local results might return Optional Tensor from ATen op, so we need to
+        # handle that case and make sure we don't wrap None with DTensor.
+        # (i.e. native_layer_norm.backward)
         return tuple(
             dtensor.DTensor(e, s.mesh, s.placements, size=s.shape)
+            if e is not None and s is not None else None
             for e, s in zip(res, spec)
         )
     else:

From 128ebdfe716e56ff7caa456c7f75f837d2a82d77 Mon Sep 17 00:00:00 2001
From: mfkasim1 <firman.kasim@gmail.com>
Date: Tue, 6 Dec 2022 19:12:24 +0000
Subject: [PATCH 1628/1922] Log1p for complex in CPU  (#89691)

Another PR for https://github.com/pytorch/pytorch/issues/89205: making torch.log1p accepts complex numbers in CPU.
I haven't done the GPU version because I'm not sure which file(s) to change.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89691
Approved by: https://github.com/jgong5, https://github.com/lezcano
---
 .../cpu/vec/vec256/vec256_complex_double.h    |  2 +-
 .../cpu/vec/vec256/vec256_complex_float.h     |  2 +-
 .../vec256/vsx/vec256_complex_double_vsx.h    |  8 +--
 .../vec/vec256/vsx/vec256_complex_float_vsx.h |  8 +--
 .../cpu/vec/vec512/vec512_complex_double.h    |  2 +-
 .../cpu/vec/vec512/vec512_complex_float.h     |  2 +-
 aten/src/ATen/native/cpu/UnaryOpsKernel.cpp   |  2 +-
 test/test_unary_ufuncs.py                     | 56 +++++++++++++++++++
 .../_internal/common_methods_invocations.py   |  4 +-
 9 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
index 487233bc3c407..2614e5f85e24d 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
@@ -185,7 +185,7 @@ template <> class Vectorized<c10::complex<double>> {
     return _mm256_div_pd(log(), log10_);
   }
   Vectorized<c10::complex<double>> log1p() const {
-    AT_ERROR("not supported for complex numbers");
+    return map(std::log1p);
   }
   Vectorized<c10::complex<double>> asin() const {
     // asin(x)
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
index 4093022a7e349..4a8f30f0c6ccc 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
@@ -221,7 +221,7 @@ template <> class Vectorized<c10::complex<float>> {
     return _mm256_div_ps(log(), log10_);
   }
   Vectorized<c10::complex<float>> log1p() const {
-    AT_ERROR("not supported for complex numbers");
+    return map(std::log1p);
   }
   Vectorized<c10::complex<float>> asin() const {
     // asin(x)
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
index f5084f2101087..dfa4a852f4d84 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
@@ -284,6 +284,10 @@ class Vectorized<ComplexDbl> {
     return ret.elwise_mult(vd_log10e_inv);
   }
 
+  Vectorized<ComplexDbl> log1p() const {
+    return map(std::log1p);
+  }
+
   Vectorized<ComplexDbl> asin() const {
     // asin(x)
     // = -i*ln(iz + sqrt(1 -z^2))
@@ -481,10 +485,6 @@ class Vectorized<ComplexDbl> {
     TORCH_CHECK(false, "not supported for complex numbers");
   }
 
-  Vectorized<ComplexDbl> log1p() const {
-    TORCH_CHECK(false, "not supported for complex numbers");
-  }
-
   Vectorized<ComplexDbl> atan2(const Vectorized<ComplexDbl>& b) const {
     TORCH_CHECK(false, "not supported for complex numbers");
   }
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
index b4e35acfb480c..56a6f4e6e39a6 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
@@ -321,6 +321,10 @@ class Vectorized<ComplexFlt> {
     return ret.elwise_mult(log10e_inv);
   }
 
+  Vectorized<ComplexFlt> log1p() const {
+    return map(std::log1p);
+  }
+
   Vectorized<ComplexFlt> el_swapped() const {
     vfloat32 v0 = vec_perm(_vec0, _vec0, swap_mask);
     vfloat32 v1 = vec_perm(_vec1, _vec1, swap_mask);
@@ -568,10 +572,6 @@ class Vectorized<ComplexFlt> {
     TORCH_CHECK(false,"not supported for complex numbers");
   }
 
-  Vectorized<ComplexFlt> log1p() const {
-    TORCH_CHECK(false,"not supported for complex numbers");
-  }
-
   Vectorized<ComplexFlt> expm1() const {
     TORCH_CHECK(false,"not supported for complex numbers");
   }
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
index 9d862534a9d67..cb73beaaedd60 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h
@@ -248,7 +248,7 @@ template <> class Vectorized<c10::complex<double>> {
     return _mm512_div_pd(log(), log10_);
   }
   Vectorized<c10::complex<double>> log1p() const {
-    AT_ERROR("not supported for complex numbers");
+    return map(std::log1p);
   }
   Vectorized<c10::complex<double>> asin() const {
     // asin(x)
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
index 966f42a253484..03b75ed035131 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h
@@ -753,7 +753,7 @@ template <> class Vectorized<c10::complex<float>> {
     return _mm512_div_ps(log(), log10_);
   }
   Vectorized<c10::complex<float>> log1p() const {
-    AT_ERROR("not supported for complex numbers");
+    return map(std::log1p);
   }
   Vectorized<c10::complex<float>> asin() const {
     // asin(x)
diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
index 8a0534fd3da5f..898f736fabe86 100644
--- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp
@@ -780,7 +780,7 @@ IMPLEMENT_COMPLEX_KERNEL(log)
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
 IMPLEMENT_COMPLEX_KERNEL(log10)
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
-IMPLEMENT_FLOAT_KERNEL(log1p)
+IMPLEMENT_COMPLEX_KERNEL(log1p)
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
 IMPLEMENT_COMPLEX_KERNEL(log2)
 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py
index 5a9bdb53ab6b3..3676d88de5680 100644
--- a/test/test_unary_ufuncs.py
+++ b/test/test_unary_ufuncs.py
@@ -1094,6 +1094,62 @@ def test_mish(self, device, dtype):
             rtol=rtol,
         )
 
+    @dtypes(torch.complex64, torch.complex128)
+    @onlyCPU
+    def test_log1p_complex(self, device, dtype):
+        # The output values here were obtained using arbitrary precision math (mpmath)
+        # and double checked with WolframAlpha.
+        # Not using numpy's log1p here because by the time of writing this,
+        # np.log1p has precision problems for small complex input values, see here:
+        # https://github.com/numpy/numpy/issues/22609
+        inouts = [
+            (0.2 + 0.3j, 0.21263386770217202 + 0.24497866312686414j),
+            (1e-19 + 1e-18j, 1e-19 + 1e-18j),
+            (1e-18 + 0.1j, 0.00497517 + 0.0996687j),
+            (0.1 + 1e-18j, 0.0953102 + 9.090909090909090909e-19j),
+            (0.5 + 0j, 0.40546510810816 + 0j),
+            (0.0 + 0.5j, 0.111571776 + 0.463647609j),
+            (2.0 + 1.0j, 1.151292546497023 + 0.3217505543966422j),
+            (-1.0 + 2.0j, 0.6931471805599453 + 1.570796326794897j),
+            (2.0j, 0.80471895621705014 + 1.1071487177940904j),
+            (-2.0j, 0.80471895621705014 - 1.1071487177940904j),
+        ]
+        # test the extreme values
+        if dtype == torch.complex128:
+            inouts += [
+                (-1 + 1e250j, 575.6462732485114 + 1.5707963267948966j),
+                (1e250 + 1j, 575.6462732485114 + 1e-250j),
+                (1e250 + 1e250j, 575.9928468387914 + 0.7853981633974483j),
+                (1e-250 + 1e250j, 575.6462732485114 + 1.5707963267948966j),
+                (1e-250 + 2e-250j, 1e-250 + 2e-250j),
+                (1e250 + 1e-250j, 575.6462732485114 + 0.0j),
+            ]
+        elif dtype == torch.complex64:
+            inouts += [
+                (-1 + 1e30j, 69.07755278982137 + 1.5707963267948966j),
+                (1e30 + 1j, 69.07755278982137 + 1e-30j),
+                (1e30 + 1e30j, 69.42412638010134 + 0.7853981633974483j),
+                (1e-30 + 1e30j, 69.07755278982137 + 1.5707963267948966j),
+                (1e-30 + 2e-30j, 1e-30 + 2e-30j),
+                (1e30 + 1e-30j, 69.07755278982137 + 0.0j),
+            ]
+
+        # test the log1p individually
+        for inp, out in inouts:
+            res = torch.log1p(torch.tensor(inp, dtype=dtype, device=device))
+            self.assertFalse(torch.any(torch.isnan(res)))
+            # setting up atol == 0.0 because some part has very small values
+            self.assertEqual(res.real, out.real, atol=0.0, rtol=1e-6)
+            self.assertEqual(res.imag, out.imag, atol=0.0, rtol=1e-6)
+
+        # test the log1p in tensor
+        inp_lst, out_lst = [list(elmt) for elmt in zip(*inouts)]
+        inp_tens = torch.tensor(inp_lst, dtype=dtype, device=device)
+        out_tens = torch.tensor(out_lst, dtype=dtype, device=device)
+        res_tens = torch.log1p(inp_tens)
+        self.assertEqual(res_tens.real, out_tens.real, atol=0.0, rtol=1e-6)
+        self.assertEqual(res_tens.imag, out_tens.imag, atol=0.0, rtol=1e-6)
+
     # do ops like threshold need a test_unary(_nonufunc) test suite?
     @onlyCPU
     @dtypes(*get_all_math_dtypes("cpu"))
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index d11c275cc220c..596b5e9160286 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -7767,7 +7767,7 @@ def sample_inputs_max_unpool_grad(op_info, device, dtype, requires_grad, **kwarg
 
     ForeachFuncInfo(
         'log1p',
-        dtypes=floating_types_and(torch.bfloat16),
+        dtypes=floating_and_complex_types_and(torch.bfloat16),
         dtypesIfCUDA=floating_types_and(torch.half),
     ),
 
@@ -9878,7 +9878,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                    ref=np.log1p,
                    aliases=('special.log1p',),
                    domain=(-1, None),
-                   dtypes=all_types_and(torch.bool, torch.bfloat16),
+                   dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16),
                    dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16),
                    decorators=(precisionOverride({torch.bfloat16: 1e-1}),),
                    supports_forward_ad=True,

From 9ec1d9c8a0c91c6d30bff5af23fb352bf8f5dad2 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 6 Dec 2022 03:21:32 +0000
Subject: [PATCH 1629/1922] [FSDP] Fix accidental change in `_test_fsdp_parity`
 (#90252)

I accidentally changed the semantics of this line when refactoring a while ago. The [previous version](https://github.com/pytorch/pytorch/pull/80873/files#diff-7b5c66f99161fa6a3d9042e80f8c8cc140a64e43445feede46f55e53154f6c3dL635) used to say:
```
if not mixed_precision:
```
which is actually the opposite of
```
if mixed_precision is not None:
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90252
Approved by: https://github.com/zhaojuanmao
---
 torch/testing/_internal/common_fsdp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 0c4063016dc30..405d8a5eb235f 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -1029,7 +1029,7 @@ def _test_fsdp_parity(
         # the DDP parameters are in FP16 (from `half()`) while the FSDP
         # parameters are in FP32 (from `summon_full_params()`) and (2) DDP runs
         # the optimizer in FP16 while FSDP runs it in FP32
-        if mixed_precision is not None:
+        if mixed_precision is None:
             self.assertEqual(
                 ddp_params,
                 fsdp_unsharded_params,

From 9217a13f6444ed6af51a4ac443fc950f910be368 Mon Sep 17 00:00:00 2001
From: Charlie Yan <>
Date: Tue, 6 Dec 2022 04:37:01 +0000
Subject: [PATCH 1630/1922] reland #89222: [Composable API] replicate: change
 to per module call, remove mark_root_module() (#90254)

reland https://github.com/pytorch/pytorch/pull/89222
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90254
Approved by: https://github.com/zhaojuanmao
---
 .../distributed/_composable/test_replicate.py |  6 +--
 torch/distributed/_composable/replicate.py    | 40 ++++---------------
 2 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
index 3e8bf44a1fdea..db1459589b342 100644
--- a/test/distributed/_composable/test_replicate.py
+++ b/test/distributed/_composable/test_replicate.py
@@ -7,7 +7,7 @@
 import torch.distributed as dist
 import torch.nn.functional as F
 from torch import nn
-from torch.distributed._composable.replicate import mark_root_module, replicate
+from torch.distributed._composable.replicate import replicate
 from torch.testing._internal.common_distributed import MultiProcessTestCase
 from torch.testing._internal.common_utils import run_tests
 
@@ -91,12 +91,12 @@ def step_model(model, input, target):
 
     def test_replicate_single_module(self):
         model = Net()
-        replicate_model = mark_root_module(replicate(deepcopy(model)))
+        replicate_model = replicate(deepcopy(model))
         self._compare_module(model, replicate_model)
 
     def test_replicate_multi_module(self):
         model = Net()
-        replicate_model = mark_root_module(deepcopy(model))
+        replicate_model = deepcopy(model)
         replicate(replicate_model.fc1)
         replicate(replicate_model.fc2)
         replicate(replicate_model.fc3)
diff --git a/torch/distributed/_composable/replicate.py b/torch/distributed/_composable/replicate.py
index 0e94427afee88..c27a88d79b4d9 100644
--- a/torch/distributed/_composable/replicate.py
+++ b/torch/distributed/_composable/replicate.py
@@ -7,11 +7,7 @@
 from .contract import contract
 
 
-class DistributedState:
-    ...
-
-
-class ReplicateState(DistributedState):
+class _ReplicateState:
     def __init__(self) -> None:
         self.modules: List[nn.Module] = []
         self.has_initialized: bool = False
@@ -22,6 +18,9 @@ def mark_modules(self, *modules: nn.Module) -> None:
             self.modules.append(module)
             replicate.state(module)._distributed_state = self
             replicate.state(module)._params_collected = False
+            module.register_forward_pre_hook(self.forward_pre_hook)
+            # TODO(@yhcharles): fix type error
+            module.register_forward_hook(self.forward_post_hook)  # type: ignore[arg-type]
 
     def _recursive_collect_params(self, module: nn.Module) -> None:
         # TODO: skip if managed by other APIs
@@ -50,13 +49,13 @@ def init_helper(self):
 
         self._ddp = _ddp.DistributedDataParallel(self._param_list)
 
-    def root_module_forward_pre_hook(
+    def forward_pre_hook(
         self, module: nn.Module, input: Tuple[torch.Tensor]
     ) -> None:
         self.init_helper()
         self._ddp.pre_forward()
 
-    def root_module_forward_post_hook(
+    def forward_post_hook(
         self,
         module: nn.Module,
         input: Tuple[torch.Tensor],
@@ -65,14 +64,9 @@ def root_module_forward_post_hook(
         return self._ddp.post_forward(output)
 
 
-# TODO(@yhcharles): use a per-model instance instead of a global one
-_default_state = ReplicateState()
-
-
 @contract
 def replicate(
     module: nn.Module,  # NOTE: contract now supports single module only
-    dist_state: ReplicateState = _default_state,
 ) -> nn.Module:
     r"""Replicates module(s)
 
@@ -83,25 +77,5 @@ def replicate(
         >>> module = nn.Linear(3, 3)
         >>> replicate(module)
     """
-    dist_state.mark_modules(module)
-    return module
-
-
-def mark_root_module(
-    module: nn.Module, dist_state: ReplicateState = _default_state
-) -> nn.Module:
-    r"""Mark the root module. Its sub-modules can be replicated.
-
-    Args:
-        modules (torch.nn.Module): root module
-
-    Example::
-        >>> module = nn.Linear(3, 3)
-        >>> replicate(module)
-    """
-    module.register_forward_pre_hook(dist_state.root_module_forward_pre_hook)
-    # TODO(@yhcharles): fix type error
-    module.register_forward_hook(
-        dist_state.root_module_forward_post_hook  # type: ignore[arg-type]
-    )
+    _ReplicateState().mark_modules(module)
     return module

From a3f0160645678cc23a496c631e30127cee434112 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 6 Dec 2022 10:33:46 -0500
Subject: [PATCH 1631/1922] 
 ShapeEnv.create_symbolic_sizes_strides_storage_offset (#89962)

Instead of having storage offset hang out on its own, allocate
all of these symbols all in one go.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89962
Approved by: https://github.com/albanD, https://github.com/voznesenskym
---
 test/test_dynamic_shapes.py              | 18 +++------
 torch/_subclasses/meta_utils.py          | 47 ++++++++++--------------
 torch/fx/experimental/symbolic_shapes.py |  5 ++-
 3 files changed, 29 insertions(+), 41 deletions(-)

diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py
index 6fc27d00a54e4..06230f86943a0 100644
--- a/test/test_dynamic_shapes.py
+++ b/test/test_dynamic_shapes.py
@@ -121,9 +121,9 @@ def __torch_dispatch__(cls, func_overload, types, args=(), kwargs=None):
         raise RuntimeError(f"operator {func_overload} not supported")
 
 
-def create_symbolic_tensor(name, arg, shape_env, storage_offset=0):
-    sym_shapes, sym_strides = shape_env.create_symbolic_sizes_strides(arg)
-    return FakeSymbolicTensor(sym_shapes, sym_strides, arg.dtype, arg.layout, arg.requires_grad, arg.device, storage_offset)
+def create_symbolic_tensor(name, arg, shape_env):
+    sym_shapes, sym_strides, sym_storage_offset = shape_env.create_symbolic_sizes_strides_storage_offset(arg)
+    return FakeSymbolicTensor(sym_shapes, sym_strides, arg.dtype, arg.layout, arg.requires_grad, arg.device, sym_storage_offset)
 
 def create_symint(shape_env, i):
     return shape_env.create_symintnode(shape_env.create_symbol(i))
@@ -179,15 +179,9 @@ def test_roundtrip(self):
         self.assertTrue(x.size(2) == 3)
         self.assertTrue(isinstance(x.size(2), SymInt))
 
-        offset = create_symint(shape_env, 2)
-        y = create_symbolic_tensor("x", torch.randn(5, 4, 3), shape_env, offset)
+        y = create_symbolic_tensor("x", torch.randn(5, 4, 3)[1:], shape_env)
         self.assertTrue(isinstance(y.storage_offset(), SymInt))
-        self.assertTrue(y.storage_offset() == 2)
-
-        offset = 2
-        z = create_symbolic_tensor("z", torch.randn(5, 4, 3), shape_env, offset)
-        self.assertTrue(isinstance(z.storage_offset(), int))
-        self.assertTrue(z.storage_offset() == 2)
+        self.assertTrue(y.storage_offset() == 12)
 
     @skipIfNoSympy
     def test_binary(self):
@@ -285,7 +279,7 @@ def test_size_expressions(self):
         else:
             result = expand_x + expand_x
 
-        gt_op = shape_env.guards[0][0]
+        gt_op, _bt = shape_env.guards[-1]
         self.assertTrue(isinstance(gt_op, sympy.core.relational.StrictGreaterThan))
         self.assertTrue(str(x.shape[0]), str(gt_op.args[0]))
         self.assertTrue(str(expand_x.shape[1]), str(x.shape[0]))
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index 8adca0335b971..c410610598916 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -220,22 +220,16 @@ def meta_tensor(self, t, shape_env=None, callback=lambda t: t()):
         # This is too aggressive: we do duck sizing and 0/1 simplification
         # as we allocate variables, and we do need to register guards for
         # these cases.
-        maybe_suppress = contextlib.nullcontext()
+        maybe_suppress = contextlib.nullcontext
         if shape_env is not None:
-            maybe_suppress = shape_env.suppress_guards()
+            maybe_suppress = shape_env.suppress_guards
 
         make_symbolic = shape_env is not None
 
-        def sym(x):
+        def sym_sizes_strides_storage_offset(t):
             if make_symbolic:
-                return shape_env.create_symintnode(shape_env.create_symbol(x))
-            else:
-                return x
-
-        def sym_sizes_strides(t):
-            if make_symbolic:
-                return shape_env.create_symbolic_sizes_strides(t)
-            return (t.size(), t.stride())
+                return shape_env.create_symbolic_sizes_strides_storage_offset(t)
+            return (t.size(), t.stride(), t.storage_offset())
 
         # see expired-storages
         self.check_expired_count += 1
@@ -273,7 +267,9 @@ def sym_sizes_strides(t):
                             r._coalesced_(t.is_coalesced())
                 elif t.is_mkldnn:
                     is_leaf = safe_is_leaf(t)
-                    sizes, strides = sym_sizes_strides(t)
+                    sizes, strides, _storage_offset = sym_sizes_strides_storage_offset(
+                        t
+                    )
                     r = callback(
                         lambda: torch.empty_strided(
                             sizes, strides, dtype=t.dtype, device="meta"
@@ -344,24 +340,24 @@ def is_c_of_r(complex_dtype, real_dtype):
                         # So we may have to do *two* views out of the base to
                         # recreate this situation.
 
-                        sizes, strides = sym_sizes_strides(t)
+                        (
+                            sizes,
+                            strides,
+                            storage_offset,
+                        ) = sym_sizes_strides_storage_offset(t)
 
                         if safe_is_leaf(t):
                             # Leaf views that track view metadata are created by
                             # creating a view inside a no_grad block
-                            with torch.no_grad(), maybe_suppress:
-                                r = base.as_strided(
-                                    sizes, strides, sym(t.storage_offset())
-                                )
+                            with torch.no_grad(), maybe_suppress():
+                                r = base.as_strided(sizes, strides, storage_offset)
                             # As it's a leaf, we can directly assign requires_grad
                             r.requires_grad = t.requires_grad
                         else:
                             if t._base.requires_grad == t.requires_grad:
                                 # Easy case, just run the view op
-                                with torch.enable_grad(), maybe_suppress:
-                                    r = base.as_strided(
-                                        sizes, strides, sym(t.storage_offset())
-                                    )
+                                with torch.enable_grad(), maybe_suppress():
+                                    r = base.as_strided(sizes, strides, storage_offset)
                             else:
                                 # Obscure case.  Create a leaf view and give it the
                                 # correct requires_grad, then do the final view.
@@ -370,10 +366,8 @@ def is_c_of_r(complex_dtype, real_dtype):
                                 with torch.no_grad():
                                     mid = base.view(base.shape)
                                 mid.requires_grad = t.requires_grad
-                                with torch.enable_grad(), maybe_suppress:
-                                    r = mid.as_strided(
-                                        sizes, strides, sym(t.storage_offset())
-                                    )
+                                with torch.enable_grad(), maybe_suppress():
+                                    r = mid.as_strided(sizes, strides, storage_offset)
                     finally:
                         torch._C._dispatch_tls_set_dispatch_key_excluded(
                             torch._C.DispatchKey.ADInplaceOrView, old_exclude
@@ -381,8 +375,7 @@ def is_c_of_r(complex_dtype, real_dtype):
 
                 else:
                     is_leaf = safe_is_leaf(t)
-                    sizes, strides = sym_sizes_strides(t)
-                    storage_offset = sym(t.storage_offset())
+                    sizes, strides, storage_offset = sym_sizes_strides_storage_offset(t)
                     r = callback(
                         lambda: torch.empty_strided(
                             sizes, strides, dtype=t.dtype, device="meta"
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py
index 35578401bfdb0..8d7e12668d7f1 100644
--- a/torch/fx/experimental/symbolic_shapes.py
+++ b/torch/fx/experimental/symbolic_shapes.py
@@ -470,7 +470,7 @@ def _get_key(self):
         """
         return (len(self.replacements), len(self.divisible))
 
-    def create_symbolic_sizes_strides(self, ex: torch.Tensor):
+    def create_symbolic_sizes_strides_storage_offset(self, ex: torch.Tensor):
         """
         Returns a list of symbolic sizes and strides for the given tensor.
         We try our best to express stride in terms of the sizes, so as to not
@@ -520,7 +520,8 @@ def create_symbolic_sizes_strides(self, ex: torch.Tensor):
             assert isinstance(s, sympy.Symbol)
             self.replacements[s] = stride_expr
             sym_stride.append(self.create_symintnode(s))
-        return sym_size, sym_stride
+        sym_storage_offset = self.create_symintnode(self.create_symbol(ex.storage_offset()))
+        return sym_size, sym_stride, sym_storage_offset
 
     def create_symintnode(self, sym: "sympy.Expr"):
         assert isinstance(sym, sympy.Symbol) or isinstance(-sym, sympy.Symbol)

From 03fa4c1ad79e5fa97c3bb7f0eddf07ee7dfea470 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Tue, 6 Dec 2022 21:28:14 +0000
Subject: [PATCH 1632/1922] Fix uniform ref implementation (#90094)

Fixes https://github.com/pytorch/torchdynamo/issues/1954

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90094
Approved by: https://github.com/ngimel
---
 test/test_decomp.py                   | 14 ++++++++++++++
 torch/_decomp/decompositions.py       | 16 ++++++++++++++++
 torch/_prims/__init__.py              |  4 ++--
 torch/_refs/__init__.py               |  9 ++-------
 torch/_refs/nn/functional/__init__.py |  2 +-
 5 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/test/test_decomp.py b/test/test_decomp.py
index 264b62069e6c0..b947f72c586bd 100644
--- a/test/test_decomp.py
+++ b/test/test_decomp.py
@@ -8,6 +8,7 @@
 
 from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
 from torch.utils._mode_utils import no_dispatch
+from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
     is_iterable_of_tensors,
     TestCase,
@@ -392,6 +393,19 @@ def test_quick(self, device, dtype, op):
     def test_comprehensive(self, device, dtype, op):
         self.do_cross_ref(device, dtype, op, run_all=True)
 
+    def test_uniform(self, device):
+        size = (2, 3, 4, 5)
+        dtype = torch.float32
+        x = make_tensor(size, dtype=dtype, device=device)
+        low = 0.3
+        high = 0.9
+
+        torch.manual_seed(123)
+        ref = torch.ops.aten.uniform(x, low, high)
+        torch.manual_seed(123)
+        res = torch._decomp.decompositions.uniform(x, low=low, high=high)
+        self.assertEqual(ref, res)
+
     @skipIfTorchDynamo("Test does not work with TorchDynamo")
     def do_cross_ref(self, device, dtype, op, *, run_all):
         test_keys = [
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 1de7aabd703f8..cadfd14f715b8 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -7,6 +7,7 @@
 from typing import Callable, cast, Iterable, List, Optional, Tuple, Union
 
 import torch
+import torch._prims as prims
 import torch._prims_common as utils
 import torch.nn.functional as F
 from torch import Tensor
@@ -1929,6 +1930,21 @@ def norm(
     ).to(result_dtype)
 
 
+@register_decomposition(aten.uniform)
+def uniform(
+    x: Tensor,
+    low: Union[bool, int, float] = 0.0,
+    high: Union[bool, int, float] = 1.0,
+):
+    return prims._uniform_helper(
+        x.shape,
+        low=sym_float(low),
+        high=sym_float(high),
+        dtype=x.dtype,
+        device=x.device,
+    )
+
+
 # aten/src/ATen/native/UpSample.cpp compute_output_size
 def upsample_compute_output_size(input_size, output_size, scale_factors):
     spatial_dimensions = len(input_size) - 2
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index e764229b8bb5c..f1caf17fd19ac 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -199,7 +199,7 @@
     # Randomness Prims
     #
     "normal",
-    "uniform",
+    "_uniform_helper",
     #
     # FFT prims
     #
@@ -2707,7 +2707,7 @@ def _uniform_aten(
 """
 
 # TODO: we should more seriously review randomness modeling and prims
-uniform = _make_prim(
+_uniform_helper = _make_prim(
     schema=(
         "uniform(SymInt[] shape, *, Scalar low, Scalar high, ScalarType dtype, Device device) -> Tensor"
     ),
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 04bf9e12927fa..b0616191d1a80 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -285,10 +285,6 @@
     "zeros",
     "zeros_like",
     #
-    # Randomness References
-    #
-    "uniform",  # TODO: add OpInfo -- and testing for randomness?
-    #
     # Test-related functions
     #
     "allclose",
@@ -4685,8 +4681,7 @@ def scalar_tensor(
 #
 
 
-@register_decomposition(torch.ops.aten.uniform)
-def uniform(
+def _uniform_helper(
     shape: ShapeType,
     low: Union[bool, int, float] = 0.0,
     high: Union[bool, int, float] = 1.0,
@@ -4704,7 +4699,7 @@ def uniform(
     assert isinstance(dtype, torch.dtype)
     device = utils.canonicalize_device(device)
 
-    return prims.uniform(shape, low=low, high=high, dtype=dtype, device=device)
+    return prims._uniform_helper(shape, low=low, high=high, dtype=dtype, device=device)
 
 
 @register_decomposition(
diff --git a/torch/_refs/nn/functional/__init__.py b/torch/_refs/nn/functional/__init__.py
index 4ebe6e2b05d91..e979dfd9e03df 100644
--- a/torch/_refs/nn/functional/__init__.py
+++ b/torch/_refs/nn/functional/__init__.py
@@ -73,7 +73,7 @@ def _dropout_helper(
     """
 
     return (
-        refs.uniform(
+        refs._uniform_helper(
             self.shape, low=0.0, high=1.0, dtype=torch.float32, device=self.device
         )
         < val

From 933905c4fea56acb6d4c9440ca19eba4805e608e Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 6 Dec 2022 15:31:37 +0000
Subject: [PATCH 1633/1922] [FSDP] Test `use_orig_params=True` in
 `test_fsdp_ignored_modules.py` (#90290)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90290
Approved by: https://github.com/zhaojuanmao
---
 .../fsdp/test_fsdp_ignored_modules.py         | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_ignored_modules.py b/test/distributed/fsdp/test_fsdp_ignored_modules.py
index 83babee7d482f..297cd3f3ca606 100644
--- a/test/distributed/fsdp/test_fsdp_ignored_modules.py
+++ b/test/distributed/fsdp/test_fsdp_ignored_modules.py
@@ -99,6 +99,12 @@ def _train_model(self, model, optim, num_iters, device=torch.device("cuda")):
     def test_ignored_modules_transformer(self):
         """Tests that ignored modules' parameters are not flattened for a
         transformer model with shared parameters."""
+        self.run_subtests(
+            {"use_orig_params": [False, True]},
+            self._test_ignored_modules_transformer,
+        )
+
+    def _test_ignored_modules_transformer(self, use_orig_params: bool):
         # Initialize an FSDP-wrapped transformer model that has FSDP ignore
         # the `nn.Transformer` module's parameters
         model: nn.Module = TransformerWithSharedParams.init(
@@ -111,6 +117,7 @@ def test_ignored_modules_transformer(self):
             model,
             self.process_group,
             ignored_modules=[model.transformer],
+            use_orig_params=use_orig_params,
         )
         # Check that the wrapped model's flattened parameter does not include
         # the ignored transformer module's parameters
@@ -126,7 +133,8 @@ def test_ignored_modules_transformer(self):
         )
         nonignored_numel = total_numel - ignored_numel
         with FSDP.summon_full_params(wrapped_model):
-            flat_param_numel = wrapped_model.params[0].numel()
+            flat_param = wrapped_model.params[0]
+            flat_param_numel = flat_param.numel()
             self.assertEqual(flat_param_numel, nonignored_numel)
         # Check that we can run a few iterations
         optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3)
@@ -136,12 +144,20 @@ def test_ignored_modules_transformer(self):
     def test_ignored_modules_nested(self):
         """Tests that passing a module with nested FSDP modules does not
         error and still ignores non-FSDP modules' parameters."""
+        self.run_subtests(
+            {"use_orig_params": [False, True]},
+            self._test_ignored_modules_nested,
+        )
+
+    def _test_ignored_modules_nested(self, use_orig_params: bool):
         # Initialize an FSDP-wrapped nested model that first wraps the nested
         # sequential's second linear layer (`layer1[1]`) and then wraps the
         # overall model while ignoring the nested sequential (`layer1`)
         model = Model().cuda()
-        model.layer1[1] = FSDP(model.layer1[1])
-        wrapped_model = FSDP(model, ignored_modules=[model.layer1])
+        model.layer1[1] = FSDP(model.layer1[1], use_orig_params=use_orig_params)
+        wrapped_model = FSDP(
+            model, ignored_modules=[model.layer1], use_orig_params=use_orig_params
+        )
         # Check that the wrapped model's flattened parameter does not include
         # the ignored nested sequential's parameters
         nonwrapped_model = Model()
@@ -149,7 +165,8 @@ def test_ignored_modules_nested(self):
         ignored_numel = sum(p.numel() for p in nonwrapped_model.layer1.parameters())
         nonignored_numel = total_numel - ignored_numel
         with FSDP.summon_full_params(wrapped_model):
-            flat_param_numel = wrapped_model.params[0].numel()
+            flat_param = wrapped_model.params[0]
+            flat_param_numel = flat_param.numel()
             self.assertEqual(flat_param_numel, nonignored_numel)
         # Check that we can run a few iterations
         optim = torch.optim.Adam(wrapped_model.parameters(), lr=1e-3)

From ca05cfcf458d7260b2d9c4e7c226f6b5e8c23640 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 6 Dec 2022 15:31:37 +0000
Subject: [PATCH 1634/1922] [FSDP][BE] Clean up dead code from
 `clip_grad_norm_()` testing (#90250)

`FSDP.clip_grad_norm_()` is tested separately in `test_fsdp_clip_grad_norm.py`. This PR removes the dead non-run code from `common_fsdp.py` and `test_fsdp_core.py` related to `clip_grad_norm_()`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90250
Approved by: https://github.com/rohan-varma
---
 test/distributed/fsdp/test_fsdp_core.py | 15 ---------------
 torch/testing/_internal/common_fsdp.py  | 19 -------------------
 2 files changed, 34 deletions(-)

diff --git a/test/distributed/fsdp/test_fsdp_core.py b/test/distributed/fsdp/test_fsdp_core.py
index 988731206f1b9..c77378384ef60 100644
--- a/test/distributed/fsdp/test_fsdp_core.py
+++ b/test/distributed/fsdp/test_fsdp_core.py
@@ -138,14 +138,10 @@ def test_nested_wrapped_model_single_iteration_mixed_precision(
 
     @skip_if_lt_x_gpu(2)
     @parametrize(params, configs, subtest_name)
-    # TODO (awgu): 2.0 fails tests
-    # @parametrize("norm_type", [2.0, None])
-    @parametrize("norm_type", [None])
     def test_nested_always_wrap_model(
         self,
         cpu_offload: CPUOffload,
         sharding_strategy: Optional[ShardingStrategy],
-        norm_type: Optional[float],
     ):
         self.run_subtests(
             self._get_subtest_config(cpu_offload),
@@ -154,19 +150,14 @@ def test_nested_always_wrap_model(
             FSDPInitMode.RECURSIVE,
             cpu_offload=cpu_offload,
             sharding_strategy=sharding_strategy,
-            norm_type=norm_type,
         )
 
     @skip_if_lt_x_gpu(2)
     @parametrize(params, configs, subtest_name)
-    # TODO (awgu): 2.0 fails tests
-    # @parametrize("norm_type", [2.0, None])
-    @parametrize("norm_type", [None])
     def test_transformer(
         self,
         cpu_offload: CPUOffload,
         sharding_strategy: Optional[ShardingStrategy],
-        norm_type: Optional[float],
     ):
         self.run_subtests(
             self._get_subtest_config(cpu_offload),
@@ -174,7 +165,6 @@ def test_transformer(
             TransformerWithSharedParams,
             FSDPInitMode.RECURSIVE,
             cpu_offload=cpu_offload,
-            norm_type=norm_type,
             sharding_strategy=sharding_strategy,
         )
 
@@ -228,14 +218,10 @@ def _dummy_ddp_fn(self, model):
 
     @skip_if_lt_x_gpu(2)
     @parametrize(params, configs, subtest_name)
-    # TODO (awgu): 2.0 fails tests
-    # @parametrize("norm_type", [2.0, None])
-    @parametrize("norm_type", [None])
     def test_mixture_of_experts(
         self,
         cpu_offload: CPUOffload,
         sharding_strategy: Optional[ShardingStrategy],
-        norm_type: Optional[float],
     ):
         self.run_subtests(
             self._get_subtest_config(cpu_offload),
@@ -245,7 +231,6 @@ def test_mixture_of_experts(
             ref_init_fn=self._dummy_ddp_fn,
             cpu_offload=cpu_offload,
             sharding_strategy=sharding_strategy,
-            norm_type=norm_type,
         )
 
     @skip_if_lt_x_gpu(2)
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 405d8a5eb235f..0ef5b65b80c89 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -796,7 +796,6 @@ def _train_for_several_steps(
         autocast: bool,
         lr: float = 0.01,
         fsdp_cpu_offload: Optional[CPUOffload] = None,
-        norm_type: Optional[Union[float, int]] = None,
         save_model: bool = False,
         mixed_precision: Optional[MixedPrecision] = None,
         enable_sharded_grad_scaler: bool = False,
@@ -843,21 +842,6 @@ def _train_for_several_steps(
                 else:
                     self.assertEqual(loss.dtype, torch.float32)
             model.module.run_backward(loss)
-            if norm_type is not None:
-                max_norm = 0.3
-                if isinstance(model, FSDP):
-                    model.clip_grad_norm_(max_norm, norm_type)
-                    total_norm_after_clip = _collect_total_grad_norm_fsdp(
-                        model, norm_type, self.rank
-                    )
-                else:
-                    torch.nn.utils.clip_grad_norm_(
-                        model.parameters(), max_norm, norm_type
-                    )
-                    total_norm_after_clip = _collect_total_grad_norm_local(
-                        model, norm_type
-                    )
-                self.assertTrue(total_norm_after_clip <= max_norm)
             # Post-backward, if CPU offloading model params should be on CPU.
             if cpu_offload_params and isinstance(model, FSDP):
                 for p in model.parameters():
@@ -895,7 +879,6 @@ def _test_fsdp_parity(
         use_orig_params: bool = False,
         enable_sharded_grad_scaler: bool = False,
         use_pure_fp16: bool = False,
-        norm_type: Optional[Union[float, int]] = None,
         init_kwargs: Optional[Dict[str, Any]] = None,
         **fsdp_kwargs,
     ):
@@ -941,7 +924,6 @@ def _test_fsdp_parity(
             lr=lr,
             fsdp_cpu_offload=cpu_offload,
             mixed_precision=mixed_precision,
-            norm_type=norm_type,
             enable_sharded_grad_scaler=enable_sharded_grad_scaler,
             use_pure_fp16=use_pure_fp16,
         )
@@ -1008,7 +990,6 @@ def _test_fsdp_parity(
                 fsdp_cpu_offload=cpu_offload,
                 save_model=save_model,
                 mixed_precision=mixed_precision,
-                norm_type=norm_type,
                 enable_sharded_grad_scaler=enable_sharded_grad_scaler,
                 use_pure_fp16=use_pure_fp16,
             )

From cf708b6bea580b0bddaa06e3ce9ffc4d39442e7d Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Tue, 6 Dec 2022 15:31:38 +0000
Subject: [PATCH 1635/1922] [FSDP] Clarify loss dtype check in
 `_test_fsdp_parity` (#90251)

A recent PR deprecated `torch.testing.assert_allclose` in favor of `torch.testing.assert_close` and left a `TODO`. This PR follows up to confirm that we do intend to have `check_dtype=False`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90251
Approved by: https://github.com/rohan-varma
---
 torch/testing/_internal/common_fsdp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 0ef5b65b80c89..5cd3d326e0e3a 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -1003,8 +1003,8 @@ def _test_fsdp_parity(
                 self.assertEqual(param.device, cpu_device)
             fsdp_loss = fsdp_loss.cuda()
         fsdp_unsharded_params = get_full_params(fsdp_model)
-        # TODO: Are mismatching dtypes actually ok here or did this pass silently before, because `check_dtype=False`
-        #  was the default?
+        # Do not check dtype since the reference DDP loss may not be the same
+        # dtype as the FSDP loss in the case of mixed precision
         torch.testing.assert_close(ref_loss, fsdp_loss, check_dtype=False)
         # Do not check for parameter parity if using mixed precision since (1)
         # the DDP parameters are in FP16 (from `half()`) while the FSDP

From 9989dc50fb0ee978a99625b71a0c0454e21e4a9e Mon Sep 17 00:00:00 2001
From: fduwjj <fduwjj@fb.com>
Date: Tue, 6 Dec 2022 18:39:05 +0000
Subject: [PATCH 1636/1922] [Upstream _NamedOptimzer] Reland PR (89480)
 (#90293)

Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):

Reland https://github.com/pytorch/pytorch/pull/89480/
* #90294
* __->__ #90293

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90293
Approved by: https://github.com/awgu
---
 .../distributed/optim/test_named_optimizer.py | 245 +++++++++++++++++
 torch/distributed/optim/__init__.py           |   1 +
 torch/distributed/optim/named_optimizer.py    | 258 ++++++++++++++++++
 3 files changed, 504 insertions(+)
 create mode 100644 test/distributed/optim/test_named_optimizer.py
 create mode 100644 torch/distributed/optim/named_optimizer.py

diff --git a/test/distributed/optim/test_named_optimizer.py b/test/distributed/optim/test_named_optimizer.py
new file mode 100644
index 0000000000000..880dbb382aa6a
--- /dev/null
+++ b/test/distributed/optim/test_named_optimizer.py
@@ -0,0 +1,245 @@
+# Owner(s): ["oncall: distributed"]
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+import torch.nn as nn
+
+from torch.distributed.optim import _NamedOptimizer
+
+
+class TestDummyModel(torch.nn.Module):
+    def __init__(self):
+        super(TestDummyModel, self).__init__()
+        torch.manual_seed(0)
+        self.net1 = nn.Sequential(nn.Linear(8, 16), nn.ReLU())
+        self.net2 = nn.Sequential(nn.Linear(16, 32), nn.ReLU())
+        self.net3 = nn.Linear(32, 64)
+        self.net4 = nn.Sequential(nn.ReLU(), nn.Linear(64, 8))
+
+    def forward(self, x):
+        return self.net4(self.net3(self.net2(self.net1(x))))
+
+
+class NamedOptimizerTest(unittest.TestCase):
+    def _compare_state_dict_group(self, group, named_group, assert_equal=True):
+        for key, val in group.items():
+            if key != "params":
+                self.assertTrue(
+                    key in named_group, f"{key} not in named optimizer state dict"
+                )
+                err_msg = (
+                    f"{key} state not equal" if assert_equal else f"{key} state equal"
+                )
+                if isinstance(val, torch.Tensor):
+                    fn = self.assertTrue if assert_equal else self.assertFalse
+                    fn(torch.allclose(val, named_group[key]), err_msg)
+                else:
+                    fn = self.assertEqual if assert_equal else self.assertNotEqual
+                    fn(val, named_group[key], err_msg)
+
+    def test_state_dict(self):
+        """Check that NamedOptimizer exposes the expected state dict
+        interface."""
+        m = TestDummyModel()
+        m_dup = TestDummyModel()
+        optim_1 = torch.optim.SGD(
+            [
+                {"params": m.net1.parameters()},
+                {"params": m.net3.parameters(), "lr": 1e-3},
+            ],
+            lr=1e-2,
+            momentum=0.9,
+        )
+
+        optim_2 = torch.optim.Adam(
+            [
+                {"params": m.net2.parameters()},
+                {"params": m.net4.parameters(), "lr": 1e-5},
+            ]
+        )
+
+        named_optim_1 = _NamedOptimizer(
+            m_dup.named_parameters(),
+            torch.optim.SGD,
+            [
+                {"params": m_dup.net1.parameters()},
+                {"params": m_dup.net3.parameters(), "lr": 1e-3},
+            ],
+            lr=1e-2,
+            momentum=0.9,
+        )
+
+        named_optim_2 = _NamedOptimizer(
+            m_dup.named_parameters(),
+            torch.optim.Adam,
+            [
+                {"params": m_dup.net2.parameters()},
+                {"params": m_dup.net4.parameters(), "lr": 1e-5},
+            ],
+        )
+        for i in range(2):
+            x = torch.rand(5, 8)
+            y = m(x)
+            y.sum().backward()
+            optim_1.step()
+            optim_2.step()
+
+            y = m_dup(x)
+            y.sum().backward()
+            named_optim_1.step()
+            named_optim_2.step()
+
+        sd_1 = optim_1.state_dict()
+        sd_2 = optim_2.state_dict()
+        named_sd_1 = named_optim_1.state_dict()
+        named_sd_2 = named_optim_2.state_dict()
+
+        # Compare "state" in optim state dict
+        self._compare_state_dict_group(
+            sd_1["state"][0],
+            named_sd_1["state"]["net1.0.weight"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            sd_2["state"][1],
+            named_sd_2["state"]["net2.0.bias"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            sd_1["state"][2],
+            named_sd_1["state"]["net3.weight"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            sd_2["state"][3],
+            named_sd_2["state"]["net4.1.bias"],
+            assert_equal=True,
+        )
+
+        # Compare "param_groups" in optim state dict
+        self._compare_state_dict_group(
+            sd_1["param_groups"][0],
+            named_sd_1["param_groups"][0],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            sd_2["param_groups"][1], named_sd_2["param_groups"][1], assert_equal=True
+        )
+
+    def test_load_state_dict(self):
+        """Check that NamedOptimizer exposes the expected state dict
+        interface."""
+        m = TestDummyModel()
+        named_optim_1 = _NamedOptimizer(
+            m.named_parameters(),
+            torch.optim.SGD,
+            lr=1e-2,
+            momentum=0.9,
+        )
+
+        for _ in range(2):
+            x = torch.rand(5, 8)
+            y = m(x)
+            y.sum().backward()
+            named_optim_1.step()
+
+        state_dict_to_load = named_optim_1.state_dict()
+
+        named_optim_2 = _NamedOptimizer(
+            m.named_parameters(),
+            torch.optim.SGD,
+            lr=1e-2,
+            momentum=0.6,
+        )
+
+        for _ in range(2):
+            x = torch.rand(5, 8)
+            y = m(x)
+            y.sum().backward()
+            named_optim_2.step()
+
+        state_dict_before_load = named_optim_2.state_dict()
+
+        # Compare "state" in optim state dict
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net1.0.weight"],
+            state_dict_before_load["state"]["net1.0.weight"],
+            assert_equal=False,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net2.0.bias"],
+            state_dict_before_load["state"]["net2.0.bias"],
+            assert_equal=False,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net3.weight"],
+            state_dict_before_load["state"]["net3.weight"],
+            assert_equal=False,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net4.1.bias"],
+            state_dict_before_load["state"]["net4.1.bias"],
+            assert_equal=False,
+        )
+
+        named_optim_2.load_state_dict(state_dict_to_load)
+        state_dict_after_load = named_optim_2.state_dict()
+
+        # Compare "state" in optim state dict
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net1.0.weight"],
+            state_dict_after_load["state"]["net1.0.weight"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net2.0.bias"],
+            state_dict_after_load["state"]["net2.0.bias"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net3.weight"],
+            state_dict_after_load["state"]["net3.weight"],
+            assert_equal=True,
+        )
+        self._compare_state_dict_group(
+            state_dict_to_load["state"]["net4.1.bias"],
+            state_dict_after_load["state"]["net4.1.bias"],
+            assert_equal=True,
+        )
+
+    def test_load_state_dict_error(self):
+        m = TestDummyModel()
+        named_optim_1 = _NamedOptimizer(
+            m.named_parameters(),
+            torch.optim.SGD,
+            lr=1e-2,
+            momentum=0.9,
+        )
+
+        for _ in range(2):
+            x = torch.rand(5, 8)
+            y = m(x)
+            y.sum().backward()
+            named_optim_1.step()
+
+        state_dict_to_load = named_optim_1.state_dict()
+
+        named_optim_2 = _NamedOptimizer(
+            m.named_parameters(),
+            torch.optim.SGD,
+            lr=1e-2,
+            momentum=0.6,
+        )
+
+        err_msg = (
+            "Expects the optim to be initialized before load but found not initialized"
+        )
+        with self.assertRaisesRegex(ValueError, err_msg):
+            named_optim_2.load_state_dict(state_dict_to_load)
diff --git a/torch/distributed/optim/__init__.py b/torch/distributed/optim/__init__.py
index 950222b8d5fa8..9a83c7aabf772 100644
--- a/torch/distributed/optim/__init__.py
+++ b/torch/distributed/optim/__init__.py
@@ -16,6 +16,7 @@
 from .functional_rmsprop import _FunctionalRMSprop
 from .functional_rprop import _FunctionalRprop
 from .functional_adamax import _FunctionalAdamax
+from .named_optimizer import _NamedOptimizer
 from .utils import as_functional_optim
 from .apply_optimizer_in_backward import _apply_optimizer_in_backward
 
diff --git a/torch/distributed/optim/named_optimizer.py b/torch/distributed/optim/named_optimizer.py
new file mode 100644
index 0000000000000..81273c65f5332
--- /dev/null
+++ b/torch/distributed/optim/named_optimizer.py
@@ -0,0 +1,258 @@
+import logging
+import warnings
+
+from copy import deepcopy
+from typing import Any, Collection, Dict, List, Mapping, Union
+
+import torch
+from torch import optim
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+
+
+__all__ : List[str] = []
+
+logger = logging.getLogger(__name__)
+
+
+class _NamedOptimizer(optim.Optimizer):
+    """
+    ``_NamedOptimizer`` takes a dict of parameters and exposes ``state_dict`` by
+    parameter key. We replace the original key (number) in an optim to the
+    fully qualifed name (FQN) string. User can initialize the optim as they
+    initialize a PyTorch optim, the only difference is that they also need to
+    pass in the FQN of each parameters.
+
+    Args:
+        named_parameters (Mapping[str, Union[torch.Tensor, ShardedTensor]]):
+            Mapping from FQN to parameter.
+        optimizer_class (optim.Optimizer):
+            The class of optimizer to instantiate.
+        param_groups (Collection[Mapping[str, Any]]):
+            `param_groups` to pass to optimizer if specified.
+            The key of the inner map needs to be FQNs.
+            Default: None
+        args: arguments to pass to the optimizer constructor.
+        kwargs: arguments to pass to the optimizer constructor.
+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> from torch import optim
+        >>> from torch.distributed.optim import _NamedOptimizer
+        >>>
+        >>> # Define the named optimizer.
+        >>> m = Model(...)
+        >>> named_optim = _NamedOptimizer(m.named_parameters(), optim.SGD)
+        >>> # Forward pass + backward pass.
+        >>> named_optim.step()
+        >>> ...
+        >>> # Call state_dict for the named optimizer returns a FQN state_dict.
+        >>> named_optim.state_dict()
+
+    Warning: This API is still in development and subject to change.
+
+    TODO: Add tutorial for _NamedOptimizer.
+    TODO: Add documentation in the docstring for the public attributes
+          like self.param_groups and self.named_parameters.
+    """
+
+    def __init__(
+        self,
+        named_parameters: Mapping[str, Union[torch.Tensor, ShardedTensor]],
+        optimizer_class: optim.Optimizer,
+        param_groups: Collection[Mapping[str, Any]] = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        torch._C._log_api_usage_once("torch.distributed.optim._NamedOptimizer")
+        self.param_groups: Collection[Mapping[str, Any]] = param_groups  # type: ignore[assignment]
+        self.named_parameters = dict(named_parameters)
+        params_for_optimizer = (
+            self.named_parameters.values() if param_groups is None else param_groups
+        )
+        self._optimizer = optimizer_class(  # type: ignore[operator]
+            params_for_optimizer,
+            *args,
+            **kwargs,
+        )
+        # TODO: Add param_groups validations and unit tests.
+        if param_groups is None:
+            self.ordered_param_keys = list(self.named_parameters.keys())
+        else:
+            warnings.warn(
+                "Since we pass in param_groups, we will use param_groups to "
+                "initialize the optimizer, not all parameters of the module."
+            )
+            param_to_key = {param: key for key, param in self.named_parameters.items()}  # type: ignore[misc, has-type]
+            ordered_param_keys = []
+            for group in param_groups:
+                for param in group["params"]:
+                    if param not in param_to_key:
+                        raise ValueError(
+                            f"Expect param name {param} found in param group but is missing."
+                        )
+                    ordered_param_keys.append(param_to_key[param])
+            self.ordered_param_keys = ordered_param_keys
+
+    def state_dict(self) -> Dict[str, Any]:
+        """
+        Return the ``state_dict`` of the optimzer. Instead of using number to index
+        parameters, we will use module fully qualifed name (FQN) as the key.
+        """
+        state_dict = self._optimizer.state_dict()
+        param_groups = state_dict["param_groups"]
+
+        ret_state = {
+            self.ordered_param_keys[st_key]: state_val
+            for st_key, state_val in state_dict["state"].items()
+        }
+
+        ret_groups = []
+        for group in param_groups:
+            param_keys = []
+            for param in group["params"]:
+                param_keys.append(self.ordered_param_keys[param])
+            ret_group = {"params": sorted(param_keys)}
+            for k, v in group.items():
+                if k != "params":
+                    ret_group[k] = deepcopy(v)
+            ret_groups.append(ret_group)
+
+        return {"state": ret_state, "param_groups": ret_groups}
+
+    def step(self):
+        """
+        Performs a single optimization step.
+
+        This will call :meth:`torch.optim.Optimizer.step` on the wrapped
+        optimizer.
+        """
+        self._optimizer.step()
+
+    def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
+        """
+        This public function defines the default behavior to load a state_dict
+        for ``_NamedOptimizer``.
+
+        Sample Code
+        ```
+            my_model = MyModule()
+            optimizer = _NamedOptimizer(my_model.named_parameters(), Adagrad)
+            ...
+
+            optim_state_dict = optimizer.state_dict()
+            ...
+            ...
+
+            optimizer.load_state_dict(optim_state_dict)
+            ...
+        ```
+        Args:
+            state_dict (Dict[str, Any]) : A ``state_dict`` to load into the optimizer.
+                Note that this state dict update is performed in place.
+
+        .. note:: PyTorch is using lazy init to initialize the optim states.
+            So it is possible that there is no optim state when user call
+            ``load_state_dict`` and for ``_NamedOptimizer`` we make it stricter
+            that users can only call ``load_state_dict`` after the state is initialized.
+            By doing this, we can validate the optim ``state_dict`` to be loaded.
+        """
+        new_state_dict = self._optimizer.state_dict()
+        state = state_dict["state"]
+        new_state = new_state_dict["state"]
+        if len(new_state) == 0:
+            raise ValueError(
+                "Expects the optim to be initialized before load but found not initialized."
+            )
+
+        # Load state of state_dict
+        if len(new_state) != len(state):
+            raise ValueError(
+                f"Expects equal length as {len(new_state)} in `state_dict` state length but found {len(state)}."
+            )
+        for idx, param_key in enumerate(self.ordered_param_keys):
+            if param_key not in state.keys():
+                raise ValueError(
+                    f"Expect {param_key} as a parameter in `state_dict` state but not found."
+                )
+            if len(state[param_key]) != len(new_state[idx]):
+                raise ValueError(
+                    f"Expects equal length as {len(new_state[idx])} for parameter {param_key} but found: {len(state[param_key])}"
+                )
+            # Iterate through all optimizer states.
+            for state_key, state_val in new_state[idx].items():
+                if state_key not in state[param_key]:
+                    raise ValueError(
+                        f"Expects state {state_key} for parameter {param_key} but not found."
+                    )
+
+                src_state_val = state[param_key][state_key]
+                if isinstance(state_val, ShardedTensor):
+                    assert isinstance(src_state_val, ShardedTensor)
+                    num_shards = len(state_val.local_shards())
+                    num_new_shards = len(src_state_val.local_shards())
+                    if num_shards != num_new_shards:
+                        raise ValueError(
+                            f"Expects equal number of shards as {num_new_shards} but found {num_shards} for {param_key}/{state_key}"
+                        )
+                    for shard, src_shard in zip(
+                        state_val.local_shards(), src_state_val.local_shards()
+                    ):
+                        shard.tensor.detach().copy_(src_shard.tensor)
+                elif isinstance(state_val, torch.Tensor):
+                    assert isinstance(src_state_val, torch.Tensor)
+                    state_val.detach().copy_(src_state_val)
+                else:
+                    new_state[idx][state_key] = deepcopy(src_state_val)
+
+        # Load param_groups of state_dict
+        src_param_groups = state_dict["param_groups"]
+        new_param_groups = new_state_dict["param_groups"]
+
+        if len(new_param_groups) != len(src_param_groups):
+            raise ValueError(
+                f"Expects equal param_groups count as {len(new_param_groups)} in `state_dict` but found {len(src_param_groups)}."
+            )
+        src_group_map = {}
+        for group in src_param_groups:
+            param_keys = []
+            for param_key in group["params"]:
+                param_keys.append(param_key)
+            src_group_map[_gen_param_group_key(param_keys)] = group
+        new_group_map = {}
+        for new_group in new_param_groups:
+            param_keys = []
+            for param_key in new_group["params"]:
+                param_keys.append(self.ordered_param_keys[param_key])  # type: ignore[call-overload]
+            new_group_map[_gen_param_group_key(param_keys)] = new_group
+        for group_key, new_group in new_group_map.items():
+            if group_key not in src_group_map:
+                raise ValueError(
+                    f"Expects group {group_key} to be in `state_dict` but is missing"
+                )
+            src_group = src_group_map[group_key]
+            if len(src_group) != len(new_group):
+                raise ValueError(
+                    f"Expects equal param_group size as {len(new_group)} for group {group_key} but found {len(src_group)}."
+                )
+            for k in src_group:
+                if k not in new_group:
+                    raise ValueError(
+                        f"Expects group key {k} to be in group {group_key} in `state_dict` but is missing."
+                    )
+                if k != "params":
+                    new_group[k] = deepcopy(src_group[k])
+
+        self._optimizer.load_state_dict(new_state_dict)
+
+    # pyre-ignore [2]
+    def add_param_group(self, param_group: Any) -> None:
+        raise NotImplementedError(
+            "add_param_group not supported yet and might be implemented soon."
+        )
+
+
+def _gen_param_group_key(param_keys: List[str]) -> str:
+    """
+    Concatenate all param keys as a unique indentifier for one param group.
+    """
+    return "/".join(sorted(param_keys))

From 58aca8664257b6f932db13e8471b279cfb655ade Mon Sep 17 00:00:00 2001
From: eqy <eddiey@nvidia.com>
Date: Tue, 6 Dec 2022 22:03:05 +0000
Subject: [PATCH 1637/1922] [CUDA Graphs] Add option to dump a captured graph
 for debugging (#85519)

CC @xwang233 @ptrblck @ngimel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/85519
Approved by: https://github.com/ngimel
---
 aten/src/ATen/cuda/CUDAGraph.cpp | 55 +++++++++++++++++++++++++-------
 aten/src/ATen/cuda/CUDAGraph.h   |  2 ++
 torch/_C/__init__.pyi.in         |  3 ++
 torch/csrc/cuda/Graph.cpp        | 16 +++++++++-
 torch/cuda/graphs.py             | 16 ++++++++++
 5 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp
index 2d989d884ee34..92eddeb4b755c 100644
--- a/aten/src/ATen/cuda/CUDAGraph.cpp
+++ b/aten/src/ATen/cuda/CUDAGraph.cpp
@@ -8,6 +8,8 @@
 namespace at {
 namespace cuda {
 
+static bool _cuda_graphs_debug = false;
+
 MempoolId_t graph_pool_handle() {
 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
   // uuid count starts at 1. 0 is reserved to mean "wasn't set by graph_pool_handle".
@@ -16,7 +18,7 @@ MempoolId_t graph_pool_handle() {
   // cudaStreamGetCaptureInfo id_s in capture_begin.
   return {0, uuid++};
 #else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and is not yet supported on ROCM");
   return {0, 0};
 #endif
 }
@@ -46,7 +48,7 @@ CUDAGraph::CUDAGraph()
   // CUDAStreams may not be default-constructed.
   : capture_stream_(at::cuda::getCurrentCUDAStream()) {
 #if (defined(CUDA_VERSION) && CUDA_VERSION < 11000) || defined(USE_ROCM)
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and is not yet supported on ROCM");
 #endif
 }
 
@@ -122,7 +124,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/) {
   // kernel will end up as part of the capture or not.
   c10::cuda::CUDACachingAllocator::notifyCaptureBegin(capture_dev_, id_, mempool_id_);
 #else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and is not yet supported on ROCM");
 #endif
 }
 
@@ -186,12 +188,17 @@ void CUDAGraph::capture_end() {
                  "attempted to be captured on wrong device or stream.");
   }
 
-  // Now that we've instantiated graph_ into graph_exec_,
-  // we don't need graph_ anymore.
-  AT_CUDA_CHECK(cudaGraphDestroy(graph_));
-  has_graph_ = false;
+  // check if debug path is set
+  if (!_cuda_graphs_debug) {
+    // Now that we've instantiated graph_ into graph_exec_,
+    // we don't need graph_ anymore.
+    AT_CUDA_CHECK(cudaGraphDestroy(graph_));
+    has_graph_ = false;
+  } else {
+    TORCH_WARN("DEBUG: TORCH_CUDAGRAPHS_DEBUG_PATH detected. graph_ will not be freed until debug_dump is called.");
+  }
 #else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and is not yet supported on ROCM");
 #endif
 }
 
@@ -226,7 +233,33 @@ void CUDAGraph::replay() {
     AT_CUDA_CHECK(cudaDeviceSynchronize());
   }
 #else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and is not yet supported on ROCM");
+#endif
+}
+
+void CUDAGraph::enable_debug_mode() {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+  _cuda_graphs_debug = true;
+#else
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and is not yet supported on ROCM");
+#endif
+
+}
+
+void CUDAGraph::debug_dump(const std::string& debug_path) {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+  if (_cuda_graphs_debug) {
+    TORCH_WARN("DEBUG: calling debug_dump()");
+    if (has_graph_) {
+      TORCH_WARN("DEBUG: calling cudaGraphDebugDotPrint() with ", debug_path);
+      C10_CUDA_CHECK_WARN(cudaGraphDebugDotPrint(graph_, debug_path.c_str(), 1<<10)); // most verbose output
+      AT_CUDA_CHECK(cudaGraphDestroy(graph_));
+    }
+  } else {
+    TORCH_WARN("CUDA Graphs debug not enabled, set with torch._C._cuda_enable_graphs_debug_mode");
+  }
+#else
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and is not yet supported on ROCM");
 #endif
 }
 
@@ -262,7 +295,7 @@ void CUDAGraph::reset() {
     C10_CUDA_CHECK_WARN(cudaGraphExecDestroy(graph_exec_));
   }
 #else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and is not yet supported on ROCM");
 #endif
 }
 
@@ -272,7 +305,7 @@ MempoolId_t CUDAGraph::pool() {
   TORCH_CHECK(has_graph_exec_,
               "Called CUDAGraph::pool() without a preceding successful capture.");
 #else
-  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and not yet supported on ROCM");
+  TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0 and is not yet supported on ROCM");
 #endif
   return mempool_id_;
 }
diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h
index bacad79102a3e..fa5a73b65e05e 100644
--- a/aten/src/ATen/cuda/CUDAGraph.h
+++ b/aten/src/ATen/cuda/CUDAGraph.h
@@ -24,6 +24,8 @@ struct TORCH_CUDA_CPP_API CUDAGraph {
   void replay();
   void reset();
   MempoolId_t pool();
+  void enable_debug_mode();
+  void debug_dump(const std::string& debug_path);
 
   protected:
 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index bbe9a71e8718a..0f47074aed48e 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -1306,6 +1306,9 @@ class _CUDAGraph:
     def replay(self) -> None: ...
     def reset(self) -> None: ...
     def pool(self) -> Tuple[_int, _int]: ...
+    def enable_debug_mode(self) -> None: ...
+    def debug_dump(self,
+                   debug_path: str) -> None: ...
 
 def _cuda_isCurrentStreamCapturing() -> _bool: ...
 
diff --git a/torch/csrc/cuda/Graph.cpp b/torch/csrc/cuda/Graph.cpp
index 6d3a77c365e1b..f43a7debb5e41 100644
--- a/torch/csrc/cuda/Graph.cpp
+++ b/torch/csrc/cuda/Graph.cpp
@@ -48,5 +48,19 @@ void THCPGraph_init(PyObject* module) {
       .def(
           "pool",
           torch::wrap_pybind_function(&at::cuda::CUDAGraph::pool),
-          py::call_guard<py::gil_scoped_release>());
+          py::call_guard<py::gil_scoped_release>())
+      .def(
+          "debug_dump",
+          torch::wrap_pybind_function(&::at::cuda::CUDAGraph::debug_dump),
+          py::call_guard<py::gil_scoped_release>())
+      .def(
+          "enable_debug_mode",
+          torch::wrap_pybind_function(
+              &::at::cuda::CUDAGraph::enable_debug_mode),
+          py::call_guard<py::gil_scoped_release>())
+      .def(
+          "debug_dump",
+          torch::wrap_pybind_function(&::at::cuda::CUDAGraph::debug_dump),
+          py::call_guard<py::gil_scoped_release>(),
+          py::arg("debug_path"));
 }
diff --git a/torch/cuda/graphs.py b/torch/cuda/graphs.py
index ede08ae362c59..303f10a5bd199 100644
--- a/torch/cuda/graphs.py
+++ b/torch/cuda/graphs.py
@@ -100,6 +100,22 @@ def pool(self):
         """
         return super(CUDAGraph, self).pool()
 
+    def enable_debug_mode(self):
+        r"""
+        Enables debugging mode for CUDAGraph.debug_dump.
+        """
+        return super(CUDAGraph, self).enable_debug_mode()
+
+    def debug_dump(self, debug_path):
+        r"""
+        Arguments:
+            debug_path (required): Path to dump the graph to.
+
+        Calls a debugging function to dump the graph if the debugging is
+        enabled via CUDAGraph.enable_debug_mode()
+        """
+        return super(CUDAGraph, self).debug_dump(debug_path)
+
 
 class graph(object):
     r"""

From eaad2ee717aaf7c3a1b57b928446e41b15a8ab24 Mon Sep 17 00:00:00 2001
From: Eli Uriegas <eliuriegas@fb.com>
Date: Tue, 6 Dec 2022 12:14:37 -0800
Subject: [PATCH 1638/1922] Hotfix to unblock TRT unit tests internally
 (#90313)

Signed-off-by: Eli Uriegas <eliuriegas@fb.com>

Export of [D41778303](https://www.internalfb.com/diff/D41778303)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90313
Approved by: https://github.com/ezyang, https://github.com/malfet
---
 torch/_dynamo/config.py       | 4 ++++
 torch/_dynamo/output_graph.py | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index bd3f29cb3f378..2508c35a39bdc 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -175,6 +175,10 @@
 
 debug_dir_root = os.path.join(os.getcwd(), "torchdynamo_debug")
 
+# this is to resolve a import problem in fbcode, we will be deleting
+# this very shortly
+DO_NOT_USE_legacy_non_fake_example_inputs = False
+
 
 class _AccessLimitingConfig(ModuleType):
     def __setattr__(self, name, value):
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 2dabef444cb89..5fcae0fbafa79 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -580,6 +580,8 @@ def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn:
             )
             if torch._dynamo.debug_utils.MINIFIER_SPAWNED or is_top_level_minifying:
                 compiled_fn = compiler_fn(gm, self.example_inputs())
+            elif config.DO_NOT_USE_legacy_non_fake_example_inputs:
+                compiled_fn = compiler_fn(gm, self.example_inputs())
             else:
                 compiled_fn = compiler_fn(gm, self.fake_example_inputs())
             _step_logger()(logging.INFO, f"done compiler function {name}")

From 42ee41aabd0544e32bbc470c8108c970829e3a64 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang@meta.com>
Date: Tue, 6 Dec 2022 22:17:14 +0000
Subject: [PATCH 1639/1922] Disable dynamo tracing torchrec.distributed
 (#90087)

Summary: Context at T138318923

Test Plan: mannual test

Reviewed By: yf225

Differential Revision: D41631076

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90087
Approved by: https://github.com/yf225
---
 torch/_dynamo/skipfiles.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/torch/_dynamo/skipfiles.py b/torch/_dynamo/skipfiles.py
index 41a04626756d2..3413079a79f66 100644
--- a/torch/_dynamo/skipfiles.py
+++ b/torch/_dynamo/skipfiles.py
@@ -146,7 +146,10 @@ def add(import_name: str):
     if isinstance(import_name, types.ModuleType):
         return add(import_name.__name__)
     assert isinstance(import_name, str)
-    module_spec = importlib.util.find_spec(import_name)
+    try:
+        module_spec = importlib.util.find_spec(import_name)
+    except ModuleNotFoundError:
+        return
     if not module_spec:
         return
     origin = module_spec.origin
@@ -189,6 +192,7 @@ def check(filename, allow_torch=False):
     "tvm",
     "fx2trt_oss",
     "xarray",
+    "torchrec.distributed",
 ):
     add(_name)
 

From 4df16e9758be07745980b520eefdd58bc38b711c Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Tue, 6 Dec 2022 22:22:17 +0000
Subject: [PATCH 1640/1922] Remove non-existing parameter from docstring
 (#90163)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90163
Approved by: https://github.com/clee2000
---
 torch/fx/subgraph_rewriter.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torch/fx/subgraph_rewriter.py b/torch/fx/subgraph_rewriter.py
index e7d239d4699c9..c11f4b9efa2bc 100644
--- a/torch/fx/subgraph_rewriter.py
+++ b/torch/fx/subgraph_rewriter.py
@@ -80,7 +80,6 @@ def replace_pattern(
         ``gm``: The GraphModule that wraps the Graph to operate on
         ``pattern``: The subgraph to match in ``gm`` for replacement
         ``replacement``: The subgraph to replace ``pattern`` with
-        ``match_filter``: A function that takes in (`InternalMatch`, original_graph, pattern_graph)
 
     Returns:
         List[Match]: A list of ``Match`` objects representing the places

From bafeb975742bcd36085b106fa758a8ac3a6b0f89 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Tue, 6 Dec 2022 22:24:43 +0000
Subject: [PATCH 1641/1922] Remove logging.CODE (#90234)

Fixes https://github.com/pytorch/torchdynamo/issues/1932

Discussed with @mlazos: if we still want to separate streams for code logging and the rest of info, we can use a separate logger object with a unique name.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90234
Approved by: https://github.com/ezyang
---
 torch/_dynamo/config.py           |  8 ++----
 torch/_dynamo/convert_frame.py    | 48 ++++++++++++++++---------------
 torch/_dynamo/logging.py          |  5 ----
 torch/_dynamo/output_graph.py     |  2 +-
 torch/_inductor/codegen/triton.py |  4 ++-
 torch/_inductor/graph.py          |  5 +++-
 6 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 2508c35a39bdc..838a65c152a3c 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -6,9 +6,6 @@
 
 import torch
 
-# needed so that CODE is registered as a level in logging
-from . import logging as torchdynamo_logging  # noqa: F401
-
 try:
     import torch._prims
     import torch._refs
@@ -20,13 +17,14 @@
 
 # log level (levels print what it says + all levels listed below it)
 # logging.DEBUG print full traces <-- lowest level + print tracing of every instruction
-# logging.CODE print compiled functions + graphs (NOTE: can only be used after importing torch._dynamo.logging)
-# logging.INFO print the steps that dynamo is running
+# logging.INFO print the steps that dynamo is running and optionally, compiled functions + graphs
 # logging.WARN print warnings (including graph breaks)
 # logging.ERROR print exceptions (and what user code was being processed when it occurred)
 # NOTE: changing log_level will automatically update the levels of all torchdynamo loggers
 log_level = logging.WARNING
 
+output_code = False
+
 # the name of a file to write the logs to
 log_file_name = None
 
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index c135a3d6c59c8..e57bcbadee6db 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -409,26 +409,25 @@ def transform(instructions, code_options):
                 return None
         output_codes.add(out_code)
 
-        log.log(
-            logging.CODE,  # type: ignore[attr-defined]
-            format_bytecode(
-                "ORIGINAL BYTECODE",
-                code.co_name,
-                code.co_filename,
-                code.co_firstlineno,
-                code,
-            ),
-        )
-        log.log(
-            logging.CODE,  # type: ignore[attr-defined]
-            format_bytecode(
-                "MODIFIED BYTECODE",
-                code.co_name,
-                code.co_filename,
-                code.co_firstlineno,
-                out_code,
-            ),
-        )
+        if config.output_code:
+            log.info(
+                format_bytecode(
+                    "ORIGINAL BYTECODE",
+                    code.co_name,
+                    code.co_filename,
+                    code.co_firstlineno,
+                    code,
+                ),
+            )
+            log.info(
+                format_bytecode(
+                    "MODIFIED BYTECODE",
+                    code.co_name,
+                    code.co_filename,
+                    code.co_firstlineno,
+                    out_code,
+                ),
+            )
 
         assert output is not None
         assert output.guards is not None
@@ -436,10 +435,13 @@ def transform(instructions, code_options):
         check_fn = CheckFunctionManager(output, output.guards, locals, globals)
 
         guarded_code = GuardedCode(out_code, check_fn.check_fn)
-        guard_str = "GUARDS:\n"
-        guard_str += "\n".join([f" - {str(guard)}" for guard in sorted(output.guards)])
 
-        log.log(logging.CODE, guard_str)  # type: ignore[attr-defined]
+        if config.output_code:
+            guard_str = "GUARDS:\n"
+            guard_str += "\n".join(
+                [f" - {str(guard)}" for guard in sorted(output.guards)]
+            )
+            log.info(guard_str)
 
         if guard_export_fn is not None:
             guard_export_fn(output.guards)
diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py
index 95ee727f1ddf1..a9ead23e0edd8 100644
--- a/torch/_dynamo/logging.py
+++ b/torch/_dynamo/logging.py
@@ -2,11 +2,6 @@
 import logging
 import os
 
-# logging level for dynamo generated graphs/bytecode/guards
-logging.CODE = 15
-logging.addLevelName(logging.CODE, "CODE")
-
-
 # Return all loggers that torchdynamo/torchinductor is responsible for
 def get_loggers():
     return [
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 5fcae0fbafa79..7d6a4101405c6 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -515,7 +515,7 @@ def compile_and_call_fx_graph(self, tx, rv, root):
 
         try:
             # the call to tabulate can cause a lot of memory to be allocated
-            if config.log_level <= logging.CODE:
+            if config.log_level <= logging.INFO and config.output_code:
                 graph_str = (
                     gm.print_readable()
                     if config.output_graph_code
diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
index 02d96c3160b86..6c5130b829566 100644
--- a/torch/_inductor/codegen/triton.py
+++ b/torch/_inductor/codegen/triton.py
@@ -12,6 +12,7 @@
 
 import torch
 
+from ..._dynamo import config as dynamo_config
 from .. import config, ir, scheduler
 from ..ir import ReductionHint
 from ..utils import (
@@ -1278,7 +1279,8 @@ def end_current_reduction_loop():
                     f"unexpected group: ({numel}, {rnumel}) != {node.group[1]}"
                 )
 
-        log.log(logging.CODE, "schedule: %s", node_schedule)
+        if dynamo_config.output_code:
+            log.info("schedule: %s", node_schedule)
         return self.codegen_node_schedule(node_schedule, numel, rnumel)
 
     @staticmethod
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 3301a8455698b..270f3dc22af1b 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -13,6 +13,8 @@
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 from torch.utils._mode_utils import no_dispatch
 
+from .._dynamo import config as dynamo_config
+
 from . import config, ir
 from .codegen.wrapper import CppWrapperCodeGen, WrapperCodeGen
 from .exc import (
@@ -491,7 +493,8 @@ def compile_to_module(self):
         for name, value in self.constants.items():
             setattr(mod, name, value)
 
-        log.log(logging.CODE, "Output code: %s", mod.__file__)
+        if dynamo_config.output_code:
+            log.info("Output code: %s", mod.__file__)
         V.debug.output_code(mod.__file__)
         V.debug.rename(os.path.splitext(mod.__file__)[0] + ".debug")
         return mod

From 15e586a3f4d536bfc16d370ee5a5d4c09a5747cf Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Tue, 6 Dec 2022 01:47:43 +0000
Subject: [PATCH 1642/1922] Add `TORCH_FAKE_TENSOR_DEBUG` use it to enable
 storage of traces on fake tensors at init time (#90215)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90215
Approved by: https://github.com/ezyang
---
 test/functorch/test_aotdispatch.py | 33 ++++++++++++++++++++++--------
 torch/_dynamo/utils.py             | 14 +++++++++++--
 torch/_subclasses/fake_tensor.py   |  9 ++++++++
 3 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 2434e35ab4871..80edc80d7b14a 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1694,7 +1694,7 @@ def assert_compiler(gm: torch.fx.GraphModule, _):
         res = compiled_f(*inputs)
         res[0].sum().backward()
 
-    def test_aot_module_simplified_fake_tensor_gm_raises(self):
+    def _test_aot_module_simplified_fake_tensor_gm_raises(self, debug):
         class MockModule(torch.nn.Module):
             def __init__(self, y):
                 super().__init__()
@@ -1723,16 +1723,32 @@ def forward(self, x):
         graph = tracer.trace(MockModule(fake_y))
         mod_fake = torch.fx.GraphModule(tracer.root, graph)
 
-        self.assertExpectedRaisesInline(
-            AssertionError, lambda: aot_module_simplified(mod_fake, (real_x,), nop),
-            """Unexpected fake buffer y"""
-        )
+        if debug:
+            inner_message = "FAKE TENSOR CREATION TRACEBACK:"
+        else:
+            inner_message = "Enable TORCH_FAKE_TENSOR_DEBUG=1 to get creation stack traces on fake tensors."
+
+        message = f"""Unexpected fake buffer y {inner_message}"""
+
+        with self.assertRaisesRegex(
+            AssertionError, message
+        ):
+            aot_module_simplified(mod_fake, (real_x,), nop)
+
         # Counterfactual to ensure that the raise is only due to real vs fake
         # Run the same exact thing except with a real buffer.
         graph = tracer.trace(MockModule(real_y))
         mod_real = torch.fx.GraphModule(tracer.root, graph)
         aot_module_simplified(MockModule(real_y), (real_x,), nop)
 
+    @patch("torch._subclasses.fake_tensor.FakeTensorConfig.debug", True)
+    def test_aot_module_simplified_fake_tensor_gm_raises_debug_enabled(self):
+        self._test_aot_module_simplified_fake_tensor_gm_raises(debug=True)
+
+    @patch("torch._subclasses.fake_tensor.FakeTensorConfig.debug", False)
+    def test_aot_module_simplified_fake_tensor_gm_raises_no_debug_enabled(self):
+        self._test_aot_module_simplified_fake_tensor_gm_raises(debug=False)
+
     def test_aot_module_deepcopy_fake_tensor_gm_raises(self):
         class MockModule(torch.nn.Module):
             def __init__(self, y):
@@ -1752,10 +1768,11 @@ def forward(self, x):
         fake_mode = torch._subclasses.fake_tensor.FakeTensorMode()
         mod_fake = torch._dynamo.utils.deepcopy_to_fake_tensor(MockModule(real_y), fake_mode)
 
-        self.assertExpectedRaisesInline(
-            AssertionError, lambda: aot_module_simplified(mod_fake, (real_x,), nop),
+        with self.assertRaisesRegex(
+            AssertionError,
             """Unexpected fake param linear.weight"""
-        )
+        ):
+            aot_module_simplified(mod_fake, (real_x,), nop)
 
 
 # entries in here don't work and need to be fixed.
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 4f5ebc072fb5c..660d1ab9556c9 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -1158,14 +1158,24 @@ def get_real_value(node, output_graph):
 
 
 def assert_no_fake_params_or_buffers(gm):
+    from torch._subclasses.fake_tensor import FakeTensorConfig
+
+    def stack_or_hint(t):
+        if FakeTensorConfig.debug:
+            import traceback
+
+            return f"FAKE TENSOR CREATION TRACEBACK: \n {traceback.format_list(t._debug_trace)}"
+        else:
+            return "Enable TORCH_FAKE_TENSOR_DEBUG=1 to get creation stack traces on fake tensors."
+
     for name, buffer in gm.named_buffers():
         assert not isinstance(
             buffer, torch._subclasses.FakeTensor
-        ), f"Unexpected fake buffer {name}"
+        ), f"Unexpected fake buffer {name} {stack_or_hint(buffer)}"
     for name, param in gm.named_parameters():
         assert not isinstance(
             param, torch._subclasses.FakeTensor
-        ), f"Unexpected fake param {name}"
+        ), f"Unexpected fake param {name} {stack_or_hint(param)}"
 
 
 def fake_mode_from_tensors(inputs: List[Any]):
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index 87b68bfc251a0..f5d04dddd1274 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -1,6 +1,7 @@
 import contextlib
 import functools
 import itertools
+import os
 import weakref
 from dataclasses import dataclass
 from functools import partial
@@ -479,6 +480,10 @@ def in_kernel_invocation_manager(fake_mode):
         del guard
 
 
+class FakeTensorConfig:
+    debug = os.environ.get("TORCH_FAKE_TENSOR_DEBUG", False)
+
+
 class FakeTensor(torch.Tensor):
     """
     Meta tensors give you the ability to run PyTorch code without having to
@@ -538,6 +543,10 @@ def __init__(
         self.fake_device = device
         self.fake_mode = fake_mode
         self.constant = constant
+        if FakeTensorConfig.debug:
+            import traceback
+
+            self._debug_trace = traceback.extract_stack()
 
     @staticmethod
     def from_tensor(t, fake_mode):

From 6fb74ac3644772f83af4f1780bba8c6233bdfeae Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Tue, 6 Dec 2022 22:32:46 +0000
Subject: [PATCH 1643/1922] [CI] Relax CMake requirements (#90307)

To `3.22.*` as cmake-3.22.1 is available on conda, but not on
conda-forge see
https://anaconda.org/conda-forge/cmake/files?version=3.22.2 but https://anaconda.org/anaconda/cmake/files?version=3.22.1

Also, for whatever reason we already specify cmake dependency in
https://github.com/pytorch/test-infra/blob/acaef1ae39c6a532862585639282c5c02180c744/.github/actions/setup-miniconda/action.yml#L172
so may be it could be removed from this file already

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90307
Approved by: https://github.com/kit1980
---
 .github/requirements/conda-env-Linux-X64   | 2 +-
 .github/requirements/conda-env-macOS-ARM64 | 2 +-
 .github/requirements/conda-env-macOS-X64   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/requirements/conda-env-Linux-X64 b/.github/requirements/conda-env-Linux-X64
index f2b3811263e59..c4e8aa7ae548d 100644
--- a/.github/requirements/conda-env-Linux-X64
+++ b/.github/requirements/conda-env-Linux-X64
@@ -1,5 +1,5 @@
 cffi=1.15.1
-cmake=3.22.1
+cmake=3.22.*
 mkl=2022.1.0
 mkl-include=2022.1.0
 ninja=1.10.2
diff --git a/.github/requirements/conda-env-macOS-ARM64 b/.github/requirements/conda-env-macOS-ARM64
index a031b014365fc..77f37cf463ea8 100644
--- a/.github/requirements/conda-env-macOS-ARM64
+++ b/.github/requirements/conda-env-macOS-ARM64
@@ -1,7 +1,7 @@
 numpy=1.22.3
 pyyaml=6.0
 setuptools=61.2.0
-cmake=3.22.1
+cmake=3.22.*
 cffi=1.15.1
 typing_extensions=4.3.0
 dataclasses=0.8
diff --git a/.github/requirements/conda-env-macOS-X64 b/.github/requirements/conda-env-macOS-X64
index 81463d4b39d56..897850f0e36b9 100644
--- a/.github/requirements/conda-env-macOS-X64
+++ b/.github/requirements/conda-env-macOS-X64
@@ -3,7 +3,7 @@ mkl-include=2021.2.0
 numpy=1.18.5
 pyyaml=5.3
 setuptools=46.0.0
-cmake=3.22.1
+cmake=3.22.*
 cffi=1.15.1
 typing_extensions=4.3.0
 dataclasses=0.8

From 0ccfb17b6f30fc0a267d2290f16e99ad7ce8ea27 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Tue, 6 Dec 2022 22:33:58 +0000
Subject: [PATCH 1644/1922] Replace assertEqualIgnoreType in test_nn.py
 (#90242)

See https://github.com/pytorch/pytorch/issues/38095.

Also removed some redundant separate `dtype` checks when `dtype` is already checked by the next line's `assertEqual`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90242
Approved by: https://github.com/malfet
---
 test/test_nn.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 60fb0e6c0cff3..9f5cbf85d5658 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -6375,12 +6375,10 @@ def helper(size, scale_factor, mode, device, memory_format=torch.contiguous_form
 
             outf = m(inputf)
             out = m(input)
-            self.assertEqual(out.dtype, dtype)
-            self.assertEqualIgnoreType(out, outf, atol=0.1, rtol=0.0)
+            self.assertEqual(out, outf.to(dtype), atol=0.1, rtol=0.0)
 
             out.sum().backward()
             outf.sum().backward()
-            self.assertEqual(input.grad.dtype, dtype)
             self.assertEqual(input.grad, inputf.grad.to(dtype), atol=0.1, rtol=0)
 
         for device in ['cpu']:
@@ -6400,7 +6398,7 @@ def test_interpolate_illegal_memory_access(self):
 
         input = torch.ones((1, 1, in_s), device='cuda', requires_grad=True)
         # note we allocated grad_output to be larger so out of bound access
-        # woudl be visible in grad_input
+        # would be visible in grad_input
         grad = torch.ones((1, 1, out_s * 2), device='cuda', requires_grad=True)
         grad = grad[:, :, :out_s]
 
@@ -6712,12 +6710,10 @@ def test_log_softmax_cpu(self, dtype=torch.bfloat16):
             input = inputf.to(dtype).detach().requires_grad_(True)
             outf = F.log_softmax(inputf, dim=dim)
             out = F.log_softmax(input, dim=dim)
-            self.assertEqual(out.dtype, dtype)
             self.assertEqual(out, outf.to(dtype=dtype), atol=0.1, rtol=0)
 
             out.sum().backward()
             outf.sum().backward()
-            self.assertEqual(input.grad.dtype, dtype)
             self.assertEqual(input.grad, inputf.grad.to(dtype), atol=0.1, rtol=0)
 
     def test_softmax_cpu(self, dtype=torch.bfloat16):
@@ -6726,12 +6722,10 @@ def test_softmax_cpu(self, dtype=torch.bfloat16):
             input = inputf.to(dtype).detach().requires_grad_(True)
             outf = F.softmax(inputf, dim=dim)
             out = F.softmax(input, dim=dim)
-            self.assertEqual(out.dtype, dtype)
-            self.assertEqualIgnoreType(out, outf, atol=1e-3, rtol=0)
+            self.assertEqual(out, outf.to(dtype), atol=1e-3, rtol=0)
 
             out.sum().backward()
             outf.sum().backward()
-            self.assertEqual(input.grad.dtype, dtype)
             self.assertEqual(input.grad, inputf.grad.to(dtype), atol=1e-3, rtol=0)
 
     def test_adaptive_log_softmax(self):
@@ -6842,12 +6836,10 @@ def test_cross_entropy_loss(self, dtype=torch.bfloat16):
 
         outf = loss_cpu(inputf, target)
         out = loss_cpu(input, target)
-        self.assertEqual(out.dtype, dtype)
         self.assertEqual(out, outf.to(dtype=dtype), atol=1e-1, rtol=0)
 
         outf.backward()
         out.backward()
-        self.assertEqual(input.grad.dtype, dtype)
         self.assertEqual(input.grad, inputf.grad.to(dtype=dtype), atol=1e-1, rtol=0)
 
     def test_cross_entropy_loss_precision(self):

From 9815a72c8a28539698dff82a1495d8f0bc1251ba Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Tue, 6 Dec 2022 11:07:09 -0500
Subject: [PATCH 1645/1922] Ensure that we fakeify tensor subclasses when they
 are initially tracked (#90009)

The old code didn't actually fakeify traceable tensor subclasses at the
time they are added as a GraphArg to the module; now we do, by ignoring
the subclass during fakeification and relying on Dynamo to simulate
the subclass on top.  See comments for more details.

BTW, this codepath is super broken, see filed issues linked on the
inside.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90009
Approved by: https://github.com/wconstab, https://github.com/voznesenskym
---
 test/dynamo/test_modules.py        |   4 -
 torch/_dynamo/config.py            |  18 +++-
 torch/_dynamo/utils.py             |  14 ++-
 torch/_dynamo/variables/builder.py | 133 ++++++++++++++++++++---------
 torch/_dynamo/variables/tensor.py  |   5 ++
 torch/_subclasses/fake_tensor.py   |  39 +++++++--
 torch/_subclasses/meta_utils.py    |  16 +++-
 7 files changed, 169 insertions(+), 60 deletions(-)

diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index da3f1d3d59881..f510fb87522c5 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -1,6 +1,5 @@
 # Owner(s): ["module: dynamo"]
 
-import unittest
 from copy import deepcopy
 from unittest.mock import patch
 
@@ -763,9 +762,6 @@ def test_generation_tag(self):
         m3 = deepcopy(m1)
         self.assertEqual(GenerationTracker.get_generation_value(m3), cur_generation)
 
-    # torch._subclasses.fake_tensor.UnsupportedFakeTensorException: meta converter nyi
-    # due to custom subclass (TensorProxy)
-    @unittest.expectedFailure
     def test_simple_torch_function(self):
         def foo(x):
             # function call, twice to test wrapping
diff --git a/torch/_dynamo/config.py b/torch/_dynamo/config.py
index 838a65c152a3c..56e5f24b2642e 100644
--- a/torch/_dynamo/config.py
+++ b/torch/_dynamo/config.py
@@ -72,8 +72,22 @@
 # run FX normalization passes in optimizer
 normalize_ir = False
 
-# If a tensor subclass type is in this set, torchdynamo will inline the
-# __torch_function__ logic of the subclass.
+# This feature doesn't really work.  We offer this flag for experimental
+# purposes / if you want to help us build out support.
+#
+# torchdynamo has very limited support for tensor subclasses that implement
+# __torch_function__.  Our current support is limited to tensor subclasses
+# that DO NOT store metadata on the tensor (in general, dynamo does not
+# support Python code that stores extra attributes on tensors at present).
+# If your tensor subclass purely changes function call behavior via
+# __torch_function__, you can allow torchdynamo to trace into it by
+# adding it to traceable_tensor_subclasses.  We don't do any safety checks,
+# so it is up to you to ensure that your subclass is well behaved.  See also
+# https://github.com/pytorch/torchdynamo/issues/1948
+#
+# We do NOT currently support __torch_dispatch__.  The implementation is
+# currently buggy, the main show stopper for nontrivial use is
+# https://github.com/pytorch/torchdynamo/issues/1952
 traceable_tensor_subclasses = set()
 
 # Suppress errors in torch._dynamo.optimize, instead forcing a fallback to eager.
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index 660d1ab9556c9..fbdf411190543 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -699,8 +699,10 @@ def rename_implicit(v):
 )
 
 
-def make_fake_tensor(e, fake_mode, static_shapes=False, tx=None):
-    fake_tensor = fake_mode.from_tensor(e, static_shapes=static_shapes)
+def make_fake_tensor(e, fake_mode, static_shapes=False, tx=None, ignore_subclass=False):
+    fake_tensor = fake_mode.from_tensor(
+        e, static_shapes=static_shapes, ignore_subclass=ignore_subclass
+    )
     if tx is not None:
         from torch._dynamo.guards import TensorReference
 
@@ -757,17 +759,21 @@ def wrap_to_fake_tensor(e, fake_mode):
         return e
 
 
-def wrap_to_fake_tensor_and_record(e, tx):
+def wrap_to_fake_tensor_and_record(e, tx, ignore_subclass=False):
     # The not fake tensor check here is annoying - ideally, fake tensors never call this during wrapping.
     # However, get_fake_value takes args and passes them through this, which may include fake tensors.
     # see tree_map(fake_wrapper, args) in get_fake_value.
+    # TODO: Check if we should remove FakeTensor isinstance check when
+    # ignore_subclass
     if isinstance(e, torch.Tensor) and not isinstance(e, torch._subclasses.FakeTensor):
         static_shapes = config.dynamic_shapes is False
         if type(e) is torch.nn.Parameter:
             # Always static for params
             static_shapes = True
         return wrap_fake_exception(
-            lambda: make_fake_tensor(e, tx.fake_mode, static_shapes, tx)
+            lambda: make_fake_tensor(
+                e, tx.fake_mode, static_shapes, tx, ignore_subclass=ignore_subclass
+            )
         )
     else:
         return e
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index 843e50687a61e..f7c134472a11d 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -564,45 +564,74 @@ def wrap_tensor(self, value: torch.Tensor):
                 # Guards are done inside register_attr_or_module
                 # guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
             )
+
+        if is_constant_source(self.get_source()):
+            return self.tx.output.register_attr_or_module(
+                value,
+                re.sub(r"[^a-zA-Z0-9]+", "_", self.name),
+                source=None,
+                # Guards are added inside register_attr_or_module
+            )
+
+        if type(value) in config.traceable_tensor_subclasses:
+            # Ordinarily, we would fakeify a tensor so that it can get dynamic
+            # shapes and be computed on without triggering actual operations.
+            # However, how can we fakeify a tensor subclass?  Ordinary
+            # inheritance (nor multiple inheritance) won't work work.
+            #
+            # Instead, our plan is to *manually simulate* the tensor subclass
+            # inheriting from a fake tensor with dynamo.  This means our
+            # data representation for a tensor subclass will be a fake tensor
+            # + tensor subclass type + any extra data the subclass may have
+            # been storing on the tensor.  Because all Python accesses are
+            # mediated through TensorWithTFOverrideVariable, we can ensure
+            # that we dispatch differently, e.g., according to
+            # __torch_function__
+            #
+            # To simplify things for now, the __dict__ tracking bits haven't
+            # been implemented yet, but they can be added into this design at
+            # a later point in time.
+            ignore_subclass = True
         else:
-            # Disable __torch_function__ to prevent cloning of `value` to hit
-            # us
-            with torch._C.DisableTorchFunction():
-                if is_constant_source(self.get_source()):
-                    return self.tx.output.register_attr_or_module(
-                        value,
-                        re.sub(r"[^a-zA-Z0-9]+", "_", self.name),
-                        source=None,
-                        # Guards are added inside register_attr_or_module
-                    )
-                tensor_variable = wrap_fx_proxy(
-                    tx=self.tx,
-                    proxy=self.tx.output.create_graph_input(
-                        re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(value)
-                    ),
-                    example_value=value,
-                    guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
-                    should_specialize=self.tensor_should_specialize(),
-                )
+            assert type(value) in (torch.Tensor, torch.nn.Parameter)
+            ignore_subclass = False
 
-            fake_tensor_value = None
-            example_value = tensor_variable.proxy.node.meta["example_value"]
-            if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
-                fake_tensor_value = example_value
-
-            graph_arg = GraphArg(self.get_source(), value, False, fake_tensor_value)
-            self.tx.output.graphargs.append(graph_arg)
-
-            if torch.overrides.has_torch_function_unary(value):
-                subclass_torch_function__func = value.__torch_function__.__func__
-                subclass_type = type(value)
-                return TensorWithTFOverrideVariable(
-                    tensor_variable,
-                    self.get_source(),
-                    subclass_torch_function__func,
-                    subclass_type,
-                )
-            return tensor_variable
+        tensor_variable = wrap_fx_proxy(
+            tx=self.tx,
+            proxy=self.tx.output.create_graph_input(
+                re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(value)
+            ),
+            example_value=value,
+            guards=self.make_guards(GuardBuilder.TENSOR_MATCH),
+            should_specialize=self.tensor_should_specialize(),
+            ignore_subclass=ignore_subclass,
+        )
+
+        # TODO: I think the result is guaranteed to be fake with
+        # ignore_subclass changes
+        fake_tensor_value = None
+        example_value = tensor_variable.proxy.node.meta["example_value"]
+        if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
+            fake_tensor_value = example_value
+
+        self.tx.output.graphargs.append(
+            GraphArg(self.get_source(), value, False, fake_tensor_value)
+        )
+
+        if type(value) in config.traceable_tensor_subclasses:
+            subclass_torch_function__func = value.__torch_function__.__func__
+            subclass_type = type(value)
+            # NB: This is slightly misnamed, a tensor subclass might not have
+            # any explicit __torch_function__ implementation and is relying
+            # on the default inherited from torch.Tensor
+            return TensorWithTFOverrideVariable(
+                tensor_variable,
+                self.get_source(),
+                subclass_torch_function__func,
+                subclass_type,
+            )
+
+        return tensor_variable
 
     def wrap_unspecialized_primitive(self, value):
         if self.name in self.tx.output.unspec_variable_map:
@@ -688,12 +717,15 @@ def wrap_fx_proxy(tx, proxy, example_value=None, **options):
 
 # Note: Unfortunate split due to some gross classes existing that subclass TensorVariable
 # Should be compositional instead
-def wrap_fx_proxy_cls(target_cls, tx, proxy, example_value=None, **options):
+def wrap_fx_proxy_cls(
+    target_cls, tx, proxy, example_value=None, ignore_subclass=False, **options
+):
     if "guards" in options and options["guards"] is not None:
         tx.output.guards.update(options["guards"])
 
     assert "example_value" not in proxy.node.meta
     if not config.dynamic_propagation:
+        # TODO: This probably doesn't handle subclass correctly
         if isinstance(example_value, torch.Tensor):
             options.update(target_cls.specialize(example_value))
         return target_cls(proxy, **options)
@@ -716,10 +748,25 @@ def _clone_input(value):
             # Note: Unfortunately, this can happen during tracing, and is valid enough for now to allow.
             # TODO(voz): Find all the callsites and burn this down.
             # Flipping it to an assert fails dozens of tests.
+            # TODO(ezyang): should attempt this burndown again
             if not isinstance(example_value, torch._subclasses.FakeTensor):
-                proxy.tracer.real_value_cache[proxy.node] = _clone_input(example_value)
-                fake_wrapper = functools.partial(wrap_to_fake_tensor_and_record, tx=tx)
-                example_value = fake_wrapper(example_value)
+                # We shouldn't be doing this at all, see
+                # https://github.com/pytorch/torchdynamo/issues/1950
+                # But assuming we're doing it, the legacy behavior for
+                # subclasses was to perform a clone WITHOUT preserving
+                # the subclass.  It's not clear to me that's what you actually
+                # want, but whatever, I wouldn't have this cache at all.
+                with torch._C.DisableTorchFunction():
+                    proxy.tracer.real_value_cache[proxy.node] = _clone_input(
+                        example_value
+                    )
+                # NB: If we're ignoring subclass, then the expectation is you will
+                # take the returned TensorVariable and wrap it into a more
+                # accurate TensorVariable that is able to track subclass-ness;
+                # otherwise this is wrong!
+                example_value = wrap_to_fake_tensor_and_record(
+                    example_value, tx=tx, ignore_subclass=ignore_subclass
+                )
 
     if isinstance(example_value, torch.Tensor):
         is_parameter = isinstance(example_value, torch.nn.Parameter)
@@ -729,10 +776,14 @@ def _clone_input(value):
         else:
             specialized_value = None
 
+        # NB: In most (all?) cases, this does not actually do a clone.
+        # (WARNING: this means that if we mutate metadata on the fake
+        # tensor, the stored example value will update too!)
         example_value = _clone_input(example_value)
         proxy.node.meta["example_value"] = example_value
         specialized_props = target_cls.specialize(example_value)
         if isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor):
+            # NB: This will be wrong for ignore_subclass; fix it up later!
             specialized_props["class_type"] = (
                 torch.nn.Parameter if is_parameter else torch.Tensor
             )
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index d9973ef3b87fa..c161be41007ac 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -445,6 +445,11 @@ def call_method(
 
         options = VariableTracker.propagate(self, args, kwargs.values())
         # insert unwrapped version of self as the first argument
+        # TODO: This is wrong!  When you call the internal __torch_function__,
+        # you still get the wrapped version of self, and if you call functions
+        # inside __torch_function__, they should come back here.  If we unwrap
+        # the tensor immediately, that will not happen.
+        # See https://github.com/pytorch/torchdynamo/issues/1951
         args = list(args)
         args.insert(0, self.tensor_variable)
         func_var = GetAttrVariable(self.tensor_variable, name)
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
index f5d04dddd1274..a137bdba1aa17 100644
--- a/torch/_subclasses/fake_tensor.py
+++ b/torch/_subclasses/fake_tensor.py
@@ -204,7 +204,9 @@ def del_ten():
         weakref.finalize(t, del_ten)
         self.tensor_memo[th] = v
 
-    def from_real_tensor(self, fake_mode, t, make_constant=False, shape_env=None):
+    def from_real_tensor(
+        self, fake_mode, t, make_constant=False, shape_env=None, ignore_subclass=False
+    ):
         maybe_memo = self._get_memo(t)
         if maybe_memo is not None:
             return maybe_memo
@@ -231,7 +233,12 @@ def mk_fake_tensor(make_meta_t):
                     constant=t if make_constant else None,
                 )
 
-        out = self.meta_converter(t, shape_env=shape_env, callback=mk_fake_tensor)
+        out = self.meta_converter(
+            t,
+            shape_env=shape_env,
+            callback=mk_fake_tensor,
+            ignore_subclass=ignore_subclass,
+        )
         if out is NotImplemented:
             raise UnsupportedFakeTensorException("meta converter nyi")
         if make_constant:
@@ -258,8 +265,22 @@ def from_meta_and_device(self, fake_mode, t, device):
     # tensor; although an odd thing to do, this can occur if you're doing
     # cross ref testing and the inner test is already operating on meta tensors.
     # You must have created the FakeTensorMode with allow_meta == True
-    def __call__(self, fake_mode, t, *, make_constant=False, shape_env=None):
-        return self.from_real_tensor(fake_mode, t, make_constant, shape_env=shape_env)
+    def __call__(
+        self,
+        fake_mode,
+        t,
+        *,
+        make_constant=False,
+        shape_env=None,
+        ignore_subclass=False,
+    ):
+        return self.from_real_tensor(
+            fake_mode,
+            t,
+            make_constant,
+            shape_env=shape_env,
+            ignore_subclass=ignore_subclass,
+        )
 
 
 op_implementations = []
@@ -980,10 +1001,14 @@ def invalidate_written_to_constants(
                 ):
                     self.fake_tensor_converter.invalidate_constant_aliases(v.constant)
 
-    def from_tensor(self, tensor, static_shapes=False):
+    def from_tensor(self, tensor, static_shapes=False, ignore_subclass=False):
         if static_shapes:
-            return self.fake_tensor_converter(self, tensor)
-        return self.fake_tensor_converter(self, tensor, shape_env=self.shape_env)
+            return self.fake_tensor_converter(
+                self, tensor, ignore_subclass=ignore_subclass
+            )
+        return self.fake_tensor_converter(
+            self, tensor, shape_env=self.shape_env, ignore_subclass=ignore_subclass
+        )
 
 
 # NB: returns fake tensors
diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py
index c410610598916..577eb813dde9c 100644
--- a/torch/_subclasses/meta_utils.py
+++ b/torch/_subclasses/meta_utils.py
@@ -451,7 +451,9 @@ def is_c_of_r(complex_dtype, real_dtype):
 
         return self.get_tensor_memo(t)
 
-    def __call__(self, t, shape_env=None, *, callback=lambda t: t()):
+    def __call__(
+        self, t, shape_env=None, *, callback=lambda t: t(), ignore_subclass=False
+    ):
         # TODO: zero tensors?  We appear to have eliminated them by
         # excluding complex for now
         from torch._subclasses.fake_tensor import FakeTensor
@@ -459,6 +461,7 @@ def __call__(self, t, shape_env=None, *, callback=lambda t: t()):
         if (
             type(t) is torch.Tensor
             or type(t) is torch.nn.Parameter
+            or (ignore_subclass and isinstance(t, torch.Tensor))
             or isinstance(t, FakeTensor)
         ):
             if any(
@@ -488,7 +491,16 @@ def __call__(self, t, shape_env=None, *, callback=lambda t: t()):
                 return NotImplemented
             else:
                 self.hit += 1
-                r = self.meta_tensor(t, shape_env=shape_env, callback=callback)
+                # When ignoring subclasses, we treat the input tensor "as if" it
+                # were a normal tensor and create a non-subclassed fake tensor
+                # that, modulo type and attributes, resembles the original tensor.
+                # This can be helpful if you're planning to simulate the subclassness
+                # by hand, e.g., as is done in Dynamo
+                ctx = contextlib.nullcontext()
+                if ignore_subclass:
+                    ctx = torch._C.DisableTorchFunction()
+                with ctx:
+                    r = self.meta_tensor(t, shape_env=shape_env, callback=callback)
                 # TODO: this is suspicious, now that we have callback argument
                 if type(t) is torch.nn.Parameter:
                     r = torch.nn.Parameter(r, requires_grad=r.requires_grad)

From 38a5db293ae6f17d25bb978725b5615aaf532910 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Tue, 6 Dec 2022 19:21:21 +0000
Subject: [PATCH 1646/1922] as_strided: Fix default storage_offset for
 reference implementation (#89513)

This fixes the default storage_offset to take it from the input. This was
previously untested, so I've also added a new OpInfo which includes samples with
non-zero storage_offsets on the input tensor.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89513
Approved by: https://github.com/ezyang, https://github.com/ngimel
---
 test/functorch/test_aotdispatch.py            |  1 +
 test/functorch/test_ops.py                    | 14 +++-
 test/functorch/test_vmap.py                   |  2 +
 torch/_refs/__init__.py                       | 10 ++-
 .../_internal/common_methods_invocations.py   | 72 ++++++++++++++++---
 5 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 80edc80d7b14a..598a8f70dbcf4 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1793,6 +1793,7 @@ def forward(self, x):
     xfail('scatter_reduce', 'prod'),
 
     skip('as_strided_scatter'),
+    skip('as_strided', 'partial_views'),  # flaky
 
     # Too annoying to generate random inputs
     xfail('cholesky'),
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index b643a44cab6b9..7a2f5b8dc61a1 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -414,6 +414,7 @@ def wrapped_fn(*args, **kwargs):
         # BUG
         # AssertionError: Tensor-likes are not close!
         xfail('as_strided'),
+        xfail('as_strided', 'partial_views'),
         decorate('linalg.det', 'singular',
                  decorator=unittest.skipIf(IS_MACOS and IS_X86, "Fails on x86 MacOS CI")),
     }))
@@ -526,6 +527,7 @@ def maybe_clone_inputs():
         xfail('as_strided'),
         xfail('as_strided_scatter'),
         xfail('_softmax_backward_data', device_type='cpu'),
+        xfail('as_strided', 'partial_views'),
     }))
     @opsToleranceOverride('TestOperators', 'test_vjp', (
         tol1('nn.functional.conv_transpose3d',
@@ -655,6 +657,7 @@ def fn(inp, *args, **kwargs):
         skip("atleast_3d"),  # Takes too long
         skip("ormqr"),  # Takes too long
         xfail("as_strided"),  # incorrect output
+        xfail("as_strided", "partial_views"),  # incorrect output
         xfail("as_strided_scatter"),  # incorrect output
         skip("bernoulli"),  # calls random op
         xfail("bfloat16"),  # rank 4 tensor for channels_last
@@ -735,6 +738,9 @@ def fn(inp, *args, **kwargs):
         tol1('svd',
              {torch.float32: tol(atol=1e-03, rtol=5e-04)}),
     ))
+    @skipOps('TestOperators', 'test_vmapvjpvjp', {
+        xfail('as_strided', 'partial_views'),
+    })
     def test_vmapvjpvjp(self, device, dtype, op):
         # Since, we test `vjpvjp` independently,
         # for this test, we just verify that vmap
@@ -802,6 +808,7 @@ def vjp_of_vjp(*args_and_cotangents):
         xfail('svd_lowrank', ''),  # randomness
         xfail('to_sparse', ''),  # non-dense output
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
+        xfail('as_strided', 'partial_views'),
         # ----------------------------------------------------------------------
 
         # ---------------------------- BUGS ------------------------------------
@@ -851,7 +858,9 @@ def vjp_of_vjp(*args_and_cotangents):
         tol1('linalg.householder_product',
              {torch.float32: tol(atol=1e-04, rtol=1e-04)}),
     ))
-    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail)
+    @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail.union({
+        xfail('as_strided', 'partial_views'),
+    }))
     def test_vmapvjp(self, device, dtype, op):
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
@@ -899,6 +908,7 @@ def test_vmapvjp(self, device, dtype, op):
         decorate('linalg.det', 'singular', decorator=unittest.skipIf(IS_MACOS, "Fails on x86 MacOS CI")),
         skip('nn.functional.max_pool1d'),  # fails on cpu, runs on cuda
         xfail('masked.mean'),  # silent incorrectness (nan difference)
+        xfail('as_strided', 'partial_views'),  # Tensor-likes are not close!
 
         xfail('nn.functional.soft_margin_loss', ''),  # soft_margin_loss_backward does not support forward-ad
         xfail('tensor_split'),  # data_ptr composite compliance
@@ -1200,6 +1210,7 @@ def test():
         xfail('sparse.sampled_addmm', ''),
         xfail("native_batch_norm"),
         xfail("_native_batch_norm_legit"),
+        xfail('as_strided', 'partial_views'),
     }))
     def test_vjpvmap(self, device, dtype, op):
         # NB: there is no vjpvmap_has_batch_rule test because that is almost
@@ -1382,6 +1393,7 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
 
         # Potential bugs/errors
         xfail('as_strided'),  # AssertionError: Tensor-likes are not close!
+        xfail('as_strided', 'partial_views'),  # AssertionError: Tensor-likes are not close!
         xfail('as_strided_scatter'),  # AssertionError: Tensor-likes are not close!
         xfail('bernoulli'),  # calls random op
         xfail('bfloat16'),  # required rank 4 tensor to use channels_last format
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index 64776804e3d31..b85da534be8fa 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3301,6 +3301,7 @@ def test():
         xfail('triu'),  # Exception not raised on error input
         # The error inputs are vectors, that pass when batched as they are treated as a matrix
         xfail('trace'),
+        xfail('as_strided', 'partial_views'),
     }))
     def test_vmap_exhaustive(self, device, dtype, op):
         # needs to be fixed
@@ -3316,6 +3317,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
     ))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps('TestVmapOperatorsOpInfo', 'test_op_has_batch_rule', vmap_fail.union({
+        xfail('as_strided', 'partial_views'),
         skip('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         xfail('complex'),
         xfail('copysign'),
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index b0616191d1a80..354ef9c2d94ba 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2509,9 +2509,15 @@ def atleast_3d(
 
 
 def as_strided(
-    a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int = 0
+    a: TensorLikeType,
+    size: ShapeType,
+    stride: StrideType,
+    storage_offset: Optional[int] = None,
 ) -> TensorLikeType:
-    return prims.as_strided(a, size, stride, storage_offset)
+    storage_offset_int = (
+        storage_offset if storage_offset is not None else a.storage_offset()
+    )
+    return prims.as_strided(a, size, stride, storage_offset_int)
 
 
 def broadcast_shapes(*shapes) -> ShapeType:
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 596b5e9160286..3379c2bd877ff 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -263,9 +263,15 @@ def sample_inputs_as_strided(op_info, device, dtype, requires_grad, **kwargs):
         kwargs = dict(storage_offset=storage_offset)
         yield SampleInput(input_t, args=(output_shape, stride), kwargs=kwargs)
 
+def sample_inputs_as_strided_partial_views(op_info, device, dtype, requires_grad, **kwargs):
+    def make_arg():
+        base = make_tensor((20,), device=device, dtype=dtype)
+        return base[5:15].requires_grad_(requires_grad)
+
     # as_strided on offset, partial views
-    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)))
-    # yield SampleInput(make_arg((20,))[5:15], args=((2, 2), (1, 2)), kwargs={'storage_offset': 0})
+    yield SampleInput(make_arg(), (2, 2), (1, 2))
+    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=0)
+    yield SampleInput(make_arg(), (2, 2), (1, 2), storage_offset=10)
 
 def sample_inputs_as_strided_scatter(op_info, device, dtype, requires_grad, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -10721,8 +10727,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type='cpu'),
            )),
     OpInfo('as_strided',
-           op=lambda x, size, stride, storage_offset=0:
-               torch.as_strided(x, size, stride, storage_offset=storage_offset),
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
            supports_out=False,
            supports_forward_ad=True,
@@ -10743,7 +10747,47 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
                DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
                DecorateInfo(unittest.skip("Numerous errors"), 'TestFwdGradients'),
-               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'))),
+               DecorateInfo(unittest.skip("Numerous errors"), 'TestBwdGradients'),
+           )),
+    OpInfo('as_strided',
+           variant_test_name='partial_views',
+           dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
+           supports_out=False,
+           supports_forward_ad=True,
+           supports_fwgrad_bwgrad=True,
+           # vmap does not support inplace views
+           check_inplace_batched_forward_grad=False,
+           sample_inputs_func=sample_inputs_as_strided_partial_views,
+           skips=(
+               # Note: This xfail is fine -- it's inherent to how as_strided works
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples'),
+               # RuntimeError: This operator is not Composite Compliant: the
+               # storage_offset of the tensor was modified directly without
+               # going through the PyTorch dispatcher.
+               DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance'),
+
+
+               # These fail because the test changes the input's in-memory layout
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_complex_half_reference_testing'),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'),
+               DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
+               DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_fn_fwgrad_bwgrad',
+                            dtypes=(torch.complex64, torch.complex128)),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'),
+               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_inplace_forward_mode_AD'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_grad'),
+               DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', 'test_inplace_gradgrad'),
+               DecorateInfo(unittest.expectedFailure, 'TestProxyTensorOpInfo',
+                            'test_make_fx_symbolic_exhaustive_inplace'),
+               DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'),
+               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_correctness'),
+               DecorateInfo(unittest.expectedFailure, 'TestCudaFuserOpInfo', 'test_nvfuser_extremal_values'),
+               # Fail but are also flaky
+               DecorateInfo(unittest.skip("Test changes in memory layout"), 'TestMathBits'),
+               DecorateInfo(unittest.skip("Modifies input strides and storage_offset"), 'TestCommon',
+                            'test_non_standard_bool_values'),
+           )),
     OpInfo('as_strided_scatter',
            op=lambda x, src, size, stride, storage_offset=0:
                torch.as_strided_scatter(x, src, size, stride, storage_offset=storage_offset),
@@ -18282,15 +18326,27 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
         dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
         supports_nvfuser=False,
         skips=(
-            # TODO: fix and/or update to xfails
-            DecorateInfo(unittest.skip("Errors when storage_offset is included"),
-                         'TestCommon', 'test_python_ref_meta'),
             # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
             DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
         ),
     ),
+    PythonRefInfo(
+        "_refs.as_strided",
+        torch_opinfo_name="as_strided",
+        torch_opinfo_variant_name="partial_views",
+        # FIXME: doesn't support chalf
+        dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16),
+        supports_nvfuser=False,
+        skips=(
+            # cloned_mutable_input.is_same(returned_output) INTERNAL ASSERT FAILED
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_view'),
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_conj_view'),
+            DecorateInfo(unittest.skip("Errors when storage_offset is included"), 'TestMathBits', 'test_neg_conj_view'),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
+        ),
+    ),
     PythonRefInfo(
         "_refs.broadcast_shapes",
         torch_opinfo_name="broadcast_shapes",

From 94ceb0eebc88ed95061639d862576025788b57b7 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 6 Dec 2022 11:06:09 -0800
Subject: [PATCH 1647/1922] [quant][fx] Add support for matching constant in
 the custom matcher code in quantization (#90092)

Summary:
att

Test Plan:
python test/test_quantization.py TestQuantizeFx.test_pattern_match_constant

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90092
Approved by: https://github.com/jcaip
---
 test/quantization/fx/test_quantize_fx.py | 16 ++++++++++++++++
 torch/ao/quantization/fx/match_utils.py  |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py
index 3d6fe0efa347d..794d70b56f8f2 100644
--- a/test/quantization/fx/test_quantize_fx.py
+++ b/test/quantization/fx/test_quantize_fx.py
@@ -713,6 +713,22 @@ def forward(self, x, y):
             if n.op == 'call_module' and type(modules[n.target]) == nn.ReLU:
                 self.assertTrue(is_match(modules, n, pattern))
 
+    def test_pattern_match_constant(self):
+        class M(torch.nn.Module):
+            def forward(self, x):
+                x, _ = torch.ops.aten.max_pool2d_with_indices.default(x)
+                return x
+
+        pattern = (operator.getitem, torch.ops.aten.max_pool2d_with_indices.default, 0)
+        m = torch.fx.symbolic_trace(M())
+        # eliminate the code that get the second output of maxpool, so that the pattern
+        # can be matched
+        m.graph.eliminate_dead_code()
+        modules = dict(m.named_modules())
+        for n in m.graph.nodes:
+            if n.op == "call_function" and n.target == operator.getitem:
+                self.assertTrue(is_match(modules, n, pattern))
+
     def test_fused_module_qat_swap(self):
         class Tmp(torch.nn.Module):
             def __init__(self):
diff --git a/torch/ao/quantization/fx/match_utils.py b/torch/ao/quantization/fx/match_utils.py
index 631916da7e3f3..f9b6c442476a0 100644
--- a/torch/ao/quantization/fx/match_utils.py
+++ b/torch/ao/quantization/fx/match_utils.py
@@ -53,6 +53,9 @@ def is_match(modules, node, pattern, max_uses=sys.maxsize):
     if isinstance(self_match, type) and issubclass(self_match, MatchAllNode):
         return True
 
+    if node == pattern:
+        return True
+
     if not isinstance(node, Node) or len(node.users) > max_uses:
         return False
 

From e5699b5047788bf764ecaf32b86de76144932398 Mon Sep 17 00:00:00 2001
From: Tran Le <quytranle@meta.com>
Date: Tue, 6 Dec 2022 23:18:55 +0000
Subject: [PATCH 1648/1922] [fx][passes] Implement annotate getitem node FX
 passes (#90237)

Summary: One common cause of jit unscriptability issue is loss of node type annotations on local names after one or several FX transform(s). One way to improve the type coverage is to eagerly annotate the type for `getitem` nodes from its parent sequence node. This diff introduces an fx pass to do that.

Test Plan:
```
buck2 test //caffe2/test:fx_experimental
```

Reviewed By: xush6528

Differential Revision: D41749744

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90237
Approved by: https://github.com/xush6528
---
 test/test_fx_experimental.py              | 54 ++++++++++++++++++-----
 torch/fx/passes/annotate_getitem_nodes.py | 42 ++++++++++++++++++
 torch/fx/passes/split_module.py           | 20 ---------
 3 files changed, 85 insertions(+), 31 deletions(-)
 create mode 100644 torch/fx/passes/annotate_getitem_nodes.py

diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py
index a8fc077703023..e94c1bc7cc445 100644
--- a/test/test_fx_experimental.py
+++ b/test/test_fx_experimental.py
@@ -7,43 +7,44 @@
 import sys
 import tempfile
 import unittest
-from typing import Callable, Dict, Union, List, Optional
 from types import BuiltinFunctionType
+from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union
 
 import torch
+import torch.fx.experimental.meta_tracer
 import torch.fx.experimental.optimization as optimization
 from torch.fx._symbolic_trace import symbolic_trace
 from torch.fx.experimental import merge_matmul
 from torch.fx.experimental.accelerator_partitioner import Partitioner
-from torch.fx.experimental.normalize import NormalizeOperators, NormalizeArgs
-from torch.fx.passes import graph_manipulation
-from torch.fx.passes.param_fetch import lift_lowering_attrs_to_nodes
+from torch.fx.experimental.normalize import NormalizeArgs, NormalizeOperators
 from torch.fx.experimental.partitioner_utils import (
-    NodeLatency,
-    get_partition_to_latency_mapping,
-    get_latency_of_partitioned_graph,
     Device,
+    get_latency_of_partitioned_graph,
+    get_partition_to_latency_mapping,
+    NodeLatency,
     PartitionerConfig,
     PartitionMode,
 )
 from torch.fx.experimental.rewriter import RewritingTracer
 from torch.fx.experimental.schema_type_annotation import AnnotateTypesWithSchema
-import torch.fx.experimental.meta_tracer
 from torch.fx.graph_module import GraphModule
 from torch.fx.node import Node
 from torch.fx.operator_schemas import (
     _torchscript_type_to_python_type,
+    create_type_hint,
     normalize_function,
     normalize_module,
     type_matches,
-    create_type_hint,
 )
+from torch.fx.passes import graph_manipulation
+from torch.fx.passes.param_fetch import lift_lowering_attrs_to_nodes
 from torch.fx.passes.shape_prop import ShapeProp
 from torch.fx.passes.split_module import split_module
+from torch.fx.passes.annotate_getitem_nodes import annotate_getitem_nodes
 from torch.testing._internal.common_device_type import (
-    ops,
-    onlyCPU,
     instantiate_device_type_tests,
+    onlyCPU,
+    ops,
 )
 from torch.testing._internal.common_methods_invocations import op_db
 from torch.testing._internal.common_nn import module_tests, new_module_tests
@@ -1080,6 +1081,37 @@ def is_leaf_module(
         # Smoke test torchscript compilation since now we're emitting type annotations
         torch.jit.script(traced_functionals_annotated)
 
+    def test_annotate_getitem_node(self):
+        class CustomType:
+            pass
+
+        class CustomNamedTuple(NamedTuple):
+            x: int
+            y: float
+
+        class MyModule(torch.nn.Module):
+            def forward(self, inp: Tuple[CustomType, torch.Tensor], inp2: List[CustomType], inp3: CustomNamedTuple):
+                inp_0 = inp[0]
+                inp_1 = inp[1]
+                inp2_0 = inp2[0]
+                inp3_x = inp3.x
+                inp3_y = inp3.y
+                return inp_0 + inp_1 + inp2_0 + inp3_x + inp3_y
+
+        my_module = MyModule()
+        my_module_traced = torch.fx.symbolic_trace(my_module)
+
+        # by default, fx transform loses type annotation of getitem nodes.
+        for node in my_module_traced.graph.nodes:
+            if node.target == operator.getitem:
+                assert node.type is None
+
+        annotate_getitem_nodes(my_module_traced.graph)
+
+        for node in my_module_traced.graph.nodes:
+            if node.target == operator.getitem:
+                self.assertIsNotNone(node.type, f"Node {node} should be annotated but is not.")
+
     def test_subgraph_uniquename(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
diff --git a/torch/fx/passes/annotate_getitem_nodes.py b/torch/fx/passes/annotate_getitem_nodes.py
new file mode 100644
index 0000000000000..f77fddfb4d551
--- /dev/null
+++ b/torch/fx/passes/annotate_getitem_nodes.py
@@ -0,0 +1,42 @@
+import operator
+
+import torch
+
+
+def annotate_getitem_nodes(graph: torch.fx.Graph) -> None:
+    """
+    Annotate the type of getitem nodes, inferred from the type of sequence node.
+    If sequence node is not annotated with a type, do nothing.
+    Currently support getitem nodes from Tuple, List, and NamedTuple sequence node.
+
+    This is helpful since annotations on local names within function are lost during FX transforms.
+    Adding back known type annotation for getitem nodes to improve jit scriptability.
+
+    Args:
+        graph (Graph): The graph to be annotated
+    """
+    for node in graph.nodes:
+        if node.target == operator.getitem:
+            sequence_node, index_node = node.args
+            if not sequence_node.type:
+                continue
+            # container types
+            if hasattr(sequence_node.type, "_name"):
+                parameterized_types = sequence_node.type.__args__
+                if sequence_node.type._name == "Tuple":
+                    if len(parameterized_types) == 2 and isinstance(
+                        parameterized_types[1], type(...)
+                    ):
+                        node.type = parameterized_types[0]
+                    else:
+                        assert len(parameterized_types) > index_node
+                        node_type = parameterized_types[index_node]
+                        node.type = node_type
+                elif sequence_node.type._name == "List":
+                    assert len(parameterized_types) == 1
+                    node.type = parameterized_types[0]
+            # NamedTuple type
+            elif hasattr(sequence_node.type, "__annotations__"):
+                sequence_node_field_types = sequence_node.type.__annotations__
+                field_name = sequence_node.type._fields[index_node]
+                node.type = sequence_node_field_types[field_name]
diff --git a/torch/fx/passes/split_module.py b/torch/fx/passes/split_module.py
index 0343bae94c31e..f3b1dd2d06036 100644
--- a/torch/fx/passes/split_module.py
+++ b/torch/fx/passes/split_module.py
@@ -1,5 +1,4 @@
 import inspect
-import operator
 from typing import Any, Callable, Dict, List, Optional
 
 import torch
@@ -160,25 +159,6 @@ def record_cross_partition_use(
 
     # split nodes into parititons
     for node in m.graph.nodes:
-        # Annotations on local names within function are lost during FX transforms.
-        # Adding back known type annotation for getitem nodes for jit scriptability.
-        if node.target == operator.getitem:
-            sequence_node, index_node = node.args
-            # only support type Tuple for now
-            if (
-                hasattr(sequence_node.type, "_name")
-                and sequence_node.type._name == "Tuple"
-            ):
-                parameterized_types = sequence_node.type.__args__
-                if len(parameterized_types) == 2 and isinstance(
-                    parameterized_types[1], type(...)
-                ):
-                    node.type = parameterized_types[0]
-                else:
-                    assert len(parameterized_types) > index_node
-                    node_type = parameterized_types[index_node]
-                    node.type = node_type
-
         orig_nodes[node.name] = node
 
         # TODO currently placeholders/parameters aren't put into random partitions,

From ea675490c4746e9606ea377fe3330b1bb2af19e0 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@fb.com>
Date: Tue, 6 Dec 2022 23:21:54 +0000
Subject: [PATCH 1649/1922] [MPS] Fix median_out_mps caching (#90326)

We should cache graph based on input tensor type

Fixes https://github.com/pytorch/pytorch/issues/90311

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90326
Approved by: https://github.com/kulinseth
---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index d905107b8ffd4..b7d458e1b0d8b 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -1318,7 +1318,7 @@ Tensor min_mps(const Tensor& input_t) {
     auto stream = at::mps::getCurrentMPSStream();
 
     @autoreleasepool {
-        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type());
+        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getTensorsStringKey(input_t);
         CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {

From 88b0cdc19fdf1c3b4f81195702f8e2975654ded2 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Tue, 6 Dec 2022 23:33:52 +0000
Subject: [PATCH 1650/1922] [Vulkan] Partially fix and then disable copying of
 vulkan quantized tensors to cpu (#90275)

Summary:
Before this diff, copying of vulkan quantized tensors to cpu was broken. This was mainly caused because the shader only works properly with specific global and local work group sizes, and those specific sizes had been modified in earlier refactoring.

As part of this fix, an optimized version of the shader that performs the copying was written, to take advantage of the special case when the plane size (x*y) is multiple of 4).

After fixing this, and writing comprehensive tests, it was discovered that the copying still has issues on Android for specific input sizes, e.g. [1, 1, 11, 17]. These issues are currently unresolved, so, copying of quantized vulkan tensors to cpu has been disabled.

What is contained in this diff?
- Fix for existing issue
- New optimized shader (image_to_nchw_quantized_mul4)
- New comprehensive tests (which have been disabled)
- Disable the copying of quantized vulkan tensors to cpu until issues on Android are fixed.

Test Plan:
On Mac
```
cd ~/fbsource
buck1 run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck1 build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_quantized_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_quantized_api_test
adb shell "/data/local/tmp/vulkan_quantized_api_test"
```

Reviewed By: kimishpatel

Differential Revision: D41047098

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90275
Approved by: https://github.com/kimishpatel
---
 .../vulkan/glsl/image_to_nchw_quantized.glsl  |  80 +++++-----
 .../glsl/image_to_nchw_quantized_mul4.glsl    |  75 +++++++++
 aten/src/ATen/native/vulkan/ops/Copy.cpp      |   3 +
 aten/src/ATen/native/vulkan/ops/Utils.cpp     |  18 ++-
 .../ATen/test/vulkan_quantized_api_test.cpp   | 149 +++++++++++++++++-
 5 files changed, 277 insertions(+), 48 deletions(-)
 create mode 100644 aten/src/ATen/native/vulkan/glsl/image_to_nchw_quantized_mul4.glsl

diff --git a/aten/src/ATen/native/vulkan/glsl/image_to_nchw_quantized.glsl b/aten/src/ATen/native/vulkan/glsl/image_to_nchw_quantized.glsl
index 2f5999b465e35..3fe0447a33a53 100644
--- a/aten/src/ATen/native/vulkan/glsl/image_to_nchw_quantized.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/image_to_nchw_quantized.glsl
@@ -11,7 +11,7 @@ layout(set = 0, binding = 0) uniform PRECISION isampler3D uImage;
 /*
  * Output Buffer
  */
-layout(set = 0, binding = 1) buffer PRECISION Buffer {
+layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer {
   uint data[];
 }
 uBuffer;
@@ -33,55 +33,47 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 void main() {
   const ivec3 pos = ivec3(gl_GlobalInvocationID);
-  if (pos.y == 0 && pos.z == 0) {
-    ivec4 texture_pos = ivec4(0, 1, 2, 3) + 4 * pos.x;
+    // each instance of the shader writes out a single element of the output
+    // the global size matches the size of the output, in other words:
+    // global size = {div_up(numel, 4), 1u, 1u}
+    // pos = {pos.x, 1, 1} where pos.x is the index of the output element
 
-    ivec4 last_eight;
-    last_eight.z = texture_pos.x / (uBlock.in_extents.x * uBlock.in_extents.y);
-    last_eight.w = texture_pos.x % (uBlock.in_extents.x * uBlock.in_extents.y);
-    last_eight.y = last_eight.w / uBlock.in_extents.x;
-    last_eight.x = last_eight.w % uBlock.in_extents.x;
+  ivec4 input_pos = ivec4(0, 1, 2, 3) + 4 * pos.x;
+    // each output element is a uint32 made up four consecutive uint8 from the
+    // input in nchw format. input_pos contains the positions of these four
+    // elements from the input in nchw format.
 
-    ivec4 sec_last_eight;
-    sec_last_eight.z =
-        texture_pos.y / (uBlock.in_extents.x * uBlock.in_extents.y);
-    sec_last_eight.w =
-        texture_pos.y % (uBlock.in_extents.x * uBlock.in_extents.y);
-    sec_last_eight.y = sec_last_eight.w / uBlock.in_extents.x;
-    sec_last_eight.x = sec_last_eight.w % uBlock.in_extents.x;
+  ivec4 nc_pos = input_pos / uBlock.in_extents.w;
+    // we divide by HxW (uBlock.in_extents.w), to find the position along the
+    // batch/channel axis of these four elements.
 
-    ivec4 thr_last_eight;
-    thr_last_eight.z =
-        texture_pos.z / (uBlock.in_extents.x * uBlock.in_extents.y);
-    thr_last_eight.w =
-        texture_pos.z % (uBlock.in_extents.x * uBlock.in_extents.y);
-    thr_last_eight.y = thr_last_eight.w / uBlock.in_extents.x;
-    thr_last_eight.x = thr_last_eight.w % uBlock.in_extents.x;
+  ivec4 w_pos = input_pos % uBlock.in_extents.w;
+    // we compute the reminder mod HxW, to find the positions in the flatten
+    // out HxW plane.
 
-    ivec4 four_last_eight;
-    four_last_eight.z =
-        texture_pos.w / (uBlock.in_extents.x * uBlock.in_extents.y);
-    four_last_eight.w =
-        texture_pos.w % (uBlock.in_extents.x * uBlock.in_extents.y);
-    four_last_eight.y = four_last_eight.w / uBlock.in_extents.x;
-    four_last_eight.x = four_last_eight.w % uBlock.in_extents.x;
+  ivec4 x_pos = w_pos % uBlock.in_extents.x;
+  ivec4 y_pos = w_pos / uBlock.in_extents.x;
+    // we divide this "flatten out position" by H, to find the positions along
+    // the y-axis (height) and we compute its reminder mod H, to find the
+    // position along the x-axis (width).
 
-    ivec3 last_eight_pos = ivec3(last_eight.x, last_eight.y, last_eight.z / 4);
-    ivec3 sec_last_eight_pos =
-        ivec3(sec_last_eight.x, sec_last_eight.y, sec_last_eight.z / 4);
-    ivec3 thr_last_eight_pos =
-        ivec3(thr_last_eight.x, thr_last_eight.y, thr_last_eight.z / 4);
-    ivec3 four_last_eight_pos =
-        ivec3(four_last_eight.x, four_last_eight.y, four_last_eight.z / 4);
+  ivec4 z_pos = nc_pos / 4;
+  ivec4 ix = nc_pos % 4;
+    // z_pos contains the texel positions along the z-axis, and ix the
+    // indices inside each texel.
 
-    int texel_1 = texelFetch(uImage, last_eight_pos, 0)[last_eight.z];
-    int texel_2 = texelFetch(uImage, sec_last_eight_pos, 0)[sec_last_eight.z];
-    int texel_3 = texelFetch(uImage, thr_last_eight_pos, 0)[thr_last_eight.z];
-    int texel_4 = texelFetch(uImage, four_last_eight_pos, 0)[four_last_eight.z];
+  // now we fetch each uint8 element from the input, and we write out a uint32
+  // whose binary representation is equal to: tex3 tex2 tex1 tex0
 
-    uint ui32 = (uint(texel_4 & 0xFF) << 24) | (uint(texel_3 & 0xFF) << 16) |
-        (uint(texel_2 & 0xFF) << 8) | (uint(texel_1 & 0xFF));
+  int tex0 = texelFetch(uImage, ivec3(x_pos[0], y_pos[0], z_pos[0]), 0)[ix[0]];
+  int tex1 = texelFetch(uImage, ivec3(x_pos[1], y_pos[1], z_pos[1]), 0)[ix[1]];
+  int tex2 = texelFetch(uImage, ivec3(x_pos[2], y_pos[2], z_pos[2]), 0)[ix[2]];
+  int tex3 = texelFetch(uImage, ivec3(x_pos[3], y_pos[3], z_pos[3]), 0)[ix[3]];
 
-    uBuffer.data[texture_pos.x / 4] = ui32;
-  }
+  uint ui32 = (uint(tex3 & 0xFF) << 24)
+            | (uint(tex2 & 0xFF) << 16)
+            | (uint(tex1 & 0xFF) << 8)
+            | (uint(tex0 & 0xFF));
+
+  uBuffer.data[pos.x] = ui32;
 }
diff --git a/aten/src/ATen/native/vulkan/glsl/image_to_nchw_quantized_mul4.glsl b/aten/src/ATen/native/vulkan/glsl/image_to_nchw_quantized_mul4.glsl
new file mode 100644
index 0000000000000..210ed2b85ed66
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/image_to_nchw_quantized_mul4.glsl
@@ -0,0 +1,75 @@
+#version 450 core
+#define PRECISION $precision
+
+layout(std430) buffer;
+
+/*
+ * Input Sampler
+ */
+layout(set = 0, binding = 0) uniform PRECISION isampler3D uImage;
+
+/*
+ * Output Buffer
+ */
+layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer {
+  uint data[];
+}
+uBuffer;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  // xyz contain the extents of the input texture, w contains HxW to help
+  // calculate buffer offsets
+  ivec4 in_extents;
+}
+uBlock;
+
+/*
+ * Local Work Group in_extents
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+    // each instance of the shader writes out four elements of the output
+    // by processing 4 consecutive texels at the same depth.
+    // global size = {HxW / 4, 1u, z_extent}.
+    // this shader requires HxW to be a multiple of 4, so that multiple
+    // planes can be processed in parallel
+
+  if (4 * pos.x >= uBlock.in_extents.w ||
+      pos.y > 0 ||
+      pos.z >= uBlock.in_extents.z) {
+    return;
+  }
+
+  ivec4 xy_pos = ivec4(0, 1, 2, 3) + 4 * pos.x;
+    // each output element is a uint32 made up four consecutive uint8 from the
+    // input in nchw format. xy_pos contains the positions of these four
+    // elements from the input in the flatten out HxW plane.
+
+  ivec4 x_pos = xy_pos % uBlock.in_extents.x;
+  ivec4 y_pos = xy_pos / uBlock.in_extents.x;
+    // we divide this "flatten out position" by H, to find the positions along
+    // the y-axis (height) and we compute its reminder mod H, to find the
+    // position along the x-axis (width).
+
+  const ivec4 intex0 = texelFetch(uImage, ivec3(x_pos[0], y_pos[0], pos.z), 0);
+  const ivec4 intex1 = texelFetch(uImage, ivec3(x_pos[1], y_pos[1], pos.z), 0);
+  const ivec4 intex2 = texelFetch(uImage, ivec3(x_pos[2], y_pos[2], pos.z), 0);
+  const ivec4 intex3 = texelFetch(uImage, ivec3(x_pos[3], y_pos[3], pos.z), 0);
+
+  const int base_index = 4 * pos.x + 4 * uBlock.in_extents.w * pos.z;
+  const ivec4 buf_indices =
+      base_index + ivec4(0, 1, 2, 3) * uBlock.in_extents.w;
+
+  for (int i = 0; i < 4; i += 1) {
+    uint ui32 = (uint(intex3[i] & 0xFF) << 24)
+              | (uint(intex2[i] & 0xFF) << 16)
+              | (uint(intex1[i] & 0xFF) << 8)
+              | (uint(intex0[i] & 0xFF));
+    uBuffer.data[buf_indices[i] / 4] = ui32;
+  }
+}
diff --git a/aten/src/ATen/native/vulkan/ops/Copy.cpp b/aten/src/ATen/native/vulkan/ops/Copy.cpp
index 5f63f3eb949a5..4e414e4366bcb 100644
--- a/aten/src/ATen/native/vulkan/ops/Copy.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Copy.cpp
@@ -173,6 +173,9 @@ void pack_cpu_to_vulkan(const Tensor& src, vTensor& dst) {
 }
 
 void pack_vulkan_to_cpu(vTensor& src, Tensor& dst) {
+  TORCH_CHECK(
+      !src.is_quantized(),
+      "Copy of vulkan quantized tensors to cpu is currently disabled!");
   api::Context* const context = api::context();
 
   // Refer to the comment in pack_cpu_to_vulkan for why at::kFloat is specified
diff --git a/aten/src/ATen/native/vulkan/ops/Utils.cpp b/aten/src/ATen/native/vulkan/ops/Utils.cpp
index 4c0f866ca9a39..d1d3c297c88dc 100644
--- a/aten/src/ATen/native/vulkan/ops/Utils.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Utils.cpp
@@ -41,9 +41,12 @@ static api::ShaderSource get_nchw_to_image_shader(const vTensor& v_dst) {
 
 static api::ShaderSource get_image_to_nchw_shader(const vTensor& v_src) {
   if (v_src.is_quantized()) {
+    auto plane_size =
+        get_dim<Dim4D::Height>(v_src) * get_dim<Dim4D::Width>(v_src);
     switch (v_src.storage_type()) {
       case api::StorageType::TEXTURE_3D:
-        return VK_KERNEL(image_to_nchw_quantized);
+        return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4)
+                                   : VK_KERNEL(image_to_nchw_quantized);
       default:
         TORCH_CHECK(false, "No kernel available!");
       case api::StorageType::BUFFER:
@@ -131,6 +134,19 @@ void record_image_to_nchw_op(
       plane_size,
   };
 
+  if (v_src.is_quantized()) {
+    if (plane_size % 4 == 0) {
+      global_size.data[0u] = plane_size / 4;
+      global_size.data[1u] = 1;
+      local_size.data[0u] *= local_size.data[1u];
+      local_size.data[1u] = 1;
+    } else {
+      uint32_t numel = v_src.numel();
+      global_size = {api::utils::div_up(numel, uint32_t(4)), 1u, 1u};
+      local_size = {64u, 1u, 1u};
+    }
+  }
+
   api::UniformParamsBuffer params(context, block);
   context->submit_compute_job(
       // shader descriptor
diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
index fffd4691ba772..2467ab21b7a9d 100644
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@@ -341,7 +341,8 @@ TEST_F(VulkanAPITest, copy_to_buffer_channels_last) {
   }
 }
 
-TEST_F(VulkanAPITest, support_vulkan) {
+// TODO: Fix vulkan to cpu on Android
+TEST_F(VulkanAPITest, DISABLED_support_vulkan) {
   const double scale = 0.1;
   const int64_t zero_point = 10;
 
@@ -379,7 +380,77 @@ TEST_F(VulkanAPITest, support_vulkan) {
   ASSERT_TRUE(check);
 }
 
-TEST_F(VulkanAPITest, quantize_per_tensor) {
+void test_cpu_to_vulkan_and_vulkan_to_cpu(
+    const at::IntArrayRef input_shape,
+    const double scale,
+    const int zero_point) {
+
+  // produce random quantized cpu tensor
+  auto in_cpu = produce_random_tensor(input_shape);
+  auto in_q_cpu = at::quantize_per_tensor(
+      in_cpu, scale, zero_point, c10::ScalarType::QUInt8);
+
+  // copy quantized cpu tensor to vulkan
+  auto in_q_cpu_vk = cpu_to_vulkan(in_q_cpu);
+
+  // copy quantized vulkan tensor to cpu
+  auto out_q_cpu = vulkan_to_cpu(in_q_cpu_vk, in_q_cpu);
+
+  // check that the copy equals the original
+  const auto diff = at::native::int_repr_quantized_cpu(in_q_cpu)
+                    - at::native::int_repr_quantized_cpu(out_q_cpu);
+
+  const int error = diff.abs().max().item<int>();
+
+  const auto check = (error == 0);
+
+  if (!check) {
+    std::cout
+      << "Copy to vulkan and back to cpu failed with input shape: "
+      << input_shape << " scale: " << scale << " and zero point: "
+      << zero_point << std::endl;
+    std::cout << "Error: " << error << std::endl;
+  }
+
+  ASSERT_TRUE(check);
+}
+
+void test_cpu_to_vulkan_and_vulkan_to_cpu_random() {
+  const double scale = produce_random_scale();
+  const int64_t zero_point = produce_random_zero_point(c10::ScalarType::QUInt8);
+  const at::IntArrayRef tensor_shape =
+    {rand_pos_int(30), rand_pos_int(30), rand_pos_int(100), rand_pos_int(100)};
+  test_cpu_to_vulkan_and_vulkan_to_cpu(tensor_shape, scale, zero_point);
+}
+
+// TODO: Fix vulkan to cpu on Android
+TEST_F(VulkanAPITest, DISABLED_cpu_to_vulkan_and_vulkan_to_cpu) {
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, 21);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 4}, 0.3, 87);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 4, 1}, 0.2, 120);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 7, 7}, 0.3, 87);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 8, 8}, 0.1, 10);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 8, 8}, 0.04, 97);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 11, 17}, 0.07, 15);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1, 10);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 12, 17}, 0.1, 10);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 17, 12}, 0.1, 10);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({2, 4, 17, 12}, 0.1, 10);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, 43);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1, 19);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1, 19);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1, 19);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 25, 29}, 0.1, 19);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_cpu_to_vulkan_and_vulkan_to_cpu_random();
+  }
+}
+
+// TODO: Fix vulkan to cpu on Android
+TEST_F(VulkanAPITest, DISABLED_quantize_per_tensor) {
   const auto in_cpu =
       at::rand({2, 13, 32, 27}, at::device(at::kCPU).dtype(at::kFloat)) * 6;
   const auto in_vulkan = in_cpu.vulkan();
@@ -407,6 +478,79 @@ TEST_F(VulkanAPITest, quantize_per_tensor) {
   ASSERT_TRUE(check);
 }
 
+void test_quantize_per_tensor_and_vulkan_to_cpu(
+    const at::IntArrayRef input_shape,
+    const double input_scale,
+    const int input_zero_point,
+    const int tolerance = 1) {
+  // tolerance = 1, to allow for precision differences after dividing by random
+  // scale which could result on a difference of 1 unit in the quantized result
+
+  at::Tensor input = produce_random_tensor(input_shape);
+
+  // quantize tensor
+  at::Tensor out_q_cpu = at::quantize_per_tensor(
+    input, input_scale, input_zero_point, c10::ScalarType::QUInt8);
+
+  at::Tensor out_q_vk = at::quantize_per_tensor(
+    input.vulkan(), input_scale, input_zero_point, c10::ScalarType::QUInt8);
+
+  // copy vulkan tensor to cpu
+  at::Tensor out_q_vk_cpu = vulkan_to_cpu(out_q_vk, out_q_cpu);
+
+  const auto diff = at::native::int_repr_quantized_cpu(out_q_vk_cpu)
+                    - at::native::int_repr_quantized_cpu(out_q_cpu);
+
+  const int error = diff.abs().max().item<int>();
+
+  const auto check = (error <= tolerance);
+
+  if (!check) {
+    std::cout
+      << "Quantize and copy to cpu failed with input shape: " << input_shape
+      << " scale: " << input_scale << " and zero point: " << input_zero_point
+    << std::endl;
+    std::cout << "Error: " << error << std::endl;
+  }
+
+  ASSERT_TRUE(check);
+}
+
+void test_quantize_per_tensor_and_vulkan_to_cpu_random() {
+  const double scale = produce_random_scale();
+  const int64_t zero_point = produce_random_zero_point(c10::ScalarType::QUInt8);
+  const at::IntArrayRef tensor_shape =
+    {rand_pos_int(30), rand_pos_int(30), rand_pos_int(100), rand_pos_int(100)};
+  test_quantize_per_tensor_and_vulkan_to_cpu(tensor_shape, scale, zero_point);
+}
+
+// TODO: Fix vulkan to cpu on Android
+TEST_F(VulkanAPITest, DISABLED_quantize_per_tensor_and_vulkan_to_cpu) {
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, 21);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 4}, 0.3, 87);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 4, 1}, 0.2, 120);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 7, 7}, 0.3, 87);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 8, 8}, 0.1, 10);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 8, 8}, 0.04, 97);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 11, 17}, 0.07, 15);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1, 10);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 12, 17}, 0.1, 10);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 17, 12}, 0.1, 10);
+  test_quantize_per_tensor_and_vulkan_to_cpu({2, 4, 17, 12}, 0.1, 10);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, 43);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1, 19);
+  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1, 19);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1, 19);
+  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 25, 29}, 0.1, 19);
+  test_quantize_per_tensor_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 16, 77, 54}, 0.204173, 229);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_quantize_per_tensor_and_vulkan_to_cpu_random();
+  }
+}
+
 TEST_F(VulkanAPITest, quantize_dequantize) {
   const auto in_cpu =
       at::rand({2, 13, 32, 27}, at::device(at::kCPU).dtype(at::kFloat)) * 6;
@@ -763,7 +907,6 @@ TEST_F(VulkanAPITest, quantized_add_broadcast2) {
   ASSERT_TRUE(check);
 }
 
-
 TEST_F(VulkanAPITest, quantized_add_broadcast3) {
   if (!at::is_vulkan_available()) {
     return;

From 2bd4634feb4ad8ef67f83973a28bed75d8ccd5e5 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 6 Dec 2022 11:49:29 -0800
Subject: [PATCH 1651/1922] [quant] Add fused "q - qlinear - dq" operator with
 skipped quant op for output of linear (#89882)

Summary:
Added two ops:
* torch.ops.quantized.linear_with_input_q_dq_qweight_dq_output_fp32
* torch.ops.quantized.linear_with_input_q_dq_qweight_dq_relu_output_fp32

corresponding pattern for `linear_with_input_q_dq_qweight_dq_output_fp32` would be:
```
input -> q* -> dq* -> linear* ->
           qweight -> dq* /
```

Test Plan:
python test/test_quantization.py -k TestQuantizedLinear.test_qlinear_with_input_q_dq_qweight_dq

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89882
Approved by: https://github.com/vkuzo
---
 aten/src/ATen/native/quantized/PackedParams.h |  49 +++++
 .../ATen/native/quantized/cpu/fbgemm_utils.h  |  18 ++
 .../src/ATen/native/quantized/cpu/qlinear.cpp | 179 ++++++++++++++++++
 aten/src/ATen/native/quantized/library.cpp    |  32 ++++
 test/quantization/core/test_quantized_op.py   | 118 ++++++++++++
 5 files changed, 396 insertions(+)

diff --git a/aten/src/ATen/native/quantized/PackedParams.h b/aten/src/ATen/native/quantized/PackedParams.h
index 179fcce23dfe5..a442628573fec 100644
--- a/aten/src/ATen/native/quantized/PackedParams.h
+++ b/aten/src/ATen/native/quantized/PackedParams.h
@@ -36,6 +36,55 @@ struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
     return output;
   }
 
+  // Corresponding pattern (the ops with `*` are part of the pattern that
+  // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32):
+  // input -> q* -> dq* -> linear* ->
+  //         qweight -> dq* /
+  //
+  // After fusion:
+  // input -> quantized::linear_with_input_q_dq_qweight_dq_output_fp32* ->
+  //         qweight /
+  //
+  // Additional Note: the weight is packed as well
+  // Params:
+  //    X: float32 Tensor, will be quantized to quint8 in the op
+  //    W_prepack: packed qint8 quantized weight and bias
+  // Returns:
+  //    Y: float32 Tensor
+  virtual at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) {
+    throw std::runtime_error(
+        "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
+        "parameter type");
+    return {};
+  }
+
+  // Corresponding pattern (the ops with `*` are part of the pattern that
+  // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32):
+  // input -> q* -> dq* -> linear* -> relu* ->
+  //         qweight -> dq* /
+  //
+  // After fusion:
+  // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* ->
+  //         qweight /
+  //
+  // Additional Note: the weight is packed as well
+  // Params:
+  //    input: float32 Tensor, will be quantized to quint8 in the op
+  // Returns:
+  //    float32 Tensor
+  virtual at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) {
+    throw std::runtime_error(
+        "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
+        "parameter type");
+    return {};
+  }
+
   virtual at::Tensor apply_dynamic(
       at::Tensor input,
       bool reduce_range = false) = 0;
diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
index d43409231ab69..bfaf5b93d667b 100644
--- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
+++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h
@@ -45,6 +45,7 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase {
       at::Tensor input,
       double output_scale,
       int64_t output_zero_point) override;
+
   at::Tensor apply_relu(
       at::Tensor input,
       double output_scale,
@@ -62,8 +63,19 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase {
       int64_t output_zero_point,
       at::Tensor& output) override;
 
+  at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) override;
+
+  at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) override;
+
   at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false)
       override;
+
   at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false)
       override;
 
@@ -85,6 +97,12 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase {
       int64_t output_zero_point,
       at::Tensor& output);
 
+  template <bool ReluFused>
+  at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32_impl(
+      const at::Tensor& input,
+      double input_scale,
+      int64_t input_zero_point);
+
   template <bool ReluFused>
   at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false);
 };
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
index 2bf92ffa5f0f7..1e10f4f88b7c3 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
@@ -283,6 +283,162 @@ at::Tensor& PackedLinearWeight::apply_relu_out(
   return apply_impl<true>(input, output_scale, output_zero_point, output);
 }
 
+at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32(
+  at::Tensor input,
+  double input_scale,
+  int64_t input_zero_point) {
+  TORCH_CHECK(!input.is_quantized(), "Input tensor for apply_with_input_q_dq_qweight_dq_output_fp32 is quantized; "
+  "Expected input tensor in PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32 to be full precision.");
+
+  return apply_with_input_q_dq_qweight_dq_output_fp32_impl<false>(input, input_scale, input_zero_point);
+}
+
+at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+  at::Tensor input,
+  double input_scale,
+  int64_t input_zero_point) {
+  TORCH_CHECK(!input.is_quantized(), "Input tensor for apply_with_input_q_dq_qweight_dq_output_fp32 is quantized; "
+  "Expected input tensor in PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32 to be full precision.");
+
+  return apply_with_input_q_dq_qweight_dq_output_fp32_impl<true>(input, input_scale, input_zero_point);
+}
+
+
+template <bool ReluFused>
+at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32_impl(
+    const at::Tensor& input,
+    double input_scale,
+    int64_t input_zero_point) {
+  TORCH_CHECK(
+      fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
+
+  auto input_contig = input.expect_contiguous();
+  const auto* input_ptr = input_contig->data_ptr<float>();
+
+  TORCH_CHECK(
+      input.dim() >= 2,
+      "The dimension of input tensor should be larger than or equal to 2");
+  int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
+
+  auto packB = w.get();
+
+  int64_t N = static_cast<int64_t>(packB->numCols());
+  int64_t K = input.sizes()[input.dim() - 1];
+  TORCH_CHECK(
+      K == static_cast<int64_t>(packB->numRows()),
+      "The number of rows in the packB should be equal to K: " +
+          std::to_string(K));
+
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float input_scale_float = input_scale;
+  int32_t input_zero_point_int32 = input_zero_point;
+
+  TORCH_CHECK(
+      w_scale.size() == w_zp.size(),
+      "Weight scales and zero points vectors should have the same size.");
+
+  const float* bias_ptr = nullptr;
+  c10::MaybeOwned<at::Tensor> bias_contig;
+  if (this->bias_.has_value()) {
+    auto& bias = this->bias_.value();
+    bias_contig = bias.expect_contiguous();
+    TORCH_CHECK(bias_contig->dim() == 1, "bias should be a vector (1D Tensor)");
+    TORCH_CHECK(
+        bias_contig->sizes()[0] == N, "bias should have N elements: " + std::to_string(N));
+    bias_ptr = bias_contig->data_ptr<float>();
+  }
+
+  std::vector<int64_t> out_sizes = input.sizes().vec();
+  out_sizes.back() = N;
+  // Allocate output Tensor and a buffer for fbgemmPacked to use
+  auto output = at::empty(out_sizes, input.options().dtype(at::kFloat));
+  auto buffer = at::empty_like(
+      output,
+      output.options().dtype(at::kInt),
+      LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+
+  int num_tasks = at::get_num_threads();
+  at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) {
+    fbgemm::PackAWithQuantRowOffset<uint8_t> packA(
+        /*trans=*/fbgemm::matrix_op_t::NoTranspose,
+        /*nRow=*/M,
+        /*nCol=*/K,
+        /*smat=*/input_ptr,
+        /*ld=*/K,
+        /*pmat=*/nullptr,
+        /*scale=*/input_scale_float,
+        /*zero_pt=*/input_zero_point_int32);
+
+    fbgemm::DoNothing<float, float> doNothingObj{};
+    for (const auto task_id : c10::irange(begin, end)) {
+      if (q_scheme == c10::kPerTensorAffine) {
+        // Process the per tensor quantization.
+        //
+        // After the uint8 * int8 matrix multiplication is performed, this
+        // operation does:
+        //  1) Add in row and column offsets to the rows and columns,
+        //  respectively.
+        //  2) Add in the bias term.
+        fbgemm::ReQuantizeForFloat<ReluFused>
+            outputProcObj(
+                doNothingObj,
+                input_scale_float,
+                w_scale.data(),
+                input_zero_point_int32,
+                w_zp.data(),
+                packA.getRowOffsetBuffer(),
+                col_offsets.data(),
+                bias_ptr,
+                N /* nCol */);
+
+        // Do the GEMM
+        fbgemm::fbgemmPacked(
+            /*packA=*/packA,
+            /*packB=*/*packB,
+            /*C=*/output.data_ptr<float>(),
+            /*C_buffer=*/buffer.data_ptr<int32_t>(),
+            /*ldc=*/N,
+            /*outProcess=*/outputProcObj,
+            /*thread_id=*/task_id,
+            /*num_threads=*/num_tasks);
+      } else if (q_scheme == c10::kPerChannelAffine) {
+        // Process the per channel quantization.
+        //
+        // After the uint8 * int8 matrix multiplication is performed, this
+        // operation does:
+        //  1) Add in row and column offsets to the rows and columns,
+        //  respectively.
+        //  2) Add in the bias term.
+        fbgemm::ReQuantizeForFloat<
+            ReluFused,
+            fbgemm::QuantizationGranularity::OUT_CHANNEL>
+            outputProcObj(
+                doNothingObj,
+                input_scale_float,
+                w_scale.data(),
+                input_zero_point_int32,
+                w_zp.data(),
+                packA.getRowOffsetBuffer(),
+                col_offsets.data(),
+                bias_ptr,
+                N /* nCol */);
+
+        // Do the GEMM
+        fbgemm::fbgemmPacked(
+            /*packA=*/packA,
+            /*packB=*/*packB,
+            /*C=*/output.data_ptr<float>(),
+            /*C_buffer=*/buffer.data_ptr<int32_t>(),
+            /*ldc=*/N,
+            /*outProcess=*/outputProcObj,
+            /*thread_id=*/task_id,
+            /*num_threads=*/num_tasks);
+      }
+    }
+  });
+  return output;
+}
+
 #endif // USE_FBGEMM
 
 #ifdef USE_PYTORCH_QNNPACK
@@ -780,6 +936,24 @@ class QLinearLeakyReluInt8 final {
   }
 };
 
+template <bool ReluFused>
+class QLinearInt8FusedQDQ final {
+ public:
+  static at::Tensor run(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point,
+      const c10::intrusive_ptr<LinearPackedParamsBase>& packed_weight) {
+    if (ReluFused) {
+      return packed_weight->apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+          std::move(input), input_scale, input_zero_point);
+    } else {
+      return packed_weight->apply_with_input_q_dq_qweight_dq_output_fp32(
+          std::move(input), input_scale, input_zero_point);
+    }
+  }
+};
+
 TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear"), TORCH_FN(QLinearInt8<false>::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu"), TORCH_FN(QLinearInt8<true>::run));
@@ -790,6 +964,11 @@ TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) {
   m.impl(TORCH_SELECTIVE_NAME("_quantized::linear"), TORCH_FN(QLinearInt8<false>::run));
 }
 
+TORCH_LIBRARY_IMPL(quantized, CPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("quantized::linear_with_input_q_dq_qweight_dq_output_fp32"), TORCH_FN(QLinearInt8FusedQDQ<false>::run));
+  m.impl(TORCH_SELECTIVE_NAME("quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32"), TORCH_FN(QLinearInt8FusedQDQ<true>::run));
+}
+
 } // namespace
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp
index 160cda71c86d6..92990fc267270 100644
--- a/aten/src/ATen/native/quantized/library.cpp
+++ b/aten/src/ATen/native/quantized/library.cpp
@@ -153,6 +153,38 @@ TORCH_LIBRARY(quantized, m) {
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_leaky_relu(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i, float negative_slope) -> Tensor Y"));
+  // Corresponding pattern (the ops with `*` are part of the pattern that
+  // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32):
+  // input -> q* -> dq* -> linear* ->
+  //         qweight -> dq* /
+  //
+  // After fusion:
+  // input -> quantized::linear_with_input_q_dq_qweight_dq_output_fp32* ->
+  //         qweight /
+  //
+  // Additional Note: the weight is packed as well
+  // Params:
+  //    X: float32 Tensor, will be quantized to quint8 in the op
+  //    W_prepack: packed qint8 quantized weight and bias
+  // Returns:
+  //    Y: float32 Tensor
+  m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_with_input_q_dq_qweight_dq_output_fp32(Tensor X, float X_scale, int X_zero_point, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"));
+  // Corresponding pattern (the ops with `*` are part of the pattern that
+  // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32):
+  // input -> q* -> dq* -> linear* -> relu* ->
+  //         qweight -> dq* /
+  //
+  // After fusion:
+  // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* ->
+  //         qweight /
+  //
+  // Additional Note: the weight is packed as well
+  // Params:
+  //    X: float32 Tensor, will be quantized to quint8 in the op
+  //    W_prepack: packed qint8 quantized weight and bias
+  // Returns:
+  //    Y: float32 Tensor
+  m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32(Tensor X, float X_scale, int X_zero_point, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"));
   m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"));
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index bfc0ee0fa93be..c91a2bf547280 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -3698,6 +3698,124 @@ def test_qlinear_relu(self):
             self._test_qlinear_impl(batch_size, input_channels, output_channels,
                                     use_bias, post_op, use_multi_dim_input, use_channelwise)
 
+    @given(batch_size=st.integers(1, 4),
+           input_channels=st.integers(16, 32),
+           output_channels=st.integers(4, 8),
+           use_bias=st.booleans(),
+           use_relu=st.booleans(),
+           use_multi_dim_input=st.booleans(),
+           use_channelwise=st.booleans())
+    @skipIfNoFBGEMM
+    def test_qlinear_with_input_q_dq_qweight_dq_output_fp32(
+            self, batch_size, input_channels, output_channels, use_bias,
+            use_relu, use_multi_dim_input, use_channelwise):
+        decimal_val = 4
+        dtypes = [torch.quint8]
+        for dtype in dtypes:
+            # No support for channelwise in xnnpack (int8)
+            # ONEDNN does not support qint8
+            if dtype == torch.qint8 and (use_channelwise or qengine_is_onednn()):
+                return
+
+            nptype = np_dtype[dtype]
+            qlinear_prepack = torch.ops.quantized.linear_prepack
+            if use_relu:
+                qlinear = torch.ops.quantized.linear_with_input_q_dq_qweight_dq_relu_output_fp32
+            else:
+                qlinear = torch.ops.quantized.linear_with_input_q_dq_qweight_dq_output_fp32
+            if use_multi_dim_input:
+                batch_size *= 3  # Test the multi-dim input tensor
+            X_scale = 1.5
+            X_zp = 5
+            X_value_min = -128 if dtype == torch.qint8 else 0
+            X_value_max = 127 if dtype == torch.qint8 else 255
+            X_q0 = np.round(
+                np.random.rand(batch_size, input_channels) *
+                (X_value_max - X_value_min)
+                + X_value_min
+            ).astype(nptype)
+
+            W_scales = np.random.rand(output_channels)
+            # xnnpack forces W_zp to 0 when using symmetric quantization
+            # ONEDNN only supports symmetric quantization of weight
+            if dtype == torch.qint8 or qengine_is_onednn():
+                W_zps = np.zeros(output_channels).astype(np.int)
+            else:
+                W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(np.int)
+            # when using symmetric quantization
+            # special restriction for xnnpack fully connected op weight
+            # [-127, 127] instead of [-128, 127]
+            W_value_min = -127 if dtype == torch.qint8 else -128
+            W_value_max = 127
+            W_q0 = np.round(
+                np.random.rand(output_channels, input_channels)
+                * (W_value_max - W_value_min)
+                + W_value_min
+            ).astype(np.int8)  # weight is always int8_t
+            b_value_min = -10
+            b_value_max = 10
+            b_q0 = np.round(
+                np.random.rand(output_channels) *
+                (b_value_max - b_value_min) + b_value_min
+            ).astype(np.int32) if use_bias else None
+            if torch.backends.quantized.engine in ('x86', 'fbgemm', 'onednn'):
+                avoid_vpmaddubsw_overflow_linear(
+                    batch_size,
+                    input_channels,
+                    output_channels,
+                    X_q0,
+                    X_value_min,
+                    X_value_max,
+                    W_q0,
+                    W_value_min,
+                    W_value_max,
+                )
+            X = torch.from_numpy(_dequantize(
+                X_q0, X_scale, X_zp)).to(dtype=torch.float)
+            X_q = torch.quantize_per_tensor(
+                X, scale=X_scale, zero_point=X_zp, dtype=dtype)
+            if use_channelwise:
+                W = torch.from_numpy(_dequantize(W_q0, W_scales.reshape(
+                    (-1, 1)), W_zps.reshape((-1, 1)))).to(dtype=torch.float)
+                W_q = torch.quantize_per_channel(W, scales=torch.from_numpy(W_scales),
+                                                 zero_points=torch.from_numpy(W_zps), axis=0, dtype=torch.qint8)
+                b = torch.from_numpy(_dequantize(
+                    b_q0, X_scale * W_scales, 0)).to(dtype=torch.float) if use_bias else None
+                b_q = torch.quantize_per_channel(b, scales=torch.from_numpy(X_scale * W_scales),
+                                                 zero_points=torch.zeros(output_channels, dtype=torch.long),
+                                                 axis=0, dtype=torch.qint32) if use_bias else None
+            else:
+                W = torch.from_numpy(_dequantize(
+                    W_q0, W_scales[0], W_zps[0])).to(dtype=torch.float)
+                W_q = torch.quantize_per_tensor(W, scale=W_scales[0], zero_point=(
+                    W_zps[0].astype(int).item()), dtype=torch.qint8)
+                b = torch.from_numpy(_dequantize(
+                    b_q0, X_scale * (W_scales[0].item()), 0)).to(dtype=torch.float) if use_bias else None
+                b_q = torch.quantize_per_tensor(
+                    b, scale=X_scale * (W_scales[0].item()), zero_point=0, dtype=torch.qint32) if use_bias else None
+            # Compare X_scale * W_scale * input_channels * X_value_max * W_value_max with
+            # Y_scale * 255 (max for uint8).
+            Y_scale = 125.1234
+            Y_zp = 5
+            # Weight prepacking operator for quantized Linear
+            float_bias = b if use_bias else None
+            W_prepack = qlinear_prepack(W_q, float_bias)
+            if use_multi_dim_input:
+                X = X.view(3, int(batch_size / 3), input_channels)
+                X_q = X_q.view(3, int(batch_size / 3), input_channels)
+            # Quantized Linear operator with prepacked weight
+            Y_q_dq = qlinear(X, X_scale, X_zp, W_prepack)
+            # Test both per-tensor and per-channel quantization
+            # Reference quantized result from PyTorch Linear operator
+            W_fp32 = W_q.dequantize().to(dtype=torch.float)
+            X_fp32 = X_q.dequantize().to(dtype=torch.float)
+            b_fp32 = b_q.dequantize().to(dtype=torch.float) if use_bias else None
+            Y_fp32_ref = F.linear(X_fp32, W_fp32, b_fp32)
+            if use_relu:
+                Y_fp32_ref[Y_fp32_ref < 0.0] = 0.0
+            decimal_val = 1
+            np.testing.assert_array_almost_equal(Y_fp32_ref.numpy(), Y_q_dq.numpy(), decimal=decimal_val)
+
     @given(batch_size=st.integers(1, 4),
            # in cudnn v. 8.4.0, there is a limitation that input channels
            # should be a multiple of 4 for int8 tensors. in cudnn v.8.3.3

From 8f7830ca90d03cd8c16cfd35c9a7c5f402cdba3b Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 7 Dec 2022 00:43:31 +0000
Subject: [PATCH 1652/1922] Revert "[MPS] Fix median_out_mps caching (#90326)"

This reverts commit 23c192c3df2fd53a2110d179eabb549ceb7beeef.

Reverted https://github.com/pytorch/pytorch/pull/90326 on behalf of https://github.com/malfet due to Modified wrong key
---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index b7d458e1b0d8b..d905107b8ffd4 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -1318,7 +1318,7 @@ Tensor min_mps(const Tensor& input_t) {
     auto stream = at::mps::getCurrentMPSStream();
 
     @autoreleasepool {
-        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getTensorsStringKey(input_t);
+        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type());
         CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {

From 0bc8a1372d46e4d8b329407b24550840d261010a Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Tue, 6 Dec 2022 08:12:59 -0800
Subject: [PATCH 1653/1922] [Quant][fx][bc-breaking] Remove unused functions in
 fx/utils.py (#90025)

Summary and BC-breaking notes: This commit removes the following
unused functions from both the `torch.quantization` and the
`torch.ao.quantization` namespaces:

```
graph_pretty_str
get_per_tensor_qparams
quantize_node
get_qconv_op
create_qparam_nodes
node_return_type_is_int
is_get_tensor_info_node
```

Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps
python test/test_quantization.py TestAOMigrationQuantizationFx

Reviewers: jerryzh168, vkuzo

Subscribers: jerryzh168, vkuzo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90025
Approved by: https://github.com/HDCharles
---
 .../ao_migration/test_quantization_fx.py      |   7 -
 torch/ao/quantization/fx/utils.py             | 143 +-----------------
 torch/ao/quantization/quantize_fx.py          |   1 -
 torch/quantization/fx/utils.py                |   7 -
 4 files changed, 1 insertion(+), 157 deletions(-)

diff --git a/test/quantization/ao_migration/test_quantization_fx.py b/test/quantization/ao_migration/test_quantization_fx.py
index c75727717a736..1e64dd4ebfbf3 100644
--- a/test/quantization/ao_migration/test_quantization_fx.py
+++ b/test/quantization/ao_migration/test_quantization_fx.py
@@ -194,22 +194,15 @@ def test_package_import_fx_utils(self):
 
     def test_function_import_fx_utils(self):
         function_list = [
-            'graph_pretty_str',
-            'get_per_tensor_qparams',
-            'quantize_node',
             'get_custom_module_class_keys',
             'get_linear_prepack_op_for_dtype',
             'get_qconv_prepack_op',
-            'get_qconv_op',
             'get_new_attr_name_with_prefix',
             'graph_module_from_producer_nodes',
             'assert_and_get_unique_device',
             'create_getattr_from_value',
-            'create_qparam_nodes',
             'all_node_args_have_no_tensors',
-            'node_return_type_is_int',
             'get_non_observable_arg_indexes_and_types',
-            'is_get_tensor_info_node',
             'maybe_get_next_module'
         ]
         self._test_function_import('fx.utils', function_list)
diff --git a/torch/ao/quantization/fx/utils.py b/torch/ao/quantization/fx/utils.py
index 4ff6d03983ea6..242e18935aa2a 100644
--- a/torch/ao/quantization/fx/utils.py
+++ b/torch/ao/quantization/fx/utils.py
@@ -1,5 +1,4 @@
 import copy
-import re
 import torch
 import torch.nn as nn
 from torch.ao.quantization import (
@@ -24,10 +23,7 @@
     qconfig_equals,
 )
 from torch.ao.quantization.stubs import DeQuantStub
-from torch.ao.quantization.utils import (
-    activation_is_statically_quantized,
-    is_per_tensor,
-)
+from torch.ao.quantization.utils import activation_is_statically_quantized
 from torch.ao.quantization.quantize import is_activation_post_process
 
 from torch.fx import GraphModule, map_arg
@@ -53,22 +49,16 @@
     "collect_producer_nodes",
     "create_getattr_from_value",
     "create_node_from_old_node_preserve_meta",
-    "create_qparam_nodes",
     "EMPTY_ARG_DICT",
     "get_custom_module_class_keys",
     "get_linear_prepack_op_for_dtype",
     "get_new_attr_name_with_prefix",
     "get_non_observable_arg_indexes_and_types",
-    "get_per_tensor_qparams",
-    "get_qconv_op",
     "get_qconv_prepack_op",
     "get_skipped_module_name_and_classes",
     "graph_module_from_producer_nodes",
-    "graph_pretty_str",
-    "is_get_tensor_info_node",
     "maybe_get_next_module",
     "NodeInfo",
-    "node_return_type_is_int",
     "node_arg_is_bias",
     "node_arg_is_weight",
     "NON_OBSERVABLE_ARG_DICT",
@@ -96,84 +86,6 @@ def node_arg_is_bias(node: Node, arg: Any, backend_config: BackendConfig) -> boo
         return node.kwargs.get("bias") is arg
     return False
 
-def graph_pretty_str(g, shorten=True) -> str:
-    """Returns a printable representation of the ops in the graph of g.
-    If shorten is True, tries to abbreviate fields.
-    """
-    built_in_func_re = re.compile('<built-in function (.*)>')
-    built_in_meth_re = re.compile('<built-in method (.*) of type.*>')
-    op_dict = {
-        'placeholder': 'plchdr',
-        'get_attr': 'gt_prm',
-        'call_function': 'cl_fun',
-        'call_module': 'cl_mod',
-        'call_method': 'cl_meth',
-    }
-
-    max_lens = {}
-    col_names = ("name", "op", "target", "args", "kwargs")
-    for s in col_names:
-        max_lens[s] = len(s)
-
-    results = []
-    for n in g.nodes:
-
-        # activation_post_process_0 -> obs_0
-        name = str(n.name)
-        if shorten:
-            name = name.replace("activation_post_process", "obs")
-
-        op = str(n.op)
-        # placeholder -> plchdr, and so on
-        if shorten and op in op_dict:
-            op = op_dict[op]
-
-        target = str(n.target)
-        # <built-in function foo> -> <bi_fun foo>, and so on
-        if shorten:
-            built_in_func = built_in_func_re.search(target)
-            if built_in_func:
-                target = f"<bi_fun {built_in_func.group(1)}>"
-            built_in_meth = built_in_meth_re.search(target)
-            if built_in_meth:
-                target = f"<bi_meth {built_in_meth.group(1)}>"
-            target = target.replace("activation_post_process", "obs")
-
-        args = str(n.args)
-        if shorten:
-            args = args.replace("activation_post_process", "obs")
-
-        kwargs = str(n.kwargs)
-
-        # calculate maximum length of each column, so we can tabulate properly
-        for k, v in zip(col_names, (name, op, target, args, kwargs)):
-            max_lens[k] = max(max_lens[k], len(v))
-        results.append([name, op, target, args, kwargs])
-
-    res_str = ""
-    format_str = "{:<{name}} {:<{op}} {:<{target}} {:<{args}} {:<{kwargs}}\n"
-    res_str += format_str.format(*col_names, **max_lens)
-    for result in results:
-        res_str += format_str.format(*result, **max_lens)
-
-    # print an exra note on abbreviations which change attribute names,
-    # since users will have to un-abbreviate for further debugging
-    if shorten:
-        res_str += "*obs_{n} = activation_post_process_{n}\n"
-    return res_str
-
-def get_per_tensor_qparams(activation_post_process):
-    assert is_per_tensor(activation_post_process.qscheme), 'Only per tensor quantization is supported'
-    scale, zero_point = activation_post_process.calculate_qparams()
-    scale = float(scale)
-    zero_point = int(zero_point)
-    dtype = activation_post_process.dtype
-    return scale, zero_point, dtype
-
-# Keep it here for BC in torch.quantization namespace, we can remove it after
-# we deprecate the torch.quantization namespace
-quantize_node = NotImplemented
-
 def get_custom_module_class_keys(custom_module_mapping: Dict[QuantType, Dict[Type, Type]]) -> List[Any]:
     r""" Get all the unique custom module keys in the custom config dict
     e.g.
@@ -220,24 +132,6 @@ def get_qconv_prepack_op(conv_op: Callable) -> Callable:
     assert prepack_op, "Didn't find prepack op for {}".format(conv_op)
     return prepack_op
 
-def get_qconv_op(conv_op: Callable, has_relu: bool) -> Callable:
-    qconv_op = {
-        # has relu
-        True: {
-            torch.nn.functional.conv1d: torch.ops.quantized.conv1d_relu,
-            torch.nn.functional.conv2d: torch.ops.quantized.conv2d_relu,
-            torch.nn.functional.conv3d: torch.ops.quantized.conv3d_relu
-        },
-        False: {
-            torch.nn.functional.conv1d: torch.ops.quantized.conv1d,
-            torch.nn.functional.conv2d: torch.ops.quantized.conv2d,
-            torch.nn.functional.conv3d: torch.ops.quantized.conv3d
-        }
-    }
-    qconv = qconv_op[has_relu].get(conv_op)
-    assert qconv, "Can't find corresponding quantized conv op for {} {}".format(conv_op, has_relu)
-    return qconv
-
 # Returns a function that can get a new attribute name for module with given
 # prefix, for example,
 # >> get_new_observer_name = get_new_attr_name_with_prefix('_observer')
@@ -337,25 +231,6 @@ def create_getattr_from_value(module: torch.nn.Module, graph: Graph, prefix: str
     attr_node = graph.create_node("get_attr", attr_name)
     return attr_node
 
-def create_qparam_nodes(
-        node_name: str,
-        scale: Any,
-        zero_point: Any,
-        modules: Dict[str, torch.nn.Module],
-        quantized_graph: Graph,
-        node_name_to_scope: Dict[str, Tuple[str, type]]
-) -> Tuple[Node, Node]:
-    """
-    Create getattr nodes in the quantized graph for scale and zero point values.
-    The nodes are registered with the root_module of the model.
-    """
-    root_module = modules['']
-    module_path, _ = node_name_to_scope[node_name]
-    scale_node = create_getattr_from_value(root_module, quantized_graph, (module_path + "_scale_"), scale)
-    zero_point_node = create_getattr_from_value(root_module, quantized_graph, (module_path + "_zero_point_"), zero_point)
-    return (scale_node, zero_point_node)
-
-
 def all_node_args_have_no_tensors(node: Node, modules: Dict[str, torch.nn.Module], cache: Dict[Node, bool]) -> bool:
     """
     If we know for sure that all of this node's args have no
@@ -500,22 +375,6 @@ def get_non_observable_arg_indexes_and_types(node: Node) -> Dict[Union[type, tor
 
     return NON_OBSERVABLE_ARG_DICT.get(info, EMPTY_ARG_DICT)
 
-def node_return_type_is_int(node: Node) -> bool:
-    """
-    Returns true if this node results in an integer, even if some of the args
-    are Tensors.
-    """
-    return node.op == 'call_method' and node.target == 'size'
-
-
-def is_get_tensor_info_node(node: Node) -> bool:
-    """ Returns True if this node is a node that takes a Tensor as input and output some
-    meta information about the Tensor, e.g. shape, size etc.
-    """
-    result: bool = \
-        node.op == "call_function" and node.target == getattr and node.args[1] == "shape"  # type: ignore[assignment]
-    return result
-
 def maybe_get_next_module(
     node: Node,
     modules: Dict[str, nn.Module],
diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py
index 8f26934576580..c6b1a0758492b 100644
--- a/torch/ao/quantization/quantize_fx.py
+++ b/torch/ao/quantization/quantize_fx.py
@@ -17,7 +17,6 @@
     FuseCustomConfig,
     PrepareCustomConfig,
 )
-from .fx.utils import graph_pretty_str  # noqa: F401
 from .fx.utils import get_custom_module_class_keys  # noqa: F401
 from .fx.utils import get_skipped_module_name_and_classes
 from .qconfig_mapping import QConfigMapping
diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py
index 230c10113e62b..96f4f68c592b5 100644
--- a/torch/quantization/fx/utils.py
+++ b/torch/quantization/fx/utils.py
@@ -7,21 +7,14 @@
 here.
 """
 from torch.ao.quantization.fx.utils import (
-    graph_pretty_str,
-    get_per_tensor_qparams,
-    quantize_node,
     get_custom_module_class_keys,
     get_linear_prepack_op_for_dtype,
     get_qconv_prepack_op,
-    get_qconv_op,
     get_new_attr_name_with_prefix,
     graph_module_from_producer_nodes,
     assert_and_get_unique_device,
     create_getattr_from_value,
-    create_qparam_nodes,
     all_node_args_have_no_tensors,
-    node_return_type_is_int,
     get_non_observable_arg_indexes_and_types,
-    is_get_tensor_info_node,
     maybe_get_next_module
 )

From 0cb44681226d85ad086592b2408abf4ef6f40259 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 5 Dec 2022 19:02:12 +0000
Subject: [PATCH 1654/1922] [Inductor] Add test for Scheduler fusions (#90014)

Currently there is `test_vertical_fusion1` which fuses entirely during
the lowering stage and no buffers are realized. This adds
`test_scheduler_vertical_fusion1` which is the same test but with
several intermediate calculations realized so the scheduler is left
to do the fusion.

To support the test, this PR also adds:
- `metrics.ir_nodes_pre_fusion` which when compared with
`generated_kernel_count` tells us how many nodes were fused.
- `torch._test_inductor_realize` which is an identity operator in
eager, but under inductor also forces the input to be realized.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90014
Approved by: https://github.com/jansel
---
 aten/src/ATen/native/TestOps.cpp           |  6 ++++
 aten/src/ATen/native/native_functions.yaml |  6 ++++
 test/inductor/test_torchinductor.py        | 38 ++++++++++++++++++++++
 tools/autograd/derivatives.yaml            |  4 +++
 torch/_inductor/lowering.py                |  6 ++++
 torch/_inductor/metrics.py                 |  5 +++
 torch/_inductor/scheduler.py               |  3 +-
 7 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp
index f36765436991e..3f62aa58d2593 100644
--- a/aten/src/ATen/native/TestOps.cpp
+++ b/aten/src/ATen/native/TestOps.cpp
@@ -12,6 +12,7 @@
 #include <ATen/ops/_test_ambiguous_defaults_native.h>
 #include <ATen/ops/_test_autograd_multiple_dispatch_native.h>
 #include <ATen/ops/_test_autograd_multiple_dispatch_view_native.h>
+#include <ATen/ops/_test_inductor_realize_native.h>
 #include <ATen/ops/_test_optional_filled_intlist_native.h>
 #include <ATen/ops/_test_optional_floatlist_native.h>
 #include <ATen/ops/_test_optional_intlist_native.h>
@@ -106,6 +107,11 @@ Tensor _test_autograd_multiple_dispatch_view(const Tensor &self) {
   return self.view(-1);
 }
 
+// Helper for inductor tests
+Tensor _test_inductor_realize(const Tensor &self) {
+  return self.clone();
+}
+
 } // namespace native
 
 namespace functionalization {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 51834e7bfd5f7..d37cfda9bd07c 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -12816,6 +12816,12 @@
   tags: view_copy
   autogen: _test_autograd_multiple_dispatch_view_copy.out
 
+# Note: this function is only for testing.
+- func: _test_inductor_realize(Tensor self) -> Tensor
+  variants: function
+  dispatch:
+    CPU, CUDA, Meta: _test_inductor_realize
+
 - func: segment_reduce(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None) -> Tensor
   variants: function
   dispatch:
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 755a84d06a75d..4b8c6dcbc7b80 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -758,6 +758,44 @@ def fn(sa, ct, p):
         )
         self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
 
+    def test_forced_buffer_realize(self):
+        # Test torch._test_inductor_realize forces a buffer to be realized
+        def fn(a):
+            b = torch._test_inductor_realize(a * 2)
+            return (b * 2,)
+
+        self.common(fn, (torch.randn(10),))
+        self.assertEqual(torch._inductor.metrics.ir_nodes_pre_fusion, 2)
+
+    def test_scheduler_vertical_fusion1(self):
+        realize = torch._test_inductor_realize
+
+        def fn(sa, ct, p):
+            # From torchbench.pyhpc_equation_of_state
+            v17 = -3.087032500374211e-7
+            v18 = -1.988366587925593e-8
+            v19 = -1.061519070296458e-11
+            v20 = 1.550932729220080e-10
+            t15 = realize(v19 * ct)
+            t19 = realize(v17 + ct * (v18 + t15) + v20 * sa)
+            t20 = realize(1.0 / t19)
+            t128 = realize(t19 * p)
+            return t20 + t128
+
+        self.common(
+            fn,
+            (
+                torch.randn(204, 204, 26),
+                torch.randn(204, 204, 26),
+                torch.randn(26),
+            ),
+        )
+        self.assertEqual(torch._inductor.metrics.ir_nodes_pre_fusion, 5)
+        self.assertEqual(
+            torch._inductor.metrics.generated_kernel_count,
+            1 if self.device == "cuda" else 3,
+        )
+
     def test_sum1(self):
         def fn(a, b):
             return ((a + b).sum(-1),)
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index bebd32e66701a..a6fc9d806ad41 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2744,6 +2744,10 @@
     AutogradCUDA:
       self: grad.reshape_as(self) + 1
 
+- name: _test_inductor_realize(Tensor self) -> Tensor
+  self: grad
+  result: auto_linear
+
 - name: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   output_differentiability: [False]
 
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 7597b33062322..a26785493e988 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -3679,3 +3679,9 @@ def op_floordiv(a, b):
 @register_lowering(aten._foobar)
 def foobar(self, *args, **kwargs):
     raise NotImplementedError("Helpful for debugging")
+
+
+@register_lowering(aten._test_inductor_realize)
+def _realize(x):
+    x.realize()
+    return clone(x)
diff --git a/torch/_inductor/metrics.py b/torch/_inductor/metrics.py
index f7e05288c9a5e..fe4fe07529de5 100644
--- a/torch/_inductor/metrics.py
+++ b/torch/_inductor/metrics.py
@@ -4,14 +4,19 @@
 num_bytes_accessed = 0
 nodes_num_elem = []
 
+# counters for tracking fusions
+ir_nodes_pre_fusion = 0
+
 
 # reset all counters
 def reset():
     global generated_kernel_count
     global generated_cpp_vec_kernel_count
     global num_bytes_accessed, nodes_num_elem
+    global ir_nodes_pre_fusion
 
     generated_kernel_count = 0
     generated_cpp_vec_kernel_count = 0
     num_bytes_accessed = 0
     nodes_num_elem.clear()
+    ir_nodes_pre_fusion = 0
diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py
index b12a15ce2f6fd..0bb8eb3f27f01 100644
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@@ -13,7 +13,7 @@
 
 import torch
 
-from . import config, dependencies, ir
+from . import config, dependencies, ir, metrics
 from .dependencies import MemoryDep, StarDep
 from .sizevars import SimplifyIndexing
 from .utils import cache_on_self, cmp, dynamo_utils, has_triton
@@ -599,6 +599,7 @@ def __init__(self, nodes):
         self.compute_predecessors()
         self.dead_node_elimination()
 
+        metrics.ir_nodes_pre_fusion += len(self.nodes)
         V.debug.ir_pre_fusion(self.nodes)
         self.num_orig_nodes = len(self.nodes)
         self.name_to_fused_node = {n.get_name(): n for n in self.nodes}

From 79dcdf427822c476d8fae142aae23d9e1d0455c7 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Mon, 5 Dec 2022 17:38:47 +0000
Subject: [PATCH 1655/1922] [Inductor] More robust stride and offset extraction
 from index expressions (#90184)

Currently the stride and offset are determined by substituting 1 and 0 for
different indices, which will fail for any expression that doesn't match the
expected stride calculation. Instead, this uses `sympy.match` and returns `None`
for any variables used in non-standard index calculation, e.g. `torch.roll`
which uses `ModularIndexing`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90184
Approved by: https://github.com/jansel
---
 torch/_inductor/ir.py       |   8 ++-
 torch/_inductor/sizevars.py | 102 +++++++++++++++++++++++-------------
 2 files changed, 73 insertions(+), 37 deletions(-)

diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 253f217320e2e..7e770d4531302 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2425,8 +2425,12 @@ def convert_to_reinterpret_view(cls, x):
         index = V.graph.sizevars.simplify_with_ranges(
             list(rw.reads)[0].index, rw.var_ranges
         )
-        strides = V.graph.sizevars.stride_vars(index, rw.range_vars)
-        offset = V.graph.sizevars.offset_var(index, rw.range_vars)
+        strides, offset = V.graph.sizevars.maybe_stride_and_offset_vars(
+            index, rw.range_vars
+        )
+        if offset is None or any(s is None for s in strides):
+            raise NotImplementedError()
+
         expected = sympy_dot(rw.range_vars, strides) + offset
 
         if index != expected:
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index 7895f0dccdcba..47267c00e62fa 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -2,7 +2,7 @@
 import functools
 import itertools
 import logging
-from typing import Callable, Dict, List, Tuple
+from typing import Callable, Dict, List, Optional, Tuple
 
 import sympy
 from sympy import Expr
@@ -11,7 +11,7 @@
 
 from . import ir
 from .codegen.common import IndentedBuffer
-from .utils import sympy_subs, sympy_symbol, VarRanges
+from .utils import sympy_dot, sympy_subs, sympy_symbol, VarRanges
 from .virtualized import V
 
 log = logging.getLogger(__name__)
@@ -47,7 +47,9 @@ def __init__(self, shape_env=None):
         self.guards = []
         self.replacements: Dict[sympy.Symbol, Expr] = self.shape_env.replacements
         self.need_seed = False
-        self.stride_vars = self.make_stride_vars_cache()
+        self.maybe_stride_and_offset_vars = (
+            self.make_maybe_stride_and_offset_vars_cache()
+        )
         self.simplify_with_ranges = self.make_simplify_with_ranges_cache()
         self._simplify_loops = self.make_simplify_loops_cache()
         self.declare = ""
@@ -191,7 +193,7 @@ def _simplify_loops_impl(self, index_vars, sizes, index_formulas):
         """
         sizes = list(map(self.simplify, sizes))
 
-        strides = [self.stride_vars(x, index_vars) for x in index_formulas]
+        strides = [self.maybe_stride_vars(x, index_vars) for x in index_formulas]
         assert len(sizes) == len(strides[0]), (len(sizes), len(strides[0]))
 
         for i in range(len(sizes)):
@@ -201,6 +203,9 @@ def _simplify_loops_impl(self, index_vars, sizes, index_formulas):
 
         def can_merge_dims(a, b):
             for k in range(len(strides)):
+                if strides[k][a] is None or strides[k][b] is None:
+                    return False
+
                 if self.simplify(strides[k][a] * sizes[a]) == self.simplify(
                     strides[k][b]
                 ):
@@ -384,52 +389,79 @@ def wrapper(*args, **kwargs):
 
         return wrapper
 
-    def make_stride_vars_cache(self):
-        cache = self._lru_cache(self._stride_vars)
+    def make_maybe_stride_and_offset_vars_cache(self):
+        cache = self._lru_cache(self._maybe_stride_and_offset_vars)
 
-        def stride_vars(index: Expr, vars: List[sympy.Symbol]) -> List[Expr]:
+        def maybe_stride_and_offset_vars(
+            index: Expr, vars: List[sympy.Symbol]
+        ) -> List[Expr]:
             return cache(index, tuple(vars))
 
-        return stride_vars
+        return maybe_stride_and_offset_vars
 
-    def _stride_vars(self, index: Expr, vars: List[sympy.Symbol]) -> List[Expr]:
-        """Convert an indexing expression back into strides"""
-        strides = []
+    def _maybe_stride_and_offset_vars(
+        self, index: sympy.Expr, vars: List[sympy.Symbol]
+    ) -> Tuple[List[Optional[sympy.Expr]], Optional[sympy.Expr]]:
+        """Convert an indexing expression back into strides and offset"""
         index = self.simplify(index)
-        # remove any offset
-        index = index - sympy_subs(index, {v: sympy.Integer(0) for v in vars if v != 0})
-        for i in range(len(vars)):
-            # drop all the other dims
-            index_dim = sympy_subs(
-                index,
-                {
-                    vars[j]: sympy.Integer(0)
-                    for j in range(len(vars))
-                    if i != j and vars[j] != 0
-                },
-            )
-            v = vars[i]
-            if v == 0:
-                strides.append(sympy.Integer(0))
+
+        # TODO: vars aren't always symbols
+        assert all(isinstance(v, sympy.Symbol) or v == 0 for v in vars)
+        var_symbols = [v for v in vars if isinstance(v, sympy.Symbol)]
+
+        stride_symbols = [sympy.Wild(f"stride{i}") for i in range(len(var_symbols))]
+        var_to_stride = {v: s for v, s in zip(var_symbols, stride_symbols)}
+        offset_symbol = sympy.Wild("offset")
+        index_pattern = offset_symbol + sympy_dot(var_symbols, stride_symbols)
+
+        match = index.match(index_pattern)
+        if match is None:
+            # Index calculation is not strided
+            return [None] * len(vars), None
+
+        strides = []
+        for v in vars:
+            if v not in var_to_stride:
+                stride = 0 if v == 0 else None
             else:
-                # TODO(jansel): should we use sympy.diff here?
-                strides.append(
-                    sympy_subs(index_dim, {v: sympy.Integer(1)})
-                    - sympy_subs(index_dim, {v: sympy.Integer(0)})
-                )
+                stride = match.get(var_to_stride[v], None)
+            strides.append(stride)
+
+        offset = match[offset_symbol]
+
+        # If any vars appear in the offset terms, they are not strided
+        vars_set = set(vars)
+        if vars_set & offset.free_symbols:
+            for i, v in enumerate(vars):
+                if v in offset.free_symbols:
+                    strides[i] = None
+
+            offset = None
+
+        return strides, offset
+
+    def maybe_stride_vars(
+        self, index: Expr, vars: List[sympy.Symbol]
+    ) -> List[Optional[Expr]]:
+        """Convert an indexing expression back into strides"""
+        strides, offset = self.maybe_stride_and_offset_vars(index, vars)
         return strides
 
-    def offset_var(self, index: Expr, vars: List[sympy.Symbol]) -> Expr:
+    def maybe_offset_var(self, index: Expr, vars: List[sympy.Symbol]) -> Optional[Expr]:
         """Extract offset part of an indexing expression"""
-        index = self.simplify(index)
-        return sympy_subs(index, {v: sympy.Integer(0) for v in vars if v != 0})
+        strides, offset = self.maybe_stride_and_offset_vars(index, vars)
+        return offset
 
     def stride_hints(self, index: Expr, vars: List[sympy.Symbol]) -> List[int]:
         for v in index.free_symbols:
             if v.name.startswith("indirect"):
                 index = sympy_subs(index, {v: 0})
         result = []
-        for s in self.stride_vars(index, vars):
+        for s in self.maybe_stride_vars(index, vars):
+            if s is None:
+                result.append(0)
+                continue
+
             try:
                 result.append(self.size_hint(s))
             except TypeError:

From 53bf660277d2dcbe16805f3e2c910f6a86e5b9d8 Mon Sep 17 00:00:00 2001
From: Yanbo Liang <ybliang8@gmail.com>
Date: Wed, 7 Dec 2022 02:05:39 +0000
Subject: [PATCH 1656/1922] [Dynamo][Easy] capture more exceptions when import
 skip modules (#90338)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90338
Approved by: https://github.com/williamwen42
---
 torch/_dynamo/skipfiles.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/skipfiles.py b/torch/_dynamo/skipfiles.py
index 3413079a79f66..3f90fce3c3525 100644
--- a/torch/_dynamo/skipfiles.py
+++ b/torch/_dynamo/skipfiles.py
@@ -148,7 +148,7 @@ def add(import_name: str):
     assert isinstance(import_name, str)
     try:
         module_spec = importlib.util.find_spec(import_name)
-    except ModuleNotFoundError:
+    except Exception:
         return
     if not module_spec:
         return

From 86199566bed26c611463559e32ed146290b99d36 Mon Sep 17 00:00:00 2001
From: David Berard <dberard@fb.com>
Date: Tue, 6 Dec 2022 21:24:00 +0000
Subject: [PATCH 1657/1922] [Dynamo+FSDP] Update benchmarks with
 use_orig_params=True (#90100)

After https://github.com/pytorch/pytorch/pull/89523, we now need to assert use_orig_params=True, even in the non-recursive case where (I think) we wouldn't otherwise need to run with use_orig_params=True.

Tested with `python benchmarks/dynamo/torchbench.py --training --accuracy --only hf_T5 --fsdp`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90100
Approved by: https://github.com/wconstab
---
 benchmarks/dynamo/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index b2937e4c45994..63b726c4bc741 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1094,7 +1094,7 @@ def deepcopy_and_maybe_ddp(model):
             if self.args.ddp:
                 model = DDP(model, find_unused_parameters=True)
             elif self.args.fsdp:
-                model = FSDP(model)
+                model = FSDP(model, use_orig_params=True)
                 torch._inductor.config.triton.cudagraphs = False
                 log.warn("Disabling cudagraphs for FSDP compatibility")
             return model

From f1593db5219817088c8c322dbbc12ac2fdb0dd68 Mon Sep 17 00:00:00 2001
From: Ram Rachum <ram@rachum.com>
Date: Wed, 7 Dec 2022 04:28:56 +0000
Subject: [PATCH 1658/1922] Fix exception causes all over the codebase (#90271)

This is the continuation to #90134 and hopefully the final PR in this series.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90271
Approved by: https://github.com/kit1980
---
 benchmarks/dynamo/common.py                   |  4 +--
 .../torchaudio_models.py                      |  5 +--
 caffe2/python/caffe_translator.py             |  4 +--
 caffe2/python/core.py                         |  4 +--
 caffe2/python/model_helper.py                 |  8 ++---
 caffe2/python/models/download.py              |  4 +--
 .../roi_align_rotated_op_test.py              |  4 +--
 .../operator_test/video_input_op_test.py      |  4 +--
 caffe2/python/schema.py                       |  4 +--
 caffe2/python/trt/transform.py                |  4 +--
 setup.py                                      |  4 +--
 test/distributed/fsdp/test_fsdp_state_dict.py |  8 ++---
 test/distributed/test_c10d_nccl.py            |  2 +-
 test/distributed/test_dynamo_distributed.py   |  4 +--
 test/dynamo/test_repros.py                    |  4 +--
 test/inductor/test_torchinductor.py           |  2 +-
 test/jit/test_dtype_analysis.py               |  4 +--
 test/lazy/test_extract_compiled_graph.py      |  2 +-
 .../test_pytorch_onnx_onnxruntime_cuda.py     |  4 +--
 test/test_autograd.py                         |  2 +-
 test/test_fx.py                               |  2 +-
 test/test_jit_fuser_te.py                     | 34 +++++++++----------
 test/test_meta.py                             |  2 +-
 tools/gen_vulkan_glsl.py                      |  4 +--
 tools/render_junit.py                         |  4 +--
 torch/_dynamo/debug_utils.py                  |  2 +-
 torch/_dynamo/symbolic_convert.py             |  2 +-
 torch/_dynamo/utils.py                        |  6 ++--
 torch/_dynamo/variables/constant.py           |  4 +--
 torch/_dynamo/variables/nn_module.py          |  4 +--
 torch/_functorch/partitioners.py              |  5 +--
 torch/_inductor/codecache.py                  |  2 +-
 torch/_inductor/triton_ops/autotune.py        |  2 +-
 torch/_subclasses/fake_utils.py               |  2 +-
 torch/autograd/gradcheck.py                   |  8 ++---
 torch/cuda/__init__.py                        | 16 ++++-----
 torch/distributed/_composable/_ddp.py         |  4 +--
 torch/distributed/elastic/rendezvous/api.py   |  8 ++---
 .../elastic/rendezvous/etcd_rendezvous.py     |  4 +--
 torch/distributed/fsdp/_init_utils.py         |  4 +--
 torch/distributed/fsdp/_optim_utils.py        |  4 +--
 torch/distributed/optim/utils.py              |  5 +--
 .../optim/zero_redundancy_optimizer.py        |  4 +--
 .../pipeline/sync/skip/skippable.py           |  4 +--
 torch/distributed/rendezvous.py               |  4 +--
 torch/distributed/rpc/internal.py             |  2 +-
 torch/distributed/run.py                      |  6 ++--
 torch/jit/_dataclass_impls.py                 |  4 +--
 torch/jit/_recursive.py                       |  2 +-
 torch/profiler/profiler.py                    |  4 +--
 torch/testing/_comparison.py                  |  4 +--
 torch/testing/_internal/common_fsdp.py        |  2 +-
 torch/testing/_internal/common_utils.py       |  6 ++--
 .../distributed/_tensor/common_dtensor.py     |  4 +--
 torch/utils/data/datapipes/datapipe.py        |  4 +--
 torch/utils/data/datapipes/iter/callable.py   |  4 +--
 torch/utils/data/datapipes/map/combining.py   |  4 +--
 torch/utils/data/datapipes/map/grouping.py    |  4 +--
 torch/utils/data/datapipes/utils/decoder.py   | 10 +++---
 torch/utils/data/datapipes/utils/snapshot.py  |  4 +--
 torchgen/gen_backend_stubs.py                 |  4 +--
 torchgen/gen_lazy_tensor.py                   |  4 +--
 torchgen/model.py                             |  4 +--
 63 files changed, 150 insertions(+), 147 deletions(-)

diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 63b726c4bc741..ba4ca471e8f4b 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1001,8 +1001,8 @@ def validate_model(self, model, example_inputs):
 
         try:
             self.model_iter_fn(model, example_inputs)
-        except Exception:
-            raise NotImplementedError("Eager model failed to run")
+        except Exception as e:
+            raise NotImplementedError("Eager model failed to run") from e
 
     def maybe_cast(self, model, example_inputs):
         model = copy.deepcopy(model)
diff --git a/benchmarks/functional_autograd_benchmark/torchaudio_models.py b/benchmarks/functional_autograd_benchmark/torchaudio_models.py
index 8512028fbad0d..1e568d1d01f04 100644
--- a/benchmarks/functional_autograd_benchmark/torchaudio_models.py
+++ b/benchmarks/functional_autograd_benchmark/torchaudio_models.py
@@ -330,8 +330,9 @@ def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
         super(TransformerModel, self).__init__()
         try:
             from torch.nn import TransformerEncoder, TransformerEncoderLayer
-        except Exception:
-            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
+        except Exception as e:
+            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or '
+                              'lower.') from e
         self.model_type = 'Transformer'
         self.src_mask = None
         self.pos_encoder = PositionalEncoding(ninp, dropout)
diff --git a/caffe2/python/caffe_translator.py b/caffe2/python/caffe_translator.py
index e0aebaf7b24ea..63b5706120ac0 100644
--- a/caffe2/python/caffe_translator.py
+++ b/caffe2/python/caffe_translator.py
@@ -210,9 +210,9 @@ def TranslateLayer(cls, layer, pretrained_blobs, is_test, **kwargs):
         try:
             caffe_ops, params = cls.registry_[layer.type](
                 layer, pretrained_blobs, is_test, **kwargs)
-        except KeyError:
+        except KeyError as e:
             raise KeyError('No translator registered for layer: %s yet.' %
-                           str(layer))
+                           str(layer)) from e
         if caffe_ops is None:
             caffe_ops = []
         if type(caffe_ops) is not list:
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index dc9e74f87ad22..9c2efe282f136 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -970,7 +970,7 @@ def DoGradientAccumulation(self, fwd_op_idx):
                         input_name,
                         err
                     )
-                )
+                ) from err
 
             # Finally, let's create the sum operator.
             sum_ops, g = self._MakeSumOps(input_name, input_version)
@@ -1175,7 +1175,7 @@ def GetGradientForOp(cls, op, g_output):
                 raise Exception(
                     "Exception when creating gradient for [{}]:{}.\nOp: \n{}".
                     format(op.type, e, str(op))
-                )
+                ) from e
 
         if gradient_ops is None:
             return [], g_input
diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py
index 5eb81d898b33f..18219d3923b42 100644
--- a/caffe2/python/model_helper.py
+++ b/caffe2/python/model_helper.py
@@ -540,8 +540,8 @@ def ExtractPredictorNet(
                 'StopGradient'
             ]
         )
-    except ValueError:
-        raise Exception("No ops with input={}".format(input_blobs))
+    except ValueError as e:
+        raise Exception("No ops with input={}".format(input_blobs)) from e
     try:
         last_op_with_output = max(
             [
@@ -549,8 +549,8 @@ def ExtractPredictorNet(
                 if output_blobs.intersection(ops[j].output)
             ]
         )
-    except ValueError:
-        raise Exception("No ops with output={}".format(output_blobs))
+    except ValueError as e:
+        raise Exception("No ops with output={}".format(output_blobs)) from e
 
     def validate_op(op):
         # Check that the op does not have is_test = 0 set. This is a common
diff --git a/caffe2/python/models/download.py b/caffe2/python/models/download.py
index 7e735c726568e..895f87a4e4501 100644
--- a/caffe2/python/models/download.py
+++ b/caffe2/python/models/download.py
@@ -69,10 +69,10 @@ def downloadFromURLToFile(url, filename, show_progress=True):
         print("")  # New line to fix for progress bar
     except HTTPError as e:
         raise Exception("Could not download model. [HTTP Error] {code}: {reason}."
-                        .format(code=e.code, reason=e.reason))
+                        .format(code=e.code, reason=e.reason)) from e
     except URLError as e:
         raise Exception("Could not download model. [URL Error] {reason}."
-                        .format(reason=e.reason))
+                        .format(reason=e.reason)) from e
 
 
 def getURLFromName(name, filename):
diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py b/caffe2/python/operator_test/roi_align_rotated_op_test.py
index ea835acead617..fcbcb555440bb 100644
--- a/caffe2/python/operator_test/roi_align_rotated_op_test.py
+++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py
@@ -150,9 +150,9 @@ def roialign_flip(m, axis):
             indexer = [slice(None)] * m.ndim
             try:
                 indexer[axis] = slice(None, None, -1)
-            except IndexError:
+            except IndexError as e:
                 raise ValueError("axis=%i is invalid for the %i-dimensional input array"
-                                 % (axis, m.ndim))
+                                 % (axis, m.ndim)) from e
             return m[tuple(indexer)]
 
         def roialign_ref(X, R):
diff --git a/caffe2/python/operator_test/video_input_op_test.py b/caffe2/python/operator_test/video_input_op_test.py
index f21f219bd90eb..24f9e57434d4f 100644
--- a/caffe2/python/operator_test/video_input_op_test.py
+++ b/caffe2/python/operator_test/video_input_op_test.py
@@ -13,8 +13,8 @@
 
 try:
     import lmdb
-except ImportError:
-    raise unittest.SkipTest("python-lmdb is not installed")
+except ImportError as e:
+    raise unittest.SkipTest("python-lmdb is not installed") from e
 
 
 class VideoInputOpTest(unittest.TestCase):
diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py
index 60353ac38a256..295b79feadca7 100644
--- a/caffe2/python/schema.py
+++ b/caffe2/python/schema.py
@@ -546,8 +546,8 @@ def __getattr__(self, item):
             raise AttributeError(item)
         try:
             return super(Struct, self).__getattribute__("fields")[item]
-        except KeyError:
-            raise AttributeError(item)
+        except KeyError as e:
+            raise AttributeError(item) from e
 
     def __setattr__(self, key, value):
         # Disable setting attributes after initialization to prevent false
diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py
index 0e304ca4fae30..aee27d6826fbd 100644
--- a/caffe2/python/trt/transform.py
+++ b/caffe2/python/trt/transform.py
@@ -29,8 +29,8 @@ def _get_output_shapes(output_value_infos):
 def check_gpu_():
     try:
         C.get_cuda_version()
-    except Exception as _:
-       raise Exception("TensorRT related functions require CUDA support")
+    except Exception as e:
+        raise Exception("TensorRT related functions require CUDA support") from e
 
 def convert_onnx_model_to_trt_op(onnx_model,
         max_batch_size=64,
diff --git a/setup.py b/setup.py
index e18eb16869a3f..1f069f4d4bdf9 100644
--- a/setup.py
+++ b/setup.py
@@ -446,8 +446,8 @@ def build_deps():
 def check_pydep(importname, module):
     try:
         importlib.import_module(importname)
-    except ImportError:
-        raise RuntimeError(missing_pydep.format(importname=importname, module=module))
+    except ImportError as e:
+        raise RuntimeError(missing_pydep.format(importname=importname, module=module)) from e
 
 
 class build_ext(setuptools.command.build_ext.build_ext):
diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py
index 6fafc8e8fdf4a..0a453efe8ffba 100644
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@@ -611,8 +611,8 @@ def _initialize_model(
     def _state_dict(model: Module, state_dict_type: str):
         try:
             enum_val = STATE_DICT_MAPPING[state_dict_type]
-        except KeyError:
-            raise ValueError(f"No state_dict type for {state_dict_type}")
+        except KeyError as e:
+            raise ValueError(f"No state_dict type for {state_dict_type}") from e
 
         with FSDP.state_dict_type(model, enum_val):
             return model.state_dict()
@@ -623,8 +623,8 @@ def _load_state_dict(
     ):
         try:
             enum_val = STATE_DICT_MAPPING[state_dict_type]
-        except KeyError:
-            raise ValueError(f"No state_dict for {state_dict_type}")
+        except KeyError as e:
+            raise ValueError(f"No state_dict for {state_dict_type}") from e
 
         with FSDP.state_dict_type(model, enum_val):
             return model.load_state_dict(state_dict, strict=True)
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index bcd9935432c79..403ede860a782 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -2598,7 +2598,7 @@ def test_nccl_timeout(self):
             try:
                 pg_gloo.barrier().wait()
             except Exception as e:
-                raise ValueError(f"Rank {self.rank} barrier timed out waiting for rank 0 with error: {str(e)}")
+                raise ValueError(f"Rank {self.rank} barrier timed out waiting for rank 0 with error: {str(e)}") from e
             # Now verify communicators on this rank have
             # been aborted by watchdog.
             self._wait_for_comm_abort(process_group, failed_collective_timeout)
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 94682526d6e8f..57fa5e0652709 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -101,8 +101,8 @@ def get_hf_bert(rank):
     # in a multiprocessing test
     try:
         from transformers import BertConfig, AutoModelForMaskedLM
-    except ImportError:
-        raise unittest.SkipTest("Unable to import transformers")
+    except ImportError as e:
+        raise unittest.SkipTest("Unable to import transformers") from e
 
     batch_size, max_length, config, device = 4, 512, BertConfig(), f"cuda:{rank}"
     model = AutoModelForMaskedLM.from_config(config).to(device)
diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index acb285a6cf26b..602f3ce63fa7b 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -1856,8 +1856,8 @@ def __init__(
             def __getattr__(self, item: str):
                 try:
                     return self.data[item]
-                except KeyError:
-                    raise AttributeError
+                except KeyError as e:
+                    raise AttributeError from e
 
         def tokenization(x):
             encoding = BatchEncoding({"key": x})
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 4b8c6dcbc7b80..a2a1cb096c12a 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -65,7 +65,7 @@
     sys.stderr.write(f"{type(e)}: {e}\n")
     if __name__ == "__main__":
         sys.exit(0)
-    raise unittest.SkipTest("requires sympy/functorch/filelock")
+    raise unittest.SkipTest("requires sympy/functorch/filelock") from e
 
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
diff --git a/test/jit/test_dtype_analysis.py b/test/jit/test_dtype_analysis.py
index af1a7f3b24f28..783a1b935d9b7 100644
--- a/test/jit/test_dtype_analysis.py
+++ b/test/jit/test_dtype_analysis.py
@@ -128,9 +128,9 @@ def assert_dtype_equal(self, fn, in_shapes, in_dtypes):
         inputs = [self.get_rand_tensor(s, d) for s, d in zip(in_shapes, in_dtypes)]
         try:
             self.assert_dtype_equal_custom_args(fn, inputs)
-        except Exception:
+        except Exception as e:
             fail_text = f"Failed for shapes {in_shapes}, and dtypes {in_dtypes}"
-            raise AssertionError(fail_text)
+            raise AssertionError(fail_text) from e
 
     def assert_dtype_equal_custom_args(self, fn, args):
         try:
diff --git a/test/lazy/test_extract_compiled_graph.py b/test/lazy/test_extract_compiled_graph.py
index b27a11bf49b61..0d916952be3b5 100644
--- a/test/lazy/test_extract_compiled_graph.py
+++ b/test/lazy/test_extract_compiled_graph.py
@@ -141,7 +141,7 @@ def verify_reusing_compiled_graph(mod, exception_msg_pattern, ncase=10):
             raise e  # reraise the exception
         exception_message = str(e)
         if not re.search(exception_msg_pattern, exception_message):
-            raise RuntimeError(f"Exception message does not match the required pattern: {exception_message}")
+            raise RuntimeError(f"Exception message does not match the required pattern: {exception_message}") from e
         else:
             # We are done for the test case that expects an exception
             return
diff --git a/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py
index 193b87af3d284..9695d05b6072f 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime_cuda.py
@@ -110,8 +110,8 @@ def forward(self, x):
 
         try:
             from apex import amp
-        except Exception:
-            raise unittest.SkipTest("Apex is not available")
+        except Exception as e:
+            raise unittest.SkipTest("Apex is not available") from e
         input = torch.randn(3, 3, device=torch.device("cuda"))
         model = amp.initialize(LinearModel(), opt_level="O2")
         self.run_test(model, input)
diff --git a/test/test_autograd.py b/test/test_autograd.py
index 9c3d717114659..d30614bbad6af 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -1703,7 +1703,7 @@ def coro_enable_grad(n=10):
                     self.assertTrue(torch.is_grad_enabled())
                     yield (-i if has_raised else i)
 
-                except UnrecoverableException:
+                except UnrecoverableException :
                     self.assertTrue(torch.is_grad_enabled())
                     raise SecondaryException
 
diff --git a/test/test_fx.py b/test/test_fx.py
index 0aff631b8e814..bd92fbbedceb9 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -3805,7 +3805,7 @@ def test_class_member_back_compat(self):
                   f"unintended, please revert it. If it was intended, check with the FX " \
                   f"team to ensure that the proper deprecation protocols have been followed " \
                   f"and subsequently --accept the change."
-            raise AssertionError(msg)
+            raise AssertionError(msg) from e
 
     def test_public_api_surface(self):
         non_back_compat_objects = {}
diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 19585296870ba..6839837218117 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -597,7 +597,7 @@ def apply(fn):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), op.__name__, device])
-                )
+                ) from e
 
     def test_minmax_int_ops(self):
         def apply(fn):
@@ -627,7 +627,7 @@ def apply(fn):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), op.__name__, device])
-                )
+                ) from e
 
     def test_comparison_eq_ne(self):
         for device in self.devices:
@@ -1288,7 +1288,7 @@ def fn(input_v, mask):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(self_dtype), op.__name__, device, str(size)])
-                )
+                ) from e
 
     def test_isnan(self):
         x = torch.rand([4])
@@ -1321,7 +1321,7 @@ def test_isnan(self):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), 'isnan', device])
-                )
+                ) from e
 
     def test_gelu(self):
         def apply(fn):
@@ -1352,7 +1352,7 @@ def apply(fn):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), op.__name__, device, str(size)])
-                )
+                ) from e
 
     def test_unary_ops(self):
         with torch._jit_internal._disable_emit_hooks():
@@ -1435,7 +1435,7 @@ def apply(fn):
                 except Exception as e:
                     raise RuntimeError(
                         " ".join(["Failed:", str(dtype), op.__name__, device, str(size)])
-                    )
+                    ) from e
 
     def test_binary_ops(self):
         def apply(fn):
@@ -1488,7 +1488,7 @@ def apply(fn):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), op.__name__, device])
-                )
+                ) from e
 
     def test_binary_scalar_ops(self):
         def apply(fn):
@@ -1534,7 +1534,7 @@ def apply(fn):
             try:
                 k = torch._C._te.TensorExprKernel(graph)
             except Exception as e:
-                raise RuntimeError(" ".join(["Compilation failed:", device, str(code)]))
+                raise RuntimeError(" ".join(["Compilation failed:", device, str(code)])) from e
 
             # Run the graph
             for x, y in product(values[dtype_x], values[dtype_y]):
@@ -1543,7 +1543,7 @@ def apply(fn):
                     res = k.run((x, y))
                     self.assertEqual(ref, res)
                 except Exception as e:
-                    raise RuntimeError(" ".join(["Failed at runtime:", device, str(x), str(y), str(code)]))
+                    raise RuntimeError(" ".join(["Failed at runtime:", device, str(x), str(y), str(code)])) from e
 
     def test_matmul(self):
         if self.dynamic_shapes:
@@ -1599,7 +1599,7 @@ def fn(x, y):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), device])
-                )
+                ) from e
 
     def test_binary_tensor_scalar_ops(self):
         with torch._jit_internal._disable_emit_hooks():
@@ -1643,7 +1643,7 @@ def apply_with_scalar(fn, scalar):
                 except Exception as e:
                     raise RuntimeError(
                         " ".join(["Failed:", str(dtype), op.__name__, device])
-                    )
+                    ) from e
 
     def test_binary_div_ops(self):
         def apply_with_scalar(fn, scalar):
@@ -1676,7 +1676,7 @@ def apply_with_scalar(fn, scalar):
             except Exception as e:
                 raise RuntimeError(
                     "Failed: {} {} {} {}".format(dtype, op.__name__, device, scalar)
-                )
+                ) from e
 
     def test_binary_pow(self):
         def apply_with_scalar(fn, scalar):
@@ -1714,7 +1714,7 @@ def apply_with_scalar(fn, scalar):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), op.__name__, device])
-                )
+                ) from e
 
     def test_ternary_ops(self):
         def apply(fn):
@@ -1746,7 +1746,7 @@ def apply(fn):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), op.__name__, device])
-                )
+                ) from e
 
     def test_ternary_norm_ops(self):
         def apply(fn):
@@ -1777,7 +1777,7 @@ def apply(fn):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), op.__name__, device])
-                )
+                ) from e
 
 
     @unittest.skip("FIXME: fuser doesn't include ListConstruct nodes to the group causing a failure")
@@ -1810,7 +1810,7 @@ def apply(fn):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), op.__name__, device])
-                )
+                ) from e
 
     def test_where_ops(self):
         def apply(fn):
@@ -1843,7 +1843,7 @@ def apply(fn):
             except Exception as e:
                 raise RuntimeError(
                     " ".join(["Failed:", str(dtype), op.__name__, device])
-                )
+                ) from e
 
     def test_unsupported_dtypes(self):
         for device in self.devices:
diff --git a/test/test_meta.py b/test/test_meta.py
index 99c2049473c6d..6e10917e05e41 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -467,7 +467,7 @@ def run_meta_crossref(
         # they're not tested outside of gradcheck which only checks
         # torch.float64 and torch.complex128 (which this second one
         # often skipped as well).
-        raise unittest.SkipTest("Original OpInfo is broken")
+        raise unittest.SkipTest("Original OpInfo is broken") from e
 
 
     # TODO: also handle cases where func raise an exception
diff --git a/tools/gen_vulkan_glsl.py b/tools/gen_vulkan_glsl.py
index 7e101545e097c..6d89da0c743cb 100644
--- a/tools/gen_vulkan_glsl.py
+++ b/tools/gen_vulkan_glsl.py
@@ -28,13 +28,13 @@ def construct_mapping(self, node, deep=False):  # type: ignore[no-untyped-def]
             key = self.construct_object(key_node, deep=deep)  # type: ignore[no-untyped-call]
             try:
                 hash(key)
-            except TypeError:
+            except TypeError as e:
                 raise ConstructorError(
                     "while constructing a mapping",
                     node.start_mark,
                     "found unacceptable key ",
                     key_node.start_mark,
-                )
+                ) from e
             # check for duplicate keys
             if key in mapping:
                 raise ConstructorError(
diff --git a/tools/render_junit.py b/tools/render_junit.py
index 95c281d99d492..0d6effbd09063 100644
--- a/tools/render_junit.py
+++ b/tools/render_junit.py
@@ -12,10 +12,10 @@
         TestCase,
         TestSuite,
     )
-except ImportError:
+except ImportError as e:
     raise ImportError(
         "junitparser not found, please install with 'pip install junitparser'"
-    )
+    ) from e
 
 try:
     import rich
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index c39318b99d2c7..3e70f59583739 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -364,7 +364,7 @@ def helper_for_dump_minify(contents):
             fd.write(contents)
     except OSError as e:
         log.exception(e)
-        raise NotImplementedError("Could not write to {minified_repro_path}")
+        raise NotImplementedError("Could not write to {minified_repro_path}") from e
 
 
 def dump_to_minify(gm, args, compiler_name: str):
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 3585a659dacc6..192437b738dad 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -283,7 +283,7 @@ def wrapper(self: "InstructionTranslatorBase", inst: Instruction):
                 if self.has_backedge():
                     msg = "Skipping frame because there is a graph break in a for/while loop"
                     log.debug(msg)
-                    raise exc.SkipFrame(msg)
+                    raise exc.SkipFrame(msg) from excp
 
                 if not self.should_compile_partial_graph():
                     raise
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index fbdf411190543..a2cc37abd6630 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -348,13 +348,13 @@ def proxy_args_kwargs(args, kwargs):
         proxy_args = tuple(arg.as_proxy() for arg in args)
         proxy_kwargs = {key: arg.as_proxy() for key, arg in kwargs.items()}
         return proxy_args, proxy_kwargs
-    except NotImplementedError:
+    except NotImplementedError as e:
         from .exc import unimplemented
         from .variables.base import typestr
 
         raise unimplemented(
             f"call_function args: {typestr(*args)} {typestr(*list(kwargs.values()))}"
-        )
+        ) from e
 
 
 @dataclasses.dataclass
@@ -745,7 +745,7 @@ def wrap_fake_exception(fn):
 
         msg = f"Unsupported: {e.reason} with fake tensor propagation."
         log.warning(msg)
-        raise unimplemented(msg)
+        raise unimplemented(msg) from e
 
 
 def wrap_to_fake_tensor(e, fake_mode):
diff --git a/torch/_dynamo/variables/constant.py b/torch/_dynamo/variables/constant.py
index 63eed37ccbec2..d42760fc26864 100644
--- a/torch/_dynamo/variables/constant.py
+++ b/torch/_dynamo/variables/constant.py
@@ -56,8 +56,8 @@ def unpack_var_sequence(self, tx):
         try:
             options = VariableTracker.propagate([self])
             return [ConstantVariable(x, **options) for x in self.as_python_constant()]
-        except TypeError:
-            raise NotImplementedError()
+        except TypeError as e:
+            raise NotImplementedError from e
 
     def const_getattr(self, tx, name):
         member = getattr(self.value, name)
diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 27dd9d6c067d8..972e21473744b 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -504,8 +504,8 @@ def unpack_var_sequence(self, tx):
 
         try:
             fn = inspect.getattr_static(self.value_type, "__iter__")
-        except AttributeError:
-            raise NotImplementedError()
+        except AttributeError as e:
+            raise NotImplementedError from e
 
         if fn in (
             torch.nn.ModuleList.__iter__,
diff --git a/torch/_functorch/partitioners.py b/torch/_functorch/partitioners.py
index 712c9a063eaf6..4e3b7a80c1b02 100644
--- a/torch/_functorch/partitioners.py
+++ b/torch/_functorch/partitioners.py
@@ -277,8 +277,9 @@ def min_cut_rematerialization_partition(
     """
     try:
         import networkx as nx
-    except ImportError:
-        raise RuntimeError("Need networkx installed to perform smart recomputation heuristics")
+    except ImportError as e:
+        raise RuntimeError("Need networkx installed to perform smart recomputation "
+                           "heuristics") from e
 
     joint_module.graph.eliminate_dead_code()
     joint_module.recompile()
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index f4297732b38cd..70d6e58ad57e0 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -435,7 +435,7 @@ def load(cls, source_code):
                     try:
                         subprocess.check_output(cmd, stderr=subprocess.STDOUT)
                     except subprocess.CalledProcessError as e:
-                        raise exc.CppCompileError(cmd, e.output)
+                        raise exc.CppCompileError(cmd, e.output) from e
 
                 cls.cache[key] = cls._load_library(output_path)
                 cls.cache[key].key = key
diff --git a/torch/_inductor/triton_ops/autotune.py b/torch/_inductor/triton_ops/autotune.py
index 285995c6254fa..a61927eb01885 100644
--- a/torch/_inductor/triton_ops/autotune.py
+++ b/torch/_inductor/triton_ops/autotune.py
@@ -184,7 +184,7 @@ def run(self, *args, grid, stream):
                 raise RuntimeError(
                     """Consider updating Triton with
 `pip install -U "git+https://github.com/openai/triton@af76c989eb4799b015f8b288ccd8421558772e56#subdirectory=python"`"""
-                )
+                ) from e
             else:
                 raise e
 
diff --git a/torch/_subclasses/fake_utils.py b/torch/_subclasses/fake_utils.py
index d23b12ca84409..6cd3789ae5a08 100644
--- a/torch/_subclasses/fake_utils.py
+++ b/torch/_subclasses/fake_utils.py
@@ -136,5 +136,5 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
                             r_out, fake_out, check_strides=self.check_strides
                         )
                     except Exception as e:
-                        raise RuntimeError(f"Mismatch on {func}: {e}")
+                        raise RuntimeError(f"Mismatch on {func}: {e}") from e
         return r
diff --git a/torch/autograd/gradcheck.py b/torch/autograd/gradcheck.py
index bc9274eaefaf4..9f9a80ed50931 100644
--- a/torch/autograd/gradcheck.py
+++ b/torch/autograd/gradcheck.py
@@ -819,7 +819,7 @@ def jvp(tangent: torch.Tensor):
         except RuntimeError as ex:
             # Rethrow to provide a better error message
             raise GradcheckError(
-                f'While computing batched gradients, got: {ex}\n\n{FAILED_BATCHED_GRAD_MSG_FWD_AD}')
+                f'While computing batched gradients, got: {ex}\n\n{FAILED_BATCHED_GRAD_MSG_FWD_AD}') from ex
 
         for input_idx, (res, exp) in enumerate(zip(result, expected)):
             if torch.allclose(res, exp):
@@ -861,7 +861,7 @@ def vjp(v):
             # autograd.grad instead of the C++ traceback of what line in the
             # backward formula
             raise GradcheckError(
-                f'While computing batched gradients, got: {ex}\n\n{FAILED_BATCHED_GRAD_MSG}')
+                f'While computing batched gradients, got: {ex}\n\n{FAILED_BATCHED_GRAD_MSG}') from ex
 
     for input_idx, (res, exp) in enumerate(zip(result, expected)):
         if torch.allclose(res, exp):
@@ -977,12 +977,12 @@ def check_undefined_grad_support(output_to_check):
         try:
             grads_input = torch.autograd.grad(output_to_check, diff_input_list,
                                               grads_output, allow_unused=True)
-        except RuntimeError:
+        except RuntimeError as e:
             warn_bc_breaking()
             raise GradcheckError((
                 'Expected backward function to handle undefined output grads. '
                 'Please look at "Notes about undefined output gradients" in '
-                '"tools/autograd/derivatives.yaml"'))
+                '"tools/autograd/derivatives.yaml"')) from e
 
         for gi, i in zip(grads_input, diff_input_list):
             if (gi is not None) and (not gi.eq(0).all()):
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 1eb6c70ab7b80..59afb8cef9e0b 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -669,13 +669,13 @@ def memory_usage(device: Optional[Union[Device, int]] = None) -> int:
     """
     try:
         import pynvml  # type: ignore[import]
-    except ModuleNotFoundError:
-        raise ModuleNotFoundError("pynvml module not found, please install pynvml")
+    except ModuleNotFoundError as e:
+        raise ModuleNotFoundError("pynvml module not found, please install pynvml") from e
     from pynvml import NVMLError_DriverNotLoaded
     try:
         pynvml.nvmlInit()
-    except NVMLError_DriverNotLoaded:
-        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?")
+    except NVMLError_DriverNotLoaded as e:
+        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?") from e
     device = _get_device_index(device, optional=True)
     handle = pynvml.nvmlDeviceGetHandleByIndex(device)
     return pynvml.nvmlDeviceGetUtilizationRates(handle).memory
@@ -695,13 +695,13 @@ def utilization(device: Optional[Union[Device, int]] = None) -> int:
     """
     try:
         import pynvml  # type: ignore[import]
-    except ModuleNotFoundError:
-        raise ModuleNotFoundError("pynvml module not found, please install pynvml")
+    except ModuleNotFoundError as e:
+        raise ModuleNotFoundError("pynvml module not found, please install pynvml") from e
     from pynvml import NVMLError_DriverNotLoaded
     try:
         pynvml.nvmlInit()
-    except NVMLError_DriverNotLoaded:
-        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?")
+    except NVMLError_DriverNotLoaded as e:
+        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?") from e
     device = _get_device_index(device, optional=True)
     handle = pynvml.nvmlDeviceGetHandleByIndex(device)
     return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
diff --git a/torch/distributed/_composable/_ddp.py b/torch/distributed/_composable/_ddp.py
index 9e94ec3d53cde..4fa1c33210493 100644
--- a/torch/distributed/_composable/_ddp.py
+++ b/torch/distributed/_composable/_ddp.py
@@ -1640,10 +1640,10 @@ def _register_fused_optim(
         )
         try:
             overlapped_optim.register_ddp(self)
-        except NotImplementedError:
+        except NotImplementedError as e:
             raise RuntimeError(
                 f"{optim} does not support overlapped DDP. Please file an issue to PyTorch or the respective owner of {optim}."
-            )
+            ) from e
 
     def _distributed_broadcast_coalesced(
         self, tensors, buffer_size, authoritative_rank=0
diff --git a/torch/distributed/elastic/rendezvous/api.py b/torch/distributed/elastic/rendezvous/api.py
index 38b226b9863de..52c8d6152eb79 100644
--- a/torch/distributed/elastic/rendezvous/api.py
+++ b/torch/distributed/elastic/rendezvous/api.py
@@ -195,11 +195,11 @@ def get_as_int(self, key: str, default: Optional[int] = None) -> Optional[int]:
             return value
         try:
             return int(value)
-        except ValueError:
+        except ValueError as e:
             raise ValueError(
                 f"The rendezvous configuration option '{key}' does not represent a valid integer "
                 "value."
-            )
+            ) from e
 
 
 RendezvousHandlerCreator = Callable[[RendezvousParameters], RendezvousHandler]
@@ -244,11 +244,11 @@ def create_handler(self, params: RendezvousParameters) -> RendezvousHandler:
         """Creates a new :py:class:`RendezvousHandler`."""
         try:
             creator = self._registry[params.backend]
-        except KeyError:
+        except KeyError as e:
             raise ValueError(
                 f"The rendezvous backend '{params.backend}' is not registered. Did you forget "
                 f"to call `{self.register.__name__}`?"
-            )
+            ) from e
 
         handler = creator(params)
 
diff --git a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
index a7b682ccc89fa..8a711bdb2fe30 100644
--- a/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
+++ b/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
@@ -464,10 +464,10 @@ def try_create_rendezvous(self):
             version_counter = self.client.get(self.get_path("/rdzv/version_counter"))
             version_counter.value = str(int(version_counter.value) + 1)
             self.client.update(version_counter)
-        except (etcd.EtcdKeyNotFound, etcd.EtcdCompareFailed):
+        except (etcd.EtcdKeyNotFound, etcd.EtcdCompareFailed) as e:
             raise RendezvousError(
                 "Unexpected state of EtcdRendezvousHandler, worker needs to die."
-            )
+            ) from e
 
         # Any failure below results in declaring a retryable rendezvous failure.
         # The ephemeral /rdzv/active_version will expire and someone can then
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 2fe9b87f2a79d..7712015b93af4 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -381,8 +381,8 @@ def _get_ignored_modules(
     msg_prefix = "`ignored_modules` should be an iterable of `torch.nn.Module`s "
     try:
         ignored_root_modules = set(_ignored_modules)
-    except TypeError:
-        raise TypeError(msg_prefix + f"but got {type(_ignored_modules)}")
+    except TypeError as e:
+        raise TypeError(msg_prefix + f"but got {type(_ignored_modules)}") from e
     for module in ignored_root_modules:
         if not isinstance(module, torch.nn.Module):
             raise TypeError(msg_prefix + f"but got an iterable with {type(module)}")
diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index d8cceb95a3b91..a81c05d2bd89a 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -1015,11 +1015,11 @@ def _get_param_id_to_param_from_optim_input(
         return list(model.parameters())
     try:
         params = list(optim_input)
-    except TypeError:
+    except TypeError as e:
         raise TypeError(
             "Optimizer input should be an iterable of Tensors or dicts, "
             f"but got {optim_input}"
-        )
+        ) from e
     if len(params) == 0:
         raise ValueError("Optimizer input should not be empty")
 
diff --git a/torch/distributed/optim/utils.py b/torch/distributed/optim/utils.py
index 7561a33f609af..990a020daee23 100644
--- a/torch/distributed/optim/utils.py
+++ b/torch/distributed/optim/utils.py
@@ -44,8 +44,9 @@ def register_functional_optim(key, optim):
 def as_functional_optim(optim_cls: Type, *args, **kwargs):
     try:
         functional_cls = functional_optim_map[optim_cls]
-    except KeyError:
-        raise ValueError(f"Optimizer {optim_cls} does not have a functional counterpart!")
+    except KeyError as e:
+        raise ValueError(f"Optimizer {optim_cls} does not have a functional "
+                         f"counterpart!") from e
 
     return _create_functional_optim(functional_cls, *args, **kwargs)
 
diff --git a/torch/distributed/optim/zero_redundancy_optimizer.py b/torch/distributed/optim/zero_redundancy_optimizer.py
index 199b8c794a9aa..f5f526d69edff 100644
--- a/torch/distributed/optim/zero_redundancy_optimizer.py
+++ b/torch/distributed/optim/zero_redundancy_optimizer.py
@@ -1323,9 +1323,9 @@ def _verify_and_init_params(
                             f"Tensors, but got {torch.typename(params)}")
         try:
             all_params = list(params)
-        except TypeError:
+        except TypeError as e:
             raise TypeError("`params` argument should be an iterable of Tensors"
-                            f" or dicts, but got {torch.typename(params)}")
+                            f" or dicts, but got {torch.typename(params)}") from e
         if len(all_params) == 0:
             raise ValueError("ZeroRedundancyOptimizer got an empty parameter "
                              "list")
diff --git a/torch/distributed/pipeline/sync/skip/skippable.py b/torch/distributed/pipeline/sync/skip/skippable.py
index 630468359307e..14fcec08266e2 100644
--- a/torch/distributed/pipeline/sync/skip/skippable.py
+++ b/torch/distributed/pipeline/sync/skip/skippable.py
@@ -198,8 +198,8 @@ def forward(self, input: Union[List[Any], Tensor]) -> TensorOrTensors:
         for ns, name in self.poppable():
             try:
                 poppable_tensors[name] = skip_tracker.load(batch, ns, name)
-            except KeyError:
-                raise RuntimeError(f"'{name}' has not been stashed")
+            except KeyError as e:
+                raise RuntimeError(f"'{name}' has not been stashed") from e
         input = batch.values
 
         # Handle skip commands.
diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py
index 0d2edb86e4d41..3e3607b3f390c 100644
--- a/torch/distributed/rendezvous.py
+++ b/torch/distributed/rendezvous.py
@@ -1,9 +1,9 @@
 try:
     from urllib.parse import urlparse, urlunparse
-except ImportError:
+except ImportError as e:
     raise ImportError(
         "urllib cannot be found, urlparse from python2 is no longer supported."
-    )
+    ) from e
 
 import numbers
 import os
diff --git a/torch/distributed/rpc/internal.py b/torch/distributed/rpc/internal.py
index 435fd29557d0a..dd1e805ca142b 100644
--- a/torch/distributed/rpc/internal.py
+++ b/torch/distributed/rpc/internal.py
@@ -228,7 +228,7 @@ def _handle_exception(result):
             raise RuntimeError(  # noqa: B904
                 f"Failed to create original exception type. Error msg was {str(e)}"
                 f" Original exception on remote side was {exception_msg}"
-            )
+            ) from e
 
         if exc is not None:
             raise exc
diff --git a/torch/distributed/run.py b/torch/distributed/run.py
index 2495f09d4a5cb..7c30454489678 100644
--- a/torch/distributed/run.py
+++ b/torch/distributed/run.py
@@ -605,13 +605,13 @@ def determine_local_world_size(nproc_per_node: str):
     try:
         logging.info(f"Using nproc_per_node={nproc_per_node}.")
         return int(nproc_per_node)
-    except ValueError:
+    except ValueError as e:
         if nproc_per_node == "cpu":
             num_proc = os.cpu_count()
             device_type = "cpu"
         elif nproc_per_node == "gpu":
             if not torch.cuda.is_available():
-                raise ValueError("Cuda is not available.")
+                raise ValueError("Cuda is not available.") from e
             device_type = "gpu"
             num_proc = torch.cuda.device_count()
         elif nproc_per_node == "auto":
@@ -622,7 +622,7 @@ def determine_local_world_size(nproc_per_node: str):
                 num_proc = os.cpu_count()
                 device_type = "cpu"
         else:
-            raise ValueError(f"Unsupported nproc_per_node value: {nproc_per_node}")
+            raise ValueError(f"Unsupported nproc_per_node value: {nproc_per_node}") from e
 
         log.info(
             f"Using nproc_per_node={nproc_per_node},"
diff --git a/torch/jit/_dataclass_impls.py b/torch/jit/_dataclass_impls.py
index c58198d240a98..4daf347db2b34 100644
--- a/torch/jit/_dataclass_impls.py
+++ b/torch/jit/_dataclass_impls.py
@@ -20,13 +20,13 @@ def compose_fn(cls, name: str, body_lines: List[str], signature: str) -> ParsedD
     # Parse the function declaration
     try:
         py_ast = ast.parse(decl)
-    except SyntaxError:
+    except SyntaxError as e:
         # This should only happen if there's some unforeseeable change
         # in the dataclasses module that makes our synthesized code fail
         raise RuntimeError(
             f"TorchScript failed to synthesize dataclass method '{name}' for class '{cls.__name__}'. "
             "Please file a bug report at <https://github.com/pytorch/pytorch/issues>"
-        )
+        ) from e
     fake_filename = _get_fake_filename(cls, name)
     # Parse the function
     return ParsedDef(
diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py
index e2717c78ab7e1..bbe7ff98cface 100644
--- a/torch/jit/_recursive.py
+++ b/torch/jit/_recursive.py
@@ -185,7 +185,7 @@ def infer_type(name, item):
         except RuntimeError as re:
             raise RuntimeError(
                 "Error inferring type for {name}: {item}: {re}".format(name=name, item=item, re=re)
-            )
+            ) from re
 
         return attr_type, inferred
 
diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py
index 31b85eb26f0fe..85d957677ce2d 100644
--- a/torch/profiler/profiler.py
+++ b/torch/profiler/profiler.py
@@ -292,8 +292,8 @@ def handler_fn(prof) -> None:
         if not os.path.isdir(dir_name):
             try:
                 os.makedirs(dir_name, exist_ok=True)
-            except Exception:
-                raise RuntimeError("Can't create directory: " + dir_name)
+            except Exception as e:
+                raise RuntimeError("Can't create directory: " + dir_name) from e
         if not worker_name:
             worker_name = "{}_{}".format(socket.gethostname(), str(os.getpid()))
         file_name = "{}.{}.pt.trace.json".format(worker_name, int(time.time() * 1000))
diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py
index 664aadd7e75e0..b76bb68d945aa 100644
--- a/torch/testing/_comparison.py
+++ b/torch/testing/_comparison.py
@@ -677,8 +677,8 @@ def _to_tensor(self, tensor_like: Any) -> torch.Tensor:
 
         try:
             return torch.as_tensor(tensor_like)
-        except Exception:
-            raise UnsupportedInputs()
+        except Exception as e:
+            raise UnsupportedInputs() from e
 
     def _check_supported(self, tensor: torch.Tensor, *, id: Tuple[Any, ...]) -> None:
         if tensor.layout not in {
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 5cd3d326e0e3a..2fd9957e6e202 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -949,7 +949,7 @@ def _test_fsdp_parity(
                 **init_kwargs,
             )
         except Exception as e:
-            raise ValueError(f"Initializing {model_class} raised error {str(e)}")
+            raise ValueError(f"Initializing {model_class} raised error {str(e)}") from e
         if not isinstance(fsdp_model, FSDP):
             # Enforce that we wrap with top-level FSDP since we are comparing
             # assuming a data parallel reference and some test models may not
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 434d2855c3929..87eb9280407ec 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -3260,7 +3260,7 @@ def wrapper(*args, **kwargs):
                 if any(connect_error in str(error) for connect_error in connect_errors):
                     tries_remaining -= 1
                     if tries_remaining == 0:
-                        raise RuntimeError(f"Failing after {n_retries} retries with error: {str(error)}")
+                        raise RuntimeError(f"Failing after {n_retries} retries with error: {str(error)}") from error
                     time.sleep(random.random())
                     continue
                 raise
@@ -4001,8 +4001,8 @@ def first_sample(self: unittest.TestCase, samples: Iterable[T]) -> T:
     """
     try:
         return next(iter(samples))
-    except StopIteration:
-        raise unittest.SkipTest('Skipped! Need at least 1 sample input')
+    except StopIteration as e:
+        raise unittest.SkipTest('Skipped! Need at least 1 sample input') from e
 
 # this helper method is to recursively
 # clone the tensor-type input of operators tested by OpInfo
diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
index cf2abe0ee8d27..5daf77a48b044 100644
--- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py
+++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
@@ -290,8 +290,8 @@ def __next__(self) -> Tuple[Tuple[object, ...], Dict[str, object]]:
                 tree_unflatten(new_args, self.flatten_args_spec),
                 tree_unflatten(new_kwargs, self.flatten_kwargs_spec),
             )
-        except StopIteration:
-            raise StopIteration
+        except StopIteration as e:
+            raise StopIteration from e
 
     def to_dist_tensor(
         self, t: torch.Tensor, mesh: DeviceMesh, placements: List[Placement]
diff --git a/torch/utils/data/datapipes/datapipe.py b/torch/utils/data/datapipes/datapipe.py
index 42120148d0269..4463b0221b438 100644
--- a/torch/utils/data/datapipes/datapipe.py
+++ b/torch/utils/data/datapipes/datapipe.py
@@ -349,10 +349,10 @@ def __setstate__(self, state):
     def __len__(self):
         try:
             return len(self._datapipe)
-        except Exception:
+        except Exception as e:
             raise TypeError(
                 "{} instance doesn't have valid length".format(type(self).__name__)
-            )
+            ) from e
 
 
 class _IterDataPipeSerializationWrapper(_DataPipeSerializationWrapper, IterDataPipe):
diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py
index f0f91dee34b46..f8af43121784f 100644
--- a/torch/utils/data/datapipes/iter/callable.py
+++ b/torch/utils/data/datapipes/iter/callable.py
@@ -154,8 +154,8 @@ def _collate_helper(conversion, item):
             try:
                 import torcharrow.pytorch as tap  # type: ignore[import]
                 collation_fn = tap.rec.Default()
-            except Exception:
-                raise Exception("unable to import default collation function from the TorchArrow")
+            except Exception as e:
+                raise Exception("unable to import default collation function from the TorchArrow") from e
 
         tuple_names.append(str(name))
         value = collation_fn(df[name])
diff --git a/torch/utils/data/datapipes/map/combining.py b/torch/utils/data/datapipes/map/combining.py
index 694ce25c695dc..aa3dac03e79aa 100644
--- a/torch/utils/data/datapipes/map/combining.py
+++ b/torch/utils/data/datapipes/map/combining.py
@@ -93,8 +93,8 @@ def __getitem__(self, index) -> Tuple[T_co, ...]:
         for dp in self.datapipes:
             try:
                 res.append(dp[index])
-            except IndexError:
-                raise IndexError(f"Index {index} is out of range for one of the input MapDataPipes {dp}.")
+            except IndexError as e:
+                raise IndexError(f"Index {index} is out of range for one of the input MapDataPipes {dp}.") from e
         return tuple(res)
 
     def __len__(self) -> int:
diff --git a/torch/utils/data/datapipes/map/grouping.py b/torch/utils/data/datapipes/map/grouping.py
index 10578f2222bbb..443c088557a12 100644
--- a/torch/utils/data/datapipes/map/grouping.py
+++ b/torch/utils/data/datapipes/map/grouping.py
@@ -53,11 +53,11 @@ def __getitem__(self, index) -> DataChunk:
             for i in indices:
                 batch.append(self.datapipe[i])
             return self.wrapper_class(batch)
-        except IndexError:
+        except IndexError as e:
             if not self.drop_last and len(batch) > 0:
                 return self.wrapper_class(batch)
             else:
-                raise IndexError(f"Index {index} is out of bound.")
+                raise IndexError(f"Index {index} is out of bound.") from e
 
     def __len__(self) -> int:
         if self.length is not None:
diff --git a/torch/utils/data/datapipes/utils/decoder.py b/torch/utils/data/datapipes/utils/decoder.py
index da74516ca919a..fe3f4b8502d01 100644
--- a/torch/utils/data/datapipes/utils/decoder.py
+++ b/torch/utils/data/datapipes/utils/decoder.py
@@ -148,13 +148,13 @@ def __call__(self, extension, data):
             import numpy as np
         except ImportError as e:
             raise ModuleNotFoundError("Package `numpy` is required to be installed for default image decoder."
-                                      "Please use `pip install numpy` to install the package")
+                                      "Please use `pip install numpy` to install the package") from e
 
         try:
             import PIL.Image
         except ImportError as e:
             raise ModuleNotFoundError("Package `PIL` is required to be installed for default image decoder."
-                                      "Please use `pip install Pillow` to install the package")
+                                      "Please use `pip install Pillow` to install the package") from e
 
         imagespec = self.imagespec
         atype, etype, mode = imagespecs[imagespec]
@@ -200,7 +200,7 @@ def videohandler(extension, data):
     except ImportError as e:
         raise ModuleNotFoundError("Package `torchvision` is required to be installed for default video file loader."
                                   "Please use `pip install torchvision` or `conda install torchvision -c pytorch`"
-                                  "to install the package")
+                                  "to install the package") from e
 
     with tempfile.TemporaryDirectory() as dirname:
         fname = os.path.join(dirname, f"file.{extension}")
@@ -221,7 +221,7 @@ def audiohandler(extension, data):
     except ImportError as e:
         raise ModuleNotFoundError("Package `torchaudio` is required to be installed for default audio file loader."
                                   "Please use `pip install torchaudio` or `conda install torchaudio -c pytorch`"
-                                  "to install the package")
+                                  "to install the package") from e
 
     with tempfile.TemporaryDirectory() as dirname:
         fname = os.path.join(dirname, f"file.{extension}")
@@ -240,7 +240,7 @@ def __init__(self, **loadmat_kwargs) -> None:
         except ImportError as e:
             raise ModuleNotFoundError("Package `scipy` is required to be installed for mat file."
                                       "Please use `pip install scipy` or `conda install scipy`"
-                                      "to install the package")
+                                      "to install the package") from e
         self.sio = sio
         self.loadmat_kwargs = loadmat_kwargs
 
diff --git a/torch/utils/data/datapipes/utils/snapshot.py b/torch/utils/data/datapipes/utils/snapshot.py
index feb41ed4d236e..8b7e4f64cd4ac 100644
--- a/torch/utils/data/datapipes/utils/snapshot.py
+++ b/torch/utils/data/datapipes/utils/snapshot.py
@@ -47,9 +47,9 @@ def _simple_graph_snapshot_restoration(datapipe: IterDataPipe, n_iterations: int
         try:
             next(it)
             remainder -= 1
-        except StopIteration:
+        except StopIteration as e:
             raise RuntimeError(f"Fast-forward {datapipe} by {n_iterations} iterations "
-                               "exceeds the number of samples available.")
+                               "exceeds the number of samples available.") from e
     datapipe._fast_forward_iterator = it
     # While the DataPipe has `_fast_forward_iterator`, `next()` will get result from there instead of elsewhere.
 
diff --git a/torchgen/gen_backend_stubs.py b/torchgen/gen_backend_stubs.py
index 184e6c1ce29d6..b04b3bd83c29a 100644
--- a/torchgen/gen_backend_stubs.py
+++ b/torchgen/gen_backend_stubs.py
@@ -262,10 +262,10 @@ def error_on_missing_kernels(
     try:
         with open(kernel_defn_file_path, "r") as f:
             backend_defns = f.read()
-    except IOError:
+    except IOError as e:
         raise AssertionError(
             f"Unable to read from the specified impl_path file: {kernel_defn_file_path}"
-        )
+        ) from e
 
     if full_codegen is None:
         full_codegen = []
diff --git a/torchgen/gen_lazy_tensor.py b/torchgen/gen_lazy_tensor.py
index b2b24111b0f9c..d7361ad7435cd 100644
--- a/torchgen/gen_lazy_tensor.py
+++ b/torchgen/gen_lazy_tensor.py
@@ -137,10 +137,10 @@ def validate_shape_inference_header(
         with open(shape_inference_hdr, "r") as f:
             shape_infr_decls = f.read()
             shape_infr_decl_lines = set(shape_infr_decls.split("\n"))
-    except IOError:
+    except IOError as e:
         raise AssertionError(
             f"Unable to read from the specified shape_inference_hdr file: {shape_inference_hdr}"
-        )
+        ) from e
 
     shape_infr_regex = r"compute_shape_(\w+)"
     actual_shape_infr_name_counts = Counter(
diff --git a/torchgen/model.py b/torchgen/model.py
index f511b8ffcb3b9..831cf3c443600 100644
--- a/torchgen/model.py
+++ b/torchgen/model.py
@@ -1719,8 +1719,8 @@ def _parse(t: str) -> "Type":
             return CustomClassType(m.group(1))
         try:
             return BaseType(BaseTy[t])
-        except KeyError:
-            raise RuntimeError(f"unrecognized type {t}")
+        except KeyError as e:
+            raise RuntimeError(f"unrecognized type {t}") from e
 
     def __str__(self) -> str:
         raise NotImplementedError

From e34abeee015b704b2373965407fbf656e794af49 Mon Sep 17 00:00:00 2001
From: Bert Maher <bertrand@meta.com>
Date: Wed, 7 Dec 2022 04:50:11 +0000
Subject: [PATCH 1659/1922] [inductor] More correct check for fbcode
 environment (#90312)

Summary:
importing torch.fb seemed like a good idea, but we don't always have
torch.fb inside fbcode.  Testing for torch.version.git_version is more
reliable, since we'll never have a git_version inside fbcode, which is an hg
repo.

Test Plan: `buck2 run mode/dev-nosan //caffe2/test/inductor:smoke`

Reviewed By: soumith, jansel

Differential Revision: D41777058

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90312
Approved by: https://github.com/soumith
---
 torch/_inductor/config.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 578745c216784..3f2f6b1cdde6a 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -1,6 +1,5 @@
 import os
 import sys
-from functools import lru_cache
 
 # add some debug printouts
 debug = False
@@ -64,13 +63,10 @@
 comment_origin = False
 
 
-@lru_cache(1)
 def is_fbcode():
-    try:
-        import torch.fb  # noqa: F401
-    except ImportError:
-        return False
-    return True
+    import torch
+
+    return not hasattr(torch.version, "git_version")
 
 
 compile_threads = (

From 8ab1da97ff8e77018e27a2d98f6d84fcb17ff762 Mon Sep 17 00:00:00 2001
From: Pruthvi Madugundu <pmagundu@amd.com>
Date: Wed, 7 Dec 2022 06:21:31 +0000
Subject: [PATCH 1660/1922] [ROCm] Enable few test_prim UTs for ROCm (#88983)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88983
Approved by: https://github.com/IvanYashchuk, https://github.com/jeffdaily, https://github.com/malfet
---
 test/test_prims.py | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/test/test_prims.py b/test/test_prims.py
index 0b86c433b89ae..cadef6097df15 100644
--- a/test/test_prims.py
+++ b/test/test_prims.py
@@ -13,7 +13,6 @@
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests,
     onlyCUDA,
-    skipCUDAIfRocm,
     dtypes,
     OpDTypes,
 )
@@ -39,7 +38,6 @@
 
 class TestPrims(TestCase):
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float32)
     def test_broadcast_in_dim(self, device, dtype):
         def _wrapper(a, b, broadcast_dimensions):
@@ -103,7 +101,6 @@ def _wrapper(a, b, broadcast_dimensions):
             """
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float32)
     def test_broadcast_in_dim_sum(self, device, dtype):
         def _wrapper(a):
@@ -144,7 +141,6 @@ def test_cbrt_prim(self, device, dtype):
                 self.assertEqual(y, y_np, exact_device=False)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     def test_nvfuser_impl_is_used(self, device):
         # This test is to ensure that when the nvfuser implementation exists it is used
         # Assuming one-to-one mapping between prims and nvfuser implementations
@@ -235,7 +231,6 @@ def func(x):
         self.assertEqual(len(partitions), 1)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float32)
     def test_full(self, device, dtype):
         from torch.fx.experimental.proxy_tensor import make_fx
@@ -275,7 +270,6 @@ def func5(size, value, b):
             self.assertEqual(out, func(size, value, b))
 
     @onlyCUDA
-    @skipCUDAIfRocm
     def test_nvfuser_empty_fusion(self, device):
         from torch.fx.experimental.proxy_tensor import make_fx
         from torch._prims.executor import execute
@@ -327,7 +321,6 @@ def func(x, dtype):
         self.assertEqual(includes_nvprim_convert_element_type, nvprim_support_flag)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     def test_nvfuser_rand_like_fusion(self, device):
         from torch._prims.context import TorchRefsNvfuserCapabilityMode
         from torch.fx.experimental.proxy_tensor import make_fx
@@ -346,7 +339,6 @@ def func(a):
 
     @skipCUDAMemoryLeakCheckIf(True)  # https://github.com/pytorch/pytorch/issues/84529
     @onlyCUDA
-    @skipCUDAIfRocm
     def test_nvfuser_no_args(self, device):
         from torch._prims.context import TorchRefsNvfuserCapabilityMode
         from torch.fx.experimental.proxy_tensor import make_fx
@@ -377,7 +369,6 @@ def func():
         self.assertEqual(out, func())
 
     @onlyCUDA
-    @skipCUDAIfRocm
     def test_nvfuser_constant_tensors(self, device):
         from torch._prims.context import TorchRefsNvfuserCapabilityMode
         from torch.fx.experimental.proxy_tensor import make_fx
@@ -400,7 +391,6 @@ def func(b):
         self.assertEqual(out, gm(b))
 
     @onlyCUDA
-    @skipCUDAIfRocm
     def test_nvfuser_executor_cached_noncontiguous(self, device):
         # This test is to ensure that nvfuser computes correct results for noncontiguous tensors
         from torch.fx.experimental.proxy_tensor import make_fx
@@ -503,7 +493,6 @@ def func(a):
 
 
     @onlyCUDA
-    @skipCUDAIfRocm
     def test_nvfuser_executor_parameters(self, device):
         from torch.fx.experimental.proxy_tensor import make_fx
         from torch._prims.executor import execute
@@ -536,7 +525,6 @@ def func(a):
 
 
     @onlyCUDA
-    @skipCUDAIfRocm
     def test_nvfuser_executor_partitioned(self, device):
         # This test is to ensure that nvfuser partitioned executor works correctly
         # It's assumed that digamma is not supported by nvfuser
@@ -565,7 +553,6 @@ def func(a, b, c):
         self.assertEqual(expected, actual)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     def test_nvfuser_executor_partitioned_no_partitions_error(self, device):
         # This test is to ensure that nvfuser partitioned executor works correctly
         # It's assumed that digamma is not supported by nvfuser
@@ -610,7 +597,6 @@ def func(a):
                 self.assertFalse(node.target == torch.ops.aten.add.default)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float32, torch.float64)
     def test_native_batch_norm_nvprims(self, device, dtype):
         from torch._prims.context import TorchRefsNvfuserCapabilityMode
@@ -673,7 +659,6 @@ def func(
             self.assertEqual(out, gm(sample.input, *sample.args))
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float32, torch.float64)
     def test_cudnn_batch_norm_nvprims(self, device, dtype):
         from torch._prims.context import TorchRefsNvfuserCapabilityMode
@@ -778,7 +763,6 @@ def func2(grad, input, weight, rm, rv, eps, train):
             self.assertTrue(all_nvprims)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float32)
     def test_silu_backward_no_filled_tensor(self, device, dtype):
         # This test verifies a workaround for
@@ -827,7 +811,6 @@ def func(a):
 
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float32)
     @parametrize("correction", [0, 1])
     def test_var(self, device, dtype, correction):
@@ -848,7 +831,6 @@ def _wrapper(a):
             self.assertEqual(_wrapper(a), result)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float16, torch.float32)
     @parametrize("correction", [0, 1])
     @parametrize("keepdim", [True, False])
@@ -873,7 +855,6 @@ def _wrapper(a):
         self.assertTrue(includes_nvprims_var_mean)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float16, torch.float32)
     def test_nvprims_view(self, device, dtype):
         from torch.fx.experimental.proxy_tensor import make_fx
@@ -920,7 +901,6 @@ def func7(a):
             self.assertEqual(out, func(a))
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float16, torch.float32)
     def test_nvprims_view_partitioner(self, device, dtype):
         # This test verifies that views that are not fused with other ops are
@@ -946,7 +926,6 @@ def func(a, b):
         self.assertEqual(out, func(a, b))
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float32, torch.float16)
     def test_cpu_tensor(self, device, dtype):
         from torch.fx.experimental.proxy_tensor import make_fx
@@ -987,7 +966,6 @@ def _wrapper(t0, t1, cpu_scalar):
         self.assertEqual(expected, nvprim_aten_fallback)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float32)
     def test_pytree_input_output(self, device, dtype):
         @make_traced
@@ -1144,7 +1122,6 @@ def test_constant_pad_nd_memory_format(self, device, dtype):
 
 class TestDecomp(TestCase):
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float16, torch.float32)
     def test_decomposition_type_promotion_nvprim_amp(self, device, dtype):
         x = torch.rand(5, device=device).to(dtype)
@@ -1185,7 +1162,6 @@ def fn1(x):
             self.assertFalse(includes_aten_to_copy)
 
     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.float16, torch.float32)
     def test_masked_fill_decomposition_under_nvprim_context(self, device, dtype):
         # masked_fill decomposition extracts cpu scalar tensor value when

From 2e96eea2413d62ec1938826bc76ac2e733a0d678 Mon Sep 17 00:00:00 2001
From: fduwjj <fduwjj@fb.com>
Date: Wed, 7 Dec 2022 04:18:54 +0000
Subject: [PATCH 1661/1922] Reformat optim import (#90294)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90294
Approved by: https://github.com/awgu
---
 torch/distributed/optim/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch/distributed/optim/__init__.py b/torch/distributed/optim/__init__.py
index 9a83c7aabf772..4fd0199bd356a 100644
--- a/torch/distributed/optim/__init__.py
+++ b/torch/distributed/optim/__init__.py
@@ -7,23 +7,23 @@
 """
 import torch
 from torch import optim
+from .apply_optimizer_in_backward import _apply_optimizer_in_backward
+from .functional_adadelta import _FunctionalAdadelta
 
 from .functional_adagrad import _FunctionalAdagrad
 from .functional_adam import _FunctionalAdam
+from .functional_adamax import _FunctionalAdamax
 from .functional_adamw import _FunctionalAdamW
-from .functional_sgd import _FunctionalSGD
-from .functional_adadelta import _FunctionalAdadelta
 from .functional_rmsprop import _FunctionalRMSprop
 from .functional_rprop import _FunctionalRprop
-from .functional_adamax import _FunctionalAdamax
+from .functional_sgd import _FunctionalSGD
 from .named_optimizer import _NamedOptimizer
 from .utils import as_functional_optim
-from .apply_optimizer_in_backward import _apply_optimizer_in_backward
 
 
 # DistributedOptimizer imports torch.distributed.rpc names, so gate availability
 # based on RPC being available.
-if hasattr(torch._C, '_rpc_init'):
+if hasattr(torch._C, "_rpc_init"):
     from .optimizer import DistributedOptimizer
 
 from .post_localSGD_optimizer import PostLocalSGDOptimizer

From ced9ff00d1ddd0169917e95db5e360b90b15ea55 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 7 Dec 2022 07:24:55 +0000
Subject: [PATCH 1662/1922] [MPS] Fix median_out_mps caching (#90326)

We should cache graph based on input tensor type

Fixes https://github.com/pytorch/pytorch/issues/90311

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90326
Approved by: https://github.com/kulinseth
---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index d905107b8ffd4..8a321ffd2fb12 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -1809,11 +1809,11 @@ Tensor median_mps(const Tensor& input_t) {
     auto stream = at::mps::getCurrentMPSStream();
 
     @autoreleasepool {
-        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type());
-        CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getTensorsStringKey(input_t);
+        CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {
-          native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
+          cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
 
             CachedGraph *newCachedGraph = nil;
 
@@ -1849,7 +1849,6 @@ Tensor median_mps(const Tensor& input_t) {
             }
             return newCachedGraph;
           });
-          cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
         }
 
         auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);

From 0592a9f6a7a942540af6973dca18c92fabcb9cfb Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Sun, 4 Dec 2022 21:20:48 +0000
Subject: [PATCH 1663/1922] Give std/var correction overloads proper defaults
 (#56398)

The correction overloads defaults were left off for forward
compatibility reasons, but this FC window expired well over a year ago
at this point.

Differential Revision: [D29625593](https://our.internmc.facebook.com/intern/diff/D29625593)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/56398
Approved by: https://github.com/mruberry
---
 aten/src/ATen/native/ReduceOps.cpp            |   3 +
 aten/src/ATen/native/native_functions.yaml    |  40 +++-
 test/functorch/test_aotdispatch.py            |   4 +
 test/inductor/test_torchinductor_opinfo.py    |   1 +
 tools/autograd/derivatives.yaml               |   8 +-
 torch/_decomp/decompositions.py               |   2 +-
 torch/_inductor/lowering.py                   |   6 +-
 torch/_tensor_docs.py                         |  14 +-
 torch/_torch_docs.py                          | 213 +++++++++++-------
 .../_internal/common_methods_invocations.py   |  93 +++++++-
 10 files changed, 261 insertions(+), 123 deletions(-)

diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp
index 2fe5eee4a286d..a10f6c7255760 100644
--- a/aten/src/ATen/native/ReduceOps.cpp
+++ b/aten/src/ATen/native/ReduceOps.cpp
@@ -1744,6 +1744,9 @@ static Tensor& std_var_out(
   const auto correction = correction_opt.value_or(1);
   ScalarType dtype = get_dtype_from_result(result, {});
   auto iter = make_reduction(fname, result, self, dim, keepdim, dtype);
+  TORCH_CHECK(at::canCast(self.scalar_type(), result.scalar_type()),
+              "result type ", self.scalar_type(), " can't be cast to the "
+              "desired output type ", result.scalar_type());
 
   if (iter.numel() == 0) {
     // Trivial reduction
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index d37cfda9bd07c..a3c6988375070 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -5176,12 +5176,14 @@
 - func: std(Tensor self, bool unbiased=True) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  cpp_no_default_args: ["unbiased"]
 
 - func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  cpp_no_default_args: ["unbiased"]
 
-- func: std.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> Tensor
+- func: std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
@@ -5192,12 +5194,14 @@
 - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
+  cpp_no_default_args: ["unbiased"]
 
 - func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
+  cpp_no_default_args: ["unbiased"]
 
-- func: std_mean.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> (Tensor, Tensor)
+- func: std_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
@@ -5207,15 +5211,17 @@
 - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
+  cpp_no_default_args: ["unbiased"]
 
-- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction, bool keepdim=False) -> (Tensor, Tensor)
+- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
 
 - func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
+  cpp_no_default_args: ["unbiased"]
 
-- func: std.correction_out(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: std.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: std_out
@@ -5224,15 +5230,17 @@
 - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  cpp_no_default_args: ["unbiased"]
 
 - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
+  cpp_no_default_args: ["unbiased"]
 
-- func: std.correction_names(Tensor self, Dimname[1] dim, *, int? correction, bool keepdim=False) -> Tensor
+- func: std.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
 
-- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
 
@@ -5645,13 +5653,15 @@
 - func: var(Tensor self, bool unbiased=True) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  cpp_no_default_args: ["unbiased"]
 
 - func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   tags: canonical
+  cpp_no_default_args: ["unbiased"]
 
-- func: var.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> Tensor
+- func: var.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
@@ -5660,8 +5670,9 @@
 
 - func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
+  cpp_no_default_args: ["unbiased"]
 
-- func: var.correction_out(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: var.correction_out(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: var_out
@@ -5669,27 +5680,31 @@
 - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  cpp_no_default_args: ["unbiased"]
 
 - func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
+  cpp_no_default_args: ["unbiased"]
 
-- func: var.correction_names(Tensor self, Dimname[1] dim, *, int? correction, bool keepdim=False) -> Tensor
+- func: var.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
 
-- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
 
 - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
+  cpp_no_default_args: ["unbiased"]
 
 - func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
+  cpp_no_default_args: ["unbiased"]
 
-- func: var_mean.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> (Tensor, Tensor)
+- func: var_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
@@ -5699,8 +5714,9 @@
 - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
+  cpp_no_default_args: ["unbiased"]
 
-- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction, bool keepdim=False) -> (Tensor, Tensor)
+- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   device_check: NoCheck   # TensorIterator
   variants: function
 
diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py
index 598a8f70dbcf4..5eeca3ffc4cac 100644
--- a/test/functorch/test_aotdispatch.py
+++ b/test/functorch/test_aotdispatch.py
@@ -1997,7 +1997,9 @@ def forward(self, x):
     xfail('special.i1', ''),  # aten.i0.default - couldn't find symbolic meta function/decomposition
     xfail('special.polygamma', 'special_polygamma_n_0'),  # aten.polygamma.default - couldn't find symbolic ...
     xfail('std', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
+    xfail('std', 'unbiased'),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('std_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
+    xfail('std_mean', 'unbiased'),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('stft', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('sum_to_size', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('svd', ''),  # aten._linalg_svd.default - couldn't find symbolic meta function/decomposition
@@ -2012,7 +2014,9 @@ def forward(self, x):
     xfail('triangular_solve', ''),  # aten.triangular_solve.default - couldn't find symbolic meta function/de...
     xfail('unflatten', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('var', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
+    xfail('var', 'unbiased'),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('var_mean', ''),  # Cannot call numel() on tensor with symbolic sizes/strides
+    xfail('var_mean', 'unbiased'),  # Cannot call numel() on tensor with symbolic sizes/strides
     xfail('view_as', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
     xfail('vsplit', ''),  # Cannot call sizes() on tensor with symbolic sizes/strides
 }
diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index f7ac61fafa5a8..b2ebb479d7e00 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -332,6 +332,7 @@ def process(device_type):
     "scatter_reduce.prod": {f16, f32, f64},
     "segment_reduce.lengths": {f16, f32, f64},
     "sparse.sampled_addmm": {f32, f64},
+    "std_mean.unbiased": {f16},
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
     "tensor_split": {b8, f16, f32, f64, i32, i64},
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index a6fc9d806ad41..0eded5c1ab53d 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -1514,12 +1514,12 @@
   self: unsqueeze_to(grad, dim, self.sym_sizes())
   result: auto_linear
 
-- name: std.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> Tensor
+- name: std.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
   self: std_backward(result, grad, self, dim, correction, keepdim)
   # pointwise (variance) + sum + sqrt
   result: (at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim)) / (2. * result)).masked_fill_(result == 0, 0)
 
-- name: std_mean.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> (Tensor, Tensor)
+- name: std_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   self: std_mean_backward(grads[0], grads[1], self, result0, dim, correction, keepdim)
   result0: (at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim)) / (2. * result0)).masked_fill_(result0 == 0, 0)
   # linear
@@ -1723,12 +1723,12 @@
   self: grad.squeeze(dim)
   result: auto_linear
 
-- name: var.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> Tensor
+- name: var.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> Tensor
   self: var_backward(grad, self, dim, correction, keepdim)
   # pointwise + sum
   result: at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim))
 
-- name: var_mean.correction(Tensor self, int[1]? dim, *, int? correction, bool keepdim=False) -> (Tensor, Tensor)
+- name: var_mean.correction(Tensor self, int[1]? dim=None, *, int? correction=None, bool keepdim=False) -> (Tensor, Tensor)
   self: var_mean_backward(grads[0], grads[1], self, dim, correction, keepdim)
   result0: at::real(var_backward(self_t.conj(), self_p, dim, correction, true).sum(dim.value_or(IntArrayRef({})), keepdim))
   # linear
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index cadfd14f715b8..1a8335dc292a1 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1557,7 +1557,7 @@ def xlogy(self: Tensor, other: Tensor) -> Tensor:
 @reduction_complex_to_real
 def std_decomposition(
     x: Tensor,
-    dim: Optional[List[int]],
+    dim: Optional[List[int]] = None,
     correction: Optional[int] = None,
     keepdim: bool = False,
 ):
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index a26785493e988..882dbd693b0cb 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -3292,7 +3292,7 @@ def mean(x, axis=None, keepdim=False, *, dtype=None):
 
 
 @register_lowering([aten.var, prims.var])
-def var_(x, axis, correction=1, keepdim=False):
+def var_(x, axis=None, correction=1, keepdim=False):
     size = x.get_size()
     axis = _validate_reduction_axis(x, axis)
     diffs = square(sub(x, mean(x, axis, keepdim=True)))
@@ -3307,7 +3307,7 @@ def var_(x, axis, correction=1, keepdim=False):
 
 
 @register_lowering(aten.var_mean)
-def var_mean(x, dim, unbiased=True, keepdim=False, correction=None):
+def var_mean(x, dim=None, unbiased=True, keepdim=False, correction=None):
     if correction is None:
         correction = int(unbiased)
     return [
@@ -3317,7 +3317,7 @@ def var_mean(x, dim, unbiased=True, keepdim=False, correction=None):
 
 
 @register_lowering(aten.std)
-def std(x, axis, correction=1, keepdim=False):
+def std(x, axis=None, correction=1, keepdim=False):
     return sqrt(var_(x, axis, correction, keepdim=keepdim))
 
 
diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index 7b6a8870d8497..cc4c9d3a92f63 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -4785,12 +4785,7 @@ def callable(a, b) -> number
 add_docstr_all(
     "std",
     r"""
-std(dim, unbiased=True, keepdim=False) -> Tensor
-
-See :func:`torch.std`
-
-.. function:: std(unbiased=True) -> Tensor
-   :noindex:
+std(dim=None, *, correction=1, keepdim=False) -> Tensor
 
 See :func:`torch.std`
 """,
@@ -5738,12 +5733,7 @@ def callable(a, b) -> number
 add_docstr_all(
     "var",
     r"""
-var(dim, unbiased=True, keepdim=False) -> Tensor
-
-See :func:`torch.var`
-
-.. function:: var(unbiased=True) -> Tensor
-   :noindex:
+var(dim=None, *, correction=1, keepdim=False) -> Tensor
 
 See :func:`torch.var`
 """,
diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index 8125d3e560151..baf683901ef29 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -10677,38 +10677,53 @@ def merge_dicts(*dicts):
 add_docstr(
     torch.std,
     r"""
-std(input, dim, unbiased, keepdim=False, *, out=None) -> Tensor
+std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
 
-If :attr:`unbiased` is ``True``, Bessel's correction will be used.
-Otherwise, the sample deviation is calculated, without any correction.
+Calculates the standard deviation over the dimensions specified by :attr:`dim`.
+:attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+reduce over all dimensions.
+
+The standard deviation (:math:`\sigma`) is calculated as
+
+.. math:: \sigma = \sqrt{\frac{1}{N - \delta N}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+
+where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+the :attr:`correction`.
+"""
+    + r"""
+
+{keepdim_details}
 
 Args:
     {input}
     {dim}
 
 Keyword args:
-    unbiased (bool): whether to use Bessel's correction (:math:`\delta N = 1`).
+    correction (int): difference between the sample size and sample degrees of freedom.
+                      Defaults to `Bessel's correction`_, ``correction=1``.
+    .. versionchanged:: 1.14
+        Previously this argument was called ``unbiased`` and was a boolean with
+        ``True`` corresponding to ``correction=1`` and ``False`` being ``correction=0``.
+
     {keepdim}
     {out}
 
+Example:
 
-.. function:: std(input, unbiased) -> Tensor
-   :noindex:
-
-Calculates the standard deviation of all elements in the :attr:`input` tensor.
-
-If :attr:`unbiased` is ``True``, Bessel's correction will be used.
-Otherwise, the sample deviation is calculated, without any correction.
-
-Args:
-    {input}
-    unbiased (bool): whether to use Bessel's correction (:math:`\delta N = 1`).
+    >>> a = torch.tensor(
+    ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+    ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+    ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+    ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+    >>> torch.std(a, dim=1, keepdim=True)
+    tensor([[1.0311],
+            [0.7477],
+            [1.2204],
+            [0.9087]])
 
-Example::
+.. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
 
-    >>> a = torch.tensor([[-0.8166, -1.3802, -0.3560]])
-    >>> torch.std(a, unbiased=False)
-    tensor(0.4188)
 """.format(
         **multi_dim_common
     ),
@@ -10717,45 +10732,54 @@ def merge_dicts(*dicts):
 add_docstr(
     torch.std_mean,
     r"""
-std_mean(input, dim, unbiased, keepdim=False, *, out=None) -> (Tensor, Tensor)
+std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+
+Calculates the standard deviation and mean over the dimensions specified by
+:attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or
+``None`` to reduce over all dimensions.
 
-If :attr:`unbiased` is ``True``, Bessel's correction will be used to calculate
-the standard deviation. Otherwise, the sample deviation is calculated, without
-any correction.
+The standard deviation (:math:`\sigma`) is calculated as
+
+.. math:: \sigma = \sqrt{\frac{1}{N - \delta N}\sum_{i=0}^{N-1}(x_i-\bar{x})^2}
+
+where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+the :attr:`correction`.
+
+"""
+    + r"""
+
+{keepdim_details}
 
 Args:
     {input}
     {opt_dim}
 
 Keyword args:
-    unbiased (bool): whether to use Bessel's correction (:math:`\delta N = 1`).
+    correction (int): difference between the sample size and sample degrees of freedom.
+                      Defaults to `Bessel's correction`_, ``correction=1``.
+    .. versionchanged:: 1.14
+        Previously this argument was called ``unbiased`` and was a boolean with
+        ``True`` corresponding to ``correction=1`` and ``False`` being ``correction=0``.
     {keepdim}
     {out}
 
 Returns:
     A tuple (std, mean) containing the standard deviation and mean.
 
-.. function:: std_mean(input, unbiased) -> (Tensor, Tensor)
-   :noindex:
-
-Calculates the standard deviation and mean of all elements in the :attr:`input`
-tensor.
-
-If :attr:`unbiased` is ``True``, Bessel's correction will be used.
-Otherwise, the sample deviation is calculated, without any correction.
+Example:
 
-Args:
-    {input}
-    unbiased (bool): whether to use Bessel's correction (:math:`\delta N = 1`).
+    >>> a = torch.tensor(
+    ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+    ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+    ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+    ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+    >>> torch.std_mean(a, dim=0, keepdim=True)
+    (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]),
+     tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
 
-Returns:
-    A tuple (std, mean) containing the standard deviation and mean.
+.. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
 
-Example::
-
-    >>> a = torch.tensor([[-0.8166, -1.3802, -0.3560]])
-    >>> torch.std_mean(a, unbiased=False)
-    (tensor(0.4188), tensor(-0.8509))
 """.format(
         **multi_dim_common
     ),
@@ -12117,37 +12141,52 @@ def merge_dicts(*dicts):
 add_docstr(
     torch.var,
     r"""
-var(input, dim, unbiased, keepdim=False, *, out=None) -> Tensor
+var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor
+
+Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim`
+can be a single dimension, list of dimensions, or ``None`` to reduce over all
+dimensions.
+
+The variance (:math:`\sigma^2`) is calculated as
+
+.. math:: \sigma^2 = \frac{1}{N - \delta N}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
 
-If :attr:`unbiased` is ``True``, Bessel's correction will be used.
-Otherwise, the sample variance is calculated, without any correction.
+where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+the :attr:`correction`.
+"""
+    + r"""
+
+{keepdim_details}
 
 Args:
     {input}
     {opt_dim}
 
 Keyword args:
-    unbiased (bool): whether to use Bessel's correction (:math:`\delta N = 1`).
+    correction (int): difference between the sample size and sample degrees of freedom.
+                      Defaults to `Bessel's correction`_, ``correction=1``.
+    .. versionchanged:: 1.14
+        Previously this argument was called ``unbiased`` and was a boolean with
+        ``True`` corresponding to ``correction=1`` and ``False`` being ``correction=0``.
     {keepdim}
     {out}
 
-.. function:: var(input, unbiased) -> Tensor
-   :noindex:
-
-Calculates the variance of all elements in the :attr:`input` tensor.
-
-If :attr:`unbiased` is ``True``, Bessel's correction will be used.
-Otherwise, the sample deviation is calculated, without any correction.
+Example:
 
-Args:
-    {input}
-    unbiased (bool): whether to use Bessel's correction (:math:`\delta N = 1`).
+    >>> a = torch.tensor(
+    ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+    ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+    ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+    ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+    >>> torch.var(a, dim=1, keepdim=True)
+    tensor([[1.0631],
+            [0.5590],
+            [1.4893],
+            [0.8258]])
 
-Example::
+.. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
 
-    >>> a = torch.tensor([[-0.8166, -1.3802, -0.3560]])
-    >>> torch.var(a, unbiased=False)
-    tensor(0.1754)
 """.format(
         **multi_dim_common
     ),
@@ -12156,45 +12195,53 @@ def merge_dicts(*dicts):
 add_docstr(
     torch.var_mean,
     r"""
-var_mean(input, dim, unbiased, keepdim=False, *, out=None) -> (Tensor, Tensor)
+var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor)
+
+Calculates the variance and mean over the dimensions specified by :attr:`dim`.
+:attr:`dim` can be a single dimension, list of dimensions, or ``None`` to
+reduce over all dimensions.
 
-If :attr:`unbiased` is ``True``, Bessel's correction will be used to calculate
-the variance. Otherwise, the sample variance is calculated, without any
-correction.
+The variance (:math:`\sigma^2`) is calculated as
+
+.. math:: \sigma^2 = \frac{1}{N - \delta N}\sum_{i=0}^{N-1}(x_i-\bar{x})^2
+
+where :math:`x` is the sample set of elements, :math:`\bar{x}` is the
+sample mean, :math:`N` is the number of samples and :math:`\delta N` is
+the :attr:`correction`.
+"""
+    + r"""
+
+{keepdim_details}
 
 Args:
     {input}
     {opt_dim}
 
 Keyword args:
-    unbiased (bool): whether to use Bessel's correction (:math:`\delta N = 1`).
+    correction (int): difference between the sample size and sample degrees of freedom.
+                      Defaults to `Bessel's correction`_, ``correction=1``.
+    .. versionchanged:: 1.14
+        Previously this argument was called ``unbiased`` and was a boolean with
+        ``True`` corresponding to ``correction=1`` and ``False`` being ``correction=0``.
     {keepdim}
     {out}
 
 Returns:
     A tuple (var, mean) containing the variance and mean.
 
-.. function:: var_mean(input, unbiased) -> (Tensor, Tensor)
-   :noindex:
-
-Calculates the variance and mean of all elements in the :attr:`input`
-tensor.
-
-If :attr:`unbiased` is ``True``, Bessel's correction will be used.
-Otherwise, the sample deviation is calculated, without any correction.
-
-Args:
-    {input}
-    unbiased (bool): whether to use Bessel's correction (:math:`\delta N = 1`).
+Example:
 
-Returns:
-    A tuple (var, mean) containing the variance and mean.
+    >>> a = torch.tensor(
+    ...     [[ 0.2035,  1.2959,  1.8101, -0.4644],
+    ...      [ 1.5027, -0.3270,  0.5905,  0.6538],
+    ...      [-1.5745,  1.3330, -0.5596, -0.6548],
+    ...      [ 0.1264, -0.5080,  1.6420,  0.1992]])
+    >>> torch.var_mean(a, dim=0, keepdim=True)
+    (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]),
+     tensor([[ 0.0645,  0.4485,  0.8707, -0.0665]]))
 
-Example::
+.. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction
 
-    >>> a = torch.tensor([[-0.8166, -1.3802, -0.3560]])
-    >>> torch.var_mean(a, unbiased=False)
-    (tensor(0.1754), tensor(-0.8509))
 """.format(
         **multi_dim_common
     ),
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 3379c2bd877ff..527c0793acd34 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -4966,10 +4966,16 @@ def sample_inputs_std_var(op_info, device, dtype, requires_grad, **kwargs):
     yield SampleInput(tensor_nd(), dim=(1,), correction=S // 2)
     yield SampleInput(tensor_nd(), dim=None, correction=0, keepdim=True)
     yield SampleInput(tensor_nd(), dim=None, correction=None)
+    yield SampleInput(tensor_nd(), correction=0, keepdim=True)
+
+
+def sample_inputs_std_var_unbiased(op_info, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype,
+                       requires_grad=requires_grad)
 
     # Test var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
-    yield SampleInput(tensor_nd(), True)
-    yield SampleInput(tensor_nd(), False)
+    yield SampleInput(make_arg((S, S)), True)
+    yield SampleInput(make_arg((S,)), False)
 
 
 def _generate_correlation_inputs(device, dtype, requires_grad, **kwargs):
@@ -10344,6 +10350,19 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(toleranceOverride({torch.float64: tol(atol=2e-7, rtol=2e-7)}),
                             "TestDecomp", "test_comprehensive", device_type="cuda"),
            )),
+    OpInfo('var_mean',
+           variant_test_name='unbiased',
+           dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16),
+           sample_inputs_func=sample_inputs_std_var_unbiased,
+           # TODO: some signatures of var_mean do support out
+           supports_out=False,
+           supports_forward_ad=True,
+           check_batched_forward_grad=False,
+           supports_fwgrad_bwgrad=True,
+           decorators=(
+               DecorateInfo(toleranceOverride({torch.float64: tol(atol=2e-7, rtol=2e-7)}),
+                            "TestDecomp", "test_comprehensive", device_type="cuda"),
+           )),
     OpInfo('std_mean',
            dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16),
            sample_inputs_func=sample_inputs_std_var,
@@ -10356,6 +10375,19 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                DecorateInfo(toleranceOverride({torch.float64: tol(atol=2e-7, rtol=2e-7)}),
                             "TestDecomp", "test_comprehensive", device_type="cuda"),
            )),
+    OpInfo('std_mean',
+           variant_test_name='unbiased',
+           dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16),
+           sample_inputs_func=sample_inputs_std_var_unbiased,
+           # TODO: some signatures of var_mean do support out
+           supports_out=False,
+           supports_forward_ad=True,
+           check_batched_forward_grad=False,
+           supports_fwgrad_bwgrad=True,
+           decorators=(
+               DecorateInfo(toleranceOverride({torch.float64: tol(atol=2e-7, rtol=2e-7)}),
+                            "TestDecomp", "test_comprehensive", device_type="cuda"),
+           )),
     OpInfo('meshgrid',
            variant_test_name='variadic_tensors',
            ref=np.meshgrid,
@@ -16508,7 +16540,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
     ReductionOpInfo(
         'std',
         nan_policy='propagate',
-        supports_out=False,
+        supports_out=True,
         complex_to_real=True,
         supports_forward_ad=True,
         supports_fwgrad_bwgrad=True,
@@ -16527,16 +16559,41 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_dim_empty'),
             DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_dim_empty_keepdim'),
             # FIXME: improve precision
-            DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_small_input'),
-            DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_duplicate_values'),
-            # NumPy is giving NaN for this
-            DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_large_input'),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_small_input',
+                         dtypes=(torch.float16,)),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_duplicate_values',
+                         dtypes=(torch.float16,)),
+            # TODO: Meta not implemented for out overloads
+            DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_meta_outplace'),
+            DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_outplace'),
+            DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_outplace'),
+            DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_outplace_all_strides'),
         ),
     ),
     ReductionOpInfo(
-        'var',
+        'std',
+        variant_test_name='unbiased',
         nan_policy='propagate',
         supports_out=False,
+        complex_to_real=True,
+        supports_forward_ad=True,
+        supports_fwgrad_bwgrad=True,
+        assert_autodiffed=True,
+        promotes_int_to_float=True,
+        check_batched_forward_grad=False,
+        dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16),
+        dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
+        sample_inputs_func=sample_inputs_std_var_unbiased,
+        skips=(
+            # FIXME: dim=[] reduces all dimensions
+            DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_dim_empty'),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_dim_empty_keepdim'),
+        ),
+    ),
+    ReductionOpInfo(
+        'var',
+        nan_policy='propagate',
+        supports_out=True,
         assert_autodiffed=True,
         promotes_int_to_float=True,
         complex_to_real=True,
@@ -16561,6 +16618,26 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_large_input'),
         ),
     ),
+    ReductionOpInfo(
+        'var',
+        variant_test_name='unbiased',
+        nan_policy='propagate',
+        supports_out=False,
+        complex_to_real=True,
+        supports_forward_ad=True,
+        supports_fwgrad_bwgrad=True,
+        assert_autodiffed=True,
+        promotes_int_to_float=True,
+        check_batched_forward_grad=False,
+        dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16),
+        dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16),
+        sample_inputs_func=sample_inputs_std_var_unbiased,
+        skips=(
+            # FIXME: dim=[] reduces all dimensions
+            DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_dim_empty'),
+            DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_dim_empty_keepdim'),
+        ),
+    ),
     ReductionOpInfo(
         'prod',
         identity=1,

From 760a52da7745c57999f8dce0b3b2ddcaad0d2ff5 Mon Sep 17 00:00:00 2001
From: Charlie Yan <yanhao.charles@gmail.com>
Date: Tue, 6 Dec 2022 23:37:57 +0000
Subject: [PATCH 1664/1922] reland #89243: [Composable API] replicate: add
 support for DDP args (#90255)

reland https://github.com/pytorch/pytorch/pull/89243
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90255
Approved by: https://github.com/zhaojuanmao
---
 test/distributed/_composable/test_replicate.py |  7 +++++++
 torch/distributed/_composable/replicate.py     | 13 +++++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/test/distributed/_composable/test_replicate.py b/test/distributed/_composable/test_replicate.py
index db1459589b342..de9fbfdbbc376 100644
--- a/test/distributed/_composable/test_replicate.py
+++ b/test/distributed/_composable/test_replicate.py
@@ -102,6 +102,13 @@ def test_replicate_multi_module(self):
         replicate(replicate_model.fc3)
         self._compare_module(model, replicate_model)
 
+    def test_replicate_with_kwargs(self):
+        model = Net()
+        replicate_model = replicate(
+            deepcopy(model), bucket_cap_mb=1, gradient_as_bucket_view=True
+        )
+        self._compare_module(model, replicate_model)
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/torch/distributed/_composable/replicate.py b/torch/distributed/_composable/replicate.py
index c27a88d79b4d9..95aa8ee4c7d25 100644
--- a/torch/distributed/_composable/replicate.py
+++ b/torch/distributed/_composable/replicate.py
@@ -12,8 +12,9 @@ def __init__(self) -> None:
         self.modules: List[nn.Module] = []
         self.has_initialized: bool = False
         self._param_list: nn.ParameterList = nn.ParameterList()
+        self.kwargs: dict = {}
 
-    def mark_modules(self, *modules: nn.Module) -> None:
+    def mark_modules(self, *modules: nn.Module, **kwargs) -> None:
         for module in modules:
             self.modules.append(module)
             replicate.state(module)._distributed_state = self
@@ -21,6 +22,7 @@ def mark_modules(self, *modules: nn.Module) -> None:
             module.register_forward_pre_hook(self.forward_pre_hook)
             # TODO(@yhcharles): fix type error
             module.register_forward_hook(self.forward_post_hook)  # type: ignore[arg-type]
+        self.kwargs = kwargs
 
     def _recursive_collect_params(self, module: nn.Module) -> None:
         # TODO: skip if managed by other APIs
@@ -39,7 +41,7 @@ def _recursive_collect_params(self, module: nn.Module) -> None:
         for child in module.children():
             self._recursive_collect_params(child)
 
-    def init_helper(self):
+    def init_helper(self) -> None:
         if self.has_initialized:
             return
 
@@ -47,7 +49,9 @@ def init_helper(self):
         for module in self.modules:
             self._recursive_collect_params(module)
 
-        self._ddp = _ddp.DistributedDataParallel(self._param_list)
+        self._ddp = _ddp.DistributedDataParallel(
+            self._param_list, **self.kwargs
+        )
 
     def forward_pre_hook(
         self, module: nn.Module, input: Tuple[torch.Tensor]
@@ -67,6 +71,7 @@ def forward_post_hook(
 @contract
 def replicate(
     module: nn.Module,  # NOTE: contract now supports single module only
+    **kwargs,
 ) -> nn.Module:
     r"""Replicates module(s)
 
@@ -77,5 +82,5 @@ def replicate(
         >>> module = nn.Linear(3, 3)
         >>> replicate(module)
     """
-    _ReplicateState().mark_modules(module)
+    _ReplicateState().mark_modules(module, **kwargs)
     return module

From 7120b056fa3d68c7ed90e78fc63fd2bf9ef4f0d1 Mon Sep 17 00:00:00 2001
From: andrewor14 <andrewor14@gmail.com>
Date: Tue, 6 Dec 2022 08:09:27 -0800
Subject: [PATCH 1665/1922] [Quant][fx][bc-breaking] Make convert.py smaller
 (#90189)

Summary: This commit moves helper functions that are not core
to the convert logic out of convert.py, which was more than
1000 lines. This helps with readability since a new developer
won't have to scroll through hundreds of lines of util functions
to understand the core logic. There should be no change in
functionality in this commit.

BC-breaking notes: The following helper functions that were
previously exposed under the `torch.ao.quantization.fx.convert`
namespace are now made private. Many of these are moved to the
new convert_utils.py
```
convert_custom_module
convert_standalone_module
convert_weighted_module
get_module_path_and_prefix,
has_none_qconfig,
insert_dequantize_node,
is_conversion_supported,
maybe_recursive_remove_dequantize,
replace_observer_or_dequant_stub_with_dequantize_node,
restore_state,
run_weight_observers,
```

Test Plan:
python test/test_quantization.py TestQuantizeFx
python test/test_quantization.py TestQuantizeFxOps

Reviewers: jerryzh168, vkuzo

Subscribers: jerryzh168, vkuzo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90189
Approved by: https://github.com/jerryzh168
---
 torch/ao/quantization/fx/convert.py       | 454 ++++------------------
 torch/ao/quantization/fx/convert_utils.py | 335 ++++++++++++++++
 2 files changed, 402 insertions(+), 387 deletions(-)
 create mode 100644 torch/ao/quantization/fx/convert_utils.py

diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index e795b3bca8584..d651a47de10f0 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -1,92 +1,80 @@
-from typing import Any, Dict, List, Optional, Set, Tuple, Union, Type, Callable
-from torch.ao.quantization.quant_type import QuantType
-import torch
 import copy
+import operator
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 import warnings
-from torch.fx import (
-    GraphModule,
-)
+import torch
+from torch.fx import GraphModule
 from torch.fx.graph import (
     Graph,
     Node,
-    Argument,
 )
-from ..utils import (
-    activation_is_statically_quantized,
-    weight_is_quantized,
-    get_qparam_dict,
-    _parent_name,
-    get_swapped_custom_module_class,
+from torch.nn.utils.parametrize import type_before_parametrizations
+from ..backend_config import (
+    BackendConfig,
+    get_native_backend_config,
+)
+from ..backend_config.utils import (
+    get_fused_module_classes,
+    get_pattern_to_dtype_configs,
+    get_qat_module_classes,
+    get_root_module_to_quantized_reference_module,
+)
+from ..quantize import (
+    _remove_qconfig,
+    is_activation_post_process,
 )
 from ..qconfig import (
+    qconfig_equals,
     QConfigAny,
-    qconfig_equals
 )
 from ..qconfig_mapping import QConfigMapping
-from .qconfig_mapping_utils import (
-    generate_node_name_to_qconfig,
-    compare_prepare_convert_qconfig_mappings,
-    update_qconfig_for_fusion,
-    is_qconfig_supported_by_dtype_configs,
-    _update_qconfig_for_qat,
+from ..stubs import DeQuantStub
+from ..utils import (
+    _parent_name,
+    is_per_channel,
+    get_qparam_dict,
+    to_underlying_dtype,
+    weight_is_quantized,
 )
-from torch.ao.quantization.backend_config.utils import (
-    get_root_module_to_quantized_reference_module,
-    get_pattern_to_dtype_configs,
-    get_fused_module_classes,
-    get_qat_module_classes,
+# importing the lib so that the quantized_decomposed ops are registered
+from ._decomposed import quantized_decomposed_lib  # noqa: F401
+from ._equalize import (
+    convert_eq_obs,
+    update_obs_for_equalization,
 )
-from torch.ao.quantization.backend_config import (
-    BackendConfig,
-    get_native_backend_config,
+from .convert_utils import (
+    _convert_custom_module,
+    _convert_standalone_module,
+    _get_module_path_and_prefix,
+    _has_none_qconfig,
+    _insert_dequantize_node,
+    _is_conversion_supported,
+    _maybe_recursive_remove_dequantize,
+    _replace_observer_or_dequant_stub_with_dequantize_node,
+    _restore_state,
+    _run_weight_observers,
 )
+from .custom_config import ConvertCustomConfig
 from .graph_module import (
-    QuantizedGraphModule,
-    is_observed_module,
     is_observed_standalone_module,
+    QuantizedGraphModule,
+)
+from .lower_to_fbgemm import lower_to_fbgemm
+from .qconfig_mapping_utils import (
+    _update_qconfig_for_qat,
+    compare_prepare_convert_qconfig_mappings,
+    generate_node_name_to_qconfig,
+    is_qconfig_supported_by_dtype_configs,
+    update_qconfig_for_fusion,
 )
-from ._equalize import update_obs_for_equalization, convert_eq_obs
-from torch.nn.utils.parametrize import type_before_parametrizations
 from .utils import (
     _get_module,
-    _is_custom_module_lstm,
-    get_custom_module_class_keys,
     create_getattr_from_value,
-    collect_producer_nodes,
-    graph_module_from_producer_nodes,
-    node_arg_is_weight,
-)
-from torch.ao.quantization.utils import (
-    is_per_channel,
-    to_underlying_dtype,
-)
-from torch.ao.quantization.quantize import (
-    _remove_qconfig,
-    is_activation_post_process,
-)
-from torch.ao.quantization.stubs import DeQuantStub
-from .custom_config import (
-    ConvertCustomConfig,
-    PrepareCustomConfig,
+    get_custom_module_class_keys,
 )
-from .lower_to_fbgemm import lower_to_fbgemm
-# importing the lib so that the quantized_decomposed ops are registered
-from ._decomposed import quantized_decomposed_lib  # noqa: F401
-import operator
 
-# TODO: revisit this list. Many helper methods shouldn't be public
 __all__ = [
     "convert",
-    "convert_custom_module",
-    "convert_standalone_module",
-    "convert_weighted_module",
-    "get_module_path_and_prefix",
-    "has_none_qconfig",
-    "insert_dequantize_node",
-    "maybe_get_observer_for_node",
-    "maybe_recursive_remove_dequantize",
-    "restore_state",
-    "run_weight_observers",
 ]
 
 def _replace_observer_with_quantize_dequantize_node_decomposed(
@@ -109,12 +97,12 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
     """
     assert modules is not None
     assert isinstance(node.target, str)
-    module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
+    module_path, prefix = _get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
     activation_post_process = modules[node.target]
     # skip replacing observers to quant/dequant nodes if the qconfigs of all
     # consumers and producers of this observer are None
     skip_replacement = all([
-        has_none_qconfig(n, node_name_to_qconfig) for n in
+        _has_none_qconfig(n, node_name_to_qconfig) for n in
         list(node.args) + list(node.users.keys())])
     if skip_replacement or not _is_conversion_supported(activation_post_process):
         # didn't find correponding quantize op and info for the activation_post_process
@@ -321,12 +309,12 @@ def _replace_observer_with_quantize_dequantize_node(
     """
     assert modules is not None
     assert isinstance(node.target, str)
-    module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
+    module_path, prefix = _get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
     activation_post_process = modules[node.target]
     # skip replacing observers to quant/dequant nodes if the qconfigs of all
     # consumers and producers of this observer are None
     skip_replacement = all([
-        has_none_qconfig(n, node_name_to_qconfig) for n in
+        _has_none_qconfig(n, node_name_to_qconfig) for n in
         list(node.args) + list(node.users.keys())])
     if skip_replacement or not _is_conversion_supported(activation_post_process):
         # didn't find correponding quantize op and info for the activation_post_process
@@ -427,230 +415,7 @@ def _replace_observer_with_quantize_dequantize_node(
     # should not reach since we have checks in the begining to make sure the
     # activation_post_process is supported
 
-# this is a temporary hack for custom module, we may want to implement
-# this properly after the custom module class design is finalized
-# TODO: DeQuantStubs are currently inserted only after custom module LSTM, while observers are inserted
-# after all other custom modules. In the future, we should simply insert QuantStubs before and DeQuantStubs
-# after custom modules in general, and replace these with "quantize" and "dequantize" nodes respectively.
-def _replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Graph):
-    call_custom_module_node = node.args[0]
-    assert isinstance(call_custom_module_node, Node), \
-        f"Expecting the for call custom module node to be a Node, but got {call_custom_module_node}"
-    node.replace_all_uses_with(call_custom_module_node)
-    graph.erase_node(node)
-    insert_dequantize_node(call_custom_module_node, graph)
-
-def _is_conversion_supported(activation_post_process: torch.nn.Module) -> bool:
-    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
-
-    is_dynamic = False
-    if hasattr(activation_post_process, "is_dynamic"):
-        is_dynamic = activation_post_process.is_dynamic  # type: ignore[attr-defined, assignment]
-
-    return (
-        (dtype in [torch.quint8, torch.qint8, torch.qint32] and (not is_dynamic)) or  # type: ignore[return-value]
-        is_dynamic or
-        dtype == torch.float16
-    )
-
-def restore_state(
-        observed: torch.nn.Module
-) -> Tuple[Dict[str, Tuple[str, type]],
-           PrepareCustomConfig,
-           Set[str]]:
-    assert is_observed_module(observed), \
-        'incoming model must be produced by prepare_fx'
-    prepare_custom_config: PrepareCustomConfig = observed._prepare_custom_config  # type: ignore[assignment]
-    node_name_to_scope: Dict[str, Tuple[str, type]] = observed._node_name_to_scope  # type: ignore[assignment]
-    observed_node_names: Set[str] = observed._observed_node_names  # type: ignore[assignment]
-    return node_name_to_scope, prepare_custom_config, observed_node_names
-
-def has_none_qconfig(node: Argument, node_name_to_qconfig: Dict[str, QConfigAny]) -> bool:
-    """ Check if a node has a qconfig of None, i.e. user requested to not quantize
-    the node
-    """
-    return isinstance(node, Node) and node.name in node_name_to_qconfig and node_name_to_qconfig[node.name] is None
-
-def run_weight_observers(observed: GraphModule, backend_config: BackendConfig) -> None:
-    """ Extract the subgraph that produces the weight for dynamic quant
-    or weight only quant node and run the subgraph to observe the weight.
-    Note that the observers of dynamic quant or weight only quant ops are
-    run during the convert step.
-    """
-    for node in observed.graph.nodes:
-        if node.op != "call_function":
-            continue
-        for node_arg in node.args:
-            # node_arg is weight
-            if node_arg and node_arg_is_weight(node, node_arg, backend_config):
-                weight_observer_nodes = collect_producer_nodes(node_arg)
-                if weight_observer_nodes is None:
-                    continue
-                weight_observer_module = \
-                    graph_module_from_producer_nodes(
-                        observed, weight_observer_nodes)
-                # run the weight observer
-                weight_observer_module()
-
-def maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph):
-    """ If the arg is a dequantize Node, or a list/tuple/dict of dequantize Node,
-    we'll recursively remove the dequantize Node
-    """
-    if isinstance(arg, Node) and \
-       arg.op == "call_method" and \
-       arg.target == "dequantize":
-        quantize_node = arg.args[0]
-        # we only replace the specific use since dequantize could be used by other nodes
-        # as well
-        node.replace_input_with(arg, quantize_node)
-    elif isinstance(arg, (list, tuple)):
-        for arg_element in arg:
-            maybe_recursive_remove_dequantize(arg_element, node, graph)
-    elif isinstance(arg, dict):
-        for arg_element in arg.values():
-            maybe_recursive_remove_dequantize(arg_element, node, graph)
-    else:
-        warnings.warn(f"Unsupported node type in recursive remove dequantize: {type(arg)}")
-
-def get_module_path_and_prefix(
-        obs_node: Node,
-        node_name_to_scope: Dict[str, Tuple[str, type]],
-        node_name_to_qconfig: Dict[str, QConfigAny]):
-    """ Given and observer node, get the `Scope` or the fully qualified name for
-    the submodule containing the observed node, also return a prefix of "_input"
-    when the observed node is an input of a F.linear op, and not the output of another
-    quantized op.
-    TODO: this logic is hacky, we should think about how to remove it or make it more
-    general
-    """
-    observed_node = obs_node.args[0]
-    # an observer can be inserted for both input of the next operator or output of the previous
-    # operator (they can be the same)
-    # this flag identifies if the observer is inserted only because the observed node is
-    # the input of the next operator
-    assert isinstance(observed_node, Node), \
-        f"Expecting observed node to be a Node, but got {observed_node}"
-    is_input_observer_only = node_name_to_qconfig[observed_node.name] is None \
-        if observed_node.name in node_name_to_qconfig else None
-    if is_input_observer_only:
-        # if the quantize function is at the input of op, then we find the first user of the observer_node
-        # to get the path. If a linear call_function is in the user list, we return the first instance
-        # of linear node to get the FQN.
-        users = list(obs_node.users)
-        first_linear_use_or_first_use = users[0] if users else None
-        linear_node = None
-        for n in users:
-            if n.op == "call_function" and n.target == torch.nn.functional.linear:
-                linear_node = n
-                break
-        if linear_node:
-            first_linear_use_or_first_use = linear_node
-        prefix = "_input"
-    else:
-        # if the quantize function is at the output of the op, we use the observer input node to get the path
-        first_linear_use_or_first_use = observed_node
-        prefix = ""
-
-    if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope:
-        module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name]
-    else:
-        # TODO: it's not used, so actually we can skip quantization
-        # but this requires changing return type of quantize_node
-        # we can fix it later if needed
-        module_path = ""
-    return module_path, prefix
-
-def insert_dequantize_node(
-        node: Node,
-        graph: Graph):
-    """ Inserts dequantize node for `node` in `graph`
-    """
-    with graph.inserting_after(node):
-        dequantize_node = graph.call_method("dequantize", (node,))
-        for user_node in dict(node.users):
-            if user_node is not dequantize_node:
-                user_node.replace_input_with(node, dequantize_node)
-
-def maybe_get_observer_for_node(
-        node: Node,
-        modules: Dict[str, torch.nn.Module]
-) -> Optional[torch.nn.Module]:
-    """
-    If the node is observed, return the observer
-    instance. Otherwise, return None.
-    """
-    for maybe_obs_node, _ in node.users.items():
-        if maybe_obs_node.op == 'call_module':
-            maybe_obs = modules[str(maybe_obs_node.target)]
-            if is_activation_post_process(maybe_obs):
-                return maybe_obs
-    return None
-
-def convert_standalone_module(
-        node: Node,
-        modules: Dict[str, torch.nn.Module],
-        model: torch.fx.GraphModule,
-        is_reference: bool,
-        backend_config: Optional[BackendConfig]):
-    """ Converts a observed standalone module to a quantized standalone module by calling
-    the fx convert api, currently using the same `is_reference` flag as parent, but we may
-    changing this behavior in the future (e.g. separating quantization and lowering for
-    standalone module as well)
-
-    Args:
-      - node: The call_module node of the observed standalone module
-      - modules: named_module of original model
-      - model: original model
-      - is_reference: a flag from parent provided by user to decide if we want to
-        produce a reference model or a fbgemm/qnnpack model
-      - backend_config: backend configuration of the target backend of quantization
-    """
-    # TODO: remove is_reference flag
-    if is_reference:
-        convert_fn = torch.ao.quantization.quantize_fx.convert_to_reference_fx
-    else:
-        convert_fn = torch.ao.quantization.quantize_fx.convert_fx  # type: ignore[attr-defined]
-    # We know that observed standalone module is a GraphModule since
-    # it's produced by us
-    observed_standalone_module : GraphModule = modules[str(node.target)]  # type: ignore[assignment]
-    sm_input_quantized_idxs = \
-        observed_standalone_module \
-        ._standalone_module_input_quantized_idxs\
-        .tolist()  # type: ignore[operator]
-    # remove the dequantize nodes for inputs
-    args = list(node.args)
-    for idx in range(len(args)):
-        if idx in sm_input_quantized_idxs:
-            arg = args[idx]
-            if arg.op == "call_method" and arg.target == "dequantize":  # type: ignore[union-attr]
-                quantize_node = arg.args[0]  # type: ignore[union-attr]
-                node.replace_input_with(arg, quantize_node)
-                if len(arg.users) == 0:  # type: ignore[union-attr]
-                    model.graph.erase_node(arg)
-    # add dequantize node for output
-    sm_output_quantized_idxs = \
-        observed_standalone_module \
-        ._standalone_module_output_quantized_idxs \
-        .tolist()  # type: ignore[operator]
-    if len(sm_output_quantized_idxs) > 0:
-        assert sm_output_quantized_idxs[0] == 0, "Currently only quantized"
-        "output idxs = [0] is supported"
-
-        # if it's non-empty, then it means the output is kept in quantized form
-        # we'll just add a dequantize node after this node
-        insert_dequantize_node(node, model.graph)
-
-    # TODO: allow convert_custom_config to override backend_config
-    # for standalone module
-    quantized_standalone_module = convert_fn(
-        observed_standalone_module,
-        backend_config=backend_config)
-    parent_name, name = _parent_name(node.target)
-    # update the modules dict
-    setattr(modules[parent_name], name, quantized_standalone_module)
-    modules[str(node.target)] = quantized_standalone_module
-
-def convert_weighted_module(
+def _convert_weighted_module(
         node: Node,
         modules: Dict[str, torch.nn.Module],
         observed_node_names: Set[str],
@@ -685,7 +450,7 @@ def convert_weighted_module(
 
     is_observed = node.name in observed_node_names
     # If a qconfig is not defined for this node, then skip converting to a reference module
-    if qconfig is None or has_none_qconfig(node, node_name_to_qconfig) or not is_observed:
+    if qconfig is None or _has_none_qconfig(node, node_name_to_qconfig) or not is_observed:
         return
 
     # skip converting to reference quantized module if the qconfig is not supported
@@ -760,91 +525,6 @@ def convert_weighted_module(
         parent_name, name = _parent_name(node.target)
         setattr(modules[parent_name], name, ref_qmodule)
 
-def _remove_previous_dequantize_in_custom_module(node: Node, prev_node: Node, graph: Graph):
-    """
-    Given a custom module `node`, if the previous node is a dequantize, reroute the custom as follows:
-
-    Before: quantize - dequantize - custom_module
-    After: quantize - custom_module
-                 \\ - dequantize
-    """
-    # expecting the input node for a custom module node to be a Node
-    assert isinstance(prev_node, Node), \
-        f"Expecting the argument for custom module node to be a Node, but got {prev_node}"
-    if prev_node.op == "call_method" and prev_node.target == "dequantize":
-        node.replace_input_with(prev_node, prev_node.args[0])
-        # Remove the dequantize node if it doesn't have other users
-        if len(prev_node.users) == 0:
-            graph.erase_node(prev_node)
-
-def convert_custom_module(
-        node: Node,
-        graph: Graph,
-        modules: Dict[str, torch.nn.Module],
-        custom_module_class_mapping: Dict[QuantType, Dict[Type, Type]],
-        statically_quantized_custom_module_nodes: Set[Node]):
-    """ Converts an observed custom module to a quantized custom module based on
-    `custom_module_class_mapping`
-    For static quantization, we'll also remove the previous `dequantize` node and
-    attach the observer node for output to the module, the observer for the node
-    will be converted to a dequantize node instead of quantize-dequantize pairs
-    later in the graph. In the end we would have a quantized custom module that
-    has the same interface as a default quantized module in nn.quantized namespace,
-    i.e. quantized input and quantized output.
-
-    Args:
-      - node: The call_module node of the observed standalone module
-      - graph: The graph containing the node
-      - modules: named_module of original model
-      - custom_module_class_mapping: mapping from observed custom module class to
-        quantized custom module class, used to swap custom modules
-      - statically_quantized_custom_module_nodes: we'll add the custom module node
-        if we find it is statically quantized, this will be used later when converting
-        observers to quant/dequant node pairs, if the observed node is a statically
-        quantized custom module nodes, we'll convert the observer to a dequantize node,
-        this is to keep the interface the same as the default quantized module.
-        TODO: maybe we want to redesign this part to align with reference model design
-        as well, but there has been some discussions around the interface, so we can do
-        it later.
-    """
-    observed_custom_module = modules[str(node.target)]
-    maybe_obs = maybe_get_observer_for_node(node, modules)
-    qconfig = observed_custom_module.qconfig
-    if activation_is_statically_quantized(qconfig):
-        statically_quantized_custom_module_nodes.add(node)
-        if _is_custom_module_lstm(node, modules):
-            # The inputs are tuples in the form (input, (hidden0, hidden1))
-            # Ensure all three input nodes are quantized
-            assert (
-                len(node.args) == 2 and
-                isinstance(node.args[1], tuple) and
-                len(node.args[1]) == 2
-            )
-            (inputs, (hidden0, hidden1)) = node.args  # type: ignore[misc]
-            assert isinstance(inputs, Node)
-            assert isinstance(hidden0, Node)
-            assert isinstance(hidden1, Node)
-            _remove_previous_dequantize_in_custom_module(node, inputs, graph)
-            _remove_previous_dequantize_in_custom_module(node, hidden0, graph)
-            _remove_previous_dequantize_in_custom_module(node, hidden1, graph)
-        else:
-            # remove the previous dequant node to ensure the inputs are quantized
-            arg = node.args[0]
-            assert isinstance(arg, Node)
-            _remove_previous_dequantize_in_custom_module(node, arg, graph)
-            # absorb the following observer into the module conversion
-            activation_post_process = maybe_get_observer_for_node(node, modules)
-            assert activation_post_process is not None
-            observed_custom_module.activation_post_process = activation_post_process
-
-    # swap the observed custom module to quantized custom module
-    quantized_custom_module_class = get_swapped_custom_module_class(
-        observed_custom_module, custom_module_class_mapping, qconfig)
-    quantized_custom_module = \
-        quantized_custom_module_class.from_observed(observed_custom_module)
-    parent_name, name = _parent_name(node.target)
-    setattr(modules[parent_name], name, quantized_custom_module)
-
 def convert(
         model: GraphModule, is_reference: bool = False,
         convert_custom_config: Union[ConvertCustomConfig, Dict[str, Any], None] = None,
@@ -906,7 +586,7 @@ def convert(
     if backend_config is None:
         backend_config = get_native_backend_config()
 
-    node_name_to_scope, prepare_custom_config, observed_node_names = restore_state(model)
+    node_name_to_scope, prepare_custom_config, observed_node_names = _restore_state(model)
     node_name_to_qconfig: Dict[str, QConfigAny] = model._node_name_to_qconfig  # type: ignore[assignment]
 
     # mapping from fully qualified module name to module instance
@@ -956,7 +636,7 @@ def convert(
 
     # always run weight observers in the top level forward method
     # for dynamic quant ops or weight only quant ops
-    run_weight_observers(model, backend_config)
+    _run_weight_observers(model, backend_config)
 
     graph_inputs: List[str] = []
     for node in model.graph.nodes:
@@ -985,7 +665,7 @@ def convert(
                 # input_quantized_idxs override.
                 # we need to dequantize the inputs since all operators took
                 # floating point inputs in reference quantized models
-                insert_dequantize_node(node, model.graph)
+                _insert_dequantize_node(node, model.graph)
         elif node.op == "output":
             # If the argument is empty we don't need to do anything
             if len(output_quantized_idxs) == 0:
@@ -998,13 +678,13 @@ def convert(
             # outputs can be Node, list, tuple, dict, other cases are not supported yet
             if isinstance(output, (list, tuple)):
                 for idx in output_quantized_idxs:
-                    maybe_recursive_remove_dequantize(output[idx], return_node, model.graph)
+                    _maybe_recursive_remove_dequantize(output[idx], return_node, model.graph)
             elif isinstance(output, (Node, dict)):
                 # we treat dict as a single argument currently, but it can be extended
                 # to support {"key": dtype} after we change output_quantized_idxs to
                 # dict
                 if 0 in output_quantized_idxs:
-                    maybe_recursive_remove_dequantize(output, return_node, model.graph)
+                    _maybe_recursive_remove_dequantize(output, return_node, model.graph)
             else:
                 warnings.warn(f"Unsupported node type for output_quantized_idxs: {type(output)}")
         elif node.op == "call_module":
@@ -1026,7 +706,7 @@ def convert(
             elif isinstance(mod, DeQuantStub):
                 _replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
             elif is_observed_standalone_module(mod):
-                convert_standalone_module(
+                _convert_standalone_module(
                     node, modules, model, is_reference, backend_config)
             # below this point `type_before_parametrizations` is used
             # instead of `type` to handle situations with fx quant + sparsity
@@ -1037,10 +717,10 @@ def convert(
                 if type_before_parametrizations(mod) in fused_module_classes and \
                    type_before_parametrizations(mod[0]) not in root_module_classes:  # type: ignore[index]
                     continue
-                convert_weighted_module(
+                _convert_weighted_module(
                     node, modules, observed_node_names, node_name_to_qconfig, backend_config)
             elif type_before_parametrizations(mod) in custom_module_classes:
-                convert_custom_module(
+                _convert_custom_module(
                     node, model.graph, modules, custom_module_class_mapping,
                     statically_quantized_custom_module_nodes)
 
diff --git a/torch/ao/quantization/fx/convert_utils.py b/torch/ao/quantization/fx/convert_utils.py
new file mode 100644
index 0000000000000..12304ccadf162
--- /dev/null
+++ b/torch/ao/quantization/fx/convert_utils.py
@@ -0,0 +1,335 @@
+from typing import Any, Dict, Optional, Set, Tuple, Type
+import warnings
+import torch
+import torch.ao.quantization.quantize_fx
+from torch.fx import GraphModule
+from torch.fx.graph import (
+    Argument,
+    Graph,
+    Node,
+)
+from ..backend_config import BackendConfig
+from ..quant_type import QuantType
+from ..quantize import is_activation_post_process
+from ..qconfig import QConfigAny
+from ..utils import (
+    activation_is_statically_quantized,
+    _parent_name,
+    get_swapped_custom_module_class,
+)
+from .custom_config import PrepareCustomConfig
+from .graph_module import is_observed_module
+from .utils import (
+    _is_custom_module_lstm,
+    collect_producer_nodes,
+    graph_module_from_producer_nodes,
+    node_arg_is_weight,
+)
+
+# this is a temporary hack for custom module, we may want to implement
+# this properly after the custom module class design is finalized
+# TODO: DeQuantStubs are currently inserted only after custom module LSTM, while observers are inserted
+# after all other custom modules. In the future, we should simply insert QuantStubs before and DeQuantStubs
+# after custom modules in general, and replace these with "quantize" and "dequantize" nodes respectively.
+def _replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Graph):
+    call_custom_module_node = node.args[0]
+    assert isinstance(call_custom_module_node, Node), \
+        f"Expecting the for call custom module node to be a Node, but got {call_custom_module_node}"
+    node.replace_all_uses_with(call_custom_module_node)
+    graph.erase_node(node)
+    _insert_dequantize_node(call_custom_module_node, graph)
+
+def _is_conversion_supported(activation_post_process: torch.nn.Module) -> bool:
+    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
+
+    is_dynamic = False
+    if hasattr(activation_post_process, "is_dynamic"):
+        is_dynamic = activation_post_process.is_dynamic  # type: ignore[attr-defined, assignment]
+
+    return (
+        (dtype in [torch.quint8, torch.qint8, torch.qint32] and (not is_dynamic)) or  # type: ignore[return-value]
+        is_dynamic or
+        dtype == torch.float16
+    )
+
+def _restore_state(
+        observed: torch.nn.Module
+) -> Tuple[Dict[str, Tuple[str, type]],
+           PrepareCustomConfig,
+           Set[str]]:
+    assert is_observed_module(observed), \
+        'incoming model must be produced by prepare_fx'
+    prepare_custom_config: PrepareCustomConfig = observed._prepare_custom_config  # type: ignore[assignment]
+    node_name_to_scope: Dict[str, Tuple[str, type]] = observed._node_name_to_scope  # type: ignore[assignment]
+    observed_node_names: Set[str] = observed._observed_node_names  # type: ignore[assignment]
+    return node_name_to_scope, prepare_custom_config, observed_node_names
+
+def _has_none_qconfig(node: Argument, node_name_to_qconfig: Dict[str, QConfigAny]) -> bool:
+    """ Check if a node has a qconfig of None, i.e. user requested to not quantize
+    the node
+    """
+    return isinstance(node, Node) and node.name in node_name_to_qconfig and node_name_to_qconfig[node.name] is None
+
+def _run_weight_observers(observed: GraphModule, backend_config: BackendConfig) -> None:
+    """ Extract the subgraph that produces the weight for dynamic quant
+    or weight only quant node and run the subgraph to observe the weight.
+    Note that the observers of dynamic quant or weight only quant ops are
+    run during the convert step.
+    """
+    for node in observed.graph.nodes:
+        if node.op != "call_function":
+            continue
+        for node_arg in node.args:
+            # node_arg is weight
+            if node_arg and node_arg_is_weight(node, node_arg, backend_config):
+                weight_observer_nodes = collect_producer_nodes(node_arg)
+                if weight_observer_nodes is None:
+                    continue
+                weight_observer_module = \
+                    graph_module_from_producer_nodes(
+                        observed, weight_observer_nodes)
+                # run the weight observer
+                weight_observer_module()
+
+def _maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph):
+    """ If the arg is a dequantize Node, or a list/tuple/dict of dequantize Node,
+    we'll recursively remove the dequantize Node
+    """
+    if isinstance(arg, Node) and \
+       arg.op == "call_method" and \
+       arg.target == "dequantize":
+        quantize_node = arg.args[0]
+        # we only replace the specific use since dequantize could be used by other nodes
+        # as well
+        node.replace_input_with(arg, quantize_node)
+    elif isinstance(arg, (list, tuple)):
+        for arg_element in arg:
+            _maybe_recursive_remove_dequantize(arg_element, node, graph)
+    elif isinstance(arg, dict):
+        for arg_element in arg.values():
+            _maybe_recursive_remove_dequantize(arg_element, node, graph)
+    else:
+        warnings.warn(f"Unsupported node type in recursive remove dequantize: {type(arg)}")
+
+def _get_module_path_and_prefix(
+        obs_node: Node,
+        node_name_to_scope: Dict[str, Tuple[str, type]],
+        node_name_to_qconfig: Dict[str, QConfigAny]):
+    """ Given and observer node, get the `Scope` or the fully qualified name for
+    the submodule containing the observed node, also return a prefix of "_input"
+    when the observed node is an input of a F.linear op, and not the output of another
+    quantized op.
+    TODO: this logic is hacky, we should think about how to remove it or make it more
+    general
+    """
+    observed_node = obs_node.args[0]
+    # an observer can be inserted for both input of the next operator or output of the previous
+    # operator (they can be the same)
+    # this flag identifies if the observer is inserted only because the observed node is
+    # the input of the next operator
+    assert isinstance(observed_node, Node), \
+        f"Expecting observed node to be a Node, but got {observed_node}"
+    is_input_observer_only = node_name_to_qconfig[observed_node.name] is None \
+        if observed_node.name in node_name_to_qconfig else None
+    if is_input_observer_only:
+        # if the quantize function is at the input of op, then we find the first user of the observer_node
+        # to get the path. If a linear call_function is in the user list, we return the first instance
+        # of linear node to get the FQN.
+        users = list(obs_node.users)
+        first_linear_use_or_first_use = users[0] if users else None
+        linear_node = None
+        for n in users:
+            if n.op == "call_function" and n.target == torch.nn.functional.linear:
+                linear_node = n
+                break
+        if linear_node:
+            first_linear_use_or_first_use = linear_node
+        prefix = "_input"
+    else:
+        # if the quantize function is at the output of the op, we use the observer input node to get the path
+        first_linear_use_or_first_use = observed_node
+        prefix = ""
+
+    if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope:
+        module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name]
+    else:
+        # TODO: it's not used, so actually we can skip quantization
+        # but this requires changing return type of quantize_node
+        # we can fix it later if needed
+        module_path = ""
+    return module_path, prefix
+
+def _insert_dequantize_node(
+        node: Node,
+        graph: Graph):
+    """ Inserts dequantize node for `node` in `graph`
+    """
+    with graph.inserting_after(node):
+        dequantize_node = graph.call_method("dequantize", (node,))
+        for user_node in dict(node.users):
+            if user_node is not dequantize_node:
+                user_node.replace_input_with(node, dequantize_node)
+
+def _maybe_get_observer_for_node(
+        node: Node,
+        modules: Dict[str, torch.nn.Module]
+) -> Optional[torch.nn.Module]:
+    """
+    If the node is observed, return the observer
+    instance. Otherwise, return None.
+    """
+    for maybe_obs_node, _ in node.users.items():
+        if maybe_obs_node.op == 'call_module':
+            maybe_obs = modules[str(maybe_obs_node.target)]
+            if is_activation_post_process(maybe_obs):
+                return maybe_obs
+    return None
+
+def _convert_standalone_module(
+        node: Node,
+        modules: Dict[str, torch.nn.Module],
+        model: torch.fx.GraphModule,
+        is_reference: bool,
+        backend_config: Optional[BackendConfig]):
+    """ Converts a observed standalone module to a quantized standalone module by calling
+    the fx convert api, currently using the same `is_reference` flag as parent, but we may
+    changing this behavior in the future (e.g. separating quantization and lowering for
+    standalone module as well)
+
+    Args:
+      - node: The call_module node of the observed standalone module
+      - modules: named_module of original model
+      - model: original model
+      - is_reference: a flag from parent provided by user to decide if we want to
+        produce a reference model or a fbgemm/qnnpack model
+      - backend_config: backend configuration of the target backend of quantization
+    """
+    # TODO: remove is_reference flag
+    if is_reference:
+        convert_fn = torch.ao.quantization.quantize_fx.convert_to_reference_fx
+    else:
+        convert_fn = torch.ao.quantization.quantize_fx.convert_fx  # type: ignore[attr-defined]
+    # We know that observed standalone module is a GraphModule since
+    # it's produced by us
+    observed_standalone_module : GraphModule = modules[str(node.target)]  # type: ignore[assignment]
+    sm_input_quantized_idxs = \
+        observed_standalone_module \
+        ._standalone_module_input_quantized_idxs\
+        .tolist()  # type: ignore[operator]
+    # remove the dequantize nodes for inputs
+    args = list(node.args)
+    for idx in range(len(args)):
+        if idx in sm_input_quantized_idxs:
+            arg = args[idx]
+            if arg.op == "call_method" and arg.target == "dequantize":  # type: ignore[union-attr]
+                quantize_node = arg.args[0]  # type: ignore[union-attr]
+                node.replace_input_with(arg, quantize_node)
+                if len(arg.users) == 0:  # type: ignore[union-attr]
+                    model.graph.erase_node(arg)
+    # add dequantize node for output
+    sm_output_quantized_idxs = \
+        observed_standalone_module \
+        ._standalone_module_output_quantized_idxs \
+        .tolist()  # type: ignore[operator]
+    if len(sm_output_quantized_idxs) > 0:
+        assert sm_output_quantized_idxs[0] == 0, "Currently only quantized"
+        "output idxs = [0] is supported"
+
+        # if it's non-empty, then it means the output is kept in quantized form
+        # we'll just add a dequantize node after this node
+        _insert_dequantize_node(node, model.graph)
+
+    # TODO: allow convert_custom_config to override backend_config
+    # for standalone module
+    quantized_standalone_module = convert_fn(
+        observed_standalone_module,
+        backend_config=backend_config)
+    parent_name, name = _parent_name(node.target)
+    # update the modules dict
+    setattr(modules[parent_name], name, quantized_standalone_module)
+    modules[str(node.target)] = quantized_standalone_module
+
+def _remove_previous_dequantize_in_custom_module(node: Node, prev_node: Node, graph: Graph):
+    """
+    Given a custom module `node`, if the previous node is a dequantize, reroute the custom as follows:
+
+    Before: quantize - dequantize - custom_module
+    After: quantize - custom_module
+                 \\ - dequantize
+    """
+    # expecting the input node for a custom module node to be a Node
+    assert isinstance(prev_node, Node), \
+        f"Expecting the argument for custom module node to be a Node, but got {prev_node}"
+    if prev_node.op == "call_method" and prev_node.target == "dequantize":
+        node.replace_input_with(prev_node, prev_node.args[0])
+        # Remove the dequantize node if it doesn't have other users
+        if len(prev_node.users) == 0:
+            graph.erase_node(prev_node)
+
+def _convert_custom_module(
+        node: Node,
+        graph: Graph,
+        modules: Dict[str, torch.nn.Module],
+        custom_module_class_mapping: Dict[QuantType, Dict[Type, Type]],
+        statically_quantized_custom_module_nodes: Set[Node]):
+    """ Converts an observed custom module to a quantized custom module based on
+    `custom_module_class_mapping`
+    For static quantization, we'll also remove the previous `dequantize` node and
+    attach the observer node for output to the module, the observer for the node
+    will be converted to a dequantize node instead of quantize-dequantize pairs
+    later in the graph. In the end we would have a quantized custom module that
+    has the same interface as a default quantized module in nn.quantized namespace,
+    i.e. quantized input and quantized output.
+
+    Args:
+      - node: The call_module node of the observed standalone module
+      - graph: The graph containing the node
+      - modules: named_module of original model
+      - custom_module_class_mapping: mapping from observed custom module class to
+        quantized custom module class, used to swap custom modules
+      - statically_quantized_custom_module_nodes: we'll add the custom module node
+        if we find it is statically quantized, this will be used later when converting
+        observers to quant/dequant node pairs, if the observed node is a statically
+        quantized custom module nodes, we'll convert the observer to a dequantize node,
+        this is to keep the interface the same as the default quantized module.
+        TODO: maybe we want to redesign this part to align with reference model design
+        as well, but there has been some discussions around the interface, so we can do
+        it later.
+    """
+    observed_custom_module = modules[str(node.target)]
+    maybe_obs = _maybe_get_observer_for_node(node, modules)
+    qconfig = observed_custom_module.qconfig
+    if activation_is_statically_quantized(qconfig):
+        statically_quantized_custom_module_nodes.add(node)
+        if _is_custom_module_lstm(node, modules):
+            # The inputs are tuples in the form (input, (hidden0, hidden1))
+            # Ensure all three input nodes are quantized
+            assert (
+                len(node.args) == 2 and
+                isinstance(node.args[1], tuple) and
+                len(node.args[1]) == 2
+            )
+            (inputs, (hidden0, hidden1)) = node.args  # type: ignore[misc]
+            assert isinstance(inputs, Node)
+            assert isinstance(hidden0, Node)
+            assert isinstance(hidden1, Node)
+            _remove_previous_dequantize_in_custom_module(node, inputs, graph)
+            _remove_previous_dequantize_in_custom_module(node, hidden0, graph)
+            _remove_previous_dequantize_in_custom_module(node, hidden1, graph)
+        else:
+            # remove the previous dequant node to ensure the inputs are quantized
+            arg = node.args[0]
+            assert isinstance(arg, Node)
+            _remove_previous_dequantize_in_custom_module(node, arg, graph)
+            # absorb the following observer into the module conversion
+            activation_post_process = _maybe_get_observer_for_node(node, modules)
+            assert activation_post_process is not None
+            observed_custom_module.activation_post_process = activation_post_process
+
+    # swap the observed custom module to quantized custom module
+    quantized_custom_module_class = get_swapped_custom_module_class(
+        observed_custom_module, custom_module_class_mapping, qconfig)
+    quantized_custom_module = \
+        quantized_custom_module_class.from_observed(observed_custom_module)
+    parent_name, name = _parent_name(node.target)
+    setattr(modules[parent_name], name, quantized_custom_module)

From fd7fd834c3beabe6898d54855d95f9037f9713c3 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 7 Dec 2022 17:41:54 +0000
Subject: [PATCH 1666/1922] [BE][CI] Add windows test run instructions (#90388)

Specifies how to activate VisualStudio, Anaconda and set `PYTHONPATH` to run tests in CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90388
Approved by: https://github.com/atalman, https://github.com/ZainRizvi
---
 .github/workflows/_win-build.yml |  2 +-
 .github/workflows/_win-test.yml  | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml
index 8baaca498d176..b04dc7f6626cb 100644
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@@ -56,7 +56,7 @@ jobs:
             And then change password using `passwd` command.
 
             To start build locally, change working folder to \actions-runner\_work\pytorch\pytorch,
-            Activate miniconda and Visual Studio environment, but running:
+            Activate miniconda and Visual Studio environment, by running:
               call C:\Jenkins\Miniconda3\Scripts\activate.bat C:\Jenkins\Miniconda3
               call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64
 
diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml
index 0cabb8ec469aa..234b2c7faad78 100644
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@@ -78,6 +78,16 @@ jobs:
         uses: pytorch/test-infra/.github/actions/setup-ssh@main
         with:
           github-secret: ${{ secrets.GITHUB_TOKEN }}
+          instructions: |
+            To forward remote desktop on your local machine ssh as follows:
+              ssh -L 3389:localhost:3389 %%username%%@%%hostname%%
+            And then change password using `passwd` command.
+
+            To start tests locally, change working folder to \actions-runner\_work\pytorch\pytorch\test,
+            Activate miniconda and Visual Studio environment and set PYTHON_PATH, by running:
+              call C:\Jenkins\Miniconda3\Scripts\activate.bat C:\Jenkins\Miniconda3
+              call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvarsall.bat" x64
+              set PYTHONPATH=C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build
 
       - name: Start monitoring script
         id: monitor-script

From d0b492c0cefb56e23b127a94c73c3faff66453e7 Mon Sep 17 00:00:00 2001
From: Michael Voznesensky <voznesenskym@gmail.com>
Date: Wed, 7 Dec 2022 17:51:53 +0000
Subject: [PATCH 1667/1922] Add hooks structure for passing around user
 provided hooks, add a new guard_failure_fn (#90371)

This PR introduces a new function we can pass to torch._dynamo.optimize - guard_failure_fn. Usage is in the PR, and the one stacked on top of it, but the gist of it is that it emits failed guard reason strings alongside code. This is useful for tests and debugging, as it gives far finer grained assertions and control than the compile counter alone.

This is a resubmit of https://github.com/pytorch/pytorch/pull/90129

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90371
Approved by: https://github.com/ezyang
---
 test/dynamo/test_misc.py       | 62 ++++++++++++++++++++++++++++++++++
 test/dynamo/test_nops.py       |  3 +-
 torch/_dynamo/convert_frame.py | 34 +++++++++++--------
 torch/_dynamo/eval_frame.py    | 38 +++++++++++++++------
 torch/_dynamo/guards.py        | 49 ++++++++++++++++++++++-----
 torch/_dynamo/hooks.py         |  9 +++++
 torch/_dynamo/testing.py       |  2 +-
 7 files changed, 162 insertions(+), 35 deletions(-)
 create mode 100644 torch/_dynamo/hooks.py

diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index 80890d5f3bbed..18132cad557d7 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -2995,6 +2995,68 @@ def fn(x, y):
 
         self.assertEqual(cnt.frame_count, 0)
 
+    def test_guard_failure_fn(self):
+        def fn(x, y, k):
+            x = x + 1
+            y = y + 1
+            return x * y * k
+
+        x = torch.tensor([0.5, 0.5])
+        y = torch.tensor([1.0, 1.0])
+
+        guard_failure = None
+
+        def guard_failures(failure):
+            nonlocal guard_failure
+            guard_failure = failure
+
+        opt_fn = torch._dynamo.optimize(
+            "eager", nopython=True, guard_fail_fn=guard_failures
+        )(fn)
+
+        x2 = torch.tensor([0.5, 0.5, 1.0])
+        y2 = torch.tensor([0.5, 0.5, 0.5])
+
+        opt_fn(x, y, 3)
+        opt_fn(x2, y2, 5)
+
+        self.assertTrue(guard_failure is not None)
+        self.assertEqual(guard_failure[0], "k == 3")
+
+    def test_guard_failure_fn2(self):
+        def fn(x, y):
+            x = x + 1
+            y = y + 1
+            return x * y
+
+        x = torch.tensor([0.5, 0.5])
+        y = torch.tensor([1.0, 1.0])
+
+        guard_failure = None
+
+        def guard_failures(failure):
+            nonlocal guard_failure
+            guard_failure = failure
+
+        opt_fn = torch._dynamo.optimize(
+            "eager", nopython=True, guard_fail_fn=guard_failures
+        )(fn)
+
+        x2 = torch.tensor([0.5, 0.5, 1.0])
+        y2 = torch.tensor([0.5, 0.5, 0.5])
+
+        opt_fn(x, y)
+        opt_fn(x2, y2)
+
+        if torch._dynamo.config.dynamic_shapes:
+            self.assertTrue(guard_failure is None)
+        else:
+            self.assertTrue(guard_failure is not None)
+            self.assertEqual(
+                guard_failure[0],
+                "tensor 'x' size mismatch at index 0. expected 2, actual 3",
+            )
+
 
 class CustomFunc1(torch.autograd.Function):
     @staticmethod
diff --git a/test/dynamo/test_nops.py b/test/dynamo/test_nops.py
index 44e102699d091..c17b9528a4f8e 100644
--- a/test/dynamo/test_nops.py
+++ b/test/dynamo/test_nops.py
@@ -4,6 +4,7 @@
 import torch._dynamo.test_case
 import torch._dynamo.testing
 from torch._dynamo import eval_frame
+from torch._dynamo.hooks import Hooks
 
 c = 10
 
@@ -32,7 +33,7 @@ def fn3():
 
 
 with_debug_nops = eval_frame._optimize_catch_errors(
-    torch._dynamo.testing.debug_insert_nops
+    torch._dynamo.testing.debug_insert_nops, Hooks(None, None)
 )
 
 
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index e57bcbadee6db..257faf6866445 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -6,7 +6,7 @@
 import types
 import weakref
 from traceback import FrameSummary
-from typing import Callable, cast, Dict, List, Optional, Set
+from typing import cast, Dict, List, Optional
 
 import torch
 from torch.fx.graph_module import _forward_from_src as original_forward_from_src
@@ -23,7 +23,8 @@
     unimplemented,
     Unsupported,
 )
-from .guards import CheckFunctionManager, Guard, GuardedCode
+from .guards import CheckFunctionManager, GuardedCode
+from .hooks import Hooks
 from .output_graph import CompilerFn, OutputGraph
 from .replay_record import ExecutionRecord
 from .symbolic_convert import InstructionTranslator
@@ -266,7 +267,6 @@ def exception_handler(e, code, frame=None):
 
 def convert_frame_assert(
     compiler_fn: CompilerFn,
-    guard_export_fn=None,
     one_graph: bool = True,
     export: bool = False,
 ):
@@ -274,7 +274,7 @@ def convert_frame_assert(
     init_logging()
 
     @dynamo_timed
-    def _convert_frame_assert(frame: types.FrameType, cache_size: int):
+    def _convert_frame_assert(frame: types.FrameType, cache_size: int, hooks: Hooks):
         code = frame.f_code
         input_codes.add(code)
         if code in output_codes:
@@ -344,7 +344,7 @@ def format_guard_failures(code):
             compiler_fn,
             one_graph,
             export,
-            guard_export_fn,
+            hooks,
             frame,
         )
 
@@ -360,7 +360,7 @@ def _compile(
     compiler_fn: CompilerFn,
     one_graph: bool,
     export: bool,
-    guard_export_fn: Optional[Callable[[Set[Guard]], None]] = None,
+    hooks: Hooks,
     frame: Optional[types.FrameType] = None,
 ) -> Optional[GuardedCode]:
     output: Optional[OutputGraph] = None
@@ -432,7 +432,13 @@ def transform(instructions, code_options):
         assert output is not None
         assert output.guards is not None
         CleanupManager.instance[out_code] = output.cleanups
-        check_fn = CheckFunctionManager(output, output.guards, locals, globals)
+        check_fn = CheckFunctionManager(
+            output,
+            output.guards,
+            locals,
+            globals,
+            hooks.guard_fail_fn if hooks else None,
+        )
 
         guarded_code = GuardedCode(out_code, check_fn.check_fn)
 
@@ -443,8 +449,8 @@ def transform(instructions, code_options):
             )
             log.info(guard_str)
 
-        if guard_export_fn is not None:
-            guard_export_fn(output.guards)
+        if hooks.guard_export_fn is not None:
+            hooks.guard_export_fn(output.guards)
 
         return guarded_code
     except (
@@ -460,14 +466,14 @@ def transform(instructions, code_options):
         raise InternalTorchDynamoError() from e
 
 
-def convert_frame(compiler_fn: CompilerFn, guard_export_fn=None):
+def convert_frame(compiler_fn: CompilerFn, hooks: Hooks):
     """Try to convert a frame into an FX graph, if error leave frame unmodified"""
-    inner_convert = convert_frame_assert(compiler_fn, guard_export_fn, one_graph=False)
+    inner_convert = convert_frame_assert(compiler_fn, one_graph=False)
 
-    def _convert_frame(frame: types.FrameType, cache_size: int):
+    def _convert_frame(frame: types.FrameType, cache_size: int, hooks: Hooks):
         counters["frames"]["total"] += 1
         try:
-            result = inner_convert(frame, cache_size)
+            result = inner_convert(frame, cache_size, hooks)
             counters["frames"]["ok"] += 1
             return result
         except (NotImplementedError, Unsupported):
@@ -501,9 +507,9 @@ def replay(filename):
             record.locals,
             record.builtins,
             eager,
+            hooks,
             one_graph=False,
             export=False,
-            guard_export_fn=None,
             frame=None,
         )
     except Exception:
diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py
index 9edcb61aaaaad..773e62e3f9e3e 100644
--- a/torch/_dynamo/eval_frame.py
+++ b/torch/_dynamo/eval_frame.py
@@ -19,6 +19,8 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.nn.parallel.distributed import DistributedDataParallel
 
+from .hooks import Hooks
+
 if TYPE_CHECKING:
     from torch._C._dynamo.eval_frame import (  # noqa: F401
         reset_code,
@@ -297,7 +299,7 @@ def __init__(self):
         super().__init__(callback=None)
 
 
-def catch_errors_wrapper(callback):
+def catch_errors_wrapper(callback, hooks: Hooks):
     @functools.wraps(callback)
     def catch_errors(frame, cache_size):
         if (
@@ -321,20 +323,23 @@ def catch_errors(frame, cache_size):
                         backend_compile_fn=callback._torchdynamo_orig_callable,
                     )
                     hijacked_callback = convert_frame.convert_frame(
-                        ddp_optimizer.compile_fn, guard_export_fn=None
+                        ddp_optimizer.compile_fn,
+                        hooks=hooks,
                     )
-                    return hijacked_callback(frame, cache_size)
+                    return hijacked_callback(frame, cache_size, hooks)
 
         with compile_lock:
-            return callback(frame, cache_size)
+            return callback(frame, cache_size, hooks)
 
     catch_errors._torchdynamo_orig_callable = callback  # type: ignore[attr-defined]
     return catch_errors
 
 
-def _optimize_catch_errors(compile_fn, backend_ctx_ctor=null_context, dynamic=False):
+def _optimize_catch_errors(
+    compile_fn, hooks: Hooks, backend_ctx_ctor=null_context, dynamic=False
+):
     return OptimizeContext(
-        catch_errors_wrapper(compile_fn),
+        catch_errors_wrapper(compile_fn, hooks),
         backend_ctx_ctor=backend_ctx_ctor,
         first_ctx=True,
         dynamic=dynamic,
@@ -382,6 +387,7 @@ def optimize(
     *,
     nopython=False,
     guard_export_fn=None,
+    guard_fail_fn=None,
     disable=False,
     dynamic=False,
 ):
@@ -409,6 +415,12 @@ def optimize(
         def toy_example(a, b):
             ...
     """
+    # Note: The hooks object could be global instead of passed around, *however* that would make
+    # for a confusing API usage and plumbing story wherein we nest multiple .optimize calls.
+    # There is some prior art around this, w/r/t nesting backend calls are enforced to be the same
+    # compiler, however, this feels onerous for callback and hooks, and it feels better to give our users an
+    # easier to understand UX at the cost of a little more plumbing on our end.
+    hooks = Hooks(guard_export_fn=guard_export_fn, guard_fail_fn=guard_fail_fn)
     torch._C._log_api_usage_once("torch._dynamo.optimize")
     if disable or os.environ.get("TORCHDYNAMO_DISABLE", "") == "1":
         return _NullDecorator()
@@ -432,10 +444,13 @@ def toy_example(a, b):
 
     if nopython:
         return optimize_assert(
-            backend, guard_export_fn=guard_export_fn, dynamic=dynamic
+            backend,
+            dynamic=dynamic,
+            hooks=hooks,
         )
     return _optimize_catch_errors(
-        convert_frame.convert_frame(backend, guard_export_fn=guard_export_fn),
+        convert_frame.convert_frame(backend, hooks=hooks),
+        hooks,
         backend_ctx_ctor,
         dynamic=dynamic,
     )
@@ -594,7 +609,7 @@ def result_capturing_wrapper(*graph_inputs):
     with patch(f"{__name__}.most_recent_backend", None):
         opt_f = optimize_assert(
             dynamo_normalization_capturing_compiler,
-            guard_export_fn=guard_export_print,
+            hooks=Hooks(guard_export_fn=guard_export_print, guard_fail_fn=None),
             export=True,
         )(f)
         # TODO(voz): We may have instances of `f` that mutate inputs, we should track sideffects and reject.
@@ -669,7 +684,7 @@ def assume_constant_result(fn):
     return fn
 
 
-def optimize_assert(backend, *, guard_export_fn=None, export=False, dynamic=False):
+def optimize_assert(backend, *, hooks=Hooks(None, None), export=False, dynamic=False):
     """
     The same as `torch._dynamo.optimize(backend, nopython=True)`
     """
@@ -679,7 +694,8 @@ def optimize_assert(backend, *, guard_export_fn=None, export=False, dynamic=Fals
     backend_ctx_ctor = getattr(backend, "backend_ctx_ctor", null_context)
 
     return _optimize_catch_errors(
-        convert_frame.convert_frame_assert(backend, guard_export_fn, export=export),
+        convert_frame.convert_frame_assert(backend, export=export),
+        hooks,
         backend_ctx_ctor,
         dynamic=dynamic,
     )
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index 3cef0ed185b72..cb731c2359991 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -8,7 +8,18 @@
 import types
 import weakref
 from inspect import currentframe, getframeinfo
-from typing import Any, Callable, Dict, List, Optional, Set, Type, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    NamedTuple,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)
 from weakref import ReferenceType
 
 import numpy as np
@@ -632,6 +643,7 @@ def __init__(
         guards: Optional[Set[Guard]] = None,
         f_locals: Optional[Dict[str, object]] = None,
         f_globals: Optional[Dict[str, object]] = None,
+        guard_fail_fn: Optional[Callable[[Tuple[str, str]], None]] = None,
     ):
         self.valid = True
         self._weakrefs: List["ReferenceType[object]"] = []
@@ -656,7 +668,9 @@ def combine_scopes(left, right):
             if not config.guard_nn_modules and guard.is_nn_module():
                 continue
             guard.create(local_builder, global_builder)
-        self.check_fn = self.compile_check_fn(local_builder, global_builder, guards)
+        self.check_fn = self.compile_check_fn(
+            local_builder, global_builder, guards, guard_fail_fn
+        )
         self._seen_ids.clear()
 
     """
@@ -738,7 +752,9 @@ def _parse_symbolic_shape_expressions(self, tensor_check_names, tensor_check_ids
         expression = " and ".join(finished_expressions)
         return f"({expression})"
 
-    def compile_check_fn(self, local_builder, global_builder, guards_out):
+    def compile_check_fn(
+        self, local_builder, global_builder, guards_out, guard_fail_fn
+    ):
         assert not (set(local_builder.argnames) & set(global_builder.argnames))
         # see parallel handling of ".0" / "___implicit0" in _eval_frame.c
         largs = [a for a in local_builder.scope.keys() if a == "___implicit0"]
@@ -831,6 +847,7 @@ def ___make_guard_fn({','.join(closure_vars.keys())}):
         guard_fn.code_parts = code_parts
         guard_fn.verbose_code_parts = verbose_code_parts
         guard_fn.global_scope = global_builder.scope
+        guard_fn.guard_fail_fn = guard_fail_fn
         return guard_fn
 
     def invalidate(self, ref):
@@ -848,28 +865,44 @@ def id_ref(self, obj):
         return id(obj)
 
 
+class GuardFail(NamedTuple):
+    # A string repr of the piece of failed guard code we eval-ed
+    reason: str
+    # A code object where we failed a guard
+    orig_code: types.CodeType
+
+
 def guard_fail_hook(
     guard_fn: GuardFn, code: types.CodeType, f_locals: Dict[str, object], last: bool
 ) -> None:
     """
     called whenever a guard fails.
     """
-    if not last:
+    if not guard_fn.guard_fail_fn and not last:
         return
     scope = {rename_implicit(k): v for k, v in f_locals.items()}
     scope.update(guard_fn.closure_vars)
-    reasons = []
+    reason = None
     for part in guard_fn.verbose_code_parts:
         fail_reason = eval(part, guard_fn.global_scope, scope)
         # TODO(whc) hacky for now as not every 'part' in guard_fn.verbose_code_parts
         # is updated to return a string explaining the failure.
         if isinstance(fail_reason, str):
-            reasons.append(fail_reason)
+            reason = fail_reason
             break
         elif isinstance(fail_reason, bool) and not fail_reason:
-            reasons.append(part)
+            reason = part
             break
-    guard_failures[orig_code_map[code]].append(reasons)
+    try:
+        guard_fn.guard_fail_fn(GuardFail(reason, orig_code_map[code]))
+    except Exception as e:
+        log.error(
+            "Failure in guard_fail_fn callback - raising here will cause a NULL Error on guard eval",
+            exc_info=True,
+        )
+
+    if last:
+        guard_failures[orig_code_map[code]].append(reason)
 
 
 def guard_error_hook(
diff --git a/torch/_dynamo/hooks.py b/torch/_dynamo/hooks.py
new file mode 100644
index 0000000000000..37b47a75ced2f
--- /dev/null
+++ b/torch/_dynamo/hooks.py
@@ -0,0 +1,9 @@
+import dataclasses
+
+from typing import Callable, Optional, Set, Tuple
+
+
+@dataclasses.dataclass
+class Hooks:
+    guard_export_fn: Optional[Callable[[Set["Guard"]], None]]
+    guard_fail_fn: Optional[Callable[[Tuple["GuardFail"]], None]]
diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py
index 55186931988ba..53ea6251bd4cf 100644
--- a/torch/_dynamo/testing.py
+++ b/torch/_dynamo/testing.py
@@ -124,7 +124,7 @@ def debug_dump(name, code: types.CodeType, extra=""):
         )
 
 
-def debug_insert_nops(frame, cache_size):
+def debug_insert_nops(frame, cache_size, hooks):
     """used to debug jump updates"""
 
     def insert_nops(instructions, code_options):

From 07f9d326cfdf83806b0d230c7a1a408ff8152e42 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 7 Dec 2022 14:50:27 +0000
Subject: [PATCH 1668/1922] [quant]Fix public bindings for DTypeWithConstraint
 (#90315)

Summary:
Need this to fix `test_public_bindings`.

Test Plan:
`python test/test_public_bindings.py`
Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90315
Approved by: https://github.com/HDCharles
---
 torch/ao/quantization/backend_config/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/ao/quantization/backend_config/__init__.py b/torch/ao/quantization/backend_config/__init__.py
index 6443b756f716c..9aba6d2e9853f 100644
--- a/torch/ao/quantization/backend_config/__init__.py
+++ b/torch/ao/quantization/backend_config/__init__.py
@@ -16,5 +16,6 @@
     "BackendConfig",
     "BackendPatternConfig",
     "DTypeConfig",
+    "DTypeWithConstraints",
     "ObservationType",
 ]

From 974a1449eb8a70a9b83001452f4892fbd6841b5d Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 7 Dec 2022 14:50:27 +0000
Subject: [PATCH 1669/1922] [pruning][core][feature] Implement prune for
 structured pruning (#89777)

Summary:

This PR implements `prune` in BaseStructuredSparsifier:

`prune` is a function that takes in a model with structured sparsity parametritizations (the result of `prepare`) and will return a resized model with the masked out weights removed.

`prune` is defined by a mapping from **patterns** to different **pruning functions**.
	- **patterns** are just sequences of operations, for example `(nn.Linear, activation, nn.Linear)`
	- **pruning functions** are functions that take in an matched pattern as args and will resize the appropriate layer sizes and weights.
	  ```
	  def prune_linear_activation_linear(linear1, activation, linear2):
		pass
	  ```
	- This is one line in the pattern config `(nn.Linear, activation, nn.Linear): prune_linear_activation_linear`

At a high level `prune` works by finding instances of the graph that match different patterns and then calling the mapped pruning functions on those matched patterns.
This is unlike the previous code which attempted to do both at the same time.

There may be some gaps in the patterns compared to the previous implementation, but the conversion functionality support should be the same.

Currently we have pruning functions for the following patterns:
    - linear -> linear
    - linear -> activation -> linear
    - conv2d -> conv2d
    - conv2d -> activation -> conv2d
    - conv2d -> activation -> pool -> conv2d
    - conv2d -> pool -> activation -> conv2d
    - conv2d -> adaptive pool -> flatten -> linear

Added in MyPy type hints as well for the prune_functions.

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89777
Approved by: https://github.com/vkuzo
---
 .../ao/sparsity/test_structured_sparsifier.py | 764 ++++++++++++------
 .../pruning/_experimental/pruner/__init__.py  |   6 -
 .../pruner/base_structured_sparsifier.py      | 252 +++++-
 .../_experimental/pruner/match_utils.py       |  59 ++
 .../_experimental/pruner/parametrization.py   |   1 -
 .../_experimental/pruner/prune_functions.py   | 359 ++++++++
 torch/testing/_internal/common_pruning.py     | 311 +++++++
 7 files changed, 1491 insertions(+), 261 deletions(-)
 create mode 100644 torch/ao/pruning/_experimental/pruner/match_utils.py
 create mode 100644 torch/ao/pruning/_experimental/pruner/prune_functions.py
 create mode 100644 torch/testing/_internal/common_pruning.py

diff --git a/test/ao/sparsity/test_structured_sparsifier.py b/test/ao/sparsity/test_structured_sparsifier.py
index 1b504c9731d2b..19e5a03640d00 100644
--- a/test/ao/sparsity/test_structured_sparsifier.py
+++ b/test/ao/sparsity/test_structured_sparsifier.py
@@ -4,167 +4,51 @@
 
 import copy
 import logging
+import random
 
 import torch
-from torch import nn
-from torch.ao.pruning._experimental.pruner import BaseStructuredSparsifier, FakeStructuredSparsity
+from torch.ao.pruning._experimental.pruner import (
+    BaseStructuredSparsifier,
+    FakeStructuredSparsity,
+)
 from torch.nn.utils import parametrize
 
 from torch.testing._internal.common_utils import TestCase, skipIfTorchDynamo
-
-logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
+from torch.testing._internal.common_pruning import (
+    SimpleLinear,
+    LinearBias,
+    LinearActivation,
+    LinearActivationFunctional,
+    SimpleConv2d,
+    Conv2dBias,
+    Conv2dActivation,
+    Conv2dPadBias,
+    Conv2dPool,
+    Conv2dPoolFlatten,
+    Conv2dPoolFlattenFunctional,
+)
+
+logging.basicConfig(
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
+)
 
 DEVICES = {
     torch.device("cpu"),
-    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
 }
 
 
-class Linear(nn.Module):
-    r"""Model with Linear layers, in Sequential and outside, without biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(16, 16, bias=False)
-        )
-        self.linear = nn.Linear(16, 16, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class LinearB(nn.Module):
-    r"""Model with Linear layers, in Sequential and outside, with biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(16, 16, bias=True)
-        )
-        self.linear = nn.Linear(16, 16, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class MultipleLinear(nn.Module):
-    r"""Model with multiple Linear layers, in Sequential and outside, without biases
-    and with activation functions"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=False),
-            nn.ReLU(),
-            nn.Linear(5, 8, bias=False),
-            nn.ReLU(),
-            nn.Linear(8, 6, bias=False)
-        )
-        self.linear = nn.Linear(6, 4, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class MultipleLinearB(nn.Module):
-    r"""Model with multiple Linear layers, in Sequential and outside, with biases
-    and with activation functions"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=True),
-            nn.ReLU(),
-            nn.Linear(5, 8, bias=True),
-            nn.ReLU(),
-            nn.Linear(8, 6, bias=True)
-        )
-        self.linear = nn.Linear(6, 4, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class MultipleLinearMixed(nn.Module):
-    r"""Model with multiple Linear layers, in Sequential and outside, some with biases
-    and with activation functions"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=True),
-            nn.ReLU(),
-            nn.Linear(5, 8, bias=False),
-            nn.ReLU(),
-            nn.Linear(8, 6, bias=True)
-        )
-        self.linear = nn.Linear(6, 4, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class Conv2dA(nn.Module):
-    r"""Model with Conv2d layers, in Sequential and outside, without biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=False),
-        )
-        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d(x)
-        return x
-
-
-class Conv2dB(nn.Module):
-    r"""Model with Conv2d layers, in Sequential and outside, with biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=True),
-        )
-        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d(x)
-        return x
-
-
-class Conv2dC(nn.Module):
-    r"""Model with Conv2d layers, in Sequential and outside, with and without biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=True),
-        )
-        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d(x)
-        return x
-
-
-
 class SimplePruner(BaseStructuredSparsifier):
     def update_mask(self, module, tensor_name, **kwargs):
         getattr(module.parametrizations, tensor_name)[0].mask[1] = False
 
 
-class MultiplePruner(BaseStructuredSparsifier):
+class ImplementedPruner(BaseStructuredSparsifier):
     def update_mask(self, module, tensor_name, **kwargs):
-        getattr(module.parametrizations, tensor_name)[0].mask[1] = False
-        getattr(module.parametrizations, tensor_name)[0].mask[2] = False
+        """Prunes 1/3 of the weight output channels, so resulting module has 33.3% pruning"""
+        num_rows = len(module.parametrizations[tensor_name][0].mask)
+        prune = random.sample(list(range(num_rows)), num_rows // 3)
+        module.parametrizations[tensor_name][0].mask[prune] = False
 
 
 class TestBaseStructuredSparsifier(TestCase):
@@ -180,27 +64,14 @@ def _check_pruner_prepared(self, model, pruner, device):
             # Assume that this is the 1st/only parametrization
             assert type(module.parametrizations.weight[0]) == FakeStructuredSparsity
 
-    def _check_pruner_mask_squashed(self, model, pruner, device):
-        for config in pruner.groups:
-            modules = []
-            if type(config['module']) is tuple:
-                for module in config['module']:
-                    modules.append(module)
-            else:
-                module = config['module']
-                modules.append(module)
-            for module in modules:
-                assert module.weight.device.type == device.type
-                assert not hasattr(module, "parametrizations")
-
     def _check_pruner_valid_before_step(self, model, pruner, device):
         for config in pruner.groups:
             modules = []
-            if type(config['module']) is tuple:
-                for module in config['module']:
+            if type(config["module"]) is tuple:
+                for module in config["module"]:
                     modules.append(module)
             else:
-                module = config['module']
+                module = config["module"]
                 modules.append(module)
             for module in modules:
                 assert module.weight.device.type == device.type
@@ -209,145 +80,562 @@ def _check_pruner_valid_before_step(self, model, pruner, device):
     def _check_pruner_valid_after_step(self, model, pruner, mask, device):
         for config in pruner.groups:
             modules = []
-            if type(config['module']) is tuple:
-                for module in config['module']:
+            if type(config["module"]) is tuple:
+                for module in config["module"]:
                     modules.append(module)
             else:
-                module = config['module']
+                module = config["module"]
                 modules.append(module)
             for module in modules:
                 assert module.weight.device.type == device.type
                 total = module.parametrizations.weight[0].mask.numel()
-                assert module.parametrizations.weight[0].mask.count_nonzero() == total - mask
+                assert (
+                    module.parametrizations.weight[0].mask.count_nonzero()
+                    == total - mask
+                )
 
     def _test_constructor_on_device(self, model, device):
-        self.assertRaisesRegex(TypeError, 'BaseStructuredSparsifier.* update_mask',
-                               BaseStructuredSparsifier)
+        self.assertRaisesRegex(
+            TypeError,
+            "BaseStructuredSparsifier.* update_mask",
+            BaseStructuredSparsifier,
+        )
         model1 = copy.deepcopy(model).to(device)
         pruner = SimplePruner(None)
         pruner.prepare(model1, None)
+        pruner.enable_mask_update = True
         for g in pruner.groups:
-            module = g['module']
+            module = g["module"]
             assert module.weight.device.type == device.type
-        assert len(pruner.groups) == 2
+        assert len(pruner.groups) == 5
         pruner.step()
         # Can instantiate the model with configs
         model2 = copy.deepcopy(model).to(device)
-        pruner = SimplePruner({'test': 3})
-        pruner.prepare(model2, [{"tensor_fqn": "linear.weight"}])
+        pruner = SimplePruner({"test": 3})
+        pruner.prepare(model2, [{"tensor_fqn": "seq.0.weight"}])
         assert len(pruner.groups) == 1
-        assert pruner.groups[0]['module_fqn'] == 'linear'
-        assert 'test' in pruner.groups[0]
-        assert pruner.groups[0]['test'] == 3
+        assert pruner.groups[0]["module_fqn"] == "seq.0"
+        assert "test" in pruner.groups[0]
+        assert pruner.groups[0]["test"] == 3
 
     def test_constructor(self):
-        model = Linear()
+        model = SimpleLinear()
         for device in DEVICES:
             self._test_constructor_on_device(model, torch.device(device))
 
     def _test_prepare_linear_on_device(self, model, device):
         model = copy.deepcopy(model).to(device)
-        x = torch.ones(128, 16, device=device)
+        x = torch.ones(128, 7, device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, None)
         self._check_pruner_prepared(model, pruner, device)
-        assert model(x).shape == (128, 16)
+        assert model(x).shape == (128, 10)
 
     def test_prepare_linear(self):
-        models = [Linear(), LinearB()]  # without and with bias
+        models = [
+            SimpleLinear(),
+            LinearBias(),
+            LinearActivation(),
+            LinearActivationFunctional(),
+        ]  # without and with bias
         for device in DEVICES:
             for model in models:
                 self._test_prepare_linear_on_device(model, torch.device(device))
 
-    def _test_prepare_conv2d_on_device(self, model, config, device):
+    def _test_prepare_conv2d_on_device(self, model, expected_shape, config, device):
         x = torch.ones((1, 1, 28, 28), device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, config)
         self._check_pruner_prepared(model, pruner, device)
-        assert model(x).shape == (1, 64, 24, 24)
+        assert model(x).shape == expected_shape
 
     def test_prepare_conv2d(self):
-
-        models = [Conv2dA(), Conv2dB(), Conv2dC()]
-        configs = [None, None, None]
+        models = [
+            SimpleConv2d(),
+            Conv2dBias(),
+            Conv2dActivation(),
+            Conv2dPadBias(),
+            Conv2dPool(),
+        ]
+        shapes = [
+            (1, 52, 20, 20),
+            (1, 52, 18, 18),
+            (1, 52, 18, 18),
+            (1, 52, 24, 24),
+            (1, 52, 3, 3),
+        ]
+        configs = [None, None, None, None, None]
         for device in DEVICES:
-            for model, config in zip(models, configs):
+            for model, shape, config in zip(models, shapes, configs):
                 model = model.to(device)
-                self._test_prepare_conv2d_on_device(model, config, torch.device(device))
+                self._test_prepare_conv2d_on_device(
+                    model, shape, config, torch.device(device)
+                )
 
-    def _test_squash_mask_linear_on_device(self, model, device):
-        model = copy.deepcopy(model).to(device)
-        x = torch.ones(128, 16, device=device)
+    def _test_step_linear_on_device(self, model, device):
+        model = model.to(device)
+        x = torch.ones(7, 7, device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, None)
-        pruner.squash_mask()
-        self._check_pruner_mask_squashed(model, pruner, device)
-        assert model(x).shape == (128, 16)
+        pruner.enable_mask_update = True
+        self._check_pruner_valid_before_step(model, pruner, device)
+        pruner.step()
+        self._check_pruner_valid_after_step(model, pruner, 1, device)
 
-    def test_squash_mask_linear(self):
-        models = [Linear(), LinearB()]  # without and with bias
+    def test_step_linear(self):
+        models = [
+            SimpleLinear(),
+            LinearBias(),
+            LinearActivation(),
+            LinearActivationFunctional(),
+        ]
         for device in DEVICES:
             for model in models:
-                self._test_squash_mask_linear_on_device(model, torch.device(device))
+                self._test_step_linear_on_device(model, torch.device(device))
 
-    def _test_squash_mask_conv2d_on_device(self, model, config, device):
-        model = copy.deepcopy(model).to(device)
+    def _test_step_conv2d_on_device(self, model, expected_shape, config, device):
+        model = model.to(device)
         x = torch.ones((1, 1, 28, 28), device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, config)
-        pruner.squash_mask()
-        self._check_pruner_mask_squashed(model, pruner, device)
-        assert model(x).shape == (1, 64, 24, 24)
-
-    def test_squash_mask_conv2d(self):
+        pruner.enable_mask_update = True
+        self._check_pruner_valid_before_step(model, pruner, device)
+        pruner.step()
+        self._check_pruner_valid_after_step(model, pruner, 1, device)
+        assert model(x).shape == expected_shape
 
-        models = [Conv2dA(), Conv2dB(), Conv2dC()]
-        configs = [None, None, None]
+    @skipIfTorchDynamo("TorchDynamo fails with unknown reason")
+    def test_step_conv2d(self):
+        models = [
+            SimpleConv2d(),
+            Conv2dBias(),
+            Conv2dActivation(),
+            Conv2dPadBias(),
+            Conv2dPool(),
+        ]
+        shapes = [
+            (1, 52, 20, 20),
+            (1, 52, 18, 18),
+            (1, 52, 18, 18),
+            (1, 52, 24, 24),
+            (1, 52, 3, 3),
+        ]
+        configs = [None, None, None, None, None]
         for device in DEVICES:
-            for model, config in zip(models, configs):
-                model = model.to(device)
-                self._test_squash_mask_conv2d_on_device(model, config, torch.device(device))
+            for model, shape, config in zip(models, shapes, configs):
+                self._test_step_conv2d_on_device(
+                    model, shape, config, torch.device(device)
+                )
+
+    def _check_pruner_pruned(self, model, pruner, device):
+        for config in pruner.groups:
+            module = config["module"]
+            assert not hasattr(module, "parametrizations")
+            assert not hasattr(module, "mask")
 
-    def _test_step_linear_on_device(self, model, is_basic, device):
+    def _test_linear_on_device(
+        self, model, config, expected_shape, device, also_prune_bias
+    ):
         model = model.to(device)
-        if is_basic:
-            x = torch.ones(16, 16, device=device)
-            pruner = SimplePruner(None)
-            pruner.prepare(model, None)
-            self._check_pruner_valid_before_step(model, pruner, device)
-            pruner.step()
-            self._check_pruner_valid_after_step(model, pruner, 1, device)
-        else:
-            x = torch.ones(7, 7, device=device)
-            pruner = MultiplePruner(None)
-            pruner.prepare(model, None)
-            self._check_pruner_valid_before_step(model, pruner, device)
-            pruner.step()
-            self._check_pruner_valid_after_step(model, pruner, 2, device)
+        model.eval()
+        num_original_params = sum(p.numel() for p in model.parameters())
+        x = torch.ones(128, 7, device=device)
 
-    def test_step_linear(self):
-        basic_models = [Linear(), LinearB()]
-        complex_models = [MultipleLinear(), MultipleLinearB(), MultipleLinearMixed()]
+        pruner = ImplementedPruner({"prune_bias": also_prune_bias})
+        pruner.prepare(model, config)
+        pruner.enable_mask_update = True
+        pruner.step()
+
+        y_expected = model(x)
+
+        assert y_expected.shape == (128, 10)
+        self._check_pruner_prepared(model, pruner, device)
+
+        # Pruning step
+        pruned = pruner.prune()
+        y_pruned = pruned(x)
+        num_pruned_params = sum(p.numel() for p in pruned.parameters())
+
+        assert y_pruned.shape == expected_shape
+        self._check_pruner_pruned(model, pruner, device)
+        if y_pruned.shape == y_expected.shape:
+            assert torch.isclose(y_expected, y_pruned, rtol=1e-05, atol=1e-07).all()
+            assert num_pruned_params < num_original_params
+
+    def test_prune_linear_linear(self):
+        r"""test pruning linear-> linear modules"""
+        configs, shapes = [], []
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((128, 10))
+
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+                {"tensor_fqn": "linear1.weight"},
+            ]
+        )
+        shapes.append((128, 10))
+
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((128, 10))
         for device in DEVICES:
-            for model in basic_models:
-                self._test_step_linear_on_device(model, True, torch.device(device))
-            for model in complex_models:
-                self._test_step_linear_on_device(model, False, torch.device(device))
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_linear_on_device(
+                        SimpleLinear(),
+                        config,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_linear_bias_linear(self):
+        # linear(bias) -> linear(no bias)
+        configs, shapes = [], []
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+            ]
+        )
+        shapes.append((128, 10))
+
+        # linear(bias) -> linear(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.2.weight"},
+                {"tensor_fqn": "seq.3.weight"},
+            ]
+        )
+        shapes.append((128, 10))
+
+        # linear(no bias) -> linear(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((128, 10))
 
-    def _test_step_conv2d_on_device(self, model, config, device):
+        for device in DEVICES:
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_linear_on_device(
+                        LinearBias(),
+                        config,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_linear_activation_linear(self):
+        config = [
+            {"tensor_fqn": "seq.0.weight"},
+            {"tensor_fqn": "seq.2.weight"},
+            {"tensor_fqn": "seq.4.weight"},
+            {"tensor_fqn": "linear1.weight"},
+        ]
+        shape = (128, 10)
+
+        for device in DEVICES:
+            for also_prune_bias in [True, False]:
+                # test version with nn.Modules
+                self._test_linear_on_device(
+                    LinearActivation(),
+                    config,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
+                # test functional version
+                self._test_linear_on_device(
+                    LinearActivationFunctional(),
+                    config,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
+
+    def _test_conv2d_on_device(
+        self, model, config, x, expected_shape, device, also_prune_bias
+    ):
         model = model.to(device)
-        x = torch.ones((1, 1, 28, 28)).to(device)
-        pruner = SimplePruner(None)
+        num_original_params = sum(p.numel() for p in model.parameters())
+        model.eval()
+
+        pruner = ImplementedPruner({"prune_bias": also_prune_bias})
         pruner.prepare(model, config)
-        self._check_pruner_valid_before_step(model, pruner, device)
+        pruner.enable_mask_update = True
         pruner.step()
-        self._check_pruner_valid_after_step(model, pruner, 1, device)
-        assert model(x).shape == (1, 64, 24, 24)
+
+        y_expected = model(x)
+        assert y_expected.shape == expected_shape
+
+        self._check_pruner_prepared(model, pruner, device)
+
+        # Fusion step
+        pruned = pruner.prune()
+        y_pruned = pruned(x)
+        num_pruned_params = sum(p.numel() for p in pruned.parameters())
+
+        assert y_pruned.shape == expected_shape
+        self._check_pruner_pruned(model, pruner, device)
+        if y_pruned.shape == y_expected.shape:
+            # TODO This rtol is a little high, need to double check if something specific is causing this to fail
+            assert torch.isclose(
+                y_expected, y_pruned, rtol=1e-1
+            ).all(), f"fail for {type(model)}"
+            # only time this should be equal is when all layers have padding and we can't prune
+            assert num_pruned_params <= num_original_params
+
+    def test_prune_conv2d_conv2d(self):
+        configs, shapes = [], []
+        # all within sequential blocks
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+            ]
+        )
+        shapes.append((1, 52, 20, 20))
+        # prune across sequential blocks
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "conv2d1.weight"},
+            ]
+        )
+        shapes.append((1, 52, 20, 20))
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_conv2d_on_device(
+                        SimpleConv2d(),
+                        config,
+                        x,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_conv2d_bias_conv2d(self):
+        # Conv2d with Bias and no Activation
+        configs, shapes = [], []
+        # conv2d(bias) -> conv2d(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(no bias) -> conv2d(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "conv2d1.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(bias) -> conv2d(no bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_conv2d_on_device(
+                        Conv2dBias(),
+                        config,
+                        x,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_conv2d_activation_conv2d(self):
+        # Conv2d with Activation and no Bias
+        configs, shapes = [], []
+
+        # conv2d(no bias) -> activatation -> conv2d(no bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.4.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(bias) -> activatation -> conv2d(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(bias) -> activation -> conv2d(no bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.2.weight"},
+                {"tensor_fqn": "seq.4.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(no bias) -> activation -> conv2d(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "conv2d1.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_conv2d_on_device(
+                        Conv2dActivation(),
+                        config,
+                        x,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_conv2d_padding_conv2d(self):
+        # Conv2d with Padded layers after Bias layers
+        configs, shapes = [], []
+
+        # conv(padded, bias) -> conv(padded, bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.4.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+
+        # conv(no bias, no pad) -> conv(padded, bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+
+        # conv(padded, bias) -> conv ( no bias ,no pad)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+        # conv(pad, bias) -> conv(no pad, bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.6.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+        # conv(no pad, bias) -> conv(pad, bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.8.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_conv2d_on_device(
+                        Conv2dPadBias(),
+                        config,
+                        x,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_conv2d_pool_conv2d(self):
+        # Conv2d with Pooling layers
+        config = [
+            {"tensor_fqn": "seq.0.weight"},
+            {"tensor_fqn": "seq.3.weight"},
+            {"tensor_fqn": "conv2d1.weight"},
+            {"tensor_fqn": "conv2d2.weight"},
+        ]
+        shape = (1, 52, 3, 3)
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                self._test_conv2d_on_device(
+                    Conv2dPool(),
+                    config,
+                    x,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
 
     @skipIfTorchDynamo("TorchDynamo fails with unknown reason")
-    def test_step_conv2d(self):
-        models = [Conv2dA(), Conv2dB(), Conv2dC()]
-        configs = [None, None, None, None]
+    def test_complex_conv2d(self):
+        """Test fusion for models that contain Conv2d & Linear modules.
+        Currently supports: Conv2d-Pool2d-Flatten-Linear, Skip-add"""
+        config = [
+            {"tensor_fqn": "seq.0.weight"},
+            {"tensor_fqn": "seq.3.weight"},
+            {"tensor_fqn": "conv2d1.weight"},
+            {"tensor_fqn": "conv2d2.weight"},
+        ]
+        shape = (1, 13)
+
         for device in DEVICES:
-            for model, config in zip(models, configs):
-                self._test_step_conv2d_on_device(model, config, torch.device(device))
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                self._test_conv2d_on_device(
+                    Conv2dPoolFlattenFunctional(),
+                    config,
+                    x,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
+                self._test_conv2d_on_device(
+                    Conv2dPoolFlatten(),
+                    config,
+                    x,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
diff --git a/torch/ao/pruning/_experimental/pruner/__init__.py b/torch/ao/pruning/_experimental/pruner/__init__.py
index e9b17f6c7aad7..d762873277493 100644
--- a/torch/ao/pruning/_experimental/pruner/__init__.py
+++ b/torch/ao/pruning/_experimental/pruner/__init__.py
@@ -3,9 +3,3 @@
     FakeStructuredSparsity,
     BiasHook,
 )
-
-__all__ = [
-    "FakeStructuredSparsity",
-    "BaseStructuredSparsifier",
-    "BiasHook",
-]
diff --git a/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
index e753d2a6d88da..3b568f1557d07 100644
--- a/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
+++ b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
@@ -1,17 +1,193 @@
-from typing import Set, Type
+from itertools import chain
 import torch
+import torch.nn.functional as F
 from torch import nn
+from torch.fx import symbolic_trace
 from torch.nn.utils import parametrize
+from typing import Type, Set, Dict, Callable, Tuple, Optional, Union
 
 from torch.ao.pruning import BaseSparsifier
 from .parametrization import FakeStructuredSparsity, BiasHook
+from .match_utils import apply_match
+from .prune_functions import (
+    prune_linear,
+    prune_linear_linear,
+    prune_linear_activation_linear,
+    prune_conv2d,
+    prune_conv2d_conv2d,
+    prune_conv2d_activation_conv2d,
+    prune_conv2d_activation_pool_conv2d,
+    prune_conv2d_pool_activation_conv2d,
+    prune_conv2d_pool_flatten_linear,
+)
 
-__all__ = ["BaseStructuredSparsifier"]
 
-SUPPORTED_STRUCTURED_PRUNING_MODULES = {  # added to config if None given
-    nn.Linear,
-    nn.Conv2d,
-}
+def _get_supported_structured_pruning_modules():
+    SUPPORTED_STRUCTURED_PRUNING_MODULES = {  # added to config if None given
+        nn.Linear,
+        nn.Conv2d,
+    }
+    return SUPPORTED_STRUCTURED_PRUNING_MODULES
+
+
+def _get_supported_activation_functions():
+    SUPPORTED_ACTIVATION_FUNCTIONS = {
+        F.relu,
+        F.rrelu,
+        F.hardtanh,
+        F.relu6,
+        F.sigmoid,
+        F.hardsigmoid,
+        F.tanh,
+        F.silu,
+        F.mish,
+        F.hardswish,
+        F.elu,
+        F.celu,
+        F.selu,
+        F.hardshrink,
+        F.leaky_relu,
+        F.logsigmoid,
+        F.softplus,
+        F.prelu,
+        F.softsign,
+        F.tanhshrink,
+    }
+    return SUPPORTED_ACTIVATION_FUNCTIONS
+
+
+def _get_supported_activation_modules():
+    SUPPORTED_ACTIVATION_MODULES = {
+        nn.ReLU,
+        nn.RReLU,
+        nn.Hardtanh,
+        nn.ReLU6,
+        nn.Sigmoid,
+        nn.Hardsigmoid,
+        nn.Tanh,
+        nn.SiLU,
+        nn.Mish,
+        nn.Hardswish,
+        nn.ELU,
+        nn.CELU,
+        nn.SELU,
+        nn.Hardshrink,
+        nn.LeakyReLU,
+        nn.LogSigmoid,
+        nn.Softplus,
+        nn.PReLU,
+        nn.Softsign,
+        nn.Tanhshrink,
+    }
+    return SUPPORTED_ACTIVATION_MODULES
+
+
+def _get_default_structured_pruning_patterns() -> Dict[
+    Tuple[Union[Type[nn.Module], Callable[[torch.Tensor], torch.Tensor], str], ...],
+    Callable[..., None],
+]:
+    """
+    Returns the patterns for conv2d / linear conversion for each element in the activation functions/modules defined above.
+    """
+    patterns: Dict[
+        Tuple[Union[Type[nn.Module], Callable[[torch.Tensor], torch.Tensor], str], ...],
+        Callable[..., None],
+    ] = {
+        # linear -> linear
+        (nn.Linear, "output"): prune_linear,
+        (nn.Linear, nn.Linear): prune_linear_linear,
+        # conv2d -> conv2d
+        (nn.Conv2d, "output"): prune_conv2d,
+        (nn.Conv2d, nn.Conv2d): prune_conv2d_conv2d,
+    }
+
+    for activation in chain(
+        _get_supported_activation_functions(), _get_supported_activation_modules()
+    ):
+        patterns.update(
+            {
+                # linear -> activation -> linear
+                (nn.Linear, activation, nn.Linear): prune_linear_activation_linear,
+                # conv2d -> activation -> conv2d
+                (nn.Conv2d, activation, nn.Conv2d): prune_conv2d_activation_conv2d,
+                # conv2d -> activation -> pool -> conv2d
+                (
+                    nn.Conv2d,
+                    activation,
+                    nn.AvgPool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                (
+                    nn.Conv2d,
+                    activation,
+                    F.avg_pool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                (
+                    nn.Conv2d,
+                    activation,
+                    nn.MaxPool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                (
+                    nn.Conv2d,
+                    activation,
+                    F.max_pool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                # conv2d -> pool -> activation -> conv2d
+                (
+                    nn.Conv2d,
+                    nn.AvgPool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                (
+                    nn.Conv2d,
+                    F.avg_pool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                (
+                    nn.Conv2d,
+                    nn.MaxPool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                (
+                    nn.Conv2d,
+                    F.max_pool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                # conv2d -> adaptive pool -> flatten -> linear
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveAvgPool2d,
+                    nn.Flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveAvgPool2d,
+                    torch.flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveMaxPool2d,
+                    nn.Flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveMaxPool2d,
+                    torch.flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+            }
+        )
+    return patterns
 
 
 class BaseStructuredSparsifier(BaseSparsifier):
@@ -27,17 +203,20 @@ class BaseStructuredSparsifier(BaseSparsifier):
             be updated.
     """
 
-    def __init__(self, defaults):
+    def __init__(self, defaults, patterns=None):
         super().__init__(defaults)
+        if patterns is None:
+            patterns = _get_default_structured_pruning_patterns()
+        self.patterns = patterns
 
     def make_config_from_model(
         self,
         model: nn.Module,
-        SUPPORTED_MODULES: Set[Type] = SUPPORTED_STRUCTURED_PRUNING_MODULES,
+        SUPPORTED_MODULES: Optional[Set[Type]] = None,
     ) -> None:
-        super().make_config_from_model(
-            model, SUPPORTED_MODULES=SUPPORTED_STRUCTURED_PRUNING_MODULES
-        )
+        if SUPPORTED_MODULES is None:
+            SUPPORTED_MODULES = _get_supported_structured_pruning_modules()
+        super().make_config_from_model(model, SUPPORTED_MODULES=SUPPORTED_MODULES)
 
     def _prepare(self, *args, **kwargs) -> None:
         r"""This function will attach the FakeStructuredSparsity parameterizations
@@ -57,18 +236,59 @@ def _prepare(self, *args, **kwargs) -> None:
             )
             self.state[config["tensor_fqn"]]["mask"] = mask
             parametrize.register_parametrization(
-                module, tensor_name, parametrization(mask), unsafe=True
+                module, tensor_name, parametrization(mask)
             )
-
             prune_bias = config.get("prune_bias", True)
-            if prune_bias and module.bias is not None:
+            if module.bias is not None:
                 module.register_parameter("_bias", nn.Parameter(module.bias.detach()))
                 module.bias = None
+                module.prune_bias = prune_bias
+
             self.bias_handles.append(
                 module.register_forward_hook(
                     BiasHook(module.parametrizations.weight[0], prune_bias)
                 )
             )
 
-    def convert(self):
-        pass
+    def prune(self) -> None:
+        r"""
+        This function will FX symbolically trace the model and then find instances of the patterns
+        defined in self.patterns (by default SUPPORTED_STRUCTURED_PRUNING_PATTERNS ).
+
+        For each pattern, it will apply to corresponding conversion function, which will modify the output
+        and input size expected by the modules within the pattern
+        """
+
+        self.traced = symbolic_trace(self.model)
+        modules = dict(self.traced.named_modules())
+
+        # Right now we check for matches simply by iterating across all the patterns
+        # if this is slow we can store patterns in a trie-structure and modify this code for faster lookup
+
+        for node in self.traced.graph.nodes:
+            for pattern, convert_fn in self.patterns.items():
+                matched = apply_match(modules, pattern, node, [])
+                if matched is None:
+                    continue
+
+                first_module = modules.get(node.target)
+                # check if first module exists and has apropriate parameterization, otherwise skip
+                if (
+                    first_module is not None
+                    and parametrize.is_parametrized(first_module)
+                    and isinstance(
+                        first_module.parametrizations["weight"][0],
+                        FakeStructuredSparsity,
+                    )
+                ):
+                    convert_block = []
+                    for node in matched:
+                        if node.op == "call_module":
+                            convert_block.append(modules.get(node.target))
+                        elif node.op == "call_function":
+                            convert_block.append(node.target)
+                    convert_fn(*convert_block)
+
+        self.traced.graph.lint()
+        self.traced.recompile()
+        return self.traced
diff --git a/torch/ao/pruning/_experimental/pruner/match_utils.py b/torch/ao/pruning/_experimental/pruner/match_utils.py
new file mode 100644
index 0000000000000..f712a99ec535e
--- /dev/null
+++ b/torch/ao/pruning/_experimental/pruner/match_utils.py
@@ -0,0 +1,59 @@
+"""
+Contains utility functions to check if a pattern is in the graph and return the matching nodes
+"""
+import torch
+from torch import nn
+from torch.ao.quantization.fx.match_utils import (
+    MatchAllNode,
+)
+from torch.fx import Node
+from torch.nn.utils import parametrize
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+def _match(modules: Dict[str, nn.ModuleDict], node: Node, current: Union[nn.Module, Any]) -> bool:
+    r"""
+    checks to see if a single node of a pattern matches
+    """
+    if isinstance(current, type) and issubclass(current, MatchAllNode):
+        return True
+    if not isinstance(node, Node):
+        return False
+    if isinstance(current, type) and issubclass(current, torch.nn.Module):
+        return (
+            node.op == "call_module"
+            and parametrize.type_before_parametrizations(modules[node.target])
+            == current
+        )
+    elif callable(current):
+        return node.op == "call_function" and node.target is current
+    elif isinstance(current, str):
+        return node.target == current
+    return False
+
+def apply_match(
+    modules: Dict[str, nn.ModuleDict],
+    pattern: Union[Tuple[Any], Any],
+    node: Node,
+    matched_node_pattern: List[Node],
+) -> Optional[List[Node]]:
+    r"""
+    This function will return the matched nodes if the pattern matches the node given
+    If there is no match, it will return None
+    """
+    if isinstance(pattern, tuple):
+        if len(pattern) == 1:
+            if _match(modules, node, pattern[0]):
+                return matched_node_pattern + [node]
+
+        first, *rest = pattern
+        if _match(modules, node, first):
+            if rest is None:
+                return matched_node_pattern + [node]
+
+            for user in node.users:
+                return apply_match(
+                    modules, tuple(rest), user, matched_node_pattern + [node]
+                )
+    elif _match(modules, node, pattern):
+        return [node]
+    return None
diff --git a/torch/ao/pruning/_experimental/pruner/parametrization.py b/torch/ao/pruning/_experimental/pruner/parametrization.py
index 2ea59d48ee809..aeddd0a841525 100644
--- a/torch/ao/pruning/_experimental/pruner/parametrization.py
+++ b/torch/ao/pruning/_experimental/pruner/parametrization.py
@@ -1,7 +1,6 @@
 import torch
 from torch import nn
 
-__all__ = ['FakeStructuredSparsity', 'BiasHook']
 
 
 # Structured Pruning Parameterizations
diff --git a/torch/ao/pruning/_experimental/pruner/prune_functions.py b/torch/ao/pruning/_experimental/pruner/prune_functions.py
new file mode 100644
index 0000000000000..ee8bffb7f9f3e
--- /dev/null
+++ b/torch/ao/pruning/_experimental/pruner/prune_functions.py
@@ -0,0 +1,359 @@
+"""
+Collection of conversion functions for linear / conv2d structured pruning
+Also contains utilities for bias propogation
+"""
+from typing import cast, Optional, Callable, Tuple
+
+import torch
+from torch import nn, Tensor
+from torch.nn.utils import parametrize
+from torch.nn.utils.parametrize import ParametrizationList
+from .parametrization import FakeStructuredSparsity, BiasHook
+
+
+# BIAS PROPOGATION
+def _remove_bias_handles(module: nn.Module) -> None:
+    if hasattr(module, "_forward_hooks"):
+        bias_hooks = []
+        for key, hook in module._forward_hooks.items():
+            if isinstance(hook, BiasHook):
+                bias_hooks.append(key)
+
+        for key in bias_hooks:
+            del module._forward_hooks[key]
+
+
+def _get_adjusted_next_layer_bias(
+    next_layer: nn.Module, pruned_biases: Tensor, mask: Tensor
+) -> nn.Parameter:
+    r"""Returns new adjusted bias for the second supported module"""
+    if parametrize.is_parametrized(next_layer):
+        # need to access original weight
+        parametrization_dict = cast(nn.ModuleDict, next_layer.parametrizations)
+        weight_parameterizations = cast(
+            ParametrizationList, parametrization_dict.weight
+        )
+        next_weight = weight_parameterizations.original
+    else:
+        next_weight = cast(Tensor, next_layer.weight)
+
+    scaling_weight = next_weight[:, ~mask]
+    if isinstance(next_layer, nn.Conv2d):  # checking for Conv2d
+        # Propagating first layer pruned biases and calculating the new second layer bias
+        # involves more steps since the Conv2d scaling weight has extra dimensions,
+        # so adding bias involves broadcasting, logically:
+        # for each channel k in range(oC):
+        #     scaled_biases = sum(first_bias[pruned_idx] @ next_weight[k, pruned_idx, :, :].T)
+        #     new_next_bias[k] = old_next_bias[k] + scaled_biases
+        scaling_product = torch.matmul(
+            pruned_biases.reshape(1, -1), torch.transpose(scaling_weight, 1, 2)
+        )
+        sum_range = list(range(len(scaling_product.shape)))[
+            1:
+        ]  # all but the first dimension
+        scaled_biases = torch.sum(scaling_product, sum_range)
+    elif isinstance(next_layer, nn.Linear):  # Linear
+        scaled_biases = torch.matmul(
+            pruned_biases, torch.transpose(scaling_weight, 0, 1)
+        )  # recall b2_new = b1 @ w2.T + b2
+    else:
+        raise NotImplementedError(f"Type {type(next_layer)} not supported yet.")
+
+    if (
+        parametrize.is_parametrized(next_layer)
+        and getattr(next_layer, "_bias", None) is not None
+    ):  # next_layer is parametrized & has original bias ._bias
+        adjusted_bias = nn.Parameter(scaled_biases + next_layer._bias)
+    elif (
+        not parametrize.is_parametrized(next_layer) and next_layer.bias is not None
+    ):  # next_layer not parametrized & has .bias
+        adjusted_bias = nn.Parameter(scaled_biases + next_layer.bias)
+    else:  # next_layer has no bias
+        adjusted_bias = nn.Parameter(scaled_biases)
+    return adjusted_bias
+
+
+def _prune_module_bias(module: nn.Module, mask: Tensor) -> None:
+    r"""Applies mask to given modules bias"""
+    # prune bias along with weights, discard pruned indices of bias
+    original_bias = cast(Tensor, getattr(module, "_bias", module.bias))
+    if original_bias is not None:
+        module.bias = nn.Parameter(original_bias[mask])
+
+    #  remove _bias parameter
+    if hasattr(module, "_bias"):
+        delattr(module, "_bias")
+
+
+def _propogate_module_bias(module: nn.Module, mask: Tensor) -> Optional[Tensor]:
+    r"""
+    In the case that we need to propogate biases, this function will return the biases we need
+    """
+    # set current module bias
+    if module.bias is not None:
+        module.bias = nn.Parameter(cast(Tensor, module.bias)[mask])
+    elif getattr(module, "_bias", None) is not None:
+        module.bias = nn.Parameter(cast(Tensor, module._bias)[mask])
+
+    # get pruned biases to propogate to subsequent layer
+    if getattr(module, "_bias", None) is not None:
+        pruned_biases = cast(Tensor, module._bias)[~mask]
+    else:
+        pruned_biases = None
+
+    if hasattr(module, "_bias"):
+        delattr(module, "_bias")
+
+    return pruned_biases
+
+
+# LINEAR
+def _prune_linear_helper(linear: nn.Linear) -> Tensor:
+    # expects linear to be a parameterized linear module
+    parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    with torch.no_grad():
+        parametrize.remove_parametrizations(linear, "weight", leave_parametrized=True)
+        linear.weight = nn.Parameter(linear.weight[mask])
+    linear.out_features = linear.weight.shape[0]
+    _remove_bias_handles(linear)
+
+    return mask
+
+
+def prune_linear(linear: nn.Linear) -> None:
+    mask = _prune_linear_helper(linear)
+    if getattr(linear, "prune_bias", False):
+        _prune_module_bias(linear, mask)
+
+
+def prune_linear_linear(linear1: nn.Linear, linear2: nn.Linear) -> None:
+    prune_linear_activation_linear(linear1, None, linear2)
+
+
+def prune_linear_activation_linear(
+    linear1: nn.Linear,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    linear2: nn.Linear,
+):
+    mask = _prune_linear_helper(linear1)
+    if getattr(linear1, "prune_bias", False):
+        _prune_module_bias(linear1, mask)
+    else:
+        pruned_biases = _propogate_module_bias(linear1, mask)
+        if pruned_biases is not None:
+            if activation:
+                pruned_biases = activation(pruned_biases)
+            linear2.bias = _get_adjusted_next_layer_bias(linear2, pruned_biases, mask)
+
+    with torch.no_grad():
+        if parametrize.is_parametrized(linear2):
+            parametrization_dict = cast(nn.ModuleDict, linear2.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict.weight
+            )
+
+            weight_parameterizations.original = nn.Parameter(
+                weight_parameterizations.original[:, mask]
+            )
+            linear2.in_features = weight_parameterizations.original.shape[1]
+        else:
+            linear2.weight = nn.Parameter(linear2.weight[:, mask])
+            linear2.in_features = linear2.weight.shape[1]
+
+
+# CONV2D
+def _prune_conv2d_helper(conv2d: nn.Conv2d) -> Tensor:
+    parametrization_dict = cast(nn.ModuleDict, conv2d.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    with torch.no_grad():
+        parametrize.remove_parametrizations(conv2d, "weight", leave_parametrized=True)
+        conv2d.weight = nn.Parameter(conv2d.weight[mask])
+    conv2d.out_channels = conv2d.weight.shape[0]
+
+    _remove_bias_handles(conv2d)
+    return mask
+
+
+def prune_conv2d_padded(conv2d_1: nn.Conv2d) -> None:
+    parametrization_dict = cast(nn.ModuleDict, conv2d_1.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    with torch.no_grad():
+        parametrize.remove_parametrizations(conv2d_1, "weight", leave_parametrized=True)
+
+    if getattr(conv2d_1, "_bias", None) is not None:
+        if (
+            conv2d_1.bias is not None
+        ):  # conv2d_1 has original bias and bias propagated from previous layer
+            new_bias = torch.zeros(conv2d_1.bias.shape)
+            new_bias[mask] = conv2d_1.bias[mask]
+            # adjusted bias that to keep in conv2d_1
+            new_bias[~mask] = cast(Tensor, conv2d_1._bias)[~mask]
+            # pruned biases that are kept instead of propagated
+            conv2d_1.bias = nn.Parameter(new_bias)
+        else:  # conv2d_1 has only original bias
+            conv2d_1.bias = nn.Parameter(cast(Tensor, conv2d_1._bias))
+    else:
+        # no original bias, only propagated bias
+        if (
+            conv2d_1.bias is not None
+        ):  # conv2d_1 has bias propagated from previous layer
+            conv2d_1.bias.data[~mask] = 0
+
+    if hasattr(conv2d_1, "_bias"):
+        delattr(conv2d_1, "_bias")
+
+
+def prune_conv2d(conv2d: nn.Conv2d) -> None:
+    mask = _prune_conv2d_helper(conv2d)
+    if getattr(conv2d, "prune_bias", False):
+        _prune_module_bias(conv2d, mask)
+
+
+def prune_conv2d_conv2d(conv2d_1: nn.Conv2d, conv2d_2: nn.Conv2d) -> None:
+    prune_conv2d_activation_conv2d(conv2d_1, None, conv2d_2)
+
+
+def prune_conv2d_activation_conv2d(
+    conv2d_1: nn.Conv2d,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    conv2d_2: nn.Conv2d,
+):
+    r"""
+    Fusion Pattern for conv2d -> some activation module / function -> conv2d layers
+    """
+    parametrization_dict = cast(nn.ModuleDict, conv2d_1.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    prune_bias = getattr(conv2d_1, "prune_bias", False)
+    if (
+        hasattr(conv2d_2, "padding")
+        and cast(Tuple[int], conv2d_2.padding) > (0, 0)
+        and (conv2d_1.bias is not None or getattr(conv2d_1, "_bias", None) is not None)
+    ):
+        prune_conv2d_padded(conv2d_1)
+    else:
+        mask = _prune_conv2d_helper(conv2d_1)
+        if prune_bias:
+            _prune_module_bias(conv2d_1, mask)
+        else:
+            pruned_biases = _propogate_module_bias(conv2d_1, mask)
+            if pruned_biases is not None:
+                if activation:
+                    pruned_biases = activation(pruned_biases)
+                conv2d_2.bias = _get_adjusted_next_layer_bias(
+                    conv2d_2, pruned_biases, mask
+                )
+
+        if (
+            not (
+                hasattr(conv2d_2, "padding")
+                and cast(Tuple[int], conv2d_2.padding) > (0, 0)
+            )
+            or conv2d_1.bias is None
+        ):
+            with torch.no_grad():
+                if parametrize.is_parametrized(conv2d_2):
+                    parametrization_dict = cast(
+                        nn.ModuleDict, conv2d_2.parametrizations
+                    )
+                    weight_parameterizations = cast(
+                        ParametrizationList, parametrization_dict.weight
+                    )
+                    weight_parameterizations.original = nn.Parameter(
+                        weight_parameterizations.original[:, mask]
+                    )
+                    conv2d_2.in_channels = weight_parameterizations.original.shape[1]
+                else:
+                    conv2d_2.weight = nn.Parameter(conv2d_2.weight[:, mask])
+                    conv2d_2.in_channels = conv2d_2.weight.shape[1]
+
+
+def prune_conv2d_pool_activation_conv2d(
+    c1: nn.Conv2d,
+    pool: nn.Module,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    c2: nn.Conv2d,
+) -> None:
+    prune_conv2d_activation_conv2d(c1, activation, c2)
+
+
+def prune_conv2d_activation_pool_conv2d(
+    c1: nn.Conv2d,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    pool: nn.Module,
+    c2: nn.Conv2d,
+) -> None:
+    prune_conv2d_activation_conv2d(c1, activation, c2)
+
+
+def prune_conv2d_pool_flatten_linear(
+    conv2d: nn.Conv2d,
+    pool: nn.Module,
+    flatten: Optional[Callable[[Tensor], Tensor]],
+    linear: nn.Linear,
+) -> None:
+    mask = _prune_conv2d_helper(conv2d)
+
+    # We map the pruned indices of the Conv2d output to the flattened indices of the Linear following the Flatten layer.
+    # we determine the flattening scale (h * w), and readjust `first_pruned_indices`
+    # (each idx maps to range idx * h * w to (idx+1) * h * w), `first_valid_indices`,
+    # and `pruned_biases` (repeat each bias by h * w).
+    if parametrize.is_parametrized(linear):
+        parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
+        weight_parameterizations = cast(
+            ParametrizationList, parametrization_dict.weight
+        )
+        linear_ic = weight_parameterizations.original.shape[1]
+    else:
+        linear_ic = linear.weight.shape[1]
+
+    conv2d_oc = len(mask)
+    assert (
+        linear_ic % conv2d_oc == 0
+    ), f"Flattening from dimensions {conv2d_oc} to {linear_ic} not supported"
+
+    flatten_scale = linear_ic // conv2d_oc
+    flattened_mask = torch.tensor(
+        [[val] * flatten_scale for val in mask], dtype=torch.bool, device=mask.device
+    ).flatten()
+
+    if getattr(conv2d, "prune_bias", False):
+        _prune_module_bias(conv2d, mask)
+    else:
+        pruned_biases = cast(Tensor, _propogate_module_bias(conv2d, mask))
+        flattened_pruned_biases = torch.tensor(
+            [[bias] * flatten_scale for bias in pruned_biases], device=mask.device
+        ).flatten()
+        linear.bias = _get_adjusted_next_layer_bias(
+            linear, flattened_pruned_biases, flattened_mask
+        )
+
+    with torch.no_grad():
+        if parametrize.is_parametrized(linear):
+            parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict.weight
+            )
+            weight_parameterizations.original = nn.Parameter(
+                weight_parameterizations.original[:, flattened_mask]
+            )
+            linear.in_features = weight_parameterizations.original.shape[1]
+        else:
+            linear.weight = nn.Parameter(linear.weight[:, flattened_mask])
+            linear.in_features = linear.weight.shape[1]
diff --git a/torch/testing/_internal/common_pruning.py b/torch/testing/_internal/common_pruning.py
new file mode 100644
index 0000000000000..8fc08ee2a41bf
--- /dev/null
+++ b/torch/testing/_internal/common_pruning.py
@@ -0,0 +1,311 @@
+# -*- coding: utf-8 -*-
+# Owner(s): ["module: unknown"]
+
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class SimpleLinear(nn.Module):
+    r"""Model with only Linear layers without biases, some wrapped in a Sequential,
+    some following the Sequential. Used to test basic pruned Linear-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=False),
+            nn.Linear(5, 6, bias=False),
+            nn.Linear(6, 4, bias=False),
+        )
+        self.linear1 = nn.Linear(4, 3, bias=False)
+        self.linear2 = nn.Linear(3, 10, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear1(x)
+        x = self.linear2(x)
+        return x
+
+
+class LinearBias(nn.Module):
+    r"""Model with only Linear layers, alternating layers with biases,
+    wrapped in a Sequential. Used to test pruned Linear-Bias-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=True),
+            nn.Linear(5, 6, bias=False),
+            nn.Linear(6, 3, bias=True),
+            nn.Linear(3, 3, bias=True),
+            nn.Linear(3, 10, bias=False),
+        )
+
+    def forward(self, x):
+        x = self.seq(x)
+        return x
+
+
+class LinearActivation(nn.Module):
+    r"""Model with only Linear layers, some with bias, some in a Sequential and some following.
+    Activation functions modules in between each Linear in the Sequential, and each outside layer.
+    Used to test pruned Linear(Bias)-Activation-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=True),
+            nn.ReLU(),
+            nn.Linear(5, 6, bias=False),
+            nn.Tanh(),
+            nn.Linear(6, 4, bias=True),
+        )
+        self.linear1 = nn.Linear(4, 3, bias=True)
+        self.act1 = nn.ReLU()
+        self.linear2 = nn.Linear(3, 10, bias=False)
+        self.act2 = nn.Tanh()
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear1(x)
+        x = self.act1(x)
+        x = self.linear2(x)
+        x = self.act2(x)
+        return x
+
+
+class LinearActivationFunctional(nn.Module):
+    r"""Model with only Linear layers, some with bias, some in a Sequential and some following.
+    Activation functions modules in between each Linear in the Sequential, and functional
+    activationals are called in between each outside layer.
+    Used to test pruned Linear(Bias)-Activation-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=True),
+            nn.ReLU(),
+            nn.Linear(5, 6, bias=False),
+            nn.ReLU(),
+            nn.Linear(6, 4, bias=True),
+        )
+        self.linear1 = nn.Linear(4, 3, bias=True)
+        self.linear2 = nn.Linear(3, 8, bias=False)
+        self.linear3 = nn.Linear(8, 10, bias=False)
+        self.act1 = nn.ReLU()
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear1(x)
+        x = F.relu(x)
+        x = self.linear2(x)
+        x = F.relu(x)
+        x = self.linear3(x)
+        x = F.relu(x)
+        return x
+
+
+class SimpleConv2d(nn.Module):
+    r"""Model with only Conv2d layers, all without bias, some in a Sequential and some following.
+    Used to test pruned Conv2d-Conv2d fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=False),
+            nn.Conv2d(32, 64, 3, 1, bias=False),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, bias=False)
+        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = self.conv2d2(x)
+        return x
+
+
+class Conv2dBias(nn.Module):
+    r"""Model with only Conv2d layers, some with bias, some in a Sequential and some outside.
+    Used to test pruned Conv2d-Bias-Conv2d fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=True),
+            nn.Conv2d(32, 32, 3, 1, bias=True),
+            nn.Conv2d(32, 64, 3, 1, bias=False),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, bias=True)
+        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = self.conv2d2(x)
+        return x
+
+
+class Conv2dActivation(nn.Module):
+    r"""Model with only Conv2d layers, some with bias, some in a Sequential and some following.
+    Activation function modules in between each Sequential layer, functional activations called
+    in-between each outside layer.
+    Used to test pruned Conv2d-Bias-Activation-Conv2d fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=True),
+            nn.ReLU(),
+            nn.Conv2d(32, 64, 3, 1, bias=True),
+            nn.Tanh(),
+            nn.Conv2d(64, 64, 3, 1, bias=False),
+            nn.ReLU(),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, bias=False)
+        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = F.relu(x)
+        x = self.conv2d2(x)
+        x = F.hardtanh(x)
+        return x
+
+
+class Conv2dPadBias(nn.Module):
+    r"""Model with only Conv2d layers, all with bias and some with padding > 0,
+    some in a Sequential and some following. Activation function modules in between each layer.
+    Used to test that bias is propagated correctly in the special case of
+    pruned Conv2d-Bias-(Activation)Conv2d fusion, when the second Conv2d layer has padding > 0."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, padding=1, bias=True),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, 3, 1, bias=False),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, 3, 1, padding=1, bias=True),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, 3, 1, padding=1, bias=True),
+            nn.ReLU(),
+            nn.Conv2d(32, 64, 3, 1, bias=True),
+            nn.Tanh(),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, padding=1, bias=True)
+        self.act1 = nn.ReLU()
+        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, padding=1, bias=True)
+        self.act2 = nn.Tanh()
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = self.act1(x)
+        x = self.conv2d2(x)
+        x = self.act2(x)
+        return x
+
+
+class Conv2dPool(nn.Module):
+    r"""Model with only Conv2d layers, all with bias, some in a Sequential and some following.
+    Activation function modules in between each layer, Pool2d modules in between each layer.
+    Used to test pruned Conv2d-Pool2d-Conv2d fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, kernel_size=3, padding=1, bias=True),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
+            nn.ReLU(),
+            nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=True),
+            nn.Tanh(),
+            nn.AvgPool2d(kernel_size=2, stride=2, padding=1),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, kernel_size=3, padding=1, bias=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2, padding=1)
+        self.af1 = nn.ReLU()
+        self.conv2d2 = nn.Conv2d(48, 52, kernel_size=3, padding=1, bias=True)
+        self.conv2d3 = nn.Conv2d(52, 52, kernel_size=3, padding=1, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = self.maxpool(x)
+        x = self.af1(x)
+        x = self.conv2d2(x)
+        x = F.avg_pool2d(x, kernel_size=2, stride=2, padding=1)
+        x = F.relu(x)
+        x = self.conv2d3(x)
+        return x
+
+
+class Conv2dPoolFlattenFunctional(nn.Module):
+    r"""Model with Conv2d layers, all with bias, some in a Sequential and some following, and then a Pool2d
+    and a functional Flatten followed by a Linear layer.
+    Activation functions and Pool2ds in between each layer also.
+    Used to test pruned Conv2d-Pool2d-Flatten-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 3, kernel_size=3, padding=1, bias=True),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
+            nn.ReLU(),
+            nn.Conv2d(3, 5, kernel_size=3, padding=1, bias=True),
+            nn.Tanh(),
+            nn.AvgPool2d(kernel_size=2, stride=2, padding=1),
+        )
+        self.conv2d1 = nn.Conv2d(5, 7, kernel_size=3, padding=1, bias=True)
+        self.af1 = nn.ReLU()
+        self.conv2d2 = nn.Conv2d(7, 11, kernel_size=3, padding=1, bias=True)
+        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+        self.fc = nn.Linear(11, 13, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = F.max_pool2d(x, kernel_size=2, stride=2, padding=1)
+        x = self.af1(x)
+        x = self.conv2d2(x)
+        x = self.avg_pool(x)
+        x = torch.flatten(x, 1)  # test functional flatten
+        x = self.fc(x)
+        return x
+
+
+class Conv2dPoolFlatten(nn.Module):
+    r"""Model with Conv2d layers, all with bias, some in a Sequential and some following, and then a Pool2d
+    and a Flatten module followed by a Linear layer.
+    Activation functions and Pool2ds in between each layer also.
+    Used to test pruned Conv2d-Pool2d-Flatten-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 3, kernel_size=3, padding=1, bias=True),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
+            nn.ReLU(),
+            nn.Conv2d(3, 5, kernel_size=3, padding=1, bias=True),
+            nn.Tanh(),
+            nn.AvgPool2d(kernel_size=2, stride=2, padding=1),
+        )
+        self.conv2d1 = nn.Conv2d(5, 7, kernel_size=3, padding=1, bias=True)
+        self.af1 = nn.ReLU()
+        self.conv2d2 = nn.Conv2d(7, 11, kernel_size=3, padding=1, bias=True)
+        self.avg_pool = nn.AdaptiveAvgPool2d((2, 2))
+        self.flatten = nn.Flatten()
+        self.fc = nn.Linear(44, 13, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = F.max_pool2d(x, kernel_size=2, stride=2, padding=1)
+        x = self.af1(x)
+        x = self.conv2d2(x)
+        x = self.avg_pool(x)
+        x = self.flatten(x)
+        x = self.fc(x)
+        return x

From d6f0a594d9b92e2a203c8edf30136b6cb32210df Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 7 Dec 2022 18:10:44 +0000
Subject: [PATCH 1670/1922] Make Transformers compilable by C++17 (#90389)

`register` keyword is removed in C++17, but keeping it there under ifdef
as I have not measured the perf implication on older compiler, though
there shouldn't be any: all modern compilers supposed to downright
ignore it.

This code originates from https://github.com/facebookresearch/xformers/pull/375 will propose similar PR to remove register keyword usage to that repo.

Yet another thing discovered while working on https://github.com/pytorch/pytorch/pull/85969

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90389
Approved by: https://github.com/drisspg
---
 .../transformers/cuda/mem_eff_attention/kernel_backward.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
index c9652c40d38e4..e629aaaecab4b 100644
--- a/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
+++ b/aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h
@@ -687,7 +687,11 @@ struct AttentionBackwardKernel {
   static CUTLASS_DEVICE void kernel(Params& p_) {
     // Hint to nvcc to store points & tensor shapes in registers
     // as we use them a lot
+#if __cplusplus < 201703L
     register const Params p = p_;
+#else
+    const Params p = p_;
+#endif
 
     extern __shared__ char smem_buffer[];
     SharedStorage& shared_storage = *((SharedStorage*)smem_buffer);
@@ -721,7 +725,11 @@ struct AttentionBackwardKernel {
       __syncthreads();
     }
 
+#if __cplusplus < 201703L
     OutputFragments register output_frags;
+#else
+    OutputFragments output_frags;
+#endif
     int32_t key_start = 0;
     int32_t key_end = p.num_keys / kBlockSizeJ * kBlockSizeJ;
     for (; key_start < key_end; key_start += kBlockSizeJ) {

From d58f6dcb40710d1ee7eecf5d1e64cd6cce8518fa Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Wed, 7 Dec 2022 07:48:41 -0800
Subject: [PATCH 1671/1922] [vmap] Prepend "legacy" to files for old vmap
 implementation (#90324)

We have an older torch.vmap implementation. It is no longer supported.
It still needs to exist somewhere for the sake of BC with
torch.autograd.functional.

This PR makes it clear what files are meant for implementing the old
vmap implementation. I've seen a couple of PRs recently adding support
for the old vmap implementation, so this will lessen the confusion.

Test Plan:
- CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90324
Approved by: https://github.com/samdow
---
 .jenkins/pytorch/test.sh                                  | 2 +-
 .../{BatchedFallback.cpp => LegacyBatchedFallback.cpp}    | 4 ++--
 .../ATen/{BatchedFallback.h => LegacyBatchedFallback.h}   | 0
 ...{BatchedTensorImpl.cpp => LegacyBatchedTensorImpl.cpp} | 2 +-
 .../{BatchedTensorImpl.h => LegacyBatchedTensorImpl.h}    | 0
 ...gRegistrations.cpp => LegacyBatchingRegistrations.cpp} | 4 ++--
 aten/src/ATen/{VmapMode.cpp => LegacyVmapMode.cpp}        | 2 +-
 aten/src/ATen/{VmapMode.h => LegacyVmapMode.h}            | 0
 .../ATen/{VmapTransforms.cpp => LegacyVmapTransforms.cpp} | 2 +-
 .../src/ATen/{VmapTransforms.h => LegacyVmapTransforms.h} | 2 +-
 aten/src/ATen/native/{Batching.cpp => LegacyBatching.cpp} | 4 ++--
 aten/src/ATen/test/CMakeLists.txt                         | 2 +-
 .../src/ATen/test/{vmap_test.cpp => legacy_vmap_test.cpp} | 4 ++--
 aten/tools/run_tests.sh                                   | 2 +-
 build_variables.bzl                                       | 8 ++++----
 test/{test_vmap.py => test_legacy_vmap.py}                | 0
 torch/csrc/Module.cpp                                     | 2 +-
 torch/csrc/autograd/FunctionsManual.cpp                   | 2 +-
 torch/csrc/autograd/functions/accumulate_grad.h           | 2 +-
 torch/csrc/autograd/input_buffer.cpp                      | 2 +-
 torch/csrc/autograd/python_engine.cpp                     | 4 ++--
 21 files changed, 25 insertions(+), 25 deletions(-)
 rename aten/src/ATen/{BatchedFallback.cpp => LegacyBatchedFallback.cpp} (99%)
 rename aten/src/ATen/{BatchedFallback.h => LegacyBatchedFallback.h} (100%)
 rename aten/src/ATen/{BatchedTensorImpl.cpp => LegacyBatchedTensorImpl.cpp} (99%)
 rename aten/src/ATen/{BatchedTensorImpl.h => LegacyBatchedTensorImpl.h} (100%)
 rename aten/src/ATen/{BatchingRegistrations.cpp => LegacyBatchingRegistrations.cpp} (99%)
 rename aten/src/ATen/{VmapMode.cpp => LegacyVmapMode.cpp} (95%)
 rename aten/src/ATen/{VmapMode.h => LegacyVmapMode.h} (100%)
 rename aten/src/ATen/{VmapTransforms.cpp => LegacyVmapTransforms.cpp} (99%)
 rename aten/src/ATen/{VmapTransforms.h => LegacyVmapTransforms.h} (99%)
 rename aten/src/ATen/native/{Batching.cpp => LegacyBatching.cpp} (98%)
 rename aten/src/ATen/test/{vmap_test.cpp => legacy_vmap_test.cpp} (99%)
 rename test/{test_vmap.py => test_legacy_vmap.py} (100%)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index fb076e13c4172..af0c2854b96c0 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -237,7 +237,7 @@ test_dynamo_shard() {
       test_python_dispatch \
       test_fx \
       test_package \
-      test_vmap \
+      test_legacy_vmap \
     --shard "$1" "$NUM_TEST_SHARDS" \
     --verbose
   assert_git_not_dirty
diff --git a/aten/src/ATen/BatchedFallback.cpp b/aten/src/ATen/LegacyBatchedFallback.cpp
similarity index 99%
rename from aten/src/ATen/BatchedFallback.cpp
rename to aten/src/ATen/LegacyBatchedFallback.cpp
index 7ca516182cc4c..72794ece1c5a8 100644
--- a/aten/src/ATen/BatchedFallback.cpp
+++ b/aten/src/ATen/LegacyBatchedFallback.cpp
@@ -1,7 +1,7 @@
 #include <ATen/Context.h>
-#include <ATen/BatchedFallback.h>
+#include <ATen/LegacyBatchedFallback.h>
 #include <ATen/MatrixRef.h>
-#include <ATen/VmapTransforms.h>
+#include <ATen/LegacyVmapTransforms.h>
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <c10/util/accumulate.h>
 #include <c10/util/llvmMathExtras.h>
diff --git a/aten/src/ATen/BatchedFallback.h b/aten/src/ATen/LegacyBatchedFallback.h
similarity index 100%
rename from aten/src/ATen/BatchedFallback.h
rename to aten/src/ATen/LegacyBatchedFallback.h
diff --git a/aten/src/ATen/BatchedTensorImpl.cpp b/aten/src/ATen/LegacyBatchedTensorImpl.cpp
similarity index 99%
rename from aten/src/ATen/BatchedTensorImpl.cpp
rename to aten/src/ATen/LegacyBatchedTensorImpl.cpp
index fdedfa7c6316e..eea6d7859930c 100644
--- a/aten/src/ATen/BatchedTensorImpl.cpp
+++ b/aten/src/ATen/LegacyBatchedTensorImpl.cpp
@@ -1,4 +1,4 @@
-#include <ATen/BatchedTensorImpl.h>
+#include <ATen/LegacyBatchedTensorImpl.h>
 
 #include <ATen/WrapDimUtils.h>
 #include <c10/util/Exception.h>
diff --git a/aten/src/ATen/BatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h
similarity index 100%
rename from aten/src/ATen/BatchedTensorImpl.h
rename to aten/src/ATen/LegacyBatchedTensorImpl.h
diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/LegacyBatchingRegistrations.cpp
similarity index 99%
rename from aten/src/ATen/BatchingRegistrations.cpp
rename to aten/src/ATen/LegacyBatchingRegistrations.cpp
index 5a01f949745f6..c235da67d5a71 100644
--- a/aten/src/ATen/BatchingRegistrations.cpp
+++ b/aten/src/ATen/LegacyBatchingRegistrations.cpp
@@ -1,7 +1,7 @@
 #include <torch/library.h>
 #include <ATen/RedispatchFunctions.h>
-#include <ATen/VmapTransforms.h>
-#include <ATen/BatchedFallback.h>
+#include <ATen/LegacyVmapTransforms.h>
+#include <ATen/LegacyBatchedFallback.h>
 #include <ATen/native/ResizeCommon.h>
 #include <ATen/ATen.h>
 #include <ATen/core/IListRef.h>
diff --git a/aten/src/ATen/VmapMode.cpp b/aten/src/ATen/LegacyVmapMode.cpp
similarity index 95%
rename from aten/src/ATen/VmapMode.cpp
rename to aten/src/ATen/LegacyVmapMode.cpp
index 4f0a2413f4513..f10e1005debcd 100644
--- a/aten/src/ATen/VmapMode.cpp
+++ b/aten/src/ATen/LegacyVmapMode.cpp
@@ -1,4 +1,4 @@
-#include <ATen/VmapMode.h>
+#include <ATen/LegacyVmapMode.h>
 
 namespace at {
 namespace impl {
diff --git a/aten/src/ATen/VmapMode.h b/aten/src/ATen/LegacyVmapMode.h
similarity index 100%
rename from aten/src/ATen/VmapMode.h
rename to aten/src/ATen/LegacyVmapMode.h
diff --git a/aten/src/ATen/VmapTransforms.cpp b/aten/src/ATen/LegacyVmapTransforms.cpp
similarity index 99%
rename from aten/src/ATen/VmapTransforms.cpp
rename to aten/src/ATen/LegacyVmapTransforms.cpp
index 71ef7a169026d..1457e572812a4 100644
--- a/aten/src/ATen/VmapTransforms.cpp
+++ b/aten/src/ATen/LegacyVmapTransforms.cpp
@@ -1,4 +1,4 @@
-#include <ATen/VmapTransforms.h>
+#include <ATen/LegacyVmapTransforms.h>
 #include <ATen/ATen.h>
 #include <ATen/core/IListRef.h>
 #include <c10/util/irange.h>
diff --git a/aten/src/ATen/VmapTransforms.h b/aten/src/ATen/LegacyVmapTransforms.h
similarity index 99%
rename from aten/src/ATen/VmapTransforms.h
rename to aten/src/ATen/LegacyVmapTransforms.h
index cece52dcbc410..0afb3247ac86e 100644
--- a/aten/src/ATen/VmapTransforms.h
+++ b/aten/src/ATen/LegacyVmapTransforms.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <ATen/BatchedTensorImpl.h>
+#include <ATen/LegacyBatchedTensorImpl.h>
 #include <ATen/core/IListRef.h>
 
 namespace at {
diff --git a/aten/src/ATen/native/Batching.cpp b/aten/src/ATen/native/LegacyBatching.cpp
similarity index 98%
rename from aten/src/ATen/native/Batching.cpp
rename to aten/src/ATen/native/LegacyBatching.cpp
index b50b6201b7a2d..6dcacbd1f23f5 100644
--- a/aten/src/ATen/native/Batching.cpp
+++ b/aten/src/ATen/native/LegacyBatching.cpp
@@ -1,7 +1,7 @@
 #include <ATen/core/Tensor.h>
-#include <ATen/BatchedTensorImpl.h>
+#include <ATen/LegacyBatchedTensorImpl.h>
 #include <ATen/WrapDimUtils.h>
-#include <ATen/VmapTransforms.h>
+#include <ATen/LegacyVmapTransforms.h>
 
 namespace at { namespace native {
 
diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt
index 5c8fda81b3d9c..27b9e37596529 100644
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@@ -45,7 +45,7 @@ list(APPEND ATen_CPU_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/undefined_tensor_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/variant_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/verify_api_visibility.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/vmap_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/legacy_vmap_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/weakref_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/wrapdim_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp
diff --git a/aten/src/ATen/test/vmap_test.cpp b/aten/src/ATen/test/legacy_vmap_test.cpp
similarity index 99%
rename from aten/src/ATen/test/vmap_test.cpp
rename to aten/src/ATen/test/legacy_vmap_test.cpp
index 1feafaa59f3a4..5ca827de2d98a 100644
--- a/aten/src/ATen/test/vmap_test.cpp
+++ b/aten/src/ATen/test/legacy_vmap_test.cpp
@@ -1,8 +1,8 @@
 #include <gtest/gtest.h>
 
 #include <ATen/ATen.h>
-#include <ATen/BatchedTensorImpl.h>
-#include <ATen/VmapTransforms.h>
+#include <ATen/LegacyBatchedTensorImpl.h>
+#include <ATen/LegacyVmapTransforms.h>
 #include <c10/util/irange.h>
 
 using namespace at;
diff --git a/aten/tools/run_tests.sh b/aten/tools/run_tests.sh
index 5b0c02c2846a4..3ae0da113bca7 100755
--- a/aten/tools/run_tests.sh
+++ b/aten/tools/run_tests.sh
@@ -26,7 +26,7 @@ fi
 ./Dict_test
 ./NamedTensor_test
 ./cpu_generator_test
-./vmap_test
+./legacy_vmap_test
 ./operators_test
 if [[ -x ./cudnn_test ]]; then
   ./cudnn_test
diff --git a/build_variables.bzl b/build_variables.bzl
index 2faeed6e52d9e..63010082137ee 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -1028,7 +1028,7 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
 
 aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/AccumulateType.cpp",
-    "aten/src/ATen/BatchedTensorImpl.cpp",
+    "aten/src/ATen/LegacyBatchedTensorImpl.cpp",
     "aten/src/ATen/CPUGeneratorImpl.cpp",
     "aten/src/ATen/Context.cpp",
     "aten/src/ATen/DLConvertor.cpp",
@@ -1062,8 +1062,8 @@ aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/FuncTorchTLS.cpp",
     "aten/src/ATen/Utils.cpp",
     "aten/src/ATen/Version.cpp",
-    "aten/src/ATen/VmapMode.cpp",
-    "aten/src/ATen/VmapTransforms.cpp",
+    "aten/src/ATen/LegacyVmapMode.cpp",
+    "aten/src/ATen/LegacyVmapTransforms.cpp",
     "aten/src/ATen/core/BackendSelectFallbackKernel.cpp",
     "aten/src/ATen/core/DeprecatedTypeProperties.cpp",
     "aten/src/ATen/core/DeprecatedTypePropertiesRegistry.cpp",
@@ -1292,7 +1292,7 @@ aten_native_source_non_codegen_list = [
     "aten/src/ATen/native/AveragePool3d.cpp",
     "aten/src/ATen/native/BatchLinearAlgebra.cpp",
     "aten/src/ATen/native/BatchLinearAlgebraKernel.cpp",
-    "aten/src/ATen/native/Batching.cpp",
+    "aten/src/ATen/native/LegacyBatching.cpp",
     "aten/src/ATen/native/BinaryOps.cpp",
     "aten/src/ATen/native/Blas.cpp",
     "aten/src/ATen/native/BlasKernel.cpp",
diff --git a/test/test_vmap.py b/test/test_legacy_vmap.py
similarity index 100%
rename from test/test_vmap.py
rename to test/test_legacy_vmap.py
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 6073736257249..33a5d6f1f4442 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -9,10 +9,10 @@
 #include <ATen/ATen.h>
 #include <ATen/DLConvertor.h>
 #include <ATen/ExpandUtils.h>
+#include <ATen/LegacyVmapMode.h>
 #include <ATen/LinalgBackend.h>
 #include <ATen/Parallel.h>
 #include <ATen/Utils.h>
-#include <ATen/VmapMode.h>
 #include <ATen/core/Vitals.h>
 #include <ATen/dlpack.h>
 #include <ATen/native/ConvUtils.h>
diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index d1c59302b3926..fa4b4fde96c13 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -5,9 +5,9 @@
 
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
-#include <ATen/BatchedTensorImpl.h>
 #include <ATen/Dispatch.h>
 #include <ATen/ExpandUtils.h>
+#include <ATen/LegacyBatchedTensorImpl.h>
 #include <ATen/ScalarOps.h>
 #include <ATen/SparseCsrTensorUtils.h>
 #include <ATen/SparseTensorUtils.h>
diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h
index 5a9a0b914a871..9089d541f96b9 100644
--- a/torch/csrc/autograd/functions/accumulate_grad.h
+++ b/torch/csrc/autograd/functions/accumulate_grad.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <ATen/BatchedTensorImpl.h>
+#include <ATen/LegacyBatchedTensorImpl.h>
 #include <ATen/TensorOperators.h>
 #include <torch/csrc/Export.h>
 #include <torch/csrc/autograd/function.h>
diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp
index 6cc6acefc9d45..50d4c0ce0aa60 100644
--- a/torch/csrc/autograd/input_buffer.cpp
+++ b/torch/csrc/autograd/input_buffer.cpp
@@ -1,6 +1,6 @@
 #include <torch/csrc/autograd/input_buffer.h>
 
-#include <ATen/BatchedTensorImpl.h>
+#include <ATen/LegacyBatchedTensorImpl.h>
 #include <ATen/SparseCsrTensorUtils.h>
 #include <ATen/SparseTensorUtils.h>
 #include <ATen/TensorOperators.h>
diff --git a/torch/csrc/autograd/python_engine.cpp b/torch/csrc/autograd/python_engine.cpp
index 3bd12f480d409..dc365c1700088 100644
--- a/torch/csrc/autograd/python_engine.cpp
+++ b/torch/csrc/autograd/python_engine.cpp
@@ -1,7 +1,7 @@
 #include <torch/csrc/autograd/python_engine.h>
 
-#include <ATen/BatchedTensorImpl.h>
-#include <ATen/VmapMode.h>
+#include <ATen/LegacyBatchedTensorImpl.h>
+#include <ATen/LegacyVmapMode.h>
 #include <c10/util/irange.h>
 #include <pybind11/pybind11.h>
 #include <torch/csrc/DynamicTypes.h>

From d06532f355015e8ba3060d3348f676acffd02504 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Wed, 7 Dec 2022 19:09:43 +0000
Subject: [PATCH 1672/1922] Fix issue 38095 TODO in test_autograd.py (#90031)

Fix TODO related to https://github.com/pytorch/pytorch/issues/38095

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90031
Approved by: https://github.com/clee2000
---
 test/test_autograd.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index d30614bbad6af..0a4db6667fbed 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -2213,7 +2213,7 @@ def backward(ctx, grad_a, grad_b):
     def test_mark_non_differentiable_none(self):
         # This used to segfault because MyFunction would send back null
         # gradients to MulBackward, which is implemented in C++. C++
-        # implemented functions expect incoming  grad_ouptuts to be non-null.
+        # implemented functions expect incoming grad_outputs to be non-null.
         class MyFunction(Function):
             @staticmethod
             def forward(ctx, input):
@@ -2457,7 +2457,7 @@ def backward(ctx, grad_x):
         with self.assertWarnsRegex(DeprecationWarning, "should not be instantiated"):
             f = Id()
 
-        # # After raising warning, should still return an instance
+        # After raising warning, should still return an instance
         self.assertIsInstance(f, Id)
         x = torch.zeros(1, requires_grad=True)
         with self.assertRaisesRegex(RuntimeError, "non-static forward method is deprecated"):
@@ -2626,7 +2626,7 @@ def test_detach(self):
         self.assertEqual(x.grad, torch.ones(10, 10) * 2)
         self.assertEqual(y.grad, torch.ones(10, 10) * 2)
 
-        # in-place deatch on a view raises an exception
+        # in-place detach on a view raises an exception
         view = x.narrow(0, 1, 4)
         self.assertRaisesRegex(RuntimeError, 'view', lambda: view.detach_())
 
@@ -8508,8 +8508,7 @@ def test_unused_output_device(self, devices):
         outputs = Broadcast.apply(list(range(len(devices))), x)
         y = outputs[-1] * 2
         y.sum().backward()
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(x.grad, torch.ones(5, 5) * 2)
+        self.assertEqual(x.grad, torch.ones(5, 5) * 2)
 
     @deviceCountAtLeast(2)
     def test_backward_device(self, devices):

From f144774e62012e7a51b1f9159e3ed3f5724fbc68 Mon Sep 17 00:00:00 2001
From: YJ Shi <yuanjing@octoml.ai>
Date: Wed, 7 Dec 2022 19:23:56 +0000
Subject: [PATCH 1673/1922] [Dynamo] Fix llvm target for meta schedule & add
 torch to tvm ndarray helper func (#90214)

Fixes #90213. Also a torch.tensor to tvm.nd.array helper function is added to avoid data copy with dlpack.

@jansel @Chillee

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90214
Approved by: https://github.com/wconstab
---
 torch/_dynamo/optimizations/backends.py | 16 +++++++++++++++-
 torch/_inductor/triton_ops/conv.py      |  2 +-
 torch/_inductor/triton_ops/conv1x1.py   |  2 +-
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/torch/_dynamo/optimizations/backends.py b/torch/_dynamo/optimizations/backends.py
index 64056b59b2191..90620ccd6cddf 100644
--- a/torch/_dynamo/optimizations/backends.py
+++ b/torch/_dynamo/optimizations/backends.py
@@ -642,6 +642,12 @@ def tvm_compile_inner(
                         log_file
                     ), "TVM's meta_schedule requires a directory for storing log files."
                     work_dir = log_file
+                if not cuda:
+                    # meta_schedule needs num-cores to be specified
+                    # here we use the maximum core count
+                    target = tvm.target.Target(
+                        f"{llvm_target()} --num-cores {ms.utils.cpu_count(logical=False)}"
+                    )
                 # TODO(shingjan): This could be replaced by tvm.contrib.torch.optimize_torch
                 # once USE_PT_TVMDSOOP is updated and turned on by default in TVM.
                 database = ms.relay_integration.tune_relay(
@@ -680,6 +686,14 @@ def to_torch_tensor(nd_tensor):
                 return torch.from_numpy(nd_tensor.numpy())
             return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack())
 
+        def to_tvm_tensor(torch_tensor):
+            """A helper function to transfer a torch.tensor to NDArray."""
+            if torch_tensor.dtype == torch.bool:
+                # same reason as above, fallback to numpy conversion which
+                # could introduce data copy overhead
+                return tvm.nd.array(torch_tensor.cpu().numpy())
+            return tvm.nd.from_dlpack(torch_tensor)
+
         def exec_tvm(*i_args):
             args = [a.contiguous() for a in i_args]
             for idx, arg in enumerate(args, 0):
@@ -688,7 +702,7 @@ def exec_tvm(*i_args):
                         arg = arg.detach()
                     m.set_input(
                         f"inp_{idx}",
-                        tvm.nd.array(arg.numpy(), dev),
+                        to_tvm_tensor(arg),
                     )
             m.run()
             return [
diff --git a/torch/_inductor/triton_ops/conv.py b/torch/_inductor/triton_ops/conv.py
index 62d7123174a5b..a2098bce1995a 100644
--- a/torch/_inductor/triton_ops/conv.py
+++ b/torch/_inductor/triton_ops/conv.py
@@ -465,7 +465,7 @@ def _call(
             shape_w = w.shape
             shape_bias = bias.shape if bias is not None else None
 
-            # indicies for the layeout
+            # indicies for the layout
             xn, xc, xh, xw = 0, 1, 2, 3
             yn, yc, yh, yw = 0, 1, 2, 3
             wn, wc, wh, ww = 0, 1, 2, 3
diff --git a/torch/_inductor/triton_ops/conv1x1.py b/torch/_inductor/triton_ops/conv1x1.py
index c7b79f004a5a9..fca5dc3f1d323 100644
--- a/torch/_inductor/triton_ops/conv1x1.py
+++ b/torch/_inductor/triton_ops/conv1x1.py
@@ -26,7 +26,7 @@ def _call(
             shape_w = w.shape
             shape_bias = bias.shape if bias is not None else None
 
-            # indicies for the layeout
+            # indicies for the layout
             xn, xc, xh, xw = 0, 1, 2, 3
             yn, yc, yh, yw = 0, 1, 2, 3
             wn, wc, wh, ww = 0, 1, 2, 3

From 91b4a826d62bbf479d2b5ba02f0e1ae8fa9d72ab Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 7 Dec 2022 15:34:42 +0000
Subject: [PATCH 1674/1922] [inductor] Use a large tolerance for botnet26t_256
 (#90383)

Summary: botnet26t_256 shows random tolerance failure on CI. The root
cause of this randomness is still to-be-invesitgated, but let's use a
larger tolerance for now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90383
Approved by: https://github.com/ezyang
---
 benchmarks/dynamo/timm_models.py | 2 +-
 torch/_dynamo/utils.py           | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py
index de9dc746e666a..98d67c501d633 100755
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@@ -67,7 +67,7 @@ def pip_install(package):
     "xcit_large_24_p8_224": 4,
 }
 
-REQUIRE_HIGHER_TOLERANCE = set()
+REQUIRE_HIGHER_TOLERANCE = set("botnet26t_256")
 
 SKIP = {
     # Unusual training setup
diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py
index a2cc37abd6630..3d0f1bf34e363 100644
--- a/torch/_dynamo/utils.py
+++ b/torch/_dynamo/utils.py
@@ -877,8 +877,11 @@ def same(
                 res_error = rmse(fp64_ref, res).item()
                 multiplier = 2.0
 
-                if fp64_ref.numel() < 1000 or (
-                    ref.ndim == 4 and ref.shape[-1] == ref.shape[-2] == 1
+                if (
+                    fp64_ref.numel() < 1000
+                    or (ref.ndim == 4 and ref.shape[-1] == ref.shape[-2] == 1)
+                    # large tol means a benchmark has been specified as REQUIRE_HIGHER_TOLERANCE
+                    or tol >= 2 * 1e-2
                 ):
                     # In the presence of noise, noise might dominate our error
                     # metric for smaller tensors.

From 470cb1ccc803aefac33fc2dddeb39b466ee5a833 Mon Sep 17 00:00:00 2001
From: Chien-Chin Huang <chienchin@fb.com>
Date: Wed, 7 Dec 2022 17:10:24 +0000
Subject: [PATCH 1675/1922] [FSDP][optim_state_dict][2/N] Add
 _get_fqn_to_fsdp_param_info to map from original FQN to flat_param (#89899)

**Motivation:**
Add a helper to map from the FQN to the corresponding flat_param. The helper will directly get flat_param from fsdp_state and flat_handler as flat_param is not registered to the module if `use_orig_params` is True.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89899
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/_optim_utils.py | 106 +++++++++++++++++++------
 1 file changed, 83 insertions(+), 23 deletions(-)

diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py
index a81c05d2bd89a..358ff842bb066 100644
--- a/torch/distributed/fsdp/_optim_utils.py
+++ b/torch/distributed/fsdp/_optim_utils.py
@@ -1,5 +1,6 @@
 import copy
 import functools
+from dataclasses import dataclass
 from typing import (
     Any,
     cast,
@@ -21,7 +22,12 @@
 import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
 import torch.nn as nn
 from torch.distributed._shard.sharded_tensor import ShardedTensor
-from torch.distributed.fsdp._common_utils import _get_param_to_fqns
+from torch.distributed.fsdp._common_utils import (
+    _apply_to_modules,
+    _get_param_to_fqns,
+    _module_handles,
+    clean_tensor_name,
+)
 from torch.distributed.fsdp._fsdp_extensions import _ext_chunk_tensor
 from torch.distributed.fsdp._runtime_utils import _clear_grads_if_needed, _lazy_init
 from torch.distributed.fsdp._shard_utils import _gather_state_dict
@@ -29,6 +35,14 @@
 from torch.distributed.fsdp.flat_param import FlatParameter, FlatParamHandle
 
 
+@dataclass
+class FSDPParamInfo:
+    # The typing will be changed to FSDPState in the future.
+    state: nn.Module
+    flat_param: FlatParameter
+    param_indices: Dict[str, int]
+
+
 def sorted_items(dictionary: Dict[str, Any]) -> Iterator[Tuple[str, Any]]:
     keys = sorted(dictionary.keys())
     for k in keys:
@@ -84,7 +98,7 @@ class _OptimStateKey(NamedTuple):
     """
 
     unflat_param_names: Tuple[str, ...]
-    is_flat_param: bool
+    is_fsdp_managed: bool
 
 
 def _unflatten_optim_state(
@@ -293,23 +307,21 @@ def _flatten_optim_state_dict(
             '`optim_state_dict` must have the keys "state" and '
             '"param_groups" to be a valid optimizer state dict'
         )
-    flat_param_to_fsdp_module = _get_flat_param_to_fsdp_module(model)
     param_to_fqns = _get_param_to_fqns(model)
+    fqn_to_fsdp_param_info = _get_fqn_to_fsdp_param_info(model)
 
     # Construct the "state" part
     flat_osd_state: Dict[_OptimStateKey, Any] = {}
     unflat_osd_state = unflat_osd["state"]
     for param, unflat_param_names in param_to_fqns.items():
-        if isinstance(param, FlatParameter):  # flatten FSDP parameters' states
-            assert (
-                param in flat_param_to_fsdp_module
-            ), f"Check the `flat_param_to_fsdp_module` construction\nparam: {param}"
-            fsdp_module = flat_param_to_fsdp_module[param]
+        fqn = unflat_param_names[0]
+        if fqn in fqn_to_fsdp_param_info:
+            fsdp_param_info = fqn_to_fsdp_param_info[fqn]
             flat_state = _flatten_optim_state(
                 unflat_osd_state,
                 unflat_param_names,
-                fsdp_module,
-                param,
+                fsdp_param_info.state,
+                fsdp_param_info.flat_param,
                 shard_state,
             )
             key = _OptimStateKey(tuple(unflat_param_names), True)
@@ -670,7 +682,7 @@ def _process_pos_dim_tensor_state(
             if not is_pos_dim_tensor_state:
                 no_tensor_osd["state"][key][state_name] = value
                 continue
-            if key.is_flat_param:  # FSDP parameter
+            if key.is_fsdp_managed:  # FSDP parameter
                 sharded_size = FlatParamHandle._get_sharded_size(
                     value, rank=0, world_size=world_size
                 )
@@ -753,7 +765,7 @@ def _broadcast_pos_dim_tensor_states(
             else:
                 unsharded_tensor = None
             shape, dtype = value.shape, value.dtype
-            if key.is_flat_param:  # FSDP parameter
+            if key.is_fsdp_managed:  # FSDP parameter
                 _broadcast_sharded_pos_dim_tensor_state(
                     unsharded_tensor,
                     param_state,
@@ -1079,6 +1091,7 @@ def _map_param_id_to_optim_keys(
     group: Optional[dist.ProcessGroup],
     param_id_to_param: List[nn.Parameter],
     param_to_fqns: Dict[nn.Parameter, List[str]],
+    fqn_to_fsdp_param_info: Dict[str, FSDPParamInfo],
 ) -> Tuple[Dict[int, _OptimStateKey], Dict[_OptimStateKey, int]]:
     """
     Construct the local mapping between the `_OptimStateKey` and parameter IDs
@@ -1087,18 +1100,21 @@ def _map_param_id_to_optim_keys(
     """
     rank = dist.get_rank(group)
     optim_state_key_to_param_id: Dict[_OptimStateKey, int] = {}  # local
-    r0_param_id_to_optim_state_key: Dict[
-        int, _OptimStateKey
-    ] = {}  # rank 0
+    r0_param_id_to_optim_state_key: Dict[int, _OptimStateKey] = {}  # rank 0
 
     for param_id, param in enumerate(param_id_to_param):
         # Do not include parameters without state to avoid empty mappings
         # just like in normal `torch.optim.Optimizer.state_dict()`
         if param_id not in optim_state_dict["state"]:
             continue
+        fqns = param_to_fqns[param]
+        is_fsdp_managed = isinstance(param, FlatParameter)
+        if is_fsdp_managed:
+            assert fqns[0] in fqn_to_fsdp_param_info
+        is_fsdp_managed = fqns[0] in fqn_to_fsdp_param_info
         optim_state_key = _OptimStateKey(
-            unflat_param_names=tuple(param_to_fqns[param]),
-            is_flat_param=isinstance(param, FlatParameter),
+            unflat_param_names=tuple(fqns),
+            is_fsdp_managed=is_fsdp_managed,
         )
         if rank == 0:
             r0_param_id_to_optim_state_key[param_id] = optim_state_key
@@ -1220,6 +1236,7 @@ def _optim_state_dict(
         if using_optim_input
         else _get_param_id_to_param(optim)
     )
+    fqn_to_fsdp_param_info = _get_fqn_to_fsdp_param_info(model)
 
     (
         param_id_to_optim_state_key,
@@ -1229,20 +1246,23 @@ def _optim_state_dict(
         group,
         param_id_to_param,
         param_to_fqns,
+        fqn_to_fsdp_param_info,
     )
-    flat_param_to_fsdp_state = _get_flat_param_to_fsdp_module(model)
 
     # Iterate in rank 0's flattened parameter ID order to ensure aligned
     # all-gathers across ranks
     for optim_state_key in param_id_to_optim_state_key.values():
         param_id = optim_state_key_to_param_id[optim_state_key]
-        if optim_state_key.is_flat_param:
-            param = param_id_to_param[param_id]
-            fsdp_state = flat_param_to_fsdp_state[param]
+        if optim_state_key.is_fsdp_managed:
+            # If there are multiple unflat_param_names (not use_orig_params),
+            # they share the same FSDPParamInfo. So the first unflat_param_name
+            # is sufficient to fetch the FSDPParamInfo.
+            fqn = optim_state_key.unflat_param_names[0]
+            fsdp_param_info = fqn_to_fsdp_param_info[fqn]
             unflat_state = _unflatten_optim_state(
-                cast(FlatParameter, param),
+                fsdp_param_info.flat_param,
                 optim_state_dict["state"][param_id],
-                fsdp_state,
+                fsdp_param_info.state,
                 to_save,
                 shard_state,
             )
@@ -1269,3 +1289,43 @@ def _optim_state_dict(
         )
 
     return fsdp_osd
+
+
+def _get_fqn_to_fsdp_param_info(model: nn.Module) -> Dict[str, FSDPParamInfo]:
+    """
+    Construct the mapping from a param's fqn to its corresponding ``FSDPParamInfo``
+    if the param is managed by FSDP. ``FlatParameter._fqns`` only stores the first
+    FQN of a shared parameter. So the keys in the mapping are guaranteed to map
+    to unique parameters.
+    """
+
+    def module_fn(module, prefix, fqn_to_param_info):
+        # TODO: make it work with composable API.
+        if not isinstance(module, fsdp_file.FullyShardedDataParallel):
+            return
+        _lazy_init(module, module)
+        handles = _module_handles(module, module)
+        if not handles:
+            return
+        flat_param = handles[0].flat_param
+        fsdp_param_info = FSDPParamInfo(module, flat_param, {})
+        for idx, local_fqn in enumerate(flat_param._fqns):
+            fqn = clean_tensor_name(prefix + local_fqn)
+            if fqn in fqn_to_param_info:
+                assert fqn_to_param_info[fqn].flat_param == flat_param
+            fqn_to_param_info[fqn] = fsdp_param_info
+            fsdp_param_info.param_indices[fqn] = idx
+
+    def return_fn(fqn_to_param_info):
+        return fqn_to_param_info
+
+    fqn_to_param_info: Dict[str, FSDPParamInfo] = {}
+    # FlatParameter._fqns stores the local fqn, starting from the root of the
+    # FSDP. Using _apply_to_modules() with model (may not be the FSDP root
+    # module) allows us to construct the global fqn.
+    return _apply_to_modules(
+        model,
+        module_fn,
+        return_fn,
+        fqn_to_param_info,
+    )

From 131256e8688b606bb5cdfba84bf4bc5c83726909 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Wed, 7 Dec 2022 19:41:12 +0000
Subject: [PATCH 1676/1922] Revert "[pruning][core][feature] Implement prune
 for structured pruning (#89777)"

This reverts commit 3531e44307fa58460e2488bcaace948678d6cf9f.

Reverted https://github.com/pytorch/pytorch/pull/89777 on behalf of https://github.com/clee2000 due to breaking test_ao_sparcity due to import https://hud.pytorch.org/pytorch/pytorch/commit/3531e44307fa58460e2488bcaace948678d6cf9f https://github.com/pytorch/pytorch/actions/runs/3641476330/jobs/6147830487, probably a landrace with 824641b083860df4d7ffef06a798ea2702bc4bde?
---
 .../ao/sparsity/test_structured_sparsifier.py | 764 ++++++------------
 .../pruning/_experimental/pruner/__init__.py  |   6 +
 .../pruner/base_structured_sparsifier.py      | 252 +-----
 .../_experimental/pruner/match_utils.py       |  59 --
 .../_experimental/pruner/parametrization.py   |   1 +
 .../_experimental/pruner/prune_functions.py   | 359 --------
 torch/testing/_internal/common_pruning.py     | 311 -------
 7 files changed, 261 insertions(+), 1491 deletions(-)
 delete mode 100644 torch/ao/pruning/_experimental/pruner/match_utils.py
 delete mode 100644 torch/ao/pruning/_experimental/pruner/prune_functions.py
 delete mode 100644 torch/testing/_internal/common_pruning.py

diff --git a/test/ao/sparsity/test_structured_sparsifier.py b/test/ao/sparsity/test_structured_sparsifier.py
index 19e5a03640d00..1b504c9731d2b 100644
--- a/test/ao/sparsity/test_structured_sparsifier.py
+++ b/test/ao/sparsity/test_structured_sparsifier.py
@@ -4,51 +4,167 @@
 
 import copy
 import logging
-import random
 
 import torch
-from torch.ao.pruning._experimental.pruner import (
-    BaseStructuredSparsifier,
-    FakeStructuredSparsity,
-)
+from torch import nn
+from torch.ao.pruning._experimental.pruner import BaseStructuredSparsifier, FakeStructuredSparsity
 from torch.nn.utils import parametrize
 
 from torch.testing._internal.common_utils import TestCase, skipIfTorchDynamo
-from torch.testing._internal.common_pruning import (
-    SimpleLinear,
-    LinearBias,
-    LinearActivation,
-    LinearActivationFunctional,
-    SimpleConv2d,
-    Conv2dBias,
-    Conv2dActivation,
-    Conv2dPadBias,
-    Conv2dPool,
-    Conv2dPoolFlatten,
-    Conv2dPoolFlattenFunctional,
-)
-
-logging.basicConfig(
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
-)
+
+logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
 
 DEVICES = {
     torch.device("cpu"),
-    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
+    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 }
 
 
+class Linear(nn.Module):
+    r"""Model with Linear layers, in Sequential and outside, without biases"""
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(16, 16, bias=False)
+        )
+        self.linear = nn.Linear(16, 16, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear(x)
+        return x
+
+
+class LinearB(nn.Module):
+    r"""Model with Linear layers, in Sequential and outside, with biases"""
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(16, 16, bias=True)
+        )
+        self.linear = nn.Linear(16, 16, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear(x)
+        return x
+
+
+class MultipleLinear(nn.Module):
+    r"""Model with multiple Linear layers, in Sequential and outside, without biases
+    and with activation functions"""
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=False),
+            nn.ReLU(),
+            nn.Linear(5, 8, bias=False),
+            nn.ReLU(),
+            nn.Linear(8, 6, bias=False)
+        )
+        self.linear = nn.Linear(6, 4, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear(x)
+        return x
+
+
+class MultipleLinearB(nn.Module):
+    r"""Model with multiple Linear layers, in Sequential and outside, with biases
+    and with activation functions"""
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=True),
+            nn.ReLU(),
+            nn.Linear(5, 8, bias=True),
+            nn.ReLU(),
+            nn.Linear(8, 6, bias=True)
+        )
+        self.linear = nn.Linear(6, 4, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear(x)
+        return x
+
+
+class MultipleLinearMixed(nn.Module):
+    r"""Model with multiple Linear layers, in Sequential and outside, some with biases
+    and with activation functions"""
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=True),
+            nn.ReLU(),
+            nn.Linear(5, 8, bias=False),
+            nn.ReLU(),
+            nn.Linear(8, 6, bias=True)
+        )
+        self.linear = nn.Linear(6, 4, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear(x)
+        return x
+
+
+class Conv2dA(nn.Module):
+    r"""Model with Conv2d layers, in Sequential and outside, without biases"""
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=False),
+        )
+        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d(x)
+        return x
+
+
+class Conv2dB(nn.Module):
+    r"""Model with Conv2d layers, in Sequential and outside, with biases"""
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=True),
+        )
+        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d(x)
+        return x
+
+
+class Conv2dC(nn.Module):
+    r"""Model with Conv2d layers, in Sequential and outside, with and without biases"""
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=True),
+        )
+        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d(x)
+        return x
+
+
+
 class SimplePruner(BaseStructuredSparsifier):
     def update_mask(self, module, tensor_name, **kwargs):
         getattr(module.parametrizations, tensor_name)[0].mask[1] = False
 
 
-class ImplementedPruner(BaseStructuredSparsifier):
+class MultiplePruner(BaseStructuredSparsifier):
     def update_mask(self, module, tensor_name, **kwargs):
-        """Prunes 1/3 of the weight output channels, so resulting module has 33.3% pruning"""
-        num_rows = len(module.parametrizations[tensor_name][0].mask)
-        prune = random.sample(list(range(num_rows)), num_rows // 3)
-        module.parametrizations[tensor_name][0].mask[prune] = False
+        getattr(module.parametrizations, tensor_name)[0].mask[1] = False
+        getattr(module.parametrizations, tensor_name)[0].mask[2] = False
 
 
 class TestBaseStructuredSparsifier(TestCase):
@@ -64,14 +180,27 @@ def _check_pruner_prepared(self, model, pruner, device):
             # Assume that this is the 1st/only parametrization
             assert type(module.parametrizations.weight[0]) == FakeStructuredSparsity
 
+    def _check_pruner_mask_squashed(self, model, pruner, device):
+        for config in pruner.groups:
+            modules = []
+            if type(config['module']) is tuple:
+                for module in config['module']:
+                    modules.append(module)
+            else:
+                module = config['module']
+                modules.append(module)
+            for module in modules:
+                assert module.weight.device.type == device.type
+                assert not hasattr(module, "parametrizations")
+
     def _check_pruner_valid_before_step(self, model, pruner, device):
         for config in pruner.groups:
             modules = []
-            if type(config["module"]) is tuple:
-                for module in config["module"]:
+            if type(config['module']) is tuple:
+                for module in config['module']:
                     modules.append(module)
             else:
-                module = config["module"]
+                module = config['module']
                 modules.append(module)
             for module in modules:
                 assert module.weight.device.type == device.type
@@ -80,562 +209,145 @@ def _check_pruner_valid_before_step(self, model, pruner, device):
     def _check_pruner_valid_after_step(self, model, pruner, mask, device):
         for config in pruner.groups:
             modules = []
-            if type(config["module"]) is tuple:
-                for module in config["module"]:
+            if type(config['module']) is tuple:
+                for module in config['module']:
                     modules.append(module)
             else:
-                module = config["module"]
+                module = config['module']
                 modules.append(module)
             for module in modules:
                 assert module.weight.device.type == device.type
                 total = module.parametrizations.weight[0].mask.numel()
-                assert (
-                    module.parametrizations.weight[0].mask.count_nonzero()
-                    == total - mask
-                )
+                assert module.parametrizations.weight[0].mask.count_nonzero() == total - mask
 
     def _test_constructor_on_device(self, model, device):
-        self.assertRaisesRegex(
-            TypeError,
-            "BaseStructuredSparsifier.* update_mask",
-            BaseStructuredSparsifier,
-        )
+        self.assertRaisesRegex(TypeError, 'BaseStructuredSparsifier.* update_mask',
+                               BaseStructuredSparsifier)
         model1 = copy.deepcopy(model).to(device)
         pruner = SimplePruner(None)
         pruner.prepare(model1, None)
-        pruner.enable_mask_update = True
         for g in pruner.groups:
-            module = g["module"]
+            module = g['module']
             assert module.weight.device.type == device.type
-        assert len(pruner.groups) == 5
+        assert len(pruner.groups) == 2
         pruner.step()
         # Can instantiate the model with configs
         model2 = copy.deepcopy(model).to(device)
-        pruner = SimplePruner({"test": 3})
-        pruner.prepare(model2, [{"tensor_fqn": "seq.0.weight"}])
+        pruner = SimplePruner({'test': 3})
+        pruner.prepare(model2, [{"tensor_fqn": "linear.weight"}])
         assert len(pruner.groups) == 1
-        assert pruner.groups[0]["module_fqn"] == "seq.0"
-        assert "test" in pruner.groups[0]
-        assert pruner.groups[0]["test"] == 3
+        assert pruner.groups[0]['module_fqn'] == 'linear'
+        assert 'test' in pruner.groups[0]
+        assert pruner.groups[0]['test'] == 3
 
     def test_constructor(self):
-        model = SimpleLinear()
+        model = Linear()
         for device in DEVICES:
             self._test_constructor_on_device(model, torch.device(device))
 
     def _test_prepare_linear_on_device(self, model, device):
         model = copy.deepcopy(model).to(device)
-        x = torch.ones(128, 7, device=device)
+        x = torch.ones(128, 16, device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, None)
         self._check_pruner_prepared(model, pruner, device)
-        assert model(x).shape == (128, 10)
+        assert model(x).shape == (128, 16)
 
     def test_prepare_linear(self):
-        models = [
-            SimpleLinear(),
-            LinearBias(),
-            LinearActivation(),
-            LinearActivationFunctional(),
-        ]  # without and with bias
+        models = [Linear(), LinearB()]  # without and with bias
         for device in DEVICES:
             for model in models:
                 self._test_prepare_linear_on_device(model, torch.device(device))
 
-    def _test_prepare_conv2d_on_device(self, model, expected_shape, config, device):
+    def _test_prepare_conv2d_on_device(self, model, config, device):
         x = torch.ones((1, 1, 28, 28), device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, config)
         self._check_pruner_prepared(model, pruner, device)
-        assert model(x).shape == expected_shape
+        assert model(x).shape == (1, 64, 24, 24)
 
     def test_prepare_conv2d(self):
-        models = [
-            SimpleConv2d(),
-            Conv2dBias(),
-            Conv2dActivation(),
-            Conv2dPadBias(),
-            Conv2dPool(),
-        ]
-        shapes = [
-            (1, 52, 20, 20),
-            (1, 52, 18, 18),
-            (1, 52, 18, 18),
-            (1, 52, 24, 24),
-            (1, 52, 3, 3),
-        ]
-        configs = [None, None, None, None, None]
+
+        models = [Conv2dA(), Conv2dB(), Conv2dC()]
+        configs = [None, None, None]
         for device in DEVICES:
-            for model, shape, config in zip(models, shapes, configs):
+            for model, config in zip(models, configs):
                 model = model.to(device)
-                self._test_prepare_conv2d_on_device(
-                    model, shape, config, torch.device(device)
-                )
+                self._test_prepare_conv2d_on_device(model, config, torch.device(device))
 
-    def _test_step_linear_on_device(self, model, device):
-        model = model.to(device)
-        x = torch.ones(7, 7, device=device)
+    def _test_squash_mask_linear_on_device(self, model, device):
+        model = copy.deepcopy(model).to(device)
+        x = torch.ones(128, 16, device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, None)
-        pruner.enable_mask_update = True
-        self._check_pruner_valid_before_step(model, pruner, device)
-        pruner.step()
-        self._check_pruner_valid_after_step(model, pruner, 1, device)
+        pruner.squash_mask()
+        self._check_pruner_mask_squashed(model, pruner, device)
+        assert model(x).shape == (128, 16)
 
-    def test_step_linear(self):
-        models = [
-            SimpleLinear(),
-            LinearBias(),
-            LinearActivation(),
-            LinearActivationFunctional(),
-        ]
+    def test_squash_mask_linear(self):
+        models = [Linear(), LinearB()]  # without and with bias
         for device in DEVICES:
             for model in models:
-                self._test_step_linear_on_device(model, torch.device(device))
+                self._test_squash_mask_linear_on_device(model, torch.device(device))
 
-    def _test_step_conv2d_on_device(self, model, expected_shape, config, device):
-        model = model.to(device)
+    def _test_squash_mask_conv2d_on_device(self, model, config, device):
+        model = copy.deepcopy(model).to(device)
         x = torch.ones((1, 1, 28, 28), device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, config)
-        pruner.enable_mask_update = True
-        self._check_pruner_valid_before_step(model, pruner, device)
-        pruner.step()
-        self._check_pruner_valid_after_step(model, pruner, 1, device)
-        assert model(x).shape == expected_shape
+        pruner.squash_mask()
+        self._check_pruner_mask_squashed(model, pruner, device)
+        assert model(x).shape == (1, 64, 24, 24)
 
-    @skipIfTorchDynamo("TorchDynamo fails with unknown reason")
-    def test_step_conv2d(self):
-        models = [
-            SimpleConv2d(),
-            Conv2dBias(),
-            Conv2dActivation(),
-            Conv2dPadBias(),
-            Conv2dPool(),
-        ]
-        shapes = [
-            (1, 52, 20, 20),
-            (1, 52, 18, 18),
-            (1, 52, 18, 18),
-            (1, 52, 24, 24),
-            (1, 52, 3, 3),
-        ]
-        configs = [None, None, None, None, None]
-        for device in DEVICES:
-            for model, shape, config in zip(models, shapes, configs):
-                self._test_step_conv2d_on_device(
-                    model, shape, config, torch.device(device)
-                )
+    def test_squash_mask_conv2d(self):
 
-    def _check_pruner_pruned(self, model, pruner, device):
-        for config in pruner.groups:
-            module = config["module"]
-            assert not hasattr(module, "parametrizations")
-            assert not hasattr(module, "mask")
+        models = [Conv2dA(), Conv2dB(), Conv2dC()]
+        configs = [None, None, None]
+        for device in DEVICES:
+            for model, config in zip(models, configs):
+                model = model.to(device)
+                self._test_squash_mask_conv2d_on_device(model, config, torch.device(device))
 
-    def _test_linear_on_device(
-        self, model, config, expected_shape, device, also_prune_bias
-    ):
+    def _test_step_linear_on_device(self, model, is_basic, device):
         model = model.to(device)
-        model.eval()
-        num_original_params = sum(p.numel() for p in model.parameters())
-        x = torch.ones(128, 7, device=device)
-
-        pruner = ImplementedPruner({"prune_bias": also_prune_bias})
-        pruner.prepare(model, config)
-        pruner.enable_mask_update = True
-        pruner.step()
-
-        y_expected = model(x)
-
-        assert y_expected.shape == (128, 10)
-        self._check_pruner_prepared(model, pruner, device)
-
-        # Pruning step
-        pruned = pruner.prune()
-        y_pruned = pruned(x)
-        num_pruned_params = sum(p.numel() for p in pruned.parameters())
-
-        assert y_pruned.shape == expected_shape
-        self._check_pruner_pruned(model, pruner, device)
-        if y_pruned.shape == y_expected.shape:
-            assert torch.isclose(y_expected, y_pruned, rtol=1e-05, atol=1e-07).all()
-            assert num_pruned_params < num_original_params
-
-    def test_prune_linear_linear(self):
-        r"""test pruning linear-> linear modules"""
-        configs, shapes = [], []
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.1.weight"},
-                {"tensor_fqn": "seq.2.weight"},
-            ]
-        )
-        shapes.append((128, 10))
-
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.1.weight"},
-                {"tensor_fqn": "seq.2.weight"},
-                {"tensor_fqn": "linear1.weight"},
-            ]
-        )
-        shapes.append((128, 10))
-
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.2.weight"},
-            ]
-        )
-        shapes.append((128, 10))
-        for device in DEVICES:
-            for also_prune_bias in [True, False]:
-                for config, shape in zip(configs, shapes):
-                    self._test_linear_on_device(
-                        SimpleLinear(),
-                        config,
-                        shape,
-                        torch.device(device),
-                        also_prune_bias,
-                    )
-
-    def test_prune_linear_bias_linear(self):
-        # linear(bias) -> linear(no bias)
-        configs, shapes = [], []
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.1.weight"},
-            ]
-        )
-        shapes.append((128, 10))
-
-        # linear(bias) -> linear(bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.2.weight"},
-                {"tensor_fqn": "seq.3.weight"},
-            ]
-        )
-        shapes.append((128, 10))
-
-        # linear(no bias) -> linear(bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.1.weight"},
-                {"tensor_fqn": "seq.2.weight"},
-            ]
-        )
-        shapes.append((128, 10))
+        if is_basic:
+            x = torch.ones(16, 16, device=device)
+            pruner = SimplePruner(None)
+            pruner.prepare(model, None)
+            self._check_pruner_valid_before_step(model, pruner, device)
+            pruner.step()
+            self._check_pruner_valid_after_step(model, pruner, 1, device)
+        else:
+            x = torch.ones(7, 7, device=device)
+            pruner = MultiplePruner(None)
+            pruner.prepare(model, None)
+            self._check_pruner_valid_before_step(model, pruner, device)
+            pruner.step()
+            self._check_pruner_valid_after_step(model, pruner, 2, device)
 
+    def test_step_linear(self):
+        basic_models = [Linear(), LinearB()]
+        complex_models = [MultipleLinear(), MultipleLinearB(), MultipleLinearMixed()]
         for device in DEVICES:
-            for also_prune_bias in [True, False]:
-                for config, shape in zip(configs, shapes):
-                    self._test_linear_on_device(
-                        LinearBias(),
-                        config,
-                        shape,
-                        torch.device(device),
-                        also_prune_bias,
-                    )
-
-    def test_prune_linear_activation_linear(self):
-        config = [
-            {"tensor_fqn": "seq.0.weight"},
-            {"tensor_fqn": "seq.2.weight"},
-            {"tensor_fqn": "seq.4.weight"},
-            {"tensor_fqn": "linear1.weight"},
-        ]
-        shape = (128, 10)
+            for model in basic_models:
+                self._test_step_linear_on_device(model, True, torch.device(device))
+            for model in complex_models:
+                self._test_step_linear_on_device(model, False, torch.device(device))
 
-        for device in DEVICES:
-            for also_prune_bias in [True, False]:
-                # test version with nn.Modules
-                self._test_linear_on_device(
-                    LinearActivation(),
-                    config,
-                    shape,
-                    torch.device(device),
-                    also_prune_bias,
-                )
-                # test functional version
-                self._test_linear_on_device(
-                    LinearActivationFunctional(),
-                    config,
-                    shape,
-                    torch.device(device),
-                    also_prune_bias,
-                )
-
-    def _test_conv2d_on_device(
-        self, model, config, x, expected_shape, device, also_prune_bias
-    ):
+    def _test_step_conv2d_on_device(self, model, config, device):
         model = model.to(device)
-        num_original_params = sum(p.numel() for p in model.parameters())
-        model.eval()
-
-        pruner = ImplementedPruner({"prune_bias": also_prune_bias})
+        x = torch.ones((1, 1, 28, 28)).to(device)
+        pruner = SimplePruner(None)
         pruner.prepare(model, config)
-        pruner.enable_mask_update = True
+        self._check_pruner_valid_before_step(model, pruner, device)
         pruner.step()
-
-        y_expected = model(x)
-        assert y_expected.shape == expected_shape
-
-        self._check_pruner_prepared(model, pruner, device)
-
-        # Fusion step
-        pruned = pruner.prune()
-        y_pruned = pruned(x)
-        num_pruned_params = sum(p.numel() for p in pruned.parameters())
-
-        assert y_pruned.shape == expected_shape
-        self._check_pruner_pruned(model, pruner, device)
-        if y_pruned.shape == y_expected.shape:
-            # TODO This rtol is a little high, need to double check if something specific is causing this to fail
-            assert torch.isclose(
-                y_expected, y_pruned, rtol=1e-1
-            ).all(), f"fail for {type(model)}"
-            # only time this should be equal is when all layers have padding and we can't prune
-            assert num_pruned_params <= num_original_params
-
-    def test_prune_conv2d_conv2d(self):
-        configs, shapes = [], []
-        # all within sequential blocks
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-            ]
-        )
-        shapes.append((1, 52, 20, 20))
-        # prune across sequential blocks
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.1.weight"},
-                {"tensor_fqn": "conv2d1.weight"},
-            ]
-        )
-        shapes.append((1, 52, 20, 20))
-
-        for device in DEVICES:
-            x = torch.ones((1, 1, 28, 28), device=device)
-            for also_prune_bias in [True, False]:
-                for config, shape in zip(configs, shapes):
-                    self._test_conv2d_on_device(
-                        SimpleConv2d(),
-                        config,
-                        x,
-                        shape,
-                        torch.device(device),
-                        also_prune_bias,
-                    )
-
-    def test_prune_conv2d_bias_conv2d(self):
-        # Conv2d with Bias and no Activation
-        configs, shapes = [], []
-        # conv2d(bias) -> conv2d(bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.1.weight"},
-            ]
-        )
-        shapes.append((1, 52, 18, 18))
-
-        # conv2d(no bias) -> conv2d(bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.1.weight"},
-                {"tensor_fqn": "conv2d1.weight"},
-            ]
-        )
-        shapes.append((1, 52, 18, 18))
-
-        # conv2d(bias) -> conv2d(no bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.1.weight"},
-                {"tensor_fqn": "seq.2.weight"},
-            ]
-        )
-        shapes.append((1, 52, 18, 18))
-
-        for device in DEVICES:
-            x = torch.ones((1, 1, 28, 28), device=device)
-            for also_prune_bias in [True, False]:
-                for config, shape in zip(configs, shapes):
-                    self._test_conv2d_on_device(
-                        Conv2dBias(),
-                        config,
-                        x,
-                        shape,
-                        torch.device(device),
-                        also_prune_bias,
-                    )
-
-    def test_prune_conv2d_activation_conv2d(self):
-        # Conv2d with Activation and no Bias
-        configs, shapes = [], []
-
-        # conv2d(no bias) -> activatation -> conv2d(no bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.4.weight"},
-            ]
-        )
-        shapes.append((1, 52, 18, 18))
-
-        # conv2d(bias) -> activatation -> conv2d(bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-                {"tensor_fqn": "seq.2.weight"},
-            ]
-        )
-        shapes.append((1, 52, 18, 18))
-
-        # conv2d(bias) -> activation -> conv2d(no bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.2.weight"},
-                {"tensor_fqn": "seq.4.weight"},
-            ]
-        )
-        shapes.append((1, 52, 18, 18))
-
-        # conv2d(no bias) -> activation -> conv2d(bias)
-        configs.append(
-            [
-                {"tensor_fqn": "conv2d1.weight"},
-            ]
-        )
-        shapes.append((1, 52, 18, 18))
-
-        for device in DEVICES:
-            x = torch.ones((1, 1, 28, 28), device=device)
-            for also_prune_bias in [True, False]:
-                for config, shape in zip(configs, shapes):
-                    self._test_conv2d_on_device(
-                        Conv2dActivation(),
-                        config,
-                        x,
-                        shape,
-                        torch.device(device),
-                        also_prune_bias,
-                    )
-
-    def test_prune_conv2d_padding_conv2d(self):
-        # Conv2d with Padded layers after Bias layers
-        configs, shapes = [], []
-
-        # conv(padded, bias) -> conv(padded, bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.4.weight"},
-            ]
-        )
-        shapes.append((1, 52, 24, 24))
-
-        # conv(no bias, no pad) -> conv(padded, bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.2.weight"},
-            ]
-        )
-        shapes.append((1, 52, 24, 24))
-
-        # conv(padded, bias) -> conv ( no bias ,no pad)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.0.weight"},
-            ]
-        )
-        shapes.append((1, 52, 24, 24))
-        # conv(pad, bias) -> conv(no pad, bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.6.weight"},
-            ]
-        )
-        shapes.append((1, 52, 24, 24))
-        # conv(no pad, bias) -> conv(pad, bias)
-        configs.append(
-            [
-                {"tensor_fqn": "seq.8.weight"},
-            ]
-        )
-        shapes.append((1, 52, 24, 24))
-
-        for device in DEVICES:
-            x = torch.ones((1, 1, 28, 28), device=device)
-            for also_prune_bias in [True, False]:
-                for config, shape in zip(configs, shapes):
-                    self._test_conv2d_on_device(
-                        Conv2dPadBias(),
-                        config,
-                        x,
-                        shape,
-                        torch.device(device),
-                        also_prune_bias,
-                    )
-
-    def test_prune_conv2d_pool_conv2d(self):
-        # Conv2d with Pooling layers
-        config = [
-            {"tensor_fqn": "seq.0.weight"},
-            {"tensor_fqn": "seq.3.weight"},
-            {"tensor_fqn": "conv2d1.weight"},
-            {"tensor_fqn": "conv2d2.weight"},
-        ]
-        shape = (1, 52, 3, 3)
-
-        for device in DEVICES:
-            x = torch.ones((1, 1, 28, 28), device=device)
-            for also_prune_bias in [True, False]:
-                self._test_conv2d_on_device(
-                    Conv2dPool(),
-                    config,
-                    x,
-                    shape,
-                    torch.device(device),
-                    also_prune_bias,
-                )
+        self._check_pruner_valid_after_step(model, pruner, 1, device)
+        assert model(x).shape == (1, 64, 24, 24)
 
     @skipIfTorchDynamo("TorchDynamo fails with unknown reason")
-    def test_complex_conv2d(self):
-        """Test fusion for models that contain Conv2d & Linear modules.
-        Currently supports: Conv2d-Pool2d-Flatten-Linear, Skip-add"""
-        config = [
-            {"tensor_fqn": "seq.0.weight"},
-            {"tensor_fqn": "seq.3.weight"},
-            {"tensor_fqn": "conv2d1.weight"},
-            {"tensor_fqn": "conv2d2.weight"},
-        ]
-        shape = (1, 13)
-
+    def test_step_conv2d(self):
+        models = [Conv2dA(), Conv2dB(), Conv2dC()]
+        configs = [None, None, None, None]
         for device in DEVICES:
-            x = torch.ones((1, 1, 28, 28), device=device)
-            for also_prune_bias in [True, False]:
-                self._test_conv2d_on_device(
-                    Conv2dPoolFlattenFunctional(),
-                    config,
-                    x,
-                    shape,
-                    torch.device(device),
-                    also_prune_bias,
-                )
-                self._test_conv2d_on_device(
-                    Conv2dPoolFlatten(),
-                    config,
-                    x,
-                    shape,
-                    torch.device(device),
-                    also_prune_bias,
-                )
+            for model, config in zip(models, configs):
+                self._test_step_conv2d_on_device(model, config, torch.device(device))
diff --git a/torch/ao/pruning/_experimental/pruner/__init__.py b/torch/ao/pruning/_experimental/pruner/__init__.py
index d762873277493..e9b17f6c7aad7 100644
--- a/torch/ao/pruning/_experimental/pruner/__init__.py
+++ b/torch/ao/pruning/_experimental/pruner/__init__.py
@@ -3,3 +3,9 @@
     FakeStructuredSparsity,
     BiasHook,
 )
+
+__all__ = [
+    "FakeStructuredSparsity",
+    "BaseStructuredSparsifier",
+    "BiasHook",
+]
diff --git a/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
index 3b568f1557d07..e753d2a6d88da 100644
--- a/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
+++ b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
@@ -1,193 +1,17 @@
-from itertools import chain
+from typing import Set, Type
 import torch
-import torch.nn.functional as F
 from torch import nn
-from torch.fx import symbolic_trace
 from torch.nn.utils import parametrize
-from typing import Type, Set, Dict, Callable, Tuple, Optional, Union
 
 from torch.ao.pruning import BaseSparsifier
 from .parametrization import FakeStructuredSparsity, BiasHook
-from .match_utils import apply_match
-from .prune_functions import (
-    prune_linear,
-    prune_linear_linear,
-    prune_linear_activation_linear,
-    prune_conv2d,
-    prune_conv2d_conv2d,
-    prune_conv2d_activation_conv2d,
-    prune_conv2d_activation_pool_conv2d,
-    prune_conv2d_pool_activation_conv2d,
-    prune_conv2d_pool_flatten_linear,
-)
 
+__all__ = ["BaseStructuredSparsifier"]
 
-def _get_supported_structured_pruning_modules():
-    SUPPORTED_STRUCTURED_PRUNING_MODULES = {  # added to config if None given
-        nn.Linear,
-        nn.Conv2d,
-    }
-    return SUPPORTED_STRUCTURED_PRUNING_MODULES
-
-
-def _get_supported_activation_functions():
-    SUPPORTED_ACTIVATION_FUNCTIONS = {
-        F.relu,
-        F.rrelu,
-        F.hardtanh,
-        F.relu6,
-        F.sigmoid,
-        F.hardsigmoid,
-        F.tanh,
-        F.silu,
-        F.mish,
-        F.hardswish,
-        F.elu,
-        F.celu,
-        F.selu,
-        F.hardshrink,
-        F.leaky_relu,
-        F.logsigmoid,
-        F.softplus,
-        F.prelu,
-        F.softsign,
-        F.tanhshrink,
-    }
-    return SUPPORTED_ACTIVATION_FUNCTIONS
-
-
-def _get_supported_activation_modules():
-    SUPPORTED_ACTIVATION_MODULES = {
-        nn.ReLU,
-        nn.RReLU,
-        nn.Hardtanh,
-        nn.ReLU6,
-        nn.Sigmoid,
-        nn.Hardsigmoid,
-        nn.Tanh,
-        nn.SiLU,
-        nn.Mish,
-        nn.Hardswish,
-        nn.ELU,
-        nn.CELU,
-        nn.SELU,
-        nn.Hardshrink,
-        nn.LeakyReLU,
-        nn.LogSigmoid,
-        nn.Softplus,
-        nn.PReLU,
-        nn.Softsign,
-        nn.Tanhshrink,
-    }
-    return SUPPORTED_ACTIVATION_MODULES
-
-
-def _get_default_structured_pruning_patterns() -> Dict[
-    Tuple[Union[Type[nn.Module], Callable[[torch.Tensor], torch.Tensor], str], ...],
-    Callable[..., None],
-]:
-    """
-    Returns the patterns for conv2d / linear conversion for each element in the activation functions/modules defined above.
-    """
-    patterns: Dict[
-        Tuple[Union[Type[nn.Module], Callable[[torch.Tensor], torch.Tensor], str], ...],
-        Callable[..., None],
-    ] = {
-        # linear -> linear
-        (nn.Linear, "output"): prune_linear,
-        (nn.Linear, nn.Linear): prune_linear_linear,
-        # conv2d -> conv2d
-        (nn.Conv2d, "output"): prune_conv2d,
-        (nn.Conv2d, nn.Conv2d): prune_conv2d_conv2d,
-    }
-
-    for activation in chain(
-        _get_supported_activation_functions(), _get_supported_activation_modules()
-    ):
-        patterns.update(
-            {
-                # linear -> activation -> linear
-                (nn.Linear, activation, nn.Linear): prune_linear_activation_linear,
-                # conv2d -> activation -> conv2d
-                (nn.Conv2d, activation, nn.Conv2d): prune_conv2d_activation_conv2d,
-                # conv2d -> activation -> pool -> conv2d
-                (
-                    nn.Conv2d,
-                    activation,
-                    nn.AvgPool2d,
-                    nn.Conv2d,
-                ): prune_conv2d_activation_pool_conv2d,
-                (
-                    nn.Conv2d,
-                    activation,
-                    F.avg_pool2d,
-                    nn.Conv2d,
-                ): prune_conv2d_activation_pool_conv2d,
-                (
-                    nn.Conv2d,
-                    activation,
-                    nn.MaxPool2d,
-                    nn.Conv2d,
-                ): prune_conv2d_activation_pool_conv2d,
-                (
-                    nn.Conv2d,
-                    activation,
-                    F.max_pool2d,
-                    nn.Conv2d,
-                ): prune_conv2d_activation_pool_conv2d,
-                # conv2d -> pool -> activation -> conv2d
-                (
-                    nn.Conv2d,
-                    nn.AvgPool2d,
-                    activation,
-                    nn.Conv2d,
-                ): prune_conv2d_pool_activation_conv2d,
-                (
-                    nn.Conv2d,
-                    F.avg_pool2d,
-                    activation,
-                    nn.Conv2d,
-                ): prune_conv2d_pool_activation_conv2d,
-                (
-                    nn.Conv2d,
-                    nn.MaxPool2d,
-                    activation,
-                    nn.Conv2d,
-                ): prune_conv2d_pool_activation_conv2d,
-                (
-                    nn.Conv2d,
-                    F.max_pool2d,
-                    activation,
-                    nn.Conv2d,
-                ): prune_conv2d_pool_activation_conv2d,
-                # conv2d -> adaptive pool -> flatten -> linear
-                (
-                    nn.Conv2d,
-                    nn.AdaptiveAvgPool2d,
-                    nn.Flatten,
-                    nn.Linear,
-                ): prune_conv2d_pool_flatten_linear,
-                (
-                    nn.Conv2d,
-                    nn.AdaptiveAvgPool2d,
-                    torch.flatten,
-                    nn.Linear,
-                ): prune_conv2d_pool_flatten_linear,
-                (
-                    nn.Conv2d,
-                    nn.AdaptiveMaxPool2d,
-                    nn.Flatten,
-                    nn.Linear,
-                ): prune_conv2d_pool_flatten_linear,
-                (
-                    nn.Conv2d,
-                    nn.AdaptiveMaxPool2d,
-                    torch.flatten,
-                    nn.Linear,
-                ): prune_conv2d_pool_flatten_linear,
-            }
-        )
-    return patterns
+SUPPORTED_STRUCTURED_PRUNING_MODULES = {  # added to config if None given
+    nn.Linear,
+    nn.Conv2d,
+}
 
 
 class BaseStructuredSparsifier(BaseSparsifier):
@@ -203,20 +27,17 @@ class BaseStructuredSparsifier(BaseSparsifier):
             be updated.
     """
 
-    def __init__(self, defaults, patterns=None):
+    def __init__(self, defaults):
         super().__init__(defaults)
-        if patterns is None:
-            patterns = _get_default_structured_pruning_patterns()
-        self.patterns = patterns
 
     def make_config_from_model(
         self,
         model: nn.Module,
-        SUPPORTED_MODULES: Optional[Set[Type]] = None,
+        SUPPORTED_MODULES: Set[Type] = SUPPORTED_STRUCTURED_PRUNING_MODULES,
     ) -> None:
-        if SUPPORTED_MODULES is None:
-            SUPPORTED_MODULES = _get_supported_structured_pruning_modules()
-        super().make_config_from_model(model, SUPPORTED_MODULES=SUPPORTED_MODULES)
+        super().make_config_from_model(
+            model, SUPPORTED_MODULES=SUPPORTED_STRUCTURED_PRUNING_MODULES
+        )
 
     def _prepare(self, *args, **kwargs) -> None:
         r"""This function will attach the FakeStructuredSparsity parameterizations
@@ -236,59 +57,18 @@ def _prepare(self, *args, **kwargs) -> None:
             )
             self.state[config["tensor_fqn"]]["mask"] = mask
             parametrize.register_parametrization(
-                module, tensor_name, parametrization(mask)
+                module, tensor_name, parametrization(mask), unsafe=True
             )
+
             prune_bias = config.get("prune_bias", True)
-            if module.bias is not None:
+            if prune_bias and module.bias is not None:
                 module.register_parameter("_bias", nn.Parameter(module.bias.detach()))
                 module.bias = None
-                module.prune_bias = prune_bias
-
             self.bias_handles.append(
                 module.register_forward_hook(
                     BiasHook(module.parametrizations.weight[0], prune_bias)
                 )
             )
 
-    def prune(self) -> None:
-        r"""
-        This function will FX symbolically trace the model and then find instances of the patterns
-        defined in self.patterns (by default SUPPORTED_STRUCTURED_PRUNING_PATTERNS ).
-
-        For each pattern, it will apply to corresponding conversion function, which will modify the output
-        and input size expected by the modules within the pattern
-        """
-
-        self.traced = symbolic_trace(self.model)
-        modules = dict(self.traced.named_modules())
-
-        # Right now we check for matches simply by iterating across all the patterns
-        # if this is slow we can store patterns in a trie-structure and modify this code for faster lookup
-
-        for node in self.traced.graph.nodes:
-            for pattern, convert_fn in self.patterns.items():
-                matched = apply_match(modules, pattern, node, [])
-                if matched is None:
-                    continue
-
-                first_module = modules.get(node.target)
-                # check if first module exists and has apropriate parameterization, otherwise skip
-                if (
-                    first_module is not None
-                    and parametrize.is_parametrized(first_module)
-                    and isinstance(
-                        first_module.parametrizations["weight"][0],
-                        FakeStructuredSparsity,
-                    )
-                ):
-                    convert_block = []
-                    for node in matched:
-                        if node.op == "call_module":
-                            convert_block.append(modules.get(node.target))
-                        elif node.op == "call_function":
-                            convert_block.append(node.target)
-                    convert_fn(*convert_block)
-
-        self.traced.graph.lint()
-        self.traced.recompile()
-        return self.traced
+    def convert(self):
+        pass
diff --git a/torch/ao/pruning/_experimental/pruner/match_utils.py b/torch/ao/pruning/_experimental/pruner/match_utils.py
deleted file mode 100644
index f712a99ec535e..0000000000000
--- a/torch/ao/pruning/_experimental/pruner/match_utils.py
+++ /dev/null
@@ -1,59 +0,0 @@
-"""
-Contains utility functions to check if a pattern is in the graph and return the matching nodes
-"""
-import torch
-from torch import nn
-from torch.ao.quantization.fx.match_utils import (
-    MatchAllNode,
-)
-from torch.fx import Node
-from torch.nn.utils import parametrize
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-def _match(modules: Dict[str, nn.ModuleDict], node: Node, current: Union[nn.Module, Any]) -> bool:
-    r"""
-    checks to see if a single node of a pattern matches
-    """
-    if isinstance(current, type) and issubclass(current, MatchAllNode):
-        return True
-    if not isinstance(node, Node):
-        return False
-    if isinstance(current, type) and issubclass(current, torch.nn.Module):
-        return (
-            node.op == "call_module"
-            and parametrize.type_before_parametrizations(modules[node.target])
-            == current
-        )
-    elif callable(current):
-        return node.op == "call_function" and node.target is current
-    elif isinstance(current, str):
-        return node.target == current
-    return False
-
-def apply_match(
-    modules: Dict[str, nn.ModuleDict],
-    pattern: Union[Tuple[Any], Any],
-    node: Node,
-    matched_node_pattern: List[Node],
-) -> Optional[List[Node]]:
-    r"""
-    This function will return the matched nodes if the pattern matches the node given
-    If there is no match, it will return None
-    """
-    if isinstance(pattern, tuple):
-        if len(pattern) == 1:
-            if _match(modules, node, pattern[0]):
-                return matched_node_pattern + [node]
-
-        first, *rest = pattern
-        if _match(modules, node, first):
-            if rest is None:
-                return matched_node_pattern + [node]
-
-            for user in node.users:
-                return apply_match(
-                    modules, tuple(rest), user, matched_node_pattern + [node]
-                )
-    elif _match(modules, node, pattern):
-        return [node]
-    return None
diff --git a/torch/ao/pruning/_experimental/pruner/parametrization.py b/torch/ao/pruning/_experimental/pruner/parametrization.py
index aeddd0a841525..2ea59d48ee809 100644
--- a/torch/ao/pruning/_experimental/pruner/parametrization.py
+++ b/torch/ao/pruning/_experimental/pruner/parametrization.py
@@ -1,6 +1,7 @@
 import torch
 from torch import nn
 
+__all__ = ['FakeStructuredSparsity', 'BiasHook']
 
 
 # Structured Pruning Parameterizations
diff --git a/torch/ao/pruning/_experimental/pruner/prune_functions.py b/torch/ao/pruning/_experimental/pruner/prune_functions.py
deleted file mode 100644
index ee8bffb7f9f3e..0000000000000
--- a/torch/ao/pruning/_experimental/pruner/prune_functions.py
+++ /dev/null
@@ -1,359 +0,0 @@
-"""
-Collection of conversion functions for linear / conv2d structured pruning
-Also contains utilities for bias propogation
-"""
-from typing import cast, Optional, Callable, Tuple
-
-import torch
-from torch import nn, Tensor
-from torch.nn.utils import parametrize
-from torch.nn.utils.parametrize import ParametrizationList
-from .parametrization import FakeStructuredSparsity, BiasHook
-
-
-# BIAS PROPOGATION
-def _remove_bias_handles(module: nn.Module) -> None:
-    if hasattr(module, "_forward_hooks"):
-        bias_hooks = []
-        for key, hook in module._forward_hooks.items():
-            if isinstance(hook, BiasHook):
-                bias_hooks.append(key)
-
-        for key in bias_hooks:
-            del module._forward_hooks[key]
-
-
-def _get_adjusted_next_layer_bias(
-    next_layer: nn.Module, pruned_biases: Tensor, mask: Tensor
-) -> nn.Parameter:
-    r"""Returns new adjusted bias for the second supported module"""
-    if parametrize.is_parametrized(next_layer):
-        # need to access original weight
-        parametrization_dict = cast(nn.ModuleDict, next_layer.parametrizations)
-        weight_parameterizations = cast(
-            ParametrizationList, parametrization_dict.weight
-        )
-        next_weight = weight_parameterizations.original
-    else:
-        next_weight = cast(Tensor, next_layer.weight)
-
-    scaling_weight = next_weight[:, ~mask]
-    if isinstance(next_layer, nn.Conv2d):  # checking for Conv2d
-        # Propagating first layer pruned biases and calculating the new second layer bias
-        # involves more steps since the Conv2d scaling weight has extra dimensions,
-        # so adding bias involves broadcasting, logically:
-        # for each channel k in range(oC):
-        #     scaled_biases = sum(first_bias[pruned_idx] @ next_weight[k, pruned_idx, :, :].T)
-        #     new_next_bias[k] = old_next_bias[k] + scaled_biases
-        scaling_product = torch.matmul(
-            pruned_biases.reshape(1, -1), torch.transpose(scaling_weight, 1, 2)
-        )
-        sum_range = list(range(len(scaling_product.shape)))[
-            1:
-        ]  # all but the first dimension
-        scaled_biases = torch.sum(scaling_product, sum_range)
-    elif isinstance(next_layer, nn.Linear):  # Linear
-        scaled_biases = torch.matmul(
-            pruned_biases, torch.transpose(scaling_weight, 0, 1)
-        )  # recall b2_new = b1 @ w2.T + b2
-    else:
-        raise NotImplementedError(f"Type {type(next_layer)} not supported yet.")
-
-    if (
-        parametrize.is_parametrized(next_layer)
-        and getattr(next_layer, "_bias", None) is not None
-    ):  # next_layer is parametrized & has original bias ._bias
-        adjusted_bias = nn.Parameter(scaled_biases + next_layer._bias)
-    elif (
-        not parametrize.is_parametrized(next_layer) and next_layer.bias is not None
-    ):  # next_layer not parametrized & has .bias
-        adjusted_bias = nn.Parameter(scaled_biases + next_layer.bias)
-    else:  # next_layer has no bias
-        adjusted_bias = nn.Parameter(scaled_biases)
-    return adjusted_bias
-
-
-def _prune_module_bias(module: nn.Module, mask: Tensor) -> None:
-    r"""Applies mask to given modules bias"""
-    # prune bias along with weights, discard pruned indices of bias
-    original_bias = cast(Tensor, getattr(module, "_bias", module.bias))
-    if original_bias is not None:
-        module.bias = nn.Parameter(original_bias[mask])
-
-    #  remove _bias parameter
-    if hasattr(module, "_bias"):
-        delattr(module, "_bias")
-
-
-def _propogate_module_bias(module: nn.Module, mask: Tensor) -> Optional[Tensor]:
-    r"""
-    In the case that we need to propogate biases, this function will return the biases we need
-    """
-    # set current module bias
-    if module.bias is not None:
-        module.bias = nn.Parameter(cast(Tensor, module.bias)[mask])
-    elif getattr(module, "_bias", None) is not None:
-        module.bias = nn.Parameter(cast(Tensor, module._bias)[mask])
-
-    # get pruned biases to propogate to subsequent layer
-    if getattr(module, "_bias", None) is not None:
-        pruned_biases = cast(Tensor, module._bias)[~mask]
-    else:
-        pruned_biases = None
-
-    if hasattr(module, "_bias"):
-        delattr(module, "_bias")
-
-    return pruned_biases
-
-
-# LINEAR
-def _prune_linear_helper(linear: nn.Linear) -> Tensor:
-    # expects linear to be a parameterized linear module
-    parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
-    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
-    for p in weight_parameterizations:
-        if isinstance(p, FakeStructuredSparsity):
-            mask = cast(Tensor, p.mask)
-
-    with torch.no_grad():
-        parametrize.remove_parametrizations(linear, "weight", leave_parametrized=True)
-        linear.weight = nn.Parameter(linear.weight[mask])
-    linear.out_features = linear.weight.shape[0]
-    _remove_bias_handles(linear)
-
-    return mask
-
-
-def prune_linear(linear: nn.Linear) -> None:
-    mask = _prune_linear_helper(linear)
-    if getattr(linear, "prune_bias", False):
-        _prune_module_bias(linear, mask)
-
-
-def prune_linear_linear(linear1: nn.Linear, linear2: nn.Linear) -> None:
-    prune_linear_activation_linear(linear1, None, linear2)
-
-
-def prune_linear_activation_linear(
-    linear1: nn.Linear,
-    activation: Optional[Callable[[Tensor], Tensor]],
-    linear2: nn.Linear,
-):
-    mask = _prune_linear_helper(linear1)
-    if getattr(linear1, "prune_bias", False):
-        _prune_module_bias(linear1, mask)
-    else:
-        pruned_biases = _propogate_module_bias(linear1, mask)
-        if pruned_biases is not None:
-            if activation:
-                pruned_biases = activation(pruned_biases)
-            linear2.bias = _get_adjusted_next_layer_bias(linear2, pruned_biases, mask)
-
-    with torch.no_grad():
-        if parametrize.is_parametrized(linear2):
-            parametrization_dict = cast(nn.ModuleDict, linear2.parametrizations)
-            weight_parameterizations = cast(
-                ParametrizationList, parametrization_dict.weight
-            )
-
-            weight_parameterizations.original = nn.Parameter(
-                weight_parameterizations.original[:, mask]
-            )
-            linear2.in_features = weight_parameterizations.original.shape[1]
-        else:
-            linear2.weight = nn.Parameter(linear2.weight[:, mask])
-            linear2.in_features = linear2.weight.shape[1]
-
-
-# CONV2D
-def _prune_conv2d_helper(conv2d: nn.Conv2d) -> Tensor:
-    parametrization_dict = cast(nn.ModuleDict, conv2d.parametrizations)
-    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
-    for p in weight_parameterizations:
-        if isinstance(p, FakeStructuredSparsity):
-            mask = cast(Tensor, p.mask)
-
-    with torch.no_grad():
-        parametrize.remove_parametrizations(conv2d, "weight", leave_parametrized=True)
-        conv2d.weight = nn.Parameter(conv2d.weight[mask])
-    conv2d.out_channels = conv2d.weight.shape[0]
-
-    _remove_bias_handles(conv2d)
-    return mask
-
-
-def prune_conv2d_padded(conv2d_1: nn.Conv2d) -> None:
-    parametrization_dict = cast(nn.ModuleDict, conv2d_1.parametrizations)
-    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
-    for p in weight_parameterizations:
-        if isinstance(p, FakeStructuredSparsity):
-            mask = cast(Tensor, p.mask)
-
-    with torch.no_grad():
-        parametrize.remove_parametrizations(conv2d_1, "weight", leave_parametrized=True)
-
-    if getattr(conv2d_1, "_bias", None) is not None:
-        if (
-            conv2d_1.bias is not None
-        ):  # conv2d_1 has original bias and bias propagated from previous layer
-            new_bias = torch.zeros(conv2d_1.bias.shape)
-            new_bias[mask] = conv2d_1.bias[mask]
-            # adjusted bias that to keep in conv2d_1
-            new_bias[~mask] = cast(Tensor, conv2d_1._bias)[~mask]
-            # pruned biases that are kept instead of propagated
-            conv2d_1.bias = nn.Parameter(new_bias)
-        else:  # conv2d_1 has only original bias
-            conv2d_1.bias = nn.Parameter(cast(Tensor, conv2d_1._bias))
-    else:
-        # no original bias, only propagated bias
-        if (
-            conv2d_1.bias is not None
-        ):  # conv2d_1 has bias propagated from previous layer
-            conv2d_1.bias.data[~mask] = 0
-
-    if hasattr(conv2d_1, "_bias"):
-        delattr(conv2d_1, "_bias")
-
-
-def prune_conv2d(conv2d: nn.Conv2d) -> None:
-    mask = _prune_conv2d_helper(conv2d)
-    if getattr(conv2d, "prune_bias", False):
-        _prune_module_bias(conv2d, mask)
-
-
-def prune_conv2d_conv2d(conv2d_1: nn.Conv2d, conv2d_2: nn.Conv2d) -> None:
-    prune_conv2d_activation_conv2d(conv2d_1, None, conv2d_2)
-
-
-def prune_conv2d_activation_conv2d(
-    conv2d_1: nn.Conv2d,
-    activation: Optional[Callable[[Tensor], Tensor]],
-    conv2d_2: nn.Conv2d,
-):
-    r"""
-    Fusion Pattern for conv2d -> some activation module / function -> conv2d layers
-    """
-    parametrization_dict = cast(nn.ModuleDict, conv2d_1.parametrizations)
-    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
-    for p in weight_parameterizations:
-        if isinstance(p, FakeStructuredSparsity):
-            mask = cast(Tensor, p.mask)
-
-    prune_bias = getattr(conv2d_1, "prune_bias", False)
-    if (
-        hasattr(conv2d_2, "padding")
-        and cast(Tuple[int], conv2d_2.padding) > (0, 0)
-        and (conv2d_1.bias is not None or getattr(conv2d_1, "_bias", None) is not None)
-    ):
-        prune_conv2d_padded(conv2d_1)
-    else:
-        mask = _prune_conv2d_helper(conv2d_1)
-        if prune_bias:
-            _prune_module_bias(conv2d_1, mask)
-        else:
-            pruned_biases = _propogate_module_bias(conv2d_1, mask)
-            if pruned_biases is not None:
-                if activation:
-                    pruned_biases = activation(pruned_biases)
-                conv2d_2.bias = _get_adjusted_next_layer_bias(
-                    conv2d_2, pruned_biases, mask
-                )
-
-        if (
-            not (
-                hasattr(conv2d_2, "padding")
-                and cast(Tuple[int], conv2d_2.padding) > (0, 0)
-            )
-            or conv2d_1.bias is None
-        ):
-            with torch.no_grad():
-                if parametrize.is_parametrized(conv2d_2):
-                    parametrization_dict = cast(
-                        nn.ModuleDict, conv2d_2.parametrizations
-                    )
-                    weight_parameterizations = cast(
-                        ParametrizationList, parametrization_dict.weight
-                    )
-                    weight_parameterizations.original = nn.Parameter(
-                        weight_parameterizations.original[:, mask]
-                    )
-                    conv2d_2.in_channels = weight_parameterizations.original.shape[1]
-                else:
-                    conv2d_2.weight = nn.Parameter(conv2d_2.weight[:, mask])
-                    conv2d_2.in_channels = conv2d_2.weight.shape[1]
-
-
-def prune_conv2d_pool_activation_conv2d(
-    c1: nn.Conv2d,
-    pool: nn.Module,
-    activation: Optional[Callable[[Tensor], Tensor]],
-    c2: nn.Conv2d,
-) -> None:
-    prune_conv2d_activation_conv2d(c1, activation, c2)
-
-
-def prune_conv2d_activation_pool_conv2d(
-    c1: nn.Conv2d,
-    activation: Optional[Callable[[Tensor], Tensor]],
-    pool: nn.Module,
-    c2: nn.Conv2d,
-) -> None:
-    prune_conv2d_activation_conv2d(c1, activation, c2)
-
-
-def prune_conv2d_pool_flatten_linear(
-    conv2d: nn.Conv2d,
-    pool: nn.Module,
-    flatten: Optional[Callable[[Tensor], Tensor]],
-    linear: nn.Linear,
-) -> None:
-    mask = _prune_conv2d_helper(conv2d)
-
-    # We map the pruned indices of the Conv2d output to the flattened indices of the Linear following the Flatten layer.
-    # we determine the flattening scale (h * w), and readjust `first_pruned_indices`
-    # (each idx maps to range idx * h * w to (idx+1) * h * w), `first_valid_indices`,
-    # and `pruned_biases` (repeat each bias by h * w).
-    if parametrize.is_parametrized(linear):
-        parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
-        weight_parameterizations = cast(
-            ParametrizationList, parametrization_dict.weight
-        )
-        linear_ic = weight_parameterizations.original.shape[1]
-    else:
-        linear_ic = linear.weight.shape[1]
-
-    conv2d_oc = len(mask)
-    assert (
-        linear_ic % conv2d_oc == 0
-    ), f"Flattening from dimensions {conv2d_oc} to {linear_ic} not supported"
-
-    flatten_scale = linear_ic // conv2d_oc
-    flattened_mask = torch.tensor(
-        [[val] * flatten_scale for val in mask], dtype=torch.bool, device=mask.device
-    ).flatten()
-
-    if getattr(conv2d, "prune_bias", False):
-        _prune_module_bias(conv2d, mask)
-    else:
-        pruned_biases = cast(Tensor, _propogate_module_bias(conv2d, mask))
-        flattened_pruned_biases = torch.tensor(
-            [[bias] * flatten_scale for bias in pruned_biases], device=mask.device
-        ).flatten()
-        linear.bias = _get_adjusted_next_layer_bias(
-            linear, flattened_pruned_biases, flattened_mask
-        )
-
-    with torch.no_grad():
-        if parametrize.is_parametrized(linear):
-            parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
-            weight_parameterizations = cast(
-                ParametrizationList, parametrization_dict.weight
-            )
-            weight_parameterizations.original = nn.Parameter(
-                weight_parameterizations.original[:, flattened_mask]
-            )
-            linear.in_features = weight_parameterizations.original.shape[1]
-        else:
-            linear.weight = nn.Parameter(linear.weight[:, flattened_mask])
-            linear.in_features = linear.weight.shape[1]
diff --git a/torch/testing/_internal/common_pruning.py b/torch/testing/_internal/common_pruning.py
deleted file mode 100644
index 8fc08ee2a41bf..0000000000000
--- a/torch/testing/_internal/common_pruning.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# -*- coding: utf-8 -*-
-# Owner(s): ["module: unknown"]
-
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-
-class SimpleLinear(nn.Module):
-    r"""Model with only Linear layers without biases, some wrapped in a Sequential,
-    some following the Sequential. Used to test basic pruned Linear-Linear fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=False),
-            nn.Linear(5, 6, bias=False),
-            nn.Linear(6, 4, bias=False),
-        )
-        self.linear1 = nn.Linear(4, 3, bias=False)
-        self.linear2 = nn.Linear(3, 10, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear1(x)
-        x = self.linear2(x)
-        return x
-
-
-class LinearBias(nn.Module):
-    r"""Model with only Linear layers, alternating layers with biases,
-    wrapped in a Sequential. Used to test pruned Linear-Bias-Linear fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=True),
-            nn.Linear(5, 6, bias=False),
-            nn.Linear(6, 3, bias=True),
-            nn.Linear(3, 3, bias=True),
-            nn.Linear(3, 10, bias=False),
-        )
-
-    def forward(self, x):
-        x = self.seq(x)
-        return x
-
-
-class LinearActivation(nn.Module):
-    r"""Model with only Linear layers, some with bias, some in a Sequential and some following.
-    Activation functions modules in between each Linear in the Sequential, and each outside layer.
-    Used to test pruned Linear(Bias)-Activation-Linear fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=True),
-            nn.ReLU(),
-            nn.Linear(5, 6, bias=False),
-            nn.Tanh(),
-            nn.Linear(6, 4, bias=True),
-        )
-        self.linear1 = nn.Linear(4, 3, bias=True)
-        self.act1 = nn.ReLU()
-        self.linear2 = nn.Linear(3, 10, bias=False)
-        self.act2 = nn.Tanh()
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear1(x)
-        x = self.act1(x)
-        x = self.linear2(x)
-        x = self.act2(x)
-        return x
-
-
-class LinearActivationFunctional(nn.Module):
-    r"""Model with only Linear layers, some with bias, some in a Sequential and some following.
-    Activation functions modules in between each Linear in the Sequential, and functional
-    activationals are called in between each outside layer.
-    Used to test pruned Linear(Bias)-Activation-Linear fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=True),
-            nn.ReLU(),
-            nn.Linear(5, 6, bias=False),
-            nn.ReLU(),
-            nn.Linear(6, 4, bias=True),
-        )
-        self.linear1 = nn.Linear(4, 3, bias=True)
-        self.linear2 = nn.Linear(3, 8, bias=False)
-        self.linear3 = nn.Linear(8, 10, bias=False)
-        self.act1 = nn.ReLU()
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear1(x)
-        x = F.relu(x)
-        x = self.linear2(x)
-        x = F.relu(x)
-        x = self.linear3(x)
-        x = F.relu(x)
-        return x
-
-
-class SimpleConv2d(nn.Module):
-    r"""Model with only Conv2d layers, all without bias, some in a Sequential and some following.
-    Used to test pruned Conv2d-Conv2d fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=False),
-            nn.Conv2d(32, 64, 3, 1, bias=False),
-        )
-        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, bias=False)
-        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d1(x)
-        x = self.conv2d2(x)
-        return x
-
-
-class Conv2dBias(nn.Module):
-    r"""Model with only Conv2d layers, some with bias, some in a Sequential and some outside.
-    Used to test pruned Conv2d-Bias-Conv2d fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=True),
-            nn.Conv2d(32, 32, 3, 1, bias=True),
-            nn.Conv2d(32, 64, 3, 1, bias=False),
-        )
-        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, bias=True)
-        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d1(x)
-        x = self.conv2d2(x)
-        return x
-
-
-class Conv2dActivation(nn.Module):
-    r"""Model with only Conv2d layers, some with bias, some in a Sequential and some following.
-    Activation function modules in between each Sequential layer, functional activations called
-    in-between each outside layer.
-    Used to test pruned Conv2d-Bias-Activation-Conv2d fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=True),
-            nn.ReLU(),
-            nn.Conv2d(32, 64, 3, 1, bias=True),
-            nn.Tanh(),
-            nn.Conv2d(64, 64, 3, 1, bias=False),
-            nn.ReLU(),
-        )
-        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, bias=False)
-        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d1(x)
-        x = F.relu(x)
-        x = self.conv2d2(x)
-        x = F.hardtanh(x)
-        return x
-
-
-class Conv2dPadBias(nn.Module):
-    r"""Model with only Conv2d layers, all with bias and some with padding > 0,
-    some in a Sequential and some following. Activation function modules in between each layer.
-    Used to test that bias is propagated correctly in the special case of
-    pruned Conv2d-Bias-(Activation)Conv2d fusion, when the second Conv2d layer has padding > 0."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, padding=1, bias=True),
-            nn.ReLU(),
-            nn.Conv2d(32, 32, 3, 1, bias=False),
-            nn.ReLU(),
-            nn.Conv2d(32, 32, 3, 1, padding=1, bias=True),
-            nn.ReLU(),
-            nn.Conv2d(32, 32, 3, 1, padding=1, bias=True),
-            nn.ReLU(),
-            nn.Conv2d(32, 64, 3, 1, bias=True),
-            nn.Tanh(),
-        )
-        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, padding=1, bias=True)
-        self.act1 = nn.ReLU()
-        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, padding=1, bias=True)
-        self.act2 = nn.Tanh()
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d1(x)
-        x = self.act1(x)
-        x = self.conv2d2(x)
-        x = self.act2(x)
-        return x
-
-
-class Conv2dPool(nn.Module):
-    r"""Model with only Conv2d layers, all with bias, some in a Sequential and some following.
-    Activation function modules in between each layer, Pool2d modules in between each layer.
-    Used to test pruned Conv2d-Pool2d-Conv2d fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, kernel_size=3, padding=1, bias=True),
-            nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
-            nn.ReLU(),
-            nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=True),
-            nn.Tanh(),
-            nn.AvgPool2d(kernel_size=2, stride=2, padding=1),
-        )
-        self.conv2d1 = nn.Conv2d(64, 48, kernel_size=3, padding=1, bias=True)
-        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2, padding=1)
-        self.af1 = nn.ReLU()
-        self.conv2d2 = nn.Conv2d(48, 52, kernel_size=3, padding=1, bias=True)
-        self.conv2d3 = nn.Conv2d(52, 52, kernel_size=3, padding=1, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d1(x)
-        x = self.maxpool(x)
-        x = self.af1(x)
-        x = self.conv2d2(x)
-        x = F.avg_pool2d(x, kernel_size=2, stride=2, padding=1)
-        x = F.relu(x)
-        x = self.conv2d3(x)
-        return x
-
-
-class Conv2dPoolFlattenFunctional(nn.Module):
-    r"""Model with Conv2d layers, all with bias, some in a Sequential and some following, and then a Pool2d
-    and a functional Flatten followed by a Linear layer.
-    Activation functions and Pool2ds in between each layer also.
-    Used to test pruned Conv2d-Pool2d-Flatten-Linear fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 3, kernel_size=3, padding=1, bias=True),
-            nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
-            nn.ReLU(),
-            nn.Conv2d(3, 5, kernel_size=3, padding=1, bias=True),
-            nn.Tanh(),
-            nn.AvgPool2d(kernel_size=2, stride=2, padding=1),
-        )
-        self.conv2d1 = nn.Conv2d(5, 7, kernel_size=3, padding=1, bias=True)
-        self.af1 = nn.ReLU()
-        self.conv2d2 = nn.Conv2d(7, 11, kernel_size=3, padding=1, bias=True)
-        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
-        self.fc = nn.Linear(11, 13, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d1(x)
-        x = F.max_pool2d(x, kernel_size=2, stride=2, padding=1)
-        x = self.af1(x)
-        x = self.conv2d2(x)
-        x = self.avg_pool(x)
-        x = torch.flatten(x, 1)  # test functional flatten
-        x = self.fc(x)
-        return x
-
-
-class Conv2dPoolFlatten(nn.Module):
-    r"""Model with Conv2d layers, all with bias, some in a Sequential and some following, and then a Pool2d
-    and a Flatten module followed by a Linear layer.
-    Activation functions and Pool2ds in between each layer also.
-    Used to test pruned Conv2d-Pool2d-Flatten-Linear fusion."""
-
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 3, kernel_size=3, padding=1, bias=True),
-            nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
-            nn.ReLU(),
-            nn.Conv2d(3, 5, kernel_size=3, padding=1, bias=True),
-            nn.Tanh(),
-            nn.AvgPool2d(kernel_size=2, stride=2, padding=1),
-        )
-        self.conv2d1 = nn.Conv2d(5, 7, kernel_size=3, padding=1, bias=True)
-        self.af1 = nn.ReLU()
-        self.conv2d2 = nn.Conv2d(7, 11, kernel_size=3, padding=1, bias=True)
-        self.avg_pool = nn.AdaptiveAvgPool2d((2, 2))
-        self.flatten = nn.Flatten()
-        self.fc = nn.Linear(44, 13, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d1(x)
-        x = F.max_pool2d(x, kernel_size=2, stride=2, padding=1)
-        x = self.af1(x)
-        x = self.conv2d2(x)
-        x = self.avg_pool(x)
-        x = self.flatten(x)
-        x = self.fc(x)
-        return x

From e42286d10e3b7e31fc062cebed2a5a34c10bc8cb Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 7 Dec 2022 09:18:30 -0800
Subject: [PATCH 1677/1922] Fix some typed storage is deprecated warnings.
 (#89867)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89867
Approved by: https://github.com/albanD
---
 test/test_torch.py               | 15 +--------------
 torch/_functorch/aot_autograd.py | 12 ++++++------
 torch/_tensor.py                 |  2 +-
 torch/storage.py                 |  9 +++++----
 4 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/test/test_torch.py b/test/test_torch.py
index 6224c09977749..41e407b413f52 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -6534,26 +6534,13 @@ def test_typed_storage_deprecation_warning(self):
             with warnings.catch_warnings(record=True) as w:
                 warnings.resetwarnings()
                 f()
-                self.assertEqual(len(w), 1)
+                self.assertEqual(len(w), 1, msg=str([str(a) for a in w]))
                 warning = w[0].message
                 self.assertTrue(warning, DeprecationWarning)
                 self.assertTrue(re.search(
                     '^TypedStorage is deprecated',
                     str(warning)))
 
-        # Check that only one warning is raised from calling multiple
-        # TypedStorage functions if warnings are not reset between each
-        with warnings.catch_warnings(record=True) as w:
-            warnings.resetwarnings()
-            for f in funcs:
-                f()
-            self.assertEqual(len(w), 1)
-            warning = w[0].message
-            self.assertTrue(warning, DeprecationWarning)
-            self.assertTrue(re.search(
-                '^TypedStorage is deprecated',
-                str(warning)))
-
     def test_from_file(self):
         def assert_with_filename(filename):
             size = 10000
diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py
index 3eea7ac42f96b..2cd12e4f883fc 100644
--- a/torch/_functorch/aot_autograd.py
+++ b/torch/_functorch/aot_autograd.py
@@ -335,7 +335,7 @@ def inner(*args):
                 #    _x_updated_metadata = CompiledFunction.fw_metadata.metadata_mutation_input_info[0]
                 #    x.as_strided_(_x_updated_metadata.size(), _x_updated_metadata.stride(), _x_updated_metadata.storage_offset())
                 #    return out
-                if StorageWeakRef(arg.storage()) == StorageWeakRef(new_arg.storage()):
+                if StorageWeakRef(arg._storage()) == StorageWeakRef(new_arg._storage()):
                     # We can use the storage aliasing of the inputs and updated inputs
                     # to detect when an input was actually updated, or just inplace-viewed.
                     collect_mutated_input_info.append(MutationType.metadata_only)
@@ -406,7 +406,7 @@ def filter_and_record_aliased_outs(outputs):
             # This will be more complicated when you have multiple _base tensors aliasing the same
             # underlying storage, when we eventually handle that.
             # We'll need to ensure that we generate the view off of the right base.
-            inp_storage_refs = {StorageWeakRef(inpt.storage()): idx for idx, inpt in enumerate(flat_f_args)}
+            inp_storage_refs = {StorageWeakRef(inpt._storage()): idx for idx, inpt in enumerate(flat_f_args)}
             inp_tensor_ids = {id(inpt) for inpt in flat_f_args if isinstance(inpt, torch.Tensor)}
             inp_storage_refs_set = set(inp_storage_refs)
 
@@ -429,8 +429,8 @@ def filter_and_record_aliased_outs(outputs):
                 # Note: When detecting input/output aliasing, we NEED to do it using the outer FunctionalTensorWrapper objects.
                 # In the case where we mutate an input *and* return a view of it, the outer wrappers will still alias,
                 # but the inner tensors no longer alias.
-                if isinstance(o, torch.Tensor) and StorageWeakRef(o.storage()) in inp_storage_refs:
-                    aliased_inp_idx = inp_storage_refs[StorageWeakRef(o.storage())]
+                if isinstance(o, torch.Tensor) and StorageWeakRef(o._storage()) in inp_storage_refs:
+                    aliased_inp_idx = inp_storage_refs[StorageWeakRef(o._storage())]
                     is_exact_input = id(o) in inp_tensor_ids
                     aliases_intermediate_and_not_input = False
                     aliased_out_idx[o] = (aliased_inp_idx, aliases_intermediate_and_not_input, is_exact_input)
@@ -982,7 +982,7 @@ def merge_view_inputs(
     storage_ref_to_idx: Dict[StorageWeakRef, List[int]] = collections.defaultdict(list)
     for i, inpt in enumerate(fwd_inputs):
         if isinstance(inpt, Tensor):
-            storage_ref = StorageWeakRef(inpt.storage())
+            storage_ref = StorageWeakRef(inpt._storage())
             storage_ref_to_idx[storage_ref].append(i)
     base_args = []
     other_args = []
@@ -1018,7 +1018,7 @@ def merge_view_inputs(
             if len(non_none_bases) == 0:
                 # Case where none of the aliases require gradients
                 example_idx = aliased_input_indices[0]
-                synthetic_base = torch.Tensor(fwd_inputs[example_idx].storage())
+                synthetic_base = torch.Tensor(fwd_inputs[example_idx]._storage())
             else:
                 # Case where all of the aliases require gradients, and have the same _base.
                 synthetic_base = non_none_bases[0]
diff --git a/torch/_tensor.py b/torch/_tensor.py
index 6c441a04e92b5..5e31f565a023a 100644
--- a/torch/_tensor.py
+++ b/torch/_tensor.py
@@ -225,7 +225,7 @@ def storage(self):
         if has_torch_function_unary(self):
             return handle_torch_function(Tensor.storage, (self,), self)
 
-        torch.storage._warn_typed_storage_removal()
+        torch.storage._warn_typed_storage_removal(stacklevel=2)
         return self._typed_storage()
 
     # For internal use only, to avoid raising deprecation warning
diff --git a/torch/storage.py b/torch/storage.py
index 9cf61d626536a..38ec5238cd735 100644
--- a/torch/storage.py
+++ b/torch/storage.py
@@ -306,13 +306,14 @@ def _isint(x):
     else:
         return isinstance(x, int)
 
-def _warn_typed_storage_removal():
+def _warn_typed_storage_removal(stacklevel=2):
     message = (
         "TypedStorage is deprecated. It will be removed in the future and "
         "UntypedStorage will be the only storage class. This should only matter "
-        "to you if you are using storages directly."
+        "to you if you are using storages directly.  To access UntypedStorage "
+        "directly, use tensor._storage() instead of tensor.storage()"
     )
-    warnings.warn(message, UserWarning)
+    warnings.warn(message, UserWarning, stacklevel=stacklevel + 1)
 
 class TypedStorage:
     is_sparse = False
@@ -471,7 +472,7 @@ def __init__(self, *args, device=None, dtype=None, wrap_storage=None, _internal=
     @property
     def is_cuda(self):
         _warn_typed_storage_removal()
-        return self.device.type == 'cuda'
+        return self._untyped_storage.device.type == 'cuda'
 
     def untyped(self):
         """Returns the internal :class:`torch.UntypedStorage`"""

From 6b3d09ab2fe0917dd1eec793f04ecf526906aebf Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 7 Dec 2022 08:54:42 -0800
Subject: [PATCH 1678/1922] Fix AssertionError fake_mode is not None in
 distributed (#90392)

Fixes https://github.com/pytorch/pytorch/issues/90375

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90392
Approved by: https://github.com/voznesenskym
---
 test/distributed/test_dynamo_distributed.py | 12 ++++++++++++
 torch/_dynamo/optimizations/distributed.py  |  3 ++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 57fa5e0652709..ade7d92543995 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -194,6 +194,18 @@ def test_hf_bert_ddp_aot_eager(self):
         model = FakeDDP(model)
         run_hf_bert_ddp(self, model, inputs, "aot_eager")
 
+    @patch.object(config, "optimize_ddp", True)
+    def test_issue90375(self):
+        class Model(nn.Module):
+            def forward(self):
+                return torch.randn(3) * torch.randn(3)
+
+        model = Model()
+        model = FakeDDP(model)
+
+        opt_model = torch._dynamo.optimize("aot_eager")(model)
+        opt_model()
+
 
 # Are these tests failing?  Check and see if TestFakeDistributedSingleProc has a
 # single process version; if it's just a problem in the Dynamo distributed
diff --git a/torch/_dynamo/optimizations/distributed.py b/torch/_dynamo/optimizations/distributed.py
index 934a9abc674ed..f48ba500be59f 100644
--- a/torch/_dynamo/optimizations/distributed.py
+++ b/torch/_dynamo/optimizations/distributed.py
@@ -140,7 +140,8 @@ def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]):
         and returns its callable.
         """
         fake_mode = fake_mode_from_tensors(example_inputs)
-        assert fake_mode is not None
+        if fake_mode is None:
+            fake_mode = torch._subclasses.fake_tensor.FakeTensorMode()
 
         # 1: compute the partition map according to DDP bucket logic
         buckets = [Bucket()]  # (size, param_names)

From 51ccda677c117411977d98d89fb7d47b3ef94b20 Mon Sep 17 00:00:00 2001
From: Angela Yi <angelayi@meta.com>
Date: Wed, 7 Dec 2022 21:09:38 +0000
Subject: [PATCH 1679/1922] [fx] Copy codegen in legalize_graph (#90023)

Test Plan: CI

Differential Revision: D41666330

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90023
Approved by: https://github.com/SherlockNoMad
---
 torch/fx/passes/tools_common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/fx/passes/tools_common.py b/torch/fx/passes/tools_common.py
index 87ad477cdc2cf..0af6de5508a82 100644
--- a/torch/fx/passes/tools_common.py
+++ b/torch/fx/passes/tools_common.py
@@ -249,5 +249,6 @@ def legalize_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
     # a cycle (i.e. some node's dependencies were not satisfied.)
     if len(new_graph.nodes) < len(gm.graph.nodes):
         raise RuntimeError(f"Input graph has cycles, unable to add {[node for node in indeg if indeg[node] != 0]}")
+    new_graph._codegen = gm.graph._codegen
     gm.graph = new_graph
     return gm

From 8ec0772320c2070f5f63431502fd04519413921c Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Wed, 7 Dec 2022 21:17:35 +0000
Subject: [PATCH 1680/1922] [Vulkan] Enable copying QInt8 and QInt32 tensors
 from cpu to vulkan. (#90357)

Summary:
Copying QInt8 and QInt32 from cpu to vulkan:
 - Added shader nchw_to_image_int8
 - Added shader nchw_to_image_int32

Copying QInt8 and QInt32 from vulkan to cpu
Note: This functionality is currently disabled until issues on Android are resolved.
- Added shader image_to_nchw_int32
- QInt8 works with the same existing image_to_nchw_quantized shaders

Added multiple tests for each supported dtype:
- cpu_to_vulkan_and_dequantize:
These tests check the correctness of copying quantized cpu tensor to vulkan by comparing the output of the following:
  - cpu float tensor -> quantize -> to vulkan -> dequantize -> to cpu
  - cpu float tensor -> quantize -> dequantize
- cpu_to_vulkan_and_vulkan_to_cpu
(currently disabled until copying vulkan quantized to cpu is enabled):
These tests check the correctness of copying from cpu to vulkan and from vulkan to cpu by creating a random cpu float tensor, quantizing it, then copying it to vulkan, then back to cpu and comparing the output tensor to the original quantized tensor.
- quantize_per_tensor_and_vulkan_to_cpu
(currently disabled until copying vulkan quantized to cpu is enabled):
These tests check the correctness of copying quantized tensor from vulkan to cpu by comparing the output of the following:
  - cpu float tensor -> to vulkan -> quantize -> to cpu
  - cpu float tensor -> quantize

Test Plan:
On Mac
```
cd ~/fbsource
buck1 run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck1 build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_quantized_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_quantized_api_test
adb shell "/data/local/tmp/vulkan_quantized_api_test"
```

Reviewed By: kimishpatel

Differential Revision: D41654287

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90357
Approved by: https://github.com/SS-JIA
---
 .../vulkan/glsl/image_to_nchw_int32.glsl      |  52 +++
 .../vulkan/glsl/nchw_to_image_int32.glsl      |  55 +++
 .../vulkan/glsl/nchw_to_image_int8.glsl       |  85 +++++
 ...uantized.glsl => nchw_to_image_uint8.glsl} |   4 +-
 aten/src/ATen/native/vulkan/ops/Utils.cpp     |  34 +-
 .../ATen/test/vulkan_quantized_api_test.cpp   | 351 +++++++++++++++---
 6 files changed, 521 insertions(+), 60 deletions(-)
 create mode 100644 aten/src/ATen/native/vulkan/glsl/image_to_nchw_int32.glsl
 create mode 100644 aten/src/ATen/native/vulkan/glsl/nchw_to_image_int32.glsl
 create mode 100644 aten/src/ATen/native/vulkan/glsl/nchw_to_image_int8.glsl
 rename aten/src/ATen/native/vulkan/glsl/{nchw_to_image_quantized.glsl => nchw_to_image_uint8.glsl} (98%)

diff --git a/aten/src/ATen/native/vulkan/glsl/image_to_nchw_int32.glsl b/aten/src/ATen/native/vulkan/glsl/image_to_nchw_int32.glsl
new file mode 100644
index 0000000000000..f6f1a48105a7e
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/image_to_nchw_int32.glsl
@@ -0,0 +1,52 @@
+#version 450 core
+#define PRECISION $precision
+
+layout(std430) buffer;
+
+/*
+ * Input Sampler
+ */
+layout(set = 0, binding = 0) uniform PRECISION isampler3D uImage;
+
+/*
+ * Output Buffer
+ */
+layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer {
+  int data[];
+}
+uBuffer;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  // xyz contain the extents of the input texture, w contains HxW to help
+  // calculate buffer offsets
+  ivec4 in_extents;
+}
+uBlock;
+
+/*
+ * Local Work Group in_extents
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, uBlock.in_extents.xyz))) {
+    return;
+  }
+
+  const ivec4 intex = texelFetch(uImage, pos, 0);
+
+  const int base_index =
+      pos.x + uBlock.in_extents.x * pos.y + (4 * uBlock.in_extents.w) * pos.z;
+  const ivec4 buf_indices =
+      base_index + ivec4(0, 1, 2, 3) * uBlock.in_extents.w;
+
+  uBuffer.data[buf_indices.x] = intex.x;
+  uBuffer.data[buf_indices.y] = intex.y;
+  uBuffer.data[buf_indices.z] = intex.z;
+  uBuffer.data[buf_indices.w] = intex.w;
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image_int32.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image_int32.glsl
new file mode 100644
index 0000000000000..1d0eb65e2604a
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image_int32.glsl
@@ -0,0 +1,55 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+/*
+ * Output Image
+ */
+layout(set = 0, binding = 0, rgba32i) uniform PRECISION restrict writeonly iimage3D uImage;
+
+/*
+ * Input Buffer
+ */
+layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
+  int data[];
+}
+uBuffer;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  // xyz contain the extents of the input texture, w contains HxW to help
+  // calculate buffer offsets
+  ivec4 out_extents;
+}
+uBlock;
+
+/*
+ * Local Work Group Size
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, uBlock.out_extents.xyz))) {
+    return;
+  }
+
+  const int base_index =
+      pos.x + uBlock.out_extents.x * pos.y + (4 * uBlock.out_extents.w) * pos.z;
+  const ivec4 buf_indices =
+      base_index + ivec4(0, 1, 2, 3) * uBlock.out_extents.w;
+
+  int val_x = uBuffer.data[buf_indices.x];
+  int val_y = uBuffer.data[buf_indices.y];
+  int val_z = uBuffer.data[buf_indices.z];
+  int val_w = uBuffer.data[buf_indices.w];
+
+  imageStore(uImage, pos, ivec4(val_x, val_y, val_z, val_w));
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image_int8.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image_int8.glsl
new file mode 100644
index 0000000000000..4189ad219810c
--- /dev/null
+++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image_int8.glsl
@@ -0,0 +1,85 @@
+#version 450 core
+#define PRECISION $precision
+#define FORMAT $format
+
+layout(std430) buffer;
+
+/* Qualifiers: layout - storage - precision - memory */
+
+/*
+ * Output Image
+ */
+layout(set = 0, binding = 0, rgba8i) uniform PRECISION restrict writeonly iimage3D uImage;
+
+/*
+ * Input Buffer
+ */
+layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
+  int data[];
+}
+uBuffer;
+
+/*
+ * Params Buffer
+ */
+layout(set = 0, binding = 2) uniform PRECISION restrict Block {
+  // xyz contain the extents of the input texture, w contains HxW to help
+  // calculate buffer offsets
+  ivec4 out_extents;
+}
+uBlock;
+
+/*
+ * Local Work Group Size
+ */
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Extends sign of int8
+ */
+int extend_sign(int x) {
+  if (x >> 7 == 1) {
+    return x | 0xFFFFFF00;
+  }
+  return x;
+}
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, uBlock.out_extents.xyz))) {
+    return;
+  }
+
+  const int base_index =
+      pos.x + uBlock.out_extents.x * pos.y + (4 * uBlock.out_extents.w) * pos.z;
+  const ivec4 buf_indices =
+      base_index + ivec4(0, 1, 2, 3) * uBlock.out_extents.w;
+
+  int shift = (1 << 8) - 1;
+  ivec4 masks;
+  masks.x = shift << 8 * (buf_indices.x % 4);
+  masks.y = shift << 8 * (buf_indices.y % 4);
+  masks.z = shift << 8 * (buf_indices.z % 4);
+  masks.w = shift << 8 * (buf_indices.w % 4);
+
+  int buf_in_1 = uBuffer.data[buf_indices.x / 4];
+  int a_v = (buf_in_1 & masks.x) >> 8 * (buf_indices.x % 4);
+  a_v = extend_sign(a_v);
+
+  int buf_in_2 = uBuffer.data[buf_indices.y / 4];
+  int b_v = (buf_in_2 & masks.y) >> 8 * (buf_indices.y % 4);
+  b_v = extend_sign(b_v);
+
+  int buf_in_3 = uBuffer.data[buf_indices.z / 4];
+  int g_v = (buf_in_3 & masks.z) >> 8 * (buf_indices.z % 4);
+  g_v = extend_sign(g_v);
+
+  int buf_in_4 = uBuffer.data[buf_indices.w / 4];
+  int r_v = (buf_in_4 & masks.w) >> 8 * (buf_indices.w % 4);
+  r_v = extend_sign(r_v);
+
+  ivec4 texel = ivec4(a_v, b_v, g_v, r_v);
+
+  imageStore(uImage, pos, texel);
+}
diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image_quantized.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image_uint8.glsl
similarity index 98%
rename from aten/src/ATen/native/vulkan/glsl/nchw_to_image_quantized.glsl
rename to aten/src/ATen/native/vulkan/glsl/nchw_to_image_uint8.glsl
index cca8d88fcd7d5..68adb45fa37b1 100644
--- a/aten/src/ATen/native/vulkan/glsl/nchw_to_image_quantized.glsl
+++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image_uint8.glsl
@@ -7,12 +7,12 @@ layout(std430) buffer;
 /* Qualifiers: layout - storage - precision - memory */
 
 /*
- * Input Sampler
+ * Output Image
  */
 layout(set = 0, binding = 0, rgba8ui) uniform PRECISION restrict writeonly uimage3D uImage;
 
 /*
- * Output Buffer
+ * Input Buffer
  */
 layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
   uint data[];
diff --git a/aten/src/ATen/native/vulkan/ops/Utils.cpp b/aten/src/ATen/native/vulkan/ops/Utils.cpp
index d1d3c297c88dc..fed1fd482fd58 100644
--- a/aten/src/ATen/native/vulkan/ops/Utils.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Utils.cpp
@@ -20,7 +20,19 @@ static api::ShaderSource get_nchw_to_image_shader(const vTensor& v_dst) {
   if (v_dst.is_quantized()) {
     switch (v_dst.storage_type()) {
       case api::StorageType::TEXTURE_3D:
-        return VK_KERNEL(nchw_to_image_quantized);
+        switch (v_dst.dtype()) {
+          case c10::ScalarType::QUInt8:
+            return VK_KERNEL(nchw_to_image_uint8);
+          case c10::ScalarType::QInt8:
+            return VK_KERNEL(nchw_to_image_int8);
+          case c10::ScalarType::QInt32:
+            return VK_KERNEL(nchw_to_image_int32);
+          default:
+            TORCH_CHECK(
+                false,
+                "Vulkan quantization currently not supported for dtype ",
+                v_dst.dtype());
+        }
       default:
         TORCH_CHECK(false, "No kernel available!");
       case api::StorageType::BUFFER:
@@ -45,8 +57,21 @@ static api::ShaderSource get_image_to_nchw_shader(const vTensor& v_src) {
         get_dim<Dim4D::Height>(v_src) * get_dim<Dim4D::Width>(v_src);
     switch (v_src.storage_type()) {
       case api::StorageType::TEXTURE_3D:
-        return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4)
-                                   : VK_KERNEL(image_to_nchw_quantized);
+        switch (v_src.dtype()) {
+          case c10::ScalarType::QUInt8:
+            return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4)
+                                       : VK_KERNEL(image_to_nchw_quantized);
+          case c10::ScalarType::QInt8:
+            return plane_size % 4 == 0 ? VK_KERNEL(image_to_nchw_quantized_mul4)
+                                       : VK_KERNEL(image_to_nchw_quantized);
+          case c10::ScalarType::QInt32:
+            return VK_KERNEL(image_to_nchw_int32);
+          default:
+            TORCH_CHECK(
+                false,
+                "Vulkan quantization currently not supported for dtype ",
+                v_src.dtype());
+        }
       default:
         TORCH_CHECK(false, "No kernel available!");
       case api::StorageType::BUFFER:
@@ -134,7 +159,8 @@ void record_image_to_nchw_op(
       plane_size,
   };
 
-  if (v_src.is_quantized()) {
+  if (v_src.dtype() == c10::ScalarType::QUInt8 ||
+      v_src.dtype() == c10::ScalarType::QInt8) {
     if (plane_size % 4 == 0) {
       global_size.data[0u] = plane_size / 4;
       global_size.data[1u] = 1;
diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
index 2467ab21b7a9d..0541f08787bc9 100644
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@@ -223,10 +223,12 @@ class VulkanAPITest : public ::testing::Test {
 
 at::Tensor cpu_to_vulkan(at::Tensor in_cpu) {
   auto options = in_cpu.options();
-  if (options.dtype().toScalarType() == c10::ScalarType::QUInt8) {
+  if (options.dtype().toScalarType() == c10::ScalarType::QUInt8 ||
+      options.dtype().toScalarType() == c10::ScalarType::QInt8 ||
+      options.dtype().toScalarType() == c10::ScalarType::QInt32) {
     auto ret = at::native::vulkan::ops::_empty_affine_quantized(
         in_cpu.sizes(),
-        c10::ScalarType::QUInt8,
+        options.dtype().toScalarType(),
         options.layout(),
         options.device(),
         options.pinned_memory(),
@@ -244,7 +246,9 @@ at::Tensor cpu_to_vulkan(at::Tensor in_cpu) {
 
 at::Tensor vulkan_to_cpu(at::Tensor vulkan, at::Tensor in_cpu) {
   auto q_options = in_cpu.options();
-  if (q_options.dtype().toScalarType() == c10::ScalarType::QUInt8) {
+  if (q_options.dtype().toScalarType() == c10::ScalarType::QUInt8 ||
+      q_options.dtype().toScalarType() == c10::ScalarType::QInt8 ||
+      q_options.dtype().toScalarType() == c10::ScalarType::QInt32) {
     auto output = at::native::empty_affine_quantized(
         in_cpu.sizes(),
         q_options.dtype().toScalarType(),
@@ -383,12 +387,13 @@ TEST_F(VulkanAPITest, DISABLED_support_vulkan) {
 void test_cpu_to_vulkan_and_vulkan_to_cpu(
     const at::IntArrayRef input_shape,
     const double scale,
-    const int zero_point) {
+    const int zero_point,
+    const c10::ScalarType dtype = c10::ScalarType::QUInt8) {
 
   // produce random quantized cpu tensor
   auto in_cpu = produce_random_tensor(input_shape);
   auto in_q_cpu = at::quantize_per_tensor(
-      in_cpu, scale, zero_point, c10::ScalarType::QUInt8);
+      in_cpu, scale, zero_point, dtype);
 
   // copy quantized cpu tensor to vulkan
   auto in_q_cpu_vk = cpu_to_vulkan(in_q_cpu);
@@ -415,37 +420,215 @@ void test_cpu_to_vulkan_and_vulkan_to_cpu(
   ASSERT_TRUE(check);
 }
 
-void test_cpu_to_vulkan_and_vulkan_to_cpu_random() {
+void test_cpu_to_vulkan_and_vulkan_to_cpu_random(
+    const c10::ScalarType dtype) {
   const double scale = produce_random_scale();
-  const int64_t zero_point = produce_random_zero_point(c10::ScalarType::QUInt8);
+  const int64_t zero_point = produce_random_zero_point(dtype);
   const at::IntArrayRef tensor_shape =
     {rand_pos_int(30), rand_pos_int(30), rand_pos_int(100), rand_pos_int(100)};
-  test_cpu_to_vulkan_and_vulkan_to_cpu(tensor_shape, scale, zero_point);
+  test_cpu_to_vulkan_and_vulkan_to_cpu(
+    tensor_shape, scale, zero_point, dtype);
 }
 
 // TODO: Fix vulkan to cpu on Android
-TEST_F(VulkanAPITest, DISABLED_cpu_to_vulkan_and_vulkan_to_cpu) {
-  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, 21);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 4}, 0.3, 87);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 4, 1}, 0.2, 120);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 7, 7}, 0.3, 87);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 8, 8}, 0.1, 10);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 8, 8}, 0.04, 97);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 11, 17}, 0.07, 15);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1, 10);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 12, 17}, 0.1, 10);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 17, 12}, 0.1, 10);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({2, 4, 17, 12}, 0.1, 10);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, 43);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1, 19);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1, 19);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1, 19);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 25, 29}, 0.1, 19);
-  test_cpu_to_vulkan_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89);
+TEST_F(VulkanAPITest, DISABLED_cpu_to_vulkan_and_vulkan_to_cpu_quint8) {
+  const c10::ScalarType dtype = c10::ScalarType::QUInt8;
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, 21, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 4}, 0.3, 87, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 4, 1}, 0.2, 120, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 7, 7}, 0.3, 87, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 8, 8}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 8, 8}, 0.04, 97, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 11, 17}, 0.07, 15, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 12, 17}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 17, 12}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({2, 4, 17, 12}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, 43, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 25, 29}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89, dtype);
 
   for (int i = 0; i < 20; i += 1) {
-    test_cpu_to_vulkan_and_vulkan_to_cpu_random();
+    test_cpu_to_vulkan_and_vulkan_to_cpu_random(dtype);
+  }
+}
+
+// TODO: Fix vulkan to cpu on Android
+TEST_F(VulkanAPITest, DISABLED_cpu_to_vulkan_and_vulkan_to_cpu_qint8) {
+  const c10::ScalarType dtype = c10::ScalarType::QInt8;
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, -21, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 4}, 0.3, 87, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 4, 1}, 0.2, -120, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 7, 7}, 0.3, 87, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 8, 8}, 0.1, -10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 8, 8}, 0.04, 97, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 11, 17}, 0.07, -15, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 12, 17}, 0.1, -10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 17, 12}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({2, 4, 17, 12}, 0.1, -10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, -43, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1, -19, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 25, 29}, 0.1, -19, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89, dtype);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_cpu_to_vulkan_and_vulkan_to_cpu_random(dtype);
+  }
+}
+
+// TODO: Fix vulkan to cpu on Android
+TEST_F(VulkanAPITest, DISABLED_cpu_to_vulkan_and_vulkan_to_cpu_qint32) {
+  const c10::ScalarType dtype = c10::ScalarType::QInt32;
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, -21123, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 1, 4}, 0.339, 8734, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 4, 1}, 0.228, -12023, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 7, 7}, 0.338, 8723, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 8, 8}, 0.193, -1023, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 8, 8}, 0.0449, 972, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 11, 17}, 0.073, -15, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1572, 102, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 12, 17}, 0.147, -156, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 17, 12}, 0.129, 10448, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({2, 4, 17, 12}, 0.137, -10, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, -43267, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1243, 19, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1889, -19784, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1345, 196, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({4, 4, 25, 29}, 0.129, -19489, dtype);
+  test_cpu_to_vulkan_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89, dtype);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_cpu_to_vulkan_and_vulkan_to_cpu_random(dtype);
+  }
+}
+
+void test_cpu_to_vulkan_and_dequantize(
+    const at::IntArrayRef input_shape,
+    const double scale,
+    const int zero_point,
+    const c10::ScalarType dtype = c10::ScalarType::QUInt8) {
+
+  // produce random quantized cpu tensor
+  auto in_cpu = produce_random_tensor(input_shape);
+  auto in_q_cpu = at::quantize_per_tensor(
+      in_cpu, scale, zero_point, dtype);
+
+  // copy quantized cpu tensor to vulkan
+  auto in_q_cpu_vk = cpu_to_vulkan(in_q_cpu);
+
+  // dequantize tensors
+  const auto out_cpu_deq = at::dequantize(in_q_cpu);
+  const auto out_vk_deq = at::dequantize(in_q_cpu_vk);
+  const auto out_vk_deq_cpu = out_vk_deq.cpu();
+
+  // check dequantized tensors are equal
+  const auto check = almostEqual(out_cpu_deq, out_vk_deq_cpu);
+
+  if (!check) {
+    const auto error = at::abs(out_vk_deq_cpu - out_cpu_deq).max().item<float>();
+    std::cout
+      << "Copy cpu to vulkan and dequantize failed with input shape: "
+      << input_shape << " scale: " << scale << " and zero point: "
+      << zero_point << std::endl;
+    std::cout << "Error: " << error << std::endl;
+  }
+  ASSERT_TRUE(check);
+}
+
+void test_cpu_to_vulkan_and_dequantize_random(
+    const c10::ScalarType dtype) {
+  const double scale = produce_random_scale();
+  const int64_t zero_point = produce_random_zero_point(dtype);
+  const at::IntArrayRef tensor_shape =
+    {rand_pos_int(30), rand_pos_int(30), rand_pos_int(100), rand_pos_int(100)};
+  test_cpu_to_vulkan_and_dequantize(
+    tensor_shape, scale, zero_point, dtype);
+}
+
+TEST_F(VulkanAPITest, cpu_to_vulkan_and_dequantize_quint8) {
+  const c10::ScalarType dtype = c10::ScalarType::QUInt8;
+  test_cpu_to_vulkan_and_dequantize({1, 1, 1, 1}, 0.13, 21, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 1, 4}, 0.3, 87, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 4, 1}, 0.2, 120, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 7, 7}, 0.3, 87, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 8, 8}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 8, 8}, 0.04, 97, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 11, 17}, 0.07, 15, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 12, 17}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 12, 17}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 17, 12}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_dequantize({2, 4, 17, 12}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 10, 14}, 0.0001, 101, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 10, 14}, 0.009, 43, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 10, 15}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_dequantize({4, 4, 9, 17}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 25, 29}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_dequantize({4, 4, 25, 29}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_dequantize({11, 17, 25, 29}, 0.027, 89, dtype);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_cpu_to_vulkan_and_dequantize_random(dtype);
+  }
+}
+
+TEST_F(VulkanAPITest, cpu_to_vulkan_and_dequantize_qint8) {
+  const c10::ScalarType dtype = c10::ScalarType::QInt8;
+  test_cpu_to_vulkan_and_dequantize({1, 1, 1, 1}, 0.13, -21, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 1, 4}, 0.3, 87, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 4, 1}, 0.2, -120, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 7, 7}, 0.3, 87, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 8, 8}, 0.1, -10, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 8, 8}, 0.04, 97, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 11, 17}, 0.07, -15, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 12, 17}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 12, 17}, 0.1, -10, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 17, 12}, 0.1, 10, dtype);
+  test_cpu_to_vulkan_and_dequantize({2, 4, 17, 12}, 0.1, -10, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 10, 14}, 0.0001, 101, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 10, 14}, 0.009, -43, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 10, 15}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_dequantize({4, 4, 9, 17}, 0.1, -19, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 25, 29}, 0.1, 19, dtype);
+  test_cpu_to_vulkan_and_dequantize({4, 4, 25, 29}, 0.1, -19, dtype);
+  test_cpu_to_vulkan_and_dequantize({11, 17, 25, 29}, 0.027, 89, dtype);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_cpu_to_vulkan_and_dequantize_random(dtype);
+  }
+}
+
+TEST_F(VulkanAPITest, cpu_to_vulkan_and_dequantize_qint32) {
+  const c10::ScalarType dtype = c10::ScalarType::QInt32;
+  test_cpu_to_vulkan_and_dequantize({1, 1, 1, 1}, 0.13, -21123, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 1, 4}, 0.339, 8734, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 4, 1}, 0.228, -12023, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 7, 7}, 0.338, 8723, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 8, 8}, 0.193, -1023, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 8, 8}, 0.0449, 972, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 11, 17}, 0.073, -15, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 12, 17}, 0.1572, 102, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 12, 17}, 0.147, -156, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 17, 12}, 0.129, 10448, dtype);
+  test_cpu_to_vulkan_and_dequantize({2, 4, 17, 12}, 0.137, -10, dtype);
+  test_cpu_to_vulkan_and_dequantize({1, 1, 10, 14}, 0.0001, 101, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 10, 14}, 0.009, -43267, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 10, 15}, 0.1243, 19, dtype);
+  test_cpu_to_vulkan_and_dequantize({4, 4, 9, 17}, 0.1889, -19784, dtype);
+  test_cpu_to_vulkan_and_dequantize({3, 5, 25, 29}, 0.1345, 196, dtype);
+  test_cpu_to_vulkan_and_dequantize({4, 4, 25, 29}, 0.129, -19489, dtype);
+  test_cpu_to_vulkan_and_dequantize({11, 17, 25, 29}, 0.027, 89, dtype);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_cpu_to_vulkan_and_dequantize_random(dtype);
   }
 }
 
@@ -482,6 +665,7 @@ void test_quantize_per_tensor_and_vulkan_to_cpu(
     const at::IntArrayRef input_shape,
     const double input_scale,
     const int input_zero_point,
+    const c10::ScalarType dtype = c10::ScalarType::QUInt8,
     const int tolerance = 1) {
   // tolerance = 1, to allow for precision differences after dividing by random
   // scale which could result on a difference of 1 unit in the quantized result
@@ -490,10 +674,10 @@ void test_quantize_per_tensor_and_vulkan_to_cpu(
 
   // quantize tensor
   at::Tensor out_q_cpu = at::quantize_per_tensor(
-    input, input_scale, input_zero_point, c10::ScalarType::QUInt8);
+    input, input_scale, input_zero_point, dtype);
 
   at::Tensor out_q_vk = at::quantize_per_tensor(
-    input.vulkan(), input_scale, input_zero_point, c10::ScalarType::QUInt8);
+    input.vulkan(), input_scale, input_zero_point, dtype);
 
   // copy vulkan tensor to cpu
   at::Tensor out_q_vk_cpu = vulkan_to_cpu(out_q_vk, out_q_cpu);
@@ -516,38 +700,97 @@ void test_quantize_per_tensor_and_vulkan_to_cpu(
   ASSERT_TRUE(check);
 }
 
-void test_quantize_per_tensor_and_vulkan_to_cpu_random() {
+void test_quantize_per_tensor_and_vulkan_to_cpu_random(
+    const c10::ScalarType dtype) {
   const double scale = produce_random_scale();
-  const int64_t zero_point = produce_random_zero_point(c10::ScalarType::QUInt8);
+  const int64_t zero_point = produce_random_zero_point(dtype);
   const at::IntArrayRef tensor_shape =
     {rand_pos_int(30), rand_pos_int(30), rand_pos_int(100), rand_pos_int(100)};
-  test_quantize_per_tensor_and_vulkan_to_cpu(tensor_shape, scale, zero_point);
+  test_quantize_per_tensor_and_vulkan_to_cpu(
+    tensor_shape, scale, zero_point, dtype);
+}
+
+// TODO: Fix vulkan to cpu on Android
+TEST_F(VulkanAPITest, DISABLED_quantize_per_tensor_and_vulkan_to_cpu_quint8) {
+  const c10::ScalarType dtype = c10::ScalarType::QUInt8;
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, 21, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 4}, 0.3, 87, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 4, 1}, 0.2, 120, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 7, 7}, 0.3, 87, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 8, 8}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 8, 8}, 0.04, 97, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 11, 17}, 0.07, 15, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 12, 17}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 17, 12}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({2, 4, 17, 12}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, 43, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 25, 29}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 16, 77, 54}, 0.204173, 229, dtype);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_quantize_per_tensor_and_vulkan_to_cpu_random(dtype);
+  }
 }
 
 // TODO: Fix vulkan to cpu on Android
-TEST_F(VulkanAPITest, DISABLED_quantize_per_tensor_and_vulkan_to_cpu) {
-  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, 21);
-  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 4}, 0.3, 87);
-  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 4, 1}, 0.2, 120);
-  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 7, 7}, 0.3, 87);
-  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 8, 8}, 0.1, 10);
-  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 8, 8}, 0.04, 97);
-  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 11, 17}, 0.07, 15);
-  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1, 10);
-  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 12, 17}, 0.1, 10);
-  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 17, 12}, 0.1, 10);
-  test_quantize_per_tensor_and_vulkan_to_cpu({2, 4, 17, 12}, 0.1, 10);
-  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101);
-  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, 43);
-  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1, 19);
-  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1, 19);
-  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1, 19);
-  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 25, 29}, 0.1, 19);
-  test_quantize_per_tensor_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89);
-  test_quantize_per_tensor_and_vulkan_to_cpu({3, 16, 77, 54}, 0.204173, 229);
+TEST_F(VulkanAPITest, DISABLED_quantize_per_tensor_and_vulkan_to_cpu_qint8) {
+  const c10::ScalarType dtype = c10::ScalarType::QInt8;
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, -21, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 4}, 0.3, 87, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 4, 1}, 0.2, -120, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 7, 7}, 0.3, 87, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 8, 8}, 0.1, -10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 8, 8}, 0.04, 97, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 11, 17}, 0.07, -15, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 12, 17}, 0.1, -10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 17, 12}, 0.1, 10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({2, 4, 17, 12}, 0.1, -10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, -43, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1, -19, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1, 19, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 25, 29}, 0.1, -19, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 16, 77, 54}, 0.204173, 229, dtype);
+
+  for (int i = 0; i < 20; i += 1) {
+    test_quantize_per_tensor_and_vulkan_to_cpu_random(dtype);
+  }
+}
+
+// TODO: Fix vulkan to cpu on Android
+TEST_F(VulkanAPITest, DISABLED_quantize_per_tensor_and_vulkan_to_cpu_qint32) {
+  const c10::ScalarType dtype = c10::ScalarType::QInt32;
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 1}, 0.13, -21123, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 1, 4}, 0.339, 8734, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 4, 1}, 0.228, -12023, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 7, 7}, 0.338, 8723, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 8, 8}, 0.193, -1023, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 8, 8}, 0.0449, 972, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 11, 17}, 0.073, -15, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 12, 17}, 0.1572, 102, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 12, 17}, 0.147, -156, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 17, 12}, 0.129, 10448, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({2, 4, 17, 12}, 0.137, -10, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({1, 1, 10, 14}, 0.0001, 101, dtype, 1);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 14}, 0.009, -43267, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 10, 15}, 0.1243, 19, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 9, 17}, 0.1889, -19784, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 5, 25, 29}, 0.1345, 196, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({4, 4, 25, 29}, 0.129, -19489, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({11, 17, 25, 29}, 0.027, 89, dtype);
+  test_quantize_per_tensor_and_vulkan_to_cpu({3, 16, 77, 54}, 0.204173, 229, dtype);
 
   for (int i = 0; i < 20; i += 1) {
-    test_quantize_per_tensor_and_vulkan_to_cpu_random();
+    test_quantize_per_tensor_and_vulkan_to_cpu_random(dtype);
   }
 }
 

From 9114478575d8a1f6ce42a9e9fa1d4b3e3da53bfc Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 7 Dec 2022 15:39:24 +0000
Subject: [PATCH 1681/1922] [FSDP][Easy] ufmt files (#90384)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90384
Approved by: https://github.com/H-Huang
---
 test/distributed/fsdp/test_fsdp_fx.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/distributed/fsdp/test_fsdp_fx.py b/test/distributed/fsdp/test_fsdp_fx.py
index f8f5c1800ed6d..43f8de2150f92 100644
--- a/test/distributed/fsdp/test_fsdp_fx.py
+++ b/test/distributed/fsdp/test_fsdp_fx.py
@@ -4,8 +4,8 @@
 from torch.distributed.fsdp._trace_utils import _ExecOrderTracer
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
-    TestCase,
     run_tests,
+    TestCase,
 )
 
 
From 412e6e5e1c6cd1e4b75f98f4e670adac4a75236f Mon Sep 17 00:00:00 2001
From: Facebook Community Bot <facebook-github-bot@users.noreply.github.com>
Date: Wed, 7 Dec 2022 22:36:31 +0000
Subject: [PATCH 1682/1922] Automated submodule update: FBGEMM (#74729)

This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM).

New submodule commit: https://github.com/pytorch/FBGEMM/commit/f99e1616630e3b78e3bccd0ceb9e50e8e82409f1

Test Plan: Ensure that CI jobs succeed on GitHub before landing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/74729
Approved by: https://github.com/malfet
---
 third_party/fbgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/fbgemm b/third_party/fbgemm
index 4d1738b3142a6..908a8f361ac5c 160000
--- a/third_party/fbgemm
+++ b/third_party/fbgemm
@@ -1 +1 @@
-Subproject commit 4d1738b3142a6cb0c032cd639e239566010b054a
+Subproject commit 908a8f361ac5c6103e55fbbb38ef8110457ff6eb

From ca1c020f585e137983c5a2672f4c508231db545d Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Wed, 7 Dec 2022 23:06:51 +0000
Subject: [PATCH 1683/1922] Add manual cuda deps search logic (#90411)

If PyTorch is package into a wheel with [nvidia-cublas-cu11](https://pypi.org/project/nvidia-cublas-cu11/), which is designated as PureLib, but `torch` wheel is not, can cause a torch_globals loading problem.

Fix that by searching for `nvidia/cublas/lib/libcublas.so.11` an `nvidia/cudnn/lib/libcudnn.so.8` across all `sys.path` folders.

Test plan:
```
docker pull amazonlinux:2
docker run --rm -t amazonlinux:2 bash -c 'yum install -y python3 python3-devel python3-distutils patch;python3 -m pip install torch==1.13.0;curl -OL https://patch-diff.githubusercontent.com/raw/pytorch/pytorch/pull/90411.diff; pushd /usr/local/lib64/python3.7/site-packages; patch -p1 </90411.diff; popd; python3 -c "import torch;print(torch.__version__, torch.cuda.is_available())"'
```

Fixes https://github.com/pytorch/pytorch/issues/88869

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90411
Approved by: https://github.com/atalman
---
 torch/__init__.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/torch/__init__.py b/torch/__init__.py
index c45e2d8f0de33..bd0bfa59d5919 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -142,6 +142,24 @@
     kernel32.SetErrorMode(prev_error_mode)
 
 
+def _preload_cuda_deps():
+    """ Preloads cudnn/cublas deps if they could not be found otherwise """
+    # Should only be called on Linux if default path resolution have failed
+    assert platform.system() == 'Linux', 'Should only be called on Linux'
+    for path in sys.path:
+        nvidia_path = os.path.join(path, 'nvidia')
+        if not os.path.exists(nvidia_path):
+            continue
+        cublas_path = os.path.join(nvidia_path, 'cublas', 'lib', 'libcublas.so.11')
+        cudnn_path = os.path.join(nvidia_path, 'cudnn', 'lib', 'libcudnn.so.8')
+        if not os.path.exists(cublas_path) or not os.path.exists(cudnn_path):
+            continue
+        break
+
+    ctypes.CDLL(cublas_path)
+    ctypes.CDLL(cudnn_path)
+
+
 # See Note [Global dependencies]
 def _load_global_deps():
     if platform.system() == 'Windows' or sys.executable == 'torch_deploy':
@@ -151,7 +169,15 @@ def _load_global_deps():
     here = os.path.abspath(__file__)
     lib_path = os.path.join(os.path.dirname(here), 'lib', lib_name)
 
-    ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
+    try:
+        ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
+    except OSError as err:
+        # Can only happen of wheel with cublas as PYPI deps
+        # As PyTorch is not purelib, but nvidia-cublas-cu11 is
+        if 'libcublas.so.11' not in err.args[0]:
+            raise err
+        _preload_cuda_deps()
+        ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
 
 
 if (USE_RTLD_GLOBAL_WITH_LIBTORCH or os.getenv('TORCH_USE_RTLD_GLOBAL')) and \

From f0ded453488334830ebff058edcae47e82ea3bd7 Mon Sep 17 00:00:00 2001
From: Manuel Candales <mcandales@meta.com>
Date: Wed, 7 Dec 2022 23:21:57 +0000
Subject: [PATCH 1684/1922] [Vulkan][TCC] Add tests for quantized convolution
 with QUInt8 activation, weights and bias (#90012)

Summary:
- Registered vulkan_prepack::create_qconv2d_context to the QuantizedCPU backend.
- Registered vulkan_prepack::run_qconv2d_context to the Vulkan backend.
- Added function test_quantized_conv2d, in order to test Vulkan Quantized Conv2d with QUInt8 activation, weight and bias (all QUInt8).
- Added multiples tests for vulkan quantized conv2d (regular, depthwise and pointwise). All these tests make use of the test_quantized_conv2d function.

This function tests the correctness of vulkan quantized conv2d, by comparing the following two processes:
(we start with randomly generated float cpu tensors)
- random float cpu tensors -> to vulkan -> quantize them -> apply vulkan conv2d quantized op -> dequantize result -> to cpu
- random float cpu tensors -> quantize them -> dequantize -> apply cpu floating point conv2d op on dequantized tensors -> quantize result -> dequantize

This function takes three boolean flags that modify its behavior:
- prepacking:
  - if false, then we directly call at::native::vulkan::ops::quantized_conv2d
  - if true, then we call vulkan_prepack::create_qconv2d_context and vulkan_prepack::run_qconv2d_context.
- compute_quantization_params & random_quantization_params:
  - if both are false, all quantization params are fixed (given as input)
  - if compute_quantization_params is true, all params are computed
  - if random_quantization_params is true, the input params are random and the output params are computed.
(compute_quantization_params takes precedence over random_quantization_params)

Test Plan:
On Mac
```
cd ~/fbsource
buck1 run -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64
```

On Android
```
cd ~/fbsource
buck1 build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 -c pt.vulkan_full_precision=1 //xplat/caffe2:pt_vulkan_quantized_api_test_binAndroid\#android-arm64 --show-output
adb push buck-out/gen/xplat/caffe2/pt_vulkan_quantized_api_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_quantized_api_test
adb shell "/data/local/tmp/vulkan_quantized_api_test"
```

Reviewed By: SS-JIA

Differential Revision: D41047096

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90012
Approved by: https://github.com/salilsdesai
---
 .../ATen/native/vulkan/ops/Convolution.cpp    |   2 +-
 aten/src/ATen/native/vulkan/ops/Register.cpp  |  17 +
 .../ATen/test/vulkan_quantized_api_test.cpp   | 478 +++++++++++++++++-
 3 files changed, 486 insertions(+), 11 deletions(-)

diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
index 05557a62dda8c..9ab19a6e9b0f3 100644
--- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp
@@ -992,7 +992,7 @@ c10::intrusive_ptr<Conv2dPackedContext> create_qconv2d_context(
       dilation,
       /* transposed = */ false,
       /* quantized = */ true,
-      /* output_padding_arg = */ {},
+      /* output_padding_arg = */ {0},
       groups,
       output_min,
       output_max));
diff --git a/aten/src/ATen/native/vulkan/ops/Register.cpp b/aten/src/ATen/native/vulkan/ops/Register.cpp
index 25f0a6d99ec78..f9f0c2ad6aff3 100644
--- a/aten/src/ATen/native/vulkan/ops/Register.cpp
+++ b/aten/src/ATen/native/vulkan/ops/Register.cpp
@@ -128,6 +128,14 @@ TORCH_LIBRARY(vulkan_prepack, m) {
   m.def(TORCH_SELECTIVE_SCHEMA(
       "vulkan_prepack::run_tconv2d_context(Tensor X, "
       "__torch__.torch.classes.vulkan.Conv2dPackedContext W_prepack) -> Tensor Y"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "vulkan_prepack::create_qconv2d_context(Tensor W, Tensor? B, "
+      "int[2] stride, int[2] padding, int[2] dilation, int groups, "
+      "Scalar? output_min=None, Scalar? output_max=None) "
+      "-> __torch__.torch.classes.vulkan.Conv2dPackedContext"));
+  m.def(TORCH_SELECTIVE_SCHEMA(
+      "vulkan_prepack::run_qconv2d_context(Tensor X, float scale, int zero_point, "
+      "__torch__.torch.classes.vulkan.Conv2dPackedContext vk_context) -> Tensor Y"));
   m.def(TORCH_SELECTIVE_SCHEMA(
       "vulkan_prepack::create_linear_context(Tensor W, Tensor? B) "
       "-> __torch__.torch.classes.vulkan.LinearPackedContext"));
@@ -203,6 +211,12 @@ TORCH_LIBRARY_IMPL(vulkan_prepack, CPU, m) {
       TORCH_FN(create_batchnorm_context));
 }
 
+TORCH_LIBRARY_IMPL(vulkan_prepack, QuantizedCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("vulkan_prepack::create_qconv2d_context"),
+      TORCH_FN(create_qconv2d_context));
+}
+
 TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("vulkan_prepack::run_conv2d_context"),
@@ -213,6 +227,9 @@ TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("vulkan_prepack::run_tconv2d_context"),
       TORCH_FN(run_tconv2d_context));
+  m.impl(
+      TORCH_SELECTIVE_NAME("vulkan_prepack::run_qconv2d_context"),
+      TORCH_FN(run_qconv2d_context));
   m.impl(
       TORCH_SELECTIVE_NAME("vulkan_prepack::run_linear_context"),
       TORCH_FN(run_linear_context));
diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
index 0541f08787bc9..89c205ef3d7d4 100644
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@@ -116,7 +116,8 @@ double rand01() {
 }
 
 int64_t rand_pos_int(const int max_val) {
-  return 1 + int64_t(rand01() * (max_val - 1));
+  TORCH_CHECK(max_val > 0, "max value must be positive");
+  return 1 + rand() % max_val;
 }
 
 at::Tensor produce_random_tensor(
@@ -149,13 +150,13 @@ int64_t produce_random_zero_point(const c10::ScalarType dtype) {
   int64_t zero_point;
   switch (dtype) {
     case c10::ScalarType::QUInt8:
-      zero_point = int64_t(rand01() * 255);
+      zero_point = rand() % 256;
       break;
     case c10::ScalarType::QInt8:
-      zero_point = int64_t(rand01() * 255) - 127;
+      zero_point = rand() % 256 - 128;
       break;
     case c10::ScalarType::QInt32:
-      zero_point = int64_t(rand01() * 100000) - 200000;
+      zero_point = rand() % 100000 - 200000;
       break;
     default:
       TORCH_CHECK(
@@ -1784,12 +1785,6 @@ std::tuple<double, double, int64_t, int64_t> produce_inputs_for_binary_op(
 
     // we do this, to avoid dividing by zero
     if (strcmp(op_name, "quantized::div") == 0) {
-      const auto non_zero_sign = input2_cpu.sign() - input2_cpu.sign().abs() + 1;
-        // non_zero_sign = 1 if the value is non negative, and -1 if it is negative
-      input2_cpu = input2_cpu + in2_scale * non_zero_sign;
-        // this will force abs(input2_cpu) >= in2_scale, which means that none of
-        // the quantized values of the second input will be equal to the zero point.
-
       // we might end up dividing by 0, if we allow random scale and zero point
       // of the divisor.
       if (random_quantization_params) {
@@ -1797,6 +1792,12 @@ std::tuple<double, double, int64_t, int64_t> produce_inputs_for_binary_op(
         in2_scale = std::get<0>(in2_quant_params);
         in2_zero_point = std::get<1>(in2_quant_params);
       }
+
+      const auto non_zero_sign = input2_cpu.sign() - input2_cpu.sign().abs() + 1;
+        // non_zero_sign = 1 if the value is non negative, and -1 if it is negative
+      input2_cpu = input2_cpu + in2_scale * non_zero_sign;
+        // this will force abs(input2_cpu) >= in2_scale, which means that none of
+        // the quantized values of the second input will be equal to the zero point.
     }
 
     // quantize cpu inputs
@@ -2008,6 +2009,463 @@ TEST_F(VulkanAPITest, quantized_div_tests) {
   quantized_binary_op_test_set("quantized::div");
 }
 
+void test_quantized_conv2d(
+    const bool prepacking,
+    const bool compute_quantization_params,
+    const bool random_quantization_params,
+    const at::IntArrayRef input_shape,
+    const at::IntArrayRef weight_shape,
+    const at::IntArrayRef bias_shape,
+    std::vector<int64_t> stride,
+    std::vector<int64_t> padding,
+    std::vector<int64_t> dilation,
+    int64_t groups,
+    double in_scale = 0.13,
+    double w_scale = 0.29,
+    double b_scale = 0.19,
+    double out_scale = 0.15,
+    int64_t in_zero_point = 11,
+    int64_t w_zero_point = 19,
+    int64_t b_zero_point = 27,
+    int64_t out_zero_point = 10) {
+  c10::InferenceMode mode;
+
+  // input cpu
+  at::Tensor input_cpu;         // input cpu tensor
+  at::Tensor input_cpu_q;       // input cpu tensor -> quantized
+  at::Tensor input_cpu_deq;     // input cpu tensor -> quantized -> dequantized
+
+  // input vulkan
+  at::Tensor input_vk;          // input cpu tensor -> to vulkan
+  at::Tensor input_vk_q;        // input cpu tensor -> to vulkan -> quantized
+  at::Tensor input_vk_deq;      // input cpu tensor -> to vulkan -> quantized -> dequantized
+  at::Tensor input_vk_deq_cpu;  // input cpu tensor -> to vulkan -> quantized -> dequantized -> to cpu
+
+  // weight cpu
+  at::Tensor weight_cpu;        // weight cpu tensor
+  at::Tensor weight_cpu_q;      // weight cpu tensor -> quantized
+  at::Tensor weight_cpu_deq;    // weight cpu tensor -> quantized -> dequantized
+
+  // bias cpu
+  at::Tensor bias_cpu;          // bias cpu tensor
+  at::Tensor bias_cpu_q;        // bias cpu tensor -> quantized
+  at::Tensor bias_cpu_deq;      // bias cpu tensor -> quantized -> dequantized
+
+  // When we randomly generate the input tensor, we might get unlucky
+  // and one of the entries might be generated such that when it is divided
+  // by the scale we get something like 2.50003 for example which could be
+  // rounded to 2 or 3 depending on the precision and rounding method.
+  // Because of that possibility, we generate the input and check the
+  // difference between input_cpu_deq and input_vk_deq_cpu
+  // If they are different we regenerated them again (up to 3 times)
+  // The goal is to start with input tensors that remain equal after quantization.
+  int num_attempts = 5;
+  for (int i = 0; i < num_attempts; i += 1) {
+    // produce random input, weight and bias
+    input_cpu = produce_random_tensor(input_shape, 1.26, 5.97, 0.59);
+    weight_cpu = produce_random_tensor(weight_shape, 1.26, 5.97, 0.59);
+    bias_cpu = produce_random_tensor(bias_shape, 1.26, 5.97, 0.59);
+
+    if (compute_quantization_params) {
+      // compute appropiate scale and zero point for input, weight and bias
+      const auto in_quant_params = compute_quant_params(input_cpu);
+      in_scale = std::get<0>(in_quant_params);
+      in_zero_point = std::get<1>(in_quant_params);
+
+      const auto w_quant_params = compute_quant_params(weight_cpu);
+      w_scale = std::get<0>(w_quant_params);
+      w_zero_point = std::get<1>(w_quant_params);
+
+      const auto input_max = input_cpu.max().item<float>();
+      const auto input_min = input_cpu.min().item<float>();
+      const auto input_range = input_max - input_min;
+
+      bias_cpu = input_range * at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat)) + input_min;
+      b_scale = in_scale;
+      b_zero_point = in_zero_point;
+    }
+    else if (random_quantization_params) {
+      // produce random scale and zero point for inputs
+      in_scale = produce_random_scale();
+      in_zero_point = produce_random_zero_point(c10::ScalarType::QUInt8);
+
+      w_scale = produce_random_scale();
+      w_zero_point = produce_random_zero_point(c10::ScalarType::QUInt8);
+
+      b_scale = produce_random_scale();
+      b_zero_point = produce_random_zero_point(c10::ScalarType::QUInt8);
+    }
+
+    // quantize cpu input, weight and bias
+    input_cpu_q = at::quantize_per_tensor(
+        input_cpu, in_scale, in_zero_point, c10::ScalarType::QUInt8);
+    weight_cpu_q = at::quantize_per_tensor(
+        weight_cpu, w_scale, w_zero_point, c10::ScalarType::QUInt8);
+    bias_cpu_q = at::quantize_per_tensor(
+        bias_cpu, b_scale, b_zero_point, c10::ScalarType::QUInt8);
+
+    // dequantize quantized cpu input, weight and bias
+    input_cpu_deq = at::dequantize(input_cpu_q);
+    weight_cpu_deq = at::dequantize(weight_cpu_q);
+    bias_cpu_deq = at::dequantize(bias_cpu_q);
+
+    // vulkan quantized input
+    input_vk = input_cpu.vulkan();
+    input_vk_q = at::quantize_per_tensor(
+        input_vk, in_scale, in_zero_point, c10::ScalarType::QUInt8);
+
+    // dequantize quantized vulkan input
+    input_vk_deq = at::dequantize(input_vk_q);
+    input_vk_deq_cpu = input_vk_deq.cpu();
+
+    const float input_dif = at::abs(input_cpu_deq - input_vk_deq_cpu).max().item<float>();
+
+    if (input_dif < 1e-5 && input_dif < in_scale/2) {
+      break;
+    } else {
+      std::cout << "input_dif too big: " << input_dif;
+      if (i + 1 < num_attempts) {
+        std::cout << ". generating input again ..." << std::endl;
+      } else {
+        std::cout << std::endl;
+      }
+    }
+  }
+
+  // conv2d on dequantized cpu tensors
+  // Note: we apply the convolutio to the dequantized quantized tensors, that way
+  // we are performing the operations on the same numeric values.
+  const auto output_cpu = at::conv2d(
+      input_cpu_deq, weight_cpu_deq, bias_cpu_deq, stride, padding, dilation, groups);
+
+  if (compute_quantization_params || random_quantization_params) {
+    // compute appropiate scale and zero point for output
+    const auto out_quant_params = compute_quant_params(output_cpu);
+    out_scale = std::get<0>(out_quant_params);
+    out_zero_point = std::get<1>(out_quant_params);
+  }
+
+  // quantize and dequantize cpu output
+  at::Tensor output_cpu_q = at::quantize_per_tensor(
+      output_cpu, out_scale, out_zero_point, c10::ScalarType::QUInt8);
+  at::Tensor output_cpu_deq = at::dequantize(output_cpu_q);
+
+  // vulkan quantized output
+  at::Tensor output_vk_q;
+
+  if (!prepacking) {
+    // vulkan quantized conv2d
+    output_vk_q = at::native::vulkan::ops::quantized_conv2d(
+        input_vk_q, weight_cpu_q, bias_cpu_q,
+        stride, padding, dilation, groups,
+        out_scale, out_zero_point);
+  } else {
+    // vulkan quantized conv2d call by name
+    const auto prepack_vulkan_call_by_name = callOpByName(
+        "vulkan_prepack::create_qconv2d_context",
+        "",
+        weight_cpu_q, bias_cpu_q, stride, padding, dilation, groups, c10::nullopt, c10::nullopt);
+    const auto vulkan_output = callOpByName(
+        "vulkan_prepack::run_qconv2d_context",
+        "",
+        input_vk_q, out_scale, out_zero_point, prepack_vulkan_call_by_name[0]);
+    output_vk_q = vulkan_output[0].toTensor();
+  }
+
+  // dequantize vulkan output
+  const auto output_vk_deq = at::dequantize(output_vk_q);
+  const auto output_vk_deq_cpu = output_vk_deq.cpu();
+
+  // check
+  const float tolerance = out_scale;
+  const auto check = almostEqual(output_cpu_deq, output_vk_deq_cpu, tolerance);
+
+  if (!check) {
+    const auto vk_q_error = at::abs(output_vk_deq_cpu - output_cpu_deq).max().item<float>();
+    std::cout << "Quantized Conv2d failed with: " << std::endl;
+    std::cout << "input: shape " << input_shape << " scale " << in_scale
+              << " and zero point " << in_zero_point << std::endl;
+    std::cout << "weight: shape " << weight_shape << " scale " << w_scale
+              << " and zero point " << w_zero_point << std::endl;
+    std::cout << "bias: shape " << bias_shape << " scale " << b_scale
+              << " and zero point " << b_zero_point << std::endl;
+    std::cout << "output scale " << out_scale
+              << " and zero point " << out_zero_point << std::endl;
+    std::cout << "error: " << vk_q_error << std::endl;
+  }
+  ASSERT_TRUE(check);
+}
+
+TEST_F(VulkanAPITest, conv2d_quantized_fixed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   false,
+    /* compute params */false,
+    /* random params */ false,
+    /* input_shape */   {1, 3, 8, 8},
+    /* weight_shape */  {1, 3, 3, 3},
+    /* bias_shape */    {1},
+    /* stride */        {2, 2},
+    /* padding */       {1, 1},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_quantized_computed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   false,
+    /* compute params */true,
+    /* random params */ false,
+    /* input_shape */   {1, 3, 8, 8},
+    /* weight_shape */  {1, 3, 3, 3},
+    /* bias_shape */    {1},
+    /* stride */        {2, 2},
+    /* padding */       {1, 1},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_quantized_random_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   false,
+    /* compute params */false,
+    /* random params */ true,
+    /* input_shape */   {1, 3, 8, 8},
+    /* weight_shape */  {1, 3, 3, 3},
+    /* bias_shape */    {1},
+    /* stride */        {2, 2},
+    /* padding */       {1, 1},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_quantized_prepack_fixed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   true,
+    /* compute params */false,
+    /* random params */ false,
+    /* input_shape */   {1, 3, 8, 8},
+    /* weight_shape */  {1, 3, 3, 3},
+    /* bias_shape */    {1},
+    /* stride */        {2, 2},
+    /* padding */       {1, 1},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_quantized_prepack_computed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   true,
+    /* compute params */true,
+    /* random params */ false,
+    /* input_shape */   {1, 3, 8, 8},
+    /* weight_shape */  {1, 3, 3, 3},
+    /* bias_shape */    {1},
+    /* stride */        {2, 2},
+    /* padding */       {1, 1},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_quantized_prepack_random_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   true,
+    /* compute params */false,
+    /* random params */ true,
+    /* input_shape */   {1, 3, 8, 8},
+    /* weight_shape */  {1, 3, 3, 3},
+    /* bias_shape */    {1},
+    /* stride */        {2, 2},
+    /* padding */       {1, 1},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_dw_quantized_fixed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   false,
+    /* compute params */false,
+    /* random params */ false,
+    /* input_shape */   {1, 7, 137, 199},
+    /* weight_shape */  {7, 1, 17, 7},
+    /* bias_shape */    {7},
+    /* stride */        {2, 3},
+    /* padding */       {0, 4},
+    /* dilation */      {3, 1},
+    /* groups */        7
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_dw_quantized_computed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   false,
+    /* compute params */true,
+    /* random params */ false,
+    /* input_shape */   {1, 7, 137, 199},
+    /* weight_shape */  {7, 1, 17, 7},
+    /* bias_shape */    {7},
+    /* stride */        {2, 3},
+    /* padding */       {0, 4},
+    /* dilation */      {3, 1},
+    /* groups */        7
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_dw_quantized_random_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   false,
+    /* compute params */false,
+    /* random params */ true,
+    /* input_shape */   {1, 7, 137, 199},
+    /* weight_shape */  {7, 1, 17, 7},
+    /* bias_shape */    {7},
+    /* stride */        {2, 3},
+    /* padding */       {0, 4},
+    /* dilation */      {3, 1},
+    /* groups */        7
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_dw_quantized_prepack_fixed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   true,
+    /* compute params */false,
+    /* random params */ false,
+    /* input_shape */   {1, 7, 137, 199},
+    /* weight_shape */  {7, 1, 17, 7},
+    /* bias_shape */    {7},
+    /* stride */        {2, 3},
+    /* padding */       {0, 4},
+    /* dilation */      {3, 1},
+    /* groups */        7
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_dw_quantized_prepack_computed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   true,
+    /* compute params */true,
+    /* random params */ false,
+    /* input_shape */   {1, 7, 137, 199},
+    /* weight_shape */  {7, 1, 17, 7},
+    /* bias_shape */    {7},
+    /* stride */        {2, 3},
+    /* padding */       {0, 4},
+    /* dilation */      {3, 1},
+    /* groups */        7
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_dw_quantized_prepack_random_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   true,
+    /* compute params */false,
+    /* random params */ true,
+    /* input_shape */   {1, 7, 137, 199},
+    /* weight_shape */  {7, 1, 17, 7},
+    /* bias_shape */    {7},
+    /* stride */        {2, 3},
+    /* padding */       {0, 4},
+    /* dilation */      {3, 1},
+    /* groups */        7
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_pw_quantized_fixed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   false,
+    /* compute params */false,
+    /* random params */ false,
+    /* input_shape */   {1, 17, 127, 397},
+    /* weight_shape */  {29, 17, 1, 1},
+    /* bias_shape */    {29},
+    /* stride */        {1, 1},
+    /* padding */       {0, 0},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_pw_quantized_computed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   false,
+    /* compute params */true,
+    /* random params */ false,
+    /* input_shape */   {1, 17, 127, 397},
+    /* weight_shape */  {29, 17, 1, 1},
+    /* bias_shape */    {29},
+    /* stride */        {1, 1},
+    /* padding */       {0, 0},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_pw_quantized_random_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   false,
+    /* compute params */false,
+    /* random params */ true,
+    /* input_shape */   {1, 17, 127, 397},
+    /* weight_shape */  {29, 17, 1, 1},
+    /* bias_shape */    {29},
+    /* stride */        {1, 1},
+    /* padding */       {0, 0},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_pw_quantized_prepack_fixed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   true,
+    /* compute params */false,
+    /* random params */ false,
+    /* input_shape */   {1, 17, 127, 397},
+    /* weight_shape */  {29, 17, 1, 1},
+    /* bias_shape */    {29},
+    /* stride */        {1, 1},
+    /* padding */       {0, 0},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_pw_quantized_prepack_computed_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   true,
+    /* compute params */true,
+    /* random params */ false,
+    /* input_shape */   {1, 17, 127, 397},
+    /* weight_shape */  {29, 17, 1, 1},
+    /* bias_shape */    {29},
+    /* stride */        {1, 1},
+    /* padding */       {0, 0},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
+TEST_F(VulkanAPITest, conv2d_pw_quantized_prepack_random_params) {
+  test_quantized_conv2d(
+    /* prepacking? */   true,
+    /* compute params */false,
+    /* random params */ true,
+    /* input_shape */   {1, 17, 127, 397},
+    /* weight_shape */  {29, 17, 1, 1},
+    /* bias_shape */    {29},
+    /* stride */        {1, 1},
+    /* padding */       {0, 0},
+    /* dilation */      {1, 1},
+    /* groups */        1
+  );
+}
+
 } // namespace
 
 #endif /* USE_VULKAN_API */

From ab4d2a611992ca03763588901971035f954ef8df Mon Sep 17 00:00:00 2001
From: Xiaodong Wang <xdwang@meta.com>
Date: Wed, 7 Dec 2022 23:36:41 +0000
Subject: [PATCH 1685/1922] Fix C2 Ambiguous namespace (#89534)

Summary: cuda:: is a ambiguous namespace. Make it explicit c10::cuda

Differential Revision: D41469007
/caffe2/caffe2/core/context_gpu.cu(564): error: "caffe2::cuda" is ambiguous/caffe2/caffe2/core/context_gpu.cu(564): error: expected a ";"/caffe2/caffe2/core/context_gpu.cu(568): warning #12-D: parsing restarts here after previous syntax error
Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"/caffe2/caffe2/core/context_gpu.cu(569): error: "caffe2::cuda" is ambiguous/caffe2/caffe2/core/context_gpu.cu(628): error: "caffe2::cuda" is ambiguous
4 errors detected in the compilation of "/caffe2/caffe2/core/context_gpu.cu".

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89534
Approved by: https://github.com/malfet
---
 caffe2/core/context_gpu.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/caffe2/core/context_gpu.cu b/caffe2/core/context_gpu.cu
index 83b8a049b8721..3359e88bcba20 100644
--- a/caffe2/core/context_gpu.cu
+++ b/caffe2/core/context_gpu.cu
@@ -561,12 +561,12 @@ struct DefaultCUDAAllocator final : public at::Allocator {
           // some models that are currently running with the thc
           // allocator fit in memory.  We will need to find some
           // way of resolving this problem.
-          cuda::CUDAStreamGuard g(
+          c10::cuda::CUDAStreamGuard g(
             Stream(
               Stream::DEFAULT,
               Device(kCUDA, CaffeCudaGetDevice())
             ));
-          ptr = cuda::CUDACachingAllocator::raw_alloc(nbytes);
+          ptr = c10::cuda::CUDACachingAllocator::raw_alloc(nbytes);
         }
         if (FLAGS_caffe2_gpu_memory_tracking) {
           g_size_map[ptr] = nbytes;
@@ -625,7 +625,7 @@ struct DefaultCUDAAllocator final : public at::Allocator {
         break;
       }
       case CudaMemoryPoolType::THC: {
-        cuda::CUDACachingAllocator::raw_delete(ptr);
+        c10::cuda::CUDACachingAllocator::raw_delete(ptr);
         if (FLAGS_caffe2_gpu_memory_tracking) {
           g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
         }

From 01c46983084de3376dc23eb5de0f808c8c07c679 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein <ngimel@fb.com>
Date: Wed, 7 Dec 2022 23:38:33 +0000
Subject: [PATCH 1686/1922] correctly set strides for expanded/unsqueezed
 dimensions (#90341)

Fixes https://github.com/pytorch/torchdynamo/issues/1959, #90260
However, I wasn't able to make existing stride tests fail before the fix, even though I'm comparing all, not just significant strides.
Separately running refs on meta tensors produces wrong strides as shown in #90260, however, it looks like in meta tests some other way of computing meta info is used (I've been running
```
pytest -s -v test/test_meta.py -k test_meta_outplace_expand_cuda_float64
```
and verified that it has sample input that should fail, and that it indeed compares all the strides, but the produced `meta_rs` results somehow still had correct strides).

Edit: @SherlockNoMad helped me figure out how to fail the tests, and now I've set the correct ops for checking. `expand` fails for some test inputs because it special-cases 0-dim input case, correctly modeling it in prims would require a lot of changes, so skipping that for now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90341
Approved by: https://github.com/SherlockNoMad
---
 test/test_meta.py               | 28 +++++++++++++++++++++-------
 torch/_prims/__init__.py        |  7 ++++++-
 torch/_prims_common/__init__.py | 17 ++++++++++++++---
 3 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/test/test_meta.py b/test/test_meta.py
index 6e10917e05e41..b427e75a0c4ff 100644
--- a/test/test_meta.py
+++ b/test/test_meta.py
@@ -288,6 +288,10 @@ def test_tensor_outlives_converter(self):
     torch.Tensor.__getitem__,
 }
 
+CHECK_ALL_STRIDES = {
+    aten.unsqueeze.default
+}
+
 CHECK_STRIDES_SKIPS = {
     aten._conj_physical.default,
     aten._fft_c2c.default,
@@ -319,22 +323,29 @@ def test_tensor_outlives_converter(self):
     # aten.view.default,  # repro with test_dispatch_symbolic_meta_outplace_all_strides_unflatten_cuda_float32
 }
 
+class CheckStrides(Enum):
+    NONE = 0
+    SIGNIFICANT = 1
+    ALL = 2
+
 def should_check_strides(func):
+    if func in CHECK_ALL_STRIDES:
+        return CheckStrides.ALL
     if func in CHECK_STRIDES:
-        return True
+        return CheckStrides.SIGNIFICANT
     if func in CHECK_STRIDES_SKIPS:
-        return False
+        return CheckStrides.NONE
     if not isinstance(func, torch._ops.OpOverload):
-        return False
+        return CheckStrides.NONE
     # Prims are expected to model strides correctly
     if func.namespace == "prims":
-        return True
+        return CheckStrides.SIGNIFICANT
     # Check if it's a view, by testing if any of the returns have
     # a non-empty alias set
     if any(r.alias_info.before_set for r in func._schema.returns if r.alias_info):
-        return True
+        return CheckStrides.SIGNIFICANT
     # TODO: check for TensorIterator
-    return True
+    return CheckStrides.SIGNIFICANT
 
 def assert_ref_meta_equal(test_case, func, meta_rs, rs, msg_callable):
     flat_meta_rs, _ = tree_flatten(meta_rs)
@@ -350,7 +361,10 @@ def test_assert(cond, msg):
         test_assert(meta_r.dtype == r.dtype, f"but real dtype was {r.dtype}")
         test_assert(meta_r.shape == r.shape, f"but real shape was {r.shape}")
         # See https://github.com/pytorch/pytorch/issues/78050
-        if should_check_strides(func):
+        if should_check_strides(func) == CheckStrides.ALL:
+            same_strides, _ = torch._prims_common.check_all_strides(meta_r, r)
+            test_assert(same_strides, f"but real stride was {r.stride()}")
+        elif should_check_strides(func) == CheckStrides.SIGNIFICANT:
             same_strides, _ = torch._prims_common.check_significant_strides(meta_r, r)
             test_assert(same_strides, f"but real stride was {r.stride()}")
         test_assert(
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index f1caf17fd19ac..3d4d44df74227 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -1232,7 +1232,12 @@ def _greater_than_reduce(acc, x):
                 new_strides.append(a.stride()[original_idx])
             original_idx = original_idx + 1
         else:
-            new_strides.append(0)
+            if shape[idx] != 1:
+                new_strides.append(0)
+            elif original_idx == a.ndim:
+                new_strides.append(1)
+            else:
+                new_strides.append(a.stride()[original_idx] * a.size()[original_idx])
 
     return a.as_strided(shape, new_strides, a.storage_offset())
 
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index f5cef4f11b11b..dd90d259db236 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -150,8 +150,8 @@ def compare_tensor_meta(a: TensorLikeType, b: TensorLikeType, check_strides=Fals
             raise RuntimeError(msg)
 
 
-def check_significant_strides(
-    a: TensorLikeType, b: TensorLikeType, *, only_cuda=True
+def _check_strides_helper(
+    a: TensorLikeType, b: TensorLikeType, *, only_cuda=True, significant_only=True
 ) -> Tuple[bool, Optional[int]]:
     # NOTE: only on CUDA because CPU elementwise strides are incorrect in PyTorch
     # See https://github.com/pytorch/pytorch/issues/77553
@@ -159,11 +159,22 @@ def check_significant_strides(
     # and for tensors with more than one element
     if (not only_cuda or a.device.type == "cuda" or b.device.type == "cuda") and a.numel() > 0:
         for idx in range(a.ndim):
-            if a.stride()[idx] != b.stride()[idx] and a.shape[idx] > 1:
+            check = not significant_only or a.shape[idx] > 1
+            if a.stride()[idx] != b.stride()[idx] and check:
                 return False, idx
 
     return True, None
 
+def check_significant_strides(
+    a: TensorLikeType, b: TensorLikeType, *, only_cuda=True
+) -> Tuple[bool, Optional[int]]:
+    return _check_strides_helper(a, b, only_cuda=only_cuda, significant_only=True)
+
+def check_all_strides(
+    a: TensorLikeType, b: TensorLikeType, *, only_cuda=True
+) -> Tuple[bool, Optional[int]]:
+    return _check_strides_helper(a, b, only_cuda=only_cuda, significant_only=False)
+
 
 # This function is equivalent to compute_contiguous() from TensorImpl.cpp
 def is_contiguous(a: TensorLikeType) -> bool:

From 27adb7ad10eeaaa8311bc206c4f292f71fffe72f Mon Sep 17 00:00:00 2001
From: Yanli Zhao <yanlizhao@fb.com>
Date: Thu, 8 Dec 2022 00:17:21 +0000
Subject: [PATCH 1687/1922] add save and load stats in memory_tracker (#90144)

add save and load stats in memory_tracker, so that users could plot the traces in another place, rather than just inside trainer
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90144
Approved by: https://github.com/rohan-varma
---
 .../distributed/_tools/test_memory_tracker.py |  2 +
 torch/distributed/_tools/memory_tracker.py    | 42 +++++++++++++++----
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/test/distributed/_tools/test_memory_tracker.py b/test/distributed/_tools/test_memory_tracker.py
index 2e19ef6bf7294..12ec782ec07da 100644
--- a/test/distributed/_tools/test_memory_tracker.py
+++ b/test/distributed/_tools/test_memory_tracker.py
@@ -52,6 +52,8 @@ def test_local_model(self):
 
         self.assertTrue(len(tracker._hooks) == 0)
 
+        tracker.save_stats("memory.trace")
+        tracker.load("memory.trace")
         tracker.summary()
 
         self.assertTrue(tracker._op_index > 0)
diff --git a/torch/distributed/_tools/memory_tracker.py b/torch/distributed/_tools/memory_tracker.py
index 477c59021bd48..0c34dcb3bfb46 100644
--- a/torch/distributed/_tools/memory_tracker.py
+++ b/torch/distributed/_tools/memory_tracker.py
@@ -1,5 +1,7 @@
 from collections import defaultdict
 
+import pickle
+
 from typing import (
     Any,
     Callable,
@@ -74,15 +76,9 @@ def __init__(self) -> None:
         torch._C._log_api_usage_once("torch.distributed.memory_tracker")
         self._hooks: List[RemovableHandle] = []
         self._operator_names: Dict[str, int] = defaultdict(int)
-        self.memories_allocated: Dict[int, Dict[str, float]] = defaultdict(
-            lambda: defaultdict(float)
-        )
-        self.memories_active: Dict[int, Dict[str, float]] = defaultdict(
-            lambda: defaultdict(float)
-        )
-        self.memories_reserved: Dict[int, Dict[str, float]] = defaultdict(
-            lambda: defaultdict(float)
-        )
+        self.memories_allocated: Dict[int, Dict[str, float]] = defaultdict()
+        self.memories_active: Dict[int, Dict[str, float]] = defaultdict()
+        self.memories_reserved: Dict[int, Dict[str, float]] = defaultdict()
         self._markers: Dict[str, int] = defaultdict(int)
         self._cur_module_name: str = ""
         self._op_index: int = 0
@@ -180,6 +176,34 @@ def show_traces(self) -> None:
                 )
         plt.legend()
 
+    def save_stats(self, path: str) -> None:
+        """
+        Save the stats using pickle during runtime if users want to plot the traces
+        in other places like notebook.
+        """
+        stats = {
+            "memories_allocated": self.memories_allocated,
+            "memories_active": self.memories_active,
+            "memories_reserved": self.memories_reserved,
+            "markers": self._markers,
+        }
+
+        with open(path, "wb") as f:
+            pickle.dump(stats, f)
+
+    def load(self, path: str) -> None:
+        """
+        Load the pickled memory stats to plot the traces or print the summary.
+        """
+
+        with open(path, "rb") as f:
+            stats = pickle.load(f)
+
+        self.memories_allocated = stats["memories_allocated"]
+        self.memories_active = stats["memories_active"]
+        self.memories_reserved = stats["memories_reserved"]
+        self._markers = stats["markers"]
+
     def _create_pre_forward_hook(self, name: str) -> Callable:
         """
         The pre_foward_hook is to insert current module name with forward prefix for the operator

From 3bbc35d47f3cfffbd6715c705aa4539c75e89194 Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Wed, 7 Dec 2022 17:59:18 +0000
Subject: [PATCH 1688/1922] [primTorch] Add prim and ref for as_strided_scatter
 (#88426)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88426
Approved by: https://github.com/mruberry
---
 torch/_meta_registrations.py                  |  1 +
 torch/_prims/__init__.py                      | 51 +++++++++++++++++++
 torch/_prims_common/__init__.py               | 51 +++++++++++++++----
 torch/_refs/__init__.py                       | 12 +++++
 .../_internal/common_methods_invocations.py   |  9 +++-
 5 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
index cb175914ddccd..b6d0214f0df47 100644
--- a/torch/_meta_registrations.py
+++ b/torch/_meta_registrations.py
@@ -1981,6 +1981,7 @@ def activate_meta():
             "aten::copy_",  # Exception not raised, test_torch.py -k test_storage_meta_errors_cpu_int64  # noqa: B950
             "aten::constant_pad_nd",  # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32  # noqa: B950
             "aten::rot90",  # requires_grad mismatch! test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32  # noqa: B950
+            "aten::as_strided_scatter",  # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_no_amp_as_strided_scatter_cuda_float32  # noqa: B950
         }:
             pass
         else:
diff --git a/torch/_prims/__init__.py b/torch/_prims/__init__.py
index 3d4d44df74227..a7cc65ee23131 100644
--- a/torch/_prims/__init__.py
+++ b/torch/_prims/__init__.py
@@ -151,6 +151,10 @@
     "transpose",
     "view_of",
     #
+    # Functionalized view mutations
+    #
+    "as_strided_scatter",
+    #
     # Shape prims
     #
     "collapse",
@@ -1794,6 +1798,53 @@ def _view_of_aten(a: Tensor) -> Tensor:
     doc=_view_of_doc,
 )
 
+#
+# Functionalized view mutations
+#
+
+
+def _as_strided_scatter_meta(
+    input: TensorLikeType,
+    src: TensorLikeType,
+    size: ShapeType,
+    stride: StrideType,
+    storage_offset: int,
+) -> TensorLikeType:
+    utils.validate_shape(size)
+    utils.validate_strides(stride)
+
+    required_size = utils.compute_required_storage_length(size, stride, storage_offset)
+    utils.check(
+        input.numel() >= required_size,
+        lambda: (
+            f"as_strided_scatter: sizes {size}, strides {stride}, storage offset {storage_offset} "
+            f" and itemsize {input.element_size()} requiring a storage size of "
+            f"{required_size * input.element_size()} are out of bounds "
+            f"for storage of size {input.numel() * input.element_size()}"
+        ),
+    )
+    utils.check(
+        utils.is_same_shape(src.shape, size),
+        lambda: f"expected src to have a size equal to the slice of self. src size = {src.shape}, slice size = {size}",
+    )
+
+    return _clone_meta(input)
+
+
+_as_strided_scatter_doc = """
+    Creates a new tensor equivalent to ``out = input.clone()`` after mutation by
+    ``out.as_strided(size, stride, storage_offset).copy_(src)``.
+"""
+
+as_strided_scatter = _make_prim(
+    schema="as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt storage_offset) -> Tensor",
+    meta=_as_strided_scatter_meta,
+    impl_aten=torch.as_strided_scatter,
+    return_type=RETURN_TYPE.NEW,
+    doc=_as_strided_scatter_doc,
+)
+
+
 #
 # Shape operations
 #
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
index dd90d259db236..b34f109c3a2fb 100644
--- a/torch/_prims_common/__init__.py
+++ b/torch/_prims_common/__init__.py
@@ -1483,6 +1483,44 @@ def set_correction(
     return correction
 
 
+def compute_required_storage_length(
+    shape: ShapeType, strides: StrideType, storage_offset: int
+) -> int:
+    """Computes the minimum storage size to hold the given tensor geometry.
+
+    Example
+    =======
+
+    This is the size of a newly allocated tensor's storage, in units of elements
+
+    >>> t = torch.empty((10, 20))
+    >>> compute_required_storage_length(t.shape, t.stride(), t.storage_offset())
+    200
+
+    >>> t2 = torch.empty_strided((1, 2, 3), (5, 7, 11))
+    >>> size = compute_required_storage_length(t2.shape, t2.stride(), t2.storage_offset())
+    >>> size == t.storage().size()
+    True
+
+    A valid tensor may have a larger storage size, but never smaller
+
+    >>> slice = torch.empty(100)[20:40]
+    >>> slice.storage().size()
+    100
+
+    >>> compute_required_storage_length(slice.shape, slice.stride(), slice.storage_offset())
+    40
+
+    """
+    # Short-circuits if the shape has no elements
+    if reduce(operator.mul, shape, 1) == 0:
+        return 0
+
+    max_offset = sum((x - 1) * y for x, y in zip(shape, strides))
+    # +1 to account for the first element which offsets are taken from
+    return 1 + storage_offset + max_offset
+
+
 def check_in_bounds_for_storage(
     a: torch.TypedStorage, shape: ShapeType, strides: StrideType, storage_offset: int
 ):
@@ -1490,17 +1528,8 @@ def check_in_bounds_for_storage(
     Determines if the given shape, strides, and offset are valid for the given storage.
     """
 
-    # Short-circuits if the shape has no elements
-    if reduce(operator.mul, shape) == 0:
-        return
-
-    length = a.size() - storage_offset
-    max_offset = 0
-    for x, y in zip(shape, strides):
-        max_offset = max_offset + (x - 1) * y
-
-    if max_offset >= length:
-        required_length = max_offset + storage_offset
+    required_length = compute_required_storage_length(shape, strides, storage_offset)
+    if a.size() < required_length:
         msg = (
             "Can't view a storage of size {0} with an offset of {1}, shape of {2}, and strides of {3}, "
             "which requires a storage of size {4}".format(
diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py
index 354ef9c2d94ba..f06f5ba34b5a9 100644
--- a/torch/_refs/__init__.py
+++ b/torch/_refs/__init__.py
@@ -2520,6 +2520,18 @@ def as_strided(
     return prims.as_strided(a, size, stride, storage_offset_int)
 
 
+@register_decomposition(torch.ops.aten.as_strided_scatter)
+def as_strided_scatter(
+    input: TensorLikeType,
+    src: TensorLikeType,
+    size: ShapeType,
+    stride: StrideType,
+    storage_offset: Optional[int] = None,
+) -> TensorLikeType:
+    storage_offset_int = 0 if storage_offset is None else storage_offset
+    return prims.as_strided_scatter(input, src, size, stride, storage_offset_int)
+
+
 def broadcast_shapes(*shapes) -> ShapeType:
     return torch.Size(_broadcast_shapes(*shapes))
 
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 527c0793acd34..c01f476f17ed2 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -278,6 +278,7 @@ def sample_inputs_as_strided_scatter(op_info, device, dtype, requires_grad, **kw
 
     # input shape, output shape, output stride, output storage offset
     test_cases = [
+        ((1,), (), (), 0),
         ((1,), (1,), (1,), 0),
         ((3, 3), (2, 2), (1, 2), 0),
         ((3, 3), (2, 2), (1, 2), 1),
@@ -293,6 +294,7 @@ def sample_inputs_as_strided_scatter(op_info, device, dtype, requires_grad, **kw
         input_src = make_arg(output_shape)
         yield SampleInput(input_t, input_src, output_shape, stride, storage_offset=storage_offset)
 
+
 def error_inputs_as_strided_scatter(op_info, device, **kwargs):
     make_arg = partial(make_tensor, device=device, dtype=torch.float32, requires_grad=False)
 
@@ -10821,8 +10823,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                             'test_non_standard_bool_values'),
            )),
     OpInfo('as_strided_scatter',
-           op=lambda x, src, size, stride, storage_offset=0:
-               torch.as_strided_scatter(x, src, size, stride, storage_offset=storage_offset),
            dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf),
            supports_out=False,
            supports_forward_ad=True,
@@ -18424,6 +18424,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
             DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu'),
         ),
     ),
+    PythonRefInfo(
+        "_refs.as_strided_scatter",
+        torch_opinfo_name="as_strided_scatter",
+        supports_nvfuser=False,
+    ),
     PythonRefInfo(
         "_refs.broadcast_shapes",
         torch_opinfo_name="broadcast_shapes",

From d751cde2e681c7a4c875e17cfdfc8be439299c3a Mon Sep 17 00:00:00 2001
From: Michael Wootton <michael.wootton@amd.com>
Date: Thu, 8 Dec 2022 00:24:55 +0000
Subject: [PATCH 1689/1922] Kineto activity fix (#89785)

Continuation of https://github.com/pytorch/pytorch/pull/88207

A compile time guard was preventing ActivityType::CUDA from being available on rocm. This caused both the GPU_FALLBACK and CUDA modes to be active at the same time. So operators were being charged gpu time for the hipEventRecord ranges and the actual kernel execution times. This caused incorrect (and often negative) cuda times, in e.g. table().

Previously a cmake variable was not being propagated to a '-D', causing an issue on Windows, which uses cuda but not cupti.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89785
Approved by: https://github.com/jeffdaily, https://github.com/malfet
---
 cmake/Dependencies.cmake     | 5 +++++
 torch/csrc/autograd/init.cpp | 5 +++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 104056ee07240..918344e3d2ba0 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -2015,6 +2015,11 @@ if(USE_KINETO)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_KINETO")
   if(LIBKINETO_NOCUPTI)
     string(APPEND CMAKE_CXX_FLAGS " -DLIBKINETO_NOCUPTI")
+  endif()
+  if(LIBKINETO_NOROCTRACER)
+    string(APPEND CMAKE_CXX_FLAGS " -DLIBKINETO_NOROCTRACER")
+  endif()
+  if(LIBKINETO_NOCUPTI AND LIBKINETO_NOROCTRACER)
     message(STATUS "Configured Kineto (CPU)")
   else()
     message(STATUS "Configured Kineto")
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index ee963232d3166..6bfd4bd4bfed6 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -279,8 +279,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
 
   m.def("_supported_activities", []() {
     std::set<ActivityType> activities{ActivityType::CPU};
-#if defined(USE_KINETO) && !defined(LIBKINETO_NOCUPTI)
-    if (at::getNumGPUs() > 0 && !at::hasHIP()) {
+#if defined(USE_KINETO) && \
+    (!defined(LIBKINETO_NOCUPTI) || !defined(LIBKINETO_NOROCTRACER))
+    if (at::getNumGPUs() > 0) {
       activities.insert(ActivityType::CUDA);
     }
 #endif

From 9d0c8e88682489a4f0ec7e05269447626ed73874 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Thu, 8 Dec 2022 00:30:16 +0000
Subject: [PATCH 1690/1922] C++17 friendly iterator implementation (#90379)

Get rid of std::iterator inheritance/references for `c10::DictIterator`, `c10::IListRefIterator` and `c10::ListIterator`

Followup after https://github.com/pytorch/pytorch/pull/90174

Fixes deprecation warning and extension compilation failures using VC++
that raises following errors:
```
C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\include\ATen/core/IListRef.h(517): error C4996: 'std::iterator<std::bidirectional_iterator_tag,T,ptrdiff_t,T *,T &>::value_type': warning STL4015: The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17. (The <iterator> header is NOT deprecated.) The C++ Standard has never required user-defined iterators to derive from std::iterator. To fix this warning, stop deriving from std::iterator and start providing publicly accessible typedefs named iterator_category, value_type, difference_type, pointer, and reference. Note that value_type is required to be non-const, even for constant iterators. You can define _SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING or _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS to acknowledge that you have received this warning.

C:\actions-runner\_work\pytorch\pytorch\build\win_tmp\build\torch\include\ATen/core/List.h(169): error C4996: 'std::iterator<std::random_access_iterator_tag,T,ptrdiff_t,T *,T &>::difference_type': warning STL4015: The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17. (The <iterator> header is NOT deprecated.) The C++ Standard has never required user-defined iterators to derive from std::iterator. To fix this warning, stop deriving from std::iterator and start providing publicly accessible typedefs named iterator_category, value_type, difference_type, pointer, and reference. Note that value_type is required to be non-const, even for constant iterators. You can define _SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING or _SILENCE_ALL_CXX17_DEPRECATION_WARNINGS to acknowledge that you have received this warning.

```

Discovered while working on https://github.com/pytorch/pytorch/pull/85969
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90379
Approved by: https://github.com/ezyang, https://github.com/dagitses
---
 aten/src/ATen/core/Dict.h     | 11 +++++++++--
 aten/src/ATen/core/IListRef.h |  9 ++++++++-
 aten/src/ATen/core/List.h     | 16 +++++++++-------
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/core/Dict.h b/aten/src/ATen/core/Dict.h
index 7ae106b6618cf..c4fb44ce0c636 100644
--- a/aten/src/ATen/core/Dict.h
+++ b/aten/src/ATen/core/Dict.h
@@ -101,8 +101,15 @@ class DictEntryRef final {
 // this wraps map_type::iterator to make sure user code can't rely
 // on it being the type of the underlying map.
 template<class Key, class Value, class Iterator>
-class DictIterator final : public std::iterator<std::forward_iterator_tag, DictEntryRef<Key, Value, Iterator>> {
+class DictIterator final {
 public:
+   // C++17 friendly std::iterator implementation
+  using iterator_category = std::forward_iterator_tag;
+  using value_type = DictEntryRef<Key, Value, Iterator>;
+  using difference_type = std::ptrdiff_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+
   explicit DictIterator() = default;
   ~DictIterator() = default;
 
@@ -136,7 +143,7 @@ class DictIterator final : public std::iterator<std::forward_iterator_tag, DictE
     return &entryRef_;
   }
 
-  friend typename std::iterator<std::random_access_iterator_tag, DictEntryRef<Key, Value, Iterator>>::difference_type operator-(const DictIterator& lhs, const DictIterator& rhs) {
+  friend difference_type operator-(const DictIterator& lhs, const DictIterator& rhs) {
     return lhs.entryRef_.iterator_ - rhs.entryRef_.iterator_;
   }
 
diff --git a/aten/src/ATen/core/IListRef.h b/aten/src/ATen/core/IListRef.h
index 0b0ff67b02e2d..340e519f43dbc 100644
--- a/aten/src/ATen/core/IListRef.h
+++ b/aten/src/ATen/core/IListRef.h
@@ -359,7 +359,7 @@ using MaterializedIListRef = std::vector<MaterializedIListRefElem<T>>;
  *     than 0.
  */
 template <typename T>
-class IListRefIterator : public std::iterator<std::bidirectional_iterator_tag, T> {
+class IListRefIterator {
  private:
 #define DEFINE_FRIEND_CLASS(TAG, ...)                        \
   friend class detail::IListRefTagImpl<IListRefTag::TAG, T>; \
@@ -371,6 +371,13 @@ class IListRefIterator : public std::iterator<std::bidirectional_iterator_tag, T
 #undef DEFINE_FRIEND_CLASS
 
  public:
+  // C++17 friendly std::iterator implementation
+  using iterator_category = std::bidirectional_iterator_tag;
+  using value_type = T;
+  using difference_type = std::ptrdiff_t;
+  using pointer = T*;
+  using reference = T&;
+
   using unboxed_iterator_type = typename detail::
       IListRefTagImpl<IListRefTag::Unboxed, T>::list_type::const_iterator;
   using boxed_iterator_type = typename detail::
diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h
index fe75bf37cb7fa..610417774774c 100644
--- a/aten/src/ATen/core/List.h
+++ b/aten/src/ATen/core/List.h
@@ -111,13 +111,15 @@ class ListElementReference final {
 // this wraps vector::iterator to make sure user code can't rely
 // on it being the type of the underlying vector.
 template <class T, class Iterator>
-class ListIterator final : public std::iterator<
-                               std::random_access_iterator_tag,
-                               T,
-                               std::ptrdiff_t,
-                               T*,
-                               ListElementReference<T, Iterator>> {
+class ListIterator final {
  public:
+   // C++17 friendly std::iterator implementation
+  using iterator_category = std::random_access_iterator_tag;
+  using value_type = T;
+  using difference_type = std::ptrdiff_t;
+  using pointer = T*;
+  using reference = ListElementReference<T, Iterator>;
+
   explicit ListIterator() = default;
   ~ListIterator() = default;
 
@@ -166,7 +168,7 @@ class ListIterator final : public std::iterator<
     return ListIterator{iterator_ - offset};
   }
 
-  friend typename std::iterator<std::random_access_iterator_tag, T>::difference_type operator-(const ListIterator& lhs, const ListIterator& rhs) {
+  friend difference_type operator-(const ListIterator& lhs, const ListIterator& rhs) {
     return lhs.iterator_ - rhs.iterator_;
   }
 

From 4dc11af982785fa0bfac4327e5dff32dc6e83993 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 7 Dec 2022 20:53:02 +0000
Subject: [PATCH 1691/1922] [inductor] Remove .to from lowering (#90280)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90280
Approved by: https://github.com/ngimel
---
 torch/_inductor/lowering.py | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 882dbd693b0cb..8f2e87f2f3b54 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -403,33 +403,6 @@ def _to_copy(
     return x
 
 
-@register_lowering(aten.to)
-def to(
-    x,
-    device_or_dtype=None,
-    non_blocking=False,
-    copy=False,
-    memory_format=None,
-    device=None,
-    dtype=None,
-    layout=None,
-):
-    assert not memory_format, "TODO"
-    assert layout in (None, torch.strided)
-    if isinstance(device_or_dtype, torch.dtype):
-        return to_dtype(x, device_or_dtype)
-    elif isinstance(device_or_dtype, torch.device):
-        return to_device(x, device_or_dtype)
-    else:
-        assert device_or_dtype is None, device_or_dtype
-
-    if device is not None:
-        x = to_device(x, device)
-    if dtype is not None:
-        x = to_dtype(x, dtype)
-    return x
-
-
 def ops_wrapper(name):
     assert isinstance(name, str)
 

From 6a3384ceb80263f940fab5555c435f6598731aec Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 8 Dec 2022 00:51:13 +0000
Subject: [PATCH 1692/1922] Revert "[Quant][fx][bc-breaking] Make convert.py
 smaller (#90189)"

This reverts commit 824641b083860df4d7ffef06a798ea2702bc4bde.

Reverted https://github.com/pytorch/pytorch/pull/90189 on behalf of https://github.com/seemethere due to Fails internal tests due to potential circular import, see https://www.internalfb.com/diff/D41817429?dst_version_fbid=1453307181865235&transaction_fbid=899728221278938
---
 torch/ao/quantization/fx/convert.py       | 454 ++++++++++++++++++----
 torch/ao/quantization/fx/convert_utils.py | 335 ----------------
 2 files changed, 387 insertions(+), 402 deletions(-)
 delete mode 100644 torch/ao/quantization/fx/convert_utils.py

diff --git a/torch/ao/quantization/fx/convert.py b/torch/ao/quantization/fx/convert.py
index d651a47de10f0..e795b3bca8584 100644
--- a/torch/ao/quantization/fx/convert.py
+++ b/torch/ao/quantization/fx/convert.py
@@ -1,80 +1,92 @@
+from typing import Any, Dict, List, Optional, Set, Tuple, Union, Type, Callable
+from torch.ao.quantization.quant_type import QuantType
+import torch
 import copy
-import operator
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 import warnings
-import torch
-from torch.fx import GraphModule
+from torch.fx import (
+    GraphModule,
+)
 from torch.fx.graph import (
     Graph,
     Node,
+    Argument,
 )
-from torch.nn.utils.parametrize import type_before_parametrizations
-from ..backend_config import (
-    BackendConfig,
-    get_native_backend_config,
-)
-from ..backend_config.utils import (
-    get_fused_module_classes,
-    get_pattern_to_dtype_configs,
-    get_qat_module_classes,
-    get_root_module_to_quantized_reference_module,
-)
-from ..quantize import (
-    _remove_qconfig,
-    is_activation_post_process,
+from ..utils import (
+    activation_is_statically_quantized,
+    weight_is_quantized,
+    get_qparam_dict,
+    _parent_name,
+    get_swapped_custom_module_class,
 )
 from ..qconfig import (
-    qconfig_equals,
     QConfigAny,
+    qconfig_equals
 )
 from ..qconfig_mapping import QConfigMapping
-from ..stubs import DeQuantStub
-from ..utils import (
-    _parent_name,
-    is_per_channel,
-    get_qparam_dict,
-    to_underlying_dtype,
-    weight_is_quantized,
+from .qconfig_mapping_utils import (
+    generate_node_name_to_qconfig,
+    compare_prepare_convert_qconfig_mappings,
+    update_qconfig_for_fusion,
+    is_qconfig_supported_by_dtype_configs,
+    _update_qconfig_for_qat,
 )
-# importing the lib so that the quantized_decomposed ops are registered
-from ._decomposed import quantized_decomposed_lib  # noqa: F401
-from ._equalize import (
-    convert_eq_obs,
-    update_obs_for_equalization,
+from torch.ao.quantization.backend_config.utils import (
+    get_root_module_to_quantized_reference_module,
+    get_pattern_to_dtype_configs,
+    get_fused_module_classes,
+    get_qat_module_classes,
 )
-from .convert_utils import (
-    _convert_custom_module,
-    _convert_standalone_module,
-    _get_module_path_and_prefix,
-    _has_none_qconfig,
-    _insert_dequantize_node,
-    _is_conversion_supported,
-    _maybe_recursive_remove_dequantize,
-    _replace_observer_or_dequant_stub_with_dequantize_node,
-    _restore_state,
-    _run_weight_observers,
+from torch.ao.quantization.backend_config import (
+    BackendConfig,
+    get_native_backend_config,
 )
-from .custom_config import ConvertCustomConfig
 from .graph_module import (
-    is_observed_standalone_module,
     QuantizedGraphModule,
+    is_observed_module,
+    is_observed_standalone_module,
 )
-from .lower_to_fbgemm import lower_to_fbgemm
-from .qconfig_mapping_utils import (
-    _update_qconfig_for_qat,
-    compare_prepare_convert_qconfig_mappings,
-    generate_node_name_to_qconfig,
-    is_qconfig_supported_by_dtype_configs,
-    update_qconfig_for_fusion,
-)
+from ._equalize import update_obs_for_equalization, convert_eq_obs
+from torch.nn.utils.parametrize import type_before_parametrizations
 from .utils import (
     _get_module,
-    create_getattr_from_value,
+    _is_custom_module_lstm,
     get_custom_module_class_keys,
+    create_getattr_from_value,
+    collect_producer_nodes,
+    graph_module_from_producer_nodes,
+    node_arg_is_weight,
+)
+from torch.ao.quantization.utils import (
+    is_per_channel,
+    to_underlying_dtype,
 )
+from torch.ao.quantization.quantize import (
+    _remove_qconfig,
+    is_activation_post_process,
+)
+from torch.ao.quantization.stubs import DeQuantStub
+from .custom_config import (
+    ConvertCustomConfig,
+    PrepareCustomConfig,
+)
+from .lower_to_fbgemm import lower_to_fbgemm
+# importing the lib so that the quantized_decomposed ops are registered
+from ._decomposed import quantized_decomposed_lib  # noqa: F401
+import operator
 
+# TODO: revisit this list. Many helper methods shouldn't be public
 __all__ = [
     "convert",
+    "convert_custom_module",
+    "convert_standalone_module",
+    "convert_weighted_module",
+    "get_module_path_and_prefix",
+    "has_none_qconfig",
+    "insert_dequantize_node",
+    "maybe_get_observer_for_node",
+    "maybe_recursive_remove_dequantize",
+    "restore_state",
+    "run_weight_observers",
 ]
 
 def _replace_observer_with_quantize_dequantize_node_decomposed(
@@ -97,12 +109,12 @@ def _replace_observer_with_quantize_dequantize_node_decomposed(
     """
     assert modules is not None
     assert isinstance(node.target, str)
-    module_path, prefix = _get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
+    module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
     activation_post_process = modules[node.target]
     # skip replacing observers to quant/dequant nodes if the qconfigs of all
     # consumers and producers of this observer are None
     skip_replacement = all([
-        _has_none_qconfig(n, node_name_to_qconfig) for n in
+        has_none_qconfig(n, node_name_to_qconfig) for n in
         list(node.args) + list(node.users.keys())])
     if skip_replacement or not _is_conversion_supported(activation_post_process):
         # didn't find correponding quantize op and info for the activation_post_process
@@ -309,12 +321,12 @@ def _replace_observer_with_quantize_dequantize_node(
     """
     assert modules is not None
     assert isinstance(node.target, str)
-    module_path, prefix = _get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
+    module_path, prefix = get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
     activation_post_process = modules[node.target]
     # skip replacing observers to quant/dequant nodes if the qconfigs of all
     # consumers and producers of this observer are None
     skip_replacement = all([
-        _has_none_qconfig(n, node_name_to_qconfig) for n in
+        has_none_qconfig(n, node_name_to_qconfig) for n in
         list(node.args) + list(node.users.keys())])
     if skip_replacement or not _is_conversion_supported(activation_post_process):
         # didn't find correponding quantize op and info for the activation_post_process
@@ -415,7 +427,230 @@ def _replace_observer_with_quantize_dequantize_node(
     # should not reach since we have checks in the begining to make sure the
     # activation_post_process is supported
 
-def _convert_weighted_module(
+# this is a temporary hack for custom module, we may want to implement
+# this properly after the custom module class design is finalized
+# TODO: DeQuantStubs are currently inserted only after custom module LSTM, while observers are inserted
+# after all other custom modules. In the future, we should simply insert QuantStubs before and DeQuantStubs
+# after custom modules in general, and replace these with "quantize" and "dequantize" nodes respectively.
+def _replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Graph):
+    call_custom_module_node = node.args[0]
+    assert isinstance(call_custom_module_node, Node), \
+        f"Expecting the for call custom module node to be a Node, but got {call_custom_module_node}"
+    node.replace_all_uses_with(call_custom_module_node)
+    graph.erase_node(node)
+    insert_dequantize_node(call_custom_module_node, graph)
+
+def _is_conversion_supported(activation_post_process: torch.nn.Module) -> bool:
+    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
+
+    is_dynamic = False
+    if hasattr(activation_post_process, "is_dynamic"):
+        is_dynamic = activation_post_process.is_dynamic  # type: ignore[attr-defined, assignment]
+
+    return (
+        (dtype in [torch.quint8, torch.qint8, torch.qint32] and (not is_dynamic)) or  # type: ignore[return-value]
+        is_dynamic or
+        dtype == torch.float16
+    )
+
+def restore_state(
+        observed: torch.nn.Module
+) -> Tuple[Dict[str, Tuple[str, type]],
+           PrepareCustomConfig,
+           Set[str]]:
+    assert is_observed_module(observed), \
+        'incoming model must be produced by prepare_fx'
+    prepare_custom_config: PrepareCustomConfig = observed._prepare_custom_config  # type: ignore[assignment]
+    node_name_to_scope: Dict[str, Tuple[str, type]] = observed._node_name_to_scope  # type: ignore[assignment]
+    observed_node_names: Set[str] = observed._observed_node_names  # type: ignore[assignment]
+    return node_name_to_scope, prepare_custom_config, observed_node_names
+
+def has_none_qconfig(node: Argument, node_name_to_qconfig: Dict[str, QConfigAny]) -> bool:
+    """ Check if a node has a qconfig of None, i.e. user requested to not quantize
+    the node
+    """
+    return isinstance(node, Node) and node.name in node_name_to_qconfig and node_name_to_qconfig[node.name] is None
+
+def run_weight_observers(observed: GraphModule, backend_config: BackendConfig) -> None:
+    """ Extract the subgraph that produces the weight for dynamic quant
+    or weight only quant node and run the subgraph to observe the weight.
+    Note that the observers of dynamic quant or weight only quant ops are
+    run during the convert step.
+    """
+    for node in observed.graph.nodes:
+        if node.op != "call_function":
+            continue
+        for node_arg in node.args:
+            # node_arg is weight
+            if node_arg and node_arg_is_weight(node, node_arg, backend_config):
+                weight_observer_nodes = collect_producer_nodes(node_arg)
+                if weight_observer_nodes is None:
+                    continue
+                weight_observer_module = \
+                    graph_module_from_producer_nodes(
+                        observed, weight_observer_nodes)
+                # run the weight observer
+                weight_observer_module()
+
+def maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph):
+    """ If the arg is a dequantize Node, or a list/tuple/dict of dequantize Node,
+    we'll recursively remove the dequantize Node
+    """
+    if isinstance(arg, Node) and \
+       arg.op == "call_method" and \
+       arg.target == "dequantize":
+        quantize_node = arg.args[0]
+        # we only replace the specific use since dequantize could be used by other nodes
+        # as well
+        node.replace_input_with(arg, quantize_node)
+    elif isinstance(arg, (list, tuple)):
+        for arg_element in arg:
+            maybe_recursive_remove_dequantize(arg_element, node, graph)
+    elif isinstance(arg, dict):
+        for arg_element in arg.values():
+            maybe_recursive_remove_dequantize(arg_element, node, graph)
+    else:
+        warnings.warn(f"Unsupported node type in recursive remove dequantize: {type(arg)}")
+
+def get_module_path_and_prefix(
+        obs_node: Node,
+        node_name_to_scope: Dict[str, Tuple[str, type]],
+        node_name_to_qconfig: Dict[str, QConfigAny]):
+    """ Given and observer node, get the `Scope` or the fully qualified name for
+    the submodule containing the observed node, also return a prefix of "_input"
+    when the observed node is an input of a F.linear op, and not the output of another
+    quantized op.
+    TODO: this logic is hacky, we should think about how to remove it or make it more
+    general
+    """
+    observed_node = obs_node.args[0]
+    # an observer can be inserted for both input of the next operator or output of the previous
+    # operator (they can be the same)
+    # this flag identifies if the observer is inserted only because the observed node is
+    # the input of the next operator
+    assert isinstance(observed_node, Node), \
+        f"Expecting observed node to be a Node, but got {observed_node}"
+    is_input_observer_only = node_name_to_qconfig[observed_node.name] is None \
+        if observed_node.name in node_name_to_qconfig else None
+    if is_input_observer_only:
+        # if the quantize function is at the input of op, then we find the first user of the observer_node
+        # to get the path. If a linear call_function is in the user list, we return the first instance
+        # of linear node to get the FQN.
+        users = list(obs_node.users)
+        first_linear_use_or_first_use = users[0] if users else None
+        linear_node = None
+        for n in users:
+            if n.op == "call_function" and n.target == torch.nn.functional.linear:
+                linear_node = n
+                break
+        if linear_node:
+            first_linear_use_or_first_use = linear_node
+        prefix = "_input"
+    else:
+        # if the quantize function is at the output of the op, we use the observer input node to get the path
+        first_linear_use_or_first_use = observed_node
+        prefix = ""
+
+    if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope:
+        module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name]
+    else:
+        # TODO: it's not used, so actually we can skip quantization
+        # but this requires changing return type of quantize_node
+        # we can fix it later if needed
+        module_path = ""
+    return module_path, prefix
+
+def insert_dequantize_node(
+        node: Node,
+        graph: Graph):
+    """ Inserts dequantize node for `node` in `graph`
+    """
+    with graph.inserting_after(node):
+        dequantize_node = graph.call_method("dequantize", (node,))
+        for user_node in dict(node.users):
+            if user_node is not dequantize_node:
+                user_node.replace_input_with(node, dequantize_node)
+
+def maybe_get_observer_for_node(
+        node: Node,
+        modules: Dict[str, torch.nn.Module]
+) -> Optional[torch.nn.Module]:
+    """
+    If the node is observed, return the observer
+    instance. Otherwise, return None.
+    """
+    for maybe_obs_node, _ in node.users.items():
+        if maybe_obs_node.op == 'call_module':
+            maybe_obs = modules[str(maybe_obs_node.target)]
+            if is_activation_post_process(maybe_obs):
+                return maybe_obs
+    return None
+
+def convert_standalone_module(
+        node: Node,
+        modules: Dict[str, torch.nn.Module],
+        model: torch.fx.GraphModule,
+        is_reference: bool,
+        backend_config: Optional[BackendConfig]):
+    """ Converts a observed standalone module to a quantized standalone module by calling
+    the fx convert api, currently using the same `is_reference` flag as parent, but we may
+    changing this behavior in the future (e.g. separating quantization and lowering for
+    standalone module as well)
+
+    Args:
+      - node: The call_module node of the observed standalone module
+      - modules: named_module of original model
+      - model: original model
+      - is_reference: a flag from parent provided by user to decide if we want to
+        produce a reference model or a fbgemm/qnnpack model
+      - backend_config: backend configuration of the target backend of quantization
+    """
+    # TODO: remove is_reference flag
+    if is_reference:
+        convert_fn = torch.ao.quantization.quantize_fx.convert_to_reference_fx
+    else:
+        convert_fn = torch.ao.quantization.quantize_fx.convert_fx  # type: ignore[attr-defined]
+    # We know that observed standalone module is a GraphModule since
+    # it's produced by us
+    observed_standalone_module : GraphModule = modules[str(node.target)]  # type: ignore[assignment]
+    sm_input_quantized_idxs = \
+        observed_standalone_module \
+        ._standalone_module_input_quantized_idxs\
+        .tolist()  # type: ignore[operator]
+    # remove the dequantize nodes for inputs
+    args = list(node.args)
+    for idx in range(len(args)):
+        if idx in sm_input_quantized_idxs:
+            arg = args[idx]
+            if arg.op == "call_method" and arg.target == "dequantize":  # type: ignore[union-attr]
+                quantize_node = arg.args[0]  # type: ignore[union-attr]
+                node.replace_input_with(arg, quantize_node)
+                if len(arg.users) == 0:  # type: ignore[union-attr]
+                    model.graph.erase_node(arg)
+    # add dequantize node for output
+    sm_output_quantized_idxs = \
+        observed_standalone_module \
+        ._standalone_module_output_quantized_idxs \
+        .tolist()  # type: ignore[operator]
+    if len(sm_output_quantized_idxs) > 0:
+        assert sm_output_quantized_idxs[0] == 0, "Currently only quantized"
+        "output idxs = [0] is supported"
+
+        # if it's non-empty, then it means the output is kept in quantized form
+        # we'll just add a dequantize node after this node
+        insert_dequantize_node(node, model.graph)
+
+    # TODO: allow convert_custom_config to override backend_config
+    # for standalone module
+    quantized_standalone_module = convert_fn(
+        observed_standalone_module,
+        backend_config=backend_config)
+    parent_name, name = _parent_name(node.target)
+    # update the modules dict
+    setattr(modules[parent_name], name, quantized_standalone_module)
+    modules[str(node.target)] = quantized_standalone_module
+
+def convert_weighted_module(
         node: Node,
         modules: Dict[str, torch.nn.Module],
         observed_node_names: Set[str],
@@ -450,7 +685,7 @@ def _convert_weighted_module(
 
     is_observed = node.name in observed_node_names
     # If a qconfig is not defined for this node, then skip converting to a reference module
-    if qconfig is None or _has_none_qconfig(node, node_name_to_qconfig) or not is_observed:
+    if qconfig is None or has_none_qconfig(node, node_name_to_qconfig) or not is_observed:
         return
 
     # skip converting to reference quantized module if the qconfig is not supported
@@ -525,6 +760,91 @@ def _convert_weighted_module(
         parent_name, name = _parent_name(node.target)
         setattr(modules[parent_name], name, ref_qmodule)
 
+def _remove_previous_dequantize_in_custom_module(node: Node, prev_node: Node, graph: Graph):
+    """
+    Given a custom module `node`, if the previous node is a dequantize, reroute the custom as follows:
+
+    Before: quantize - dequantize - custom_module
+    After: quantize - custom_module
+                 \\ - dequantize
+    """
+    # expecting the input node for a custom module node to be a Node
+    assert isinstance(prev_node, Node), \
+        f"Expecting the argument for custom module node to be a Node, but got {prev_node}"
+    if prev_node.op == "call_method" and prev_node.target == "dequantize":
+        node.replace_input_with(prev_node, prev_node.args[0])
+        # Remove the dequantize node if it doesn't have other users
+        if len(prev_node.users) == 0:
+            graph.erase_node(prev_node)
+
+def convert_custom_module(
+        node: Node,
+        graph: Graph,
+        modules: Dict[str, torch.nn.Module],
+        custom_module_class_mapping: Dict[QuantType, Dict[Type, Type]],
+        statically_quantized_custom_module_nodes: Set[Node]):
+    """ Converts an observed custom module to a quantized custom module based on
+    `custom_module_class_mapping`
+    For static quantization, we'll also remove the previous `dequantize` node and
+    attach the observer node for output to the module, the observer for the node
+    will be converted to a dequantize node instead of quantize-dequantize pairs
+    later in the graph. In the end we would have a quantized custom module that
+    has the same interface as a default quantized module in nn.quantized namespace,
+    i.e. quantized input and quantized output.
+
+    Args:
+      - node: The call_module node of the observed standalone module
+      - graph: The graph containing the node
+      - modules: named_module of original model
+      - custom_module_class_mapping: mapping from observed custom module class to
+        quantized custom module class, used to swap custom modules
+      - statically_quantized_custom_module_nodes: we'll add the custom module node
+        if we find it is statically quantized, this will be used later when converting
+        observers to quant/dequant node pairs, if the observed node is a statically
+        quantized custom module nodes, we'll convert the observer to a dequantize node,
+        this is to keep the interface the same as the default quantized module.
+        TODO: maybe we want to redesign this part to align with reference model design
+        as well, but there has been some discussions around the interface, so we can do
+        it later.
+    """
+    observed_custom_module = modules[str(node.target)]
+    maybe_obs = maybe_get_observer_for_node(node, modules)
+    qconfig = observed_custom_module.qconfig
+    if activation_is_statically_quantized(qconfig):
+        statically_quantized_custom_module_nodes.add(node)
+        if _is_custom_module_lstm(node, modules):
+            # The inputs are tuples in the form (input, (hidden0, hidden1))
+            # Ensure all three input nodes are quantized
+            assert (
+                len(node.args) == 2 and
+                isinstance(node.args[1], tuple) and
+                len(node.args[1]) == 2
+            )
+            (inputs, (hidden0, hidden1)) = node.args  # type: ignore[misc]
+            assert isinstance(inputs, Node)
+            assert isinstance(hidden0, Node)
+            assert isinstance(hidden1, Node)
+            _remove_previous_dequantize_in_custom_module(node, inputs, graph)
+            _remove_previous_dequantize_in_custom_module(node, hidden0, graph)
+            _remove_previous_dequantize_in_custom_module(node, hidden1, graph)
+        else:
+            # remove the previous dequant node to ensure the inputs are quantized
+            arg = node.args[0]
+            assert isinstance(arg, Node)
+            _remove_previous_dequantize_in_custom_module(node, arg, graph)
+            # absorb the following observer into the module conversion
+            activation_post_process = maybe_get_observer_for_node(node, modules)
+            assert activation_post_process is not None
+            observed_custom_module.activation_post_process = activation_post_process
+
+    # swap the observed custom module to quantized custom module
+    quantized_custom_module_class = get_swapped_custom_module_class(
+        observed_custom_module, custom_module_class_mapping, qconfig)
+    quantized_custom_module = \
+        quantized_custom_module_class.from_observed(observed_custom_module)
+    parent_name, name = _parent_name(node.target)
+    setattr(modules[parent_name], name, quantized_custom_module)
+
 def convert(
         model: GraphModule, is_reference: bool = False,
         convert_custom_config: Union[ConvertCustomConfig, Dict[str, Any], None] = None,
@@ -586,7 +906,7 @@ def convert(
     if backend_config is None:
         backend_config = get_native_backend_config()
 
-    node_name_to_scope, prepare_custom_config, observed_node_names = _restore_state(model)
+    node_name_to_scope, prepare_custom_config, observed_node_names = restore_state(model)
     node_name_to_qconfig: Dict[str, QConfigAny] = model._node_name_to_qconfig  # type: ignore[assignment]
 
     # mapping from fully qualified module name to module instance
@@ -636,7 +956,7 @@ def convert(
 
     # always run weight observers in the top level forward method
     # for dynamic quant ops or weight only quant ops
-    _run_weight_observers(model, backend_config)
+    run_weight_observers(model, backend_config)
 
     graph_inputs: List[str] = []
     for node in model.graph.nodes:
@@ -665,7 +985,7 @@ def convert(
                 # input_quantized_idxs override.
                 # we need to dequantize the inputs since all operators took
                 # floating point inputs in reference quantized models
-                _insert_dequantize_node(node, model.graph)
+                insert_dequantize_node(node, model.graph)
         elif node.op == "output":
             # If the argument is empty we don't need to do anything
             if len(output_quantized_idxs) == 0:
@@ -678,13 +998,13 @@ def convert(
             # outputs can be Node, list, tuple, dict, other cases are not supported yet
             if isinstance(output, (list, tuple)):
                 for idx in output_quantized_idxs:
-                    _maybe_recursive_remove_dequantize(output[idx], return_node, model.graph)
+                    maybe_recursive_remove_dequantize(output[idx], return_node, model.graph)
             elif isinstance(output, (Node, dict)):
                 # we treat dict as a single argument currently, but it can be extended
                 # to support {"key": dtype} after we change output_quantized_idxs to
                 # dict
                 if 0 in output_quantized_idxs:
-                    _maybe_recursive_remove_dequantize(output, return_node, model.graph)
+                    maybe_recursive_remove_dequantize(output, return_node, model.graph)
             else:
                 warnings.warn(f"Unsupported node type for output_quantized_idxs: {type(output)}")
         elif node.op == "call_module":
@@ -706,7 +1026,7 @@ def convert(
             elif isinstance(mod, DeQuantStub):
                 _replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
             elif is_observed_standalone_module(mod):
-                _convert_standalone_module(
+                convert_standalone_module(
                     node, modules, model, is_reference, backend_config)
             # below this point `type_before_parametrizations` is used
             # instead of `type` to handle situations with fx quant + sparsity
@@ -717,10 +1037,10 @@ def convert(
                 if type_before_parametrizations(mod) in fused_module_classes and \
                    type_before_parametrizations(mod[0]) not in root_module_classes:  # type: ignore[index]
                     continue
-                _convert_weighted_module(
+                convert_weighted_module(
                     node, modules, observed_node_names, node_name_to_qconfig, backend_config)
             elif type_before_parametrizations(mod) in custom_module_classes:
-                _convert_custom_module(
+                convert_custom_module(
                     node, model.graph, modules, custom_module_class_mapping,
                     statically_quantized_custom_module_nodes)
 
diff --git a/torch/ao/quantization/fx/convert_utils.py b/torch/ao/quantization/fx/convert_utils.py
deleted file mode 100644
index 12304ccadf162..0000000000000
--- a/torch/ao/quantization/fx/convert_utils.py
+++ /dev/null
@@ -1,335 +0,0 @@
-from typing import Any, Dict, Optional, Set, Tuple, Type
-import warnings
-import torch
-import torch.ao.quantization.quantize_fx
-from torch.fx import GraphModule
-from torch.fx.graph import (
-    Argument,
-    Graph,
-    Node,
-)
-from ..backend_config import BackendConfig
-from ..quant_type import QuantType
-from ..quantize import is_activation_post_process
-from ..qconfig import QConfigAny
-from ..utils import (
-    activation_is_statically_quantized,
-    _parent_name,
-    get_swapped_custom_module_class,
-)
-from .custom_config import PrepareCustomConfig
-from .graph_module import is_observed_module
-from .utils import (
-    _is_custom_module_lstm,
-    collect_producer_nodes,
-    graph_module_from_producer_nodes,
-    node_arg_is_weight,
-)
-
-# this is a temporary hack for custom module, we may want to implement
-# this properly after the custom module class design is finalized
-# TODO: DeQuantStubs are currently inserted only after custom module LSTM, while observers are inserted
-# after all other custom modules. In the future, we should simply insert QuantStubs before and DeQuantStubs
-# after custom modules in general, and replace these with "quantize" and "dequantize" nodes respectively.
-def _replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Graph):
-    call_custom_module_node = node.args[0]
-    assert isinstance(call_custom_module_node, Node), \
-        f"Expecting the for call custom module node to be a Node, but got {call_custom_module_node}"
-    node.replace_all_uses_with(call_custom_module_node)
-    graph.erase_node(node)
-    _insert_dequantize_node(call_custom_module_node, graph)
-
-def _is_conversion_supported(activation_post_process: torch.nn.Module) -> bool:
-    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
-
-    is_dynamic = False
-    if hasattr(activation_post_process, "is_dynamic"):
-        is_dynamic = activation_post_process.is_dynamic  # type: ignore[attr-defined, assignment]
-
-    return (
-        (dtype in [torch.quint8, torch.qint8, torch.qint32] and (not is_dynamic)) or  # type: ignore[return-value]
-        is_dynamic or
-        dtype == torch.float16
-    )
-
-def _restore_state(
-        observed: torch.nn.Module
-) -> Tuple[Dict[str, Tuple[str, type]],
-           PrepareCustomConfig,
-           Set[str]]:
-    assert is_observed_module(observed), \
-        'incoming model must be produced by prepare_fx'
-    prepare_custom_config: PrepareCustomConfig = observed._prepare_custom_config  # type: ignore[assignment]
-    node_name_to_scope: Dict[str, Tuple[str, type]] = observed._node_name_to_scope  # type: ignore[assignment]
-    observed_node_names: Set[str] = observed._observed_node_names  # type: ignore[assignment]
-    return node_name_to_scope, prepare_custom_config, observed_node_names
-
-def _has_none_qconfig(node: Argument, node_name_to_qconfig: Dict[str, QConfigAny]) -> bool:
-    """ Check if a node has a qconfig of None, i.e. user requested to not quantize
-    the node
-    """
-    return isinstance(node, Node) and node.name in node_name_to_qconfig and node_name_to_qconfig[node.name] is None
-
-def _run_weight_observers(observed: GraphModule, backend_config: BackendConfig) -> None:
-    """ Extract the subgraph that produces the weight for dynamic quant
-    or weight only quant node and run the subgraph to observe the weight.
-    Note that the observers of dynamic quant or weight only quant ops are
-    run during the convert step.
-    """
-    for node in observed.graph.nodes:
-        if node.op != "call_function":
-            continue
-        for node_arg in node.args:
-            # node_arg is weight
-            if node_arg and node_arg_is_weight(node, node_arg, backend_config):
-                weight_observer_nodes = collect_producer_nodes(node_arg)
-                if weight_observer_nodes is None:
-                    continue
-                weight_observer_module = \
-                    graph_module_from_producer_nodes(
-                        observed, weight_observer_nodes)
-                # run the weight observer
-                weight_observer_module()
-
-def _maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph):
-    """ If the arg is a dequantize Node, or a list/tuple/dict of dequantize Node,
-    we'll recursively remove the dequantize Node
-    """
-    if isinstance(arg, Node) and \
-       arg.op == "call_method" and \
-       arg.target == "dequantize":
-        quantize_node = arg.args[0]
-        # we only replace the specific use since dequantize could be used by other nodes
-        # as well
-        node.replace_input_with(arg, quantize_node)
-    elif isinstance(arg, (list, tuple)):
-        for arg_element in arg:
-            _maybe_recursive_remove_dequantize(arg_element, node, graph)
-    elif isinstance(arg, dict):
-        for arg_element in arg.values():
-            _maybe_recursive_remove_dequantize(arg_element, node, graph)
-    else:
-        warnings.warn(f"Unsupported node type in recursive remove dequantize: {type(arg)}")
-
-def _get_module_path_and_prefix(
-        obs_node: Node,
-        node_name_to_scope: Dict[str, Tuple[str, type]],
-        node_name_to_qconfig: Dict[str, QConfigAny]):
-    """ Given and observer node, get the `Scope` or the fully qualified name for
-    the submodule containing the observed node, also return a prefix of "_input"
-    when the observed node is an input of a F.linear op, and not the output of another
-    quantized op.
-    TODO: this logic is hacky, we should think about how to remove it or make it more
-    general
-    """
-    observed_node = obs_node.args[0]
-    # an observer can be inserted for both input of the next operator or output of the previous
-    # operator (they can be the same)
-    # this flag identifies if the observer is inserted only because the observed node is
-    # the input of the next operator
-    assert isinstance(observed_node, Node), \
-        f"Expecting observed node to be a Node, but got {observed_node}"
-    is_input_observer_only = node_name_to_qconfig[observed_node.name] is None \
-        if observed_node.name in node_name_to_qconfig else None
-    if is_input_observer_only:
-        # if the quantize function is at the input of op, then we find the first user of the observer_node
-        # to get the path. If a linear call_function is in the user list, we return the first instance
-        # of linear node to get the FQN.
-        users = list(obs_node.users)
-        first_linear_use_or_first_use = users[0] if users else None
-        linear_node = None
-        for n in users:
-            if n.op == "call_function" and n.target == torch.nn.functional.linear:
-                linear_node = n
-                break
-        if linear_node:
-            first_linear_use_or_first_use = linear_node
-        prefix = "_input"
-    else:
-        # if the quantize function is at the output of the op, we use the observer input node to get the path
-        first_linear_use_or_first_use = observed_node
-        prefix = ""
-
-    if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope:
-        module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name]
-    else:
-        # TODO: it's not used, so actually we can skip quantization
-        # but this requires changing return type of quantize_node
-        # we can fix it later if needed
-        module_path = ""
-    return module_path, prefix
-
-def _insert_dequantize_node(
-        node: Node,
-        graph: Graph):
-    """ Inserts dequantize node for `node` in `graph`
-    """
-    with graph.inserting_after(node):
-        dequantize_node = graph.call_method("dequantize", (node,))
-        for user_node in dict(node.users):
-            if user_node is not dequantize_node:
-                user_node.replace_input_with(node, dequantize_node)
-
-def _maybe_get_observer_for_node(
-        node: Node,
-        modules: Dict[str, torch.nn.Module]
-) -> Optional[torch.nn.Module]:
-    """
-    If the node is observed, return the observer
-    instance. Otherwise, return None.
-    """
-    for maybe_obs_node, _ in node.users.items():
-        if maybe_obs_node.op == 'call_module':
-            maybe_obs = modules[str(maybe_obs_node.target)]
-            if is_activation_post_process(maybe_obs):
-                return maybe_obs
-    return None
-
-def _convert_standalone_module(
-        node: Node,
-        modules: Dict[str, torch.nn.Module],
-        model: torch.fx.GraphModule,
-        is_reference: bool,
-        backend_config: Optional[BackendConfig]):
-    """ Converts a observed standalone module to a quantized standalone module by calling
-    the fx convert api, currently using the same `is_reference` flag as parent, but we may
-    changing this behavior in the future (e.g. separating quantization and lowering for
-    standalone module as well)
-
-    Args:
-      - node: The call_module node of the observed standalone module
-      - modules: named_module of original model
-      - model: original model
-      - is_reference: a flag from parent provided by user to decide if we want to
-        produce a reference model or a fbgemm/qnnpack model
-      - backend_config: backend configuration of the target backend of quantization
-    """
-    # TODO: remove is_reference flag
-    if is_reference:
-        convert_fn = torch.ao.quantization.quantize_fx.convert_to_reference_fx
-    else:
-        convert_fn = torch.ao.quantization.quantize_fx.convert_fx  # type: ignore[attr-defined]
-    # We know that observed standalone module is a GraphModule since
-    # it's produced by us
-    observed_standalone_module : GraphModule = modules[str(node.target)]  # type: ignore[assignment]
-    sm_input_quantized_idxs = \
-        observed_standalone_module \
-        ._standalone_module_input_quantized_idxs\
-        .tolist()  # type: ignore[operator]
-    # remove the dequantize nodes for inputs
-    args = list(node.args)
-    for idx in range(len(args)):
-        if idx in sm_input_quantized_idxs:
-            arg = args[idx]
-            if arg.op == "call_method" and arg.target == "dequantize":  # type: ignore[union-attr]
-                quantize_node = arg.args[0]  # type: ignore[union-attr]
-                node.replace_input_with(arg, quantize_node)
-                if len(arg.users) == 0:  # type: ignore[union-attr]
-                    model.graph.erase_node(arg)
-    # add dequantize node for output
-    sm_output_quantized_idxs = \
-        observed_standalone_module \
-        ._standalone_module_output_quantized_idxs \
-        .tolist()  # type: ignore[operator]
-    if len(sm_output_quantized_idxs) > 0:
-        assert sm_output_quantized_idxs[0] == 0, "Currently only quantized"
-        "output idxs = [0] is supported"
-
-        # if it's non-empty, then it means the output is kept in quantized form
-        # we'll just add a dequantize node after this node
-        _insert_dequantize_node(node, model.graph)
-
-    # TODO: allow convert_custom_config to override backend_config
-    # for standalone module
-    quantized_standalone_module = convert_fn(
-        observed_standalone_module,
-        backend_config=backend_config)
-    parent_name, name = _parent_name(node.target)
-    # update the modules dict
-    setattr(modules[parent_name], name, quantized_standalone_module)
-    modules[str(node.target)] = quantized_standalone_module
-
-def _remove_previous_dequantize_in_custom_module(node: Node, prev_node: Node, graph: Graph):
-    """
-    Given a custom module `node`, if the previous node is a dequantize, reroute the custom as follows:
-
-    Before: quantize - dequantize - custom_module
-    After: quantize - custom_module
-                 \\ - dequantize
-    """
-    # expecting the input node for a custom module node to be a Node
-    assert isinstance(prev_node, Node), \
-        f"Expecting the argument for custom module node to be a Node, but got {prev_node}"
-    if prev_node.op == "call_method" and prev_node.target == "dequantize":
-        node.replace_input_with(prev_node, prev_node.args[0])
-        # Remove the dequantize node if it doesn't have other users
-        if len(prev_node.users) == 0:
-            graph.erase_node(prev_node)
-
-def _convert_custom_module(
-        node: Node,
-        graph: Graph,
-        modules: Dict[str, torch.nn.Module],
-        custom_module_class_mapping: Dict[QuantType, Dict[Type, Type]],
-        statically_quantized_custom_module_nodes: Set[Node]):
-    """ Converts an observed custom module to a quantized custom module based on
-    `custom_module_class_mapping`
-    For static quantization, we'll also remove the previous `dequantize` node and
-    attach the observer node for output to the module, the observer for the node
-    will be converted to a dequantize node instead of quantize-dequantize pairs
-    later in the graph. In the end we would have a quantized custom module that
-    has the same interface as a default quantized module in nn.quantized namespace,
-    i.e. quantized input and quantized output.
-
-    Args:
-      - node: The call_module node of the observed standalone module
-      - graph: The graph containing the node
-      - modules: named_module of original model
-      - custom_module_class_mapping: mapping from observed custom module class to
-        quantized custom module class, used to swap custom modules
-      - statically_quantized_custom_module_nodes: we'll add the custom module node
-        if we find it is statically quantized, this will be used later when converting
-        observers to quant/dequant node pairs, if the observed node is a statically
-        quantized custom module nodes, we'll convert the observer to a dequantize node,
-        this is to keep the interface the same as the default quantized module.
-        TODO: maybe we want to redesign this part to align with reference model design
-        as well, but there has been some discussions around the interface, so we can do
-        it later.
-    """
-    observed_custom_module = modules[str(node.target)]
-    maybe_obs = _maybe_get_observer_for_node(node, modules)
-    qconfig = observed_custom_module.qconfig
-    if activation_is_statically_quantized(qconfig):
-        statically_quantized_custom_module_nodes.add(node)
-        if _is_custom_module_lstm(node, modules):
-            # The inputs are tuples in the form (input, (hidden0, hidden1))
-            # Ensure all three input nodes are quantized
-            assert (
-                len(node.args) == 2 and
-                isinstance(node.args[1], tuple) and
-                len(node.args[1]) == 2
-            )
-            (inputs, (hidden0, hidden1)) = node.args  # type: ignore[misc]
-            assert isinstance(inputs, Node)
-            assert isinstance(hidden0, Node)
-            assert isinstance(hidden1, Node)
-            _remove_previous_dequantize_in_custom_module(node, inputs, graph)
-            _remove_previous_dequantize_in_custom_module(node, hidden0, graph)
-            _remove_previous_dequantize_in_custom_module(node, hidden1, graph)
-        else:
-            # remove the previous dequant node to ensure the inputs are quantized
-            arg = node.args[0]
-            assert isinstance(arg, Node)
-            _remove_previous_dequantize_in_custom_module(node, arg, graph)
-            # absorb the following observer into the module conversion
-            activation_post_process = _maybe_get_observer_for_node(node, modules)
-            assert activation_post_process is not None
-            observed_custom_module.activation_post_process = activation_post_process
-
-    # swap the observed custom module to quantized custom module
-    quantized_custom_module_class = get_swapped_custom_module_class(
-        observed_custom_module, custom_module_class_mapping, qconfig)
-    quantized_custom_module = \
-        quantized_custom_module_class.from_observed(observed_custom_module)
-    parent_name, name = _parent_name(node.target)
-    setattr(modules[parent_name], name, quantized_custom_module)

From 97a43a096ff191d829098144cedc373fffc8da78 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Tue, 6 Dec 2022 14:23:34 -0800
Subject: [PATCH 1693/1922] [quant] Add support for symmetric quant in
 executorch (#90304)

Summary:
This PR adds symmetric quant in the backend config for executorch

Test Plan:
NA, will be tested in meta internal flow

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90304
Approved by: https://github.com/cccclai, https://github.com/jcaip, https://github.com/andrewor14
---
 .../quantization/backend_config/executorch.py | 38 +++++++++++++++----
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/torch/ao/quantization/backend_config/executorch.py b/torch/ao/quantization/backend_config/executorch.py
index 627143c000991..fcccec6c2225f 100644
--- a/torch/ao/quantization/backend_config/executorch.py
+++ b/torch/ao/quantization/backend_config/executorch.py
@@ -5,7 +5,16 @@
 import torch.nn as nn
 import torch.nn.qat as nnqat
 import torch.nn.quantized._reference as nnqr
-from .backend_config import BackendConfig, BackendPatternConfig, DTypeConfig, ObservationType
+from .backend_config import (
+    BackendConfig,
+    BackendPatternConfig,
+    DTypeConfig,
+    ObservationType,
+)
+from .qnnpack import (
+    qnnpack_weighted_op_qint8_symmetric_dtype_config,
+    qnnpack_default_op_qint8_symmetric_dtype_config
+)
 from ._common_operator_config_utils import _Conv2dMetadata
 from ..fuser_method_mappings import _reverse_sequential_wrapper2
 
@@ -47,7 +56,6 @@
     is_dynamic=True,
 )
 
-
 # =============================
 # |  BACKEND PATTERN CONFIGS  |
 # =============================
@@ -58,6 +66,7 @@ def _get_linear_configs() -> List[BackendPatternConfig]:
     """
     observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
     dtype_configs = [
+        qnnpack_weighted_op_qint8_symmetric_dtype_config,
         executorch_weighted_op_int8_dtype_config,
         executorch_default_dynamic_int8_dtype_config,
         executorch_default_dynamic_float16_dtype_config,
@@ -84,7 +93,10 @@ def _get_conv_configs() -> List[BackendPatternConfig]:
     Return all configs related to conv modules and ops.
     """
     observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
-    dtype_configs = [executorch_weighted_op_int8_dtype_config]
+    dtype_configs = [
+        qnnpack_weighted_op_qint8_symmetric_dtype_config,
+        executorch_weighted_op_int8_dtype_config
+    ]
     conv_configs = []
     for convs in [_Conv2dMetadata]:
         # conv module
@@ -137,7 +149,10 @@ def _get_binary_ops_configs() -> List[BackendPatternConfig]:
     """
     Return all configs related to binary ops.
     """
-    dtype_configs = [executorch_weighted_op_int8_dtype_config]
+    dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        executorch_weighted_op_int8_dtype_config
+    ]
     num_tensor_args_to_observation_type_mapping = {
         # TODO: this is not used right now since we have extra check in prepare
         # will need to change this to NO_OBSERVER later after we implemented
@@ -165,7 +180,10 @@ def _get_share_qparams_ops_configs() -> List[BackendPatternConfig]:
     observer_0 - avgpool2d - observer_0 (same observer instance as input)
     """
     observation_type = ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
-    dtype_configs = [executorch_default_op_quint8_dtype_config]
+    dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        executorch_default_op_quint8_dtype_config
+    ]
     share_qparams_ops = [
         F.adaptive_avg_pool2d,
         F.relu,
@@ -192,7 +210,10 @@ def _get_bn_configs() -> List[BackendPatternConfig]:
     Return all configs related to batchnorm.
     """
     observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
-    dtype_configs = [executorch_default_op_quint8_dtype_config]
+    dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        executorch_default_op_quint8_dtype_config
+    ]
     bn_configs = []
     bn_configs.append(
         BackendPatternConfig(nn.BatchNorm2d)
@@ -201,7 +222,10 @@ def _get_bn_configs() -> List[BackendPatternConfig]:
     return bn_configs
 
 def _get_cat_configs() -> List[BackendPatternConfig]:
-    dtype_configs = [executorch_default_op_quint8_dtype_config]
+    dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        executorch_default_op_quint8_dtype_config
+    ]
     cat_configs = []
     cat_configs.append(
         BackendPatternConfig(torch.cat)

From 6bf2cf1c129fffd3dc6c0c505eff70c0aceb0049 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 7 Dec 2022 12:01:34 -0800
Subject: [PATCH 1694/1922] Add missing mypy-nofollow.ini (#90179)

I'm not sure how lintrunner worked without this lol.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90179
Approved by: https://github.com/albanD, https://github.com/voznesenskym
---
 .lintrunner.toml                     |  2 ++
 mypy-nofollow.ini                    | 34 ++++++++++++++++++++++++++++
 tools/linter/adapters/mypy_linter.py | 12 +++++++---
 torch/_dynamo/convert_frame.py       |  4 ++--
 torch/_dynamo/guards.py              | 27 +++++-----------------
 torch/_dynamo/hooks.py               |  4 ++--
 torch/_dynamo/output_graph.py        |  4 +++-
 torch/_dynamo/types.py               | 10 +++++++-
 8 files changed, 67 insertions(+), 30 deletions(-)
 create mode 100644 mypy-nofollow.ini

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 6f2c60ac6c8ed..2bb69e6dc4bc0 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -169,6 +169,7 @@ command = [
     'python3',
     'tools/linter/adapters/mypy_linter.py',
     '--config=mypy-nofollow.ini',
+    '--code=MYPYNOFOLLOW',
     '--',
     '@{{PATHSFILE}}'
 ]
@@ -198,6 +199,7 @@ command = [
     'python3',
     'tools/linter/adapters/mypy_linter.py',
     '--config=mypy-strict.ini',
+    '--code=MYPYSTRICT',
     '--',
     '@{{PATHSFILE}}'
 ]
diff --git a/mypy-nofollow.ini b/mypy-nofollow.ini
new file mode 100644
index 0000000000000..5b5358643774f
--- /dev/null
+++ b/mypy-nofollow.ini
@@ -0,0 +1,34 @@
+[mypy]
+plugins = mypy_plugins/check_mypy_version.py
+
+cache_dir = .mypy_cache/nofollow
+warn_unused_configs = True
+warn_redundant_casts = True
+show_error_codes = True
+show_column_numbers = True
+check_untyped_defs = True
+follow_imports = skip
+
+# do not reenable this:
+# https://github.com/pytorch/pytorch/pull/60006#issuecomment-866130657
+warn_unused_ignores = False
+disallow_any_generics = True
+
+files =
+    torch/_dynamo
+
+# Minimum version supported - variable annotations were introduced
+# in Python 3.7
+python_version = 3.7
+
+[mypy-sympy]
+ignore_missing_imports = True
+
+[mypy-sympy.*]
+ignore_missing_imports = True
+
+[mypy-torch._C]
+ignore_errors = True
+
+[mypy-torch._C.*]
+ignore_errors = True
diff --git a/tools/linter/adapters/mypy_linter.py b/tools/linter/adapters/mypy_linter.py
index 65ee8850e667c..cd94879fa0f93 100644
--- a/tools/linter/adapters/mypy_linter.py
+++ b/tools/linter/adapters/mypy_linter.py
@@ -87,6 +87,7 @@ def check_files(
     filenames: List[str],
     config: str,
     retries: int,
+    code: str,
 ) -> List[LintMessage]:
     try:
         proc = run_command(
@@ -100,7 +101,7 @@ def check_files(
                 path=None,
                 line=None,
                 char=None,
-                code="MYPY",
+                code=code,
                 severity=LintSeverity.ERROR,
                 name="command-failed",
                 original=None,
@@ -118,7 +119,7 @@ def check_files(
             char=int(match["column"])
             if match["column"] is not None and not match["column"].startswith("-")
             else None,
-            code="MYPY",
+            code=code,
             severity=severities.get(match["severity"], LintSeverity.ERROR),
             original=None,
             replacement=None,
@@ -143,6 +144,11 @@ def main() -> None:
         required=True,
         help="path to an mypy .ini config file",
     )
+    parser.add_argument(
+        "--code",
+        default="MYPY",
+        help="the code this lint should report as",
+    )
     parser.add_argument(
         "--verbose",
         action="store_true",
@@ -182,7 +188,7 @@ def main() -> None:
         else:
             filenames[filename] = True
 
-    lint_messages = check_files(list(filenames), args.config, args.retries)
+    lint_messages = check_files(list(filenames), args.config, args.retries, args.code)
     for lint_message in lint_messages:
         print(json.dumps(lint_message._asdict()), flush=True)
 
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index 257faf6866445..206f77b99ec9b 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -506,10 +506,10 @@ def replay(filename):
             record.globals,
             record.locals,
             record.builtins,
-            eager,
-            hooks,
+            compiler_fn=eager,
             one_graph=False,
             export=False,
+            hooks=Hooks(),
             frame=None,
         )
     except Exception:
diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py
index cb731c2359991..36d628a7ece5c 100644
--- a/torch/_dynamo/guards.py
+++ b/torch/_dynamo/guards.py
@@ -8,18 +8,7 @@
 import types
 import weakref
 from inspect import currentframe, getframeinfo
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    List,
-    NamedTuple,
-    Optional,
-    Set,
-    Tuple,
-    Type,
-    Union,
-)
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
 from weakref import ReferenceType
 
 import numpy as np
@@ -32,7 +21,7 @@
 from . import config, convert_frame, mutation_guard
 from .eval_frame import set_guard_error_hook, set_guard_fail_hook
 from .exc import unimplemented
-from .types import GuardedCode, GuardFn  # noqa: F401
+from .types import GuardedCode, GuardFail, GuardFn  # noqa: F401
 from .utils import (
     dict_const_keys,
     dict_param_key_ids,
@@ -865,13 +854,6 @@ def id_ref(self, obj):
         return id(obj)
 
 
-class GuardFail(NamedTuple):
-    # A string repr of the piece of failed guard code we eval-ed
-    reason: str
-    # A code object where we failed a guard
-    orig_code: types.CodeType
-
-
 def guard_fail_hook(
     guard_fn: GuardFn, code: types.CodeType, f_locals: Dict[str, object], last: bool
 ) -> None:
@@ -894,7 +876,10 @@ def guard_fail_hook(
             reason = part
             break
     try:
-        guard_fn.guard_fail_fn(GuardFail(reason, orig_code_map[code]))
+        if guard_fn.guard_fail_fn is not None:
+            guard_fn.guard_fail_fn(
+                GuardFail(reason or "unknown reason", orig_code_map[code])
+            )
     except Exception as e:
         log.error(
             "Failure in guard_fail_fn callback - raising here will cause a NULL Error on guard eval",
diff --git a/torch/_dynamo/hooks.py b/torch/_dynamo/hooks.py
index 37b47a75ced2f..6a3f64c9ccaf0 100644
--- a/torch/_dynamo/hooks.py
+++ b/torch/_dynamo/hooks.py
@@ -5,5 +5,5 @@
 
 @dataclasses.dataclass
 class Hooks:
-    guard_export_fn: Optional[Callable[[Set["Guard"]], None]]
-    guard_fail_fn: Optional[Callable[[Tuple["GuardFail"]], None]]
+    guard_export_fn: Optional[Callable[[Set["Guard"]], None]] = None
+    guard_fail_fn: Optional[Callable[[Tuple["GuardFail"]], None]] = None
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 7d6a4101405c6..596a3de881762 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -161,7 +161,9 @@ def __init__(
         self.compiler_fn: CompilerFn = compiler_fn
         self.root_globals = f_globals
         self.root_tx = root_tx
-        self._current_tx = []
+        from torch._dynamo.symbolic_convert import InstructionTranslatorBase
+
+        self._current_tx: List[InstructionTranslatorBase] = []
         self.cleanups: List[CleanupHook] = []
         self.should_exit = False
         self.random_values_var = None
diff --git a/torch/_dynamo/types.py b/torch/_dynamo/types.py
index 7dfcfd7b51111..67a81a765bca8 100644
--- a/torch/_dynamo/types.py
+++ b/torch/_dynamo/types.py
@@ -1,15 +1,23 @@
 import dataclasses
 import types
-from typing import Dict, List, Optional, OrderedDict, Union
+from typing import Callable, Dict, List, NamedTuple, Optional, OrderedDict, Union
 
 from typing_extensions import Protocol
 
 
+class GuardFail(NamedTuple):
+    # A string repr of the piece of failed guard code we eval-ed
+    reason: str
+    # A code object where we failed a guard
+    orig_code: types.CodeType
+
+
 class GuardFn(Protocol):
     closure_vars: OrderedDict[str, object]
     code_parts: List[str]
     verbose_code_parts: List[str]
     global_scope: Dict[str, object]
+    guard_fail_fn: Optional[Callable[[GuardFail], None]]
 
     # maps locals of user function to bool
     def __call__(self, *maybe_dotzero: object, **f_locals: object) -> bool:

From c473284747ac587d2780a50bda0326c3a6e48a45 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 7 Dec 2022 12:01:34 -0800
Subject: [PATCH 1695/1922] Type torch._dynamo.symbolic_convert (#90185)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90185
Approved by: https://github.com/voznesenskym
---
 .lintrunner.toml                  |   1 +
 torch/_dynamo/output_graph.py     |   7 +-
 torch/_dynamo/symbolic_convert.py | 111 +++++++++++++++++++++---------
 3 files changed, 87 insertions(+), 32 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 2bb69e6dc4bc0..dc4981a4a287c 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -155,6 +155,7 @@ code = 'MYPYNOFOLLOW'
 include_patterns = [
     'torch/_dynamo/eval_frame.py',
     'torch/_dynamo/convert_frame.py',
+    'torch/_dynamo/symbolic_convert.py',
     'torch/_dynamo/types.py',
     'torch/_dynamo/output_graph.py',
     'torch/_dynamo/guards.py',
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 596a3de881762..22d8a7560bd16 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -56,6 +56,11 @@ def __call__(self, *args: torch.Tensor) -> Tuple[torch.Tensor, ...]:
 CompilerFn = Callable[[fx.GraphModule, List[torch.Tensor]], CompiledFn]
 
 
+OutputGraphState = Tuple[
+    List[GraphArg], Set[Guard], Optional[Dict[str, torch.nn.Module]], SideEffects, int
+]
+
+
 @functools.lru_cache(None)
 def _step_logger():
     return torchdynamo_logging.get_step_logger(log)
@@ -199,7 +204,7 @@ def pop_tx(self):
     def current_tx(self):
         return self.root_tx if not self._current_tx else self._current_tx[-1]
 
-    def copy_graphstate(self):
+    def copy_graphstate(self) -> OutputGraphState:
         """Create a checkpoint of the current state by copying everything"""
         assert self.nn_modules is not None
         state = (
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 192437b738dad..9d319f3bdc74b 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -13,7 +13,7 @@
 import typing
 import weakref
 from collections.abc import Sized
-from typing import Any, Dict, List
+from typing import Any, Callable, Dict, List, Optional, Tuple
 from unittest.mock import patch
 
 import torch
@@ -39,7 +39,7 @@
 from .codegen import PyCodegen
 from .exc import BackendCompilerFailed, unimplemented, Unsupported
 from .guards import GuardBuilder
-from .output_graph import GraphCompileReason, OutputGraph
+from .output_graph import GraphCompileReason, OutputGraph, OutputGraphState
 from .replay_record import DummyModule, ExecutionRecorder
 from .resume_execution import ContinueExecutionCache, ReenterWith
 from .source import (
@@ -92,7 +92,7 @@ def _step_logger():
 @dataclasses.dataclass
 class BlockStackEntry:
     target: Instruction
-    stack_index: int = None
+    stack_index: Optional[int] = None
     with_context: ContextWrappingVariable = None
 
     def can_restore(self):
@@ -106,7 +106,19 @@ def exit(self, tx):
         return self.with_context.exit(tx)
 
 
-def stack_op(fn: typing.Callable):
+InstructionTranslatorGraphState = Tuple[
+    OutputGraphState,
+    Dict[str, VariableTracker],
+    List[VariableTracker],
+    List[BlockStackEntry],
+    Optional[int],
+    Instruction,
+    Optional[Instruction],
+    int,
+]
+
+
+def stack_op(fn: typing.Callable[..., object]):
     nargs = len(inspect.signature(fn).parameters)
     fn_var = BuiltinVariable(fn)
 
@@ -118,7 +130,9 @@ def impl(self: "InstructionTranslatorBase", inst: Instruction):
 
 
 def _detect_and_normalize_assert_statement(
-    self: "InstructionTranslatorBase", truth_fn: typing.Callable, push: bool
+    self: "InstructionTranslatorBase",
+    truth_fn: typing.Callable[[object], bool],
+    push: bool,
 ):
     # Detect if this jump instruction is assert and normalize the assert
     # by pushing dummy error message when nothing is given.
@@ -140,6 +154,7 @@ def _detect_and_normalize_assert_statement(
     if (truth_fn is not operator.truth) or push:
         return False
 
+    assert isinstance(self.instruction_pointer, int)
     current_instruction_pointer = self.instruction_pointer
     inst = self.instructions[current_instruction_pointer]
     # Detect LOAD_ASSERTION_ERROR or LOAD_GLOBAL 0
@@ -188,7 +203,7 @@ def _detect_and_normalize_assert_statement(
     return True
 
 
-def generic_jump(truth_fn: typing.Callable, push: bool):
+def generic_jump(truth_fn: typing.Callable[[object], bool], push: bool):
     def inner(self: "InstructionTranslatorBase", inst: Instruction):
         value: VariableTracker = self.pop()
         self.output.guards.update(value.guards)
@@ -342,8 +357,24 @@ def wrapper(self: "InstructionTranslatorBase", inst: Instruction):
 
 
 class InstructionTranslatorBase(object):
+    output: OutputGraph
+    symbolic_locals: Dict[str, VariableTracker]
+    symbolic_globals: Dict[str, VariableTracker]
+    stack: List[VariableTracker]
+    instruction_pointer: Optional[int]
+    current_instruction: Instruction
+    next_instruction: Optional[Instruction]
+    block_stack: List[BlockStackEntry]
+    lineno: int
+
+    checkpoint: Optional[Tuple[Instruction, InstructionTranslatorGraphState]]
+    random_calls: List[
+        Tuple[Callable[..., object], Tuple[object, ...], Dict[str, object]]
+    ]
+
     def has_backedge(self):
         cur_offset = self.current_instruction.offset
+        assert self.instruction_pointer is not None
         for inst in self.instructions[self.instruction_pointer :]:
             if inst.opname in (
                 "JUMP_ABSOLUTE",
@@ -397,7 +428,7 @@ def repl(v: VariableTracker):
         def skip(v: VariableTracker):
             return oldvar.mutable_local not in v.recursively_contains
 
-        cache = dict()
+        cache: Dict[int, Tuple[object, object]] = dict()
         self.output.side_effects.apply(repl, cache, skip_fn=skip)
         self.stack = [
             VariableTracker.apply(repl, x, cache, skip_fn=skip) for x in self.stack
@@ -431,6 +462,7 @@ def inline_user_function_return(self, fn, args, kwargs):
 
     def step(self):
         """Process exactly one instruction, return False we should exit"""
+        assert isinstance(self.instruction_pointer, int)
         inst = self.instructions[self.instruction_pointer]
         self.current_instruction = inst
         self.instruction_pointer += 1
@@ -463,11 +495,12 @@ def step(self):
         except Exception as exc:
             real_stack = getattr(exc, "real_stack", [])
             real_stack.append(self.frame_summary())
-            exc.real_stack = real_stack
+            exc.real_stack = real_stack  # type: ignore[attr-defined]
             raise
 
         # generate code from checkpoint
         assert not self.output.output_instructions
+        assert self.checkpoint is not None
         continue_inst, state = self.checkpoint
         self.restore_graphstate(state)
         self.output.compile_subgraph(self, partial_convert=True)
@@ -489,7 +522,7 @@ def run(self):
             raise
         except Exception as e:
             if config.replay_record_enabled:
-                e.exec_record = self.exec_recorder.get_record()
+                e.exec_record = self.exec_recorder.get_record()  # type: ignore[attr-defined]
             raise
         finally:
             self.output.pop_tx()
@@ -501,20 +534,20 @@ def run(self):
             if isinstance(self, InstructionTranslator):
                 self.output.cleanup()
 
-    def push(self, val):
+    def push(self, val: Optional[VariableTracker]):
         assert val is None or isinstance(
             val, VariableTracker
         ), f"push expects VariableTracker, got {typestr(val)}"
         self.stack.append(val)
 
-    def push_many(self, vals: List[TensorVariable]):
+    def push_many(self, vals: List[VariableTracker]):
         for val in vals:
             self.push(val)
 
-    def pop(self) -> TensorVariable:
+    def pop(self) -> VariableTracker:
         return self.stack.pop()
 
-    def popn(self, n: int) -> List[TensorVariable]:
+    def popn(self, n: int) -> List[VariableTracker]:
         assert n >= 0
         return list(reversed([self.pop() for _ in range(n)]))
 
@@ -643,7 +676,7 @@ def calc_package(self):
                     f"({package!r} != {spec.parent!r})",
                     ImportWarning,
                     stacklevel=3,
-                )
+                )  # type: ignore[call-arg]
             return package
         elif spec is not None:
             return spec.parent
@@ -653,7 +686,7 @@ def calc_package(self):
                 "falling back on __name__ and __path__",
                 ImportWarning,
                 stacklevel=3,
-            )
+            )  # type: ignore[call-arg]
             package = self.f_globals["__name__"]
             if "__path__" not in self.f_globals:
                 package = package.rpartition(".")[0]
@@ -956,8 +989,8 @@ def CALL_FUNCTION_KW(self, inst):
         fn = self.pop()
         assert isinstance(argnames, ConstantVariable)
         argnames = argnames.value
-        args, kwargs = args[: -len(argnames)], args[-len(argnames) :]
-        kwargs = dict(zip(argnames, kwargs))
+        args, kwargs_list = args[: -len(argnames)], args[-len(argnames) :]
+        kwargs = dict(zip(argnames, kwargs_list))
         assert len(kwargs) == len(argnames)
         self.call_function(fn, args, kwargs)
 
@@ -1015,6 +1048,16 @@ def STORE_ATTR(self, inst):
             self.create_call_resume_at(self.next_instruction)
         )
 
+    def create_call_resume_at(self, offset):
+        raise AssertionError(
+            f"create_call_resume_at not overridden by subclass {type(self)}"
+        )
+
+    def should_compile_partial_graph(self) -> bool:
+        raise AssertionError(
+            f"should_compile_partial_graph not overridden by subclass {type(self)}"
+        )
+
     @break_graph_if_unsupported(push=0)
     def STORE_SUBSCR(self, inst):
         val, obj, key = self.popn(3)
@@ -1396,7 +1439,7 @@ def MATCH_KEYS(self, inst):
     INPLACE_XOR = stack_op(operator.ixor)
     INPLACE_OR = stack_op(operator.ior)
 
-    def copy_graphstate(self):
+    def copy_graphstate(self) -> InstructionTranslatorGraphState:
         """Create a checkpoint of the current state by copying everything"""
         return (
             self.output.copy_graphstate(),
@@ -1482,18 +1525,18 @@ def __init__(
         f_code: types.CodeType,
         export: bool,
     ):
-        super(InstructionTranslatorBase, self).__init__()
+        super().__init__()
 
         # Mutable state checkpointed by copy_graphstate()
-        self.output: OutputGraph = output
-        self.symbolic_locals: Dict[str, VariableTracker] = symbolic_locals
-        self.symbolic_globals: Dict[str, VariableTracker] = symbolic_globals
-        self.stack: List[VariableTracker] = []
-        self.instruction_pointer: int = 0
-        self.current_instruction: Instruction = create_instruction("NOP")
-        self.next_instruction: typing.Optional[Instruction] = None
-        self.block_stack: List[BlockStackEntry] = []
-        self.lineno: int = code_options["co_firstlineno"]
+        self.output = output
+        self.symbolic_locals = symbolic_locals
+        self.symbolic_globals = symbolic_globals
+        self.stack = []
+        self.instruction_pointer = 0
+        self.current_instruction = create_instruction("NOP")
+        self.next_instruction = None
+        self.block_stack = []
+        self.lineno = code_options["co_firstlineno"]
 
         # Properties of the input/output code
         self.instructions: List[Instruction] = instructions
@@ -1519,7 +1562,7 @@ def __init__(
         )
 
         self.checkpoint = None
-        self.random_calls: List[tuple] = []
+        self.random_calls = []
 
         if sys.version_info >= (3, 10):
             from .resume_execution import (
@@ -1687,6 +1730,8 @@ def RETURN_VALUE(self, inst):
 class InliningInstructionTranslator(InstructionTranslatorBase):
     """Trace and inline a called method"""
 
+    symbolic_result: Optional[TensorVariable]
+
     @classmethod
     def inline_call(cls, parent, func, args, kwargs):
         with patch.dict(counters, {"unimplemented": counters["inline_call"]}):
@@ -1735,6 +1780,7 @@ def inline_call_(parent, func, args, kwargs):
 
         log.debug(f"INLINING {code} \n {dis.Bytecode(code).dis()} \n")
 
+        tracer: InliningInstructionTranslator
         if is_generator(code):
             tracer = InliningGeneratorInstructionTranslator(
                 parent, code, sub_locals, parent.symbolic_globals, closure_cells, func
@@ -1755,6 +1801,7 @@ def inline_call_(parent, func, args, kwargs):
         log.debug(f"DONE INLINING {code}")
 
         if is_generator(code):
+            assert isinstance(tracer, InliningGeneratorInstructionTranslator)
             assert tracer.symbolic_result.as_python_constant() is None
             return ListIteratorVariable(
                 tracer.generated_items,
@@ -1838,9 +1885,9 @@ def LOAD_CLOSURE(self, inst):
     def replace_all(self, oldvar: VariableTracker, newvar: VariableTracker):
         newvar = super().replace_all(oldvar, newvar)
         # recursively check and update parent's locals and stack in case oldvar is from parent
-        translator = self
+        translator: InstructionTranslatorBase = self
         while hasattr(translator, "parent"):
-            translator = translator.parent
+            translator = translator.parent  # type: ignore[attr-defined]
             translator.update_locals_and_stack(oldvar, newvar)
         return newvar
 
@@ -1856,6 +1903,8 @@ def RETURN_VALUE(self, inst):
 
 
 class InliningGeneratorInstructionTranslator(InliningInstructionTranslator):
+    generated_items: List[VariableTracker]
+
     def __init__(self, *args, **kwargs):
         super(InliningGeneratorInstructionTranslator, self).__init__(*args, **kwargs)
         self.generated_items = []

From 7fc064e49932277009f6738a213d38b73aa31f4e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 7 Dec 2022 12:01:35 -0800
Subject: [PATCH 1696/1922] Convert InstructionTranslatorGraphState and
 OutputGraphState to NamedTuple (#90186)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90186
Approved by: https://github.com/voznesenskym
---
 torch/_dynamo/output_graph.py     | 26 ++++++++++++++++++++------
 torch/_dynamo/symbolic_convert.py | 25 ++++++++++++-------------
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 22d8a7560bd16..37bc9758ee929 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -7,7 +7,18 @@
 import re
 import traceback
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, OrderedDict, Set, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    NamedTuple,
+    Optional,
+    OrderedDict,
+    Set,
+    Tuple,
+    Union,
+)
 
 import sympy
 from typing_extensions import Protocol
@@ -56,9 +67,12 @@ def __call__(self, *args: torch.Tensor) -> Tuple[torch.Tensor, ...]:
 CompilerFn = Callable[[fx.GraphModule, List[torch.Tensor]], CompiledFn]
 
 
-OutputGraphState = Tuple[
-    List[GraphArg], Set[Guard], Optional[Dict[str, torch.nn.Module]], SideEffects, int
-]
+class OutputGraphState(NamedTuple):
+    graphargs: List[GraphArg]
+    guards: Set[Guard]
+    nn_modules: Optional[Dict[str, torch.nn.Module]]
+    side_effects: SideEffects
+    timestamp: int
 
 
 @functools.lru_cache(None)
@@ -207,7 +221,7 @@ def current_tx(self):
     def copy_graphstate(self) -> OutputGraphState:
         """Create a checkpoint of the current state by copying everything"""
         assert self.nn_modules is not None
-        state = (
+        state = OutputGraphState(
             list(self.graphargs),
             set(self.guards),
             dict(self.nn_modules),
@@ -217,7 +231,7 @@ def copy_graphstate(self) -> OutputGraphState:
         self.timestamp += 1
         return state
 
-    def restore_graphstate(self, state):
+    def restore_graphstate(self, state: OutputGraphState):
         """Restore a checkpoint created by self.copy_graphstate()"""
         (
             self.graphargs,
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 9d319f3bdc74b..4622823921418 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -13,7 +13,7 @@
 import typing
 import weakref
 from collections.abc import Sized
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
 from unittest.mock import patch
 
 import torch
@@ -106,16 +106,15 @@ def exit(self, tx):
         return self.with_context.exit(tx)
 
 
-InstructionTranslatorGraphState = Tuple[
-    OutputGraphState,
-    Dict[str, VariableTracker],
-    List[VariableTracker],
-    List[BlockStackEntry],
-    Optional[int],
-    Instruction,
-    Optional[Instruction],
-    int,
-]
+class InstructionTranslatorGraphState(NamedTuple):
+    output: OutputGraphState
+    symbolic_locals: Dict[str, VariableTracker]
+    stack: List[VariableTracker]
+    block_stack: List[BlockStackEntry]
+    instruction_pointer: Optional[int]
+    current_instruction: Instruction
+    next_instruction: Optional[Instruction]
+    lineno: int
 
 
 def stack_op(fn: typing.Callable[..., object]):
@@ -1441,7 +1440,7 @@ def MATCH_KEYS(self, inst):
 
     def copy_graphstate(self) -> InstructionTranslatorGraphState:
         """Create a checkpoint of the current state by copying everything"""
-        return (
+        return InstructionTranslatorGraphState(
             self.output.copy_graphstate(),
             collections.OrderedDict(self.symbolic_locals),
             list(self.stack),
@@ -1452,7 +1451,7 @@ def copy_graphstate(self) -> InstructionTranslatorGraphState:
             self.lineno,
         )
 
-    def restore_graphstate(self, state):
+    def restore_graphstate(self, state: InstructionTranslatorGraphState):
         """Restore a checkpoint created by self.copy_graphstate()"""
         (
             output_state,

From 064724edcab748bb11038505e3060c6129bed233 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 7 Dec 2022 12:01:36 -0800
Subject: [PATCH 1697/1922] Type torch._dynamo.side_effects (#90202)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90202
Approved by: https://github.com/voznesenskym
---
 .lintrunner.toml              |  1 +
 torch/_dynamo/side_effects.py | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index dc4981a4a287c..10756f22a8e75 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -159,6 +159,7 @@ include_patterns = [
     'torch/_dynamo/types.py',
     'torch/_dynamo/output_graph.py',
     'torch/_dynamo/guards.py',
+    'torch/_dynamo/side_effects.py',
     'torch/_dynamo/optimizations/__init__.py',
     'torch/_dynamo/optimizations/backends.py',
     'torch/_dynamo/optimizations/training.py',
diff --git a/torch/_dynamo/side_effects.py b/torch/_dynamo/side_effects.py
index 55e6e9f927e8d..840900c6295ed 100644
--- a/torch/_dynamo/side_effects.py
+++ b/torch/_dynamo/side_effects.py
@@ -1,7 +1,7 @@
 import collections
 import dataclasses
 import inspect
-from typing import Any
+from typing import Any, Dict, List
 
 import torch.nn
 
@@ -59,14 +59,18 @@ def __eq__(self, other):
         return self is other
 
 
-class SideEffects(object):
+class SideEffects:
     """
     Track side effects (list mutation, setattr, etc) that need to be
     applied after an FX graph is run.
     """
 
+    id_to_variable: Dict[int, VariableTracker]
+    store_attr_mutations: Dict[AttributeMutation, Dict[str, VariableTracker]]
+    keepalive: List[Any]
+
     def __init__(self, id_to_variable=None, store_attr_mutations=None, keepalive=None):
-        super(SideEffects, self).__init__()
+        super().__init__()
         self.id_to_variable = id_to_variable or collections.OrderedDict()
         self.store_attr_mutations = store_attr_mutations or collections.OrderedDict()
         self.keepalive = keepalive or []

From cb15e39da81b42e158a0fc708b079170cc3dc1c5 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 7 Dec 2022 12:04:41 -0800
Subject: [PATCH 1698/1922] Rewrite dynamo cond() handling to not recursively
 call export (#90286)

The original implementation of cond() operator support in dynamo operated by recursively calling export() on the inner subgraph.  This is problematic for a number of reasons:

* My original motivating reason: the original implementation had to play tricks to feed real tensors to the recursive export call, which means that it doesn't work well with tracing with dynamic shapes (where we MUST stay in fake tensors to accurately track dynamic shapes across the cond invocation)
* If there are pending side effects, the recursive export() call won't see those side effects (as they are only tracked by Dynamo, not actually applied to the Python environment.) You can see an example where dynamo cond tracing does the wrong thing at https://github.com/pytorch/pytorch/pull/90208
* If there were side effects inside the true/false branch, these side effects were silently lost (as the export only returns the graph of tensor operations, and not any of the residual Python bytecodes necessary to reapply any side effects.) This could have substantive effects on the export of subsequent parts of the model, as those parts of the models could rely on the side effects.
* It was not possible to track NN module accesses inside the true/false branches, necessitating a hack where the NN module was explicitly passed in as an input to cond https://github.com/pytorch/pytorch/pull/87020#issuecomment-1338842844 which doesn't really make any sense from a backend compilation perspective
* Guards induced from the inside of the true/false branch were not properly propagated to the top level guards; they were just silently dropped (in fact, the original implementation checked that the true/false branch produce the same guards which... is not useful? Like, I don't think that actually is even necessary for correctness)

This PR replaces the old implementation with a new implementation based on graphstate checkpointing. The basic idea is to process a cond(), we checkpoint the state of our interpreter, run the true branch, rollback to our checkpoint, run the false branch, rollback to our checkpoint and then merge the changes from both of the checkpoints. I require the true/false branches to have exactly the same side effects, but union their guards.

Some of the details:

* Dynamo is too aggressive with tracking side effects when processing closures, c.f. https://github.com/pytorch/torchdynamo/pull/233/files#r1040480078 The basic problem is whenever I define a closure, this immediately counts as a side effect, even if I didn't actually mutate anything. This triggered on the nested cond export example. To prevent this from happening, I optimistically avoid tracking side effects, but if a STORE_DEREF happens, I restart analysis with the relevant Source.name() added to `mutated_closure_cell_contents` so we start tracking on closure allocation. This is enough to fix the relevant test.
* For the most part, I assert that the graph states must be equivalent after applying the true/false branches. During debugging, I found it useful to be able to compare two graph states and give a better description about what the divergence was. You can test this using the `diff()` method I've added to a few structures.
* The implementation now supports NestedUserFunctionVariable, which is nice as it allows the true/false branches to be defined closer to the cond implementation.
* I fixed the naming of the true/false subgraphs; previously they were named `name_0`, `name_1`, now they are named `cond_true_0` and `cond_false_0`
* I added `name_to_input` to the saved graph state. I don't actually know if this is necessary, but it seemed like a good idea.
* I have to play some tricks to get the speculating execution of the true/false branch to record into a subgraph. After a careful read of OutputGraph, I found that what would work is overriding graph with a fresh Graph that we want to write things into, and manually setting up the inputs/outputs. It's a little delicate as you have to make sure you reset the Graph to its original before you restore a checkpoint, as checkpoints don't actually save graph for efficiency, and just undo changes on the graph. This capability may usefully get refactored to OutputGraph but I didn't do it in this PR for simplicity.

There are some further problems with the cond() implementation that I leave for future work. Most of these were preexisting with the original implementation.

* Not a problem per se, but if an NN module is used by both the true/false branch, it will show up in the final graph twice (since it has to be a submodule of the GraphModule that makes use of it.) I hope the export pipeline can deal with this.
* List of tensor output for cond is not supported.
* The true/false return values may not have consistent sizes/dims/etc, and we don't check them for consistency.
* If we modify fake tensors in the true/false branches, we aren't rolling them back, c.f. https://github.com/pytorch/torchdynamo/issues/1840

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90286
Approved by: https://github.com/voznesenskym
---
 test/dynamo/test_export.py           |  14 +-
 torch/_dynamo/convert_frame.py       |   5 +-
 torch/_dynamo/output_graph.py        |  17 ++
 torch/_dynamo/side_effects.py        |  28 +++-
 torch/_dynamo/symbolic_convert.py    |  33 +++-
 torch/_dynamo/variables/functions.py |  26 ++-
 torch/_dynamo/variables/torch.py     | 230 ++++++++++++++++-----------
 7 files changed, 246 insertions(+), 107 deletions(-)

diff --git a/test/dynamo/test_export.py b/test/dynamo/test_export.py
index 7779c479b6d41..b0640f651194d 100644
--- a/test/dynamo/test_export.py
+++ b/test/dynamo/test_export.py
@@ -1437,19 +1437,19 @@ def nop(x):
     def test_export_with_module_layer(self):
         from functorch.experimental.control_flow import cond
 
-        def true_fn(layer, val):
-            return layer(val) * torch.tensor(2)
-
-        def false_fn(layer, val):
-            return layer(val) * torch.tensor(-1)
-
         class Module(torch.nn.Module):
             def __init__(self):
                 super().__init__()
                 self.linear = torch.nn.Linear(3, 3)
 
             def forward(self, pred, x):
-                return cond(pred, true_fn, false_fn, [self.linear, x])
+                def true_fn(val):
+                    return self.linear(val) * torch.tensor(2)
+
+                def false_fn(val):
+                    return self.linear(val) * torch.tensor(-1)
+
+                return cond(pred, true_fn, false_fn, [x])
 
         mod = Module()
         x = torch.randn([3, 3])
diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py
index 206f77b99ec9b..a2105cd10743a 100644
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@@ -6,7 +6,7 @@
 import types
 import weakref
 from traceback import FrameSummary
-from typing import cast, Dict, List, Optional
+from typing import cast, Dict, List, Optional, Set
 
 import torch
 from torch.fx.graph_module import _forward_from_src as original_forward_from_src
@@ -364,6 +364,8 @@ def _compile(
     frame: Optional[types.FrameType] = None,
 ) -> Optional[GuardedCode]:
     output: Optional[OutputGraph] = None
+    # This is shared across restarts
+    mutated_closure_cell_contents: Set[str] = set()
 
     # from .utils import print_once;  print_once(code.co_filename)
     def transform(instructions, code_options):
@@ -378,6 +380,7 @@ def transform(instructions, code_options):
             compiler_fn,
             one_graph,
             export,
+            mutated_closure_cell_contents,
         )
         tracer.run()
         output = tracer.output
diff --git a/torch/_dynamo/output_graph.py b/torch/_dynamo/output_graph.py
index 37bc9758ee929..7c3a1782b0f86 100644
--- a/torch/_dynamo/output_graph.py
+++ b/torch/_dynamo/output_graph.py
@@ -73,6 +73,21 @@ class OutputGraphState(NamedTuple):
     nn_modules: Optional[Dict[str, torch.nn.Module]]
     side_effects: SideEffects
     timestamp: int
+    name_to_input: OrderedDict[str, Optional[fx.Proxy]]
+
+    def diff(self, other: "OutputGraphState", *, prefix: str = "") -> Optional[str]:
+        for k in self._fields:
+            if k == "side_effects":
+                r = self.side_effects.diff(other.side_effects)
+                if r is not None:
+                    return r
+                continue
+
+            sv = getattr(self, k)
+            ov = getattr(other, k)
+            if sv != ov:
+                return f"{prefix}{k} mismatch: {sv} != {ov}"
+        return None
 
 
 @functools.lru_cache(None)
@@ -227,6 +242,7 @@ def copy_graphstate(self) -> OutputGraphState:
             dict(self.nn_modules),
             self.side_effects.clone(),
             self.timestamp,
+            self.name_to_input.copy(),
         )
         self.timestamp += 1
         return state
@@ -239,6 +255,7 @@ def restore_graphstate(self, state: OutputGraphState):
             self.nn_modules,
             self.side_effects,
             self.timestamp,
+            self.name_to_input,
         ) = state
         # FX deepcopy doesn't work for a partially created graph, so just remove new nodes
         for node in reversed(list(self.graph.nodes)):
diff --git a/torch/_dynamo/side_effects.py b/torch/_dynamo/side_effects.py
index 840900c6295ed..46c5cd115e052 100644
--- a/torch/_dynamo/side_effects.py
+++ b/torch/_dynamo/side_effects.py
@@ -1,7 +1,7 @@
 import collections
 import dataclasses
 import inspect
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import torch.nn
 
@@ -75,6 +75,32 @@ def __init__(self, id_to_variable=None, store_attr_mutations=None, keepalive=Non
         self.store_attr_mutations = store_attr_mutations or collections.OrderedDict()
         self.keepalive = keepalive or []
 
+    def __eq__(self, other: object) -> bool:
+        assert isinstance(other, SideEffects)
+        # NB: do NOT test keepalive
+        return (
+            self.id_to_variable == other.id_to_variable
+            and self.store_attr_mutations == other.store_attr_mutations
+        )
+
+    def diff(self, other: "SideEffects") -> Optional[str]:
+        if self.id_to_variable != other.id_to_variable:
+            sk_itv = self.id_to_variable.keys()
+            ok_itv = other.id_to_variable.keys()
+            if sk_itv != ok_itv:
+                return f"id_to_variable keys: {sk_itv} != {ok_itv}"
+            # Feel free to augment this with more fancy diffing logic
+            # if needed for debugging
+            return "id_to_variable: unknown diff"
+        elif self.store_attr_mutations != other.store_attr_mutations:
+            sk_sam = self.store_attr_mutations.keys()
+            ok_sam = other.store_attr_mutations.keys()
+            if sk_sam != ok_sam:
+                return f"store_attr_mutations keys: {sk_sam} != {ok_sam}"
+            return "store_attr_mutations: unknown diff"
+        else:
+            return None
+
     def clone(self):
         """Create a shallow copy"""
         return self.__class__(
diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py
index 4622823921418..2064e12497a38 100644
--- a/torch/_dynamo/symbolic_convert.py
+++ b/torch/_dynamo/symbolic_convert.py
@@ -13,7 +13,7 @@
 import typing
 import weakref
 from collections.abc import Sized
-from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple
 from unittest.mock import patch
 
 import torch
@@ -116,6 +116,16 @@ class InstructionTranslatorGraphState(NamedTuple):
     next_instruction: Optional[Instruction]
     lineno: int
 
+    def diff(self, other: "InstructionTranslatorGraphState") -> Optional[str]:
+        for k in self._fields:
+            if k == "output":
+                return self.output.diff(other.output, prefix=f"{k}.")
+            sv = getattr(self, k)
+            ov = getattr(other, k)
+            if sv != ov:
+                return f"{k} mismatch: {sv} != {ov}"
+        return None
+
 
 def stack_op(fn: typing.Callable[..., object]):
     nargs = len(inspect.signature(fn).parameters)
@@ -365,6 +375,7 @@ class InstructionTranslatorBase(object):
     next_instruction: Optional[Instruction]
     block_stack: List[BlockStackEntry]
     lineno: int
+    mutated_closure_cell_contents: Set[str]
 
     checkpoint: Optional[Tuple[Instruction, InstructionTranslatorGraphState]]
     random_calls: List[
@@ -1589,6 +1600,7 @@ def __init__(
         compiler_fn,
         one_graph,
         export,
+        mutated_closure_cell_contents: Set[str],
     ):
         super(InstructionTranslator, self).__init__(
             output=OutputGraph(f_globals, code_options, compiler_fn, self),
@@ -1605,6 +1617,7 @@ def __init__(
         )
         self.one_graph: bool = one_graph
         self.export = export
+        self.mutated_closure_cell_contents = mutated_closure_cell_contents
         if self.export:
             assert (
                 self.one_graph
@@ -1853,14 +1866,30 @@ def STORE_DEREF(self, inst):
             else:
                 self.output.side_effects.store_cell(cell, val)
         else:
+            maybe_cell = self.symbolic_locals.get(inst.argval)
             if isinstance(
-                self.symbolic_locals.get(inst.argval),
+                maybe_cell,
                 variables.NewCellVariable,
             ):
                 self.output.side_effects.store_cell(
                     self.symbolic_locals[inst.argval], self.pop()
                 )
             else:
+                if (
+                    maybe_cell is not None
+                    and maybe_cell.source.name()
+                    not in self.parent.mutated_closure_cell_contents
+                ):
+                    # Why is the source name here unique?
+                    # mutated_closure_cell_contents is a per-frame
+                    # concept, and sources identify, e.g., particular
+                    # locals from the frame.  If you had two locals,
+                    # they'll get different source names, and therefore
+                    # differ here.
+                    self.parent.mutated_closure_cell_contents.add(
+                        maybe_cell.source.name()
+                    )
+                    raise exc.RestartAnalysis()
                 unimplemented("write to __closure__ while inlining")
 
     def LOAD_DEREF(self, inst):
diff --git a/torch/_dynamo/variables/functions.py b/torch/_dynamo/variables/functions.py
index a8bb8bd84c79e..b0259731772e4 100644
--- a/torch/_dynamo/variables/functions.py
+++ b/torch/_dynamo/variables/functions.py
@@ -119,6 +119,8 @@ def bind_args(self, parent, args, kwargs):
         options = VariableTracker.propagate([self])
         wrap = functools.partial(wrap_bound_arg, options=options)
 
+        tx = parent.output.root_tx
+
         fn: types.FunctionType = self.fn
         fake_func = types.FunctionType(
             fn.__code__,
@@ -146,7 +148,7 @@ def bind_args(self, parent, args, kwargs):
             if name == "__class__":
                 result[name] = variables.UserDefinedClassVariable(cell.cell_contents)
             else:
-                var = parent.output.root_tx.match_nested_cell(name, cell)
+                var = tx.match_nested_cell(name, cell)
                 if var is not None:
                     # optimization for cleaner codegen
                     result[name] = var
@@ -163,15 +165,31 @@ def bind_args(self, parent, args, kwargs):
                         closure_cell_contents = AttrSource(
                             closure_cell, "cell_contents"
                         )
+                        contents_var = VariableBuilder(parent, closure_cell_contents)(
+                            cell.cell_contents
+                        )
+
+                        if (
+                            closure_cell_contents.name()
+                            not in tx.mutated_closure_cell_contents
+                        ):
+                            # Optimistically don't allocate the cell, to
+                            # reduce the number of side effects.  This is
+                            # important for cond, as without it, any accesses
+                            # to closures create side effects and cond doesn't
+                            # support side effects.  If we're wrong and this
+                            # closure cell gets written to, we will restart
+                            # the analysis with this cell's name in the
+                            # mutated list here
+                            result[name] = contents_var
+                            continue
 
                         # cells are written to with "cell_contents",
                         # so the source should just be the closure_cell, not its contents
                         out = side_effects.track_cell_existing(closure_cell, cell)
                         side_effects.store_cell(
                             out,
-                            VariableBuilder(parent, closure_cell_contents)(
-                                cell.cell_contents
-                            ),
+                            contents_var,
                         )
 
                     result[name] = out
diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py
index 35ca0d7a48340..31ad83cb648a3 100644
--- a/torch/_dynamo/variables/torch.py
+++ b/torch/_dynamo/variables/torch.py
@@ -3,11 +3,13 @@
 import math
 import re
 import types
+from collections import OrderedDict
 from typing import Dict, List
 
 import numpy
 
 import torch._C
+import torch.fx
 import torch.nn
 import torch.onnx.operators
 
@@ -27,7 +29,6 @@
 from .base import VariableTracker
 from .lists import ListVariable, TupleVariable
 from .misc import AutocastModeVariable, NullContextVariable
-from .nn_module import NNModuleVariable
 from .tensor import TensorWithTFOverrideVariable
 
 log = logging.getLogger(__name__)
@@ -637,72 +638,30 @@ def __init__(self, value, **kwargs):
     def call_function(
         self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]"
     ) -> "VariableTracker":
-        from . import ListVariable, TensorVariable, UserFunctionVariable
+        from . import (
+            ListVariable,
+            NestedUserFunctionVariable,
+            TensorVariable,
+            UserFunctionVariable,
+        )
         from .builder import wrap_fx_proxy
 
         assert kwargs is None or len(kwargs) == 0, "kwargs are not supported, yet"
 
-        def unwrap_real(arg):
-            if isinstance(arg, TensorVariable):
-                return arg.get_real_value()
-            if isinstance(arg, UserFunctionVariable):
-                return arg.fn
-            if isinstance(arg, NNModuleVariable):
-                return tx.output.get_submodule(arg.module_key)
-            if arg.has_unpack_var_sequence(tx):
-                return [
-                    unwrap_real(arg_inner) for arg_inner in arg.unpack_var_sequence(tx)
-                ]
-            return arg
-
-        def make_attr(name, proxy_args=None):
+        def make_attr(name):
             node = tx.output.create_proxy(
                 "get_attr",
                 name,
-                tuple(proxy_args) if proxy_args else tuple(),
+                (),
                 {},
             )
             return node
 
-        # Get values
-        u_args = [unwrap_real(arg) for arg in args]
-
-        def unwrap_proxy(arg):
-            try:
-                if isinstance(arg, TensorVariable):
-                    return arg.as_proxy()
-                if isinstance(arg, NNModuleVariable):
-                    name = arg.module_key
-                    mod = unwrap_real(arg)
-                    options = VariableTracker.propagate(self, args, kwargs.values())
-                    tx.output.register_attr_or_module(
-                        mod,
-                        name,
-                        name,
-                        source=NNModuleSource(
-                            GetItemSource(self.source, arg.module_key)
-                        ),
-                        **options,
-                    )
-                    return make_attr(name)
-                if arg.has_unpack_var_sequence(tx):
-                    return [
-                        unwrap_proxy(arg_inner)
-                        for arg_inner in arg.unpack_var_sequence(tx)
-                    ]
-                return arg.as_proxy()
-            except NotImplementedError:
-                return arg
-
-        def register_as_subgraph(fn, name, args):
-            from .. import export
-
-            gm, guards = export(fn, *args)
-
+        def add_subgraph(name, gm):
             next_name = None
             i = 0
             while not next_name:
-                candidate = f"name_{i}"
+                candidate = f"cond_{name}_{i}"
                 if candidate in tx.output.nn_modules:
                     i += 1
                 else:
@@ -712,56 +671,143 @@ def register_as_subgraph(fn, name, args):
             src = NNModuleSource(GetItemSource(self.source, next_name))
             gm.torchdynamo_force_dynamic = False
             tx.output.register_attr_or_module(gm, next_name, source=src)
-            return next_name, gm, guards
+            return next_name
 
-        # Get args as proxies
-        p_args = [unwrap_proxy(arg) for arg in args]
         if self.value.__name__ == "cond":
             # TODO(voz): Support fake tensor dispatch for recursive
             # ops - see torch/dispatch/_dispatcher.py
-            from .. import config
 
-            # The current recursive export() implementation will
-            # not "see" any side effect updates from the enclosing
-            # context, which can result in possibly incorrect
-            # export.  This assert ensures that there were no
-            # outstanding side effects at the time cond() was called.
+            assert len(args) == 4
+            assert type(args[0]) is TensorVariable, str(type(args[0]))  # predicate
+            assert isinstance(
+                args[1], (UserFunctionVariable, NestedUserFunctionVariable)
+            ), str(
+                type(args[1])
+            )  # true_fn
+            assert isinstance(
+                args[2], (UserFunctionVariable, NestedUserFunctionVariable)
+            ), str(
+                type(args[2])
+            )  # false_fn
+            assert type(args[3]) is ListVariable, str(type(args[3]))  # args
+
+            # Our strategy for tracing the true/false branches of cond
+            # are to checkpoint our graphstate, run the true branch,
+            # roll it back to the checkpoint, and run the false
+            # branch, and then merge the graphstates.  Well, perhaps
+            # "merge" is too strong a word: we mostly assert that
+            # the resulting graphstates have to be the same.
             #
-            # TODO: This assert may be too aggressive; I'm landing it
-            # to see if it is or not.
-            assert tx.output.side_effects.is_empty(), (
-                "Handling a cond operator when there are outstanding "
-                "side effects in the trace is not currently supported.  "
-                "Please file a bug to PyTorch requesting this functionality.  "
-                "You may be able to unblock by removing side effects (e.g., "
-                "mutating Python variables/data structures/etc) from your "
-                "model."
-            )
+            # We only permit guards to diverge (we union the guards from
+            # both branches).  In particular, this means that side
+            # effects are NOT permitted inside true/false branches; this
+            # would be difficult to implement, because of the path
+            # explosion problem.
+
+            graph_checkpoint, checkpoint = tx.output.graph, tx.copy_graphstate()
+
+            sub_args = args[3].unpack_var_sequence(tx)
+
+            def speculate_branch(branch):
+                # Setup the subgraph we're going to capture into
+                tx.output.graph = torch.fx.Graph()
+                tx.output.graphargs = []
+                tx.output.name_to_input.clear()
+
+                # One argument to graph per sub_args
+                for a in sub_args:
+                    assert isinstance(a, TensorVariable)
+                    tx.output.create_graph_input(a.as_proxy().node.name)
+                    # NB: we don't bother populating graphargs, as
+                    # they won't actually get used by anything
+
+                # NB: 0 is predicate
+                ix = 1 if branch else 2
+
+                output = args[ix].call_function(tx, sub_args, {})
+
+                # Register output to graph
+                # Modeled off of compile_and_call_fx_graph
+                # TODO: support non single Tensor output
+                assert isinstance(output, TensorVariable)
+                tx.output.guards.update(output.guards)
+                tx.output.create_node(
+                    "output", "output", (tx.output.create_arg((output.as_proxy(),))), {}
+                )
 
-            assert len(p_args) == 4
-            assert type(args[0]) is TensorVariable  # predicate
-            assert type(p_args[1]) is UserFunctionVariable  # true_fn
-            assert type(p_args[2]) is UserFunctionVariable  # false_fn
-            assert type(args[3]) is ListVariable  # args
+                tx.output.side_effects.prune_dead_object_new(tx)
+                state = tx.copy_graphstate()
+
+                guards = state.output.guards
+                nn_modules = state.output.nn_modules
+
+                # Nub out bits of state that we don't require to be
+                # equal
+                comparable_state = state._replace(
+                    output=state.output._replace(
+                        guards=set(),
+                        nn_modules=None,
+                        # Timestamp is monotonically increasing so we don't
+                        # care about divergence
+                        timestamp=0,
+                        # Meh (problem is the nodes don't compare equal;
+                        # maybe nub out outputs only)
+                        name_to_input=OrderedDict(),
+                    )
+                )
 
-            node_args = [unwrap_real(x) for x in args[3].unpack_var_sequence(tx)]
-            proxy_args = [unwrap_proxy(x) for x in args[3].unpack_var_sequence(tx)]
-            true_name, true_graph, true_guards = register_as_subgraph(
-                p_args[1].get_function(), "true", node_args
+                graph = tx.output.graph
+                tx.output.graph = graph_checkpoint
+                tx.restore_graphstate(checkpoint)
+
+                return output, graph, guards, nn_modules, comparable_state
+
+            (
+                true_r,
+                true_graph,
+                true_guards,
+                true_nn_modules,
+                true_cmp,
+            ) = speculate_branch(True)
+            (
+                false_r,
+                false_graph,
+                false_guards,
+                false_nn_modules,
+                false_cmp,
+            ) = speculate_branch(False)
+
+            if true_cmp != false_cmp:
+                unimplemented(true_cmp.diff(false_cmp))
+
+            # Add guards
+            tx.output.guards |= false_guards
+            tx.output.guards |= true_guards
+
+            true_name = add_subgraph(
+                "true", torch.fx.GraphModule(true_nn_modules, true_graph)
             )
-            false_name, false_graph, false_guards = register_as_subgraph(
-                p_args[2].get_function(), "false", node_args
+            false_name = add_subgraph(
+                "false", torch.fx.GraphModule(false_nn_modules, false_graph)
             )
 
-            if config.enforce_cond_guards_match:
-                assert (
-                    true_guards == false_guards
-                ), "Guards for true and false path must be equal."
+            # Apply side effects (guaranteed to be equal)
+            tx.output.side_effects = true_cmp.output.side_effects
+
+            true_node = make_attr(true_name)
+            false_node = make_attr(false_name)
 
-            true_node = make_attr(true_name, proxy_args)
-            false_node = make_attr(false_name, proxy_args)
-            p_args[1] = true_node
-            p_args[2] = false_node
+            p_args = (
+                args[0].as_proxy(),
+                true_node,
+                false_node,
+                tuple(a.as_proxy() for a in sub_args),
+            )
+            # TODO: assert that the true/false return values are
+            # consistent
+            example_value = true_r.as_proxy().node.meta["example_value"]
+        else:
+            unimplemented(f"PyOperator {self.value.__name__}")
 
         # Store the invocation as a call
         return wrap_fx_proxy(
@@ -772,5 +818,5 @@ def register_as_subgraph(fn, name, args):
                 args=tuple(p_args),
                 kwargs={},
             ),
-            example_value=self.value(*u_args),
+            example_value=example_value,
         )

From 8b880fdd0a94025c1ec48716423039aaa4acb603 Mon Sep 17 00:00:00 2001
From: Atul Jangra <atuljangra@meta.com>
Date: Thu, 8 Dec 2022 01:06:38 +0000
Subject: [PATCH 1699/1922] Support pickle version 4 by adding missing ops
 (#90223)

Summary:
In this logic, we are traversing the entries to find the module for STACK_GLOBAL entries.

According to https://github.com/python/cpython/blob/2837241f22be33a5597707b2aa723cb2cf6f3967/Lib/pickletools.py#L1799 we need to look for GET, BINGET and LONG_BINGET.

So this diff updates that. Also while testing, I found some cases of empty modules, for cases such as tanh. For this I added the option to skip processing when this is the case.

Test Plan: Tested with f392778829

Differential Revision: D41748595

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90223
Approved by: https://github.com/PaliC
---
 torch/package/package_exporter.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py
index 7f6af38468e2f..55cdf7f1baf2b 100644
--- a/torch/package/package_exporter.py
+++ b/torch/package/package_exporter.py
@@ -648,7 +648,7 @@ def _check_mocked_error(module: Optional[str], field: Optional[str]):
                         field = arg
                         memo[memo_count] = arg
                     elif (
-                        opcode.name == "BINGET_LONG"
+                        opcode.name == "LONG_BINGET"
                         or opcode.name == "BINGET"
                         or opcode.name == "GET"
                     ):
@@ -658,6 +658,9 @@ def _check_mocked_error(module: Optional[str], field: Optional[str]):
                     elif opcode.name == "MEMOIZE":
                         memo_count += 1
                     elif opcode.name == "STACK_GLOBAL":
+                        if module is None:
+                            # If not module was passed on in the entries preceeding this one, continue.
+                            continue
                         assert isinstance(module, str)
                         if module not in all_dependencies:
                             all_dependencies.append(module)

From 10075d4ec2f39eaafda93a17c63d36d3382e6c57 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Thu, 8 Dec 2022 01:12:37 +0000
Subject: [PATCH 1700/1922] Fix issue 38095 TODOs in gloo tests (#89985)

Fix TODOs related to https://github.com/pytorch/pytorch/issues/38095
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89985
Approved by: https://github.com/ZainRizvi
---
 test/distributed/test_c10d_gloo.py | 58 ++++++++++++------------------
 1 file changed, 23 insertions(+), 35 deletions(-)

diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index bee76e788d192..289e5c117ace1 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -75,7 +75,7 @@ def simple_reduce_tests(rank, world_size):
         (
             c10d.ReduceOp.MAX,
             torch.tensor([rank + 1.0]),
-            torch.tensor([world_size]),
+            torch.tensor([float(world_size)]),
         ),
     ]
 
@@ -128,7 +128,7 @@ def simple_coalesced_reduce_tests(rank, world_size):
     return [
         (
             c10d.ReduceOp.SUM,
-            [torch.tensor([rank + 1]), torch.tensor([(rank + 1) ** 2])],
+            [torch.tensor([rank + 1.0]), torch.tensor([(rank + 1.0) ** 2])],
             [
                 torch.tensor([float(world_size * (world_size + 1) / 2)]),
                 torch.tensor(
@@ -152,7 +152,7 @@ def simple_coalesced_reduce_tests(rank, world_size):
         (
             c10d.ReduceOp.MAX,
             [torch.tensor([rank + x]) for x in [1.0, 2.0]],
-            [torch.tensor([world_size]), torch.tensor([world_size + 1.0])],
+            [torch.tensor([float(world_size)]), torch.tensor([world_size + 1.0])],
         ),
     ]
 
@@ -177,7 +177,7 @@ def simple_multi_input_reduce_tests(rank, world_size):
         (
             c10d.ReduceOp.MAX,
             [torch.tensor([2 * rank + 1.0]), torch.tensor([2 * rank + 2.0])],
-            torch.tensor([2 * world_size]),
+            torch.tensor([2.0 * world_size]),
         ),
     ]
 
@@ -254,7 +254,7 @@ def test_empty_tensors(self):
         fut.wait()
         output = fut.value()
         self.assertEqual(0, output[0].numel())
-        self.assertEqualIgnoreType(xs[0], output[0])
+        self.assertEqual(xs[0], output[0])
 
     @requires_gloo()
     def test_broadcast_checks(self):
@@ -328,8 +328,7 @@ def broadcast(xs, rootRank, rootTensor):
             # Run with 1 input tensor
             x = fn(torch.tensor([self.rank]))
             output = broadcast([x], i, 0)
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(torch.tensor([i]), output[0])
+            self.assertEqual(torch.tensor([i]), output[0])
 
             # Run with 2 input tensors
             num = 2
@@ -340,10 +339,8 @@ def broadcast(xs, rootRank, rootTensor):
                 ]
 
                 output = broadcast(xs, i, j)
-                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                self.assertEqualIgnoreType(torch.tensor([i * num + j]), output[0])
-                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                self.assertEqualIgnoreType(torch.tensor([i * num + j]), output[1])
+                self.assertEqual(torch.tensor([i * num + j], dtype=torch.float32), output[0])
+                self.assertEqual(torch.tensor([i * num + j], dtype=torch.float32), output[1])
 
         # Test overloaded convenience function
         x = torch.tensor([self.rank + 1.0])
@@ -429,8 +426,7 @@ def _test_allreduce_basics(self, fn):
             fut = pg.allreduce([tensor], opts).get_future()
             fut.wait()
             result = fut.value()
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(expected, result[0])
+            self.assertEqual(expected, result[0])
 
         # Multi input tests
         tests = simple_multi_input_reduce_tests(self.rank, self.world_size)
@@ -442,8 +438,7 @@ def _test_allreduce_basics(self, fn):
             fut.wait()
             result = fut.value()
             for tensor in result:
-                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                self.assertEqualIgnoreType(output, tensor)
+                self.assertEqual(output, tensor)
 
         # Test overloaded convenience function (defaults to using sum)
         x = fn(torch.tensor([self.rank + 1.0]))
@@ -481,8 +476,7 @@ def _test_allreduce_basics_using_work_api(self, fn):
             work = pg.allreduce([tensor], opts)
             work.wait()
             result = work.result()
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(expected, result[0])
+            self.assertEqual(expected, result[0])
 
         # Multi input tests
         tests = simple_multi_input_reduce_tests(self.rank, self.world_size)
@@ -494,8 +488,7 @@ def _test_allreduce_basics_using_work_api(self, fn):
             work.wait()
             result = work.result()
             for tensor in result:
-                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                self.assertEqualIgnoreType(output, tensor)
+                self.assertEqual(output, tensor)
 
         # Test overloaded convenience function (defaults to using sum)
         x = fn(torch.tensor([self.rank + 1.0]))
@@ -526,12 +519,11 @@ def _test_allreduce_stress(self, inputs):
         ]
         for i, future_handle in enumerate(future_handles):
             future_handle.wait()
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(
+            self.assertEqual(
                 torch.tensor(
                     [
                         (i * self.world_size)
-                        + (self.world_size * (self.world_size - 1) / 2)
+                        + (self.world_size * (self.world_size - 1) // 2)
                     ]
                 ),
                 future_handle.value()[0],
@@ -605,8 +597,7 @@ def _test_allreduce_coalesced_basics(self, fn):
             fut.wait()
             result = fut.value()
             for result_tensor, expected in zip(result, outputs):
-                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                self.assertEqualIgnoreType(result_tensor, expected)
+                self.assertEqual(result_tensor, expected)
 
     @requires_gloo()
     def test_allreduce_coalesced_basics(self):
@@ -614,7 +605,7 @@ def test_allreduce_coalesced_basics(self):
 
     def _expected_output(self, i):
         ws = self.world_size
-        return 2 * [torch.tensor([(i * ws) + (ws * (ws - 1) / 2)])]
+        return 2 * [torch.tensor([(i * ws) + (ws * (ws - 1) // 2)])]
 
     def _test_allreduce_coalesced_stress(self, inputs):
         store = c10d.FileStore(self.file_name, self.world_size)
@@ -627,8 +618,7 @@ def _test_allreduce_coalesced_stress(self, inputs):
         for i, future_handle in enumerate(future_handles):
             future_handle.wait()
             result = future_handle.value()
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(
+            self.assertEqual(
                 self._expected_output(i),
                 result,
                 msg="Mismatch in iteration {}".format(i),
@@ -650,7 +640,7 @@ def test_allreduce_coalesced_async(self):
         futs = [c10d.all_reduce_coalesced(x, async_op=True) for x in xs]
         torch.futures.wait_all(futs)
         for i, fut in enumerate(futs):
-            self.assertEqualIgnoreType(
+            self.assertEqual(
                 self._expected_output(i),
                 fut.wait(),
                 msg="Mismatch in iteration {}".format(i),
@@ -1240,7 +1230,7 @@ def test_allgather_coalesced_async(self):
                 # one output tensor list
                 for y, z in zip(y_out, z_out):
                     # one tensor in output tensor list
-                    self.assertEqualIgnoreType(y, z)
+                    self.assertEqual(y, z)
 
         # Added to address https://github.com/pytorch/pytorch/issues/65231
         # In the failed tests, all assertEqualIgnoreType are passed on all
@@ -1303,8 +1293,7 @@ def _test_reduce_basics(self, fn):
                 fut.wait()
                 result = fut.value()
                 if root == self.rank:
-                    # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                    self.assertEqualIgnoreType(output, result[0])
+                    self.assertEqual(output, result[0])
 
     @requires_gloo()
     def test_reduce_basics(self):
@@ -1337,12 +1326,11 @@ def _test_reduce_stress(self, inputs):
             iter = i // self.world_size
             root = i % self.world_size
             if root == self.rank:
-                # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-                self.assertEqualIgnoreType(
+                self.assertEqual(
                     torch.tensor(
                         [
                             (iter * self.world_size)
-                            + (self.world_size * (self.world_size - 1) / 2)
+                            + (self.world_size * (self.world_size - 1) // 2)
                         ]
                     ),
                     result[0],
@@ -2312,7 +2300,7 @@ def _test_broadcast_coalesced(self, process_group, device, root_rank):
         target += torch.arange(60, dtype=half, device=device).chunk(5)
         target += torch.arange(60, dtype=torch.float32, device=device).chunk(5)
 
-        # The tensors to pass to broadcast are idential to the target
+        # The tensors to pass to broadcast are identical to the target
         # only on the process that is the root of the broadcast.
         if self.rank == root_rank:
             tensors = list(tensor.clone() for tensor in target)

From 28932f514837ca56c68bb6bbec2b9f0d2fc4bfad Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@fb.com>
Date: Thu, 8 Dec 2022 01:26:07 +0000
Subject: [PATCH 1701/1922] Introduce CUDA Device Assertions Infrastructure
 (#84609)

Summary:
This diff introduces a set of changes that makes it possible for the host to get assertions from CUDA devices. This includes the introduction of

**`CUDA_KERNEL_ASSERT2`**

A preprocessor macro to be used within a CUDA kernel that, upon an assertion failure, writes the assertion message, file, line number, and possibly other information to UVM (Managed memory). Once this is done, the original assertion is triggered, which places the GPU in a Bad State requiring recovery. In my tests, data written to UVM appears there before the GPU reaches the Bad State and is still accessible from the host after the GPU is in this state.

Messages are written to a multi-message buffer which can, in theory, hold many assertion failures. I've done this as a precaution in case there are several, but I don't actually know whether that is possible and a simpler design which holds only a single message may well be all that is necessary.

**`TORCH_DSA_KERNEL_ARGS`**

This preprocess macro is added as an _argument_ to a kernel function's signature. It expands to supply the standardized names of all the arguments needed by `C10_CUDA_COMMUNICATING_KERNEL_ASSERTION` to handle device-side assertions. This includes, eg, the name of the pointer to the UVM memory the assertion would be written to. This macro abstracts the arguments so there is a single point of change if the system needs to be modified.

**`c10::cuda::get_global_cuda_kernel_launch_registry()`**

This host-side function returns a singleton object that manages the host's part of the device-side assertions. Upon allocation, the singleton allocates sufficient UVM (Managed) memory to hold information about several device-side assertion failures. The singleton also provides methods for getting the current traceback (used to identify when a kernel was launched). To avoid consuming all the host's memory the singleton stores launches in a circular buffer; a unique "generation number" is used to ensure that kernel launch failures map to their actual launch points (in the case that the circular buffer wraps before the failure is detected).

**`TORCH_DSA_KERNEL_LAUNCH`**

This host-side preprocessor macro replaces the standard
```
kernel_name<<<blocks, threads, shmem, stream>>>(args)
```
invocation with
```
TORCH_DSA_KERNEL_LAUNCH(blocks, threads, shmem, stream, args);
```
Internally, it fetches the UVM (Managed) pointer and generation number from the singleton and append these to the standard argument list. It also checks to ensure the kernel launches correctly. This abstraction on kernel launches can be modified to provide additional safety/logging.

**`c10::cuda::c10_retrieve_device_side_assertion_info`**
This host-side function checks, when called, that no kernel assertions have occurred. If one has. It then raises an exception with:
1. Information (file, line number) of what kernel was launched.
2. Information (file, line number, message) about the device-side assertion
3. Information (file, line number) about where the failure was detected.

**Checking for device-side assertions**

Device-side assertions are most likely to be noticed by the host when a CUDA API call such as `cudaDeviceSynchronize` is made and fails with a `cudaError_t` indicating
> CUDA error: device-side assert triggered CUDA kernel errors

Therefore, we rewrite `C10_CUDA_CHECK()` to include a call to `c10_retrieve_device_side_assertion_info()`. To make the code cleaner, most of the logic of `C10_CUDA_CHECK()` is now contained within a new function `c10_cuda_check_implementation()` to which `C10_CUDA_CHECK` passes the preprocessor information about filenames, function names, and line numbers. (In C++20 we can use `std::source_location` to eliminate macros entirely!)

# Notes on special cases

* Multiple assertions from the same block are recorded
* Multiple assertions from different blocks are recorded
* Launching kernels from many threads on many streams seems to be handled correctly
* If two process are using the same GPU and one of the processes fails with a device-side assertion the other process continues without issue
* X Multiple assertions from separate kernels on different streams seem to be recorded, but we can't reproduce the test condition
* X Multiple assertions from separate devices should be all be shown upon exit, but we've been unable to generate a test that produces this condition

Differential Revision: D37621532

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84609
Approved by: https://github.com/ezyang, https://github.com/malfet
---
 c10/cuda/CMakeLists.txt                       |   7 +-
 c10/cuda/CUDADeviceAssertion.h                |  98 +++++
 c10/cuda/CUDADeviceAssertionHost.cpp          | 367 ++++++++++++++++++
 c10/cuda/CUDADeviceAssertionHost.h            | 156 ++++++++
 c10/cuda/CUDAException.cpp                    |  19 +-
 c10/cuda/CUDAException.h                      |  39 +-
 c10/cuda/impl/CUDATest.cpp                    |   2 +-
 c10/cuda/test/CMakeLists.txt                  |   7 +
 c10/cuda/test/build.bzl                       |  34 +-
 .../impl/CUDAAssertionsTest_1_var_test.cu     | 102 +++++
 .../impl/CUDAAssertionsTest_catches_stream.cu | 101 +++++
 ...est_catches_thread_and_block_and_device.cu |  86 ++++
 .../CUDAAssertionsTest_from_2_processes.cu    | 108 ++++++
 ...multiple_writes_from_blocks_and_threads.cu |  93 +++++
 ...st_multiple_writes_from_multiple_blocks.cu |  90 +++++
 ...onsTest_multiple_writes_from_same_block.cu |  78 ++++
 tools/bazel.bzl                               |   3 +-
 torch/utils/hipify/cuda_to_hip_mappings.py    |   3 +
 18 files changed, 1371 insertions(+), 22 deletions(-)
 create mode 100644 c10/cuda/CUDADeviceAssertion.h
 create mode 100644 c10/cuda/CUDADeviceAssertionHost.cpp
 create mode 100644 c10/cuda/CUDADeviceAssertionHost.h
 create mode 100644 c10/cuda/test/impl/CUDAAssertionsTest_1_var_test.cu
 create mode 100644 c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu
 create mode 100644 c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu
 create mode 100644 c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu
 create mode 100644 c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu
 create mode 100644 c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu
 create mode 100644 c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu

diff --git a/c10/cuda/CMakeLists.txt b/c10/cuda/CMakeLists.txt
index 1dc4435da5f00..2c26bc06f6ca4 100644
--- a/c10/cuda/CMakeLists.txt
+++ b/c10/cuda/CMakeLists.txt
@@ -21,16 +21,18 @@ configure_file(
 # and headers you add
 set(C10_CUDA_SRCS
     CUDACachingAllocator.cpp
+    CUDADeviceAssertionHost.cpp
     CUDAException.cpp
     CUDAFunctions.cpp
+    CUDAMallocAsyncAllocator.cpp
     CUDAMiscFunctions.cpp
     CUDAStream.cpp
-    CUDACachingAllocator.cpp
-    CUDAMallocAsyncAllocator.cpp
     impl/CUDAGuardImpl.cpp
     impl/CUDATest.cpp
 )
 set(C10_CUDA_HEADERS
+    CUDACachingAllocator.h
+    CUDADeviceAssertionHost.h
     CUDAException.h
     CUDAFunctions.h
     CUDAGuard.h
@@ -38,7 +40,6 @@ set(C10_CUDA_HEADERS
     CUDAMathCompat.h
     CUDAMiscFunctions.h
     CUDAStream.h
-    CUDACachingAllocator.h
     impl/CUDAGuardImpl.h
     impl/CUDATest.h
 )
diff --git a/c10/cuda/CUDADeviceAssertion.h b/c10/cuda/CUDADeviceAssertion.h
new file mode 100644
index 0000000000000..76c422d83bd69
--- /dev/null
+++ b/c10/cuda/CUDADeviceAssertion.h
@@ -0,0 +1,98 @@
+#pragma once
+
+#include <c10/cuda/CUDAException.h>
+#include <c10/macros/Macros.h>
+
+namespace c10 {
+namespace cuda {
+
+#ifdef TORCH_USE_CUDA_DSA
+// Copy string from `src` to `dst`
+static __device__ void dstrcpy(char* dst, const char* src) {
+  int i = 0;
+  // Copy string from source to destination, ensuring that it
+  // isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
+  while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
+    *dst++ = *src++;
+  }
+  *dst = '\0';
+}
+
+__device__ __noinline__ void dsa_add_new_assertion_failure(
+    DeviceAssertionsData* assertions_data,
+    const char* assertion_msg,
+    const char* filename,
+    const char* function_name,
+    const int line_number,
+    const uint32_t caller,
+    const dim3 block_id,
+    const dim3 thread_id) {
+  // `assertions_data` may be nullptr if device-side assertion checking
+  // is disabled at run-time. If it is disabled at compile time this
+  // function will never be called
+  if (!assertions_data) {
+    return;
+  }
+
+  // Atomically increment so other threads can fail at the same time
+  // Note that incrementing this means that the CPU can observe that
+  // a failure has happened and can begin to respond before we've
+  // written information about that failure out to the buffer.
+  const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);
+
+  if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
+    // At this point we're ran out of assertion buffer space.
+    // We could print a message about this, but that'd get
+    // spammy if a lot of threads did it, so we just silently
+    // ignore any other assertion failures. In most cases the
+    // failures will all probably be analogous anyway.
+    return;
+  }
+
+  // Write information about the assertion failure to memory.
+  // Note that this occurs only after the `assertion_count`
+  // increment broadcasts that there's been a problem.
+  auto& self = assertions_data->assertions[nid];
+  dstrcpy(self.assertion_msg, assertion_msg);
+  dstrcpy(self.filename, filename);
+  dstrcpy(self.function_name, function_name);
+  self.line_number = line_number;
+  self.caller = caller;
+  self.block_id[0] = block_id.x;
+  self.block_id[1] = block_id.y;
+  self.block_id[2] = block_id.z;
+  self.thread_id[0] = thread_id.x;
+  self.thread_id[1] = thread_id.y;
+  self.thread_id[2] = thread_id.z;
+}
+
+// Emulates a kernel assertion. The assertion won't stop the kernel's progress,
+// so you should assume everything the kernel produces is garbage if there's an
+// assertion failure.
+// NOTE: This assumes that `assertions_data` and  `assertion_caller_id` are
+//       arguments of the kernel and therefore accessible.
+#define CUDA_KERNEL_ASSERT2(condition)                                   \
+  do {                                                                   \
+    if (C10_UNLIKELY(!(condition))) {                                    \
+      /* Has an atomic element so threads can fail at the same time */   \
+      c10::cuda::dsa_add_new_assertion_failure(                          \
+          assertions_data,                                               \
+          C10_STRINGIZE(condition),                                      \
+          __FILE__,                                                      \
+          __FUNCTION__,                                                  \
+          __LINE__,                                                      \
+          assertion_caller_id,                                           \
+          blockIdx,                                                      \
+          threadIdx);                                                    \
+      /* Now that the kernel has failed we early exit the kernel, but */ \
+      /* otherwise keep going and rely on the host to check UVM and */   \
+      /* determine we've had a problem */                                \
+      return;                                                            \
+    }                                                                    \
+  } while (false)
+#else
+#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
+#endif
+
+} // namespace cuda
+} // namespace c10
diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp
new file mode 100644
index 0000000000000..58ece480799cd
--- /dev/null
+++ b/c10/cuda/CUDADeviceAssertionHost.cpp
@@ -0,0 +1,367 @@
+#include <c10/cuda/CUDADeviceAssertionHost.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/util/Backtrace.h>
+#include <c10/util/Exception.h>
+#include <c10/util/irange.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <thread>
+
+#define CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS() \
+  c10_cuda_check_implementation(__FILE__, __FUNCTION__, __LINE__, false)
+
+namespace c10 {
+namespace cuda {
+
+namespace {
+
+/// Get the number of CUDA devices
+/// We need our own implementation of this function to prevent
+/// an infinite initialization loop for CUDAKernelLaunchRegistry
+int dsa_get_device_count() {
+  int device_count = -1;
+  C10_CUDA_ERROR_HANDLED(cudaGetDeviceCount(&device_count));
+  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+  return device_count;
+}
+
+bool dsa_check_if_all_devices_support_managed_memory() {
+// It looks as though this'll work best on CUDA GPUs with Pascal
+// architectures or newer, per
+// https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
+#ifdef TORCH_USE_CUDA_DSA
+  for (const auto i : c10::irange(dsa_get_device_count())) {
+    if (dsa_get_device_compute_capability(i) < 6) {
+      return false;
+    }
+  }
+  return true;
+#else
+  return false;
+#endif
+}
+
+bool env_flag_set(const char* env_var_name) {
+  const char* const env_string = std::getenv(env_var_name);
+  return (env_string == nullptr) ? false : std::strcmp(env_string, "0");
+}
+
+/// Deleter for UVM/managed memory pointers
+void uvm_deleter(DeviceAssertionsData* uvm_assertions_ptr) {
+  // Ignore error in destructor
+  if (uvm_assertions_ptr) {
+    C10_CUDA_IGNORE_ERROR(cudaFree(uvm_assertions_ptr));
+  }
+}
+
+#ifdef TORCH_USE_CUDA_DSA
+/// Get current device id
+/// We need our own implementation of this function to prevent
+/// an infinite initialization loop for CUDAKernelLaunchRegistry
+int dsa_get_device_id() {
+  int device = -1;
+  C10_CUDA_ERROR_HANDLED(cudaGetDevice(&device));
+  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+  return device;
+}
+
+/// Get a device's compute capability - note that this dangerously assumes
+/// that if one CUDA GPU supports device-side assertions they all do. This is
+/// probably fine since the latest CUDA GPU that doesn't support UVM is the
+/// K80 released 2014-11-17. Mixing that GPU with a newer one is likely to be
+/// rare enough that the defensive
+/// We need our own implementation of this function to prevent
+/// an infinite initialization loop for CUDAKernelLaunchRegistry
+int dsa_get_device_compute_capability(const int device_num) {
+  int compute_capability = -1;
+  C10_CUDA_ERROR_HANDLED(cudaDeviceGetAttribute(
+      &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num));
+  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+  return compute_capability;
+}
+#endif
+
+} // namespace
+
+/// Check that kernels ran correctly by checking the message buffer. BLOCKING.
+std::string c10_retrieve_device_side_assertion_info() {
+#ifdef TORCH_USE_CUDA_DSA
+  const auto& launch_registry = CUDAKernelLaunchRegistry::get_singleton_ref();
+  if (!launch_registry.enabled) {
+    return "Device-side assertion tracking was not enabled by user.";
+  } else if (!launch_registry.do_all_devices_support_managed_memory) {
+    return "Device-side assertions disabled because not all devices support managed memory.";
+  }
+
+  // Hack that saves a lot of challenging sync logic.
+  // The GPU increments the number of errors it's observed and the CPU can see
+  // that happening immediately which means we can make it here before the GPU
+  // is done writing information about those errors to memory.
+  // A short pause gives it time to finish. Since something's gone wrong, this
+  // pause shouldn't affect perf.
+  std::this_thread::sleep_for(std::chrono::seconds(1));
+
+  // The snapshot causes a brief block. That's okay because this function only
+  // executes if something's gone wrong such that speed is no longer a priority.
+  const auto launch_data = launch_registry.snapshot();
+  const auto& assertion_data = launch_data.first;
+  const auto& launch_infos = launch_data.second;
+
+  std::stringstream oss;
+
+  {
+    oss << "This process interacted the following GPUs = {";
+    bool first_gpu_listed = true;
+    for (const auto& x : uvm_assertions) {
+      if (x) {
+        if (!first_gpu_listed) {
+          oss << ","
+        }
+        first_gpu_listed = true;
+        oss << x;
+      }
+    }
+    oss << "}" << std::endl;
+  }
+
+  // Loop over each device that could be managed by the process
+  for (const auto device_num : c10::irange(assertion_data.size())) {
+    const auto& assertion_data_for_device = assertion_data.at(device_num);
+
+    // Did anything fail?
+    const auto failures_found = std::min(
+        assertion_data_for_device.assertion_count,
+        C10_CUDA_DSA_ASSERTION_COUNT);
+    if (failures_found == 0) {
+      continue;
+    }
+
+    // Something failed, let's talk about that
+    oss << failures_found
+        << " CUDA device-side assertion failures were found on GPU #"
+        << device_num << "!" << std::endl;
+    if (assertion_data_for_device.assertion_count >
+        C10_CUDA_DSA_ASSERTION_COUNT) {
+      oss << "But at least " << assertion_data_for_device.assertion_count
+          << " assertion failures occurred on the device" << std::endl;
+      oss << "Adjust `C10_CUDA_DSA_ASSERTION_COUNT` if you need more assertion failure info"
+          << std::endl;
+    }
+
+    for (const auto i : c10::irange(failures_found)) {
+      const auto& self = assertion_data_for_device.assertions[i];
+      const auto& launch_info = launch_infos[self.caller % launch_infos.size()];
+      oss << "Assertion failure " << i << std::endl;
+      oss << "  GPU assertion failure message = " << self.assertion_msg
+          << std::endl;
+      oss << "  File containing assertion = " << self.filename << ":"
+          << self.line_number << std::endl;
+      oss << "  Device function containing assertion = " << self.function_name
+          << std::endl;
+      oss << "  Thread ID that failed assertion = [" << self.thread_id[0] << ","
+          << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl;
+      oss << "  Block ID that failed assertion = [" << self.block_id[0] << ","
+          << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl;
+      if (launch_info.generation_number == self.caller) {
+        oss << "  File containing kernel launch = "
+            << launch_info.launch_filename << ":" << launch_info.launch_linenum
+            << std::endl;
+        oss << "  Function containing kernel launch = "
+            << launch_info.launch_function << std::endl;
+        oss << "  Name of kernel launched that led to failure = "
+            << launch_info.kernel_name << std::endl;
+        oss << "  Device that launched kernel = " << launch_info.device
+            << std::endl;
+        oss << "  Stream kernel was launched on = " << launch_info.stream
+            << std::endl;
+        oss << "  Backtrace of kernel launch site = ";
+        if (launch_registry.gather_launch_stacktrace) {
+          oss << "Launch stacktracing disabled." << std::endl;
+        } else {
+          oss << "\n" << launch_info.launch_stacktrace << std::endl;
+        }
+      } else {
+        oss << "  CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_size`."
+            << std::endl;
+      }
+    }
+  }
+  return oss.str();
+#else
+  return "Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n";
+#endif
+}
+
+CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry()
+    : do_all_devices_support_managed_memory(
+          dsa_check_if_all_devices_support_managed_memory()),
+      gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()),
+      enabled(check_env_for_dsa_enabled()) {
+  for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) {
+    uvm_assertions.emplace_back(nullptr, uvm_deleter);
+  }
+
+  kernel_launches.resize(max_kernel_launches);
+}
+
+bool CUDAKernelLaunchRegistry::check_env_for_enable_launch_stacktracing()
+    const {
+  return env_flag_set("PYTORCH_CUDA_DSA_STACKTRACING");
+}
+
+bool CUDAKernelLaunchRegistry::check_env_for_dsa_enabled() const {
+  return env_flag_set("PYTORCH_USE_CUDA_DSA");
+}
+
+uint32_t CUDAKernelLaunchRegistry::insert(
+    const char* launch_filename,
+    const char* launch_function,
+    const uint32_t launch_linenum,
+    const char* kernel_name,
+    const int32_t stream_id) {
+#ifdef TORCH_USE_CUDA_DSA
+  if (!is_enabled()) {
+    return 0;
+  }
+
+  const auto backtrace = gather_launch_stacktrace ? c10::get_backtrace() : "";
+
+  const std::lock_guard<std::mutex> lock(read_write_mutex);
+
+  const auto my_gen_number = generation_number++;
+  // TODO: It would probably be good to get a stack trace here so that
+  // we can better indicate which launch caused the failure.
+  kernel_launches[my_gen_number % max_kernel_launches] = {
+      launch_filename,
+      launch_function,
+      launch_linenum,
+      backtrace,
+      kernel_name,
+      dsa_get_device_id(),
+      stream_id,
+      my_gen_number};
+  return my_gen_number;
+#else
+  return 0;
+#endif
+}
+
+std::pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
+CUDAKernelLaunchRegistry::snapshot() const {
+  // This is likely to be the longest-lasting hold on the mutex, but
+  // we only expect it to be called in cases where we're already failing
+  // and speed is no longer important
+  const std::lock_guard<std::mutex> lock(read_write_mutex);
+
+  std::vector<DeviceAssertionsData> device_assertions_data;
+  for (const auto& x : uvm_assertions) {
+    if (x) {
+      device_assertions_data.push_back(*x);
+    } else {
+      device_assertions_data.emplace_back();
+    }
+  }
+
+  return std::make_pair(device_assertions_data, kernel_launches);
+}
+
+DeviceAssertionsData* CUDAKernelLaunchRegistry::
+    get_uvm_assertions_ptr_for_current_device() {
+#ifdef TORCH_USE_CUDA_DSA
+  if (!is_enabled()) {
+    return nullptr;
+  }
+
+  const auto device_num = dsa_get_device_id();
+
+  // If we've already set up this GPU with managed memory, return a pointer to
+  // the managed memory. This is a lock-free quick-return path.
+  if (uvm_assertions.at(device_num)) {
+    return uvm_assertions.at(device_num).get();
+  }
+
+  // Need a lock here so there's not race-condition on creating the new device
+  // assertions buffer
+  const std::lock_guard<std::mutex> lock(gpu_alloc_mutex);
+
+  // If we've already set up this GPU with managed memory, return a pointer to
+  // the managed memory. This locked path ensures that the device memory is
+  // allocated only once
+  if (uvm_assertions.at(device_num)) {
+    return uvm_assertions.at(device_num).get();
+  }
+
+  // Otherwise, set up the GPU to be able to use the device-side assertion
+  // system
+  DeviceAssertionsData* uvm_assertions_ptr = nullptr;
+
+  C10_CUDA_ERROR_HANDLED(
+      cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData)));
+  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+
+  C10_CUDA_ERROR_HANDLED(cudaMemAdvise(
+      uvm_assertions_ptr,
+      sizeof(DeviceAssertionsData),
+      cudaMemAdviseSetPreferredLocation,
+      cudaCpuDeviceId));
+  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+
+  // GPU will establish direct mapping of data in CPU memory, no page faults
+  // will be generated
+  C10_CUDA_ERROR_HANDLED(cudaMemAdvise(
+      uvm_assertions_ptr,
+      sizeof(DeviceAssertionsData),
+      cudaMemAdviseSetAccessedBy,
+      cudaCpuDeviceId));
+  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+
+  // Initialize the memory from the CPU; otherwise, pages may have to be created
+  // on demand. We think that UVM documentation indicates that first access may
+  // not honor preferred location, which would be bad, if true, because we want
+  // this memory on the host so we can access it post-assertion. Initializing
+  // this on the CPU helps ensure that that's where the memory will live.
+  *uvm_assertions_ptr = DeviceAssertionsData();
+
+  // Ownership and lifetime management of `uvm_assertions_ptr` now passes to the
+  // uvm_assertions unique_ptr vector
+  uvm_assertions.at(device_num).reset(uvm_assertions_ptr);
+
+  return uvm_assertions_ptr;
+#else
+  return nullptr;
+#endif
+}
+
+CUDAKernelLaunchRegistry& CUDAKernelLaunchRegistry::get_singleton_ref() {
+  static CUDAKernelLaunchRegistry launch_registry;
+  return launch_registry;
+}
+
+bool CUDAKernelLaunchRegistry::has_failed() const {
+  for (const auto& x : uvm_assertions) {
+    if (x && x->assertion_count > 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool CUDAKernelLaunchRegistry::is_enabled() const {
+#ifdef TORCH_USE_CUDA_DSA
+  std::cerr << ""
+#else
+  std::cerr
+      << "TORCH_USE_CUDA_DSA not enabled in CUDAKernelLaunchRegistry::is_enabled"
+      << std::endl;
+  return false;
+#endif
+}
+
+} // namespace cuda
+} // namespace c10
diff --git a/c10/cuda/CUDADeviceAssertionHost.h b/c10/cuda/CUDADeviceAssertionHost.h
new file mode 100644
index 0000000000000..7465f3d36b20e
--- /dev/null
+++ b/c10/cuda/CUDADeviceAssertionHost.h
@@ -0,0 +1,156 @@
+#pragma once
+
+#include <c10/cuda/CUDAMacros.h>
+
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#ifdef USE_CUDA
+#define TORCH_USE_CUDA_DSA
+#endif
+
+/// Number of assertion failure messages we can store. If this is too small
+/// threads will fail silently.
+constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
+constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;
+
+namespace c10 {
+namespace cuda {
+
+/// Holds information about any device-side assertions that fail.
+/// Held in managed memory and access by both the CPU and the GPU.
+struct DeviceAssertionData {
+  /// Stringification of the assertion
+  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN];
+  /// File the assertion was in
+  char filename[C10_CUDA_DSA_MAX_STR_LEN];
+  /// Name of the function the assertion was in
+  char function_name[C10_CUDA_DSA_MAX_STR_LEN];
+  /// Line number the assertion was at
+  int line_number;
+  /// Number uniquely identifying the kernel launch that triggered the assertion
+  uint32_t caller;
+  /// block_id of the thread that failed the assertion
+  int32_t block_id[3];
+  /// third_id of the thread that failed the assertion
+  int32_t thread_id[3];
+};
+
+/// Used to hold assertions generated by the device
+/// Held in managed memory and access by both the CPU and the GPU.
+struct DeviceAssertionsData {
+  /// Total number of assertions found; a subset of thse will be recorded
+  /// in `assertions`
+  int32_t assertion_count;
+  /// An array of assertions that will be written to in a race-free manner
+  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT];
+};
+
+/// Use to hold info about kernel launches so that we can run kernels
+/// asynchronously and still associate launches with device-side
+/// assertion failures
+struct CUDAKernelLaunchInfo {
+  /// Filename of the code where the kernel was launched from
+  const char* launch_filename;
+  /// Function from which the kernel was launched
+  const char* launch_function;
+  /// Line number of where the code was launched from
+  uint32_t launch_linenum;
+  /// Backtrace of where the kernel was launched from, only populated if
+  /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is True
+  std::string launch_stacktrace;
+  /// Kernel that was launched
+  const char* kernel_name;
+  /// Device the kernel was launched on
+  int device;
+  /// Stream the kernel was launched on
+  int32_t stream;
+  /// A number that uniquely identifies the kernel launch
+  uint64_t generation_number;
+};
+
+/// Circular buffer used to hold information about kernel launches
+/// this is later used to reconstruct how a device-side kernel assertion failure
+/// occurred CUDAKernelLaunchRegistry is used as a singleton
+class C10_CUDA_API CUDAKernelLaunchRegistry {
+ private:
+  /// Assume that this is the max number of kernel launches that might ever be
+  /// enqueued across all streams on a single device
+  static constexpr int max_kernel_launches = 1024;
+  /// How many kernel launch infos we've inserted. Used to ensure that circular
+  /// queue doesn't provide false information by always increasing, but also to
+  /// mark where we are inserting into the queue
+#ifdef TORCH_USE_CUDA_DSA
+  uint64_t generation_number = 0;
+#endif
+  /// Shared mutex between writer and accessor to ensure multi-threaded safety.
+  mutable std::mutex read_write_mutex;
+  /// Used to ensure prevent race conditions in GPU memory allocation
+  mutable std::mutex gpu_alloc_mutex;
+  /// Pointer to managed memory keeping track of device-side assertions. There
+  /// is one entry for each possible device the process might work with. Unused
+  /// entries are nullptrs. We could also use an unordered_set here, but this
+  /// vector design will be faster and the wasted memory is small since we
+  /// expect the number of GPUs per node will always be small
+  std::vector<
+      std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
+      uvm_assertions;
+  /// A single circular buffer holds information about every kernel launch the
+  /// process makes across all devices.
+  std::vector<CUDAKernelLaunchInfo> kernel_launches;
+  bool check_env_for_enable_launch_stacktracing() const;
+  bool check_env_for_dsa_enabled() const;
+
+ public:
+  CUDAKernelLaunchRegistry();
+  /// Register a new kernel launch and obtain a generation number back to be
+  /// passed to the kernel
+  uint32_t insert(
+      const char* launch_filename,
+      const char* launch_function,
+      const uint32_t launch_linenum,
+      const char* kernel_name,
+      const int32_t stream_id);
+  /// Get copies of the kernel launch registry and each device's assertion
+  /// failure buffer so they can be inspected without raising race conditions
+  std::
+      pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
+      snapshot() const;
+  /// Get a pointer to the current device's assertion failure buffer. If no such
+  /// buffer exists then one is created. This means that the first kernel launch
+  /// made on each device will be slightly slower because memory allocations are
+  /// required
+  DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
+  /// Gets the global singleton of the registry
+  static CUDAKernelLaunchRegistry& get_singleton_ref();
+  /// If not all devices support DSA, we disable it
+  const bool do_all_devices_support_managed_memory = false;
+  /// Whether or not to gather stack traces when launching kernels
+  bool gather_launch_stacktrace = false;
+  /// Whether or not host-side DSA is enabled or disabled at run-time
+  /// Device-side code cannot be adjusted at run-time
+  bool enabled = false;
+  /// Whether or not a device has indicated a failure
+  bool has_failed() const;
+  /// Since multiple mechanisms can enable/disable, we add a function that
+  /// aggregates them
+  bool is_enabled() const;
+};
+
+std::string c10_retrieve_device_side_assertion_info();
+
+} // namespace cuda
+} // namespace c10
+
+// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
+// requires the same input arguments. We introduce the following macro to
+// standardize these.
+#define TORCH_DSA_KERNEL_ARGS                             \
+  c10::cuda::DeviceAssertionsData *const assertions_data, \
+      uint32_t assertion_caller_id
+
+// This macro can be used to pass the DSA arguments onward to another
+// function
+#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id
diff --git a/c10/cuda/CUDAException.cpp b/c10/cuda/CUDAException.cpp
index 7813be5c1f665..b6e9b9e3606d8 100644
--- a/c10/cuda/CUDAException.cpp
+++ b/c10/cuda/CUDAException.cpp
@@ -1,5 +1,6 @@
 #include <c10/cuda/CUDAException.h>
 
+#include <c10/cuda/CUDADeviceAssertionHost.h>
 #include <c10/util/Exception.h>
 #include <cuda_runtime.h>
 
@@ -13,19 +14,27 @@ void c10_cuda_check_implementation(
     const char* function_name,
     const int line_number,
     const bool include_device_assertions) {
-  // We retrieve the error here in order to keep CUDA data types out of
-  // CUDAException.h thereby simplifying including it in other files
-  const cudaError_t err = cudaGetLastError();
+  const auto cuda_error = cudaGetLastError();
+  const auto cuda_kernel_failure = include_device_assertions
+      ? c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().has_failed()
+      : false;
 
-  if (C10_LIKELY(err == cudaSuccess)) {
+  if (C10_LIKELY(cuda_error == cudaSuccess && !cuda_kernel_failure)) {
     return;
   }
 
   std::string check_message;
 #ifndef STRIP_ERROR_MESSAGES
   check_message.append("CUDA error: ");
-  check_message.append(cudaGetErrorString(err));
+  check_message.append(cudaGetErrorString(cuda_error));
   check_message.append(c10::cuda::get_cuda_check_suffix());
+  check_message.append("\n");
+  if (include_device_assertions) {
+    check_message.append(c10_retrieve_device_side_assertion_info());
+  } else {
+    check_message.append(
+        "Device-side assertions were explicitly omitted for this error check; the error probably arose while initializing the DSA handlers.");
+  }
 #endif
 
   TORCH_CHECK(false, check_message);
diff --git a/c10/cuda/CUDAException.h b/c10/cuda/CUDAException.h
index ddc1eeeabf722..101036c4ae9eb 100644
--- a/c10/cuda/CUDAException.h
+++ b/c10/cuda/CUDAException.h
@@ -1,9 +1,11 @@
 #pragma once
 
+#include <c10/cuda/CUDADeviceAssertionHost.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/cuda/CUDAMiscFunctions.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
+#include <c10/util/irange.h>
 #include <cuda.h>
 
 // Note [CHECK macro]
@@ -22,17 +24,17 @@ class C10_CUDA_API CUDAError : public c10::Error {
 };
 } // namespace c10
 
-#define C10_CUDA_CHECK(EXPR)                                               \
-  do {                                                                     \
-    const cudaError_t __err = EXPR;                                        \
-    if (C10_UNLIKELY(__err != cudaSuccess)) {                              \
-      c10::cuda::c10_cuda_check_implementation(                            \
-          __FILE__,                                                        \
-          __func__, /* Line number's data type is not well-defined between \
-                       compilers, so we perform an explicit cast */        \
-          static_cast<uint32_t>(__LINE__),                                 \
-          true);                                                           \
-    }                                                                      \
+#define C10_CUDA_CHECK(EXPR)                                             \
+  do {                                                                   \
+    /* We get & disarm the error inside of */                            \
+    /* `c10_cuda_check_implementation` */                                \
+    C10_UNUSED const cudaError_t __err = EXPR;                           \
+    c10::cuda::c10_cuda_check_implementation(                            \
+        __FILE__,                                                        \
+        __func__, /* Line number's data type is not well-defined between \
+                      compilers, so we perform an explicit cast */       \
+        static_cast<uint32_t>(__LINE__),                                 \
+        true);                                                           \
   } while (0)
 
 #define C10_CUDA_CHECK_WARN(EXPR)                              \
@@ -70,6 +72,21 @@ class C10_CUDA_API CUDAError : public c10::Error {
 // diagnostic if it didn't.
 #define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError())
 
+/// Launches a CUDA kernel appending to it all the information need to handle
+/// device-side assertion failures. Checks that the launch was successful.
+#define TORCH_DSA_KERNEL_LAUNCH(                                      \
+    kernel, blocks, threads, shared_mem, stream, ...)                 \
+  do {                                                                \
+    auto& launch_registry =                                           \
+        c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref();     \
+    kernel<<<blocks, threads, shared_mem, stream>>>(                  \
+        __VA_ARGS__,                                                  \
+        launch_registry.get_uvm_assertions_ptr_for_current_device(),  \
+        launch_registry.insert(                                       \
+            __FILE__, __FUNCTION__, __LINE__, #kernel, stream.id())); \
+    C10_CUDA_KERNEL_LAUNCH_CHECK();                                   \
+  } while (0)
+
 namespace c10 {
 namespace cuda {
 
diff --git a/c10/cuda/impl/CUDATest.cpp b/c10/cuda/impl/CUDATest.cpp
index fb58d1c3a0f8f..c5d9e3f1bf2b0 100644
--- a/c10/cuda/impl/CUDATest.cpp
+++ b/c10/cuda/impl/CUDATest.cpp
@@ -11,7 +11,7 @@ namespace impl {
 
 bool has_cuda_gpu() {
   int count;
-  C10_CUDA_CHECK(cudaGetDeviceCount(&count));
+  C10_CUDA_IGNORE_ERROR(cudaGetDeviceCount(&count));
 
   return count != 0;
 }
diff --git a/c10/cuda/test/CMakeLists.txt b/c10/cuda/test/CMakeLists.txt
index 30d60871b8f12..eed7fdff42ca1 100644
--- a/c10/cuda/test/CMakeLists.txt
+++ b/c10/cuda/test/CMakeLists.txt
@@ -1,6 +1,13 @@
 # ---[ Test binaries.
 
 set(C10_CUDA_ALL_TEST_FILES
+    impl/CUDAAssertionsTest_1_var_test.cu
+    impl/CUDAAssertionsTest_catches_stream.cu
+    impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu
+    impl/CUDAAssertionsTest_from_2_processes.cu
+    impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu
+    impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu
+    impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu
     impl/CUDATest.cpp
 )
 if(BUILD_TEST)
diff --git a/c10/cuda/test/build.bzl b/c10/cuda/test/build.bzl
index b2d700820cc17..334b3a75b6aa7 100644
--- a/c10/cuda/test/build.bzl
+++ b/c10/cuda/test/build.bzl
@@ -1,10 +1,42 @@
+dsa_tests = [
+    "impl/CUDAAssertionsTest_1_var_test.cu",
+    "impl/CUDAAssertionsTest_catches_stream.cu",
+    "impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu",
+    "impl/CUDAAssertionsTest_from_2_processes.cu",
+    "impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu",
+    "impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu",
+    "impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu",
+]
+
 def define_targets(rules):
     rules.cc_test(
         name = "test",
-        srcs = ["impl/CUDATest.cpp"],
+        srcs = [
+            "impl/CUDATest.cpp",
+        ],
         deps = [
             "@com_google_googletest//:gtest_main",
             "//c10/cuda",
         ],
         target_compatible_with = rules.requires_cuda_enabled(),
     )
+
+    for src in dsa_tests:
+        name = src.replace("impl/", "").replace(".cu", "")
+        rules.cuda_library(
+            name = "test_" + name + "_lib",
+            srcs = [
+                src,
+            ],
+            deps = [
+                "@com_google_googletest//:gtest_main",
+                "//c10/cuda",
+            ],
+            target_compatible_with = rules.requires_cuda_enabled(),
+        )
+        rules.cc_test(
+            name = "test_" + name,
+            deps = [
+                ":test_" + name + "_lib",
+            ],
+        )
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_1_var_test.cu b/c10/cuda/test/impl/CUDAAssertionsTest_1_var_test.cu
new file mode 100644
index 0000000000000..f30774102a482
--- /dev/null
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_1_var_test.cu
@@ -0,0 +1,102 @@
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <c10/cuda/CUDADeviceAssertion.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <thread>
+
+using ::testing::HasSubstr;
+
+void did_not_fail_diagnostics() {
+#ifdef TORCH_USE_CUDA_DSA
+  std::cerr << "DSA was enabled" << std::endl;
+#else
+  std::cerr << "DSA was not enabled" << std::endl;
+#endif
+
+  std::cerr
+      << "c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = "
+      << c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled
+      << std::endl;
+  std::cerr
+      << "c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().is_enabled() = "
+      << c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().is_enabled()
+      << std::endl;
+  std::cerr
+      << "c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().do_all_devices_support_managed_memory = "
+      << c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref()
+             .do_all_devices_support_managed_memory
+      << std::endl;
+}
+
+/**
+ * Device kernel that takes a single integer parameter as argument and
+ * will always trigger a device side assertion.
+ */
+__global__ void cuda_always_fail_assertion_kernel(
+    const int a,
+    TORCH_DSA_KERNEL_ARGS) {
+  CUDA_KERNEL_ASSERT2(a != a);
+}
+
+/**
+ * TEST: Triggering device side assertion on a simple <<<1,1>>> config.
+ * kernel used takes only 1 variable as parameter function.
+ */
+void cuda_device_assertions_1_var_test() {
+  const auto stream = c10::cuda::getStreamFromPool();
+  TORCH_DSA_KERNEL_LAUNCH(
+      cuda_always_fail_assertion_kernel,
+      1, /* Blocks */
+      1, /* Threads */
+      0, /* Shared mem */
+      stream, /* Stream */
+      1);
+
+  try {
+    c10::cuda::device_synchronize();
+    did_not_fail_diagnostics();
+    throw std::runtime_error("Test didn't fail, but should have.");
+  } catch (const c10::Error& err) {
+    const auto err_str = std::string(err.what());
+    ASSERT_THAT(
+        err_str,
+        HasSubstr("CUDA device-side assertion failures were found on GPU #0!"));
+    ASSERT_THAT(
+        err_str, HasSubstr("Thread ID that failed assertion = [0,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [0,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Device that launched kernel = 0"));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Name of kernel launched that led to failure = cuda_always_fail_assertion_kernel"));
+    ASSERT_THAT(
+        err_str, HasSubstr("File containing kernel launch = " __FILE__));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Function containing kernel launch = " +
+            std::string(__FUNCTION__)));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Stream kernel was launched on = " + std::to_string(stream.id())));
+  }
+}
+
+TEST(CUDATest, cuda_device_assertions_1_var_test) {
+#ifdef TORCH_USE_CUDA_DSA
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  std::cerr << "BEFORE TEST" << std::endl;
+  did_not_fail_diagnostics();
+  cuda_device_assertions_1_var_test();
+#else
+  GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled.";
+#endif
+}
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu b/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu
new file mode 100644
index 0000000000000..71fcf3ee2491f
--- /dev/null
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_catches_stream.cu
@@ -0,0 +1,101 @@
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <c10/cuda/CUDADeviceAssertion.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <thread>
+
+using ::testing::HasSubstr;
+
+/**
+ * Device kernel that takes mulitple integer parameters as arguments and
+ * will always trigger a device side assertion.
+ */
+__global__ void cuda_multiple_vars_always_fail_assertion_kernel(
+    const int a,
+    const int b,
+    const int c,
+    const int d,
+    TORCH_DSA_KERNEL_ARGS) {
+  int i = a + b + c + d;
+  if (i != 0) {
+    CUDA_KERNEL_ASSERT2(i == -i);
+  } else {
+    CUDA_KERNEL_ASSERT2(i == i + 1);
+  }
+}
+
+/**
+ * Device kernel that takes a single integer parameter as argument and
+ * will always trigger a device side assertion.
+ */
+__global__ void cuda_always_fail_assertion_kernel(
+    const int a,
+    TORCH_DSA_KERNEL_ARGS) {
+  CUDA_KERNEL_ASSERT2(a != a);
+}
+
+/**
+ * TEST: Triggering device side assertion on a simple <<<1,1>>> config.
+ * kernel used takes multiple variables as parameters to the function.
+ */
+void cuda_device_assertions_catches_stream() {
+  const auto stream = c10::cuda::getStreamFromPool();
+  TORCH_DSA_KERNEL_LAUNCH(
+      cuda_multiple_vars_always_fail_assertion_kernel,
+      1, /* Blocks */
+      1, /* Threads */
+      0, /* Shared mem */
+      stream, /* Stream */
+      1, /* const int a */
+      2, /* const int b */
+      3, /* const int c */
+      4 /* const int d */
+  );
+
+  try {
+    c10::cuda::device_synchronize();
+    throw std::runtime_error("Test didn't fail, but should have.");
+  } catch (const c10::Error& err) {
+    const auto err_str = std::string(err.what());
+    ASSERT_THAT(
+        err_str, HasSubstr("# of GPUs this process interacted with = 1"));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr("CUDA device-side assertion failures were found on GPU #0!"));
+    ASSERT_THAT(
+        err_str, HasSubstr("Thread ID that failed assertion = [0,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [0,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Device that launched kernel = 0"));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Name of kernel launched that led to failure = cuda_multiple_vars_always_fail_assertion_kernel"));
+    ASSERT_THAT(
+        err_str, HasSubstr("File containing kernel launch = " __FILE__));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Function containing kernel launch = " +
+            std::string(__FUNCTION__)));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Stream kernel was launched on = " + std::to_string(stream.id())));
+  }
+}
+
+TEST(CUDATest, cuda_device_assertions_catches_stream) {
+#ifdef TORCH_USE_CUDA_DSA
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  cuda_device_assertions_catches_stream();
+#else
+  GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled.";
+#endif
+}
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu b/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu
new file mode 100644
index 0000000000000..1a0a0b475a0d9
--- /dev/null
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_catches_thread_and_block_and_device.cu
@@ -0,0 +1,86 @@
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <c10/cuda/CUDADeviceAssertion.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <thread>
+
+using ::testing::HasSubstr;
+
+/**
+ * Device kernel that takes 2 arguments
+ * @param bad_thread represents the thread we want to trigger assertion on.
+ * @param bad_block represents the block we want to trigger assertion on.
+ * This kernel will only trigger a device side assertion for <<bad_block,
+ * bad_thread>> pair. all the other blocks and threads pairs will basically be
+ * no-op.
+ */
+__global__ void cuda_device_assertions_fail_on_thread_block_kernel(
+    const int bad_thread,
+    const int bad_block,
+    TORCH_DSA_KERNEL_ARGS) {
+  if (threadIdx.x == bad_thread && blockIdx.x == bad_block) {
+    CUDA_KERNEL_ASSERT2(false); // This comparison necessarily needs to fail
+  }
+}
+
+/**
+ * TEST: Triggering device side assertion on only 1 thread from <<<1024,128>>>
+ * grid. kernel used is unique, it take 2 parameters to tell which particular
+ * block and thread it should assert, all the other theads of the kernel will be
+ * basically no-op.
+ */
+void cuda_device_assertions_catches_thread_and_block_and_device() {
+  const auto stream = c10::cuda::getStreamFromPool();
+  TORCH_DSA_KERNEL_LAUNCH(
+      cuda_device_assertions_fail_on_thread_block_kernel,
+      1024, /* Blocks */
+      128, /* Threads */
+      0, /* Shared mem */
+      stream, /* Stream */
+      29, /* bad thread */
+      937 /* bad block */
+  );
+
+  try {
+    c10::cuda::device_synchronize();
+    throw std::runtime_error("Test didn't fail, but should have.");
+  } catch (const c10::Error& err) {
+    const auto err_str = std::string(err.what());
+    ASSERT_THAT(
+        err_str, HasSubstr("Thread ID that failed assertion = [29,0,0]"));
+    ASSERT_THAT(
+        err_str, HasSubstr("Block ID that failed assertion = [937,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Device that launched kernel = 0"));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Name of kernel launched that led to failure = cuda_device_assertions_fail_on_thread_block_kernel"));
+    ASSERT_THAT(
+        err_str, HasSubstr("File containing kernel launch = " __FILE__));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Function containing kernel launch = " +
+            std::string(__FUNCTION__)));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Stream kernel was launched on = " + std::to_string(stream.id())));
+  }
+}
+
+TEST(CUDATest, cuda_device_assertions_catches_thread_and_block_and_device) {
+#ifdef TORCH_USE_CUDA_DSA
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  cuda_device_assertions_catches_thread_and_block_and_device();
+#else
+  GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled.";
+#endif
+}
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu b/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu
new file mode 100644
index 0000000000000..3f829259a6b03
--- /dev/null
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_from_2_processes.cu
@@ -0,0 +1,108 @@
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <c10/cuda/CUDADeviceAssertion.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <thread>
+
+using ::testing::HasSubstr;
+
+const auto max_assertions_failure_str =
+    "Assertion failure " + std::to_string(C10_CUDA_DSA_ASSERTION_COUNT - 1);
+
+/**
+ * Device kernel that takes a single integer parameter as argument and
+ * will always trigger a device side assertion.
+ */
+__global__ void cuda_always_fail_assertion_kernel(
+    const int a,
+    TORCH_DSA_KERNEL_ARGS) {
+  CUDA_KERNEL_ASSERT2(a != a);
+}
+
+/**
+ * Device kernel that takes a single integer parameter as argument and
+ * will never trigger a device side assertion.
+ */
+__global__ void cuda_always_succeed_assertion_kernel(
+    const int a,
+    TORCH_DSA_KERNEL_ARGS) {
+  CUDA_KERNEL_ASSERT2(a == a);
+}
+
+// Windows doesn't like `fork`
+#ifndef _MSC_VER
+/**
+ * TEST: Triggering device side assertion from 2 different processes from CPU.
+ * The following code is testing if two processes from CPU that are running
+ * GPU kernels (not necessarily simultaneously) and are asserting & writing
+ * to the respective UVMs, mess up anything for each other.
+ * Once parent process's kernel launch fails and causes a device-side assertion
+ * and is still alive when the second process is interacting with the GPU,
+ * trying to launch another kernel.
+ */
+void cuda_device_assertions_from_2_processes() {
+  const auto n1 = fork();
+  if (n1 == 0) {
+    // This is the parent process, that will call an assertion failure.
+    // This should execute before the child process.
+    // We are achieving this by putting the child process to sleep.
+    TORCH_DSA_KERNEL_LAUNCH(
+        cuda_always_fail_assertion_kernel,
+        1, /* Blocks */
+        1, /* Threads */
+        0, /* Shared mem */
+        c10::cuda::getStreamFromPool(), /* Stream */
+        1);
+    try {
+      c10::cuda::device_synchronize();
+      throw std::runtime_error("Test didn't fail, but should have.");
+    } catch (const c10::Error& err) {
+      const auto err_str = std::string(err.what());
+      ASSERT_THAT(
+          err_str,
+          HasSubstr(
+              "1 CUDA device-side assertion failures were found on GPU #0!"));
+    }
+    // Keep this alive so we can see what happened to the other process
+    std::this_thread::sleep_for(std::chrono::milliseconds(3000));
+  } else {
+    // This is the child process
+    // We put it to sleep for next 2 seconds, to make sure that the parent has
+    // asserted a failure already.
+    std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+    TORCH_DSA_KERNEL_LAUNCH(
+        cuda_always_succeed_assertion_kernel,
+        1, /* Blocks */
+        1, /* Threads */
+        0, /* Shared mem */
+        c10::cuda::getStreamFromPool(), /* Stream */
+        1);
+    try {
+      c10::cuda::device_synchronize();
+    } catch (const c10::Error& err) {
+      ASSERT_TRUE(false); // This kernel should not have failed, but did.
+    }
+    // End the child process
+    exit(0);
+  }
+}
+
+TEST(CUDATest, cuda_device_assertions_from_2_processes) {
+#ifdef TORCH_USE_CUDA_DSA
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  cuda_device_assertions_from_2_processes();
+#else
+  GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled.";
+#endif
+}
+
+#else
+
+#endif
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu
new file mode 100644
index 0000000000000..f5f8597f20c9a
--- /dev/null
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_blocks_and_threads.cu
@@ -0,0 +1,93 @@
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <c10/cuda/CUDADeviceAssertion.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <thread>
+
+using ::testing::HasSubstr;
+
+const auto max_assertions_failure_str =
+    "Assertion failure " + std::to_string(C10_CUDA_DSA_ASSERTION_COUNT - 1);
+
+/**
+ * Device kernel that takes a single integer parameter as argument and
+ * will always trigger a device side assertion.
+ */
+__global__ void cuda_always_fail_assertion_kernel(
+    const int a,
+    TORCH_DSA_KERNEL_ARGS) {
+  CUDA_KERNEL_ASSERT2(a != a);
+}
+
+/**
+ * TEST: Triggering device side assertion from multiple block but single thread
+ * <<<10,128>>>. Here we are triggering assertion on 10 blocks, each with only
+ * 128 thread.
+ */
+void cuda_device_assertions_multiple_writes_from_blocks_and_threads() {
+  bool run_threads = false;
+
+  // Create a function to launch kernel that waits for a signal, to try to
+  // ensure everything is happening simultaneously
+  const auto launch_the_kernel = [&]() {
+    // Busy loop waiting for the signal to go
+    while (!run_threads) {
+    }
+
+    TORCH_DSA_KERNEL_LAUNCH(
+        cuda_always_fail_assertion_kernel,
+        10, /* Blocks */
+        128, /* Threads */
+        0, /* Shared mem */
+        c10::cuda::getCurrentCUDAStream(), /* Stream */
+        1);
+  };
+
+  // Spin up a bunch of busy-looping threads
+  std::vector<std::thread> threads;
+  for (int i = 0; i < 10; i++) {
+    threads.emplace_back(launch_the_kernel);
+  }
+
+  // Paranoid - wait for all the threads to get setup
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+  // Mash
+  run_threads = true;
+
+  // Clean-up
+  for (auto& x : threads) {
+    x.join();
+  }
+
+  try {
+    c10::cuda::device_synchronize();
+    throw std::runtime_error("Test didn't fail, but should have.");
+  } catch (const c10::Error& err) {
+    const auto err_str = std::string(err.what());
+    ASSERT_THAT(err_str, HasSubstr(max_assertions_failure_str));
+    ASSERT_THAT(err_str, HasSubstr("Device that launched kernel = 0"));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Name of kernel launched that led to failure = cuda_always_fail_assertion_kernel"));
+    ASSERT_THAT(
+        err_str, HasSubstr("File containing kernel launch = " __FILE__));
+  }
+}
+
+TEST(CUDATest, cuda_device_assertions_multiple_writes_from_blocks_and_threads) {
+#ifdef TORCH_USE_CUDA_DSA
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  cuda_device_assertions_multiple_writes_from_blocks_and_threads();
+#else
+  GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled.";
+#endif
+}
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu
new file mode 100644
index 0000000000000..a66c792d5a236
--- /dev/null
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_multiple_blocks.cu
@@ -0,0 +1,90 @@
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <c10/cuda/CUDADeviceAssertion.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <thread>
+
+using ::testing::HasSubstr;
+
+const auto max_assertions_failure_str =
+    "Assertion failure " + std::to_string(C10_CUDA_DSA_ASSERTION_COUNT - 1);
+
+/**
+ * Device kernel that takes a single integer parameter as argument and
+ * will always trigger a device side assertion.
+ */
+__global__ void cuda_always_fail_assertion_kernel(
+    const int a,
+    TORCH_DSA_KERNEL_ARGS) {
+  CUDA_KERNEL_ASSERT2(a != a);
+}
+
+/**
+ * TEST: Triggering device side assertion from multiple block but single thread
+ * <<<10,1>>>. Here we are triggering assertion on 10 blocks, each with only 1
+ * thread. Since we have more than 10 SM on a GPU, we expect each block to be
+ * executed and successfully assert, Hence we will see assertions logged from
+ * each block here.
+ */
+void cuda_device_assertions_multiple_writes_from_multiple_blocks() {
+  const auto stream = c10::cuda::getStreamFromPool();
+  TORCH_DSA_KERNEL_LAUNCH(
+      cuda_always_fail_assertion_kernel,
+      10, /* Blocks */
+      1, /* Threads */
+      0, /* Shared mem */
+      stream, /* Stream */
+      1);
+
+  try {
+    c10::cuda::device_synchronize();
+    throw std::runtime_error("Test didn't fail, but should have.");
+  } catch (const c10::Error& err) {
+    const auto err_str = std::string(err.what());
+    ASSERT_THAT(err_str, HasSubstr(max_assertions_failure_str));
+    ASSERT_THAT(
+        err_str, HasSubstr("Thread ID that failed assertion = [0,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [0,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [1,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [2,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [3,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [4,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [5,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [6,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [7,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [8,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [9,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Device that launched kernel = 0"));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Name of kernel launched that led to failure = cuda_always_fail_assertion_kernel"));
+    ASSERT_THAT(
+        err_str, HasSubstr("File containing kernel launch = " __FILE__));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Function containing kernel launch = " +
+            std::string(__FUNCTION__)));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Stream kernel was launched on = " + std::to_string(stream.id())));
+  }
+}
+
+TEST(CUDATest, cuda_device_assertions_multiple_writes_from_multiple_blocks) {
+#ifdef TORCH_USE_CUDA_DSA
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  cuda_device_assertions_multiple_writes_from_multiple_blocks();
+#else
+  GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled.";
+#endif
+}
diff --git a/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu
new file mode 100644
index 0000000000000..f1e39c8ba19d9
--- /dev/null
+++ b/c10/cuda/test/impl/CUDAAssertionsTest_multiple_writes_from_same_block.cu
@@ -0,0 +1,78 @@
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <c10/cuda/CUDADeviceAssertion.h>
+#include <c10/cuda/CUDAException.h>
+#include <c10/cuda/CUDAFunctions.h>
+#include <c10/cuda/CUDAStream.h>
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <thread>
+
+using ::testing::HasSubstr;
+
+const auto max_assertions_failure_str =
+    "Assertion failure " + std::to_string(C10_CUDA_DSA_ASSERTION_COUNT - 1);
+
+/**
+ * Device kernel that takes a single integer parameter as argument and
+ * will always trigger a device side assertion.
+ */
+__global__ void cuda_always_fail_assertion_kernel(
+    const int a,
+    TORCH_DSA_KERNEL_ARGS) {
+  CUDA_KERNEL_ASSERT2(a != a);
+}
+
+/**
+ * TEST: Triggering device side assertion from single block and multiple threads
+ * <<<1,128>>>. Once the very first thread asserts all the other threads will
+ * basically be in bad state and the block id with failed asseriton would be
+ * [0,0,0].
+ */
+void cuda_device_assertions_multiple_writes_from_same_block() {
+  const auto stream = c10::cuda::getStreamFromPool();
+  TORCH_DSA_KERNEL_LAUNCH(
+      cuda_always_fail_assertion_kernel,
+      1, /* Blocks */
+      128, /* Threads */
+      0, /* Shared mem */
+      stream, /* Stream */
+      1);
+
+  try {
+    c10::cuda::device_synchronize();
+    throw std::runtime_error("Test didn't fail, but should have.");
+  } catch (const c10::Error& err) {
+    const auto err_str = std::string(err.what());
+    ASSERT_THAT(err_str, HasSubstr(max_assertions_failure_str));
+    ASSERT_THAT(err_str, HasSubstr("Block ID that failed assertion = [0,0,0]"));
+    ASSERT_THAT(err_str, HasSubstr("Device that launched kernel = 0"));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Name of kernel launched that led to failure = cuda_always_fail_assertion_kernel"));
+    ASSERT_THAT(
+        err_str, HasSubstr("File containing kernel launch = " __FILE__));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Function containing kernel launch = " +
+            std::string(__FUNCTION__)));
+    ASSERT_THAT(
+        err_str,
+        HasSubstr(
+            "Stream kernel was launched on = " + std::to_string(stream.id())));
+  }
+}
+
+TEST(CUDATest, cuda_device_assertions_multiple_writes_from_same_block) {
+#ifdef TORCH_USE_CUDA_DSA
+  c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().enabled = true;
+  cuda_device_assertions_multiple_writes_from_same_block();
+#else
+  GTEST_SKIP() << "CUDA device-side assertions (DSA) was not enabled.";
+#endif
+}
diff --git a/tools/bazel.bzl b/tools/bazel.bzl
index f7da1839930d2..3c6f98154aebb 100644
--- a/tools/bazel.bzl
+++ b/tools/bazel.bzl
@@ -1,5 +1,5 @@
 load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
-load("@rules_cuda//cuda:defs.bzl", "requires_cuda_enabled")
+load("@rules_cuda//cuda:defs.bzl", "cuda_library", "requires_cuda_enabled")
 load("//c10/macros:cmake_configure_file.bzl", "cmake_configure_file")
 load("//tools/config:defs.bzl", "if_cuda")
 
@@ -25,6 +25,7 @@ rules = struct(
     cc_library = cc_library,
     cc_test = cc_test,
     cmake_configure_file = cmake_configure_file,
+    cuda_library = cuda_library,
     filegroup = native.filegroup,
     genrule = _genrule,
     glob = native.glob,
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 9a3065c675a28..1afb7c37fc6b7 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -4128,6 +4128,7 @@
             "cudaStreamGetPriority",
             ("hipStreamGetPriority", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED),
         ),
+        ("cudaCpuDeviceId", ("hipCpuDeviceId", CONV_TYPE, API_RUNTIME)),
         ("cudaStreamDefault", ("hipStreamDefault", CONV_TYPE, API_RUNTIME)),
         ("cudaStreamNonBlocking", ("hipStreamNonBlocking", CONV_TYPE, API_RUNTIME)),
         ("cudaDeviceSynchronize", ("hipDeviceSynchronize", CONV_DEVICE, API_RUNTIME)),
@@ -8270,6 +8271,8 @@
     [
         ("cuda::compat::", ("hip::compat::", API_C10)),
         ("c10/cuda/CUDAAlgorithm.h", ("c10/hip/HIPAlgorithm.h", API_C10)),
+        ("c10/cuda/CUDADeviceAssertion.h", ("c10/hip/HIPDeviceAssertion.h", API_C10)),
+        ("c10/cuda/CUDADeviceAssertionHost.h", ("c10/hip/HIPDeviceAssertionHost.h", API_C10)),
         ("c10/cuda/CUDAException.h", ("c10/hip/HIPException.h", API_C10)),
         ("c10/cuda/CUDAMacros.h", ("c10/hip/HIPMacros.h", API_C10)),
         ("c10/cuda/CUDAMathCompat.h", ("c10/hip/HIPMathCompat.h", API_C10)),

From 7bfeffe27f3660d8cb593ed467414b3306c05659 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Thu, 8 Dec 2022 01:39:23 +0000
Subject: [PATCH 1702/1922] Fix issue 38095 TODO in test_jit_fuser_te.py
 (#90246)

Fix TODO related to https://github.com/pytorch/pytorch/issues/38095
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90246
Approved by: https://github.com/clee2000
---
 test/test_jit_fuser_te.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py
index 6839837218117..4cfcfbe4b315c 100644
--- a/test/test_jit_fuser_te.py
+++ b/test/test_jit_fuser_te.py
@@ -1046,8 +1046,7 @@ def fn_test_rand2(x, y):
         script_f = torch.jit.script(fn_test_rand2)
         warmup_forward(script_f, x, y)
         out = script_f(x, y)
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(out[0, :] + torch.zeros(4, 4, device='cuda'), out)
+        self.assertEqual(out[0, :] + torch.zeros(4, 4, device='cuda'), out)
 
     @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA")
     @unittest.skip("rand_like is not supported yet")

From cbc8d58bd44360870de10db44ba07c3c1a014066 Mon Sep 17 00:00:00 2001
From: Will Constable <whc@fb.com>
Date: Wed, 7 Dec 2022 21:53:23 +0000
Subject: [PATCH 1703/1922] Revert "Disable dynamo tracing torchrec.distributed
 (#90087)" (#90416)

This reverts commit 7e9a8a1361a090cee86544a3c029b9b4ed622e9c.

This revert fixes a torchbench dlrm amp crash.  Auto revert fails due to conflict.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90416
Approved by: https://github.com/yanboliang, https://github.com/malfet
---
 torch/_dynamo/skipfiles.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/torch/_dynamo/skipfiles.py b/torch/_dynamo/skipfiles.py
index 3f90fce3c3525..41a04626756d2 100644
--- a/torch/_dynamo/skipfiles.py
+++ b/torch/_dynamo/skipfiles.py
@@ -146,10 +146,7 @@ def add(import_name: str):
     if isinstance(import_name, types.ModuleType):
         return add(import_name.__name__)
     assert isinstance(import_name, str)
-    try:
-        module_spec = importlib.util.find_spec(import_name)
-    except Exception:
-        return
+    module_spec = importlib.util.find_spec(import_name)
     if not module_spec:
         return
     origin = module_spec.origin
@@ -192,7 +189,6 @@ def check(filename, allow_torch=False):
     "tvm",
     "fx2trt_oss",
     "xarray",
-    "torchrec.distributed",
 ):
     add(_name)
 

From 90997265429c3a66a3b662160d5c586f5ab89afa Mon Sep 17 00:00:00 2001
From: Digant Desai <digantdesai@meta.com>
Date: Thu, 8 Dec 2022 02:02:53 +0000
Subject: [PATCH 1704/1922] [pthreadpool] Set max threadlimit to tsan limit
 (#89453)

Summary:
This will make sure we don't run into an internal assert for clang tsan which has a cap of 63 on concurrently held lock count.
Seems like it is failing with 64 since the comparison is `<`, so setting it to 63 here.

```
llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_deadlock_detector.h:67 "((n_all_locks_)) < (((sizeof(all_locks_with_contexts_)/sizeof((all_locks_with_contexts_)[0]))))"
```

Created from CodeHub with https://fburl.com/edit-in-codehub

Test Plan:
CI

Sandcastle run

Reviewed By: kimishpatel, salilsdesai

Differential Revision: D41444710

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89453
Approved by: https://github.com/mcr229
---
 caffe2/utils/threadpool/ThreadPool.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/caffe2/utils/threadpool/ThreadPool.cc b/caffe2/utils/threadpool/ThreadPool.cc
index cbccf0749bef1..79fc279f3591b 100644
--- a/caffe2/utils/threadpool/ThreadPool.cc
+++ b/caffe2/utils/threadpool/ThreadPool.cc
@@ -103,12 +103,13 @@ size_t getDefaultNumThreads() {
 
   /*
    * For llvm-tsan, holding limit for the number of locks for a single thread
-   * is 64. pthreadpool's worst case is the number of threads in a pool. So we
-   * want to limit the threadpool size to 64 when running with tsan. However,
-   * sometimes it is tricky to detect if we are running under tsan, for now
-   * capping the default threadcount to the tsan limit unconditionally.
+   * is 63 (because of comparison < 64 instead of <=). pthreadpool's worst
+   * case is the number of threads in a pool. So we want to limit the threadpool
+   * size to 64 when running with tsan. However, sometimes it is tricky to
+   * detect if we are running under tsan, for now capping the default
+   * threadcount to the tsan limit unconditionally.
    */
-  int tsanThreadLimit = 64;
+  int tsanThreadLimit = 63;
   numThreads = std::min(numThreads, tsanThreadLimit);
 
   return numThreads;

From 293ade79a341310bce20fee26f5f2f1f2f664453 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Thu, 8 Dec 2022 02:27:48 +0000
Subject: [PATCH 1705/1922] Migrate PyTorch to C++17 (#85969)

With CUDA-10.2 gone we can finally do it!

This PR mostly contains build system related changes, invasive functional ones are to be followed.
Among many expected tweaks to the build system, here are few unexpected ones:
 - Force onnx_proto project to be updated to C++17 to avoid `duplicate symbols` error when compiled by gcc-7.5.0, as storage rule for `constexpr` changed in C++17, but gcc does not seem to follow it
 - Do not use `std::apply` on CUDA but rely on the built-in variant, as it results in test failures when CUDA runtime picks host rather than device function when `std::apply` is invoked from CUDA code.
 - `std::decay_t` -> `::std::decay_t` and `std::move`->`::std::move` as VC++ for some reason claims that `std` symbol is ambigious
 - Disable use of `std::aligned_alloc` on Android, as its `libc++` does not implement it.

Some prerequisites:
 - https://github.com/pytorch/pytorch/pull/89297
 - https://github.com/pytorch/pytorch/pull/89605
 - https://github.com/pytorch/pytorch/pull/90228
 - https://github.com/pytorch/pytorch/pull/90389
 - https://github.com/pytorch/pytorch/pull/90379
 - https://github.com/pytorch/pytorch/pull/89570
 - https://github.com/facebookincubator/gloo/pull/336
 - https://github.com/facebookincubator/gloo/pull/343
 - https://github.com/pytorch/builder/commit/919676fb32fa751f1589d95e0d3b76489d942d80

Fixes https://github.com/pytorch/pytorch/issues/56055

Pull Request resolved: https://github.com/pytorch/pytorch/pull/85969
Approved by: https://github.com/ezyang, https://github.com/kulinseth
---
 .circleci/scripts/build_android_gradle.sh      |  5 +++++
 CMakeLists.txt                                 |  6 ++----
 android/pytorch_android/CMakeLists.txt         |  2 +-
 .../pytorch_android_torchvision/CMakeLists.txt |  2 +-
 android/test_app/app/CMakeLists.txt            |  2 +-
 .../impl/make_boxed_from_unboxed_functor.h     |  6 ++++--
 aten/src/ATen/native/cpu/GridSamplerKernel.cpp |  4 ++--
 aten/src/ATen/native/cuda/jit_utils.cpp        |  4 ++--
 c10/CMakeLists.txt                             |  2 +-
 c10/util/C++17.h                               |  2 +-
 cmake/Dependencies.cmake                       |  7 +++++--
 cmake/ProtoBufPatch.cmake                      | 14 ++++++++------
 cmake/public/utils.cmake                       |  2 +-
 functorch/CMakeLists.txt                       |  2 +-
 ios/TestApp/TestApp.xcodeproj/project.pbxproj  |  4 ++--
 scripts/build_android.sh                       |  5 +++++
 test/custom_backend/CMakeLists.txt             |  4 ++--
 test/custom_operator/CMakeLists.txt            |  4 ++--
 test/jit_hooks/CMakeLists.txt                  |  2 +-
 test/mobile/custom_build/CMakeLists.txt        |  2 +-
 torch/_inductor/codecache.py                   |  2 +-
 torch/csrc/jit/codegen/cuda/executor_utils.cpp |  4 ++--
 .../jit/codegen/fuser/cpu/fused_kernel.cpp     |  2 +-
 .../jit/codegen/fuser/cuda/fused_kernel.cpp    |  4 ++--
 torch/csrc/jit/tensorexpr/cuda_codegen.cpp     |  4 ++--
 torch/lib/libshm/CMakeLists.txt                |  2 +-
 torch/utils/cpp_extension.py                   | 18 +++++++++---------
 27 files changed, 66 insertions(+), 51 deletions(-)

diff --git a/.circleci/scripts/build_android_gradle.sh b/.circleci/scripts/build_android_gradle.sh
index 598e9cd0a6bd2..8312a18eb0aad 100755
--- a/.circleci/scripts/build_android_gradle.sh
+++ b/.circleci/scripts/build_android_gradle.sh
@@ -20,6 +20,11 @@ do
   touch "$file" || true
 done < <(find /var/lib/jenkins/.gradle -type f -print0)
 
+# Patch pocketfft (as Android does not have aligned_alloc even if compiled with c++17
+if [ -f ~/workspace/third_party/pocketfft/pocketfft_hdronly.h ]; then
+  sed -i -e "s/#if __cplusplus >= 201703L/#if 0/" ~/workspace/third_party/pocketfft/pocketfft_hdronly.h
+fi
+
 export GRADLE_LOCAL_PROPERTIES=~/workspace/android/local.properties
 rm -f $GRADLE_LOCAL_PROPERTIES
 echo "sdk.dir=/opt/android/sdk" >> $GRADLE_LOCAL_PROPERTIES
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 784b528417041..003fe7fa3d1b5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,9 +31,9 @@ string(FIND "${CMAKE_CXX_FLAGS}" "-std=c++" env_cxx_standard)
 if(env_cxx_standard GREATER -1)
   message(
       WARNING "C++ standard version definition detected in environment variable."
-      "PyTorch requires -std=c++14. Please remove -std=c++ settings in your environment.")
+      "PyTorch requires -std=c++17. Please remove -std=c++ settings in your environment.")
 endif()
-set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ standard whose features are requested to build this target.")
+set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.")
 set(CMAKE_C_STANDARD   11 CACHE STRING "The C standard whose features are requested to build this target.")
 
 if(DEFINED GLIBCXX_USE_CXX11_ABI)
@@ -884,7 +884,6 @@ if(NOT MSVC)
     append_cxx_flag_if_supported("-Wno-unused-private-field" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-inconsistent-missing-override" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-aligned-allocation-unavailable" CMAKE_CXX_FLAGS)
-    append_cxx_flag_if_supported("-Wno-c++14-extensions" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-constexpr-not-const" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wunused-lambda-capture" CMAKE_CXX_FLAGS)
@@ -989,7 +988,6 @@ if(APPLE)
     endif()
     append_cxx_flag_if_supported("-Wno-unused-private-field" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-missing-braces" CMAKE_CXX_FLAGS)
-    append_cxx_flag_if_supported("-Wno-c++14-extensions" CMAKE_CXX_FLAGS)
     append_cxx_flag_if_supported("-Wno-constexpr-not-const" CMAKE_CXX_FLAGS)
 endif()
 
diff --git a/android/pytorch_android/CMakeLists.txt b/android/pytorch_android/CMakeLists.txt
index ad2647c2f4df6..9691d5694c441 100644
--- a/android/pytorch_android/CMakeLists.txt
+++ b/android/pytorch_android/CMakeLists.txt
@@ -14,7 +14,7 @@ endif()
 
 include(GNUInstallDirs)
 
-set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ standard whose features are requested to build this target.")
+set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.")
 set(CMAKE_VERBOSE_MAKEFILE ON)
 message(STATUS "ANDROID_STL:${ANDROID_STL}")
 
diff --git a/android/pytorch_android_torchvision/CMakeLists.txt b/android/pytorch_android_torchvision/CMakeLists.txt
index 08de7cebde491..849e4d07cc1d5 100644
--- a/android/pytorch_android_torchvision/CMakeLists.txt
+++ b/android/pytorch_android_torchvision/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.4.1)
 project(pytorch_vision_jni CXX)
-set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ standard whose features are requested to build this target.")
+set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.")
 set(CMAKE_VERBOSE_MAKEFILE ON)
 
 set(pytorch_vision_cpp_DIR ${CMAKE_CURRENT_LIST_DIR}/src/main/cpp)
diff --git a/android/test_app/app/CMakeLists.txt b/android/test_app/app/CMakeLists.txt
index 457ccbe189bd7..cfdc4976ef48d 100644
--- a/android/test_app/app/CMakeLists.txt
+++ b/android/test_app/app/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.4.1)
 set(PROJECT_NAME pytorch_testapp_jni)
 project(${PROJECT_NAME} CXX)
-set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ standard whose features are requested to build this target.")
+set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.")
 set(CMAKE_VERBOSE_MAKEFILE ON)
 
 set(build_DIR ${CMAKE_SOURCE_DIR}/build)
diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
index a99f45040788d..bf2a8819f989b 100644
--- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
+++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
@@ -577,14 +577,16 @@ namespace impl {
         // Decay ReturnType to ReturnType_ so that if a reference gets returned, we actually store it by value
         // and don't get a dangling reference. This is only required because some kernels still return `Tensor&`.
 #ifdef __cpp_if_constexpr
-        using ReturnType_ = std::decay_t<ReturnType>;
+        // [Note: VC++ and 'std': ambiguous symbol]
+        using ReturnType_ = ::std::decay_t<ReturnType>;
         ReturnType_ output = call_functor_with_args_from_stack<KernelFunctor, AllowDeprecatedTypes>(functor, dispatchKeySet, stack);
 #else
         using ReturnType_ = std::decay_t<typename decltype(delay_check)::template type_identity<ReturnType>>;
         ReturnType_ output = call_functor_with_args_from_stack<KernelFunctor, AllowDeprecatedTypes>(functor, dispatchKeySet, delay_check(stack));
 #endif
         torch::jit::drop(*stack, num_inputs);
-        push_outputs<ReturnType_, AllowDeprecatedTypes>::call(std::move(output), stack);
+        // See note [ VC++ and 'std': ambiguous symbol]
+        push_outputs<ReturnType_, AllowDeprecatedTypes>::call(::std::move(output), stack);
 #ifdef __cpp_if_constexpr
       } else {
 #else
diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp
index 47b20b2ca4c18..c80c5d2f000d6 100644
--- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp
+++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp
@@ -511,8 +511,8 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear,
     auto sw = n * e;
     auto se = n * w;
 
-    auto i_x_w = convert_to_int_of_same_size(x_w);
-    auto i_y_n = convert_to_int_of_same_size(y_n);
+    auto i_x_w = convert_to_int_of_same_size<scalar_t>(x_w);
+    auto i_y_n = convert_to_int_of_same_size<scalar_t>(y_n);
     auto i_x_e = i_x_w + iVec(1);
     auto i_y_s = i_y_n + iVec(1);
 
diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp
index a1266fb1b5044..870e980bb69ee 100644
--- a/aten/src/ATen/native/cuda/jit_utils.cpp
+++ b/aten/src/ATen/native/cuda/jit_utils.cpp
@@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(
       &program, code.c_str(), nullptr, 0, nullptr, nullptr));
 
 #ifdef USE_ROCM
-  std::vector<const char*> args = {"--std=c++14"};
+  std::vector<const char*> args = {"--std=c++17"};
 #else
   // Constructs nvrtc build arguments
   // CUDA 11.1 allows going directly to SASS (sm_) instead of PTX (compute_)
@@ -1547,7 +1547,7 @@ NvrtcFunction jit_pwise_function(
       std::to_string(cuda_minor);
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::vector<const char*> args = {
-      "--std=c++14", compute.c_str(), "-default-device"};
+      "--std=c++17", compute.c_str(), "-default-device"};
 #endif
 
   #ifndef NDEBUG
diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt
index 0309d7a2d712e..9c80fa9051ab6 100644
--- a/c10/CMakeLists.txt
+++ b/c10/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
 project(c10 CXX)
 
-set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ standard whose features are requested to build this target.")
+set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.")
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Main build file for the C10 library.
diff --git a/c10/util/C++17.h b/c10/util/C++17.h
index 5227578481dea..09259ab840bed 100644
--- a/c10/util/C++17.h
+++ b/c10/util/C++17.h
@@ -140,7 +140,7 @@ using void_t = typename make_void<Ts...>::type;
 #define CUDA_HOST_DEVICE C10_HOST_DEVICE
 #endif
 
-#ifdef __cpp_lib_apply
+#if defined(__cpp_lib_apply) && !defined(__CUDA_ARCH__)
 
 template <class F, class Tuple>
 CUDA_HOST_DEVICE inline constexpr decltype(auto) apply(F&& f, Tuple&& t) {
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 918344e3d2ba0..8faeb401017b8 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1321,7 +1321,7 @@ if(USE_ROCM)
     list(APPEND HIP_CXX_FLAGS -Wno-implicit-int-float-conversion)
     list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN)
     list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)
-    list(APPEND HIP_CXX_FLAGS -std=c++14)
+    list(APPEND HIP_CXX_FLAGS -std=c++17)
     add_definitions(-DROCM_VERSION=${ROCM_VERSION_DEV_INT})
     add_definitions(-DTORCH_HIP_VERSION=${TORCH_HIP_VERSION})
     message("TORCH_HIP_VERSION=${TORCH_HIP_VERSION} is added as a compiler defines")
@@ -1585,6 +1585,9 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX)
   add_subdirectory("${CMAKE_CURRENT_LIST_DIR}/../caffe2/onnx/torch_ops")
   if(NOT USE_SYSTEM_ONNX)
     add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/onnx EXCLUDE_FROM_ALL)
+    if(NOT MSVC)
+      set_target_properties(onnx_proto PROPERTIES CXX_STANDARD 17)
+    endif()
   endif()
   add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/foxi EXCLUDE_FROM_ALL)
 
@@ -1687,7 +1690,7 @@ if(NOT INTERN_BUILD_MOBILE)
   string(APPEND CMAKE_CUDA_FLAGS " -Wno-deprecated-gpu-targets --expt-extended-lambda")
 
   if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ standard whose features are requested to build this target.")
+    set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.")
   endif()
 
   # use cub in a safe manner, see:
diff --git a/cmake/ProtoBufPatch.cmake b/cmake/ProtoBufPatch.cmake
index 7f1de9a4a1de9..42696a0a068fc 100644
--- a/cmake/ProtoBufPatch.cmake
+++ b/cmake/ProtoBufPatch.cmake
@@ -31,12 +31,14 @@ if(NOT SYSTEM_PROTOBUF)
   # https://github.com/protocolbuffers/protobuf/commit/0400cca3236de1ca303af38bf81eab332d042b7c
   # changes PROTOBUF_CONSTEXPR to constexpr, which breaks windows
   # build.
-  string(
-    REGEX REPLACE
-    "static constexpr ([^ ]+) ([^ ]+) ="
-    "static \\1 const \\2 ="
-    content
-    "${content}")
+  if(MSVC)
+    string(
+      REGEX REPLACE
+      "static constexpr ([^ ]+) ([^ ]+) ="
+      "static \\1 const \\2 ="
+      content
+      "${content}")
+  endif()
 
   foreach(ns ${NAMESPACES})
     # Insert "const ::std::string& GetEmptyStringAlreadyInited();" within
diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake
index 5944a5a1a6269..9ad0a2f96f88f 100644
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@@ -407,7 +407,7 @@ endmacro()
 # Usage:
 #   torch_compile_options(lib_name)
 function(torch_compile_options libname)
-  set_property(TARGET ${libname} PROPERTY CXX_STANDARD 14)
+  set_property(TARGET ${libname} PROPERTY CXX_STANDARD 17)
   set(private_compile_options "")
 
   # ---[ Check if warnings should be errors.
diff --git a/functorch/CMakeLists.txt b/functorch/CMakeLists.txt
index d203043243829..911f251e88623 100644
--- a/functorch/CMakeLists.txt
+++ b/functorch/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.12)
 project(functorch)
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 17)
 
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
diff --git a/ios/TestApp/TestApp.xcodeproj/project.pbxproj b/ios/TestApp/TestApp.xcodeproj/project.pbxproj
index 09aeeada17239..ff84280f02ebd 100644
--- a/ios/TestApp/TestApp.xcodeproj/project.pbxproj
+++ b/ios/TestApp/TestApp.xcodeproj/project.pbxproj
@@ -253,7 +253,7 @@
                 ALWAYS_SEARCH_USER_PATHS = NO;
                 CLANG_ANALYZER_NONNULL = YES;
                 CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-                CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+                CLANG_CXX_LANGUAGE_STANDARD = "gnu++17";
                 CLANG_CXX_LIBRARY = "libc++";
                 CLANG_ENABLE_MODULES = YES;
                 CLANG_ENABLE_OBJC_ARC = YES;
@@ -312,7 +312,7 @@
                 ALWAYS_SEARCH_USER_PATHS = NO;
                 CLANG_ANALYZER_NONNULL = YES;
                 CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-                CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+                CLANG_CXX_LANGUAGE_STANDARD = "gnu++17";
                 CLANG_CXX_LIBRARY = "libc++";
                 CLANG_ENABLE_MODULES = YES;
                 CLANG_ENABLE_OBJC_ARC = YES;
diff --git a/scripts/build_android.sh b/scripts/build_android.sh
index 2d6f051ea19fe..e2be6c88e9893 100755
--- a/scripts/build_android.sh
+++ b/scripts/build_android.sh
@@ -165,6 +165,11 @@ fi
 # Use-specified CMake arguments go last to allow overridding defaults
 CMAKE_ARGS+=($@)
 
+# Patch pocketfft (as Android does not have aligned_alloc even if compiled with c++17
+if [ -f third_party/pocketfft/pocketfft_hdronly.h ]; then
+  sed -i -e "s/#if __cplusplus >= 201703L/#if 0/" third_party/pocketfft/pocketfft_hdronly.h
+fi
+
 # Now, actually build the Android target.
 BUILD_ROOT=${BUILD_ROOT:-"$CAFFE2_ROOT/build_android"}
 INSTALL_PREFIX=${BUILD_ROOT}/install
diff --git a/test/custom_backend/CMakeLists.txt b/test/custom_backend/CMakeLists.txt
index 71f83442e085f..835f17850a842 100644
--- a/test/custom_backend/CMakeLists.txt
+++ b/test/custom_backend/CMakeLists.txt
@@ -9,9 +9,9 @@ endif()
 find_package(Torch REQUIRED)
 
 add_library(custom_backend SHARED custom_backend.cpp)
-set_property(TARGET custom_backend PROPERTY CXX_STANDARD 14)
+set_property(TARGET custom_backend PROPERTY CXX_STANDARD 17)
 target_link_libraries(custom_backend "${TORCH_LIBRARIES}")
 
 add_executable(test_custom_backend test_custom_backend.cpp)
-set_property(TARGET test_custom_backend PROPERTY CXX_STANDARD 14)
+set_property(TARGET test_custom_backend PROPERTY CXX_STANDARD 17)
 target_link_libraries(test_custom_backend custom_backend)
diff --git a/test/custom_operator/CMakeLists.txt b/test/custom_operator/CMakeLists.txt
index 47c1c9d45e814..6d1a4988fe382 100644
--- a/test/custom_operator/CMakeLists.txt
+++ b/test/custom_operator/CMakeLists.txt
@@ -9,12 +9,12 @@ endif()
 find_package(Torch REQUIRED)
 
 add_library(custom_ops SHARED op.cpp)
-set_property(TARGET custom_ops PROPERTY CXX_STANDARD 14)
+set_property(TARGET custom_ops PROPERTY CXX_STANDARD 17)
 
 target_compile_features(custom_ops PUBLIC cxx_range_for)
 target_link_libraries(custom_ops "${TORCH_LIBRARIES}")
 target_compile_definitions(custom_ops PRIVATE custom_ops_EXPORTS)
 
 add_executable(test_custom_ops test_custom_ops.cpp)
-set_property(TARGET test_custom_ops PROPERTY CXX_STANDARD 14)
+set_property(TARGET test_custom_ops PROPERTY CXX_STANDARD 17)
 target_link_libraries(test_custom_ops custom_ops)
diff --git a/test/jit_hooks/CMakeLists.txt b/test/jit_hooks/CMakeLists.txt
index 546a3040f49bc..91d5a2bf4e01c 100644
--- a/test/jit_hooks/CMakeLists.txt
+++ b/test/jit_hooks/CMakeLists.txt
@@ -9,5 +9,5 @@ endif()
 find_package(Torch REQUIRED)
 
 add_executable(test_jit_hooks test_jit_hooks.cpp)
-set_property(TARGET test_jit_hooks PROPERTY CXX_STANDARD 14)
+set_property(TARGET test_jit_hooks PROPERTY CXX_STANDARD 17)
 target_link_libraries(test_jit_hooks "${TORCH_LIBRARIES}")
diff --git a/test/mobile/custom_build/CMakeLists.txt b/test/mobile/custom_build/CMakeLists.txt
index 521569176c307..426371f4d2965 100644
--- a/test/mobile/custom_build/CMakeLists.txt
+++ b/test/mobile/custom_build/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.1)
 
 project(custom_build_project)
 
-set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ standard whose features are requested to build this target.")
+set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.")
 
 # Find torch library
 find_package(Torch REQUIRED)
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 70d6e58ad57e0..747a7850b562b 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -329,7 +329,7 @@ def get_warning_all_flag(warning_all=True):
 
 
 def cpp_flags():
-    return "-std=c++14 -Wno-unused-variable"
+    return "-std=c++17 -Wno-unused-variable"
 
 
 def optimization_flags():
diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
index 217480a974edf..cc435ae4bb3b8 100644
--- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -1009,7 +1009,7 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
   });
 
 #ifdef USE_ROCM
-  std::vector<const char*> args = {"--std=c++14"};
+  std::vector<const char*> args = {"--std=c++17"};
 #if ROCM_VERSION >= 40200
   args.push_back("-hip-pch");
 #endif
@@ -1036,7 +1036,7 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
       std::to_string(minor);
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::vector<const char*> args = {
-      "--std=c++14", compute.c_str(), "-default-device"};
+      "--std=c++17", compute.c_str(), "-default-device"};
 #endif
 
   const bool disable_fma = isOptionDisabled(DisableOption::Fma);
diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp
index 013a8e8b4adbf..8da7e63a69355 100644
--- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp
+++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp
@@ -263,7 +263,7 @@ static const std::string compile_string =
 #ifndef __PPC64__
 //  "-march=native "
 #endif
-    "-std=c++14 -fPIC ${fopenmp} -shared \"${cpp_file}\" -o \"${so_file}\" -lm";
+    "-std=c++17 -fPIC ${fopenmp} -shared \"${cpp_file}\" -o \"${so_file}\" -lm";
 #endif
 static void runCompiler(
     const std::string& cpp_file,
diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
index 85de541f4ba78..72a011febe762 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
+++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
@@ -127,7 +127,7 @@ FusedKernelCUDA::FusedKernelCUDA(
       &program, code_.c_str(), nullptr, 0, nullptr, nullptr));
 
 #if defined(USE_ROCM)
-  std::vector<const char*> args = {"--std=c++14"};
+  std::vector<const char*> args = {"--std=c++17"};
 #if ROCM_VERSION >= 40200
   args.push_back("-hip-pch");
 #endif
@@ -148,7 +148,7 @@ FusedKernelCUDA::FusedKernelCUDA(
       std::to_string(major) + std::to_string(minor);
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   const std::vector<const char*> args = {
-      "--std=c++14", compute.c_str(), "-default-device"};
+      "--std=c++17", compute.c_str(), "-default-device"};
 #endif
   const auto result =
       nvrtc().nvrtcCompileProgram(program, args.size(), args.data());
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index ef17c85002904..cfbac9b398f95 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -1314,7 +1314,7 @@ void CudaCodeGen::CompileToNVRTC(
       &program, code.c_str(), nullptr, 0, nullptr, nullptr));
 
 #if defined(USE_ROCM)
-  std::vector<const char*> args = {"--std=c++14"};
+  std::vector<const char*> args = {"--std=c++17"};
 #if ROCM_VERSION >= 40200
   args.push_back("-hip-pch");
 #endif
@@ -1335,7 +1335,7 @@ void CudaCodeGen::CompileToNVRTC(
       std::to_string(major) + std::to_string(minor);
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   const std::vector<const char*> args = {
-      "--std=c++14", compute.c_str(), "-default-device"};
+      "--std=c++17", compute.c_str(), "-default-device"};
 #endif
 
   auto result = nvrtc().nvrtcCompileProgram(program, args.size(), args.data());
diff --git a/torch/lib/libshm/CMakeLists.txt b/torch/lib/libshm/CMakeLists.txt
index 07d716c0357b6..020061894f2bb 100644
--- a/torch/lib/libshm/CMakeLists.txt
+++ b/torch/lib/libshm/CMakeLists.txt
@@ -29,7 +29,7 @@ target_include_directories(shm PUBLIC
 set_target_properties(shm PROPERTIES
   PREFIX "lib"
   IMPORT_PREFIX "lib"
-  CXX_STANDARD 14)
+  CXX_STANDARD 17)
 target_link_libraries(shm PUBLIC torch)
 
 if(UNIX AND NOT APPLE)
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index d74ef9a38372d..c32763d21d9a3 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -431,7 +431,7 @@ class BuildExtension(build_ext, object):
     A custom :mod:`setuptools` build extension .
 
     This :class:`setuptools.build_ext` subclass takes care of passing the
-    minimum required compiler flags (e.g. ``-std=c++14``) as well as mixed
+    minimum required compiler flags (e.g. ``-std=c++17``) as well as mixed
     C++/CUDA compilation (and support for CUDA files in general).
 
     When using :class:`BuildExtension`, it is allowed to supply a dictionary
@@ -535,12 +535,12 @@ def build_extensions(self) -> None:
         else:
             original_compile = self.compiler._compile
 
-        def append_std14_if_no_std_present(cflags) -> None:
+        def append_std17_if_no_std_present(cflags) -> None:
             # NVCC does not allow multiple -std to be passed, so we avoid
             # overriding the option if the user explicitly passed it.
             cpp_format_prefix = '/{}:' if self.compiler.compiler_type == 'msvc' else '-{}='
             cpp_flag_prefix = cpp_format_prefix.format('std')
-            cpp_flag = cpp_flag_prefix + 'c++14'
+            cpp_flag = cpp_flag_prefix + 'c++17'
             if not any(flag.startswith(cpp_flag_prefix) for flag in cflags):
                 cflags.append(cpp_flag)
 
@@ -585,7 +585,7 @@ def unix_wrap_single_compile(obj, src, ext, cc_args, extra_postargs, pp_opts) ->
                     cflags = cflags['cxx']
                 if IS_HIP_EXTENSION:
                     cflags = COMMON_HIP_FLAGS + cflags
-                append_std14_if_no_std_present(cflags)
+                append_std17_if_no_std_present(cflags)
 
                 original_compile(obj, src, ext, cc_args, cflags, pp_opts)
             finally:
@@ -634,7 +634,7 @@ def unix_wrap_ninja_compile(sources,
                 post_cflags = list(extra_postargs)
             if IS_HIP_EXTENSION:
                 post_cflags = COMMON_HIP_FLAGS + post_cflags
-            append_std14_if_no_std_present(post_cflags)
+            append_std17_if_no_std_present(post_cflags)
 
             cuda_post_cflags = None
             cuda_cflags = None
@@ -649,7 +649,7 @@ def unix_wrap_ninja_compile(sources,
                     cuda_post_cflags = COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS + cuda_post_cflags
                 else:
                     cuda_post_cflags = unix_cuda_flags(cuda_post_cflags)
-                append_std14_if_no_std_present(cuda_post_cflags)
+                append_std17_if_no_std_present(cuda_post_cflags)
                 cuda_cflags = [shlex.quote(f) for f in cuda_cflags]
                 cuda_post_cflags = [shlex.quote(f) for f in cuda_post_cflags]
 
@@ -785,7 +785,7 @@ def win_wrap_ninja_compile(sources,
                 post_cflags = extra_postargs['cxx']
             else:
                 post_cflags = list(extra_postargs)
-            append_std14_if_no_std_present(post_cflags)
+            append_std17_if_no_std_present(post_cflags)
 
             cuda_post_cflags = None
             cuda_cflags = None
@@ -1994,7 +1994,7 @@ def _write_ninja_file_to_build_library(path,
         cflags = common_cflags + COMMON_MSVC_FLAGS + extra_cflags
         cflags = _nt_quote_args(cflags)
     else:
-        cflags = common_cflags + ['-fPIC', '-std=c++14'] + extra_cflags
+        cflags = common_cflags + ['-fPIC', '-std=c++17'] + extra_cflags
 
     if with_cuda and IS_HIP_EXTENSION:
         cuda_flags = ['-DWITH_HIP'] + cflags + COMMON_HIP_FLAGS + COMMON_HIPCC_FLAGS
@@ -2013,7 +2013,7 @@ def _write_ninja_file_to_build_library(path,
             cuda_flags += ['--compiler-options', "'-fPIC'"]
             cuda_flags += extra_cuda_cflags
             if not any(flag.startswith('-std=') for flag in cuda_flags):
-                cuda_flags.append('-std=c++14')
+                cuda_flags.append('-std=c++17')
             if os.getenv("CC") is not None:
                 cuda_flags = ['-ccbin', os.getenv("CC")] + cuda_flags
     else:

From 27951ad472266507ef9e203ff0d66c2de8c2ab37 Mon Sep 17 00:00:00 2001
From: Iris <31293777+wz337@users.noreply.github.com>
Date: Thu, 8 Dec 2022 02:53:25 +0000
Subject: [PATCH 1706/1922] [Checkpoint][2D][6/N] Add optimizer and update
 default_planner to core distributed (#90212)

This is the last PR for integrating 2D into core distributed.

This PR does the following:
1. Add optimizer.py: this adds ability to load a state_dict in conjunction with FSDP sharded optimzer state.
2. Update default_planner.py to support 2D checkpoint.
3. Add test_fsdp_optim_state.py as a unit test for No. 1.
4. Fix bug in torch/testing/_internal/distributed/checkpoint_utils.py
5. Rename the filename for the APIs that should be private. Will organize and cleanup further in following PRs. #90328

Docstring and integration test will be added in the following PRs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90212
Approved by: https://github.com/wanchaol
---
 .../checkpoint/test_dedup_tensors.py          |   2 +-
 .../checkpoint/test_fsdp_optim_state.py       | 112 ++++++
 .../checkpoint/test_nested_dict.py            |   2 +-
 test/distributed/checkpoint/test_traverse.py  |  44 +--
 .../{dedup_tensors.py => _dedup_tensors.py}   |   0
 .../{nested_dict.py => _nested_dict.py}       |   2 +-
 .../{nested_tensor.py => _nested_tensor.py}   |   2 +-
 .../checkpoint/{traverse.py => _traverse.py}  |   0
 .../distributed/checkpoint/default_planner.py | 206 ++++++++++-
 torch/distributed/checkpoint/optimizer.py     | 330 ++++++++++++++++++
 .../_internal/distributed/checkpoint_utils.py |   2 +-
 11 files changed, 663 insertions(+), 39 deletions(-)
 create mode 100644 test/distributed/checkpoint/test_fsdp_optim_state.py
 rename torch/distributed/checkpoint/{dedup_tensors.py => _dedup_tensors.py} (100%)
 rename torch/distributed/checkpoint/{nested_dict.py => _nested_dict.py} (98%)
 rename torch/distributed/checkpoint/{nested_tensor.py => _nested_tensor.py} (99%)
 rename torch/distributed/checkpoint/{traverse.py => _traverse.py} (100%)
 create mode 100644 torch/distributed/checkpoint/optimizer.py

diff --git a/test/distributed/checkpoint/test_dedup_tensors.py b/test/distributed/checkpoint/test_dedup_tensors.py
index a0d72147efeba..6f2b81c298df7 100644
--- a/test/distributed/checkpoint/test_dedup_tensors.py
+++ b/test/distributed/checkpoint/test_dedup_tensors.py
@@ -2,7 +2,7 @@
 
 import dataclasses
 import torch
-from torch.distributed.checkpoint.dedup_tensors import dedup_tensors
+from torch.distributed.checkpoint._dedup_tensors import dedup_tensors
 from torch.distributed.checkpoint.planner import SavePlan, WriteItemType
 from torch.distributed.checkpoint.planner_helpers import (
     _create_write_item_for_tensor,
diff --git a/test/distributed/checkpoint/test_fsdp_optim_state.py b/test/distributed/checkpoint/test_fsdp_optim_state.py
new file mode 100644
index 0000000000000..173542ee91253
--- /dev/null
+++ b/test/distributed/checkpoint/test_fsdp_optim_state.py
@@ -0,0 +1,112 @@
+# Owner(s): ["oncall: distributed"]
+
+import torch
+
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
+import torch.distributed.checkpoint as dist_cp
+import torch.distributed as dist
+
+from torch.distributed.checkpoint.default_planner import (
+    DefaultSavePlanner,
+    DefaultLoadPlanner,
+)
+from torch.distributed.checkpoint.optimizer import (
+    load_sharded_optimizer_state_dict,
+)
+
+from torch.testing._internal.distributed._tensor.common_dtensor import (
+    DTensorTestBase,
+    with_comms,
+)
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir
+
+
+class FsdpOptimStateCheckpoint(DTensorTestBase):
+    @with_comms
+    @skip_if_lt_x_gpu(4)
+    @with_temp_dir
+    def test_distributed_tensor_planner(self) -> None:
+        CHECKPOINT_DIR = self.temp_dir
+
+        model = FSDP(torch.nn.Linear(8, 8, device="meta"))
+        optim = torch.optim.Adam(model.parameters(), lr=0.1)
+
+        model(torch.rand(8, 8, device=dist.get_rank())).sum().backward()
+        optim.step()
+
+        with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
+            state_dict = {
+                "model": model.state_dict(),
+                "optim": FSDP.sharded_optim_state_dict(model, optim),
+            }
+
+            dist_cp.save_state_dict(
+                state_dict=state_dict,
+                storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
+                planner=DefaultSavePlanner(
+                    flatten_state_dict=True,
+                    flatten_sharded_tensors=True,
+                ),
+            )
+
+        # now load the model and ensure the values are the same
+        model_2 = FSDP(torch.nn.Linear(8, 8, device="meta"))
+        optim_2 = torch.optim.Adam(model_2.parameters(), lr=0.1)
+
+        with FSDP.summon_full_params(model):
+            with FSDP.summon_full_params(model_2):
+                self.assertNotEqual(model.weight, model_2.weight)
+                self.assertNotEqual(model.bias, model_2.bias)
+
+        # Adam lazily creates its state
+        self.assertEqual(0, len(optim_2.state))
+
+        with FSDP.state_dict_type(model_2, StateDictType.SHARDED_STATE_DICT):
+            state_dict = {
+                "model": model_2.state_dict(),
+                # cannot load the optimizer together with the model
+            }
+
+            dist_cp.load_state_dict(
+                state_dict=state_dict,
+                storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
+                planner=DefaultLoadPlanner(
+                    flatten_state_dict=True,
+                    flatten_sharded_tensors=True,
+                ),
+            )
+            model_2.load_state_dict(state_dict["model"])
+
+            optim_state = load_sharded_optimizer_state_dict(
+                model_state_dict=state_dict["model"],
+                optimizer_key="optim",
+                storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
+            )
+
+            flattened_osd = FSDP.flatten_sharded_optim_state_dict(
+                optim_state["optim"], model_2
+            )
+            optim_2.load_state_dict(flattened_osd)
+
+        with FSDP.summon_full_params(model):
+            with FSDP.summon_full_params(model_2):
+                self.assertEqual(model.weight, model_2.weight)
+                self.assertEqual(model.bias, model_2.bias)
+
+        def opt_at(opt, idx):
+            return list(iter(opt.state.values()))[idx]
+
+        # Adam lazily creates its state
+        self.assertEqual(
+            opt_at(optim, 0)["exp_avg"], opt_at(optim_2, 0)["exp_avg"]
+        )
+        self.assertEqual(
+            opt_at(optim, 0)["exp_avg_sq"], opt_at(optim_2, 0)["exp_avg_sq"]
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/distributed/checkpoint/test_nested_dict.py b/test/distributed/checkpoint/test_nested_dict.py
index 33d618b3bdddd..115982e818127 100644
--- a/test/distributed/checkpoint/test_nested_dict.py
+++ b/test/distributed/checkpoint/test_nested_dict.py
@@ -2,7 +2,7 @@
 
 import torch
 from torch.testing._internal.common_utils import run_tests, TestCase
-from torch.distributed.checkpoint.nested_dict import (
+from torch.distributed.checkpoint._nested_dict import (
     flatten_state_dict,
     unflatten_state_dict,
 )
diff --git a/test/distributed/checkpoint/test_traverse.py b/test/distributed/checkpoint/test_traverse.py
index a73cb89befba5..3a47311e702bd 100644
--- a/test/distributed/checkpoint/test_traverse.py
+++ b/test/distributed/checkpoint/test_traverse.py
@@ -3,7 +3,7 @@
 from collections import OrderedDict
 import torch
 
-import torch.distributed.checkpoint.traverse as traverse
+import torch.distributed.checkpoint._traverse as _traverse
 from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
 from torch.testing._internal.common_utils import run_tests, TestCase
 
@@ -24,7 +24,7 @@ def collect_data(path, value):
             nonlocal data
             data[path] = value
 
-        traverse.traverse_state_dict(state_dict, collect_data)
+        _traverse.traverse_state_dict(state_dict, collect_data)
 
         self.assertIn(("key0",), data)
         self.assertEqual(data[("key0",)], 1)
@@ -53,7 +53,7 @@ def collect_data(path, value):
             nonlocal data
             data[path] = value
 
-        traverse.traverse_state_dict(state_dict, collect_data)
+        _traverse.traverse_state_dict(state_dict, collect_data)
 
         self.assertNotIn(("key1"), data)
 
@@ -84,7 +84,7 @@ def collect_data(path, value):
             nonlocal data
             data[path] = value
 
-        traverse.traverse_state_dict(state_dict, collect_data)
+        _traverse.traverse_state_dict(state_dict, collect_data)
 
         self.assertNotIn(("key0",), data)
 
@@ -105,7 +105,7 @@ def collect_data(path, value):
             nonlocal data
             data[path] = value
 
-        traverse.traverse_state_dict(state_dict, collect_data)
+        _traverse.traverse_state_dict(state_dict, collect_data)
 
         self.assertIn(("key0", 0, "key1", "key2"), data)
         self.assertEqual(
@@ -129,7 +129,7 @@ def collect_data(path, value):
             nonlocal data
             data[path] = value
 
-        traverse.traverse_state_dict(state_dict, collect_data)
+        _traverse.traverse_state_dict(state_dict, collect_data)
 
         self.assertIn(("key0", 0), data)
         self.assertEqual(data[("key0", 0)], 99)
@@ -140,36 +140,36 @@ def collect_data(path, value):
     def test_set_element(self) -> None:
         state_dict: STATE_DICT_TYPE = {}
 
-        traverse.set_element(state_dict, ("k",), 10)
+        _traverse.set_element(state_dict, ("k",), 10)
         self.assertEqual(state_dict["k"], 10)
 
-        traverse.set_element(state_dict, ("k1", 2), 1)
+        _traverse.set_element(state_dict, ("k1", 2), 1)
         self.assertEqual(state_dict["k1"], [None, None, 1])
 
-        traverse.set_element(state_dict, ("k1", 1), 99)
+        _traverse.set_element(state_dict, ("k1", 1), 99)
         self.assertEqual(state_dict["k1"], [None, 99, 1])
 
-        traverse.set_element(state_dict, ("k1", 3), 88)
+        _traverse.set_element(state_dict, ("k1", 3), 88)
         self.assertEqual(state_dict["k1"], [None, 99, 1, 88])
 
-        traverse.set_element(state_dict, ("k2", "k3"), 3)
+        _traverse.set_element(state_dict, ("k2", "k3"), 3)
         self.assertEqual(state_dict["k2"], {"k3": 3})
 
-        traverse.set_element(state_dict, ("k2", "k4", 0, 0), 99)
+        _traverse.set_element(state_dict, ("k2", "k4", 0, 0), 99)
         self.assertEqual(state_dict["k2"]["k4"][0], [99])
 
     def test_get_element(self) -> None:
         state_dict = {"a": [0, 1], "b": [2, {"c": "d"}]}
-        self.assertEqual(traverse.get_element(state_dict, ("a",)), [0, 1])
-        self.assertEqual(traverse.get_element(state_dict, ("b", 0)), 2)
-        self.assertEqual(traverse.get_element(state_dict, ("b", 1, "c")), "d")
-
-        self.assertIsNone(traverse.get_element(state_dict, ("c",)))
-        self.assertIsNone(traverse.get_element(state_dict, ("a", 33)))
-        self.assertIsNone(traverse.get_element(state_dict, ("b", 88)))
-        self.assertIsNone(traverse.get_element(state_dict, ("b", 0, 2)))
-        self.assertIsNone(traverse.get_element(state_dict, ("b", 1, 2)))
-        self.assertIsNone(traverse.get_element(state_dict, ("b", 1, "d")))
+        self.assertEqual(_traverse.get_element(state_dict, ("a",)), [0, 1])
+        self.assertEqual(_traverse.get_element(state_dict, ("b", 0)), 2)
+        self.assertEqual(_traverse.get_element(state_dict, ("b", 1, "c")), "d")
+
+        self.assertIsNone(_traverse.get_element(state_dict, ("c",)))
+        self.assertIsNone(_traverse.get_element(state_dict, ("a", 33)))
+        self.assertIsNone(_traverse.get_element(state_dict, ("b", 88)))
+        self.assertIsNone(_traverse.get_element(state_dict, ("b", 0, 2)))
+        self.assertIsNone(_traverse.get_element(state_dict, ("b", 1, 2)))
+        self.assertIsNone(_traverse.get_element(state_dict, ("b", 1, "d")))
 
 
 if __name__ == "__main__":
diff --git a/torch/distributed/checkpoint/dedup_tensors.py b/torch/distributed/checkpoint/_dedup_tensors.py
similarity index 100%
rename from torch/distributed/checkpoint/dedup_tensors.py
rename to torch/distributed/checkpoint/_dedup_tensors.py
diff --git a/torch/distributed/checkpoint/nested_dict.py b/torch/distributed/checkpoint/_nested_dict.py
similarity index 98%
rename from torch/distributed/checkpoint/nested_dict.py
rename to torch/distributed/checkpoint/_nested_dict.py
index 91c34fe39298c..b26badd418d3f 100644
--- a/torch/distributed/checkpoint/nested_dict.py
+++ b/torch/distributed/checkpoint/_nested_dict.py
@@ -5,7 +5,7 @@
     STATE_DICT_TYPE,
 )
 
-from .traverse import (
+from ._traverse import (
     traverse_state_dict,
     set_element,
     OBJ_PATH,
diff --git a/torch/distributed/checkpoint/nested_tensor.py b/torch/distributed/checkpoint/_nested_tensor.py
similarity index 99%
rename from torch/distributed/checkpoint/nested_tensor.py
rename to torch/distributed/checkpoint/_nested_tensor.py
index 4ab68c81b1a9a..94ceaf5d4a52f 100644
--- a/torch/distributed/checkpoint/nested_tensor.py
+++ b/torch/distributed/checkpoint/_nested_tensor.py
@@ -19,7 +19,7 @@
 )
 
 
-from .traverse import (
+from ._traverse import (
     OBJ_PATH,
     traverse_state_dict,
     set_element,
diff --git a/torch/distributed/checkpoint/traverse.py b/torch/distributed/checkpoint/_traverse.py
similarity index 100%
rename from torch/distributed/checkpoint/traverse.py
rename to torch/distributed/checkpoint/_traverse.py
diff --git a/torch/distributed/checkpoint/default_planner.py b/torch/distributed/checkpoint/default_planner.py
index aa531a62d235e..0bb44fd057599 100644
--- a/torch/distributed/checkpoint/default_planner.py
+++ b/torch/distributed/checkpoint/default_planner.py
@@ -1,5 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
 import dataclasses
 import io
+import logging
+import operator
+from functools import reduce
 from typing import List, Tuple, Dict, Any, Union, cast
 
 import torch
@@ -8,7 +13,7 @@
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 
 
-from .planner import (
+from torch.distributed.checkpoint.planner import (
     SavePlanner,
     LoadPlanner,
     SavePlan,
@@ -18,8 +23,9 @@
     WriteItemType,
 )
 
-from .metadata import (
+from torch.distributed.checkpoint.metadata import (
     BytesStorageMetadata,
+    ChunkStorageMetadata,
     TensorStorageMetadata,
     MetadataIndex,
     Metadata,
@@ -27,13 +33,26 @@
     STORAGE_TYPES,
 )
 
-from .planner_helpers import (
+from torch.distributed.checkpoint.planner_helpers import (
     _create_read_items,
     _create_write_items,
     _create_default_metadata_only_plan,
 )
 
-from .utils import find_state_dict_object
+from torch.distributed.checkpoint._nested_dict import (
+    FLATTEN_MAPPING,
+    flatten_state_dict,
+)
+from torch.distributed.checkpoint._nested_tensor import flatten_sharded_tensors
+from torch.distributed.checkpoint._dedup_tensors import dedup_tensors
+from torch.distributed.checkpoint.utils import (
+    find_state_dict_object,
+    find_tensor_shard,
+)
+from torch.distributed.checkpoint._traverse import set_element, get_element
+
+logger: logging.Logger = logging.getLogger(__file__)
+
 
 __all__ = [
     "DefaultSavePlanner",
@@ -45,23 +64,63 @@
 ]
 
 
+# TODO: Update docstrings for default_planner.py
+
+
 class DefaultSavePlanner(SavePlanner):
-    def init(self, state_dict: Dict[str, Any], is_coordinator: bool) -> None:
+    mappings: FLATTEN_MAPPING
+
+    def __init__(
+        self,
+        flatten_state_dict: bool = False,
+        flatten_sharded_tensors: bool = False,
+        dedup_replicated_tensors: bool = False,
+    ) -> None:
+        self.flatten_state_dict = flatten_state_dict
+        self.flatten_sharded_tensors = flatten_sharded_tensors
+        self.dedup_replicated_tensors = dedup_replicated_tensors
+        self.mappings = {}
+
+    def init(self, state_dict: STATE_DICT_TYPE, is_coordinator: bool) -> None:
+        if self.flatten_state_dict:
+            state_dict, self.mappings = flatten_state_dict(state_dict)
+        if self.flatten_sharded_tensors:
+            state_dict = flatten_sharded_tensors(state_dict)
         self.state_dict = state_dict
         self.is_coordinator = is_coordinator
 
     def create_local_plan(self) -> SavePlan:
-        self.plan = create_default_local_save_plan(
+        plan = create_default_local_save_plan(
             self.state_dict, self.is_coordinator
         )
+        if self.flatten_state_dict:
+            plan = dataclasses.replace(plan, planner_data=self.mappings)
+        self.plan = plan
+
         return self.plan
 
     def create_global_plan(
         self, all_plans: List[SavePlan]
     ) -> Tuple[List[SavePlan], Metadata]:
-        self.global_plan, self.metadata = create_default_global_save_plan(
-            all_plans
-        )
+        if self.dedup_replicated_tensors:
+            all_plans = dedup_tensors(all_plans)
+
+        global_plan, metadata = create_default_global_save_plan(all_plans)
+
+        if self.flatten_state_dict:
+            merged_mappings = reduce(
+                lambda x, y: x | y, (p.planner_data for p in global_plan)
+            )
+            metadata = dataclasses.replace(
+                metadata, planner_data=merged_mappings
+            )
+
+        if not _validate_global_plan(global_plan, metadata):
+            raise ValueError("Failed to validate global plan")
+
+        self.global_plan = global_plan
+        self.metadata = metadata
+
         return self.global_plan, self.metadata
 
     def finish_plan(self, new_plan: SavePlan) -> SavePlan:
@@ -92,12 +151,42 @@ def transform_object(self, write_item: WriteItem, object: Any):
 
 
 class DefaultLoadPlanner(LoadPlanner):
+    """
+    DefaultLoadPlanner that adds multiple features on top of LoadPlanner.
+
+    In particular it adds the following:
+
+    flatten_state_dict: Handle state_dict with nested dicts
+    flatten_sharded_tensors: For FSDP in 2D parallel mode
+    """
+
+    original_state_dict: STATE_DICT_TYPE
+    mappings: FLATTEN_MAPPING
+
+    def __init__(
+        self,
+        flatten_state_dict: bool = False,
+        flatten_sharded_tensors: bool = False,
+    ) -> None:
+        self.flatten_state_dict = flatten_state_dict
+        self.flatten_sharded_tensors = flatten_sharded_tensors
+        self.original_state_dict = {}
+        self.mappings = {}
+
     def init(
         self,
         state_dict: STATE_DICT_TYPE,
         metadata: Metadata,
         is_coordinator: bool,
     ) -> None:
+        if self.flatten_sharded_tensors:
+            state_dict = flatten_sharded_tensors(state_dict)
+
+        self.original_state_dict = state_dict
+
+        if self.flatten_state_dict:
+            state_dict, self.mappings = flatten_state_dict(state_dict)
+
         self.state_dict = state_dict
         self.metadata = metadata
         self.is_coordinator = is_coordinator
@@ -112,7 +201,14 @@ def finish_plan(self, new_plan: LoadPlan) -> LoadPlan:
         return new_plan
 
     def load_bytes(self, read_item: ReadItem, value: io.BytesIO) -> None:
-        self.state_dict[read_item.dest_index.fqn] = torch.load(value)
+        if self.flatten_state_dict:
+            set_element(
+                self.original_state_dict,
+                self.mappings[read_item.dest_index.fqn],
+                torch.load(value),
+            )
+        else:
+            self.state_dict[read_item.dest_index.fqn] = torch.load(value)
 
     def resolve_tensor(self, read_item: ReadItem):
         tensor = self.lookup_tensor(read_item.dest_index)
@@ -125,7 +221,14 @@ def lookup_tensor(self, index: MetadataIndex) -> torch.Tensor:
         """
         This is an extension from the planner interface to make it easy to extend the default planner
         """
-        return find_state_dict_object(self.state_dict, index)
+        if self.flatten_state_dict:
+            obj = get_element(
+                self.original_state_dict, self.mappings[index.fqn]
+            )
+            assert isinstance(obj, torch.Tensor)
+            return find_tensor_shard(obj, index)
+        else:
+            return find_state_dict_object(self.state_dict, index)
 
     def transform_tensor(self, read_item: ReadItem, tensor: torch.Tensor):
         """
@@ -229,7 +332,10 @@ def create_default_global_save_plan(
 
                 assert (
                     item.tensor_data.chunk is not None
-                ), f"Cannot create MD for tensor without bounds. FQN: {item.index.fqn}"
+                ), f"""
+                    Cannot create MD for tensor without bounds.
+                    FQN: {item.index.fqn}
+                """
                 tensor_md.chunks.append(item.tensor_data.chunk)
         new_plans.append(dataclasses.replace(plan, items=new_items))
     return (new_plans, Metadata(md))
@@ -242,3 +348,79 @@ def _create_default_local_metadata(state_dict: STATE_DICT_TYPE) -> Metadata:
     plan = _create_default_metadata_only_plan(state_dict)
     _, md = create_default_global_save_plan([plan])
     return md
+
+
+def _check_box_overlap(
+    box0: ChunkStorageMetadata, box1: ChunkStorageMetadata
+) -> bool:
+    """
+    Checks if two boxes overlap. Tuples are (offset, lengths)
+    """
+
+    # For each dim of each shard, check if one shard resides on the other
+    # end of second shard with respect to that dim. As an example for a 2D
+    # shard, we would check if one shard is above or on the left of the
+    # other shard.
+    ndims = len(box0.offsets)
+    for i in range(ndims):
+        if box0.offsets[i] >= box1.offsets[i] + box1.sizes[i]:
+            return False
+        if box1.offsets[i] >= box0.offsets[i] + box0.sizes[i]:
+            return False
+
+    return True
+
+
+def _check_box_bounds(
+    outer_box_size: torch.Size, inner_box: ChunkStorageMetadata
+) -> bool:
+    for i in range(len(outer_box_size)):
+        if inner_box.offsets[i] < 0:
+            return False
+        if inner_box.sizes[i] < 0:
+            return False
+        if inner_box.offsets[i] + inner_box.sizes[i] > outer_box_size[i]:
+            return False
+
+    return True
+
+
+def _validate_global_plan(
+    global_plan: List[SavePlan], metadata: Metadata
+) -> bool:
+    all_good = True
+    for key, value in metadata.state_dict_metadata.items():
+        if isinstance(value, BytesStorageMetadata):
+            continue
+        if len(value.size) == 0:
+            continue
+        chunks_volume = 0
+        for chunk_idx, chunk0 in enumerate(value.chunks):
+            if not _check_box_bounds(value.size, chunk0):
+                logger.warning(
+                    f"""
+                        key:{key} has out of bounds chunk:
+                        tensor-size:{value.size} chunk: {chunk0}
+                    """
+                )
+                all_good = False
+            chunks_volume += reduce(operator.mul, chunk0.sizes, 1)
+
+            for chunk1 in value.chunks[chunk_idx + 1 :]:
+                if _check_box_overlap(chunk0, chunk1):
+                    logger.warning(
+                        f"key:{key} has overlapping chunks: {chunk0} {chunk1}"
+                    )
+                    all_good = False
+
+        tensor_volume = reduce(operator.mul, value.size, 1)
+        if chunks_volume != tensor_volume:
+            logger.warning(
+                f"""
+                    key:{key} invalid fill tensor-volume:
+                    {tensor_volume} chunks-volume: {chunks_volume}
+                """
+            )
+            all_good = False
+
+    return all_good
diff --git a/torch/distributed/checkpoint/optimizer.py b/torch/distributed/checkpoint/optimizer.py
new file mode 100644
index 0000000000000..9e88ec87c2bb5
--- /dev/null
+++ b/torch/distributed/checkpoint/optimizer.py
@@ -0,0 +1,330 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+import copy
+import dataclasses
+from typing import Dict, List, Optional, Sequence, Tuple, Union, cast
+from torch.distributed.checkpoint.planner import LoadPlan
+
+import torch
+import torch.distributed as dist
+from torch.distributed._shard.sharded_tensor.api import ShardedTensor
+from torch.distributed._shard.sharded_tensor.metadata import TensorProperties
+from torch.distributed._shard.sharded_tensor.shard import Shard
+from torch.distributed._shard.sharding_spec.chunk_sharding_spec import (
+    ChunkShardingSpec,
+)
+
+import torch.distributed.checkpoint as dist_cp
+from torch.distributed.checkpoint.metadata import (
+    BytesStorageMetadata,
+    Metadata,
+    MetadataIndex,
+    STATE_DICT_TYPE,
+    TensorStorageMetadata,
+)
+from torch.distributed.checkpoint.planner_helpers import (
+    _create_sharded_read_items,
+    _create_read_items,
+)
+from torch.distributed.remote_device import _remote_device
+
+from torch.distributed._tensor import DTensor
+from torch.distributed.checkpoint.default_planner import (
+    DefaultLoadPlanner,
+)
+from torch.distributed._shard.api import _shard_tensor
+
+from torch.distributed.checkpoint._nested_dict import unflatten_state_dict
+from torch.distributed.checkpoint.utils import (
+    _element_wise_add,
+    _element_wise_sub,
+)
+
+STATE_DICT_2D_LAYOUT = Dict[str, Tuple[Optional[Sequence[int]], Sequence[int]]]
+
+
+# TODO: Update docstrings for optimizer.py
+__all__ = [
+    "load_sharded_optimizer_state_dict",
+]
+
+
+def _gen_rank_device(global_rank: int) -> str:
+    if torch.cuda.is_available():
+        return f"cuda:{global_rank % torch.cuda.device_count()}"
+    return "cpu"
+
+
+def _create_colwise_spec(
+    pg: Optional[dist.ProcessGroup] = None,
+) -> ChunkShardingSpec:
+    if pg is None:
+        placements = [
+            f"rank:{idx}/{_gen_rank_device(idx)}"
+            for idx in range(dist.get_world_size())
+        ]
+    else:
+        placements = [
+            f"rank:{idx}/{_gen_rank_device(dist.get_global_rank(pg, idx))}"
+            for idx in range(pg.size())
+        ]
+    return ChunkShardingSpec(
+        dim=0,
+        placements=cast(List[Union[_remote_device, str]], placements),
+    )
+
+
+def _is_nested_tensor(val: torch.Tensor) -> bool:
+    if type(val) is ShardedTensor:
+        if len(val.local_shards()) == 0:
+            return False
+        if type(val.local_shards()[0].tensor) is ShardedTensor:
+            return True
+        if type(val.local_shards()[0].tensor) is DTensor:
+            raise ValueError(
+                "Cannot handle DTensor nested insided ShardedTensor"
+            )
+    elif type(val) is DTensor and (
+        type(val._local_tensor) is DTensor
+        or type(val._local_tensor) is ShardedTensor
+    ):
+        raise ValueError("Cannot handle nested DTensor")
+    return False
+
+
+def _alloc_tensor(props: TensorProperties, size: Sequence[int]) -> torch.Tensor:
+    return torch.empty(
+        size=size,
+        dtype=props.dtype,
+        layout=props.layout,
+        requires_grad=props.requires_grad,
+        pin_memory=props.pin_memory,
+        device=cast(torch.device, torch.cuda.current_device()),
+    )
+
+
+def _get_state_dict_2d_layout(
+    state_dict: STATE_DICT_TYPE,
+) -> Tuple[STATE_DICT_2D_LAYOUT, Optional[dist.ProcessGroup]]:
+    """
+    We have to load the right TP slice of the optimizer state.
+    This is not easy since the per-tensor slicing can't be inferred from checkpoint metadata.
+    We take advantage of the model state_dict producing a sliced ST to figure out what we need to load.
+    This is pretty fragile and it might be easier for FSDP to compute this info for us.
+    Returns a dictionary where keys are the same of the state_dict and the value is a tuple of
+    (offset, size) for the current rank TP slice.
+    N.B. The state_dict *MUST* come from FSDP.sharded_state_dict.
+    """
+    specs: STATE_DICT_2D_LAYOUT = {}
+    dp_pg: Optional[dist.ProcessGroup] = None
+    for key, value in state_dict.items():
+        specs[key] = (None, value.size())
+        if _is_nested_tensor(value):
+            assert (
+                len(value.local_shards()) == 1
+            ), "Cannot handle ST with multiple shards"
+            assert isinstance(ShardedTensor, value)
+            shard = value.local_shards()[0]
+            specs[key] = (
+                shard.metadata.shard_offsets,
+                shard.metadata.shard_sizes,
+            )
+            dp_pg = shard.tensor._process_group
+
+    return (
+        specs,
+        dp_pg,
+    )
+
+
+class _ReaderWithOffset(DefaultLoadPlanner):
+    translation: Dict[MetadataIndex, MetadataIndex]
+    state_dict: STATE_DICT_TYPE
+    metadata: Metadata
+
+    def __init__(self, fqn_to_offset: Dict[str, Sequence[int]]) -> None:
+        super().__init__()
+        self.fqn_to_offset = fqn_to_offset
+        self.metadata = Metadata({})
+        self.state_dict = {}
+        self.translation = {}
+
+    def create_local_plan(self) -> LoadPlan:
+        requests = []
+        self.translation = {}
+        for fqn, obj in self.state_dict.items():
+            md = self.metadata.state_dict_metadata[fqn]
+            if not isinstance(obj, ShardedTensor):
+                requests += _create_read_items(fqn, md, obj)
+                continue
+
+            if fqn not in self.fqn_to_offset:
+                requests += _create_read_items(fqn, md, obj)
+                continue
+
+            offset = self.fqn_to_offset[fqn]
+
+            assert len(obj.local_shards()) == 1
+            original_shard = obj.local_shards()[0]
+            shard_md = copy.deepcopy(original_shard.metadata)
+            shard_md.shard_offsets = _element_wise_add(
+                shard_md.shard_offsets, offset
+            )
+            local_shards = [Shard(original_shard.tensor, shard_md)]
+
+            reqs = _create_sharded_read_items(
+                fqn, cast(TensorStorageMetadata, md), local_shards
+            )
+            # TODO: The WriteItems will have a displaced MetadataIndex, fix it.
+            # TODO: we should change _create_sharded_read_items to have more ergonomic API
+            for wi in reqs:
+                assert wi.dest_index.offset is not None
+                original_offset = _element_wise_sub(
+                    wi.dest_index.offset, offset
+                )
+                original_index = dataclasses.replace(
+                    wi.dest_index, offset=torch.Size(original_offset)
+                )
+                self.translation[wi.dest_index] = original_index
+
+            requests += reqs
+        return LoadPlan(requests)
+
+    def lookup_tensor(self, index: MetadataIndex) -> torch.Tensor:
+        return super().lookup_tensor(self.translation.get(index, index))
+
+
+def load_sharded_optimizer_state_dict(
+    model_state_dict: STATE_DICT_TYPE,
+    optimizer_key: str,
+    storage_reader: dist_cp.StorageReader,
+) -> STATE_DICT_TYPE:
+    """
+    Loads a state_dict to be used in conjuntion with FSDP sharded optimizer state.
+    This is the current recommended way to checkpoint is FSDP
+    >>> import torch.distributed.checkpoint as dist_cp
+    >>> import spmd.checkpoint as sp_cp
+    >>> # Save
+    >>> model: torch.nn.Model
+    >>> optim_params = model.parameters()
+    >>> optim = torch.optim.SGD(optim_params, lr=0.01)
+    >>>
+    >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
+    >>>     state_dict = {
+    >>>         "optimizer": FSDP.sharded_optim_state_dict(model, optim, optim_params),
+    >>>         "model": model.state_dict()
+    >>>     }
+    >>>     dist_cp.save_state_dict(
+    >>>         state_dict=optim_state,
+    >>>         storage_writer=dist_cp.FileSystemWriter("checkpoint"),
+    >>>         planner=sp_cp.AdvLoadPlanner()
+    >>>     )
+    >>>
+    >>> # Load
+    >>> with FSDP.state_dict_type(model_tp, StateDictType.SHARDED_STATE_DICT):
+    >>>     model_state_dict = model_tp.state_dict()
+    >>>     checkpoint = {
+    >>>         "model" = model_state_dict
+    >>>     }
+    >>>     dist_cp.load_state_dict(
+    >>>         state_dict=checkpoint,
+    >>>         storage_reader=dist_cp.FileSystemReader(checkpoint_file),
+    >>>         planner=sp_cp.AdvLoadPlanner()
+    >>>     )
+    >>>     model.load_state_dict(checkpoint["model_state"])
+    >>>
+    >>>     optim_state = sp_cp.load_sharded_optimizer_state_dict(
+    >>>         model_state_dict,
+    >>>         optimizer_key="optimizer",
+    >>>         storage_reader=dist_cp.FileSystemReader("checkpoint"),
+    >>>    )
+    >>>
+    >>>    flattened_osd = FSDP.flatten_sharded_optim_state_dict(
+    >>>        optim_state["optimizer"], model, optim_input
+    >>>    )
+    >>>
+    >>>    optim.load_state_dict(flattened_osd)
+    """
+    metadata = storage_reader.read_metadata()
+
+    layout_specs, dp_pg = _get_state_dict_2d_layout(model_state_dict)
+
+    if dp_pg is None:
+        sharding_spec = ChunkShardingSpec(
+            dim=0,
+            placements=[
+                f"rank:{i}/cuda:{i}" for i in range(dist.get_world_size())
+            ],
+        )
+    else:
+        sharding_spec = _create_colwise_spec(dp_pg)
+
+    # Create a state_dict for optimizer state
+    state_dict: STATE_DICT_TYPE = {}
+
+    fqn_to_offset: Dict[str, Sequence[int]] = {}
+    for key, value in metadata.state_dict_metadata.items():
+        key_path = metadata.planner_data[key]
+        if key_path[0] != optimizer_key:
+            continue
+
+        if isinstance(value, BytesStorageMetadata):
+            state_dict[key] = "<bytes_io>"
+            continue
+
+        # value: TensorStorageMetadata
+        if value.size.numel() == 1:
+            state_dict[key] = _alloc_tensor(value.properties, value.size)
+        elif dp_pg is None:
+            state_dict[key] = _shard_tensor(
+                _alloc_tensor(value.properties, value.size), sharding_spec
+            )
+        else:
+            spec_key = key_path[2]
+            alloc_size = layout_specs.get(spec_key, (None, value.size))[1]
+
+            st_md = sharding_spec.build_metadata(
+                torch.Size(alloc_size), value.properties
+            )
+            local_shards = []
+            current_rank = dist.get_rank(dp_pg)
+            for shard_md in st_md.shards_metadata:
+                if (
+                    cast(_remote_device, shard_md.placement).rank()
+                    != current_rank
+                ):
+                    continue
+                local_shards.append(
+                    Shard(
+                        tensor=_alloc_tensor(
+                            value.properties, shard_md.shard_sizes
+                        ),
+                        metadata=shard_md,
+                    )
+                )
+
+            st = ShardedTensor._init_from_local_shards_and_global_metadata(
+                local_shards, st_md, process_group=dp_pg
+            )
+
+            if (
+                spec_key in layout_specs
+                and layout_specs[spec_key][0] is not None
+            ):
+                fqn_to_offset[key] = cast(
+                    Sequence[int], layout_specs[spec_key][0]
+                )
+
+            state_dict[key] = st
+
+    # Whether we unflatten before or after doesn't matter
+    dist_cp.load_state_dict(
+        state_dict=state_dict,
+        storage_reader=storage_reader,
+        # FIXME the type of planner is wrong in load_state_dict
+        planner=_ReaderWithOffset(fqn_to_offset) if dp_pg is not None else None,
+    )
+
+    state_dict = unflatten_state_dict(state_dict, metadata.planner_data)
+
+    return state_dict
diff --git a/torch/testing/_internal/distributed/checkpoint_utils.py b/torch/testing/_internal/distributed/checkpoint_utils.py
index bcdcb7b2818f3..1a6e43a038c93 100644
--- a/torch/testing/_internal/distributed/checkpoint_utils.py
+++ b/torch/testing/_internal/distributed/checkpoint_utils.py
@@ -21,7 +21,7 @@ def wrapper(self, *args: Tuple[object], **kwargs: Dict[str, Any]) -> None:
         # Only create temp_dir when rank is 0
         if dist.get_rank() == 0:
             temp_dir = tempfile.mkdtemp()
-            print(f"Using temp directory: {self.temp_dir }")
+            print(f"Using temp directory: {temp_dir}")
         else:
             temp_dir = ""
         object_list = [temp_dir]

From 9e39686d3ca63e159f3318b05c090895307731db Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Wed, 7 Dec 2022 17:05:17 -0500
Subject: [PATCH 1707/1922] Always compile tiny graphs with AOTAutograd
 (#89775)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89775
Approved by: https://github.com/anjali411, https://github.com/bdhirsh
---
 torch/_dynamo/optimizations/training.py | 7 +------
 torch/_inductor/compile_fx.py           | 7 -------
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/torch/_dynamo/optimizations/training.py b/torch/_dynamo/optimizations/training.py
index a1b22a7f6c313..7be7c1723b1b4 100644
--- a/torch/_dynamo/optimizations/training.py
+++ b/torch/_dynamo/optimizations/training.py
@@ -23,7 +23,7 @@
 from torch.utils._pytree import tree_map
 
 from .. import config, eval_frame
-from ..utils import clone_inputs, count_calls, counters
+from ..utils import clone_inputs, counters
 from .backends import BACKENDS
 from .normalize import normalize_ir
 
@@ -42,11 +42,6 @@ def compiler_fn(gm: torch.fx.GraphModule, example_inputs):
         functorch.compile.config.use_functionalize = True
         functorch.compile.config.use_fake_tensor = True
 
-        force_compile_tiny_graphs = kwargs.pop("force_compile_tiny_graphs", False)
-
-        if count_calls(gm.graph) < 2 and not force_compile_tiny_graphs:
-            return gm  # no point for tiny graphs
-
         counters["aot_autograd"]["total"] += 1
         use_fallback = False
 
diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py
index 15b5d5910895f..3b4ce9c202e5c 100644
--- a/torch/_inductor/compile_fx.py
+++ b/torch/_inductor/compile_fx.py
@@ -398,13 +398,6 @@ def bw_compiler(model: torch.fx.GraphModule, example_inputs):
             partition_fn=functools.partial(
                 min_cut_rematerialization_partition, compiler="inductor"
             ),
-            # A "tiny" graph can actually decompose into multiple
-            # operators (if it's a decomposition) and inductor can
-            # do a better job on it in this case
-            #
-            # Also, for some reason, test_comprehensive___rmatmul___cpu
-            # fails without forcing a compile lol.
-            force_compile_tiny_graphs=True,
         )(model_, example_inputs_)
 
 
From 83c7fbd81a3367011d02f1574a07c9cb98a8cefe Mon Sep 17 00:00:00 2001
From: "Han Qi (qihqi)" <qihan@meta.com>
Date: Thu, 8 Dec 2022 03:48:04 +0000
Subject: [PATCH 1708/1922] Clean up dependancy for flatbuffer_loader (#86041)

Test Plan: waitforsandcastle

Differential Revision: D38445936

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86041
Approved by: https://github.com/cccclai
---
 BUILD.bazel                                   | 17 ++--
 buckbuild.bzl                                 | 60 +++++++-------
 build_variables.bzl                           | 82 +++++++++++--------
 caffe2/CMakeLists.txt                         |  3 +-
 test/cpp/jit/test_lite_interpreter.cpp        |  2 -
 test/cpp/jit/test_lite_trainer.cpp            |  3 -
 torch/csrc/init_flatbuffer_module.cpp         |  3 +-
 .../mobile/compatibility/backport_manager.cpp |  2 -
 .../compatibility/model_compatibility.cpp     |  9 +-
 torch/csrc/jit/mobile/flatbuffer_loader.cpp   | 11 +--
 torch/csrc/jit/mobile/flatbuffer_loader.h     | 17 +++-
 torch/csrc/jit/mobile/import.cpp              | 35 ++------
 torch/csrc/jit/mobile/import.h                | 13 ---
 torch/csrc/jit/mobile/import_data.cpp         | 11 +--
 torch/csrc/jit/mobile/train/export_data.cpp   | 10 +--
 torch/csrc/jit/serialization/export.h         | 15 +++-
 .../csrc/jit/serialization/export_module.cpp  | 41 +++++++---
 .../serialization/flatbuffer_serializer.cpp   |  9 +-
 .../jit/serialization/flatbuffer_serializer.h |  9 +-
 .../flatbuffer_serializer_jit.cpp             | 78 ------------------
 .../serialization/flatbuffer_serializer_jit.h | 29 -------
 torch/csrc/jit/serialization/import.cpp       | 54 ++++++++----
 torch/csrc/jit/serialization/import.h         | 34 +++++---
 23 files changed, 231 insertions(+), 316 deletions(-)

diff --git a/BUILD.bazel b/BUILD.bazel
index 172a31723a0bf..938630e2e2bd2 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1648,6 +1648,17 @@ cu_library(
     deps = [":torch_headers"],
 )
 
+torch_sources = ({
+    k: "" for k in (
+    libtorch_core_sources +
+    libtorch_distributed_sources +
+    torch_cpp_srcs +
+    libtorch_extra_sources +
+    jit_core_sources +
+    lazy_tensor_ts_sources +
+    GENERATED_AUTOGRAD_CPP)
+}).keys()
+
 cc_library(
     name = "torch",
     srcs = if_cuda(glob(
@@ -1657,11 +1668,7 @@ cc_library(
             "torch/csrc/cuda/nccl.cpp",
             "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
         ],
-    )) + libtorch_core_sources + libtorch_distributed_sources + torch_cpp_srcs + libtorch_extra_sources + jit_core_sources + lazy_tensor_ts_sources + GENERATED_AUTOGRAD_CPP + [
-        "torch/csrc/jit/serialization/flatbuffer_serializer.cpp",
-        "torch/csrc/jit/mobile/flatbuffer_loader.cpp",
-        "torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp",
-    ],
+    )) + torch_sources,
     copts = TORCH_COPTS,
     defines = [
         "CAFFE2_NIGHTLY_VERSION=20200115",
diff --git a/buckbuild.bzl b/buckbuild.bzl
index 1fbae66c62910..f84f21cd4d111 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -17,7 +17,7 @@ load(
     "aten_cpu_source_list",
     "aten_native_source_list",
     "core_sources_common",
-    "core_sources_full_mobile_no_backend_interface",
+    "core_sources_full_mobile_no_backend_interface_xplat",
     "core_trainer_sources",
     "jit_core_headers",
     "jit_core_sources",
@@ -755,14 +755,13 @@ def get_pt_operator_registry_dict(
             "pt_operator_registry",
         ],
         deps = [
-                   # need absolute path here
-                   ROOT + ":torch_mobile_core",
-                   ROOT + ":aten_cpu",
-                   ROOT + ":aten_metal_prepack_header",
-                   third_party("glog"),
-                   C10,
-               ] + ([ROOT + ":torch_mobile_train"] if train else []) +
-               ([ROOT + ":flatbuffers_mobile"] if enable_flatbuffer else []),
+            # need absolute path here
+            ROOT + ":torch_mobile_core",
+            ROOT + ":aten_cpu",
+            ROOT + ":aten_metal_prepack_header",
+            third_party("glog"),
+            C10,
+        ] + ([ROOT + ":torch_mobile_train"] if train else []),
         **kwargs
     )
 
@@ -1303,12 +1302,14 @@ def define_buck_targets(
         name = "torch_mobile_deserialize",
         srcs = [
             "torch/csrc/jit/mobile/import.cpp",
+            "torch/csrc/jit/mobile/flatbuffer_loader.cpp",
         ],
         compiler_flags = get_pt_compiler_flags(),
-        exported_preprocessor_flags = get_pt_preprocessor_flags(),
+        exported_preprocessor_flags = get_pt_preprocessor_flags() + (["-DFB_XPLAT_BUILD"] if not IS_OSS else []),
         header_namespace = "",
         exported_headers = [
             "torch/csrc/jit/mobile/import.h",
+            "torch/csrc/jit/mobile/flatbuffer_loader.h",
         ],
         # torch_mobile_deserialize brings in sources neccessary to read a module
         # which depends on mobile module definition
@@ -1331,6 +1332,7 @@ def define_buck_targets(
             ":torch_mobile_module",
             ":torch_mobile_observer",
             ":torch_mobile_deserialize_common",
+            ":mobile_bytecode",
             C10,
         ],
     )
@@ -1390,11 +1392,7 @@ def define_buck_targets(
 
     pt_xplat_cxx_library(
         name = "torch_core",
-        srcs = core_sources_full_mobile_no_backend_interface + [
-            "torch/csrc/api/src/jit.cpp",
-            "torch/csrc/jit/serialization/export_bytecode.cpp",
-            "torch/csrc/jit/serialization/export_module.cpp",
-        ],
+        srcs = core_sources_full_mobile_no_backend_interface_xplat,
         compiler_flags = get_pt_compiler_flags(),
         exported_preprocessor_flags = get_pt_preprocessor_flags(),
         visibility = [
@@ -1443,6 +1441,7 @@ def define_buck_targets(
             ":torch_core",
             ":torch_mobile_deserialize",
             ":torch_mobile_train",
+            ":jit_module_saving",
             C10,
         ],
     )
@@ -1474,6 +1473,7 @@ def define_buck_targets(
             ":generated-autograd-headers",
             ":torch_headers",
             ":torch_mobile_deserialize",
+            ":flatbuffers_serializer_mobile",
             C10,
         ],
     )
@@ -1563,15 +1563,16 @@ def define_buck_targets(
             "torch/csrc/jit/serialization/export_module.cpp",
         ],
         compiler_flags = get_pt_compiler_flags(),
-        exported_preprocessor_flags = get_pt_preprocessor_flags(),
+        exported_preprocessor_flags = get_pt_preprocessor_flags() +
+                                      (["-DFB_XPLAT_BUILD"] if not IS_OSS else []),
         exported_headers = [
             "torch/csrc/jit/serialization/export.h",
-            "torch/csrc/jit/serialization/flatbuffer_serializer_jit.h",
         ],
         visibility = ["PUBLIC"],
         deps = [
             ":torch",
             ":torch_mobile_core",
+            ":flatbuffers_serializer_mobile",
         ],
     )
 
@@ -1618,6 +1619,7 @@ def define_buck_targets(
         ]),
     )
 
+    #TODO(qihan) delete
     pt_xplat_cxx_library(
         name = "torch_mobile_core_flatbuffer",
         srcs = [],
@@ -1639,9 +1641,7 @@ def define_buck_targets(
         exported_deps = [
             ":aten_cpu",
             ":torch_common",
-        ] + ([] if IS_OSS else [
-            "//xplat/caffe2/fb/runtime:torch_mobile_deserialize_flatbuffer",
-        ]),
+        ],
     )
 
     fb_xplat_cxx_library(
@@ -1739,7 +1739,7 @@ def define_buck_targets(
         # the internals of the loader/serializer layer.
         visibility = [
             "{}:flatbuffer_loader".format(ROOT),
-            "{}:flatbuffer_serializer_mobile".format(ROOT),
+            "{}:flatbuffers_serializer_mobile".format(ROOT),
         ],
         exported_deps = [
             third_party("flatbuffers-api"),
@@ -1766,14 +1766,15 @@ def define_buck_targets(
             C10,
         ],
         exported_deps = [
-            ":torch_mobile_train",
+            ":torch_mobile_deserialize",
+            ":mobile_bytecode",
         ],
     )
 
+    # TODO (qihan) delete
     pt_xplat_cxx_library(
         name = "flatbuffer_loader",
         srcs = [
-            "torch/csrc/jit/mobile/flatbuffer_loader.cpp",
         ],
         exported_headers = [
             "torch/csrc/jit/mobile/flatbuffer_loader.h",
@@ -1803,17 +1804,13 @@ def define_buck_targets(
             ":mobile_bytecode",
         ],
         exported_deps = [
-            ":torch_mobile_deserialize",
             C10,
         ],
     )
 
+    # TODO(qihan) delete
     fb_xplat_cxx_library(
         name = "flatbuffers_serializer_jit",
-        srcs = ["torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp"],
-        exported_headers = [
-            "torch/csrc/jit/serialization/flatbuffer_serializer_jit.h",
-        ],
         compiler_flags = [
             "-g0",
             "-O3",
@@ -1821,6 +1818,12 @@ def define_buck_targets(
             "-frtti",
             "-Wno-deprecated-declarations",
         ],
+        headers = [
+            "torch/csrc/jit/serialization/flatbuffer_serializer_jit.h",
+        ],
+        srcs = [
+            "torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp",
+        ],
         linker_flags = [
             "-Wl,--no-as-needed",
         ],
@@ -1850,6 +1853,7 @@ def define_buck_targets(
         exported_deps = [
             ":flatbuffer_loader",
             ":flatbuffers_serializer_mobile",
+            ":torch_mobile_train",
         ],
     )
 
diff --git a/build_variables.bzl b/build_variables.bzl
index 63010082137ee..aa0a8f8856c67 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -177,7 +177,28 @@ core_trainer_sources = [
     "torch/csrc/jit/serialization/type_name_uniquer.cpp",
 ]
 
-core_sources_full_mobile_no_backend_interface = [
+torch_mobile_core = [
+    # backend_debug_info.cpp provides
+    # __torch__.torch.classes.backend.BackendDebugInfo class
+    # This should not be needed eventually.
+    # TODO: Remove this dependency
+    "torch/csrc/jit/backends/backend_debug_info.cpp",
+    "torch/csrc/jit/mobile/compatibility/model_compatibility.cpp",
+    "torch/csrc/jit/mobile/function.cpp",
+    "torch/csrc/jit/mobile/import.cpp",
+    "torch/csrc/jit/mobile/flatbuffer_loader.cpp",
+    "torch/csrc/jit/mobile/interpreter.cpp",
+    "torch/csrc/jit/mobile/module.cpp",
+    "torch/csrc/jit/mobile/observer.cpp",
+    "torch/csrc/jit/mobile/parse_bytecode.cpp",
+    "torch/csrc/jit/mobile/parse_operators.cpp",
+    "torch/csrc/jit/mobile/quantization.cpp",
+    "torch/csrc/jit/mobile/upgrader_mobile.cpp",
+    "torch/csrc/jit/runtime/register_prim_ops.cpp",
+    "torch/csrc/jit/runtime/register_special_ops.cpp",
+]
+
+core_sources_full_mobile_no_backend_interface_xplat = [
     "torch/csrc/jit/api/function_impl.cpp",
     "torch/csrc/jit/api/module.cpp",
     "torch/csrc/jit/api/object.cpp",
@@ -385,6 +406,26 @@ core_sources_full_mobile_no_backend_interface = [
     "torch/csrc/utils/variadic.cpp",
 ]
 
+core_sources_full_mobile_no_backend_interface = core_sources_full_mobile_no_backend_interface_xplat + [
+    # backend_debug_info.cpp provides
+    # __torch__.torch.classes.backend.BackendDebugInfo class
+    # This should not be needed eventually.
+    # TODO: Remove this dependency
+    "torch/csrc/jit/backends/backend_debug_info.cpp",
+    "torch/csrc/jit/mobile/compatibility/model_compatibility.cpp",
+    "torch/csrc/jit/mobile/function.cpp",
+    "torch/csrc/jit/mobile/import.cpp",
+    "torch/csrc/jit/mobile/flatbuffer_loader.cpp",
+    "torch/csrc/jit/mobile/interpreter.cpp",
+    "torch/csrc/jit/mobile/module.cpp",
+    "torch/csrc/jit/mobile/observer.cpp",
+    "torch/csrc/jit/mobile/parse_bytecode.cpp",
+    "torch/csrc/jit/mobile/parse_operators.cpp",
+    "torch/csrc/jit/mobile/quantization.cpp",
+    "torch/csrc/jit/mobile/upgrader_mobile.cpp",
+]
+
+
 core_sources_full_mobile = core_sources_full_mobile_no_backend_interface + [
     "torch/csrc/jit/backends/backend_debug_info.cpp",
     "torch/csrc/jit/backends/backend_interface.cpp",
@@ -563,28 +604,6 @@ torch_mobile_tracer_sources = [
     "torch/csrc/jit/mobile/model_tracer/BuildFeatureTracer.cpp",
 ]
 
-torch_mobile_core = [
-    # backend_debug_info.cpp provides
-    # __torch__.torch.classes.backend.BackendDebugInfo class
-    # This should not be needed eventually.
-    # TODO: Remove this dependency
-    "torch/csrc/jit/backends/backend_debug_info.cpp",
-    "torch/csrc/jit/mobile/compatibility/model_compatibility.cpp",
-    # TODO: This line needs to be uncommented to build mobile in OSS with flatbuffers
-    # "torch/csrc/jit/mobile/flatbuffer_loader.cpp",
-    "torch/csrc/jit/mobile/function.cpp",
-    "torch/csrc/jit/mobile/import.cpp",
-    "torch/csrc/jit/mobile/interpreter.cpp",
-    "torch/csrc/jit/mobile/module.cpp",
-    "torch/csrc/jit/mobile/observer.cpp",
-    "torch/csrc/jit/mobile/parse_bytecode.cpp",
-    "torch/csrc/jit/mobile/parse_operators.cpp",
-    "torch/csrc/jit/mobile/quantization.cpp",
-    "torch/csrc/jit/mobile/upgrader_mobile.cpp",
-    "torch/csrc/jit/runtime/register_prim_ops.cpp",
-    "torch/csrc/jit/runtime/register_special_ops.cpp",
-]
-
 libtorch_lite_eager_symbolication = [
     "torch/csrc/jit/frontend/source_range.cpp",
     "torch/csrc/jit/ir/scope.cpp",
@@ -621,6 +640,7 @@ libtorch_extra_sources = libtorch_core_jit_sources + [
     # when it is built in libtorch
     "torch/csrc/jit/mobile/debug_info.cpp",
     "torch/csrc/jit/mobile/function.cpp",
+    "torch/csrc/jit/mobile/flatbuffer_loader.cpp",
     "torch/csrc/jit/mobile/import.cpp",
     "torch/csrc/jit/mobile/import_data.cpp",
     "torch/csrc/jit/mobile/interpreter.cpp",
@@ -638,24 +658,16 @@ libtorch_extra_sources = libtorch_core_jit_sources + [
     "torch/csrc/jit/serialization/export.cpp",
     "torch/csrc/jit/serialization/export_bytecode.cpp",
     "torch/csrc/jit/serialization/export_module.cpp",
+    "torch/csrc/jit/serialization/flatbuffer_serializer.cpp",
     "torch/csrc/jit/serialization/import_legacy.cpp",
     "torch/csrc/utils/byte_order.cpp",
     "torch/csrc/utils/out_types.cpp",
 ]
 
 def libtorch_sources(gencode_pattern = ":generate-code[{}]"):
-    enable_flatbuffer = bool(native.read_config("fbcode", "caffe2_enable_flatbuffer", None))
-    flatbuffer_serializer_sources = [
-        "torch/csrc/jit/serialization/flatbuffer_serializer.cpp",
-        "torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp",
-    ]
-    if enable_flatbuffer:
-        return (
-            libtorch_generated_sources(gencode_pattern) + libtorch_core_sources + libtorch_distributed_sources + libtorch_extra_sources +
-            flatbuffer_serializer_sources
-        )
-    else:
-        return libtorch_generated_sources(gencode_pattern) + libtorch_core_sources + libtorch_distributed_sources + libtorch_extra_sources
+    return (
+        libtorch_generated_sources(gencode_pattern) + libtorch_core_sources + libtorch_distributed_sources + libtorch_extra_sources
+    )
 
 libtorch_cuda_core_sources = [
     "torch/csrc/CudaIPCTypes.cpp",
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 4182797fc78e5..c1b7b65f2353e 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -492,6 +492,7 @@ if(BUILD_LITE_INTERPRETER)
   set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
 else()
   append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS)
+  list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS})
   if(BUILD_LAZY_TS_BACKEND)
     append_filelist("lazy_tensor_ts_sources" LIBTORCH_CMAKE_SRCS)
   endif()
@@ -564,6 +565,7 @@ if(NOT INTERN_DISABLE_MOBILE_INTERP)
      ${TORCH_SRC_DIR}/csrc/jit/mobile/train/random.cpp
      ${TORCH_SRC_DIR}/csrc/jit/mobile/train/sequential.cpp
      ${TORCH_SRC_DIR}/csrc/jit/mobile/upgrader_mobile.cpp
+     ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer.cpp
      )
   list(APPEND TORCH_SRCS ${MOBILE_SRCS})
   list(APPEND TORCH_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS})
@@ -599,7 +601,6 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
     ${TORCH_SRC_DIR}/csrc/jit/serialization/export_bytecode.cpp
     ${TORCH_SRC_DIR}/csrc/jit/serialization/export_module.cpp
     ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer.cpp
-    ${TORCH_SRC_DIR}/csrc/jit/serialization/flatbuffer_serializer_jit.cpp
     ${TORCH_SRC_DIR}/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp
     ${TORCH_SRC_DIR}/csrc/jit/api/module_save.cpp
     ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp
index 930b26076bbb1..c45ca96383e9f 100644
--- a/test/cpp/jit/test_lite_interpreter.cpp
+++ b/test/cpp/jit/test_lite_interpreter.cpp
@@ -17,7 +17,6 @@
 #include <torch/csrc/jit/mobile/parse_operators.h>
 #include <torch/csrc/jit/mobile/upgrader_mobile.h>
 #include <torch/csrc/jit/serialization/export.h>
-#include <torch/csrc/jit/serialization/flatbuffer_serializer_jit.h>
 #include <torch/csrc/jit/serialization/import.h>
 #include <torch/custom_class.h>
 #include <torch/torch.h>
@@ -680,7 +679,6 @@ void backportAllVersionCheck(
 
 #if !defined FB_XPLAT_BUILD
 TEST(LiteInterpreterTest, BackPortByteCodeModelAllVersions) {
-  torch::jit::register_flatbuffer_all();
   torch::jit::Module module("m");
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
   module.register_parameter("weight", torch::ones({20, 1, 5, 5}), false);
diff --git a/test/cpp/jit/test_lite_trainer.cpp b/test/cpp/jit/test_lite_trainer.cpp
index 10ba11dc1b4ae..311a818c4bfd0 100644
--- a/test/cpp/jit/test_lite_trainer.cpp
+++ b/test/cpp/jit/test_lite_trainer.cpp
@@ -10,7 +10,6 @@
 #include <torch/csrc/jit/mobile/train/optim/sgd.h>
 #include <torch/csrc/jit/mobile/train/random.h>
 #include <torch/csrc/jit/mobile/train/sequential.h>
-#include <torch/csrc/jit/serialization/flatbuffer_serializer_jit.h>
 #include <torch/csrc/jit/serialization/import.h>
 #include <torch/data/dataloader.h>
 #include <torch/torch.h>
@@ -175,7 +174,6 @@ TEST(MobileTest, SaveParametersDefaultsToZip) {
 
 TEST(MobileTest, SaveParametersCanUseFlatbuffer) {
   // Save some empty parameters using flatbuffer.
-  register_flatbuffer_all();
   std::map<std::string, at::Tensor> empty_parameters;
   std::stringstream ss_data;
   _save_parameters(empty_parameters, ss_data, /*use_flatbuffer=*/true);
@@ -192,7 +190,6 @@ TEST(MobileTest, SaveParametersCanUseFlatbuffer) {
 
 TEST(MobileTest, SaveLoadParametersUsingFlatbuffers) {
   // Create some simple parameters to save.
-  register_flatbuffer_all();
   std::map<std::string, at::Tensor> input_params;
   input_params["four_by_ones"] = 4 * torch::ones({});
   input_params["three_by_ones"] = 3 * torch::ones({});
diff --git a/torch/csrc/init_flatbuffer_module.cpp b/torch/csrc/init_flatbuffer_module.cpp
index f739f834dc293..96e69ea754cc1 100644
--- a/torch/csrc/init_flatbuffer_module.cpp
+++ b/torch/csrc/init_flatbuffer_module.cpp
@@ -16,8 +16,9 @@
 #include <torch/csrc/jit/python/module_python.h>
 #include <torch/csrc/jit/python/python_ivalue.h>
 #include <torch/csrc/jit/python/python_sugared_value.h>
+#include <torch/csrc/jit/serialization/export.h>
 #include <torch/csrc/jit/serialization/flatbuffer_serializer.h>
-#include <torch/csrc/jit/serialization/flatbuffer_serializer_jit.h>
+#include <torch/csrc/jit/serialization/import.h>
 
 namespace py = pybind11;
 
diff --git a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
index 489084912445f..2bad08c0765a2 100644
--- a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
+++ b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp
@@ -7,7 +7,6 @@
 #include <torch/csrc/jit/mobile/import.h>
 #include <torch/csrc/jit/mobile/module.h>
 #include <torch/csrc/jit/serialization/export.h>
-#include <torch/csrc/jit/serialization/flatbuffer_serializer_jit.h>
 #include <torch/csrc/jit/serialization/import.h>
 #include <torch/csrc/jit/serialization/pickler.h>
 #include <cstddef>
@@ -504,7 +503,6 @@ std::stringstream backport_v7_to_v6(std::stringstream& input_model_stream) {
 
 std::stringstream backport_v9_to_v8(std::stringstream& input_model_stream) {
   ExtraFilesMap extra_files;
-  register_flatbuffer_all();
   Module torch_script =
       torch::jit::load(input_model_stream, c10::nullopt, extra_files);
   std::stringstream intermediate_model_stream;
diff --git a/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp b/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp
index 089c116179ef9..9ce71eba9ce75 100644
--- a/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp
+++ b/torch/csrc/jit/mobile/compatibility/model_compatibility.cpp
@@ -4,6 +4,7 @@
 #include <torch/csrc/jit/api/compilation_unit.h> // removed after using simple type_resolver/obj_loader
 #include <torch/csrc/jit/mobile/compatibility/model_compatibility.h>
 #include <torch/csrc/jit/mobile/file_format.h>
+#include <torch/csrc/jit/mobile/flatbuffer_loader.h>
 #include <torch/csrc/jit/mobile/import.h> // removed after using simple type_resolver/obj_loader
 #include <torch/csrc/jit/mobile/type_parser.h>
 #include <torch/csrc/jit/serialization/import_export_constants.h>
@@ -111,13 +112,7 @@ uint64_t _get_model_bytecode_version_from_bytes(char* data, size_t size) {
   auto format = getFileFormat(data);
   switch (format) {
     case FileFormat::FlatbufferFileFormat: {
-      if (get_flatbuffer_bytecode_version == nullptr) {
-        TORCH_CHECK(
-            false,
-            "Flatbuffer input file but the build hasn't enabled flatbuffer");
-      } else {
-        return get_flatbuffer_bytecode_version(data);
-      }
+      return get_bytecode_version_from_bytes(data);
     }
     case FileFormat::ZipFileFormat: {
       auto rai =
diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.cpp b/torch/csrc/jit/mobile/flatbuffer_loader.cpp
index 29c29925ef099..ec18e489b5cd7 100644
--- a/torch/csrc/jit/mobile/flatbuffer_loader.cpp
+++ b/torch/csrc/jit/mobile/flatbuffer_loader.cpp
@@ -1,5 +1,3 @@
-#include <torch/csrc/jit/mobile/flatbuffer_loader.h>
-
 #ifdef FLATBUFFERS_VERSION_MAJOR
 #error "flatbuffer_loader.h must not include any flatbuffers headers"
 #endif // FLATBUFFERS_VERSION_MAJOR
@@ -24,8 +22,8 @@
 #include <c10/util/Optional.h>
 #include <c10/util/ScopeExit.h>
 #include <caffe2/serialize/inline_container.h>
-#include <torch/csrc/jit/frontend/script_type_parser.h>
 #include <torch/csrc/jit/mobile/file_format.h>
+#include <torch/csrc/jit/mobile/flatbuffer_loader.h>
 #include <torch/csrc/jit/mobile/function.h>
 #include <torch/csrc/jit/mobile/import.h>
 #include <torch/csrc/jit/mobile/interpreter.h>
@@ -882,7 +880,6 @@ mobile::Module load_mobile_module_from_stream_with_copy(
       std::move(data), size, device, extra_files);
 }
 
-namespace {
 mobile::Module parse_flatbuffer_no_object(
     std::shared_ptr<char> data,
     size_t size,
@@ -912,16 +909,10 @@ mobile::Module parse_flatbuffer_no_object(
   m.set_delete_memory(std::move(data));
   return m;
 }
-} // namespace
 
 bool register_flatbuffer_loader() {
-  load_flatbuffer_bytes = parse_and_initialize_mobile_module;
-  load_flatbuffer_bytes_no_object = parse_flatbuffer_no_object;
-  get_flatbuffer_bytecode_version = get_bytecode_version_from_bytes;
   return true;
 }
 
-const bool kRegisteredFlatbufferLoader = register_flatbuffer_loader();
-
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/mobile/flatbuffer_loader.h b/torch/csrc/jit/mobile/flatbuffer_loader.h
index eee44d4b647ed..f29fe5b2e4942 100644
--- a/torch/csrc/jit/mobile/flatbuffer_loader.h
+++ b/torch/csrc/jit/mobile/flatbuffer_loader.h
@@ -117,10 +117,19 @@ TORCH_API mobile::Module load_mobile_module_from_stream_with_copy(
     c10::optional<at::Device> device = c10::nullopt,
     ExtraFilesMap* extra_files = nullptr);
 
-// This function will make the capabilities to load
-// Module as a flatbuffer file available for use by _load_for_mobile
-// and friends. This is NOT needed if using the other functions
-// in this file directly.
+TORCH_API mobile::Module parse_flatbuffer_no_object(
+    std::shared_ptr<char> data,
+    size_t size,
+    c10::optional<at::Device> device);
+
+TORCH_API mobile::Module parse_and_initialize_mobile_module(
+    void* data,
+    size_t,
+    c10::optional<at::Device>,
+    ExtraFilesMap* extra_files,
+    bool should_copy_tensor_memory);
+
+// no op, TODO(qihan) delete
 TORCH_API bool register_flatbuffer_loader();
 
 } // namespace jit
diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp
index 2270418dbbcff..5acd5cab39854 100644
--- a/torch/csrc/jit/mobile/import.cpp
+++ b/torch/csrc/jit/mobile/import.cpp
@@ -5,6 +5,7 @@
 #include <ATen/core/ivalue.h>
 #include <ATen/core/qualified_name.h>
 #include <c10/util/Exception.h>
+#include <c10/util/Optional.h>
 #include <c10/util/ScopeExit.h>
 #include <c10/util/irange.h>
 #include <caffe2/serialize/in_memory_adapter.h>
@@ -13,6 +14,7 @@
 #include <caffe2/serialize/versions.h>
 #include <torch/csrc/jit/api/compilation_unit.h>
 #include <torch/csrc/jit/mobile/file_format.h>
+#include <torch/csrc/jit/mobile/flatbuffer_loader.h>
 #include <torch/csrc/jit/mobile/interpreter.h>
 #include <torch/csrc/jit/mobile/observer.h>
 #include <torch/csrc/jit/mobile/type_parser.h>
@@ -88,19 +90,6 @@ using caffe2::serialize::MemoryReadAdapter;
 using caffe2::serialize::PyTorchStreamReader;
 using caffe2::serialize::ReadAdapterInterface;
 
-mobile::Module (*load_flatbuffer_bytes)(
-    std::shared_ptr<char>,
-    size_t size,
-    c10::optional<at::Device>,
-    ExtraFilesMap*) = nullptr;
-
-mobile::Module (*load_flatbuffer_bytes_no_object)(
-    std::shared_ptr<char>,
-    size_t size,
-    c10::optional<at::Device>) = nullptr;
-
-uint64_t (*get_flatbuffer_bytecode_version)(char* flatbuffer_content) = nullptr;
-
 OpCode parseOpCode(const char* str);
 
 TypePtr resolveTypeNameMobile(
@@ -630,13 +619,8 @@ mobile::Module _load_mobile_from_bytes(
           std::move(rai), device, extra_files, module_load_options);
     }
     case FileFormat::FlatbufferFileFormat: {
-      if (load_flatbuffer_bytes != nullptr) {
-        return load_flatbuffer_bytes(data, size, device, &extra_files);
-      } else {
-        TORCH_CHECK(
-            false,
-            "Flatbuffer input file but the build hasn't enabled flatbuffer");
-      }
+      return parse_and_initialize_mobile_module(
+          data, size, device, &extra_files);
     }
     default: {
       TORCH_CHECK(false, "Format error");
@@ -726,16 +710,7 @@ void _load_extra_only_for_mobile(
       // TODO: the current flatbuffers implementation will always load the
       // whole module including the extra files. Ideally it should be
       // possible to just get the extra files given data
-      std::shared_ptr<char> data;
-      size_t size = 0;
-      std::tie(data, size) = get_file_content(filename.c_str());
-      if (load_flatbuffer_bytes != nullptr) {
-        load_flatbuffer_bytes(data, size, device, &extra_files);
-      } else {
-        TORCH_CHECK(
-            false,
-            "Flatbuffer input file but the build hasn't enabled flatbuffer");
-      }
+      load_mobile_module_from_file(filename, c10::nullopt, &extra_files);
       break;
     }
     default: {
diff --git a/torch/csrc/jit/mobile/import.h b/torch/csrc/jit/mobile/import.h
index b17a4bb341ca1..643ca57858a36 100644
--- a/torch/csrc/jit/mobile/import.h
+++ b/torch/csrc/jit/mobile/import.h
@@ -107,18 +107,5 @@ TORCH_API std::set<std::string> _export_operator_list(
 
 } // namespace mobile
 
-extern mobile::Module (*load_flatbuffer_bytes)(
-    std::shared_ptr<char>,
-    size_t size,
-    c10::optional<at::Device>,
-    ExtraFilesMap*);
-
-extern mobile::Module (*load_flatbuffer_bytes_no_object)(
-    std::shared_ptr<char>,
-    size_t size,
-    c10::optional<at::Device>);
-
-extern uint64_t (*get_flatbuffer_bytecode_version)(char* flatbuffer_content);
-
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/mobile/import_data.cpp b/torch/csrc/jit/mobile/import_data.cpp
index 01c6ea7ac579c..309b238a8d41b 100644
--- a/torch/csrc/jit/mobile/import_data.cpp
+++ b/torch/csrc/jit/mobile/import_data.cpp
@@ -7,6 +7,7 @@
 #include <caffe2/serialize/inline_container.h>
 #include <torch/csrc/jit/api/compilation_unit.h>
 #include <torch/csrc/jit/mobile/file_format.h>
+#include <torch/csrc/jit/mobile/flatbuffer_loader.h>
 #include <torch/csrc/jit/mobile/import.h>
 #include <torch/csrc/jit/mobile/import_export_common.h>
 #include <torch/csrc/jit/mobile/module.h>
@@ -247,14 +248,8 @@ std::map<std::string, at::Tensor> _load_parameters_bytes(
   std::map<std::string, at::Tensor> map;
   switch (format) {
     case FileFormat::FlatbufferFileFormat: {
-      if (load_flatbuffer_bytes_no_object != nullptr) {
-        auto m = load_flatbuffer_bytes_no_object(data, size, device);
-        map = mobile_module_to_parameter_map(m);
-      } else {
-        TORCH_CHECK(
-            false,
-            "Flatbuffer input file but the build hasn't enabled flatbuffer");
-      }
+      auto m = parse_flatbuffer_no_object(data, size, device);
+      map = mobile_module_to_parameter_map(m);
       break;
     }
 
diff --git a/torch/csrc/jit/mobile/train/export_data.cpp b/torch/csrc/jit/mobile/train/export_data.cpp
index da75ef991ba2c..731ffef15424a 100644
--- a/torch/csrc/jit/mobile/train/export_data.cpp
+++ b/torch/csrc/jit/mobile/train/export_data.cpp
@@ -3,6 +3,7 @@
 #include <torch/csrc/jit/mobile/import_export_common.h>
 #include <torch/csrc/jit/mobile/module.h>
 #include <torch/csrc/jit/runtime/instruction.h>
+#include <torch/csrc/jit/serialization/flatbuffer_serializer.h>
 #include <torch/csrc/jit/serialization/pickler.h>
 #include <torch/csrc/jit/serialization/type_name_uniquer.h>
 
@@ -131,14 +132,7 @@ void _save_parameters(
   };
 
   if (use_flatbuffer) {
-    if (_save_mobile_module_to != nullptr) {
-      _save_mobile_module_to(mobile::tensor_dict_to_mobile(dict), write_func);
-    } else {
-      TORCH_CHECK(
-          false,
-          "Trying to export as flatbuffer file but "
-          "the build hasn't enabled flatbuffer");
-    }
+    save_mobile_module_to_func(mobile::tensor_dict_to_mobile(dict), write_func);
   } else {
     // For Pickle, we only serialize the dict itself.
     mobile::IValuePickler pickler(write_func);
diff --git a/torch/csrc/jit/serialization/export.h b/torch/csrc/jit/serialization/export.h
index 06670a5716450..da5d5e6a70959 100644
--- a/torch/csrc/jit/serialization/export.h
+++ b/torch/csrc/jit/serialization/export.h
@@ -4,12 +4,12 @@
 #include <torch/csrc/jit/api/module.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/serialization/export_bytecode.h>
+#include <torch/csrc/jit/serialization/flatbuffer_serializer.h>
 #include <torch/csrc/jit/serialization/pickler.h>
 #include <torch/csrc/jit/serialization/python_print.h>
 #include <torch/csrc/jit/serialization/storage_context.h>
 #include <torch/csrc/jit/serialization/type_name_uniquer.h>
 #include <torch/csrc/onnx/onnx.h>
-
 #include <ostream>
 
 namespace ONNX_NAMESPACE {
@@ -260,9 +260,18 @@ Table(const std::vector<std::pair<std::string, IValue>>& entries);
 TORCH_API void enableMobileInterfaceCallExport();
 bool getMobileInterfaceCallExport();
 
-CompilationOptions getOptionsFromGlobal();
+TORCH_API CompilationOptions getOptionsFromGlobal();
+
+TORCH_API void save_jit_module(
+    const Module& module,
+    const std::string& filename,
+    const ExtraFilesMap& extra_files = ExtraFilesMap());
+
+TORCH_API DetachedBuffer::UniqueDetachedBuffer save_jit_module_to_bytes(
+    const Module& module,
+    const ExtraFilesMap& extra_files = ExtraFilesMap());
 
-extern void (*_save_jit_module_to)(
+TORCH_API void save_jit_module_to_write_func(
     const Module& module,
     const ExtraFilesMap& extra_files,
     bool save_mobile_debug_info,
diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp
index 90f9f9411b38e..0ff9b78478462 100644
--- a/torch/csrc/jit/serialization/export_module.cpp
+++ b/torch/csrc/jit/serialization/export_module.cpp
@@ -16,6 +16,7 @@
 #include <torch/csrc/jit/runtime/instruction.h>
 #include <torch/csrc/jit/serialization/callstack_debug_info_serialization.h>
 #include <torch/csrc/jit/serialization/export_bytecode.h>
+#include <torch/csrc/jit/serialization/flatbuffer_serializer.h>
 #include <torch/csrc/jit/serialization/import_export_constants.h>
 #include <torch/csrc/jit/serialization/import_export_functions.h>
 #include <torch/csrc/jit/serialization/import_export_helpers.h>
@@ -874,11 +875,37 @@ void ExportModule(
       use_flatbuffer);
 }
 
-void (*_save_jit_module_to)(
+void save_jit_module(
+    const Module& module,
+    const std::string& filename,
+    const ExtraFilesMap& extra_files) {
+  auto buffer = save_jit_module_to_bytes(module, extra_files);
+  std::fstream ofile(filename, std::ios::binary | std::ios::out);
+  ofile.write(
+      reinterpret_cast<char*>(buffer->data()), buffer->size()); // NOLINT
+  ofile.close();
+}
+
+DetachedBuffer::UniqueDetachedBuffer save_jit_module_to_bytes(
+    const Module& module,
+    const ExtraFilesMap& extra_files) {
+  ExtraFilesMap jitfiles;
+  std::vector<IValue> constants;
+  jitModuleToPythonCodeAndConstants(module, &jitfiles, &constants);
+  CompilationOptions options = getOptionsFromGlobal();
+  mobile::Module mobilem = jitModuleToMobile(module, options);
+  return save_mobile_module_to_bytes(mobilem, extra_files, jitfiles, constants);
+}
+
+void save_jit_module_to_write_func(
     const Module& module,
     const ExtraFilesMap& extra_files,
     bool save_mobile_debug_info,
-    const std::function<size_t(const void*, size_t)>& writer_func) = nullptr;
+    const std::function<size_t(const void*, size_t)>& writer_func) {
+  (void)save_mobile_debug_info;
+  auto buffer = save_jit_module_to_bytes(module, extra_files);
+  writer_func(reinterpret_cast<void*>(buffer->data()), buffer->size());
+}
 
 void ExportModule(
     const Module& module,
@@ -888,14 +915,8 @@ void ExportModule(
     bool save_mobile_debug_info,
     bool use_flatbuffer) {
   if (use_flatbuffer) {
-    if (_save_jit_module_to != nullptr) {
-      _save_jit_module_to(
-          module, extra_files, save_mobile_debug_info, writer_func);
-    } else {
-      TORCH_CHECK(
-          false,
-          "Trying to export as flatbuffer file but the build hasn't enabled flatbuffer");
-    }
+    save_jit_module_to_write_func(
+        module, extra_files, save_mobile_debug_info, writer_func);
   } else {
     caffe2::serialize::PyTorchStreamWriter writer(writer_func);
     ScriptModuleSerializer serializer(writer);
diff --git a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp
index 54ec7c7b6ed3e..ccacf7beab846 100644
--- a/torch/csrc/jit/serialization/flatbuffer_serializer.cpp
+++ b/torch/csrc/jit/serialization/flatbuffer_serializer.cpp
@@ -819,7 +819,7 @@ DetachedBuffer::UniqueDetachedBuffer save_mobile_module_to_bytes(
   return DetachedBufferFriend::make_unique_detached_buffer(ret);
 }
 
-static void save_mobile_module_to_func(
+void save_mobile_module_to_func(
     const mobile::Module& module,
     const std::function<size_t(const void*, size_t)>& writer_func) {
   auto buffer = save_mobile_module_to_bytes(module);
@@ -827,15 +827,8 @@ static void save_mobile_module_to_func(
 }
 
 bool register_flatbuffer_serializer() {
-  _save_mobile_module_to = save_mobile_module_to_func;
   return true;
 }
 
-// iOS builds are often build with -Wglobal-constructor to minimize
-// startup time. So let them call register manually if needed.
-#if !defined(__APPLE__)
-const bool kFlatbufferSerializerRegistered = register_flatbuffer_serializer();
-#endif
-
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/serialization/flatbuffer_serializer.h b/torch/csrc/jit/serialization/flatbuffer_serializer.h
index 24da6b5527922..43e8062ef2dce 100644
--- a/torch/csrc/jit/serialization/flatbuffer_serializer.h
+++ b/torch/csrc/jit/serialization/flatbuffer_serializer.h
@@ -83,10 +83,11 @@ TORCH_API DetachedBuffer::UniqueDetachedBuffer save_mobile_module_to_bytes(
     const ExtraFilesMap& jit_sources = ExtraFilesMap(),
     const std::vector<IValue>& jit_constants = {});
 
-// This function will make the capabilities to load and safe
-// Module as a flatbuffer file available for use by _load_for_mobile
-// and friends. This is NOT needed if using the other functions
-// in this file directly.
+TORCH_API void save_mobile_module_to_func(
+    const mobile::Module& module,
+    const std::function<size_t(const void*, size_t)>& writer_func);
+
+// TODO(qihan): delete
 TORCH_API bool register_flatbuffer_serializer();
 
 } // namespace jit
diff --git a/torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp b/torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp
index 321068311da25..9cbb0f1cd2f80 100644
--- a/torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp
+++ b/torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp
@@ -15,87 +15,9 @@
 namespace torch {
 namespace jit {
 
-Module parse_and_initialize_jit_module(
-    std::shared_ptr<char> data,
-    size_t size,
-    ExtraFilesMap& extra_files,
-    c10::optional<at::Device> device) {
-  populate_upgraders_graph_map();
-  ExtraFilesMap jit_files;
-  std::vector<IValue> jit_constants;
-  mobile::Module mobilem = parse_and_initialize_mobile_module_for_jit(
-      data.get(), size, jit_files, jit_constants, device, &extra_files);
-
-  Module m = jitModuleFromSourceAndConstants(
-      mobilem._ivalue(),
-      jit_files,
-      jit_constants,
-      static_cast<int32_t>(mobilem.bytecode_version()));
-  m.set_delete_memory(data);
-  return m;
-}
-
-Module load_jit_module_from_file(
-    const std::string& filename,
-    ExtraFilesMap& extra_files,
-    c10::optional<at::Device> device) {
-  auto data = get_file_content(filename.c_str());
-  return parse_and_initialize_jit_module(
-      std::move(std::get<0>(data)), std::get<1>(data), extra_files, device);
-}
-
-Module load_jit_module_from_stream(
-    std::istream& in,
-    ExtraFilesMap& extra_files,
-    c10::optional<at::Device> device) {
-  auto data = get_stream_content(in);
-  return parse_and_initialize_jit_module(
-      std::move(std::get<0>(data)), std::get<1>(data), extra_files, device);
-}
-
-void save_jit_module(
-    const Module& module,
-    const std::string& filename,
-    const ExtraFilesMap& extra_files) {
-  auto buffer = save_jit_module_to_bytes(module, extra_files);
-  std::fstream ofile(filename, std::ios::binary | std::ios::out);
-  ofile.write(
-      reinterpret_cast<char*>(buffer->data()), buffer->size()); // NOLINT
-  ofile.close();
-}
-
-DetachedBuffer::UniqueDetachedBuffer save_jit_module_to_bytes(
-    const Module& module,
-    const ExtraFilesMap& extra_files) {
-  ExtraFilesMap jitfiles;
-  std::vector<IValue> constants;
-  jitModuleToPythonCodeAndConstants(module, &jitfiles, &constants);
-  CompilationOptions options = getOptionsFromGlobal();
-  mobile::Module mobilem = jitModuleToMobile(module, options);
-  return save_mobile_module_to_bytes(mobilem, extra_files, jitfiles, constants);
-}
-
-static void save_jit_module_to_write_func(
-    const Module& module,
-    const ExtraFilesMap& extra_files,
-    bool save_mobile_debug_info,
-    const std::function<size_t(const void*, size_t)>& writer_func) {
-  (void)save_mobile_debug_info;
-  auto buffer = save_jit_module_to_bytes(module, extra_files);
-  writer_func(reinterpret_cast<void*>(buffer->data()), buffer->size());
-}
-
 bool register_flatbuffer_all() {
-  (void)register_flatbuffer_loader();
-  (void)register_flatbuffer_serializer();
-  _save_jit_module_to = save_jit_module_to_write_func;
-  _load_jit_module_from_flatbuffer_bytes = parse_and_initialize_jit_module;
   return true;
 }
 
-#if !defined(__APPLE__)
-const bool kFlatbufferSerializerJitInitialized = register_flatbuffer_all();
-#endif
-
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/serialization/flatbuffer_serializer_jit.h b/torch/csrc/jit/serialization/flatbuffer_serializer_jit.h
index 1f605f18ba1e5..b43ab831f1773 100644
--- a/torch/csrc/jit/serialization/flatbuffer_serializer_jit.h
+++ b/torch/csrc/jit/serialization/flatbuffer_serializer_jit.h
@@ -5,35 +5,6 @@
 namespace torch {
 namespace jit {
 
-TORCH_API void save_jit_module(
-    const Module& module,
-    const std::string& filename,
-    const ExtraFilesMap& extra_files = ExtraFilesMap());
-
-TORCH_API DetachedBuffer::UniqueDetachedBuffer save_jit_module_to_bytes(
-    const Module& module,
-    const ExtraFilesMap& extra_files = ExtraFilesMap());
-
-TORCH_API Module parse_and_initialize_jit_module(
-    std::shared_ptr<char> data,
-    size_t size,
-    ExtraFilesMap& extra_files,
-    c10::optional<at::Device> device = c10::nullopt);
-
-TORCH_API Module load_jit_module_from_file(
-    const std::string& filename,
-    ExtraFilesMap& extra_files,
-    c10::optional<at::Device> device = c10::nullopt);
-
-TORCH_API Module load_jit_module_from_stream(
-    std::istream& in,
-    ExtraFilesMap& extra_files,
-    c10::optional<at::Device> device = c10::nullopt);
-
-// This function will make the capabilities to load and safe
-// Module as a flatbuffer file available for use by _load_for_mobile
-// and friends. This is NOT needed if using the other functions
-// in this file directly.
 TORCH_API bool register_flatbuffer_all();
 
 } // namespace jit
diff --git a/torch/csrc/jit/serialization/import.cpp b/torch/csrc/jit/serialization/import.cpp
index b79d29726bef1..56087f1fe0d3b 100644
--- a/torch/csrc/jit/serialization/import.cpp
+++ b/torch/csrc/jit/serialization/import.cpp
@@ -24,6 +24,7 @@
 #include <torch/csrc/jit/frontend/script_type_parser.h>
 #include <torch/csrc/jit/ir/ir.h>
 #include <torch/csrc/jit/mobile/file_format.h>
+#include <torch/csrc/jit/mobile/flatbuffer_loader.h>
 #include <torch/csrc/jit/operator_upgraders/upgraders_entry.h>
 #include <torch/csrc/jit/passes/subgraph_rewrite.h>
 #include <torch/csrc/jit/serialization/import_read.h>
@@ -294,12 +295,6 @@ Module import_ir_module(
   return import_ir_module(std::move(cu), in, device, extra_files);
 }
 
-Module (*_load_jit_module_from_flatbuffer_bytes)(
-    std::shared_ptr<char>,
-    size_t,
-    ExtraFilesMap&,
-    c10::optional<at::Device>) = nullptr;
-
 static Module _load_jit_module_from_bytes(
     std::shared_ptr<char> data,
     size_t size,
@@ -307,6 +302,44 @@ static Module _load_jit_module_from_bytes(
     c10::optional<c10::Device> device,
     ExtraFilesMap& extra_files);
 
+Module parse_and_initialize_jit_module(
+    std::shared_ptr<char> data,
+    size_t size,
+    ExtraFilesMap& extra_files,
+    c10::optional<at::Device> device) {
+  populate_upgraders_graph_map();
+  ExtraFilesMap jit_files;
+  std::vector<IValue> jit_constants;
+  mobile::Module mobilem = parse_and_initialize_mobile_module_for_jit(
+      data.get(), size, jit_files, jit_constants, device, &extra_files);
+
+  Module m = jitModuleFromSourceAndConstants(
+      mobilem._ivalue(),
+      jit_files,
+      jit_constants,
+      static_cast<int32_t>(mobilem.bytecode_version()));
+  m.set_delete_memory(data);
+  return m;
+}
+
+Module load_jit_module_from_file(
+    const std::string& filename,
+    ExtraFilesMap& extra_files,
+    c10::optional<at::Device> device) {
+  auto data = get_file_content(filename.c_str());
+  return parse_and_initialize_jit_module(
+      std::move(std::get<0>(data)), std::get<1>(data), extra_files, device);
+}
+
+Module load_jit_module_from_stream(
+    std::istream& in,
+    ExtraFilesMap& extra_files,
+    c10::optional<at::Device> device) {
+  auto data = get_stream_content(in);
+  return parse_and_initialize_jit_module(
+      std::move(std::get<0>(data)), std::get<1>(data), extra_files, device);
+}
+
 Module import_ir_module(
     std::shared_ptr<CompilationUnit> cu,
     std::istream& in,
@@ -448,14 +481,7 @@ Module _load_jit_module_from_bytes(
   auto format = getFileFormat(data.get());
   switch (format) {
     case FileFormat::FlatbufferFileFormat: {
-      if (_load_jit_module_from_flatbuffer_bytes != nullptr) {
-        return _load_jit_module_from_flatbuffer_bytes(
-            data, size, extra_files, device);
-      } else {
-        TORCH_CHECK(
-            false,
-            "Flatbuffer input file but the build hasn't enable flatbuffer")
-      }
+      return parse_and_initialize_jit_module(data, size, extra_files, device);
     }
     case FileFormat::ZipFileFormat: {
       auto rai = std::make_unique<MemoryReadAdapter>(data.get(), size);
diff --git a/torch/csrc/jit/serialization/import.h b/torch/csrc/jit/serialization/import.h
index 581ad681a3d25..2b56914472b6c 100644
--- a/torch/csrc/jit/serialization/import.h
+++ b/torch/csrc/jit/serialization/import.h
@@ -110,19 +110,27 @@ TORCH_API Module jitModuleFromSourceAndConstants(
     const std::vector<IValue>& constants,
     int32_t version);
 
-extern Module (*_load_jit_module_from_flatbuffer_bytes)(
-    // comp unit
-    std::shared_ptr<char>,
-    size_t,
-    ExtraFilesMap&,
-    c10::optional<at::Device>);
-
-extern Module (*_load_jit_module_from_flatbuffer_bytes)(
-    // comp unit
-    std::shared_ptr<char>,
-    size_t,
-    ExtraFilesMap&,
-    c10::optional<at::Device>);
+TORCH_API Module parse_and_initialize_jit_module(
+    std::shared_ptr<char> data,
+    size_t size,
+    ExtraFilesMap& extra_files,
+    c10::optional<at::Device> device = c10::nullopt);
+
+TORCH_API Module load_jit_module_from_file(
+    const std::string& filename,
+    ExtraFilesMap& extra_files,
+    c10::optional<at::Device> device = c10::nullopt);
+
+TORCH_API Module load_jit_module_from_stream(
+    std::istream& in,
+    ExtraFilesMap& extra_files,
+    c10::optional<at::Device> device = c10::nullopt);
+
+TORCH_API Module parse_and_initialize_jit_module(
+    std::shared_ptr<char> data,
+    size_t size,
+    ExtraFilesMap& extra_files,
+    c10::optional<at::Device> device);
 
 } // namespace jit
 } // namespace torch

From d625385d0b4b8d65be62995a1319b2f0eeed4bec Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 8 Dec 2022 05:04:15 +0000
Subject: [PATCH 1709/1922] Revert "[Inductor] More robust stride and offset
 extraction from index expressions (#90184)"

This reverts commit 71f27f768839394ec226c37a763bd524d8589f07.

Reverted https://github.com/pytorch/pytorch/pull/90184 on behalf of https://github.com/ngimel due to catastrophically regresses performance
---
 torch/_inductor/ir.py       |   8 +--
 torch/_inductor/sizevars.py | 102 +++++++++++++-----------------------
 2 files changed, 37 insertions(+), 73 deletions(-)

diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 7e770d4531302..253f217320e2e 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -2425,12 +2425,8 @@ def convert_to_reinterpret_view(cls, x):
         index = V.graph.sizevars.simplify_with_ranges(
             list(rw.reads)[0].index, rw.var_ranges
         )
-        strides, offset = V.graph.sizevars.maybe_stride_and_offset_vars(
-            index, rw.range_vars
-        )
-        if offset is None or any(s is None for s in strides):
-            raise NotImplementedError()
-
+        strides = V.graph.sizevars.stride_vars(index, rw.range_vars)
+        offset = V.graph.sizevars.offset_var(index, rw.range_vars)
         expected = sympy_dot(rw.range_vars, strides) + offset
 
         if index != expected:
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index 47267c00e62fa..7895f0dccdcba 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -2,7 +2,7 @@
 import functools
 import itertools
 import logging
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Tuple
 
 import sympy
 from sympy import Expr
@@ -11,7 +11,7 @@
 
 from . import ir
 from .codegen.common import IndentedBuffer
-from .utils import sympy_dot, sympy_subs, sympy_symbol, VarRanges
+from .utils import sympy_subs, sympy_symbol, VarRanges
 from .virtualized import V
 
 log = logging.getLogger(__name__)
@@ -47,9 +47,7 @@ def __init__(self, shape_env=None):
         self.guards = []
         self.replacements: Dict[sympy.Symbol, Expr] = self.shape_env.replacements
         self.need_seed = False
-        self.maybe_stride_and_offset_vars = (
-            self.make_maybe_stride_and_offset_vars_cache()
-        )
+        self.stride_vars = self.make_stride_vars_cache()
         self.simplify_with_ranges = self.make_simplify_with_ranges_cache()
         self._simplify_loops = self.make_simplify_loops_cache()
         self.declare = ""
@@ -193,7 +191,7 @@ def _simplify_loops_impl(self, index_vars, sizes, index_formulas):
         """
         sizes = list(map(self.simplify, sizes))
 
-        strides = [self.maybe_stride_vars(x, index_vars) for x in index_formulas]
+        strides = [self.stride_vars(x, index_vars) for x in index_formulas]
         assert len(sizes) == len(strides[0]), (len(sizes), len(strides[0]))
 
         for i in range(len(sizes)):
@@ -203,9 +201,6 @@ def _simplify_loops_impl(self, index_vars, sizes, index_formulas):
 
         def can_merge_dims(a, b):
             for k in range(len(strides)):
-                if strides[k][a] is None or strides[k][b] is None:
-                    return False
-
                 if self.simplify(strides[k][a] * sizes[a]) == self.simplify(
                     strides[k][b]
                 ):
@@ -389,79 +384,52 @@ def wrapper(*args, **kwargs):
 
         return wrapper
 
-    def make_maybe_stride_and_offset_vars_cache(self):
-        cache = self._lru_cache(self._maybe_stride_and_offset_vars)
+    def make_stride_vars_cache(self):
+        cache = self._lru_cache(self._stride_vars)
 
-        def maybe_stride_and_offset_vars(
-            index: Expr, vars: List[sympy.Symbol]
-        ) -> List[Expr]:
+        def stride_vars(index: Expr, vars: List[sympy.Symbol]) -> List[Expr]:
             return cache(index, tuple(vars))
 
-        return maybe_stride_and_offset_vars
-
-    def _maybe_stride_and_offset_vars(
-        self, index: sympy.Expr, vars: List[sympy.Symbol]
-    ) -> Tuple[List[Optional[sympy.Expr]], Optional[sympy.Expr]]:
-        """Convert an indexing expression back into strides and offset"""
-        index = self.simplify(index)
-
-        # TODO: vars aren't always symbols
-        assert all(isinstance(v, sympy.Symbol) or v == 0 for v in vars)
-        var_symbols = [v for v in vars if isinstance(v, sympy.Symbol)]
-
-        stride_symbols = [sympy.Wild(f"stride{i}") for i in range(len(var_symbols))]
-        var_to_stride = {v: s for v, s in zip(var_symbols, stride_symbols)}
-        offset_symbol = sympy.Wild("offset")
-        index_pattern = offset_symbol + sympy_dot(var_symbols, stride_symbols)
-
-        match = index.match(index_pattern)
-        if match is None:
-            # Index calculation is not strided
-            return [None] * len(vars), None
+        return stride_vars
 
+    def _stride_vars(self, index: Expr, vars: List[sympy.Symbol]) -> List[Expr]:
+        """Convert an indexing expression back into strides"""
         strides = []
-        for v in vars:
-            if v not in var_to_stride:
-                stride = 0 if v == 0 else None
+        index = self.simplify(index)
+        # remove any offset
+        index = index - sympy_subs(index, {v: sympy.Integer(0) for v in vars if v != 0})
+        for i in range(len(vars)):
+            # drop all the other dims
+            index_dim = sympy_subs(
+                index,
+                {
+                    vars[j]: sympy.Integer(0)
+                    for j in range(len(vars))
+                    if i != j and vars[j] != 0
+                },
+            )
+            v = vars[i]
+            if v == 0:
+                strides.append(sympy.Integer(0))
             else:
-                stride = match.get(var_to_stride[v], None)
-            strides.append(stride)
-
-        offset = match[offset_symbol]
-
-        # If any vars appear in the offset terms, they are not strided
-        vars_set = set(vars)
-        if vars_set & offset.free_symbols:
-            for i, v in enumerate(vars):
-                if v in offset.free_symbols:
-                    strides[i] = None
-
-            offset = None
-
-        return strides, offset
-
-    def maybe_stride_vars(
-        self, index: Expr, vars: List[sympy.Symbol]
-    ) -> List[Optional[Expr]]:
-        """Convert an indexing expression back into strides"""
-        strides, offset = self.maybe_stride_and_offset_vars(index, vars)
+                # TODO(jansel): should we use sympy.diff here?
+                strides.append(
+                    sympy_subs(index_dim, {v: sympy.Integer(1)})
+                    - sympy_subs(index_dim, {v: sympy.Integer(0)})
+                )
         return strides
 
-    def maybe_offset_var(self, index: Expr, vars: List[sympy.Symbol]) -> Optional[Expr]:
+    def offset_var(self, index: Expr, vars: List[sympy.Symbol]) -> Expr:
         """Extract offset part of an indexing expression"""
-        strides, offset = self.maybe_stride_and_offset_vars(index, vars)
-        return offset
+        index = self.simplify(index)
+        return sympy_subs(index, {v: sympy.Integer(0) for v in vars if v != 0})
 
     def stride_hints(self, index: Expr, vars: List[sympy.Symbol]) -> List[int]:
         for v in index.free_symbols:
             if v.name.startswith("indirect"):
                 index = sympy_subs(index, {v: 0})
         result = []
-        for s in self.maybe_stride_vars(index, vars):
-            if s is None:
-                result.append(0)
-                continue
-
+        for s in self.stride_vars(index, vars):
             try:
                 result.append(self.size_hint(s))
             except TypeError:

From fa33af881d8f09d3a5ae3edc2d3f5fe33e0ef572 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 8 Dec 2022 05:16:56 +0000
Subject: [PATCH 1710/1922] Revert "add save and load stats in memory_tracker
 (#90144)"

This reverts commit 1f137c1e2f738d9021b5e22fb6e52d41b780a1a8.

Reverted https://github.com/pytorch/pytorch/pull/90144 on behalf of https://github.com/ezyang due to dirty git working copy broke master
---
 .../distributed/_tools/test_memory_tracker.py |  2 -
 torch/distributed/_tools/memory_tracker.py    | 42 ++++---------------
 2 files changed, 9 insertions(+), 35 deletions(-)

diff --git a/test/distributed/_tools/test_memory_tracker.py b/test/distributed/_tools/test_memory_tracker.py
index 12ec782ec07da..2e19ef6bf7294 100644
--- a/test/distributed/_tools/test_memory_tracker.py
+++ b/test/distributed/_tools/test_memory_tracker.py
@@ -52,8 +52,6 @@ def test_local_model(self):
 
         self.assertTrue(len(tracker._hooks) == 0)
 
-        tracker.save_stats("memory.trace")
-        tracker.load("memory.trace")
         tracker.summary()
 
         self.assertTrue(tracker._op_index > 0)
diff --git a/torch/distributed/_tools/memory_tracker.py b/torch/distributed/_tools/memory_tracker.py
index 0c34dcb3bfb46..477c59021bd48 100644
--- a/torch/distributed/_tools/memory_tracker.py
+++ b/torch/distributed/_tools/memory_tracker.py
@@ -1,7 +1,5 @@
 from collections import defaultdict
 
-import pickle
-
 from typing import (
     Any,
     Callable,
@@ -76,9 +74,15 @@ def __init__(self) -> None:
         torch._C._log_api_usage_once("torch.distributed.memory_tracker")
         self._hooks: List[RemovableHandle] = []
         self._operator_names: Dict[str, int] = defaultdict(int)
-        self.memories_allocated: Dict[int, Dict[str, float]] = defaultdict()
-        self.memories_active: Dict[int, Dict[str, float]] = defaultdict()
-        self.memories_reserved: Dict[int, Dict[str, float]] = defaultdict()
+        self.memories_allocated: Dict[int, Dict[str, float]] = defaultdict(
+            lambda: defaultdict(float)
+        )
+        self.memories_active: Dict[int, Dict[str, float]] = defaultdict(
+            lambda: defaultdict(float)
+        )
+        self.memories_reserved: Dict[int, Dict[str, float]] = defaultdict(
+            lambda: defaultdict(float)
+        )
         self._markers: Dict[str, int] = defaultdict(int)
         self._cur_module_name: str = ""
         self._op_index: int = 0
@@ -176,34 +180,6 @@ def show_traces(self) -> None:
                 )
         plt.legend()
 
-    def save_stats(self, path: str) -> None:
-        """
-        Save the stats using pickle during runtime if users want to plot the traces
-        in other places like notebook.
-        """
-        stats = {
-            "memories_allocated": self.memories_allocated,
-            "memories_active": self.memories_active,
-            "memories_reserved": self.memories_reserved,
-            "markers": self._markers,
-        }
-
-        with open(path, "wb") as f:
-            pickle.dump(stats, f)
-
-    def load(self, path: str) -> None:
-        """
-        Load the pickled memory stats to plot the traces or print the summary.
-        """
-
-        with open(path, "rb") as f:
-            stats = pickle.load(f)
-
-        self.memories_allocated = stats["memories_allocated"]
-        self.memories_active = stats["memories_active"]
-        self.memories_reserved = stats["memories_reserved"]
-        self._markers = stats["markers"]
-
     def _create_pre_forward_hook(self, name: str) -> Callable:
         """
         The pre_foward_hook is to insert current module name with forward prefix for the operator

From 4241615dc4af03d6d0a5fe4c706df5ab9c964197 Mon Sep 17 00:00:00 2001
From: titaiwang <titaiwang@microsoft.com>
Date: Wed, 7 Dec 2022 21:23:41 +0000
Subject: [PATCH 1711/1922] [ONNX] Fix ignored small eps in layer normalization
 in fp16 (#89869)

Prior to this change, the symbolic_fn `layer_norm` (before ONNX version 17) always lose precision when eps is smaller than Float type, while PyTorch always take eps as Double. This PR adds `onnx::Cast` into eps related operations to prevent losing precision during the calculation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89869
Approved by: https://github.com/BowenBao
---
 test/onnx/test_pytorch_onnx_no_runtime.py | 26 ++++++++++++++++++
 torch/onnx/symbolic_opset9.py             | 33 ++++++++++++++++++++---
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_no_runtime.py b/test/onnx/test_pytorch_onnx_no_runtime.py
index aca26e0cb2b42..c741ddd2c41ed 100644
--- a/test/onnx/test_pytorch_onnx_no_runtime.py
+++ b/test/onnx/test_pytorch_onnx_no_runtime.py
@@ -1152,6 +1152,32 @@ def test_lower_graph_conv3d(self):
         data = torch.from_numpy(data_numpy).to(dtype=torch.float)
         self._test_lower_graph_impl(model, data)
 
+    @pytorch_test_common.skipIfNoCuda
+    def test_composed_layer_norm_small_eps_fp16_keep_double(self):
+        class Net(torch.nn.Module):
+            def __init__(self, C):
+                super().__init__()
+                self.layer_norm = torch.nn.LayerNorm(C, eps=1e-8)
+
+            def forward(self, x):
+                return self.layer_norm(x)
+
+        N, C = 8, 4
+        model = Net(C).cuda().half()
+        x = torch.randn(N, C).cuda().half()
+        f = io.BytesIO()
+        torch.onnx.export(model, x, f, opset_version=14)
+        onnx_model = onnx.load_from_string(f.getvalue())
+        const_node = [n for n in onnx_model.graph.node if n.op_type == "Constant"]
+        self.assertNotEqual(len(const_node), 0)
+        double_type_count = 0
+        for node in const_node:
+            for a in node.attribute:
+                # EPS constant should be in double type
+                if a.name == "value" and a.t.data_type == 11:
+                    double_type_count += 1
+        self.assertNotEqual(double_type_count, 0)
+
 
 if __name__ == "__main__":
     common_utils.run_tests()
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index e8fd99e5fc3dc..78928d6537d33 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -2728,19 +2728,46 @@ def native_layer_norm(
 
     mean = g.op("ReduceMean", input, axes_i=axes)
     numerator = sub(g, input, mean)
+
+    # Cast it to eps dtype to avoid precision loss
+    is_type_half = (
+        _type_utils.JitScalarType.from_value(numerator)
+        == _type_utils.JitScalarType.HALF
+    )
+    if is_type_half:
+        eps_dtype = _type_utils.JitScalarType.from_value(eps_cst)
+        numerator = g.op(
+            "Cast", numerator, to_i=_type_utils.JitScalarType(eps_dtype).onnx_type()
+        )
+
     # variance = e((x - e(x))^2), and (x - e(x)) is the numerator in the layer_norm formula
     variance = g.op("ReduceMean", pow(g, numerator, two_cst), axes_i=axes)
-    denominator = sqrt(g, add(g, variance, eps_cst))
-
+    denominator = sqrt(g, g.op("Add", variance, eps_cst))
     normalized = g.op("Div", numerator, denominator)
 
+    # Cast back to input type as eps related ops are all done
+    if is_type_half:
+        input_dtype = _type_utils.JitScalarType.from_value(input)
+        normalized = g.op(
+            "Cast", normalized, to_i=_type_utils.JitScalarType(input_dtype).onnx_type()
+        )
+
     if not (weight is None or symbolic_helper._is_none(weight)):
         normalized = mul(g, normalized, weight)
     if not (bias is None or symbolic_helper._is_none(bias)):
         normalized = add(g, normalized, bias)
 
     # rdenominator := 1 / sqrt(variance + eps)
-    rdenominator = reciprocal(g, denominator)
+    # According to aten::native_layer_norm, rdenominator should have the same dtype as input,
+    # mean and normalized, so we need to Cast it back
+    if is_type_half:
+        denominator = g.op(
+            "Cast", denominator, to_i=_type_utils.JitScalarType(input_dtype).onnx_type()
+        )
+        rdenominator = g.op("Reciprocal", denominator)
+    else:
+        rdenominator = reciprocal(g, denominator)
+
     return normalized, mean, rdenominator
 
 
From 9f4c6bf4d2472452c0293b88b657f9c63fe38404 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Thu, 8 Dec 2022 06:27:08 +0000
Subject: [PATCH 1712/1922] Fix issue 38095 TODO in test_multiprocessing.py
 (#90335)

Fix TODO related to https://github.com/pytorch/pytorch/issues/38095
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90335
Approved by: https://github.com/clee2000
---
 test/test_multiprocessing.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py
index ae0d87be216a1..dba249e7de7d0 100644
--- a/test/test_multiprocessing.py
+++ b/test/test_multiprocessing.py
@@ -418,8 +418,7 @@ def test_cuda_memory_allocation(self):
         t = []
         for _ in range(5):
             t.append(q.get())
-        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-        self.assertEqualIgnoreType(t[0], torch.full([5], 0.))
+        self.assertEqual(t[0], torch.full([5], 0.))
         del t
         e.set()
         p.join(1)
@@ -642,7 +641,7 @@ def _test_event_handle_importer_consumer(handle, p2c, c2p):
         c2p.put(0)  # notify parent child is ready
         p2c.get()  # wait for record in parent
         e1.synchronize()
-        c2p.put(1)  # nofity synchronization is done in child
+        c2p.put(1)  # notify synchronization is done in child
         p2c.get()  # wait for parent to finish before destructing child event
 
     @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \

From 7ce90d1a03c50425090d5de74e529e061082ff9a Mon Sep 17 00:00:00 2001
From: fduwjj <fduwjj@fb.com>
Date: Thu, 8 Dec 2022 00:03:23 +0000
Subject: [PATCH 1713/1922] [PT-D][Easy] Reformat the optim code within PTD
 code base (#90399)

Just run two commands:
```
ufmt format torch/distributed/optim/
ufmt format test/distributed/optim/
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90399
Approved by: https://github.com/awgu
---
 .../optim/test_apply_optimizer_in_backward.py |   6 +-
 .../optim/test_zero_redundancy_optimizer.py   | 352 +++++++++++-------
 .../optim/apply_optimizer_in_backward.py      |   9 +-
 .../distributed/optim/functional_adadelta.py  |  51 +--
 torch/distributed/optim/functional_adagrad.py |  37 +-
 torch/distributed/optim/functional_adam.py    | 125 ++++---
 torch/distributed/optim/functional_adamax.py  |  51 +--
 torch/distributed/optim/functional_adamw.py   | 113 +++---
 torch/distributed/optim/functional_rmsprop.py |  67 ++--
 torch/distributed/optim/functional_rprop.py   |  45 ++-
 torch/distributed/optim/functional_sgd.py     |  64 ++--
 torch/distributed/optim/named_optimizer.py    |   2 +-
 torch/distributed/optim/optimizer.py          |  46 +--
 .../optim/post_localSGD_optimizer.py          |  21 +-
 torch/distributed/optim/utils.py              |  14 +-
 .../optim/zero_redundancy_optimizer.py        | 345 +++++++++++------
 .../optim/zero_redundancy_optimizer.pyi       |  35 +-
 17 files changed, 814 insertions(+), 569 deletions(-)

diff --git a/test/distributed/optim/test_apply_optimizer_in_backward.py b/test/distributed/optim/test_apply_optimizer_in_backward.py
index 344d8c81a18cd..ebf4c4d4e9c82 100644
--- a/test/distributed/optim/test_apply_optimizer_in_backward.py
+++ b/test/distributed/optim/test_apply_optimizer_in_backward.py
@@ -17,6 +17,7 @@
 
 # TODO (rohan-varma): Add FSDP & DDP tests once supported
 
+
 def _validate_params(params_list, fn):
     ref_params = params_list[0]
     for param_list in params_list[1:]:
@@ -25,7 +26,6 @@ def _validate_params(params_list, fn):
 
 
 class ApplyOverlappedOptimizerTest(unittest.TestCase):
-
     def _run_training_loop_and_validate(self, inp, models, optimizers):
         for i in range(6):
             for model in models:
@@ -60,13 +60,13 @@ def _test_apply_optimizer_in_backward(self, share_params) -> None:
         _apply_optimizer_in_backward(
             torch.optim.SGD,
             [m.weight for m in model_with_opt_in_bwd],
-            optimizer_kwargs=weight_optimizer_kwargs
+            optimizer_kwargs=weight_optimizer_kwargs,
         )
 
         _apply_optimizer_in_backward(
             torch.optim.SGD,
             [m.bias for m in model_with_opt_in_bwd],
-            optimizer_kwargs=bias_optimizer_kwargs
+            optimizer_kwargs=bias_optimizer_kwargs,
         )
 
         _validate_params(
diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py
index 2a3224122a640..3e0474c3a4494 100644
--- a/test/distributed/optim/test_zero_redundancy_optimizer.py
+++ b/test/distributed/optim/test_zero_redundancy_optimizer.py
@@ -10,7 +10,7 @@
 import sys
 import unittest
 from contextlib import suppress
-from typing import Any, List, cast
+from typing import Any, cast, List
 
 import numpy as np
 
@@ -24,26 +24,25 @@
     hook_with_zero_step,
     hook_with_zero_step_interleaved,
 )
-from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import (
-    allreduce_hook,
-)
+from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import allreduce_hook
 from torch.distributed.algorithms.join import Join, Joinable, JoinHook
 from torch.distributed.optim import ZeroRedundancyOptimizer
 from torch.distributed.optim.zero_redundancy_optimizer import _broadcast_object
 from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.optim import SGD, AdamW
+from torch.optim import AdamW, SGD
 from torch.testing._internal import common_distributed
 from torch.testing._internal.common_utils import (
-    IS_WINDOWS,
-    TEST_WITH_ASAN,
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
+    IS_WINDOWS,
     parametrize,
     run_tests,
+    TEST_WITH_ASAN,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 try:
     import torchvision
+
     HAS_TORCHVISION = True
 except ImportError:
     HAS_TORCHVISION = False
@@ -51,17 +50,18 @@
 # Use GLOO on GPU when running CUDA + Windows
 def _get_backend_for_tests():
     return (
-        dist.Backend.NCCL if not IS_WINDOWS and torch.cuda.is_available()
+        dist.Backend.NCCL
+        if not IS_WINDOWS and torch.cuda.is_available()
         # Windows only has GLOO, but GLOO GPU works. And use GLOO CPU when
         # no GPUs are available.
         else dist.Backend.GLOO
     )
 
+
 BACKEND = _get_backend_for_tests()
 
-@unittest.skipIf(
-    TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN does not work."
-)
+
+@unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN does not work.")
 class TestZeroRedundancyOptimizer(common_distributed.MultiProcessTestCase):
     def setUp(self):
         super(TestZeroRedundancyOptimizer, self).setUp()
@@ -70,8 +70,9 @@ def setUp(self):
 
     @property
     def device(self):
-        return torch.device("cuda") if torch.cuda.is_available() \
-            else torch.device("cpu")
+        return (
+            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        )
 
     @property
     def world_size(self):
@@ -88,18 +89,19 @@ def tearDown(self):
             pass
 
     def dist_init(self, rank, world_size=-1, backend=BACKEND):
-        if (world_size < 1):
+        if world_size < 1:
             world_size = self.world_size
         store = dist.FileStore(self.file_name, world_size)
         return dist.init_process_group(
-            backend=backend, store=store, rank=rank, world_size=world_size,
+            backend=backend,
+            store=store,
+            rank=rank,
+            world_size=world_size,
         )
 
 
 # TODO: sandcastle_skip_if does not work here.
-@unittest.skipIf(
-    TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN does not work."
-)
+@unittest.skipIf(TEST_WITH_ASAN or TEST_WITH_DEV_DBG_ASAN, "CUDA + ASAN does not work.")
 class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer):
     def test_state_dict(self):
         """Check that ZeroRedundancyOptimizer exposes the expected state dict
@@ -111,7 +113,10 @@ def test_state_dict(self):
         RECIPIENT_RANK = 0  # rank 0 is the only rank since the world size is 1
         x = torch.tensor([1.0], device=self.device, requires_grad=True)
         o = ZeroRedundancyOptimizer(
-            [x], optimizer_class=SGD, lr=LR1, momentum=MOMENTUM,
+            [x],
+            optimizer_class=SGD,
+            lr=LR1,
+            momentum=MOMENTUM,
         )
         x.backward()
         o.step()
@@ -202,7 +207,9 @@ def step(self, closure=None, kwarg=None):
         kwarg: List[Any] = []
         x = torch.tensor([1.0], device=self.device, requires_grad=True)
         o = ZeroRedundancyOptimizer(
-            [x], optimizer_class=SGDWithStepKWArg, lr=LR,
+            [x],
+            optimizer_class=SGDWithStepKWArg,
+            lr=LR,
         )
         x.backward()
         o.step(0, kwarg=kwarg)
@@ -241,7 +248,9 @@ def step(self):
 
         x = torch.tensor([1.0], device=self.device, requires_grad=True)
         o = ZeroRedundancyOptimizer(
-            [x], optimizer_class=SGDWithoutClosure, lr=LR,
+            [x],
+            optimizer_class=SGDWithoutClosure,
+            lr=LR,
         )
         x.backward()
         o.step()
@@ -274,22 +283,30 @@ def test_constructor(self):
         )
         # Test various constructor inputs in the form: (input, expected error)
         ctor_inputs = [
-            ([], ValueError),                          # empty parameter list
-            (torch.randn(1), TypeError),               # non-iterable: `torch.Tensor`
-            (1.2, TypeError),                          # non-iterable: `float`
-            ([
-                {"params": [l.weight for l in m]},
-                {"params": [l.bias for l in m]},
-            ], None),                                  # iterable of dict
-            (list(m.parameters()) + [42], TypeError),  # iterable containing invalid type
-            (m.parameters(), None),                    # `params` as a generator
-            (list(m.parameters()), None)               # `params` as a list
+            ([], ValueError),  # empty parameter list
+            (torch.randn(1), TypeError),  # non-iterable: `torch.Tensor`
+            (1.2, TypeError),  # non-iterable: `float`
+            (
+                [
+                    {"params": [l.weight for l in m]},
+                    {"params": [l.bias for l in m]},
+                ],
+                None,
+            ),  # iterable of dict
+            (
+                list(m.parameters()) + [42],
+                TypeError,
+            ),  # iterable containing invalid type
+            (m.parameters(), None),  # `params` as a generator
+            (list(m.parameters()), None),  # `params` as a list
         ]
         for ctor_input, error in ctor_inputs:
             context = self.assertRaises(error) if error else suppress()
             with context:
                 ZeroRedundancyOptimizer(
-                    ctor_input, optimizer_class=SGD, lr=LR,
+                    ctor_input,
+                    optimizer_class=SGD,
+                    lr=LR,
                 )
 
         # Test constructing with multiple parameter groups more thoroughly
@@ -297,18 +314,23 @@ def test_constructor(self):
         BETAS = (0.9, 0.999)
         EPS = 1e-8
         params = [
-            {"params": [l.weight for l in m], "weight_decay": 0.},
+            {"params": [l.weight for l in m], "weight_decay": 0.0},
             {"params": [l.bias for l in m], "weight_decay": WD},
         ]
         o = ZeroRedundancyOptimizer(
-            params, optimizer_class=AdamW,
-            lr=LR, betas=BETAS, eps=EPS,
+            params,
+            optimizer_class=AdamW,
+            lr=LR,
+            betas=BETAS,
+            eps=EPS,
         )
-        assert len(o.param_groups) == 2, \
-            f"Expected 2 ZeRO param groups, but got {len(o.param_groups)}"
-        assert len(o.optim.param_groups) == 2, \
-            "Expected 2 local optimizer param groups, but got " \
+        assert (
+            len(o.param_groups) == 2
+        ), f"Expected 2 ZeRO param groups, but got {len(o.param_groups)}"
+        assert len(o.optim.param_groups) == 2, (
+            "Expected 2 local optimizer param groups, but got "
             f"{len(o.optim.param_groups)}"
+        )
 
     def test_same_dense_param_type(self):
         """Check that ZeroRedundancyOptimizer raises an exception if the input
@@ -322,8 +344,11 @@ def test_same_dense_param_type(self):
         inputs = [
             [torch.sparse_coo_tensor(size=(2, 3))],
             [torch.FloatTensor(1), torch.DoubleTensor(1)],
-            [torch.FloatTensor(1), torch.FloatTensor(1),
-                torch.sparse_coo_tensor(size=(2, 3))]
+            [
+                torch.FloatTensor(1),
+                torch.FloatTensor(1),
+                torch.sparse_coo_tensor(size=(2, 3)),
+            ],
         ]
         for input in inputs:
             with self.assertRaises(ValueError):
@@ -333,8 +358,11 @@ def test_same_dense_param_type(self):
 class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
     @property
     def device(self):
-        return torch.device(self.rank) if torch.cuda.is_available() \
+        return (
+            torch.device(self.rank)
+            if torch.cuda.is_available()
             else torch.device("cpu")
+        )
 
     @property
     def world_size(self):
@@ -342,8 +370,11 @@ def world_size(self):
 
     @property
     def context(self):
-        return suppress() if not torch.cuda.is_available() \
+        return (
+            suppress()
+            if not torch.cuda.is_available()
             else torch.cuda.device(self.rank)
+        )
 
     def _check_same_model_params(
         self,
@@ -354,13 +385,17 @@ def _check_same_model_params(
         # Check that model parameters match
         for p_a, p_b in zip(model_a.parameters(), model_b.parameters()):
             torch.testing.assert_close(
-                p_a, p_b, atol=1e-3, rtol=1e-5,
+                p_a,
+                p_b,
+                atol=1e-3,
+                rtol=1e-5,
                 msg=f"Model parameters differ:\n{p_a} {p_b}\n" + message,
             )
         # Check that model buffers match
         for b_a, b_b in zip(model_a.buffers(), model_b.buffers()):
             torch.testing.assert_close(
-                b_a, b_b,
+                b_a,
+                b_b,
                 msg=f"Model buffers differ:\n{b_a} {b_b}\n" + message,
             )
 
@@ -382,7 +417,9 @@ def test_step(self):
 
             o = SGD(m.parameters(), lr=LR)
             o_zero = ZeroRedundancyOptimizer(
-                m_zero.parameters(), optimizer_class=SGD, lr=LR,
+                m_zero.parameters(),
+                optimizer_class=SGD,
+                lr=LR,
             )
 
             y = m(x)
@@ -530,11 +567,7 @@ def all_trainable():
             # all partitions have the same elements
             self.assertEqual(len(o.param_groups), 2)
             self.assertEqual(
-                sum([
-                    x.numel()
-                    for g in o.optim.param_groups
-                    for x in g["params"]
-                ]),
+                sum([x.numel() for g in o.optim.param_groups for x in g["params"]]),
                 sum(sizes),
             )
             self.assertEqual(len(o.optim.param_groups), 2)
@@ -581,36 +614,39 @@ def test_multiple_param_groups(self):
         model2 = model2.to(self.device)
         model3 = model3.to(self.device)
         inputs = [
-            torch.randn(BATCH_SIZE, INPUT_DIM).to(self.device)
-            for _ in range(NUM_ITERS)
+            torch.randn(BATCH_SIZE, INPUT_DIM).to(self.device) for _ in range(NUM_ITERS)
         ]
         # Construct `optim1` with both parameter groups upfront
         optim1 = ZeroRedundancyOptimizer(
             [
-                {"params": [l.weight for l in model1], "weight_decay": 0.},
+                {"params": [l.weight for l in model1], "weight_decay": 0.0},
                 {"params": [l.bias for l in model1], "weight_decay": WD},
             ],
-            optimizer_class=AdamW, lr=LR,
+            optimizer_class=AdamW,
+            lr=LR,
         )
         # Construct `optim2` by adding the second parameter after
         optim2 = ZeroRedundancyOptimizer(
             [l.weight for l in model2],
-            optimizer_class=AdamW, lr=LR, weight_decay=0.,
-        )
-        optim2.add_param_group(
-            {"params": [l.bias for l in model2], "weight_decay": WD}
+            optimizer_class=AdamW,
+            lr=LR,
+            weight_decay=0.0,
         )
+        optim2.add_param_group({"params": [l.bias for l in model2], "weight_decay": WD})
         # Construct `optim3` as a non-sharded optimizer
         optim3 = AdamW(
             [
-                {"params": [l.weight for l in model3], "weight_decay": 0.},
+                {"params": [l.weight for l in model3], "weight_decay": 0.0},
                 {"params": [l.bias for l in model3], "weight_decay": WD},
-            ], lr=LR,
+            ],
+            lr=LR,
         )
         # Check parity over a few iterations
         for input in inputs:
             for model, optim in (
-                (model1, optim1), (model2, optim2), (model3, optim3),
+                (model1, optim1),
+                (model2, optim2),
+                (model3, optim3),
             ):
                 optim.zero_grad()
                 out = model(input)
@@ -695,8 +731,7 @@ def test_nondefault_process_group(self):
         self.dist_init(self.rank, self.world_size, BACKEND)
         # Use GPU if enough are available, or fall back to CPU otherwise, which
         # is fine since Gloo backend supports both
-        if torch.cuda.is_available() and \
-                torch.cuda.device_count() >= self.world_size:
+        if torch.cuda.is_available() and torch.cuda.device_count() >= self.world_size:
             device = torch.device(self.rank)
         else:
             device = torch.device("cpu")
@@ -704,7 +739,8 @@ def test_nondefault_process_group(self):
         # the case where the global and local ranks do not necessarily match
         subgroup_ranks = [r for r in range(self.world_size) if r % 2 == 0]
         process_group = dist.new_group(
-            ranks=subgroup_ranks, backend=BACKEND,
+            ranks=subgroup_ranks,
+            backend=BACKEND,
         )
         # Ranks not participating in the new process group are no longer needed
         if self.rank not in subgroup_ranks:
@@ -719,8 +755,9 @@ def test_nondefault_process_group(self):
         LR = 1e-3
         MOMENTUM = 0.99
         REFERENCE_RANK = 0
-        assert REFERENCE_RANK in subgroup_ranks, \
-            "Reference rank must be in the new process group"
+        assert (
+            REFERENCE_RANK in subgroup_ranks
+        ), "Reference rank must be in the new process group"
         loss_fn = torch.nn.L1Loss().to(device)
 
         def check(optimizer):
@@ -742,11 +779,15 @@ def closure():
                 # Check that the parameters match across ranks after a step
                 for pg in optimizer.param_groups:
                     for p in pg["params"]:
-                        receptacle = [
-                            p.clone() for _ in subgroup_ranks
-                        ] if self.rank == REFERENCE_RANK else []
+                        receptacle = (
+                            [p.clone() for _ in subgroup_ranks]
+                            if self.rank == REFERENCE_RANK
+                            else []
+                        )
                         dist.gather(
-                            p, receptacle, dst=REFERENCE_RANK,
+                            p,
+                            receptacle,
+                            dst=REFERENCE_RANK,
                             group=process_group,
                         )
                         if self.rank == REFERENCE_RANK:
@@ -814,31 +855,41 @@ def test_local_optimizer_parity(
                 torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM),
             ).to(self.device)
             model.register_buffer(
-                "test_buffer", torch.ones((1), device=self.device) * self.rank,
+                "test_buffer",
+                torch.ones((1), device=self.device) * self.rank,
             )
             # Define models/optimizers for DDP with ZeRO and DDP with local
             # optimizer
             defaults = {"maximize": True} if maximize else {}
             sharded_optimizer = ZeroRedundancyOptimizer(
-                params=model.parameters(), optimizer_class=optimizer_class,
-                lr=LR, **defaults,
+                params=model.parameters(),
+                optimizer_class=optimizer_class,
+                lr=LR,
+                **defaults,
             )
             sharded_ddp_model = DDP(
-                module=model, device_ids=[self.rank],
-                broadcast_buffers=True, find_unused_parameters=True,
+                module=model,
+                device_ids=[self.rank],
+                broadcast_buffers=True,
+                find_unused_parameters=True,
             )
             local_model = copy.deepcopy(model).to(self.device)
             ddp_optimizer = optimizer_class(
-                local_model.parameters(), lr=LR, **defaults,
+                local_model.parameters(),
+                lr=LR,
+                **defaults,
             )
             ddp_model = DDP(
-                local_model, device_ids=[self.rank],
-                broadcast_buffers=True, find_unused_parameters=True,
+                local_model,
+                device_ids=[self.rank],
+                broadcast_buffers=True,
+                find_unused_parameters=True,
             )
             # Check that the model is properly synchronized between ranks
             # at construction time
             self._check_same_model_params(
-                sharded_ddp_model, ddp_model,
+                sharded_ddp_model,
+                ddp_model,
                 "Models differ from the start",
             )
 
@@ -858,18 +909,21 @@ def closure_sharded(input_tensor=input_tensor):
                     return sharded_loss
 
                 loss_ddp = cast(
-                    torch.Tensor, ddp_optimizer.step(closure=closure_ddp),
+                    torch.Tensor,
+                    ddp_optimizer.step(closure=closure_ddp),
                 )
                 loss_sharded_optim = cast(
                     torch.Tensor,
                     sharded_optimizer.step(closure=closure_sharded),
                 )
                 torch.testing.assert_close(
-                    loss_ddp, loss_sharded_optim,
+                    loss_ddp,
+                    loss_sharded_optim,
                     msg="Losses differ between local optimizer and ZeRO",
                 )
                 self._check_same_model_params(
-                    sharded_ddp_model, ddp_model,
+                    sharded_ddp_model,
+                    ddp_model,
                     "Models differ after a step",
                 )
 
@@ -889,11 +943,11 @@ def closure_sharded(input_tensor=input_tensor):
             ddp_state_dict = ddp_optimizer.state_dict()
             sharded_optimizer.consolidate_state_dict(to=REFERENCE_RANK)
             sharded_optim_state_dict = [
-                sharded_optimizer.state_dict()
-                if self.rank == REFERENCE_RANK else {}
+                sharded_optimizer.state_dict() if self.rank == REFERENCE_RANK else {}
             ]
             dist.broadcast_object_list(
-                sharded_optim_state_dict, src=REFERENCE_RANK,
+                sharded_optim_state_dict,
+                src=REFERENCE_RANK,
                 group=dist.group.WORLD,
             )
             sharded_optim_state_dict = sharded_optim_state_dict[0]
@@ -941,14 +995,14 @@ def _test_zero_join(self, device):
         zero_model = copy.deepcopy(model)
         zero_model.to(device)
         zero_optim = ZeroRedundancyOptimizer(
-            zero_model.parameters(), torch.optim.Adam, lr=LR,
+            zero_model.parameters(),
+            torch.optim.Adam,
+            lr=LR,
         )
         loss_fn = torch.nn.MSELoss()
 
         # Use uneven inputs: rank i has i extra inputs
-        inputs = [
-            torch.randn(20, 2).to(device) for _ in range(NUM_INPUTS + rank)
-        ]
+        inputs = [torch.randn(20, 2).to(device) for _ in range(NUM_INPUTS + rank)]
         labels = torch.randn(20, 3).to(device)
 
         # Save the gradients and parameters from DDP as the ground truth; do
@@ -976,7 +1030,9 @@ def _test_zero_join(self, device):
         # ranks (which joined early)
         grads_and_params = [grads_at_each_iter, params_at_each_iter]
         grads_and_params = _broadcast_object(
-            grads_and_params, src_rank=world_size - 1, group=dist.group.WORLD,
+            grads_and_params,
+            src_rank=world_size - 1,
+            group=dist.group.WORLD,
             device=device,
         )
         grads_at_each_iter = grads_and_params[0]
@@ -987,7 +1043,7 @@ def _test_zero_join(self, device):
 
         # A process must still set the remaining gradients after joining, so we
         # define a join hook to do this before the ZeRO join hook
-        class _JoinGradInfo():
+        class _JoinGradInfo:
             def __init__(self, grads):
                 self.grads = grads  # remaining gradients to set (in order)
                 self.index = 0
@@ -1029,7 +1085,9 @@ def join_process_group(self):
         gradient_setter = _GradientSetter()
         iter = 0
         with Join(
-            [gradient_setter, zero_optim], zero_optim=zero_optim, grads=grads,
+            [gradient_setter, zero_optim],
+            zero_optim=zero_optim,
+            grads=grads,
         ):
             for _ in range(NUM_EPOCHS):
                 for input in inputs:
@@ -1037,16 +1095,19 @@ def join_process_group(self):
                     Join.notify_join_context(gradient_setter)
                     # Set gradients manually
                     for p, grad in zip(
-                        zero_model.parameters(), grads_at_each_iter[iter],
+                        zero_model.parameters(),
+                        grads_at_each_iter[iter],
                     ):
                         p.grad = grad.detach().clone().to(device)
                     # Perform optimizer step and check parity
                     zero_optim.step()
                     for p, ddp_p in zip(
-                        zero_model.parameters(), params_at_each_iter[iter],
+                        zero_model.parameters(),
+                        params_at_each_iter[iter],
                     ):
                         torch.testing.assert_close(
-                            p, ddp_p,
+                            p,
+                            ddp_p,
                             msg="Parameters differ between using ZeRO and "
                             "local optimizer",
                         )
@@ -1127,6 +1188,7 @@ def copy_param(p):
 
         for _ in range(NUM_EPOCHS):
             for input in inputs:
+
                 def closure_local():
                     local_optim.zero_grad()
                     local_loss = local_model(input).abs().sum()
@@ -1139,25 +1201,26 @@ def closure_ddp():
                     ddp_loss.backward()
                     return ddp_loss
 
-                local_loss = cast(
-                    torch.Tensor, local_optim.step(closure=closure_local)
-                )
-                ddp_loss = cast(
-                    torch.Tensor, zero_optim.step(closure=closure_ddp)
-                )
+                local_loss = cast(torch.Tensor, local_optim.step(closure=closure_local))
+                ddp_loss = cast(torch.Tensor, zero_optim.step(closure=closure_ddp))
 
                 # Increased tolerances are needed to pass when using TF32
                 # See: https://github.com/pytorch/pytorch/issues/67764
                 torch.testing.assert_close(
-                    local_loss.cpu(), ddp_loss.cpu(), rtol=1e-03, atol=1e-08,
+                    local_loss.cpu(),
+                    ddp_loss.cpu(),
+                    rtol=1e-03,
+                    atol=1e-08,
                 ), "Losses differ between local optimizer and ZeRO"
 
                 for local_p, ddp_p in zip(
-                    local_model.parameters(),
-                    ddp_model.parameters()
+                    local_model.parameters(), ddp_model.parameters()
                 ):
                     torch.testing.assert_close(
-                        local_p.cpu(), ddp_p.cpu(), rtol=1e-03, atol=1e-04,
+                        local_p.cpu(),
+                        ddp_p.cpu(),
+                        rtol=1e-03,
+                        atol=1e-04,
                     ), "Models differ after a step"
 
     @common_distributed.skip_if_lt_x_gpu(4)
@@ -1176,9 +1239,13 @@ def test_zero_model_parallel(
         # Disable DDP + ReplicatedTensor when `parameter_as_bucket_view=True`
         # since then ZeroRedundancyOptimizer modifies the model parameters in
         # place.
-        from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor
-        context = _ddp_replicated_tensor(False) if parameters_as_bucket_view \
-            else suppress()
+        from torch.nn.parallel._replicated_tensor_ddp_utils import (
+            _ddp_replicated_tensor,
+        )
+
+        context = (
+            _ddp_replicated_tensor(False) if parameters_as_bucket_view else suppress()
+        )
         with context:
             self.dist_init(self.rank, world_size=2)
             self._test_zero_model_parallel(parameters_as_bucket_view)
@@ -1202,21 +1269,22 @@ def _test_ddp_zero_overlap(
         is_gpu = device.type == "cuda"
         if is_gpu:
             torch.cuda.set_device(device)
-        models_to_test = [(
-            torch.nn.Sequential(
-                torch.nn.Linear(1000, 2000),
-                torch.nn.Linear(2000, 500),
-            ),
-            [torch.randn(1, 1000).to(device) for _ in range(NUM_INPUTS)],
-        )]
+        models_to_test = [
+            (
+                torch.nn.Sequential(
+                    torch.nn.Linear(1000, 2000),
+                    torch.nn.Linear(2000, 500),
+                ),
+                [torch.randn(1, 1000).to(device) for _ in range(NUM_INPUTS)],
+            )
+        ]
         if HAS_TORCHVISION:
-            models_to_test.append((
-                torchvision.models.resnet50(),
-                [
-                    torch.randn(1, 3, 3, 1000).to(device)
-                    for _ in range(NUM_INPUTS)
-                ]
-            ))
+            models_to_test.append(
+                (
+                    torchvision.models.resnet50(),
+                    [torch.randn(1, 3, 3, 1000).to(device) for _ in range(NUM_INPUTS)],
+                )
+            )
         for (model, inputs) in models_to_test:
             # Enable determinism in cudnn operators
             with torch.backends.cudnn.flags(
@@ -1227,7 +1295,7 @@ def _test_ddp_zero_overlap(
                 ddp_model_overlap = DDP(
                     copy.deepcopy(model).to(device),
                     device_ids=device_ids,
-                    gradient_as_bucket_view=gradient_as_bucket_view
+                    gradient_as_bucket_view=gradient_as_bucket_view,
                 )
                 if static_graph:
                     ddp_model_overlap._set_static_graph()
@@ -1242,16 +1310,18 @@ def _test_ddp_zero_overlap(
                 ddp_model_overlap.register_comm_hook(
                     None,
                     hook_constructor(
-                        allreduce_hook, ddp_model_overlap, zero_optim,
+                        allreduce_hook,
+                        ddp_model_overlap,
+                        zero_optim,
                         **kwargs,
-                    )
+                    ),
                 )
 
                 # Set up the DDP model with local optimizer
                 ddp_model_local = DDP(
                     copy.deepcopy(model).to(device),
                     device_ids=device_ids,
-                    gradient_as_bucket_view=gradient_as_bucket_view
+                    gradient_as_bucket_view=gradient_as_bucket_view,
                 )
                 if static_graph:
                     ddp_model_local._set_static_graph()
@@ -1259,13 +1329,12 @@ def _test_ddp_zero_overlap(
                     ddp_model_local.parameters(),
                     lr=SGD_LR,
                     momentum=SGD_MOMENTUM,
-                    weight_decay=SGD_WEIGHT_DECAY
+                    weight_decay=SGD_WEIGHT_DECAY,
                 )
 
                 # Check that the parameters match initially
                 for p1, p2 in zip(
-                    ddp_model_overlap.parameters(),
-                    ddp_model_local.parameters()
+                    ddp_model_overlap.parameters(), ddp_model_local.parameters()
                 ):
                     self.assertEqual(p1, p2)
 
@@ -1303,14 +1372,14 @@ def _test_ddp_zero_overlap(
 
                 # Check that the parameters are equal
                 for p1, p2 in zip(
-                    ddp_model_overlap.parameters(),
-                    ddp_model_local.parameters()
+                    ddp_model_overlap.parameters(), ddp_model_local.parameters()
                 ):
                     self.assertEqual(p1, p2)
 
                 # Check that the parameters were updated
                 self.assertNotEqual(
-                    init_params_overlap, list(ddp_model_overlap.parameters()),
+                    init_params_overlap,
+                    list(ddp_model_overlap.parameters()),
                 )
 
                 # Ensure that this test runs independently
@@ -1360,15 +1429,24 @@ def test_ddp_zero_overlap(
         device = torch.device(self.rank) if use_gpu else torch.device("cpu")
         backend = _get_backend_for_tests()
         self.dist_init(self.rank, self.world_size, backend)
-        hook_constructor = hook_with_zero_step if not use_interleaved_hook \
+        hook_constructor = (
+            hook_with_zero_step
+            if not use_interleaved_hook
             else hook_with_zero_step_interleaved
+        )
 
         # Disable DDP + ReplicatedTensor since ZeroRedundancyOptimizer
         # modifies the model parameters in place.
-        from torch.nn.parallel._replicated_tensor_ddp_utils import _ddp_replicated_tensor
+        from torch.nn.parallel._replicated_tensor_ddp_utils import (
+            _ddp_replicated_tensor,
+        )
+
         with _ddp_replicated_tensor(False):
             self._test_ddp_zero_overlap(
-                device, hook_constructor, gradient_as_bucket_view, static_graph,
+                device,
+                hook_constructor,
+                gradient_as_bucket_view,
+                static_graph,
                 shard_buckets=shard_buckets,
             )
 
diff --git a/torch/distributed/optim/apply_optimizer_in_backward.py b/torch/distributed/optim/apply_optimizer_in_backward.py
index ff72f28e6be1e..b37a67783d048 100644
--- a/torch/distributed/optim/apply_optimizer_in_backward.py
+++ b/torch/distributed/optim/apply_optimizer_in_backward.py
@@ -1,9 +1,10 @@
-from typing import Any, Dict, Iterable, Type, List, no_type_check
+from typing import Any, Dict, Iterable, List, no_type_check, Type
 
 import torch
 
 __all__: List[str] = []
 
+
 @no_type_check
 def _apply_optimizer_in_backward(
     optimizer_class: Type[torch.optim.Optimizer],
@@ -44,7 +45,7 @@ def _apply_optimizer_in_backward_to_param(param: torch.nn.Parameter) -> None:
 
         # Don't create a new acc_grad if we already have one
         # i.e.f or shared parameters or attaching multiple optimizers to a param.
-        if not hasattr(param, 'acc_grad'):
+        if not hasattr(param, "acc_grad"):
             acc_grad = param.view_as(param).grad_fn.next_functions[0][0]
         else:
             acc_grad = param._acc_grad
@@ -53,10 +54,10 @@ def _apply_optimizer_in_backward_to_param(param: torch.nn.Parameter) -> None:
 
         # Keep the grad accumulator around for the lifetime of the Tensor,
         # store it on the param to avoid uncollectable ref-cycle
-        if not hasattr(param, 'acc_grad'):
+        if not hasattr(param, "acc_grad"):
             param._acc_grad = acc_grad  # type: ignore[attr-defined]
 
-        if not hasattr(param, '_in_backward_optimizers'):
+        if not hasattr(param, "_in_backward_optimizers"):
             param._in_backward_optimizers = []  # type: ignore[attr-defined]
             # TODO: investigate whether we really need these attributes.
             param._optimizer_classes = []  # type: ignore[attr-defined]
diff --git a/torch/distributed/optim/functional_adadelta.py b/torch/distributed/optim/functional_adadelta.py
index 5ea02786d00e7..0aaa8906709f7 100644
--- a/torch/distributed/optim/functional_adadelta.py
+++ b/torch/distributed/optim/functional_adadelta.py
@@ -1,10 +1,11 @@
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
+
 import torch
 import torch.optim._functional as F
 
 from torch import Tensor
 
-__all__ : List[str] = []
+__all__: List[str] = []
 
 # Define a TorchScript compatible Functional Adadelta Optimizer
 # where we use these optimizer in a functional way.
@@ -47,15 +48,15 @@ def __init__(
         self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
 
     def step(self, gradients: List[Optional[Tensor]]):
-        params = self.param_group['params']
+        params = self.param_group["params"]
         params_with_grad = []
         grads = []
         square_avgs = []
         acc_deltas = []
-        lr = self.defaults['lr']
-        rho = self.defaults['rho']
-        eps = self.defaults['eps']
-        weight_decay = self.defaults['weight_decay']
+        lr = self.defaults["lr"]
+        rho = self.defaults["rho"]
+        eps = self.defaults["eps"]
+        weight_decay = self.defaults["weight_decay"]
 
         if len(params) != len(gradients):
             raise ValueError(
@@ -72,22 +73,28 @@ def step(self, gradients: List[Optional[Tensor]]):
                 if param not in self.state:
                     self.state[param] = {}
                     state = self.state[param]
-                    state['step'] = torch.tensor(0.0)
-                    state['square_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
-                    state['acc_delta'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                    state["step"] = torch.tensor(0.0)
+                    state["square_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    state["acc_delta"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
 
                 state = self.state[param]
-                square_avgs.append(state['square_avg'])
-                acc_deltas.append(state['acc_delta'])
+                square_avgs.append(state["square_avg"])
+                acc_deltas.append(state["acc_delta"])
 
         with torch.no_grad():
-            F.adadelta(params_with_grad,
-                       grads,
-                       square_avgs,
-                       acc_deltas,
-                       lr=lr,
-                       rho=rho,
-                       eps=eps,
-                       weight_decay=weight_decay,
-                       foreach=self.foreach,
-                       maximize=self.maximize)
+            F.adadelta(
+                params_with_grad,
+                grads,
+                square_avgs,
+                acc_deltas,
+                lr=lr,
+                rho=rho,
+                eps=eps,
+                weight_decay=weight_decay,
+                foreach=self.foreach,
+                maximize=self.maximize,
+            )
diff --git a/torch/distributed/optim/functional_adagrad.py b/torch/distributed/optim/functional_adagrad.py
index 28a97817bba0b..a644aa5a378cd 100644
--- a/torch/distributed/optim/functional_adagrad.py
+++ b/torch/distributed/optim/functional_adagrad.py
@@ -1,10 +1,11 @@
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
+
 import torch
 import torch.optim._functional as F
 
 from torch import Tensor
 
-__all__ : List[str] = []
+__all__: List[str] = []
 
 # Define a TorchScript compatible Functional Adagrad Optimizer
 # where we use these optimizer in a functional way.
@@ -62,7 +63,7 @@ def __init__(
             }
 
     def step(self, gradients: List[Optional[Tensor]]):
-        params = self.param_group['params']
+        params = self.param_group["params"]
         params_with_grad = []
         grads = []
         state_sums = []
@@ -76,25 +77,27 @@ def step(self, gradients: List[Optional[Tensor]]):
             )
 
         has_sparse_grad = False
-        for param, gradient in zip(self.param_group['params'], gradients):
+        for param, gradient in zip(self.param_group["params"], gradients):
             if gradient is not None:
                 if gradient.is_sparse:
                     has_sparse_grad = True
                 params_with_grad.append(param)
                 grads.append(gradient)
                 state = self.state[param]
-                state_sums.append(state['sum'])
-                state_steps.append(state['step'])
+                state_sums.append(state["sum"])
+                state_steps.append(state["step"])
 
         with torch.no_grad():
-            F.adagrad(params,
-                      grads,
-                      state_sums,
-                      state_steps,
-                      lr=self.defaults['lr'],
-                      weight_decay=self.defaults['weight_decay'],
-                      lr_decay=self.defaults['lr_decay'],
-                      eps=self.defaults['eps'],
-                      has_sparse_grad=has_sparse_grad,
-                      foreach=self.foreach,
-                      maximize=self.maximize)
+            F.adagrad(
+                params,
+                grads,
+                state_sums,
+                state_steps,
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                lr_decay=self.defaults["lr_decay"],
+                eps=self.defaults["eps"],
+                has_sparse_grad=has_sparse_grad,
+                foreach=self.foreach,
+                maximize=self.maximize,
+            )
diff --git a/torch/distributed/optim/functional_adam.py b/torch/distributed/optim/functional_adam.py
index 92b749a54dbde..1b7dc1a76fc49 100644
--- a/torch/distributed/optim/functional_adam.py
+++ b/torch/distributed/optim/functional_adam.py
@@ -1,10 +1,11 @@
-from typing import List, Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
+
 import torch
 import torch.optim._functional as F
 
 from torch import Tensor
 
-__all__ : List[str] = []
+__all__: List[str] = []
 
 # Define a TorchScript compatible Functional Adam Optimizer
 # where we use these optimizer in a functional way.
@@ -78,41 +79,49 @@ def step_param(self, param: Tensor, grad: Optional[Tensor]):
         if param not in self.state:
             self.state[param] = {}
             state = self.state[param]
-            state['step'] = torch.tensor(0.0)
-            state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
-            state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+            state["step"] = torch.tensor(0.0)
+            state["exp_avg"] = torch.zeros_like(
+                param, memory_format=torch.preserve_format
+            )
+            state["exp_avg_sq"] = torch.zeros_like(
+                param, memory_format=torch.preserve_format
+            )
             if self.amsgrad:
-                state['max_exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                state["max_exp_avg_sq"] = torch.zeros_like(
+                    param, memory_format=torch.preserve_format
+                )
 
         state = self.state[param]
-        exp_avgs.append(state['exp_avg'])
-        exp_avg_sqs.append(state['exp_avg_sq'])
+        exp_avgs.append(state["exp_avg"])
+        exp_avg_sqs.append(state["exp_avg_sq"])
 
         if self.amsgrad:
-            max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+            max_exp_avg_sqs.append(state["max_exp_avg_sq"])
 
-        state_steps.append(state['step'])
+        state_steps.append(state["step"])
         with torch.no_grad():
-            F.adam(params_with_grad,
-                   grads,
-                   exp_avgs,
-                   exp_avg_sqs,
-                   max_exp_avg_sqs,
-                   state_steps,
-                   amsgrad=self.amsgrad,
-                   maximize=self.maximize,
-                   beta1=self.defaults['beta1'],
-                   beta2=self.defaults['beta2'],
-                   lr=self.defaults['lr'],
-                   weight_decay=self.defaults['weight_decay'],
-                   eps=self.defaults['eps'],
-                   foreach=self.foreach,
-                   fused=self.fused,
-                   grad_scale=None,
-                   found_inf=None)
+            F.adam(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=self.amsgrad,
+                maximize=self.maximize,
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                eps=self.defaults["eps"],
+                foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
+            )
 
     def step(self, gradients: List[Optional[Tensor]]):
-        params = self.param_group['params']
+        params = self.param_group["params"]
         params_with_grad = []
         grads = []
         exp_avgs = []
@@ -127,7 +136,7 @@ def step(self, gradients: List[Optional[Tensor]]):
                 + f"Gradients length: {len(gradients)}"
             )
 
-        for param, gradient in zip(self.param_group['params'], gradients):
+        for param, gradient in zip(self.param_group["params"], gradients):
             if gradient is not None:
                 params_with_grad.append(param)
                 grads.append(gradient)
@@ -135,40 +144,48 @@ def step(self, gradients: List[Optional[Tensor]]):
                 if param not in self.state:
                     self.state[param] = {}
                     state = self.state[param]
-                    state['step'] = torch.tensor(0.0)
+                    state["step"] = torch.tensor(0.0)
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                    state["exp_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                    state["exp_avg_sq"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
                     if self.amsgrad:
                         # Maintains max of all exp. moving avg. of sq. grad. values
-                        state['max_exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                        state["max_exp_avg_sq"] = torch.zeros_like(
+                            param, memory_format=torch.preserve_format
+                        )
 
                 state = self.state[param]
 
-                exp_avgs.append(state['exp_avg'])
-                exp_avg_sqs.append(state['exp_avg_sq'])
+                exp_avgs.append(state["exp_avg"])
+                exp_avg_sqs.append(state["exp_avg_sq"])
 
                 if self.amsgrad:
-                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+                    max_exp_avg_sqs.append(state["max_exp_avg_sq"])
 
-                state_steps.append(state['step'])
+                state_steps.append(state["step"])
 
         with torch.no_grad():
-            F.adam(params_with_grad,
-                   grads,
-                   exp_avgs,
-                   exp_avg_sqs,
-                   max_exp_avg_sqs,
-                   state_steps,
-                   amsgrad=self.amsgrad,
-                   maximize=self.maximize,
-                   beta1=self.defaults['beta1'],
-                   beta2=self.defaults['beta2'],
-                   lr=self.defaults['lr'],
-                   weight_decay=self.defaults['weight_decay'],
-                   eps=self.defaults['eps'],
-                   foreach=self.foreach,
-                   fused=self.fused,
-                   grad_scale=None,
-                   found_inf=None)
+            F.adam(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=self.amsgrad,
+                maximize=self.maximize,
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                eps=self.defaults["eps"],
+                foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
+            )
diff --git a/torch/distributed/optim/functional_adamax.py b/torch/distributed/optim/functional_adamax.py
index a664e0df8f69b..e5c236728d08d 100644
--- a/torch/distributed/optim/functional_adamax.py
+++ b/torch/distributed/optim/functional_adamax.py
@@ -1,10 +1,11 @@
-from typing import List, Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
+
 import torch
 import torch.optim._functional as F
 
 from torch import Tensor
 
-__all__ : List[str] = []
+__all__: List[str] = []
 
 # Define a TorchScript compatible Functional Adamax Optimizer
 # where we use these optimizer in a functional way.
@@ -58,7 +59,7 @@ def __init__(
         self.param_group = {"params": params}
 
     def step(self, gradients: List[Optional[Tensor]]):
-        params = self.param_group['params']
+        params = self.param_group["params"]
         params_with_grad = []
         grads = []
         exp_avgs = []
@@ -72,7 +73,7 @@ def step(self, gradients: List[Optional[Tensor]]):
                 + f"Gradients length: {len(gradients)}"
             )
 
-        for param, gradient in zip(self.param_group['params'], gradients):
+        for param, gradient in zip(self.param_group["params"], gradients):
             if gradient is not None:
                 params_with_grad.append(param)
                 grads.append(gradient)
@@ -80,28 +81,34 @@ def step(self, gradients: List[Optional[Tensor]]):
                 if param not in self.state:
                     self.state[param] = {}
                     state = self.state[param]
-                    state['step'] = torch.tensor(0.0)
+                    state["step"] = torch.tensor(0.0)
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                    state["exp_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
                     # Exponential moving average of squared gradient values
-                    state['exp_inf'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                    state["exp_inf"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
 
                 state = self.state[param]
 
-                exp_avgs.append(state['exp_avg'])
-                exp_infs.append(state['exp_inf'])
-                state_steps.append(state['step'])
+                exp_avgs.append(state["exp_avg"])
+                exp_infs.append(state["exp_inf"])
+                state_steps.append(state["step"])
 
         with torch.no_grad():
-            F.adamax(params_with_grad,
-                     grads,
-                     exp_avgs,
-                     exp_infs,
-                     state_steps,
-                     eps=self.defaults['eps'],
-                     beta1=self.defaults['beta1'],
-                     beta2=self.defaults['beta2'],
-                     lr=self.defaults['lr'],
-                     weight_decay=self.defaults['weight_decay'],
-                     foreach=self.foreach,
-                     maximize=self.maximize)
+            F.adamax(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_infs,
+                state_steps,
+                eps=self.defaults["eps"],
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                foreach=self.foreach,
+                maximize=self.maximize,
+            )
diff --git a/torch/distributed/optim/functional_adamw.py b/torch/distributed/optim/functional_adamw.py
index eeaf5385bd31f..48d70843d3689 100644
--- a/torch/distributed/optim/functional_adamw.py
+++ b/torch/distributed/optim/functional_adamw.py
@@ -1,10 +1,11 @@
-from typing import List, Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
+
 import torch
 import torch.optim._functional as F
 
 from torch import Tensor
 
-__all__ : List[str] = []
+__all__: List[str] = []
 
 # Define a TorchScript compatible Functional AdamW Optimizer
 # where we use these optimizer in a functional way.
@@ -73,42 +74,50 @@ def step_param(self, param: Tensor, grad: Optional[Tensor]):
         if param not in self.state:
             self.state[param] = {}
             state = self.state[param]
-            state['step'] = torch.tensor(0.0)
+            state["step"] = torch.tensor(0.0)
             # Exponential moving average of gradient values
-            state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+            state["exp_avg"] = torch.zeros_like(
+                param, memory_format=torch.preserve_format
+            )
             # Exponential moving average of squared gradient values
-            state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+            state["exp_avg_sq"] = torch.zeros_like(
+                param, memory_format=torch.preserve_format
+            )
             if self.amsgrad:
                 # Maintains max of all exp. moving avg. of sq. grad. values
-                state['max_exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                state["max_exp_avg_sq"] = torch.zeros_like(
+                    param, memory_format=torch.preserve_format
+                )
 
         state = self.state[param]
 
-        exp_avgs.append(state['exp_avg'])
-        exp_avg_sqs.append(state['exp_avg_sq'])
+        exp_avgs.append(state["exp_avg"])
+        exp_avg_sqs.append(state["exp_avg_sq"])
 
         if self.amsgrad:
-            max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+            max_exp_avg_sqs.append(state["max_exp_avg_sq"])
 
-        state_steps.append(state['step'])
+        state_steps.append(state["step"])
         with torch.no_grad():
-            F.adamw(params_with_grad,
-                    grads,
-                    exp_avgs,
-                    exp_avg_sqs,
-                    max_exp_avg_sqs,
-                    state_steps,
-                    amsgrad=self.amsgrad,
-                    maximize=self.maximize,
-                    beta1=self.defaults['beta1'],
-                    beta2=self.defaults['beta2'],
-                    lr=self.defaults['lr'],
-                    weight_decay=self.defaults['weight_decay'],
-                    eps=self.defaults['eps'],
-                    foreach=self.foreach)
+            F.adamw(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=self.amsgrad,
+                maximize=self.maximize,
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                eps=self.defaults["eps"],
+                foreach=self.foreach,
+            )
 
     def step(self, gradients: List[Optional[Tensor]]):
-        params = self.param_group['params']
+        params = self.param_group["params"]
         params_with_grad = []
         grads = []
         exp_avgs = []
@@ -123,7 +132,7 @@ def step(self, gradients: List[Optional[Tensor]]):
                 + f"Gradients length: {len(gradients)}"
             )
 
-        for param, gradient in zip(self.param_group['params'], gradients):
+        for param, gradient in zip(self.param_group["params"], gradients):
             if gradient is not None:
                 params_with_grad.append(param)
                 grads.append(gradient)
@@ -131,37 +140,45 @@ def step(self, gradients: List[Optional[Tensor]]):
                 if param not in self.state:
                     self.state[param] = {}
                     state = self.state[param]
-                    state['step'] = torch.tensor(0.0)
+                    state["step"] = torch.tensor(0.0)
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                    state["exp_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                    state["exp_avg_sq"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
                     if self.amsgrad:
                         # Maintains max of all exp. moving avg. of sq. grad. values
-                        state['max_exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                        state["max_exp_avg_sq"] = torch.zeros_like(
+                            param, memory_format=torch.preserve_format
+                        )
 
                 state = self.state[param]
 
-                exp_avgs.append(state['exp_avg'])
-                exp_avg_sqs.append(state['exp_avg_sq'])
+                exp_avgs.append(state["exp_avg"])
+                exp_avg_sqs.append(state["exp_avg_sq"])
 
                 if self.amsgrad:
-                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+                    max_exp_avg_sqs.append(state["max_exp_avg_sq"])
 
-                state_steps.append(state['step'])
+                state_steps.append(state["step"])
 
         with torch.no_grad():
-            F.adamw(params_with_grad,
-                    grads,
-                    exp_avgs,
-                    exp_avg_sqs,
-                    max_exp_avg_sqs,
-                    state_steps,
-                    amsgrad=self.amsgrad,
-                    maximize=self.maximize,
-                    beta1=self.defaults['beta1'],
-                    beta2=self.defaults['beta2'],
-                    lr=self.defaults['lr'],
-                    weight_decay=self.defaults['weight_decay'],
-                    eps=self.defaults['eps'],
-                    foreach=self.foreach)
+            F.adamw(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=self.amsgrad,
+                maximize=self.maximize,
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                eps=self.defaults["eps"],
+                foreach=self.foreach,
+            )
diff --git a/torch/distributed/optim/functional_rmsprop.py b/torch/distributed/optim/functional_rmsprop.py
index c94df3e11ac7b..079f35c7b774f 100644
--- a/torch/distributed/optim/functional_rmsprop.py
+++ b/torch/distributed/optim/functional_rmsprop.py
@@ -1,10 +1,11 @@
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
+
 import torch
 import torch.optim._functional as F
 
 from torch import Tensor
 
-__all__ : List[str] = []
+__all__: List[str] = []
 
 # Define a TorchScript compatible Functional RMSprop Optimizer
 # where we use these optimizer in a functional way.
@@ -51,17 +52,17 @@ def __init__(
         self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
 
     def step(self, gradients: List[Optional[Tensor]]):
-        params = self.param_group['params']
+        params = self.param_group["params"]
         params_with_grad = []
         grads = []
         square_avgs = []
         grad_avgs = []
         momentum_buffer_list = []
-        lr = self.defaults['lr']
-        alpha = self.defaults['alpha']
-        eps = self.defaults['eps']
-        momentum = self.defaults['momentum']
-        weight_decay = self.defaults['weight_decay']
+        lr = self.defaults["lr"]
+        alpha = self.defaults["alpha"]
+        eps = self.defaults["eps"]
+        momentum = self.defaults["momentum"]
+        weight_decay = self.defaults["weight_decay"]
 
         if len(params) != len(gradients):
             raise ValueError(
@@ -78,33 +79,41 @@ def step(self, gradients: List[Optional[Tensor]]):
                 if param not in self.state:
                     self.state[param] = {}
                     state = self.state[param]
-                    state['step'] = torch.tensor(0.0)
-                    state['square_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                    state["step"] = torch.tensor(0.0)
+                    state["square_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
                     if momentum > 0:
-                        state['momentum_buffer'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                        state["momentum_buffer"] = torch.zeros_like(
+                            param, memory_format=torch.preserve_format
+                        )
                     if self.centered:
-                        state['grad_avg'] = torch.zeros_like(param, memory_format=torch.preserve_format)
+                        state["grad_avg"] = torch.zeros_like(
+                            param, memory_format=torch.preserve_format
+                        )
 
                 state = self.state[param]
-                square_avgs.append(state['square_avg'])
+                square_avgs.append(state["square_avg"])
                 if momentum > 0:
-                    momentum_buffer_list.append(state['momentum_buffer'])
+                    momentum_buffer_list.append(state["momentum_buffer"])
                 if self.centered:
-                    grad_avgs.append(state['grad_avg'])
+                    grad_avgs.append(state["grad_avg"])
 
-                state['step'] += 1
+                state["step"] += 1
 
         with torch.no_grad():
-            F.rmsprop(params_with_grad,
-                      grads,
-                      square_avgs,
-                      grad_avgs,
-                      momentum_buffer_list,
-                      lr=lr,
-                      alpha=alpha,
-                      eps=eps,
-                      weight_decay=weight_decay,
-                      momentum=momentum,
-                      centered=self.centered,
-                      foreach=self.foreach,
-                      maximize=self.maximize)
+            F.rmsprop(
+                params_with_grad,
+                grads,
+                square_avgs,
+                grad_avgs,
+                momentum_buffer_list,
+                lr=lr,
+                alpha=alpha,
+                eps=eps,
+                weight_decay=weight_decay,
+                momentum=momentum,
+                centered=self.centered,
+                foreach=self.foreach,
+                maximize=self.maximize,
+            )
diff --git a/torch/distributed/optim/functional_rprop.py b/torch/distributed/optim/functional_rprop.py
index 77d350b20e323..cd109cfa96613 100644
--- a/torch/distributed/optim/functional_rprop.py
+++ b/torch/distributed/optim/functional_rprop.py
@@ -1,10 +1,11 @@
-from typing import List, Dict, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
+
 import torch
 import torch.optim._functional as F
 
 from torch import Tensor
 
-__all__ : List[str] = []
+__all__: List[str] = []
 
 # Define a TorchScript compatible Functional Rprop Optimizer
 # where we use these optimizer in a functional way.
@@ -45,12 +46,12 @@ def __init__(
         self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
 
     def step(self, gradients: List[Optional[Tensor]]):
-        params = self.param_group['params']
+        params = self.param_group["params"]
         params_with_grad = []
         grads = []
         prevs = []
         step_sizes = []
-        lr = self.defaults['lr']
+        lr = self.defaults["lr"]
         etaminus, etaplus = self.etas
         step_size_min, step_size_max = self.step_sizes
 
@@ -69,24 +70,28 @@ def step(self, gradients: List[Optional[Tensor]]):
                 if param not in self.state:
                     self.state[param] = {}
                     state = self.state[param]
-                    state['step'] = torch.tensor(0.0)
-                    state['prev'] = torch.zeros_like(param, memory_format=torch.preserve_format)
-                    state['step_size'] = torch.full_like(gradient, lr)
+                    state["step"] = torch.tensor(0.0)
+                    state["prev"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    state["step_size"] = torch.full_like(gradient, lr)
 
                 state = self.state[param]
-                prevs.append(state['prev'])
-                step_sizes.append(state['step_size'])
+                prevs.append(state["prev"])
+                step_sizes.append(state["step_size"])
 
-                state['step'] += 1
+                state["step"] += 1
 
         with torch.no_grad():
-            F.rprop(params_with_grad,
-                    grads,
-                    prevs,
-                    step_sizes,
-                    step_size_min=step_size_min,
-                    step_size_max=step_size_max,
-                    etaminus=etaminus,
-                    etaplus=etaplus,
-                    foreach=self.foreach,
-                    maximize=self.maximize)
+            F.rprop(
+                params_with_grad,
+                grads,
+                prevs,
+                step_sizes,
+                step_size_min=step_size_min,
+                step_size_max=step_size_max,
+                etaminus=etaminus,
+                etaplus=etaplus,
+                foreach=self.foreach,
+                maximize=self.maximize,
+            )
diff --git a/torch/distributed/optim/functional_sgd.py b/torch/distributed/optim/functional_sgd.py
index b4c48159339fd..1d529cd501891 100644
--- a/torch/distributed/optim/functional_sgd.py
+++ b/torch/distributed/optim/functional_sgd.py
@@ -1,10 +1,11 @@
-from typing import List, Optional, Dict
+from typing import Dict, List, Optional
+
 import torch
 import torch.optim._functional as F
 
 from torch import Tensor
 
-__all__ : List[str] = []
+__all__: List[str] = []
 
 # Define a TorchScript compatible Functional SGD Optimizer
 # where we use these optimizer in a functional way.
@@ -48,15 +49,15 @@ def __init__(
         self.param_group = {"params": params}
 
     def step_param(self, param: Tensor, grad: Optional[Tensor]):
-        """ Similar to self.step, but operates on a single parameter and
-            its gradient.
+        """Similar to self.step, but operates on a single parameter and
+        its gradient.
         """
         # TODO: Once step_param interface is robust, refactor step to call
         # step param on each param.
-        weight_decay = self.defaults['weight_decay']
-        momentum = self.defaults['momentum']
-        dampening = self.defaults['dampening']
-        lr = self.defaults['lr']
+        weight_decay = self.defaults["weight_decay"]
+        momentum = self.defaults["momentum"]
+        dampening = self.defaults["dampening"]
+        lr = self.defaults["lr"]
         params = [param]
         momentum_buffer_list: List[Optional[Tensor]] = []
         grads = []
@@ -69,10 +70,10 @@ def step_param(self, param: Tensor, grad: Optional[Tensor]):
             if param not in self.state:
                 self.state[param] = {}
             state = self.state[param]
-            if 'momentum_buffer' not in state:
+            if "momentum_buffer" not in state:
                 momentum_buffer_list.append(None)
             else:
-                momentum_buffer_list.append(state['momentum_buffer'])
+                momentum_buffer_list.append(state["momentum_buffer"])
 
         with torch.no_grad():
             F.sgd(
@@ -92,17 +93,17 @@ def step_param(self, param: Tensor, grad: Optional[Tensor]):
         state = self.state[param]
         momentum_buffer = momentum_buffer_list[0]
         if momentum_buffer is not None:
-            state['momentum_buffer'] = momentum_buffer
+            state["momentum_buffer"] = momentum_buffer
 
     def step(self, gradients: List[Optional[Tensor]]):
-        params = self.param_group['params']
+        params = self.param_group["params"]
         params_with_grad = []
         grads = []
         momentum_buffer_list: List[Optional[Tensor]] = []
-        lr = self.defaults['lr']
-        weight_decay = self.defaults['weight_decay']
-        momentum = self.defaults['momentum']
-        dampening = self.defaults['dampening']
+        lr = self.defaults["lr"]
+        weight_decay = self.defaults["weight_decay"]
+        momentum = self.defaults["momentum"]
+        dampening = self.defaults["dampening"]
 
         if len(params) != len(gradients):
             raise ValueError(
@@ -123,28 +124,29 @@ def step(self, gradients: List[Optional[Tensor]]):
                     self.state[param] = {}
 
                 state = self.state[param]
-                if 'momentum_buffer' not in state:
+                if "momentum_buffer" not in state:
                     momentum_buffer_list.append(None)
                 else:
-                    momentum_buffer_list.append(state['momentum_buffer'])
+                    momentum_buffer_list.append(state["momentum_buffer"])
 
         with torch.no_grad():
-            F.sgd(params_with_grad,
-                  grads,
-                  momentum_buffer_list,
-                  weight_decay=weight_decay,
-                  momentum=momentum,
-                  lr=lr,
-                  dampening=dampening,
-                  nesterov=self.nesterov,
-                  maximize=self.maximize,
-                  has_sparse_grad=has_sparse_grad,
-                  foreach=self.foreach,
-                  )
+            F.sgd(
+                params_with_grad,
+                grads,
+                momentum_buffer_list,
+                weight_decay=weight_decay,
+                momentum=momentum,
+                lr=lr,
+                dampening=dampening,
+                nesterov=self.nesterov,
+                maximize=self.maximize,
+                has_sparse_grad=has_sparse_grad,
+                foreach=self.foreach,
+            )
 
         # update momentum_buffers in state
         for i, p in enumerate(params_with_grad):
             state = self.state[p]
             momentum_buffer = momentum_buffer_list[i]
             if momentum_buffer is not None:
-                state['momentum_buffer'] = momentum_buffer
+                state["momentum_buffer"] = momentum_buffer
diff --git a/torch/distributed/optim/named_optimizer.py b/torch/distributed/optim/named_optimizer.py
index 81273c65f5332..fafd9486c6813 100644
--- a/torch/distributed/optim/named_optimizer.py
+++ b/torch/distributed/optim/named_optimizer.py
@@ -9,7 +9,7 @@
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 
 
-__all__ : List[str] = []
+__all__: List[str] = []
 
 logger = logging.getLogger(__name__)
 
diff --git a/torch/distributed/optim/optimizer.py b/torch/distributed/optim/optimizer.py
index 535104beb9f41..c8b26fba0463a 100644
--- a/torch/distributed/optim/optimizer.py
+++ b/torch/distributed/optim/optimizer.py
@@ -1,20 +1,19 @@
-from typing import List, Optional
 import logging
 
+from collections import defaultdict
+from threading import Lock
+from typing import List, Optional
+
 import torch
+import torch.distributed.autograd as dist_autograd
 import torch.distributed.rpc as rpc
 import torch.jit as jit
 import torch.nn as nn
 from torch import Tensor
 from torch.distributed.rpc import RRef
 from .utils import functional_optim_map
-import torch.distributed.autograd as dist_autograd
-
-
-from collections import defaultdict
-from threading import Lock
 
-__all__ = ['DistributedOptimizer']
+__all__ = ["DistributedOptimizer"]
 
 logger = logging.getLogger(__name__)
 
@@ -44,10 +43,7 @@ class _ScriptLocalOptimizer(nn.Module):
     def __init__(self, optim_cls, local_params_rref, *args, **kwargs):
         super().__init__()
         self._local_params = [rref.local_value() for rref in local_params_rref]
-        self.optim = optim_cls(
-            self._local_params,
-            *args,
-            **kwargs)
+        self.optim = optim_cls(self._local_params, *args, **kwargs)
 
     @jit.export
     def step(self, autograd_ctx_id: int):
@@ -75,10 +71,7 @@ class _LocalOptimizer(object):
 
     def __init__(self, optim_cls, local_params_rref, *args, **kwargs):
         self._local_params = [rref.local_value() for rref in local_params_rref]
-        self.optim = optim_cls(
-            self._local_params,
-            *args,
-            **kwargs)
+        self.optim = optim_cls(self._local_params, *args, **kwargs)
 
     def step(self, autograd_ctx_id):
         all_local_grads = dist_autograd.get_gradients(autograd_ctx_id)
@@ -90,8 +83,7 @@ def step(self, autograd_ctx_id):
 
 
 def _new_local_optimizer(optim_cls, local_params_rref, *args, **kwargs):
-    return rpc.RRef(
-        _LocalOptimizer(optim_cls, local_params_rref, *args, **kwargs))
+    return rpc.RRef(_LocalOptimizer(optim_cls, local_params_rref, *args, **kwargs))
 
 
 def _local_optimizer_step(local_optim_rref, autograd_ctx_id):
@@ -105,14 +97,12 @@ def _new_script_local_optimizer(optim_cls, local_params_rref, *args, **kwargs):
 
     with _ScriptLocalOptimizer.compile_lock:
         script_optim = jit.script(optim)
-        return rpc.RRef(
-            script_optim, _ScriptLocalOptimizerInterface)
+        return rpc.RRef(script_optim, _ScriptLocalOptimizerInterface)
 
 
 @jit.script
 def _script_local_optimizer_step(
-    local_optim_rref: RRef[_ScriptLocalOptimizerInterface],
-    autograd_ctx_id: int
+    local_optim_rref: RRef[_ScriptLocalOptimizerInterface], autograd_ctx_id: int
 ) -> None:
     local_optim = local_optim_rref.local_value()
     local_optim.step(autograd_ctx_id)
@@ -203,7 +193,7 @@ def __init__(self, optimizer_class, params_rref, *args, **kwargs):
             optim_ctor = functional_optim_map.get(optimizer_class)
         else:
             optim_ctor = optimizer_class
-        self.is_functional_optim = (optim_ctor != optimizer_class)
+        self.is_functional_optim = optim_ctor != optimizer_class
 
         if self.is_functional_optim:
             optimizer_new_func = _new_script_local_optimizer
@@ -252,9 +242,11 @@ def step(self, context_id):
 
         rpc_futs = []
         for optimizer in self.remote_optimizers:
-            rpc_futs.append(rpc.rpc_async(
-                optimizer.owner(),
-                optimizer_step_func,
-                args=(optimizer, context_id),
-            ))
+            rpc_futs.append(
+                rpc.rpc_async(
+                    optimizer.owner(),
+                    optimizer_step_func,
+                    args=(optimizer, context_id),
+                )
+            )
         _wait_for_all(rpc_futs)
diff --git a/torch/distributed/optim/post_localSGD_optimizer.py b/torch/distributed/optim/post_localSGD_optimizer.py
index 49b8560c1e957..4c603996f0cc0 100644
--- a/torch/distributed/optim/post_localSGD_optimizer.py
+++ b/torch/distributed/optim/post_localSGD_optimizer.py
@@ -1,6 +1,7 @@
+import warnings
+
 import torch
 import torch.distributed.algorithms.model_averaging.averagers as averagers
-import warnings
 
 
 class PostLocalSGDOptimizer(torch.optim.Optimizer):
@@ -53,11 +54,7 @@ class PostLocalSGDOptimizer(torch.optim.Optimizer):
         >>>    opt.step()
     """
 
-    def __init__(
-        self,
-        optim: torch.optim.Optimizer,
-        averager: averagers.ModelAverager
-    ):
+    def __init__(self, optim: torch.optim.Optimizer, averager: averagers.ModelAverager):
         self.optim = optim
         self.param_groups = self.optim.param_groups
         self.averager = averager
@@ -76,7 +73,7 @@ def state_dict(self):
         to ensure reload does not cause unnecessary warm up again.
         """
         optim_state_dict = self.optim.state_dict()
-        optim_state_dict['step'] = self.averager.step
+        optim_state_dict["step"] = self.averager.step
         return optim_state_dict
 
     def load_state_dict(self, state_dict):
@@ -89,11 +86,13 @@ def load_state_dict(self, state_dict):
         it will raise a warning and initialize the model averager's step to 0.
         """
         self.optim.load_state_dict(state_dict)
-        if 'step' in state_dict:
-            self.averager.step = state_dict['step']
+        if "step" in state_dict:
+            self.averager.step = state_dict["step"]
         else:
-            warnings.warn("Loaded state dict does not contain a step counter for an averager. "
-                          "Setting step counter to 0.")
+            warnings.warn(
+                "Loaded state dict does not contain a step counter for an averager. "
+                "Setting step counter to 0."
+            )
             self.averager.step = 0
 
     def step(self):
diff --git a/torch/distributed/optim/utils.py b/torch/distributed/optim/utils.py
index 990a020daee23..5fb197e2d1ddf 100644
--- a/torch/distributed/optim/utils.py
+++ b/torch/distributed/optim/utils.py
@@ -1,13 +1,14 @@
 from typing import Type
+
 from torch import optim
+from .functional_adadelta import _FunctionalAdadelta
 from .functional_adagrad import _FunctionalAdagrad
 from .functional_adam import _FunctionalAdam
+from .functional_adamax import _FunctionalAdamax
 from .functional_adamw import _FunctionalAdamW
-from .functional_sgd import _FunctionalSGD
-from .functional_adadelta import _FunctionalAdadelta
 from .functional_rmsprop import _FunctionalRMSprop
 from .functional_rprop import _FunctionalRprop
-from .functional_adamax import _FunctionalAdamax
+from .functional_sgd import _FunctionalSGD
 
 # dict to map a user passed in optimizer_class to a functional
 # optimizer class if we have already defined inside the
@@ -41,15 +42,18 @@ def register_functional_optim(key, optim):
     if key not in functional_optim_map:
         functional_optim_map[key] = optim
 
+
 def as_functional_optim(optim_cls: Type, *args, **kwargs):
     try:
         functional_cls = functional_optim_map[optim_cls]
     except KeyError as e:
-        raise ValueError(f"Optimizer {optim_cls} does not have a functional "
-                         f"counterpart!") from e
+        raise ValueError(
+            f"Optimizer {optim_cls} does not have a functional " f"counterpart!"
+        ) from e
 
     return _create_functional_optim(functional_cls, *args, **kwargs)
 
+
 def _create_functional_optim(functional_optim_cls: Type, *args, **kwargs):
     return functional_optim_cls(
         [],
diff --git a/torch/distributed/optim/zero_redundancy_optimizer.py b/torch/distributed/optim/zero_redundancy_optimizer.py
index f5f526d69edff..43d93b3bcab2a 100644
--- a/torch/distributed/optim/zero_redundancy_optimizer.py
+++ b/torch/distributed/optim/zero_redundancy_optimizer.py
@@ -10,16 +10,7 @@
 import io
 import logging
 from itertools import chain
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    List,
-    Optional,
-    Set,
-    Type,
-    Union,
-)
+from typing import Any, Callable, Dict, List, Optional, Set, Type, Union
 
 import torch
 import torch.distributed as dist
@@ -51,12 +42,18 @@ def _recursive_copy_to_device(
         return value.to(device, non_blocking=non_blocking)
 
     if isinstance(value, (list, tuple)):
-        values = [_recursive_copy_to_device(val, non_blocking=non_blocking, device=device) for val in value]
+        values = [
+            _recursive_copy_to_device(val, non_blocking=non_blocking, device=device)
+            for val in value
+        ]
         return values if isinstance(value, list) else tuple(values)
 
     if isinstance(value, collections.abc.Mapping):
         return {
-            key: _recursive_copy_to_device(val, non_blocking=non_blocking, device=device) for key, val in value.items()
+            key: _recursive_copy_to_device(
+                val, non_blocking=non_blocking, device=device
+            )
+            for key, val in value.items()
         }
 
     return value
@@ -71,9 +68,10 @@ def _is_trainable(param: torch.Tensor) -> bool:
 
 
 def _broadcast_object(
-    obj: Any, src_rank: int,
+    obj: Any,
+    src_rank: int,
     group: object = dist.group.WORLD,
-    device: torch.device = torch.device("cpu")
+    device: torch.device = torch.device("cpu"),
 ) -> Any:
     r"""
     Broadcasts an object to the given group, sending the object if called from
@@ -103,19 +101,21 @@ def _broadcast_object(
         # Receive the object
         length_tensor = torch.LongTensor([0]).to(device)
         dist.broadcast(length_tensor, src=src_rank, group=group, async_op=False)
-        data_recv_tensor = torch.empty([int(length_tensor.item())], dtype=torch.uint8, device=device)
+        data_recv_tensor = torch.empty(
+            [int(length_tensor.item())], dtype=torch.uint8, device=device
+        )
         dist.broadcast(data_recv_tensor, src=src_rank, group=group, async_op=False)
         buffer = io.BytesIO(data_recv_tensor.cpu().numpy())
         obj = torch.load(buffer, map_location=device)
     return obj
 
 
-
 class _ZeROJoinHook(JoinHook):
     def __init__(self, zero):
-        assert isinstance(zero, ZeroRedundancyOptimizer), \
-            "ZeRO join hook requires passing in a ZeroRedundancyOptimizer " \
+        assert isinstance(zero, ZeroRedundancyOptimizer), (
+            "ZeRO join hook requires passing in a ZeroRedundancyOptimizer "
             "instance as the state"
+        )
         self.zero = zero
         super().__init__()
 
@@ -127,7 +127,7 @@ def main_hook(self):
         self.zero.step()
 
 
-class _DDPBucketAssignment():
+class _DDPBucketAssignment:
     r"""
     This represents a :class:`DistributedDataParallel` bucket assignment,
     meaning a (possibly non-strict) subset of the parameters corresponding to
@@ -146,6 +146,7 @@ class _DDPBucketAssignment():
         tensor (torch.Tensor): flattened tensor giving the data of the
             parameter subset assigned to the rank.
     """
+
     def __init__(
         self,
         bucket_index: int,
@@ -181,7 +182,7 @@ class _OverlapStatus(enum.IntEnum):
     INITIALIZED = 2
 
 
-class _OverlapInfo():
+class _OverlapInfo:
     r"""
     This contains the information needed by :class:`ZeroRedundancyOptimizer`
     to overlap with :class:`DistributedDataParallel`.
@@ -227,14 +228,14 @@ class _OverlapInfo():
         bucket_indices_seen (List[int]): :class:`list` of the bucket indices
             seen on this iteration.
     """
+
     def __init__(self, world_size) -> None:
         self.status: _OverlapStatus = _OverlapStatus.UNINITIALIZED
         self.shard_buckets: bool = False
 
         # Modified per bucket reconstruction
         self.params_per_bucket: List[List[torch.Tensor]] = []
-        self.params_per_rank: List[List[torch.Tensor]] = \
-            [[] for _ in range(world_size)]
+        self.params_per_rank: List[List[torch.Tensor]] = [[] for _ in range(world_size)]
         self.offsets: Dict[int, int] = {}
         # Group Ranks
         self.assigned_ranks_per_bucket: List[Set[int]] = []
@@ -255,8 +256,9 @@ def wait_for_broadcasts(self) -> None:
         filled. This clears ``self.broadcast_handles`` in preparation for the
         next iteration.
         """
-        assert len(self.broadcast_handles) == self.num_bucket_assignments, \
-            f"Missing at least one broadcast handle on rank {dist.get_rank()}"
+        assert (
+            len(self.broadcast_handles) == self.num_bucket_assignments
+        ), f"Missing at least one broadcast handle on rank {dist.get_rank()}"
         _ = list(map(lambda x: x.wait(), self.broadcast_handles))
         self.broadcast_handles.clear()
 
@@ -388,17 +390,25 @@ def __init__(
         self._param_to_index_cache: Dict[torch.Tensor, int] = {}
         self._partition_parameters_cache: List[List[Dict]] = []
         self._index_to_param_cache: List[torch.Tensor] = []
-        self._device_to_params_per_rank_cache: Dict[torch.device, List[List[torch.Tensor]]] = {}
-        self._bucket_assignments_per_rank_cache: List[Dict[int, _DDPBucketAssignment]] = []
+        self._device_to_params_per_rank_cache: Dict[
+            torch.device, List[List[torch.Tensor]]
+        ] = {}
+        self._bucket_assignments_per_rank_cache: List[
+            Dict[int, _DDPBucketAssignment]
+        ] = []
         self._is_trainable_mask = self._get_is_trainable_mask()
 
         # Default device for collective communication and buckets
         self._default_device = self._all_params[0].device
 
-        self.process_group = process_group if process_group is not None else dist.group.WORLD
+        self.process_group = (
+            process_group if process_group is not None else dist.group.WORLD
+        )
         self.world_size: int = dist.get_world_size(self.process_group)
         self.rank: int = dist.get_rank(self.process_group)
-        self.global_rank: int = dist.distributed_c10d.get_global_rank(self.process_group, self.rank)
+        self.global_rank: int = dist.distributed_c10d.get_global_rank(
+            self.process_group, self.rank
+        )
 
         self._overlap_with_ddp: bool = overlap_with_ddp
         self._optim_defaults = defaults
@@ -510,21 +520,29 @@ def consolidate_state_dict(self, to: int = 0) -> None:
         self._sync_param_groups(self.param_groups, self.optim.param_groups)
 
         # Pull the sharded state from all ranks and store them in rank order
-        empty_messenger = torch.tensor([0], dtype=torch.uint8, device=self._default_device)
+        empty_messenger = torch.tensor(
+            [0], dtype=torch.uint8, device=self._default_device
+        )
 
         # NOTE: We wastefully use `broadcast()` (e.g. instead of `gather()`)
         # due to compatibility issues with NCCL backend; a possible follow-up
         # is to move all sharded state management to RPC RRef
         self._all_state_dicts = []
         for rank in range(self.world_size):
-            global_rank = dist.distributed_c10d.get_global_rank(self.process_group, rank)
+            global_rank = dist.distributed_c10d.get_global_rank(
+                self.process_group, rank
+            )
             if self.rank == to:
                 # Consolidate all local `state_dict`s on this rank, storing on
                 # CPU to save GPU memory
                 if rank == self.rank:
                     # Directly append own optimizer state
                     self._all_state_dicts.append(
-                        _recursive_copy_to_device(self.optim.state_dict(), non_blocking=True, device=torch.device("cpu"),)
+                        _recursive_copy_to_device(
+                            self.optim.state_dict(),
+                            non_blocking=True,
+                            device=torch.device("cpu"),
+                        )
                     )
                 else:
                     # Receive the optimizer state from the source rank
@@ -535,7 +553,11 @@ def consolidate_state_dict(self, to: int = 0) -> None:
                         device=self._default_device,
                     )
                     self._all_state_dicts.append(
-                        _recursive_copy_to_device(local_state_dict, non_blocking=True, device=torch.device("cpu"))
+                        _recursive_copy_to_device(
+                            local_state_dict,
+                            non_blocking=True,
+                            device=torch.device("cpu"),
+                        )
                     )
             else:
                 if rank == self.rank:
@@ -590,9 +612,7 @@ def _verify_params_per_rank(
                     )
 
     def _partition_param_group(
-        self,
-        param_group: Dict[str, Any],
-        params_per_rank: List[List[torch.Tensor]]
+        self, param_group: Dict[str, Any], params_per_rank: List[List[torch.Tensor]]
     ) -> None:
         r"""
         Partitions the parameter group ``param_group`` according to
@@ -648,27 +668,33 @@ def _partition_parameters(
                 self._partition_parameters_cache = [[] for _ in range(self.world_size)]
                 sizes = [0] * self.world_size
                 for param_group in self.param_groups:
-                    param_group_params_per_rank: List[List] = [[] for _ in range(self.world_size)]
+                    param_group_params_per_rank: List[List] = [
+                        [] for _ in range(self.world_size)
+                    ]
                     # Sort the parameters by size (largest first)
-                    params_sorted = sorted(param_group["params"], key=lambda t: t.numel(), reverse=True)
+                    params_sorted = sorted(
+                        param_group["params"], key=lambda t: t.numel(), reverse=True
+                    )
                     for param in params_sorted:
                         # Greedily add the parameter to rank with smallest size so far
                         rank = self._get_min_index(sizes)
                         param_group_params_per_rank[rank].append(param)
                         sizes[rank] += param.numel()
                     # Apply the constructed partition of the parameter group
-                    self._partition_param_group(param_group, param_group_params_per_rank)
+                    self._partition_param_group(
+                        param_group, param_group_params_per_rank
+                    )
 
             return self._partition_parameters_cache
 
         # Partition the parameters according to `params_per_rank`
-        assert len(self._partition_parameters_cache) == 0, \
-            "Specifying `params_per_rank` should only be done when the " \
+        assert len(self._partition_parameters_cache) == 0, (
+            "Specifying `params_per_rank` should only be done when the "
             "parameters have not been partitioned yet"
+        )
         if len(self.param_groups) != 1:
             raise RuntimeError(
-                "Specifying `params_per_rank` only supports a single "
-                "parameter group"
+                "Specifying `params_per_rank` only supports a single " "parameter group"
             )
         self._verify_params_per_rank(params_per_rank)
         self._partition_parameters_cache = [[] for _ in range(self.world_size)]
@@ -703,7 +729,8 @@ def _param_to_index(self) -> Dict[torch.Tensor, int]:
         """
         if len(self._param_to_index_cache) == 0:
             self._param_to_index_cache = {
-                p: i for i, p in enumerate(chain(*(g["params"] for g in self.param_groups)))
+                p: i
+                for i, p in enumerate(chain(*(g["params"] for g in self.param_groups)))
             }
         return self._param_to_index_cache
 
@@ -714,7 +741,9 @@ def _index_to_param(self) -> List[torch.Tensor]:
         actual params.
         """
         if len(self._index_to_param_cache) == 0:
-            self._index_to_param_cache = list(chain(*(g["params"] for g in self.param_groups)))
+            self._index_to_param_cache = list(
+                chain(*(g["params"] for g in self.param_groups))
+            )
         return self._index_to_param_cache
 
     def _broadcast_params_from_rank(self, rank: int):
@@ -729,27 +758,40 @@ def _broadcast_params_from_rank(self, rank: int):
             A :class:`list` of async work handles for the ``broadcast()`` s
             performed to synchronize the parameters.
         """
-        assert not self._overlap_with_ddp, \
-            "`_broadcast_params_from_rank()` should not be used if " \
-            "`overlap_with_ddp=True`; instead, the broadcasting should " \
+        assert not self._overlap_with_ddp, (
+            "`_broadcast_params_from_rank()` should not be used if "
+            "`overlap_with_ddp=True`; instead, the broadcasting should "
             "happen in the DDP communication hook"
+        )
         handles = []
         if self.parameters_as_bucket_view:
             for dev_i_buckets in self._buckets:
                 bucket = dev_i_buckets[rank]
-                global_rank = dist.distributed_c10d.get_global_rank(self.process_group, rank)
+                global_rank = dist.distributed_c10d.get_global_rank(
+                    self.process_group, rank
+                )
                 handles.append(
-                    dist.broadcast(tensor=bucket, src=global_rank,
-                                   group=self.process_group, async_op=True)
+                    dist.broadcast(
+                        tensor=bucket,
+                        src=global_rank,
+                        group=self.process_group,
+                        async_op=True,
+                    )
                 )
         else:
             param_groups = self._partition_parameters()[rank]
-            global_rank = dist.distributed_c10d.get_global_rank(self.process_group, rank)
+            global_rank = dist.distributed_c10d.get_global_rank(
+                self.process_group, rank
+            )
             for param_group in param_groups:
                 for param in param_group["params"]:
                     handles.append(
-                        dist.broadcast(tensor=param.data, src=global_rank,
-                                       group=self.process_group, async_op=True)
+                        dist.broadcast(
+                            tensor=param.data,
+                            src=global_rank,
+                            group=self.process_group,
+                            async_op=True,
+                        )
                     )
         return handles
 
@@ -770,7 +812,7 @@ def _sync_params(self):
 
     @property
     def _device_to_params_per_rank(
-        self
+        self,
     ) -> Dict[torch.device, List[List[torch.Tensor]]]:
         r"""
         :class:`dict` mapping each device to a :class:`list` of the per-rank parameter
@@ -792,17 +834,22 @@ def _device_to_params_per_rank(
             ...
         ...
         """
-        assert self.parameters_as_bucket_view, \
-            "`_device_to_params_per_rank` should only be used if " \
+        assert self.parameters_as_bucket_view, (
+            "`_device_to_params_per_rank` should only be used if "
             "`parameters_as_bucket_view=True`"
+        )
         if len(self._device_to_params_per_rank_cache) == 0:
             for rank, param_groups in enumerate(self._partition_parameters()):
                 for param_group in param_groups:
                     for param in param_group["params"]:
                         device = param.device
                         if device not in self._device_to_params_per_rank_cache:
-                            self._device_to_params_per_rank_cache[device] = [[] for _ in range(self.world_size)]
-                        self._device_to_params_per_rank_cache[device][rank].append(param)
+                            self._device_to_params_per_rank_cache[device] = [
+                                [] for _ in range(self.world_size)
+                            ]
+                        self._device_to_params_per_rank_cache[device][rank].append(
+                            param
+                        )
         return self._device_to_params_per_rank_cache
 
     def _get_min_index(
@@ -857,14 +904,13 @@ def _assign_bucket_subset_to_rank(
         """
         overlap_info = self._overlap_info
         if len(bucket_params) == 0:
-            raise ValueError(
-                "Empty bucket assignment"
-            )
+            raise ValueError("Empty bucket assignment")
         params_per_rank = overlap_info.params_per_rank
         offsets = overlap_info.offsets
 
-        self._bucket_assignments_per_rank_cache[assigned_rank][bucket_index] = \
-            _DDPBucketAssignment(bucket_index, bucket_params, bucket_offset)
+        self._bucket_assignments_per_rank_cache[assigned_rank][
+            bucket_index
+        ] = _DDPBucketAssignment(bucket_index, bucket_params, bucket_offset)
         if self.global_rank == assigned_rank:
             offsets[bucket_index] = len(params_per_rank[assigned_rank])
         params_per_rank[assigned_rank].extend(bucket_params)
@@ -872,16 +918,15 @@ def _assign_bucket_subset_to_rank(
         self._overlap_info.num_bucket_assignments += 1
 
     @property
-    def _bucket_assignments_per_rank(
-        self
-    ) -> List[Dict[int, _DDPBucketAssignment]]:
+    def _bucket_assignments_per_rank(self) -> List[Dict[int, _DDPBucketAssignment]]:
         r"""
         :class:`list` of length world size consisting of :class:`dict` s
         mapping bucket indices to :class:`_DDPBucketAssignment` s for each
         rank.
         """
-        assert self._overlap_with_ddp, "`_bucket_assignments_per_rank` " \
-            "only be used if `overlap_with_ddp=True`"
+        assert self._overlap_with_ddp, (
+            "`_bucket_assignments_per_rank` " "only be used if `overlap_with_ddp=True`"
+        )
         if len(self._bucket_assignments_per_rank_cache) > 0:
             return self._bucket_assignments_per_rank_cache
 
@@ -893,8 +938,7 @@ def _bucket_assignments_per_rank(
 
         if overlap_info.shard_buckets:
             # Define the assignment threshold to approximate uniformity
-            assert overlap_info.total_size is not None, \
-                "`total_size` was not computed"
+            assert overlap_info.total_size is not None, "`total_size` was not computed"
             threshold = overlap_info.total_size / self.world_size  # type: ignore[operator]
             size_per_rank = [0 for _ in range(self.world_size)]
 
@@ -922,8 +966,7 @@ def _bucket_assignments_per_rank(
             # `_DDPBucketAssignment` instance and only contains parameters from
             # a single DDP bucket
             params_per_bucket_enum = sorted(
-                enumerate(params_per_bucket),
-                key=lambda x: sum(p.numel() for p in x[1])
+                enumerate(params_per_bucket), key=lambda x: sum(p.numel() for p in x[1])
             )
             for bucket_index, bucket_params in params_per_bucket_enum:
                 assert len(bucket_params) > 0, "Empty bucket"
@@ -931,8 +974,13 @@ def _bucket_assignments_per_rank(
                 assignment_size = 0
                 for param_index, param in enumerate(bucket_params):
                     param_numel = param.numel()
-                    if assignment_size + param_numel >= threshold and param_index > bucket_offset:
-                        assigned_rank = self._get_min_index(size_per_rank, assigned_ranks_per_bucket[bucket_index])
+                    if (
+                        assignment_size + param_numel >= threshold
+                        and param_index > bucket_offset
+                    ):
+                        assigned_rank = self._get_min_index(
+                            size_per_rank, assigned_ranks_per_bucket[bucket_index]
+                        )
                         # Include up to but not including the parameter that
                         # exceeded the threshold
                         self._assign_bucket_subset_to_rank(
@@ -948,7 +996,9 @@ def _bucket_assignments_per_rank(
                     assignment_size += param_numel
                 # Assign the remainder of the bucket so that no assignment
                 # spans across two buckets
-                assigned_rank = self._get_min_index(size_per_rank, assigned_ranks_per_bucket[bucket_index])
+                assigned_rank = self._get_min_index(
+                    size_per_rank, assigned_ranks_per_bucket[bucket_index]
+                )
                 self._assign_bucket_subset_to_rank(
                     bucket_index,
                     bucket_params[bucket_offset:],
@@ -1014,13 +1064,19 @@ def _local_step(
 
         # Run the optimizer step on this shard only
         if gradients is None:
-            loss = self.optim.step(**kwargs) if closure is None \
+            loss = (
+                self.optim.step(**kwargs)
+                if closure is None
                 else self.optim.step(closure=closure, **kwargs)
+            )
         else:
-            assert self._overlap_with_ddp, "Specifying `gradients` should not " \
+            assert self._overlap_with_ddp, (
+                "Specifying `gradients` should not "
                 "be used when `overlap_with_ddp=False`"
-            assert closure is None, "`closure` is not supported when using " \
-                "a local functional optimizer"
+            )
+            assert closure is None, (
+                "`closure` is not supported when using " "a local functional optimizer"
+            )
             loss = self.optim.step(gradients=gradients)
 
         # Sync any updated attributes in the local optimizer to the exposed
@@ -1111,7 +1167,9 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
                 state_dict["state"][index] = None
             else:
                 # Load the parameter state to the local optimizer
-                self.optim.state[param] = _recursive_copy_to_device(value, non_blocking=True, device=param.device)
+                self.optim.state[param] = _recursive_copy_to_device(
+                    value, non_blocking=True, device=param.device
+                )
                 # Force zero-dimensional tensors (like Adam "step") on CPU
                 for state_name, state_value in self.optim.state[param].items():
                     if torch.is_tensor(state_value) and state_value.dim() == 0:
@@ -1158,22 +1216,30 @@ def state_dict(self) -> Dict[str, Any]:
         for rank, local_state_dict in enumerate(self._all_state_dicts):
             local_param_groups = local_state_dict["param_groups"]
             global_param_groups = self._partition_parameters()[rank]
-            assert len(local_param_groups) == len(global_param_groups), \
-                "Mismatch between number of local and global parameter groups"
+            assert len(local_param_groups) == len(
+                global_param_groups
+            ), "Mismatch between number of local and global parameter groups"
 
-            for local_param_group, global_param_group in zip(local_param_groups, global_param_groups):
+            for local_param_group, global_param_group in zip(
+                local_param_groups, global_param_groups
+            ):
                 # `local_param_group` stores local indices, while
                 # `global_param_group` stores the tensors directly
                 local_param_indices = local_param_group["params"]
                 global_params = global_param_group["params"]
 
-                assert len(local_param_indices) == len(global_params), \
-                    "Mismatch between number of local and global parameters in parameter group"
-                for local_param_index, global_param in zip(local_param_indices, global_params):
+                assert len(local_param_indices) == len(
+                    global_params
+                ), "Mismatch between number of local and global parameters in parameter group"
+                for local_param_index, global_param in zip(
+                    local_param_indices, global_params
+                ):
                     # Update the global parameter state, if any
                     if local_param_index in local_state_dict["state"]:
                         global_param_index = self._param_to_index[global_param]
-                        state_dict["state"][global_param_index] = local_state_dict["state"][local_param_index]
+                        state_dict["state"][global_param_index] = local_state_dict[
+                            "state"
+                        ][local_param_index]
 
         # Sort the parameters in the state
         state_dict["state"] = dict(sorted(state_dict["state"].items()))
@@ -1198,8 +1264,9 @@ def _sync_param_groups(
             dst_param_groups (list[dict]): parameter groups giving the
                 attribute settings to set.
         """
-        assert len(src_param_groups) == len(dst_param_groups), \
-            "Mismatch between number of source and destination parameter groups"
+        assert len(src_param_groups) == len(
+            dst_param_groups
+        ), "Mismatch between number of source and destination parameter groups"
         for src_param_group, dst_param_group in zip(src_param_groups, dst_param_groups):
             # Sync all attributes except the parameters
             for attr in filter(lambda x: x != "params", src_param_group.keys()):
@@ -1234,7 +1301,9 @@ def _build_param_buckets(self) -> None:
         num_devices = len(self._device_to_params_per_rank)
         self._buckets = [[] for _ in range(num_devices)]  # type: ignore[assignment]
 
-        for dev_i, (device, params_per_rank) in enumerate(self._device_to_params_per_rank.items()):
+        for dev_i, (device, params_per_rank) in enumerate(
+            self._device_to_params_per_rank.items()
+        ):
             for params in params_per_rank:
                 bucket_size = 0
                 dtype = None
@@ -1281,15 +1350,19 @@ def _build_ddp_param_buckets(self) -> None:
                 bucket_size = 0
                 dtype = None
                 for param in params:
-                    assert _is_trainable(param), "Model parameter " \
-                        "corresponding to a gradient in a DDP bucket should " \
+                    assert _is_trainable(param), (
+                        "Model parameter "
+                        "corresponding to a gradient in a DDP bucket should "
                         "require a gradient"
+                    )
                     bucket_size += param.numel()
                     dtype = param.dtype  # assumes all same dtype
                 assert bucket_size > 0, "Empty bucket"
 
                 # Construct the bucket tensor (assuming all dense and same dtype)
-                tensor = torch.empty(bucket_size, dtype=dtype, device=bucket_assignment.device)
+                tensor = torch.empty(
+                    bucket_size, dtype=dtype, device=bucket_assignment.device
+                )
                 offset = 0
                 for param in params:
                     offset_next = offset + param.numel()
@@ -1299,7 +1372,8 @@ def _build_ddp_param_buckets(self) -> None:
                 bucket_assignment.tensor = tensor
 
     def _verify_and_init_params(
-        self, params: Any,
+        self,
+        params: Any,
     ) -> Union[List[torch.Tensor], List[dict]]:
         r"""
         Verifies the type of ``params`` and initializes ``self._all_params``
@@ -1319,24 +1393,28 @@ def _verify_and_init_params(
             :class:`list` to ensure that it can be iterated over again.
         """
         if isinstance(params, torch.Tensor):
-            raise TypeError("`params` argument should be an iterable of "
-                            f"Tensors, but got {torch.typename(params)}")
+            raise TypeError(
+                "`params` argument should be an iterable of "
+                f"Tensors, but got {torch.typename(params)}"
+            )
         try:
             all_params = list(params)
         except TypeError as e:
-            raise TypeError("`params` argument should be an iterable of Tensors"
-                            f" or dicts, but got {torch.typename(params)}") from e
+            raise TypeError(
+                "`params` argument should be an iterable of Tensors"
+                f" or dicts, but got {torch.typename(params)}"
+            ) from e
         if len(all_params) == 0:
-            raise ValueError("ZeroRedundancyOptimizer got an empty parameter "
-                             "list")
+            raise ValueError("ZeroRedundancyOptimizer got an empty parameter " "list")
         all_tensors = True
         all_dicts = True
         for param in all_params:
             all_tensors &= isinstance(param, torch.Tensor)
             all_dicts &= isinstance(param, dict)
         if not all_tensors and not all_dicts:
-            raise TypeError("`params` argument should be an iterable of "
-                            "Tensors or dicts")
+            raise TypeError(
+                "`params` argument should be an iterable of " "Tensors or dicts"
+            )
         # Ensure that `self._all_params` contains a list of all parameters
         if all_tensors:
             self._all_params = all_params
@@ -1369,16 +1447,20 @@ def _verify_same_dense_param_type(self) -> None:
         """
         typename = torch.typename(self._all_params[0])
         if self._all_params[0].is_sparse:
-            raise ValueError("ZeroRedundancyOptimizer only supports using "
-                             "the same dense type for all parameters but got "
-                             f"{typename}")
+            raise ValueError(
+                "ZeroRedundancyOptimizer only supports using "
+                "the same dense type for all parameters but got "
+                f"{typename}"
+            )
         for param in self._all_params[1:]:
             other_typename = torch.typename(param)
             if other_typename != typename:
-                raise ValueError("ZeroRedundancyOptimizer only supports "
-                                 "using the same dense type for all "
-                                 f"parameters but got both {typename} and "
-                                 f"{other_typename}")
+                raise ValueError(
+                    "ZeroRedundancyOptimizer only supports "
+                    "using the same dense type for all "
+                    f"parameters but got both {typename} and "
+                    f"{other_typename}"
+                )
 
     def _get_is_trainable_mask(self) -> List[bool]:
         r"""
@@ -1394,20 +1476,28 @@ def _init_local_optimizer(self) -> None:
 
         The local optimizer is saved in ``self.optim``.
         """
-        assert self._optim_constructor is not None, \
-            "The local optimizer class has not been set"
+        assert (
+            self._optim_constructor is not None
+        ), "The local optimizer class has not been set"
 
         param_groups = self._partition_parameters()[self.rank]
         # `overlap_with_ddp=True` requires a local functional optimizer
         if self._overlap_with_ddp:
             # Functional optimizers only support a single parameter group and
             # require passing in the parameters as a list
-            assert len(param_groups) == 1, "Initializing the local " \
+            assert len(param_groups) == 1, (
+                "Initializing the local "
                 "functional optimizer with more than one parameter group"
+            )
             params = param_groups[0]["params"]
             # Try to pass `_allow_empty_param_list=True` to avoid erroring
-            if "_allow_empty_param_list" in inspect.signature(self._optim_constructor).parameters:
-                self.optim: Any = self._optim_constructor(params, **self._optim_defaults, _allow_empty_param_list=True)
+            if (
+                "_allow_empty_param_list"
+                in inspect.signature(self._optim_constructor).parameters
+            ):
+                self.optim: Any = self._optim_constructor(
+                    params, **self._optim_defaults, _allow_empty_param_list=True
+                )
             else:
                 logger.warning(
                     f"{self._optim_constructor} does not support the argument "
@@ -1419,7 +1509,9 @@ def _init_local_optimizer(self) -> None:
             # Log information about the DDP and ZeRO bucketing
             if dist.get_debug_level() != dist.DebugLevel.OFF:
                 local_numel = sum(p.numel() for p in params)
-                num_assigned_buckets = len(self._bucket_assignments_per_rank[self.global_rank])
+                num_assigned_buckets = len(
+                    self._bucket_assignments_per_rank[self.global_rank]
+                )
                 logger.info(
                     f"rank {self.global_rank} with {local_numel} parameters "
                     f"across {num_assigned_buckets} buckets"
@@ -1440,9 +1532,10 @@ def _init_local_optimizer(self) -> None:
         # optimizer; remove this if/when the functional optimizers support
         # multiple parameter groups
         if self._overlap_with_ddp and not hasattr(self.optim, "param_groups"):
-            assert hasattr(self.optim, "param_group"), \
-                "The functional optimizer should set at least one of the " \
+            assert hasattr(self.optim, "param_group"), (
+                "The functional optimizer should set at least one of the "
                 "attributes `param_group` or `param_groups`"
+            )
             self.optim.param_groups = [self.optim.param_group]  # type: ignore[attr-defined]
 
         self._sync_param_groups(self.optim.param_groups, self.param_groups)
@@ -1452,9 +1545,10 @@ def _init_zero_for_overlap(self) -> None:
         Performs a delayed initialization of the local optimizer and the
         supporting data structures.
         """
-        assert self._overlap_with_ddp, \
-            "`_init_zero_for_overlap()` should only be called when " \
+        assert self._overlap_with_ddp, (
+            "`_init_zero_for_overlap()` should only be called when "
             "`overlap_with_ddp=True`"
+        )
         self._overlap_info.status = _OverlapStatus.INITIALIZED
         self._clear_cache()
         self._partition_parameters(self._overlap_info.params_per_rank)
@@ -1470,10 +1564,11 @@ def _get_assigned_rank(self, bucket_index: int) -> int:
             bucket_index (int): index of the :class:`DistributedDataParallel`
                 bucket for which to get the assigned rank.
         """
-        assert not self._overlap_info.shard_buckets, \
-            "The bucket assignment requires global bucket information and " \
-            "will be computed later; there should be no need to use this " \
+        assert not self._overlap_info.shard_buckets, (
+            "The bucket assignment requires global bucket information and "
+            "will be computed later; there should be no need to use this "
             "method"
+        )
         return bucket_index % self.world_size
 
     def _check_overlap_initialized(self):
@@ -1487,8 +1582,10 @@ def _check_overlap_initialized(self):
             RuntimeError: if ``overlap_with_ddp=True`` and
                 :meth:`_init_zero_for_overlap` has not been called.
         """
-        if self._overlap_with_ddp \
-                and self._overlap_info.status != _OverlapStatus.INITIALIZED:
+        if (
+            self._overlap_with_ddp
+            and self._overlap_info.status != _OverlapStatus.INITIALIZED
+        ):
             raise RuntimeError(
                 "This method should not be called until this "
                 "ZeroRedundancyOptimizer instance has been fully "
diff --git a/torch/distributed/optim/zero_redundancy_optimizer.pyi b/torch/distributed/optim/zero_redundancy_optimizer.pyi
index d0f38240ff7e7..077ada8ece637 100644
--- a/torch/distributed/optim/zero_redundancy_optimizer.pyi
+++ b/torch/distributed/optim/zero_redundancy_optimizer.pyi
@@ -1,13 +1,5 @@
 import enum
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    List,
-    Optional,
-    Set,
-    Type,
-)
+from typing import Any, Callable, Dict, List, Optional, Set, Type
 
 import torch
 from torch.distributed.algorithms.join import Joinable, JoinHook
@@ -18,7 +10,7 @@ class _ZeROJoinHook(JoinHook):
     def __init__(self, zero: Any) -> None: ...
     def main_hook(self) -> None: ...
 
-class _DDPBucketAssignment():
+class _DDPBucketAssignment:
     bucket_index: int
     parameters: List[torch.Tensor]
     offset: int
@@ -60,13 +52,28 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable):
     _overlap_info: _OverlapInfo = ...
     _buckets: List[List[torch.Tensor]] = ...
     _bucket_assignments_per_rank: List[Dict[int, _DDPBucketAssignment]] = ...
-    def __init__(self, params: Any, optimizer_class: Type[Optimizer], process_group: Optional[Any]=..., parameters_as_bucket_view: bool=..., overlap_with_ddp: bool=..., **defaults: Any) -> None: ...
+    def __init__(
+        self,
+        params: Any,
+        optimizer_class: Type[Optimizer],
+        process_group: Optional[Any] = ...,
+        parameters_as_bucket_view: bool = ...,
+        overlap_with_ddp: bool = ...,
+        **defaults: Any,
+    ) -> None: ...
     def add_param_group(self, param_group: dict) -> None: ...
-    def consolidate_state_dict(self, to: int=...) -> None: ...
-    def step(self, closure: Optional[Callable[[], float]]=..., **kwargs: Any) -> Optional[float]: ...
+    def consolidate_state_dict(self, to: int = ...) -> None: ...
+    def step(
+        self, closure: Optional[Callable[[], float]] = ..., **kwargs: Any
+    ) -> Optional[float]: ...
     def load_state_dict(self, state_dict: Dict[str, Any]) -> None: ...
     def state_dict(self) -> Dict[str, Any]: ...
-    def _local_step(self, gradients: Optional[List[Optional[torch.Tensor]]] = None, closure: Optional[Callable[[], float]] = None, **kwargs: Any,) -> Optional[float]: ...
+    def _local_step(
+        self,
+        gradients: Optional[List[Optional[torch.Tensor]]] = None,
+        closure: Optional[Callable[[], float]] = None,
+        **kwargs: Any,
+    ) -> Optional[float]: ...
     def _get_assigned_rank(self, bucket_index: int) -> int: ...
     def _init_zero_for_overlap(self) -> None: ...
     def join_hook(self, **kwargs): ...

From a112f79001d8f131934cade822f8ef3d87f2a194 Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Thu, 8 Dec 2022 06:56:20 +0000
Subject: [PATCH 1714/1922] [LTC] Tweak LazyGraphExecutor for XLA (#90420)

Summary:
This patch moves some of the data structures from private to protected such that XLAGraphExecutor can reuse them.

Test Plan:
CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90420
Approved by: https://github.com/JackCaoG
---
 torch/csrc/lazy/core/lazy_graph_executor.cpp | 4 +---
 torch/csrc/lazy/core/lazy_graph_executor.h   | 6 +++++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp
index 787a39ca02f89..573a057c4ddfb 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.cpp
+++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp
@@ -931,10 +931,8 @@ std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::
       // even in case the caller does not wait, and that is accomplished by
       // setting the unlockers status. In that case the exception will be
       // surfaced when the user tries to acquire the device locks the next time.
-      // std::exception_ptr exptr = std::current_exception();
       for (auto& unlocker : async->unlocker) {
-        std::exception_ptr exptr = std::current_exception();
-        unlocker.SetStatus(std::move(exptr));
+        unlocker.SetStatus(std::current_exception());
       }
       throw;
     }
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.h b/torch/csrc/lazy/core/lazy_graph_executor.h
index 16d7e15b7ab25..ec15ae5fe22e5 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.h
+++ b/torch/csrc/lazy/core/lazy_graph_executor.h
@@ -131,7 +131,10 @@ class TORCH_API LazyGraphExecutor {
 
   hash_t GetGraphHash(const std::vector<LazyTensorPtr>& tensors);
 
- private:
+ protected:
+  // TODO(alanwaketan): Revisit if all of them need to be accessible to
+  // derived classes.
+
   struct SyncTensorsConfig {
     // Whether we want to force data on the target tensors (hence trimming
     // the IR graph above them).
@@ -158,6 +161,7 @@ class TORCH_API LazyGraphExecutor {
     std::vector<size_t> parameter_sequence;
   };
 
+ private:
   struct CompilationResult {
     BackendDevice device;
     size_t emitted_nodes = 0;

From 454459d8d3e8cc23d1c76a23d7085ba73a4a8f69 Mon Sep 17 00:00:00 2001
From: Jesse Cai <jcjessecai@gmail.com>
Date: Wed, 7 Dec 2022 15:09:15 -0800
Subject: [PATCH 1715/1922] [pruning][core][feature] Implement prune for
 structured pruning (#89777)

Summary:

This PR implements `prune` in BaseStructuredSparsifier:

`prune` is a function that takes in a model with structured sparsity parametritizations (the result of `prepare`) and will return a resized model with the masked out weights removed.

`prune` is defined by a mapping from **patterns** to different **pruning functions**.
	- **patterns** are just sequences of operations, for example `(nn.Linear, activation, nn.Linear)`
	- **pruning functions** are functions that take in an matched pattern as args and will resize the appropriate layer sizes and weights.
	  ```
	  def prune_linear_activation_linear(linear1, activation, linear2):
		pass
	  ```
	- This is one line in the pattern config `(nn.Linear, activation, nn.Linear): prune_linear_activation_linear`

At a high level `prune` works by finding instances of the graph that match different patterns and then calling the mapped pruning functions on those matched patterns.
This is unlike the previous code which attempted to do both at the same time.

There may be some gaps in the patterns compared to the previous implementation, but the conversion functionality support should be the same.

Currently we have pruning functions for the following patterns:
    - linear -> linear
    - linear -> activation -> linear
    - conv2d -> conv2d
    - conv2d -> activation -> conv2d
    - conv2d -> activation -> pool -> conv2d
    - conv2d -> pool -> activation -> conv2d
    - conv2d -> adaptive pool -> flatten -> linear

Added in MyPy type hints as well for the prune_functions.

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89777
Approved by: https://github.com/vkuzo
---
 .../ao/sparsity/test_structured_sparsifier.py | 764 ++++++++++++------
 .../pruning/_experimental/pruner/__init__.py  |   6 -
 .../pruner/base_structured_sparsifier.py      | 252 +++++-
 .../_experimental/pruner/match_utils.py       |  59 ++
 .../_experimental/pruner/parametrization.py   |   1 -
 .../_experimental/pruner/prune_functions.py   | 359 ++++++++
 torch/testing/_internal/common_pruning.py     | 311 +++++++
 7 files changed, 1491 insertions(+), 261 deletions(-)
 create mode 100644 torch/ao/pruning/_experimental/pruner/match_utils.py
 create mode 100644 torch/ao/pruning/_experimental/pruner/prune_functions.py
 create mode 100644 torch/testing/_internal/common_pruning.py

diff --git a/test/ao/sparsity/test_structured_sparsifier.py b/test/ao/sparsity/test_structured_sparsifier.py
index 1b504c9731d2b..19e5a03640d00 100644
--- a/test/ao/sparsity/test_structured_sparsifier.py
+++ b/test/ao/sparsity/test_structured_sparsifier.py
@@ -4,167 +4,51 @@
 
 import copy
 import logging
+import random
 
 import torch
-from torch import nn
-from torch.ao.pruning._experimental.pruner import BaseStructuredSparsifier, FakeStructuredSparsity
+from torch.ao.pruning._experimental.pruner import (
+    BaseStructuredSparsifier,
+    FakeStructuredSparsity,
+)
 from torch.nn.utils import parametrize
 
 from torch.testing._internal.common_utils import TestCase, skipIfTorchDynamo
-
-logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
+from torch.testing._internal.common_pruning import (
+    SimpleLinear,
+    LinearBias,
+    LinearActivation,
+    LinearActivationFunctional,
+    SimpleConv2d,
+    Conv2dBias,
+    Conv2dActivation,
+    Conv2dPadBias,
+    Conv2dPool,
+    Conv2dPoolFlatten,
+    Conv2dPoolFlattenFunctional,
+)
+
+logging.basicConfig(
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
+)
 
 DEVICES = {
     torch.device("cpu"),
-    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
 }
 
 
-class Linear(nn.Module):
-    r"""Model with Linear layers, in Sequential and outside, without biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(16, 16, bias=False)
-        )
-        self.linear = nn.Linear(16, 16, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class LinearB(nn.Module):
-    r"""Model with Linear layers, in Sequential and outside, with biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(16, 16, bias=True)
-        )
-        self.linear = nn.Linear(16, 16, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class MultipleLinear(nn.Module):
-    r"""Model with multiple Linear layers, in Sequential and outside, without biases
-    and with activation functions"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=False),
-            nn.ReLU(),
-            nn.Linear(5, 8, bias=False),
-            nn.ReLU(),
-            nn.Linear(8, 6, bias=False)
-        )
-        self.linear = nn.Linear(6, 4, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class MultipleLinearB(nn.Module):
-    r"""Model with multiple Linear layers, in Sequential and outside, with biases
-    and with activation functions"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=True),
-            nn.ReLU(),
-            nn.Linear(5, 8, bias=True),
-            nn.ReLU(),
-            nn.Linear(8, 6, bias=True)
-        )
-        self.linear = nn.Linear(6, 4, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class MultipleLinearMixed(nn.Module):
-    r"""Model with multiple Linear layers, in Sequential and outside, some with biases
-    and with activation functions"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Linear(7, 5, bias=True),
-            nn.ReLU(),
-            nn.Linear(5, 8, bias=False),
-            nn.ReLU(),
-            nn.Linear(8, 6, bias=True)
-        )
-        self.linear = nn.Linear(6, 4, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.linear(x)
-        return x
-
-
-class Conv2dA(nn.Module):
-    r"""Model with Conv2d layers, in Sequential and outside, without biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=False),
-        )
-        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d(x)
-        return x
-
-
-class Conv2dB(nn.Module):
-    r"""Model with Conv2d layers, in Sequential and outside, with biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=True),
-        )
-        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=True)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d(x)
-        return x
-
-
-class Conv2dC(nn.Module):
-    r"""Model with Conv2d layers, in Sequential and outside, with and without biases"""
-    def __init__(self):
-        super().__init__()
-        self.seq = nn.Sequential(
-            nn.Conv2d(1, 32, 3, 1, bias=True),
-        )
-        self.conv2d = nn.Conv2d(32, 64, 3, 1, bias=False)
-
-    def forward(self, x):
-        x = self.seq(x)
-        x = self.conv2d(x)
-        return x
-
-
-
 class SimplePruner(BaseStructuredSparsifier):
     def update_mask(self, module, tensor_name, **kwargs):
         getattr(module.parametrizations, tensor_name)[0].mask[1] = False
 
 
-class MultiplePruner(BaseStructuredSparsifier):
+class ImplementedPruner(BaseStructuredSparsifier):
     def update_mask(self, module, tensor_name, **kwargs):
-        getattr(module.parametrizations, tensor_name)[0].mask[1] = False
-        getattr(module.parametrizations, tensor_name)[0].mask[2] = False
+        """Prunes 1/3 of the weight output channels, so resulting module has 33.3% pruning"""
+        num_rows = len(module.parametrizations[tensor_name][0].mask)
+        prune = random.sample(list(range(num_rows)), num_rows // 3)
+        module.parametrizations[tensor_name][0].mask[prune] = False
 
 
 class TestBaseStructuredSparsifier(TestCase):
@@ -180,27 +64,14 @@ def _check_pruner_prepared(self, model, pruner, device):
             # Assume that this is the 1st/only parametrization
             assert type(module.parametrizations.weight[0]) == FakeStructuredSparsity
 
-    def _check_pruner_mask_squashed(self, model, pruner, device):
-        for config in pruner.groups:
-            modules = []
-            if type(config['module']) is tuple:
-                for module in config['module']:
-                    modules.append(module)
-            else:
-                module = config['module']
-                modules.append(module)
-            for module in modules:
-                assert module.weight.device.type == device.type
-                assert not hasattr(module, "parametrizations")
-
     def _check_pruner_valid_before_step(self, model, pruner, device):
         for config in pruner.groups:
             modules = []
-            if type(config['module']) is tuple:
-                for module in config['module']:
+            if type(config["module"]) is tuple:
+                for module in config["module"]:
                     modules.append(module)
             else:
-                module = config['module']
+                module = config["module"]
                 modules.append(module)
             for module in modules:
                 assert module.weight.device.type == device.type
@@ -209,145 +80,562 @@ def _check_pruner_valid_before_step(self, model, pruner, device):
     def _check_pruner_valid_after_step(self, model, pruner, mask, device):
         for config in pruner.groups:
             modules = []
-            if type(config['module']) is tuple:
-                for module in config['module']:
+            if type(config["module"]) is tuple:
+                for module in config["module"]:
                     modules.append(module)
             else:
-                module = config['module']
+                module = config["module"]
                 modules.append(module)
             for module in modules:
                 assert module.weight.device.type == device.type
                 total = module.parametrizations.weight[0].mask.numel()
-                assert module.parametrizations.weight[0].mask.count_nonzero() == total - mask
+                assert (
+                    module.parametrizations.weight[0].mask.count_nonzero()
+                    == total - mask
+                )
 
     def _test_constructor_on_device(self, model, device):
-        self.assertRaisesRegex(TypeError, 'BaseStructuredSparsifier.* update_mask',
-                               BaseStructuredSparsifier)
+        self.assertRaisesRegex(
+            TypeError,
+            "BaseStructuredSparsifier.* update_mask",
+            BaseStructuredSparsifier,
+        )
         model1 = copy.deepcopy(model).to(device)
         pruner = SimplePruner(None)
         pruner.prepare(model1, None)
+        pruner.enable_mask_update = True
         for g in pruner.groups:
-            module = g['module']
+            module = g["module"]
             assert module.weight.device.type == device.type
-        assert len(pruner.groups) == 2
+        assert len(pruner.groups) == 5
         pruner.step()
         # Can instantiate the model with configs
         model2 = copy.deepcopy(model).to(device)
-        pruner = SimplePruner({'test': 3})
-        pruner.prepare(model2, [{"tensor_fqn": "linear.weight"}])
+        pruner = SimplePruner({"test": 3})
+        pruner.prepare(model2, [{"tensor_fqn": "seq.0.weight"}])
         assert len(pruner.groups) == 1
-        assert pruner.groups[0]['module_fqn'] == 'linear'
-        assert 'test' in pruner.groups[0]
-        assert pruner.groups[0]['test'] == 3
+        assert pruner.groups[0]["module_fqn"] == "seq.0"
+        assert "test" in pruner.groups[0]
+        assert pruner.groups[0]["test"] == 3
 
     def test_constructor(self):
-        model = Linear()
+        model = SimpleLinear()
         for device in DEVICES:
             self._test_constructor_on_device(model, torch.device(device))
 
     def _test_prepare_linear_on_device(self, model, device):
         model = copy.deepcopy(model).to(device)
-        x = torch.ones(128, 16, device=device)
+        x = torch.ones(128, 7, device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, None)
         self._check_pruner_prepared(model, pruner, device)
-        assert model(x).shape == (128, 16)
+        assert model(x).shape == (128, 10)
 
     def test_prepare_linear(self):
-        models = [Linear(), LinearB()]  # without and with bias
+        models = [
+            SimpleLinear(),
+            LinearBias(),
+            LinearActivation(),
+            LinearActivationFunctional(),
+        ]  # without and with bias
         for device in DEVICES:
             for model in models:
                 self._test_prepare_linear_on_device(model, torch.device(device))
 
-    def _test_prepare_conv2d_on_device(self, model, config, device):
+    def _test_prepare_conv2d_on_device(self, model, expected_shape, config, device):
         x = torch.ones((1, 1, 28, 28), device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, config)
         self._check_pruner_prepared(model, pruner, device)
-        assert model(x).shape == (1, 64, 24, 24)
+        assert model(x).shape == expected_shape
 
     def test_prepare_conv2d(self):
-
-        models = [Conv2dA(), Conv2dB(), Conv2dC()]
-        configs = [None, None, None]
+        models = [
+            SimpleConv2d(),
+            Conv2dBias(),
+            Conv2dActivation(),
+            Conv2dPadBias(),
+            Conv2dPool(),
+        ]
+        shapes = [
+            (1, 52, 20, 20),
+            (1, 52, 18, 18),
+            (1, 52, 18, 18),
+            (1, 52, 24, 24),
+            (1, 52, 3, 3),
+        ]
+        configs = [None, None, None, None, None]
         for device in DEVICES:
-            for model, config in zip(models, configs):
+            for model, shape, config in zip(models, shapes, configs):
                 model = model.to(device)
-                self._test_prepare_conv2d_on_device(model, config, torch.device(device))
+                self._test_prepare_conv2d_on_device(
+                    model, shape, config, torch.device(device)
+                )
 
-    def _test_squash_mask_linear_on_device(self, model, device):
-        model = copy.deepcopy(model).to(device)
-        x = torch.ones(128, 16, device=device)
+    def _test_step_linear_on_device(self, model, device):
+        model = model.to(device)
+        x = torch.ones(7, 7, device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, None)
-        pruner.squash_mask()
-        self._check_pruner_mask_squashed(model, pruner, device)
-        assert model(x).shape == (128, 16)
+        pruner.enable_mask_update = True
+        self._check_pruner_valid_before_step(model, pruner, device)
+        pruner.step()
+        self._check_pruner_valid_after_step(model, pruner, 1, device)
 
-    def test_squash_mask_linear(self):
-        models = [Linear(), LinearB()]  # without and with bias
+    def test_step_linear(self):
+        models = [
+            SimpleLinear(),
+            LinearBias(),
+            LinearActivation(),
+            LinearActivationFunctional(),
+        ]
         for device in DEVICES:
             for model in models:
-                self._test_squash_mask_linear_on_device(model, torch.device(device))
+                self._test_step_linear_on_device(model, torch.device(device))
 
-    def _test_squash_mask_conv2d_on_device(self, model, config, device):
-        model = copy.deepcopy(model).to(device)
+    def _test_step_conv2d_on_device(self, model, expected_shape, config, device):
+        model = model.to(device)
         x = torch.ones((1, 1, 28, 28), device=device)
         pruner = SimplePruner(None)
         pruner.prepare(model, config)
-        pruner.squash_mask()
-        self._check_pruner_mask_squashed(model, pruner, device)
-        assert model(x).shape == (1, 64, 24, 24)
-
-    def test_squash_mask_conv2d(self):
+        pruner.enable_mask_update = True
+        self._check_pruner_valid_before_step(model, pruner, device)
+        pruner.step()
+        self._check_pruner_valid_after_step(model, pruner, 1, device)
+        assert model(x).shape == expected_shape
 
-        models = [Conv2dA(), Conv2dB(), Conv2dC()]
-        configs = [None, None, None]
+    @skipIfTorchDynamo("TorchDynamo fails with unknown reason")
+    def test_step_conv2d(self):
+        models = [
+            SimpleConv2d(),
+            Conv2dBias(),
+            Conv2dActivation(),
+            Conv2dPadBias(),
+            Conv2dPool(),
+        ]
+        shapes = [
+            (1, 52, 20, 20),
+            (1, 52, 18, 18),
+            (1, 52, 18, 18),
+            (1, 52, 24, 24),
+            (1, 52, 3, 3),
+        ]
+        configs = [None, None, None, None, None]
         for device in DEVICES:
-            for model, config in zip(models, configs):
-                model = model.to(device)
-                self._test_squash_mask_conv2d_on_device(model, config, torch.device(device))
+            for model, shape, config in zip(models, shapes, configs):
+                self._test_step_conv2d_on_device(
+                    model, shape, config, torch.device(device)
+                )
+
+    def _check_pruner_pruned(self, model, pruner, device):
+        for config in pruner.groups:
+            module = config["module"]
+            assert not hasattr(module, "parametrizations")
+            assert not hasattr(module, "mask")
 
-    def _test_step_linear_on_device(self, model, is_basic, device):
+    def _test_linear_on_device(
+        self, model, config, expected_shape, device, also_prune_bias
+    ):
         model = model.to(device)
-        if is_basic:
-            x = torch.ones(16, 16, device=device)
-            pruner = SimplePruner(None)
-            pruner.prepare(model, None)
-            self._check_pruner_valid_before_step(model, pruner, device)
-            pruner.step()
-            self._check_pruner_valid_after_step(model, pruner, 1, device)
-        else:
-            x = torch.ones(7, 7, device=device)
-            pruner = MultiplePruner(None)
-            pruner.prepare(model, None)
-            self._check_pruner_valid_before_step(model, pruner, device)
-            pruner.step()
-            self._check_pruner_valid_after_step(model, pruner, 2, device)
+        model.eval()
+        num_original_params = sum(p.numel() for p in model.parameters())
+        x = torch.ones(128, 7, device=device)
 
-    def test_step_linear(self):
-        basic_models = [Linear(), LinearB()]
-        complex_models = [MultipleLinear(), MultipleLinearB(), MultipleLinearMixed()]
+        pruner = ImplementedPruner({"prune_bias": also_prune_bias})
+        pruner.prepare(model, config)
+        pruner.enable_mask_update = True
+        pruner.step()
+
+        y_expected = model(x)
+
+        assert y_expected.shape == (128, 10)
+        self._check_pruner_prepared(model, pruner, device)
+
+        # Pruning step
+        pruned = pruner.prune()
+        y_pruned = pruned(x)
+        num_pruned_params = sum(p.numel() for p in pruned.parameters())
+
+        assert y_pruned.shape == expected_shape
+        self._check_pruner_pruned(model, pruner, device)
+        if y_pruned.shape == y_expected.shape:
+            assert torch.isclose(y_expected, y_pruned, rtol=1e-05, atol=1e-07).all()
+            assert num_pruned_params < num_original_params
+
+    def test_prune_linear_linear(self):
+        r"""test pruning linear-> linear modules"""
+        configs, shapes = [], []
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((128, 10))
+
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+                {"tensor_fqn": "linear1.weight"},
+            ]
+        )
+        shapes.append((128, 10))
+
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((128, 10))
         for device in DEVICES:
-            for model in basic_models:
-                self._test_step_linear_on_device(model, True, torch.device(device))
-            for model in complex_models:
-                self._test_step_linear_on_device(model, False, torch.device(device))
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_linear_on_device(
+                        SimpleLinear(),
+                        config,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_linear_bias_linear(self):
+        # linear(bias) -> linear(no bias)
+        configs, shapes = [], []
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+            ]
+        )
+        shapes.append((128, 10))
+
+        # linear(bias) -> linear(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.2.weight"},
+                {"tensor_fqn": "seq.3.weight"},
+            ]
+        )
+        shapes.append((128, 10))
+
+        # linear(no bias) -> linear(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((128, 10))
 
-    def _test_step_conv2d_on_device(self, model, config, device):
+        for device in DEVICES:
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_linear_on_device(
+                        LinearBias(),
+                        config,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_linear_activation_linear(self):
+        config = [
+            {"tensor_fqn": "seq.0.weight"},
+            {"tensor_fqn": "seq.2.weight"},
+            {"tensor_fqn": "seq.4.weight"},
+            {"tensor_fqn": "linear1.weight"},
+        ]
+        shape = (128, 10)
+
+        for device in DEVICES:
+            for also_prune_bias in [True, False]:
+                # test version with nn.Modules
+                self._test_linear_on_device(
+                    LinearActivation(),
+                    config,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
+                # test functional version
+                self._test_linear_on_device(
+                    LinearActivationFunctional(),
+                    config,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
+
+    def _test_conv2d_on_device(
+        self, model, config, x, expected_shape, device, also_prune_bias
+    ):
         model = model.to(device)
-        x = torch.ones((1, 1, 28, 28)).to(device)
-        pruner = SimplePruner(None)
+        num_original_params = sum(p.numel() for p in model.parameters())
+        model.eval()
+
+        pruner = ImplementedPruner({"prune_bias": also_prune_bias})
         pruner.prepare(model, config)
-        self._check_pruner_valid_before_step(model, pruner, device)
+        pruner.enable_mask_update = True
         pruner.step()
-        self._check_pruner_valid_after_step(model, pruner, 1, device)
-        assert model(x).shape == (1, 64, 24, 24)
+
+        y_expected = model(x)
+        assert y_expected.shape == expected_shape
+
+        self._check_pruner_prepared(model, pruner, device)
+
+        # Fusion step
+        pruned = pruner.prune()
+        y_pruned = pruned(x)
+        num_pruned_params = sum(p.numel() for p in pruned.parameters())
+
+        assert y_pruned.shape == expected_shape
+        self._check_pruner_pruned(model, pruner, device)
+        if y_pruned.shape == y_expected.shape:
+            # TODO This rtol is a little high, need to double check if something specific is causing this to fail
+            assert torch.isclose(
+                y_expected, y_pruned, rtol=1e-1
+            ).all(), f"fail for {type(model)}"
+            # only time this should be equal is when all layers have padding and we can't prune
+            assert num_pruned_params <= num_original_params
+
+    def test_prune_conv2d_conv2d(self):
+        configs, shapes = [], []
+        # all within sequential blocks
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+            ]
+        )
+        shapes.append((1, 52, 20, 20))
+        # prune across sequential blocks
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "conv2d1.weight"},
+            ]
+        )
+        shapes.append((1, 52, 20, 20))
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_conv2d_on_device(
+                        SimpleConv2d(),
+                        config,
+                        x,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_conv2d_bias_conv2d(self):
+        # Conv2d with Bias and no Activation
+        configs, shapes = [], []
+        # conv2d(bias) -> conv2d(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(no bias) -> conv2d(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "conv2d1.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(bias) -> conv2d(no bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.1.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_conv2d_on_device(
+                        Conv2dBias(),
+                        config,
+                        x,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_conv2d_activation_conv2d(self):
+        # Conv2d with Activation and no Bias
+        configs, shapes = [], []
+
+        # conv2d(no bias) -> activatation -> conv2d(no bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.4.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(bias) -> activatation -> conv2d(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(bias) -> activation -> conv2d(no bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.2.weight"},
+                {"tensor_fqn": "seq.4.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        # conv2d(no bias) -> activation -> conv2d(bias)
+        configs.append(
+            [
+                {"tensor_fqn": "conv2d1.weight"},
+            ]
+        )
+        shapes.append((1, 52, 18, 18))
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_conv2d_on_device(
+                        Conv2dActivation(),
+                        config,
+                        x,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_conv2d_padding_conv2d(self):
+        # Conv2d with Padded layers after Bias layers
+        configs, shapes = [], []
+
+        # conv(padded, bias) -> conv(padded, bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.4.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+
+        # conv(no bias, no pad) -> conv(padded, bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.2.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+
+        # conv(padded, bias) -> conv ( no bias ,no pad)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.0.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+        # conv(pad, bias) -> conv(no pad, bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.6.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+        # conv(no pad, bias) -> conv(pad, bias)
+        configs.append(
+            [
+                {"tensor_fqn": "seq.8.weight"},
+            ]
+        )
+        shapes.append((1, 52, 24, 24))
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                for config, shape in zip(configs, shapes):
+                    self._test_conv2d_on_device(
+                        Conv2dPadBias(),
+                        config,
+                        x,
+                        shape,
+                        torch.device(device),
+                        also_prune_bias,
+                    )
+
+    def test_prune_conv2d_pool_conv2d(self):
+        # Conv2d with Pooling layers
+        config = [
+            {"tensor_fqn": "seq.0.weight"},
+            {"tensor_fqn": "seq.3.weight"},
+            {"tensor_fqn": "conv2d1.weight"},
+            {"tensor_fqn": "conv2d2.weight"},
+        ]
+        shape = (1, 52, 3, 3)
+
+        for device in DEVICES:
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                self._test_conv2d_on_device(
+                    Conv2dPool(),
+                    config,
+                    x,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
 
     @skipIfTorchDynamo("TorchDynamo fails with unknown reason")
-    def test_step_conv2d(self):
-        models = [Conv2dA(), Conv2dB(), Conv2dC()]
-        configs = [None, None, None, None]
+    def test_complex_conv2d(self):
+        """Test fusion for models that contain Conv2d & Linear modules.
+        Currently supports: Conv2d-Pool2d-Flatten-Linear, Skip-add"""
+        config = [
+            {"tensor_fqn": "seq.0.weight"},
+            {"tensor_fqn": "seq.3.weight"},
+            {"tensor_fqn": "conv2d1.weight"},
+            {"tensor_fqn": "conv2d2.weight"},
+        ]
+        shape = (1, 13)
+
         for device in DEVICES:
-            for model, config in zip(models, configs):
-                self._test_step_conv2d_on_device(model, config, torch.device(device))
+            x = torch.ones((1, 1, 28, 28), device=device)
+            for also_prune_bias in [True, False]:
+                self._test_conv2d_on_device(
+                    Conv2dPoolFlattenFunctional(),
+                    config,
+                    x,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
+                self._test_conv2d_on_device(
+                    Conv2dPoolFlatten(),
+                    config,
+                    x,
+                    shape,
+                    torch.device(device),
+                    also_prune_bias,
+                )
diff --git a/torch/ao/pruning/_experimental/pruner/__init__.py b/torch/ao/pruning/_experimental/pruner/__init__.py
index e9b17f6c7aad7..d762873277493 100644
--- a/torch/ao/pruning/_experimental/pruner/__init__.py
+++ b/torch/ao/pruning/_experimental/pruner/__init__.py
@@ -3,9 +3,3 @@
     FakeStructuredSparsity,
     BiasHook,
 )
-
-__all__ = [
-    "FakeStructuredSparsity",
-    "BaseStructuredSparsifier",
-    "BiasHook",
-]
diff --git a/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
index e753d2a6d88da..3b568f1557d07 100644
--- a/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
+++ b/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
@@ -1,17 +1,193 @@
-from typing import Set, Type
+from itertools import chain
 import torch
+import torch.nn.functional as F
 from torch import nn
+from torch.fx import symbolic_trace
 from torch.nn.utils import parametrize
+from typing import Type, Set, Dict, Callable, Tuple, Optional, Union
 
 from torch.ao.pruning import BaseSparsifier
 from .parametrization import FakeStructuredSparsity, BiasHook
+from .match_utils import apply_match
+from .prune_functions import (
+    prune_linear,
+    prune_linear_linear,
+    prune_linear_activation_linear,
+    prune_conv2d,
+    prune_conv2d_conv2d,
+    prune_conv2d_activation_conv2d,
+    prune_conv2d_activation_pool_conv2d,
+    prune_conv2d_pool_activation_conv2d,
+    prune_conv2d_pool_flatten_linear,
+)
 
-__all__ = ["BaseStructuredSparsifier"]
 
-SUPPORTED_STRUCTURED_PRUNING_MODULES = {  # added to config if None given
-    nn.Linear,
-    nn.Conv2d,
-}
+def _get_supported_structured_pruning_modules():
+    SUPPORTED_STRUCTURED_PRUNING_MODULES = {  # added to config if None given
+        nn.Linear,
+        nn.Conv2d,
+    }
+    return SUPPORTED_STRUCTURED_PRUNING_MODULES
+
+
+def _get_supported_activation_functions():
+    SUPPORTED_ACTIVATION_FUNCTIONS = {
+        F.relu,
+        F.rrelu,
+        F.hardtanh,
+        F.relu6,
+        F.sigmoid,
+        F.hardsigmoid,
+        F.tanh,
+        F.silu,
+        F.mish,
+        F.hardswish,
+        F.elu,
+        F.celu,
+        F.selu,
+        F.hardshrink,
+        F.leaky_relu,
+        F.logsigmoid,
+        F.softplus,
+        F.prelu,
+        F.softsign,
+        F.tanhshrink,
+    }
+    return SUPPORTED_ACTIVATION_FUNCTIONS
+
+
+def _get_supported_activation_modules():
+    SUPPORTED_ACTIVATION_MODULES = {
+        nn.ReLU,
+        nn.RReLU,
+        nn.Hardtanh,
+        nn.ReLU6,
+        nn.Sigmoid,
+        nn.Hardsigmoid,
+        nn.Tanh,
+        nn.SiLU,
+        nn.Mish,
+        nn.Hardswish,
+        nn.ELU,
+        nn.CELU,
+        nn.SELU,
+        nn.Hardshrink,
+        nn.LeakyReLU,
+        nn.LogSigmoid,
+        nn.Softplus,
+        nn.PReLU,
+        nn.Softsign,
+        nn.Tanhshrink,
+    }
+    return SUPPORTED_ACTIVATION_MODULES
+
+
+def _get_default_structured_pruning_patterns() -> Dict[
+    Tuple[Union[Type[nn.Module], Callable[[torch.Tensor], torch.Tensor], str], ...],
+    Callable[..., None],
+]:
+    """
+    Returns the patterns for conv2d / linear conversion for each element in the activation functions/modules defined above.
+    """
+    patterns: Dict[
+        Tuple[Union[Type[nn.Module], Callable[[torch.Tensor], torch.Tensor], str], ...],
+        Callable[..., None],
+    ] = {
+        # linear -> linear
+        (nn.Linear, "output"): prune_linear,
+        (nn.Linear, nn.Linear): prune_linear_linear,
+        # conv2d -> conv2d
+        (nn.Conv2d, "output"): prune_conv2d,
+        (nn.Conv2d, nn.Conv2d): prune_conv2d_conv2d,
+    }
+
+    for activation in chain(
+        _get_supported_activation_functions(), _get_supported_activation_modules()
+    ):
+        patterns.update(
+            {
+                # linear -> activation -> linear
+                (nn.Linear, activation, nn.Linear): prune_linear_activation_linear,
+                # conv2d -> activation -> conv2d
+                (nn.Conv2d, activation, nn.Conv2d): prune_conv2d_activation_conv2d,
+                # conv2d -> activation -> pool -> conv2d
+                (
+                    nn.Conv2d,
+                    activation,
+                    nn.AvgPool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                (
+                    nn.Conv2d,
+                    activation,
+                    F.avg_pool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                (
+                    nn.Conv2d,
+                    activation,
+                    nn.MaxPool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                (
+                    nn.Conv2d,
+                    activation,
+                    F.max_pool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                # conv2d -> pool -> activation -> conv2d
+                (
+                    nn.Conv2d,
+                    nn.AvgPool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                (
+                    nn.Conv2d,
+                    F.avg_pool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                (
+                    nn.Conv2d,
+                    nn.MaxPool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                (
+                    nn.Conv2d,
+                    F.max_pool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                # conv2d -> adaptive pool -> flatten -> linear
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveAvgPool2d,
+                    nn.Flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveAvgPool2d,
+                    torch.flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveMaxPool2d,
+                    nn.Flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveMaxPool2d,
+                    torch.flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+            }
+        )
+    return patterns
 
 
 class BaseStructuredSparsifier(BaseSparsifier):
@@ -27,17 +203,20 @@ class BaseStructuredSparsifier(BaseSparsifier):
             be updated.
     """
 
-    def __init__(self, defaults):
+    def __init__(self, defaults, patterns=None):
         super().__init__(defaults)
+        if patterns is None:
+            patterns = _get_default_structured_pruning_patterns()
+        self.patterns = patterns
 
     def make_config_from_model(
         self,
         model: nn.Module,
-        SUPPORTED_MODULES: Set[Type] = SUPPORTED_STRUCTURED_PRUNING_MODULES,
+        SUPPORTED_MODULES: Optional[Set[Type]] = None,
     ) -> None:
-        super().make_config_from_model(
-            model, SUPPORTED_MODULES=SUPPORTED_STRUCTURED_PRUNING_MODULES
-        )
+        if SUPPORTED_MODULES is None:
+            SUPPORTED_MODULES = _get_supported_structured_pruning_modules()
+        super().make_config_from_model(model, SUPPORTED_MODULES=SUPPORTED_MODULES)
 
     def _prepare(self, *args, **kwargs) -> None:
         r"""This function will attach the FakeStructuredSparsity parameterizations
@@ -57,18 +236,59 @@ def _prepare(self, *args, **kwargs) -> None:
             )
             self.state[config["tensor_fqn"]]["mask"] = mask
             parametrize.register_parametrization(
-                module, tensor_name, parametrization(mask), unsafe=True
+                module, tensor_name, parametrization(mask)
             )
-
             prune_bias = config.get("prune_bias", True)
-            if prune_bias and module.bias is not None:
+            if module.bias is not None:
                 module.register_parameter("_bias", nn.Parameter(module.bias.detach()))
                 module.bias = None
+                module.prune_bias = prune_bias
+
             self.bias_handles.append(
                 module.register_forward_hook(
                     BiasHook(module.parametrizations.weight[0], prune_bias)
                 )
             )
 
-    def convert(self):
-        pass
+    def prune(self) -> None:
+        r"""
+        This function will FX symbolically trace the model and then find instances of the patterns
+        defined in self.patterns (by default SUPPORTED_STRUCTURED_PRUNING_PATTERNS ).
+
+        For each pattern, it will apply to corresponding conversion function, which will modify the output
+        and input size expected by the modules within the pattern
+        """
+
+        self.traced = symbolic_trace(self.model)
+        modules = dict(self.traced.named_modules())
+
+        # Right now we check for matches simply by iterating across all the patterns
+        # if this is slow we can store patterns in a trie-structure and modify this code for faster lookup
+
+        for node in self.traced.graph.nodes:
+            for pattern, convert_fn in self.patterns.items():
+                matched = apply_match(modules, pattern, node, [])
+                if matched is None:
+                    continue
+
+                first_module = modules.get(node.target)
+                # check if first module exists and has apropriate parameterization, otherwise skip
+                if (
+                    first_module is not None
+                    and parametrize.is_parametrized(first_module)
+                    and isinstance(
+                        first_module.parametrizations["weight"][0],
+                        FakeStructuredSparsity,
+                    )
+                ):
+                    convert_block = []
+                    for node in matched:
+                        if node.op == "call_module":
+                            convert_block.append(modules.get(node.target))
+                        elif node.op == "call_function":
+                            convert_block.append(node.target)
+                    convert_fn(*convert_block)
+
+        self.traced.graph.lint()
+        self.traced.recompile()
+        return self.traced
diff --git a/torch/ao/pruning/_experimental/pruner/match_utils.py b/torch/ao/pruning/_experimental/pruner/match_utils.py
new file mode 100644
index 0000000000000..d0f7a9f6293d9
--- /dev/null
+++ b/torch/ao/pruning/_experimental/pruner/match_utils.py
@@ -0,0 +1,59 @@
+"""
+Contains utility functions to check if a pattern is in the graph and return the matching nodes
+"""
+import torch
+from torch import nn
+from torch.ao.quantization.utils import (
+    MatchAllNode,
+)
+from torch.fx import Node
+from torch.nn.utils import parametrize
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+def _match(modules: Dict[str, nn.ModuleDict], node: Node, current: Union[nn.Module, Any]) -> bool:
+    r"""
+    checks to see if a single node of a pattern matches
+    """
+    if isinstance(current, type) and issubclass(current, MatchAllNode):
+        return True
+    if not isinstance(node, Node):
+        return False
+    if isinstance(current, type) and issubclass(current, torch.nn.Module):
+        return (
+            node.op == "call_module"
+            and parametrize.type_before_parametrizations(modules[node.target])
+            == current
+        )
+    elif callable(current):
+        return node.op == "call_function" and node.target is current
+    elif isinstance(current, str):
+        return node.target == current
+    return False
+
+def apply_match(
+    modules: Dict[str, nn.ModuleDict],
+    pattern: Union[Tuple[Any], Any],
+    node: Node,
+    matched_node_pattern: List[Node],
+) -> Optional[List[Node]]:
+    r"""
+    This function will return the matched nodes if the pattern matches the node given
+    If there is no match, it will return None
+    """
+    if isinstance(pattern, tuple):
+        if len(pattern) == 1:
+            if _match(modules, node, pattern[0]):
+                return matched_node_pattern + [node]
+
+        first, *rest = pattern
+        if _match(modules, node, first):
+            if rest is None:
+                return matched_node_pattern + [node]
+
+            for user in node.users:
+                return apply_match(
+                    modules, tuple(rest), user, matched_node_pattern + [node]
+                )
+    elif _match(modules, node, pattern):
+        return [node]
+    return None
diff --git a/torch/ao/pruning/_experimental/pruner/parametrization.py b/torch/ao/pruning/_experimental/pruner/parametrization.py
index 2ea59d48ee809..aeddd0a841525 100644
--- a/torch/ao/pruning/_experimental/pruner/parametrization.py
+++ b/torch/ao/pruning/_experimental/pruner/parametrization.py
@@ -1,7 +1,6 @@
 import torch
 from torch import nn
 
-__all__ = ['FakeStructuredSparsity', 'BiasHook']
 
 
 # Structured Pruning Parameterizations
diff --git a/torch/ao/pruning/_experimental/pruner/prune_functions.py b/torch/ao/pruning/_experimental/pruner/prune_functions.py
new file mode 100644
index 0000000000000..ee8bffb7f9f3e
--- /dev/null
+++ b/torch/ao/pruning/_experimental/pruner/prune_functions.py
@@ -0,0 +1,359 @@
+"""
+Collection of conversion functions for linear / conv2d structured pruning
+Also contains utilities for bias propogation
+"""
+from typing import cast, Optional, Callable, Tuple
+
+import torch
+from torch import nn, Tensor
+from torch.nn.utils import parametrize
+from torch.nn.utils.parametrize import ParametrizationList
+from .parametrization import FakeStructuredSparsity, BiasHook
+
+
+# BIAS PROPOGATION
+def _remove_bias_handles(module: nn.Module) -> None:
+    if hasattr(module, "_forward_hooks"):
+        bias_hooks = []
+        for key, hook in module._forward_hooks.items():
+            if isinstance(hook, BiasHook):
+                bias_hooks.append(key)
+
+        for key in bias_hooks:
+            del module._forward_hooks[key]
+
+
+def _get_adjusted_next_layer_bias(
+    next_layer: nn.Module, pruned_biases: Tensor, mask: Tensor
+) -> nn.Parameter:
+    r"""Returns new adjusted bias for the second supported module"""
+    if parametrize.is_parametrized(next_layer):
+        # need to access original weight
+        parametrization_dict = cast(nn.ModuleDict, next_layer.parametrizations)
+        weight_parameterizations = cast(
+            ParametrizationList, parametrization_dict.weight
+        )
+        next_weight = weight_parameterizations.original
+    else:
+        next_weight = cast(Tensor, next_layer.weight)
+
+    scaling_weight = next_weight[:, ~mask]
+    if isinstance(next_layer, nn.Conv2d):  # checking for Conv2d
+        # Propagating first layer pruned biases and calculating the new second layer bias
+        # involves more steps since the Conv2d scaling weight has extra dimensions,
+        # so adding bias involves broadcasting, logically:
+        # for each channel k in range(oC):
+        #     scaled_biases = sum(first_bias[pruned_idx] @ next_weight[k, pruned_idx, :, :].T)
+        #     new_next_bias[k] = old_next_bias[k] + scaled_biases
+        scaling_product = torch.matmul(
+            pruned_biases.reshape(1, -1), torch.transpose(scaling_weight, 1, 2)
+        )
+        sum_range = list(range(len(scaling_product.shape)))[
+            1:
+        ]  # all but the first dimension
+        scaled_biases = torch.sum(scaling_product, sum_range)
+    elif isinstance(next_layer, nn.Linear):  # Linear
+        scaled_biases = torch.matmul(
+            pruned_biases, torch.transpose(scaling_weight, 0, 1)
+        )  # recall b2_new = b1 @ w2.T + b2
+    else:
+        raise NotImplementedError(f"Type {type(next_layer)} not supported yet.")
+
+    if (
+        parametrize.is_parametrized(next_layer)
+        and getattr(next_layer, "_bias", None) is not None
+    ):  # next_layer is parametrized & has original bias ._bias
+        adjusted_bias = nn.Parameter(scaled_biases + next_layer._bias)
+    elif (
+        not parametrize.is_parametrized(next_layer) and next_layer.bias is not None
+    ):  # next_layer not parametrized & has .bias
+        adjusted_bias = nn.Parameter(scaled_biases + next_layer.bias)
+    else:  # next_layer has no bias
+        adjusted_bias = nn.Parameter(scaled_biases)
+    return adjusted_bias
+
+
+def _prune_module_bias(module: nn.Module, mask: Tensor) -> None:
+    r"""Applies mask to given modules bias"""
+    # prune bias along with weights, discard pruned indices of bias
+    original_bias = cast(Tensor, getattr(module, "_bias", module.bias))
+    if original_bias is not None:
+        module.bias = nn.Parameter(original_bias[mask])
+
+    #  remove _bias parameter
+    if hasattr(module, "_bias"):
+        delattr(module, "_bias")
+
+
+def _propogate_module_bias(module: nn.Module, mask: Tensor) -> Optional[Tensor]:
+    r"""
+    In the case that we need to propogate biases, this function will return the biases we need
+    """
+    # set current module bias
+    if module.bias is not None:
+        module.bias = nn.Parameter(cast(Tensor, module.bias)[mask])
+    elif getattr(module, "_bias", None) is not None:
+        module.bias = nn.Parameter(cast(Tensor, module._bias)[mask])
+
+    # get pruned biases to propogate to subsequent layer
+    if getattr(module, "_bias", None) is not None:
+        pruned_biases = cast(Tensor, module._bias)[~mask]
+    else:
+        pruned_biases = None
+
+    if hasattr(module, "_bias"):
+        delattr(module, "_bias")
+
+    return pruned_biases
+
+
+# LINEAR
+def _prune_linear_helper(linear: nn.Linear) -> Tensor:
+    # expects linear to be a parameterized linear module
+    parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    with torch.no_grad():
+        parametrize.remove_parametrizations(linear, "weight", leave_parametrized=True)
+        linear.weight = nn.Parameter(linear.weight[mask])
+    linear.out_features = linear.weight.shape[0]
+    _remove_bias_handles(linear)
+
+    return mask
+
+
+def prune_linear(linear: nn.Linear) -> None:
+    mask = _prune_linear_helper(linear)
+    if getattr(linear, "prune_bias", False):
+        _prune_module_bias(linear, mask)
+
+
+def prune_linear_linear(linear1: nn.Linear, linear2: nn.Linear) -> None:
+    prune_linear_activation_linear(linear1, None, linear2)
+
+
+def prune_linear_activation_linear(
+    linear1: nn.Linear,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    linear2: nn.Linear,
+):
+    mask = _prune_linear_helper(linear1)
+    if getattr(linear1, "prune_bias", False):
+        _prune_module_bias(linear1, mask)
+    else:
+        pruned_biases = _propogate_module_bias(linear1, mask)
+        if pruned_biases is not None:
+            if activation:
+                pruned_biases = activation(pruned_biases)
+            linear2.bias = _get_adjusted_next_layer_bias(linear2, pruned_biases, mask)
+
+    with torch.no_grad():
+        if parametrize.is_parametrized(linear2):
+            parametrization_dict = cast(nn.ModuleDict, linear2.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict.weight
+            )
+
+            weight_parameterizations.original = nn.Parameter(
+                weight_parameterizations.original[:, mask]
+            )
+            linear2.in_features = weight_parameterizations.original.shape[1]
+        else:
+            linear2.weight = nn.Parameter(linear2.weight[:, mask])
+            linear2.in_features = linear2.weight.shape[1]
+
+
+# CONV2D
+def _prune_conv2d_helper(conv2d: nn.Conv2d) -> Tensor:
+    parametrization_dict = cast(nn.ModuleDict, conv2d.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    with torch.no_grad():
+        parametrize.remove_parametrizations(conv2d, "weight", leave_parametrized=True)
+        conv2d.weight = nn.Parameter(conv2d.weight[mask])
+    conv2d.out_channels = conv2d.weight.shape[0]
+
+    _remove_bias_handles(conv2d)
+    return mask
+
+
+def prune_conv2d_padded(conv2d_1: nn.Conv2d) -> None:
+    parametrization_dict = cast(nn.ModuleDict, conv2d_1.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    with torch.no_grad():
+        parametrize.remove_parametrizations(conv2d_1, "weight", leave_parametrized=True)
+
+    if getattr(conv2d_1, "_bias", None) is not None:
+        if (
+            conv2d_1.bias is not None
+        ):  # conv2d_1 has original bias and bias propagated from previous layer
+            new_bias = torch.zeros(conv2d_1.bias.shape)
+            new_bias[mask] = conv2d_1.bias[mask]
+            # adjusted bias that to keep in conv2d_1
+            new_bias[~mask] = cast(Tensor, conv2d_1._bias)[~mask]
+            # pruned biases that are kept instead of propagated
+            conv2d_1.bias = nn.Parameter(new_bias)
+        else:  # conv2d_1 has only original bias
+            conv2d_1.bias = nn.Parameter(cast(Tensor, conv2d_1._bias))
+    else:
+        # no original bias, only propagated bias
+        if (
+            conv2d_1.bias is not None
+        ):  # conv2d_1 has bias propagated from previous layer
+            conv2d_1.bias.data[~mask] = 0
+
+    if hasattr(conv2d_1, "_bias"):
+        delattr(conv2d_1, "_bias")
+
+
+def prune_conv2d(conv2d: nn.Conv2d) -> None:
+    mask = _prune_conv2d_helper(conv2d)
+    if getattr(conv2d, "prune_bias", False):
+        _prune_module_bias(conv2d, mask)
+
+
+def prune_conv2d_conv2d(conv2d_1: nn.Conv2d, conv2d_2: nn.Conv2d) -> None:
+    prune_conv2d_activation_conv2d(conv2d_1, None, conv2d_2)
+
+
+def prune_conv2d_activation_conv2d(
+    conv2d_1: nn.Conv2d,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    conv2d_2: nn.Conv2d,
+):
+    r"""
+    Fusion Pattern for conv2d -> some activation module / function -> conv2d layers
+    """
+    parametrization_dict = cast(nn.ModuleDict, conv2d_1.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    prune_bias = getattr(conv2d_1, "prune_bias", False)
+    if (
+        hasattr(conv2d_2, "padding")
+        and cast(Tuple[int], conv2d_2.padding) > (0, 0)
+        and (conv2d_1.bias is not None or getattr(conv2d_1, "_bias", None) is not None)
+    ):
+        prune_conv2d_padded(conv2d_1)
+    else:
+        mask = _prune_conv2d_helper(conv2d_1)
+        if prune_bias:
+            _prune_module_bias(conv2d_1, mask)
+        else:
+            pruned_biases = _propogate_module_bias(conv2d_1, mask)
+            if pruned_biases is not None:
+                if activation:
+                    pruned_biases = activation(pruned_biases)
+                conv2d_2.bias = _get_adjusted_next_layer_bias(
+                    conv2d_2, pruned_biases, mask
+                )
+
+        if (
+            not (
+                hasattr(conv2d_2, "padding")
+                and cast(Tuple[int], conv2d_2.padding) > (0, 0)
+            )
+            or conv2d_1.bias is None
+        ):
+            with torch.no_grad():
+                if parametrize.is_parametrized(conv2d_2):
+                    parametrization_dict = cast(
+                        nn.ModuleDict, conv2d_2.parametrizations
+                    )
+                    weight_parameterizations = cast(
+                        ParametrizationList, parametrization_dict.weight
+                    )
+                    weight_parameterizations.original = nn.Parameter(
+                        weight_parameterizations.original[:, mask]
+                    )
+                    conv2d_2.in_channels = weight_parameterizations.original.shape[1]
+                else:
+                    conv2d_2.weight = nn.Parameter(conv2d_2.weight[:, mask])
+                    conv2d_2.in_channels = conv2d_2.weight.shape[1]
+
+
+def prune_conv2d_pool_activation_conv2d(
+    c1: nn.Conv2d,
+    pool: nn.Module,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    c2: nn.Conv2d,
+) -> None:
+    prune_conv2d_activation_conv2d(c1, activation, c2)
+
+
+def prune_conv2d_activation_pool_conv2d(
+    c1: nn.Conv2d,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    pool: nn.Module,
+    c2: nn.Conv2d,
+) -> None:
+    prune_conv2d_activation_conv2d(c1, activation, c2)
+
+
+def prune_conv2d_pool_flatten_linear(
+    conv2d: nn.Conv2d,
+    pool: nn.Module,
+    flatten: Optional[Callable[[Tensor], Tensor]],
+    linear: nn.Linear,
+) -> None:
+    mask = _prune_conv2d_helper(conv2d)
+
+    # We map the pruned indices of the Conv2d output to the flattened indices of the Linear following the Flatten layer.
+    # we determine the flattening scale (h * w), and readjust `first_pruned_indices`
+    # (each idx maps to range idx * h * w to (idx+1) * h * w), `first_valid_indices`,
+    # and `pruned_biases` (repeat each bias by h * w).
+    if parametrize.is_parametrized(linear):
+        parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
+        weight_parameterizations = cast(
+            ParametrizationList, parametrization_dict.weight
+        )
+        linear_ic = weight_parameterizations.original.shape[1]
+    else:
+        linear_ic = linear.weight.shape[1]
+
+    conv2d_oc = len(mask)
+    assert (
+        linear_ic % conv2d_oc == 0
+    ), f"Flattening from dimensions {conv2d_oc} to {linear_ic} not supported"
+
+    flatten_scale = linear_ic // conv2d_oc
+    flattened_mask = torch.tensor(
+        [[val] * flatten_scale for val in mask], dtype=torch.bool, device=mask.device
+    ).flatten()
+
+    if getattr(conv2d, "prune_bias", False):
+        _prune_module_bias(conv2d, mask)
+    else:
+        pruned_biases = cast(Tensor, _propogate_module_bias(conv2d, mask))
+        flattened_pruned_biases = torch.tensor(
+            [[bias] * flatten_scale for bias in pruned_biases], device=mask.device
+        ).flatten()
+        linear.bias = _get_adjusted_next_layer_bias(
+            linear, flattened_pruned_biases, flattened_mask
+        )
+
+    with torch.no_grad():
+        if parametrize.is_parametrized(linear):
+            parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict.weight
+            )
+            weight_parameterizations.original = nn.Parameter(
+                weight_parameterizations.original[:, flattened_mask]
+            )
+            linear.in_features = weight_parameterizations.original.shape[1]
+        else:
+            linear.weight = nn.Parameter(linear.weight[:, flattened_mask])
+            linear.in_features = linear.weight.shape[1]
diff --git a/torch/testing/_internal/common_pruning.py b/torch/testing/_internal/common_pruning.py
new file mode 100644
index 0000000000000..8fc08ee2a41bf
--- /dev/null
+++ b/torch/testing/_internal/common_pruning.py
@@ -0,0 +1,311 @@
+# -*- coding: utf-8 -*-
+# Owner(s): ["module: unknown"]
+
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+class SimpleLinear(nn.Module):
+    r"""Model with only Linear layers without biases, some wrapped in a Sequential,
+    some following the Sequential. Used to test basic pruned Linear-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=False),
+            nn.Linear(5, 6, bias=False),
+            nn.Linear(6, 4, bias=False),
+        )
+        self.linear1 = nn.Linear(4, 3, bias=False)
+        self.linear2 = nn.Linear(3, 10, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear1(x)
+        x = self.linear2(x)
+        return x
+
+
+class LinearBias(nn.Module):
+    r"""Model with only Linear layers, alternating layers with biases,
+    wrapped in a Sequential. Used to test pruned Linear-Bias-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=True),
+            nn.Linear(5, 6, bias=False),
+            nn.Linear(6, 3, bias=True),
+            nn.Linear(3, 3, bias=True),
+            nn.Linear(3, 10, bias=False),
+        )
+
+    def forward(self, x):
+        x = self.seq(x)
+        return x
+
+
+class LinearActivation(nn.Module):
+    r"""Model with only Linear layers, some with bias, some in a Sequential and some following.
+    Activation functions modules in between each Linear in the Sequential, and each outside layer.
+    Used to test pruned Linear(Bias)-Activation-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=True),
+            nn.ReLU(),
+            nn.Linear(5, 6, bias=False),
+            nn.Tanh(),
+            nn.Linear(6, 4, bias=True),
+        )
+        self.linear1 = nn.Linear(4, 3, bias=True)
+        self.act1 = nn.ReLU()
+        self.linear2 = nn.Linear(3, 10, bias=False)
+        self.act2 = nn.Tanh()
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear1(x)
+        x = self.act1(x)
+        x = self.linear2(x)
+        x = self.act2(x)
+        return x
+
+
+class LinearActivationFunctional(nn.Module):
+    r"""Model with only Linear layers, some with bias, some in a Sequential and some following.
+    Activation functions modules in between each Linear in the Sequential, and functional
+    activationals are called in between each outside layer.
+    Used to test pruned Linear(Bias)-Activation-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Linear(7, 5, bias=True),
+            nn.ReLU(),
+            nn.Linear(5, 6, bias=False),
+            nn.ReLU(),
+            nn.Linear(6, 4, bias=True),
+        )
+        self.linear1 = nn.Linear(4, 3, bias=True)
+        self.linear2 = nn.Linear(3, 8, bias=False)
+        self.linear3 = nn.Linear(8, 10, bias=False)
+        self.act1 = nn.ReLU()
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.linear1(x)
+        x = F.relu(x)
+        x = self.linear2(x)
+        x = F.relu(x)
+        x = self.linear3(x)
+        x = F.relu(x)
+        return x
+
+
+class SimpleConv2d(nn.Module):
+    r"""Model with only Conv2d layers, all without bias, some in a Sequential and some following.
+    Used to test pruned Conv2d-Conv2d fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=False),
+            nn.Conv2d(32, 64, 3, 1, bias=False),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, bias=False)
+        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = self.conv2d2(x)
+        return x
+
+
+class Conv2dBias(nn.Module):
+    r"""Model with only Conv2d layers, some with bias, some in a Sequential and some outside.
+    Used to test pruned Conv2d-Bias-Conv2d fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=True),
+            nn.Conv2d(32, 32, 3, 1, bias=True),
+            nn.Conv2d(32, 64, 3, 1, bias=False),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, bias=True)
+        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, bias=False)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = self.conv2d2(x)
+        return x
+
+
+class Conv2dActivation(nn.Module):
+    r"""Model with only Conv2d layers, some with bias, some in a Sequential and some following.
+    Activation function modules in between each Sequential layer, functional activations called
+    in-between each outside layer.
+    Used to test pruned Conv2d-Bias-Activation-Conv2d fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, bias=True),
+            nn.ReLU(),
+            nn.Conv2d(32, 64, 3, 1, bias=True),
+            nn.Tanh(),
+            nn.Conv2d(64, 64, 3, 1, bias=False),
+            nn.ReLU(),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, bias=False)
+        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = F.relu(x)
+        x = self.conv2d2(x)
+        x = F.hardtanh(x)
+        return x
+
+
+class Conv2dPadBias(nn.Module):
+    r"""Model with only Conv2d layers, all with bias and some with padding > 0,
+    some in a Sequential and some following. Activation function modules in between each layer.
+    Used to test that bias is propagated correctly in the special case of
+    pruned Conv2d-Bias-(Activation)Conv2d fusion, when the second Conv2d layer has padding > 0."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, 3, 1, padding=1, bias=True),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, 3, 1, bias=False),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, 3, 1, padding=1, bias=True),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, 3, 1, padding=1, bias=True),
+            nn.ReLU(),
+            nn.Conv2d(32, 64, 3, 1, bias=True),
+            nn.Tanh(),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, 3, 1, padding=1, bias=True)
+        self.act1 = nn.ReLU()
+        self.conv2d2 = nn.Conv2d(48, 52, 3, 1, padding=1, bias=True)
+        self.act2 = nn.Tanh()
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = self.act1(x)
+        x = self.conv2d2(x)
+        x = self.act2(x)
+        return x
+
+
+class Conv2dPool(nn.Module):
+    r"""Model with only Conv2d layers, all with bias, some in a Sequential and some following.
+    Activation function modules in between each layer, Pool2d modules in between each layer.
+    Used to test pruned Conv2d-Pool2d-Conv2d fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 32, kernel_size=3, padding=1, bias=True),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
+            nn.ReLU(),
+            nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=True),
+            nn.Tanh(),
+            nn.AvgPool2d(kernel_size=2, stride=2, padding=1),
+        )
+        self.conv2d1 = nn.Conv2d(64, 48, kernel_size=3, padding=1, bias=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2, padding=1)
+        self.af1 = nn.ReLU()
+        self.conv2d2 = nn.Conv2d(48, 52, kernel_size=3, padding=1, bias=True)
+        self.conv2d3 = nn.Conv2d(52, 52, kernel_size=3, padding=1, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = self.maxpool(x)
+        x = self.af1(x)
+        x = self.conv2d2(x)
+        x = F.avg_pool2d(x, kernel_size=2, stride=2, padding=1)
+        x = F.relu(x)
+        x = self.conv2d3(x)
+        return x
+
+
+class Conv2dPoolFlattenFunctional(nn.Module):
+    r"""Model with Conv2d layers, all with bias, some in a Sequential and some following, and then a Pool2d
+    and a functional Flatten followed by a Linear layer.
+    Activation functions and Pool2ds in between each layer also.
+    Used to test pruned Conv2d-Pool2d-Flatten-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 3, kernel_size=3, padding=1, bias=True),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
+            nn.ReLU(),
+            nn.Conv2d(3, 5, kernel_size=3, padding=1, bias=True),
+            nn.Tanh(),
+            nn.AvgPool2d(kernel_size=2, stride=2, padding=1),
+        )
+        self.conv2d1 = nn.Conv2d(5, 7, kernel_size=3, padding=1, bias=True)
+        self.af1 = nn.ReLU()
+        self.conv2d2 = nn.Conv2d(7, 11, kernel_size=3, padding=1, bias=True)
+        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+        self.fc = nn.Linear(11, 13, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = F.max_pool2d(x, kernel_size=2, stride=2, padding=1)
+        x = self.af1(x)
+        x = self.conv2d2(x)
+        x = self.avg_pool(x)
+        x = torch.flatten(x, 1)  # test functional flatten
+        x = self.fc(x)
+        return x
+
+
+class Conv2dPoolFlatten(nn.Module):
+    r"""Model with Conv2d layers, all with bias, some in a Sequential and some following, and then a Pool2d
+    and a Flatten module followed by a Linear layer.
+    Activation functions and Pool2ds in between each layer also.
+    Used to test pruned Conv2d-Pool2d-Flatten-Linear fusion."""
+
+    def __init__(self):
+        super().__init__()
+        self.seq = nn.Sequential(
+            nn.Conv2d(1, 3, kernel_size=3, padding=1, bias=True),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=1),
+            nn.ReLU(),
+            nn.Conv2d(3, 5, kernel_size=3, padding=1, bias=True),
+            nn.Tanh(),
+            nn.AvgPool2d(kernel_size=2, stride=2, padding=1),
+        )
+        self.conv2d1 = nn.Conv2d(5, 7, kernel_size=3, padding=1, bias=True)
+        self.af1 = nn.ReLU()
+        self.conv2d2 = nn.Conv2d(7, 11, kernel_size=3, padding=1, bias=True)
+        self.avg_pool = nn.AdaptiveAvgPool2d((2, 2))
+        self.flatten = nn.Flatten()
+        self.fc = nn.Linear(44, 13, bias=True)
+
+    def forward(self, x):
+        x = self.seq(x)
+        x = self.conv2d1(x)
+        x = F.max_pool2d(x, kernel_size=2, stride=2, padding=1)
+        x = self.af1(x)
+        x = self.conv2d2(x)
+        x = self.avg_pool(x)
+        x = self.flatten(x)
+        x = self.fc(x)
+        return x

From 8fd4c1a25d6fe81e81bff93d085a3a2b162ffe4d Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Thu, 8 Dec 2022 02:23:11 +0000
Subject: [PATCH 1716/1922] [Resubmit] state_dict_pre_hook (#90435)

Resubmit of https://github.com/pytorch/pytorch/pull/88541 which got stale.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90435
Approved by: https://github.com/fegin
---
 test/test_nn.py                           | 57 ++++++++++++++++++++++-
 torch/distributed/nn/api/remote_module.py |  2 +
 torch/jit/_recursive.py                   |  1 +
 torch/nn/modules/module.py                | 17 +++++++
 4 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 9f5cbf85d5658..bd54ca01fdfc9 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -12,6 +12,7 @@
 from itertools import product
 from functools import partial
 from collections import OrderedDict
+from tempfile import NamedTemporaryFile
 
 import torch
 
@@ -37,7 +38,7 @@
     download_file, get_function_arglist, load_tests, skipIfMps,\
     TEST_WITH_UBSAN, IS_PPC, \
     parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
-    skipIfTorchDynamo
+    skipIfTorchDynamo, IS_WINDOWS
 from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION
 from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
     module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
@@ -2450,6 +2451,60 @@ def hook_fn(module, state_dict, prefix, local_metadata, strict, missing_keys, un
         model[0][0]._register_load_state_dict_pre_hook(hook_fn, with_module=True)
         model.load_state_dict(model.state_dict(), strict=True)
 
+    @unittest.skipIf(IS_WINDOWS, "Tempfile permission issue on windows")
+    def test_register_state_dict_pre_hook_backward_compat(self):
+        called = False
+
+        def my_state_dict_pre_hook(*args, **kwargs):
+            nonlocal called
+            called = True
+
+        m = nn.Linear(1, 1)
+        self.assertTrue(hasattr(m, '_state_dict_pre_hooks'))
+        delattr(m, '_state_dict_pre_hooks')
+        # Save and load, ensure we can still call state_dict
+        # without running into issues.
+        with NamedTemporaryFile() as f:
+            # Note that torch.save / torch.load is not recommended
+            # to save / load modules.
+            torch.save(m, f.name)
+            m = torch.load(f.name)
+
+        # Ensure we can run state_dict without issues
+        _ = m.state_dict()
+        self.assertFalse(called)
+        m.register_state_dict_pre_hook(my_state_dict_pre_hook)
+        _ = m.state_dict()
+        self.assertTrue(called)
+
+    def test_register_state_dict_pre_hook(self):
+        _state_dict_prefix = "foo."
+        state_dict_pre_hook_count = 0
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.a = nn.Sequential(nn.Linear(3, 3), nn.Linear(3, 3), nn.Linear(3, 3))
+
+            def forward(self, x):
+                return self.a(x)
+
+        def my_state_dict_pre_hook(module, prefix, keep_vars):
+            nonlocal keep_var_setting
+            self.assertEqual(keep_vars, keep_var_setting)
+            nonlocal state_dict_pre_hook_count
+            state_dict_pre_hook_count += 1
+            self.assertTrue(prefix.startswith(_state_dict_prefix))
+
+        mod = MyModule()
+        mod.register_state_dict_pre_hook(my_state_dict_pre_hook)
+        # Test to ensure submodules run the hook as well.
+        mod.a.register_state_dict_pre_hook(my_state_dict_pre_hook)
+        for keep_var_setting in [True, False]:
+            _ = mod.state_dict(prefix=_state_dict_prefix, keep_vars=keep_var_setting)
+            self.assertEqual(2, state_dict_pre_hook_count)
+            state_dict_pre_hook_count = 0
+
     @skipIfTorchDynamo("TorchDynamo fails here for unknown reasons")
     def test_load_state_dict_ref_cycle(self):
         # load_state_dict shouldn't cause a reference cycle involving Tensors
diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py
index e9f9d4d3d3bb8..010b4d04f191e 100644
--- a/torch/distributed/nn/api/remote_module.py
+++ b/torch/distributed/nn/api/remote_module.py
@@ -68,8 +68,10 @@
     "_forward_pre_hooks",
     "_forward_pre_hooks_with_kwargs",
     "_state_dict_hooks",
+    "_state_dict_pre_hooks",
     "_load_state_dict_pre_hooks",
     "_load_state_dict_post_hooks",
+    "_state_dict_pre_hooks",
     "_modules",
     # The two attributes below are generated methods, not available at pickling time.
     "forward_async",
diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py
index bbe7ff98cface..02516d7a2ac7c 100644
--- a/torch/jit/_recursive.py
+++ b/torch/jit/_recursive.py
@@ -34,6 +34,7 @@
     "_forward_pre_hooks",
     "_forward_pre_hooks_with_kwargs",
     "_state_dict_hooks",
+    "_state_dict_pre_hooks",
     "_load_state_dict_pre_hooks",
     "_load_state_dict_post_hooks",
     "_modules",
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index e57a7b26d1e5d..3bf50ecffadb6 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -429,6 +429,7 @@ def forward(self, x):
     _forward_pre_hooks_with_kwargs: Dict[int, bool]
     _state_dict_hooks: Dict[int, Callable]
     _load_state_dict_pre_hooks: Dict[int, Callable]
+    _state_dict_pre_hooks: Dict[int, Callable]
     _load_state_dict_post_hooks: Dict[int, Callable]
     _modules: Dict[str, Optional['Module']]
 
@@ -456,6 +457,7 @@ def __init__(self) -> None:
         super().__setattr__('_forward_pre_hooks', OrderedDict())
         super().__setattr__('_forward_pre_hooks_with_kwargs', OrderedDict())
         super().__setattr__('_state_dict_hooks', OrderedDict())
+        super().__setattr__('_state_dict_pre_hooks', OrderedDict())
         super().__setattr__('_load_state_dict_pre_hooks', OrderedDict())
         super().__setattr__('_load_state_dict_post_hooks', OrderedDict())
         super().__setattr__('_modules', OrderedDict())
@@ -1560,6 +1562,8 @@ def __setstate__(self, state):
             self._forward_hooks_with_kwargs = OrderedDict()
         if '_state_dict_hooks' not in self.__dict__:
             self._state_dict_hooks = OrderedDict()
+        if '_state_dict_pre_hooks' not in self.__dict__:
+            self._state_dict_pre_hooks = OrderedDict()
         if '_load_state_dict_pre_hooks' not in self.__dict__:
             self._load_state_dict_pre_hooks = OrderedDict()
         if '_load_state_dict_post_hooks' not in self.__dict__:
@@ -1668,6 +1672,16 @@ def _register_state_dict_hook(self, hook):
         self._state_dict_hooks[handle.id] = hook
         return handle
 
+    def register_state_dict_pre_hook(self, hook):
+        r"""These hooks will be called with arguments: ``self``, ``prefix``,
+        and ``keep_vars`` before calling ``state_dict`` on ``self``. The registered
+        hooks can be used to perform pre-processing before the ``state_dict``
+        call is made.
+        """
+        handle = hooks.RemovableHandle(self._state_dict_pre_hooks)
+        self._state_dict_pre_hooks[handle.id] = hook
+        return handle
+
     def _save_to_state_dict(self, destination, prefix, keep_vars):
         r"""Saves module state to `destination` dictionary, containing a state
         of the module, but not its descendants. This is called on every
@@ -1681,6 +1695,9 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
             prefix (str): the prefix for parameters and buffers used in this
                 module
         """
+        for hook in self._state_dict_pre_hooks.values():
+            hook(self, prefix, keep_vars)
+
         for name, param in self._parameters.items():
             if param is not None:
                 destination[prefix + name] = param if keep_vars else param.detach()

From 7d01f8a6dc54a7fc3982d84abfe7db4830fcf6c8 Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Thu, 8 Dec 2022 08:10:59 +0000
Subject: [PATCH 1717/1922] [LTC] Overlap data creation and ir_value setting
 (#90438)

Summary:
Upstreaming changes from torch_xla to lazy tensor core: https://github.com/pytorch/xla/pull/4011.
It overlaps data creation and ir_value setting with previous executions.

To be noted, this is a clone of https://github.com/pytorch/pytorch/pull/87119, and the author is @aws-rhsoln.

Test Plan:
CI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90438
Approved by: https://github.com/JackCaoG
---
 torch/csrc/lazy/core/lazy_graph_executor.cpp | 88 ++++++++++++++------
 torch/csrc/lazy/core/lazy_graph_executor.h   | 24 ++++--
 torch/csrc/lazy/core/tensor.h                |  4 +-
 3 files changed, 84 insertions(+), 32 deletions(-)

diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp
index 573a057c4ddfb..acab845c1f346 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.cpp
+++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp
@@ -688,13 +688,37 @@ std::vector<Value> LazyGraphExecutor::CollectRoots(
   return roots;
 }
 
-std::vector<BackendDataPtr> LazyGraphExecutor::FetchTensorData(
+void LazyGraphExecutor::ExtractIRAndPrepareTensorData(
     std::vector<LazyTensorPtr>* tensors,
     const SyncTensorsConfig& config,
-    c10::ArrayRef<size_t> indices) {
+    c10::ArrayRef<size_t> indices,
+    std::vector<Value>& ir_values,
+    std::vector<BackendDataPtr>& tensor_data_vec) {
+  ir_values.reserve(indices.size());
+  tensor_data_vec.reserve(indices.size());
+  for (auto index : indices) {
+    LazyTensorPtr& tensor = (*tensors)[index];
+    Value ir_value = tensor->CurrentIrValue();
+    ir_values.push_back(ir_value);
+    const BackendDevice& tensor_device = tensor->GetDevice();
+    BackendDataPtr handle = getBackend()->CreateDataPlaceholder(
+        tensor_device, std::move(tensor->shape()));
+    tensor_data_vec.push_back(handle);
+    if (tensor->CurrentDataHandle() == nullptr && config.sync_ltc_data) {
+      tensor->AssignIrValue(Value());
+    }
+  }
+}
+
+std::vector<torch::lazy::BackendDataPtr> LazyGraphExecutor::SetTensorData(
+    std::vector<LazyTensorPtr>* tensors,
+    const SyncTensorsConfig& config,
+    c10::ArrayRef<size_t> indices,
+    const std::vector<BackendDataPtr>& tensor_data_vec) {
   std::vector<BackendDataPtr> tensors_data;
   tensors_data.reserve(indices.size());
-  for (auto index : indices) {
+  for (int i = 0; i < indices.size(); i++) {
+    auto index = indices[i];
     LazyTensorPtr& tensor = (*tensors)[index];
     // If the config.force_ltc_data flag is true, the purpose of this tensor
     // sync operation is to truncate the IR graph and materialize device data in
@@ -707,11 +731,12 @@ std::vector<BackendDataPtr> LazyGraphExecutor::FetchTensorData(
     // completes.
     BackendDataPtr handle = tensor->CurrentDataHandle();
     if (handle == nullptr && config.force_ltc_data) {
-      const BackendDevice& tensor_device = tensor->GetDevice();
-      handle = getBackend()->CreateDataPlaceholder(
-          tensor_device, std::move(tensor->shape()));
-
-      tensor->SetDataHandle(handle, config.sync_ltc_data);
+      handle = tensor_data_vec[i];
+      // Note: We are not using SetHandleData method here since that method
+      // resets the ir_value. We have already done the resetting as part
+      // of ExtractIRAndPrepareTensorData to overlap with previous execution.
+      tensor->data()->handle = handle;
+      tensor->data()->tensor_data = c10::nullopt;
     }
     tensors_data.emplace_back(std::move(handle));
   }
@@ -719,12 +744,11 @@ std::vector<BackendDataPtr> LazyGraphExecutor::FetchTensorData(
 }
 
 LazyGraphExecutor::PostOrderData LazyGraphExecutor::RunPostOrder(
-    const std::vector<LazyTensorPtr>& tensors,
+    const std::vector<Value>& ir_values,
     SyncTensorCollection* coll) {
   std::vector<const Node*> roots;
-  roots.reserve(coll->indices.size());
-  for (auto index : coll->indices) {
-    Value ir_value = tensors.at(index)->CurrentIrValue();
+  roots.reserve(ir_values.size());
+  for (auto ir_value : ir_values) {
     roots.push_back(ir_value.node.get());
   }
   PostOrderData po_data;
@@ -755,7 +779,8 @@ LazyGraphExecutor::PostOrderData LazyGraphExecutor::RunPostOrder(
 std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::TryRunCachedSync(
     std::vector<LazyTensorPtr>* tensors,
     SyncTensorCollection* coll,
-    PostOrderData* po_data) {
+    PostOrderData* po_data,
+    const std::vector<BackendDataPtr>& tensor_data_vec) {
   ComputationCache::TypePtr cached_computation =
       LookupCachedCompile(coll->hash);
   if (cached_computation == nullptr) {
@@ -772,21 +797,22 @@ std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::TryRunCachedSync(
       tensors,
       coll,
       std::move(po_data->parameters_data),
-      std::move(cached_computation));
+      std::move(cached_computation),
+      tensor_data_vec);
 }
 
 LazyGraphExecutor::CompilationResult LazyGraphExecutor::Compile(
     const std::vector<LazyTensorPtr>& tensors,
     c10::ArrayRef<std::string> devices,
     const SyncTensorCollection& coll,
-    PostOrderData* po_data) {
+    PostOrderData* po_data,
+    const std::vector<Value>& ir_values) {
   auto lowering_ctx = LoweringContext::Create(
       "SyncTensorsGraph",
       coll.device,
       po_data->post_order,
       std::move(po_data->emission_map));
-  for (auto index : coll.indices) {
-    Value ir_value = tensors[index]->CurrentIrValue();
+  for (auto ir_value : ir_values) {
     lowering_ctx->AddResult(ir_value);
   }
 
@@ -851,17 +877,23 @@ std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::
     TensorCollectionBarrier(&coll);
     return nullptr;
   }
-  PostOrderData po_data = RunPostOrder(*tensors, &coll);
   DebugUtil::SaveTensorsGraphInfo(
       "ScheduleSyncTensorsGraph", *tensors, &coll.indices);
+  std::vector<Value> ir_values;
+  std::vector<BackendDataPtr> tensor_data_vec;
+  ExtractIRAndPrepareTensorData(
+      tensors, coll.config, coll.indices, ir_values, tensor_data_vec);
+  PostOrderData po_data = RunPostOrder(ir_values, &coll);
   coll.hash = HashCombine(coll.hash, Hash(po_data.parameter_sequence));
   VLOG(4) << "Parameter sequence graph hash " << HashToString(coll.hash);
-  std::shared_ptr<Async> async = TryRunCachedSync(tensors, &coll, &po_data);
+  std::shared_ptr<Async> async =
+      TryRunCachedSync(tensors, &coll, &po_data, tensor_data_vec);
   if (async != nullptr) {
     return async;
   }
 
-  CompilationResult compile_result = Compile(*tensors, devices, coll, &po_data);
+  CompilationResult compile_result =
+      Compile(*tensors, devices, coll, &po_data, ir_values);
   if (GRAPH_DUMP_ENABLED) {
     auto* comp = compile_result.computation.get();
     LOG(ERROR) << "Add a cached computation with hash " << coll.hash
@@ -880,7 +912,8 @@ std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::
       tensors,
       &coll,
       std::move(compile_result.parameters_data),
-      std::move(cached_computation));
+      std::move(cached_computation),
+      tensor_data_vec);
 }
 
 std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::
@@ -951,8 +984,10 @@ std::shared_ptr<LazyGraphExecutor::Async> LazyGraphExecutor::
         std::vector<LazyTensorPtr>* tensors,
         SyncTensorCollection* coll,
         std::vector<BackendDataPtr> parameters_data,
-        ComputationCache::TypePtr cached_computation) {
-  auto tensors_data = FetchTensorData(tensors, coll->config, coll->indices);
+        ComputationCache::TypePtr cached_computation,
+        const std::vector<BackendDataPtr>& tensor_data_vec) {
+  auto tensors_data =
+      SetTensorData(tensors, coll->config, coll->indices, tensor_data_vec);
   return ScheduleSyncTensorsGraph(
       coll,
       std::move(parameters_data),
@@ -1072,7 +1107,12 @@ hash_t LazyGraphExecutor::GetGraphHash(
   config.sync_ltc_data = false;
 
   auto coll = CollectSyncTensors(tensors, config);
-  auto po_data = RunPostOrder(tensors, &coll);
+  std::vector<Value> ir_values;
+  for (auto index : coll.indices) {
+    Value ir_value = tensors[index]->CurrentIrValue();
+    ir_values.push_back(ir_value);
+  }
+  auto po_data = RunPostOrder(ir_values, &coll);
   coll.hash = HashCombine(coll.hash, Hash(po_data.parameter_sequence));
   return coll.hash;
 }
diff --git a/torch/csrc/lazy/core/lazy_graph_executor.h b/torch/csrc/lazy/core/lazy_graph_executor.h
index ec15ae5fe22e5..10b41b64a6174 100644
--- a/torch/csrc/lazy/core/lazy_graph_executor.h
+++ b/torch/csrc/lazy/core/lazy_graph_executor.h
@@ -200,24 +200,35 @@ class TORCH_API LazyGraphExecutor {
       const std::vector<LazyTensorPtr>& tensors,
       c10::ArrayRef<size_t> indices);
 
-  std::vector<BackendDataPtr> FetchTensorData(
+  std::vector<BackendDataPtr> SetTensorData(
       std::vector<LazyTensorPtr>* tensors,
       const SyncTensorsConfig& config,
-      c10::ArrayRef<size_t> indices);
+      c10::ArrayRef<size_t> indices,
+      const std::vector<torch::lazy::BackendDataPtr>& tensor_data_vec);
+
+  void ExtractIRAndPrepareTensorData(
+      std::vector<LazyTensorPtr>* tensors,
+      const SyncTensorsConfig& config,
+      c10::ArrayRef<size_t> indices,
+      std::vector<Value>& ir_values,
+      std::vector<BackendDataPtr>& tensor_data_vec);
 
   PostOrderData RunPostOrder(
-      const std::vector<LazyTensorPtr>& tensors,
+      const std::vector<Value>& ir_values,
       SyncTensorCollection* coll);
+
   std::shared_ptr<Async> TryRunCachedSync(
       std::vector<LazyTensorPtr>* tensors,
       SyncTensorCollection* coll,
-      PostOrderData* po_data);
+      PostOrderData* po_data,
+      const std::vector<BackendDataPtr>& tensor_data_vec);
 
   CompilationResult Compile(
       const std::vector<LazyTensorPtr>& tensors,
       c10::ArrayRef<std::string> devices,
       const SyncTensorCollection& coll,
-      PostOrderData* po_data);
+      PostOrderData* po_data,
+      const std::vector<Value>& ir_values);
 
   ComputationCache::TypePtr LookupCachedCompile(const hash_t& hash);
 
@@ -239,7 +250,8 @@ class TORCH_API LazyGraphExecutor {
       std::vector<LazyTensorPtr>* tensors,
       SyncTensorCollection* coll,
       std::vector<BackendDataPtr> parameters_data,
-      ComputationCache::TypePtr cached_computation);
+      ComputationCache::TypePtr cached_computation,
+      const std::vector<BackendDataPtr>& tensor_data_vec);
 
   std::vector<at::Tensor> GetTensorsFused(std::vector<LazyTensorPtr>* tensors);
 
diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h
index a5f94ea2fbec6..c58b773e07b1f 100644
--- a/torch/csrc/lazy/core/tensor.h
+++ b/torch/csrc/lazy/core/tensor.h
@@ -123,6 +123,8 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
   // Applies the queue of operations in preparation for using the data.
   void ApplyPendingGraph();
 
+  void AssignIrValue(Value ir_value) const;
+
  private:
   LazyTensor(const at::Tensor& tensor, const BackendDevice& device);
   LazyTensor(Value ir_value, const BackendDevice& device);
@@ -133,8 +135,6 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
     return data_;
   }
 
-  void AssignIrValue(Value ir_value) const;
-
   void SetTensorData(at::Tensor tensor_data);
 
   Value CreateTensorNode(BackendDataPtr data, bool read_only) const;

From 14f2f987a7ff8587dc3e44c7cabde5fbd18d60af Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxxc@gmail.com>
Date: Thu, 8 Dec 2022 08:31:49 +0000
Subject: [PATCH 1718/1922] Avoid overflow in tensorboard image summary
 (#90423)

Fix #90419

Added some code such that the test will update the expect files when `expecttest.ACCEPT` is True.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90423
Approved by: https://github.com/soumith
---
 ...tTensorBoard.test_image_with_3_channel_batched.expect | 2 +-
 test/expect/TestTensorBoard.test_image_with_boxes.expect | 2 +-
 .../TestTensorBoard.test_image_with_one_channel.expect   | 2 +-
 ...ensorBoard.test_image_with_one_channel_batched.expect | 2 +-
 .../TestTensorBoard.test_image_without_channel.expect    | 2 +-
 test/test_tensorboard.py                                 | 9 +++++++++
 torch/utils/tensorboard/summary.py                       | 9 +++++----
 7 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/test/expect/TestTensorBoard.test_image_with_3_channel_batched.expect b/test/expect/TestTensorBoard.test_image_with_3_channel_batched.expect
index 2895ff76fdb8f..bc63fadcd04d0 100644
--- a/test/expect/TestTensorBoard.test_image_with_3_channel_batched.expect
+++ b/test/expect/TestTensorBoard.test_image_with_3_channel_batched.expect
@@ -4,6 +4,6 @@ value {
     height: 8
     width: 16
     colorspace: 3
-    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\020\000\000\000\010\010\002\000\000\000\177\024\350\300\000\000\000+IDATx\234cd8\320\360\037\033pww\307*\316\362\343\307\217\037\330$~\374\370\361\037\233\004\013\016\365\377q\211\217H\r\000d\305y\224,\220Z\033\000\000\000\000IEND\256B`\202"
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\020\000\000\000\010\010\002\000\000\000\177\024\350\300\000\000\000\034IDATx\234cd\370\377\377?\003\t\200\211$\325\014\014\014L$\252\037\231\032\000\355.\004\014i.\207\035\000\000\000\000IEND\256B`\202"
   }
 }
diff --git a/test/expect/TestTensorBoard.test_image_with_boxes.expect b/test/expect/TestTensorBoard.test_image_with_boxes.expect
index 4364b4841ef1d..1c28992dfa67c 100644
--- a/test/expect/TestTensorBoard.test_image_with_boxes.expect
+++ b/test/expect/TestTensorBoard.test_image_with_boxes.expect
@@ -4,6 +4,6 @@ value {
     height: 32
     width: 32
     colorspace: 3
-    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000 \000\000\000 \010\002\000\000\000\374\030\355\243\000\000\000sIDATx\234\355\323=\n\300 \014\005\340\027p\250\267p\324\373\332\373\345\020vn\007\367>0\204b\311\233\305/\344G\000\334\236\021Uu\005R\000\377\007\244\224\342\013||\007\2655\330BfP\215\337S`>:{_l\020\335\242\tX6-\000\032r\007G\316\000\2561\226\201\244\252/\005V\357\026\271\003\033\0149\000\232\270\003+\260\301\220\003\240y\000T\221\324V\250_v\320\000\000\000\000IEND\256B`\202"
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000 \000\000\000 \010\002\000\000\000\374\030\355\243\000\000\000CIDATx\234cd\370\377\377?\003\r\001\023MMg```\242\261\371\243\026\214Z@\005\300B@\236\221\221B\013\006\334\007\020@Ai2\364#y\324\202Q\013F-\030\265`\324\202Q\013\206\207\005\0008\302\006@\2475\013\321\000\000\000\000IEND\256B`\202"
   }
 }
diff --git a/test/expect/TestTensorBoard.test_image_with_one_channel.expect b/test/expect/TestTensorBoard.test_image_with_one_channel.expect
index 7b43f507fc2d2..c37098115c1f6 100644
--- a/test/expect/TestTensorBoard.test_image_with_one_channel.expect
+++ b/test/expect/TestTensorBoard.test_image_with_one_channel.expect
@@ -4,6 +4,6 @@ value {
     height: 8
     width: 8
     colorspace: 3
-    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\010\000\000\000\010\010\002\000\000\000Km)\334\000\000\000\031IDATx\234cd``\370\217\r0\376\370\361\003\253\004\313\240\224\000\000;\267\273\313%\020=\255\000\000\000\000IEND\256B`\202"
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\010\000\000\000\010\010\002\000\000\000Km)\334\000\000\000\035IDATx\234cd``\370\377\377?\003\006`\302*\312\300\300\300\204Ut\240%\000R\364\006\n\'\250a\364\000\000\000\000IEND\256B`\202"
   }
 }
diff --git a/test/expect/TestTensorBoard.test_image_with_one_channel_batched.expect b/test/expect/TestTensorBoard.test_image_with_one_channel_batched.expect
index e16187d04cb8e..8bd3a721b29f7 100644
--- a/test/expect/TestTensorBoard.test_image_with_one_channel_batched.expect
+++ b/test/expect/TestTensorBoard.test_image_with_one_channel_batched.expect
@@ -4,6 +4,6 @@ value {
     height: 8
     width: 16
     colorspace: 3
-    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\020\000\000\000\010\010\002\000\000\000\177\024\350\300\000\000\000(IDATx\234cd``\370\217\r\034?~\034\2538\313\217\037?~\374\370\201)\201U\020\252\001\253\304\250\006$\000\000\230\346y\315\204l;t\000\000\000\000IEND\256B`\202"
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\020\000\000\000\010\010\002\000\000\000\177\024\350\300\000\000\000\036IDATx\234cd``\370\377\377?\003\321\200\211$\325\014\014\014L$\251\036\251\032\000\215\270\006\nS2\367\330\000\000\000\000IEND\256B`\202"
   }
 }
diff --git a/test/expect/TestTensorBoard.test_image_without_channel.expect b/test/expect/TestTensorBoard.test_image_without_channel.expect
index 7b43f507fc2d2..c37098115c1f6 100644
--- a/test/expect/TestTensorBoard.test_image_without_channel.expect
+++ b/test/expect/TestTensorBoard.test_image_without_channel.expect
@@ -4,6 +4,6 @@ value {
     height: 8
     width: 8
     colorspace: 3
-    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\010\000\000\000\010\010\002\000\000\000Km)\334\000\000\000\031IDATx\234cd``\370\217\r0\376\370\361\003\253\004\313\240\224\000\000;\267\273\313%\020=\255\000\000\000\000IEND\256B`\202"
+    encoded_image_string: "\211PNG\r\n\032\n\000\000\000\rIHDR\000\000\000\010\000\000\000\010\010\002\000\000\000Km)\334\000\000\000\035IDATx\234cd``\370\377\377?\003\006`\302*\312\300\300\300\204Ut\240%\000R\364\006\n\'\250a\364\000\000\000\000IEND\256B`\202"
   }
 }
diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py
index e836b0f1ba8dc..3899bbead7f1f 100644
--- a/test/test_tensorboard.py
+++ b/test/test_tensorboard.py
@@ -7,6 +7,7 @@
 import sys
 import unittest
 import uuid
+import expecttest
 
 TEST_TENSORBOARD = True
 try:
@@ -520,6 +521,11 @@ def read_expected_content(function_ptr):
         return f.read()
 
 def compare_image_proto(actual_proto, function_ptr):
+    if expecttest.ACCEPT:
+        expected_file = get_expected_file(function_ptr)
+        with open(expected_file, 'w') as f:
+            f.write(text_format.MessageToString(actual_proto))
+        return True
     expected_str = read_expected_content(function_ptr)
     expected_proto = Summary()
     text_format.Parse(expected_str, expected_proto)
@@ -537,6 +543,9 @@ def compare_image_proto(actual_proto, function_ptr):
     )
 
 def compare_proto(str_to_compare, function_ptr):
+    if expecttest.ACCEPT:
+        write_proto(str_to_compare, function_ptr)
+        return True
     expected = read_expected_content(function_ptr)
     str_to_compare = str(str_to_compare)
     return remove_whitespace(str_to_compare) == remove_whitespace(expected)
diff --git a/torch/utils/tensorboard/summary.py b/torch/utils/tensorboard/summary.py
index bfa8600b34c24..1ddf603d4f745 100644
--- a/torch/utils/tensorboard/summary.py
+++ b/torch/utils/tensorboard/summary.py
@@ -433,7 +433,8 @@ def image(tag, tensor, rescale=1, dataformats="NCHW"):
         channels]` where `channels` is 1, 3, or 4.
         'tensor' can either have values in [0, 1] (float32) or [0, 255] (uint8).
         The image() function will scale the image values to [0, 255] by applying
-        a scale factor of either 1 (uint8) or 255 (float32).
+        a scale factor of either 1 (uint8) or 255 (float32). Out-of-range values
+        will be clipped.
     Returns:
       A scalar `Tensor` of type `string`. The serialized `Summary` protocol
       buffer.
@@ -443,7 +444,7 @@ def image(tag, tensor, rescale=1, dataformats="NCHW"):
     # Do not assume that user passes in values in [0, 255], use data type to detect
     scale_factor = _calc_scale_factor(tensor)
     tensor = tensor.astype(np.float32)
-    tensor = (tensor * scale_factor).astype(np.uint8)
+    tensor = (tensor * scale_factor).clip(0, 255).astype(np.uint8)
     image = make_image(tensor, rescale=rescale)
     return Summary(value=[Summary.Value(tag=tag, image=image)])
 
@@ -457,7 +458,7 @@ def image_boxes(
     tensor_boxes = make_np(tensor_boxes)
     tensor_image = tensor_image.astype(np.float32) * _calc_scale_factor(tensor_image)
     image = make_image(
-        tensor_image.astype(np.uint8), rescale=rescale, rois=tensor_boxes, labels=labels
+        tensor_image.clip(0, 255).astype(np.uint8), rescale=rescale, rois=tensor_boxes, labels=labels
     )
     return Summary(value=[Summary.Value(tag=tag, image=image)])
 
@@ -513,7 +514,7 @@ def video(tag, tensor, fps=4):
     # If user passes in uint8, then we don't need to rescale by 255
     scale_factor = _calc_scale_factor(tensor)
     tensor = tensor.astype(np.float32)
-    tensor = (tensor * scale_factor).astype(np.uint8)
+    tensor = (tensor * scale_factor).clip(0, 255).astype(np.uint8)
     video = make_video(tensor, fps)
     return Summary(value=[Summary.Value(tag=tag, image=video)])
 

From ffc7f50439d703bb4f71250cf48a8edc0e128a90 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxx@users.noreply.github.com>
Date: Thu, 8 Dec 2022 08:53:24 +0000
Subject: [PATCH 1719/1922] Remove deprecated call to
 tf.io.gfile.get_filesystem (#89832)

Fixes #30966 . Fixes #47139
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89832
Approved by: https://github.com/soumith
---
 torch/utils/tensorboard/_embedding.py | 39 ++++++++++++++++++---------
 torch/utils/tensorboard/writer.py     |  4 +--
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/torch/utils/tensorboard/_embedding.py b/torch/utils/tensorboard/_embedding.py
index 7d7d180c28091..f172e09260833 100644
--- a/torch/utils/tensorboard/_embedding.py
+++ b/torch/utils/tensorboard/_embedding.py
@@ -6,6 +6,20 @@
 from tensorboard.plugins.projector.projector_config_pb2 import EmbeddingInfo
 
 
+_HAS_GFILE_JOIN = hasattr(tf.io.gfile, "join")
+
+
+def _gfile_join(a, b):
+    # The join API is different between tensorboard's TF stub and TF:
+    # https://github.com/tensorflow/tensorboard/issues/6080
+    # We need to try both because `tf` may point to either the stub or the real TF.
+    if _HAS_GFILE_JOIN:
+        return tf.io.gfile.join(a, b)
+    else:
+        fs = tf.io.gfile.get_filesystem(a)
+        return fs.join(a, b)
+
+
 def make_tsv(metadata, save_path, metadata_header=None):
     if not metadata_header:
         metadata = [str(x) for x in metadata]
@@ -16,8 +30,8 @@ def make_tsv(metadata, save_path, metadata_header=None):
         metadata = ["\t".join(str(e) for e in l) for l in [metadata_header] + metadata]
 
     metadata_bytes = tf.compat.as_bytes("\n".join(metadata) + "\n")
-    fs = tf.io.gfile.get_filesystem(save_path)
-    fs.write(fs.join(save_path, "metadata.tsv"), metadata_bytes, binary_mode=True)
+    with tf.io.gfile.GFile(_gfile_join(save_path, "metadata.tsv"), "wb") as f:
+        f.write(metadata_bytes)
 
 
 # https://github.com/tensorflow/tensorboard/issues/44 image label will be squared
@@ -42,31 +56,30 @@ def make_sprite(label_img, save_path):
         im.save(buf, format="PNG")
         im_bytes = buf.getvalue()
 
-    fs = tf.io.gfile.get_filesystem(save_path)
-    fs.write(fs.join(save_path, "sprite.png"), im_bytes, binary_mode=True)
+    with tf.io.gfile.GFile(_gfile_join(save_path, "sprite.png"), "wb") as f:
+        f.write(im_bytes)
 
 
-def get_embedding_info(metadata, label_img, filesys, subdir, global_step, tag):
+def get_embedding_info(metadata, label_img, subdir, global_step, tag):
     info = EmbeddingInfo()
     info.tensor_name = "{}:{}".format(tag, str(global_step).zfill(5))
-    info.tensor_path = filesys.join(subdir, "tensors.tsv")
+    info.tensor_path = _gfile_join(subdir, "tensors.tsv")
     if metadata is not None:
-        info.metadata_path = filesys.join(subdir, "metadata.tsv")
+        info.metadata_path = _gfile_join(subdir, "metadata.tsv")
     if label_img is not None:
-        info.sprite.image_path = filesys.join(subdir, "sprite.png")
+        info.sprite.image_path = _gfile_join(subdir, "sprite.png")
         info.sprite.single_image_dim.extend([label_img.size(3), label_img.size(2)])
     return info
 
 
 def write_pbtxt(save_path, contents):
-    fs = tf.io.gfile.get_filesystem(save_path)
-    config_path = fs.join(save_path, "projector_config.pbtxt")
-    fs.write(config_path, tf.compat.as_bytes(contents), binary_mode=True)
+    config_path = _gfile_join(save_path, "projector_config.pbtxt")
+    with tf.io.gfile.GFile(config_path, "wb") as f:
+        f.write(tf.compat.as_bytes(contents))
 
 
 def make_mat(matlist, save_path):
-    fs = tf.io.gfile.get_filesystem(save_path)
-    with tf.io.gfile.GFile(fs.join(save_path, "tensors.tsv"), "wb") as f:
+    with tf.io.gfile.GFile(_gfile_join(save_path, "tensors.tsv"), "wb") as f:
         for x in matlist:
             x = [str(i.item()) for i in x]
             f.write(tf.compat.as_bytes("\t".join(x) + "\n"))
diff --git a/torch/utils/tensorboard/writer.py b/torch/utils/tensorboard/writer.py
index 70b654384ff08..83bd0a25d1034 100644
--- a/torch/utils/tensorboard/writer.py
+++ b/torch/utils/tensorboard/writer.py
@@ -923,7 +923,7 @@ def add_embedding(
         subdir = "%s/%s" % (str(global_step).zfill(5), self._encode(tag))
         save_path = os.path.join(self._get_file_writer().get_logdir(), subdir)
 
-        fs = tf.io.gfile.get_filesystem(save_path)
+        fs = tf.io.gfile
         if fs.exists(save_path):
             if fs.isdir(save_path):
                 print(
@@ -959,7 +959,7 @@ def add_embedding(
         if not hasattr(self, "_projector_config"):
             self._projector_config = ProjectorConfig()
         embedding_info = get_embedding_info(
-            metadata, label_img, fs, subdir, global_step, tag
+            metadata, label_img, subdir, global_step, tag
         )
         self._projector_config.embeddings.extend([embedding_info])
 

From e248f08e31fb01d90323bf03113de302046a8123 Mon Sep 17 00:00:00 2001
From: Till Hoffmann <tillahoffmann@gmail.com>
Date: Thu, 8 Dec 2022 09:18:40 +0000
Subject: [PATCH 1720/1922] Add a transform for positive-definite matrices.
 (#76777)

The `PositiveDefiniteTransform` is required to transform from an unconstrained space to positive definite matrices, e.g. to support testing the Wishart mode in #76690. It is a simple extension of the `LowerCholeskyTransform`.

I've also added a small test that ensures the generated data belong to the domain of the associated transform. Previously, the data generated for the inverse transform of the `LowerCholeskyTransform` wasn't part of the domain, and the test only passed because the comparison uses `equal_nan=True`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/76777
Approved by: https://github.com/lezcano, https://github.com/fritzo, https://github.com/soumith
---
 test/distributions/test_constraints.py     |  1 +
 test/distributions/test_distributions.py   |  2 --
 test/distributions/test_transforms.py      | 16 ++++++++++++----
 torch/distributions/constraint_registry.py |  6 ++++++
 torch/distributions/transforms.py          | 20 ++++++++++++++++++++
 5 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/test/distributions/test_constraints.py b/test/distributions/test_constraints.py
index f0c0023af3d34..475d9f33ec9a3 100644
--- a/test/distributions/test_constraints.py
+++ b/test/distributions/test_constraints.py
@@ -53,6 +53,7 @@
     (constraints.simplex,),
     (constraints.corr_cholesky,),
     (constraints.lower_cholesky,),
+    (constraints.positive_definite,),
 ]
 
 
diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py
index 84236b5b51e02..a5687a4e1439e 100644
--- a/test/distributions/test_distributions.py
+++ b/test/distributions/test_distributions.py
@@ -3188,8 +3188,6 @@ def _test_discrete_distribution_mode(self, dist, sanitized_mode, batch_isfinite)
             self.assertTrue((-1e-12 < delta[mask].detach()).all())  # Allow up to 1e-12 rounding error.
 
     def _test_continuous_distribution_mode(self, dist, sanitized_mode, batch_isfinite):
-        if isinstance(dist, Wishart):
-            return
         # We perturb the mode in the unconstrained space and expect the log probability to decrease.
         num_points = 10
         transform = transform_to(dist.support)
diff --git a/test/distributions/test_transforms.py b/test/distributions/test_transforms.py
index ea99562b1f0c6..d922c83672287 100644
--- a/test/distributions/test_transforms.py
+++ b/test/distributions/test_transforms.py
@@ -14,7 +14,8 @@
                                             LowerCholeskyTransform, PowerTransform,
                                             ReshapeTransform, SigmoidTransform, TanhTransform,
                                             SoftmaxTransform, SoftplusTransform, StickBreakingTransform,
-                                            identity_transform, Transform, _InverseTransform)
+                                            identity_transform, Transform, _InverseTransform,
+                                            PositiveDefiniteTransform)
 from torch.distributions.utils import tril_matrix_to_vec, vec_to_tril_matrix
 
 
@@ -43,6 +44,7 @@ def get_transforms(cache_size):
         StickBreakingTransform(cache_size=cache_size),
         LowerCholeskyTransform(cache_size=cache_size),
         CorrCholeskyTransform(cache_size=cache_size),
+        PositiveDefiniteTransform(cache_size=cache_size),
         ComposeTransform([
             AffineTransform(torch.randn(4, 5),
                             torch.randn(4, 5),
@@ -118,10 +120,15 @@ def generate_data(transform):
         domain = domain.base_constraint
     codomain = transform.codomain
     x = torch.empty(4, 5)
-    if domain is constraints.lower_cholesky or codomain is constraints.lower_cholesky:
-        x = torch.empty(6, 6)
-        x = x.normal_()
+    positive_definite_constraints = [constraints.lower_cholesky, constraints.positive_definite]
+    if domain in positive_definite_constraints:
+        x = torch.randn(6, 6)
+        x = x.tril(-1) + x.diag().exp().diag_embed()
+        if domain is constraints.positive_definite:
+            return x @ x.T
         return x
+    elif codomain in positive_definite_constraints:
+        return torch.randn(6, 6)
     elif domain is constraints.real:
         return x.normal_()
     elif domain is constraints.real_vector:
@@ -189,6 +196,7 @@ def test_with_cache(transform):
 @pytest.mark.parametrize('test_cached', [True, False])
 def test_forward_inverse(transform, test_cached):
     x = generate_data(transform).requires_grad_()
+    assert transform.domain.check(x).all()  # verify that the input data are valid
     try:
         y = transform(x)
     except NotImplementedError:
diff --git a/torch/distributions/constraint_registry.py b/torch/distributions/constraint_registry.py
index dc0f07ff7b13e..a9709ba4bfc0e 100644
--- a/torch/distributions/constraint_registry.py
+++ b/torch/distributions/constraint_registry.py
@@ -229,6 +229,12 @@ def _transform_to_lower_cholesky(constraint):
     return transforms.LowerCholeskyTransform()
 
 
+@transform_to.register(constraints.positive_definite)
+@transform_to.register(constraints.positive_semidefinite)
+def _transform_to_positive_definite(constraint):
+    return transforms.PositiveDefiniteTransform()
+
+
 @biject_to.register(constraints.corr_cholesky)
 @transform_to.register(constraints.corr_cholesky)
 def _transform_to_corr_cholesky(constraint):
diff --git a/torch/distributions/transforms.py b/torch/distributions/transforms.py
index c73f33023275f..bf0cd8bb097c4 100644
--- a/torch/distributions/transforms.py
+++ b/torch/distributions/transforms.py
@@ -24,6 +24,7 @@
     'ExpTransform',
     'IndependentTransform',
     'LowerCholeskyTransform',
+    'PositiveDefiniteTransform',
     'PowerTransform',
     'ReshapeTransform',
     'SigmoidTransform',
@@ -972,6 +973,25 @@ def _inverse(self, y):
         return y.tril(-1) + y.diagonal(dim1=-2, dim2=-1).log().diag_embed()
 
 
+class PositiveDefiniteTransform(Transform):
+    """
+    Transform from unconstrained matrices to positive-definite matrices.
+    """
+    domain = constraints.independent(constraints.real, 2)
+    codomain = constraints.positive_definite  # type: ignore[assignment]
+
+    def __eq__(self, other):
+        return isinstance(other, PositiveDefiniteTransform)
+
+    def _call(self, x):
+        x = LowerCholeskyTransform()(x)
+        return x @ x.mT
+
+    def _inverse(self, y):
+        y = torch.linalg.cholesky(y)
+        return LowerCholeskyTransform().inv(y)
+
+
 class CatTransform(Transform):
     """
     Transform functor that applies a sequence of transforms `tseq`

From 418bddab9cb96d3179590d2beadb1e1cf577f561 Mon Sep 17 00:00:00 2001
From: yanbing-j <yanbing.jiang@intel.com>
Date: Thu, 8 Dec 2022 09:41:02 +0000
Subject: [PATCH 1721/1922] Upgrade oneDNN to v2.7.2 (#90051)

This PR is to upgrade oneDNN to v2.7.2.

### oneDNN v2.7.1 & 2.7.2 changes:
Fixes #89104
Updated ITT API version to 3.23.0

### Performance Benchmark
Use TorchBench test in ICX with 40 cores
Intel OpenMP & tcmalloc were preloaded
![image](https://user-images.githubusercontent.com/61222868/205240855-04e2d50f-8b3a-4097-9038-fdd0c0fc93b9.png)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90051
Approved by: https://github.com/XiaobingSuper, https://github.com/jgong5
---
 third_party/ideep         | 2 +-
 third_party/mkl-dnn.BUILD | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/third_party/ideep b/third_party/ideep
index 5ddc65efe0428..e533c771a1e75 160000
--- a/third_party/ideep
+++ b/third_party/ideep
@@ -1 +1 @@
-Subproject commit 5ddc65efe0428bbce2942b3ce5e3ce15239abe2f
+Subproject commit e533c771a1e75a1c225c14b2261eefa62681d9e6
diff --git a/third_party/mkl-dnn.BUILD b/third_party/mkl-dnn.BUILD
index fb41d31e89a84..6179b860c2030 100644
--- a/third_party/mkl-dnn.BUILD
+++ b/third_party/mkl-dnn.BUILD
@@ -55,8 +55,8 @@ template_rule(
     substitutions = {
         "@DNNL_VERSION_MAJOR@": "2",
         "@DNNL_VERSION_MINOR@": "7",
-        "@DNNL_VERSION_PATCH@": "0",
-        "@DNNL_VERSION_HASH@": "650085b2f3643aad05c629425983491d63b5c289",
+        "@DNNL_VERSION_PATCH@": "2",
+        "@DNNL_VERSION_HASH@": "fbec3e25a559ee252022ae066817b204e106a6ba",
     },
 )
 

From bf21b97f1ad704aa9a707c744f487d23c4c361e8 Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Thu, 8 Dec 2022 03:03:48 +0000
Subject: [PATCH 1722/1922] [inductor] Use decomposition for _to_copy (#90314)

Summary: also contains a fix for https://github.com/pytorch/pytorch/issues/89633

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90314
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor_opinfo.py |  2 --
 torch/_decomp/decompositions.py            |  3 ++-
 torch/_inductor/decomposition.py           |  1 +
 torch/_inductor/lowering.py                | 31 +++-------------------
 4 files changed, 6 insertions(+), 31 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index b2ebb479d7e00..915771362fcba 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -237,7 +237,6 @@ def process(device_type):
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
     "tensor_split": {b8, f16, f32, f64, i32, i64},
-    "to": {b8, f16, f32, f64, i32, i64},
     "to_sparse": {f32, f64},
     "tril": {f16},
     "triu": {f16},
@@ -336,7 +335,6 @@ def process(device_type):
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
     "tensor_split": {b8, f16, f32, f64, i32, i64},
-    "to": {b8, f16, f32, f64, i32, i64},
     "to_sparse": {f16, f32, f64},
     "uniform": {f16, f32, f64},
     "unique": {b8, f16, f32, f64, i32, i64},
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 1a8335dc292a1..0e8e440084da8 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1523,7 +1523,8 @@ def _to_copy(
 ):
     assert not layout or layout == torch.strided, "TODO"
     assert not pin_memory, "TODO"
-    assert device is not None or dtype is not None or memory_format is not None
+    if device is None and dtype is None and memory_format is None:
+        return x.clone()
     dtype_converted = False
     if device is not None and device != x.get_device():
         # avoid conversions on cpu
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index f8fedcc786015..0492974b933d5 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -98,6 +98,7 @@
         aten.t,
         aten.tanh_backward,
         aten.threshold_backward,
+        aten._to_copy,
         aten.transpose.int,
         aten.tril.default,
         aten.unfold,
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 8f2e87f2f3b54..397aa9408c9a0 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -370,6 +370,7 @@ def _to_dtype(x):
     return make_pointwise(_to_dtype, override_return_dtype=dtype)(x)
 
 
+@register_lowering(prims.device_put, type_promotion_kind=None)
 def to_device(x: TensorBox, device: torch.device):
     device = decode_device(device)
     if x.get_device() == device:
@@ -377,32 +378,6 @@ def to_device(x: TensorBox, device: torch.device):
     return TensorBox.create(ir.DeviceCopy.create(x, device))
 
 
-@register_lowering(aten._to_copy)
-def _to_copy(
-    x,
-    *,
-    dtype=None,
-    layout=None,
-    device=None,
-    pin_memory=None,
-    non_blocking=False,
-    memory_format=None,
-):
-    assert not layout or layout == torch.strided, "TODO"
-    assert not pin_memory, "TODO"
-    assert not memory_format, "TODO"
-    if device:
-        device = decode_device(device)
-    if device is not None and device != x.get_device():
-        if dtype is not None and device.type == "cpu":
-            # CPU can do fewer type conversions
-            x = to_dtype(x, decode_dtype(dtype))
-        x = to_device(x, device)
-    if dtype is not None:
-        x = to_dtype(x, decode_dtype(dtype))
-    return x
-
-
 def ops_wrapper(name):
     assert isinstance(name, str)
 
@@ -1573,9 +1548,9 @@ def binary_search(start, end):
 def as_tensor(data, dtype=None, device=None):
     if isinstance(data, TensorBox):
         if dtype is not None:
-            data = to(data, dtype)
+            data = to_dtype(data, dtype)
         if device is not None:
-            data = to(data, device)
+            data = to_device(data, device)
         return data
     return tensor(data, dtype=dtype, device=device)
 

From a0905d93a6d73da6ff2189e9c31ab0bcd5038dbc Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 7 Dec 2022 15:39:37 +0000
Subject: [PATCH 1723/1922] [Composable API] Move test models to common file
 (#90385)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90385
Approved by: https://github.com/mrshenli
---
 test/distributed/_composable/test_compose.py  | 73 +++----------------
 .../_internal/common_dist_composable.py       | 60 +++++++++++++++
 2 files changed, 69 insertions(+), 64 deletions(-)
 create mode 100644 torch/testing/_internal/common_dist_composable.py

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
index 8bedec41766ca..5ed0bc3d3819c 100644
--- a/test/distributed/_composable/test_compose.py
+++ b/test/distributed/_composable/test_compose.py
@@ -7,15 +7,20 @@
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed._composable import checkpoint, fully_shard
-from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.distributed.fsdp.api import ShardingStrategy
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+from torch.testing._internal.common_dist_composable import (
+    CompositeModel,
+    CompositeParamModel,
+    UnitModule,
+)
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
 from torch.testing._internal.common_utils import (
-    TEST_WITH_DEV_DBG_ASAN,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
 )
 
 
@@ -32,62 +37,6 @@
     sys.exit(0)
 
 
-class UnitModule(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.l1 = nn.Linear(100, 100)
-        self.seq = nn.Sequential(
-            nn.ReLU(),
-            nn.Linear(100, 100),
-            nn.ReLU(),
-        )
-        self.l2 = nn.Linear(100, 100)
-
-    def forward(self, x):
-        return self.l2(self.seq(self.l1(x)))
-
-
-class CompositeModel(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.l1 = nn.Linear(100, 100)
-        self.u1 = UnitModule()
-        self.u2 = UnitModule()
-        self.l2 = nn.Linear(100, 100)
-
-    def forward(self, x):
-        return self.l2(self.u2(self.u1(self.l1(x))))
-
-
-class UnitParamModule(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.l = nn.Linear(100, 100)
-        self.seq = nn.Sequential(
-            nn.ReLU(),
-            nn.Linear(100, 100),
-            nn.ReLU(),
-        )
-        self.p = nn.Parameter(torch.randn(100, 100))
-
-    def forward(self, x):
-        return torch.mm(self.seq(self.l(x)), self.p)
-
-
-class CompositeParamModel(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.l = nn.Linear(100, 100)
-        self.u1 = UnitModule()
-        self.u2 = UnitModule()
-        self.p = nn.Parameter(torch.randn(100, 100))
-
-    def forward(self, x):
-        a = self.u2(self.u1(self.l(x)))
-        b = self.p
-        return torch.mm(a, b)
-
-
 class TestFSDPCheckpoint(FSDPTest):
     @property
     def world_size(self) -> int:
@@ -158,12 +107,8 @@ def _test_checkpoint_fsdp_submodules(self, use_reentrant):
             policy=ModuleWrapPolicy({UnitModule}),
         )
 
-        test_model.u1.seq = checkpoint(
-            test_model.u1.seq, use_reentrant=use_reentrant
-        )
-        test_model.u2.seq = checkpoint(
-            test_model.u2.seq, use_reentrant=use_reentrant
-        )
+        test_model.u1.seq = checkpoint(test_model.u1.seq, use_reentrant=use_reentrant)
+        test_model.u2.seq = checkpoint(test_model.u2.seq, use_reentrant=use_reentrant)
 
         self.run_subtests(
             {
diff --git a/torch/testing/_internal/common_dist_composable.py b/torch/testing/_internal/common_dist_composable.py
new file mode 100644
index 0000000000000..96fcb7e0bff0c
--- /dev/null
+++ b/torch/testing/_internal/common_dist_composable.py
@@ -0,0 +1,60 @@
+# Owner(s): ["oncall: distributed"]
+
+import torch
+import torch.nn as nn
+
+
+class UnitModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l1 = nn.Linear(100, 100)
+        self.seq = nn.Sequential(
+            nn.ReLU(),
+            nn.Linear(100, 100),
+            nn.ReLU(),
+        )
+        self.l2 = nn.Linear(100, 100)
+
+    def forward(self, x):
+        return self.l2(self.seq(self.l1(x)))
+
+
+class CompositeModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l1 = nn.Linear(100, 100)
+        self.u1 = UnitModule()
+        self.u2 = UnitModule()
+        self.l2 = nn.Linear(100, 100)
+
+    def forward(self, x):
+        return self.l2(self.u2(self.u1(self.l1(x))))
+
+
+class UnitParamModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l = nn.Linear(100, 100)
+        self.seq = nn.Sequential(
+            nn.ReLU(),
+            nn.Linear(100, 100),
+            nn.ReLU(),
+        )
+        self.p = nn.Parameter(torch.randn(100, 100))
+
+    def forward(self, x):
+        return torch.mm(self.seq(self.l(x)), self.p)
+
+
+class CompositeParamModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.l = nn.Linear(100, 100)
+        self.u1 = UnitModule()
+        self.u2 = UnitModule()
+        self.p = nn.Parameter(torch.randn(100, 100))
+
+    def forward(self, x):
+        a = self.u2(self.u1(self.l(x)))
+        b = self.p
+        return torch.mm(a, b)

From 433f1cbe47616b57c19b6f514512f8f0b6b42497 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 7 Dec 2022 15:39:49 +0000
Subject: [PATCH 1724/1922] [Composable API] Refactor `test_fully_shard.py` to
 use common models (#90386)

Unlike for FSDP, where we already diverged to using per-test-file models, let us try to use the same set of models for the composable API effort. This can improve debugging efficiency because we know which module structures we support and which we do not _across all of our composable APIs_.

This PR had to perform some surgery for `test_materialize_meta_module`. Writing a correct parameter initialization function for meta device initialization is not easy, and we should revisit this. The old implementation, which followed the style of the previous unit tests--namely, using `module.to_empty()`--is actually incorrect for nested FSDP applications because `module.to_empty()` will re-initialize already materialized parameters and the module materialization proceeds bottom up. The existing unit test in `test_fsdp_meta.py` passes because it sets every parameter to ones (`self.weight.fill_(1)`), which is idempotent to re-initialization.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90386
Approved by: https://github.com/mrshenli
---
 test/distributed/_composable/test_compose.py  |   8 +-
 .../_composable/test_fully_shard.py           | 154 ++++++++----------
 .../_internal/common_dist_composable.py       |  36 ++--
 3 files changed, 91 insertions(+), 107 deletions(-)

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
index 5ed0bc3d3819c..1e3eda8562ebb 100644
--- a/test/distributed/_composable/test_compose.py
+++ b/test/distributed/_composable/test_compose.py
@@ -70,7 +70,7 @@ def _test_parity(
     @skip_if_lt_x_gpu(2)
     @parametrize("use_reentrant", [True, False])
     def test_wrap_same_submodule(self, use_reentrant: bool):
-        model = UnitModule().to("cuda")
+        model = UnitModule(device=torch.device("cuda"))
 
         base_model = copy.deepcopy(model)
 
@@ -93,7 +93,7 @@ def test_wrap_same_submodule(self, use_reentrant: bool):
         )
 
     def _test_checkpoint_fsdp_submodules(self, use_reentrant):
-        model = CompositeModel().to(torch.device("cuda"))
+        model = CompositeModel(device=torch.device("cuda"))
 
         base_model = copy.deepcopy(model)
 
@@ -134,7 +134,7 @@ def test_checkpoint_fsdp_submodules_non_reentrant(self):
 
     @skip_if_lt_x_gpu(2)
     def test_checkpoint_fsdp_submodules_with_param(self):
-        model = CompositeParamModel().to(torch.device("cuda"))
+        model = CompositeParamModel(device=torch.device("cuda"))
 
         base_model = copy.deepcopy(model)
 
@@ -155,7 +155,7 @@ def test_checkpoint_fsdp_submodules_with_param(self):
 
     @skip_if_lt_x_gpu(2)
     def test_checkpoint_fsdp_submodules_with_param_no_shard(self):
-        model = CompositeParamModel().to(torch.device("cuda"))
+        model = CompositeParamModel(device=torch.device("cuda"))
 
         base_model = copy.deepcopy(model)
 
diff --git a/test/distributed/_composable/test_fully_shard.py b/test/distributed/_composable/test_fully_shard.py
index 1c4c5901ac979..232b1c6b08a9d 100644
--- a/test/distributed/_composable/test_fully_shard.py
+++ b/test/distributed/_composable/test_fully_shard.py
@@ -2,7 +2,7 @@
 
 import copy
 import sys
-from typing import Any, Tuple
+from typing import Optional
 
 import torch
 import torch.distributed as dist
@@ -10,13 +10,14 @@
 from torch.distributed._composable import fully_shard
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp._common_utils import _is_fsdp_flattened
-from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+from torch.distributed.fsdp.wrap import _FSDPPolicy, ModuleWrapPolicy
+from torch.testing._internal.common_dist_composable import (
+    CompositeParamModel,
+    UnitModule,
+)
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest
-from torch.testing._internal.common_utils import (
-    run_tests,
-    TEST_WITH_DEV_DBG_ASAN,
-)
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
@@ -30,42 +31,6 @@
     sys.exit(0)
 
 
-class SubModel(nn.Module):
-    def __init__(self, device) -> None:
-        super().__init__()
-        torch.manual_seed(0)
-        self.lin1 = nn.Linear(5, 5, bias=False, device=device)
-        self.lin2 = nn.Linear(5, 5, bias=False, device=device)
-        self.relu = nn.ReLU()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        z = self.relu(self.lin1(x))
-        z = self.relu(self.lin2(z))
-        return z
-
-
-class Model(nn.Module):
-    def __init__(self, device) -> None:
-        super().__init__()
-        torch.manual_seed(0)
-        self.sub1 = SubModel(device=device)
-        self.sub2 = SubModel(device=device)
-        self.lin = nn.Linear(5, 5, device=device)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        z = self.sub1(x)
-        z = self.sub2(z)
-        z = self.lin(z)
-        return z
-
-    @staticmethod
-    def policy():
-        return ModuleWrapPolicy({SubModel})
-
-    def get_input(self, device=torch.device) -> Tuple[Any, ...]:
-        return (torch.randn((8, 5), device=device),)
-
-
 class TestFSDPInitialization(FSDPTest):
     """Tests composable FSDP initialization."""
 
@@ -73,19 +38,25 @@ class TestFSDPInitialization(FSDPTest):
     def world_size(self) -> int:
         return 2
 
-    def _test_auto_wrap_policy(self, auto_wrap_policy):
-        """Tests passing an ``auto_wrap_policy``."""
+    @skip_if_lt_x_gpu(2)
+    def test_policy(self):
+        """Tests passing a ``policy`` for pseudo-auto-wrapping."""
+        self.run_subtests(
+            {"policy": [None, ModuleWrapPolicy({UnitModule})]},
+            self._test_policy,
+        )
 
-        local_model = Model(device=torch.device("cuda"))
+    def _test_policy(self, policy: Optional[_FSDPPolicy]):
+        local_model = CompositeParamModel(torch.device("cuda"))
         fsdp_wrapped_model = FSDP(
             copy.deepcopy(local_model),
-            auto_wrap_policy=auto_wrap_policy,
+            auto_wrap_policy=policy,
             use_orig_params=True,
         )
         composable_module = copy.deepcopy(local_model)
         fully_shard(
             composable_module,
-            policy=auto_wrap_policy,
+            policy=policy,
         )
 
         # Check that the composable module has the same names as the local
@@ -123,23 +94,18 @@ def _test_auto_wrap_policy(self, auto_wrap_policy):
             composable_module_classes.add(type(submodule))
         self.assertEqual(local_module_classes, composable_module_classes)
 
-    @skip_if_lt_x_gpu(2)
-    def test_auto_wrap_policy(self):
-        self.run_subtests(
-            {"auto_wrap_policy": [None, Model.policy()]},
-            self._test_auto_wrap_policy,
-        )
-
     @skip_if_lt_x_gpu(2)
     def test_device_id(self):
         """Tests passing a ``device_id``."""
         cpu_device = torch.device("cpu")
-        composable_module = Model(device=cpu_device)
+        composable_module = CompositeParamModel(device=cpu_device)
         for param in composable_module.parameters():
-            assert param.device == cpu_device
+            assert (
+                param.device == cpu_device
+            ), "Expects module to be initialized on CPU for this unit test"
         fully_shard(
             composable_module,
-            policy=Model.policy(),
+            policy=ModuleWrapPolicy({UnitModule}),
             device_id=self.rank,
         )
         for param in composable_module.parameters():
@@ -148,7 +114,7 @@ def test_device_id(self):
     @skip_if_lt_x_gpu(2)
     def test_sync_module_states(self):
         """Tests passing ``sync_module_states=True``."""
-        local_model = Model(device=torch.device("cuda"))
+        local_model = CompositeParamModel(device=torch.device("cuda"))
         composable_module = copy.deepcopy(local_model)
         # Check that the parameters are broadcast from rank 0 by comparing
         # against an equivalent FSDP-wrapped module
@@ -156,14 +122,15 @@ def test_sync_module_states(self):
             for param in composable_module.parameters():
                 with torch.no_grad():
                     param.zero_()
+        policy = ModuleWrapPolicy({UnitModule})
         fsdp_wrapped_model = FSDP(
             copy.deepcopy(local_model),
-            auto_wrap_policy=Model.policy(),
+            auto_wrap_policy=policy,
             use_orig_params=True,
         )
         fully_shard(
             composable_module,
-            policy=Model.policy(),
+            policy=policy,
             sync_module_states=True,
         )
         for (composable_param, fsdp_wrapped_param) in zip(
@@ -180,37 +147,54 @@ def _param_init_fn(module: nn.Module):
             """
             This is an example ``param_init_fn`` for composable FSDP.
 
-            TODO: This function is not satisfactory because this requires
-            guarding with ``_is_fsdp_flattened()``. This guard is needed to
-            avoid re-initializing parameters for nested cases since some
-            initialization methods strictly require non-1D shape (e.g.
-            ``kaiming_uniform_()``), while FSDP replaces the original
-            parameters with their 1D shards.
+            TODO: This function is not satisfactory because:
+            (1) This requires guarding with ``_is_fsdp_flattened()``. This
+            guard is needed to avoid re-initializing parameters for nested
+            cases since some initialization methods strictly require non-1D
+            shape (e.g. ``kaiming_uniform_()``), while FSDP replaces the
+            original parameters with their 1D shards.
+            (2) This requires module-by-module traversal and manual ``setattr``
+            usage as opposed to first calling ``module.to_empty()`` and then
+            initializing each parameter after. The latter will override the
+            initialization of already-initialized nested parameters. In other
+            words, this parameter initialization function must strictly modify
+            only the parameters on meta device.
             """
-            is_meta = any(param.is_meta for param in module.parameters())
-            if is_meta:
-                module.to_empty(device=torch.cuda.current_device())
             torch.manual_seed(0)
-            for param in module.parameters():
-                if not _is_fsdp_flattened(param):
-                    nn.init.uniform_(param)
-
-        composable_module = Model(device="meta")
+            for submodule in module.modules():
+                for param_name, param in submodule.named_parameters(recurse=False):
+                    if not _is_fsdp_flattened(param) and param.is_meta:
+                        materialized_param = nn.Parameter(
+                            torch.empty_like(param, device=torch.device("cuda"))
+                        )
+                        nn.init.uniform_(materialized_param)
+                        setattr(submodule, param_name, materialized_param)
+
+        composable_module = CompositeParamModel(device=torch.device("meta"))
+        meta_model = CompositeParamModel(device=torch.device("meta"))
         fsdp_wrapped_model = FSDP(
-            Model(device="meta"),
-            auto_wrap_policy=Model.policy(),
+            meta_model,
+            auto_wrap_policy=ModuleWrapPolicy({UnitModule}),
             param_init_fn=_param_init_fn,
             use_orig_params=True,
         )
         fully_shard(
             composable_module,
-            policy=Model.policy(),
+            policy=ModuleWrapPolicy({UnitModule}),
             param_init_fn=_param_init_fn,
         )
-        for (composable_param, fsdp_wrapped_param) in zip(
-            composable_module.parameters(),
-            fsdp_wrapped_model.parameters(),
+        for (
+            (composable_param_name, composable_param),
+            (fsdp_wrapped_param_name, fsdp_wrapped_param),
+        ) in zip(
+            composable_module.named_parameters(),
+            fsdp_wrapped_model.named_parameters(),
         ):
+            self.assertEqual(composable_param_name, fsdp_wrapped_param_name)
+            self.assertEqual(
+                composable_param.device,
+                torch.device("cuda", torch.cuda.current_device()),
+            )
             self.assertEqual(composable_param, fsdp_wrapped_param)
 
 
@@ -225,30 +209,30 @@ def world_size(self) -> int:
     def test_training(self):
         """Tests training (forward, backward, optimizer)."""
         device = torch.device("cuda")
-        local_model = Model(device=device)
+        local_model = CompositeParamModel(device=device)
         fsdp_wrapped_model = FSDP(
             copy.deepcopy(local_model),
-            auto_wrap_policy=Model.policy(),
+            auto_wrap_policy=ModuleWrapPolicy({UnitModule}),
             use_orig_params=True,
         )
         composable_module = copy.deepcopy(local_model)
         fully_shard(
             composable_module,
-            policy=Model.policy(),
+            policy=ModuleWrapPolicy({UnitModule}),
         )
         del local_model  # not needed anymore
         LR = 1e-2
         fsdp_wrapped_optim = torch.optim.Adam(fsdp_wrapped_model.parameters(), lr=LR)
         composable_optim = torch.optim.Adam(composable_module.parameters(), lr=LR)
         for _ in range(5):
-            inp = composable_module.get_input(device)
+            inp = torch.randn(2, 100, device="cuda")
             losses = []
             for model, optim in (
                 (fsdp_wrapped_model, fsdp_wrapped_optim),
                 (composable_module, composable_optim),
             ):
                 optim.zero_grad(set_to_none=True)
-                out = model(*inp)
+                out = model(inp)
                 loss = out.sum()
                 losses.append(loss)
                 loss.backward()
diff --git a/torch/testing/_internal/common_dist_composable.py b/torch/testing/_internal/common_dist_composable.py
index 96fcb7e0bff0c..42029f153eabe 100644
--- a/torch/testing/_internal/common_dist_composable.py
+++ b/torch/testing/_internal/common_dist_composable.py
@@ -5,54 +5,54 @@
 
 
 class UnitModule(nn.Module):
-    def __init__(self):
+    def __init__(self, device: torch.device):
         super().__init__()
-        self.l1 = nn.Linear(100, 100)
+        self.l1 = nn.Linear(100, 100, device=device)
         self.seq = nn.Sequential(
             nn.ReLU(),
-            nn.Linear(100, 100),
+            nn.Linear(100, 100, device=device),
             nn.ReLU(),
         )
-        self.l2 = nn.Linear(100, 100)
+        self.l2 = nn.Linear(100, 100, device=device)
 
     def forward(self, x):
         return self.l2(self.seq(self.l1(x)))
 
 
 class CompositeModel(nn.Module):
-    def __init__(self):
+    def __init__(self, device: torch.device):
         super().__init__()
-        self.l1 = nn.Linear(100, 100)
-        self.u1 = UnitModule()
-        self.u2 = UnitModule()
-        self.l2 = nn.Linear(100, 100)
+        self.l1 = nn.Linear(100, 100, device=device)
+        self.u1 = UnitModule(device)
+        self.u2 = UnitModule(device)
+        self.l2 = nn.Linear(100, 100, device=device)
 
     def forward(self, x):
         return self.l2(self.u2(self.u1(self.l1(x))))
 
 
 class UnitParamModule(nn.Module):
-    def __init__(self):
+    def __init__(self, device: torch.device):
         super().__init__()
-        self.l = nn.Linear(100, 100)
+        self.l = nn.Linear(100, 100, device=device)
         self.seq = nn.Sequential(
             nn.ReLU(),
-            nn.Linear(100, 100),
+            nn.Linear(100, 100, device=device),
             nn.ReLU(),
         )
-        self.p = nn.Parameter(torch.randn(100, 100))
+        self.p = nn.Parameter(torch.randn((100, 100), device=device))
 
     def forward(self, x):
         return torch.mm(self.seq(self.l(x)), self.p)
 
 
 class CompositeParamModel(nn.Module):
-    def __init__(self):
+    def __init__(self, device: torch.device):
         super().__init__()
-        self.l = nn.Linear(100, 100)
-        self.u1 = UnitModule()
-        self.u2 = UnitModule()
-        self.p = nn.Parameter(torch.randn(100, 100))
+        self.l = nn.Linear(100, 100, device=device)
+        self.u1 = UnitModule(device)
+        self.u2 = UnitModule(device)
+        self.p = nn.Parameter(torch.randn((100, 100), device=device))
 
     def forward(self, x):
         a = self.u2(self.u1(self.l(x)))

From 538e478972cb6c0b3736fa2d12e23b98e4f3125e Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 7 Dec 2022 15:40:01 +0000
Subject: [PATCH 1725/1922] [Composable API] Match `fully_shard()` comm.
 schedule with wrapper FSDP (#90387)

- This PR introduces a new concept, the _communication module_ (denoted `comm_module`), that represents the module responsible for the unshard/reshard pair for a `FlatParamHandle`. This is well-defined because the current design assumes that each `FlatParamHandle` only has _one_ unshard/reshard pair for either the forward or backward pass.
    - For the wrapper code path, the `comm_module` is exactly the module already being passed to the `FlatParamHandle` constructor.
    - For the composable code path, the `comm_module` is not necessarily the module already being passed to the `FlatParamHandle`. This is because the module already being passed is always the local FSDP root module to give complete FQNs, instead of local FQNs. Distinguishing the communication module from the local FSDP root module can provide more flexibility for non-recursive wrapping designs in the future.
- This PR adds a unit test `test_unshard_reshard_order` that explicitly checks that `_unshard` and `_reshard` are called in the exactly the same order across the two code paths.
- This PR does not fix `test_checkpoint_fsdp_submodules_use_reentrant`. However, the error message changes, so this PR accommodates that.
    - The error is now the same as if we used the equivalent wrapper FSDP:
    ```
    test_model.u1 = FSDP(test_model.u1, use_orig_params=True)
    test_model.u2 = FSDP(test_model.u2, use_orig_params=True)
    ```
    - The error is also the same as if we used wrapper FSDP with `use_orig_params=False`, so it is not unique to `use_orig_params=True`.

---

**`comm_module` Example**

```
model = Model(
    seq1: nn.Sequential(
        nn.Linear
        nn.ReLU
        nn.Linear
        nn.ReLU
    )
    seq2: nn.Sequential(
        nn.Linear
        nn.ReLU
        nn.Linear
        nn.ReLU
    )
)
policy = ModuleWrapPolicy({nn.Sequential})
fully_shard(model, policy=policy)
FullyShardedDataParallel(model, auto_wrap_policy=policy)
```
- This policy constructs two `FlatParamHandle`s, one for `seq1` and one for `seq2`.
- `FullyShardedDataParallel` will pass `seq1` and `seq2` as the `module` argument to the two `FlatParamHandle`s, respectively.
- `fully_shard()` will pass `model` as the `module` argument to every `FlatParamHandle`.
- `FullyShardedDataParallel` will pass `seq1` and `seq2` as the `comm_module` argument to the two `FlatParamHandle`s, respectively.
- `fully_shard()` will pass `seq1` and `seq2` as the `comm_module` argument to the two `FlatParamHandle`s, respectively.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90387
Approved by: https://github.com/mrshenli
---
 test/distributed/_composable/test_compose.py  |   7 +-
 .../_composable/test_fully_shard.py           | 183 +++++++++++++++++-
 .../fsdp/test_fsdp_flatten_params.py          |   3 +
 torch/distributed/fsdp/_init_utils.py         |  22 ++-
 torch/distributed/fsdp/_runtime_utils.py      |  11 +-
 torch/distributed/fsdp/flat_param.py          |  25 ++-
 6 files changed, 220 insertions(+), 31 deletions(-)

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
index 1e3eda8562ebb..6740f561eb51a 100644
--- a/test/distributed/_composable/test_compose.py
+++ b/test/distributed/_composable/test_compose.py
@@ -122,9 +122,12 @@ def _test_checkpoint_fsdp_submodules(self, use_reentrant):
 
     @skip_if_lt_x_gpu(2)
     def test_checkpoint_fsdp_submodules_use_reentrant(self):
+        # Escape the brackets like `\[` since `[` has special meaning in regex
         with self.assertRaisesRegex(
-            AssertionError,
-            "Expects `Tensor` to have been saved in forward",
+            RuntimeError,
+            r"setStorage: sizes \[100, 100\], strides \[100, 1\], storage "
+            "offset 0, and itemsize 4 requiring a storage size of 40000 are "
+            "out of bounds for storage of size 0",
         ):
             self._test_checkpoint_fsdp_submodules(True)
 
diff --git a/test/distributed/_composable/test_fully_shard.py b/test/distributed/_composable/test_fully_shard.py
index 232b1c6b08a9d..71903a2f66544 100644
--- a/test/distributed/_composable/test_fully_shard.py
+++ b/test/distributed/_composable/test_fully_shard.py
@@ -1,15 +1,22 @@
 # Owner(s): ["oncall: distributed"]
 
+import contextlib
 import copy
+import functools
 import sys
-from typing import Optional
+from typing import Callable, Iterable, List, Optional, Tuple
 
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed._composable import fully_shard
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp._common_utils import _is_fsdp_flattened
+from torch.distributed.fsdp._common_utils import (
+    _all_handles,
+    _FSDPState,
+    _is_fsdp_flattened,
+)
+from torch.distributed.fsdp.flat_param import _HandlesKey, FlatParamHandle
 from torch.distributed.fsdp.wrap import _FSDPPolicy, ModuleWrapPolicy
 from torch.testing._internal.common_dist_composable import (
     CompositeParamModel,
@@ -205,10 +212,9 @@ class TestFSDPRuntime(FSDPTest):
     def world_size(self) -> int:
         return 2
 
-    @skip_if_lt_x_gpu(2)
-    def test_training(self):
-        """Tests training (forward, backward, optimizer)."""
-        device = torch.device("cuda")
+    def _init_models_and_optims(
+        self, device: torch.device
+    ) -> Tuple[nn.Module, torch.optim.Optimizer, nn.Module, torch.optim.Optimizer]:
         local_model = CompositeParamModel(device=device)
         fsdp_wrapped_model = FSDP(
             copy.deepcopy(local_model),
@@ -220,13 +226,29 @@ def test_training(self):
             composable_module,
             policy=ModuleWrapPolicy({UnitModule}),
         )
-        del local_model  # not needed anymore
         LR = 1e-2
         fsdp_wrapped_optim = torch.optim.Adam(fsdp_wrapped_model.parameters(), lr=LR)
         composable_optim = torch.optim.Adam(composable_module.parameters(), lr=LR)
+        return (
+            composable_module,
+            composable_optim,
+            fsdp_wrapped_model,
+            fsdp_wrapped_optim,
+        )
+
+    @skip_if_lt_x_gpu(2)
+    def test_training(self):
+        """Tests training (forward, backward, optimizer)."""
+        device = torch.device("cuda")
+        (
+            composable_module,
+            composable_optim,
+            fsdp_wrapped_model,
+            fsdp_wrapped_optim,
+        ) = self._init_models_and_optims(device)
         for _ in range(5):
             inp = torch.randn(2, 100, device="cuda")
-            losses = []
+            losses: List[torch.Tensor] = []
             for model, optim in (
                 (fsdp_wrapped_model, fsdp_wrapped_optim),
                 (composable_module, composable_optim),
@@ -239,6 +261,151 @@ def test_training(self):
                 optim.step()
             self.assertEqual(losses[0], losses[1])
 
+    @skip_if_lt_x_gpu(2)
+    def test_unshard_reshard_order(self):
+        """
+        Tests that the unshard/reshard order matches between ``fully_shard``
+        and ``FullyShardedDataParallel`` for the same policy.
+
+        NOTE: We use FQNs as the proxy for checking the order across the two
+        versions. See ``_check_same_param_handles()`` for details.
+        """
+        device = torch.device("cuda")
+        (
+            composable_module,
+            composable_optim,
+            fsdp_wrapped_model,
+            fsdp_wrapped_optim,
+        ) = self._init_models_and_optims(device)
+        # Before checking the unshard/reshard order, sanity check that the
+        # assumption about wrapper FQN being a suffix of composable FQN holds
+        all_composable_handles = _all_handles(fully_shard.state(composable_module))
+        all_wrapped_handles = _all_handles(fsdp_wrapped_model)
+        self._check_same_param_handles(all_composable_handles, all_wrapped_handles)
+        num_handles = len(all_composable_handles)
+
+        orig_unshard = torch.distributed.fsdp._runtime_utils._unshard
+        orig_reshard = torch.distributed.fsdp._runtime_utils._reshard
+        UnshardReshardEvent = Tuple[str, _HandlesKey]
+
+        def patched_unshard(
+            unshard_reshard_order: List[UnshardReshardEvent],
+            state: _FSDPState,
+            handles: List[FlatParamHandle],
+            *args,
+            **kwargs,
+        ):
+            handles_key = tuple(handles)
+            unshard_reshard_order.append(("unshard", handles_key))
+            return orig_unshard(state, handles, *args, **kwargs)
+
+        def patched_reshard(
+            unshard_reshard_order: List[UnshardReshardEvent],
+            state: _FSDPState,
+            handles: List[FlatParamHandle],
+            *args,
+            **kwargs,
+        ):
+            handles_key = tuple(handles)
+            unshard_reshard_order.append(("reshard", handles_key))
+            return orig_reshard(state, handles, *args, **kwargs)
+
+        @contextlib.contextmanager
+        def patch_unshard(_patched_unshard: Callable):
+            _orig_unshard = torch.distributed.fsdp._runtime_utils._unshard
+            torch.distributed.fsdp._runtime_utils._unshard = _patched_unshard
+            try:
+                yield
+            finally:
+                torch.distributed.fsdp._runtime_utils._unshard = _orig_unshard
+
+        @contextlib.contextmanager
+        def patch_reshard(_patched_reshard: Callable):
+            _orig_reshard = torch.distributed.fsdp._runtime_utils._reshard
+            torch.distributed.fsdp._runtime_utils._reshard = _patched_reshard
+            try:
+                yield
+            finally:
+                torch.distributed.fsdp._runtime_utils._unshard = _orig_reshard
+
+        composable_order: List[UnshardReshardEvent] = []
+        wrapped_order: List[UnshardReshardEvent] = []
+
+        inp = torch.randn(2, 100, device="cuda")
+        losses: List[torch.Tensor] = []
+
+        for order, model, optim in (
+            (composable_order, composable_module, composable_optim),
+            (wrapped_order, fsdp_wrapped_model, fsdp_wrapped_optim),
+        ):
+            with patch_unshard(
+                functools.partial(patched_unshard, order)
+            ), patch_reshard(functools.partial(patched_reshard, order)):
+                optim.zero_grad(set_to_none=True)
+                out = model(inp)
+                loss = out.sum()
+                losses.append(loss)
+                loss.backward()
+                optim.step()
+        self.assertEqual(losses[0], losses[1])
+
+        # Sanity check that the unshard/reshard events were recorded, where we
+        # expect one unshard/reshard pair for forward, one pair for backward,
+        # and possibly some extra unshards from backward prefetching (in this
+        # case, we expect exactly 2 extra since there are 3 handles)
+        self.assertGreaterEqual(len(composable_order), 2 * 2 * num_handles)
+        self.assertGreaterEqual(len(wrapped_order), 2 * 2 * num_handles)
+        self.assertGreaterEqual(
+            len([e for e in composable_order if e[0] == "unshard"]), 2 * num_handles
+        )
+        self.assertGreaterEqual(
+            len([e for e in wrapped_order if e[0] == "unshard"]), 2 * num_handles
+        )
+        self.assertGreaterEqual(
+            len([e for e in composable_order if e[0] == "reshard"]), 2 * num_handles
+        )
+        self.assertGreaterEqual(
+            len([e for e in wrapped_order if e[0] == "reshard"]), 2 * num_handles
+        )
+
+        # Check that the unshard/reshard order matches
+        self.assertEqual(len(composable_order), len(wrapped_order))
+        for (
+            (composable_event, composable_handles_key),
+            (wrapped_event, wrapped_handles_key),
+        ) in zip(composable_order, wrapped_order):
+            self.assertEqual(composable_event, wrapped_event)
+            self._check_same_param_handles(composable_handles_key, wrapped_handles_key)
+
+    def _check_same_param_handles(
+        self,
+        composable_handles: Iterable[FlatParamHandle],
+        wrapped_handles: Iterable[FlatParamHandle],
+    ) -> None:
+        """
+        Checks that ``composable_handles`` matches ``wrapped_handles`` by
+        checking FQNs.
+
+        For ``fully_shard``, each ``FlatParamHandle`` 's saved FQNs are
+        prefixed from the local FSDP root, while for wrapper FSDP, they are
+        prefixed from its owning FSDP instance, which may not be the local FSDP
+        root. Thus, we relax the check to only that the wrapper FQN is a suffix
+        of the composable FQN.
+
+        If this check passes for the entire model and we separately unit-test
+        parity for wrapping policies, then we can be sure that the handles
+        actually match.
+        """
+        self.assertEqual(len(composable_handles), len(wrapped_handles))
+        for composable_handle, wrapped_handle in zip(
+            composable_handles, wrapped_handles
+        ):
+            composable_fqns = composable_handle.flat_param._fqns
+            wrapped_fqns = wrapped_handle.flat_param._fqns
+            self.assertEqual(len(composable_fqns), len(wrapped_fqns))
+            for composable_fqn, wrapped_fqn in zip(composable_fqns, wrapped_fqns):
+                self.assertTrue(composable_fqn.endswith(wrapped_fqn))
+
 
 if __name__ == "__main__":
     run_tests()
diff --git a/test/distributed/fsdp/test_fsdp_flatten_params.py b/test/distributed/fsdp/test_fsdp_flatten_params.py
index b1ae0938cc9f3..5b60ed9820617 100644
--- a/test/distributed/fsdp/test_fsdp_flatten_params.py
+++ b/test/distributed/fsdp/test_fsdp_flatten_params.py
@@ -147,6 +147,7 @@ def _test_flatten_nothing(self, half: bool):
             FlatParamHandle(
                 [],
                 module,
+                module,
                 torch.device("cuda"),
                 self._get_default_config(),
                 self.process_group,
@@ -219,6 +220,7 @@ def _test_numel(self, module):
         flat_param_handle = FlatParamHandle(
             params_to_flatten,
             module,
+            module,
             torch.device("cuda"),
             self._get_default_config(),
             self.process_group,
@@ -320,6 +322,7 @@ def test_flat_param_shard_metadata(self):
         flat_param_handle = FlatParamHandle(
             params_to_flatten,
             module,
+            module,
             torch.device("cuda"),
             self._get_default_config(),
             self.process_group,
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index 7712015b93af4..d0718a70b9950 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -165,12 +165,12 @@ def _init_core_state(
     ] = collections.defaultdict(list)
     state._module_to_handles = _module_to_handles
     # Same as `_module_to_handle` but filtered to only include keys that are
-    # root modules with respect to the `FlatParamHandle` (see `_root_modules`
-    # in `FlatParameter`)
-    _root_module_to_handles: Dict[
+    # "communication modules", which are responsible for the unshard/reshard
+    # for their `FlatParamHandle`s
+    _comm_module_to_handles: Dict[
         nn.Module, List[FlatParamHandle]
     ] = collections.defaultdict(list)
-    state._root_module_to_handles = _root_module_to_handles
+    state._comm_module_to_handles = _comm_module_to_handles
     # Invariant: `state.params` contains exactly the `FlatParameter`s of the
     # handles in `state._handles`
     _handles: List[FlatParamHandle] = []
@@ -266,7 +266,7 @@ def _init_param_handle_from_module(
         _sync_module_params_and_buffers(
             root_module, managed_params, state.process_group
         )
-    _init_param_handle_from_params(state, managed_params, root_module)
+    _init_param_handle_from_params(state, managed_params, root_module, root_module)
     return state
 
 
@@ -323,7 +323,7 @@ def _init_param_handles_from_module(
             _sync_module_states(params, buffers, state.process_group)
         # Pass `root_module` to have internal FQN metadata prefix starting from
         # it instead of `submodule`
-        _init_param_handle_from_params(state, params, root_module)
+        _init_param_handle_from_params(state, params, root_module, submodule)
     # Reverse to preserve top-down order like `_fsdp_handles()`
     state._handles.reverse()
     return state
@@ -334,6 +334,7 @@ def _init_param_handle_from_params(
     state: _FSDPState,
     params: List[nn.Parameter],
     root_module: nn.Module,
+    comm_module: nn.Module,
 ):
     if len(params) == 0:
         return
@@ -347,6 +348,7 @@ def _init_param_handle_from_params(
     handle = FlatParamHandle(
         params,
         root_module,
+        comm_module,
         state.compute_device,
         handle_config,
         state.process_group,
@@ -359,8 +361,12 @@ def _init_param_handle_from_params(
     state._handles.append(handle)
     for module in handle.flat_param._modules:
         state._module_to_handles[module].append(handle)
-    for module in handle.flat_param._root_modules:
-        state._root_module_to_handles[module].append(handle)
+    state._comm_module_to_handles[handle._comm_module].append(handle)
+    num_comm_module_handles = len(state._comm_module_to_handles[handle._comm_module])
+    assert num_comm_module_handles == 1, (
+        "The current design assumes a module manages at most one "
+        f"`FlatParamHandle` but got {num_comm_module_handles}"
+    )
     cpu_device = torch.device("cpu")
     if state.cpu_offload.offload_params and handle.flat_param.device != cpu_device:
         handle.flat_param_to(cpu_device)
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index abbcf2b00d5e4..495d13e12b953 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -747,7 +747,8 @@ def _catch_all_reshard(
                 continue
             free_unsharded_flat_params.append(_should_free_in_backward(state, handle))
             handles_to_reshard.append(handle)
-        _reshard(state, handles_to_reshard, free_unsharded_flat_params)
+        if handles_to_reshard:
+            _reshard(state, handles_to_reshard, free_unsharded_flat_params)
     except Exception as e:
         p_assert(
             False,
@@ -888,7 +889,9 @@ def _register_pre_forward_hooks(
         forward_handle.remove()
     state._pre_forward_handles.clear()
     for module in modules:
-        module_param_handles = state._root_module_to_handles[module]
+        if module not in state._comm_module_to_handles:
+            continue
+        module_param_handles = state._comm_module_to_handles[module]
         if module_param_handles:
             unshard_fn = functools.partial(
                 _pre_forward_unshard,
@@ -918,7 +921,9 @@ def _register_post_forward_hooks(
         forward_handle.remove()
     state._post_forward_handles.clear()
     for module in modules:
-        module_param_handles = state._root_module_to_handles[module]
+        if module not in state._comm_module_to_handles:
+            continue
+        module_param_handles = state._comm_module_to_handles[module]
         if module_param_handles:
             reshard_fn = functools.partial(
                 _post_forward_reshard,
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 847440c88f67d..0025a200dffa0 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -24,7 +24,6 @@
 import torch.nn.functional as F
 from torch import Tensor
 from torch.distributed.fsdp._common_utils import (
-    _get_root_modules,
     _set_fsdp_flattened,
     HandleTrainingState,
 )
@@ -167,11 +166,6 @@ class FlatParameter(nn.Parameter):
             depend on its existence in the future.
         _modules (Set[nn.Module]): Modules that contain some original parameter
             that is flattened into the ``FlatParameter``.
-        _root_modules (Set[nn.Module]): Modules in ``self._modules`` that are
-            root modules (i.e. parent-less) with respect to ``self._modules``.
-            These are the modules for which we register pre/post-forward hooks
-            in the composable code path. There will be one unshard/reshard pair
-            for each root module in this set.
 
         _shard_param_offsets (List[Tuple[int, int])): [start, end] offsets (in
             units of numel) giving this rank's part of each flattened original
@@ -271,7 +265,6 @@ def _init_metadata(
         self._modules = set(pi.module for pi in self._param_infos).union(
             set(spi.module for spi in self._shared_param_infos)
         )
-        self._root_modules = _get_root_modules(self._modules)
         assert (params is None) == (shared_params is None)
         if params is not None:
             assert shared_params is not None and len(shared_params) == len(
@@ -310,9 +303,15 @@ class FlatParamHandle:
         params (Sequence[nn.Parameter]): The parameters to use for the
             flattened parameter.
         module (nn.Module): A module that is the root of the subtree containing
-            all parameters in ``params``; for non-recursive wrapping, this must
-            be the top-level module, while for recursive wrapping, this may not
-            necessarily be the top-level module.
+            all parameters in ``params``. For the non-module-wrapper code path,
+            this should be the local FSDP root module, while for the
+            module-wrapper code path, this may not necessarily be the local
+            FSDP root module (i.e. when there is nested wrapping).
+        comm_module (nn.Module): The module responsible for the unshard/reshard
+            pair for this handle. For the non-module-wrapper code path, this
+            is what would have been ``module`` in the module-wrapper equivalent
+            wrapping, which may not be the local FSDP root module. For the
+            module-wrapper code path, this is always the same as ``module``.
         device (torch.device): The compute and communication device, which
             should be a non-CPU device. We refer to it as the compute device.
         config (HandleConfig): A config customizing the handle based on FSDP's
@@ -323,6 +322,10 @@ class FlatParamHandle:
             :class:`FlatParameter`). If ``False``, then FSDP reconstructs the
             parameter every iteration and returns the :class:`FlatParameter` s
             from ``named_parameters()``.
+
+    NOTE: We enforce that there is a single "communication module" that is
+    responsible for the unshard/reshard pair for this handle. This invariant
+    holds for both the module-wrapper and non-module-wrapper code paths.
     """
 
     ##################
@@ -332,6 +335,7 @@ def __init__(
         self,
         params: Sequence[nn.Parameter],
         module: nn.Module,
+        comm_module: nn.Module,
         device: torch.device,
         config: HandleConfig,
         process_group: dist.ProcessGroup,
@@ -346,6 +350,7 @@ def __init__(
         self._use_orig_params = use_orig_params
         self._training_state = HandleTrainingState.IDLE
         self._debug_level = dist.get_debug_level()
+        self._comm_module = comm_module
         self._init_flat_param(params, module, use_orig_params)
         self._use_unsharded_views(as_params=False)
 

From 676c1ad02f3254eae07b027e5421d27a34d46d56 Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Wed, 7 Dec 2022 19:08:11 +0000
Subject: [PATCH 1726/1922] [Composable API][Easy] Use `policy=None` since that
 is supported (#90400)

I believe that @mrshenli used `ModuleWrapPolicy({UnitModule})` when applying `fully_shard` to `UnitModule`s because `policy=None` was not supported. However, he added that support in a previous PR, so this PR simplifies to using `policy=None` to make the intention more clear.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90400
Approved by: https://github.com/mrshenli
---
 test/distributed/_composable/test_compose.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
index 6740f561eb51a..98679f89da23f 100644
--- a/test/distributed/_composable/test_compose.py
+++ b/test/distributed/_composable/test_compose.py
@@ -98,14 +98,8 @@ def _test_checkpoint_fsdp_submodules(self, use_reentrant):
         base_model = copy.deepcopy(model)
 
         test_model = copy.deepcopy(model)
-        test_model.u1 = fully_shard(
-            test_model.u1,
-            policy=ModuleWrapPolicy({UnitModule}),
-        )
-        test_model.u2 = fully_shard(
-            test_model.u2,
-            policy=ModuleWrapPolicy({UnitModule}),
-        )
+        test_model.u1 = fully_shard(test_model.u1, policy=None)
+        test_model.u2 = fully_shard(test_model.u2, policy=None)
 
         test_model.u1.seq = checkpoint(test_model.u1.seq, use_reentrant=use_reentrant)
         test_model.u2.seq = checkpoint(test_model.u2.seq, use_reentrant=use_reentrant)

From 6df2e476ebf4144ce41051d2f34af9aa9ba5b29c Mon Sep 17 00:00:00 2001
From: Peter Bell <peterbell10@live.co.uk>
Date: Wed, 7 Dec 2022 17:52:13 +0000
Subject: [PATCH 1727/1922] Implement correction argument in
 torch.masked.{std,var} (#87118)

This makes the signature of `torch.masked.std` and `var` more consistent with the global namespace variant and also updates the sample inputs to repurpose the existing `sample_inputs_std_var` inputs which fully exercise the `correction` argument.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87118
Approved by: https://github.com/cpuhrsch
---
 test/inductor/test_torchinductor_opinfo.py    |   1 +
 torch/masked/_ops.py                          |  32 +++--
 .../_internal/opinfo/definitions/_masked.py   | 133 ++++++++++++------
 torch/testing/_internal/opinfo/utils.py       |   5 -
 4 files changed, 115 insertions(+), 56 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 915771362fcba..1fff48673afd4 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -198,6 +198,7 @@ def process(device_type):
     "linalg.pinv.singular": {f32, f64},
     "masked.norm": {f16},
     "masked.normalize": {f16},
+    "masked.var": {f16},
     "masked_fill": {f16},
     "masked_scatter": {f16, f32, f64},
     "masked_select": {b8, f16, f32, f64, i32, i64},
diff --git a/torch/masked/_ops.py b/torch/masked/_ops.py
index a933f06024aa2..c69d8db00c167 100644
--- a/torch/masked/_ops.py
+++ b/torch/masked/_ops.py
@@ -1538,14 +1538,22 @@ def norm(
 
 def _std_var(
     input: Union[Tensor, MaskedTensor],
-    dim: DimOrDims = None,
-    unbiased: Optional[bool] = False,
+    dim: DimOrDims,
+    unbiased: Optional[bool],
     *,
-    keepdim: Optional[bool] = False,
-    dtype: Optional[DType] = None,
-    mask: Optional[Tensor] = None,
-    take_sqrt: Optional[bool] = False,
+    correction: Optional[int],
+    keepdim: Optional[bool],
+    dtype: Optional[DType],
+    mask: Optional[Tensor],
+    take_sqrt: Optional[bool],
 ) -> Tensor:
+    assert (unbiased is None or correction is None), "Only one of unbiased and correction may be given"
+    correction_int = 1
+    if unbiased is not None:
+        correction_int = 1 if unbiased else 0
+    if correction is not None:
+        correction_int = correction
+
     if dtype is None:
         dtype = input.dtype
         if not (dtype.is_floating_point or dtype.is_complex):
@@ -1584,8 +1592,8 @@ def _std_var(
             )
         if not keepdim:
             count = count.reshape(total.shape)
-        if unbiased:
-            count = torch.subtract(count, 1)
+        if correction_int != 0:
+            count = torch.subtract(count, correction_int)
             count = torch.maximum(count, count.new_zeros([]))
         output = torch.divide(total, count).to(dtype=dtype)
         if take_sqrt:
@@ -1601,8 +1609,9 @@ def _std_var(
 def var(
     input: Union[Tensor, MaskedTensor],
     dim: DimOrDims = None,
-    unbiased: Optional[bool] = False,
+    unbiased: Optional[bool] = None,
     *,
+    correction: Optional[int] = None,
     keepdim: Optional[bool] = False,
     dtype: Optional[DType] = None,
     mask: Optional[Tensor] = None,
@@ -1619,6 +1628,7 @@ def var(
         input=input,
         dim=dim,
         unbiased=unbiased,
+        correction=correction,
         keepdim=keepdim,
         dtype=dtype,
         mask=mask,
@@ -1630,8 +1640,9 @@ def var(
 def std(
     input: Union[Tensor, MaskedTensor],
     dim: DimOrDims = None,
-    unbiased: Optional[bool] = False,
+    unbiased: Optional[bool] = None,
     *,
+    correction: Optional[int] = None,
     keepdim: Optional[bool] = False,
     dtype: Optional[DType] = None,
     mask: Optional[Tensor] = None,
@@ -1648,6 +1659,7 @@ def std(
         input=input,
         dim=dim,
         unbiased=unbiased,
+        correction=correction,
         keepdim=keepdim,
         dtype=dtype,
         mask=mask,
diff --git a/torch/testing/_internal/opinfo/definitions/_masked.py b/torch/testing/_internal/opinfo/definitions/_masked.py
index 20025b98e3c48..57a413bf8c53d 100644
--- a/torch/testing/_internal/opinfo/definitions/_masked.py
+++ b/torch/testing/_internal/opinfo/definitions/_masked.py
@@ -1,4 +1,5 @@
 import unittest
+from collections.abc import Sequence
 from functools import partial
 from typing import List
 
@@ -223,51 +224,101 @@ def sample_inputs_masked_norm(op_info, device, dtype, requires_grad, **kwargs):
             )
 
 
+def reference_masked_std_var(
+    numpy_fn,
+):
+    ref = reference_reduction_numpy(numpy_fn)
+
+    # Translate unbiased or correction arguments into ddof
+    def func(
+        input,
+        dim=None,
+        unbiased=None,
+        *,
+        correction=None,
+        **kwargs,
+    ):
+        ddof = 1
+        if unbiased is not None:
+            ddof = 1 if unbiased else 0
+        if correction is not None:
+            ddof = correction
+
+        if isinstance(dim, Sequence):
+            dim = tuple(dim)
+
+        return ref(input, dim, ddof=ddof, **kwargs)
+
+    return func
+
+
 def sample_inputs_masked_std_var(op_info, device, dtype, requires_grad, **kwargs):
     """Sample inputs for masked std/var."""
-    for unbiased in [False, True]:
-        for sample_input in sample_inputs_masked_reduction(
+    kwargs["supports_multiple_dims"] = op_info.supports_multiple_dims
+    from torch.testing._internal.common_methods_invocations import sample_inputs_std_var
+
+    def masked_samples():
+        for sample_input in sample_inputs_std_var(
             op_info, device, dtype, requires_grad, **kwargs
         ):
-            if sample_input.args:
-                dim = sample_input.args[0]
-                sample_input_args = (
-                    sample_input.args[:1] + (unbiased,) + sample_input.args[1:]
+            if len(sample_input.args) and isinstance(sample_input.args[0], bool):
+                continue  # masked.{std, var} doesn't support `.var(unbiased)`
+
+            for mask in _generate_masked_op_mask(
+                sample_input.input.shape, device, **kwargs
+            ):
+                sample_input_args, sample_input_kwargs = sample_input.args, dict(
+                    mask=mask, **sample_input.kwargs
                 )
-                sample_input_kwargs = sample_input.kwargs.copy()
-            else:
-                dim = sample_input.kwargs.get("dim")
-                sample_input_args = sample_input.args
-                sample_input_kwargs = dict(sample_input.kwargs, unbiased=unbiased)
-            if requires_grad:
-                if sample_input_kwargs.get("mask") is None:
-                    orig_count = torch.masked.sum(
-                        torch.ones(sample_input.input.shape, dtype=torch.int64),
-                        dim,
-                        keepdim=True,
-                    )
-                else:
-                    inmask = torch.masked._input_mask(
-                        sample_input.input, *sample_input_args, **sample_input_kwargs
-                    )
-                    orig_count = torch.masked.sum(
-                        inmask.new_ones(sample_input.input.shape, dtype=torch.int64),
-                        dim,
-                        keepdim=True,
-                        mask=inmask,
-                    )
-                if orig_count.min() <= int(unbiased) + 1:
-                    # Skip samples that lead to singularities in var
-                    # computation resulting nan values both in var and
-                    # autograd output that test_grad_fn cannot handle
-                    # correctly. Also, skip samples when the autograd output
-                    # for std could not be handled correctly due to torch.sqrt
-                    continue
-            yield SampleInput(
-                sample_input.input.detach().requires_grad_(requires_grad),
-                args=sample_input_args,
-                kwargs=sample_input_kwargs,
+                yield SampleInput(
+                    sample_input.input.detach().requires_grad_(requires_grad),
+                    args=sample_input_args,
+                    kwargs=sample_input_kwargs,
+                )
+                if (
+                    not requires_grad
+                    and dtype.is_floating_point
+                    and sample_input.input.ndim == 2
+                    and mask is not None
+                    and mask.shape == sample_input.input.shape
+                ):
+                    for v in [torch.inf, -torch.inf, torch.nan]:
+                        t = sample_input.input.detach()
+                        t.diagonal(0, -2, -1).fill_(v)
+                        yield SampleInput(
+                            t.requires_grad_(requires_grad),
+                            args=sample_input_args,
+                            kwargs=sample_input_kwargs,
+                        )
+
+    for sample_input in masked_samples():
+        correction = sample_input.kwargs.get("correction")
+        if correction is None:
+            correction = int(sample_input.kwargs.get("unbiased", True))
+
+        dim = sample_input.kwargs.get("dim", None)
+
+        if sample_input.kwargs.get("mask") is None:
+            orig_count = torch.masked.sum(
+                torch.ones(sample_input.input.shape, dtype=torch.int64),
+                dim,
+                keepdim=True,
+            )
+        else:
+            inmask = torch.masked._input_mask(
+                sample_input.input, *sample_input.args, **sample_input.kwargs
             )
+            orig_count = torch.masked.sum(
+                inmask.new_ones(sample_input.input.shape, dtype=torch.int64),
+                dim,
+                keepdim=True,
+                mask=inmask,
+            )
+        if orig_count.min() <= correction + 1:
+            # Skip samples that lead to nans in var computation
+            continue
+
+        yield sample_input
 
 
 def sample_inputs_masked_softmax(
@@ -860,7 +911,7 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
     ),
     ReductionOpInfo(
         "masked.var",
-        ref=reference_reduction_numpy(np.var)
+        ref=reference_masked_std_var(np.var)
         if np.lib.NumpyVersion(np.__version__) >= "1.20.2"
         else None,
         method_variant=None,
@@ -938,7 +989,7 @@ def sample_inputs_masked_normalize(op_info, device, dtype, requires_grad, **kwar
     ),
     ReductionOpInfo(
         "masked.std",
-        ref=reference_reduction_numpy(np.std)
+        ref=reference_masked_std_var(np.std)
         if np.lib.NumpyVersion(np.__version__) >= "1.20.2"
         else None,
         method_variant=None,
diff --git a/torch/testing/_internal/opinfo/utils.py b/torch/testing/_internal/opinfo/utils.py
index 0bbba7c769d84..017f26f7478c6 100644
--- a/torch/testing/_internal/opinfo/utils.py
+++ b/torch/testing/_internal/opinfo/utils.py
@@ -243,11 +243,6 @@ def wrapper(x: np.ndarray, *args, **kwargs):
                     identity = identity.cpu()
                 kwargs["initial"] = identity.numpy()
 
-        if "unbiased" in keys:
-            unbiased = kwargs.pop("unbiased")
-            if unbiased is not None:
-                kwargs["ddof"] = int(unbiased)
-
         result = f(x, *args, **kwargs)
 
         # Unsqueeze reduced dimensions if NumPy does not support keepdims

From eb18d903495b20bf22ab9a6e0543e599156d9f82 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Thu, 8 Dec 2022 08:12:28 +0000
Subject: [PATCH 1728/1922] Hybrid Sharded Data Parallel (#89915)

Adds 2 new hybrid sharding strategy to FSDP:
1. HYBRID_SHARD: applies zero-3 style sharding within a node, and data parallel across
2. HYBRID_SHARD_ZERO2: applies zero-2 style sharding within a node, and data parallel across

These are useful for medium sized models and aim to decrease communication volume, tests and benchmarks will be run to understand which workloads are optimal under which sharding strategy.

Hybrid sharding in general works by sharding the model using a process group within a single node, and creating intra-node process groups for replication / data parallelism. The user either needs to pass in a tuple of these process groups, or None, and we generate the process groups appropriately.

** Acknowledgements **
- @awgu 's excellent prototype: https://github.com/awgu/pytorch/commit/5ad3a16d486484c9ab4445126e50655eb19d62ca
- @liangluofb For ideation, feedback, and initial implementation and experimentation
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89915
Approved by: https://github.com/awgu
---
 .../fsdp/test_fsdp_hybrid_shard.py            | 243 ++++++++++++++++++
 torch/distributed/_composable/fully_shard.py  |   2 +-
 torch/distributed/distributed_c10d.py         |   2 +-
 torch/distributed/fsdp/_init_utils.py         | 135 +++++++++-
 torch/distributed/fsdp/_runtime_utils.py      |  72 +++++-
 torch/distributed/fsdp/api.py                 |  11 +-
 torch/distributed/fsdp/flat_param.py          |   2 +
 .../fsdp/fully_sharded_data_parallel.py       |  26 +-
 torch/testing/_internal/common_fsdp.py        |  28 +-
 .../_internal/distributed/distributed_test.py |   2 +-
 10 files changed, 504 insertions(+), 19 deletions(-)
 create mode 100644 test/distributed/fsdp/test_fsdp_hybrid_shard.py

diff --git a/test/distributed/fsdp/test_fsdp_hybrid_shard.py b/test/distributed/fsdp/test_fsdp_hybrid_shard.py
new file mode 100644
index 0000000000000..cda15ef21d792
--- /dev/null
+++ b/test/distributed/fsdp/test_fsdp_hybrid_shard.py
@@ -0,0 +1,243 @@
+# Owner(s): ["oncall: distributed"]
+
+import contextlib
+from functools import partial
+from collections import Counter
+import sys
+
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+
+from torch.distributed.distributed_c10d import _rank_not_in_group
+from torch.distributed.fsdp import (
+    FullyShardedDataParallel as FSDP,
+    ShardingStrategy,
+)
+from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+from torch.nn import TransformerDecoderLayer, TransformerEncoderLayer
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_fsdp import (
+    CUDAInitMode,
+    FSDPInitMode,
+    FSDPTest,
+    TransformerWithSharedParams,
+)
+from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
+    run_tests,
+    TEST_WITH_DEV_DBG_ASAN,
+)
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+if TEST_WITH_DEV_DBG_ASAN:
+    print(
+        "Skip dev-asan as torch + multiprocessing spawn have known issues",
+        file=sys.stderr,
+    )
+    sys.exit(0)
+
+
+@contextlib.contextmanager
+def patch_allreduce(new_allreduce):
+    """
+    Patches dist.all_reduce with a new all_reduce and
+    restores upon exiting.
+    """
+    orig_ar = dist.all_reduce
+    dist.all_reduce = new_allreduce
+    try:
+        yield
+    finally:
+        dist.all_reduce = orig_ar
+
+@contextlib.contextmanager
+def patch_reduce_scatter(new_reduce_scatter):
+    """
+    Patches dist.reduce_scatter_tensor with a new reduce_scatter_tensor and
+    restores upon exiting.
+    """
+    orig_reduce_scatter = dist.reduce_scatter_tensor
+    dist.reduce_scatter_tensor = new_reduce_scatter
+    try:
+        yield
+    finally:
+        dist.reduce_scatter_tensor = orig_reduce_scatter
+
+
+class MyModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.lin1 = nn.Linear(10, 10)
+        self.lin2 = nn.Linear(10, 10)
+        self.lin3 = nn.Linear(10, 10)
+
+class TestFSDPHybridShard(FSDPTest):
+    @property
+    def world_size(self):
+        return max(torch.cuda.device_count(), 2)
+
+    @property
+    def process_group(self):
+        return dist.distributed_c10d._get_default_group()
+
+    @skip_if_lt_x_gpu(2)
+    def test_raises_manual_wrap_hybrid_shard_when_none_policy(self):
+        model = MyModel().cuda()
+        err_ctx = self.assertRaisesRegex(
+            ValueError, "requires explicit specification of process group"
+        )
+
+        with err_ctx:
+            model = FSDP(model, sharding_strategy=ShardingStrategy.HYBRID_SHARD)
+
+        with err_ctx:
+            model = FSDP(model, sharding_strategy=ShardingStrategy._HYBRID_SHARD_ZERO2)
+
+
+    @skip_if_lt_x_gpu(2)
+    def test_hybrid_shard_strategy_mismatch_raises(self):
+        for sharding_strategy in [
+            ShardingStrategy._HYBRID_SHARD_ZERO2,
+            ShardingStrategy.HYBRID_SHARD
+        ]:
+            with self.subTest(sharding_strategy=sharding_strategy):
+                model = MyModel().cuda()
+                intra_pg = self.process_group
+                inter_pg = dist.new_group(ranks=[self.rank])
+                model.lin1 = FSDP(model.lin1, process_group=(intra_pg, inter_pg), sharding_strategy=sharding_strategy)
+                self.assertEqual(model.lin1.process_group, intra_pg)
+                self.assertEqual(model.lin1._inter_node_pg, inter_pg)
+                model = FSDP(model, process_group=intra_pg)
+                inp = torch.randn(4, 10)
+                # Errors during _lazy_init
+                with self.assertRaisesRegex(ValueError, "expect sharding strategies to be the same"):
+                    model(inp)
+
+    @skip_if_lt_x_gpu(2)
+    def test_hybrid_shard_pg_mismatch_raises(self):
+        model = MyModel().cuda()
+        intra_pg = self.process_group
+        inter_pg = dist.new_group(ranks=[self.rank])
+        # Mismatched process groups for intra-node
+        model.lin1 = FSDP(
+            model.lin1, process_group=(intra_pg, inter_pg), sharding_strategy=ShardingStrategy.HYBRID_SHARD
+        )
+        model = FSDP(
+            model, process_group=(dist.new_group(), dist.new_group()), sharding_strategy=ShardingStrategy.HYBRID_SHARD
+        )
+        # Errors during _lazy_init
+        inp = torch.randn(4, 10)
+        with self.assertRaisesRegex(ValueError, "intra-node process groups do not match"):
+            model(inp)
+
+        # Mismatched process groups for inter-node
+        model = MyModel().cuda()
+        model.lin1 = FSDP(
+            model.lin1, process_group=(intra_pg, inter_pg), sharding_strategy=ShardingStrategy.HYBRID_SHARD
+        )
+        model = FSDP(
+            model, process_group=(intra_pg, dist.new_group()), sharding_strategy=ShardingStrategy.HYBRID_SHARD
+        )
+        with self.assertRaisesRegex(ValueError, "inter-node process groups do not match"):
+            model(inp)
+
+    @skip_if_lt_x_gpu(2)
+    def test_invalid_pg_specification_raises(self):
+        pol = ModuleWrapPolicy({nn.Linear})
+        model = MyModel().cuda()
+        with self.assertRaisesRegex(
+            ValueError,
+            "Expected process_group to be passed in"
+        ):
+            model = FSDP(
+                model,
+                auto_wrap_policy=pol,
+                process_group=self.process_group,
+                sharding_strategy=ShardingStrategy.HYBRID_SHARD
+            )
+
+    # TODO - add test for ZeRO-2 style sharding ensure params are not
+    # resharded after forward.
+
+    @skip_if_lt_x_gpu(2)
+    def test_fsdp_hybrid_shard_basic_setup(self):
+        """
+        Tests basic functionality of HYBRID_SHARD and _HYBRID_SHARD_ZERO2:
+            1. Inter and intra-node process groups are correctly setup
+            2. Process groups are the same across FSDP wrapped instances
+            3. reduce_scatter and allreduce called the expected no. of times
+        """
+        for sharding_strategy in [
+            ShardingStrategy.HYBRID_SHARD, ShardingStrategy._HYBRID_SHARD_ZERO2
+        ]:
+            with self.subTest(sharding_strategy=sharding_strategy):
+                auto_wrap_policy = ModuleWrapPolicy(
+                    {TransformerEncoderLayer, TransformerDecoderLayer},
+                )
+                fsdp_kwargs = {
+                    "auto_wrap_policy": auto_wrap_policy,
+                    "device_id": torch.cuda.current_device(),
+                    "sharding_strategy": sharding_strategy,
+                }
+                fsdp_model = TransformerWithSharedParams.init(
+                    self.process_group,
+                    FSDPInitMode.RECURSIVE,
+                    CUDAInitMode.CUDA_BEFORE,
+                    fsdp_kwargs,
+                )
+                # All FSDP modules should have state.process_group as the process group over which to
+                # shard (default process group), and state._inter_node_pg (process group containing only
+                # this rank)
+                intra_node_pgs = set()
+                inter_node_pgs = set()
+                for mod in fsdp_model.fsdp_modules(fsdp_model):
+                    # process_group should be across the node, which is just the
+                    # whole world here.
+                    self.assertEqual(
+                        dist.get_world_size(mod.process_group),
+                        dist.get_world_size(self.process_group)
+                    )
+                    intra_node_pgs.add(mod.process_group)
+                    inter_node_pg = mod._inter_node_pg
+                    inter_node_pgs.add(inter_node_pg)
+                    self.assertEqual(1, dist.get_world_size(inter_node_pg))
+                    self.assertFalse(_rank_not_in_group(inter_node_pg))
+                    self.assertEqual(
+                        sharding_strategy, mod.sharding_strategy
+                    )
+                # All fsdp modules should share the same process groups
+                self.assertEqual(1, len(intra_node_pgs))
+                self.assertEqual(1, len(inter_node_pgs))
+
+                orig_ar = dist.all_reduce
+                orig_rs = dist.reduce_scatter_tensor
+
+                def patched_collective(orig_collective, counter, *args, **kwargs):
+                    counter[orig_collective] += 1
+                    return orig_collective(*args, **kwargs)
+
+                cntr = Counter()
+                patched_allreduce = partial(patched_collective, orig_ar, cntr)
+                patched_reduce_scatter = partial(patched_collective, orig_rs, cntr)
+                with (
+                    patch_allreduce(patched_allreduce),
+                    patch_reduce_scatter(patched_reduce_scatter),
+                ):
+                    inp = fsdp_model.get_input(device=torch.cuda.current_device())
+                    out = fsdp_model(inp[0], inp[1])
+                    loss = fsdp_model.get_loss(inp, out)
+                    loss.backward()
+
+                num_flat_params = len(list(FSDP._fsdp_handles(fsdp_model)))
+                self.assertEqual(num_flat_params, cntr[orig_ar])
+                self.assertEqual(num_flat_params, cntr[orig_rs])
+                dist.barrier()
+
+instantiate_parametrized_tests(TestFSDPHybridShard)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torch/distributed/_composable/fully_shard.py b/torch/distributed/_composable/fully_shard.py
index f145de1e20a36..184a9aa95d521 100644
--- a/torch/distributed/_composable/fully_shard.py
+++ b/torch/distributed/_composable/fully_shard.py
@@ -50,7 +50,7 @@ def fully_shard(
         raise ValueError(f"Expects an `_FSDPPolicy` but got {policy}")
     state = fully_shard.state(module)
     state = _init_ignored_module_states(state, module, ignored_modules)
-    state = _init_process_group_state(state, process_group)
+    state = _init_process_group_state(state, process_group, ShardingStrategy.FULL_SHARD, policy)
     limit_all_gathers = True
     use_orig_params = True
     backward_prefetch_limit = 1
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index 4d343bcffec38..810d20c292a59 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -3543,7 +3543,7 @@ def new_subgroups(
         group_size = torch.cuda.device_count()
     world_size = get_world_size()
     if world_size < group_size:
-        raise ValueError("The arg 'group_size' must not exceed the world size")
+        raise ValueError(f"The arg 'group_size' ({group_size}) must not exceed the world size ({world_size})")
     if world_size % group_size != 0:
         raise ValueError("The world size must be divisible by 'group_size'")
 
diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py
index d0718a70b9950..f1713d1d7b347 100644
--- a/torch/distributed/fsdp/_init_utils.py
+++ b/torch/distributed/fsdp/_init_utils.py
@@ -1,6 +1,7 @@
 import collections
 import warnings
 from typing import (
+    Any,
     Callable,
     Dict,
     Generator,
@@ -59,12 +60,24 @@
 
 PARAM_BROADCAST_BUCKET_SIZE = int(250 * 1024 * 1024)
 FSDP_SYNCED = "_fsdp_synced"
+# Specification of process groups for hybrid sharding strategies.
+HybridShardProcessGroupType = Tuple[dist.ProcessGroup, dist.ProcessGroup]
+# Overall specification of process group.
+ProcessGroupType = Optional[Union[dist.ProcessGroup, HybridShardProcessGroupType]]
+
 
 # TODO (awgu): Refactor this later
 SHARDING_STRATEGY_MAP = {
     ShardingStrategy.NO_SHARD: HandleShardingStrategy.NO_SHARD,
     ShardingStrategy.FULL_SHARD: HandleShardingStrategy.FULL_SHARD,
     ShardingStrategy.SHARD_GRAD_OP: HandleShardingStrategy.SHARD_GRAD_OP,
+    ShardingStrategy.HYBRID_SHARD: HandleShardingStrategy.HYBRID_SHARD,
+    ShardingStrategy._HYBRID_SHARD_ZERO2: HandleShardingStrategy._HYBRID_SHARD_ZERO2,
+}
+
+HYBRID_SHARDING_STRATEGIES = {
+    ShardingStrategy.HYBRID_SHARD,
+    ShardingStrategy._HYBRID_SHARD_ZERO2,
 }
 
 
@@ -75,13 +88,131 @@
 @no_type_check
 def _init_process_group_state(
     state: _FSDPState,
-    process_group: Optional[dist.ProcessGroup],
+    process_group: ProcessGroupType,
+    sharding_strategy: ShardingStrategy,
+    policy: Optional[_FSDPPolicy],
 ) -> _FSDPState:
-    state.process_group = process_group or _get_default_group()
+    if sharding_strategy in HYBRID_SHARDING_STRATEGIES:
+        if process_group is None and policy is None:
+            # Raise an error here, since this is manual wrapping with no process group
+            # passed in, there is no way to ensure all wrapped FSDP instances use the same
+            # process groups.
+            raise ValueError(
+                f"Manual wrapping with {sharding_strategy} requires explicit specification of process group."
+            )
+        else:
+            state = _init_process_group_state_for_hybrid_shard(state, process_group)
+            assert state.process_group is not None, "Expected to populate state.process_group for hybrid shard"
+            assert state._inter_node_pg is not None, "Expected to populate state._inter_node_pg for hybrid shard"
+            assert state._inter_node_state is not None, "Expected to populate state._inter_node_state for hybrid shad."
+    else:
+        state.process_group = process_group if process_group is not None else _get_default_group()
+
     state.rank = state.process_group.rank()
     state.world_size = state.process_group.size()
+
     return state
 
+@no_type_check
+def _init_process_group_state_for_hybrid_shard(state: _FSDPState, process_group) -> _FSDPState:
+    if process_group is None:
+        default_group = _get_default_group()
+        intra_node_group, inter_node_group = _init_intra_and_inter_node_groups(default_group)
+        # we shard across intra-node
+        state.process_group = intra_node_group
+        # save _inter_node_pg to allreduce across.
+        state._inter_node_pg = inter_node_group
+    else:
+        # Check type and assign state.process_group and state._inter_node_pg.
+        if _is_valid_hybrid_shard_pg_type(process_group):
+            # Assuming that user passed in as intra node group and inter node group
+            # as documented.
+            state.process_group, state._inter_node_pg = process_group
+        else:
+            raise ValueError(
+                "Expected process_group to be passed in as either None or "
+                f"Tuple[dist.ProcessGroup, dist.ProcessGroup] but got {type(process_group)}"
+            )
+    # Create state for allreduce
+    state._inter_node_state = _get_default_comm_hook_state(
+        process_group=state._inter_node_pg,
+    )
+    return state
+
+@no_type_check
+def _is_valid_hybrid_shard_pg_type(process_group: Any) -> bool:
+    return (
+        isinstance(process_group, tuple)
+        and len(process_group) == 2
+        and all(isinstance(pg, dist.ProcessGroup) for pg in process_group)
+    )
+
+@no_type_check
+def _init_intra_node_process_group() -> dist.ProcessGroup:
+    """
+    Returns a process group across the current node.
+    For example, given each row is a distinct node:
+    0 1 2 3 4 5 6 7 8
+    9 10 11 12 13 14 15
+    This API would return an intra-node subgroup across
+    [0, 7] or [8, 15] depending on the process's rank.
+    For example, rank 3 would get [0, 7].
+    """
+    intra_node_subgroup, _ = dist.new_subgroups()
+    return intra_node_subgroup
+
+@no_type_check
+def _init_inter_node_process_group(
+    global_process_group: dist.ProcessGroup
+) -> dist.ProcessGroup:
+    """
+    Returns an inter-node process group where each contained rank has
+    the same local rank. For example, given each column is a distinct node:
+    0 1 2 3 4 5 6 7 8
+    9 10 11 12 13 14 15
+    This API would return inter-node process group {0, 8}, {1, 9}, {2, 10}, and so forth
+    depending on the process's rank. For example, rank 1 would get {1, 9}, rank 5
+    would get {5, 13}.
+    """
+    # the inter-node pg that is returned
+    inter_node_pg = None
+    sharding_backend = dist.get_backend(global_process_group)
+    world_size = dist.get_world_size(global_process_group)
+    # Assuming fully homogeneous setup
+    num_devices = torch.cuda.device_count()
+    num_nodes = world_size // num_devices
+    my_local_rank = dist.get_rank(global_process_group) % num_devices
+    for local_rank in range(num_devices):
+        ranks_for_inter_group = [
+            local_rank + (i * num_devices) for i in range(num_nodes)
+        ]
+        # every rank always needs to call dist.new_group
+        grp = dist.new_group(
+            ranks=ranks_for_inter_group, backend=sharding_backend
+        )
+        if local_rank == my_local_rank:
+            print(f"{local_rank} created process group for {ranks_for_inter_group}")
+            inter_node_pg = grp
+
+    assert inter_node_pg is not None, f"{my_local_rank} expected to assign inter-node pg, but did not"
+    return inter_node_pg
+
+def _init_intra_and_inter_node_groups(
+    global_process_group: dist.ProcessGroup,
+) -> Tuple[dist.ProcessGroup, dist.ProcessGroup]:
+    """
+    Initializes intra and inter-node process groups and returns the ones corresponding
+    to this process's rank.
+    This function can be used to initialize process groups for ``HYBRID_SHARD`` or
+    ``_HYBRID_SHARD_ZERO2`` in FSDP.
+    This function assumes each node has an equal number of CUDA-enabled devices.
+    Returns:
+        Tuple[dist.ProcessGroup, dist.ProcessGroup]: Intra and inter-node process group.
+    """
+    return (
+        _init_intra_node_process_group(),
+        _init_inter_node_process_group(global_process_group),
+    )
 
 @no_type_check
 def _init_ignored_module_states(
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 495d13e12b953..3b86437517045 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -6,7 +6,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd import Variable
-from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS
+from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS, default_hooks
 from torch.distributed.fsdp._common_utils import (
     _all_handles,
     _assert_in_training_states,
@@ -14,6 +14,7 @@
     _is_composable,
     TrainingState,
 )
+from torch.distributed.fsdp._init_utils import HYBRID_SHARDING_STRATEGIES
 from torch.distributed.fsdp._utils import (
     _apply_to_tensors,
     _no_dispatch_record_stream,
@@ -29,6 +30,43 @@
 )
 from torch.distributed.utils import _to_kwargs
 
+RESHARD_AFTER_FORWARD_STRATEGIES = {
+    HandleShardingStrategy.FULL_SHARD,
+    HandleShardingStrategy.HYBRID_SHARD,
+}
+
+@no_type_check
+def _validate_hybrid_shard_setup(fsdp_root: _FSDPState, fsdp_module: nn.Module):
+    """
+    Performs validation that hybrid sharding strategy is setup. In particular, we:
+    1) Ensure root and passed in FSDP module have the same hybrid sharding strategy,
+    i.e. both should be using the same hybrid shard strategy or no hybrid shard at all.
+    2) Ensure that inter and intra-node process groups are the same across root and
+    this FSDP module.
+    """
+    if (
+        fsdp_root.sharding_strategy in HYBRID_SHARDING_STRATEGIES
+        or fsdp_module.sharding_strategy in HYBRID_SHARDING_STRATEGIES
+    ):
+        if fsdp_root.sharding_strategy != fsdp_module.sharding_strategy:
+            raise ValueError(
+                "When using hybrid sharding strategy, expect sharding strategies"
+                f" to be the same, but got {fsdp_root.sharding_strategy} vs {fsdp_module.sharding_strategy}"
+            )
+
+        # Ensure inter and intra-node process groups are the same
+        # TODO (rohan-varma) unclear whether these should be asserts or Exceptions
+        # as they can happen due to bug in FSDP process group setup or user passing in
+        # incorrect configuration.
+        if fsdp_root.process_group != fsdp_module.process_group:
+            raise ValueError(
+                f"For {fsdp_root.sharding_strategy} intra-node process groups do not match"
+            )
+
+        if fsdp_root._inter_node_pg != fsdp_module._inter_node_pg:
+            raise ValueError(
+                f"For {fsdp_root.sharding_strategy}, inter-node process groups do not match"
+            )
 
 @no_type_check
 def _lazy_init(
@@ -71,6 +109,14 @@ def _lazy_init(
     for fsdp_module in state.fsdp_modules(root_module):
         if fsdp_module is root_module:
             continue
+
+        if fsdp_module.sharding_strategy in HYBRID_SHARDING_STRATEGIES:
+            _validate_hybrid_shard_setup(state, fsdp_module)
+            # Share the allreduce state across FSDP units. This is not strictly necessary
+            # as each one already uses the same process group, but can slightly save memory
+            # since other FSDP units allreduce state can be garbage collected.
+            fsdp_module._inter_node_state = state._inter_node_state
+
         # Relax the assert for non-root FSDP instances in case the nested
         # initialized module is wrapped again in FSDP later (e.g. after
         # training to run inference)
@@ -305,9 +351,11 @@ def _post_forward_reshard(
     # Do not free the root's parameters in the post-forward for `FULL_SHARD`
     # with the intention that they are immediately used for backward
     # computation (though this may not be true)
+
+
     free_unsharded_flat_params = [
         not state._is_root
-        and handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD
+        and handle._config.sharding_strategy in RESHARD_AFTER_FORWARD_STRATEGIES
         for handle in handles
     ]
     _reshard(state, handles, free_unsharded_flat_params)
@@ -553,6 +601,15 @@ def _post_backward_hook(
                     padded_unsharded_grad,
                     new_sharded_grad,
                 )
+                if handle._config.sharding_strategy in (
+                    HandleShardingStrategy.HYBRID_SHARD,
+                    HandleShardingStrategy._HYBRID_SHARD_ZERO2
+                ):
+                    default_hooks.allreduce_hook(
+                        state=state._inter_node_state,
+                        grad=new_sharded_grad,
+                    )
+
                 _cast_grad_to_param_dtype(state, handle, new_sharded_grad, param)
 
                 # Save the sharded gradient in `_saved_grad_shard` to support
@@ -614,9 +671,14 @@ def _should_free_in_backward(
     Returns whether FSDP should free the unsharded flattened parameter in the
     post-backward or not.
     """
-    return (
-        state._sync_gradients and handle.uses_sharded_strategy
-    ) or handle._config.sharding_strategy == HandleShardingStrategy.FULL_SHARD
+    # We always free if we are syncing gradients (i.e. not in no_sync) and parameters
+    # are sharded.
+    free_unsharded = state._sync_gradients and handle.uses_sharded_strategy
+    # For NO_SHARD we don't need to free full parameters, for ZeRO-2 strategies, we skip
+    # freeing in backward.
+    return free_unsharded or (
+        handle._config.sharding_strategy in RESHARD_AFTER_FORWARD_STRATEGIES
+    )
 
 
 @no_type_check
diff --git a/torch/distributed/fsdp/api.py b/torch/distributed/fsdp/api.py
index 18f3cd3069ddf..4fd3699b65c7d 100644
--- a/torch/distributed/fsdp/api.py
+++ b/torch/distributed/fsdp/api.py
@@ -46,12 +46,21 @@ class ShardingStrategy(Enum):
       :class:`DistributedDataParallel` API. For gradients, this strategy
       synchronizes them (via all-reduce) after the backward computation. The
       unsharded optimizer states are updated locally per rank.
+    - ``HYBRID_SHARD``: Apply ``FULL_SHARD`` within a node, and replicate parameters across
+        nodes. This results in reduced communication volume as expensive all-gathers and
+        reduce-scatters are only done within a node, which can be more performant for medium
+        -sized models.
+    - ``_HYBRID_SHARD_ZERO2``: Apply ``SHARD_GRAD_OP`` within a node, and replicate parameters across
+        nodes. This is like ``HYBRID_SHARD``, except this may provide even higher throughput
+        since the unsharded parameters are not freed after the forward pass, saving the
+        all-gathers in the pre-backward.
     """
 
     FULL_SHARD = auto()
     SHARD_GRAD_OP = auto()
     NO_SHARD = auto()
-    # HYBRID_SHARD = auto()
+    HYBRID_SHARD = auto()
+    _HYBRID_SHARD_ZERO2 = auto()
 
 
 class BackwardPrefetch(Enum):
diff --git a/torch/distributed/fsdp/flat_param.py b/torch/distributed/fsdp/flat_param.py
index 0025a200dffa0..0502a38cc4729 100644
--- a/torch/distributed/fsdp/flat_param.py
+++ b/torch/distributed/fsdp/flat_param.py
@@ -104,6 +104,8 @@ class HandleShardingStrategy(Enum):
     FULL_SHARD = auto()
     SHARD_GRAD_OP = auto()
     NO_SHARD = auto()
+    HYBRID_SHARD = auto()
+    _HYBRID_SHARD_ZERO2 = auto()
 
 
 @dataclass
diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index d70146ba0665c..de17d12fde470 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -22,7 +22,6 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from torch.distributed import ProcessGroup
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     _CHECKPOINT_WRAPPED_MODULE,
     ActivationWrapper,
@@ -46,6 +45,8 @@
     _init_process_group_state,
     _init_runtime_state,
     _init_state_dict_state,
+    HYBRID_SHARDING_STRATEGIES,
+    ProcessGroupType,
 )
 from torch.distributed.fsdp._runtime_utils import (
     _lazy_init,
@@ -187,8 +188,12 @@ class FullyShardedDataParallel(nn.Module):
     Args:
         module (nn.Module):
             This is the module to be wrapped with FSDP.
-        process_group (Optional[ProcessGroup]):
-            This is the process group used for collective communications.
+        process_group: Optional[Union[ProcessGroup, Tuple[ProcessGroup, ProcessGroup]]]
+            This is the process group used for collective communications and
+            the one over which the model is sharded. For hybrid sharding strategies such as
+            ``ShardingStrategy.HYBRID_SHARD`` or ``ShardingStrategy._HYBRID_SHARD_ZERO2``, users can
+            pass in a tuple of process groups representing the groups to shard and replicate across,
+            respectively.
         sharding_strategy (Optional[ShardingStrategy]):
             This configures the sharding strategy used by FSDP, which may trade
             off memory saving and communication overhead. See
@@ -312,7 +317,7 @@ class FullyShardedDataParallel(nn.Module):
     def __init__(
         self,
         module: nn.Module,
-        process_group: Optional[ProcessGroup] = None,
+        process_group: ProcessGroupType = None,
         sharding_strategy: Optional[ShardingStrategy] = None,
         cpu_offload: Optional[CPUOffload] = None,
         auto_wrap_policy: Optional[Union[Callable, _FSDPPolicy]] = None,
@@ -333,6 +338,12 @@ def __init__(
         # Add module annotations for Dynamo support (see function for details)
         _annotate_modules_for_dynamo(module, self._ignored_modules, use_orig_params)
 
+        # Initializes self.process_group, along with rank and world size. This will
+        # also set another attribute, _inter_node_pg, to control the process group
+        # over which sharding occurs, if sharding_strategy is {HYBRID_SHARD, _HYBRID_SHARD_ZERO2}.
+        # Note that this is done before auto_wrapping, so that child FSDP modules simply pick up
+        # the same process group state as the root FSDP module.
+        _init_process_group_state(self, process_group, sharding_strategy, auto_wrap_policy)
         if auto_wrap_policy is not None:
             auto_wrap_kwargs = {
                 "module": module,
@@ -355,9 +366,14 @@ def __init__(
                 "limit_all_gathers": limit_all_gathers,
                 "use_orig_params": use_orig_params,
             }
+            if sharding_strategy in HYBRID_SHARDING_STRATEGIES:
+                # Share root process groups with children to maintain
+                # the invariant that all FSDP modules will have the same
+                # process groups.
+                fsdp_kwargs["process_group"] = (self.process_group, self._inter_node_pg)
+
             _auto_wrap(auto_wrap_kwargs, fsdp_kwargs, FullyShardedDataParallel)
 
-        _init_process_group_state(self, process_group)
         backward_prefetch_limit = 1
         forward_prefetch_limit = 1
         _init_core_state(
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py
index 2fd9957e6e202..37a96f6754141 100644
--- a/torch/testing/_internal/common_fsdp.py
+++ b/torch/testing/_internal/common_fsdp.py
@@ -290,11 +290,16 @@ def init(
                 across constructions.
             add_bn (bool): Whether to include batch norm in the model.
         """
+
         if fsdp_kwargs is None:
             fsdp_kwargs = {}
         if fsdp_init_mode == FSDPInitMode.NO_FSDP:
+            if isinstance(group, tuple):
+                pg = group[0]
+            else:
+                pg = group
             return TransformerWithSharedParams(
-                group, cuda_init_mode, add_bn, deterministic
+                pg, cuda_init_mode, add_bn, deterministic
             )
         elif fsdp_init_mode == FSDPInitMode.RECURSIVE:
             # Default to the `ModuleWrapPolicy`
@@ -307,11 +312,28 @@ def init(
                 )
             else:
                 auto_wrap_policy = fsdp_kwargs.pop("auto_wrap_policy")
+
+            if (
+                "sharding_strategy" in fsdp_kwargs
+                and fsdp_kwargs["sharding_strategy"] in {
+                    ShardingStrategy.HYBRID_SHARD,
+                    ShardingStrategy._HYBRID_SHARD_ZERO2
+                } and not isinstance(group, tuple)
+            ):
+                fsdp_pg = None
+            else:
+                fsdp_pg = group
+
+            if isinstance(group, tuple):
+                tformer_pg = group[0]
+            else:
+                tformer_pg = group
+
             fsdp_model = FSDP(
                 TransformerWithSharedParams(
-                    group, cuda_init_mode, add_bn, deterministic
+                    tformer_pg, cuda_init_mode, add_bn, deterministic
                 ),
-                group,
+                fsdp_pg,
                 auto_wrap_policy=auto_wrap_policy,
                 **fsdp_kwargs,
             )
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 814dd3d5ad5f8..23f27323a2d27 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -900,7 +900,7 @@ def test_new_subgroups(self):
         @skip_if_no_gpu
         def test_new_subgroups_group_size_exceeds_world_size(self):
             with self.assertRaisesRegex(
-                ValueError, "The arg 'group_size' must not exceed the world size"
+                ValueError, "must not exceed"
             ):
                 dist.new_subgroups(100)
 

From 0802ef88a6726310ec3b192b2bda52e3cc50cfcd Mon Sep 17 00:00:00 2001
From: Ankur Verma <31362771+ankurvdev@users.noreply.github.com>
Date: Thu, 8 Dec 2022 16:18:36 +0000
Subject: [PATCH 1729/1922] Remove TORCH_API from inline
 at::internal::lazy_init_num_thread (#89511)

The function signature in its current state is ambiguous.
Its an inline function that is also declared to be imported from the DLL.
which leaves it subject to compilers decision to choose one or the other and depending on what the compiler/linker may choose we may get one of the two behaviors for the `aten::init_num_threads` call:

1. Once-per-dll-in-a-thread (if its inlined)
2. Once-per-thread (if its imported)

I suspect once-per-dll-in-a-thread is already the case currently because it being tagged inline
So removing the inline will simply make it a little more consistent and clear.

The function exists to avoid repeated calls to aten::init_num_threads.
Being in an "internal" namespace, the function isnt expected to be called by external plugins which means that the "once-per-dll-in-a-thread" behavior isn't that much of a problem anyway

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89511
Approved by: https://github.com/malfet
---
 aten/src/ATen/Parallel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h
index 4693997624e98..ff14f568d22a6 100644
--- a/aten/src/ATen/Parallel.h
+++ b/aten/src/ATen/Parallel.h
@@ -29,7 +29,7 @@ TORCH_API bool in_parallel_region();
 namespace internal {
 
 // Initialise num_threads lazily at first parallel call
-inline TORCH_API void lazy_init_num_threads() {
+inline void lazy_init_num_threads() {
   thread_local bool init = false;
   if (C10_UNLIKELY(!init)) {
     at::init_num_threads();

From 8ded2f1e6b128a5d6872c867125608bf112c491a Mon Sep 17 00:00:00 2001
From: Bin Bao <binbao@fb.com>
Date: Wed, 7 Dec 2022 22:11:56 +0000
Subject: [PATCH 1730/1922] [inductor] Fallback for index with None in the
 middle of indices (#90022)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90022
Approved by: https://github.com/ngimel
---
 test/inductor/test_torchinductor.py | 13 ++++++++++
 torch/_inductor/lowering.py         | 37 ++++++++++++++++++++++-------
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index a2a1cb096c12a..5ea874b0fdb80 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -3057,6 +3057,19 @@ def fn(a, b):
             ),
         )
 
+    def test_index3(self):
+        def fn(x, ia, ib):
+            return (x[:, ia, None, ib, 0],)
+
+        self.common(
+            fn,
+            (
+                torch.randn(3, 4, 4, 4, 3),
+                torch.tensor([0, 2, 1], dtype=torch.int64),
+                torch.tensor([0, 2, 1], dtype=torch.int64),
+            ),
+        )
+
     def test_index_select(self):
         def fn(a, b):
             return (
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index 397aa9408c9a0..b1c99ba23a133 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -1774,19 +1774,26 @@ def fn(idx):
     )
 
 
-def check_and_broadcast_indices(indices):
+def check_and_broadcast_indices(indices, device):
     assert all(
         i.get_dtype() in (torch.int64, torch.int32, torch.bool, torch.uint8)
         for i in indices
         if i is not None
     ), f"indices must be int64, byte or bool. Got {[i.get_dtype() for i in indices if i is not None]}"
-    assert all(
-        [i.get_dtype() in (torch.int32, torch.int64) for i in indices if i is not None]
-    ), "bool indices are not supported yet"
+    if any(
+        i.get_dtype() in (torch.bool, torch.uint8) for i in indices if i is not None
+    ):
+        raise NotImplementedError("Fallback for bool indices")
+
     valid_idxs = [i for i, x in enumerate(indices) if isinstance(x, TensorBox)]
     assert len(valid_idxs) > 0, "requires at least 1 non-None index"
     new_indices = [None] * len(indices)
     for i, x in zip(valid_idxs, broadcast_tensors(*[indices[i] for i in valid_idxs])):
+        # Eager allows indices to be CPU tensor when running on CUDA
+        # FIXME: Calling to_device(x, device) should work but
+        # test_advancedindex_mixed_cpu_devices still fails
+        if x.get_device() != device:
+            raise NotImplementedError("Fallback when indices is on a different device")
         new_indices[i] = x
         output_dim = len(x.get_size())
     start_offset = 0
@@ -1797,9 +1804,10 @@ def check_and_broadcast_indices(indices):
     while tmp and tmp[0] is None:
         tmp.pop(0)
         start_offset += 1
-    assert all((i is not None) for i in tmp)
-    end_offset = output_dim + start_offset
+    if any((i is None) for i in tmp):
+        raise NotImplementedError("Fallback when None is in the middle of indices")
 
+    end_offset = output_dim + start_offset
     return new_indices, start_offset, end_offset
 
 
@@ -1807,10 +1815,18 @@ def check_and_broadcast_indices(indices):
 def index(x, indices):
     assert isinstance(indices, (list, tuple))
     x_loader = x.make_loader()
-    indices, start_offset, end_offset = check_and_broadcast_indices(indices)
+    try:
+        indices, start_offset, end_offset = check_and_broadcast_indices(
+            indices, x.get_device()
+        )
+    except NotImplementedError:
+        x.realize()
+        return fallback_handler(aten.index)(x, indices)
+
     indices_sizes = [i.get_size() for i in indices if i is not None]
     indices_loaders = [i.make_loader() for i in indices if i is not None]
     # no guards on output size, all the guards are set in broadcast_tensors
+
     output_size = list(indices_sizes[0])
 
     x_size = x.get_size()
@@ -1891,7 +1907,12 @@ def index_put_(self, indices, values, accumulate=False):
         return self
 
     values = to_dtype(values, self.get_dtype())
-    indices, start_offset, end_offset = check_and_broadcast_indices(indices)
+    try:
+        indices, start_offset, end_offset = check_and_broadcast_indices(
+            indices, self.get_device()
+        )
+    except NotImplementedError:
+        return index_put_fallback(self, indices, values, accumulate)
     indices_sizes = [i.get_size() for i in indices if i is not None]
     indices_loaders = [i.make_loader() for i in indices if i is not None]
 

From a870dfb671ba3860afc76a010a9d69d017a4201d Mon Sep 17 00:00:00 2001
From: Driss Guessous <drisspg@fb.com>
Date: Thu, 8 Dec 2022 17:02:06 +0000
Subject: [PATCH 1731/1922] Add support to foreach torch empty for bfloat16s
 (#90437)

# Summary
When training a model with SGD(..., foreach=true) found that bfloat16 model was erroring with no cuda support for empty.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90437
Approved by: https://github.com/soumith
---
 aten/src/ATen/native/cuda/ForeachUnaryOp.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
index 09a29e0c62db3..29b2a07a82441 100644
--- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
@@ -317,7 +317,7 @@ void foreach_tensor_zero_cuda_(TensorList tensors) {
     std::vector<std::vector<at::Tensor>> tensor_lists;
     tensor_lists.emplace_back(tensors.vec());
 
-    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_zero_cuda_", [&]() {
+    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_zero_cuda_", [&]() {
         multi_tensor_apply<1>(tensor_lists,
                               ZeroFunctor<scalar_t,
                                           /* depth */ 1,

From 456508d6f834748a89924737c8c3d4c7f6d9a28b Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 8 Dec 2022 17:12:10 +0000
Subject: [PATCH 1732/1922] Revert "Fix issue 38095 TODO in
 test_multiprocessing.py (#90335)"

This reverts commit cbb2d5af81dcfaf181db7e9083b9c41b29fdb4eb.

Reverted https://github.com/pytorch/pytorch/pull/90335 on behalf of https://github.com/clee2000 due to somehow caused test_multiprocessing to timeout https://hud.pytorch.org/pytorch/pytorch/commit/cbb2d5af81dcfaf181db7e9083b9c41b29fdb4eb https://github.com/pytorch/pytorch/actions/runs/3645873711/jobs/6159998523
---
 test/test_multiprocessing.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py
index dba249e7de7d0..ae0d87be216a1 100644
--- a/test/test_multiprocessing.py
+++ b/test/test_multiprocessing.py
@@ -418,7 +418,8 @@ def test_cuda_memory_allocation(self):
         t = []
         for _ in range(5):
             t.append(q.get())
-        self.assertEqual(t[0], torch.full([5], 0.))
+        # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
+        self.assertEqualIgnoreType(t[0], torch.full([5], 0.))
         del t
         e.set()
         p.join(1)
@@ -641,7 +642,7 @@ def _test_event_handle_importer_consumer(handle, p2c, c2p):
         c2p.put(0)  # notify parent child is ready
         p2c.get()  # wait for record in parent
         e1.synchronize()
-        c2p.put(1)  # notify synchronization is done in child
+        c2p.put(1)  # nofity synchronization is done in child
         p2c.get()  # wait for parent to finish before destructing child event
 
     @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \

From 3a55b5a1e0be5d6532d897d749f77886bd9bd482 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <dvieriu@apple.com>
Date: Thu, 8 Dec 2022 17:59:55 +0000
Subject: [PATCH 1733/1922] Add additional tests for view slice tensors
 (#86282)

Fixes https://github.com/pytorch/pytorch/issues/83995 and https://github.com/pytorch/pytorch/issues/84489

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86282
Approved by: https://github.com/kulinseth
---
 test/test_mps.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/test/test_mps.py b/test/test_mps.py
index 5b45bd4e6e692..4133ad4938b83 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1319,6 +1319,31 @@ def test_expand_cpu_to_mps_copy(self):
 
         self.assertEqual(x_cpu, x.cpu())
 
+    def test_view_slice(self):
+        # https://github.com/pytorch/pytorch/issues/83995
+        NUM_SAMPLES = 60
+        s = (0, 1)
+
+        X = torch.rand(8000, 3, dtype=torch.float32, device='cpu')
+        X_mps = X.detach().clone().to("cpu")
+
+        idx = torch.randint(0, X.shape[0], (1,)).repeat(len(s))
+        pts = torch.randint(0, X.shape[0], (NUM_SAMPLES, X.shape[1]))
+        idx_mps = idx.to("mps")
+        pts_mps = pts.to("mps")
+        pts[:, s] = idx
+        pts_mps[:, s] = idx_mps
+
+        actual_pts = torch.zeros(NUM_SAMPLES, X.shape[1], dtype=torch.float)
+        actual_pts_mps = torch.zeros(NUM_SAMPLES, X.shape[1], dtype=torch.float, device="mps")
+
+        for i in range(NUM_SAMPLES):
+            for j in range(X.shape[1]):
+                actual_pts_mps[i, j] = X_mps[pts_mps[i, j], j]
+                actual_pts[i, j] = X[pts[i, j], j]
+                self.assertEqual(actual_pts[i, j], actual_pts_mps[i, j])
+
+
     def test_slice(self):
         values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
         cpu_x = torch.tensor(values, device='cpu')

From 322cca254d231a0a743346fc58f2ad115f5b81d5 Mon Sep 17 00:00:00 2001
From: Jiewen Tan <jwtan@google.com>
Date: Thu, 8 Dec 2022 18:23:17 +0000
Subject: [PATCH 1734/1922] [LTC] Tweak LazyTensor Class for XLATensor (#90363)

Summary:
This pull request makes some tweaks on LazyTensor class such that it's easier for XLATensor to inherit.

1. It replaces data_ptr() with data() which now returns a const shared_ptr& type.
2. It adds a temporary ctor to LazyTensor::Data such that XLATensor::Data can easily inherits it.
3. It moves LazyTensor(std::shared_ptr<Data>) and SetTensorData(at::Tensor) to protected for XLATensor to access.

Test Plan:
CI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90363
Approved by: https://github.com/JackCaoG
---
 .github/ci_commit_pins/xla.txt  |  2 +-
 torch/csrc/lazy/core/tensor.cpp | 10 +++++-----
 torch/csrc/lazy/core/tensor.h   | 18 ++++++++++--------
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt
index 6e8049d330f4d..204ebba1034a0 100644
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@@ -1 +1 @@
-640a5b8a8abba13be7fad286a5bbd30a5e024164
+b55aec841b9cf680b04abefaf3c0197a51de8b08
diff --git a/torch/csrc/lazy/core/tensor.cpp b/torch/csrc/lazy/core/tensor.cpp
index 734dc5fdbd9ac..a7890fc3e0635 100644
--- a/torch/csrc/lazy/core/tensor.cpp
+++ b/torch/csrc/lazy/core/tensor.cpp
@@ -36,21 +36,21 @@ LazyTensorPtr LazyTensor::Create(
   TORCH_CHECK(tensor.device().type() != at::kLazy);
   LazyTensorPtr lazy_tensor =
       c10::make_intrusive<LazyTensor>(LazyTensor(tensor, device));
-  LazyGraphExecutor::Get()->RegisterTensor(lazy_tensor->data_ptr());
+  LazyGraphExecutor::Get()->RegisterTensor(lazy_tensor->data());
   return lazy_tensor;
 }
 
 LazyTensorPtr LazyTensor::Create(Value ir_value, const BackendDevice& device) {
   LazyTensorPtr lazy_tensor =
       c10::make_intrusive<LazyTensor>(LazyTensor(std::move(ir_value), device));
-  LazyGraphExecutor::Get()->RegisterTensor(lazy_tensor->data_ptr());
+  LazyGraphExecutor::Get()->RegisterTensor(lazy_tensor->data());
   return lazy_tensor;
 }
 
 LazyTensorPtr LazyTensor::Create(BackendDataPtr handle) {
   LazyTensorPtr lazy_tensor =
       c10::make_intrusive<LazyTensor>(LazyTensor(std::move(handle)));
-  LazyGraphExecutor::Get()->RegisterTensor(lazy_tensor->data_ptr());
+  LazyGraphExecutor::Get()->RegisterTensor(lazy_tensor->data());
   return lazy_tensor;
 }
 
@@ -71,9 +71,9 @@ LazyTensor::LazyTensor(Value ir_value, const BackendDevice& device)
 
 LazyTensor::LazyTensor(std::shared_ptr<Data> data) : data_(std::move(data)) {}
 
-LazyTensor::Data* LazyTensor::data() const {
+auto LazyTensor::data() const -> const std::shared_ptr<Data>& {
   TORCH_CHECK(data_ != nullptr, "Trying to access a null cursor");
-  return data_.get();
+  return data_;
 }
 
 int64_t LazyTensor::size(int64_t dim) const {
diff --git a/torch/csrc/lazy/core/tensor.h b/torch/csrc/lazy/core/tensor.h
index c58b773e07b1f..5c1bee431c180 100644
--- a/torch/csrc/lazy/core/tensor.h
+++ b/torch/csrc/lazy/core/tensor.h
@@ -37,6 +37,10 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
         : tensor_data(std::move(tensor_data)),
           device(std::move(device)),
           unique_id(GetNextTensorId()) {}
+    // TODO(alanwaketan): Remove this ctor. This is a
+    // temporary ctor to ease XLA LTC migration.
+    Data(BackendDevice device)
+        : device(std::move(device)), unique_id(GetNextTensorId()) {}
 
     ~Data();
 
@@ -82,7 +86,7 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
   void UpdateFromTensorOut(at::Tensor tensor);
   void UpdateFromTensorOut(const LazyTensorPtr& tensor);
 
-  Data* data() const;
+  const std::shared_ptr<Data>& data() const;
 
   at::ScalarType dtype() const;
 
@@ -125,17 +129,15 @@ class TORCH_API LazyTensor : public c10::intrusive_ptr_target {
 
   void AssignIrValue(Value ir_value) const;
 
+ protected:
+  explicit LazyTensor(std::shared_ptr<Data> data);
+
+  void SetTensorData(at::Tensor tensor_data);
+
  private:
   LazyTensor(const at::Tensor& tensor, const BackendDevice& device);
   LazyTensor(Value ir_value, const BackendDevice& device);
   explicit LazyTensor(BackendDataPtr handle);
-  explicit LazyTensor(std::shared_ptr<Data> data);
-
-  std::shared_ptr<Data> data_ptr() const {
-    return data_;
-  }
-
-  void SetTensorData(at::Tensor tensor_data);
 
   Value CreateTensorNode(BackendDataPtr data, bool read_only) const;
 

From ba74555ae247e39d823c332eaf51227e74fc37ed Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Thu, 8 Dec 2022 18:29:06 +0000
Subject: [PATCH 1735/1922] Revert "[inductor] Use decomposition for _to_copy
 (#90314)"

This reverts commit 3fdb5f2dda7164f6282e80c39799843527d135e7.

Reverted https://github.com/pytorch/pytorch/pull/90314 on behalf of https://github.com/desertfire due to regresses performance on hf_Bert
---
 test/inductor/test_torchinductor_opinfo.py |  2 ++
 torch/_decomp/decompositions.py            |  3 +--
 torch/_inductor/decomposition.py           |  1 -
 torch/_inductor/lowering.py                | 31 +++++++++++++++++++---
 4 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py
index 1fff48673afd4..10e6cf1783ef0 100644
--- a/test/inductor/test_torchinductor_opinfo.py
+++ b/test/inductor/test_torchinductor_opinfo.py
@@ -238,6 +238,7 @@ def process(device_type):
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
     "tensor_split": {b8, f16, f32, f64, i32, i64},
+    "to": {b8, f16, f32, f64, i32, i64},
     "to_sparse": {f32, f64},
     "tril": {f16},
     "triu": {f16},
@@ -336,6 +337,7 @@ def process(device_type):
     "stft": {f32, f64},
     "svd_lowrank": {f32, f64},
     "tensor_split": {b8, f16, f32, f64, i32, i64},
+    "to": {b8, f16, f32, f64, i32, i64},
     "to_sparse": {f16, f32, f64},
     "uniform": {f16, f32, f64},
     "unique": {b8, f16, f32, f64, i32, i64},
diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py
index 0e8e440084da8..1a8335dc292a1 100644
--- a/torch/_decomp/decompositions.py
+++ b/torch/_decomp/decompositions.py
@@ -1523,8 +1523,7 @@ def _to_copy(
 ):
     assert not layout or layout == torch.strided, "TODO"
     assert not pin_memory, "TODO"
-    if device is None and dtype is None and memory_format is None:
-        return x.clone()
+    assert device is not None or dtype is not None or memory_format is not None
     dtype_converted = False
     if device is not None and device != x.get_device():
         # avoid conversions on cpu
diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py
index 0492974b933d5..f8fedcc786015 100644
--- a/torch/_inductor/decomposition.py
+++ b/torch/_inductor/decomposition.py
@@ -98,7 +98,6 @@
         aten.t,
         aten.tanh_backward,
         aten.threshold_backward,
-        aten._to_copy,
         aten.transpose.int,
         aten.tril.default,
         aten.unfold,
diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py
index b1c99ba23a133..45f1772e197ad 100644
--- a/torch/_inductor/lowering.py
+++ b/torch/_inductor/lowering.py
@@ -370,7 +370,6 @@ def _to_dtype(x):
     return make_pointwise(_to_dtype, override_return_dtype=dtype)(x)
 
 
-@register_lowering(prims.device_put, type_promotion_kind=None)
 def to_device(x: TensorBox, device: torch.device):
     device = decode_device(device)
     if x.get_device() == device:
@@ -378,6 +377,32 @@ def to_device(x: TensorBox, device: torch.device):
     return TensorBox.create(ir.DeviceCopy.create(x, device))
 
 
+@register_lowering(aten._to_copy)
+def _to_copy(
+    x,
+    *,
+    dtype=None,
+    layout=None,
+    device=None,
+    pin_memory=None,
+    non_blocking=False,
+    memory_format=None,
+):
+    assert not layout or layout == torch.strided, "TODO"
+    assert not pin_memory, "TODO"
+    assert not memory_format, "TODO"
+    if device:
+        device = decode_device(device)
+    if device is not None and device != x.get_device():
+        if dtype is not None and device.type == "cpu":
+            # CPU can do fewer type conversions
+            x = to_dtype(x, decode_dtype(dtype))
+        x = to_device(x, device)
+    if dtype is not None:
+        x = to_dtype(x, decode_dtype(dtype))
+    return x
+
+
 def ops_wrapper(name):
     assert isinstance(name, str)
 
@@ -1548,9 +1573,9 @@ def binary_search(start, end):
 def as_tensor(data, dtype=None, device=None):
     if isinstance(data, TensorBox):
         if dtype is not None:
-            data = to_dtype(data, dtype)
+            data = to(data, dtype)
         if device is not None:
-            data = to_device(data, device)
+            data = to(data, device)
         return data
     return tensor(data, dtype=dtype, device=device)
 

From 924cd1cc3c0bf43ff8a33deb3d51317c41407216 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxx@users.noreply.github.com>
Date: Thu, 8 Dec 2022 18:29:04 +0000
Subject: [PATCH 1736/1922] skip flaky tests (rather than expectedFailure)
 (#90233)

They are flaky but don't always fail. So `expectedFailure` is incorrect.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90233
Approved by: https://github.com/mruberry, https://github.com/soumith
---
 torch/testing/_internal/common_methods_invocations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index c01f476f17ed2..4966cf981110a 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -11896,7 +11896,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # and if there are several indices pointing to the same memory,
                # gradcheck is oblivious about that and cannot perturb them all at once
                # (see sample_inputs_max_unpool_grad to find out more).
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD',
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD',
                             active_if=(not IS_MACOS)),
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),
@@ -11934,7 +11934,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                # and if there are several indices pointing to the same memory,
                # gradcheck is oblivious about that and cannot perturb them all at once
                # (see sample_inputs_max_unpool_grad to find out more).
-               DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD',
+               DecorateInfo(unittest.skip("Skipped!"), 'TestFwdGradients', 'test_forward_mode_AD',
                             active_if=(not IS_MACOS)),
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_gradgrad'),
                DecorateInfo(unittest.skip("Skipped!"), 'TestBwdGradients', 'test_fn_grad'),

From a123579d3f933ef1301a0166f8d6e246b2372ef0 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Wed, 7 Dec 2022 12:33:01 -0800
Subject: [PATCH 1737/1922] Add feature flag for the autograd.Function
 extension (#89858)

This PR adds a private runtime feature flag for the feature work we're going
to do with extending autograd.Function. The motivation of the feature flag
is:
- to guard the feature against unsuspecting users
- control the release of the feature to when we are ready to release it

We might not even need the feature flag (because we hope to have the
work done in the next month), but it is good practice and it does touch
currently public API (autograd.Function).

Concretely, "autograd.Function extension" refers to:
- adding an optional `setup_context` staticmethod to autograd.Function
- adding an optional `vmap` staticmethod to autograd.Function
- autograd.Function support for functorch

Test Plan:
- new test that the feature flag works
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89858
Approved by: https://github.com/soulitzer
---
 test/test_autograd.py            | 14 ++++++++++++++
 torch/_C/__init__.pyi.in         |  2 ++
 torch/csrc/autograd/function.cpp | 12 ++++++++++++
 torch/csrc/autograd/function.h   |  9 +++++++++
 torch/csrc/autograd/init.cpp     | 32 ++++++++++++++++++++++++++++++++
 5 files changed, 69 insertions(+)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 0a4db6667fbed..fc8cf6cf6ef9f 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -981,6 +981,20 @@ def prehook(grad_output):
             self.assertEqual(pre_counter[0], 4)
             self.assertTrue(torch.allclose(a.grad, torch.ones(3, 3) * 2))
 
+    def test_autograd_function_extension_feature_flag(self):
+        try:
+            prev = torch._C._is_autograd_function_extension_enabled()
+
+            torch._C._set_autograd_function_extension_enabled(True)
+            state = torch._C._is_autograd_function_extension_enabled()
+            self.assertTrue(state)
+
+            torch._C._set_autograd_function_extension_enabled(False)
+            state = torch._C._is_autograd_function_extension_enabled()
+            self.assertFalse(state)
+        finally:
+            torch._C._set_autograd_function_extension_enabled(prev)
+
     def test_grad_fn_prehooks_multiple_outputs(self):
         # Compute gradients without hooks
         b = torch.rand(3, 3, requires_grad=True)
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 0f47074aed48e..be71a94296bf9 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -921,6 +921,8 @@ def autocast_increment_nesting() -> _int: ...
 def autocast_decrement_nesting() -> _int: ...
 def is_autocast_cache_enabled() -> _bool: ...
 def set_autocast_cache_enabled(enabled: _bool) -> None: ...
+def _set_autograd_function_extension_enabled(enabled: _bool) -> None: ...
+def _is_autograd_function_extension_enabled() -> _bool: ...
 def set_anomaly_enabled(enabled: _bool, check_nan: _bool = True) -> None: ...
 def is_anomaly_enabled() -> _bool: ...
 def is_anomaly_check_nan_enabled() -> _bool: ...
diff --git a/torch/csrc/autograd/function.cpp b/torch/csrc/autograd/function.cpp
index 5ab3447ca9ef3..22c67d0771d3e 100644
--- a/torch/csrc/autograd/function.cpp
+++ b/torch/csrc/autograd/function.cpp
@@ -99,5 +99,17 @@ void deleteNode(Node* function) {
   }
 }
 
+namespace {
+bool kAutogradFunctionExtensionEnabled = false;
+}
+
+bool isAutogradFunctionExtensionEnabled() {
+  return kAutogradFunctionExtensionEnabled;
+}
+
+void setAutogradFunctionExtensionEnabled(bool enabled) {
+  kAutogradFunctionExtensionEnabled = enabled;
+}
+
 } // namespace autograd
 } // namespace torch
diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h
index bb5f4b1eaad09..d27d473b3f805 100644
--- a/torch/csrc/autograd/function.h
+++ b/torch/csrc/autograd/function.h
@@ -55,6 +55,15 @@ class NodeGuard {
   std::shared_ptr<Node> last_evaluating_node_;
 };
 
+// Global (not thread-local) feature flag for the new autograd.Function
+// extension. The extension consists of:
+// - splitting autograd.Function.forward into forward() and setup_context().
+// - adding a vmap staticmethod to autograd.Function
+// The feature flag is for preventing users from unknowningly stumbling upon
+// the feature and will be removed once we've ironed out the details.
+TORCH_API bool isAutogradFunctionExtensionEnabled();
+TORCH_API void setAutogradFunctionExtensionEnabled(bool enabled);
+
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 //                               Node
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 6bfd4bd4bfed6..709cc46308f3c 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -515,6 +515,30 @@ static PyObject* set_autocast_cache_enabled(PyObject* _unused, PyObject* arg) {
   END_HANDLE_TH_ERRORS
 }
 
+static PyObject* is_autograd_function_extension_enabled(
+    PyObject* _unused,
+    PyObject* arg) {
+  HANDLE_TH_ERRORS
+  if (torch::autograd::isAutogradFunctionExtensionEnabled()) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject* set_autograd_function_extension_enabled(
+    PyObject* _unused,
+    PyObject* arg) {
+  HANDLE_TH_ERRORS
+  if (!PyBool_Check(arg)) {
+    throw TypeError("enabled must be a bool (got %s)", Py_TYPE(arg)->tp_name);
+  }
+  torch::autograd::setAutogradFunctionExtensionEnabled(arg == Py_True);
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
 static PyObject* set_grad_enabled(PyObject* _unused, PyObject* arg) {
   HANDLE_TH_ERRORS
   if (!PyBool_Check(arg)) {
@@ -753,6 +777,14 @@ static PyMethodDef methods[] = { // NOLINT
      METH_NOARGS,
      nullptr},
     {"set_autocast_cache_enabled", set_autocast_cache_enabled, METH_O, nullptr},
+    {"_set_autograd_function_extension_enabled",
+     set_autograd_function_extension_enabled,
+     METH_O,
+     nullptr},
+    {"_is_autograd_function_extension_enabled",
+     is_autograd_function_extension_enabled,
+     METH_NOARGS,
+     nullptr},
     {"set_anomaly_enabled",
      castPyCFunctionWithKeywords(set_anomaly_mode_enabled),
      METH_VARARGS | METH_KEYWORDS,

From 3cc6ba9c0a4e9908f868913008b8135c46b44f1a Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Wed, 7 Dec 2022 13:37:14 -0800
Subject: [PATCH 1738/1922] Add setup_context staticmethod to autograd.Function
 (#89859)

Adds a setup_context staticmethod to autograd.Function.
If it exists, then the user splits the ctx-specific logic from the
forward() and puts it in the setup_context staticmethod.

Docs will come later when we remove the feature flag.

Test Plan:
- some light tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89859
Approved by: https://github.com/soulitzer
---
 test/test_autograd.py                   | 88 ++++++++++++++++++++++++
 torch/autograd/function.py              | 12 ++++
 torch/csrc/autograd/python_function.cpp | 89 ++++++++++++++++++++-----
 3 files changed, 174 insertions(+), 15 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index fc8cf6cf6ef9f..74cf400aee6fc 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -544,6 +544,94 @@ def fn(x):
         with self.assertRaisesRegex(RuntimeError, "expects an grad_fn"):
             torch._C._will_engine_execute_node(out)
 
+    def test_custom_function_setup_context_simple(self):
+        class MySquare(Function):
+            @staticmethod
+            def forward(x):
+                return x ** 2
+
+            @staticmethod
+            def setup_context(ctx, inputs, outputs):
+                x, = inputs
+                ctx.save_for_backward(x)
+
+            @staticmethod
+            def backward(ctx, gO):
+                x, = ctx.saved_tensors
+                return gO * 2 * x
+
+        with torch.autograd.function._set_autograd_function_extension_enabled(True):
+            x = torch.randn([], requires_grad=True)
+            y = MySquare.apply(x)
+            gx, = torch.autograd.grad(y, x)
+            self.assertEqual(gx, 2 * x)
+
+    def test_custom_function_setup_context_multi_output(self):
+        # Multiple outputs with some non-Tensor outputs.
+        class MySquare(Function):
+            @staticmethod
+            def forward(x):
+                two_x = x.item() * 2
+                return x ** 2, two_x
+
+            @staticmethod
+            def setup_context(ctx, inputs, outputs):
+                x, = inputs
+                _, two_x = outputs
+                ctx.two_x = two_x
+
+            @staticmethod
+            @once_differentiable
+            def backward(ctx, gO, _):
+                return gO * ctx.two_x
+
+        with torch.autograd.function._set_autograd_function_extension_enabled(True):
+            x = torch.randn([], requires_grad=True)
+            y, _ = MySquare.apply(x)
+            gx, = torch.autograd.grad(y, x)
+            self.assertEqual(gx, 2 * x)
+
+    def test_custom_function_setup_context_multi_input(self):
+        class MyReshape(Function):
+            @staticmethod
+            def forward(x, shape, scale_forward, scale_backward):
+                return x.reshape(shape) * scale_forward
+
+            @staticmethod
+            def setup_context(ctx, inputs, outputs):
+                x, shape, scale_forward, scale_backward = inputs
+                ctx.scale_backward = scale_backward
+                ctx.x_shape = x.shape
+
+            @staticmethod
+            def backward(ctx, gO):
+                return gO.reshape(ctx.x_shape) * ctx.scale_backward, None, None, None
+
+        class MyReshapeRef(Function):
+            @staticmethod
+            def forward(ctx, x, shape, scale_forward, scale_backward):
+                ctx.scale_backward = scale_backward
+                ctx.x_shape = x.shape
+                return x.reshape(shape) * scale_forward
+
+            @staticmethod
+            def backward(ctx, gO):
+                return gO.reshape(ctx.x_shape) * ctx.scale_backward, None, None, None
+
+        def test(x, shape, scale_forward, scale_backward):
+            y = MyReshape.apply(x, shape, scale_forward, scale_backward).sum()
+            gx, = torch.autograd.grad(y, x)
+
+            y_expected = MyReshapeRef.apply(x, shape, scale_forward, scale_backward).sum()
+            gx_expected, = torch.autograd.grad(y_expected, x)
+
+            self.assertEqual(y_expected, y)
+            self.assertEqual(gx_expected, gx)
+
+        with torch.autograd.function._set_autograd_function_extension_enabled(True):
+            test(torch.randn(24, requires_grad=True), (3, 8), 7, 11)
+            test(torch.randn(2, 3, 4, requires_grad=True), (6, 4), -1, 2)
+
     def test_accumulate_grad(self):
         grad_output = torch.ones(5, 5)
 
diff --git a/torch/autograd/function.py b/torch/autograd/function.py
index 2fc95f72d7aa0..cc8a082b5884e 100644
--- a/torch/autograd/function.py
+++ b/torch/autograd/function.py
@@ -1,3 +1,4 @@
+import contextlib
 import torch
 import torch._C as _C
 from torch._C import _functions
@@ -468,6 +469,17 @@ def traceable(fn_cls):
     return fn_cls
 
 
+# Private feature flag. Not user-facing.
+@contextlib.contextmanager
+def _set_autograd_function_extension_enabled(enabled=True):
+    try:
+        prev_state = torch._C._is_autograd_function_extension_enabled()
+        torch._C._set_autograd_function_extension_enabled(enabled)
+        yield
+    finally:
+        torch._C._set_autograd_function_extension_enabled(prev_state)
+
+
 class InplaceFunction(Function):
 
     def __init__(self, inplace=False):
diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp
index c86bfbda2a7b0..28e840b728441 100644
--- a/torch/csrc/autograd/python_function.cpp
+++ b/torch/csrc/autograd/python_function.cpp
@@ -821,6 +821,43 @@ PyObject* THPFunction_maybe_clear_saved_tensors(
   END_HANDLE_TH_ERRORS
 }
 
+namespace {
+
+THPObjectPtr make_ctx_input_tuple(
+    THPFunction* ctx,
+    const UnpackedInput& unpacked_input,
+    int64_t num_args) {
+  THPObjectPtr ctx_input_tuple(PyTuple_New(num_args + 1));
+  if (!ctx_input_tuple)
+    return {};
+  Py_INCREF(ctx);
+  PyTuple_SET_ITEM(ctx_input_tuple.get(), 0, (PyObject*)ctx);
+  for (const auto i : c10::irange(num_args)) {
+    PyObject* arg = PyTuple_GET_ITEM(unpacked_input.input_tuple.get(), i);
+    Py_INCREF(arg);
+    PyTuple_SET_ITEM(ctx_input_tuple.get(), i + 1, arg);
+  }
+  return ctx_input_tuple;
+}
+
+THPObjectPtr make_ctx_input_output_tuple(
+    THPFunction* ctx,
+    UnpackedInput& unpacked_input,
+    PyObject* outputs) {
+  THPObjectPtr result(PyTuple_New(3));
+  if (!result)
+    return {};
+  Py_INCREF(ctx);
+  Py_INCREF(unpacked_input.input_tuple.get());
+  Py_INCREF(outputs);
+  PyTuple_SET_ITEM(result.get(), 0, (PyObject*)ctx);
+  PyTuple_SET_ITEM(result.get(), 1, unpacked_input.input_tuple.get());
+  PyTuple_SET_ITEM(result.get(), 2, outputs);
+  return result;
+}
+
+} // namespace
+
 PyObject* THPFunction_apply(PyObject* cls, PyObject* inputs) {
   HANDLE_TH_ERRORS
 
@@ -865,29 +902,51 @@ PyObject* THPFunction_apply(PyObject* cls, PyObject* inputs) {
   ctx->needs_input_grad = input_info.needs_input_grad.release();
   ctx->is_variable_input = std::move(input_info.is_variable_input);
 
-  // Prepend ctx to input_tuple, in preparation for static method call
+  // autograd.Function may optionally contain a setup_context staticmethod.
+  // In this case, autograd.Function.forward does NOT accept a ctx object.
+  bool has_separate_setup_context_fn =
+      (isAutogradFunctionExtensionEnabled() &&
+       PyObject_HasAttrString(cls, "setup_context"));
+
   auto num_args = PyTuple_GET_SIZE(inputs);
-  THPObjectPtr ctx_input_tuple(PyTuple_New(num_args + 1));
-  if (!ctx_input_tuple)
-    return nullptr;
-  Py_INCREF(ctx);
-  PyTuple_SET_ITEM(ctx_input_tuple.get(), 0, (PyObject*)ctx);
-  for (const auto i : c10::irange(num_args)) {
-    PyObject* arg = PyTuple_GET_ITEM(unpacked_input.input_tuple.get(), i);
-    Py_INCREF(arg);
-    PyTuple_SET_ITEM(ctx_input_tuple.get(), i + 1, arg);
-  }
 
   // Call forward
-  THPObjectPtr tensor_outputs;
+  THPObjectPtr outputs;
   {
     AutoGradMode grad_mode(false);
     at::AutoFwGradMode fw_grad_mode(false);
     THPObjectPtr forward_fn(PyObject_GetAttrString(cls, "forward"));
     if (!forward_fn)
       return nullptr;
-    tensor_outputs = PyObject_CallObject(forward_fn, ctx_input_tuple);
-    if (!tensor_outputs)
+    if (has_separate_setup_context_fn) {
+      // call forward followed by setup_context
+      outputs = PyObject_CallObject(forward_fn, unpacked_input.input_tuple);
+      if (!outputs) {
+        return nullptr;
+      }
+      // signature is setup_context(ctx, inputs, outputs)
+      auto ctx_input_output_tuple =
+          make_ctx_input_output_tuple(ctx, unpacked_input, outputs);
+      if (!ctx_input_output_tuple) {
+        return nullptr;
+      }
+      THPObjectPtr setup_context_fn(
+          PyObject_GetAttrString(cls, "setup_context"));
+      auto result =
+          PyObject_CallObject(setup_context_fn, ctx_input_output_tuple);
+      if (!result) {
+        return nullptr;
+      }
+    } else {
+      // call forward
+      auto ctx_input_tuple =
+          make_ctx_input_tuple(ctx, unpacked_input, num_args);
+      if (!ctx_input_tuple) {
+        return nullptr;
+      }
+      outputs = PyObject_CallObject(forward_fn, ctx_input_tuple);
+    }
+    if (!outputs)
       return nullptr;
   }
 
@@ -897,7 +956,7 @@ PyObject* THPFunction_apply(PyObject* cls, PyObject* inputs) {
       ctx,
       unpacked_input,
       inputs,
-      std::move(tensor_outputs),
+      std::move(outputs),
       is_executable,
       node);
   END_HANDLE_TH_ERRORS

From a478efc31271ac906a582acda201f2fcf060b5e9 Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Wed, 7 Dec 2022 14:14:56 -0800
Subject: [PATCH 1739/1922] functorch.grad support for autograd.Function
 (#89860)

Happy to split this PR more if it helps.

This PR adds functorch.grad support for autograd.Function. There's a lot
going on; here is the high level picture and there are more details as
comments in the code.

Mechanism (PyOperator)
- Somehow, autograd.Function needs to dispatch with functorch. This is
necessary because every layer of functorch needs to see the
autograd.Function; grad layers need to preserve the backward pass.
- The mechanism for this is via PyOperator. If functorch transforms are
active, then we wrap the autograd.Function in a `custom_function_call`
PyOperator where we are able to define various rules for functorch
transforms.
- `custom_function_call` has a rule for the functorch grad transform.

autograd.Function changes
- I needed to make some changes to autograd.Function to make this work.
- First, this PR splits autograd.Function into a _SingleLevelFunction
(that works with a single level of functorch transform) and
autograd.Function (which works with multiple levels). This is necessary
because functorch's grad rule needs some way of specifying a backward
pass for that level only.
- This PR changes autograd.Function's apply to eitehr call
`custom_function_call` (if functorch is active) or super().apply (if
functorch isn't active).

Testing
- Most of this PR is just testing. It creates an autograd.Function
OpInfo database that then gets passed to the functorch grad-based tests
(grad, vjp, vjpvjp).
- Since functorch transform tests are autogenerated from OpInfo tests,
this is the easiest way to test various autograd.Function with
functorch.

Future
- jvp and vmap support coming next
- better error message (functorch only supports autograd.Function that
have the optional setup_context staticmethod)
- documentation to come when we remove the feature flag

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89860
Approved by: https://github.com/soulitzer
---
 aten/src/ATen/functorch/ADInterpreters.cpp    |   9 +-
 aten/src/ATen/functorch/DynamicLayer.cpp      |   8 +
 aten/src/ATen/functorch/DynamicLayer.h        |   1 +
 test/functorch/common_utils.py                |   3 +-
 test/functorch/test_eager_transforms.py       | 164 +++++++++++-
 test/functorch/test_ops.py                    |  14 +-
 torch/_C/_functorch.pyi                       |   3 +
 torch/_dynamo/variables/builder.py            |   2 +-
 torch/_functorch/autograd_function.py         | 188 +++++++++++++
 torch/_functorch/utils.py                     |  10 +
 torch/autograd/function.py                    | 141 ++++++----
 torch/csrc/autograd/python_function.cpp       |   1 +
 torch/csrc/functorch/init.cpp                 |   2 +
 .../testing/_internal/autograd_function_db.py | 247 ++++++++++++++++++
 14 files changed, 715 insertions(+), 78 deletions(-)
 create mode 100644 torch/_functorch/autograd_function.py
 create mode 100644 torch/testing/_internal/autograd_function_db.py

diff --git a/aten/src/ATen/functorch/ADInterpreters.cpp b/aten/src/ATen/functorch/ADInterpreters.cpp
index 174949bbc3b48..fb97114bec504 100644
--- a/aten/src/ATen/functorch/ADInterpreters.cpp
+++ b/aten/src/ATen/functorch/ADInterpreters.cpp
@@ -44,12 +44,17 @@ Tensor materializeGradWrappers(const Tensor& tensor, int64_t current_level) {
   return makeTensorWrapper(tensor, current_level, /*is_immutable=*/true);
 }
 
+static Tensor base_lift(const Tensor& tensor, int64_t level) {
+  auto tensor_ = unwrapIfDead(tensor);
+  return materializeGradWrappers(tensor_, level);
+}
+
 Tensor GradInterpreterPtr::lift(const Tensor& tensor) const {
-  return materializeGradWrappers(tensor, level());
+  return base_lift(tensor, level());
 }
 
 Tensor JvpInterpreterPtr::lift(const Tensor& tensor) const {
-  return materializeGradWrappers(tensor, level());
+  return base_lift(tensor, level());
 }
 
 static void autogradBasedTransformProcess(
diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp
index d152f3c08c2d4..30fcc9e70bb25 100644
--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@@ -291,6 +291,14 @@ DynamicLayer popDynamicLayerAndDeleteMetadata() {
   return result;
 }
 
+bool isDeadTensorWrapper(const Tensor& tensor) {
+  auto* wrapped = maybeGetTensorWrapper(tensor);
+  if (!wrapped) {
+    return false;
+  }
+  return !wrapped->is_alive();
+}
+
 Tensor unwrapIfDead(const Tensor& tensor) {
   auto* wrapped = maybeGetTensorWrapper(tensor);
   if (!wrapped) {
diff --git a/aten/src/ATen/functorch/DynamicLayer.h b/aten/src/ATen/functorch/DynamicLayer.h
index 6c7139f5c01ea..90e9ae514f5be 100644
--- a/aten/src/ATen/functorch/DynamicLayer.h
+++ b/aten/src/ATen/functorch/DynamicLayer.h
@@ -108,6 +108,7 @@ TORCH_API bool isInplaceOp(const c10::FunctionSchema& schema);
 TORCH_API c10::optional<size_t> findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input);
 
 TORCH_API Tensor unwrapIfDead(const Tensor& tensor);
+TORCH_API bool isDeadTensorWrapper(const Tensor& tensor);
 
 // Pretty printers
 TORCH_API std::ostream& operator<<(std::ostream& os, const DynamicLayer& layer);
diff --git a/test/functorch/common_utils.py b/test/functorch/common_utils.py
index c082340d7882e..41607bd62297c 100644
--- a/test/functorch/common_utils.py
+++ b/test/functorch/common_utils.py
@@ -14,6 +14,7 @@
 import os
 import unittest
 from torch.testing._internal.common_device_type import toleranceOverride
+from torch.testing._internal.autograd_function_db import autograd_function_db
 from collections import namedtuple
 
 IS_FBCODE = os.getenv('FUNCTORCH_TEST_FBCODE') == '1'
@@ -351,7 +352,7 @@ def skip(op_name, variant_name='', *, device_type=None, dtypes=None):
 
 
 def skipOps(test_case_name, base_test_name, to_skip):
-    all_opinfos = op_db + additional_op_db
+    all_opinfos = op_db + additional_op_db + autograd_function_db
     for decorate_meta in to_skip:
         matching_opinfos = [o for o in all_opinfos
                             if o.name == decorate_meta.op_name and
diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py
index e9d0cbfb4f919..39f8f818e5975 100644
--- a/test/functorch/test_eager_transforms.py
+++ b/test/functorch/test_eager_transforms.py
@@ -39,6 +39,7 @@
 from functorch.experimental import functionalize
 from torch._ops import PyOperator
 from torch._functorch.utils import enable_autograd_function
+from torch.autograd.function import _set_autograd_function_extension_enabled
 
 # NB: numpy is a testing dependency!
 import numpy as np
@@ -978,6 +979,114 @@ def h(x):
         self.assertEqual(z, 2)
 
 
+class TestAutogradFunction(TestCase):
+    @_set_autograd_function_extension_enabled()
+    def test_set_materialize_grads(self, device):
+        class A(torch.autograd.Function):
+            @staticmethod
+            def forward(x, y):
+                return x, y
+
+            @staticmethod
+            def setup_context(ctx, inputs, outputs):
+                ctx.set_materialize_grads(False)
+
+            @staticmethod
+            def backward(ctx, gx, gy):
+                self.assertIsNotNone(gx)
+                self.assertIsNone(gy)
+                return gx, gy
+
+        def f(y, x):
+            x, y = A.apply(x, y)
+            return x ** 2
+
+        x = torch.tensor(2., device=device)
+        y = torch.tensor(3., device=device)
+        # grad differentiates w.r.t. arg 0 by default
+        grad(f)(y, x)
+        grad(grad(f))(y, x)
+
+    @_set_autograd_function_extension_enabled()
+    def test_needs_input_grads(self, device):
+        class A(torch.autograd.Function):
+            @staticmethod
+            def forward(x, y):
+                return x * y
+
+            @staticmethod
+            def setup_context(ctx, inputs, outputs):
+                return
+
+            @staticmethod
+            def backward(ctx, grad_output):
+                self.assertTrue(ctx.needs_input_grad[0])
+                self.assertFalse(ctx.needs_input_grad[1])
+                return None, None
+
+        x = torch.tensor(2., device=device)
+        y = torch.tensor(3., device=device)
+        # grad differentiates w.r.t. arg 0 by default
+        grad(A.apply)(x, y)
+        grad(grad(A.apply))(x, y)
+
+    def _get_NumpyCubeNotComposable(self):
+        class NumpyCubeNotComposable(torch.autograd.Function):
+            @staticmethod
+            def forward(input):
+                input_np = input.cpu().numpy()
+                return torch.tensor(input_np ** 3, device=input.device), input_np
+
+            @staticmethod
+            def setup_context(ctx, inputs, outputs):
+                ctx.input_np = outputs[1]
+                ctx.device = inputs[0].device
+
+            @staticmethod
+            @torch.autograd.function.once_differentiable
+            def backward(ctx, grad_output, grad_saved):
+                result_np = 3 * (ctx.input_np ** 2)
+                return torch.tensor(result_np, device=ctx.device)
+
+        return NumpyCubeNotComposable
+
+    @_set_autograd_function_extension_enabled()
+    def test_once_differentiable_autograd_vjp(self, device):
+        NumpyCubeNotComposable = self._get_NumpyCubeNotComposable()
+
+        def f(x):
+            y, _ = NumpyCubeNotComposable.apply(x)
+            return y
+
+        # regular autograd x vjp
+        x = torch.randn([], requires_grad=True, device=device)
+        grad_y = torch.randn_like(x, requires_grad=True)
+        _, vjp_fn = vjp(f, x)
+        gx, = vjp_fn(grad_y)
+
+        with self.assertRaisesRegex(RuntimeError, "marked with @once_differentiable"):
+            gx.backward()
+
+    # TODO: support torch.autograd.function.once_differentiable
+    # (or, if impossible, figure out how to raise a nice error)
+    # https://github.com/pytorch/pytorch/issues/90224
+    @unittest.expectedFailure
+    @_set_autograd_function_extension_enabled()
+    def test_once_differentiable_grad_vjp(self, device):
+        NumpyCubeNotComposable = self._get_NumpyCubeNotComposable()
+
+        # grad x vjp
+        x = torch.randn([], device=device)
+        grad_y = torch.randn_like(x)
+
+        def h(x, grad_y):
+            _, vjp_fn = vjp(f, x)
+            gx, = vjp_fn(grad_y)
+            return gx
+
+        grad(h, argnums=(0, 1))(x, grad_y)
+
+
 class TestVmapOfGrad(TestCase):
     def test_per_sample_grads_inplace_view(self, device):
         def compute_loss(weight, x, t):
@@ -2404,20 +2513,42 @@ def backward(ctx, gy):
 
         x = torch.randn([])
 
-        # by default, autograd.Function is disabled in a functorch transform
-        with self.assertRaisesRegex(RuntimeError, "autograd.Function"):
-            grad(MySin.apply)(x)
+        with torch.autograd.function._set_autograd_function_extension_enabled(False):
+            # by default, autograd.Function is disabled in a functorch transform
+            with self.assertRaisesRegex(RuntimeError, "autograd.Function"):
+                grad(MySin.apply)(x)
+
+            # we have a debug switch to allow it
+            self.assertFalse(torch._C._functorch.get_autograd_function_allowed())
+            try:
+                torch._C._functorch.set_autograd_function_allowed(True)
+                self.assertTrue(torch._C._functorch.get_autograd_function_allowed())
+                y = grad(MySin.apply)(x)
+            finally:
+                torch._C._functorch.set_autograd_function_allowed(False)
+            self.assertFalse(torch._C._functorch.get_autograd_function_allowed())
+            self.assertEqual(y, x.cos())
+
+    @_set_autograd_function_extension_enabled()
+    @parametrize('transform', [
+        'vmap', 'grad', 'jacrev', 'jacfwd', 'grad_and_value', 'hessian', 'functionalize'
+    ])
+    def test_autograd_function_no_setup_context(self, device, transform):
+        class MySin(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, x):
+                ctx.save_for_backward(x)
+                return x.sin()
+
+            @staticmethod
+            def backward(ctx, gy):
+                x, = ctx.saved_tensors
+                return gy * x.cos()
 
-        # we have a debug switch to allow it
-        self.assertFalse(torch._C._functorch.get_autograd_function_allowed())
-        try:
-            torch._C._functorch.set_autograd_function_allowed(True)
-            self.assertTrue(torch._C._functorch.get_autograd_function_allowed())
-            y = grad(MySin.apply)(x)
-        finally:
-            torch._C._functorch.set_autograd_function_allowed(False)
-        self.assertFalse(torch._C._functorch.get_autograd_function_allowed())
-        self.assertEqual(y, x.cos())
+        x = torch.randn(3, device=device)
+        transform = getattr(functorch, transform)
+        with self.assertRaisesRegex(RuntimeError, 'must have a setup_context'):
+            transform(MySin.apply)(x)
 
     @parametrize('transform', [
         'vmap', 'grad', 'jacrev', 'jacfwd', 'grad_and_value', 'hessian', 'functionalize'
@@ -3568,7 +3699,7 @@ def mysum_batch_rule(interpreter, x, dim):
     def mysum_grad_rule(interpreter, x, dim):
         level = interpreter.level()
 
-        class MySum(torch.autograd.Function):
+        class MySum(torch.autograd.function._SingleLevelFunction):
             @staticmethod
             def forward(ctx, x, dim):
                 ctx.x_shape = x.shape
@@ -3714,6 +3845,11 @@ def f(x):
     globals(),
     only_for=only_for,
 )
+instantiate_device_type_tests(
+    TestAutogradFunction,
+    globals(),
+    only_for=only_for,
+)
 instantiate_parametrized_tests(
     TestMakeFunctional,
 )
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 7a2f5b8dc61a1..75721a4e9f759 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -36,6 +36,10 @@
     is_valid_inplace_sample_input,
     loop2,
 )
+from torch.testing._internal.autograd_function_db import (
+    autograd_function_db
+)
+from torch.autograd.function import _set_autograd_function_extension_enabled
 
 from torch.testing._internal.opinfo.core import SampleInput
 from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map
@@ -291,6 +295,7 @@ def is_inplace(op, variant):
 
 vjp_fail = {
     xfail('tensor_split'),  # data_ptr composite compliance
+    xfail('NumpyExpMarkDirtyAutogradFunction'),  # https://github.com/pytorch/pytorch/issues/90225
 }
 
 aliasing_ops = {
@@ -338,8 +343,9 @@ def is_inplace(op, variant):
 
 @unittest.skipIf(TEST_WITH_ASAN, "tests time out with asan, are probably redundant")
 class TestOperators(TestCase):
+    @_set_autograd_function_extension_enabled()
     @with_tf32_off  # https://github.com/pytorch/pytorch/issues/86798
-    @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
+    @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_grad', vjp_fail.union({
         xfail('linalg.eig'),  # diagonal_scatter does not support complex
         xfail('chalf', '', device_type='cpu'),  # RuntimeError: "sum_cpu" not implemented for 'ComplexHalf'
@@ -510,7 +516,8 @@ def maybe_clone_inputs():
         self.assertEqual(noncontig_primal_outs, expected_primal_outs)
         self.assertEqual(noncontig_tangent_outs, expected_tangent_outs)
 
-    @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
+    @_set_autograd_function_extension_enabled()
+    @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vjp', vjp_fail.union({
         xfail('sparse.sampled_addmm', ''),
 
@@ -585,7 +592,8 @@ def f(inp, *args, **kwargs):
                 return op.inplace_variant(inp.clone(), *args, **kwargs)
             _test(f, inplace=True)
 
-    @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
+    @_set_autograd_function_extension_enabled()
+    @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vjpvjp', vjp_fail.union({
         skip('nn.functional.max_unpool1d'),  # silent incorrectness; Flaky
         skip('nn.functional.max_unpool2d'),  # silent incorrectness; Flaky
diff --git a/torch/_C/_functorch.pyi b/torch/_C/_functorch.pyi
index bb9649daadcbb..d07c39d9413ac 100644
--- a/torch/_C/_functorch.pyi
+++ b/torch/_C/_functorch.pyi
@@ -11,6 +11,9 @@ def is_functorch_wrapped_tensor(tensor: Tensor) -> bool: ...
 def is_gradtrackingtensor(tensor: Tensor) -> bool: ...
 def maybe_get_bdim(tensor: Tensor) -> int: ...
 def maybe_get_level(tensor: Tensor) -> int: ...
+def unwrap_if_dead(tensor: Tensor) -> Tensor: ...
+def _unwrap_for_grad(tensor: Tensor, level: int) -> Tensor: ...
+def _wrap_for_grad(tensor: Tensor, level: int) -> Tensor: ...
 
 def set_autograd_function_allowed(allowed: bool) -> None: ...
 def get_autograd_function_allowed() -> bool: ...
diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py
index f7c134472a11d..90fdaa143a66c 100644
--- a/torch/_dynamo/variables/builder.py
+++ b/torch/_dynamo/variables/builder.py
@@ -455,7 +455,7 @@ def index_source(key):
                 value, guards=make_guards(GuardBuilder.FUNCTION_MATCH)
             )
         elif (
-            isinstance(value, types.BuiltinFunctionType)
+            isinstance(value, types.MethodType)
             and type(getattr(value, "__self__", None))
             is torch.autograd.function.FunctionMeta
             and getattr(value, "__name__", "") == "apply"
diff --git a/torch/_functorch/autograd_function.py b/torch/_functorch/autograd_function.py
new file mode 100644
index 0000000000000..d80a826e7ff51
--- /dev/null
+++ b/torch/_functorch/autograd_function.py
@@ -0,0 +1,188 @@
+import torch
+from torch._ops import PyOperator
+from torch._C._functorch import TransformType
+from torch._functorch.utils import enable_autograd_function
+from torch.autograd.function import _SingleLevelFunction
+import torch.utils._pytree as pytree
+from torch._C._functorch import (
+    _wrap_for_grad,
+    _unwrap_for_grad,
+)
+
+# autograd.Function technically runs before the regular PyTorch dispatcher.
+# This is how features like autocast and torch_dispatch (e.g. PythonTLSSnapshot)
+# work with it. One day we might decide to change this, but until then,
+# we need to give the illusion that autograd.Function runs before those things.
+#
+# We do this by using creating a custom PyOperator that only functorch
+# dispatches specially.
+class CustomFunctionPyOperator(PyOperator):
+    def __init__(self):
+        super().__init__('custom_function_call')
+
+    def __call__(self, *args, **kwargs):
+        # When custom_function_call is done dispatching through functorch,
+        # it should just invoke the autograd.Function. This is consistent
+        # with the autograd.Function behavior of being invoked before the
+        # PyTorch dispatcher.
+        #
+        # This will lead us into trouble later down the line, but this is
+        # pre-existing. There is an invariant that a function traced by
+        # make_fx should have the same behavior when provided the same
+        # Tensor. However, make_fx sees autograd.Function as a composite
+        # (because autograd.Function happens before the Python dispatch key)
+        # and only traces the forward pass.
+        if torch._C._are_functorch_transforms_active():
+            return super().__call__(*args, **kwargs)
+        autograd_function = args[0]
+        return autograd_function.apply(*args[1:], **kwargs)
+
+
+# "custom_function_call"
+# This is the mechanism for an autograd.Function that works with functorch transforms.
+# It wraps an autograd.Function; interactions with functorch transforms are defined
+# via PyDispatcher and PyOperator rather than through the traditional PyTorch
+# dispatcher.
+custom_function_call = CustomFunctionPyOperator()
+
+
+# The grad rule for custom_function_call is to construct a new _SingleLevelFunction
+# (autograd.Function that only works with a single layer (level) of functorch) that:
+# - unwraps the inputs
+# - redispatches to custom_function_call
+# - wraps the outputs
+# and whose backward pass calls the original autograd.Function's backward.
+#
+# Why do we need to redispatch to custom_function_call?
+# -----------------------------------------------------
+# This is consistent with how ATen operators work with functorch's grad transform:
+# they always redispatch to the original operator.
+# Consider torch.sin, and let's say we do grad0(grad1(torch.sin))(x)
+#
+# grad1 will:
+# - set up the autograd graph
+# - unwrap the inputs
+# - redispatch to at::sin (*)
+# - rewrap the outputs on the return
+#
+# On the redispatch in (*), grad0 will:
+# - set up the autograd graph
+# - unwrap the inputs
+# - redispatch to at::sin
+# - rewrap the outputs on the return
+#
+# To "set up the autograd graph", we generate a _SingleLevelFunction
+# and apply it.
+@custom_function_call.py_impl(TransformType.Grad)
+def custom_function_call_grad(interpreter, autograd_function, *operands):
+    maybe_interpreter = interpreter
+    level = maybe_interpreter.level()
+
+    # TODO: The name of the grad_fn is GeneratedBackward. This isn't a great UX,
+    # but in theory functorch users shouldn't be peeking at the grad_fn.
+    # We should try to generate a better name for this.
+    # https://github.com/pytorch/pytorch/issues/90224
+    class Generated(_SingleLevelFunction):
+        @staticmethod
+        def forward(*operands):
+            unwrapped_operands = pytree.tree_map_only(
+                torch.Tensor,
+                lambda x: _unwrap_for_grad(x, level),
+                operands)
+            with torch.enable_grad(), maybe_interpreter.lower():
+                output = custom_function_call(autograd_function, *unwrapped_operands)
+
+            return pytree.tree_map_only(
+                torch.Tensor,
+                lambda x: _wrap_for_grad(x, level),
+                output)
+
+        @staticmethod
+        def setup_context(ctx, outputs, *operands):
+            ctx.mark_dirty = mark_dirty_error
+            return autograd_function.setup_context(ctx, outputs, *operands)
+
+        @staticmethod
+        def backward(ctx, *grads):
+            result = autograd_function.backward(ctx, *grads)
+            return result
+
+    with enable_autograd_function():
+        flat_out = Generated.apply(*operands)
+    return flat_out
+
+
+# https://github.com/pytorch/pytorch/issues/90225
+# If an input was marked as dirty, and the autograd.Function returns the input
+# from the forward, then the grad rule for custom_function_call must also
+# return the corresponding input from the forward() of the Generated autograd.Function
+#
+# We haven't figured out how to do this yet. One possibility is to rely
+# on if the return from the redispatched custom_function_call in Generated.forward
+# has the same object id as one of the inputs,
+# but https://github.com/pytorch/pytorch/issues/90209 means we cannot rely on
+# that property.
+def mark_dirty_error(*args, **kwargs):
+    raise RuntimeError(
+        'NYI: we do not yet support ctx.mark_dirty with functorch transforms. '
+        'Please try to avoid modifying inputs to the autograd.Function in-place '
+        'by using out-of-place operations or by cloning the inputs. '
+        'Please see https://github.com/pytorch/pytorch/issues/90209 for more details'
+    )
+
+
+# NOTE: [functorch vjp and autograd interaction]
+# There's an edge case with the functorch vjp and autograd interaction
+# that will eventually be fixed by mode-only functorch.
+# The TL;DR is that there's no way to unwrap a dead GradTensorWrapper,
+# so we (the framework) need to do it manually. Regular PyTorch operators
+# automatically do so this is consisent.
+#
+# class MyExp(torch.autograd.Function):
+#     @staticmethod
+#     def forward(x):
+#         return x.exp()
+#
+#     @staticmethod
+#     def setup_context(ctx, outputs, x):
+#         y = outputs
+#         ctx.save_for_backward(y)
+#
+#     @staticmethod
+#     def backward(gy):
+#         y, = ctx.saved_tensors()
+#         return MyMul.apply(gy, y)
+#
+# x = torch.randn([], requires_grad=True)
+# gy = torch.randn([], requires_grad=True)
+# _, vjp_fn = vjp(MySin.apply, x)
+# result = vjp_fn(gy)
+#
+# MyMul is an autograd.Function that is not shown here.
+# It saves a `y` for backward (since gy requires grad).
+#
+# in vjp_fn(gy), we get:
+# > MyMul.apply(gy, GradTensorWrapper(y, level=dead))
+# Because the y that is saved for backward by MyExp is a GradTensorWrapper
+# but is now dead since we are outside the vjp context.
+#
+# PyTorch dispatcher operations, upon seeing a dead GradTensorWrapper,
+# will automatically unwrap the GradTensorWrapper when applied.
+# But since autograd.Function technically sits above the regular PyTorch
+# dispatcher, it doesn't get this treatment. So we manually do
+# the unwrapping to be consistent with regular PyTorch dispatcher operations.
+
+
+@custom_function_call.py_impl(TransformType.Vmap)
+def custom_function_call_vmap(interpreter, autograd_function, *operands):
+    raise RuntimeError("NYI: vmap rule for custom_function_call")
+
+
+@custom_function_call.py_impl(TransformType.Jvp)
+def custom_function_call_jvp(interpreter, autograd_function, *operands):
+    raise RuntimeError("NYI: jvp rule for custom_function_call")
+
+
+@custom_function_call.py_impl(TransformType.Functionalize)
+def custom_function_call_functionalize(interpreter, autograd_function, *operands):
+    raise RuntimeError("NYI: Functionalize rule for custom_function_call")
diff --git a/torch/_functorch/utils.py b/torch/_functorch/utils.py
index c1474ba90fe3e..2e98c4ba8fd1d 100644
--- a/torch/_functorch/utils.py
+++ b/torch/_functorch/utils.py
@@ -1,7 +1,9 @@
 import contextlib
+import torch
 from torch._C._functorch import (
     set_autograd_function_allowed,
     get_autograd_function_allowed,
+    unwrap_if_dead,
 )
 
 @contextlib.contextmanager
@@ -12,3 +14,11 @@ def enable_autograd_function():
         yield
     finally:
         set_autograd_function_allowed(prev_state)
+
+def unwrap_dead_wrappers(args):
+    # NB: doesn't use tree_map_only for performance reasons
+    result = tuple(
+        unwrap_if_dead(arg) if isinstance(arg, torch.Tensor) else arg
+        for arg in args
+    )
+    return result
diff --git a/torch/autograd/function.py b/torch/autograd/function.py
index cc8a082b5884e..386fc235592d3 100644
--- a/torch/autograd/function.py
+++ b/torch/autograd/function.py
@@ -1,9 +1,10 @@
-import contextlib
 import torch
 import torch._C as _C
 from torch._C import _functions
+import torch._functorch as _functorch
 import torch.utils.hooks as hooks
 from torch._six import with_metaclass
+from torch.autograd.grad_mode import _DecoratorContextManager
 import functools
 import warnings
 from collections import OrderedDict
@@ -291,54 +292,7 @@ def __init__(cls, name, bases, attrs):
 
 
 # mypy doesn't understand `with_metaclass` from torch._six
-class Function(with_metaclass(FunctionMeta, _C._FunctionBase, FunctionCtx, _HookMixin)):  # type: ignore[misc]
-    r"""Base class to create custom `autograd.Function`
-
-    To create a custom `autograd.Function`, subclass this class and implement
-    the :meth:`forward` and :meth:`backward` static methods. Then, to use your custom
-    op in the forward pass, call the class method ``apply``. Do not call
-    :meth:`forward` directly.
-
-    To ensure correctness and best performance, make sure you are calling the
-    correct methods on ``ctx`` and validating your backward function using
-    :func:`torch.autograd.gradcheck`.
-
-    See :ref:`extending-autograd` for more details on how to use this class.
-
-    Examples::
-
-        >>> class Exp(Function):
-        >>>     @staticmethod
-        >>>     def forward(ctx, i):
-        >>>         result = i.exp()
-        >>>         ctx.save_for_backward(result)
-        >>>         return result
-        >>>
-        >>>     @staticmethod
-        >>>     def backward(ctx, grad_output):
-        >>>         result, = ctx.saved_tensors
-        >>>         return grad_output * result
-        >>>
-        >>> # Use it by calling the apply method:
-        >>> # xdoctest: +SKIP
-        >>> output = Exp.apply(input)
-    """
-    def __init__(self, *args, **kwargs):
-        cls = self.__class__
-        warnings.warn(f"{cls} should not be instantiated. Methods on autograd functions"
-                      "are all static, so you should invoke them on the class itself. "
-                      "Instantiating an autograd function will raise an "
-                      "error in a future version of PyTorch.", DeprecationWarning)
-
-    def __call__(self, *args, **kwargs):
-        raise RuntimeError(
-            "Legacy autograd function with non-static forward method is deprecated. "
-            "Please use new-style autograd function with static forward method. "
-            "(Example: https://pytorch.org/docs/stable/autograd.html#torch.autograd.Function)")
-
-    # for the tracer
-    is_traceable = False
-
+class _SingleLevelFunction(with_metaclass(FunctionMeta, _C._FunctionBase, FunctionCtx, _HookMixin)):  # type: ignore[misc]
     @staticmethod
     def forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:
         r"""Performs the operation.
@@ -410,6 +364,77 @@ def jvp(ctx: Any, *grad_inputs: Any) -> Any:
         raise NotImplementedError("You must implement the jvp function for custom "
                                   "autograd.Function to use it with forward mode AD.")
 
+
+class Function(_SingleLevelFunction):
+    r"""Base class to create custom `autograd.Function`
+
+    To create a custom `autograd.Function`, subclass this class and implement
+    the :meth:`forward` and :meth:`backward` static methods. Then, to use your custom
+    op in the forward pass, call the class method ``apply``. Do not call
+    :meth:`forward` directly.
+
+    To ensure correctness and best performance, make sure you are calling the
+    correct methods on ``ctx`` and validating your backward function using
+    :func:`torch.autograd.gradcheck`.
+
+    See :ref:`extending-autograd` for more details on how to use this class.
+
+    Examples::
+
+        >>> class Exp(Function):
+        >>>     @staticmethod
+        >>>     def forward(ctx, i):
+        >>>         result = i.exp()
+        >>>         ctx.save_for_backward(result)
+        >>>         return result
+        >>>
+        >>>     @staticmethod
+        >>>     def backward(ctx, grad_output):
+        >>>         result, = ctx.saved_tensors
+        >>>         return grad_output * result
+        >>>
+        >>> # Use it by calling the apply method:
+        >>> # xdoctest: +SKIP
+        >>> output = Exp.apply(input)
+    """
+    def __init__(self, *args, **kwargs):
+        cls = self.__class__
+        warnings.warn(f"{cls} should not be instantiated. Methods on autograd functions"
+                      "are all static, so you should invoke them on the class itself. "
+                      "Instantiating an autograd function will raise an "
+                      "error in a future version of PyTorch.", DeprecationWarning)
+
+    def __call__(self, *args, **kwargs):
+        raise RuntimeError(
+            "Legacy autograd function with non-static forward method is deprecated. "
+            "Please use new-style autograd function with static forward method. "
+            "(Example: https://pytorch.org/docs/stable/autograd.html#torch.autograd.Function)")
+
+    # for the tracer
+    is_traceable = False
+
+    @classmethod
+    def apply(cls, *args, **kwargs):
+        if not torch._C._is_autograd_function_extension_enabled():
+            return super().apply(*args, **kwargs)
+
+        # TODO: fix circular import
+        # https://github.com/pytorch/pytorch/issues/90224
+        from torch._functorch.autograd_function import custom_function_call
+        if not torch._C._are_functorch_transforms_active():
+            # See NOTE: [functorch vjp and autograd interaction]
+            args = _functorch.utils.unwrap_dead_wrappers(args)
+            return super().apply(*args, **kwargs)
+
+        if not hasattr(cls, 'setup_context'):
+            # TODO: link documentation in error message
+            # https://github.com/pytorch/pytorch/issues/90224
+            raise RuntimeError(
+                'In order to use an autograd.Function with functorch transforms ',
+                '(vmap, grad, jvp, jacrev, ...), it must have a setup_context ',
+                'staticmethod.')
+        return custom_function_call(cls, *args, **kwargs)
+
 def once_differentiable(fn):
 
     @functools.wraps(fn)
@@ -470,14 +495,16 @@ def traceable(fn_cls):
 
 
 # Private feature flag. Not user-facing.
-@contextlib.contextmanager
-def _set_autograd_function_extension_enabled(enabled=True):
-    try:
-        prev_state = torch._C._is_autograd_function_extension_enabled()
-        torch._C._set_autograd_function_extension_enabled(enabled)
-        yield
-    finally:
-        torch._C._set_autograd_function_extension_enabled(prev_state)
+class _set_autograd_function_extension_enabled(_DecoratorContextManager):
+    def __init__(self, enabled=True):
+        self.enabled = enabled
+
+    def __enter__(self):
+        self.prev_state = torch._C._is_autograd_function_extension_enabled()
+        torch._C._set_autograd_function_extension_enabled(self.enabled)
+
+    def __exit__(self, *args, **kwargs):
+        torch._C._set_autograd_function_extension_enabled(self.prev_state)
 
 
 class InplaceFunction(Function):
diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp
index 28e840b728441..a66897f7f0095 100644
--- a/torch/csrc/autograd/python_function.cpp
+++ b/torch/csrc/autograd/python_function.cpp
@@ -9,6 +9,7 @@
 #include <torch/csrc/utils/pybind.h>
 
 #include <ATen/FuncTorchTLS.h>
+#include <ATen/functorch/DynamicLayer.h>
 #include <torch/csrc/DynamicTypes.h>
 #include <torch/csrc/Exceptions.h>
 #include <torch/csrc/THP.h>
diff --git a/torch/csrc/functorch/init.cpp b/torch/csrc/functorch/init.cpp
index 65a3b3415b7e2..8064293016faf 100644
--- a/torch/csrc/functorch/init.cpp
+++ b/torch/csrc/functorch/init.cpp
@@ -445,6 +445,8 @@ void initFuncTorchBindings(PyObject* module) {
   m.def(
       "get_autograd_function_allowed",
       &at::functorch::getAutogradFunctionAllowed);
+  m.def("unwrap_if_dead", &unwrapIfDead);
+  m.def("is_dead_tensor_wrapper", &isDeadTensorWrapper);
   m.def("dlevel", &dlevel, "dlevel");
   m.def("dump_tensor", &dump_tensor, "dump_tensor");
   m.def("reshape_dim_into", &at::functorch::reshape_dim_into);
diff --git a/torch/testing/_internal/autograd_function_db.py b/torch/testing/_internal/autograd_function_db.py
new file mode 100644
index 0000000000000..533b7ff277a59
--- /dev/null
+++ b/torch/testing/_internal/autograd_function_db.py
@@ -0,0 +1,247 @@
+import torch
+from functools import partial
+from torch.testing import make_tensor
+from torch.testing._internal.opinfo.core import (
+    OpInfo,
+    SampleInput,
+)
+from torch.testing._internal.common_dtype import all_types_and
+import numpy as np
+
+# Note: [autograd.Function db]
+#
+# This is a collection of autograd.Function test cases written as OpInfos
+# so they can easily be consumed by OpInfo-based tests to check if a subsystem
+# supports autograd.Function.
+#
+# Axes:
+# - saves {output, input, intermediate, non-tensor}
+# - {inputs, outputs} x {single tensor, tensors, arbitrary objects}
+# - Uses {mark_dirty, mark_non_differentiable, once_differentiable}
+
+
+def to_numpy(tensor):
+    return tensor.cpu().numpy()
+
+
+class NumpyCube(torch.autograd.Function):
+    @staticmethod
+    def forward(input):
+        input_np = to_numpy(input)
+        dinput = torch.tensor(3 * input_np ** 2, device=input.device)
+        return torch.tensor(input_np ** 3, device=input.device), dinput
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        ctx.save_for_backward(inputs[0], outputs[1])
+
+    @staticmethod
+    def backward(ctx, grad_output, grad_saved):
+        input, dinput = ctx.saved_tensors
+        return NumpyMul.apply(grad_output, dinput) + 6 * NumpyMul.apply(grad_saved, input)
+
+def sample_inputs_numpy_cube(opinfo, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+    yield SampleInput(make_arg(1, low=0.8, high=2), args=())
+
+
+class NumpyCubeNotComposable(torch.autograd.Function):
+    @staticmethod
+    def forward(input):
+        input_np = to_numpy(input)
+        return torch.tensor(input_np ** 3, device=input.device), input_np
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        _, input_np = outputs
+        ctx.input_np = input_np
+        ctx.device = inputs[0].device
+
+    @staticmethod
+    @torch.autograd.function.once_differentiable
+    def backward(ctx, grad_output, grad_saved):
+        result_np = 3 * (ctx.input_np ** 2)
+        return torch.tensor(result_np, device=ctx.device)
+
+
+class NumpyMul(torch.autograd.Function):
+    @staticmethod
+    def forward(x, y):
+        return torch.tensor(to_numpy(x) * to_numpy(y), device=x.device)
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        ctx.save_for_backward(*inputs)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        x, y = ctx.saved_tensors
+        gx = None
+        if ctx.needs_input_grad[0]:
+            gx = NumpyMul.apply(grad_output, y)
+        gy = None
+        if ctx.needs_input_grad[1]:
+            gy = NumpyMul.apply(grad_output, x)
+        return gx, gy
+
+
+def sample_inputs_numpy_mul(opinfo, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+    # Broadcasting
+    yield SampleInput(make_arg(2, low=0.9, high=2), args=(make_arg(3, 2, low=0.9, high=2),))
+
+
+class NumpyExp_(torch.autograd.Function):
+    @staticmethod
+    def forward(x):
+        x_np = to_numpy(x)
+        np.exp(x_np, x_np)
+        return x
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        x, = inputs
+        ctx.mark_dirty(x)
+        ctx.save_for_backward(outputs)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        output, = ctx.saved_tensors
+        return NumpyMul.apply(grad_output, output)
+
+
+class NumpySort(torch.autograd.Function):
+    @staticmethod
+    def forward(x, dim):
+        device = x.device
+        x = to_numpy(x)
+        ind = np.argsort(x, axis=dim)
+        ind_inv = np.argsort(ind, axis=dim)
+        result = np.take_along_axis(x, ind, axis=dim)
+        return (
+            torch.tensor(x, device=device),
+            torch.tensor(ind, device=device),
+            torch.tensor(ind_inv, device=device),
+        )
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        x, dim = inputs
+        _, ind, ind_inv = outputs
+        ctx.mark_non_differentiable(ind, ind_inv)
+        ctx.save_for_backward(ind, ind_inv)
+        ctx.dim = dim
+
+    @staticmethod
+    def backward(ctx, grad_output, _0, _1):
+        ind, ind_inv = ctx.saved_tensors
+        return NumpyTake.apply(grad_output, ind_inv, ind, ctx.dim), None
+
+
+def sample_inputs_numpy_sort(opinfo, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+    yield SampleInput(make_arg(3, 5), args=(1,))
+
+
+class NumpyTake(torch.autograd.Function):
+    @staticmethod
+    def forward(x, ind, ind_inv, dim):
+        device = x.device
+        x = to_numpy(x)
+        ind = to_numpy(ind)
+        return torch.tensor(np.take_along_axis(x, ind, dim), device=device)
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        x, ind, ind_inv, dim = inputs
+        ctx.save_for_backward(ind, ind_inv)
+        ctx.dim = dim
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        ind, ind_inv = ctx.saved_tensors
+        result = NumpyTake.apply(grad_output, ind_inv, ind, ctx.dim)
+        return result, None, None, None
+
+
+class Select(torch.autograd.Function):
+    @staticmethod
+    def forward(x, idx):
+        return x[idx]
+
+    @staticmethod
+    def setup_context(ctx, inputs, outputs):
+        x, idx = inputs
+        ctx.x_shape = x.shape
+        ctx.idx = idx
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        result = grad_output.new_zeros(ctx.x_shape)
+        result[ctx.idx] = grad_output
+        return result, None
+
+
+def sample_inputs_select(opinfo, device, dtype, requires_grad, **kwargs):
+    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)
+    yield SampleInput(make_arg(3, 5), args=(2,))
+
+
+autograd_function_db = [
+    OpInfo(
+        'NumpyCubeAutogradFunction',
+        op=NumpyCube.apply,
+        supports_forward_ad=False,
+        supports_fwgrad_bwgrad=False,
+        sample_inputs_func=sample_inputs_numpy_cube,
+        dtypes=all_types_and(torch.bool, torch.half),
+        supports_out=False,
+    ),
+    OpInfo(
+        'NumpyExpMarkDirtyAutogradFunction',
+        op=lambda x: NumpyExp_.apply(x.clone()),
+        inplace_variant=NumpyExp_.apply,
+        supports_forward_ad=False,
+        supports_fwgrad_bwgrad=False,
+        sample_inputs_func=sample_inputs_numpy_cube,
+        dtypes=all_types_and(torch.bool, torch.half),
+        supports_out=False,
+    ),
+    OpInfo(
+        'NumpyMulAutogradFunction',
+        op=NumpyMul.apply,
+        supports_forward_ad=False,
+        supports_fwgrad_bwgrad=False,
+        sample_inputs_func=sample_inputs_numpy_mul,
+        dtypes=all_types_and(torch.bool, torch.half),
+        supports_out=False,
+    ),
+    OpInfo(
+        'NumpyCubeNotComposableAutogradFunction',
+        op=lambda x: NumpyCubeNotComposable.apply(x)[0],
+        supports_forward_ad=False,
+        supports_fwgrad_bwgrad=False,
+        sample_inputs_func=sample_inputs_numpy_cube,
+        dtypes=all_types_and(torch.bool, torch.half),
+        supports_out=False,
+    ),
+    OpInfo(
+        'NumpySortAutogradFunction',
+        op=NumpySort.apply,
+        supports_forward_ad=False,
+        supports_fwgrad_bwgrad=False,
+        sample_inputs_func=sample_inputs_numpy_sort,
+        dtypes=all_types_and(torch.bool, torch.half),
+        supports_out=False,
+        gradcheck_wrapper=lambda y, ind: y,
+    ),
+    OpInfo(
+        'SelectAutogradFunction',
+        op=Select.apply,
+        supports_forward_ad=False,
+        supports_fwgrad_bwgrad=False,
+        sample_inputs_func=sample_inputs_select,
+        dtypes=all_types_and(torch.bool, torch.half),
+        supports_out=False,
+    ),
+]

From 74283c056922f88ffa51218d122b50166589e6db Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@fb.com>
Date: Thu, 8 Dec 2022 09:00:42 -0500
Subject: [PATCH 1740/1922] Refactor test_inductor_XXX to reduce code
 duplication (#90443)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90443
Approved by: https://github.com/desertfire
---
 .jenkins/pytorch/test.sh | 53 ++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 32 deletions(-)

diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index af0c2854b96c0..4e52f31a74c7f 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -256,20 +256,32 @@ test_inductor() {
   PYTORCH_TEST_WITH_INDUCTOR=0 python test/run_test.py --include inductor/test_torchinductor --include inductor/test_torchinductor_opinfo --verbose
 }
 
-test_inductor_huggingface() {
+test_inductor_benchmark() {
   # Use test-reports directory under test folder will allow the CI to automatically pick up
   # the test reports and upload them to S3. Need to use full path here otherwise the script
   # will bark about file not found later on
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  PARTITION_FLAGS=""
+  if [[ -n "$NUM_TEST_SHARDS" && -n "$2" ]]; then
+    PARTITION_FLAGS="--total-partitions 2 --partition-id $2"
+  fi
   mkdir -p "$TEST_REPORTS_DIR"
   # Check inference with --float32
-  python benchmarks/dynamo/huggingface.py --ci --accuracy \
-    --device cuda --inductor --float32 --output "$TEST_REPORTS_DIR"/inductor_inference_huggingface.csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_inference_huggingface.csv
+  # shellcheck disable=SC2086
+  python benchmarks/dynamo/$1.py --ci --accuracy \
+    --device cuda --inductor --float32 $PARTITION_FLAGS --output "$TEST_REPORTS_DIR"/inductor_inference_$1.csv
+  # shellcheck disable=SC2086
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_inference_$1.csv
   # Check training with --amp
-  python benchmarks/dynamo/huggingface.py --ci --training --accuracy \
-    --device cuda --inductor --amp --output "$TEST_REPORTS_DIR"/inductor_training_huggingface.csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_training_huggingface.csv
+  # shellcheck disable=SC2086
+  python benchmarks/dynamo/$1.py --ci --training --accuracy \
+    --device cuda --inductor --amp $PARTITION_FLAGS  --output "$TEST_REPORTS_DIR"/inductor_training_$1.csv
+  # shellcheck disable=SC2086
+  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_training_$1.csv
+}
+
+test_inductor_huggingface() {
+  test_inductor_benchmark huggingface
 }
 
 test_inductor_timm_shard() {
@@ -277,34 +289,11 @@ test_inductor_timm_shard() {
     echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
     exit 1
   fi
-  # Use test-reports directory under test folder will allow the CI to automatically pick up
-  # the test reports and upload them to S3. Need to use full path here otherwise the script
-  # will bark about file not found later on
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-  # Check inference with --float32
-  python benchmarks/dynamo/timm_models.py --ci --accuracy \
-    --device cuda --inductor --float32 --total-partitions 2 --partition-id "$1" \
-    --output "$TEST_REPORTS_DIR"/inductor_inference_timm_"$1".csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_inference_timm_"$1".csv
-  # Check training with --amp
-  python benchmarks/dynamo/timm_models.py --ci --training --accuracy \
-    --device cuda --inductor --amp --total-partitions 2 --partition-id "$1" \
-    --output "$TEST_REPORTS_DIR"/inductor_training_timm_"$1".csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_training_timm_"$1".csv
+  test_inductor_benchmark timm_models "$1"
 }
 
 test_inductor_torchbench() {
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-  # Check inference with --float32
-  PYTHONPATH=$(pwd)/torchbench python benchmarks/dynamo/torchbench.py --ci --accuracy \
-    --device cuda --inductor --float32 --output "$TEST_REPORTS_DIR"/inductor_inference_torchbench.csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_inference_torchbench.csv
-  # Check training with --amp
-  PYTHONPATH=$(pwd)/torchbench python benchmarks/dynamo/torchbench.py --ci --training --accuracy \
-    --device cuda --inductor --amp --output "$TEST_REPORTS_DIR"/inductor_training_torchbench.csv
-  python benchmarks/dynamo/check_csv.py -f "$TEST_REPORTS_DIR"/inductor_training_torchbench.csv
+  PYTHONPATH=$(pwd)/torchbench test_inductor_benchmark torchbench
 }
 
 test_python_gloo_with_tls() {

From 376b30c41a6bb3130f799feb37a303d0e621a4e7 Mon Sep 17 00:00:00 2001
From: Elias Ellison <elias.ellison@gmail.com>
Date: Wed, 7 Dec 2022 20:01:42 +0000
Subject: [PATCH 1741/1922] Add Pointwise Tag from pointwise set in DTensor,
 use in aot_autograd partitioner (#90029)

Takes the pointwise op list from [DTensor](https://github.com/pytorch/pytorch/blob/master/torch/distributed/_tensor/ops/pointwise_ops.py#L36) as an initially starting point for pointwise ops, and feeds them to the aot autograd partitioner.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90029
Approved by: https://github.com/ezyang
---
 aten/src/ATen/native/native_functions.yaml | 575 +++++++++++++++++++--
 aten/src/ATen/native/tags.yaml             |   5 +
 test/test_ops.py                           | 142 ++++-
 torch/_functorch/partitioners.py           |  24 +-
 torchgen/model.py                          |  21 +-
 5 files changed, 707 insertions(+), 60 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index a3c6988375070..b0621235e56d4 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -244,7 +244,7 @@
     CPU: native_dropout_cpu
     CUDA: native_dropout_cuda
     NestedTensorCPU, NestedTensorCUDA: native_dropout_nested
-  tags: nondeterministic_seeded, canonical
+  tags: [nondeterministic_seeded, canonical]
   autogen: native_dropout.out
 
 - func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
@@ -252,6 +252,7 @@
     CPU, NestedTensorCPU, NestedTensorCUDA: native_dropout_backward
     CUDA: native_dropout_backward_cuda
   autogen: native_dropout_backward.out
+  tags: pointwise
 
 - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor)
 
@@ -296,7 +297,7 @@
     CompositeExplicitAutograd: abs
     SparseCPU, SparseCUDA: abs_sparse
     SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: abs_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -313,6 +314,7 @@
     MPS: abs_out_mps
     SparseCPU, SparseCUDA: abs_sparse_out
     SparseCsrCPU, SparseCsrCUDA: abs_sparse_csr_out
+  tags: pointwise
 
 # Note [Adding an alias]
 # To add an alias do the following:
@@ -336,8 +338,8 @@
 # in op_db list in torch/testing/_internal/common_methods_invocations.py
 #
 # See torch.absolute, an alias for torch.abs, as an example.
-
 # Absolute, alias for abs
+
 - func: absolute(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
@@ -355,12 +357,14 @@
   dispatch:
     CPU, CUDA: angle
     SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr
+  tags: pointwise
 
 - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: angle_out
     SparseCsrCPU, SparseCsrCUDA: angle_sparse_csr_out
+  tags: pointwise
 
 - func: view_as_real(Tensor(a) self) -> Tensor(a)
   variants: function
@@ -378,6 +382,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sgn_sparse
     SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr
+  tags: pointwise
 
 - func: sgn_(Tensor(a!) self) -> Tensor(a!)
   variants: method
@@ -385,6 +390,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sgn_sparse_
     SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_
+  tags: pointwise
 
 - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -393,6 +399,7 @@
     CPU, CUDA: sgn_out
     SparseCPU, SparseCUDA: sgn_sparse_out
     SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out
+  tags: pointwise
 
 - func: chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
   variants: method
@@ -423,18 +430,21 @@
 
 - func: conj_physical(Tensor self) -> Tensor
   variants: function, method
+  tags: pointwise
 
 - func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU, CUDA: conj_physical_out
     SparseCPU, SparseCUDA: conj_physical_out_sparse
     SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out
+  tags: pointwise
 
 - func: conj_physical_(Tensor(a!) self) -> Tensor(a!)
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: conj_physical_
     SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_
+  tags: pointwise
 
 - func: resolve_conj(Tensor(a) self) -> Tensor(a)
   variants: function, method
@@ -451,11 +461,13 @@
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: acos.out
+  tags: pointwise
 
 - func: acos_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: acos.out
+  tags: pointwise
 
 - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -464,6 +476,7 @@
   dispatch:
     CPU, CUDA: acos_out
     MPS: acos_out_mps
+  tags: pointwise
 
 # arccos, alias of acos
 - func: arccos(Tensor self) -> Tensor
@@ -491,7 +504,7 @@
     MkldnnCPU: mkldnn_add
     ZeroTensor: add_zerotensor
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -502,6 +515,7 @@
     SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_
     MkldnnCPU: mkldnn_add_
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor
+  tags: pointwise
 
 - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -517,6 +531,7 @@
     SparseCsrCUDA: add_out_sparse_csr_cuda
     MkldnnCPU: mkldnn_add_out
     MPS: add_out_mps
+  tags: pointwise
 
 - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
   variants: function
@@ -550,7 +565,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: add
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -558,6 +573,7 @@
   dispatch:
     CompositeExplicitAutograd: add_
   autogen: add.Scalar_out
+  tags: pointwise
 
 - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
   structured_delegate: addmv.out
@@ -713,10 +729,12 @@
 - func: acosh(Tensor self) -> Tensor
   variants: function, method
   structured_delegate: acosh.out
+  tags: pointwise
 
 - func: acosh_(Tensor(a!) self) -> Tensor(a!)
   variants: function, method
   structured_delegate: acosh.out
+  tags: pointwise
 
 - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -724,8 +742,9 @@
   dispatch:
     CPU, CUDA: acosh_out
     MPS: acosh_out_mps
-
+  tags: pointwise
 # arccosh, alias for acosh
+
 - func: arccosh(Tensor self) -> Tensor
   variants: function, method
 
@@ -740,6 +759,7 @@
   dispatch:
     SparseCPU, SparseCUDA: asinh_sparse
     SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr
+  tags: pointwise
 
 - func: asinh_(Tensor(a!) self) -> Tensor(a!)
   variants: function, method
@@ -747,6 +767,7 @@
   dispatch:
     SparseCPU, SparseCUDA: asinh_sparse_
     SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_
+  tags: pointwise
 
 - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -756,6 +777,7 @@
     MPS: asinh_out_mps
     SparseCPU, SparseCUDA: asinh_sparse_out
     SparseCsrCPU, SparseCsrCUDA: asinh_sparse_csr_out
+  tags: pointwise
 
 # arcsinh, alias for asinh
 - func: arcsinh(Tensor self) -> Tensor
@@ -772,6 +794,7 @@
   dispatch:
     SparseCPU, SparseCUDA: atanh_sparse
     SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr
+  tags: pointwise
 
 - func: atanh_(Tensor(a!) self) -> Tensor(a!)
   structured_delegate: atanh.out
@@ -779,6 +802,7 @@
   dispatch:
     SparseCPU, SparseCUDA: atanh_sparse_
     SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_
+  tags: pointwise
 
 - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -788,8 +812,9 @@
     MPS: atanh_out_mps
     SparseCPU, SparseCUDA: atanh_sparse_out
     SparseCsrCPU, SparseCsrCUDA: atanh_sparse_csr_out
-
+  tags: pointwise
 # arctanh, alias for atanh
+
 - func: arctanh(Tensor self) -> Tensor
   variants: function, method
 
@@ -824,6 +849,7 @@
   dispatch:
     SparseCPU, SparseCUDA: asin_sparse
     SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr
+  tags: pointwise
 
 - func: asin_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -832,6 +858,7 @@
   dispatch:
     SparseCPU, SparseCUDA: asin_sparse_
     SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_
+  tags: pointwise
 
 - func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -842,6 +869,7 @@
     MPS: asin_out_mps
     SparseCPU, SparseCUDA: asin_sparse_out
     SparseCsrCPU, SparseCsrCUDA: asin_sparse_csr_out
+  tags: pointwise
 
 # arcsin, alias of asin
 - func: arcsin(Tensor self) -> Tensor
@@ -859,6 +887,7 @@
   dispatch:
     SparseCPU, SparseCUDA: atan_sparse
     SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr
+  tags: pointwise
 
 - func: atan_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -867,6 +896,7 @@
   dispatch:
     SparseCPU, SparseCUDA: atan_sparse_
     SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_
+  tags: pointwise
 
 - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -877,6 +907,7 @@
     MPS: atan_out_mps
     SparseCPU, SparseCUDA: atan_sparse_out
     SparseCsrCPU, SparseCsrCUDA: atan_sparse_csr_out
+  tags: pointwise
 
 # arctan, alias of atan
 - func: arctan(Tensor self) -> Tensor
@@ -1043,12 +1074,13 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: bitwise_not.out
   variants: function, method
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: bitwise_not_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: bitwise_not.out
   variants: method
+  tags: pointwise
 
 - func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1056,6 +1088,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: bitwise_not_out
+  tags: pointwise
 
 - func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1063,11 +1096,13 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: copysign_out
+  tags: pointwise
 
 - func: copysign.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: copysign.out
+  tags: pointwise
 
 - func: copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1078,6 +1113,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: copysign
+  tags: pointwise
 
 - func: copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   variants: method
@@ -1087,78 +1123,91 @@
 - func: copysign.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CompositeExplicitAutograd: copysign_out
+  tags: pointwise
 
 - func: logical_not(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: logical_not
+  tags: pointwise
 
 - func: logical_not_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: logical_not_
+  tags: pointwise
 
 - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: logical_not_out
     MPS: logical_not_out_mps
+  tags: pointwise
 
 - func: logical_xor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: logical_xor
+  tags: pointwise
 
 - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: logical_xor_
+  tags: pointwise
 
 - func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: logical_xor_out
     MPS: logical_xor_out_mps
+  tags: pointwise
 
 - func: logical_and(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: logical_and
+  tags: pointwise
 
 - func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: logical_and_
+  tags: pointwise
 
 - func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: logical_and_out
     MPS: logical_and_out_mps
+  tags: pointwise
 
 - func: logical_or(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: logical_or
+  tags: pointwise
 
 - func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: logical_or_
+  tags: pointwise
 
 - func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: logical_or_out
     MPS: logical_or_out_mps
+  tags: pointwise
 
 - func: blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
@@ -1257,6 +1306,7 @@
   dispatch:
     SparseCPU, SparseCUDA: ceil_sparse
     SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr
+  tags: pointwise
 
 - func: ceil_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1265,6 +1315,7 @@
   dispatch:
     SparseCPU, SparseCUDA: ceil_sparse_
     SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_
+  tags: pointwise
 
 - func: ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1275,6 +1326,7 @@
     MPS: ceil_out_mps
     SparseCPU, SparseCUDA: ceil_sparse_out
     SparseCsrCPU, SparseCsrCUDA: ceil_sparse_csr_out
+  tags: pointwise
 
 # alias for torch.linalg.multi_dot
 - func: chain_matmul(Tensor[] matrices) -> Tensor
@@ -1316,21 +1368,24 @@
   structured_delegate: clamp.out
   dispatch:
     QuantizedCPU: clamp_quantized_cpu
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
   variants: function, method
   structured_delegate: clamp.Tensor_out
+  tags: pointwise
 
 - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function, method
   cpp_no_default_args: ['min']
   structured_delegate: clamp.out
+  tags: pointwise
 
 - func: clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
   variants: function, method
   structured_delegate: clamp.Tensor_out
+  tags: pointwise
 
 - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1340,6 +1395,7 @@
   dispatch:
     CPU, CUDA: clamp_out
     MPS: clamp_out_mps
+  tags: pointwise
 
 - func: clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1348,24 +1404,29 @@
   dispatch:
     CPU, CUDA: clamp_Tensor_out
     MPS: clamp_Tensor_out_mps
+  tags: pointwise
 
 - func: clamp_max(Tensor self, Scalar max) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: clamp_max.out
+  tags: pointwise
 
 - func: clamp_max.Tensor(Tensor self, Tensor max) -> Tensor
   variants: function, method
   structured_delegate: clamp_max.Tensor_out
+  tags: pointwise
 
 - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: clamp_max.out
+  tags: pointwise
 
 - func: clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!)
   variants: function, method
   structured_delegate: clamp_max.Tensor_out
+  tags: pointwise
 
 - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1374,6 +1435,7 @@
   dispatch:
     CPU, CUDA: clamp_max_out
     MPS: clamp_max_out_mps
+  tags: pointwise
 
 - func: clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1382,24 +1444,29 @@
   dispatch:
     CPU, CUDA: clamp_max_Tensor_out
     MPS: clamp_max_Tensor_out_mps
+  tags: pointwise
 
 - func: clamp_min(Tensor self, Scalar min) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: clamp_min.out
+  tags: pointwise
 
 - func: clamp_min.Tensor(Tensor self, Tensor min) -> Tensor
   variants: function, method
   structured_delegate: clamp_min.Tensor_out
+  tags: pointwise
 
 - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: clamp_min.out
+  tags: pointwise
 
 - func: clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!)
   variants: function, method
   structured_delegate: clamp_min.Tensor_out
+  tags: pointwise
 
 - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1408,6 +1475,7 @@
   dispatch:
     CPU, CUDA: clamp_min_out
     MPS: clamp_min_out_mps
+  tags: pointwise
 
 - func: clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1416,24 +1484,30 @@
   dispatch:
     CPU, CUDA: clamp_min_Tensor_out
     MPS: clamp_min_Tensor_out_mps
+  tags: pointwise
 
 # clip is an alias for clamp
 - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
   cpp_no_default_args: ['min']
   variants: function, method
+  tags: pointwise
 
 - func: clip.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
   variants: function, method
+  tags: pointwise
 
 - func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
   cpp_no_default_args: ['min']
   variants: function, method
+  tags: pointwise
 
 - func: clip_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
   variants: function, method
+  tags: pointwise
 
 - func: clip.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
   cpp_no_default_args: ['min']
+  tags: pointwise
 
 - func: clip.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
 
@@ -1566,11 +1640,13 @@
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: cos.out
+  tags: pointwise
 
 - func: cos_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: cos.out
+  tags: pointwise
 
 - func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1579,16 +1655,19 @@
   dispatch:
     CPU, CUDA: cos_out
     MPS: cos_out_mps
+  tags: pointwise
 
 - func: cosh(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: cosh.out
+  tags: pointwise
 
 - func: cosh_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: cosh.out
+  tags: pointwise
 
 - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1597,6 +1676,7 @@
   dispatch:
     CPU, CUDA: cosh_out
     MPS: cosh_out_mps
+  tags: pointwise
 
 - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
 
@@ -1891,7 +1971,7 @@
     SparseCPU, SparseCUDA: div_sparse
     ZeroTensor: div_zerotensor
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Tensor
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1899,6 +1979,7 @@
   structured_delegate: div.out
   dispatch:
     SparseCPU, SparseCUDA: div_sparse_
+  tags: pointwise
 
 - func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1908,6 +1989,7 @@
     CPU, CUDA: div_out
     MPS: div_out_mps
     SparseCPU, SparseCUDA: div_out_sparse_zerodim
+  tags: pointwise
 
 - func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -1915,6 +1997,7 @@
   structured_delegate: div.out_mode
   dispatch:
     SparseCPU, SparseCUDA: div_sparse
+  tags: pointwise
 
 - func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1922,6 +2005,7 @@
   structured_delegate: div.out_mode
   dispatch:
     SparseCPU, SparseCUDA: div_sparse_
+  tags: pointwise
 
 - func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1931,6 +2015,7 @@
     CPU, CUDA: div_out_mode
     MPS: div_out_mode_mps
     SparseCPU, SparseCUDA: div_out_sparse_zerodim
+  tags: pointwise
 
 # For C++ only, until we have conversion from C++ numbers to Tensor
 - func: div.Scalar(Tensor self, Scalar other) -> Tensor
@@ -1939,7 +2024,7 @@
   dispatch:
     CompositeExplicitAutograd: div
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -1947,17 +2032,20 @@
   dispatch:
     CompositeExplicitAutograd: div_
   autogen: div.Scalar_out
+  tags: pointwise
 
 - func: div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: div
+  tags: pointwise
 
 - func: div_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!)
   variants: method
   dispatch:
     CompositeExplicitAutograd: div_
   autogen: div.Scalar_mode_out
+  tags: pointwise
 
 # divide, alias for div
 - func: divide.Tensor(Tensor self, Tensor other) -> Tensor
@@ -1992,6 +2080,7 @@
 - func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  tags: pointwise
 
 - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2248,7 +2337,7 @@
   dispatch:
     SparseCPU, SparseCUDA: erf_sparse
     SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: erf_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2257,6 +2346,7 @@
   dispatch:
     SparseCPU, SparseCUDA: erf_sparse_
     SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_
+  tags: pointwise
 
 - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2267,16 +2357,19 @@
     MPS: erf_out_mps
     SparseCPU, SparseCUDA: erf_sparse_out
     SparseCsrCPU, SparseCsrCUDA: erf_sparse_csr_out
+  tags: pointwise
 
 - func: erfc(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: erfc.out
   variants: function, method
+  tags: pointwise
 
 - func: erfc_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: erfc.out
   variants: function, method
+  tags: pointwise
 
 - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2284,17 +2377,19 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: erfc_out
+  tags: pointwise
 
 - func: exp(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: exp.out
   variants: function, method
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: exp_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: exp.out
   variants: function, method
+  tags: pointwise
 
 - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2303,14 +2398,17 @@
   dispatch:
     CPU, CUDA: exp_out
     MPS: exp_out_mps
+  tags: pointwise
 
 - func: exp2(Tensor self) -> Tensor
   structured_delegate: exp2.out
   variants: function, method
+  tags: pointwise
 
 - func: exp2_(Tensor(a!) self) -> Tensor(a!)
   structured_delegate: exp2.out
   variants: function, method
+  tags: pointwise
 
 - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -2318,6 +2416,7 @@
   dispatch:
     CPU, CUDA: exp2_out
     MPS: exp2_out_mps
+  tags: pointwise
 
 - func: expm1(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -2326,6 +2425,7 @@
   dispatch:
     SparseCPU, SparseCUDA: expm1_sparse
     SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr
+  tags: pointwise
 
 - func: expm1_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2334,6 +2434,7 @@
   dispatch:
     SparseCPU, SparseCUDA: expm1_sparse_
     SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_
+  tags: pointwise
 
 - func: expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2344,6 +2445,7 @@
     MPS: expm1_out_mps
     SparseCPU, SparseCUDA: expm1_sparse_out
     SparseCsrCPU, SparseCsrCUDA: expm1_sparse_csr_out
+  tags: pointwise
 
 - func: expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)
   variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
@@ -2438,6 +2540,7 @@
   dispatch:
     SparseCPU, SparseCUDA: floor_sparse
     SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr
+  tags: pointwise
 
 - func: floor_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2446,6 +2549,7 @@
   dispatch:
     SparseCPU, SparseCUDA: floor_sparse_
     SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_
+  tags: pointwise
 
 - func: floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2456,6 +2560,7 @@
     MPS: floor_out_mps
     SparseCPU, SparseCUDA: floor_sparse_out
     SparseCsrCPU, SparseCsrCUDA: floor_sparse_csr_out
+  tags: pointwise
 
 - func: floor_divide(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -2492,6 +2597,7 @@
   dispatch:
     SparseCPU, SparseCUDA: frac_sparse
     SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr
+  tags: pointwise
 
 - func: frac_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2500,6 +2606,7 @@
   dispatch:
     SparseCPU, SparseCUDA: frac_sparse_
     SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_
+  tags: pointwise
 
 - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -2510,6 +2617,7 @@
     MPS: frac_out_mps
     SparseCPU, SparseCUDA: frac_sparse_out
     SparseCsrCPU, SparseCsrCUDA: frac_sparse_csr_out
+  tags: pointwise
 
 - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   device_check: NoCheck
@@ -2543,10 +2651,12 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: gcd_out
+  tags: pointwise
 
 - func: gcd(Tensor self, Tensor other) -> Tensor
   structured_delegate: gcd.out
   variants: function, method
+  tags: pointwise
 
 - func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   structured_delegate: gcd.out
@@ -2557,10 +2667,12 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: lcm_out
+  tags: pointwise
 
 - func: lcm(Tensor self, Tensor other) -> Tensor
   structured_delegate: lcm.out
   variants: function, method
+  tags: pointwise
 
 - func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   structured_delegate: lcm.out
@@ -2855,6 +2967,7 @@
     SparseCPU, SparseCUDA: isnan_sparse
     SparseCsrCPU, SparseCsrCUDA: isnan_sparse_csr
   autogen: isnan.out
+  tags: pointwise
 
 - func: is_distributed(Tensor self) -> bool
   variants: function, method
@@ -2965,17 +3078,20 @@
   dispatch:
     CompositeExplicitAutograd: nan_to_num
     SparseCPU, SparseCUDA: nan_to_num_sparse
+  tags: pointwise
 
 - func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: nan_to_num_
     SparseCPU, SparseCUDA: nan_to_num_sparse_
+  tags: pointwise
 
 - func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU, CUDA: nan_to_num_out
     SparseCPU, SparseCUDA: nan_to_num_sparse_out
+  tags: pointwise
 
 - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
   python_module: nn
@@ -3037,8 +3153,10 @@
 
 - func: ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   variants: function, method
+  tags: pointwise
 
 - func: ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
 
 - func: linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
@@ -3054,12 +3172,13 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: log.out
   variants: function, method
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: log_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: log.out
   variants: function, method
+  tags: pointwise
 
 - func: log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3068,16 +3187,19 @@
   dispatch:
     CPU, CUDA: log_out
     MPS: log_out_mps
+  tags: pointwise
 
 - func: log10(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: log10.out
   variants: function, method
+  tags: pointwise
 
 - func: log10_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: log10.out
   variants: function, method
+  tags: pointwise
 
 - func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3086,6 +3208,7 @@
   dispatch:
     CPU, CUDA: log10_out
     MPS: log10_out_mps
+  tags: pointwise
 
 - func: log1p(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -3094,6 +3217,7 @@
   dispatch:
     SparseCPU, SparseCUDA: log1p_sparse
     SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr
+  tags: pointwise
 
 - func: log1p_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3102,6 +3226,7 @@
   dispatch:
     SparseCPU, SparseCUDA: log1p_sparse_
     SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_
+  tags: pointwise
 
 - func: log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3112,16 +3237,19 @@
     MPS: log1p_out_mps
     SparseCPU, SparseCUDA: log1p_sparse_out
     SparseCsrCPU, SparseCsrCUDA: log1p_sparse_csr_out
+  tags: pointwise
 
 - func: log2(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: log2.out
   variants: function, method
+  tags: pointwise
 
 - func: log2_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: log2.out
   variants: function, method
+  tags: pointwise
 
 - func: log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3130,6 +3258,7 @@
   dispatch:
     CPU, CUDA: log2_out
     MPS: log2_out_mps
+  tags: pointwise
 
 - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -3137,10 +3266,12 @@
   dispatch:
     CPU, CUDA: logaddexp_out
     MPS: logaddexp_out_mps
+  tags: pointwise
 
 - func: logaddexp(Tensor self, Tensor other) -> Tensor
   variants: method, function
   structured_delegate: logaddexp.out
+  tags: pointwise
 
 - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -3148,33 +3279,39 @@
   dispatch:
     CPU, CUDA: logaddexp2_out
     MPS: logaddexp2_out_mps
+  tags: pointwise
 
 - func: logaddexp2(Tensor self, Tensor other) -> Tensor
   variants: method, function
   structured_delegate: logaddexp2.out
+  tags: pointwise
 
 - func: xlogy.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: xlogy.OutTensor
   variants: function, method
+  tags: pointwise
 
 - func: xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
     CompositeExplicitAutograd: xlogy
+  tags: pointwise
 
 - func: xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: xlogy
+  tags: pointwise
 
 # xlogy: inplace variant
 - func: xlogy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: xlogy.OutTensor
+  tags: pointwise
 
 - func: xlogy_.Scalar_Other(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3190,18 +3327,21 @@
   variants: function
   dispatch:
     CPU, CUDA: xlogy_out
+  tags: pointwise
 
 - func: xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
     CompositeExplicitAutograd: xlogy_out
+  tags: pointwise
 
 - func: xlogy.OutScalar_Other(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
     CompositeExplicitAutograd: xlogy_out
+  tags: pointwise
 
 - func: logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
@@ -3689,7 +3829,7 @@
     MkldnnCPU: mkldnn_mul
     ZeroTensor: mul_zerotensor
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Tensor
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3700,6 +3840,7 @@
     SparseCsrCPU, SparseCsrCUDA: mul_sparse_csr_
     MkldnnCPU: mkldnn_mul_
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Tensor
+  tags: pointwise
 
 - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3712,8 +3853,9 @@
     SparseCUDA: mul_out_sparse_cuda
     SparseCsrCPU, SparseCsrCUDA: mul_out_sparse_csr
     MkldnnCPU: mkldnn_mul_out
-
+  tags: pointwise
   # For C++ only, until we have conversion from C++ numbers to Tensor
+
 - func: mul.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
@@ -3721,7 +3863,7 @@
     CompositeExplicitAutograd: mul
     SparseCsrCPU, SparseCsrCUDA: mul_scalar_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul_Scalar
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -3731,8 +3873,9 @@
     SparseCsrCPU, SparseCsrCUDA: mul__scalar_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_mul__Scalar
   autogen: mul.Scalar_out
-
+  tags: pointwise
 # multiply, alias for mul
+
 - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor
   variants: function, method
 
@@ -3760,18 +3903,21 @@
 - func: mvlgamma.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU, CUDA: mvlgamma_out
+  tags: pointwise
 
 - func: mvlgamma(Tensor self, int p) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: mvlgamma
+  tags: pointwise
 
 - func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: mvlgamma_
+  tags: pointwise
 
 - func: narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor
   variants: function, method
@@ -4077,6 +4223,7 @@
     CompositeExplicitAutograd: deg2rad
     SparseCPU, SparseCUDA: deg2rad_sparse
     SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr
+  tags: pointwise
 
 - func: deg2rad_(Tensor(a!) self) -> Tensor(a!)
   variants: function, method
@@ -4084,12 +4231,14 @@
     CompositeExplicitAutograd: deg2rad_
     SparseCPU, SparseCUDA: deg2rad_sparse_
     SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_
+  tags: pointwise
 
 - func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CompositeExplicitAutograd: deg2rad_out
     SparseCPU, SparseCUDA: deg2rad_sparse_out
     SparseCsrCPU, SparseCsrCUDA: deg2rad_sparse_csr_out
+  tags: pointwise
 
 - func: scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
@@ -4281,12 +4430,13 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: reciprocal.out
   variants: function, method
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: reciprocal_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: reciprocal.out
   variants: function, method
+  tags: pointwise
 
 - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4295,6 +4445,7 @@
   dispatch:
     CPU, CUDA: reciprocal_out
     MPS: reciprocal_out_mps
+  tags: pointwise
 
 - func: neg(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -4304,7 +4455,7 @@
     SparseCPU, SparseCUDA: neg_sparse
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: neg_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4314,6 +4465,7 @@
     SparseCPU, SparseCUDA: neg_sparse_
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_neg_
+  tags: pointwise
 
 - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4324,8 +4476,9 @@
     MPS: neg_out_mps
     SparseCPU, SparseCUDA: neg_out_sparse
     SparseCsrCPU, SparseCsrCUDA: neg_sparse_csr_out
-
+  tags: pointwise
 # Alias for neg
+
 - func: negative(Tensor self) -> Tensor
   variants: function, method
 
@@ -4404,6 +4557,7 @@
   dispatch:
     SparseCPU, SparseCUDA: round_sparse
     SparseCsrCPU, SparseCsrCUDA: round_sparse_csr
+  tags: pointwise
 
 - func: round_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4412,6 +4566,7 @@
   dispatch:
     SparseCPU, SparseCUDA: round_sparse_
     SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_
+  tags: pointwise
 
 - func: round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4423,16 +4578,19 @@
     MPS: round_out_mps
     SparseCPU, SparseCUDA: round_sparse_out
     SparseCsrCPU, SparseCsrCUDA: round_sparse_csr_out
+  tags: pointwise
 
 - func: round.decimals(Tensor self, *, int decimals) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: round.decimals_out
   variants: function, method
+  tags: pointwise
 
 - func: round_.decimals(Tensor(a!) self, *, int decimals) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: round.decimals_out
   variants: function, method
+  tags: pointwise
 
 - func: round.decimals_out(Tensor self, *, int decimals, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4441,6 +4599,7 @@
   dispatch:
     CPU: round_decimals_out
     CUDA: round_decimals_out
+  tags: pointwise
 
 - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -4462,7 +4621,7 @@
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_relu
     SparseCPU, SparseCUDA: relu_sparse
     SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: relu_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4477,6 +4636,7 @@
     SparseCPU, SparseCUDA: relu_sparse_
     SparseCsrCPU, SparseCsrCUDA: relu_sparse_csr_
   autogen: relu.out
+  tags: pointwise
 
 - func: relu6(Tensor self) -> Tensor
   python_module: nn
@@ -4529,7 +4689,7 @@
     QuantizedCPU: gelu_quantized_cpu
     QuantizedCUDA: gelu_quantized_cuda
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
   structured: True
@@ -4545,6 +4705,7 @@
   python_module: nn
   dispatch:
     MkldnnCPU: mkldnn_gelu_backward
+  tags: pointwise
 
 - func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor
   variants: function
@@ -4578,12 +4739,13 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: rsqrt.out
   variants: function, method
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: rsqrt_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: rsqrt.out
   variants: function, method
+  tags: pointwise
 
 - func: rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4592,6 +4754,7 @@
   dispatch:
     CPU, CUDA: rsqrt_out
     MPS: rsqrt_out_mps
+  tags: pointwise
 
 - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
   variants: function, method
@@ -4697,7 +4860,7 @@
   dispatch:
     QuantizedCPU: sigmoid_quantized_cpu
     MkldnnCPU: mkldnn_sigmoid
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: sigmoid_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4705,6 +4868,7 @@
   variants: function, method
   dispatch:
     MkldnnCPU: mkldnn_sigmoid_
+  tags: pointwise
 
 - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4713,20 +4877,24 @@
   dispatch:
     CPU, CUDA: sigmoid_out
     MPS: sigmoid_out_mps
+  tags: pointwise
 
 - func: logit(Tensor self, float? eps=None) -> Tensor
   variants: function, method
   dispatch:
     CPU, CUDA: logit
+  tags: pointwise
 
 - func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)
   variants: function, method
   dispatch:
     CPU, CUDA: logit_
+  tags: pointwise
 
 - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU, CUDA: logit_out
+  tags: pointwise
 
 - func: sin(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -4735,6 +4903,7 @@
   dispatch:
     SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr
     SparseCPU, SparseCUDA: sin_sparse
+  tags: pointwise
 
 - func: sin_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4743,6 +4912,7 @@
   dispatch:
     SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_
     SparseCPU, SparseCUDA: sin_sparse_
+  tags: pointwise
 
 - func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4753,20 +4923,24 @@
     MPS: sin_out_mps
     SparseCsrCPU, SparseCsrCUDA: sin_sparse_csr_out
     SparseCPU, SparseCUDA: sin_sparse_out
+  tags: pointwise
 
 - func: sinc(Tensor self) -> Tensor
   structured_delegate: sinc.out
   variants: function, method
+  tags: pointwise
 
 - func: sinc_(Tensor(a!) self) -> Tensor(a!)
   structured_delegate: sinc.out
   variants: function, method
+  tags: pointwise
 
 - func: sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: sinc_out
+  tags: pointwise
 
 - func: sinh(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -4775,6 +4949,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sinh_sparse
     SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr
+  tags: pointwise
 
 - func: sinh_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4783,6 +4958,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sinh_sparse_
     SparseCsrCPU, SparseCsrCUDA: sinh_sparse_csr_
+  tags: pointwise
 
 - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -4805,6 +4981,7 @@
 # to false to make such changes explicitly illegal, in order to prevent users from
 # changing metadata of the detached tensor and expecting the original tensor to also
 # be updated.
+  tags: pointwise
 - func: detach(Tensor(a) self) -> Tensor(a)
   variants: function, method
   dispatch:
@@ -5143,7 +5320,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sqrt_sparse
     SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: sqrt_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5152,6 +5329,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sqrt_sparse_
     SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_
+  tags: pointwise
 
 - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5162,16 +5340,20 @@
     MPS: sqrt_out_mps
     SparseCPU, SparseCUDA: sqrt_sparse_out
     SparseCsrCPU, SparseCsrCUDA: sqrt_sparse_csr_out
+  tags: pointwise
 
 - func: square(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  tags: pointwise
 
 - func: square_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  tags: pointwise
 
 - func: square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
 
 - func: std(Tensor self, bool unbiased=True) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -5293,6 +5475,7 @@
   dispatch:
     SparseCPU, SparseCUDA: tan_sparse
     SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr
+  tags: pointwise
 
 - func: tan_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5301,6 +5484,7 @@
   dispatch:
     SparseCPU, SparseCUDA: tan_sparse_
     SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_
+  tags: pointwise
 
 - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5311,6 +5495,7 @@
     MPS: tan_out_mps
     SparseCPU, SparseCUDA: tan_sparse_out
     SparseCsrCPU, SparseCsrCUDA: tan_sparse_csr_out
+  tags: pointwise
 
 - func: tanh(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -5322,7 +5507,7 @@
     SparseCPU, SparseCUDA: tanh_sparse
     SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: tanh_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5333,6 +5518,7 @@
     SparseCPU, SparseCUDA: tanh_sparse_
     SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_
     NestedTensorCPU, NestedTensorCUDA: NestedTensor_tanh_
+  tags: pointwise
 
 - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -5343,6 +5529,7 @@
     MPS: tanh_out_mps
     SparseCPU, SparseCUDA: tanh_sparse_out
     SparseCsrCPU, SparseCsrCUDA: tanh_sparse_csr_out
+  tags: pointwise
 
 - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor
   variants: function
@@ -5389,6 +5576,7 @@
     MkldnnCPU: mkldnn_relu_backward
     SparseCPU, SparseCUDA: threshold_backward_sparse
     SparseCsrCPU, SparseCsrCUDA: threshold_backward_sparse_compressed
+  tags: pointwise
 
 - func: tile(Tensor self, int[] dims) -> Tensor
   variants: function, method
@@ -5547,6 +5735,7 @@
   dispatch:
     SparseCPU, SparseCUDA: trunc_sparse
     SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr
+  tags: pointwise
 
 - func: trunc_(Tensor(a!) self) -> Tensor(a!)
   structured_delegate: trunc.out
@@ -5555,6 +5744,7 @@
   dispatch:
     SparseCPU, SparseCUDA: trunc_sparse_
     SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_
+  tags: pointwise
 
 - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -5565,8 +5755,9 @@
     MPS: trunc_out_mps
     SparseCPU, SparseCUDA: trunc_sparse_out
     SparseCsrCPU, SparseCsrCUDA: trunc_sparse_csr_out
-
+  tags: pointwise
 # Alias for trunc
+
 - func: fix(Tensor self) -> Tensor
   variants: function, method
 
@@ -5731,7 +5922,7 @@
   dispatch:
     CPU, CUDA: where
     MPS: where_mps
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -6003,12 +6194,14 @@
   variants: method, function
   dispatch:
     CompositeExplicitAutograd: frexp
+  tags: pointwise
 
 - func: frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent)
   dispatch:
     CPU, CUDA: frexp_out
-
+  tags: pointwise
 # Deprecated (v.1.12)
+
 - func: frobenius_norm(Tensor self) -> Tensor
   variants: function
 
@@ -6050,6 +6243,7 @@
 
 - func: positive(Tensor(a) self) -> Tensor(a)
   variants: function, method
+  tags: pointwise
 
 - func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!)
   use_const_ref_for_mutable_tensors: True
@@ -6086,6 +6280,7 @@
     CPU, CUDA: sub_out
     MPS: sub_out_mps
     SparseCPU, SparseCUDA: sub_out_sparse
+  tags: pointwise
 
 - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -6094,7 +6289,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sub_sparse
     ZeroTensor: sub_zerotensor
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -6102,14 +6297,15 @@
   structured_delegate: sub.out
   dispatch:
     SparseCPU, SparseCUDA: sub_sparse_
-
+  tags: pointwise
 # For C++ only, until we have conversion from C++ numbers to Tensor
+
 - func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: sub
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -6117,8 +6313,9 @@
   dispatch:
     CompositeExplicitAutograd: sub_
   autogen: sub.Scalar_out
-
+  tags: pointwise
 # subtract, alias for sub
+
 - func: subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
 
 - func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@@ -6147,11 +6344,13 @@
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: heaviside_out
+  tags: pointwise
 
 - func: heaviside(Tensor self, Tensor values) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: heaviside.out
+  tags: pointwise
 
 - func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -6168,6 +6367,7 @@
 
 # Functionally the same as addmm, but we give it a different derivative formula
 # that doesn't propagate gradients to non-present entries on sparse.
+  tags: pointwise
 - func: _sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
   python_module: sparse
   dispatch:
@@ -7107,6 +7307,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: masked_fill
+  tags: pointwise
 
 - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -7380,18 +7581,21 @@
   variants: function
   dispatch:
     CPU, CUDA: bitwise_and_out
+  tags: pointwise
 
 - func: bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
     CompositeExplicitAutograd: bitwise_and_out
+  tags: pointwise
 
 - func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   dispatch:
     CompositeExplicitAutograd: bitwise_and
+  tags: pointwise
 
 - func: bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -7399,21 +7603,24 @@
   dispatch:
     CompositeExplicitAutograd: bitwise_and
   autogen: bitwise_and.Scalar_Tensor_out
+  tags: pointwise
 
 - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   structured_delegate: bitwise_and.Tensor_out
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
+  tags: pointwise
 
 - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   structured_delegate: bitwise_and.Tensor_out
+  tags: pointwise
 
 - func: __and__.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -7438,16 +7645,19 @@
   variants: function
   dispatch:
     CPU, CUDA: bitwise_or_out
+  tags: pointwise
 
 - func: bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
     CompositeExplicitAutograd: bitwise_or_out
+  tags: pointwise
 
 - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -7455,21 +7665,24 @@
   dispatch:
     CompositeExplicitAutograd: bitwise_or
   autogen: bitwise_or.Scalar_Tensor_out
+  tags: pointwise
 
 - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   structured_delegate: bitwise_or.Tensor_out
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
+  tags: pointwise
 
 - func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   structured_delegate: bitwise_or.Tensor_out
+  tags: pointwise
 
 - func: __or__.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -7494,16 +7707,19 @@
   variants: function
   dispatch:
     CPU, CUDA: bitwise_xor_out
+  tags: pointwise
 
 - func: bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
     CompositeExplicitAutograd: bitwise_xor_out
+  tags: pointwise
 
 - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -7511,48 +7727,58 @@
   dispatch:
     CompositeExplicitAutograd: bitwise_xor
   autogen: bitwise_xor.Scalar_Tensor_out
+  tags: pointwise
 
 - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   structured_delegate: bitwise_xor.Tensor_out
+  tags: pointwise
 
 - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
+  tags: pointwise
 
 - func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   structured_delegate: bitwise_xor.Tensor_out
+  tags: pointwise
 
 - func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: __xor__.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
+  tags: pointwise
 
 - func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
+  tags: pointwise
 
 - func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   dispatch:
     CPU, CUDA: __lshift__
+  tags: pointwise
 
 - func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   dispatch:
     CPU, CUDA: __lshift__
+  tags: pointwise
 
 - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -7560,6 +7786,7 @@
   dispatch:
     CPU, CUDA: __ilshift__
   autogen: __lshift__.Scalar_out
+  tags: pointwise
 
 - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -7567,16 +7794,19 @@
   dispatch:
     CPU, CUDA: __ilshift__
   autogen: __lshift__.Tensor_out
+  tags: pointwise
 
 - func: bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: bitwise_left_shift.Tensor_out
+  tags: pointwise
 
 - func: bitwise_left_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   structured_delegate: bitwise_left_shift.Tensor_out
+  tags: pointwise
 
 - func: bitwise_left_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -7584,24 +7814,28 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: bitwise_left_shift_out
+  tags: pointwise
 
 - func: bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   dispatch:
     CompositeExplicitAutograd: bitwise_left_shift
+  tags: pointwise
 
 - func: bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: bitwise_left_shift_
+  tags: pointwise
 
 - func: bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
     CompositeExplicitAutograd: bitwise_left_shift_out
+  tags: pointwise
 
 - func: bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -7609,18 +7843,21 @@
   dispatch:
     CompositeExplicitAutograd: bitwise_left_shift
   autogen: bitwise_left_shift.Scalar_Tensor_out
+  tags: pointwise
 
 - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   dispatch:
     CPU, CUDA: __rshift__
+  tags: pointwise
 
 - func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   dispatch:
     CPU, CUDA: __rshift__
+  tags: pointwise
 
 - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -7640,11 +7877,13 @@
   device_check: NoCheck   # TensorIterator
   variants: function, method
   structured_delegate: bitwise_right_shift.Tensor_out
+  tags: pointwise
 
 - func: bitwise_right_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   structured_delegate: bitwise_right_shift.Tensor_out
+  tags: pointwise
 
 - func: bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -7652,24 +7891,28 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: bitwise_right_shift_out
+  tags: pointwise
 
 - func: bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   dispatch:
     CompositeExplicitAutograd: bitwise_right_shift
+  tags: pointwise
 
 - func: bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: bitwise_right_shift_
+  tags: pointwise
 
 - func: bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: function
   dispatch:
     CompositeExplicitAutograd: bitwise_right_shift_out
+  tags: pointwise
 
 - func: bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -7677,6 +7920,7 @@
   dispatch:
     CompositeExplicitAutograd: bitwise_right_shift
   autogen: bitwise_right_shift.Scalar_Tensor_out
+  tags: pointwise
 
 - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
   structured_delegate: tril.out
@@ -7690,16 +7934,19 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: digamma.out
   variants: method
+  tags: pointwise
 
 - func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   structured_delegate: lerp.Scalar_out
+  tags: pointwise
 
 - func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   structured_delegate: lerp.Tensor_out
+  tags: pointwise
 
 - func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
   variants: method
@@ -7859,6 +8106,7 @@
     CPU, CUDA: ne_Scalar_out
     MPS: ne_scalar_out_mps
     QuantizedCPU: ne_out_quantized_cpu
+  tags: pointwise
 
 - func: ne.Scalar(Tensor self, Scalar other) -> Tensor
   structured_delegate: ne.Scalar_out
@@ -7866,6 +8114,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: ne_quantized_cpu
+  tags: pointwise
 
 - func: ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -7875,6 +8124,7 @@
     CPU, CUDA: ne_Tensor_out
     MPS: ne_tensor_out_mps
     QuantizedCPU: ne_out_quantized_cpu
+  tags: pointwise
 
 - func: ne.Tensor(Tensor self, Tensor other) -> Tensor
   structured_delegate: ne.Tensor_out
@@ -7882,6 +8132,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: ne_quantized_cpu
+  tags: pointwise
 
 - func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: ne.Scalar_out
@@ -7918,6 +8169,7 @@
     CPU, CUDA: eq_Scalar_out
     MPS: eq_scalar_out_mps
     QuantizedCPU: eq_out_quantized_cpu
+  tags: pointwise
 
 - func: eq.Scalar(Tensor self, Scalar other) -> Tensor
   structured_delegate: eq.Scalar_out
@@ -7925,7 +8177,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: eq_quantized_cpu
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -7935,6 +8187,7 @@
     CPU, CUDA: eq_Tensor_out
     MPS: eq_tensor_out_mps
     QuantizedCPU: eq_out_quantized_cpu
+  tags: pointwise
 
 - func: eq.Tensor(Tensor self, Tensor other) -> Tensor
   structured_delegate: eq.Tensor_out
@@ -7942,6 +8195,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: eq_quantized_cpu
+  tags: pointwise
 
 - func: ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -7951,6 +8205,7 @@
     CPU, CUDA: ge_Scalar_out
     MPS: ge_scalar_out_mps
     QuantizedCPU: ge_out_quantized_cpu
+  tags: pointwise
 
 - func: ge.Scalar(Tensor self, Scalar other) -> Tensor
   structured_delegate: ge.Scalar_out
@@ -7958,7 +8213,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: ge_quantized_cpu
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -7968,6 +8223,7 @@
     CPU, CUDA: ge_Tensor_out
     MPS: ge_tensor_out_mps
     QuantizedCPU: ge_out_quantized_cpu
+  tags: pointwise
 
 - func: ge.Tensor(Tensor self, Tensor other) -> Tensor
   structured_delegate: ge.Tensor_out
@@ -7975,6 +8231,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: ge_quantized_cpu
+  tags: pointwise
 
 - func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: ge.Scalar_out
@@ -8011,6 +8268,7 @@
     CPU, CUDA: le_Scalar_out
     MPS: le_scalar_out_mps
     QuantizedCPU: le_out_quantized_cpu
+  tags: pointwise
 
 - func: le.Scalar(Tensor self, Scalar other) -> Tensor
   structured_delegate: le.Scalar_out
@@ -8018,7 +8276,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: le_quantized_cpu
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8028,6 +8286,7 @@
     CPU, CUDA: le_Tensor_out
     MPS: le_tensor_out_mps
     QuantizedCPU: le_out_quantized_cpu
+  tags: pointwise
 
 - func: le.Tensor(Tensor self, Tensor other) -> Tensor
   structured_delegate: le.Tensor_out
@@ -8035,6 +8294,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: le_quantized_cpu
+  tags: pointwise
 
 - func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: le.Scalar_out
@@ -8071,6 +8331,7 @@
     CPU, CUDA: gt_Scalar_out
     MPS: gt_scalar_out_mps
     QuantizedCPU: gt_out_quantized_cpu
+  tags: pointwise
 
 - func: gt.Scalar(Tensor self, Scalar other) -> Tensor
   structured_delegate: gt.Scalar_out
@@ -8078,7 +8339,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: gt_quantized_cpu
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8088,6 +8349,7 @@
     CPU, CUDA: gt_Tensor_out
     MPS: gt_tensor_out_mps
     QuantizedCPU: gt_out_quantized_cpu
+  tags: pointwise
 
 - func: gt.Tensor(Tensor self, Tensor other) -> Tensor
   structured_delegate: gt.Tensor_out
@@ -8095,6 +8357,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: gt_quantized_cpu
+  tags: pointwise
 
 - func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: gt.Scalar_out
@@ -8131,6 +8394,7 @@
     CPU, CUDA: lt_Scalar_out
     MPS: lt_scalar_out_mps
     QuantizedCPU: lt_out_quantized_cpu
+  tags: pointwise
 
 - func: lt.Scalar(Tensor self, Scalar other) -> Tensor
   structured_delegate: lt.Scalar_out
@@ -8138,7 +8402,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: lt_quantized_cpu
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8148,6 +8412,7 @@
     CPU, CUDA: lt_Tensor_out
     MPS: lt_tensor_out_mps
     QuantizedCPU: lt_out_quantized_cpu
+  tags: pointwise
 
 - func: lt.Tensor(Tensor self, Tensor other) -> Tensor
   structured_delegate: lt.Tensor_out
@@ -8155,6 +8420,7 @@
   variants: method, function
   dispatch:
     QuantizedCPU: lt_quantized_cpu
+  tags: pointwise
 
 - func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   structured_delegate: lt.Scalar_out
@@ -8258,7 +8524,7 @@
   dispatch:
     CPU: nonzero_cpu
     CUDA: nonzero_cuda
-  tags: dynamic_output_shape, canonical
+  tags: [dynamic_output_shape, canonical]
 
 - func: nonzero_numpy(Tensor self) -> Tensor[]
   variants: method, function
@@ -8297,16 +8563,19 @@
   dispatch:
     CPU, CUDA: addcmul_out
     MPS: addcmul_out_mps
+  tags: pointwise
 
 - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
   structured_delegate: addcmul.out
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
   structured_delegate: addcmul.out
   device_check: NoCheck   # TensorIterator
   variants: method
+  tags: pointwise
 
 - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8315,16 +8584,19 @@
   dispatch:
     CPU, CUDA: addcdiv_out
     MPS: addcdiv_out_mps
+  tags: pointwise
 
 - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
   structured_delegate: addcdiv.out
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
   structured_delegate: addcdiv.out
   device_check: NoCheck   # TensorIterator
   variants: method
+  tags: pointwise
 
 - func: cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor
   python_module: nn
@@ -8507,16 +8779,19 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: lgamma_out
+  tags: pointwise
 
 - func: lgamma_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: lgamma.out
   variants: method
+  tags: pointwise
 
 - func: lgamma(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: lgamma.out
   variants: method, function
+  tags: pointwise
 
 - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8524,11 +8799,13 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: digamma_out
+  tags: pointwise
 
 - func: digamma(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: digamma.out
   variants: method, function
+  tags: pointwise
 
 - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8536,17 +8813,20 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: polygamma_out
+  tags: pointwise
 
 - func: polygamma(int n, Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: polygamma.out
   variants: method, function
+  tags: pointwise
 
 - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: polygamma_
+  tags: pointwise
 
 - func: erfinv(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -8555,6 +8835,7 @@
   dispatch:
     SparseCPU, SparseCUDA: erfinv_sparse
     SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr
+  tags: pointwise
 
 - func: erfinv_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8563,6 +8844,7 @@
   dispatch:
     SparseCPU, SparseCUDA: erfinv_sparse_
     SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_
+  tags: pointwise
 
 - func: erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8572,20 +8854,24 @@
     CPU, CUDA: erfinv_out
     SparseCPU, SparseCUDA: erfinv_sparse_out
     SparseCsrCPU, SparseCsrCUDA: erfinv_sparse_csr_out
+  tags: pointwise
 
 - func: i0(Tensor self) -> Tensor
   structured_delegate: i0.out
   variants: function, method
+  tags: pointwise
 
 - func: i0_(Tensor(a!) self) -> Tensor(a!)
   structured_delegate: i0.out
   variants: function, method
+  tags: pointwise
 
 - func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: i0_out
+  tags: pointwise
 
 - func: sign(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -8594,6 +8880,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sign_sparse
     SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr
+  tags: pointwise
 
 - func: sign_(Tensor(a!) self) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8602,6 +8889,7 @@
   dispatch:
     SparseCPU, SparseCUDA: sign_sparse_
     SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_
+  tags: pointwise
 
 - func: sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8612,6 +8900,7 @@
     MPS: sign_out_mps
     SparseCPU, SparseCUDA: sign_sparse_out
     SparseCsrCPU, SparseCsrCUDA: sign_sparse_csr_out
+  tags: pointwise
 
 - func: signbit(Tensor self) -> Tensor
   variants: function, method
@@ -8619,6 +8908,7 @@
   dispatch:
     SparseCPU, SparseCUDA: signbit_sparse
     SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr
+  tags: pointwise
 
 - func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8629,6 +8919,7 @@
     MPS: signbit_out_mps
     SparseCPU, SparseCUDA: signbit_sparse_out
     SparseCsrCPU, SparseCsrCUDA: signbit_sparse_csr_out
+  tags: pointwise
 
 - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -8644,18 +8935,21 @@
   dispatch:
     CPU, CUDA: atan2_out
     MPS: atan2_mps_out
+  tags: pointwise
 
 - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: atan2.out
   variants: method
+  tags: pointwise
 
 - func: atan2(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: atan2.out
   variants: method, function
-
+  tags: pointwise
 # arctan2, alias of atan2
+
 - func: arctan2(Tensor self, Tensor other) -> Tensor
   variants: method, function
 
@@ -8671,6 +8965,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: lerp_Scalar
+  tags: pointwise
 
 - func: lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8678,16 +8973,19 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: lerp_Tensor
+  tags: pointwise
 
 - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   structured_delegate: lerp.Scalar_out
+  tags: pointwise
 
 - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   structured_delegate: lerp.Tensor_out
+  tags: pointwise
 
 - func: histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -8743,18 +9041,21 @@
   device_check: NoCheck   # TensorIterator
   dispatch:
     CompositeExplicitAutograd: fmod_out
+  tags: pointwise
 
 - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
   dispatch:
     CompositeExplicitAutograd: fmod
+  tags: pointwise
 
 - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   dispatch:
     CompositeExplicitAutograd: fmod_
+  tags: pointwise
 
 - func: fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8762,87 +9063,104 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: fmod_out
+  tags: pointwise
 
 - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: fmod.Tensor_out
   variants: method, function
-
+  tags: pointwise
 
 - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
   structured_delegate: fmod.Tensor_out
+  tags: pointwise
 
 - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: hypot_out
+  tags: pointwise
 
 - func: hypot(Tensor self, Tensor other) -> Tensor
   structured_delegate: hypot.out
   variants: method, function
+  tags: pointwise
 
 - func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   structured_delegate: hypot.out
   variants: method
+  tags: pointwise
 
 - func: igamma.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: igamma_out
+  tags: pointwise
 
 - func: igamma(Tensor self, Tensor other) -> Tensor
   structured_delegate: igamma.out
   variants: method, function
+  tags: pointwise
 
 - func: igamma_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   structured_delegate: igamma.out
   variants: method
+  tags: pointwise
 
 - func: igammac.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: igammac_out
+  tags: pointwise
 
 - func: igammac(Tensor self, Tensor other) -> Tensor
   structured_delegate: igammac.out
   variants: method, function
+  tags: pointwise
 
 - func: igammac_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   structured_delegate: igammac.out
   variants: method
+  tags: pointwise
 
 - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: nextafter_out
+  tags: pointwise
 
 - func: nextafter(Tensor self, Tensor other) -> Tensor
   structured_delegate: nextafter.out
   variants: method, function
+  tags: pointwise
 
 - func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!)
   structured_delegate: nextafter.out
   variants: method
+  tags: pointwise
 
 - func: remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CompositeExplicitAutograd: remainder_out
+  tags: pointwise
 
 - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor
   variants: method, function
   dispatch:
     CompositeExplicitAutograd: remainder
+  tags: pointwise
 
 - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   variants: method
   dispatch:
     CompositeExplicitAutograd: remainder_
+  tags: pointwise
 
 - func: remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8850,16 +9168,19 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: remainder_out
+  tags: pointwise
 
 - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: remainder.Tensor_out
   variants: method, function
+  tags: pointwise
 
 - func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: remainder.Tensor_out
   variants: method
+  tags: pointwise
 
 - func: remainder.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -8867,6 +9188,7 @@
   dispatch:
     CPU, CUDA: remainder
   autogen: remainder.Scalar_Tensor_out
+  tags: pointwise
 
 - func: min(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -8887,6 +9209,7 @@
   structured_delegate: fmin.out
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8894,6 +9217,7 @@
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: fmin_out
+  tags: pointwise
 
 - func: max(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -8907,6 +9231,7 @@
   structured_delegate: fmax.out
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8914,12 +9239,13 @@
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: fmax_out
+  tags: pointwise
 
 - func: maximum(Tensor self, Tensor other) -> Tensor
   structured_delegate: maximum.out
   device_check: NoCheck   # TensorIterator
   variants: method, function
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8928,15 +9254,18 @@
   dispatch:
     CPU, CUDA: maximum_out
     MPS: maximum_out_mps
+  tags: pointwise
 
 # binary max, alias of maximum
 # NOTE: max is not an alias for maximum, since there is also unary max
 - func: max.other(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
+  tags: pointwise
 
 - func: max.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -8948,7 +9277,7 @@
   structured_delegate: minimum.out
   device_check: NoCheck   # TensorIterator
   variants: method, function
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -8957,15 +9286,18 @@
   dispatch:
     CPU, CUDA: minimum_out
     MPS: minimum_out_mps
+  tags: pointwise
 
 # binary min, alias for minimum
 # NOTE: min is not an alias for minimum, since there is also unary min
 - func: min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
+  tags: pointwise
 
 - func: min.other(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  tags: pointwise
 
 - func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
   variants: method, function
@@ -9108,7 +9440,7 @@
   autogen: unfold_backward.out
 
 - func: equal(Tensor self, Tensor other) -> bool
-  tags: data_dependent_output
+  tags: [data_dependent_output, pointwise]
   variants: method, function
   dispatch:
     CPU: cpu_equal
@@ -9123,21 +9455,25 @@
   dispatch:
     CPU, CUDA: pow_Tensor_Tensor_out
     MPS: pow_tensor_tensor_out_mps
+  tags: pointwise
 
 - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: pow.Tensor_Tensor_out
   variants: method, function
+  tags: pointwise
 
 - func: pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured: True
   dispatch:
     CPU, CUDA: pow_Scalar_out
+  tags: pointwise
 
 - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor
   device_check: NoCheck   # TensorIterator
   structured_delegate: pow.Scalar_out
+  tags: pointwise
 
 - func: pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -9147,6 +9483,7 @@
     CPU, CUDA: pow_Tensor_Scalar_out
     SparseCPU, SparseCUDA: pow_out_sparse_scalar
     MPS: pow_tensor_scalar_out_mps
+  tags: pointwise
 
 - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -9154,37 +9491,47 @@
   variants: function, method
   dispatch:
     SparseCPU, SparseCUDA: pow_sparse_scalar
-  tags: canonical
+  tags: [canonical, pointwise]
 
 - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: pow.Tensor_Scalar_out
   variants: method
+  tags: pointwise
 
 - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   structured_delegate: pow.Tensor_Tensor_out
   variants: method
+  tags: pointwise
 
 - func: float_power.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
 
 - func: float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
   variants: function, method
+  tags: pointwise
 
 - func: float_power.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
 
 - func: float_power.Scalar(Scalar self, Tensor exponent) -> Tensor
+  tags: pointwise
 
 - func: float_power.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
+  tags: pointwise
 
 - func: float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
   variants: function, method
+  tags: pointwise
 
 - func: float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
   variants: method
+  tags: pointwise
 
 - func: float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
   variants: method
+  tags: pointwise
 
 - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -11445,10 +11792,12 @@
   dispatch:
     CPU, CUDA: sigmoid_backward_out
     MPS: sigmoid_backward_out_mps
+  tags: pointwise
 
 - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor
   python_module: nn
   structured_delegate: sigmoid_backward.grad_input
+  tags: pointwise
 
 - func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -11456,10 +11805,12 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: logit_backward_out
+  tags: pointwise
 
 - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor
   python_module: nn
   structured_delegate: logit_backward.grad_input
+  tags: pointwise
 
 - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -11468,6 +11819,7 @@
   dispatch:
     CPU, CUDA: tanh_backward_out
     MPS: tanh_backward_out_mps
+  tags: pointwise
 
 - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor
   python_module: nn
@@ -11490,6 +11842,7 @@
 # one that is written in the native style: modern C++.  Algorithmically,
 # these are the same thing, but we give them different prefixes to
 # make the operational distinction clear.
+  tags: pointwise
 
 - func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
@@ -11648,6 +12001,7 @@
   dispatch:
     SparseCPU, SparseCUDA: isposinf_sparse
     SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr
+  tags: pointwise
 
 - func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -11656,6 +12010,7 @@
     CPU, CUDA: isposinf_out
     SparseCPU, SparseCUDA: isposinf_sparse_out
     SparseCsrCPU, SparseCsrCUDA: isposinf_sparse_csr_out
+  tags: pointwise
 
 - func: isneginf(Tensor self) -> Tensor
   variants: function, method
@@ -11663,6 +12018,7 @@
   dispatch:
     SparseCPU, SparseCUDA: isneginf_sparse
     SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr
+  tags: pointwise
 
 - func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -11671,6 +12027,7 @@
     CPU, CUDA: isneginf_out
     SparseCPU, SparseCUDA: isneginf_sparse_out
     SparseCsrCPU, SparseCsrCUDA: isneginf_sparse_csr_out
+  tags: pointwise
 
 # NOTE [_add_batch_dim and _remove_batch_dim]
 # _add_batch_dim and _remove_batch_dim are meant to be used in the implementation
@@ -11694,6 +12051,7 @@
   structured_delegate: special_entr.out
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_entr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -11702,11 +12060,13 @@
   variants: function
   dispatch:
     CPU, CUDA: special_entr_out
+  tags: pointwise
 
 - func: special_ndtri(Tensor self) -> Tensor
   structured_delegate: special_ndtri.out
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_ndtri.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -11715,11 +12075,13 @@
   variants: function
   dispatch:
     CPU, CUDA: special_ndtri_out
+  tags: pointwise
 
 - func: special_log_ndtr(Tensor self) -> Tensor
   structured_delegate: special_log_ndtr.out
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
@@ -11728,6 +12090,7 @@
   variants: function
   dispatch:
     CPU, CUDA: special_log_ndtr_out
+  tags: pointwise
 
 - func: special_expm1(Tensor self) -> Tensor
   python_module: special
@@ -11788,6 +12151,7 @@
   python_module: special
   variants: function
   structured_delegate: special_erfcx.out
+  tags: pointwise
 
 - func: special_erfcx.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   python_module: special
@@ -11795,6 +12159,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: special_erfcx_out
+  tags: pointwise
 
 - func: special_erfinv(Tensor self) -> Tensor
   python_module: special
@@ -11816,6 +12181,7 @@
   python_module: special
   variants: function
   structured_delegate: special_xlog1py.out
+  tags: pointwise
 
 - func: special_xlog1py.self_scalar(Scalar self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -11823,6 +12189,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutograd: special_xlog1py
+  tags: pointwise
 
 - func: special_xlog1py.other_scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -11830,6 +12197,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutograd: special_xlog1py
+  tags: pointwise
 
 - func: special_xlog1py.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -11839,6 +12207,7 @@
   variants: function
   dispatch:
     CPU, CUDA: special_xlog1py_out
+  tags: pointwise
 
 - func: special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -11846,6 +12215,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutograd: special_xlog1py_out
+  tags: pointwise
 
 - func: special_xlog1py.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -11853,6 +12223,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutograd: special_xlog1py_out
+  tags: pointwise
 
 - func: special_xlogy(Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -11889,6 +12260,7 @@
   python_module: special
   variants: function
   structured_delegate: special_zeta.out
+  tags: pointwise
 
 - func: special_zeta.self_scalar(Scalar self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -11896,6 +12268,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutograd: special_zeta
+  tags: pointwise
 
 - func: special_zeta.other_scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -11903,6 +12276,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutograd: special_zeta
+  tags: pointwise
 
 - func: special_zeta.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -11912,6 +12286,7 @@
   variants: function
   dispatch:
     CPU, CUDA: special_zeta_out
+  tags: pointwise
 
 - func: special_zeta.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -11919,6 +12294,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutograd: special_zeta_out
+  tags: pointwise
 
 - func: special_zeta.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
@@ -11926,6 +12302,7 @@
   variants: function
   dispatch:
     CompositeExplicitAutograd: special_zeta_out
+  tags: pointwise
 
 - func: special_i0(Tensor self) -> Tensor
   python_module: special
@@ -11939,6 +12316,7 @@
   python_module: special
   variants: function
   structured_delegate: special_i0e.out
+  tags: pointwise
 
 - func: special_i0e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   python_module: special
@@ -11946,11 +12324,13 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: special_i0e_out
+  tags: pointwise
 
 - func: special_i1(Tensor self) -> Tensor
   python_module: special
   variants: function
   structured_delegate: special_i1.out
+  tags: pointwise
 
 - func: special_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   python_module: special
@@ -11958,11 +12338,13 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: special_i1_out
+  tags: pointwise
 
 - func: special_i1e(Tensor self) -> Tensor
   python_module: special
   variants: function
   structured_delegate: special_i1e.out
+  tags: pointwise
 
 - func: special_i1e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   python_module: special
@@ -11970,6 +12352,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: special_i1e_out
+  tags: pointwise
 
 - func: special_logit(Tensor self, float? eps=None) -> Tensor
   python_module: special
@@ -13364,6 +13747,7 @@
   python_module: special
   structured_delegate: special_airy_ai.out
   variants: function
+  tags: pointwise
 
 - func: special_airy_ai.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13372,6 +13756,7 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: _transformer_decoder_only_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, Tensor? incr_key=None, Tensor? incr_value=None) -> (Tensor, Tensor, Tensor)
   variants: function
@@ -13389,6 +13774,7 @@
   python_module: special
   structured_delegate: special_bessel_j0.out
   variants: function
+  tags: pointwise
 
 - func: special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13397,11 +13783,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_bessel_j1(Tensor self) -> Tensor
   python_module: special
   structured_delegate: special_bessel_j1.out
   variants: function
+  tags: pointwise
 
 - func: special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13410,11 +13798,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_bessel_y0(Tensor self) -> Tensor
   python_module: special
   structured_delegate: special_bessel_y0.out
   variants: function
+  tags: pointwise
 
 - func: special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13423,11 +13813,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_bessel_y1(Tensor self) -> Tensor
   python_module: special
   structured_delegate: special_bessel_y1.out
   variants: function
+  tags: pointwise
 
 - func: special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13436,22 +13828,26 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_chebyshev_polynomial_t.out
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13461,11 +13857,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13473,22 +13871,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_chebyshev_polynomial_u.out
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13498,11 +13900,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13510,22 +13914,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_chebyshev_polynomial_v.out
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13535,11 +13943,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13547,22 +13957,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_chebyshev_polynomial_w.out
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13572,11 +13986,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13584,22 +14000,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_h(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_hermite_polynomial_h.out
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_h.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_h.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13609,11 +14029,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_h.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_h.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13621,22 +14043,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_he(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_hermite_polynomial_he.out
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_he.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_he.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13646,11 +14072,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_he.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_hermite_polynomial_he.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13658,22 +14086,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_laguerre_polynomial_l(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_laguerre_polynomial_l.out
   variants: function
+  tags: pointwise
 
 - func: special_laguerre_polynomial_l.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_laguerre_polynomial_l.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_laguerre_polynomial_l.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13683,11 +14115,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_laguerre_polynomial_l.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_laguerre_polynomial_l.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13695,22 +14129,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_legendre_polynomial_p(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_legendre_polynomial_p.out
   variants: function
+  tags: pointwise
 
 - func: special_legendre_polynomial_p.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_legendre_polynomial_p.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_legendre_polynomial_p.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13720,11 +14158,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_legendre_polynomial_p.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_legendre_polynomial_p.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13732,11 +14172,13 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_modified_bessel_i0(Tensor self) -> Tensor
   python_module: special
   structured_delegate: special_modified_bessel_i0.out
   variants: function
+  tags: pointwise
 
 - func: special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13745,11 +14187,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_modified_bessel_i1(Tensor self) -> Tensor
   python_module: special
   structured_delegate: special_modified_bessel_i1.out
   variants: function
+  tags: pointwise
 
 - func: special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13758,11 +14202,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_modified_bessel_k0(Tensor self) -> Tensor
   python_module: special
   structured_delegate: special_modified_bessel_k0.out
   variants: function
+  tags: pointwise
 
 - func: special_modified_bessel_k0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13771,11 +14217,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_modified_bessel_k1(Tensor self) -> Tensor
   python_module: special
   structured_delegate: special_modified_bessel_k1.out
   variants: function
+  tags: pointwise
 
 - func: special_modified_bessel_k1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13784,11 +14232,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_scaled_modified_bessel_k0(Tensor x) -> Tensor
   python_module: special
   structured_delegate: special_scaled_modified_bessel_k0.out
   variants: function
+  tags: pointwise
 
 - func: special_scaled_modified_bessel_k0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13797,11 +14247,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_scaled_modified_bessel_k1(Tensor x) -> Tensor
   python_module: special
   structured_delegate: special_scaled_modified_bessel_k1.out
   variants: function
+  tags: pointwise
 
 - func: special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13810,22 +14262,26 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_shifted_chebyshev_polynomial_t.out
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13835,11 +14291,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13847,22 +14305,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_shifted_chebyshev_polynomial_u.out
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13872,11 +14334,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13884,22 +14348,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_shifted_chebyshev_polynomial_v.out
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13909,11 +14377,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13921,22 +14391,26 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   structured_delegate: special_shifted_chebyshev_polynomial_w.out
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
@@ -13946,11 +14420,13 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_shifted_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13958,11 +14434,13 @@
   device_check: NoCheck
   python_module: special
   variants: function
+  tags: pointwise
 
 - func: special_spherical_bessel_j0(Tensor x) -> Tensor
   python_module: special
   structured_delegate: special_spherical_bessel_j0.out
   variants: function
+  tags: pointwise
 
 - func: special_spherical_bessel_j0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
@@ -13971,6 +14449,7 @@
   structured_inherits: TensorIteratorBase
   structured: True
   variants: function
+  tags: pointwise
 
 # Aux function used in the test TestPythonDispatch.test_kwarg_only_and_positional_default
 # within test/test_python_dispatch.py
diff --git a/aten/src/ATen/native/tags.yaml b/aten/src/ATen/native/tags.yaml
index 5d2a69db016fd..ce75b0ae10c63 100644
--- a/aten/src/ATen/native/tags.yaml
+++ b/aten/src/ATen/native/tags.yaml
@@ -40,3 +40,8 @@
           type promotion and boardcasting ops.
           Canonical aten ops is also effectively the opset produced by torchdynamo.export(aten_graph=True),
           and thus can be used as an opset for export purpose.
+- tag: pointwise
+  desc: |
+          Pointwise operators are operators where each element of the output is computed only by accessing
+          the corresponding element of all the broadcasted inputs. The output shape will be the broadcasted
+          shape of the inputs.
diff --git a/test/test_ops.py b/test/test_ops.py
index 62d44030cbff0..b78d2d8e096e9 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -7,11 +7,13 @@
 import itertools
 import torch
 import contextlib
+import re
+import os
+
 from collections import defaultdict
 from importlib import import_module
 from torch.utils._pytree import tree_map
 from typing import Dict
-
 from torch.testing import make_tensor
 from torch.testing._internal.common_dtype import (
     floating_and_complex_types_and,
@@ -150,6 +152,90 @@ def test_multiple_devices(self, devices, dtype, op):
                     "Skipped! Only supports single tensor or iterable of tensor outputs."
                 )
 
+    def test_pointwise_tag_coverage(self):
+
+        pytorch_dir = os.path.abspath(__file__ + "/../../")
+        files = [
+            "aten/src/ATen/native/UnaryOps.cpp",
+            "aten/src/ATen/native/BinaryOps.cpp",
+            "aten/src/ATen/native/PointwiseOps.cpp",
+            "aten/src/ATen/native/TensorCompare.cpp",
+        ]
+
+        allowed_functions = (
+            # reduction version of these operators
+            "aten.max.default",
+            "aten.max.dim",
+            "aten.max.dim_max",
+            "aten.max.names_dim",
+            "aten.max.names_dim_max",
+            "aten.max.unary_out",
+            "aten.min.default",
+            "aten.min.dim",
+            "aten.min.dim_min",
+            "aten.min.names_dim",
+            "aten.min.names_dim_min",
+            # not pointwise
+            "aten.isin.Tensor_Tensor",
+            "aten.isin.Tensor_Tensor_out",
+            "aten.isin.Tensor_Scalar",
+            "aten.isin.Tensor_Scalar_out",
+            "aten.isin.Scalar_Tensor",
+            "aten.isin.Scalar_Tensor_out",
+            "aten.mode.default",
+            "aten.mode.dimname",
+            "aten.mode.dimname_out",
+            "aten.mode.values",
+        )
+
+        regex = re.compile(r"DEFINE_DISPATCH\(.*_stub")
+
+        def get_opoverloadpacket_from_dispatch(kernel):
+            if hasattr(torch.ops.aten, kernel):
+                return kernel
+            if hasattr(torch.ops.aten, f"__{kernel}__"):
+                return f"__{kernel}__"
+            if hasattr(torch.ops.aten, f"special_{kernel}"):
+                return f"special_{kernel}"
+            if "_" in kernel:
+                kernel_split = kernel.split("_")
+                new_kernel = "_".join(kernel_split[:-1])
+                if hasattr(torch.ops.aten, new_kernel):
+                    return new_kernel
+
+            # could not find op from kernel dispatch string
+            self.assertTrue(False)
+
+        for file_name in files:
+            with open(os.path.join(pytorch_dir, file_name), "r") as f:
+                lines = f.read()
+                matches = regex.findall(lines)
+                for match in matches:
+                    kernel = match[len("DEFINE_DISPATCH("):-len("_stub")]
+
+                    # no op definition for it, but defined with DEFINE_DISPATCH ?
+                    if kernel == "trigamma":
+                        continue
+
+                    kernel = get_opoverloadpacket_from_dispatch(kernel)
+                    overloadpacket = getattr(torch.ops.aten, kernel)
+
+                    for overload_name in overloadpacket.overloads():
+                        overload = getattr(overloadpacket, overload_name)
+
+                        if not torch._C._dispatch_has_kernel(overload.name()):
+                            continue
+
+                        # TODO: tags are not propagated to generated overload,
+                        # and there's no way of specifying them
+                        if torch.Tag.generated in overload.tags:
+                            continue
+
+                        if str(overload) in allowed_functions:
+                            continue
+
+                        self.assertTrue(torch.Tag.pointwise in overload.tags)
+
     # Tests that the function and its (ndarray-accepting) reference produce the same
     #   values on the tensors from sample_inputs func for the corresponding op.
     # This test runs in double and complex double precision because
@@ -1092,6 +1178,7 @@ def test_complex_half_reference_testing(self, device, dtype, op):
             # `cfloat` input -> `float` output
             self.assertEqual(actual, expected, exact_dtype=False)
 
+
     @ops(op_db, allowed_dtypes=(torch.bool,))
     @unittest.skipIf(TEST_WITH_UBSAN, "Test uses undefined behavior")
     @skipIfTorchInductor("Inductor does not support view with dtype yet")
@@ -1973,6 +2060,59 @@ def map_to_fake(e):
             except torch._subclasses.fake_tensor.DataDependentOutputException:
                 self.assertTrue(name in data_dependent_op_tests)
 
+    @ops(op_db, dtypes=OpDTypes.any_one)
+    def test_pointwise_ops(self, device, dtype, op):
+        name = op.name
+        if op.variant_test_name:
+            name += "." + op.variant_test_name
+        if name in fake_skips or "sparse" in name or "jiterator" in name:
+            self.skipTest("Skip failing test")
+
+        test_self = self
+
+        class TestPointwiseMode(TorchDispatchMode):
+            def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+                kwargs = kwargs or {}
+
+                out = func(*args, **kwargs)
+
+                if torch.Tag.pointwise in func.tags:
+                    shapes = []
+                    for inp in tree_flatten((args, kwargs)):
+                        if isinstance(inp, torch.Tensor):
+                            shapes.append(inp.shape)
+
+                    out_shape = torch._refs._broadcast_shapes(*shapes)
+
+                    for out_elem in tree_flatten(out):
+                        if isinstance(out_elem, torch.Tensor):
+                            test_self.assertEqual(out_elem.shape, out_shape)
+
+                return out
+
+        samples = op.sample_inputs(device, dtype, requires_grad=False)
+        for sample in samples:
+            mode = FakeTensorMode(throw_on_data_dependent_ops=True)
+
+            def map_to_fake(e):
+                if isinstance(e, torch.Tensor):
+                    return mode.from_tensor(e)
+                else:
+                    return e
+
+            input = tree_map(map_to_fake, sample.input)
+            args = tree_map(map_to_fake, sample.args)
+            kwargs = tree_map(map_to_fake, sample.kwargs)
+
+            try:
+                op(input, *args, **kwargs)
+            except Exception as e:
+                continue
+
+            with TestPointwiseMode():
+                with mode:
+                    op(input, *args, **kwargs)
+
     @ops(op_db, dtypes=OpDTypes.any_one)
     def test_fake(self, device, dtype, op):
         self._test_fake_helper(device, dtype, op, contextlib.nullcontext)
diff --git a/torch/_functorch/partitioners.py b/torch/_functorch/partitioners.py
index 4e3b7a80c1b02..bcbaaca7b0ef8 100644
--- a/torch/_functorch/partitioners.py
+++ b/torch/_functorch/partitioners.py
@@ -11,6 +11,7 @@
 from typing import Tuple
 from .compile_utils import fx_graph_cse, get_aten_target
 from . import config
+import functools
 
 AOT_PARTITIONER_DEBUG = config.debug_partitioner
 
@@ -246,6 +247,24 @@ def _count_ops(graph):
     print(sorted(cnt.items(), key=lambda x: x[1], reverse=True))
 
 
+@functools.lru_cache(None)
+def pointwise_ops():
+    ops = []
+    for attr_name in dir(torch.ops.aten):
+        opoverloadpacket = getattr(torch.ops.aten, attr_name)
+        if not isinstance(opoverloadpacket, torch._ops.OpOverloadPacket):
+            continue
+
+        for overload in opoverloadpacket.overloads():
+            op_overload = getattr(opoverloadpacket, overload)
+            if torch.Tag.pointwise in op_overload.tags:
+                # currently aot autograd uses packet not overload
+                ops.append(opoverloadpacket)
+                break
+
+    return ops
+
+
 def min_cut_rematerialization_partition(
     joint_module: fx.GraphModule, _joint_inputs, compiler="nvfuser", recomputable_ops=None,
     *, num_fwd_outputs
@@ -343,10 +362,13 @@ def is_tensor_node(x):
     # compiler == "nvfuser" is the default set of recomputable ops
     default_recomputable_ops = [aten.add, aten.sub, aten.div, aten.atan2, aten.mul, aten.max, aten.min, aten.pow, aten.remainder, aten.fmod, aten.__and__, aten.__or__, aten.__xor__, aten.__lshift__, aten.__rshift__, aten.eq, aten.ne, aten.ge, aten.gt, aten.le, aten.lt, aten.abs, aten.bitwise_not, aten.ceil, aten.floor, aten.frac, aten.neg, aten.relu, aten.round, aten.silu, aten.trunc, aten.log, aten.log10, aten.log1p, aten.log2, aten.lgamma, aten.exp, aten.expm1, aten.erf, aten.erfc, aten.cos, aten.acos, aten.cosh, aten.sin, aten.asin, aten.sinh, aten.tan, aten.atan, aten.tanh, aten.atanh, aten.sqrt, aten.rsqrt, aten.reciprocal, aten.sigmoid, aten.softplus, aten.threshold, aten.threshold_backward, aten.clamp, aten.where, aten.lerp, aten.addcmul, aten.gelu, aten.gelu_backward, aten.alias, aten.sum, aten.mean, aten._grad_sum_to_size, aten.sum_to_size, aten.amax, aten.to, aten.type_as, operator.getitem, aten.squeeze, aten.unsqueeze, aten.rsub, aten._to_copy]  # noqa: E501
     if compiler == "inductor":
-        default_recomputable_ops += [prims.div, prims.convert_element_type, aten.sign, aten.clone, aten._to_copy, aten.full_like, prims.var, prims.sum, aten.var, aten.std, prims.broadcast_in_dim, aten.select, aten.permute, aten._unsafe_view, aten.view, aten.expand, aten.slice, aten.reshape, aten.broadcast_tensors, aten.scalar_tensor, aten.ones, aten.new_zeros, aten.lift_fresh_copy, aten.minimum, aten.arange, aten.bitwise_and, aten.triu, aten.var_mean, aten.isinf, aten.any, aten.isnan, aten.full, aten.as_strided, aten.zeros, aten.argmax, aten.maximum, aten.bitwise_or, aten.logical_and, aten.logical_or]  # noqa: E501
+        default_recomputable_ops += [prims.div, prims.convert_element_type, aten.clone, aten._to_copy, aten.full_like, prims.var, prims.sum, aten.var, aten.std, prims.broadcast_in_dim, aten.select, aten.permute, aten._unsafe_view, aten.view, aten.expand, aten.slice, aten.reshape, aten.broadcast_tensors, aten.scalar_tensor, aten.ones, aten.new_zeros, aten.lift_fresh_copy, aten.arange, aten.triu, aten.var_mean, aten.isinf, aten.any, aten.full, aten.as_strided, aten.zeros, aten.argmax, aten.maximum]  # noqa: E501
         # Natalia said that we should allow recomputing indexing :)
         default_recomputable_ops += [aten.index]
 
+        # add more generally ?
+        default_recomputable_ops += pointwise_ops()
+
     recomputable_ops = set(recomputable_ops) if recomputable_ops is not None else set(default_recomputable_ops)
 
     random_ops = [aten.native_dropout, aten.rand_like, aten.randn_like]
diff --git a/torchgen/model.py b/torchgen/model.py
index 831cf3c443600..fa9cdde8fd6fa 100644
--- a/torchgen/model.py
+++ b/torchgen/model.py
@@ -614,18 +614,19 @@ def from_yaml(
         assert precomputed_dict is None or structured is True
         precomputed = Precompute.parse(precomputed_dict) if precomputed_dict else None
 
-        tags_s = e.pop("tags", "")
-        assert isinstance(tags_s, str)
+        tags_inp = e.pop("tags", [])
+        if isinstance(tags_inp, str):
+            tags_inp = [tags_inp]
+        assert isinstance(tags_inp, list)
+
         tags: Set[str] = set()
-        if len(tags_s) > 0:
+        for t in tags_inp:
             assert len(valid_tags) > 0
-            for t in tags_s.split(", "):
-                # TODO: verify that the tag is valid and has an entry in tags.yaml
-                if t in valid_tags:
-                    tags.add(t)
-                else:
-                    raise AssertionError(f"illegal tag {t}")
-        assert isinstance(tags, set)
+            # TODO: verify that the tag is valid and has an entry in tags.yaml
+            if t in valid_tags:
+                tags.add(t)
+            else:
+                raise AssertionError(f"illegal tag {t}")
 
         from torchgen.api import cpp
 

From 83545cf61f13f5f9b566f0a0e25c6551d9eb2b28 Mon Sep 17 00:00:00 2001
From: Zheng Yan <zyan@meta.com>
Date: Thu, 8 Dec 2022 20:21:58 +0000
Subject: [PATCH 1742/1922] avoid fork in torch/__init__.py for deploy/multipy
 (#90492)

Summary:
We should not fork in deploy when initializing torch.

    Traceback (most recent call last):
    File "<string>", line 38, in <module>
    File "<string>", line 36, in __run
    File "/usr/local/fbcode/platform010/lib/python3.8/runpy.py", line 194, in _run_module_as_main
        return _run_code(code, main_globals, None,
    File "/usr/local/fbcode/platform010/lib/python3.8/runpy.py", line 87, in _run_code
        exec(code, run_globals)
    File "/data/users/zyan/fbsource/buck-out/v2/gen/fbcode/104a4d5c3a690252/multipy/runtime/__test_py__/test_py#link-tree/multipy/runtime/test_py.py", line 61, in <module>
        import torch # has to be done serially otherwise things will segfault
    File "/data/users/zyan/fbsource/buck-out/v2/gen/fbcode/104a4d5c3a690252/multipy/runtime/__test_py__/test_py#link-tree/torch/__init__.py", line 158, in <module>
        platform.system() != 'Windows':
    File "/usr/local/fbcode/platform010/lib/python3.8/platform.py", line 891, in system
        return uname().system
    File "/usr/local/fbcode/platform010/lib/python3.8/platform.py", line 857, in uname
        processor = _syscmd_uname('-p', '')
    File "/usr/local/fbcode/platform010/lib/python3.8/platform.py", line 613, in _syscmd_uname
        output = subprocess.check_output(('uname', option),

Test Plan: override a local script run trigger init and set `subprocess.check_output` to None

Reviewed By: yinghai, houseroad

Differential Revision: D41848592

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90492
Approved by: https://github.com/PaliC
---
 torch/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch/__init__.py b/torch/__init__.py
index bd0bfa59d5919..55e5da076c64e 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -162,7 +162,7 @@ def _preload_cuda_deps():
 
 # See Note [Global dependencies]
 def _load_global_deps():
-    if platform.system() == 'Windows' or sys.executable == 'torch_deploy':
+    if sys.executable == 'torch_deploy' or platform.system() == 'Windows':
         return
 
     lib_name = 'libtorch_global_deps' + ('.dylib' if platform.system() == 'Darwin' else '.so')
@@ -181,7 +181,7 @@ def _load_global_deps():
 
 
 if (USE_RTLD_GLOBAL_WITH_LIBTORCH or os.getenv('TORCH_USE_RTLD_GLOBAL')) and \
-        platform.system() != 'Windows':
+        (sys.executable == "torch_deploy" or platform.system() != 'Windows'):
     # Do it the hard way.  You might want to load libtorch with RTLD_GLOBAL in a
     # few circumstances:
     #
@@ -956,7 +956,7 @@ def _dtype(self):
 ################################################################################
 
 def manager_path():
-    if platform.system() == 'Windows' or sys.executable == 'torch_deploy':
+    if sys.executable == 'torch_deploy' or platform.system() == 'Windows':
         return b""
     path = get_file_path('torch', 'bin', 'torch_shm_manager')
     prepare_multiprocessing_environment(get_file_path('torch'))

From 08e4b5636351236a8e8f57a4945dff5e6f25c552 Mon Sep 17 00:00:00 2001
From: Stephen Macke <smacke@meta.com>
Date: Thu, 8 Dec 2022 20:24:45 +0000
Subject: [PATCH 1743/1922] [rfc][pkg] check spec for module source before
 falling back to file in package exporter (#90258)

Summary: To get source for a particular module, the "correct" thing to do is to check the module's spec and use `get_source` if it's a SourceFileLoader, since subclasses may look elsewhere than the `__file__`, and the spec will give the source of truth. For torch packager, however, we prefer to use linecache, but the loader could still change the file, so we figure out the file for the module using the spec's loader rather than using `module.__file__`, if possible.

Test Plan: This code path will get exercised by CI. Also added a test for remapped files.

Differential Revision: D41412983

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90258
Approved by: https://github.com/PaliC
---
 test/package/module_a_remapped_path.py |  1 +
 test/package/test_misc.py              | 56 ++++++++++++++++++++++++++
 torch/package/package_exporter.py      | 26 +++++++-----
 3 files changed, 72 insertions(+), 11 deletions(-)
 create mode 100644 test/package/module_a_remapped_path.py

diff --git a/test/package/module_a_remapped_path.py b/test/package/module_a_remapped_path.py
new file mode 100644
index 0000000000000..793ddd4296885
--- /dev/null
+++ b/test/package/module_a_remapped_path.py
@@ -0,0 +1 @@
+result = "module_a_remapped_path"
diff --git a/test/package/test_misc.py b/test/package/test_misc.py
index c29602d8e360b..908e8d29992c3 100644
--- a/test/package/test_misc.py
+++ b/test/package/test_misc.py
@@ -2,7 +2,9 @@
 # Owner(s): ["oncall: package/deploy"]
 
 import inspect
+import os
 import platform
+import sys
 from io import BytesIO
 from pathlib import Path
 from textwrap import dedent
@@ -104,6 +106,60 @@ def test_file_structure(self):
             import_exclude,
         )
 
+    def test_loaders_that_remap_files_work_ok(self):
+        from importlib.abc import MetaPathFinder
+        from importlib.machinery import SourceFileLoader
+        from importlib.util import spec_from_loader
+
+        class LoaderThatRemapsModuleA(SourceFileLoader):
+            def get_filename(self, name):
+                result = super().get_filename(name)
+                if name == "module_a":
+                    return os.path.join(os.path.dirname(result), "module_a_remapped_path.py")
+                else:
+                    return result
+
+        class FinderThatRemapsModuleA(MetaPathFinder):
+            def find_spec(self, fullname, path, target):
+                """Try to find the original spec for module_a using all the
+                remaining meta_path finders."""
+                if fullname != "module_a":
+                    return None
+                spec = None
+                for finder in sys.meta_path:
+                    if finder is self:
+                        continue
+                    if hasattr(finder, "find_spec"):
+                        spec = finder.find_spec(fullname, path, target=target)
+                    elif hasattr(finder, "load_module"):
+                        spec = spec_from_loader(fullname, finder)
+                    if spec is not None:
+                        break
+                assert spec is not None and isinstance(spec.loader, SourceFileLoader)
+                spec.loader = LoaderThatRemapsModuleA(spec.loader.name, spec.loader.path)
+                return spec
+
+        sys.meta_path.insert(0, FinderThatRemapsModuleA())
+        # clear it from sys.modules so that we use the custom finder next time
+        # it gets imported
+        sys.modules.pop("module_a", None)
+        try:
+            buffer = BytesIO()
+            with PackageExporter(buffer) as he:
+                import module_a
+
+                he.intern("**")
+                he.save_module(module_a.__name__)
+
+
+            buffer.seek(0)
+            hi = PackageImporter(buffer)
+            self.assertTrue("remapped_path" in hi.get_source("module_a"))
+        finally:
+            # pop it again to ensure it does not mess up other tests
+            sys.modules.pop("module_a", None)
+            sys.meta_path.pop(0)
+
     def test_python_version(self):
         """
         Tests that the current python version is stored in the package and is available
diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py
index 55cdf7f1baf2b..47318c312dbc6 100644
--- a/torch/package/package_exporter.py
+++ b/torch/package/package_exporter.py
@@ -8,6 +8,7 @@
 from collections import defaultdict, OrderedDict
 from dataclasses import dataclass
 from enum import Enum
+from importlib.machinery import SourceFileLoader
 from pathlib import Path
 from typing import (
     Any,
@@ -422,17 +423,20 @@ def _module_exists(self, module_name: str) -> bool:
             return False
 
     def _get_source_of_module(self, module: types.ModuleType) -> Optional[str]:
-        filename = getattr(module, "__file__", None)
-        result = (
-            None
-            if filename is None or not filename.endswith(".py")
-            else linecache.getlines(filename, module.__dict__)
-        )
-
-        if result is None:
-            return None
-
-        return "".join(result)
+        filename = None
+        spec = getattr(module, "__spec__", None)
+        if spec is not None:
+            loader = getattr(spec, "loader", None)
+            if loader is not None and isinstance(loader, SourceFileLoader):
+                try:
+                    filename = loader.get_filename(module.__name__)
+                except ImportError:
+                    pass
+        if filename is None:
+            filename = getattr(module, "__file__", None)
+        if isinstance(filename, str) and filename.endswith(".py"):
+            return "".join(linecache.getlines(filename, module.__dict__))
+        return None
 
     def add_dependency(self, module_name: str, dependencies=True):
         """Given a module, add it to the dependency graph according to patterns

From a1b6433c23f7b3d8c0bbc487fadbd8a440cd6db1 Mon Sep 17 00:00:00 2001
From: Michael Lazos <mlazos@fb.com>
Date: Thu, 8 Dec 2022 21:19:23 +0000
Subject: [PATCH 1744/1922] [dynamo] Rewrite inplace addcdiv and inplace add
 (#90330)

Rewrite inplace addcdiv to a div, mul and inplace add to avoid graph break
Rewrite inplace add to a mul and inplace add to avoid graph break

Needed to close optimizer graph breaks

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90330
Approved by: https://github.com/jansel
---
 test/dynamo/test_functions.py     | 10 ++++++++++
 test/dynamo/test_optimizers.py    |  6 ------
 torch/_dynamo/variables/tensor.py | 20 +++++++++++++++++++-
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py
index 652b68921fc07..4b84d2ca3b71c 100644
--- a/test/dynamo/test_functions.py
+++ b/test/dynamo/test_functions.py
@@ -68,12 +68,22 @@ def test_inline_jit_annotations(x):
     def test_add(a, b):
         return a + b
 
+    @make_test
+    def test_add_(a, b):
+        a_copy = torch.tensor(a)
+        return a_copy.add_(b, alpha=5.0)
+
     @make_test
     def test_addcdiv(a, b, c):
         # dynamo decomposes this to avoid a graph break when
         # the value kwarg is populated
         return torch.addcdiv(a, b, c, value=5.0)
 
+    @make_test
+    def test_addcdiv_(a, b, c):
+        a_copy = torch.tensor(a)
+        return a_copy.addcdiv_(b, c, value=5.0)
+
     @make_test
     def test_is_not_null(a, b):
         if a is not None and b is not None:
diff --git a/test/dynamo/test_optimizers.py b/test/dynamo/test_optimizers.py
index 0036b39c622d2..c8b9ee663641e 100644
--- a/test/dynamo/test_optimizers.py
+++ b/test/dynamo/test_optimizers.py
@@ -85,16 +85,13 @@ def setUpClass(cls):
 
     # These optimizers are disabled until we remove item() calls
     test_adam = make_test(torch.optim.Adam, exp_graph_count=0)
-    test_adamax = make_test(torch.optim.Adamax, exp_graph_count=0)
     test_adamw = make_test(torch.optim.AdamW, exp_graph_count=0)
 
     # RAdam and Adagrad have data-dependent control which breaks the graph;
     # furthermore, the break is inside a for loop, so we bail on the frame
     # entirely.  This is basically an xfail; if the frame count goes up
     # you done good
-    test_nadam = make_test(torch.optim.NAdam, exp_graph_count=0)
     test_radam = make_test(torch.optim.RAdam, exp_graph_count=0)
-    test_adagrad = make_test(torch.optim.Adagrad, exp_graph_count=0)
 
     # ASGD has a small optimization that avoids averaging
     # This will fully capture the graph once that optimization is removed
@@ -111,11 +108,8 @@ def setUpClass(cls):
         "SparseAdam",  # Unsupported
         "LBFGS",  # Unsupported
         "Adam",  # Disabled pending item call removal
-        "Adamax",  # Disabled pending item call removal
         "AdamW",  # Disabled pending item call removal
         "RAdam",  # Disabled pending item call removal
-        "NAdam",  # Disabled pending item call removal
-        "Adagrad",  # Disabled pending item call removal
         "ASGD",
     ]
 )
diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py
index c161be41007ac..f1b30f9212423 100644
--- a/torch/_dynamo/variables/tensor.py
+++ b/torch/_dynamo/variables/tensor.py
@@ -188,7 +188,7 @@ def call_method(
         args: "List[VariableTracker]",
         kwargs: "Dict[str, VariableTracker]",
     ) -> "VariableTracker":
-        from . import ConstantVariable, TupleVariable
+        from . import ConstantVariable, TorchVariable, TupleVariable
         from .builder import wrap_fx_proxy
 
         kwargs = dict(kwargs)
@@ -337,6 +337,24 @@ def call_method(
                 ),
                 **options,
             )
+        elif (
+            name == "add_" and len(args) == 1 and len(kwargs) == 1 and "alpha" in kwargs
+        ):
+            result = TorchVariable(torch.mul, **options).call_function(
+                tx, args + [kwargs["alpha"]], {}
+            )
+            return self.call_method(tx, "add_", [result], {})
+        elif (
+            name == "addcdiv_"
+            and len(args) == 2
+            and len(kwargs) == 1
+            and "value" in kwargs
+        ):
+            result = TorchVariable(torch.div, **options).call_function(tx, args, {})
+            result = TorchVariable(torch.mul, **options).call_function(
+                tx, [result, kwargs["value"]], {}
+            )
+            return self.call_method(tx, "add_", [result], {})
         else:
             # Convert x.new(torch.Size) into x.new_empty(torch.Size),
             # as Tensor.new acts differently with a Size input versus a tuple input.

From dda43621fb5471e33ae6d3fe15c700904a95b413 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Thu, 8 Dec 2022 21:41:15 +0000
Subject: [PATCH 1745/1922] Track torch.compile calls (#90310)

Title.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90310
Approved by: https://github.com/colin2328, https://github.com/anijain2305
---
 torch/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/__init__.py b/torch/__init__.py
index 55e5da076c64e..c8543057c7474 100644
--- a/torch/__init__.py
+++ b/torch/__init__.py
@@ -1173,6 +1173,7 @@ def foo(x):
             return torch.sin(x) + torch.cos(x)
 
     """
+    _C._log_api_usage_once("torch.compile")
     # Decorator mode
     if model is None:
         def fn(model: Callable):

From 8f685d8f23f7e0b5631bd3bc35d9f0cd652a38d2 Mon Sep 17 00:00:00 2001
From: clee2000 <44682903+clee2000@users.noreply.github.com>
Date: Thu, 8 Dec 2022 22:03:24 +0000
Subject: [PATCH 1746/1922] Don't update vision hash on push (#90498)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90498
Approved by: https://github.com/malfet, https://github.com/seemethere
---
 .github/workflows/nightly.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index a8de37ca85be2..5c1de3dac5479 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -38,6 +38,7 @@ jobs:
 
   update-vision-commit-hash:
     uses: ./.github/workflows/_update-commit-hash.yml
+    if: ${{ github.event_name == 'schedule' }}
     with:
       repo-name: vision
       branch: main

From 98ab4c9f773712d2c6678a245a31515d055db218 Mon Sep 17 00:00:00 2001
From: Sergii Dymchenko <sdym@fb.com>
Date: Thu, 8 Dec 2022 22:28:15 +0000
Subject: [PATCH 1747/1922] Fix issue 38095 TODOs in test_quantized_tensor.py
 (#90344)

Fix TODOs related to https://github.com/pytorch/pytorch/issues/38095
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90344
Approved by: https://github.com/malfet
---
 test/quantization/core/test_quantized_tensor.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 241aab5da3237..98e21ab30f097 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -70,7 +70,7 @@ def _calculate_dynamic_qparams(X, dtype, reduce_range=False):
     return [scale.astype(np.float32), int(nudged_zero_point)]
 
 # Note we explicitly cast variables to np.float32 in a couple of places to avoid
-# the default casting in Python often resuling in double precision and to make
+# the default casting in Python often resulting in double precision and to make
 # sure we're doing the same numerics as C++ code.
 def param_search_greedy(x, bit_rate, n_bins=200, ratio=0.16):
     xmin, xmax = np.min(x), np.max(x)
@@ -443,8 +443,7 @@ def _test_per_channel_qtensor_creation(self, device):
         for dtype, zero_points in itertools.product([torch.qint8, torch.quint8], [zero_points_float, zero_points_int]):
             q = torch._empty_per_channel_affine_quantized(
                 [numel], scales=scales, zero_points=zero_points, axis=ch_axis, dtype=dtype, device=device)
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(scales, q.q_per_channel_scales())
+            self.assertEqual(scales, q.q_per_channel_scales(), exact_dtype=False)
             self.assertEqual(zero_points, q.q_per_channel_zero_points())
             self.assertEqual(ch_axis, q.q_per_channel_axis())
 
@@ -453,8 +452,7 @@ def _test_per_channel_qtensor_creation(self, device):
             int_tensor = torch.randint(0, 100, size=(numel,), dtype=torch.uint8, device=device)
             q = torch._make_per_channel_quantized_tensor(int_tensor, scales, zero_points, ch_axis)
             self.assertEqual(int_tensor, q.int_repr())
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(scales, q.q_per_channel_scales())
+            self.assertEqual(scales, q.q_per_channel_scales(), exact_dtype=False)
             self.assertEqual(zero_points, q.q_per_channel_zero_points())
             self.assertEqual(ch_axis, q.q_per_channel_axis())
 
@@ -809,8 +807,7 @@ def test_qtensor_per_channel_permute(self):
             self.assertEqual(qr.stride(), list(reversed(sorted(qr.stride()))))
             self.assertNotEqual(qlast.stride(), list(reversed(sorted(qlast.stride()))))
             self.assertEqual(qr.int_repr(), qlast.int_repr())
-            # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095
-            self.assertEqualIgnoreType(scales, qlast.q_per_channel_scales())
+            self.assertEqual(scales.to(dtype=torch.float64), qlast.q_per_channel_scales())
             self.assertEqual(zero_points, qlast.q_per_channel_zero_points())
             self.assertEqual(1, qlast.q_per_channel_axis())
             self.assertEqual(qlast.dequantize(), qr.dequantize())

From d2c3082b234557f8b7965a22a1983b5f8135e664 Mon Sep 17 00:00:00 2001
From: Rohan Varma <rvarm1@fb.com>
Date: Thu, 8 Dec 2022 22:23:15 +0000
Subject: [PATCH 1748/1922] [Ez] Omit HSDP Z2 from doc (#90503)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90503
Approved by: https://github.com/awgu
---
 torch/distributed/fsdp/fully_sharded_data_parallel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py
index de17d12fde470..13d7ab54d6f39 100644
--- a/torch/distributed/fsdp/fully_sharded_data_parallel.py
+++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -191,7 +191,7 @@ class FullyShardedDataParallel(nn.Module):
         process_group: Optional[Union[ProcessGroup, Tuple[ProcessGroup, ProcessGroup]]]
             This is the process group used for collective communications and
             the one over which the model is sharded. For hybrid sharding strategies such as
-            ``ShardingStrategy.HYBRID_SHARD`` or ``ShardingStrategy._HYBRID_SHARD_ZERO2``, users can
+            ``ShardingStrategy.HYBRID_SHARD`` users can
             pass in a tuple of process groups representing the groups to shard and replicate across,
             respectively.
         sharding_strategy (Optional[ShardingStrategy]):

From c2e27225a9bf7294cddc73c56ab7876bf690e555 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Wed, 7 Dec 2022 11:03:07 -0800
Subject: [PATCH 1749/1922] [20/N] Add recv_any_source custom op with CPU/CUDA
 implementations (#89505)

Differential Revision: [D41812671](https://our.internmc.facebook.com/intern/diff/D41812671)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89505
Approved by: https://github.com/kwen2501
---
 torch/csrc/distributed/c10d/Ops.cpp     | 24 ++++++++++++++++++++++++
 torch/csrc/distributed/c10d/Ops.hpp     |  5 +++++
 torch/csrc/distributed/c10d/OpsImpl.cpp | 24 ++++++++++++++++++++++++
 torch/csrc/distributed/c10d/init.cpp    |  6 +++++-
 4 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp
index 6b4717a8e1d11..d706ea9773787 100644
--- a/torch/csrc/distributed/c10d/Ops.cpp
+++ b/torch/csrc/distributed/c10d/Ops.cpp
@@ -212,6 +212,14 @@ c10::intrusive_ptr<Work> recv_(
       tensor_vec, static_cast<int>(srcRank), static_cast<int>(tag));
 }
 
+c10::intrusive_ptr<Work> recv_any_source_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t tag) {
+  auto tensor_vec = tensors.vec();
+  return process_group->recvAnysource(tensor_vec, static_cast<int>(tag));
+}
+
 TORCH_LIBRARY(c10d, m) {
   // The following ProcessGroup, Work, and ReduceOp definitions are more like
   // declarations. They don't expose the details of the two classes into
@@ -272,6 +280,9 @@ TORCH_LIBRARY(c10d, m) {
           c10::DispatchKey::CompositeExplicitAutograd, monitored_barrier_));
   m.def("send", dispatch(c10::DispatchKey::CompositeExplicitAutograd, send));
   m.def("recv_", dispatch(c10::DispatchKey::CompositeExplicitAutograd, recv_));
+  m.def(
+      "recv_any_source_",
+      dispatch(c10::DispatchKey::CompositeExplicitAutograd, recv_any_source_));
 }
 } // namespace
 
@@ -576,5 +587,18 @@ c10::intrusive_ptr<Work> recv(
   return op.call(tensors, process_group, srcRank, tag);
 }
 
+c10::intrusive_ptr<Work> recv_any_source(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::TensorList tensors,
+    int64_t tag) {
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("c10d::recv_any_source_", "")
+                       .typed<c10::intrusive_ptr<::c10d::Work>(
+                           at::TensorList,
+                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
+                           int64_t)>();
+  return op.call(tensors, process_group, tag);
+}
+
 } // namespace ops
 } // namespace c10d
diff --git a/torch/csrc/distributed/c10d/Ops.hpp b/torch/csrc/distributed/c10d/Ops.hpp
index b5426039f01e4..241bd902231eb 100644
--- a/torch/csrc/distributed/c10d/Ops.hpp
+++ b/torch/csrc/distributed/c10d/Ops.hpp
@@ -100,5 +100,10 @@ TORCH_API c10::intrusive_ptr<Work> recv(
     int64_t srcRank,
     int64_t tag);
 
+TORCH_API c10::intrusive_ptr<Work> recv_any_source(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::TensorList tensors,
+    int64_t tag);
+
 } // namespace ops
 } // namespace c10d
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index 31386695a132e..cc7ebcfd075cd 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -49,6 +49,22 @@ c10::intrusive_ptr<Work> recv_cuda_(
       tensor_vec, static_cast<int>(srcRank), static_cast<int>(tag));
 }
 
+c10::intrusive_ptr<Work> recv_any_source_cpu_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t tag) {
+  auto tensor_vec = tensors.vec();
+  return process_group->recvAnysource(tensor_vec, static_cast<int>(tag));
+}
+
+c10::intrusive_ptr<Work> recv_any_source_cuda_(
+    at::TensorList tensors,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    int64_t tag) {
+  auto tensor_vec = tensors.vec();
+  return process_group->recvAnysource(tensor_vec, static_cast<int>(tag));
+}
+
 c10::intrusive_ptr<Work> reduce_cpu_(
     at::TensorList tensors,
     const c10::intrusive_ptr<ProcessGroup>& process_group,
@@ -428,6 +444,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("recv_", recv_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("recv_any_source_", recv_any_source_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("recv_any_source_", recv_any_source_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("reduce_", reduce_cpu_);
 }
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 4f03233beff05..b876c8a7920a3 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1522,7 +1522,11 @@ that adds a prefix to each key inserted to the store.
 
           .def(
               "recv_anysource",
-              &::c10d::ProcessGroup::recvAnysource,
+              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
+                 const std::vector<at::Tensor>& tensors,
+                 int64_t tag) {
+                return ::c10d::ops::recv_any_source(self, tensors, tag);
+              },
               py::call_guard<py::gil_scoped_release>())
 
           .def(

From 6c735e75c9984a047e0931cdb49f1b709042e331 Mon Sep 17 00:00:00 2001
From: Howard Huang <howardhuang@fb.com>
Date: Wed, 7 Dec 2022 11:03:08 -0800
Subject: [PATCH 1750/1922] [21/N] Add alltoall_base custom op with CPU/CUDA
 implementations (#89813)

Differential Revision: [D41812670](https://our.internmc.facebook.com/intern/diff/D41812670)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89813
Approved by: https://github.com/kwen2501
---
 test/distributed/test_c10d_common.py    | 14 ++++++++
 test/distributed/test_c10d_gloo.py      |  4 +++
 test/distributed/test_c10d_nccl.py      |  5 +++
 torch/csrc/distributed/c10d/Ops.cpp     | 43 +++++++++++++++++++++++++
 torch/csrc/distributed/c10d/Ops.hpp     |  8 +++++
 torch/csrc/distributed/c10d/OpsImpl.cpp | 38 ++++++++++++++++++++++
 torch/csrc/distributed/c10d/init.cpp    | 22 ++++---------
 7 files changed, 119 insertions(+), 15 deletions(-)

diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 0dd8b42f1d9a1..962c12dcba9d7 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -1522,6 +1522,20 @@ def _test_allreduce_coalesced(self, backend):
         for tensor in tensors:
             self.assertEqual(tensor, torch.ones(10, 10) * self.world_size)
 
+    def _test_all_to_all_single(self, backend):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            backend,
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        device = "cuda" if backend == "nccl" else "cpu"
+        # test alltoall_base
+        input_tensor = torch.ones(2, 2, device=torch.device(device))
+        output_tensor = torch.zeros(2, 2, device=torch.device(device))
+        dist.all_to_all_single(output_tensor, input_tensor)
+
 class CompilerTest(MultiProcessTestCase):
     def setUp(self):
         super(CompilerTest, self).setUp()
diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py
index 289e5c117ace1..b26c9e9316f3e 100644
--- a/test/distributed/test_c10d_gloo.py
+++ b/test/distributed/test_c10d_gloo.py
@@ -2405,6 +2405,10 @@ def test_collectives(self):
     def test_allreduce_coalesced(self):
         self._test_allreduce_coalesced(backend="gloo")
 
+    @requires_gloo()
+    def test_all_to_all_single(self):
+        self._test_all_to_all_single(backend="gloo")
+
     @requires_gloo()
     def test_allgather_coalesced(self):
         store = dist.FileStore(self.file_name, self.world_size)
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 403ede860a782..ecea7c7811681 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -2948,6 +2948,11 @@ def test_collectives(self):
     def test_allreduce_coalesced(self):
         self._test_allreduce_coalesced(backend="nccl")
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(1)
+    def test_all_to_all_single(self):
+        self._test_all_to_all_single(backend="nccl")
+
     @requires_nccl()
     @skip_if_lt_x_gpu(1)
     def test_allgather_base(self):
diff --git a/torch/csrc/distributed/c10d/Ops.cpp b/torch/csrc/distributed/c10d/Ops.cpp
index d706ea9773787..52a8b17c290dc 100644
--- a/torch/csrc/distributed/c10d/Ops.cpp
+++ b/torch/csrc/distributed/c10d/Ops.cpp
@@ -173,6 +173,21 @@ c10::intrusive_ptr<Work> alltoall_(
       AllToAllOptions{std::chrono::milliseconds(timeout)});
 }
 
+c10::intrusive_ptr<Work> alltoall_base_(
+    at::Tensor& output,
+    at::Tensor& input,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    std::vector<int64_t> output_split_sizes,
+    std::vector<int64_t> input_split_sizes,
+    int64_t timeout) {
+  return process_group->alltoall_base(
+      output,
+      input,
+      output_split_sizes,
+      input_split_sizes,
+      AllToAllOptions{std::chrono::milliseconds(timeout)});
+}
+
 c10::intrusive_ptr<Work> barrier(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<int64_t>& device_ids,
@@ -271,6 +286,9 @@ TORCH_LIBRARY(c10d, m) {
   m.def(
       "alltoall_",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, alltoall_));
+  m.def(
+      "alltoall_base_",
+      dispatch(c10::DispatchKey::CompositeExplicitAutograd, alltoall_base_));
   m.def(
       "barrier",
       dispatch(c10::DispatchKey::CompositeExplicitAutograd, barrier));
@@ -523,6 +541,31 @@ c10::intrusive_ptr<Work> alltoall(
       output_tensors, input_tensors, process_group, opts.timeout.count());
 }
 
+c10::intrusive_ptr<Work> alltoall_base(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::Tensor& output,
+    at::Tensor& input,
+    std::vector<int64_t> output_split_sizes,
+    std::vector<int64_t> input_split_sizes,
+    const AllToAllOptions& opts) {
+  static auto op = c10::Dispatcher::singleton()
+                       .findSchemaOrThrow("c10d::alltoall_base_", "")
+                       .typed<c10::intrusive_ptr<::c10d::Work>(
+                           at::Tensor&,
+                           at::Tensor&,
+                           const c10::intrusive_ptr<::c10d::ProcessGroup>&,
+                           std::vector<int64_t>,
+                           std::vector<int64_t>,
+                           int64_t)>();
+  return op.call(
+      output,
+      input,
+      process_group,
+      output_split_sizes,
+      input_split_sizes,
+      opts.timeout.count());
+}
+
 void monitored_barrier(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const BarrierOptions& opts,
diff --git a/torch/csrc/distributed/c10d/Ops.hpp b/torch/csrc/distributed/c10d/Ops.hpp
index 241bd902231eb..db9006995c1a8 100644
--- a/torch/csrc/distributed/c10d/Ops.hpp
+++ b/torch/csrc/distributed/c10d/Ops.hpp
@@ -73,6 +73,14 @@ TORCH_API c10::intrusive_ptr<Work> scatter(
     const std::vector<std::vector<at::Tensor>>& input_tensors,
     const ScatterOptions& opts = {});
 
+TORCH_API c10::intrusive_ptr<Work> alltoall_base(
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    at::Tensor& output,
+    at::Tensor& input,
+    const std::vector<int64_t> outputSplitSizes,
+    const std::vector<int64_t> inputSplitSizes,
+    const AllToAllOptions& opts = {});
+
 TORCH_API c10::intrusive_ptr<Work> alltoall(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     at::TensorList output_tensors,
diff --git a/torch/csrc/distributed/c10d/OpsImpl.cpp b/torch/csrc/distributed/c10d/OpsImpl.cpp
index cc7ebcfd075cd..6eb69c664d16d 100644
--- a/torch/csrc/distributed/c10d/OpsImpl.cpp
+++ b/torch/csrc/distributed/c10d/OpsImpl.cpp
@@ -399,6 +399,36 @@ c10::intrusive_ptr<Work> alltoall_cuda_(
       AllToAllOptions{std::chrono::milliseconds(timeout)});
 }
 
+c10::intrusive_ptr<Work> alltoall_base_cpu_(
+    at::Tensor& output,
+    at::Tensor& input,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    std::vector<int64_t> output_split_sizes,
+    std::vector<int64_t> input_split_sizes,
+    int64_t timeout) {
+  return process_group->alltoall_base(
+      output,
+      input,
+      output_split_sizes,
+      input_split_sizes,
+      AllToAllOptions{std::chrono::milliseconds(timeout)});
+}
+
+c10::intrusive_ptr<Work> alltoall_base_cuda_(
+    at::Tensor& output,
+    at::Tensor& input,
+    const c10::intrusive_ptr<ProcessGroup>& process_group,
+    std::vector<int64_t> output_split_sizes,
+    std::vector<int64_t> input_split_sizes,
+    int64_t timeout) {
+  return process_group->alltoall_base(
+      output,
+      input,
+      output_split_sizes,
+      input_split_sizes,
+      AllToAllOptions{std::chrono::milliseconds(timeout)});
+}
+
 c10::intrusive_ptr<Work> barrier_cpu(
     const c10::intrusive_ptr<ProcessGroup>& process_group,
     const std::vector<int64_t>& device_ids,
@@ -558,6 +588,14 @@ TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
   m.impl("alltoall_", alltoall_cuda_);
 }
 
+TORCH_LIBRARY_IMPL(c10d, CPU, m) {
+  m.impl("alltoall_base_", alltoall_base_cpu_);
+}
+
+TORCH_LIBRARY_IMPL(c10d, CUDA, m) {
+  m.impl("alltoall_base_", alltoall_base_cuda_);
+}
+
 TORCH_LIBRARY_IMPL(c10d, CPU, m) {
   m.impl("barrier", barrier_cpu);
 }
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index b876c8a7920a3..51ae468ea8068 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1440,34 +1440,26 @@ that adds a prefix to each key inserted to the store.
 
           .def(
               "alltoall_base",
-              &::c10d::ProcessGroup::alltoall_base,
-              py::arg("output_tensor"),
-              py::arg("input_tensor"),
-              py::arg("output_split_sizes"),
-              py::arg("input_split_sizes"),
-              py::arg("opts") = ::c10d::AllToAllOptions(),
-              py::call_guard<py::gil_scoped_release>())
-
-          .def(
-              "alltoall_base",
-              [](::c10d::ProcessGroup& self,
+              [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,
                  at::Tensor& output,
                  at::Tensor& input,
                  std::vector<int64_t> outputSplitSizes,
-                 std::vector<int64_t> inputSplitSizes) {
-                return self.alltoall_base(
+                 std::vector<int64_t> inputSplitSizes,
+                 const ::c10d::AllToAllOptions& opts) {
+                return ::c10d::ops::alltoall_base(
+                    self,
                     output,
                     input,
                     outputSplitSizes,
                     inputSplitSizes,
-                    ::c10d::AllToAllOptions());
+                    opts);
               },
               py::arg("output"),
               py::arg("input"),
               py::arg("output_split_sizes"),
               py::arg("input_split_sizes"),
+              py::arg("opts") = ::c10d::AllToAllOptions(),
               py::call_guard<py::gil_scoped_release>())
-
           .def(
               "alltoall",
               [](const c10::intrusive_ptr<::c10d::ProcessGroup>& self,

From 57174c7eed2eddc8691e83f14aace891d2fdd293 Mon Sep 17 00:00:00 2001
From: William Wen <williamwen@fb.com>
Date: Thu, 8 Dec 2022 23:59:05 +0000
Subject: [PATCH 1751/1922] Deepcopy GraphModule in minifier (#90401)

Fixes https://github.com/pytorch/pytorch/issues/90397. Remove deepcopy calls in minifier tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90401
Approved by: https://github.com/anijain2305, https://github.com/mlazos
---
 test/dynamo/test_minifier.py | 4 ----
 torch/_dynamo/debug_utils.py | 6 ++++--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/test/dynamo/test_minifier.py b/test/dynamo/test_minifier.py
index d2f82f92510c5..26b2c6ee557e2 100644
--- a/test/dynamo/test_minifier.py
+++ b/test/dynamo/test_minifier.py
@@ -27,12 +27,10 @@ def test_relu_compile_error(gm: torch.fx.GraphModule, example_inputs):
 """
 
 RELU_RUNTIME_ERROR_BACKEND = """\
-import copy
 from torch._dynamo.optimizations.backends import register_backend
 
 @register_backend
 def test_relu_runtime_error(gm: torch.fx.GraphModule, example_inputs):
-    gm = copy.deepcopy(gm)
     for node in gm.graph.nodes:
         if node.target == torch.relu:
             node.target = torch._assert
@@ -42,12 +40,10 @@ def test_relu_runtime_error(gm: torch.fx.GraphModule, example_inputs):
 """
 
 RELU_ACCURACY_ERROR_BACKEND = """\
-import copy
 from torch._dynamo.optimizations.backends import register_backend
 
 @register_backend
 def test_relu_accuracy_error(gm: torch.fx.GraphModule, example_inputs):
-    gm = copy.deepcopy(gm)
     for node in gm.graph.nodes:
         if node.target == torch.relu:
             node.target = torch.add
diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py
index 3e70f59583739..1db28caee6b8b 100644
--- a/torch/_dynamo/debug_utils.py
+++ b/torch/_dynamo/debug_utils.py
@@ -871,7 +871,7 @@ def debug_wrapper(gm, example_inputs, **kwargs):
             # Check for either accuracy (level 4) or other type of failures.
             if config.repro_level == 4:
                 # Check Accuracy
-                compiled_gm = compiler_fn(gm, example_inputs, **kwargs)
+                compiled_gm = compiler_fn(copy.deepcopy(gm), example_inputs, **kwargs)
                 if backend_accuracy_fails(gm, example_inputs, compiler_fn):
                     log.warning(
                         "Accuracy failed for the TorchDyanmo produced graph. Creating script to minify the error."
@@ -888,7 +888,9 @@ def debug_wrapper(gm, example_inputs, **kwargs):
                     raise exc
             else:
                 try:
-                    compiled_gm = compiler_fn(gm, example_inputs, **kwargs)
+                    compiled_gm = compiler_fn(
+                        copy.deepcopy(gm), example_inputs, **kwargs
+                    )
                     run_fwd_maybe_bwd(compiled_gm, example_inputs)
                 except Exception as exc:
                     log.warning(

From 10d686ebfa96d0eec4c4675b2686c7f8cfea1b7b Mon Sep 17 00:00:00 2001
From: Andrew Gu <andgu@fb.com>
Date: Thu, 8 Dec 2022 15:54:08 +0000
Subject: [PATCH 1752/1922] [Composable API][Easy] Fix some follow-ups (#90471)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90471
Approved by: https://github.com/mrshenli
---
 test/distributed/_composable/test_compose.py | 2 +-
 torch/distributed/fsdp/_runtime_utils.py     | 8 ++------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/test/distributed/_composable/test_compose.py b/test/distributed/_composable/test_compose.py
index 98679f89da23f..20c285711d70d 100644
--- a/test/distributed/_composable/test_compose.py
+++ b/test/distributed/_composable/test_compose.py
@@ -99,7 +99,7 @@ def _test_checkpoint_fsdp_submodules(self, use_reentrant):
 
         test_model = copy.deepcopy(model)
         test_model.u1 = fully_shard(test_model.u1, policy=None)
-        test_model.u2 = fully_shard(test_model.u2, policy=None)
+        test_model.u2 = fully_shard(test_model.u2)
 
         test_model.u1.seq = checkpoint(test_model.u1.seq, use_reentrant=use_reentrant)
         test_model.u2.seq = checkpoint(test_model.u2.seq, use_reentrant=use_reentrant)
diff --git a/torch/distributed/fsdp/_runtime_utils.py b/torch/distributed/fsdp/_runtime_utils.py
index 3b86437517045..6ace62d8f9c71 100644
--- a/torch/distributed/fsdp/_runtime_utils.py
+++ b/torch/distributed/fsdp/_runtime_utils.py
@@ -951,9 +951,7 @@ def _register_pre_forward_hooks(
         forward_handle.remove()
     state._pre_forward_handles.clear()
     for module in modules:
-        if module not in state._comm_module_to_handles:
-            continue
-        module_param_handles = state._comm_module_to_handles[module]
+        module_param_handles = state._comm_module_to_handles.get(module, [])
         if module_param_handles:
             unshard_fn = functools.partial(
                 _pre_forward_unshard,
@@ -983,9 +981,7 @@ def _register_post_forward_hooks(
         forward_handle.remove()
     state._post_forward_handles.clear()
     for module in modules:
-        if module not in state._comm_module_to_handles:
-            continue
-        module_param_handles = state._comm_module_to_handles[module]
+        module_param_handles = state._comm_module_to_handles.get(module, [])
         if module_param_handles:
             reshard_fn = functools.partial(
                 _post_forward_reshard,

From a655da7516e0c5aaff1d7ebe31fb32509de613fd Mon Sep 17 00:00:00 2001
From: "Tugsbayasgalan (Tugsuu) Manlaibaatar" <tmanlaibaatar@fb.com>
Date: Wed, 7 Dec 2022 13:11:55 -0800
Subject: [PATCH 1753/1922] Fix bug in dynamic shapes multiply (#90336)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90336
Approved by: https://github.com/ezyang
---
 test/dynamo/test_repros.py         | 12 ++++++++++++
 torch/_dynamo/variables/builtin.py |  5 +++++
 2 files changed, 17 insertions(+)

diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py
index 602f3ce63fa7b..34f8db248c2b3 100644
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@@ -2127,6 +2127,18 @@ def compiled_fn(x):
             for buffer_ref, buffer_test in zip(m_ref.buffers(), m_test.buffers()):
                 self.assertTrue(same(buffer_ref, buffer_test))
 
+    @patch.object(torch._dynamo.config, "dynamic_shapes", True)
+    def test_dynamic_shapes_right_side(self):
+        def f(x):
+            return torch.ones(5 * x.shape[0])
+
+        inp = torch.randn(6, 5)
+
+        gm, _ = torch._dynamo.export(
+            f, torch.randn(4, 5), aten_graph=True, tracing_mode="symbolic"
+        )
+        self.assertEqual(gm(inp).shape, f(inp).shape)
+
 
 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py
index 127c0359d56f3..083da86a1e19b 100644
--- a/torch/_dynamo/variables/builtin.py
+++ b/torch/_dynamo/variables/builtin.py
@@ -576,6 +576,11 @@ def call_mul(self, tx, a, b):
             return b.__class__(
                 items=b.items * a.as_python_constant(), mutable_local=MutableLocal()
             ).add_options(self, a, b)
+        # TODO this doesn't generalize in other builtin operators.
+        elif isinstance(a, variables.ConstantVariable) and isinstance(
+            b, DynamicShapeVariable
+        ):
+            return b.call_method(tx, "__rmul__", [a], {})
         else:
             return a.call_method(tx, "__mul__", [b], {})
 

From f790592bd9ed9377e0afe57d55c0827fc9095208 Mon Sep 17 00:00:00 2001
From: Yuxin Wu <ppwwyyxx@users.noreply.github.com>
Date: Fri, 9 Dec 2022 01:01:12 +0000
Subject: [PATCH 1754/1922] Fix static initialization issue for static build
 (#90133)

Fixes #83255

Code comes from #83258 after fixing merge conflicts.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90133
Approved by: https://github.com/soumith, https://github.com/malfet
---
 aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp  | 1 +
 .../native/ao_sparse/quantized/cpu/qlinear_prepack.cpp    | 1 +
 .../native/ao_sparse/quantized/cpu/qlinear_unpack.cpp     | 1 +
 aten/src/ATen/native/quantized/cpu/qlinear.cpp            | 2 ++
 aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp    | 2 ++
 aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp    | 4 ++++
 aten/src/ATen/native/quantized/qconv_unpack.cpp           | 8 ++++++++
 aten/src/ATen/native/quantized/qlinear_unpack.cpp         | 3 +++
 8 files changed, 22 insertions(+)

diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp
index de053b353758a..144cdb292ba16 100644
--- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp
@@ -247,6 +247,7 @@ class QLinearInt8 final {
 };
 
 TORCH_LIBRARY_IMPL(sparse, QuantizedCPU, m) {
+  register_linear_params();
   m.impl(
       TORCH_SELECTIVE_NAME("sparse::qlinear"),
       TORCH_FN(QLinearInt8<false>::run));
diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp
index 83aaf810edd72..bedf2f4461f3a 100644
--- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp
+++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp
@@ -240,6 +240,7 @@ class QLinearPackWeightInt8 final {
 };
 
 TORCH_LIBRARY_IMPL(sparse, QuantizedCPU, m) {
+  register_linear_params();
   m.impl(
       TORCH_SELECTIVE_NAME("sparse::qlinear_prepack"),
       TORCH_FN(QLinearPackWeightInt8::run));
diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp
index 14cf9521a4cdb..d66abc9d2a8a5 100644
--- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp
+++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp
@@ -133,6 +133,7 @@ class QLinearUnpackWeightInt8 final {
 };
 
 TORCH_LIBRARY_IMPL(sparse, CatchAll, m) {
+  register_linear_params();
   m.impl(
       TORCH_SELECTIVE_NAME("sparse::qlinear_unpack"),
       TORCH_FN(QLinearUnpackWeightInt8::run));
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
index 1e10f4f88b7c3..93a0f82978716 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
@@ -955,12 +955,14 @@ class QLinearInt8FusedQDQ final {
 };
 
 TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
+  register_linear_params();
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear"), TORCH_FN(QLinearInt8<false>::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu"), TORCH_FN(QLinearInt8<true>::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_leaky_relu"), TORCH_FN(QLinearLeakyReluInt8::run));
 }
 
 TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) {
+  register_linear_params();
   m.impl(TORCH_SELECTIVE_NAME("_quantized::linear"), TORCH_FN(QLinearInt8<false>::run));
 }
 
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
index 537d0f492f8f1..c7f350c60e87b 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
@@ -662,6 +662,7 @@ class QLinearDynamicFp16 final {
 };
 
 TORCH_LIBRARY_IMPL(quantized, CPU, m) {
+  register_linear_params();
   m.impl(
       TORCH_SELECTIVE_NAME("quantized::linear_dynamic"),
       TORCH_FN(QLinearDynamicInt8<false>::run));
@@ -677,6 +678,7 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
 }
 
 TORCH_LIBRARY_IMPL(_quantized, CPU, m) {
+  register_linear_params();
   m.impl(
       TORCH_SELECTIVE_NAME("_quantized::linear_dynamic"),
       TORCH_FN(QLinearDynamicInt8<false>::run));
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
index 36523bbd1b9bd..9dcf21689d57d 100644
--- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
@@ -381,20 +381,24 @@ class QLinearPackWeightFp16Legacy final {
 };
 
 TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
+  register_linear_params();
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack"), TORCH_FN(QLinearPackWeightInt8::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack_legacy"), TORCH_FN(QLinearPackWeightInt8Legacy::run));
 }
 
 TORCH_LIBRARY_IMPL(quantized, CPU, m) {
+  register_linear_params();
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack_fp16"), TORCH_FN(QLinearPackWeightFp16::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack_fp16_legacy"), TORCH_FN(QLinearPackWeightFp16Legacy::run));
 }
 
 TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) {
+  register_linear_params();
   m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack"), TORCH_FN(QLinearPackWeightInt8::run));
 }
 
 TORCH_LIBRARY_IMPL(_quantized, CPU, m) {
+  register_linear_params();
   m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack_fp16"), TORCH_FN(QLinearPackWeightFp16::run));
   m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack_fp16_legacy"), TORCH_FN(QLinearPackWeightFp16Legacy::run));
 }
diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp
index 90e210ebe227d..cff99560b7eec 100644
--- a/aten/src/ATen/native/quantized/qconv_unpack.cpp
+++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp
@@ -28,6 +28,12 @@ and /cudnn/ConvUnpackImpl.cpp, for cudnn.
 #include <ATen/ops/from_blob.h>
 #endif
 
+template <int kSpatialDim = 2>
+int register_conv_params();
+
+extern template int register_conv_params<2>();
+extern template int register_conv_params<3>();
+
 
 namespace at {
 namespace native {
@@ -192,6 +198,8 @@ unpack_quantized_prepacked_sizes_conv2d(const IValue& ivalue) {
 }
 
 TORCH_LIBRARY_IMPL(quantized, CatchAll, m) {
+  register_conv_params<2>();
+  register_conv_params<3>();
   // conv_unpack is deprecated, please use conv2d_unpack for 2D conv.
   m.impl(TORCH_SELECTIVE_NAME("quantized::conv_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run));
   // We use  conv2d_unpack to be consistent with conv3d_unpack
diff --git a/aten/src/ATen/native/quantized/qlinear_unpack.cpp b/aten/src/ATen/native/quantized/qlinear_unpack.cpp
index f293a7307e330..19c9890c82e38 100644
--- a/aten/src/ATen/native/quantized/qlinear_unpack.cpp
+++ b/aten/src/ATen/native/quantized/qlinear_unpack.cpp
@@ -13,6 +13,8 @@ and /cudnn/linear_unpack_impl.cpp, for cudnn.
 #include <torch/custom_class.h>
 #include <torch/library.h>
 
+int register_linear_params();
+
 namespace at {
 namespace native {
 namespace {
@@ -68,6 +70,7 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
 }
 
 TORCH_LIBRARY_IMPL(quantized, CatchAll, m) {
+  register_linear_params();
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack"), TORCH_FN(QLinearUnpackWeightInt8::run));
   m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16"), TORCH_FN(QLinearUnpackWeightFp16::run));
 }

From 672e05fbef24114f3f30fcd0db5236fe5d9a9a63 Mon Sep 17 00:00:00 2001
From: Nikita Shulga <nshulga@meta.com>
Date: Fri, 9 Dec 2022 01:08:08 +0000
Subject: [PATCH 1755/1922] Bump version: 1.14.0->2.0.0 (#90491)

Except for the usual location, had to update the version in one of ONNX expect patterns, namely here: https://github.com/pytorch/pytorch/blob/43660051d804d8a11c4382d8a241c7eb5da77e27/test/onnx/expect/TestOperators.test_avg_pool2d.expect#L3
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90491
Approved by: https://github.com/jansel, https://github.com/albanD
---
 .circleci/scripts/binary_ios_upload.sh                | 2 +-
 .circleci/scripts/binary_populate_env.sh              | 2 +-
 android/gradle.properties                             | 2 +-
 test/onnx/expect/TestOperators.test_avg_pool2d.expect | 2 +-
 version.txt                                           | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.circleci/scripts/binary_ios_upload.sh b/.circleci/scripts/binary_ios_upload.sh
index da38065847eff..48518c4707c6f 100644
--- a/.circleci/scripts/binary_ios_upload.sh
+++ b/.circleci/scripts/binary_ios_upload.sh
@@ -33,7 +33,7 @@ fi
 cp ${PROJ_ROOT}/LICENSE ${ZIP_DIR}/
 # zip the library
 export DATE="$(date -u +%Y%m%d)"
-export IOS_NIGHTLY_BUILD_VERSION="1.14.0.${DATE}"
+export IOS_NIGHTLY_BUILD_VERSION="2.0.0.${DATE}"
 if [ "${BUILD_LITE_INTERPRETER}" == "1" ]; then
     # libtorch_lite_ios_nightly_1.11.0.20210810.zip
     ZIPFILE="libtorch_lite_ios_nightly_${IOS_NIGHTLY_BUILD_VERSION}.zip"
diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
index 3294c72024aa3..7714371e26429 100755
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@@ -59,7 +59,7 @@ PIP_UPLOAD_FOLDER='nightly/'
 # We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it
 export DATE="$(date -u +%Y%m%d)"
 #TODO: We should be pulling semver version from the base version.txt
-BASE_BUILD_VERSION="1.14.0.dev$DATE"
+BASE_BUILD_VERSION="2.0.0.dev$DATE"
 # Change BASE_BUILD_VERSION to git tag when on a git tag
 # Use 'git -C' to make doubly sure we're in the correct directory for checking
 # the git tag
diff --git a/android/gradle.properties b/android/gradle.properties
index ecefc09a587ba..25695a1762f63 100644
--- a/android/gradle.properties
+++ b/android/gradle.properties
@@ -1,6 +1,6 @@
 ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64
 
-VERSION_NAME=1.14.0-SNAPSHOT
+VERSION_NAME=2.0.0-SNAPSHOT
 GROUP=org.pytorch
 MAVEN_GROUP=org.pytorch
 SONATYPE_STAGING_PROFILE=orgpytorch
diff --git a/test/onnx/expect/TestOperators.test_avg_pool2d.expect b/test/onnx/expect/TestOperators.test_avg_pool2d.expect
index 4839fb5a35a7d..c5f8ba6b85781 100644
--- a/test/onnx/expect/TestOperators.test_avg_pool2d.expect
+++ b/test/onnx/expect/TestOperators.test_avg_pool2d.expect
@@ -1,6 +1,6 @@
 ir_version: 7
 producer_name: "pytorch"
-producer_version: "1.14.0"
+producer_version: "2.0.0"
 graph {
   node {
     output: "onnx::Pad_1"
diff --git a/version.txt b/version.txt
index 59c85dbc87029..35a785a76f13a 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-1.14.0a0
+2.0.0a0

From a2b0e24196d1967220bb2ec88a128b39827a71dc Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 7 Dec 2022 13:46:43 -0800
Subject: [PATCH 1756/1922] [quant][be] Remove special casing for getitem in
 prepare (#90393)

Summary:
This PR cleans up previous special casing for getitem, it should be configured through BackendConfig

Test Plan:
python test/test_quantization.py TestQuantizeFx

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D41846185](https://our.internmc.facebook.com/intern/diff/D41846185)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90393
Approved by: https://github.com/andrewor14
---
 torch/ao/quantization/fx/prepare.py | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/torch/ao/quantization/fx/prepare.py b/torch/ao/quantization/fx/prepare.py
index 73c0c2fde69a0..9985a5c049720 100644
--- a/torch/ao/quantization/fx/prepare.py
+++ b/torch/ao/quantization/fx/prepare.py
@@ -1,6 +1,5 @@
 import copy
 import torch
-import operator
 import warnings
 from torch.fx import (
     GraphModule,
@@ -385,15 +384,6 @@ def get_target_activation_dtype_for_node(
                 "output_activation_dtype": None,
             }
 
-        # TODO(future PR): consider stopping matching getitem
-        is_getitem = node.op == 'call_function' and \
-            node.target == operator.getitem
-        if is_getitem:
-            return {
-                "input_activation_dtype": (torch.float, False),
-                "output_activation_dtype": (torch.float, False),
-            }
-
         # get qconfig to determine the eventual dtype of this node
         if qconfig is not None:
             if qhandler is not None and qhandler.input_output_observed():
@@ -1215,14 +1205,10 @@ def insert_observers_for_model(
 
             this_node_dtype_info = node_name_to_target_dtype_info[node.name]
             output_not_a_tensor = this_node_dtype_info is None
-            # TODO(future PR): consider stopping matching getitem
-            is_getitem = node.op == 'call_function' and \
-                node.target == operator.getitem
 
             skip_inserting_observers = (
                 (qconfig is None) or
-                output_not_a_tensor or
-                is_getitem
+                output_not_a_tensor
             ) and (
                 not node.op == 'output'
             )

From a0e5322ab98f9e8af09acf6e55c8b210f6d0c404 Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Wed, 7 Dec 2022 16:30:03 -0800
Subject: [PATCH 1757/1922] [dynamo][ez] Change module type to str for easier
 downstream parsing (#90429)

Summary:
att

Test Plan:
NA

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90429
Approved by: https://github.com/SherlockNoMad
---
 torch/_dynamo/variables/nn_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py
index 972e21473744b..c9f0f7792ec91 100644
--- a/torch/_dynamo/variables/nn_module.py
+++ b/torch/_dynamo/variables/nn_module.py
@@ -164,7 +164,7 @@ def call_function(
         @contextmanager
         def record_nn_module_stack():
             try:
-                tx.nn_module_stack[self.module_key] = type(mod)
+                tx.nn_module_stack[self.module_key] = str(type(mod))
                 yield
             finally:
                 del tx.nn_module_stack[self.module_key]

From b25267cb51c65be88a75fa82af977c7905f015ba Mon Sep 17 00:00:00 2001
From: BowenBao <bowbao@microsoft.com>
Date: Fri, 9 Dec 2022 03:08:11 +0000
Subject: [PATCH 1758/1922] [ONNX] Extend PR approver list (#90490)

Extending the list of ONNX exporter related PR approvers. All had a long track record for contributions in PyTorch/ONNX.

@justinchuby - https://github.com/pytorch/pytorch/pulls?q=author%3Ajustinchuby
@shubhambhokare1 - https://github.com/pytorch/pytorch/pulls?q=author%3Ashubhambhokare1
@thiagocrepaldi - https://github.com/pytorch/pytorch/pulls?q=author%3Athiagocrepaldi
@titaiwangms - https://github.com/pytorch/pytorch/pulls?q=author%3Atitaiwangms
@wschin - https://github.com/pytorch/pytorch/pulls?q=author%3Awschin
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90490
Approved by: https://github.com/thiagocrepaldi, https://github.com/malfet
---
 .github/merge_rules.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml
index 7baf3e0e8df92..c5cf415be984f 100644
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@@ -20,6 +20,11 @@
   approved_by:
   - BowenBao
   - abock
+  - justinchuby
+  - shubhambhokare1
+  - thiagocrepaldi
+  - titaiwangms
+  - wschin
   mandatory_checks_name:
   - EasyCLA
   - Lint

From 5931367e0e0b888ac5167cf53e551d63174a9e63 Mon Sep 17 00:00:00 2001
From: Mauricio Villegas <mauricio_ville@yahoo.com>
Date: Fri, 9 Dec 2022 03:20:17 +0000
Subject: [PATCH 1759/1922] Fixed a couple of mistakes in type annotations in
 optim package (#90216)

Doing some tests with all Optimizer and LRScheduler classes in optim package, I noticed a couple of mistakes in type annotations, so created a pull request to fix them.

- In Optimizer class, incorrectly named parameter `default` instead of `defaults` in pyi file
- In SGD class, type for `maximize` and `differentiable` not available in either py or pyi files

I don't know if there is a plan to move all types from pyi to py files, so wasn't too sure where to fix what.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90216
Approved by: https://github.com/janeyx99
---
 torch/optim/optimizer.pyi | 2 +-
 torch/optim/sgd.py        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch/optim/optimizer.pyi b/torch/optim/optimizer.pyi
index 7055c6b788a5f..002713c27500f 100644
--- a/torch/optim/optimizer.pyi
+++ b/torch/optim/optimizer.pyi
@@ -10,7 +10,7 @@ class Optimizer:
     state: dict
     param_groups: List[dict]
 
-    def __init__(self, params: _params_t, default: dict) -> None: ...
+    def __init__(self, params: _params_t, defaults: dict) -> None: ...
     def __setstate__(self, state: dict) -> None: ...
     def register_step_pre_hook(self, hook: Callable[..., None]) -> RemovableHandle: ...
     def register_step_post_hook(self, hook: Callable[..., None]) -> RemovableHandle: ...
diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py
index ed48973cf7c63..cc0d8e5caf39e 100644
--- a/torch/optim/sgd.py
+++ b/torch/optim/sgd.py
@@ -91,8 +91,8 @@ class SGD(Optimizer):
     """
 
     def __init__(self, params, lr=required, momentum=0, dampening=0,
-                 weight_decay=0, nesterov=False, *, maximize=False, foreach: Optional[bool] = None,
-                 differentiable=False):
+                 weight_decay=0, nesterov=False, *, maximize: bool = False, foreach: Optional[bool] = None,
+                 differentiable: bool = False):
         if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if momentum < 0.0:

From 20bb5d0dba09adeec34de47eab116e808c61c1a8 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Date: Fri, 9 Dec 2022 03:46:36 +0000
Subject: [PATCH 1760/1922] [vision hash update] update the pinned vision hash
 (#90513)

This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/master/.github/workflows/_update-commit-hash.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90513
Approved by: https://github.com/pytorchbot
---
 .github/ci_commit_pins/vision.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt
index 0b378f4f7a325..80fb2b961071d 100644
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@@ -1 +1 @@
-842e178a488722720b6eb1e9cb508439e8e1ecd9
+029cb3fe4526084172c30be14278d46ecd5bf17c

From 2c43abc122e7be0edd2faa9f26cc2691bbaa3e64 Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 9 Dec 2022 04:32:31 +0000
Subject: [PATCH 1761/1922] Dynamo, FX, Inductor Progress Bars (#88384)

There are 3 progress bars each gated behind their own config, all off by default for now
1. Dynamo: Macro level config for dynamo, AOT, inductor
2. FX: Progress bar for each pass, with their names
3. Inductor

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88384
Approved by: https://github.com/wconstab, https://github.com/mlazos
---
 torch/_dynamo/logging.py                | 27 +++++++++++
 torch/_dynamo/optimizations/analysis.py |  1 +
 torch/_inductor/codecache.py            | 22 ++++++++-
 torch/_inductor/codegen/common.py       |  2 +
 torch/_inductor/config.py               |  6 +++
 torch/_inductor/graph.py                |  1 +
 torch/_inductor/ir.py                   |  3 ++
 torch/_inductor/sizevars.py             |  1 +
 torch/_inductor/virtualized.py          |  3 ++
 torch/fx/config.py                      |  6 +++
 torch/fx/interpreter.py                 |  8 +++-
 torch/hub.py                            | 64 ++++++++++++-------------
 12 files changed, 108 insertions(+), 36 deletions(-)
 create mode 100644 torch/fx/config.py

diff --git a/torch/_dynamo/logging.py b/torch/_dynamo/logging.py
index a9ead23e0edd8..61000481580f1 100644
--- a/torch/_dynamo/logging.py
+++ b/torch/_dynamo/logging.py
@@ -2,6 +2,15 @@
 import logging
 import os
 
+from torch.hub import Faketqdm, tqdm
+
+# logging level for dynamo generated graphs/bytecode/guards
+logging.CODE = 15
+logging.addLevelName(logging.CODE, "CODE")
+
+# Disable progress bar by default, not in dynamo config because otherwise get a circular import
+disable_progress = True
+
 # Return all loggers that torchdynamo/torchinductor is responsible for
 def get_loggers():
     return [
@@ -73,8 +82,26 @@ def init_logging(log_level, log_file_name=None):
 
 _step_counter = itertools.count(1)
 
+# Update num_steps if more phases are added: Dynamo, AOT, Backend
+# This is very inductor centric
+# _inductor.utils.has_triton() gives a circular import error here
+
+if not disable_progress:
+    try:
+        import triton  # noqa: F401
+
+        num_steps = 3
+    except ImportError:
+        num_steps = 2
+    pbar = tqdm(total=num_steps, desc="torch.compile()", delay=0)
+
 
 def get_step_logger(logger):
+    if not disable_progress:
+        pbar.update(1)
+        if not isinstance(pbar, Faketqdm):
+            pbar.set_postfix_str(f"{logger.name}")
+
     step = next(_step_counter)
 
     def log(level, msg):
diff --git a/torch/_dynamo/optimizations/analysis.py b/torch/_dynamo/optimizations/analysis.py
index d83e57fdca6e2..f732fb322438f 100644
--- a/torch/_dynamo/optimizations/analysis.py
+++ b/torch/_dynamo/optimizations/analysis.py
@@ -21,6 +21,7 @@ def __init__(self, *args, **kwargs):
         self.input_alias_groups = set()
         self.storage_to_alias_group = dict()
         self.make_alias_group = itertools.count(1)
+        self.name = "ShapeAliasingAndMutation"
 
     def tensor_alias_group(self, value: torch.Tensor):
         """Assign a unique identifier to the storage of a given tensor"""
diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py
index 747a7850b562b..2b7481acda41c 100644
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@@ -21,6 +21,8 @@
 from typing import Any, Callable, Dict, List
 
 import torch
+
+from torch.hub import Faketqdm, tqdm
 from torch.utils import cpp_extension
 from . import config, cuda_properties, exc
 
@@ -595,7 +597,7 @@ def warm_pool(cls):
         if hasattr(pool, "_start_queue_management_thread"):
             pool._start_queue_management_thread()
         else:
-            for i in range(config.compile_threads):
+            for _ in range(config.compile_threads):
                 pool._adjust_process_count()
             pool._start_executor_manager_thread()
         _compile_end()
@@ -636,10 +638,26 @@ def task():
         return self.submit(task)
 
     def wait(self, scope: Dict[str, Any]):
+        num_kernels = len(
+            [
+                value
+                for key, value in scope.items()
+                if isinstance(value, (Future, TritonFuture))
+            ]
+        )
+        pbar = tqdm(
+            total=num_kernels,
+            desc="Inductor Compilation",
+            disable=config.disable_progress,
+            delay=0,
+        )
         if config.compile_threads > 1:
-            for key, result in list(scope.items()):
+            for key, result in scope.items():
+                if config.verbose_progress and not isinstance(pbar, Faketqdm):
+                    pbar.set_postfix_str(key)
                 if isinstance(result, (Future, TritonFuture)):
                     scope[key] = result.result()
+                    pbar.update(1)
 
         _compile_end()
 
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py
index 2b997ff745028..7f467cff4d488 100644
--- a/torch/_inductor/codegen/common.py
+++ b/torch/_inductor/codegen/common.py
@@ -595,6 +595,8 @@ def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
 
     def __enter__(self):
         class CSEProxy:
+            self.name = "CSEProxy"
+
             @staticmethod
             def __getattr__(name):
                 def inner(*args, **kwargs):
diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py
index 3f2f6b1cdde6a..19f350ee6f0cf 100644
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@@ -4,6 +4,12 @@
 # add some debug printouts
 debug = False
 
+# Whether to disable a progress bar for autotuning
+disable_progress = True
+
+# Whether to enable printing the source code for each future
+verbose_progress = False
+
 # use cpp wrapper instead of python wrapper
 cpp_wrapper = False
 
diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py
index 270f3dc22af1b..cdc40a114840f 100644
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@@ -95,6 +95,7 @@ def __init__(
         self.randomness_seeds = []
         self.name_to_buffer = {}
         self.creation_time = time.time()
+        self.name = "GraphLowering"
         self._can_use_cpp_wrapper = config.cpp_wrapper
         self.graph_id = graph_id
 
diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
index 253f217320e2e..e7a50f58c0b14 100644
--- a/torch/_inductor/ir.py
+++ b/torch/_inductor/ir.py
@@ -3964,6 +3964,8 @@ def add_index(expr, category, buf_name=None):
             )
 
         class CaptureIndexing(V.WrapperHandler):
+            self.name = "CaptureIndexing"
+
             def load(self, name: str, index: sympy.Expr):
                 index = add_index(index, "reads", name)
                 return self._inner.load(name, index)
@@ -4046,6 +4048,7 @@ def __init__(self):
                 self.garbage_collect_values = False
                 self.env = {}
                 self.fetch_attr = submodules.__getitem__
+                self.name = V.get_ops_handler().name
 
         return InterpreterShim().run(V.get_ops_handler())
 
diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py
index 7895f0dccdcba..13a0f5b6bc2be 100644
--- a/torch/_inductor/sizevars.py
+++ b/torch/_inductor/sizevars.py
@@ -601,6 +601,7 @@ class SimplifyIndexing(V.WrapperHandler):  # type: ignore[name-defined]
 
     def __init__(self, inner, var_ranges: VarRanges):
         super().__init__(inner)
+        self.name = "SimplifyIndexing"
         self._simplify: Callable[
             [Expr], Expr
         ] = lambda index: V.graph.sizevars.simplify_with_ranges(index, var_ranges)
diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py
index 27e60b1daf1df..cff6770997371 100644
--- a/torch/_inductor/virtualized.py
+++ b/torch/_inductor/virtualized.py
@@ -57,6 +57,9 @@ def _arg_str(a):
 
 class MockHandler:
     def __getattr__(self, name):
+        if name == "name":
+            return "MockHandler"
+
         def inner(*args, **kwargs):
             fargs = [_arg_str(a) for a in args]
             fargs.extend(f"{k}={v}" for k, v in kwargs.items())
diff --git a/torch/fx/config.py b/torch/fx/config.py
new file mode 100644
index 0000000000000..da5120d6edf18
--- /dev/null
+++ b/torch/fx/config.py
@@ -0,0 +1,6 @@
+# Whether to disable showing progress on compilation passes
+# Need to add a new config otherwise wil get a circular import if dynamo config is imported here
+disable_progress = True
+
+# If True this also shows the node names in each pass, for small models this is great but larger models it's quite noisy
+verbose_progress = False
diff --git a/torch/fx/interpreter.py b/torch/fx/interpreter.py
index c4d1cf26a8592..6550b0e00ac1f 100644
--- a/torch/fx/interpreter.py
+++ b/torch/fx/interpreter.py
@@ -4,10 +4,12 @@
 from .proxy import Proxy
 from ._symbolic_trace import Tracer
 from ._compatibility import compatibility
+from . import config
 import torch.fx.traceback as fx_traceback
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 import inspect
 from contextlib import contextmanager
+from torch.hub import tqdm
 
 __all__ = ['Interpreter', 'Transformer']
 
@@ -72,7 +74,7 @@ def __init__(self, module : GraphModule, garbage_collect_values : bool = True):
         self.module = module
         self.submodules = dict(self.module.named_modules())
         self.env : Dict[Node, Any] = {}
-
+        self.name = "Interpreter"
         self.garbage_collect_values = garbage_collect_values
 
         if self.garbage_collect_values:
@@ -117,8 +119,12 @@ def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None, enable_io_p
         if enable_io_processing:
             args = self.module.graph.process_inputs(*args)
         self.args_iter : Iterator[Any] = iter(args)
+        pbar = tqdm(total=len(self.module.graph.nodes),
+                    desc=f"{self.name}: {str(list(self.module.graph.nodes)) if config.verbose_progress else ''}",
+                    initial=0, position=0, leave=True, disable=config.disable_progress, delay=0)
 
         for node in self.module.graph.nodes:
+            pbar.update(1)
             if node in self.env:
                 # Short circuit if we have this value. This could
                 # be used, for example, for partial evaluation
diff --git a/torch/hub.py b/torch/hub.py
index cfe4181332ec9..93a1f32fd92f5 100644
--- a/torch/hub.py
+++ b/torch/hub.py
@@ -16,44 +16,42 @@
 from urllib.parse import urlparse  # noqa: F401
 from torch.serialization import MAP_LOCATION
 
-try:
-    from tqdm.auto import tqdm  # automatically select proper tqdm submodule if available
-except ImportError:
-    try:
-        from tqdm import tqdm
-    except ImportError:
-        # fake tqdm if it's not installed
-        class tqdm(object):  # type: ignore[no-redef]
-
-            def __init__(self, total=None, disable=False,
-                         unit=None, unit_scale=None, unit_divisor=None):
-                self.total = total
-                self.disable = disable
-                self.n = 0
-                # ignore unit, unit_scale, unit_divisor; they're just for real tqdm
-
-            def update(self, n):
-                if self.disable:
-                    return
+class Faketqdm(object):  # type: ignore[no-redef]
+
+    def __init__(self, total=None, disable=False,
+                 unit=None, *args, **kwargs):
+        self.total = total
+        self.disable = disable
+        self.n = 0
+        # Ignore all extra *args and **kwargs lest you want to reinvent tqdm
+
+    def update(self, n):
+        if self.disable:
+            return
+
+        self.n += n
+        if self.total is None:
+            sys.stderr.write("\r{0:.1f} bytes".format(self.n))
+        else:
+            sys.stderr.write("\r{0:.1f}%".format(100 * self.n / float(self.total)))
+        sys.stderr.flush()
 
-                self.n += n
-                if self.total is None:
-                    sys.stderr.write("\r{0:.1f} bytes".format(self.n))
-                else:
-                    sys.stderr.write("\r{0:.1f}%".format(100 * self.n / float(self.total)))
-                sys.stderr.flush()
+    def close(self):
+        self.disable = True
 
-            def close(self):
-                self.disable = True
+    def __enter__(self):
+        return self
 
-            def __enter__(self):
-                return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.disable:
+            return
 
-            def __exit__(self, exc_type, exc_val, exc_tb):
-                if self.disable:
-                    return
+        sys.stderr.write('\n')
 
-                sys.stderr.write('\n')
+try:
+    from tqdm import tqdm  # If tqdm is installed use it, otherwise use the fake wrapper
+except ImportError:
+    tqdm = Faketqdm
 
 __all__ = [
     'download_url_to_file',

From 47c48bc5c203bf11d6175881c6931957db9bb834 Mon Sep 17 00:00:00 2001
From: Xilun Wu <12968408+XilunWu@users.noreply.github.com>
Date: Fri, 9 Dec 2022 01:09:18 +0000
Subject: [PATCH 1762/1922] [threaded PG] fix long hang issue in testing
 (#90515)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90515
Approved by: https://github.com/wanchaol
---
 test/distributed/test_multi_threaded_pg.py    | 45 +++++++++++++++++
 torch/testing/_internal/common_distributed.py |  3 +-
 .../distributed/multi_threaded_pg.py          | 48 +++++++++++++++----
 3 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/test/distributed/test_multi_threaded_pg.py b/test/distributed/test_multi_threaded_pg.py
index 875e3f066384d..1ca04103ddfae 100644
--- a/test/distributed/test_multi_threaded_pg.py
+++ b/test/distributed/test_multi_threaded_pg.py
@@ -27,6 +27,51 @@ def test_broadcast_object_list(self):
         dist.broadcast_object_list(object_list=object_list)
         self.assertEqual(99, object_list[0])
 
+    def test_collective_error_on_rank_zero(self):
+        @spawn_threads_and_init_comms(world_size=4)
+        def _test_method(self):
+            input_tensor = torch.ones(3, 3) * dist.get_rank()  # perform 1st all gather
+            output_tensors = [torch.empty_like(input_tensor) for _ in range(dist.get_world_size())]
+            dist.all_gather(output_tensors, input_tensor)
+
+            if dist.get_rank() == 0:
+                raise AssertionError("Mimic real test failure.")  # fail on rank 0
+
+            dist.all_gather(output_tensors, input_tensor)  # perform 2nd all gather
+
+        with self.assertRaisesRegex(AssertionError, "Mimic real test failure."):
+            _test_method(self)
+
+    def test_collective_error_on_rank_non_zero(self):
+        @spawn_threads_and_init_comms(world_size=4)
+        def _test_method(self):
+            input_tensor = torch.ones(3, 3) * dist.get_rank()  # perform 1st all gather
+            output_tensors = [torch.empty_like(input_tensor) for _ in range(dist.get_world_size())]
+            dist.all_gather(output_tensors, input_tensor)
+
+            if dist.get_rank() == 1:
+                raise AssertionError("Mimic real test failure.")  # fail on rank 1
+
+            dist.all_gather(output_tensors, input_tensor)  # perform 2nd all gather
+
+        with self.assertRaisesRegex(AssertionError, "Mimic real test failure."):
+            _test_method(self)
+
+    def test_collective_error_on_rank_non_zero_all(self):
+        @spawn_threads_and_init_comms(world_size=4)
+        def _test_method(self):
+            input_tensor = torch.ones(3, 3) * dist.get_rank()  # perform 1st all gather
+            output_tensors = [torch.empty_like(input_tensor) for _ in range(dist.get_world_size())]
+            dist.all_gather(output_tensors, input_tensor)
+
+            if dist.get_rank() > 0:
+                raise AssertionError("Mimic real test failure.")  # fail on all non-zero rank
+
+            dist.all_gather(output_tensors, input_tensor)  # perform 2nd all gather
+
+        with self.assertRaisesRegex(AssertionError, "Mimic real test failure."):
+            _test_method(self)
+
 class TestCollectivesWithBaseClass(MultiThreadedTestCase):
     @property
     def world_size(self):
diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index c92ba02653f7d..b96f12a7f2c55 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -883,7 +883,8 @@ def _run_test_with_mt_pg(self, timeout, world_size, callback):
         print(f"Rank {rank} raised:")
         for line in traceback.format_exception(*exc_info):
             sys.stdout.write(line)
-    self.assertEqual([], failed_ranks, "Some ranks failed")
+    if failed_ranks:
+        raise failed_ranks[0][1][1]  # re-throw the first exception
 
 
 def spawn_threads_and_init_comms(
diff --git a/torch/testing/_internal/distributed/multi_threaded_pg.py b/torch/testing/_internal/distributed/multi_threaded_pg.py
index b66ca14731659..304748fef79af 100644
--- a/torch/testing/_internal/distributed/multi_threaded_pg.py
+++ b/torch/testing/_internal/distributed/multi_threaded_pg.py
@@ -25,7 +25,7 @@
 Lots of missing collectives.
 Collectives validation.
 Make timeout robust by making collectives respect the test deadline.
-Make tests robuts by making collectives interruptible.
+Make tests robust by making collectives interruptible.
 We need some synchronization around cleanup to ensure that timedout ranks don't cause spurious failures.
 
 """
@@ -129,7 +129,7 @@ def work(self, data):
 
 
 class Collective:
-    def __init__(self, world_size, collective):
+    def __init__(self, world_size, collective, pg):
         self._world_size = world_size
         self._collective = collective
 
@@ -140,6 +140,8 @@ def __init__(self, world_size, collective):
         self._count = 0
         self._done = False
 
+        self._pg = pg
+
     def join(self, rank, data):
         with self._start_cond:
             self._data[rank] = data
@@ -151,14 +153,21 @@ def join(self, rank, data):
                     self._start_cond.notify()
 
             if rank == 0:
-                while self._count < self._world_size:
-                    self._start_cond.wait()
+                self._start_cond.wait_for(
+                    lambda: self._count == self._world_size or self._pg._terminate.is_set()
+                )
+                # SystemExit is not a subclass of Exception but BaseException
+                # and can be distinguished from normal exception raised from program errors
+                # so that we can hide it from the exception queue
+                if self._pg._terminate.is_set():
+                    sys.exit("Test termination event occurs.")
 
         with self._done_cond:
             # wait for rank 0 to finish
             if rank > 0:
-                while not self._done:
-                    self._done_cond.wait()
+                self._done_cond.wait_for(lambda: self._done or self._pg._terminate.is_set())
+                if self._pg._terminate.is_set():
+                    sys.exit("Test termination event occurs.")
             else:
                 # copy data around
                 self._collective.work(self._data)
@@ -176,6 +185,8 @@ class ProcessLocalGroup(dist.ProcessGroup):
     _coll_lock = threading.Lock()
     _cur_coll = None
 
+    _terminate = threading.Event()
+
     @classmethod
     def _register(cls, pg):
         with cls._pg_lock:
@@ -194,7 +205,7 @@ def _start_coll(cls, world_size, collective):
                     f"world not ready, only {cls._count} PG's registered but world has {world_size} ranks"
                 )
             if cls._cur_coll is None:
-                cls._cur_coll = Collective(world_size, collective)
+                cls._cur_coll = Collective(world_size, collective, cls)
             return cls._cur_coll
 
     @classmethod
@@ -204,6 +215,21 @@ def _end_coll(cls, collective):
             if cls._cur_coll == collective:
                 cls._cur_coll = None
 
+    @classmethod
+    def exception_handle(cls, exc):
+        cls._terminate.set()
+        coll = cls._cur_coll
+        if coll:
+            with coll._start_cond:
+                coll._start_cond.notify()
+            with coll._done_cond:
+                coll._done_cond.notify_all()
+
+    @classmethod
+    def reset(cls):
+        cls._cur_coll = None
+        cls._terminate.clear()
+
     def allreduce(self, tensor_list, opts=AllreduceOptions()):
         coll = ProcessLocalGroup._start_coll(self._world, AllReduce(opts.reduceOp))
         res = coll.join(self._rank, tensor_list)
@@ -331,14 +357,17 @@ def world_is_valid():
 
     def worker(rank):
         if not world_is_valid():
-            raise TimeoutError("Invalid world")
+            raise TimeoutError("Invalid world")  # TODO: raise TimeoutError or RuntimeError?
         dist.init_process_group(
             backend="threaded", rank=rank, world_size=world_size, store=global_store
         )
         try:
             callback()
-        except BaseException as ex:
+        # reason why we don't use BaseException is we want to
+        # ignore SystemExit excpetion caused by _terminate event
+        except Exception as ex:
             exception_queue.put((rank, sys.exc_info()))
+            world.default_pg.exception_handle(ex)  # trigger _terminate event and awaken worker threads
         finally:
             if world_is_valid():
                 dist.destroy_process_group()
@@ -366,6 +395,7 @@ def worker(rank):
                         ),
                     )
                 )
+        ProcessLocalGroup.reset()
         failed_ranks = []
         while not exception_queue.empty():
             failure = exception_queue.get()

From 12e7c663d191ac7f1cb072acc42601eb9a0053aa Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Thu, 8 Dec 2022 12:42:59 -0800
Subject: [PATCH 1763/1922] [ao] backend_config moving all to top (#88391)

Summary: moved __all__ to top of functions, removed private funcitons
from all

Test Plan: python test/test_public_bindings.py

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D41015538](https://our.internmc.facebook.com/intern/diff/D41015538)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88391
Approved by: https://github.com/jcaip
---
 .../_common_operator_config_utils.py          |  9 ++-----
 .../ao/quantization/backend_config/fbgemm.py  |  7 +++--
 .../ao/quantization/backend_config/native.py  | 27 +++++++++----------
 .../ao/quantization/backend_config/qnnpack.py |  7 +++--
 .../quantization/backend_config/tensorrt.py   | 10 +++----
 torch/ao/quantization/backend_config/x86.py   |  7 +++--
 6 files changed, 29 insertions(+), 38 deletions(-)

diff --git a/torch/ao/quantization/backend_config/_common_operator_config_utils.py b/torch/ao/quantization/backend_config/_common_operator_config_utils.py
index 47a0b30242086..3d95b8b38a38b 100644
--- a/torch/ao/quantization/backend_config/_common_operator_config_utils.py
+++ b/torch/ao/quantization/backend_config/_common_operator_config_utils.py
@@ -25,6 +25,8 @@
     fuse_convtranspose_bn,
 )
 
+__all__: List[str] = []
+
 # TODO: rename to be more explict, e.g. qat_conv_relu
 _ConvMetadata = namedtuple(
     "_ConvMetadata",
@@ -613,10 +615,3 @@ def _get_embedding_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendP
                 .set_reference_quantized_module(ref_embedding_op)
                 ._set_input_output_observed(False))  # This is temporary, and will be removed soon
     return embedding_op_configs
-
-__all__ = [
-    "_get_binary_op_configs",
-    "_get_linear_configs",
-    "_get_conv_configs",
-    "_get_share_qparams_op_configs",
-]
diff --git a/torch/ao/quantization/backend_config/fbgemm.py b/torch/ao/quantization/backend_config/fbgemm.py
index de38272b00e9f..d2bc87879c44f 100644
--- a/torch/ao/quantization/backend_config/fbgemm.py
+++ b/torch/ao/quantization/backend_config/fbgemm.py
@@ -13,6 +13,9 @@
 )
 from .backend_config import BackendConfig, DTypeConfig
 
+__all__ = [
+    "get_fbgemm_backend_config",
+]
 
 # ===================
 # |  DTYPE CONFIGS  |
@@ -108,7 +111,3 @@ def get_fbgemm_backend_config() -> BackendConfig:
         .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
-
-__all__ = [
-    "get_fbgemm_backend_config",
-]
diff --git a/torch/ao/quantization/backend_config/native.py b/torch/ao/quantization/backend_config/native.py
index f584aff82a12b..ad5a12e6053b1 100644
--- a/torch/ao/quantization/backend_config/native.py
+++ b/torch/ao/quantization/backend_config/native.py
@@ -14,6 +14,19 @@
 )
 from .backend_config import BackendConfig, DTypeConfig
 
+__all__ = [
+    "get_test_only_legacy_native_backend_config",
+    "default_op_quint8_dtype_config",
+    "default_op_fp16_dtype_config",
+    "default_dynamic_int8_dtype_config",
+    "default_dynamic_float16_dtype_config",
+    "input_output_only_quint8_dtype_config",
+    "weight_only_quint8_dtype_config",
+    "weight_only_quint4x2_dtype_config",
+    "get_native_backend_config",
+    "get_native_backend_config_dict",
+    "get_test_only_legacy_native_backend_config_dict",
+]
 
 # ===================
 # |  DTYPE CONFIGS  |
@@ -182,17 +195,3 @@ def get_test_only_legacy_native_backend_config_dict():
     fp16 ops in dictionary form.
     """
     return get_test_only_legacy_native_backend_config().to_dict()
-
-__all__ = [
-    "get_test_only_legacy_native_backend_config",
-    "default_op_quint8_dtype_config",
-    "default_op_fp16_dtype_config",
-    "default_dynamic_int8_dtype_config",
-    "default_dynamic_float16_dtype_config",
-    "input_output_only_quint8_dtype_config",
-    "weight_only_quint8_dtype_config",
-    "weight_only_quint4x2_dtype_config",
-    "get_native_backend_config",
-    "get_native_backend_config_dict",
-    "get_test_only_legacy_native_backend_config_dict",
-]
diff --git a/torch/ao/quantization/backend_config/qnnpack.py b/torch/ao/quantization/backend_config/qnnpack.py
index 391acf55614af..772a25c655744 100644
--- a/torch/ao/quantization/backend_config/qnnpack.py
+++ b/torch/ao/quantization/backend_config/qnnpack.py
@@ -13,6 +13,9 @@
 )
 from .backend_config import BackendConfig, DTypeConfig, DTypeWithConstraints
 
+__all__ = [
+    "get_qnnpack_backend_config",
+]
 
 # ===================
 # |  DTYPE CONFIGS  |
@@ -155,7 +158,3 @@ def get_qnnpack_backend_config() -> BackendConfig:
         .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
-
-__all__ = [
-    "get_qnnpack_backend_config",
-]
diff --git a/torch/ao/quantization/backend_config/tensorrt.py b/torch/ao/quantization/backend_config/tensorrt.py
index 9b6fb39e06160..a617f765adf77 100644
--- a/torch/ao/quantization/backend_config/tensorrt.py
+++ b/torch/ao/quantization/backend_config/tensorrt.py
@@ -12,6 +12,11 @@
     _get_share_qparams_op_configs,
 )
 
+__all__ = [
+    "get_tensorrt_backend_config",
+    "get_tensorrt_backend_config_dict",
+]
+
 def get_tensorrt_backend_config() -> BackendConfig:
     """
     Return the `BackendConfig` for the TensorRT backend.
@@ -69,8 +74,3 @@ def get_tensorrt_backend_config_dict():
     Return the `BackendConfig` for the TensorRT backend in dictionary form.
     """
     return get_tensorrt_backend_config().to_dict()
-
-__all__ = [
-    "get_tensorrt_backend_config",
-    "get_tensorrt_backend_config_dict",
-]
diff --git a/torch/ao/quantization/backend_config/x86.py b/torch/ao/quantization/backend_config/x86.py
index ce92ed9bc42b2..78a3f76187821 100644
--- a/torch/ao/quantization/backend_config/x86.py
+++ b/torch/ao/quantization/backend_config/x86.py
@@ -13,6 +13,9 @@
 )
 from .backend_config import BackendConfig, DTypeConfig
 
+__all__ = [
+    "get_x86_backend_config",
+]
 
 # ===================
 # |  DTYPE CONFIGS  |
@@ -105,7 +108,3 @@ def get_x86_backend_config() -> BackendConfig:
         .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
         .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
-
-__all__ = [
-    "get_x86_backend_config",
-]

From 1916d2de22a9cf59b1720a0b8ecb3941070c8fe9 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Thu, 8 Dec 2022 14:49:37 -0800
Subject: [PATCH 1764/1922] [ao] public vs private for ao.quantization._X
 (#88392)

Summary: added all for these modules without altering names since they
tend to be experimental

Test Plan: python test/test_public_bindings.py

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D41015543](https://our.internmc.facebook.com/intern/diff/D41015543)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/88392
Approved by: https://github.com/jcaip
---
 torch/ao/quantization/_correct_bias.py            |  8 ++++++++
 torch/ao/quantization/_equalize.py                | 13 +++++++++++++
 torch/ao/quantization/_learnable_fake_quantize.py |  2 ++
 3 files changed, 23 insertions(+)

diff --git a/torch/ao/quantization/_correct_bias.py b/torch/ao/quantization/_correct_bias.py
index 0d9017533166a..7dfc58dfe52ad 100644
--- a/torch/ao/quantization/_correct_bias.py
+++ b/torch/ao/quantization/_correct_bias.py
@@ -5,6 +5,14 @@
 import torch.ao.quantization
 import torch.ao.ns._numeric_suite as ns
 
+__all__ = [
+    "get_module",
+    "parent_child_names",
+    "get_param",
+    "MeanShadowLogger",
+    "bias_correction",
+]
+
 _supported_modules = {nn.Linear, nn.Conv2d}
 _supported_modules_quantized = {nnq.Linear, nnq.Conv2d}
 
diff --git a/torch/ao/quantization/_equalize.py b/torch/ao/quantization/_equalize.py
index 1da025ca7a0d2..b15ffc65b7ad1 100644
--- a/torch/ao/quantization/_equalize.py
+++ b/torch/ao/quantization/_equalize.py
@@ -2,6 +2,19 @@
 import copy
 from typing import Dict, Any
 
+__all__ = [
+    "set_module_weight",
+    "set_module_bias",
+    "get_module_weight",
+    "get_module_bias",
+    "max_over_ndim",
+    "min_over_ndim",
+    "channel_range",
+    "cross_layer_equalization",
+    "equalize",
+    "converged",
+]
+
 _supported_types = {torch.nn.Conv2d, torch.nn.Linear}
 _supported_intrinsic_types = {torch.nn.intrinsic.ConvReLU2d, torch.nn.intrinsic.LinearReLU}
 _all_supported_types = _supported_types.union(_supported_intrinsic_types)
diff --git a/torch/ao/quantization/_learnable_fake_quantize.py b/torch/ao/quantization/_learnable_fake_quantize.py
index 9be2a4c5900ad..10600363d3564 100644
--- a/torch/ao/quantization/_learnable_fake_quantize.py
+++ b/torch/ao/quantization/_learnable_fake_quantize.py
@@ -1,6 +1,8 @@
 import torch
 from torch.nn.parameter import Parameter
+from typing import List
 
+__all__: List[str] = []
 
 class _LearnableFakeQuantize(torch.ao.quantization.FakeQuantizeBase):
     r""" This is an extension of the FakeQuantize module in fake_quantize.py, which

From 337638ced745be7d5e1c744906f811f90bde716f Mon Sep 17 00:00:00 2001
From: Sean Ross-Ross <srossross@gmail.com>
Date: Fri, 9 Dec 2022 05:58:07 +0000
Subject: [PATCH 1765/1922] Implement checks for vmap escaped errors (#89585)

Follow on to https://github.com/pytorch/pytorch/pull/89077
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89585
Approved by: https://github.com/zou3519
---
 .../ATen/functorch/BatchRulesConvolution.cpp   |  2 +-
 aten/src/ATen/functorch/BatchRulesHelper.h     |  4 ++--
 aten/src/ATen/functorch/BatchRulesLoss.cpp     |  4 ++--
 aten/src/ATen/functorch/BatchRulesNorm.cpp     | 14 +++++++-------
 .../src/ATen/functorch/BatchRulesReduceOps.cpp |  2 +-
 .../ATen/functorch/BatchRulesScatterOps.cpp    |  8 ++++----
 aten/src/ATen/functorch/BatchRulesViews.cpp    |  2 +-
 test/functorch/test_vmap.py                    | 18 +++++++++++++++++-
 8 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp
index 79523ed1fb6d9..90cd68b2e0da1 100644
--- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp
+++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp
@@ -439,7 +439,7 @@ std::tuple<Tensor,Tensor,Tensor> convolution_backward_plumbing(
     IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed,
     c10::SymIntArrayRef output_padding, int64_t groups, std::array<bool, 3> output_mask) {
   const auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "convolution_backward_plumbing");
   int64_t cur_level = maybe_layer->layerId();
 
   if (!areAnyBatchedAtLevel({grad_output_, input_, weight_}, cur_level)){
diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h
index 2efc12d4c993e..8e78ba71029b1 100644
--- a/aten/src/ATen/functorch/BatchRulesHelper.h
+++ b/aten/src/ATen/functorch/BatchRulesHelper.h
@@ -245,7 +245,7 @@ inline void boxed_existing_bdim_all_batch_rule(
 
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "boxed_existing_bdim_all_batch_rule");
   int64_t cur_level = maybe_layer->layerId();
 
   const auto arguments = torch::jit::last(stack, num_arguments);
@@ -301,7 +301,7 @@ inline void boxed_all_tensors_have_optional_bdim(
 
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "boxed_all_tensors_have_optional_bdim");
   int64_t cur_level = maybe_layer->layerId();
 
   const auto arguments = torch::jit::last(stack, num_arguments);
diff --git a/aten/src/ATen/functorch/BatchRulesLoss.cpp b/aten/src/ATen/functorch/BatchRulesLoss.cpp
index 66c2b7fb3194d..6429856572878 100644
--- a/aten/src/ATen/functorch/BatchRulesLoss.cpp
+++ b/aten/src/ATen/functorch/BatchRulesLoss.cpp
@@ -59,7 +59,7 @@ Tensor binary_cross_entropy_plumbing(
     const Tensor& self, const Tensor& target,
     const optional<Tensor>& weight, int64_t reduction) {
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "binary_cross_entropy_plumbing");
   int64_t cur_level = maybe_layer->layerId();
 
   if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(target, cur_level)
@@ -99,7 +99,7 @@ Tensor binary_cross_entropy_backward_plumbing(
     const Tensor& grad, const Tensor& input, const Tensor& target,
     const c10::optional<Tensor>& weight_opt, int64_t reduction) {
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "binary_cross_entropy_backward_plumbing");
   int64_t cur_level = maybe_layer->layerId();
 
   if (!areAnyBatchedAtLevel({grad, input, target, weight_opt}, cur_level)) {
diff --git a/aten/src/ATen/functorch/BatchRulesNorm.cpp b/aten/src/ATen/functorch/BatchRulesNorm.cpp
index d53d4f6a2e972..bdd80540e649c 100644
--- a/aten/src/ATen/functorch/BatchRulesNorm.cpp
+++ b/aten/src/ATen/functorch/BatchRulesNorm.cpp
@@ -222,7 +222,7 @@ std::tuple<at::Tensor,at::Tensor,at::Tensor> batch_norm_backward_plumbing(
 
   // plumbing
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "batch_norm_backward_plumbing");
   int64_t cur_level = maybe_layer->layerId();
 
   Tensor grad_out_value;
@@ -304,7 +304,7 @@ std::tuple<Tensor,Tensor,Tensor> native_group_norm_plumbing(
   const Tensor& bias = *bias_maybe_owned;
 
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "native_group_norm_plumbing");
   int64_t cur_level = maybe_layer->layerId();
 
   if (!areAnyBatchedAtLevel({input, weight_opt, bias_opt}, cur_level)) {
@@ -393,7 +393,7 @@ std::tuple<Tensor,Tensor,Tensor> native_group_norm_backward_plumbing(
 
   // plumbing
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "native_group_norm_backward_plumbing");
   int64_t cur_level = maybe_layer->layerId();
 
   if (!areAnyBatchedAtLevel({grad_out, input, mean, rstd, weight_opt}, cur_level)) {
@@ -604,7 +604,7 @@ std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward_plumbing
 
   // plumbing
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "native_layer_norm_backward_plumbing");
   int64_t cur_level = maybe_layer->layerId();
   if (!areAnyBatchedAtLevel({grad_out, input, mean, rstd, weight_opt, bias_opt}, cur_level)) {
     c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
@@ -756,7 +756,7 @@ struct NativeBatchNormBackwardBatchRuleHelper {
     std::array<bool,3> output_mask) {
 
     auto maybe_layer = maybeCurrentDynamicLayer();
-    TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+    vmap_check_escaped(maybe_layer, "NativeBatchNormBackwardBatchRuleHelper.apply");
     int64_t cur_level = maybe_layer->layerId();
 
     if (!areAnyBatchedAtLevel({grad_out, input, weight_opt, running_mean_opt,
@@ -786,7 +786,7 @@ struct CudnnBatchNormBackwardBatchRuleHelper {
     const at::Tensor & reserve) {
 
     auto maybe_layer = maybeCurrentDynamicLayer();
-    TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+    vmap_check_escaped(maybe_layer, "CudnnBatchNormBackwardBatchRuleHelper.apply");
     int64_t cur_level = maybe_layer->layerId();
 
     if (!areAnyBatchedAtLevel({input, grad_out, weight, running_mean_opt,
@@ -814,7 +814,7 @@ struct MiopenBatchNormBackwardBatchRuleHelper {
     double eps) {
 
     auto maybe_layer = maybeCurrentDynamicLayer();
-    TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+    vmap_check_escaped(maybe_layer, "MiopenBatchNormBackwardBatchRuleHelper.apply");
     int64_t cur_level = maybe_layer->layerId();
 
     if (!areAnyBatchedAtLevel({input, grad_out, weight, running_mean_opt,
diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
index d792a8da3f9c5..ec849c9794b4d 100644
--- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp
@@ -72,7 +72,7 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack
 
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "boxed_reduction_batch_rule");
   int64_t cur_level = maybe_layer->layerId();
 
   auto orig_arguments = torch::jit::last(*stack, num_arguments);
diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
index fc51e9d744099..c1d66369fb1fe 100644
--- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
@@ -319,7 +319,7 @@ Tensor index_plumbing(const Tensor & self, const List<optional<Tensor>> & indice
 ) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "index_plumbing");
   int64_t cur_level = maybe_layer->layerId();
   if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level)) {
     return at::index(self, indices);
@@ -506,7 +506,7 @@ Tensor& index_put__plumbing(Tensor & self, const List<optional<Tensor>> & indice
 , const Tensor & values, bool accumulate) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "index_put__plumbing");
   int64_t cur_level = maybe_layer->layerId();
   if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level) && !isBatchedAtLevel(values, cur_level)) {
     return self.index_put_(indices, values, accumulate);
@@ -545,7 +545,7 @@ Tensor &_index_put_impl__plumbing(Tensor &self, const List<optional<Tensor>> &in
                                   const Tensor &values, bool accumulate, bool unsafe) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "_index_put_impl__plumbing");
   int64_t cur_level = maybe_layer->layerId();
   if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level) && !isBatchedAtLevel(values, cur_level)) {
     return at::_index_put_impl_(self, indices, values, accumulate, unsafe);
@@ -666,7 +666,7 @@ Tensor index_put_plumbing(const Tensor & self, const List<optional<Tensor>> & in
                           const Tensor & values, bool accumulate) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "index_put_plumbing");
   int64_t cur_level = maybe_layer->layerId();
   if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level) && !isBatchedAtLevel(values, cur_level)) {
     return self.index_put(indices, values, accumulate);
diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp
index d5e5161fb3a31..e083d9d1c4ea5 100644
--- a/aten/src/ATen/functorch/BatchRulesViews.cpp
+++ b/aten/src/ATen/functorch/BatchRulesViews.cpp
@@ -172,7 +172,7 @@ const Tensor& resize__plumbing(
       optional_memory_format == c10::MemoryFormat::Contiguous,
       "resize_: batching rule only supports None or Contiguous MemoryFormat");
   auto maybe_layer = maybeCurrentDynamicLayer();
-  TORCH_INTERNAL_ASSERT(maybe_layer.has_value());
+  vmap_check_escaped(maybe_layer, "resize__plumbing");
   int64_t cur_level = maybe_layer->layerId();
   if (!isBatchedAtLevel(self, cur_level)) {
     c10::impl::ExcludeDispatchKeyGuard guard2(DispatchKey::FuncTorchBatched);
diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py
index b85da534be8fa..b07928d128b6e 100644
--- a/test/functorch/test_vmap.py
+++ b/test/functorch/test_vmap.py
@@ -3927,11 +3927,13 @@ def f(x):
             escaped = x
             return x ** 2
 
-        x = torch.randn(3)
+        x = torch.randn([3, 3, 3, 3, 3])
         vmap(f)(x)
 
         common_message = r"your tensor may have escaped from inside a function being vmapped.*{0}.*"
 
+        # Note: These are not a complete set of tests for all possible functions calling 'vmap_check_escaped'
+
         with self.assertRaisesRegex(RuntimeError, common_message.format("gen_vmap_plumbing")):
             escaped.sin()
 
@@ -3941,6 +3943,20 @@ def f(x):
         with self.assertRaisesRegex(RuntimeError, common_message.format("gen_vmap_inplace_plumbing")):
             escaped.mul_(1)
 
+        with self.assertRaisesRegex(RuntimeError, common_message.format("binary_cross_entropy_plumbing")):
+            torch.nn.functional.binary_cross_entropy(escaped, torch.zeros([3, 3, 3, 3]))
+
+        with self.assertRaisesRegex(RuntimeError, common_message.format("boxed_existing_bdim_all_batch_rule")):
+            torch.nn.functional.adaptive_max_pool2d(escaped, output_size=(1, 1))
+
+        with self.assertRaisesRegex(RuntimeError, common_message.format("boxed_reduction_batch_rule")):
+            escaped.argmin()
+
+        a = torch.zeros([4, 4, 4, 4])
+        b = torch.zeros([4, 4, 4, 4], dtype=torch.long)
+        with self.assertRaisesRegex(RuntimeError, common_message.format("boxed_all_tensors_have_optional_bdim")):
+            torch.ops.aten.adaptive_max_pool2d_backward(escaped, a, b)
+
         vmap(f)(torch.tensor([[0, 0], [0, 0]], dtype=torch.int))
         with self.assertRaisesRegex(RuntimeError, common_message.format("gen_vmap_plumbing_no_returns")):
             torch.ops.aten._linalg_check_errors(escaped, 'linalg.inv', is_matrix=False)

From eb9abd79a4a4d236440c2497dbff5973f619bf03 Mon Sep 17 00:00:00 2001
From: Alex Settle <alexmsettle@gmail.com>
Date: Fri, 9 Dec 2022 06:20:28 +0000
Subject: [PATCH 1766/1922] Reland "Add heirachical module names to torchFX
 graph.node" (#90205)

Fixes #87659

Reland of PR #87742

Resolves errors that caused the changes to be backed out.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90205
Approved by: https://github.com/jerryzh168
---
 .../ao_migration/test_quantization_fx.py      |  2 -
 test/test_fx.py                               | 30 ++++++
 torch/ao/quantization/fx/tracer.py            | 88 +-----------------
 torch/ao/quantization/quantize_fx.py          | 55 -----------
 torch/fx/_symbolic_trace.py                   | 29 ++++--
 torch/fx/proxy.py                             | 93 ++++++++++++++++++-
 torch/quantization/quantize_fx.py             |  2 -
 7 files changed, 145 insertions(+), 154 deletions(-)

diff --git a/test/quantization/ao_migration/test_quantization_fx.py b/test/quantization/ao_migration/test_quantization_fx.py
index 1e64dd4ebfbf3..fed2921cea722 100644
--- a/test/quantization/ao_migration/test_quantization_fx.py
+++ b/test/quantization/ao_migration/test_quantization_fx.py
@@ -11,8 +11,6 @@ def test_function_import_quantize_fx(self):
             '_check_is_graph_module',
             '_swap_ff_with_fxff',
             '_fuse_fx',
-            'Scope',
-            'ScopeContextManager',
             'QuantizationTracer',
             '_prepare_fx',
             '_prepare_standalone_module_fx',
diff --git a/test/test_fx.py b/test/test_fx.py
index bd92fbbedceb9..a9e186a2f7f0c 100644
--- a/test/test_fx.py
+++ b/test/test_fx.py
@@ -1679,6 +1679,36 @@ def forward(self, x):
             if node.op in {'placeholder'}:
                 self.assertEqual(node.meta['tensor_meta'].memory_format, torch.channels_last_3d)
 
+    def test_nn_module_stack(self):
+        class SubModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.conv_mod = torch.nn.Conv2d(64, 64, (3, 3), padding=1, bias=False)
+
+            def forward(self, x):
+                return self.conv_mod(x)
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.sub_mod = SubModule()
+
+            def forward(self, x):
+                return self.sub_mod(x)
+
+        m = MyModule()
+        gm = torch.fx.symbolic_trace(m)
+
+        mod_stack = {}
+        expected_stack = [('sub_mod', str(type(m.sub_mod))),
+                          ('sub_mod.conv_mod', str(type(m.sub_mod.conv_mod)))]
+        for node in gm.graph.nodes:
+            mod_stack = node.meta.get('nn_module_stack', {})
+            if mod_stack:
+                break
+        stack_list = list(mod_stack.items())
+        self.assertEqual(stack_list, expected_stack)
+
     def test_interpreter(self):
         class MyModule(torch.nn.Module):
             def __init__(self):
diff --git a/torch/ao/quantization/fx/tracer.py b/torch/ao/quantization/fx/tracer.py
index 3a959447cfd6b..1ac98a13c548e 100644
--- a/torch/ao/quantization/fx/tracer.py
+++ b/torch/ao/quantization/fx/tracer.py
@@ -1,67 +1,13 @@
 import torch
 from torch.fx._symbolic_trace import Tracer
-from torch.fx.node import Target, Node, Argument
+from torch.fx.proxy import Scope
 from torch.nn.intrinsic import _FusedModule
-from typing import List, Callable, Tuple, Any, Dict, Optional
+from typing import List, Callable
 
 __all__ = [
     "QuantizationTracer",
 ]
 
-class Scope(object):
-    """ Scope object that records the module path and the module type
-    of a module. Scope is used to track the information of the module
-    that contains a Node in a Graph of GraphModule. For example::
-
-        class Sub(torch.nn.Module):
-            def forward(self, x):
-                # This will be a call_method Node in GraphModule,
-                # scope for this would be (module_path="sub", module_type=Sub)
-                return x.transpose(1, 2)
-
-        class M(torch.nn.Module):
-            def __init__(self):
-                self.sub = Sub()
-
-            def forward(self, x):
-                # This will be a call_method Node as well,
-                # scope for this would be (module_path="", None)
-                x = x.transpose(1, 2)
-                x = self.sub(x)
-                return x
-
-    """
-
-    def __init__(self, module_path: str, module_type: Any):
-        super().__init__()
-        self.module_path = module_path
-        self.module_type = module_type
-
-
-class ScopeContextManager(object):
-    """ A context manager to track the Scope of Node during symbolic tracing.
-    When entering a forward function of a Module, we'll update the scope information of
-    the current module, and when we exit, we'll restore the previous scope information.
-    """
-
-    def __init__(
-        self, scope: Scope, current_module: torch.nn.Module, current_module_path: str
-    ):
-        super().__init__()
-        self.prev_module_type = scope.module_type
-        self.prev_module_path = scope.module_path
-        self.scope = scope
-        self.scope.module_path = current_module_path
-        self.scope.module_type = type(current_module)
-
-    def __enter__(self):
-        return
-
-    def __exit__(self, *args):
-        self.scope.module_path = self.prev_module_path
-        self.scope.module_type = self.prev_module_type
-        return
-
 class QuantizationTracer(Tracer):
     def __init__(
         self, skipped_module_names: List[str], skipped_module_classes: List[Callable]
@@ -75,7 +21,6 @@ def __init__(
         # We can change this if there is a use case that configures
         # qconfig using top level module type
         self.scope = Scope("", None)
-        self.node_name_to_scope: Dict[str, Tuple[str, type]] = {}
         self.record_stack_traces = True
 
     def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
@@ -88,32 +33,3 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool
             or type(m) in self.skipped_module_classes
             or isinstance(m, _FusedModule)
         )
-
-    def call_module(
-        self,
-        m: torch.nn.Module,
-        forward: Callable[..., Any],
-        args: Tuple[Any, ...],
-        kwargs: Dict[str, Any],
-    ) -> Any:
-        module_qualified_name = self.path_of_module(m)
-        # Creating scope with information of current module
-        # scope will be restored automatically upon exit
-        with ScopeContextManager(self.scope, m, module_qualified_name):
-            return super().call_module(m, forward, args, kwargs)
-
-    def create_node(
-        self,
-        kind: str,
-        target: Target,
-        args: Tuple[Argument, ...],
-        kwargs: Dict[str, Argument],
-        name: Optional[str] = None,
-        type_expr: Optional[Any] = None,
-    ) -> Node:
-        node = super().create_node(kind, target, args, kwargs, name, type_expr)
-        self.node_name_to_scope[node.name] = (
-            self.scope.module_path,
-            self.scope.module_type,
-        )
-        return node
diff --git a/torch/ao/quantization/quantize_fx.py b/torch/ao/quantization/quantize_fx.py
index c6b1a0758492b..2bcd2e4ca7125 100644
--- a/torch/ao/quantization/quantize_fx.py
+++ b/torch/ao/quantization/quantize_fx.py
@@ -63,61 +63,6 @@ def _fuse_fx(
         graph_module, is_qat, fuse_custom_config, backend_config)  # type: ignore[operator]
 
 
-class Scope(object):
-    """ Scope object that records the module path and the module type
-    of a module. Scope is used to track the information of the module
-    that contains a Node in a Graph of GraphModule. For example::
-
-        class Sub(torch.nn.Module):
-            def forward(self, x):
-                # This will be a call_method Node in GraphModule,
-                # scope for this would be (module_path="sub", module_type=Sub)
-                return x.transpose(1, 2)
-
-        class M(torch.nn.Module):
-            def __init__(self):
-                self.sub = Sub()
-
-            def forward(self, x):
-                # This will be a call_method Node as well,
-                # scope for this would be (module_path="", None)
-                x = x.transpose(1, 2)
-                x = self.sub(x)
-                return x
-
-    """
-
-    def __init__(self, module_path: str, module_type: Any):
-        super().__init__()
-        self.module_path = module_path
-        self.module_type = module_type
-
-
-class ScopeContextManager(object):
-    """ A context manager to track the Scope of Node during symbolic tracing.
-    When entering a forward function of a Module, we'll update the scope information of
-    the current module, and when we exit, we'll restore the previous scope information.
-    """
-
-    def __init__(
-        self, scope: Scope, current_module: torch.nn.Module, current_module_path: str
-    ):
-        super().__init__()
-        self.prev_module_type = scope.module_type
-        self.prev_module_path = scope.module_path
-        self.scope = scope
-        self.scope.module_path = current_module_path
-        self.scope.module_type = type(current_module)
-
-    def __enter__(self):
-        return
-
-    def __exit__(self, *args):
-        self.scope.module_path = self.prev_module_path
-        self.scope.module_type = self.prev_module_type
-        return
-
-
 def _prepare_fx(
     model: torch.nn.Module,
     qconfig_mapping: Union[QConfigMapping, Dict[str, Any]],
diff --git a/torch/fx/_symbolic_trace.py b/torch/fx/_symbolic_trace.py
index ff9df1161a700..dfa6f5096042d 100644
--- a/torch/fx/_symbolic_trace.py
+++ b/torch/fx/_symbolic_trace.py
@@ -5,6 +5,7 @@
 import math
 import os
 import warnings
+import collections
 from itertools import chain
 from types import CodeType, FunctionType, ModuleType
 from typing import (
@@ -28,7 +29,7 @@
 from .graph import _PyTreeCodeGen, _PyTreeInfo, Graph
 from .graph_module import GraphModule
 from .node import Argument, base_types, map_aggregate
-from .proxy import ParameterProxy, Proxy, TracerBase
+from .proxy import ParameterProxy, Proxy, TracerBase, Scope, ScopeContextManager
 
 HAS_VARSTUFF = inspect.CO_VARARGS | inspect.CO_VARKEYWORDS
 
@@ -44,7 +45,6 @@
 def is_fx_tracing():
     return _is_fx_tracing_flag
 
-
 @compatibility(is_backward_compatible=True)
 class ProxyableClassMeta(type):
     """
@@ -250,6 +250,13 @@ def __init__(
         self.param_shapes_constant = param_shapes_constant
 
         self.submodule_paths: Optional[Dict[torch.nn.Module, str]] = None
+        self.root_module_name: str = ""
+        # Maps the containing module's name to the operator name
+        self.scope = Scope("", None)
+        # Records the module call stack
+        self.module_stack = collections.OrderedDict()
+        # Mapping of node name to module scope
+        self.node_name_to_scope: Dict[str, Tuple[str, type]] = {}
 
     @compatibility(is_backward_compatible=True)
     def create_arg(self, a: Any) -> "Argument":
@@ -430,9 +437,18 @@ def call_module(
             value was returned from the ``Module`` invocation.
         """
         module_qualified_name = self.path_of_module(m)
-        if not self.is_leaf_module(m, module_qualified_name):
-            return forward(*args, **kwargs)
-        return self.create_proxy("call_module", module_qualified_name, args, kwargs)
+        with ScopeContextManager(self.scope, Scope(module_qualified_name, type(m))) as _scope:
+            # module_stack is an ordered dict so writing then deleting the
+            # entry is equivalent to push/pop on a list
+            self.module_stack[_scope.module_path] = str(_scope.module_type)
+            if not self.is_leaf_module(m, module_qualified_name):
+                ret_val = forward(*args, **kwargs)
+            else:
+                ret_val = self.create_proxy("call_module", module_qualified_name, args, kwargs)
+            key, _ = self.module_stack.popitem(last=True)
+            assert key == _scope.module_path, f" Unexpected key {key}"
+
+        return ret_val
 
     @compatibility(is_backward_compatible=False)
     def getattr(self, attr: str, attr_val: Any, parameter_proxy_cache: Dict[str, Any]):
@@ -580,7 +596,7 @@ def replace_ph(x):
                 name,
                 default,
                 {},
-                type_expr=fn_for_analysis.__annotations__.get(name, None),
+                type_expr=fn_for_analysis.__annotations__.get(name, None)
             )
 
         arg_names = [next(names_iter) for idx in range(skip_arg_idx, total_args)]
@@ -663,6 +679,7 @@ def trace(
                 ), f"traced_func_name={self.traced_func_name} doesn't exist in {type(root).__name__}"
 
                 fn = getattr(type(root), self.traced_func_name)
+                self.root_module_name = root._get_name()
                 self.submodule_paths = {mod: name for name, mod in root.named_modules()}
             else:
                 self.root = torch.nn.Module()
diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py
index 6f9535b117370..c77967574b59a 100644
--- a/torch/fx/proxy.py
+++ b/torch/fx/proxy.py
@@ -1,17 +1,83 @@
 import dis
+import copy
 import torch
 import inspect
 import operator
 import traceback
+import collections
 
 from .graph import magic_methods, reflectable_magic_methods, Graph
-from typing import Tuple, Dict, Optional, Iterable, Any, Iterator, Callable
+from typing import Tuple, Dict, OrderedDict, Optional, Iterable, Any, Iterator, Callable
 from .node import Target, Node, Argument, base_types, map_aggregate
 from ._compatibility import compatibility
 from .operator_schemas import check_for_mutable_operation
 import torch.fx.traceback as fx_traceback
 
-__all__ = ['TracerBase', 'GraphAppendingTracer', 'TraceError', 'Proxy', 'Attribute', 'ParameterProxy']
+__all__ = ['TracerBase', 'GraphAppendingTracer', 'TraceError',
+           'Proxy', 'Attribute', 'ParameterProxy', 'Scope',
+           'ScopeContextManager']
+
+
+@compatibility(is_backward_compatible=False)
+class Scope(object):
+    """ Scope object that records the module path and the module type
+    of a module. Scope is used to track the information of the module
+    that contains a Node in a Graph of GraphModule. For example::
+
+        class Sub(torch.nn.Module):
+            def forward(self, x):
+                # This will be a call_method Node in GraphModule,
+                # scope for this would be (module_path="sub", module_type=Sub)
+                return x.transpose(1, 2)
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                self.sub = Sub()
+
+            def forward(self, x):
+                # This will be a call_method Node as well,
+                # scope for this would be (module_path="", None)
+                x = x.transpose(1, 2)
+                x = self.sub(x)
+                return x
+
+    """
+
+    def __init__(self, module_path: str, module_type: Any):
+        super().__init__()
+        self.module_path = module_path
+        self.module_type = module_type
+
+
+@compatibility(is_backward_compatible=False)
+class ScopeContextManager(object):
+    """ A context manager to track the Scope of Node during symbolic tracing.
+    When entering a forward function of a Module, we'll update the scope information of
+    the current module, and when we exit, we'll restore the previous scope information.
+    """
+
+    def __init__(
+        self,
+        scope: Scope,
+        current_scope: Scope,
+    ):
+        super().__init__()
+        # Keep a copy of prev scope to restore on exit
+        self._prev_scope = copy.copy(scope)
+        # Update scope to current scope
+        scope.module_path = current_scope.module_path
+        scope.module_type = current_scope.module_type
+        # Save a reference so we can restore it
+        self._scope = scope
+
+    def __enter__(self):
+        return self._scope
+
+    def __exit__(self, *args):
+        self._scope.module_path = self._prev_scope.module_path
+        self._scope.module_type = self._prev_scope.module_type
+        return
+
 
 @compatibility(is_backward_compatible=True)
 class TracerBase:
@@ -29,6 +95,15 @@ class TracerBase:
     # ``root`` is an instance of ``nn.Module``
     traced_func_name: str = "forward"
 
+    # Maps the containing module's name to the operator name
+    scope : Scope
+
+    # Records the module call stack
+    module_stack: OrderedDict[str, str]
+
+    # Mapping of node name to module scope
+    node_name_to_scope: Dict[str, Tuple[str, type]]
+
     @compatibility(is_backward_compatible=True)
     def create_node(self, kind : str, target : Target,
                     args : Tuple[Argument, ...], kwargs : Dict[str, Argument], name : Optional[str] = None,
@@ -43,7 +118,16 @@ def create_node(self, kind : str, target : Target,
         if kind == 'call_function' and self.check_mutable_operations:
             check_for_mutable_operation(target, args, kwargs)
 
-        return self.graph.create_node(kind, target, args, kwargs, name, type_expr)
+        node = self.graph.create_node(kind, target, args, kwargs, name, type_expr)
+        # TODO node_name_to_scope will be depricated in favor of
+        # node.meta['nn_module_stack']
+        self.node_name_to_scope[node.name] = (
+            self.scope.module_path,
+            self.scope.module_type,
+        )
+        if self.module_stack:
+            node.meta['nn_module_stack'] = copy.copy(self.module_stack)
+        return node
 
     @compatibility(is_backward_compatible=True)
     def proxy(self, node: Node) -> 'Proxy':
@@ -207,6 +291,9 @@ class GraphAppendingTracer(TracerBase):
     def __init__(self, graph: Graph):
         super().__init__()
         self.graph = graph
+        self.scope = Scope("", None)
+        self.module_stack = collections.OrderedDict()
+        self.node_name_to_scope = {}
 
 @compatibility(is_backward_compatible=False)
 def assert_fn(x):
diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py
index aad3bc7253e4b..1f519e991ca64 100644
--- a/torch/quantization/quantize_fx.py
+++ b/torch/quantization/quantize_fx.py
@@ -11,8 +11,6 @@
     _check_is_graph_module,
     _swap_ff_with_fxff,
     _fuse_fx,
-    Scope,
-    ScopeContextManager,
     QuantizationTracer,
     _prepare_fx,
     _prepare_standalone_module_fx,

From 761c73776fabff78f54ff004fcdc93e81d39e580 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 19 Jul 2022 11:38:17 -0700
Subject: [PATCH 1767/1922] Add back support for PYTORCH_TEST_WITH_MPS (#66)

Fix the TEST_WITH_MPS macro.
---
 torch/testing/_internal/common_device_type.py | 8 +++-----
 torch/testing/_internal/common_utils.py       | 1 +
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index cf3c3b4189815..df130de09c9de 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -13,7 +13,7 @@
 import torch.backends.mps
 from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \
     skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \
-    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, \
+    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, TEST_WITH_MPS, \
     _TestParametrizer, compose_parametrize_fns, dtype_name, \
     TEST_WITH_MIOPEN_SUGGEST_NHWC, NATIVE_DEVICES, skipIfTorchDynamo
 from torch.testing._internal.common_cuda import _get_torch_cuda_version, \
@@ -531,10 +531,8 @@ def get_device_type_test_bases():
         test_bases.append(CPUTestBase)
         if torch.cuda.is_available():
             test_bases.append(CUDATestBase)
-        # Disable MPS testing in generic device testing temporarily while we're
-        # ramping up support.
-        # elif torch.backends.mps.is_available():
-        #   test_bases.append(MPSTestBase)
+        elif torch.backends.mps.is_available():
+          test_bases.append(MPSTestBase)
 
     return test_bases
 
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 87eb9280407ec..545be1904b7c8 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -887,6 +887,7 @@ def _check_module_exists(name: str) -> bool:
 TEST_WITH_TSAN = os.getenv('PYTORCH_TEST_WITH_TSAN', '0') == '1'
 TEST_WITH_UBSAN = os.getenv('PYTORCH_TEST_WITH_UBSAN', '0') == '1'
 TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1'
+TEST_WITH_MPS = os.getenv('PYTORCH_TEST_WITH_MPS', '0') == '1'
 
 # TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
 # See #64427

From fa09cf9e000e971052144f7e00c5a288ab2ca985 Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Thu, 11 Aug 2022 09:42:19 -0700
Subject: [PATCH 1768/1922] Add error messages for int64 non-available ops
 (#80)

* Add error messages for int64 non-available ops

* Move warning to common code
---
 aten/src/ATen/native/mps/operations/Activation.mm | 1 +
 aten/src/ATen/native/mps/operations/UnaryOps.mm   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index 618a00f337876..223599121c0e1 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -416,6 +416,7 @@ Tensor relu_mps(const Tensor& self) {
   using namespace mps;
   using CachedGraph = MPSUnaryCachedGraph;
   TORCH_CHECK(output.is_mps());
+  TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support sigmoid op with int64 input")
 
   if(output.numel() == 0) {
     return;
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 3d641d3af82cc..3f075ed87bd0b 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -188,6 +188,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
 
 TORCH_IMPL_FUNC(log1p_out_mps) (const Tensor& self, const Tensor& output)
 {
+    TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support log1p op with int64 input")
     using namespace mps;
     if (!output.is_same_size(self)) {
       output.resize_(self.sizes());

From a96aef3bf3974365017b56177f5938ac7a9706ac Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 11 Aug 2022 13:08:07 -0400
Subject: [PATCH 1769/1922] Return input in addcmul/div if value is zero (#84)

Also remove the unnecessary resize (structured op)
---
 aten/src/ATen/native/mps/operations/PointwiseOps.mm | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/PointwiseOps.mm b/aten/src/ATen/native/mps/operations/PointwiseOps.mm
index 8da6b94dd8569..4c3e7d9e50cc1 100644
--- a/aten/src/ATen/native/mps/operations/PointwiseOps.mm
+++ b/aten/src/ATen/native/mps/operations/PointwiseOps.mm
@@ -15,8 +15,9 @@
                              const bool is_div,
                              const string op_name)
 {
-  if (&output != &self) {
-    output.resize_(output.sizes());
+  if (value_opt.toDouble() == 0.0) {
+    output.copy_(self);
+    return output;
   }
 
   if(output.numel() == 0) {
@@ -49,7 +50,7 @@
             newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
             newCachedGraph->firstTensor = mpsGraphRankedPlaceHolder(mpsGraph, tensor1);
             newCachedGraph->secondTensor = mpsGraphRankedPlaceHolder(mpsGraph, tensor2);
-            newCachedGraph->valueTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSScalarType(self.scalar_type()));
+            newCachedGraph->valueTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(self.scalar_type()), @[@1]);
 
             // the tensor to be optionally multiplied by value_scalar
             MPSGraphTensor *multiplicandTensor = nil;

From c4161df52d1c708772ae024f4be6cbb4eaef438d Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Wed, 17 Aug 2022 16:42:03 -0700
Subject: [PATCH 1770/1922] Update allow and block lists (#88)

* Update allow and block lists

* Add one more to block list

* Add more blocked ops

Co-authored-by: abhipathak97 <abhipathak97@mps10.scv.apple.com>
---
 test/test_mps.py | 128 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/test/test_mps.py b/test/test_mps.py
index 4133ad4938b83..472605472afb8 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7792,6 +7792,134 @@ class TestConsistency(TestCase):
         'inner': None,
         'dstack': None,
         'take_along_dim': None,
+
+        # New block list ops that need investigation
+        '__rdiv__': ['torch.bool', 'torch.int64'], 
+        '__rpow__': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'], 
+        '_masked.amax': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        '_masked.amin': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        '_masked.argmax': ['torch.float16', 'torch.float32', 'torch.int32'], 
+        '_masked.argmin': ['torch.float16', 'torch.float32', 'torch.int32'], 
+        '_masked.logsumexp': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        '_masked.mean': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        '_masked.prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        '_masked.std': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        '_masked.sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        '_masked.var': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'acos': ['torch.bool'], 
+        'acosh': ['torch.bool'], 
+        'amax': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'amin': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'asin': ['torch.bool'], 
+        'asinh': ['torch.bool'], 
+        'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], 
+        'atan': ['torch.bool'], 
+        'atanh': ['torch.bool'], 
+        'bernoulli': ['torch.float32'], 
+        'byte': ['torch.float16', 'torch.float32'], 
+        'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'], 
+        'clamp': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'clamp_max': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'clamp_min': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'constant_pad_nd': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'corrcoef': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'cos': ['torch.bool'], 
+        'cosh': ['torch.bool'], 
+        'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'cov': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'diff': ['torch.bool', 'torch.uint8'], 
+        'eig': ['torch.float32'], 
+        'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'erf': ['torch.bool'], 
+        'exp2': ['torch.bool'], 
+        'exp': ['torch.bool'], 
+        'fft.fft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'fft.ifft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'fft.ihfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'fft.ihfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'fft.ihfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'fft.rfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'fft.rfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'float': ['torch.bool', 'torch.float16', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'gather': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'gradient': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32'], 
+        'half': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'index_select': ['torch.uint8'], 
+        'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'], 
+        'linalg.eigvals': ['torch.float32'], 
+        'linalg.multi_dot': ['torch.float32'], 
+        'log10': ['torch.bool'], 
+        'log1p': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], 
+        'log2': ['torch.bool'], 
+        'log': ['torch.bool'], 
+        'logical_and': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'logical_or': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'logical_xor': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'logsumexp': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'matmul': ['torch.uint8'], 
+        'mean': ['torch.float16', 'torch.float32'], 
+        'native_layer_norm': ['torch.float32'], 
+        'neg': ['torch.uint8'], 
+        'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'nn.functional.adaptive_avg_pool1d': ['torch.float32'], 
+        'nn.functional.adaptive_avg_pool2d': ['torch.float32'], 
+        'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'], 
+        'nn.functional.avg_pool2d': ['torch.float32', 'torch.int64'], 
+        'nn.functional.bilinear': ['torch.float32'], 
+        'nn.functional.conv_transpose2d': ['torch.float32'], 
+        'nn.functional.cosine_embedding_loss': ['torch.uint8'], 
+        'nn.functional.cosine_similarity': ['torch.float32'], 
+        'nn.functional.dropout2d': ['torch.float32'], 
+        'nn.functional.dropout3d': ['torch.float32'], 
+        'nn.functional.dropout': ['torch.float32'], 
+        'nn.functional.gelu': ['torch.float32'], 
+        'nn.functional.interpolate': ['torch.float32', 'torch.float32', 'torch.float32'], 
+        'nn.functional.layer_norm': ['torch.float32'], 
+        'nn.functional.margin_ranking_loss': ['torch.uint8'], 
+        'nn.functional.max_pool1d': ['torch.float32'], 
+        'nn.functional.max_pool2d': ['torch.float32'], 
+        'nn.functional.normalize': ['torch.float32'], 
+        'nn.functional.pad': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'], 
+        'nn.functional.pairwise_distance': ['torch.uint8'], 
+        'nn.functional.softsign': ['torch.int32'], 
+        'nn.functional.triplet_margin_loss': ['torch.uint8'], 
+        'nn.functional.triplet_margin_with_distance_loss': ['torch.uint8'], 
+        'nn.functional.upsample_nearest': ['torch.float32'], 
+        'normal': ['torch.float16', 'torch.float32', 'torch.float16', 'torch.float32'], 
+        'pow': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'], 
+        'prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'rand_like': ['torch.float16', 'torch.float32'], 
+        'randint_like': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'randn_like': ['torch.float16', 'torch.float32'], 
+        'reciprocal': ['torch.bool'], 
+        'rsqrt': ['torch.bool'], 
+        'scatter_add': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'sigmoid': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], 
+        'sin': ['torch.bool'], 
+        'sinh': ['torch.bool'], 
+        'sqrt': ['torch.bool'], 
+        'sub': ['torch.float16', 'torch.uint8'], 
+        'sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'tan': ['torch.bool', 'torch.float32'], 
+        'tanh': ['torch.bool'], 
+        'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'true_divide': ['torch.int32', 'torch.int64'],
+        'nn.functional.local_response_norm': ['torch.int64'],
+        'flip': ['torch.bool'],
+        'fliplr': ['torch.bool'],
+        'flipud': ['torch.bool'],
+        'index_select': ['torch.bool'],
+        'repeat': ['torch.bool'],
+        'rot90': ['torch.bool'],
+        'tile': ['torch.bool'],
     }
 
     # Used for accept mode only

From c51e86edda1f83e7f1056cb7761378154dc3502f Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Thu, 25 Aug 2022 06:57:53 -0700
Subject: [PATCH 1771/1922] Include scalar params in caching key (#94)

* Include scalar params in caching key

* Add key for softplus backward; add test for scalar params
---
 aten/src/ATen/native/mps/operations/Activation.mm | 4 ++--
 test/test_mps.py                                  | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index 223599121c0e1..386b0255bdc0b 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -1476,7 +1476,7 @@ Tensor glu_backward_mps (const Tensor& grad_output,
       MPSScalar threshold_scalar = getMPSScalar(threshold, ScalarType::Float);
 
       @autoreleasepool {
-        string key = "softplus_out_mps:" + getTensorsStringKey({self});
+        string key = "softplus_out_mps:" + getTensorsStringKey({self}) + ":" + std::to_string(beta.to<double>()) + ":" + std::to_string(threshold.to<double>());
 
         CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
         if(!cachedGraph) {
@@ -1581,7 +1581,7 @@ Tensor glu_backward_mps (const Tensor& grad_output,
       MPSStream* stream = getCurrentMPSStream();
 
       @autoreleasepool {
-        string key = "softplus_backward_out_mps:" + getTensorsStringKey({grad_output, self});
+        string key = "softplus_backward_out_mps:" + getTensorsStringKey({grad_output, self}) + ":" + std::to_string(beta.to<double>()) + ":" + std::to_string(threshold.to<double>());
 
         CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
         if(!cachedGraph) {
diff --git a/test/test_mps.py b/test/test_mps.py
index 472605472afb8..6a74149dd7c48 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -3906,7 +3906,7 @@ def helper(shape, dim=0):
 
     # Test softplus
     def test_softplus(self):
-        def helper(shape, beta=0.5, threshold=0.5):
+        def helper(shape, beta=1, threshold=20):
             cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
             x = cpu_x.detach().clone().to('mps').requires_grad_()
 
@@ -3924,9 +3924,9 @@ def helper(shape, beta=0.5, threshold=0.5):
 
         # Test empty shape too
         for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
-            helper(shape)
-            helper(shape, beta=0.6, threshold=0.6)  # relu path
-            helper(shape, beta=1, threshold=20)  # softplus path
+            for beta in [0.5, 1, 2, 3, 4]:
+                for threshold in [0.5, 20, 30, 40, 50]:
+                    helper(shape, beta, threshold)
 
     # Test silu
 

From 848ef6c109786b1847841607aae6e79317458365 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 26 Aug 2022 22:33:39 +0100
Subject: [PATCH 1772/1922] Skip unsupported tests / Add supported mps dtypes
 (#98)

---
 test/test_autograd.py                         |  3 +
 test/test_indexing.py                         |  3 +-
 test/test_nn.py                               | 82 ++++++++++++++++++-
 test/test_torch.py                            | 43 +++++++++-
 test/test_view_ops.py                         | 25 +++++-
 torch/testing/_internal/common_device_type.py |  8 ++
 torch/testing/_internal/common_utils.py       |  2 +-
 7 files changed, 154 insertions(+), 12 deletions(-)

diff --git a/test/test_autograd.py b/test/test_autograd.py
index 74cf400aee6fc..f7efb9f64216a 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -8164,6 +8164,7 @@ def test_min_max_median_backprops_to_all_values(self, device):
                 self.assertEqual(x.grad.sum(), 1.)
                 self.assertEqual((x.grad == 1 / 3).sum(), 3)
 
+    @skipIfMps
     def test_scatter_index_reduce_amin_amax_backprops_to_all_values(self, device):
         # tests that gradients are evenly distributed when there are multiple max/min values
         # tested here instead of adding a SampleInput as the backward for this case is non-differentiable for gradgrad
@@ -8179,6 +8180,7 @@ def test_scatter_index_reduce_amin_amax_backprops_to_all_values(self, device):
 
             gradcheck(fn, (input, 0, idx, src, reduction), check_batched_grad=False)
 
+    @skipIfMps
     def test_scatter_index_reduce_prod_gradgrad_error(self, device):
         # test that double backward raises an error for the case where 2 zeros in src
         # are scattered to the same position in self
@@ -8993,6 +8995,7 @@ def do_test():
 
         self.assertNotWarn(do_test)
 
+    @skipIfMps
     def test_to_r_to_c(self, device):
         def do_test():
             inp_r = torch.randn(3, 2, dtype=torch.double, device=device,
diff --git a/test/test_indexing.py b/test/test_indexing.py
index 5dc23a3d54653..5b0d9f51360b3 100644
--- a/test/test_indexing.py
+++ b/test/test_indexing.py
@@ -12,13 +12,14 @@
 
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
-    TestCase, run_tests, TEST_WITH_TORCHDYNAMO)
+    TestCase, run_tests, TEST_WITH_TORCHDYNAMO, skipIfMps)
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, dtypes, dtypesIfCPU, dtypesIfCUDA,
     onlyNativeDeviceTypes)
 
 
 class TestIndexing(TestCase):
+    @skipIfMps
     def test_index(self, device):
 
         def consec(size, start=1):
diff --git a/test/test_nn.py b/test/test_nn.py
index bd54ca01fdfc9..2c3b1727fe510 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -44,7 +44,7 @@
     module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
     ctcloss_reference, new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
-    dtypesIfCUDA, precisionOverride, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
+    dtypesIfMPS, dtypesIfCUDA, precisionOverride, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
     skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, \
     onlyNativeDeviceTypes, deviceCountAtLeast, largeTensorTest, expectedFailureMeta, skipMeta, get_all_device_types
 
@@ -8117,6 +8117,7 @@ def test_instancenorm_raises_error_if_less_than_one_value_per_channel(self, devi
         with self.assertRaises(ValueError):
             torch.nn.InstanceNorm1d(10)(x).to(device)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_instancenorm_raises_error_for_single_spatial_element_during_training(self, device):
         BATCH_SIZE = 10
         NUM_CHANNELS = 3
@@ -8207,6 +8208,7 @@ def test_GroupNorm_raises_error_if_one_value_per_group(self, device):
         with self.assertRaises(ValueError):
             torch.nn.GroupNorm(10, 10)(x).to(device)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_GroupNorm_empty(self, device):
         mod = torch.nn.GroupNorm(2, 4).to(device)
         inp = torch.randn(0, 4, 2, 2, device=device)
@@ -8282,6 +8284,7 @@ def group_norm_ref(X, gamma, beta, groups, channels, eps):
 
     @onlyNativeDeviceTypes
     @dtypes(torch.float64, torch.complex128)
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_pad(self, device, dtype):
         # Assert assertion errors are raised for invalid circular padding values
         inputs = torch.randn(1, 1, 4, device=device, dtype=dtype, requires_grad=True)
@@ -8313,6 +8316,7 @@ def test_pad(self, device, dtype):
 
     @onlyNativeDeviceTypes
     @dtypes(torch.float64, torch.complex128)
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_ReplicationPad_empty(self, device, dtype):
         for mod, inp in [
                 (torch.nn.ReplicationPad1d(3), torch.randn(0, 3, 10, device=device, dtype=dtype)),
@@ -8335,6 +8339,7 @@ def test_ReplicationPad_empty(self, device, dtype):
             inp = torch.randn(3, 0, 10, 10, 10, device=device, dtype=dtype)
             mod(inp)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_ReplicationPad1d_large(self, device):
         shapes = ([2, 65736, 4], [65736, 2, 4])
         pl, pr = 3, 4
@@ -8359,6 +8364,7 @@ def test_ReplicationPad1d_large(self, device):
             self.assertEqual(x.grad[:, :, 0], g[:, :, : pl + 1].sum(-1))
             self.assertEqual(x.grad[:, :, -1], g[:, :, -pr - 1:].sum(-1))
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_ReplicationPad2d_large(self, device):
         shapes = ([2, 65736, 4, 4], [65736, 2, 4, 4])
         pl, pr, pt, pb = 3, 4, 5, 6
@@ -8424,6 +8430,8 @@ def test_ReplicationPad3d_large(self, device):
             self.assertEqual(x.grad[:, :, 1:-1, 1:-1, 1:-1], g[:, :, pf + 1 : -pbk - 1, pt + 1 : -pbt - 1, pl + 1 : -pr - 1])
 
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
+
     def test_Bilinear_empty(self, device):
         mod = torch.nn.Bilinear(20, 30, 40).to(device)
         inp1 = torch.randn(0, 10, 20, requires_grad=True, device=device)
@@ -8440,6 +8448,7 @@ def test_Bilinear_empty(self, device):
 
     @expectedFailureMeta  # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1]
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_TransformerEncoderLayer_empty(self, device):
         for training in (True, False):
             for batch_first, input_shape in [(True, (0, 10, 512)),
@@ -8467,6 +8476,7 @@ def test_TransformerEncoderLayer_empty(self, device):
 
     @expectedFailureMeta  # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1]
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_TransformerEncoder_empty(self, device):
         for batch_first, input_shape in [(True, (0, 10, 512)),
                                          (False, (10, 0, 512))]:
@@ -8477,6 +8487,7 @@ def test_TransformerEncoder_empty(self, device):
 
     @expectedFailureMeta  # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1]
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_TransformerDecoderLayer_empty(self, device):
         for batch_first, memory_shape, tgt_shape in [(True, (0, 10, 512), (0, 20, 512)),
                                                      (False, (10, 0, 512), (20, 0, 512))]:
@@ -8498,6 +8509,7 @@ def test_TransformerDecoder_empty(self, device):
 
     @expectedFailureMeta  # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1]
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_Transformer_empty(self, device):
         for batch_first, src_shape, tgt_shape in [(True, (10, 0, 512), (20, 0, 512))]:
             transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12).to(device)
@@ -8507,6 +8519,7 @@ def test_Transformer_empty(self, device):
 
     @onlyNativeDeviceTypes
     @dtypes(torch.float32, torch.complex64)
+    @dtypesIfMPS(torch.float32)
     def test_ReflectionPad_empty(self, device, dtype):
         for mod, inp in [
                 (torch.nn.ReflectionPad1d(2), torch.randn(0, 3, 10, device=device, dtype=dtype)),
@@ -8551,6 +8564,7 @@ def test_ReflectionPad2d_large(self, device):
             self.assertEqual(x.grad, ref_x.grad)
 
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_LocalResponseNorm_empty(self, device):
         mod = torch.nn.LocalResponseNorm(2).to(device)
         inp = torch.ones(0, 5, 24, 24, device=device)
@@ -8579,6 +8593,7 @@ def test_ReflectionPad3d_large(self, device):
 
     @onlyNativeDeviceTypes
     @dtypes(torch.float, torch.double)
+    @dtypesIfMPS(torch.float)
     def test_MarginLoss_empty(self, device, dtype):
         for mod, x, y in [
                 (torch.nn.MultiMarginLoss().to(device),
@@ -8683,6 +8698,7 @@ def check_rnn_grads(rnn1, rnn2):
                     else:
                         self.assertEqual(hx.grad, hx_device.grad)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_BatchNorm_empty(self, device):
         mod = torch.nn.BatchNorm2d(3).to(device)
         inp = torch.randn(0, 3, 2, 2, device=device)
@@ -8704,6 +8720,7 @@ def test_prelu_backward_32bit_indexing(self, device):
         output = m(input_)
         output.backward(input_)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_linear_empty(self, device):
         mod = torch.nn.Linear(7, 7).to(device)
         inp = torch.randn(0, 7, device=device)
@@ -8759,6 +8776,7 @@ def test_one_hot(self, device):
         with self.assertRaises(RuntimeError):
             torch.nn.functional.one_hot(torch.tensor([3, 4, 1, 0], device=device), -2)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nn_empty(self, device):
         # One off tests to ensure scalars from nn.yaml are properly applied
         def verify_scalars(input, output):
@@ -8774,6 +8792,7 @@ def verify_scalars(input, output):
                 output = m(input)
                 verify_scalars(input, output)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nn_scalars(self, device):
         # One off tests to ensure scalars from nn.yaml are properly applied
         def verify_scalars(input, output):
@@ -8793,6 +8812,7 @@ def verify_scalars(input, output):
                 output = m(input)
                 verify_scalars(input, output)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nn_scalars_reductions(self, device):
         # One off tests to ensure scalars from nn.yaml are properly applied
         def verify_reduction_scalars(input, reduction, output):
@@ -8818,6 +8838,7 @@ def verify_reduction_scalars(input, reduction, output):
 
     # verify that bogus reduction strings are errors
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_invalid_reduction_strings(self, device):
         input = torch.randn(3, 5, requires_grad=True, device=device)
         cinput = torch.randn(3, 5, requires_grad=True, device=device, dtype=torch.cfloat)
@@ -8866,6 +8887,7 @@ def v(fn):
             v(lambda: F.soft_margin_loss(input, input.sign().detach(), reduction=reduction))
 
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_smooth_l1_loss_vs_huber_loss(self, device):
         def _make_test_tensor(shape, contiguous=True):
             if contiguous:
@@ -8954,6 +8976,7 @@ def func(device):
 
     # We don't want to make propagating NaN a hard requirement on ops, but for
     # these easy ones, we should make them do so.
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nonlinearity_propagate_nan(self, device):
         def test(nonlinearity, *args, **kwargs):
             x = torch.tensor([nan], device=device)
@@ -9091,6 +9114,7 @@ def helper(isize, osize):
         helper(20, 11)
         helper(10, 15)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_upsamplingNearest2d(self, device):
         # Forward AD does not support XLA because XLA tensors don't have storage
         check_forward_ad = torch.device(device).type != 'xla'
@@ -9209,6 +9233,7 @@ def helper(memory_format, isize, osize):
         helper(torch.contiguous_format, 10, 15)
         helper(torch.channels_last, 10, 15)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_upsamplingNearest3d(self, device):
         # Forward AD does not support XLA because XLA tensors don't have storage
         check_forward_ad = torch.device(device).type != 'xla'
@@ -9321,6 +9346,7 @@ def helper(memory_format, isize, osize):
 
     @parametrize_test("antialias", [True, False])
     @parametrize_test("align_corners", [True, False])
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_upsamplingBilinear2d(self, device, antialias, align_corners):
         # Forward AD does not support XLA because XLA tensors don't have storage
         check_forward_ad = torch.device(device).type != 'xla'
@@ -9399,6 +9425,7 @@ def test_upsamplingBilinear2d_aa_correctness(self, device, memory_format):
 
     @parametrize_test("antialias", [True, False])
     @parametrize_test("align_corners", [True, False])
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_upsamplingBicubic2d(self, device, antialias, align_corners):
         kwargs = dict(mode='bicubic', align_corners=align_corners, antialias=antialias)
         # test float scale factor up & downsampling
@@ -9418,6 +9445,7 @@ def test_upsamplingBicubic2d(self, device, antialias, align_corners):
             inpt = torch.ones(2, 3, 8, 8, requires_grad=True, device=device)
             gradcheck(lambda x: F.interpolate(x, out_size, **kwargs), [inpt], nondet_tol=nondet_tol)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_upsamplingBicubic2d_correctness(self, device):
         # test output against known input: align_corners=False result must match opencv
         in_t = torch.arange(8., device=device).view(1, 2, 2, 2)
@@ -9947,6 +9975,7 @@ def _test_gumbel_softmax_straight_through(self, device, dtype):
         num_draws = 100
 
         logits = torch.tensor([[0.2, 0.8, 0.1]], device=device)
+
         logits = logits.reshape([1, 3])
         logits = logits.to(dtype).requires_grad_()
         probs = logits.softmax(dim=-1)
@@ -9986,9 +10015,9 @@ def _test_gumbel_softmax_grad(self, device, dtype):
         tol = 2 * torch.finfo(dtype).eps
         self.assertEqual(logits_soft.grad, logits_hard.grad, atol=tol, rtol=0)
 
-    @skipIfMps
     @dtypesIfCUDA(torch.half, torch.float, torch.double)
     @dtypes(torch.float, torch.double)
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_gumbel_softmax(self, device, dtype):
         self._test_gumbel_softmax_st_shapes(device, dtype, shape=[5], dim=0, count_expected=1)
         self._test_gumbel_softmax_st_shapes(device, dtype, shape=[5], dim=-1, count_expected=1)
@@ -10015,6 +10044,7 @@ def _test_rnn_retain_variables(self, device, dtype):
                 self.assertEqual(grads, grads2)
 
     @dtypesIfCUDA(torch.half, torch.float, torch.double)
+    @dtypesIfMPS(torch.half, torch.float)
     @dtypes(torch.double)
     def test_rnn_retain_variables(self, device, dtype):
         self._test_rnn_retain_variables(device, dtype)
@@ -10065,6 +10095,7 @@ def flatten_out(mod, inp):
     # Merge into OpInfo?
     @skipMeta  # LSTM cell reuses output which was resized
     @dtypes(torch.double)
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_LSTM_grad_and_gradgrad(self, device, dtype):
         hsize = 4
         inp = torch.rand(1, 3, hsize, device=device, dtype=dtype, requires_grad=True)
@@ -10074,6 +10105,7 @@ def test_LSTM_grad_and_gradgrad(self, device, dtype):
 
     @skipMeta  # GRU cell reuses output which was resized
     @dtypes(torch.double)
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_GRU_grad_and_gradgrad(self, device, dtype):
         hsize = 4
         inp = torch.rand(1, 3, hsize, device=device, dtype=dtype, requires_grad=True)
@@ -10336,6 +10368,7 @@ def _test_batchnorm_grad(self, device, dtype=torch.double):
             _assertGradAndGradgradChecks(self, F.batch_norm, (input, running_mean, running_var, weight, bias,
                                                               training, 0.1, 0.0001))
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_batchnorm_grad(self, device):
         self._test_batchnorm_grad(device)
 
@@ -10374,6 +10407,7 @@ def test_layernorm_weight_bias(self):
         out_zero_bias = torch.layer_norm(input, normalized_shape, data, bias, eps)
         self.assertEqual(out_none_bias, out_zero_bias)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_hardsigmoid_grad(self, device):
         inputs = (torch.randn(4, 16, 16, device=device) - 0.5) * 10
         inputs.requires_grad = True
@@ -10381,6 +10415,7 @@ def test_hardsigmoid_grad(self, device):
 
     # currently fails on XLA
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_hardswish_grad(self, device):
         inputs = (torch.randn(4, 16, 16, device=device) - 0.5) * 10
         inputs.requires_grad = True
@@ -10572,6 +10607,7 @@ def test_batchnorm_simple_average_mixed(self, device, dtype):
 
     @onlyNativeDeviceTypes
     @dtypes(torch.float, torch.double)
+    @dtypesIfMPS(torch.float)
     def test_grid_sample_nan_inf(self, device, dtype):
         input = torch.zeros([1, 1, 3, 3], device=device, dtype=dtype)
         grid = torch.tensor([[[[nan, 0], [0, inf]]]], device=device, dtype=dtype)
@@ -10600,6 +10636,7 @@ def test_CTCLoss_empty_target(self, device):
     # Merge into OpInfo?
     @skipCUDAIf(True, """Test is flaky on Linux and Windows, typical error message:
                           https://github.com/pytorch/pytorch/issues/34870""")
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_ctc_loss(self, device):
         batch_size = 64
         num_labels = 101
@@ -10792,6 +10829,7 @@ def test_batchnorm_update_stats(self, device):
             with torch.backends.cudnn.flags(enabled=False):
                 self._test_batchnorm_update_stats(device)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_multi_margin_loss_errors(self, device):
         self.assertRaises(RuntimeError,
                           lambda: nn.functional.multi_margin_loss(torch.randn(5, device=device),
@@ -10853,18 +10891,21 @@ def test_nll_loss_mismatched_batch(self, device):
         with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'):
             F.nll_loss(x, t)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nll_loss_out_of_bounds_ignore_index(self, device):
         x = torch.randn(6, 3, requires_grad=True, device=device)
         t = torch.tensor([0, 1, 255, 0, 1, 2], dtype=torch.int64, device=device)
         for reduction in ['mean', 'none']:
             F.nll_loss(x, t, ignore_index=255, reduction=reduction).sum().backward()
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nll_loss_invalid_target_dim(self, device):
         x = torch.randn((10, 3), device=device)
         t = torch.zeros((10, 2), dtype=torch.int64, device=device)
         with self.assertRaisesRegex(RuntimeError, "1D target tensor expected"):
             F.nll_loss(x, t)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nll_loss_invalid_weights(self, device):
         x = torch.randn((10, 3), device=device)
         t = torch.empty(10, dtype=torch.int64, device=device).random_(0, 3)
@@ -10922,6 +10963,7 @@ def _nll_loss_helper(self, input_size, reduction, expected, device):
         output.sum().backward()
         self.assertEqual(input.grad.size(), input.size())
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nll_loss_empty_tensor_reduction_none(self, device):
         self._nll_loss_helper([0, 3], "none", torch.empty([0], device=device), device)
         self._nll_loss_helper([0, 3, 5, 7], "none", torch.empty([0, 5, 7], device=device), device)
@@ -10929,6 +10971,7 @@ def test_nll_loss_empty_tensor_reduction_none(self, device):
         self._nll_loss_helper([2, 3, 5, 0], "none", torch.empty([2, 5, 0], device=device), device)
         self._nll_loss_helper([2, 3, 5, 7, 0], "none", torch.empty([2, 5, 7, 0], device=device), device)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN")
     def test_nll_loss_empty_tensor_reduction_mean(self, device):
         nan = torch.tensor(float('nan'), device=device)
@@ -10938,6 +10981,7 @@ def test_nll_loss_empty_tensor_reduction_mean(self, device):
         self._nll_loss_helper([2, 3, 5, 0], "mean", nan, device)
         self._nll_loss_helper([2, 3, 5, 7, 0], "mean", nan, device)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nll_loss_empty_tensor_reduction_sum(self, device):
         zero = torch.tensor(0, device=device)
         self._nll_loss_helper([0, 3], "sum", zero, device)
@@ -10947,6 +10991,7 @@ def test_nll_loss_empty_tensor_reduction_sum(self, device):
         self._nll_loss_helper([2, 3, 5, 7, 0], "sum", zero, device)
 
     @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN")
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nll_loss_total_weight_is_zero(self, device):
 
         def helper(input_size):
@@ -10964,6 +11009,7 @@ def helper(input_size):
         helper([2, 3, 5, 7, 9])
 
     @unittest.skipIf(TEST_WITH_UBSAN, "division-by-zero error with UBSAN")
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nll_loss_all_ignored(self, device):
 
         def helper(input_size):
@@ -10979,6 +11025,7 @@ def helper(input_size):
         helper([2, 3, 5, 7])
         helper([2, 3, 5, 7, 9])
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_nll_loss_byte_target_matches_long(self, device):
         N, C = 10, 4
         input = torch.randn(N, C, device=device, requires_grad=True)
@@ -11001,6 +11048,7 @@ def compute_result_and_gradient(reduction, target_dtype):
             self.assertEqual(result_long, result_byte)
             self.assertEqual(grad_long, grad_byte)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_cross_entropy_loss_prob_target_all_reductions(self, device):
         # Test with k-dimensional loss.
         for k in range(5):
@@ -11017,6 +11065,7 @@ def test_cross_entropy_loss_prob_target_all_reductions(self, device):
                     input, target, reduction=reduction, weight=w)
                 self.assertEqual(output, output_ref)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_cross_entropy_loss_prob_target_unit_weights(self, device):
         # Test with k-dimensional loss.
         for k in range(5):
@@ -11036,6 +11085,7 @@ def test_cross_entropy_loss_prob_target_unit_weights(self, device):
 
     @parametrize_test('reduction', ['none', 'mean', 'sum'])
     @parametrize_test('weighted', [False, True])
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_cross_entropy_loss_prob_target_no_batch_dim(self, device, reduction, weighted):
         C = 5
         input = torch.randn(C, device=device).log_softmax(dim=-1)
@@ -11048,6 +11098,7 @@ def test_cross_entropy_loss_prob_target_no_batch_dim(self, device, reduction, we
             loss_batch = loss_batch.squeeze(0)
         self.assertEqual(loss_no_batch, loss_batch)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_cross_entropy_loss_index_target_unit_weights(self, device):
         # Test with k-dimensional loss.
         for k in range(5):
@@ -11065,6 +11116,7 @@ def test_cross_entropy_loss_index_target_unit_weights(self, device):
                 output_unit = m_unit(input, target)
                 self.assertEqual(output, output_unit)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_cross_entropy_loss_one_hot_target(self, device):
         # Test with k-dimensional loss.
         for k in range(5):
@@ -11092,6 +11144,7 @@ def test_cross_entropy_loss_one_hot_target(self, device):
                 output_one_hot = m(input, target_one_hot)
                 self.assertEqual(output, output_one_hot)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_cross_entropy_label_smoothing_errors(self, device):
         N, C = 3, 4
         input_args = [
@@ -11104,6 +11157,7 @@ def test_cross_entropy_label_smoothing_errors(self, device):
                                         r"label_smoothing must be between 0\.0"):
                 loss(*input_arg)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_cross_entropy_label_smoothing_consistent_index_target_and_probs(self, device):
         N, C = 10, 4
         ks = range(5)
@@ -11137,6 +11191,7 @@ def test_cross_entropy_label_smoothing_consistent_index_target_and_probs(self, d
             self.assertEqual(output_with_prob, output_with_index,
                              rtol=1e-07, atol=1e-05)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_cross_entropy_label_smoothing_with_probs(self, device):
         N, C = 10, 4
         ks = range(5)
@@ -11163,7 +11218,7 @@ def test_cross_entropy_label_smoothing_with_probs(self, device):
 
                 self.assertEqual(output_with_smoothing, output_with_manual_smoothing)
 
-
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_cross_entropy_label_smoothing_weight_ignore_indices(self, device):
         reductions = ['none', 'sum', 'mean']
         label_smoothings = [0.05, 0.15]
@@ -11248,6 +11303,7 @@ def test_softshrink_negative(self, device):
                                     r'lambda must be greater or equal to 0, but found to be -1\.'):
             m(input)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_fold(self, device):
         def test_dtype(fn, input, dtype):
             input = input.detach().clone().to(dtype=dtype).requires_grad_(True)
@@ -11273,7 +11329,7 @@ def func(x):
             if device == 'cpu':
                 test_dtype(func, x, torch.bfloat16)
 
-
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_logsigmoid_out(self, device):
         # this isn't actually documented, but was broken previously:
         # https://github.com/pytorch/pytorch/issues/36499
@@ -11410,6 +11466,7 @@ def __init__(self):
             for p, pe in zip(test_model.parameters(), ref_model.parameters()):
                 self.assertEqual(p.grad.to(devices[0]), pe.grad)
 
+    @skipIfMps
     def test_elu_inplace_overlap(self, device):
         x = torch.randn((1, 6), dtype=torch.bfloat16, device=device).expand((6, 6))
         with self.assertRaisesRegex(RuntimeError, 'unsupported operation'):
@@ -11419,6 +11476,7 @@ def test_elu_inplace_overlap(self, device):
 
     # Merge into OpInfo?
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_elu_inplace_with_neg_alpha(self, device):
         a = torch.tensor([-1., 1.], device=device, requires_grad=True)
         b = torch.nn.functional.elu_(a.clone(), alpha=-2)
@@ -11431,27 +11489,32 @@ def test_elu_inplace_with_neg_alpha(self, device):
             b.backward(torch.ones(2, device=device))
 
     @expectedFailureMeta  # https://github.com/pytorch/pytorch/issues/54897
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_hardswish_inplace_overlap(self, device):
         x = torch.randn((1, 6), device=device).expand((6, 6))
         with self.assertRaisesRegex(RuntimeError, 'unsupported operation'):
             F.hardswish(x, inplace=True)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_silu_inplace_overlap(self, device):
         x = torch.randn((1, 6), device=device).expand((6, 6))
         with self.assertRaisesRegex(RuntimeError, 'unsupported operation'):
             F.silu(x, inplace=True)
 
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_mish_inplace_overlap(self, device):
         x = torch.randn((1, 6), device=device).expand((6, 6))
         with self.assertRaisesRegex(RuntimeError, 'unsupported operation'):
             F.mish(x, inplace=True)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_softplus_inplace_overlap(self, device):
         x = torch.randn((1, 6), device=device).expand((6, 6))
         with self.assertRaisesRegex(RuntimeError, 'unsupported operation'):
             F.softplus(x, out=x)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_softplus_low_threshold(self, device):
         # Ensure gradients are computed correctly with a low threshold.
         model = torch.nn.Softplus(threshold=1).double()
@@ -11460,11 +11523,13 @@ def test_softplus_low_threshold(self, device):
         output = model(input)
         torch.autograd.gradcheck(model, input)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_softshrink_inplace_overlap(self, device):
         x = torch.randn((1, 6), device=device).expand((6, 6))
         with self.assertRaisesRegex(RuntimeError, 'unsupported operation'):
             F.softshrink(x, out=x)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_leaky_relu_inplace_overlap(self, device):
         x = torch.randn((1, 6), device=device).expand((6, 6))
         with self.assertRaisesRegex(RuntimeError, 'unsupported operation'):
@@ -11473,6 +11538,7 @@ def test_leaky_relu_inplace_overlap(self, device):
             F.leaky_relu_(x)
 
     # Merge into OpInfo?
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_leaky_relu_inplace_with_neg_slope(self, device):
         a = torch.tensor([-1., 1.], device=device, requires_grad=True)
         b = torch.nn.functional.leaky_relu_(a.clone(), -2)
@@ -11485,6 +11551,7 @@ def test_leaky_relu_inplace_with_neg_slope(self, device):
             b.backward(torch.ones(2, device=device))
 
     # Merge into OpInfo?
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_leaky_relu_inplace_with_zero_slope(self, device):
         a = torch.tensor([-2., 0., 2.], device=device, requires_grad=True)
         b = torch.nn.functional.leaky_relu_(a.clone(), 0.0)
@@ -11524,6 +11591,7 @@ def test_softshrink(self, device):
         out = softshrink(x)
         self.assertEqual(out, expected, atol=1e-2, rtol=0)
 
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_threshold_inplace_overlap(self, device):
         # Inplace threshold is okay, because it is idempotent
         x = torch.randn((1, 6), device=device).expand((6, 6))
@@ -11531,6 +11599,7 @@ def test_threshold_inplace_overlap(self, device):
         F.threshold_(x, 0.5, 0.5)
 
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_triplet_margin_with_distance_loss_default_parity(self, device):
         # Test for `nn.TripletMarginWithDistanceLoss` and
         # `F.triplet_margin_with_distance_loss`.  Checks
@@ -11565,6 +11634,7 @@ def test_triplet_margin_with_distance_loss_default_parity(self, device):
                             (anchor, positive, negative)))
 
     @onlyNativeDeviceTypes
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_triplet_margin_with_distance_loss(self, device):
         # Test for parity between `nn.TripletMarginWithDistanceLoss` and
         # `F.triplet_margin_with_distance_loss`.
@@ -11608,6 +11678,7 @@ def cosine_distance(x, y):
             self.assertEqual(functional, modular, atol=1e-6, rtol=1e-6)
             self.assertEqual(traced, modular, atol=1e-6, rtol=1e-6)
 
+    @skipIfMps  # the test doesn't work on MPS as double/complex types are not supported
     def test_to_complex(self, device):
         m = nn.Linear(3, 5).to(device)
         self.assertIs(m, m.to(device))
@@ -11626,6 +11697,7 @@ def test_to_complex(self, device):
 
     @skipMeta
     @dtypes(torch.float32, torch.float64)
+    @dtypesIfMPS(torch.float32)
     def test_module_to_empty(self, device, dtype):
         class MyModule(nn.Module):
             def __init__(self, in_features, out_features, device=None, dtype=None):
@@ -11653,6 +11725,7 @@ def forward(self, x):
         m(input)
 
     @skipMeta
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_skip_init(self, device):
         torch.manual_seed(1)
         m_initialized = torch.nn.Linear(5, 1)
@@ -11666,6 +11739,7 @@ def test_skip_init(self, device):
 
     @dtypes(torch.float)
     @dtypesIfCUDA(torch.double, torch.float, torch.half)
+    @skipMPSIf(True, "the test doesn't work on MPS as double types are not supported")
     def test_transformerencoderlayer(self, device, dtype):
         # this is a deterministic test for TransformerEncoderLayer
         d_model = 4
diff --git a/test/test_torch.py b/test/test_torch.py
index 41e407b413f52..26ab9b61f5d9c 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -40,6 +40,7 @@
     skipIfNotRegistered, bytes_to_scalar, parametrize, skipIfMps, noncontiguous_like)
 from multiprocessing.reduction import ForkingPickler
 from torch.testing._internal.common_device_type import (
+    dtypesIfMPS,
     expectedFailureMeta,
     expectedFailureXLA,
     instantiate_device_type_tests,
@@ -55,7 +56,7 @@
     tf32_on_and_off, tf32_is_not_fp32, TEST_CUDNN)
 from torch.testing._internal.common_dtype import (
     floating_types_and, get_all_math_dtypes, all_types_and_complex_and, complex_types,
-    all_types_and, floating_types, floating_and_complex_types,
+    all_types_and, floating_types, floating_and_complex_types, integral_types_and,
 )
 
 # Protects against includes accidentally setting the default dtype
@@ -163,6 +164,8 @@ def rand_byte():
     @dtypes(torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64,
             torch.bool, torch.float32, torch.complex64, torch.float64,
             torch.complex128)
+    @dtypesIfMPS(torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64,
+                 torch.bool, torch.float32)
     def test_storage(self, device, dtype):
         v = make_tensor((3, 5), dtype=dtype, device=device, low=-9, high=9)
         self.assertEqual(v.storage()[0], v[0][0])
@@ -224,6 +227,7 @@ def test_storage_setitem(self, device, dtype):
 
     @onlyNativeDeviceTypes
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_tensor_storage_type(self, device, dtype):
         a = make_tensor((10,), dtype=dtype, device=device, low=-9, high=9)
 
@@ -316,6 +320,7 @@ def test_untyped_storage_meta(self, device):
 
     @onlyNativeDeviceTypes
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_storage_meta_from_tensor(self, device, dtype):
         t_check = make_tensor((4, 5, 3), dtype=dtype, device=device, low=-9, high=9)
         t = t_check.to('meta')
@@ -376,6 +381,7 @@ def test_module_share_memory(self):
         model.share_memory()
 
     @dtypes(torch.float32, torch.complex64)
+    @dtypesIfMPS(torch.float32)
     def test_deepcopy(self, device, dtype):
         from copy import deepcopy
         a = torch.randn(5, 5, dtype=dtype, device=device)
@@ -403,6 +409,7 @@ def test_deepcopy(self, device, dtype):
         self.assertEqual(deepcopy(a).foo, 3)
 
     @dtypes(torch.float32, torch.complex64)
+    @dtypesIfMPS(torch.float32)
     def test_deepcopy_scalar(self, device, dtype):
         from copy import deepcopy
         a = torch.tensor(5, dtype=dtype, device=device)
@@ -821,6 +828,7 @@ def test_warn_always_caught(self, device):
             torch.from_numpy(a)
 
     @onlyNativeDeviceTypes
+    @skipIfMps
     def test_complex_half_experimental_warning(self, device):
         msg = 'ComplexHalf support is experimental'
         with self.assertWarnsOnceRegex(UserWarning, msg):
@@ -858,6 +866,7 @@ def test_complex_half_experimental_warning(self, device):
             t + 1
 
     # TODO: this test should be in test_nn.py
+    @skipIfMps
     def test_conv_transposed_backward_agnostic_to_memory_format(self, device):
         in_channels = 64
         out_channels = 128
@@ -1447,6 +1456,7 @@ def test_nondeterministic_alert_EmbeddingBag_max(self, device):
             torch.device(device).type == 'cuda')
 
     @dtypes(*all_types_and_complex_and(torch.bool))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.float32))
     def test_nondeterministic_alert_cumsum(self, device, dtype):
         input = make_tensor((10,), dtype=dtype, device=device, low=-9, high=9)
         should_alert = torch.device(device).type == 'cuda' and (dtype.is_floating_point or dtype.is_complex)
@@ -1546,6 +1556,7 @@ def test_nondeterministic_alert_grid_sample_3d(self, device):
             'grid_sampler_3d_backward_cuda',
             torch.device(device).type == 'cuda')
 
+    @skipIfMps
     def test_invalid_shapes_grid_sampler(self, device):
         make_arg = partial(
             make_tensor, device=device, dtype=torch.float64, requires_grad=True)
@@ -1813,6 +1824,7 @@ def test_repeat_interleave(self, device):
     @dtypes(*floating_types())
     @dtypesIfCPU(*floating_types_and(torch.bfloat16))
     @dtypesIfCUDA(*floating_types_and(torch.half))
+    @dtypesIfMPS(torch.half, torch.float) # crashes for half
     def test_bernoulli_p(self, device, dtype):
         for trivial_p in ([0, 1], [1, 0, 1, 1, 0, 1]):
             x = torch.tensor(trivial_p, dtype=dtype, device=device)
@@ -1835,6 +1847,7 @@ def isBinary(t):
     @dtypes(*floating_types())
     @dtypesIfCPU(*all_types_and(torch.bool))
     @dtypesIfCUDA(*all_types_and(torch.bool, torch.half))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_bernoulli_self(self, device, dtype):
 
         def isBinary(t):
@@ -1874,7 +1887,7 @@ def test_bernoulli_edge_cases(self, device, dtype):
         self.assertEqual(num_zeros, 0)
 
     @dtypes(*floating_types_and(torch.half, torch.bfloat16))
-    @skipIfMps
+    @dtypesIfMPS(torch.half, torch.float)
     def test_exponential(self, device, dtype):
         a = torch.tensor([10], dtype=dtype, device=device).exponential_(0.5)
         self.assertEqual(a.dtype, dtype)
@@ -1922,6 +1935,7 @@ def test_corrcoef(self, device, dtype):
             self.assertEqual(res, ref, exact_dtype=False)
 
     @dtypes(torch.int, torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.int, torch.float)
     def test_cov(self, device, dtype):
         def check(t, correction=1, fweights=None, aweights=None):
             res = torch.cov(t, correction=correction, fweights=fweights, aweights=aweights)
@@ -1942,6 +1956,7 @@ def check(t, correction=1, fweights=None, aweights=None):
 
     @skipIfNoSciPy
     @dtypes(*floating_types_and(torch.half, torch.bfloat16))
+    @dtypesIfMPS(torch.half, torch.float)
     def test_uniform_kstest(self, device, dtype):
         from scipy import stats
         size = 1000
@@ -2503,6 +2518,7 @@ def to_np(t):
     # All tensors appear contiguous on XLA
     @onlyNativeDeviceTypes
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32)) # crashes for torch.int8_t
     def test_diff_noncontig(self, device, dtype):
         shapes = (
             (1,),
@@ -2525,6 +2541,7 @@ def test_diff_noncontig(self, device, dtype):
     @dtypes(*all_types_and_complex_and(torch.bool))
     @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool))
     @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_diff(self, device, dtype):
         shapes = (
             (1,),
@@ -2787,6 +2804,7 @@ def test_unfold_scalars(self, device):
         self.assertEqual(torch.tensor([0.5], device=device), x.unfold(0, 1, 1))
 
     # FIXME: move to data movement test suite
+    @skipIfMps
     def test_copy_all_dtypes_and_devices(self, device):
         from copy import copy
         for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16):
@@ -2872,6 +2890,7 @@ def test_copy_transpose_math_view(self, device, dtype):
             dst.copy_(src.conj())
             self.assertEqual(dst, src.conj_physical())
 
+    @skipIfMps
     def test_clone_all_dtypes_and_devices(self, device):
         for dt in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16):
             x = torch.tensor((1, 1), dtype=dt, device=device)
@@ -2893,6 +2912,7 @@ def test_clone_not_memory_dense(self):
 
     # FIXME: move to elementwise ternary test suite
     @dtypesIfCUDA(*set(get_all_math_dtypes('cuda')))
+    @dtypesIfMPS(*integral_types_and(torch.half, torch.float32))
     @dtypes(*set(get_all_math_dtypes('cpu')))
     def test_addcmul(self, device, dtype):
         # Returns floating or integral scalar corresponding to dtype
@@ -2946,6 +2966,7 @@ def test_narrow_empty(self, device):
     # FIXME: move to indexing test suite
     @parametrize("reduce", ['prod', 'amin', 'amax', 'mean'])
     @dtypes(*all_types_and(torch.half, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.half, torch.float32))
     def test_index_reduce(self, device, dtype, reduce):
         size = (3, 4, 5)
         index_dtypes = [torch.int, torch.long]
@@ -2995,6 +3016,7 @@ def test_index_reduce(self, device, dtype, reduce):
 
     # FIXME: move to test indexing
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_index_copy(self, device, dtype):
         # We just test for num_copy <= num_dest, as otherwise there are repeated indices
         # and the behavior is undefined
@@ -3205,6 +3227,7 @@ def ref_index_select(src, dim, idx):
 
     # FIXME: find a test suite for the take operator
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_take(self, device, dtype):
         idx_size = (4,)
 
@@ -3240,6 +3263,7 @@ def ref_take(src, idx):
     # The bool instance does not work on GPU. See
     # https://github.com/pytorch/pytorch/issues/54317
     @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.half, torch.float32))
     def test_put(self, device, dtype):
         src_size = (4,)
 
@@ -3311,6 +3335,7 @@ def ref_put(dst, idx, src, accumulate):
     # The bool instance does not work on GPU. See
     # https://github.com/pytorch/pytorch/issues/54317
     @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.half, torch.float32))
     def test_put_accumulate(self, device, dtype):
         # Test for parallel adds with accumulate == True
         low_precision = dtype == torch.half or dtype == torch.bfloat16
@@ -3358,6 +3383,7 @@ def scatter_allow_reduce(self, device, dtype, reduceop):
     @dtypes(*floating_and_complex_types())
     @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
     @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_scatter_reduce_operations_to_large_input(self, device, dtype):
         index = torch.tensor([[1], [2]], device=device, dtype=torch.long)
         test_data = [
@@ -3385,6 +3411,7 @@ def test_scatter_reduce_operations_to_large_input(self, device, dtype):
     @dtypes(*floating_and_complex_types())
     @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
     @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_scatter_reduce_scalar(self, device, dtype):
         index = torch.tensor([[1], [2]], device=device, dtype=torch.long)
         test_data = [
@@ -3424,6 +3451,7 @@ def test_scatter_add_non_unique_index(self, device):
     @dtypes(*floating_and_complex_types())
     @dtypesIfCPU(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
     @dtypesIfCUDA(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_scatter_reduce_non_unique_index(self, device, dtype):
         height = 2
         width = 2
@@ -3498,6 +3526,7 @@ def test_scatter_add_bool(self, device):
     # FIXME: find a test suite for the masked scatter operator
     @onlyNativeDeviceTypes
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_masked_scatter(self, device, dtype):
         dt = dtype
         with warnings.catch_warnings(record=True) as w:
@@ -3586,6 +3615,7 @@ def test_masked_scatter_large_tensor(self, device):
 
     # FIXME: find a test suite for the masked select operator
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_masked_select(self, device, dtype):
         if device == 'cpu':
             warn = 'masked_select received a mask with dtype torch.uint8,'
@@ -3654,6 +3684,7 @@ def test_masked_select_discontiguous(self, device):
 
     # FIXME: find a test suite for the masked fill operator
     @dtypes(*product(all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16), (torch.uint8, torch.bool)))
+    @dtypesIfMPS(*product(integral_types_and(torch.half, torch.float, torch.bool), (torch.uint8, torch.bool)))
     def test_masked_fill(self, device, dtypes):
         dtype = dtypes[0]
         mask_dtype = dtypes[1]
@@ -4874,6 +4905,7 @@ def _test_memory_format_transformations(self, device, input_generator_fn, transf
             x = x.permute(permutation)
             self.assertEqual(x.stride(), transformation_fn(x, memory_format=torch.preserve_format).stride())
 
+    @skipIfMps
     def test_memory_format_to(self, device):
         def get_generator(memory_format, shape):
             def input_generator_fn(device):
@@ -4891,6 +4923,7 @@ def transformation_fn(tensor, **kwargs):
             self._test_memory_format_transformations(
                 device, get_generator(mf, shape), transformation_fn, mf, default_is_preserve=True)
 
+    @skipIfMps
     def test_memory_format_type(self, device):
         def get_generator(memory_format, shape):
             def input_generator_fn(device):
@@ -4950,6 +4983,7 @@ def input_generator_fn(device):
                 self._test_memory_format_transformations(
                     device, get_generator(mf, shape), transformation_fn, mf, compare_data=False, default_is_preserve=True)
 
+    @skipIfMps
     def test_memory_format_type_shortcuts(self, device):
         def get_generator(memory_format, shape, dtype):
             def input_generator_fn(device):
@@ -5195,6 +5229,7 @@ def test_assertRaisesRegex_ignore_msg_non_native_device(self, device):
             torch.nn.functional.nll_loss(x, t, weight=invalid_weight)
 
     @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.complex32))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_copy_(self, device, dtype):
         def can_cast(src_dtype, dst_dtype):
             # torch.can_cast(torch.int16, torch.uint8) returns True
@@ -5232,6 +5267,7 @@ def make_tensor_wrapper(shape, dtype):
                 self.assertEqual(src, dst.copy_(t), rtol=rtol, atol=atol)
 
     @dtypes(*all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.complex32))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_item(self, device, dtype):
         t = torch.ones((), device=device, dtype=dtype)
         self.assertEqual(1, t.item())
@@ -5312,6 +5348,9 @@ def test_type_conversions_same_device(self, devices):
     @dtypesIfCUDA(torch.half, torch.float, torch.double,
                   torch.int8, torch.short, torch.int, torch.long,
                   torch.uint8)
+    @dtypesIfMPS(torch.half, torch.float,
+                 torch.int8, torch.short, torch.int, torch.long,
+                 torch.uint8)
     @dtypes(torch.float, torch.double,
             torch.int8, torch.short, torch.int, torch.long,
             torch.uint8)
diff --git a/test/test_view_ops.py b/test/test_view_ops.py
index 3c4376b501f91..49ffbd872f154 100644
--- a/test/test_view_ops.py
+++ b/test/test_view_ops.py
@@ -6,16 +6,17 @@
 from itertools import product, permutations, combinations
 from functools import partial
 import random
+from torch._C import dtype
 
 from torch.testing import make_tensor
 from torch.testing._internal.common_utils import (
-    IS_FBCODE, TestCase, run_tests, suppress_warnings, gradcheck, gradgradcheck,
+    IS_FBCODE, TestCase, run_tests, skipIfMps, suppress_warnings, gradcheck, gradgradcheck,
     numpy_to_torch_dtype_dict, skipIfTorchDynamo
 )
 from torch.testing._internal.common_device_type import \
-    (instantiate_device_type_tests, onlyCPU, dtypes, onlyNativeDeviceTypes, skipMeta)
+    (dtypesIfMPS, instantiate_device_type_tests, onlyCPU, dtypes, onlyNativeDeviceTypes, skipMeta)
 from torch.testing._internal.common_dtype import (
-    all_types_and_complex_and, complex_types, all_types_and, floating_and_complex_types_and,
+    all_types_and_complex_and, complex_types, all_types_and, floating_and_complex_types_and, integral_types_and,
 )
 
 # TODO: replace this with make_tensor() in common_utils.py
@@ -369,6 +370,7 @@ def test_view_tensor_dsplit(self, device, dtype):
         self.assertEqual(t_dsplit[1][2, 2, 0], t[2, 2, 2])
 
     @onlyNativeDeviceTypes
+    @skipIfMps
     @dtypes(*all_types_and(torch.half, torch.bfloat16))
     def test_imag_noncomplex(self, device, dtype):
         t = torch.ones((5, 5), dtype=dtype, device=device)
@@ -409,6 +411,7 @@ def compare_with_numpy(contiguous_input=True):
 
     @onlyNativeDeviceTypes
     @dtypes(*complex_types())
+    @skipIfMps
     def test_conj_imag_view(self, device, dtype) -> None:
         t = _make_tensor((4, 5,), dtype, device)
         t_numpy_conj = torch.from_numpy(t.cpu().numpy().conj()).to(device=device)
@@ -423,6 +426,7 @@ def test_conj_imag_view(self, device, dtype) -> None:
             self.assertTrue(v_imag.is_neg())
 
     @onlyNativeDeviceTypes
+    @skipIfMps
     def test_conj_view_with_shared_memory(self, device) -> None:
         a = _make_tensor((4, 5,), torch.cfloat, device)
         b = a.conj()
@@ -866,6 +870,7 @@ def test_advanced_indexing_assignment(self, device):
         self.assertEqual(t[2, 2], 0)
 
     @unittest.skip("See https://github.com/pytorch/pytorch/pull/32720")
+    @skipIfMps
     def test_chunk_view(self, device):
         t = torch.zeros(3, 3, device=device)
         l = torch.chunk(t, 3)
@@ -1300,6 +1305,7 @@ def test_big_transpose(self, device):
         t2 = torch.from_numpy(t.cpu().numpy().transpose())
         self.assertEqual(t1, t2)
 
+    @skipIfMps
     def test_T(self, device):
         a = torch.randn(2, 3, 4, device=device)
         t1 = a.T
@@ -1311,6 +1317,7 @@ def test_T(self, device):
         self.assertEqual(scalar, scalar.T)
 
     @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_transposes(self, device, dtype):
         for op in ("T", "H", "mT", "mH", "adjoint"):
             shapes = ((), (2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((), (2, 3),)
@@ -1327,6 +1334,7 @@ def test_transposes(self, device, dtype):
                 self.assertEqual(t2, t1)
 
     @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_transposes_errors(self, device, dtype):
         for op in ("H", "mT", "mH", "adjoint"):
             shapes = ((2,), (2, 3, 4)) if op == "H" else ((2,),)
@@ -1337,6 +1345,7 @@ def test_transposes_errors(self, device, dtype):
                     if op == "adjoint":
                         t1 = t1()
 
+    @skipIfMps
     def test_python_types(self, device):
         a1 = torch.randn((1, 2), device=device, dtype=torch.float64)
         a2 = torch.randn((1, 2), device=device, dtype=float)
@@ -1378,6 +1387,7 @@ def test_helper(shape, numel, memory_format, device):
 
     @onlyNativeDeviceTypes
     @dtypes(torch.int64, torch.float, torch.complex128)
+    @dtypesIfMPS(torch.int64, torch.float)
     def test_transpose_invalid(self, device, dtype):
         for fn in (torch.swapdims, torch.swapaxes, torch.transpose):
             shape = _rand_shape(4, min_size=5, max_size=10)
@@ -1391,6 +1401,7 @@ def test_transpose_invalid(self, device, dtype):
                 fn(x, 0, 5)
 
     @dtypes(torch.int64, torch.float, torch.complex128)
+    @dtypesIfMPS(torch.int64, torch.float)
     def test_transpose_vs_numpy(self, device, dtype):
         for fn in (torch.swapdims, torch.swapaxes, torch.transpose):
             for nd in range(5):
@@ -1453,6 +1464,7 @@ def _test_atleast_dim(self, torch_fn, np_fn, device, dtype):
 
     # TODO: are these view ops?
     @dtypes(*all_types_and_complex_and(torch.half))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_atleast(self, device, dtype):
         self._test_atleast_dim(torch.atleast_1d, np.atleast_1d, device, dtype)
         self._test_atleast_dim(torch.atleast_2d, np.atleast_2d, device, dtype)
@@ -1556,6 +1568,7 @@ def test_broadcast_shapes(self, device):
 
     # Skip BFloat16 since numpy does not support it
     @dtypes(*all_types_and_complex_and(torch.half, torch.bool))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_broadcast_to(self, device, dtype):
         def can_broadcast(s0, s1):
             # s0.dim() <= s1.dim(), reverse s0 and s1 to compare trailing dimension
@@ -1583,6 +1596,7 @@ def can_broadcast(s0, s1):
                                             r"must match the existing size \(\d\)"):
                     torch.broadcast_to(t, s1)
 
+    @skipIfMps
     def test_view(self, device):
         tensor = torch.rand(15, device=device)
         template = torch.rand(3, 5, device=device)
@@ -1659,6 +1673,7 @@ def test_view(self, device):
         self.assertEqual(tensor.view(1, 6, 2, 1), contig_tensor.view(1, 6, 2, 1))
 
     @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))
+    @dtypesIfMPS(*integral_types_and(torch.bool, torch.half, torch.float32))
     def test_reshape_view_semantics(self, device, dtype):
         tensor = make_tensor((15, 4), dtype=dtype, device=device)
         target = (20, 3)
@@ -1793,13 +1808,14 @@ def test_tensor_split_errors(self, device):
                                     + ' zero-dimensional or one-dimensional tensor, but got a tensor with 2 dims'):
             torch.tensor_split(torch.rand(S, device=device), torch.tensor(((1,),)), 0)
 
+    @skipIfMps
     def test_resize_all_dtypes_and_devices(self, device):
         shape = (2, 2)
         for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool):
             x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device)
             x.resize_(shape)
             self.assertEqual(shape, x.shape)
-
+    @skipIfMps
     def test_resize_as_all_dtypes_and_devices(self, device):
         for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool):
             x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device)
@@ -1815,6 +1831,7 @@ def test_resize_overflow(self, device):
         with self.assertRaisesRegex(RuntimeError, 'overflow'):
             x.resize_([8, 8, 2**29, 2**29])
 
+    @skipIfMps
     def test_view_all_dtypes_and_devices(self, device):
         for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool):
             x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device)
diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index df130de09c9de..eb9ed08168e6a 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -884,6 +884,14 @@ class skipMetaIf(skipIf):
     def __init__(self, dep, reason):
         super().__init__(dep, reason, device_type='meta')
 
+
+# Skips a test on MPS if the condition is true.
+class skipMPSIf(skipIf):
+
+    def __init__(self, dep, reason):
+        super().__init__(dep, reason, device_type='mps')
+
+
 # Skips a test on XLA if the condition is true.
 class skipXLAIf(skipIf):
 
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 545be1904b7c8..aa5b8f11a1345 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -124,7 +124,7 @@
         disabled_tests_dict = json.load(fp)
         warnings.warn(f"loaded {len(disabled_tests_dict)} disabled tests")
 
-NATIVE_DEVICES = ('cpu', 'cuda', 'meta')
+NATIVE_DEVICES = ('cpu', 'cuda', 'meta', 'mps')
 
 
 class _TestParametrizer(object):

From 3baf2b84400adaa5223c2b620eab3c5f9208db3f Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Thu, 1 Sep 2022 11:31:36 -0700
Subject: [PATCH 1773/1922] Exclude split from tests (#101)

---
 test/test_mps.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_mps.py b/test/test_mps.py
index 6a74149dd7c48..1ad4f5ced0492 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7920,6 +7920,7 @@ class TestConsistency(TestCase):
         'repeat': ['torch.bool'],
         'rot90': ['torch.bool'],
         'tile': ['torch.bool'],
+        'split': ['torch.float32'],
     }
 
     # Used for accept mode only

From 1d63d4ab86a579db02eae70ed7fd859313ec2f77 Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Mon, 12 Sep 2022 10:57:09 -0700
Subject: [PATCH 1774/1922] Update blocklist (#106)

* Updated blocklist

* Further update blocklist
---
 test/test_mps.py | 38 ++------------------------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 1ad4f5ced0492..700f8821ecbba 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7718,7 +7718,6 @@ class TestConsistency(TestCase):
 
         # These were moved from ALLOWLIST to BLOCK as they are not working
         # locally
-        'tile': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         '__radd__': ['torch.bool', 'torch.uint8'],
         '__rmul__': ['torch.uint8'],
         'add': ['torch.bool', 'torch.uint8'],
@@ -7728,39 +7727,23 @@ class TestConsistency(TestCase):
 
         # Functions that are flaky
         # These are detected as "ok" by the expect case but actually fail to run sometimes
-        'H': None,
-        'T': None,
-        'as_strided': None,
-        'broadcast_tensors': None,
         'broadcast': None,
-        'broadcast_to': None,
-        'diagonal': None,
         'divfloor_rounding': None,
         'divno_rounding_mode': None,
         'divtrunc_rounding': None,
-        'dsplit': None,
-        'hsplit': None,
         'empty': None,
-        'expand_as': None,
-        'expand': None,
         'ge': None,
         'ne': None,
         'le': None,
         'lt': None,
         'gt': None,
-        'transpose': None,
         'splitlist_args': None,
-        'select': None,
-        'reshape': None,
         'reshape_as': None,
-        'permute': None,
         'norm': None,
         'nn.functional.pixel_unshuffle': None,
         'nn.functional.pixel_shuffle': None,
         'nn.functional.cross_entropy': None,
         'nn.functional.one_hot': None,
-        'narrow': None,
-        'movedim': None,
         'minreduction_with_dim': None,
         'minreduction_no_dim': None,
         'minbinary': None,
@@ -7771,8 +7754,6 @@ class TestConsistency(TestCase):
         'maxbinary': None,
         'maximum': None,
         'minimum': None,
-        'mT': None,
-        'mH': None,
         'outer': None,
         'softmaxwith_dtype': None,
         'rounddecimals_neg_3': None,
@@ -7786,11 +7767,8 @@ class TestConsistency(TestCase):
         'trapezoid': None,
         'eq': None,
         'mul': None,
-        'cartesian_prod': None,
         'nonzero': None,
-        'bool': None,
         'inner': None,
-        'dstack': None,
         'take_along_dim': None,
 
         # New block list ops that need investigation
@@ -7823,11 +7801,9 @@ class TestConsistency(TestCase):
         'clamp_max': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
         'clamp_min': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
         'constant_pad_nd': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'corrcoef': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
         'cos': ['torch.bool'], 
         'cosh': ['torch.bool'], 
         'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'cov': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
         'diff': ['torch.bool', 'torch.uint8'], 
         'eig': ['torch.float32'], 
         'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
@@ -7842,9 +7818,8 @@ class TestConsistency(TestCase):
         'fft.rfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
         'fft.rfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
         'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'float': ['torch.bool', 'torch.float16', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'float': ['torch.int64'],
         'gather': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'gradient': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32'], 
         'half': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
         'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
         'index_select': ['torch.uint8'], 
@@ -7884,7 +7859,6 @@ class TestConsistency(TestCase):
         'nn.functional.normalize': ['torch.float32'], 
         'nn.functional.pad': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'], 
         'nn.functional.pairwise_distance': ['torch.uint8'], 
-        'nn.functional.softsign': ['torch.int32'], 
         'nn.functional.triplet_margin_loss': ['torch.uint8'], 
         'nn.functional.triplet_margin_with_distance_loss': ['torch.uint8'], 
         'nn.functional.upsample_nearest': ['torch.float32'], 
@@ -7910,17 +7884,9 @@ class TestConsistency(TestCase):
         'tanh': ['torch.bool'], 
         'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'true_divide': ['torch.int32', 'torch.int64'],
+        'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.local_response_norm': ['torch.int64'],
-        'flip': ['torch.bool'],
-        'fliplr': ['torch.bool'],
-        'flipud': ['torch.bool'],
         'index_select': ['torch.bool'],
-        'repeat': ['torch.bool'],
-        'rot90': ['torch.bool'],
-        'tile': ['torch.bool'],
-        'split': ['torch.float32'],
     }
 
     # Used for accept mode only

From bd098fe8c6fb2446b652dd8526bc91e79a9ad16f Mon Sep 17 00:00:00 2001
From: chrisbbayley <105944653+chrisbbayley@users.noreply.github.com>
Date: Tue, 27 Sep 2022 16:39:53 -0700
Subject: [PATCH 1775/1922] Implement as_strided using existing MPSGraph Shape
 APIs (#109)

* Add asStrided implementation via MPSGraph shape APIs

* Fix dstDim in asStrided slice step

* Add stride and storage_offset to as_strided graph key, fix bugs in shape API as_strided implementation

* Add patterns for expandDims and reshapes to asStrided shapeAPI implementation

* Move permute SPI to use transpose APIs, fix to flatten logic to avoid squeezing outermost dimension

* Fixes from review

Co-authored-by: Chris Bayley <cbayley@apple.com>
---
 aten/src/ATen/native/mps/operations/View.mm | 353 +++++++++++++++++++-
 1 file changed, 349 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 0e35c7b2f642d..30ceb23b75d18 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -19,10 +19,12 @@
 };
 
 static std::string getStridedKey(const ScalarType& dtype, const IntArrayRef& base_shape,
-                          const IntArrayRef& new_shape, bool is_scatter)
+                                 const IntArrayRef& new_shape, const IntArrayRef& stride,
+                                 int64_t storage_offset, bool is_scatter)
 {
   return (is_scatter ? "scatter:" : "gather:") + getMPSTypeString(dtype) + "[" +
-         getArrayRefString(base_shape) + "]:[" + getArrayRefString(new_shape) + "]";
+         getArrayRefString(base_shape) + "]:[" + getArrayRefString(new_shape) + "]:[" +
+         getArrayRefString(stride) + "]:[" + to_string(storage_offset) + "]";
 }
 
 // initializes the MTLBuffers for tensor data and runs the MPSGraph for the view op
@@ -78,6 +80,336 @@
   return output;
 }
 
+MPSGraphTensor *permuteTensor(MPSGraph *graph, MPSGraphTensor *inputTensor, NSArray *permuteOrder) {
+  NSUInteger srcRank = [[inputTensor shape] count];
+  if (srcRank != [permuteOrder count])
+    return nil;
+
+  MPSGraphTensor *outputTensor = inputTensor;
+  std::vector<NSUInteger> dimensionOrder(srcRank);
+  std::iota (std::begin(dimensionOrder), std::end(dimensionOrder), 0);
+
+  for (NSUInteger i = 0; i < srcRank; i++) {
+    NSUInteger axis = [permuteOrder[i] integerValue];
+    auto axisIter = std::find(dimensionOrder.begin(), dimensionOrder.end(), axis);
+    NSUInteger axis1 = i;
+    NSUInteger axis2 = axisIter - dimensionOrder.begin();
+    iter_swap(dimensionOrder.begin() + i, axisIter);
+
+    outputTensor = [graph transposeTensor:outputTensor
+                                dimension:axis1
+                            withDimension:axis2
+                                     name:nil];
+  }
+
+  return outputTensor;
+}
+
+NSDictionary *getStrideToDimLengthOffsetDict(MPSGraphTensor *tensor, NSUInteger rank, NSUInteger offset) {
+  // Assuming input tensor has default strides
+  NSInteger stride = 1;
+  NSMutableDictionary *strideToDimLengthOffset = [[NSMutableDictionary alloc] init];
+  for (NSInteger srcDim = rank - 1; srcDim >= 0; srcDim--) {
+    NSUInteger size = [[tensor shape][srcDim] integerValue];
+    NSDictionary *entry =
+    @{
+      @"dim": [NSNumber numberWithInteger:srcDim],
+      @"length": [tensor shape][srcDim],
+      @"offset": [NSNumber numberWithInteger:offset % size] // offset is determined traversing backwards through stride
+    };
+    [strideToDimLengthOffset setValue:entry forKey:[NSString stringWithFormat:@"%ld",stride]];
+    offset /= size;
+    stride *= size;
+  }
+  return strideToDimLengthOffset;
+}
+
+// Detect only expand dims, allows for duplicate strides
+MPSGraphTensor* asStridedLayer_expandDimsPattern(MPSGraph *graph, MPSGraphTensor *inputTensor, int dstRank, const IntArrayRef& dstSizes, const IntArrayRef& dstStrides, int offset) {
+
+  NSUInteger srcRank = [[inputTensor shape] count];
+  // Not an expand dims
+  if (srcRank >= dstRank)
+    return nil;
+
+  NSMutableArray *expandAxes = [[NSMutableArray alloc] init];
+
+  BOOL isValidExpand = YES;
+  NSInteger currSrcDim = (NSInteger)srcRank - 1;
+  NSUInteger currSrcStride = 1;
+  for (NSInteger dstDim = dstRank - 1; dstDim >= 0 && isValidExpand; dstDim--) {
+    NSUInteger currDimLength = dstSizes[dstDim];
+    NSUInteger currStride = dstStrides[dstDim];
+    NSUInteger currSrcDimLength = currSrcDim >= 0 ? [[inputTensor shape][currSrcDim] integerValue] : 1;
+
+    NSUInteger targetDimLength =  currSrcDimLength;
+    if (currDimLength != targetDimLength)
+      targetDimLength = 1;
+    if (currDimLength != targetDimLength && currStride != currSrcStride)
+      isValidExpand = NO;
+    if (currSrcDim >= 0 && currSrcDimLength == targetDimLength) {
+      currSrcStride *= currSrcDimLength;
+      currSrcDim--;
+    } else {
+      [expandAxes addObject:[NSNumber numberWithInt:dstDim]];
+    }
+  }
+
+  // Did not use every dimension of source
+  if (!isValidExpand || currSrcDim >= 0) {
+    [expandAxes release];
+    return nil;
+  }
+
+  MPSGraphTensor *expandTensor = inputTensor;
+  if ([expandAxes count]) {
+    expandTensor = [graph expandDimsOfTensor:expandTensor
+                                        axes:expandAxes
+                                        name:nil];
+  }
+  [expandAxes release];
+
+  return expandTensor;
+}
+
+// Detect contiguous reshapes, no slicing
+MPSGraphTensor* asStridedLayer_reshapePattern(MPSGraph *graph, MPSGraphTensor *inputTensor, int dstRank, const IntArrayRef& dstSizes, const IntArrayRef& dstStrides, int offset) {
+  NSUInteger srcRank = [[inputTensor shape] count];
+  // Not a reshape
+  if (srcRank <= dstRank)
+    return nil;
+
+  NSMutableArray *dstShape = [[NSMutableArray alloc] init];
+
+  BOOL isValidReshape = YES;
+  NSInteger srcDim = srcRank - 1;
+  NSUInteger srcStride = 1;
+  for (NSInteger dstDim = dstRank - 1; dstDim >= 0 && isValidReshape; dstDim--) {
+    NSUInteger currDimLength = dstSizes[dstDim];
+    NSUInteger currStride = dstStrides[dstDim];
+    [dstShape insertObject:[NSNumber numberWithInteger:currDimLength] atIndex: 0];
+
+    NSUInteger targetDimLength = currDimLength;
+    NSUInteger currReshapeSize = 1;
+    NSUInteger innerStride = srcStride;
+    do {
+      NSUInteger srcDimLength = [[inputTensor shape][srcDim] integerValue];
+      currReshapeSize *= srcDimLength;
+      srcStride *= srcDimLength;
+
+      srcDim--;
+    } while(currReshapeSize != targetDimLength && srcDim >= 0);
+
+    isValidReshape &= (currReshapeSize == targetDimLength && currStride == innerStride);
+  }
+  isValidReshape &= (srcDim < 0);
+
+  MPSGraphTensor *outputTensor = nil;
+  if (isValidReshape)
+    outputTensor = [graph reshapeTensor: inputTensor
+                              withShape: dstShape
+                                   name: nil];
+  [dstShape release];
+  return outputTensor;
+}
+
+MPSGraphTensor* asStridedLayer_genericPattern(MPSGraph *graph, MPSGraphTensor *inputTensor, int dstRank, const IntArrayRef& dstSizes, const IntArrayRef& dstStrides, int offset) {
+
+  // Duplicate strides cannot be done
+  {
+    BOOL allUnique = YES;
+    NSMutableSet *uniqueStrides = [[NSMutableSet alloc] init];
+    for (NSInteger dstDim = 0; (dstDim < dstRank) && allUnique; dstDim++) {
+      int stride = dstStrides[dstDim];
+      NSNumber *strideObj = [NSNumber numberWithInt:stride];
+      allUnique &= (stride == 0 || ![uniqueStrides containsObject:strideObj]);
+      [uniqueStrides addObject: strideObj];
+    }
+    [uniqueStrides release];
+    if (!allUnique)
+      return nil;
+
+    // Skip for zero in dst shape
+    for (NSInteger dstDim = 0; dstDim < dstRank; dstDim++)
+      if (dstSizes[dstDim] == 0) { return nil; }
+  }
+
+  // 1. Flatten the inputTensor if neccessary
+  MPSGraphTensor *flatInputTensor = inputTensor;
+  {
+    // Flatten inputs to remove duplicate strides.
+    NSMutableArray *squeezeAxes = [[NSMutableArray alloc] init];
+    for(NSUInteger srcDim = 1; srcDim < [[flatInputTensor shape] count]; srcDim++) {
+        if ([[flatInputTensor shape][srcDim] intValue] == 1)
+            [squeezeAxes addObject:[NSNumber numberWithInteger:srcDim]];
+    }
+    // We have to leave at least 1 dimension, if all input dims are 1
+    if ([squeezeAxes count])
+        flatInputTensor = [graph squeezeTensor:flatInputTensor
+                                          axes:squeezeAxes
+                                          name:nil];
+    [squeezeAxes release];
+  }
+
+  int srcRank = (int)[[flatInputTensor shape] count];
+  NSDictionary *srcStrideToDimLengthOffset = getStrideToDimLengthOffsetDict(flatInputTensor, srcRank, offset);
+
+  // Populate the dimension order, slice info, and broadcast info
+  NSMutableArray *dstDimOrder = [[NSMutableArray alloc] init];
+  std::vector<int32_t> dstDimToSliceLength(dstRank);
+  std::vector<int32_t> dstDimToSliceOffset(dstRank);
+  bool needsBroadcast = false;
+  {
+    for (NSInteger dstDim = dstRank - 1; dstDim >= 0; dstDim--) {
+      if (dstStrides[dstDim] == 0) {
+        // This dimension should be a broadcast
+        needsBroadcast = true;
+        dstDimToSliceLength[dstDim] = dstSizes[dstDim];
+        dstDimToSliceOffset[dstDim] = 0;
+      } else {
+        // Find what dimension and native length was for the specified stride
+        NSDictionary *srcDimLengthOffset = srcStrideToDimLengthOffset[[NSString stringWithFormat:@"%lld",dstStrides[dstDim]]];
+
+        // Stride does not exist in source tensor, or the specified size is too long. Not possible
+        // TODO: Longer length with same stride + removal of dim(s) above this is a flatten/reshape. Consider adding support
+        if (!srcDimLengthOffset || dstSizes[dstDim] > [srcDimLengthOffset[@"length"] intValue])
+          return nil;
+
+        // Get the src dimension corresponding to the requested stride
+        NSNumber *srcDim = srcDimLengthOffset[@"dim"];
+        [dstDimOrder insertObject:srcDim atIndex:0];
+
+        dstDimToSliceLength[dstDim] = dstSizes[dstDim];
+        dstDimToSliceOffset[dstDim] = [srcDimLengthOffset[@"offset"] intValue];
+      }
+    }
+  }
+
+  // 2. Slice out any unused dimensions
+  NSMutableArray *missingSrcDims = [[NSMutableArray alloc] init];
+  MPSGraphTensor *slicedUnusedTensor = flatInputTensor;
+  {
+    // Find any src strides/dims that are not present in the dst
+    NSMutableArray *missingSrcStrides = [[NSMutableArray alloc] init];
+    {
+      NSUInteger stride = 1;
+      for (NSInteger srcDim = [[flatInputTensor shape] count] - 1; srcDim >= 0; srcDim--) {
+        [missingSrcStrides addObject:[NSNumber numberWithInteger:stride]];
+        stride *= [[flatInputTensor shape][srcDim] integerValue];
+      }
+      for (NSInteger dstDim = 0; dstDim < dstRank; dstDim++) {
+        [missingSrcStrides removeObject:[NSNumber numberWithInteger:dstStrides[dstDim]]];
+      }
+    }
+    for (NSUInteger i = 0; i < [missingSrcStrides count]; i++) {
+      NSUInteger stride = [missingSrcStrides[i] integerValue];
+      NSDictionary *srcDimLengthOffset = srcStrideToDimLengthOffset[[NSString stringWithFormat:@"%ld",stride]];
+      NSNumber *missingSrcDim = srcDimLengthOffset[@"dim"];
+      [missingSrcDims addObject:missingSrcDim];
+      [dstDimOrder insertObject:missingSrcDim atIndex:0];
+
+      slicedUnusedTensor = [graph sliceTensor:slicedUnusedTensor
+                                    dimension:[missingSrcDim intValue]
+                                        start:[srcDimLengthOffset[@"offset"] intValue]
+                                       length:1
+                                         name:nil];
+    }
+    [missingSrcStrides release];
+  }
+
+  // 3. Transpose if necessary
+  MPSGraphTensor *transposedTensor = slicedUnusedTensor;
+  {
+    // TODO: Use Transpose API
+    BOOL needsTranspose = NO;
+    for(NSUInteger dstDim = 0; dstDim < [dstDimOrder count] && !needsTranspose; dstDim++ )
+      needsTranspose |= ([dstDimOrder[dstDim] intValue] != dstDim);
+    if (needsTranspose)
+      transposedTensor = permuteTensor(graph, transposedTensor, dstDimOrder);
+  }
+
+  // 4. Squeeze any unused dimensions following transpose
+  MPSGraphTensor *squeezedTensor = transposedTensor;
+  {
+    // Transpose the missing dims back
+    NSMutableArray *transposedMissingSrcDims = [[NSMutableArray alloc] init];
+    for (NSUInteger dstDim = 0; dstDim < [dstDimOrder count]; dstDim++) {
+      NSNumber *srcDim = dstDimOrder[dstDim];
+      if ([missingSrcDims containsObject:srcDim])
+        [transposedMissingSrcDims addObject:[NSNumber numberWithInt:dstDim]];
+    }
+    if ([transposedMissingSrcDims count])
+      squeezedTensor = [graph squeezeTensor:squeezedTensor
+                                       axes:transposedMissingSrcDims
+                                       name:nil];
+    [transposedMissingSrcDims release];
+  }
+
+  // 5. Slice
+  MPSGraphTensor *slicedTensor = squeezedTensor;
+  {
+    NSUInteger currDstDim = 0;
+    for (NSUInteger dstDim = 0; dstDim < dstRank; dstDim++) {
+      // Only dstDims with nonzero stride are in the current tensor, skip broadcasts
+      if (dstStrides[dstDim] != 0) {
+        int start = dstDimToSliceOffset[dstDim];
+        int length = dstDimToSliceLength[dstDim];
+        if (length != [[slicedTensor shape][currDstDim] intValue])
+          slicedTensor = [graph sliceTensor:slicedTensor
+                                  dimension:currDstDim
+                                      start:start
+                                     length:length
+                                       name:nil];
+        currDstDim++;
+      }
+    }
+  }
+
+  // 6. Expand then broadcast the source tensor
+  MPSGraphTensor *broadcastTensor = slicedTensor;
+  if (needsBroadcast) {
+    NSMutableArray *broadcastShape = [[NSMutableArray alloc] init];
+    NSMutableArray *expandAxes = [[NSMutableArray alloc] init];
+    for(NSInteger dstDim = 0; dstDim < dstRank; dstDim++) {
+      [broadcastShape addObject:[NSNumber numberWithInt:dstSizes[dstDim]]];
+      if (dstStrides[dstDim] == 0)
+        [expandAxes addObject:[NSNumber numberWithInt:dstDim]];
+    }
+
+    if ([expandAxes count]) {
+      MPSGraphTensor *expandTensor = [graph expandDimsOfTensor:broadcastTensor
+                                                          axes:expandAxes
+                                                          name:nil];
+      broadcastTensor = [graph broadcastTensor:expandTensor
+                                       toShape:broadcastShape
+                                          name:nil];
+    }
+    [broadcastShape release];
+    [expandAxes release];
+  }
+
+  [srcStrideToDimLengthOffset release];
+  [dstDimOrder release];
+  [missingSrcDims release];
+
+  return broadcastTensor;
+}
+
+MPSGraphTensor* asStridedLayer_pattern(MPSGraph *graph, MPSGraphTensor *inputTensor, int dstRank, const IntArrayRef& dstSizes, const IntArrayRef& dstStrides, int offset) {
+  if (!dstRank)
+    return nil;
+
+  MPSGraphTensor *outputTensor = nil;
+  outputTensor = asStridedLayer_expandDimsPattern(graph, inputTensor, dstRank, dstSizes, dstStrides, offset);
+  if (!outputTensor)
+    outputTensor = asStridedLayer_reshapePattern(graph, inputTensor, dstRank, dstSizes, dstStrides, offset);
+  if (!outputTensor)
+    outputTensor = asStridedLayer_genericPattern(graph, inputTensor, dstRank, dstSizes, dstStrides, offset);
+
+  return outputTensor;
+}
+
+
 static MPSGraphTensor* chainViewOperation(ViewCachedGraph* cachedGraph, const IntArrayRef& size,
                                           const IntArrayRef& stride, int64_t offset,
                                           const IntArrayRef& base_shape, bool needsScatter,
@@ -131,6 +463,19 @@
                                     name:@"Cast away from bool"];
     }
 
+    if (!needsScatter) {
+      MPSGraphTensor *outputTensor = asStridedLayer_pattern(mpsGraph, inputTensor, shape_size, size, stride, offset);
+
+      if (outputTensor) {
+        if (needsBoolCast) {
+          outputTensor = [mpsGraph castTensor:outputTensor
+                                       toType:MPSDataTypeBool
+                                         name:@"Cast back to bool"];
+        }
+        return outputTensor;
+      }
+    }
+
     MPSGraphTensor *reshapedInputTensor = [mpsGraph reshapeTensor: inputTensor
                                                         withShape: @[@-1]
                                                              name: nil];
@@ -205,7 +550,7 @@
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
   @autoreleasepool {
-    string key = getStridedKey(self.scalar_type(), base_shape, size, needsScatter);
+    string key = getStridedKey(self.scalar_type(), base_shape, size, stride, storage_offset, needsScatter);
     ViewCachedGraph* cachedGraph = static_cast<ViewCachedGraph *>(cache_->LookUp(key));
 
     if (!cachedGraph) {
@@ -245,7 +590,7 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst)
 
   const IntArrayRef& base_shape = get_buffer_shape(src.storage().data());
   if (base_shape.size() > 0) {
-    string key = getStridedKey(src.scalar_type(), base_shape, src.sizes(), /*is_scatter*/ false);
+    string key = getStridedKey(src.scalar_type(), base_shape, src.sizes(), src.strides(), src.storage_offset(), /*is_scatter*/ false);
     cachedGraph = static_cast<ViewCachedGraph *>(MPSGraphCache::getInstance()->LookUp(key));
   }
   // there are cases where gatherViewTensor() is called without having as_strided() called beforehand.

From 860b77189620c48b6d927d796809ece0ef912f40 Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Thu, 29 Sep 2022 14:24:54 -0700
Subject: [PATCH 1776/1922] Handle empty input in layer norm (#123)

* Handle empty input in layer norm

* Add layer norm to allowlist
---
 aten/src/ATen/native/mps/operations/Normalization.mm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm
index 49f1e0538463f..b06049f40bf39 100644
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@@ -871,7 +871,7 @@ string get_mem_string(c10::MemoryFormat memory_format) {
   const int normalized_ndim = normalized_shape.size();
   // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   const int axis = input_ndim - normalized_ndim;
-  at::Tensor input_reshaped = input.reshape({1, M, -1});
+  at::Tensor input_reshaped = input.numel() == 0 ? input.reshape({1, M, 0}) : input.reshape({1, M, -1});
   // Unlike Batch Normalization, which applies scalar scale and bias for each
   // entire channel/plane with the affine option, Layer Normalization applies
   // per-element scale and bias. E.g. For input {N, C, H, W}, weight for

From 5fc5802ddd037857c9737fd6cca8a6e8e4abb9c3 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 30 Sep 2022 18:12:22 -0700
Subject: [PATCH 1777/1922] Fix slice from view tensors (#129)

* Fix slice from view tensors

* Update test name
---
 aten/src/ATen/native/mps/operations/Copy.mm | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index 2bfee3f9a393e..632ad32724f1a 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -115,8 +115,8 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
     src = src_;
   }
   id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
-  size_t dst_tensor_nbytes = dst.nbytes();
-
+  size_t dst_tensor_nbytes = dst.is_view() ? at::detail::computeStorageNbytesContiguous(dst.sizes(), dst.element_size(), dst.storage_offset()) :
+                                             dst.nbytes();
   @autoreleasepool {
     MTLResourceOptions options = MTLResourceOptionCPUCacheModeDefault | MTLResourceStorageModeShared;
     NSUInteger alignedLength = 0;
@@ -195,6 +195,9 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
     src_total_size = src.nbytes();
   }
 
+  size_t dst_tensor_nbytes = dst_.is_view() ? at::detail::computeStorageNbytesContiguous(dst_.sizes(), dst_.element_size(), dst_.storage_offset()) :
+                                              dst_.nbytes();
+
   const size_t size_to_copy = src.nbytes();
   const void* host_src = src.storage().data();
   TORCH_INTERNAL_ASSERT(src_total_size >= (src.storage_offset() * src.element_size()));

From 0105c9b8f1b703cccaf392b7d0e884103bc83b8c Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Fri, 30 Sep 2022 18:55:33 -0700
Subject: [PATCH 1778/1922] Cast dot inputs to int32 when needed (#121)

* Cast dot inputs to int32 when needed

* Add dot uint8 to allowlist

* Add support for int8 input

* Fix indentation
---
 test/test_mps.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_mps.py b/test/test_mps.py
index 700f8821ecbba..79823c995931c 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7887,6 +7887,10 @@ class TestConsistency(TestCase):
         'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.local_response_norm': ['torch.int64'],
         'index_select': ['torch.bool'],
+        'repeat': ['torch.bool'],
+        'rot90': ['torch.bool'],
+        'tile': ['torch.bool'],
+        'split': ['torch.float32'],
     }
 
     # Used for accept mode only

From 1b32ddcb536fed20a387f8616f4251bf3400bcc6 Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Mon, 3 Oct 2022 14:17:28 -0700
Subject: [PATCH 1779/1922] Raise error for dot int64 input (#130)

---
 aten/src/ATen/native/mps/operations/Blas.mm | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm
index 20a3ec5eb6db4..31b0592620018 100644
--- a/aten/src/ATen/native/mps/operations/Blas.mm
+++ b/aten/src/ATen/native/mps/operations/Blas.mm
@@ -21,6 +21,9 @@ Tensor dot_mps(
   const Tensor &self,
   const Tensor &other)
 {
+
+  TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS: dot op doesn't support int64 input")
+
   using namespace mps;
   auto output = at::native::empty_mps({}, self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt);
 

From bf7342888b5de736c6e69c62b3a2d2e84aed522e Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 4 Oct 2022 11:36:41 -0400
Subject: [PATCH 1780/1922] Add MPSHooks interface to enable accessing specific
 MPS functions globally (#128)

---
 aten/src/ATen/Context.cpp                  | 12 ------
 aten/src/ATen/Context.h                    |  6 ++-
 aten/src/ATen/detail/MPSHooksInterface.cpp | 32 ++++++++++++++
 aten/src/ATen/detail/MPSHooksInterface.h   | 49 ++++++++++++++++++++++
 aten/src/ATen/mps/MPSHooks.cpp             | 35 ++++++++++++++++
 aten/src/ATen/mps/MPSHooks.h               | 17 ++++++++
 6 files changed, 137 insertions(+), 14 deletions(-)
 create mode 100644 aten/src/ATen/detail/MPSHooksInterface.cpp
 create mode 100644 aten/src/ATen/detail/MPSHooksInterface.h
 create mode 100644 aten/src/ATen/mps/MPSHooks.cpp
 create mode 100644 aten/src/ATen/mps/MPSHooks.h

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
index 936e9b6252863..256a4bd9e5fdf 100644
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@@ -21,10 +21,6 @@
 #include <fbgemm/Fbgemm.h>
 #endif // USE_FBGEMM
 
-#ifdef USE_MPS
-#include <ATen/mps/MPSDevice.h>
-#endif
-
 namespace at {
 
 Context::Context() = default;
@@ -270,14 +266,6 @@ bool Context::hasMKLDNN() {
 #endif
 }
 
-bool Context::hasMPS() {
-#if USE_MPS
-  return at::mps::is_available();
-#else
-  return false;
-#endif
-}
-
 bool Context::hasOpenMP() {
 #ifdef _OPENMP
   return true;
diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
index 48e3c935a2c0c..7f23503c36bcb 100644
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@@ -9,6 +9,7 @@
 #include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/detail/HIPHooksInterface.h>
 #include <ATen/detail/ORTHooksInterface.h>
+#include <ATen/detail/MPSHooksInterface.h>
 #include <c10/core/QEngine.h>
 #include <c10/core/impl/DeviceGuardImplInterface.h>
 #include <c10/util/CallOnce.h>
@@ -83,6 +84,9 @@ class TORCH_API Context {
   static bool hasHIP() {
     return detail::getHIPHooks().hasHIP();
   }
+  static bool hasMPS() {
+    return detail::getMPSHooks().hasMPS();
+  }
   static bool hasIPU() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::IPU);
   }
@@ -92,8 +96,6 @@ class TORCH_API Context {
   static bool hasLazy() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::Lazy);
   }
-  static bool hasMPS();
-
   static bool hasORT() {
     return c10::impl::hasDeviceGuardImpl(at::DeviceType::ORT);
   }
diff --git a/aten/src/ATen/detail/MPSHooksInterface.cpp b/aten/src/ATen/detail/MPSHooksInterface.cpp
new file mode 100644
index 0000000000000..87cd26d517985
--- /dev/null
+++ b/aten/src/ATen/detail/MPSHooksInterface.cpp
@@ -0,0 +1,32 @@
+#include <ATen/detail/MPSHooksInterface.h>
+#include <c10/util/Exception.h>
+
+namespace at {
+namespace detail {
+
+const MPSHooksInterface& getMPSHooks() {
+  static std::unique_ptr<MPSHooksInterface> mps_hooks;
+#if !defined C10_MOBILE
+  static std::once_flag once;
+  std::call_once(once, [] {
+    mps_hooks = MPSHooksRegistry()->Create("MPSHooks", MPSHooksArgs{});
+    if (!mps_hooks) {
+      mps_hooks =
+          // NOLINTNEXTLINE(modernize-make-unique)
+          std::unique_ptr<MPSHooksInterface>(new MPSHooksInterface());
+    }
+  });
+#else
+  if (mps_hooks == nullptr) {
+    mps_hooks =
+        // NOLINTNEXTLINE(modernize-make-unique)
+        std::unique_ptr<MPSHooksInterface>(new MPSHooksInterface());
+  }
+#endif
+  return *mps_hooks;
+}
+} // namespace detail
+
+C10_DEFINE_REGISTRY(MPSHooksRegistry, MPSHooksInterface, MPSHooksArgs)
+
+} // namespace at
diff --git a/aten/src/ATen/detail/MPSHooksInterface.h b/aten/src/ATen/detail/MPSHooksInterface.h
new file mode 100644
index 0000000000000..382bcd3255d13
--- /dev/null
+++ b/aten/src/ATen/detail/MPSHooksInterface.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <ATen/core/Generator.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Registry.h>
+
+#include <cstddef>
+#include <functional>
+
+namespace at {
+class Context;
+}
+
+namespace at {
+
+struct TORCH_API MPSHooksInterface {
+  virtual ~MPSHooksInterface() {}
+
+  // Initialize the MPS library state
+  virtual void initMPS() const {
+    AT_ERROR("Cannot initialize MPS without MPS backend.");
+  }
+
+  virtual bool hasMPS() const {
+    return false;
+  }
+
+  virtual const Generator& getDefaultMPSGenerator(DeviceIndex device_index = -1) const {
+    (void)device_index; // Suppress unused variable warning
+    AT_ERROR("Cannot get default MPS generator without MPS backend.");
+  }
+
+  virtual Allocator* getMPSDeviceAllocator() const {
+    AT_ERROR("MPSDeviceAllocator requires MPS.");
+  }
+};
+
+struct TORCH_API MPSHooksArgs {};
+
+C10_DECLARE_REGISTRY(MPSHooksRegistry, MPSHooksInterface, MPSHooksArgs);
+#define REGISTER_MPS_HOOKS(clsname) \
+  C10_REGISTER_CLASS(MPSHooksRegistry, clsname, clsname)
+
+namespace detail {
+TORCH_API const MPSHooksInterface& getMPSHooks();
+
+} // namespace detail
+} // namespace at
diff --git a/aten/src/ATen/mps/MPSHooks.cpp b/aten/src/ATen/mps/MPSHooks.cpp
new file mode 100644
index 0000000000000..bbf7234462189
--- /dev/null
+++ b/aten/src/ATen/mps/MPSHooks.cpp
@@ -0,0 +1,35 @@
+#include <ATen/mps/MPSHooks.h>
+
+#include <ATen/Context.h>
+#include <ATen/mps/MPSDevice.h>
+#include <ATen/detail/MPSHooksInterface.h>
+#include <c10/util/irange.h>
+
+#include <sstream>
+#include <cstddef>
+#include <functional>
+#include <memory>
+
+namespace at {
+namespace mps {
+
+void MPSHooks::initMPS() const {
+  C10_LOG_API_USAGE_ONCE("aten.init.mps");
+  // TODO: initialize MPS devices and streams here
+}
+
+bool MPSHooks::hasMPS() const {
+  return at::mps::is_available();
+}
+
+Allocator* MPSHooks::getMPSDeviceAllocator() const {
+  return at::mps::GetMPSAllocator();
+}
+
+using at::MPSHooksRegistry;
+using at::RegistererMPSHooksRegistry;
+
+REGISTER_MPS_HOOKS(MPSHooks);
+
+} // namespace mps
+} // namespace at
diff --git a/aten/src/ATen/mps/MPSHooks.h b/aten/src/ATen/mps/MPSHooks.h
new file mode 100644
index 0000000000000..13647d83c740b
--- /dev/null
+++ b/aten/src/ATen/mps/MPSHooks.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <ATen/detail/MPSHooksInterface.h>
+#include <ATen/Generator.h>
+#include <c10/util/Optional.h>
+
+namespace at { namespace mps {
+
+// The real implementation of MPSHooksInterface
+struct MPSHooks : public at::MPSHooksInterface {
+  MPSHooks(at::MPSHooksArgs) {}
+  void initMPS() const override;
+  bool hasMPS() const override;
+  Allocator* getMPSDeviceAllocator() const override;
+};
+
+}} // at::mps

From bc3ff0a5d410ceee6203e3c3f2173b84217827ef Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Tue, 4 Oct 2022 23:24:53 -0700
Subject: [PATCH 1781/1922] Softplus and einsum are crashing.

---
 test/test_mps.py | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 79823c995931c..ffe2190403a1b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -3904,29 +3904,29 @@ def helper(shape, dim=0):
             for dim in range(len(shape)):
                 helper(shape, dim)
 
-    # Test softplus
-    def test_softplus(self):
-        def helper(shape, beta=1, threshold=20):
-            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
-            x = cpu_x.detach().clone().to('mps').requires_grad_()
+    # # Test softplus
+    # def test_softplus(self):
+        # def helper(shape, beta=1, threshold=20):
+            # cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
+            # x = cpu_x.detach().clone().to('mps').requires_grad_()
 
-            softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x)
-            softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x)
+            # softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x)
+            # softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x)
 
-            cpu_grad = torch.randn(softplus_result.shape)
-            grad = cpu_grad.to('mps')
+            # cpu_grad = torch.randn(softplus_result.shape)
+            # grad = cpu_grad.to('mps')
 
-            softplus_result.backward(gradient=grad)
-            softplus_result_cpu.backward(gradient=cpu_grad)
+            # softplus_result.backward(gradient=grad)
+            # softplus_result_cpu.backward(gradient=cpu_grad)
 
-            self.assertEqual(softplus_result, softplus_result_cpu)
-            self.assertEqual(x.grad, cpu_x.grad)
+            # self.assertEqual(softplus_result, softplus_result_cpu)
+            # self.assertEqual(x.grad, cpu_x.grad)
 
-        # Test empty shape too
-        for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
-            for beta in [0.5, 1, 2, 3, 4]:
-                for threshold in [0.5, 20, 30, 40, 50]:
-                    helper(shape, beta, threshold)
+        # # Test empty shape too
+        # for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
+            # for beta in [0.5, 1, 2, 3, 4]:
+                # for threshold in [0.5, 20, 30, 40, 50]:
+                    # helper(shape, beta, threshold)
 
     # Test silu
 
@@ -7320,7 +7320,6 @@ class TestConsistency(TestCase):
         'diff': ['f16', 'f32', 'i16', 'i32', 'i64'],
         'dist': ['f32'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'einsum': ['f32'],
         'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'erf': ['f32', 'i16', 'i32', 'u8'],
         'exp': ['f32', 'i16', 'i32', 'u8'],
@@ -7548,7 +7547,6 @@ class TestConsistency(TestCase):
         'diff': ['f16', 'f32'],
         'dist': ['f32'],
         'dot': ['f32'],
-        'einsum': ['f32'],
         'erf': ['f32'],
         'exp': ['f32'],
         'exp2': ['f16', 'f32'],
@@ -7667,6 +7665,7 @@ class TestConsistency(TestCase):
     # All the entries in this list should be removed
     BLOCKLIST = {
         # Functions that hang
+        'einsum': ['f32'],
         'masked_fill': [torch.bool, torch.uint8, torch.float32], 'where': [torch.bool],
         # + forward when requires_grad=True or running backward
         'masked.mean': [torch.bool, torch.float16],

From 0f112d440d93fb38dbfcc9d61f5ad28c4fdf941a Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 5 Oct 2022 19:05:25 -0400
Subject: [PATCH 1782/1922] Add MPSGenerator to enable custom random number
 generators on MPS backend (#131)

This patch will will add support for creating torch.Generator for MPS device,
and enables its functions such as manual_seed, get_state, and set_state.
---
 aten/src/ATen/Context.h                       |  11 +-
 aten/src/ATen/detail/MPSHooksInterface.cpp    |   5 +-
 aten/src/ATen/detail/MPSHooksInterface.h      |   7 +-
 aten/src/ATen/mps/MPSGeneratorImpl.h          |  52 +++++++++
 aten/src/ATen/mps/MPSGeneratorImpl.mm         | 100 ++++++++++++++++++
 aten/src/ATen/mps/MPSHooks.cpp                |  16 ++-
 aten/src/ATen/mps/MPSHooks.h                  |   3 +
 aten/src/ATen/native/mps/OperationUtils.h     |  18 ----
 aten/src/ATen/native/mps/OperationUtils.mm    |  56 +---------
 .../native/mps/operations/Distributions.mm    |  64 ++++++-----
 test/test_mps.py                              |  22 ++++
 torch/csrc/Generator.cpp                      |  18 +++-
 12 files changed, 247 insertions(+), 125 deletions(-)
 create mode 100644 aten/src/ATen/mps/MPSGeneratorImpl.h
 create mode 100644 aten/src/ATen/mps/MPSGeneratorImpl.mm

diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h
index 7f23503c36bcb..9f1c571b66968 100644
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@@ -8,8 +8,8 @@
 #include <ATen/core/LegacyTypeDispatch.h>
 #include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/detail/HIPHooksInterface.h>
-#include <ATen/detail/ORTHooksInterface.h>
 #include <ATen/detail/MPSHooksInterface.h>
+#include <ATen/detail/ORTHooksInterface.h>
 #include <c10/core/QEngine.h>
 #include <c10/core/impl/DeviceGuardImplInterface.h>
 #include <c10/util/CallOnce.h>
@@ -38,6 +38,8 @@ class TORCH_API Context {
       return at::detail::getDefaultCPUGenerator();
     } else if (device_type == at::kCUDA) {
       return at::detail::getCUDAHooks().getDefaultCUDAGenerator(device.index());
+    } else if (device_type == at::kMPS) {
+      return at::detail::getMPSHooks().getDefaultMPSGenerator();
     } else {
       AT_ERROR(DeviceTypeName(device_type), " device type not enabled.");
     }
@@ -421,6 +423,13 @@ static inline void manual_seed(uint64_t seed) {
       }
     }
   }
+
+  if (hasMPS()) {
+    auto mps_gen = globalContext().defaultGenerator(DeviceType::MPS);
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(mps_gen.mutex());
+    mps_gen.set_current_seed(seed);
+  }
 }
 
 // When the global flag `allow_tf32` is set to true, cuBLAS handles are
diff --git a/aten/src/ATen/detail/MPSHooksInterface.cpp b/aten/src/ATen/detail/MPSHooksInterface.cpp
index 87cd26d517985..823b2295b1ace 100644
--- a/aten/src/ATen/detail/MPSHooksInterface.cpp
+++ b/aten/src/ATen/detail/MPSHooksInterface.cpp
@@ -1,5 +1,6 @@
 #include <ATen/detail/MPSHooksInterface.h>
 #include <c10/util/Exception.h>
+#include <c10/util/CallOnce.h>
 
 namespace at {
 namespace detail {
@@ -7,8 +8,8 @@ namespace detail {
 const MPSHooksInterface& getMPSHooks() {
   static std::unique_ptr<MPSHooksInterface> mps_hooks;
 #if !defined C10_MOBILE
-  static std::once_flag once;
-  std::call_once(once, [] {
+  static c10::once_flag once;
+  c10::call_once(once, [] {
     mps_hooks = MPSHooksRegistry()->Create("MPSHooks", MPSHooksArgs{});
     if (!mps_hooks) {
       mps_hooks =
diff --git a/aten/src/ATen/detail/MPSHooksInterface.h b/aten/src/ATen/detail/MPSHooksInterface.h
index 382bcd3255d13..4fff139f27745 100644
--- a/aten/src/ATen/detail/MPSHooksInterface.h
+++ b/aten/src/ATen/detail/MPSHooksInterface.h
@@ -1,3 +1,5 @@
+//  Copyright © 2022 Apple Inc.
+
 #pragma once
 
 #include <c10/core/Allocator.h>
@@ -15,7 +17,7 @@ class Context;
 namespace at {
 
 struct TORCH_API MPSHooksInterface {
-  virtual ~MPSHooksInterface() {}
+  virtual ~MPSHooksInterface() = default;
 
   // Initialize the MPS library state
   virtual void initMPS() const {
@@ -26,8 +28,7 @@ struct TORCH_API MPSHooksInterface {
     return false;
   }
 
-  virtual const Generator& getDefaultMPSGenerator(DeviceIndex device_index = -1) const {
-    (void)device_index; // Suppress unused variable warning
+  virtual const Generator& getDefaultMPSGenerator() const {
     AT_ERROR("Cannot get default MPS generator without MPS backend.");
   }
 
diff --git a/aten/src/ATen/mps/MPSGeneratorImpl.h b/aten/src/ATen/mps/MPSGeneratorImpl.h
new file mode 100644
index 0000000000000..9695eb719274c
--- /dev/null
+++ b/aten/src/ATen/mps/MPSGeneratorImpl.h
@@ -0,0 +1,52 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include <ATen/core/Generator.h>
+#include <ATen/core/PhiloxRNGEngine.h>
+#include <c10/core/GeneratorImpl.h>
+#include <c10/util/Optional.h>
+
+namespace at {
+namespace mps {
+namespace detail {
+
+static const uint32_t PHILOX_STATE_N = 7;
+struct rng_data_pod {
+  std::array<uint32_t, PHILOX_STATE_N> state{1};
+  uint64_t seed = default_rng_seed_val;
+};
+
+TORCH_API const Generator& getDefaultMPSGenerator();
+TORCH_API Generator createMPSGenerator(uint64_t seed_val = default_rng_seed_val);
+
+} // namespace detail
+} // namespace mps
+
+struct TORCH_API MPSGeneratorImpl : public c10::GeneratorImpl {
+  // Constructors
+  MPSGeneratorImpl(uint64_t seed_in = default_rng_seed_val);
+  ~MPSGeneratorImpl() override = default;
+
+  // MPSGeneratorImpl methods
+  std::shared_ptr<MPSGeneratorImpl> clone() const;
+  void set_current_seed(uint64_t seed) override;
+  uint64_t current_seed() const override;
+  uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
+  void update_philox_counters();
+
+  void set_engine(at::Philox4_32 engine) { engine_ = engine; };
+  at::Philox4_32 engine() { return engine_; };
+  uint32_t* state_data() { return data_.state.data(); }
+  static DeviceType device_type() { return DeviceType::MPS; };
+
+private:
+  mps::detail::rng_data_pod data_;
+  at::Philox4_32 engine_;
+
+  MPSGeneratorImpl* clone_impl() const override;
+};
+
+} // namespace at
diff --git a/aten/src/ATen/mps/MPSGeneratorImpl.mm b/aten/src/ATen/mps/MPSGeneratorImpl.mm
new file mode 100644
index 0000000000000..8f2d5168b71b8
--- /dev/null
+++ b/aten/src/ATen/mps/MPSGeneratorImpl.mm
@@ -0,0 +1,100 @@
+//  Copyright © 2022 Apple Inc.
+
+#include <ATen/Utils.h>
+#include <ATen/mps/MPSGeneratorImpl.h>
+#include <algorithm>
+
+namespace at {
+namespace mps {
+namespace detail {
+
+const Generator& getDefaultMPSGenerator() {
+  static auto default_gen_mps = createMPSGenerator(c10::detail::getNonDeterministicRandom());
+  return default_gen_mps;
+}
+
+Generator createMPSGenerator(uint64_t seed_val) {
+  auto gen = make_generator<MPSGeneratorImpl>(seed_val);
+  gen.set_current_seed(seed_val);
+  return gen;
+}
+
+} // namespace detail
+} // namespace mps
+
+MPSGeneratorImpl::MPSGeneratorImpl(uint64_t seed_in)
+  : c10::GeneratorImpl{Device(DeviceType::MPS), DispatchKeySet(c10::DispatchKey::MPS)},
+    data_({.seed = seed_in}), engine_(seed_in, 0, 0) { }
+
+void MPSGeneratorImpl::set_current_seed(uint64_t seed) {
+  data_.seed = seed;
+  data_.state.fill(1);
+  // the two last state values are the Philox keys
+  // TODO: make "key" in PhiloxRNGEngine.h public so we don't duplicate code here
+  data_.state[5] = static_cast<uint32_t>(seed);
+  data_.state[6] = static_cast<uint32_t>(seed >> 32);
+  engine_.reset_state(seed);
+}
+
+uint64_t MPSGeneratorImpl::current_seed() const {
+  return data_.seed;
+}
+
+uint64_t MPSGeneratorImpl::seed() {
+  auto random = c10::detail::getNonDeterministicRandom();
+  this->set_current_seed(random);
+  return random;
+}
+
+// See Note [Acquire lock when using random generators]
+void MPSGeneratorImpl::update_philox_counters() {
+  // calling engine_() would call operator() of philox_engine class to
+  // get each of the four newly generated counter values (see PhiloxRNGEngine.h).
+  for (int i = 1; i <= 4; i++) {
+    data_.state[i] = engine_();
+  }
+}
+
+c10::intrusive_ptr<c10::TensorImpl> MPSGeneratorImpl::get_state() const {
+  static const size_t states_size = mps::detail::PHILOX_STATE_N * sizeof(uint32_t);
+  static const size_t seed_size = sizeof(uint64_t);
+  static const size_t total_size = states_size + seed_size;
+
+  auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
+  auto rng_state = state_tensor.data_ptr<uint8_t>();
+  auto current_seed = this->current_seed();
+  memcpy(rng_state, this->data_.state.data(), states_size);
+  memcpy(rng_state + states_size, &current_seed, seed_size);
+
+  return state_tensor.getIntrusivePtr();
+}
+
+void MPSGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
+  static const size_t states_size = mps::detail::PHILOX_STATE_N * sizeof(uint32_t);
+  static const size_t seed_size = sizeof(uint64_t);
+  static const size_t total_size = states_size + seed_size;
+
+  detail::check_rng_state(new_state);
+
+  auto new_state_size = new_state.numel();
+  TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size");
+
+  uint64_t input_seed = default_rng_seed_val;
+  auto new_rng_state = new_state.data<uint8_t>();
+  memcpy(&input_seed, new_rng_state + states_size, seed_size);
+  this->set_current_seed(input_seed);
+  // state.data must be copied after input_seed to not reset the state in set_current_seed()
+  memcpy(this->state_data(), new_rng_state, states_size);
+}
+
+std::shared_ptr<MPSGeneratorImpl> MPSGeneratorImpl::clone() const {
+  return std::shared_ptr<MPSGeneratorImpl>(this->clone_impl());
+}
+
+MPSGeneratorImpl* MPSGeneratorImpl::clone_impl() const {
+  auto gen = new MPSGeneratorImpl(this->data_.seed);
+  gen->set_current_seed(this->data_.seed);
+  return gen;
+}
+
+} // namespace at
diff --git a/aten/src/ATen/mps/MPSHooks.cpp b/aten/src/ATen/mps/MPSHooks.cpp
index bbf7234462189..5fde8f3843fe6 100644
--- a/aten/src/ATen/mps/MPSHooks.cpp
+++ b/aten/src/ATen/mps/MPSHooks.cpp
@@ -1,14 +1,8 @@
-#include <ATen/mps/MPSHooks.h>
+//  Copyright © 2022 Apple Inc.
 
-#include <ATen/Context.h>
+#include <ATen/mps/MPSHooks.h>
 #include <ATen/mps/MPSDevice.h>
-#include <ATen/detail/MPSHooksInterface.h>
-#include <c10/util/irange.h>
-
-#include <sstream>
-#include <cstddef>
-#include <functional>
-#include <memory>
+#include <ATen/mps/MPSGeneratorImpl.h>
 
 namespace at {
 namespace mps {
@@ -26,6 +20,10 @@ Allocator* MPSHooks::getMPSDeviceAllocator() const {
   return at::mps::GetMPSAllocator();
 }
 
+const Generator& MPSHooks::getDefaultMPSGenerator() const {
+  return at::mps::detail::getDefaultMPSGenerator();
+}
+
 using at::MPSHooksRegistry;
 using at::RegistererMPSHooksRegistry;
 
diff --git a/aten/src/ATen/mps/MPSHooks.h b/aten/src/ATen/mps/MPSHooks.h
index 13647d83c740b..2bef3eac42648 100644
--- a/aten/src/ATen/mps/MPSHooks.h
+++ b/aten/src/ATen/mps/MPSHooks.h
@@ -1,3 +1,5 @@
+//  Copyright © 2022 Apple Inc.
+
 #pragma once
 
 #include <ATen/detail/MPSHooksInterface.h>
@@ -12,6 +14,7 @@ struct MPSHooks : public at::MPSHooksInterface {
   void initMPS() const override;
   bool hasMPS() const override;
   Allocator* getMPSDeviceAllocator() const override;
+  const Generator& getDefaultMPSGenerator() const override;
 };
 
 }} // at::mps
diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 93b0141243397..cc86c4ede4c3b 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -19,24 +19,6 @@ namespace at {
 namespace native {
 namespace mps {
 
-struct TORCH_CUDA_CPP_API MPSGeneratorImpl : public c10::GeneratorImpl {
-  MPSGeneratorImpl(DeviceIndex device_index = -1);
-  ~MPSGeneratorImpl() = default;
-
-  void set_current_seed(uint64_t seed) override;
-  uint64_t current_seed() const override;
-  uint64_t seed() override;
-  void set_state(const c10::TensorImpl& new_state) override;
-  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
-  static DeviceType device_type();
-
-private:
-  MPSGeneratorImpl* clone_impl() const override;
-  uint64_t seed_ = default_rng_seed_val;
-};
-
-const Generator& getDefaultMPSGenerator();
-
 struct MPSScalar {
   id<MTLBuffer> getMTLBuffer() const { return __builtin_bit_cast(id<MTLBuffer>, buffer.get()); }
 
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index f41484b27b143..6e3ecc3b8e9bf 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -7,60 +7,6 @@
 namespace native {
 namespace mps {
 
-uint64_t MPSGeneratorImpl::seed() {
-  auto random = c10::detail::getNonDeterministicRandom(true);
-  this->set_current_seed(random);
-  return random;
-}
-
-uint64_t MPSGeneratorImpl::current_seed() const {
-  return seed_;
-}
-
-void MPSGeneratorImpl::set_current_seed(uint64_t seed) {
-  seed_ = seed;
-}
-
-MPSGeneratorImpl::MPSGeneratorImpl(DeviceIndex device_index)
-  : c10::GeneratorImpl{Device(DeviceType::MPS, device_index),
-              DispatchKeySet(c10::DispatchKey::MPS)} {
-}
-
-const Generator& getDefaultMPSGenerator() {
-  static auto gen = make_generator<MPSGeneratorImpl>(0);
-  gen.seed();
-  return gen;
-}
-DeviceType MPSGeneratorImpl::device_type() {
-  return DeviceType::MPS;
-}
-c10::intrusive_ptr<c10::TensorImpl> MPSGeneratorImpl::get_state() const {
-  static const size_t seed_size = sizeof(uint64_t);
-  static const size_t offset_size = sizeof(int64_t);
-  static const size_t total_size = seed_size + offset_size;
-
-  auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
-
-  return state_tensor.getIntrusivePtr();
-}
-
-void MPSGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
-  static const size_t seed_size = sizeof(uint64_t);
-
-  detail::check_rng_state(new_state);
-
-  uint64_t input_seed;
-  auto new_rng_state = new_state.data<uint8_t>();
-  memcpy(&input_seed, new_rng_state, seed_size);
-  this->set_current_seed(input_seed);
-}
-
-MPSGeneratorImpl* MPSGeneratorImpl::clone_impl() const {
-  auto gen = new MPSGeneratorImpl(0);
-  gen->set_current_seed(this->seed_);
-  return gen;
-}
-
 void runMPSGraph(MPSStream* mpsStream, MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results) {
   mpsStream->executeMPSGraph(mpsGraph, feeds, results, SyncType::COMMIT_ADAPTIVE);
 }
@@ -388,4 +334,4 @@ void executeMPSAllocatorCallback(void* ptr, EventType event) override { }
 
 } // namespace mps
 } // namespace native
-} // namespace at
\ No newline at end of file
+} // namespace at
diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index 99d01c6825b35..b527da3925d69 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -3,7 +3,7 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/DistributionTemplates.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <ATen/core/PhiloxRNGEngine.h>
+#include <ATen/mps/MPSGeneratorImpl.h>
 
 namespace at {
 namespace native {
@@ -26,17 +26,6 @@
   MPSGraphTensor *stateTensor = nil;
   // used for Normal distributions only
   MPSGraphTensor *meanTensor = nil, *stdTensor = nil;
-  // we initialize and keep the philox's state in the graph. This would
-  // guarantee producing new random values each time the same graph is reused.
-  at::Philox4_32 philoxState;
-  std::array<uint32_t, 7> stateValues = {1};
-
-  void updatePhiloxCounters() {
-    // calling philoxState() would call operator() of philox_engine class to
-    // get each of the four newly generated counter values (see PhiloxRNGEngine.h).
-    for (int i = 1; i <= 4; i++)
-      stateValues[i] = philoxState();
-  }
 };
 
 typedef MPSGraphTensor* (^RandomOpBlock)(RandomCachedGraph*, MPSGraphTensor*);
@@ -49,11 +38,13 @@ void updatePhiloxCounters() {
                         const c10::optional<Tensor>& mean_opt,
                         const c10::optional<Tensor>& std_opt,
                         MPSGraphRandomDistribution distribution,
+                        c10::optional<Generator> gen,
                         std::string op_name, RandomOpBlock randomBlock)
 {
   if (self.numel() == 0) {
     return self;
   }
+  auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
   MPSStream* stream = getCurrentMPSStream();
 
@@ -68,7 +59,7 @@ void updatePhiloxCounters() {
         @autoreleasepool {
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new RandomCachedGraph(mpsGraph);
-          newCachedGraph->stateTensor = mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[@7]);
+          newCachedGraph->stateTensor = mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[@(at::mps::detail::PHILOX_STATE_N)]);
 
           // FP16, FP32 and Int32 are the only data types supported for distributions on MPS backend.
           const MPSDataType inputDataType = [&] {
@@ -95,7 +86,7 @@ void updatePhiloxCounters() {
             desc.standardDeviation = static_cast<float>(val2);
           }
           // we don't use the output state tensor from the MPSGraph API as it requires reading back from GPU to CPU.
-          // Instead, we keep the Philox state in the cached graph and use the PyTorch's philox_engine to maintain
+          // Instead, we keep the Philox state in the MPSGenerator and use the PyTorch's philox_engine to maintain
           // the counters, and feed them to the graph manually
           NSArray<MPSGraphTensor*> *resultTensors = [mpsGraph randomTensorWithShape: getMPSShape(self)
                                                                          descriptor: desc
@@ -109,12 +100,16 @@ void updatePhiloxCounters() {
         return newCachedGraph;
       });
     }
-    // update the Philox state values on each run of the same graph
-    cachedGraph->updatePhiloxCounters();
     // feed the updated state values to the graph
-    MPSNDArrayDescriptor *stateDesc = [MPSNDArrayDescriptor descriptorWithDataType: MPSDataTypeInt32 shape: @[@7]];
+    MPSNDArrayDescriptor *stateDesc = [MPSNDArrayDescriptor descriptorWithDataType: MPSDataTypeInt32 shape: @[@(at::mps::detail::PHILOX_STATE_N)]];
     MPSNDArray *stateNDArray = [[[MPSNDArray alloc] initWithDevice: stream->device() descriptor: stateDesc] autorelease];
-    [stateNDArray writeBytes: &cachedGraph->stateValues[0] strideBytes: nil];
+    {
+      // See Note [Acquire lock when using random generators]
+      std::lock_guard<std::mutex> lock(mps_gen->mutex_);
+      // update the Philox state values on each run
+      mps_gen->update_philox_counters();
+      [stateNDArray writeBytes: mps_gen->state_data() strideBytes: nil];
+    }
     MPSGraphTensorData* stateTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: stateNDArray] autorelease];
 
     Placeholder meanPlaceholder, stdPlaceholder;
@@ -146,6 +141,7 @@ void updatePhiloxCounters() {
 Tensor& normal_mps_impl(Tensor& self, double mean_s, double std_s,
                         const c10::optional<Tensor>& mean_opt,
                         const c10::optional<Tensor>& std_opt,
+                        c10::optional<Generator> gen,
                         std::string op_name)
 {
   const Tensor& std_t  = *(at::borrow_from_optional_tensor(std_opt));
@@ -177,12 +173,12 @@ void updatePhiloxCounters() {
     return resultTensor;
   };
   return random_mps_impl<double>(self, mean_s, std_s, mean_opt, std_opt,
-                                 MPSGraphRandomDistributionNormal,
+                                 MPSGraphRandomDistributionNormal, gen,
                                  op_name + getTensorsStringKey({mean_t, std_t}), random_op_block);
 
 }
 
-Tensor& bernoulli_mps_impl(Tensor& self, const Tensor& prob_t, std::string op_name)
+Tensor& bernoulli_mps_impl(Tensor& self, const Tensor& prob_t, c10::optional<Generator> gen, std::string op_name)
 {
   TORCH_CHECK(prob_t.is_same_size(self), op_name, ": probability and self tensor should be of the same shape")
 
@@ -195,7 +191,7 @@ void updatePhiloxCounters() {
   };
   // Bernoulli generates binary output so we use bool type
   return mps::random_mps_impl<bool>(self, 0.0, 1.0, c10::nullopt, prob_t,
-                                    MPSGraphRandomDistributionUniform,
+                                    MPSGraphRandomDistributionUniform, gen,
                                     op_name + getTensorsStringKey({prob_t}), random_op_block);
 }
 
@@ -215,16 +211,16 @@ void updatePhiloxCounters() {
   });
 
   return mps::random_mps_impl<double>(self, from, to, c10::nullopt, c10::nullopt,
-                                      MPSGraphRandomDistributionUniform, __func__, nullptr);
+                                      MPSGraphRandomDistributionUniform, gen, __func__, nullptr);
 }
 
 Tensor& normal_mps_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
-  return mps::normal_mps_impl(self, mean, std, c10::nullopt, c10::nullopt, __func__);
+  return mps::normal_mps_impl(self, mean, std, c10::nullopt, c10::nullopt, gen, __func__);
 }
 
 Tensor normal_mps(const Tensor& mean, double std, c10::optional<Generator> gen) {
   Tensor self = empty_mps(mean.sizes(), mean.scalar_type(), c10::nullopt, kMPS);
-  return mps::normal_mps_impl(self, 0.0, std, mean, c10::nullopt, __func__);
+  return mps::normal_mps_impl(self, 0.0, std, mean, c10::nullopt, gen, __func__);
 }
 
 Tensor normal_mps(double mean, const Tensor& std, c10::optional<Generator> gen) {
@@ -232,44 +228,44 @@ Tensor normal_mps(double mean, const Tensor& std, c10::optional<Generator> gen)
   // when there's no tensor-type mean, we cannot pass scalar mean value due to the order of
   // multiply/add ops in random computation. So we create a mean tensor instead.
   Tensor mean_t = at::full_like(self, Scalar(mean));
-  return mps::normal_mps_impl(self, 0.0, 1.0, mean_t, std, __func__);
+  return mps::normal_mps_impl(self, 0.0, 1.0, mean_t, std, gen, __func__);
 }
 
 Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
   auto shape = at::infer_size(mean.sizes(), std.sizes());
   Tensor self = empty_mps(shape, mean.scalar_type(), c10::nullopt, kMPS);
-  return mps::normal_mps_impl(self, 0.0, 1.0, mean, std, __func__);
+  return mps::normal_mps_impl(self, 0.0, 1.0, mean, std, gen, __func__);
 }
 
 Tensor& normal_mps_out(const Tensor& mean, double std, c10::optional<Generator> gen, Tensor& self) {
-  return mps::normal_mps_impl(self, 0.0, std, mean, c10::nullopt, __func__);
+  return mps::normal_mps_impl(self, 0.0, std, mean, c10::nullopt, gen, __func__);
 }
 
 Tensor& normal_mps_out(double mean, const Tensor& std, c10::optional<Generator> gen, Tensor& self) {
   // when there's no tensor-type mean, we cannot pass scalar mean value due to the order of
   // multiply/add ops in random computation. So we create a mean tensor instead.
   Tensor mean_t = at::full_like(self, Scalar(mean));
-  return mps::normal_mps_impl(self, 0.0, 1.0, mean_t, std, __func__);
+  return mps::normal_mps_impl(self, 0.0, 1.0, mean_t, std, gen, __func__);
 }
 
 Tensor& normal_mps_out(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen, Tensor& self) {
   TORCH_CHECK(mean.numel() == std.numel(), "normal_mps_out: mean and std must have same number of elements")
-  return mps::normal_mps_impl(self, 0.0, 1.0, mean, std, __func__);
+  return mps::normal_mps_impl(self, 0.0, 1.0, mean, std, gen, __func__);
 }
 
 Tensor& bernoulli_out_mps(const Tensor& p_, c10::optional<Generator> gen, Tensor& result) {
   result.resize_(p_.sizes());
-  return  mps::bernoulli_mps_impl(result, p_, __func__);
+  return  mps::bernoulli_mps_impl(result, p_, gen, __func__);
 }
 
 Tensor& bernoulli_mps_(Tensor& self, double p, c10::optional<Generator> gen) {
   TORCH_CHECK(0.0 <= p && p <= 1.0, "bernoulli_mps_ expects p to be in [0, 1], but got p=", p);
   Tensor prob_t = at::full_like(self, Scalar(p));
-  return mps::bernoulli_mps_impl(self, prob_t, __func__);
+  return mps::bernoulli_mps_impl(self, prob_t, gen, __func__);
 }
 
 Tensor& bernoulli_mps_(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
-  return mps::bernoulli_mps_impl(self, p_, __func__);
+  return mps::bernoulli_mps_impl(self, p_, gen, __func__);
 }
 
 // random_.from
@@ -321,7 +317,7 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
   }
 
   return mps::random_mps_impl<int64_t>(self, from, to - 1, c10::nullopt, c10::nullopt,
-                                       MPSGraphRandomDistributionUniform, __func__, nullptr);
+                                       MPSGraphRandomDistributionUniform, gen, __func__, nullptr);
 }
 
 Tensor& random_mps_(Tensor& self, int64_t to, c10::optional<Generator> gen) {
@@ -348,7 +344,7 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
                                           name: nil];
   };
   return mps::random_mps_impl<double>(self, 0.0, 1.0, c10::nullopt, c10::nullopt,
-                                      MPSGraphRandomDistributionUniform,
+                                      MPSGraphRandomDistributionUniform, gen,
                                       "exponential_mps_:" + std::to_string(lambda), random_op_block);
 }
 
diff --git a/test/test_mps.py b/test/test_mps.py
index ffe2190403a1b..aa3b444bce448 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4887,6 +4887,28 @@ def test_bernoulli(self):
         mps_out = torch.bernoulli(all_ones)
         self.assertEqual(mps_out, all_ones)
 
+    def test_mps_generator(self):
+        # explicit manual seeding by creating an MPS Generator
+        g_mps = torch.Generator(device='mps')
+        g_mps.manual_seed(999)
+        mps_x = torch.randn(5, device='mps', generator=g_mps)
+        g_mps.manual_seed(999)
+        mps_y = torch.randn(5, device='mps', generator=g_mps)
+        # seed values were the same, so the random tensor contents should match
+        self.assertEqual(mps_x, mps_y)
+        # save generator's state to restore it later
+        g_state = g_mps.get_state()
+
+        # generate random numbers without seeding
+        mps_x = torch.randn(5, device='mps', generator=g_mps)
+        # in this case, the random results must differ from the last generated random results
+        self.assertNotEqual(mps_x, mps_y)
+
+        # restore the previously saved state, and the results should match again
+        g_mps.set_state(g_state)
+        mps_x = torch.randn(5, device='mps', generator=g_mps)
+        self.assertEqual(mps_x, mps_y)
+
     # Test random_.to and random_.from
     def test_random(self):
         def helper(shape, low, high, dtype=torch.int32):
diff --git a/torch/csrc/Generator.cpp b/torch/csrc/Generator.cpp
index 31dcfefaea8d8..d5939496eff45 100644
--- a/torch/csrc/Generator.cpp
+++ b/torch/csrc/Generator.cpp
@@ -17,6 +17,10 @@
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #endif
 
+#ifdef USE_MPS
+#include <ATen/mps/MPSGeneratorImpl.h>
+#endif
+
 using namespace at;
 using namespace torch;
 
@@ -52,12 +56,20 @@ static PyObject* THPGenerator_pynew(
   auto device = r.deviceWithDefault(0, at::Device(at::kCPU));
 
   THPGeneratorPtr self((THPGenerator*)type->tp_alloc(type, 0));
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_MPS)
   if (device.type() == at::kCPU) {
     self->cdata = make_generator<CPUGeneratorImpl>();
-  } else if (device.type() == at::kCUDA) {
+  }
+#ifdef USE_CUDA
+  else if (device.type() == at::kCUDA) {
     self->cdata = make_generator<CUDAGeneratorImpl>(device.index());
-  } else {
+  }
+#elif USE_MPS
+  else if (device.type() == at::kMPS) {
+    self->cdata = make_generator<MPSGeneratorImpl>();
+  }
+#endif
+  else {
     AT_ERROR(
         "Device type ",
         c10::DeviceTypeName(device.type()),

From b294ea85ba857e4c9c98d14df44db74f4b2f087e Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 11 Oct 2022 16:55:02 -0700
Subject: [PATCH 1783/1922] Register unfold key for MPS (#134)

---
 aten/src/ATen/native/native_functions.yaml |  2 +-
 test/test_mps.py                           | 35 +++++++++++++++++-----
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index b0621235e56d4..622f5ac5da13a 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -9430,7 +9430,7 @@
   device_check: NoCheck
   device_guard: False
   dispatch:
-    CPU, CUDA, Meta: unfold
+    CPU, CUDA, Meta, MPS: unfold
     QuantizedCPU, QuantizedCUDA: unfold
 
 - func: unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor
diff --git a/test/test_mps.py b/test/test_mps.py
index aa3b444bce448..79f22f8373f04 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1956,7 +1956,29 @@ def test_as_strided(self):
         strided_mps_out = strided_mps1 - strided_mps2
         self.assertEqual(strided_cpu_out, strided_mps_out)
 
+    def test_unfold(self):
+        x = torch.arange(1., 8)
+        x_mps = torch.arange(1., 8, device="mps")
 
+        y = x.unfold(0, 2, 1)
+        y_mps = x_mps.unfold(0, 2, 1)
+
+        self.assertEqual(y, y_mps)
+
+    def test_unfold_all_devices_and_dtypes(self):
+        supported_dtypes = [torch.float32, torch.float16, torch.int64, torch.int32, torch.int16, torch.uint8]
+        for dt in supported_dtypes:
+            x = torch.empty((0, 1, 3, 0), dtype=dt, device="mps")
+            self.assertEqual((0, 1, 1, 0, 3), x.unfold(2, 3, 2).shape)
+
+    def test_unfold_scalars(self):
+        x = torch.tensor(0.5, device="mps")
+        # unfold on a 0-dimensional tensor should always return a 1-d dimensional
+        # tensor of shape [size] (i.e., the second parameter to unfold)
+
+        self.assertEqual(torch.empty(0, device="mps"), x.unfold(0, 0, 1))
+        self.assertEqual(torch.empty(0, device="mps"), x.unfold(0, 0, 2))
+        self.assertEqual(torch.tensor([0.5], device="mps"), x.unfold(0, 1, 1))
 
     def test_sum_backward(self):
         def helper(n, c):
@@ -5596,14 +5618,13 @@ def test_T_view(self, device="mps"):
             v[0, 1] = 0
             self.assertEqual(t[1, 0], v[0, 1])
 
-    # requires aten::unfold
-    # def test_unfold_view(self, device="mps"):
-    #     t = torch.ones(10, device=device)
-    #     v = t.unfold(0, 3, 2)
-    #     self.assertTrue(self.is_view_of(t, v))
+    def test_unfold_view(self, device="mps"):
+        t = torch.ones(10, device=device)
+        v = t.unfold(0, 3, 2)
+        self.assertTrue(self.is_view_of(t, v))
 
-    #     v[1, 0] = 0
-    #     self.assertEqual(t[2], v[1, 0])
+        v[1, 0] = 0
+        self.assertEqual(t[2], v[1, 0])
 
     def test_squeeze_view(self, device="mps"):
         t = torch.ones(5, 1, 5, device=device)

From 0ec4818de2ce6c3a162ad9611676e3791812e78e Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 13 Oct 2022 15:52:39 -0400
Subject: [PATCH 1784/1922] Fix the base shape of view tensor in as_strided_mps
 (#135)

- Fix the type mismatch bug if returning early from copy_kernel_mps when copy_cast is needed
- Create the view graph in gatherViewTensor instead of as_strided
- Fix the crash in asStridedLayer_reshapePattern() caused by negative srcDim
---
 aten/src/ATen/native/mps/operations/Copy.mm |  4 +-
 aten/src/ATen/native/mps/operations/View.mm | 62 ++++++++++-----------
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index 632ad32724f1a..22de684454d5c 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -234,10 +234,10 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
 
   // If dst is contiguous and there is no byte offset, we can save directly the result of
   // gather into dst. This reduces the overhead of doing an additional blit for most cases
-  bool returnGatherOutput = (dst_.is_contiguous() && !dst_byte_offset);
+  bool returnGatherOutput = (dst_.is_contiguous() && !dst_byte_offset && src_.dtype() == dst_.dtype());
   Tensor src;
 
-  if (!src_.is_contiguous()) {
+  if (src_.is_view() || !src_.is_contiguous()) {
     Tensor emptyShell = Tensor();
     src = gatherViewTensor(src_, returnGatherOutput ? dst_ : emptyShell);
 
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 30ceb23b75d18..d441b70d84c4e 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -192,13 +192,13 @@
     NSUInteger targetDimLength = currDimLength;
     NSUInteger currReshapeSize = 1;
     NSUInteger innerStride = srcStride;
-    do {
+
+    while (currReshapeSize != targetDimLength && srcDim >= 0) {
       NSUInteger srcDimLength = [[inputTensor shape][srcDim] integerValue];
       currReshapeSize *= srcDimLength;
       srcStride *= srcDimLength;
-
       srcDim--;
-    } while(currReshapeSize != targetDimLength && srcDim >= 0);
+    };
 
     isValidReshape &= (currReshapeSize == targetDimLength && currStride == innerStride);
   }
@@ -516,6 +516,24 @@
   return outputTensor;
 }
 
+static IntArrayRef updateTensorBaseShape(const Tensor& self)
+{
+  IntArrayRef base_shape = get_buffer_shape(self.storage().data());
+  // if there's no base_shape stored in MPSAllocator, then infer it from tensor's size and store it
+  if (base_shape.size() == 0) {
+    // IntArrayRef wouldn't own the data, so we use a static storage
+    static const int64_t shape_1d = 1;
+    // self.sizes().size() could be zero
+    base_shape = self.sizes().size() ? self.sizes() :
+                ((self.is_view() && self._base().sizes().size()) ? self._base().sizes() : IntArrayRef(&shape_1d, 1));
+
+    // base_shape will be retained in MPSAllocator until buffer gets recycled
+    if (self.storage().data())
+      set_buffer_shape(self.storage().data(), base_shape);
+  }
+  return base_shape;
+}
+
 // There are few cases we need to consider:
 // Here nodes are the Tensors and the edges are the operations performed on the
 // Tensor. As a result of the operation performed we can have result as View
@@ -535,22 +553,11 @@
 //            NonView T         NonView T
 static ViewCachedGraph* createViewGraph(const Tensor& self, IntArrayRef size, IntArrayRef stride, int64_t storage_offset, bool needsScatter)
 {
-  IntArrayRef base_shape = get_buffer_shape(self.storage().data());
-  if (base_shape.size() == 0) {
-    // IntArrayRef wouldn't own the data, so we use a static storage
-    static const int64_t shape_1d = 1;
-    // self.sizes().size() could be zero
-    base_shape = self.sizes().size() ? self.sizes() :
-                      self.is_view() ? self._base().sizes() : IntArrayRef(&shape_1d, 1);
-
-    // base_shape will be retained in MPSAllocator until buffer gets recycled
-    if (self.storage().data())
-      set_buffer_shape(self.storage().data(), base_shape);
-  }
-  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+  IntArrayRef base_shape = updateTensorBaseShape(self);
 
   @autoreleasepool {
     string key = getStridedKey(self.scalar_type(), base_shape, size, stride, storage_offset, needsScatter);
+    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
     ViewCachedGraph* cachedGraph = static_cast<ViewCachedGraph *>(cache_->LookUp(key));
 
     if (!cachedGraph) {
@@ -586,26 +593,17 @@
 
 Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst)
 {
-  ViewCachedGraph* cachedGraph = nullptr;
-
-  const IntArrayRef& base_shape = get_buffer_shape(src.storage().data());
-  if (base_shape.size() > 0) {
-    string key = getStridedKey(src.scalar_type(), base_shape, src.sizes(), src.strides(), src.storage_offset(), /*is_scatter*/ false);
-    cachedGraph = static_cast<ViewCachedGraph *>(MPSGraphCache::getInstance()->LookUp(key));
-  }
-  // there are cases where gatherViewTensor() is called without having as_strided() called beforehand.
-  // this typically may come from copy_mps variants. In such cases, when the base_shape isn't found the
-  // callers would resort to make the tensor contiguous in an alternative code path.
-  if (!cachedGraph) {
+  if (src.sizes().size() == 0) {
     return Tensor();
   }
-
   bool requires_sync = false;
   Tensor output;
   if (!dst.has_storage()) {
     output = at::native::empty_mps(src.sizes(), src.scalar_type(), c10::nullopt, kMPS);
     requires_sync = true;
   }
+  ViewCachedGraph* cachedGraph = createViewGraph(src, src.sizes(), src.strides(),
+                                                 src.storage_offset(), /*needsScatter*/ false);
   return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false, requires_sync);
 }
 
@@ -625,9 +623,11 @@ Tensor as_strided_tensorimpl_mps(const Tensor& self, IntArrayRef size, IntArrayR
   auto result = detail::make_tensor<TensorImpl>(c10::TensorImpl::VIEW, Storage(self.storage()), self.key_set(), self.dtype());
   setStrided(result, size, stride, storage_offset);
 
-  // 0 sizes won't result in any change in the shape of the Tensor so we can skip it.
-  if (size.size() > 0)
-    mps::createViewGraph(self, size, stride, storage_offset, /*needsScatter*/ false);
+  // creating the view graph will be deferred until gatherViewTensor() or scatterViewTensor() are called.
+  // In as_strided, we just update the base shape of the buffer in order to retrieve it later
+  // when we create/run the view graph.
+  IntArrayRef base_shape = mps::updateTensorBaseShape(self);
+  TORCH_INTERNAL_ASSERT(base_shape.size() > 0, "Failed to update the base shape of tensor's buffer at ", self.storage().data());
 
   return result;
 }

From 7e2dfa84260eaa3fd8a51f8749c65a3d6c9c993c Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 14 Oct 2022 16:58:55 -0400
Subject: [PATCH 1785/1922] Fix the assert in Garbage Collector (#139)

---
 aten/src/ATen/mps/MPSAllocator.mm | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm
index a40ddd7992a29..60a66ff782d89 100644
--- a/aten/src/ATen/mps/MPSAllocator.mm
+++ b/aten/src/ATen/mps/MPSAllocator.mm
@@ -386,7 +386,9 @@
 
 void MPSHeapAllocatorImpl::garbage_collect_cached_buffers(AllocParams& params)
 {
-  TORCH_INTERNAL_ASSERT(current_allocated_size() >= m_low_watermark_limit);
+  // skip garbage collection if memory pressure has already relieved
+  if (current_allocated_size() < m_low_watermark_limit)
+    return;
   // attempt to collect garbage until we reach below low watermark limit
   const auto target_size = current_allocated_size() - m_low_watermark_limit;
   const BufferPool& pool = *params.pool;

From f37bd4af4209d262348c266d9a9fce0c8f361e16 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Mon, 17 Oct 2022 09:11:46 -0700
Subject: [PATCH 1786/1922] Exclude long dtype from reduction ops (min/max)
 (#138)

* Exclude long dtype from reduction ops (min/max)

* Remove tab identation
---
 .../ATen/native/mps/operations/ReduceOps.mm   | 53 +++++++++++--------
 test/test_mps.py                              | 18 +++++++
 2 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 8a321ffd2fb12..e04333aa3712a 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -32,6 +32,15 @@
 };
 
 
+NSArray<NSNumber*>* getTensorAxes(const Tensor& t) {
+  int64_t ndim = t.dim();
+  auto axes = [NSMutableArray<NSNumber*> arrayWithCapacity:ndim];
+  for (const auto i: c10::irange(ndim)) {
+    axes[i] = [NSNumber numberWithInteger:i];
+  }
+  return axes;
+}
+
 void set_apparent_shapes(NSMutableArray<NSNumber*> * &apparent_out_shape,
                          NSMutableArray<NSNumber*> * &apparent_in_shape,
                          int64_t num_reduce_dims,
@@ -1181,26 +1190,15 @@ Tensor std_mps(
   (const Tensor& input_t,
    MPSReductionType reduction_type,
    const std::string& func_name) {
+  TORCH_INTERNAL_ASSERT(input_t.scalar_type() != ScalarType::Long, "min/max not supported for Long dtype on MPS");
 
   namespace native_mps = at::native::mps;
   using CachedGraph = native_mps::MPSUnaryCachedGraph;
 
   native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
 
-  IntArrayRef input_shape = input_t.sizes();
-  int64_t num_input_dims = input_shape.size();
-
-  // Flatten the input tensor to reduce it to one value
-  NSMutableArray<NSNumber*> *apparent_input_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:1];
-  int64_t num_in_elements = 1;
-  for(int i = 0; i < num_input_dims; i++) {
-      num_in_elements *= input_shape[i];
-  }
-  apparent_input_shape[0] = [NSNumber numberWithInt:num_in_elements];
-
   Tensor output_t = at::native::empty_mps({}, input_t.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt);
-
-  if (output_t.numel() == 0 || num_in_elements == 0) {
+  if (output_t.numel() == 0 || input_t.numel() == 0) {
     return output_t;
   }
 
@@ -1217,17 +1215,29 @@ Tensor std_mps(
           MPSGraph* mpsGraph = native_mps::make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
+          MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
 
           MPSGraphTensor* outputTensor = nil;
+          MPSGraphTensor* castInputTensor = nil;
+
+          if(input_t.scalar_type() != ScalarType::Float &&
+             input_t.scalar_type() != ScalarType::Int   &&
+             input_t.scalar_type() != ScalarType::Half) {
+            castInputTensor =  [mpsGraph castTensor:inputTensor
+                                             toType:MPSDataTypeInt32
+                                               name:@"castInputTensor"];
+          } else {
+            castInputTensor = inputTensor;
+          }
 
+          NSArray<NSNumber*>* axes = getTensorAxes(input_t);
           if(reduction_type == MPSReductionType::MAX)
-            outputTensor = [mpsGraph reductionMaximumWithTensor:inputTensor
-                                                           axes:@[@0]
+            outputTensor = [mpsGraph reductionMaximumWithTensor:castInputTensor
+                                                           axes:axes
                                                            name:nil];
           else if(reduction_type == MPSReductionType::MIN)
-            outputTensor = [mpsGraph reductionMinimumWithTensor:inputTensor
-                                                           axes:@[@0]
+            outputTensor = [mpsGraph reductionMinimumWithTensor:castInputTensor
+                                                           axes:axes
                                                            name:nil];
 
           newCachedGraph->inputTensor_ = inputTensor;
@@ -1238,7 +1248,7 @@ Tensor std_mps(
       });
     }
 
-    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape);
+    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
     auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, @[@1]);
 
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
@@ -1275,6 +1285,7 @@ Tensor min_mps(const Tensor& input_t) {
   const Tensor& indices_t,
   MPSReductionType reduction_type,
   const std::string& func_name) {
+    TORCH_INTERNAL_ASSERT(input_t.scalar_type() != ScalarType::Long, "min/max not supported for Long dtype on MPS");
 
     namespace native_mps = at::native::mps;
 
@@ -1330,7 +1341,7 @@ Tensor min_mps(const Tensor& input_t) {
               MPSGraph* mpsGraph = native_mps::make_mps_graph();
               newCachedGraph = new CachedGraph(mpsGraph);
 
-              MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
+              MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
               MPSGraphTensor* outputTensor = nil;
               if(reduction_type == MPSReductionType::MAX)
                 outputTensor = [mpsGraph reductionMaximumWithTensor:inputTensor
@@ -1347,7 +1358,7 @@ Tensor min_mps(const Tensor& input_t) {
                  input_t.scalar_type() != ScalarType::Int   &&
                  input_t.scalar_type() != ScalarType::Half)
                 castInputTensor =  [mpsGraph castTensor:inputTensor
-                                                 toType:MPSDataTypeFloat32
+                                                 toType:MPSDataTypeInt32
                                                    name:@"castInputTensor"];
               else
                 castInputTensor = inputTensor;
diff --git a/test/test_mps.py b/test/test_mps.py
index 79f22f8373f04..62c2d720fb18c 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1797,6 +1797,24 @@ def helper(x, other):
         helper(self._wrap_tensor((1, 0, 1, 0)), self._wrap_tensor(True))
         helper(self._wrap_tensor((1, 0, 1, 0)), self._wrap_tensor(False))
 
+    def test_min_max(self):
+        def helper(dtype):
+            for _ in range(10):
+                if dtype == torch.float32 or dtype == torch.float16:
+                    x = torch.randn((30, 15), device='mps', dtype=dtype)
+                else:
+                    x = torch.randint(0, 100, (30, 15), device="mps", dtype=dtype)
+                x_cpu = x.to("cpu")
+
+                y = x.max()
+                y_cpu = x_cpu.max()
+                self.assertEqual(y, y_cpu)
+
+                z = x.min()
+                z_cpu = x_cpu.min()
+                self.assertEqual(z, z_cpu)
+
+        [helper(dtype) for dtype in [torch.float32, torch.float16, torch.int32, torch.int16, torch.uint8, torch.int8, torch.bool]]
 
 class TestSmoothL1Loss(TestCase):
 

From 3752d15cb5c90d6150370f5bb407cb776aee1dbd Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Mon, 17 Oct 2022 12:16:22 -0400
Subject: [PATCH 1787/1922] Replace the explicit commit in View ops with
 adaptive commit (#136)

---
 aten/src/ATen/native/mps/operations/View.mm | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index d441b70d84c4e..cf40e014dc9d8 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -28,8 +28,7 @@
 }
 
 // initializes the MTLBuffers for tensor data and runs the MPSGraph for the view op
-static Tensor& runViewGraph(ViewCachedGraph* cachedGraph, const at::Tensor& src, Tensor& output,
-                            bool needsScatter, bool requires_sync = false)
+static Tensor& runViewGraph(ViewCachedGraph* cachedGraph, const at::Tensor& src, Tensor& output, bool needsScatter)
 {
   const id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
   const id<MTLBuffer> outputBuffer = getMTLBufferStorage(output);
@@ -74,8 +73,7 @@
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       cachedGraph->outputTensor : outputTensorData
     };
-    stream->executeMPSGraph(cachedGraph->graph(), feeds, results,
-                            requires_sync ? SyncType::COMMIT : SyncType::COMMIT_ADAPTIVE);
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
   }
   return output;
 }
@@ -234,7 +232,7 @@
       if (dstSizes[dstDim] == 0) { return nil; }
   }
 
-  // 1. Flatten the inputTensor if neccessary
+  // 1. Flatten the inputTensor if necessary
   MPSGraphTensor *flatInputTensor = inputTensor;
   {
     // Flatten inputs to remove duplicate strides.
@@ -596,22 +594,20 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst)
   if (src.sizes().size() == 0) {
     return Tensor();
   }
-  bool requires_sync = false;
   Tensor output;
   if (!dst.has_storage()) {
     output = at::native::empty_mps(src.sizes(), src.scalar_type(), c10::nullopt, kMPS);
-    requires_sync = true;
   }
   ViewCachedGraph* cachedGraph = createViewGraph(src, src.sizes(), src.strides(),
                                                  src.storage_offset(), /*needsScatter*/ false);
-  return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false, requires_sync);
+  return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false);
 }
 
 Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output)
 {
   ViewCachedGraph* cachedGraph = createViewGraph(output, output.sizes(), output.strides(),
                                                  output.storage_offset(), /*needsScatter*/ true);
-  return runViewGraph(cachedGraph, src, output, /*needsScatter*/ true, /*requires_sync*/  true);
+  return runViewGraph(cachedGraph, src, output, /*needsScatter*/ true);
 }
 
 } // namespace mps

From 02b98f5617bdfcaccd7bcdc71adef7a927891d88 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Fri, 14 Oct 2022 09:37:33 -0700
Subject: [PATCH 1788/1922] [MPS] Build fix due to MPSGenerator changes in
 multinomial

---
 .../native/mps/operations/Distributions.mm    | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index b527da3925d69..ea27d88218591 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -11,15 +11,8 @@
 
 struct RandomCachedGraph : public MPSCachedGraph
 {
-  RandomCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {
-    // initialize Philox state values (only required once when graph is created)
-    const auto seed = c10::detail::getNonDeterministicRandom();
-    const auto subsequence = c10::detail::getNonDeterministicRandom();
-    philoxState = at::Philox4_32(seed, subsequence);
-    // the two last state values are the Philox keys which are initialized once only
-    stateValues[5] = static_cast<uint32_t>(seed);
-    stateValues[6] = static_cast<uint32_t>(seed >> 32);
-  }
+  RandomCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) { }
+
   // Only relevant for multinomial
   MPSGraphTensor *probTensor = nil;
   MPSGraphTensor *resultTensor = nil;
@@ -356,6 +349,7 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
 
   using namespace mps;
 
+  auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(generator, at::mps::detail::getDefaultMPSGenerator());
   int inputSize = self.dim();
   int numDist =
       inputSize == 1 ? 1 : self.size(0);
@@ -474,11 +468,15 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
      });
     }
     // update the Philox state values on each run of the same graph
-    cachedGraph->updatePhiloxCounters();
-    // feed the updated state values to the graph
-    MPSNDArrayDescriptor *stateDesc = [MPSNDArrayDescriptor descriptorWithDataType: MPSDataTypeInt32 shape: @[@7]];
+    MPSNDArrayDescriptor *stateDesc = [MPSNDArrayDescriptor descriptorWithDataType: MPSDataTypeInt32 shape: @[@(at::mps::detail::PHILOX_STATE_N)]];
     MPSNDArray *stateNDArray = [[[MPSNDArray alloc] initWithDevice: stream->device() descriptor: stateDesc] autorelease];
-    [stateNDArray writeBytes: &cachedGraph->stateValues[0] strideBytes: nil];
+    {
+      // See Note [Acquire lock when using random generators]
+      std::lock_guard<std::mutex> lock(mps_gen->mutex_);
+      // update the Philox state values on each run
+      mps_gen->update_philox_counters();
+      [stateNDArray writeBytes: mps_gen->state_data() strideBytes: nil];
+    }
     MPSGraphTensorData* stateTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: stateNDArray] autorelease];
 
     auto probPlaceholder = Placeholder(cachedGraph->probTensor, self_v);

From 582915883ab86975ba4c172eb18384ca89746d18 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Tue, 18 Oct 2022 16:14:50 -0700
Subject: [PATCH 1789/1922] Don't reset the Graph state. (#141)

---
 aten/src/ATen/native/mps/OperationUtils.mm | 1 -
 1 file changed, 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 6e3ecc3b8e9bf..564c86ac118a1 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -263,7 +263,6 @@ void resize_tensor(Tensor* output) {
 
 MPSGraph* make_mps_graph() {
   MPSGraph* mpsGraph = [[MPSGraph new] autorelease];
-  mpsGraph.options = MPSGraphOptionsNone;
   return mpsGraph;
 }
 

From 60e1262476c23a363f26948324f954078a445566 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 18 Oct 2022 20:48:23 -0700
Subject: [PATCH 1790/1922] Add suport for casting updatesTensor to match
 outputTensor dtype directly in scatter (#140)

* Add suport for casting updatesTensor to match outputTensor dtype directly in scatter

* Address PR comments
---
 aten/src/ATen/native/mps/operations/Copy.mm | 13 +-------
 aten/src/ATen/native/mps/operations/View.mm | 36 ++++++++++++++-------
 2 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index 22de684454d5c..ad1b0d02911c8 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -260,18 +260,7 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   // If the memory is not contiguous, it means that the tensor has strides and we would not be
   // able to do the copy using a single blit
   if (!dst_.is_contiguous()) {
-    Tensor tmp;
-    if (src.dtype() != dst_.dtype()) {
-      id<MTLBuffer> tmpBuffer = sourceBuffer;
-      if (src.element_size() < dst_.element_size()) {
-        tmp = at::native::empty_mps(dst_.sizes(), dst_.scalar_type(), c10::nullopt, kMPS);
-        tmpBuffer = getMTLBufferStorage(tmp);
-      }
-
-      copy_cast_mps(dst_, src, tmpBuffer, sourceBuffer);
-    }
-
-    return scatterViewTensor((src.dtype() != dst_.dtype() && tmp.has_storage()) ? tmp : src, dst_);
+    return scatterViewTensor(src, dst_);
   }
   src._set_conj(src_.is_conj());
   src._set_neg(src_.is_neg());
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index cf40e014dc9d8..f7f6a686f4d68 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -18,11 +18,16 @@
   std::vector<MPSGraphTensor*> strideTensors;
 };
 
-static std::string getStridedKey(const ScalarType& dtype, const IntArrayRef& base_shape,
+static std::string getStridedKey(const ScalarType& self_dtype, const ScalarType& updates_dtype, const IntArrayRef& base_shape,
                                  const IntArrayRef& new_shape, const IntArrayRef& stride,
                                  int64_t storage_offset, bool is_scatter)
 {
-  return (is_scatter ? "scatter:" : "gather:") + getMPSTypeString(dtype) + "[" +
+  std::string dtype_key = getMPSTypeString(self_dtype);
+  if (is_scatter) {
+    dtype_key += ":" + getMPSTypeString(updates_dtype);
+  }
+
+  return (is_scatter ? "scatter:" : "gather:") + dtype_key + "[" +
          getArrayRefString(base_shape) + "]:[" + getArrayRefString(new_shape) + "]:[" +
          getArrayRefString(stride) + "]:[" + to_string(storage_offset) + "]";
 }
@@ -51,7 +56,7 @@
     if (needsScatter) {
       feeds[cachedGraph->updatesTensor] = [[[MPSGraphTensorData alloc] initWithMTLBuffer: sourceBuffer
                                                                                    shape: getMPSShape(src.numel())
-                                                                                dataType: inputType] autorelease];
+                                                                                dataType: getMPSDataType(src.scalar_type())] autorelease];
     }
     MPSScalar storageOffsetScalar = getMPSScalar(storage_offset, ScalarType::Int);
     feeds[cachedGraph->storageOffsetTensor] = getMPSGraphTensorFromScalar(stream, storageOffsetScalar);
@@ -411,7 +416,8 @@
 static MPSGraphTensor* chainViewOperation(ViewCachedGraph* cachedGraph, const IntArrayRef& size,
                                           const IntArrayRef& stride, int64_t offset,
                                           const IntArrayRef& base_shape, bool needsScatter,
-                                          const bool needsBoolCast)
+                                          const bool needsBoolCast,
+                                          MPSGraphTensor* updatesTensor)
 {
   MPSGraph* mpsGraph = cachedGraph->graph();
   MPSGraphTensor *outputTensor = nil;
@@ -483,7 +489,7 @@
     if (needsScatter) {
       MPSGraphTensor* scatteredTensor = [mpsGraph scatterAlongAxis: (NSInteger) 0
                                                     withDataTensor: reshapedInputTensor
-                                                     updatesTensor: cachedGraph->updatesTensor
+                                                     updatesTensor: updatesTensor
                                                      indicesTensor: reshapedIndicesTensor
                                                               mode: MPSGraphScatterModeSet
                                                               name: nil];
@@ -549,12 +555,12 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
 //            |    /          \   |
 //            |   /            \  |
 //            NonView T         NonView T
-static ViewCachedGraph* createViewGraph(const Tensor& self, IntArrayRef size, IntArrayRef stride, int64_t storage_offset, bool needsScatter)
+static ViewCachedGraph* createViewGraph(const Tensor& self, const Tensor &updates, IntArrayRef size, IntArrayRef stride, int64_t storage_offset, bool needsScatter)
 {
   IntArrayRef base_shape = updateTensorBaseShape(self);
 
   @autoreleasepool {
-    string key = getStridedKey(self.scalar_type(), base_shape, size, stride, storage_offset, needsScatter);
+    string key = getStridedKey(self.scalar_type(), updates.scalar_type(), base_shape, size, stride, storage_offset, needsScatter);
     MPSGraphCache* cache_ = MPSGraphCache::getInstance();
     ViewCachedGraph* cachedGraph = static_cast<ViewCachedGraph *>(cache_->LookUp(key));
 
@@ -563,6 +569,7 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
         ViewCachedGraph *newCachedGraph = nil;
         @autoreleasepool {
             MPSGraph* mpsGraph = make_mps_graph();
+            MPSGraphTensor* updatesTensor = nil;
             newCachedGraph = new ViewCachedGraph(mpsGraph);
             // Workaround for MPSShaderLibrary bug
             // TODO: Remove once https://github.com/pytorch/pytorch/issues/82305 is resolved
@@ -578,9 +585,16 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
               newCachedGraph->strideTensors.push_back(mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[@1]));
             }
             if (needsScatter) {
-              newCachedGraph->updatesTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, inputType);
+              auto updatesType = getMPSScalarType(updates.scalar_type());
+              newCachedGraph->updatesTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, updatesType);
+              updatesTensor = newCachedGraph->updatesTensor;
+              if (inputType != updatesType) {
+                updatesTensor = [mpsGraph castTensor:updatesTensor
+                                              toType:inputType
+                                                name:@"castUpdatesTensor"];
+              }
             }
-            newCachedGraph->outputTensor = chainViewOperation(newCachedGraph, size, stride, storage_offset, base_shape, needsScatter, needsBoolCast);
+            newCachedGraph->outputTensor = chainViewOperation(newCachedGraph, size, stride, storage_offset, base_shape, needsScatter, needsBoolCast, updatesTensor);
         }
         return newCachedGraph;
       }));
@@ -598,14 +612,14 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst)
   if (!dst.has_storage()) {
     output = at::native::empty_mps(src.sizes(), src.scalar_type(), c10::nullopt, kMPS);
   }
-  ViewCachedGraph* cachedGraph = createViewGraph(src, src.sizes(), src.strides(),
+  ViewCachedGraph* cachedGraph = createViewGraph(src, dst, src.sizes(), src.strides(),
                                                  src.storage_offset(), /*needsScatter*/ false);
   return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false);
 }
 
 Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output)
 {
-  ViewCachedGraph* cachedGraph = createViewGraph(output, output.sizes(), output.strides(),
+  ViewCachedGraph* cachedGraph = createViewGraph(output, src, output.sizes(), output.strides(),
                                                  output.storage_offset(), /*needsScatter*/ true);
   return runViewGraph(cachedGraph, src, output, /*needsScatter*/ true);
 }

From 3006e9095645c658a2cf01062e4ab2a80df23383 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 18 Oct 2022 20:58:06 -0700
Subject: [PATCH 1791/1922] Add bincount support for mps (#137)

* Add bincount sopport for mps

* Remove comment

* Remove unnecessary spaces

* Address PR comments

* Add more bincount tests; fix scatter for uint8 dtype

* Add missing comment

* Remove print from test_mps.py
---
 .../ATen/native/mps/operations/SummaryOps.mm  | 155 ++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |   1 +
 test/test_mps.py                              | 135 ++++++++++++++-
 3 files changed, 287 insertions(+), 4 deletions(-)
 create mode 100644 aten/src/ATen/native/mps/operations/SummaryOps.mm

diff --git a/aten/src/ATen/native/mps/operations/SummaryOps.mm b/aten/src/ATen/native/mps/operations/SummaryOps.mm
new file mode 100644
index 0000000000000..60732f6dafe52
--- /dev/null
+++ b/aten/src/ATen/native/mps/operations/SummaryOps.mm
@@ -0,0 +1,155 @@
+//  Copyright © 2022 Apple Inc.
+
+#include <ATen/native/mps/OperationUtils.h>
+
+namespace at {
+namespace native {
+
+Tensor& bincount_mps_impl(const Tensor& self,
+                          const Tensor& weights,
+                          Tensor& output) {
+  using namespace mps;
+
+  struct CachedGraph : public MPSCachedGraph
+  {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor* inputTensor_ = nil;
+    MPSGraphTensor* weightsTensor_ = nil;
+    MPSGraphTensor* scatterDataTensor_ = nil;
+    MPSGraphTensor* outputTensor_ = nil;
+  };
+
+  MPSStream* stream = getCurrentMPSStream();
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+  bool has_weights = weights.defined();
+
+  @autoreleasepool {
+    string key = "bincount_mps_impl" + getTensorsStringKey({self, weights});
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+        CachedGraph *newCachedGraph = nil;
+
+        @autoreleasepool {
+          // Initialize graph
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          MPSGraphTensor *scatterDataTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSScalarType(output.scalar_type()));
+
+          MPSGraphTensor *updatesTensor = nil;
+          if (has_weights) {
+            updatesTensor = mpsGraphRankedPlaceHolder(mpsGraph, weights);
+          }
+          else {
+            updatesTensor = [mpsGraph constantWithScalar:1.0f
+                                                   shape:getMPSShape(self)
+                                                dataType:getMPSDataType(output.scalar_type())];
+          }
+
+          MPSGraphTensor *castedInputTensor = inputTensor;
+          if (self.scalar_type() == kByte) {
+            castedInputTensor = [mpsGraph castTensor:inputTensor
+                                              toType:MPSDataTypeInt32
+                                                name:nil];
+          }
+
+          MPSGraphTensor *outputTensor = [mpsGraph scatterWithDataTensor:scatterDataTensor
+                                                           updatesTensor:updatesTensor
+                                                           indicesTensor:castedInputTensor
+                                                                     axis:0
+                                                                     mode:MPSGraphScatterModeAdd
+                                                                     name:nil];
+
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->outputTensor_ = outputTensor;
+          newCachedGraph->scatterDataTensor_ = scatterDataTensor;
+          if (has_weights) {
+            newCachedGraph->weightsTensor_ = updatesTensor;
+          }
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    // Create placeholders which use the keys of the CachedGraph to create inputs and outputs of the operation
+    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+    Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, output);
+    Placeholder weightsPlaceholder = Placeholder();
+
+    // Create dictionary of inputs/feeds and outputs/results
+    NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =[NSMutableDictionary dictionary];
+    feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
+    feeds[scatterPlaceholder.getMPSGraphTensor()] = scatterPlaceholder.getMPSGraphTensorData();
+    if(has_weights) {
+      weightsPlaceholder = Placeholder(cachedGraph->weightsTensor_, weights);
+      feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData();
+    }
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+
+    // Run the graph
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+
+  return output;
+}
+
+Tensor _bincount_mps(const Tensor& self, const c10::optional<Tensor>& weights_opt, int64_t minlength) {
+  // See [Note: hacky wrapper removal for optional tensor]
+  c10::MaybeOwned<Tensor> weights_maybe_owned = at::borrow_from_optional_tensor(weights_opt);
+  const Tensor& weights = *weights_maybe_owned;
+
+  TORCH_CHECK(c10::isIntegralType(self.scalar_type(), /*includesBool=*/true));
+  TORCH_CHECK(minlength >= 0, "minlength should be >= 0");
+
+  if (self.dim() == 1 && self.numel() == 0) {
+    return at::zeros(
+        {minlength},
+        kLong,
+        c10::nullopt /* layout */,
+        kMPS,
+        c10::nullopt /* pin_memory */);
+  }
+  TORCH_CHECK(self.dim() == 1 && self.min().item<int64_t>() >= 0, "bincount only supports 1-d non-negative integral inputs.");
+
+  bool has_weights = weights.defined();
+  TORCH_CHECK(!(has_weights && (weights.dim() != 1 || weights.size(0) != self.size(0))), "weights should be 1-d and have the same length as input");
+
+  const int64_t nbins = std::max(self.max().item<int64_t>() + 1L, minlength);
+  Tensor output;
+
+  Tensor weights_ = weights;
+  if (has_weights) {
+    if(weights.scalar_type() != ScalarType::Float &&
+       weights.scalar_type() != ScalarType::Int   &&
+       weights.scalar_type() != ScalarType::Half) {
+        // Scatter doesn't work for int8/int16 dtypes
+        weights_ = weights.to(kInt);
+    }
+    output = at::zeros(
+        {nbins},
+        optTypeMetaToScalarType(weights_.options().dtype_opt()),
+        weights_.options().layout_opt(),
+        weights_.options().device_opt(),
+        weights_.options().pinned_memory_opt());
+  }
+  else {
+    output = at::zeros(
+        {nbins},
+        kLong,
+        c10::nullopt /* layout */,
+        kMPS,
+        c10::nullopt /* pin_memory */);
+  }
+
+  return bincount_mps_impl(self, weights_, output);
+}
+
+}
+}
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 622f5ac5da13a..eca1dfa941f85 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -1067,6 +1067,7 @@
   dispatch:
     CPU: _bincount_cpu
     CUDA: _bincount_cuda
+    MPS: _bincount_mps
   tags: dynamic_output_shape
   autogen: bincount.out
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 62c2d720fb18c..76f368cd3bd88 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1998,6 +1998,133 @@ def test_unfold_scalars(self):
         self.assertEqual(torch.empty(0, device="mps"), x.unfold(0, 0, 2))
         self.assertEqual(torch.tensor([0.5], device="mps"), x.unfold(0, 1, 1))
 
+    def test_bincount_simple(self):
+        input = torch.randint(0, 8, (5,), dtype=torch.int32, device="mps")
+        input_cpu = input.to("cpu")
+        weights = torch.linspace(0, 1, steps=5, device="mps", dtype=torch.float32)
+        weights_cpu = weights.to("cpu")
+
+        x = torch.bincount(input)
+        x_cpu = torch.bincount(input_cpu)
+        self.assertEqual(x, x_cpu)
+
+        y = input.bincount(weights)
+        y_cpu = input_cpu.bincount(weights_cpu)
+        self.assertEqual(y, y_cpu)
+
+    def test_bincount_reduction(self):
+        device = "mps"
+        # negative input throws
+        with self.assertRaisesRegex(RuntimeError, '1-d non-negative integral'):
+            torch.bincount(torch.tensor([1, -1], device=device, dtype=torch.int32))
+        # n-d input, with n > 1 throws
+        with self.assertRaisesRegex(RuntimeError, '1-d non-negative integral'):
+            torch.bincount(torch.tensor([[1, 2], [3, 4]], device=device))
+        # minlength < 0 throws
+        with self.assertRaisesRegex(RuntimeError, 'minlength should be >= 0'):
+            torch.bincount(torch.tensor([1, 3], device=device),
+                           torch.tensor([.2, .2], device=device),
+                           minlength=-1)
+        # n-d weights, with n > 1 throws
+        with self.assertRaisesRegex(RuntimeError, '1-d'):
+            torch.bincount(torch.tensor([1, 0], device=device, dtype=torch.int32),
+                           torch.tensor([[1., 0.3], [1., 0.3]], device=device, dtype=torch.float))
+        # input and weights dim mismatch
+        with self.assertRaisesRegex(RuntimeError, 'same length'):
+            torch.bincount(torch.tensor([1, 0], device=device, dtype=torch.int32),
+                           torch.tensor([1., 0.3, 0.5], device=device, dtype=torch.float))
+        # 1-d input with no elements and default minlength
+        self.assertEqual(torch.bincount(torch.tensor([], device=device, dtype=torch.long)),
+                         torch.zeros(0, dtype=torch.long, device=device))
+        # 1-d input with no elements and specified minlength
+        self.assertEqual(torch.bincount(torch.tensor([], device=device, dtype=torch.long), minlength=10),
+                         torch.zeros(10, dtype=torch.long, device=device))
+
+        # test tensor method without weights
+        long_counts = torch.tensor(
+            [0, 3, 2, 1, 3], dtype=torch.uint8, device=device).bincount()
+        self.assertEqual(
+            torch.tensor([1, 1, 1, 2], dtype=torch.int64, device=device),
+            long_counts)
+        # test avoiding overflow for uint8 (#76979)
+        count_uint8 = torch.tensor([0, 1, 2, 3, 255], dtype=torch.uint8, device=device).bincount()
+        count_int16 = torch.tensor([0, 1, 2, 3, 255], dtype=torch.int16, device=device).bincount()
+        self.assertEqual(count_uint8, count_int16)
+        # test minlength functionality
+        int_counts = torch.bincount(
+            torch.tensor([1, 1, 1, 1], device=device, dtype=torch.int32), minlength=5)
+        self.assertEqual(
+            torch.tensor([0, 4, 0, 0, 0], dtype=torch.int64, device=device),
+            int_counts)
+        # test weights
+        byte_counts = torch.bincount(
+            torch.tensor([0, 1, 1, 1, 4], device=device, dtype=torch.int32),
+            torch.tensor([.1, .2, .3, .4, .5], device=device))
+        self.assertEqual(
+            torch.tensor([0.1, 0.9, 0, 0, 0.5], device=device), byte_counts)
+        byte_counts = torch.bincount(
+            torch.tensor([0, 1, 1, 1, 4], device=device, dtype=torch.int32),
+            torch.tensor([1, 2, 3, 4, 5], dtype=torch.int8, device=device))
+        self.assertEqual(
+            torch.tensor([1, 9, 0, 0, 5], device=device, dtype=torch.int32), byte_counts)
+        # test non-contiguous inputs and weights
+        inputs = torch.tensor([[0, 0], [3, 1], [2, 1], [1, 1], [3, 4]], device=device, dtype=torch.int32)
+        weights = torch.tensor([[.1, 1], [.2, 2], [.3, 3], [.4, 4], [.5, 5]], device=device)
+        for i in [0, 1]:
+            assert not inputs[:, i].is_contiguous(), "Inputs are supposed to be non-contiguous"
+            assert not weights[:, i].is_contiguous(), "Weights are supposed to be non-contiguous"
+        # inputs are non-contiguous but weights are contiguous
+        self.assertEqual(inputs[:, 0].bincount(), torch.tensor([1, 1, 1, 2]))
+        # inputs and weights are non-contiguous
+        self.assertEqual(
+            inputs[:, 1].bincount(weights[:, 1]),
+            torch.tensor([1, 9, 0, 0, 5], dtype=torch.float32))
+        # weights are non-contiguous but inputs are contiguous
+        self.assertEqual(inputs[:, 1].contiguous().bincount(weights[:, 1]),
+                         torch.tensor([1, 9, 0, 0, 5], dtype=torch.float32))
+
+        # test bincount on non-contiguous slices
+        all0s = torch.zeros((32, 2), dtype=torch.int32, device=device)
+        self.assertEqual(all0s[:, 0].bincount(), torch.tensor([32]))
+
+        all1s = torch.ones((32, 2), dtype=torch.int32, device=device)
+        self.assertEqual(all1s[:, 0].bincount(), torch.tensor([0, 32]))
+
+        # test large number of bins - global memory use
+        big_exp = torch.zeros(10000000, device=device)
+        big_exp[-1] = 50.0
+        big_w = torch.tensor([.5] * 100, device=device)
+        big_out = torch.tensor([9999999] * 100, device=device, dtype=torch.int32).bincount(big_w)
+        self.assertEqual(big_exp, big_out)
+        # test large input size
+        big_exp = torch.zeros(2, device=device, dtype=torch.int64)
+        big_exp[1] = 1000000
+        big_out = torch.ones(1000000, dtype=torch.int8, device=device).bincount()
+        self.assertEqual(big_exp, big_out)
+
+    def test_bincount(self):
+        device = "mps"
+        input_size = (5000,)
+        w = torch.randn(input_size, dtype=torch.float, device=device)
+        w_cpu = w.cpu()
+
+        t = torch.randint(50, input_size, dtype=torch.int8, device=device)
+        self.assertEqual(t.cpu().bincount(), t.bincount())
+        self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w))
+
+        t = torch.randint(500, input_size, dtype=torch.int32, device=device)
+        self.assertEqual(t.cpu().bincount(), t.bincount())
+        self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w))
+
+        t = torch.randint(2000, input_size, dtype=torch.int32, device=device)
+        self.assertEqual(t.cpu().bincount(), t.bincount())
+        self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w))
+
+        t = torch.zeros([10], dtype=torch.int32, device=device)
+        t[0] = 35488
+        counted = t.bincount(minlength=65536)
+        self.assertEqual(torch.sum(counted), 10)
+
     def test_sum_backward(self):
         def helper(n, c):
             values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
@@ -7157,11 +7284,11 @@ def test_no_warning_on_import(self):
         self.assertEqual(out, "")
 
     def _get_not_implemented_op(self):
-        # This can be changed once we actually implement `torch.bincount`
+        # This can be changed once we actually implement `torch.histc`
         # Should return fn, args, kwargs, string_version
-        return (torch.bincount,
-                torch.tensor([4], device='mps'), {},
-                "torch.bincount(torch.tensor([4, 3, 6, 3, 4], device='mps'))")
+        return (torch.histc,
+                torch.tensor([100], device='mps'), {},
+                "torch.histc(torch.tensor([4], device='mps', dtype=torch.float))")
 
     def test_error_on_not_implemented(self):
         fn, args, kwargs, _ = self._get_not_implemented_op()

From 18f7e82a436131d46a8a22894212fcf85d0abf9d Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 19 Oct 2022 14:10:58 -0700
Subject: [PATCH 1792/1922] Remove unused variable (build failure with
 WERROR=1) (#143)

* Remove unused variable (build failure with WERROR=1)

* Fix remaining build failures
---
 aten/src/ATen/native/mps/operations/Copy.mm       | 3 ---
 aten/src/ATen/native/mps/operations/SummaryOps.mm | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index ad1b0d02911c8..2abea72a27d9d 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -195,9 +195,6 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
     src_total_size = src.nbytes();
   }
 
-  size_t dst_tensor_nbytes = dst_.is_view() ? at::detail::computeStorageNbytesContiguous(dst_.sizes(), dst_.element_size(), dst_.storage_offset()) :
-                                              dst_.nbytes();
-
   const size_t size_to_copy = src.nbytes();
   const void* host_src = src.storage().data();
   TORCH_INTERNAL_ASSERT(src_total_size >= (src.storage_offset() * src.element_size()));
diff --git a/aten/src/ATen/native/mps/operations/SummaryOps.mm b/aten/src/ATen/native/mps/operations/SummaryOps.mm
index 60732f6dafe52..41d33cadcb3bd 100644
--- a/aten/src/ATen/native/mps/operations/SummaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/SummaryOps.mm
@@ -52,7 +52,7 @@
           if (self.scalar_type() == kByte) {
             castedInputTensor = [mpsGraph castTensor:inputTensor
                                               toType:MPSDataTypeInt32
-                                                name:nil];
+                                                name:@"castInputTensor"];
           }
 
           MPSGraphTensor *outputTensor = [mpsGraph scatterWithDataTensor:scatterDataTensor

From 9f826ff8536dafd0c8fd9e52e256861cfc4ba7d3 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 21 Oct 2022 15:17:46 -0400
Subject: [PATCH 1793/1922] Disable the padding dimension checks for
 constant_pad (#144)

---
 aten/src/ATen/native/mps/operations/Pad.mm |  29 ++-
 test/test_mps.py                           | 224 +++++++++++----------
 2 files changed, 132 insertions(+), 121 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm
index 63a26e66288be..6d8ee3b6e5d04 100644
--- a/aten/src/ATen/native/mps/operations/Pad.mm
+++ b/aten/src/ATen/native/mps/operations/Pad.mm
@@ -62,16 +62,25 @@
   Tensor grad_output, input = input_;
 
   if (!is_backward_pass) {
-    TORCH_CHECK(pad_l < input_w && pad_r < input_w,
-      "Argument #4: Padding size should be less than the corresponding "
-      "input dimension, but got: padding (", pad_l, ", ", pad_r,
-      ") at dimension ", dim_w, " of input ", ndims);
-
-    if (padding_dim > 1) {
-      TORCH_CHECK(pad_t < input_h && pad_b < input_h,
-        "Argument #6: Padding size should be less than the corresponding "
-        "input dimension, but got: padding (", pad_t, ", ", pad_b,
-        ") at dimension ", dim_h, " of input ", ndims);
+    // these checks aren't relevant for constant pad
+    if (mode != MPSGraphPaddingModeConstant) {
+      TORCH_CHECK(pad_l < input_w && pad_r < input_w,
+        "Argument #4: Padding size should be less than the corresponding "
+        "input dimension, but got: padding (", pad_l, ", ", pad_r,
+        ") at dimension ", dim_w, " of input ", ndims);
+
+      if (padding_dim > 1) {
+        TORCH_CHECK(pad_t < input_h && pad_b < input_h,
+          "Argument #6: Padding size should be less than the corresponding "
+          "input dimension, but got: padding (", pad_t, ", ", pad_b,
+          ") at dimension ", dim_h, " of input ", ndims);
+      }
+      if (padding_dim > 2) {
+        TORCH_CHECK(pad_front < input_d && pad_back < input_d,
+          "Argument #8: Padding size should be less than the corresponding "
+          "input dimension, but got: padding (", pad_front, ", ", pad_back,
+          ") at dimension ", dim_d, " of input ", ndims);
+      }
     }
     TORCH_CHECK(output_w >= 1 || output_h >= padding_dim - 1,
       "input (H: ", input_h, ", W: ", input_w, ") is too small. Calculated "
diff --git a/test/test_mps.py b/test/test_mps.py
index 76f368cd3bd88..31f3f54dd60bd 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -3784,6 +3784,8 @@ def helper(shape, padding, op, value=0):
         helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ConstantPad2d)
         # input size < pad size
         helper((1, 2, 3), (0, 0, 0, 1), nn.ConstantPad2d)
+        # pad dims < input dims
+        helper((50, 9, 300), (0, 0, 0, 31), nn.ConstantPad2d)
 
         # 3D Padding
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReflectionPad3d)
@@ -7959,118 +7961,118 @@ class TestConsistency(TestCase):
         'take_along_dim': None,
 
         # New block list ops that need investigation
-        '__rdiv__': ['torch.bool', 'torch.int64'], 
-        '__rpow__': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'], 
-        '_masked.amax': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        '_masked.amin': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        '_masked.argmax': ['torch.float16', 'torch.float32', 'torch.int32'], 
-        '_masked.argmin': ['torch.float16', 'torch.float32', 'torch.int32'], 
-        '_masked.logsumexp': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        '_masked.mean': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        '_masked.prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        '_masked.std': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        '_masked.sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        '_masked.var': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'acos': ['torch.bool'], 
-        'acosh': ['torch.bool'], 
-        'amax': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'amin': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'asin': ['torch.bool'], 
-        'asinh': ['torch.bool'], 
-        'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], 
-        'atan': ['torch.bool'], 
-        'atanh': ['torch.bool'], 
-        'bernoulli': ['torch.float32'], 
-        'byte': ['torch.float16', 'torch.float32'], 
-        'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'], 
-        'clamp': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'clamp_max': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'clamp_min': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'constant_pad_nd': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'cos': ['torch.bool'], 
-        'cosh': ['torch.bool'], 
-        'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'diff': ['torch.bool', 'torch.uint8'], 
-        'eig': ['torch.float32'], 
-        'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'erf': ['torch.bool'], 
-        'exp2': ['torch.bool'], 
-        'exp': ['torch.bool'], 
-        'fft.fft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'fft.ifft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'fft.ihfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'fft.ihfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'fft.ihfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'fft.rfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'fft.rfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        '__rdiv__': ['torch.bool', 'torch.int64'],
+        '__rpow__': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'],
+        '_masked.amax': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.amin': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.argmax': ['torch.float16', 'torch.float32', 'torch.int32'],
+        '_masked.argmin': ['torch.float16', 'torch.float32', 'torch.int32'],
+        '_masked.logsumexp': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.mean': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.std': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.var': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'acos': ['torch.bool'],
+        'acosh': ['torch.bool'],
+        'amax': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'amin': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'asin': ['torch.bool'],
+        'asinh': ['torch.bool'],
+        'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
+        'atan': ['torch.bool'],
+        'atanh': ['torch.bool'],
+        'bernoulli': ['torch.float32'],
+        'byte': ['torch.float16', 'torch.float32'],
+        'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
+        'clamp': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'clamp_max': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'clamp_min': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'constant_pad_nd': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'cos': ['torch.bool'],
+        'cosh': ['torch.bool'],
+        'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'diff': ['torch.bool', 'torch.uint8'],
+        'eig': ['torch.float32'],
+        'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'erf': ['torch.bool'],
+        'exp2': ['torch.bool'],
+        'exp': ['torch.bool'],
+        'fft.fft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.ifft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.ihfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.ihfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.ihfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.rfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.rfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'float': ['torch.int64'],
-        'gather': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'half': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'index_select': ['torch.uint8'], 
-        'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'], 
-        'linalg.eigvals': ['torch.float32'], 
-        'linalg.multi_dot': ['torch.float32'], 
-        'log10': ['torch.bool'], 
-        'log1p': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], 
-        'log2': ['torch.bool'], 
-        'log': ['torch.bool'], 
-        'logical_and': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'logical_or': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'logical_xor': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'logsumexp': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'matmul': ['torch.uint8'], 
-        'mean': ['torch.float16', 'torch.float32'], 
-        'native_layer_norm': ['torch.float32'], 
-        'neg': ['torch.uint8'], 
-        'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'nn.functional.adaptive_avg_pool1d': ['torch.float32'], 
-        'nn.functional.adaptive_avg_pool2d': ['torch.float32'], 
-        'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'], 
-        'nn.functional.avg_pool2d': ['torch.float32', 'torch.int64'], 
-        'nn.functional.bilinear': ['torch.float32'], 
-        'nn.functional.conv_transpose2d': ['torch.float32'], 
-        'nn.functional.cosine_embedding_loss': ['torch.uint8'], 
-        'nn.functional.cosine_similarity': ['torch.float32'], 
-        'nn.functional.dropout2d': ['torch.float32'], 
-        'nn.functional.dropout3d': ['torch.float32'], 
-        'nn.functional.dropout': ['torch.float32'], 
-        'nn.functional.gelu': ['torch.float32'], 
-        'nn.functional.interpolate': ['torch.float32', 'torch.float32', 'torch.float32'], 
-        'nn.functional.layer_norm': ['torch.float32'], 
-        'nn.functional.margin_ranking_loss': ['torch.uint8'], 
-        'nn.functional.max_pool1d': ['torch.float32'], 
-        'nn.functional.max_pool2d': ['torch.float32'], 
-        'nn.functional.normalize': ['torch.float32'], 
-        'nn.functional.pad': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'], 
-        'nn.functional.pairwise_distance': ['torch.uint8'], 
-        'nn.functional.triplet_margin_loss': ['torch.uint8'], 
-        'nn.functional.triplet_margin_with_distance_loss': ['torch.uint8'], 
-        'nn.functional.upsample_nearest': ['torch.float32'], 
-        'normal': ['torch.float16', 'torch.float32', 'torch.float16', 'torch.float32'], 
-        'pow': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'], 
-        'prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'rand_like': ['torch.float16', 'torch.float32'], 
-        'randint_like': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'randn_like': ['torch.float16', 'torch.float32'], 
-        'reciprocal': ['torch.bool'], 
-        'rsqrt': ['torch.bool'], 
-        'scatter_add': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'sigmoid': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], 
-        'sin': ['torch.bool'], 
-        'sinh': ['torch.bool'], 
-        'sqrt': ['torch.bool'], 
-        'sub': ['torch.float16', 'torch.uint8'], 
-        'sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'tan': ['torch.bool', 'torch.float32'], 
-        'tanh': ['torch.bool'], 
-        'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
-        'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'], 
+        'gather': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'half': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'index_select': ['torch.uint8'],
+        'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'],
+        'linalg.eigvals': ['torch.float32'],
+        'linalg.multi_dot': ['torch.float32'],
+        'log10': ['torch.bool'],
+        'log1p': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
+        'log2': ['torch.bool'],
+        'log': ['torch.bool'],
+        'logical_and': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'logical_or': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'logical_xor': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'logsumexp': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'matmul': ['torch.uint8'],
+        'mean': ['torch.float16', 'torch.float32'],
+        'native_layer_norm': ['torch.float32'],
+        'neg': ['torch.uint8'],
+        'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'nn.functional.adaptive_avg_pool1d': ['torch.float32'],
+        'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
+        'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'],
+        'nn.functional.avg_pool2d': ['torch.float32', 'torch.int64'],
+        'nn.functional.bilinear': ['torch.float32'],
+        'nn.functional.conv_transpose2d': ['torch.float32'],
+        'nn.functional.cosine_embedding_loss': ['torch.uint8'],
+        'nn.functional.cosine_similarity': ['torch.float32'],
+        'nn.functional.dropout2d': ['torch.float32'],
+        'nn.functional.dropout3d': ['torch.float32'],
+        'nn.functional.dropout': ['torch.float32'],
+        'nn.functional.gelu': ['torch.float32'],
+        'nn.functional.interpolate': ['torch.float32', 'torch.float32', 'torch.float32'],
+        'nn.functional.layer_norm': ['torch.float32'],
+        'nn.functional.margin_ranking_loss': ['torch.uint8'],
+        'nn.functional.max_pool1d': ['torch.float32'],
+        'nn.functional.max_pool2d': ['torch.float32'],
+        'nn.functional.normalize': ['torch.float32'],
+        'nn.functional.pad': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
+        'nn.functional.pairwise_distance': ['torch.uint8'],
+        'nn.functional.triplet_margin_loss': ['torch.uint8'],
+        'nn.functional.triplet_margin_with_distance_loss': ['torch.uint8'],
+        'nn.functional.upsample_nearest': ['torch.float32'],
+        'normal': ['torch.float16', 'torch.float32', 'torch.float16', 'torch.float32'],
+        'pow': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'],
+        'prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'rand_like': ['torch.float16', 'torch.float32'],
+        'randint_like': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'randn_like': ['torch.float16', 'torch.float32'],
+        'reciprocal': ['torch.bool'],
+        'rsqrt': ['torch.bool'],
+        'scatter_add': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'sigmoid': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
+        'sin': ['torch.bool'],
+        'sinh': ['torch.bool'],
+        'sqrt': ['torch.bool'],
+        'sub': ['torch.float16', 'torch.uint8'],
+        'sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'tan': ['torch.bool', 'torch.float32'],
+        'tanh': ['torch.bool'],
+        'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.local_response_norm': ['torch.int64'],
         'index_select': ['torch.bool'],

From 01e84f1ee718ce96c70cfcd754b001d52c8c1812 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Mon, 24 Oct 2022 08:49:03 -0700
Subject: [PATCH 1794/1922] Add support for copying cpu tensors into strided
 mps tensors (#142)

* Add support for copying cpu tensors into strided mps tensors

* Do scatter only when destination is strided and source is contiguous
---
 aten/src/ATen/native/mps/OperationUtils.h   |  2 +-
 aten/src/ATen/native/mps/operations/Copy.mm | 26 +++++++++++++++++----
 aten/src/ATen/native/mps/operations/View.mm | 13 +++++++----
 test/test_mps.py                            | 14 ++++++++++-
 4 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index cc86c4ede4c3b..1bf4bc838fa81 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -48,7 +48,7 @@ std::string getTensorsStringKey(const TensorList& tensors, bool use_scalar_value
 std::string getArrayRefString(const IntArrayRef s);
 // use has_storage() on the returned tensor to determine if src actually is a view
 Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst);
-Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output);
+Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output, id<MTLBuffer> updatesBuffer = nil);
 
 MPSShape* getMPSShape(const Tensor& t);
 MPSShape* getMPSShape(IntArrayRef sizes);
diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index 2abea72a27d9d..ca0171a695cbe 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -205,14 +205,30 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
     NSUInteger alignedLength = 0;
 
     void* alignedPtr = pageAlignedBlockPtr(host_src, (NSUInteger)src_total_size, &alignedLength);
-    id<MTLBuffer> sourceBuffer = [device newBufferWithBytesNoCopy:alignedPtr
-                                          length:alignedLength
-                                         options:options
-                                     deallocator:nil];
     sourceOffset = uintptr_t(host_src) - uintptr_t(alignedPtr);
     sourceOffset += src_.storage_offset() * src_.itemsize();
 
-    stream->copy_and_sync(sourceBuffer, destBuffer, size_to_copy, sourceOffset, dst_byte_offset, non_blocking);
+    id<MTLBuffer> sourceBuffer = nil;
+    // If the destination is a strided MPS tensor, we cannot perform a blit directly to copy the
+    // memory from the CPU tensor into the MPS tensor. We need to scatter the data into the right indices
+    bool doScatter = (!dst_.is_contiguous() && src.is_contiguous());
+    if (doScatter) {
+      sourceBuffer = [device newBufferWithBytes:(void*)((uint8_t*)host_src + (src_.storage_offset() * src_.itemsize()))
+                                         length:size_to_copy
+                                        options:options];
+    }
+    else {
+      sourceBuffer = [device newBufferWithBytesNoCopy:alignedPtr
+                                               length:alignedLength
+                                              options:options
+                                          deallocator:nil];
+    }
+
+    if (doScatter) {
+      scatterViewTensor(src, dst_, sourceBuffer);
+    } else {
+      stream->copy_and_sync(sourceBuffer, destBuffer, size_to_copy, sourceOffset, dst_byte_offset, non_blocking);
+    }
     [sourceBuffer release];
   }
 
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index f7f6a686f4d68..8a51012686609 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -33,7 +33,12 @@
 }
 
 // initializes the MTLBuffers for tensor data and runs the MPSGraph for the view op
-static Tensor& runViewGraph(ViewCachedGraph* cachedGraph, const at::Tensor& src, Tensor& output, bool needsScatter)
+static Tensor& runViewGraph(
+  ViewCachedGraph* cachedGraph,
+  const at::Tensor& src,
+  Tensor& output,
+  bool needsScatter,
+  id<MTLBuffer> updatesBuffer = nil)
 {
   const id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
   const id<MTLBuffer> outputBuffer = getMTLBufferStorage(output);
@@ -54,7 +59,7 @@
                                                                                shape: inputShape
                                                                             dataType: inputType] autorelease];
     if (needsScatter) {
-      feeds[cachedGraph->updatesTensor] = [[[MPSGraphTensorData alloc] initWithMTLBuffer: sourceBuffer
+      feeds[cachedGraph->updatesTensor] = [[[MPSGraphTensorData alloc] initWithMTLBuffer: (updatesBuffer != nil) ? updatesBuffer : sourceBuffer
                                                                                    shape: getMPSShape(src.numel())
                                                                                 dataType: getMPSDataType(src.scalar_type())] autorelease];
     }
@@ -617,11 +622,11 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst)
   return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false);
 }
 
-Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output)
+Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output, id<MTLBuffer> updatesBuffer)
 {
   ViewCachedGraph* cachedGraph = createViewGraph(output, src, output.sizes(), output.strides(),
                                                  output.storage_offset(), /*needsScatter*/ true);
-  return runViewGraph(cachedGraph, src, output, /*needsScatter*/ true);
+  return runViewGraph(cachedGraph, src, output, /*needsScatter*/ true, updatesBuffer);
 }
 
 } // namespace mps
diff --git a/test/test_mps.py b/test/test_mps.py
index 31f3f54dd60bd..5dd6c494fc6d9 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1319,6 +1319,19 @@ def test_expand_cpu_to_mps_copy(self):
 
         self.assertEqual(x_cpu, x.cpu())
 
+    def test_cpu_to_strided_mps_copy(self):
+        # https://github.com/pytorch/pytorch/issues/86975
+
+        a1 = torch.Tensor([[1,2],[3,4], [5,6]]).to(torch.device("mps"))
+        b1 = torch.Tensor([-1, -1])
+        a1[1:,1] = b1
+
+        a2 = torch.Tensor([[1,2],[3,4], [5,6]]).to(torch.device("mps"))
+        b2 = torch.Tensor([-1, -1]).to(torch.device("mps"))
+        a2[1:,1] = b2
+
+        self.assertEqual(a1, a2)
+
     def test_view_slice(self):
         # https://github.com/pytorch/pytorch/issues/83995
         NUM_SAMPLES = 60
@@ -6378,7 +6391,6 @@ def test_view(self, device="mps"):
         self.assertRaises(RuntimeError, lambda: tensor.view(7, -1))
         self.assertRaises(RuntimeError, lambda: tensor.view(15, -1, -1))
 
-    # RuntimeError: Invalid device for storage: mps
     def test_contiguous(self, device="mps"):
         x = torch.randn(1, 16, 5, 5, device=device)
         self.assertTrue(x.is_contiguous())

From 73ef70bbbb21d3117d8f8fbc996b3dd16d1199eb Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 26 Oct 2022 10:40:57 -0700
Subject: [PATCH 1795/1922] Register norm_dtype_out for mps (#149)

---
 .../ATen/native/mps/operations/ReduceOps.mm   | 55 ++++++++++++++-----
 aten/src/ATen/native/native_functions.yaml    |  1 +
 test/test_mps.py                              | 49 +++++++++++++++++
 3 files changed, 91 insertions(+), 14 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index e04333aa3712a..d186ff01b784f 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -425,17 +425,21 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
     reduction_out_mps(input_t, opt_dim, keepdim, dtype, output_t, MPSReductionType::MEAN, "mean_out_mps");
 }
 
-TORCH_IMPL_FUNC(norm_out_mps)
-(const Tensor& input_tensor,
- const OptionalScalarRef opt_p,
- IntArrayRef dim,
- bool keepdim,
- const Tensor& output_t)
-{
+void impl_func_norm_mps(
+    const Tensor& input_tensor,
+    const OptionalScalarRef& opt_p,
+    IntArrayRef dim,
+    bool keepdim,
+    optional<ScalarType> opt_dtype,
+    const Tensor& output_t) {
+
+  namespace native_mps = at::native::mps;
   if (input_tensor.numel() == 0)
     return;
 
   auto input_t = (input_tensor.sizes().size() == 0) ? input_tensor.view({1}) : input_tensor;
+  auto in_dtype = opt_dtype.value_or(input_tensor.scalar_type());
+  auto mps_input_dtype = native_mps::getMPSDataType(in_dtype);
 
   IntArrayRef input_shape = input_t.sizes();
 
@@ -444,7 +448,6 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
     TORCH_CHECK(wrap_dim < input_shape.size(),
     "norm_out_mps: reduction dim must be in the range of input shape")
   }
-  namespace native_mps = at::native::mps;
 
   using CachedGraph = native_mps::MPSUnaryCachedGraph;
 
@@ -498,7 +501,13 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
           MPSGraph* mpsGraph = native_mps::make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
+          MPSGraphTensor* inputTensor_ = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
+          MPSGraphTensor* inputTensor = inputTensor_;
+          if (opt_dtype.has_value()) {
+            inputTensor = [mpsGraph castTensor:inputTensor
+                                         toType:mps_input_dtype
+                                           name:@"any_all"];
+          }
 
           MPSGraphTensor *outputTensor;
 
@@ -507,7 +516,7 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
               MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor
                                                                        name:nil];
               MPSGraphTensor *powerValTensor = [mpsGraph constantWithScalar:p
-                                                                   dataType:native_mps::getMPSDataType(input_t.scalar_type())];
+                                                                   dataType:mps_input_dtype];
               MPSGraphTensor *powerTensor = [mpsGraph powerWithPrimaryTensor:absoluteTensor
                                                              secondaryTensor:powerValTensor
                                                                         name:nil];
@@ -537,10 +546,10 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
                                                                        name:nil];
 
               MPSGraphTensor *powerValTensor = [mpsGraph constantWithScalar:p
-                                                                   dataType:native_mps::getMPSDataType(input_t.scalar_type())];
+                                                                   dataType:mps_input_dtype];
 
               MPSGraphTensor *reciprocalPowerValTensor = [mpsGraph constantWithScalar:reciprocal_p
-                                                                             dataType:native_mps::getMPSDataType(input_t.scalar_type())];
+                                                                             dataType:mps_input_dtype];
 
               MPSGraphTensor *powerTensor = [mpsGraph powerWithPrimaryTensor:absoluteTensor
                                                              secondaryTensor:powerValTensor
@@ -555,7 +564,7 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
                                                          name:nil];
           }
 
-          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->inputTensor_ = inputTensor_;
           newCachedGraph->outputTensor_ = outputTensor;
         }
         return newCachedGraph;
@@ -581,10 +590,28 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
     };
 
     native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-
   }
 }
 
+TORCH_IMPL_FUNC(norm_out_mps)
+(const Tensor& self,
+ const OptionalScalarRef opt_p,
+ IntArrayRef dim,
+ bool keepdim,
+ const Tensor& result) {
+  impl_func_norm_mps(self, opt_p, dim, keepdim, c10::nullopt, result);
+}
+
+TORCH_IMPL_FUNC(norm_dtype_out_mps)
+(const Tensor& self,
+ const OptionalScalarRef opt_p,
+ IntArrayRef dim,
+ bool keepdim,
+ ScalarType dtype,
+ const Tensor& result) {
+  impl_func_norm_mps(self, opt_p, dim, keepdim, dtype, result);
+}
+
 Tensor std_var_common_impl_mps(
   const Tensor & input_t,
   at::OptionalIntArrayRef dim,
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index eca1dfa941f85..ea3bc99a28ad0 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -6168,6 +6168,7 @@
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: norm_dtype_out
+    MPS: norm_dtype_out_mps
 
 - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
diff --git a/test/test_mps.py b/test/test_mps.py
index 5dd6c494fc6d9..c702f93007db6 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -809,6 +809,55 @@ def helper(shape, eps=1, momentum=0.1, wts=False, training=False, channels_last=
                         helper(shape, eps=3, momentum=0.67, wts=True, training=True, channels_last=channels_last,
                                track_running_stats=track_running_stats, test_module=test_module)
 
+    def test_norm(self):
+        a = torch.arange(9, dtype=torch.float, device="mps") - 4
+        b = a.reshape((3, 3))
+
+        a_cpu = torch.arange(9, dtype=torch.float, device="cpu") - 4
+        b_cpu = a_cpu.reshape((3, 3))
+
+        res = torch.norm(a)
+        res_cpu = torch.norm(a_cpu)
+        self.assertEqual(res, res_cpu)
+
+        res = torch.norm(b)
+        res_cpu = torch.norm(b_cpu)
+        self.assertEqual(res, res_cpu)
+
+        res = torch.norm(a, float('inf'))
+        res_cpu = torch.norm(a_cpu, float('inf'))
+        self.assertEqual(res, res_cpu)
+
+        res = torch.norm(b, float('inf'))
+        res_cpu = torch.norm(b_cpu, float('inf'))
+        self.assertEqual(res, res_cpu)
+
+        c = torch.tensor([[1, 2, 3],[-1, 1, 4]] , dtype=torch.float, device="mps")
+        c_cpu = torch.tensor([[1, 2, 3],[-1, 1, 4]] , dtype=torch.float, device="cpu")
+
+        res = torch.norm(c, dim=0)
+        res_cpu = torch.norm(c_cpu, dim=0)
+        self.assertEqual(res, res_cpu)
+
+        res = torch.norm(c, dim=1)
+        res_cpu = torch.norm(c_cpu, dim=1)
+        self.assertEqual(res, res_cpu)
+
+        res = torch.norm(c, p=1, dim=1)
+        res_cpu = torch.norm(c_cpu, p=1, dim=1)
+        self.assertEqual(res, res_cpu)
+
+        d = torch.arange(8, dtype=torch.float, device="mps").reshape(2, 2, 2)
+        d_cpu = torch.arange(8, dtype=torch.float, device="cpu").reshape(2, 2, 2)
+
+        res = torch.norm(d, dim=(1,2))
+        res_cpu = torch.norm(d_cpu, dim=(1,2))
+        self.assertEqual(res, res_cpu)
+
+        res = torch.norm(d[0, :, :]), torch.norm(d[1, :, :])
+        res_cpu = torch.norm(d_cpu[0, :, :]), torch.norm(d_cpu[1, :, :])
+        self.assertEqual(res, res_cpu)
+
     def test_layer_norm(self):
         # TODO: Test non-contiguous
         def helper(input_shape, normalized_shape, eps=1e-05, elementwise_affine=True, dtype=torch.float32):

From 58e3a66aaf99f9f27a5d772f6b1a6d9e56730245 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 26 Oct 2022 15:39:38 -0400
Subject: [PATCH 1796/1922] Improve the performance of torch.linear() (#148)

* Improve the performance of torch.linear()

* Clean up redundant headers and namespaces from Linear.mm
---
 aten/src/ATen/native/mps/operations/Linear.mm | 123 +++++++-----------
 1 file changed, 49 insertions(+), 74 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm
index b49e70cdf4915..b524a04aad58d 100644
--- a/aten/src/ATen/native/mps/operations/Linear.mm
+++ b/aten/src/ATen/native/mps/operations/Linear.mm
@@ -1,21 +1,12 @@
 //  Copyright © 2022 Apple Inc.
 
-#include <ATen/ATen.h>
-#include <ATen/Tensor.h>
-#include <ATen/Utils.h>
-#include <ATen/mps/MPSStream.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <torch/library.h>
-
-#ifdef __OBJC__
-#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
-#endif
-
-using namespace at::mps;
 
 namespace at {
 namespace native {
 
+using namespace mps;
+
 Tensor _mps_linear(
   const Tensor& input,
   const Tensor& weight_arg,
@@ -23,17 +14,13 @@ Tensor _mps_linear(
   // wT = transpose(weight);
   // y=x*wT+b
 
-  using namespace mps;
-
   auto weight = (weight_arg.dim() == 1) ? weight_arg.view({1, weight_arg.size(0)}) : weight_arg;
 
   TORCH_CHECK(input.scalar_type() == ScalarType::Float ||
               input.scalar_type() == ScalarType::Half, "MPS device does not support linear for non-float inputs");
 
-  // See [Note: hacky wrapper removal for optional tensor]
-  auto bias = bias_opt.has_value()
-    ? c10::MaybeOwned<Tensor>::borrowed(*bias_opt)
-    : c10::MaybeOwned<Tensor>::owned(c10::in_place);
+  const Tensor& bias = *(at::borrow_from_optional_tensor(bias_opt));
+  bool is_bias_defined = bias.defined();
 
   auto input_size = input.sizes();
   std::vector<int64_t> output_size(input_size.begin(), input_size.end() - 1);
@@ -64,24 +51,11 @@ Tensor _mps_linear(
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
-  bool is_bias_defined = bias->defined();
-
   @autoreleasepool {
-
-    MPSShape* wt_shape = getMPSShape(weight);
-    string wt_key = string([[[wt_shape valueForKey:@"description"] componentsJoinedByString:@","] UTF8String]);
-    string bias_key = "nobias";
-    if(is_bias_defined) {
-      bias_key = "bias";
-    }
-
-    string key = "mps_linear" + getTensorsStringKey({input, weight}) + ":" + bias_key;
-
-
+    string key = "mps_linear" + getTensorsStringKey({input, weight, bias}) ;
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
 
     if(!cachedGraph) {
-
       MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
 
         CachedGraph *newCachedGraph = nil;
@@ -92,17 +66,11 @@ Tensor _mps_linear(
 
           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
           MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight);
-          MPSGraphTensor* biasTensor = nil;
-
-          if(is_bias_defined) {
-            biasTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType((*bias).scalar_type()));
-          }
 
           MPSGraphTensor* weightTransposeTensor = [mpsGraph transposeTensor:weightTensor
                                                                   dimension:-1
                                                               withDimension:-2
                                                                        name:nil];
-
           MPSGraphTensor* outputTensor = nil;
 
           if (!is_bias_defined)
@@ -113,17 +81,26 @@ Tensor _mps_linear(
           }
           else
           {
-            MPSGraphTensor* xMulWTTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputTensor
+            MPSGraphTensor* inputFlattened = inputTensor;
+            bool doReshape = false;
+            // workaround to improve the performance with 3D+ inputs
+            if (input_size.size() > 2 && input_size[0] > 1 && input_size[1] >= 1 && input_size[1] <= 32) {
+              doReshape = true;
+              inputFlattened = [mpsGraph flatten2DTensor:inputTensor axis:-1 name:nil];
+            }
+
+            newCachedGraph->biasTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, bias);
+            MPSGraphTensor* xMulWTTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputFlattened
                                                                            secondaryTensor:weightTransposeTensor
                                                                                       name:nil];
-            outputTensor = [mpsGraph additionWithPrimaryTensor:xMulWTTensor
-                                               secondaryTensor:biasTensor
+            MPSGraphTensor* biasedTensor = [mpsGraph additionWithPrimaryTensor:xMulWTTensor
+                                               secondaryTensor:newCachedGraph->biasTensor_
                                                           name:nil];
+            outputTensor = doReshape ? [mpsGraph reshapeTensor:biasedTensor withShape:getMPSShape(output_size) name:nil] : biasedTensor;
           }
 
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->weightTensor_ = weightTensor;
-          newCachedGraph->biasTensor_ = biasTensor;
           newCachedGraph->outputTensor_ = outputTensor;
         }
         return newCachedGraph;
@@ -134,21 +111,20 @@ Tensor _mps_linear(
     Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input);
     Placeholder weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight);
     Placeholder biasPlaceholder = Placeholder();
-    if(is_bias_defined)
-      biasPlaceholder = Placeholder(cachedGraph->biasTensor_, *bias);
     Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
 
     NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =[NSMutableDictionary dictionary];
     feeds[inputPlaceholder.getMPSGraphTensor()]   = inputPlaceholder.getMPSGraphTensorData();
     feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData();
-    if (is_bias_defined)
-        feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData();
-
+    if (is_bias_defined) {
+      biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias);
+      feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData();
+    }
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
 
-    mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
   }
 
   // Shave off '1' present at the end of the shape
@@ -158,8 +134,7 @@ Tensor _mps_linear(
     std::vector<int64_t> out_shape(output_sizes.begin(), output_sizes.end()-1);
     return output.view(IntArrayRef(out_shape));
   }
-  else
-    return output;
+  return output;
 }
 
 Tensor _mps_linear_backward_input(
@@ -179,7 +154,7 @@ Tensor _mps_linear_backward_input(
 
   const Tensor weight_reshaped = weight.is_contiguous() ? weight : weight.contiguous();
 
-   struct CachedGraph : public mps::MPSCachedGraph
+   struct CachedGraph : public MPSCachedGraph
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     MPSGraphTensor *weightTensor_ = nil;
@@ -195,24 +170,24 @@ Tensor _mps_linear_backward_input(
                                         grad_output.suggest_memory_format());
   TORCH_CHECK(output.is_mps());
 
-  mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance();
+  MPSGraphCache *cache_ = MPSGraphCache::getInstance();
 
   MPSStream *stream= getCurrentMPSStream();
 
   @autoreleasepool {
 
-   string key = "mps_linear_backward_input" + mps::getTensorsStringKey({grad_output, weight_reshaped});
+   string key = "mps_linear_backward_input" + getTensorsStringKey({grad_output, weight_reshaped});
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
-      mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
         CachedGraph *newCachedGraph = nil;
 
         @autoreleasepool {
-          MPSGraph *mpsGraph = mps::make_mps_graph();
+          MPSGraph *mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor *weightTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_reshaped);
-          MPSGraphTensor *gradOutputTensor =  mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
+          MPSGraphTensor *weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_reshaped);
+          MPSGraphTensor *gradOutputTensor =  mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
 
           MPSGraphTensor *outputTensor =
             [mpsGraph matrixMultiplicationWithPrimaryTensor: gradOutputTensor
@@ -228,9 +203,9 @@ Tensor _mps_linear_backward_input(
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    mps::Placeholder weightPlaceholder = mps::Placeholder(cachedGraph->weightTensor_, weight_reshaped);
-    mps::Placeholder gradOutputPlaceholder = mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output);
-    mps::Placeholder outputPlaceholder = mps::Placeholder(cachedGraph->outputTensor_, output);
+    Placeholder weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_reshaped);
+    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData(),
@@ -241,7 +216,7 @@ Tensor _mps_linear_backward_input(
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
 
-    mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
 
     return output;
   }
@@ -256,7 +231,7 @@ Tensor _mps_linear_backward_input(
   TORCH_CHECK(grad_output.scalar_type() == ScalarType::Float ||
               grad_output.scalar_type() == ScalarType::Half, "MPS device does not support linear backward for non-float inputs");
 
-   struct CachedGraph : public mps::MPSCachedGraph
+   struct CachedGraph : public MPSCachedGraph
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     MPSGraphTensor *inputTensor_ = nil;
@@ -288,26 +263,26 @@ Tensor _mps_linear_backward_input(
   TORCH_CHECK(output.is_mps());
   TORCH_CHECK(bias.is_mps());
 
-  mps::MPSGraphCache *cache_ = mps::MPSGraphCache::getInstance();
+  MPSGraphCache *cache_ = MPSGraphCache::getInstance();
 
   MPSStream *stream= getCurrentMPSStream();
 
   @autoreleasepool {
 
    string key = "mps_linear_backward_weights:" + to_string(bias_defined) + ":" +
-                                                 mps::getTensorsStringKey({input_reshaped, weight, grad_output_reshaped});
+                                                 getTensorsStringKey({input_reshaped, weight, grad_output_reshaped});
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
-      mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ mps::MPSCachedGraph * () {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
         CachedGraph *newCachedGraph = nil;
 
         @autoreleasepool {
-          MPSGraph *mpsGraph = mps::make_mps_graph();
+          MPSGraph *mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor *inputTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, input_reshaped);
-          MPSGraphTensor *weightTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, weight);
-          MPSGraphTensor *gradOutputTensor =  mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output_reshaped);
+          MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_reshaped);
+          MPSGraphTensor *weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight);
+          MPSGraphTensor *gradOutputTensor =  mpsGraphRankedPlaceHolder(mpsGraph, grad_output_reshaped);
 
           MPSGraphTensor *gradOutputTransposeTensor =
             [mpsGraph transposeTensor: gradOutputTensor
@@ -341,11 +316,11 @@ Tensor _mps_linear_backward_input(
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    mps::Placeholder inputPlaceholder = mps::Placeholder(cachedGraph->inputTensor_, input_reshaped);
-    mps::Placeholder weightPlaceholder = mps::Placeholder(cachedGraph->weightTensor_, weight);
-    mps::Placeholder gradOutputPlaceholder = mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output_reshaped);
-    mps::Placeholder outputPlaceholder = mps::Placeholder(cachedGraph->outputTensor_, output);
-    mps::Placeholder biasPlaceholder = mps::Placeholder(cachedGraph->biasTensor_, bias);
+    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_reshaped);
+    Placeholder weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight);
+    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_reshaped);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+    Placeholder biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
@@ -358,7 +333,7 @@ Tensor _mps_linear_backward_input(
     if (bias_defined)
       results[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData();
 
-    mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
 
     return std::tuple<Tensor, Tensor>{ output, bias };
   }

From 6e9bb6db168010d97df1848bf6bd30d44d715f21 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 26 Oct 2022 17:59:44 -0400
Subject: [PATCH 1797/1922] Fix boolean casting in Unary ops (#150)

---
 .../ATen/native/mps/operations/UnaryOps.mm    |  4 +-
 test/test_mps.py                              | 72 +++++++------------
 2 files changed, 27 insertions(+), 49 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 3f075ed87bd0b..070770d41d700 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -43,7 +43,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
           newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, self);
           MPSGraphTensor* castTensor = newCachedGraph->inputTensor_;
           // Integer input must be cast to float if output is float
-          if (isIntegralType(self.scalar_type()) && isFloatingType(output.scalar_type())) {
+          if (isIntegralType(self.scalar_type(), true) && isFloatingType(output.scalar_type())) {
             castTensor = castMPSTensor(mpsGraph, newCachedGraph->inputTensor_, output.scalar_type());
           }
           newCachedGraph->outputTensor_ = unaryBlock(mpsGraph, castTensor);
@@ -131,7 +131,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
                 ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor)   \
                   { return [mpsGraph func_stub##WithTensor:inputTensor name:nil]; },  \
                   [](const Tensor& t) -> bool {                                       \
-                  return t.numel() == 0 || isIntegralType(t.scalar_type());           \
+                  return t.numel() == 0 || isIntegralType(t.scalar_type(), true);     \
                 });                                                                   \
 }
 CREATE_MPS_STRUCTURED_UNARY_ROUNDING_TORCH_IMPL_FUNC(ceil_out_mps, ceil)
diff --git a/test/test_mps.py b/test/test_mps.py
index c702f93007db6..fd9ed94cdfa31 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7510,9 +7510,9 @@ class TestConsistency(TestCase):
         'masked.softmin': ['f32'],
         'masked.std': ['f32'],
         'masked.var': ['f32'],
-        'abs': ['f16', 'f32', 'i16', 'i32', 'u8'],
-        'acos': ['f32', 'i16', 'i32', 'u8'],
-        'acosh': ['f32', 'i16', 'i32', 'u8'],
+        'abs': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'acos': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'acosh': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'addbmm': ['f32'],
         'addcdiv': ['f32'],
@@ -7531,11 +7531,11 @@ class TestConsistency(TestCase):
         'logsumexp': ['f32'],
         'mean': ['f32'],
         'sum': ['f32'],
-        'asin': ['f32', 'i16', 'i32', 'u8'],
-        'asinh': ['f32', 'i16', 'i32', 'u8'],
-        'atan': ['f32', 'i16', 'i32', 'u8'],
+        'asin': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'asinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'atan': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'atan2': ['f32'],
-        'atanh': ['f32', 'i16', 'i32', 'u8'],
+        'atanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'atleast_1d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_2d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_3d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7559,8 +7559,8 @@ class TestConsistency(TestCase):
         'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'corrcoef': ['f32'],
-        'cos': ['f32', 'i16', 'i32', 'u8', 'i64'],
-        'cosh': ['f32', 'i16', 'i32', 'u8', 'i64'],
+        'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
+        'cosh': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
         'cov': ['f32'],
         'cumsum': ['f16', 'f32', 'int16', 'int32'],
         'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7572,9 +7572,9 @@ class TestConsistency(TestCase):
         'dist': ['f32'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'erf': ['f32', 'i16', 'i32', 'u8'],
-        'exp': ['f32', 'i16', 'i32', 'u8'],
-        'exp2': ['f16', 'f32', 'i16', 'i32', 'u8'],
+        'erf': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'exp': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'eye': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'flatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7599,9 +7599,9 @@ class TestConsistency(TestCase):
         'linalg.svd': ['f32'],
         'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'log': ['f32', 'i16', 'i32', 'u8'],
-        'log10': ['f32', 'i16', 'i32', 'u8'],
-        'log2': ['f32', 'i16', 'i32', 'u8'],
+        'log': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'log10': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'log2': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'log_softmax': ['f32'],
         'logaddexp': ['f32'],
         'logaddexp2': ['f32'],
@@ -7612,7 +7612,7 @@ class TestConsistency(TestCase):
         'matmul': ['f32'],
         'mm': ['f32'],
         'mv': ['f32'],
-        'neg': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.adaptive_max_pool1d': ['f32'],
         'nn.functional.adaptive_max_pool2d': ['f32'],
         'nn.functional.binary_cross_entropy': ['f32'],
@@ -7678,7 +7678,7 @@ class TestConsistency(TestCase):
         'pow': ['f16'],
         'rad2deg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'real': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'reciprocal': ['f16', 'f32', 'i16', 'i32', 'u8'],
+        'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'repeat': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'repeat_interleave': ['b8',
                               'f16',
@@ -7693,19 +7693,19 @@ class TestConsistency(TestCase):
         'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'rot90': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
-        'rsqrt': ['f32', 'i16', 'i32', 'u8'],
+        'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'short': ['i16'],
         'sigmoid': ['f32'],
         'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'sin': ['f32', 'i16', 'i32', 'u8'],
-        'sinh': ['f32', 'i16', 'i32', 'u8'],
+        'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'softmax': ['f32'],
         'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sqrt': ['f32', 'i16', 'i32', 'u8'],
+        'sqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'square': ['f16', 'f32'],
         'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7713,8 +7713,8 @@ class TestConsistency(TestCase):
         'sum_to_size': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'svd': ['f32'],
         't': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'tan': ['i16', 'i32', 'u8'],
-        'tanh': ['f32', 'i16', 'i32', 'u8'],
+        'tan': ['b8', 'i16', 'i32', 'u8'],
+        'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'tensordot': ['f32'],
         'tile': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'topk': ['f32'],
@@ -7853,7 +7853,7 @@ class TestConsistency(TestCase):
         'nn.functional.local_response_norm': ['f32'],
         'nn.functional.margin_ranking_loss': ['f32'],
         'nn.functional.mse_loss': ['f32'],
-        'nn.functional.pad': ['f16', 'f32'],
+        'nn.functional.pad': ['f16', 'f32', 'i16', 'i32', 'i64'],
         'nn.functional.pairwise_distance': ['f16', 'f32'],
         'nn.functional.poisson_nll_loss': ['f32'],
         'nn.functional.relu': ['f32'],
@@ -8034,16 +8034,10 @@ class TestConsistency(TestCase):
         '_masked.std': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         '_masked.sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         '_masked.var': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'acos': ['torch.bool'],
-        'acosh': ['torch.bool'],
         'amax': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'amin': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'asin': ['torch.bool'],
-        'asinh': ['torch.bool'],
         'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
-        'atan': ['torch.bool'],
-        'atanh': ['torch.bool'],
         'bernoulli': ['torch.float32'],
         'byte': ['torch.float16', 'torch.float32'],
         'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
@@ -8051,15 +8045,10 @@ class TestConsistency(TestCase):
         'clamp_max': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'clamp_min': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'constant_pad_nd': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'cos': ['torch.bool'],
-        'cosh': ['torch.bool'],
         'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'diff': ['torch.bool', 'torch.uint8'],
         'eig': ['torch.float32'],
         'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'erf': ['torch.bool'],
-        'exp2': ['torch.bool'],
-        'exp': ['torch.bool'],
         'fft.fft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'fft.ifft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'fft.ihfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
@@ -8076,10 +8065,7 @@ class TestConsistency(TestCase):
         'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'],
         'linalg.eigvals': ['torch.float32'],
         'linalg.multi_dot': ['torch.float32'],
-        'log10': ['torch.bool'],
         'log1p': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
-        'log2': ['torch.bool'],
-        'log': ['torch.bool'],
         'logical_and': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'logical_or': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'logical_xor': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
@@ -8087,7 +8073,6 @@ class TestConsistency(TestCase):
         'matmul': ['torch.uint8'],
         'mean': ['torch.float16', 'torch.float32'],
         'native_layer_norm': ['torch.float32'],
-        'neg': ['torch.uint8'],
         'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.adaptive_avg_pool1d': ['torch.float32'],
         'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
@@ -8107,7 +8092,6 @@ class TestConsistency(TestCase):
         'nn.functional.max_pool1d': ['torch.float32'],
         'nn.functional.max_pool2d': ['torch.float32'],
         'nn.functional.normalize': ['torch.float32'],
-        'nn.functional.pad': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
         'nn.functional.pairwise_distance': ['torch.uint8'],
         'nn.functional.triplet_margin_loss': ['torch.uint8'],
         'nn.functional.triplet_margin_with_distance_loss': ['torch.uint8'],
@@ -8119,19 +8103,13 @@ class TestConsistency(TestCase):
         'rand_like': ['torch.float16', 'torch.float32'],
         'randint_like': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'randn_like': ['torch.float16', 'torch.float32'],
-        'reciprocal': ['torch.bool'],
-        'rsqrt': ['torch.bool'],
         'scatter_add': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'sigmoid': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
-        'sin': ['torch.bool'],
-        'sinh': ['torch.bool'],
-        'sqrt': ['torch.bool'],
         'sub': ['torch.float16', 'torch.uint8'],
         'sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'tan': ['torch.bool', 'torch.float32'],
-        'tanh': ['torch.bool'],
+        'tan': ['torch.float32'],
         'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],

From ab4394667964fca50fa478ccdb8411cb77efed34 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 26 Oct 2022 18:59:03 -0400
Subject: [PATCH 1798/1922] Fix data type issues with log1p() op (#151)

---
 .../ATen/native/mps/operations/UnaryOps.mm    | 54 +++++--------------
 test/test_mps.py                              |  4 +-
 2 files changed, 15 insertions(+), 43 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 070770d41d700..e1f8b22de46dd 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -188,49 +188,21 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
 
 TORCH_IMPL_FUNC(log1p_out_mps) (const Tensor& self, const Tensor& output)
 {
-    TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support log1p op with int64 input")
-    using namespace mps;
-    if (!output.is_same_size(self)) {
-      output.resize_(self.sizes());
-    }
-    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-    @autoreleasepool {
-      string key = string("log1p_out_mps") + getTensorsStringKey({self});
-      auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
-
-      if(!cachedGraph) {
-        cachedGraph = cache_->CreateCachedGraphAs<MPSUnaryCachedGraph>(key, ^ MPSCachedGraph* () {
-          MPSUnaryCachedGraph *newCachedGraph = nil;
-          @autoreleasepool {
-            MPSGraph* mpsGraph = make_mps_graph();
-            newCachedGraph = new MPSUnaryCachedGraph(mpsGraph);
-            newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, self);
-              MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0
-                                                          shape:getMPSShape(self)
-                                                       dataType:mps::getMPSDataType(self.scalar_type())];
-              MPSGraphTensor* addedTensor = [mpsGraph additionWithPrimaryTensor:newCachedGraph->inputTensor_
-                                                         secondaryTensor:oneTensor
-                                                                    name:nil];
-            newCachedGraph->outputTensor_ = [mpsGraph logarithmWithTensor:addedTensor
-                                                                    name:nil];
-          }
-          return newCachedGraph;
-        });
-      }
-
-      Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-      Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-        selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
-      };
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-        outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-      };
-      runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
-    }
+  TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support log1p op with int64 input");
+  mps::unary_op(self, output, "log1p_out_mps",
+                ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
+                  MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0
+                                                                  dataType:inputTensor.dataType];
+                  MPSGraphTensor* addedTensor = [mpsGraph additionWithPrimaryTensor:inputTensor
+                                                                    secondaryTensor:oneTensor
+                                                                               name:nil];
+                  return [mpsGraph logarithmWithTensor:addedTensor
+                                                  name:nil];
+                });
 }
 
-TORCH_IMPL_FUNC(frac_out_mps) (const Tensor& self, const Tensor& output) {
+TORCH_IMPL_FUNC(frac_out_mps) (const Tensor& self, const Tensor& output)
+{
   TORCH_CHECK(isFloatingType(self.scalar_type()), "frac_out_mps is only implemented for floating types");
   mps::unary_op(self, output, "frac_out_mps",
                 ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
diff --git a/test/test_mps.py b/test/test_mps.py
index fd9ed94cdfa31..f59ecacc9dc7a 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7601,6 +7601,7 @@ class TestConsistency(TestCase):
         'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'log': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'log10': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'log1p': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'log2': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'log_softmax': ['f32'],
         'logaddexp': ['f32'],
@@ -7823,6 +7824,7 @@ class TestConsistency(TestCase):
         'linspace': ['f16', 'f32'],
         'log': ['f32'],
         'log10': ['f32'],
+        'log1p': ['f32'],
         'log2': ['f32'],
         'log_softmax': ['f32'],
         'logaddexp': ['f32'],
@@ -7944,7 +7946,6 @@ class TestConsistency(TestCase):
         'diag_embed': [torch.uint8],
         'diagonal_scatter': [torch.uint8],
         'index_add': None,
-        'log1p': None,
         'long': None,
         'nn.functional.avg_pool1d': [torch.int64],
         'nn.functional.avg_pool2d': [torch.int64],
@@ -8065,7 +8066,6 @@ class TestConsistency(TestCase):
         'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'],
         'linalg.eigvals': ['torch.float32'],
         'linalg.multi_dot': ['torch.float32'],
-        'log1p': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
         'logical_and': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'logical_or': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'logical_xor': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],

From cd8f3179fc6ee4d473a865f580018a5fccab736c Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 27 Oct 2022 13:31:42 -0400
Subject: [PATCH 1799/1922] Fix data type issues with sigmoid() op (#152)

* Fix data type issues with sigmoid() op

* Clean up redundant headers from Unary Ops
---
 .../ATen/native/mps/operations/Activation.mm  | 60 -------------------
 .../ATen/native/mps/operations/UnaryOps.mm    | 13 ++--
 test/test_mps.py                              |  3 +-
 3 files changed, 10 insertions(+), 66 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index 386b0255bdc0b..dbb591246ca40 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -410,66 +410,6 @@ Tensor relu_mps(const Tensor& self) {
 
 }
 
-TORCH_IMPL_FUNC(sigmoid_out_mps)(
-  const Tensor& self,
-  const Tensor& output) {
-  using namespace mps;
-  using CachedGraph = MPSUnaryCachedGraph;
-  TORCH_CHECK(output.is_mps());
-  TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support sigmoid op with int64 input")
-
-  if(output.numel() == 0) {
-    return;
-  }
-
-  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
-  MPSStream* stream = getCurrentMPSStream();
-
-  @autoreleasepool {
-    string key = "sigmoid_out_mps" + getTensorsStringKey({self});
-    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
-    if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
-
-        CachedGraph *newCachedGraph = nil;
-
-        @autoreleasepool {
-          // Initialize graph
-          MPSGraph* mpsGraph = make_mps_graph();
-          newCachedGraph = new CachedGraph(mpsGraph);
-
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-
-          MPSGraphTensor* outputTensor = [mpsGraph sigmoidWithTensor:inputTensor
-                                                                name:nil];
-
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->outputTensor_ = outputTensor;
-        }
-        return newCachedGraph;
-      });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
-    }
-
-    Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
-
-    // Create dictionary of inputs and outputs
-    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
-    };
-
-    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-    };
-
-    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-
-  }
-
-}
-
 TORCH_IMPL_FUNC(sigmoid_backward_out_mps)(
   const Tensor& grad_output,
   const Tensor& output,
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index e1f8b22de46dd..64ffbc0d5d574 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -1,9 +1,5 @@
 //  Copyright © 2022 Apple Inc.
 
-#include <ATen/ATen.h>
-#include <ATen/Tensor.h>
-#include <ATen/Utils.h>
-#include <ATen/mps/MPSStream.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/mps/MPSGraphVenturaOps.h>
 #include <torch/library.h>
@@ -186,6 +182,15 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
   return output;
 }
 
+TORCH_IMPL_FUNC(sigmoid_out_mps) (const Tensor& self, const Tensor& output)
+{
+  TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support sigmoid op with int64 input");
+  mps::unary_op(self, output, "sigmoid_out_mps",
+                ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
+                  return [mpsGraph sigmoidWithTensor:inputTensor name:nil];
+                });
+}
+
 TORCH_IMPL_FUNC(log1p_out_mps) (const Tensor& self, const Tensor& output)
 {
   TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support log1p op with int64 input");
diff --git a/test/test_mps.py b/test/test_mps.py
index f59ecacc9dc7a..188fe9063a1de 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7698,7 +7698,7 @@ class TestConsistency(TestCase):
         'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'short': ['i16'],
-        'sigmoid': ['f32'],
+        'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'],
         'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
@@ -8106,7 +8106,6 @@ class TestConsistency(TestCase):
         'scatter_add': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'sigmoid': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
         'sub': ['torch.float16', 'torch.uint8'],
         'sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'tan': ['torch.float32'],

From 0ccd72e08457b558b7653d399f479a061ff68bed Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Thu, 27 Oct 2022 15:07:44 -0700
Subject: [PATCH 1800/1922] Add check for macos 13.0 (#153)

---
 aten/src/ATen/mps/MPSDevice.h  | 3 +++
 aten/src/ATen/mps/MPSDevice.mm | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h
index 48e1904346c10..785369415c4d1 100644
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@@ -60,6 +60,8 @@ class TORCH_API MPSDevice {
 
   MTLFunction_t metalIndexingFunction(const std::string &kernel, MTLFunctionConstantValues_t constantValues);
 
+  bool macOS_13_0();
+
   ~MPSDevice();
 
  private:
@@ -67,6 +69,7 @@ class TORCH_API MPSDevice {
   MTLDevice_t _mtl_device;
   bool _macos13plus;
   MTLLibrary_t _mtl_indexing_library;
+  bool _macos_13_0_or_newer;
   MPSDevice();
 };
 
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index c11621b3f3540..aa6b57a80746e 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -27,6 +27,10 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   return mps_device.get();
 }
 
+bool MPSDevice::macOS_13_0() {
+  return _macos_13_0_or_newer;
+}
+
 id<MTLFunction> MPSDevice::metalIndexingFunction(const std::string& kernel, MTLFunctionConstantValues* constantValues) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_mtl_device);
   NSError* error = nil;

From 176051f889f457edcafa0b562097580677c3dcc8 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 28 Oct 2022 09:05:47 -0700
Subject: [PATCH 1801/1922] Fix gpu timeouts caused by bincount reduction large
 workload (#155)

---
 test/test_mps.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 188fe9063a1de..a5074dad516b0 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2153,15 +2153,15 @@ def test_bincount_reduction(self):
         self.assertEqual(all1s[:, 0].bincount(), torch.tensor([0, 32]))
 
         # test large number of bins - global memory use
-        big_exp = torch.zeros(10000000, device=device)
+        big_exp = torch.zeros(100, device=device)
         big_exp[-1] = 50.0
         big_w = torch.tensor([.5] * 100, device=device)
-        big_out = torch.tensor([9999999] * 100, device=device, dtype=torch.int32).bincount(big_w)
+        big_out = torch.tensor([99] * 100, device=device, dtype=torch.int32).bincount(big_w)
         self.assertEqual(big_exp, big_out)
         # test large input size
         big_exp = torch.zeros(2, device=device, dtype=torch.int64)
-        big_exp[1] = 1000000
-        big_out = torch.ones(1000000, dtype=torch.int8, device=device).bincount()
+        big_exp[1] = 10
+        big_out = torch.ones(10, dtype=torch.int8, device=device).bincount()
         self.assertEqual(big_exp, big_out)
 
     def test_bincount(self):

From edb087026b7609fad80c9fa98422452eaebe94ca Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 28 Oct 2022 10:58:36 -0700
Subject: [PATCH 1802/1922] Add nonzero support for mps (#133)

* Add nonzero op support for mps

* Fix graph caching for nonzero op (use unranked placeholder for output)

* Add support for nonzero op starting from macOS Ventura. Fallback to CPU for older OS versions
---
 aten/src/ATen/mps/MPSFallback.mm              |   1 -
 .../ATen/native/mps/operations/Indexing.mm    | 179 ++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |   2 +
 test/test_mps.py                              | 116 +++++++++++-
 4 files changed, 295 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/mps/MPSFallback.mm b/aten/src/ATen/mps/MPSFallback.mm
index f1c0dbbacdca3..e5dfde1d274cc 100644
--- a/aten/src/ATen/mps/MPSFallback.mm
+++ b/aten/src/ATen/mps/MPSFallback.mm
@@ -61,7 +61,6 @@ Tensor slow_conv2d_forward_mps(
   m.impl("_fft_r2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("sgn.out", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
-  m.impl("nonzero", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps);
 }
 
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 78e93fc991756..ae7673905cc21 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -211,6 +211,185 @@ void index_put_kernel_mps(TensorIterator& iter, IntArrayRef index_size, IntArray
   return result;
 }
 
+static
+Tensor nonzero_fallback(const Tensor& self) {
+  TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 13.0. ",
+                  "Falling back on CPU. This may have performace implications.");
+
+  return at::nonzero(self.to("cpu")).clone().to("mps");
+}
+
+Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_){
+  if (!MPSDevice::getInstance()->macOS_13_0()) {
+      Tensor out_fallback = nonzero_fallback(self);
+      at::native::resize_output(out_, out_fallback.sizes());
+      out_.copy_(out_fallback.to("mps"));
+      return out_;
+  }
+
+  using namespace mps;
+  const uint32_t maxDimensions = 16;
+
+  TORCH_CHECK(self.numel() < std::numeric_limits<int>::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \
+  file a support request");
+  TORCH_CHECK(out_.dtype() == at::kLong, "Expected object of scalar type ", at::kLong, " as out, but got ", out_.dtype());
+  TORCH_CHECK(self.device() == out_.device(), "expected self and out to be on the same device, but got out on ",
+  out_.device(), " and self on ", self.device());
+  TORCH_CHECK(self.dim() <= maxDimensions, "nonzero is not supported for tensor with more than ", 16, " dimensions");
+  TORCH_CHECK(out_.is_mps());
+
+  MPSStream *stream = getCurrentMPSStream();
+  struct CachedGraph : public MPSCachedGraph
+  {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor* inputTensor_ = nil;
+    MPSGraphTensor* outputTensor_ = nil;
+    MPSGraphTensor* scatterDataTensor_ = nil;
+  };
+
+  int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
+  int64_t nDim = self.dim();
+  at::native::resize_output(out_, {total_nonzero, nDim});
+  if (out_.numel() ==  0) {
+    return out_;
+  }
+
+  bool contiguous_output = (out_.is_contiguous() && !out_.is_view());
+  Tensor out = out_;
+  if (!contiguous_output) {
+    out = at::native::empty_mps(
+           out_.sizes(),
+           out_.scalar_type(),
+           c10::nullopt,
+           kMPS,
+           c10::nullopt,
+           c10::nullopt);
+  }
+
+  int64_t _apparentInputShape = 1;
+  for (auto dim : self.sizes()) {
+    _apparentInputShape *= dim;
+  }
+  MPSShape *apparentOutputShape = @[@(total_nonzero * nDim)];
+  MPSShape *apparentInputShape = @[@(_apparentInputShape)];
+
+  // Pseudocode:
+  //
+  // inputTensor     = [1,  0,  0,  3]
+  // inputNonZero    = [1,  0,  0,  1]
+  // indices         = [1,  1,  1,  2]
+  // maskedIndices   = [0, -1, -1,  1]
+  // coordinates     = [0,  1,  2,  3]
+  // scatterResult   = [0,  3]
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+  @autoreleasepool {
+    string key = "nonzero_out_mps" + getTensorsStringKey(self);
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+        CachedGraph *newCachedGraph = nil;
+        @autoreleasepool {
+          MPSDataType inputDataType = getMPSDataType(self.scalar_type());
+          MPSShape* inputShape = getMPSShape(self);
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          MPSGraphTensor *inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(self.scalar_type()), apparentInputShape);
+          MPSGraphTensor *scatterDataTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSScalarType(out.scalar_type()));
+          MPSGraphTensor *zeroTensor = [mpsGraph constantWithScalar:0.0 dataType:inputDataType];
+          MPSGraphTensor *oneTensor = [mpsGraph constantWithScalar:1.0 dataType:MPSDataTypeInt32];
+          MPSGraphTensor *minusMaxDimTensor = [mpsGraph constantWithScalar:-maxDimensions dataType:MPSDataTypeInt32];
+          MPSGraphTensor *inputNotEqualToZeroTensor = [mpsGraph notEqualWithPrimaryTensor:inputTensor
+                                                                          secondaryTensor:zeroTensor
+                                                                                     name:nil];
+          MPSGraphTensor *maskTensor = [mpsGraph castTensor:inputNotEqualToZeroTensor
+                                                     toType:MPSDataTypeInt32
+                                                       name:nil];
+          MPSGraphTensor *indicesTensor = [mpsGraph cumulativeSumWithTensor:maskTensor
+                                                                       axis:0
+                                                                       name:nil];
+          MPSGraphTensor *indicesMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:indicesTensor
+                                                                        secondaryTensor:oneTensor
+                                                                                   name:nil];
+          MPSGraphTensor *maskedIndicesTensor = [mpsGraph selectWithPredicateTensor:inputNotEqualToZeroTensor
+                                                                truePredicateTensor:indicesMinusOneTensor
+                                                               falsePredicateTensor:minusMaxDimTensor
+                                                                               name:nil];
+          MPSGraphTensor *coordinatesTensor = [mpsGraph reshapeTensor:[mpsGraph coordinateAlongAxis:0 withShape:inputShape name:nil]
+                                                            withShape:@[@-1]
+                                                                name:nil];
+          if (nDim > 1) {
+            NSMutableArray<MPSGraphTensor*> *maskedIndicesTensorArray = [NSMutableArray arrayWithCapacity:nDim];
+            NSMutableArray<MPSGraphTensor*> *coordinatesTensorArray = [NSMutableArray arrayWithCapacity:nDim];
+
+            MPSGraphTensor *constantRankTensor = [mpsGraph constantWithScalar:nDim
+                                                                     dataType:MPSDataTypeInt32];
+            maskedIndicesTensorArray[0] = [mpsGraph multiplicationWithPrimaryTensor:maskedIndicesTensor
+                                                                    secondaryTensor:constantRankTensor
+                                                                               name:nil];
+            coordinatesTensorArray[0] = coordinatesTensor;
+            for (int i = 1; i < nDim; i++){
+              maskedIndicesTensorArray[i] = [mpsGraph additionWithPrimaryTensor:maskedIndicesTensorArray[i - 1]
+                                                                secondaryTensor:oneTensor
+                                                                           name:nil];
+              coordinatesTensorArray[i] = [mpsGraph reshapeTensor:[mpsGraph coordinateAlongAxis:i withShape:inputShape name:nil]
+                                                        withShape:@[@-1]
+                                                             name:nil];
+            }
+            maskedIndicesTensor = [mpsGraph concatTensors:maskedIndicesTensorArray dimension:0 interleave:YES name:nil];
+            coordinatesTensor = [mpsGraph concatTensors:coordinatesTensorArray dimension:0 interleave:YES name:nil];
+          }
+
+          MPSGraphTensor *outputTensor = [mpsGraph scatterWithDataTensor:scatterDataTensor
+                                                           updatesTensor:coordinatesTensor
+                                                           indicesTensor:maskedIndicesTensor
+                                                                    axis:0
+                                                                    mode:MPSGraphScatterModeSet
+                                                                    name:nil];
+
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->scatterDataTensor_ = scatterDataTensor;
+          newCachedGraph->outputTensor_ = outputTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, apparentInputShape);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, contiguous_output ? out_ : out, apparentOutputShape);
+    Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, contiguous_output ? out_ : out, apparentOutputShape);
+
+    // Create dictionary of inputs and outputs
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
+      scatterPlaceholder.getMPSGraphTensor() : scatterPlaceholder.getMPSGraphTensorData()
+    };
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    if (!contiguous_output) {
+      out_.copy_(out);
+    }
+  }
+
+  return out_;
+}
+
+Tensor nonzero_mps(const Tensor& self){
+  if (!MPSDevice::getInstance()->macOS_13_0()) {
+    return nonzero_fallback(self);
+  }
+
+  Tensor out = at::empty({0}, self.options().dtype(kLong));
+  return nonzero_out_mps(self, out);
+}
+
 Tensor masked_select_mps(const Tensor & self, const Tensor & mask) {
   namedinference::compute_broadcast_outnames(self, mask);
   Tensor result = at::empty({0}, self.options());
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index ea3bc99a28ad0..d16c0f4a7381e 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -8519,6 +8519,7 @@
   dispatch:
     CPU: nonzero_out_cpu
     CUDA: nonzero_out_cuda
+    MPS: nonzero_out_mps
   tags: dynamic_output_shape
 
 - func: nonzero(Tensor self) -> Tensor
@@ -8526,6 +8527,7 @@
   dispatch:
     CPU: nonzero_cpu
     CUDA: nonzero_cuda
+    MPS: nonzero_mps
   tags: [dynamic_output_shape, canonical]
 
 - func: nonzero_numpy(Tensor self) -> Tensor[]
diff --git a/test/test_mps.py b/test/test_mps.py
index a5074dad516b0..d996ecf8841b8 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -6584,6 +6584,116 @@ class TestAdvancedIndexing(TestCase):
     supported_dtypes = [torch.float32, torch.float16, torch.int64, torch.int32, torch.int16, torch.uint8]
     supported_np_dtypes = [np.float32, np.float16, np.int64, np.int32, np.int16, np.uint8]
 
+    def test_nonzero_no_warning(self):
+        device = "mps"
+        t = torch.randn((2, 2), device=device)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            torch.nonzero(t)
+            t.nonzero()
+            self.assertEqual(len(w), 0)
+
+    def test_nonzero(self):
+        def helper(dtype):
+            device = "mps"
+            shapes = [
+                torch.Size((12,)),
+                torch.Size((12, 1)),
+                torch.Size((1, 12)),
+                torch.Size((6, 2)),
+                torch.Size((3, 2, 2)),
+                torch.Size((5, 5, 5)),
+            ]
+
+            def gen_nontrivial_input(shape, dtype, device):
+                if dtype != torch.bfloat16:
+                    return torch.randint(2, shape, device=device, dtype=dtype)
+                else:
+                    # windows does not work for bfloat16 randing
+                    return torch.randint(2, shape, device=device, dtype=torch.float).to(dtype)
+
+            for shape in shapes:
+                tensor = gen_nontrivial_input(shape, dtype, device)
+                dst1 = torch.nonzero(tensor, as_tuple=False)
+                dst2 = tensor.nonzero(as_tuple=False)
+                dst3 = torch.empty([], dtype=torch.long, device=device)
+                dst3 = dst3.resize_(0)
+                torch.nonzero(tensor, out=dst3)
+                np_array = tensor.cpu().numpy() if dtype != torch.bfloat16 else tensor.float().cpu().numpy()
+                np_result = torch.from_numpy(np.stack(np_array.nonzero())).t()
+                self.assertEqual(dst1.cpu(), np_result, atol=0, rtol=0)
+                self.assertEqual(dst2.cpu(), np_result, atol=0, rtol=0)
+                self.assertEqual(dst3.cpu(), np_result, atol=0, rtol=0)
+                tup1 = torch.nonzero(tensor, as_tuple=True)
+                tup2 = tensor.nonzero(as_tuple=True)
+                tup1 = torch.stack(tup1).t().cpu()
+                tup2 = torch.stack(tup2).t().cpu()
+                self.assertEqual(tup1, np_result, atol=0, rtol=0)
+                self.assertEqual(tup2, np_result, atol=0, rtol=0)
+        [helper(dtype) for dtype in self.supported_dtypes]
+
+    def test_nonzero_astuple_out(self):
+        device = "mps"
+        t = torch.randn((3, 3, 3), device=device)
+        out = torch.empty([], dtype=torch.long, device=device)
+        out = out.resize_(0)
+
+        with self.assertRaises(RuntimeError):
+            torch.nonzero(t, as_tuple=True, out=out)
+
+        self.assertEqual(torch.nonzero(t, as_tuple=False, out=out), torch.nonzero(t, out=out))
+
+        # Verifies that JIT script cannot handle the as_tuple kwarg
+        # See Issue https://github.com/pytorch/pytorch/issues/45499.
+        def _foo(t):
+            tuple_result = torch.nonzero(t, as_tuple=True)
+            nontuple_result = torch.nonzero(t, as_tuple=False)
+            out = torch.empty_like(nontuple_result)
+            torch.nonzero(t, as_tuple=False, out=out)
+            return tuple_result, nontuple_result, out
+
+        with self.assertRaises(RuntimeError):
+            scripted_foo = torch.jit.script(_foo)
+
+        # Verifies that JIT tracing works fine
+        traced_foo = torch.jit.trace(_foo, t)
+        traced_tuple, traced_nontuple, traced_out = traced_foo(t)
+        expected_tuple = torch.nonzero(t, as_tuple=True)
+        expected_nontuple = torch.nonzero(t)
+
+        self.assertEqual(traced_tuple, expected_tuple)
+        self.assertEqual(traced_nontuple, expected_nontuple)
+        self.assertEqual(traced_out, expected_nontuple)
+
+    def test_nonzero_discontiguous(self):
+        device = "mps"
+        shape = (4, 4)
+        tensor = torch.randint(2, shape, device=device)
+        tensor_nc = torch.empty(shape[0], shape[1] * 2, device=device)[:, ::2].copy_(tensor)
+        dst1 = tensor.nonzero(as_tuple=False)
+        dst2 = tensor_nc.nonzero(as_tuple=False)
+        self.assertEqual(dst1, dst2, atol=0, rtol=0)
+        dst3 = torch.empty_like(dst1)
+        data_ptr = dst3.data_ptr()
+        # expect dst3 storage to be reused
+        torch.nonzero(tensor, out=dst3)
+        self.assertEqual(data_ptr, dst3.data_ptr())
+        self.assertEqual(dst1, dst3, atol=0, rtol=0)
+        # discontiguous out
+        dst4 = torch.empty(dst1.size(0), dst1.size(1) * 2, dtype=torch.long, device=device)[:, ::2]
+        data_ptr = dst4.data_ptr()
+        strides = dst4.stride()
+        torch.nonzero(tensor, out=dst4)
+        self.assertEqual(data_ptr, dst4.data_ptr())
+        self.assertEqual(dst1, dst4, atol=0, rtol=0)
+        self.assertEqual(strides, dst4.stride())
+
+    def test_nonzero_non_diff(self):
+        device = "mps"
+        x = torch.randn(10, requires_grad=True)
+        nz = x.nonzero()
+        self.assertFalse(nz.requires_grad)
+
     def test_masked_select(self):
         x = torch.randn(3, 4)
         x_mps = x.to("mps")
@@ -7740,7 +7850,8 @@ class TestConsistency(TestCase):
         'logical_and': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'where': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8']}
+        'where': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nonzero': ['f32', 'i16', 'i32', 'i64']}
 
 
     ALLOWLIST_OP_GRAD = {
@@ -7963,6 +8074,8 @@ class TestConsistency(TestCase):
         'slice_scatter': [torch.uint8],
         'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],  # moved from section below
 
+        # count_nonzero returns wrong results for these dtypes
+        'nonzero': [torch.uint8, torch.float16],
 
         # ALLOW_LIST doesn't know about variants
         'nn.functional.padconstant': None,
@@ -8018,7 +8131,6 @@ class TestConsistency(TestCase):
         'trapezoid': None,
         'eq': None,
         'mul': None,
-        'nonzero': None,
         'inner': None,
         'take_along_dim': None,
 

From 30d6b049843d7bd067e6bf194e2bc5f46d5a6b51 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 28 Oct 2022 12:27:34 -0700
Subject: [PATCH 1803/1922] Ignore cumulativeSumWithTensor missing selector
 warning on macos Monterey (#156)

---
 aten/src/ATen/native/mps/operations/Indexing.mm | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index ae7673905cc21..0f1901a2e508f 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -306,10 +306,16 @@ Tensor nonzero_fallback(const Tensor& self) {
                                                                                      name:nil];
           MPSGraphTensor *maskTensor = [mpsGraph castTensor:inputNotEqualToZeroTensor
                                                      toType:MPSDataTypeInt32
-                                                       name:nil];
+                                                       name:@"castToInt32"];
+
+          C10_CLANG_DIAGNOSTIC_PUSH()
+          #if C10_CLANG_HAS_WARNING("-Wobjc-method-access")
+          C10_CLANG_DIAGNOSTIC_IGNORE("-Wobjc-method-access")
+          #endif
           MPSGraphTensor *indicesTensor = [mpsGraph cumulativeSumWithTensor:maskTensor
                                                                        axis:0
                                                                        name:nil];
+          C10_CLANG_DIAGNOSTIC_POP()
           MPSGraphTensor *indicesMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:indicesTensor
                                                                         secondaryTensor:oneTensor
                                                                                    name:nil];

From 7deee6d41f9bbdb655ed532b129b56d9de7c4eb3 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 28 Oct 2022 17:56:47 -0400
Subject: [PATCH 1804/1922] Add floor_divide() op and its test case (#157)

---
 .../ATen/native/mps/operations/BinaryOps.mm   | 20 ++++++++++++++++---
 aten/src/ATen/native/native_functions.yaml    |  3 +++
 test/test_mps.py                              | 13 +++++++++---
 3 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index a246bb0c50f07..2c50a5efdc3c6 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -181,7 +181,7 @@ void div_mode_template(const Tensor& self, const Tensor& other,
     assert(0 && "Invalid rounding mode\n");
     return nullptr;
   };
-  binaryOpTensor(self, other, Scalar(1.0), output, op_name + "_out_mps:" + (rounding_mode.has_value() ? c10::str(*rounding_mode) : ""), div_mode_op_block);
+  binaryOpTensor(self, other, Scalar(1.0), output, op_name + "_mps:" + (rounding_mode.has_value() ? c10::str(*rounding_mode) : ""), div_mode_op_block);
 }
 
 void add_sub_template(const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output, std::string op_name)
@@ -287,11 +287,11 @@ void add_sub_template(const Tensor& self, const Tensor& other, const Scalar& alp
 
 
 TORCH_IMPL_FUNC(div_out_mode_mps) (const Tensor& self, const Tensor& other, c10::optional<c10::string_view> rounding_mode, const Tensor& output) {
-  mps::div_mode_template(self, other, rounding_mode, output, "div_mode");
+  mps::div_mode_template(self, other, rounding_mode, output, "div_mode_out");
 }
 
 TORCH_IMPL_FUNC(div_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) {
-  mps::div_mode_template(self, other, c10::nullopt, output, "div");
+  mps::div_mode_template(self, other, c10::nullopt, output, "div_out");
 }
 
 TORCH_IMPL_FUNC(add_out_mps) (const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output) {
@@ -302,6 +302,20 @@ void add_sub_template(const Tensor& self, const Tensor& other, const Scalar& alp
   mps::add_sub_template(self, other, alpha, output, "sub");
 }
 
+Tensor& floor_divide_out_mps(const Tensor& self, const Tensor& other, Tensor& result) {
+  mps::div_mode_template(self, other, "floor", result, "floor_divide_out");
+  return result;
+}
+
+Tensor floor_divide_mps(const Tensor& self, const Tensor& other) {
+  Tensor output = at::empty_like(self);
+  mps::div_mode_template(self, other, "floor", output, "floor_divide");
+  return output;
+}
+
+Tensor& floor_divide_mps_(Tensor& self, const Tensor& other) {
+  return floor_divide_out_mps(self, other, self);
+}
 
 TORCH_IMPL_FUNC(logaddexp_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output)
 {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index d16c0f4a7381e..fb648c6793f0b 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -2568,6 +2568,7 @@
   variants: function, method
   dispatch:
     CPU, CUDA: floor_divide
+    MPS: floor_divide_mps
     SparseCPU, SparseCUDA: floor_divide_sparse
 
 - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -2575,12 +2576,14 @@
   variants: method
   dispatch:
     CPU, CUDA: floor_divide_
+    MPS: floor_divide_mps_
     SparseCPU, SparseCUDA: floor_divide_sparse_
 
 - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   dispatch:
     CPU, CUDA: floor_divide_out
+    MPS: floor_divide_out_mps
     SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
 
 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
diff --git a/test/test_mps.py b/test/test_mps.py
index d996ecf8841b8..5be623ff25097 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -3547,13 +3547,19 @@ def helper(shape, rounding_mode):
                 # clamp to avoid division by 0
                 mps_y = cpu_y.detach().clone().to('mps')
 
-                result_div_cpu = torch.div(cpu_x, cpu_y, rounding_mode=rounding_mode)
-                result_div_mps = torch.div(mps_x, mps_y, rounding_mode=rounding_mode)
-                self.assertEqual(result_div_mps, result_div_cpu)
+                if (rounding_mode == "floor_divide"):
+                    result_div_cpu = torch.floor_divide(cpu_x, cpu_y)
+                    result_div_mps = torch.floor_divide(mps_x, mps_y)
+                    self.assertEqual(result_div_mps, result_div_cpu)
+                else:
+                    result_div_cpu = torch.div(cpu_x, cpu_y, rounding_mode=rounding_mode)
+                    result_div_mps = torch.div(mps_x, mps_y, rounding_mode=rounding_mode)
+                    self.assertEqual(result_div_mps, result_div_cpu)
 
         helper((2, 8, 4, 5), None)
         helper((2, 8, 4, 5), "floor")
         helper((2, 8, 4, 5), "trunc")
+        helper((2, 8, 4, 5), "floor_divide")
 
     def test_rounding(self):
         def helper(shape):
@@ -7693,6 +7699,7 @@ class TestConsistency(TestCase):
         'flipud': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'float': ['f32'],
         'floor': ['f32', 'f16', 'i16', 'i32', 'i64'],
+        'floor_divide': ['f32', 'f16'],
         'frac': ['f16', 'f32'],
         'gradient': ['f16', 'f32', 'i16'],
         'half': ['f16'],

From a9408fcad9a07f2762171d8ab9b4a7d6c16bc39f Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Mon, 31 Oct 2022 14:37:51 -0400
Subject: [PATCH 1805/1922] Fix the type cast issue with Binary Ops (#158)

* Fix data type issues for logaddexp and logaddexp2 ops

* Fix the type cast issue with Binary Ops

* Move several ops out of Blocklist in TestConsistency

* Move good ops from block list to allow list
---
 .../ATen/native/mps/operations/BinaryOps.mm   | 184 +++---------------
 test/test_mps.py                              |  49 ++---
 2 files changed, 45 insertions(+), 188 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index 2c50a5efdc3c6..6edb7bdb2a3de 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -72,38 +72,21 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
           MPSGraphTensor* secondaryCastTensor = newCachedGraph->secondaryTensor;
 
           // this type inference is only required at the time of graph creation
-          const ScalarType common_dtype = c10::promoteTypes(self.scalar_type(), other.scalar_type());
-
-          // Condition -
-          // 1. Division operation
-          // 2. Inputs are not float
-          bool div_condition = op_name.rfind("div", 0) == 0
-                                  && (!(common_dtype == ScalarType::Float || common_dtype == ScalarType::Half));
-
-          auto compute_type = ScalarType::Float;
-
-          if(div_condition) {
-
-            if(output_.scalar_type() == ScalarType::Float || output_.scalar_type() == ScalarType::Half)
-              compute_type = output_.scalar_type();
-
-            primaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->primaryTensor, compute_type);
-            secondaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->secondaryTensor, compute_type);
+          ScalarType common_dtype = c10::promoteTypes(self.scalar_type(), other.scalar_type());
+          // Integer input must be cast to float if output is float
+          if (isIntegralType(common_dtype, true) && isFloatingType(output.scalar_type())) {
+            common_dtype = output_.scalar_type();
+          }
+          if (self.scalar_type() != common_dtype) {
+            primaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->primaryTensor, common_dtype);
           }
-          else  {
-            if (self.scalar_type() != common_dtype) {
-              primaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->primaryTensor, common_dtype);
-            }
-            if (other.scalar_type() != common_dtype) {
-              secondaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->secondaryTensor, common_dtype);
-            }
+          if (other.scalar_type() != common_dtype) {
+            secondaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->secondaryTensor, common_dtype);
           }
           newCachedGraph->outputTensor = binaryBlock(newCachedGraph, primaryCastTensor, secondaryCastTensor);
           // Cast output tensor to an expected type if needed, which addresses discrepancy when int64 scalar is added to int32 tensor
           // Output tensor should have been promoted but it remains an int32 tensor
-
-          if ((div_condition && compute_type != output_.scalar_type()) ||
-              output_.scalar_type() != common_dtype) {
+          if (output_.scalar_type() != common_dtype) {
             newCachedGraph->outputTensor = castMPSTensor(mpsGraph, newCachedGraph->outputTensor, output_.scalar_type());
           }
         }
@@ -319,139 +302,26 @@ Tensor floor_divide_mps(const Tensor& self, const Tensor& other) {
 
 TORCH_IMPL_FUNC(logaddexp_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output)
 {
-      using namespace mps;
-      MPSStream* stream = getCurrentMPSStream();
-
-      if (&output != &self) {
-          output.resize_(self.sizes());;
-      }
-
-      // Derive from MPSCachedGraph
-      struct CachedGraph : public MPSCachedGraph
-      {
-        CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-        MPSGraphTensor *inputTensor_ = nil;
-        MPSGraphTensor *otherTensor_ = nil;
-        MPSGraphTensor *outputTensor_ = nil;
-      };
-
-      MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
-      @autoreleasepool {
-        string key = "log_base_e_out_mps:" + getTensorsStringKey({self, other});
-        CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-
-        if(!cachedGraph) {
-          MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
-            CachedGraph *newCachedGraph = nil;
-
-            @autoreleasepool {
-              MPSGraph* mpsGraph = make_mps_graph();
-              newCachedGraph = new CachedGraph(mpsGraph);
-              MPSGraphTensor* xTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-              MPSGraphTensor* yTensor = mpsGraphRankedPlaceHolder(mpsGraph, other);
-              MPSGraphTensor* ePowXTensor = [mpsGraph exponentWithTensor:xTensor
-                                                                         name:nil];
-              MPSGraphTensor* ePowYTensor = [mpsGraph exponentWithTensor:yTensor
-                                                                         name:nil];
-              MPSGraphTensor* sumTensor = [mpsGraph additionWithPrimaryTensor:ePowXTensor
-                                                                secondaryTensor:ePowYTensor
-                                                                     name:nil];
-              MPSGraphTensor* outputTensor = [mpsGraph logarithmWithTensor:sumTensor
-                                                                     name:nil];
-
-              newCachedGraph->inputTensor_ = xTensor;
-              newCachedGraph->otherTensor_ = yTensor;
-              newCachedGraph->outputTensor_ = outputTensor;
-            }
-            return newCachedGraph;
-          });
-          cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
-        }
-
-        Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-        Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other);
-        Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
-
-        NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-          selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
-          otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData()
-        };
-        NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-          outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-        };
-
-        runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-      }
-
-    }
+  mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+    MPSGraphTensor* sumTensor = [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentWithTensor:primaryCastTensor name:nil]
+                                                    secondaryTensor:[mpsGraph exponentWithTensor:secondaryCastTensor name:nil]
+                                                               name:nil];
+    return [mpsGraph logarithmWithTensor:sumTensor name:nil];
+  };
+  mps::binaryOpTensor(self, other, Scalar(1.0), output, "logaddexp_out_mps", logaddexp_op_block);
+}
 
 TORCH_IMPL_FUNC(logaddexp2_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output)
 {
-      using namespace mps;
-      MPSStream* stream = getCurrentMPSStream();
-
-      if (&output != &self) {
-          output.resize_(self.sizes());;
-      }
-
-      // Derive from MPSCachedGraph
-      struct CachedGraph : public MPSCachedGraph
-      {
-        CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-        MPSGraphTensor *inputTensor_ = nil;
-        MPSGraphTensor *otherTensor_ = nil;
-        MPSGraphTensor *outputTensor_ = nil;
-      };
-
-      MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
-      @autoreleasepool {
-        string key = "log_base_two_out_mps:" + getTensorsStringKey({self, other});
-        CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-
-        if(!cachedGraph) {
-          MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
-            CachedGraph *newCachedGraph = nil;
-
-            @autoreleasepool {
-              MPSGraph* mpsGraph = make_mps_graph();
-              newCachedGraph = new CachedGraph(mpsGraph);
-              MPSGraphTensor* xTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-              MPSGraphTensor* yTensor = mpsGraphRankedPlaceHolder(mpsGraph, other);
-              MPSGraphTensor* twoPowXTensor = [mpsGraph exponentBase2WithTensor:xTensor
-                                                                         name:nil];
-              MPSGraphTensor* twoPowYTensor = [mpsGraph exponentBase2WithTensor:yTensor
-                                                                         name:nil];
-              MPSGraphTensor* sumTensor = [mpsGraph additionWithPrimaryTensor:twoPowXTensor
-                                                                secondaryTensor:twoPowYTensor
-                                                                     name:nil];
-              MPSGraphTensor* outputTensor = [mpsGraph logarithmBase2WithTensor:sumTensor
-                                                                     name:nil];
-
-              newCachedGraph->inputTensor_ = xTensor;
-              newCachedGraph->otherTensor_ = yTensor;
-              newCachedGraph->outputTensor_ = outputTensor;
-            }
-            return newCachedGraph;
-          });
-          cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
-        }
-
-        Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-        Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other);
-        Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
-
-        NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-          selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
-          otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData()
-        };
-        NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-          outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-        };
-
-        runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-      }
+ mps::BinaryOpBlock logaddexp2_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+    MPSGraphTensor* sumTensor = [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentBase2WithTensor:primaryCastTensor name:nil]
+                                                    secondaryTensor:[mpsGraph exponentBase2WithTensor:secondaryCastTensor name:nil]
+                                                               name:nil];
+    return [mpsGraph logarithmBase2WithTensor:sumTensor name:nil];
+  };
+  mps::binaryOpTensor(self, other, Scalar(1.0), output, "logaddexp2_out_mps", logaddexp2_op_block);
 }
 
 } // namespace native
diff --git a/test/test_mps.py b/test/test_mps.py
index 5be623ff25097..dc7acec872d1f 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7610,7 +7610,7 @@ class TestConsistency(TestCase):
         '__getitem__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__radd__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__rand__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        '__rdiv__': ['f16', 'f32', 'i16', 'i32', 'u8'],
+        '__rdiv__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__rmatmul__': ['f32'],
         '__rmul__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__ror__': ['b8', 'i16', 'i32', 'i64', 'u8'],
@@ -7620,6 +7620,7 @@ class TestConsistency(TestCase):
         'masked.argmin': ['i16', 'i64', 'u8'],
         'masked.log_softmax': ['f32'],
         'masked.logaddexp': ['f32'],
+        'masked.logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.norm': ['f16', 'f32'],
         'masked.normalize': ['f16', 'f32'],
         'masked.softmax': ['f32'],
@@ -7629,7 +7630,7 @@ class TestConsistency(TestCase):
         'abs': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'acos': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'acosh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'addbmm': ['f32'],
         'addcdiv': ['f32'],
         'addcmul': ['f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7644,7 +7645,6 @@ class TestConsistency(TestCase):
         'argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'amax': ['f32'],
         'amix': ['f32'],
-        'logsumexp': ['f32'],
         'mean': ['f32'],
         'sum': ['f32'],
         'asin': ['b8', 'f32', 'i16', 'i32', 'u8'],
@@ -7668,6 +7668,9 @@ class TestConsistency(TestCase):
         'ceil': ['f32', 'int32', 'int64', 'f16'],
         'char': ['b8', 'u8'],
         'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'clamp_min': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'clone': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'column_stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'combinations': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7686,6 +7689,7 @@ class TestConsistency(TestCase):
         'diagonal_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'diff': ['f16', 'f32', 'i16', 'i32', 'i64'],
         'dist': ['f32'],
+        'div': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'erf': ['b8', 'f32', 'i16', 'i32', 'u8'],
@@ -7721,15 +7725,20 @@ class TestConsistency(TestCase):
         'log1p': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'log2': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'log_softmax': ['f32'],
-        'logaddexp': ['f32'],
-        'logaddexp2': ['f32'],
+        'logaddexp': ['f16', 'f32'],
+        'logaddexp2': ['f16', 'f32'],
+        'logical_and': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_not': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_fill': ['f16', 'i16', 'i32', 'i64'],
         'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'matmul': ['f32'],
         'mm': ['f32'],
         'mv': ['f32'],
+        'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.adaptive_max_pool1d': ['f32'],
         'nn.functional.adaptive_max_pool2d': ['f32'],
@@ -7759,7 +7768,7 @@ class TestConsistency(TestCase):
         'nn.functional.hinge_embedding_loss': ['f32'],
         'nn.functional.huber_loss': ['f32'],
         'nn.functional.instance_norm': ['f32'],
-        'nn.functional.kl_div': ['f32'],
+        'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.l1_loss': ['f16', 'f32'],
         'nn.functional.leaky_relu': ['f32'],
         'nn.functional.linear': ['f32'],
@@ -7827,7 +7836,7 @@ class TestConsistency(TestCase):
         'square': ['f16', 'f32'],
         'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sub': ['f32', 'i16', 'i32', 'i64'],
+        'sub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sum_to_size': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'svd': ['f32'],
         't': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7841,7 +7850,7 @@ class TestConsistency(TestCase):
         'tril_indices': ['i32', 'i64'],
         'triu': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'triu_indices': ['i32', 'i64'],
-        'true_divide': ['b8', 'f16', 'f32', 'i16', 'u8'],
+        'true_divide': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'trunc': ['f32'],
         'unbind': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unflatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7851,12 +7860,6 @@ class TestConsistency(TestCase):
         'vsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'clamp_min': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'logical_and': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'where': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nonzero': ['f32', 'i16', 'i32', 'i64']}
 
@@ -8044,7 +8047,6 @@ class TestConsistency(TestCase):
         'masked.sum': [torch.bool],
 
         # Functions that hard crash
-        'nn.functional.kl_div': [torch.int16, torch.int32, torch.int64],
         'nn.functional.nll_loss': [torch.float32],
         'nn.functional.padreflect': [torch.float32], 'nn.functional.padreplicate': [torch.float32],
         'std': [torch.float16],
@@ -8089,10 +8091,6 @@ class TestConsistency(TestCase):
 
         # These were moved from ALLOWLIST to BLOCK as they are not working
         # locally
-        '__radd__': ['torch.bool', 'torch.uint8'],
-        '__rmul__': ['torch.uint8'],
-        'add': ['torch.bool', 'torch.uint8'],
-        'addr': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'diag': ['torch.int64'],
         'diagflat': ['torch.int64'],
 
@@ -8137,18 +8135,15 @@ class TestConsistency(TestCase):
         'split_with_sizes': None,
         'trapezoid': None,
         'eq': None,
-        'mul': None,
         'inner': None,
         'take_along_dim': None,
 
         # New block list ops that need investigation
-        '__rdiv__': ['torch.bool', 'torch.int64'],
         '__rpow__': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'],
         '_masked.amax': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         '_masked.amin': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         '_masked.argmax': ['torch.float16', 'torch.float32', 'torch.int32'],
         '_masked.argmin': ['torch.float16', 'torch.float32', 'torch.int32'],
-        '_masked.logsumexp': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         '_masked.mean': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         '_masked.prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         '_masked.std': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
@@ -8161,9 +8156,6 @@ class TestConsistency(TestCase):
         'bernoulli': ['torch.float32'],
         'byte': ['torch.float16', 'torch.float32'],
         'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
-        'clamp': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'clamp_max': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'clamp_min': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'constant_pad_nd': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'diff': ['torch.bool', 'torch.uint8'],
@@ -8185,10 +8177,6 @@ class TestConsistency(TestCase):
         'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'],
         'linalg.eigvals': ['torch.float32'],
         'linalg.multi_dot': ['torch.float32'],
-        'logical_and': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'logical_or': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'logical_xor': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'logsumexp': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'matmul': ['torch.uint8'],
         'mean': ['torch.float16', 'torch.float32'],
         'native_layer_norm': ['torch.float32'],
@@ -8225,7 +8213,6 @@ class TestConsistency(TestCase):
         'scatter_add': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'sub': ['torch.float16', 'torch.uint8'],
         'sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'tan': ['torch.float32'],
         'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
@@ -8301,7 +8288,7 @@ def get_samples():
                 if op.name == "nn.functional.conv2d" and dtype == torch.float32:
                     atol = 1e-4
                     rtol = 3e-5
-                elif op.name == "add" and dtype == torch.float16:
+                elif (op.name == "add" or op.name == "sub") and dtype == torch.float16:
                     atol = 1e-2
                     rtol = 1e-2
                 else:

From c81a2c90af2de9c7e367d573749a7586ec2cfec3 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Mon, 31 Oct 2022 14:14:47 -0700
Subject: [PATCH 1806/1922] Skip gather/blit calls in case of strided output
 (#147)

---
 aten/src/ATen/native/mps/OperationUtils.h       |  2 +-
 aten/src/ATen/native/mps/OperationUtils.mm      |  4 ++--
 aten/src/ATen/native/mps/operations/UnaryOps.mm |  9 +++++++--
 test/test_mps.py                                | 11 +++++++++++
 4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 1bf4bc838fa81..1d1e2931fb82b 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -62,7 +62,7 @@ class Placeholder {
  public:
   Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {}
   Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {}
-  Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr);
+  Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr, bool gatherTensorData = true);
   MPSGraphTensor* getMPSGraphTensor() {
     return _placeholder;
   }
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 564c86ac118a1..45f290e4ef762 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -166,13 +166,13 @@ void printTensorNDArray(const Tensor& t) {
   C10_CLANG_DIAGNOSTIC_POP()
 }
 
-Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSShape *mpsShape) : _tensor(src)
+Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSShape *mpsShape, bool gatherTensorData) : _tensor(src)
 {
   TORCH_CHECK(src.is_mps(), "Placeholder storage has not been allocated on MPS device!");
   // extract the pointer to MTLBuffer from the Tensor's storage
   id<MTLBuffer> srcBuf = getMTLBufferStorage(src);
   // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose())
-  if (src.is_view() || !src.is_contiguous()) {
+  if ((src.is_view() || !src.is_contiguous()) && gatherTensorData) {
      Tensor emptyShell = Tensor();
     // use "_tensor" from Placeholder to retain view's output during its usage in other ops
     _tensor = gatherViewTensor(src, emptyShell);
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 64ffbc0d5d574..baa92120962b9 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -48,8 +48,13 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
       });
     }
 
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+    bool gatherTensorData = true;
+    if (!output.is_contiguous() || output.is_view()) {
+      gatherTensorData = false;
+    }
+
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, /*mpsShape=*/nullptr, gatherTensorData);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, /*mpsShape=*/nullptr, false);
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
     };
diff --git a/test/test_mps.py b/test/test_mps.py
index dc7acec872d1f..0c26a0135d2f2 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -250,6 +250,17 @@ def test_exp1(self, device="mps", dtype=torch.float):
         input = torch.tensor([-0.1, 3.0, -0.9]).to('mps')
         output = torch.exp(input).to('cpu')
 
+    def test_exp_strided_output(self):
+        x = torch.rand((256,10), device='mps')
+        x_cpu = x.to("cpu")
+
+        x = x.permute(1,0)
+        x_cpu = x_cpu.permute(1,0)
+
+        res = x.exp()
+        res_cpu = x_cpu.exp()
+        self.assertEqual(res, res_cpu)
+
     def _testLeakyRelu(self, np_features, negative_slope, device):
         cpu_x = torch.from_numpy(np_features).requires_grad_()
         mps_x = torch.from_numpy(np_features).to('mps').requires_grad_()

From ffeac72361b6d2ffb570a5d8c3fc4c40ce64020a Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 1 Nov 2022 10:45:17 -0700
Subject: [PATCH 1807/1922] Solve contiguos view tensors using arrayViews
 instead of blits (#146)

* Solve contiguos view tensors using arrayViews instead of blit

* Use c10::irange to iterate for loops

* Remove additional space

* Move the slicing of view tensors to View.mm
---
 aten/src/ATen/native/mps/OperationUtils.h   |  2 +
 aten/src/ATen/native/mps/OperationUtils.mm  | 31 +++++--
 aten/src/ATen/native/mps/operations/View.mm | 89 +++++++++++++++++++++
 3 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 1d1e2931fb82b..125410a82e915 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -49,6 +49,7 @@ std::string getArrayRefString(const IntArrayRef s);
 // use has_storage() on the returned tensor to determine if src actually is a view
 Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst);
 Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output, id<MTLBuffer> updatesBuffer = nil);
+MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType);
 
 MPSShape* getMPSShape(const Tensor& t);
 MPSShape* getMPSShape(IntArrayRef sizes);
@@ -87,6 +88,7 @@ MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar&
 
 MPSGraph* make_mps_graph();
 void printTensorNDArray(const Tensor& t);
+MPSNDArray* ndArrayFromTensor(const Tensor& tensor, MPSShape *shape, MPSDataType mpsType);
 
 MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType);
 MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape);
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 45f290e4ef762..de9d58442f6cd 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -166,13 +166,23 @@ void printTensorNDArray(const Tensor& t) {
   C10_CLANG_DIAGNOSTIC_POP()
 }
 
+MPSNDArray* ndArrayFromTensor(const Tensor& tensor, MPSShape *shape, MPSDataType mpsType)
+{
+  id<MTLBuffer> buffer = getMTLBufferStorage(tensor);
+  MPSGraphTensorData* tmpGraphTensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer:buffer
+                                                                                    shape:shape
+                                                                                 dataType:mpsType] autorelease];
+
+  return [tmpGraphTensorData mpsndarray];
+}
+
 Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSShape *mpsShape, bool gatherTensorData) : _tensor(src)
 {
   TORCH_CHECK(src.is_mps(), "Placeholder storage has not been allocated on MPS device!");
   // extract the pointer to MTLBuffer from the Tensor's storage
   id<MTLBuffer> srcBuf = getMTLBufferStorage(src);
   // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose())
-  if ((src.is_view() || !src.is_contiguous()) && gatherTensorData) {
+  if (!src.is_contiguous() && gatherTensorData) {
      Tensor emptyShell = Tensor();
     // use "_tensor" from Placeholder to retain view's output during its usage in other ops
     _tensor = gatherViewTensor(src, emptyShell);
@@ -183,18 +193,25 @@ void printTensorNDArray(const Tensor& t) {
     }
     srcBuf = getMTLBufferStorage(_tensor);
   }
+
   // tensor.numel() could be zero, but tensor is valid as long as the buffer size is non-zero.
   // if buffer size is zero in here, it's not a user error. It could be a missing check for
   // tensor.numel() == 0 in our internal implementations of ops.
   TORCH_INTERNAL_ASSERT([srcBuf length] > 0, "Placeholder tensor is empty!");
-
   const MPSDataType mpsDataType = _tensor.dim() == 0 ? getMPSScalarType(_tensor.scalar_type()) : getMPSDataType(_tensor.scalar_type());
-  if (!mpsShape)
-    mpsShape = getMPSShape(_tensor);
 
-  _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf
-                                                    shape:mpsShape
-                                                 dataType:mpsDataType] autorelease];
+  if (src.is_view() && src.is_contiguous() && src.storage_offset()) {
+    _value = getMPSGraphTensorDataForView(src, mpsShape, mpsDataType);
+  } else {
+    if (!mpsShape) {
+      mpsShape = getMPSShape(_tensor);
+    }
+
+    _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf
+                                                      shape:mpsShape
+                                                   dataType:mpsDataType] autorelease];
+  }
+
   TORCH_INTERNAL_ASSERT(_value);
   _placeholder = mpsGraphTensor;
 }
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 8a51012686609..0828a3b0216b3 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -417,6 +417,95 @@
   return outputTensor;
 }
 
+MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType) {
+  IntArrayRef src_base_shape = get_buffer_shape(src.storage().data());
+  std::vector<int64_t> src_view_shape;
+  bool hasMPSShape = (mpsShape != nil);
+  int src_ndim_base = src_base_shape.size();
+  int src_ndim_view = 0;
+  if (hasMPSShape) {
+    src_ndim_view = [mpsShape count];
+    src_view_shape.reserve(src_ndim_view);
+    for (const auto i : c10::irange(src_ndim_view)) {
+      src_view_shape[i] = [mpsShape[i] intValue];
+    }
+  } else {
+    src_ndim_view = src.dim();
+    src_view_shape = src.sizes().vec();
+  }
+
+  MPSNDArray *srcTensorNDArrayView = nil;
+  MPSNDArrayDescriptor *srcTensorNDArrayDesc = nil;
+  MPSNDArray *srcTensorNDArray = nil;
+  id<MTLCommandBuffer> commandBuffer = getCurrentMPSStream()->commandBuffer();
+
+  if (src_ndim_base == src_ndim_view) {
+    srcTensorNDArray = ndArrayFromTensor(src, getMPSShape(src_base_shape), mpsDataType);
+    srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
+
+    int firstDimToSlice = 0;
+    while (src_base_shape[firstDimToSlice] == src_view_shape[firstDimToSlice]) {
+      firstDimToSlice++;
+    }
+
+    int view_numel = 1;
+    for (const auto i : c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
+      view_numel *= src_base_shape[i];
+    }
+
+    int sliceOffset = src.storage_offset() / view_numel;
+    // There are cases where both dimensions of a view can shrink
+    // E.g: x = torch.randn((3,6))[1, 1:3]
+    int nextSliceOffset = src.storage_offset() % view_numel;
+
+    [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(sliceOffset), src.sizes()[firstDimToSlice]}];
+    if (nextSliceOffset) {
+      [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 2 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(nextSliceOffset), src.sizes()[firstDimToSlice+1]}];
+    }
+  }
+  else {
+    int src_view_numel = 1;
+    for (const auto i : c10::irange(src_ndim_view)) {
+      src_view_numel *= src_view_shape[i];
+    }
+
+    int idx = 0;
+    int finalShapeSize = (src_ndim_view == 0) ? 1 : src_ndim_view;
+    std::vector<NSNumber*> mpsFinalShape(finalShapeSize);
+
+    // When the shapes are different, we need to flatten the first slice in order to alias the memory without any copies
+    // E.g: base tensor [5, 7, 3], view tensor [7, 3] (storage_offset=21). We need to flatten [5, 7, 3] to [35, 3], then
+    // we can slice directly into the first dimension based on the storage_offset
+    uint32_t flattenedSlice = 1;
+    for (const auto i : c10::irange(src_ndim_base - finalShapeSize + 1)) {
+      flattenedSlice *= src_base_shape[i];
+    }
+    mpsFinalShape[idx++] = [NSNumber numberWithInteger:flattenedSlice];
+
+    for (const auto i : c10::irange(src_ndim_base - finalShapeSize + 1, src_ndim_base)) {
+      mpsFinalShape[idx++] = [NSNumber numberWithInteger:src_base_shape[i]];
+    }
+
+    mpsShape = [NSArray arrayWithObjects:mpsFinalShape.data() count:mpsFinalShape.size()];
+    srcTensorNDArray = ndArrayFromTensor(src, mpsShape, mpsDataType);
+    srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
+
+    int dim0 = (src_ndim_view == 0) ? 1 : src_view_shape[0];
+    int totalSlices = dim0;
+
+    // For 1D arrays, the storage_offset gives directly the
+    // starting point from where the slice should start
+    int sliceOffset = src_ndim_view == 1 ? 1 : dim0;
+    int view_numel = src_ndim_view == 1 ? 1 : src_view_numel;
+    [srcTensorNDArrayDesc sliceDimension:finalShapeSize - 1 withSubrange:{static_cast<NSUInteger>((src.storage_offset() / view_numel) * sliceOffset), totalSlices}];
+  }
+
+  srcTensorNDArrayView = [srcTensorNDArray arrayViewWithCommandBuffer:commandBuffer
+                                                           descriptor:srcTensorNDArrayDesc
+                                                             aliasing:MPSAliasingStrategyShallAlias];
+
+  return [[[MPSGraphTensorData alloc] initWithMPSNDArray:srcTensorNDArrayView] autorelease];
+}
 
 static MPSGraphTensor* chainViewOperation(ViewCachedGraph* cachedGraph, const IntArrayRef& size,
                                           const IntArrayRef& stride, int64_t offset,

From 3e84439787088cfb7627e1850546a3ccda051014 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 1 Nov 2022 10:45:45 -0700
Subject: [PATCH 1808/1922] Add support for torch.linalg.cross (#159)

* Add support for torch.linalg.cross

* Make use of metal::cross for float and half. For the other dtypes implement cross manually
---
 aten/src/ATen/native/mps/OperationUtils.h     |   1 +
 aten/src/ATen/native/mps/OperationUtils.mm    |  24 ++
 .../ATen/native/mps/operations/CrossKernel.mm | 207 ++++++++++++++++++
 .../ATen/native/mps/operations/Indexing.mm    |   2 +-
 aten/src/ATen/native/native_functions.yaml    |   2 +-
 test/test_mps.py                              |  55 ++++-
 6 files changed, 288 insertions(+), 3 deletions(-)
 create mode 100644 aten/src/ATen/native/mps/operations/CrossKernel.mm

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 125410a82e915..cf4e39a1c9c99 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -43,6 +43,7 @@ MPSDataType getMPSDataType(ScalarType scalar_type);
 MPSDataType getMPSScalarType(ScalarType scalar_type);
 MPSScalar   getMPSScalar(const Scalar& scalar, ScalarType type);
 std::string getMPSTypeString(ScalarType scalar_type);
+std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type);
 std::string getMPSShapeString(MPSShape* shape);
 std::string getTensorsStringKey(const TensorList& tensors, bool use_scalar_value = false);
 std::string getArrayRefString(const IntArrayRef s);
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index de9d58442f6cd..66ede5304f36f 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -87,6 +87,30 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) {
   }
 }
 
+std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type) {
+  switch (scalar_type) {
+    case ScalarType::Float:
+      return "float";
+    case ScalarType::Half:
+      return "half";
+    case ScalarType::Int:
+      return "int";
+    case ScalarType::Long:
+      return "long";
+    case ScalarType::Short:
+      return "short";
+    case ScalarType::Char:
+      return "char";
+    case ScalarType::Byte:
+      return "uchar";
+    case ScalarType::Bool:
+      return "bool";
+    default:
+      TORCH_CHECK(false, "Undefined type ", scalar_type);
+      return "Undefined";
+  }
+}
+
 std::string getMPSShapeString(MPSShape* shape) {
     std::string str;
     for(NSNumber *elem in shape) {
diff --git a/aten/src/ATen/native/mps/operations/CrossKernel.mm b/aten/src/ATen/native/mps/operations/CrossKernel.mm
new file mode 100644
index 0000000000000..22b715f5f9153
--- /dev/null
+++ b/aten/src/ATen/native/mps/operations/CrossKernel.mm
@@ -0,0 +1,207 @@
+//  Copyright © 2022 Apple Inc.
+
+#include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/Cross.h>
+
+namespace at {
+namespace native {
+
+static const char* METAL_CROSS = R"CROSS_METAL(
+
+#include <metal_stdlib>
+using namespace metal;
+
+#define REGISTER_CROSS_FUNC(DTYPE)                              \
+static inline DTYPE ## 3 cross(DTYPE ## 3 x, DTYPE ## 3 y) {    \
+  DTYPE ## 3 out;                                               \
+  out.x = x.y * y.z - x.z * y.y;                                \
+  out.y = x.z * y.x - x.x * y.z;                                \
+  out.z = x.x * y.y - x.y * y.x;                                \
+  return out;                                                   \
+}
+
+// Metal only supports half and float for native cross implementation.
+// For all the the other data types, implement cross manually.
+REGISTER_CROSS_FUNC(int);
+REGISTER_CROSS_FUNC(long);
+REGISTER_CROSS_FUNC(short);
+REGISTER_CROSS_FUNC(char);
+REGISTER_CROSS_FUNC(uchar);
+REGISTER_CROSS_FUNC(bool);
+
+template<typename T, typename U>
+kernel void cross(constant void     * input_        [[buffer(0)]],
+                  constant void     * other_        [[buffer(1)]],
+                  device   void     * out_          [[buffer(2)]],
+                  constant uint3    * offsets       [[buffer(3)]],
+                  constant int64_t  & outStride     [[buffer(4)]],
+                  constant int64_t  & inputStride   [[buffer(5)]],
+                  constant int64_t  & otherStride   [[buffer(6)]],
+                  uint tid [[thread_position_in_grid]]) {
+  device   T* out   = (device   T*)((device uint8_t*)out_ + offsets[tid].x);
+  constant T* input = (constant T*)((constant uint8_t*)input_ + offsets[tid].y);
+  constant T* other = (constant T*)((constant uint8_t*)other_ + offsets[tid].z);
+
+  const U x = {input[0 * inputStride], input[1 * inputStride], input[2 * inputStride]};
+  const U y = {other[0 * otherStride], other[1 * otherStride], other[2 * otherStride]};
+  const U res = cross(x, y);
+
+  out[0 * outStride] = res.x;
+  out[1 * outStride] = res.y;
+  out[2 * outStride] = res.z;
+}
+
+#define REGISTER_CROSS_OP(DTYPE)                       \
+template                                               \
+[[host_name("cross_" #DTYPE)]]                         \
+kernel void cross<DTYPE, DTYPE ## 3>(                  \
+  constant void     * input_        [[buffer(0)]],     \
+  constant void     * other_        [[buffer(1)]],     \
+  device   void     * out_          [[buffer(2)]],     \
+  constant uint3    * offsets       [[buffer(3)]],     \
+  constant int64_t  & outStride     [[buffer(4)]],     \
+  constant int64_t  & inputStride   [[buffer(5)]],     \
+  constant int64_t  & otherStride   [[buffer(6)]],     \
+  uint tid [[thread_position_in_grid]]);
+
+REGISTER_CROSS_OP(float);
+REGISTER_CROSS_OP(half);
+REGISTER_CROSS_OP(int);
+REGISTER_CROSS_OP(long);
+REGISTER_CROSS_OP(short);
+REGISTER_CROSS_OP(char);
+REGISTER_CROSS_OP(uchar);
+REGISTER_CROSS_OP(bool);
+
+)CROSS_METAL";
+
+using namespace mps;
+
+static id<MTLLibrary> compileCrossOpLibrary(id<MTLDevice> device) {
+  static id<MTLLibrary> crossLibrary = nil;
+  if (crossLibrary) {
+    return crossLibrary;
+  }
+
+  NSError *error = nil;
+  MTLCompileOptions *options = [[MTLCompileOptions new] autorelease];
+  [options setLanguageVersion: MTLLanguageVersion2_3];
+  crossLibrary  = [device newLibraryWithSource:[NSString stringWithCString: METAL_CROSS encoding:NSASCIIStringEncoding]
+                                       options:options
+                                         error:&error];
+  TORCH_CHECK(crossLibrary, "Failed to create metal cross library, error: ", [[error description] UTF8String]);
+  return crossLibrary;
+}
+
+static id<MTLComputePipelineState> crossPipelineState(id<MTLDevice> device, ScalarType scalar_type) {
+  std::string kernel = "cross_" + scalarToMetalTypeString(scalar_type);
+  static std::unordered_map<std::string, id<MTLComputePipelineState>> psoCache;
+  id<MTLComputePipelineState> pso = psoCache[kernel];
+  if (pso) {
+    return pso;
+  }
+
+  NSError* error = nil;
+  id<MTLLibrary> crossLib = compileCrossOpLibrary(device);
+  id<MTLFunction> crossFunc = [crossLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]];
+  TORCH_CHECK(crossFunc, "Failed to create function state object for: ", kernel);
+  pso = [device newComputePipelineStateWithFunction:crossFunc error:&error];
+  TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]);
+
+  psoCache[kernel] = pso;
+  return pso;
+}
+
+void cross_mps_impl(const Tensor& out, const Tensor& input, const Tensor& other, int64_t dim) {
+  TORCH_CHECK(input.dtype() != at::kDouble, "float64 is not supported on MPS");
+
+  auto iter = TensorIteratorConfig()
+      .add_output(out)
+      .add_input(input)
+      .add_input(other)
+      .resize_outputs(false)
+      .declare_static_shape(out.sizes(), /*squash_dims=*/dim)
+      .build();
+
+  id<MTLBuffer> inputBuffer  = getMTLBufferStorage(input);
+  id<MTLBuffer> otherBuffer  = getMTLBufferStorage(other);
+  id<MTLBuffer> outputBuffer = getMTLBufferStorage(out);
+  id<MTLDevice> device = MPSDevice::getInstance()->device();
+  MPSStream* mpsStream = getCurrentMPSStream();
+  const int64_t out_dim_stride =  out.stride(dim);
+  const int64_t input_dim_stride = input.stride(dim);
+  const int64_t other_dim_stride = other.stride(dim);
+  const uint32_t nDim = iter.ndim();
+  constexpr uint32_t nOffsets = 3;
+  const uint32_t numThreads = iter.numel();
+  dispatch_sync(mpsStream->queue(), ^(){
+    @autoreleasepool {
+      NSError* error = nil;
+      id<MTLCommandBuffer> commandBuffer = mpsStream->commandBuffer();
+      id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
+      MTLSize gridSize = MTLSizeMake(numThreads, 1, 1);
+      const IntArrayRef& iterShape = iter.shape();
+      std::vector<uint32_t> iterShapeData(iterShape.size());
+      std::vector<std::array<uint32_t, nOffsets>> strides(nDim);
+
+      for (const auto i: c10::irange(iterShape.size())) {
+        TORCH_CHECK(i <= UINT32_MAX);
+        iterShapeData[i] = (uint32_t)(iterShape[i]);
+      }
+
+      for (const auto i: c10::irange(nDim)) {
+        for (const auto offset: c10::irange(nOffsets)) {
+            strides[i][offset] = iter.strides(offset)[i];
+        }
+      }
+
+      id<MTLFunction> kernelDataOffsetsFunction = MPSDevice::getInstance()->metalIndexingFunction("kernel_index_offsets", nil);
+      id<MTLComputePipelineState> kernelDataOffsetsPSO = [[device newComputePipelineStateWithFunction: kernelDataOffsetsFunction
+                                                                                                error: &error] autorelease];
+      id<MTLBuffer> kernelDataOffsets = [[device newBufferWithLength: numThreads * sizeof(simd_uint3)
+                                                             options: 0] autorelease];
+      TORCH_CHECK(kernelDataOffsetsPSO, "Failed to created pipeline state object, error: ", [[error description] UTF8String]);
+      [computeEncoder setComputePipelineState:kernelDataOffsetsPSO];
+      [computeEncoder setBytes:strides.data() length:sizeof(uint32_t) * nDim * nOffsets atIndex:0];
+      [computeEncoder setBuffer:kernelDataOffsets offset:0 atIndex:1];
+      [computeEncoder setBytes:iterShapeData.data() length:sizeof(uint32_t) * iterShape.size() atIndex:2];
+      [computeEncoder setBytes:&nDim length:sizeof(uint32_t) atIndex:3];
+      [computeEncoder setBytes:&nOffsets length:sizeof(uint32_t) atIndex:4];
+
+      NSUInteger kernelOffsetsTGSize = kernelDataOffsetsPSO.maxTotalThreadsPerThreadgroup;
+      if (kernelOffsetsTGSize > numThreads)
+          kernelOffsetsTGSize = numThreads;
+
+      MTLSize kernelOffsetsThreadGroupSize = MTLSizeMake(kernelOffsetsTGSize, 1, 1);
+      [computeEncoder dispatchThreads: gridSize
+                threadsPerThreadgroup: kernelOffsetsThreadGroupSize];
+
+      id<MTLComputePipelineState> crossPSO = crossPipelineState(device, out.scalar_type());
+      [computeEncoder setComputePipelineState:crossPSO];
+      [computeEncoder setBuffer:inputBuffer  offset:input.storage_offset() * input.element_size() atIndex:0];
+      [computeEncoder setBuffer:otherBuffer  offset:other.storage_offset() * other.element_size() atIndex:1];
+      [computeEncoder setBuffer:outputBuffer offset:out.storage_offset() * out.element_size() atIndex:2];
+      [computeEncoder setBuffer:kernelDataOffsets offset:0 atIndex:3];
+      [computeEncoder setBytes:&out_dim_stride  length:sizeof(int64_t)  atIndex:4];
+      [computeEncoder setBytes:&input_dim_stride length:sizeof(int64_t) atIndex:5];
+      [computeEncoder setBytes:&other_dim_stride length:sizeof(int64_t) atIndex:6];
+
+      NSUInteger tgSize = crossPSO.maxTotalThreadsPerThreadgroup;
+      if (tgSize > numThreads) {
+          tgSize = numThreads;
+      }
+
+      MTLSize threadGroupSize = MTLSizeMake(tgSize, 1, 1);
+      [computeEncoder dispatchThreads: gridSize
+                threadsPerThreadgroup: threadGroupSize];
+
+      [computeEncoder endEncoding];
+      mpsStream->commit(true);
+    }
+  });
+}
+
+REGISTER_DISPATCH(cross_stub, &cross_mps_impl);
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 0f1901a2e508f..303716cc53c3f 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -49,7 +49,7 @@ bool dispatchIndexKernel(TensorIteratorBase& iter,
 
   dispatch_sync(mpsStream->queue(), ^(){
     @autoreleasepool {
-    NSError* error = nil;
+      NSError* error = nil;
       constexpr uint32_t nOffsets = 3;
       const int64_t num_indices = index_size.size();
       const uint32_t numThreads = iter.numel();
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index fb648c6793f0b..861330d9326a9 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -12678,7 +12678,7 @@
   python_module: linalg
   structured: True
   dispatch:
-    CPU, CUDA: linalg_cross_out
+    CPU, CUDA, MPS: linalg_cross_out
 
 # linalg.lu_factor
 - func: linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots)
diff --git a/test/test_mps.py b/test/test_mps.py
index 0c26a0135d2f2..a1dd3226bc111 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -306,6 +306,56 @@ def test_mm(self):
         D = torch.mm(B, C).cpu()
         torch.testing.assert_close(D, torch.full((5, 5), 6.0))
 
+    def test_linalg_cross(self):
+        def helper(dtype):
+            device = "mps"
+            if dtype is torch.int32 or dtype is torch.int64:
+                x = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
+                y = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
+            else:
+                x = torch.rand(100, 3, 100, dtype=dtype, device=device)
+                y = torch.rand(100, 3, 100, dtype=dtype, device=device)
+            x_cpu = x.to("cpu")
+            y_cpu = y.to("cpu")
+            res1 = torch.linalg.cross(x, y, dim=1)
+            res2 = torch.tensor((), dtype=dtype, device=device)
+            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
+            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
+            torch.linalg.cross(x, y, dim=1, out=res2)
+            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
+            self.assertEqual(res1, res2)
+            self.assertEqual(res1, res1_cpu)
+            self.assertEqual(res2, res2_cpu)
+
+            # test for broadcastable inputs
+            if dtype is torch.int32 or dtype is torch.int64:
+                x = torch.randint(0, 99999, (1, 3, 2), dtype=dtype, device=device)
+                y = torch.randint(0, 99999, (4, 3, 1), dtype=dtype, device=device)
+            else:
+                x = torch.rand(1, 3, 2, dtype=dtype, device=device)
+                y = torch.rand(4, 3, 1, dtype=dtype, device=device)
+            x_cpu = x.to("cpu")
+            y_cpu = y.to("cpu")
+            res1 = torch.linalg.cross(x, y, dim=1)
+            res2 = torch.tensor((), dtype=dtype, device=device)
+            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
+            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
+            torch.linalg.cross(x, y, dim=1, out=res2)
+            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
+            self.assertEqual(res1, res2)
+            self.assertEqual(res1, res1_cpu)
+            self.assertEqual(res2, res2_cpu)
+        [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]]
+
+    def test_cross(self):
+        a = torch.randn(4, 3, device="mps")
+        b = torch.randn(4, 3, device="mps")
+        a_cpu = a.to("cpu")
+        b_cpu = b.to("cpu")
+        res = torch.cross(a, b, dim=1)
+        res_cpu = torch.cross(a_cpu, b_cpu, dim=1)
+        self.assertEqual(res, res_cpu)
+
     def test_addmm(self):
         A = torch.ones(5, 5).to("mps")
         B = torch.ones(5, 6).to("mps")
@@ -7872,7 +7922,10 @@ class TestConsistency(TestCase):
         'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'where': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nonzero': ['f32', 'i16', 'i32', 'i64']}
+        'nonzero': ['f32', 'i16', 'i32', 'i64'],
+        'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        }
 
 
     ALLOWLIST_OP_GRAD = {

From abf8edb9661f80d1ed5cc88730ba8b6ed3b98ad9 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 2 Nov 2022 12:38:15 -0700
Subject: [PATCH 1809/1922] Add support for randperm (#160)

* Add mps support for randperm

* Add testcase for randperm

* Address PR comments

* Fix randperm string key for graph caching

* Address remaining PR comments

* Fix warning message
---
 .../native/mps/operations/Distributions.mm    | 40 +++++++++++++++++++
 .../ATen/native/mps/operations/Indexing.mm    |  2 +-
 aten/src/ATen/native/native_functions.yaml    |  1 +
 test/test_mps.py                              | 34 ++++++++++++++++
 4 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index ea27d88218591..8fa8f5d6208f4 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -4,6 +4,7 @@
 #include <ATen/native/DistributionTemplates.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/mps/MPSGeneratorImpl.h>
+#include <ATen/native/TensorFactories.h>
 
 namespace at {
 namespace native {
@@ -341,6 +342,45 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
                                       "exponential_mps_:" + std::to_string(lambda), random_op_block);
 }
 
+Tensor& randperm_out_mps(int64_t n, c10::optional<Generator> generator, Tensor& result) {
+  if (!MPSDevice::getInstance()->macOS_13_0()) {
+    TORCH_WARN_ONCE("MPS: randperm op is supported natively starting from macOS 13.0. ",
+                    "Falling back on CPU. This may have performance implications.");
+
+    result = result.to("cpu");
+    result = at::randperm_out(result, n).to("mps");
+    return result;
+  }
+
+  TORCH_CHECK(n >= 0, "n must be non-negative, got", n);
+  TORCH_CHECK(!generator.has_value() ||
+             (generator.has_value() && result.device() == generator->device()),
+             "Expected a '", result.device(), "' generator device but found '", generator->device(), "'");
+  check_supported_max_int_with_precision(n, result);
+
+  result.resize_({n});
+  if (n == 0) {
+    return result;
+  }
+
+  mps::RandomOpBlock random_op_block = ^RandomOpFn(cachedGraph, randomTensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+    MPSGraphTensor* argsortTensor = [mpsGraph argSortWithTensor:randomTensor
+                                                           axis:0
+                                                           name:nil];
+    if (result.scalar_type() != kInt) {
+      argsortTensor = [mpsGraph castTensor:argsortTensor
+                                    toType:mps::getMPSDataType(result.scalar_type())
+                                      name:@"castOutput"];
+    }
+    return argsortTensor;
+  };
+
+  return mps::random_mps_impl<int64_t>(result, 0.0, 1.0, c10::nullopt, c10::nullopt,
+                                      MPSGraphRandomDistributionUniform, generator,
+                                      "ranperm_out_mps:" + mps::getTensorsStringKey({result}), random_op_block);
+}
+
 Tensor& multinomial_with_replacement_mps_kernel(
     const Tensor& self,
     const int64_t n_sample,
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 303716cc53c3f..be74750879769 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -214,7 +214,7 @@ void index_put_kernel_mps(TensorIterator& iter, IntArrayRef index_size, IntArray
 static
 Tensor nonzero_fallback(const Tensor& self) {
   TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 13.0. ",
-                  "Falling back on CPU. This may have performace implications.");
+                  "Falling back on CPU. This may have performance implications.");
 
   return at::nonzero(self.to("cpu")).clone().to("mps");
 }
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 861330d9326a9..dc6c0de1fb3c9 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4408,6 +4408,7 @@
   dispatch:
     CPU: randperm_out_cpu
     CUDA: randperm_out_cuda
+    MPS: randperm_out_mps
 
 - func: range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   dispatch:
diff --git a/test/test_mps.py b/test/test_mps.py
index a1dd3226bc111..355ba8af03b17 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -549,6 +549,40 @@ def test_uniform(self):
         low.grad.zero_()
         high.grad.zero_()
 
+    def test_randperm(self, device="mps"):
+        rng_device = None
+        for n in (5, 100, 50000, 100000):
+            for dtype in (torch.long, torch.half, torch.float):
+                if n > 2049 and dtype == torch.half:  # Large n for torch.half will raise an exception, do not test here.
+                    continue
+                if n > 256 and dtype == torch.bfloat16:
+                    continue
+                with torch.random.fork_rng(devices=rng_device):
+                    res1 = torch.randperm(n, dtype=dtype, device=device)
+                res2 = torch.empty(0, dtype=dtype, device=device)
+                torch.randperm(n, out=res2, dtype=dtype, device=device)
+                self.assertEqual(res1.cpu().sort().values.long(), torch.arange(n, device=device))
+
+        # Default type is long
+        for n in (100, 10000):
+            self.assertEqual(torch.randperm(n, device=device).dtype, torch.long)
+
+        # randperm of 0 elements is an empty tensor
+        res1 = torch.randperm(0)
+        res2 = torch.tensor(5, dtype=dtype, device=device)
+        torch.randperm(0, out=res2)
+        self.assertEqual(res1.numel(), 0)
+        self.assertEqual(res2.numel(), 0)
+
+        # Test non-contiguous tensors
+        for n in (4, 5, 6, 10, 20):
+            non_contiguous_tensor = torch.zeros((2, 3), dtype=torch.long, device=device).t()
+            self.assertFalse(non_contiguous_tensor.is_contiguous())
+            with torch.random.fork_rng(devices=rng_device):
+                res = torch.randperm(n, dtype=torch.long, device=device)
+            torch.randperm(n, out=non_contiguous_tensor)
+            self.assertEqual(res.cpu().sort().values.long(), torch.arange(n, device=device))
+
     # Test forward maxpool2d
     def test_max_pool2d(self):
         def helper(shape, ks, padding=0, dilation=1, ceil_mode=False, return_indices=False, test_ties=False):

From ad910267f23144f7c19b56fcf95007386e25e953 Mon Sep 17 00:00:00 2001
From: chrisbbayley <105944653+chrisbbayley@users.noreply.github.com>
Date: Wed, 2 Nov 2022 22:59:43 -0700
Subject: [PATCH 1810/1922] Add mps implementation for unique(no dim support)
 and unique_consecutive (#117)

* WIP

* Added mps backend for _unique2_mps, unique_consecutive_mps, unique_dim_consecutive_mps

* Add more unit tests for unique to cover length=0,1 cases

* Remove unused unique_dim_mps

* Add fallback support for unique op

* Fix negative dimensions and null placeholders

Co-authored-by: Chris Bayley <cbayley@apple.com>
Co-authored-by: Denis Vieriu <dvieriu@apple.com>
---
 aten/src/ATen/native/mps/operations/Unique.mm | 345 ++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |   3 +
 test/test_mps.py                              | 155 +++++++-
 3 files changed, 501 insertions(+), 2 deletions(-)
 create mode 100644 aten/src/ATen/native/mps/operations/Unique.mm

diff --git a/aten/src/ATen/native/mps/operations/Unique.mm b/aten/src/ATen/native/mps/operations/Unique.mm
new file mode 100644
index 0000000000000..7b3b61f0f2f89
--- /dev/null
+++ b/aten/src/ATen/native/mps/operations/Unique.mm
@@ -0,0 +1,345 @@
+//  Copyright © 2022 Apple Inc.
+
+#include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/Resize.h>
+#include <ATen/mps/MPSAllocator.h>
+
+namespace at {
+namespace native {
+namespace mps {
+
+struct UniqueCachedGraph : public MPSCachedGraph
+{
+  UniqueCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+  MPSGraphTensor* inputTensor_ = nil;
+  MPSGraphTensor* outputTensor_ = nil;
+  MPSGraphTensor* inverseIndicesTensor_ = nil;
+  MPSGraphTensor* countsTensor_ = nil;
+  MPSGraphTensor* lengthTensor_ = nil;
+};
+
+static std::string getUniqueKey(const ScalarType& dtype, const IntArrayRef& base_shape,
+                                const bool return_inverse, const bool return_counts,
+                                const bool consecutive, c10::optional<int64_t> dimOpt)
+{
+  return "_unique2_mps:" + getMPSTypeString(dtype) + "[" + getArrayRefString(base_shape) +
+         "]:[" + (dimOpt.has_value() ? to_string(dimOpt.value()) : "None") + "]:[" + to_string(return_inverse) +
+         "]:[" + to_string(return_counts) + "]:[" + to_string(consecutive) + "]";
+}
+
+// dim arg not supported when non consecutive, ie sorted
+NSArray<MPSGraphTensor*> *buildUniqueGraph(const Tensor& self, UniqueCachedGraph *uniqueGraph, const bool return_inverse, const bool return_counts, const bool consecutive, c10::optional<int64_t> dimOpt) {
+  int64_t dim = dimOpt.has_value() ? maybe_wrap_dim(dimOpt.value(), self.dim()) : 0;
+
+  MPSGraph *graph = uniqueGraph->graph();
+  MPSGraphTensor *inputTensor = uniqueGraph->inputTensor_;
+  MPSShape *shape = [inputTensor shape];
+  MPSShape *destShape = shape;
+  NSUInteger length = [shape[dim] integerValue];
+  MPSDataType dataType = [inputTensor dataType];
+
+  MPSGraphTensor *resultTensor = (MPSGraphTensor *)[NSNull null];
+  MPSGraphTensor *inverseIndicesTensor = (MPSGraphTensor *)[NSNull null];
+  MPSGraphTensor *countTensor = (MPSGraphTensor *)[NSNull null];
+  MPSGraphTensor *lengthTensor = (MPSGraphTensor *)[NSNull null];
+  if (length <= 1) {
+    // Trivial case, only 1 element everything is unique
+    resultTensor = inputTensor;
+    lengthTensor = [graph constantWithScalar:0.0f
+                                    dataType:MPSDataTypeInt32];
+    if (return_inverse)
+      inverseIndicesTensor = [graph constantWithScalar:0.0f
+                                              dataType:MPSDataTypeInt32];
+    if (return_counts)
+      countTensor = [graph constantWithScalar:1.0f
+                                     dataType:MPSDataTypeInt32];
+    return @[resultTensor, inverseIndicesTensor, countTensor, lengthTensor];
+  }
+
+  // Sort only supports following types, cast if necessary
+  if (dataType != MPSDataTypeInt32 &&
+      dataType != MPSDataTypeFloat32 &&
+      dataType != MPSDataTypeFloat16) {
+    dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
+    inputTensor = [graph castTensor:inputTensor
+                             toType:dataType
+                               name:@"castInputTensor"];
+  }
+
+  bool needsFlatten = !(dimOpt.has_value() || [shape count] == 1);
+  if (needsFlatten) {
+    inputTensor = [graph reshapeTensor:inputTensor
+                             withShape:@[@-1]
+                                  name:nil];
+    length = 1;
+    for(NSUInteger i = 0; i < [shape count]; i++)
+      length *= [shape[i] integerValue];
+    destShape = @[[NSNumber numberWithUnsignedInteger:length]];
+  }
+
+  MPSGraphTensor *sortedInput = nil;
+  if (consecutive)
+    sortedInput = inputTensor;
+  else
+    sortedInput = [graph sortWithTensor:inputTensor
+                                   axis:0
+                                   name:nil];
+
+  MPSGraphTensor *frontNMinusOne = [graph sliceTensor:sortedInput
+                                            dimension:dim
+                                                start:0
+                                               length:length-1
+                                                 name:nil];
+  MPSGraphTensor *backNMinusOne = [graph sliceTensor:sortedInput
+                                           dimension:dim
+                                               start:1
+                                              length:length-1
+                                                name:nil];
+  MPSGraphTensor *notEqualToPreviousElement = [graph notEqualWithPrimaryTensor:backNMinusOne
+                                                               secondaryTensor:frontNMinusOne
+                                                                          name:nil];
+  MPSGraphTensor *mask = [graph castTensor:notEqualToPreviousElement
+                                    toType:MPSDataTypeInt32
+                                      name:@"castMaskTensor"];
+
+  // If comparing tensors, not scalars, check if entire tensor matches previos element using reductionOr over tensor
+  if (dimOpt.has_value() && [shape count] != 1) {
+    NSMutableArray *axes = [[NSMutableArray alloc] initWithCapacity:[shape count]-1];
+    for (NSUInteger axis = 0; axis < [shape count]; axis++){
+      if (axis != dim)
+        [axes addObject:[NSNumber numberWithUnsignedInteger:axis]];
+    }
+    mask = [graph reductionOrWithTensor:mask
+                                   axes:axes
+                                   name:nil];
+    mask = [graph squeezeTensor:mask
+                           axes:axes
+                           name:nil];
+    [axes release];
+  }
+
+  MPSGraphTensor *scannedIndices = [graph cumulativeSumWithTensor:mask
+                                                             axis:0
+                                                             name:nil];
+  lengthTensor = [graph sliceTensor:scannedIndices
+                          dimension:0
+                              start:length-2
+                             length:1
+                               name:nil];
+
+  MPSGraphTensor *minusOneTensor = [graph constantWithScalar:-1.0f
+                                                    dataType:MPSDataTypeInt32];
+  MPSGraphTensor *maskedIndices = [graph selectWithPredicateTensor:mask
+                                               truePredicateTensor:scannedIndices
+                                              falsePredicateTensor:minusOneTensor
+                                                              name:nil];
+
+  MPSGraphTensor *zeroTensor = [graph constantWithScalar:0.0f
+                                                   shape:@[@1]
+                                                dataType:MPSDataTypeInt32];
+  MPSGraphTensor *maskedIndicesWithHead = [graph concatTensors:@[zeroTensor, maskedIndices]
+                                                     dimension:0
+                                                          name:nil];
+  MPSGraphTensor *scannedIndicesWithHead = [graph concatTensors:@[zeroTensor, scannedIndices]
+                                                      dimension:0
+                                                           name:nil];
+
+  resultTensor = [graph scatterWithUpdatesTensor:sortedInput
+                                            indicesTensor:maskedIndicesWithHead
+                                                    shape:destShape
+                                                     axis:dim
+                                                     mode:MPSGraphScatterModeSet
+                                                     name:nil];
+  // Cast back if necessary
+  if ([uniqueGraph->inputTensor_ dataType] != dataType)
+    resultTensor = [graph castTensor:resultTensor
+                              toType:[uniqueGraph->inputTensor_ dataType]
+                                name:@"castResultTensor"];
+
+  // Compute optional returned tensors if requested
+  if(return_inverse) {
+    MPSGraphTensor *argSortedInput = nil;
+    if (consecutive)
+      argSortedInput = [graph coordinateAlongAxis:0
+                                        withShape:@[[NSNumber numberWithUnsignedInteger:length]]
+                                             name:nil];
+    else
+      argSortedInput = [graph argSortWithTensor:inputTensor
+                                           axis:0
+                                           name:nil];
+    inverseIndicesTensor = [graph scatterWithUpdatesTensor:scannedIndicesWithHead
+                                                      indicesTensor:argSortedInput
+                                                              shape:@[[NSNumber numberWithUnsignedInteger:length]]
+                                                               axis:0
+                                                               mode:MPSGraphScatterModeAdd
+                                                               name:nil];
+    if (needsFlatten)
+      inverseIndicesTensor = [graph reshapeTensor:inverseIndicesTensor
+                                        withShape:shape
+                                             name:nil];
+  }
+
+  if (return_counts) {
+    MPSGraphTensor *unitTensor = [graph constantWithScalar:1.0f
+                                                     shape:@[[NSNumber numberWithUnsignedInteger:length]]
+                                                  dataType:MPSDataTypeInt32];
+    countTensor = [graph scatterWithUpdatesTensor:unitTensor
+                                             indicesTensor:scannedIndicesWithHead
+                                                     shape:@[[NSNumber numberWithUnsignedInteger:length]]
+                                                      axis:0
+                                                      mode:MPSGraphScatterModeAdd
+                                                      name:nil];
+  }
+
+  return @[resultTensor, inverseIndicesTensor, countTensor, lengthTensor];
+}
+
+static UniqueCachedGraph* getUniqueGraph(const Tensor& self, const bool return_inverse, const bool return_counts, const bool consecutive, c10::optional<int64_t> dim) {
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  @autoreleasepool {
+    string key = getUniqueKey(self.scalar_type(), self.sizes(), return_inverse, return_counts, consecutive, dim);
+    UniqueCachedGraph* cachedGraph = static_cast<UniqueCachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+        UniqueCachedGraph *newCachedGraph = nil;
+
+         @autoreleasepool {
+           // Initialize graph
+           MPSGraph* mpsGraph = make_mps_graph();
+           newCachedGraph = new UniqueCachedGraph(mpsGraph);
+
+           // Workaround for MPSShaderLibrary bug
+           // TODO: Remove once https://github.com/pytorch/pytorch/issues/82305 is resolved
+           auto inputType = getMPSScalarType(self.scalar_type());
+           newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, inputType, getMPSShape(self.sizes()));
+
+           NSArray<MPSGraphTensor *> *outputTensors = buildUniqueGraph(self, newCachedGraph, return_inverse, return_counts, consecutive, dim);
+
+           newCachedGraph->outputTensor_ = outputTensors[0];
+           newCachedGraph->inverseIndicesTensor_ = outputTensors[1];
+           newCachedGraph->countsTensor_ = outputTensors[2];
+           newCachedGraph->lengthTensor_ = outputTensors[3];
+         }
+         return newCachedGraph;
+       });
+       cachedGraph = static_cast<UniqueCachedGraph *>(tmpCachedGraph);
+     }
+    return cachedGraph;
+  }
+}
+
+void runUniqueGraph(UniqueCachedGraph *uniqueGraph, const Tensor& input, Tensor& output,
+                    Tensor& inverse_indices, Tensor& counts, Tensor& length,
+                    bool return_inverse, bool return_counts){
+  Placeholder inputPlaceholder = Placeholder(uniqueGraph->inputTensor_, input);
+  NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+    inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
+  };
+
+  NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = [NSMutableDictionary dictionary];
+  Placeholder outputPlaceholder = Placeholder(uniqueGraph->outputTensor_, output);
+  Placeholder lengthPlaceholder = Placeholder(uniqueGraph->lengthTensor_, length);
+  [results setObject:outputPlaceholder.getMPSGraphTensorData()
+              forKey:outputPlaceholder.getMPSGraphTensor()];
+  [results setObject:lengthPlaceholder.getMPSGraphTensorData()
+              forKey:lengthPlaceholder.getMPSGraphTensor()];
+  if (return_inverse) {
+    Placeholder inverseIndicesPlaceholder = Placeholder(uniqueGraph->inverseIndicesTensor_, inverse_indices);
+    [results setObject:inverseIndicesPlaceholder.getMPSGraphTensorData()
+                forKey:inverseIndicesPlaceholder.getMPSGraphTensor()];
+  }
+  if (return_counts) {
+    Placeholder countsPlaceholder = Placeholder(uniqueGraph->countsTensor_, counts);
+    [results setObject:countsPlaceholder.getMPSGraphTensorData()
+                forKey:countsPlaceholder.getMPSGraphTensor()];
+  }
+
+  // Run the graph
+  MPSStream* stream = getCurrentMPSStream();
+  runMPSGraph(stream, uniqueGraph->graph(), feeds, results);
+}
+
+} // namespace mps
+
+std::tuple<Tensor, Tensor, Tensor>
+_unique_impl_mps(const Tensor& self, const bool return_inverse, const bool return_counts, const bool consecutive, c10::optional<int64_t> dimOpt) {
+
+  const Tensor& input = self.contiguous();
+
+  // get flat output size
+  int64_t totalElems = c10::multiply_integers(input.sizes());
+
+  IntArrayRef outputShape = IntArrayRef(totalElems);
+  IntArrayRef inverseIndicesShape = input.sizes();
+  IntArrayRef countsShape = IntArrayRef(totalElems);
+  int64_t dim = dimOpt.has_value() ? maybe_wrap_dim(dimOpt.value(), self.dim()) : 0;
+
+  if (dimOpt.has_value()) {
+    outputShape = input.sizes();
+    inverseIndicesShape = IntArrayRef(input.sizes()[dim]);
+    countsShape = IntArrayRef(input.sizes()[dim]);
+  }
+  if (!return_inverse)
+    inverseIndicesShape = {};
+  if (!return_counts)
+    countsShape = {};
+
+  Tensor output = at::native::empty_mps(outputShape, input.scalar_type(), c10::nullopt, kMPS);
+  Tensor inverse_indices = at::native::empty_mps(inverseIndicesShape, ScalarType::Long, c10::nullopt, kMPS);
+  Tensor counts = at::native::empty_mps(countsShape, ScalarType::Long, c10::nullopt, kMPS);
+  Tensor length = at::native::empty_mps({1}, ScalarType::Int, c10::nullopt, kMPS);
+
+  if (input.numel() == 0) {
+    return std::make_tuple(output, inverse_indices, counts);
+  }
+
+  mps::UniqueCachedGraph *uniqueGraph = mps::getUniqueGraph(input, return_inverse, return_counts, consecutive, dimOpt);
+  mps::runUniqueGraph(uniqueGraph, input, output, inverse_indices, counts, length, return_inverse, return_counts);
+
+  int64_t lengthScalar = length.item<int64_t>() + 1; // length actually holds max index, add 1
+  if (output.sizes().size() != 0) {
+    output = at::slice(output, dim, 0, lengthScalar);
+  }
+  if (return_counts)
+    counts = at::slice(counts, 0, 0, lengthScalar);
+
+  return std::make_tuple(output, inverse_indices, counts);
+}
+
+std::tuple<Tensor, Tensor, Tensor>
+unique_consecutive_mps(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional<int64_t> dim) {
+  if (!MPSDevice::getInstance()->macOS_13_0()) {
+    TORCH_WARN_ONCE("MPS: unique_consecutive op is supported natively starting from macOS 13.0. ",
+                    "Falling back on CPU. This may have performace implications.");
+    return at::unique_consecutive(self.to("cpu"), return_inverse, return_counts, dim);
+  }
+
+  return _unique_impl_mps(self, return_inverse, return_counts, true, dim);
+}
+
+std::tuple<Tensor, Tensor, Tensor>
+unique_dim_consecutive_mps(const Tensor& self, int64_t dim, const bool return_inverse, const bool return_counts) {
+  if (!MPSDevice::getInstance()->macOS_13_0()) {
+    TORCH_WARN_ONCE("MPS: unique_dim_consecutive op is supported natively starting from macOS 13.0. ",
+                    "Falling back on CPU. This may have performace implications.");
+    return at::unique_dim_consecutive(self.to("cpu"), dim, return_inverse, return_counts);
+  }
+
+  return _unique_impl_mps(self, return_inverse, return_counts, true, c10::make_optional((int64_t)dim));
+}
+
+std::tuple<Tensor, Tensor, Tensor>
+_unique2_mps(const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) {
+  if (!MPSDevice::getInstance()->macOS_13_0()) {
+    TORCH_WARN_ONCE("MPS: _unique2 op is supported natively starting from macOS 13.0. ",
+                    "Falling back on CPU. This may have performace implications.");
+    return at::_unique2(self.to("cpu"), sorted, return_inverse, return_counts);
+  }
+
+  return _unique_impl_mps(self, return_inverse, return_counts, false, c10::nullopt);
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index dc6c0de1fb3c9..7fa80d3fd16ee 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -5797,6 +5797,7 @@
   dispatch:
     CPU: unique_consecutive_cpu
     CUDA: unique_consecutive_cuda
+    MPS: unique_consecutive_mps
   tags: dynamic_output_shape
   autogen: unique_consecutive.out
 
@@ -5805,6 +5806,7 @@
   dispatch:
     CPU: unique_dim_consecutive_cpu
     CUDA: unique_dim_consecutive_cuda
+    MPS: unique_dim_consecutive_mps
   tags: dynamic_output_shape
   autogen: unique_dim_consecutive.out
 
@@ -5817,6 +5819,7 @@
   dispatch:
     CPU: _unique2_cpu
     CUDA: _unique2_cuda
+    MPS: _unique2_mps
   tags: dynamic_output_shape
   autogen: _unique2.out
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 355ba8af03b17..c305412d748fa 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -41,6 +41,7 @@
 import numpy as np
 import torch
 import torch.utils._pytree as pytree
+from itertools import permutations, product
 
 
 # Copied from `test_ops.py` for the purposes of duplicating `test_numpy_ref`
@@ -1840,6 +1841,156 @@ def test_empty_neg(self):
         y = -x
         self.assertEqual(x, y)
 
+    def _test_unique_scalar_empty(self, dtype, device, f):
+        # test scalar
+        x = torch.tensor(0, dtype=dtype, device=device)
+        unique, inverse, counts = f(x, return_inverse=True, return_counts=True)
+        expected_unique = torch.tensor([0], dtype=dtype, device=device)
+        expected_inverse = torch.tensor(0, device=device)
+        expected_counts = torch.tensor([1], device=device)
+        self.assertEqual(unique, expected_unique)
+        self.assertEqual(inverse, expected_inverse)
+        self.assertEqual(counts, expected_counts)
+
+        # test zero sized tensor
+        x = torch.zeros((0, 0, 3), dtype=dtype, device=device)
+        unique, inverse, counts = f(x, return_inverse=True, return_counts=True)
+        expected_unique = torch.tensor([], dtype=dtype, device=device)
+        expected_inverse = torch.empty((0, 0, 3), dtype=torch.long, device=device)
+        expected_counts = torch.tensor([], dtype=torch.long, device=device)
+        self.assertEqual(unique, expected_unique)
+        self.assertEqual(inverse, expected_inverse)
+        self.assertEqual(counts, expected_counts)
+
+    def _test_unique_with_expects(self, device, dtype, f, x, expected_unique, expected_inverse, expected_counts, additional_shape):
+        def ensure_tuple(x):
+            if isinstance(x, torch.Tensor):
+                return (x,)
+            return x
+
+        for return_inverse in [True, False]:
+            for return_counts in [True, False]:
+                # test with expected
+                ret = ensure_tuple(f(x, return_inverse=return_inverse, return_counts=return_counts))
+                self.assertEqual(len(ret), 1 + int(return_inverse) + int(return_counts))
+                self.assertEqual(expected_unique, ret[0])
+                if return_inverse:
+                    self.assertEqual(expected_inverse, ret[1])
+                if return_counts:
+                    count_index = 1 + int(return_inverse)
+                    self.assertEqual(expected_counts, ret[count_index])
+
+                # tests per-element unique on a higher rank tensor.
+                y = x.view(additional_shape)
+                y_unique, y_inverse, y_counts = f(y, return_inverse=True, return_counts=True)
+                self.assertEqual(expected_unique, y_unique)
+                self.assertEqual(expected_inverse.view(additional_shape), y_inverse)
+                self.assertEqual(expected_counts, y_counts)
+
+    def test_unique_all_dtypes(self, device="mps"):
+        def helper(dtype):
+            def ensure_tuple(x):
+                if isinstance(x, torch.Tensor):
+                    return (x,)
+                return x
+
+            if dtype is torch.bool:
+                x = torch.tensor([True, False, False, False, True, False, True, False], dtype=torch.bool, device=device)
+                expected_unique = torch.tensor([False, True], dtype=torch.bool, device=device)
+                expected_inverse = torch.tensor([1, 0, 0, 0, 1, 0, 1, 0], dtype=torch.long, device=device)
+                expected_counts = torch.tensor([5, 3], dtype=torch.long, device=device)
+            else:
+                x = torch.tensor([1, 2, 3, 2, 8, 5, 2, 3], dtype=dtype, device=device)
+                expected_unique = torch.tensor([1, 2, 3, 5, 8], dtype=dtype, device=device)
+                expected_inverse = torch.tensor([0, 1, 2, 1, 4, 3, 1, 2], device=device)
+                expected_counts = torch.tensor([1, 3, 2, 1, 1], device=device)
+
+            # test sorted unique
+            fs = (
+                lambda x, **kwargs: torch.unique(x, sorted=True, **kwargs),
+                lambda x, **kwargs: x.unique(sorted=True, **kwargs),
+            )
+            x_sliced = torch.empty(x.size(0) * 2, dtype=dtype, device=device)[::2].copy_(x)
+            xs = (x, x_sliced)
+            for f, x in product(fs, xs):
+                self._test_unique_with_expects(device, dtype, f, x, expected_unique, expected_inverse, expected_counts, (2, 2, 2))
+                self._test_unique_scalar_empty(dtype, device, f)
+
+            # test unsorted unique
+            fs = (
+                lambda x, **kwargs: torch.unique(x, sorted=False, **kwargs),
+                lambda x, **kwargs: x.unique(sorted=False, **kwargs)
+            )
+            for f, x in product(fs, xs):
+                self._test_unique_scalar_empty(dtype, device, f)
+                for return_inverse, return_counts in product((True, False), repeat=2):
+                    ret = ensure_tuple(f(x, return_inverse=return_inverse, return_counts=return_counts))
+                    self.assertEqual(len(ret), 1 + int(return_inverse) + int(return_counts))
+                    x_list = x.tolist()
+                    x_unique_list = ret[0].tolist()
+                    self.assertEqual(expected_unique.tolist(), sorted(x_unique_list))
+                    if return_inverse:
+                        x_inverse_list = ret[1].tolist()
+                        for i, j in enumerate(x_inverse_list):
+                            self.assertEqual(x_list[i], x_unique_list[j])
+                    if return_counts:
+                        count_index = 1 + int(return_inverse)
+                        x_counts_list = ret[count_index].tolist()
+                        for i, j in zip(x_unique_list, x_counts_list):
+                            count = 0
+                            for k in x_list:
+                                if k == i:
+                                    count += 1
+                            self.assertEqual(j, count)
+        [helper(dtype) for dtype in [torch.float32, torch.float16, torch.int64, torch.int32, torch.int16, torch.uint8]]
+
+    def test_unique(self):
+        def helper(x, return_inverse, return_counts):
+            cpu_x = x
+            x = cpu_x.detach().clone().to('mps')
+
+            result = torch.unique(x, return_inverse=return_inverse, return_counts=return_counts)
+            result_cpu = torch.unique(cpu_x, return_inverse=return_inverse, return_counts=return_counts)
+
+            self.assertEqual(result, result_cpu)
+        helper(torch.tensor([1,2,4,2,1]), False, False)
+        helper(torch.randint(3,(10,)), False, False)
+        helper(torch.randint(3,(10,)), True, False)
+        helper(torch.randint(3,(10,)), False, True)
+        helper(torch.randint(3,(10,)), True, True)
+        helper(torch.randint(3,(1,)), True, True)
+        helper(torch.randint(3,(0,)), True, True)
+
+    def test_unique_consecutive(self):
+        def helper(x, dim, return_inverse, return_counts):
+            cpu_x = x
+            x = cpu_x.detach().clone().to('mps')
+
+            result = torch.unique_consecutive(x, dim=dim, return_inverse=return_inverse, return_counts=return_counts)
+            result_cpu = torch.unique_consecutive(cpu_x, dim=dim, return_inverse=return_inverse, return_counts=return_counts)
+
+            self.assertEqual(result, result_cpu)
+        helper(torch.tensor([1,2,4,2,1]), 0, False, False)
+        helper(torch.randint(3,(10,)), 0, False, False)
+        helper(torch.randint(3,(10,)), 0, True, False)
+        helper(torch.randint(3,(10,)), 0, False, True)
+        helper(torch.randint(3,(10,)), 0, True, True)
+        helper(torch.randint(3,(10,)), 0, True, True)
+        helper(torch.randint(3,(1,)), 0, True, True)
+        helper(torch.randint(3,(0,)), 0, True, True)
+
+        helper(torch.tensor([[1,1,2,3,3,2],[1,1,1,2,2,1]]), 0, False, False)
+        helper(torch.tensor([[1,1,2,3,3,2],[1,1,1,2,2,1]]), 0, True, True)
+        helper(torch.randint(2,(20,2)), 0, True, True)
+        helper(torch.randint(2,(1,2)), 0, True, True)
+        helper(torch.randint(2,(0,2)), 0, True, True)
+
+        helper(torch.tensor([[1,1,2,3,3,2],[1,1,1,2,2,1]]), 1, False, False)
+        helper(torch.tensor([[1,1,2,3,3,2],[1,1,1,2,2,1]]), 1, True, True)
+        helper(torch.randint(2,(2,20)), 1, True, True)
+        helper(torch.randint(2,(2,1)), 1, True, True)
+        helper(torch.randint(2,(2,0)), 1, True, True)
+
     # See https://github.com/pytorch/pytorch/issues/85675
     def test_cat_non_contiguous(self):
         def rotate_subset(data):
@@ -7959,6 +8110,7 @@ class TestConsistency(TestCase):
         'nonzero': ['f32', 'i16', 'i32', 'i64'],
         'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+		'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         }
 
 
@@ -8180,8 +8332,7 @@ class TestConsistency(TestCase):
         'sigmoid': [torch.int64],
         'slice_scatter': [torch.uint8],
         'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],  # moved from section below
-
-        # count_nonzero returns wrong results for these dtypes
+        'unique': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nonzero': [torch.uint8, torch.float16],
 
         # ALLOW_LIST doesn't know about variants

From 0b533eac5006356823fbf8ac4d58e5af081b7220 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Fri, 4 Nov 2022 16:51:47 -0700
Subject: [PATCH 1811/1922] NLL loss TestConsistency seems to be working.

---
 test/test_mps.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index c305412d748fa..047bc07ca320f 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8111,6 +8111,7 @@ class TestConsistency(TestCase):
         'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
 		'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.nll_loss': ['f32'],
         }
 
 
@@ -8297,7 +8298,6 @@ class TestConsistency(TestCase):
         'masked.sum': [torch.bool],
 
         # Functions that hard crash
-        'nn.functional.nll_loss': [torch.float32],
         'nn.functional.padreflect': [torch.float32], 'nn.functional.padreplicate': [torch.float32],
         'std': [torch.float16],
         'stft': [torch.float32], 'var': [torch.float16],

From 733980f3e92a51ea48f8a695a791d40f769ac02c Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 8 Nov 2022 09:36:59 -0500
Subject: [PATCH 1812/1922] Move several binary ops from block list to allow
 list (#165)

* Adjust the floating point tolerance for TestConsistency
- Also, move several binary ops which were fixed before (casting issue) to allow list

* Revert changes to the tolerances
---
 test/test_mps.py | 46 ++++++++++++++++++----------------------------
 1 file changed, 18 insertions(+), 28 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 047bc07ca320f..67ecd2d04960c 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7938,6 +7938,7 @@ class TestConsistency(TestCase):
         'div': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'eq': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'erf': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'exp': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
@@ -7952,6 +7953,8 @@ class TestConsistency(TestCase):
         'floor_divide': ['f32', 'f16'],
         'frac': ['f16', 'f32'],
         'gradient': ['f16', 'f32', 'i16'],
+        'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'half': ['f16'],
         'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'index_select': ['f32', 'i16', 'i32', 'i64'],
@@ -7962,6 +7965,7 @@ class TestConsistency(TestCase):
         'isnan': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isreal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'kron': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'le': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'linalg.matrix_norm': ['f16'],
         'linalg.svd': ['f32'],
         'linalg.vector_norm': ['f16', 'f32'],
@@ -7979,12 +7983,16 @@ class TestConsistency(TestCase):
         'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'lt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_fill': ['f16', 'i16', 'i32', 'i64'],
         'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'matmul': ['f32'],
+        'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'minimum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'mm': ['f32'],
         'mv': ['f32'],
         'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ne': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.adaptive_max_pool1d': ['f32'],
         'nn.functional.adaptive_max_pool2d': ['f32'],
@@ -7994,19 +8002,10 @@ class TestConsistency(TestCase):
         'nn.functional.conv1d': ['f32'],
         'nn.functional.conv2d': ['f32'],
         'nn.functional.conv_transpose1d': ['f32'],
-        'nn.functional.cosine_embedding_loss': ['b8',
-                                                'f32',
-                                                'i16',
-                                                'i32',
-                                                'i64'],
+        'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64'],
+        'nn.functional.cosine_similarity': ['f32'],
         'nn.functional.elu': ['f32'],
-        'nn.functional.feature_alpha_dropout': ['b8',
-                                                'f16',
-                                                'f32',
-                                                'i16',
-                                                'i32',
-                                                'i64',
-                                                'u8'],
+        'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.gaussian_nll_loss': ['f32'],
         'nn.functional.glu': ['f32'],
         'nn.functional.group_norm': ['f32'],
@@ -8016,11 +8015,13 @@ class TestConsistency(TestCase):
         'nn.functional.instance_norm': ['f32'],
         'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.l1_loss': ['f16', 'f32'],
+        'nn.functional.layer_norm': ['f32'],
         'nn.functional.leaky_relu': ['f32'],
         'nn.functional.linear': ['f32'],
         'nn.functional.local_response_norm': ['f32'],
         'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32'],
         'nn.functional.mse_loss': ['f16', 'f32'],
+        'nn.functional.normalize': ['f32'],
         'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'nn.functional.pairwise_distance': ['f16',
                                             'f32',
@@ -8298,7 +8299,8 @@ class TestConsistency(TestCase):
         'masked.sum': [torch.bool],
 
         # Functions that hard crash
-        'nn.functional.padreflect': [torch.float32], 'nn.functional.padreplicate': [torch.float32],
+        'nn.functional.padreflect': [torch.float32], # negative padding may cause GPU reset
+        'nn.functional.padreplicate': [torch.float32],
         'std': [torch.float16],
         'stft': [torch.float32], 'var': [torch.float16],
         # + forward when requires_grad=True or running backward
@@ -8350,11 +8352,6 @@ class TestConsistency(TestCase):
         'divno_rounding_mode': None,
         'divtrunc_rounding': None,
         'empty': None,
-        'ge': None,
-        'ne': None,
-        'le': None,
-        'lt': None,
-        'gt': None,
         'splitlist_args': None,
         'reshape_as': None,
         'norm': None,
@@ -8370,8 +8367,6 @@ class TestConsistency(TestCase):
         'maxreduction_with_dim': None,
         'maxreduction_no_dim': None,
         'maxbinary': None,
-        'maximum': None,
-        'minimum': None,
         'outer': None,
         'softmaxwith_dtype': None,
         'rounddecimals_neg_3': None,
@@ -8383,7 +8378,6 @@ class TestConsistency(TestCase):
         'log_softmaxwith_dtype': None,
         'split_with_sizes': None,
         'trapezoid': None,
-        'eq': None,
         'inner': None,
         'take_along_dim': None,
 
@@ -8401,7 +8395,7 @@ class TestConsistency(TestCase):
         'amax': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'amin': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
+        'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], # atan2() may generate NAN in output
         'bernoulli': ['torch.float32'],
         'byte': ['torch.float16', 'torch.float32'],
         'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
@@ -8422,7 +8416,6 @@ class TestConsistency(TestCase):
         'gather': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'half': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'index_select': ['torch.uint8'],
         'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'],
         'linalg.eigvals': ['torch.float32'],
         'linalg.multi_dot': ['torch.float32'],
@@ -8437,23 +8430,20 @@ class TestConsistency(TestCase):
         'nn.functional.bilinear': ['torch.float32'],
         'nn.functional.conv_transpose2d': ['torch.float32'],
         'nn.functional.cosine_embedding_loss': ['torch.uint8'],
-        'nn.functional.cosine_similarity': ['torch.float32'],
         'nn.functional.dropout2d': ['torch.float32'],
         'nn.functional.dropout3d': ['torch.float32'],
         'nn.functional.dropout': ['torch.float32'],
         'nn.functional.gelu': ['torch.float32'],
         'nn.functional.interpolate': ['torch.float32', 'torch.float32', 'torch.float32'],
-        'nn.functional.layer_norm': ['torch.float32'],
         'nn.functional.margin_ranking_loss': ['torch.uint8'],
         'nn.functional.max_pool1d': ['torch.float32'],
         'nn.functional.max_pool2d': ['torch.float32'],
-        'nn.functional.normalize': ['torch.float32'],
         'nn.functional.pairwise_distance': ['torch.uint8'],
         'nn.functional.triplet_margin_loss': ['torch.uint8'],
         'nn.functional.triplet_margin_with_distance_loss': ['torch.uint8'],
         'nn.functional.upsample_nearest': ['torch.float32'],
         'normal': ['torch.float16', 'torch.float32', 'torch.float16', 'torch.float32'],
-        'pow': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'],
+        'pow': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'], # pow() with integer input may return wrong results
         'prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'rand_like': ['torch.float16', 'torch.float32'],
@@ -8468,7 +8458,7 @@ class TestConsistency(TestCase):
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.local_response_norm': ['torch.int64'],
-        'index_select': ['torch.bool'],
+        'index_select': ['torch.bool', 'torch.uint8'],
         'repeat': ['torch.bool'],
         'rot90': ['torch.bool'],
         'tile': ['torch.bool'],

From a3e9ff551e89ae230b901fdcfb4d4ee69448d84f Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 8 Nov 2022 13:56:11 -0800
Subject: [PATCH 1813/1922] Move passing tests to ALLOWLIST_OP (#168)

* Add passing tests to ALLOWLIST_OP

* Remove tab indentation
---
 test/test_mps.py | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 67ecd2d04960c..7c60b823fe43f 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7908,7 +7908,7 @@ class TestConsistency(TestCase):
         'bitwise_or': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'bitwise_right_shift': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_xor': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'bmm': ['f32'],
         'broadcast_shapes': ['f32'],
         'ceil': ['f32', 'int32', 'int64', 'f16'],
@@ -7930,9 +7930,9 @@ class TestConsistency(TestCase):
         'cumsum': ['f16', 'f32', 'int16', 'int32'],
         'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diag': ['f32', 'i32'],
-        'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diagflat': ['f32', 'i32'],
-        'diagonal_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'diagonal_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diff': ['f16', 'f32', 'i16', 'i32', 'i64'],
         'dist': ['f32'],
         'div': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -7971,6 +7971,7 @@ class TestConsistency(TestCase):
         'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'log': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'log': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'log10': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'log1p': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'log2': ['b8', 'f32', 'i16', 'i32', 'u8'],
@@ -8006,6 +8007,7 @@ class TestConsistency(TestCase):
         'nn.functional.cosine_similarity': ['f32'],
         'nn.functional.elu': ['f32'],
         'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.embedding': ['f16', 'f32'],
         'nn.functional.gaussian_nll_loss': ['f32'],
         'nn.functional.glu': ['f32'],
         'nn.functional.group_norm': ['f32'],
@@ -8018,11 +8020,12 @@ class TestConsistency(TestCase):
         'nn.functional.layer_norm': ['f32'],
         'nn.functional.leaky_relu': ['f32'],
         'nn.functional.linear': ['f32'],
-        'nn.functional.local_response_norm': ['f32'],
+        'nn.functional.local_response_norm': ['f32', 'i64'],
         'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32'],
         'nn.functional.mse_loss': ['f16', 'f32'],
         'nn.functional.normalize': ['f32'],
-        'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.padcircular': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.pairwise_distance': ['f16',
                                             'f32',
                                             'i16',
@@ -8068,14 +8071,14 @@ class TestConsistency(TestCase):
         'rot90': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
         'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'short': ['i16'],
         'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'],
         'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'softmax': ['f32'],
         'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8111,8 +8114,9 @@ class TestConsistency(TestCase):
         'nonzero': ['f32', 'i16', 'i32', 'i64'],
         'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-		'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.nll_loss': ['f32'],
+        'byte': ['b8', 'i16', 'i32', 'i64', 'u8'],
         }
 
 
@@ -8225,7 +8229,7 @@ class TestConsistency(TestCase):
         'nn.functional.kl_div': ['f32'],
         'nn.functional.l1_loss': ['f16', 'f32'],
         'nn.functional.leaky_relu': ['f32'],
-        'nn.functional.local_response_norm': ['f32'],
+        'nn.functional.local_response_norm': ['f32', 'i64'],
         'nn.functional.margin_ranking_loss': ['f32'],
         'nn.functional.mse_loss': ['f32'],
         'nn.functional.pad': ['f16', 'f32', 'i16', 'i32', 'i64'],
@@ -8305,7 +8309,6 @@ class TestConsistency(TestCase):
         'stft': [torch.float32], 'var': [torch.float16],
         # + forward when requires_grad=True or running backward
         'index_select': [torch.float16],
-        'nn.functional.embedding': [torch.float32, torch.float16],
         '__rpow__': [torch.int64],
         'masked.std': [torch.int32],
         'masked.var': [torch.int32],
@@ -8313,9 +8316,7 @@ class TestConsistency(TestCase):
         'atan2': [torch.int64],
         'bfloat16': None,
         'block_diag': [torch.uint8],
-        'byte': None,
         'chalf': None,
-        'diag_embed': [torch.uint8],
         'diagonal_scatter': [torch.uint8],
         'index_add': None,
         'long': None,
@@ -8327,12 +8328,9 @@ class TestConsistency(TestCase):
         'nn.functional.conv_transpose2d': [torch.int64],
         'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
         'nn.functional.huber_loss': [torch.float16],
-        'nn.functional.local_response_norm': [torch.int64],
-        'nn.functional.padcircular': [torch.uint8],
+        'nn.functional.softplus': [torch.float32],
         'pow': [torch.int64],
-        'select_scatter': [torch.uint8],
         'sigmoid': [torch.int64],
-        'slice_scatter': [torch.uint8],
         'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],  # moved from section below
         'unique': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nonzero': [torch.uint8, torch.float16],

From 3670890b6ac0c969afcbb6bbf70b55b014b75394 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 8 Nov 2022 17:25:17 -0500
Subject: [PATCH 1814/1922] Fix padding dimension issues with constant_pad_nd
 (#169)

- Put constant value in the key
- Fix crash when grad_output.numel() is 0
- Fill output tensor when input.numel() is 0
---
 aten/src/ATen/native/mps/operations/Pad.mm | 79 +++++++++++++---------
 test/test_mps.py                           |  9 ++-
 2 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm
index 6d8ee3b6e5d04..990f477b389d8 100644
--- a/aten/src/ATen/native/mps/operations/Pad.mm
+++ b/aten/src/ATen/native/mps/operations/Pad.mm
@@ -13,7 +13,7 @@
                          MPSGraphPaddingMode mode, double constantValue, const string op_name)
 {
   const int padding_size = (int) padding.size();
-  const int padding_dim = padding_size / 2; // either 1D, 2D, or 3D
+  int padding_dim = padding_size / 2; // either 1D, 2D, or 3D
 
   TORCH_CHECK(padding_size == 2 || padding_size == 4 || padding_size == 6,
               "invalid padding argument of size ", padding_size);
@@ -23,33 +23,44 @@
 
   int64_t nbatch = 1;
   int64_t ndims = input_.ndimension();
+
+  TORCH_CHECK(ndims >= (int64_t)padding_dim, "Length of pad should be no more than twice the number of "
+              "dimensions of the input. Pad length is ", padding_size, "while the input has ", ndims, "dimensions.");
+
   // number of input dims with ConstantPad could be less than 2
-  int dim_w = ndims > 1 ? padding_dim : 0;
+  int dim_w = padding_dim;
   int dim_h = padding_dim - 1;
   int dim_d = padding_dim - 2;
   int dim_slices = 0;
 
-  if (!is_backward_pass && ndims > 1) {
+  if (!is_backward_pass && mode != MPSGraphPaddingModeConstant && ndims > padding_dim) {
     bool valid_dims = input_.size(1) != 0 && input_.size(padding_dim) != 0;
     TORCH_CHECK((ndims == 1 + padding_dim && valid_dims) ||
                 (ndims == 2 + padding_dim && valid_dims && input_.size(1 + padding_dim) != 0),
                 "3D or 4D (batch mode) tensor expected for input, but got: ", input_);
   }
 
-  if (ndims == 2 + padding_dim) {
-    nbatch = input_.size(0);
-    dim_w++;
-    dim_h++;
-    dim_d++;
+  if (ndims == padding_dim) {
+    dim_w--;
+    dim_h--;
+    dim_d--;
+  } else if (ndims > padding_dim + 1) {
+    const int dim_diff = (int)ndims - padding_dim - 1;
+    // this virtually inflates the padding with zeros if ndims > padding_dim + 2
+    padding_dim += dim_diff - 1;
+    dim_w += dim_diff;
+    dim_h += dim_diff;
+    dim_d += dim_diff;
     dim_slices++;
+    nbatch = input_.size(0);
   }
 
   int64_t pad_l = padding[0];
   int64_t pad_r = padding[1];
-  int64_t pad_t = padding_dim > 1 ? padding[2] : 0;
-  int64_t pad_b = padding_dim > 1 ? padding[3] : 0;
-  int64_t pad_front = padding_dim > 2 ? padding[4] : 0;
-  int64_t pad_back  = padding_dim > 2 ? padding[5] : 0;
+  int64_t pad_t = padding_size > 2 ? padding[2] : 0;
+  int64_t pad_b = padding_size > 2 ? padding[3] : 0;
+  int64_t pad_front = padding_size > 4 ? padding[4] : 0;
+  int64_t pad_back  = padding_size > 4 ? padding[5] : 0;
 
   int64_t nplane = input_.size(dim_slices);
   int64_t input_w = input_.size(dim_w);
@@ -86,25 +97,26 @@
       "input (H: ", input_h, ", W: ", input_w, ") is too small. Calculated "
       "output H: ", output_h, " W: ", output_w);
 
-    if (ndims == 1 + padding_dim) {
-      if (padding_dim == 3)
-        output.resize_({nplane, output_d, output_h, output_w});
-      else if (padding_dim == 2)
-        output.resize_({nplane, output_h, output_w});
-      else
-        output.resize_({nplane, output_w});
-    } else {
-      if (padding_dim == 3)
-        output.resize_({nbatch, nplane, output_d, output_h, output_w});
-      else if (padding_dim == 2)
-        output.resize_({nbatch, nplane, output_h, output_w});
-      else if (ndims > 1)
-        output.resize_({nbatch, nplane, output_w});
-      else
-        output.resize_({output_w});
+    std::vector<int64_t> outputSizes;
+    outputSizes.insert(outputSizes.begin(), output_w);
+    if (padding_dim >= 2)
+      outputSizes.insert(outputSizes.begin(), output_h);
+    if (padding_dim >= 3)
+      outputSizes.insert(outputSizes.begin(), output_d);
+    if (ndims >= 1 + padding_dim)
+      outputSizes.insert(outputSizes.begin(), nplane);
+    if (ndims >= 2 + padding_dim)
+      outputSizes.insert(outputSizes.begin(), nbatch);
+
+    output.resize_(outputSizes);
+
+    if (output.numel() == 0) {
+      return output;
     }
-    if (output.numel() == 0 || input_.numel() == 0)
+    if (input_.numel() == 0) {
+      output.fill_(constantValue);
       return output;
+    }
     input = input_.contiguous();
   } else {
     TORCH_CHECK(output_w == grad_output_.size(dim_w),
@@ -113,6 +125,9 @@
       TORCH_CHECK(output_h == grad_output_.size(dim_h),
         "gradOutput height unexpected. Expected: ", output_h, ", Got: ", grad_output_.size(dim_h));
     }
+    output.resize_as_(input);
+    if (output.numel() == 0 || grad_output_.numel() == 0)
+      return output;
     grad_output = grad_output_.contiguous();
   }
 
@@ -139,10 +154,8 @@
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
   @autoreleasepool {
-    string key = op_name + getTensorsStringKey({input, grad_output}) +
-                           ":L" + to_string(pad_l)     + ":R" + to_string(pad_r) +
-                           ":T" + to_string(pad_t)     + ":B" + to_string(pad_b) +
-                           ":F" + to_string(pad_front) + ":K" + to_string(pad_back);
+    string key = op_name + getTensorsStringKey({input, grad_output, output}) + ":[" +
+                           getArrayRefString(padding) + "]:" + std::to_string(constantValue);
 
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
diff --git a/test/test_mps.py b/test/test_mps.py
index 7c60b823fe43f..175f217236d6f 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4100,6 +4100,12 @@ def helper(shape, padding, op, value=0):
         helper((1, 2, 3), (0, 0, 0, 1), nn.ConstantPad2d)
         # pad dims < input dims
         helper((50, 9, 300), (0, 0, 0, 31), nn.ConstantPad2d)
+        # pad dims == input dims
+        helper((1, 3), (0, 2, 0, 1), nn.ConstantPad2d)
+        # input.numel() == 0 but output.numel() > 0
+        helper((0, 3, 3), (1, 1, 1, 1, 1, 1), nn.ConstantPad2d)
+        # pad dims < input dims - 2
+        helper((1, 2, 3, 4), (1, 2), nn.ConstantPad2d)
 
         # 3D Padding
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReflectionPad3d)
@@ -4108,6 +4114,7 @@ def helper(shape, padding, op, value=0):
         # Constant Pad 3D
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
 
+
     # Test stack forward
     def test_stack(self):
         # All shapes must be same
@@ -7922,6 +7929,7 @@ class TestConsistency(TestCase):
         'combinations': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'constant_pad_nd': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'corrcoef': ['f32'],
         'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
@@ -8397,7 +8405,6 @@ class TestConsistency(TestCase):
         'bernoulli': ['torch.float32'],
         'byte': ['torch.float16', 'torch.float32'],
         'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
-        'constant_pad_nd': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'diff': ['torch.bool', 'torch.uint8'],
         'eig': ['torch.float32'],

From 66c199c8c62cc59dff66a9979184125ca28b5381 Mon Sep 17 00:00:00 2001
From: skotapati <siddharth.kotapati@gmail.com>
Date: Wed, 9 Nov 2022 10:29:54 -0800
Subject: [PATCH 1815/1922] Dev/skotapati/inverse (#167)

* Add Inverse op plus basic test run

* Resolved non-contiguous output

* Formatting changes
---
 .../src/ATen/native/mps/operations/Inverse.mm | 78 +++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |  1 +
 test/test_mps.py                              | 15 ++++
 3 files changed, 94 insertions(+)
 create mode 100644 aten/src/ATen/native/mps/operations/Inverse.mm

diff --git a/aten/src/ATen/native/mps/operations/Inverse.mm b/aten/src/ATen/native/mps/operations/Inverse.mm
new file mode 100644
index 0000000000000..63a7f0376bb2d
--- /dev/null
+++ b/aten/src/ATen/native/mps/operations/Inverse.mm
@@ -0,0 +1,78 @@
+#include <ATen/ATen.h>
+#include <ATen/native/mps/OperationUtils.h>
+#include <torch/library.h>
+#include <c10/util/Optional.h>
+
+
+namespace at {
+namespace native {
+
+TORCH_IMPL_FUNC(linalg_inv_ex_out_mps)(const Tensor& A, bool check_errors, const Tensor& result, const Tensor& info)
+{
+    TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
+
+    using namespace mps;
+    MPSStream* stream = getCurrentMPSStream();
+    info.zero_();
+
+    struct CachedGraph : public MPSCachedGraph
+    {
+        CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+        MPSGraphTensor* inputTensor_ = nil;
+        MPSGraphTensor* outputTensor_ = nil;
+    };
+
+    Tensor output = result;
+    bool isContiguous = true;
+    if (!result.is_contiguous()) {
+        output = result.contiguous();
+        isContiguous = false;
+    }
+
+    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+    @autoreleasepool {
+        string key = "inv_out_mps" + getTensorsStringKey({A});
+        CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+        if(!cachedGraph)
+        {
+            MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+                CachedGraph *newCachedGraph = nil;
+                
+                @autoreleasepool {
+                    MPSGraph* mpsGraph = make_mps_graph();
+                    newCachedGraph = new CachedGraph(mpsGraph);
+                    MPSGraphTensor* inputTensor= mpsGraphRankedPlaceHolder(mpsGraph, A);
+                    MPSGraphTensor* outputTensor = [mpsGraph inverseOfTensor: inputTensor
+                                                                    name: nil];
+
+                    newCachedGraph->inputTensor_ = inputTensor;
+                    newCachedGraph->outputTensor_ = outputTensor;
+                }
+
+                return newCachedGraph;
+
+            });
+            cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+        }
+
+        Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, A);
+        Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, isContiguous ? result : output);
+
+        NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+            inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()
+        };
+
+        NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+            outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+        };
+
+        runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+        if (!isContiguous) {
+            result.copy_(output);
+        }
+    }
+}
+}
+}
\ No newline at end of file
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 7fa80d3fd16ee..519215dc3da20 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -12901,6 +12901,7 @@
   structured: True
   dispatch:
     CPU, CUDA: linalg_inv_ex_out
+    MPS: linalg_inv_ex_out_mps
 
 - func: linalg_inv(Tensor A) -> Tensor
   python_module: linalg
diff --git a/test/test_mps.py b/test/test_mps.py
index 175f217236d6f..62da8d70b6f69 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -5117,6 +5117,21 @@ def helper(shape, diag=0):
         helper((2, 8, 4, 5), diag=-2)
         helper((2, 8, 4, 5), diag=-3)
 
+    # Test inverse
+    def test_inverse(self):
+        def helper(n):
+            cpu_input = torch.randn(n, n, device='cpu')
+            mps_input = cpu_input.to('mps')
+
+            cpu_result = torch.linalg.inv(cpu_input)
+            mps_result = torch.linalg.inv(mps_input)
+            self.assertEqual(cpu_result, mps_result)
+
+        helper(2)
+        helper(6)
+        helper(3)
+        helper(8)
+
     # Test tril
     def test_tril(self):
         def helper(shape, diag=0):

From 507cb593822c4b37024114a44c616183aa50adfa Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 9 Nov 2022 10:31:52 -0800
Subject: [PATCH 1816/1922] Fix gather for uint8 data type (index_select)
 (#170)

* Fix gather for uint8 data type (index_select)

* Address PR comments
---
 aten/src/ATen/native/mps/OperationUtils.h       |  2 +-
 aten/src/ATen/native/mps/OperationUtils.mm      |  5 +++--
 aten/src/ATen/native/mps/operations/Indexing.mm | 13 +++++++++----
 test/test_mps.py                                |  4 +---
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index cf4e39a1c9c99..57cf7b3291b40 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -64,7 +64,7 @@ class Placeholder {
  public:
   Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {}
   Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {}
-  Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr, bool gatherTensorData = true);
+  Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr, bool gatherTensorData = true, MPSDataType dataType = MPSDataTypeInvalid);
   MPSGraphTensor* getMPSGraphTensor() {
     return _placeholder;
   }
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 66ede5304f36f..016a96ec3ac9f 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -200,7 +200,7 @@ void printTensorNDArray(const Tensor& t) {
   return [tmpGraphTensorData mpsndarray];
 }
 
-Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSShape *mpsShape, bool gatherTensorData) : _tensor(src)
+Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSShape *mpsShape, bool gatherTensorData, MPSDataType dataType) : _tensor(src)
 {
   TORCH_CHECK(src.is_mps(), "Placeholder storage has not been allocated on MPS device!");
   // extract the pointer to MTLBuffer from the Tensor's storage
@@ -222,7 +222,8 @@ void printTensorNDArray(const Tensor& t) {
   // if buffer size is zero in here, it's not a user error. It could be a missing check for
   // tensor.numel() == 0 in our internal implementations of ops.
   TORCH_INTERNAL_ASSERT([srcBuf length] > 0, "Placeholder tensor is empty!");
-  const MPSDataType mpsDataType = _tensor.dim() == 0 ? getMPSScalarType(_tensor.scalar_type()) : getMPSDataType(_tensor.scalar_type());
+  const MPSDataType mpsDataType = dataType != MPSDataTypeInvalid ? dataType :
+                      _tensor.dim() == 0 ? getMPSScalarType(_tensor.scalar_type()) : getMPSDataType(_tensor.scalar_type());
 
   if (src.is_view() && src.is_contiguous() && src.storage_offset()) {
     _value = getMPSGraphTensorDataForView(src, mpsShape, mpsDataType);
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index be74750879769..592de5ca468ea 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -634,7 +634,10 @@ Tensor index_select_mps(const Tensor & self,
   };
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
+  auto inputType = getMPSDataType(self.scalar_type());
+  if (inputType ==  MPSDataTypeUInt8) {
+      inputType =  MPSDataTypeInt8;
+  }
   @autoreleasepool {
 
     string key = "index_select_out_mps" + getTensorsStringKey({self, index}) + ":" + std::to_string(dim);
@@ -648,7 +651,7 @@ Tensor index_select_mps(const Tensor & self,
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputType, getMPSShape(self));
           MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, index);
 
           MPSGraphTensor* outputTensor = [mpsGraph gatherWithUpdatesTensor:inputTensor
@@ -665,9 +668,11 @@ Tensor index_select_mps(const Tensor & self,
       });
     }
 
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self,
+                                  /*mpsShape=*/nullptr, /*gatherTensorData=*/true, /*dataType=*/inputType);
     Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output,
+                                  /*mpsShape=*/nullptr, /*gatherTensorData=*/false, /*dataType=*/inputType);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
diff --git a/test/test_mps.py b/test/test_mps.py
index 62da8d70b6f69..cd202bda5e2a2 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7980,7 +7980,7 @@ class TestConsistency(TestCase):
         'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'half': ['f16'],
         'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'index_select': ['f32', 'i16', 'i32', 'i64'],
+        'index_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'int': ['i32'],
         'isclose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isfinite': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8331,7 +8331,6 @@ class TestConsistency(TestCase):
         'std': [torch.float16],
         'stft': [torch.float32], 'var': [torch.float16],
         # + forward when requires_grad=True or running backward
-        'index_select': [torch.float16],
         '__rpow__': [torch.int64],
         'masked.std': [torch.int32],
         'masked.var': [torch.int32],
@@ -8478,7 +8477,6 @@ class TestConsistency(TestCase):
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.local_response_norm': ['torch.int64'],
-        'index_select': ['torch.bool', 'torch.uint8'],
         'repeat': ['torch.bool'],
         'rot90': ['torch.bool'],
         'tile': ['torch.bool'],

From 5cfba91d2966b4fb45e266b403a3a67dba96736a Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 9 Nov 2022 12:03:19 -0800
Subject: [PATCH 1817/1922] Add support for cdist on mps (#164)

* Add support for cdist

* Address PR comments
---
 aten/src/ATen/native/mps/OperationUtils.h     |   7 +
 .../ATen/native/mps/operations/ReduceOps.mm   | 130 ++++++++++++++--
 aten/src/ATen/native/native_functions.yaml    |   1 +
 test/test_mps.py                              | 145 ++++++++++++++++++
 4 files changed, 271 insertions(+), 12 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 57cf7b3291b40..93e45d24c220a 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -129,6 +129,13 @@ struct MPSUnaryCachedGraph : public MPSCachedGraph
   MPSGraphTensor *outputTensor_ = nil;
 };
 
+struct MPSBinaryCachedGraph : public MPSCachedGraph
+{
+  MPSBinaryCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+  MPSGraphTensor *inputTensor_ = nil;
+  MPSGraphTensor *otherTensor_ = nil;
+  MPSGraphTensor *outputTensor_ = nil;
+};
 
 // TODO: Improve the overall design of MPSGraphCache.
 // https://github.com/pytorch/pytorch/issues/77176
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index d186ff01b784f..0633949674b55 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -14,6 +14,9 @@
 namespace at {
 namespace native {
 
+typedef MPSGraphTensor* (^NormOpBlock)(mps::MPSBinaryCachedGraph*, MPSGraphTensor*, MPSGraphTensor*);
+#define NormOpFn(graph, primary, secondary) MPSGraphTensor* (mps::MPSBinaryCachedGraph* graph, MPSGraphTensor* primary, MPSGraphTensor* secondary)
+
 enum StdVarType {
   STANDARD_VARIANCE,
   STANDARD_DEVIATION
@@ -427,11 +430,16 @@ Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
 
 void impl_func_norm_mps(
     const Tensor& input_tensor,
+    const Tensor& other_tensor,
     const OptionalScalarRef& opt_p,
     IntArrayRef dim,
     bool keepdim,
     optional<ScalarType> opt_dtype,
-    const Tensor& output_t) {
+    const Tensor& output_t,
+    bool cdist = false,
+    c10::optional<IntArrayRef> input_broadcasted_shape = c10::nullopt,
+    NormOpBlock normOpBlock = nullptr
+    ) {
 
   namespace native_mps = at::native::mps;
   if (input_tensor.numel() == 0)
@@ -441,7 +449,7 @@ void impl_func_norm_mps(
   auto in_dtype = opt_dtype.value_or(input_tensor.scalar_type());
   auto mps_input_dtype = native_mps::getMPSDataType(in_dtype);
 
-  IntArrayRef input_shape = input_t.sizes();
+  IntArrayRef input_shape = cdist ? input_broadcasted_shape.value() : input_t.sizes();
 
   for(int i = 0; i < dim.size(); i++) {
     auto wrap_dim = maybe_wrap_dim(dim[i], input_shape.size());
@@ -449,7 +457,7 @@ void impl_func_norm_mps(
     "norm_out_mps: reduction dim must be in the range of input shape")
   }
 
-  using CachedGraph = native_mps::MPSUnaryCachedGraph;
+  using CachedGraph = native_mps::MPSBinaryCachedGraph;
 
   native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
 
@@ -479,6 +487,12 @@ void impl_func_norm_mps(
                       num_output_dims,
                       input_shape,
                       axes);
+
+  if (cdist) {
+    apparent_input_shape  = [mps::getMPSShape(input_tensor.sizes()) mutableCopy];
+    apparent_output_shape = [mps::getMPSShape(output_t.sizes()) mutableCopy];
+  }
+
   if (output_t.numel() == 0) {
     return;
   }
@@ -488,7 +502,8 @@ void impl_func_norm_mps(
   @autoreleasepool {
     NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","];
       string keepdim_info = (keepdim) ? "keepdim=1" : "keepdim=0";
-      string key =  string("norm_out_mps:") + [ns_key UTF8String] + ":" + native_mps::getMPSTypeString(input_t.scalar_type()) + ":p" + to_string(p) + ":" + keepdim_info;
+      string tensor_key = cdist ? native_mps::getTensorsStringKey({input_tensor, other_tensor}) : mps::getTensorsStringKey({input_t});
+      string key =  string("norm_out_mps:") + [ns_key UTF8String] + ":" + tensor_key + ":p" + to_string(p) + ":" + keepdim_info;
 
     auto cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
@@ -501,8 +516,15 @@ void impl_func_norm_mps(
           MPSGraph* mpsGraph = native_mps::make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor_ = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
-          MPSGraphTensor* inputTensor = inputTensor_;
+          if (cdist) {
+            newCachedGraph->inputTensor_ = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_tensor);
+            newCachedGraph->otherTensor_ = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, other_tensor);
+          } else {
+            newCachedGraph->inputTensor_ = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
+          }
+
+          MPSGraphTensor* inputTensor = cdist ? normOpBlock(newCachedGraph, newCachedGraph->inputTensor_, newCachedGraph->otherTensor_) :
+                                                newCachedGraph->inputTensor_;
           if (opt_dtype.has_value()) {
             inputTensor = [mpsGraph castTensor:inputTensor
                                          toType:mps_input_dtype
@@ -564,7 +586,10 @@ void impl_func_norm_mps(
                                                          name:nil];
           }
 
-          newCachedGraph->inputTensor_ = inputTensor_;
+          if (cdist) {
+            outputTensor= [mpsGraph reshapeTensor:outputTensor withShape:mps::getMPSShape(output_t) name: nil];
+          }
+
           newCachedGraph->outputTensor_ = outputTensor;
         }
         return newCachedGraph;
@@ -572,6 +597,7 @@ void impl_func_norm_mps(
     }
 
     auto inputPlaceholder = native_mps::Placeholder();
+    auto otherPlaceholder = native_mps::Placeholder();
 
     if(apparent_input_shape)
       inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape);
@@ -580,10 +606,13 @@ void impl_func_norm_mps(
 
     auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape);
 
+    NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =[NSMutableDictionary dictionary];
+    feeds[inputPlaceholder.getMPSGraphTensor()]   = inputPlaceholder.getMPSGraphTensorData();
 
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
-      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-    };
+    if (cdist) {
+      otherPlaceholder = native_mps::Placeholder(cachedGraph->otherTensor_, other_tensor);
+      feeds[otherPlaceholder.getMPSGraphTensor()] = otherPlaceholder.getMPSGraphTensorData();
+    }
 
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
@@ -599,7 +628,7 @@ void impl_func_norm_mps(
  IntArrayRef dim,
  bool keepdim,
  const Tensor& result) {
-  impl_func_norm_mps(self, opt_p, dim, keepdim, c10::nullopt, result);
+  impl_func_norm_mps(self, self, opt_p, dim, keepdim, c10::nullopt, result, /*cdist=*/false);
 }
 
 TORCH_IMPL_FUNC(norm_dtype_out_mps)
@@ -609,7 +638,84 @@ void impl_func_norm_mps(
  bool keepdim,
  ScalarType dtype,
  const Tensor& result) {
-  impl_func_norm_mps(self, opt_p, dim, keepdim, dtype, result);
+  impl_func_norm_mps(self, self, opt_p, dim, keepdim, dtype, result, /*cdist=*/false);
+}
+
+Tensor _cdist_forward_mps(const Tensor& x1, const Tensor& x2, const double p, c10::optional<int64_t> compute_mode) {
+  using namespace mps;
+  TORCH_CHECK(x1.dim() >= 2, "cdist only supports at least 2D tensors, X1 got: ", x1.dim(), "D");
+  TORCH_CHECK(x2.dim() >= 2, "cdist only supports at least 2D tensors, X2 got: ", x2.dim(), "D");
+  TORCH_CHECK(x1.size(-1) == x2.size(-1), "X1 and X2 must have the same number of columns. X1: ", x1.size(-1), " X2: ", x2.size(-1));
+  TORCH_CHECK(at::isFloatingType(x1.scalar_type()), "cdist only supports floating-point dtypes, X1 got: ", x1.scalar_type());
+  auto device1 = x1.device().type();
+  TORCH_CHECK(at::isFloatingType(x2.scalar_type()), "cdist only supports floating-point dtypes, X2 got: ", x2.scalar_type());
+  auto device2 = x2.device().type();
+  TORCH_CHECK(p >= 0, "cdist only supports non-negative p values");
+  TORCH_CHECK(device1 == device2, "X1 and X2 must have the same device type. X1: ", device1, " X2: ", device2);
+  TORCH_CHECK(x1.is_mps() && (x1.get_device() == x2.get_device()), "device of X1 (", x1.get_device(), ") must match device of X2 (", x2.get_device(), ")");
+
+  int64_t c1 = x1.size(-1);
+  int64_t c2 = x2.size(-1);
+
+  auto dim1 = x1.dim();
+  auto dim2 = x2.dim();
+  int64_t mode = compute_mode.value_or(0);
+  TORCH_CHECK(mode >= 0 && mode <= 2, "possible modes: 0, 1, 2, but was: ", mode);
+
+  int64_t r1 = x1.size(-2);
+  int64_t r2 = x2.size(-2);
+
+  //For batch calculation we expand all dimensions(except the last two) to one, with size that equals to product of them.
+  //The last two dimensions will stay the same
+  IntArrayRef batch_tensor1(x1.sizes().data(), dim1 - 2);
+  IntArrayRef batch_tensor2(x2.sizes().data(), dim2 - 2);
+  std::vector<int64_t> expand_batch_portion = infer_size(batch_tensor1, batch_tensor2);
+  std::vector<int64_t> tensor1_expand_size(expand_batch_portion);
+  tensor1_expand_size.insert(tensor1_expand_size.end(), {r1, c1});
+  std::vector<int64_t> tensor2_expand_size(expand_batch_portion);
+  tensor2_expand_size.insert(tensor2_expand_size.end(), {r2, c2});
+
+  const int64_t expand_batch_product = c10::multiply_integers(expand_batch_portion);
+  std::vector<int64_t> tensor1_view{expand_batch_product, r1, c1};
+  std::vector<int64_t> tensor2_view{expand_batch_product, r2, c2};
+
+  std::vector<int64_t> output_shape(expand_batch_portion);
+  output_shape.insert(output_shape.end(), {r1, r2});
+  Tensor result = at::empty(output_shape, x1.options());
+
+  NormOpBlock norm_op_block = ^NormOpFn(cachedGraph, x1Tensor, x2Tensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+
+    MPSGraphTensor* inputBroadcast = [mpsGraph broadcastTensor:x1Tensor toShape:getMPSShape(tensor1_expand_size) name:nil];
+    MPSGraphTensor* inputBroadcastReshape = [mpsGraph reshapeTensor:inputBroadcast withShape:getMPSShape(tensor1_view) name:nil];
+
+    MPSGraphTensor* otherBroadcast = [mpsGraph broadcastTensor:x2Tensor toShape:getMPSShape(tensor2_expand_size) name:nil];
+    MPSGraphTensor* otherBroadcastReshape = [mpsGraph reshapeTensor:otherBroadcast withShape:getMPSShape(tensor2_view) name:nil];
+
+    NSMutableArray<MPSGraphTensor*> *inputArray = [NSMutableArray arrayWithCapacity:tensor1_view[1]];
+    NSMutableArray<MPSGraphTensor*> *otherArray = [NSMutableArray arrayWithCapacity:tensor2_view[1]];
+
+    for (const auto i : c10::irange(tensor2_view[1])) {
+      inputArray[i] = inputBroadcastReshape;
+    }
+
+    for (const auto i : c10::irange(tensor1_view[1])) {
+      otherArray[i] = otherBroadcastReshape;
+    }
+
+    MPSGraphTensor *inputTensorReshaped = [mpsGraph concatTensors:inputArray dimension:1 interleave:YES name:nil];
+    MPSGraphTensor *otherTensorReshaped = [mpsGraph concatTensors:otherArray dimension:1 interleave:NO name:nil];
+
+
+    MPSGraphTensor *inputTensorPNorm = [mpsGraph subtractionWithPrimaryTensor: inputTensorReshaped
+                                                              secondaryTensor: otherTensorReshaped
+                                                                         name: nil];
+    return inputTensorPNorm;
+  };
+
+  c10::optional<IntArrayRef> inputBroadcastSize = c10::make_optional(makeArrayRef(tensor1_view.data(), tensor1_view.size()));
+  impl_func_norm_mps(x1, x2, OptionalScalarRef(p), makeArrayRef<int64_t>(2), false, c10::nullopt, result, /*cdist=*/true, inputBroadcastSize, norm_op_block);
+  return result;
 }
 
 Tensor std_var_common_impl_mps(
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 519215dc3da20..49257e5a2f04d 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -4085,6 +4085,7 @@
 - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor
   dispatch:
     CPU, CUDA: _cdist_forward
+    MPS: _cdist_forward_mps
   autogen: _cdist_forward.out
 
 - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor
diff --git a/test/test_mps.py b/test/test_mps.py
index cd202bda5e2a2..b12dfd6206095 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -348,6 +348,151 @@ def helper(dtype):
             self.assertEqual(res2, res2_cpu)
         [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]]
 
+    def test_cdist_large(self, device="mps"):
+        for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
+            x = torch.randn(1000, 10, device=device)
+            y = torch.randn(1000, 10, device=device)
+            actual = torch.cdist(x, y, p=2, compute_mode=cm)
+            expected = self._brute_cdist(x, y, p=2)
+            self.assertEqual(expected, actual)
+
+    def test_cdist_large_batch(self, device="mps"):
+        for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
+            x = torch.randn(4, 3, 1000, 10, device=device)
+            y = torch.randn(4, 3, 1000, 10, device=device)
+            actual = torch.cdist(x, y, p=2, compute_mode=cm)
+            expected = self._brute_cdist(x, y, p=2)
+            self.assertEqual(expected, actual)
+
+    def test_cdist_non_contiguous(self, device="mps"):
+        for cm in ['use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
+            x = torch.randn(5, 7, device=device).mT
+            y = torch.randn(5, 3, device=device).mT
+            actual = torch.cdist(x, y, p=2, compute_mode=cm)
+            expected = self._brute_cdist(x, y, p=2)
+            self.assertFalse(x.is_contiguous())
+            self.assertFalse(y.is_contiguous())
+            self.assertEqual(expected, actual)
+
+            x = torch.randn(7, 5, device=device)
+            y = torch.randn(5, 3, device=device).t()
+            actual = torch.cdist(x, y, p=2, compute_mode=cm)
+            expected = self._brute_cdist(x, y, p=2)
+            self.assertTrue(x.is_contiguous())
+            self.assertFalse(y.is_contiguous())
+            self.assertEqual(expected, actual)
+
+            x = torch.randn(5, 7, device=device).t()
+            y = torch.randn(3, 5, device=device)
+            actual = torch.cdist(x, y, p=2, compute_mode=cm)
+            expected = self._brute_cdist(x, y, p=2)
+            self.assertFalse(x.is_contiguous())
+            self.assertTrue(y.is_contiguous())
+            self.assertEqual(expected, actual)
+
+    def test_cdist_non_contiguous_batch(self, device="mps"):
+        for cm in ['use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
+            x = torch.randn(4, 3, 2, 5, 7, device=device).mT
+            y = torch.randn(4, 3, 2, 5, 3, device=device).mT
+            actual = torch.cdist(x, y, p=2, compute_mode=cm)
+            expected = self._brute_cdist(x, y, p=2)
+            self.assertFalse(x.is_contiguous())
+            self.assertFalse(y.is_contiguous())
+            self.assertEqual(expected, actual)
+
+            x = torch.randn(7, 2, 7, 5, device=device)
+            y = torch.randn(7, 2, 5, 3, device=device).mT
+            actual = torch.cdist(x, y, p=2, compute_mode=cm)
+            expected = self._brute_cdist(x, y, p=2)
+            self.assertTrue(x.is_contiguous())
+            self.assertFalse(y.is_contiguous())
+            self.assertEqual(expected, actual)
+
+            x = torch.randn(4, 5, 7, device=device).mT
+            y = torch.randn(4, 3, 5, device=device)
+            actual = torch.cdist(x, y, p=2, compute_mode=cm)
+            expected = self._brute_cdist(x, y, p=2)
+            self.assertFalse(x.is_contiguous())
+            self.assertTrue(y.is_contiguous())
+            self.assertEqual(expected, actual)
+
+    def test_cdist_euclidean_large(self, device="mps"):
+        def _test_euclidean_large_cdist(sizex, sizey=None):
+            if sizey is None:
+                sizey = sizex
+            x = torch.randn(sizex, device=device, dtype=torch.float)
+            y = torch.randn(sizey, device=device, dtype=torch.float)
+            eps = 1e-6
+            # to avoid extremum
+            x = x - (((x - y) < eps).float() * 2 * eps)
+            x.requires_grad = True
+            y.requires_grad = True
+            dist = torch.cdist(x, y, p=2)
+            # Do a backward pass to check that it is valid for large
+            # matrices
+            loss = dist.sum()
+            loss.backward()
+
+        _test_euclidean_large_cdist((2000, 5))
+
+    def test_cdist_same_inputs(self, device="mps"):
+        # Test to detect issues in cdist gradient calculation
+        # When the distances are 0
+        sizex = (1, 27, 32)
+        for p in [0, 1, 2, 3, 1.5, 2.5, float('inf')]:
+            x = torch.randn(sizex, device=device, dtype=torch.float)
+            dist_grad = torch.randn((1, 27, 27), device=device, dtype=torch.float)
+            y = x.clone()
+            eps = 1e-6
+            x.requires_grad = True
+            d = torch.cdist(x, y)
+            d.backward(dist_grad)
+            # Check that the backward passs does not contain invalid
+            # values such as nan or inf
+            assert torch.isfinite(x.grad).all()
+
+
+    def _brute_cdist(self, x, y, p=2):
+        r1 = x.shape[-2]
+        r2 = y.shape[-2]
+        if r1 == 0 or r2 == 0:
+            return torch.empty(r1, r2, device=x.device)
+        return torch.norm(x[..., None, :] - y[..., None, :, :], p=p, dim=-1)
+
+    def test_cdist_norm(self, device="mps"):
+        for r1 in [3, 4, 5, 6]:
+            for m in [2, 3, 4, 10]:
+                for r2 in [4, 6, 7, 8]:
+                    for p in [0, 1, 2, 3, 1.5, 2.5, float('inf')]:
+                        x = torch.randn(r1, m, device=device)
+                        y = torch.randn(r2, m, device=device)
+                        if p == 2:
+                            for cm in ['use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
+                                actual = torch.cdist(x, y, p=2, compute_mode=cm)
+                                expected = self._brute_cdist(x, y, p=2)
+                                self.assertEqual(expected, actual, rtol=0, atol=0.02)
+                        else:
+                            actual = torch.cdist(x, y, p=p)
+                            expected = self._brute_cdist(x, y, p=p)
+                            self.assertEqual(expected, actual)
+
+    def test_cdist_norm_batch(self, device="mps"):
+        for r1 in [3, 4, 5, 6]:
+            for m in [2, 3, 4, 10]:
+                for r2 in [4, 6, 7, 8]:
+                    for p in [0, 1, 2, 3, 1.5, 2.5, float('inf')]:
+                        x = torch.randn(2, 3, 6, r1, m, device=device)
+                        y = torch.randn(2, 3, 6, r2, m, device=device)
+                        if p == 2:
+                            for cm in ['use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
+                                actual = torch.cdist(x, y, p=2, compute_mode=cm)
+                                expected = self._brute_cdist(x, y, p=2)
+                                self.assertEqual(expected, actual, rtol=0, atol=0.02)
+                        else:
+                            actual = torch.cdist(x, y, p=p)
+                            expected = self._brute_cdist(x, y, p=p)
+                            self.assertEqual(expected, actual)
+
     def test_cross(self):
         a = torch.randn(4, 3, device="mps")
         b = torch.randn(4, 3, device="mps")

From 7622ed2a60441812e53096c703f8e8f9891e4bc3 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 9 Nov 2022 12:04:06 -0800
Subject: [PATCH 1818/1922] Fix MPSGraph casting issue to MPSDataTypeBool in
 masked_fill op (#166)

* Fix MPSGraph casting issue to MPSDataTypeBool in masked_fill op

* Pass scalar value as placeholder instead of constant
---
 aten/src/ATen/mps/MPSDevice.h                 |  2 +-
 aten/src/ATen/mps/MPSDevice.mm                |  2 +-
 .../native/mps/operations/Distributions.mm    |  2 +-
 .../ATen/native/mps/operations/Indexing.mm    | 40 ++++++++++++-------
 aten/src/ATen/native/mps/operations/Unique.mm |  6 +--
 test/test_mps.py                              |  7 +---
 6 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h
index 785369415c4d1..7427a5242c104 100644
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@@ -60,7 +60,7 @@ class TORCH_API MPSDevice {
 
   MTLFunction_t metalIndexingFunction(const std::string &kernel, MTLFunctionConstantValues_t constantValues);
 
-  bool macOS_13_0();
+  bool macOS_13_0_or_newer();
 
   ~MPSDevice();
 
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index aa6b57a80746e..a84bfd6d59e14 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -27,7 +27,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   return mps_device.get();
 }
 
-bool MPSDevice::macOS_13_0() {
+bool MPSDevice::macOS_13_0_or_newer() {
   return _macos_13_0_or_newer;
 }
 
diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index 8fa8f5d6208f4..45f49902d0622 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -343,7 +343,7 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
 }
 
 Tensor& randperm_out_mps(int64_t n, c10::optional<Generator> generator, Tensor& result) {
-  if (!MPSDevice::getInstance()->macOS_13_0()) {
+  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
     TORCH_WARN_ONCE("MPS: randperm op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performance implications.");
 
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 592de5ca468ea..3eed7ff5361e2 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -220,7 +220,7 @@ Tensor nonzero_fallback(const Tensor& self) {
 }
 
 Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_){
-  if (!MPSDevice::getInstance()->macOS_13_0()) {
+  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
       Tensor out_fallback = nonzero_fallback(self);
       at::native::resize_output(out_, out_fallback.sizes());
       out_.copy_(out_fallback.to("mps"));
@@ -388,7 +388,7 @@ Tensor nonzero_fallback(const Tensor& self) {
 }
 
 Tensor nonzero_mps(const Tensor& self){
-  if (!MPSDevice::getInstance()->macOS_13_0()) {
+  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
     return nonzero_fallback(self);
   }
 
@@ -704,15 +704,17 @@ Tensor index_select_mps(const Tensor & self,
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     MPSGraphTensor *inputTensor_ = nil;
     MPSGraphTensor *maskTensor_ = nil;
+    MPSGraphTensor *valueTensor_ = nil;
     MPSGraphTensor *outputTensor_ = nil;
   };
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
   MPSStream* stream = getCurrentMPSStream();
+  MPSScalar valueScalar = getMPSScalar(value, value.type());
   @autoreleasepool {
-    string key = "masked_fill" + getTensorsStringKey({self, mask}) + ":" + std::to_string(value.toDouble());
-    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
+    string key = "masked_fill" + getTensorsStringKey({self, *b_mask}) + getMPSTypeString(value.type());
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
       cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
 
@@ -723,27 +725,34 @@ Tensor index_select_mps(const Tensor & self,
           newCachedGraph = new CachedGraph(mpsGraph);
 
           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-          MPSGraphTensor* maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, mask);
+          MPSGraphTensor* maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *b_mask);
+          MPSGraphTensor* valueTensor = mpsGraphScalarPlaceHolder(mpsGraph, value);
+
           MPSDataType valueType = getMPSScalarType(value.type());
+          MPSDataType inputDataType = getMPSScalarType(self.scalar_type());
 
           // constantWithScalar doesn't like Bool constants getting created so
-          // mapping them to int8
-          if (valueType == MPSDataTypeBool) {
+          // mapping them to int8.
+          // Starting with macOS 13.0, this cast is not needed anymore
+          if (valueType == MPSDataTypeBool && !MPSDevice::getInstance()->macOS_13_0_or_newer()) {
             valueType = MPSDataTypeInt8;
           }
-          MPSGraphTensor* valueTensor =  [mpsGraph constantWithScalar:value.to<double>()
-                                                            dataType:valueType];
-          valueTensor = [mpsGraph castTensor:valueTensor
-                                          toType:getMPSDataType(self.scalar_type())
-                                           name : @"castTensorEq"];
+
+          MPSGraphTensor* castValueTensor = valueTensor;
+          if (valueType != inputDataType) {
+            castValueTensor = [mpsGraph castTensor:valueTensor
+                                            toType:inputDataType
+                                              name:@"castValueTensor"];
+          }
 
           MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:maskTensor
-                                                        truePredicateTensor:valueTensor
+                                                        truePredicateTensor:castValueTensor
                                                         falsePredicateTensor:inputTensor
                                                              name:nil];
 
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->maskTensor_ = maskTensor;
+          newCachedGraph->valueTensor_ = valueTensor;
           newCachedGraph->outputTensor_ = outputTensor;
         }
         return newCachedGraph;
@@ -751,13 +760,14 @@ Tensor index_select_mps(const Tensor & self,
     }
 
     Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder maskPlaceholder   = Placeholder(cachedGraph->maskTensor_, mask);
+    Placeholder maskPlaceholder   = Placeholder(cachedGraph->maskTensor_, *b_mask);
     Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, self);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
-      maskPlaceholder.getMPSGraphTensor() : maskPlaceholder.getMPSGraphTensorData()
+      maskPlaceholder.getMPSGraphTensor() : maskPlaceholder.getMPSGraphTensorData(),
+      cachedGraph->valueTensor_ : getMPSGraphTensorFromScalar(stream, valueScalar)
     };
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
diff --git a/aten/src/ATen/native/mps/operations/Unique.mm b/aten/src/ATen/native/mps/operations/Unique.mm
index 7b3b61f0f2f89..9049d8e00e049 100644
--- a/aten/src/ATen/native/mps/operations/Unique.mm
+++ b/aten/src/ATen/native/mps/operations/Unique.mm
@@ -310,7 +310,7 @@ void runUniqueGraph(UniqueCachedGraph *uniqueGraph, const Tensor& input, Tensor&
 
 std::tuple<Tensor, Tensor, Tensor>
 unique_consecutive_mps(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional<int64_t> dim) {
-  if (!MPSDevice::getInstance()->macOS_13_0()) {
+  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
     TORCH_WARN_ONCE("MPS: unique_consecutive op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performace implications.");
     return at::unique_consecutive(self.to("cpu"), return_inverse, return_counts, dim);
@@ -321,7 +321,7 @@ void runUniqueGraph(UniqueCachedGraph *uniqueGraph, const Tensor& input, Tensor&
 
 std::tuple<Tensor, Tensor, Tensor>
 unique_dim_consecutive_mps(const Tensor& self, int64_t dim, const bool return_inverse, const bool return_counts) {
-  if (!MPSDevice::getInstance()->macOS_13_0()) {
+  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
     TORCH_WARN_ONCE("MPS: unique_dim_consecutive op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performace implications.");
     return at::unique_dim_consecutive(self.to("cpu"), dim, return_inverse, return_counts);
@@ -332,7 +332,7 @@ void runUniqueGraph(UniqueCachedGraph *uniqueGraph, const Tensor& input, Tensor&
 
 std::tuple<Tensor, Tensor, Tensor>
 _unique2_mps(const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) {
-  if (!MPSDevice::getInstance()->macOS_13_0()) {
+  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
     TORCH_WARN_ONCE("MPS: _unique2 op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performace implications.");
     return at::_unique2(self.to("cpu"), sorted, return_inverse, return_counts);
diff --git a/test/test_mps.py b/test/test_mps.py
index b12dfd6206095..10333d95520fc 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8153,7 +8153,7 @@ class TestConsistency(TestCase):
         'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'lt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked_fill': ['f16', 'i16', 'i32', 'i64'],
+        'masked_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'matmul': ['f32'],
         'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8278,13 +8278,12 @@ class TestConsistency(TestCase):
         'vsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'where': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'where': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nonzero': ['f32', 'i16', 'i32', 'i64'],
         'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.nll_loss': ['f32'],
-        'byte': ['b8', 'i16', 'i32', 'i64', 'u8'],
         }
 
 
@@ -8463,8 +8462,6 @@ class TestConsistency(TestCase):
     # All the entries in this list should be removed
     BLOCKLIST = {
         # Functions that hang
-        'einsum': ['f32'],
-        'masked_fill': [torch.bool, torch.uint8, torch.float32], 'where': [torch.bool],
         # + forward when requires_grad=True or running backward
         'masked.mean': [torch.bool, torch.float16],
         'masked.prod': [torch.bool],

From 64fe14b19568a394ecdc22ddaaa5bcbce6dbb1c2 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 9 Nov 2022 16:01:23 -0500
Subject: [PATCH 1819/1922] Move some working tests from Block list to Allow
 list (#171)

* Move some working tests from Block list to Allow list
Also categorized some tests in block list based on their failure reasons to simply tracking them

* Move FFT ops to unimplemented section in blocklist
---
 test/test_mps.py | 121 ++++++++++++++++++++++-------------------------
 1 file changed, 56 insertions(+), 65 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 10333d95520fc..f51edc5e78e8d 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8135,6 +8135,7 @@ class TestConsistency(TestCase):
         'kron': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'le': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'linalg.matrix_norm': ['f16'],
+        'linalg.multi_dot': ['f32'],
         'linalg.svd': ['f32'],
         'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8161,6 +8162,7 @@ class TestConsistency(TestCase):
         'mm': ['f32'],
         'mv': ['f32'],
         'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'native_layer_norm': ['f32'],
         'ne': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.adaptive_max_pool1d': ['f32'],
@@ -8189,16 +8191,12 @@ class TestConsistency(TestCase):
         'nn.functional.leaky_relu': ['f32'],
         'nn.functional.linear': ['f32'],
         'nn.functional.local_response_norm': ['f32', 'i64'],
-        'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32'],
+        'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.mse_loss': ['f16', 'f32'],
         'nn.functional.normalize': ['f32'],
         'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.padcircular': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.pairwise_distance': ['f16',
-                                            'f32',
-                                            'i16',
-                                            'i32',
-                                            'i64'],
+        'nn.functional.pairwise_distance': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'],
         'nn.functional.prelu': ['f32'],
         'nn.functional.relu': ['f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8212,11 +8210,8 @@ class TestConsistency(TestCase):
         'nn.functional.softsign': ['f16', 'f32', 'i16', 'u8'],
         'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'u8'],
         'nn.functional.threshold': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64'],
-        'nn.functional.triplet_margin_with_distance_loss': ['f32',
-                                                            'i16',
-                                                            'i32',
-                                                            'i64'],
+        'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.triplet_margin_with_distance_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.upsample_bilinear': ['f32'],
         'norm': ['f32', 'f16'],
         'positive': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8224,19 +8219,13 @@ class TestConsistency(TestCase):
         'rad2deg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'real': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'repeat': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'repeat_interleave': ['b8',
-                              'f16',
-                              'f32',
-                              'i16',
-                              'i32',
-                              'i64',
-                              'u8'],
+        'repeat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'repeat_interleave': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resize_': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'resize_as_': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'resolve_conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'rot90': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
         'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8261,7 +8250,7 @@ class TestConsistency(TestCase):
         'tan': ['b8', 'i16', 'i32', 'u8'],
         'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'tensordot': ['f32'],
-        'tile': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'tile': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'topk': ['f32'],
         'trapz': ['f16', 'f32', 'i16', 'i32', 'i64'],
         'tril': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8545,44 +8534,17 @@ class TestConsistency(TestCase):
 
         # New block list ops that need investigation
         '__rpow__': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'],
-        '_masked.amax': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.amin': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.argmax': ['torch.float16', 'torch.float32', 'torch.int32'],
-        '_masked.argmin': ['torch.float16', 'torch.float32', 'torch.int32'],
-        '_masked.mean': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.std': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.var': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'amax': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'amin': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], # atan2() may generate NAN in output
-        'bernoulli': ['torch.float32'],
         'byte': ['torch.float16', 'torch.float32'],
         'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
-        'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'diff': ['torch.bool', 'torch.uint8'],
-        'eig': ['torch.float32'],
         'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.fft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.ifft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.ihfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.ihfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.ihfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.rfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.rfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'float': ['torch.int64'],
         'gather': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'half': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'],
-        'linalg.eigvals': ['torch.float32'],
-        'linalg.multi_dot': ['torch.float32'],
-        'matmul': ['torch.uint8'],
-        'mean': ['torch.float16', 'torch.float32'],
-        'native_layer_norm': ['torch.float32'],
         'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.adaptive_avg_pool1d': ['torch.float32'],
         'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
@@ -8594,35 +8556,64 @@ class TestConsistency(TestCase):
         'nn.functional.dropout2d': ['torch.float32'],
         'nn.functional.dropout3d': ['torch.float32'],
         'nn.functional.dropout': ['torch.float32'],
-        'nn.functional.gelu': ['torch.float32'],
         'nn.functional.interpolate': ['torch.float32', 'torch.float32', 'torch.float32'],
-        'nn.functional.margin_ranking_loss': ['torch.uint8'],
         'nn.functional.max_pool1d': ['torch.float32'],
         'nn.functional.max_pool2d': ['torch.float32'],
-        'nn.functional.pairwise_distance': ['torch.uint8'],
-        'nn.functional.triplet_margin_loss': ['torch.uint8'],
-        'nn.functional.triplet_margin_with_distance_loss': ['torch.uint8'],
         'nn.functional.upsample_nearest': ['torch.float32'],
-        'normal': ['torch.float16', 'torch.float32', 'torch.float16', 'torch.float32'],
-        'pow': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'], # pow() with integer input may return wrong results
-        'prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'pow': ['torch.int16', 'torch.int32', 'torch.uint8'], # pow() with integer input may return wrong results
         'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'rand_like': ['torch.float16', 'torch.float32'],
-        'randint_like': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'randn_like': ['torch.float16', 'torch.float32'],
         'scatter_add': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'tan': ['torch.float32'],
         'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.local_response_norm': ['torch.int64'],
-        'repeat': ['torch.bool'],
-        'rot90': ['torch.bool'],
-        'tile': ['torch.bool'],
-        'split': ['torch.float32'],
+
+        # failures due to lack of op implementation on MPS backend
+        'linalg.eig': ['torch.float32'],
+        'linalg.eigvals': ['torch.float32'],
+        'fft.fft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.ifft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.ihfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.ihfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.ihfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.rfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.rfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+
+        # failures due to unsupported data types on MPS backend
+        'matmul': ['torch.uint8'], # MPS device does not support mm for non-float inputs
+
+        # failures due to random output that they generate using
+        # Philox engine causing mismatch with CPU results
+        'rand_like': ['torch.float16', 'torch.float32'],
+        'randint_like': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'randn_like': ['torch.float16', 'torch.float32'],
+        'bernoulli': ['torch.float32'],
+        'normal': ['torch.float16', 'torch.float32', 'torch.float16', 'torch.float32'],
+
+        # failures due to shape and type issues in ReduceOps
+        'amax': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'amin': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'mean': ['torch.float16', 'torch.float32'],
+        'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.argmax': ['torch.float16', 'torch.float32', 'torch.int32'],
+        '_masked.argmin': ['torch.float16', 'torch.float32', 'torch.int32'],
+        '_masked.amax': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.amin': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.mean': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.std': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '_masked.var': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+
+        # failures due to precision issues
+        'nn.functional.gelu': ['torch.float32'],
+        'pow': ['torch.float32'],
+        'tan': ['torch.float32'],
     }
 
     # Used for accept mode only

From aaf90780021580d2e919814a18cb09e487fcc146 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 10 Nov 2022 00:38:53 -0500
Subject: [PATCH 1820/1922] Unregister put() for MPS backend (#172)

* Unregister put() for MPS backend
Move some working ops from block list to allow list

* Unregister put() op foe MPS backend
---
 aten/src/ATen/native/native_functions.yaml |  2 +-
 test/test_mps.py                           | 29 ++++++++++++----------
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 49257e5a2f04d..7f5e57716242d 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -7386,7 +7386,7 @@
 - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!)
   variants: method
   dispatch:
-    CPU, CUDA, MPS: put_
+    CPU, CUDA: put_
   autogen: put.out
 
 - func: put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor
diff --git a/test/test_mps.py b/test/test_mps.py
index f51edc5e78e8d..95ef6c9ae8c8d 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8252,7 +8252,7 @@ class TestConsistency(TestCase):
         'tensordot': ['f32'],
         'tile': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'topk': ['f32'],
-        'trapz': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'trapz': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tril': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tril_indices': ['i32', 'i64'],
         'triu': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8273,7 +8273,7 @@ class TestConsistency(TestCase):
         'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.nll_loss': ['f32'],
-        }
+    }
 
 
     ALLOWLIST_OP_GRAD = {
@@ -8443,7 +8443,8 @@ class TestConsistency(TestCase):
         'view_as': ['f16', 'f32'],
         'vsplit': ['f16', 'f32'],
         'vstack': ['f16', 'f32'],
-        'zero_': ['f16', 'f32']}
+        'zero_': ['f16', 'f32']
+    }
 
     # These ops that are problematic. So never run them even when
     # generating the new allowlist.
@@ -8533,19 +8534,16 @@ class TestConsistency(TestCase):
         'take_along_dim': None,
 
         # New block list ops that need investigation
-        '__rpow__': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.uint8'],
         'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], # atan2() may generate NAN in output
         'byte': ['torch.float16', 'torch.float32'],
         'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
         'diff': ['torch.bool', 'torch.uint8'],
-        'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'float': ['torch.int64'],
         'gather': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'half': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'],
-        'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.adaptive_avg_pool1d': ['torch.float32'],
         'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
         'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'],
@@ -8553,22 +8551,15 @@ class TestConsistency(TestCase):
         'nn.functional.bilinear': ['torch.float32'],
         'nn.functional.conv_transpose2d': ['torch.float32'],
         'nn.functional.cosine_embedding_loss': ['torch.uint8'],
-        'nn.functional.dropout2d': ['torch.float32'],
-        'nn.functional.dropout3d': ['torch.float32'],
-        'nn.functional.dropout': ['torch.float32'],
         'nn.functional.interpolate': ['torch.float32', 'torch.float32', 'torch.float32'],
         'nn.functional.max_pool1d': ['torch.float32'],
         'nn.functional.max_pool2d': ['torch.float32'],
         'nn.functional.upsample_nearest': ['torch.float32'],
-        'pow': ['torch.int16', 'torch.int32', 'torch.uint8'], # pow() with integer input may return wrong results
-        'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'scatter_add': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'trapz': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'nn.functional.local_response_norm': ['torch.int64'],
 
         # failures due to lack of op implementation on MPS backend
         'linalg.eig': ['torch.float32'],
@@ -8581,6 +8572,11 @@ class TestConsistency(TestCase):
         'fft.rfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'fft.rfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+
+        # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
+        'pow': ['torch.int16', 'torch.int32', 'torch.uint8'],
+        '__rpow__': ['torch.int16', 'torch.int32', 'torch.uint8'],
 
         # failures due to unsupported data types on MPS backend
         'matmul': ['torch.uint8'], # MPS device does not support mm for non-float inputs
@@ -8592,6 +8588,12 @@ class TestConsistency(TestCase):
         'randn_like': ['torch.float16', 'torch.float32'],
         'bernoulli': ['torch.float32'],
         'normal': ['torch.float16', 'torch.float32', 'torch.float16', 'torch.float32'],
+        'nn.functional.dropout': ['torch.float32'],
+        'nn.functional.dropout2d': ['torch.float32'],
+        'nn.functional.dropout3d': ['torch.float32'],
+         # these fill tensors with uninitialized data, causing mismatch with CPU
+        'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 
         # failures due to shape and type issues in ReduceOps
         'amax': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
@@ -8614,6 +8616,7 @@ class TestConsistency(TestCase):
         'nn.functional.gelu': ['torch.float32'],
         'pow': ['torch.float32'],
         'tan': ['torch.float32'],
+        '__rpow__': ['torch.float32'],
     }
 
     # Used for accept mode only

From 5040edfd08e45e48ed9740137d3f9a43ffbcb274 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 11 Nov 2022 16:51:21 -0500
Subject: [PATCH 1821/1922] Cast to unsigned type when comparing signed vs.
 unsigned integers in BinaryOps (#173)

Also remove the double cast to boolean in comparison ops
---
 .../ATen/native/mps/operations/BinaryOps.mm   | 22 ++++++++++++-------
 test/test_mps.py                              | 11 ++++++++--
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index 6edb7bdb2a3de..e6d25b82eae3b 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -73,9 +73,16 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
 
           // this type inference is only required at the time of graph creation
           ScalarType common_dtype = c10::promoteTypes(self.scalar_type(), other.scalar_type());
-          // Integer input must be cast to float if output is float
-          if (isIntegralType(common_dtype, true) && isFloatingType(output.scalar_type())) {
-            common_dtype = output_.scalar_type();
+          if (isIntegralType(common_dtype, true)) {
+            // integer inputs must be cast to float, if output is float
+            if (isFloatingType(output_.scalar_type())) {
+              common_dtype = output_.scalar_type();
+            // in boolean comparison ops with signed vs. unsigned integers, we always cast to the unsigned type
+            } else if (output_.scalar_type() == ScalarType::Bool &&
+                      (self.scalar_type()  == ScalarType::Byte ||
+                       other.scalar_type() == ScalarType::Byte)) {
+              common_dtype = ScalarType::Byte;
+            }
           }
           if (self.scalar_type() != common_dtype) {
             primaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->primaryTensor, common_dtype);
@@ -230,16 +237,15 @@ void add_sub_template(const Tensor& self, const Tensor& other, const Scalar& alp
                                                name:nil]; });                                   \
 }
 
-// Boolean Ops require casting output to "MPSDataTypeBool"
+// output of Boolean Ops will be cast to "MPSDataTypeBool" at the end of binaryOpTensor()
 #define CREATE_MPS_STRUCTURED_BOOLEAN_OP_FUNC(func_out, func_stub, other_type)                  \
 TORCH_IMPL_FUNC(func_out) (const Tensor& self, const other_type& other, const Tensor& output) { \
   mps::binaryOp##other_type(self, other, Scalar(1.0), output, #func_stub,                       \
     ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {                          \
       MPSGraph* mpsGraph = cachedGraph->graph();                                                \
-      MPSGraphTensor* outputTensor = [mpsGraph func_stub##WithPrimaryTensor:primaryCastTensor   \
-                                                            secondaryTensor:secondaryCastTensor \
-                                                                       name:nil];               \
-      return mps::castMPSTensor(mpsGraph, outputTensor, ScalarType::Bool); });                  \
+      return [mpsGraph func_stub##WithPrimaryTensor:primaryCastTensor                           \
+                                    secondaryTensor:secondaryCastTensor                         \
+                                               name:nil]; });                                   \
 }
 
 // Boolean Binary Ops
diff --git a/test/test_mps.py b/test/test_mps.py
index 95ef6c9ae8c8d..aa51eca61822a 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2911,6 +2911,14 @@ def test_eq(self):
 
         self.assertEqual(result_cpu, result_mps.to('cpu'))
 
+    def test_signed_vs_unsigned_comparison(self):
+        cpu_x = torch.tensor((-1, 2, 3), device='cpu', dtype=torch.uint8)
+        mps_x = torch.tensor((-1, 2, 3), device='mps', dtype=torch.uint8)
+        # in the comparison of signed vs. unsigned we should always cast to unsigned
+        self.assertEqual(cpu_x == -1, mps_x == -1)
+        self.assertEqual(cpu_x > -1, mps_x > -1)
+        self.assertEqual(cpu_x < -1, mps_x < -1)
+
     def test_eq_int64(self):
         values1 = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]
         values2 = [[[1, 2, 15], [4, 5, 6]], [[7, 8, 9], [0, 11, 12]]]
@@ -8173,7 +8181,7 @@ class TestConsistency(TestCase):
         'nn.functional.conv1d': ['f32'],
         'nn.functional.conv2d': ['f32'],
         'nn.functional.conv_transpose1d': ['f32'],
-        'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64'],
+        'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.cosine_similarity': ['f32'],
         'nn.functional.elu': ['f32'],
         'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8550,7 +8558,6 @@ class TestConsistency(TestCase):
         'nn.functional.avg_pool2d': ['torch.float32', 'torch.int64'],
         'nn.functional.bilinear': ['torch.float32'],
         'nn.functional.conv_transpose2d': ['torch.float32'],
-        'nn.functional.cosine_embedding_loss': ['torch.uint8'],
         'nn.functional.interpolate': ['torch.float32', 'torch.float32', 'torch.float32'],
         'nn.functional.max_pool1d': ['torch.float32'],
         'nn.functional.max_pool2d': ['torch.float32'],

From bced1f1c8fb40988c3f47c06025e2ad3820d4e79 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Fri, 11 Nov 2022 14:33:14 -0800
Subject: [PATCH 1822/1922] Fix the skipIfMps include in header. (#175)

---
 test/test_nn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_nn.py b/test/test_nn.py
index 2c3b1727fe510..cbff32d480fbb 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -45,7 +45,7 @@
     ctcloss_reference, new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input
 from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
     dtypesIfMPS, dtypesIfCUDA, precisionOverride, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
-    skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, \
+    skipCUDAIfRocm, skipCUDAIf, skipMPSIf, skipCUDAIfNotRocm, \
     onlyNativeDeviceTypes, deviceCountAtLeast, largeTensorTest, expectedFailureMeta, skipMeta, get_all_device_types
 
 from hypothesis import given
@@ -11678,7 +11678,7 @@ def cosine_distance(x, y):
             self.assertEqual(functional, modular, atol=1e-6, rtol=1e-6)
             self.assertEqual(traced, modular, atol=1e-6, rtol=1e-6)
 
-    @skipIfMps  # the test doesn't work on MPS as double/complex types are not supported
+    @skipMPSIf(True, "the test doesn't work on MPS as double/complex types are not supported")
     def test_to_complex(self, device):
         m = nn.Linear(3, 5).to(device)
         self.assertIs(m, m.to(device))

From 94d3f2d2aefd99c95f46adc0656ea4e4443ffaa6 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 15 Nov 2022 15:25:29 -0500
Subject: [PATCH 1823/1922] Fix the cast and shape issues with Scatter and
 Gather ops (#174)

- check for numel()==0 for input tensors
- clean up duplicate and redundant code and headers
- move some ops from block list to allow list
---
 .../native/mps/operations/ScatterGather.mm    | 253 ++++++------------
 test/test_mps.py                              |  13 +-
 2 files changed, 87 insertions(+), 179 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ScatterGather.mm b/aten/src/ATen/native/mps/operations/ScatterGather.mm
index cf8d8a1fef7e3..89e306736c3f2 100644
--- a/aten/src/ATen/native/mps/operations/ScatterGather.mm
+++ b/aten/src/ATen/native/mps/operations/ScatterGather.mm
@@ -1,15 +1,6 @@
 //  Copyright © 2022 Apple Inc.
 
-#include <ATen/ATen.h>
-#include <ATen/Tensor.h>
-#include <ATen/Utils.h>
-
-#include <ATen/mps/MPSStream.h>
-#include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <torch/library.h>
-
-#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
 
 namespace at {
 namespace native {
@@ -19,25 +10,22 @@
  int64_t dim,
  const Tensor & index,
  bool sparse_grad,
- const Tensor & output) {
-
+ const Tensor & output)
+{
   using namespace mps;
-  MPSStream* stream = getCurrentMPSStream();
 
+  if (self_arg.numel() == 0 || index.numel() == 0) {
+    return;
+  }
   auto self = self_arg.dim() == 0 ? self_arg.view({1}) : self_arg;
-
   dim = at::maybe_wrap_dim(dim, self.dim());
 
   TORCH_CHECK(!sparse_grad, "sparse_grad not supported in MPS yet")
-
-  TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index");
   TORCH_CHECK(self.scalar_type() == output.scalar_type(),
               "gather(): self and output must have the same scalar type");
   TORCH_CHECK(dim >= 0 && dim < self.dim(),
               "gather(): Indexing dim ", dim, " is out of bounds of tensor");
 
-
-  // Derive from MPSCachedGraph
   struct CachedGraph : public MPSCachedGraph
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
@@ -51,29 +39,24 @@
   @autoreleasepool {
 
     MPSShape* input_shape = getMPSShape(self);
-    NSString* ns_input_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
     MPSShape* index_shape = getMPSShape(index);
-    NSString* ns_index_shape_key = [[index_shape valueForKey:@"description"] componentsJoinedByString:@","];
-
-    int num_input_dims = [input_shape count];
-    int num_index_dims = [index_shape count];
-
+    uint32_t num_input_dims = [input_shape count];
+    uint32_t num_index_dims = [index_shape count];
     TORCH_CHECK(num_input_dims == num_index_dims, "Input and index must have same rank")
 
     // Determine if we need to slice into the input tensor
     bool needSlice = false;
 
-    for(int i = 0; i < num_input_dims; i++) {
+    for(uint32_t i = 0; i < num_input_dims; i++) {
       TORCH_CHECK(i == dim || [index_shape[i] intValue] <= [input_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis")
       if(i != dim && [index_shape[i] intValue] < [input_shape[i] intValue])
         needSlice = true;
     }
-
-    string key = "gather_out_mps:" + getMPSTypeString(self.scalar_type()) + ":"
-                                   + getMPSTypeString(index.scalar_type()) + ":"
-                                   + std::to_string(dim) + ":"
-                                   + [ns_input_shape_key UTF8String] + ":"
-                                   + [ns_index_shape_key UTF8String];
+    auto input_type = getMPSDataType(self.scalar_type());
+    if (input_type ==  MPSDataTypeUInt8) {
+      input_type =  MPSDataTypeInt8;
+    }
+    string key = "gather_out_mps" + getTensorsStringKey({self, index, output}) + ":" + std::to_string(dim);
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
 
     if(!cachedGraph) {
@@ -84,10 +67,10 @@
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape);
-          MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(index.scalar_type()), index_shape);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_type, getMPSShape(self));
+          MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, index);
 
-          MPSGraphTensor* getInput = nil;
+          MPSGraphTensor* getInput = inputTensor;
 
           // Slice into the input tensor IF NEEDED
           if(needSlice) {
@@ -100,31 +83,24 @@
               strides[i] = @1;
               // All starts are 0
               starts[i] = @0;
-              if(i != dim)
-                ends[i] = index_shape[i];
-              else
-                ends[i] = input_shape[i];
+              ends[i] = (i != dim) ? index_shape[i] : input_shape[i];
             }
 
             getInput = [mpsGraph sliceTensor:inputTensor
-                                         starts:starts
-                                           ends:ends
-                                        strides:strides
-                                           name:nil];
-
+                                      starts:starts
+                                        ends:ends
+                                     strides:strides
+                                        name:nil];
           }
-          else
-            getInput = inputTensor;
 
           MPSGraphTensor* castIndexTensor = [mpsGraph castTensor:indexTensor
-                                                          toType:getMPSDataType(ScalarType::Int)
+                                                          toType:MPSDataTypeInt32
                                                             name:(NSString * _Nonnull)nil];
 
           MPSGraphTensor* outputTensor = [mpsGraph gatherAlongAxis: (NSInteger) dim
                                                  withUpdatesTensor: getInput
                                                      indicesTensor: castIndexTensor
                                                               name: nil];
-
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->indexTensor_ = indexTensor;
           newCachedGraph->outputTensor_ = outputTensor;
@@ -134,9 +110,9 @@
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, input_shape);
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, input_shape, true, input_type);
     Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nullptr, false, input_type);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
@@ -146,9 +122,8 @@
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
 
-    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
   }
-
 }
 
 void scatter_mps_general
@@ -158,23 +133,21 @@
  const Tensor& src,
  const Tensor& output,
  string func_name,
- const c10::string_view reduce) {
-
+ const c10::string_view reduce)
+{
   using namespace mps;
-  MPSStream* stream = getCurrentMPSStream();
 
+  if (self_arg.numel() == 0 || index.numel() == 0 || src.numel() == 0) {
+    return;
+  }
   auto self = self_arg.dim() == 0 ? self_arg.view({1}) : self_arg;
-
   dim = at::maybe_wrap_dim(dim, self.dim());
 
-  TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int, "index_select(): Expected dtype int32 or int64 for index");
   TORCH_CHECK(self.scalar_type() == output.scalar_type() && output.scalar_type() == src.scalar_type(),
               "scatter(): self, src and output must have the same scalar type");
   TORCH_CHECK(dim >= 0 && dim < self.dim(),
               "scatter(): Indexing dim ", dim, " is out of bounds of tensor");
 
-
-  // Derive from MPSCachedGraph
   struct CachedGraph : public MPSCachedGraph
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
@@ -189,15 +162,11 @@
   @autoreleasepool {
 
     MPSShape* input_shape = getMPSShape(self);
-    NSString* ns_input_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
     MPSShape* index_shape = getMPSShape(index);
-    NSString* ns_index_shape_key = [[index_shape valueForKey:@"description"] componentsJoinedByString:@","];
     MPSShape* src_shape = getMPSShape(src);
-    NSString* ns_src_shape_key = [[src_shape valueForKey:@"description"] componentsJoinedByString:@","];
-
-    int num_input_dims = [input_shape count];
-    int num_index_dims = [index_shape count];
-    int num_src_dims = [src_shape count];
+    uint32_t num_input_dims = [input_shape count];
+    uint32_t num_index_dims = [index_shape count];
+    uint32_t num_src_dims = [src_shape count];
 
     TORCH_CHECK(num_input_dims == num_index_dims && num_index_dims == num_src_dims, "Input, index and src must have same rank")
 
@@ -205,7 +174,7 @@
     bool needSlice = false;
     bool inputNeedSlice = false;
 
-    for(int i = 0; i < num_input_dims; i++) {
+    for(uint32_t i = 0; i < num_input_dims; i++) {
       TORCH_CHECK(i == dim || [index_shape[i] intValue] <= [input_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis")
       TORCH_CHECK([index_shape[i] intValue] <= [src_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis")
       if([index_shape[i] intValue] < [src_shape[i] intValue])
@@ -213,33 +182,11 @@
       if(i != dim && [index_shape[i] intValue] < [input_shape[i] intValue])
         inputNeedSlice = true;
     }
-
     TORCH_CHECK(reduce != "mean", "Scatter reduce mean mode not yet supported in MPS")
 
-    string reduce_key;
-
-    if(reduce == "set")
-      reduce_key = "set";
-    else if(reduce == "sum")
-      reduce_key = "sum";
-    else if(reduce == "add")
-      reduce_key = "add";
-    else if(reduce == "prod")
-      reduce_key = "prod";
-    else if(reduce == "multiply")
-      reduce_key = "multiply";
-    else if(reduce == "amax")
-      reduce_key = "amax";
-    else if(reduce == "amin")
-      reduce_key = "amin";
-
-    string key = func_name + ":" + getMPSTypeString(self.scalar_type()) + ":"
-                                 + getMPSTypeString(index.scalar_type()) + ":"
-                                 + std::to_string(dim) + ":"
-                                 + [ns_input_shape_key UTF8String] + ":"
-                                 + [ns_index_shape_key UTF8String] + ":"
-                                 + [ns_src_shape_key UTF8String] + ":"
-                                 + reduce_key;
+    bool needsCast = isIntegralType(self.scalar_type(), true) &&
+                     (reduce != "set" || self.scalar_type() == ScalarType::Byte);
+    string key = func_name + getTensorsStringKey({self, index, src, output}) + ":" + std::to_string(dim) + ":" + std::string(reduce);
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
       MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
@@ -253,108 +200,70 @@
           MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(index.scalar_type()), index_shape);
           MPSGraphTensor* srcTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(src.scalar_type()), src_shape);
 
-          MPSGraphTensor* getSrc = nil;
-          MPSGraphTensor* getInput = nil;
-
-          // Slice into the src tensor IF NEEDED
-          if(needSlice) {
-            NSMutableArray<NSNumber*> *starts = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
-            NSMutableArray<NSNumber*> *ends = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
-            NSMutableArray<NSNumber*> *strides = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
-
-            for(int i = 0; i < num_input_dims; i++) {
-              // All strides are 1
-              strides[i] = @1;
-              // All starts are 0
-              starts[i] = @0;
-              ends[i] = index_shape[i];
-            }
-
-            getSrc = [mpsGraph sliceTensor:srcTensor
-                                    starts:starts
-                                      ends:ends
-                                   strides:strides
-                                      name:nil];
-
-          }
-          else
-            getSrc = srcTensor;
+          MPSGraphTensor* getSrc = srcTensor;
+          MPSGraphTensor* getInput = inputTensor;
 
           // Use in case input needs to be smaller to get scatter
-          NSArray<NSNumber*>* scatterInputShape = nil;
+          NSMutableArray<NSNumber*>* scatterInputShape = [NSMutableArray arrayWithArray:input_shape];;
 
-          // Slice into the input tensor IF NEEDED
-          if(inputNeedSlice) {
+          // Slice into the src tensor IF NEEDED
+          if (needSlice || inputNeedSlice) {
             NSMutableArray<NSNumber*> *starts = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
-            NSMutableArray<NSNumber*> *ends = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
             NSMutableArray<NSNumber*> *strides = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
-
-            auto rc = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
+            NSMutableArray<NSNumber*> *ends_src = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
 
             for(int i = 0; i < num_input_dims; i++) {
               // All strides are 1
               strides[i] = @1;
               // All starts are 0
               starts[i] = @0;
-              if(i != dim) {
-                ends[i] = index_shape[i];
-                rc[i] = index_shape[i];
-              }
-              else {
-                ends[i] = input_shape[i];
-                rc[i] = input_shape[i];
-              }
+              ends_src[i] = index_shape[i];
+              scatterInputShape[i] = (i != dim) ? index_shape[i] : input_shape[i];
             }
-            scatterInputShape = rc;
-
-            getInput = [mpsGraph sliceTensor:inputTensor
+            if (needSlice) {
+              getSrc = [mpsGraph sliceTensor:srcTensor
                                       starts:starts
-                                        ends:ends
+                                        ends:ends_src
                                      strides:strides
                                         name:nil];
-
-          }
-          else {
-            getInput = inputTensor;
-            scatterInputShape = input_shape;
+            }
+            if (inputNeedSlice) {
+              getInput = [mpsGraph sliceTensor:inputTensor
+                                        starts:starts
+                                          ends:scatterInputShape
+                                       strides:strides
+                                          name:nil];
+            }
           }
-
           MPSGraphTensor* outputTensor = nil;
+          MPSGraphTensor* castSrcTensor = getSrc;
+          MPSGraphTensor* castInputTensor = getInput;
 
-          MPSGraphTensor* castIndexTensor = [mpsGraph castTensor:indexTensor
-                                                          toType:getMPSDataType(ScalarType::Int)
-                                                            name:(NSString * _Nonnull)nil];
+          if (needsCast) {
+            castSrcTensor = castMPSTensor(mpsGraph, getSrc, ScalarType::Int);
+            castInputTensor = castMPSTensor(mpsGraph, getInput, ScalarType::Int);
+          }
+          MPSGraphTensor* castIndexTensor = castMPSTensor(mpsGraph, indexTensor, ScalarType::Int);
 
-          MPSGraphScatterMode scatter_mode;
+          MPSGraphScatterMode scatter_mode = MPSGraphScatterModeSet;
 
-          if(reduce_key == "set")
-            scatter_mode = MPSGraphScatterModeSet;
-          else if(reduce_key == "sum" || reduce_key == "add")
+          if(reduce == "sum" || reduce == "add")
             scatter_mode = MPSGraphScatterModeAdd;
-          else if(reduce_key == "prod" || reduce_key == "multiply")
+          else if(reduce == "prod" || reduce == "multiply")
             scatter_mode = MPSGraphScatterModeMul;
-          else if(reduce_key == "amax")
+          else if(reduce == "amax")
             scatter_mode = MPSGraphScatterModeMax;
-          else if(reduce_key == "amin")
+          else if(reduce == "amin")
             scatter_mode = MPSGraphScatterModeMin;
 
-          if(!inputNeedSlice) {
-            outputTensor = [mpsGraph scatterAlongAxis: (NSInteger) dim
-                                       withDataTensor: getInput
-                                        updatesTensor: getSrc
-                                        indicesTensor: castIndexTensor
-                                                 mode: scatter_mode
-                                                 name: nil];
-          }
-          else {
-            // Scatter this into the input with set mode
-            MPSGraphTensor* scatterTensor = [mpsGraph scatterAlongAxis: (NSInteger) dim
-                                                        withDataTensor: getInput
-                                                         updatesTensor: getSrc
-                                                         indicesTensor: castIndexTensor
-                                                                  mode: scatter_mode
-                                                                  name: nil];
-
+          // Scatter this into the input with set mode
+          MPSGraphTensor* scatterTensor = [mpsGraph scatterAlongAxis: (NSInteger) dim
+                                                      withDataTensor: castInputTensor
+                                                       updatesTensor: castSrcTensor
+                                                       indicesTensor: castIndexTensor
+                                                                mode: scatter_mode
+                                                                name: nil];
+          if(inputNeedSlice) {
             // Make an array of scatter indices tensors
             NSMutableArray<MPSGraphTensor*>* indicesTensors = [NSMutableArray<MPSGraphTensor*> arrayWithCapacity:num_input_dims];
 
@@ -369,7 +278,7 @@
             }
 
             MPSGraphTensor* scatterInputShapeTensor = [mpsGraph constantWithData:[NSData dataWithBytes:shape_data.data() length:num_input_dims * sizeof(int)]
-                                                                           shape:@[[NSNumber numberWithInt:num_input_dims]]
+                                                                           shape:@[[NSNumber numberWithUnsignedInt:num_input_dims]]
                                                                         dataType:MPSDataTypeInt32];
 
             for(int i = 0; i < num_input_dims; i++) {
@@ -398,12 +307,13 @@
                                              batchDimensions:0
                                                         mode:MPSGraphScatterModeSet
                                                         name:nil];
+          } else {
+            outputTensor = scatterTensor;
           }
-
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->srcTensor_ = srcTensor;
           newCachedGraph->indexTensor_ = indexTensor;
-          newCachedGraph->outputTensor_ = outputTensor;
+          newCachedGraph->outputTensor_ = needsCast ? castMPSTensor(mpsGraph, outputTensor, output.scalar_type()) : outputTensor;
         }
         return newCachedGraph;
       });
@@ -424,9 +334,8 @@
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
 
-    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
   }
-
 }
 
 TORCH_IMPL_FUNC(scatter_src_out_mps)
diff --git a/test/test_mps.py b/test/test_mps.py
index aa51eca61822a..43b870269ff12 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8109,7 +8109,7 @@ class TestConsistency(TestCase):
         'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diagflat': ['f32', 'i32'],
         'diagonal_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'diff': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'diff': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'dist': ['f32'],
         'div': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8128,8 +8128,9 @@ class TestConsistency(TestCase):
         'floor': ['f32', 'f16', 'i16', 'i32', 'i64'],
         'floor_divide': ['f32', 'f16'],
         'frac': ['f16', 'f32'],
-        'gradient': ['f16', 'f32', 'i16'],
+        'gather': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'gradient': ['f16', 'f32', 'i16'],
         'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'half': ['f16'],
         'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8236,6 +8237,8 @@ class TestConsistency(TestCase):
         'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
         'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'scatter_add': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'short': ['i16'],
@@ -8546,9 +8549,7 @@ class TestConsistency(TestCase):
         'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], # atan2() may generate NAN in output
         'byte': ['torch.float16', 'torch.float32'],
         'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
-        'diff': ['torch.bool', 'torch.uint8'],
         'float': ['torch.int64'],
-        'gather': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'half': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'],
@@ -8558,12 +8559,10 @@ class TestConsistency(TestCase):
         'nn.functional.avg_pool2d': ['torch.float32', 'torch.int64'],
         'nn.functional.bilinear': ['torch.float32'],
         'nn.functional.conv_transpose2d': ['torch.float32'],
-        'nn.functional.interpolate': ['torch.float32', 'torch.float32', 'torch.float32'],
+        'nn.functional.interpolate': ['torch.float32'],
         'nn.functional.max_pool1d': ['torch.float32'],
         'nn.functional.max_pool2d': ['torch.float32'],
         'nn.functional.upsample_nearest': ['torch.float32'],
-        'scatter_add': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],

From 9d1075f15abfcb13ad1d3e58d3e1086dc8bbb88b Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 15 Nov 2022 18:38:37 -0500
Subject: [PATCH 1824/1922] Workaround for the casting issue from floating
 point to uint8 (#176)

---
 aten/src/ATen/native/mps/operations/Copy.mm | 16 +++++-----------
 test/test_mps.py                            | 18 +++++++-----------
 2 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index ca0171a695cbe..d806fa6506504 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -1,17 +1,7 @@
 //  Copyright © 2022 Apple Inc.
 
-#include <ATen/mps/MPSStream.h>
 #include <ATen/native/mps/Copy.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <iostream>
-#include <cstring>
-#include <ATen/ATen.h>
-#include <ATen/Tensor.h>
-#include <ATen/Utils.h>
-#include <torch/library.h>
-#include <ATen/native/Resize.h>
-#include <c10/util/Optional.h>
-
 
 namespace at {
 namespace native {
@@ -66,7 +56,11 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
           newCachedGraph = new CachedGraph(mpsGraph);
 
           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, src);
-          MPSGraphTensor* outputTensor = [mpsGraph castTensor:inputTensor toType:dstDType name:@"cast"];
+          MPSGraphTensor* inputCastTensor = inputTensor;
+          if (isFloatingType(src.scalar_type()) && dstDType == MPSDataTypeUInt8) {
+            inputCastTensor = [mpsGraph castTensor:inputTensor toType:MPSDataTypeInt32 name:@"cast"];
+          }
+          MPSGraphTensor* outputTensor = [mpsGraph castTensor:inputCastTensor toType:dstDType name:@"cast"];
 
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->outputTensor_ = outputTensor;
diff --git a/test/test_mps.py b/test/test_mps.py
index 43b870269ff12..ad1d7e325e535 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8086,8 +8086,10 @@ class TestConsistency(TestCase):
         'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'bmm': ['f32'],
         'broadcast_shapes': ['f32'],
+        'byte': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'ceil': ['f32', 'int32', 'int64', 'f16'],
-        'char': ['b8', 'u8'],
+        'char': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8124,7 +8126,7 @@ class TestConsistency(TestCase):
         'flip': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'fliplr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'flipud': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'float': ['f32'],
+        'float': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'floor': ['f32', 'f16', 'i16', 'i32', 'i64'],
         'floor_divide': ['f32', 'f16'],
         'frac': ['f16', 'f32'],
@@ -8132,10 +8134,10 @@ class TestConsistency(TestCase):
         'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'gradient': ['f16', 'f32', 'i16'],
         'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'half': ['f16'],
+        'half': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'index_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'int': ['i32'],
+        'int': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isclose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isfinite': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8241,7 +8243,7 @@ class TestConsistency(TestCase):
         'scatter_add': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'short': ['i16'],
+        'short': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'],
         'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
@@ -8547,12 +8549,7 @@ class TestConsistency(TestCase):
         # New block list ops that need investigation
         'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], # atan2() may generate NAN in output
-        'byte': ['torch.float16', 'torch.float32'],
-        'char': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64'],
-        'float': ['torch.int64'],
-        'half': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'int': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int64', 'torch.uint8'],
         'nn.functional.adaptive_avg_pool1d': ['torch.float32'],
         'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
         'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'],
@@ -8563,7 +8560,6 @@ class TestConsistency(TestCase):
         'nn.functional.max_pool1d': ['torch.float32'],
         'nn.functional.max_pool2d': ['torch.float32'],
         'nn.functional.upsample_nearest': ['torch.float32'],
-        'short': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 

From f485193656a7862053db7f7fddc3208642563e2e Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 15 Nov 2022 22:10:47 -0500
Subject: [PATCH 1825/1922] Second tensor argument of tensor_split() must be on
 CPU (#177)

---
 test/test_mps.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index ad1d7e325e535..0f88a97080e64 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8263,6 +8263,7 @@ class TestConsistency(TestCase):
         'tan': ['b8', 'i16', 'i32', 'u8'],
         'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'tensordot': ['f32'],
+        'tensor_split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tile': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'topk': ['f32'],
         'trapz': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8471,10 +8472,9 @@ class TestConsistency(TestCase):
         'masked.sum': [torch.bool],
 
         # Functions that hard crash
-        'nn.functional.padreflect': [torch.float32], # negative padding may cause GPU reset
-        'nn.functional.padreplicate': [torch.float32],
         'std': [torch.float16],
-        'stft': [torch.float32], 'var': [torch.float16],
+        'stft': [torch.float32],
+        'var': [torch.float16],
         # + forward when requires_grad=True or running backward
         '__rpow__': [torch.int64],
         'masked.std': [torch.int32],
@@ -8509,6 +8509,7 @@ class TestConsistency(TestCase):
         # locally
         'diag': ['torch.int64'],
         'diagflat': ['torch.int64'],
+        'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 
         # Functions that are flaky
         # These are detected as "ok" by the expect case but actually fail to run sometimes
@@ -8549,7 +8550,6 @@ class TestConsistency(TestCase):
         # New block list ops that need investigation
         'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], # atan2() may generate NAN in output
-        'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.adaptive_avg_pool1d': ['torch.float32'],
         'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
         'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'],
@@ -8560,8 +8560,9 @@ class TestConsistency(TestCase):
         'nn.functional.max_pool1d': ['torch.float32'],
         'nn.functional.max_pool2d': ['torch.float32'],
         'nn.functional.upsample_nearest': ['torch.float32'],
-        'tensor_split': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'nn.functional.padreflect': [torch.float32], # negative padding may cause GPU reset
+        'nn.functional.padreplicate': [torch.float32],
 
         # failures due to lack of op implementation on MPS backend
         'linalg.eig': ['torch.float32'],
@@ -8677,6 +8678,10 @@ def get_samples():
                 mps_args = [mps_sample.input] + list(mps_sample.args)
                 mps_kwargs = mps_sample.kwargs
 
+                # for tensor_split(), the second tensor arg ("tensor_indices_or_sections") must be on CPU only
+                if (op.name == "tensor_split" and isinstance(mps_args[1], torch.Tensor)):
+                    mps_args[1] = cpu_args[1]
+
                 cpu_out = op(*cpu_args, **cpu_kwargs)
                 mps_out = op(*mps_args, **mps_kwargs)
 

From 599a43997d55ab2f366a7f3c7206164d350b9662 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Thu, 17 Nov 2022 10:06:46 -0800
Subject: [PATCH 1826/1922] Fix the Channels last bug with GradientWithInput.
 (#179)

* Fix the Channels last bug with GradientWithInput.

The bug was mentioned in :
https://github.com/pytorch/pytorch/issues/77764#issuecomment-1312241902

* Update the placeholder

* Remove the extra print.
---
 .../ATen/native/mps/operations/Convolution.mm | 29 +++++++++----------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm
index 88bad9a5872a4..636226815ba41 100644
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@@ -216,23 +216,22 @@ Tensor _mps_convolution(
 }
 
 Tensor mps_convolution_backward_input(
-    IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t,
+    IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
   namespace native_mps = at::native::mps;
   using namespace mps;
   CheckedFrom c = "mps_convolution_backward_input";
-  TensorArg grad_output{ grad_output_t, "grad_output", 1 },
-            weight{ weight_t, "weight", 2 };
+  TensorArg grad_output{ grad_output_, "grad_output", 1 },
+            weight{ weight_, "weight", 2 };
   checkAllSameType(c, {grad_output, weight});
   checkAllSameGPU(c, {grad_output, weight});
-  auto memory_format = grad_output_t.suggest_memory_format();
-  auto grad_input_t = at::empty(
-                    input_size,
-                    grad_output->scalar_type(),
-                    c10::nullopt,
-                    kMPS,
-                    c10::nullopt,
-                    c10::nullopt);
+  auto memory_format = grad_output_.suggest_memory_format();
+  bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
+  MPSShape* weightShape = get_mps_conv_shape(weight_, is_channels_last);
+  MPSShape* gradOutputShape = get_mps_conv_shape(grad_output_, is_channels_last);
+  Tensor grad_output_t = grad_output_.contiguous(memory_format);
+  Tensor weight_t = weight_.contiguous(memory_format);
+  auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt);
 
   // Avoid "grad_input" when this is being used as transposed convolution
   TensorArg grad_input{ grad_input_t, "result", 0 };
@@ -291,8 +290,8 @@ Tensor mps_convolution_backward_input(
                                       padding[1], padding[0],
                                       memory_format, groups);
 
-          MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
-          MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
+          MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape);
+          MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(weight_t.scalar_type()), weightShape);
 
           MPSGraphTensor* gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensor
                                                                                             weightsTensor:weightTensor
@@ -309,8 +308,8 @@ Tensor mps_convolution_backward_input(
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
-    auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
+    auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
+    auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t, weightShape);
     auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
 
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{

From ec925fba2525c785c2bf97aca855ca1c530f075d Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 18 Nov 2022 15:02:44 -0500
Subject: [PATCH 1827/1922] Fix cast issue in scatter() with uint8 type  (#178)

* Fix cast issue in scatter() with uint8 type
Also fix float16 for scatter_add()

* Remove the redundant line in scatter()
---
 .../native/mps/operations/ScatterGather.mm    | 55 ++++++++++---------
 test/test_mps.py                              |  2 +-
 2 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ScatterGather.mm b/aten/src/ATen/native/mps/operations/ScatterGather.mm
index 89e306736c3f2..1871e55739235 100644
--- a/aten/src/ATen/native/mps/operations/ScatterGather.mm
+++ b/aten/src/ATen/native/mps/operations/ScatterGather.mm
@@ -173,6 +173,7 @@
     // Do we need to slice into the src tensor?
     bool needSlice = false;
     bool inputNeedSlice = false;
+    bool needsCast = false;
 
     for(uint32_t i = 0; i < num_input_dims; i++) {
       TORCH_CHECK(i == dim || [index_shape[i] intValue] <= [input_shape[i] intValue], "Index dim must not exceed input dim except at gathering axis")
@@ -184,8 +185,12 @@
     }
     TORCH_CHECK(reduce != "mean", "Scatter reduce mean mode not yet supported in MPS")
 
-    bool needsCast = isIntegralType(self.scalar_type(), true) &&
-                     (reduce != "set" || self.scalar_type() == ScalarType::Byte);
+    MPSDataType src_type = getMPSDataType(src.scalar_type());
+    if (reduce != "set" || self.scalar_type() == ScalarType::Byte) {
+      src_type = isFloatingType(src.scalar_type()) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
+      needsCast = true;
+    }
+
     string key = func_name + getTensorsStringKey({self, index, src, output}) + ":" + std::to_string(dim) + ":" + std::string(reduce);
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
@@ -196,55 +201,53 @@
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), input_shape);
-          MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(index.scalar_type()), index_shape);
-          MPSGraphTensor* srcTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(src.scalar_type()), src_shape);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, index);
+          MPSGraphTensor* srcTensor   = mpsGraphRankedPlaceHolder(mpsGraph, src);
 
-          MPSGraphTensor* getSrc = srcTensor;
-          MPSGraphTensor* getInput = inputTensor;
+          MPSGraphTensor* outputTensor = nil;
+          MPSGraphTensor* castSrcTensor = srcTensor;
+          MPSGraphTensor* castInputTensor = inputTensor;
+
+          if (needsCast) {
+            castSrcTensor = [mpsGraph castTensor:srcTensor toType:src_type name:@"cast"];
+            castInputTensor = [mpsGraph castTensor:inputTensor toType:src_type name:@"cast"];
+          }
+          MPSGraphTensor* castIndexTensor = [mpsGraph castTensor:indexTensor toType:MPSDataTypeInt32 name:@"cast"];
+
+          MPSGraphTensor* slicedSrc = castSrcTensor;
+          MPSGraphTensor* slicedInput = castInputTensor;
 
           // Use in case input needs to be smaller to get scatter
-          NSMutableArray<NSNumber*>* scatterInputShape = [NSMutableArray arrayWithArray:input_shape];;
+          NSMutableArray<NSNumber*>* scatterInputShape = [NSMutableArray arrayWithArray:input_shape];
 
-          // Slice into the src tensor IF NEEDED
+          // Slice into the src or input tensors IF NEEDED
           if (needSlice || inputNeedSlice) {
             NSMutableArray<NSNumber*> *starts = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
             NSMutableArray<NSNumber*> *strides = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
             NSMutableArray<NSNumber*> *ends_src = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
 
             for(int i = 0; i < num_input_dims; i++) {
-              // All strides are 1
               strides[i] = @1;
-              // All starts are 0
               starts[i] = @0;
               ends_src[i] = index_shape[i];
               scatterInputShape[i] = (i != dim) ? index_shape[i] : input_shape[i];
             }
             if (needSlice) {
-              getSrc = [mpsGraph sliceTensor:srcTensor
+              slicedSrc = [mpsGraph sliceTensor:castSrcTensor
                                       starts:starts
                                         ends:ends_src
                                      strides:strides
                                         name:nil];
             }
             if (inputNeedSlice) {
-              getInput = [mpsGraph sliceTensor:inputTensor
+              slicedInput = [mpsGraph sliceTensor:castInputTensor
                                         starts:starts
                                           ends:scatterInputShape
                                        strides:strides
                                           name:nil];
             }
           }
-          MPSGraphTensor* outputTensor = nil;
-          MPSGraphTensor* castSrcTensor = getSrc;
-          MPSGraphTensor* castInputTensor = getInput;
-
-          if (needsCast) {
-            castSrcTensor = castMPSTensor(mpsGraph, getSrc, ScalarType::Int);
-            castInputTensor = castMPSTensor(mpsGraph, getInput, ScalarType::Int);
-          }
-          MPSGraphTensor* castIndexTensor = castMPSTensor(mpsGraph, indexTensor, ScalarType::Int);
-
           MPSGraphScatterMode scatter_mode = MPSGraphScatterModeSet;
 
           if(reduce == "sum" || reduce == "add")
@@ -258,8 +261,8 @@
 
           // Scatter this into the input with set mode
           MPSGraphTensor* scatterTensor = [mpsGraph scatterAlongAxis: (NSInteger) dim
-                                                      withDataTensor: castInputTensor
-                                                       updatesTensor: castSrcTensor
+                                                      withDataTensor: slicedInput
+                                                       updatesTensor: slicedSrc
                                                        indicesTensor: castIndexTensor
                                                                 mode: scatter_mode
                                                                 name: nil];
@@ -301,7 +304,7 @@
                                                              withShape:@[@-1]
                                                                   name:nil];
 
-            outputTensor = [mpsGraph scatterNDWithDataTensor:inputTensor
+            outputTensor = [mpsGraph scatterNDWithDataTensor:castInputTensor
                                                updatesTensor:flatValuesTensor
                                                indicesTensor:scatter_fullIndexTensor
                                              batchDimensions:0
diff --git a/test/test_mps.py b/test/test_mps.py
index 0f88a97080e64..14414ad824f92 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8240,7 +8240,7 @@ class TestConsistency(TestCase):
         'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
         'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'scatter_add': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'scatter_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'short': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],

From 38874153370a049d761956349fe6e56743f0951a Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 18 Nov 2022 16:08:43 -0800
Subject: [PATCH 1828/1922] Fallback on gather code to solve view tensors when
 a slice is followed by a reshape (#181)

* Fallback on gather code to solve view tensors when a slice is followed by a reshape

* Remove print from the code
---
 aten/src/ATen/native/mps/OperationUtils.h   |  1 +
 aten/src/ATen/native/mps/OperationUtils.mm  |  7 ++--
 aten/src/ATen/native/mps/operations/View.mm | 41 +++++++++++++++++----
 test/test_mps.py                            |  9 +++++
 4 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 93e45d24c220a..ab446bb856b04 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -50,6 +50,7 @@ std::string getArrayRefString(const IntArrayRef s);
 // use has_storage() on the returned tensor to determine if src actually is a view
 Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst);
 Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output, id<MTLBuffer> updatesBuffer = nil);
+bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape);
 MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType);
 
 MPSShape* getMPSShape(const Tensor& t);
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 016a96ec3ac9f..02ce0982513d8 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -205,8 +205,9 @@ void printTensorNDArray(const Tensor& t) {
   TORCH_CHECK(src.is_mps(), "Placeholder storage has not been allocated on MPS device!");
   // extract the pointer to MTLBuffer from the Tensor's storage
   id<MTLBuffer> srcBuf = getMTLBufferStorage(src);
+  bool sliceViewTensor = canSliceViewTensor(src, mpsShape);
   // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose())
-  if (!src.is_contiguous() && gatherTensorData) {
+  if ((!src.is_contiguous() || (src.is_view() && src.storage_offset() && !sliceViewTensor)) && gatherTensorData) {
      Tensor emptyShell = Tensor();
     // use "_tensor" from Placeholder to retain view's output during its usage in other ops
     _tensor = gatherViewTensor(src, emptyShell);
@@ -225,12 +226,12 @@ void printTensorNDArray(const Tensor& t) {
   const MPSDataType mpsDataType = dataType != MPSDataTypeInvalid ? dataType :
                       _tensor.dim() == 0 ? getMPSScalarType(_tensor.scalar_type()) : getMPSDataType(_tensor.scalar_type());
 
-  if (src.is_view() && src.is_contiguous() && src.storage_offset()) {
+  if (src.is_view() && src.is_contiguous() && src.storage_offset() && sliceViewTensor) {
     _value = getMPSGraphTensorDataForView(src, mpsShape, mpsDataType);
   } else {
     if (!mpsShape) {
       mpsShape = getMPSShape(_tensor);
-    }
+  }
 
     _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf
                                                       shape:mpsShape
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 0828a3b0216b3..83fcb5c6780b4 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -417,23 +417,48 @@
   return outputTensor;
 }
 
-MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType) {
-  IntArrayRef src_base_shape = get_buffer_shape(src.storage().data());
-  std::vector<int64_t> src_view_shape;
+static
+std::vector<int64_t> getViewShape(const Tensor& src, MPSShape *mpsShape) {
   bool hasMPSShape = (mpsShape != nil);
-  int src_ndim_base = src_base_shape.size();
-  int src_ndim_view = 0;
+  std::vector<int64_t> src_view_shape;
   if (hasMPSShape) {
-    src_ndim_view = [mpsShape count];
-    src_view_shape.reserve(src_ndim_view);
+    int src_ndim_view = [mpsShape count];
+    src_view_shape.resize(src_ndim_view);
     for (const auto i : c10::irange(src_ndim_view)) {
       src_view_shape[i] = [mpsShape[i] intValue];
     }
   } else {
-    src_ndim_view = src.dim();
     src_view_shape = src.sizes().vec();
   }
 
+  return src_view_shape;
+}
+
+bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
+  if (!src.is_view()) {
+    return false;
+  }
+
+  IntArrayRef src_base_shape = get_buffer_shape(src.storage().data());
+  int src_ndim_base = src_base_shape.size();
+  std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
+  int src_ndim_view = src_view_shape.size();
+  if (src_ndim_base == src_ndim_view) {
+    for (const auto i : c10::irange(src_ndim_base)) {
+      if (src_view_shape[i] > src_base_shape[i]) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType) {
+  IntArrayRef src_base_shape = get_buffer_shape(src.storage().data());
+  int src_ndim_base = src_base_shape.size();
+  std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
+  int src_ndim_view = src_view_shape.size();
+
   MPSNDArray *srcTensorNDArrayView = nil;
   MPSNDArrayDescriptor *srcTensorNDArrayDesc = nil;
   MPSNDArray *srcTensorNDArray = nil;
diff --git a/test/test_mps.py b/test/test_mps.py
index 14414ad824f92..4a7919304352b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1622,6 +1622,15 @@ def test_cpu_to_strided_mps_copy(self):
 
         self.assertEqual(a1, a2)
 
+    def test_slice_reshape(self):
+        x = torch.randn([1, 6, 4, 2], dtype=torch.float, device="mps")
+        x_cpu = x.detach().clone().to("cpu")
+
+        x = x[:,3:].view(2, 3, 4, 1)
+        x_cpu = x_cpu[:,3:].view(2, 3, 4, 1)
+
+        self.assertEqual(x, x_cpu)
+
     def test_view_slice(self):
         # https://github.com/pytorch/pytorch/issues/83995
         NUM_SAMPLES = 60

From d1f487387c4469d38d99e2a32df580875dd3fce1 Mon Sep 17 00:00:00 2001
From: ssaladis <29134159+ssaladis@users.noreply.github.com>
Date: Wed, 30 Nov 2022 10:10:19 -0800
Subject: [PATCH 1829/1922] fix huberloss for float16 (#185)

---
 aten/src/ATen/native/mps/operations/LossOps.mm | 8 +++++---
 test/test_mps.py                               | 7 +++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm
index 3430af0434dec..be1cfe83d2363 100644
--- a/aten/src/ATen/native/mps/operations/LossOps.mm
+++ b/aten/src/ATen/native/mps/operations/LossOps.mm
@@ -1077,12 +1077,14 @@ void smooth_l1_loss_backward_template(
 
                     MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
                     MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
+
+                    MPSDataType     input_type  = getMPSScalarType(input.scalar_type());
                     MPSGraphTensor* deltaTensor = [mpsGraph constantWithScalar:delta
                                                                              shape:@[@1]
-                                                                          dataType:MPSDataTypeFloat32];
+                                                                          dataType:input_type];
                     MPSGraphTensor* halfTensor = [mpsGraph constantWithScalar:.5f
                                                                              shape:@[@1]
-                                                                          dataType:MPSDataTypeFloat32];
+                                                                          dataType:input_type];
 
                     MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor: inputTensor
                                                                         secondaryTensor: targetTensor
@@ -1211,7 +1213,7 @@ Tensor huber_loss_mps(const Tensor& input, const Tensor& target, int64_t reducti
                                                                               name:nil];
                     MPSGraphTensor* deltaTensor = [mpsGraph constantWithScalar:delta
                                                                          shape:getMPSShape(target)
-                                                                      dataType:MPSDataTypeFloat32];
+                                                                      dataType:getMPSDataType(target.scalar_type())];
                     MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor
                                                                         secondaryTensor:targetTensor
                                                                                    name:nil];
diff --git a/test/test_mps.py b/test/test_mps.py
index 4a7919304352b..4da33744de7d3 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8203,7 +8203,7 @@ class TestConsistency(TestCase):
         'nn.functional.group_norm': ['f32'],
         'nn.functional.hardtanh': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.hinge_embedding_loss': ['f32'],
-        'nn.functional.huber_loss': ['f32'],
+        'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
         'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.l1_loss': ['f16', 'f32'],
@@ -8403,7 +8403,7 @@ class TestConsistency(TestCase):
         'nn.functional.glu': ['f32'],
         'nn.functional.hardtanh': ['f32'],
         'nn.functional.hinge_embedding_loss': ['f32'],
-        'nn.functional.huber_loss': ['f32'],
+        'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
         'nn.functional.kl_div': ['f32'],
         'nn.functional.l1_loss': ['f16', 'f32'],
@@ -8503,7 +8503,6 @@ class TestConsistency(TestCase):
         'nn.functional.conv_transpose1d': [torch.int64],
         'nn.functional.conv_transpose2d': [torch.int64],
         'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
-        'nn.functional.huber_loss': [torch.float16],
         'nn.functional.softplus': [torch.float32],
         'pow': [torch.int64],
         'sigmoid': [torch.int64],
@@ -8697,7 +8696,7 @@ def get_samples():
                 if op.name == "nn.functional.conv2d" and dtype == torch.float32:
                     atol = 1e-4
                     rtol = 3e-5
-                elif (op.name == "add" or op.name == "sub") and dtype == torch.float16:
+                elif (op.name == "add" or op.name == "sub" or op.name == "nn.functional.huber_loss") and dtype == torch.float16:
                     atol = 1e-2
                     rtol = 1e-2
                 else:

From 8e32de47f0abf0e1b7db03ff123f4d423d1b2911 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 1 Dec 2022 11:31:06 -0500
Subject: [PATCH 1830/1922] Fix GPU timeout caused by negative padding in
 backward pass (#184)

Also fix the assert with bool type in constant padding
---
 aten/src/ATen/native/mps/operations/Pad.mm | 97 +++++++++++++++-------
 test/test_mps.py                           | 13 ++-
 2 files changed, 74 insertions(+), 36 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm
index 990f477b389d8..77daa9356dc89 100644
--- a/aten/src/ATen/native/mps/operations/Pad.mm
+++ b/aten/src/ATen/native/mps/operations/Pad.mm
@@ -131,17 +131,34 @@
     grad_output = grad_output_.contiguous();
   }
 
+  const uint32_t dims_mask = (1U << ndims) - 1;
+  uint32_t startMask = dims_mask, endMask = dims_mask;
   std::vector<NSNumber*> leftPadVec(ndims, @(0));
   std::vector<NSNumber*> rightPadVec(ndims, @(0));
-  leftPadVec [ndims - 1] = @(pad_l);
-  rightPadVec[ndims - 1] = @(pad_r);
-  if (padding_dim >= 2) {
-    leftPadVec [ndims - 2] = @(pad_t);
-    rightPadVec[ndims - 2] = @(pad_b);
-  }
-  if (padding_dim >= 3) {
-    leftPadVec [ndims - 3] = @(pad_front);
-    rightPadVec[ndims - 3] = @(pad_back);
+  std::vector<NSNumber*> startsVec(ndims, @(0));
+  std::vector<NSNumber*> endsVec(ndims, @(0));
+  std::vector<NSNumber*> stridesVec(ndims, @(1));
+
+  for (int64_t pdim = 0; pdim < padding_size / 2; pdim++) {
+    const int64_t leftIdx  = pdim * 2;
+    const int64_t rightIdx = pdim * 2 + 1;
+    const int64_t padIdx = ndims - pdim - 1;
+
+    leftPadVec [padIdx] = @(padding[leftIdx]);
+    rightPadVec[padIdx] = @(padding[rightIdx]);
+    // workaround for negative padding issue in backward pass
+    if (is_backward_pass) {
+      if (padding[leftIdx] < 0) {
+        leftPadVec[padIdx] = @(0);
+        startsVec[padIdx] = @(-padding[leftIdx]);
+        startMask &= ~(1U << padIdx);
+      }
+      if (padding[rightIdx] < 0) {
+        rightPadVec[padIdx] = @(0);
+        endsVec[padIdx] = @(input.size(padIdx) + padding[rightIdx]);
+        endMask &= ~(1U << padIdx);
+      }
+    }
   }
   MPSShape *leftPadding  = [NSArray arrayWithObjects:leftPadVec.data() count:ndims];
   MPSShape *rightPadding = [NSArray arrayWithObjects:rightPadVec.data() count:ndims];
@@ -157,33 +174,55 @@
     string key = op_name + getTensorsStringKey({input, grad_output, output}) + ":[" +
                            getArrayRefString(padding) + "]:" + std::to_string(constantValue);
 
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     if(!cachedGraph) {
-      cachedGraph = static_cast<CachedGraph*>(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
         CachedGraph *newCachedGraph = nil;
         @autoreleasepool {
-            MPSGraph* mpsGraph = make_mps_graph();
-            newCachedGraph = new CachedGraph(mpsGraph);
-            newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
-            if (!is_backward_pass) {
-              newCachedGraph->outputTensor = [mpsGraph padTensor:newCachedGraph->inputTensor
-                                                 withPaddingMode:mode
-                                                     leftPadding:leftPadding
-                                                    rightPadding:rightPadding
-                                                   constantValue:constantValue
-                                                            name:nil];
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+          newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
+
+          if (!is_backward_pass) {
+            // workaround for Bool type assert with Constant padding (only needed for forward pass)
+            const bool needsBoolCast = mode == MPSGraphPaddingModeConstant && input.scalar_type() == ScalarType::Bool;
+            MPSGraphTensor *inputTensorCast = !needsBoolCast ? newCachedGraph->inputTensor :
+                                              castMPSTensor(mpsGraph, newCachedGraph->inputTensor, ScalarType::Byte);
+            MPSGraphTensor *outputTensor = [mpsGraph padTensor:inputTensorCast
+                                               withPaddingMode:mode
+                                                   leftPadding:leftPadding
+                                                  rightPadding:rightPadding
+                                                 constantValue:constantValue
+                                                          name:nil];
+            newCachedGraph->outputTensor = needsBoolCast ? castMPSTensor(mpsGraph, outputTensor, ScalarType::Bool) : outputTensor;
+          } else {
+            newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
+            MPSGraphTensor *padGradTensor = [mpsGraph padGradientWithIncomingGradientTensor:newCachedGraph->gradOutputTensor
+                                                                               sourceTensor:newCachedGraph->inputTensor
+                                                                                paddingMode:mode
+                                                                                leftPadding:leftPadding
+                                                                               rightPadding:rightPadding
+                                                                                       name:nil];
+
+            // workaround for negative padding issue with padGradientWithIncomingGradientTensor()
+            const bool needsSliceGradient = startMask != dims_mask || endMask != dims_mask;
+            if (needsSliceGradient) {
+              newCachedGraph->outputTensor = [mpsGraph sliceGradientTensor:padGradTensor
+                                                          fwdInShapeTensor:[mpsGraph shapeOfTensor:newCachedGraph->inputTensor name:nil]
+                                                                    starts:[NSArray arrayWithObjects:startsVec.data()  count:ndims]
+                                                                      ends:[NSArray arrayWithObjects:endsVec.data()    count:ndims]
+                                                                   strides:[NSArray arrayWithObjects:stridesVec.data() count:ndims]
+                                                                 startMask:startMask
+                                                                   endMask:endMask
+                                                               squeezeMask:0
+                                                                      name:nil];
             } else {
-              newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
-              newCachedGraph->outputTensor = [mpsGraph padGradientWithIncomingGradientTensor:newCachedGraph->gradOutputTensor
-                                                                                sourceTensor:newCachedGraph->inputTensor
-                                                                                 paddingMode:mode
-                                                                                 leftPadding:leftPadding
-                                                                                rightPadding:rightPadding
-                                                                                        name:nil];
+              newCachedGraph->outputTensor = padGradTensor;
             }
+          }
         }
         return newCachedGraph;
-      }));
+      });
     }
     Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor, input);
     Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
diff --git a/test/test_mps.py b/test/test_mps.py
index 4da33744de7d3..e4d3498b41705 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4256,6 +4256,8 @@ def helper(shape, padding, op, value=0):
         helper((2, 1, 6, 8), 2, nn.ReplicationPad2d)
         # verify if a change in shape of padding would cause problems with graph caching
         helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ReplicationPad2d)
+        # negative padding
+        helper((1, 3, 4, 4), (-1, 1, -2, 1), nn.ReplicationPad2d)
         # Constant Pad 2D
         helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ConstantPad2d)
         # input size < pad size
@@ -4276,7 +4278,6 @@ def helper(shape, padding, op, value=0):
         # Constant Pad 3D
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
 
-
     # Test stack forward
     def test_stack(self):
         # All shapes must be same
@@ -8108,7 +8109,7 @@ class TestConsistency(TestCase):
         'combinations': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'constant_pad_nd': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'constant_pad_nd': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'corrcoef': ['f32'],
         'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
@@ -8216,6 +8217,9 @@ class TestConsistency(TestCase):
         'nn.functional.normalize': ['f32'],
         'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.padcircular': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.padconstant': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.padreflect': ['f32'],
+        'nn.functional.padreplicate': ['f32'],
         'nn.functional.pairwise_distance': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'],
         'nn.functional.prelu': ['f32'],
@@ -8510,9 +8514,6 @@ class TestConsistency(TestCase):
         'unique': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nonzero': [torch.uint8, torch.float16],
 
-        # ALLOW_LIST doesn't know about variants
-        'nn.functional.padconstant': None,
-
         # These were moved from ALLOWLIST to BLOCK as they are not working
         # locally
         'diag': ['torch.int64'],
@@ -8569,8 +8570,6 @@ class TestConsistency(TestCase):
         'nn.functional.max_pool2d': ['torch.float32'],
         'nn.functional.upsample_nearest': ['torch.float32'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'nn.functional.padreflect': [torch.float32], # negative padding may cause GPU reset
-        'nn.functional.padreplicate': [torch.float32],
 
         # failures due to lack of op implementation on MPS backend
         'linalg.eig': ['torch.float32'],

From d84cf3d6f458fc6607053a2c04f8a477246ae3eb Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Thu, 1 Dec 2022 08:33:19 -0800
Subject: [PATCH 1831/1922] Fix std and var for float16 and float32 (#186)

* Fix std and var for float16 and float32
- fix type mismatch
- add correction parameter to bessel correction calculation
- use unbiased (correction=1) std / var by default

* remove space

Co-authored-by: Ronian <ronian@Ronians-MBP.attlocal.net>
---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 10 +++++-----
 test/test_mps.py                                 |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 0633949674b55..5cf152fefa684 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -744,8 +744,8 @@ Tensor std_var_common_impl_mps(
     }
   }
 
-  bool use_correction = correction.has_value();
-  const auto correction_value = use_correction ? correction.value() : false;
+  bool use_correction = !(correction.has_value() && correction.value() == 0);
+  const auto correction_value = correction.value_or(1);
   int64_t correction_n = 1;
 
   native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
@@ -884,7 +884,7 @@ Tensor std_var_common_impl_mps(
      return output_t;
   }
 
-  double bessel_correction = ((double) correction_n) / ((double) (correction_n-1));
+  double bessel_correction = ((double) correction_n) / ((double) (correction_n-correction_value));
 
   auto stream = at::mps::getCurrentMPSStream();
 
@@ -894,7 +894,7 @@ Tensor std_var_common_impl_mps(
     string bessel_corrected = (use_correction && correction_value) ? "unbiased " : "biased ";
     string use_dim_info = (use_dim) ? "use_dim=1:" + to_string(dim_value.size()) : "use_dim=0";
     string keepdim_info = (keepdim) ? "keepdim=1" : "keepdim=0";
-    string key = op_key + use_dim_info + ":" + keepdim_info + ":" + string([ns_key UTF8String]) + ":" + native_mps::getTensorsStringKey(input_t) + ":" + bessel_corrected;
+    string key = op_key + use_dim_info + ":" + keepdim_info + ":" + string([ns_key UTF8String]) + ":" + native_mps::getTensorsStringKey(input_t) + ":" + bessel_corrected + ":" + std::to_string(correction_value);
 
     auto cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     // Initialize once if configuration not found in cache
@@ -916,7 +916,7 @@ Tensor std_var_common_impl_mps(
           if (use_correction && correction_value)
           {
               MPSGraphTensor *besselTensor= [mpsGraph constantWithScalar:bessel_correction
-                                                    dataType:MPSDataTypeFloat32];
+                                                    dataType: native_mps::getMPSDataType(input_t.scalar_type())];
               MPSGraphTensor *correctedTensor = [mpsGraph multiplicationWithPrimaryTensor: outputVarTensor
                                                                           secondaryTensor: besselTensor
                                                                                      name: nil];
diff --git a/test/test_mps.py b/test/test_mps.py
index e4d3498b41705..950bd009c5ab5 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8300,6 +8300,8 @@ class TestConsistency(TestCase):
         'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.nll_loss': ['f32'],
+        'std': ['f16','f32'],
+        'var': ['f16','f32'],
     }
 
 
@@ -8485,9 +8487,7 @@ class TestConsistency(TestCase):
         'masked.sum': [torch.bool],
 
         # Functions that hard crash
-        'std': [torch.float16],
         'stft': [torch.float32],
-        'var': [torch.float16],
         # + forward when requires_grad=True or running backward
         '__rpow__': [torch.int64],
         'masked.std': [torch.int32],

From 6ccf197793bf1526c4487c8e822a09d7d97d68fa Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Mon, 5 Dec 2022 19:42:42 -0500
Subject: [PATCH 1832/1922] Fix correctness issues with Upsample 1D and 2D
 (#183)

* Fix correctness issues with Upsample 1D and 2D
- Implemented following new ops:
upsample_nearest1d_backward
upsample_nearest_exact1d
upsample_nearest_exact1d_backward
- Moved Upsample code from Shape.mm to Upsample.mm

* Corrections on some comments

* Fix the leak in cat_out_mps and clean up
---
 aten/src/ATen/mps/MPSFallback.mm              |   1 +
 aten/src/ATen/native/mps/operations/Shape.mm  | 387 ++----------------
 .../ATen/native/mps/operations/UpSample.mm    | 384 +++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |   3 +
 test/test_mps.py                              |  71 ++--
 5 files changed, 456 insertions(+), 390 deletions(-)
 create mode 100644 aten/src/ATen/native/mps/operations/UpSample.mm

diff --git a/aten/src/ATen/mps/MPSFallback.mm b/aten/src/ATen/mps/MPSFallback.mm
index e5dfde1d274cc..69dd47f9c145f 100644
--- a/aten/src/ATen/mps/MPSFallback.mm
+++ b/aten/src/ATen/mps/MPSFallback.mm
@@ -62,6 +62,7 @@ Tensor slow_conv2d_forward_mps(
   m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("sgn.out", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps);
+  m.impl("upsample_nearest3d.vec", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
 }
 
 } // namespace at
diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm
index f491f2ff823ad..858767007c6db 100644
--- a/aten/src/ATen/native/mps/operations/Shape.mm
+++ b/aten/src/ATen/native/mps/operations/Shape.mm
@@ -1,18 +1,10 @@
 //  Copyright © 2022 Apple Inc.
 
-#include <ATen/ATen.h>
 #include <ATen/MemoryOverlap.h>
-#include <ATen/Tensor.h>
-#include <ATen/TensorUtils.h>
-#include <ATen/Utils.h>
 #include <ATen/WrapDimUtils.h>
-#include <ATen/mps/MPSStream.h>
 #include <ATen/native/TypeProperties.h>
 #include <ATen/native/TensorShape.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <c10/core/MemoryFormat.h>
-#include <c10/util/Optional.h>
-#include <torch/library.h>
 
 namespace at {
 namespace native {
@@ -209,13 +201,6 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
   return format.value();
 }
 
-//Tensor cat_mps(TensorList inputs, int64_t dimension) {
-  //ScalarType high_type = result_type(inputs);
-  //Tensor out = at::empty({0}, inputs.front().options().dtype(high_type));
-  //at::native::cat_out_mps(inputs, dimension, out);
-  //return out;
-//}
-
 TORCH_IMPL_FUNC(cat_out_mps)
       (const ITensorListRef& inputs,
        int64_t dimension,
@@ -251,7 +236,6 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
     return t.dim() == 1 && at::native::size(t, 0) == 0;
   };
 
-  const Tensor* notSkippedTensor = NULL; // non-owning reference
 
   // Check for type promotion
   TORCH_CHECK(
@@ -275,10 +259,11 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
   }
   at::assert_no_internal_overlap(out);
 
+  Tensor notSkippedTensor;
   // Indices of tensors to be skipped because they're empty
   std::vector<int64_t> skipped_tensor_indices;
   // Tensors to be read
-  std::vector<const Tensor*> input_tensors;
+  std::vector<Tensor> input_tensors;
   int tensor_idx = 0;
   for(const Tensor& t : materialized_inputs) {
     if(t.numel() == 0 || should_skip(t)) {
@@ -286,14 +271,14 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
       tensor_idx++;
       continue;
     }
-    input_tensors.push_back(&t);
+    input_tensors.push_back(t);
     // TODO: Is this OK?
-    notSkippedTensor = &t;
+    notSkippedTensor = t;
     tensor_idx++;
   }
 
   // If all inputs are empty tensors, return an empty tensor
-  if (notSkippedTensor == NULL) {
+  if (!notSkippedTensor.defined()) {
     return;
   }
 
@@ -305,17 +290,17 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
 
   for (const Tensor& t : inputs) {
     TORCH_CHECK(
-        t.device() == notSkippedTensor->device(),
+        t.device() == notSkippedTensor.device(),
         "torch.cat(): all input tensors must be on the same device. Received ",
         t.device(),
         " and ",
-        notSkippedTensor->device());
+        notSkippedTensor.device());
   }
 
   TORCH_CHECK(
-      out.device() == notSkippedTensor->device(),
+      out.device() == notSkippedTensor.device(),
       "torch.cat(): all input tensors and out must be on the same device, but inputs are on ",
-      notSkippedTensor->device(),
+      notSkippedTensor.device(),
       " and out is on ",
       out.device());
 
@@ -323,7 +308,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
   // // TODO: Factor out `compute_output_memory_format`
   // c10::MemoryFormat memory_format = compute_output_memory_format(inputs);
 
-  std::vector<int64_t> size(notSkippedTensor->sizes().vec());
+  std::vector<int64_t> size(notSkippedTensor.sizes().vec());
 
   // Compute size of the result in the cat dimension
   int64_t cat_dim_size = 0;
@@ -333,7 +318,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
       continue;
     }
     // TODO: Factor out `check_shape_except_dim`
-    check_shape_except_dim(*notSkippedTensor, tensor, dimension, idx);
+    check_shape_except_dim(notSkippedTensor, tensor, dimension, idx);
     cat_dim_size += at::native::size(tensor, dimension);
     idx++;
   }
@@ -356,8 +341,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
   struct CachedGraph : public MPSCachedGraph
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-    // TODO: Free this when no longer needed globally
-    MPSGraphTensor** inputMPSGraphTensors_ = nil;
+    std::vector<MPSGraphTensor*> inputTensors_;
     MPSGraphTensor* outputTensor_ = nil;
   };
 
@@ -383,49 +367,34 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
         CachedGraph *newCachedGraph = nil;
 
         @autoreleasepool {
-          // Initialize graph
           MPSGraph *mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          // Create placeholders
           auto len_tensor_array = inputs.size() - skipped_tensor_indices.size();
-          std::vector<MPSGraphTensor*> inputMPSGraphTensors(len_tensor_array);
-          std::vector<MPSGraphTensor*> castInputMPSGraphTensors(len_tensor_array);
-
-          int graph_tensor_idx = 0;
-          for(const Tensor* tensor : input_tensors) {
-            inputMPSGraphTensors[graph_tensor_idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(tensor->scalar_type()) );
-            if(getMPSDataType(result_type(inputs)) == MPSDataTypeBool) {
-              castInputMPSGraphTensors[graph_tensor_idx] = [mpsGraph castTensor:inputMPSGraphTensors[graph_tensor_idx]
-                                                                           toType:MPSDataTypeFloat32
-                                                                             name:[NSString stringWithFormat:@"castInput%@", [NSNumber numberWithInt:graph_tensor_idx]]];
-            }
-            else {
-              if(tensor->scalar_type() != result_type(inputs))
-                castInputMPSGraphTensors[graph_tensor_idx] = [mpsGraph castTensor:inputMPSGraphTensors[graph_tensor_idx]
-                                                                           toType:getMPSDataType(result_type(inputs))
-                                                                             name:[NSString stringWithFormat:@"castInput%@", [NSNumber numberWithInt:graph_tensor_idx]]];
-              else
-                castInputMPSGraphTensors[graph_tensor_idx] = inputMPSGraphTensors[graph_tensor_idx];
+          std::vector<MPSGraphTensor*> castInputTensors(len_tensor_array);
+          newCachedGraph->inputTensors_.reserve(len_tensor_array);
+
+          for (const auto idx : c10::irange(len_tensor_array)) {
+            newCachedGraph->inputTensors_[idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input_tensors[idx].scalar_type()));
+            if (input_tensors[idx].scalar_type() != result_type(inputs)) {
+              castInputTensors[idx] = [mpsGraph castTensor:newCachedGraph->inputTensors_[idx]
+                                                    toType:getMPSDataType(result_type(inputs))
+                                                      name:@"castInput"];
+            } else {
+              castInputTensors[idx] = newCachedGraph->inputTensors_[idx];
             }
-            graph_tensor_idx++;
           }
 
-          auto inputTensorsArray = [NSArray arrayWithObjects:castInputMPSGraphTensors.data()
+          auto inputTensorsArray = [NSArray arrayWithObjects:castInputTensors.data()
                                                        count:len_tensor_array];
-          // Use concatTensors to concatenate
           MPSGraphTensor* outputTensor = [mpsGraph concatTensors:inputTensorsArray
                                                        dimension:dimension // Maybe convert this from int64_t -> int32
                                                             name:nil];
-
-          newCachedGraph->inputMPSGraphTensors_ = (MPSGraphTensor**)malloc(len_tensor_array * sizeof(MPSGraphTensor*));
-
-          for(int i = 0; i < len_tensor_array; i++)
-            newCachedGraph->inputMPSGraphTensors_[i] = inputMPSGraphTensors[i];
-          if(getMPSDataType(result_type(inputs)) == MPSDataTypeBool)
+          if(getMPSDataType(result_type(inputs)) == MPSDataTypeBool) {
             outputTensor = [mpsGraph castTensor:outputTensor
                                          toType:MPSDataTypeBool
                                            name:@"outputTensor"];
+          }
           newCachedGraph->outputTensor_ = outputTensor;
         }
         return newCachedGraph;
@@ -438,7 +407,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
     int t_idx = 0;
     for(const Tensor& tensor : materialized_inputs) {
       if(std::find(skipped_tensor_indices.begin(), skipped_tensor_indices.end(), i) == skipped_tensor_indices.end()) {
-        Placeholder currentInputPlaceholder = Placeholder(cachedGraph->inputMPSGraphTensors_[t_idx], tensor);
+        Placeholder currentInputPlaceholder = Placeholder(cachedGraph->inputTensors_[t_idx], tensor);
         inputPlaceholders.push_back(currentInputPlaceholder);
         t_idx++;
       }
@@ -460,307 +429,5 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
 
 }
 
-void upsample_backward_out_mps(const Tensor& grad_output,
-                               IntArrayRef output_size,
-                               IntArrayRef input_size,
-                               c10::optional<double> scales_h,
-                               c10::optional<double> scales_w,
-                               const Tensor& grad_input,
-                               MPSGraphResizeMode requested_mode,
-                               bool requested_align_corners
-                               )
-{
-    using namespace mps;
-    int64_t input_dims = input_size.size();
-
-    TORCH_CHECK((input_dims == 4),
-            "NCHW tensor expected for input");
-
-    struct CachedGraph : public MPSCachedGraph {
-        CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-        MPSGraphTensor *gradInputTensor = nil, *gradOutputTensor = nil;
-    };
-    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-    /* sizes */
-    int64_t output_height = output_size[0];
-    int64_t output_width = output_size[1];
-
-    int64_t input_n = input_size[0];
-    int64_t input_c = input_size[1];
-    int64_t input_height = input_size[2];
-    int64_t input_width = input_size[3];
-
-    @autoreleasepool {
-      MPSShape* output_shape = getMPSShape(grad_output);
-      string key = string("upsample_backward:") + mps::getMPSShapeString(output_shape) + ":" +
-                             getMPSTypeString(grad_output.scalar_type()) +
-                             ":oh" + to_string(output_height) + ":ow" + to_string(output_width) +
-                             ":ih" + to_string(input_height) + ":iw" + to_string(input_width) +
-                             ":mode" + to_string(requested_mode);
-
-      CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-      if(!cachedGraph) {
-        cachedGraph = static_cast<CachedGraph*>(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
-
-          CachedGraph *newCachedGraph = nil;
-          @autoreleasepool {
-              MPSGraph* mpsGraph = make_mps_graph();
-              newCachedGraph = new CachedGraph(mpsGraph);
-
-              newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(grad_input.scalar_type()), output_shape);
-              MPSGraphTensor * shapeTensor = [mpsGraph constantWithScalar:0
-                                                                    shape:@[[NSNumber numberWithLong: input_n],
-                                                                            [NSNumber numberWithLong: input_c],
-                                                                            [NSNumber numberWithLong:input_height],
-                                                                            [NSNumber numberWithLong:input_width]]
-                                                                 dataType:getMPSDataType(grad_output.scalar_type())];
-
-              newCachedGraph->gradInputTensor  = [mpsGraph resizeWithGradientTensor: newCachedGraph->gradOutputTensor
-                                                                           input: shapeTensor
-                                                                            mode: requested_mode
-                                                                    centerResult: true
-                                                                    alignCorners: requested_align_corners
-                                                                        layout: MPSGraphTensorNamedDataLayoutNCHW
-                                                                            name: nil];
-
-          }
-          return newCachedGraph;
-        }));
-      }
-      Placeholder gradOutputPlaceholder  = Placeholder(cachedGraph->gradOutputTensor, grad_output);
-      Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor, grad_input);
-
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-          gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
-      };
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-          gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
-      };
-      runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
-    }
-}
-
-TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_mps) (
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    const Tensor& grad_input)
-{
-    upsample_backward_out_mps(grad_output, output_size, input_size, scales_h, scales_w, grad_input, MPSGraphResizeNearest, false);
-}
-
-TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_mps) (
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    const Tensor& grad_input)
-{
-    upsample_backward_out_mps(grad_output, output_size, input_size, scales_h, scales_w, grad_input, MPSGraphResizeNearest, false);
-}
-
-TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_mps) (
-    const Tensor& grad_output,
-    IntArrayRef output_size,
-    IntArrayRef input_size,
-    bool align_corners,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    const Tensor& grad_input)
-{
-    upsample_backward_out_mps(grad_output, output_size, input_size, scales_h, scales_w, grad_input, MPSGraphResizeBilinear, align_corners);
-}
-
-void upsample_out_mps(const Tensor& input,
-                      IntArrayRef output_size,
-                      c10::optional<double> scales_h,
-                      c10::optional<double> scales_w,
-                      const Tensor& output,
-                      MPSGraphResizeMode requested_mode,
-                      bool requested_align_corners)
-{
-    // Get stream
-    using namespace mps;
-    struct CachedGraph : public MPSCachedGraph {
-        CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-        MPSGraphTensor *inputTensor = nil, *outputTensor = nil;
-    };
-    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
-    /* sizes */
-    int64_t output_height = output_size[0];
-    int64_t output_width = output_size[1];
-    @autoreleasepool {
-      MPSShape* input_shape = getMPSShape(input);
-      string key = string("upsample_2d:") + mps::getMPSShapeString(input_shape) + ":" +
-                             getMPSTypeString(input.scalar_type()) +
-                             ":h" + to_string(output_height) + ":w" + to_string(output_width) +
-                             ":mode" + to_string(requested_mode);
-
-      CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-      if(!cachedGraph) {
-        cachedGraph = static_cast<CachedGraph*>(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
-
-          CachedGraph *newCachedGraph = nil;
-
-          @autoreleasepool {
-              MPSGraph* mpsGraph = make_mps_graph();
-              newCachedGraph = new CachedGraph(mpsGraph);
-
-              newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape);
-              newCachedGraph->outputTensor = [mpsGraph resizeTensor:newCachedGraph->inputTensor
-                                                               size:@[ @(output_height), @(output_width)]
-                                                               mode:requested_mode
-                                                               centerResult: true
-                                                               alignCorners: requested_align_corners
-                                                               layout: MPSGraphTensorNamedDataLayoutNCHW
-                                                               name:nil];
-          }
-          return newCachedGraph;
-        }));
-      }
-      Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor, input);
-      Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
-
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-          inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-      };
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-          outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-      };
-      runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
-    }
-}
-
-TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_mps) (
-    const Tensor& input,
-    IntArrayRef output_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    const Tensor& output)
-{
-    // Note: this differs from the CPU implementation in the way
-    // ties are resolved wrt to nearest mostly in cases where the scale
-    // is not an integer.
-    // Example:
-    // For upsampling from (2, 5) to (2, 16)
-    // MPS:
-    // tensor([[[[0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 3., 3., 3., 4., 4., 4.],
-    // [5., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]])
-    // CPU:
-    // tensor([[[[0., 0., 0., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 4., 4., 4.],
-    // [5., 5., 5., 6., 6., 6., 7., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]])
-    using namespace mps;
-    upsample_out_mps(input, output_size, scales_h, scales_w, output, MPSGraphResizeNearest, false);
-}
-
-
-TORCH_IMPL_FUNC(upsample_nearest2d_out_mps) (
-    const Tensor& input,
-    IntArrayRef output_size,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    const Tensor& output)
-{
-    // Note: this differs from the CPU implementation in the way
-    // ties are resolved wrt to nearest mostly in cases where the scale
-    // is not an integer.
-    // Example:
-    // For upsampling from (2, 5) to (2, 16)
-    // MPS:
-    // tensor([[[[0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 3., 3., 3., 4., 4., 4.],
-    // [5., 5., 5., 5., 6., 6., 6., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]])
-    // CPU:
-    // tensor([[[[0., 0., 0., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 4., 4., 4.],
-    // [5., 5., 5., 6., 6., 6., 7., 7., 7., 7., 8., 8., 8., 9., 9., 9.]]]])
-    using namespace mps;
-    upsample_out_mps(input, output_size, scales_h, scales_w, output, MPSGraphResizeNearest, false);
-}
-
-TORCH_IMPL_FUNC(upsample_bilinear2d_out_mps) (
-    const Tensor& input,
-    IntArrayRef output_size,
-    bool align_corners,
-    c10::optional<double> scales_h,
-    c10::optional<double> scales_w,
-    const Tensor& output)
-{
-    using namespace mps;
-    upsample_out_mps(input, output_size, scales_h, scales_w, output, MPSGraphResizeBilinear, align_corners);
-}
-
-void upsample1d_out_mps(const Tensor& input,
-                      IntArrayRef output_size,
-                      c10::optional<double> scales,
-                      const Tensor& output,
-                      MPSGraphResizeMode requested_mode)
-{
-    // Get stream
-    using namespace mps;
-    using CachedGraph = MPSUnaryCachedGraph;
-    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
-    /* sizes */
-    int64_t out_size = output_size[0];
-    @autoreleasepool {
-      MPSShape* input_shape = getMPSShape(input);
-      string key = string("upsample_1d:") + mps::getMPSShapeString(input_shape) + ":" +
-                             getMPSTypeString(input.scalar_type()) +
-                             ":size" + to_string(out_size) +
-                             ":mode" + to_string(requested_mode);
-
-      CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
-      if(!cachedGraph) {
-        cachedGraph = static_cast<CachedGraph*>(cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
-
-          CachedGraph *newCachedGraph = nil;
-
-          @autoreleasepool {
-              MPSGraph* mpsGraph = make_mps_graph();
-              newCachedGraph = new CachedGraph(mpsGraph);
-
-              newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), input_shape);
-              newCachedGraph->outputTensor_ = [mpsGraph resizeTensor:newCachedGraph->inputTensor_
-                                                               size:@[ @(out_size), @(1)]
-                                                               mode:requested_mode
-                                                               centerResult: true
-                                                               alignCorners: true
-                                                               layout: MPSGraphTensorNamedDataLayoutCHW
-                                                               name:nil];
-          }
-          return newCachedGraph;
-        }));
-      }
-      Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor_, input);
-      Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
-
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-          inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-      };
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-          outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-      };
-      runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
-    }
-}
-
-
-TORCH_IMPL_FUNC(upsample_nearest1d_out_mps) (
-    const Tensor& input,
-    IntArrayRef output_size,
-    c10::optional<double> scales,
-    const Tensor& output)
-{
-    using namespace mps;
-    upsample1d_out_mps(input, output_size, scales, output, MPSGraphResizeNearest);
-}
-
-
-
-
-
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm
new file mode 100644
index 0000000000000..487fd88d6df9f
--- /dev/null
+++ b/aten/src/ATen/native/mps/operations/UpSample.mm
@@ -0,0 +1,384 @@
+//  Copyright © 2022 Apple Inc.
+
+#include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/UpSample.h>
+
+namespace at {
+namespace native {
+namespace mps {
+
+// Upsampling operations (1D/2D forward and backward)
+// supported resize_mode: 'nearest' | 'bilinear' | 'nearest-exact'
+void upsample_out_template(const Tensor& input,
+                           IntArrayRef output_size,
+                           c10::optional<IntArrayRef> input_size_opt, // only used for backward pass
+                           c10::optional<double> scale_h_opt,
+                           c10::optional<double> scale_w_opt,
+                           const Tensor& output,
+                           bool align_corners,
+                           const c10::string_view resize_mode_str)
+{
+  if (input.numel() == 0)
+    return;
+
+  const auto input_dim  = input.sizes();
+  if (input_dim.size() <= 3)
+    native::upsample_1d_common_check(input.sizes(), output_size);
+  else
+    native::upsample_2d_common_check(input.sizes(), output_size);
+
+  bool centerResults = false;
+  MPSGraphResizeMode resizeMode = MPSGraphResizeNearest;
+  MPSGraphResizeNearestRoundingMode nearestRoundingMode = MPSGraphResizeNearestRoundingModeFloor;
+  MPSGraphTensorNamedDataLayout dataLayout = input_dim.size() > 3 ?
+                                             MPSGraphTensorNamedDataLayoutNCHW :
+                                             MPSGraphTensorNamedDataLayoutCHW;
+  if (resize_mode_str == "nearest") {
+    resizeMode = MPSGraphResizeNearest;
+  } else if (resize_mode_str == "bilinear") {
+    resizeMode = MPSGraphResizeBilinear;
+    centerResults = true;
+  } else if (resize_mode_str == "nearest-exact") {
+    centerResults = true;
+    nearestRoundingMode = MPSGraphResizeNearestRoundingModeRoundPreferCeil;
+  } else {
+    AT_ERROR("Unsupported resize mode ", resize_mode_str);
+  }
+
+  const bool is_macOS_13_0_or_newer = MPSDevice::getInstance()->macOS_13_0_or_newer();
+  const int64_t output_width  = output_size.size() > 1 ? output_size[1] : output_size[0];
+  const int64_t output_height = output_size.size() > 1 ? output_size[0] : 1;
+  const float scale_w = (scale_w_opt.has_value() && scale_w_opt.value() > 0.) ? static_cast<float>(scale_w_opt.value()) : 0.;
+  const float scale_h = (scale_h_opt.has_value() && scale_h_opt.value() > 0.) ? static_cast<float>(scale_h_opt.value()) : 1.;
+  const float offset_y = centerResults ? (scale_h - 1.0f) / 2.0f : 0.0f;
+  const float offset_x = centerResults ? (scale_w - 1.0f) / 2.0f : 0.0f;
+
+  IntArrayRef input_size;
+  const bool is_backward_pass = input_size_opt.has_value();
+  if (is_backward_pass)
+    input_size = input_size_opt.value();
+
+  struct CachedGraph : public MPSCachedGraph {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor *inputTensor = nil, *outputTensor = nil;
+    MPSGraphTensor *outputSizeTensor = nil;
+  };
+  MPSStream* stream = getCurrentMPSStream();
+
+  @autoreleasepool {
+    string key = "upsample_" + std::string(resize_mode_str) + (align_corners ? "_aligned_corners" : "") +
+                 getTensorsStringKey({input}) + ":[" + to_string(scale_h) + "," + to_string(scale_w) + "]:[" +
+                 (is_backward_pass ? getArrayRefString(input_size) : "Undefined") + "]";
+
+    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
+    if(!cachedGraph) {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
+        CachedGraph *newCachedGraph = nil;
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
+          newCachedGraph->outputSizeTensor = mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[@(2)]);
+
+          MPSGraphTensor* scaleOffsetTensor = nullptr;
+          MPSGraphTensor* inputSizeTensor = nullptr;
+
+          if (scale_w > 0.0) {
+            const float outScales[4] = {scale_h, scale_w, offset_y, offset_x};
+            scaleOffsetTensor = [mpsGraph constantWithData: [NSData dataWithBytes: outScales length: sizeof(outScales)]
+                                                     shape: @[@4]
+                                                  dataType: MPSDataTypeFloat32];
+          }
+          if (is_backward_pass) {
+            std::vector<NSNumber*> inputSizeVec(4);
+            inputSizeVec[0] = @(input_size[0]);
+            inputSizeVec[1] = @(input_size[1]);
+            inputSizeVec[2] = @(input_size[2]);
+            inputSizeVec[3] = @(input_dim.size() > 3 ? input_size[3] : 1);
+            inputSizeTensor = [mpsGraph constantWithScalar: 0
+                                                     shape: [NSArray arrayWithObjects:inputSizeVec.data() count:input_dim.size()]
+                                                  dataType: getMPSDataType(input.scalar_type())];
+          }
+          if (is_macOS_13_0_or_newer) {
+            if (!is_backward_pass) {
+              if (scaleOffsetTensor && !align_corners) {
+                if (resizeMode == MPSGraphResizeNearest) {
+                  newCachedGraph->outputTensor = [mpsGraph resizeNearestWithTensor: newCachedGraph->inputTensor
+                                                                        sizeTensor: newCachedGraph->outputSizeTensor
+                                                                 scaleOffsetTensor: scaleOffsetTensor
+                                                               nearestRoundingMode: nearestRoundingMode
+                                                                            layout: dataLayout
+                                                                              name: nil];
+                } else { // bilinear forward
+                  newCachedGraph->outputTensor = [mpsGraph resizeBilinearWithTensor: newCachedGraph->inputTensor
+                                                                         sizeTensor: newCachedGraph->outputSizeTensor
+                                                                  scaleOffsetTensor: scaleOffsetTensor
+                                                                             layout: dataLayout
+                                                                               name: nil];
+                }
+              } else { // scaleOffsetTensor == nil || align_corners
+                if (resizeMode == MPSGraphResizeNearest) {
+                  newCachedGraph->outputTensor = [mpsGraph resizeNearestWithTensor: newCachedGraph->inputTensor
+                                                                        sizeTensor: newCachedGraph->outputSizeTensor
+                                                               nearestRoundingMode: nearestRoundingMode
+                                                                      centerResult: centerResults
+                                                                      alignCorners: align_corners
+                                                                            layout: dataLayout
+                                                                              name: nil];
+                } else { // bilinear forward
+                  newCachedGraph->outputTensor = [mpsGraph resizeBilinearWithTensor: newCachedGraph->inputTensor
+                                                                         sizeTensor: newCachedGraph->outputSizeTensor
+                                                                       centerResult: centerResults
+                                                                       alignCorners: align_corners
+                                                                             layout: dataLayout
+                                                                               name: nil];
+                }
+              }
+            } else { // is_backward_pass == true
+              if (scaleOffsetTensor && !align_corners) {
+                if (resizeMode == MPSGraphResizeNearest) {
+                  newCachedGraph->outputTensor = [mpsGraph resizeNearestWithGradientTensor: newCachedGraph->inputTensor
+                                                                                     input: inputSizeTensor
+                                                                         scaleOffsetTensor: scaleOffsetTensor
+                                                                       nearestRoundingMode: nearestRoundingMode
+                                                                                    layout: dataLayout
+                                                                                      name: nil];
+                } else { // bilinear backward
+                  newCachedGraph->outputTensor = [mpsGraph resizeBilinearWithGradientTensor: newCachedGraph->inputTensor
+                                                                                      input: inputSizeTensor
+                                                                          scaleOffsetTensor: scaleOffsetTensor
+                                                                                     layout: dataLayout
+                                                                                       name: nil];
+                }
+              } else { // scaleOffsetTensor == nil || align_corners
+                if (resizeMode == MPSGraphResizeNearest) {
+                  newCachedGraph->outputTensor = [mpsGraph resizeNearestWithGradientTensor: newCachedGraph->inputTensor
+                                                                                     input: inputSizeTensor
+                                                                       nearestRoundingMode: nearestRoundingMode
+                                                                              centerResult: centerResults
+                                                                              alignCorners: align_corners
+                                                                                    layout: dataLayout
+                                                                                      name: nil];
+                } else { // bilinear backward
+                  newCachedGraph->outputTensor = [mpsGraph resizeBilinearWithGradientTensor: newCachedGraph->inputTensor
+                                                                                      input: inputSizeTensor
+                                                                               centerResult: centerResults
+                                                                               alignCorners: align_corners
+                                                                                     layout: dataLayout
+                                                                                       name: nil];
+                }
+              }
+            }
+          } else { // if macOS version < 13.0 (for backwards compatibility)
+            if (!is_backward_pass) {
+              newCachedGraph->outputTensor = [mpsGraph resizeTensor: newCachedGraph->inputTensor
+                                                         sizeTensor: newCachedGraph->outputSizeTensor
+                                                               mode: resizeMode
+                                                       centerResult: YES
+                                                       alignCorners: align_corners
+                                                             layout: dataLayout
+                                                               name: nil];
+            } else {
+              newCachedGraph->outputTensor = [mpsGraph resizeWithGradientTensor: newCachedGraph->inputTensor
+                                                                          input: inputSizeTensor
+                                                                           mode: resizeMode
+                                                                   centerResult: YES
+                                                                   alignCorners: align_corners
+                                                                         layout: dataLayout
+                                                                           name: nil];
+            }
+          }
+        }
+        return newCachedGraph;
+      });
+    }
+    MPSNDArrayDescriptor *sizeDesc = [MPSNDArrayDescriptor descriptorWithDataType: MPSDataTypeInt32 shape: @[@(2)]];
+    MPSNDArray *sizeNDArray = [[[MPSNDArray alloc] initWithDevice: stream->device() descriptor: sizeDesc] autorelease];
+    [sizeNDArray writeBytes: (int32_t[]) {(int32_t)output_height, (int32_t)output_width} strideBytes: nil];
+    MPSGraphTensorData* sizeTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: sizeNDArray] autorelease];
+
+    Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor, input);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+        inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
+        cachedGraph->outputSizeTensor : sizeTensorData,
+    };
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+        outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+}
+
+} // namespace mps
+
+static bool check_mps_compatibility(c10::optional<double> scale)
+{
+  static const bool is_macOS_13_0_or_newer = MPSDevice::getInstance()->macOS_13_0_or_newer();
+  // passing scale factors to MPS's resize APIs is not supported on macOS < 13
+  if (!is_macOS_13_0_or_newer && scale.has_value() && scale.value() > 0.) {
+    TORCH_WARN_ONCE("MPS: passing scale factor to upsample ops is supported natively starting from macOS 13.0. ",
+                    "Falling back on CPU. This may have performance implications.");
+    return false;
+  }
+  return true;
+}
+
+TORCH_IMPL_FUNC(upsample_nearest1d_out_mps) (
+    const Tensor& input,
+    IntArrayRef output_size,
+    c10::optional<double> scale,
+    const Tensor& output)
+{
+  if (check_mps_compatibility(scale)) {
+    mps::upsample_out_template(input, output_size, c10::nullopt, c10::nullopt, scale, output, false, "nearest");
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    const_cast<Tensor&>(output) = at::upsample_nearest1d(input.to("cpu"), output_size, scale).clone().to("mps");
+  }
+}
+
+TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_mps) (
+    const Tensor& grad_output,
+    IntArrayRef output_size,
+    IntArrayRef input_size,
+    c10::optional<double> scale,
+    const Tensor& grad_input)
+{
+  if (check_mps_compatibility(scale)) {
+    mps::upsample_out_template(grad_output, output_size, input_size, c10::nullopt, scale, grad_input, false, "nearest");
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    const_cast<Tensor&>(grad_input) = at::upsample_nearest1d_backward(grad_output.to("cpu"), output_size, input_size, scale).clone().to("mps");
+  }
+}
+
+TORCH_IMPL_FUNC(_upsample_nearest_exact1d_out_mps) (
+    const Tensor& input,
+    IntArrayRef output_size,
+    c10::optional<double> scale,
+    const Tensor& output)
+{
+  if (check_mps_compatibility(scale)) {
+    mps::upsample_out_template(input, output_size, c10::nullopt, c10::nullopt, scale, output, false, "nearest-exact");
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    const_cast<Tensor&>(output) = at::_upsample_nearest_exact1d(input.to("cpu"), output_size, scale).clone().to("mps");
+  }
+}
+
+TORCH_IMPL_FUNC(_upsample_nearest_exact1d_backward_out_mps) (
+    const Tensor& grad_output,
+    IntArrayRef output_size,
+    IntArrayRef input_size,
+    c10::optional<double> scale,
+    const Tensor& grad_input)
+{
+  if (check_mps_compatibility(scale)) {
+    mps::upsample_out_template(grad_output, output_size, input_size, c10::nullopt, scale, grad_input, false, "nearest-exact");
+  } else {
+   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+   const_cast<Tensor&>(grad_input) = at::_upsample_nearest_exact1d_backward(grad_output.to("cpu"), output_size, input_size, scale).clone().to("mps");
+  }
+}
+
+TORCH_IMPL_FUNC(upsample_nearest2d_out_mps) (
+    const Tensor& input,
+    IntArrayRef output_size,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w,
+    const Tensor& output)
+{
+  if (check_mps_compatibility(scales_w)) {
+    mps::upsample_out_template(input, output_size, c10::nullopt, scales_h, scales_w, output, false, "nearest");
+  } else {
+   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+   const_cast<Tensor&>(output) = at::upsample_nearest2d(input.to("cpu"), output_size, scales_h, scales_w).clone().to("mps");
+  }
+}
+
+TORCH_IMPL_FUNC(upsample_nearest2d_backward_out_mps) (
+    const Tensor& grad_output,
+    IntArrayRef output_size,
+    IntArrayRef input_size,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w,
+    const Tensor& grad_input)
+{
+  if (check_mps_compatibility(scales_w)) {
+    mps::upsample_out_template(grad_output, output_size, input_size, scales_h, scales_w, grad_input, false, "nearest");
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    const_cast<Tensor&>(grad_input) = at::upsample_nearest2d_backward(grad_output.to("cpu"), output_size, input_size, scales_h, scales_w).clone().to("mps");
+  }
+}
+
+TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_mps) (
+    const Tensor& input,
+    IntArrayRef output_size,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w,
+    const Tensor& output)
+{
+  if (check_mps_compatibility(scales_w)) {
+    mps::upsample_out_template(input, output_size, c10::nullopt, scales_h, scales_w, output, false, "nearest-exact");
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    const_cast<Tensor&>(output) = at::_upsample_nearest_exact2d(input.to("cpu"), output_size, scales_h, scales_w).clone().to("mps");
+  }
+}
+
+TORCH_IMPL_FUNC(_upsample_nearest_exact2d_backward_out_mps) (
+    const Tensor& grad_output,
+    IntArrayRef output_size,
+    IntArrayRef input_size,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w,
+    const Tensor& grad_input)
+{
+  if (check_mps_compatibility(scales_w)) {
+    mps::upsample_out_template(grad_output, output_size, input_size, scales_h, scales_w, grad_input, false, "nearest-exact");
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    const_cast<Tensor&>(grad_input) = at::_upsample_nearest_exact2d_backward(grad_output.to("cpu"), output_size, input_size, scales_h, scales_w).clone().to("mps");
+  }
+}
+
+TORCH_IMPL_FUNC(upsample_bilinear2d_out_mps) (
+    const Tensor& input,
+    IntArrayRef output_size,
+    bool align_corners,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w,
+    const Tensor& output)
+{
+  if (check_mps_compatibility(scales_w)) {
+    mps::upsample_out_template(input, output_size, c10::nullopt, scales_h, scales_w, output, align_corners, "bilinear");
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    const_cast<Tensor&>(output) = at::upsample_bilinear2d(input.to("cpu"), output_size, align_corners, scales_h, scales_w).clone().to("mps");
+  }
+}
+
+TORCH_IMPL_FUNC(upsample_bilinear2d_backward_out_mps) (
+    const Tensor& grad_output,
+    IntArrayRef output_size,
+    IntArrayRef input_size,
+    bool align_corners,
+    c10::optional<double> scales_h,
+    c10::optional<double> scales_w,
+    const Tensor& grad_input)
+{
+  if (check_mps_compatibility(scales_w)) {
+    mps::upsample_out_template(grad_output, output_size, input_size, scales_h, scales_w, grad_input, align_corners, "bilinear");
+  } else {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
+    const_cast<Tensor&>(grad_input) = at::upsample_bilinear2d_backward(grad_output.to("cpu"), output_size, input_size, align_corners, scales_h, scales_w).clone().to("mps");
+  }
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 7f5e57716242d..880808df75b5d 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -11666,6 +11666,7 @@
   dispatch:
     CPU: _upsample_nearest_exact1d_out_cpu
     CUDA: _upsample_nearest_exact1d_out_cuda
+    MPS: _upsample_nearest_exact1d_out_mps
 
 - func: upsample_nearest1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor
   python_module: nn
@@ -11681,6 +11682,7 @@
   dispatch:
     CPU: upsample_nearest1d_backward_out_cpu
     CUDA: upsample_nearest1d_backward_out_cuda
+    MPS: upsample_nearest1d_backward_out_mps
 
 - func: _upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -11688,6 +11690,7 @@
   dispatch:
     CPU: _upsample_nearest_exact1d_backward_out_cpu
     CUDA: _upsample_nearest_exact1d_backward_out_cuda
+    MPS: _upsample_nearest_exact1d_backward_out_mps
 
 - func: upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor
   python_module: nn
diff --git a/test/test_mps.py b/test/test_mps.py
index 950bd009c5ab5..d1d0b1c022c27 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4061,26 +4061,6 @@ def helper(shape):
         helper((1, 5))
         helper((5, 9, 7, 4))
 
-    def test_upsample_nearest_exact2d(self):
-        def helper(N, C, H, W):
-            inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float,
-                                    requires_grad=True).reshape(N, C, H, W)
-            inputCPU.retain_grad()
-            inputMPS = inputCPU.detach().clone().to('mps').requires_grad_()
-
-            outputCPU = torch.nn.functional.interpolate(inputCPU, size=(5, 5), mode='nearest-exact')
-            outputMPS = torch.nn.functional.interpolate(inputMPS, size=(5, 5), mode='nearest-exact')
-
-            self.assertEqual(outputCPU, outputMPS)
-
-            outputCPU.backward(gradient=torch.full_like(outputCPU, 0.3))
-            outputMPS.backward(gradient=torch.full_like(outputMPS, 0.3))
-
-            self.assertEqual(inputCPU.grad, inputMPS.grad)
-
-        helper(1, 1, 4, 4)
-        helper(7, 5, 3, 2)
-
     def test_upsample_nearest2d(self):
         def helper(N, C, H, W):
             inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float,
@@ -4146,19 +4126,49 @@ def helper(N, C, H, W):
         helper(1, 1, 4, 4)
         helper(7, 5, 3, 2)
 
-    def test_upsample_nearest1d(self):
-        def helper(N, C, H, W):
-            inputCPU = torch.arange(C * H * W, device='cpu', dtype=torch.float,
-                                    requires_grad=True).reshape(C, H, W)
-            inputMPS = inputCPU.detach().clone().to('mps')
+    def test_interpolate(self):
+        def helper(shape, output_size, scales, mode, align_corners=False):
+            inputCPU = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
+            inputCPU.retain_grad()
+            inputMPS = inputCPU.detach().clone().to('mps').requires_grad_()
 
-            outputCPU = torch.nn.functional.interpolate(inputCPU, scale_factor=2.0, mode='nearest')
-            outputMPS = torch.nn.functional.interpolate(inputMPS, scale_factor=2.0, mode='nearest')
+            # align_corners is used for 2D interpolation only
+            if (align_corners == True and len(shape) > 3 and mode == 'bilinear'):
+                if (scales != None):
+                    outputCPU = nn.functional.interpolate(inputCPU, scale_factor=scales, mode=mode, align_corners=align_corners)
+                    outputMPS = nn.functional.interpolate(inputMPS, scale_factor=scales, mode=mode, align_corners=align_corners)
+                else:
+                    outputCPU = nn.functional.interpolate(inputCPU, size=output_size, mode=mode, align_corners=align_corners)
+                    outputMPS = nn.functional.interpolate(inputMPS, size=output_size, mode=mode, align_corners=align_corners)
+            elif (scales != None):
+                outputCPU = nn.functional.interpolate(inputCPU, scale_factor=scales, mode=mode)
+                outputMPS = nn.functional.interpolate(inputMPS, scale_factor=scales, mode=mode)
+            else:
+                outputCPU = nn.functional.interpolate(inputCPU, size=output_size, mode=mode)
+                outputMPS = nn.functional.interpolate(inputMPS, size=output_size, mode=mode)
 
             self.assertEqual(outputCPU, outputMPS)
 
-        helper(1, 1, 4, 4)
-        helper(7, 5, 3, 2)
+            # backward pass (chose 0.6 just to have the grad_output != 1)
+            outputCPU.backward(gradient=torch.full_like(outputCPU, 0.6))
+            outputMPS.backward(gradient=torch.full_like(outputMPS, 0.6))
+            self.assertEqual(inputCPU.grad, inputMPS.grad)
+
+        # 1D interpolation
+        for mode in ['nearest', 'nearest-exact']:
+            helper([2, 3, 4], [3], None, mode) # downsample with size
+            helper([2, 3, 4], [6], None, mode) # upsample with size
+            helper([2, 3, 4], None, [0.6], mode) # downsample with scale factor
+            helper([2, 3, 4], None, [1.7], mode) # upsample with scale factor
+        # 2D interpolation
+        for mode in ['nearest', 'nearest-exact', 'bilinear']:
+            helper([2, 3, 4, 5], [3, 4], None, mode) # downsample_nearest with size
+            helper([2, 3, 4, 5], [6, 7], None, mode) # upsample_nearest with size
+            helper([2, 3, 4, 5], None, [0.6, 0.7], mode) # downsample_nearest with scale factor
+            helper([2, 3, 4, 5], None, [1.4, 1.7], mode) # upsample_nearest with scale factor
+        # align_corners=True
+        helper([2, 3, 4, 5], [3, 4], None, 'bilinear', True)
+        helper([2, 3, 4, 5], None, [1.4, 1.7], 'bilinear', True)
 
     # Test concat forward
     def test_cat1(self):
@@ -8032,6 +8042,7 @@ def test_serialization_map_location(self):
 
 
 class TestConsistency(TestCase):
+
     # TODO: This is only used while some ops are being added.
     # This list should contain all ops and dtypes eventually
     # This can be generated automatically in the `new_mps_allowlist.txt` file
@@ -8237,6 +8248,7 @@ class TestConsistency(TestCase):
         'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.triplet_margin_with_distance_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.upsample_bilinear': ['f32'],
+        'nn.functional.upsample_nearest': ['f32'],
         'norm': ['f32', 'f16'],
         'positive': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'pow': ['f16'],
@@ -8568,7 +8580,6 @@ class TestConsistency(TestCase):
         'nn.functional.interpolate': ['torch.float32'],
         'nn.functional.max_pool1d': ['torch.float32'],
         'nn.functional.max_pool2d': ['torch.float32'],
-        'nn.functional.upsample_nearest': ['torch.float32'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 
         # failures due to lack of op implementation on MPS backend

From 36658d20b951690c5b8b3d2e3d76035b0c9bd181 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 7 Dec 2022 15:51:20 -0800
Subject: [PATCH 1833/1922] Dev/denis/reduce ops multi axes support (#188)

* Add multi axes support for reduce ops

* Add back bessel_corrected variable

* Remove input flattening from reduce ops; enable more tests in TestConsistency

* Refactor Repeat.mm

* Fix remaining reduce ops issues

* Remove debug code

* Fix missing colon

* Always wrap input dimensions

* Remove dimension wrapping (already wrapped)

* Address remaining PR comments
---
 aten/src/ATen/native/mps/OperationUtils.h     |   2 +
 aten/src/ATen/native/mps/OperationUtils.mm    |  25 +
 .../ATen/native/mps/operations/ReduceOps.mm   | 672 +++++++++---------
 aten/src/ATen/native/mps/operations/Repeat.mm | 113 +--
 test/test_mps.py                              |  36 +-
 5 files changed, 392 insertions(+), 456 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index ab446bb856b04..09f42a46bfaa5 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -44,6 +44,8 @@ MPSDataType getMPSScalarType(ScalarType scalar_type);
 MPSScalar   getMPSScalar(const Scalar& scalar, ScalarType type);
 std::string getMPSTypeString(ScalarType scalar_type);
 std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type);
+NSArray<NSNumber*>* getTensorAxes(const Tensor& t);
+NSArray<NSNumber*>* getTensorAxes(const Tensor& t, at::OptionalIntArrayRef dim);
 std::string getMPSShapeString(MPSShape* shape);
 std::string getTensorsStringKey(const TensorList& tensors, bool use_scalar_value = false);
 std::string getArrayRefString(const IntArrayRef s);
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 02ce0982513d8..a0ef49fbaf408 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -111,6 +111,31 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) {
   }
 }
 
+
+NSArray<NSNumber*>* getTensorAxes(const Tensor& t) {
+  int64_t ndim = t.dim();
+  auto axes = [NSMutableArray<NSNumber*> arrayWithCapacity:ndim];
+  for (const auto i: c10::irange(ndim)) {
+    axes[i] = [NSNumber numberWithInteger:i];
+  }
+  return axes;
+}
+
+NSArray<NSNumber*>* getTensorAxes(const Tensor& t, at::OptionalIntArrayRef dim) {
+  if (dim.has_value() && dim.value().size() != 0) {
+    IntArrayRef dimValues = dim.value();
+    int ndim = dimValues.size();
+    auto axes = [NSMutableArray<NSNumber*> arrayWithCapacity:ndim];
+    for (const auto i: c10::irange(ndim)) {
+      axes[i] = [NSNumber numberWithInteger:dimValues[i]];
+    }
+
+    return axes;
+  }
+
+  return getTensorAxes(t);
+}
+
 std::string getMPSShapeString(MPSShape* shape) {
     std::string str;
     for(NSNumber *elem in shape) {
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 5cf152fefa684..a1f4449d18da5 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -34,16 +34,6 @@
   TRACE
 };
 
-
-NSArray<NSNumber*>* getTensorAxes(const Tensor& t) {
-  int64_t ndim = t.dim();
-  auto axes = [NSMutableArray<NSNumber*> arrayWithCapacity:ndim];
-  for (const auto i: c10::irange(ndim)) {
-    axes[i] = [NSNumber numberWithInteger:i];
-  }
-  return axes;
-}
-
 void set_apparent_shapes(NSMutableArray<NSNumber*> * &apparent_out_shape,
                          NSMutableArray<NSNumber*> * &apparent_in_shape,
                          int64_t num_reduce_dims,
@@ -92,7 +82,6 @@ void set_apparent_shapes(NSMutableArray<NSNumber*> * &apparent_out_shape,
       }
     }
   }
-
 }
 
 // Helper function to set the axes of reduction
@@ -151,16 +140,14 @@ void set_axes_and_shapes(const Tensor& input_t,
   }
 }
 
-void reduction_out_mps
-   (const Tensor& input_tensor,
-    OptionalIntArrayRef opt_dim,
-    bool keepdim,
-    c10::optional<ScalarType> dtype,
-    const Tensor& output_t,
-    MPSReductionType reduction_type,
-    const std::string& func_name) {
-
-  auto input_t = (input_tensor.sizes().size() == 0) ? input_tensor.view({1}) : input_tensor;
+void reduction_out_mps(
+  const Tensor& input_t,
+  OptionalIntArrayRef opt_dim,
+  bool keepdim,
+  c10::optional<ScalarType> dtype,
+  const Tensor& output_t,
+  MPSReductionType reduction_type,
+  const std::string& func_name) {
 
   IntArrayRef input_shape = input_t.sizes();
 
@@ -168,7 +155,7 @@ void set_axes_and_shapes(const Tensor& input_t,
     IntArrayRef dim = opt_dim.value();
     for(int i = 0; i < dim.size(); i++) {
       auto wrap_dim = maybe_wrap_dim(dim[i], input_shape.size());
-      TORCH_CHECK(wrap_dim < input_shape.size(),
+      TORCH_CHECK(wrap_dim < (input_shape.size() == 0 ? input_t.numel() : input_shape.size()),
       func_name+": reduction dim must be in the range of input shape")
     }
   }
@@ -181,20 +168,27 @@ void set_axes_and_shapes(const Tensor& input_t,
   NSMutableArray<NSNumber*> *output_shape = nil;
 
   set_axes_and_shapes(input_t, opt_dim, axes, apparent_input_shape, apparent_output_shape, output_shape);
-
-   auto cache_ = native_mps::MPSGraphCache::getInstance();
+  NSArray<NSNumber*>* wrappedAxes = mps::getTensorAxes(input_t, opt_dim);
+  auto cache_ = native_mps::MPSGraphCache::getInstance();
 
   if (output_t.numel() == 0 || input_t.numel() == 0) {
+    if (reduction_type == MPSReductionType::PROD) {
+      output_t.fill_(1);
+    }
     return;
   }
 
   auto stream = at::mps::getCurrentMPSStream();
-
   @autoreleasepool {
-
-    // TODO: Make this key proper
-    NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","];
-    string key =  func_name+":" + string([ns_key UTF8String]) + ":" + native_mps::getMPSTypeString(input_t.scalar_type()) + ":" + native_mps::getMPSTypeString(output_t.scalar_type());
+    std::string dtype_str = dtype.has_value() ? mps::getMPSTypeString(dtype.value()) : "";
+    NSString* ns_key = [[wrappedAxes valueForKey:@"description"] componentsJoinedByString:@","];
+    string key = func_name                                 + ":" +
+                 string([ns_key UTF8String])               + ":" +
+                 native_mps::getTensorsStringKey(input_t)  + ":" +
+                 std::to_string(keepdim)                   + ":" +
+                 std::to_string(reduction_type)            + ":" +
+                 native_mps::getTensorsStringKey(output_t) + ":" +
+                 dtype_str;
     using CachedGraph = native_mps::MPSUnaryCachedGraph;
     auto cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
@@ -206,31 +200,39 @@ void set_axes_and_shapes(const Tensor& input_t,
         @autoreleasepool {
           MPSGraph* mpsGraph = native_mps::make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
+          MPSDataType input_type = native_mps::getMPSDataType(input_t.scalar_type());
 
-          MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
-
-          MPSGraphTensor* castInputTensor = nil;
+          MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
+          MPSGraphTensor* castInputTensor = inputTensor;
+          MPSDataType inputCastDtype = MPSDataTypeInvalid;
+          if (dtype.has_value() &&
+             (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt)) {
+            inputCastDtype = native_mps::getMPSDataType(dtype.value());
+          } else if (input_type != MPSDataTypeInt32   &&
+                     input_type != MPSDataTypeFloat32 &&
+                     input_type != MPSDataTypeFloat16) {
+            inputCastDtype = MPSDataTypeFloat32;
+          }
 
-          if(input_t.scalar_type() != ScalarType::Float && input_t.scalar_type() != ScalarType::Int)
-            castInputTensor =  [mpsGraph castTensor:inputTensor
-                                             toType:MPSDataTypeFloat32
-                                               name:@"castInputTensor"];
-          else
-            castInputTensor = inputTensor;
+          if (inputCastDtype != MPSDataTypeInvalid) {
+            castInputTensor = [mpsGraph castTensor:inputTensor
+                                            toType:inputCastDtype
+                                              name:@"castInputTensor"];
+          }
 
           MPSGraphTensor* castOutputTensor = nil;
 
           if(reduction_type == MPSReductionType::SUM) {
             castOutputTensor = [mpsGraph reductionSumWithTensor:castInputTensor
-                                                           axes:axes
+                                                           axes:wrappedAxes
                                                            name:nil];
           } else if(reduction_type == MPSReductionType::PROD) {
             castOutputTensor = [mpsGraph reductionProductWithTensor:castInputTensor
-                                                               axes:axes
+                                                               axes:wrappedAxes
                                                                name:nil];
           } else if(reduction_type == MPSReductionType::MEAN) {
-            castOutputTensor = [mpsGraph meanOfTensor:inputTensor
-                                                 axes:axes
+            castOutputTensor = [mpsGraph meanOfTensor:castInputTensor
+                                                 axes:wrappedAxes
                                                  name:nil];
           } else if(reduction_type == MPSReductionType::COUNT_NONZERO) {
             MPSGraphTensor* zeros = [mpsGraph constantWithScalar:0
@@ -241,16 +243,16 @@ void set_axes_and_shapes(const Tensor& input_t,
                                                                       name:nil];
 
             castOutputTensor = [mpsGraph reductionSumWithTensor:nonZeros
-                                                           axes:axes
+                                                           axes:wrappedAxes
                                                            name:nil];
           }
           else if(reduction_type == MPSReductionType::AMAX) {
-            castOutputTensor = [mpsGraph reductionMaximumWithTensor:inputTensor
-                                                               axes:axes
+            castOutputTensor = [mpsGraph reductionMaximumWithTensor:castInputTensor
+                                                               axes:wrappedAxes
                                                                name:nil];
           } else if(reduction_type == MPSReductionType::AMIN) {
-            castOutputTensor = [mpsGraph reductionMinimumWithTensor:inputTensor
-                                                               axes:axes
+            castOutputTensor = [mpsGraph reductionMinimumWithTensor:castInputTensor
+                                                               axes:wrappedAxes
                                                                name:nil];
           } else if(reduction_type == MPSReductionType::TRACE) {
             MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:inputTensor
@@ -278,13 +280,7 @@ void set_axes_and_shapes(const Tensor& input_t,
       });
     }
 
-    auto inputPlaceholder = native_mps::Placeholder();
-
-    if (apparent_input_shape) {
-      inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape);
-    } else {
-      inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
-    }
+    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
     auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape);
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
       inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
@@ -295,17 +291,16 @@ void set_axes_and_shapes(const Tensor& input_t,
     };
     native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
   }
-
 }
 
-TORCH_IMPL_FUNC(sum_out_mps)
-   (const Tensor& input_t,
-    OptionalIntArrayRef opt_dim,
-    bool keepdim,
-    c10::optional<ScalarType> dtype,
-    const Tensor& output_t) {
+TORCH_IMPL_FUNC(sum_out_mps)(
+  const Tensor& input_t,
+  OptionalIntArrayRef opt_dim,
+  bool keepdim,
+  c10::optional<ScalarType> dtype,
+  const Tensor& output_t) {
 
-    reduction_out_mps(input_t, opt_dim, keepdim, dtype, output_t, MPSReductionType::SUM, "sum_out_mps");
+  reduction_out_mps(input_t, opt_dim, keepdim, dtype, output_t, MPSReductionType::SUM, "sum_out_mps");
 }
 
 Tensor trace_mps_out(const Tensor& self) {
@@ -337,7 +332,8 @@ Tensor trace_mps_out(const Tensor& self) {
 
     int64_t dims[1] = {dim};
 
-    reduction_out_mps(input_t, IntArrayRef(dims, 1), keepdim, dtype, output_t, MPSReductionType::PROD, "prod_out_mps");
+  int64_t dims[1] = {dim};
+  reduction_out_mps(input_t, IntArrayRef(dims, 1), keepdim, dtype, output_t, MPSReductionType::PROD, "prod_out_mps");
 }
 
 // Taken from ReduceOps.cpp
@@ -348,6 +344,7 @@ inline ScalarType get_dtype_from_self(
   if (dtype.has_value()) {
     return dtype.value();
   }
+
   ScalarType src_type = self.scalar_type();
   if (promote_integers && at::isIntegralType(src_type, /*includeBool=*/true)) {
     return kLong;
@@ -355,26 +352,25 @@ inline ScalarType get_dtype_from_self(
   return src_type;
 }
 
-TORCH_IMPL_FUNC(amax_out_mps)
-   (const Tensor& input_t,
-    IntArrayRef dim,
-    bool keepdim,
-    const Tensor& output_t) {
+TORCH_IMPL_FUNC(amax_out_mps)(
+  const Tensor& input_t,
+  IntArrayRef dim,
+  bool keepdim,
+  const Tensor& output_t) {
 
-    reduction_out_mps(input_t, dim, keepdim, c10::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
+  reduction_out_mps(input_t, dim, keepdim, c10::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
 }
 
-TORCH_IMPL_FUNC(amin_out_mps)
-   (const Tensor& input_t,
-    IntArrayRef dim,
-    bool keepdim,
-    const Tensor& output_t) {
+TORCH_IMPL_FUNC(amin_out_mps)(
+  const Tensor& input_t,
+  IntArrayRef dim,
+  bool keepdim,
+  const Tensor& output_t) {
 
-    reduction_out_mps(input_t, dim, keepdim, c10::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
+  reduction_out_mps(input_t, dim, keepdim, c10::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
 }
 
 Tensor prod_mps(const Tensor &self, c10::optional<ScalarType> opt_dtype) {
-
   std::vector<int64_t> dims(self.dim());
   std::iota(dims.begin(), dims.end(), 0);
 
@@ -393,57 +389,60 @@ Tensor prod_mps(const Tensor &self, c10::optional<ScalarType> opt_dtype) {
 
 
 Tensor count_nonzero_mps(const Tensor& self, IntArrayRef dims){
-  NSMutableArray<NSNumber*> *axes = nil;
-  NSMutableArray<NSNumber*> *apparent_input_shape = nil;
-  NSMutableArray<NSNumber*> *apparent_output_shape = nil;
-  NSMutableArray<NSNumber*> *output_shape = nil;
-
-  set_axes_and_shapes(self, dims, axes, apparent_input_shape, apparent_output_shape, output_shape);
-
-  std::vector<int64_t> raw_output_shape([output_shape count]);
-  for(auto i: c10::irange(raw_output_shape.size())) {
-    raw_output_shape[i] = [output_shape[i] longValue];
+  int64_t shape_size = dims.size() == 0 ? 0 : self.sizes().size() - dims.size();
+  int64_t out_shape = std::max(shape_size, 0LL);
+  std::vector<int64_t> output_shape(out_shape);
+  std::vector<int64_t> dims_vec = dims.vec();
+  std::for_each(dims_vec.begin(), dims_vec.end(), [&](int64_t &n){ n = maybe_wrap_dim(n, self); });
+
+  if (out_shape != 0) {
+    int out_dim = 0;
+    for (const auto self_dim: c10::irange((self.sizes().size()))) {
+      if (std::find(dims_vec.begin(), dims_vec.end(), self_dim) == dims_vec.end()) {
+        output_shape[out_dim++] = (self.sizes()[self_dim]);
+      }
+    }
   }
 
   Tensor output_t = at::native::empty_mps(
-                      IntArrayRef(raw_output_shape),
+                      IntArrayRef(output_shape),
                       ScalarType::Long,
                       c10::nullopt,
                       kMPS,
                       c10::nullopt,
                       c10::nullopt);
-
   reduction_out_mps(self, dims, false, self.scalar_type(), const_cast<Tensor&>(output_t), MPSReductionType::COUNT_NONZERO, "count_nonzero_mps");
 
   return output_t;
 }
 
-TORCH_IMPL_FUNC(mean_out_mps)
-   (const Tensor& input_t,
-    OptionalIntArrayRef opt_dim,
-    bool keepdim,
-    c10::optional<ScalarType> dtype,
-    const Tensor& output_t) {
+TORCH_IMPL_FUNC(mean_out_mps)(
+  const Tensor& input_t,
+  OptionalIntArrayRef opt_dim,
+  bool keepdim,
+  c10::optional<ScalarType> dtype,
+  const Tensor& output_t) {
 
-    reduction_out_mps(input_t, opt_dim, keepdim, dtype, output_t, MPSReductionType::MEAN, "mean_out_mps");
+  reduction_out_mps(input_t, opt_dim, keepdim, dtype, output_t, MPSReductionType::MEAN, "mean_out_mps");
 }
 
 void impl_func_norm_mps(
-    const Tensor& input_tensor,
-    const Tensor& other_tensor,
-    const OptionalScalarRef& opt_p,
-    IntArrayRef dim,
-    bool keepdim,
-    optional<ScalarType> opt_dtype,
-    const Tensor& output_t,
-    bool cdist = false,
-    c10::optional<IntArrayRef> input_broadcasted_shape = c10::nullopt,
-    NormOpBlock normOpBlock = nullptr
-    ) {
+  const Tensor& input_tensor,
+  const Tensor& other_tensor,
+  const OptionalScalarRef& opt_p,
+  IntArrayRef dim,
+  bool keepdim,
+  optional<ScalarType> opt_dtype,
+  const Tensor& output_t,
+  bool cdist = false,
+  c10::optional<IntArrayRef> input_broadcasted_shape = c10::nullopt,
+  NormOpBlock normOpBlock = nullptr
+  ) {
 
   namespace native_mps = at::native::mps;
-  if (input_tensor.numel() == 0)
+  if (input_tensor.numel() == 0) {
     return;
+  }
 
   auto input_t = (input_tensor.sizes().size() == 0) ? input_tensor.view({1}) : input_tensor;
   auto in_dtype = opt_dtype.value_or(input_tensor.scalar_type());
@@ -451,7 +450,7 @@ void impl_func_norm_mps(
 
   IntArrayRef input_shape = cdist ? input_broadcasted_shape.value() : input_t.sizes();
 
-  for(int i = 0; i < dim.size(); i++) {
+  for (const auto i : c10::irange(dim.size())) {
     auto wrap_dim = maybe_wrap_dim(dim[i], input_shape.size());
     TORCH_CHECK(wrap_dim < input_shape.size(),
     "norm_out_mps: reduction dim must be in the range of input shape")
@@ -488,6 +487,7 @@ void impl_func_norm_mps(
                       input_shape,
                       axes);
 
+  NSArray<NSNumber*>* wrappedAxes = mps::getTensorAxes(input_t, dim);
   if (cdist) {
     apparent_input_shape  = [mps::getMPSShape(input_tensor.sizes()) mutableCopy];
     apparent_output_shape = [mps::getMPSShape(output_t.sizes()) mutableCopy];
@@ -498,7 +498,6 @@ void impl_func_norm_mps(
   }
 
   auto stream = at::mps::getCurrentMPSStream();
-
   @autoreleasepool {
     NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","];
       string keepdim_info = (keepdim) ? "keepdim=1" : "keepdim=0";
@@ -515,26 +514,23 @@ void impl_func_norm_mps(
         @autoreleasepool {
           MPSGraph* mpsGraph = native_mps::make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
+          newCachedGraph->inputTensor_ = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_tensor);
 
           if (cdist) {
-            newCachedGraph->inputTensor_ = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_tensor);
             newCachedGraph->otherTensor_ = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, other_tensor);
-          } else {
-            newCachedGraph->inputTensor_ = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
           }
 
           MPSGraphTensor* inputTensor = cdist ? normOpBlock(newCachedGraph, newCachedGraph->inputTensor_, newCachedGraph->otherTensor_) :
                                                 newCachedGraph->inputTensor_;
           if (opt_dtype.has_value()) {
             inputTensor = [mpsGraph castTensor:inputTensor
-                                         toType:mps_input_dtype
-                                           name:@"any_all"];
+                                        toType:mps_input_dtype
+                                          name:@"any_all"];
           }
 
           MPSGraphTensor *outputTensor;
 
-          if (pIsZero)
-          {
+          if (pIsZero) {
               MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor
                                                                        name:nil];
               MPSGraphTensor *powerValTensor = [mpsGraph constantWithScalar:p
@@ -543,27 +539,23 @@ void impl_func_norm_mps(
                                                              secondaryTensor:powerValTensor
                                                                         name:nil];
               outputTensor = [mpsGraph reductionSumWithTensor:powerTensor
-                                                         axes:axes
+                                                         axes:wrappedAxes
                                                          name:nil];
           }
-          else if (pIsPosInf)
-          {
+          else if (pIsPosInf) {
               MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor
                                                                        name:nil];
               outputTensor = [mpsGraph reductionMaximumWithTensor:absoluteTensor
-                                                             axes:axes
+                                                             axes:wrappedAxes
                                                              name:nil];
           }
-          else if (pIsNegInf)
-          {
+          else if (pIsNegInf) {
               MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor
                                                                        name:nil];
               outputTensor = [mpsGraph reductionMinimumWithTensor:absoluteTensor
-                                                             axes:axes
+                                                             axes:wrappedAxes
                                                              name:nil];
-          }
-          else
-          {
+          } else {
               MPSGraphTensor *absoluteTensor = [mpsGraph absoluteWithTensor:inputTensor
                                                                        name:nil];
 
@@ -578,7 +570,7 @@ void impl_func_norm_mps(
                                                                         name:nil];
 
               MPSGraphTensor *reductionSumTensor = [mpsGraph reductionSumWithTensor:powerTensor
-                                                                         axes:axes
+                                                                         axes:wrappedAxes
                                                                          name:nil];
 
               outputTensor = [mpsGraph powerWithPrimaryTensor:reductionSumTensor
@@ -596,14 +588,8 @@ void impl_func_norm_mps(
       });
     }
 
-    auto inputPlaceholder = native_mps::Placeholder();
     auto otherPlaceholder = native_mps::Placeholder();
-
-    if(apparent_input_shape)
-      inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape);
-    else
-      inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
-
+    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
     auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape);
 
     NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =[NSMutableDictionary dictionary];
@@ -734,13 +720,12 @@ Tensor std_var_common_impl_mps(
   bool use_dim = dim.has_value();
   IntArrayRef dim_value = use_dim ? dim.value() : NULL;
 
-  if (use_dim)
-  {
-      string errMessage = (stdVarType == STANDARD_DEVIATION) ? "std_mps" : "var_mps";
-      errMessage += ": reduction dim must be in the range of input shape";
-      for(int i = 0; i < dim_value.size(); i++) {
-        auto wrap_dim = maybe_wrap_dim(dim_value[i], input_shape.size());
-        TORCH_CHECK(wrap_dim < input_shape.size(), errMessage.c_str())
+  if (use_dim){
+    string errMessage = (stdVarType == STANDARD_DEVIATION) ? "std_mps" : "var_mps";
+    errMessage += ": reduction dim must be in the range of input shape";
+    for (const int i : c10::irange(dim_value.size())) {
+      auto wrap_dim = maybe_wrap_dim(dim_value[i], input_shape.size());
+      TORCH_CHECK(wrap_dim < input_shape.size(), errMessage.c_str())
     }
   }
 
@@ -750,6 +735,8 @@ Tensor std_var_common_impl_mps(
 
   native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
 
+  NSArray<NSNumber*>* wrappedAxes = mps::getTensorAxes(input_t, dim);
+
   int64_t num_output_dims = 0;
   NSMutableArray<NSNumber *> *axes = nil;
   NSMutableArray<NSNumber*> *apparent_output_shape = nil;
@@ -779,97 +766,90 @@ Tensor std_var_common_impl_mps(
       axes[0] = @0;
 
   }
-  else if (!keepdim && use_dim && dim_value.size() > 0)
-  {
-      int64_t num_reduce_dims = dim_value.size();
-      num_output_dims = num_input_dims;
-
-      set_axes(axes, num_reduce_dims, dim_value, num_input_dims);
-      set_apparent_shapes(apparent_output_shape,
-                           apparent_input_shape,
-                           num_reduce_dims,
-                           num_input_dims,
-                           num_output_dims,
-                           input_shape,
-                           axes);
-
-      num_output_dims = (num_input_dims >= num_reduce_dims) ? (num_input_dims - num_reduce_dims) : 0; //num_input_dims;
-
-      unsigned int curr_i = 0;
-      for (int i = 0; i < num_input_dims; i++)
-      {
-          bool found = false;
-          for (int j = 0; j < num_reduce_dims; j++)
-          {
-              if (i == dim_value[j])
-              {
-                  found = true;
-                  break;
-              }
-          }
-          if (found) continue;
-          output_shape.push_back(input_shape[i]);
-          curr_i += 1;
-          // End loop when output shape is filled
-          if (curr_i == num_output_dims)
-            break;
-      }
+  else if (!keepdim && use_dim && dim_value.size() > 0) {
+    int64_t num_reduce_dims = dim_value.size();
+    num_output_dims = num_input_dims;
 
-      for(int i = 0; i < num_reduce_dims; i++)
-      {
-          auto wrap_dim = maybe_wrap_dim(dim_value[i], input_shape.size());
-          correction_n *= input_shape[wrap_dim];
-      }
-      // (3, 4, 5) --> (3, 5)
-  }
-  else if ((keepdim && !use_dim) || (keepdim && use_dim && dim_value.size() <= 0))
-  {
-      num_output_dims = 0;
-      int64_t num_reduce_dims = 0;
-      set_axes(axes, num_reduce_dims, dim_value, input_shape.size());
-      set_apparent_shapes(apparent_output_shape,
+    set_axes(axes, num_reduce_dims, dim_value, num_input_dims);
+    set_apparent_shapes(apparent_output_shape,
                           apparent_input_shape,
-                           num_reduce_dims,
-                           num_input_dims,
-                           num_output_dims,
-                           input_shape,
-                           axes);
-      num_output_dims = num_input_dims;
-      for (int i = 0; i < num_input_dims; i++)
-      {
-          output_shape.push_back((int64_t) 1);
-          correction_n *= input_shape[i];
+                          num_reduce_dims,
+                          num_input_dims,
+                          num_output_dims,
+                          input_shape,
+                          axes);
+
+    num_output_dims = (num_input_dims >= num_reduce_dims) ? (num_input_dims - num_reduce_dims) : 0; //num_input_dims;
+
+    unsigned int curr_i = 0;
+    for (const int i : c10::irange(num_input_dims)) {
+      bool found = false;
+      for (const int j : c10::irange(num_reduce_dims)) {
+        if (i == dim_value[j]) {
+            found = true;
+            break;
+        }
       }
-      // scalar --> vector case [[1.0034567]]
-  }
-  else if (keepdim && use_dim && dim_value.size() > 0)
-  {
-      int64_t num_reduce_dims = dim_value.size();
-      num_output_dims = num_input_dims;
-
-      set_axes(axes, num_reduce_dims, dim_value, num_input_dims);
-      set_apparent_shapes(apparent_output_shape,
-                           apparent_input_shape,
-                           num_reduce_dims,
-                           num_input_dims,
-                           num_output_dims,
-                           input_shape,
-                           axes);
-
-      num_output_dims = num_input_dims;//(num_input_dims >= num_reduce_dims) ? (num_input_dims - num_reduce_dims) : 0;
-
-      for(int i = 0; i < num_reduce_dims; i++)
-      {
-          auto wrap_dim = maybe_wrap_dim(dim_value[i], input_shape.size());
-          correction_n *= input_shape[wrap_dim];
+      if (found) {
+        continue;
       }
 
-      for (int i = 0; i < num_input_dims; i++)
-      {
-          output_shape.push_back([apparent_output_shape[i] longValue]);
-      }
+      output_shape.push_back(input_shape[i]);
+      curr_i += 1;
+      // End loop when output shape is filled
+      if (curr_i == num_output_dims)
+        break;
+    }
+
+    for(int i = 0; i < num_reduce_dims; i++) {
+        auto wrap_dim = maybe_wrap_dim(dim_value[i], input_shape.size());
+        correction_n *= input_shape[wrap_dim];
+    }
+    // (3, 4, 5) --> (3, 5)
+  }
+  else if ((keepdim && !use_dim) || (keepdim && use_dim && dim_value.size() <= 0)) {
+    num_output_dims = 0;
+    int64_t num_reduce_dims = 0;
+    set_axes(axes, num_reduce_dims, dim_value, input_shape.size());
+    set_apparent_shapes(apparent_output_shape,
+                        apparent_input_shape,
+                          num_reduce_dims,
+                          num_input_dims,
+                          num_output_dims,
+                          input_shape,
+                          axes);
+    num_output_dims = num_input_dims;
+    for (const int i : c10::irange(num_input_dims))
+    {
+        output_shape.push_back((int64_t) 1);
+        correction_n *= input_shape[i];
+    }
+    // scalar --> vector case [[1.0034567]]
   }
+  else if (keepdim && use_dim && dim_value.size() > 0) {
+    int64_t num_reduce_dims = dim_value.size();
+    num_output_dims = num_input_dims;
+
+    set_axes(axes, num_reduce_dims, dim_value, num_input_dims);
+    set_apparent_shapes(apparent_output_shape,
+                          apparent_input_shape,
+                          num_reduce_dims,
+                          num_input_dims,
+                          num_output_dims,
+                          input_shape,
+                          axes);
+
+    num_output_dims = num_input_dims;//(num_input_dims >= num_reduce_dims) ? (num_input_dims - num_reduce_dims) : 0;
+
+    for(const int i : c10::irange(num_reduce_dims)) {
+      auto wrap_dim = maybe_wrap_dim(dim_value[i], input_shape.size());
+      correction_n *= input_shape[wrap_dim];
+    }
 
+    for (const int i : c10::irange(num_input_dims)) {
+      output_shape.push_back([apparent_output_shape[i] longValue]);
+    }
+  }
 
   Tensor output_t = at::native::empty_mps(
                       IntArrayRef(output_shape.data(), num_output_dims),
@@ -879,27 +859,31 @@ Tensor std_var_common_impl_mps(
                       c10::nullopt,
                       c10::nullopt);
 
-  if (output_t.numel() == 0 || input_t.numel() == 0)
-  {
-     return output_t;
+  if (output_t.numel() == 0 || input_t.numel() == 0) {
+    return output_t;
   }
 
   double bessel_correction = ((double) correction_n) / ((double) (correction_n-correction_value));
-
   auto stream = at::mps::getCurrentMPSStream();
 
   @autoreleasepool {
     string op_key = (stdVarType == STANDARD_DEVIATION) ? "std_mps" : "var_mps";
-    NSString* ns_key = [[axes valueForKey:@"description"] componentsJoinedByString:@","];
+    NSString* ns_key = [[wrappedAxes valueForKey:@"description"] componentsJoinedByString:@","];
     string bessel_corrected = (use_correction && correction_value) ? "unbiased " : "biased ";
     string use_dim_info = (use_dim) ? "use_dim=1:" + to_string(dim_value.size()) : "use_dim=0";
     string keepdim_info = (keepdim) ? "keepdim=1" : "keepdim=0";
-    string key = op_key + use_dim_info + ":" + keepdim_info + ":" + string([ns_key UTF8String]) + ":" + native_mps::getTensorsStringKey(input_t) + ":" + bessel_corrected + ":" + std::to_string(correction_value);
+    string key = op_key                                   + ":" +
+                 native_mps::getTensorsStringKey(input_t) + ":" +
+                 use_dim_info                             + ":" +
+                 keepdim_info                             + ":" +
+                 string([ns_key UTF8String])              + ":" +
+                 bessel_corrected                         + ":" +
+                std::to_string(correction_value);
 
     auto cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     // Initialize once if configuration not found in cache
-  if(!cachedGraph) {
-      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
+    if(!cachedGraph) {
+      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
 
       CachedGraph *newCachedGraph = nil;
 
@@ -907,24 +891,22 @@ Tensor std_var_common_impl_mps(
           MPSGraph* mpsGraph = native_mps::make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor *inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
+          MPSGraphTensor *inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
           MPSGraphTensor *outputVarTensor = [mpsGraph varianceOfTensor:inputTensor
-                                                                     axes:axes
-                                                                     name:nil];
-          MPSGraphTensor *outputTensor;
+                                                                  axes:wrappedAxes
+                                                                  name:nil];
+          MPSGraphTensor *outputTensor = nil;
 
           if (use_correction && correction_value)
           {
               MPSGraphTensor *besselTensor= [mpsGraph constantWithScalar:bessel_correction
-                                                    dataType: native_mps::getMPSDataType(input_t.scalar_type())];
-              MPSGraphTensor *correctedTensor = [mpsGraph multiplicationWithPrimaryTensor: outputVarTensor
-                                                                          secondaryTensor: besselTensor
-                                                                                     name: nil];
+                                                                dataType:native_mps::getMPSDataType(input_t.scalar_type())];
+              MPSGraphTensor *correctedTensor = [mpsGraph multiplicationWithPrimaryTensor:outputVarTensor
+                                                                          secondaryTensor:besselTensor
+                                                                                     name:nil];
               outputTensor = (stdVarType == STANDARD_DEVIATION) ?
                     [mpsGraph squareRootWithTensor:correctedTensor name:nil] : correctedTensor;
-          }
-          else
-          {
+          } else {
               outputTensor = (stdVarType == STANDARD_DEVIATION) ?
                     [mpsGraph squareRootWithTensor:outputVarTensor name:nil] : outputVarTensor;
           }
@@ -935,16 +917,8 @@ Tensor std_var_common_impl_mps(
       return newCachedGraph;
       });
   }
-  auto inputPlaceholder = native_mps::Placeholder();
 
-  if(apparent_input_shape)
-  {
-    inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_input_shape);
-  }
-  else
-  {
-    inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
-  }
+  auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
   auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape);
 
   NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
@@ -1029,8 +1003,7 @@ Tensor std_mps(
 
                 if (input_type != MPSDataTypeInt32 &&
                     input_type != MPSDataTypeFloat32 &&
-                    input_type != MPSDataTypeFloat16 )
-                {
+                    input_type != MPSDataTypeFloat16) {
                     MPSGraphTensor* inputCastedTensor = [mpsGraph castTensor:inputTensor
                                                                       toType:MPSDataTypeInt32
                                                                         name:@"any_all"];
@@ -1363,7 +1336,7 @@ Tensor std_mps(
             castInputTensor = inputTensor;
           }
 
-          NSArray<NSNumber*>* axes = getTensorAxes(input_t);
+          NSArray<NSNumber*>* axes = mps::getTensorAxes(input_t);
           if(reduction_type == MPSReductionType::MAX)
             outputTensor = [mpsGraph reductionMaximumWithTensor:castInputTensor
                                                            axes:axes
@@ -1575,121 +1548,121 @@ Tensor min_mps(const Tensor& input_t) {
     namespace native_mps = at::native::mps;
     using CachedGraph = native_mps::MPSUnaryCachedGraph;
 
-    native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
-
-    int64_t dim_;
+  native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
 
-    if (dim.has_value()) {
-        dim_ = maybe_wrap_dim(dim.value(), input_t.dim());
-        zero_numel_check_dims(input_t, dim_, reduction_type == MPSReductionType::MAX ? "argmax()" : "argmin()");
-    } else {
-        TORCH_CHECK_INDEX(
-        input_t.numel() != 0,
-        reduction_type == MPSReductionType::MAX ? "argmax()" : "argmin()" , ": Expected reduction dim to be specified for input.numel() == 0.");
-        // Since input will be flattened, take argmax or argmin along 0'th dimension
-        dim_ = 0;
-    }
+  int64_t dim_;
+
+  if (dim.has_value()) {
+      dim_ = maybe_wrap_dim(dim.value(), input_t.dim());
+      zero_numel_check_dims(input_t, dim_, reduction_type == MPSReductionType::MAX ? "argmax()" : "argmin()");
+  } else {
+      TORCH_CHECK_INDEX(
+      input_t.numel() != 0,
+      reduction_type == MPSReductionType::MAX ? "argmax()" : "argmin()" , ": Expected reduction dim to be specified for input.numel() == 0.");
+      // Since input will be flattened, take argmax or argmin along 0'th dimension
+      dim_ = 0;
+  }
 
-    // Calculate the output shape according to keepdim=True
-    // If there is no dim argument, the input shape is flattened
-    IntArrayRef input_shape = input_t.sizes();
-    int64_t num_input_dims = input_shape.size();
-    NSMutableArray<NSNumber*> *apparent_in_shape = nil;
-    NSMutableArray<NSNumber*> *apparent_out_shape = nil;
+  // Calculate the output shape according to keepdim=True
+  // If there is no dim argument, the input shape is flattened
+  IntArrayRef input_shape = input_t.sizes();
+  int64_t num_input_dims = input_shape.size();
+  NSMutableArray<NSNumber*> *apparent_in_shape = nil;
+  NSMutableArray<NSNumber*> *apparent_out_shape = nil;
 
-    if(dim.has_value()) {
-        apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
-        for(int i = 0; i < num_input_dims; i++) {
-            if(dim_ == i)
-                apparent_out_shape[i] = @1;
-            else
-                apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]];
-        }
+  if(dim.has_value()) {
+    apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
+    for(int i = 0; i < num_input_dims; i++) {
+        if(dim_ == i)
+            apparent_out_shape[i] = @1;
+        else
+            apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]];
     }
-    else {
-        apparent_in_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:1];
-        int64_t num_in_elements = 1;
-        for(int i = 0; i < num_input_dims; i++) {
-            num_in_elements *= input_shape[i];
-        }
-        apparent_in_shape[0] = [NSNumber numberWithInt:num_in_elements];
-
-        apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:1];
-        apparent_out_shape[0] = @1;
+  } else {
+    apparent_in_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:1];
+    int64_t num_in_elements = 1;
+    for(int i = 0; i < num_input_dims; i++) {
+        num_in_elements *= input_shape[i];
     }
+    apparent_in_shape[0] = [NSNumber numberWithInt:num_in_elements];
 
-    if (output_t.numel() == 0) {
-        return;
-    }
+    apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:1];
+    apparent_out_shape[0] = @1;
+  }
 
-    auto stream = at::mps::getCurrentMPSStream();
+  if (output_t.numel() == 0) {
+      return;
+  }
 
-    @autoreleasepool {
-        string key = func_name + to_string(dim_) + ":" + native_mps::getTensorsStringKey(input_t);
-        CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
+  if (!apparent_in_shape) {
+    apparent_in_shape = [native_mps::getMPSShape(input_t.sizes()) mutableCopy];
+  }
 
-        if(!cachedGraph) {
-          cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ native_mps::MPSCachedGraph * () {
+  auto stream = at::mps::getCurrentMPSStream();
+  @autoreleasepool {
+    NSString* ns_key = [[apparent_in_shape valueForKey:@"description"] componentsJoinedByString:@","];
+    string key = func_name                                + ":" +
+                 to_string(dim_)                          + ":" +
+                 native_mps::getTensorsStringKey(input_t) + ":" +
+                 string([ns_key UTF8String]);
+      CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
-            CachedGraph *newCachedGraph = nil;
+      if(!cachedGraph) {
+        native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
 
-            @autoreleasepool {
-              MPSGraph* mpsGraph = native_mps::make_mps_graph();
-              newCachedGraph = new CachedGraph(mpsGraph);
+          CachedGraph *newCachedGraph = nil;
 
-              MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
+          @autoreleasepool {
+            MPSGraph* mpsGraph = native_mps::make_mps_graph();
+            newCachedGraph = new CachedGraph(mpsGraph);
 
-              MPSGraphTensor* castInputTensor = nil;
-              MPSGraphTensor* argreduceOutTensor = nil;
+            MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()), apparent_in_shape);
 
-              if(input_t.scalar_type() != ScalarType::Float &&
-                 input_t.scalar_type() != ScalarType::Int   &&
-                 input_t.scalar_type() != ScalarType::Half)
-                castInputTensor =  [mpsGraph castTensor:inputTensor
-                                                 toType:MPSDataTypeFloat32
-                                                   name:@"castInputTensor"];
-              else
-                castInputTensor = inputTensor;
+            MPSGraphTensor* castInputTensor = inputTensor;
+            MPSGraphTensor* argreduceOutTensor = nil;
 
-              if (reduction_type == MPSReductionType::MAX) {
-                argreduceOutTensor = [mpsGraph reductionArgMaximumWithTensor:castInputTensor
-                                                                        axis:(NSInteger)dim_
-                                                                        name:nil];
-              }
-              else {
-                argreduceOutTensor = [mpsGraph reductionArgMinimumWithTensor:castInputTensor
-                                                                        axis:(NSInteger)dim_
-                                                                        name:nil];
-              }
-              MPSGraphTensor* outputTensor = [mpsGraph castTensor:argreduceOutTensor
-                                                           toType:MPSDataTypeInt64
-                                                             name:@"castOutpuTensor"];
+            if(input_t.scalar_type() != ScalarType::Float &&
+                input_t.scalar_type() != ScalarType::Int   &&
+                input_t.scalar_type() != ScalarType::Half) {
+              castInputTensor =  [mpsGraph castTensor:inputTensor
+                                               toType:MPSDataTypeFloat32
+                                                 name:@"castInputTensor"];
+            }
 
-              newCachedGraph->inputTensor_ = inputTensor;
-              newCachedGraph->outputTensor_ = outputTensor;
+            if (reduction_type == MPSReductionType::MAX) {
+              argreduceOutTensor = [mpsGraph reductionArgMaximumWithTensor:castInputTensor
+                                                                      axis:(NSInteger)dim_
+                                                                      name:nil];
+            } else {
+              argreduceOutTensor = [mpsGraph reductionArgMinimumWithTensor:castInputTensor
+                                                                      axis:(NSInteger)dim_
+                                                                      name:nil];
             }
-            return newCachedGraph;
-          });
-        }
+            MPSGraphTensor* outputTensor = [mpsGraph castTensor:argreduceOutTensor
+                                                          toType:MPSDataTypeInt64
+                                                            name:@"castOutputTensor"];
 
-        native_mps::Placeholder inputPlaceholder = native_mps::Placeholder();
-        if(apparent_in_shape)
-            inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_in_shape);
-        else
-            inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
+            newCachedGraph->inputTensor_ = inputTensor;
+            newCachedGraph->outputTensor_ = outputTensor;
+          }
+          return newCachedGraph;
+        });
+        cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+      }
 
-        auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape);
+      auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t, apparent_in_shape);
+      auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape);
 
-        NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
-          inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-        };
+      NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
+        inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
+      };
 
-        NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
-          outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-        };
+      NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
+        outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+      };
 
-        native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-    }
+      native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
 }
 
 TORCH_IMPL_FUNC(argmax_out_mps)
@@ -1710,7 +1683,6 @@ Tensor min_mps(const Tensor& input_t) {
     argmax_argmin_out_mps(input_t, dim, keepdim, output_t, MPSReductionType::MIN, "argmin_out_mps");
 }
 
-
 // Min/Max with dim
 std::tuple<Tensor, Tensor> min_max_mps
    (const Tensor& input_t,
diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm
index 8b6b709da6427..6e52dbe89c53b 100644
--- a/aten/src/ATen/native/mps/operations/Repeat.mm
+++ b/aten/src/ATen/native/mps/operations/Repeat.mm
@@ -36,48 +36,6 @@ Tensor permute_mps(const Tensor& self, IntArrayRef dims) {
   return self.as_strided(newSizes, newStrides);
 }
 
-void set_apparent_shapes(NSArray<NSNumber*> * input_shape,
-                         NSArray<NSNumber*> * &apparent_input_shape,
-                         int64_t num_input_dims,
-                         IntArrayRef repeats,
-                         NSMutableArray<NSNumber*> * &repeats_shape,
-                         int64_t num_repeat_dims) {
-
-
-  bool repeat_empty = false;
-  if(num_repeat_dims == 0) {
-    num_repeat_dims = num_input_dims;
-    repeat_empty = true;
-  }
-
-  // Set repeats_shape
-  repeats_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:num_repeat_dims];
-
-  for(int i = 0; i < num_repeat_dims; i++) {
-    if(repeat_empty)
-      repeats_shape[i] = [NSNumber numberWithInteger:1];
-    else
-      repeats_shape[i] = [NSNumber numberWithInteger:repeats[i]];
-  }
-
-  // If no extension of the shape is needed
-  if(num_repeat_dims == num_input_dims) {
-    apparent_input_shape = input_shape;
-  }
-  // num_repeat_dims > num_input_dims
-  else {
-    auto rc = [NSMutableArray<NSNumber*> arrayWithCapacity:num_repeat_dims];
-
-    for(int i = 0; i < num_repeat_dims - num_input_dims; i++)
-      rc[i] = @1;
-
-    for(int i = num_repeat_dims - num_input_dims; i < num_repeat_dims; i++)
-      rc[i] = input_shape[i + num_input_dims - num_repeat_dims];
-    apparent_input_shape = rc;
-  }
-
-}
-
 Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
 
   using namespace mps;
@@ -91,54 +49,32 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
     MPSGraphTensor *outputTensor_ = nil;
   };
 
-  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
-  NSArray<NSNumber*> *apparent_input_shape = nil;
-  NSMutableArray<NSNumber*> *repeats_shape = nil;
-
-  auto input_shape = getMPSShape(self);
-  auto num_input_dims = [input_shape count];
-  auto num_repeat_dims = repeats.size();
-
-  set_apparent_shapes(input_shape,
-                      apparent_input_shape,
-                      num_input_dims,
-                      repeats,
-                      repeats_shape,
-                      num_repeat_dims);
-
-  // Set output shape
-  std::vector<int64_t> output_shape(num_repeat_dims);
+  // Add new leading dimensions to the tensor if the
+  // number of target dimensions is larger than the
+  // number of source dimensions.
+  int64_t num_new_dimensions = repeats.size() - self.dim();
+  DimVector padded_size(num_new_dimensions, 1);
+  padded_size.insert(padded_size.end(), self.sizes().begin(), self.sizes().end());
+  DimVector target_size(repeats.size());
   bool zero_tensor = false;
-  for(auto i : c10::irange(num_repeat_dims)) {
-    output_shape[i] = repeats[i] * [apparent_input_shape[i] intValue];
-    if(output_shape[i] == 0) {
+  for(const auto idx : c10::irange(repeats.size())) {
+    if (repeats[idx] == 0) {
       zero_tensor = true;
     }
+    target_size[idx] = padded_size[idx] * repeats[idx];
   }
 
-  Tensor output = at::native::empty_mps(
-                      IntArrayRef(output_shape),
-                      self.scalar_type(),
-                      c10::nullopt,
-                      kMPS,
-                      c10::nullopt,
-                      c10::nullopt);
-
-  // Empty output
-  if(zero_tensor || output.numel() == 0)
-    return output;
+  Tensor expanded_tensor = self.expand(padded_size);
+  Tensor result = at::empty(target_size, self.options());
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+  if(zero_tensor || result.numel() == 0) {
+    return result;
+  }
 
   auto stream = at::mps::getCurrentMPSStream();
 
   @autoreleasepool {
-
-    NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
-    NSString* ns_repeats_key = [[repeats_shape valueForKey:@"description"] componentsJoinedByString:@","];
-
-    string key = "repeat_mps:" + getMPSTypeString(self.scalar_type())
-                               + ":" + string([ns_shape_key UTF8String])
-                               + ":" + string([ns_repeats_key UTF8String]);
+    string key = "repeat_mps:" + getTensorsStringKey(self) + ":" + getArrayRefString(repeats);
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
 
     if(!cachedGraph) {
@@ -149,9 +85,9 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type()), apparent_input_shape);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, expanded_tensor);
           MPSGraphTensor* outputTensor = [mpsGraph tileTensor:inputTensor
-                                               withMultiplier:repeats_shape
+                                               withMultiplier:getMPSShape(repeats)
                                                          name:nil];
 
           newCachedGraph->inputTensor_ = inputTensor;
@@ -162,8 +98,8 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, apparent_input_shape);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, expanded_tensor);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
@@ -175,9 +111,8 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
   }
 
-  return output;
-
+  return result;
 }
 
-}
-}
+} // namespace native
+} // namespace at
diff --git a/test/test_mps.py b/test/test_mps.py
index d1d0b1c022c27..69ab80ba9924e 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8314,6 +8314,19 @@ class TestConsistency(TestCase):
         'nn.functional.nll_loss': ['f32'],
         'std': ['f16','f32'],
         'var': ['f16','f32'],
+        'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mean': ['f16', 'f32'],
+        'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.std': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.var': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
     }
 
 
@@ -8504,6 +8517,11 @@ class TestConsistency(TestCase):
         '__rpow__': [torch.int64],
         'masked.std': [torch.int32],
         'masked.var': [torch.int32],
+
+        # Failures due to inconsistency between CPU and GPU for `inf` case
+        'masked.argmax': ['f16', 'f32', 'i32'],
+        'masked.argmin': ['f16', 'f32', 'i32'],
+
         'as_strided_scatter': [torch.uint8],
         'atan2': [torch.int64],
         'bfloat16': None,
@@ -8616,24 +8634,8 @@ class TestConsistency(TestCase):
         'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 
-        # failures due to shape and type issues in ReduceOps
-        'amax': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'amin': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'mean': ['torch.float16', 'torch.float32'],
-        'count_nonzero': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.argmax': ['torch.float16', 'torch.float32', 'torch.int32'],
-        '_masked.argmin': ['torch.float16', 'torch.float32', 'torch.int32'],
-        '_masked.amax': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.amin': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.mean': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.prod': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.std': ['torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.sum': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '_masked.var': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-
         # failures due to precision issues
+        'masked.var': ['f16'],
         'nn.functional.gelu': ['torch.float32'],
         'pow': ['torch.float32'],
         'tan': ['torch.float32'],

From e864e8c0704ea16542780bba74143c2feb7d5c11 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 7 Dec 2022 17:31:46 -0800
Subject: [PATCH 1834/1922] Convert grad output always to NCHW in
 mps_convolution_backward_input (#190)

* Convert grad output always to NCHW in mps_convolution_backward_input

* Fix grad_input shape in backward convolution
---
 .../ATen/native/mps/operations/Convolution.mm | 43 ++++++++++++-------
 test/test_mps.py                              | 32 +++++++++-----
 2 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm
index 636226815ba41..7c508b0dd8724 100644
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@@ -41,7 +41,7 @@ void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
 
 static
 MPSShape* get_mps_conv_shape(const Tensor& tensor, bool is_channels_last) {
-  if (is_channels_last) {
+  if (is_channels_last && tensor.is_contiguous() && !tensor.is_view()) {
     const auto tensorSizes = tensor.sizes();
     const NSUInteger N = tensorSizes[0];
     const NSUInteger C = tensorSizes[1];
@@ -216,22 +216,25 @@ Tensor _mps_convolution(
 }
 
 Tensor mps_convolution_backward_input(
-    IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_,
+    IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
   namespace native_mps = at::native::mps;
   using namespace mps;
   CheckedFrom c = "mps_convolution_backward_input";
-  TensorArg grad_output{ grad_output_, "grad_output", 1 },
-            weight{ weight_, "weight", 2 };
+  TensorArg grad_output{ grad_output_t, "grad_output", 1 },
+            weight{ weight_t, "weight", 2 };
   checkAllSameType(c, {grad_output, weight});
   checkAllSameGPU(c, {grad_output, weight});
-  auto memory_format = grad_output_.suggest_memory_format();
+  auto memory_format = grad_output_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
-  MPSShape* weightShape = get_mps_conv_shape(weight_, is_channels_last);
-  MPSShape* gradOutputShape = get_mps_conv_shape(grad_output_, is_channels_last);
-  Tensor grad_output_t = grad_output_.contiguous(memory_format);
-  Tensor weight_t = weight_.contiguous(memory_format);
-  auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt);
+
+  auto grad_input_t = at::empty(
+                    input_size,
+                    grad_output->scalar_type(),
+                    c10::nullopt,
+                    kMPS,
+                    c10::nullopt,
+                    c10::nullopt);
 
   // Avoid "grad_input" when this is being used as transposed convolution
   TensorArg grad_input{ grad_input_t, "result", 0 };
@@ -265,8 +268,9 @@ Tensor mps_convolution_backward_input(
         assert(0 && "Check should have been done earlier\n");
     }
 
+    MPSShape* gradOutputShape = get_mps_conv_shape(grad_output_t, is_channels_last);
     MPSShape* mps_input_shape = getMPSShape(input_size);
-    NSString* ns_shape_key = [[mps_input_shape valueForKey:@"description"] componentsJoinedByString:@","];
+    NSString* ns_shape_key = [[gradOutputShape valueForKey:@"description"] componentsJoinedByString:@","];
     string key = "mps_convolution_backward_input:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":"
                                                    + to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":"
                                                    + to_string(padding[0]) + ":" + to_string(padding[1]) + ":"
@@ -288,12 +292,21 @@ Tensor mps_convolution_backward_input(
           fill_conv_desc(descriptor_, stride[1], stride[0],
                                       dilation[1], dilation[0],
                                       padding[1], padding[0],
-                                      memory_format, groups);
+                                      at::MemoryFormat::Contiguous, groups);
 
           MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape);
-          MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(weight_t.scalar_type()), weightShape);
+          MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
+
+          MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor;
+          if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) {
+            // NHWC -> NCHW
+            gradOutputTensorTranspose = [mpsGraph transposeTensor: [mpsGraph transposeTensor:gradOutputTensor dimension:-1 withDimension:-2 name:nil]
+                                           dimension: -2
+                                       withDimension: -3
+                                                name: nil];
+          }
 
-          MPSGraphTensor* gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensor
+          MPSGraphTensor* gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
                                                                                             weightsTensor:weightTensor
                                                                                               outputShape:mps_input_shape
                                                                              forwardConvolutionDescriptor:descriptor_
@@ -309,7 +322,7 @@ Tensor mps_convolution_backward_input(
     }
 
     auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
-    auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t, weightShape);
+    auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
     auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
 
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
diff --git a/test/test_mps.py b/test/test_mps.py
index 69ab80ba9924e..aafb34cb16b87 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -6981,17 +6981,27 @@ def test_conv_transpose_1d_nn_functional(self):
         self.assertEqual(tcpu, tgpu.cpu(), rtol=2.6e-05, atol=2e-04)
 
     def test_conv_backward_1d_channels_last(self):
-        # https://github.com/pytorch/pytorch/issues/84511
-        conv_cpu = torch.nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3)
-        conv_mps = copy.deepcopy(conv_cpu).to(device='mps')
-
-        data = torch.rand(1, 176, 1, dtype=torch.float32)
-        x_cpu = data.permute(0, 2, 1).contiguous()
-        x_mps = data.permute(0, 2, 1).contiguous().to("mps")
-        res_cpu = conv_cpu(x_cpu).sum().backward()
-        res_mps = conv_mps(x_mps).sum().backward()
-
-        self.assertEqual(res_cpu, res_mps)
+        def helper(shape, in_channels=1, out_channels=1, kernel_size=3, groups=1):
+            # https://github.com/pytorch/pytorch/issues/84511
+            conv_mps = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups, device="mps")
+            conv_cpu = copy.deepcopy(conv_mps).to(device='cpu')
+
+            data = torch.rand(shape, dtype=torch.float32)
+            x_cpu = data.permute(0, 2, 1).contiguous().requires_grad_(True)
+            x_mps = data.to("mps").permute(0, 2, 1).contiguous().requires_grad_(True)
+            res_cpu = conv_cpu(x_cpu)
+            res_mps = conv_mps(x_mps)
+            self.assertEqual(res_cpu, res_mps)
+            res_cpu = res_cpu.sum().backward()
+            res_mps = res_mps.sum().backward()
+            self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad)
+            self.assertEqual(x_cpu.grad, x_mps.grad)
+
+        helper(shape=(1, 176, 1))
+        helper(shape=(2, 12, 1))
+        helper(shape=(3, 176, 1))
+        helper(shape=(4, 376, 1))
+        # helper(shape=(1024, 376, 9), in_channels=9, out_channels=3, groups=3)
 
     def test_conv1d_contiguous(self):
         model_cpu = torch.nn.Conv1d(1, 128, 3)

From e26a5f54fdf60ec546a6e5d812fdf5e969483304 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 7 Dec 2022 17:36:35 -0800
Subject: [PATCH 1835/1922] Add more testcases for convolution backward input
 pass (#191)

---
 test/test_mps.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index aafb34cb16b87..346fbd1f15694 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -6983,25 +6983,30 @@ def test_conv_transpose_1d_nn_functional(self):
     def test_conv_backward_1d_channels_last(self):
         def helper(shape, in_channels=1, out_channels=1, kernel_size=3, groups=1):
             # https://github.com/pytorch/pytorch/issues/84511
-            conv_mps = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups, device="mps")
-            conv_cpu = copy.deepcopy(conv_mps).to(device='cpu')
+            conv_cpu = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups)
+            conv_mps = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).to("mps")
+            conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_(True)
+            conv_mps.bias.data = conv_cpu.bias.data.detach().clone().to("mps").requires_grad_(True)
+
 
             data = torch.rand(shape, dtype=torch.float32)
             x_cpu = data.permute(0, 2, 1).contiguous().requires_grad_(True)
-            x_mps = data.to("mps").permute(0, 2, 1).contiguous().requires_grad_(True)
+            x_mps = data.permute(0, 2, 1).detach().clone().to("mps").contiguous().requires_grad_(True)
             res_cpu = conv_cpu(x_cpu)
             res_mps = conv_mps(x_mps)
             self.assertEqual(res_cpu, res_mps)
             res_cpu = res_cpu.sum().backward()
             res_mps = res_mps.sum().backward()
-            self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad)
+
+            # self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad)
             self.assertEqual(x_cpu.grad, x_mps.grad)
 
         helper(shape=(1, 176, 1))
         helper(shape=(2, 12, 1))
         helper(shape=(3, 176, 1))
         helper(shape=(4, 376, 1))
-        # helper(shape=(1024, 376, 9), in_channels=9, out_channels=3, groups=3)
+        helper(shape=(1024, 376, 9), in_channels=9, out_channels=1, groups=1)
+        helper(shape=(1024, 376, 9), in_channels=9, out_channels=9, groups=3)
 
     def test_conv1d_contiguous(self):
         model_cpu = torch.nn.Conv1d(1, 128, 3)

From fadc76772b610c75e5771644f7936c9224db46ce Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Wed, 7 Dec 2022 17:45:20 -0800
Subject: [PATCH 1836/1922] Dev/kulin/nll (#189)

* Fix the NLLLoss2D crash.

* Cleanup.
---
 aten/src/ATen/native/mps/operations/LossOps.mm | 16 ++++------------
 test/test_mps.py                               | 14 ++++++++++++++
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm
index be1cfe83d2363..6968591f18ac8 100644
--- a/aten/src/ATen/native/mps/operations/LossOps.mm
+++ b/aten/src/ATen/native/mps/operations/LossOps.mm
@@ -356,19 +356,12 @@ void nllnd_loss_backward_impl(
         MPSShape* weight_shape = getMPSShape(weight);
         MPSShape* total_weight_shape = getMPSShape(total_weight);
 
-        NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","];
-
         string key = "nllnd_loss_backward_impl:" + to_string(numClasses) + ":" +
                                                    to_string(ignore_index) + ":" +
                                                    to_string(isWeightsArrayValid) + ":" +
                                                    reductionToString(reduction) + ":" +
-                                                   [ns_shape_key UTF8String] + ":" +
-                                                   getMPSTypeString(input.scalar_type()) + ":" +
-                                                   getMPSTypeString(target.scalar_type()) + ":" +
-                                                   getMPSTypeString(weight.scalar_type()) + ":" +
-                                                   getMPSTypeString(total_weight.scalar_type());
+                                                   getTensorsStringKey({input, target, weight, total_weight});
         CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-
         if(!cachedGraph) {
             MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
 
@@ -408,12 +401,11 @@ void nllnd_loss_backward_impl(
                     }
 
                     float onValue = -1.0f;
+                    auto target_axis = target.defined() ? target.dim() : 1;
 
-                    MPSGraphTensor *oneHotTensor;
-
-                    oneHotTensor = [mpsGraph oneHotWithIndicesTensor:udpatedTargetTensor
+                    MPSGraphTensor *oneHotTensor = [mpsGraph oneHotWithIndicesTensor:udpatedTargetTensor
                                                                depth:numClasses
-                                                                axis:1
+                                                                axis:target_axis
                                                             dataType:inputTensor.dataType
                                                              onValue:onValue
                                                             offValue:0.0f
diff --git a/test/test_mps.py b/test/test_mps.py
index 346fbd1f15694..8fbb9909abc3c 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2318,6 +2318,20 @@ def test_smooth_l1_loss_reduction_mean_sum_backward(self):
 
 
 class TestNLLLoss(TestCase):
+    def test_nll2d_loss_backward(self, device='mps'):
+        a = torch.randn(3, 5, requires_grad=True, device=device)
+        b = torch.tensor([1, 0, 4], device=device)
+        loss = nn.NLLLoss()
+        out = loss(a, b)
+        self.assertIsNone(out.grad_fn._saved_weight)
+        loss = nn.NLLLoss(weight=torch.ones((5,), device=device))
+        out = loss(a, b)
+        self.assertEqual(out.grad_fn._saved_weight, torch.ones((5,)))
+
+        out.sum().backward()
+        with self.assertRaisesRegex(RuntimeError, "after they have already been freed"):
+            out.grad_fn._saved_weight
+
     def test_nll_loss_mismatched_batch(self, device='mps'):
         x = torch.randn((10, 3), requires_grad=True, device=device)
         # t should have size (10,)

From 73b53ca8ef8e9eae4d4267a8bf765d04d23111fc Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 7 Dec 2022 21:57:16 -0800
Subject: [PATCH 1837/1922] Fix conv grad weights (#192)

* Fix conv gradient with weights when one input is contiguos and the other one is channels last

* Remove prints

* Fix the ConvGradient with weights for NHWC case. CLIP is now working.

Co-authored-by: Kulin Seth <kulin_seth@apple.com>
---
 .../ATen/native/mps/operations/Convolution.mm | 36 +++++++++++++------
 test/test_mps.py                              |  2 +-
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm
index 7c508b0dd8724..7336eabb27ee6 100644
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@@ -340,14 +340,17 @@ Tensor mps_convolution_backward_input(
 }
 
 Tensor mps_convolution_backward_weights(
-    IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t,
+    IntArrayRef weight_size, const Tensor& grad_output_, const Tensor& input_,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
   namespace native_mps = at::native::mps;
   using namespace mps;
   CheckedFrom c = "mps_convolution_backward_weights";
-  auto memory_format = input_t.suggest_memory_format();
+  auto memory_format = input_.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
-  MPSShape* inputShape = get_mps_conv_shape(input_t, is_channels_last);
+
+  auto grad_output_t = grad_output_.to(memory_format);
+  auto input_t = input_.to(memory_format);
+
   MPSShape* gradOutputShape = get_mps_conv_shape(grad_output_t, is_channels_last);
 
   // For uniformity with everything else, although it seems grad_weight
@@ -358,7 +361,13 @@ Tensor mps_convolution_backward_weights(
   checkAllSameType(c, {grad_output, input});
   checkAllSameGPU(c, {grad_output, input});
 
-  auto grad_weight_t = at::empty(weight_size, grad_output_t.options(), c10::nullopt);
+  auto grad_weight_t = at::empty(
+                          weight_size,
+                          grad_output_t.scalar_type(),
+                          c10::nullopt,
+                          kMPS,
+                          c10::nullopt,
+                          memory_format);
   TensorArg grad_weight{ grad_weight_t, "result", 0 };
 
   convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
@@ -389,9 +398,8 @@ Tensor mps_convolution_backward_weights(
       default:
         assert(0 && "Check should have been done earlier\n");
     }
-
     MPSShape* mps_weight_shape = getMPSShape(weight_size);
-    NSString* ns_shape_key = [[mps_weight_shape valueForKey:@"description"] componentsJoinedByString:@","];
+    NSString* ns_shape_key = [[gradOutputShape valueForKey:@"description"] componentsJoinedByString:@","];
     string key = "mps_convolution_backward_weights:" + to_string(stride[0]) + ":" + to_string(stride[1]) + ":"
                                                      + to_string(dilation[0]) + ":" + to_string(dilation[1]) + ":"
                                                      + to_string(padding[0]) + ":" + to_string(padding[1]) + ":"
@@ -413,12 +421,20 @@ Tensor mps_convolution_backward_weights(
           fill_conv_desc(descriptor_, stride[1], stride[0],
                                       dilation[1], dilation[0],
                                       padding[1], padding[0],
-                                      memory_format, groups);
+                                      at::MemoryFormat::Contiguous, groups);
 
           MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape);
-          MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(input_t.scalar_type()), inputShape);
+          MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
 
-          MPSGraphTensor* gradWeightTensor = [mpsGraph convolution2DWeightsGradientWithIncomingGradientTensor:gradOutputTensor
+          MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor;
+          if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) {
+            // NHWC -> NCHW
+            gradOutputTensorTranspose = [mpsGraph transposeTensor: [mpsGraph transposeTensor:gradOutputTensor dimension:-1 withDimension:-2 name:nil]
+                                           dimension: -2
+                                       withDimension: -3
+                                                name: nil];
+          }
+          MPSGraphTensor* gradWeightTensor = [mpsGraph convolution2DWeightsGradientWithIncomingGradientTensor:gradOutputTensorTranspose
                                                                                                  sourceTensor:inputTensor
                                                                                                   outputShape:mps_weight_shape
                                                                                  forwardConvolutionDescriptor:descriptor_
@@ -434,7 +450,7 @@ Tensor mps_convolution_backward_weights(
     }
 
     auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
-    auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t, inputShape);
+    auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t);
     auto outputPlaceholder = Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t);
 
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
diff --git a/test/test_mps.py b/test/test_mps.py
index 8fbb9909abc3c..9e2d7a73930d2 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7012,7 +7012,7 @@ def helper(shape, in_channels=1, out_channels=1, kernel_size=3, groups=1):
             res_cpu = res_cpu.sum().backward()
             res_mps = res_mps.sum().backward()
 
-            # self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad)
+            self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad, rtol=2.6e-05, atol=2e-04)
             self.assertEqual(x_cpu.grad, x_mps.grad)
 
         helper(shape=(1, 176, 1))

From 9c1bdc95bb6595cb10e61b5a861f8f0a9e30e197 Mon Sep 17 00:00:00 2001
From: Hashem Berzeg <berzeg@users.noreply.github.com>
Date: Thu, 8 Sep 2022 20:09:02 -0700
Subject: [PATCH 1838/1922] implement sgn_out for mps

---
 aten/src/ATen/mps/MPSFallback.mm              |   1 -
 .../ATen/native/mps/operations/UnaryOps.mm    | 104 ++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |   1 +
 3 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/mps/MPSFallback.mm b/aten/src/ATen/mps/MPSFallback.mm
index 69dd47f9c145f..e5cc5e237c728 100644
--- a/aten/src/ATen/mps/MPSFallback.mm
+++ b/aten/src/ATen/mps/MPSFallback.mm
@@ -60,7 +60,6 @@ Tensor slow_conv2d_forward_mps(
   m.impl("_fft_c2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("_fft_r2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
-  m.impl("sgn.out", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps);
   m.impl("upsample_nearest3d.vec", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
 }
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index baa92120962b9..be703fcfaa2a4 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -11,6 +11,7 @@
 typedef MPSGraphTensor* (^UnaryOpBlock)(MPSGraph*, MPSGraphTensor*);
 using is_noop_p = std::function<bool(const Tensor&)>;
 
+#define ConditionalOpFn(void) NSArray<MPSGraphTensor *> * (void)
 
 bool is_empty_tensor(const Tensor& self) {
   return self.numel() == 0;
@@ -277,5 +278,108 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
     });
 }
 
+TORCH_IMPL_FUNC(sgn_out_mps) (const Tensor& self, const Tensor& output)
+{
+    using namespace mps;
+    if (!output.is_same_size(self)) {
+      output.resize_(self.sizes());
+    }
+    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+    @autoreleasepool {
+      string key = string("sgn_out_mps") + getTensorsStringKey({self});
+      auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
+
+      if(!cachedGraph) {
+        MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () {
+          MPSUnaryCachedGraph *newCachedGraph = nil;
+          @autoreleasepool {
+            MPSGraph* mpsGraph = make_mps_graph();
+            newCachedGraph = new MPSUnaryCachedGraph(mpsGraph);
+            newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, self);
+              if (newCachedGraph->inputTensor_.dataType == mps::getMPSDataType(ScalarType::ComplexHalf) || newCachedGraph->inputTensor_.dataType == mps::getMPSDataType(ScalarType::ComplexFloat) || newCachedGraph->inputTensor_.dataType == mps::getMPSDataType(ScalarType::ComplexDouble)) {
+                MPSGraphTensor* zeroComplexTensor = [mpsGraph constantWithScalar:0.0
+                                                              shape:getMPSShape(self)
+                                                              dataType:newCachedGraph->inputTensor_.dataType];
+
+                MPSGraphTensor* isComplexZero = [mpsGraph equalWithPrimaryTensor:newCachedGraph->inputTensor_
+                                                              secondaryTensor:zeroComplexTensor
+                                                              name: nil];
+
+                MPSGraphTensor* complexAbsolute = [mpsGraph absoluteWithTensor:newCachedGraph->inputTensor_
+                                                              name: nil];
+
+                MPSGraphTensor* complexSgn = [mpsGraph divisionWithPrimaryTensor:newCachedGraph->inputTensor_
+                                                              secondaryTensor:complexAbsolute
+                                                              name: nil];
+
+                MPSGraphIfThenElseBlock returnComplexZero = ^ConditionalOpFn(){ return @[zeroComplexTensor]; };
+
+                MPSGraphIfThenElseBlock returnComplexNonZeroSgn = ^ConditionalOpFn(){ return @[complexSgn]; };
+
+                MPSGraphTensor* sgnTensor = [mpsGraph ifWithPredicateTensor:isComplexZero
+                                                              thenBlock:returnComplexZero
+                                                              elseBlock:returnComplexNonZeroSgn
+                                                              name:nil][0];
+
+                newCachedGraph->outputTensor_ = sgnTensor;
+              } else {
+                MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0
+                                                              shape:getMPSShape(self)
+                                                              dataType:mps::getMPSDataType(self.scalar_type())];
+
+                MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1
+                                                              shape:getMPSShape(self)
+                                                              dataType:mps::getMPSDataType(self.scalar_type())];
+
+                MPSGraphTensor* negativeOneTensor = [mpsGraph constantWithScalar:1
+                                                              shape:getMPSShape(self)
+                                                              dataType:mps::getMPSDataType(self.scalar_type())];
+
+                MPSGraphIfThenElseBlock casePositive = ^ConditionalOpFn(){ return @[oneTensor]; };
+
+                MPSGraphIfThenElseBlock caseNegative = ^ConditionalOpFn(){ return @[negativeOneTensor]; };
+
+                MPSGraphIfThenElseBlock caseZero = ^ConditionalOpFn(){ return @[zeroTensor]; };
+
+                MPSGraphTensor* isPositive = [mpsGraph greaterThanWithPrimaryTensor:newCachedGraph->inputTensor_
+                                                              secondaryTensor:zeroTensor
+                                                              name: nil];
+
+                MPSGraphTensor* isNegative = [mpsGraph lessThanWithPrimaryTensor:newCachedGraph->inputTensor_
+                                                              secondaryTensor:zeroTensor
+                                                              name: nil];
+
+                MPSGraphIfThenElseBlock ifNotPositiveBlock = ^ConditionalOpFn(){
+                  return [mpsGraph ifWithPredicateTensor:isNegative
+                                                              thenBlock:caseNegative
+                                                              elseBlock:caseZero
+                                                              name:nil];
+                };
+
+                MPSGraphTensor* sgnTensor = [mpsGraph ifWithPredicateTensor:isPositive
+                                                              thenBlock:casePositive
+                                                              elseBlock:ifNotPositiveBlock
+                                                              name:nil][0];
+
+                newCachedGraph->outputTensor_ = sgnTensor;
+              }
+          }
+          return newCachedGraph;
+        });
+        cachedGraph = tmpCachedGraph->as<MPSUnaryCachedGraph>();
+      }
+
+      Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
+      Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+        selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
+      };
+      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+        outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+      };
+      runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
+    }
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 880808df75b5d..5b15daf38ddfb 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -397,6 +397,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: sgn_out
+    MPS: sgn_out_mps
     SparseCPU, SparseCUDA: sgn_sparse_out
     SparseCsrCPU, SparseCsrCUDA: sgn_sparse_csr_out
   tags: pointwise

From 30710655842fbdf9d053fb97eff0b9c73c7ec3b0 Mon Sep 17 00:00:00 2001
From: Hashem Berzeg <berzeg@users.noreply.github.com>
Date: Tue, 13 Sep 2022 17:59:36 -0700
Subject: [PATCH 1839/1922] sgn_out actually works, mostly, on mps

---
 .../ATen/native/mps/operations/UnaryOps.mm    | 137 ++++++++++++------
 aten/src/ATen/native/mps/operations/View.mm   |   6 +-
 2 files changed, 99 insertions(+), 44 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index be703fcfaa2a4..9729a2e3d4ffb 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -1,5 +1,6 @@
 //  Copyright © 2022 Apple Inc.
 
+//#include <ATen/native/mps/Copy.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/mps/MPSGraphVenturaOps.h>
 #include <torch/library.h>
@@ -281,12 +282,28 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
 TORCH_IMPL_FUNC(sgn_out_mps) (const Tensor& self, const Tensor& output)
 {
     using namespace mps;
+
     if (!output.is_same_size(self)) {
       output.resize_(self.sizes());
     }
+
+    string graphSuffix = "_real";
+    Tensor realInput;
+    Tensor realOutput;
+    Tensor flatInput = self.flatten();
+    Tensor flatOutput = output.flatten();
+    if (self.is_complex()) {
+      realInput = at::view_as_real(flatInput);
+      realOutput = at::view_as_real(flatOutput);
+      graphSuffix = "_complex";
+    } else {
+      realInput = flatInput;
+      realOutput = flatOutput;
+    }
+
     MPSGraphCache* cache_ = MPSGraphCache::getInstance();
     @autoreleasepool {
-      string key = string("sgn_out_mps") + getTensorsStringKey({self});
+      string key = string("sgn_out_mps") + getTensorsStringKey({realInput}) + graphSuffix;
       auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
 
       if(!cachedGraph) {
@@ -295,52 +312,79 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
           @autoreleasepool {
             MPSGraph* mpsGraph = make_mps_graph();
             newCachedGraph = new MPSUnaryCachedGraph(mpsGraph);
-            newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, self);
-              if (newCachedGraph->inputTensor_.dataType == mps::getMPSDataType(ScalarType::ComplexHalf) || newCachedGraph->inputTensor_.dataType == mps::getMPSDataType(ScalarType::ComplexFloat) || newCachedGraph->inputTensor_.dataType == mps::getMPSDataType(ScalarType::ComplexDouble)) {
-                MPSGraphTensor* zeroComplexTensor = [mpsGraph constantWithScalar:0.0
-                                                              shape:getMPSShape(self)
-                                                              dataType:newCachedGraph->inputTensor_.dataType];
-
-                MPSGraphTensor* isComplexZero = [mpsGraph equalWithPrimaryTensor:newCachedGraph->inputTensor_
-                                                              secondaryTensor:zeroComplexTensor
+            newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, realInput);
+              MPSGraphTensor* sgnTensor;
+              if (self.is_complex()) {
+                NSArray<MPSGraphTensor*>* complexNumberComponents = [mpsGraph splitTensor:newCachedGraph->inputTensor_
+                                                              numSplits: 2
+                                                              axis: 1
+                                                              name: nil];
+
+                MPSGraphTensor* realPartTensor = complexNumberComponents[0];
+                MPSGraphTensor* imaginaryPartTensor = complexNumberComponents[1];
+
+                MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
+                                                              shape:realPartTensor.shape
+                                                              dataType:realPartTensor.dataType];
+
+                MPSGraphTensor* complexZeroTensor = [mpsGraph constantWithScalar:0.0
+                                                              shape: newCachedGraph->inputTensor_.shape
+                                                              dataType:realPartTensor.dataType];                
+
+                MPSGraphTensor* isRealZero = [mpsGraph equalWithPrimaryTensor:realPartTensor
+                                                              secondaryTensor:zeroTensor
+                                                              name: nil];
+
+                MPSGraphTensor* isImaginaryZero = [mpsGraph equalWithPrimaryTensor:imaginaryPartTensor
+                                                              secondaryTensor:zeroTensor
                                                               name: nil];
 
-                MPSGraphTensor* complexAbsolute = [mpsGraph absoluteWithTensor:newCachedGraph->inputTensor_
+                MPSGraphTensor* isComplexZero = [mpsGraph logicalANDWithPrimaryTensor:isRealZero
+                                                              secondaryTensor:isImaginaryZero
                                                               name: nil];
 
-                MPSGraphTensor* complexSgn = [mpsGraph divisionWithPrimaryTensor:newCachedGraph->inputTensor_
-                                                              secondaryTensor:complexAbsolute
+                MPSGraphTensor* sgnDenomReal = [mpsGraph squareWithTensor:realPartTensor
                                                               name: nil];
 
-                MPSGraphIfThenElseBlock returnComplexZero = ^ConditionalOpFn(){ return @[zeroComplexTensor]; };
+                MPSGraphTensor* sgnDenomImaginary = [mpsGraph squareWithTensor:imaginaryPartTensor
+                                                              name: nil];
 
-                MPSGraphIfThenElseBlock returnComplexNonZeroSgn = ^ConditionalOpFn(){ return @[complexSgn]; };
+                MPSGraphTensor* sgnDenomSum = [mpsGraph additionWithPrimaryTensor:sgnDenomReal
+                                                              secondaryTensor:sgnDenomImaginary
+                                                              name: nil];
 
-                MPSGraphTensor* sgnTensor = [mpsGraph ifWithPredicateTensor:isComplexZero
-                                                              thenBlock:returnComplexZero
-                                                              elseBlock:returnComplexNonZeroSgn
-                                                              name:nil][0];
+                MPSGraphTensor* sgnDenom = [mpsGraph squareRootWithTensor:sgnDenomSum
+                                                              name: nil];
 
-                newCachedGraph->outputTensor_ = sgnTensor;
+                MPSGraphTensor* sgnRealTensor = [mpsGraph divisionWithPrimaryTensor:realPartTensor
+                                                              secondaryTensor:sgnDenom
+                                                              name: nil];
+
+                MPSGraphTensor* sgnImaginaryTensor = [mpsGraph divisionWithPrimaryTensor:imaginaryPartTensor
+                                                              secondaryTensor:sgnDenom
+                                                              name: nil];
+
+                MPSGraphTensor* sgnComplexTensor = [mpsGraph concatTensors:@[sgnRealTensor, sgnImaginaryTensor]
+                                                              dimension: 1
+                                                              name: nil];
+
+                sgnTensor = [mpsGraph selectWithPredicateTensor:isComplexZero
+                                                              truePredicateTensor:complexZeroTensor
+                                                              falsePredicateTensor:sgnComplexTensor
+                                                              name:nil];
               } else {
                 MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0
-                                                              shape:getMPSShape(self)
+                                                              shape:newCachedGraph->inputTensor_.shape
                                                               dataType:mps::getMPSDataType(self.scalar_type())];
 
                 MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1
-                                                              shape:getMPSShape(self)
+                                                              shape:newCachedGraph->inputTensor_.shape
                                                               dataType:mps::getMPSDataType(self.scalar_type())];
 
-                MPSGraphTensor* negativeOneTensor = [mpsGraph constantWithScalar:1
-                                                              shape:getMPSShape(self)
+                MPSGraphTensor* negativeOneTensor = [mpsGraph constantWithScalar:-1
+                                                              shape:newCachedGraph->inputTensor_.shape
                                                               dataType:mps::getMPSDataType(self.scalar_type())];
 
-                MPSGraphIfThenElseBlock casePositive = ^ConditionalOpFn(){ return @[oneTensor]; };
-
-                MPSGraphIfThenElseBlock caseNegative = ^ConditionalOpFn(){ return @[negativeOneTensor]; };
-
-                MPSGraphIfThenElseBlock caseZero = ^ConditionalOpFn(){ return @[zeroTensor]; };
-
                 MPSGraphTensor* isPositive = [mpsGraph greaterThanWithPrimaryTensor:newCachedGraph->inputTensor_
                                                               secondaryTensor:zeroTensor
                                                               name: nil];
@@ -349,28 +393,25 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
                                                               secondaryTensor:zeroTensor
                                                               name: nil];
 
-                MPSGraphIfThenElseBlock ifNotPositiveBlock = ^ConditionalOpFn(){
-                  return [mpsGraph ifWithPredicateTensor:isNegative
-                                                              thenBlock:caseNegative
-                                                              elseBlock:caseZero
+                MPSGraphTensor* notPositiveTensor = [mpsGraph selectWithPredicateTensor:isNegative
+                                                              truePredicateTensor:negativeOneTensor
+                                                              falsePredicateTensor:zeroTensor
                                                               name:nil];
-                };
 
-                MPSGraphTensor* sgnTensor = [mpsGraph ifWithPredicateTensor:isPositive
-                                                              thenBlock:casePositive
-                                                              elseBlock:ifNotPositiveBlock
-                                                              name:nil][0];
-
-                newCachedGraph->outputTensor_ = sgnTensor;
+                sgnTensor = [mpsGraph selectWithPredicateTensor:isPositive
+                                                              truePredicateTensor:oneTensor
+                                                              falsePredicateTensor:notPositiveTensor
+                                                              name:nil];
               }
+              newCachedGraph->outputTensor_ = sgnTensor;
           }
           return newCachedGraph;
         });
         cachedGraph = tmpCachedGraph->as<MPSUnaryCachedGraph>();
       }
 
-      Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-      Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+      Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, realInput);
+      Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, realOutput);
       NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
         selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
       };
@@ -379,6 +420,18 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
       };
       runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
     }
+
+    if (self.is_complex()) {
+      std::vector<long long> realSize = self.sizes().vec();
+      realSize.push_back(2);
+
+      Tensor originalShape = realOutput.reshape(realSize);
+      Tensor complexOutput = at::view_as_complex(originalShape);
+      output.copy_(complexOutput);
+    } else {
+      Tensor originalShape = at::reshape(realOutput, self.sizes());
+      output.copy_(originalShape);
+    }
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 83fcb5c6780b4..142686a87a584 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -731,14 +731,16 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst)
   if (!dst.has_storage()) {
     output = at::native::empty_mps(src.sizes(), src.scalar_type(), c10::nullopt, kMPS);
   }
-  ViewCachedGraph* cachedGraph = createViewGraph(src, dst, src.sizes(), src.strides(),
+  ViewCachedGraph* cachedGraph = createViewGraph(src.is_complex() ?  at::view_as_real(src) : src,
+                                                 dst, src.sizes(), src.strides(),
                                                  src.storage_offset(), /*needsScatter*/ false);
   return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false);
 }
 
 Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output, id<MTLBuffer> updatesBuffer)
 {
-  ViewCachedGraph* cachedGraph = createViewGraph(output, src, output.sizes(), output.strides(),
+  ViewCachedGraph* cachedGraph = createViewGraph(output.is_complex() ?  at::view_as_real(output) : output,
+                                                 src, output.sizes(), output.strides(),
                                                  output.storage_offset(), /*needsScatter*/ true);
   return runViewGraph(cachedGraph, src, output, /*needsScatter*/ true, updatesBuffer);
 }

From b6a32a94bd9ae4ced8fabe06cb2b4e93bd53d34f Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Thu, 8 Dec 2022 12:52:06 -0800
Subject: [PATCH 1840/1922] Reduce running time for cdist tests (#194)

---
 test/test_mps.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 9e2d7a73930d2..5298df4123765 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -350,16 +350,16 @@ def helper(dtype):
 
     def test_cdist_large(self, device="mps"):
         for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
-            x = torch.randn(1000, 10, device=device)
-            y = torch.randn(1000, 10, device=device)
+            x = torch.randn(100, 10, device=device)
+            y = torch.randn(100, 10, device=device)
             actual = torch.cdist(x, y, p=2, compute_mode=cm)
             expected = self._brute_cdist(x, y, p=2)
             self.assertEqual(expected, actual)
 
     def test_cdist_large_batch(self, device="mps"):
         for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
-            x = torch.randn(4, 3, 1000, 10, device=device)
-            y = torch.randn(4, 3, 1000, 10, device=device)
+            x = torch.randn(4, 3, 100, 10, device=device)
+            y = torch.randn(4, 3, 100, 10, device=device)
             actual = torch.cdist(x, y, p=2, compute_mode=cm)
             expected = self._brute_cdist(x, y, p=2)
             self.assertEqual(expected, actual)
@@ -460,10 +460,10 @@ def _brute_cdist(self, x, y, p=2):
         return torch.norm(x[..., None, :] - y[..., None, :, :], p=p, dim=-1)
 
     def test_cdist_norm(self, device="mps"):
-        for r1 in [3, 4, 5, 6]:
-            for m in [2, 3, 4, 10]:
-                for r2 in [4, 6, 7, 8]:
-                    for p in [0, 1, 2, 3, 1.5, 2.5, float('inf')]:
+        for r1 in [3, 4]:
+            for m in [2, 3]:
+                for r2 in [4, 6]:
+                    for p in [0, 1, 1.5, 2.5, float('inf')]:
                         x = torch.randn(r1, m, device=device)
                         y = torch.randn(r2, m, device=device)
                         if p == 2:
@@ -477,10 +477,10 @@ def test_cdist_norm(self, device="mps"):
                             self.assertEqual(expected, actual)
 
     def test_cdist_norm_batch(self, device="mps"):
-        for r1 in [3, 4, 5, 6]:
-            for m in [2, 3, 4, 10]:
-                for r2 in [4, 6, 7, 8]:
-                    for p in [0, 1, 2, 3, 1.5, 2.5, float('inf')]:
+        for r1 in [3, 4]:
+            for m in [2, 3]:
+                for r2 in [4, 6]:
+                    for p in [0, 3, 1.5, 2.5, float('inf')]:
                         x = torch.randn(2, 3, 6, r1, m, device=device)
                         y = torch.randn(2, 3, 6, r2, m, device=device)
                         if p == 2:

From 699ac43e4f9c0db4b0eb86005b880a13aaade0f7 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Thu, 8 Dec 2022 12:52:38 -0800
Subject: [PATCH 1841/1922] Clamp argmin / argmax min indicies values to 0
 (#193)

---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 7 ++++++-
 test/test_mps.py                                 | 8 ++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index a1f4449d18da5..53c4a52a526d0 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -1642,8 +1642,13 @@ Tensor min_mps(const Tensor& input_t) {
                                                           toType:MPSDataTypeInt64
                                                             name:@"castOutputTensor"];
 
+            MPSGraphTensor* outputClampedTensor = [mpsGraph clampWithTensor:outputTensor
+                                                             minValueTensor:[mpsGraph constantWithScalar:0 dataType:MPSDataTypeInt64]
+                                                             maxValueTensor:[mpsGraph constantWithScalar:LLONG_MAX dataType:MPSDataTypeInt64]
+                                                                       name: nil];
+
             newCachedGraph->inputTensor_ = inputTensor;
-            newCachedGraph->outputTensor_ = outputTensor;
+            newCachedGraph->outputTensor_ = outputClampedTensor;
           }
           return newCachedGraph;
         });
diff --git a/test/test_mps.py b/test/test_mps.py
index 5298df4123765..b9f8662046b40 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8087,8 +8087,8 @@ class TestConsistency(TestCase):
         '__ror__': ['b8', 'i16', 'i32', 'i64', 'u8'],
         '__rpow__': ['f16'],
         '__rxor__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'masked.argmax': ['i16', 'i64', 'u8'],
-        'masked.argmin': ['i16', 'i64', 'u8'],
+        'masked.argmax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.argmin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.log_softmax': ['f32'],
         'masked.logaddexp': ['f32'],
         'masked.logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8547,10 +8547,6 @@ class TestConsistency(TestCase):
         'masked.std': [torch.int32],
         'masked.var': [torch.int32],
 
-        # Failures due to inconsistency between CPU and GPU for `inf` case
-        'masked.argmax': ['f16', 'f32', 'i32'],
-        'masked.argmin': ['f16', 'f32', 'i32'],
-
         'as_strided_scatter': [torch.uint8],
         'atan2': [torch.int64],
         'bfloat16': None,

From cff41d8f2b8b676252f5a1544067985fdf23eb94 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 8 Dec 2022 19:59:07 -0500
Subject: [PATCH 1842/1922] Implement nan_to_num() for MPS backend (#196)

Added a test case, and also enabled it in TestConsistency
---
 .../native/mps/operations/TensorCompare.mm    | 102 +++++++++++++++++-
 aten/src/ATen/native/native_functions.yaml    |   1 +
 test/test_mps.py                              |   8 ++
 3 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm
index 44d19e99c2f62..ebbfa934abe25 100644
--- a/aten/src/ATen/native/mps/operations/TensorCompare.mm
+++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm
@@ -2,7 +2,7 @@
 
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/TensorCompare.h>
-#include <ATen/TensorUtils.h>
+#include <ATen/native/Resize.h>
 
 namespace at {
 namespace native {
@@ -416,5 +416,105 @@ Tensor where_mps(const Tensor& condition,
 
 }
 
+Tensor& nan_to_num_out_mps(const Tensor& self,
+                           c10::optional<double> nan,
+                           c10::optional<double> pos_inf,
+                           c10::optional<double> neg_inf,
+                           Tensor& result)
+{
+  TORCH_CHECK(self.scalar_type() == result.scalar_type(), "nan_to_num: dtype of out: ",
+              result.scalar_type(), " should be same as input: ", self.scalar_type());
+  if(result.numel() == 0) {
+    return result;
+  }
+  if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) {
+    at::native::resize_output(result, self.sizes());
+    result.copy_(self);
+    return result;
+  }
+  using namespace mps;
+  struct CachedGraph : public MPSCachedGraph {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor* selfTensor = nil;
+    MPSGraphTensor* outputTensor = nil;
+    MPSGraphTensor* nanReplacementTensor = nil;
+    MPSGraphTensor* posInfReplacementTensor = nil;
+    MPSGraphTensor* negInfReplacementTensor = nil;
+  };
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  @autoreleasepool {
+    string key = "nan_to_num" + getTensorsStringKey({self});
+    MPSDataType self_dtype = getMPSScalarType(self.scalar_type());
+
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
+    if (!cachedGraph) {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
+        CachedGraph *newCachedGraph = nil;
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          newCachedGraph->nanReplacementTensor    = mpsGraphRankedPlaceHolder(mpsGraph, self_dtype, @[@1]);
+          newCachedGraph->posInfReplacementTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_dtype, @[@1]);
+          newCachedGraph->negInfReplacementTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_dtype, @[@1]);
+
+          MPSGraphTensor* nanFreeTensor = [mpsGraph selectWithPredicateTensor: [mpsGraph isNaNWithTensor: newCachedGraph->selfTensor name:nil]
+                                                          truePredicateTensor: newCachedGraph->nanReplacementTensor
+                                                         falsePredicateTensor: newCachedGraph->selfTensor
+                                                                         name: nil];
+          MPSGraphTensor* subZeroTensor = [mpsGraph lessThanWithPrimaryTensor: nanFreeTensor
+                                                              secondaryTensor: [mpsGraph constantWithScalar: 0.0 dataType: self_dtype]
+                                                                         name: nil];
+          // the cast is a workaround for the issue #103149520 (crash when bool and fp16 passed to binary ops)
+          MPSGraphTensor* isNegInfTensor = [mpsGraph logicalANDWithPrimaryTensor: [mpsGraph castTensor: subZeroTensor toType: self_dtype name: nil]
+                                                                 secondaryTensor: [mpsGraph isInfiniteWithTensor: nanFreeTensor name:nil]
+                                                                            name: nil];
+          MPSGraphTensor* negInfFreeTensor = [mpsGraph selectWithPredicateTensor: isNegInfTensor
+                                                             truePredicateTensor: newCachedGraph->negInfReplacementTensor
+                                                            falsePredicateTensor: nanFreeTensor
+                                                                            name: nil];
+          newCachedGraph->outputTensor = [mpsGraph selectWithPredicateTensor: [mpsGraph isInfiniteWithTensor: negInfFreeTensor name:nil]
+                                                         truePredicateTensor: newCachedGraph->posInfReplacementTensor
+                                                        falsePredicateTensor: negInfFreeTensor
+                                                                        name: nil];
+        }
+        return newCachedGraph;
+      });
+    }
+    MPSScalar nanReplacementScalar, posInfReplacementScalar, negInfReplacementScalar;
+    AT_DISPATCH_FLOATING_TYPES_AND(kHalf, self.scalar_type(), "nan_to_num_mps", [&]() {
+        scalar_t nan_replacement = static_cast<scalar_t>(nan.value_or(0.));
+        scalar_t pos_inf_replacement = pos_inf.has_value() ?
+                                       static_cast<scalar_t>(pos_inf.value()) :
+                                       std::numeric_limits<scalar_t>::max();
+        scalar_t neg_inf_replacement = neg_inf.has_value() ?
+                                       static_cast<scalar_t>(neg_inf.value()) :
+                                       std::numeric_limits<scalar_t>::lowest();
+
+        nanReplacementScalar    = getMPSScalar(nan_replacement, self.scalar_type());
+        posInfReplacementScalar = getMPSScalar(pos_inf_replacement, self.scalar_type());
+        negInfReplacementScalar = getMPSScalar(neg_inf_replacement, self.scalar_type());
+    });
+
+    MPSStream* stream = getCurrentMPSStream();
+    Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor, self);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, result);
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      selfPlaceholder.getMPSGraphTensor()  : selfPlaceholder.getMPSGraphTensorData(),
+      cachedGraph->nanReplacementTensor    : getMPSGraphTensorFromScalar(stream, nanReplacementScalar),
+      cachedGraph->posInfReplacementTensor : getMPSGraphTensorFromScalar(stream, posInfReplacementScalar),
+      cachedGraph->negInfReplacementTensor : getMPSGraphTensorFromScalar(stream, negInfReplacementScalar),
+    };
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+  return result;
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 5b15daf38ddfb..b4fea233d1970 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -3095,6 +3095,7 @@
 - func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU, CUDA: nan_to_num_out
+    MPS: nan_to_num_out_mps
     SparseCPU, SparseCUDA: nan_to_num_sparse_out
   tags: pointwise
 
diff --git a/test/test_mps.py b/test/test_mps.py
index b9f8662046b40..587c7b64de1af 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -5480,6 +5480,13 @@ def helper(shape, alpha):
         helper((2, 8, 3, 5), 0.1)
         helper((2, 8, 3, 5), 0.2)
 
+    def test_nan_to_num(self):
+        inputCPU = torch.tensor([float('nan'), float('inf'), -float('inf'), 3.14])
+        inputMPS = inputCPU.detach().clone().to('mps').requires_grad_()
+        outputCPU = torch.nan_to_num(inputCPU, nan=2.0, posinf=1.0, neginf=-1.0)
+        outputMPS = torch.nan_to_num(inputMPS, nan=2.0, posinf=1.0, neginf=-1.0)
+        self.assertEqual(outputMPS, outputCPU)
+
     # Test where
     def test_where(self):
         def helper(shape, x_shape, y_shape, cond_dtype=torch.bool, x_dtype=torch.float):
@@ -8223,6 +8230,7 @@ class TestConsistency(TestCase):
         'mm': ['f32'],
         'mv': ['f32'],
         'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nan_to_num': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'native_layer_norm': ['f32'],
         'ne': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],

From 7f42100af2d1559b6df362035290b2d9dd989256 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Fri, 9 Dec 2022 16:16:32 -0800
Subject: [PATCH 1843/1922] Build fixes.

---
 aten/src/ATen/native/UpSample.h                      |  2 +-
 aten/src/ATen/native/mps/MPSGraphVenturaOps.h        | 11 +++++++++++
 aten/src/ATen/native/mps/operations/ReduceOps.mm     |  5 +----
 aten/src/ATen/native/mps/operations/TensorCompare.mm |  2 +-
 aten/src/ATen/native/mps/operations/UpSample.mm      |  1 +
 aten/src/ATen/native/mps/operations/View.mm          |  7 ++++---
 6 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h
index d4e8112229c91..92ee7252d1bd6 100644
--- a/aten/src/ATen/native/UpSample.h
+++ b/aten/src/ATen/native/UpSample.h
@@ -56,7 +56,7 @@ TORCH_API c10::SmallVector<int64_t, 3> compute_output_size(
 
 inline c10::optional<double> get_scale_value(c10::optional<c10::ArrayRef<double>> scales, int idx) {
   if (!scales) {
-    return nullopt;
+    return c10::nullopt;
   }
   return scales->at(idx);
 }
diff --git a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
index b77db66795cf4..b260afce5e410 100644
--- a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
+++ b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
@@ -3,6 +3,17 @@
 
 // TODO: Remove me when moved to MacOS 13
 @interface MPSGraph (VenturaOps)
+API_AVAILABLE(macos(13.0))
+typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
+{
+    MPSGraphResizeNearestRoundingModeRoundPreferCeil   =  0L,
+    MPSGraphResizeNearestRoundingModeRoundPreferFloor  =  1L,
+    MPSGraphResizeNearestRoundingModeCeil              =  2L,
+    MPSGraphResizeNearestRoundingModeFloor             =  3L,
+    MPSGraphResizeNearestRoundingModeRoundToEven       =  4L,
+    MPSGraphResizeNearestRoundingModeRoundToOdd        =  5L,
+};
+
 - (MPSGraphTensor *)cumulativeSumWithTensor:(MPSGraphTensor *)tensor
                                        axis:(NSInteger)axis
                                        name:(NSString *)name;
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 53c4a52a526d0..fa3b54f0fcd09 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -329,9 +329,6 @@ Tensor trace_mps_out(const Tensor& self) {
     bool keepdim,
     c10::optional<ScalarType> dtype,
     const Tensor& output_t) {
-
-    int64_t dims[1] = {dim};
-
   int64_t dims[1] = {dim};
   reduction_out_mps(input_t, IntArrayRef(dims, 1), keepdim, dtype, output_t, MPSReductionType::PROD, "prod_out_mps");
 }
@@ -432,7 +429,7 @@ void impl_func_norm_mps(
   const OptionalScalarRef& opt_p,
   IntArrayRef dim,
   bool keepdim,
-  optional<ScalarType> opt_dtype,
+  c10::optional<ScalarType> opt_dtype,
   const Tensor& output_t,
   bool cdist = false,
   c10::optional<IntArrayRef> input_broadcasted_shape = c10::nullopt,
diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm
index ebbfa934abe25..3bbccb4d2e85d 100644
--- a/aten/src/ATen/native/mps/operations/TensorCompare.mm
+++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm
@@ -468,7 +468,7 @@ Tensor where_mps(const Tensor& condition,
                                                               secondaryTensor: [mpsGraph constantWithScalar: 0.0 dataType: self_dtype]
                                                                          name: nil];
           // the cast is a workaround for the issue #103149520 (crash when bool and fp16 passed to binary ops)
-          MPSGraphTensor* isNegInfTensor = [mpsGraph logicalANDWithPrimaryTensor: [mpsGraph castTensor: subZeroTensor toType: self_dtype name: nil]
+          MPSGraphTensor* isNegInfTensor = [mpsGraph logicalANDWithPrimaryTensor: [mpsGraph castTensor: subZeroTensor toType: self_dtype name: @"castTensor"]
                                                                  secondaryTensor: [mpsGraph isInfiniteWithTensor: nanFreeTensor name:nil]
                                                                             name: nil];
           MPSGraphTensor* negInfFreeTensor = [mpsGraph selectWithPredicateTensor: isNegInfTensor
diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm
index 487fd88d6df9f..7d8a6a498d030 100644
--- a/aten/src/ATen/native/mps/operations/UpSample.mm
+++ b/aten/src/ATen/native/mps/operations/UpSample.mm
@@ -1,6 +1,7 @@
 //  Copyright © 2022 Apple Inc.
 
 #include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
 #include <ATen/native/UpSample.h>
 
 namespace at {
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 142686a87a584..6b6727e141ec1 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -483,9 +483,9 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
     // E.g: x = torch.randn((3,6))[1, 1:3]
     int nextSliceOffset = src.storage_offset() % view_numel;
 
-    [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(sliceOffset), src.sizes()[firstDimToSlice]}];
+    [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
     if (nextSliceOffset) {
-      [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 2 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(nextSliceOffset), src.sizes()[firstDimToSlice+1]}];
+      [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 2 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(nextSliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice+1])}];
     }
   }
   else {
@@ -522,7 +522,8 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
     // starting point from where the slice should start
     int sliceOffset = src_ndim_view == 1 ? 1 : dim0;
     int view_numel = src_ndim_view == 1 ? 1 : src_view_numel;
-    [srcTensorNDArrayDesc sliceDimension:finalShapeSize - 1 withSubrange:{static_cast<NSUInteger>((src.storage_offset() / view_numel) * sliceOffset), totalSlices}];
+    [srcTensorNDArrayDesc sliceDimension:finalShapeSize - 1
+    withSubrange:{static_cast<NSUInteger>((src.storage_offset() / view_numel) * sliceOffset), static_cast<NSUInteger>(totalSlices)}];
   }
 
   srcTensorNDArrayView = [srcTensorNDArray arrayViewWithCommandBuffer:commandBuffer

From 0fa37fc34ece75060047bfe42ba2071c0ab6dd67 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 9 Dec 2022 12:56:58 -0500
Subject: [PATCH 1844/1922] Fix correctness issues with Pooling ops (#187)

* Fix correctness issues with Pooling ops
- Workaround for MaxPool when ceilMode=true
- Workaround for ChannelsLast memory format
- Workaround for divisor_override in AvgPool ops
- Enabled count_include_pad parameter for AvgPool
- Refactoring and clean up of duplicate code

* Remove the divisor from the cached key

* Pass positive values to transposeTensor() in convertNHWCtoNCHW()
---
 aten/src/ATen/native/mps/OperationUtils.h     |    7 +-
 aten/src/ATen/native/mps/OperationUtils.mm    |   23 +-
 .../src/ATen/native/mps/operations/Pooling.mm | 1060 +++++------------
 test/test_mps.py                              |   19 +-
 4 files changed, 299 insertions(+), 810 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 09f42a46bfaa5..458a34d0af90f 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -55,9 +55,9 @@ Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output, id<MTLBuffe
 bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape);
 MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType);
 
-MPSShape* getMPSShape(const Tensor& t);
-MPSShape* getMPSShape(IntArrayRef sizes);
-MPSShape* getMPSShape(c10::MaybeOwned<Tensor> t);
+// The MPSShape could vary based on memory format
+MPSShape* getMPSShape(const Tensor& t, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
+MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
 
 static inline id<MTLBuffer> getMTLBufferStorage(const at::Tensor& tensor) {
   return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
@@ -86,6 +86,7 @@ class Placeholder {
 
 void resize_tensor(Tensor* output);
 MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor);
+MPSGraphTensor* convertNHWCtoNCHW(MPSGraph *mpsGraph, MPSGraphTensor* tensor);
 MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, ScalarType toType);
 MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const Tensor& tensor);
 MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar);
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index a0ef49fbaf408..aa1344ffaf795 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -152,7 +152,7 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) {
 
 std::string getTensorsStringKey(const TensorList& tensors, bool use_scalar_value) {
     std::string str;
-    // The key format per tensor would look like ":MPSDataTypeFloat32[1,1,1,10]:"
+    // The key format per tensor would look like ":Float32[1,1,1,10]:"
     for (const Tensor& tensor: tensors) {
       str += ":";
       if (tensor.defined()) {
@@ -172,16 +172,15 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) {
     return str;
 }
 
-MPSShape* getMPSShape(const Tensor& t) {
-  return getMPSShape(t.sizes());
+MPSShape* getMPSShape(const Tensor& t, c10::MemoryFormat memory_format) {
+  return getMPSShape(t.sizes(), memory_format);
 }
 
-MPSShape* getMPSShape(c10::MaybeOwned<Tensor> t) {
-  const Tensor& t_ = *t;
-  return getMPSShape(t_);
-}
-
-MPSShape* getMPSShape(IntArrayRef sizes) {
+MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format) {
+  if (memory_format == MemoryFormat::ChannelsLast) {
+    TORCH_INTERNAL_ASSERT(sizes.size() == 4, "ChannelsLast memory format must have 4 dimensions!");
+    return @[@(sizes[0]), @(sizes[2]), @(sizes[3]), @(sizes[1])];
+  }
   const int sz = sizes.size();
   const int sz_ = (sz > 0) ? sz : 1;
 
@@ -370,6 +369,12 @@ void resize_tensor(Tensor* output) {
   return [mpsGraph castTensor:tensor toType:getMPSScalarType(toType) name:@"castTensor"];
 }
 
+MPSGraphTensor* convertNHWCtoNCHW(MPSGraph *mpsGraph, MPSGraphTensor* tensor) {
+  TORCH_INTERNAL_ASSERT(tensor.shape.count == 4, "Tensor must have 4 dimensions!");
+  return [mpsGraph transposeTensor:[mpsGraph transposeTensor:tensor dimension:3 withDimension:2 name:nil]
+                         dimension:2 withDimension:1 name: nil];
+}
+
 string get_mem_format_string(c10::MemoryFormat memory_format) {
   string mem_format_key;
   switch(memory_format) {
diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm
index 1df24e073239e..97909840d514b 100644
--- a/aten/src/ATen/native/mps/operations/Pooling.mm
+++ b/aten/src/ATen/native/mps/operations/Pooling.mm
@@ -1,867 +1,345 @@
 //  Copyright © 2022 Apple Inc.
 
-#include <ATen/ATen.h>
-#include <ATen/Tensor.h>
-#include <ATen/Utils.h>
-#include <ATen/TensorUtils.h>
-#include <ATen/mps/MPSStream.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/Pool.h>
-#include <torch/library.h>
 
 namespace at {
 namespace native {
+namespace mps {
+
+struct PoolingCachedGraph : public MPSCachedGraph
+{
+  PoolingCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+  MPSGraphTensor* inputTensor = nil;
+  MPSGraphTensor* outputTensor = nil;
+  MPSGraphTensor* indicesTensor = nil;
+  MPSGraphTensor* gradOutputTensor = nil;
+  MPSGraphTensor* divisorTensor = nil;
+};
+
+typedef MPSGraphTensor* (^PoolingOpBlock)(PoolingCachedGraph&, MPSGraphPooling2DOpDescriptor*);
+#define PoolingOpFn(graph, desc) MPSGraphTensor* (mps::PoolingCachedGraph& graph, MPSGraphPooling2DOpDescriptor* desc)
+
+// Pooling ops (1D/2D forward and backward Max and Average pooling)
+static void pool2d_template(const Tensor& input, const Tensor& output,
+                            const c10::optional<Tensor>& indices_opt,
+                            const c10::optional<Tensor>& grad_output_opt,
+                            IntArrayRef kernel_size, IntArrayRef stride,
+                            IntArrayRef padding, IntArrayRef dilation,
+                            bool ceil_mode, const c10::optional<float> divisor,
+                            PoolingOpBlock poolingBlock, const c10::string& op_name)
+{
+  if (input.numel() == 0)
+    return;
 
-// Create pooling descriptor
-void fill_pool_desc(MPSGraphPooling2DOpDescriptor* desc,
-                    NSUInteger kW, NSUInteger kH,
-                    NSUInteger dW, NSUInteger dH,
-                    NSUInteger dilationW, NSUInteger dilationH,
-                    NSUInteger padW, NSUInteger padH,
-                    bool ceil_mode, c10::MemoryFormat memory_format) {
-  desc.kernelWidth = kW;
-  desc.kernelHeight = kH;
-  desc.strideInX = dW;
-  desc.strideInY = dH;
-  desc.dilationRateInX = dilationW;
-  desc.dilationRateInY = dilationH;
-  desc.paddingLeft = padW;
-  desc.paddingRight = padW;
-  desc.paddingTop = padH;
-  desc.paddingBottom = padH;
-  desc.ceilMode = ceil_mode;
-  desc.paddingStyle = MPSGraphPaddingStyleExplicit;
-  switch(memory_format) {
-    case at::MemoryFormat::Contiguous:
-      desc.dataLayout = MPSGraphTensorNamedDataLayoutNCHW;
-      break;
-    case at::MemoryFormat::ChannelsLast:
-      desc.dataLayout = MPSGraphTensorNamedDataLayoutNHWC;
-      break;
-    default:
-      assert(0 && "Check should have been done earlier\n");
+  const int64_t ndims = input.ndimension();
+  const Tensor& grad_output = *(at::borrow_from_optional_tensor(grad_output_opt));
+  const Tensor& indices = *(at::borrow_from_optional_tensor(indices_opt));
+  const bool is_backward_pass = grad_output.defined();
+  const bool has_indices = indices.defined();
+  const bool has_divisor = divisor.has_value();
+  const auto suggested_memory_format = input.suggest_memory_format();
+  // for max_pool2d_with_indices() we cannot pass ChannelsLast (i.e., NHWC) to 'desc.dataLayout' in MPSGraph.
+  // Because the returned indices will be selected based on NHWC memory layout which will
+  // be incompatible with the PyTorch's global NCHW layout.
+  const auto memory_format = has_indices ? MemoryFormat::Contiguous : suggested_memory_format;
+
+  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2, op_name,
+    ": kernel_size must either be a single int, or a tuple of two ints")
+  TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2, op_name,
+    ": stride must either be omitted, a single int, or a tuple of two ints")
+  TORCH_CHECK(padding.size() == 1 || padding.size() == 2, op_name,
+    ": padding must be either be a single int, or a tuple of two ints");
+  TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2, op_name,
+    ": dilation must be either a single int, or a tuple of two ints");
+
+  if (suggested_memory_format == at::MemoryFormat::ChannelsLast) {
+    TORCH_CHECK(ndims == 4, "non-empty 4D (batch mode) tensor expected for input with channels_last layout");
+  } else if (suggested_memory_format == at::MemoryFormat::Contiguous) {
+    TORCH_CHECK((ndims == 3 || ndims == 4), "non-empty 3D or 4D (batch mode) tensor expected for input");
+  } else {
+    TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
   }
-}
-
-Tensor _mps_max_pool2d(
-    const Tensor& input_t,
-    IntArrayRef kernel_size,
-    IntArrayRef stride,
-    IntArrayRef padding,
-    IntArrayRef dilation,
-    bool ceil_mode) {
 
-  // #20866, #22032: Guarantee this for the official C++ API?
-  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
-    "max_pool2d: kernel_size must either be a single int, or a tuple of two ints")
   const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
   const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
-
-  // NB: stride default is not expressible as an integer constant, so we accept
-  // empty stride for this case
-  TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2,
-    "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints")
   const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
-  const int dW = stride.empty() ? kW :
-                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
-
-  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
-    "max_pool2d: padding must be either be a single int, or a tuple of two ints");
+  const int dW = stride.empty() ? kW : stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
   const int padH = safe_downcast<int, int64_t>(padding[0]);
   const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
-
-  TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
-    "max_pool2d: dilation must be either a single int, or a tuple of two ints");
   const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
   const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
-
-  const auto memory_format = input_t.suggest_memory_format();
-  if (memory_format == at::MemoryFormat::ChannelsLast) {
-    TORCH_CHECK(input_t.ndimension() == 4,
-      "non-empty 4D (batch mode) tensor expected for input with channels_last layout");
-  } else if (memory_format == at::MemoryFormat::Contiguous) {
-    TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4),
-      "non-empty 3D or 4D (batch mode) tensor expected for input");
-  } else {
-    TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
-  }
-
-  /* sizes */
-  const int64_t nbatch = input_t.ndimension() == 4 ? input_t.size(-4) : 1;
-  const int64_t nInputPlane = input_t.size(-3);
-  const int64_t inputHeight = input_t.size(-2);
-  const int64_t inputWidth = input_t.size(-1);
-
+  const int64_t nbatch = ndims == 4 ? input.size(-4) : 1;
+  const int64_t nInputPlane = input.size(-3);
+  const int64_t inputHeight = input.size(-2);
+  const int64_t inputWidth = input.size(-1);
   const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, ceil_mode);
   const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, ceil_mode);
 
-  pool2d_shape_check(
-    input_t,
-    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-    nInputPlane,
-    inputHeight, inputWidth,
-    outputHeight, outputWidth, memory_format);
-
-  namespace native_mps = at::native::mps;
-  using CachedGraph = native_mps::MPSUnaryCachedGraph;
-
-  native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
-
-  Tensor output_t;
+  pool2d_shape_check(input, kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+                     nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format);
 
-  if (input_t.ndimension() == 3) {
-    output_t = at::native::empty_mps(
-                  {nInputPlane, outputHeight, outputWidth},
-                  input_t.scalar_type(),
-                  c10::nullopt,
-                  kMPS,
-                  c10::nullopt,
-                  memory_format);
-  } else {
-    output_t = at::native::empty_mps(
-                  {nbatch, nInputPlane, outputHeight, outputWidth},
-                  input_t.scalar_type(),
-                  c10::nullopt,
-                  kMPS,
-                  c10::nullopt,
-                  memory_format);
+  // the output and indices are 'empty', so we could avoid unnecessary gatherView on empty tensors
+  // by simply restriding them (instead of calling the costly Contiguous()).
+  if (indices.suggest_memory_format() == MemoryFormat::ChannelsLast) {
+    indices.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
   }
-
-  if (output_t.numel() == 0) {
-    return output_t;
+  if (output.numel() == 0) {
+    std::vector<int64_t> outputSizes {nInputPlane, outputHeight, outputWidth};
+    if (ndims == 4) {
+      outputSizes.insert(outputSizes.begin(), nbatch);
+    }
+    output.resize_(outputSizes);
+  } else if (output.suggest_memory_format() == MemoryFormat::ChannelsLast) {
+    output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
   }
 
-  auto stream = at::mps::getCurrentMPSStream();
+  if (output.numel() == 0 || (is_backward_pass && grad_output.numel() == 0)) {
+    return;
+  }
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
   @autoreleasepool {
+    string key = op_name + getTensorsStringKey({input, indices, grad_output}) + ":K[" +
+                 getArrayRefString(kernel_size) + "]:S[" + getArrayRefString(stride) + "]:P[" +
+                 getArrayRefString(padding) + "]:D[" + getArrayRefString(dilation) + "]" +
+                 (ceil_mode ? ":ceil" : "") + ":" + (suggested_memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW");
 
-    string mem_format_key;
-    switch(memory_format) {
-      case at::MemoryFormat::Contiguous:
-        mem_format_key = "Contiguous";
-        break;
-      case at::MemoryFormat::ChannelsLast:
-        mem_format_key = "ChannelsLast";
-        break;
-      default:
-        assert(0 && "Check should have been done earlier\n");
-    }
-
-    string key = "mps_max_pool2d:" + to_string(kW) + ":" + to_string(kH) + ":" +
-                                     to_string(dW) + ":" + to_string(dH) + ":" +
-                                     to_string(dilationW) + ":" + to_string(dilationH) + ":" +
-                                     to_string(padW) + ":" + to_string(padH) + ":" +
-                                     to_string(ceil_mode) + ":" + mem_format_key +
-                                     mps::getTensorsStringKey({input_t});
-    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
+    MPSShape* inputShape = getMPSShape(input, memory_format);
+    MPSShape* gradOutputShape = is_backward_pass ? getMPSShape(grad_output, memory_format) : nullptr;
+    PoolingCachedGraph* cachedGraph = cache_->LookUpAs<PoolingCachedGraph>(key);
 
     if(!cachedGraph) {
-      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
-        CachedGraph *newCachedGraph = nil;
+      cachedGraph = cache_->CreateCachedGraphAs<PoolingCachedGraph>(key, ^ MPSCachedGraph * () {
+        PoolingCachedGraph *newCachedGraph = nil;
 
         @autoreleasepool {
-          MPSGraph* mpsGraph = native_mps::make_mps_graph();
-          newCachedGraph = new CachedGraph(mpsGraph);
-
-          MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease];
-          fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format);
-
-          MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
-          MPSGraphTensor* outputTensor = [mpsGraph maxPooling2DWithSourceTensor:inputTensor
-                                                                     descriptor:desc
-                                                                           name:nil];
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->outputTensor_ = outputTensor;
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new PoolingCachedGraph(mpsGraph);
+
+          MPSGraphPooling2DOpDescriptor* desc = [MPSGraphPooling2DOpDescriptor
+                                                descriptorWithKernelWidth: kW
+                                                             kernelHeight: kH
+                                                                strideInX: dW
+                                                                strideInY: dH
+                                                          dilationRateInX: dilationW
+                                                          dilationRateInY: dilationH
+                                                              paddingLeft: padW
+                                                             paddingRight: ceil_mode ? padW * dW : padW
+                                                               paddingTop: padH
+                                                            paddingBottom: ceil_mode ? padH * dH : padH
+                                                             paddingStyle: MPSGraphPaddingStyleExplicit
+                                                               dataLayout: memory_format == MemoryFormat::ChannelsLast ?
+                                                                           MPSGraphTensorNamedDataLayoutNHWC :
+                                                                           MPSGraphTensorNamedDataLayoutNCHW];
+          desc.ceilMode = (padW == 0 && padH == 0) ? ceil_mode : false;
+          if (has_indices) {
+            desc.returnIndicesMode = MPSGraphPoolingReturnIndicesGlobalFlatten2D;
+            desc.returnIndicesDataType = MPSDataTypeInt32;
+          }
+          newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(input.scalar_type()), inputShape);
+          if (is_backward_pass) {
+            newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(grad_output.scalar_type()), gradOutputShape);
+          }
+          if (has_divisor) {
+            newCachedGraph->divisorTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(output.scalar_type()), @[@1]);
+          }
+          MPSGraphTensor* outputTensor = poolingBlock(*newCachedGraph, desc);
+          // with desc.dataLayout = NHWC (i.e., ChannelsLast), the results need to be converted back to NCHW
+          newCachedGraph->outputTensor = memory_format == MemoryFormat::ChannelsLast ?
+                                         convertNHWCtoNCHW(mpsGraph, outputTensor) : outputTensor;
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
-    auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t);
+    MPSStream* mpsStream = getCurrentMPSStream();
+    // in case of ChannelsLast we don't perform gather() in placeholder to avoid implicit conversion to NCHW
+    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input, inputShape, memory_format != MemoryFormat::ChannelsLast);
+    Placeholder gradOutputPlaceholder = !is_backward_pass ? Placeholder() :
+                                        Placeholder(cachedGraph->gradOutputTensor, grad_output,
+                                                    gradOutputShape, memory_format != MemoryFormat::ChannelsLast);
+    Placeholder indicesPlaceholder = has_indices ? Placeholder(cachedGraph->indicesTensor, indices) : Placeholder();
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
+    NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease];
+    NSMutableDictionary *results = [[NSMutableDictionary new] autorelease];
+
+    feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
+    results[outputPlaceholder.getMPSGraphTensor()] = outputPlaceholder.getMPSGraphTensorData();
+
+    if (cachedGraph->gradOutputTensor) {
+      feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData();
+    }
+    if (cachedGraph->indicesTensor) {
+      if (is_backward_pass) {
+        feeds[indicesPlaceholder.getMPSGraphTensor()] = indicesPlaceholder.getMPSGraphTensorData();
+      } else {
+        results[indicesPlaceholder.getMPSGraphTensor()] = indicesPlaceholder.getMPSGraphTensorData();
+      }
+    }
+    MPSScalar divisor_scalar;
+    if (cachedGraph->divisorTensor) {
+      divisor_scalar = getMPSScalar(divisor.value(), output.scalar_type());
+      feeds[cachedGraph->divisorTensor] = getMPSGraphTensorFromScalar(mpsStream, divisor_scalar);
+    }
 
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
-      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-    };
+    runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results);
+  }
+}
 
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
-      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-    };
+} // namespace mps
 
-    native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-  }
+Tensor _mps_max_pool2d(
+    const Tensor& input,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    bool ceil_mode)
+{
+  Tensor output = at::empty({0}, input.options(), MemoryFormat::Contiguous);
+  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
+    MPSGraph* mpsGraph = cachedGraph.graph();
+    return [mpsGraph maxPooling2DWithSourceTensor: cachedGraph.inputTensor
+                                       descriptor: desc
+                                             name: nil];
+  };
+  mps::pool2d_template(input, output, c10::nullopt, c10::nullopt, kernel_size, stride,
+                       padding, dilation, ceil_mode, c10::nullopt, pooling_op_block, "max_pool2d");
 
-  return output_t;
+  return output;
 }
 
 Tensor mps_max_pool2d_backward(
     const Tensor& grad_output,
-    const Tensor& input_t,
+    const Tensor& input,
     IntArrayRef kernel_size,
     IntArrayRef stride,
     IntArrayRef padding,
     IntArrayRef dilation,
-    bool ceil_mode) {
-
-  // #20866, #22032: Guarantee this for the official C++ API?
-  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
-    "max_pool2d: kernel_size must either be a single int, or a tuple of two ints")
-  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
-  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
-
-  // NB: stride default is not expressible as an integer constant, so we accept
-  // empty stride for this case
-  TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2,
-    "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints")
-  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
-  const int dW = stride.empty() ? kW :
-                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
-
-  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
-    "max_pool2d: padding must be either be a single int, or a tuple of two ints");
-  const int padH = safe_downcast<int, int64_t>(padding[0]);
-  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
-
-  TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
-    "max_pool2d: dilation must be either a single int, or a tuple of two ints");
-  const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
-  const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
-
-  const auto memory_format = input_t.suggest_memory_format();
-  if (memory_format == at::MemoryFormat::ChannelsLast) {
-    TORCH_CHECK(input_t.ndimension() == 4,
-      "non-empty 4D (batch mode) tensor expected for input with channels_last layout");
-  } else if (memory_format == at::MemoryFormat::Contiguous) {
-    TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4),
-      "non-empty 3D or 4D (batch mode) tensor expected for input");
-  } else {
-    TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
-  }
-
-  namespace native_mps = at::native::mps;
-
-  // Derive from MPSCachedGraph
-  struct CachedGraph : public native_mps::MPSCachedGraph
-  {
-    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-    MPSGraphTensor *inputTensor_ = nil;
-    MPSGraphTensor *gradOutputTensor_ = nil;
-    MPSGraphTensor *gradInputTensor_ = nil;
+    bool ceil_mode)
+{
+  Tensor grad_input = at::empty(input.sizes(), input.options(), MemoryFormat::Contiguous);
+  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
+    MPSGraph* mpsGraph = cachedGraph.graph();
+    return [mpsGraph maxPooling2DGradientWithGradientTensor: cachedGraph.gradOutputTensor
+                                               sourceTensor: cachedGraph.inputTensor
+                                                 descriptor: desc
+                                                       name: nil];
   };
-
-  native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
-
-  Tensor grad_input;
-  grad_input = at::native::empty_mps(
-                input_t.sizes(),
-                input_t.scalar_type(),
-                c10::nullopt,
-                kMPS,
-                c10::nullopt,
-                memory_format);
-
-  if (grad_input.numel() == 0) {
-    return grad_input;
-  }
-
-  auto stream = at::mps::getCurrentMPSStream();
-
-  @autoreleasepool {
-
-    string mem_format_key;
-    switch(memory_format) {
-      case at::MemoryFormat::Contiguous:
-        mem_format_key = "Contiguous";
-        break;
-      case at::MemoryFormat::ChannelsLast:
-        mem_format_key = "ChannelsLast";
-        break;
-      default:
-        assert(0 && "Check should have been done earlier\n");
-    }
-
-    string key = "mps_max_pool2d_backward:" + to_string(kW) + ":" + to_string(kH) + ":" +
-                                              to_string(dW) + ":" + to_string(dH) + ":" +
-                                              to_string(dilationW) + ":" + to_string(dilationH) + ":" +
-                                              to_string(padW) + ":" + to_string(padH) + ":" +
-                                              to_string(ceil_mode) + ":" + mem_format_key +
-                                              mps::getTensorsStringKey({input_t, grad_output});
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-
-    if(!cachedGraph) {
-      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
-
-        CachedGraph *newCachedGraph = nil;
-        @autoreleasepool {
-          MPSGraph* mpsGraph = native_mps::make_mps_graph();
-          newCachedGraph = new CachedGraph(mpsGraph);
-
-          MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease];
-          fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format);
-
-          MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
-          MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
-          MPSGraphTensor* gradInputTensor = [mpsGraph maxPooling2DGradientWithGradientTensor:gradOutputTensor
-                                                                                sourceTensor:inputTensor
-                                                                                  descriptor:desc
-                                                                                        name:nil];
-
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->gradOutputTensor_ = gradOutputTensor;
-          newCachedGraph->gradInputTensor_ = gradInputTensor;
-        }
-        return newCachedGraph;
-      });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
-    }
-
-    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
-    auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output);
-    auto gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, grad_input);
-
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
-      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData()
-    };
-
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
-      gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
-    };
-
-    native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-  }
+  mps::pool2d_template(input, grad_input, c10::nullopt, grad_output, kernel_size, stride,
+                       padding, dilation, ceil_mode, c10::nullopt, pooling_op_block, "max_pool2d_backward");
 
   return grad_input;
 }
 
 TORCH_IMPL_FUNC(max_pool2d_with_indices_out_mps)(
-    const Tensor& input_t,
+    const Tensor& input,
     IntArrayRef kernel_size,
     IntArrayRef stride,
     IntArrayRef padding,
     IntArrayRef dilation,
     bool ceil_mode,
-    const Tensor& output_t,
-    const Tensor& indices) {
-
-  // #20866, #22032: Guarantee this for the official C++ API?
-  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
-    "max_pool2d: kernel_size must either be a single int, or a tuple of two ints")
-  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
-  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
-
-  // NB: stride default is not expressible as an integer constant, so we accept
-  // empty stride for this case
-  TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2,
-    "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints")
-  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
-  const int dW = stride.empty() ? kW :
-                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
-
-  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
-    "max_pool2d: padding must be either be a single int, or a tuple of two ints");
-  const int padH = safe_downcast<int, int64_t>(padding[0]);
-  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
-
-  TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
-    "max_pool2d: dilation must be either a single int, or a tuple of two ints");
-  const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
-  const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
-
-  const auto memory_format = input_t.suggest_memory_format();
-  if (memory_format == at::MemoryFormat::ChannelsLast) {
-    TORCH_CHECK(input_t.ndimension() == 4,
-      "non-empty 4D (batch mode) tensor expected for input with channels_last layout");
-  } else if (memory_format == at::MemoryFormat::Contiguous) {
-    TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4),
-      "non-empty 3D or 4D (batch mode) tensor expected for input");
-  } else {
-    TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
-  }
-
-  /* sizes */
-  const int64_t nInputPlane = input_t.size(-3);
-  const int64_t inputHeight = input_t.size(-2);
-  const int64_t inputWidth = input_t.size(-1);
-
-  const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, dilationH, ceil_mode);
-  const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, dilationW, ceil_mode);
-
-  pool2d_shape_check(
-    input_t,
-    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-    nInputPlane,
-    inputHeight, inputWidth,
-    outputHeight, outputWidth, memory_format);
-
-  namespace native_mps = at::native::mps;
-
-  // Derive from MPSCachedGraph
-  struct CachedGraph : public native_mps::MPSCachedGraph
-  {
-    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-    MPSGraphTensor* inputTensor_ = nil;
-    MPSGraphTensor* outputTensor_ = nil;
-    MPSGraphTensor* indicesTensor_ = nil;
+    const Tensor& output,
+    const Tensor& indices)
+{
+  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
+    MPSGraph* mpsGraph = cachedGraph.graph();
+    NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor: cachedGraph.inputTensor
+                                                                                     descriptor: desc
+                                                                                           name: nil];
+    cachedGraph.indicesTensor = mps::castMPSTensor(mpsGraph, poolOutputs[1], ScalarType::Long);
+    return poolOutputs[0];
   };
-
-  native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
-
-  if (output_t.numel() == 0) {
-    return;
-  }
-
-  auto stream = at::mps::getCurrentMPSStream();
-
-  @autoreleasepool {
-
-    string mem_format_key;
-    switch(memory_format) {
-      case at::MemoryFormat::Contiguous:
-        mem_format_key = "Contiguous";
-        break;
-      case at::MemoryFormat::ChannelsLast:
-        mem_format_key = "ChannelsLast";
-        break;
-      default:
-        assert(0 && "Check should have been done earlier\n");
-    }
-
-    string key = "max_pool2d_with_indices_out_mps:" + to_string(kW) + ":" + to_string(kH) + ":" +
-                                                      to_string(dW) + ":" + to_string(dH) + ":" +
-                                                      to_string(dilationW) + ":" + to_string(dilationH) + ":" +
-                                                      to_string(padW) + ":" + to_string(padH) + ":" +
-                                                      to_string(ceil_mode) + ":" + mem_format_key +
-                                                      mps::getTensorsStringKey({input_t}) + ":" +
-                                                      native_mps::getMPSTypeString(indices.scalar_type());
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-
-    if(!cachedGraph) {
-      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
-
-        CachedGraph *newCachedGraph = nil;
-
-        @autoreleasepool {
-          MPSGraph* mpsGraph = native_mps::make_mps_graph();
-          newCachedGraph = new CachedGraph(mpsGraph);
-
-          MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease];
-          fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format);
-          desc.returnIndicesMode = MPSGraphPoolingReturnIndicesGlobalFlatten2D;
-          desc.returnIndicesDataType = MPSDataTypeInt32;
-
-          MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
-          NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:inputTensor
-                                                                                           descriptor:desc
-                                                                                                 name:nil];
-
-            MPSGraphTensor* indicesTensor = poolOutputs[1];
-            if(mps::getMPSDataType(indices.scalar_type()) == MPSDataTypeInt64) {
-                indicesTensor = [mpsGraph castTensor:indicesTensor
-                                               toType:MPSDataTypeInt64
-                                                 name:@"castToI64"];
-            }
-
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->outputTensor_ = poolOutputs[0];
-          newCachedGraph->indicesTensor_ = indicesTensor;
-        }
-        return newCachedGraph;
-      });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
-    }
-
-    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
-    auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output_t);
-    auto indicesPlaceholder = native_mps::Placeholder(cachedGraph->indicesTensor_, indices);
-
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
-      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-    };
-
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
-      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(),
-      indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData()
-    };
-
-    native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-  }
-
+  mps::pool2d_template(input, output, indices, c10::nullopt, kernel_size, stride,
+                       padding, dilation, ceil_mode, c10::nullopt, pooling_op_block, "max_pool2d_indices");
 }
 
-TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)
-(const Tensor& grad_output,
-const Tensor& input_t,
-IntArrayRef kernel_size,
-IntArrayRef stride,
-IntArrayRef padding,
-IntArrayRef dilation,
-bool ceil_mode,
-const Tensor& indices,
-const Tensor& grad_input) {
-
-  // #20866, #22032: Guarantee this for the official C++ API?
-  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
-    "max_pool2d: kernel_size must either be a single int, or a tuple of two ints")
-  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
-  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
-
-  // NB: stride default is not expressible as an integer constant, so we accept
-  // empty stride for this case
-  TORCH_CHECK(stride.size() == 0 || stride.size() == 1 || stride.size() == 2,
-    "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints")
-  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
-  const int dW = stride.empty() ? kW :
-                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
-
-  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
-    "max_pool2d: padding must be either be a single int, or a tuple of two ints");
-  const int padH = safe_downcast<int, int64_t>(padding[0]);
-  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
-
-  TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
-    "max_pool2d: dilation must be either a single int, or a tuple of two ints");
-  const int dilationH = safe_downcast<int, int64_t>(dilation[0]);
-  const int dilationW = dilation.size() == 1 ? dilationH : safe_downcast<int, int64_t>(dilation[1]);
-
-  const auto memory_format = input_t.suggest_memory_format();
-  if (memory_format == at::MemoryFormat::ChannelsLast) {
-    TORCH_CHECK(input_t.ndimension() == 4,
-      "non-empty 4D (batch mode) tensor expected for input with channels_last layout");
-  } else if (memory_format == at::MemoryFormat::Contiguous) {
-    TORCH_CHECK((input_t.ndimension() == 3 || input_t.ndimension() == 4),
-      "non-empty 3D or 4D (batch mode) tensor expected for input");
-  } else {
-    TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous");
-  }
-
-  namespace native_mps = at::native::mps;
-
-  // Derive from MPSCachedGraph
-  struct CachedGraph : public native_mps::MPSCachedGraph
-  {
-    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-    MPSGraphTensor *inputTensor_ = nil;
-    MPSGraphTensor *gradOutputTensor_ = nil;
-    MPSGraphTensor *gradInputTensor_ = nil;
+TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)(
+    const Tensor& grad_output,
+    const Tensor& input,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    bool ceil_mode,
+    const Tensor& indices,
+    const Tensor& grad_input)
+{
+  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
+    MPSGraph* mpsGraph = cachedGraph.graph();
+    return [mpsGraph maxPooling2DGradientWithGradientTensor: cachedGraph.gradOutputTensor
+                                               sourceTensor: cachedGraph.inputTensor
+                                                 descriptor: desc
+                                                       name: nil];
   };
-
-  native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
-
-  if (grad_input.numel() == 0) {
-    return;
-  }
-
-  auto stream = at::mps::getCurrentMPSStream();
-
-  @autoreleasepool {
-
-    string mem_format_key;
-    switch(memory_format) {
-      case at::MemoryFormat::Contiguous:
-        mem_format_key = "Contiguous";
-        break;
-      case at::MemoryFormat::ChannelsLast:
-        mem_format_key = "ChannelsLast";
-        break;
-      default:
-        assert(0 && "Check should have been done earlier\n");
-    }
-
-    string key = "max_pool2d_with_indices_backward_out_mps:" + to_string(kW) + ":" + to_string(kH) + ":" +
-                                               to_string(dW) + ":" + to_string(dH) + ":" +
-                                               to_string(dilationW) + ":" + to_string(dilationH) + ":" +
-                                               to_string(padW) + ":" + to_string(padH) + ":" +
-                                               to_string(ceil_mode) + ":" + mem_format_key +
-                                               mps::getTensorsStringKey({input_t, grad_output});
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-
-    if(!cachedGraph) {
-      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
-
-        CachedGraph *newCachedGraph = nil;
-        @autoreleasepool {
-          MPSGraph* mpsGraph = native_mps::make_mps_graph();
-          newCachedGraph = new CachedGraph(mpsGraph);
-
-          MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease];
-          fill_pool_desc(desc, kW, kH, dW, dH, dilationW, dilationH, padW, padH, ceil_mode, memory_format);
-
-          MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
-          MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
-          MPSGraphTensor* gradInputTensor = [mpsGraph maxPooling2DGradientWithGradientTensor:gradOutputTensor
-                                                                                sourceTensor:inputTensor
-                                                                                  descriptor:desc
-                                                                                        name:nil];
-
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->gradOutputTensor_ = gradOutputTensor;
-          newCachedGraph->gradInputTensor_ = gradInputTensor;
-        }
-        return newCachedGraph;
-      });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
-    }
-
-    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);
-    auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, grad_output);
-    auto gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, grad_input);
-
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
-      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData()
-    };
-
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
-      gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
-    };
-
-    native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-  }
+  mps::pool2d_template(input, grad_input, indices, grad_output, kernel_size, stride,
+                       padding, dilation, ceil_mode, c10::nullopt, pooling_op_block, "max_pool2d_indices_backward");
 }
 
 TORCH_IMPL_FUNC(avg_pool2d_out_mps) (
-   const Tensor& input_,
-   int64_t kH_,
-   int64_t kW_,
-   int64_t dH_,
-   int64_t dW_,
-   int64_t padH_,
-   int64_t padW_,
+   const Tensor& input,
+   int64_t kH,
+   int64_t kW,
+   int64_t dH,
+   int64_t dW,
+   int64_t padH,
+   int64_t padW,
    bool ceil_mode,
    bool count_include_pad,
    c10::optional<int64_t> divisor_override,
-   const Tensor& output) {
-  namespace native_mps = at::native::mps;
-
-  TensorArg output_arg{ output, "output", 1 };
-  TensorArg input_arg{ input_, "input_", 2 };
-
-  checkAllSameGPU("avg_pool2d_out_cuda", {output_arg, input_arg});
-
-  const int kH = safe_downcast<int, int64_t>(kH_);
-  const int kW = safe_downcast<int, int64_t>(kW_);
-
-  const int dH = safe_downcast<int, int64_t>(dH_);
-  const int dW = safe_downcast<int, int64_t>(dW_);
-
-  const int padH = safe_downcast<int, int64_t>(padH_);
-  const int padW = safe_downcast<int, int64_t>(padW_);
-
-  /* sizes */
-
-  const auto memory_format = input_.suggest_memory_format();
-
-  Tensor input = input_.contiguous(memory_format);
-
-  const int32_t count = safe_downcast<int32_t, int64_t>(output.numel());
-
-  bool use_divisor = divisor_override.has_value();
-  const auto divisor_override_value = use_divisor ? divisor_override.value() : 0;
-
-  if (count != 0) {
-    // Derive from MPSCachedGraph
-    struct CachedGraph : public native_mps::MPSCachedGraph
-    {
-      CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-      MPSGraphTensor* inputTensor_ = nil;
-      MPSGraphTensor* outputTensor_ = nil;
-      MPSGraphTensor* indicesTensor_ = nil;
-    };
-
-    native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
-
-    auto stream = at::mps::getCurrentMPSStream();
-
-    @autoreleasepool {
-      string mem_format_key;
-      switch(memory_format) {
-        case at::MemoryFormat::Contiguous:
-          mem_format_key = "Contiguous";
-          break;
-        case at::MemoryFormat::ChannelsLast:
-          mem_format_key = "ChannelsLast";
-          break;
-        default:
-          assert(0 && "Check should have been done earlier\n");
-      }
-
-      string key = "mps_avg_pool2d:" + to_string(kW) + ":" + to_string(kH) + ":" +
-                                       to_string(dW) + ":" + to_string(dH) + ":" +
-                                       to_string(padW) + ":" + to_string(padH) + ":" +
-                                       to_string(ceil_mode) + ":" + mem_format_key + ":" +
-                                       to_string(divisor_override_value) +
-                                       mps::getTensorsStringKey({input});
-      CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
-
-      if(!cachedGraph) {
-        native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
-          CachedGraph *newCachedGraph = nil;
-
-          @autoreleasepool {
-            MPSGraph* mpsGraph = native_mps::make_mps_graph();
-            newCachedGraph = new CachedGraph(mpsGraph);
-
-            MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease];
-            fill_pool_desc(desc, kW, kH, dW, dH, 1, 1, padW, padH, ceil_mode, memory_format);
-
-            MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input);
-            MPSGraphTensor* outputTensor = [mpsGraph avgPooling2DWithSourceTensor:inputTensor
-                                                                       descriptor:desc
-                                                                             name:nil];
-            newCachedGraph->inputTensor_ = inputTensor;
-            newCachedGraph->outputTensor_ = outputTensor;
-          }
-          return newCachedGraph;
-        });
-        cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
-      }
-
-      auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input);
-      auto outputPlaceholder = native_mps::Placeholder(cachedGraph->outputTensor_, output);
-
-      NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
-        inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-      };
-
-      NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
-        outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-      };
-
-      native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-    }
-  }
+   const Tensor& output)
+{
+  const bool use_divisor = divisor_override.has_value() && divisor_override.value() != 0;
+  float divisor = use_divisor ? float(kH * kW) / (float) divisor_override.value() : 1.0f;
+  count_include_pad = use_divisor ? use_divisor : count_include_pad;
+
+  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
+    MPSGraph* mpsGraph = cachedGraph.graph();
+    desc.includeZeroPadToAverage = count_include_pad;
+    MPSGraphTensor* avgPoolTensor = [mpsGraph avgPooling2DWithSourceTensor: cachedGraph.inputTensor
+                                                                descriptor: desc
+                                                                      name: nil];
+    // workaround: custom divisor isn't supported by MPS backend, so we scale manually
+    return [mpsGraph multiplicationWithPrimaryTensor: avgPoolTensor
+                                     secondaryTensor: cachedGraph.divisorTensor
+                                                name: nil];
+  };
+  mps::pool2d_template(input, output, c10::nullopt, c10::nullopt, {kH, kW}, {dH, dW},
+                       {padH, padW}, {1, 1}, ceil_mode, divisor, pooling_op_block,
+                       std::string("avg_pool2d") + (count_include_pad ? "_include_pad" : ""));
 }
 
 TORCH_IMPL_FUNC(avg_pool2d_backward_out_mps) (
-  const Tensor& gradOutput_,
-  const Tensor& input_,
-  IntArrayRef kernel_size,
-  IntArrayRef stride,
-  IntArrayRef padding,
-  bool ceil_mode,
-  bool count_include_pad,
-  c10::optional<int64_t> divisor_override,
-  const Tensor& gradInput
-) {
-  TensorArg gradInput_arg{ gradInput, "gradInput", 1 };
-  TensorArg gradOutput_arg{ gradOutput_, "gradOutput_", 2 };
-  TensorArg input_arg{ input_, "input_", 3 };
-
-  checkAllSameGPU("avg_pool2d_backward_out_cuda",
-                  {gradInput_arg, gradOutput_arg, input_arg});
-  namespace native_mps = at::native::mps;
-
-  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
-  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);
-
-  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
-  const int dW = stride.empty() ? kW :
-                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);
-
-  const int padH = safe_downcast<int, int64_t>(padding[0]);
-  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);
-
-  const auto memory_format = input_.suggest_memory_format();
-  const Tensor input = input_.contiguous(memory_format);
-  const Tensor gradOutput = gradOutput_.contiguous(memory_format);
-
-  const int64_t inputHeight = input.size(-2);
-  const int64_t inputWidth = input.size(-1);
-
-  const int64_t outputHeight = pooling_output_shape<int64_t>(inputHeight, kH, padH, dH, 1, ceil_mode);
-  const int64_t outputWidth = pooling_output_shape<int64_t>(inputWidth, kW, padW, dW, 1, ceil_mode);
-
-
-  const int32_t count = safe_downcast<int32_t, int64_t>(input.numel());
-  if (count == 0) {
-    return;
-  }
-
-  namespace native_mps = at::native::mps;
-
-  // Derive from MPSCachedGraph
-  struct CachedGraph : public native_mps::MPSCachedGraph
-  {
-    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
-    MPSGraphTensor *inputTensor_ = nil;
-    MPSGraphTensor *gradOutputTensor_ = nil;
-    MPSGraphTensor *gradInputTensor_ = nil;
+    const Tensor& gradOutput,
+    const Tensor& input,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    bool ceil_mode,
+    bool count_include_pad,
+    c10::optional<int64_t> divisor_override,
+    const Tensor& gradInput)
+{
+  const bool use_divisor = divisor_override.has_value() && divisor_override.value() != 0;
+  float divisor = use_divisor ? float(kernel_size[0] * kernel_size[1]) / (float) divisor_override.value() : 1.0f;
+  count_include_pad = use_divisor ? use_divisor : count_include_pad;
+
+  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
+    MPSGraph* mpsGraph = cachedGraph.graph();
+    desc.includeZeroPadToAverage = count_include_pad;
+    // workaround: custom divisor isn't supported by MPS backend, so we scale manually
+    MPSGraphTensor* scaledGradTensor = [mpsGraph multiplicationWithPrimaryTensor: cachedGraph.gradOutputTensor
+                                                                 secondaryTensor: cachedGraph.divisorTensor
+                                                                            name: nil];
+    return [mpsGraph avgPooling2DGradientWithGradientTensor: scaledGradTensor
+                                               sourceTensor: cachedGraph.inputTensor
+                                                 descriptor: desc
+                                                       name: nil];
   };
-
-  native_mps::MPSGraphCache* cache_ = native_mps::MPSGraphCache::getInstance();
-
-  if (gradInput.numel() == 0) {
-    return;
-  }
-
-  auto stream = at::mps::getCurrentMPSStream();
-
-  @autoreleasepool {
-
-    string mem_format_key;
-    switch(memory_format) {
-      case at::MemoryFormat::Contiguous:
-        mem_format_key = "Contiguous";
-        break;
-      case at::MemoryFormat::ChannelsLast:
-        mem_format_key = "ChannelsLast";
-        break;
-      default:
-        assert(0 && "Check should have been done earlier\n");
-    }
-
-    string key = "avg_pool2d_backward_out_mps:" + to_string(kW) + ":" + to_string(kH) + ":" +
-                                               to_string(dW) + ":" + to_string(dH) + ":" +
-                                               to_string(outputWidth) + ":" + to_string(outputHeight) + ":" +
-                                               to_string(padW) + ":" + to_string(padH) + ":" +
-                                               to_string(ceil_mode) + ":" + mem_format_key +
-                                               mps::getTensorsStringKey({input, gradOutput});
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
-
-    if(!cachedGraph) {
-      native_mps::MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ native_mps::MPSCachedGraph * () {
-
-        CachedGraph *newCachedGraph = nil;
-        @autoreleasepool {
-          MPSGraph* mpsGraph = native_mps::make_mps_graph();
-          newCachedGraph = new CachedGraph(mpsGraph);
-
-          MPSGraphPooling2DOpDescriptor* desc = [[MPSGraphPooling2DOpDescriptor new] autorelease];
-          fill_pool_desc(desc, kW, kH, dW, dH, 1, 1, padW, padH, ceil_mode, memory_format);
-
-          MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input);
-          MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, gradOutput);
-          MPSGraphTensor *gradInputTensor = [mpsGraph avgPooling2DGradientWithGradientTensor:gradOutputTensor
-                                                                               sourceTensor:inputTensor
-                                                                                descriptor : desc
-                                                                                       name:nil];
-
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->gradOutputTensor_ = gradOutputTensor;
-          newCachedGraph->gradInputTensor_ = gradInputTensor;
-        }
-        return newCachedGraph;
-      });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
-    }
-
-    auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input);
-    auto gradOutputPlaceholder = native_mps::Placeholder(cachedGraph->gradOutputTensor_, gradOutput);
-    auto gradInputPlaceholder = native_mps::Placeholder(cachedGraph->gradInputTensor_, gradInput);
-
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
-      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
-      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData()
-    };
-
-    NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *results = @{
-      gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
-    };
-
-    native_mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-  }
+  mps::pool2d_template(input, gradInput, c10::nullopt, gradOutput, kernel_size, stride,
+                       padding, {1, 1}, ceil_mode, divisor, pooling_op_block,
+                       std::string("avg_pool2d_backward") + (count_include_pad ? "_include_pad" : ""));
 }
 
 } // namespace native
diff --git a/test/test_mps.py b/test/test_mps.py
index 587c7b64de1af..ef791b912ae9c 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8261,6 +8261,8 @@ class TestConsistency(TestCase):
         'nn.functional.linear': ['f32'],
         'nn.functional.local_response_norm': ['f32', 'i64'],
         'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.max_pool1d': ['f32'],
+        'nn.functional.max_pool2d': ['f32'],
         'nn.functional.mse_loss': ['f16', 'f32'],
         'nn.functional.normalize': ['f32'],
         'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8621,16 +8623,9 @@ class TestConsistency(TestCase):
 
         # New block list ops that need investigation
         'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'], # atan2() may generate NAN in output
-        'nn.functional.adaptive_avg_pool1d': ['torch.float32'],
-        'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
-        'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'],
-        'nn.functional.avg_pool2d': ['torch.float32', 'torch.int64'],
         'nn.functional.bilinear': ['torch.float32'],
         'nn.functional.conv_transpose2d': ['torch.float32'],
         'nn.functional.interpolate': ['torch.float32'],
-        'nn.functional.max_pool1d': ['torch.float32'],
-        'nn.functional.max_pool2d': ['torch.float32'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 
         # failures due to lack of op implementation on MPS backend
@@ -8646,6 +8641,16 @@ class TestConsistency(TestCase):
         'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 
+        # failure due to issue: atan2() may generate NAN in output with
+        'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
+
+        # failures due to issue #103039644: Wrong results from avgPooling2DWithSourceTensor()
+        # when both ceilMode and includeZeroPadToAverage are True
+        'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'],
+        'nn.functional.avg_pool2d': ['torch.float32', 'torch.int64'],
+        'nn.functional.adaptive_avg_pool1d': ['torch.float32'],
+        'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
+
         # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
         'pow': ['torch.int16', 'torch.int32', 'torch.uint8'],
         '__rpow__': ['torch.int16', 'torch.int32', 'torch.uint8'],

From 499ae54585c638d37cbefe4d52f6fa0f9589f0cc Mon Sep 17 00:00:00 2001
From: skotapati <siddharth.kotapati@gmail.com>
Date: Fri, 9 Dec 2022 15:10:48 -0800
Subject: [PATCH 1845/1922] Removed masked_var & masked_std from blocklist
 (#195)

Co-authored-by: skotapati <>
---
 test/test_mps.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index ef791b912ae9c..2858774ee4df8 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8103,8 +8103,6 @@ class TestConsistency(TestCase):
         'masked.normalize': ['f16', 'f32'],
         'masked.softmax': ['f32'],
         'masked.softmin': ['f32'],
-        'masked.std': ['f32'],
-        'masked.var': ['f32'],
         'abs': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'acos': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'acosh': ['b8', 'f32', 'i16', 'i32', 'u8'],
@@ -8554,8 +8552,6 @@ class TestConsistency(TestCase):
         'stft': [torch.float32],
         # + forward when requires_grad=True or running backward
         '__rpow__': [torch.int64],
-        'masked.std': [torch.int32],
-        'masked.var': [torch.int32],
 
         'as_strided_scatter': [torch.uint8],
         'atan2': [torch.int64],

From 9c46f46b420a043a9a27e671e741e9b15873e3ac Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 9 Dec 2022 16:29:45 -0800
Subject: [PATCH 1846/1922] Move passing tests from blocklist to allowlist;
 arrange failing tests into categories (#199)

---
 test/test_mps.py | 109 +++++++++++++++++++++++------------------------
 1 file changed, 54 insertions(+), 55 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 2858774ee4df8..f3922cc1fc747 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8122,6 +8122,9 @@ class TestConsistency(TestCase):
         'amax': ['f32'],
         'amix': ['f32'],
         'mean': ['f32'],
+        'meshgrid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'meshgridvariadic_tensors': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'meshgridlist_of_tensors': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sum': ['f32'],
         'asin': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'asinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
@@ -8141,6 +8144,7 @@ class TestConsistency(TestCase):
         'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'bmm': ['f32'],
         'broadcast_shapes': ['f32'],
+        'broadcast_to': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'byte': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'cat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'ceil': ['f32', 'int32', 'int64', 'f16'],
@@ -8162,16 +8166,19 @@ class TestConsistency(TestCase):
         'cov': ['f32'],
         'cumsum': ['f16', 'f32', 'int16', 'int32'],
         'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'diag': ['f32', 'i32'],
+        'diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'diagflat': ['f32', 'i32'],
+        'diagflat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diagonal_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diff': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'dist': ['f32'],
         'div': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'divfloor_rounding': ['f16', 'f32', 'u8'],
+        'divtrunc_rounding': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'eq': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'erf': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'exp': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
@@ -8188,6 +8195,7 @@ class TestConsistency(TestCase):
         'gather': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'gradient': ['f16', 'f32', 'i16'],
+        'outer': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'half': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8219,12 +8227,21 @@ class TestConsistency(TestCase):
         'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'long': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'lt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'matmul': ['f32'],
         'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'max': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'maxreduction_no_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'maxbinary': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'minimum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'min': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'minreduction_with_dim': ['f16', 'f32', 'i32'],
+        'minreduction_no_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'minbinary': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'mm': ['f32'],
         'mv': ['f32'],
         'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8242,6 +8259,7 @@ class TestConsistency(TestCase):
         'nn.functional.conv_transpose1d': ['f32'],
         'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.cosine_similarity': ['f32'],
+        'nn.functional.cross_entropy': ['f32'],
         'nn.functional.elu': ['f32'],
         'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.embedding': ['f16', 'f32'],
@@ -8269,6 +8287,8 @@ class TestConsistency(TestCase):
         'nn.functional.padreflect': ['f32'],
         'nn.functional.padreplicate': ['f32'],
         'nn.functional.pairwise_distance': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.pixel_unshuffle': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.pixel_shuffle': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'],
         'nn.functional.prelu': ['f32'],
         'nn.functional.relu': ['f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8298,6 +8318,7 @@ class TestConsistency(TestCase):
         'resize_as_': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'resolve_conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'reshape_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
         'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
@@ -8313,6 +8334,7 @@ class TestConsistency(TestCase):
         'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'softmax': ['f32'],
         'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'splitlist_args': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'square': ['f16', 'f32'],
@@ -8542,68 +8564,37 @@ class TestConsistency(TestCase):
     # If the dtype list is None, all dtypes are excluded.
     # All the entries in this list should be removed
     BLOCKLIST = {
-        # Functions that hang
-        # + forward when requires_grad=True or running backward
-        'masked.mean': [torch.bool, torch.float16],
-        'masked.prod': [torch.bool],
-        'masked.sum': [torch.bool],
-
         # Functions that hard crash
-        'stft': [torch.float32],
-        # + forward when requires_grad=True or running backward
-        '__rpow__': [torch.int64],
+        'index_add': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.softplus': [torch.float32],
+        'nonzero': [torch.uint8, torch.float16],
 
-        'as_strided_scatter': [torch.uint8],
+        # Functions with correctness issues
+        '__rpow__': None,
+        'nn.functional.avg_pool1d': [torch.float32, torch.int64],
+        'nn.functional.avg_pool2d': [torch.float32, torch.int64],
+        'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'index_put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'divfloor_rounding': [torch.int16, torch.int32, torch.int64],
+        'divtrunc_rounding': [torch.float16],
+        'norm': [torch.float16],
+        'minreduction_with_dim': [torch.bool, torch.int16, torch.uint8],
+        'maxreduction_with_dim': [torch.bool, torch.int16, torch.uint8],
+
+        # MPS kernels with no support for int64 inputs
         'atan2': [torch.int64],
-        'bfloat16': None,
-        'block_diag': [torch.uint8],
-        'chalf': None,
-        'diagonal_scatter': [torch.uint8],
-        'index_add': None,
-        'long': None,
-        'nn.functional.avg_pool1d': [torch.int64],
-        'nn.functional.avg_pool2d': [torch.int64],
-        'nn.functional.conv1d': [torch.int64],
-        'nn.functional.conv2d': [torch.int64],
-        'nn.functional.conv_transpose1d': [torch.int64],
-        'nn.functional.conv_transpose2d': [torch.int64],
-        'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
-        'nn.functional.softplus': [torch.float32],
         'pow': [torch.int64],
         'sigmoid': [torch.int64],
-        'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],  # moved from section below
-        'unique': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nonzero': [torch.uint8, torch.float16],
+        'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'dot': [torch.int64],
+        'nn.functional.one_hot': [torch.int64],
+        'min': [torch.int64],
+        'max': [torch.int64],
 
-        # These were moved from ALLOWLIST to BLOCK as they are not working
-        # locally
-        'diag': ['torch.int64'],
-        'diagflat': ['torch.int64'],
-        'index_put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'as_strided_scatter': [torch.uint8],
 
         # Functions that are flaky
         # These are detected as "ok" by the expect case but actually fail to run sometimes
-        'broadcast': None,
-        'divfloor_rounding': None,
-        'divno_rounding_mode': None,
-        'divtrunc_rounding': None,
-        'empty': None,
-        'splitlist_args': None,
-        'reshape_as': None,
-        'norm': None,
-        'nn.functional.pixel_unshuffle': None,
-        'nn.functional.pixel_shuffle': None,
-        'nn.functional.cross_entropy': None,
-        'nn.functional.one_hot': None,
-        'minreduction_with_dim': None,
-        'minreduction_no_dim': None,
-        'minbinary': None,
-        'meshgridvariadic_tensors': None,
-        'meshgridlist_of_tensors': None,
-        'maxreduction_with_dim': None,
-        'maxreduction_no_dim': None,
-        'maxbinary': None,
-        'outer': None,
         'softmaxwith_dtype': None,
         'rounddecimals_neg_3': None,
         'rounddecimals_3': None,
@@ -8636,6 +8627,8 @@ class TestConsistency(TestCase):
         'fft.rfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'stft': ['torch.float32'],
+        'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
 
         # failure due to issue: atan2() may generate NAN in output with
         'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
@@ -8653,6 +8646,12 @@ class TestConsistency(TestCase):
 
         # failures due to unsupported data types on MPS backend
         'matmul': ['torch.uint8'], # MPS device does not support mm for non-float inputs
+        'bfloat16': None,
+        'chalf': None,
+        'nn.functional.conv1d': [torch.int64],
+        'nn.functional.conv2d': [torch.int64],
+        'nn.functional.conv_transpose1d': [torch.int64],
+        'nn.functional.conv_transpose2d': [torch.int64],
 
         # failures due to random output that they generate using
         # Philox engine causing mismatch with CPU results

From e09f561843c4344589770a4097bba083ed640fd1 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Mon, 12 Dec 2022 12:27:32 -0800
Subject: [PATCH 1847/1922] Ventura build fix (#200)

* Fix build faiure on Ventura

* Fix remaining build failures

* Update macos macro check for MPSGraphResizeNearestRoundingMode

* Fix mac OS check for MPSGraphResizeNearestRoundingMode
---
 aten/src/ATen/native/mps/MPSGraphVenturaOps.h    | 5 +++++
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 1 +
 2 files changed, 6 insertions(+)

diff --git a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
index b260afce5e410..803eec2f1bb61 100644
--- a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
+++ b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
@@ -3,6 +3,10 @@
 
 // TODO: Remove me when moved to MacOS 13
 @interface MPSGraph (VenturaOps)
+
+#if !defined(__MAC_13_0) && \
+    (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))
+
 API_AVAILABLE(macos(13.0))
 typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
 {
@@ -13,6 +17,7 @@ typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
     MPSGraphResizeNearestRoundingModeRoundToEven       =  4L,
     MPSGraphResizeNearestRoundingModeRoundToOdd        =  5L,
 };
+#endif
 
 - (MPSGraphTensor *)cumulativeSumWithTensor:(MPSGraphTensor *)tensor
                                        axis:(NSInteger)axis
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index fa3b54f0fcd09..a4bbbc4d895cd 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -913,6 +913,7 @@ Tensor std_var_common_impl_mps(
       }
       return newCachedGraph;
       });
+    cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
   }
 
   auto inputPlaceholder = native_mps::Placeholder(cachedGraph->inputTensor_, input_t);

From 6ad73ce41bea9014a8edbdfce217092806ab9fc4 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Tue, 13 Dec 2022 00:45:33 +0400
Subject: [PATCH 1848/1922] Dev/kulin/einsum (#201)

* More WERROR=1 fixes.

* Enabling einsum test. Its currently failing.
---
 aten/src/ATen/native/mps/MPSGraphVenturaOps.h | 15 +++++++++------
 aten/src/ATen/native/mps/operations/Unique.mm |  1 +
 test/test_mps.py                              |  2 ++
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
index 803eec2f1bb61..767c785303291 100644
--- a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
+++ b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
@@ -19,15 +19,18 @@ typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
 };
 #endif
 
-- (MPSGraphTensor *)cumulativeSumWithTensor:(MPSGraphTensor *)tensor
+- (MPSGraphTensor * _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor * _Nonnull)tensor
                                        axis:(NSInteger)axis
-                                       name:(NSString *)name;
+                                       name:(NSString * _Nullable)name;
 
-- (MPSGraphTensor *)sortWithTensor:(MPSGraphTensor *)tensor
+- (MPSGraphTensor * _Nonnull)sortWithTensor:(MPSGraphTensor * _Nonnull)tensor
                                        axis:(NSInteger)axis
-                                       name:(NSString *)name;
+                                       name:(NSString * _Nullable)name;
 
-- (MPSGraphTensor *)argSortWithTensor:(MPSGraphTensor *)tensor
+- (MPSGraphTensor * _Nonnull)argSortWithTensor:(MPSGraphTensor * _Nonnull)tensor
                                        axis:(NSInteger)axis
-                                       name:(NSString *)name;
+                                       name:(NSString * _Nullable)name;
+
+- (MPSGraphTensor * _Nonnull)inverseOfTensor:(MPSGraphTensor * _Nonnull) inputTensor
+                                       name:(NSString * _Nullable)name;
 @end
diff --git a/aten/src/ATen/native/mps/operations/Unique.mm b/aten/src/ATen/native/mps/operations/Unique.mm
index 9049d8e00e049..40094308c7d1c 100644
--- a/aten/src/ATen/native/mps/operations/Unique.mm
+++ b/aten/src/ATen/native/mps/operations/Unique.mm
@@ -3,6 +3,7 @@
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/Resize.h>
 #include <ATen/mps/MPSAllocator.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
 
 namespace at {
 namespace native {
diff --git a/test/test_mps.py b/test/test_mps.py
index f3922cc1fc747..219cb747b5651 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8176,6 +8176,7 @@ class TestConsistency(TestCase):
         'divfloor_rounding': ['f16', 'f32', 'u8'],
         'divtrunc_rounding': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'einsum': ['f32'],
         'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'eq': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8477,6 +8478,7 @@ class TestConsistency(TestCase):
         'logical_not': ['f16', 'f32'],
         'logspace': ['f32'],
         'matmul': ['f32'],
+        'einsum': ['f32'],
         'mm': ['f32'],
         'mv': ['f32'],
         'neg': ['f16', 'f32'],

From 6ac05dfcd32bb9110d6e6933a3778a3f2fec725a Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 13 Dec 2022 10:45:11 -0800
Subject: [PATCH 1849/1922] Fix masked_fill and masked_mean hangs on macOS
 Monterey (#202)

* Fix index fill hang on macOS Monterey

* Fix mask data type

* Fix hang for masked_mean (same issue as masked_fill)

* Fix comment
---
 .../ATen/native/mps/operations/Indexing.mm    | 35 +++++++++++--------
 .../native/mps/operations/TensorCompare.mm    | 32 +++++++++++++----
 test/test_mps.py                              |  3 ++
 3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 3eed7ff5361e2..cce69bd144fda 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -710,10 +710,23 @@ Tensor index_select_mps(const Tensor & self,
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
+  MPSDataType inputDataType = getMPSScalarType(self.scalar_type());
+  MPSDataType maskDataType = getMPSScalarType(b_mask->scalar_type());
+  // Workaround for `selectWithPredicateTensor` on macOS Monterey where bool data type may cause a hang
+  // The issue is fixed in macOS Ventura (13.0)
+  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+     if (self.scalar_type() == kBool) {
+      inputDataType = MPSDataTypeInt8;
+     }
+     if (mask.scalar_type() == kBool) {
+      maskDataType = MPSDataTypeInt8;
+     }
+  }
+
   MPSStream* stream = getCurrentMPSStream();
   MPSScalar valueScalar = getMPSScalar(value, value.type());
   @autoreleasepool {
-    string key = "masked_fill" + getTensorsStringKey({self, *b_mask}) + getMPSTypeString(value.type());
+    string key = "masked_fill" + getTensorsStringKey({self, *b_mask}) + ":" + getMPSTypeString(value.type());
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
       cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
@@ -724,20 +737,11 @@ Tensor index_select_mps(const Tensor & self,
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-          MPSGraphTensor* maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *b_mask);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputDataType, getMPSShape(self));
+          MPSGraphTensor* maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, maskDataType, getMPSShape(*b_mask));
           MPSGraphTensor* valueTensor = mpsGraphScalarPlaceHolder(mpsGraph, value);
 
           MPSDataType valueType = getMPSScalarType(value.type());
-          MPSDataType inputDataType = getMPSScalarType(self.scalar_type());
-
-          // constantWithScalar doesn't like Bool constants getting created so
-          // mapping them to int8.
-          // Starting with macOS 13.0, this cast is not needed anymore
-          if (valueType == MPSDataTypeBool && !MPSDevice::getInstance()->macOS_13_0_or_newer()) {
-            valueType = MPSDataTypeInt8;
-          }
-
           MPSGraphTensor* castValueTensor = valueTensor;
           if (valueType != inputDataType) {
             castValueTensor = [mpsGraph castTensor:valueTensor
@@ -759,8 +763,10 @@ Tensor index_select_mps(const Tensor & self,
       });
     }
 
-    Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder maskPlaceholder   = Placeholder(cachedGraph->maskTensor_, *b_mask);
+    Placeholder selfPlaceholder   = Placeholder(
+      cachedGraph->inputTensor_, self, /*mpsShape*/nullptr, /*gatherTensorData=*/true, inputDataType);
+    Placeholder maskPlaceholder   = Placeholder(
+      cachedGraph->maskTensor_, *b_mask, /*mpsShape*/nullptr, /*gatherTensorData=*/true, maskDataType);
     Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, self);
 
     // Create dictionary of inputs and outputs
@@ -775,7 +781,6 @@ Tensor index_select_mps(const Tensor & self,
     };
 
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-
   }
   namedinference::propagate_names_if_nonempty(self, maybe_outnames);
   return self;
diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm
index 3bbccb4d2e85d..edf3831b3ea77 100644
--- a/aten/src/ATen/native/mps/operations/TensorCompare.mm
+++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm
@@ -321,6 +321,23 @@ void clamp_scalar_out_mps(const Tensor& input_t,
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
+  MPSDataType conditionDataType = getMPSScalarType(condition.scalar_type());
+  MPSDataType selfDataType = getMPSScalarType(self.scalar_type());
+  MPSDataType otherDataType = getMPSScalarType(other.scalar_type());
+  // Workaround for `selectWithPredicateTensor` on macOS Monterey where bool data type may cause a hang
+  // The issue is fixed in macOS Ventura (13.0)
+  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+     if (condition.scalar_type() == kBool) {
+      conditionDataType = MPSDataTypeInt8;
+     }
+     if (self.scalar_type() == kBool) {
+      selfDataType = MPSDataTypeInt8;
+     }
+     if (other.scalar_type() == kBool) {
+      otherDataType = MPSDataTypeInt8;
+     }
+  }
+
   @autoreleasepool {
 
     string key = "where_self_out_mps:" + getTensorsStringKey({cond_bool, self, other});
@@ -336,9 +353,9 @@ void clamp_scalar_out_mps(const Tensor& input_t,
                 MPSGraph* mpsGraph = make_mps_graph();
                 newCachedGraph = new CachedGraph(mpsGraph);
 
-                MPSGraphTensor* conditionTensor = mpsGraphRankedPlaceHolder(mpsGraph, cond_bool);
-                MPSGraphTensor* selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-                MPSGraphTensor* otherTensor = mpsGraphRankedPlaceHolder(mpsGraph, other);
+                MPSGraphTensor* conditionTensor = mpsGraphRankedPlaceHolder(mpsGraph, conditionDataType, getMPSShape(cond_bool));
+                MPSGraphTensor* selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, selfDataType, getMPSShape(self));
+                MPSGraphTensor* otherTensor = mpsGraphRankedPlaceHolder(mpsGraph, otherDataType, getMPSShape(other));
 
                 MPSGraphTensor* outputTensor = [mpsGraph selectWithPredicateTensor:conditionTensor
                                                                truePredicateTensor:selfTensor
@@ -355,9 +372,12 @@ void clamp_scalar_out_mps(const Tensor& input_t,
         cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder conditionPlaceholder = Placeholder(cachedGraph->conditionTensor_, cond_bool);
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self);
-    Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other);
+    Placeholder conditionPlaceholder = Placeholder(
+        cachedGraph->conditionTensor_, cond_bool, /*mpsShape=*/nullptr, /*gatherTensorData=*/true, conditionDataType);
+    Placeholder selfPlaceholder = Placeholder(
+        cachedGraph->selfTensor_, self, /*mpsShape=*/nullptr, /*gatherTensorData=*/true, selfDataType);
+    Placeholder otherPlaceholder = Placeholder(
+        cachedGraph->otherTensor_, other, /*mpsShape=*/nullptr, /*gatherTensorData=*/true, otherDataType);
     Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
diff --git a/test/test_mps.py b/test/test_mps.py
index 219cb747b5651..1483fe8732bc7 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8746,6 +8746,9 @@ def get_samples():
                 elif (op.name == "add" or op.name == "sub" or op.name == "nn.functional.huber_loss") and dtype == torch.float16:
                     atol = 1e-2
                     rtol = 1e-2
+                elif (op.name == "masked.mean"):
+                    atol = 7e-4
+                    rtol = 2e-3
                 else:
                     atol = None
                     rtol = None

From 9eb018a0ff736b93b88acacefda1c264d9fa2224 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 13 Dec 2022 16:58:12 -0800
Subject: [PATCH 1850/1922] Remove macOS13 duplicate check (#204)

---
 aten/src/ATen/mps/MPSDevice.h                        | 3 ---
 aten/src/ATen/mps/MPSDevice.mm                       | 4 ----
 aten/src/ATen/native/mps/operations/Distributions.mm | 2 +-
 aten/src/ATen/native/mps/operations/Indexing.mm      | 6 +++---
 aten/src/ATen/native/mps/operations/TensorCompare.mm | 2 +-
 aten/src/ATen/native/mps/operations/Unique.mm        | 6 +++---
 aten/src/ATen/native/mps/operations/UpSample.mm      | 4 ++--
 7 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h
index 7427a5242c104..48e1904346c10 100644
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@@ -60,8 +60,6 @@ class TORCH_API MPSDevice {
 
   MTLFunction_t metalIndexingFunction(const std::string &kernel, MTLFunctionConstantValues_t constantValues);
 
-  bool macOS_13_0_or_newer();
-
   ~MPSDevice();
 
  private:
@@ -69,7 +67,6 @@ class TORCH_API MPSDevice {
   MTLDevice_t _mtl_device;
   bool _macos13plus;
   MTLLibrary_t _mtl_indexing_library;
-  bool _macos_13_0_or_newer;
   MPSDevice();
 };
 
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index a84bfd6d59e14..c11621b3f3540 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -27,10 +27,6 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   return mps_device.get();
 }
 
-bool MPSDevice::macOS_13_0_or_newer() {
-  return _macos_13_0_or_newer;
-}
-
 id<MTLFunction> MPSDevice::metalIndexingFunction(const std::string& kernel, MTLFunctionConstantValues* constantValues) {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_mtl_device);
   NSError* error = nil;
diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index 45f49902d0622..ecb528301f2c0 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -343,7 +343,7 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
 }
 
 Tensor& randperm_out_mps(int64_t n, c10::optional<Generator> generator, Tensor& result) {
-  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+  if (!is_macos_13_or_newer()) {
     TORCH_WARN_ONCE("MPS: randperm op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performance implications.");
 
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index cce69bd144fda..cdcbe4cae6117 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -220,7 +220,7 @@ Tensor nonzero_fallback(const Tensor& self) {
 }
 
 Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_){
-  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+  if (!is_macos_13_or_newer()) {
       Tensor out_fallback = nonzero_fallback(self);
       at::native::resize_output(out_, out_fallback.sizes());
       out_.copy_(out_fallback.to("mps"));
@@ -388,7 +388,7 @@ Tensor nonzero_fallback(const Tensor& self) {
 }
 
 Tensor nonzero_mps(const Tensor& self){
-  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+  if (!is_macos_13_or_newer()) {
     return nonzero_fallback(self);
   }
 
@@ -714,7 +714,7 @@ Tensor index_select_mps(const Tensor & self,
   MPSDataType maskDataType = getMPSScalarType(b_mask->scalar_type());
   // Workaround for `selectWithPredicateTensor` on macOS Monterey where bool data type may cause a hang
   // The issue is fixed in macOS Ventura (13.0)
-  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+  if (!is_macos_13_or_newer()) {
      if (self.scalar_type() == kBool) {
       inputDataType = MPSDataTypeInt8;
      }
diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm
index edf3831b3ea77..7cf551c95e993 100644
--- a/aten/src/ATen/native/mps/operations/TensorCompare.mm
+++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm
@@ -326,7 +326,7 @@ void clamp_scalar_out_mps(const Tensor& input_t,
   MPSDataType otherDataType = getMPSScalarType(other.scalar_type());
   // Workaround for `selectWithPredicateTensor` on macOS Monterey where bool data type may cause a hang
   // The issue is fixed in macOS Ventura (13.0)
-  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+  if (!is_macos_13_or_newer()) {
      if (condition.scalar_type() == kBool) {
       conditionDataType = MPSDataTypeInt8;
      }
diff --git a/aten/src/ATen/native/mps/operations/Unique.mm b/aten/src/ATen/native/mps/operations/Unique.mm
index 40094308c7d1c..42cad854a730d 100644
--- a/aten/src/ATen/native/mps/operations/Unique.mm
+++ b/aten/src/ATen/native/mps/operations/Unique.mm
@@ -311,7 +311,7 @@ void runUniqueGraph(UniqueCachedGraph *uniqueGraph, const Tensor& input, Tensor&
 
 std::tuple<Tensor, Tensor, Tensor>
 unique_consecutive_mps(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional<int64_t> dim) {
-  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+  if (!is_macos_13_or_newer()) {
     TORCH_WARN_ONCE("MPS: unique_consecutive op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performace implications.");
     return at::unique_consecutive(self.to("cpu"), return_inverse, return_counts, dim);
@@ -322,7 +322,7 @@ void runUniqueGraph(UniqueCachedGraph *uniqueGraph, const Tensor& input, Tensor&
 
 std::tuple<Tensor, Tensor, Tensor>
 unique_dim_consecutive_mps(const Tensor& self, int64_t dim, const bool return_inverse, const bool return_counts) {
-  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+  if (!is_macos_13_or_newer()) {
     TORCH_WARN_ONCE("MPS: unique_dim_consecutive op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performace implications.");
     return at::unique_dim_consecutive(self.to("cpu"), dim, return_inverse, return_counts);
@@ -333,7 +333,7 @@ void runUniqueGraph(UniqueCachedGraph *uniqueGraph, const Tensor& input, Tensor&
 
 std::tuple<Tensor, Tensor, Tensor>
 _unique2_mps(const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) {
-  if (!MPSDevice::getInstance()->macOS_13_0_or_newer()) {
+  if (!is_macos_13_or_newer()) {
     TORCH_WARN_ONCE("MPS: _unique2 op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performace implications.");
     return at::_unique2(self.to("cpu"), sorted, return_inverse, return_counts);
diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm
index 7d8a6a498d030..2ed353283e03f 100644
--- a/aten/src/ATen/native/mps/operations/UpSample.mm
+++ b/aten/src/ATen/native/mps/operations/UpSample.mm
@@ -46,7 +46,7 @@ void upsample_out_template(const Tensor& input,
     AT_ERROR("Unsupported resize mode ", resize_mode_str);
   }
 
-  const bool is_macOS_13_0_or_newer = MPSDevice::getInstance()->macOS_13_0_or_newer();
+  const bool is_macOS_13_0_or_newer = is_macos_13_or_newer();
   const int64_t output_width  = output_size.size() > 1 ? output_size[1] : output_size[0];
   const int64_t output_height = output_size.size() > 1 ? output_size[0] : 1;
   const float scale_w = (scale_w_opt.has_value() && scale_w_opt.value() > 0.) ? static_cast<float>(scale_w_opt.value()) : 0.;
@@ -218,7 +218,7 @@ void upsample_out_template(const Tensor& input,
 
 static bool check_mps_compatibility(c10::optional<double> scale)
 {
-  static const bool is_macOS_13_0_or_newer = MPSDevice::getInstance()->macOS_13_0_or_newer();
+  static const bool is_macOS_13_0_or_newer = is_macos_13_or_newer();
   // passing scale factors to MPS's resize APIs is not supported on macOS < 13
   if (!is_macOS_13_0_or_newer && scale.has_value() && scale.value() > 0.) {
     TORCH_WARN_ONCE("MPS: passing scale factor to upsample ops is supported natively starting from macOS 13.0. ",

From 2fb4554f37d39dce9c0ddd9b72db170c01bed522 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 14 Dec 2022 11:41:16 -0500
Subject: [PATCH 1851/1922] Workaround for the right padding bug in Monterey
 (#206)

- Workaround for the bool type bug in padding (needed for both Monterey and Ventura)
---
 aten/src/ATen/native/mps/operations/Pad.mm | 63 ++++++++++++++--------
 test/test_mps.py                           |  2 +
 2 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm
index 77daa9356dc89..51683e7f6fb02 100644
--- a/aten/src/ATen/native/mps/operations/Pad.mm
+++ b/aten/src/ATen/native/mps/operations/Pad.mm
@@ -158,11 +158,24 @@
         endsVec[padIdx] = @(input.size(padIdx) + padding[rightIdx]);
         endMask &= ~(1U << padIdx);
       }
+    // workaround for the right padding bug in Monterey
+    } else if (!is_macos_13_or_newer()) {
+      if (padding[rightIdx] == 1 && padding[leftIdx] == 0) {
+        rightPadVec[padIdx] = @(2);
+        endsVec[padIdx] = @(input.size(padIdx) + 2);
+        endMask &= ~(1U << padIdx);
+      }
     }
   }
   MPSShape *leftPadding  = [NSArray arrayWithObjects:leftPadVec.data() count:ndims];
   MPSShape *rightPadding = [NSArray arrayWithObjects:rightPadVec.data() count:ndims];
 
+  MPSDataType dataType = getMPSScalarType(input.scalar_type());
+  // workaround for Bool type assert with Constant padding
+  if (input.scalar_type() == kBool) {
+    dataType = MPSDataTypeInt8;
+  }
+
   struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) { }
     MPSGraphTensor *inputTensor = nil, *outputTensor = nil;
@@ -181,32 +194,39 @@
         @autoreleasepool {
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
-          newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
+          newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, dataType, getMPSShape(input));
+          const bool needsSlice = startMask != dims_mask || endMask != dims_mask;
 
           if (!is_backward_pass) {
-            // workaround for Bool type assert with Constant padding (only needed for forward pass)
-            const bool needsBoolCast = mode == MPSGraphPaddingModeConstant && input.scalar_type() == ScalarType::Bool;
-            MPSGraphTensor *inputTensorCast = !needsBoolCast ? newCachedGraph->inputTensor :
-                                              castMPSTensor(mpsGraph, newCachedGraph->inputTensor, ScalarType::Byte);
-            MPSGraphTensor *outputTensor = [mpsGraph padTensor:inputTensorCast
-                                               withPaddingMode:mode
-                                                   leftPadding:leftPadding
-                                                  rightPadding:rightPadding
-                                                 constantValue:constantValue
-                                                          name:nil];
-            newCachedGraph->outputTensor = needsBoolCast ? castMPSTensor(mpsGraph, outputTensor, ScalarType::Bool) : outputTensor;
+            MPSGraphTensor *padTensor = [mpsGraph padTensor:newCachedGraph->inputTensor
+                                            withPaddingMode:mode
+                                                leftPadding:leftPadding
+                                               rightPadding:rightPadding
+                                              constantValue:constantValue
+                                                       name:nil];
+            // workaround for the right padding bug in Monterey
+            if (needsSlice) {
+              newCachedGraph->outputTensor = [mpsGraph sliceTensor:padTensor
+                                                            starts:[NSArray arrayWithObjects:startsVec.data()  count:ndims]
+                                                              ends:[NSArray arrayWithObjects:endsVec.data()    count:ndims]
+                                                           strides:[NSArray arrayWithObjects:stridesVec.data() count:ndims]
+                                                         startMask:startMask
+                                                           endMask:endMask
+                                                       squeezeMask:0
+                                                              name:nil];
+            } else {
+              newCachedGraph->outputTensor = padTensor;
+            }
           } else {
-            newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
+            newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, dataType, getMPSShape(grad_output));
             MPSGraphTensor *padGradTensor = [mpsGraph padGradientWithIncomingGradientTensor:newCachedGraph->gradOutputTensor
                                                                                sourceTensor:newCachedGraph->inputTensor
                                                                                 paddingMode:mode
                                                                                 leftPadding:leftPadding
                                                                                rightPadding:rightPadding
                                                                                        name:nil];
-
             // workaround for negative padding issue with padGradientWithIncomingGradientTensor()
-            const bool needsSliceGradient = startMask != dims_mask || endMask != dims_mask;
-            if (needsSliceGradient) {
+            if (needsSlice) {
               newCachedGraph->outputTensor = [mpsGraph sliceGradientTensor:padGradTensor
                                                           fwdInShapeTensor:[mpsGraph shapeOfTensor:newCachedGraph->inputTensor name:nil]
                                                                     starts:[NSArray arrayWithObjects:startsVec.data()  count:ndims]
@@ -224,17 +244,18 @@
         return newCachedGraph;
       });
     }
-    Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor, input);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
+    Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor, input, nullptr, true, dataType);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output, nullptr, true, dataType);
+    Placeholder gradOutputPlaceholder = !is_backward_pass ? Placeholder() :
+                                        Placeholder(cachedGraph->gradOutputTensor, grad_output, nullptr, true, dataType);
 
     NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease];
     feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData();
     if (is_backward_pass) {
-        Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output);
-        feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData();
+      feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData();
     }
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-        outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
     runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
   }
diff --git a/test/test_mps.py b/test/test_mps.py
index 1483fe8732bc7..7fc00644ff00d 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4301,6 +4301,8 @@ def helper(shape, padding, op, value=0):
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReplicationPad3d)
         # Constant Pad 3D
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
+        # check the workaround for the right padding bug in Monterey
+        helper((1, 2, 2, 2, 2), (0, 1), nn.ConstantPad3d)
 
     # Test stack forward
     def test_stack(self):

From 17cd9c01c74bfeb598cd2fe44ea04099a7ee65ad Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 14 Dec 2022 08:53:26 -0800
Subject: [PATCH 1852/1922] Fix expand pattern when strides of src/dst are not
 matching (#203)

---
 aten/src/ATen/native/mps/operations/View.mm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 6b6727e141ec1..2748267a0175e 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -153,7 +153,7 @@
     NSUInteger targetDimLength =  currSrcDimLength;
     if (currDimLength != targetDimLength)
       targetDimLength = 1;
-    if (currDimLength != targetDimLength && currStride != currSrcStride)
+    if (currDimLength != targetDimLength || currStride != currSrcStride)
       isValidExpand = NO;
     if (currSrcDim >= 0 && currSrcDimLength == targetDimLength) {
       currSrcStride *= currSrcDimLength;

From d3c4c355da264bd27dd65d5ffa9f52a3d70a5930 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 14 Dec 2022 13:40:31 -0500
Subject: [PATCH 1853/1922] Enable high watermark ratio to limit the memory
 allocations (#197)

---
 aten/src/ATen/mps/MPSAllocator.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/mps/MPSAllocator.h b/aten/src/ATen/mps/MPSAllocator.h
index d739e8956d814..a6df567b56588 100644
--- a/aten/src/ATen/mps/MPSAllocator.h
+++ b/aten/src/ATen/mps/MPSAllocator.h
@@ -283,10 +283,10 @@ class MPSHeapAllocatorImpl
 
 private:
   // (see m_high_watermark_ratio for description)
-  constexpr static double default_high_watermark_ratio = 0.0;
+  constexpr static double default_high_watermark_ratio = 1.7;
   // (see m_low_watermark_ratio for description)
   // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
-  constexpr static double default_low_watermark_ratio_unified  = 1.5;
+  constexpr static double default_low_watermark_ratio_unified  = 1.4;
   constexpr static double default_low_watermark_ratio_discrete = 1.0;
 
   const id<MTLDevice> m_device;

From 990c3a2926857e35134fcbe223a3f7e5d2b919a7 Mon Sep 17 00:00:00 2001
From: "Li-Huai (Allan) Lin" <qqaatw@gmail.com>
Date: Tue, 13 Dec 2022 17:28:54 +0000
Subject: [PATCH 1854/1922] Fix mps constant pad (#89864)

Support arbitrary dimensions for constant padding on MPS

Fixes #89624
Fixes #87277

Pull Request resolved: https://github.com/pytorch/pytorch/pull/89864
Approved by: https://github.com/kulinseth, https://github.com/malfet
---
 aten/src/ATen/native/mps/operations/Pad.mm | 46 ++++++++++++++--------
 test/test_mps.py                           | 11 ++++++
 2 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm
index 51683e7f6fb02..c6029e3d7b2ae 100644
--- a/aten/src/ATen/native/mps/operations/Pad.mm
+++ b/aten/src/ATen/native/mps/operations/Pad.mm
@@ -73,8 +73,27 @@
   Tensor grad_output, input = input_;
 
   if (!is_backward_pass) {
-    // these checks aren't relevant for constant pad
-    if (mode != MPSGraphPaddingModeConstant) {
+    TORCH_CHECK(output_w >= 1 || output_h >= padding_dim - 1,
+      "input (H: ", input_h, ", W: ", input_w, ") is too small. Calculated "
+      "output H: ", output_h, " W: ", output_w);
+
+    std::vector<int64_t> outputSizes;
+    if (mode == MPSGraphPaddingModeConstant) {
+      // support arbitrary input dimensions for constant pad.
+      auto input_sizes = input_.sizes();
+      auto ori_padding_dim = padding_size / 2;
+      auto l_diff = ndims - ori_padding_dim;
+
+      for (size_t i = 0; i < (size_t)l_diff; i ++) {
+        outputSizes.emplace_back(input_sizes[i]);
+      }
+      for (const auto i : c10::irange((size_t)ori_padding_dim)) {
+        auto pad_idx = padding.size() - ((i + 1) * 2);
+        auto new_dim = input_sizes[l_diff + i] + padding[pad_idx] + padding[pad_idx + 1];
+        outputSizes.emplace_back(new_dim);
+      }
+    } else {
+      // these checks aren't relevant for constant pad
       TORCH_CHECK(pad_l < input_w && pad_r < input_w,
         "Argument #4: Padding size should be less than the corresponding "
         "input dimension, but got: padding (", pad_l, ", ", pad_r,
@@ -92,21 +111,16 @@
           "input dimension, but got: padding (", pad_front, ", ", pad_back,
           ") at dimension ", dim_d, " of input ", ndims);
       }
+      outputSizes.insert(outputSizes.begin(), output_w);
+      if (padding_dim >= 2)
+        outputSizes.insert(outputSizes.begin(), output_h);
+      if (padding_dim >= 3)
+        outputSizes.insert(outputSizes.begin(), output_d);
+      if (ndims >= 1 + padding_dim)
+        outputSizes.insert(outputSizes.begin(), nplane);
+      if (ndims >= 2 + padding_dim)
+        outputSizes.insert(outputSizes.begin(), nbatch);
     }
-    TORCH_CHECK(output_w >= 1 || output_h >= padding_dim - 1,
-      "input (H: ", input_h, ", W: ", input_w, ") is too small. Calculated "
-      "output H: ", output_h, " W: ", output_w);
-
-    std::vector<int64_t> outputSizes;
-    outputSizes.insert(outputSizes.begin(), output_w);
-    if (padding_dim >= 2)
-      outputSizes.insert(outputSizes.begin(), output_h);
-    if (padding_dim >= 3)
-      outputSizes.insert(outputSizes.begin(), output_d);
-    if (ndims >= 1 + padding_dim)
-      outputSizes.insert(outputSizes.begin(), nplane);
-    if (ndims >= 2 + padding_dim)
-      outputSizes.insert(outputSizes.begin(), nbatch);
 
     output.resize_(outputSizes);
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 7fc00644ff00d..17bb6af71e052 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4219,6 +4219,15 @@ def test_constant_pad(self):
         r_mps = m(input_mps)
         self.assertEqual(r_cpu, r_mps.to("cpu"))
 
+        # Arbitrary input dimensions
+        pad = (1, 1, 0, 0, 0, 0)
+        value = 3.5
+        input_cpu = torch.randn((1, 1, 3, 3, 3, 3, 3, 3, 3, 3))
+        input_mps = input_cpu.detach().clone().to("mps")
+        r_cpu = F.pad(input_cpu, pad=pad, value=value)
+        r_mps = F.pad(input_mps, pad=pad, value=value)
+        self.assertEqual(r_cpu, r_mps.to("cpu"))
+
     def test_circular_pad(self):
         # https://github.com/pytorch/pytorch/issues/80856
         k_cpu = torch.ones(3, 3, 9, 9)
@@ -4303,6 +4312,8 @@ def helper(shape, padding, op, value=0):
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
         # check the workaround for the right padding bug in Monterey
         helper((1, 2, 2, 2, 2), (0, 1), nn.ConstantPad3d)
+		# input size < pad size
+        helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
 
     # Test stack forward
     def test_stack(self):

From 314d31699bdcbf0853f3f9c6885e2820e355fe18 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 14 Dec 2022 15:15:33 -0800
Subject: [PATCH 1855/1922] Inverse op fallback (op/inverse) (#207)

---
 aten/src/ATen/native/mps/operations/Inverse.mm | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Inverse.mm b/aten/src/ATen/native/mps/operations/Inverse.mm
index 63a7f0376bb2d..ca028a1a864b8 100644
--- a/aten/src/ATen/native/mps/operations/Inverse.mm
+++ b/aten/src/ATen/native/mps/operations/Inverse.mm
@@ -7,9 +7,17 @@
 namespace at {
 namespace native {
 
-TORCH_IMPL_FUNC(linalg_inv_ex_out_mps)(const Tensor& A, bool check_errors, const Tensor& result, const Tensor& info)
-{
+TORCH_IMPL_FUNC(linalg_inv_ex_out_mps)(const Tensor& A, bool check_errors, const Tensor& result, const Tensor& info) {
     TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
+    if (!is_macos_13_or_newer()) {
+        TORCH_WARN_ONCE("torch.linalg_inv_ex.inverse is supported by MPS on MacOS 13+, please upgrade. Falling back to CPU.");
+        auto cpu_info = at::empty({0}, kInt, c10::nullopt, kCPU, c10::nullopt, c10::nullopt);
+        auto cpu_result = result.clone().to("cpu");
+        at::linalg_inv_ex_out(cpu_result, cpu_info, A.to("cpu"));
+        info.copy_(cpu_info);
+        result.copy_(cpu_result);
+        return;
+    }
 
     using namespace mps;
     MPSStream* stream = getCurrentMPSStream();
@@ -39,7 +47,6 @@
             MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
 
                 CachedGraph *newCachedGraph = nil;
-                
                 @autoreleasepool {
                     MPSGraph* mpsGraph = make_mps_graph();
                     newCachedGraph = new CachedGraph(mpsGraph);

From fd1139e5cd465bac6b694840b929db62a70fa7e3 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 14 Dec 2022 21:10:55 -0500
Subject: [PATCH 1856/1922] Display error for int64 pooling inputs on Monterey
 (#208)

* Display error for int64 pooling inputs on Monterey
Also skip the test on TestConsistency when such errors are generated
---
 aten/src/ATen/native/mps/operations/Pooling.mm | 4 ++++
 test/test_mps.py                               | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm
index 97909840d514b..89f8c22ee96a4 100644
--- a/aten/src/ATen/native/mps/operations/Pooling.mm
+++ b/aten/src/ATen/native/mps/operations/Pooling.mm
@@ -32,6 +32,10 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
   if (input.numel() == 0)
     return;
 
+  if (!is_macos_13_or_newer()) {
+    TORCH_CHECK(input.scalar_type() != ScalarType::Long,
+                "MPS: ", op_name, " op with int64 input is supported natively starting from macOS 13.0.");
+  }
   const int64_t ndims = input.ndimension();
   const Tensor& grad_output = *(at::borrow_from_optional_tensor(grad_output_opt));
   const Tensor& indices = *(at::borrow_from_optional_tensor(indices_opt));
diff --git a/test/test_mps.py b/test/test_mps.py
index 17bb6af71e052..3aeeb4c4a1f26 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8769,6 +8769,9 @@ def get_samples():
                 self.assertEqual(cpu_out, mps_out, atol=atol, rtol=rtol)
 
             except Exception as e:
+                if any(s in str(e).lower() for s in ["int64", "macos 13"]):
+                  self.skipTest(f"{str(e)}")
+
                 if not generate_new_truth:
                     raise e
                 forward_failed = True

From 92ec9a6ef199a762050387fa21b9f9b8ae77a456 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Thu, 15 Dec 2022 10:01:49 -0800
Subject: [PATCH 1857/1922] Move as_strided_scatter to random output blocklist
 (#198)

* move as_strided_scatter to random output blocklist, as dup update indices may result in non-deterministic result

* re-integrate
---
 test/test_mps.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 3aeeb4c4a1f26..3d3beba2119b9 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8606,8 +8606,6 @@ class TestConsistency(TestCase):
         'min': [torch.int64],
         'max': [torch.int64],
 
-        'as_strided_scatter': [torch.uint8],
-
         # Functions that are flaky
         # These are detected as "ok" by the expect case but actually fail to run sometimes
         'softmaxwith_dtype': None,
@@ -8624,7 +8622,6 @@ class TestConsistency(TestCase):
         'take_along_dim': None,
 
         # New block list ops that need investigation
-        'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'nn.functional.bilinear': ['torch.float32'],
         'nn.functional.conv_transpose2d': ['torch.float32'],
         'nn.functional.interpolate': ['torch.float32'],
@@ -8681,6 +8678,8 @@ class TestConsistency(TestCase):
          # these fill tensors with uninitialized data, causing mismatch with CPU
         'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        # problem 103190467, as_strided_scatter has non-deterministic behavior when the update indices are not unique
+        'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 
         # failures due to precision issues
         'masked.var': ['f16'],

From 21833f2367c0c9426b4e023e976c9459c0830178 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 15 Dec 2022 17:38:40 -0500
Subject: [PATCH 1858/1922] Fix the crash in sgn_out_mps() (#209)

---
 aten/src/ATen/native/mps/operations/UnaryOps.mm | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 9729a2e3d4ffb..e08f4cc4e0ead 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -283,6 +283,9 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
 {
     using namespace mps;
 
+    if (self.numel() == 0)
+      return;
+
     if (!output.is_same_size(self)) {
       output.resize_(self.sizes());
     }
@@ -313,7 +316,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
             MPSGraph* mpsGraph = make_mps_graph();
             newCachedGraph = new MPSUnaryCachedGraph(mpsGraph);
             newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, realInput);
-              MPSGraphTensor* sgnTensor;
+              MPSGraphTensor* sgnTensor = nullptr;
               if (self.is_complex()) {
                 NSArray<MPSGraphTensor*>* complexNumberComponents = [mpsGraph splitTensor:newCachedGraph->inputTensor_
                                                               numSplits: 2
@@ -329,7 +332,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
 
                 MPSGraphTensor* complexZeroTensor = [mpsGraph constantWithScalar:0.0
                                                               shape: newCachedGraph->inputTensor_.shape
-                                                              dataType:realPartTensor.dataType];                
+                                                              dataType:realPartTensor.dataType];
 
                 MPSGraphTensor* isRealZero = [mpsGraph equalWithPrimaryTensor:realPartTensor
                                                               secondaryTensor:zeroTensor

From 807e19238c52e22fc09d8175f4c6a3c41631eb17 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 15 Dec 2022 17:40:11 -0500
Subject: [PATCH 1859/1922] Fix the crash in max_out_mps() caused by cached key
 conflict (#211)

The shape of input and indices tensors were missing in the cached key
---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index a4bbbc4d895cd..1063cdb1844fe 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -1433,7 +1433,7 @@ Tensor min_mps(const Tensor& input_t) {
     auto stream = at::mps::getCurrentMPSStream();
 
     @autoreleasepool {
-        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getMPSTypeString(input_t.scalar_type());
+        string key = func_name + native_mps::getTensorsStringKey({input_t, indices_t}) + ":" + to_string(dim_);
         CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {

From a24e1a18f6a7a6a726a6138e92c38fccd4e4075e Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 15 Dec 2022 17:46:26 -0500
Subject: [PATCH 1860/1922] Replace assert() with torch_check() in reduce ops
 (#210)

Also move some expected int64 errors from block list to allow list
---
 .../ATen/native/mps/operations/ReduceOps.mm   |  8 +++---
 test/test_mps.py                              | 25 ++++++-------------
 2 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 1063cdb1844fe..29527af5f6919 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -1294,7 +1294,7 @@ Tensor std_mps(
   (const Tensor& input_t,
    MPSReductionType reduction_type,
    const std::string& func_name) {
-  TORCH_INTERNAL_ASSERT(input_t.scalar_type() != ScalarType::Long, "min/max not supported for Long dtype on MPS");
+  TORCH_CHECK(input_t.scalar_type() != ScalarType::Long, "MPS does not support min/max ops with int64 input");
 
   namespace native_mps = at::native::mps;
   using CachedGraph = native_mps::MPSUnaryCachedGraph;
@@ -1389,7 +1389,7 @@ Tensor min_mps(const Tensor& input_t) {
   const Tensor& indices_t,
   MPSReductionType reduction_type,
   const std::string& func_name) {
-    TORCH_INTERNAL_ASSERT(input_t.scalar_type() != ScalarType::Long, "min/max not supported for Long dtype on MPS");
+    TORCH_CHECK(input_t.scalar_type() != ScalarType::Long, "MPS does not support min/max ops with int64 input");
 
     namespace native_mps = at::native::mps;
 
@@ -1796,7 +1796,7 @@ Tensor median_mps(const Tensor& input_t) {
         return at::median(input_t.to("cpu"));
   }
 
-    TORCH_INTERNAL_ASSERT(input_t.scalar_type() != ScalarType::Long, "median not supported for Long dtype on MPS");
+    TORCH_CHECK(input_t.scalar_type() != ScalarType::Long, "MPS does not support median op with int64 input");
 
     namespace native_mps = at::native::mps;
     using CachedGraph = native_mps::MPSUnaryCachedGraph;
@@ -2020,7 +2020,7 @@ Tensor median_mps(const Tensor& input_t) {
     at::Tensor & values,
     at::Tensor & indices){
 
-  TORCH_INTERNAL_ASSERT(input_t.scalar_type() != ScalarType::Long, "median not supported for Long dtype on MPS");
+  TORCH_CHECK(input_t.scalar_type() != ScalarType::Long, "MPS does not support median ops with int64 input");
 
   namespace native_mps = at::native::mps;
     int64_t dim_ = maybe_wrap_dim(dim, input_t.dim());
diff --git a/test/test_mps.py b/test/test_mps.py
index 3d3beba2119b9..0f074fbe88fcd 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8142,7 +8142,7 @@ class TestConsistency(TestCase):
         'asin': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'asinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'atan': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'atan2': ['f32'],
+        'atan2': ['f32', 'i64'],
         'atanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'atleast_1d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_2d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8247,12 +8247,12 @@ class TestConsistency(TestCase):
         'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'matmul': ['f32'],
         'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'max': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'maxreduction_no_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'maxbinary': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'minimum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'min': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'min': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'minreduction_with_dim': ['f16', 'f32', 'i32'],
         'minreduction_no_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
         'minbinary': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
@@ -8295,6 +8295,7 @@ class TestConsistency(TestCase):
         'nn.functional.max_pool2d': ['f32'],
         'nn.functional.mse_loss': ['f16', 'f32'],
         'nn.functional.normalize': ['f32'],
+        'nn.functional.one_hot': ['i64'],
         'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.padcircular': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.padconstant': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8313,7 +8314,7 @@ class TestConsistency(TestCase):
         'nn.functional.soft_margin_loss': ['f32'],
         'nn.functional.softmin': ['f32'],
         'nn.functional.softplus': ['f32'],
-        'nn.functional.softsign': ['f16', 'f32', 'i16', 'u8'],
+        'nn.functional.softsign': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'u8'],
         'nn.functional.threshold': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8341,7 +8342,7 @@ class TestConsistency(TestCase):
         'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'short': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'],
         'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
@@ -8595,17 +8596,7 @@ class TestConsistency(TestCase):
         'norm': [torch.float16],
         'minreduction_with_dim': [torch.bool, torch.int16, torch.uint8],
         'maxreduction_with_dim': [torch.bool, torch.int16, torch.uint8],
-
-        # MPS kernels with no support for int64 inputs
-        'atan2': [torch.int64],
-        'pow': [torch.int64],
-        'sigmoid': [torch.int64],
         'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'dot': [torch.int64],
-        'nn.functional.one_hot': [torch.int64],
-        'min': [torch.int64],
-        'max': [torch.int64],
-
         # Functions that are flaky
         # These are detected as "ok" by the expect case but actually fail to run sometimes
         'softmaxwith_dtype': None,
@@ -8653,8 +8644,8 @@ class TestConsistency(TestCase):
         'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
 
         # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
-        'pow': ['torch.int16', 'torch.int32', 'torch.uint8'],
-        '__rpow__': ['torch.int16', 'torch.int32', 'torch.uint8'],
+        'pow': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        '__rpow__': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 
         # failures due to unsupported data types on MPS backend
         'matmul': ['torch.uint8'], # MPS device does not support mm for non-float inputs

From 334a1e26e131e9fcfe34cd64c98bec98ec0fd361 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Thu, 15 Dec 2022 21:23:40 -0800
Subject: [PATCH 1861/1922] Fix ops with bool issues in macOS Monterey (#205)

* Fix bool op issues in macOS Monterey

* Fix gather_out_mps

* Fix boolean inputs for binary ops
---
 aten/src/ATen/native/mps/OperationUtils.h     |   2 +-
 aten/src/ATen/native/mps/OperationUtils.mm    |   4 +-
 .../ATen/native/mps/operations/BinaryOps.mm   |  53 ++-
 .../ATen/native/mps/operations/Indexing.mm    |  39 ++-
 aten/src/ATen/native/mps/operations/Repeat.mm |  18 +-
 .../native/mps/operations/ScatterGather.mm    |  10 +-
 aten/src/ATen/native/mps/operations/Shape.mm  |  21 +-
 .../ATen/native/mps/operations/UnaryOps.mm    | 303 +++++++++---------
 aten/src/ATen/native/mps/operations/View.mm   |  56 ++--
 9 files changed, 286 insertions(+), 220 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 458a34d0af90f..343a335217352 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -89,7 +89,7 @@ MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor);
 MPSGraphTensor* convertNHWCtoNCHW(MPSGraph *mpsGraph, MPSGraphTensor* tensor);
 MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, ScalarType toType);
 MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const Tensor& tensor);
-MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar);
+MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar, MPSDataType dataType = MPSDataTypeInvalid);
 
 MPSGraph* make_mps_graph();
 void printTensorNDArray(const Tensor& t);
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index aa1344ffaf795..ffb5ddf490267 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -307,14 +307,14 @@ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type) {
   }
 }
 
-MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar) {
+MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar, MPSDataType dataType) {
   MPSGraphTensorData *result = nullptr;
   // Scalar pools are only supported on devices with unified memory
   if (mpsStream->device().hasUnifiedMemory) {
     scalar.buffer = at::mps::allocate_scalar_buffer(&scalar.value, scalar.size);
     result = [[[MPSGraphTensorData alloc] initWithMTLBuffer: scalar.getMTLBuffer()
                                                       shape: @[@1]
-                                                   dataType: getMPSScalarType(scalar.type)] autorelease];
+                                                   dataType: (dataType != MPSDataTypeInvalid) ? dataType : getMPSScalarType(scalar.type)] autorelease];
   } else {
     MPSNDArrayDescriptor *tensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:getMPSScalarType(scalar.type) shape:@[@1]];
     MPSNDArray *tensorNDArray = [[[MPSNDArray alloc] initWithDevice:mpsStream->device() descriptor:tensorDesc] autorelease];
diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index e6d25b82eae3b..7efb646d72175 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -54,6 +54,21 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
     needsCopyToOutput = true;
   }
 
+  auto inputDataType = self.scalar_type();
+  auto otherDataType = other.scalar_type();
+  auto outputDataType = output_.scalar_type();
+  if (!is_macos_13_or_newer()) {
+    if (self.scalar_type() == kBool) {
+      inputDataType = kChar;
+    }
+    if (other.scalar_type() == kBool) {
+      otherDataType = kChar;
+    }
+    if (output.scalar_type() == kBool) {
+      outputDataType = kChar;
+    }
+  }
+
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
   @autoreleasepool {
     string key = op_name + getTensorsStringKey({self, other, output_}, /*use_scalar_value*/ false);
@@ -65,36 +80,37 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
         @autoreleasepool {
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new BinaryOpCachedGraph(mpsGraph);
-          newCachedGraph->primaryTensor   = mpsGraphRankedPlaceHolder(mpsGraph, self);
-          newCachedGraph->secondaryTensor = mpsGraphRankedPlaceHolder(mpsGraph, other);
+          newCachedGraph->primaryTensor   = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(inputDataType), getMPSShape(self));
+          newCachedGraph->secondaryTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(otherDataType), getMPSShape(other));
 
           MPSGraphTensor* primaryCastTensor   = newCachedGraph->primaryTensor;
           MPSGraphTensor* secondaryCastTensor = newCachedGraph->secondaryTensor;
 
           // this type inference is only required at the time of graph creation
-          ScalarType common_dtype = c10::promoteTypes(self.scalar_type(), other.scalar_type());
+          ScalarType common_dtype = c10::promoteTypes(inputDataType, otherDataType);
           if (isIntegralType(common_dtype, true)) {
             // integer inputs must be cast to float, if output is float
-            if (isFloatingType(output_.scalar_type())) {
-              common_dtype = output_.scalar_type();
+            if (isFloatingType(outputDataType)) {
+              common_dtype = outputDataType;
             // in boolean comparison ops with signed vs. unsigned integers, we always cast to the unsigned type
-            } else if (output_.scalar_type() == ScalarType::Bool &&
-                      (self.scalar_type()  == ScalarType::Byte ||
-                       other.scalar_type() == ScalarType::Byte)) {
+            } else if (outputDataType == ScalarType::Bool &&
+                      (inputDataType == ScalarType::Byte ||
+                       otherDataType == ScalarType::Byte)) {
               common_dtype = ScalarType::Byte;
             }
           }
-          if (self.scalar_type() != common_dtype) {
+          if (inputDataType != common_dtype) {
             primaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->primaryTensor, common_dtype);
           }
-          if (other.scalar_type() != common_dtype) {
+          if (otherDataType != common_dtype) {
             secondaryCastTensor = castMPSTensor(mpsGraph, newCachedGraph->secondaryTensor, common_dtype);
           }
           newCachedGraph->outputTensor = binaryBlock(newCachedGraph, primaryCastTensor, secondaryCastTensor);
           // Cast output tensor to an expected type if needed, which addresses discrepancy when int64 scalar is added to int32 tensor
           // Output tensor should have been promoted but it remains an int32 tensor
-          if (output_.scalar_type() != common_dtype) {
-            newCachedGraph->outputTensor = castMPSTensor(mpsGraph, newCachedGraph->outputTensor, output_.scalar_type());
+          if (outputDataType != common_dtype ||
+             [newCachedGraph->outputTensor dataType] != getMPSDataType(outputDataType)) {
+            newCachedGraph->outputTensor = castMPSTensor(mpsGraph, newCachedGraph->outputTensor, outputDataType);
           }
         }
         return newCachedGraph;
@@ -111,16 +127,18 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
 
     if (is_self_scalar && !self.is_mps()) {
       self_scalar = getMPSScalar(self.item(), self.scalar_type());
-      feeds[cachedGraph->primaryTensor] = getMPSGraphTensorFromScalar(mpsStream, self_scalar);
+      feeds[cachedGraph->primaryTensor] = getMPSGraphTensorFromScalar(mpsStream, self_scalar, getMPSScalarType(inputDataType));
     } else {
-      selfPlaceholder = Placeholder(cachedGraph->primaryTensor, self);
+      selfPlaceholder = Placeholder(
+        cachedGraph->primaryTensor, self,  /*mpsShape*/nil, /*gatherTensorData=*/true, getMPSScalarType(inputDataType));
       feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData();
     }
     if (is_other_scalar && !other.is_mps()) {
       other_scalar = getMPSScalar(other.item(), other.scalar_type());
-      feeds[cachedGraph->secondaryTensor] = getMPSGraphTensorFromScalar(mpsStream, other_scalar);
+      feeds[cachedGraph->secondaryTensor] = getMPSGraphTensorFromScalar(mpsStream, other_scalar, getMPSScalarType(otherDataType));
     } else {
-      otherPlaceholder = Placeholder(cachedGraph->secondaryTensor, other);
+      otherPlaceholder = Placeholder(
+        cachedGraph->secondaryTensor, other,  /*mpsShape*/nil, /*gatherTensorData=*/true, getMPSScalarType(otherDataType));
       feeds[otherPlaceholder.getMPSGraphTensor()] = otherPlaceholder.getMPSGraphTensorData();
     }
 
@@ -130,7 +148,8 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
       feeds[cachedGraph->alphaTensor] = getMPSGraphTensorFromScalar(mpsStream, alpha_scalar);
     }
 
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, needsCopyToOutput ? output : output_);
+    Placeholder outputPlaceholder = Placeholder(
+      cachedGraph->outputTensor, needsCopyToOutput ? output : output_,  /*mpsShape*/nil, /*gatherTensorData=*/false, getMPSScalarType(outputDataType));
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index cdcbe4cae6117..1d5a6aceb05b3 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -440,7 +440,16 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
   using CachedGraph = mps::MPSUnaryCachedGraph;
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-
+  MPSDataType inputDataType = getMPSScalarType(self.scalar_type());
+  MPSDataType outputDataType = getMPSScalarType(self.scalar_type());
+  if (!is_macos_13_or_newer()) {
+     if (self.scalar_type() == kBool) {
+      inputDataType = MPSDataTypeInt8;
+     }
+     if (result.scalar_type() == kBool) {
+      outputDataType = MPSDataTypeInt8;
+     }
+  }
   @autoreleasepool {
     NSString* ns_dims_key = [[ns_dims valueForKey:@"description"] componentsJoinedByString:@","];
     // A key is used to identify the MPSGraph which was created once, and can be reused if the parameters, data types etc match the earlier created MPSGraph
@@ -455,7 +464,7 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputDataType, getMPSShape(self));
           MPSGraphTensor* outputTensor = [mpsGraph reverseTensor:inputTensor
                                                             axes:ns_dims
                                                             name:nil];
@@ -467,8 +476,10 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
     }
 
     // Create placeholders which use the keys of the CachedGraph to create inputs and outputs of the operation
-    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result);
+    Placeholder inputPlaceholder = Placeholder(
+      cachedGraph->inputTensor_, self, /*mpsShape*/nil, /*gatherTensorData=*/true, inputDataType);
+    Placeholder outputPlaceholder = Placeholder(
+      cachedGraph->outputTensor_, result, /*mpsShape*/nil, /*gatherTensorData=*/false, outputDataType);
 
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -635,9 +646,16 @@ Tensor index_select_mps(const Tensor & self,
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
   auto inputType = getMPSDataType(self.scalar_type());
-  if (inputType ==  MPSDataTypeUInt8) {
-      inputType =  MPSDataTypeInt8;
+  auto outputType = getMPSDataType(output.scalar_type());
+  if (inputType == MPSDataTypeUInt8 ||
+     (!is_macos_13_or_newer() && inputType == MPSDataTypeBool)) {
+    inputType = MPSDataTypeInt8;
+  }
+  if (outputType == MPSDataTypeUInt8 ||
+     (!is_macos_13_or_newer() && outputType == MPSDataTypeBool)) {
+    outputType = MPSDataTypeInt8;
   }
+
   @autoreleasepool {
 
     string key = "index_select_out_mps" + getTensorsStringKey({self, index}) + ":" + std::to_string(dim);
@@ -672,7 +690,7 @@ Tensor index_select_mps(const Tensor & self,
                                   /*mpsShape=*/nullptr, /*gatherTensorData=*/true, /*dataType=*/inputType);
     Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index);
     Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output,
-                                  /*mpsShape=*/nullptr, /*gatherTensorData=*/false, /*dataType=*/inputType);
+                                  /*mpsShape=*/nullptr, /*gatherTensorData=*/false, /*dataType=*/outputType);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
@@ -764,10 +782,11 @@ Tensor index_select_mps(const Tensor & self,
     }
 
     Placeholder selfPlaceholder   = Placeholder(
-      cachedGraph->inputTensor_, self, /*mpsShape*/nullptr, /*gatherTensorData=*/true, inputDataType);
+      cachedGraph->inputTensor_, self, /*mpsShape*/nil, /*gatherTensorData=*/true, inputDataType);
     Placeholder maskPlaceholder   = Placeholder(
-      cachedGraph->maskTensor_, *b_mask, /*mpsShape*/nullptr, /*gatherTensorData=*/true, maskDataType);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, self);
+      cachedGraph->maskTensor_, *b_mask, /*mpsShape*/nil, /*gatherTensorData=*/true, maskDataType);
+    Placeholder outputPlaceholder = Placeholder(
+      cachedGraph->outputTensor_, self, /*mpsShape*/nil, /*gatherTensorData=*/false, inputDataType);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm
index 6e52dbe89c53b..6dd041c542b68 100644
--- a/aten/src/ATen/native/mps/operations/Repeat.mm
+++ b/aten/src/ATen/native/mps/operations/Repeat.mm
@@ -72,6 +72,16 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
   }
 
   auto stream = at::mps::getCurrentMPSStream();
+  auto inputDataType = getMPSDataType(expanded_tensor.scalar_type());
+  auto outputDataType = getMPSDataType(result.scalar_type());
+  if (!is_macos_13_or_newer()) {
+     if (expanded_tensor.scalar_type() == kBool) {
+      inputDataType = MPSDataTypeInt8;
+     }
+     if (result.scalar_type() == kBool) {
+      outputDataType = MPSDataTypeInt8;
+     }
+  }
 
   @autoreleasepool {
     string key = "repeat_mps:" + getTensorsStringKey(self) + ":" + getArrayRefString(repeats);
@@ -85,7 +95,7 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
           MPSGraph* mpsGraph = make_mps_graph();
           newCachedGraph = new CachedGraph(mpsGraph);
 
-          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, expanded_tensor);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputDataType, getMPSShape(expanded_tensor));
           MPSGraphTensor* outputTensor = [mpsGraph tileTensor:inputTensor
                                                withMultiplier:getMPSShape(repeats)
                                                          name:nil];
@@ -98,8 +108,10 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) {
       cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
-    Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, expanded_tensor);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result);
+    Placeholder selfPlaceholder = Placeholder(
+      cachedGraph->inputTensor_, expanded_tensor, /*mpsShape=*/nil, /*gatherTensorData=*/true, inputDataType);
+    Placeholder outputPlaceholder = Placeholder(
+      cachedGraph->outputTensor_, result, /*mpsShape=*/nil, /*gatherTensorData*/false, outputDataType);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
diff --git a/aten/src/ATen/native/mps/operations/ScatterGather.mm b/aten/src/ATen/native/mps/operations/ScatterGather.mm
index 1871e55739235..fca74314a4fd9 100644
--- a/aten/src/ATen/native/mps/operations/ScatterGather.mm
+++ b/aten/src/ATen/native/mps/operations/ScatterGather.mm
@@ -53,8 +53,12 @@
         needSlice = true;
     }
     auto input_type = getMPSDataType(self.scalar_type());
-    if (input_type ==  MPSDataTypeUInt8) {
-      input_type =  MPSDataTypeInt8;
+    auto output_type = getMPSDataType(output.scalar_type());
+    if (input_type == MPSDataTypeUInt8 || ((input_type ==  MPSDataTypeBool && !is_macos_13_or_newer()))) {
+      input_type = MPSDataTypeInt8;
+    }
+    if (output_type == MPSDataTypeUInt8 || ((output_type ==  MPSDataTypeBool && !is_macos_13_or_newer()))) {
+      output_type = MPSDataTypeInt8;
     }
     string key = "gather_out_mps" + getTensorsStringKey({self, index, output}) + ":" + std::to_string(dim);
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
@@ -112,7 +116,7 @@
 
     Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, input_shape, true, input_type);
     Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nullptr, false, input_type);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nullptr, false, output_type);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm
index 858767007c6db..86e82bef93cbf 100644
--- a/aten/src/ATen/native/mps/operations/Shape.mm
+++ b/aten/src/ATen/native/mps/operations/Shape.mm
@@ -375,7 +375,12 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
           newCachedGraph->inputTensors_.reserve(len_tensor_array);
 
           for (const auto idx : c10::irange(len_tensor_array)) {
-            newCachedGraph->inputTensors_[idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input_tensors[idx].scalar_type()));
+            auto scalar_type = getMPSScalarType(input_tensors[idx].scalar_type());
+            if (input_tensors[idx].scalar_type() == kBool) {
+              scalar_type = MPSDataTypeInt8;
+            }
+
+            newCachedGraph->inputTensors_[idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, scalar_type);
             if (input_tensors[idx].scalar_type() != result_type(inputs)) {
               castInputTensors[idx] = [mpsGraph castTensor:newCachedGraph->inputTensors_[idx]
                                                     toType:getMPSDataType(result_type(inputs))
@@ -407,14 +412,24 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
     int t_idx = 0;
     for(const Tensor& tensor : materialized_inputs) {
       if(std::find(skipped_tensor_indices.begin(), skipped_tensor_indices.end(), i) == skipped_tensor_indices.end()) {
-        Placeholder currentInputPlaceholder = Placeholder(cachedGraph->inputTensors_[t_idx], tensor);
+        auto scalar_type = getMPSScalarType(tensor.scalar_type());
+        if (tensor.scalar_type() == kBool) {
+          scalar_type = MPSDataTypeInt8;
+        }
+        Placeholder currentInputPlaceholder = Placeholder(
+          cachedGraph->inputTensors_[t_idx], tensor, /*mpsShape=*/nil, /*gatherTensorData=*/true, scalar_type);
         inputPlaceholders.push_back(currentInputPlaceholder);
         t_idx++;
       }
       i++;
     }
 
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out);
+    auto outputDataType = getMPSScalarType(out.scalar_type());
+    if (!is_macos_13_or_newer() && out.scalar_type() == kBool) {
+      outputDataType = MPSDataTypeInt8;
+    }
+    Placeholder outputPlaceholder = Placeholder(
+      cachedGraph->outputTensor_, out, /*mpsShape=*/nil, /*gatherTensorData=*/false, outputDataType);
 
     NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease];
     for (int i = 0; i < inputPlaceholders.size(); i++) {
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index e08f4cc4e0ead..cb3e40299bceb 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -281,160 +281,171 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
 
 TORCH_IMPL_FUNC(sgn_out_mps) (const Tensor& self, const Tensor& output)
 {
-    using namespace mps;
+  using namespace mps;
 
-    if (self.numel() == 0)
-      return;
+  if (self.numel() == 0) {
+    return;
+  }
 
-    if (!output.is_same_size(self)) {
-      output.resize_(self.sizes());
-    }
+  if (!output.is_same_size(self)) {
+    output.resize_(self.sizes());
+  }
 
-    string graphSuffix = "_real";
-    Tensor realInput;
-    Tensor realOutput;
-    Tensor flatInput = self.flatten();
-    Tensor flatOutput = output.flatten();
-    if (self.is_complex()) {
-      realInput = at::view_as_real(flatInput);
-      realOutput = at::view_as_real(flatOutput);
-      graphSuffix = "_complex";
-    } else {
-      realInput = flatInput;
-      realOutput = flatOutput;
-    }
+  string graphSuffix = "_real";
+  Tensor realInput;
+  Tensor realOutput;
+  Tensor flatInput = self.flatten();
+  Tensor flatOutput = output.flatten();
+  if (self.is_complex()) {
+    realInput = at::view_as_real(flatInput);
+    realOutput = at::view_as_real(flatOutput);
+    graphSuffix = "_complex";
+  } else {
+    realInput = flatInput;
+    realOutput = flatOutput;
+  }
 
-    MPSGraphCache* cache_ = MPSGraphCache::getInstance();
-    @autoreleasepool {
-      string key = string("sgn_out_mps") + getTensorsStringKey({realInput}) + graphSuffix;
-      auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
-
-      if(!cachedGraph) {
-        MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () {
-          MPSUnaryCachedGraph *newCachedGraph = nil;
-          @autoreleasepool {
-            MPSGraph* mpsGraph = make_mps_graph();
-            newCachedGraph = new MPSUnaryCachedGraph(mpsGraph);
-            newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, realInput);
-              MPSGraphTensor* sgnTensor = nullptr;
-              if (self.is_complex()) {
-                NSArray<MPSGraphTensor*>* complexNumberComponents = [mpsGraph splitTensor:newCachedGraph->inputTensor_
-                                                              numSplits: 2
-                                                              axis: 1
-                                                              name: nil];
-
-                MPSGraphTensor* realPartTensor = complexNumberComponents[0];
-                MPSGraphTensor* imaginaryPartTensor = complexNumberComponents[1];
-
-                MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
-                                                              shape:realPartTensor.shape
-                                                              dataType:realPartTensor.dataType];
-
-                MPSGraphTensor* complexZeroTensor = [mpsGraph constantWithScalar:0.0
-                                                              shape: newCachedGraph->inputTensor_.shape
-                                                              dataType:realPartTensor.dataType];
-
-                MPSGraphTensor* isRealZero = [mpsGraph equalWithPrimaryTensor:realPartTensor
-                                                              secondaryTensor:zeroTensor
-                                                              name: nil];
-
-                MPSGraphTensor* isImaginaryZero = [mpsGraph equalWithPrimaryTensor:imaginaryPartTensor
-                                                              secondaryTensor:zeroTensor
-                                                              name: nil];
-
-                MPSGraphTensor* isComplexZero = [mpsGraph logicalANDWithPrimaryTensor:isRealZero
-                                                              secondaryTensor:isImaginaryZero
-                                                              name: nil];
-
-                MPSGraphTensor* sgnDenomReal = [mpsGraph squareWithTensor:realPartTensor
-                                                              name: nil];
-
-                MPSGraphTensor* sgnDenomImaginary = [mpsGraph squareWithTensor:imaginaryPartTensor
-                                                              name: nil];
-
-                MPSGraphTensor* sgnDenomSum = [mpsGraph additionWithPrimaryTensor:sgnDenomReal
-                                                              secondaryTensor:sgnDenomImaginary
-                                                              name: nil];
-
-                MPSGraphTensor* sgnDenom = [mpsGraph squareRootWithTensor:sgnDenomSum
-                                                              name: nil];
-
-                MPSGraphTensor* sgnRealTensor = [mpsGraph divisionWithPrimaryTensor:realPartTensor
-                                                              secondaryTensor:sgnDenom
-                                                              name: nil];
-
-                MPSGraphTensor* sgnImaginaryTensor = [mpsGraph divisionWithPrimaryTensor:imaginaryPartTensor
-                                                              secondaryTensor:sgnDenom
-                                                              name: nil];
-
-                MPSGraphTensor* sgnComplexTensor = [mpsGraph concatTensors:@[sgnRealTensor, sgnImaginaryTensor]
-                                                              dimension: 1
-                                                              name: nil];
-
-                sgnTensor = [mpsGraph selectWithPredicateTensor:isComplexZero
-                                                              truePredicateTensor:complexZeroTensor
-                                                              falsePredicateTensor:sgnComplexTensor
-                                                              name:nil];
-              } else {
-                MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0
-                                                              shape:newCachedGraph->inputTensor_.shape
-                                                              dataType:mps::getMPSDataType(self.scalar_type())];
-
-                MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1
-                                                              shape:newCachedGraph->inputTensor_.shape
-                                                              dataType:mps::getMPSDataType(self.scalar_type())];
-
-                MPSGraphTensor* negativeOneTensor = [mpsGraph constantWithScalar:-1
-                                                              shape:newCachedGraph->inputTensor_.shape
-                                                              dataType:mps::getMPSDataType(self.scalar_type())];
-
-                MPSGraphTensor* isPositive = [mpsGraph greaterThanWithPrimaryTensor:newCachedGraph->inputTensor_
-                                                              secondaryTensor:zeroTensor
-                                                              name: nil];
-
-                MPSGraphTensor* isNegative = [mpsGraph lessThanWithPrimaryTensor:newCachedGraph->inputTensor_
-                                                              secondaryTensor:zeroTensor
-                                                              name: nil];
-
-                MPSGraphTensor* notPositiveTensor = [mpsGraph selectWithPredicateTensor:isNegative
-                                                              truePredicateTensor:negativeOneTensor
-                                                              falsePredicateTensor:zeroTensor
-                                                              name:nil];
+  MPSDataType selfDataType = getMPSScalarType(self.scalar_type());
+  // Workaround for `constantWithScalar` crashes due to unsupported bool data type
+  // The issue is fixed in macOS Ventura (13.0)
+  if (!is_macos_13_or_newer()) {
+     if (self.scalar_type() == kBool) {
+      selfDataType = MPSDataTypeInt8;
+     }
+  }
 
-                sgnTensor = [mpsGraph selectWithPredicateTensor:isPositive
-                                                              truePredicateTensor:oneTensor
-                                                              falsePredicateTensor:notPositiveTensor
-                                                              name:nil];
-              }
-              newCachedGraph->outputTensor_ = sgnTensor;
-          }
-          return newCachedGraph;
-        });
-        cachedGraph = tmpCachedGraph->as<MPSUnaryCachedGraph>();
-      }
-
-      Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, realInput);
-      Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, realOutput);
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-        selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
-      };
-      NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-        outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
-      };
-      runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+  @autoreleasepool {
+    string key = string("sgn_out_mps") + getTensorsStringKey({realInput}) + graphSuffix;
+    auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
+
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph* () {
+        MPSUnaryCachedGraph *newCachedGraph = nil;
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new MPSUnaryCachedGraph(mpsGraph);
+          newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, selfDataType, getMPSShape(realInput));
+            MPSGraphTensor* sgnTensor;
+            if (self.is_complex()) {
+              NSArray<MPSGraphTensor*>* complexNumberComponents = [mpsGraph splitTensor:newCachedGraph->inputTensor_
+                                                            numSplits: 2
+                                                            axis: 1
+                                                            name: nil];
+
+              MPSGraphTensor* realPartTensor = complexNumberComponents[0];
+              MPSGraphTensor* imaginaryPartTensor = complexNumberComponents[1];
+
+              MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0
+                                                            shape:realPartTensor.shape
+                                                            dataType:realPartTensor.dataType];
+
+              MPSGraphTensor* complexZeroTensor = [mpsGraph constantWithScalar:0.0
+                                                            shape: newCachedGraph->inputTensor_.shape
+                                                            dataType:realPartTensor.dataType];
+
+              MPSGraphTensor* isRealZero = [mpsGraph equalWithPrimaryTensor:realPartTensor
+                                                            secondaryTensor:zeroTensor
+                                                            name: nil];
+
+              MPSGraphTensor* isImaginaryZero = [mpsGraph equalWithPrimaryTensor:imaginaryPartTensor
+                                                            secondaryTensor:zeroTensor
+                                                            name: nil];
+
+              MPSGraphTensor* isComplexZero = [mpsGraph logicalANDWithPrimaryTensor:isRealZero
+                                                            secondaryTensor:isImaginaryZero
+                                                            name: nil];
+
+              MPSGraphTensor* sgnDenomReal = [mpsGraph squareWithTensor:realPartTensor
+                                                            name: nil];
+
+              MPSGraphTensor* sgnDenomImaginary = [mpsGraph squareWithTensor:imaginaryPartTensor
+                                                            name: nil];
+
+              MPSGraphTensor* sgnDenomSum = [mpsGraph additionWithPrimaryTensor:sgnDenomReal
+                                                            secondaryTensor:sgnDenomImaginary
+                                                            name: nil];
+
+              MPSGraphTensor* sgnDenom = [mpsGraph squareRootWithTensor:sgnDenomSum
+                                                            name: nil];
+
+              MPSGraphTensor* sgnRealTensor = [mpsGraph divisionWithPrimaryTensor:realPartTensor
+                                                            secondaryTensor:sgnDenom
+                                                            name: nil];
+
+              MPSGraphTensor* sgnImaginaryTensor = [mpsGraph divisionWithPrimaryTensor:imaginaryPartTensor
+                                                            secondaryTensor:sgnDenom
+                                                            name: nil];
+
+              MPSGraphTensor* sgnComplexTensor = [mpsGraph concatTensors:@[sgnRealTensor, sgnImaginaryTensor]
+                                                            dimension: 1
+                                                            name: nil];
+
+              sgnTensor = [mpsGraph selectWithPredicateTensor:isComplexZero
+                                                            truePredicateTensor:complexZeroTensor
+                                                            falsePredicateTensor:sgnComplexTensor
+                                                            name:nil];
+            } else {
+              MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0
+                                                            shape:newCachedGraph->inputTensor_.shape
+                                                            dataType:selfDataType];
+
+              MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1
+                                                            shape:newCachedGraph->inputTensor_.shape
+                                                            dataType:selfDataType];
+
+              MPSGraphTensor* negativeOneTensor = [mpsGraph constantWithScalar:-1
+                                                            shape:newCachedGraph->inputTensor_.shape
+                                                            dataType:selfDataType];
+
+              MPSGraphTensor* isPositive = [mpsGraph greaterThanWithPrimaryTensor:newCachedGraph->inputTensor_
+                                                            secondaryTensor:zeroTensor
+                                                            name: nil];
+
+              MPSGraphTensor* isNegative = [mpsGraph lessThanWithPrimaryTensor:newCachedGraph->inputTensor_
+                                                            secondaryTensor:zeroTensor
+                                                            name: nil];
+
+              MPSGraphTensor* notPositiveTensor = [mpsGraph selectWithPredicateTensor:isNegative
+                                                            truePredicateTensor:negativeOneTensor
+                                                            falsePredicateTensor:zeroTensor
+                                                            name:nil];
+
+              sgnTensor = [mpsGraph selectWithPredicateTensor:isPositive
+                                                            truePredicateTensor:oneTensor
+                                                            falsePredicateTensor:notPositiveTensor
+                                                            name:nil];
+            }
+            newCachedGraph->outputTensor_ = sgnTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = tmpCachedGraph->as<MPSUnaryCachedGraph>();
     }
 
-    if (self.is_complex()) {
-      std::vector<long long> realSize = self.sizes().vec();
-      realSize.push_back(2);
+    Placeholder selfPlaceholder = Placeholder(
+      cachedGraph->inputTensor_, realInput, /*mpsShape*/nullptr, /*gatherTensorData=*/true, selfDataType);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, realOutput);
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
+    };
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
+  }
+
+  if (self.is_complex()) {
+    std::vector<long long> realSize = self.sizes().vec();
+    realSize.push_back(2);
 
-      Tensor originalShape = realOutput.reshape(realSize);
-      Tensor complexOutput = at::view_as_complex(originalShape);
-      output.copy_(complexOutput);
-    } else {
-      Tensor originalShape = at::reshape(realOutput, self.sizes());
-      output.copy_(originalShape);
-    }
+    Tensor originalShape = realOutput.reshape(realSize);
+    Tensor complexOutput = at::view_as_complex(originalShape);
+    output.copy_(complexOutput);
+  } else {
+    Tensor originalShape = at::reshape(realOutput, self.sizes());
+    output.copy_(originalShape);
+  }
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 2748267a0175e..957e9cb3d29fb 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -59,9 +59,14 @@
                                                                                shape: inputShape
                                                                             dataType: inputType] autorelease];
     if (needsScatter) {
+      auto updatesType = getMPSScalarType(src.scalar_type());
+      if (updatesType == MPSDataTypeUInt8 || (updatesType == MPSDataTypeBool && !is_macos_13_or_newer())) {
+        updatesType = MPSDataTypeInt8;
+      }
+
       feeds[cachedGraph->updatesTensor] = [[[MPSGraphTensorData alloc] initWithMTLBuffer: (updatesBuffer != nil) ? updatesBuffer : sourceBuffer
                                                                                    shape: getMPSShape(src.numel())
-                                                                                dataType: getMPSDataType(src.scalar_type())] autorelease];
+                                                                                dataType: updatesType] autorelease];
     }
     MPSScalar storageOffsetScalar = getMPSScalar(storage_offset, ScalarType::Int);
     feeds[cachedGraph->storageOffsetTensor] = getMPSGraphTensorFromScalar(stream, storageOffsetScalar);
@@ -71,12 +76,13 @@
       strideScalars[i] = getMPSScalar(strides[i], ScalarType::Int);
       feeds[cachedGraph->strideTensors[i]] = getMPSGraphTensorFromScalar(stream, strideScalars[i]);
     }
-    // Workaround for MPSShaderLibrary bug
-    // TODO: Remove once https://github.com/pytorch/pytorch/issues/82305 is resolved
-    auto outputType = getMPSDataType(output.scalar_type());
-    if (outputType ==  MPSDataTypeUInt8) {
+    // Workaround for MPSShaderLibrary bug in macOS Monterey
+    // This is fixed in macOS Ventura
+    auto outputType = getMPSScalarType(output.scalar_type());
+    if (outputType == MPSDataTypeUInt8 || (outputType ==  MPSDataTypeBool && !is_macos_13_or_newer())) {
         outputType =  MPSDataTypeInt8;
     }
+
     MPSGraphTensorData* outputTensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer: outputBuffer
                                                                                     shape: outputShape
                                                                                  dataType: outputType] autorelease];
@@ -536,7 +542,6 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
 static MPSGraphTensor* chainViewOperation(ViewCachedGraph* cachedGraph, const IntArrayRef& size,
                                           const IntArrayRef& stride, int64_t offset,
                                           const IntArrayRef& base_shape, bool needsScatter,
-                                          const bool needsBoolCast,
                                           MPSGraphTensor* updatesTensor)
 {
   MPSGraph* mpsGraph = cachedGraph->graph();
@@ -579,23 +584,9 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
                                                    name: nil];
     MPSGraphTensor *inputTensor = cachedGraph->inputTensor;
 
-    // Workaround for bool scatter/gather deficiency
-    // See https://github.com/pytorch/pytorch/issues/82663
-    if (needsBoolCast) {
-      inputTensor = [mpsGraph castTensor:inputTensor
-                                  toType:MPSDataTypeInt8
-                                    name:@"Cast away from bool"];
-    }
-
     if (!needsScatter) {
       MPSGraphTensor *outputTensor = asStridedLayer_pattern(mpsGraph, inputTensor, shape_size, size, stride, offset);
-
       if (outputTensor) {
-        if (needsBoolCast) {
-          outputTensor = [mpsGraph castTensor:outputTensor
-                                       toType:MPSDataTypeBool
-                                         name:@"Cast back to bool"];
-        }
         return outputTensor;
       }
     }
@@ -628,14 +619,6 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
                               withShapeTensor: shapeTensor
                                          name: nil];
     }
-
-    // Workaround for bool scatter/gather deficiency
-    // See https://github.com/pytorch/pytorch/issues/82663
-    if (needsBoolCast) {
-      outputTensor = [mpsGraph castTensor:outputTensor
-                                   toType:MPSDataTypeBool
-                                     name:@"Cast back to bool"];
-    }
   }
   return outputTensor;
 }
@@ -691,13 +674,13 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
             MPSGraph* mpsGraph = make_mps_graph();
             MPSGraphTensor* updatesTensor = nil;
             newCachedGraph = new ViewCachedGraph(mpsGraph);
-            // Workaround for MPSShaderLibrary bug
-            // TODO: Remove once https://github.com/pytorch/pytorch/issues/82305 is resolved
+            // Workaround for MPSShaderLibrary bug in macOS Monterey
+            // This is fixed in macOS Ventura
             auto inputType = getMPSScalarType(self.scalar_type());
-            if (inputType ==  MPSDataTypeUInt8) {
-                inputType =  MPSDataTypeInt8;
+            if (inputType == MPSDataTypeUInt8 || (inputType == MPSDataTypeBool && !is_macos_13_or_newer())) {
+                inputType = MPSDataTypeInt8;
             }
-            auto needsBoolCast = inputType == MPSDataTypeBool;
+
             // Self is the input tensor we are creating view of
             newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, inputType, getMPSShape(base_shape));
             newCachedGraph->storageOffsetTensor = mpsGraphRankedPlaceHolder(mpsGraph, MPSDataTypeInt32, @[@1]);
@@ -706,7 +689,10 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
             }
             if (needsScatter) {
               auto updatesType = getMPSScalarType(updates.scalar_type());
-              newCachedGraph->updatesTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, updatesType);
+              if (updatesType == MPSDataTypeUInt8 || (updatesType == MPSDataTypeBool && !is_macos_13_or_newer())) {
+                  updatesType = MPSDataTypeInt8;
+              }
+              newCachedGraph->updatesTensor = mpsGraphRankedPlaceHolder(mpsGraph, updatesType, getMPSShape(self.numel()));
               updatesTensor = newCachedGraph->updatesTensor;
               if (inputType != updatesType) {
                 updatesTensor = [mpsGraph castTensor:updatesTensor
@@ -714,7 +700,7 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
                                                 name:@"castUpdatesTensor"];
               }
             }
-            newCachedGraph->outputTensor = chainViewOperation(newCachedGraph, size, stride, storage_offset, base_shape, needsScatter, needsBoolCast, updatesTensor);
+            newCachedGraph->outputTensor = chainViewOperation(newCachedGraph, size, stride, storage_offset, base_shape, needsScatter, updatesTensor);
         }
         return newCachedGraph;
       }));

From fd50f4660c9fc72ef2a06fffba13f1c37d6b33c8 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 16 Dec 2022 14:21:12 -0500
Subject: [PATCH 1862/1922] Fix the ChannelsLast memory format in cat_out_mps()
 (#212)

* Fix the ChannelsLast memory format in cat_out_mps()
- Shortened the cached graph string key for cat_out_mps()
- Clean up and refactoring

* Fix some indentation and refactoring in cat_out_mps()
---
 aten/src/ATen/native/mps/OperationUtils.h     |   4 +-
 aten/src/ATen/native/mps/OperationUtils.mm    |  25 +--
 .../ATen/native/mps/operations/BinaryOps.mm   |   2 +-
 .../native/mps/operations/PointwiseOps.mm     |   2 +-
 aten/src/ATen/native/mps/operations/Shape.mm  | 146 +++++-------------
 .../ATen/native/mps/operations/UnaryOps.mm    |   2 +-
 test/test_mps.py                              |   2 +-
 7 files changed, 58 insertions(+), 125 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 343a335217352..16ea8791a5319 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -42,12 +42,12 @@ void runMPSGraph(
 MPSDataType getMPSDataType(ScalarType scalar_type);
 MPSDataType getMPSScalarType(ScalarType scalar_type);
 MPSScalar   getMPSScalar(const Scalar& scalar, ScalarType type);
-std::string getMPSTypeString(ScalarType scalar_type);
+std::string getMPSTypeString(ScalarType scalar_type, bool short_name = false);
 std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type);
 NSArray<NSNumber*>* getTensorAxes(const Tensor& t);
 NSArray<NSNumber*>* getTensorAxes(const Tensor& t, at::OptionalIntArrayRef dim);
 std::string getMPSShapeString(MPSShape* shape);
-std::string getTensorsStringKey(const TensorList& tensors, bool use_scalar_value = false);
+std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = false);
 std::string getArrayRefString(const IntArrayRef s);
 // use has_storage() on the returned tensor to determine if src actually is a view
 Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst);
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index ffb5ddf490267..22dca3250596e 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -63,25 +63,26 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) {
   }
 }
 
-std::string getMPSTypeString(ScalarType scalar_type) {
+// use short_name to avoid getting extra long cached graph keys with ops such as cat_out(), etc.
+std::string getMPSTypeString(ScalarType scalar_type, bool short_name) {
   switch (scalar_type) {
     case ScalarType::Double:
     case ScalarType::Float:
-      return "Float32";
+      return short_name ? "f32" : "Float32";
     case ScalarType::Half:
-      return "Float16";
+      return short_name ? "f16" : "Float16";
     case ScalarType::Int:
-      return "Int32";
+      return short_name ? "i32" : "Int32";
     case ScalarType::Long:
-      return "Int64";
+      return short_name ? "i64" : "Int64";
     case ScalarType::Short:
-      return "Int16";
+      return short_name ? "i16" : "Int16";
     case ScalarType::Char:
-      return "Int8";
+      return short_name ? "i8" : "Int8";
     case ScalarType::Byte:
-      return "UInt8";
+      return short_name ? "u8" : "UInt8";
     case ScalarType::Bool:
-      return "Bool";
+      return short_name ? "b8" : "Bool";
     default:
       return "Undefined";
   }
@@ -150,16 +151,16 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) {
   return ss.str();
 }
 
-std::string getTensorsStringKey(const TensorList& tensors, bool use_scalar_value) {
+std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype) {
     std::string str;
     // The key format per tensor would look like ":Float32[1,1,1,10]:"
     for (const Tensor& tensor: tensors) {
       str += ":";
       if (tensor.defined()) {
-        str += getMPSTypeString(tensor.scalar_type()) + "[";
+        str += getMPSTypeString(tensor.scalar_type(), short_dtype) + "[";
         // if tensor is a scalar
         if (tensor.dim() == 0) {
-          str += (use_scalar_value ? std::to_string(tensor.item().to<double>()) : "Scalar");
+          str += "Scalar";
         } else {
           const NSString* ns_shape_key = [[getMPSShape(tensor) valueForKey:@"description"] componentsJoinedByString:@","];
           str += std::string(ns_shape_key.UTF8String);
diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index 7efb646d72175..ed3f69f22ad44 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -71,7 +71,7 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
   @autoreleasepool {
-    string key = op_name + getTensorsStringKey({self, other, output_}, /*use_scalar_value*/ false);
+    string key = op_name + getTensorsStringKey({self, other, output_});
     BinaryOpCachedGraph* cachedGraph = static_cast<BinaryOpCachedGraph *>(cache_->LookUp(key));
 
     if(!cachedGraph) {
diff --git a/aten/src/ATen/native/mps/operations/PointwiseOps.mm b/aten/src/ATen/native/mps/operations/PointwiseOps.mm
index 4c3e7d9e50cc1..9ed6298368716 100644
--- a/aten/src/ATen/native/mps/operations/PointwiseOps.mm
+++ b/aten/src/ATen/native/mps/operations/PointwiseOps.mm
@@ -35,7 +35,7 @@
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
   @autoreleasepool {
-    string key = op_name + getTensorsStringKey({self, tensor1, tensor2}, false);
+    string key = op_name + getTensorsStringKey({self, tensor1, tensor2});
 
     CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm
index 86e82bef93cbf..3f460437c1c00 100644
--- a/aten/src/ATen/native/mps/operations/Shape.mm
+++ b/aten/src/ATen/native/mps/operations/Shape.mm
@@ -182,25 +182,6 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
   }
 }
 
-inline c10::MemoryFormat compute_output_memory_format(const TensorList &inputs) {
-  c10::optional<c10::MemoryFormat> format = c10::nullopt;
-  for (auto &t : inputs) {
-    auto f = t.suggest_memory_format();
-    if (!format.has_value()) {
-      format = f;
-      continue;
-    }
-    if (format.value() == f) {
-      continue;
-    }
-    bool contiguous = (format.value() == c10::MemoryFormat::Contiguous || f == c10::MemoryFormat::Contiguous || format.value() != f);
-    if (contiguous) {
-      return c10::MemoryFormat::Contiguous;
-    }
-  }
-  return format.value();
-}
-
 TORCH_IMPL_FUNC(cat_out_mps)
       (const ITensorListRef& inputs,
        int64_t dimension,
@@ -214,17 +195,25 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
   if (out.numel() == 0) {
     return;
   }
-
   auto materialized_inputs = inputs.materialize();
+  auto out_dtype = at::native::result_type(inputs);
 
   int idx = 0;
   for(const Tensor& t : materialized_inputs) {
-    TORCH_CHECK(t.dim() > 0,
-             "zero-dimensional tensor (at position ", idx, ") cannot be concatenated");
+    TORCH_CHECK(t.dim() > 0, "zero-dimensional tensor (at position ", idx, ") cannot be concatenated");
+    auto lap = at::get_overlap_status(out, t);
+    TORCH_CHECK(lap != at::MemOverlapStatus::Partial && lap != at::MemOverlapStatus::Full,
+        "torch.cat(): unsupported operation: the input tensors cannot refer to any "
+        "of the output memory locations. Found overlap in input tensor ", idx);
     idx++;
   }
+  // Check for type promotion
+  TORCH_CHECK(canCast(out_dtype, out.scalar_type()),
+              "torch.cat(): input types can't be cast to the desired output type ", out.scalar_type());
+  TORCH_CHECK(inputs.size() > 0,"torch.cat(): invalid number of inputs ", inputs.size());
 
   dimension = legacy_cat_wrap_dim(dimension, materialized_inputs);
+  TORCH_CHECK(dimension >= 0, "torch.cat(): invalid dimension ", dimension);
 
   // previously, size [0] tensors were the only possible empty tensors; thus, it
   // wasn't possible to cat empty tensors unless all the other tensors were
@@ -235,28 +224,6 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
   auto should_skip = [](const Tensor& t) {
     return t.dim() == 1 && at::native::size(t, 0) == 0;
   };
-
-
-  // Check for type promotion
-  TORCH_CHECK(
-      canCast(result_type(inputs), out.scalar_type()),
-      "torch.cat(): input types ",
-      " can't be cast to the desired output type ",
-      out.scalar_type());
-
-  // Inputs cannot alias the output tensor
-  idx = 0;
-  for(const Tensor& t : materialized_inputs) {
-    auto lap = at::get_overlap_status(out, t);
-    TORCH_CHECK(
-        lap != at::MemOverlapStatus::Partial &&
-            lap != at::MemOverlapStatus::Full,
-        "torch.cat(): unsupported operation: the input tensors cannot refer to any "
-        "of the output memory locations. Found overlap in input "
-        "tensor ",
-        idx);
-    idx++;
-  }
   at::assert_no_internal_overlap(out);
 
   Tensor notSkippedTensor;
@@ -276,38 +243,22 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
     notSkippedTensor = t;
     tensor_idx++;
   }
-
   // If all inputs are empty tensors, return an empty tensor
   if (!notSkippedTensor.defined()) {
     return;
   }
-
-  TORCH_CHECK(
-      inputs.size() > 0,
-      "torch.cat(): invalid number of inputs ",
-      inputs.size());
-  TORCH_CHECK(dimension >= 0, "torch.cat(): invalid dimension ", dimension);
-
   for (const Tensor& t : inputs) {
-    TORCH_CHECK(
-        t.device() == notSkippedTensor.device(),
-        "torch.cat(): all input tensors must be on the same device. Received ",
-        t.device(),
-        " and ",
-        notSkippedTensor.device());
+    TORCH_CHECK(t.device() == notSkippedTensor.device(),
+                "torch.cat(): all input tensors must be on the same device. Received ",
+                t.device(), " and ", notSkippedTensor.device());
   }
+  TORCH_CHECK(out.device() == notSkippedTensor.device(),
+              "torch.cat(): all input tensors and out must be on the same device, but inputs are on ",
+              notSkippedTensor.device(), " and out is on ", out.device());
 
-  TORCH_CHECK(
-      out.device() == notSkippedTensor.device(),
-      "torch.cat(): all input tensors and out must be on the same device, but inputs are on ",
-      notSkippedTensor.device(),
-      " and out is on ",
-      out.device());
-
-  // TODO: memory_format is now an argument?
-  // // TODO: Factor out `compute_output_memory_format`
-  // c10::MemoryFormat memory_format = compute_output_memory_format(inputs);
-
+  if (out.suggest_memory_format() == MemoryFormat::ChannelsLast) {
+    out.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
+  }
   std::vector<int64_t> size(notSkippedTensor.sizes().vec());
 
   // Compute size of the result in the cat dimension
@@ -322,48 +273,30 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
     cat_dim_size += at::native::size(tensor, dimension);
     idx++;
   }
-
   // Compute the size of the result
   size[dimension] = cat_dim_size;
-
   // skip resizing if size of result is same as expected
   if (out.sizes() != size) {
     out.resize_(size, memory_format);
   }
-
   if (out.numel() == 0) {
     return;
   }
 
-  // Get stream
-  MPSStream* stream = getCurrentMPSStream();
-
-  struct CachedGraph : public MPSCachedGraph
-  {
+  struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     std::vector<MPSGraphTensor*> inputTensors_;
     MPSGraphTensor* outputTensor_ = nil;
   };
-
   MPSGraphCache *cache_ = MPSGraphCache::getInstance();
 
-  // Make string out of skipped tensor indices
-  string skipped_indices_string = "";
-  for(int idx : skipped_tensor_indices)
-    skipped_indices_string += (std::to_string(idx)+",");
-  string input_types = "";
-  for(const Tensor& tensor : materialized_inputs)
-    input_types += (getMPSTypeString(tensor.scalar_type())+",");
-
   @autoreleasepool {
-    string key = "cat_out_mps:" + getMPSTypeString(result_type(inputs))
-                                + ":" + to_string(inputs.size())
-                                + ":" + skipped_indices_string
-                                + ":" + input_types
-                                + ":" + to_string(dimension);
-    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    string key = "cat_out_mps:" + to_string(dimension) + getTensorsStringKey(input_tensors, /*short_dtype*/true) + ":" +
+                 (memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW");
+
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     if(!cachedGraph) {
-      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^ MPSCachedGraph * () {
         CachedGraph *newCachedGraph = nil;
 
         @autoreleasepool {
@@ -375,15 +308,15 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
           newCachedGraph->inputTensors_.reserve(len_tensor_array);
 
           for (const auto idx : c10::irange(len_tensor_array)) {
-            auto scalar_type = getMPSScalarType(input_tensors[idx].scalar_type());
-            if (input_tensors[idx].scalar_type() == kBool) {
+            const Tensor& tensor = input_tensors[idx];
+            auto scalar_type = getMPSScalarType(tensor.scalar_type());
+            if (tensor.scalar_type() == kBool) {
               scalar_type = MPSDataTypeInt8;
             }
-
-            newCachedGraph->inputTensors_[idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, scalar_type);
-            if (input_tensors[idx].scalar_type() != result_type(inputs)) {
+            newCachedGraph->inputTensors_[idx] = mpsGraphRankedPlaceHolder(mpsGraph, scalar_type, getMPSShape(tensor, memory_format));
+            if (tensor.scalar_type() != out_dtype) {
               castInputTensors[idx] = [mpsGraph castTensor:newCachedGraph->inputTensors_[idx]
-                                                    toType:getMPSDataType(result_type(inputs))
+                                                    toType:getMPSDataType(out_dtype)
                                                       name:@"castInput"];
             } else {
               castInputTensors[idx] = newCachedGraph->inputTensors_[idx];
@@ -395,16 +328,16 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
           MPSGraphTensor* outputTensor = [mpsGraph concatTensors:inputTensorsArray
                                                        dimension:dimension // Maybe convert this from int64_t -> int32
                                                             name:nil];
-          if(getMPSDataType(result_type(inputs)) == MPSDataTypeBool) {
+          if(getMPSDataType(out_dtype) == MPSDataTypeBool) {
             outputTensor = [mpsGraph castTensor:outputTensor
                                          toType:MPSDataTypeBool
                                            name:@"outputTensor"];
           }
-          newCachedGraph->outputTensor_ = outputTensor;
+          newCachedGraph->outputTensor_ = memory_format == MemoryFormat::ChannelsLast ?
+                                         convertNHWCtoNCHW(mpsGraph, outputTensor) : outputTensor;
         }
         return newCachedGraph;
       });
-      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
     }
 
     std::vector<Placeholder> inputPlaceholders;
@@ -416,9 +349,9 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
         if (tensor.scalar_type() == kBool) {
           scalar_type = MPSDataTypeInt8;
         }
-        Placeholder currentInputPlaceholder = Placeholder(
-          cachedGraph->inputTensors_[t_idx], tensor, /*mpsShape=*/nil, /*gatherTensorData=*/true, scalar_type);
-        inputPlaceholders.push_back(currentInputPlaceholder);
+        inputPlaceholders.emplace_back(cachedGraph->inputTensors_[t_idx], tensor,
+                                       getMPSShape(tensor, memory_format),
+                                       memory_format != MemoryFormat::ChannelsLast, scalar_type);
         t_idx++;
       }
       i++;
@@ -439,9 +372,8 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
 
-    mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
   }
-
 }
 
 } // namespace native
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index cb3e40299bceb..039e8e5f52c80 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -29,7 +29,7 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
   }
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
   @autoreleasepool {
-    string key = op_name + getTensorsStringKey({self, output}, /*use_scalar_value*/ false);
+    string key = op_name + getTensorsStringKey({self, output});
     auto cachedGraph = cache_->LookUpAs<MPSUnaryCachedGraph>(key);
 
     if(!cachedGraph) {
diff --git a/test/test_mps.py b/test/test_mps.py
index 0f074fbe88fcd..7f73e66619dff 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8873,7 +8873,7 @@ def test_numpy_ref_mps(self, device, dtype, op):
         # does not support float64 Tensors.
         # A few ops are currently broken on their reference inputs, but not their sample inputs. These should
         # get patched up and this workaround removed.
-        broken_on_ref_inputs = op.name in ['cat', 'clamp', 'where']
+        broken_on_ref_inputs = op.name in ['clamp', 'where']
         inputs = op.reference_inputs(device, dtype) if not broken_on_ref_inputs else op.sample_inputs(device, dtype)
         for sample_input in inputs:
             self.compare_with_reference(op, op.ref, sample_input)

From 60a3399370d4b6ebc7580066525688ef596cc097 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 16 Dec 2022 14:22:12 -0500
Subject: [PATCH 1863/1922] Add rtol/atol to the assertEqual() in gradient
 results check (#213)

Also create a list of ops that would be allowed to differ up to values defined by rtol/atol with FP16
---
 test/test_mps.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 7f73e66619dff..4c3acfbf68255 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8680,6 +8680,13 @@ class TestConsistency(TestCase):
         '__rpow__': ['torch.float32'],
     }
 
+    FP16_LOW_PRECISION_LIST = {
+        "add", "sub",
+        "__rdiv__", "__rmul__",
+        "nn.functional.huber_loss",
+        "true_divide"
+    }
+
     # Used for accept mode only
     NEW_ALLOW_LIST = defaultdict(list)
     NEW_ALLOW_LIST_GRAD = defaultdict(list)
@@ -8746,7 +8753,7 @@ def get_samples():
                 if op.name == "nn.functional.conv2d" and dtype == torch.float32:
                     atol = 1e-4
                     rtol = 3e-5
-                elif (op.name == "add" or op.name == "sub" or op.name == "nn.functional.huber_loss") and dtype == torch.float16:
+                elif (op.name in self.FP16_LOW_PRECISION_LIST) and dtype == torch.float16:
                     atol = 1e-2
                     rtol = 1e-2
                 elif (op.name == "masked.mean"):
@@ -8813,7 +8820,7 @@ def req_grad(t):
                 cpu_grad_inputs = torch.autograd.grad(diff_cpu_out, diff_cpu_arg, grad_outputs=cpu_grad_outputs, allow_unused=True)
                 mps_grad_inputs = torch.autograd.grad(diff_mps_out, diff_mps_arg, grad_outputs=mps_grad_outputs, allow_unused=True)
 
-                self.assertEqual(cpu_grad_inputs, mps_grad_inputs)
+                self.assertEqual(cpu_grad_inputs, mps_grad_inputs, atol=atol, rtol=rtol)
             except Exception as e:
                 if not generate_new_truth:
                     raise e

From 2e1dac737972216f15899c8c3005aa5aeabd5e1f Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Sat, 17 Dec 2022 18:12:42 -0800
Subject: [PATCH 1864/1922] Fix unrecognized selector build warnings in macOS
 Monterey (#215)

* Fix unrecognized selector build warnings in macOS Monterey

* Address PR comments

* Remove whitespaces
---
 aten/src/ATen/native/mps/MPSGraphVenturaOps.h | 56 +++++++++++++++++++
 .../native/mps/operations/Distributions.mm    |  1 +
 .../src/ATen/native/mps/operations/Inverse.mm |  1 +
 3 files changed, 58 insertions(+)

diff --git a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
index 767c785303291..16e3845f27ebb 100644
--- a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
+++ b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
@@ -33,4 +33,60 @@ typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
 
 - (MPSGraphTensor * _Nonnull)inverseOfTensor:(MPSGraphTensor * _Nonnull) inputTensor
                                        name:(NSString * _Nullable)name;
+
+- (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
+                                  sizeTensor:(MPSGraphTensor * _Nonnull) size
+                         nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                centerResult:(BOOL) centerResult
+                                alignCorners:(BOOL) alignCorners
+                                      layout:(MPSGraphTensorNamedDataLayout) layout
+                                        name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
+                                  sizeTensor:(MPSGraphTensor * _Nonnull) size
+                           scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
+                         nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                      layout:(MPSGraphTensorNamedDataLayout) layout
+                                        name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
+                                   sizeTensor:(MPSGraphTensor * _Nonnull) size
+                                 centerResult:(BOOL) centerResult
+                                 alignCorners:(BOOL) alignCorners
+                                       layout:(MPSGraphTensorNamedDataLayout) layout
+                                         name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
+                                   sizeTensor:(MPSGraphTensor * _Nonnull) size
+                            scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
+                                       layout:(MPSGraphTensorNamedDataLayout) layout
+                                         name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
+                                               input:(MPSGraphTensor * _Nonnull) input
+                                 nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                        centerResult:(BOOL) centerResult
+                                        alignCorners:(BOOL) alignCorners
+                                              layout:(MPSGraphTensorNamedDataLayout) layout
+                                                name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
+                                               input:(MPSGraphTensor * _Nonnull) input
+                                   scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
+                                 nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                              layout:(MPSGraphTensorNamedDataLayout) layout
+                                                name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
+                                                input:(MPSGraphTensor * _Nonnull) input
+                                         centerResult:(BOOL) centerResult
+                                         alignCorners:(BOOL) alignCorners
+                                               layout:(MPSGraphTensorNamedDataLayout) layout
+                                                 name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
+                                                input:(MPSGraphTensor * _Nonnull) input
+                                    scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
+                                               layout:(MPSGraphTensorNamedDataLayout) layout
+                                                 name:(NSString * _Nullable) name;
 @end
diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index ecb528301f2c0..dcaf8baf6c31c 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -3,6 +3,7 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/DistributionTemplates.h>
 #include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
 #include <ATen/mps/MPSGeneratorImpl.h>
 #include <ATen/native/TensorFactories.h>
 
diff --git a/aten/src/ATen/native/mps/operations/Inverse.mm b/aten/src/ATen/native/mps/operations/Inverse.mm
index ca028a1a864b8..2975fd9875949 100644
--- a/aten/src/ATen/native/mps/operations/Inverse.mm
+++ b/aten/src/ATen/native/mps/operations/Inverse.mm
@@ -1,5 +1,6 @@
 #include <ATen/ATen.h>
 #include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
 #include <torch/library.h>
 #include <c10/util/Optional.h>
 

From c8f17a5b894e4c4bb606059ebb7035c1a453e6a4 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Sat, 17 Dec 2022 18:12:54 -0800
Subject: [PATCH 1865/1922] Fix randperm CPU fallback for macOS Monterey (#214)

---
 aten/src/ATen/native/mps/operations/Distributions.mm | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index dcaf8baf6c31c..bfdd922ec5b8b 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -348,8 +348,10 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional<Generator
     TORCH_WARN_ONCE("MPS: randperm op is supported natively starting from macOS 13.0. ",
                     "Falling back on CPU. This may have performance implications.");
 
-    result = result.to("cpu");
-    result = at::randperm_out(result, n).to("mps");
+    auto result_cpu = result.to("cpu");
+    at::randperm_out(result_cpu, n);
+    result.resize_as_(result_cpu);
+    result.copy_(result_cpu);
     return result;
   }
 

From 699a05ca9aeb4c9dd8f7fc882be7cbef7b3123fc Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 21 Dec 2022 17:38:40 -0500
Subject: [PATCH 1866/1922] Fix the error with high watermark value on x86
 (#216)

---
 aten/src/ATen/mps/MPSAllocator.mm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm
index 60a66ff782d89..72ed5a47e9d83 100644
--- a/aten/src/ATen/mps/MPSAllocator.mm
+++ b/aten/src/ATen/mps/MPSAllocator.mm
@@ -22,8 +22,8 @@
   static const char *verbosity_str = getenv("PYTORCH_DEBUG_MPS_ALLOCATOR");
   m_debug_verbosity = verbosity_str ? strtol(verbosity_str, nullptr, 0) : DebugVerbosity::SILENT;
 
-  // on unified memory, we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
-  const double high_watermark_upper_bound =  m_device.hasUnifiedMemory ? 2.0 : 1.0;
+  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
+  const double high_watermark_upper_bound = 2.0;
 
   static const char *high_watermark_ratio_str = getenv("PYTORCH_MPS_HIGH_WATERMARK_RATIO");
   m_high_watermark_ratio = high_watermark_ratio_str ? strtod(high_watermark_ratio_str, nullptr) : default_high_watermark_ratio;

From 73d086ca9b79bad7b01dcae215f6ba5cba615d49 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 22 Dec 2022 17:30:50 -0500
Subject: [PATCH 1867/1922] Fix uint8 issues in BinaryOps for Monterey (#217)

* Fix uint8 issues in BinaryOps for Monterey
Also fix the signed vs. unsigned cast issue for Monterey.

* Move empty() into block list
This op allocates uninitialized block with random values in memory which mismatches the CPU results
---
 aten/src/ATen/native/mps/OperationUtils.h     |  5 ++-
 aten/src/ATen/native/mps/OperationUtils.mm    |  7 ++--
 .../ATen/native/mps/operations/BinaryOps.mm   | 40 +++++++++----------
 test/test_mps.py                              |  2 +-
 4 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 16ea8791a5319..901de9aedda2f 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -67,7 +67,8 @@ class Placeholder {
  public:
   Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {}
   Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {}
-  Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr, bool gatherTensorData = true, MPSDataType dataType = MPSDataTypeInvalid);
+  Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr,
+              bool gatherTensorData = true, MPSDataType dataType = MPSDataTypeInvalid);
   MPSGraphTensor* getMPSGraphTensor() {
     return _placeholder;
   }
@@ -89,7 +90,7 @@ MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor);
 MPSGraphTensor* convertNHWCtoNCHW(MPSGraph *mpsGraph, MPSGraphTensor* tensor);
 MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, ScalarType toType);
 MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const Tensor& tensor);
-MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar, MPSDataType dataType = MPSDataTypeInvalid);
+MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar);
 
 MPSGraph* make_mps_graph();
 void printTensorNDArray(const Tensor& t);
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 22dca3250596e..c643ee350d402 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -225,7 +225,8 @@ void printTensorNDArray(const Tensor& t) {
   return [tmpGraphTensorData mpsndarray];
 }
 
-Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSShape *mpsShape, bool gatherTensorData, MPSDataType dataType) : _tensor(src)
+Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& src, MPSShape *mpsShape,
+                         bool gatherTensorData, MPSDataType dataType) : _tensor(src)
 {
   TORCH_CHECK(src.is_mps(), "Placeholder storage has not been allocated on MPS device!");
   // extract the pointer to MTLBuffer from the Tensor's storage
@@ -308,14 +309,14 @@ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type) {
   }
 }
 
-MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar, MPSDataType dataType) {
+MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar) {
   MPSGraphTensorData *result = nullptr;
   // Scalar pools are only supported on devices with unified memory
   if (mpsStream->device().hasUnifiedMemory) {
     scalar.buffer = at::mps::allocate_scalar_buffer(&scalar.value, scalar.size);
     result = [[[MPSGraphTensorData alloc] initWithMTLBuffer: scalar.getMTLBuffer()
                                                       shape: @[@1]
-                                                   dataType: (dataType != MPSDataTypeInvalid) ? dataType : getMPSScalarType(scalar.type)] autorelease];
+                                                   dataType: getMPSScalarType(scalar.type)] autorelease];
   } else {
     MPSNDArrayDescriptor *tensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:getMPSScalarType(scalar.type) shape:@[@1]];
     MPSNDArray *tensorNDArray = [[[MPSNDArray alloc] initWithDevice:mpsStream->device() descriptor:tensorDesc] autorelease];
diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index ed3f69f22ad44..c6ae0c8c60349 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -58,14 +58,16 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
   auto otherDataType = other.scalar_type();
   auto outputDataType = output_.scalar_type();
   if (!is_macos_13_or_newer()) {
-    if (self.scalar_type() == kBool) {
-      inputDataType = kChar;
-    }
-    if (other.scalar_type() == kBool) {
-      otherDataType = kChar;
-    }
-    if (output.scalar_type() == kBool) {
-      outputDataType = kChar;
+    // workaround for signed vs. unsigned comparison issue in MacOS 12
+    if (outputDataType == kBool && (inputDataType == kByte || otherDataType == kByte)) {
+      inputDataType = otherDataType = kByte;
+    } else {
+      if (inputDataType == kBool || inputDataType == kByte) {
+        inputDataType = kChar;
+      }
+      if (otherDataType == kBool || otherDataType == kByte) {
+        otherDataType = kChar;
+      }
     }
   }
 
@@ -108,8 +110,7 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
           newCachedGraph->outputTensor = binaryBlock(newCachedGraph, primaryCastTensor, secondaryCastTensor);
           // Cast output tensor to an expected type if needed, which addresses discrepancy when int64 scalar is added to int32 tensor
           // Output tensor should have been promoted but it remains an int32 tensor
-          if (outputDataType != common_dtype ||
-             [newCachedGraph->outputTensor dataType] != getMPSDataType(outputDataType)) {
+          if (outputDataType != common_dtype) {
             newCachedGraph->outputTensor = castMPSTensor(mpsGraph, newCachedGraph->outputTensor, outputDataType);
           }
         }
@@ -126,19 +127,19 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
     MPSScalar alpha_scalar;
 
     if (is_self_scalar && !self.is_mps()) {
-      self_scalar = getMPSScalar(self.item(), self.scalar_type());
-      feeds[cachedGraph->primaryTensor] = getMPSGraphTensorFromScalar(mpsStream, self_scalar, getMPSScalarType(inputDataType));
+      self_scalar = getMPSScalar(self.item(), inputDataType);
+      feeds[cachedGraph->primaryTensor] = getMPSGraphTensorFromScalar(mpsStream, self_scalar);
     } else {
-      selfPlaceholder = Placeholder(
-        cachedGraph->primaryTensor, self,  /*mpsShape*/nil, /*gatherTensorData=*/true, getMPSScalarType(inputDataType));
+      selfPlaceholder = Placeholder(cachedGraph->primaryTensor, self, /*mpsShape*/nil,
+                                    /*gatherTensorData=*/true, getMPSScalarType(inputDataType));
       feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData();
     }
     if (is_other_scalar && !other.is_mps()) {
-      other_scalar = getMPSScalar(other.item(), other.scalar_type());
-      feeds[cachedGraph->secondaryTensor] = getMPSGraphTensorFromScalar(mpsStream, other_scalar, getMPSScalarType(otherDataType));
+      other_scalar = getMPSScalar(other.item(), otherDataType);
+      feeds[cachedGraph->secondaryTensor] = getMPSGraphTensorFromScalar(mpsStream, other_scalar);
     } else {
-      otherPlaceholder = Placeholder(
-        cachedGraph->secondaryTensor, other,  /*mpsShape*/nil, /*gatherTensorData=*/true, getMPSScalarType(otherDataType));
+      otherPlaceholder = Placeholder(cachedGraph->secondaryTensor, other,  /*mpsShape*/nil,
+                                     /*gatherTensorData=*/true, getMPSScalarType(otherDataType));
       feeds[otherPlaceholder.getMPSGraphTensor()] = otherPlaceholder.getMPSGraphTensorData();
     }
 
@@ -148,8 +149,7 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
       feeds[cachedGraph->alphaTensor] = getMPSGraphTensorFromScalar(mpsStream, alpha_scalar);
     }
 
-    Placeholder outputPlaceholder = Placeholder(
-      cachedGraph->outputTensor, needsCopyToOutput ? output : output_,  /*mpsShape*/nil, /*gatherTensorData=*/false, getMPSScalarType(outputDataType));
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, needsCopyToOutput ? output : output_);
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
diff --git a/test/test_mps.py b/test/test_mps.py
index 4c3acfbf68255..d9a74fa8a0268 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8192,7 +8192,6 @@ class TestConsistency(TestCase):
         'einsum': ['f32'],
         'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'eq': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'erf': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'exp': ['b8', 'f32', 'i16', 'i32', 'u8'],
         'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
@@ -8669,6 +8668,7 @@ class TestConsistency(TestCase):
          # these fill tensors with uninitialized data, causing mismatch with CPU
         'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
         # problem 103190467, as_strided_scatter has non-deterministic behavior when the update indices are not unique
         'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
 

From e6c405f01e6efbae9a0d3105f8d025be40b29a64 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 21 Dec 2022 17:37:09 +0000
Subject: [PATCH 1868/1922] [MPS] Add MPSHooks interface to enable accessing
 MPS functions globally (#91104)

This PR is a prerequisite to the upcoming MPSGenerator changes required for Random Ops.

Add `MPSHooksInterface.cpp` to `aten_cpu_source_non_codegen_list`

Co-authored-by: Nikita Shulga <nikita.shulga@gmail.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/91104
Approved by: https://github.com/kulinseth, https://github.com/malfet
---
 aten/src/ATen/detail/MPSHooksInterface.cpp | 10 ++++------
 build_variables.bzl                        |  1 +
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/detail/MPSHooksInterface.cpp b/aten/src/ATen/detail/MPSHooksInterface.cpp
index 823b2295b1ace..a73e456caff58 100644
--- a/aten/src/ATen/detail/MPSHooksInterface.cpp
+++ b/aten/src/ATen/detail/MPSHooksInterface.cpp
@@ -1,3 +1,5 @@
+//  Copyright © 2022 Apple Inc.
+
 #include <ATen/detail/MPSHooksInterface.h>
 #include <c10/util/Exception.h>
 #include <c10/util/CallOnce.h>
@@ -12,16 +14,12 @@ const MPSHooksInterface& getMPSHooks() {
   c10::call_once(once, [] {
     mps_hooks = MPSHooksRegistry()->Create("MPSHooks", MPSHooksArgs{});
     if (!mps_hooks) {
-      mps_hooks =
-          // NOLINTNEXTLINE(modernize-make-unique)
-          std::unique_ptr<MPSHooksInterface>(new MPSHooksInterface());
+      mps_hooks = std::make_unique<MPSHooksInterface>();
     }
   });
 #else
   if (mps_hooks == nullptr) {
-    mps_hooks =
-        // NOLINTNEXTLINE(modernize-make-unique)
-        std::unique_ptr<MPSHooksInterface>(new MPSHooksInterface());
+    mps_hooks = std::make_unique<MPSHooksInterface>();
   }
 #endif
   return *mps_hooks;
diff --git a/build_variables.bzl b/build_variables.bzl
index aa0a8f8856c67..d9879f1a7b02e 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -1113,6 +1113,7 @@ aten_cpu_source_non_codegen_list = [
     "aten/src/ATen/cpu/FlushDenormal.cpp",
     "aten/src/ATen/detail/CPUGuardImpl.cpp",
     "aten/src/ATen/detail/CUDAHooksInterface.cpp",
+    "aten/src/ATen/detail/MPSHooksInterface.cpp",
     "aten/src/ATen/detail/HIPHooksInterface.cpp",
     "aten/src/ATen/detail/ORTHooksInterface.cpp",
     "aten/src/ATen/metal/Context.cpp",

From b2cf2c5af7aeca9741aced90a10aeb93e85180a1 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 3 Jan 2023 18:32:14 +0200
Subject: [PATCH 1869/1922] Remove unused pragma (#218)

---
 aten/src/ATen/native/mps/operations/Indexing.mm | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 1d5a6aceb05b3..80a9fbd1d54f6 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -307,15 +307,9 @@ Tensor nonzero_fallback(const Tensor& self) {
           MPSGraphTensor *maskTensor = [mpsGraph castTensor:inputNotEqualToZeroTensor
                                                      toType:MPSDataTypeInt32
                                                        name:@"castToInt32"];
-
-          C10_CLANG_DIAGNOSTIC_PUSH()
-          #if C10_CLANG_HAS_WARNING("-Wobjc-method-access")
-          C10_CLANG_DIAGNOSTIC_IGNORE("-Wobjc-method-access")
-          #endif
           MPSGraphTensor *indicesTensor = [mpsGraph cumulativeSumWithTensor:maskTensor
                                                                        axis:0
                                                                        name:nil];
-          C10_CLANG_DIAGNOSTIC_POP()
           MPSGraphTensor *indicesMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:indicesTensor
                                                                         secondaryTensor:oneTensor
                                                                                    name:nil];

From 1d51d540d08cafff069abdccb8ef480a6a5f0636 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 3 Jan 2023 16:01:19 +0000
Subject: [PATCH 1870/1922] [MPS] Implement MPSGenerator to enable manual
 random seeding (#91348)

This patch adds support for creating torch.Generator for MPS device, and enables its functions such as manual_seed, get_state, and set_state.
Fixes #84288 and #84516
Pull Request resolved: https://github.com/pytorch/pytorch/pull/91348
Approved by: https://github.com/malfet, https://github.com/albanD
---
 aten/src/ATen/mps/MPSGeneratorImpl.mm          |  2 +-
 .../native/mps/operations/Distributions.mm     |  1 -
 torch/csrc/Generator.cpp                       | 18 ++++--------------
 3 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/aten/src/ATen/mps/MPSGeneratorImpl.mm b/aten/src/ATen/mps/MPSGeneratorImpl.mm
index 8f2d5168b71b8..7eb6b7d987826 100644
--- a/aten/src/ATen/mps/MPSGeneratorImpl.mm
+++ b/aten/src/ATen/mps/MPSGeneratorImpl.mm
@@ -92,7 +92,7 @@ Generator createMPSGenerator(uint64_t seed_val) {
 }
 
 MPSGeneratorImpl* MPSGeneratorImpl::clone_impl() const {
-  auto gen = new MPSGeneratorImpl(this->data_.seed);
+  auto gen = new MPSGeneratorImpl();
   gen->set_current_seed(this->data_.seed);
   return gen;
 }
diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm
index bfdd922ec5b8b..1b395a3b9071d 100644
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@@ -14,7 +14,6 @@
 struct RandomCachedGraph : public MPSCachedGraph
 {
   RandomCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) { }
-
   // Only relevant for multinomial
   MPSGraphTensor *probTensor = nil;
   MPSGraphTensor *resultTensor = nil;
diff --git a/torch/csrc/Generator.cpp b/torch/csrc/Generator.cpp
index d5939496eff45..241628ae8938b 100644
--- a/torch/csrc/Generator.cpp
+++ b/torch/csrc/Generator.cpp
@@ -56,7 +56,6 @@ static PyObject* THPGenerator_pynew(
   auto device = r.deviceWithDefault(0, at::Device(at::kCPU));
 
   THPGeneratorPtr self((THPGenerator*)type->tp_alloc(type, 0));
-#if defined(USE_CUDA) || defined(USE_MPS)
   if (device.type() == at::kCPU) {
     self->cdata = make_generator<CPUGeneratorImpl>();
   }
@@ -75,14 +74,6 @@ static PyObject* THPGenerator_pynew(
         c10::DeviceTypeName(device.type()),
         " is not supported for torch.Generator() api.");
   }
-#else
-  TORCH_CHECK(
-      device.type() == at::kCPU,
-      "Device type ",
-      c10::DeviceTypeName(device.type()),
-      " is not supported for torch.Generator() api.");
-  self->cdata = make_generator<CPUGeneratorImpl>();
-#endif
   return (PyObject*)self.release();
   END_HANDLE_TH_ERRORS
 }
@@ -104,11 +95,10 @@ static PyObject* THPGenerator_setState(PyObject* _self, PyObject* _new_state) {
   using namespace torch::autograd;
 
   HANDLE_TH_ERRORS
-  if (!THPVariable_Check(_new_state)) {
-    throw torch::TypeError(
-        "expected a torch.ByteTensor, but got %s",
-        Py_TYPE(_new_state)->tp_name);
-  }
+  TORCH_CHECK_TYPE(
+      THPVariable_Check(_new_state),
+      "expected a torch.ByteTensor, but got ",
+      Py_TYPE(_new_state)->tp_name);
   auto self = (THPGenerator*)_self;
   auto& gen = self->cdata;
   const auto& new_state_tensor = THPVariable_Unpack(_new_state);

From bf4e7d995b6028a0e0c408f66926197e413d4f9d Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 3 Jan 2023 12:49:20 -0500
Subject: [PATCH 1871/1922] Remove the unused code for view lists in
 OperationUtils.h (#219)

---
 aten/src/ATen/native/mps/OperationUtils.h | 30 +++--------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h
index 901de9aedda2f..b24f0c633eb42 100644
--- a/aten/src/ATen/native/mps/OperationUtils.h
+++ b/aten/src/ATen/native/mps/OperationUtils.h
@@ -176,7 +176,7 @@ struct MPSGraphCache
   MPSGraphCache(const MPSGraphCache&) = delete;
   void operator=(const MPSGraphCache&) = delete;
 
-  MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock, void* view_ptr = nullptr) {
+  MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) {
 
     __block MPSCachedGraph * result = nil;
 
@@ -194,17 +194,14 @@ struct MPSGraphCache
         result = createCacheBlock();
         CacheEntry entry(key, result);
         cache_.emplace(hash, entry);
-        if (view_ptr) {
-          views_list.insert(std::make_pair(view_ptr, hash));
-        }
       }
     });
     return result;
   }
 
   template<typename T>
-  inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock, void* view_ptr = nullptr) {
-    return static_cast<T *>(CreateCachedGraph(key, createCacheBlock, view_ptr));
+  inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock) {
+    return static_cast<T *>(CreateCachedGraph(key, createCacheBlock));
   }
 
   MPSCachedGraph* LookUp(const std::string& key) const {
@@ -229,24 +226,6 @@ struct MPSGraphCache
     return static_cast<T *>(LookUp(key));
   }
 
-  void FindAndRemoveViewEntry(void* ptr) {
-    // this may find multiple view entries with the same buffer pointers
-    auto views_range = views_list.equal_range(ptr);
-    if (views_range.first == views_range.second)
-      return;
-    for (auto view_it = views_range.first; view_it != views_range.second; ++view_it) {
-      MPSCacheKey hash = view_it->second;
-      // find the cache entry associated with the hash
-      auto cache_it = cache_.find(hash);
-      if (cache_it != cache_.end()) {
-        cache_.erase(cache_it);
-        delete cache_it->second.cachedGraph_;
-      }
-    }
-    // this erase-by-key will remove all pairs in the list with the same key
-    views_list.erase(ptr);
-  }
-
  private:
   MPSGraphCache() {
     serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL);
@@ -254,9 +233,6 @@ struct MPSGraphCache
 
   static MPSGraphCache* _instance_cache;
   std::unordered_map<MPSCacheKey, CacheEntry> cache_;
-  // list of buffers associated with view entries in the cache
-  // note that multiple view cache entries could use the same buffer pointer
-  std::unordered_multimap<void*, MPSCacheKey> views_list;
   dispatch_queue_t serialQueue_ = nullptr;
 
 };

From d5e2e4e14ba4e0ceeef84bab3a486b189050669e Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 3 Jan 2023 23:09:45 +0200
Subject: [PATCH 1872/1922] Fix build failure (#220)

---
 aten/src/ATen/native/mps/operations/Indexing.mm | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 80a9fbd1d54f6..6ec65f976c033 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -12,6 +12,7 @@
 #include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/mps/operations/Indexing.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
 #include <ATen/native/Resize.h>
 #include <ATen/AccumulateType.h>
 #include <torch/library.h>

From 7f4cf840d0234e649d3a89a843ef9eb1917d6c2f Mon Sep 17 00:00:00 2001
From: skotapati <siddharth.kotapati@gmail.com>
Date: Fri, 6 Jan 2023 13:40:25 -0800
Subject: [PATCH 1873/1922] Changed conditions for handling view tensors in
 order to resolve test correctness issue in nn.functional.bilinear (#222)

Co-authored-by: Siddharth Kotapati <sidk@Siddharths-MacBook-Pro.local>
---
 aten/src/ATen/native/mps/OperationUtils.mm  | 2 +-
 aten/src/ATen/native/mps/operations/View.mm | 2 +-
 test/test_mps.py                            | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index c643ee350d402..b028613f90c47 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -252,7 +252,7 @@ void printTensorNDArray(const Tensor& t) {
   const MPSDataType mpsDataType = dataType != MPSDataTypeInvalid ? dataType :
                       _tensor.dim() == 0 ? getMPSScalarType(_tensor.scalar_type()) : getMPSDataType(_tensor.scalar_type());
 
-  if (src.is_view() && src.is_contiguous() && src.storage_offset() && sliceViewTensor) {
+  if (src.is_contiguous() && src.storage_offset() && sliceViewTensor) {
     _value = getMPSGraphTensorDataForView(src, mpsShape, mpsDataType);
   } else {
     if (!mpsShape) {
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index 957e9cb3d29fb..ae6e66f94d690 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -441,7 +441,7 @@
 }
 
 bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
-  if (!src.is_view()) {
+  if (!src.is_contiguous()) {
     return false;
   }
 
diff --git a/test/test_mps.py b/test/test_mps.py
index d9a74fa8a0268..503d1f9f23946 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8264,6 +8264,7 @@ class TestConsistency(TestCase):
         'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.adaptive_max_pool1d': ['f32'],
         'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nn.functional.bilinear': ['f32'],
         'nn.functional.binary_cross_entropy': ['f32'],
         'nn.functional.binary_cross_entropy_with_logits': ['f32'],
         'nn.functional.celu': ['f32'],
@@ -8612,7 +8613,6 @@ class TestConsistency(TestCase):
         'take_along_dim': None,
 
         # New block list ops that need investigation
-        'nn.functional.bilinear': ['torch.float32'],
         'nn.functional.conv_transpose2d': ['torch.float32'],
         'nn.functional.interpolate': ['torch.float32'],
         'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],

From bcc9f63c8225a2885ac04a16a3e71aa7e5df84b4 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Mon, 9 Jan 2023 12:18:38 -0500
Subject: [PATCH 1874/1922] Fix the crash in nan_to_num() with Float16 data
 type (#226)

---
 .../src/ATen/native/mps/operations/TensorCompare.mm | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm
index 7cf551c95e993..419f2572ea926 100644
--- a/aten/src/ATen/native/mps/operations/TensorCompare.mm
+++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm
@@ -487,9 +487,16 @@ Tensor where_mps(const Tensor& condition,
           MPSGraphTensor* subZeroTensor = [mpsGraph lessThanWithPrimaryTensor: nanFreeTensor
                                                               secondaryTensor: [mpsGraph constantWithScalar: 0.0 dataType: self_dtype]
                                                                          name: nil];
-          // the cast is a workaround for the issue #103149520 (crash when bool and fp16 passed to binary ops)
-          MPSGraphTensor* isNegInfTensor = [mpsGraph logicalANDWithPrimaryTensor: [mpsGraph castTensor: subZeroTensor toType: self_dtype name: @"castTensor"]
-                                                                 secondaryTensor: [mpsGraph isInfiniteWithTensor: nanFreeTensor name:nil]
+          MPSGraphTensor* isInfTensor = [mpsGraph isInfiniteWithTensor: nanFreeTensor name:nil];
+          // workaround for Monterey; On Ventura the output of lessThan() is always Boolean
+          if (subZeroTensor.dataType != MPSDataTypeBool) {
+            subZeroTensor = castMPSTensor(mpsGraph, subZeroTensor, kBool);
+          }
+          if (isInfTensor.dataType != MPSDataTypeBool) {
+            isInfTensor = castMPSTensor(mpsGraph, isInfTensor, kBool);
+          }
+          MPSGraphTensor* isNegInfTensor = [mpsGraph logicalANDWithPrimaryTensor: subZeroTensor
+                                                                 secondaryTensor: isInfTensor
                                                                             name: nil];
           MPSGraphTensor* negInfFreeTensor = [mpsGraph selectWithPredicateTensor: isNegInfTensor
                                                              truePredicateTensor: newCachedGraph->negInfReplacementTensor

From dabe321da1b0f5ba191bd27d9bfa901c1c7d94ed Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Mon, 9 Jan 2023 19:23:19 +0200
Subject: [PATCH 1875/1922] Fix min/max_reduction_with_dim ops (#224)

---
 .../ATen/native/mps/operations/ReduceOps.mm   | 32 +++++++++++--------
 test/test_mps.py                              |  5 +--
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index 29527af5f6919..f85f91567e857 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -1447,25 +1447,26 @@ Tensor min_mps(const Tensor& input_t) {
 
               MPSGraphTensor* inputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, input_t);
               MPSGraphTensor* outputTensor = nil;
-              if(reduction_type == MPSReductionType::MAX)
-                outputTensor = [mpsGraph reductionMaximumWithTensor:inputTensor
-                                                               axis:(NSInteger)dim_
-                                                               name:nil];
-              else if(reduction_type == MPSReductionType::MIN)
-                outputTensor = [mpsGraph reductionMinimumWithTensor:inputTensor
-                                                               axis:(NSInteger)dim_
-                                                               name:nil];
-
-              MPSGraphTensor* castInputTensor = nil;
 
+              MPSGraphTensor* castInputTensor = inputTensor;
+              bool castOutput = false;
               if(input_t.scalar_type() != ScalarType::Float &&
                  input_t.scalar_type() != ScalarType::Int   &&
-                 input_t.scalar_type() != ScalarType::Half)
+                 input_t.scalar_type() != ScalarType::Half) {
                 castInputTensor =  [mpsGraph castTensor:inputTensor
                                                  toType:MPSDataTypeInt32
                                                    name:@"castInputTensor"];
-              else
-                castInputTensor = inputTensor;
+                castOutput = true;
+              }
+
+              if(reduction_type == MPSReductionType::MAX)
+                outputTensor = [mpsGraph reductionMaximumWithTensor:castInputTensor
+                                                               axis:(NSInteger)dim_
+                                                               name:nil];
+              else if(reduction_type == MPSReductionType::MIN)
+                outputTensor = [mpsGraph reductionMinimumWithTensor:castInputTensor
+                                                               axis:(NSInteger)dim_
+                                                               name:nil];
 
               MPSGraphTensor* argreduceOutTensor = nil;
               if(reduction_type == MPSReductionType::MAX)
@@ -1481,6 +1482,11 @@ Tensor min_mps(const Tensor& input_t) {
                                                             toType:MPSDataTypeInt64
                                                               name:@"cast_out"];
 
+              if (castOutput) {
+                outputTensor = [mpsGraph castTensor:outputTensor
+                                             toType:native_mps::getMPSDataType(output_t.scalar_type())
+                                               name:@"cast_out"];
+              }
               newCachedGraph->inputTensor_ = inputTensor;
               newCachedGraph->outputTensor_ = outputTensor;
               newCachedGraph->indicesTensor_ = indicesTensor;
diff --git a/test/test_mps.py b/test/test_mps.py
index 503d1f9f23946..67181d8911083 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8401,6 +8401,8 @@ class TestConsistency(TestCase):
         'masked.std': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked.var': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'minreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
     }
 
 
@@ -8594,8 +8596,7 @@ class TestConsistency(TestCase):
         'divfloor_rounding': [torch.int16, torch.int32, torch.int64],
         'divtrunc_rounding': [torch.float16],
         'norm': [torch.float16],
-        'minreduction_with_dim': [torch.bool, torch.int16, torch.uint8],
-        'maxreduction_with_dim': [torch.bool, torch.int16, torch.uint8],
+
         'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
         # Functions that are flaky
         # These are detected as "ok" by the expect case but actually fail to run sometimes

From 8c0bb00832e25e7860c7f0351fb81bc8338141a8 Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Mon, 9 Jan 2023 09:24:07 -0800
Subject: [PATCH 1876/1922] Make intermediate type for cumsum ScalarType::Int
 (#221)

* Make intermediate type for cumsum ScalarType::Int

* Disallow int64 as input for cumsum

* Fix error message; add test

Co-authored-by: abhipathak97 <abhipathak97@mps10.scv.apple.com>
---
 aten/src/ATen/native/mps/operations/UnaryOps.mm |  3 ++-
 test/test_mps.py                                | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 039e8e5f52c80..2670701bacb54 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -262,11 +262,12 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
     return;
   }
   auto input = dtype.has_value() ? self.to(dtype.value()) : self;
+  TORCH_CHECK(input.scalar_type() != ScalarType::Long, "MPS does not support cumsum op with int64 input");
   mps::unary_op(input, result, "cumsum_out_mp" + std::to_string(dim),
                 ^ MPSGraphTensor* (MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
        // cumsum is horribly broken for int8, int16 and as chances for overflow is pretty high, cast to int32
        if (isIntegralType(input.scalar_type()) && input.scalar_type() !=ScalarType::Int) {
-           inputTensor = mps::castMPSTensor(mpsGraph, inputTensor, result.scalar_type());
+           inputTensor = mps::castMPSTensor(mpsGraph, inputTensor, ScalarType::Int);
        }
        auto rc = [mpsGraph cumulativeSumWithTensor: inputTensor
                                               axis: dim
diff --git a/test/test_mps.py b/test/test_mps.py
index 67181d8911083..ddf15806a4889 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2165,6 +2165,23 @@ def test_from_numpy_non_contiguous(self):
         t_mps = torch.tensor(a, device="mps")
         self.assertEqual(t_cpu, t_mps.to("cpu"))
 
+    def test_cumsum_all_dtypes(self):
+        def helper(dtype):
+            t = torch.tensor([1,1,1,1], device="mps", dtype=dtype)
+            t_cpu = torch.tensor([1,1,1,1], device="cpu")
+
+            a = t.cumsum(0, dtype=dtype)
+            a_cpu = t_cpu.cumsum(0, dtype=dtype)
+
+            self.assertEqual(a.cpu(), a_cpu)
+        [helper(dtype) for dtype in [torch.int8, torch.int16, torch.int32, torch.float32]]
+
+        try:
+            helper(torch.int64)
+        except Exception as e:
+            e_string = str(e)
+            self.assertEqual(e_string, "MPS does not support cumsum op with int64 input")
+
 
 class TestLogical(TestCase):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):

From b323910cc3951d7354c1375b6455c415620eb874 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 10 Jan 2023 17:04:47 +0200
Subject: [PATCH 1877/1922] Add 2d grid sampler (#180)

* Add grid sampler op

* Add fallback support

* Update grid_sampler_2d to use roundToEven for nearest case

* Remove tabs

* Fix indentation
---
 aten/src/ATen/native/mps/MPSGraphVenturaOps.h | 112 ++++----
 .../ATen/native/mps/operations/GridSampler.mm | 147 +++++++++++
 aten/src/ATen/native/native_functions.yaml    |   1 +
 test/test_mps.py                              | 245 ++++++++++++++++++
 4 files changed, 460 insertions(+), 45 deletions(-)
 create mode 100644 aten/src/ATen/native/mps/operations/GridSampler.mm

diff --git a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
index 16e3845f27ebb..5f581dbbb78d6 100644
--- a/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
+++ b/aten/src/ATen/native/mps/MPSGraphVenturaOps.h
@@ -20,73 +20,95 @@ typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
 #endif
 
 - (MPSGraphTensor * _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor * _Nonnull)tensor
-                                       axis:(NSInteger)axis
-                                       name:(NSString * _Nullable)name;
+                                                axis:(NSInteger)axis
+                                                name:(NSString * _Nullable)name;
 
 - (MPSGraphTensor * _Nonnull)sortWithTensor:(MPSGraphTensor * _Nonnull)tensor
                                        axis:(NSInteger)axis
                                        name:(NSString * _Nullable)name;
 
 - (MPSGraphTensor * _Nonnull)argSortWithTensor:(MPSGraphTensor * _Nonnull)tensor
-                                       axis:(NSInteger)axis
-                                       name:(NSString * _Nullable)name;
+                                          axis:(NSInteger)axis
+                                          name:(NSString * _Nullable)name;
 
 - (MPSGraphTensor * _Nonnull)inverseOfTensor:(MPSGraphTensor * _Nonnull) inputTensor
-                                       name:(NSString * _Nullable)name;
+                                        name:(NSString * _Nullable)name;
 
 - (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
-                                  sizeTensor:(MPSGraphTensor * _Nonnull) size
-                         nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
-                                centerResult:(BOOL) centerResult
-                                alignCorners:(BOOL) alignCorners
-                                      layout:(MPSGraphTensorNamedDataLayout) layout
-                                        name:(NSString * _Nullable) name;
+                                           sizeTensor:(MPSGraphTensor * _Nonnull) size
+                                  nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                         centerResult:(BOOL) centerResult
+                                         alignCorners:(BOOL) alignCorners
+                                               layout:(MPSGraphTensorNamedDataLayout) layout
+                                                 name:(NSString * _Nullable) name;
 
 - (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
-                                  sizeTensor:(MPSGraphTensor * _Nonnull) size
-                           scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
-                         nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
-                                      layout:(MPSGraphTensorNamedDataLayout) layout
-                                        name:(NSString * _Nullable) name;
+                                           sizeTensor:(MPSGraphTensor * _Nonnull) size
+                                    scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
+                                  nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                               layout:(MPSGraphTensorNamedDataLayout) layout
+                                                 name:(NSString * _Nullable) name;
 
 - (MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
-                                   sizeTensor:(MPSGraphTensor * _Nonnull) size
-                                 centerResult:(BOOL) centerResult
-                                 alignCorners:(BOOL) alignCorners
-                                       layout:(MPSGraphTensorNamedDataLayout) layout
-                                         name:(NSString * _Nullable) name;
+                                            sizeTensor:(MPSGraphTensor * _Nonnull) size
+                                          centerResult:(BOOL) centerResult
+                                          alignCorners:(BOOL) alignCorners
+                                                layout:(MPSGraphTensorNamedDataLayout) layout
+                                                  name:(NSString * _Nullable) name;
 
 - (MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
-                                   sizeTensor:(MPSGraphTensor * _Nonnull) size
-                            scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
-                                       layout:(MPSGraphTensorNamedDataLayout) layout
-                                         name:(NSString * _Nullable) name;
+                                            sizeTensor:(MPSGraphTensor * _Nonnull) size
+                                     scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
+                                                layout:(MPSGraphTensorNamedDataLayout) layout
+                                                  name:(NSString * _Nullable) name;
 
 - (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
-                                               input:(MPSGraphTensor * _Nonnull) input
-                                 nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
-                                        centerResult:(BOOL) centerResult
-                                        alignCorners:(BOOL) alignCorners
-                                              layout:(MPSGraphTensorNamedDataLayout) layout
-                                                name:(NSString * _Nullable) name;
+                                                        input:(MPSGraphTensor * _Nonnull) input
+                                          nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                                 centerResult:(BOOL) centerResult
+                                                 alignCorners:(BOOL) alignCorners
+                                                       layout:(MPSGraphTensorNamedDataLayout) layout
+                                                         name:(NSString * _Nullable) name;
 
 - (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
-                                               input:(MPSGraphTensor * _Nonnull) input
-                                   scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
-                                 nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
-                                              layout:(MPSGraphTensorNamedDataLayout) layout
-                                                name:(NSString * _Nullable) name;
+                                                        input:(MPSGraphTensor * _Nonnull) input
+                                            scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
+                                          nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                                       layout:(MPSGraphTensorNamedDataLayout) layout
+                                                         name:(NSString * _Nullable) name;
 
 - (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
-                                                input:(MPSGraphTensor * _Nonnull) input
-                                         centerResult:(BOOL) centerResult
-                                         alignCorners:(BOOL) alignCorners
-                                               layout:(MPSGraphTensorNamedDataLayout) layout
-                                                 name:(NSString * _Nullable) name;
+                                                         input:(MPSGraphTensor * _Nonnull) input
+                                                  centerResult:(BOOL) centerResult
+                                                  alignCorners:(BOOL) alignCorners
+                                                        layout:(MPSGraphTensorNamedDataLayout) layout
+                                                          name:(NSString * _Nullable) name;
 
 - (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
-                                                input:(MPSGraphTensor * _Nonnull) input
-                                    scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
-                                               layout:(MPSGraphTensorNamedDataLayout) layout
-                                                 name:(NSString * _Nullable) name;
+                                                         input:(MPSGraphTensor * _Nonnull) input
+                                             scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
+                                                        layout:(MPSGraphTensorNamedDataLayout) layout
+                                                          name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source
+                                        coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates
+                                                  layout:(MPSGraphTensorNamedDataLayout) layout
+                                    normalizeCoordinates:(BOOL) normalizeCoordinates
+                                     relativeCoordinates:(BOOL) relativeCoordinates
+                                            alignCorners:(BOOL) alignCorners
+                                             paddingMode:(MPSGraphPaddingMode) paddingMode
+                                            samplingMode:(MPSGraphResizeMode) samplingMode
+                                           constantValue:(double) constantValue
+                                                    name:(NSString * _Nullable) name;
+
+- (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source
+                                        coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates
+                                                  layout:(MPSGraphTensorNamedDataLayout) layout
+                                    normalizeCoordinates:(BOOL) normalizeCoordinates
+                                     relativeCoordinates:(BOOL) relativeCoordinates
+                                            alignCorners:(BOOL) alignCorners
+                                             paddingMode:(MPSGraphPaddingMode) paddingMode
+                                     nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
+                                           constantValue:(double) constantValue
+                                                    name:(NSString * _Nullable) name;
 @end
diff --git a/aten/src/ATen/native/mps/operations/GridSampler.mm b/aten/src/ATen/native/mps/operations/GridSampler.mm
new file mode 100644
index 0000000000000..1258201763d9a
--- /dev/null
+++ b/aten/src/ATen/native/mps/operations/GridSampler.mm
@@ -0,0 +1,147 @@
+#include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/GridSamplerUtils.h>
+#include <ATen/native/mps/MPSGraphVenturaOps.h>
+
+namespace at {
+namespace native {
+
+void grid_sampler_2d_mps_impl(Tensor &output, const Tensor& input, const Tensor& grid,
+                              int64_t interpolation_mode, int64_t padding_mode,
+                              bool align_corners) {
+  using namespace mps;
+  check_grid_sampler_common(input, grid);
+  check_grid_sampler_2d(input, grid);
+
+  MPSGraphResizeMode samplingMode;
+  MPSGraphPaddingMode paddingMode;
+
+  auto memory_format = input.suggest_memory_format();
+  MPSGraphTensorNamedDataLayout inputTensorLayout =
+      (memory_format == at::MemoryFormat::Contiguous) ? MPSGraphTensorNamedDataLayoutNCHW : MPSGraphTensorNamedDataLayoutNHWC;
+
+  switch (static_cast<GridSamplerPadding>(padding_mode)) {
+    case GridSamplerPadding::Zeros:
+      paddingMode = MPSGraphPaddingModeZero; break;
+    case GridSamplerPadding::Border:
+      TORCH_CHECK(false, "MPS: Unsupported Border padding mode"); break;
+    case GridSamplerPadding::Reflection:
+      paddingMode = align_corners == true ? MPSGraphPaddingModeReflect : MPSGraphPaddingModeSymmetric; break;
+    default:
+      TORCH_CHECK(false, "MPS: Unrecognised Padding Mode: ", padding_mode);
+  }
+
+  switch (static_cast<GridSamplerInterpolation>(interpolation_mode)) {
+    case GridSamplerInterpolation::Bilinear:
+      samplingMode = MPSGraphResizeBilinear; break;
+    case GridSamplerInterpolation::Nearest:
+      samplingMode = MPSGraphResizeNearest; break;
+    case GridSamplerInterpolation::Bicubic:
+      TORCH_CHECK(false, "MPS: Unsupported Bicubic interpolation"); break;
+    default:
+      TORCH_CHECK(false, "MPS: Unrecognised interpolation mode: ", interpolation_mode); break;
+   }
+
+  MPSStream *stream = getCurrentMPSStream();
+
+  struct CachedGraph : public MPSCachedGraph {
+    CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
+    MPSGraphTensor* inputTensor_ = nil;
+    MPSGraphTensor* gridTensor_ = nil;
+    MPSGraphTensor* outputTensor_ = nil;
+  };
+
+  MPSGraphCache* cache_ = MPSGraphCache::getInstance();
+
+  @autoreleasepool {
+    string key = "grid_sampler_2d_mps"                     +
+                  getTensorsStringKey({input, grid})       +
+                  ":" + std::to_string(interpolation_mode) +
+                  ":" + std::to_string(padding_mode)       +
+                  ":" + std::to_string(align_corners);
+
+    CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
+    if(!cachedGraph) {
+      MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
+
+        CachedGraph *newCachedGraph = nil;
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
+          MPSGraphTensor* gridTensor = mpsGraphRankedPlaceHolder(mpsGraph, grid);
+
+          MPSGraphTensor* outputTensor = nil;
+          if (static_cast<GridSamplerInterpolation>(interpolation_mode) == GridSamplerInterpolation::Nearest) {
+            outputTensor = [mpsGraph sampleGridWithSourceTensor: inputTensor
+                                               coordinateTensor: gridTensor
+                                                         layout: inputTensorLayout
+                                           normalizeCoordinates: TRUE
+                                            relativeCoordinates: FALSE
+                                                   alignCorners: align_corners
+                                                    paddingMode: paddingMode
+                                            nearestRoundingMode: MPSGraphResizeNearestRoundingModeRoundToEven
+                                                  constantValue: 0.0f
+                                                           name: nil];
+          } else {
+            outputTensor = [mpsGraph sampleGridWithSourceTensor: inputTensor
+                                               coordinateTensor: gridTensor
+                                                         layout: inputTensorLayout
+                                           normalizeCoordinates: TRUE
+                                            relativeCoordinates: FALSE
+                                                   alignCorners: align_corners
+                                                    paddingMode: paddingMode
+                                                   samplingMode: samplingMode
+                                                  constantValue: 0.0f
+                                                           name: nil];
+          }
+
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->gridTensor_ = gridTensor;
+          newCachedGraph->outputTensor_ = outputTensor;
+        }
+        return newCachedGraph;
+      });
+      cachedGraph = static_cast<CachedGraph *>(tmpCachedGraph);
+    }
+
+    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input);
+    Placeholder gridPlaceholder = Placeholder(cachedGraph->gridTensor_, grid);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output);
+
+
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
+      gridPlaceholder.getMPSGraphTensor() : gridPlaceholder.getMPSGraphTensorData()
+    };
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+    };
+
+    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+  }
+}
+
+Tensor grid_sampler_2d_mps(const Tensor& input, const Tensor& grid,
+                           int64_t interpolation_mode, int64_t padding_mode,
+                           bool align_corners) {
+  if (!is_macos_13_or_newer()) {
+    TORCH_WARN_ONCE("MPS: grid_sampler_2d op is supported natively starting from macOS 13.1. ",
+                    "Falling back on CPU. This may have performance implications.");
+
+    return at::grid_sampler_2d(
+      input.to("cpu"), grid.to("cpu"), interpolation_mode, padding_mode, align_corners).clone().to("mps");
+  }
+
+  auto in_size = input.sizes();
+  auto grid_size = grid.sizes();
+  auto output = at::empty(
+      {in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options());
+
+  grid_sampler_2d_mps_impl(
+      output, input, grid, interpolation_mode, padding_mode, align_corners);
+  return output;
+}
+
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index b4fea233d1970..a0ffda96aad82 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -2706,6 +2706,7 @@
   dispatch:
     CPU, QuantizedCPU: grid_sampler_2d_cpu
     CUDA: grid_sampler_2d_cuda
+    MPS: grid_sampler_2d_mps
   autogen: grid_sampler_2d.out
   tags: canonical
 
diff --git a/test/test_mps.py b/test/test_mps.py
index ddf15806a4889..97a015a9611ef 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7093,6 +7093,247 @@ def test_conv2d_single_stride(self):
             x_gpu = conv_gpu(y_gpu)
             self.assertEqual(x_cpu, x_gpu.cpu(), rtol=1e-03, atol=1e-05)
 
+    def test_grid_sample(self):
+        def test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad):
+            def test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners):
+                for grid_dim_contig_order in [(0, 1, 2, 3), (0, 3, 1, 2), (3, 0, 1, 2), (0, 2, 1, 3)]:
+                    # grid_dim_contig_order specifies the dimension order that can
+                    # make grid to be contiguous.
+                    # i.e., grid.permute(grid_dim_contig_order) is contiguous.
+                    # e.g., with grid_dim_contig_order=[0, 3, 1, 2], grid should be
+                    #       initialized with contiguous tensor of shape [N, 2, H, W]
+                    #       and permuted to [N, H, W, 2] afterwards.
+                    grid_shape = [N, H, W, 2]
+                    grid_init_shape = [grid_shape[d] for d in grid_dim_contig_order]
+                    grid_fwd_permute = [None, None, None, None]
+                    for i, d in enumerate(grid_dim_contig_order):
+                        grid_fwd_permute[d] = i
+
+                    def get_grid(device='cpu', data=None):
+                        if data is not None:
+                            assert list(data.shape) == grid_shape
+                            data = data.permute(grid_dim_contig_order).to(device)
+                        else:
+                            data = torch.randn(grid_init_shape, device=device)
+                        grid = data.permute(grid_fwd_permute)
+                        assert grid.permute(grid_dim_contig_order).is_contiguous()
+                        return grid
+
+                    input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_(input_requires_grad)
+                    grid_cpu = get_grid().requires_grad_()
+                    out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode,
+                                            align_corners=align_corners)
+                    self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W]))
+
+                    gradients = torch.randn_like(out_cpu)
+                    out_cpu.backward(gradients)
+
+
+                    # Compare against unvectorized CPU fallback
+
+                    # NOTE [ grid_sample CPU fallback ]
+                    # grid_sample uses AVX for 2d images, but that requires 32-bit indexing for
+                    # 32-bit floats. So we also have a fallback that is used only for float tensors
+                    # requiring 64-bit indexing. That requires too much memory to run on CI, so we
+                    # also export the fallback and test it here to ensure feature parity with
+                    # the vectorized version.
+                    input_fallback = input_cpu.float().detach_().requires_grad_()
+                    grid_fallback = grid_cpu.float().detach_().requires_grad_()
+                    out_fallback = torch._grid_sampler_2d_cpu_fallback(
+                        input_fallback, grid_fallback,
+                        F.GRID_SAMPLE_INTERPOLATION_MODES[mode],
+                        F.GRID_SAMPLE_PADDING_MODES[padding_mode],
+                        align_corners)
+                    self.assertEqual(out_fallback, out_cpu.float(), atol=1e-5, rtol=5e-5)
+
+                    out_fallback.backward(gradients.float())
+                    if input_requires_grad:
+                        self.assertEqual(input_fallback.grad, input_cpu.grad.float(), atol=1e-4, rtol=5e-5)
+                    self.assertEqual(grid_fallback.grad, grid_cpu.grad.float(), atol=1e-4, rtol=5e-5)
+
+                    input_mps = input_cpu.detach().transpose(0, 1).to("mps").transpose(0, 1).requires_grad_(input_requires_grad)
+                    grid_mps = get_grid('mps', grid_cpu.detach()).requires_grad_()
+                    out_mps = F.grid_sample(input_mps, grid_mps, mode=mode, padding_mode=padding_mode,
+                                                align_corners=align_corners)
+                    self.assertEqual(out_cpu, out_mps)
+
+                    out_mps.backward(gradients.to("mps"))
+                    if input_requires_grad:
+                        self.assertEqual(input_cpu.grad, input_mps.grad)
+                    self.assertEqual(grid_cpu.grad, grid_mps.grad, atol=5e-5, rtol=0)
+
+                    # check that zero-dimensional input strides don't error out
+                    base_input = torch.randn(N, C, 1, IW)
+                    input_cpu = base_input.expand_as(input_mps).requires_grad_(input_requires_grad)
+                    out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode,
+                                            align_corners=align_corners)
+
+                    input_mps = base_input.to("mps").expand_as(input_mps).requires_grad_(input_requires_grad)
+                    out_mps = F.grid_sample(input_mps, grid_mps, mode=mode, padding_mode=padding_mode,
+                                                align_corners=align_corners)
+                    self.assertEqual(out_cpu, out_mps)
+
+            # test same size output
+            test_shape(N, C, H, W, H, W, mode, padding_mode, align_corners)
+
+            # test larger output
+            N = random.randint(2, 8)
+            C = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            H = random.randint(IH + 1, 12)
+            W = random.randint(IW + 1, 12)
+            test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners)
+
+            # test smaller output
+            N = random.randint(2, 8)
+            C = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            H = random.randint(2, IH)
+            W = random.randint(2, IW)
+            test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners)
+
+            # test 1x1 inpput
+            N = random.randint(2, 8)
+            C = random.randint(2, 8)
+            IH = 1
+            IW = 1
+            H = random.randint(2, 5)
+            W = random.randint(2, 5)
+            test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners)
+
+            # testing empty grid
+            N = random.randint(2, 8)
+            C = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            W = random.randint(3, IW + 2)
+            test_shape(N, C, IH, IW, 0, W, mode, padding_mode, align_corners)
+
+            # testing empty channel
+            N = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            H = random.randint(3, IH + 2)
+            W = random.randint(3, IW + 2)
+            test_shape(N, 0, IH, IW, H, W, mode, padding_mode, align_corners)
+
+            # testing empty batch
+            C = random.randint(2, 8)
+            IH = random.randint(2, 8)
+            IW = random.randint(2, 8)
+            H = random.randint(3, IH + 2)
+            W = random.randint(3, IW + 2)
+            test_shape(0, C, IH, IW, H, W, mode, padding_mode, align_corners)
+
+        for mode in ('bilinear', 'nearest'):
+            for padding_mode in ('zeros', 'reflection'):
+                for align_corners in (True, False):
+                    # test known input
+                    input = torch.arange(1., 11, device="mps").view(1, 1, 2, 5)
+                    grid = torch.tensor(
+                        [[[-0.9, -4.1], [0, 0.2000], [1, -1], [-0.333, 1e-6], [0.5, 1.0]],
+                         [[-1.0, -0.5], [0, 0.3333], [1, -1], [-0.200, 1e-6], [1.5, 0.5]]], device="mps").view(1, 2, 5, 2)
+                    if mode == 'bilinear':
+                        if padding_mode == 'zeros':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[0.0000, 6.0000000000, 5.0000, 4.8340, 9.0000],
+                                     [2.2500, 6.3332500450, 5.0000, 5.1000, 0.0000]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[0.0000, 6.5000000000, 1.2500, 4.6675000191, 4.6250],
+                                     [0.5000, 7.1665000916, 1.2500, 5.0000000000, 0.0000]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'border':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[1.2000, 6.0000000000, 5.0000, 4.8340, 9.0000],
+                                     [2.2500, 6.3332500450, 5.0000, 5.1000, 8.7500]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[1.0000, 6.5000000000, 5.0000, 4.6675000191, 9.2500],
+                                     [1.0000, 7.1665000916, 5.0000, 5.0000000000, 10.0000]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'reflection':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[3.4500, 6.0000000000, 5.0000, 4.8340, 9.0000],
+                                     [2.2500, 6.3332500450, 5.0000, 5.1000, 7.7500]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[3.0000004768, 6.5000000000, 5.0000, 4.6675000191, 9.2500],
+                                     [1.0000000000, 7.1665000916, 5.0000, 5.0000000000, 9.2500]], device="mps").view(1, 1, 2, 5)
+                        else:
+                            raise AssertionError("missing groundtruth test for padding mode '{}'".format(padding_mode))
+                    elif mode == 'nearest':
+                        if padding_mode == 'zeros':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[0., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 0.]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[0., 8., 5., 7., 0.],
+                                     [1., 8., 5., 8., 0.]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'border':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[1., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 10.]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[1., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 10.]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'reflection':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[1., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 9.]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[1., 8., 5., 7., 9.],
+                                     [1., 8., 5., 8., 9.]], device="mps").view(1, 1, 2, 5)
+                        else:
+                            raise AssertionError("missing groundtruth test for padding mode '{}'".format(padding_mode))
+                    elif mode == 'bicubic':
+                        if padding_mode == 'zeros':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[-0.10424726, 7.1400003, 5.0000, 5.7842274, 9.0000],
+                                     [2.4492188, 7.4814040, 5.0000, 6.0277520, 0.0000]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[0.00000, 7.6287503, 1.0625, 5.5977230, 5.3270264],
+                                     [0.40625, 8.0288770, 1.0625, 5.9375067, -0.3515625]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'border':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[1.1520010, 6.0599990, 5.0000, 4.870930, 9.0000000],
+                                     [2.1328125, 6.4258375, 5.0000, 5.076003, 8.8671875]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[0.894531, 6.6050020, 4.625, 4.7138715, 9.800781],
+                                     [0.906250, 7.2822485, 4.625, 5.0000052, 10.00000]], device="mps").view(1, 1, 2, 5)
+                        elif padding_mode == 'reflection':
+                            if align_corners:
+                                groundtruth = torch.tensor(
+                                    [[3.1822524, 6.239998, 5.0000, 4.8709273, 9.00000],
+                                     [1.7812500, 6.703594, 5.0000, 5.0760007, 8.21875]], device="mps").view(1, 1, 2, 5)
+                            else:
+                                groundtruth = torch.tensor(
+                                    [[2.7993753, 6.6050020, 4.25, 4.7138715, 10.269531],
+                                     [0.8125000, 7.2822485, 4.25, 5.0000052, 9.332031]], device="mps").view(1, 1, 2, 5)
+                        else:
+                            raise AssertionError("missing groundtruth test for padding mode '{}'".format(padding_mode))
+
+                    else:
+                        raise AssertionError("missing groundtruth test for interpolation mode '{}'".format(mode))
+                    output = F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode,
+                                           align_corners=align_corners)
+                    self.assertEqual(output, groundtruth, atol=1e-5, rtol=0,
+                                     msg="groundtruth comparison failed for mode={}, "
+                                     "padding_mode={}".format(mode, padding_mode))
+
 class TestAdvancedIndexing(TestCase):
     supported_dtypes = [torch.float32, torch.float16, torch.int64, torch.int32, torch.int16, torch.uint8]
     supported_np_dtypes = [np.float32, np.float16, np.int64, np.int32, np.int16, np.uint8]
@@ -8653,6 +8894,10 @@ class TestConsistency(TestCase):
         # failure due to issue: atan2() may generate NAN in output with
         'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
 
+        # Unsupported Border padding mode
+        'grid_sampler_2d': ['f16', 'f32', 'i16'],
+        'nn.functional.grid_sample': ['f32'],
+
         # failures due to issue #103039644: Wrong results from avgPooling2DWithSourceTensor()
         # when both ceilMode and includeZeroPadToAverage are True
         'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'],

From c071c85af49fd8f5e63fd6b813c669a3acdef298 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Thu, 12 Jan 2023 03:41:43 +0200
Subject: [PATCH 1878/1922] Regenerate TestConsistency (#225)

* Document index_put expected failure

* Regenerate TestConsistency to include all the tests

* Revert faulty changes from rebase

* Fix naming
---
 test/test_mps.py | 1166 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 947 insertions(+), 219 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 97a015a9611ef..388f613610b04 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -27,7 +27,7 @@
 from torch.testing._internal.common_dtype import get_all_dtypes, integral_types
 import torch.backends.mps
 from torch.distributions import Uniform, Exponential
-from functools import partial
+from functools import partial, reduce
 
 from torch.testing._internal.common_methods_invocations import (
     op_db,
@@ -8356,56 +8356,63 @@ class TestConsistency(TestCase):
     # by doing `EXPECTTEST_ACCEPT=1 python test_mps.py TestConsistencyCPU`
     # You most likely do NOT want to modify this manually
     ALLOWLIST_OP = {
+        'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__getitem__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__radd__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__rand__': ['b8', 'i16', 'i32', 'i64', 'u8'],
         '__rdiv__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        '__rmatmul__': ['f32'],
+        '__rmatmul__': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        '__rmod__': ['f16', 'f32'],
         '__rmul__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__ror__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        '__rpow__': ['f16'],
+        '__rpow__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        '__rsub__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__rxor__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'masked.argmax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.argmin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.log_softmax': ['f32'],
-        'masked.logaddexp': ['f32'],
-        'masked.logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.norm': ['f16', 'f32'],
-        'masked.normalize': ['f16', 'f32'],
-        'masked.softmax': ['f32'],
-        'masked.softmin': ['f32'],
-        'abs': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'acos': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'acosh': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        '_native_batch_norm_legit': ['f32'],
+        '_softmax_backward_data': ['f32'],
+        'abs': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'acos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'acosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'addbmm': ['f32'],
+        'addbmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'addcdiv': ['f32'],
         'addcmul': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'addmm': ['f32'],
-        'addmv': ['f32'],
-        'addr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'addmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'addmv': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'addr': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'all': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'allclose': ['f16', 'f32'],
+        'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'aminmax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'angle': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'any': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'arange': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'amax': ['f32'],
-        'amix': ['f32'],
-        'mean': ['f32'],
-        'meshgrid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'meshgridvariadic_tensors': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'meshgridlist_of_tensors': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sum': ['f32'],
-        'asin': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'asinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'atan': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'atan2': ['f32', 'i64'],
-        'atanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'argsort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'argwhere': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'as_strided': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'as_strided_scatter': ['b8',
+                               'f16',
+                               'f32',
+                               'i16',
+                               'i32',
+                               'i64',
+                               'u8'],
+        'asin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'asinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'atan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'atan2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'atanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_1d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_2d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_3d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'baddbmm': ['f32'],
+        'baddbmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'bernoulli': ['f32'],
+        'bfloat16': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'bincount': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_and': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'bitwise_left_shift': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_not': ['b8', 'i16', 'i32', 'i64', 'u8'],
@@ -8413,13 +8420,30 @@ class TestConsistency(TestCase):
         'bitwise_right_shift': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_xor': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'bmm': ['f32'],
+        'bmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'bool': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'broadcast_shapes': ['f32'],
+        'broadcast_tensors': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
         'broadcast_to': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'bucketize': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'byte': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cartesian_prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'cat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'ceil': ['f32', 'int32', 'int64', 'f16'],
+        'cdist': ['f32'],
+        'cdouble': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ceil': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cfloat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'chalf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'char': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cholesky': ['f32'],
+        'cholesky_inverse': ['f32'],
+        'cholesky_solve': ['f32'],
         'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -8427,243 +8451,643 @@ class TestConsistency(TestCase):
         'clone': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'column_stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'combinations': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'complex': ['f16', 'f32'],
         'conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'constant_pad_nd': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'corrcoef': ['f32'],
-        'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'cosh': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'cov': ['f32'],
-        'cumsum': ['f16', 'f32', 'int16', 'int32'],
+        'copysign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'corrcoef': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cov': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cross': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cummax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cummin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'diagflat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'diagonal_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagonal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagonal_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagonal_scatter': ['b8',
+                             'f16',
+                             'f32',
+                             'i16',
+                             'i32',
+                             'i64',
+                             'u8'],
         'diff': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'dist': ['f32'],
-        'div': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'divfloor_rounding': ['f16', 'f32', 'u8'],
-        'divtrunc_rounding': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'digamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'dist': ['f16', 'f32'],
+        'div': ['f16', 'f32', 'u8', 'b8', 'i16', 'i32', 'i64'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'einsum': ['f32'],
-        'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'double': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'dsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'dstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'einsum': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'empty_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'eq': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'erf': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'exp': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'erf': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'erfc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'erfinv': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'exp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'expand': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'expand_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'expm1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'eye': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.hfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.hfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.hfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ihfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ihfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ihfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.irfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.irfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.irfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.rfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.rfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.rfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'flatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'flip': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'fliplr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'flipud': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'flip': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fliplr': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'flipud': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'float': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'floor': ['f32', 'f16', 'i16', 'i32', 'i64'],
-        'floor_divide': ['f32', 'f16'],
+        'float_power': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'floor': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'floor_divide': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fmax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fmin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fmod': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'frac': ['f16', 'f32'],
+        'frexp': ['f16', 'f32'],
+        'full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'full_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'gather': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'gcd': ['i16', 'i32', 'i64', 'u8'],
         'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'gradient': ['f16', 'f32', 'i16'],
-        'outer': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'geqrf': ['f32'],
+        'gradient': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'grid_sampler_2d': ['f32'],
         'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'half': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'heaviside': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'histc': ['f32'],
+        'histogram': ['f32'],
+        'histogramdd': ['f32'],
+        'hsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'hypot': ['f32'],
+        'i0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'igamma': ['f16', 'f32'],
+        'igammac': ['f16', 'f32'],
+        'index_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_reduce': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'index_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'inner': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'int': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isclose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isfinite': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'isin': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'isinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isnan': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'isneginf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'isposinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isreal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'kron': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'kthvalue': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'lcm': ['i16', 'i32', 'i64', 'u8'],
+        'ldexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'le': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'linalg.matrix_norm': ['f16'],
-        'linalg.multi_dot': ['f32'],
+        'lerp': ['f32'],
+        'lgamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.cholesky': ['f32'],
+        'linalg.cholesky_ex': ['f32'],
+        'linalg.cond': ['f32'],
+        'linalg.cross': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.det': ['f32'],
+        'linalg.eig': ['f32'],
+        'linalg.eigh': ['f32'],
+        'linalg.eigvals': ['f32'],
+        'linalg.eigvalsh': ['f32'],
+        'linalg.householder_product': ['f32'],
+        'linalg.inv': ['f32'],
+        'linalg.inv_ex': ['f32'],
+        'linalg.ldl_factor': ['f32'],
+        'linalg.ldl_factor_ex': ['f32'],
+        'linalg.ldl_solve': ['f32'],
+        'linalg.lstsq': ['f32'],
+        'linalg.lu': ['f32'],
+        'linalg.lu_factor': ['f32'],
+        'linalg.lu_factor_ex': ['f32'],
+        'linalg.lu_solve': ['f32'],
+        'linalg.matrix_norm': ['f16', 'f32'],
+        'linalg.matrix_power': ['f32'],
+        'linalg.matrix_rank': ['f32'],
+        'linalg.multi_dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.norm': ['f16', 'f32'],
+        'linalg.pinv': ['f32'],
+        'linalg.qr': ['f32'],
+        'linalg.slogdet': ['f32'],
+        'linalg.solve': ['f32'],
+        'linalg.solve_ex': ['f32'],
+        'linalg.solve_triangular': ['f32'],
         'linalg.svd': ['f32'],
+        'linalg.svdvals': ['f32'],
+        'linalg.tensorinv': ['f32'],
+        'linalg.tensorsolve': ['f32'],
+        'linalg.vander': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.vecdot': ['f32'],
         'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'log': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'log10': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log1p': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log2': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log_softmax': ['f32'],
-        'logaddexp': ['f16', 'f32'],
-        'logaddexp2': ['f16', 'f32'],
+        'log': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log10': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log1p': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log_softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'logaddexp': ['f32'],
+        'logaddexp2': ['f32'],
+        'logcumsumexp': ['f32'],
+        'logdet': ['f32'],
         'logical_and': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_not': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'logit': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'logsumexp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'long': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'lt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'lu': ['f32'],
+        'lu_solve': ['f32'],
+        'lu_unpack': ['f32'],
+        'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.amax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.amin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.log_softmax': ['f32'],
+        'masked.logaddexp': ['f32'],
+        'masked.logsumexp': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.median': ['f32'],
+        'masked.norm': ['f16', 'f32'],
+        'masked.normalize': ['f16', 'f32'],
+        'masked.prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.softmax': ['f32'],
+        'masked.softmin': ['f32'],
+        'masked.std': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.var': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'matmul': ['f32'],
-        'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'matmul': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'matrix_exp': ['f32'],
         'max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'maxreduction_no_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'maxbinary': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'minimum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'max_pool2d_with_indices_backward': ['f32'],
+        'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mean': ['f16', 'f32'],
+        'median': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'meshgrid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'min': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'minreduction_with_dim': ['f16', 'f32', 'i32'],
-        'minreduction_no_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'minbinary': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'mm': ['f32'],
-        'mv': ['f32'],
+        'minimum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'mode': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'movedim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'msort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'multinomial': ['f32'],
+        'mv': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'mvlgamma': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nan_to_num': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nanmean': ['f16', 'f32'],
+        'nanmedian': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nanquantile': ['f32'],
+        'nansum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'narrow': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'narrow_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'native_batch_norm': ['f32'],
+        'native_dropout_backward': ['b8',
+                                    'f16',
+                                    'f32',
+                                    'i16',
+                                    'i32',
+                                    'i64',
+                                    'u8'],
         'native_layer_norm': ['f32'],
         'ne': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'neg': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_empty_strided': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'new_full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nextafter': ['f32'],
+        'nn.functional._scaled_dot_product_attention': ['f32'],
+        'nn.functional.adaptive_avg_pool1d': ['f32'],
+        'nn.functional.adaptive_avg_pool2d': ['f32'],
+        'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'],
         'nn.functional.adaptive_max_pool1d': ['f32'],
         'nn.functional.adaptive_max_pool2d': ['f32'],
-        'nn.functional.bilinear': ['f32'],
+        'nn.functional.adaptive_max_pool3d': ['f32'],
+        'nn.functional.alpha_dropout': ['f32'],
+        'nn.functional.avg_pool1d': ['f32', 'i64'],
+        'nn.functional.avg_pool2d': ['f32', 'i64'],
+        'nn.functional.avg_pool3d': ['f32', 'i64'],
+        'nn.functional.batch_norm': ['f32'],
+        'nn.functional.bilinear': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.binary_cross_entropy': ['f32'],
         'nn.functional.binary_cross_entropy_with_logits': ['f32'],
         'nn.functional.celu': ['f32'],
         'nn.functional.conv1d': ['f32'],
         'nn.functional.conv2d': ['f32'],
         'nn.functional.conv_transpose1d': ['f32'],
-        'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.conv_transpose2d': ['f32'],
+        'nn.functional.cosine_embedding_loss': ['b8',
+                                                'f32',
+                                                'i16',
+                                                'i32',
+                                                'i64',
+                                                'u8'],
         'nn.functional.cosine_similarity': ['f32'],
         'nn.functional.cross_entropy': ['f32'],
+        'nn.functional.ctc_loss': ['f32'],
+        'nn.functional.dropout': ['f32'],
+        'nn.functional.dropout2d': ['f32'],
+        'nn.functional.dropout3d': ['f32'],
         'nn.functional.elu': ['f32'],
-        'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.embedding': ['f16', 'f32'],
+        'nn.functional.embedding_bag': ['f16', 'f32'],
+        'nn.functional.feature_alpha_dropout': ['b8',
+                                                'f16',
+                                                'f32',
+                                                'i16',
+                                                'i32',
+                                                'i64',
+                                                'u8'],
+        'nn.functional.fractional_max_pool2d': ['f32'],
+        'nn.functional.fractional_max_pool3d': ['f32'],
         'nn.functional.gaussian_nll_loss': ['f32'],
+        'nn.functional.gelu': ['f32'],
         'nn.functional.glu': ['f32'],
+        'nn.functional.grid_sample': ['f32'],
         'nn.functional.group_norm': ['f32'],
+        'nn.functional.hardshrink': ['f32'],
+        'nn.functional.hardsigmoid': ['f32'],
+        'nn.functional.hardswish': ['f32'],
         'nn.functional.hardtanh': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.hinge_embedding_loss': ['f32'],
         'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
-        'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'],
+        'nn.functional.interpolate': ['f32', 'u8'],
+        'nn.functional.kl_div': ['f32'],
         'nn.functional.l1_loss': ['f16', 'f32'],
         'nn.functional.layer_norm': ['f32'],
         'nn.functional.leaky_relu': ['f32'],
-        'nn.functional.linear': ['f32'],
+        'nn.functional.linear': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.local_response_norm': ['f32', 'i64'],
-        'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.logsigmoid': ['f32'],
+        'nn.functional.margin_ranking_loss': ['f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
         'nn.functional.max_pool1d': ['f32'],
         'nn.functional.max_pool2d': ['f32'],
+        'nn.functional.max_pool3d': ['f32'],
+        'nn.functional.max_unpool1d': ['f32'],
+        'nn.functional.max_unpool2d': ['f32'],
+        'nn.functional.max_unpool3d': ['f32'],
+        'nn.functional.mish': ['f32'],
         'nn.functional.mse_loss': ['f16', 'f32'],
+        'nn.functional.multi_margin_loss': ['f32'],
+        'nn.functional.multilabel_margin_loss': ['f32'],
+        'nn.functional.multilabel_soft_margin_loss': ['f32'],
+        'nn.functional.nll_loss': ['f32'],
         'nn.functional.normalize': ['f32'],
         'nn.functional.one_hot': ['i64'],
-        'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.padcircular': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.padconstant': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.padreflect': ['f32'],
-        'nn.functional.padreplicate': ['f32'],
-        'nn.functional.pairwise_distance': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.pixel_unshuffle': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.pixel_shuffle': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'],
+        'nn.functional.pad': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'nn.functional.pairwise_distance': ['f16',
+                                            'f32',
+                                            'i16',
+                                            'i32',
+                                            'i64',
+                                            'u8'],
+        'nn.functional.pdist': ['f32'],
+        'nn.functional.pixel_shuffle': ['b8',
+                                        'f16',
+                                        'f32',
+                                        'i16',
+                                        'i32',
+                                        'i64',
+                                        'u8'],
+        'nn.functional.pixel_unshuffle': ['b8',
+                                          'f16',
+                                          'f32',
+                                          'i16',
+                                          'i32',
+                                          'i64',
+                                          'u8'],
+        'nn.functional.poisson_nll_loss': ['f32',
+                                           'i16',
+                                           'i32',
+                                           'i64',
+                                           'u8'],
         'nn.functional.prelu': ['f32'],
         'nn.functional.relu': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.relu6': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.rrelu': ['f32'],
         'nn.functional.selu': ['f32'],
         'nn.functional.silu': ['f32'],
         'nn.functional.smooth_l1_loss': ['f16', 'f32'],
         'nn.functional.soft_margin_loss': ['f32'],
-        'nn.functional.softmin': ['f32'],
-        'nn.functional.softplus': ['f32'],
-        'nn.functional.softsign': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'u8'],
+        'nn.functional.softmin': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.softshrink': ['f32'],
+        'nn.functional.softsign': ['f16',
+                                   'f32',
+                                   'i16',
+                                   'i32',
+                                   'i64',
+                                   'u8'],
+        'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.threshold': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.triplet_margin_with_distance_loss': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.triplet_margin_loss': ['f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
+        'nn.functional.triplet_margin_with_distance_loss': ['f32',
+                                                            'i16',
+                                                            'i32',
+                                                            'i64',
+                                                            'u8'],
+        'nn.functional.unfold': ['f16', 'f32'],
         'nn.functional.upsample_bilinear': ['f32'],
-        'nn.functional.upsample_nearest': ['f32'],
+        'nn.functional.upsample_nearest': ['f32', 'u8'],
+        'nonzero': ['b8', 'f32', 'i16', 'i32', 'i64'],
         'norm': ['f32', 'f16'],
+        'normal': ['f16', 'f32'],
+        'ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ones_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ormqr': ['f32'],
+        'outer': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'pca_lowrank': ['f32'],
+        'permute': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'pinverse': ['f32'],
+        'polar': ['f32'],
+        'polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'positive': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'pow': ['f16'],
+        'pow': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'qr': ['f32'],
+        'quantile': ['f32'],
         'rad2deg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'rand_like': ['f16', 'f32'],
+        'randint': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'randint_like': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'randn': ['f16', 'f32'],
+        'randn_like': ['f16', 'f32'],
+        'ravel': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'real': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'remainder': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'renorm': ['f16', 'f32'],
         'repeat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'repeat_interleave': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'resize_': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'resize_as_': ['b8', 'i16', 'i32', 'i64', 'u8'],
+        'repeat_interleave': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'reshape': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'reshape_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'resize_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'resize_as_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resolve_conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'reshape_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'roll': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
-        'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'round': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'rsqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'rsub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'scalar_tensor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'scatter_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'scatter_reduce': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'searchsorted': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'segment_reduce': ['f16', 'f32'],
+        'select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'short': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'sigmoid': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'signal.windows.bartlett': ['f16', 'f32'],
+        'signal.windows.blackman': ['f16', 'f32'],
+        'signal.windows.cosine': ['f16', 'f32'],
+        'signal.windows.exponential': ['f16', 'f32'],
+        'signal.windows.gaussian': ['f16', 'f32'],
+        'signal.windows.general_cosine': ['f16', 'f32'],
+        'signal.windows.general_hamming': ['f16', 'f32'],
+        'signal.windows.hamming': ['f16', 'f32'],
+        'signal.windows.hann': ['f16', 'f32'],
+        'signal.windows.kaiser': ['f16', 'f32'],
+        'signbit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sinc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'slice': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'softmax': ['f32'],
+        'softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'sort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.airy_ai': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_j0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_j1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_y0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_y1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.chebyshev_polynomial_t': ['b8',
+                                           'f32',
+                                           'i16',
+                                           'i32',
+                                           'i64',
+                                           'u8'],
+        'special.chebyshev_polynomial_u': ['b8',
+                                           'f32',
+                                           'i16',
+                                           'i32',
+                                           'i64',
+                                           'u8'],
+        'special.entr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.erfcx': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.hermite_polynomial_h': ['b8',
+                                         'f32',
+                                         'i16',
+                                         'i32',
+                                         'i64',
+                                         'u8'],
+        'special.hermite_polynomial_he': ['b8',
+                                          'f32',
+                                          'i16',
+                                          'i32',
+                                          'i64',
+                                          'u8'],
+        'special.i0e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.i1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.i1e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.laguerre_polynomial_l': ['b8',
+                                          'f32',
+                                          'i16',
+                                          'i32',
+                                          'i64',
+                                          'u8'],
+        'special.log_ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.modified_bessel_i0': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
+        'special.modified_bessel_i1': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
+        'special.modified_bessel_k0': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
+        'special.modified_bessel_k1': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
         'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'splitlist_args': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.ndtri': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.scaled_modified_bessel_k0': ['b8',
+                                              'f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
+        'special.scaled_modified_bessel_k1': ['b8',
+                                              'f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
+        'special.spherical_bessel_j0': ['b8',
+                                        'f32',
+                                        'i16',
+                                        'i32',
+                                        'i64',
+                                        'u8'],
+        'special.xlog1py': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.zeta': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'split_with_sizes': ['b8',
+                             'f16',
+                             'f32',
+                             'i16',
+                             'i32',
+                             'i64',
+                             'u8'],
+        'sqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'square': ['f16', 'f32'],
         'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'std': ['f16', 'f32'],
+        'std_mean': ['f16', 'f32'],
+        'stft': ['f32'],
         'sub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sum_to_size': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'svd': ['f32'],
+        'svd_lowrank': ['f32'],
+        'symeig': ['f32'],
         't': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'tan': ['b8', 'i16', 'i32', 'u8'],
-        'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'tensordot': ['f32'],
+        'take': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'take_along_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'tan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'tanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tensor_split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'tensordot': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'tile': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'topk': ['f32'],
+        'to_sparse': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'topk': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'trace': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'transpose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'trapezoid': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'trapz': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'triangular_solve': ['f32'],
         'tril': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tril_indices': ['i32', 'i64'],
         'triu': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'triu_indices': ['i32', 'i64'],
         'true_divide': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'trunc': ['f32'],
+        'trunc': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'unbind': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unflatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'unfold': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'unfold_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'uniform': ['f16', 'f32'],
+        'unique_consecutive': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unsqueeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'var': ['f16', 'f32'],
+        'var_mean': ['f16', 'f32'],
+        'vdot': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'view': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'view_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'view_as_complex': ['f16', 'f32'],
+        'view_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'vsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'where': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nonzero': ['f32', 'i16', 'i32', 'i64'],
-        'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.nll_loss': ['f32'],
-        'std': ['f16','f32'],
-        'var': ['f16','f32'],
-        'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'mean': ['f16', 'f32'],
-        'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.std': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.var': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'minreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'xlogy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'zeros_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8']
     }
 
-
     ALLOWLIST_OP_GRAD = {
         '__radd__': ['f16', 'f32'],
         '__rdiv__': ['f16', 'f32'],
@@ -8843,104 +9267,393 @@ class TestConsistency(TestCase):
         # Functions that hard crash
         'index_add': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'nn.functional.softplus': [torch.float32],
-        'nonzero': [torch.uint8, torch.float16],
+        'nonzero': [torch.bool, torch.uint8, torch.float16],
+        'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16],
+        'sgn': [torch.bool],
+        'linalg.inv': [torch.float32],
+        'linalg.inv_ex': [torch.float32],
+        'linalg.matrix_power': [torch.float32],
+        'nn.functional.interpolate': [torch.float32],
+        'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.interpolatearea': [torch.float32],
+        'resize_as_': [torch.float16, torch.float32],
+        'topk': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
 
         # Functions with correctness issues
-        '__rpow__': None,
         'nn.functional.avg_pool1d': [torch.float32, torch.int64],
         'nn.functional.avg_pool2d': [torch.float32, torch.int64],
         'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'index_put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'divfloor_rounding': [torch.int16, torch.int32, torch.int64],
         'divtrunc_rounding': [torch.float16],
         'norm': [torch.float16],
-
-        'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
-        # Functions that are flaky
-        # These are detected as "ok" by the expect case but actually fail to run sometimes
-        'softmaxwith_dtype': None,
-        'rounddecimals_neg_3': None,
-        'rounddecimals_3': None,
-        'rounddecimals_0': None,
-        'normnuc': None,
-        'nn.functional.softminwith_dtype': None,
-        'nn.functional.feature_alpha_dropoutwith_train': None,
-        'log_softmaxwith_dtype': None,
-        'split_with_sizes': None,
-        'trapezoid': None,
-        'inner': None,
-        'take_along_dim': None,
-
-        # New block list ops that need investigation
-        'nn.functional.conv_transpose2d': ['torch.float32'],
-        'nn.functional.interpolate': ['torch.float32'],
-        'topk': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-
-        # failures due to lack of op implementation on MPS backend
-        'linalg.eig': ['torch.float32'],
-        'linalg.eigvals': ['torch.float32'],
-        'fft.fft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.ifft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.ihfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.ihfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.ihfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.rfft2': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.rfft': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'fft.rfftn': ['torch.bool', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'stft': ['torch.float32'],
-        'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
+        'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
+        'cumulative_trapezoid': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        '_native_batch_norm_legit': [torch.float32],
+        'addr': [torch.float16],
+        'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'uniform': [torch.float16, torch.float32],
+        'trace': [torch.int64],
+        'tan': [torch.float32],
+        'normalnumber_mean': [torch.float16, torch.float32],
+        'nn.functional.gelu': [torch.float32],
+        'nn.functional.conv_transpose2d': [torch.float32, torch.int64],
+        'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'native_batch_norm': [torch.float32],
+        'multinomial': [torch.float32],
+        'masked.softmin': [torch.float32],
+        'masked.softmax': [torch.float32],
+        'masked.log_softmax': [torch.float32],
+        'floor_divide': [torch.int16, torch.int32, torch.int64],
+        'dist': [torch.float16],
 
         # failure due to issue: atan2() may generate NAN in output with
-        'atan2': ['torch.bool', 'torch.int16', 'torch.int32', 'torch.uint8'],
+        'atan2': [torch.bool, torch.int16, torch.int32, torch.uint8],
 
         # Unsupported Border padding mode
-        'grid_sampler_2d': ['f16', 'f32', 'i16'],
-        'nn.functional.grid_sample': ['f32'],
+        'grid_sampler_2d': [torch.float32],
+        'nn.functional.grid_sample': [torch.float32],
 
         # failures due to issue #103039644: Wrong results from avgPooling2DWithSourceTensor()
         # when both ceilMode and includeZeroPadToAverage are True
-        'nn.functional.avg_pool1d': ['torch.float32', 'torch.int64'],
-        'nn.functional.avg_pool2d': ['torch.float32', 'torch.int64'],
-        'nn.functional.adaptive_avg_pool1d': ['torch.float32'],
-        'nn.functional.adaptive_avg_pool2d': ['torch.float32'],
+        'nn.functional.avg_pool1d': [torch.float32, torch.int64],
+        'nn.functional.avg_pool2d': [torch.float32, torch.int64],
+        'nn.functional.adaptive_avg_pool1d': [torch.float32],
+        'nn.functional.adaptive_avg_pool2d': [torch.float32],
+    }
 
-        # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
-        'pow': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '__rpow__': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+    UNIMPLEMENTED_OPS = {
+        # Failures due to lack of op implementation on MPS backend
+        'linalg.eig': [torch.float32],
+        'linalg.eigvals': [torch.float32],
+        'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'stft': [torch.float32],
+        'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
+        'rounddecimals_neg_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'rounddecimals_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'rounddecimals_0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rmod__': [torch.float16, torch.float32],
+        '__rsub__': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'aminmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'angle': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'argsort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'bucketize': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cholesky': [torch.float32],
+        'cholesky_inverse': [torch.float32],
+        'cholesky_solve': [torch.float32],
+        'copysign': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cummax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cummin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cumprod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'digamma': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'erfc': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'erfinv': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fmin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fmod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'frexp': [torch.float16, torch.float32],
+        'gcd': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'geqrf': [torch.float32],
+        'heaviside': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'histc': [torch.float32],
+        'histogram': [torch.float32],
+        'histogramdd': [torch.float32],
+        'hypot': [torch.float32],
+        'i0': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'igamma': [torch.float16, torch.float32],
+        'igammac': [torch.float16, torch.float32],
+        'index_copy': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'index_fill': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'index_reduce': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'isin': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'isneginf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'isposinf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'kthvalue': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'lcm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'ldexp': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'lerp': [torch.float32],
+        'lgamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.cholesky': [torch.float32],
+        'linalg.cholesky_ex': [torch.float32],
+        'linalg.cond': [torch.float32],
+        'linalg.detsingular': [torch.float32],
+        'linalg.det': [torch.float32],
+        'linalg.eig': [torch.float32],
+        'linalg.eigh': [torch.float32],
+        'linalg.eigvals': [torch.float32],
+        'linalg.eigvalsh': [torch.float32],
+        'linalg.householder_product': [torch.float32],
+        'linalg.ldl_factor': [torch.float32],
+        'linalg.ldl_factor_ex': [torch.float32],
+        'linalg.ldl_solve': [torch.float32],
+        'linalg.lstsq': [torch.float32],
+        'linalg.lstsqgrad_oriented': [torch.float32],
+        'linalg.lu': [torch.float32],
+        'linalg.lu_factor': [torch.float32],
+        'linalg.lu_factor_ex': [torch.float32],
+        'linalg.lu_solve': [torch.float32],
+        'linalg.matrix_norm': [torch.float32],
+        'linalg.norm': [torch.float32],
+        'linalg.normsubgradients_at_zero': [torch.float32],
+        'linalg.qr': [torch.float32],
+        'linalg.slogdet': [torch.float32],
+        'linalg.solve': [torch.float32],
+        'linalg.solve_ex': [torch.float32],
+        'linalg.svdvals': [torch.float32],
+        'linalg.tensorsolve': [torch.float32],
+        'linalg.vander': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.vecdot': [torch.float32],
+        'logcumsumexp': [torch.float32],
+        'logdet': [torch.float32],
+        'logit': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'lu': [torch.float32],
+        'lu_solve': [torch.float32],
+        'lu_unpack': [torch.float32],
+        'masked.cumprod': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'masked.median': [torch.float32],
+        'masked_scatter': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'matrix_exp': [torch.float32],
+        'mode': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'msort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgamma': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgammamvlgamma_p_1': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgammamvlgamma_p_3': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgammamvlgamma_p_5': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nanquantile': [torch.float32],
+        'nanmean': [torch.float32, torch.float16],
+        'nanmedian': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nansum': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'native_dropout_backward': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nextafter': [torch.float32],
+        'normnuc': [torch.float32],
+        'nn.functional._scaled_dot_product_attention': [torch.float32],
+        'nn.functional.fractional_max_pool2d': [torch.float32],
+        'nn.functional.fractional_max_pool3d': [torch.float32],
+        'nn.functional.adaptive_avg_pool3d': [torch.float16, torch.float32],
+        'nn.functional.adaptive_max_pool3d': [torch.float32],
+        'nn.functional.interpolatebicubic': [torch.float32],
+        'nn.functional.interpolatelinear': [torch.float32],
+        'nn.functional.interpolatetrilinear': [torch.float32],
+        'nn.functional.max_unpool1dgrad': [torch.float32],
+        'nn.functional.max_unpool2dgrad': [torch.float32],
+        'nn.functional.max_unpool3dgrad': [torch.float32],
+        'nn.functional.avg_pool3d': [torch.float32, torch.int64],
+        'nn.functional.ctc_loss': [torch.float32],
+        'nn.functional.embedding_bag': [torch.float16, torch.float32],
+        'nn.functional.max_pool2d': [torch.float32],
+        'nn.functional.max_pool3d': [torch.float32],
+        'nn.functional.hardshrink': [torch.float32],
+        'nn.functional.hardsigmoid': [torch.float32],
+        'nn.functional.logsigmoid': [torch.float32],
+        'nn.functional.max_pool3d': [torch.float32],
+        'nn.functional.max_unpool1d': [torch.float32],
+        'nn.functional.max_unpool2d': [torch.float32],
+        'nn.functional.max_unpool3d': [torch.float32],
+        'nn.functional.mish': [torch.float32],
+        'nn.functional.multi_margin_loss': [torch.float32],
+        'nn.functional.multilabel_margin_loss': [torch.float32],
+        'nn.functional.multilabel_soft_margin_loss': [torch.float32],
+        'nn.functional.pdist': [torch.float32],
+        'nn.functional.rrelu': [torch.float32],
+        'nn.functional.softshrink': [torch.float32],
+        'nn.functional.unfold': [torch.float16, torch.float32],
+        'nn.functional.norm': [torch.float32],
+        'ormqr': [torch.float32],
+        'pca_lowrank': [torch.float32],
+        'pinverse': [torch.float32],
+        'polar': [torch.float32],
+        'polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_3': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_4': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'qr': [torch.float32],
+        'quantile': [torch.float32],
+        'remainder': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'renorm': [torch.float16, torch.float32],
+        'roll': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'rounddecimals_0': [torch.float32],
+        'rounddecimals_3': [torch.float32],
+        'rounddecimals_neg_3': [torch.float32],
+        'rsub': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reduceamax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reduceamin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reducemin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reducemean': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reduceprod': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reducesum': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'searchsorted': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'segment_reduce': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'segment_reduceoffsets': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'segment_reducelengths': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'sinc': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'sort': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.airy_ai': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_j0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_j1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_y0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_y1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.chebyshev_polynomial_t': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.chebyshev_polynomial_u': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.entr': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.erfcx': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.hermite_polynomial_h': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.hermite_polynomial_he': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.i0e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.i1e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.laguerre_polynomial_l': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.log_ndtr': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_i0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.ndtri': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.polygammaspecial_polygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.scaled_modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.scaled_modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.spherical_bessel_j0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.xlog1py': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.zeta': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'std_mean': [torch.float16, torch.float32],
+        'std_meanunbiased': [torch.float16, torch.float32],
+        'svd_lowrank': [torch.float32],
+        'symeig': [torch.float32],
+        'take': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'to_sparse': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'var_mean': [torch.float16, torch.float32],
+        'var_meanunbiased': [torch.float16, torch.float32],
+        'vdot': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'view_as_complex': [torch.float16, torch.float32],
+        'xlogy': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+    }
 
-        # failures due to unsupported data types on MPS backend
-        'matmul': ['torch.uint8'], # MPS device does not support mm for non-float inputs
-        'bfloat16': None,
-        'chalf': None,
+    EXPECTED_FAILURES = {
+        # Failures due to unsupported data types on MPS backend
+        'matmul': [torch.uint8], # MPS device does not support mm for non-float inputs
+        'bfloat16': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'chalf': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'nn.functional.conv1d': [torch.int64],
         'nn.functional.conv2d': [torch.int64],
         'nn.functional.conv_transpose1d': [torch.int64],
-        'nn.functional.conv_transpose2d': [torch.int64],
+        'nn.functional.softminwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'log_softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rmatmul__': [torch.int16, torch.int32, torch.uint8],
+        'addmmdecomposed': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addbmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addmv': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'baddbmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'bmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cdouble': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cfloat': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'complex': [torch.float16, torch.float32],
+        'double': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'einsum': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.hfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.hfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.hfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.irfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.irfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.irfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'float_power': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'full_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'inner': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.matrix_rank': [torch.float32],
+        'linalg.matrix_rankhermitian': [torch.float32],
+        'linalg.multi_dot': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.pinv': [torch.float32],
+        'linalg.pinvhermitian': [torch.float32],
+        'log_softmax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'log_softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'matmul': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mv': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_ones': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_zeros': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.batch_norm': [torch.float32],
+        'nn.functional.bilinear': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.linear': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.softmin': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.softminwith_dtype': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'ones_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'signal.windows.blackman': [torch.float16],
+        'signal.windows.cosine': [torch.float16],
+        'signal.windows.exponential': [torch.float16],
+        'signal.windows.gaussian': [torch.float16],
+        'signal.windows.general_cosine': [torch.float16],
+        'signal.windows.general_hamming': [torch.float16],
+        'signal.windows.hamming': [torch.float16],
+        'signal.windows.hann': [torch.float16],
+        'signal.windows.kaiser': [torch.float16],
+        'softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'stft': [torch.float32],
+        'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'zeros_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'bincount': [torch.int16, torch.int32, torch.int64, torch.uint8],
+
+        # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
+        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rpow__': [torch.int16, torch.int32],
+    }
 
+    UNDEFINED_BEHAVIOUR = {
         # failures due to random output that they generate using
         # Philox engine causing mismatch with CPU results
-        'rand_like': ['torch.float16', 'torch.float32'],
-        'randint_like': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'randn_like': ['torch.float16', 'torch.float32'],
-        'bernoulli': ['torch.float32'],
-        'normal': ['torch.float16', 'torch.float32', 'torch.float16', 'torch.float32'],
-        'nn.functional.dropout': ['torch.float32'],
-        'nn.functional.dropout2d': ['torch.float32'],
-        'nn.functional.dropout3d': ['torch.float32'],
+        'rand_like': [torch.float16, torch.float32],
+        'randint_like': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'randn_like': [torch.float16, torch.float32],
+        'bernoulli': [torch.float32],
+        'normal': [torch.float16, torch.float32, torch.float16, torch.float32],
+        'nn.functional.alpha_dropout': [torch.float32],
+        'nn.functional.dropout': [torch.float32],
+        'nn.functional.dropout2d': [torch.float32],
+        'nn.functional.dropout3d': [torch.float32],
          # these fill tensors with uninitialized data, causing mismatch with CPU
-        'new_empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'empty_like': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'empty': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'new_empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'empty_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         # problem 103190467, as_strided_scatter has non-deterministic behavior when the update indices are not unique
-        'as_strided_scatter': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
+        'as_strided_scatter': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        # duplicate indices are used in the testcase - undefined behaviour
+        'index_put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+    }
 
+    FAST_MATH_PRECISION_ISSUES = {
         # failures due to precision issues
-        'masked.var': ['f16'],
-        'nn.functional.gelu': ['torch.float32'],
-        'pow': ['torch.float32'],
-        'tan': ['torch.float32'],
-        '__rpow__': ['torch.float32'],
+        'tan': [torch.float32],
+        'pow_': [torch.float32],
+        'masked_softmin': [torch.float32],
+        'masked_softmax': [torch.float32],
+        'masked_log_softmax': [torch.float32],
+        'cdist': [torch.float32],
+        '__rpow__': [torch.float32]
     }
 
     FP16_LOW_PRECISION_LIST = {
@@ -8950,10 +9663,25 @@ class TestConsistency(TestCase):
         "true_divide"
     }
 
+    MPS_SKIP_LIST = reduce(lambda x,y: dict(x, **y), (FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS))
+
     # Used for accept mode only
     NEW_ALLOW_LIST = defaultdict(list)
     NEW_ALLOW_LIST_GRAD = defaultdict(list)
 
+    def get_error_message(self, key, op_name):
+        if key in self.FAST_MATH_PRECISION_ISSUES:
+            return f"Running test with {op_name} fails due to precision issues (fast math) so skipping"
+        elif key in self.BLOCKLIST:
+            return f"Running test with {op_name} fails so skipping"
+        elif key in self.UNDEFINED_BEHAVIOUR:
+            return f"Running test with {op_name} fails due to undefined behaviour / random output so skipping"
+        elif key in self.EXPECTED_FAILURES:
+            return f"Running test with {op_name} expected to fail due to unsupported MPS data type so skipping"
+        elif key in self.UNIMPLEMENTED_OPS:
+            return f"Running test with {op_name} expected to fail due to missing op implementation"
+        return f"Running test with {op_name} hangs so skipping"
+
     @ops(op_db, allowed_dtypes=MPS_DTYPES)
     def test_output_match(self, device, dtype, op):
         self.assertEqual(device, "cpu")
@@ -8961,9 +9689,9 @@ def test_output_match(self, device, dtype, op):
             self.skipTest("MPS is not available")
 
         key = op.name + op.variant_test_name
-        if key in self.BLOCKLIST:
-            if self.BLOCKLIST[key] is None or dtype in self.BLOCKLIST[key]:
-                self.skipTest(f"Running test with {op.name} hangs so skipping")
+        if key in self.MPS_SKIP_LIST:
+            if self.MPS_SKIP_LIST[key] is None or dtype in self.MPS_SKIP_LIST[key]:
+                self.skipTest(self.get_error_message(key, op.name))
 
         # Make this an expecttest manually
         # When this env variable is set, generate a new ALLOWLIST_OP

From c87cf480e5b067641d180f57f4e243c5da5169a1 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 13 Jan 2023 23:42:27 +0200
Subject: [PATCH 1879/1922] Remove test consistency duplicates (#231)

* Add uniform (random op) to expected failures

* Remove duplicate tests from TestConsistency
---
 test/test_mps.py | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 388f613610b04..cd20b3e1732f2 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9281,8 +9281,6 @@ class TestConsistency(TestCase):
         'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
 
         # Functions with correctness issues
-        'nn.functional.avg_pool1d': [torch.float32, torch.int64],
-        'nn.functional.avg_pool2d': [torch.float32, torch.int64],
         'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'divfloor_rounding': [torch.int16, torch.int32, torch.int64],
         'divtrunc_rounding': [torch.float16],
@@ -9292,18 +9290,13 @@ class TestConsistency(TestCase):
         '_native_batch_norm_legit': [torch.float32],
         'addr': [torch.float16],
         'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'uniform': [torch.float16, torch.float32],
         'trace': [torch.int64],
-        'tan': [torch.float32],
         'normalnumber_mean': [torch.float16, torch.float32],
         'nn.functional.gelu': [torch.float32],
         'nn.functional.conv_transpose2d': [torch.float32, torch.int64],
         'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'native_batch_norm': [torch.float32],
         'multinomial': [torch.float32],
-        'masked.softmin': [torch.float32],
-        'masked.softmax': [torch.float32],
-        'masked.log_softmax': [torch.float32],
         'floor_divide': [torch.int16, torch.int32, torch.int64],
         'dist': [torch.float16],
 
@@ -9624,8 +9617,9 @@ class TestConsistency(TestCase):
     }
 
     UNDEFINED_BEHAVIOUR = {
-        # failures due to random output that they generate using
+        # Failures due to random output that they generate using
         # Philox engine causing mismatch with CPU results
+        'uniform': [torch.float16, torch.float32],
         'rand_like': [torch.float16, torch.float32],
         'randint_like': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'randn_like': [torch.float16, torch.float32],
@@ -9646,12 +9640,12 @@ class TestConsistency(TestCase):
     }
 
     FAST_MATH_PRECISION_ISSUES = {
-        # failures due to precision issues
+        # Failures due to precision issues
         'tan': [torch.float32],
-        'pow_': [torch.float32],
-        'masked_softmin': [torch.float32],
-        'masked_softmax': [torch.float32],
-        'masked_log_softmax': [torch.float32],
+        'pow': [torch.float32],
+        'masked.softmin': [torch.float32],
+        'masked.softmax': [torch.float32],
+        'masked.log_softmax': [torch.float32],
         'cdist': [torch.float32],
         '__rpow__': [torch.float32]
     }

From 96d6f9764a435d29de8cad7741055efd30f274b3 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 13 Jan 2023 23:47:28 +0200
Subject: [PATCH 1880/1922] Fix macos subversion check for grid sampler
 fallback (#230)

* Fix macos subversion check for grid sampler fallback

* Address PR comments
---
 aten/src/ATen/mps/MPSDevice.h                 |  5 ++---
 aten/src/ATen/mps/MPSDevice.mm                | 19 ++++++++++++++-----
 .../ATen/native/mps/operations/GridSampler.mm |  2 +-
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h
index 48e1904346c10..e5560222a6cbe 100644
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@@ -56,7 +56,7 @@ class TORCH_API MPSDevice {
   /**
    * Returns whether running on Ventura or newer
    */
-  bool isMacOS13Plus() const;
+  bool isMacOS13Plus(int32_t subVersion) const;
 
   MTLFunction_t metalIndexingFunction(const std::string &kernel, MTLFunctionConstantValues_t constantValues);
 
@@ -65,13 +65,12 @@ class TORCH_API MPSDevice {
  private:
   static MPSDevice* _device;
   MTLDevice_t _mtl_device;
-  bool _macos13plus;
   MTLLibrary_t _mtl_indexing_library;
   MPSDevice();
 };
 
 TORCH_API bool is_available();
-TORCH_API bool is_macos_13_or_newer();
+TORCH_API bool is_macos_13_or_newer(int32_t subVersion = 0);
 
 TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
 
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index c11621b3f3540..f5646ad77f111 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -66,7 +66,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   // Create the MPSGraph and check method introduced in 12.3+
   // which is used by MPS backend.
   id mpsCD = NSClassFromString(@"MPSGraph");
-  _macos13plus = [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
+
   if ([mpsCD instancesRespondToSelector:@selector(LSTMWithSourceTensor:
                                                        recurrentWeight:
                                                            inputWeight:
@@ -90,8 +90,17 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
 
 }
 
-bool MPSDevice::isMacOS13Plus() const {
-  return _macos13plus;
+bool MPSDevice::isMacOS13Plus(int32_t subVersion) const {
+  id mpsCD = NSClassFromString(@"MPSGraph");
+  static bool _macos_13_0_plus = [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
+  static bool _macos_13_1_plus = [mpsCD instancesRespondToSelector:@selector(
+    sampleGridWithSourceTensor:coordinateTensor:layout:normalizeCoordinates:relativeCoordinates:alignCorners:paddingMode:samplingMode:constantValue:name:)] == YES;
+
+  switch (subVersion) {
+    case 0:  return _macos_13_0_plus;
+    case 1:  return _macos_13_1_plus;
+    default: return false;
+  }
 }
 
 at::Allocator* getMPSSharedAllocator();
@@ -104,8 +113,8 @@ bool is_available() {
   return MPSDevice::getInstance()->device() != nil;
 }
 
-bool is_macos_13_or_newer() {
-  return MPSDevice::getInstance()->isMacOS13Plus();
+bool is_macos_13_or_newer(int32_t subVersion) {
+  return MPSDevice::getInstance()->isMacOS13Plus(subVersion);
 }
 
 } // namespace mps
diff --git a/aten/src/ATen/native/mps/operations/GridSampler.mm b/aten/src/ATen/native/mps/operations/GridSampler.mm
index 1258201763d9a..2106602e4a87c 100644
--- a/aten/src/ATen/native/mps/operations/GridSampler.mm
+++ b/aten/src/ATen/native/mps/operations/GridSampler.mm
@@ -125,7 +125,7 @@ void grid_sampler_2d_mps_impl(Tensor &output, const Tensor& input, const Tensor&
 Tensor grid_sampler_2d_mps(const Tensor& input, const Tensor& grid,
                            int64_t interpolation_mode, int64_t padding_mode,
                            bool align_corners) {
-  if (!is_macos_13_or_newer()) {
+  if (!is_macos_13_or_newer(/*subVersion=*/1)) {
     TORCH_WARN_ONCE("MPS: grid_sampler_2d op is supported natively starting from macOS 13.1. ",
                     "Falling back on CPU. This may have performance implications.");
 

From f23cc910335161c361771f737bcfb9f7e9310d2e Mon Sep 17 00:00:00 2001
From: Siddharth Kotapati <sidk@Siddharths-MacBook-Pro.local>
Date: Fri, 13 Jan 2023 16:06:44 -0800
Subject: [PATCH 1881/1922] Correctly apply weights to oneHotTensor in NLLLoss

---
 .../src/ATen/native/mps/operations/LossOps.mm | 19 +++++++++++++++----
 test/test_mps.py                              |  6 ++++--
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm
index 6968591f18ac8..50299138a2034 100644
--- a/aten/src/ATen/native/mps/operations/LossOps.mm
+++ b/aten/src/ATen/native/mps/operations/LossOps.mm
@@ -401,18 +401,29 @@ void nllnd_loss_backward_impl(
                     }
 
                     float onValue = -1.0f;
-                    auto target_axis = target.defined() ? target.dim() : 1;
 
                     MPSGraphTensor *oneHotTensor = [mpsGraph oneHotWithIndicesTensor:udpatedTargetTensor
                                                                depth:numClasses
-                                                                axis:target_axis
+                                                                axis:1
                                                             dataType:inputTensor.dataType
                                                              onValue:onValue
                                                             offValue:0.0f
                                                                 name:nil];
 
-                    if(isWeightsArrayValid)
-                    {
+                    if(isWeightsArrayValid) {
+                        int64_t nDim = input.sizes().size();
+                        IntArrayRef sizes = input.sizes();
+                        std::vector<NSNumber*> numbers(nDim);
+                        for (const auto i: c10::irange(nDim)) {
+                            NSInteger sz_i = (i == 1) ? sizes[i] : 1;
+                            NSNumber* number = [NSNumber numberWithInteger:sz_i];
+                            numbers[i] = number;
+                        }
+
+                        MPSGraphTensor *weightTensorReshaped = [mpsGraph reshapeTensor:weightTensor
+                                                                             withShape:[NSArray arrayWithObjects:numbers.data() count:numbers.size()]
+                                                                                  name:nil];
+                                                                                  
                         oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
                                                                  secondaryTensor:weightTensor
                                                                             name:@"scaleByWeightTensor"];
diff --git a/test/test_mps.py b/test/test_mps.py
index cd20b3e1732f2..b31a9b6518906 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2410,14 +2410,16 @@ def _nll_loss_helper(self, input_size, reduction, expected):
         input = torch.rand(input_size, requires_grad=True, device='cpu')
         num_channels = input_size[1]
         target_size = (input_size[0], ) + tuple(input_size[2:])
+        weights = torch.randn(num_channels)
+        weights_mps = weights.to("mps")
         target = torch.randint(num_channels, target_size, device='cpu')
 
         # MPS
         input_mps = input.detach().clone().to('mps').requires_grad_()
         target_mps = target.detach().clone().to('mps')
 
-        output_cpu = F.nll_loss(input, target, reduction=reduction)
-        output_mps = F.nll_loss(input_mps, target_mps, reduction=reduction)
+        output_cpu = F.nll_loss(input, target, weight=weights, reduction=reduction)
+        output_mps = F.nll_loss(input_mps, target_mps, weight=weights_mps, reduction=reduction)
         self.assertEqual(output_cpu, output_mps.to('cpu'))
 
         output_cpu.sum().backward()

From 559e7dd077cfc4a342658a3233fd68cb9ae0c2ba Mon Sep 17 00:00:00 2001
From: Siddharth Kotapati <sidk@Siddharths-MacBook-Pro.local>
Date: Fri, 13 Jan 2023 16:12:03 -0800
Subject: [PATCH 1882/1922] Revert "Correctly apply weights to oneHotTensor in
 NLLLoss"

Reverting unintended commit to master
This reverts commit f23cc910335161c361771f737bcfb9f7e9310d2e.
---
 .../src/ATen/native/mps/operations/LossOps.mm | 19 ++++---------------
 test/test_mps.py                              |  6 ++----
 2 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm
index 50299138a2034..6968591f18ac8 100644
--- a/aten/src/ATen/native/mps/operations/LossOps.mm
+++ b/aten/src/ATen/native/mps/operations/LossOps.mm
@@ -401,29 +401,18 @@ void nllnd_loss_backward_impl(
                     }
 
                     float onValue = -1.0f;
+                    auto target_axis = target.defined() ? target.dim() : 1;
 
                     MPSGraphTensor *oneHotTensor = [mpsGraph oneHotWithIndicesTensor:udpatedTargetTensor
                                                                depth:numClasses
-                                                                axis:1
+                                                                axis:target_axis
                                                             dataType:inputTensor.dataType
                                                              onValue:onValue
                                                             offValue:0.0f
                                                                 name:nil];
 
-                    if(isWeightsArrayValid) {
-                        int64_t nDim = input.sizes().size();
-                        IntArrayRef sizes = input.sizes();
-                        std::vector<NSNumber*> numbers(nDim);
-                        for (const auto i: c10::irange(nDim)) {
-                            NSInteger sz_i = (i == 1) ? sizes[i] : 1;
-                            NSNumber* number = [NSNumber numberWithInteger:sz_i];
-                            numbers[i] = number;
-                        }
-
-                        MPSGraphTensor *weightTensorReshaped = [mpsGraph reshapeTensor:weightTensor
-                                                                             withShape:[NSArray arrayWithObjects:numbers.data() count:numbers.size()]
-                                                                                  name:nil];
-                                                                                  
+                    if(isWeightsArrayValid)
+                    {
                         oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
                                                                  secondaryTensor:weightTensor
                                                                             name:@"scaleByWeightTensor"];
diff --git a/test/test_mps.py b/test/test_mps.py
index b31a9b6518906..cd20b3e1732f2 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2410,16 +2410,14 @@ def _nll_loss_helper(self, input_size, reduction, expected):
         input = torch.rand(input_size, requires_grad=True, device='cpu')
         num_channels = input_size[1]
         target_size = (input_size[0], ) + tuple(input_size[2:])
-        weights = torch.randn(num_channels)
-        weights_mps = weights.to("mps")
         target = torch.randint(num_channels, target_size, device='cpu')
 
         # MPS
         input_mps = input.detach().clone().to('mps').requires_grad_()
         target_mps = target.detach().clone().to('mps')
 
-        output_cpu = F.nll_loss(input, target, weight=weights, reduction=reduction)
-        output_mps = F.nll_loss(input_mps, target_mps, weight=weights_mps, reduction=reduction)
+        output_cpu = F.nll_loss(input, target, reduction=reduction)
+        output_mps = F.nll_loss(input_mps, target_mps, reduction=reduction)
         self.assertEqual(output_cpu, output_mps.to('cpu'))
 
         output_cpu.sum().backward()

From df8e01a0979556968f3691dc16ab7458a7676667 Mon Sep 17 00:00:00 2001
From: skotapati <siddharth.kotapati@gmail.com>
Date: Fri, 13 Jan 2023 17:00:41 -0800
Subject: [PATCH 1883/1922] Correctly apply weights to oneHotTensor in NLLLoss
 (#233)

Co-authored-by: Siddharth Kotapati <sidk@Siddharths-MacBook-Pro.local>
---
 .../src/ATen/native/mps/operations/LossOps.mm | 19 +++++++++++++++----
 test/test_mps.py                              |  6 ++++--
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm
index 6968591f18ac8..50299138a2034 100644
--- a/aten/src/ATen/native/mps/operations/LossOps.mm
+++ b/aten/src/ATen/native/mps/operations/LossOps.mm
@@ -401,18 +401,29 @@ void nllnd_loss_backward_impl(
                     }
 
                     float onValue = -1.0f;
-                    auto target_axis = target.defined() ? target.dim() : 1;
 
                     MPSGraphTensor *oneHotTensor = [mpsGraph oneHotWithIndicesTensor:udpatedTargetTensor
                                                                depth:numClasses
-                                                                axis:target_axis
+                                                                axis:1
                                                             dataType:inputTensor.dataType
                                                              onValue:onValue
                                                             offValue:0.0f
                                                                 name:nil];
 
-                    if(isWeightsArrayValid)
-                    {
+                    if(isWeightsArrayValid) {
+                        int64_t nDim = input.sizes().size();
+                        IntArrayRef sizes = input.sizes();
+                        std::vector<NSNumber*> numbers(nDim);
+                        for (const auto i: c10::irange(nDim)) {
+                            NSInteger sz_i = (i == 1) ? sizes[i] : 1;
+                            NSNumber* number = [NSNumber numberWithInteger:sz_i];
+                            numbers[i] = number;
+                        }
+
+                        MPSGraphTensor *weightTensorReshaped = [mpsGraph reshapeTensor:weightTensor
+                                                                             withShape:[NSArray arrayWithObjects:numbers.data() count:numbers.size()]
+                                                                                  name:nil];
+                                                                                  
                         oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
                                                                  secondaryTensor:weightTensor
                                                                             name:@"scaleByWeightTensor"];
diff --git a/test/test_mps.py b/test/test_mps.py
index cd20b3e1732f2..b31a9b6518906 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2410,14 +2410,16 @@ def _nll_loss_helper(self, input_size, reduction, expected):
         input = torch.rand(input_size, requires_grad=True, device='cpu')
         num_channels = input_size[1]
         target_size = (input_size[0], ) + tuple(input_size[2:])
+        weights = torch.randn(num_channels)
+        weights_mps = weights.to("mps")
         target = torch.randint(num_channels, target_size, device='cpu')
 
         # MPS
         input_mps = input.detach().clone().to('mps').requires_grad_()
         target_mps = target.detach().clone().to('mps')
 
-        output_cpu = F.nll_loss(input, target, reduction=reduction)
-        output_mps = F.nll_loss(input_mps, target_mps, reduction=reduction)
+        output_cpu = F.nll_loss(input, target, weight=weights, reduction=reduction)
+        output_mps = F.nll_loss(input_mps, target_mps, weight=weights_mps, reduction=reduction)
         self.assertEqual(output_cpu, output_mps.to('cpu'))
 
         output_cpu.sum().backward()

From 26371a4ab5c703b27955887bec8a03f7c4690fda Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Sat, 14 Jan 2023 03:33:25 +0200
Subject: [PATCH 1884/1922] Use the reshaped weights (#234)

---
 aten/src/ATen/native/mps/operations/LossOps.mm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm
index 50299138a2034..8af47f86ef542 100644
--- a/aten/src/ATen/native/mps/operations/LossOps.mm
+++ b/aten/src/ATen/native/mps/operations/LossOps.mm
@@ -423,9 +423,9 @@ void nllnd_loss_backward_impl(
                         MPSGraphTensor *weightTensorReshaped = [mpsGraph reshapeTensor:weightTensor
                                                                              withShape:[NSArray arrayWithObjects:numbers.data() count:numbers.size()]
                                                                                   name:nil];
-                                                                                  
+
                         oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
-                                                                 secondaryTensor:weightTensor
+                                                                 secondaryTensor:weightTensorReshaped
                                                                             name:@"scaleByWeightTensor"];
                     }
 

From fb315e8e4e25aa8cd7c9f3d006c14fb880e649bc Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 17 Jan 2023 12:58:41 -0500
Subject: [PATCH 1885/1922] Add MPSAllocatorInterface to access methods of
 MPSAllocator (#227)

* Add MPSAllocatorInterface to access methods of MPSAllocator
This is a prerequisite for the Memory Stats and Memory Leak Detection features

* Rename getRequestedBufferSize() to getUnalignedBufferSize()
---
 aten/src/ATen/mps/MPSAllocator.h              |  59 ++++------
 aten/src/ATen/mps/MPSAllocator.mm             | 108 ++++++++----------
 aten/src/ATen/mps/MPSAllocatorInterface.h     |  50 ++++++++
 aten/src/ATen/mps/MPSDevice.mm                |   5 +-
 aten/src/ATen/mps/MPSStream.mm                |   6 +-
 aten/src/ATen/native/mps/OperationUtils.mm    |   4 +-
 aten/src/ATen/native/mps/operations/Unique.mm |   1 -
 aten/src/ATen/native/mps/operations/View.mm   |  14 +--
 8 files changed, 137 insertions(+), 110 deletions(-)
 create mode 100644 aten/src/ATen/mps/MPSAllocatorInterface.h

diff --git a/aten/src/ATen/mps/MPSAllocator.h b/aten/src/ATen/mps/MPSAllocator.h
index a6df567b56588..beb5723ea1c94 100644
--- a/aten/src/ATen/mps/MPSAllocator.h
+++ b/aten/src/ATen/mps/MPSAllocator.h
@@ -1,5 +1,6 @@
 //  Copyright © 2022 Apple Inc.
 
+#include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/MPSStream.h>
 #include <cstdio>
 #include <mutex>
@@ -9,27 +10,10 @@
 
 // this implementation is based on CUDACachingAllocator.
 // It utilizes Metal Heaps to improve the performance with buffer allocation.
+// Do not include this header. Use MPSAllocatorInterface.h instead.
 // TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
 namespace at {
 namespace mps {
-
-class IMpsAllocatorCallback {
- public:
-  enum class EventType {
-    ALLOCATED, // buffer got allocated to be used immediately
-    RECYCLED,  // buffer pulled from free list to be reused
-    FREED,     // buffer put to free list for future recycling
-    RELEASED,  // buffer memory released
-  };
-  virtual ~IMpsAllocatorCallback() = default;
-  virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0;
-};
-
-// MPS allocator will execute every registered callback when a block of memory is freed.
-C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
-#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \
-  C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__);
-
 namespace HeapAllocator {
 
 #define MB(x) round_page(x * 1048576UL)
@@ -263,27 +247,44 @@ class MPSHeapAllocatorImpl
 
   // interface exposed to at::Allocator
   id<MTLBuffer> malloc(size_t size, uint32_t usage);
+  // frees a buffer and returns it into buffer pool
   void free(void* ptr);
+  // releases all the cached buffers and their associated heaps
   void emptyCache();
-  // interface exposed to internal MPS operations
+  // returns true if buffer was allocated from the shared pool
   bool isSharedBuffer(void* ptr);
-  ssize_t getRequestedBufferSize(void* ptr);
+  // get the requested unaligned size of an MTLBuffer
+  ssize_t getUnalignedBufferSize(void* ptr);
+  // set the shape of a base tensor from a view tensor
   void setBufferShape(void* ptr, const IntArrayRef& shape);
+  // retrieve the shape of a base tensor from a view tensor
   IntArrayRef getBufferShape(void* ptr);
+  // allocate a buffer from a specialized pool to import CPU scalars into GPU
   id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
   // this indicates how far (in Megabytes) the current total allocations are from the
   // low watermark limit which is used to detect if we're under memory pressure
   // This returns zero if we've reached the low watermark limit
   ssize_t getLowWatermarkValue();
-
-  bool getDebugVerbosity() const { return m_debug_verbosity; }
-  size_t getMaxTotalAllowedSize() const { return m_max_total_allowed_size; }
+  // (see m_low_watermark_ratio for description)
+  void setLowWatermarkRatio(double ratio);
+  // (see m_high_watermark_ratio for description)
+  void setHighWatermarkRatio(double ratio);
+  // (see m_low_watermark_limit for description)
   size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
+  // (see m_max_total_allowed_size for description)
+  size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
+  // (see m_total_allocated_memory for description)
+  size_t getTotalAllocatedMemory() const {return m_total_allocated_memory; }
+  // (see enum DebugVerbosity for description)
+  uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
+  // returns the device that we allocate from
   inline id<MTLDevice> Device() const { return m_device; }
 
 private:
   // (see m_high_watermark_ratio for description)
   constexpr static double default_high_watermark_ratio = 1.7;
+  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
+  constexpr static double default_high_watermark_upper_bound = 2.0;
   // (see m_low_watermark_ratio for description)
   // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
   constexpr static double default_low_watermark_ratio_unified  = 1.4;
@@ -375,17 +376,5 @@ class MPSHeapAllocatorImpl
 };
 
 } // namespace HeapAllocator
-
-// interface exposed to internal MPS operations
-
-// get the requested non-aligned size of an MTL buffer
-ssize_t get_requested_buffer_size(void* ptr);
-// retrieve the shape of a base tensor from a view tensor
-IntArrayRef get_buffer_shape(void* ptr);
-// set the shape of a base tensor from a view tensor
-void set_buffer_shape(void* ptr, const IntArrayRef& shape);
-// allocate a buffer from a specialized pool to import CPU scalars into GPU
-DataPtr allocate_scalar_buffer(void* value, size_t size);
-
 } // namespace mps
 } // namespace at
diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm
index 72ed5a47e9d83..ba3a63b5595a0 100644
--- a/aten/src/ATen/mps/MPSAllocator.mm
+++ b/aten/src/ATen/mps/MPSAllocator.mm
@@ -22,27 +22,35 @@
   static const char *verbosity_str = getenv("PYTORCH_DEBUG_MPS_ALLOCATOR");
   m_debug_verbosity = verbosity_str ? strtol(verbosity_str, nullptr, 0) : DebugVerbosity::SILENT;
 
-  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
-  const double high_watermark_upper_bound = 2.0;
-
   static const char *high_watermark_ratio_str = getenv("PYTORCH_MPS_HIGH_WATERMARK_RATIO");
-  m_high_watermark_ratio = high_watermark_ratio_str ? strtod(high_watermark_ratio_str, nullptr) : default_high_watermark_ratio;
-  TORCH_CHECK(m_high_watermark_ratio >= 0.0 && m_high_watermark_ratio <= high_watermark_upper_bound,
-              "invalid high watermark ratio ", m_high_watermark_ratio);
+  const double high_watermark_ratio = high_watermark_ratio_str ? strtod(high_watermark_ratio_str, nullptr) :
+                                                                 default_high_watermark_ratio;
+  setHighWatermarkRatio(high_watermark_ratio);
 
-  m_max_total_allowed_size = (m_high_watermark_ratio == 0.0) ? std::numeric_limits<size_t>::max() :
-                              static_cast<size_t>(m_high_watermark_ratio * (double)max_device_size());
-  // used for comparison with lower_watermark_ratio
-  const double high_watermark_limit = m_high_watermark_ratio == 0.0 ? high_watermark_upper_bound : m_high_watermark_ratio;
   const double default_low_watermark_ratio =  m_device.hasUnifiedMemory ? default_low_watermark_ratio_unified :
                                                                           default_low_watermark_ratio_discrete;
   static const char *low_watermark_ratio_str = getenv("PYTORCH_MPS_LOW_WATERMARK_RATIO");
-  m_low_watermark_ratio = low_watermark_ratio_str ? strtod(low_watermark_ratio_str, nullptr) : default_low_watermark_ratio;
-  TORCH_CHECK(m_low_watermark_ratio >= 0.0 && m_low_watermark_ratio <= high_watermark_limit,
-              "invalid low watermark ratio ", m_low_watermark_ratio);
+  const double low_watermark_ratio = low_watermark_ratio_str ? strtod(low_watermark_ratio_str, nullptr) : default_low_watermark_ratio;
+  setLowWatermarkRatio(low_watermark_ratio);
+}
+
+void MPSHeapAllocatorImpl::setHighWatermarkRatio(double ratio)
+{
+  TORCH_CHECK(ratio >= 0.0 && ratio <= default_high_watermark_upper_bound, "invalid high watermark ratio ", ratio);
+  m_max_total_allowed_size = (ratio == 0.0) ? std::numeric_limits<size_t>::max() :
+                             static_cast<size_t>(ratio * (double)max_device_size());
+  m_high_watermark_ratio = ratio;
+}
+
+void MPSHeapAllocatorImpl::setLowWatermarkRatio(double ratio)
+{
+  // used for comparison with lower_watermark_ratio
+  const double high_watermark_limit = m_high_watermark_ratio == 0.0 ? default_high_watermark_upper_bound : m_high_watermark_ratio;
+  TORCH_CHECK(ratio >= 0.0 && ratio <= high_watermark_limit, "invalid low watermark ratio ", ratio);
   // we use this to detect if there's memory pressure
-  m_low_watermark_limit = (m_low_watermark_ratio == 0.0) ? std::numeric_limits<size_t>::max() :
-                          static_cast<size_t>(m_low_watermark_ratio * (double)max_device_size());
+  m_low_watermark_limit = (ratio == 0.0) ? std::numeric_limits<size_t>::max() :
+                          static_cast<size_t>(ratio * (double)max_device_size());
+  m_low_watermark_ratio = ratio;
 }
 
 HeapBlock* MPSHeapAllocatorImpl::get_free_heap(AllocParams& params)
@@ -470,7 +478,7 @@
   return buffer_block->buffer;
 }
 
-ssize_t MPSHeapAllocatorImpl::getRequestedBufferSize(void* ptr)
+ssize_t MPSHeapAllocatorImpl::getUnalignedBufferSize(void* ptr)
 {
   std::lock_guard<std::mutex> lock(m_mutex);
 
@@ -552,15 +560,15 @@
 }
 
 // MPS allocator struct to be registered with Pytorch
-struct TORCH_API MPSAllocator final : public at::Allocator {
+struct TORCH_API MPSAllocator final : public IMPSAllocator {
 public:
   explicit MPSAllocator(uint32_t Usage) :
       m_has_unified_memory(_getAllocImpl().Device().hasUnifiedMemory), m_usage(Usage)
   {
     if (_getAllocImpl().getDebugVerbosity()) {
       if (!(m_usage & HeapAllocator::UsageFlags::SHARED) || m_has_unified_memory) {
-        const size_t max_total_allowed_size = _getAllocImpl().getMaxTotalAllowedSize();
-        const size_t low_watermark_limit = _getAllocImpl().getLowWatermarkLimit();
+        const size_t high_watermark_limit = _getAllocImpl().getHighWatermarkLimit();
+        const size_t low_watermark_limit  = _getAllocImpl().getLowWatermarkLimit();
         std::cerr << "Initializing "
                   << ((m_usage & HeapAllocator::UsageFlags::SHARED) ? "shared" : "private")
                   << " heap allocator on "
@@ -568,8 +576,8 @@ explicit MPSAllocator(uint32_t Usage) :
                   << " device memory of size "
                   << _getAllocImpl().Device().recommendedMaxWorkingSetSize / 1048576UL << " MB"
                   << " (max allowed: "
-                  << (max_total_allowed_size == std::numeric_limits<size_t>::max() ? "unlimited" :
-                     (to_string(max_total_allowed_size / 1048576UL) + " MB"))
+                  << (high_watermark_limit == std::numeric_limits<size_t>::max() ? "unlimited" :
+                     (to_string(high_watermark_limit / 1048576UL) + " MB"))
                   << ", low watermark: "
                   << (low_watermark_limit == std::numeric_limits<size_t>::max() ? "unlimited" :
                      (to_string(low_watermark_limit / 1048576UL) + " MB"))  << ")\n";
@@ -580,20 +588,28 @@ explicit MPSAllocator(uint32_t Usage) :
   ~MPSAllocator() override {
     _getAllocImpl().emptyCache();
   }
+  DeleterFnPtr raw_deleter() const override { return &Delete; }
 
   DataPtr allocate(const size_t nbytes) const override {
     __block id<MTLBuffer> buf = nbytes > 0 ? _getAllocImpl().malloc(nbytes, m_usage) : nullptr;
     return { buf, buf, &Delete, at::Device(at::DeviceType::MPS, 0)};
   }
-
-  DataPtr allocate_scalar_buffer(void *value, size_t size) const {
+  DataPtr allocScalarBufferWithValue(void *value, size_t size) const override {
     id<MTLBuffer> buf = _getAllocImpl().allocScalarBufferWithValue(value, size);
     return { buf, buf, &Delete, at::Device(at::DeviceType::MPS, 0)};
   }
-
-  DeleterFnPtr raw_deleter() const override { return &Delete; }
-  bool is_shared(void* ptr) const { return _getAllocImpl().isSharedBuffer(ptr); }
-  bool is_shared_storage_supported() const { return m_has_unified_memory; }
+  bool isSharedBuffer(void* ptr) const override { return _getAllocImpl().isSharedBuffer(ptr); }
+  bool isSharedStorageSupported() const override { return m_has_unified_memory; }
+  void emptyCache() const override { _getAllocImpl().emptyCache(); }
+  ssize_t getUnalignedBufferSize(void* ptr) const override { return _getAllocImpl().getUnalignedBufferSize(ptr); }
+  IntArrayRef getBufferShape(void* ptr) const override { return _getAllocImpl().getBufferShape(ptr); }
+  void setBufferShape(void* ptr, const IntArrayRef& shape) override { _getAllocImpl().setBufferShape(ptr, shape); }
+  size_t getTotalAllocatedMemory() const override { return _getAllocImpl().getTotalAllocatedMemory(); }
+  ssize_t getLowWatermarkValue() const override { return _getAllocImpl().getLowWatermarkValue(); }
+  size_t getLowWatermarkLimit() const override { return _getAllocImpl().getLowWatermarkLimit(); }
+  size_t getHighWatermarkLimit() const override { return _getAllocImpl().getHighWatermarkLimit(); }
+  void setLowWatermarkRatio(double ratio) const override { _getAllocImpl().setLowWatermarkRatio(ratio); }
+  void setHighWatermarkRatio(double ratio) const override { _getAllocImpl().setHighWatermarkRatio(ratio); }
 
 private:
   bool m_has_unified_memory;
@@ -618,41 +634,17 @@ static void Delete(void* ptr) {
 }
 } // anonymous namespace
 
-at::Allocator* getMPSSharedAllocator()
-{
+IMPSAllocator* getIMPSAllocator(bool sharedAllocator) {
+  if (!sharedAllocator) {
+    return &_getPrivateAllocator();
+  }
   auto& sa = _getSharedAllocator();
-  if (sa.is_shared_storage_supported()) {
+  if (sa.isSharedStorageSupported()) {
     return &sa;
   }
-
   return nullptr;
 }
 
-at::Allocator* getMPSPrivateAllocator() {
-  return &_getPrivateAllocator();
-}
-
-// TODO: create MPSHooks interface and move these there.
-ssize_t get_requested_buffer_size(void* ptr) {
-  return _getAllocImpl().getRequestedBufferSize(ptr);
-}
-
-void set_buffer_shape(void* ptr, const IntArrayRef& shape) {
-  _getAllocImpl().setBufferShape(ptr, shape);
-}
-
-IntArrayRef get_buffer_shape(void* ptr) {
-  return _getAllocImpl().getBufferShape(ptr);
-}
-
-DataPtr allocate_scalar_buffer(void *value, size_t size) {
-  return _getPrivateAllocator().allocate_scalar_buffer(value, size);
-}
-
-uint32_t get_adaptive_commit_threshold() {
-  return _getAllocImpl().getLowWatermarkValue();
-}
-
 } // namespace mps
 
 namespace native {
@@ -664,14 +656,14 @@ uint32_t get_adaptive_commit_threshold() {
 bool is_pinned_mps(const Tensor& self, c10::optional<Device> device)
 {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
-  return at::mps::_getSharedAllocator().is_shared(self.storage().data());
+  return at::mps::_getSharedAllocator().isSharedBuffer(self.storage().data());
 }
 
 // torch.pin_memory() implementation
 Tensor _pin_memory_mps(const Tensor& self, c10::optional<Device> device)
 {
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_mps());
-  auto* shared_allocator = at::mps::getMPSSharedAllocator();
+  auto* shared_allocator = at::mps::getIMPSAllocator(true);
   TORCH_CHECK(shared_allocator, "unable to pin memory on a non-unified memory device");
 
   const size_t storage_size = detail::computeStorageNbytes(self.sizes(), self.strides(), self.dtype().itemsize());
diff --git a/aten/src/ATen/mps/MPSAllocatorInterface.h b/aten/src/ATen/mps/MPSAllocatorInterface.h
new file mode 100644
index 0000000000000..3278c599d34d3
--- /dev/null
+++ b/aten/src/ATen/mps/MPSAllocatorInterface.h
@@ -0,0 +1,50 @@
+//  Copyright © 2023 Apple Inc.
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Registry.h>
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace mps {
+
+// this is a public interface to access MPSAllocator.
+// Do not declare methods that would depend on MPS or Metal frameworks.
+class IMPSAllocator : public c10::Allocator {
+public:
+  // see the comments in MPSAllocator.h for the description of these methods.
+  virtual void emptyCache() const = 0;
+  virtual ssize_t getUnalignedBufferSize(void* ptr) const = 0;
+  virtual IntArrayRef getBufferShape(void* ptr) const = 0;
+  virtual void setBufferShape(void* ptr, const IntArrayRef& shape) = 0;
+  virtual bool isSharedBuffer(void* ptr) const = 0;
+  virtual bool isSharedStorageSupported() const = 0;
+  virtual c10::DataPtr allocScalarBufferWithValue(void* value, size_t size) const = 0;
+  virtual void setLowWatermarkRatio(double ratio) const = 0;
+  virtual void setHighWatermarkRatio(double ratio) const = 0;
+  virtual ssize_t getLowWatermarkValue() const = 0;
+  virtual size_t getLowWatermarkLimit() const = 0;
+  virtual size_t getHighWatermarkLimit() const = 0;
+  virtual size_t getTotalAllocatedMemory() const = 0;
+};
+
+class IMpsAllocatorCallback {
+ public:
+  enum class EventType {
+    ALLOCATED, // buffer got allocated to be used immediately
+    RECYCLED,  // buffer pulled from free list to be reused
+    FREED,     // buffer put to free list for future recycling
+    RELEASED,  // buffer memory released
+  };
+  virtual ~IMpsAllocatorCallback() = default;
+  virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0;
+};
+
+// MPS allocator will execute every registered callback when a block of memory is freed.
+C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
+#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \
+  C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__);
+
+IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false);
+
+} // namespace mps
+} // namespace at
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index f5646ad77f111..f872b5f2ad353 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -3,6 +3,7 @@
 #include <c10/util/CallOnce.h>
 
 #include <ATen/mps/MPSDevice.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/IndexKernels.h>
 
 namespace at {
@@ -103,10 +104,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   }
 }
 
-at::Allocator* getMPSSharedAllocator();
-at::Allocator* getMPSPrivateAllocator();
 at::Allocator* GetMPSAllocator(bool useSharedAllocator) {
-  return useSharedAllocator ? getMPSSharedAllocator() : getMPSPrivateAllocator();
+  return getIMPSAllocator(useSharedAllocator);
 }
 
 bool is_available() {
diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm
index 04115fc268c76..f1f2d47cf1e6a 100644
--- a/aten/src/ATen/mps/MPSStream.mm
+++ b/aten/src/ATen/mps/MPSStream.mm
@@ -1,15 +1,13 @@
 //  Copyright © 2022 Apple Inc.
 
 #include <ATen/mps/MPSStream.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
 
 namespace at {
 namespace mps {
 
 #define USE_COMMIT_AND_CONTINUE 1
 
-// the frequency that we commit the command buffer calculated based on low watermark ratio in MPSAllocator
-uint32_t get_adaptive_commit_threshold();
-
 //-----------------------------------------------------------------
 //  MPSStream
 //-----------------------------------------------------------------
@@ -52,7 +50,7 @@
       break;
     case SyncType::COMMIT_ADAPTIVE:
       // the adaptive commit only commits if we hit the low watermark memory threshold
-      if (get_adaptive_commit_threshold() <= 1) {
+      if (getIMPSAllocator()->getLowWatermarkValue() <= 1) {
 #if USE_COMMIT_AND_CONTINUE
         commitAndContinue();
 #else
diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index b028613f90c47..3ba4146ae3cf3 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -1,7 +1,7 @@
 //  Copyright © 2022 Apple Inc.
 
 #include <ATen/native/mps/OperationUtils.h>
-#include <ATen/mps/MPSAllocator.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
 
 namespace at {
 namespace native {
@@ -313,7 +313,7 @@ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type) {
   MPSGraphTensorData *result = nullptr;
   // Scalar pools are only supported on devices with unified memory
   if (mpsStream->device().hasUnifiedMemory) {
-    scalar.buffer = at::mps::allocate_scalar_buffer(&scalar.value, scalar.size);
+    scalar.buffer = getIMPSAllocator()->allocScalarBufferWithValue(&scalar.value, scalar.size);
     result = [[[MPSGraphTensorData alloc] initWithMTLBuffer: scalar.getMTLBuffer()
                                                       shape: @[@1]
                                                    dataType: getMPSScalarType(scalar.type)] autorelease];
diff --git a/aten/src/ATen/native/mps/operations/Unique.mm b/aten/src/ATen/native/mps/operations/Unique.mm
index 42cad854a730d..66098e46a49e1 100644
--- a/aten/src/ATen/native/mps/operations/Unique.mm
+++ b/aten/src/ATen/native/mps/operations/Unique.mm
@@ -2,7 +2,6 @@
 
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/Resize.h>
-#include <ATen/mps/MPSAllocator.h>
 #include <ATen/native/mps/MPSGraphVenturaOps.h>
 
 namespace at {
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index ae6e66f94d690..e7ab6d8501e7f 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -2,7 +2,7 @@
 
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/native/Resize.h>
-#include <ATen/mps/MPSAllocator.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
 
 namespace at {
 namespace native {
@@ -445,10 +445,10 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
     return false;
   }
 
-  IntArrayRef src_base_shape = get_buffer_shape(src.storage().data());
-  int src_ndim_base = src_base_shape.size();
+  IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
+  size_t src_ndim_base = src_base_shape.size();
   std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
-  int src_ndim_view = src_view_shape.size();
+  size_t src_ndim_view = src_view_shape.size();
   if (src_ndim_base == src_ndim_view) {
     for (const auto i : c10::irange(src_ndim_base)) {
       if (src_view_shape[i] > src_base_shape[i]) {
@@ -460,7 +460,7 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
 }
 
 MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType) {
-  IntArrayRef src_base_shape = get_buffer_shape(src.storage().data());
+  IntArrayRef src_base_shape = getIMPSAllocator()->getBufferShape(src.storage().data());
   int src_ndim_base = src_base_shape.size();
   std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
   int src_ndim_view = src_view_shape.size();
@@ -625,7 +625,7 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
 
 static IntArrayRef updateTensorBaseShape(const Tensor& self)
 {
-  IntArrayRef base_shape = get_buffer_shape(self.storage().data());
+  IntArrayRef base_shape = getIMPSAllocator()->getBufferShape(self.storage().data());
   // if there's no base_shape stored in MPSAllocator, then infer it from tensor's size and store it
   if (base_shape.size() == 0) {
     // IntArrayRef wouldn't own the data, so we use a static storage
@@ -636,7 +636,7 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self)
 
     // base_shape will be retained in MPSAllocator until buffer gets recycled
     if (self.storage().data())
-      set_buffer_shape(self.storage().data(), base_shape);
+      getIMPSAllocator()->setBufferShape(self.storage().data(), base_shape);
   }
   return base_shape;
 }

From f079740a8126750c7ebdff11f94da177c07bd51d Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 17 Jan 2023 12:00:53 -0800
Subject: [PATCH 1886/1922] Add triangular solve op through
 MPSMatrixSolveTriangular (#228)

* Add triangular solve op

* Remove old comment

* Fix indentation

* Add autorelease

* Fix source matrix element size

* Remove manual transpose
---
 aten/src/ATen/native/mps/operations/Copy.mm   |   1 -
 .../native/mps/operations/LinearAlgebra.mm    | 105 ++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |   3 +
 3 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index d806fa6506504..9eae9d409e41e 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -180,7 +180,6 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
     // For View tensors, the storage offset can be bigger than what's being reported by nbytes
     src_total_size = at::detail::computeStorageNbytesContiguous(src.sizes(), src.element_size(), src.storage_offset());
   } else {
-    TORCH_INTERNAL_ASSERT(src_.strides() == dst_.strides());
     src = src_;
     if (src.dtype() != dst_.dtype()) {
       // In case of dtype change, perform conversion on source device
diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
index 31c8c88248d6a..c5f79a07d1982 100644
--- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
+++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
@@ -7,6 +7,7 @@
 #include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <torch/library.h>
+#include <ATen/native/Resize.h>
 
 #ifdef __OBJC__
 #include <MetalPerformanceShaders/MetalPerformanceShaders.h>
@@ -597,5 +598,109 @@ Tensor addbmm_mps(const Tensor& self, const Tensor& batch1, const Tensor& batch2
   return addbmm_out_mps(self, batch1, batch2, beta, alpha, self);
 }
 
+Tensor& linalg_solve_triangular_mps_impl( const Tensor& A, const Tensor& B, bool upper, bool transpose, bool left, bool unitriangular, Tensor& out) {
+  using namespace mps;
+
+  if (!is_macos_13_or_newer()) {
+    TORCH_WARN_ONCE("MPS: linalg_solve_triangular_out op is supported natively starting from macOS 13.0. ",
+                    "Falling back on CPU. This may have performance implications.");
+
+    Tensor cpu_out = out.cpu();
+    Tensor A_cpu = A.cpu();
+    Tensor B_cpu = B.cpu();
+    at::linalg_solve_triangular_out(
+      cpu_out, A_cpu, B_cpu, upper, left, unitriangular);
+    out.resize_(cpu_out.sizes(), cpu_out.suggest_memory_format());
+    out.copy_(cpu_out);
+
+    return out;
+  }
+
+  checkInputsSolver(A, B, left, "linalg.solve_triangular");
+  Tensor A_, B_;
+  std::tie(B_, A_) = _linalg_broadcast_batch_dims(B, A, /*don't check errors*/nullptr);
+  at::native::resize_output(out, B_.sizes());
+
+  if (A.numel() == 0 || B.numel() == 0 || out.numel() == 0) {
+    return out;
+  }
+
+  id<MTLBuffer> aBuffer = getMTLBufferStorage(A_);
+  id<MTLBuffer> bBuffer = getMTLBufferStorage(B_);
+  id<MTLBuffer> outBuffer = getMTLBufferStorage(out);
+  MPSStream* mpsStream = getCurrentMPSStream();
+  id<MTLDevice> device = MPSDevice::getInstance()->device();
+
+  dispatch_sync(mpsStream->queue(), ^(){
+    @autoreleasepool {
+      id<MTLCommandBuffer> commandBuffer = mpsStream->commandBuffer();
+      MPSMatrixSolveTriangular *filter = [[[MPSMatrixSolveTriangular alloc] initWithDevice:device
+                                                                                    right:!left
+                                                                                    upper:upper
+                                                                                transpose:transpose
+                                                                                     unit:unitriangular
+                                                                                    order:left ? B_.size(-2) : B_.size(-1)
+                                                                   numberOfRightHandSides:left ? B_.size(-1) : B_.size(-2)
+                                                                                    alpha:1.0f] autorelease];
+      uint64_t batchSize = A_.sizes().size() > 2 ? A_.size(0) : 1;
+      uint64_t aRows = A_.size(-2);
+      uint64_t bRows = B_.size(-2);
+      uint64_t aCols = A_.size(-1);
+      uint64_t bCols = B_.size(-1);
+      uint64_t aElemSize = A_.element_size();
+      uint64_t bElemSize = B_.element_size();
+
+      MPSMatrixDescriptor* sourceMatrixDesc = [MPSMatrixDescriptor matrixDescriptorWithRows:aRows
+                                                                                    columns:aCols
+                                                                                   matrices:batchSize
+                                                                                   rowBytes:aCols * aElemSize
+                                                                                matrixBytes:aRows * aCols * aElemSize
+                                                                                   dataType:getMPSDataType(A_.scalar_type())];
+      MPSMatrixDescriptor* rightHandSideMatrixDesc = [MPSMatrixDescriptor matrixDescriptorWithRows:bRows
+                                                                                           columns:bCols
+                                                                                          matrices:batchSize
+                                                                                          rowBytes:bCols * bElemSize
+                                                                                       matrixBytes:bRows * bCols * bElemSize
+                                                                                          dataType:getMPSDataType(B_.scalar_type())];
+      for (const auto i: c10::irange(batchSize)) {
+        MPSMatrix* sourceMatrix = [[[MPSMatrix alloc] initWithBuffer:aBuffer
+                                                              offset:i * aRows * aCols * aElemSize
+                                                          descriptor:sourceMatrixDesc] autorelease];
+        MPSMatrix* rightHandSideMatrix = [[[MPSMatrix alloc] initWithBuffer:bBuffer
+                                                                    offset:i * bRows * bCols * bElemSize
+                                                                descriptor:rightHandSideMatrixDesc] autorelease];
+        MPSMatrix *solutionMatrix = [[[MPSMatrix alloc] initWithBuffer:outBuffer
+                                                               offset:i * bRows * bCols * bElemSize
+                                                           descriptor:rightHandSideMatrixDesc] autorelease];
+
+        [filter encodeToCommandBuffer:commandBuffer
+                         sourceMatrix:sourceMatrix
+                  rightHandSideMatrix:rightHandSideMatrix
+                       solutionMatrix:solutionMatrix];
+      }
+      mpsStream->commit(true);
+    }
+  });
+  return out;
+}
+
+Tensor& linalg_solve_triangular_mps_out( const Tensor& A, const Tensor& B, bool upper, bool left, bool unitriangular, Tensor& out) {
+  return linalg_solve_triangular_mps_impl(A, B, upper, /*transpose=*/false, left, unitriangular, out);
+}
+
+Tensor linalg_solve_triangular_mps(const Tensor& A, const Tensor& B, bool upper, bool left, bool unitriangular) {
+  Tensor out = at::empty({0}, A.options());
+  linalg_solve_triangular_mps_impl(A, B, upper, /*transpose=*/false, left, unitriangular, out);
+  return out;
+}
+
+TORCH_IMPL_FUNC(triangular_solve_mps_out)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, const Tensor& result, const Tensor& clone_A) {
+  clone_A.copy_(A);
+  Tensor out = at::empty({0}, A.options());
+  linalg_solve_triangular_mps_impl(A, self, upper, transpose, /*left=*/true, unitriangular, out);
+  result.resize_(out.sizes());
+  result.copy_(out);
+}
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index a0ffda96aad82..bcbb57efaa55c 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -8622,6 +8622,7 @@
   structured: True
   dispatch:
     CPU, CUDA: triangular_solve_out
+    MPS: triangular_solve_mps_out
     SparseCsrCPU: triangular_solve_out_sparse_csr_cpu
     SparseCsrCUDA: triangular_solve_out_sparse_csr_cuda
 
@@ -8637,12 +8638,14 @@
   python_module: linalg
   dispatch:
     CPU, CUDA: linalg_solve_triangular_out
+    MPS: linalg_solve_triangular_mps_out
 
 - func: linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor
   python_module: linalg
   variants: function
   dispatch:
     CPU, CUDA: linalg_solve_triangular
+    MPS: linalg_solve_triangular_mps
 
 - func: linalg_vander(Tensor x, *, int? N=None) -> Tensor
   python_module: linalg

From 5d9be8c55f12154acde9556c9edeb7388dd9be69 Mon Sep 17 00:00:00 2001
From: skotapati <siddharth.kotapati@gmail.com>
Date: Wed, 18 Jan 2023 09:13:33 -0800
Subject: [PATCH 1887/1922] Added zero check to inverse op, resolving crash
 seen in inverse & matrix_pow tests (#236)

Co-authored-by: Siddharth Kotapati <sidk@Siddharths-MacBook-Pro.local>
---
 aten/src/ATen/native/mps/operations/Inverse.mm | 4 ++++
 test/test_mps.py                               | 2 --
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Inverse.mm b/aten/src/ATen/native/mps/operations/Inverse.mm
index 2975fd9875949..e78cf15ae90b9 100644
--- a/aten/src/ATen/native/mps/operations/Inverse.mm
+++ b/aten/src/ATen/native/mps/operations/Inverse.mm
@@ -24,6 +24,10 @@
     MPSStream* stream = getCurrentMPSStream();
     info.zero_();
 
+    if (A.numel() == 0) {
+        return;
+    }
+
     struct CachedGraph : public MPSCachedGraph
     {
         CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
diff --git a/test/test_mps.py b/test/test_mps.py
index b31a9b6518906..c388c2b1ad6dc 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9272,8 +9272,6 @@ class TestConsistency(TestCase):
         'nonzero': [torch.bool, torch.uint8, torch.float16],
         'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16],
         'sgn': [torch.bool],
-        'linalg.inv': [torch.float32],
-        'linalg.inv_ex': [torch.float32],
         'linalg.matrix_power': [torch.float32],
         'nn.functional.interpolate': [torch.float32],
         'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],

From 497300d68ab022e0f9cfdfd4869c0e908e415f77 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Wed, 18 Jan 2023 12:15:06 -0800
Subject: [PATCH 1888/1922] Remove nn.functional.conv_transpose2d from
 Blocklist (#232)

* Remove nn.functional.conv_transpose2d from Blocklist
- compare nn.functional.conv_transpose2d result with CUDA result

* - create cuda_results.yaml to store CUDA results
- load CUDA_RESULT from yaml file
---
 test/cuda_results.yaml | 53 ++++++++++++++++++++++++++++++++++++++++++
 test/test_mps.py       | 20 +++++++++++++++-
 2 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 test/cuda_results.yaml

diff --git a/test/cuda_results.yaml b/test/cuda_results.yaml
new file mode 100644
index 0000000000000..1b7c131dc81f0
--- /dev/null
+++ b/test/cuda_results.yaml
@@ -0,0 +1,53 @@
+{
+  nn.functional.conv_transpose2d: 
+    [[[7.399066925048828, 4.4053635597229, -25.85348129272461,
+        58.88909149169922, -88.75193786621094, -18.98126983642578, 9.437820434570312],
+      [-59.78305435180664, -65.34088134765625, -108.04747009277344, 196.6062469482422,
+        71.39350891113281, 37.8786735534668, -69.55322265625], [92.78504943847656,
+        91.24403381347656, -94.33301544189453, 9.261059761047363, -182.10206604003906,
+        141.4270477294922, 146.89010620117188], [-14.363212585449219, 43.454036712646484,
+        -76.1098403930664, 242.9479522705078, 198.1458282470703, -49.77315139770508,
+        5.891449451446533], [-43.56822967529297, 4.782844066619873, -29.526945114135742,
+        65.15388488769531, 161.29757690429688, 118.60847473144531, 27.08570671081543],
+      [68.29853057861328, -11.507468223571777, 2.044086217880249, 11.003862380981445,
+        34.993282318115234, -21.256723403930664, 91.49512481689453], [-70.4466781616211,
+        69.04386138916016, 7.764842987060547, 7.61972713470459, -28.99899673461914,
+        54.575748443603516, -5.762258052825928]], [[-36.238487243652344, 37.29551696777344,
+        -22.012331008911133, -30.1353702545166, 33.82851028442383, 33.00322341918945,
+        2.7218000888824463], [-7.999058246612549, 122.72489929199219, -1.0639530420303345,
+        2.9564287662506104, -143.1276092529297, -110.75650024414062, 48.0764274597168],
+      [-91.0599136352539, -11.656601905822754, 69.62447357177734, 88.12522888183594,
+        337.3008728027344, -76.9416732788086, -110.24406433105469], [-108.1512451171875,
+        98.42401123046875, 142.46144104003906, -127.48089599609375, -3.367496967315674,
+        86.82833099365234, 86.29623413085938], [-14.339198112487793, -52.287410736083984,
+        171.43614196777344, 200.14817810058594, 200.35476684570312, -189.4150390625,
+        -46.86980056762695], [30.196495056152344, 25.22877311706543, 95.29426574707031,
+        4.455311298370361, 118.48747253417969, 87.11080932617188, -83.6124038696289],
+      [-2.5434072017669678, 91.8791732788086, -10.615175247192383, -12.58531379699707,
+        -49.3439826965332, 33.37324523925781, -5.983145713806152]], [[4.551003932952881,
+        15.84842586517334, -46.354671478271484, 14.721636772155762, 39.01048278808594,
+        49.70054244995117, -18.268564224243164], [16.728954315185547, 129.43505859375,
+        -4.6139116287231445, -3.382319688796997, -238.76353454589844, 13.42194938659668,
+        40.393280029296875], [-2.335604429244995, -85.94283294677734, -142.2253875732422,
+        135.27537536621094, 18.01512336730957, -26.331714630126953, -33.35443878173828],
+      [-79.17593383789062, -93.72674560546875, -110.94194030761719, -61.455223083496094,
+        6.811624526977539, 129.06478881835938, 12.435402870178223], [10.859378814697266,
+        41.3059196472168, 143.55824279785156, -41.754737854003906, -235.32406616210938,
+        -70.98460388183594, 130.46929931640625], [193.57574462890625, -142.5060272216797,
+        -102.45012664794922, 124.68048095703125, 136.05215454101562, -9.650590896606445,
+        -45.59521484375], [-37.829593658447266, 39.12519454956055, 9.293094635009766,
+        -18.8004093170166, -0.7294210195541382, 51.884910583496094, 36.15913391113281]],
+    [[-15.651233673095703, 16.31340980529785, -26.752052307128906, 6.281721115112305,
+        43.765541076660156, -13.097319602966309, -30.443206787109375], [10.67841911315918,
+        66.1829605102539, -9.394262313842773, -131.45101928710938, -38.621002197265625,
+        65.9507064819336, 48.76960372924805], [-76.0918197631836, -9.108996391296387,
+        13.64936637878418, 96.7411880493164, 124.2474365234375, -111.50318145751953,
+        -42.397071838378906], [-83.31562805175781, 32.27967071533203, 250.08163452148438,
+        58.24131393432617, 129.95318603515625, -10.683560371398926, -123.84668731689453],
+      [-11.536887168884277, -15.220125198364258, 197.18821716308594, -31.680112838745117,
+        -81.35874938964844, 157.96974182128906, 105.61251831054688], [78.15926361083984,
+        -84.49744415283203, -73.91180419921875, 86.370361328125, 77.87918090820312,
+        55.3555908203125, -7.273794651031494], [25.232547760009766, 30.352109909057617,
+        53.722267150878906, 44.87421798706055, 44.618812561035156, 4.511796951293945,
+        9.039834976196289]]]
+}
diff --git a/test/test_mps.py b/test/test_mps.py
index c388c2b1ad6dc..398e66598ce9e 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -15,6 +15,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import itertools
+import yaml
 from collections import defaultdict
 from torch._six import inf
 from torch.nn import Parameter
@@ -9293,7 +9294,6 @@ class TestConsistency(TestCase):
         'trace': [torch.int64],
         'normalnumber_mean': [torch.float16, torch.float32],
         'nn.functional.gelu': [torch.float32],
-        'nn.functional.conv_transpose2d': [torch.float32, torch.int64],
         'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'native_batch_norm': [torch.float32],
         'multinomial': [torch.float32],
@@ -9657,6 +9657,12 @@ class TestConsistency(TestCase):
         "true_divide"
     }
 
+    with open("./test/cuda_results.yaml") as f:
+        data = yaml.safe_load(f)
+    CUDA_RESULT = dict()
+    for key,value in data.items():
+        CUDA_RESULT[key]= torch.as_tensor(value)
+
     MPS_SKIP_LIST = reduce(lambda x,y: dict(x, **y), (FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS))
 
     # Used for accept mode only
@@ -9676,6 +9682,15 @@ def get_error_message(self, key, op_name):
             return f"Running test with {op_name} expected to fail due to missing op implementation"
         return f"Running test with {op_name} hangs so skipping"
 
+    def compare_with_CUDA(self, op, mps_out, atol, rtol):
+        cuda_out = self.CUDA_RESULT[op.name]
+        try:
+            self.assertEqual(cuda_out, mps_out, atol=atol, rtol=rtol)
+        except Exception as e:
+            return False
+        else:
+            return True
+
     @ops(op_db, allowed_dtypes=MPS_DTYPES)
     def test_output_match(self, device, dtype, op):
         self.assertEqual(device, "cpu")
@@ -9754,6 +9769,9 @@ def get_samples():
                 if any(s in str(e).lower() for s in ["int64", "macos 13"]):
                   self.skipTest(f"{str(e)}")
 
+                if op.name in self.CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol):
+                    continue
+
                 if not generate_new_truth:
                     raise e
                 forward_failed = True

From 6dc2203e6f06736d85f26ddee968b517ae3a8c5d Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 18 Jan 2023 12:27:19 -0800
Subject: [PATCH 1889/1922] Fix slice followed by reshape (#237)

Co-authored-by: Denis Vieriu <denisvieriu@mac-3045BF.local>
---
 .../src/ATen/native/mps/operations/BinaryOps.mm |  3 ++-
 aten/src/ATen/native/mps/operations/View.mm     | 17 ++++++++++++++++-
 test/test_mps.py                                | 14 ++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index c6ae0c8c60349..9fdfff52e6439 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -110,7 +110,8 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
           newCachedGraph->outputTensor = binaryBlock(newCachedGraph, primaryCastTensor, secondaryCastTensor);
           // Cast output tensor to an expected type if needed, which addresses discrepancy when int64 scalar is added to int32 tensor
           // Output tensor should have been promoted but it remains an int32 tensor
-          if (outputDataType != common_dtype) {
+          if (outputDataType != common_dtype ||
+             [newCachedGraph->outputTensor dataType] != getMPSDataType(outputDataType)) {
             newCachedGraph->outputTensor = castMPSTensor(mpsGraph, newCachedGraph->outputTensor, outputDataType);
           }
         }
diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index e7ab6d8501e7f..b68e100dea863 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -450,11 +450,26 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
   std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
   size_t src_ndim_view = src_view_shape.size();
   if (src_ndim_base == src_ndim_view) {
-    for (const auto i : c10::irange(src_ndim_base)) {
+    for (const auto i: c10::irange(src_ndim_base)) {
       if (src_view_shape[i] > src_base_shape[i]) {
         return false;
       }
     }
+  } else {
+    // Detect slice followed by reshape cases, e.g (1,4800,2) -> (1,4800)
+    bool allDimsEqual = true;
+    auto min_ndim = std::min(src_ndim_base, src_ndim_view);
+    for (const auto i: c10::irange(min_ndim)) {
+      if (src_view_shape[i] > src_base_shape[i]) {
+        return false;
+      }
+      else if (src_view_shape[i] != src_base_shape[i]) {
+        allDimsEqual = false;
+      }
+    }
+    if (allDimsEqual) {
+      return false;
+    }
   }
   return true;
 }
diff --git a/test/test_mps.py b/test/test_mps.py
index 398e66598ce9e..9c470391337c0 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1629,9 +1629,23 @@ def test_slice_reshape(self):
 
         x = x[:,3:].view(2, 3, 4, 1)
         x_cpu = x_cpu[:,3:].view(2, 3, 4, 1)
+        self.assertEqual(x, x_cpu)
 
+        x = x + 2
+        x_cpu = x_cpu + 2
         self.assertEqual(x, x_cpu)
 
+    def test_slice_reshape_contg_view(self):
+        import torch
+
+        x_mps = torch.randn(1, 4800, 2, device="mps")
+        x_cpu = x_mps.detach().clone().cpu()
+
+        r_mps = x_mps + 2
+        r_cpu = x_cpu + 2
+
+        self.assertEqual(r_mps, r_cpu)
+
     def test_view_slice(self):
         # https://github.com/pytorch/pytorch/issues/83995
         NUM_SAMPLES = 60

From d88dc8bf3a95bc5fe8adc24e44386065699210d2 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Wed, 18 Jan 2023 12:52:15 -0800
Subject: [PATCH 1890/1922] Revert "Added zero check to inverse op, resolving
 crash seen in inverse & matrix_pow tests (#236)"

This reverts commit 5d9be8c55f12154acde9556c9edeb7388dd9be69.
---
 aten/src/ATen/native/mps/operations/Inverse.mm | 4 ----
 test/test_mps.py                               | 2 ++
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Inverse.mm b/aten/src/ATen/native/mps/operations/Inverse.mm
index e78cf15ae90b9..2975fd9875949 100644
--- a/aten/src/ATen/native/mps/operations/Inverse.mm
+++ b/aten/src/ATen/native/mps/operations/Inverse.mm
@@ -24,10 +24,6 @@
     MPSStream* stream = getCurrentMPSStream();
     info.zero_();
 
-    if (A.numel() == 0) {
-        return;
-    }
-
     struct CachedGraph : public MPSCachedGraph
     {
         CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
diff --git a/test/test_mps.py b/test/test_mps.py
index 9c470391337c0..052bff96f497a 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9287,6 +9287,8 @@ class TestConsistency(TestCase):
         'nonzero': [torch.bool, torch.uint8, torch.float16],
         'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16],
         'sgn': [torch.bool],
+        'linalg.inv': [torch.float32],
+        'linalg.inv_ex': [torch.float32],
         'linalg.matrix_power': [torch.float32],
         'nn.functional.interpolate': [torch.float32],
         'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],

From 4a4d12197997876a81167a56d061cfd5e5f41055 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Wed, 18 Jan 2023 13:17:25 -0800
Subject: [PATCH 1891/1922] Remove square from block list (#229)

* Remove square from block list
- add checks for whether macOS is greater than 13.2
- remove square from block list
- report error for pow(int64, x) before macOS 13.2


Co-authored-by: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
---
 aten/src/ATen/mps/MPSDevice.mm                   | 2 ++
 aten/src/ATen/native/mps/operations/BinaryOps.mm | 6 +++++-
 test/test_mps.py                                 | 3 +--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index f872b5f2ad353..d7e1749e38e23 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -96,10 +96,12 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   static bool _macos_13_0_plus = [mpsCD instancesRespondToSelector:@selector(cumulativeSumWithTensor:axis:name:)] == YES;
   static bool _macos_13_1_plus = [mpsCD instancesRespondToSelector:@selector(
     sampleGridWithSourceTensor:coordinateTensor:layout:normalizeCoordinates:relativeCoordinates:alignCorners:paddingMode:samplingMode:constantValue:name:)] == YES;
+  static bool _macos_13_2_plus = [mpsCD instancesRespondToSelector:@selector(convolution3DWithSourceTensor:weightsTensor:descriptor:name:)] == YES;
 
   switch (subVersion) {
     case 0:  return _macos_13_0_plus;
     case 1:  return _macos_13_1_plus;
+    case 2:  return _macos_13_2_plus;
     default: return false;
   }
 }
diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index 9fdfff52e6439..519c23d33a6e6 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -27,6 +27,10 @@
 void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha,
                     const Tensor& output_, std::string op_name, BinaryOpBlock binaryBlock)
 {
+  TORCH_CHECK(!(op_name == "power" && !is_macos_13_or_newer(2) &&
+              (self.scalar_type() == ScalarType::Long ||
+              (other.scalar_type() == ScalarType::Long && (self.scalar_type() != ScalarType::Half && self.scalar_type() != ScalarType::Float)))),
+              "MPS: ", op_name, " op with int64 input is supported natively starting from macOS 13.2");
   MPSStream* mpsStream = getCurrentMPSStream();
 
   const bool is_self_scalar = self.dim() == 0;
@@ -247,7 +251,7 @@ void add_sub_template(const Tensor& self, const Tensor& other, const Scalar& alp
 #define CREATE_MPS_STRUCTURED_BINARY_OP_FUNC(func_out, func_stub, other_type)                   \
 TORCH_IMPL_FUNC(func_out) (const Tensor& self, const other_type& other, const Tensor& output) { \
   TORCH_CHECK(!(self.scalar_type() == ScalarType::Long &&                                       \
-               (std::string(#func_stub) == "power" || std::string(#func_stub) == "atan2")),     \
+               std::string(#func_stub) == "atan2"),                                             \
                "MPS does not support ", #func_stub, " op with int64 input")                     \
   mps::binaryOp##other_type(self, other, Scalar(1.0), output, #func_stub,                       \
     ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {                          \
diff --git a/test/test_mps.py b/test/test_mps.py
index 052bff96f497a..875faaf8f68a2 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9049,7 +9049,7 @@ class TestConsistency(TestCase):
                              'i64',
                              'u8'],
         'sqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'square': ['f16', 'f32'],
+        'square': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'std': ['f16', 'f32'],
@@ -9295,7 +9295,6 @@ class TestConsistency(TestCase):
         'nn.functional.interpolatearea': [torch.float32],
         'resize_as_': [torch.float16, torch.float32],
         'topk': [torch.int16, torch.int32, torch.int64, torch.uint8],
-        'square': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
 
         # Functions with correctness issues
         'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],

From fd0b7ece6972d4d37a929dd35e73b7a9f7f77e99 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Wed, 18 Jan 2023 19:50:31 -0800
Subject: [PATCH 1892/1922] - fix batch_norm_mps_out key (#239)

- return 1/sqrt(var+epsilon) instead of var
- return empty tensor for mean and var if train is not enabled
- remove native_batch_norm from block list
---
 .../native/mps/operations/Normalization.mm    | 41 +++++++++++++++----
 test/test_mps.py                              |  1 -
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm
index b06049f40bf39..90cfe3c9fab5d 100644
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@@ -135,7 +135,9 @@ void get_shapes(MPSShape* input_shape_readonly,
                       + std::to_string(momentum) + ":" + std::to_string(train) + ":"
                       + std::to_string(has_running_mean) + ":"
                       + std::to_string(has_weight) + ":" + std::to_string(has_bias) + ":"
-                      + [ns_shape_key UTF8String] + ":" + native_mps::getMPSTypeString(self.scalar_type());
+                      + [ns_shape_key UTF8String] + ":"
+                      + native_mps::getTensorsStringKey({
+                        self, weight_opt.value_or(Tensor()), bias_opt.value_or(Tensor()), running_mean_opt.value_or(Tensor()), running_var_opt.value_or(Tensor())});
     auto input_mps_dtype = native_mps::getMPSDataType(self.scalar_type());
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
 
@@ -180,6 +182,7 @@ void get_shapes(MPSShape* input_shape_readonly,
 
             MPSGraphTensor* updatedRunningMeanTensor = nil;
             MPSGraphTensor* updatedRunningVarTensor = nil;
+            MPSGraphTensor *scaledInverseSqrtVariance = nil;
 
             /*
             If train:
@@ -195,6 +198,7 @@ Check if running mean exists (maybe do this check before making graph)
 
             Compute the batch norm output and stats to be saved
             */
+            MPSGraphTensor *varTensor = nil;
 
             if(train) {
               // Compute mean and variance of the current batch
@@ -204,6 +208,7 @@ Check if running mean exists (maybe do this check before making graph)
               MPSGraphTensor* batchVarianceTensor = [mpsGraph varianceOfTensor:inputTensor
                                                                           axes:axes
                                                                           name:nil];
+              varTensor = batchVarianceTensor;
               if(has_running_mean) {
                 // TODO: This is not the formula used in PyTorch, is this OK? Seems more robust
                 // float besselCorrectionTerm = float(N) / std::max(N - 1.0f, 1.0f);
@@ -240,14 +245,27 @@ Check if running mean exists (maybe do this check before making graph)
                 updatedRunningVarTensor = [mpsGraph additionWithPrimaryTensor:scaledCorrectedBatchVar
                                                               secondaryTensor:scaledRunningVar
                                                                          name:nil];
-                // Update saved mean and inverse std tensor
-                saveMeanTensor = batchMeanTensor;
-                saveVarTensor = batchVarianceTensor;
-            }
-            else {
-              saveMeanTensor = batchMeanTensor;
-              saveVarTensor = batchVarianceTensor;
             }
+            // Update saved mean and inverse std tensor
+            MPSGraphTensor *epsilonTensor = [mpsGraph constantWithScalar:(double)epsilon
+                                                                   shape:@[@1]
+                                                                dataType:MPSDataTypeFloat32];
+
+            MPSGraphTensor *varianceEps = [mpsGraph additionWithPrimaryTensor:batchVarianceTensor
+                                                              secondaryTensor:epsilonTensor
+                                                                         name:@"varianceEps"];
+
+            MPSGraphTensor *sqrtVariance = [mpsGraph squareRootWithTensor:varianceEps
+                                                                     name:@"sqrtVariance"];
+            float primary = 1.0f;
+            MPSGraphTensor *primaryTensor = [mpsGraph constantWithScalar:primary dataType:MPSDataTypeFloat32];
+
+            scaledInverseSqrtVariance = [mpsGraph divisionWithPrimaryTensor:primaryTensor
+                                                            secondaryTensor:sqrtVariance
+                                                                       name:nil];
+            // Update saved mean and inverse std tensor
+            saveMeanTensor = batchMeanTensor;
+            saveVarTensor = scaledInverseSqrtVariance;
           }
           else { // Test
             TORCH_CHECK(has_running_mean);
@@ -255,12 +273,13 @@ Check if running mean exists (maybe do this check before making graph)
                                                      name:nil];
             saveVarTensor = [mpsGraph identityWithTensor:runningVarTensor
                                                     name:nil];
+            varTensor = saveVarTensor;
           }
 
           // Compute output of batch norm
           MPSGraphTensor* outputTensor = [mpsGraph normalizationWithTensor:inputTensor
                                                                 meanTensor:saveMeanTensor
-                                                            varianceTensor:saveVarTensor
+                                                            varianceTensor:varTensor
                                                                gammaTensor:weightTensor
                                                                 betaTensor:biasTensor
                                                                    epsilon:(float)epsilon
@@ -352,6 +371,10 @@ Check if running mean exists (maybe do this check before making graph)
 
   }
 
+  if(!train) {
+    save_mean.resize_({0});
+    save_var.resize_({0});
+  }
   return std::tuple<Tensor&, Tensor&, Tensor&>(output, save_mean, save_var);
 }
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 875faaf8f68a2..789943feff7fb 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9310,7 +9310,6 @@ class TestConsistency(TestCase):
         'normalnumber_mean': [torch.float16, torch.float32],
         'nn.functional.gelu': [torch.float32],
         'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'native_batch_norm': [torch.float32],
         'multinomial': [torch.float32],
         'floor_divide': [torch.int16, torch.int32, torch.int64],
         'dist': [torch.float16],

From e656d7271292c7478e0acf6839f9d83984b3bb30 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Thu, 19 Jan 2023 10:57:32 -0800
Subject: [PATCH 1893/1922] Fix index_add type issue (#235)

* - changing alphaTensor type into using self.scalar_type()
- remove index_add from blocklist
- block bool,int16,int64,float16,uint8 as scatterWithDataTensor giving wrong results

* - casting tensors so that scatterWithDataTensor uses only float32 and int32
- throw an error for unsupported index_add for int64
---
 .../ATen/native/mps/operations/Indexing.mm    | 44 ++++++++++++++++---
 test/test_mps.py                              |  4 +-
 2 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 6ec65f976c033..1b065b8ed061e 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -505,12 +505,14 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
   MPSStream* stream = getCurrentMPSStream();
   dim = maybe_wrap_dim(dim, self.dim());
   auto numel = index.numel();
-  auto alpha_f = alpha.to<float>();
 
   if (numel == 0) {
     return;
   }
 
+  TORCH_CHECK(self.scalar_type() != ScalarType::Long,
+                "MPS: does not support index_add op with int64 input");
+
   struct CachedGraph : public MPSCachedGraph
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
@@ -539,16 +541,46 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
           MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
           MPSGraphTensor* indexTensor = mpsGraphRankedPlaceHolder(mpsGraph, index);
           MPSGraphTensor* sourceTensor = mpsGraphRankedPlaceHolder(mpsGraph, source);
-          MPSGraphTensor* alphaTensor = mpsGraphScalarPlaceHolder(mpsGraph, alpha_f);
-          MPSGraphTensor* alphaSourceSlice = [mpsGraph multiplicationWithPrimaryTensor:sourceTensor
-                                                                       secondaryTensor:alphaTensor
+          MPSGraphTensor* alphaTensor = mpsGraphScalarPlaceHolder(mpsGraph, getMPSScalarType(self.scalar_type()));
+
+          MPSGraphTensor* castInputTensor = inputTensor;
+          MPSGraphTensor* castSourceTensor = sourceTensor;
+          MPSGraphTensor* castAlphaTensor = alphaTensor;
+
+          MPSDataType dataType = [inputTensor dataType];
+
+          // failure due to issue #104289647: Wrong results from scatterWithDataTensor
+          if (dataType != MPSDataTypeInt32 &&
+            dataType != MPSDataTypeFloat32) {
+            dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
+            castInputTensor = [mpsGraph castTensor:inputTensor
+                                            toType:dataType
+                                            name:@"castInputTensor"];
+            castSourceTensor = [mpsGraph castTensor:sourceTensor
+                                            toType:dataType
+                                            name:@"castSourceTensor"];
+            castAlphaTensor = [mpsGraph castTensor:alphaTensor
+                                            toType:dataType
+                                            name:@"castAlphaTensor"];
+          }
+
+          MPSGraphTensor* alphaSourceSlice = [mpsGraph multiplicationWithPrimaryTensor:castSourceTensor
+                                                                       secondaryTensor:castAlphaTensor
                                                                                   name:nil];
-          MPSGraphTensor* outputTensor = [mpsGraph scatterWithDataTensor:inputTensor
+          MPSGraphTensor* outputTensor = [mpsGraph scatterWithDataTensor:castInputTensor
                                                             updatesTensor:alphaSourceSlice
                                                             indicesTensor:indexTensor
                                                                      axis:dim
                                                                      mode:MPSGraphScatterModeAdd
                                                                      name:nil];
+          dataType = [inputTensor dataType];
+          if (dataType != MPSDataTypeInt32 &&
+              dataType != MPSDataTypeFloat32) {
+              outputTensor = [mpsGraph castTensor:outputTensor
+                                            toType:[inputTensor dataType]
+                                            name:@"castOutputTensor"];
+            }
+
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->indexTensor_ = indexTensor;
           newCachedGraph->sourceTensor_ = sourceTensor;
@@ -563,7 +595,7 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) {
     Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index);
     Placeholder sourcePlaceholder = Placeholder(cachedGraph->sourceTensor_, source);
     Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result);
-    MPSScalar alpha_scalar = getMPSScalar(alpha_f, source.scalar_type());
+    MPSScalar alpha_scalar = getMPSScalar(alpha, self.scalar_type());
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
       selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(),
diff --git a/test/test_mps.py b/test/test_mps.py
index 789943feff7fb..f3510fad0a28f 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9102,7 +9102,8 @@ class TestConsistency(TestCase):
         'xlogy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'zeros_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8']
+        'zeros_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
     }
 
     ALLOWLIST_OP_GRAD = {
@@ -9282,7 +9283,6 @@ class TestConsistency(TestCase):
     # All the entries in this list should be removed
     BLOCKLIST = {
         # Functions that hard crash
-        'index_add': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'nn.functional.softplus': [torch.float32],
         'nonzero': [torch.bool, torch.uint8, torch.float16],
         'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16],

From 892bc6070798693cb9463734059a9ec052666f96 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Thu, 19 Jan 2023 15:36:42 -0800
Subject: [PATCH 1894/1922] - unlock _native_batch_norm_legit after resolving
 native_batch_norm (#242)

---
 test/test_mps.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index f3510fad0a28f..313d587d2d89f 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9303,7 +9303,6 @@ class TestConsistency(TestCase):
         'norm': [torch.float16],
         'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
         'cumulative_trapezoid': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        '_native_batch_norm_legit': [torch.float32],
         'addr': [torch.float16],
         'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'trace': [torch.int64],

From 1d50f479b90195157f92c8b451faeb78e432dc08 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 20 Jan 2023 09:40:18 -0800
Subject: [PATCH 1895/1922] cherry-pick remainder op from upstream (#244)

---
 .../ATen/native/mps/operations/BinaryOps.mm   | 26 +++++++++++++++++++
 aten/src/ATen/native/native_functions.yaml    |  1 +
 test/test_mps.py                              |  2 +-
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index 519c23d33a6e6..fc4c01a7ecb84 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -330,6 +330,32 @@ Tensor floor_divide_mps(const Tensor& self, const Tensor& other) {
   return floor_divide_out_mps(self, other, self);
 }
 
+TORCH_IMPL_FUNC(remainder_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output) {
+  // torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b
+  mps::BinaryOpBlock remainder_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+    // Rounding is a no-op for integral types, and also a reasonable workaround
+    // For MPSGraph bug on Apple Silicon, that throws `Function floorOp_i64 was not found in the library`
+    // See https://github.com/pytorch/pytorch/issues/84995
+
+    auto divTensor =  [mpsGraph divisionWithPrimaryTensor:primaryCastTensor
+                                          secondaryTensor:secondaryCastTensor
+                                                     name:nil];
+    bool isFloatOutput = ([divTensor dataType] & MPSDataTypeFloatBit) != 0;
+    if (isFloatOutput) {
+      divTensor = [mpsGraph floorWithTensor:divTensor name:nil];
+    }
+
+    auto mulTensor = [mpsGraph multiplicationWithPrimaryTensor:divTensor
+                                               secondaryTensor:secondaryCastTensor
+                                                          name:nil];
+    return [mpsGraph subtractionWithPrimaryTensor:primaryCastTensor
+                                       secondaryTensor:mulTensor
+                                           name: nil];
+    };
+  mps::binaryOpTensor(self, other, Scalar(1.0), output, "remainder_out_mps", remainder_op_block);
+}
+
 TORCH_IMPL_FUNC(logaddexp_out_mps) (const Tensor& self, const Tensor& other, const Tensor& output)
 {
   mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index bcbb57efaa55c..494c0eb2e8ba5 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -9186,6 +9186,7 @@
   structured_inherits: TensorIteratorBase
   dispatch:
     CPU, CUDA: remainder_out
+    MPS: remainder_out_mps
   tags: pointwise
 
 - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor
diff --git a/test/test_mps.py b/test/test_mps.py
index 313d587d2d89f..19f5b12b3e464 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9485,7 +9485,7 @@ class TestConsistency(TestCase):
         'put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'qr': [torch.float32],
         'quantile': [torch.float32],
-        'remainder': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'remainder': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
         'renorm': [torch.float16, torch.float32],
         'roll': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'rounddecimals_0': [torch.float32],

From d24fe9796c35201ca9b1edde90fdf03cc663b7d1 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 20 Jan 2023 09:45:28 -0800
Subject: [PATCH 1896/1922] Fix arange_mps_out for empty tensor (#245)

* Fix arange_mps_out for empty tensor

* Address PR comments
---
 aten/src/ATen/native/mps/operations/RangeFactories.mm | 5 +++++
 test/test_mps.py                                      | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/aten/src/ATen/native/mps/operations/RangeFactories.mm b/aten/src/ATen/native/mps/operations/RangeFactories.mm
index 403ae4748f0ff..4533ad1578556 100644
--- a/aten/src/ATen/native/mps/operations/RangeFactories.mm
+++ b/aten/src/ATen/native/mps/operations/RangeFactories.mm
@@ -88,6 +88,11 @@
       }
       result.resize_({size});
     }
+
+    if (result.numel() == 0) {
+      return;
+    }
+
     bool is_contiguous = result.is_contiguous();
     Tensor r = !is_contiguous ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result;
     using namespace mps;
diff --git a/test/test_mps.py b/test/test_mps.py
index 19f5b12b3e464..4d8bbd06ef0c9 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -5457,6 +5457,14 @@ def test_arange(self):
         self.assertEqual(np.arange(1, 2, .3, dtype=np.float32), torch.arange(1, 2, .3, device='mps'))
         self.assertEqual(np.arange(6.3, dtype=np.float32), torch.arange(6.3, device='mps'))
 
+    def test_arange_empty(self):
+        out_mps = torch.tensor([], device="mps")
+        out_cpu = torch.tensor([], device="cpu")
+
+        y_mps = torch.arange(0, 0, 1, out=out_mps)
+        y_cpu = torch.arange(0, 0, 1, out=out_cpu)
+        self.assertEqual(y_mps, y_cpu)
+
     # Test softmax
     def test_softmax(self):
         def helper(shape, dim, channels_last=False):

From 3a8490d9046300414c803f3ff1bee125e7dc2eaa Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 20 Jan 2023 13:50:20 -0800
Subject: [PATCH 1897/1922] Fix nonzero for empty tensors (#248)

* Fix nonzero for empty tensors

* Update TestConsistency list
---
 aten/src/ATen/native/mps/operations/Indexing.mm | 7 ++++++-
 test/test_mps.py                                | 3 +--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 1b065b8ed061e..d9d8e53461638 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -228,6 +228,12 @@ Tensor nonzero_fallback(const Tensor& self) {
       return out_;
   }
 
+  int64_t nDim = self.dim();
+  if (self.numel() == 0) {
+    at::native::resize_output(out_, {0, nDim});
+    return out_;
+  }
+
   using namespace mps;
   const uint32_t maxDimensions = 16;
 
@@ -249,7 +255,6 @@ Tensor nonzero_fallback(const Tensor& self) {
   };
 
   int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
-  int64_t nDim = self.dim();
   at::native::resize_output(out_, {total_nonzero, nDim});
   if (out_.numel() ==  0) {
     return out_;
diff --git a/test/test_mps.py b/test/test_mps.py
index 4d8bbd06ef0c9..d53f923c4725c 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8882,7 +8882,7 @@ class TestConsistency(TestCase):
         'nn.functional.unfold': ['f16', 'f32'],
         'nn.functional.upsample_bilinear': ['f32'],
         'nn.functional.upsample_nearest': ['f32', 'u8'],
-        'nonzero': ['b8', 'f32', 'i16', 'i32', 'i64'],
+        'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'norm': ['f32', 'f16'],
         'normal': ['f16', 'f32'],
         'ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -9292,7 +9292,6 @@ class TestConsistency(TestCase):
     BLOCKLIST = {
         # Functions that hard crash
         'nn.functional.softplus': [torch.float32],
-        'nonzero': [torch.bool, torch.uint8, torch.float16],
         'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16],
         'sgn': [torch.bool],
         'linalg.inv': [torch.float32],

From cfb518c65b053fae0d42c2c2896c7842977f1b23 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 20 Jan 2023 13:52:23 -0800
Subject: [PATCH 1898/1922] Skip compilation of mps grid sampler for macos
 older than 13.1 (#247)

---
 aten/src/ATen/native/mps/operations/GridSampler.mm | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/aten/src/ATen/native/mps/operations/GridSampler.mm b/aten/src/ATen/native/mps/operations/GridSampler.mm
index 2106602e4a87c..b37b956fc9020 100644
--- a/aten/src/ATen/native/mps/operations/GridSampler.mm
+++ b/aten/src/ATen/native/mps/operations/GridSampler.mm
@@ -8,6 +8,8 @@
 void grid_sampler_2d_mps_impl(Tensor &output, const Tensor& input, const Tensor& grid,
                               int64_t interpolation_mode, int64_t padding_mode,
                               bool align_corners) {
+// Grid Sampler support has been added in macOS 13.1
+#if !defined(__MAC_13_1) && !defined(MAC_OS_X_VERSION_13_1)
   using namespace mps;
   check_grid_sampler_common(input, grid);
   check_grid_sampler_2d(input, grid);
@@ -120,6 +122,7 @@ void grid_sampler_2d_mps_impl(Tensor &output, const Tensor& input, const Tensor&
 
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
   }
+#endif // !defined(__MAC_13_1) && !defined(MAC_OS_X_VERSION_13_1)
 }
 
 Tensor grid_sampler_2d_mps(const Tensor& input, const Tensor& grid,

From d71061e15668280c346337b790a6a61e8d6dd907 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Mon, 23 Jan 2023 10:39:04 -0800
Subject: [PATCH 1899/1922] Fix relative paths in test_mps (#246)

---
 test/test_mps.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index d53f923c4725c..8c0913e316389 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9677,7 +9677,9 @@ class TestConsistency(TestCase):
         "true_divide"
     }
 
-    with open("./test/cuda_results.yaml") as f:
+    dirname = os.path.dirname(__file__)
+    filename = os.path.join(dirname, "cuda_results.yaml")
+    with open(filename) as f:
         data = yaml.safe_load(f)
     CUDA_RESULT = dict()
     for key,value in data.items():

From f81f2ac1ff47f5d3258f11b953ccc8b0d5d0c22b Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Mon, 23 Jan 2023 14:59:56 -0800
Subject: [PATCH 1900/1922] Fix nn.functional.gelu (#249)

* Fix nn.functional.gelu
- fix gelu_out_mps key
- add calculation for gelu with tanh
- remove gelu from blocklist

* - add test_gelu_tanh test
---
 .../ATen/native/mps/operations/Activation.mm  | 54 ++++++++++++++++++-
 test/test_mps.py                              | 12 ++++-
 2 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index dbb591246ca40..b84436bd99f5a 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -744,6 +744,51 @@ Tensor relu_mps(const Tensor& self) {
     return  erfTensor;
 }
 
+MPSGraphTensor* tanh (MPSGraph* mpsGraph, MPSGraphTensor *inputTensor) {
+    // 0.5 * x * (1 + text{Tanh}(sqrt(2 / pi) * (x + 0.044715 * x^3)))
+    auto dataType = [inputTensor dataType];
+    const float SQRT2_PI = 0.797884523868560791015625f;
+    const float VAL = 0.044715f;
+    MPSGraphTensor *onef = [mpsGraph constantWithScalar: 1.0f
+                                                  shape: @[@1]
+                                              dataType: dataType];
+    MPSGraphTensor *halff = [mpsGraph constantWithScalar: 0.5f
+                                                    shape: @[@1]
+                                                dataType: dataType];
+    MPSGraphTensor *sqrt2_pi = [mpsGraph constantWithScalar: SQRT2_PI
+                                                  shape: @[@1]
+                                              dataType: dataType];
+    MPSGraphTensor *valf = [mpsGraph constantWithScalar: VAL
+                                                  shape: @[@1]
+                                              dataType: dataType];
+
+    MPSGraphTensor *erfTensor = [mpsGraph multiplicationWithPrimaryTensor: inputTensor
+                                                          secondaryTensor: inputTensor
+                                                                    name : nil];
+    erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor
+                                          secondaryTensor: inputTensor
+                                                    name : nil];
+    erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor
+                                          secondaryTensor: valf
+                                                    name : nil];
+    erfTensor = [mpsGraph additionWithPrimaryTensor: erfTensor
+                                    secondaryTensor: inputTensor
+                                              name : nil];
+    erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor
+                                          secondaryTensor: sqrt2_pi
+                                                    name : nil];
+    erfTensor = [mpsGraph tanhWithTensor: erfTensor
+                                   name : nil];
+    erfTensor = [mpsGraph additionWithPrimaryTensor: erfTensor
+                                    secondaryTensor: onef
+                                              name : nil];
+    erfTensor = [mpsGraph multiplicationWithPrimaryTensor: erfTensor
+                                          secondaryTensor: halff
+                                                    name : nil];
+
+    return  erfTensor;
+}
+
 TORCH_IMPL_FUNC(gelu_out_mps) (
     const Tensor& self, c10::string_view approximate, const Tensor& output
   ) {
@@ -767,7 +812,7 @@ Tensor relu_mps(const Tensor& self) {
   MPSStream* stream = getCurrentMPSStream();
 
   @autoreleasepool {
-    string key = "gelu_out_mps" + getTensorsStringKey({self});
+    string key = "gelu_out_mps" + getTensorsStringKey({self}) + ":" + c10::str(approximate);
     CachedGraph* cachedGraph = static_cast<CachedGraph *>(cache_->LookUp(key));
     if(!cachedGraph) {
       MPSCachedGraph *tmpCachedGraph = cache_->CreateCachedGraph(key, ^ MPSCachedGraph * () {
@@ -782,7 +827,12 @@ Tensor relu_mps(const Tensor& self) {
                                                                   getMPSDataType(self.scalar_type()),
                                                                   getMPSShape(self));
 
-          MPSGraphTensor* outputTensor = normcdf(mpsGraph, inputTensor);
+          MPSGraphTensor* outputTensor = nil;
+          if(approximate == "tanh") {
+            outputTensor = tanh(mpsGraph, inputTensor);
+          } else {
+            outputTensor = normcdf(mpsGraph, inputTensor);
+          }
           outputTensor = [mpsGraph multiplicationWithPrimaryTensor:outputTensor
                                                    secondaryTensor:inputTensor
                                                               name:nil];
diff --git a/test/test_mps.py b/test/test_mps.py
index 8c0913e316389..9b5bebe2c7579 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2197,6 +2197,17 @@ def helper(dtype):
             e_string = str(e)
             self.assertEqual(e_string, "MPS does not support cumsum op with int64 input")
 
+    def test_gelu_tanh(self):
+        def helper(shape):
+            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float)
+            x = cpu_x.detach().clone().to('mps')
+
+            gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh')
+            gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh')
+            self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu)
+
+        helper((2, 8, 4, 5))
+
 
 class TestLogical(TestCase):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):
@@ -9314,7 +9325,6 @@ class TestConsistency(TestCase):
         'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'trace': [torch.int64],
         'normalnumber_mean': [torch.float16, torch.float32],
-        'nn.functional.gelu': [torch.float32],
         'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'multinomial': [torch.float32],
         'floor_divide': [torch.int16, torch.int32, torch.int64],

From c506a9e5f15f315591d60c344c0af9ad02934137 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 24 Jan 2023 14:33:14 -0500
Subject: [PATCH 1901/1922] Add Python Module interface for MPS backend (#251)

- Enable global manual seeding via torch.manual_seed() + test case
- Add torch.mps.synchronize() to wait for MPS stream to finish + test case
- Enable the following python interfaces for MPS:
torch.mps.get_rng_state()
torch.mps.set_rng_state()
torch.mps.is_available()
torch.mps.synchronize()
torch.mps.manual_seed()
torch.mps.seed()
torch.mps.is_initialized()
torch.mps.init()
---
 aten/src/ATen/detail/MPSHooksInterface.h |  4 ++
 aten/src/ATen/mps/MPSDevice.h            |  2 +-
 aten/src/ATen/mps/MPSDevice.mm           |  5 ++
 aten/src/ATen/mps/MPSHooks.cpp           |  4 ++
 aten/src/ATen/mps/MPSHooks.h             |  1 +
 build_variables.bzl                      |  2 +
 test/test_mps.py                         | 40 ++++++++++++
 torch/_C/__init__.pyi.in                 |  6 +-
 torch/csrc/Module.cpp                    |  9 ++-
 torch/csrc/api/include/torch/mps.h       | 21 +++++++
 torch/csrc/api/src/mps.cpp               | 31 ++++++++++
 torch/csrc/mps/Module.cpp                | 70 +++++++++++++++++++++
 torch/mps/__init__.py                    | 79 ++++++++++++++++++++++++
 torch/random.py                          |  6 ++
 14 files changed, 276 insertions(+), 4 deletions(-)
 create mode 100644 torch/csrc/api/include/torch/mps.h
 create mode 100644 torch/csrc/api/src/mps.cpp
 create mode 100644 torch/csrc/mps/Module.cpp
 create mode 100644 torch/mps/__init__.py

diff --git a/aten/src/ATen/detail/MPSHooksInterface.h b/aten/src/ATen/detail/MPSHooksInterface.h
index 4fff139f27745..fd1f2f5a75c67 100644
--- a/aten/src/ATen/detail/MPSHooksInterface.h
+++ b/aten/src/ATen/detail/MPSHooksInterface.h
@@ -35,6 +35,10 @@ struct TORCH_API MPSHooksInterface {
   virtual Allocator* getMPSDeviceAllocator() const {
     AT_ERROR("MPSDeviceAllocator requires MPS.");
   }
+
+  virtual void deviceSynchronize() const {
+    TORCH_CHECK(false, "Cannot synchronize MPS device without MPS backend. ");
+  }
 };
 
 struct TORCH_API MPSHooksArgs {};
diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h
index e5560222a6cbe..7bd1774f482fd 100644
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@@ -71,7 +71,7 @@ class TORCH_API MPSDevice {
 
 TORCH_API bool is_available();
 TORCH_API bool is_macos_13_or_newer(int32_t subVersion = 0);
-
+TORCH_API void device_synchronize();
 TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
 
 } // namespace mps
diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm
index d7e1749e38e23..2a976bf117d3c 100644
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@@ -3,6 +3,7 @@
 #include <c10/util/CallOnce.h>
 
 #include <ATen/mps/MPSDevice.h>
+#include <ATen/mps/MPSStream.h>
 #include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/IndexKernels.h>
 
@@ -118,5 +119,9 @@ bool is_macos_13_or_newer(int32_t subVersion) {
   return MPSDevice::getInstance()->isMacOS13Plus(subVersion);
 }
 
+void device_synchronize() {
+  getDefaultMPSStream()->synchronize(SyncType::COMMIT_AND_WAIT);
+}
+
 } // namespace mps
 } // namespace at
diff --git a/aten/src/ATen/mps/MPSHooks.cpp b/aten/src/ATen/mps/MPSHooks.cpp
index 5fde8f3843fe6..4a549bfc72252 100644
--- a/aten/src/ATen/mps/MPSHooks.cpp
+++ b/aten/src/ATen/mps/MPSHooks.cpp
@@ -24,6 +24,10 @@ const Generator& MPSHooks::getDefaultMPSGenerator() const {
   return at::mps::detail::getDefaultMPSGenerator();
 }
 
+void MPSHooks::deviceSynchronize() const {
+  at::mps::device_synchronize();
+}
+
 using at::MPSHooksRegistry;
 using at::RegistererMPSHooksRegistry;
 
diff --git a/aten/src/ATen/mps/MPSHooks.h b/aten/src/ATen/mps/MPSHooks.h
index 2bef3eac42648..d64781930bff5 100644
--- a/aten/src/ATen/mps/MPSHooks.h
+++ b/aten/src/ATen/mps/MPSHooks.h
@@ -15,6 +15,7 @@ struct MPSHooks : public at::MPSHooksInterface {
   bool hasMPS() const override;
   Allocator* getMPSDeviceAllocator() const override;
   const Generator& getDefaultMPSGenerator() const override;
+  void deviceSynchronize() const override;
 };
 
 }} // at::mps
diff --git a/build_variables.bzl b/build_variables.bzl
index d9879f1a7b02e..5af378f0a0a65 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -815,6 +815,7 @@ torch_cpp_srcs = [
     "torch/csrc/api/src/imethod.cpp",
     "torch/csrc/api/src/jit.cpp",
     "torch/csrc/api/src/serialize.cpp",
+    "torch/csrc/api/src/mps.cpp",
     "torch/csrc/api/src/nn/init.cpp",
     "torch/csrc/api/src/nn/module.cpp",
     "torch/csrc/api/src/nn/modules/_functions.cpp",
@@ -920,6 +921,7 @@ libtorch_python_core_sources = [
     "torch/csrc/dynamo/guards.cpp",
     "torch/csrc/dynamo/init.cpp",
     "torch/csrc/functorch/init.cpp",
+    "torch/csrc/mps/Module.cpp",
     "torch/csrc/jit/backends/backend_init.cpp",
     "torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp",
     "torch/csrc/jit/python/init.cpp",
diff --git a/test/test_mps.py b/test/test_mps.py
index 9b5bebe2c7579..9754bafc11446 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -26,6 +26,7 @@
 from torch.testing import make_tensor
 from torch.testing._comparison import TensorLikePair
 from torch.testing._internal.common_dtype import get_all_dtypes, integral_types
+import torch.mps
 import torch.backends.mps
 from torch.distributions import Uniform, Exponential
 from functools import partial, reduce
@@ -5674,6 +5675,45 @@ def test_mps_generator(self):
         mps_x = torch.randn(5, device='mps', generator=g_mps)
         self.assertEqual(mps_x, mps_y)
 
+    def test_default_mps_generator(self):
+        # manual seeding on the "default" MPS generator using
+        # the global torch.manual_seed()
+        torch.manual_seed(230)
+        mps_x = torch.randn(5, device='mps')
+        # manual seeding using torch.mps.manual_seed()
+        # which should set the "default" MPS generator
+        # like the global torch.manual_seed()
+        torch.mps.manual_seed(230)
+        mps_y = torch.randn(5, device='mps')
+        # seed values were the same, so the random tensor contents should match
+        self.assertEqual(mps_x, mps_y)
+
+        # save the default generator's state to restore it later
+        g_state = torch.mps.get_rng_state()
+
+        # generate random numbers without seeding
+        mps_x = torch.randn(5, device='mps')
+        # in this case, the random results must differ from the last generated random results
+        self.assertNotEqual(mps_x, mps_y)
+
+        # restore the previously saved state, and the results should match again
+        torch.mps.set_rng_state(g_state)
+        mps_x = torch.randn(5, device='mps')
+        self.assertEqual(mps_x, mps_y)
+
+    def test_device_synchronize(self):
+        # just running some ops each followed by a synchronize to wait for
+        # MPS stream to finish running each of them
+        net1 = torch.nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)\
+            .to(device='mps', dtype=torch.float)
+
+        x = torch.rand(1, 128, 6, 6, device='mps', dtype=torch.float, requires_grad=True)
+        torch.mps.synchronize()
+        x = net1(x)
+        torch.mps.synchronize()
+        x.backward(torch.randn_like(x))
+        torch.mps.synchronize()
+
     # Test random_.to and random_.from
     def test_random(self):
         def helper(shape, low, high, dtype=torch.int32):
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index be71a94296bf9..e2c2d554cdb0e 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -879,7 +879,6 @@ def _disabled_torch_function_impl(func: Callable, types: Iterable[Type], args: T
 def _disabled_torch_dispatch_impl(func: Callable, types: Iterable[Type], args: Tuple, kwargs: Dict) -> Any: ...  # THPModule_disable_dispatch_function
 def _get_linalg_preferred_backend() -> torch._C._LinalgBackend: ...
 def _set_linalg_preferred_backend(arg: torch._C._LinalgBackend): ...
-def _is_mps_available() -> _bool: ...
 class _LinalgBackend:
     Default: _LinalgBackend
     Cusolver: _LinalgBackend
@@ -1172,6 +1171,11 @@ class _TensorBase(metaclass=_TensorMeta):
 # Defined in torch/csrc/multiprocessing/init.cpp
 def _multiprocessing_init() -> None: ...
 
+# Defined in torch/csrc/mps/Module.cpp
+def _mps_synchronize() -> None: ...
+def _mps_init() -> None: ...
+def _is_mps_available() -> _bool: ...
+
 # Defined in torch/csrc/cuda/Module.cpp
 def _cuda_getCurrentStream(device: _int) -> _int: ...
 def _cuda_getCurrentRawStream(device: _int) -> _int: ...
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 33a5d6f1f4442..2dd1109c9987e 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -1148,6 +1148,10 @@ void initIttBindings(PyObject* module);
 } // namespace torch
 #endif
 
+#ifdef USE_MPS
+PyMethodDef* MPSModule_methods();
+#endif
+
 namespace torch {
 void initVerboseBindings(PyObject* module);
 } // namespace torch
@@ -1203,6 +1207,9 @@ PyObject* initModule() {
 #ifdef USE_CUDA
   THPUtils_addPyMethodDefs(methods, THCPModule_methods());
 #endif
+#ifdef USE_MPS
+  THPUtils_addPyMethodDefs(methods, MPSModule_methods());
+#endif
 #if defined(USE_DISTRIBUTED) && defined(USE_C10D)
   THPUtils_addPyMethodDefs(
       methods, torch::distributed::c10d::python_functions());
@@ -1505,8 +1512,6 @@ Call this whenever a new thread is created in order to propagate values from
 
   ASSERT_TRUE(set_module_attr("has_cuda", has_cuda));
   ASSERT_TRUE(set_module_attr("has_mps", has_mps));
-  py_module.def("_is_mps_available", []() { return at::hasMPS(); });
-
   ASSERT_TRUE(
       set_module_attr("has_mkldnn", at::hasMKLDNN() ? Py_True : Py_False));
 
diff --git a/torch/csrc/api/include/torch/mps.h b/torch/csrc/api/include/torch/mps.h
new file mode 100644
index 0000000000000..669cecfc5de49
--- /dev/null
+++ b/torch/csrc/api/include/torch/mps.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <torch/csrc/Export.h>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace torch {
+namespace mps {
+
+/// Returns true if MPS device is available.
+bool TORCH_API is_available();
+
+/// Sets the seed for the current GPU.
+void TORCH_API manual_seed(uint64_t seed);
+
+/// Waits for all streams on a MPS device to complete.
+void TORCH_API synchronize();
+
+} // namespace mps
+} // namespace torch
diff --git a/torch/csrc/api/src/mps.cpp b/torch/csrc/api/src/mps.cpp
new file mode 100644
index 0000000000000..83bb7ef2d3215
--- /dev/null
+++ b/torch/csrc/api/src/mps.cpp
@@ -0,0 +1,31 @@
+#include <ATen/Context.h>
+#include <c10/util/irange.h>
+
+#include <cstddef>
+
+namespace torch {
+namespace mps {
+
+bool is_available() {
+  return at::detail::getMPSHooks().hasMPS();
+}
+
+/// Sets the seed for the MPS's default generator.
+void manual_seed(uint64_t seed) {
+  if (is_available()) {
+    auto gen = at::detail::getMPSHooks().getDefaultMPSGenerator();
+    {
+      // See Note [Acquire lock when using random generators]
+      std::lock_guard<std::mutex> lock(gen.mutex());
+      gen.set_current_seed(seed);
+    }
+  }
+}
+
+void synchronize() {
+  TORCH_CHECK(is_available(), "No MPS devices are available");
+  at::detail::getMPSHooks().deviceSynchronize()
+}
+
+} // namespace mps
+} // namespace torch
diff --git a/torch/csrc/mps/Module.cpp b/torch/csrc/mps/Module.cpp
new file mode 100644
index 0000000000000..6d2b4b451b3f1
--- /dev/null
+++ b/torch/csrc/mps/Module.cpp
@@ -0,0 +1,70 @@
+#include <ATen/ATen.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
+#include <ATen/mps/MPSGeneratorImpl.h>
+
+#include <torch/csrc/Generator.h>
+#include <torch/csrc/python_headers.h>
+#include <torch/csrc/utils/pybind.h>
+#include <torch/csrc/utils/python_numbers.h>
+#include <torch/csrc/utils/python_strings.h>
+
+using namespace torch;
+
+static PyObject* MPSModule_initExtension(PyObject* self, PyObject* noargs) {
+#if C10_ASAN_ENABLED
+  TORCH_WARN(
+      "torch.mps: your pytorch binary has address sanitizer (asan) built in, "
+      "asan is currently not compatible with torch.mps module, "
+      "you might get unexpected behavior (eg. out of memory, crash, etc.), "
+      "please rebuild pytorch without asan if you need to use this module");
+#endif
+  HANDLE_TH_ERRORS
+
+  auto m = THPObjectPtr(PyImport_ImportModule("torch.mps"));
+  if (!m)
+    throw python_error();
+
+  auto set_module_attr = [&](const char* name, PyObject* v) {
+    if (PyObject_SetAttrString(m, name, v) < 0) {
+      throw python_error();
+    }
+  };
+
+  // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
+  auto gen = at::mps::detail::getDefaultMPSGenerator();
+  auto default_mps_generator = (THPGenerator*)THPGenerator_initDefaultGenerator(gen);
+  set_module_attr("default_generator", (PyObject*) default_mps_generator);
+
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* MPSModule_isAvailable(PyObject* _unused, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  if (at::detail::getMPSHooks().hasMPS()) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* MPSModule_Synchronize(PyObject* _unused, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  at::detail::getMPSHooks().deviceSynchronize();
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+// NOLINTNEXTLINE(modernize-avoid-c-arrays,
+// cppcoreguidelines-avoid-non-const-global-variables,
+// cppcoreguidelines-avoid-c-arrays)
+static struct PyMethodDef _MPSModule_methods[] = {
+    {"_mps_init", MPSModule_initExtension, METH_NOARGS, nullptr},
+    {"_mps_synchronize", MPSModule_Synchronize, METH_NOARGS, nullptr},
+    {"_is_mps_available", MPSModule_isAvailable, METH_NOARGS, nullptr},
+};
+
+PyMethodDef* MPSModule_methods() {
+  return _MPSModule_methods;
+}
\ No newline at end of file
diff --git a/torch/mps/__init__.py b/torch/mps/__init__.py
new file mode 100644
index 0000000000000..16d9ea209a245
--- /dev/null
+++ b/torch/mps/__init__.py
@@ -0,0 +1,79 @@
+r"""
+This unit enables an interface for accessing MPS backend in python
+"""
+import torch
+from functools import lru_cache
+import threading
+from .. import Tensor
+import torch._C
+
+_initialized = False
+_initialization_lock = threading.Lock()
+default_generator: torch._C.Generator = ()  # type: ignore[assignment]
+
+def init():
+    r"""Initialize PyTorch's MPS state.
+    Does nothing if the MPS state is already initialized.
+    """
+    _lazy_init()
+
+def _lazy_init():
+    global _initialized
+    if is_initialized():
+        return
+    with _initialization_lock:
+        if is_initialized():
+            return
+        if not hasattr(torch._C, '_mps_init'):
+            raise AssertionError("Torch not compiled with MPS enabled")
+        torch._C._mps_init()
+        _initialized = True
+
+@lru_cache()
+def is_available() -> bool:
+    r"""Returns a bool indicating if MPS is currently available."""
+    return torch._C._is_mps_available()
+
+def synchronize() -> None:
+    r"""Waits for all kernels in all streams on a MPS device to complete."""
+    _lazy_init()
+    return torch._C._mps_synchronize()
+
+
+def get_rng_state() -> Tensor:
+    r"""Returns the random number generator state as a ByteTensor."""
+    _lazy_init()
+    return default_generator.get_state()
+
+def set_rng_state(new_state: Tensor) -> None:
+    r"""Sets the random number generator state.
+    Args:
+        new_state (torch.ByteTensor): The desired state
+    """
+    _lazy_init()
+    new_state_copy = new_state.clone(memory_format=torch.contiguous_format)
+    default_generator.set_state(new_state_copy)
+
+def manual_seed(seed: int) -> None:
+    r"""Sets the seed for generating random numbers
+    Args:
+        seed (int): The desired seed.
+    """
+    if not is_available():
+        return
+    _lazy_init()
+    seed = int(seed)
+    default_generator.manual_seed(seed)
+
+def seed() -> None:
+    r"""Sets the seed for generating random numbers to a random number."""
+    _lazy_init()
+    default_generator.seed()
+
+def is_initialized():
+    r"""Returns whether PyTorch's MPS state has been initialized."""
+    return _initialized
+
+__all__ = [
+    'default_generator', 'get_rng_state', 'is_available', 'manual_seed',
+    'seed', 'set_rng_state', 'synchronize', 'init', 'is_initialized']
diff --git a/torch/random.py b/torch/random.py
index f5156bf48730d..bdddfbbd1b39f 100644
--- a/torch/random.py
+++ b/torch/random.py
@@ -39,6 +39,9 @@ def manual_seed(seed) -> torch._C.Generator:
     if not torch.cuda._is_in_bad_fork():
         torch.cuda.manual_seed_all(seed)
 
+    import torch.mps
+    torch.mps.manual_seed(seed)
+
     return default_generator.manual_seed(seed)
 
 
@@ -52,6 +55,9 @@ def seed() -> int:
     if not torch.cuda._is_in_bad_fork():
         torch.cuda.manual_seed_all(seed)
 
+    import torch.mps
+    torch.mps.manual_seed(seed)
+
     return seed
 
 
From a6b4bc54f8325af0110406acf31c9bd5ab94ed38 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 24 Jan 2023 12:37:40 -0800
Subject: [PATCH 1902/1922] Enable MPS CI runners (#252)

* Test MPS CI runners

* Cherry pick remaining files

* Enable lintrunner:

* Change lint  runner

* Retrigger checks

* Retrigger checks #2

* Retrigger checks #3

* Retrigger checks #4

* Retrigger checks #5

* Retrigger checks #5

* Retrigger checks #7

* Retrigger checks #8

* Retrigger checks #9

* Retrigger checks #9 (change arch to arm)

* Retrigger checks #10

* Retrigger checks #11

* Retrigger checks #12

* Retrigger checks #13

* Retrigger checks #14

* Retrigger checks #14

* Retrigger checks #15

* Retrigger checks #16

* Retrigger checks #16

* Retrigger checks #17

* Retrigger checks #19

* Retrigger checks #20

* Retrigger checks #21

* Fix lintrunner

* Fix lintrunner

* Remove lint.json
---
 .github/actionlint.yaml              |   1 +
 .github/workflows/_mac-build.yml     |   4 +-
 .github/workflows/_mac-test-mps.yml  |  15 +-
 .github/workflows/_mac-test.yml      |   9 +-
 .github/workflows/check-labels.yml   |  44 ----
 .github/workflows/lint.yml           | 318 ++++-----------------------
 .github/workflows/mac-mps.yml        |  19 +-
 .github/workflows/pull.yml           | 315 --------------------------
 .github/workflows/run_torchbench.yml | 105 ---------
 .jenkins/pytorch/common_utils.sh     |   9 +-
 test/test_mps.py                     | 244 ++++++++++----------
 11 files changed, 215 insertions(+), 868 deletions(-)
 delete mode 100644 .github/workflows/check-labels.yml
 delete mode 100644 .github/workflows/pull.yml
 delete mode 100644 .github/workflows/run_torchbench.yml

diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
index ff640de7bde5a..9c5a52153ca0e 100644
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -16,6 +16,7 @@ self-hosted-runner:
     - bm-runner
     - linux.rocm.gpu
     - macos-m1-12
+    - macos-m1-13
     - macos-12-xl
     - macos-12
     - macos12.3-m1
diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index 5ee909f02c222..ac018b66b9ee7 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -63,8 +63,8 @@ on:
 
 jobs:
   build:
-    # Don't run on forked repos.
-    if: github.repository_owner == 'pytorch'
+    # # Don't run on forked repos.
+    # if: github.repository_owner == 'pytorch'
     runs-on: ${{ inputs.runner-type }}
     env:
       # For sccache access (only on non-forked PRs)
diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index 24203e0051538..5fac3126e20d5 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -14,11 +14,16 @@ on:
         description: |
           If this is set, our linter will use this to make sure that every other
           job with the same `sync-tag` is identical.
+      runs-on:
+        required: false
+        type: string
+        default: "macos-m1-12"
+        description: Hardware to run tests on
 
 jobs:
   run_mps_test:
     name: "Run MPS tests"
-    runs-on: macos-m1-12
+    runs-on: ${{ inputs.runs-on }}
     steps:
       - name: Checkout PyTorch
         uses: malfet/checkout@silent-checkout
@@ -77,6 +82,12 @@ jobs:
 
           ${CONDA_RUN} python3 test/run_test.py --mps --verbose
 
+      - name: Print remaining test logs
+        shell: bash
+        if: always()
+        run: |
+          cat test/**/*.log || true
+
       - name: Get workflow job id
         id: get-job-id
         uses: ./.github/actions/get-workflow-job-id
@@ -86,7 +97,7 @@ jobs:
 
       - name: Upload test artifacts
         uses: ./.github/actions/upload-test-artifacts
-        if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')
+        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
         with:
           use-gha: true
           file-suffix: ${{ github.job }}-mps-1-1-macos-m1-12_${{ steps.get-job-id.outputs.job-id }}
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index cbc3372e1c42b..39236a0dd0828 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -106,6 +106,7 @@ jobs:
 
       - name: Start monitoring script
         id: monitor-script
+        continue-on-error: true
         run: |
           ${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
@@ -150,6 +151,12 @@ jobs:
           ${CONDA_RUN} python3 -mpip install --no-index --no-deps $(echo dist/*.whl)
           ${CONDA_RUN} .jenkins/pytorch/macos-test.sh
 
+      - name: Print remaining test logs
+        shell: bash
+        if: always()
+        run: |
+          cat test/**/*.log || true
+
       - name: Get workflow job id
         id: get-job-id
         uses: ./.github/actions/get-workflow-job-id
@@ -167,7 +174,7 @@ jobs:
 
       - name: Upload test artifacts
         uses: ./.github/actions/upload-test-artifacts
-        if: always() && (steps.test.conclusion == 'success' || steps.test.conclusion == 'failure')
+        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
         with:
           use-gha: true
           file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml
deleted file mode 100644
index 5fa5fed16daf8..0000000000000
--- a/.github/workflows/check-labels.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: Check Labels
-
-on:
-  pull_request:
-    types: [opened, synchronize, reopened, labeled, unlabeled]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-jobs:
-  check-labels:
-    name: Check labels
-    runs-on: linux.20_04.4x
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
-
-      - name: Install requirements
-        id: requirements
-        run: |
-          pip install -r .github/requirements-gha-cache.txt --user
-
-      - name: Check labels
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          PR_NUM: ${{ github.event.number }}
-        run: |
-          set -ex
-          python3 .github/scripts/check_labels.py "${PR_NUM}"
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 1f47e1defc2fc..0b846bc5a90fa 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -5,14 +5,13 @@ on:
   push:
     branches:
       - master
-      - main
-      - release/*
-      - landchecks/*
   workflow_dispatch:
 
+# The names of steps that actually test the code should be suffixed with `(nonretryable)`.
+# When any other step fails, it's job will be retried once by retryBot.
 jobs:
   lintrunner:
-    runs-on: linux.20_04.16x
+    runs-on: macos-m1-12
     steps:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
@@ -20,299 +19,64 @@ jobs:
           submodules: false
           fetch-depth: 1
 
-      - name: Setup Python
-        uses: actions/setup-python@v4
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
         with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
+          python-version: 3.9
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
+          # pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
 
       - name: Install requirements
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
         run: |
-          pip install -r .github/requirements-gha-cache.txt --user
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} python3 -m pip install --force-reinstall -r .github/requirements-gha-cache.txt
 
       - name: Initialize lint dependencies
-        run: lintrunner init
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} lintrunner init
 
       - name: Do build steps necessary for linters
-        run: |
-          python3 -m tools.linter.clang_tidy.generate_build_files
-          python3 -m tools.generate_torch_version --is_debug=false
-          python3 -m tools.pyi.gen_pyi \
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} python3 -m tools.linter.clang_tidy.generate_build_files
+          ${CONDA_RUN} python3 -m tools.generate_torch_version --is_debug=false
+          ${CONDA_RUN} python3 -m tools.pyi.gen_pyi \
             --native-functions-path aten/src/ATen/native/native_functions.yaml \
             --tags-path aten/src/ATen/native/tags.yaml \
             --deprecated-functions-path "tools/autograd/deprecated.yaml"
 
-      - name: Run lintrunner on all files
+      - name: Run lintrunner on all MPS files (nonretryable)
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
         run: |
+          # shellcheck disable=SC1090
+          set -ex
           set +e
-          if ! lintrunner --force-color --all-files --tee-json=lint.json; then
+          if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py; then
               echo ""
               echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
               echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
               exit 1
           fi
 
-      - name: Store annotations
-        if: always() && github.event_name == 'pull_request'
-        # Don't show this as an error; the above step will have already failed.
-        continue-on-error: true
-        run: |
-          # Use jq to massage the JSON lint output into GitHub Actions workflow commands.
-          jq --raw-output \
-            '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
-            lint.json
-
-  quick-checks:
-    name: quick-checks
-    runs-on: linux.20_04.4x
-    steps:
-      # [see note: pytorch repo ref]
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.x'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-      - name: Install requirements
-        id: requirements
-        run: pip install -r requirements.txt --user
-      - name: Ensure no non-breaking spaces
-        if: always()
-        run: |
-          # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2
-          # does not support the '\u000a' syntax (which is relevant for local linters)
-          (! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
-      - name: Ensure cross-OS compatible file names
-        if: always()
-        run: |
-          (! git ls-files | grep -E '([<>:"|?*]|[ .]$)' || (echo "The above file names are not valid across all operating systems. Please ensure they don't contain the characters '<>:""|?*' and don't end with a white space or a '.' "; false))
-      - name: Ensure no versionless Python shebangs
-        if: always()
-        run: |
-          (! git --no-pager grep -In '#!.*python$' -- . || (echo "The above lines have versionless Python shebangs; please specify either python2 or python3"; false))
-      - name: C++ docs check
-        if: ${{ always() && steps.requirements.outcome == 'success' }}
-        run: |
-          sudo apt-get install -y doxygen
-          cd docs/cpp/source && ./check-doxygen.sh
-      - name: CUDA kernel launch check
-        if: ${{ always() && steps.requirements.outcome == 'success' }}
-        run: |
-          set -eux
-          python torch/testing/_internal/check_kernel_launches.py |& tee "${GITHUB_WORKSPACE}"/cuda_kernel_launch_checks.txt
-
-  pr-sanity-checks:
-    name: pr-sanity-checks
-    runs-on: linux.20_04.4x
-    # Only run this on pull requests
-    if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks')
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: -1
-      - name: PR size check
-        env:
-          BASE: ${{ github.event.pull_request.base.sha }}
-          HEAD: ${{ github.event.pull_request.head.sha }}
-        run: |
-          bash .github/scripts/pr-sanity-check.sh
-
-  workflow-checks:
-    name: workflow-checks
-    runs-on: linux.20_04.4x
-    steps:
-      # [see note: pytorch repo ref]
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.x'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-            **/.github/requirements-gha-cache.txt
-      - name: Install requirements
-        id: requirements
-        run: |
-          pip install -r requirements.txt --user
-      - name: Install Jinja2
-        run: |
-          pip install Jinja2==3.0.1 --user
-      - name: Regenerate workflows
-        id: generate_workflows
-        run: .github/scripts/generate_ci_workflows.py
-      - name: Assert that regenerating the workflows didn't change them
-        run: |
-          if ! .github/scripts/report_git_status.sh .github/workflows; then
-            echo
-            echo 'As shown by the above diff, the committed .github/workflows'
-            echo 'are not up to date according to .github/templates.'
-            echo 'Please run this command, commit, and push again to your PR:'
-            echo
-            echo '    .github/scripts/generate_ci_workflows.py'
-            echo
-            echo 'If running that command does nothing, you may need to rebase'
-            echo 'onto a more recent commit from the PyTorch master branch.'
-            false
-          fi
-      - name: Check that jobs will be cancelled
-        if: ${{ always() && steps.generate_workflows.outcome == 'success' }}
-        run: |
-          .github/scripts/ensure_actions_will_cancel.py
-
-  toc:
-    name: toc
-    runs-on: linux.20_04.4x
-    # https://github.com/actions/virtual-environments/issues/599#issuecomment-602754687
-    env:
-      NPM_CONFIG_PREFIX: ~/.npm-global
-    steps:
-      # [see note: pytorch repo ref]
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-      # This is not a node project so there is no package-lock.json to cache
-      - name: Setup Node
-        uses: actions/setup-node@v3
-      - name: Install markdown-toc
-        run: npm install -g markdown-toc
-      - name: Regenerate ToCs and check that they didn't change
-        run: |
-          set -eu
-          export PATH=~/.npm-global/bin:"$PATH"
-          for FILE in $(git grep -Il '<!-- toc -->' -- '**.md'); do
-            markdown-toc --bullets='-' -i "$FILE"
-          done
-
-          if ! .github/scripts/report_git_status.sh .; then
-            echo
-            echo 'As shown by the above diff, the table of contents in one or'
-            echo 'more Markdown files is not up to date with the file contents.'
-            echo 'You can either apply that Git diff directly to correct the'
-            echo 'table of contents, or if you have npm installed, you can'
-            echo 'install the npm package markdown-toc and run the following'
-            # shellcheck disable=SC2016
-            echo 'command (replacing $FILE with the filename for which you want'
-            echo 'to regenerate the table of contents):'
-            echo
-            # shellcheck disable=SC2016
-            echo "    markdown-toc --bullets='-' -i \"\$FILE\""
-            false
-          fi
-
-  test-tools:
-    name: Test tools
-    if: ${{ github.repository == 'pytorch/pytorch' }}
-    runs-on: linux.20_04.4x
-    steps:
-      # [see note: pytorch repo ref]
-      # deep clone (fetch-depth 0) required, to allow us to use git log
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-            **/requirements-flake8.txt
-            **/.circleci/docker/requirements-ci.txt
-            **/.github/requirements-gha-cache.txt
-      - name: Install dependencies
-        # mypy and boto3 versions copied from
-        # .circleci/docker/common/install_conda.sh
-        run: |
-          set -eux
-          pip install -r requirements.txt
-          pip install boto3==1.19.12
-          pip install typing-extensions==3.10 --user
-          pip install -r requirements-flake8.txt --user
-          pip install rockset==0.8.10 --user
-          pip install -r requirements.txt --user
-          pip install mypy==0.960 --user
-          make setup_lint
-      - name: Test tools
-        run: |
-          python3 -m unittest discover -vs tools/test -p 'test_*.py'
-          python3 -m unittest discover -vs .github/scripts -p 'test_*.py'
-
-  test_collect_env:
-    if: ${{ github.repository == 'pytorch/pytorch' }}
-    name: Test collect_env
-    runs-on: linux.20_04.4x
-    strategy:
-      matrix:
-        test_type: [with_torch, without_torch, older_python_version]
-    steps:
-      # [see note: pytorch repo ref]
-      # deep clone (fetch-depth 0) required, to allow us to use git log
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-      - name: Setup Python 3.5
-        if: matrix.test_type == 'older_python_version'
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.5'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-      - name: Setup Python 3.8
-        if: matrix.test_type != 'older_python_version'
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-      - name: Install torch
-        if: matrix.test_type == 'with_torch'
-        run: |
-          pip install -r requirements.txt
-          # Doesn't really matter what torch version, we just need ANY torch installed
-          pip install 'torch==1.*'
-      - name: Run collect_env.py
-        run: |
-          # All we need to see is that it passes
-          python3 torch/utils/collect_env.py
-
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml
index 5df7299cc5076..a2ca4867fd76b 100644
--- a/.github/workflows/mac-mps.yml
+++ b/.github/workflows/mac-mps.yml
@@ -1,10 +1,11 @@
 name: Mac MPS
 
 on:
-  push:
-    tags:
-      - ciflow/mps/*
-  workflow_dispatch:
+  # push:
+  #   tags:
+  #     - ciflow/mps/*
+  # workflow_dispatch:
+  pull_request:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -18,7 +19,7 @@ jobs:
       sync-tag: macos-12-py3-arm64-build
       build-environment: macos-12-py3-arm64
       xcode-version: "13.3.1"
-      runner-type: macos-12-xl
+      runner-type: macos-m1-13
       build-generates-artifacts: true
       # To match the one pre-installed in the m1 runners
       python_version: 3.9.12
@@ -37,3 +38,11 @@ jobs:
     with:
       sync-tag: macos-12-py3-arm64-mps-test
       build-environment: macos-12-py3-arm64
+
+  macos-13-py3-arm64-mps-test:
+    name: macos-13-py3-arm64-mps
+    uses: ./.github/workflows/_mac-test-mps.yml
+    needs: macos-12-py3-arm64-build
+    with:
+      build-environment: macos-12-py3-arm64
+      runs-on: macos-m1-13
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
deleted file mode 100644
index 3642c7fc17691..0000000000000
--- a/.github/workflows/pull.yml
+++ /dev/null
@@ -1,315 +0,0 @@
-name: pull
-
-on:
-  pull_request:
-  push:
-    branches:
-      - master
-      - main
-      - release/*
-      - landchecks/*
-  workflow_dispatch:
-  schedule:
-    - cron: 29 8 * * *  # about 1:29am PDT
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-jobs:
-  linux-focal-py3_7-gcc7-build:
-    name: linux-focal-py3.7-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.7-gcc7
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "docs_test", shard: 1, num_shards: 1,  runner: "linux.2xlarge" },
-          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-focal-py3_7-gcc7-test:
-    name: linux-focal-py3.7-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_7-gcc7-build
-    with:
-      build-environment: linux-focal-py3.7-gcc7
-      docker-image: ${{ needs.linux-focal-py3_7-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_7-gcc7-build.outputs.test-matrix }}
-
-  linux-docs:
-    name: linux-docs
-    uses: ./.github/workflows/_docs.yml
-    needs: linux-focal-py3_7-gcc7-build
-    with:
-      build-environment: linux-focal-py3.7-gcc7
-      docker-image: ${{ needs.linux-focal-py3_7-gcc7-build.outputs.docker-image }}
-
-  linux-focal-py3_7-gcc7-no-ops:
-    name: linux-focal-py3.7-gcc7-no-ops
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.7-gcc7-no-ops
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
-
-  linux-focal-py3_7-gcc7-pch:
-    name: linux-focal-py3.7-gcc7-pch
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.7-gcc7-pch
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
-
-  linux-focal-py3_7-clang7-asan-build:
-    name: linux-focal-py3.7-clang7-asan
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.7-clang7-asan
-      docker-image-name: pytorch-linux-focal-py3-clang7-asan
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 5, runner: "linux.2xlarge" },
-          { config: "default", shard: 3, num_shards: 5, runner: "linux.2xlarge" },
-          { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-focal-py3_7-clang7-asan-test:
-    name: linux-focal-py3.7-clang7-asan
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_7-clang7-asan-build
-    with:
-      build-environment: linux-focal-py3.7-clang7-asan
-      docker-image: ${{ needs.linux-focal-py3_7-clang7-asan-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_7-clang7-asan-build.outputs.test-matrix }}
-
-  linux-focal-py3_7-clang10-onnx-build:
-    name: linux-focal-py3.7-clang10-onnx
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.7-clang10-onnx
-      docker-image-name: pytorch-linux-focal-py3-clang10-onnx
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-        ]}
-
-  linux-focal-py3_7-clang10-onnx-test:
-    name: linux-focal-py3.7-clang10-onnx
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_7-clang10-onnx-build
-    with:
-      build-environment: linux-focal-py3.7-clang10-onnx
-      docker-image: ${{ needs.linux-focal-py3_7-clang10-onnx-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_7-clang10-onnx-build.outputs.test-matrix }}
-
-  linux-bionic-py3_7-clang9-build:
-    name: linux-bionic-py3.7-clang9
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-py3.7-clang9
-      docker-image-name: pytorch-linux-bionic-py3.7-clang9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-bionic-py3_7-clang9-test:
-    name: linux-bionic-py3.7-clang9
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_7-clang9-build
-    with:
-      build-environment: linux-bionic-py3.7-clang9
-      docker-image: ${{ needs.linux-bionic-py3_7-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_7-clang9-build.outputs.test-matrix }}
-
-  linux-vulkan-bionic-py3_7-clang9-build:
-    name: linux-vulkan-bionic-py3.7-clang9
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-vulkan-bionic-py3.7-clang9
-      docker-image-name: pytorch-linux-bionic-py3.7-clang9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-vulkan-bionic-py3_7-clang9-test:
-    name: linux-vulkan-bionic-py3.7-clang9
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-vulkan-bionic-py3_7-clang9-build
-    with:
-      build-environment: linux-vulkan-bionic-py3.7-clang9
-      docker-image: ${{ needs.linux-vulkan-bionic-py3_7-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-vulkan-bionic-py3_7-clang9-build.outputs.test-matrix }}
-
-  linux-bionic-cuda11_6-py3_10-gcc7-build:
-    name: linux-bionic-cuda11.6-py3.10-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_6-py3_10-gcc7-test:
-    name: linux-bionic-cuda11.6-py3.10-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_6-py3_10-gcc7-build
-    with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7
-      docker-image: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_6-py3_10-gcc7-build.outputs.test-matrix }}
-
-  linux-focal-py3-clang7-mobile-build:
-    name: linux-focal-py3-clang7-mobile-build
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3-clang7-mobile-build
-      docker-image-name: pytorch-linux-focal-py3-clang7-asan
-      build-generates-artifacts: false
-
-  linux-jammy-cuda-11_6-cudnn8-py3_8-clang12-build:
-    name: linux-jammy-cuda11.6-cudnn8-py3.8-clang12
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-jammy-cuda11.6-cudnn8-py3.8-clang12
-      docker-image-name: pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12
-
-  linux-focal-py3-clang7-mobile-custom-build-static:
-    name: linux-focal-py3-clang7-mobile-custom-build-static
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3-clang7-mobile-custom-build-static
-      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-      build-generates-artifacts: false
-
-  linux-bionic-py3_7-clang8-xla-build:
-    name: linux-bionic-py3_7-clang8-xla
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-py3_7-clang8-xla
-      docker-image-name: xla_base
-      test-matrix: |
-        { include: [
-          { config: "xla", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-bionic-py3_7-clang8-xla-test:
-    name: linux-bionic-py3_7-clang8-xla
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_7-clang8-xla-build
-    with:
-      build-environment: linux-bionic-py3_7-clang8-xla
-      docker-image: ${{ needs.linux-bionic-py3_7-clang8-xla-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_7-clang8-xla-build.outputs.test-matrix }}
-
-  win-vs2019-cpu-py3-build:
-    name: win-vs2019-cpu-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cpu-py3
-      cuda-version: cpu
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "windows.4xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "windows.4xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  win-vs2019-cpu-py3-test:
-    name: win-vs2019-cpu-py3
-    uses: ./.github/workflows/_win-test.yml
-    needs: win-vs2019-cpu-py3-build
-    with:
-      build-environment: win-vs2019-cpu-py3
-      cuda-version: cpu
-      test-matrix: ${{ needs.win-vs2019-cpu-py3-build.outputs.test-matrix }}
-
-  win-vs2019-cuda11_6-py3-build:
-    if: github.event_name == 'pull_request'
-    name: win-vs2019-cuda11.6-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cuda11.6-py3
-      cuda-version: "11.6"
-      sync-tag: win-cuda-build
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.8xlarge.nvidia.gpu" },
-          { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  linux-bionic-cuda11_6-py3_10-gcc7-bazel-test:
-    name: linux-bionic-cuda11.6-py3.10-gcc7-bazel-test
-    uses: ./.github/workflows/_bazel-build-test.yml
-    with:
-      build-environment: linux-bionic-cuda11.6-py3.10-gcc7-bazel-test
-      docker-image-name: pytorch-linux-bionic-cuda11.6-cudnn8-py3-gcc7
-
-  linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single:
-    name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single
-    uses: ./.github/workflows/_android-build-test.yml
-    with:
-      build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single
-      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-
-  linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit:
-    name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit
-    uses: ./.github/workflows/_android-build-test.yml
-    with:
-      build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit
-      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-
-  linux-focal-py3_7-gcc7-mobile-lightweight-dispatch-build:
-    name: linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.7-gcc7-mobile-lightweight-dispatch-build
-      docker-image-name: pytorch-linux-focal-py3.7-gcc7
-      build-generates-artifacts: false
-
-  linux-focal-rocm5_2-py3_8-build:
-    # don't run build twice on master
-    if: github.event_name == 'pull_request'
-    name: linux-focal-rocm5.2-py3.8
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-rocm5.2-py3.8
-      docker-image-name: pytorch-linux-focal-rocm5.2-py3.8
-      sync-tag: rocm-build
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
-        ]}
diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml
deleted file mode 100644
index b6c870fa7839d..0000000000000
--- a/.github/workflows/run_torchbench.yml
+++ /dev/null
@@ -1,105 +0,0 @@
-name: TorchBench CI (pytorch-linux-py3.8-cu116)
-on:
-  pull_request:
-
-env:
-  PYTHON_VERSION: "3.8"
-  # must be consistent with https://github.com/pytorch/benchmark/blob/main/requirements.txt#L19
-  NUMPY_VERSION: "1.21.2"
-  SETUP_SCRIPT: "/data/nvme/bin/setup_instance.sh"
-  PR_NUM: ${{ github.event.number }}
-  PR_BODY: ${{ github.event.pull_request.body }}
-  PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
-  PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
-
-jobs:
-  run-torchbench:
-    # We don't accept running on non-pytorch repos because of security concerns
-    # Only run the job when the body contains magic word "RUN_TORCHBENCH:"
-    if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.body, 'RUN_TORCHBENCH:') }}
-    runs-on: [self-hosted, bm-runner]
-    # Set to 12 hours
-    timeout-minutes: 720
-    steps:
-      - name: Checkout PyTorch
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          path: pytorch
-      - name: Update self-hosted PyTorch
-        run: |
-          pushd "${HOME}"/pytorch
-          git remote prune origin
-          git fetch
-          popd
-      - name: Create conda environment and install deps
-        run: |
-          conda create -y -n pr-ci python="${PYTHON_VERSION}"
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          # pin cmake version to 3.22 since 3.23 breaks pytorch build
-          # see details at: https://github.com/pytorch/pytorch/issues/74985
-          conda install -y numpy="${NUMPY_VERSION}" requests ninja pyyaml mkl mkl-include \
-                           setuptools cmake=3.22 cffi typing_extensions boto3 \
-                           future six dataclasses pillow pytest tabulate gitpython git-lfs tqdm psutil
-          pip install --pre torch torchvision torchtext -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html
-      - name: Setup TorchBench branch
-        run: |
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          PR_BODY_FILE=/tmp/pr-body.txt
-          echo "$PR_BODY" > ${PR_BODY_FILE}
-          python pytorch/.github/scripts/run_torchbench.py --pr-body "${PR_BODY_FILE}" set-torchbench-branch
-      - name: Checkout TorchBench
-        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
-        with:
-          repository: pytorch/benchmark
-          path: benchmark
-          lfs: false
-          ref: ${{ env.TORCHBENCH_BRANCH }}
-      - name: GPU Info
-        run: |
-          nvidia-smi
-      - name: Run TorchBench
-        run: |
-          set -x
-          pushd "${HOME}"/pytorch
-          PR_MERGE_BASE=$(git merge-base "$PR_BASE_SHA" "$PR_HEAD_SHA")
-          popd
-          PR_BODY_FILE=/tmp/pr-body.txt
-          echo "$PR_BODY" > ${PR_BODY_FILE}
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          python3 pytorch/.github/scripts/run_torchbench.py \
-                  --pr-body "$PR_BODY_FILE" \
-                  run \
-                  --pytorch-path "${HOME}"/pytorch \
-                  --torchbench-path "${PWD}"/benchmark \
-                  --pr-num "$PR_NUM" \
-                  --pr-base-sha "$PR_MERGE_BASE" \
-                  --pr-head-sha "$PR_HEAD_SHA"
-      - name: Upload result to S3
-        run: |
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          python3 pytorch/.github/scripts/run_torchbench.py \
-                  upload-s3 \
-                  --result-dir "${HOME}/.torchbench/bisection/pr${{ github.event.number }}"
-      - name: Remove conda environment and cleanup
-        run: |
-          conda env remove --name pr-ci
-          rm /tmp/pr-body.txt
-      - name: Upload artifact
-        uses: actions/upload-artifact@v3
-        with:
-          name: TorchBench result
-          path: ~/.torchbench/bisection/pr${{ github.event.number }}
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh
index 6d3c96b9278f7..6060a7179e0f9 100644
--- a/.jenkins/pytorch/common_utils.sh
+++ b/.jenkins/pytorch/common_utils.sh
@@ -145,8 +145,7 @@ function install_triton() {
 }
 
 function setup_torchdeploy_deps(){
-  conda install -y cmake
-  conda install -y -c conda-forge libpython-static=3.10
+  conda install -y -n "py_${ANACONDA_PYTHON_VERSION}" "libpython-static=${ANACONDA_PYTHON_VERSION}"
   local CC
   local CXX
   CC="$(which gcc)"
@@ -158,10 +157,12 @@ function setup_torchdeploy_deps(){
 
 function checkout_install_torchdeploy() {
   local commit
+  commit=$(get_pinned_commit multipy)
   setup_torchdeploy_deps
   pushd ..
   git clone --recurse-submodules https://github.com/pytorch/multipy.git
   pushd multipy
+  git checkout "${commit}"
   python multipy/runtime/example/generate_examples.py
   pip install -e . --install-option="--cudatests"
   popd
@@ -197,7 +198,9 @@ function checkout_install_torchbench() {
   git clone https://github.com/pytorch/benchmark torchbench
   pushd torchbench
   git checkout no_torchaudio
-  python install.py
+  # Occasionally the installation may fail on one model but it is ok to continue
+  # to install and test other models
+  python install.py --continue_on_fail
   popd
 }
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 9754bafc11446..0334f8c4eb171 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -43,7 +43,7 @@
 import numpy as np
 import torch
 import torch.utils._pytree as pytree
-from itertools import permutations, product
+from itertools import product
 
 
 # Copied from `test_ops.py` for the purposes of duplicating `test_numpy_ref`
@@ -254,11 +254,11 @@ def test_exp1(self, device="mps", dtype=torch.float):
         output = torch.exp(input).to('cpu')
 
     def test_exp_strided_output(self):
-        x = torch.rand((256,10), device='mps')
+        x = torch.rand((256, 10), device='mps')
         x_cpu = x.to("cpu")
 
-        x = x.permute(1,0)
-        x_cpu = x_cpu.permute(1,0)
+        x = x.permute(1, 0)
+        x_cpu = x_cpu.permute(1, 0)
 
         res = x.exp()
         res_cpu = x_cpu.exp()
@@ -1075,8 +1075,8 @@ def test_norm(self):
         res_cpu = torch.norm(b_cpu, float('inf'))
         self.assertEqual(res, res_cpu)
 
-        c = torch.tensor([[1, 2, 3],[-1, 1, 4]] , dtype=torch.float, device="mps")
-        c_cpu = torch.tensor([[1, 2, 3],[-1, 1, 4]] , dtype=torch.float, device="cpu")
+        c = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="mps")
+        c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="cpu")
 
         res = torch.norm(c, dim=0)
         res_cpu = torch.norm(c_cpu, dim=0)
@@ -1093,8 +1093,8 @@ def test_norm(self):
         d = torch.arange(8, dtype=torch.float, device="mps").reshape(2, 2, 2)
         d_cpu = torch.arange(8, dtype=torch.float, device="cpu").reshape(2, 2, 2)
 
-        res = torch.norm(d, dim=(1,2))
-        res_cpu = torch.norm(d_cpu, dim=(1,2))
+        res = torch.norm(d, dim=(1, 2))
+        res_cpu = torch.norm(d_cpu, dim=(1, 2))
         self.assertEqual(res, res_cpu)
 
         res = torch.norm(d[0, :, :]), torch.norm(d[1, :, :])
@@ -1614,13 +1614,13 @@ def test_expand_cpu_to_mps_copy(self):
     def test_cpu_to_strided_mps_copy(self):
         # https://github.com/pytorch/pytorch/issues/86975
 
-        a1 = torch.Tensor([[1,2],[3,4], [5,6]]).to(torch.device("mps"))
+        a1 = torch.Tensor([[1, 2], [3, 4], [5, 6]]).to(torch.device("mps"))
         b1 = torch.Tensor([-1, -1])
-        a1[1:,1] = b1
+        a1[1:, 1] = b1
 
-        a2 = torch.Tensor([[1,2],[3,4], [5,6]]).to(torch.device("mps"))
+        a2 = torch.Tensor([[1, 2], [3, 4], [5, 6]]).to(torch.device("mps"))
         b2 = torch.Tensor([-1, -1]).to(torch.device("mps"))
-        a2[1:,1] = b2
+        a2[1:, 1] = b2
 
         self.assertEqual(a1, a2)
 
@@ -1628,8 +1628,8 @@ def test_slice_reshape(self):
         x = torch.randn([1, 6, 4, 2], dtype=torch.float, device="mps")
         x_cpu = x.detach().clone().to("cpu")
 
-        x = x[:,3:].view(2, 3, 4, 1)
-        x_cpu = x_cpu[:,3:].view(2, 3, 4, 1)
+        x = x[:, 3:].view(2, 3, 4, 1)
+        x_cpu = x_cpu[:, 3:].view(2, 3, 4, 1)
         self.assertEqual(x, x_cpu)
 
         x = x + 2
@@ -2123,13 +2123,13 @@ def helper(x, return_inverse, return_counts):
             result_cpu = torch.unique(cpu_x, return_inverse=return_inverse, return_counts=return_counts)
 
             self.assertEqual(result, result_cpu)
-        helper(torch.tensor([1,2,4,2,1]), False, False)
-        helper(torch.randint(3,(10,)), False, False)
-        helper(torch.randint(3,(10,)), True, False)
-        helper(torch.randint(3,(10,)), False, True)
-        helper(torch.randint(3,(10,)), True, True)
-        helper(torch.randint(3,(1,)), True, True)
-        helper(torch.randint(3,(0,)), True, True)
+        helper(torch.tensor([1, 2, 4, 2, 1]), False, False)
+        helper(torch.randint(3, (10,)), False, False)
+        helper(torch.randint(3, (10,)), True, False)
+        helper(torch.randint(3, (10,)), False, True)
+        helper(torch.randint(3, (10,)), True, True)
+        helper(torch.randint(3, (1,)), True, True)
+        helper(torch.randint(3, (0,)), True, True)
 
     def test_unique_consecutive(self):
         def helper(x, dim, return_inverse, return_counts):
@@ -2140,26 +2140,26 @@ def helper(x, dim, return_inverse, return_counts):
             result_cpu = torch.unique_consecutive(cpu_x, dim=dim, return_inverse=return_inverse, return_counts=return_counts)
 
             self.assertEqual(result, result_cpu)
-        helper(torch.tensor([1,2,4,2,1]), 0, False, False)
-        helper(torch.randint(3,(10,)), 0, False, False)
-        helper(torch.randint(3,(10,)), 0, True, False)
-        helper(torch.randint(3,(10,)), 0, False, True)
-        helper(torch.randint(3,(10,)), 0, True, True)
-        helper(torch.randint(3,(10,)), 0, True, True)
-        helper(torch.randint(3,(1,)), 0, True, True)
-        helper(torch.randint(3,(0,)), 0, True, True)
-
-        helper(torch.tensor([[1,1,2,3,3,2],[1,1,1,2,2,1]]), 0, False, False)
-        helper(torch.tensor([[1,1,2,3,3,2],[1,1,1,2,2,1]]), 0, True, True)
-        helper(torch.randint(2,(20,2)), 0, True, True)
-        helper(torch.randint(2,(1,2)), 0, True, True)
-        helper(torch.randint(2,(0,2)), 0, True, True)
-
-        helper(torch.tensor([[1,1,2,3,3,2],[1,1,1,2,2,1]]), 1, False, False)
-        helper(torch.tensor([[1,1,2,3,3,2],[1,1,1,2,2,1]]), 1, True, True)
-        helper(torch.randint(2,(2,20)), 1, True, True)
-        helper(torch.randint(2,(2,1)), 1, True, True)
-        helper(torch.randint(2,(2,0)), 1, True, True)
+        helper(torch.tensor([1, 2, 4, 2, 1]), 0, False, False)
+        helper(torch.randint(3, (10,)), 0, False, False)
+        helper(torch.randint(3, (10,)), 0, True, False)
+        helper(torch.randint(3, (10,)), 0, False, True)
+        helper(torch.randint(3, (10,)), 0, True, True)
+        helper(torch.randint(3, (10,)), 0, True, True)
+        helper(torch.randint(3, (1,)), 0, True, True)
+        helper(torch.randint(3, (0,)), 0, True, True)
+
+        helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, False, False)
+        helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, True, True)
+        helper(torch.randint(2, (20, 2)), 0, True, True)
+        helper(torch.randint(2, (1, 2)), 0, True, True)
+        helper(torch.randint(2, (0, 2)), 0, True, True)
+
+        helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 1, False, False)
+        helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 1, True, True)
+        helper(torch.randint(2, (2, 20)), 1, True, True)
+        helper(torch.randint(2, (2, 1)), 1, True, True)
+        helper(torch.randint(2, (2, 0)), 1, True, True)
 
     # See https://github.com/pytorch/pytorch/issues/85675
     def test_cat_non_contiguous(self):
@@ -2183,8 +2183,8 @@ def test_from_numpy_non_contiguous(self):
 
     def test_cumsum_all_dtypes(self):
         def helper(dtype):
-            t = torch.tensor([1,1,1,1], device="mps", dtype=dtype)
-            t_cpu = torch.tensor([1,1,1,1], device="cpu")
+            t = torch.tensor([1, 1, 1, 1], device="mps", dtype=dtype)
+            t_cpu = torch.tensor([1, 1, 1, 1], device="cpu")
 
             a = t.cumsum(0, dtype=dtype)
             a_cpu = t_cpu.cumsum(0, dtype=dtype)
@@ -4193,14 +4193,14 @@ def helper(shape, output_size, scales, mode, align_corners=False):
             inputMPS = inputCPU.detach().clone().to('mps').requires_grad_()
 
             # align_corners is used for 2D interpolation only
-            if (align_corners == True and len(shape) > 3 and mode == 'bilinear'):
-                if (scales != None):
+            if (align_corners is True and len(shape) > 3 and mode == 'bilinear'):
+                if scales is not None:
                     outputCPU = nn.functional.interpolate(inputCPU, scale_factor=scales, mode=mode, align_corners=align_corners)
                     outputMPS = nn.functional.interpolate(inputMPS, scale_factor=scales, mode=mode, align_corners=align_corners)
                 else:
                     outputCPU = nn.functional.interpolate(inputCPU, size=output_size, mode=mode, align_corners=align_corners)
                     outputMPS = nn.functional.interpolate(inputMPS, size=output_size, mode=mode, align_corners=align_corners)
-            elif (scales != None):
+            elif scales is not None:
                 outputCPU = nn.functional.interpolate(inputCPU, scale_factor=scales, mode=mode)
                 outputMPS = nn.functional.interpolate(inputMPS, scale_factor=scales, mode=mode)
             else:
@@ -4216,16 +4216,16 @@ def helper(shape, output_size, scales, mode, align_corners=False):
 
         # 1D interpolation
         for mode in ['nearest', 'nearest-exact']:
-            helper([2, 3, 4], [3], None, mode) # downsample with size
-            helper([2, 3, 4], [6], None, mode) # upsample with size
-            helper([2, 3, 4], None, [0.6], mode) # downsample with scale factor
-            helper([2, 3, 4], None, [1.7], mode) # upsample with scale factor
+            helper([2, 3, 4], [3], None, mode)  # downsample with size
+            helper([2, 3, 4], [6], None, mode)  # upsample with size
+            helper([2, 3, 4], None, [0.6], mode)  # downsample with scale factor
+            helper([2, 3, 4], None, [1.7], mode)  # upsample with scale factor
         # 2D interpolation
         for mode in ['nearest', 'nearest-exact', 'bilinear']:
-            helper([2, 3, 4, 5], [3, 4], None, mode) # downsample_nearest with size
-            helper([2, 3, 4, 5], [6, 7], None, mode) # upsample_nearest with size
-            helper([2, 3, 4, 5], None, [0.6, 0.7], mode) # downsample_nearest with scale factor
-            helper([2, 3, 4, 5], None, [1.4, 1.7], mode) # upsample_nearest with scale factor
+            helper([2, 3, 4, 5], [3, 4], None, mode)  # downsample_nearest with size
+            helper([2, 3, 4, 5], [6, 7], None, mode)  # upsample_nearest with size
+            helper([2, 3, 4, 5], None, [0.6, 0.7], mode)  # downsample_nearest with scale factor
+            helper([2, 3, 4, 5], None, [1.4, 1.7], mode)  # upsample_nearest with scale factor
         # align_corners=True
         helper([2, 3, 4, 5], [3, 4], None, 'bilinear', True)
         helper([2, 3, 4, 5], None, [1.4, 1.7], 'bilinear', True)
@@ -4358,7 +4358,7 @@ def helper(shape, padding, op, value=0):
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
         # check the workaround for the right padding bug in Monterey
         helper((1, 2, 2, 2, 2), (0, 1), nn.ConstantPad3d)
-		# input size < pad size
+        # input size < pad size
         helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
 
     # Test stack forward
@@ -4642,29 +4642,27 @@ def helper(shape, dim=0):
 
     # # Test softplus
     # def test_softplus(self):
-        # def helper(shape, beta=1, threshold=20):
-            # cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
-            # x = cpu_x.detach().clone().to('mps').requires_grad_()
-
-            # softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x)
-            # softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x)
+    #     def helper(shape, beta=1, threshold=20):
+    #         cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
+    #         x = cpu_x.detach().clone().to('mps').requires_grad_()
 
-            # cpu_grad = torch.randn(softplus_result.shape)
-            # grad = cpu_grad.to('mps')
+    #         softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x)
+    #         softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x)
 
-            # softplus_result.backward(gradient=grad)
-            # softplus_result_cpu.backward(gradient=cpu_grad)
+    #         cpu_grad = torch.randn(softplus_result.shape)
+    #         grad = cpu_grad.to('mps')
 
-            # self.assertEqual(softplus_result, softplus_result_cpu)
-            # self.assertEqual(x.grad, cpu_x.grad)
+    #         softplus_result.backward(gradient=grad)
+    #         softplus_result_cpu.backward(gradient=cpu_grad)
 
-        # # Test empty shape too
-        # for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
-            # for beta in [0.5, 1, 2, 3, 4]:
-                # for threshold in [0.5, 20, 30, 40, 50]:
-                    # helper(shape, beta, threshold)
+    #         self.assertEqual(softplus_result, softplus_result_cpu)
+    #         self.assertEqual(x.grad, cpu_x.grad)
 
-    # Test silu
+    #     # Test empty shape too
+    #     for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
+    #         for beta in [0.5, 1, 2, 3, 4]:
+    #             for threshold in [0.5, 20, 30, 40, 50]:
+    #                 helper(shape, beta, threshold)
 
     def test_silu(self):
         def helper(shape):
@@ -5314,22 +5312,22 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, reduce_str="
             self.assertEqual(scatter_result, scatter_result_cpu)
 
         # for reduce in ["sum", "prod", "amax", "amin"]:
-        for reduce in ["add", "multiply"]:
+        for reduce_type in ["add", "multiply"]:
             helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce)
-            helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce)
-            helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce)
-            helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce)
-            helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (4, 7, 3, 2), reduce_str=reduce)
-            helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (8, 8, 4, 5), reduce_str=reduce)
+            helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (4, 7, 3, 2), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 0, (4, 6, 3, 2), (8, 8, 4, 5), reduce_str=reduce_type)
 
-            helper((2, 8, 4, 5), 1, (2, 20, 4, 5), (2, 20, 4, 5), reduce_str=reduce)
-            helper((2, 8, 4, 5), 1, (2, 13, 3, 2), (2, 13, 3, 2), reduce_str=reduce)
-            helper((8, 8, 4, 5), 1, (6, 5, 2, 3), (6, 5, 2, 3), reduce_str=reduce)
-            helper((8, 8, 4, 5), 1, (3, 4, 2, 2), (6, 5, 2, 3), reduce_str=reduce)
+            helper((2, 8, 4, 5), 1, (2, 20, 4, 5), (2, 20, 4, 5), reduce_str=reduce_type)
+            helper((2, 8, 4, 5), 1, (2, 13, 3, 2), (2, 13, 3, 2), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 1, (6, 5, 2, 3), (6, 5, 2, 3), reduce_str=reduce_type)
+            helper((8, 8, 4, 5), 1, (3, 4, 2, 2), (6, 5, 2, 3), reduce_str=reduce_type)
 
-            helper((4, 5, 9, 8), 2, (4, 5, 13, 8), (4, 5, 13, 8), reduce_str=reduce)
-            helper((4, 5, 9, 8), 2, (3, 4, 10, 6), (3, 4, 10, 6), reduce_str=reduce)
-            helper((4, 5, 9, 8), 2, (3, 3, 7, 5), (3, 4, 10, 6), reduce_str=reduce)
+            helper((4, 5, 9, 8), 2, (4, 5, 13, 8), (4, 5, 13, 8), reduce_str=reduce_type)
+            helper((4, 5, 9, 8), 2, (3, 4, 10, 6), (3, 4, 10, 6), reduce_str=reduce_type)
+            helper((4, 5, 9, 8), 2, (3, 3, 7, 5), (3, 4, 10, 6), reduce_str=reduce_type)
 
     def test_is_nonzero(self):
         self.assertFalse(torch.is_nonzero(torch.tensor([0.]).to('mps')))
@@ -7111,7 +7109,8 @@ def test_conv_backward_1d_channels_last(self):
         def helper(shape, in_channels=1, out_channels=1, kernel_size=3, groups=1):
             # https://github.com/pytorch/pytorch/issues/84511
             conv_cpu = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups)
-            conv_mps = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).to("mps")
+            conv_mps = torch.nn.Conv1d(
+                in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).to("mps")
             conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_(True)
             conv_mps.bias.data = conv_cpu.bias.data.detach().clone().to("mps").requires_grad_(True)
 
@@ -7229,10 +7228,8 @@ def get_grid(device='cpu', data=None):
 
                     input_mps = input_cpu.detach().transpose(0, 1).to("mps").transpose(0, 1).requires_grad_(input_requires_grad)
                     grid_mps = get_grid('mps', grid_cpu.detach()).requires_grad_()
-                    out_mps = F.grid_sample(input_mps, grid_mps, mode=mode, padding_mode=padding_mode,
-                                                align_corners=align_corners)
+                    out_mps = F.grid_sample(input_mps, grid_mps, mode=mode, padding_mode=padding_mode, align_corners=align_corners)
                     self.assertEqual(out_cpu, out_mps)
-
                     out_mps.backward(gradients.to("mps"))
                     if input_requires_grad:
                         self.assertEqual(input_cpu.grad, input_mps.grad)
@@ -7245,8 +7242,7 @@ def get_grid(device='cpu', data=None):
                                             align_corners=align_corners)
 
                     input_mps = base_input.to("mps").expand_as(input_mps).requires_grad_(input_requires_grad)
-                    out_mps = F.grid_sample(input_mps, grid_mps, mode=mode, padding_mode=padding_mode,
-                                                align_corners=align_corners)
+                    out_mps = F.grid_sample(input_mps, grid_mps, mode=mode, padding_mode=padding_mode, align_corners=align_corners)
                     self.assertEqual(out_cpu, out_mps)
 
             # test same size output
@@ -9449,9 +9445,7 @@ class TestConsistency(TestCase):
         'linalg.cond': [torch.float32],
         'linalg.detsingular': [torch.float32],
         'linalg.det': [torch.float32],
-        'linalg.eig': [torch.float32],
         'linalg.eigh': [torch.float32],
-        'linalg.eigvals': [torch.float32],
         'linalg.eigvalsh': [torch.float32],
         'linalg.householder_product': [torch.float32],
         'linalg.ldl_factor': [torch.float32],
@@ -9512,7 +9506,6 @@ class TestConsistency(TestCase):
         'nn.functional.ctc_loss': [torch.float32],
         'nn.functional.embedding_bag': [torch.float16, torch.float32],
         'nn.functional.max_pool2d': [torch.float32],
-        'nn.functional.max_pool3d': [torch.float32],
         'nn.functional.hardshrink': [torch.float32],
         'nn.functional.hardsigmoid': [torch.float32],
         'nn.functional.logsigmoid': [torch.float32],
@@ -9539,15 +9532,11 @@ class TestConsistency(TestCase):
         'polygammapolygamma_n_2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'polygammapolygamma_n_3': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'polygammapolygamma_n_4': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'qr': [torch.float32],
         'quantile': [torch.float32],
         'remainder': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
         'renorm': [torch.float16, torch.float32],
         'roll': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'rounddecimals_0': [torch.float32],
-        'rounddecimals_3': [torch.float32],
-        'rounddecimals_neg_3': [torch.float32],
         'rsub': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'scatter_reduceamax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'scatter_reduceamin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -9566,12 +9555,36 @@ class TestConsistency(TestCase):
         'special.bessel_j1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'special.bessel_y0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'special.bessel_y1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'special.chebyshev_polynomial_t': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'special.chebyshev_polynomial_u': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.chebyshev_polynomial_t': [torch.bool,
+                                           torch.float16,
+                                           torch.float32,
+                                           torch.int16,
+                                           torch.int32,
+                                           torch.int64,
+                                           torch.uint8],
+        'special.chebyshev_polynomial_u': [torch.bool,
+                                           torch.float16,
+                                           torch.float32,
+                                           torch.int16,
+                                           torch.int32,
+                                           torch.int64,
+                                           torch.uint8],
         'special.entr': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'special.erfcx': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'special.hermite_polynomial_h': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'special.hermite_polynomial_he': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.hermite_polynomial_h': [torch.bool,
+                                         torch.float16,
+                                         torch.float32,
+                                         torch.int16,
+                                         torch.int32,
+                                         torch.int64,
+                                         torch.uint8],
+        'special.hermite_polynomial_he': [torch.bool,
+                                          torch.float16,
+                                          torch.float32,
+                                          torch.int16,
+                                          torch.int32,
+                                          torch.int64,
+                                          torch.uint8],
         'special.i0e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'special.i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'special.i1e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -9604,13 +9617,18 @@ class TestConsistency(TestCase):
 
     EXPECTED_FAILURES = {
         # Failures due to unsupported data types on MPS backend
-        'matmul': [torch.uint8], # MPS device does not support mm for non-float inputs
         'bfloat16': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'chalf': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'nn.functional.conv1d': [torch.int64],
         'nn.functional.conv2d': [torch.int64],
         'nn.functional.conv_transpose1d': [torch.int64],
-        'nn.functional.softminwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.softminwith_dtype': [torch.bool,
+                                            torch.float16,
+                                            torch.float32,
+                                            torch.int16,
+                                            torch.int32,
+                                            torch.int64,
+                                            torch.uint8],
         'log_softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         '__rmatmul__': [torch.int16, torch.int32, torch.uint8],
@@ -9653,8 +9671,7 @@ class TestConsistency(TestCase):
         'linalg.pinv': [torch.float32],
         'linalg.pinvhermitian': [torch.float32],
         'log_softmax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'log_softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'matmul': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'matmul': [torch.int16, torch.int32, torch.int64, torch.uint8],  # MPS device does not support mm for non-float inputs
         'mm': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'mv': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'new_full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -9664,7 +9681,6 @@ class TestConsistency(TestCase):
         'nn.functional.bilinear': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'nn.functional.linear': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'nn.functional.softmin': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'nn.functional.softminwith_dtype': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'ones_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'signal.windows.blackman': [torch.float16],
         'signal.windows.cosine': [torch.float16],
@@ -9675,7 +9691,6 @@ class TestConsistency(TestCase):
         'signal.windows.hamming': [torch.float16],
         'signal.windows.hann': [torch.float16],
         'signal.windows.kaiser': [torch.float16],
-        'softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'stft': [torch.float32],
         'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'zeros_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -9699,7 +9714,7 @@ class TestConsistency(TestCase):
         'nn.functional.dropout': [torch.float32],
         'nn.functional.dropout2d': [torch.float32],
         'nn.functional.dropout3d': [torch.float32],
-         # these fill tensors with uninitialized data, causing mismatch with CPU
+        # these fill tensors with uninitialized data, causing mismatch with CPU
         'new_empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'empty_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -9732,10 +9747,11 @@ class TestConsistency(TestCase):
     with open(filename) as f:
         data = yaml.safe_load(f)
     CUDA_RESULT = dict()
-    for key,value in data.items():
-        CUDA_RESULT[key]= torch.as_tensor(value)
+    for key, value in data.items():
+        CUDA_RESULT[key] = torch.as_tensor(value)
 
-    MPS_SKIP_LIST = reduce(lambda x,y: dict(x, **y), (FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS))
+    MPS_SKIP_LIST = reduce(lambda x, y: dict(x, **y), (
+        FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS))
 
     # Used for accept mode only
     NEW_ALLOW_LIST = defaultdict(list)
@@ -9839,7 +9855,7 @@ def get_samples():
 
             except Exception as e:
                 if any(s in str(e).lower() for s in ["int64", "macos 13"]):
-                  self.skipTest(f"{str(e)}")
+                    self.skipTest(f"{str(e)}")
 
                 if op.name in self.CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol):
                     continue

From 0408b262d337c2cf551ace77dca5d0c509951a56 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 25 Jan 2023 15:26:53 -0800
Subject: [PATCH 1903/1922] Add additional checks for reshaped views (#250)

* Fix view slice reshape

* Don't use arrayViews for reshapedTensors
---
 aten/src/ATen/native/mps/operations/View.mm | 99 ++++++---------------
 test/test_mps.py                            | 11 +++
 2 files changed, 36 insertions(+), 74 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index b68e100dea863..fc6eaf8e40ef8 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -449,28 +449,16 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
   size_t src_ndim_base = src_base_shape.size();
   std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
   size_t src_ndim_view = src_view_shape.size();
-  if (src_ndim_base == src_ndim_view) {
-    for (const auto i: c10::irange(src_ndim_base)) {
-      if (src_view_shape[i] > src_base_shape[i]) {
-        return false;
-      }
-    }
-  } else {
-    // Detect slice followed by reshape cases, e.g (1,4800,2) -> (1,4800)
-    bool allDimsEqual = true;
-    auto min_ndim = std::min(src_ndim_base, src_ndim_view);
-    for (const auto i: c10::irange(min_ndim)) {
-      if (src_view_shape[i] > src_base_shape[i]) {
-        return false;
-      }
-      else if (src_view_shape[i] != src_base_shape[i]) {
-        allDimsEqual = false;
-      }
-    }
-    if (allDimsEqual) {
+  if (src_ndim_base != src_ndim_view) {
+    return false;
+  }
+
+  for (const auto i: c10::irange(src_ndim_base)) {
+    if (src_view_shape[i] > src_base_shape[i]) {
       return false;
     }
   }
+
   return true;
 }
 
@@ -480,71 +468,34 @@ bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape) {
   std::vector<int64_t> src_view_shape = getViewShape(src, mpsShape);
   int src_ndim_view = src_view_shape.size();
 
+  TORCH_CHECK(src_ndim_base == src_ndim_view);
+
   MPSNDArray *srcTensorNDArrayView = nil;
   MPSNDArrayDescriptor *srcTensorNDArrayDesc = nil;
   MPSNDArray *srcTensorNDArray = nil;
   id<MTLCommandBuffer> commandBuffer = getCurrentMPSStream()->commandBuffer();
 
-  if (src_ndim_base == src_ndim_view) {
-    srcTensorNDArray = ndArrayFromTensor(src, getMPSShape(src_base_shape), mpsDataType);
-    srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
-
-    int firstDimToSlice = 0;
-    while (src_base_shape[firstDimToSlice] == src_view_shape[firstDimToSlice]) {
-      firstDimToSlice++;
-    }
+  srcTensorNDArray = ndArrayFromTensor(src, getMPSShape(src_base_shape), mpsDataType);
+  srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
 
-    int view_numel = 1;
-    for (const auto i : c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
-      view_numel *= src_base_shape[i];
-    }
-
-    int sliceOffset = src.storage_offset() / view_numel;
-    // There are cases where both dimensions of a view can shrink
-    // E.g: x = torch.randn((3,6))[1, 1:3]
-    int nextSliceOffset = src.storage_offset() % view_numel;
-
-    [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
-    if (nextSliceOffset) {
-      [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 2 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(nextSliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice+1])}];
-    }
+  int firstDimToSlice = 0;
+  while (src_base_shape[firstDimToSlice] == src_view_shape[firstDimToSlice]) {
+    firstDimToSlice++;
   }
-  else {
-    int src_view_numel = 1;
-    for (const auto i : c10::irange(src_ndim_view)) {
-      src_view_numel *= src_view_shape[i];
-    }
 
-    int idx = 0;
-    int finalShapeSize = (src_ndim_view == 0) ? 1 : src_ndim_view;
-    std::vector<NSNumber*> mpsFinalShape(finalShapeSize);
-
-    // When the shapes are different, we need to flatten the first slice in order to alias the memory without any copies
-    // E.g: base tensor [5, 7, 3], view tensor [7, 3] (storage_offset=21). We need to flatten [5, 7, 3] to [35, 3], then
-    // we can slice directly into the first dimension based on the storage_offset
-    uint32_t flattenedSlice = 1;
-    for (const auto i : c10::irange(src_ndim_base - finalShapeSize + 1)) {
-      flattenedSlice *= src_base_shape[i];
-    }
-    mpsFinalShape[idx++] = [NSNumber numberWithInteger:flattenedSlice];
-
-    for (const auto i : c10::irange(src_ndim_base - finalShapeSize + 1, src_ndim_base)) {
-      mpsFinalShape[idx++] = [NSNumber numberWithInteger:src_base_shape[i]];
-    }
-
-    mpsShape = [NSArray arrayWithObjects:mpsFinalShape.data() count:mpsFinalShape.size()];
-    srcTensorNDArray = ndArrayFromTensor(src, mpsShape, mpsDataType);
-    srcTensorNDArrayDesc = srcTensorNDArray.descriptor;
+  int view_numel = 1;
+  for (const auto i : c10::irange(firstDimToSlice + 1, src_base_shape.size())) {
+    view_numel *= src_base_shape[i];
+  }
 
-    int dim0 = (src_ndim_view == 0) ? 1 : src_view_shape[0];
-    int totalSlices = dim0;
+  int sliceOffset = src.storage_offset() / view_numel;
+  // There are cases where both dimensions of a view can shrink
+  // E.g: x = torch.randn((3,6))[1, 1:3]
+  int nextSliceOffset = src.storage_offset() % view_numel;
 
-    // For 1D arrays, the storage_offset gives directly the
-    // starting point from where the slice should start
-    int sliceOffset = src_ndim_view == 1 ? 1 : dim0;
-    int view_numel = src_ndim_view == 1 ? 1 : src_view_numel;
-    [srcTensorNDArrayDesc sliceDimension:finalShapeSize - 1
-    withSubrange:{static_cast<NSUInteger>((src.storage_offset() / view_numel) * sliceOffset), static_cast<NSUInteger>(totalSlices)}];
+  [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 1 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(sliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice])}];
+  if (nextSliceOffset) {
+    [srcTensorNDArrayDesc sliceDimension:src_ndim_base - 2 - firstDimToSlice withSubrange:{static_cast<NSUInteger>(nextSliceOffset), static_cast<NSUInteger>(src.sizes()[firstDimToSlice+1])}];
   }
 
   srcTensorNDArrayView = [srcTensorNDArray arrayViewWithCommandBuffer:commandBuffer
diff --git a/test/test_mps.py b/test/test_mps.py
index 0334f8c4eb171..e0db76d1737f7 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1624,6 +1624,17 @@ def test_cpu_to_strided_mps_copy(self):
 
         self.assertEqual(a1, a2)
 
+    def test_view_slice_reshape(self):
+        x = torch.randn([1, 4, 4], device="mps")
+        y = x[0, :1, 1:]
+
+        x_cpu = x.to("cpu")
+        y_cpu = x_cpu[0, :1, 1:]
+
+        r = y + 1
+        r_cpu = y_cpu + 1
+        self.assertEqual(r, r_cpu)
+
     def test_slice_reshape(self):
         x = torch.randn([1, 6, 4, 2], dtype=torch.float, device="mps")
         x_cpu = x.detach().clone().to("cpu")

From 940401f6a5eaa3b3d7d8ea7409a5c0ca91c8e775 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 25 Jan 2023 17:38:33 -0800
Subject: [PATCH 1904/1922] Fix blocklist dict search (#255)

* Fix blocklist dict search

* Fix scatter reduce

* Skip unimplemented ops
---
 test/test_mps.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index e0db76d1737f7..440741086bccf 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -5324,7 +5324,7 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, reduce_str="
 
         # for reduce in ["sum", "prod", "amax", "amin"]:
         for reduce_type in ["add", "multiply"]:
-            helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce)
+            helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce_type)
             helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
             helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
             helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce_type)
@@ -9768,16 +9768,16 @@ class TestConsistency(TestCase):
     NEW_ALLOW_LIST = defaultdict(list)
     NEW_ALLOW_LIST_GRAD = defaultdict(list)
 
-    def get_error_message(self, key, op_name):
-        if key in self.FAST_MATH_PRECISION_ISSUES:
+    def get_error_message(self, key, op_name, dtype):
+        if key in self.FAST_MATH_PRECISION_ISSUES and dtype in self.FAST_MATH_PRECISION_ISSUES[key]:
             return f"Running test with {op_name} fails due to precision issues (fast math) so skipping"
-        elif key in self.BLOCKLIST:
+        elif key in self.BLOCKLIST and dtype in self.BLOCKLIST[key]:
             return f"Running test with {op_name} fails so skipping"
-        elif key in self.UNDEFINED_BEHAVIOUR:
+        elif key in self.UNDEFINED_BEHAVIOUR and dtype in self.UNDEFINED_BEHAVIOUR[key]:
             return f"Running test with {op_name} fails due to undefined behaviour / random output so skipping"
-        elif key in self.EXPECTED_FAILURES:
+        elif key in self.EXPECTED_FAILURES and dtype in self.EXPECTED_FAILURES[key]:
             return f"Running test with {op_name} expected to fail due to unsupported MPS data type so skipping"
-        elif key in self.UNIMPLEMENTED_OPS:
+        elif key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]:
             return f"Running test with {op_name} expected to fail due to missing op implementation"
         return f"Running test with {op_name} hangs so skipping"
 
@@ -9798,8 +9798,7 @@ def test_output_match(self, device, dtype, op):
 
         key = op.name + op.variant_test_name
         if key in self.MPS_SKIP_LIST:
-            if self.MPS_SKIP_LIST[key] is None or dtype in self.MPS_SKIP_LIST[key]:
-                self.skipTest(self.get_error_message(key, op.name))
+            self.skipTest(self.get_error_message(key, op.name, dtype))
 
         # Make this an expecttest manually
         # When this env variable is set, generate a new ALLOWLIST_OP
@@ -9948,6 +9947,12 @@ def req_grad(t):
 # Copied from `TestCommon` in `test_ops.py`, just enough to duplicate the `test_numpy_ref` for MPS
 @skipIfSlowGradcheckEnv
 class TestCommon(TestCase):
+
+    UNIMPLEMENTED_OPS = {
+        'aminmax': [torch.float32],
+        'roll': [torch.float32],
+    }
+
     exact_dtype = True
 
     # Verifies, on teardown, that no OpInfo is still using dynamic dtypes in CI
@@ -9978,6 +9983,10 @@ def tearDownClass(cls):
     # MPS only supports float32
     @ops(_ref_test_ops, allowed_dtypes=(torch.float32,))
     def test_numpy_ref_mps(self, device, dtype, op):
+        key = op.name + op.variant_test_name
+        if key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]:
+            self.skipTest(f"Running test with {op.name} expected to fail due to missing op implementation")
+
         # Unlike `test_numpy_ref`, this test compares in `float32` since at the time of this test's creation MPS
         # does not support float64 Tensors.
         # A few ops are currently broken on their reference inputs, but not their sample inputs. These should

From a1e0d789c262fdc0b38020ef5103ed49e646ace6 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Thu, 26 Jan 2023 17:54:32 -0500
Subject: [PATCH 1905/1922] Fix the crash with hardswish_backward (#256)

- Also fix indentation and formatting
---
 .../ATen/native/mps/operations/Activation.mm  | 178 ++++++++----------
 1 file changed, 83 insertions(+), 95 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index b84436bd99f5a..69be087ee2aaf 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -2320,11 +2320,10 @@ Tensor hardswish_mps(const Tensor& self) {
 Tensor hardswish_backward_mps(const Tensor& grad_output, const Tensor& self) {
   using namespace mps;
 
-  if (grad_output.numel() == 0) {
-    return grad_output;
-  }
-
   Tensor grad_input = at::empty_like(self, self.suggest_memory_format());
+  if (grad_input.numel() == 0) {
+    return grad_input;
+  }
 
   struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@@ -2335,113 +2334,102 @@ Tensor hardswish_backward_mps(const Tensor& grad_output, const Tensor& self) {
 
   MPSGraphCache* cache_ = MPSGraphCache::getInstance();
 
-  MPSStream* stream = at::mps::getCurrentMPSStream();
-
   @autoreleasepool {
     string key = "hardswish_backward_mps" + getTensorsStringKey({self});
-    CachedGraph* cachedGraph = static_cast<CachedGraph*>(cache_->LookUp(key));
+    CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
     if (!cachedGraph) {
-      MPSCachedGraph* tmpCachedGraph =
-          cache_->CreateCachedGraph(key, ^MPSCachedGraph*() {
-            CachedGraph* newCachedGraph = nil;
-            @autoreleasepool {
-              MPSGraph* mpsGraph = make_mps_graph();
-              newCachedGraph = new CachedGraph(mpsGraph);
-              MPSGraphTensor* gradOutputTensor =
-                  mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
-              MPSGraphTensor* inputTensor =
-                  mpsGraphRankedPlaceHolder(mpsGraph, self);
-
-              MPSGraphTensor* zeroTensor = [mpsGraph
-                  constantWithScalar:0.0f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* unitTensor = [mpsGraph
-                  constantWithScalar:1.0f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* threeTensor = [mpsGraph
-                  constantWithScalar:3.0f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* negativeThreeTensor = [mpsGraph
-                  constantWithScalar:-3.0f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* halfTensor = [mpsGraph
-                  constantWithScalar:0.5f
-                               shape:@[ @1 ]
-                            dataType:getMPSDataType(grad_output.scalar_type())];
-
-              MPSGraphTensor* tempTensor =
-                  [mpsGraph divisionWithPrimaryTensor:inputTensor
-                                      secondaryTensor:threeTensor
-                                                 name:nil];
-
-              MPSGraphTensor* weightedTensor =
-                  [mpsGraph additionWithPrimaryTensor:tempTensor
-                                      secondaryTensor:halfTensor
-                                                 name:nil];
-
-              MPSGraphTensor* lessThanMinPredicateTensor = [mpsGraph
-                  lessThanOrEqualToWithPrimaryTensor:inputTensor
-                                     secondaryTensor:negativeThreeTensor
-                                                name:nil];
-
-              MPSGraphTensor* lessThanMaxPredicateTensor =
-                  [mpsGraph lessThanWithPrimaryTensor:inputTensor
-                                      secondaryTensor:threeTensor
-                                                 name:nil];
-
-              MPSGraphTensor* lessThanMaxGradTensor =
-                  [mpsGraph selectWithPredicateTensor:lessThanMaxPredicateTensor
-                                  truePredicateTensor:weightedTensor
-                                 falsePredicateTensor:unitTensor
-                                                 name:nil];
+      cachedGraph = cache_->CreateCachedGraphAs<CachedGraph>(key, ^MPSCachedGraph*() {
+        CachedGraph* newCachedGraph = nil;
+        @autoreleasepool {
+          MPSGraph* mpsGraph = make_mps_graph();
+          newCachedGraph = new CachedGraph(mpsGraph);
+          MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
+          MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
 
-              MPSGraphTensor* gradTensor =
-                  [mpsGraph selectWithPredicateTensor:lessThanMinPredicateTensor
-                                  truePredicateTensor:zeroTensor
-                                 falsePredicateTensor:lessThanMaxGradTensor
-                                                 name:nil];
-              MPSGraphTensor* gradInputTensor =
-                  [mpsGraph multiplicationWithPrimaryTensor:gradTensor
-                                            secondaryTensor:gradOutputTensor
-                                                       name:nil];
+          MPSGraphTensor* zeroTensor = [mpsGraph
+              constantWithScalar:0.0f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* unitTensor = [mpsGraph
+              constantWithScalar:1.0f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* threeTensor = [mpsGraph
+              constantWithScalar:3.0f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* negativeThreeTensor = [mpsGraph
+              constantWithScalar:-3.0f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* halfTensor = [mpsGraph
+              constantWithScalar:0.5f
+                           shape:@[ @1 ]
+                        dataType:getMPSDataType(grad_output.scalar_type())];
+
+          MPSGraphTensor* tempTensor =
+              [mpsGraph divisionWithPrimaryTensor:inputTensor
+                                  secondaryTensor:threeTensor
+                                             name:nil];
+
+          MPSGraphTensor* weightedTensor =
+              [mpsGraph additionWithPrimaryTensor:tempTensor
+                                  secondaryTensor:halfTensor
+                                             name:nil];
+
+          MPSGraphTensor* lessThanMinPredicateTensor = [mpsGraph
+              lessThanOrEqualToWithPrimaryTensor:inputTensor
+                                 secondaryTensor:negativeThreeTensor
+                                            name:nil];
+
+          MPSGraphTensor* lessThanMaxPredicateTensor =
+              [mpsGraph lessThanWithPrimaryTensor:inputTensor
+                                  secondaryTensor:threeTensor
+                                             name:nil];
+
+          MPSGraphTensor* lessThanMaxGradTensor =
+              [mpsGraph selectWithPredicateTensor:lessThanMaxPredicateTensor
+                              truePredicateTensor:weightedTensor
+                             falsePredicateTensor:unitTensor
+                                             name:nil];
+
+          MPSGraphTensor* gradTensor =
+              [mpsGraph selectWithPredicateTensor:lessThanMinPredicateTensor
+                              truePredicateTensor:zeroTensor
+                             falsePredicateTensor:lessThanMaxGradTensor
+                                             name:nil];
+          MPSGraphTensor* gradInputTensor =
+              [mpsGraph multiplicationWithPrimaryTensor:gradTensor
+                                        secondaryTensor:gradOutputTensor
+                                                   name:nil];
 
-              newCachedGraph->gradOutputTensor_ = gradOutputTensor;
-              newCachedGraph->inputTensor_ = inputTensor;
-              newCachedGraph->gradInputTensor_ = gradInputTensor;
-            }
-            return newCachedGraph;
-          });
-      cachedGraph = static_cast<CachedGraph*>(tmpCachedGraph);
+          newCachedGraph->gradOutputTensor_ = gradOutputTensor;
+          newCachedGraph->inputTensor_ = inputTensor;
+          newCachedGraph->gradInputTensor_ = gradInputTensor;
+        }
+        return newCachedGraph;
+      });
     }
 
-    Placeholder gradOutputPlaceholder =
-        Placeholder(cachedGraph->gradOutputTensor_, grad_output);
+    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output);
     Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self);
-    Placeholder gradInputPlaceholder =
-        Placeholder(cachedGraph->gradInputTensor_, grad_input);
+    Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
-      gradOutputPlaceholder.getMPSGraphTensor() :
-          gradOutputPlaceholder.getMPSGraphTensorData(),
-      selfPlaceholder.getMPSGraphTensor() :
-          selfPlaceholder.getMPSGraphTensorData()
+      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
+      selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
     };
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-      gradInputPlaceholder.getMPSGraphTensor() :
-          gradInputPlaceholder.getMPSGraphTensorData()
+      gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
     };
 
-    runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results);
   }
   return grad_input;
 }

From 1eca176a28b3e4ce25d3c86c1c886a16a701b37d Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Thu, 26 Jan 2023 16:51:28 -0800
Subject: [PATCH 1906/1922] Fix batch_norm_backward_mps (#257)

- add revert caculation for save_var used in backward path
- add backward test for native_batch_norm and _native_batch_norm_legit
---
 .../ATen/native/mps/operations/Normalization.mm | 17 +++++++++++++++--
 test/test_mps.py                                |  4 +++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm
index 90cfe3c9fab5d..964ce2da1ec69 100644
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@@ -673,11 +673,24 @@ string get_mem_string(c10::MemoryFormat memory_format) {
 
           if(train) {
             // Use save_mean and save_var
+            float primary = 1.0f;
+            MPSGraphTensor *primaryTensor = [mpsGraph constantWithScalar:primary dataType:MPSDataTypeFloat32];
+            MPSGraphTensor *epsilonTensor = [mpsGraph constantWithScalar:(float)epsilon dataType:MPSDataTypeFloat32];
+            MPSGraphTensor *revertSaveVarTensor = saveVarTensor;
+            revertSaveVarTensor = [mpsGraph divisionWithPrimaryTensor: primaryTensor
+                                                      secondaryTensor: revertSaveVarTensor
+                                                                 name: nil];
+            revertSaveVarTensor = [mpsGraph multiplicationWithPrimaryTensor: revertSaveVarTensor
+                                                            secondaryTensor: revertSaveVarTensor
+                                                                       name: nil];
+            revertSaveVarTensor = [mpsGraph subtractionWithPrimaryTensor: revertSaveVarTensor
+                                                         secondaryTensor: epsilonTensor
+                                                                    name: nil];
             if(grad_input_mask[1]) {
               gradWeightTensor = [mpsGraph normalizationGammaGradientWithIncomingGradientTensor:gradOutputTensor
                                                                                    sourceTensor:inputTensor
                                                                                      meanTensor:saveMeanTensor
-                                                                                 varianceTensor:saveVarTensor
+                                                                                 varianceTensor:revertSaveVarTensor
                                                                                   reductionAxes:axes
                                                                                         epsilon:(float)epsilon
                                                                                            name:nil];
@@ -692,7 +705,7 @@ string get_mem_string(c10::MemoryFormat memory_format) {
               gradInputTensor = [mpsGraph normalizationGradientWithIncomingGradientTensor:gradOutputTensor
                                                                              sourceTensor:inputTensor
                                                                                meanTensor:saveMeanTensor
-                                                                           varianceTensor:saveVarTensor
+                                                                           varianceTensor:revertSaveVarTensor
                                                                               gammaTensor:weightTensor
                                                                       gammaGradientTensor:gradWeightTensor
                                                                        betaGradientTensor:gradBiasTensor
diff --git a/test/test_mps.py b/test/test_mps.py
index 440741086bccf..bb42c394e52e3 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9340,7 +9340,9 @@ class TestConsistency(TestCase):
         'view_as': ['f16', 'f32'],
         'vsplit': ['f16', 'f32'],
         'vstack': ['f16', 'f32'],
-        'zero_': ['f16', 'f32']
+        'zero_': ['f16', 'f32'],
+        '_native_batch_norm_legit': ['f32'],
+        'native_batch_norm': ['f32'],
     }
 
     # These ops that are problematic. So never run them even when

From d86a428142b31def524b47573c9725c9a306227c Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 27 Jan 2023 12:07:43 -0500
Subject: [PATCH 1907/1922] Use low precision for FP16 for divtrunc_rounding
 (#263)

- remove interpolate from blocklist since it works
- move interpolate_area to unimplemented list
---
 test/test_mps.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index bb42c394e52e3..f6d89ade5febf 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9357,16 +9357,13 @@ class TestConsistency(TestCase):
         'linalg.inv': [torch.float32],
         'linalg.inv_ex': [torch.float32],
         'linalg.matrix_power': [torch.float32],
-        'nn.functional.interpolate': [torch.float32],
         'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'nn.functional.interpolatearea': [torch.float32],
         'resize_as_': [torch.float16, torch.float32],
         'topk': [torch.int16, torch.int32, torch.int64, torch.uint8],
 
         # Functions with correctness issues
         'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'divfloor_rounding': [torch.int16, torch.int32, torch.int64],
-        'divtrunc_rounding': [torch.float16],
         'norm': [torch.float16],
         'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
         'cumulative_trapezoid': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -9509,6 +9506,7 @@ class TestConsistency(TestCase):
         'nn.functional.fractional_max_pool3d': [torch.float32],
         'nn.functional.adaptive_avg_pool3d': [torch.float16, torch.float32],
         'nn.functional.adaptive_max_pool3d': [torch.float32],
+        'nn.functional.interpolatearea': [torch.float32],
         'nn.functional.interpolatebicubic': [torch.float32],
         'nn.functional.interpolatelinear': [torch.float32],
         'nn.functional.interpolatetrilinear': [torch.float32],
@@ -9749,7 +9747,7 @@ class TestConsistency(TestCase):
     }
 
     FP16_LOW_PRECISION_LIST = {
-        "add", "sub",
+        "add", "sub", "div",
         "__rdiv__", "__rmul__",
         "nn.functional.huber_loss",
         "true_divide"

From ca427e87a4bee4ec34b0d7c15f5a96437e53021c Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Fri, 27 Jan 2023 09:47:19 -0800
Subject: [PATCH 1908/1922] Fix floor_divide (#243)

- report error for int64
- for int16,int32 cast tensor to float than floor
- unblock floor_divide and divfloor_rounding testing
- block test_divmode int64 div floor
- add extra tests test_div_floor_int, block for now due to fast math issue
---
 .../ATen/native/mps/operations/BinaryOps.mm   | 13 ++++
 test/test_mps.py                              | 63 ++++++++++++-------
 2 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index fc4c01a7ecb84..83f2535d0188e 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -176,8 +176,21 @@ void div_mode_template(const Tensor& self, const Tensor& other,
                        c10::optional<c10::string_view> rounding_mode,
                        const Tensor& output, const string op_name)
 {
+  if(rounding_mode.has_value() && *rounding_mode == "floor"){
+    TORCH_CHECK(self.scalar_type() != ScalarType::Long,
+                "MPS: does not support floor_divide op with int64 input");
+  }
   BinaryOpBlock div_mode_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
     MPSGraph* mpsGraph = cachedGraph->graph();
+    bool isFloatInput = ([primaryCastTensor dataType] & MPSDataTypeFloatBit) != 0;
+    if(!isFloatInput && rounding_mode.has_value() && *rounding_mode == "floor") {
+      primaryCastTensor = [mpsGraph castTensor:primaryCastTensor
+                                        toType:MPSDataTypeFloat32
+                                          name:@"primaryCastTensor"];
+      secondaryCastTensor = [mpsGraph castTensor:secondaryCastTensor
+                                          toType:MPSDataTypeFloat32
+                                            name:@"secondaryCastTensor"];
+    }
     MPSGraphTensor* divTensor =  [mpsGraph divisionWithPrimaryTensor:primaryCastTensor
                                                      secondaryTensor:secondaryCastTensor
                                                                 name:nil];
diff --git a/test/test_mps.py b/test/test_mps.py
index f6d89ade5febf..903b2652b11d0 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2006,9 +2006,10 @@ def test_full_bugs(self):
     # See https://github.com/pytorch/pytorch/issues/84995
     def test_div_bugs(self):
         for (dtype, mode) in itertools.product(integral_types(), ['trunc', 'floor']):
-            x = torch.tensor(list(range(1, 11)), device='mps', dtype=dtype)
-            y = torch.div(x, 101, rounding_mode=mode)
-            self.assertEqual(y.sum(), 0)
+            if dtype != torch.int64:
+                x = torch.tensor(list(range(1, 11)), device='mps', dtype=dtype)
+                y = torch.div(x, 101, rounding_mode=mode)
+                self.assertEqual(y.sum(), 0)
 
     # See https://github.com/pytorch/pytorch/issues/82663
     def test_bool_expand(self):
@@ -2220,6 +2221,21 @@ def helper(shape):
 
         helper((2, 8, 4, 5))
 
+    # # Failures due to precision issues, enable after resolving from mps
+    # def test_div_floor_int(self):
+    #     def helper(shape, dtype):
+    #         cpu_x = torch.randint(-9999, -1,shape, device='cpu', dtype=dtype)
+    #         x = cpu_x.detach().clone().to('mps')
+
+    #         cpu_y = torch.randint(1, 9999, shape, device='cpu', dtype=dtype)
+    #         y = cpu_y.detach().clone().to('mps')
+
+    #         div_result = torch.div(x, y,rounding_mode='floor')
+    #         div_result_cpu = torch.div(cpu_x, cpu_y, rounding_mode='floor')
+    #         self.assertEqual(div_result, div_result_cpu)
+
+    #     helper((2, 8, 4, 5), torch.int16)
+    #     helper((2, 8, 4, 5), torch.int32)
 
 class TestLogical(TestCase):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):
@@ -4013,27 +4029,28 @@ def helper(n, c, h, w):
     def test_divmode(self):
         def helper(shape, rounding_mode):
             for dtype in [torch.float32, torch.float16, torch.int32, torch.int64]:
-                cpu_x = None
-                cpu_y = None
-                if (dtype in [torch.float32, torch.float16]):
-                    cpu_x = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
-                    cpu_y = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
-                else:
-                    cpu_x = torch.randint(-10, 0, shape, device='cpu', dtype=dtype, requires_grad=False)
-                    cpu_y = torch.randint(-10, 0, shape, device='cpu', dtype=dtype, requires_grad=False)
+                if (rounding_mode is not None and "floor" in rounding_mode and dtype == torch.int64) is False:
+                    cpu_x = None
+                    cpu_y = None
+                    if (dtype in [torch.float32, torch.float16]):
+                        cpu_x = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
+                        cpu_y = torch.randn(shape, device='cpu', dtype=dtype, requires_grad=False)
+                    else:
+                        cpu_x = torch.randint(-10, 0, shape, device='cpu', dtype=dtype, requires_grad=False)
+                        cpu_y = torch.randint(-10, 0, shape, device='cpu', dtype=dtype, requires_grad=False)
 
-                mps_x = cpu_x.detach().clone().to('mps')
-                # clamp to avoid division by 0
-                mps_y = cpu_y.detach().clone().to('mps')
+                    mps_x = cpu_x.detach().clone().to('mps')
+                    # clamp to avoid division by 0
+                    mps_y = cpu_y.detach().clone().to('mps')
 
-                if (rounding_mode == "floor_divide"):
-                    result_div_cpu = torch.floor_divide(cpu_x, cpu_y)
-                    result_div_mps = torch.floor_divide(mps_x, mps_y)
-                    self.assertEqual(result_div_mps, result_div_cpu)
-                else:
-                    result_div_cpu = torch.div(cpu_x, cpu_y, rounding_mode=rounding_mode)
-                    result_div_mps = torch.div(mps_x, mps_y, rounding_mode=rounding_mode)
-                    self.assertEqual(result_div_mps, result_div_cpu)
+                    if (rounding_mode == "floor_divide"):
+                        result_div_cpu = torch.floor_divide(cpu_x, cpu_y)
+                        result_div_mps = torch.floor_divide(mps_x, mps_y)
+                        self.assertEqual(result_div_mps, result_div_cpu)
+                    else:
+                        result_div_cpu = torch.div(cpu_x, cpu_y, rounding_mode=rounding_mode)
+                        result_div_mps = torch.div(mps_x, mps_y, rounding_mode=rounding_mode)
+                        self.assertEqual(result_div_mps, result_div_cpu)
 
         helper((2, 8, 4, 5), None)
         helper((2, 8, 4, 5), "floor")
@@ -9363,7 +9380,6 @@ class TestConsistency(TestCase):
 
         # Functions with correctness issues
         'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'divfloor_rounding': [torch.int16, torch.int32, torch.int64],
         'norm': [torch.float16],
         'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
         'cumulative_trapezoid': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -9373,7 +9389,6 @@ class TestConsistency(TestCase):
         'normalnumber_mean': [torch.float16, torch.float32],
         'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'multinomial': [torch.float32],
-        'floor_divide': [torch.int16, torch.int32, torch.int64],
         'dist': [torch.float16],
 
         # failure due to issue: atan2() may generate NAN in output with

From 628cecd86691666fb4693569afc19a5d6f6a3279 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Fri, 27 Jan 2023 09:50:38 -0800
Subject: [PATCH 1909/1922] Fix and unblock TestConsistency for median (#241)

- fix num_output_dims calculation
- fix median_out_mps key
- cast tensor sent to sortWithTensor and argSortWithTensor
- note down same issue for unique
- unblock median from blocklist
- adding test_median_int16 test
---
 .../ATen/native/mps/operations/ReduceOps.mm   | 31 ++++++++++++++++---
 aten/src/ATen/native/mps/operations/Unique.mm |  2 +-
 test/test_mps.py                              | 12 ++++++-
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index f85f91567e857..e2bd1ab6da9cf 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -1847,6 +1847,17 @@ Tensor median_mps(const Tensor& input_t) {
             MPSGraphTensor * reshapedTensor = [mpsGraph reshapeTensor:inputTensor
                                                             withShape:@[@-1]
                                                                   name:nil];
+            MPSDataType dataType = [inputTensor dataType];
+            // #issue 104398441 sortWithTensor only supports following types, cast if necessary
+            if (dataType != MPSDataTypeInt32 &&
+                dataType != MPSDataTypeFloat32 &&
+                dataType != MPSDataTypeFloat16) {
+                dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
+                reshapedTensor = [mpsGraph castTensor:reshapedTensor
+                                        toType:dataType
+                                          name:@"castReshapedTensor"];
+            }
+
             MPSGraphTensor * sortedTensor = [mpsGraph
                                                   sortWithTensor:reshapedTensor
                                                   axis:((NSUInteger) (int)0)
@@ -1934,7 +1945,7 @@ Tensor median_mps(const Tensor& input_t) {
     auto stream = at::mps::getCurrentMPSStream();
 
     @autoreleasepool {
-        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getTensorsStringKey(input_t);
+        string key = func_name + ":" + to_string(dim_) + ":" + native_mps::getTensorsStringKey(input_t) + ":" + native_mps::getTensorsStringKey(indices_t);
         CachedGraph* cachedGraph = cache_->LookUpAs<CachedGraph>(key);
 
         if(!cachedGraph) {
@@ -1948,8 +1959,20 @@ Tensor median_mps(const Tensor& input_t) {
 
               MPSGraphTensor* inputTensor = native_mps::mpsGraphUnrankedPlaceHolder(mpsGraph, native_mps::getMPSDataType(input_t.scalar_type()));
               MPSGraphTensor* outputTensor = nil;
+              MPSGraphTensor* castInputTensor = inputTensor;
+              MPSDataType dataType = native_mps::getMPSDataType(input_t.scalar_type());
+              // #issue 104398441 sortWithTensor only supports following types, cast if necessary
+              if (dataType != MPSDataTypeInt32 &&
+                  dataType != MPSDataTypeFloat32 &&
+                  dataType != MPSDataTypeFloat16) {
+                  dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32;
+                  castInputTensor = [mpsGraph castTensor:inputTensor
+                                          toType:dataType
+                                            name:@"castInputTensor"];
+              }
+
               MPSGraphTensor * sortedTensor = [mpsGraph
-                                                  sortWithTensor:inputTensor
+                                                  sortWithTensor:castInputTensor
                                                   axis:((NSUInteger) (int)dim_)
                                                   name:nil];
 
@@ -1959,7 +1982,7 @@ Tensor median_mps(const Tensor& input_t) {
                                                         length:1
                                                         name:nil];
               MPSGraphTensor* argreduceOutTensor = nil;
-                argreduceOutTensor = [mpsGraph argSortWithTensor:inputTensor
+                argreduceOutTensor = [mpsGraph argSortWithTensor:castInputTensor
                                                                         axis:(NSInteger)dim_
                                                                         name:@"argmax_out"];
               MPSGraphTensor* argOutputTensor = [mpsGraph sliceTensor:argreduceOutTensor
@@ -2038,7 +2061,7 @@ Tensor median_mps(const Tensor& input_t) {
     int64_t num_input_dims = input_shape.size();
     NSMutableArray<NSNumber*> *apparent_out_shape = nil;
     // Use this if keepdim is false
-    int64_t num_output_dims = num_input_dims - 1;
+    int64_t num_output_dims = num_input_dims - 1 < 0 ? 0 : num_input_dims - 1;
 
     std::vector<int64_t> vec_apparent_out_shape(num_input_dims);
     std::vector<int64_t> vec_out_shape(num_output_dims);
diff --git a/aten/src/ATen/native/mps/operations/Unique.mm b/aten/src/ATen/native/mps/operations/Unique.mm
index 66098e46a49e1..4319c4aad0f5e 100644
--- a/aten/src/ATen/native/mps/operations/Unique.mm
+++ b/aten/src/ATen/native/mps/operations/Unique.mm
@@ -56,7 +56,7 @@
     return @[resultTensor, inverseIndicesTensor, countTensor, lengthTensor];
   }
 
-  // Sort only supports following types, cast if necessary
+  // #issue 104398441 sortWithTensor only supports following types, cast if necessary
   if (dataType != MPSDataTypeInt32 &&
       dataType != MPSDataTypeFloat32 &&
       dataType != MPSDataTypeFloat16) {
diff --git a/test/test_mps.py b/test/test_mps.py
index 903b2652b11d0..1e100c2547e0f 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2237,6 +2237,17 @@ def helper(shape):
     #     helper((2, 8, 4, 5), torch.int16)
     #     helper((2, 8, 4, 5), torch.int32)
 
+    def test_median_int16(self):
+        def helper(shape, dtype):
+            cpu_x = torch.randint(-9999, 9999, shape, device='cpu', dtype=dtype)
+            x = cpu_x.detach().clone().to('mps')
+
+            median_result = torch.median(x)
+            median_result_cpu = torch.median(cpu_x)
+            self.assertEqual(median_result, median_result_cpu)
+
+        helper((2, 8, 4, 5), torch.int16)
+
 class TestLogical(TestCase):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):
         return torch.tensor(x, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -9369,7 +9380,6 @@ class TestConsistency(TestCase):
     BLOCKLIST = {
         # Functions that hard crash
         'nn.functional.softplus': [torch.float32],
-        'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16],
         'sgn': [torch.bool],
         'linalg.inv': [torch.float32],
         'linalg.inv_ex': [torch.float32],

From 95d95c1c119fae86e1a1523467636b8942221ac6 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Fri, 27 Jan 2023 09:53:08 -0800
Subject: [PATCH 1910/1922] Remove softplus from blocklist (#253)

- unblock nn.functional.softplus test
- unblock test_softplus test
---
 test/test_mps.py | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 1e100c2547e0f..7d36da427667b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4679,29 +4679,31 @@ def helper(shape, dim=0):
             for dim in range(len(shape)):
                 helper(shape, dim)
 
-    # # Test softplus
-    # def test_softplus(self):
-    #     def helper(shape, beta=1, threshold=20):
-    #         cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
-    #         x = cpu_x.detach().clone().to('mps').requires_grad_()
+    # Test softplus
+    def test_softplus(self):
+        def helper(shape, beta=1, threshold=20):
+            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
+            x = cpu_x.detach().clone().to('mps').requires_grad_()
 
-    #         softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x)
-    #         softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x)
+            softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x)
+            softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x)
 
-    #         cpu_grad = torch.randn(softplus_result.shape)
-    #         grad = cpu_grad.to('mps')
+            cpu_grad = torch.randn(softplus_result.shape)
+            grad = cpu_grad.to('mps')
 
-    #         softplus_result.backward(gradient=grad)
-    #         softplus_result_cpu.backward(gradient=cpu_grad)
+            softplus_result.backward(gradient=grad)
+            softplus_result_cpu.backward(gradient=cpu_grad)
 
-    #         self.assertEqual(softplus_result, softplus_result_cpu)
-    #         self.assertEqual(x.grad, cpu_x.grad)
+            self.assertEqual(softplus_result, softplus_result_cpu)
+            self.assertEqual(x.grad, cpu_x.grad)
 
-    #     # Test empty shape too
-    #     for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
-    #         for beta in [0.5, 1, 2, 3, 4]:
-    #             for threshold in [0.5, 20, 30, 40, 50]:
-    #                 helper(shape, beta, threshold)
+        # Test empty shape too
+        for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
+            for beta in [0.5, 1, 2, 3, 4]:
+                for threshold in [0.5, 20, 30, 40, 50]:
+                    helper(shape, beta, threshold)
+
+    # Test silu
 
     def test_silu(self):
         def helper(shape):
@@ -9198,6 +9200,7 @@ class TestConsistency(TestCase):
         'zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'zeros_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.softplus': ['f32'],
     }
 
     ALLOWLIST_OP_GRAD = {
@@ -9379,7 +9382,6 @@ class TestConsistency(TestCase):
     # All the entries in this list should be removed
     BLOCKLIST = {
         # Functions that hard crash
-        'nn.functional.softplus': [torch.float32],
         'sgn': [torch.bool],
         'linalg.inv': [torch.float32],
         'linalg.inv_ex': [torch.float32],

From 0a17c0bb851ec3416f2cd71a5a71c2ea9695c926 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 27 Jan 2023 13:25:14 -0800
Subject: [PATCH 1911/1922] Calculate nonzero count inside nonzero op (#260)

* Calculate output shape inside nonzero op

* nonzero optimizations

* Fix lintrunner
---
 .../ATen/native/mps/operations/Indexing.mm    | 39 +++++++++----------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index d9d8e53461638..28bb6e8c84f98 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -140,7 +140,7 @@ bool dispatchIndexKernel(TensorIteratorBase& iter,
                 threadsPerThreadgroup: threadGroupSize];
 
       [computeEncoder endEncoding];
-      mpsStream->commit(true);
+      mpsStream->synchronize(SyncType::COMMIT);
     }
   });
 
@@ -252,31 +252,24 @@ Tensor nonzero_fallback(const Tensor& self) {
     MPSGraphTensor* inputTensor_ = nil;
     MPSGraphTensor* outputTensor_ = nil;
     MPSGraphTensor* scatterDataTensor_ = nil;
+    MPSGraphTensor* countNonzeroTensor_ = nil;
   };
 
-  int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
-  at::native::resize_output(out_, {total_nonzero, nDim});
-  if (out_.numel() ==  0) {
-    return out_;
-  }
-
-  bool contiguous_output = (out_.is_contiguous() && !out_.is_view());
-  Tensor out = out_;
-  if (!contiguous_output) {
-    out = at::native::empty_mps(
-           out_.sizes(),
+  stream->synchronize(SyncType::COMMIT_AND_WAIT);
+  Tensor count_nonzero = at::empty({1}, self.options().dtype(kInt));
+  Tensor out =  at::native::empty_mps(
+           {self.numel(), nDim == 0 ? 1 : nDim},
            out_.scalar_type(),
            c10::nullopt,
            kMPS,
            c10::nullopt,
            c10::nullopt);
-  }
 
   int64_t _apparentInputShape = 1;
   for (auto dim : self.sizes()) {
     _apparentInputShape *= dim;
   }
-  MPSShape *apparentOutputShape = @[@(total_nonzero * nDim)];
+  MPSShape *apparentOutputShape = @[@(self.numel() * nDim)];
   MPSShape *apparentInputShape = @[@(_apparentInputShape)];
 
   // Pseudocode:
@@ -310,6 +303,9 @@ Tensor nonzero_fallback(const Tensor& self) {
           MPSGraphTensor *inputNotEqualToZeroTensor = [mpsGraph notEqualWithPrimaryTensor:inputTensor
                                                                           secondaryTensor:zeroTensor
                                                                                      name:nil];
+          MPSGraphTensor *countNonzero = [mpsGraph reductionSumWithTensor:inputNotEqualToZeroTensor
+                                                         axis:0
+                                                         name:nil];
           MPSGraphTensor *maskTensor = [mpsGraph castTensor:inputNotEqualToZeroTensor
                                                      toType:MPSDataTypeInt32
                                                        name:@"castToInt32"];
@@ -358,6 +354,7 @@ Tensor nonzero_fallback(const Tensor& self) {
           newCachedGraph->inputTensor_ = inputTensor;
           newCachedGraph->scatterDataTensor_ = scatterDataTensor;
           newCachedGraph->outputTensor_ = outputTensor;
+          newCachedGraph->countNonzeroTensor_ = countNonzero;
         }
         return newCachedGraph;
       });
@@ -365,8 +362,9 @@ Tensor nonzero_fallback(const Tensor& self) {
     }
 
     Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, apparentInputShape);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, contiguous_output ? out_ : out, apparentOutputShape);
-    Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, contiguous_output ? out_ : out, apparentOutputShape);
+    Placeholder countNonzeroPlaceholder = Placeholder(cachedGraph->countNonzeroTensor_, count_nonzero);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out, apparentOutputShape);
+    Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, out, apparentOutputShape);
 
     // Create dictionary of inputs and outputs
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
@@ -375,15 +373,16 @@ Tensor nonzero_fallback(const Tensor& self) {
     };
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
-      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
+      outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(),
+      countNonzeroPlaceholder.getMPSGraphTensor() : countNonzeroPlaceholder.getMPSGraphTensorData()
     };
 
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
-    if (!contiguous_output) {
-      out_.copy_(out);
-    }
   }
 
+  int32_t total_nonzero = count_nonzero.item<int32_t>();
+  at::native::resize_output(out_, {total_nonzero, nDim});
+  out_.copy_(out.resize_({total_nonzero, nDim}));
   return out_;
 }
 

From a82947b1edd8b525e7a97bf26b2614df370ed0d6 Mon Sep 17 00:00:00 2001
From: Abhishek Pathak <abhipathak97@gmail.com>
Date: Fri, 27 Jan 2023 18:07:01 -0800
Subject: [PATCH 1912/1922] Add support for negative dimensions in cumsum
 (#238)

* Add support for negative dimensions in cumsum

* Expand test with dtypes

* Fix lint issues

---------

Co-authored-by: abhipathak97 <abhipathak97@mps10.scv.apple.com>
---
 aten/src/ATen/native/mps/operations/UnaryOps.mm |  5 ++++-
 test/test_mps.py                                | 17 +++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index 2670701bacb54..bbbb81cf47432 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -254,7 +254,10 @@ void unary_op(const Tensor& self, const Tensor& output, std::string op_name, Una
  int64_t dim,
  c10::optional<ScalarType> dtype,
  const Tensor& result) {
-  TORCH_CHECK(dim >=0 && dim < std::max(1LL, self.ndimension()), "Expected dim to be between 0 and ", self.ndimension(), " but got ", dim);
+
+  auto nDims = self.dim();
+  auto wrapped_dim = maybe_wrap_dim(dim, nDims);
+  TORCH_CHECK(wrapped_dim >=0 && wrapped_dim < std::max(1LL, self.ndimension()), "Expected wrapped dim to be between 0 and ", self.ndimension(), " but got ", wrapped_dim , "(original dim is ", dim, ")");
   if (!is_macos_13_or_newer()) {
     TORCH_WARN_ONCE("torch.cumsum supported by MPS on MacOS 13+, please upgrade");
     auto cpu_result = self.to(at::Device(kCPU)).cumsum(dim, dtype);
diff --git a/test/test_mps.py b/test/test_mps.py
index 7d36da427667b..04141ace1d2eb 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2248,6 +2248,23 @@ def helper(shape, dtype):
 
         helper((2, 8, 4, 5), torch.int16)
 
+    def test_cumsum_minus_one_axis(self):
+        def helper(dtype):
+            # Test with axis -1
+            cpu_x = None
+            if(dtype == torch.float32):
+                cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32)
+            else:
+                cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32)
+            x = cpu_x.detach().clone().to('mps')
+
+            cpu_y = cpu_x.cumsum(-1)
+            y = x.cumsum(-1)
+
+            self.assertEqual(y, cpu_y)
+
+        [helper(dtype) for dtype in [torch.float32, torch.int16, torch.int32, torch.uint8]]
+
 class TestLogical(TestCase):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):
         return torch.tensor(x, device=device, dtype=dtype, requires_grad=requires_grad)

From a2af8c4d3d5c4989707540c80f212c06925dc824 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Sat, 28 Jan 2023 11:43:00 -0800
Subject: [PATCH 1913/1922] Fix test mps key search in blocklist dicts (#265)

* Fix test mps

* Address PR comments

* Add to blocklist remaining failures

* Fix comments
---
 test/test_mps.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 04141ace1d2eb..d18cbf4e6df29 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9433,6 +9433,10 @@ class TestConsistency(TestCase):
         'nn.functional.avg_pool2d': [torch.float32, torch.int64],
         'nn.functional.adaptive_avg_pool1d': [torch.float32],
         'nn.functional.adaptive_avg_pool2d': [torch.float32],
+
+        # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
+        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rpow__': [torch.int16, torch.int32, torch.uint8, torch.int64],
     }
 
     UNIMPLEMENTED_OPS = {
@@ -9750,10 +9754,6 @@ class TestConsistency(TestCase):
         'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'zeros_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'bincount': [torch.int16, torch.int32, torch.int64, torch.uint8],
-
-        # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
-        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
-        '__rpow__': [torch.int16, torch.int32],
     }
 
     UNDEFINED_BEHAVIOUR = {
@@ -9823,7 +9823,7 @@ def get_error_message(self, key, op_name, dtype):
             return f"Running test with {op_name} expected to fail due to unsupported MPS data type so skipping"
         elif key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]:
             return f"Running test with {op_name} expected to fail due to missing op implementation"
-        return f"Running test with {op_name} hangs so skipping"
+        return None
 
     def compare_with_CUDA(self, op, mps_out, atol, rtol):
         cuda_out = self.CUDA_RESULT[op.name]
@@ -9842,7 +9842,9 @@ def test_output_match(self, device, dtype, op):
 
         key = op.name + op.variant_test_name
         if key in self.MPS_SKIP_LIST:
-            self.skipTest(self.get_error_message(key, op.name, dtype))
+            msg = self.get_error_message(key, op.name, dtype)
+            if msg is not None:
+                self.skipTest(msg)
 
         # Make this an expecttest manually
         # When this env variable is set, generate a new ALLOWLIST_OP

From a88a61248187b29907f2a03598af37db11089006 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Mon, 30 Jan 2023 12:35:43 -0800
Subject: [PATCH 1914/1922] Fix layer norm (#262)

- remove the duplicate calculation from layer_norm_mps
- raise atol rtol for native_layer_norm
---
 aten/src/ATen/native/mps/operations/Normalization.mm | 6 ++----
 test/test_mps.py                                     | 4 ++++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm
index 964ce2da1ec69..fd0a8471c7545 100644
--- a/aten/src/ATen/native/mps/operations/Normalization.mm
+++ b/aten/src/ATen/native/mps/operations/Normalization.mm
@@ -927,8 +927,6 @@ string get_mem_string(c10::MemoryFormat memory_format) {
   at::Tensor mean = std::get<1>(outputs);
   at::Tensor variance = std::get<2>(outputs);
 
-  at::Tensor rstd = at::rsqrt(at::add(variance, eps));
-
   std::vector<int64_t> stat_shape;
   for (const auto idx : c10::irange(axis)) {
     stat_shape.push_back(input_shape[idx]);
@@ -938,8 +936,8 @@ string get_mem_string(c10::MemoryFormat memory_format) {
     stat_shape.push_back(1);
   }
   mean = mean.view(stat_shape);
-  rstd = rstd.view(stat_shape);
-  return std::make_tuple(out, mean, rstd);
+  variance = variance.view(stat_shape);
+  return std::make_tuple(out, mean, variance);
 }
 
 std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_mps(
diff --git a/test/test_mps.py b/test/test_mps.py
index d18cbf4e6df29..772c4001686fb 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9391,6 +9391,7 @@ class TestConsistency(TestCase):
         'zero_': ['f16', 'f32'],
         '_native_batch_norm_legit': ['f32'],
         'native_batch_norm': ['f32'],
+        'native_layer_norm': ['f32'],
     }
 
     # These ops that are problematic. So never run them even when
@@ -9903,6 +9904,9 @@ def get_samples():
                 elif (op.name == "masked.mean"):
                     atol = 7e-4
                     rtol = 2e-3
+                elif (op.name == "native_layer_norm"):
+                    atol = 1e-4
+                    rtol = 1.3e-5
                 else:
                     atol = None
                     rtol = None

From d3d4b9d113c5a7ed1fee4c06b35484e503ade8ac Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Mon, 30 Jan 2023 15:47:01 -0500
Subject: [PATCH 1915/1922] Fix the crash in View ops when slicing wrong
 lengths (#267)

---
 aten/src/ATen/native/mps/operations/View.mm | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm
index fc6eaf8e40ef8..cd28295eab9e2 100644
--- a/aten/src/ATen/native/mps/operations/View.mm
+++ b/aten/src/ATen/native/mps/operations/View.mm
@@ -284,17 +284,19 @@
         // Find what dimension and native length was for the specified stride
         NSDictionary *srcDimLengthOffset = srcStrideToDimLengthOffset[[NSString stringWithFormat:@"%lld",dstStrides[dstDim]]];
 
+        dstDimToSliceLength[dstDim] = dstSizes[dstDim];
+        dstDimToSliceOffset[dstDim] = [srcDimLengthOffset[@"offset"] intValue];
+
         // Stride does not exist in source tensor, or the specified size is too long. Not possible
         // TODO: Longer length with same stride + removal of dim(s) above this is a flatten/reshape. Consider adding support
-        if (!srcDimLengthOffset || dstSizes[dstDim] > [srcDimLengthOffset[@"length"] intValue])
+        if (!srcDimLengthOffset ||
+            // the offset + length of destination should not be larger than source's length when slicing
+            dstDimToSliceOffset[dstDim] + dstDimToSliceLength[dstDim] > [srcDimLengthOffset[@"length"] intValue]) {
           return nil;
-
+        }
         // Get the src dimension corresponding to the requested stride
         NSNumber *srcDim = srcDimLengthOffset[@"dim"];
         [dstDimOrder insertObject:srcDim atIndex:0];
-
-        dstDimToSliceLength[dstDim] = dstSizes[dstDim];
-        dstDimToSliceOffset[dstDim] = [srcDimLengthOffset[@"offset"] intValue];
       }
     }
   }

From 4bea291cf89de8a1f1a04f961eb8f18ab09ad84a Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Mon, 30 Jan 2023 17:30:33 -0800
Subject: [PATCH 1916/1922] Adding blocklist for macOS 12 (#266)

* Adding blocklist for macOS 12
- add blocklist for macOS 12
- move nn.functional.conv_transpose2d to the list

* - add allowlist for macOS 13.3
- move nn.functional.conv_transpose2d to the list
- move pow and __rpow__ to the list
---
 test/test_mps.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/test/test_mps.py b/test/test_mps.py
index 772c4001686fb..5bb48cde50c84 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -16,6 +16,7 @@
 import torch.nn.functional as F
 import itertools
 import yaml
+import platform
 from collections import defaultdict
 from torch._six import inf
 from torch.nn import Parameter
@@ -9438,6 +9439,9 @@ class TestConsistency(TestCase):
         # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
         'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
         '__rpow__': [torch.int16, torch.int32, torch.uint8, torch.int64],
+
+        # failures before macOS 13.3
+        'nn.functional.conv_transpose2d': [torch.float32],
     }
 
     UNIMPLEMENTED_OPS = {
@@ -9798,6 +9802,16 @@ class TestConsistency(TestCase):
         "true_divide"
     }
 
+    BLOCKLIST_MACOS_12 = {
+        'nn.functional.conv_transpose2d': [torch.float32, torch.float16],
+    }
+
+    ALLOWLIST_MACOS_13_3 = {
+        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rpow__': [torch.uint8],
+        'nn.functional.conv_transpose2d': [torch.float32],
+    }
+
     dirname = os.path.dirname(__file__)
     filename = os.path.join(dirname, "cuda_results.yaml")
     with open(filename) as f:
@@ -9813,6 +9827,8 @@ class TestConsistency(TestCase):
     NEW_ALLOW_LIST = defaultdict(list)
     NEW_ALLOW_LIST_GRAD = defaultdict(list)
 
+    product_version = float('.'.join(platform.mac_ver()[0].split('.')[:2]))
+
     def get_error_message(self, key, op_name, dtype):
         if key in self.FAST_MATH_PRECISION_ISSUES and dtype in self.FAST_MATH_PRECISION_ISSUES[key]:
             return f"Running test with {op_name} fails due to precision issues (fast math) so skipping"
@@ -9824,6 +9840,8 @@ def get_error_message(self, key, op_name, dtype):
             return f"Running test with {op_name} expected to fail due to unsupported MPS data type so skipping"
         elif key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]:
             return f"Running test with {op_name} expected to fail due to missing op implementation"
+        elif self.product_version < 13.0 and key in self.BLOCKLIST_MACOS_12 and dtype in self.BLOCKLIST_MACOS_12[key]:
+            return f"Running test with {op_name} expected to fail on macOS 12"
         return None
 
     def compare_with_CUDA(self, op, mps_out, atol, rtol):
@@ -9843,6 +9861,11 @@ def test_output_match(self, device, dtype, op):
 
         key = op.name + op.variant_test_name
         if key in self.MPS_SKIP_LIST:
+            msg = self.get_error_message(key, op.name, dtype)
+            if msg is not None and not (self.product_version >= 13.3 and
+                                        key in self.ALLOWLIST_MACOS_13_3 and dtype in self.ALLOWLIST_MACOS_13_3[key]):
+                self.skipTest(msg)
+        if self.product_version < 13.0 and key in self.BLOCKLIST_MACOS_12:
             msg = self.get_error_message(key, op.name, dtype)
             if msg is not None:
                 self.skipTest(msg)

From 8d873bbf69556f7f4d2d529e06ba5d7d29620c3d Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Mon, 30 Jan 2023 20:31:08 -0500
Subject: [PATCH 1917/1922] Fix is_mps_available() regression on non-MPS
 devices (#268)

- This patch fixes a regression caused by recent MPS module interface. Since we now compile torch._C.is_mps_available()
only if USE_MPS is defined, then it may cause failures on CUDA (and other devices when USE_MPS is not defined) if we upstream.
So, this patch checks if is_mps_available is implemented first and then calls it.
- Also use the unique name `default_mps_generator` to avoid conflicts with CPU default generator
---
 torch/backends/mps/__init__.py |  2 ++
 torch/csrc/mps/Module.cpp      |  2 +-
 torch/mps/__init__.py          | 14 ++++++++------
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/torch/backends/mps/__init__.py b/torch/backends/mps/__init__.py
index b6cec317eb54d..1664c87ee5de7 100644
--- a/torch/backends/mps/__init__.py
+++ b/torch/backends/mps/__init__.py
@@ -11,4 +11,6 @@ def is_built() -> bool:
 @_lru_cache()
 def is_available() -> bool:
     r"""Returns a bool indicating if MPS is currently available."""
+    if not hasattr(torch._C, '_is_mps_available'):
+        return False
     return torch._C._is_mps_available()
diff --git a/torch/csrc/mps/Module.cpp b/torch/csrc/mps/Module.cpp
index 6d2b4b451b3f1..4ea490005ef72 100644
--- a/torch/csrc/mps/Module.cpp
+++ b/torch/csrc/mps/Module.cpp
@@ -33,7 +33,7 @@ static PyObject* MPSModule_initExtension(PyObject* self, PyObject* noargs) {
   // NOLINTNEXTLINE(performance-unnecessary-copy-initialization)
   auto gen = at::mps::detail::getDefaultMPSGenerator();
   auto default_mps_generator = (THPGenerator*)THPGenerator_initDefaultGenerator(gen);
-  set_module_attr("default_generator", (PyObject*) default_mps_generator);
+  set_module_attr("default_mps_generator", (PyObject*) default_mps_generator);
 
   Py_RETURN_NONE;
   END_HANDLE_TH_ERRORS
diff --git a/torch/mps/__init__.py b/torch/mps/__init__.py
index 16d9ea209a245..52032943ff67e 100644
--- a/torch/mps/__init__.py
+++ b/torch/mps/__init__.py
@@ -9,7 +9,7 @@
 
 _initialized = False
 _initialization_lock = threading.Lock()
-default_generator: torch._C.Generator = ()  # type: ignore[assignment]
+default_mps_generator: torch._C.Generator = None  # type: ignore[assignment]
 
 def init():
     r"""Initialize PyTorch's MPS state.
@@ -32,6 +32,8 @@ def _lazy_init():
 @lru_cache()
 def is_available() -> bool:
     r"""Returns a bool indicating if MPS is currently available."""
+    if not hasattr(torch._C, '_is_mps_available'):
+        return False
     return torch._C._is_mps_available()
 
 def synchronize() -> None:
@@ -43,7 +45,7 @@ def synchronize() -> None:
 def get_rng_state() -> Tensor:
     r"""Returns the random number generator state as a ByteTensor."""
     _lazy_init()
-    return default_generator.get_state()
+    return default_mps_generator.get_state()
 
 def set_rng_state(new_state: Tensor) -> None:
     r"""Sets the random number generator state.
@@ -52,7 +54,7 @@ def set_rng_state(new_state: Tensor) -> None:
     """
     _lazy_init()
     new_state_copy = new_state.clone(memory_format=torch.contiguous_format)
-    default_generator.set_state(new_state_copy)
+    default_mps_generator.set_state(new_state_copy)
 
 def manual_seed(seed: int) -> None:
     r"""Sets the seed for generating random numbers
@@ -63,17 +65,17 @@ def manual_seed(seed: int) -> None:
         return
     _lazy_init()
     seed = int(seed)
-    default_generator.manual_seed(seed)
+    default_mps_generator.manual_seed(seed)
 
 def seed() -> None:
     r"""Sets the seed for generating random numbers to a random number."""
     _lazy_init()
-    default_generator.seed()
+    default_mps_generator.seed()
 
 def is_initialized():
     r"""Returns whether PyTorch's MPS state has been initialized."""
     return _initialized
 
 __all__ = [
-    'default_generator', 'get_rng_state', 'is_available', 'manual_seed',
+    'default_mps_generator', 'get_rng_state', 'is_available', 'manual_seed',
     'seed', 'set_rng_state', 'synchronize', 'init', 'is_initialized']

From 11651e111836fa558cc906d75e4ce9c1b93591f7 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Mon, 30 Jan 2023 17:31:22 -0800
Subject: [PATCH 1918/1922] Add im2col and col2im to Fallback as they are
 mostly used in Preprocessing layers. (#264)

---
 aten/src/ATen/mps/MPSFallback.mm | 2 ++
 test/test_mps.py                 | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/aten/src/ATen/mps/MPSFallback.mm b/aten/src/ATen/mps/MPSFallback.mm
index e5cc5e237c728..bb2ea6e693793 100644
--- a/aten/src/ATen/mps/MPSFallback.mm
+++ b/aten/src/ATen/mps/MPSFallback.mm
@@ -59,6 +59,8 @@ Tensor slow_conv2d_forward_mps(
   m.impl("repeat_interleave.self_int", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("_fft_c2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("_fft_r2c", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
+  m.impl("im2col", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>()); // Used in  preprocessing by nn.Unfold
+  m.impl("col2im", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("linalg_vector_norm", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
   m.impl("_slow_conv2d_forward", slow_conv2d_forward_mps);
   m.impl("upsample_nearest3d.vec", torch::CppFunction::makeFromBoxedFunction<&mps_fallback>());
diff --git a/test/test_mps.py b/test/test_mps.py
index 5bb48cde50c84..5f270b73b465b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4123,6 +4123,13 @@ def helper(n, c):
 
         helper(3, 1)
 
+    def test_im2col(self):
+        def helper(x):
+            return torch.nn.functional.unfold( x, kernel_size=(10, 15), dilation=2, padding=5, stride=3)
+        x_cpu = torch.rand(1, 1, 200, 100)
+        x = x_cpu.detach().clone().to('mps')
+        self.assertEqual(helper(x_cpu), helper(x))
+
     def test_select(self):
         def helper(n, c):
             cpu_x = torch.randn(n, c, device='cpu', dtype=torch.float, requires_grad=True)

From 42e7d51b6179aea3da7e25c5499a3abc0768f15c Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 31 Jan 2023 10:21:51 -0800
Subject: [PATCH 1919/1922] Fix lintrunner (#271)

---
 test/test_mps.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 5f270b73b465b..4fe168c8da80e 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4125,7 +4125,7 @@ def helper(n, c):
 
     def test_im2col(self):
         def helper(x):
-            return torch.nn.functional.unfold( x, kernel_size=(10, 15), dilation=2, padding=5, stride=3)
+            return torch.nn.functional.unfold(x, kernel_size=(10, 15), dilation=2, padding=5, stride=3)
         x_cpu = torch.rand(1, 1, 200, 100)
         x = x_cpu.detach().clone().to('mps')
         self.assertEqual(helper(x_cpu), helper(x))

From d1b64eb2f6c47450b0a9bd1d282b2e415a276ea5 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 31 Jan 2023 13:14:19 -0800
Subject: [PATCH 1920/1922] Update gradient allowlist and blocklist (#269)

* Update gradient allowlist and blocklist

* Remove prints

* Fix lintrunner
---
 test/test_mps.py | 449 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 420 insertions(+), 29 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 4fe168c8da80e..2af28acc634fd 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -9229,16 +9229,18 @@ class TestConsistency(TestCase):
     }
 
     ALLOWLIST_OP_GRAD = {
+        'H': ['f16', 'f32'],
+        'T': ['f16', 'f32'],
+        '__getitem__': ['f16', 'f32'],
         '__radd__': ['f16', 'f32'],
         '__rdiv__': ['f16', 'f32'],
         '__rmatmul__': ['f32'],
+        '__rmod__': ['f16', 'f32'],
         '__rmul__': ['f16', 'f32'],
-        'masked.log_softmax': ['f32'],
-        'masked.logaddexp': ['f32'],
-        'masked.softmax': ['f32'],
-        'masked.softmin': ['f32'],
-        'masked.std': ['f32'],
-        'masked.var': ['f32'],
+        '__rpow__': ['f32'],
+        '__rsub__': ['f16', 'f32'],
+        '_native_batch_norm_legit': ['f32'],
+        '_softmax_backward_data': ['f32'],
         'abs': ['f16', 'f32'],
         'acos': ['f32'],
         'acosh': ['f32'],
@@ -9250,156 +9252,544 @@ class TestConsistency(TestCase):
         'addmv': ['f32'],
         'addr': ['f32'],
         'all': ['f16', 'f32'],
+        'amax': ['f16', 'f32'],
+        'amin': ['f16', 'f32'],
+        'angle': ['f16', 'f32'],
         'any': ['f16', 'f32'],
         'arange': ['f16', 'f32'],
         'argmax': ['f16', 'f32'],
         'argmin': ['f16', 'f32'],
+        'argsort': ['f16', 'f32'],
+        'argwhere': ['f16', 'f32'],
+        'as_strided': ['f16', 'f32'],
+        'as_strided_scatter': ['f16', 'f32'],
         'asin': ['f32'],
         'asinh': ['f32'],
         'atan': ['f32'],
         'atan2': ['f32'],
+        'atanh': ['f32'],
         'atleast_1d': ['f16', 'f32'],
         'atleast_2d': ['f16', 'f32'],
         'atleast_3d': ['f16', 'f32'],
         'baddbmm': ['f32'],
+        'bernoulli': ['f32'],
+        'bfloat16': ['f16', 'f32'],
         'block_diag': ['f16', 'f32'],
         'bmm': ['f32'],
+        'bool': ['f16', 'f32'],
         'broadcast_shapes': ['f32'],
+        'broadcast_tensors': ['f16', 'f32'],
+        'broadcast_to': ['f16', 'f32'],
+        'bucketize': ['f16', 'f32'],
+        'byte': ['f16', 'f32'],
+        'cartesian_prod': ['f16', 'f32'],
+        'cat': ['f16', 'f32'],
+        'cdist': ['f32'],
         'ceil': ['f32'],
+        'char': ['f16', 'f32'],
+        'cholesky': ['f32'],
+        'cholesky_inverse': ['f32'],
+        'cholesky_solve': ['f32'],
         'chunk': ['f16', 'f32'],
+        'clamp': ['f32'],
+        'clamp_max': ['f16', 'f32'],
+        'clamp_min': ['f16', 'f32'],
         'clone': ['f16', 'f32'],
         'column_stack': ['f16', 'f32'],
+        'combinations': ['f16', 'f32'],
         'conj': ['f16', 'f32'],
         'conj_physical': ['f16', 'f32'],
+        'constant_pad_nd': ['f16', 'f32'],
         'contiguous': ['f16', 'f32'],
+        'copysign': ['f16', 'f32'],
         'corrcoef': ['f32'],
         'cos': ['f32'],
         'cosh': ['f32'],
-        'cumsum': ['f16', 'f32'],
+        'count_nonzero': ['f16', 'f32'],
+        'cov': ['f32'],
+        'cross': ['f32'],
+        'cummax': ['f32'],
+        'cummin': ['f32'],
+        'cumprod': ['f32'],
+        'cumsum': ['f32'],
+        'cumulative_trapezoid': ['f32'],
         'deg2rad': ['f16', 'f32'],
-        'diag': ['f32'],
+        'diag': ['f16', 'f32'],
         'diag_embed': ['f16', 'f32'],
-        'diagflat': ['f32'],
+        'diagflat': ['f16', 'f32'],
+        'diagonal': ['f16', 'f32'],
+        'diagonal_copy': ['f16', 'f32'],
         'diagonal_scatter': ['f16', 'f32'],
         'diff': ['f16', 'f32'],
-        'dist': ['f32'],
+        'digamma': ['f32'],
+        'dist': ['f16', 'f32'],
+        'div': ['f16', 'f32'],
         'dot': ['f32'],
+        'double': ['f16', 'f32'],
+        'dsplit': ['f16', 'f32'],
+        'dstack': ['f16', 'f32'],
+        'einsum': ['f32'],
+        'empty_like': ['f16', 'f32'],
+        'eq': ['f16', 'f32'],
         'erf': ['f32'],
+        'erfc': ['f32'],
+        'erfinv': ['f32'],
         'exp': ['f32'],
         'exp2': ['f16', 'f32'],
+        'expand': ['f16', 'f32'],
+        'expand_as': ['f16', 'f32'],
+        'expm1': ['f32'],
+        'fft.fftshift': ['f16', 'f32'],
+        'fft.hfft': ['f32'],
+        'fft.hfft2': ['f32'],
+        'fft.hfftn': ['f32'],
+        'fft.ifftshift': ['f16', 'f32'],
+        'fft.irfft': ['f32'],
+        'fft.irfft2': ['f32'],
+        'fft.irfftn': ['f32'],
         'fill': ['f16', 'f32'],
         'flatten': ['f16', 'f32'],
         'flip': ['f16', 'f32'],
         'fliplr': ['f16', 'f32'],
         'flipud': ['f16', 'f32'],
-        'float': ['f32'],
+        'float': ['f16', 'f32'],
+        'float_power': ['f16', 'f32'],
         'floor': ['f32'],
-        'gradient': ['f32'],
-        'half': ['f16'],
+        'fmax': ['f16', 'f32'],
+        'fmin': ['f16', 'f32'],
+        'fmod': ['f16', 'f32'],
+        'frac': ['f16', 'f32'],
+        'frexp': ['f16', 'f32'],
+        'full': ['f16', 'f32'],
+        'full_like': ['f16', 'f32'],
+        'gather': ['f16', 'f32'],
+        'ge': ['f16', 'f32'],
+        'gradient': ['f16', 'f32'],
+        'grid_sampler_2d': ['f32'],
+        'gt': ['f16', 'f32'],
+        'half': ['f16', 'f32'],
+        'histc': ['f32'],
+        'hsplit': ['f16', 'f32'],
         'hstack': ['f16', 'f32'],
-        'index_select': ['f32'],
+        'hypot': ['f32'],
+        'i0': ['f32'],
+        'index_add': ['f16', 'f32'],
+        'index_copy': ['f16', 'f32'],
+        'index_fill': ['f16', 'f32'],
+        'index_put': ['f16', 'f32'],
+        'index_reduce': ['f16', 'f32'],
+        'index_select': ['f16', 'f32'],
+        'inner': ['f32'],
+        'int': ['f16', 'f32'],
         'isclose': ['f16', 'f32'],
         'isfinite': ['f16', 'f32'],
+        'isin': ['f32'],
         'isinf': ['f16', 'f32'],
         'isnan': ['f16', 'f32'],
+        'isneginf': ['f16', 'f32'],
+        'isposinf': ['f16', 'f32'],
         'isreal': ['f16', 'f32'],
-        'kron': ['f32'],
-        'linalg.matrix_norm': ['f16'],
+        'kron': ['f16', 'f32'],
+        'kthvalue': ['f32'],
+        'ldexp': ['f16', 'f32'],
+        'le': ['f16', 'f32'],
+        'lerp': ['f32'],
+        'lgamma': ['f32'],
+        'linalg.cholesky': ['f32'],
+        'linalg.cholesky_ex': ['f32'],
+        'linalg.cond': ['f32'],
+        'linalg.cross': ['f32'],
+        'linalg.det': ['f32'],
+        'linalg.eigh': ['f32'],
+        'linalg.eigvalsh': ['f32'],
+        'linalg.householder_product': ['f32'],
+        'linalg.inv': ['f32'],
+        'linalg.inv_ex': ['f32'],
+        'linalg.ldl_factor': ['f32'],
+        'linalg.ldl_factor_ex': ['f32'],
+        'linalg.lstsq': ['f32'],
+        'linalg.lu': ['f32'],
+        'linalg.lu_factor': ['f32'],
+        'linalg.lu_factor_ex': ['f32'],
+        'linalg.lu_solve': ['f32'],
+        'linalg.matrix_norm': ['f16', 'f32'],
+        'linalg.matrix_power': ['f32'],
+        'linalg.matrix_rank': ['f32'],
+        'linalg.multi_dot': ['f32'],
+        'linalg.norm': ['f16', 'f32'],
+        'linalg.pinv': ['f32'],
+        'linalg.qr': ['f32'],
+        'linalg.slogdet': ['f32'],
+        'linalg.solve': ['f32'],
+        'linalg.solve_ex': ['f32'],
+        'linalg.solve_triangular': ['f32'],
         'linalg.svd': ['f32'],
+        'linalg.svdvals': ['f32'],
+        'linalg.tensorinv': ['f32'],
+        'linalg.tensorsolve': ['f32'],
+        'linalg.vander': ['f32'],
+        'linalg.vecdot': ['f32'],
+        'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32'],
         'log': ['f32'],
         'log10': ['f32'],
         'log1p': ['f32'],
         'log2': ['f32'],
-        'log_softmax': ['f32'],
+        'log_softmax': ['f32', 'f16'],
         'logaddexp': ['f32'],
+        'logaddexp2': ['f32'],
+        'logcumsumexp': ['f32'],
+        'logdet': ['f32'],
+        'logical_and': ['f16', 'f32'],
         'logical_not': ['f16', 'f32'],
+        'logical_or': ['f16', 'f32'],
+        'logical_xor': ['f16', 'f32'],
+        'logit': ['f32'],
         'logspace': ['f32'],
+        'logsumexp': ['f32'],
+        'long': ['f16', 'f32'],
+        'lt': ['f16', 'f32'],
+        'lu': ['f32'],
+        'lu_solve': ['f32'],
+        'lu_unpack': ['f32'],
+        'mH': ['f16', 'f32'],
+        'mT': ['f16', 'f32'],
+        'masked.amax': ['f16', 'f32'],
+        'masked.amin': ['f16', 'f32'],
+        'masked.argmax': ['f16', 'f32'],
+        'masked.argmin': ['f16', 'f32'],
+        'masked.cumprod': ['f32'],
+        'masked.cumsum': ['f32'],
+        'masked.log_softmax': ['f32'],
+        'masked.logaddexp': ['f32'],
+        'masked.logsumexp': ['f32'],
+        'masked.mean': ['f16', 'f32'],
+        'masked.median': ['f32'],
+        'masked.norm': ['f16', 'f32'],
+        'masked.normalize': ['f16', 'f32'],
+        'masked.prod': ['f32'],
+        'masked.softmax': ['f32'],
+        'masked.softmin': ['f32'],
+        'masked.std': ['f32'],
+        'masked.sum': ['f16', 'f32'],
+        'masked.var': ['f16', 'f32'],
+        'masked_fill': ['f16', 'f32'],
+        'masked_scatter': ['f16', 'f32'],
+        'masked_select': ['f16', 'f32'],
         'matmul': ['f32'],
-        'einsum': ['f32'],
+        'matrix_exp': ['f32'],
+        'max': ['f16', 'f32'],
+        'max_pool2d_with_indices_backward': ['f32'],
+        'maximum': ['f16', 'f32'],
+        'mean': ['f16', 'f32'],
+        'median': ['f32'],
+        'meshgrid': ['f16', 'f32'],
+        'min': ['f16', 'f32'],
+        'minimum': ['f16', 'f32'],
         'mm': ['f32'],
+        'mode': ['f16', 'f32'],
+        'movedim': ['f16', 'f32'],
+        'msort': ['f16', 'f32'],
+        'mul': ['f16', 'f32'],
+        'multinomial': ['f32'],
         'mv': ['f32'],
+        'mvlgamma': ['f32'],
+        'nan_to_num': ['f16', 'f32'],
+        'nanmean': ['f16', 'f32'],
+        'nanmedian': ['f32'],
+        'nanquantile': ['f32'],
+        'nansum': ['f16', 'f32'],
+        'narrow': ['f16', 'f32'],
+        'native_batch_norm': ['f32'],
+        'native_dropout_backward': ['f16', 'f32'],
+        'native_layer_norm': ['f32'],
+        'ne': ['f16', 'f32'],
         'neg': ['f16', 'f32'],
+        'new_empty': ['f16', 'f32'],
+        'new_empty_strided': ['f16', 'f32'],
+        'new_full': ['f16', 'f32'],
+        'new_ones': ['f16', 'f32'],
+        'new_zeros': ['f16', 'f32'],
+        'nn.functional._scaled_dot_product_attention': ['f32'],
+        'nn.functional.adaptive_avg_pool1d': ['f32'],
+        'nn.functional.adaptive_avg_pool2d': ['f32'],
+        'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'],
         'nn.functional.adaptive_max_pool1d': ['f32'],
         'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nn.functional.adaptive_max_pool3d': ['f32'],
+        'nn.functional.alpha_dropout': ['f32'],
+        'nn.functional.avg_pool1d': ['f32'],
+        'nn.functional.avg_pool2d': ['f32'],
+        'nn.functional.avg_pool3d': ['f32'],
+        'nn.functional.batch_norm': ['f32'],
+        'nn.functional.bilinear': ['f32'],
         'nn.functional.binary_cross_entropy': ['f32'],
+        'nn.functional.binary_cross_entropy_with_logits': ['f32'],
         'nn.functional.celu': ['f32'],
         'nn.functional.conv1d': ['f32'],
         'nn.functional.conv2d': ['f32'],
         'nn.functional.conv_transpose1d': ['f32'],
+        'nn.functional.conv_transpose2d': ['f32'],
+        'nn.functional.conv_transpose3d': ['f32'],
         'nn.functional.cosine_embedding_loss': ['f32'],
+        'nn.functional.cosine_similarity': ['f32'],
+        'nn.functional.cross_entropy': ['f32'],
+        'nn.functional.ctc_loss': ['f32'],
+        'nn.functional.dropout': ['f32'],
+        'nn.functional.dropout2d': ['f32'],
+        'nn.functional.dropout3d': ['f32'],
         'nn.functional.elu': ['f32'],
-        'nn.functional.feature_alpha_dropout': ['f16', 'f32'],
+        'nn.functional.embedding': ['f16', 'f32'],
+        'nn.functional.embedding_bag': ['f16', 'f32'],
+        'nn.functional.feature_alpha_dropout': ['f32', 'f16'],
+        'nn.functional.fractional_max_pool2d': ['f32'],
+        'nn.functional.fractional_max_pool3d': ['f32'],
+        'nn.functional.gaussian_nll_loss': ['f32'],
+        'nn.functional.gelu': ['f32'],
         'nn.functional.glu': ['f32'],
+        'nn.functional.grid_sample': ['f32'],
+        'nn.functional.group_norm': ['f32'],
+        'nn.functional.hardshrink': ['f32'],
+        'nn.functional.hardsigmoid': ['f32'],
+        'nn.functional.hardswish': ['f32'],
         'nn.functional.hardtanh': ['f32'],
         'nn.functional.hinge_embedding_loss': ['f32'],
         'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
+        'nn.functional.interpolate': ['f32'],
         'nn.functional.kl_div': ['f32'],
         'nn.functional.l1_loss': ['f16', 'f32'],
+        'nn.functional.layer_norm': ['f32'],
         'nn.functional.leaky_relu': ['f32'],
-        'nn.functional.local_response_norm': ['f32', 'i64'],
+        'nn.functional.linear': ['f32'],
+        'nn.functional.local_response_norm': ['f32'],
+        'nn.functional.logsigmoid': ['f32'],
         'nn.functional.margin_ranking_loss': ['f32'],
+        'nn.functional.max_pool1d': ['f32'],
+        'nn.functional.max_pool2d': ['f32'],
+        'nn.functional.max_pool3d': ['f32'],
+        'nn.functional.max_unpool1d': ['f32'],
+        'nn.functional.max_unpool2d': ['f32'],
+        'nn.functional.max_unpool3d': ['f32'],
+        'nn.functional.mish': ['f32'],
         'nn.functional.mse_loss': ['f32'],
-        'nn.functional.pad': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'nn.functional.multi_margin_loss': ['f32'],
+        'nn.functional.multilabel_margin_loss': ['f32'],
+        'nn.functional.multilabel_soft_margin_loss': ['f32'],
+        'nn.functional.nll_loss': ['f32'],
+        'nn.functional.normalize': ['f32'],
+        'nn.functional.pad': ['f16', 'f32'],
         'nn.functional.pairwise_distance': ['f16', 'f32'],
+        'nn.functional.pdist': ['f32'],
+        'nn.functional.pixel_shuffle': ['f16', 'f32'],
+        'nn.functional.pixel_unshuffle': ['f16', 'f32'],
         'nn.functional.poisson_nll_loss': ['f32'],
+        'nn.functional.prelu': ['f32'],
         'nn.functional.relu': ['f32'],
         'nn.functional.relu6': ['f32'],
+        'nn.functional.rrelu': ['f32'],
         'nn.functional.selu': ['f32'],
         'nn.functional.silu': ['f32'],
+        'nn.functional.smooth_l1_loss': ['f32'],
         'nn.functional.soft_margin_loss': ['f32'],
-        'nn.functional.softmin': ['f32'],
+        'nn.functional.softmin': ['f32', 'f16'],
         'nn.functional.softplus': ['f32'],
+        'nn.functional.softshrink': ['f32'],
         'nn.functional.softsign': ['f16', 'f32'],
+        'nn.functional.tanhshrink': ['f32'],
         'nn.functional.threshold': ['f32'],
         'nn.functional.triplet_margin_loss': ['f32'],
         'nn.functional.triplet_margin_with_distance_loss': ['f32'],
+        'nn.functional.unfold': ['f16', 'f32'],
         'nn.functional.upsample_bilinear': ['f32'],
-        'norm': ['f32', 'f16'],
+        'nn.functional.upsample_nearest': ['f32'],
+        'nonzero': ['f16', 'f32'],
+        'norm': ['f16', 'f32'],
+        'normal': ['f16', 'f32'],
+        'ones': ['f16', 'f32'],
+        'ones_like': ['f16', 'f32'],
+        'ormqr': ['f32'],
+        'outer': ['f16', 'f32'],
+        'pca_lowrank': ['f32'],
+        'permute': ['f16', 'f32'],
+        'pinverse': ['f32'],
+        'polygamma': ['f32'],
         'positive': ['f16', 'f32'],
+        'pow': ['f32'],
+        'prod': ['f32'],
+        'put': ['f16', 'f32'],
+        'qr': ['f32'],
+        'quantile': ['f32'],
         'rad2deg': ['f16', 'f32'],
+        'rand_like': ['f16', 'f32'],
+        'randint': ['f16', 'f32'],
+        'randint_like': ['f16', 'f32'],
+        'randn_like': ['f16', 'f32'],
+        'ravel': ['f16', 'f32'],
         'real': ['f16', 'f32'],
         'reciprocal': ['f16', 'f32'],
+        'remainder': ['f16', 'f32'],
+        'renorm': ['f16', 'f32'],
         'repeat': ['f16', 'f32'],
         'repeat_interleave': ['f16', 'f32'],
+        'reshape': ['f16', 'f32'],
+        'reshape_as': ['f16', 'f32'],
         'resolve_conj': ['f16', 'f32'],
         'resolve_neg': ['f16', 'f32'],
+        'roll': ['f16', 'f32'],
+        'rot90': ['f16', 'f32'],
         'round': ['f32'],
         'rsqrt': ['f32'],
+        'rsub': ['f16', 'f32'],
+        'scatter': ['f16', 'f32'],
+        'scatter_add': ['f16', 'f32'],
+        'scatter_reduce': ['f16', 'f32'],
+        'searchsorted': ['f16', 'f32'],
+        'segment_reduce': ['f16', 'f32'],
+        'select': ['f16', 'f32'],
         'select_scatter': ['f16', 'f32'],
+        'sgn': ['f16', 'f32'],
+        'short': ['f16', 'f32'],
+        'sigmoid': ['f32'],
         'sign': ['f16', 'f32'],
+        'signbit': ['f16', 'f32'],
         'sin': ['f32'],
+        'sinc': ['f32'],
         'sinh': ['f32'],
+        'slice': ['f16', 'f32'],
         'slice_scatter': ['f16', 'f32'],
-        'softmax': ['f32'],
+        'softmax': ['f32', 'f16'],
+        'sort': ['f16', 'f32'],
+        'special.airy_ai': ['f32'],
+        'special.bessel_j0': ['f32'],
+        'special.bessel_j1': ['f32'],
+        'special.bessel_y0': ['f32'],
+        'special.bessel_y1': ['f32'],
+        'special.chebyshev_polynomial_t': ['f32'],
+        'special.chebyshev_polynomial_u': ['f32'],
+        'special.entr': ['f32'],
+        'special.erfcx': ['f32'],
+        'special.hermite_polynomial_h': ['f32'],
+        'special.hermite_polynomial_he': ['f32'],
+        'special.i0e': ['f32'],
+        'special.i1': ['f32'],
+        'special.i1e': ['f32'],
+        'special.laguerre_polynomial_l': ['f32'],
+        'special.log_ndtr': ['f32'],
+        'special.modified_bessel_i0': ['f32'],
+        'special.modified_bessel_i1': ['f32'],
+        'special.modified_bessel_k0': ['f32'],
+        'special.modified_bessel_k1': ['f32'],
+        'special.ndtr': ['f32'],
+        'special.ndtri': ['f32'],
+        'special.polygamma': ['f32'],
+        'special.scaled_modified_bessel_k0': ['f32'],
+        'special.scaled_modified_bessel_k1': ['f32'],
+        'special.spherical_bessel_j0': ['f32'],
+        'special.xlog1py': ['f16', 'f32'],
         'split': ['f16', 'f32'],
+        'split_with_sizes': ['f16', 'f32'],
         'sqrt': ['f32'],
         'square': ['f16', 'f32'],
         'squeeze': ['f16', 'f32'],
         'stack': ['f16', 'f32'],
-        'sub': ['f32'],
+        'std': ['f16', 'f32'],
+        'std_mean': ['f16', 'f32'],
+        'sub': ['f16', 'f32'],
+        'sum': ['f16', 'f32'],
         'sum_to_size': ['f16', 'f32'],
         'svd': ['f32'],
+        'svd_lowrank': ['f32'],
+        'symeig': ['f32'],
         't': ['f16', 'f32'],
+        'take': ['f16', 'f32'],
+        'take_along_dim': ['f16', 'f32'],
+        'tan': ['f32'],
         'tanh': ['f32'],
+        'tensor_split': ['f16', 'f32'],
         'tensordot': ['f32'],
         'tile': ['f16', 'f32'],
+        'to': ['f16', 'f32'],
+        'topk': ['f32'],
+        'trace': ['f32'],
+        'transpose': ['f16', 'f32'],
+        'trapezoid': ['f16', 'f32'],
+        'trapz': ['f16', 'f32'],
+        'triangular_solve': ['f32'],
         'tril': ['f16', 'f32'],
         'triu': ['f16', 'f32'],
         'true_divide': ['f16', 'f32'],
         'trunc': ['f32'],
         'unbind': ['f16', 'f32'],
         'unflatten': ['f16', 'f32'],
+        'unfold': ['f16', 'f32'],
+        'unfold_copy': ['f16', 'f32'],
+        'uniform': ['f16', 'f32'],
         'unsqueeze': ['f16', 'f32'],
+        'var': ['f16', 'f32'],
+        'var_mean': ['f16', 'f32'],
+        'vdot': ['f32'],
         'view': ['f16', 'f32'],
         'view_as': ['f16', 'f32'],
+        'view_copy': ['f16', 'f32'],
         'vsplit': ['f16', 'f32'],
         'vstack': ['f16', 'f32'],
+        'where': ['f16', 'f32'],
+        'xlogy': ['f16', 'f32'],
         'zero_': ['f16', 'f32'],
-        '_native_batch_norm_legit': ['f32'],
-        'native_batch_norm': ['f32'],
-        'native_layer_norm': ['f32'],
+        'zeros': ['f16', 'f32'],
+        'zeros_like': ['f16', 'f32'],
+    }
+
+    BLOCKLIST_OP_GRAD = {
+        # Unimplemented ops
+        '__getitem__': ['f16'],
+        'combinations': ['f16', 'f32'],
+        'logaddexp2': ['f32'],
+        'masked_select': ['f16', 'f32'],
+        'nn.functional.binary_cross_entropy_with_logits': ['f16', 'f32'],
+        'nn.functional.group_norm': ['f32'],
+        'prod': ['f32'],
+        'sgn': ['f16', 'f32'],
+        'unfold_copy': ['f16', 'f32'],
+        'unfold': ['f16', 'f32'],
+        'trace': ['f32'],
+
+        # Hard crash
+        'linalg.norm': ['f16'],
+        'linalg.norm_subgradients': ['f16'],
+        'max': ['f16', 'f32'],
+        'maximum': ['f16', 'f32'],
+        'min': ['f16', 'f32'],
+        'minimum': ['f16', 'f32'],
+        'nn.functional.linear': ['f32'],
+        'nn.functional.prelu': ['f32'],
+        'nn.functional.tanhshrink': ['f32'],
+        'sigmoid': ['f32'],
+
+        # Correctness issues
+        'nn.functional.conv_transpose2d': ['f32'],
+        'atanh': ['f32'],
+        'div': ['f16'],
+        'gradient': ['f16'],
+        'kron': ['f16'],
+        'linalg.solve_triangular': ['f32'],
+        'linalg.vector_norm': ['f16'],
+        'nn.functional.bilinear': ['f32'],
+        'nn.functional.cross_entropy': ['f32'],
+        'nn.functional.gelu': ['f32'],
+        'nn.functional.layer_norm': ['f32'],
+        'nn.functional.nll_loss': ['f32'],
+        'nn.functional.smooth_l1_loss': ['f32'],
+        'std': ['f16'],
+        'triangular_solve': ['f32'],
+        'var': ['f16'],
+        'nn.functional.embedding': ['f16'],
+
+        # Unsupported dtype
+        'special.ndtr': ['f32'],
+        'trapezoid': ['f16', 'f32'],
+        'trapz': ['f16', 'f32'],
     }
 
     # These ops that are problematic. So never run them even when
@@ -9893,7 +10283,8 @@ def test_output_match(self, device, dtype, op):
                 if dtype_abbrs[dtype] not in self.ALLOWLIST_OP[op.name]:
                     self.skipTest(f"{op.name} is in the allow list for MPS but {dtype} is excluded")
 
-            if op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name]:
+            if (op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name] or
+               (op.name in self.BLOCKLIST_OP_GRAD and dtype_abbrs[dtype] in self.BLOCKLIST_OP_GRAD[op.name])):
                 run_grad_test = False
 
         def get_samples():

From c00bcc73e0d85ab62d2ae4c3767f16a592208fe5 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Tue, 31 Jan 2023 17:06:05 -0800
Subject: [PATCH 1921/1922] - block conv_transpose3d, giving error messages
 (#272)

---
 aten/src/ATen/native/mps/operations/Convolution.mm | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm
index 7336eabb27ee6..20432d2933e65 100644
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@@ -509,6 +509,7 @@ Tensor _mps_convolution_transpose(
     const Tensor& input_t, const Tensor& weight_t,
     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
     int64_t groups) {
+  TORCH_CHECK(input_t.dim() < 5, "ConvTranspose 3D is not supported on MPS");
 
   auto output_t = mps_convolution_transpose_forward(
     input_t, weight_t, padding, output_padding, stride, dilation, groups);

From 1fff5b57877a433e3862192760b765758a6fe960 Mon Sep 17 00:00:00 2001
From: Shuhan Ding <sding23@apple.com>
Date: Thu, 2 Feb 2023 17:39:29 -0800
Subject: [PATCH 1922/1922] cat with memory format

---
 aten/src/ATen/native/mps/operations/Shape.mm | 21 ++++++++++++++------
 test/test_mps.py                             |  4 ++++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm
index 3f460437c1c00..de90af408d112 100644
--- a/aten/src/ATen/native/mps/operations/Shape.mm
+++ b/aten/src/ATen/native/mps/operations/Shape.mm
@@ -256,9 +256,6 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
               "torch.cat(): all input tensors and out must be on the same device, but inputs are on ",
               notSkippedTensor.device(), " and out is on ", out.device());
 
-  if (out.suggest_memory_format() == MemoryFormat::ChannelsLast) {
-    out.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
-  }
   std::vector<int64_t> size(notSkippedTensor.sizes().vec());
 
   // Compute size of the result in the cat dimension
@@ -283,6 +280,19 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
     return;
   }
 
+  if (memory_format !=  MemoryFormat::Contiguous) {
+    switch (dimension) {
+      case 0:
+        break;
+      case 1:
+        dimension = out.dim() - dimension;
+        break;
+      default:
+        dimension--;
+        break;
+    }
+  }
+
   struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     std::vector<MPSGraphTensor*> inputTensors_;
@@ -333,8 +343,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
                                          toType:MPSDataTypeBool
                                            name:@"outputTensor"];
           }
-          newCachedGraph->outputTensor_ = memory_format == MemoryFormat::ChannelsLast ?
-                                         convertNHWCtoNCHW(mpsGraph, outputTensor) : outputTensor;
+          newCachedGraph->outputTensor_ = outputTensor;
         }
         return newCachedGraph;
       });
@@ -362,7 +371,7 @@ void check_shape_except_dim(const Tensor &first, const Tensor &second,
       outputDataType = MPSDataTypeInt8;
     }
     Placeholder outputPlaceholder = Placeholder(
-      cachedGraph->outputTensor_, out, /*mpsShape=*/nil, /*gatherTensorData=*/false, outputDataType);
+      cachedGraph->outputTensor_, out, /*mpsShape=*/getMPSShape(out, memory_format), /*gatherTensorData=*/false, outputDataType);
 
     NSMutableDictionary *feeds = [[NSMutableDictionary new] autorelease];
     for (int i = 0; i < inputPlaceholders.size(); i++) {
diff --git a/test/test_mps.py b/test/test_mps.py
index 2af28acc634fd..b70ff1c43fae4 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -2186,6 +2186,7 @@ def rotate_subset(data):
             cpu_result = rotate_subset(data)
             mps_result = rotate_subset(mps_data)
             self.assertEqual(cpu_result, mps_result.to("cpu"))
+            self.assertEqual(cpu_result.is_contiguous(), mps_result.is_contiguous())
 
     # See https://github.com/pytorch/pytorch/issues/85967
     def test_from_numpy_non_contiguous(self):
@@ -10334,6 +10335,9 @@ def get_samples():
 
                 self.assertEqual(cpu_out, mps_out, atol=atol, rtol=rtol)
 
+                if op.name in ["cat"]:
+                    self.assertEqual(cpu_out.is_contiguous(), mps_out.is_contiguous())
+
             except Exception as e:
                 if any(s in str(e).lower() for s in ["int64", "macos 13"]):
                     self.skipTest(f"{str(e)}")